diff --git a/.fdignore b/.fdignore
new file mode 100644
index 000000000..f2c9f1d37
--- /dev/null
+++ b/.fdignore
@@ -0,0 +1,5 @@
+.*
+LICENSE
+*.{png,gif,mp4,pkl,avi,json,yaml,yml,txt}
+
+docs
\ No newline at end of file
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 000000000..57f9f8186
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,6 @@
+!.gitignore
+!*
+!*/*
+cache_db.json
+cache_tree.json
+vector_cache
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
deleted file mode 100644
index 353a649f5..000000000
--- a/docs/CONTRIBUTING.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Contribution Guidelines
-
-We appreciate all contributions. If you are planning to contribute back bug-fixes, docs fixes, please do so without any further discussion. If you plan to contribute new features, utility functions or extensions, please first open an issue and discuss the feature with us.
diff --git a/docs/cache_db.json b/docs/cache_db.json
new file mode 100644
index 000000000..507a6858b
--- /dev/null
+++ b/docs/cache_db.json
@@ -0,0 +1 @@
+{"_default": {"1": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/MANIFEST.in", "hash": "9eb1bd48d67fec2a416c494fe9f8a36d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9fc6d59f-4045-4899-b3b7-7a0e28825e37.json", "hash": "9ec209c50994c4f79c6bcb8fff23cd78"}}, "2": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/README.md", "hash": "4591c17048e92cdc8b06bb5a97635adb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/60407808-dbe0-46dc-9baa-bd1bbef9e95c.json", "hash": "f0f1d9b9e8312b32c3a6065bc101c71d"}}, "3": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/README_en.md", "hash": "71c7de99078a9b4be5ab45afaa419e7e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/82e6a62a-d0cc-414a-8cc3-0f6befc1bc12.json", "hash": "5a2e58663ec5f1c34b458b01add2fff8"}}, "4": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/__init__.py", "hash": "89e5d5ba34744a53cc4cc5b03637c30b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9aea14bf-d2ff-4d11-99d2-f8c6e9e4ece3.json", "hash": "04d485f51570d5c6a4a1206626020024"}}, "5": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/main.py", "hash": "13ce26193a7bd939a0c183db44f886f8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/88c1bf3f-ff23-4b9a-a606-14fabeddedb5.json", "hash": "084e5a13133c81a5572b8f629af295dd"}}, "6": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/run.sh", "hash": "3ead497ae698f22190e99feaf49618da"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/62d7c90d-10cf-4d89-b01c-3cc72df41656.json", "hash": "15a9023720a2a21ae49ea99190b05a44"}}, "7": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/setup.py", "hash": "ef20805e3b6e5dcb43b5261a9c5227a7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cfb3e1a6-0fc5-4b0b-8cec-a267353f68aa.json", "hash": "456c4dfe2cad48c766e0d81fdbd28508"}}, "8": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/README.md", "hash": "43e989f4c8ed2bdf0a47fcb6b4d25b90"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7341eb50-8847-4b5f-954c-8958e26d6f93.json", "hash": "92969614119f409b6315003fb2161050"}}, "9": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/AbnormalActionDetection/README.md", "hash": "edec8db6fba5fb9409df04aacea00447"}, "target": {"path":
"/media/root/Prima/works/PaddleVideo/docs/doc/1189c1da-bdb9-4251-8e31-01e1996c3964.json", "hash": "2f90ee652aaf48d7622417a848e3f3d8"}}, "10": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/README.md", "hash": "f5277bb4d138e3d036183536e18a2b27"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/367d02ef-7ba3-457b-9be2-0fe49a2bf272.json", "hash": "16f0c62fd225c099d0d4ffbc8d6598d1"}}, "11": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/get_image_label.py", "hash": "7a05cae52b903d7b9c033293a322249f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b257cd96-eead-4082-bb8e-7698b6df2379.json", "hash": "bad48b21cce516565942fccf891a7b9a"}}, "12": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md", "hash": "a046efcb7f36df4e5a7f20e43d4a0d09"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0adabe49-fe07-4f8c-9e91-cc3393a33818.json", "hash": "50a232a5d9ed956931ef33ef156874a7"}}, "13": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py", "hash": "d376f92a66ee0d88be3c92870be9de9c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d7d9f6ff-e8e8-4f41-846a-989012f2c111.json", "hash": "40ecf41f9d987f0d2d85a067881dba57"}}, "14": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/predict.py", "hash": "c775b493790cce466e72ab36381701d6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c2f9f009-1d3a-467b-983f-70b52a617326.json", "hash": "60a03b1f88fa6b6f4ece31f26f4c30bb"}}, "15": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/action.py", "hash": "2e0bd27bbc3bf71fb763500555e76aa6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/00e9f015-dcba-409c-ac17-67ef217dbea0.json", "hash": "f25fe93fec8fff4556728f9564102e32"}}, "16": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/logger.py", "hash": "4e1a223b5d5813ae84130e382cba8c7b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/de1d48ad-e588-4563-9abd-adea83542eb6.json", "hash": "9aede055170053c97f25121f3b764d56"}}, "17": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py", "hash": "ef48d0391b1febc9c1ae050a05427139"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/87fe8510-d6e8-44ea-bc41-702869bc8a33.json", "hash": "9bd3531a5210e4e6cb97df150812cef6"}}, "18": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/model_config.py", "hash": "95d5c66771b629a9ced0dffa4f1457e9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9899d8bb-c2af-4b2f-977e-15fab28c8797.json", "hash": "a341eeca77396eebfab7328347126754"}}, "19": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py", "hash": "902ffb7ddad678e11d3a69b34667b9af"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/37ab8cf2-e1d0-4276-9749-038da87d02f7.json", "hash": "46cb6daaa33c5cefaf0fe07588d19981"}}, "20": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/audio_infer.py", "hash": "5a474707232a7797d8510c8a468b84a1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/46ef5c44-324f-4720-96fd-a36a2db1e04e.json", "hash": "e7e63500d295d35e41313ac803c20787"}}, "21": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py", "hash": "0e45cd7ce547a27180d53856b413e521"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b2ee5724-cdb5-4eef-86eb-e640c4d8d31e.json", "hash": "f2f23b0624b31683bd7806fa37e74a12"}}, "22": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py", "hash": "9f553fee1c78e7a5fa232879e6f7633e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/46c12cb0-b8fb-4249-8642-3ffc2ee2b978.json", "hash": "2aecb9d426bb337350d70327db18828b"}}, "23": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py", "hash": "b71ad52a7e5e9711a9737016d1dde508"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4c899b69-445e-40cf-8ce1-c99c3ff3da42.json", "hash": "6846aa2d279d34e37f3620d79ea17a0f"}}, "24": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/__init__.py", "hash": "477756d25781ccd41f896006e9689229"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7603bda9-6e6d-4af5-b5f3-5ff70b5f3eeb.json", "hash": "53758b60008ac5e8d9c0fcab73f95030"}}, "25": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/audio_reader.py", "hash": "177c3c2babb6dd29313a21f4765ec461"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/855278f2-de7a-4666-97dc-8f76f5bcb976.json", "hash": "8c1e377246aa3b7e86e5c7523225f4da"}}, "26": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py", "hash": "1823e3959c6419ae7955bf1f7b7d8efb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/56c474d4-4253-4559-ad0f-108c809f42ab.json", "hash": "a0ef3cc52eb317fef5ddcdb7e06a97a6"}}, "27": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/feature_reader.py", "hash": "f5d9892e29c43d7495d66126d35171b0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/71c0a7a0-1364-471c-ad22-19596c791961.json", "hash": "043b79783af033de95e30ba16dd5550f"}}, "28": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/reader_utils.py", "hash": "8713cb4ac3cfdb64b36285cfac2c02dc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/eda0853e-c8c2-4ba8-a88b-b068e5097a4f.json", "hash": "2038a6d2b30a04363704b85bda84095e"}}, "29": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py", "hash": "4f48b52b073ea22669ff51936f8c4dda"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8bdc6b2c-d69d-441b-9c9b-0e5a04ddfd3f.json", "hash": "567072185997c013b090d4b305441187"}}, "30": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/config_utils.py", "hash": "e33611a9efd797e46c39873bb1a3caa5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/66ee10d8-ca1c-4ae7-a954-ccbc36442fa4.json", "hash": "14df5491f3043a7c6b55e212956ca716"}}, "31": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/preprocess.py", "hash": "f2c1fc299b12c372fad066621c3938a4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4074085e-82d4-480b-a2cd-258f08196ddc.json", "hash": "4553f86f688b6b240adadd10ed8071af"}}, "32": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py", "hash": "0adbb9796c43bd12263c24395375e3e7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5538fc80-7b52-4b10-b854-88f8952bc1e4.json", "hash": "44e03f2b88d7070149d09ad094b16591"}}, "33": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/README.md", "hash": "5e38f9cf98d5c20d0a38915f65de1e75"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a99d6a07-98f2-4236-a972-9c1f702833c9.json", "hash": "28e6fa6d2bd6a3d7175757ce9e106317"}}, "34": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/README.MD", "hash": "9cc641a124b4f90537ed9ec10f1a5150"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6e1d2d2e-6ddf-4b67-9f49-231e705fd17f.json", "hash": "551b8e97a6a4b537198fe14028d2310d"}}, "35": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/__init__.py", "hash": "66c915cc50faa0fd791f30f63de8f57c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/124984cd-1046-4edd-b77a-75171501d444.json", "hash": "b48b133763182cf3c80093a5b38feb5c"}}, "36": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/api.py", "hash": "23e1a94186b6241063e09f268520150e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ea45c3af-2eae-4f13-bebd-3fb7fb98a4f4.json", "hash": "51ed75ccd3159df7f14da8304175a8b2"}}, "37": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/main.py", "hash": "de47a930f966e2a0610276dbbcb3184e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0ac1a4dd-d2dd-4a35-9f3b-3b316552754c.json", "hash": "e26d6052cc71be72cf27fa5be9ec0527"}}, "38": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/setup.py", "hash": "d12b9fbd8262935003940d758efcd89c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8e1104d6-0fb2-4891-9ca2-0f69f5577975.json", "hash": "eec37c77b1f34a17af23f61aef98d1e4"}}, "39": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/version.py", "hash": "2d9fa9e0ea139db9fafd16f023c6cfa8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/611e4ac9-9776-4ba4-882d-f3b68370ee66.json", "hash": "7009116ec59a3892cbbea456c739df69"}}, "40": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/__init__.py", "hash": "049fe382e0e4e8120dbb442002d19293"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e575fca7-8d39-4ac0-a79a-138261037f33.json", "hash": "4e4f964a1aaf1338400c6221be3b02ea"}}, "41": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/version.py", "hash": "f4455359bcd4d3cbae741c93cd5d334b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/52ee5f2d-38b9-4f24-b70a-c634554414e2.json", "hash": "a4b0ddfaabba92511ea0c842767bc24e"}}, "42": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py", "hash": "146932fcf14014f5e3ac73e3621c1c77"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/33b4abff-6ca7-4d03-98eb-625dbfade5e9.json", "hash": "50c633d274a15f7a6b6b14b3470a9ddc"}}, "43": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py", "hash": "460a3b48f52858abf4963892983c6944"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/254f5728-b6b4-46fc-9bc5-c8740d72e8a4.json", "hash": "e3acac116117030bb37401ad332569cf"}}, "44": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py", "hash": "3640536dfcfe181aa34cab56106520b2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f1ef72ce-3ab3-47c7-b807-14cad6064db5.json", "hash": "cc1c1a17a58593708aac94b7c5e78366"}}, "45": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py", "hash": "4ff354211c3f3eddc70a4f9e93edd930"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bf9cdb75-d2a5-45c0-b919-4d876129c06e.json", "hash": "f6c208ccb1386d6d73216dfc4c277531"}}, "46": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py", "hash": "828978b85d14a7388e09807e89ef8a52"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d92244f7-c7ba-4764-99cd-90f2e2eadbbd.json", "hash": "9463ac43184303dad2f536f251bfb313"}}, "47": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py", "hash": "bd8a53db1ffaa0468d67bff16f4f1c9f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/85f1dba3-5dba-4fb4-988b-9b2c5535e7bc.json", "hash": "9457e63e4bae41ee69943e92655f0f4b"}}, "48": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py", "hash": "c8e4e108fde5a6c7b6397f0ed48b84d0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d0056eef-03dd-4253-9be2-549523effe6d.json", "hash": "7f8a3df5b0dd4befe93cfe9b06ab3d96"}}, "49": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py", "hash": "e9cd58de7a004c9b0b8f0fee0cc12917"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c4a979be-ebca-4aaa-80f4-ae5cf72a419b.json", "hash": "23301e6fee11345a020510e6ed692c53"}}, "50": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py", "hash": "f7702b19a57901d7d76090bbb8f0f2ce"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1aa47eef-6ab0-4aee-91fd-da4e1ad068c0.json", "hash": "142690386cf1d35701a5808274419256"}}, "51": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py", "hash": "e8140ae8710d4270df81688c44c44812"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/d51c241c-3bc0-4d07-b366-bf5c685b32cf.json", "hash": "0e8bd1d0bb844c21408a99ef6fecd953"}}, "52": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py", "hash": "8fbff87ce137722bc9aeb3cf714a5eef"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/07e62c03-840e-4b69-a731-c0a13de042f6.json", "hash": "177c98281bbdf4c597717d883356993c"}}, "53": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py", "hash": "e522b7ac37cf8a93bd3e5f2f2a4050a1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/93ea5b89-273b-4528-9e44-c40bd93ff284.json", "hash": "b1441277019a4d9e12c936706f68e1a4"}}, "54": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py", "hash": "465edb2ea8e00e57b358f440035a01a0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bc402025-96c3-420c-aaae-0b1fb4e0d10b.json", "hash": "f759f5beced8200996f378b9709b265d"}}, "55": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py", "hash": "fea177788517637cf97089a0f2b95cd8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7a73be9d-7cbf-408a-a600-5a503347f8d8.json", "hash": "a10a29ecef9b2e6745f60c4070192f8a"}}, "56": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py", "hash": "0ccef75f727cbb2325dcbba80a238cea"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/89d41881-a163-49a9-a578-7e401dff943e.json", "hash": "32546aa4b181bc5e2755f259f1758dd4"}}, "57": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py", "hash": "4e71f4e3ed7c0545cb182078b12dfc49"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c6695e85-c27f-4792-8511-f4a351c56e99.json", "hash": "0fc67b838e49e1eae0cec049ef09fc13"}}, "58": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py", "hash": "022bb94ca41fe1b0ad1dfc1c2fb0c518"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/86654146-c598-48ef-8353-87802015719f.json", "hash": "ad6c6b612c854a4e9158021921863a25"}}, "59": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py", "hash": "723c643e20204c2468527feea4f4af95"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/127e4c33-f150-44d4-b70f-14f9e61c18cc.json", "hash": "ea517048135daba3a8339b7519f2272c"}}, "60": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py", "hash": "ad0081e38a2b7af653c7e9d5154cf7c7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d42d08af-e62d-41d7-9e7f-56974ebfd3b8.json", "hash": "b8336acd57cb68084a2253c23fbebfca"}}, "61": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py", "hash": "9f80ca5c25c348eff44fdfcfdec38ba9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/60684849-b451-4aa4-b8c8-fb5805eb66d6.json", "hash": 
"b00f834cdbd72891658bb993d3530e1b"}}, "62": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py", "hash": "b39467bc04653f851572d50a79743a18"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3ca025f8-4e3a-47be-be41-e1cad067fd2a.json", "hash": "749fb4779793d26ae2111c9f2c67381e"}}, "63": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py", "hash": "794d02a7af2bb39ed22ad5023b6cf469"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/92c95f7f-34ab-4b71-809d-39d9cb2e56df.json", "hash": "2b203ff775d145c00c27cc5516535555"}}, "64": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py", "hash": "a6f8aefad405ef7dd1496fb809647091"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9207f54d-f34e-430b-b1ba-3c78b87b0be4.json", "hash": "d725e4d7462277d28ccdff6c18c6684e"}}, "65": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py", "hash": "ce57f803a154df79ea01febe650f2646"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2debd80a-ec06-4938-a0db-556a83147597.json", "hash": "29c1e362270121e25aa73d751d24bc71"}}, "66": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py", "hash": "40d5fa58513db548f9bcd676db5fbcf1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/45ce3ebf-27d8-4371-a3cd-e3850e588fe8.json", "hash": "449c9d58a4c4c46652e491710ceb6ebc"}}, "67": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py", "hash": "a1d57ff2bb419323fb8cd40098c3754f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/884319ff-6b34-4e5a-9f4c-af09557da1e5.json", "hash": "0d5768de3527aaa1729085fad1b702ce"}}, "68": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py", "hash": "f78c5f117ddfd10c6a675cd551262889"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9a349970-8717-43c2-b78c-b04b5474c921.json", "hash": "d291e0a571038eeb355561302ddbbc23"}}, "69": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py", "hash": "226c8f690685e90045c530df9321c8ee"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0fea23e0-ae4e-4c5b-908f-e22fba946bd0.json", "hash": "8a96fd77db4e28ac52238c8c6b82fa08"}}, "70": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py", "hash": "9e59e5067c26b3c05aaab1a8917fbd0e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/81b22ac0-b39e-40e0-bab4-f281ec2e5eaf.json", "hash": "c9ab6d8c83fdc56188b75c6935dd1557"}}, "71": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py", "hash": "4b0f6ba0362a7e1f38e978a75ed75594"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bc87640d-9bd7-4ce9-88de-36df7379d29d.json", "hash": "d3ab963fc4809c9e9f64603eb9d2df91"}}, "72": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py", "hash": "5c2c2e138b7b27a50dabc55ec9c429f5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9f2d24de-b1d7-492a-bb59-67a77a638ee3.json", "hash": "d709fcf014983fcddd664b1293018704"}}, "73": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py", "hash": "2dbd59ebd812e5a8705523ddd0461061"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/12eb8ee3-6d8e-4bf8-aa1b-1a6465f46372.json", "hash": "3a394a66db89388bba1b0acc7b42672a"}}, "74": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py", "hash": "41e54e6438c2eca20e254f0df7abaa79"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3afd1d71-0f56-4c75-a411-549376b01b34.json", "hash": "30db349bd29d9b939f1bfe15d2505482"}}, "75": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py", "hash": "94ada3990bb6fad5c1502487d157ac8e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c6dd135c-d0ae-423f-bfaf-4f9c9d471b16.json", "hash": "5cdf025a9d08436108566352572b68f8"}}, "76": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py", "hash": "2ba3711c06c28228ef9d6d1c379eb450"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c57980fa-dba6-436c-ba71-56c4d9826ed4.json", "hash": "79bfd4d2288a8a6647526c2ef6f42729"}}, "77": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py", "hash": "ccb77e31601f10f511ef83520c321d67"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c009818a-14d8-4300-8db0-70ffcf3fab9d.json", "hash": "9dbd532f38e54b926ca3e71779e9501e"}}, "78": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py", "hash": "f8c2eba7a22f3b7dfc35dcf1cdd4b73a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/398b9feb-19d9-447a-bdeb-ce6acf427f06.json", "hash": "7667376fa33b8ddeaba75ee6d1b337da"}}, "79": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py", "hash": "612b5d80d06931269ffb851c3d906752"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/89e4841f-b7dd-41b0-b204-7594540da115.json", "hash": "15951c84bab4d5e364f8c587217a6c4a"}}, "80": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py", "hash": "04d747b007c1a069c33bb43bc2f4ffcd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6b1c905d-49db-4909-8ae6-bf47e973e703.json", "hash": "5624ce024205c58f9c374a62bf1bb2d0"}}, "81": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/__init__.py", "hash": "119e439572161238ac24601905a702b8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/354a97b8-a143-47ed-9284-36d917d0fd1a.json", "hash": "2d1d98cfe4d371fd1f48d497eac72d7d"}}, "82": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/build_gui.py", "hash": "080dcedddb0d8a5e3d14b7ad6caf5161"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/929ed878-ca6a-465e-bfb5-f439a1a19297.json", "hash": "bfad0822185427f5665aca7058b9f874"}}, "83": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/start.py", "hash": "5b2c9358cf6049d2dbfaf5bcaeb5bc1e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/aed52e27-f4f9-4f88-8b68-834cd809a660.json", "hash": "973e16c78fcb6ba8247c77f2c709ac28"}}, "84": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/__init__.py", "hash": "1520bbcc3a21c4e7c57b483504ae31e0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/dff1d35e-1cfe-4b4e-aeb2-cc058c1f0e6f.json", "hash": "c6b74ca9fb1c3ac46a91506365b69d1a"}}, "85": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/demo.py", "hash": "5416dc9b76d316085d4091527afcc61f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/db149562-1b92-4a87-8389-fd7a3addc401.json", "hash": "0602b5d09cc90571ea0a7c8a4dc20cf6"}}, "86": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py", "hash": "f9ab5d6aad7fe31b99ca6300309482d0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/491e594e-2388-4319-80c0-77b4ae736363.json", "hash": "33a10039304e71b55e57b79b0ccb1439"}}, "87": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/ui/demo.py", "hash": "41358022f354fa717f70f425dd15ec5c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e48129d1-3eac-49d2-9ff5-4c31f24a68b1.json", "hash": "a5571ec7bc5b9c8fb95829b3a5a6cff7"}}, "88": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/widget/PaintBoard.py", "hash": "1ec38c299d41a3492e7111c40948d22a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4d09faa8-ce8a-428b-983c-98e28504bd5a.json", "hash": "60aea65493af305aa283a48f6d09ec2f"}}, "89": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/cmd", "hash": "280091086953fec14ab269632cd134ca"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e503dca0-1546-4303-8a8e-4e2ba7b82c35.json", "hash": "53e364c677d0c5f35ce23125b04e8c5a"}}, "90": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/QT/demo.ui", "hash": "51ee4193e9c503e94513a487cfb6db45"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a1a73c0b-b4df-44d5-aa4a-4e011115f77f.json", "hash": "48eadfa56bab4065d59114531fb389e3"}}, "91": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md", "hash": "834e1497482511f3599329c717388ad9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/aa52229a-f66f-49fa-8f7e-59b2ef30f610.json", "hash": "0397d9e1aaf35ca25f84dda255a78b5b"}}, "92": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FigureSkating/README.md", "hash": "94aefbc9e4155b9edc637cd7cc24ea42"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cbad7aa4-450b-4d96-94e5-3befd12c571c.json", "hash": "6e170154333ceef9f25788f1975e20bb"}}, "93": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md", "hash": "995161783d93bf4ffa5f69d65c983a25"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/a141b6a6-d3e3-4f1e-af36-8b59768eb5c9.json", "hash": "04dfa521e57ae396c844ce93cc1b93a2"}}, "94": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/checkpoints/download.sh", "hash": "d9e640df0b264fc59a5c591b6443ab66"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/57ebb0bb-5f73-4cec-b336-49525daca8bd.json", "hash": "cc7a8d2840eff010068d4921f2f3f595"}}, "95": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list", "hash": "6531402b68458ada0b0d9334161adcc6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e0d2abc9-e634-493e-ae40-fbf5b535b19c.json", "hash": "9a97423fe88b02dadebbd201977293bd"}}, "96": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh", "hash": "18cd098ada5ed20c43f88127e8de9ced"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/881ad374-03ef-435f-a923-dc342e4b2b00.json", "hash": "fd9aa87efc22053f539f01147c1fc261"}}, "97": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/url.list", "hash": "d4146d93c1dfaa4e37b255bde0191971"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/133c60ab-9ffd-445e-9459-52ac1d4dd941.json", "hash": "5bc27add1c85829ee196efa601dc4737"}}, "98": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/url_val.list", "hash": "bbe1159b9e26cdfc2cff1335cf695fe2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c475c2a1-a6b7-4c12-9c4b-6110396a61c1.json", "hash": "a7d20d79b7fdd65105c8004ca363b35c"}}, "99": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_frames_pcm.py", "hash": "906e6ebf80d3e5a38391180c9bde3310"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/06d1fdf4-f110-42ec-a870-5d8c495e2d54.json", "hash": "45335bd3f565a79260811cac7d809555"}}, "100": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py", "hash": "e0d94c3007c91b8a3b50156d10db6ac2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c5c75f55-929a-4e2b-9a16-b39c42a2c8a9.json", "hash": "55e4efab82e6347c3a844693147cd305"}}, "101": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py", "hash": "5d9b11ee427fee0eb70b350c27dbd75e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a851f9ec-67b0-40ab-8a4c-646dbc068aa9.json", "hash": "d3a4ec6ddf70eceac63f431955dec9cc"}}, "102": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py", "hash": "bba679a00245562d383390a740bd36ef"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fe14e8ee-bdf8-47b0-be0f-8c97328a5783.json", "hash": "d860cd6355763de11295c2f7dbcc70a7"}}, "103": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_bmn.py", "hash": "900322d6b7e2434eaa510bf38c04a335"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b05c9015-12d3-4ab9-acb2-2925a7e10b3a.json", "hash": "54777f10cdc360dcaa5405826b775505"}}, "104": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_feat.py", "hash": "0bed280279723c8b5e116807aaf42d90"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/382773ae-ed48-48ec-bf7c-7d0ca872f997.json", "hash": "3eddea11a10c8413cae656fc2f7836f7"}}, "105": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py", "hash": "a927dff5db41c03141e70597f9623eeb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8818dbc4-c599-4efc-b86f-41c14b428207.json", "hash": "c2fa0c7662d8c150da48699bc7b2613c"}}, "106": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/predict.py", "hash": "8b7fcb6cf41c3279949a1504e0670353"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/52e7b804-02ff-454a-9a3d-fc306cd4077e.json", "hash": "fe0bf09ff546cffc89bec4d0d71227af"}}, "107": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/action.py", "hash": "2e6579ba45b6d456204183d27cdf2a1e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4491a653-9505-4cce-9c8b-ca3be2d18b07.json", "hash": "50ddd389bc7766abbe7c4f8bdae33b9b"}}, "108": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py", "hash": "fbddfc8fe74334a60bd979acec7ba5b1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ca88b2a2-aef0-4ae4-90c5-f0f306873d19.json", "hash": "a29dfd68846f4ee5a319a8f0b4711fa9"}}, "109": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py", "hash": "8431973a73c0e48d69508dc8eb50d122"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/61b4dc3e-6eac-40db-882c-b20a9c142305.json", "hash": "56ec2a4b1a243d21832d98fffe926cbb"}}, "110": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py", "hash": "42d73edfb525b6e1fcea99a3dda763c5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/004a4436-34f9-4a92-b22d-5cf5767cc8d2.json", "hash": "45fa087b18ae817604051e87163e73a7"}}, "111": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/pptsm_infer.py", "hash": "af49f5c53dee492e0ccb632a7c1ee190"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c774b682-2827-4da6-8133-61dbceffd840.json", "hash": "e8b56b635fc4f0ec091023b1259d1d41"}}, "112": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py", "hash": "d48953d7da54bf8c5092695f752c3ae9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2fcf21c5-4cd0-4b12-a56c-5e2b57e23fb2.json", "hash": "4bf9c68df1fb98d14d8bf040de0f1779"}}, "113": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/feature_reader.py", "hash": "de7522881b48ad20c5372cdcb1924902"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f3c38d5a-f9fd-434e-a15e-72f8972e53fe.json", "hash": "a178306145342a85f837445caf08e563"}}, "114": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py", "hash": 
"333a890e71c2e5cf280a9ee760c28744"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7f9ae73b-0500-420c-b1ce-eac33b5e3d73.json", "hash": "558c03d837209cce8e95eeb31dec9a6e"}}, "115": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/README.md", "hash": "1544fd2eee7022770351911434c09502"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/aec3b1b1-2fd6-489b-a358-2174bd3aa0d0.json", "hash": "846eb05f7bde9123d828e1cfcfa9a58f"}}, "116": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/README_cn.md", "hash": "3d31fc1848795198af9effe1f6b6f5f1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/66865755-6b7e-42fe-96dd-ffce2b85fd05.json", "hash": "0f11f96caad6de76dd141f1f37a66a06"}}, "117": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/config.py", "hash": "ce27340a1578741c8f703b563bc2110b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/911f800a-47b0-41b0-8261-06017d9187f6.json", "hash": "97b40a409dec465aebec68fc0a5c03ad"}}, "118": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/run.sh", "hash": "213f506fbe0f1daf7ab8f2ba4e8d1560"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/03bf07da-387b-4249-9ec5-8a856389eb2f.json", "hash": "8774c1a87fc79e981e242d8608382c0f"}}, "119": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py", "hash": "f4e06e6138b7fc1c989cbe327ce1bfa7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/54e499bb-46a2-451a-98ec-6b736112b2db.json", "hash": "6641477f253cba648153160fbbdcc5f8"}}, "120": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py", "hash": "72634c5850928a50fb2f768c835b3d22"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6fd87d67-4e55-45c8-978e-44ad5f14fddb.json", "hash": "839168d9e71b5b6bcce229936a94d2d2"}}, "121": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py", "hash": "8bc98359142ab54f4ee39026eca1b768"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/918d2d55-8374-4222-b2e7-d252e0bbbd0f.json", "hash": "4b68b8cc25148045ae283d2a2e348df2"}}, "122": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/DAVIS2017.md", "hash": "e5bd36638c75a0f9d7e150974df68a31"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5a3e94c4-95f1-4cb3-8256-434f1a293916.json", "hash": "470e73d614441aaa7e7f681d0be8bb2d"}}, "123": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/DAVIS2017_cn.md", "hash": "6f582fcf61ca46b307b97b821a906a0f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/42da3399-dbce-42b4-8259-d88277b5384d.json", "hash": "1e2f278977730d95925e1541009137c5"}}, "124": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py", "hash": "880c6032a929cb02420060a07cf3bfa7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d8181f92-5168-4dcf-996f-09bcce0cd488.json", "hash": "5ad803f5d8a7df0daa6552b9703fa53b"}}, "125": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py", "hash": "685389dfc43121ace2212192c35ca9e5"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/294e5c0a-0354-42fc-a66a-9bd9614a892d.json", "hash": "a20bf5e17b2183402025e88252dae7d8"}}, "126": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/helpers.py", "hash": "2b7f0f9cee18a4ba4bcc112a48065a91"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/af9fe02c-7c3c-481f-8311-f5310a02ab73.json", "hash": "7a3bc2d4a9d1c3bfa8bc9b04c10410a6"}}, "127": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/samplers.py", "hash": "2e2ec435d60c46db57959ce06cc108da"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d853bea9-87b8-4332-a755-4f5ce7f250cc.json", "hash": "d43c35150bdd1ae51ce95b33ee19b568"}}, "128": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py", "hash": "388296c934e3aa72721cf72580c28354"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1425e044-d978-4014-9936-3d6c863fb682.json", "hash": "cd46150b32301987e8f63f04129264f3"}}, "129": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/aspp.py", "hash": "0801d7182957ef417a98902fb8293b33"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1feadf68-13cb-4302-9caf-d3d0a954aea9.json", "hash": "3d3711b08c3443a169954e254f2f897f"}}, "130": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/decoder.py", "hash": "c6335952eb4f70d0b75f49754b9fa7a6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b8b83e13-cc8e-4883-9d38-44941a776dd2.json", "hash": "1f96426ae34416c97d03be7e7ec9e331"}}, "131": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/deeplab.py", "hash": "c1264875ad08a61945a5f87655cff6ff"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/73925f43-4347-466a-8d2e-03273cc295ba.json", "hash": "a63fdf4219ca5a949161422bd6a91445"}}, "132": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py", "hash": "189c813b51f8274d480c2a2b214b1d2d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a899b6a8-5eef-4b1f-b7bf-6b709f297854.json", "hash": "486817a521c0c6819e857b78abf10d11"}}, "133": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/__init__.py", "hash": "d578910e29a9d77fecd1bf9f7f11126e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/61a34886-574f-4ef7-b81c-18e6b5d404a3.json", "hash": "f620b102167228e991593cbb284f5b3e"}}, "134": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py", "hash": "2e9ffeba4aa4085c60b0efebc49933df"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c03d34db-0b44-4eaf-b7cd-10271f89d591.json", "hash": "f2c49dcf1b527a6fa631d72aebe587bb"}}, "135": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py", "hash": "4aa325d94c5c106326cb98b9ded3fdb7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c8ac3bc9-9b4c-4e2f-a884-01d4a4db219f.json", "hash": "a7e8b83dd97bf349574968f98ca0f9cc"}}, "136": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py", "hash": "f1ca2745c0ad60d041fd66b03fec2448"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/7660795f-9adf-4ac1-95f7-99eb92def73f.json", "hash": "2a163d1a2f29f577d7e62b2b5ae4c6d6"}}, "137": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py", "hash": "f1de5d62e9484c79c50d3cf97011daba"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e878569f-fa06-4db7-b67c-3dc1f40f3333.json", "hash": "177309ed9a7d65edbc121d20cc0fffe1"}}, "138": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py", "hash": "50edf1895547d57fa9d65574868f62b6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/478e000f-a0fc-403b-88e2-d2301a2afd4b.json", "hash": "f3b39d3482d4d236d8eba93ec05e7d54"}}, "139": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/mask_damaging.py", "hash": "4eed8f6355d806a88ad792573bf28a49"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/401cb792-b6b6-4218-b04f-42235071fca2.json", "hash": "601e59e77d6d874477d6e1557cfc9464"}}, "140": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/meters.py", "hash": "b6ceab7790b29c732cf0124701ac7d83"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fe7a66ca-368e-4787-ac41-61bb6031d076.json", "hash": "3a1846f2021f1071faacec50b1b86c90"}}, "141": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/utils.py", "hash": "a2f9501c5303da5f7cfd514e0b93b323"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/334d3227-e933-4a5b-b9a2-192dee28148a.json", "hash": "31155fbbd83d941dcecb8ce79b910788"}}, "142": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/README.md", "hash": "7b24bc62d874caf3128288eed85f36e9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e371deba-14da-422e-a073-d06fb490edd5.json", "hash": "3e66c0c15264f7f227f36c6bdf431ae9"}}, "143": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/download.sh", "hash": "30247ed078c653d38c3704be3492c1c9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/89b44dba-75d2-4f44-b775-2d9b8dd6dd76.json", "hash": "534d4b44fed8a39a029efc2e7cbdb5a2"}}, "144": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/eval_and_save_model.sh", "hash": "103c737e538ecb1a9372bc71dda371f2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/31782bb7-e595-45e6-9f2a-de9c0ea82e0e.json", "hash": "4cfc5f28efec0b0c2843e1b1cf0af979"}}, "145": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/inference.sh", "hash": "47bdbdaecfd9fe2d0b94c056dfe4fed5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/387159e6-5d8e-4b52-a71c-0d46a3d29883.json", "hash": "4633332f97f3121b419cc7155661c69f"}}, "146": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/train.sh", "hash": "0f7d23d2a6a6542ca825340b6a5f1a33"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a2822f22-d245-4950-aab3-a46099ed9e18.json", "hash": "5d0c5f7281e58d1dd7330beb33370e66"}}, "147": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py", "hash": "9a46af60f700f9fd000e2725dc877a8d"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/5845a5da-55cb-4023-80e7-a48cbe9a0448.json", "hash": "094ad10b30501c6bf25656626cee08fd"}}, "148": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/config.py", "hash": "0334c83c2e44d08302c524bbeb559dd2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/468ec0a1-622d-48fe-993d-08d86f88d526.json", "hash": "fe9b18dbead6bcb04023e73fd43b4824"}}, "149": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py", "hash": "5b02a5cc081be04bb06e1bc5bfc1cbe2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6ff25d46-6c9b-49dc-88ce-c9f0fdfdd53e.json", "hash": "e35357611fc1e0d1516350c14ccb6ddf"}}, "150": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py", "hash": "f1f084e151dd007ea04ab0eb2b816f93"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/42efd6d7-5026-4a7c-b967-1321ca165543.json", "hash": "19b7759cd5b4b313dbf9344f8381d465"}}, "151": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py", "hash": "7e297f1af3067fc3302c87bb59699ecc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e6df6256-e18b-4ad5-98b8-5e8f9ec0824e.json", "hash": "f32314db2d01a1f76c3a201fcd7ee3dd"}}, "152": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py", "hash": "e2cc60ef7e9e66f31e83908d1ae411f0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/10e6c062-a9f7-4bdc-83c7-fdf664335942.json", "hash": "8b3953cbef47b35ef70953d5c098f0c7"}}, "153": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py", "hash": "928d68210c875dcfe50ef83490487e50"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9b86ea07-cecc-4dcb-b37e-5c9ca154d616.json", "hash": "ba8816f96388f15d6e43e667283c8688"}}, "154": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py", "hash": "40694db68ea9b21a2c0abeb3b52d1ccc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/413c1454-cc8c-4299-a143-292ac01c3855.json", "hash": "5c6a494f4d1e4232717bce2930cdc1ab"}}, "155": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py", "hash": "ffd17710ea114214afaa5a16da6c4733"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d4cf6c01-e76c-4b45-bddb-e8335224bdb4.json", "hash": "fb28209f8bf602044a586659294b4005"}}, "156": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py", "hash": "6c75e0cf5c8da2443da4e8b7ebda7c4c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8ea88484-a9f1-4daa-8c3d-5b3291deee68.json", "hash": "d9ab3e5fc653a4b8f459467d30b390e4"}}, "157": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py", "hash": "086cff9944d17eb305c0f0f6835c73f4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/326f0f75-c6ba-4259-8a55-bc71c6c6be8b.json", "hash": "b97970e0d6aaca5b0e4a99256fc7343b"}}, "158": 
{"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py", "hash": "fae37e42d2a79041d9232734cd711edd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/487818af-e889-45da-85cd-ee8335367a08.json", "hash": "d37f87c75394925e3adcc63d86589dac"}}, "159": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py", "hash": "6806a6132d939640a3da5c348df59723"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/34eb7bf8-5164-4b64-84b1-483de0790aa1.json", "hash": "5276f0b4130680fde098783bf7dfb787"}}, "160": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py", "hash": "93230b78688b26cf5a396bc21a9b32b1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/aa748ea4-6901-47a5-af9a-c291ee4adb6a.json", "hash": "1a3ab8a3fc3a755f634df143a0a0f3b9"}}, "161": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/PP-Care/Readme.md", "hash": "4b2f559a5b734036f55176b0d2ac8632"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ff189f5d-9b37-4d8c-8994-fadb9831d5b6.json", "hash": "e339efb03c948d1b747a1b7b345ca089"}}, "162": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/README.md", "hash": "dca9ebb5884c89b40b7419803123ef15"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c2429992-e65c-4762-a20d-99a8ffe36e80.json", "hash": "eb78c90ae09fc69e26572bf7554d2952"}}, "163": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/datasets/prepare_dataset.py", "hash": "438dcb0f45a3ad4731e6d24104393540"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1242c244-2702-4893-9d20-f2b36989dc42.json", "hash": "3cca911da5a3e164509ff62a0162ae71"}}, "164": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/README.md", "hash": "ae007ab62eabfba478ae87478fc6dd85"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ebad8e11-686c-45de-be4f-69becec66ed9.json", "hash": "414d0899dfb65b5d337eb4c5b94805ab"}}, "165": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/README_en.md", "hash": "2a6232cc68a18da196f6bec0d6de3660"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5c276339-0250-4b32-8f3e-e5eec13f32c4.json", "hash": "a17468d2bdf600f0007c6d0438b3e8a1"}}, "166": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py", "hash": "03bb5e16717fa0ba84011c6f16b48f70"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b68f2b94-9a8b-4e54-b303-e2f1d8fec70a.json", "hash": "ae6ca4c8593c9db9a744478fe4f5c368"}}, "167": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py", "hash": "64d4f50bb2e62e66f951bcfad58bf909"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e9b1554b-f961-40dd-bcee-c0ce8ece8f10.json", "hash": "313e5c4556987625069c1efa8bdaca8c"}}, "168": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/train.py", "hash": "509fe661be872432cec297fc9ca2ade6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/249d4907-6cf9-47a6-8199-4eb5c9f53708.json", "hash": "98fc575c98aa664555411b63aec377ef"}}, "169": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/__init__.py", "hash": "47ef066dd585d725ea3742ddbaf8789f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/afc1c7f3-617b-4fa5-8a8c-2ed81cf76d66.json", "hash": "c47c4579834d6eb5ac36a0c5c0d7f6f6"}}, "170": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py", "hash": "0588c6e7c97c8976f4fd07984930e8e0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/537299c8-cfa2-4eb5-a8b7-1f055d82abc1.json", "hash": "862f5f2c63bf0805178f02b0f4e69a01"}}, "171": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_model.py", "hash": "4aefc24fd71680d642fb77ac601678b1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/eb9035c9-4dfc-4366-87b0-d019e7c33732.json", "hash": "1cfe9c45a1cf2fc319cdae6a81453601"}}, "172": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py", "hash": "9ae9ee4612d8bc528e3cc1860c9e5bcf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c415ca52-7b9b-4718-a2eb-06307b263e3a.json", "hash": "d5486cf61f588329d211ba3a0a46b623"}}, "173": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data/download_features.sh", "hash": "86c8b8e0dea1d89807ecd3878d95aedd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0bd12859-cb6d-45f6-9e9d-231b130a9b71.json", "hash": "afb8c3def3e2b4147bb237ce3bc12bf5"}}, "174": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py", "hash": "e490ca41ab093349d8834d25008073dc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fd74e93d-2298-44b2-9f58-9e3e7162ad1b.json", "hash": "a4fe76f4dc0c8625f822b7001a86655a"}}, "175": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/data_loaders.py", "hash": "1b2600ef5da4450c2a1ac1dfcf448e64"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b3fa42ae-a035-4215-ae3a-448d2f68ffca.json", "hash": "3feb40bd14b038b2565ca0f614014dfd"}}, "176": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/__init__.py", "hash": "02ae470ff50e86640056414cd3db6f6d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/96d6083f-7eee-4521-879b-900fb8576159.json", "hash": "fcbb278379f8ec6e0b5291671d78b5a2"}}, "177": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/log_parser.py", "hash": "109b816b2b91147f0c554df697be1e54"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/98ccb6a1-3f49-4753-a871-0e0f550fa97f.json", "hash": "2690410bd119b3bdca634d3994004d33"}}, "178": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/logger.py", "hash": "2671ab3b0c2c7df473623139fffa4b60"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/945ca542-cb79-4595-ab34-f63406fa5b18.json", "hash": "016c3184b30bb0796649cf322709f32f"}}, "179": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/loss.py", "hash": "c31c104de3d87d807c7a33515569645f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3633f597-c414-410b-98c3-14cb08e86f41.json", "hash": "0bcc0822b779e5e0f5c2da16c8c7a665"}}, "180": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py", "hash": "ec94bc8ffb8d772310933350b3f8e9b3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c3e8e8c6-a656-4684-ab01-2fe7572e9672.json", "hash": "af3433bd3de50a073d1092171ee943ba"}}, "181": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py", "hash": "67469a4810d0ceaa27dfd80effbb9dd2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c5d6e433-0c24-4fd7-9ca9-aa3327cb1592.json", "hash": "194b202292f7bfdfbdc42d07821e21ef"}}, "182": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/net_vlad.py", "hash": "44824f70e78eb4c4ff2b5e48ac217159"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3e162e69-611f-4080-90c9-982004493277.json", "hash": "2a02d1de782a4b7e16839b63384d0905"}}, "183": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/text.py", "hash": "049b9b1baaaf50e5808d98776d48f623"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7dea8362-0386-43b2-81c1-149bc1ac3971.json", "hash": "e877da244e7122d3cfdb46bd378360c5"}}, "184": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/__init__.py", "hash": "0b5d1071ea30870a8c57bd4097af5c8f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e016e8ac-9cdd-4db3-9ef2-8c5fcbd417c6.json", "hash": "3ab3e43f965907a2e9e58912ebb862bc"}}, "185": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py", "hash": "988f18ef42e7528df5ca4ddada97b81e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f76e78d8-071f-4bc5-b451-fa90ab75cd5d.json", "hash": "b9bdedcc8f0730a4ec8a6c8a53ac5a80"}}, "186": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/__init__.py", "hash": "2146d86ede43ed4a72b70edb080066f7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5fab68b5-7be8-46d4-bb8b-df4716f7fbde.json", "hash": "5152cf9c9f12ee9e25bca3c2a4503516"}}, "187": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py", "hash": "2d44ac7bb2764cb88469a98180ade5fa"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f080dcdf-69e5-4f71-94ad-a6005467e31e.json", "hash": "2cb8cb4079d3dcfd7bd214618e00af58"}}, "188": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/fix_bad_label.py", "hash": "2af4336a249c6067192e1d7573c963e1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/68a58ce5-b0c3-46e1-b9a4-31e20350a75a.json", "hash": "31bae3af1eaf6a47a511cdc3c0ab13dc"}}, "189": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py", "hash": "50125ad13d7171cb8c1e89dbbe0317f9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/881ff712-039a-47aa-a3e1-af424b8c7e02.json", "hash": "51b8b92aedd6d917c9d2511c5dc706f6"}}, "190": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/gts_format_transfer.py", "hash": "eb78bc6040b1387ea20dcafde7a3f6a3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b69c8507-e3c3-4461-a3ec-789e44965073.json", "hash": "678091bff1539345e5f0f583110c8805"}}, "191": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/val_split.py", "hash": 
"a56ce129e9ceba1aa2194d7969be5a1a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bfcf0ae4-ed3d-4567-81a9-0db6efb71708.json", "hash": "dfffbd22e17f202f4d429a55916cc17d"}}, "192": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/ActionRecognition/README.md", "hash": "782385d9e972dd14a1c6864debf49997"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5040f134-25fe-4f2c-acdc-a26d37111ea0.json", "hash": "fad1dbf1d473109457d44252d6dfead2"}}, "193": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/datasets/script/submission_format_transfer.py", "hash": "fed917058bedd7297ed7f2f221f1166b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9eca83d3-4536-4538-aa94-583aaef2a2ec.json", "hash": "ccef420acfc6ef3eff8f379c746e27cb"}}, "194": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py", "hash": "ecf79383c2dd289aa3f5f31bed2ce54e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/29e71c71-93fa-4b6d-8c00-8886b0e95962.json", "hash": "aad268c85dd538fc067502012f4d7016"}}, "195": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py", "hash": "cd21ec74c272a2634e29252933240359"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e24cd3d1-be52-44ab-a1fb-b58bc82552de.json", "hash": "3708fb936f4a3249feb3250d6241e45b"}}, "196": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/predict.py", "hash": "4a1e78fec8eed0fb4e7e9ad52a5995cf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/78685746-196c-4dc7-9e9a-64862315217c.json", "hash": "b6ffdef715b3f14c6a88d3c4fbfe7b3a"}}, "197": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py", "hash": "141aa15e22ca830ba7fc39a96c741f6b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2dbf4402-58a7-4db6-8e75-1ec98bde3314.json", "hash": "b7e6387c540eaeb6bd31f23c2823c0f9"}}, "198": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/logger.py", "hash": "dd2f713bf118644ed022ba231052b3f7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/01afed3e-a390-4c93-bb7f-7e39d3402189.json", "hash": "cc3b32f72a62969c8e276424acf23a1f"}}, "199": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py", "hash": "20be8f2ef54a5fef7b9be88e58c5631e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/92ab2251-9095-46cf-9122-551bb4a0a6c6.json", "hash": "205a8e0d2fd75e0d8492c90b3c133f34"}}, "200": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/audio_infer.py", "hash": "471e6abb6668efca1b698d8c722ac5f9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9d53e584-f7dc-4af6-9ac4-f2cc858b3ebb.json", "hash": "aea02f8bd8b6ccff687b6d393caf6505"}}, "201": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py", "hash": "f1aa75866714c41473ad58131f667e07"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d983f2b4-946d-4f7d-ab91-3cebf598f1b5.json", "hash": "5f2cb6b62497d2d6a17cb63dda386ff1"}}, "202": 
{"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py", "hash": "ce6967fd76161b9c20ed52c360332721"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/64b7f641-e3fa-4a01-ae6d-2ac87e05c4fb.json", "hash": "1f84cc5e20b9484919986fb405b2d263"}}, "203": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/pptsm_infer.py", "hash": "8f56d7b6e11c879ba3061f1417e7091a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d63d5d70-0864-4989-9f57-975f9beffc92.json", "hash": "4643c993bcb2a47543e03bcc65684bb8"}}, "204": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/__init__.py", "hash": "5cfe35acf6bc825520e717658d0e2684"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/00d2d52c-8283-48ca-bf88-f0f0a36b5b49.json", "hash": "3dd3fdc57b1f91725199c0c6e79d1ee5"}}, "205": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py", "hash": "5f943cf2f1d6761ca36d3621e961dffe"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/be92fedf-3fa2-4fde-bed4-4d1ec067a31c.json", "hash": "fbb9e43d70088f4faf36e2156203db62"}}, "206": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/feature_reader.py", "hash": "7440ba8161455d29470518ff4d4dbc25"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/34a9d680-3937-45e1-9d1d-379d0f3be01e.json", "hash": "b4a870ac6a93b785beabe2434b7ed76a"}}, "207": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/reader_utils.py", "hash": "b99d031a82b38968c2c5cc57ecbf6899"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/34da3e09-faf6-4ce8-83de-df2c137c5653.json", "hash": "66db3cca3fb2e3cdc9f7c96fb4d78c6a"}}, "208": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/config_utils.py", "hash": "3a670c8e2fd655236648e35e8797277d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6cc7ca89-cbfe-416d-b637-65338839e3f4.json", "hash": "12c856e0e7e9d95f317bd6b3d918642c"}}, "209": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/preprocess.py", "hash": "997e100a61a3f30433cdfceb0d24e02f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/69a563ed-fcd4-4c89-b365-43dad88d4cc6.json", "hash": "023341d77d277fa3226751360ec75e30"}}, "210": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py", "hash": "05a8b04a5977c2b91f93d5f046f61f18"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a44076cf-c7ae-4a77-abec-caa1eafc8fd6.json", "hash": "fdbf9b19c79a86492621a5059620878d"}}, "211": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/README.md", "hash": "eef0d0d7eba0352299a9652f1f292b08"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0dfe1f64-db69-47ea-a14e-d7398914567f.json", "hash": "0d91d0e35bf1fb22b71d8b3d3c7f8723"}}, "212": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/main.py", "hash": 
"697cf9213adeb9d4f64b78d43c6957ed"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f3177cee-e9ca-4e0b-9977-8a67079bc9c6.json", "hash": "4b0e981e552ec1b29ef2afcc23584d2d"}}, "213": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/run.sh", "hash": "2ded79b156865969dada860091c98a91"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f5b5215c-464b-4ef3-bbce-265eda1cb10b.json", "hash": "5251a70cc460e40280960d0db24ce9ad"}}, "214": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/save_model.sh", "hash": "50b2e6b81faa92b700065a1083765be9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/aa25705c-75de-4665-bdcc-bdb5440b0d48.json", "hash": "b2812a489e19b6d449ce82a56ca482be"}}, "215": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/setup.py", "hash": "f8e37c615f860f3cf4df6081e8358ec2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/683b79f6-e8d9-41ea-8f71-b0ddc8cdb9cf.json", "hash": "be49343a28b30b6dd4803aef9c484f01"}}, "216": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/__init__.py", "hash": "3a9197d370afa5027ca5dfcba386ae3a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d1efb8e7-a9de-40ad-ba64-9fab006ae520.json", "hash": "e5aae1d3ad37e1c07cd9f41c59a8116c"}}, "217": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/version.py", "hash": "d2822a5ae396e2657039bdf33f266af3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b39f53e1-fbb8-4712-b30c-f03e86d2006c.json", "hash": "ea4926fd13e50b1a59018c35a3bdb0ef"}}, "218": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py", "hash": "c2471766df82b46bd08826245171850b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3dffe712-135b-480f-8096-a59d64202e9d.json", "hash": "45410e8895d78eef9b0c8d1be79ec723"}}, "219": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py", "hash": "70bd0e27772508052cdd47b1d4cb9cdb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/909a3d87-fe65-4b1b-ba04-656643af5c4a.json", "hash": "391bc996f74906dacf12a750b88c275c"}}, "220": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/registry.py", "hash": "a3bc67ccf0ae4da8bc9689bda2c1dfb3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8da98a26-70c5-4704-ac2b-a021a899805a.json", "hash": "9555999d1f0c3fcd0bb0f2c463e678d5"}}, "221": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py", "hash": "f0aa01392cf8b2c877363a4f6ab0f050"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c0d9077b-2b9a-4163-9af7-0ed37ff0075b.json", "hash": "d461bc9a66064ce0b572d75e89671d3b"}}, "222": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py", "hash": "61bf58ded192f151cd88ef575ff01b8c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3868dce9-c7e8-4031-a111-90b800d1f64e.json", "hash": "193dd372aa77cd97d1171297bda5a8da"}}, "223": 
{"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py", "hash": "542e5da5cffb2b0ef8cd3871bf73a1bd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/de118e7c-a70a-4755-8475-975d2312c0be.json", "hash": "0b9d9707fdebb750eaf63a5f8b9e8f97"}}, "224": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py", "hash": "7749d8a14588c9a09a6de0cb776192a8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ba3194ef-fcc2-4d0d-9a6c-2f6c76db0854.json", "hash": "d0ab81d7555e79cbd420a115c2468137"}}, "225": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py", "hash": "b0ad74fc0c92384d257f9e2f4ed65f97"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7b570af5-cab4-4c09-9a63-3dacb62b23c9.json", "hash": "db8f4a05adbff719288f91fefd8a0ea1"}}, "226": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py", "hash": "c5ed6b21b6b9537db1a06b43d3d732d3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/195955fc-23f4-4ec7-8751-022d035c39d1.json", "hash": "fe432dde2672bfabd5b0e08ed6498210"}}, "227": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py", "hash": "2fa53cc9af389cef160b216b9bb58dc7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/92e86a04-a8ed-4bb9-b909-a920495c6ac7.json", "hash": "8969579d419d2fa9177400b20d25371b"}}, "228": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py", "hash": "2066199caf7b626dacf47ece07623f55"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c78113cf-3fe3-4d28-a49a-7d5f79595a37.json", "hash": "e8b920537d2413fca2efd22c5eb38cf5"}}, "229": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py", "hash": "1d3a68b8e85ff6f27f607045f0e0a3dd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/21a03f52-a095-4440-ac9c-3d9d95ece5a1.json", "hash": "a7476c6c599265575b5dc3d44cc537a0"}}, "230": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py", "hash": "1f5e6e64f5a2841e4846492c0194a6fc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2cedbf37-69dd-428a-baa3-e50f088fa25f.json", "hash": "1b21efbca70ec6abc699391e0f485d59"}}, "231": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py", "hash": "5d271321a82dc8e5ee437e27fe650fe0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2e744e01-920a-4c0b-8699-31b8562984ee.json", "hash": "f29d4ff16a1c4199fa1922029d1c85ca"}}, "232": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/base.py", "hash": "c55c04c8f418aed0def4510998e362fd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/50c4b9b3-d253-4b49-96f6-0c5f6f76e603.json", "hash": "219b8a8b5c744086c0bc441bfa85997b"}}, "233": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/build.py", "hash": "98df5e46684522e36596815ec14bb872"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/386fe09e-4bc7-49c0-96ec-e7f0c2a9ba17.json", "hash": "a094097a7aa6730a5339d020a835b7ae"}}, "234": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py", "hash": "2c6bf5e5258bb2b20f6ea288d9aadc9f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4a9f6fd5-205b-4d02-99f9-ba84b7a3b91f.json", "hash": "0ce96633e4a033c96f54f31711d7294d"}}, "235": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py", "hash": "25d06935397036269b58940c188e916b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1172e92b-4c2a-47c0-a6fa-962a124fb2c5.json", "hash": "3dd8958b3af1a26fcf0935dcb3582c20"}}, "236": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py", "hash": "2bc52624ee5320e29e785e86ded4553c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/990248c3-d36d-4951-ba5c-f6108aeb20bc.json", "hash": "28b57cb2b61d22fe80332b7559be142a"}}, "237": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py", "hash": "a917a8a6b30275dec6f37670cd0ddca1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/38e736e9-f0a8-46f8-9d49-1e3493acdee5.json", "hash": "d2b7b003ce2201167f577635619db912"}}, "238": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py", "hash": "5602b45e84c28bf9ace3f37801921b36"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f907552f-8df7-4c75-ab4a-cc979ff2cd10.json", "hash": "ef42098a5455d4195e6995022a080c7b"}}, "239": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py", "hash": "9289f119424bca77d2d242bdfeab4be7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1e9290c5-1d3a-441b-9232-fa75817c48a4.json", "hash": "4e75bbc047b9e41337fef72a5dc665e5"}}, "240": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py", "hash": "37422b83a81709fb87bb3bf26d4f8bc8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/772d794a-a336-4a0e-ae99-bd5b4df98427.json", "hash": "1f46f9176eaf3b78ee42f44b2017d9db"}}, "241": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py", "hash": "23bcf0c97d1c89211683bea08e392c5d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bcfb23d4-6122-42c0-867e-bc6b1215ad1c.json", "hash": "b28b15fcc0db86ffa380b85f5426db04"}}, "242": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", "hash": "415b374cfe7ed4c50e36be772775ba22"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e792b9c3-5b22-449e-b47c-ea429581ee3e.json", "hash": "64688e464c91559804d88373b4d1cf50"}}, "243": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py", "hash": "1ade95c11a43a7cd8f7a96590114325f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6d190f3b-28cc-4448-a3fc-ef0b99294dd5.json", "hash": "b701e6678e3644d13914cd6b147efe01"}}, "244": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py", "hash": "a99e8e9241c9b382fd4682aea50ba531"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fe050e44-807f-4ebe-8ee9-e38632bda6fa.json", "hash": "8dca6f6e4dc8125ebef91cce3563f2f9"}}, "245": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py", "hash": "0386a2334591cfe40a644e60b03225ec"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c7831c49-96f7-4f4b-9cd7-6917facb27c9.json", "hash": "01240663c0fca5bcc7718934a0ecb616"}}, "246": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py", "hash": "9f21e67fab8a3cce0b0f8a4d89667655"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7fbf6288-c444-4967-974b-6d98e8a221e7.json", "hash": "b78880f9558b859c311d8cd5437a99f4"}}, "247": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py", "hash": "8dd9d0a5d822412dbbf5360c122a38bb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bf30a94e-f8ad-448a-a836-7c2845544b79.json", "hash": "637c715209eaaa45a5b224a13a4305cb"}}, "248": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py", "hash": "f76dfb2d0030bf2195db6a1ccdc1e8c2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b4c114ef-374a-44b4-ad67-bac255580cab.json", "hash": "ab109fc56025a484988bf5e67cd7e731"}}, "249": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py", "hash": "330d1cdaf70b5de1e82abaada5b38af6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/225fb15b-381f-479b-a3a1-53e2129c8cd8.json", "hash": "ef42cecc37e4e620445ac5bd6cc007f3"}}, "250": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py", "hash": "0ca9b3c057a29a8735c195299702075c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ebac55f6-d77b-4017-8599-98910e1050b9.json", "hash": "06715f6ad340819096a569f15bbac16b"}}, "251": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py", "hash": "cc97f7fbef27fc8a9d443a57606dddd0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0e4af0b2-83f3-4e9a-9047-263fafe76cba.json", "hash": "dd51ab5006a2b27216da668a56bf199f"}}, "252": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py", "hash": "eb477cd3261222c02ab83ec9396f8f39"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c92c0d2c-eab9-4a11-92e3-bb38d8c091e8.json", "hash": "b689d6a81b84581712345650f4b0ab89"}}, "253": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py", "hash": "471cb05e9e303bfe5573595fc4918738"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0754c534-e1de-4f8c-b0c1-e9d4562e2b1d.json", "hash": "86831d39e9d873017d7ecb0f1f101e03"}}, "254": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py", "hash": "36fb71809f61aeb3bd244411489eada3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8a4593c5-a977-4422-a8c9-2b87abf2054a.json", "hash": "3bfd8a96c9e0dc7558164034f02bc0a1"}}, "255": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py", "hash": "67668be3292b46d52930380629b33810"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/50866672-274d-4c7f-94c1-0ff51774db70.json", "hash": "d3fecec4f0f5706d9638f9b70470a02f"}}, "256": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py", "hash": "67370c239018b4fc2ce9964e3b7b94fd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/51e06ebf-f1dc-4c8b-b8ca-8d4734756a79.json", "hash": "ab38ec1fb570695d6098bd06191b5d5c"}}, "257": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/lr.py", "hash": "979fd57940da233a10810578f2827266"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/aac9c369-be5b-4511-b19d-174876a181d3.json", "hash": "56d6d89276f275405413bf792947bc24"}}, "258": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py", "hash": "a1a42937c2ad694eb6e867635f18d09a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9b7cc702-4908-444a-aa8a-f03e223706d9.json", "hash": "06b0f4e588e7b9d25426c843a84068ce"}}, "259": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py", "hash": "282165a84b09e8f1e81777a61ac696b8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3a21a7b5-da83-452c-b74a-d98a60640456.json", "hash": "2651729a1db50a76e7e2fbc519f64c4c"}}, "260": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/test.py", "hash": "14d3784d5c82e8e8d8711d3bb29de9a6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/109cd714-53b4-43e0-877e-96d62b02a891.json", "hash": "1b92b61d35ba10ef91262d48c2a32c0d"}}, "261": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py", "hash": "8946bfa6543bc27279d1ea74b7e38fa7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e3c1398d-5459-42c5-89bd-7091b4244fc2.json", "hash": "c6687c80e3a46a32366d3d3adcc5b7bb"}}, "262": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py", "hash": "2e503cdee29a95a3600eddfc9be54d30"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/69960b55-1729-4511-930a-f4f153b795a5.json", "hash": "d79191829f5e14fa176b66d01115d4a9"}}, "263": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py", "hash": 
"344785cb2f1cc0059bd0d6c99a7ef684"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/dbfff6f8-70f2-4512-89a1-d45becd0a54f.json", "hash": "ff52df906e742db06598188ce79b2634"}}, "264": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py", "hash": "583a41584473d40ddf96fad91692bbbd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cc552703-8a7a-499d-bbad-f142b2601dbb.json", "hash": "03889933671ac981535df60ffed78128"}}, "265": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py", "hash": "91eea582008adeaadafd9095fbeec687"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6577a076-ca7d-434e-a249-6d0fbbefe72c.json", "hash": "9e931f1a410959826b5c926a55ae1463"}}, "266": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py", "hash": "bd698c6d9ee932e5013f753a9e4128f7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/24e4e098-d91f-4056-ab8a-129403184220.json", "hash": "6f1f616ffd59332e93fb763924b4fec0"}}, "267": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py", "hash": "ea73c6b56c4edb5109c5e26b8ae505c1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7dd5bc6c-f748-4f35-a4ba-f76321564cb0.json", "hash": "59f93c9ec3359fbc5b1b0bc832f95969"}}, "268": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py", "hash": "9efe39a353c3ba2550d4d4deb3c71b9d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3abbc625-0fe6-4346-b69e-b63e8e1e2634.json", "hash": "f40848aeee846290e7381e4517d43ee3"}}, "269": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/registry.py", "hash": "4a9b1123c2b3f9f7dfd19cd378e7ee89"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2d143039-84f2-4840-a4b5-3828135abccf.json", "hash": "fc1fcd5137255e2ca64ac271c266da3b"}}, "270": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py", "hash": "77fec2f4e4735b7cf45ca86bca8dbec7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f78e1ab0-c730-4610-aada-6b2232160b79.json", "hash": "ee73fe9e89a94267c5890265e94ad65e"}}, "271": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/FineTune.md", "hash": "5724f9cf35a6663879273cf1fc56c2e9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8dd65d2e-d515-4060-a2e7-336f403298db.json", "hash": "567f11080b42a21c0895dffdcfea40eb"}}, "272": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/README.md", "hash": "78a7460fa9d34cd4cabdb785cbf83b7e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/caf4379e-498e-4140-bd1c-dcf4eb7d5dc9.json", "hash": "0ef8bf0b1060aac852ae5ffe33ffabc3"}}, "273": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/Run.md", "hash": "87f3ae72487efd9289c5b88e4472d67e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/50c87944-4812-4383-b04b-bddfb3fcb327.json", "hash": "b47933dd4d72470f1ebf2c1d9b47b064"}}, "274": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/Test.md", "hash": "9466b296161f3541a15300b2eadd8cc7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f056e460-dd90-4b51-8857-d2e4ff737d5c.json", "hash": "ae69c0b3e32c4c23db33f6ecabe95993"}}, "275": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/eval.py", "hash": "8e7b66e654938f7323c5aa9ea6ee3c54"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0563d2d7-2a00-442d-814d-edfb8fdf8064.json", "hash": "c7c6a1f1e8a5be19dfe9cac3a1702d27"}}, "276": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/predict.py", "hash": "f0fdd0ab16e155bb669526cbea7ca1ac"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/743e660a-5a99-4037-aa2b-61f14ad50502.json", "hash": "b3a0f83bebcce79325d0ba1ce60347b6"}}, "277": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py", "hash": "66acce2872240695d212062623eed9c8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/207c687e-6ba7-45cd-ab2d-3dea0024a48a.json", "hash": "00fab4a4968bcda33853e2f22dd84a18"}}, "278": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/tsn_extractor.py", "hash": "49884394a16aac6d00b797f8939056c2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/18afc32c-df62-4b9b-a373-19c278db53f3.json", "hash": "a04ddaece2897c5b63f42a2069e87b2b"}}, "279": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py", "hash": "b221aca93ab5849cdbdeb4e237a57fab"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/33d49b0b-7b36-4022-84a8-496d8342a35b.json", "hash": "b5178dd551feab0e656228d9339541f2"}}, "280": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/__init__.py", "hash": "5ed3f8785487fa3b8d8e38d13e7d90cf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/526a992a-fb80-424c-8273-2748fcece73e.json", "hash": "3c8cb34269e5cdc6d8bf8929d1881cd6"}}, "281": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py", "hash": "705f9bdd486330bb7c57ad0e9652c9ad"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/af8f088d-6df5-47e6-b9d7-5c875125ad33.json", "hash": "1ae6c8a33adc98a66c56af6cbaf3b198"}}, "282": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py", "hash": "49eaba5fa1e35d93db70a2928b83c097"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6328dbec-c3c6-4f2a-9834-8760ba5bdc9f.json", "hash": "dc8dbef3e979a3d0631f807076989297"}}, "283": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py", "hash": "5578a6fd0976529bb5d10836cbf4d33b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e50b2b3e-3714-41e3-8c30-d1fb8613eb3a.json", "hash": "71650ba8ebba94a3325b0e851e2c0a02"}}, "284": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py", "hash": "417e910ea8caf35df81116af633ff929"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8d7f9b6a-41f7-4abc-a197-22246acbf883.json", "hash": "6919440bbf018ed190686f03fe11a3c3"}}, "285": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py", "hash": "1d1aa2e28c3cba3a82f3270a5e8e7477"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b408b5dd-77b5-4ca8-91d9-e5d409f6e20b.json", "hash": "b067cd28f7b212c1478acc69d5781ffb"}}, "286": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/__init__.py", "hash": "6fe14234ace51802d894a4be87ce1954"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/346792a9-3f14-4fe1-91d3-fa862c3ec732.json", "hash": "bdd7b10bbabc26351d0d1328cdb98c70"}}, "287": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/model.py", "hash": "1646a3eec0d5cef71ac345cc97b6bd2f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d55dded7-4589-4a37-8eec-5c58124315a8.json", "hash": "4fccfbf46968bd776be09f011f3963cf"}}, "288": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/utils.py", "hash": "1ecd998f855d6f511c322336191c87d3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ddf9a8cb-3fe1-4111-bdaa-5eb4ec613f53.json", "hash": "1040ff3c58c161a11e52c146978a323c"}}, "289": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/__init__.py", "hash": "031e1e656b88a28fc776726be383cec4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bdc17646-5d44-4a81-aa6c-f1256733ce9a.json", "hash": "eb334ebf9b1c6ad7328d09e14c025eea"}}, "290": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py", "hash": "017cbdeeb8ee6107a578de8d78e85065"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/916b43f1-7297-4ac5-9be5-e1b9bc622090.json", "hash": "8a1d7ed01640a265138a1b893bfd9337"}}, "291": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/lstm_attention.py", "hash": "255264e42b6684f5909256b240519875"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1a1eeeb7-7406-42a4-9b1d-489aa086755c.json", "hash": "57c571c5432055c85a4172283e4997a9"}}, "292": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/__init__.py", "hash": "a90a5d7d963015b8e54e14cc271e50e3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fc12e4fe-cfd4-4ba1-b47a-fa2b0843d027.json", "hash": "1faba5c937e5f59a0c5a47a319a0e321"}}, "293": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py", "hash": "4ce788eda50518c554f17c9a800391f6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a6cf2a24-d0f7-45b7-a739-51ff35bf4be7.json", "hash": "25adc5294c5d4647f39131dcd813e494"}}, "294": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py", "hash": "8e24af68d270337779462b06f1feb9bc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5f54ee4a-1c2d-458b-9a28-bddedb1dd5af.json", "hash": "df3abe6c8be1bc50f2065b89c34e97b4"}}, "295": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/__init__.py", "hash": "d8377bd6870eed78e58c26dd9d8c1db2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9f2d6a4a-011e-45b3-92df-3a3591749907.json", "hash": "67b002c7b940a7d1e912552b038ddb03"}}, "296": 
{"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/feature_reader.py", "hash": "cf733ed761730cfcd5cfd2815ec75bea"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b8c21e07-10b3-4d28-896c-6b6a23de6d2d.json", "hash": "3cf6cfd420d43f6e33a42270ddfcc1a6"}}, "297": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py", "hash": "0da2d1ef0e71edadc2d765a63a041a95"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c50f6cb2-5742-447d-895d-a6e75865037d.json", "hash": "d0786b3e6cf8f3f636d82c0dc3882ad3"}}, "298": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/reader_utils.py", "hash": "d50bef7abf1cfe56ee8a9396dadb4a32"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/dd6a35fd-c4e2-4773-a81c-ce934d329392.json", "hash": "d8c1ca9594c49b3c86650481f4096edd"}}, "299": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/config_utils.py", "hash": "9f85dcad160edd971f8a0b4cccdf3eec"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1d6e284b-2445-4be9-9b0a-d6056e4139c4.json", "hash": "65f9d48aa4e1d7250074df41a56bb71b"}}, "300": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py", "hash": "83c1dd20a9c4aecca41154b8cb595ee2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f626dc61-6e8a-4e48-96eb-e4cb181d2383.json", "hash": "b3dfc6ae97a06bb1208eb6e9409334bb"}}, "301": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/utility.py", "hash": "35fa8507d09ce01912e52f2a9ebd8f49"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a1262941-422f-4a13-b7cd-83a7d78c5289.json", "hash": "02903d877afccc9d5eefdff9d04ef215"}}, "302": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/README.md", "hash": "f6a58ec14cf77e74cc060583534fd5cc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cc63a58f-7f9a-405d-ab67-aa85d3dbc875.json", "hash": "96473cb7fbb4419278e25808e75c717a"}}, "303": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_all.sh", "hash": "13646469a3db3cdebc669952815db541"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/36691932-c5e5-450a-a5f5-f7d207b048e3.json", "hash": "489ba8159f008f820efa7a7e2bd7e7ef"}}, "304": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_benchmark.sh", "hash": "d779387ff0474a49d25434b2e059d68d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3699dc05-7c89-49b1-93b8-78e4f66ac812.json", "hash": "4323d81a3f768611f5efe7702ded3f97"}}, "305": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/prepare_asrf_data.py", "hash": "89ce05dc673803a635f6d2f745c8cfc2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2cfd042c-4704-4187-8777-a90abf09ff85.json", "hash": "498df051e2a0e388f4d035cf637c39c4"}}, "306": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py", "hash": "3c6fdd484312747d667bcce575fee2c6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2017add4-c3af-4d8a-9e69-400b8b3d6db6.json", "hash": "1125c2e7a93496a95cad754376cf823d"}}, "307": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/download_dataset.sh", "hash": "1592b70d19db829a92c802f9213c02ed"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b79c18a8-79c5-4932-9188-5acc155abc96.json", "hash": "85a3954db8a57f131cc9dd3b45c03903"}}, "308": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py", "hash": "5da1376f297785cba705ce0845e1748e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9d237c1c-3826-4eda-97e5-868ff713987b.json", "hash": "74401d748ab1a77dab8620ba76c83c66"}}, "309": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_skes_data.py", "hash": "bb24a055b19bd12abf92ab543b63c6ae"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4e2977e4-6b4e-4937-b357-76a12da3b124.json", "hash": "d9aae5d9ad1e36e9b9dd7d080d44ad11"}}, "310": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py", "hash": "dbb589ea20e517c0e3c2b760031a7ee2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c3a1bf5b-7d7c-4c67-9903-6185834c09f9.json", "hash": "ee7d6962292d0dd3edb9426c6fb10a0f"}}, "311": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md", "hash": "07bc5d61676a6573ada0e78be4b6c412"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f796ef44-34e0-4d10-b159-03f772bc8352.json", "hash": "ce199ea92312ae74faa411f74582b375"}}, "312": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md", "hash": "0e18a986d69450b96def2a7ef16aed9d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c725aee3-d95d-4fa6-851f-2cc08e6e39cd.json", "hash": "41b9609250e454feb627c2368cee00cc"}}, "313": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/external-cmake/auto-log.cmake", "hash": "089946ec67dcd0c2b5d94a20ab854050"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0bc264e7-d7fa-48cf-9fb5-ce251a6cd3c3.json", "hash": "d851fd8aef067e3b5be3550cdff1114b"}}, "314": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/postprocess_op.h", "hash": "b4a0ead37fddbaa53f4da2fa2a39ed80"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9540f655-8d4a-4d20-b072-692b7b466706.json", "hash": "3bc47bca160bac8247708827d7444d83"}}, "315": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/preprocess_op.h", "hash": "ceef0bf625ad1d421bccd61c6249b6f4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a666a2c0-92cd-438a-80c1-99bff79e4caf.json", "hash": "f8fffe0b5f00cbd393767ac282b5fa26"}}, "316": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/utility.h", "hash": "fda2c177f59562e75c5ec86419c2e4b6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e8aaf0b3-2c5a-4019-bb0c-9710ba03a18f.json", "hash": "bdfb9e1e39f3ac16b86a75a505ac5643"}}, "317": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/video_rec.h", "hash": "fdd4667e89d43b7d5ada27e16495951c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/00423058-f625-4c51-ac8e-e6156ee8430a.json", "hash": "af28b5c8cd97876b953f58fe1995a2dc"}}, "318": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp", "hash": "ef8fcf38ccd43ddc1accdf255ecac150"}, 
"target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5d9caa45-642a-4200-943c-4e886465283e.json", "hash": "c5b7a6430d4a27a7e9c0fececf9e16c0"}}, "319": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/postprocess_op.cpp", "hash": "3d29fadc03c8797ad72bff3824589a58"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7bda9261-294f-4912-b1ae-397c5520e9e2.json", "hash": "a6af375bfd4d5f1f402527892e00fdcb"}}, "320": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/preprocess_op.cpp", "hash": "5dbe8cbaa90113cfc599d867612bf1e3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fffdaf50-3d31-4f87-bb69-f4d87abb409e.json", "hash": "72d0dad941a2222848fe504786ec7d43"}}, "321": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp", "hash": "c250dfe09d2500932c00fa618ad25ef9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/326037c1-296b-4b1d-8c56-5d011e6df707.json", "hash": "15ece74697cd2b254e47ea3c5b397b32"}}, "322": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp", "hash": "cdb2148445642af882957b3cc0a4906a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9b615ae9-82b6-4320-aff8-c1adac9d33d2.json", "hash": "fbf8cc977f3c1df080eef0814ac832ee"}}, "323": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/tools/build.sh", "hash": "1cf48bdae3f9648defea6232a3dbe342"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e11e8e99-e1c2-42f5-9d8e-2c3f4c4baba4.json", "hash": "0d86a59d52642aff527c2015e5f31be9"}}, "324": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/paddle_env_install.sh", "hash": "ddd5b8fc1e0c43ee861b0541b3fbdb9b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ebe347ec-d419-4e14-bda2-c3e934c9aa94.json", "hash": "e72e0a72a47c788238b0e2eb6b2e974d"}}, "325": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/preprocess_ops.py", "hash": "ecf08bcbb69d3f23e508bdec2c4216d9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/28eff272-aa18-4b97-8e97-b84c16992ace.json", "hash": "c7de0befad7b12f2ca89b966cacbe786"}}, "326": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme.md", "hash": "de0967f4321f4f971c6466a9618ca88a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/da8df420-ae41-4731-8dc2-585281aa76be.json", "hash": "5bb7dc8abf3242bb0aec8ad1ab88f33b"}}, "327": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md", "hash": "5fd761a2921245fa3cb52487fb60e1f8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1e0a27ad-0bb8-4301-b8c2-7f1936b08d23.json", "hash": "dc195a27bc492e20237240866f46e43d"}}, "328": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/run_cpp_serving.sh", "hash": "209bcf7169cea680c5f37d4b8f71bbb0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e07625a1-98f7-4d70-bfa2-89cc8328f863.json", "hash": "8d352144ca7a59a616f014697a547cdd"}}, "329": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/serving_client.py", "hash": "6cbf5d143bc00ce0845fc420119c0200"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/496e7372-817b-427a-803a-7bf4a1e61f3e.json", "hash": 
"888922150717625c0231786944aa2808"}}, "330": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/predict_onnx.py", "hash": "7f878829472e716002b53ff372ff9d74"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3c27a33b-b94d-4fed-9cd1-3ef6fa629d86.json", "hash": "e536917303e140af759776bca8f9691b"}}, "331": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/readme.md", "hash": "6a3ee644078390590f2825b16298648d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a2f3589f-d018-481b-8f96-ea16db092304.json", "hash": "dc5922491799902196a3a20042754820"}}, "332": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/readme_en.md", "hash": "98925614cabd6948f962a2ea16f3d1c0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fc9c3324-d108-479a-aba6-5d4fb5a89d52.json", "hash": "b95b223f8e265df2183329a7cee05bb0"}}, "333": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_http_client.py", "hash": "0c29107ebd3df0191480aeeafda99d0f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f15cf972-cd8a-425f-8be9-c63d25acdc96.json", "hash": "259bbf307cfc1fd2e56b158e46ee0e93"}}, "334": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_rpc_client.py", "hash": "fe597e48cd70d8f3bf27a7271123f743"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/584f8aff-900a-4892-a9a9-3f8ed867f6b8.json", "hash": "c09337b6f5f32446abe8353a67758b3a"}}, "335": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme.md", "hash": "06787a7b11f7b976cce56007f13353d1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/577fb8a8-7e54-4187-8ddc-dd58514de87f.json", "hash": "25a5ba987ded53ec4af307840ef32656"}}, "336": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md", "hash": "a790ff17ece1a4b6ffa408684dff4552"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/49a26df6-daa0-45ce-b445-d01a9f4aa985.json", "hash": "543a04a95d48a825cd92fec91891c51f"}}, "337": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py", "hash": "39b1b98000b327a4c907df75884119d6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a48c140a-2627-42d9-b89b-c74ece158c02.json", "hash": "e2fbbfbc8eaafc613f24937bdff2bcf6"}}, "338": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/utils.py", "hash": "3e0d472295256703d14b2fd0378cdbc9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/80255346-6676-43d5-834a-00d48abd162e.json", "hash": "df7c2a66e184f13390829b656beeda34"}}, "339": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/quant_post_static.py", "hash": "d82e54561a586df5279077880e85a384"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7bb95d6a-909e-4bcf-b406-a0b4122a932d.json", "hash": "a2d823c469f6801915fc67c270f60741"}}, "340": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme.md", "hash": "da3458e9ed7f468c7521930b1ae9d02b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f9ca0655-5981-4b00-b49a-1a8ad20ed2ac.json", "hash": "5b280e5ebf2f8a9ac99a021dd2cd3ea7"}}, "341": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme_en.md", 
"hash": "05ed792fc87cdd7f48d4b12c73b4f53a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7a591640-3392-49a3-8249-3d5b8f8f9bd3.json", "hash": "d13b69ef573524db07bef6eaa5422bd3"}}, "342": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/benchmark.md", "hash": "bc7453bcb384b0a4b63b3e17b4697dc7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e24e448a-f4f4-4034-a077-c81c2af43313.json", "hash": "4e65793e74830106694f337c3023ab29"}}, "343": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/install.md", "hash": "fde36c769794049c7e822e1bd91593d5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/916a4e33-c5fa-4891-a1bc-0700e02e5986.json", "hash": "c447b5f8c1480a88205a1619390be23b"}}, "344": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/quick_start.md", "hash": "388a08456930b02dfce23b42d40bc3fd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/23ba667d-92f7-4c25-a37d-1fcd0aa5ab20.json", "hash": "68ea465a6bbcc0e234e6ee2d833ee51d"}}, "345": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tools.md", "hash": "bc789c013948225d0be67d08f9635e56"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/75ac2cd6-4dbc-4b74-9b97-d5f8a78af490.json", "hash": "b9274bdabd04035ea1f2c69b0710c319"}}, "346": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/usage.md", "hash": "591fee3e96174299522e5a45b817107d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1b7cfb16-3135-4285-88ab-77f1b90d4dd9.json", "hash": "0b29dc319e6937e5c3b575497675ae5f"}}, "347": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/AVA.md", "hash": "abbe149fc090cf70d84f93bdf4d4d08d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7e9227b9-283e-44fb-b67b-72a781c41bd2.json", "hash": "707b92cff8ababbc2518192e015f05a2"}}, "348": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ActivityNet.md", "hash": "edeebc40e5a35ca2406ffe59758e9239"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/50ce1e9f-ecb1-49a0-9e7f-bab5a7669020.json", "hash": "3538b8b9f2635d17ce83fca753cac694"}}, "349": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md", "hash": "689e625ccd970d1f476142ced3b1e947"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/46462441-92b5-4cd2-9738-198b110c5ae8.json", "hash": "0c2d9b5f689f578d99a03ed6c8250d73"}}, "350": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/README.md", "hash": "33aae389d4d2d86ac8d84e7c46e0c2b9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6743942c-ab21-45d8-aa16-a29816e42569.json", "hash": "0ebd3d4dae90a46b13c017cabf2b9c8d"}}, "351": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/SegmentationDataset.md", "hash": "684c9acc3184603fbba0db5871298661"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/75917cee-6bd6-49b8-9390-93ec47bdaa44.json", "hash": "30efc423fb8224fc5be2ab16a0513723"}}, "352": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/fsd.md", "hash": "9ffeb2579ed22c2a43eee2b824704074"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/e2249165-137e-4134-a239-582b14c701ed.json", "hash": "891c23f2fa079502f67960ccd3e1ca0b"}}, "353": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/k400.md", "hash": "4fa64c41269d892ed467ff70119297c0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d4bc0edc-611c-42b9-b972-d1e7597de4c6.json", "hash": "6b5d756c06894d24589dabbb50f4306c"}}, "354": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/msrvtt.md", "hash": "308ad91fa39bcbab39a9ec4fda30daec"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/355ad68e-66eb-428c-b635-fbad5b4f117a.json", "hash": "bb9d2eac426ed56fe86606105e975ad3"}}, "355": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ntu-rgbd.md", "hash": "addadaa4e7d27911a390ca58fde537bc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ab64bc73-0873-454d-b99f-286199ecee89.json", "hash": "ef622eee95fabb5a0ae4e44110d8754a"}}, "356": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf101.md", "hash": "fb8b401af4c50aefbaefb0189e61236e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e4c769ba-4090-47ea-a65c-3c5b9e860a4d.json", "hash": "dc4a80ee92aae6fd927d0b1b9eeabd61"}}, "357": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf24.md", "hash": "8fc517c2c90e779f9558094d7de50373"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8dddd838-dabc-4192-8aba-da185088ea18.json", "hash": "b80129c34ecc8d4c4de0b308c4efd139"}}, "358": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/youtube8m.md", "hash": "3be155a67a7a6498fbd62621e759cbf4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f9491c07-0bd8-4d14-b486-087bb091307f.json", "hash": "3c898f47753191d581063879067d188c"}}, "359": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/README.md", "hash": "c45b0dd5da4482667795a0aa89c28a13"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b8f5865e-b61d-4b37-ad93-afbc4bc13b5c.json", "hash": "387c7c45bf8f737ba192a7276ec1dd70"}}, "360": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md", "hash": "9a183ffbeba2e5db714e60096835bb7f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/43c05170-0416-4c9f-b927-084bc4f984f2.json", "hash": "f4200cd508a587555c1a4e0b414f256d"}}, "361": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md", "hash": "4496acd7fda2463ab2e7bf9bdbc88f05"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8ed1f1ea-7e25-4cda-be10-917e93f1c559.json", "hash": "408377d8e23f04a5c1440d2d5622c3dd"}}, "362": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/bmn.md", "hash": "add03335c88db04b04c53d4e6e44bac7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ab0e18d8-71f3-4fda-9fe4-197ba22746fa.json", "hash": "cb95e34b25cea43fdd9394a1693300b2"}}, "363": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/yowo.md", "hash": "8bcf04e6ef4d96550f3f6b8c34017689"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/b87949ed-ea6c-4a52-a6a6-5f681a0797a0.json", "hash": "6d8e84425d56488a44a3ddf174443d20"}}, "364": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/multimodal/actbert.md", "hash": "5207631edda80df7df1e2c1685a73e2d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5a13773a-b59f-4c5a-98b2-946896833581.json", "hash": "405e9ddc3d3cdcc3a4bf85b041ab02c9"}}, "365": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/partition/transnetv2.md", "hash": "7fc8747c2b13a900d3443e2f7027d6a0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8ab7f923-4447-4b3e-b2be-9f313238cbfb.json", "hash": "11f43136b6dca5caeae9ddfd616a482e"}}, "366": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn.md", "hash": "17283e41568cd0f83c7ef76edfb189d6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/08106cdb-f567-4dfc-9b16-c78a14078db6.json", "hash": "c2a6de51d7962d8c45c99f615a875838"}}, "367": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn2s.md", "hash": "7c4b85dfa4ab4b644ec0416de4716d18"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/95a099dd-1734-422a-b059-32270567b137.json", "hash": "42319068b78557ad04b689ab12f6d4f8"}}, "368": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/attention_lstm.md", "hash": "a36805ddba79905aca63fe50a46f1bd0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6a1dd251-9bc5-473b-b8bc-e9eca8392b41.json", "hash": "5cdf1cb15a36e99bf27fc1241043c3be"}}, "369": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/ctrgcn.md", "hash": "54030bb3c3bc9f4cf419bc1f7ece6587"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b53b1bbf-1f62-413b-a59b-6c457320f2e3.json", "hash": "252f161a6b9a76d9c3b0522a16915be6"}}, "370": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/movinet.md", "hash": "4f46c8a13fc694b3fe4561acd5d9cdf0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/be9b7893-fa30-4e2c-afd6-c05e2ddfbc5c.json", "hash": "4ddc4199d4ca2e47b0e9e85144f5a3e2"}}, "371": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/posec3d.md", "hash": "dbb6557a4b3d763c7802da295e13711d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/45cb0c72-1c7b-45ae-96be-26a6cf633a8a.json", "hash": "b5369ded6e673291d8043b2cc6040ccb"}}, "372": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md", "hash": "b2a5759e5e2665cdd55198e72848200c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1ce4b7d2-cf40-4f79-94ce-2aa2e8d987c6.json", "hash": "4bf4b73aadba7d723fc95525f007407b"}}, "373": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md", "hash": "caaa087fa2c37c0f93103abbc114c8b7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f63ffb7f-05fc-4a28-97a0-f967428fef40.json", "hash": "dfa901ec4b2b52cf86235e4d081f11c5"}}, "374": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md", "hash": 
"49fc0d7fc8a98595a2154171d50c63dc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/63391ca2-e149-46d8-82eb-deeed9ffec78.json", "hash": "a1e978c553790fa6e0ac106fab774cc7"}}, "375": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/slowfast.md", "hash": "c4497aad4310a5963f8a79f3471d09be"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/11972a29-3bec-4821-8499-f55a5df7b0d4.json", "hash": "b03871fd78471d119fd2903c08bd74fa"}}, "376": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/stgcn.md", "hash": "39daa193665d4187d700980c596719bc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7ca67506-e8c9-4595-873a-57dc7c294081.json", "hash": "8bd4e068fd3f98dcabee1951c5e00350"}}, "377": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md", "hash": "3e314ec8afcf85535a12f0b8ab6a085e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8b576a44-8ebe-49cb-b188-25d9facca9a9.json", "hash": "2e8ccad007a65b7a683221a0ffb890c6"}}, "378": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md", "hash": "f071258951fc424b7ad6bb9981b231b3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b723ccf4-e1a5-4405-9960-ff6582d1c5be.json", "hash": "b03f1a896dd1954522f22ef1a0945e1e"}}, "379": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md", "hash": "2041ac88bdacdbc41c468897c5747f89"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a8ed17f3-26c4-40e5-975f-47afb18bc21b.json", "hash": "d247aaff1da2d73a042b8496a48031b3"}}, "380": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md", "hash": "71e029eaa6a48891effec6e9d1ac5bae"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/07eb3e9e-e741-437f-bd66-a0e0e9f673a5.json", "hash": "71133ecf5a84cb4dda9acc331c6a4dd3"}}, "381": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn_dali.md", "hash": "81f35bdd5305a9681be33362c036fd1f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/89984d3f-723d-4fe2-a003-7b328d68234f.json", "hash": "4d395d260e8c5d1fe8b1ce6006a45a37"}}, "382": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md", "hash": "8885b1c860d8c50229a03915514510ac"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c151342c-384a-4441-b8d2-e548e4ddbe63.json", "hash": "9938ec85915e2697c99354e1d724a444"}}, "383": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/asrf.md", "hash": "fc584e9966e756b7ea5aeb9a050edfa5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8bc6f049-6538-4bfd-aa7b-2aaaddb9979c.json", "hash": "370d5ab382fa21ed102a96e291b4cbb5"}}, "384": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/cfbi.md", "hash": "9dccfc625a2b58efc861aa00a4621829"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/75dbef95-cd79-46a1-83f1-5311534490bc.json", "hash": "7a48016111763afad98ec8fd08010929"}}, "385": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/mstcn.md", "hash": "7d1b95c40783292a30f2a9f370f6c146"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6b83d6d1-3f2f-49f7-88b0-8b6ffdd861c6.json", "hash": "29b0ee0c43584e690451a4b2d8449164"}}, "386": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Action Recognition Datasets", "hash": "e3825400b7f8082c16376e5f1692c88e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4613d6ed-c878-4172-a444-515ae063a628.json", "hash": "fefb9d5d096eaa57cdea0627aaa41024"}}, "387": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Action Recognition Papers", "hash": "2d9e8d975a148a372172bacf5e939258"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b5f9fd57-a23f-449a-a23d-75e461cb6d96.json", "hash": "eeedfbe5a45125770fc71dfe4599407f"}}, "388": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Spatio-Temporal Action Detection Papers", "hash": "61f464cd5c41024323136b135550bd94"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/28804844-7817-4113-9692-e83d3de8eb8e.json", "hash": "f991966f3fa85368ed4dba9000e5e85a"}}, "389": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md", "hash": "94f541fe6799adee6821414d88d3ea6a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c179667d-88cb-428a-9539-25b36f5743ec.json", "hash": "94b5a50aa0d2532e8bb110c0128bf1f5"}}, "390": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Temporal Action Detection Papers", "hash": "691a19419526c7c23f77d685eaa7d053"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f0663793-8f98-4731-9883-073b01aa42c6.json", "hash": "9f35244b2f3d964949e410e37b1eed00"}}, "391": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/accelerate.md", "hash": "51bb2bddd0912d2c41a8bbef0b6bf7cf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4a0caae3-5a32-4b12-8eaf-ea69856a0f76.json", "hash": "23501357a8e4a23730ea521babc47993"}}, "392": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/config.md", "hash": "c0cb1a388f74c8a7834a0f3397b0b637"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/af8e2a48-0b54-431a-92d4-49dbc536be17.json", "hash": "f62ea2bfb520b3a0178e2147c0d19af4"}}, "393": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/customized_usage.md", "hash": "0c808587c7d88f88afec32c5e0e4d2f3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/06f299fd-a77a-4245-8915-86813f4c644b.json", "hash": "3b5f45f45403534a22bccb61c3f4a526"}}, "394": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/demos", "hash": "38ae33a1e6b257c0111f335a4cb8c288"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/97ee9329-376e-4b29-bc36-82c6df1c716a.json", "hash": "35fea1e79f0e7979a42faffb0afd281d"}}, "395": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/deployment.md", "hash": "4d4c654c485cf2b4edcdab29b8ad09e3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9ed623f3-e39a-4c0a-b74f-f97a22c24d1a.json", "hash": "e8f62cf56d8dea26f3d34e83bc257e5a"}}, "396": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/modular_design.md", "hash": "0de76da88ae49a429e1b68eb7092d77d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b694d4e3-92ea-4eb1-85b6-86ac64c4464b.json", "hash": "4ce923e4fdb4c21b8fab9c8c084b7a3d"}}, "397": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/pp-tsm.md", "hash": "de84164aa8c3379c4faf79c9121cff65"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/548fa608-fac1-4cf7-8d94-aa8296587051.json", "hash": "de132d478b802463f379b7ad2cfccbe3"}}, "398": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md", "hash": "1d65e4c49d409f7f4c253a6445c0387e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/14314a35-bbab-42ff-9a94-5625a4383107.json", "hash": "7a98b606f03058098359e1509e88d180"}}, "399": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/__init__.py", "hash": "1ac8ce1c3300ce7b6867a4633bafc76e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7759f95e-9948-4737-aaf5-eba6ca9b081c.json", "hash": "17e5251e520c6c76ebbea299d9fa8c81"}}, "400": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/builder.py", "hash": "383445c217d5617fb2d87819239e360b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d60d1fcc-23d3-4128-930f-5277c3778174.json", "hash": "05888757484d5e9a19b2e331a1918370"}}, "401": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py", "hash": "343631dce293137a670d7bdac6a0ed26"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4a6e8677-5817-4bc6-b864-c058c6d70d4c.json", "hash": "780825183f1391e1fd90378d741a642b"}}, "402": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/registry.py", "hash": "c30cb91997f920d11b78aac43c387b18"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9c22d291-cc3d-4f0e-8dc5-8896129cbc20.json", "hash": "3e40c66eef7c5fc14b10ba98e6fbb894"}}, "403": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI.py", "hash": "7c189333e7c4b4f4fbb2b11272cc6ee7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6462bc26-102a-44ca-91a8-18cb0728b1b2.json", "hash": "4674a7f02d57794b5659407022b063ee"}}, "404": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py", "hash": "de623d86451459753f4b0f7cef2e623c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a8948d8c-051c-49b9-a87d-93f20686de6e.json", "hash": "073837e65ebac5933c26d015ea655151"}}, "405": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/__init__.py", "hash": "c11aa732eb0ca77da3e7d4766aaba228"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e4afe476-f448-4c8f-b18d-130062cc65aa.json", "hash": "5c862e0c55bd387a4d443e4755c02392"}}, "406": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/actbert_dataset.py", "hash": "03dd4b23cfa05c3da71d0ea222e03299"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7ca86e1e-5661-4976-bb34-3680e0ab54f3.json", "hash": "8677a50e0ea9f7a33ef17dfd79f58495"}}, "407": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/asrf_dataset.py", "hash": 
"41fc072447756f9efc42e546cc153536"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c195ce2b-9608-498c-b992-61b33f2343e1.json", "hash": "1074f3594f66ce7807dad0a8346e5fe7"}}, "408": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py", "hash": "a88d3ba47b258509b8036e1d0b70ec5f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bff320f1-95c0-46f2-8cc0-af254f2c92ff.json", "hash": "ed904540490055d1d747ef07e6a9659f"}}, "409": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/base.py", "hash": "d9772c70569116f9b26a84a7143077b2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b04f83e0-4d87-450f-8743-ca5dc71128b5.json", "hash": "cae39bd458559a43d113f14106b41981"}}, "410": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/bmn_dataset.py", "hash": "c5a3426375201c0ab8133a83b5ac9fb0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ccdce669-2f90-4127-9495-93e9ef0f9b0e.json", "hash": "10a2f57753476a70ff5402d87945a984"}}, "411": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py", "hash": "16efce61769497a46d90925df0bb77ba"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6de2121f-318f-4bea-a0d9-5d3a9b77d550.json", "hash": "b4b3b64f9daed4f7f1ef37cddf78ceac"}}, "412": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/feature.py", "hash": "ac37dca4a79f3843b3f74091ab03a48d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5cf2faea-1634-403f-86a4-7941b7b7f934.json", "hash": "fb0def000a1ff7eb4d7b29c840390dc7"}}, "413": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py", "hash": "18a99308d1bab235aff5583136e6c614"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6d049a5b-64d9-43c3-9132-f86a22f43e35.json", "hash": "d72657e17a405449e624e4a746c763f1"}}, "414": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py", "hash": "c3b4720f141a2693f82009e87d900722"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/628ec3ce-6382-4f0b-87a9-72ac9a842cb3.json", "hash": "923277ef905cf8efd3344ffde52f11c4"}}, "415": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py", "hash": "6eb0603df6d2aab2b0e286cd0dce0615"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4e27e6ff-cf4b-435a-ac68-53362cde9b5d.json", "hash": "8ec932baaec4c0f922ceff5a8acbd901"}}, "416": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/oxford.py", "hash": "510fe311f20c24a6fac9d9e223e3a266"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/618b1404-93d6-41b8-9866-5e268b6b4b59.json", "hash": "21ccc5f9fff3eaf656e406ef4b75b0fb"}}, "417": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/skeleton.py", "hash": "0cefa709936cc208448278b9846298db"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/69888b77-5f29-424b-a6fd-2c258d265b45.json", "hash": "c02edcf4a80ace1a9a5ea15389f57ae7"}}, "418": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/slowfast_video.py", "hash": "f6e851a930389607cc4d0da1a56b1e27"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/77f0b9a0-1a2b-4ea1-924e-36bac16c4347.json", "hash": "35ea9e3d7fcd40309ede7e20078a251b"}}, "419": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf101_skeleton.py", "hash": "5902de739929fc7a6ab11d2fe1f635fa"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/36a13f58-7b2d-47c3-8431-d877fccb0deb.json", "hash": "c894a5aeeb47b81efaf00b3f6dcbf484"}}, "420": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf24_dataset.py", "hash": "948f24b03c09108067ae40bfd8524af3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5c0275bb-fc51-4c0c-8d57-15e6c45e913c.json", "hash": "a8f5e518294e5ea370933de5ba815c27"}}, "421": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/video.py", "hash": "a6dd95701abb126f7747a61d43f4bc62"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e124873b-e846-4561-baea-8915df68a541.json", "hash": "facca01b80640af7d14033df2b0b4714"}}, "422": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/__init__.py", "hash": "3c526806a92294e65d24ffbb847c8aef"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7d0bc801-4d49-45e1-a566-6e18b8bca7e0.json", "hash": "60eebdcf7cdb121cac507e82ac8897f4"}}, "423": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py", "hash": "484669a48dce10e403297dc51e7b0075"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/63b14d4a-9e6b-4edc-a580-fc8ac0c6255b.json", "hash": "6442e96f3cd8514051280e392de6a169"}}, "424": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py", "hash": "82f641a5a1cd27c306deb231e2788e5b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d191fe21-0341-4511-a025-55caf07e6fb1.json", "hash": "e4d0630bf2587f8d274d97e9d512e53a"}}, "425": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py", "hash": "c709bbc5c80e31674d9f3ceca9e0e83c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/23c777e0-a6ea-4c5b-90a7-b744b8660384.json", "hash": "aea3b1cff281f4d0a48d454dda0db713"}}, "426": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py", "hash": "10093acc167b56b9a8dc353b6abb691e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bb921512-02e5-4d0d-ad4c-d6d72943394d.json", "hash": "621698fe9d90b40919ea8e23d78814d4"}}, "427": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py", "hash": "d2420b4fdc6677bb384eae7359acd926"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ee4e7b22-14bb-46d9-ac33-7bc46cdb0c38.json", "hash": "4744b3ab71ea9c84bd5a300744c06fec"}}, "428": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler.py", "hash": "ec5366cb71e4dcb710e69074b2056d32"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ffd105f7-d1d3-4e7a-9576-c6cc2185a1ed.json", "hash": "33f652dd2f9375e32606fbfff0b1b18f"}}, "429": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py", "hash": "c67a37b8130c519b8f6e4fdddaf454ab"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/bb325240-2947-45dd-a561-15c2b11d70d3.json", "hash": "4d6b6a5d61224a4deb2aa457f205d3c8"}}, "430": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/mix.py", "hash": "7bb560d18a74e19d3d6850c37da59680"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1e54cef0-a235-46a7-ab45-c9ac47e4b976.json", "hash": "abf1e06a732a9b8f7a49dff56acb5109"}}, "431": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py", "hash": "8f5d2205bb2aefef0f48fc4c2168160f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0a8c871b-e0b6-4ae0-af77-f9cac3ab432a.json", "hash": "1a5945a0842fd7430b628c0cc893386f"}}, "432": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py", "hash": "dc17e7f21dda5e92206ade6974ddf67c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2cc8b836-a7ef-4eab-aabe-28f3ee4f2da3.json", "hash": "140cc84bad36a61a04e84c0b7b663c06"}}, "433": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py", "hash": "d34a104c3425a1c70474f012cb32f439"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9096484c-68d0-4d1c-bb82-be7b63d078f7.json", "hash": "de78edb122a01caefec527c43ded6c74"}}, "434": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ucf24.py", "hash": "68ba6760cb0b9954f1af0de2a62b956d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2789548c-6052-454a-b1c9-a17b3c27bb6c.json", "hash": "55aebfeeea453d5fc99135e9617eb1b1"}}, "435": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation.py", "hash": "86df8a714e63c8ac711c38f2324ee468"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/db437a82-93f4-4ebd-bc38-4234e0ea8457.json", "hash": "77a1bb063c776f46d6ef1c456b389695"}}, "436": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation_pipline.py", "hash": "307f93025dd6b816b7954e72304e845c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/417ea4e8-07a5-41cb-9904-3076ff6620e1.json", "hash": "c92d838347bd3de99212cde4604fc8bd"}}, "437": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py", "hash": "90fc57b4dd1eb8e15856ad6fb776fac1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/65bcaa19-d2b8-46f9-83f1-c9ff07f19e09.json", "hash": "68696d4de51708876204a32178c26bb0"}}, "438": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/__init__.py", "hash": "c8d687faac42dbd291ba7726f84e33a8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f94edaed-272f-46c5-92f6-29e73f7a0e9a.json", "hash": "978a75be0c757a0cd6248a2a7264f330"}}, "439": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_metric.py", "hash": "fe9b2ac857b31e1e4497b8e8694bdb8b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5c3df183-8b2a-4db7-af6f-5c152335213a.json", "hash": "77e3a7bb7e604217c1b0cafcd0450759"}}, "440": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py", "hash": "99ac91fcd1644edc574b53021305db8d"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/31dfc0aa-3043-4679-8ce5-2bf63a00a92f.json", "hash": "c524f031ae0d68029d2178573b02c6cf"}}, "441": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/base.py", "hash": "74e04196b8d86568c0dd17478c14d518"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/858c078f-36a0-4e77-911f-9b31ee8cf7ee.json", "hash": "ab580a65f9de108abd134e99211fa41f"}}, "442": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py", "hash": "f7dd37ebfcf14c5476c9e958ffb65b49"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4d06c7ec-ce63-4739-aff3-681d841badf7.json", "hash": "43eb2c6ce572a9528a238a18d6a249fc"}}, "443": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric.py", "hash": "9c0df58948b8ced521de78bb9fbeaa30"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b78583dd-b911-4165-bd81-f7a00b2893c8.json", "hash": "638718ad86dadf48e9f1d0d311865c79"}}, "444": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric_MRI.py", "hash": "1a77c232a4083cdc0ee6a6ac25dee150"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/47abd731-e6ef-4b69-9c7a-4f6bb0d86c93.json", "hash": "3fb39e3da1d08530d4b615fc50d1e569"}}, "445": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/depth_metric.py", "hash": "bbc0f4c161f859e6796887d0482e1b0a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5f429bc7-5b87-405c-8393-ba3eef352764.json", "hash": "a9a2f8abfc6a35456e739816c71eb97e"}}, "446": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/msrvtt_metric.py", "hash": "4c32690dba7ccb4d2d1e63cefa6cd329"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5319de35-5759-4c56-a6a3-0b026f570722.json", "hash": "afae38c929661d8331ae49ef4c91fddb"}}, "447": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/multi_crop_metric.py", "hash": "76cdce6219a18efe392fe2b071e506b6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/35c844fa-68b5-4457-abfb-24c68e2bdf80.json", "hash": "5f31ff2678a3d917573e1eb9beb20d9e"}}, "448": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/recall.py", "hash": "907e741b8f036531e71b97f3eb1d1996"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2a982ad5-ef2a-4e9a-baf5-63578a23e618.json", "hash": "c3bbf12387b0eb5286f48638cdbd1c12"}}, "449": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py", "hash": "08e3aa07191f7458b71719cc90cf7614"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5c655b59-6f3c-4a98-974a-5487f8e4108a.json", "hash": "5308d0e594cf0be8532db2bbc90f60f6"}}, "450": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/skeleton_metric.py", "hash": "d7bbbed7a7e3bf49de1997317f4726ee"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8672a5da-5798-4c08-8e08-f15f4978ddcc.json", "hash": "44daaf7859a3ff4900fc1d19890996dd"}}, "451": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/transnetv2_metric.py", "hash": "4d33253537c98461533e113a59afbd15"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/34596d26-8de0-4e85-9571-2b797bdf81bf.json", "hash": 
"1220f2bd8cbef564c51b91d4ea78b5ae"}}, "452": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py", "hash": "4e22f749108b0b40a65d392c95a86c0b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9c9b671b-0b87-4c96-a0cc-f02fb04bbdcc.json", "hash": "15803b4be49d05447f183fc025f5c2ec"}}, "453": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py", "hash": "3a00cd9a5166d5b141606eae4b345c02"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9d891651-5fcc-4ad2-b8a8-193ba882fcb7.json", "hash": "7ebd3f497570c7abb7e132a11578119c"}}, "454": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/yowo_metric.py", "hash": "40863e3ea7d2ee676ec72e93090af36c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/94337949-c250-4aa2-8a25-eba76cf708c9.json", "hash": "34522655da1cbb4887c35307e63df692"}}, "455": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/__init__.py", "hash": "0f48a2dc3933f8e506fb5b8885756655"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/020ce968-d897-4a26-8839-6a0db38089c1.json", "hash": "cc1c512abbd59a761a5d8fd2de672147"}}, "456": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py", "hash": "f9d59b55380790bd4e17c4900333c9df"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/437b6fc8-ae66-4984-9f7e-ab5d28eb045c.json", "hash": "85371ad80366906c376bf850754be580"}}, "457": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/README.md", "hash": "f18be0c7017db89f3c4e1bc76b26a185"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ad0de3a8-4889-410f-a868-c65bc2f74d46.json", "hash": "ee3c74024a2c98c396b428c7a6c4c185"}}, "458": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py", "hash": "df3e08c95609a32bb9ee690eac480104"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/89fc3cf0-d88c-4d92-99d8-9d47400bcb20.json", "hash": "6e27cbd804c667e463f2de450acbf25c"}}, "459": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py", "hash": "99a5d84fde5d455c173ead9d8a09a797"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/462dc8cb-b385-4035-b04c-9d58c98e7e51.json", "hash": "bf57a035a372869a43788e6233b49754"}}, "460": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py", "hash": "d12452097dd3d67cfdee184185e6770b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b542398e-9d06-4161-b8bd-fc07ee5e983d.json", "hash": "2f3f4e521f9cc70b888462afc5d1f84e"}}, "461": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py", "hash": "c20e82fde113e4c06c9005527cb646e3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fda7878c-b567-4c38-aaa8-71eeca454894.json", "hash": "9f289f210162e41c140b54684b256901"}}, "462": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py", "hash": "aa206d254fd987cecd497de2eadb3446"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9b695b9a-1208-48a8-a906-7c88c21cbc10.json", "hash": 
"30f42ffca4bf8379d2475d486a538a55"}}, "463": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py", "hash": "00b5b51997a96bd9d785fdf5ac5a18a4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a5dedd80-9664-46bf-9695-2c6fd91d4193.json", "hash": "3a2e470212d6920a4c9d4a7bc4036fe0"}}, "464": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py", "hash": "8b0d8de29e18248f35cfca9bdec5b708"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/15ebb946-02e0-453d-a13e-f3597abefd86.json", "hash": "6f73048799511167e10133b9147c7693"}}, "465": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py", "hash": "0ecafa0d7544f5df6c8844b7cba6088f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d10cc70a-a304-4cad-9ce9-9d6361dcdd2c.json", "hash": "1db45eb367342dc7657579f5e87821d0"}}, "466": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py", "hash": "90f227fe379199699912cd82daac32fe"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5014415b-96eb-4188-85fe-6c50f490f696.json", "hash": "21c0871122092be71b30ab3bb10cbc4c"}}, "467": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/__init__.py", "hash": "0bd6e31ae9eda78b1cc2b62033ec35ba"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/89a0021d-5489-4dbc-8ebc-6befe7e119dd.json", "hash": "0376bb9d8ef56843954bba83c14e1487"}}, "468": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py", "hash": "08e83568093ea44514027305132eab72"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a1806a3b-995a-43b5-8dff-6baa6c3fb101.json", "hash": "dd7c073444011037c31f2cb6e87b227f"}}, "469": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/builder.py", "hash": "7dc3392bb1b7862f0dfcf07cc13f4920"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6b644553-9c32-4362-acb9-3a20852badfc.json", "hash": "c281f3c36e1163faebda1b77f4d534a9"}}, "470": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/registry.py", "hash": "7013e08891f44cbbdd99320306368aaf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c1634b2d-7ec7-4f71-9305-985386018d58.json", "hash": "76f79ef233ded0f120d85bf92bb6285b"}}, "471": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/weight_init.py", "hash": "4aed95d2bcc69135959a8c6dde16175d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e881da45-26b7-4628-a8c3-aaa333d4076c.json", "hash": "dc1454c133250e949a0b195eb3c1afb6"}}, "472": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/__init__.py", "hash": "4aefff3fa4a1d6015df8728e727ca364"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5c03e93c-1a74-4df0-8602-f5ce3df30ca0.json", "hash": "632fda75b819071a611a0d42c52e6492"}}, "473": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py", "hash": "9f7541650a12a8853504cf78e4ba4c13"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/dff18248-7c27-421b-ad0f-4520190aaf15.json", "hash": 
"c4bf1e45e0e1c4c6d71f72854eac939a"}}, "474": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/__init__.py", "hash": "ee59dae6aca38cd8821e55f3987378f1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/46b06b32-b631-43dd-9c96-079811df2c97.json", "hash": "6b7704d876aef30c45ea268c3d13be08"}}, "475": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py", "hash": "c59e805366227db5ca2b288e31c3fe72"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1a584934-3d9e-422a-a9a6-81d39d0bccc0.json", "hash": "64fb25decd6bf9f95b261f6f9cbc31af"}}, "476": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py", "hash": "89531a68f8eacc03f94e8dcd68510be8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f9e68dc7-a04f-4c73-b284-687a44ba1c23.json", "hash": "0e37749d6dff55e0c8dc565e31106ec2"}}, "477": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn.py", "hash": "f5796fb49ecf1e6b5c06cac68f0fdc51"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ec1b8f6c-47a8-4ec9-b8df-9b4258268a4e.json", "hash": "b65311bffade469d721f8624eeeddb62"}}, "478": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py", "hash": "f5068e65f25b7f58cd2d9d287258cc84"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bbf3f8f6-d6b5-4fc0-8cfe-4890421b5c77.json", "hash": "caf927eb7fe2bddd5f890c4e2f81c8b3"}}, "479": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/asrf.py", "hash": "849fa1c197a87004b47f3ff6dd9c69c7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8bc86c23-1eb5-460c-92d4-d58ace8bfbc3.json", "hash": "62682dc4a73ef46fa9fac3bd52a6f256"}}, "480": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py", "hash": "0c90bc28f624029a33d466e4cd806195"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d5186a9d-dd54-42b0-9435-930938996858.json", "hash": "19e993f98354c8d3a62ad9e03d68b4f3"}}, "481": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/cfbi.py", "hash": "27052d7bec7a9e2328a8c5be1306a4f2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d15fdcdd-2e5b-43be-9fe9-712b6f255d6a.json", "hash": "16a0c1f5ac14daa6f488815a75aa8f8c"}}, "482": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py", "hash": "fdb1b339ac782727072e9a04aaa92f93"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/164872e3-44dd-4f30-91e5-cb0d20319c74.json", "hash": "7c5f4b7febe997c344991adef0f31d60"}}, "483": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py", "hash": "56760538d816fa9d0f2e31f33005332a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ba74ccbf-caf5-4b98-8d47-579014659578.json", "hash": "db56a601e15e8fc0bf21e3ea11c3faff"}}, "484": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py", "hash": "2d3847bd50661b43ffa3a8ee031dd092"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/67f9dbca-b1f4-4ef2-93a4-4af3f80cadfa.json", "hash": "ef1b9d5771e91613731272780db1b765"}}, "485": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py", "hash": "89598f5dc2136cc2077594462bb4d1a0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7669eb12-1b11-4ab0-bde0-dcc25e0cbfd4.json", "hash": "0e503548a4515c7c55d654e89a4a5c65"}}, "486": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ms_tcn.py", "hash": "73c68be11fc5d14301c9485d35a57b4e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/57e1d0db-97fa-4149-8697-27535f8ae236.json", "hash": "39b66f742eb45f26968a6634636be9b1"}}, "487": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py", "hash": "299bd2163efa2890e3dae3178ab4b05e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/425817af-113d-48b2-881e-e1becef9b76b.json", "hash": "ba1c71d28373a0427013ef4406f13804"}}, "488": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py", "hash": "23f649273236ebe4d648500e218505bd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/62d092fb-8910-4f90-ad82-f1ef3acdcaa3.json", "hash": "f03cd182cf31da2301ea8f7b35fd8808"}}, "489": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py", "hash": "4873b4690b3a313fae7fd231bd6b3df4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/036e6775-6c6e-4d70-b4be-f225cf60a4ff.json", "hash": "cf422554f2e1f0e546dab6d4cd429e6b"}}, "490": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py", "hash": "1fdc031bf0cbb455ea9804c846476e88"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a369eff4-1cfe-467f-9b5d-d71b4ab8a4f1.json", "hash": "acd1cdaab50c306efe6cf0fadcab0d32"}}, "491": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py", "hash": "4b369d4fb977282b26e69b871992baee"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d9841904-8c07-4146-a303-83733a517a10.json", "hash": "2bd84b30474eab1a21c5ba4ab460396f"}}, "492": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py", "hash": "b973ae9d8d5de6afddebb6199b80db9a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c7eaf2b9-2f77-4f59-abd9-7a2cba8fbd07.json", "hash": "93185ba343a01125de60e21a4f47c6a9"}}, "493": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py", "hash": "56bc812e7ccb8cc4daafb8928e6426ca"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1ef7a7d7-f557-4529-95b5-1a41b11cb022.json", "hash": "7c2b84ca259b427d35593cd6252d6492"}}, "494": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py", "hash": "bd9d74b4683a3577e47abec0d0891ffa"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/69686180-705e-42c2-bf0c-84e8e04bb6bb.json", "hash": "e956a521a96de3d36deb98ea7a620ba1"}}, "495": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py", "hash": "f26b7d9091ead644fc2c310f47ab8765"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/697c842e-2194-4f41-9c5e-8c5b11a49a12.json", "hash": "1b735586209f4278db920ee1b26a80a8"}}, "496": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py", "hash": "76c0e4217c50d560c6652b2f9da26f62"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9fee75a3-1e39-420a-a64f-f40e1abb9157.json", "hash": "5adf988ba5f24ebed1e32697bd80120c"}}, "497": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py", "hash": "98bc5a333101f9209acf366ea1bc2a30"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/930c71be-937d-41c7-b08e-fa16358f3a52.json", "hash": "42cbe6709c1ff4e3cd83dbad5d2c6ad9"}}, "498": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", "hash": "4fc154a563e341d9242897e1e94976aa"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cd86fd79-9925-4ba8-b715-582e518aa42e.json", "hash": "46bf60a870ad63381e6393afb5be39d1"}}, "499": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py", "hash": "f17ea09dff424d91acabec1147e053ed"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bc1deee6-4bc8-430d-bb7d-3f7f654f0040.json", "hash": "6a6ddb03b815affd54633034866104be"}}, "500": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py", "hash": "9fa30f2c303cdfb5ab54c4af0cd75bb0"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/25008972-aec6-44bf-b83e-21babe2c3e0c.json", "hash": "656b6e8e29d6b2c10a8c9bc12d8d9b5b"}}, "501": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py", "hash": "5c9466d02755ce43176c1f2ae0f0e9d8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/10f9f848-c5bd-4ec0-9f2f-6c8f72cc311c.json", "hash": "6a3c2f003b02254e72845050d1cb960d"}}, "502": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py", "hash": "4d09f4d33086b96393b5767b9e0f44c7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/08612506-c012-4cf5-bdad-9241be0885d5.json", "hash": "d46dc0c14d5f3bce8ca85812e2e87f91"}}, "503": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py", "hash": "3d42c00c4238076a4c57bf892299f931"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f310e506-75d8-42f7-b153-4159889d74ab.json", "hash": "956b28a5e63b4de313549673ed443748"}}, "504": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py", "hash": "9e19629b1697294b7c0c6b054f51660b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2a3ee5e8-0ed0-470e-8093-d3d61b489df6.json", "hash": "d8c8e754ac4ab1e7680ccc046ec5701a"}}, "505": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py", "hash": "75452e7c1129500664263a2a39703a38"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/21fc55b2-e95e-4aec-8338-e3a65b639b4a.json", "hash": "0c78db69ec062080a79297184b4d272a"}}, "506": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py", "hash": "eaa72eba6cda08df7f1257b4b1c70eb1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/630fe9e7-2ea2-4aa7-98fb-3fac2e58863d.json", "hash": "1aafdcc9623440d734cef26d39dc887f"}}, "507": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/yowo.py", "hash": "e7ab6a742aeb91edb84128181aa4d007"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cb151237-0521-4dc7-bd9a-f0e86568e947.json", "hash": "0dd00bc48aeb37e65aae3cae9c3c8360"}}, "508": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/__init__.py", "hash": "26b67a637f95811a095803c90070c98e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b59019a2-7952-46da-80c1-45bf72d06907.json", "hash": "30f8936d3c5063b02c22ba5aae760627"}}, "509": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/__init__.py", "hash": "b9d0589335402cf2ca4fd0beea0f7244"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/dc0c9c03-7070-47aa-8803-6519cf11b045.json", "hash": "37defb8b18b1f15310aee734a68fd26c"}}, "510": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/base.py", "hash": "d8fc360f699d85d96cdc2a1598b021bf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/84255506-9b77-41c0-9f9a-be657b23b846.json", "hash": "63a81110e18e2461d6e5d778a3fd527e"}}, "511": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/fast_rcnn.py", "hash": "42d2dfb91c4a0f2627726f317cb26dc7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/23cc0f4d-18b7-4a32-a685-092f8cd2c6b8.json", "hash": "2212215cb5683eae6804f33bbe6e04f0"}}, "512": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py", "hash": "d22e5dafbb72bd78429de5a092ad7f42"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7c2a72a9-c669-4627-9673-d8f974b475f5.json", "hash": "5701875e5d50f2f9299e7dd2fe74a912"}}, "513": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/__init__.py", "hash": "2bcf4e274efcbd8d4643608aa1aefe9a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/af9be0dc-2f2c-4e4e-9b38-9456a5780c3f.json", "hash": "d88a7d506923bad5dbe982d0ee6de10a"}}, "514": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/base.py", "hash": "fe3b51108aac230ae5dbfa1bd29b8909"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ff13c874-3963-4a5b-8ad0-9c4109589c9f.json", "hash": "dc6c1a8ce692cfa55fd0519cc869b8fb"}}, "515": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/depth_estimator.py", "hash": "47ed82e1f96d1104c784d51bfda05980"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/455ffff8-f746-4f1a-9660-e6765fa42cf5.json", "hash": "8e48a911c04452d378018760628bb753"}}, "516": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/__init__.py", "hash": "b768338e742ddda786c8603bd56061ae"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b637999b-258b-4c27-8b5b-e39d939d9727.json", "hash": "56042219e58d036c0a5dd28e1e10f125"}}, "517": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/base.py", "hash": "ede6722f5957b21458d29af5e4bb6363"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a664fb08-433c-49c1-9290-2307b6dbef2e.json", "hash": 
"b2f47c9010867c9f0825ff43e4a74d12"}}, "518": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/bmn_localizer.py", "hash": "4610dfa96858042971451be5b710dfef"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/24375566-77e4-4025-9c4f-d6ce173bd1c8.json", "hash": "065e165f1104a80293f9e87ec92bb239"}}, "519": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py", "hash": "615e05eaf33191dd5f9f1ef01e43c61a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b74049ae-1829-471d-9215-cc98adbb882d.json", "hash": "8e89e5c02b7b15a333108a9cbde13b25"}}, "520": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py", "hash": "57de74a2ee5ca1c14834324d7f1e2df8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d5630c58-45b2-41fc-b189-8cc4a1c58f4f.json", "hash": "9253d6e3d9b0f4602921bf859cf6e225"}}, "521": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/__init__.py", "hash": "70282428eea8bcd2f3f93b250d76f9f8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/dec208dc-4d3b-4719-9553-f1d29184b1b2.json", "hash": "45a96ce36ba1a57212d2d6f4ab6b43d0"}}, "522": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/actbert.py", "hash": "df1130cd8b9ca26f953d4d888e56440d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8d8ad343-569e-45a7-a33f-92251d72be90.json", "hash": "e807121ff08e9c3f3b83e2ff6e5a155c"}}, "523": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/base.py", "hash": "0b86b5dad447b8d39bbe3a0b9406a741"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/54c8f988-1d1e-473c-8ad0-7b86dabb0428.json", "hash": "1db0accf1e71c47d9ace532789aa4f07"}}, "524": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/__init__.py", "hash": "736934ccecd72142cacfd0327d1cff1a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8d2bff4b-1fa7-4560-bc92-160a274fd5a9.json", "hash": "a25fae80b234d62a07b2fbc13a3af7fd"}}, "525": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/base.py", "hash": "908cc08c515f27faed4239b5122779f4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/632b319c-6ccf-4e84-9185-616f39959d82.json", "hash": "a40308520932289775d6fc413da87fb6"}}, "526": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py", "hash": "9c112b2b2e0c5aa092fad035892e7491"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9c6566fc-5944-4999-8a72-8663167f668b.json", "hash": "46db90b86beff9d7f022739b3d913256"}}, "527": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/__init__.py", "hash": "0bf435bf54be33c6e58cdd9565a30a27"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/865d27dc-fad6-4333-b206-a70e9019bd81.json", "hash": "aa7193353b371b80df1230764355809b"}}, "528": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/base.py", "hash": "281318402b61d96dd3f250fa6c7b6a31"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/c9b6c38b-cb97-4d36-a421-1e806dc14d11.json", "hash": "8049c1e9331ce75dffbd01c7a5f8a968"}}, "529": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py", "hash": "9814b8fc599b1dd323e9cf35c1aa8a22"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e18d8052-04a5-4da4-9819-5fc4c1383835.json", "hash": "7bf06545fbb8f0ca0fa8c1b083e372b2"}}, "530": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer2d.py", "hash": "63aba7f7fd6864ebd2dd5f02bad457e5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fa8044e1-d121-43f2-ace0-5c4af43dbfdf.json", "hash": "5651106645d8c4f2bd40032f4e1d1dd2"}}, "531": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3d.py", "hash": "83eda11618479c9e0df116e992fe2b9b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f0bc3308-bc79-4065-ad74-396d26f0a5ed.json", "hash": "8d8d9dcee647093c59293b3dac185069"}}, "532": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py", "hash": "bfa9e9010ab7e943fb82c5b5cb147109"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d4d32b73-4eb8-4bfd-9b8b-cfea23010106.json", "hash": "8a124e0913ed3acbf46680283be5a145"}}, "533": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py", "hash": "612d5aad4282d28452b1a9efc9072e17"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6e9bbbd2-a926-421a-bbb0-ba8383f3362a.json", "hash": "bf597b4555371c1a0da81c0a07c3926e"}}, "534": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerMRI.py", "hash": "b97510238ba20b22ff4b44e7a5c6f54b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6fa4e1bb-e7ce-485a-b81a-1905868b1ed0.json", "hash": "432ca94a92e7491449396f067afcba24"}}, "535": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py", "hash": "8cd7b502c79e1bccfd6e18893fbcfb57"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b9e6e2c3-2cdd-4666-a7f4-8180d78457cb.json", "hash": "84ef0344e0a4250b9337c290afb08759"}}, "536": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py", "hash": "ad2c72e3f7b6fd77158a73d82ab00618"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0b0d02e7-2362-44bd-bc2e-463de08cb7d5.json", "hash": "cce7ee9829b0ee91d3fd18abc8084438"}}, "537": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py", "hash": "22debcf0b30af070617e3397a9cb5048"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7aa3a469-2617-466a-aa43-c2a519e30fdd.json", "hash": "8d5bddc9e7a5600fbeb13875f0bfb031"}}, "538": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py", "hash": "7e6056f1567cdd15482e0574d4093ee4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/73bbcc1a-2f27-45c9-8e1a-c6aca413d5a7.json", "hash": "eef2f1013b3df647e933036195468a19"}}, "539": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/__init__.py", "hash": "722e382a113f0b35f5bc209482c7fe4a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/7935ded9-60ba-4abb-8e9d-09fca22a25d8.json", "hash": "d0d7525e1c5a621300e4ce3f3d680a50"}}, "540": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/base.py", "hash": "dd14fab5889c9d55cfbd15f3b0847657"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cd0367d0-1762-4da5-9fc5-3268989e55a1.json", "hash": "d2b48caaad309c8652852b25ed39bb9b"}}, "541": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py", "hash": "ab25886c167e22357ecb20561be1e7a4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d9632cda-1d99-44f9-bc5b-a3bddef2f276.json", "hash": "a8f9b6f3e6c2aa762fd4f7e02314d54f"}}, "542": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py", "hash": "f58237aac36933936f2a56cf65164867"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c0fee295-486e-411f-81b5-35d6374afee3.json", "hash": "1f2825ce44ca13e372382f4d2ff0265b"}}, "543": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/__init__.py", "hash": "64123ea6dfd9851c10456ad5084fea2a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a7c6c8d8-6afa-4f54-a92a-f368d815c37e.json", "hash": "2bd254ea98e9f7d214b76d9252972fe2"}}, "544": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py", "hash": "89a3d65b1e8ed4c59a05c4f6c1a84587"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ba1bf883-3c09-41d1-a3a7-fcc67613df5e.json", "hash": "bfee0c96d0772f47506e146b638cc7cd"}}, "545": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/base.py", "hash": "d5a8f23d3bdfad5031bac4e959c37f1b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2ef6c407-6499-4b33-b88c-692844b93c1e.json", "hash": "2b7805c56f4dfce9702f3092228f6177"}}, "546": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/ms_tcn.py", "hash": "eca81f2e5f0a8ced1ead14236cc716ff"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/892f214e-5286-4407-b58d-a45a0a31366a.json", "hash": "0e6b6811075b46a8e7fe785a30a7157d"}}, "547": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py", "hash": "e983fd456a784c27b9aa45e81dda88bd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/cea7c6ee-4308-4274-ace8-0fb1637d5d62.json", "hash": "c7cc9263a5653814d6b78a7527472d3c"}}, "548": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/__init__.py", "hash": "d3dac01f4e8da77118c0947f054efbbd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/73d53f98-0749-46af-9c68-554b9e10f342.json", "hash": "3c441040abe55aed619e640f54a32f2a"}}, "549": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/adds_head.py", "hash": "0c9579d65f9030af4e85d7b94fc37b87"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6f66441f-8644-4395-8219-f58f193d243f.json", "hash": "79ec838bf4881e60aea881f0143e9b77"}}, "550": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/agcn2s_head.py", "hash": "07b710744a33ddd054e13115a55f5b52"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ee479cee-44cb-4434-bee5-a847ff5495a3.json", "hash": "7d905a8ebad83f7ff51edddea1ab4d25"}}, "551": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py", "hash": "d27d552e5c70f38d2d87601e49f1845d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4cff8309-3d7b-4715-9d30-609fc3431144.json", "hash": "de9385618327df41da8ea69c23cba0e4"}}, "552": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py", "hash": "f24d71b8cdee4305d54ebde429ad1fd4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/311bc662-f84a-48a4-b3b0-9dd6651566f7.json", "hash": "d68f3a42e7bce90f99f9aaaa17ff808c"}}, "553": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py", "hash": "3a0aefc451d53430f31f919f5b893a8e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1a454a9c-ff43-4008-9b16-77e5a4df958d.json", "hash": "49d93266bf7545be3861762d9dae4441"}}, "554": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py", "hash": "71c79022e018a244ef9f8fe9deea4536"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/79783676-1913-47e3-83a8-1b2389777349.json", "hash": "ed322851ae67ab906efe211b0cae87a1"}}, "555": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py", "hash": "db82d463796911a63ea802dea860133e"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ea007cde-8602-4b41-ad03-a2e699e98c59.json", "hash": "34d3064748bc17229da72316feade259"}}, "556": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ctrgcn_head.py", "hash": "173f6f6d1bf48bce24eeaf67cacf9293"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5797c0a4-1a31-4b45-bb26-5a35b4cf2ce2.json", "hash": "5cf96e8fd6af43a75f14d78846b5dbb4"}}, "557": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/i3d_head.py", "hash": "47b27359cdbf1eaf2274ede62d541898"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fdb59e4b-8fde-4c7a-8847-ac96f52a9f6d.json", "hash": "a73d445c85e6412179d08a1d80ee36c9"}}, "558": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/movinet_head.py", "hash": "bdd48e64a3b30e36b0be9606c18c71f9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e3428502-31fd-4c16-ae7e-815bce30744d.json", "hash": "4c9d80da512615e6d1512c052faef9d3"}}, "559": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py", "hash": "de572808530015e3be183a77b5cede5b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c22bdd3d-0b55-49b5-906d-f139a46e5ffe.json", "hash": "eee839f212c8cf95c09a569c9d2bca71"}}, "560": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptimesformer_head.py", "hash": "dea0177f1407ef19d5ebbb07c76f3433"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9540c56a-4dbe-4ffb-abf9-efb046cf75de.json", "hash": "c3bb7b4a39ea48e479c1bcd744180638"}}, "561": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsm_head.py", "hash": "d8d4a6c94f0321764db20bba3bc6e79b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/64633a42-32ba-4b11-a641-59aadaf57e1e.json", "hash": "9f623a1adcceb431a39c1bda94e4a732"}}, "562": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsn_head.py", "hash": "ed0650401b3fef802fc3e4f8db83d811"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9184c97c-2073-483f-84da-9aff0d4e4d06.json", "hash": "131b3d43e84a31ab18ed9868c6c9d030"}}, "563": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_extractor.py", "hash": "53eda48c762e5903ce152ef1f3572b88"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8d127c8b-b6bb-428d-adf0-597f36b6649f.json", "hash": "da438a7cb63d048e9e97fbe06e781bcd"}}, "564": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py", "hash": "e3644a81312536ea8a9061267156d92c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ee9b6955-c8bc-401b-9325-c73bd62ce982.json", "hash": "3eb8f1483c2dd482d57942cc55b078eb"}}, "565": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/single_straight3d.py", "hash": "8702786f048e2174d68b0f70b079ce4c"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4f50e070-a940-4db2-83a2-23168083178f.json", "hash": "06b6e0f9773510b71ef899aeaff71ab6"}}, "566": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/slowfast_head.py", "hash": "e549345f9be07fdfcca2714f785cef75"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/2685801b-a741-4770-9e5c-857dc1bf3b84.json", "hash": "06985fb42050be87ea07180ee261f39e"}}, "567": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/stgcn_head.py", "hash": "1cbe9581f87912c3b6044832cd175ed3"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1025a9ea-3d28-4c37-9e52-1ca7ee001ee0.json", "hash": "5e96df705c679db2b01f8190bbb03e34"}}, "568": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/timesformer_head.py", "hash": "44341a905b9216c8bffedb20c134c7a2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a77fb5b6-3247-4e1f-8b9e-2d094f60b5a8.json", "hash": "1e88abda4f3014663e84fc29c93d45ae"}}, "569": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/token_shift_head.py", "hash": "5dc6d52e7d3d282d545d851faf44098b"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/74717c3a-94e8-48a6-82cc-b5cc16dc457b.json", "hash": "64e0a747dc7fd11abda28b38b32ace8f"}}, "570": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/transnetv2_head.py", "hash": "73a09474a153b579da2484052d4b8c33"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/328051fb-fd67-458d-8465-2d715eefefcb.json", "hash": "38602c3afa9743a4ce26ae9605744418"}}, "571": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsm_head.py", "hash": "8687ceb66af217f04c0f078d88150356"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4cb0d2f5-3f59-492f-8273-48b5fec17c86.json", "hash": "6693222c23321a99607c30a02fe74992"}}, "572": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsn_head.py", "hash": "f1a2d3c2777713997cb44d52d25e4ea4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/40dbbb97-f039-42fd-a668-0f9b89bc7745.json", "hash": "64dec748e3adc1a4956ee3126be43939"}}, "573": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/__init__.py", "hash": "ce9883252dcae8afa7a0e4e4caab63d7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/94a1a1ca-0f94-45cf-ad4f-6f8a8d7d67d2.json", "hash": "7e66a21f181d047dfc6ad2c4eee80ca7"}}, "574": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/actbert_loss.py", "hash": "951010ac91ca54fed9f19df1eeca5424"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/70cc0e1d-d3a5-4774-8f93-8fd07d6bfeea.json", "hash": "9dca300d7d03eb7aa95ba0d3eb10e6d9"}}, "575": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py", "hash": "cea21596d396cee6d39bbae23f367b67"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f7463aec-0911-4b21-8ae9-c3076545d82b.json", "hash": "3a51a880014c0ef7d2b42a80059a8dd8"}}, "576": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/base.py", "hash": "7e3aa2925576ba0ac25b10c1c9fd3e59"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/57d1a9e8-3e06-4b6c-be05-05a9659ff66b.json", "hash": "f442835305ab800de916924163f3a359"}}, "577": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py", "hash": "62c8e4b81ed049e7dbba14121d28d26f"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/44af1e9d-a252-4491-9e4b-218ec8498ceb.json", "hash": "f010b76a35717cd3f71f03e315261142"}}, "578": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/cross_entropy_loss.py", "hash": "ef27d6a2641f10d4742e7c0b26ff43a4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c17f64d9-ec50-4c57-8589-cf6f4c226113.json", "hash": "d995ddc88bb7b1a2f4b0b402f379f77e"}}, "579": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py", "hash": "23e8485ae572680862bae05e8690ec40"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/82fc4d1b-6775-44e6-ae19-cb50e584e271.json", "hash": "d735c02743eb6115c4976b2cd63a5b26"}}, "580": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/distillation_loss.py", "hash": "21dc48ed456928d066419672b90b8eb2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/db83d6c4-7d6c-4325-8507-33f19b81fd7b.json", "hash": "d5534609b498d6df24f45bff048af6b3"}}, "581": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/transnetv2_loss.py", "hash": "a17015fa1e7abb621102bc85838e3adf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/20a62983-2c0e-419c-bea1-c9a43141235c.json", "hash": "361f5f16aaec607c955e3fd391bf96d1"}}, "582": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py", "hash": "10b02da6f08a54bfdd5564c39171dc7a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e73084d8-adcc-4bfe-bfb7-f533358c0599.json", "hash": "7848c35028f7163a76a7916038818e8d"}}, "583": {"source": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/__init__.py", "hash": "defe76def1f7bd1476e6804af39e57d7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4b99f1e3-4868-4798-b9f8-fe5ed1e1cfed.json", "hash": "49891d953cb22abdf8c286fe5901bb3d"}}, "584": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/random_sampler.py", "hash": "60b22ce6205e5f3a75d7c8c29c9f2105"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/1068a225-d288-45b3-9649-5d785f56f7a9.json", "hash": "d288be2118c4548cc508fd02b09608c8"}}, "585": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/__init__.py", "hash": "4d0b088350703dfce3ffaefe979a42d1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/22d919b3-064e-43b2-8d4a-c40b0e1afd3b.json", "hash": "79f966936b19407112b3e94175814a97"}}, "586": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py", "hash": "c81fee6d8e1be4d7a010d10b0f6247cb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/bc14a46f-d682-47ed-9056-8668543c4710.json", "hash": "2c0f7ed782787c6844a5e0909f4f1ff9"}}, "587": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/lr.py", "hash": "50f240c0bb5478dcdae3b6916ce48ea8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/25b47995-09f5-4019-8d32-ecd9a15146e1.json", "hash": "6ac1965d250c395854d4003426cf3533"}}, "588": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/optimizer.py", "hash": "6e5a2bd63b5e55b82bc258c8632db9b2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5b85de12-586e-4b81-9578-d4b6c7687199.json", "hash": "fe88b7d99c73773ed40847781199f2f8"}}, "589": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/__init__.py", "hash": "218e55cdcc17fed0e85fd2e2e7431af8"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9aee4f8d-0e16-441a-8b8e-242b67a6e02a.json", "hash": "3b2f982c211a82750511b38d7c15b08a"}}, "590": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/test.py", "hash": "cf35de0b07c9d15726c436e2f6c730f4"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/40c75bd6-da2e-49e5-90a6-02102a291481.json", "hash": "23d3e501929d9c02b2e947df0f3b7de1"}}, "591": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py", "hash": "3ea6e61f517bcef62a009700b1dc38cc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f7bc72bd-4574-4ec4-9c02-69b414705b8b.json", "hash": "f42a7269df4a208e09781786efcbc229"}}, "592": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_dali.py", "hash": "e09e5c303760c3f4f03dcc52156c32ff"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/d3766e63-d8f2-49e9-90d8-d9a35099b357.json", "hash": "e9ea8018cd595f05439198a84c728422"}}, "593": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py", "hash": "6e3e3bf21ee05611a8f959563d0569bb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/f79d64da-db70-4480-9ad6-0bddaa5c4dde.json", "hash": "af07e38bb5ca1dc01d6f692f2ff9c954"}}, "594": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/config.py", "hash": "995dd2057b9fae9bba9ea3bdc77231b4"}, "target": {"path": 
"/media/root/Prima/works/PaddleVideo/docs/doc/0e980b5c-ff4e-4180-abe2-cdd08cfcb21b.json", "hash": "f6ad7b4af9fcd92c295ed3aaf02160e3"}}, "595": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/precise_bn.py", "hash": "a18b9ab3af29eb1d251c8c96c427bb61"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0eda88ac-5c9d-46e3-b2a1-110fb5b0077f.json", "hash": "b2533a3478c4633d0788d954f8835978"}}, "596": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/profiler.py", "hash": "8ac44ce01586f1d0a313c46c409e3926"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a58fa289-adcc-4987-9e12-b5292e91058b.json", "hash": "b55685c2b38ee9261a59ea58c3d5b4ba"}}, "597": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py", "hash": "88644777c5b82070297dea1e623f2eff"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/288152c2-ba5b-46b3-96a6-dc101461e129.json", "hash": "f6a75276954df403f3a9d496402db53b"}}, "598": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py", "hash": "3c6081f7498350704360362085746043"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9b8b9be5-e57b-45ac-af36-0ea1debb288c.json", "hash": "3827b5dcf93334a1bb5a99a8524113c2"}}, "599": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/__init__.py", "hash": "3302af08a896a35e4947b4717f055eec"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b67dd6ad-fced-42c9-84df-1df4809132dc.json", "hash": "55353b471019692ee9f34ea04180155e"}}, "600": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py", "hash": "d3e5637c90902d455d4882021147b32d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/95a0b7ca-1040-4328-868c-0d127a7bbaca.json", "hash": "c254bf6a7603c18e8519e0a9e97258d9"}}, "601": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/interval_helper.py", "hash": "b669aca38175b620485b786ea77a6daf"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/8fb27118-de58-4d64-9c32-3cf7c53614bd.json", "hash": "f4dec0c5487d40abe10496b68715ed4b"}}, "602": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py", "hash": "43600ddf25051683c6f48fef238ae5fe"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/c7cc8d2b-c7d1-425d-9153-51d6e9cad3cd.json", "hash": "10c525cc15e7100f6f6634d13fe0a643"}}, "603": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py", "hash": "c5cbddf8f25d2b548ad9fd703bdd0410"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/6963a95f-f75e-4530-869d-4afbc5a040d6.json", "hash": "26b351cd31f3b79f4ab10d9bf6772a80"}}, "604": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/short_sampler.py", "hash": "b83823e5c3ffdcdf5f487b859c5e47d1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/91c7470d-ffc8-4732-bca4-350b25078914.json", "hash": "a0908fbfe5c4854c00998401770fe2f7"}}, "605": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/README.md", "hash": "a91965d87c4cf33820c79eb06095eb42"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/b14fe4fe-b2cb-4ca6-8e59-953f307b7852.json", "hash": 
"5c7fc09650773928df5cb589079f25c9"}}, "606": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh", "hash": "41b6e6cda62856636134414f30ff1488"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/41f3e7a8-79e7-42b4-a57e-9533db88921c.json", "hash": "e4a0779d4b1b774a70d7eb2931af8d58"}}, "607": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/common_func.sh", "hash": "2dfbab2ae9747f581bfc684ed3adbfde"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/4b0429f3-5899-40ff-ba98-adb02f3d59f4.json", "hash": "aa855eb6eeadb1ab1b2aabcc63d9906b"}}, "608": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py", "hash": "a3a89a219257fae4eb998fa84befbbce"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/9a7eb176-dea7-494e-87cd-4eff0b41bece.json", "hash": "14227cbb6db1ebaa086b53b432769d2e"}}, "609": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/extract_loss.py", "hash": "89b5adfe1c47b4fa3d9c53d442aaedb7"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/de134f00-e75e-4048-9479-4f34324ea55d.json", "hash": "3dd99ec206d8277936baf82f8e2477c6"}}, "610": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh", "hash": "feaf23d6826b332d56d54ab75dda873d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/058c279b-9974-4624-a870-601b853baf65.json", "hash": "d09a9a719ec3c64bbeb05aecd2acb03a"}}, "611": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh", "hash": "640e6620d0a3a1905040d4a97a681315"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5cbc0b06-bf90-4596-b043-079aaad0a819.json", "hash": "375d23aec32c220b407f46ad31eaecce"}}, "612": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_paddle2onnx.sh", "hash": "df6e18bf0cf2abd6532eb4651e80bcb6"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3a1c90c1-39ee-49d9-b34e-b8b77b32fbce.json", "hash": "615486725ae61b9665940f1f3d99198f"}}, "613": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh", "hash": "7961a9d1d8a93f694558e8cf06ed1cdb"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/03076d14-da10-47c4-8e24-22c47b2332ef.json", "hash": "0c27c5421205c447baeb598f47b1b1c9"}}, "614": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_cpp.sh", "hash": "6f35518fb6fee8f8920359853d337ddd"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/0d57e211-8fb4-43a9-9c4c-ed6cedd56ac5.json", "hash": "42fa627b8a16f65937d75f92db279982"}}, "615": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_python.sh", "hash": "7c187c58d04c8664d1cc695cd3e36202"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/37739699-d923-4eae-b76e-e27b23c261f6.json", "hash": "7f41d34bbb75d6d1190d9c01cf0656ca"}}, "616": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_dy2static_python.sh", "hash": "f986961ac045da5c8f76f6e90d068e85"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/a019b3ed-d20a-4304-8541-226fd237bf3b.json", "hash": "7a9065de7e26c39f238af2c88c2c6473"}}, "617": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh", "hash": 
"a48d7c2d45e5a4e22bd1c8ddc9e184c9"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/43a2a3f1-ec47-40cf-aae2-e3c51154519a.json", "hash": "7ab16aa7492bb7d501d15bd3f49a2b88"}}, "618": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python_npu.sh", "hash": "92f3fb124444af468d45e68af40db4f5"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/e37bfe44-05e0-44a9-b61d-6353ad324cd8.json", "hash": "448a21470ab1d232c2d500f323cb3c10"}}, "619": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python_xpu.sh", "hash": "d44371394364db40cd7bdbe0f6ae9646"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/79e414f3-ed2d-4816-8fbd-a796aa4b4532.json", "hash": "b3ce4f9bfa9948cdc412ed26de9222a4"}}, "620": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/tools/__init__.py", "hash": "b16463f4f59b496d921fb86fc3378de1"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/88e997ea-065f-4556-8e56-abf2d8e84e1e.json", "hash": "a266c9dec3994eb981472b15f960a9c5"}}, "621": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py", "hash": "c9c887a57fc6ddece678601cdc8c6d9a"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/ac359f3c-86a3-4578-8965-6ef841b5b557.json", "hash": "421669b17fb6f8420d7f2ddf0fea83c5"}}, "622": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py", "hash": "aab7400f9b2d219b387de5f42b17c7a2"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/05c34c33-37bd-4d40-93e1-b9758902d882.json", "hash": "c931a8f2f428c334366394c4ba604310"}}, "623": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py", "hash": "458fe0cb4320a740e357430de5322d82"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/5bd04d81-0bdb-4e0d-9f02-626dd02a7f21.json", "hash": "173e904d268cbabedbfba15ace7848cb"}}, "624": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/tools/summary.py", "hash": "d36f25c934ab1a5404fac262c7f8720d"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/35885247-4755-4325-8af8-b4d9692791e2.json", "hash": "f73ab082705a5f23caeab5cf1033a25e"}}, "625": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py", "hash": "11a5c58584f6214ecfbc7fac06365397"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/fa96bb1c-e294-4472-97b3-2fb3e373dc5e.json", "hash": "a043d50ae39bc42596522c057621b216"}}, "626": {"source": {"path": "/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py", "hash": "dcd89e23dcb78962a6c2c6a2a8b20ecc"}, "target": {"path": "/media/root/Prima/works/PaddleVideo/docs/doc/3f8ca3d4-39cf-43a3-a4bb-98ff2cd75d93.json", "hash": "2ef291d8bdd2415b10ffa5974d16708b"}}}} \ No newline at end of file diff --git a/docs/cache_title.json b/docs/cache_title.json new file mode 100644 index 000000000..11a16ea00 --- /dev/null +++ b/docs/cache_title.json @@ -0,0 +1 @@ +{"_default": {"1": {"path": "/MANIFEST.in", "hash": "51011ab5248241a5342dc66d29b0bf6c", "title": "PaddleVideo Manifest Configuration"}, "2": {"path": "/README.md", "hash": "040580f5fc094937e3f2bd7a44398a10", "title": "Advanced Video Processing with PaddleVideo"}, "3": {"path": "/README.md:1-22", "hash": "4cce85ce5aeb1ebec74d5433ea057d60", "title": "Advanced Video Processing with PaddleVideo"}, "4": {"path": "/README.md:25-58", "hash": "d0104ca49a4e7ce35bafadac28a1ecfe", 
"title": "PaddleVideo: Comprehensive Video Tech Course and Code"}, "5": {"path": "/README.md:59-75", "hash": "723dc73199205226f2a5a204daa701ba", "title": "PaddleVideo Documentation Table of Contents"}, "6": {"path": "/README_en.md", "hash": "cd6ede8dea665e0f1dcd30634becb920", "title": "PaddleVideo: Deep Learning for Video Processing"}, "7": {"path": "/README_en.md:1-20", "hash": "f4c779b72ae4186b05ebe61b0229730d", "title": "PaddleVideo: Industrial and Academic Video Toolset"}, "8": {"path": "/README_en.md:20-43", "hash": "ec8d741392534965bd155d28e9ea8af1", "title": "PaddleVideo: Comprehensive Video AI Platform"}, "9": {"path": "/README_en.md:44-65", "hash": "b4dee3c3b64c767cc12188c0c2880ccf", "title": "PaddleVideo: Video Processing Deep Learning Library"}, "10": {"path": "/__init__.py", "hash": "9325bc821164e2318951b0ab91c9b9fe", "title": "Licensed Python Module: PaddleVideo"}, "11": {"path": "/applications/AbnormalActionDetection/README.md", "hash": "b2f28c4e5f6643c1d50ddafb259efa9b", "title": "Abnormal Action Detection with PaddleVideo"}, "12": {"path": "/applications/AbnormalActionDetection/README.md:1-40", "hash": "5bc59ef806ab6332c42acf44c432a03c", "title": "Abnormal Action Detection with PaddleVideo"}, "13": {"path": "/applications/AbnormalActionDetection/README.md:42-114", "hash": "4288c9cd07ccecde5ff1aca2746a9c3b", "title": "Abnormal Action Detection Pipeline"}, "14": {"path": "/applications/AbnormalActionDetection/README.md:115-153", "hash": "8034e08ba6e24d19f74caadca666bec8", "title": "Video Action Detection with PaddleVideo"}, "15": {"path": "/applications/Anti-UAV/README.md", "hash": "ce453c9978f837a13c37ada1f9784f80", "title": "Detect UAVs in Restricted Areas with PaddleDetection"}, "16": {"path": "/applications/Anti-UAV/README.md:1-21", "hash": "699916c39e2a29a77ff52a24ac3189e3", "title": "Paddle-Anti-UAV: Detecting Flying UAVs"}, "17": {"path": "/applications/Anti-UAV/README.md:23-36", "hash": "ccd88661848474b9ed00c415219e426d", "title": "UAV Detection with PP-YOLO and PaddleDetection"}, "18": {"path": "/applications/Anti-UAV/README.md:36-39", "hash": "94b3f624e802a25496b9bfb6c14c76d2", "title": "Customize Anti-UAV Demo with PaddleVideo"}, "19": {"path": "/applications/Anti-UAV/get_image_label.py", "hash": "a2df3de2513796ad42402ee9f6be0aa9", "title": "Object Detection and Labeling Tool"}, "20": {"path": "/applications/Anti-UAV/get_image_label.py:1-53", "hash": "976acbc2829be2e0dfc8046bf2565066", "title": "Initialize Directories and Info"}, "21": {"path": "/applications/Anti-UAV/get_image_label.py:54-77", "hash": "fff3d7e205c46314804d2e3183d6e8d0", "title": "Object Detection and Labeling in Images"}, "22": {"path": "/applications/Anti-UAV/get_image_label.py:78-101", "hash": "1eba698bbbe52fa7c2d2a7d81b515944", "title": "Write and Annotate Image Data"}, "23": {"path": "/applications/Anti-UAV/get_image_label.py:102-128", "hash": "733ce5e6560513c1676d018c7b205448", "title": "Labeling Frames by Object Presence"}, "24": {"path": "/applications/Anti-UAV/get_image_label.py:129-151", "hash": "a7481af59d716db456669816b284cd65", "title": "Bounding Box Image Labelling"}, "25": {"path": "/applications/Anti-UAV/get_image_label.py:152-164", "hash": "d75223f8ccfc29c5333252bf2fad9128", "title": "Writing Annotation and Image Data to JSON Files"}, "26": {"path": "/applications/BasketballAction/README.md", "hash": "298e57a8dda588bd5f7692888b7b25f8", "title": "Basketball Action Detection App"}, "27": {"path": "/applications/BasketballAction/README.md:1-69", "hash": 
"5de11621443af81d7f77c148ddf30c5c", "title": "Basketball Action Detection with PaddlePaddle"}, "28": {"path": "/applications/BasketballAction/README.md:70-99", "hash": "0a285dc09212a0b6113e705a09e1d6b5", "title": "Basketball Action Dataset Structure"}, "29": {"path": "/applications/BasketballAction/README.md:100-135", "hash": "d0516eff0898001d285536de4988c1ca", "title": "Prepare, Train, and Convert ppTSM Model"}, "30": {"path": "/applications/BasketballAction/README.md:136-163", "hash": "e66eaaba6b5ff7b41f97784bdf2562f3", "title": "BasketballAction: Feature Extraction & BMN Training"}, "31": {"path": "/applications/BasketballAction/README.md:165-206", "hash": "6d8e8045f3ca8765af0355bc67fb6c16", "title": "Preparing and Training BMN Model for Basketball Action Dataset"}, "32": {"path": "/applications/BasketballAction/README.md:207-243", "hash": "2791f6b95d3da3618adc0a29cd5290ec", "title": "BMN-Based Basketball Action Predictions"}, "33": {"path": "/applications/BasketballAction/README.md:244-284", "hash": "68e1ebbdfeb4bc12cfdb9fc38dfa3f6a", "title": "BasketballAction LSTM Training Data Structure"}, "34": {"path": "/applications/BasketballAction/README.md:285-319", "hash": "f04b42a03e28bee26fdabea4b8c24b69", "title": "PaddleVideo BasketballAction: LSTM Data Formats"}, "35": {"path": "/applications/BasketballAction/README.md:320-365", "hash": "06261df6b09b5953989109041bd5f907", "title": "LSTM Model Inference and Evaluation"}, "36": {"path": "/applications/BasketballAction/README.md:366-389", "hash": "f4bef9f4d37cd85e3f802451c5422a72", "title": "Optimized Action Detection with TSM and BMN"}, "37": {"path": "/applications/BasketballAction/README.md:389-389", "hash": "7cc294d6dd9b283df95dd8e5dd5a235f", "title": "Author List"}, "38": {"path": "/applications/BasketballAction/predict/action_detect/action.py", "hash": "bfe70b60b0dba3fbb7a78e5f6024c08d", "title": "Basketball Action Detector"}, "39": {"path": "/applications/BasketballAction/predict/action_detect/action.py:1-44", "hash": "0f87161ac0577d2c3b140f8fa5cabec8", "title": "Basketball Action Detection with Python"}, "40": {"path": "/applications/BasketballAction/predict/action_detect/action.py:45-71", "hash": "5673b8ef127cb63650d8f5eaba451585", "title": "ModelPredict Class Initialization and Configuration"}, "41": {"path": "/applications/BasketballAction/predict/action_detect/action.py:72-104", "hash": "6900a7da0667e5df7bd1772e412c031d", "title": "InferModel for Action Prediction"}, "42": {"path": "/applications/BasketballAction/predict/action_detect/action.py:105-133", "hash": "4209a678258f4434b3ea59029447301d", "title": "Video Action Detection and Feature Extraction"}, "43": {"path": "/applications/BasketballAction/predict/action_detect/action.py:134-152", "hash": "6fee7518e5cba95c22618bcfa0012713", "title": "Feature Extraction and Storage"}, "44": {"path": "/applications/BasketballAction/predict/action_detect/action.py:153-174", "hash": "a5e48f1341b9641f32b4e5b590935744", "title": "Video Feature Inference and Storage"}, "45": {"path": "/applications/BasketballAction/predict/action_detect/logger.py", "hash": "96e8f1f7751ef922ea41922fca4c9f7b", "title": "Custom Logger for News Stripper"}, "46": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py", "hash": "51936031fa85e7a0694a404fcc023694", "title": "Audio Feature Extraction and Processing"}, "47": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:1-41", "hash": "6abfda1f3582b909d81361b265777f50", "title": "Audio 
Feature Extraction Functions"}, "48": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:44-68", "hash": "904984a133fe6004aa422875b0c855b3", "title": "Mel Spectrogram Matrix Creation"}, "49": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:69-93", "hash": "a75e021411d82ab3b84628a9161cc6b3", "title": "Calculate MFCC for Audio Data"}, "50": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:94-116", "hash": "1dc192beafb2315543d04c680691b5b3", "title": "Mel Spectrogram Feature Extraction"}, "51": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:117-139", "hash": "8c1574afd1ade70f6da96610979c7224", "title": "Audio Feature Extraction and Preprocessing"}, "52": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:140-158", "hash": "ba681cd41370584a1277047c80023dab", "title": "Extract Audio Features for Wav File"}, "53": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py", "hash": "574c1392fd9221b3a4d9820af61e3c18", "title": "Model-Based Audio Feature Extraction"}, "54": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:1-42", "hash": "d62bdfd06cac9bd81c9723dd16f88c72", "title": "Audio Feature Extraction Model"}, "55": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:43-51", "hash": "f6b4f7a1dcb2fbc9a190fcd8c8269de4", "title": "Audio Feature List Generator"}, "56": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py", "hash": "48665dcb4614e556d4cc94b8dcd68918", "title": "VGGish Parameters for Basketball Action Detection"}, "57": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:1-29", "hash": "35f25d08141da44ecc7dc8129f8bf6e4", "title": "Global VGGish Parameters"}, "58": {"path": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:30-37", "hash": "7c78a76b29a405198566b746a0d741c3", "title": "Adam Optimizer with Epsilon Value"}, "59": {"path": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py", "hash": "2bbeef0ddfde34d58b47dfeebfdec54c", "title": "Audio Inference with InferModel"}, "60": {"path": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:1-37", "hash": "5f64d83960167a63c7691d9850260085", "title": "Audio Inference Model Initialization"}, "61": {"path": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:39-69", "hash": "a6cb67c793098f417ae4702531184a7c", "title": "Audio Inference Model"}, "62": {"path": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:71-80", "hash": "90a686d19257837b37d04597974a7d57", "title": "Audio Infer: Model Prediction and Time Calculation"}, "63": {"path": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py", "hash": "877aa64dab8aed20ed74a43d4669c362", "title": "Basketball Action BMN Inferencing"}, "64": {"path": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:1-37", "hash": "0a929815b326e5689ec7babe9f8f0c0e", "title": "BMN Inferencing Class Initialization"}, "65": {"path": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:38-63", "hash": "ddd597a350b5209e89a85b7ba2af591a", "title": "Basketball Action Detection Model"}, "66": {"path": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:64-86", "hash": 
"dc77f169a448096e17822bd24b321db0", "title": "Action Detection Model"}, "67": {"path": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:87-111", "hash": "eb3cf523601a71513ae00edcab0ff096", "title": "Boundary Mask Prediction"}, "68": {"path": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:112-131", "hash": "422c29a3915399549ee7e19a2e97e59a", "title": "Average Model Predictions for Action Detection"}, "69": {"path": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:133-155", "hash": "f66e6c5e4778e544bca1c3baa24321e6", "title": "Inference Time for Action Detection"}, "70": {"path": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py", "hash": "e6de4f579cb00c2b268658f7a78d8c62", "title": "LSTM-based Basketball Action Detection"}, "71": {"path": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:1-36", "hash": "7e669227e352f2fe805e0ff493a7681e", "title": "LSTM Inferencing Model in BasketballAction"}, "72": {"path": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:37-61", "hash": "0a0c313f9fe364e41a6f5d606d566f9f", "title": "GPU-Optimized LSTM Action Detector"}, "73": {"path": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:62-90", "hash": "2c30772601f48d2661cde36676eded46", "title": "LSTM Basketball Action Detection"}, "74": {"path": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:91-112", "hash": "dade7c6b9a6350b55333c6af38108ea8", "title": "LSTM-Based Action Detection"}, "75": {"path": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:113-141", "hash": "aca733d36668ffcd496f1b1f305e2c03", "title": "LSTM Model for Action Detection"}, "76": {"path": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:142-145", "hash": "ae0a2ebd517d97b94a9970dea5f72552", "title": "JSON Data Logging and Time Tracking"}, "77": {"path": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py", "hash": "ce44f5f726303ddcb82a1d40262c84d7", "title": "PPTSM Action Detection Infer Model"}, "78": {"path": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:1-38", "hash": "ef25e41fffbf103bf8a7e69da1d303fe", "title": "PPTSM Action Detection Model Inference"}, "79": {"path": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:40-69", "hash": "64fc9ae8dac66f205e5db46a93eca091", "title": "PaddleVideo Action Prediction"}, "80": {"path": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:70-83", "hash": "099116356100c8134fe29569ce4a1a0e", "title": "Python Model Prediction with Timing"}, "81": {"path": "/applications/BasketballAction/predict/action_detect/reader/__init__.py", "hash": "2e9ec402585026301d4b3692fdd50115", "title": "Alphabetical Action Readers"}, "82": {"path": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py", "hash": "0e3ba10a6e651c7cbe1d69febd61976f", "title": "Audio Reader for YouTube-8M Dataset"}, "83": {"path": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:1-37", "hash": "3711b62cae432d7e6f17732977e00689", "title": "AudioReader Class for YouTube-8M Dataset"}, "84": {"path": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:38-70", "hash": "57909ba726c505de60d0f5557fa2d537", "title": "Audio Reader for Multiple Models"}, "85": {"path": 
"/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:71-78", "hash": "5c8185ee6f26bcf580be40fbd4ccfc05", "title": "Audio Batch Manager"}, "86": {"path": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py", "hash": "7b99e067cd9126596237aa7df6f2b720", "title": "BMNINF Reader for Basketball Action Prediction"}, "87": {"path": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:1-49", "hash": "1344cb1e63f2cf510ad1280cbb3e97a5", "title": "BMNINF Reader: Generating Proposals for BMN Models"}, "88": {"path": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:50-73", "hash": "47c77ed4c5d02db7028ec5b1a4006230", "title": "BMNINF Reader Initialization"}, "89": {"path": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:74-105", "hash": "fc054050240a1ea5a9416d76b7c55c38", "title": "BMNInf Reader Class"}, "90": {"path": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:106-141", "hash": "ad9fa762a095f81076785f3f53d29d1d", "title": "BMNINF Reader Functionality"}, "91": {"path": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:142-151", "hash": "a131cef54b9a68f0fc643b956b8dc4a0", "title": "Video Data Reader Class"}, "92": {"path": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py", "hash": "85e2610aca5d5eb085194701e10ba993", "title": "FeatureReader: YouTube-8M Dataset Reader and Model Support"}, "93": {"path": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:1-33", "hash": "313ba89d6f29e5e156bdc16baba45d54", "title": "FeatureReader: Efficient YouTube-8M Data Reader"}, "94": {"path": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:35-71", "hash": "489bda0b8e0e14bf07e56ba4f0c6b514", "title": "Feature Reader: Data Batches for Basketball"}, "95": {"path": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:72-86", "hash": "dcbff9500e88d488875d63bca732ed2b", "title": "Batching Action Features Reader"}, "96": {"path": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py", "hash": "ccf03df124e3510d96b3843769ada5b9", "title": "Video Reader Utils"}, "97": {"path": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:1-34", "hash": "2d7e71819e9cc679872d392193085df1", "title": "Customizable Reader Not Found Error Handling"}, "98": {"path": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:35-83", "hash": "55bd23bae418a6b6869d19bfc6c6896c", "title": "Video Data Reader Zoo"}, "99": {"path": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:84-109", "hash": "5704627a68b275ca50c91a745697dfb3", "title": "Singleton Reader Registry"}, "100": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py", "hash": "2d29f888005a3975a0f029209d51a224", "title": "TSMINF Image Reader"}, "101": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:1-37", "hash": "bc05d07b103f62d5f0de9759c9eab625", "title": "TSMINF Reader: Efficient JPG Video Dataset Reader"}, "102": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:38-66", "hash": "c7dfaf3de58d548bda5db38558a700ea", "title": "Configuring TSN Inference Reader"}, "103": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:67-97", "hash": 
"e2560c1a53f5cf8387484d1c17a43b43", "title": "BasketballAction Video Reader"}, "104": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:98-120", "hash": "8a50fdcce8ea7693f60867d86e70c44f", "title": "Multithreaded Video Frame to Image Conversion"}, "105": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:122-144", "hash": "08e2401bcb21b55ee4983161845fcfde", "title": "Fault-Tolerant Image Reader"}, "106": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:145-172", "hash": "a3ce1a2779ecfa11978e5eed5a9e4031", "title": "Transformative Image Reader: Applied Action Detection"}, "107": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:173-203", "hash": "068ee2441b734a10d40772d187320110", "title": "Image Transformation Function"}, "108": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:204-239", "hash": "8e5a5f1a63cd3d7de61b18717d455148", "title": "Random Crop Size Generator"}, "109": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:240-262", "hash": "aeeb79d6174f4ae7efeaf4a20147f3cc", "title": "Crop Position Calculator"}, "110": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:263-298", "hash": "b4018286865661e1212b59ccdfe1b8d7", "title": "Random Cropped Image Group"}, "111": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:300-338", "hash": "23e6ba369e55e2163e0db095831e8238", "title": "Image Preprocessing for ML Models"}, "112": {"path": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:339-366", "hash": "614e89003e6673c0c635f1bc4e6b7af3", "title": "Image Cropper and Resizer"}, "113": {"path": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py", "hash": "68c51c32b69a155437b9003c02f6ba68", "title": "BasketballAction Config Utils"}, "114": {"path": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:1-46", "hash": "c46afb8f771fae69f34040f56b550828", "title": "PaddleVideo BasketballAction Config Utils"}, "115": {"path": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:47-79", "hash": "09cfac2e9709615fe65250db7409611a", "title": "Config Parser and Printer"}, "116": {"path": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:80-80", "hash": "c9ab799842bd7e51591e67ac336ac1c7", "title": "Logger for Code Separation"}, "117": {"path": "/applications/BasketballAction/predict/action_detect/utils/preprocess.py", "hash": "43307e52ad097b996f3fbd921f6362c7", "title": "FFmpeg Functions for Video Processing"}, "118": {"path": "/applications/BasketballAction/predict/action_detect/utils/process_result.py", "hash": "65fe1acf9920ac04515c18bdff134a95", "title": "Action Detection with NMS Filtration"}, "119": {"path": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:1-39", "hash": "5a93568b8d58788695fb70eb8711ee83", "title": "Non-Maximum Suppression Algorithm for Bounding Boxes"}, "120": {"path": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:40-76", "hash": "faed7797f1ad7c66ae518c29f6b9be93", "title": "Non-Maximal Suppression for Bounding Boxes"}, "121": {"path": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:77-107", "hash": "39aef6bea8c50f6fa7fcfb461bc43c82", "title": "Video Detection Filtering and 
Sorting"}, "122": {"path": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:108-129", "hash": "9f95b1aa88d6bbf303b67bd591c7ad57", "title": "Action Detection Processing"}, "123": {"path": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:130-144", "hash": "ac7d8b3a54886185736b3323db968d5f", "title": "NMS Action Result Processor"}, "124": {"path": "/applications/BasketballAction/predict/eval.py", "hash": "9cfd165da96fde629d77425d33c84851", "title": "Optimal IOU Threshold for Basketball"}, "125": {"path": "/applications/BasketballAction/predict/eval.py:1-36", "hash": "c0cabbcb0a70178df5f04b65e256bfca", "title": "Load Ground Truth Annotations (gts)"}, "126": {"path": "/applications/BasketballAction/predict/eval.py:37-67", "hash": "ad0e31650fbefcd882c02b50b3449bef", "title": "Evaluating Basketball Action Predictions"}, "127": {"path": "/applications/BasketballAction/predict/eval.py:68-93", "hash": "6849b1ee3990bbc7466275cfe159afce", "title": "Box Sorting and Conversion Function"}, "128": {"path": "/applications/BasketballAction/predict/eval.py:94-120", "hash": "66c9448c5b431533f9ca853e504c8cc6", "title": "Box Evaluation Metrics Calculator"}, "129": {"path": "/applications/BasketballAction/predict/eval.py:121-144", "hash": "924865ac8627ea81aa8ff4b729dc2168", "title": "IoU-based Metric Calculation for Object Detection"}, "130": {"path": "/applications/BasketballAction/predict/eval.py:146-161", "hash": "33e7b507025d2a53cab3a371674a6fcd", "title": "Precision and Recall Calculator"}, "131": {"path": "/applications/BasketballAction/predict/eval.py:162-189", "hash": "4a0fa77f91eadfbf2111c16cecb05a46", "title": "Calculate F1 Score from Predictions"}, "132": {"path": "/applications/BasketballAction/predict/eval.py:190-218", "hash": "020665e02edcc60d1f7c78c465e4f7c2", "title": "Video Action Detection Model Evaluation"}, "133": {"path": "/applications/BasketballAction/predict/eval.py:219-237", "hash": "8e323e84cdd16739b5bd0a5471758526", "title": "Optimal IOU Threshold for Basketball"}, "134": {"path": "/applications/BasketballAction/predict/predict.py", "hash": "89bee6a3470af3d39c7d7724e3af9d21", "title": "Basketball Action Prediction"}, "135": {"path": "/applications/BasketballAction/predict/predict.py:2-33", "hash": "07b4e3bff1ff99d956db9b4d1e3ff04a", "title": "Basketball Action Predictor"}, "136": {"path": "/applications/BasketballAction/predict/predict.py:34-35", "hash": "9dfa6e92926308d7f1b7eb838cfd619f", "title": "Write Indented JSON to File"}, "137": {"path": "/applications/EIVideo/EIVideo/README.MD", "hash": "c249ee3417abad3260afe53069107481", "title": "CLI Guide for EIVideo Annotation Tool"}, "138": {"path": "/applications/EIVideo/EIVideo/__init__.py", "hash": "e4c73450a57bed7e75139a54ff57178b", "title": "EIVideo __init__.py: Root Paths and Constants"}, "139": {"path": "/applications/EIVideo/EIVideo/api.py", "hash": "3cdcbb684d9486ef38a1d8aaa780b782", "title": "JSON Video Annotation Tool"}, "140": {"path": "/applications/EIVideo/EIVideo/api.py:1-39", "hash": "651801941194611bf24bd479affb8632", "title": "Image Handling Functions"}, "141": {"path": "/applications/EIVideo/EIVideo/api.py:40-67", "hash": "51a9b17bb0bb1ab96e5650f5c0003aac", "title": "PNG to JSON Image Parsing"}, "142": {"path": "/applications/EIVideo/EIVideo/api.py:68-101", "hash": "6431255103bffa93af337d68e240a2b2", "title": "Video Processing: Save, Load, and Annotate JSON"}, "143": {"path": "/applications/EIVideo/EIVideo/api.py:102-130", "hash": 
"9a9dd34d09c03ea945b60a44df989c20", "title": "Image Resizing and Processing"}, "144": {"path": "/applications/EIVideo/EIVideo/api.py:131-134", "hash": "f745fbcba7410d454e4bcb18e78b92b3", "title": "JSON Overlay Dictionary Saving"}, "145": {"path": "/applications/EIVideo/EIVideo/main.py", "hash": "fb60ad789571033efcc60ddc6a17702d", "title": "PaddleVideo Training with Distributed Support"}, "146": {"path": "/applications/EIVideo/EIVideo/main.py:1-29", "hash": "dce623c65f692d573aa52c1ceb357f84", "title": "PaddleVideo Training Script"}, "147": {"path": "/applications/EIVideo/EIVideo/main.py:30-53", "hash": "26cc4a22923fb4f65e062c19d5a792d2", "title": "Command Line Arguments for EIVideo"}, "148": {"path": "/applications/EIVideo/EIVideo/main.py:54-82", "hash": "a19895013e9c9b35e644ab520dbd9e4a", "title": "Command-Line Arguments for Training Control"}, "149": {"path": "/applications/EIVideo/EIVideo/main.py:83-116", "hash": "2e05bbfb49231625cfbb30d227a85e83", "title": "Command-line Arguments Parser for Video Testing"}, "150": {"path": "/applications/EIVideo/EIVideo/paddlevideo/__init__.py", "hash": "ffb301b1e71c7ebff0b2564fb855f2ef", "title": "PaddleVideo Library Initialization"}, "151": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py", "hash": "4ebcf1008359584b24d9b2520e58df22", "title": "Loading EIVideo Modules"}, "152": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py", "hash": "45c2b084d9ffc77311307a568807bf1f", "title": "Graceful Termination PaddleVideo Dataset Loader"}, "153": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:1-31", "hash": "65c2dd69710bb2b3a3151a4df9fec52e", "title": "Building Pipeline with PaddleVideo"}, "154": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:32-80", "hash": "7fbd939985907168261514f93306245e", "title": "Dataset Loader Builder"}, "155": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:81-106", "hash": "2282479a3c3711e0e4e4d225731bbd6d", "title": "Paddle Dataloader Builder"}, "156": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:107-134", "hash": "0e5d1fca24ce0ae0bd4f146e6ccc9cbc", "title": "Mix Collate Function for Stacked Batches"}, "157": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:135-151", "hash": "8b4b38eaf330a0b23dd5b4f5a1151fff", "title": "Signal Handler Setup for Process and Group"}, "158": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py", "hash": "8d0df7baee32e8ec9ef98eff1f677943", "title": "EIVideo Image Preprocessing Pipeline"}, "159": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py", "hash": "d1dff462d80b23d18ca4ae63aba9e2fb", "title": "Flexible Pipeline Transformation with Compose"}, "160": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:1-31", "hash": "9a6f31ff7974f3a0c693a344b565babb", "title": "Compose Class for Pipeline Composition"}, "161": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:32-59", "hash": "a63d62c298c795ae2288cfc27b2b97c5", "title": "Compose Class Sequential Transform Composition"}, "162": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:60-76", "hash": "7015aad7ed7341310f7f5ddfc150b3e2", "title": "Compose Pipeline Class"}, "163": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py", "hash": "785a5df685fabdb35e62a85bb1002868", "title": "Paddle Video Image Preprocessing"}, 
"164": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:1-43", "hash": "8512a9e42e06788632c0474e040559bc", "title": "RandomScale\\_manet Pipeline"}, "165": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:46-75", "hash": "4f62fae17c491927716ceb70dfc20653", "title": "Resize Image Pipeline"}, "166": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:76-109", "hash": "7372a66c2074bf55837712edf43978bc", "title": "Custom Image Crop Transform"}, "167": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:111-134", "hash": "70cfbea7f349494ff56dd75582c8822d", "title": "Cropped Labels from Image"}, "168": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:135-163", "hash": "e1c6cd400c53d0f7c86d7733b4b584fb", "title": "Random Region Flipping Transform"}, "169": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:164-198", "hash": "89110683aeef66e7952aa6d9436b66ee", "title": "Custom Image Transforms for PaddlePipelines"}, "170": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:199-220", "hash": "d1a9edc111f4e09940b6a3c9e1f6851d", "title": "Scribble Image to Foreground Mask"}, "171": {"path": "/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py", "hash": "78c9d0502880b135c5db97193284a2cc", "title": "Organizing PaddleVideo Functionalities with Registries"}, "172": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py", "hash": "516462c31e2b5e4b352e764efea6f1ac", "title": "PaddleVideo Metrics Initialization"}, "173": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py", "hash": "f4da35e3aaa5401e0d447e0131ea5688", "title": "Abstract Base Metric Class for PaddleVideo's EIVideo"}, "174": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py", "hash": "a358e2f5d18276139017e2df56bf08cb", "title": "Apache-Licensed EIVideo Metric Builder"}, "175": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py", "hash": "df96b61f43a6063fd36b4ec66ff63de9", "title": "Registry-Based Metric Management"}, "176": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py", "hash": "62dcda21329f7886c0d8a7a372604ec5", "title": "VOS Metric: Video Object Segmentation"}, "177": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:1-38", "hash": "6a68e47a1ca076a2e3068161531324ae", "title": "VOS Metric Class Registration"}, "178": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:39-67", "hash": "cf8c1cee298b86eb554a8556e65b6374", "title": "Video Processing Class Initialization"}, "179": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:68-90", "hash": "f79f68194752f243d50c38181f93d7fe", "title": "VOS Metric Initialization"}, "180": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:91-114", "hash": "09e845612c40f1769ca916bd13705e39", "title": "Embedding Preparation for EIVideo"}, "181": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:115-131", "hash": "c922f954c7c1acd863a6ec8ddcb3292b", "title": "Video Object Segmentation Metric"}, "182": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:133-148", "hash": "44e53fd8bc3cf01d49a7bc7b6d7b8eec", "title": "Data Augmentation and Label Averaging"}, "183": {"path": 
"/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:149-168", "hash": "174caf3bed1bfb67c2bc36ddec45c3cc", "title": "Frame-wise Flipped Label Generation"}, "184": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:169-192", "hash": "da6ed58d873c3ae8610df9d0f6d488d2", "title": "Average Time per Frame Calculation"}, "185": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:193-211", "hash": "bb1764a4c4f000224a34a4af7ec5fde6", "title": "Frame Rate Metrics and Tensor Manipulation"}, "186": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:212-224", "hash": "21e77dfe5e52d4837273361606aa5a11", "title": "Tracking Sequence Numbers"}, "187": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:225-238", "hash": "8f288883006a6e658f97591683c11c2e", "title": "Range of Indices in Code"}, "188": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:239-252", "hash": "6378be9aa104498965cbd64fd4bd5741", "title": "VOS Metric Sequence"}, "189": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:253-271", "hash": "c15e836d9e071c29ebb94944abd0bf30", "title": "Zip Folder and Image Mask Functions"}, "190": {"path": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:272-279", "hash": "b0f4f52ac7a38bdbac83e7369cade775", "title": "Zipping Metrics and Saving Results"}, "191": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py", "hash": "ffd80bfd08d264631a624a591f36e615", "title": "PaddleVideo Model Registry"}, "192": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py:1-23", "hash": "5c09503793e169a95918ff6c5c45afb3", "title": "PaddleVideo Import Script"}, "193": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py:24-27", "hash": "49e69f1a883d66144a4aaaa195ebf87b", "title": "PaddleVideo Library Variables and Functions"}, "194": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py", "hash": "cf637cb33c9d0ea5110ad7599e702a77", "title": "DeepLab Import Statement"}, "195": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py", "hash": "020a8ea6f19d5e871821e10bcd5f7ee2", "title": "ASPP-MANET Backbone Initialization"}, "196": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:1-32", "hash": "215affe762795bcb4153aad9960f04e0", "title": "ASPP Layer Implementation in ASV Model"}, "197": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:33-62", "hash": "ec12485f5700e7393c6ddc3d9be3f1a8", "title": "ASPP Network Initialization"}, "198": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:63-85", "hash": "6489e4ce6abd019da706b6724fcab3f3", "title": "ASPP Module with Dilation and Pooling"}, "199": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:86-117", "hash": "dd3bac8cd9a3f1a72b595e530bd0500a", "title": "ASPP-MANET Backbone Class Definition"}, "200": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:118-124", "hash": "ef53c6756a5e432d1849d7b47e3a3575", "title": "ASPP Model Initialization"}, "201": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py", "hash": "23d81ac43b4ebc4669ea756037277ede", "title": "Manet Decoder Layer with Conv, BatchNorm, ReLU"}, "202": {"path": 
"/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:1-30", "hash": "8807ee20e797145bd7582bdb79f7122a", "title": "Manet Decoder Class"}, "203": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:31-59", "hash": "a0f8db291f8aa7a572dba15aee914c3f", "title": "Manet Decoder Block: Conv-BN-ReLU"}, "204": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:60-65", "hash": "c60e3eb59c4cdf59b8af6860e061bd78", "title": "Manet Decoder Initialization"}, "205": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py", "hash": "ad98ab7bf1cf9c5e01e7d225a36920f7", "title": "Static BatchNorm2d and DeepLab Backbone"}, "206": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:1-26", "hash": "6956c81df30e45b7b9d34cf696f94856", "title": "Frozen Batch Normalization Layer"}, "207": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:27-61", "hash": "4e00ece5c7a75b92e72b6542b9c8a9f7", "title": "DeepLab Network Backbone"}, "208": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:62-90", "hash": "02ba98d8872a5d88a6616d235983e375", "title": "DeepLab Model Creation and Evaluation"}, "209": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py", "hash": "2fd940a37b0e19796967ef1f0dd5c23c", "title": "ResNet-MANET Model Coding"}, "210": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:1-31", "hash": "4930d38db40addb55e2577b378384a34", "title": "Bottleneck ResNet Definition"}, "211": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:32-75", "hash": "77f63308da0931d690168c2bf6114002", "title": "ResNet: Efficient Video Backbone"}, "212": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:76-101", "hash": "a4fedbdd2098d2e79de7ce0dd0c9dbbf", "title": "ResNet-MANET Backbone Builder"}, "213": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:102-127", "hash": "c8ee43bc82bc0e431108a10e23fbd77e", "title": "ResNet-MANET Model Creation"}, "214": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:128-159", "hash": "3e35b076cb903ec57aded1d2d247d516", "title": "Create ResNet Residual Block with Downsampling"}, "215": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:160-191", "hash": "dac728ef971c1267451033ad5225373a", "title": "ResNet-MANET Backbone Model"}, "216": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:192-227", "hash": "aa1c038347f37edeb02901a96fad6f28", "title": "ResNet101 BatchNorm Backbone"}, "217": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:230-245", "hash": "9365cf397dffe20a78f9d09390084de1", "title": "ResNet101 Model JSONizer"}, "218": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py", "hash": "6484cd728db6c3ad1eb4cde915904350", "title": "PaddleVideo Model Builder"}, "219": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:1-19", "hash": "3f28547d2593019bda500421c35206f4", "title": "Model Registration and Building Utilities"}, "220": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:22-73", "hash": "ae8878431be6d7f8f0c0bbe264d17d83", "title": "Video Processing Model Components 
Builder"}, "221": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:74-116", "hash": "486070a3ec2d9f1c0fe3cd45f7048c20", "title": "Model Builder: Configurable PaddleVideo Models"}, "222": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:117-125", "hash": "845ba5ea66799e49ec06d1aed9671c90", "title": "Video Analysis Framework Builder"}, "223": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py", "hash": "93d6eabc081961b16616c604d109e1c0", "title": "PaddleVideo Framework: BaseSegment & Manet Definitions"}, "224": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py", "hash": "469f38214046adb09e27eff05d53ad55", "title": "Python Segment Framework Initialization"}, "225": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py", "hash": "354fdf8f91d2fa8007de829cd0191786", "title": "Semi-Video Segmentation Base Class"}, "226": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:2-30", "hash": "fe8d57aabde833feeb62136aace75c59", "title": "Semi-Video Object Segmentation Base Class"}, "227": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:31-59", "hash": "041299b867e83dab32004120d922ea41", "title": "Model Initialization and Processing"}, "228": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:60-95", "hash": "c272b24ad1e2d135142ca72f34c73803", "title": "Abstract Step Methods for Video Modeling"}, "229": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py", "hash": "45a463c8c3984f6fd5f579ccd4094117", "title": "Manet Stage 1 Video Segmentation"}, "230": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:1-26", "hash": "5304a7106dd7fa0313e6103e18d63a0c", "title": "MANET Model Imports for Video Tasks"}, "231": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:27-61", "hash": "7dc12a73f21500373208efcba2c40335", "title": "Manet Model Definition and Implementation Plan"}, "232": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:62-87", "hash": "eedad2bdf260ee0330b98d79be716dfc", "title": "Model Initialization and Evaluation"}, "233": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:88-109", "hash": "383c740343028c1a7aacbd0cbc77428f", "title": "Model State Check and Segmentation"}, "234": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:110-134", "hash": "074a8d4685584e9cc05e90251a4b1fcc", "title": "Manet Stage 1: Initialization and Embeddings"}, "235": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:136-157", "hash": "cd9e1220a8883718fb0649ee6066b7e4", "title": "Manet Stage1: Batch Image Transformation"}, "236": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:158-176", "hash": "843a3046ef9550ac0f9c46705d37bdd8", "title": "Reference Frame Embedding Initialization"}, "237": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:177-195", "hash": "0f375f84649cae1ae171474f7c3eb410", "title": "Save Interactive Scribble Image"}, "238": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:196-216", "hash": 
"68680f69440f52eba8c5429840a078a1", "title": "Scribble-Based Mask Generation"}, "239": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:217-234", "hash": "6621b5a22a8aab519c13e54638e84bda", "title": "Manet Stage 1 Segmentation Model Iteration"}, "240": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:235-254", "hash": "eb20d50d6d7beaf812a45300bc6f6e8b", "title": "Temp Dictionary Creation for Labels"}, "241": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:255-274", "hash": "82ed835a8f0886180da16ecbb10e69da", "title": "Scribble-based Labeling for Video Annotation"}, "242": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:275-292", "hash": "b8a8c50a8affe5dd51e84500736d3106", "title": "Local and Global Map Calculation for Segmentation"}, "243": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:293-310", "hash": "f71102a3e2fd745d4161cfb515cbda0c", "title": "Manet Segment Annotation Check"}, "244": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:311-327", "hash": "f7055dfa985b962b1f00ae5cb982932d", "title": "Mask Creation and Storage in Video Model"}, "245": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:328-347", "hash": "467318252178981a7af640411f5708a1", "title": "Save and Propagate Frames"}, "246": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:348-365", "hash": "df088f05b73f0ac969be5487cef15581", "title": "Manet Segmentation Model Function"}, "247": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:366-383", "hash": "fe68a13b229ef859c45ec7f2f77c5c05", "title": "Dynamic SegHead Model for Video Prediction"}, "248": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:384-402", "hash": "ab8d80ac78b7f38775954d72f95844f5", "title": "Image Segmentation Model Predictions"}, "249": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:403-417", "hash": "2557459f19de9c0adc4903fe9cc33e91", "title": "Auto-Segmentation Framework"}, "250": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py", "hash": "63f9ec9ad178a824297ef6910763054c", "title": "IntVOS: Nearest Neighbor Attention for Video Segmentation"}, "251": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:1-37", "hash": "fe5f2d73afd9f3feaf2ae047c8990c8c", "title": "Pairwise Squared L2 Distance Calculation"}, "252": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:38-60", "hash": "b19bbb7424dbc4c223855bf45d486e82", "title": "Pairwise Distance Calculation for Nearest Neighbor Attention"}, "253": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:61-83", "hash": "174b9e65f2ea8b4b1330e42dcbda2061", "title": "Nearest Neighbor Distance Calculator"}, "254": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:84-113", "hash": "44da1add36d4c314a5c5ea49aaea37b3", "title": "Nearest Neighbor Features Calculation"}, "255": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:114-134", "hash": "31f821243bdc03815587b66602695d3d", "title": "Nearest Neighbor Feature Calculation"}, "256": {"path": 
"/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:135-158", "hash": "6e734d047c8337fbb44d4bffebeb520b", "title": "Split and Apply Chunks"}, "257": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:159-181", "hash": "0912c9485c3d3c7020ad4966950b7f3f", "title": "Nearest Neighbor Features Calculation"}, "258": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:182-201", "hash": "113c348be6d7b9f08055e50f978969b5", "title": "Nearest Neighbor Calculator"}, "259": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:202-224", "hash": "c4fec2ae2220c95feb98633467cee77a", "title": "Nearest Neighbor Tensor Reshaping"}, "260": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:225-252", "hash": "b40dd6b8de6eb3702a355245af35f782", "title": "Squared L2 Distance Calculator"}, "261": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:253-278", "hash": "1593fba60a0eceaec6c9904688064baf", "title": "Nearest Neighbor Features for Video Matching"}, "262": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:279-298", "hash": "4a6709ccc8871206281f9683cbf4bbec", "title": "Nearest Neighbor Feature Extraction"}, "263": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:300-325", "hash": "3fbeb58dddf4412d9630a99ef8c9dd9f", "title": "Local Distance and Offset Masks Calculation"}, "264": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:326-358", "hash": "7fea14a5369b05db4fc897bee1996165", "title": "Feature Extraction and Masking for IntVOS"}, "265": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:359-390", "hash": "8332282b008f261e1a1f1f33f8658713", "title": "Convolutional Neural Network Architecture for Image Processing"}, "266": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:391-418", "hash": "a3f95a70ffe0d39bb5b5f073475f21ef", "title": "Custom CNN Layer for Image Feature Extraction"}, "267": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:421-442", "hash": "8d6aaec9bccc2a59f5cb877523bad8c6", "title": "Split Separable Conv2D Layer"}, "268": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:443-488", "hash": "0d3594cf1c2a61351f58df200e68b73a", "title": "Dynamic Segmentation Architecture"}, "269": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:489-506", "hash": "3f0f26394c24a83c58bcc8927929e5b1", "title": "IntVOS Class Definition"}, "270": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:507-530", "hash": "a5879d4d7149c4e8deccdcc9508d9d03", "title": "Dynamic Semantic Segmentation Network Initialization"}, "271": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:531-559", "hash": "5a2ac31e57ec590da2322a5d0cb2ceae", "title": "IntVOS Model Head: Loss Calculation"}, "272": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:560-588", "hash": "092c55599b43ba142d6d259ee0047b62", "title": "Split-Apply-Combine: Prop SegHead Function"}, "273": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:589-622", "hash": "64e6525c02f6fe1b7140c5a47ebeb5dc", "title": "IntVOS: Prop Segmentation Head"}, "274": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:623-646", "hash": "33865b1737edcf85537c3d0fcc165f28", "title": "IntVOS: Feature 
Embedding Function"}, "275": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:647-665", "hash": "6fcb74f3b0c3d782942da7962b53616f", "title": "Nearest Neighbor Features Calculation"}, "276": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:667-687", "hash": "ecead76d5a97fcafa37a8496d9901fe4", "title": "Sequence Name Check and Update"}, "277": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:688-707", "hash": "903a562eaa23c9efd24a0cf653e193f9", "title": "Nearest Neighbor Features for Previous Frame"}, "278": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:708-724", "hash": "35526a5cf5e6f0235192be7f70df09f9", "title": "Map Dictionaries Check"}, "279": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:725-741", "hash": "4356022621b4baba29436d4882a56ebc", "title": "InterVOS Frame Processing"}, "280": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:742-763", "hash": "9ffe670a0764b558fac8a8ed6147f0f5", "title": "Video Modeling: Local Maps and Interaction Numbers"}, "281": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:764-787", "hash": "75b7dcd36835680270d0e37b3792ee9e", "title": "Defining Int_seghead Function"}, "282": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:788-813", "hash": "6eae7debd6a853926d4909baec28aa3b", "title": "Local Distance Map Calculation in IntVOS"}, "283": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:814-832", "hash": "aaaa59f8477814ab68155f304f10aecb", "title": "Update Global Map with Nearest Neighbors"}, "284": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:833-854", "hash": "bfac055d1154d5560d9b73d870d80362", "title": "Updating Global and Local Maps in Video Model"}, "285": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:856-878", "hash": "85d74f002ae6ee30aee39c5745a86ed5", "title": "Dynamic Object Scene Embeddings"}, "286": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:879-893", "hash": "990b3813da0effae564e4ab5037f5ec5", "title": "Segmentation Prediction in IntVOS"}, "287": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py", "hash": "f5d9e4c0f75f98e9bd5aa84e302ca02a", "title": "Copyright and Imports in PaddleVideo Heads"}, "288": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py", "hash": "dcff2e36669adb0046ec39b94682799c", "title": "Registry Management in PaddleVideo's EIVideo"}, "289": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py:1-27", "hash": "41d34bca0fd20493c29c23c9e82558b4", "title": "Video Component Registry in PaddleVideo's EIVideo"}, "290": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py:28-31", "hash": "9edd7c0fadd8d524fbe4ce151e8019a9", "title": "Four Model Registries for Efficient Video Processing"}, "291": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py", "hash": "f836414d6e4487d7a53064cee688f43b", "title": "Customizable PaddlePaddle Weight Initialization"}, "292": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:1-36", "hash": "a4e39816683d02d3875550fa3302d901", "title": "Weight Initialization Functions"}, "293": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:37-66", "hash": "730facfd4a8dc6061fe7c9ac0aaa9afc", "title": 
"Truncated Normal Weight Initialization"}, "294": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:68-98", "hash": "8e42ab32c1bd99bb5836c8ef7e10e8a0", "title": "Truncated Normal Weight Initialization"}, "295": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:99-130", "hash": "25cc197bad1d611e6b82038e4fc9e86b", "title": "Truncated and Kaiming Normal Weight Init"}, "296": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:131-157", "hash": "eb485f578e2d8c3c7ef45a63f8eb17d0", "title": "Initialize Weights with Normal Distribution"}, "297": {"path": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:158-158", "hash": "48e0b144a4073fff150af40e868b946e", "title": "Random Weight Initialization"}, "298": {"path": "/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py", "hash": "ad4ef0535e4944a400a4949a7ab1ae4e", "title": "Importing Test Model Function"}, "299": {"path": "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py", "hash": "17623cd1b341c0d84c2cc7b1a8422b92", "title": "Multi-Card Model Testing"}, "300": {"path": "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py:1-31", "hash": "c059a19cb2a640db4a273bde3d346912", "title": "Model Testing without Gradient"}, "301": {"path": "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py:32-39", "hash": "9ceac2080fd741ddd93c503a39ea5820", "title": "Multi-card Test Configuration"}, "302": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py", "hash": "11a097247079a44b22b703aec2640924", "title": "PaddleVideo Library Utilities"}, "303": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py", "hash": "f83a439f771717559e12a420144707b5", "title": "Building Objects with Config and Registry"}, "304": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:1-31", "hash": "a54bbce1587d9d975c033d691a75cf89", "title": "Build Object Function"}, "305": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:32-35", "hash": "01e0be5913d2e7da16af7e0dfe5327fc", "title": "Build and Validate Object Class"}, "306": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py", "hash": "4de317a6037cb5b11bc599ffe7dfb554", "title": "Config Parser and Checker"}, "307": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:1-34", "hash": "9723ad71e3e4ac1d7587388959b7a5fc", "title": "Config Utilities Setup"}, "308": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:35-67", "hash": "b4449b3ecf2be557c0f8fa3cfdaca297", "title": "Config Parser and Printer"}, "309": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:68-109", "hash": "6c9628430b2be05dc6492df3dcac1287", "title": "Config Utilities and Visualization Functions"}, "310": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:110-139", "hash": "3517e871e60bffc4faea99c5cc4322af", "title": "Config Override Function"}, "311": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:140-170", "hash": "e8f88ba30cc39aa0b6d9a1bcca98d88a", "title": "Dynamic Config Overrides"}, "312": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:171-174", "hash": "c3c7fc8e31bd05056f4dd79d92cd7ed9", "title": "Check and Print Config"}, "313": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py", "hash": "e0dab607c4f876623c7a502c5b5bdf99", "title": "Distributed Computing Utilities"}, "314": {"path": 
"/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py", "hash": "60ca93f2ed075ed2c8be2ee2f0a09be9", "title": "Customizing PaddleVideo Logging"}, "315": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:1-38", "hash": "9e67c4508db20e6a05edf853dc6e7f9c", "title": "Colorful Logger: Setting Up Colors for Logging Messages in PaddleVideo"}, "316": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:39-71", "hash": "493b6a787bc8f720911dc12b6349e726", "title": "PaddleVideo Logger Initialization"}, "317": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:72-100", "hash": "ded062c3eabbf41a0248b763aa23a846", "title": "Custom Logger Configuration for Python"}, "318": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:101-113", "hash": "bb1b5be6ba13385164d9cc43e870bcf5", "title": "Initialize and Set Logger Level"}, "319": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py", "hash": "89c2984099194a5f8a8befc04a6faa79", "title": "OpenCV-Powered PyTorch Image Processing"}, "320": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1-28", "hash": "84c086b6951787f99f43791353ca023b", "title": "Define Paddle Tensor Type Hints"}, "321": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:29-41", "hash": "067c3de0ccdead594bcf7f03b8068247", "title": "Unstructured Numeric Sequence"}, "322": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:42-54", "hash": "af346b41d910ecd01b05a97645762eee", "title": "Extract Integer List"}, "323": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:55-67", "hash": "1fae030b1c57e734544d077daf7d9553", "title": "List of Integers: Purpose Unclear"}, "324": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:68-105", "hash": "3c23a278c93fb20cd4a5394755404229", "title": "Masked Damager Function"}, "325": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:106-146", "hash": "ea6b1fcff32d44c2f036d051ad194b4f", "title": "Morphology-Based Mask Overlay"}, "326": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:147-172", "hash": "a44999afd8755e746139631009514576", "title": "Generate Overlay Images and JSON List"}, "327": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:173-206", "hash": "e79842394b9d78efd3be1d9cbb068cd0", "title": "Video Frame Loading and Labeling Utility"}, "328": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:207-236", "hash": "be809b41a60f162610642b72e22011ac", "title": "Efficient Scribble Label Processing Functions"}, "329": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:239-272", "hash": "756d573d733e1c6b88bb23ef952f6ef2", "title": "Load and Save Pretrained Model Function"}, "330": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:273-304", "hash": "8be8fd71fbc9379dc02033e7314cf5a0", "title": "Damage Masks Generator"}, "331": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:305-330", "hash": "afc567bb06bf1d200ccd8a8577134fbe", "title": "Mask Damage Utility"}, "332": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:331-361", "hash": "effc9d297128678dc65b16885b3d18d0", "title": "Randomly Shifting Numpy Mask"}, "333": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:362-388", "hash": "a5dc85e260c5dcfbed6e3544cc78a669", "title": "Randomly Manipulating Binary Masks 
for AI Robustness"}, "334": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:389-422", "hash": "22a77b0c9e808307d7e89d5a7858946e", "title": "Binary Mask Rotation and Scaling"}, "335": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:423-466", "hash": "f5a66f73e685909f6046c3eac2241b63", "title": "PaddleVideo Utilities"}, "336": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:467-500", "hash": "bfb27ee7844c02eaba13a425bfb06447", "title": "Tensor to PIL Image Conversion"}, "337": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:501-529", "hash": "6d06f64fe8487a0883e9fd8af67bd048", "title": "Adjusting Image Format for Compatibility"}, "338": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:530-553", "hash": "b844eeadcc338703047f71233c0b6ac0", "title": "Image Mode Validator"}, "339": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:554-578", "hash": "330bcd4eaf9cde0e1b5bda54e7c653e5", "title": "Compatibility Checker"}, "340": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:580-615", "hash": "5ac6ec9ad185d337bab304c1eabb8a3d", "title": "Paddle-Torch Dictionary Converter"}, "341": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:616-640", "hash": "1de600f73d0f8a801e173ea575f9e4c1", "title": "Gradient Norm Clipping Function"}, "342": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:641-666", "hash": "e70a2f4ad62f683ad26fcfd728cafabc", "title": "Total Norm of Parameters Calculation"}, "343": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:667-682", "hash": "0f949c9d2e6cbe47e6aab46e7eb28f2d", "title": "Manet Gradient Scaling"}, "344": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:683-716", "hash": "a263c38a887649c375b9cebd06c7e9f8", "title": "Max Index Gathering with PaddlePaddle"}, "345": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:717-745", "hash": "7da9dd40be0406375200ea1606c0ac00", "title": "Tensor Sampling and Reshaping"}, "346": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:746-774", "hash": "d770ababc2c9e2ecd028eebea080afa7", "title": "Tensor Initialization Functions"}, "347": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:775-806", "hash": "d779d361752a0d50c7bf33164fd573ab", "title": "Normalizing Tensor with PyTorch's Paddle"}, "348": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:807-833", "hash": "cc16f945d925b55f8676c606b48ea0de", "title": "Recommended Gain for Nonlinearity Functions"}, "349": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:834-859", "hash": "0704f9e7d8481e6b27450ccf9372e88d", "title": "Nonlinearity Mapping Function"}, "350": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:862-895", "hash": "3ee61d2991fef01bc0f04808da1d53ee", "title": "Uniform and Normal Tensor Initialization Functions"}, "351": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:896-923", "hash": "2284ce5b27cc865ee67caa70945bdbd5", "title": "Truncated Normal and Constant Tensor Filling"}, "352": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:924-966", "hash": "015866c38564ccb2e1fff4bc2a7202ce", "title": "Tensor Filling Methods: Constant, Ones, Zeros, and Identity Matrix"}, "353": {"path": 
"/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:968-998", "hash": "4277b09ab5b2d456bd064af325b543e0", "title": "Dirac and Identity Tensor Initializers"}, "354": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1000-1028", "hash": "aadc8e5150623c24e76eb31f45f8d1fa", "title": "Manet Initializer Function"}, "355": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1029-1060", "hash": "eb4d50bbe59ce8dc54450c19a202438d", "title": "Xavier Uniform Initialization for Tensors"}, "356": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1062-1089", "hash": "768307d5b90ee8090fe3a948eaf6384f", "title": "Xavier Initialization for Tensors"}, "357": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1091-1120", "hash": "a4da7ac460ce0dc3653a22448eb8e516", "title": "Xavier Normal Distribution Initialization in PyTorch"}, "358": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1121-1145", "hash": "e292484189cbb0b886de494daa70b2bb", "title": "Kaiming Uniform Initialization Function"}, "359": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1146-1170", "hash": "8c26d6c18b08dd94967ccb5172d2266d", "title": "Kaiming Normal Tensor Initialization"}, "360": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1171-1196", "hash": "4d1f70de3c85a86606ead61137c1144b", "title": "Orthogonal Matrix and Kaiming Initialization Functions"}, "361": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1197-1233", "hash": "d96ccb4aca33c0d517f946d3408f23a0", "title": "Sparsity-Ensured Normal Tensor Initialization"}, "362": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1234-1268", "hash": "e20c00acc63771d2b04b582bafdf00a9", "title": "Sparsity-Init Torch Tensor\n(or)\nSparse Torch Tensor Init\n(or)\nTorch Tensor Sparse Initializer"}, "363": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1269-1295", "hash": "bae353059469eba47a1a041eb6f62010", "title": "Deprecating Init Methods: Torch.nn to PaddleVideo"}, "364": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py", "hash": "0e0bbc2243536eb2347f16a7e1e1ef70", "title": "Precise Batch Normalization Improvement"}, "365": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:1-30", "hash": "273850870b51f1af31409be82f67a1da", "title": "Improved Batch Norm in EIVideo"}, "366": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:31-54", "hash": "39e98cb71eca93d4e4662969871e1cf1", "title": "Precise Batch Normalization Statistics"}, "367": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:55-80", "hash": "051d0a84e53d52d28eec3ee62653a616", "title": "Precise BN Training Algorithm"}, "368": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:81-84", "hash": "73667949d3794383b4f20e2cbabf0e8b", "title": "Updating BatchNorm Layers in Model Training"}, "369": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py", "hash": "71fc94521b41663a468987ce6b4a9145", "title": "PaddlePaddle Profiler Initialization"}, "370": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:1-29", "hash": "915c6c3127eea51d9f94940df9dbf54d", "title": "Global Variables and Profiler Options"}, "371": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:30-52", "hash": "4a7ecf8be729828011fcf69a35f68557", "title": "Profiler Options 
Class"}, "372": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:53-76", "hash": "cd6b0eae761d553604279f6dad567c15", "title": "Profile Parser from String"}, "373": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:77-104", "hash": "a1ed4afe806c1caeccd30e95a1e2b959", "title": "Operator-Level Profiling with PaddlePaddle's Profiler"}, "374": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:105-110", "hash": "6bb5b0511e51ed77273e28c6dcf01f41", "title": "Profiler Step Incrementer"}, "375": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py", "hash": "d6b97603e44f68a1d21cc0199c573139", "title": "PaddleVideo Record: Metrics Tracking and Logging"}, "376": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:1-32", "hash": "dfc55d3bc42eafac4e2aa3aee2cc8fce", "title": "Metrics Logger for Paddle Video"}, "377": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:33-49", "hash": "2ccf6902b0407acf7394eb15df6580a0", "title": "Framework-Specific Metric Recording"}, "378": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:50-72", "hash": "366b74e95ee7e7e1e51fcbf8387e5538", "title": "Average Meter for Metrics Tracking"}, "379": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:73-113", "hash": "2283f2f8e7867260af5f8aa64b77f42a", "title": "Tracking Metrics Class"}, "380": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:114-141", "hash": "7942a73e9fcc7a588c964f560b579a16", "title": "Epoch Metrics Logger"}, "381": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:142-157", "hash": "dcb099329731cb2a183018a4c50493ec", "title": "Epoch Metrics Logger"}, "382": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py", "hash": "37f5a552289064dfc46330490dcf80ac", "title": "Registry: Name-to-Object Mapping and Registration"}, "383": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:1-34", "hash": "aa7f2ff72d1c3cea9b0e8c8444bff553", "title": "Registry Class for Custom Modules"}, "384": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:36-70", "hash": "78cbec2e7904bafb22083914eb1e0021", "title": "Object Registry Manager"}, "385": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:71-96", "hash": "496426f6a6b3d703db5356ef28669ea3", "title": "Registry Class and Methods"}, "386": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py", "hash": "6f5248104cb247195c807157bb34f54e", "title": "ViT Adaptor with PaddlePaddle Compatibility"}, "387": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:1-28", "hash": "a509f5b63a5e89a093c3dda01edc9a09", "title": "ViT Model Adaptation for Existing Architecture"}, "388": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:29-49", "hash": "41565172e7e51dfb5528faba907974bf", "title": "Maintaining 'pos_embed' Tensor Consistency"}, "389": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:51-71", "hash": "ae609013da632e5f85398ef43b461d63", "title": "Adjusting Time Embedding Shape"}, "390": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:72-96", "hash": "a550518360b4b02499bf023190ab4205", "title": "Temporal State Dictionary Merge"}, "391": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:97-127", "hash": "8931e5ead2a81e57e80edcef255e88d3", "title": "Loading Pre-trained Model Parameters"}, "392": {"path": 
"/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:128-152", "hash": "cbcb83a36e3b60ae73cf37c412902c32", "title": "Resnet and Vision Transformer Weights Loader"}, "393": {"path": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:153-182", "hash": "4c723f63a295950c383f592b9405f5ec", "title": "Load and Save PaddlePaddle Models"}, "394": {"path": "/applications/EIVideo/EIVideo/paddlevideo/version.py", "hash": "6ca7d799557f98e977461c1e4bfd1c3e", "title": "PaddleVideo Version Info"}, "395": {"path": "/applications/EIVideo/EIVideo/setup.py", "hash": "98b861269945162787fe1380d6b7bf23", "title": "Code Credits and Sources"}, "396": {"path": "/applications/EIVideo/EIVideo/version.py", "hash": "c3ef742ac9b372ae98f46762a5d5bfaa", "title": "EIVideo Version Information"}, "397": {"path": "/applications/EIVideo/QEIVideo/__init__.py", "hash": "5f4b93320e7872df3b7e77431453b36c", "title": "QEIVideo Path and Version"}, "398": {"path": "/applications/EIVideo/QEIVideo/build_gui.py", "hash": "00684c73f9f566bd8f7ffe7cc244020c", "title": "Video GUI with PyQt5: Functionality Overview"}, "399": {"path": "/applications/EIVideo/QEIVideo/build_gui.py:1-36", "hash": "48c48fd458f46770f674bda16f62f949", "title": "PyQt5 GUI Builder Script"}, "400": {"path": "/applications/EIVideo/QEIVideo/build_gui.py:37-59", "hash": "eee56dd45a8c45b92e8bb1960135b56c", "title": "Progress Bar and Play Button Functionality"}, "401": {"path": "/applications/EIVideo/QEIVideo/build_gui.py:60-78", "hash": "a44569ae83612ec209d03680621c4aa0", "title": "Interactive Video Controls: Stop, Start, Select"}, "402": {"path": "/applications/EIVideo/QEIVideo/build_gui.py:79-107", "hash": "f41c84412ea14706d9084c968b465fe2", "title": "Video Processing GUI with Eraser Mode"}, "403": {"path": "/applications/EIVideo/QEIVideo/build_gui.py:109-135", "hash": "445598738d9bcefcc61e489a80102739", "title": "GUI Application Functions"}, "404": {"path": "/applications/EIVideo/QEIVideo/build_gui.py:136-151", "hash": "ae00c1caa32d5e5b7919d3936b1ba2a0", "title": "Update Frame in QEIVideo GUI"}, "405": {"path": "/applications/EIVideo/QEIVideo/gui/__init__.py", "hash": "6831e707cecf1216f8844622dabebd22", "title": "PaddleVideo's EIVideo Copyright Comment Block"}, "406": {"path": "/applications/EIVideo/QEIVideo/gui/demo.py", "hash": "e83f89f70bfd6ac0373ff859d725ce3f", "title": "DrawFrame Class for QT UI"}, "407": {"path": "/applications/EIVideo/QEIVideo/gui/demo.py:1-36", "hash": "6bfff55c5eb106142912c168eca1c19e", "title": "Interactive QWidget Drawing Class"}, "408": {"path": "/applications/EIVideo/QEIVideo/gui/demo.py:39-62", "hash": "aa90ac03139d916aad9f9e72744ddfce", "title": "DemoUI Frame Drawing Initialization"}, "409": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py", "hash": "11bc888ac07e07c627b3160c123671c7", "title": "EIVideo App UI Initialization"}, "410": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:1-32", "hash": "30a2b419d3cf245f2702990ee052c891", "title": "Video Application Main Window Initialization"}, "411": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:33-56", "hash": "bb594f045fd060b4ac44a64d10d6dc2d", "title": "UI Initialization in Video App"}, "412": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:57-77", "hash": "3b5a064653cf561771fa2858e242482f", "title": "GUI Setup for EIVideo"}, "413": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:78-101", "hash": "a423dc2749669e38522b1227b55a2df0", "title": "Painting App UI Setup"}, "414": {"path": 
"/applications/EIVideo/QEIVideo/gui/ui_main_window.py:102-122", "hash": "b2407957451ba06bad7962b10c53c150", "title": "Video Player UI Creation and Interaction"}, "415": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:123-142", "hash": "46cde7caac9308835de20c51955d34c8", "title": "Initializing Push Buttons and Layouts"}, "416": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:143-164", "hash": "9362f62f3802f3d7a2d779ce61be235d", "title": "Creating App's Main UI"}, "417": {"path": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:165-167", "hash": "30e1c17013b9c31648e8b7f7bb4831b2", "title": "GUI Element Updates in MainWindow"}, "418": {"path": "/applications/EIVideo/QEIVideo/start.py", "hash": "e687d4712aa6e455cabdaed11537d3b5", "title": "Launch QEIVideo GUI with Python"}, "419": {"path": "/applications/EIVideo/QEIVideo/tools/__init__.py", "hash": "6831e707cecf1216f8844622dabebd22", "title": "PaddleVideo QEIVideo Comment Block"}, "420": {"path": "/applications/EIVideo/QEIVideo/ui/__init__.py", "hash": "6831e707cecf1216f8844622dabebd22", "title": "EIVideo UI Init File Comment"}, "421": {"path": "/applications/EIVideo/QEIVideo/ui/demo.py", "hash": "56f03d75cd32f0ab7c93c1268f7fb427", "title": "Interactive PyQt5 Video Player UI"}, "422": {"path": "/applications/EIVideo/QEIVideo/ui/demo.py:1-25", "hash": "830d99ce02e389ffb5bcb57ae0290a3c", "title": "PyQt5 UI Generated Main Window Code"}, "423": {"path": "/applications/EIVideo/QEIVideo/ui/demo.py:26-41", "hash": "cd09b65720b1dc03b19afcb112b22f64", "title": "Video Player Interface Setup"}, "424": {"path": "/applications/EIVideo/QEIVideo/ui/demo.py:42-58", "hash": "054ea23803da10b934a7f0d8e0440391", "title": "Creating Video Player Buttons and Slider"}, "425": {"path": "/applications/EIVideo/QEIVideo/ui/demo.py:59-77", "hash": "43c473f9758b6783385a4e21a73d5e6a", "title": "GUI Layout for Video Player Application"}, "426": {"path": "/applications/EIVideo/QEIVideo/ui/demo.py:78-97", "hash": "87067f3a2d7be44ad96aa7c913928b8d", "title": "Creating Tab Widget with QProgressBar and QLabel"}, "427": {"path": "/applications/EIVideo/QEIVideo/ui/demo.py:98-113", "hash": "af10d2288e5e37fb6d942722e84b7577", "title": "QEIVideo UI Configuration"}, "428": {"path": "/applications/EIVideo/QEIVideo/version.py", "hash": "c3ef742ac9b372ae98f46762a5d5bfaa", "title": "EIVideo Version Info"}, "429": {"path": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py", "hash": "f9cc390ff6dbc8c7cf3c9af4f6067161", "title": "PaintBoard: QWidget for Drawing & Erasing"}, "430": {"path": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:1-40", "hash": "e5f09b215fbcaf3472f0bc1640686abf", "title": "PaintBoard: Custom QWidget for Graphic Editing"}, "431": {"path": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:42-78", "hash": "123033653bd622694ec539a253e7558a", "title": "PaintBoard Class Functions"}, "432": {"path": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:80-106", "hash": "fbdbbac9c603805e7dc328483ecbb104", "title": "Mouse Event Handler for PaintBoard Drawing"}, "433": {"path": "/applications/EIVideo/README.md", "hash": "5e2d267957cba8b3187c707c9a03060c", "title": "EIVideo: Windows Video Annotation Tool"}, "434": {"path": "/applications/EIVideo/README.md:1-15", "hash": "2963f34fa2cd3f8181f58f6112fe815f", "title": "Interactive Intelligent Video Annotation Tool"}, "435": {"path": "/applications/EIVideo/README.md:16-49", "hash": "dbbb009fad1bcdbcf5b67cac14f76a1d", "title": "Interactive Video Annotation Toolbox"}, "436": {"path": 
"/applications/EIVideo/README.md:51-85", "hash": "89512fef8bcba4baa9953a5aa0d7de69", "title": "Introducing EIVideo: Customizable Interactive Video Annotation"}, "437": {"path": "/applications/EIVideo/README.md:86-119", "hash": "60b97b9b0a09e9132954a3f7c2d249b2", "title": "QEIVideo Installation and Roadmap Guide"}, "438": {"path": "/applications/EIVideo/README.md:121-123", "hash": "dc2ec17af962645ebebd6b3dac27a7c7", "title": "Emoji and Resource Sources"}, "439": {"path": "/applications/EIVideo/resources/QT/demo.ui", "hash": "c649ed8a427a19065c1bb208b25d4ecf", "title": "Qt Video Demo UI Designer"}, "440": {"path": "/applications/EIVideo/resources/QT/demo.ui:1-44", "hash": "48ab41836da16236cc9a0f01704de954", "title": "Main Window Interface Design"}, "441": {"path": "/applications/EIVideo/resources/QT/demo.ui:45-86", "hash": "939f60dd38274c323cdcacfd2afc8a4b", "title": "UI Design: Video Open Button"}, "442": {"path": "/applications/EIVideo/resources/QT/demo.ui:87-125", "hash": "59f536818ad234ae52bfdf19a2155a0a", "title": "UI Design with Chinese Buttons"}, "443": {"path": "/applications/EIVideo/resources/QT/demo.ui:126-169", "hash": "87751b7fd3fc4aced3c99b8bc9a71ef4", "title": "User Interface Layout Design"}, "444": {"path": "/applications/EIVideo/resources/QT/demo.ui:170-212", "hash": "39a464653554e6400b28e8a819640b2f", "title": "Qt UI Layout Design"}, "445": {"path": "/applications/EIVideo/resources/QT/demo.ui:213-236", "hash": "e9315496ed6101196c778bc94239ba30", "title": "Qt Application User Interface Layout: Demo.ui"}, "446": {"path": "/applications/EIVideo/resources/cmd", "hash": "4af3a4b1d3b5ce1aec92d53c162cd0a7", "title": "Updating EIVideo: PaddleGit Operations"}, "447": {"path": "/applications/FightRecognition/README.md", "hash": "b7f747cf2e8e66bc7a64caad15ace628", "title": "Fight Recognition Model Guide"}, "448": {"path": "/applications/FightRecognition/README.md:1-29", "hash": "70f11f26315c9ea863a4bbbd71006d53", "title": "Fight Recognition with PaddleVideo PP-TSM"}, "449": {"path": "/applications/FightRecognition/README.md:31-55", "hash": "4f185ea312cf5058453d876a2666fd17", "title": "Python Script Executes Fight Prediction Model"}, "450": {"path": "/applications/FightRecognition/README.md:56-75", "hash": "3c7fe054238e8ea9701da3da147d3db7", "title": "Fight Detection Datasets and Training Approach"}, "451": {"path": "/applications/FightRecognition/README.md:77-118", "hash": "ceb2c29db31d118a0f436eaa56b92daf", "title": "Multi-Dataset Fight Recognition Tool"}, "452": {"path": "/applications/FightRecognition/README.md:119-160", "hash": "cfbd5ddf01fda2c9b09e39951e2d9702", "title": "Train and Validate Video Lists Generation"}, "453": {"path": "/applications/FightRecognition/README.md:162-192", "hash": "c46763cd7c3857df49e8f41ad143894f", "title": "Cut Video Function"}, "454": {"path": "/applications/FightRecognition/README.md:193-245", "hash": "7a33e0716c574ae47602c35ae1e3b3bf", "title": "End of Model Training Code Snippet"}, "455": {"path": "/applications/FightRecognition/README.md:246-248", "hash": "91f2eabd399491fc9ce76f1c922e6dae", "title": "Loading and Saving Pre-Trained Model"}, "456": {"path": "/applications/FigureSkating/README.md", "hash": "58966ed550525814944169eab59d1b92", "title": "OpenPose for Figure Skating Analysis"}, "457": {"path": "/applications/FigureSkating/README.md:1-46", "hash": "267f4428f0be8a76f31123b36d99942b", "title": "Figure Skating Action Data Processing with OpenPose"}, "458": {"path": "/applications/FigureSkating/README.md:48-92", "hash": 
"ca42421616fc509bf9c68ebf0a4973ee", "title": "Training Figure Skating Models with Video Data"}, "459": {"path": "/applications/FootballAction/README.md", "hash": "dfa03f2f666c738847d5fdecaaa310b1", "title": "FootballAction Model Improvements in PaddleVideo"}, "460": {"path": "/applications/FootballAction/README.md:1-54", "hash": "3e1ba8bf8c9ddcb64386df8608c9578c", "title": "Soccer Action Detection Algorithm in PaddleVideo"}, "461": {"path": "/applications/FootballAction/README.md:55-118", "hash": "d3e635a0cc79854c81384e651a2b557e", "title": "Football Action Dataset Preprocessing"}, "462": {"path": "/applications/FootballAction/README.md:119-141", "hash": "84bb11d8bd91cd2a118ae1c953fde66d", "title": "Comprehensive FootballAction Dataset Directory"}, "463": {"path": "/applications/FootballAction/README.md:142-201", "hash": "87c3e876cb82a17755d38f3fdc05b447", "title": "Download, Run and Train PP-TSM for FootballAction"}, "464": {"path": "/applications/FootballAction/README.md:202-230", "hash": "2bd47e1415b0ee84f45918f994e1106a", "title": "Updating Recognizer2D and Exporting PP-TSM Model"}, "465": {"path": "/applications/FootballAction/README.md:231-275", "hash": "beee241493fb3ece4c38a1e7fa488541", "title": "Replacing Output Tensor and Extracting Features"}, "466": {"path": "/applications/FootballAction/README.md:276-320", "hash": "3c59f210914e9b1459b0d59a9209dfb1", "title": "BMN Dataset Creation Script"}, "467": {"path": "/applications/FootballAction/README.md:321-362", "hash": "68dcee7efc686721d592d16dfc9ee134", "title": "BMN Model Export and Prediction"}, "468": {"path": "/applications/FootballAction/README.md:363-408", "hash": "a1be3b1503171ddb62c0852563f7031e", "title": "Attention LSTM Improvements in FootballAction"}, "469": {"path": "/applications/FootballAction/README.md:409-441", "hash": "20025ecbb6c6ea36f2f756261ec85329", "title": "LSTM Training Data Snippet"}, "470": {"path": "/applications/FootballAction/README.md:442-493", "hash": "1b25f87fedfbb4824e0184244d2b70a8", "title": "LSTM Training and Prediction Code"}, "471": {"path": "/applications/FootballAction/README.md:494-513", "hash": "940a57a86453088cd9f26023bd4f9bb2", "title": "Improved PP-TSM Model for Football Action Detection"}, "472": {"path": "/applications/FootballAction/checkpoints/download.sh", "hash": "f940b6f312bfef7732b66fb298859b40", "title": "FootballAction Checkpoints Download Script"}, "473": {"path": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list", "hash": "92a97efd512e27af6ed85783bfd54b05", "title": "EuroCup2016 Video Dataset URLs"}, "474": {"path": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:1-11", "hash": "6ccd57fe5b4cde5925c2ba638027b46a", "title": "EuroCup2016 Dataset URLs"}, "475": {"path": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:12-22", "hash": "6eee21af04fda99d7411b063974efda7", "title": "EuroCup2016 Dataset Download URLs"}, "476": {"path": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:23-33", "hash": "4094f2600859cbf9997e683aef122ed5", "title": "EuroCup2016 Video URLs List"}, "477": {"path": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:34-44", "hash": "ee004d61db334d703792070c4422cc06", "title": "EuroCup2016 Dataset URL Listing"}, "478": {"path": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:45-49", "hash": "a34eb5dd74b8ed1a7953245e47aa72f1", "title": "EuroCup2016 Video URLs"}, "479": {"path": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh", 
"hash": "65ab339aaae79826021b462771787af7", "title": "Download EuroCup2016 Videos"}, "480": {"path": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:1-13", "hash": "ca093fbbbd4d014f931342d0ca2a05c0", "title": "Download EuroCup2016 Videos"}, "481": {"path": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:14-24", "hash": "07a3d517280a30bf980d8c81077b0c42", "title": "EuroCup2016 Video Download Script"}, "482": {"path": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:25-35", "hash": "ebd09658ff8b95ae18de7c93e0f260bd", "title": "EuroCup2016 Video Download"}, "483": {"path": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:36-46", "hash": "495201e0218b78ab0a5e0b25eba4e88c", "title": "EuroCup2016 Video Download Script"}, "484": {"path": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:47-51", "hash": "f9fa392aa2eb9b762689268e36d7fcd8", "title": "Download EuroCup2016 Mp4 Files"}, "485": {"path": "/applications/FootballAction/datasets/EuroCup2016/url.list", "hash": "bb1facd61a3eb4355dd3d6337d4bb197", "title": "EuroCup2016 Video URLs"}, "486": {"path": "/applications/FootballAction/datasets/EuroCup2016/url.list:1-26", "hash": "e1be6b544ab601d7aa8905297b4700cc", "title": "FootballAction EuroCup2016 URL List"}, "487": {"path": "/applications/FootballAction/datasets/EuroCup2016/url.list:27-49", "hash": "9d36cb5bff5983fb8952bf3dd2d4015a", "title": "EuroCup2016: FootballAction URLs List"}, "488": {"path": "/applications/FootballAction/datasets/EuroCup2016/url_val.list", "hash": "f1d7e89584a2e3a4285416d8c451c3ac", "title": "Video URL List for EuroCup2016"}, "489": {"path": "/applications/FootballAction/datasets/script/get_frames_pcm.py", "hash": "6e0b648179bcd5e7381f6bc305dedeab", "title": "Parallel FFmpeg Frame and Audio Extraction"}, "490": {"path": "/applications/FootballAction/datasets/script/get_frames_pcm.py:1-37", "hash": "ae54f2492860b747b9e17fb1bd1bf41b", "title": "Extract Frames and PCM Audio from Videos"}, "491": {"path": "/applications/FootballAction/datasets/script/get_frames_pcm.py:38-54", "hash": "275d0148743fcd72733b248616d9d1d8", "title": "Multithreaded MP4 Parser"}, "492": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py", "hash": "d174cb10d11499853563d28f6f1461a5", "title": "BMN Instance Extraction Script"}, "493": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:1-42", "hash": "25a57faf4608a8eb4125a7e0c6d50799", "title": "BMN GT Data Processor Script"}, "494": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:43-69", "hash": "2527b815b98622d054660c79e1fd7e52", "title": "Filtering Actions by Duration"}, "495": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:70-102", "hash": "214a140b0cbfd131b40e760ca9e14b93", "title": "BMN Window GT Data Combination"}, "496": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:103-128", "hash": "40cc973185c3ace062f3791567482d27", "title": "Segmenting Actions with Before/After IDs"}, "497": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:129-147", "hash": "a5c44e165a0ba3c6bd04871066d90bce", "title": "Random Video Segment Selection and Annotation"}, "498": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:148-178", "hash": "471f28b72af5cc3c050cae4feb510e8b", "title": "Saving Features with get_instance_for_bmn"}, "499": {"path": 
"/applications/FootballAction/datasets/script/get_instance_for_bmn.py:180-205", "hash": "6cd433831e2d5ad7626b8f3cfa9c2527", "title": "Reshaping and Concatenating Feature Arrays"}, "500": {"path": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:206-216", "hash": "2c63d989df57d4f379f7017a3ba75cd6", "title": "BMN Data Processing Pipeline"}, "501": {"path": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py", "hash": "9140430e67a018e62064e227f7e4bbdd", "title": "Python Script for Football Dataset Preparation"}, "502": {"path": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:1-44", "hash": "f0f822dbeefed5e1e973c5a36ae30b52", "title": "IoU/IOA Calculator for LSTM Models"}, "503": {"path": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:45-80", "hash": "eff5b98fa2a65c44cc1b52ca4a77d508", "title": "IoU and IOA Comparison Tool"}, "504": {"path": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:82-110", "hash": "c2bbcc3f0afca6a0a249fd40696c8b40", "title": "Evaluate Proposals with IoU Threshold"}, "505": {"path": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:111-130", "hash": "0617a4ade66a624a118fd92581972ac0", "title": "Splitting Datasets for Football Actions"}, "506": {"path": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:132-161", "hash": "65c502aa005d81df362a28b825cb1c2f", "title": "Save Video Features and Labels to Files"}, "507": {"path": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:162-172", "hash": "bc9d48111b50eaae4fbf7c4494d5af78", "title": "Label File Processing Script"}, "508": {"path": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py", "hash": "3d65c4c69e88ce5f4c898ff7a2b2a46b", "title": "Action Detection and Dataset Generation"}, "509": {"path": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:1-38", "hash": "4a9682074e74e7e5b021dd3c15161c38", "title": "Action Instance Extractor"}, "510": {"path": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:39-65", "hash": "5f9d63cf2e91deedba14d0ad18fa52ae", "title": "Generating Positive and Negative Action Instances"}, "511": {"path": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:66-96", "hash": "7333c072491a2d6941e30bc022cdbfb2", "title": "Multiprocessing Dataset Instantiation and Saving"}, "512": {"path": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:97-97", "hash": "a87ca59399a789be7fae7271eb403868", "title": "File Path for Validation List"}, "513": {"path": "/applications/FootballAction/extractor/extract_bmn.py", "hash": "1421dd6382d6576937e8817889a9fd01", "title": "Video Classification and Detection Script"}, "514": {"path": "/applications/FootballAction/extractor/extract_bmn.py:1-49", "hash": "32e16eb189525a8d15d6f382e45421e6", "title": "Video Classification Model with Baidu Cloud"}, "515": {"path": "/applications/FootballAction/extractor/extract_bmn.py:50-83", "hash": "379b349eca67273b8c5964e8d856441f", "title": "Video Feature Extraction and Proposal Prediction"}, "516": {"path": "/applications/FootballAction/extractor/extract_bmn.py:84-91", "hash": "9d7290753ea6dcf51c1dd547e516b221", "title": "JSON Proposal Saver"}, "517": {"path": "/applications/FootballAction/extractor/extract_feat.py", "hash": "7096dd6c857c10e67372da641f4deaa0", "title": "Baidu Cloud Model-based Video Classifier"}, "518": {"path": 
"/applications/FootballAction/extractor/extract_feat.py:1-50", "hash": "bbc23420dd38f741873595c954b3a2e1", "title": "Baidu Cloud Action Video Classifier"}, "519": {"path": "/applications/FootballAction/extractor/extract_feat.py:51-74", "hash": "3bf9e1c01cc935f95769c1636dd1aaaa", "title": "Video Feature Extraction and Conversion"}, "520": {"path": "/applications/FootballAction/extractor/extract_feat.py:75-100", "hash": "8a55605be9cda3c773f63f21f15c324e", "title": "Video Feature Extractor and Classifier"}, "521": {"path": "/applications/FootballAction/predict/action_detect/action.py", "hash": "da861500873f28133b4919691e06c839", "title": "Baidu Cloud Action Detection System using ML"}, "522": {"path": "/applications/FootballAction/predict/action_detect/action.py:1-44", "hash": "984eb62574574c129b35906e19149c52", "title": "Baidu Action Detection System"}, "523": {"path": "/applications/FootballAction/predict/action_detect/action.py:45-71", "hash": "072be86a09859c2accef331de0cfd1f3", "title": "Initialize ModelPredict Object"}, "524": {"path": "/applications/FootballAction/predict/action_detect/action.py:72-103", "hash": "70424839176ad576523e3d3bb96bf50d", "title": "Action Detection Model Initialization"}, "525": {"path": "/applications/FootballAction/predict/action_detect/action.py:104-132", "hash": "a322a71fb862149c692c3bdab4277bfb", "title": "Action Detection Methods and Tracking in Football"}, "526": {"path": "/applications/FootballAction/predict/action_detect/action.py:133-151", "hash": "d9e9fcee7351dbb147a1057a1d4a8fdb", "title": "Configure PPTSM Model and Predict Features"}, "527": {"path": "/applications/FootballAction/predict/action_detect/action.py:152-173", "hash": "06d3df5b0b2de41d2a2af2e14ae95c95", "title": "Video Feature Processing for Action Detection"}, "528": {"path": "/applications/FootballAction/predict/action_detect/logger.py", "hash": "96e8f1f7751ef922ea41922fca4c9f7b", "title": "Custom Logger for News Stripper"}, "529": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py", "hash": "a94bb4e20910e72134245d0b879cca1e", "title": "MFCC-based Action Detection in Football"}, "530": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:1-38", "hash": "4702f0e8cf47583759f7482a6880437b", "title": "MFCC Feature Extraction Algorithm"}, "531": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:39-69", "hash": "6f7f7d0d1b6a1c9e5432fb1212bf4421", "title": "Mel Scale Audio Feature Extraction"}, "532": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:70-90", "hash": "3c30d5a8da69931833fc303f42f83b82", "title": "Extract MFCC Features from Speech Audio"}, "533": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:91-111", "hash": "11ef8eb43f7eacbcd98e306593d4f1fb", "title": "Spectrogram Calculator Function"}, "534": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:112-136", "hash": "125f6bdd50fa4941414901088de62bbf", "title": "MFCC-based Audio Feature Extraction"}, "535": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:137-157", "hash": "ae10f74ac4269ca100f43f95778f849b", "title": "Audio Feature Extraction for Action Detection"}, "536": {"path": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:159-182", "hash": "e5b7e273b3abb97ee7e0e6036cf0e7e4", "title": "Audio Feature Extraction from WAV Files"}, "537": {"path": 
"/applications/FootballAction/predict/action_detect/mfcc/model_config.py", "hash": "574c1392fd9221b3a4d9820af61e3c18", "title": "ModelAudio: Extract, Slice, Predict"}, "538": {"path": "/applications/FootballAction/predict/action_detect/mfcc/vgg_params.py", "hash": "48665dcb4614e556d4cc94b8dcd68918", "title": "VGGish Model Parameters and Configurations"}, "539": {"path": "/applications/FootballAction/predict/action_detect/models/audio_infer.py", "hash": "2bbeef0ddfde34d58b47dfeebfdec54c", "title": "Audio Inference with InferModel"}, "540": {"path": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py", "hash": "074abbe2fbc626716f1de15c2a5a001c", "title": "BMN Infer Action Detection"}, "541": {"path": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:1-37", "hash": "93533251d93f9f2f9e570a281ef3305f", "title": "BMN Infer App Class Definition"}, "542": {"path": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:38-63", "hash": "e02d14513a02c9dfaadacc2737e827dd", "title": "BMN Inference Process"}, "543": {"path": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:64-86", "hash": "ca5814796e1ddd927c31d010d057d788", "title": "Boundary Score Calculator"}, "544": {"path": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:87-111", "hash": "3f94668e726f5bed6178e98f4dc56b2c", "title": "Boundary-Based Mask Selection"}, "545": {"path": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:112-131", "hash": "4fd27d7157428a1de80d44e9195c8aad", "title": "Average-Window Action Detection"}, "546": {"path": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:133-156", "hash": "30f8f60b188935444a6f41d26bf9abdb", "title": "BMN Inference & JSON Saving"}, "547": {"path": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py", "hash": "fbacdaaf91a8cebc44fc38186c2d0a40", "title": "Efficient LSTM Football Action Prediction"}, "548": {"path": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:1-36", "hash": "37088191473f81e93a65076cf01be2f0", "title": "Football Action Inference with PaddlePaddle"}, "549": {"path": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:37-61", "hash": "df8f39306fb9a87cd744036be49f9d0b", "title": "LSTM Model for Video Action Detection"}, "550": {"path": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:62-91", "hash": "68f29a4db96714d062cab1d209d769cf", "title": "LSTM Data Processing and Prediction"}, "551": {"path": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:92-110", "hash": "d8ff49c3b7d4385715b5bde4c2dc8c1a", "title": "LSTM Inferencing for Action Detection"}, "552": {"path": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:111-137", "hash": "df1360af774442a97192ef01f3f42c30", "title": "Initialize InferModel and Load Data"}, "553": {"path": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:138-152", "hash": "4d1e3028252b2a7bc95356cae7d7bed7", "title": "Efficient Action Detection"}, "554": {"path": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py", "hash": "fe973f7c13dbb8712f0ed8dfdfd0a49f", "title": "PPTSM Inference for Football Actions"}, "555": {"path": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:1-38", "hash": "3941d738fe47213a7847b236734ad5ef", "title": "PPTSM Model Inference Class"}, "556": {"path": 
"/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:40-67", "hash": "201ea3bf16089c38227c0c806604b558", "title": "PPTSM Inference Script"}, "557": {"path": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:69-78", "hash": "3e11a570095628a2f08feca8617dfed7", "title": "Football Action Prediction Model"}, "558": {"path": "/applications/FootballAction/predict/action_detect/reader/__init__.py", "hash": "2e9ec402585026301d4b3692fdd50115", "title": "Alphabetical Action Readers"}, "559": {"path": "/applications/FootballAction/predict/action_detect/reader/audio_reader.py", "hash": "0e3ba10a6e651c7cbe1d69febd61976f", "title": "AudioReader for YouTube-8M Dataset"}, "560": {"path": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py", "hash": "94990484dcd22cd9fa9a1800fbd8510b", "title": "BMNINF Reader for Football Action Detection"}, "561": {"path": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:1-49", "hash": "1a3d2a196ebcaca92af9ec63fe6b406e", "title": "BMNINF Reader: FootballAction Data Reader"}, "562": {"path": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:50-73", "hash": "5c43cf56ba560576ffa4050c6838946a", "title": "Bmninf Reader Initialization"}, "563": {"path": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:74-105", "hash": "e66ec537f47dffe9fdb075285b22a99e", "title": "Football Action Detection Reader"}, "564": {"path": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:106-138", "hash": "10fb330de77f6bb66b7fd73a7832f810", "title": "BMNINF Reader Function"}, "565": {"path": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:139-155", "hash": "c35c1c7c108fd1aa616343b4316302cd", "title": "Video Batch Reader for Football Action Detection"}, "566": {"path": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py", "hash": "c2f3c29ec55adda10774455c82248fcd", "title": "Attention-Based LSTM Feature Reader"}, "567": {"path": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:1-33", "hash": "35c4c2542af117ce89f91cdce150b3f1", "title": "Attention-Based LSTM Feature Reader"}, "568": {"path": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:35-71", "hash": "973807570f9f8f9b17ff0b08418255be", "title": "Feature Reader Initialization"}, "569": {"path": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:72-86", "hash": "cbae92eefcffc03da619a285959bff2a", "title": "Multi-Feature Reader"}, "570": {"path": "/applications/FootballAction/predict/action_detect/reader/reader_utils.py", "hash": "ccf03df124e3510d96b3843769ada5b9", "title": "Video Reader Utils"}, "571": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py", "hash": "18ba7580b69661263367fe31df6e5fab", "title": "Threaded TSMINF Reader for Football Action Detection"}, "572": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:1-38", "hash": "4950e12ccb918c5ab75fc7fbd61a11c3", "title": "TSMINF Video Reader Class"}, "573": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:39-64", "hash": "e18d87ba3b935cf217f111f9a1235482", "title": "TSN Video Reader Initialization"}, "574": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:65-97", "hash": "b1cae6f53bc63019d79612de6dcc0396", "title": "Video Image Batch Reader for 
Inference"}, "575": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:98-119", "hash": "d4451cae9992a1e798a333db51ba8e0d", "title": "Multithreaded Video Image Reader"}, "576": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:120-141", "hash": "43d1ef51e6f44af23ca3edb01890f8f4", "title": "Image Data Inference and Transformation"}, "577": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:143-180", "hash": "d7e15106eb6f6907852b266d62166029", "title": "Image Transformation Function"}, "578": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:181-212", "hash": "e5364ca93e2129bd700f6464651526b3", "title": "Image Preprocessing for Football Action Detection"}, "579": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:213-242", "hash": "76b6b9384a581dc488f52b35fd8954cd", "title": "Random Crop with Offset Adjustment"}, "580": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:243-267", "hash": "aa4895f28ce2e1c239791459c6b535be", "title": "Random Crop Sizes for Action Detection"}, "581": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:268-307", "hash": "c4ee2e9e162fcfc049a99e74c38c0e2c", "title": "Image Processing Functions"}, "582": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:308-349", "hash": "75ae96c497211c416c7fe36f380f93cc", "title": "Image Group Manipulation Techniques"}, "583": {"path": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:350-357", "hash": "7aef93cb28b42c72e85fc57a303271f1", "title": "Adaptive Image Resizer"}, "584": {"path": "/applications/FootballAction/predict/action_detect/utils/config_utils.py", "hash": "68c51c32b69a155437b9003c02f6ba68", "title": "Config Utils for Basketball Action"}, "585": {"path": "/applications/FootballAction/predict/action_detect/utils/preprocess.py", "hash": "43307e52ad097b996f3fbd921f6362c7", "title": "FFmpeg Tools for Video Processing"}, "586": {"path": "/applications/FootballAction/predict/action_detect/utils/process_result.py", "hash": "65fe1acf9920ac04515c18bdff134a95", "title": "Action Detection with NMS Filtering"}, "587": {"path": "/applications/FootballAction/predict/eval.py", "hash": "b9872952f5e2c00ca322fafd868bd2a0", "title": "Evaluating Model Performance with F1 Scores"}, "588": {"path": "/applications/FootballAction/predict/eval.py:1-36", "hash": "0b162a0b85b76a232c7b693f5996ed1a", "title": "Initializing Ground Truth Data"}, "589": {"path": "/applications/FootballAction/predict/eval.py:37-67", "hash": "98a3561696ac97ae087ab1e41b0e8e55", "title": "IoU and Proposal Conversion Functions"}, "590": {"path": "/applications/FootballAction/predict/eval.py:68-93", "hash": "dc773e8134c36b6970c18ab9859a4ee0", "title": "Filtered Boxes and Ground Truth Conversion"}, "591": {"path": "/applications/FootballAction/predict/eval.py:94-120", "hash": "807d0d5b53780df588c1d4b2aa9c2f26", "title": "Intersection over Union Evaluation Functions"}, "592": {"path": "/applications/FootballAction/predict/eval.py:121-144", "hash": "a380ce3a5d2738fae8c3785d6e702c9f", "title": "Box IOU Evaluator"}, "593": {"path": "/applications/FootballAction/predict/eval.py:146-161", "hash": "19a874157d39640795ab03fe57f09ebc", "title": "Subtask Precision and Recall Calculator"}, "594": {"path": "/applications/FootballAction/predict/eval.py:162-189", "hash": 
"66b59a8ce7170653b7d82a2cbc40df5d", "title": "FootballAction Prediction Evaluation"}, "595": {"path": "/applications/FootballAction/predict/eval.py:190-218", "hash": "f1c73de3f672b56e952d485317c4400c", "title": "Football Action Prediction Evaluation"}, "596": {"path": "/applications/FootballAction/predict/eval.py:219-237", "hash": "c6ca29857b7ff4eec2eebd96d02a7ef0", "title": "Optimal Threshold Selection"}, "597": {"path": "/applications/FootballAction/predict/predict.py", "hash": "539db312ace0ff2bbfbb0194db289e3f", "title": "Football Action Detection Model Prediction"}, "598": {"path": "/applications/FootballAction/predict/predict.py:1-33", "hash": "8deae072592ee40b516d571f03592666", "title": "Video Action Detection Script"}, "599": {"path": "/applications/FootballAction/predict/predict.py:35-37", "hash": "a9de60b87f9374cf2472d46b2790060d", "title": "JSON Data Output in FootballAction App"}, "600": {"path": "/applications/Ma-Net/README.md", "hash": "0dc4dc4d62f327af9afaf9aa6660ea52", "title": "MA-Net Model for PaddleVideo: DAVIS Dataset Training & Testing"}, "601": {"path": "/applications/Ma-Net/README.md:1-35", "hash": "22b1caf0f175e1d57e9bcadcbf86e85d", "title": "Ma-Net: PaddleVideo's CVPR2020 Implementation"}, "602": {"path": "/applications/Ma-Net/README.md:36-47", "hash": "a5bd787f8fe9f67d7cb41587a222a019", "title": "Run Local Environment Script"}, "603": {"path": "/applications/Ma-Net/README_cn.md", "hash": "4c89171284b6d939068c7b29bc254be7", "title": "Ma-Net\u89c6\u9891\u5206\u5272\u5b9e\u73b0README\uff08\u4e2d\u6587\uff09"}, "604": {"path": "/applications/Ma-Net/config.py", "hash": "63a2c4750a344716b5ed6aa8b7ded441", "title": "Ma-Net Training Setup"}, "605": {"path": "/applications/Ma-Net/config.py:1-32", "hash": "8ed0e8a910135fa0f36f713144080335", "title": "Configuring Ma-Net Parameters"}, "606": {"path": "/applications/Ma-Net/config.py:33-53", "hash": "ae8a0dcb0a3418ddb210c1c67ec12449", "title": "Ma-Net App Config: CLI Arguments"}, "607": {"path": "/applications/Ma-Net/config.py:54-70", "hash": "b028e9662b9f3807ab6db495ead83ba7", "title": "Ma-Net Model Configuration"}, "608": {"path": "/applications/Ma-Net/config.py:71-88", "hash": "a1e34cdaf0e229a7825e04cbcdbaa43c", "title": "Ma-Net Configuration Arguments"}, "609": {"path": "/applications/Ma-Net/config.py:90-96", "hash": "6608ff432d1c0f6939611ff392b2caad", "title": "Default Initialization and Epoch Calculation"}, "610": {"path": "/applications/Ma-Net/dataloaders/DAVIS2017.md", "hash": "fcb159f0202407b0caf3aad997120a17", "title": "DAVIS2017 Dataset Download and Setup"}, "611": {"path": "/applications/Ma-Net/dataloaders/DAVIS2017_cn.md", "hash": "d598ccf7846d8f4583261e89ae6cb9b9", "title": "DAVIS2017 Dataset for Ma-Net"}, "612": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py", "hash": "939e5106072b53132c7bf7ecf67ae3e7", "title": "Data Augmentation for Video Object Detection"}, "613": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:1-35", "hash": "5c2918cc3f6bcb44d728123acd889c34", "title": "Uniform Image Rescaling Class"}, "614": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:36-69", "hash": "379e1036aa45e9ec94f812a0baace987", "title": "Custom Image Resizer Transform"}, "615": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:70-98", "hash": "7b17a7865d7dce72a15ad74ebb0f12c1", "title": "Random Crop with Sufficient Scribble Elements"}, "616": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:99-124", "hash": "35daee6d99f89f9b098f29306a606cf2", 
"title": "Adaptive Image Crop and Resize"}, "617": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:125-154", "hash": "bb49e55ef2af5a38574bf039345ad703", "title": "ScaleNRotate Class for Image Transformations"}, "618": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:155-189", "hash": "d2ca0a7a4f7f9e8d9ae01eae9dbfae1b", "title": "Random Scaling, Rotation, and Warping Transform"}, "619": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:191-229", "hash": "b5d77348b56c539d96ff2e7bf34d46b4", "title": "Data Augmentation Techniques in Ma-Net"}, "620": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:230-261", "hash": "806db4bdd000d17dbc4c29d8260cdd54", "title": "Normalizing and Initializing Custom Scribble Interaction"}, "621": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:262-288", "hash": "6c4f6c5b6a987aa5c54e91d2f02cf115", "title": "Scribble Segmentation with Bresenham"}, "622": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:289-310", "hash": "00e83f9c9497142282bb26a4258fee51", "title": "Generating GT Masks from Scribbles"}, "623": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:312-330", "hash": "1de53d16668d5c5fd856128f624a1e57", "title": "Dilated Mask Annotation Rounds Computation"}, "624": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:331-366", "hash": "88fa76756e9b32749797bda75d7c8d7b", "title": "Ma-Net Data Loader: Video OD Transform"}, "625": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:369-405", "hash": "9aaa44844bc81136e43bb0aa6f2235c3", "title": "Edge Mask Generation in Ma-Net Dataloader"}, "626": {"path": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:406-416", "hash": "8a32cb59d81252860b49b25fff0f3711", "title": "Edge Mask Creation with Parsing Mask"}, "627": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py", "hash": "48b31ebafce84da9d9ffb2870aff5a31", "title": "DAVIS 2017 Dataset Preprocessing for Ma-Net"}, "628": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:1-40", "hash": "2962f6f6efe372f67df5d84bed63c3c2", "title": "DAVIS 2017 Test Data Manager"}, "629": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:41-73", "hash": "6f8d0b1031a2844e08d76017cfcac629", "title": "DAVIS2017 Dataset Initialization and Loading"}, "630": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:74-109", "hash": "1c72174a11406b3de332e071743dc269", "title": "DAVIS 2017 Semantic Segmentation Dataset Loader"}, "631": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:110-135", "hash": "cd9c32e99664a5ab42737662ee90f19d", "title": "File Sequence Extension and Preprocessing"}, "632": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:136-161", "hash": "7aa24bf7b8c447885a9404ce7a084d11", "title": "DAVIS 2017 Dataset Loader Code"}, "633": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:162-186", "hash": "ba43d8c73440ab039025d8bae5115593", "title": "Load Images and Labels from Path"}, "634": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:187-219", "hash": "1213e21ca52bd8e79310e97131be7976", "title": "DAVIS 2017 Video Object Detection Data Loader"}, "635": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:220-244", "hash": "175629ca418f24b5d3ccfa222cae2347", "title": "DAVIS2017 Mask Reader and Dictionary Creation"}, "636": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:246-273", "hash": "72769adb701783caf462b01550f83010", "title": "DAVIS 2017 
Data Loader Initiation"}, "637": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:274-299", "hash": "e92da0b53cb83095d5488c826eb42c2e", "title": "DAVIS 2017 Dataset Loader"}, "638": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:300-320", "hash": "8acf622ba48e80bdf1174f76c06fbd06", "title": "Davis Frame Processing: Loader"}, "639": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:322-344", "hash": "1c53d5272ad02591be7da5f9c751c6dc", "title": "Random Scribble Label Assigner"}, "640": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:345-374", "hash": "155e6e5b88e9797468d5789be8085ab1", "title": "Image Dataloader and Transform"}, "641": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:375-400", "hash": "059d8b0fbc6d1046d33aff4d6a6b3998", "title": "Data Loading Function for Sequence Lists"}, "642": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:401-431", "hash": "ade6c45bc1f2a7a0dfba94599cc552cf", "title": "DAVIS 2017 Dataset Class Definition"}, "643": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:432-456", "hash": "768628968a8ae2f6393b1364886cb057", "title": "DAVIS Dataset Custom Dataloader"}, "644": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:457-485", "hash": "6eda4f5bd481ee22cb071feb3638195b", "title": "Custom Dataloader for Adjacent Frames"}, "645": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:486-506", "hash": "8f48daf95ff5f389b75c3d7aef200969", "title": "Loading Data for Video Sequences"}, "646": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:507-531", "hash": "e4f039961d76f8399995d5a22abb3f8b", "title": "Preparing DAVIS Dataset for Model"}, "647": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:532-562", "hash": "24549bff32e80e6a9673e2f5f8db6569", "title": "Ma-Net Dataset Creator"}, "648": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:563-585", "hash": "dc9bfcab1edfa0fbc926c593cb12bae9", "title": "Update Frame and Scribble Masks in Dataset"}, "649": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:586-610", "hash": "f5bffd0bcd7d3db3ea126d17260facad", "title": "Random JSON Label Dataset Initialization"}, "650": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:611-633", "hash": "fc65f22d21c6992414fe2ed126c01a03", "title": "JSON Parsing and Image Loading"}, "651": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:634-662", "hash": "7928ed8111a86cc9efc8b92d97daf7b0", "title": "Validate Sequence Existence and Preprocess Data"}, "652": {"path": "/applications/Ma-Net/dataloaders/davis_2017_f.py:664-672", "hash": "ddcc60af1aedbea8a052627351b8b66e", "title": "JSON Dataset Preprocessing"}, "653": {"path": "/applications/Ma-Net/dataloaders/helpers.py", "hash": "38ab4b8ec523d174f97e52db60013d3e", "title": "Functions for Tensor to Image Conversion and Model Naming"}, "654": {"path": "/applications/Ma-Net/dataloaders/helpers.py:1-46", "hash": "142ec8018b50a2a1ca6498f5badfcb75", "title": "Image Processing Helpers"}, "655": {"path": "/applications/Ma-Net/dataloaders/helpers.py:47-78", "hash": "6c6dc5678e52ee34c03d2a4c0686fbb5", "title": "Model Name Construction and Image Computation Functions"}, "656": {"path": "/applications/Ma-Net/dataloaders/helpers.py:79-81", "hash": "2fc384721c1d4a164aa142cf0a7c30c3", "title": "Enhancing Background with Dilation"}, "657": {"path": "/applications/Ma-Net/dataloaders/samplers.py", "hash": "36c2d97eb1924090f6b087bd256b1a52", "title": "Random Identity Sampler"}, "658": {"path": 
"/applications/Ma-Net/dataloaders/samplers.py:1-31", "hash": "d1c35953495bdfe7d9abe8395bcfc4b5", "title": "RandomIdentitySampler Class"}, "659": {"path": "/applications/Ma-Net/dataloaders/samplers.py:32-42", "hash": "7c529cae4f9478f861aefcdf8b4a4e4d", "title": "Random Identity Sampler"}, "660": {"path": "/applications/Ma-Net/networks/IntVOS.py", "hash": "3a31fc79235278686ae17f80b652acf8", "title": "Ma-Net: IntVOS Video Segmentation"}, "661": {"path": "/applications/Ma-Net/networks/IntVOS.py:1-42", "hash": "43a41ad08d57a9b71fc51a97e1929912", "title": "Pairwise Distance Calculation in PaddlePaddle Video OD"}, "662": {"path": "/applications/Ma-Net/networks/IntVOS.py:43-65", "hash": "9a08b50b2a3e2d7e98575a60ae8bb983", "title": "Pairwise Distance Calculator"}, "663": {"path": "/applications/Ma-Net/networks/IntVOS.py:66-88", "hash": "3d4f55183bea542fc232c920ac012f48", "title": "K-Nearest Neighbor Search with Padding Distance"}, "664": {"path": "/applications/Ma-Net/networks/IntVOS.py:89-118", "hash": "aaa3908937c802deec5790c9388258bc", "title": "Nearest Neighbor Feature Calculation"}, "665": {"path": "/applications/Ma-Net/networks/IntVOS.py:119-141", "hash": "eaec7f06570612fd5504d7403809dff8", "title": "KNN Search with Chunking"}, "666": {"path": "/applications/Ma-Net/networks/IntVOS.py:142-169", "hash": "c10b8548627d50bc5b987075c012d2a0", "title": "Nearest Neighbor Feature Computation"}, "667": {"path": "/applications/Ma-Net/networks/IntVOS.py:170-186", "hash": "d4ce4ef70dbbb7481d270e5d88ce437c", "title": "Nearest Neighbor Distance Calculation"}, "668": {"path": "/applications/Ma-Net/networks/IntVOS.py:187-211", "hash": "9c9c4e9c1c247396611bf6fd11145c8d", "title": "Nearest Neighbor Features Calculation"}, "669": {"path": "/applications/Ma-Net/networks/IntVOS.py:212-235", "hash": "2fdde311dff5841bbec258b2416f2370", "title": "Local Neighbor Feature Extraction"}, "670": {"path": "/applications/Ma-Net/networks/IntVOS.py:236-261", "hash": "a2d854d9b7142f88923ae93e07c94c4f", "title": "Boundary-Cross Correlation Sigmoid Transpose"}, "671": {"path": "/applications/Ma-Net/networks/IntVOS.py:262-287", "hash": "21e18da7422427208eb1be0a5807bef1", "title": "Pairwise L2 Distances Calculator"}, "672": {"path": "/applications/Ma-Net/networks/IntVOS.py:288-312", "hash": "09fdc574d2599aff4cbd641e0e440697", "title": "Local Downsampling in IntVOS Network"}, "673": {"path": "/applications/Ma-Net/networks/IntVOS.py:313-336", "hash": "a98d7ac77d2a9f0f16f9066ec914cde7", "title": "Sliding Window Distance Calculator"}, "674": {"path": "/applications/Ma-Net/networks/IntVOS.py:337-365", "hash": "4373b023efc7bf4ded4b24f80cb97289", "title": "Spatial Cross-Correlation Sampler"}, "675": {"path": "/applications/Ma-Net/networks/IntVOS.py:366-392", "hash": "651c7ccc6205fa488216a61f9895a3d2", "title": "IntVOS Nearest Neighbor Compute Function"}, "676": {"path": "/applications/Ma-Net/networks/IntVOS.py:393-421", "hash": "fab803b3c9bdd07c41bdcef419551c1c", "title": "Nearest Neighbor Feature Calculation"}, "677": {"path": "/applications/Ma-Net/networks/IntVOS.py:422-454", "hash": "3ff95cae963d0bb52b6a9cebb3fcef16", "title": "Offset Masks and Distance Tensor Calculation"}, "678": {"path": "/applications/Ma-Net/networks/IntVOS.py:455-486", "hash": "e4ad751bbdd23af02a04997f8124aef9", "title": "Residual Block and Segmentation Head for Ma-Net"}, "679": {"path": "/applications/Ma-Net/networks/IntVOS.py:487-513", "hash": "2964406da926ef7a3ad02f9dba987195", "title": "IntSegHead: Segmentation Neural Network"}, "680": {"path": 
"/applications/Ma-Net/networks/IntVOS.py:516-537", "hash": "41c3ec80601615667bd9a089a4c0cd40", "title": "Separable Conv Layer with BatchNorm"}, "681": {"path": "/applications/Ma-Net/networks/IntVOS.py:538-571", "hash": "96ce05d3220f3a23049e21f1cb5f57ab", "title": "Dynamic Segmentation Heads"}, "682": {"path": "/applications/Ma-Net/networks/IntVOS.py:572-589", "hash": "977f598e2315daa24449b19fc755040e", "title": "Initializing Network Architecture Components"}, "683": {"path": "/applications/Ma-Net/networks/IntVOS.py:590-616", "hash": "eb7c95488e93f7f93abfb0fc9471fce9", "title": "Dynamic Segmentation Network Forward Pass"}, "684": {"path": "/applications/Ma-Net/networks/IntVOS.py:617-640", "hash": "f32679e2da61875580ef48fc3a350c45", "title": "Splitting Input, Calling prop_seghead"}, "685": {"path": "/applications/Ma-Net/networks/IntVOS.py:641-664", "hash": "4a37273d727f36170a5612b93cc5b34e", "title": "IntVOS Feature Extraction"}, "686": {"path": "/applications/Ma-Net/networks/IntVOS.py:665-685", "hash": "5b6e12ed617174d762b12991c88d48da", "title": "Interpolated Feature Embedding Extraction"}, "687": {"path": "/applications/Ma-Net/networks/IntVOS.py:686-704", "hash": "f41da18327a4abfe5e54d326106cabb2", "title": "Extracting Nearest Neighbors per Object"}, "688": {"path": "/applications/Ma-Net/networks/IntVOS.py:705-725", "hash": "75a88d4e83697897913e29c9c73c2767", "title": "Check and Update Global Map Embedding"}, "689": {"path": "/applications/Ma-Net/networks/IntVOS.py:726-745", "hash": "8657707ed963d6421016b34d4681f12d", "title": "Nearest Neighbor Feature Extraction for Video Sequences"}, "690": {"path": "/applications/Ma-Net/networks/IntVOS.py:746-764", "hash": "bef6e06e1da1c905345291b8877e5f40", "title": "Sequence Map Initialization and Updating"}, "691": {"path": "/applications/Ma-Net/networks/IntVOS.py:765-781", "hash": "1d06721b43c607a7d85dd68c35012246", "title": "Updating Previous Frame Features"}, "692": {"path": "/applications/Ma-Net/networks/IntVOS.py:782-803", "hash": "e6c8e1590f63d521fcee2dd2c7d222f3", "title": "Frame Feature Handling and Concatenation"}, "693": {"path": "/applications/Ma-Net/networks/IntVOS.py:804-829", "hash": "f3a882a1a72e0d2e791f555e2ffe718a", "title": "int_seghead Function Overview"}, "694": {"path": "/applications/Ma-Net/networks/IntVOS.py:830-853", "hash": "8f7d19c24c8283958d083a056d5c93cf", "title": "Interpolating Ma-Net Scribble Labels"}, "695": {"path": "/applications/Ma-Net/networks/IntVOS.py:854-877", "hash": "321cee970fec50aed86d8f36aea7cb93", "title": "Updating Global and Local Maps: IntVOS.py:854-877"}, "696": {"path": "/applications/Ma-Net/networks/IntVOS.py:878-897", "hash": "81298636241d5f2c0eba48c75425be79", "title": "Updating Distance and Temporary Dictionaries"}, "697": {"path": "/applications/Ma-Net/networks/IntVOS.py:898-921", "hash": "69fb417002a72eaf96d503fca7abf356", "title": "Tensor Operations for Segmentation Model"}, "698": {"path": "/applications/Ma-Net/networks/IntVOS.py:922-927", "hash": "f9305043d0784e17c5cf7c2836e8a5b1", "title": "Transposing Tensor and Storing in Dictionary"}, "699": {"path": "/applications/Ma-Net/networks/aspp.py", "hash": "9622cd5ecf9845613b8a6820a5167685", "title": "ASPP Module: ASPP Pyramid Pooling in Ma-Net"}, "700": {"path": "/applications/Ma-Net/networks/aspp.py:1-34", "hash": "f18da07329fb0ddd3c0da8930ec49c18", "title": "ASPP Module: Hierarchical Atrous Spatial Pooling"}, "701": {"path": "/applications/Ma-Net/networks/aspp.py:35-66", "hash": "fe91812a5d883b336acf9fd0d82502c4", "title": "ASPP Class: 
Building ASPP Network Modules"}, "702": {"path": "/applications/Ma-Net/networks/aspp.py:67-89", "hash": "af51ba4f90eb39433ca8ebf426f40689", "title": "ASPP Modules and Global Average Pooling Layer"}, "703": {"path": "/applications/Ma-Net/networks/aspp.py:90-123", "hash": "48cf61350c8f02a6a57d8e1742239263", "title": "ASPP Module in Ma-Net's CNN"}, "704": {"path": "/applications/Ma-Net/networks/backbone/__init__.py", "hash": "a7d7a1afe9ec012da61c08865efe6a9c", "title": "Build Backbone Networks"}, "705": {"path": "/applications/Ma-Net/networks/backbone/drn.py", "hash": "f32cbebd4a86d78e14dfa736f20cf3c4", "title": "Deep Residual Networks in PaddlePaddle"}, "706": {"path": "/applications/Ma-Net/networks/backbone/drn.py:1-29", "hash": "89b8d61ba199725a4d03988187bf968b", "title": "BasicBlock Class in DRN Network"}, "707": {"path": "/applications/Ma-Net/networks/backbone/drn.py:30-65", "hash": "2d2a04112254cf6396ea28649f0f04cd", "title": "Residual Bottleneck Block"}, "708": {"path": "/applications/Ma-Net/networks/backbone/drn.py:66-103", "hash": "26c397b15152e9ada951da5051411a6e", "title": "Deep Residual Network Model"}, "709": {"path": "/applications/Ma-Net/networks/backbone/drn.py:104-130", "hash": "6690a3c70872b867edd2139e9ad9d043", "title": "DRN Network: Convolutional and Pooling Architecture"}, "710": {"path": "/applications/Ma-Net/networks/backbone/drn.py:131-147", "hash": "d9cdf9306ef9b0e0bf9dba34b29459dc", "title": "DRN Network Architecture"}, "711": {"path": "/applications/Ma-Net/networks/backbone/drn.py:148-170", "hash": "dad11caae23d12f51595b63b7cdad987", "title": "Defining MA-Net Backbone Layers"}, "712": {"path": "/applications/Ma-Net/networks/backbone/drn.py:171-193", "hash": "8dd84cf2b79d8d1ee3d5b52271bddccc", "title": "DRN Network Layer Construction"}, "713": {"path": "/applications/Ma-Net/networks/backbone/drn.py:194-234", "hash": "25dc28a76fee2ba8f0434b17cac0fcbc", "title": "Deep Residual Network Backbone Architecture"}, "714": {"path": "/applications/Ma-Net/networks/backbone/drn.py:236-257", "hash": "1d2eb445bf7f2169482e257ae9f5b328", "title": "DRN_A Class in Ma-Net Backbone"}, "715": {"path": "/applications/Ma-Net/networks/backbone/drn.py:258-279", "hash": "be9ef2810371f8b6061b22ea67b90345", "title": "Creating Layers with _make_layer"}, "716": {"path": "/applications/Ma-Net/networks/backbone/drn.py:281-318", "hash": "d1540d45bbe54885fe7d03045e6b30c9", "title": "DRN Model Functions in Ma-Net Backbone"}, "717": {"path": "/applications/Ma-Net/networks/backbone/drn.py:319-349", "hash": "bb2ece4f4af3fcf7b183b162cddb07ea", "title": "Initializing DRN Models with Pre-Trained Weights"}, "718": {"path": "/applications/Ma-Net/networks/backbone/drn.py:350-380", "hash": "2a868812d7c9d580e1c6e715c6ea7afe", "title": "DRN Model Functions with Configurations"}, "719": {"path": "/applications/Ma-Net/networks/backbone/drn.py:381-400", "hash": "da54ad7145df89ecee74435fe836448b", "title": "DRN Model Definition and Pretrained Weights Loading"}, "720": {"path": "/applications/Ma-Net/networks/backbone/mobilenet.py", "hash": "b606be608209605b950c4f7951683a7e", "title": "Ma-Net: MobileNetV2 Backbone Initialization"}, "721": {"path": "/applications/Ma-Net/networks/backbone/mobilenet.py:1-33", "hash": "f62d8a613ef221fc319bbb8d24f8374a", "title": "MobileNet Network Layer Definition"}, "722": {"path": "/applications/Ma-Net/networks/backbone/mobilenet.py:34-63", "hash": "122b5ec131a8e897234696b8684a4728", "title": "MobileNet Layer Creation: Convolutional Neural Network"}, "723": {"path": 
"/applications/Ma-Net/networks/backbone/mobilenet.py:64-99", "hash": "94186a29945773d9163836a4459a2c10", "title": "MobileNetV2 Model Definition"}, "724": {"path": "/applications/Ma-Net/networks/backbone/mobilenet.py:100-127", "hash": "f35590688a5cbc54028f96b3ba8ba9d3", "title": "MobileNet Backbone for Ma-Net Application"}, "725": {"path": "/applications/Ma-Net/networks/backbone/mobilenet.py:128-157", "hash": "8c0d376cb63e8adc7786392ce73d4c43", "title": "Preparing MobileNet Backbone for Feature Extraction"}, "726": {"path": "/applications/Ma-Net/networks/backbone/mobilenet.py:158-163", "hash": "25d0aeadb7074681a6660d6b001f7966", "title": "Kaiming Normal Init and Batch Norm for Mobilenet"}, "727": {"path": "/applications/Ma-Net/networks/backbone/resnet.py", "hash": "f6e7c78b6ad96a9082e078d9e6eb972b", "title": "ResNet Architecture with Batch Normalization"}, "728": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:1-33", "hash": "1c02f22c10061ae7cdd2962741d64578", "title": "Bottleneck ResNet Backbone Definition"}, "729": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:34-77", "hash": "f1c356add7fb708d4e2ab157793d7b55", "title": "ResNet Architecture Design: BatchNorm, ReLU, Downsample"}, "730": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:78-103", "hash": "033fa26cc28140fa405df14f9ba8ddb0", "title": "Initializing ResNet Backbone: Conv, BN, Pool and Residual Blocks"}, "731": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:104-126", "hash": "418fe45f6c6e672e7999b23dc5b694d8", "title": "ResNet Network with Batch Normalization"}, "732": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:127-157", "hash": "4374b62e3deab7486c5b2d17237e43cc", "title": "ResNet Block Builder Function"}, "733": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:158-186", "hash": "d90194e7e7bc9f1d9a7ea1ae45c82de3", "title": "ResNet Residual Block Builder"}, "734": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:187-220", "hash": "bcb684d0feb36f5e7a4b280363fae55d", "title": "ResNet Network Definition"}, "735": {"path": "/applications/Ma-Net/networks/backbone/resnet.py:221-239", "hash": "c48ee841adb570a9769ad9afac171cf8", "title": "ResNet-101 Model Function"}, "736": {"path": "/applications/Ma-Net/networks/backbone/xception.py", "hash": "ae099b01b6bdebe8213abd6f3bda9559", "title": "AlignedXception Backbone for Image Classification"}, "737": {"path": "/applications/Ma-Net/networks/backbone/xception.py:1-34", "hash": "3eedc32a8db453647da523049e6e6cf1", "title": "Separable Conv Layer with BatchNorm"}, "738": {"path": "/applications/Ma-Net/networks/backbone/xception.py:35-67", "hash": "31dac4fba0728fd3ff28d1bb1af111c2", "title": "Xception Block Layer Initialization and Forward"}, "739": {"path": "/applications/Ma-Net/networks/backbone/xception.py:68-102", "hash": "08373364db83049baaadf56b5e24fa20", "title": "Xception Backbone Network Creation"}, "740": {"path": "/applications/Ma-Net/networks/backbone/xception.py:103-144", "hash": "ee651b0a058221818dc6a37059179db1", "title": "AlignedXception Network Code"}, "741": {"path": "/applications/Ma-Net/networks/backbone/xception.py:145-175", "hash": "25816d18822d3d183d508fc3a648df8c", "title": "AlignedXception Initialization Code"}, "742": {"path": "/applications/Ma-Net/networks/backbone/xception.py:176-201", "hash": "ce6159c58644571350d2f6f4918c6731", "title": "Xception Backbone: Block Architecture"}, "743": {"path": "/applications/Ma-Net/networks/backbone/xception.py:202-225", "hash": 
"52c3834ac3320265499f871b63282c13", "title": "Xception Block Creation and Implementation"}, "744": {"path": "/applications/Ma-Net/networks/backbone/xception.py:226-249", "hash": "db6251f7273f955033b2d03cc360fc89", "title": "Repeated Convolutions and Batch Normalization in Xception"}, "745": {"path": "/applications/Ma-Net/networks/backbone/xception.py:250-273", "hash": "2ab2edf28a83e785b91046b8a5971ca1", "title": "Xception Convolutions and Block Initialization"}, "746": {"path": "/applications/Ma-Net/networks/backbone/xception.py:274-297", "hash": "2d9a36f922db7206c9130a3278fa65bc", "title": "Xception Blocks in Ma-Net's Image Classification"}, "747": {"path": "/applications/Ma-Net/networks/backbone/xception.py:298-323", "hash": "fb992dacdfa26e5d8dccd6f666f470ed", "title": "Xception Block Configurations"}, "748": {"path": "/applications/Ma-Net/networks/backbone/xception.py:324-348", "hash": "255f0eea67b6e6a94acc67c95e7320c3", "title": "Xception: Separable Conv Layers"}, "749": {"path": "/applications/Ma-Net/networks/backbone/xception.py:349-390", "hash": "5dc5096965651dfbbecae6e11743f4ac", "title": "Xception Network Architecture"}, "750": {"path": "/applications/Ma-Net/networks/backbone/xception.py:391-427", "hash": "4d5f6796cba8273ef56c252d9bb03e5a", "title": "Xception Model: Neural Network for Image Classification"}, "751": {"path": "/applications/Ma-Net/networks/backbone/xception.py:429-447", "hash": "5f6f126f4ea0557a37ee9345fcb6f748", "title": "Updating Pre-trained Xception Model Weights"}, "752": {"path": "/applications/Ma-Net/networks/backbone/xception.py:448-455", "hash": "a68680e7662b63dd2bfcb6910b148b71", "title": "Renaming Conv and BN Parameters"}, "753": {"path": "/applications/Ma-Net/networks/decoder.py", "hash": "ed74fcfb39997669136e6a4fcc6cb011", "title": "Decoder Network Construction"}, "754": {"path": "/applications/Ma-Net/networks/decoder.py:1-32", "hash": "2526014e4bbe8e9cf34cca4db5ed423e", "title": "Decoder Layer for Feature Classification"}, "755": {"path": "/applications/Ma-Net/networks/decoder.py:33-62", "hash": "0aa76fd43ce935081179c02c1125296b", "title": "Decoder Network Architecture"}, "756": {"path": "/applications/Ma-Net/networks/decoder.py:65-66", "hash": "809d980da27ef420a28de0b2a333e562", "title": "Build Decoder Network Function"}, "757": {"path": "/applications/Ma-Net/networks/deeplab.py", "hash": "090bd2225f11dfb59b1ecbb10813584f", "title": "Freezing Batch Norm Layers in DeepLab"}, "758": {"path": "/applications/Ma-Net/networks/deeplab.py:1-31", "hash": "7312d7a1473d01731f9e93b6f4d10719", "title": "Frozen Batch Normalization for DeepLab"}, "759": {"path": "/applications/Ma-Net/networks/deeplab.py:32-64", "hash": "d87dce98aab0fd2a15106b799bc2eec0", "title": "DeepLab Class Definition"}, "760": {"path": "/applications/Ma-Net/networks/deeplab.py:65-81", "hash": "5d0904b985eedaa44fac5025f51325b3", "title": "Get ConvBN Layers' Parameters"}, "761": {"path": "/applications/Ma-Net/networks/loss.py", "hash": "5e764f7f30349f69e97ddddaeb510ab6", "title": "Custom Loss Function for Image Classification"}, "762": {"path": "/applications/Ma-Net/networks/loss.py:1-28", "hash": "a6af1f58c3f795a5deb42b0a0259332f", "title": "Custom BCE Loss Function"}, "763": {"path": "/applications/Ma-Net/networks/loss.py:29-44", "hash": "e2267a59e232d140e4a09a175c69c270", "title": "Hard Example Mining Loss"}, "764": {"path": "/applications/Ma-Net/networks/loss.py:45-67", "hash": "a7985ae7ed98f2cda5cc94ce40573178", "title": "Custom Loss Function with Hard Example Mining"}, "765": {"path": 
"/applications/Ma-Net/networks/loss.py:68-87", "hash": "c470740a5dcca8412174f31a44b72a5d", "title": "Top K Percent Pixel Loss"}, "766": {"path": "/applications/Ma-Net/networks/loss.py:88-109", "hash": "21eba121c08cd7c5b215bc08b72076d9", "title": "Hard Example Mining and Top-k Pixel Selection Loss"}, "767": {"path": "/applications/Ma-Net/networks/loss.py:110-130", "hash": "d22de70faaaaa0d5e529f77272da0503", "title": "Hard Example Mining Loss Function"}, "768": {"path": "/applications/Ma-Net/networks/loss.py:131-148", "hash": "83467e50f3b63063c1ccef61691dc894", "title": "Weighted Hard Example Mining Loss"}, "769": {"path": "/applications/Ma-Net/networks/loss.py:149-153", "hash": "80d4bf09d5ca72bab028ed1336a34c06", "title": "Top-k Mean Loss Calculation"}, "770": {"path": "/applications/Ma-Net/run.sh", "hash": "d896f29116237312216d0c2af4601d57", "title": "DeeplabV3_coco DAVIS Dataset Training and Testing"}, "771": {"path": "/applications/Ma-Net/run.sh:1-13", "hash": "830372ab50d564c65554c83ce06cd5e3", "title": "Train DeeplabV3 on DAVIS Dataset"}, "772": {"path": "/applications/Ma-Net/run.sh:13-15", "hash": "7aab5abd7565a5cc51c4949b811d8cc3", "title": "Testing Video Object Segmentation"}, "773": {"path": "/applications/Ma-Net/test.py", "hash": "79f21711edb5a2f29b2d2d96eb55bafb", "title": "DAVIS2017 Video Object Detection with PaddlePaddle"}, "774": {"path": "/applications/Ma-Net/test.py:1-39", "hash": "aa4dbd01451c8217220d30d942fd09bc", "title": "Data Preprocessing for DAVIS2017"}, "775": {"path": "/applications/Ma-Net/test.py:40-62", "hash": "383f1173c5275b320794e2dfef1d8ea3", "title": "Video Analysis Configuration Loading"}, "776": {"path": "/applications/Ma-Net/test.py:63-87", "hash": "85d5244dda8169a7e0310dd4a6bed25c", "title": "Preparing Image Dictionary for Model Training"}, "777": {"path": "/applications/Ma-Net/test.py:88-113", "hash": "62b3bc7b623e37e5a16214bceaf6c386", "title": "Interactive Session Initialization"}, "778": {"path": "/applications/Ma-Net/test.py:115-139", "hash": "e60eb8bf957da6439833d2a15896e802", "title": "Scribble Sequence Retrieval and Memory Initialization"}, "779": {"path": "/applications/Ma-Net/test.py:140-163", "hash": "7c5760d6139559b74c5eb8db6f0c93c6", "title": "Interaction Detection Code: File Writing and Embedding"}, "780": {"path": "/applications/Ma-Net/test.py:164-182", "hash": "4d38126eae4403bbc6e1a9296497d686", "title": "Extracting and Concatenating Embeddings"}, "781": {"path": "/applications/Ma-Net/test.py:183-203", "hash": "01607597a7091320343ec7f1dab4cd12", "title": "Scribble Labeling in Ma-Net"}, "782": {"path": "/applications/Ma-Net/test.py:204-224", "hash": "625512be4ba6a34b1ae3a496b0fb3d6f", "title": "Save Scribble Image with Palette"}, "783": {"path": "/applications/Ma-Net/test.py:226-244", "hash": "6f14c868166de4ea3c4740cca07e69a3", "title": "Segmentation Model Initialization"}, "784": {"path": "/applications/Ma-Net/test.py:245-262", "hash": "70d19b1a40b1357fe1295cd2ee8dff9e", "title": "Ma-Net Labeling: Predict, Resize, Max"}, "785": {"path": "/applications/Ma-Net/test.py:263-279", "hash": "61b781a174007c9b054f6887d9edf944", "title": "Save Interactive Video Frame as Labeled Image"}, "786": {"path": "/applications/Ma-Net/test.py:280-298", "hash": "1e1671ece6195a63531eb1d755338df8", "title": "Video Object Segmentation Algorithm with Pre-trained Model"}, "787": {"path": "/applications/Ma-Net/test.py:299-318", "hash": "0d94c1b01ac659a2961c61750ad36e7c", "title": "Function Call with Multiple Args and Interpolation"}, "788": {"path": 
"/applications/Ma-Net/test.py:320-338", "hash": "7916a5777a454aae04120c4964f50064", "title": "Image Saving for Prediction Labels"}, "789": {"path": "/applications/Ma-Net/test.py:339-356", "hash": "9f67ff3fceb3b70b64dd6a701c147435", "title": "Folder and Image Saving Reset"}, "790": {"path": "/applications/Ma-Net/test.py:357-374", "hash": "6db24a5477d228a0e04c052a83a85bb0", "title": "Video Object Detection with PaddlePaddle's Prop Seghead"}, "791": {"path": "/applications/Ma-Net/test.py:375-394", "hash": "88c4b123c89d6ab2042a8ae1fd822c38", "title": "Dynamic Object Detection and Classification"}, "792": {"path": "/applications/Ma-Net/test.py:395-412", "hash": "7ecac88445c657900bd596f92720427c", "title": "Save Image in Directory Structure"}, "793": {"path": "/applications/Ma-Net/test.py:413-436", "hash": "9b80fe136b13a7293e6badcce9ba8f11", "title": "Interactive Image Classification System"}, "794": {"path": "/applications/Ma-Net/test.py:437-468", "hash": "f047dc5e22660bc663428f6be5f62eed", "title": "Filtering Scribble Labels in Ma-Net"}, "795": {"path": "/applications/Ma-Net/test.py:469-485", "hash": "ef4d65471a715ff115ef66990e722a43", "title": "75 Colors Palette Definition"}, "796": {"path": "/applications/Ma-Net/test.py:486-498", "hash": "fc33b83bdb94e2fb6969714e718f495f", "title": "List of Sequential Numbers"}, "797": {"path": "/applications/Ma-Net/test.py:499-511", "hash": "d33358abf28d0dc4a17b550a87da627a", "title": "Incrementing Loop"}, "798": {"path": "/applications/Ma-Net/test.py:512-525", "hash": "74c6c8636dc7d7ea6c21babe96dd4dd2", "title": "Enigmatic Numerical Sequence"}, "799": {"path": "/applications/Ma-Net/train_stage1.py", "hash": "02f4513405043b84bf5ac11a7b50a2c0", "title": "Ma-Net Video Detection Training"}, "800": {"path": "/applications/Ma-Net/train_stage1.py:1-34", "hash": "247114078616f3fb855af0d5fd5abc21", "title": "Train Stage 1: Ma-Net Setup"}, "801": {"path": "/applications/Ma-Net/train_stage1.py:35-59", "hash": "9ef7d1dfd1f77f9a81fb13d9b890050b", "title": "Training Ma-Net in Stage 1"}, "802": {"path": "/applications/Ma-Net/train_stage1.py:61-87", "hash": "2cce2ce93f41cfc6f23def0c96299fbd", "title": "Training Stage: Ma-Net Model Initiation"}, "803": {"path": "/applications/Ma-Net/train_stage1.py:88-114", "hash": "1d1b997f0390ba977cdcf94303a63567", "title": "Dataset Preparation and Training Setup"}, "804": {"path": "/applications/Ma-Net/train_stage1.py:116-144", "hash": "0f3f418ffa2a924249e8b48e17757f0d", "title": "Model Resumption and Training"}, "805": {"path": "/applications/Ma-Net/train_stage1.py:145-172", "hash": "e24d908dc68f182b2e855c07a6efb442", "title": "Preparing Input Data for Model Training"}, "806": {"path": "/applications/Ma-Net/train_stage1.py:173-194", "hash": "5a213989a60f4ef86ec99a1d9c6db99c", "title": "Initialize Label and Object Dictionaries"}, "807": {"path": "/applications/Ma-Net/train_stage1.py:195-217", "hash": "0ca59bc15db8371176d79b2ad7fc11f1", "title": "Video Object Detection Model Training: Stages and Loss Functions"}, "808": {"path": "/applications/Ma-Net/train_stage1.py:218-240", "hash": "b6454ffdeed21b49e831cf50be43ba9b", "title": "Image Comparison and Normalization"}, "809": {"path": "/applications/Ma-Net/train_stage1.py:241-266", "hash": "e6c37541807ba0f4ef378ada2244559b", "title": "Sigmoid Binary Cross-Entropy Masks"}, "810": {"path": "/applications/Ma-Net/train_stage1.py:267-286", "hash": "53c43969a258dadfd0e57bd98979c4a8", "title": "Loading and Preparing Test Datasets"}, "811": {"path": "/applications/Ma-Net/train_stage1.py:287-306", 
"hash": "e7bdcc71c2124d8600cfb83a94bc7ced", "title": "Paddle Data Loader for Test Samples"}, "812": {"path": "/applications/Ma-Net/train_stage1.py:307-326", "hash": "25b14b6bc5aa619326e7a225e95a8ca7", "title": "Feature Extraction and Model Prediction"}, "813": {"path": "/applications/Ma-Net/train_stage1.py:327-348", "hash": "b414e0ad5a3711337abbe88e7d98a8c4", "title": "Frame-by-frame Prediction Saving Function"}, "814": {"path": "/applications/Ma-Net/train_stage1.py:349-378", "hash": "1e1036cc47f0d30e8d8884d1092499e8", "title": "Training Ma-Net with Adaptive Learning Rate"}, "815": {"path": "/applications/Ma-Net/train_stage1.py:379-391", "hash": "845a6c121c94382c0dfc8204cb625294", "title": "RGB Object Values List"}, "816": {"path": "/applications/Ma-Net/train_stage1.py:392-404", "hash": "5889eb2ff5f53af9ca89333e7212004e", "title": "Sequence Numbers in Ma-Net's train_stage1.py"}, "817": {"path": "/applications/Ma-Net/train_stage1.py:405-417", "hash": "f7363eaafbbe04bfd4136665bcf6e71b", "title": "Image Sequence Codes"}, "818": {"path": "/applications/Ma-Net/train_stage1.py:418-429", "hash": "b8946d12ee630a54a3990949fbb1ace7", "title": "Training Manager's Code and Function Call"}, "819": {"path": "/applications/Ma-Net/train_stage2.py", "hash": "bfa00ca6ce8dadc0fdc22c9cab75f6b9", "title": "Training Ma-Net Stage 2 with Learning Rates"}, "820": {"path": "/applications/Ma-Net/train_stage2.py:1-34", "hash": "fba0118827c25bc1d7fe94334898e948", "title": "Initialize Environment for Training"}, "821": {"path": "/applications/Ma-Net/train_stage2.py:35-61", "hash": "26bfa3c523b3335a07359e5826d788f2", "title": "DataLoader Initialization and Configuration"}, "822": {"path": "/applications/Ma-Net/train_stage2.py:62-89", "hash": "021462bd8cd2a4010fa57426b6e7b01a", "title": "Initialize Manager Object for VOS Training"}, "823": {"path": "/applications/Ma-Net/train_stage2.py:91-119", "hash": "415ce0c868fb23602d898da0323e4cad", "title": "Train Stage 2: Ma-Net Model Init & Optimization"}, "824": {"path": "/applications/Ma-Net/train_stage2.py:120-145", "hash": "d8a3eff369d68eabda88552b7c5b957a", "title": "Ma-Net: Training Stage 2"}, "825": {"path": "/applications/Ma-Net/train_stage2.py:146-170", "hash": "62e960b6bd8a2c039b5ed9d7005a4b28", "title": "Model Resuming and Training Loop"}, "826": {"path": "/applications/Ma-Net/train_stage2.py:171-191", "hash": "f4c1de6e229928461bdbbc284406a020", "title": "Dataset Initialization and Training Loop"}, "827": {"path": "/applications/Ma-Net/train_stage2.py:192-212", "hash": "4ad9e3547275c23894cc91ee3088addb", "title": "Training Stage 2: Setting Up Model and Feature Extraction"}, "828": {"path": "/applications/Ma-Net/train_stage2.py:213-229", "hash": "7853cbb365be2042e0838f2eac9cabd7", "title": "Image Classification Code Snippet Initialization"}, "829": {"path": "/applications/Ma-Net/train_stage2.py:230-247", "hash": "9ef7ccc8b052be36dfd8c0e57780eb48", "title": "Initialize and Process Sequences"}, "830": {"path": "/applications/Ma-Net/train_stage2.py:248-265", "hash": "dbf9c6e11afd0f3a79b19b7908929550", "title": "Label and Object Dictionary Handling"}, "831": {"path": "/applications/Ma-Net/train_stage2.py:266-287", "hash": "73978bc0d5faf2f96743dfa138403939", "title": "Training Stage 2: Updates and Visualizations"}, "832": {"path": "/applications/Ma-Net/train_stage2.py:288-306", "hash": "e0e17970d910cf4ca5f419985ca5adf7", "title": "Label and Prediction Visualization"}, "833": {"path": "/applications/Ma-Net/train_stage2.py:307-324", "hash": "db0adacc207e055a7e9d0b99dc154c22", 
"title": "Segmenting Image with Binary Cross-Entropy"}, "834": {"path": "/applications/Ma-Net/train_stage2.py:325-350", "hash": "c3ab7317d6329d02b4b8b9791f984487", "title": "Save Network at Intervals During Training"}, "835": {"path": "/applications/Ma-Net/train_stage2.py:351-367", "hash": "1caf4b6a98c64089422ad313e55d368c", "title": "Training Stage 2: Data Loader Setup"}, "836": {"path": "/applications/Ma-Net/train_stage2.py:368-386", "hash": "381382c1f40623e0ba23874016b6c3f1", "title": "Scribble Labeling and Image Processing in Stage 2"}, "837": {"path": "/applications/Ma-Net/train_stage2.py:387-406", "hash": "24d645c5f8201b471908bbe721a15c12", "title": "Model Training: Concatenating Labels and GPU Check"}, "838": {"path": "/applications/Ma-Net/train_stage2.py:407-423", "hash": "f910388fb870b0f6704fd7bc2b5a8f2b", "title": "Interpolated Image Classification with Interactor"}, "839": {"path": "/applications/Ma-Net/train_stage2.py:424-439", "hash": "3a298bfdd82751fdcffc8dd6d58b3504", "title": "Resizing and Updating Image Labels"}, "840": {"path": "/applications/Ma-Net/train_stage2.py:441-462", "hash": "1e5e55fa6b9ece8f565453254034fb28", "title": "Round-Based Video Model Training"}, "841": {"path": "/applications/Ma-Net/train_stage2.py:464-481", "hash": "e411b67fbad1027af723f881045e195b", "title": "Training Stage 2: Ma-Net Data Preparation"}, "842": {"path": "/applications/Ma-Net/train_stage2.py:482-500", "hash": "839e54dc602dcaee200ca6844aa3e8ae", "title": "Train Dataset Update and Model Training Progress"}, "843": {"path": "/applications/Ma-Net/train_stage2.py:501-525", "hash": "4692c0ff8d2935bcd8f05497d2b48c45", "title": "Efficient ROI Operation for Scribble Labels"}, "844": {"path": "/applications/Ma-Net/train_stage2.py:526-556", "hash": "7d6bd9b417cb91972ceb4cfed7ede998", "title": "Training Stage 2: Load, Train, Save Network"}, "845": {"path": "/applications/Ma-Net/train_stage2.py:557-573", "hash": "742f3f55103f9d2589c0171a6cbe72da", "title": "RGB Palette Generation Code"}, "846": {"path": "/applications/Ma-Net/train_stage2.py:574-586", "hash": "7fabf8081bd1e069a8086cee3e8fb32c", "title": "List of Numbers (81-150)"}, "847": {"path": "/applications/Ma-Net/train_stage2.py:587-599", "hash": "16d7a66846811affbcc0c9013c3e7c6f", "title": "Code Purpose Unclear"}, "848": {"path": "/applications/Ma-Net/train_stage2.py:600-612", "hash": "19fa6b33ff43e48d16ee92d6e98f5d40", "title": "Manager Training with Image Dimensions"}, "849": {"path": "/applications/Ma-Net/utils/api.py", "hash": "79c6bc610759cd63f9246845d88f1c04", "title": "Universal Tensor Utility API"}, "850": {"path": "/applications/Ma-Net/utils/api.py:1-49", "hash": "10fdd858fa5544f91dfb6e86f8dc9e62", "title": "Utility Functions for PyTorch-Paddle Conversion"}, "851": {"path": "/applications/Ma-Net/utils/api.py:50-83", "hash": "7b8d696a42772fe37b6ce4a485b1dd00", "title": "Tensor and Image Conversion Utilities"}, "852": {"path": "/applications/Ma-Net/utils/api.py:84-112", "hash": "0ce2ee51c32d331c5a10a1c3c5ae05b9", "title": "Compatibility Check: Adjust and Convert Image Data Types"}, "853": {"path": "/applications/Ma-Net/utils/api.py:113-136", "hash": "84e871b0f587545503de4a1b2c9c6b07", "title": "Mode Validator for Image Data Types"}, "854": {"path": "/applications/Ma-Net/utils/api.py:137-161", "hash": "95f3964351457a0b14835412a2a0939a", "title": "Verify Image Mode and Data Type"}, "855": {"path": "/applications/Ma-Net/utils/api.py:163-198", "hash": "6ed4298200728c9841a9a8b8972ba12e", "title": "Identity Class and Data Conversion 
Function"}, "856": {"path": "/applications/Ma-Net/utils/api.py:199-223", "hash": "3cc5269478864e350956e187182a308c", "title": "Gradient Norm Clipping Function"}, "857": {"path": "/applications/Ma-Net/utils/api.py:224-250", "hash": "b47d79e16e4bc0461c3741892db6b979", "title": "Max Absolute Value Finder"}, "858": {"path": "/applications/Ma-Net/utils/api.py:251-274", "hash": "fb07933157997624b1be215cd2a5f5d0", "title": "Ma-Net: Non-finite Parameter Clipping"}, "859": {"path": "/applications/Ma-Net/utils/api.py:275-307", "hash": "897024f9f654e8599d86fe40602cb42a", "title": "Maximum Value and Index Extractor"}, "860": {"path": "/applications/Ma-Net/utils/api.py:308-338", "hash": "005d904add4218a73915be62110ede07", "title": "Weight Initialization without Gradient Calculation"}, "861": {"path": "/applications/Ma-Net/utils/api.py:339-364", "hash": "f47c67e34ba7aceebe5d809a1294f60e", "title": "Truncated Normal Initialization"}, "862": {"path": "/applications/Ma-Net/utils/api.py:366-398", "hash": "2a37231df861e96e489f18bc0c773bf0", "title": "Tensor Transformations and Nonlinearity Gains"}, "863": {"path": "/applications/Ma-Net/utils/api.py:399-425", "hash": "8f99984a592e6a5e24adc839a90ddf74", "title": "Gain Calculator for Non-linear Functions"}, "864": {"path": "/applications/Ma-Net/utils/api.py:426-454", "hash": "4492313731135deb8e6215e19123f981", "title": "Initializing Tensor Distributions"}, "865": {"path": "/applications/Ma-Net/utils/api.py:455-483", "hash": "bef1fb328872f8bc266f409aad4cbfc5", "title": "Truncated Normal Tensor Initialization"}, "866": {"path": "/applications/Ma-Net/utils/api.py:484-526", "hash": "c1ccfbb813f0a00d2768522a62f6e97f", "title": "Initializing Tensor Functions in PyTorch"}, "867": {"path": "/applications/Ma-Net/utils/api.py:528-562", "hash": "4adbc7f9986c16394ae5e9065245f680", "title": "Preserving Identity in Linear and Conv Layers: Functions"}, "868": {"path": "/applications/Ma-Net/utils/api.py:563-592", "hash": "d90bf167683790f36ad98a43c0adc44c", "title": "Convolutional Layer Weights Init with Dirac Delta"}, "869": {"path": "/applications/Ma-Net/utils/api.py:593-627", "hash": "4312f41468cbe365ee37424254567b15", "title": "PaddlePaddle Tensor Utilities"}, "870": {"path": "/applications/Ma-Net/utils/api.py:628-655", "hash": "174d309192f6eb1088f7dbb3b4f3283a", "title": "Glorot Initialization in Ma-Net API"}, "871": {"path": "/applications/Ma-Net/utils/api.py:656-687", "hash": "920387d05af528640d87af0dc8b8a15a", "title": "Xavier/Glorot Tensor Initialization"}, "872": {"path": "/applications/Ma-Net/utils/api.py:688-709", "hash": "3cb27420b9a233bb332139c853e174e0", "title": "Uniform Tensor Filler"}, "873": {"path": "/applications/Ma-Net/utils/api.py:710-732", "hash": "56fff455f052e13471843b333c663e34", "title": "Kaiming Uniform Initialization in PyTorch"}, "874": {"path": "/applications/Ma-Net/utils/api.py:733-758", "hash": "0f9cf0839acf33cf4eef4fa65727619d", "title": "Kaiming Weight Initialization"}, "875": {"path": "/applications/Ma-Net/utils/api.py:759-789", "hash": "e50e1c1ae4e2273d450dfab0661b9767", "title": "QR Factorization of Tensors"}, "876": {"path": "/applications/Ma-Net/utils/api.py:790-822", "hash": "32e967cd7c19ec86b95825c181ab8264", "title": "QR Decomposition and Scaling"}, "877": {"path": "/applications/Ma-Net/utils/api.py:824-857", "hash": "133edd935d14c2e04d1e18b6363dadb2", "title": "Kaiming Normal Initializer Function"}, "878": {"path": "/applications/Ma-Net/utils/mask_damaging.py", "hash": "83ca8c2ef29bd52648f71310f469bd96", "title": "Mask Damager: 
Rotation and Translation in PaddleVideo"}, "879": {"path": "/applications/Ma-Net/utils/mask_damaging.py:1-36", "hash": "86b0b04daf039faea79a6540bb08ff2d", "title": "Mask Damager: Random Transformations for Labels"}, "880": {"path": "/applications/Ma-Net/utils/mask_damaging.py:37-73", "hash": "c8cb33c5ee71c1dbbde6e1bf052dc032", "title": "Mask Damaging Functions"}, "881": {"path": "/applications/Ma-Net/utils/mask_damaging.py:74-98", "hash": "af71abbffa022d66390ab221e0963fad", "title": "Mask Damaging Function"}, "882": {"path": "/applications/Ma-Net/utils/mask_damaging.py:99-129", "hash": "9b9eb601cd6f5c50a10e4333432ba641", "title": "Mask Damaging in PaddleVideo Library"}, "883": {"path": "/applications/Ma-Net/utils/mask_damaging.py:130-155", "hash": "bc5619fcddbf27ac17c5aa849d2fb0a6", "title": "Random Mask Damage Functions"}, "884": {"path": "/applications/Ma-Net/utils/mask_damaging.py:156-170", "hash": "141407f14329c81583ec22fb395324ac", "title": "Rotated and Translated Masks"}, "885": {"path": "/applications/Ma-Net/utils/meters.py", "hash": "1fc138c51bf4ad7354035da8e89092e5", "title": "AverageMeter Class: Compute and Store Average"}, "886": {"path": "/applications/Ma-Net/utils/utils.py", "hash": "3c65bff3639def5c6cf194533c382d2e", "title": "Label to RGB Conversion"}, "887": {"path": "/applications/MultimodalVideoTag/README.md", "hash": "dc9e6c63a62375d533b88595cf303329", "title": "Multimodal Video Tagging with PaddlePaddle 2.0"}, "888": {"path": "/applications/MultimodalVideoTag/README.md:1-37", "hash": "93df97124e57baf08804a15c6a5a3943", "title": "Multimodal Video Classification with PaddlePaddle"}, "889": {"path": "/applications/MultimodalVideoTag/README.md:38-65", "hash": "eecce3612f229191908898dab4f29464", "title": "Training, Evaluation, and Inference with Multimodal Video Tagging"}, "890": {"path": "/applications/MultimodalVideoTag/README.md:67-77", "hash": "366919cf2079d291a1d1c8d3fee4a4f6", "title": "Multimodal Video Tagging with Attention Clusters"}, "891": {"path": "/applications/MultimodalVideoTag/download.sh", "hash": "6ffef4fe898b05c0862258caaa93d6b5", "title": "Download ErnIE Model and Dataset"}, "892": {"path": "/applications/MultimodalVideoTag/eval_and_save_model.sh", "hash": "4ad940035aa0714d2e2c793c8d788cc5", "title": "Env Var Set, Eval & Save Model Script"}, "893": {"path": "/applications/MultimodalVideoTag/inference.sh", "hash": "3804f7c7b5fa40c0b0987fa7f5b5ab8f", "title": "GPU-Based Inference Script"}, "894": {"path": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py", "hash": "d8b4d5257a229c3c486323c4b5dc5c4f", "title": "Multimodal Video Accuracy Metrics Calculator"}, "895": {"path": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:1-35", "hash": "76dc56a7cce457931b7c6c14fe743635", "title": "Accuracy Metrics Calculator"}, "896": {"path": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:36-68", "hash": "165d5a278783147bc6b78c6041d5591a", "title": "Multimodal Video Tag Accuracy Metrics"}, "897": {"path": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:69-95", "hash": "eb0870e6c86af0b28f918b4be7cfa0df", "title": "Video Tagging Metrics Computation"}, "898": {"path": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:96-125", "hash": "a3a976e167b5f65dc07d3f3b36866abd", "title": "Multilabel Top-K Accuracy"}, "899": {"path": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:126-158", "hash": "65dc6f414e16725c41d47053fd93f3bd", "title": "Top-K Accuracy Calculation"}, "900": 
{"path": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:159-160", "hash": "3b4f0b960dd797ab68cd3f0cabe2e3ec", "title": "Top-K Hits for Multilabel Prediction"}, "901": {"path": "/applications/MultimodalVideoTag/scenario_lib/config.py", "hash": "d520811dd6af9d3e55d58aab8308e85a", "title": "Config Parser and Merger for Multimodal Video Tag"}, "902": {"path": "/applications/MultimodalVideoTag/scenario_lib/config.py:1-52", "hash": "059dc711a63f7d52479c8d0fcdd564eb", "title": "Config Parser and Merger Function"}, "903": {"path": "/applications/MultimodalVideoTag/scenario_lib/config.py:53-71", "hash": "0dab57ea9258e8f9985d9bc31afb7c63", "title": "Config Updater and Printer"}, "904": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py", "hash": "e87ac39072a18f0298ef132f5724f294", "title": "Multimodal Video Tag Datareader"}, "905": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py", "hash": "d11e619fb37cbb06a3751fa1e668b548", "title": "ERNIE Reader for Multimodal Video Tagging"}, "906": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:1-35", "hash": "5d6750408f1ae81edbfd48877703d080", "title": "Ernie Reader: MultimodalVideoTag's Python Component"}, "907": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:37-74", "hash": "c607f39fcfde13dc2471b8a1e51f18b2", "title": "CSV Reader: BaseReader Class"}, "908": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:75-102", "hash": "5eff42f11ab17064359390b02290636f", "title": "Initializing and Configuring ERNIE Task Reader"}, "909": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:103-131", "hash": "9120fe518947ae8abb22889c782fdddc", "title": "Record Creation from Text Tokenization"}, "910": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:132-151", "hash": "c59f01e47038fa7b9219b00a3140abaf", "title": "Ensuring Correct BERT/ERNIE Sequences"}, "911": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:152-179", "hash": "4dd77272a165f142f1d6707a7e44bb33", "title": "ERNIE Input Preparation"}, "912": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:180-207", "hash": "25528878999a9c3f8afa0b6f4b026ebe", "title": "ERNIE Batch Record Generation"}, "913": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:208-235", "hash": "d5846fa3e3c738878cc9864aab7f0f6c", "title": "Padding Ernie Batch Reader"}, "914": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:236-257", "hash": "fb2139030507e552e5a6cccb504103c5", "title": "ERNIE Task Data Processing"}, "915": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:258-289", "hash": "2b71edad180a73f0d3df8a9f8355297f", "title": "ERNIE Text Data Generation"}, "916": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:290-317", "hash": "795b05428b5c2e4255311c954ab0deb0", "title": "Pad Instances to Max Sequence Length"}, "917": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:318-334", "hash": "a30af4131efc068507855c8dae77d55f", "title": "Preparing Return List in ERNIE Task Reader"}, "918": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py", "hash": 
"45b67f724640f7a0f3d922b51b811bb2", "title": "Multimodal Video Feature Reader"}, "919": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:1-39", "hash": "22bd62387b8faf5f2161f77d492b2bd0", "title": "FeatureReader: Multimodal Video Feature Data Reader"}, "920": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:40-67", "hash": "dc4061e08c6b21719d3c4c3cad604786", "title": "YouTube-8M Data Reader: LSTM, Attention Cluster, NextVlad"}, "921": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:68-95", "hash": "43275c5b97880e360981f79455adf8e9", "title": "Multimodal Data Reader Algorithm"}, "922": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:96-113", "hash": "9b9b9776739d23696d3fbdaa42dc1d56", "title": "Multi-Modal Dataset Feature Reader"}, "923": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:114-140", "hash": "ed33317741dd84edc2b1f4051197bb23", "title": "Multimodal Video Data Reader"}, "924": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:143-173", "hash": "0873755e1e902920a7052eef28870def", "title": "Function for Loading Video Files and Labels"}, "925": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:174-212", "hash": "fe419213819fd161ff91819c50970bfa", "title": "Label Data Manipulation Functions"}, "926": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:213-251", "hash": "39a4e9553ce276904d59ca2a3d796c7c", "title": "Efficient Data Reader for Multimodal Video Analysis"}, "927": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:252-274", "hash": "d76e7fe7e9a3b1a79a1424652f043c29", "title": "Load and Return Dictionary of Words and Indices"}, "928": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py", "hash": "4129231bc35b3f4234236327a042d738", "title": "Reader Manager: Custom Exceptions and Singleton Design"}, "929": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:1-30", "hash": "c918e29733b2aec77e0f340f7a6901d0", "title": "Custom Exception for Missing Reader"}, "930": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:31-73", "hash": "7fdb3313376721e31c58987b8f929ecf", "title": "Video Data Reader Utilities"}, "931": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:74-91", "hash": "62bb80195ea21201f642dd9a773d0089", "title": "Reader Manager: Singleton for Registering and Retrieving Readers"}, "932": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py", "hash": "3b5f09da8c8f5637bca41c2f02f0aae8", "title": "Text Tokenization for Multimodal Video Tagging"}, "933": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:1-32", "hash": "1c505a4dbf637d772011564a7c434cea", "title": "Python Unicode Converter Function"}, "934": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:33-61", "hash": "508a3584082d1af33731748fc63aceb4", "title": "Universal Printable Text Encoder"}, "935": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:62-96", "hash": "a4272074431412da762bc6b766de1dc1", "title": "Vocabulary File Handler"}, "936": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:97-133", "hash": 
"2f8085c953d0dad7396afb381b4b63e2", "title": "FullTokenizer: Efficient Tokenization Class"}, "937": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:134-168", "hash": "ed7267011f946f9c4e08acee6679476e", "title": "End-to-End Tokenization with CharTokenizer"}, "938": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:169-197", "hash": "a85a06e3e42a6ddafa97403453198ef5", "title": "Basic Tokenizer for Text Tokenization"}, "939": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:198-229", "hash": "1ab54ef916786856448bd885b0150392", "title": "Chinese Text Tokenization and Processing"}, "940": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:230-259", "hash": "62ca54d06e5bd8358acacb7beb41046f", "title": "Text Tokenization Functions"}, "941": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:260-282", "hash": "97a32078cbb4b69c1f8a3fe8cacdac22", "title": "CJK Unicode Checker"}, "942": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:283-315", "hash": "51b2079033eb44b2f827872fc68a9502", "title": "Greedy Wordpiece Tokenizer"}, "943": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:316-348", "hash": "5e159469a794e6e13da4bf8e787ef11f", "title": "Tokenization and Unknown Word Handling"}, "944": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:349-382", "hash": "82860a8f2a0edb7f4d3e0719270a1f62", "title": "Tokenizing String Functions"}, "945": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:383-405", "hash": "ca61b02bd0fb82471f35f013be82b63a", "title": "Detect Punctuation and Chinese Characters"}, "946": {"path": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:406-441", "hash": "cbb0af8c222cf56a9b793c09e02c1582", "title": "Chinese Text Tokenizer"}, "947": {"path": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py", "hash": "b095b5967fcec00d5d91f913eff1da0e", "title": "Multimodal Video Tagging with PaddlePaddle"}, "948": {"path": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:1-37", "hash": "4248451b6c2825a8c9b48211a63cc462", "title": "Multimodal Video Tag Evaluation Code"}, "949": {"path": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:38-64", "hash": "86b7ba8afb398a6770407e6e3e78bbf8", "title": "Paddle Video Eval Argument Parser"}, "950": {"path": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:65-94", "hash": "ea6d4e7c1c84f5fd3a5f6337aa609298", "title": "Evaluate and Save Inference Model"}, "951": {"path": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:95-123", "hash": "08e58f6bd9ef368290c8c0d7721666b7", "title": "Save and Evaluate Model"}, "952": {"path": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:124-145", "hash": "5e5af0bb56da2166ea4cb9b6560a823e", "title": "Evaluate and Save Multimodal Video Model"}, "953": {"path": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:146-159", "hash": "a68fd37f65742e2c4b8e83781ef7846c", "title": "Save Inference Model with Parameters"}, "954": {"path": "/applications/MultimodalVideoTag/scenario_lib/inference.py", "hash": "ac9ac700d564de16cea421936f9c79ff", "title": "Multimodal Video Tagging Inference"}, "955": {"path": "/applications/MultimodalVideoTag/scenario_lib/inference.py:1-38", 
"hash": "384870ad371612a27bea3acc6b9ef16a", "title": "Paddle Video Inference Script"}, "956": {"path": "/applications/MultimodalVideoTag/scenario_lib/inference.py:39-69", "hash": "f192c0566ed1150ef1bfc121c7332ca6", "title": "InferModel Class and Load Inference Model Function"}, "957": {"path": "/applications/MultimodalVideoTag/scenario_lib/inference.py:70-98", "hash": "eee08777cc6ce2bf7a071c405e4e1ebb", "title": "Multimodal Video Tagging Initialization"}, "958": {"path": "/applications/MultimodalVideoTag/scenario_lib/inference.py:100-122", "hash": "f3696b5182d838d7b29814b7feb42bc6", "title": "Multimodal Inference Function"}, "959": {"path": "/applications/MultimodalVideoTag/scenario_lib/inference.py:124-161", "hash": "dd0c8b573d1f7c7879a38ee1deb4f10f", "title": "Video Label Inference Function"}, "960": {"path": "/applications/MultimodalVideoTag/scenario_lib/inference.py:162-173", "hash": "275ec9c9be59c77cbc3e0d4e1de41437", "title": "MultimodalVideoTag Inference Function"}, "961": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py", "hash": "ed104a63b4db88e53211403a885d9939", "title": "Multi-modal Video Tagging with ERNIE"}, "962": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:1-34", "hash": "118b35b506e5858a01de73c7ee1ba019", "title": "AttentionLstmErnie: Combining Scenario-Classify and ERNIE"}, "963": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:35-59", "hash": "89fd33b9138e0779bce9887f6959d655", "title": "Attention LSTM ERNIE Model Initialization"}, "964": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:60-85", "hash": "2df31eaf55440b8b70acbc167c41fe73", "title": "AttentionLSTMERNIE Model Config Init"}, "965": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:86-108", "hash": "39a32a98e0cb9d660caa042152abef63", "title": "Ernie Model Data Feeding and Feature Extraction"}, "966": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:109-131", "hash": "1b310e9f4cab4e1829517ec430af91d0", "title": "ERNIE Model Initialization and Freeze"}, "967": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:132-154", "hash": "df64531a140203e5a6d7d9f8a82f1aa8", "title": "Attention-based LSTM Model for Video Tagging"}, "968": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:155-172", "hash": "0df5307c647d48f894e5bae40be30e12", "title": "Dynamic LSTM for Image Features with Attention"}, "969": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:173-194", "hash": "3e652879e60b3fb946f2c63d91480470", "title": "Multimodal LSTM with Audio and Visual Inputs"}, "970": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:195-214", "hash": "b2d62bd932b680d3643f74bc8f993404", "title": "Attention LSTM for Audio Reversal"}, "971": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:215-235", "hash": "196196a55416c090e0297c345ff4d3b0", "title": "Multimodal Video Tagging with LSTM-Attention and ERNIE"}, "972": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:236-260", "hash": "295becbb876034d62f000fe6e5fc9da7", "title": "Attention-based Neural Feature Sequence Calculation"}, "973": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:261-285", "hash": 
"294ab97e7b51a8a4f060aee2e3900779", "title": "Dropout and Batch Normalization for LSTM"}, "974": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:286-312", "hash": "04c6ba8fc591fcf6d8f1140bc7318c2e", "title": "Attention LSTM Ernie Model with Dropout"}, "975": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:313-334", "hash": "08218daf624cf9606b8f36c301b40a17", "title": "Loss Calculation with Piecewise Decay Optimizer"}, "976": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:336-365", "hash": "37d112e68c29ed1e4c65aed6ad870d2c", "title": "Sigmoid Loss Function in Attention LSTM"}, "977": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:366-400", "hash": "7ddc59c33ec27e951ae05fd810c27b4e", "title": "Attention LSTM ERNIE Model Functions"}, "978": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py", "hash": "fbf0f763cd99593d0760f28eef0b1357", "title": "ERNIE Multimodal Video Tagging Model"}, "979": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:1-33", "hash": "61ed68bdc7783f20e76104c78ae085de", "title": "Ernie Model Configuration"}, "980": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:34-73", "hash": "c07f9ed7c17d22660aeccf557bc4b0bb", "title": "Ernie Model Configuration Class"}, "981": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:76-106", "hash": "672ef9fc4da4171a973d876350c00ae4", "title": "ERNIE Model Class Definition"}, "982": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:107-132", "hash": "0dca3aa617960bc4ac96a30affa72b0b", "title": "ERNIE Model Initialization and Building"}, "983": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:133-158", "hash": "6c42d01e5d7e89ed996d426fff5a250b", "title": "Multimodal Video Tagging Embedding Combination"}, "984": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:159-184", "hash": "27a4e4c9f5f9904f4056806bd00ae691", "title": "Embedding Layer Initialization and Encoding"}, "985": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:185-215", "hash": "00756883f901f7281a45ebbc58b202c9", "title": "Encoder Layer Initialization"}, "986": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:216-243", "hash": "7bc8befe6db0c2717eb2d1e5ef9eb4d0", "title": "TextCNN Model for Sequence Feature Extraction"}, "987": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:244-250", "hash": "2cf28dc5c4f70fbb00dac721f647e7a6", "title": "1D Convolutional Layer Creation"}, "988": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py", "hash": "e0a57c540decf204518f97995c6b065c", "title": "Transformer Encoder for NLP"}, "989": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:1-32", "hash": "04a7b094a61c702dba2c38bf2b4a5392", "title": "Multi-Head Attention Function"}, "990": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:33-57", "hash": "9b4722ff69e5c96d02a150a98dd265df", "title": "Multi-Head Attention Layer Code"}, "991": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:58-80", "hash": "bc4daf6c9c28c45e8efeea3713e6de09", "title": "Transformer Encoder Layer Function"}, "992": {"path": 
"/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:81-104", "hash": "cd3e7f332ac2c1f786c9e78a09fe5644", "title": "Split and Combine Attention Heads in Transformer Encoder"}, "993": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:105-128", "hash": "beaae361b8cfc412a8f5bc03bac70014", "title": "Scaled Dot-Product Attention in Transformer Encoder"}, "994": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:129-154", "hash": "d66b77ac33de6e58a8f43b523a68692f", "title": "Transformer Encoder Attention Mechanism"}, "995": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:155-182", "hash": "25b564fa21badc8540d0b295d5b6e8f5", "title": "Position-wise Feed-Forward Network in Transformer Encoder"}, "996": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:183-208", "hash": "299ba686e968c7cd045578c6b979b84a", "title": "Transformer Encoder Layer for Multimodal Video Tagging"}, "997": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:209-236", "hash": "cc887da84ce7f6dc465e8b3dad62a1e8", "title": "Transformer Encoder Layer Implementation"}, "998": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:237-268", "hash": "21a7c876423f24df6cdae9e174ef8de4", "title": "Transformer Encoder Layer with MH Attention"}, "999": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:269-308", "hash": "292e20ce365289ffa4c1f32d9028ae9f", "title": "Transformer Encoder Model Definition"}, "1000": {"path": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:309-338", "hash": "29ec03a1932ac97acf7ee995e5e3410e", "title": "Transformer Encoder Function"}, "1001": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py", "hash": "91c724073b255af8239badcebab36cf5", "title": "Video Model Training with PaddlePaddle"}, "1002": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:1-34", "hash": "d168d694d2d41a4960c00fa63fd43f3d", "title": "Train Model Using AttentionLstmErnie"}, "1003": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:37-71", "hash": "67849f982f0f9b000afbd1a7370f8932", "title": "Command-Line Logging for Paddle Video"}, "1004": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:72-105", "hash": "494674d4d265dc681c7a8b55b7d836ab", "title": "Training Options in MultimodalVideoTag"}, "1005": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:106-136", "hash": "f494b27ac9ff4497aa2263145d241515", "title": "Command-Line Arguments for Model Training"}, "1006": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:137-160", "hash": "daba0e4010e4d0437e54ae63f32a385e", "title": "Training Model Setup"}, "1007": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:161-190", "hash": "0a42532e6f2941ddc2b5c37a634c2d19", "title": "Model Building and Execution Setup"}, "1008": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:191-213", "hash": "a14e4f9ed35d2be0ea5e69dcae5f093a", "title": "Data Parallelism with Pre-Trained Weights"}, "1009": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:214-231", "hash": "41f032a10121e5a1a412ecc7bc920521", "title": "Batch Size Setting in Multimodal Video Tagging"}, "1010": {"path": "/applications/MultimodalVideoTag/scenario_lib/train.py:232-263", "hash": "d914443fbd55a211a4df35db74552d96", 
"title": "Train Model with Custom Arguments"}, "1011": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py", "hash": "cfe7511073519932952e8f728ef76d92", "title": "Multi-Task Framework for Video Tagging"}, "1012": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:1-39", "hash": "ce980ba0c5e74d8bd9e8444e9dbc4ea3", "title": "Testing with PyReader Function"}, "1013": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:40-67", "hash": "ab85ec1cf0e5e542017e072072a51721", "title": "PaddleVideo Test Suite"}, "1014": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:68-96", "hash": "0159a8d28f558b07c9d04b24e6b9ba1c", "title": "Train Model with PyReader: Epochs, Testing, and Early Stopping"}, "1015": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:97-119", "hash": "68abdff3526c14ae9a28f5ac68d10e05", "title": "ML Training Loop Metrics Tracker"}, "1016": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:120-141", "hash": "4adb302a7622825a9c64e1f4067f2d93", "title": "Epoch Training Metrics and Testing"}, "1017": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:142-169", "hash": "3daeb3924d0495b005d2a16fb8dc007f", "title": "Save and Stop Training Model Function"}, "1018": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:170-201", "hash": "8323bb03408550644c7c0ea1650b5b30", "title": "Load Pretrained Parameters"}, "1019": {"path": "/applications/MultimodalVideoTag/scenario_lib/utils.py:204-218", "hash": "57de18fbfc82b0bb51f91f4bac58614f", "title": "AttrDict: Dictionary as Class Attributes"}, "1020": {"path": "/applications/MultimodalVideoTag/train.sh", "hash": "147849fa34b9c08eea4e80b766b3dd24", "title": "Efficient GPU Training of Attention LSTM Ernie"}, "1021": {"path": "/applications/PP-Care/Readme.md", "hash": "4fc2571c043db8eb87627ccde079f014", "title": "Pre-Trained PP-Care Model for Video Understanding"}, "1022": {"path": "/applications/PP-Care/Readme.md:1-55", "hash": "472d42b6f93b63e94221071c9ffe17c4", "title": "3DMRI Classification with PaddleVideo"}, "1023": {"path": "/applications/PP-Care/Readme.md:55-81", "hash": "8a5c79acee01d6c39c6761095fe23717", "title": "Initializing PP-Care Model for MRI Data"}, "1024": {"path": "/applications/PP-Care/Readme.md:81-106", "hash": "63252fde99ded8cb3cc40a70854bdb67", "title": "Optimized PP-Care Model Testing with ResNet50"}, "1025": {"path": "/applications/PP-Care/Readme.md:107-110", "hash": "839af0aa07f474148ee9c2fa6e9fdb5d", "title": "Efficient Video Neural Networks: A Comprehensive Guide"}, "1026": {"path": "/applications/PPHuman/README.md", "hash": "f58cf75f57adac219b6168226c50b904", "title": "PaddleVideo to PP-Human Model Conversion Script"}, "1027": {"path": "/applications/PPHuman/README.md:1-21", "hash": "8c0322585be1cef77d1d301c56035349", "title": "Training Behavior Model with PaddleVideo"}, "1028": {"path": "/applications/PPHuman/README.md:22-42", "hash": "eb786632af2f98a16dcd06ee0828a341", "title": "Data Preparation for PP-Human"}, "1029": {"path": "/applications/PPHuman/README.md:44-60", "hash": "6039b7950ceaf6144fcffa546ff1845b", "title": "Keypoint Detection with Pretrained Models"}, "1030": {"path": "/applications/PPHuman/README.md:62-83", "hash": "66d12f8544e9f3a158b830a25dea899f", "title": "PPHuman: Human Keypoint Detection in Videos"}, "1031": {"path": "/applications/PPHuman/README.md:84-114", "hash": "08d371118f168dda7b7b1ccce79b0dad", "title": "PPHuman JSON to Training Data Conversion"}, "1032": {"path": 
"/applications/PPHuman/README.md:115-143", "hash": "77ada94e9d1ee7edbc549ceda24ca4a9", "title": "Exporting PaddleVideo Model for PP-Human"}, "1033": {"path": "/applications/PPHuman/datasets/prepare_dataset.py", "hash": "3abf30e919497b8fc6e5db86eea5ee18", "title": "Preparing Datasets for PaddleVideo and PPHuman"}, "1034": {"path": "/applications/PPHuman/datasets/prepare_dataset.py:1-34", "hash": "a78472dcaa92a9c7313151a56fde206c", "title": "UR Fall Dataset Conversion for PaddleVideo"}, "1035": {"path": "/applications/PPHuman/datasets/prepare_dataset.py:35-69", "hash": "65ae4f5621c96b2ce7a69988aa11f35f", "title": "Consistent Dataset Preparation"}, "1036": {"path": "/applications/PPHuman/datasets/prepare_dataset.py:70-98", "hash": "d8b70b9c5e334311e0ef354344d87eef", "title": "Prepare Dataset for PaddleVideo's PPHuman"}, "1037": {"path": "/applications/README.md", "hash": "7e18c6626b0c3a07ff2a999ea12180c9", "title": "PaddleVideo: Versatile Application Cases"}, "1038": {"path": "/applications/T2VLAD/README.md", "hash": "3849f659e32eb8c04d4f412682a590b7", "title": "Introducing T2VLAD: Video Retrieval Model in PaddleVideo"}, "1039": {"path": "/applications/T2VLAD/README.md:1-60", "hash": "566dc671aea41b12017b8e0f34ae7c90", "title": "T2VLAD: Text Video Retrieval Model Introduction"}, "1040": {"path": "/applications/T2VLAD/README.md:61-75", "hash": "8a595511a98a24d6ac7baa0e0878f9d1", "title": "T2VLAD Performance Metrics"}, "1041": {"path": "/applications/T2VLAD/README_en.md", "hash": "101f1ae3f446b1add2cb4ab69f7810d0", "title": "T2VLAD: Text-Video Retrieval with PaddleNLP"}, "1042": {"path": "/applications/T2VLAD/README_en.md:1-31", "hash": "524237b07590972558a480528f540a35", "title": "Install PaddleNLP Dependency"}, "1043": {"path": "/applications/T2VLAD/README_en.md:32-59", "hash": "82292cfe2b02d7334d6263ddc65dcdcc", "title": "Train and Test T2VLAD on MSRVTT Dataset"}, "1044": {"path": "/applications/T2VLAD/README_en.md:61-69", "hash": "aa33c5b9d4ae8a3cd942c5d5a6435aab", "title": "Text-Video Retrieval Model Metrics: R@1, R@5, R@10"}, "1045": {"path": "/applications/T2VLAD/base/__init__.py", "hash": "1b4a0ac909e171afe00dd51ca0ada2cd", "title": "Importing Base Modules"}, "1046": {"path": "/applications/T2VLAD/base/base_dataset.py", "hash": "7b6f97ab8a29984a473b2339c0e848e6", "title": "Video Dataset Base Class"}, "1047": {"path": "/applications/T2VLAD/base/base_dataset.py:1-36", "hash": "e1f4d0204fab8e7c8a4d38cb6046ac84", "title": "Copyright, Libraries, and Type Guarding in Python"}, "1048": {"path": "/applications/T2VLAD/base/base_dataset.py:37-76", "hash": "2ca0d6de0d48225c86115ed287c3445f", "title": "Base Dataset Class for Video Features"}, "1049": {"path": "/applications/T2VLAD/base/base_dataset.py:77-101", "hash": "6cb0eee53ac36e371ea9014b348c953f", "title": "Dataset Class Initialization"}, "1050": {"path": "/applications/T2VLAD/base/base_dataset.py:102-125", "hash": "967bfc944fc077327f973dc61c97309f", "title": "Dataset Initialization"}, "1051": {"path": "/applications/T2VLAD/base/base_dataset.py:127-152", "hash": "359f4212b3d3da8f1dc809dec31a5597", "title": "Default Video Retrieval Paths"}, "1052": {"path": "/applications/T2VLAD/base/base_dataset.py:154-175", "hash": "6a037afe95f4f1ac035c0a0ae2226122", "title": "Experts Configuration Initialization"}, "1053": {"path": "/applications/T2VLAD/base/base_dataset.py:176-197", "hash": "d34809cc07fc13e7cfd3a4e053b5a374", "title": "Initializing Arrays for Model Evaluation"}, "1054": {"path": "/applications/T2VLAD/base/base_dataset.py:199-217", "hash": 
"c641dfd807e10dc28a3881e701b56738", "title": "Expert Index Initialization"}, "1055": {"path": "/applications/T2VLAD/base/base_dataset.py:218-237", "hash": "f0c4233f28e6e7c05f61da1422033a33", "title": "Video Feature Preparation and Test Captioning"}, "1056": {"path": "/applications/T2VLAD/base/base_dataset.py:238-257", "hash": "8ad3394bc98adedfc3be7534d5adb32e", "title": "Token Masking and Encoding in T2VLAD"}, "1057": {"path": "/applications/T2VLAD/base/base_dataset.py:258-280", "hash": "c52eb2d091262e330f1bfb28af5cfe54", "title": "Text Feature Creation and Split Configuration"}, "1058": {"path": "/applications/T2VLAD/base/base_dataset.py:281-304", "hash": "2c634d94e2d7f6f677fe7951902aeb91", "title": "Loading and Initializing Data for PaddleVideo"}, "1059": {"path": "/applications/T2VLAD/base/base_dataset.py:305-327", "hash": "717281a518d06e255fd8e531715ac2d5", "title": "Batch Tensor Initialization"}, "1060": {"path": "/applications/T2VLAD/base/base_dataset.py:329-350", "hash": "87b064fe99f7a062fe27abe7ed02269f", "title": "Data Preparation for Experts"}, "1061": {"path": "/applications/T2VLAD/base/base_dataset.py:351-372", "hash": "14e6def564d3ca5494b54ff7ae0b7b0d", "title": "Minibatch Creation for Video and Text Features"}, "1062": {"path": "/applications/T2VLAD/base/base_dataset.py:373-397", "hash": "f2d1caccb551babf9de76909efe1087a", "title": "Video Dataset Class for Text-to-Video Retrieval"}, "1063": {"path": "/applications/T2VLAD/base/base_dataset.py:398-413", "hash": "a89da377cd6edd2ff858400377848d48", "title": "Video Frame Feature Segmentation"}, "1064": {"path": "/applications/T2VLAD/base/base_dataset.py:414-437", "hash": "c7689cc6b19066cfd8929a136572b8e4", "title": "Random Captioning with Tokenization"}, "1065": {"path": "/applications/T2VLAD/base/base_dataset.py:438-463", "hash": "def7e73aca09c3d9f9e42f85c7f342da", "title": "Video Dataset Initialization"}, "1066": {"path": "/applications/T2VLAD/base/base_dataset.py:464-492", "hash": "7c3b8f048896d0fabe64f173d64d40ca", "title": "Defining Retrieval Data and Meta Dictionary"}, "1067": {"path": "/applications/T2VLAD/base/base_dataset.py:493-516", "hash": "d573b3980cc5d4b163610e7e38d3a50c", "title": "Feature Path Generator"}, "1068": {"path": "/applications/T2VLAD/base/base_dataset.py:517-539", "hash": "3deeb6f496fb8441463c4212d365f96e", "title": "Assertion Function and Summary Stats in T2VLAD Base Dataset"}, "1069": {"path": "/applications/T2VLAD/base/base_dataset.py:540-562", "hash": "357b803d8ebdf74287c3a9a9eb110834", "title": "Partition and Analyze Datasets"}, "1070": {"path": "/applications/T2VLAD/base/base_model.py", "hash": "4e2ed14cecfb69374e41aa7a4b79a205", "title": "Base Model Abstract Class"}, "1071": {"path": "/applications/T2VLAD/base/base_model.py:1-36", "hash": "a4657a7599ddf9a939c19f998b77bdaa", "title": "Abstract Base Model for PaddleVideo"}, "1072": {"path": "/applications/T2VLAD/base/base_model.py:37-37", "hash": "cee94c8bbc8174ddaf66a348db2a4731", "title": "Trainable Parameters Counter"}, "1073": {"path": "/applications/T2VLAD/base/base_trainer.py", "hash": "efcfc8558a644c92183bb1ab07082eb1", "title": "T2VLAD Trainer: Multi-Epoch Management and Checkpoints"}, "1074": {"path": "/applications/T2VLAD/base/base_trainer.py:1-33", "hash": "75dd3069c92c832e16efeba07a66a2d5", "title": "Base Trainer Class Setup"}, "1075": {"path": "/applications/T2VLAD/base/base_trainer.py:34-60", "hash": "abdc99eb5ffc7c3e735191c17cd2f232", "title": "Initializing Base Trainer Object"}, "1076": {"path": 
"/applications/T2VLAD/base/base_trainer.py:62-89", "hash": "b6864c225be7b03a748791bbbd4e202e", "title": "Training Trainer Class"}, "1077": {"path": "/applications/T2VLAD/base/base_trainer.py:90-110", "hash": "6082e1869b9c23ee1de221732ee6e63e", "title": "Metrics Logging and Monitoring Enhancements"}, "1078": {"path": "/applications/T2VLAD/base/base_trainer.py:111-128", "hash": "5a451e42fca434ea0640f7b729899c28", "title": "Improved Performance Check"}, "1079": {"path": "/applications/T2VLAD/base/base_trainer.py:129-151", "hash": "52bb13751df0938c3b859af195e4a566", "title": "Early Stopping and Best Model Saving"}, "1080": {"path": "/applications/T2VLAD/base/base_trainer.py:153-170", "hash": "eb9ddec6678bd8aa9b59d976eca088c2", "title": "Flexible Model Saving Conditions"}, "1081": {"path": "/applications/T2VLAD/base/base_trainer.py:171-186", "hash": "76358c417df6ca1e15d82da3e8f8520a", "title": "Video Prediction Saving and Logging"}, "1082": {"path": "/applications/T2VLAD/base/base_trainer.py:187-210", "hash": "204ed61b521dd73ce5d434ba4bb3404f", "title": "Model Checkpoint Management & Purge"}, "1083": {"path": "/applications/T2VLAD/base/base_trainer.py:211-238", "hash": "3c69a804c3073d3edcde947b54837178", "title": "Stale Model Pruning"}, "1084": {"path": "/applications/T2VLAD/base/base_trainer.py:239-258", "hash": "3cab6a7325c5e046cad866dc1426b985", "title": "AutoSave Best Model During Training"}, "1085": {"path": "/applications/T2VLAD/data/download_features.sh", "hash": "da0fbbd4dcbada209bc9ac584cd53d14", "title": "Remote Dataset Download & Extraction"}, "1086": {"path": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py", "hash": "d1469bbafe8c7ebedafc34c01d16f6d0", "title": "MSRVTT Dataset Loader"}, "1087": {"path": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:1-29", "hash": "61e3996a11b1c0d0e946eb1842c15c83", "title": "MSRVTT Dataset Loader"}, "1088": {"path": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:30-46", "hash": "e5b8247f42069a627e9c1ed58adb34b7", "title": "Data Split Paths for MSRVTT Dataset"}, "1089": {"path": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:47-71", "hash": "61a41deb2347e5106a8fb562dc9f21c1", "title": "MSRVTT Dataset Feature Loading"}, "1090": {"path": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:72-89", "hash": "b450fe27b54972888c3c645306d9a7a9", "title": "Feature Aggregation for Expert in MSRVTT Dataset"}, "1091": {"path": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:90-108", "hash": "17e0e46213f3281272a5a8c9576b26f1", "title": "Checking and Validating Text Features"}, "1092": {"path": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:110-126", "hash": "245d5e6955adf0459a883b25acead1bb", "title": "Validating Test Sets and Missing Queries"}, "1093": {"path": "/applications/T2VLAD/data_loader/data_loaders.py", "hash": "116c8e9be1eba6fb215434a80ba92a17", "title": "Efficient Data Loader with LRU Caching"}, "1094": {"path": "/applications/T2VLAD/data_loader/data_loaders.py:1-36", "hash": "fb65577c7b0d741961a56ac0af525df4", "title": "Paddle Dataset Loader Function"}, "1095": {"path": "/applications/T2VLAD/data_loader/data_loaders.py:37-69", "hash": "58fbe1a58bb7f34ab75a1a9597fe6ff1", "title": "Create Dataset Function"}, "1096": {"path": "/applications/T2VLAD/data_loader/data_loaders.py:71-101", "hash": "9019005107fa8bac8496fc82aca062f6", "title": "Data Loader Constructor"}, "1097": {"path": "/applications/T2VLAD/data_loader/data_loaders.py:102-127", "hash": "79fc41ecd14341f2fe721d1a44d147f8", "title": "Flush and Create Dataset Loader"}, 
"1098": {"path": "/applications/T2VLAD/data_loader/data_loaders.py:129-145", "hash": "64f9e3dec133b4c4a709bb036b9f6b2e", "title": "Training DataLoader Creator"}, "1099": {"path": "/applications/T2VLAD/logger/__init__.py", "hash": "646f9b3b9079bdb7dfbcef7e80b9ddca", "title": "Importing T2VLAD Logger and Parser Functions"}, "1100": {"path": "/applications/T2VLAD/logger/log_parser.py", "hash": "737dbb861e20a89b4856ded9e83f25fb", "title": "Log Summary: Epoch Performance Stats"}, "1101": {"path": "/applications/T2VLAD/logger/log_parser.py:1-24", "hash": "34e4793dc42e594c30d32c6ce71430f6", "title": "Log Performance Stats with log_summary"}, "1102": {"path": "/applications/T2VLAD/logger/log_parser.py:26-56", "hash": "9502420928aa4a8aafec9d9531ba284e", "title": "Log Parser: Identifying Seeds and Metrics in T2VLAD"}, "1103": {"path": "/applications/T2VLAD/logger/log_parser.py:57-78", "hash": "cfa2ff83518c050bd72e392522b3d58e", "title": "Log Parser: Extracting Scores for Seeds"}, "1104": {"path": "/applications/T2VLAD/logger/log_parser.py:79-99", "hash": "de6ea38285328fde62d5a4afd4ce9909", "title": "Geometric Mean Seed Selection"}, "1105": {"path": "/applications/T2VLAD/logger/log_parser.py:101-104", "hash": "1fa5f166bd216bb91746ebb9b68bca14", "title": "Fixed Epoch Logging"}, "1106": {"path": "/applications/T2VLAD/logger/logger.py", "hash": "203ee422eb15db7d15596729cacac43f", "title": "Configure Logging from JSON File"}, "1107": {"path": "/applications/T2VLAD/model/loss.py", "hash": "0424c3c8c55cdebb12473c049cc71a6a", "title": "Contrastive Loss for T2VLAD"}, "1108": {"path": "/applications/T2VLAD/model/loss.py:1-28", "hash": "84a6c5d4a968bc42a0477a40a3271486", "title": "Max Margin Ranking Loss for T2VLAD"}, "1109": {"path": "/applications/T2VLAD/model/loss.py:29-61", "hash": "3ddf35108e91c86e7dddc14e8be824d9", "title": "Contrastive Loss for Image-Sentence Pairs"}, "1110": {"path": "/applications/T2VLAD/model/loss.py:62-85", "hash": "833b8e85a400388ed2a3f73dbf48b83d", "title": "Contrastive Learning Cost Calculation"}, "1111": {"path": "/applications/T2VLAD/model/loss.py:86-102", "hash": "0fb6698b9c64b82eaee1be46ef7ff3b1", "title": "Video-Level Loss Calculation in T2VLAD"}, "1112": {"path": "/applications/T2VLAD/model/metric.py", "hash": "fbef2cc5cacca04b4a62731bceb7183a", "title": "Retrieval Metrics and Visualization Tool"}, "1113": {"path": "/applications/T2VLAD/model/metric.py:1-30", "hash": "5566a45867d88d0492fa8ec1ce515ce5", "title": "Retrieval Metrics Computation"}, "1114": {"path": "/applications/T2VLAD/model/metric.py:31-58", "hash": "d20666d1e010673c6ad88478778d6b7e", "title": "Retrieval Metrics Calculation"}, "1115": {"path": "/applications/T2VLAD/model/metric.py:59-75", "hash": "d330aaa8fea39a5a300a4171bc6ec4f2", "title": "Averaging Tie-Breaking in Similarity Matrix"}, "1116": {"path": "/applications/T2VLAD/model/metric.py:76-98", "hash": "99801bdb113e341bbbc79aa93cd58d4c", "title": "Efficient Tied Scores Handling"}, "1117": {"path": "/applications/T2VLAD/model/metric.py:100-122", "hash": "277200653294fa8d74157379a249fa69", "title": "Average Rank Calculator"}, "1118": {"path": "/applications/T2VLAD/model/metric.py:124-148", "hash": "b963497b1b1a1011827a42ac7d9ed145", "title": "Retrieval Metric Computation and Validity Checks"}, "1119": {"path": "/applications/T2VLAD/model/metric.py:150-180", "hash": "1ffcf50b2a36772b85d52a3ebb4955ad", "title": "Closest Caption Retrieval Metrics"}, "1120": {"path": "/applications/T2VLAD/model/metric.py:181-199", "hash": "5877b48d9e1543f1eb98539dcf4c3a20", 
"title": "Optimistic or Averaging Caption Ranking"}, "1121": {"path": "/applications/T2VLAD/model/metric.py:200-224", "hash": "bceabb5196d91fe819f29e35b597ecdb", "title": "Matrix Rank Checker: Sanity-checking Code"}, "1122": {"path": "/applications/T2VLAD/model/metric.py:225-243", "hash": "8db44c7c3e25611d6c717ad4e25bc8be", "title": "Ranking Metrics Computation with Matplotlib and Numpy"}, "1123": {"path": "/applications/T2VLAD/model/model.py", "hash": "42cba9bda61cb17077852c1cbc8eb2f6", "title": "Enhanced Video Analysis with CENet"}, "1124": {"path": "/applications/T2VLAD/model/model.py:1-34", "hash": "fcce8d49221e6e380b72b839c8e02f12", "title": "Importing Libraries for T2VLAD Model"}, "1125": {"path": "/applications/T2VLAD/model/model.py:35-66", "hash": "95c5575b95e0937de32feb0f2524901d", "title": "Implementing Mish, Kronecker Product, and NaN Removal Functions"}, "1126": {"path": "/applications/T2VLAD/model/model.py:67-98", "hash": "8ee3d7bad815a14c02b4afe7b284a86a", "title": "NaN Handling in CENet Model"}, "1127": {"path": "/applications/T2VLAD/model/model.py:99-130", "hash": "de205933b09b5428e7249fd5cfbedab2", "title": "Model Initialization and Time Estimation"}, "1128": {"path": "/applications/T2VLAD/model/model.py:131-148", "hash": "b13ffc4cc0f7ec328f28202391ebaea9", "title": "Text Pooling in T2VLAD"}, "1129": {"path": "/applications/T2VLAD/model/model.py:149-179", "hash": "1c92a3009684de03e0ff16a115100ddc", "title": "Transformer Layer Implementation"}, "1130": {"path": "/applications/T2VLAD/model/model.py:180-207", "hash": "033dde6d904e463baf53ffcf6f1527af", "title": "Attention Functions in T2VLAD Model"}, "1131": {"path": "/applications/T2VLAD/model/model.py:208-237", "hash": "41f7deec50601fb0bce7c419f4da9b87", "title": "Transformer Class with Multi-Head Attention"}, "1132": {"path": "/applications/T2VLAD/model/model.py:238-275", "hash": "a4437f3c04314fe58a45bf3ac3a42238", "title": "CEModule Class Definition"}, "1133": {"path": "/applications/T2VLAD/model/model.py:277-297", "hash": "0e7adfb519d98ef147f91cf71373c661", "title": "MOE Model Initialization"}, "1134": {"path": "/applications/T2VLAD/model/model.py:298-323", "hash": "21287acf63eda84bac705ae8ccd7d36d", "title": "Model Initialization and Preparation"}, "1135": {"path": "/applications/T2VLAD/model/model.py:325-350", "hash": "c4dd759345348f0d1d5988abd6cf24aa", "title": "Gated Embedding Units for MOE Computation"}, "1136": {"path": "/applications/T2VLAD/model/model.py:351-374", "hash": "7df3eb6125c455c00f61db1efee64c3f", "title": "Gated Text Embeddings in Model.py"}, "1137": {"path": "/applications/T2VLAD/model/model.py:376-397", "hash": "1bffcb436b9ab9ba91abedf45503d15e", "title": "Multi-Modal MOE Weights and Feature Extraction"}, "1138": {"path": "/applications/T2VLAD/model/model.py:398-422", "hash": "ed733ed50a8841042c5e220f73ddc01a", "title": "Cross-View Video Localization via VLAD and MOE"}, "1139": {"path": "/applications/T2VLAD/model/model.py:423-456", "hash": "08cb18dc82ddd6b9799557e0e0d46d22", "title": "T2VLAD Model Layers Explained"}, "1140": {"path": "/applications/T2VLAD/model/model.py:458-485", "hash": "2475a2c25f51224c92b9b98b2822e2d4", "title": "Sharded Embedding Similarity Matrix Function"}, "1141": {"path": "/applications/T2VLAD/model/model.py:486-507", "hash": "8e0f4f45ca24070970b79970b7be9dd3", "title": "Video-Text Similarity Calculator"}, "1142": {"path": "/applications/T2VLAD/model/model.py:508-526", "hash": "cc2a91311246f737b390ff0daa439bec", "title": "Tensor Weights Combination and Normalization"}, 
"1143": {"path": "/applications/T2VLAD/model/model.py:527-533", "hash": "344603c015a05c4f4a5ae7700e76cddb", "title": "Video-Text Similarity Calculator"}, "1144": {"path": "/applications/T2VLAD/model/net_vlad.py", "hash": "54e0bfe1fe925a0ce5f69ab84914fef5", "title": "NetVLAD in T2VLAD Model Initialization"}, "1145": {"path": "/applications/T2VLAD/model/net_vlad.py:1-33", "hash": "ecf6a3fd8aea9c01b36339616c2ab86d", "title": "NetVLAD Algorithm: Implementation and Parameters"}, "1146": {"path": "/applications/T2VLAD/model/net_vlad.py:34-44", "hash": "5c73a86010505b33e78759e6f757b12f", "title": "Initializing VLAD Model Parameters"}, "1147": {"path": "/applications/T2VLAD/model/net_vlad.py:46-76", "hash": "4976b5e8ab63fc2ef9f5e25da85cea10", "title": "T2VLAD: Sanity Checks and Forward Pass"}, "1148": {"path": "/applications/T2VLAD/model/net_vlad.py:77-99", "hash": "5367835f76c5a2019477364f6ac95b89", "title": "Batch Normalized VLAD Representation Generation"}, "1149": {"path": "/applications/T2VLAD/model/net_vlad.py:100-100", "hash": "5c472512456723e03051d1652085aefe", "title": "VLAD Feature Extraction in NetVLAD Model"}, "1150": {"path": "/applications/T2VLAD/model/text.py", "hash": "c4ed57127843a9ce600cde4d9f04f4e9", "title": "Text Embedding for Video Descriptions"}, "1151": {"path": "/applications/T2VLAD/model/text.py:1-37", "hash": "3d23f4cfed112068bcfed0080a41d53c", "title": "Text Embedding Interface"}, "1152": {"path": "/applications/T2VLAD/model/text.py:38-73", "hash": "65a32dd97eec0d4f6d6025adde87d310", "title": "W2VEmbedding: Text Embedding with Word2Vec"}, "1153": {"path": "/applications/T2VLAD/model/text.py:74-101", "hash": "70295451fc0d1eaa9b29b0c79472ca0a", "title": "Initializing Text2Vec Model Class"}, "1154": {"path": "/applications/T2VLAD/model/text.py:103-130", "hash": "6a208229a744b406ba470a801880a558", "title": "Text Embedding Class"}, "1155": {"path": "/applications/T2VLAD/model/text.py:131-146", "hash": "da2e351839f2974b4f189c00c2401f16", "title": "OpenAI GPT Embedding Tokenizer"}, "1156": {"path": "/applications/T2VLAD/parse_config.py", "hash": "b24109bfd057e2ff10982c0f22ae8dee", "title": "ConfigParser: Config Management & Parsing"}, "1157": {"path": "/applications/T2VLAD/parse_config.py:1-35", "hash": "b94143530daea5c7915afca2d107c9ce", "title": "ConfigParser Class Overview"}, "1158": {"path": "/applications/T2VLAD/parse_config.py:36-62", "hash": "32bb9e8659207b2fe79c03c5cde498e7", "title": "Initializing Argument Parser and Config Loading"}, "1159": {"path": "/applications/T2VLAD/parse_config.py:63-88", "hash": "d82ffa88fadfc1025d7f0021dc066093", "title": "Config-Based Model Saving and Logging"}, "1160": {"path": "/applications/T2VLAD/parse_config.py:89-112", "hash": "4d87b71dd7ebc471ced40c12b84a90f1", "title": "Directory Purging and Recreation in parse_config.py"}, "1161": {"path": "/applications/T2VLAD/parse_config.py:113-134", "hash": "d5439cf2ee4750d25b5437a0fda08c67", "title": "Config Parser and Custom Arguments"}, "1162": {"path": "/applications/T2VLAD/parse_config.py:135-159", "hash": "a8549e62c61961420a2b39acff2b8470", "title": "Config File Processing and Class Initialization"}, "1163": {"path": "/applications/T2VLAD/parse_config.py:160-190", "hash": "0f6bb97e5acda21d3169d38fd75510f9", "title": "Overwriting Check and Config Updates"}, "1164": {"path": "/applications/T2VLAD/parse_config.py:191-232", "hash": "e98c75b3d6965101966c20abf237f4de", "title": "Parse Config Class"}, "1165": {"path": "/applications/T2VLAD/parse_config.py:233-239", "hash": 
"ecb06be395ee5ae6c58125859eacacdc", "title": "Nested Object Access and Modify Functions"}, "1166": {"path": "/applications/T2VLAD/test.py", "hash": "18006b0a05be55a1e15ad4cd6984f0b7", "title": "PaddleVideo: Prediction Compression and Evaluation"}, "1167": {"path": "/applications/T2VLAD/test.py:1-33", "hash": "16b8ca6f84323d52a6c012eda680726b", "title": "Compress Predictions in PaddleVideo Library"}, "1168": {"path": "/applications/T2VLAD/test.py:34-51", "hash": "a143d73ed677a788a078d3e5de025463", "title": "Input Shape Validation: Ensuring Compatibility"}, "1169": {"path": "/applications/T2VLAD/test.py:52-84", "hash": "53adf989c25f81e54493e0bba509a317", "title": "Function for Initializing Paddle.js Model and Data Loader"}, "1170": {"path": "/applications/T2VLAD/test.py:85-116", "hash": "8609245fece6d9bf89accbee9737caf9", "title": "Model Evaluation Initialization"}, "1171": {"path": "/applications/T2VLAD/test.py:117-146", "hash": "77444e194749f905b645d5365fdfc32b", "title": "Paddle Model Initialization and Dataset Preparation"}, "1172": {"path": "/applications/T2VLAD/test.py:147-167", "hash": "d07a45f3ceb68b36d17087b89359451b", "title": "Video Sub-Sample Processing with T2VLAD"}, "1173": {"path": "/applications/T2VLAD/test.py:168-190", "hash": "7e8808123797696ea0cf83137ee4f317", "title": "Metrics Calculation and Logging"}, "1174": {"path": "/applications/T2VLAD/test.py:193-206", "hash": "276d15915986d9010f3c39ede431abe7", "title": "Argument Parsing and Configuration Loading"}, "1175": {"path": "/applications/T2VLAD/train.py", "hash": "a3120c278e9d7b3d5939224cf6dfd3a0", "title": "Video Analysis Model Training Script"}, "1176": {"path": "/applications/T2VLAD/train.py:1-35", "hash": "8db8945237493c7db228dd5837094c24", "title": "PaddleVideo: Training Framework Setup"}, "1177": {"path": "/applications/T2VLAD/train.py:37-67", "hash": "0f4bfc6d1b543597ae301a0da91b6836", "title": "Experiment Initialization Function"}, "1178": {"path": "/applications/T2VLAD/train.py:68-92", "hash": "f69d4415d788ae2499ce103ab9a71956", "title": "Model Initialization and Training Setup"}, "1179": {"path": "/applications/T2VLAD/train.py:93-115", "hash": "30e2f37a5bc795646eec61288a01e928", "title": "Train Model and Save Best"}, "1180": {"path": "/applications/T2VLAD/train.py:116-133", "hash": "bfd8c84e064c4362f6cf7e95f5b332da", "title": "Command-line Arguments for Video Analysis Training"}, "1181": {"path": "/applications/T2VLAD/train.py:135-151", "hash": "9c375171540e27616724573ed68b16ca", "title": "Command-Line Training Setup"}, "1182": {"path": "/applications/T2VLAD/trainer/__init__.py", "hash": "da7ae981fc6fb2eef2419c801abd22c1", "title": "Importing Trainer Functions"}, "1183": {"path": "/applications/T2VLAD/trainer/trainer.py", "hash": "229f18465d70534598724ecbeda53feb", "title": "Memory-Efficient Video Retrieval Trainer"}, "1184": {"path": "/applications/T2VLAD/trainer/trainer.py:1-31", "hash": "08ea4dff6b4c5527ecdb14e08d992200", "title": "PaddlePaddle Video Retrieval Trainer"}, "1185": {"path": "/applications/T2VLAD/trainer/trainer.py:32-66", "hash": "ef9ea08309f0ef2e20a9f3a1b7b7aee5", "title": "Evaluation Samples Duplication and Yielding"}, "1186": {"path": "/applications/T2VLAD/trainer/trainer.py:67-89", "hash": "99b573ec80c06313cc5c9da4683549a1", "title": "Epoch-based Model Trainer Class"}, "1187": {"path": "/applications/T2VLAD/trainer/trainer.py:91-117", "hash": "b0c87ef73d9d4046c96ed8659907257d", "title": "Batch Training with Model Loss Computation"}, "1188": {"path": 
"/applications/T2VLAD/trainer/trainer.py:119-150", "hash": "d10e9a20b23c1a662db0f0e9970d0361", "title": "ML Model Training Loop and Scheduler"}, "1189": {"path": "/applications/T2VLAD/trainer/trainer.py:151-171", "hash": "0636827e35b10fb5925a74961e460c80", "title": "Model Evaluation Initialization"}, "1190": {"path": "/applications/T2VLAD/trainer/trainer.py:172-190", "hash": "5a994262203ec3f306cca838e348d9dd", "title": "Batch Subsampling for Video ML Model"}, "1191": {"path": "/applications/T2VLAD/trainer/trainer.py:191-209", "hash": "6c07277224137a33996b07d0b77dc550", "title": "PaddlePaddle-based Similarity Calculation for T2VLAD Training"}, "1192": {"path": "/applications/T2VLAD/trainer/trainer.py:210-228", "hash": "70d193535e44a1dd50c3ed429ed2f406", "title": "Epoch Metrics Tracking and Visualization"}, "1193": {"path": "/applications/T2VLAD/trainer/trainer.py:229-249", "hash": "c8b7f708b8c2963b2e3cfa08b70aff1a", "title": "Batch-wise Validation Metrics Calculation and Logging"}, "1194": {"path": "/applications/T2VLAD/trainer/trainer.py:250-267", "hash": "784ca49b2b1c8d7fa6904770d13a0bba", "title": "Top-K Metric Implementation"}, "1195": {"path": "/applications/T2VLAD/trainer/trainer.py:268-280", "hash": "913b6f78b9225805d1565e4cd2519378", "title": "Nested Predictions and Progress Functions"}, "1196": {"path": "/applications/T2VLAD/utils/__init__.py", "hash": "9292304a452b87320ec116ec6f752fb0", "title": "Import All from Util Module"}, "1197": {"path": "/applications/T2VLAD/utils/util.py", "hash": "b2d65261df2454c06003d12c73c1584a", "title": "Utility Functions for T2VLAD"}, "1198": {"path": "/applications/T2VLAD/utils/util.py:1-50", "hash": "47d19c0a9ce3a6f7e8aeaf19f6a4a61a", "title": "Utility Functions"}, "1199": {"path": "/applications/T2VLAD/utils/util.py:51-76", "hash": "4157dab36f990f94771495be9fd53dc5", "title": "Multifunctional Memory, Dictionary, and Expert Categorization"}, "1200": {"path": "/applications/T2VLAD/utils/util.py:77-103", "hash": "ca69c723d947979b04c7ff46a5fe6140", "title": "Temporal Expert Management and Utilities"}, "1201": {"path": "/applications/T2VLAD/utils/util.py:106-143", "hash": "6ae12a3e2a362f7a973c68cd163d4159", "title": "JSON, Hashable Dictionaries & Configuration Utilities"}, "1202": {"path": "/applications/T2VLAD/utils/util.py:144-165", "hash": "7728b39610ad75e5f408030d8f41aa00", "title": "Modality Dimensional Organization"}, "1203": {"path": "/applications/T2VLAD/utils/util.py:166-181", "hash": "91d9fb6cf4ee419c35239861fbc14f64", "title": "Determining Input-Output Dimensions for Expert Types and Temporal Methods"}, "1204": {"path": "/applications/T2VLAD/utils/util.py:182-202", "hash": "518a73dee0f02c52f839c291123dfa17", "title": "Dimensional Assignment for Experts: Util.py 182-202"}, "1205": {"path": "/applications/T2VLAD/utils/util.py:203-226", "hash": "9b0c7e05efea445108ab712d359aedf3", "title": "Configuring Expert Dimensions for T2VLAD"}, "1206": {"path": "/applications/T2VLAD/utils/util.py:227-258", "hash": "6a49751c6c2a14e7a10febcdf7810f30", "title": "Dimensionality Adjustment and Tensor Utilities"}, "1207": {"path": "/applications/T2VLAD/utils/util.py:260-284", "hash": "1b5d4df782c5547818ea1c4415abeee8", "title": "Normalize and Convert Image Tensors to Numpy Array"}, "1208": {"path": "/applications/T2VLAD/utils/util.py:285-321", "hash": "7887416340db91a8c1fafbbb27089dbe", "title": "Utility Functions in util.py"}, "1209": {"path": "/applications/T2VLAD/utils/util.py:323-327", "hash": "184145183310aff57127478922574983", "title": "Create Directory If 
Non-Existent"}, "1210": {"path": "/applications/TableTennis/ActionRecognition/README.md", "hash": "acec7b82be7a6a0e6833e2ad9b101678", "title": "Table Tennis Action Recognition with VideoSwinTransformer"}, "1211": {"path": "/applications/TableTennis/ActionRecognition/README.md:1-43", "hash": "5046c43b1f5295d394c19dea0cada764", "title": "Table Tennis Action Recognition with VideoSwinTransformer"}, "1212": {"path": "/applications/TableTennis/ActionRecognition/README.md:44-66", "hash": "859a7d2e62b99e34645ca5ce3a9ba6b8", "title": "TableTennis Action Recognition with VideoSwin"}, "1213": {"path": "/applications/TableTennis/ActionRecognition/README.md:68-98", "hash": "8376fff90e0c0cccdc774b748e3eed39", "title": "Visualizing Predictions in Table Tennis Action Recognition"}, "1214": {"path": "/applications/TableTennis/datasets/script/submission_format_transfer.py", "hash": "c4c132d8a8c380e7727f2761df26779e", "title": "JSON Table Tennis Data Formatter"}, "1215": {"path": "/applications/TableTennis/datasets/script/submission_format_transfer.py:1-49", "hash": "d4f37053be471762b18b4e8b62e8010a", "title": "JSON Frame Rate Conversion and Formatting"}, "1216": {"path": "/applications/TableTennis/datasets/script/submission_format_transfer.py:50-64", "hash": "c854d313ef3f90d2727c759dd43e3fa5", "title": "Table Tennis Submission Format"}, "1217": {"path": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py", "hash": "afc581309dc64fabf5059cfaa5370a2d", "title": "TableTennis Video Inferencer"}, "1218": {"path": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:1-50", "hash": "1e6233115751e8e41f2dfbdeb1e75153", "title": "BMN Model Loader for Baidu Cloud"}, "1219": {"path": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:52-84", "hash": "34aa80592ae1ba2eed2ad3d98123ebed", "title": "Video Feature Extraction and Bmn Prediction"}, "1220": {"path": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:85-93", "hash": "2b51d4e59a36a396b51f9cdb3f9f78c1", "title": "JSON Inference Results Writer"}, "1221": {"path": "/applications/TableTennis/fix_bad_label.py", "hash": "0309e8c6d47d313de7be4a642e9e06b8", "title": "Fix Labels in Table Tennis App"}, "1222": {"path": "/applications/TableTennis/get_instance_for_bmn.py", "hash": "7b499546380e721a79c3620ebf76789b", "title": "BMN Model Ground Truth Data for Table Tennis"}, "1223": {"path": "/applications/TableTennis/get_instance_for_bmn.py:1-48", "hash": "c1e41c7d9acd82ee1c583fed15e8c41c", "title": "BMN Model Ground Truth Generation"}, "1224": {"path": "/applications/TableTennis/get_instance_for_bmn.py:49-74", "hash": "92264ebe973fc45565c7758c52c31207", "title": "Video Action Extraction Algorithm"}, "1225": {"path": "/applications/TableTennis/get_instance_for_bmn.py:75-108", "hash": "67ec69769c223cdebe4f673fafb6babf", "title": "Combile GTS Segments"}, "1226": {"path": "/applications/TableTennis/get_instance_for_bmn.py:109-134", "hash": "7628ac5c2ebcb66ad16e3487df3c9cfc", "title": "Segmenting and Processing Actions List for BMN"}, "1227": {"path": "/applications/TableTennis/get_instance_for_bmn.py:135-154", "hash": "efd58fae9bb839413dcb3104abddbb48", "title": "Randomized Video Segment Selection"}, "1228": {"path": "/applications/TableTennis/get_instance_for_bmn.py:155-182", "hash": "a4a6a8259138cf35e1c1e6785566682e", "title": "Video Data Segmentation and Annotation"}, "1229": {"path": "/applications/TableTennis/get_instance_for_bmn.py:183-207", "hash": "11f578b7757c6022d2210e76f91df01c", "title": "Video Feature 
Extraction and Parsing"}, "1230": {"path": "/applications/TableTennis/get_instance_for_bmn.py:208-227", "hash": "4dbf9651b6afdecc1d447172804151c6", "title": "Table Tennis Dataset Processing and Saving"}, "1231": {"path": "/applications/TableTennis/gts_format_transfer.py", "hash": "d284f5cd9cd43d6f9bdaa2869faa2955", "title": "JSON Format Converter"}, "1232": {"path": "/applications/TableTennis/predict/action_detect/action.py", "hash": "93465fa04de2e7291f1cf9d1bbda094d", "title": "Baidu Cloud Action Detection Script"}, "1233": {"path": "/applications/TableTennis/predict/action_detect/action.py:1-48", "hash": "d9e4552daba842f008fbe6a423bfb410", "title": "Python Action Detection with Baidu Cloud"}, "1234": {"path": "/applications/TableTennis/predict/action_detect/action.py:49-76", "hash": "65a9cc84afce4896e4ecf15787ffef9f", "title": "ModelPredict Class Initialization and Configuration"}, "1235": {"path": "/applications/TableTennis/predict/action_detect/action.py:77-108", "hash": "118e088f07486f7fa21306061319f564", "title": "Action Detection via Multimodal Feature Extraction"}, "1236": {"path": "/applications/TableTennis/predict/action_detect/action.py:109-136", "hash": "20b7553647d253f3effe82939a5e308e", "title": "Video Classification and Feature Extraction Model"}, "1237": {"path": "/applications/TableTennis/predict/action_detect/action.py:137-158", "hash": "4bd0f8600b9a8680c03f6dd73f9f6d0e", "title": "Extracting Image and Audio Features"}, "1238": {"path": "/applications/TableTennis/predict/action_detect/action.py:159-185", "hash": "33e0d6084e94c1b550b07b6c721e2bf7", "title": "Action Detection Model Inference"}, "1239": {"path": "/applications/TableTennis/predict/action_detect/action.py:186-186", "hash": "3c46aa928a7adc5786614f1a19a3914d", "title": "File Data Writing"}, "1240": {"path": "/applications/TableTennis/predict/action_detect/logger.py", "hash": "53f5b3637c8c9c3a071f142122407137", "title": "Custom Logger for Action Detection"}, "1241": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py", "hash": "44df9225891de16d0f36afef6b29ca58", "title": "Audio Features for Table Tennis Prediction"}, "1242": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:1-39", "hash": "b6d9a0e222451c783fbcad8d6b94f263", "title": "Audio Feature Extraction in TableTennis App"}, "1243": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:40-70", "hash": "fc8acc6472950d1663bb0e5258b8aecb", "title": "Audio Feature Extraction with MFCCs"}, "1244": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:71-91", "hash": "7dd5f6fcb5148f8ce60b8c5546ffed76", "title": "MFCC Feature Extraction for Speech Processing"}, "1245": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:92-112", "hash": "0b4a8af6d949ed962117340831e211f1", "title": "Generate Spectrogram from Audio Data"}, "1246": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:113-137", "hash": "8bd2e767aba04511c6a7118dce01ef36", "title": "Mel Spectrogram Conversion and Wav Data Processing"}, "1247": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:138-158", "hash": "b356029775a4bcd3298689f6cd8dc0e0", "title": "Table Tennis Audio Feature Extraction"}, "1248": {"path": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:160-183", "hash": "2fb57c03975e4e4a56340a1d68a8ded2", "title": "MFCC Extraction with VGG-16"}, 
"1249": {"path": "/applications/TableTennis/predict/action_detect/mfcc/model_config.py", "hash": "574c1392fd9221b3a4d9820af61e3c18", "title": "Audio Feature Extraction Model"}, "1250": {"path": "/applications/TableTennis/predict/action_detect/mfcc/vgg_params.py", "hash": "48665dcb4614e556d4cc94b8dcd68918", "title": "Global VGGish Parameters for Action Detection"}, "1251": {"path": "/applications/TableTennis/predict/action_detect/models/audio_infer.py", "hash": "d5268ab941ce39d3589c4911b8d70db7", "title": "Audio Inference with PaddleVideo"}, "1252": {"path": "/applications/TableTennis/predict/action_detect/models/audio_infer.py:1-37", "hash": "2de0cc0fe8f56720f65df4e3c99dec9c", "title": "Audio Model Inference Initialization"}, "1253": {"path": "/applications/TableTennis/predict/action_detect/models/audio_infer.py:39-67", "hash": "5a48db26b2b0a3e73d966f71435741c8", "title": "Audio Inference Class with PaddleVideo"}, "1254": {"path": "/applications/TableTennis/predict/action_detect/models/audio_infer.py:69-78", "hash": "d50fa409ea9c08c993b1c2a011c7719e", "title": "Audio Inferencing and Prediction"}, "1255": {"path": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py", "hash": "798f43b17c8fae303b74c3306aa68e8b", "title": "GPU-Optimized Action Detection"}, "1256": {"path": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:1-39", "hash": "bf2aa376038c4947a9466faa2db2e1b2", "title": "BMN Infer Model Class"}, "1257": {"path": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:40-65", "hash": "d59ee6d035fcca507d91b19e22b78f19", "title": "Inference Model Setup and Generation"}, "1258": {"path": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:66-87", "hash": "ba0792e830b1495d8f5b0348ea644bc7", "title": "Boundary Proposition Generator"}, "1259": {"path": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:88-112", "hash": "2c49fde5b09170481c6e6ebe8355881a", "title": "Boundary-Based Prediction Model"}, "1260": {"path": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:113-135", "hash": "6861da5a3b176f065491c5cb96bc08d3", "title": "Running Average Window Predictions"}, "1261": {"path": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:137-164", "hash": "e708d79ca8b20b2f59d3ac40d13bca10", "title": "BMN Infer Model Prediction"}, "1262": {"path": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py", "hash": "61f49fa4011bf8cf63aea20684365897", "title": "LSTM-Based Table Tennis Action Detection"}, "1263": {"path": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:1-38", "hash": "a68bb081312e1133957919b718e066b9", "title": "LSTM Action Detection Model Inferencing"}, "1264": {"path": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:40-62", "hash": "c01c8f5b71d67d17f6800ca6678e1bff", "title": "LSTM Inferencing Setup"}, "1265": {"path": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:63-92", "hash": "704d2cf4c87c4a3f256c88239f7fa232", "title": "Table Tennis Action Detection Model"}, "1266": {"path": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:93-111", "hash": "3c44b1d03bdba0b51da6ecab2446a214", "title": "LSTM Action Detection Inference"}, "1267": {"path": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:113-136", "hash": "41b58faf8ab11d95addecb538d160f4f", "title": "LSTM-Based Table Tennis Action Detection"}, "1268": {"path": 
"/applications/TableTennis/predict/action_detect/models/lstm_infer.py:138-158", "hash": "d19d2b5f7a5c561a6eabccc3cff4a351", "title": "LSTM Predicts Table Tennis Action"}, "1269": {"path": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py", "hash": "4daba0d8c8459ad1f852dda1f9f04e99", "title": "PPTSM Inference with PaddlePaddle"}, "1270": {"path": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:1-38", "hash": "29a38bfe92cc70fba93f8b35909caeae", "title": "Initialize PPTSM InferModel with PaddlePaddle"}, "1271": {"path": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:40-68", "hash": "fa59ead75b04e90520a231ea316df989", "title": "InferModel Video Frame Inference"}, "1272": {"path": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:69-77", "hash": "530bc4743cca1584885036b45291dca9", "title": "Efficient Image Inference and Prediction"}, "1273": {"path": "/applications/TableTennis/predict/action_detect/reader/__init__.py", "hash": "e80aa10801f2544132dd12590a368447", "title": "Alphabetical Reader Registration"}, "1274": {"path": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py", "hash": "3ebc1f6bf2c5d5afdaafee58d2c6a29a", "title": "BMNINF Reader for Table Tennis"}, "1275": {"path": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:1-49", "hash": "6645052c4a5973d9cd292e1e74285713", "title": "BMNINF Reader for PaddleVideo"}, "1276": {"path": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:50-72", "hash": "fc9447c52d132ee3fd4d402a6266abc4", "title": "BMNINF Reader for Table Tennis"}, "1277": {"path": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:73-103", "hash": "b8ef4c607184227d424fe3fedaa38567", "title": "Video Analysis Dataset Creation Code"}, "1278": {"path": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:104-133", "hash": "5bf98e1c9cbd4b4be997920e649d2871", "title": "CTCN Model Reader for Table Tennis Action Detection"}, "1279": {"path": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:134-154", "hash": "acacfa735c775828f2ab13bb702c5781", "title": "Inference Reader: Iterating and Yielding Batches"}, "1280": {"path": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py", "hash": "ea8814661b1e19367e995adc01a68929", "title": "Table Tennis Action Detection Reader"}, "1281": {"path": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:1-34", "hash": "3df0e1ef9a2b7a18dcfa3192fdc267a0", "title": "YouTube-8M Dataset Feature Reader"}, "1282": {"path": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:36-71", "hash": "4cf086e9919632ac1b6939bfa30c7919", "title": "Table Tennis Action Detector Feature Reader"}, "1283": {"path": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:72-91", "hash": "f90bba92e165376c4bae3c8c23cb9e2e", "title": "Feature Extractor: Table Tennis Action Prediction"}, "1284": {"path": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py", "hash": "1cf7a67375551557c23d8fc74b80d8b8", "title": "Handling Errors in TableTennis Reader"}, "1285": {"path": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:1-33", "hash": "44ea3247f788801708aa83f0c6fab14c", "title": "Reader Error Handling in PaddleVideo TableTennis"}, "1286": {"path": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:34-81", "hash": 
"bec22c02be51f6889c4721ece66356d5", "title": "Video Reader Classes and Registry"}, "1287": {"path": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:82-107", "hash": "e5d5deec66afb3706010717423afa211", "title": "Singleton Reader Registration Utilities"}, "1288": {"path": "/applications/TableTennis/predict/action_detect/utils/config_utils.py", "hash": "3ad548cbc9d705a0ab69a545c42623fe", "title": "Config Utilities for TableTennis"}, "1289": {"path": "/applications/TableTennis/predict/action_detect/utils/config_utils.py:1-47", "hash": "8cbd019486bd442bb0529c84c679d41c", "title": "Config Utils for TableTennis"}, "1290": {"path": "/applications/TableTennis/predict/action_detect/utils/config_utils.py:48-80", "hash": "a6767b4b25cd43b1b7ce37b1f63ad8dd", "title": "Config Utils for AttrDict Manipulation"}, "1291": {"path": "/applications/TableTennis/predict/action_detect/utils/config_utils.py:81-81", "hash": "42b405179529f6092c96ffa42a2ec624", "title": "Context Changer Logging"}, "1292": {"path": "/applications/TableTennis/predict/action_detect/utils/preprocess.py", "hash": "cc39dd0af799de49f7c95e85d75d8198", "title": "Preprocess.py: Video, Audio, Image Toolkit"}, "1293": {"path": "/applications/TableTennis/predict/action_detect/utils/process_result.py", "hash": "9f3fc1e9815b047a61ae087d908fe6f8", "title": "One-Dimensional NMS for Video Analysis"}, "1294": {"path": "/applications/TableTennis/predict/action_detect/utils/process_result.py:1-39", "hash": "da8838534b17f2fb5e8e315f7154fa45", "title": "Video Action Detection Result Calculator"}, "1295": {"path": "/applications/TableTennis/predict/action_detect/utils/process_result.py:42-78", "hash": "6335272fc27ac8777649b51213e9ac80", "title": "Non-Maximal Suppression for Bounding Boxes"}, "1296": {"path": "/applications/TableTennis/predict/action_detect/utils/process_result.py:79-110", "hash": "d16babe93e6d3cfcb734a319454052ba", "title": "Efficient Video Property Processing and Classification"}, "1297": {"path": "/applications/TableTennis/predict/action_detect/utils/process_result.py:111-136", "hash": "e3a6763b8baca5c3e42beb216ea75003", "title": "Sorting Prop Filter Timestamps for Action Detection"}, "1298": {"path": "/applications/TableTennis/predict/action_detect/utils/process_result.py:137-155", "hash": "1b99eac94c589d2843861be0bc101faf", "title": "Non-Max Suppression Result Function"}, "1299": {"path": "/applications/TableTennis/predict/eval.py", "hash": "1fb8e078879c908529973c8216aca7a4", "title": "Optimized Table Tennis Predictions"}, "1300": {"path": "/applications/TableTennis/predict/eval.py:1-41", "hash": "4c2f49595b830a9360e89a2d535607bf", "title": "Loading and Processing Ground Truth Data"}, "1301": {"path": "/applications/TableTennis/predict/eval.py:42-73", "hash": "90ea47c2270434e5b23cc6723bf53624", "title": "Computer Vision Interval Union: Filter Proposals"}, "1302": {"path": "/applications/TableTennis/predict/eval.py:74-108", "hash": "4fbbdce86c407013f4540c9b4b314276", "title": "Converters for Boxes and Labels"}, "1303": {"path": "/applications/TableTennis/predict/eval.py:109-142", "hash": "73d4e0ac365db690a55976471d69a5a2", "title": "Evaluating Model Performance on Video Frames"}, "1304": {"path": "/applications/TableTennis/predict/eval.py:145-166", "hash": "0aa09a25c32f31b52ce68a95f784bf00", "title": "IoU-based Box Evaluation"}, "1305": {"path": "/applications/TableTennis/predict/eval.py:167-186", "hash": "90d8fb3304eaee5dbf4b54f5592e3a09", "title": "Precision-Recall Calculator"}, "1306": {"path": 
"/applications/TableTennis/predict/eval.py:187-210", "hash": "d9a22d379da23324e894209bcb75e82a", "title": "Table Tennis Prediction Model Evaluation"}, "1307": {"path": "/applications/TableTennis/predict/eval.py:212-239", "hash": "d75cfc7d847df2575de724919c9eef98", "title": "Table Tennis Video Analysis Model Evaluator"}, "1308": {"path": "/applications/TableTennis/predict/eval.py:240-270", "hash": "12078adb494ea9f65a0b42ebecf60970", "title": "Table Tennis Prediction Evaluation"}, "1309": {"path": "/applications/TableTennis/predict/eval.py:271-287", "hash": "ef2b51cee2c58030d8a809334b03a443", "title": "Optimizing IOU and Scores for F1 Evaluation"}, "1310": {"path": "/applications/TableTennis/predict/predict.py", "hash": "fa344d2aa345ab1013a9dae21dd11ce8", "title": "TableTennis Video Prediction"}, "1311": {"path": "/applications/TableTennis/predict/predict.py:1-35", "hash": "1e886a2e690f58e1c462a4e6cfd39630", "title": "Video Prediction Setup: PaddleVideo's TableTennis"}, "1312": {"path": "/applications/TableTennis/predict/predict.py:36-36", "hash": "e1e481cfaa70f445ad6d97b8715e623b", "title": "Saving Data to File"}, "1313": {"path": "/applications/TableTennis/val_split.py", "hash": "7e394ef2864eeac0738e68b8db01e749", "title": "JSON Split for Validation and Training Sets"}, "1314": {"path": "/applications/VideoQualityAssessment/README.md", "hash": "1a9aa48f48816c5357cec5b07e025400", "title": "PaddlePaddle 2.1 Video Quality Assessment Model"}, "1315": {"path": "/applications/VideoQualityAssessment/README.md:1-58", "hash": "39e89e979c8985131099444fcf296b99", "title": "Video Quality Assessment Model with PaddlePaddle"}, "1316": {"path": "/applications/VideoQualityAssessment/README.md:59-98", "hash": "9a890456fe343a2680e27ebc9511039b", "title": "Multigpu Distributed Training with PaddleVideo"}, "1317": {"path": "/applications/VideoQualityAssessment/README.md:101-144", "hash": "4f12de5dda7d26d9c920e5477c04d029", "title": "Epoch Analysis and Fine-tuning"}, "1318": {"path": "/applications/VideoQualityAssessment/README.md:145-179", "hash": "f080dff2404216752b99484dc2c6be67", "title": "PaddleVideo: TSM Regression for Video Quality"}, "1319": {"path": "/applications/VideoQualityAssessment/README.md:181-189", "hash": "75130cb72c39490b6667461f19b1a3ed", "title": "Video Quality Assessment with SROCC and PLCC"}, "1320": {"path": "/applications/VideoQualityAssessment/main.py", "hash": "bcb3e1b1dc1a0eaec7bcc4b82f5a0427", "title": "Video Quality Assessment Training and Testing with PaddleVideo"}, "1321": {"path": "/applications/VideoQualityAssessment/main.py:1-30", "hash": "c73533a0c478058cac1ab10abf8be3f1", "title": "Training Models with PaddleVideo"}, "1322": {"path": "/applications/VideoQualityAssessment/main.py:31-52", "hash": "747059b7041a2c0ed51539d9f6930e69", "title": "Command-Line Arguments for Video Quality Assessment"}, "1323": {"path": "/applications/VideoQualityAssessment/main.py:53-88", "hash": "b6a15b732e19caf5d73838c486566ce1", "title": "Command Line Args for Model Training and Testing"}, "1324": {"path": "/applications/VideoQualityAssessment/paddlevideo/__init__.py", "hash": "009ca96f7dacbeed46d40a860ce22cce", "title": "PaddleVideo License and Import"}, "1325": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py", "hash": "271817cba2de0ccb76cfec66932d6a99", "title": "Video Dataset Loader for PaddleVideo"}, "1326": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py", "hash": "180ec5be148951be0f3aeab220a9a0e4", "title": "PaddleVideo Dataset 
Builder and Loader"}, "1327": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:1-33", "hash": "6a8676fcc933b817121eb6543858b9cc", "title": "Video Pipeline Builder in Python"}, "1328": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:34-74", "hash": "07ddf700828dddce60abd032aedf3531", "title": "Video Quality Assessment Dataset and Dataloader Builder"}, "1329": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:75-97", "hash": "a0587a07e07cc25047720791510ea4fd", "title": "DistributedBatchSampler Creation"}, "1330": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:99-126", "hash": "8c2f6dc02d2e0860741c5edaa9fec8e5", "title": "Create and Manage DataLoaders with Signal Handlers"}, "1331": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py", "hash": "96e2d1dc05e5be2416fb58057c19d75c", "title": "Python Video Dataset Module"}, "1332": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py", "hash": "d1de88ed1b4c93c4db41534b7acbbcf6", "title": "Video Dataset Loader Class"}, "1333": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:1-34", "hash": "086b0ea28e215d96ad5cb18c8fe54950", "title": "Base Dataset Class for Custom Loaders"}, "1334": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:36-62", "hash": "64614ea2e7b0a3ca6be7d32b86802a96", "title": "Video Index Loader Class"}, "1335": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:63-83", "hash": "df50ac335ccb3c2ac8d0770c7a11bcf2", "title": "Dataset Class with Train-Test Methods"}, "1336": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py", "hash": "78dd426ff8e79de1a561da847f9dea07", "title": "Frame-Rec Dataset Loader for PaddleVideo"}, "1337": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:1-32", "hash": "9a864fcf32a710f9d99f88929264adf5", "title": "FrameRecDataset: PaddleVideo's Action Recognition Dataset"}, "1338": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:32-62", "hash": "0a79990418aa3806966103dc94b081d3", "title": "Video Index File Loader Class"}, "1339": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:63-88", "hash": "6c8d0413195eb2d8134194b09624e26f", "title": "Frame Dataset Preparer"}, "1340": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:89-110", "hash": "bd5bca0950ad2481b502d66172e142cf", "title": "Retry Loading Frames in Exception Handling"}, "1341": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py", "hash": "8c22f8cc709659c66a9c9a15cedaa2bf", "title": "PaddleVideo Dataset Loader"}, "1342": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:1-32", "hash": "5c1602a2fdc4a7c9deeb75345c68dba0", "title": "Python Video Dataset Loader"}, "1343": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:33-58", "hash": "8df1967a9bc974926e61ae353bc0329c", "title": "Video Dataset Loader Initialization"}, "1344": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:59-81", "hash": "d0866574499c99fb740453ffc52259b7", "title": "Video Dataset Loader: Robust Read, Pipeline, and Testing"}, "1345": {"path": 
"/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:82-95", "hash": "3fc94efffbc0a1c217c4e885288162eb", "title": "Retry-Based Video File Reader"}, "1346": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py", "hash": "1a0d88d0ba100a2c1c8d95c0babcac50", "title": "PaddleVideo Pipeline Modules"}, "1347": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py:1-40", "hash": "263ab0578de7c0c9b638de77b534b2e8", "title": "PaddleVideo Loader Pipelines"}, "1348": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py:41-50", "hash": "1133c8338e9fe3dcdc9b320c1ad0408b", "title": "PaddleVideo Pipeline Modules"}, "1349": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py", "hash": "2c7c06e54c0f6949744437bfb54629b8", "title": "Multi-Scale Image Augmentation in PaddleVideo"}, "1350": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:1-35", "hash": "c97f1607facb00da5dec52b41185002c", "title": "Scale Class for Image Resizing"}, "1351": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:36-61", "hash": "cd5ba2d5e201f70853aa9039162044cf", "title": "Preserve Aspect Ratio Resizing"}, "1352": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:62-95", "hash": "78d06ff1b62db331c0afee3534d871e5", "title": "Random Crop Pipeline for PaddleVideo"}, "1353": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:97-130", "hash": "aefb001e024bc7f3a7b5f7b8050c689f", "title": "Center Crop Image Augmentation"}, "1354": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:131-160", "hash": "7afa4a23d4bd6aa1f617fc181cd1b6dd", "title": "MultiScaleCrop: Image Resizing and Cropping"}, "1355": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:161-192", "hash": "4d951878e37f52f6a2b4dce16bcf0310", "title": "Random Crop Size Sampling"}, "1356": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:193-215", "hash": "e7a77305f997864c5c9264138c71c34b", "title": "Crop Position List Generator"}, "1357": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:216-247", "hash": "ca94e62ad0a32dd644a0b6374a77d4f3", "title": "Random Crop and Flip Augmentation"}, "1358": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:248-281", "hash": "f4a0943211b4466078af2b849aecfb05", "title": "Random Flip and Image Pipeline"}, "1359": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:282-310", "hash": "2e83fef5fe6e85281c12c46e2806c452", "title": "PIL Images to Numpy Array Augmentation"}, "1360": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:311-344", "hash": "fd09330c1ff3f1b44141daed688df323", "title": "Dynamic Image Scaling Augmentation"}, "1361": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:345-370", "hash": "1fbd91e5b97717d733cd686f7db08b1c", "title": "Jitter Resize and Random Scale Augmentation"}, "1362": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:371-403", "hash": "67472f99bd64628f160b52fe9f46401d", "title": "MultiCrop Image Resizer"}, 
"1363": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:404-430", "hash": "398a524543a3d697f4c6a3b3dfc1e7eb", "title": "Random Cropping Augmentation"}, "1364": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:431-452", "hash": "87a4a1197969682ce9958301aee86c8e", "title": "Image Cropping with Random Offsets"}, "1365": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:453-484", "hash": "412eedbf917ffb03e7d734263fd71eef", "title": "Image Crop and Append Function"}, "1366": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:485-498", "hash": "27c91c057786cc824e672702d01da099", "title": "Slower Pathway Frame Selection"}, "1367": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py", "hash": "49e9aa50dbb405cf3ad59c8dac740dd4", "title": "Compose Class for Video Pipeline"}, "1368": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:1-33", "hash": "086ec70eb9d0c62d4d80a00eaab5798e", "title": "Compose Class Pipeline"}, "1369": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:34-61", "hash": "faf28522e72258a688091d5e09e4b597", "title": "Compose Class Sequentially Combines Transforms"}, "1370": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:62-79", "hash": "70dd31918af6130abb0e5ea492de552c", "title": "Video Pipeline Composer"}, "1371": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py", "hash": "e0af7f275b1c2dece16e75c219c271b9", "title": "PaddleVideo: MP4 Decoding and Feature Extraction"}, "1372": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:1-42", "hash": "ff53cc021fa903eb7952cbb2c8d4c106", "title": "PaddleVideo: MP4 Decoding"}, "1373": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:43-80", "hash": "1557a977c8078c819391602e7ab46265", "title": "Multi-Decoder for Data Types"}, "1374": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:81-113", "hash": "f038f329ab08c2481db1c0792b28a764", "title": "Decoding Pipeline for .pkl Files"}, "1375": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:114-139", "hash": "916b219bd6d196a00c6a3e1406f379b5", "title": "Initialize Feature Paddings and Masks"}, "1376": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:140-165", "hash": "7d859e577055785bc23bf463f762503a", "title": "Decode, Dequantize, One-Hot Labels"}, "1377": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py", "hash": "905af2eb67b6a7167bfb79f00589f8ac", "title": "Mixup for Video Quality Assessment"}, "1378": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:1-36", "hash": "d3910f5902bad63002b74ef992f39a1d", "title": "Mixup for Video Quality Assessment"}, "1379": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:37-72", "hash": "daa1d186e315213c5fe151b35265b0f0", "title": "Cutmix: Mixing Images and Labels in Datasets"}, "1380": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:74-91", "hash": "84798e213023ce96a1420a5af2c224c8", "title": "Random Bounding Box Data Augmentation"}, "1381": {"path": 
"/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py", "hash": "bace34aa6d4422dbfa05a945d135949b", "title": "Custom Sampler for PIL-based Video Frames"}, "1382": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:1-32", "hash": "b9e91da9a785d568b8efa5830bdbb507", "title": "Sampler: Efficient Video Frame Sampling"}, "1383": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:33-70", "hash": "b599d5af712631fa4ecca48d872496f4", "title": "Sample Pipeline Class Definition"}, "1384": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:71-96", "hash": "4ada42d1ea9befba36d48e370d473c44", "title": "Video Frame Index Calculator"}, "1385": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:97-102", "hash": "3fd943ce1f02fee8f84d4b4c6ecf5c4e", "title": "Frame Indexing in Video Loader"}, "1386": {"path": "/applications/VideoQualityAssessment/paddlevideo/loader/registry.py", "hash": "a312ad6f5e2761b580b30bf084fa90e5", "title": "PaddleVideo Registry Management"}, "1387": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py", "hash": "b30dbab3c2a0b1aaa73ae2fc3ddf1bd9", "title": "Video Quality Assessment Metrics Initiation"}, "1388": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py", "hash": "fd19f46e4be632e3ce4d6bee345ca315", "title": "BaseMetric: Foundation for Video Quality Metrics"}, "1389": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py:1-36", "hash": "2645be2267d7aaac86a722cbbcd2b36b", "title": "Base Metric Class for Video Quality Assessment"}, "1390": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py:37-39", "hash": "eb8959a36b1d82b9eb0028be013e0c81", "title": "Base Class Accumulate Method"}, "1391": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/build.py", "hash": "1335e2fdd322045e27c0d41ba098206d", "title": "PaddleVideo Metric Builder"}, "1392": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py", "hash": "cabb18365957ed89a5c56fc9aba4c922", "title": "Pearson and Spearman Correlation Metric"}, "1393": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:1-35", "hash": "fcbba15682443df0cd1d6ea219fc0bb1", "title": "Video Quality Metric Class Definition"}, "1394": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:36-62", "hash": "3080da7ba4b21ba59d5327ed2c930c70", "title": "Pearson and Spearman Correlation Calculator Class"}, "1395": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:64-72", "hash": "bf880ab1e34b72c5a7404e34a002a316", "title": "Calculate PLCC and SROCC from Output and Label Pair"}, "1396": {"path": "/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py", "hash": "ee90d4e36fd327578b4c5eaa81f0d350", "title": "PaddleVideo Metrics Registry"}, "1397": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py", "hash": "9056ed690005965067f2a6923f12b2e8", "title": "PaddleVideo Modeling"}, "1398": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py:1-24", "hash": "a3eb8d9db754ae9555befe158ff533a4", "title": "Universal Model Registration"}, "1399": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py:26-45", "hash": "c2b4acc3dd0c8ce53f678bb90c869e29", "title": "Video Model Building 
Toolkit"}, "1400": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py", "hash": "2b46a96824ad00d7b43582d740157e48", "title": "ResNet and ResNetTweaksTSM Imported"}, "1401": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py", "hash": "88da5f08cb869a57f68574aeb6efd71c", "title": "Dynamically Configurable ResNet Backbone"}, "1402": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:1-35", "hash": "29c7f811707199d730fa146bd0caf284", "title": "ConvBNLayer: PaddlePaddle Backbone Customization"}, "1403": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:36-58", "hash": "97d4171e438232b0f369ad51aec68742", "title": "ConvBNLayer Class Definition"}, "1404": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:59-89", "hash": "02d19295275c95ba854e76609e6ebcc5", "title": "ResNet Convolutional Layer Design"}, "1405": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:90-113", "hash": "052eb1da6d39995bd7d9d824850a3506", "title": "Bottleneck Block Construction in PaddleVideo"}, "1406": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:114-146", "hash": "640c6b62f05ccc2eb5d6523cbc90b3f1", "title": "Convolutional Neural Network BasicBlock"}, "1407": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:147-174", "hash": "a3917b6aa490d7c5262ce6ee592006ec", "title": "ResNet Forward Function Definition"}, "1408": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:175-210", "hash": "e1937759298b445a3829d5d17ebde5ce", "title": "ResNet Backbone Creator"}, "1409": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:211-232", "hash": "a6d6840f4092581f601c377f966fe365", "title": "ResNet Model Definition"}, "1410": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:233-252", "hash": "ec22b724f69bc25b9e31ce03cf09073a", "title": "Dynamic ResNet Model Architecture"}, "1411": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:253-270", "hash": "0d2fb4a6b482488f91d4bcd1c4b41030", "title": "Backbone Model Weight Initialization"}, "1412": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:271-290", "hash": "3e3bd9982244fd36e6f27389fc54b973", "title": "Forward Pass for Backbone: Conv and Pooling"}, "1413": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", "hash": "654222887c82712c6619466c3323bcef", "title": "TSM-ResNet-C Model"}, "1414": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:1-34", "hash": "dff2e1f3a22049c42aa0366df4b8bc5b", "title": "ConvBNLayer: Convolutional BatchNorm Layer"}, "1415": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:36-60", "hash": "9cce8b484957dca98ca62602fa50b490", "title": "ConvBNLayer: Batch Normalization and Activation Layer"}, "1416": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:61-85", "hash": "29c2cf0e18cea7df1106d2d8ee98d22b", "title": "TSM ResNet Backbone Definition"}, "1417": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:86-114", "hash": 
"b83c9c06d6a6a203c4bf440c7f32b880", "title": "Bottleneck Block in ResNet Model"}, "1418": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:115-140", "hash": "e26511fc4e07ec8ba4103f3d01999320", "title": "Custom ResNet-D Layer with Optional Pooling"}, "1419": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:141-170", "hash": "629665bb179f2a478041e4525d96454c", "title": "TSM Convolutional Block with Shortcut Connection"}, "1420": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:171-206", "hash": "70104addbf817ecd98cc5a5ce1ca21c3", "title": "ResNet TSM Backbone with Shortcuts"}, "1421": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:207-235", "hash": "8902043e6020fb88ac54c269874102c0", "title": "ResNetTweaksTSM Instance Initialization"}, "1422": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:236-258", "hash": "354bb1e5446cb48e08311b1674d931cb", "title": "TSM-ResNet Backbone Definition"}, "1423": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:259-278", "hash": "622665f93d970b95323f07a1ccf0e616", "title": "TSM ResNet Model Builder"}, "1424": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:279-297", "hash": "9b5a944e27e8c1f2a255cff6b5c0ec7c", "title": "ResNet Weights Initialization"}, "1425": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:297-317", "hash": "c2e8e9663c3a91ec54b0314062bcd8ca", "title": "Initializing Backbone for Video Quality Assessment"}, "1426": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:318-328", "hash": "7873b1f99655b51417906aa23b7c23f8", "title": "ResNet-C Backbone for Video Quality"}, "1427": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py", "hash": "0d7be2035e3789c744dcd3d6f3a6d819", "title": "Computer Vision Model Builders"}, "1428": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py:1-36", "hash": "6ff4abd0b1e4fe47956f9be0b2c1e28d", "title": "Building Computer Vision Model Components"}, "1429": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py:39-52", "hash": "de08fe385e3c808abc1bdecbe6240c14", "title": "Building Localizer and Model Functions"}, "1430": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py", "hash": "df1e5d9adf1eaa5b6f6ee69f0f19103e", "title": "Importing Recognizers for Video Modeling"}, "1431": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py", "hash": "d3c62b0746c8599c92b1ed960d7469fc", "title": "Importing Recognizers in Video Framework"}, "1432": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py", "hash": "25bbbe5d2d22984ec79e5483835aadfa", "title": "Base Recognizer Class for PaddleVideo"}, "1433": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:1-38", "hash": "8d1b92020590839a99fdea989e629010", "title": "Base Recognizer: Train, Validate, Test Init"}, "1434": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:39-75", "hash": "99df2497584dad339a865c644a0e07ed", "title": "Base Model 
Recognizer Initialization"}, "1435": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:76-97", "hash": "6f9129d3f977e33db92de221f0fa95ff", "title": "Abstract Recognizer Model Base Class"}, "1436": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py", "hash": "fce895b58badfb24a396eceb6f3f0695", "title": "Training Recognizer2D for 2D Models"}, "1437": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py:1-29", "hash": "c4ad2e041c612c3a4989d1a1fc2802d7", "title": "PaddleVideo's 2D Recognizer Model Training"}, "1438": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py:31-52", "hash": "08043deddbd1aed5fe29d50b7f2c4ac2", "title": "Recognizer2D: Validation and Test Methods"}, "1439": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py", "hash": "ff93c75b0c0e5f936c1c621f1c61607c", "title": "PaddleVideo Heads for Video Quality Assessment"}, "1440": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py", "hash": "792596d1dacf82b8316c6835fe8a14b0", "title": "VideoQualityAssessment BaseHead Class"}, "1441": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:1-36", "hash": "57235a6c5fcd6b43a0117af622f36f10", "title": "PaddleVideo BaseHead Class Definition"}, "1442": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:37-67", "hash": "d2223b844526405901daddb77c7d8953", "title": "BaseHead: Initializing PaddleVideo's Head Network"}, "1443": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:69-96", "hash": "26591804bd8e2c60fb45a7ad0c59593f", "title": "VideoQualityAssessment Base Head Definition"}, "1444": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:97-120", "hash": "3cdb297b9155fda69049275cda3f8183", "title": "Label Smoothing Loss Calculator"}, "1445": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:121-143", "hash": "3b5b15513609f6a6080abf448668ded0", "title": "Label Smooth Loss and Accuracy Functions"}, "1446": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py", "hash": "9c63f670c0a9f33cba293b97d6ac743b", "title": "TSM Recurrent Head: TSN-Based Classifier for Video QA"}, "1447": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:1-33", "hash": "e09feb4a425dc34f75d5d64dddc22e9b", "title": "TSM RecHead: TSN-Based Classifier for TSMs"}, "1448": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:34-62", "hash": "7cce04a11d600c4b9dd009337b19f9a8", "title": "Uniform Weights Initialization for FC Layer"}, "1449": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:63-91", "hash": "06d5bee6a82895cdcb7e4d3bceb3a5cd", "title": "TSM Recognition Head Definition"}, "1450": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:92-122", "hash": "5260c180aa2166cd3b4b295fec1ccff1", "title": "Loss Function for Score Prediction Model"}, "1451": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:123-149", "hash": "29176d771b6523864abd738378d1b1c1", "title": "Loss Calculation for TSM-REC Head"}, "1452": {"path": 
"/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:150-153", "hash": "bb6f256ebcda3f004cbfdcc52ecbb609", "title": "Squeeze and Label Smooth Loss Calculation"}, "1453": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py", "hash": "bfb31dc7ad91ac72ae410aeeb44bce28", "title": "TSN Head: Video Quality Assessment"}, "1454": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:1-31", "hash": "2034b3e9285f8322145b405979c67180", "title": "TSN Head: PaddlePaddle Video Quality Assessment"}, "1455": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:32-64", "hash": "dd5e8b038a74fe3636b88110bfc50116", "title": "Image Classification Head with GAP and Dropout"}, "1456": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:65-96", "hash": "ff95d076590704fadc38d28799c407b5", "title": "Forward Pass Function for Neural Network Head"}, "1457": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:97-97", "hash": "aee4831a79739a3e74b2745cd96e5a0a", "title": "Return Calculated Score"}, "1458": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py", "hash": "89d694f7ba12e86564cdb7cfa60b4d39", "title": "PaddleVideo Loss Functions"}, "1459": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py", "hash": "8fcff239ea631b0edc04c334fa9c124d", "title": "Base Loss Function in PaddleVideo"}, "1460": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py:1-33", "hash": "ac9b38b3d243566dfdd27ed8c6104f5c", "title": "Base Loss Function in PaddleVideo"}, "1461": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py:34-51", "hash": "515d71faec4f3a82fef7edeff0d495c0", "title": "Abstract Loss Function Base Class"}, "1462": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py", "hash": "6630bf556d88f5cff1dbb07f520caadf", "title": "L1 Loss for Video Quality"}, "1463": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py:1-33", "hash": "834cd3b336a62f15f9ec7ec299c8ab08", "title": "L1 Loss for Video Quality"}, "1464": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py:34-38", "hash": "56ab626d7b2ac32cfd6a761e8876a772", "title": "L1 Loss Calculator"}, "1465": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py", "hash": "d3c21bd708dd0dc7f38ef72523e5be33", "title": "Custom SmoothL1 Loss in Video Quality Assessment"}, "1466": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py:1-33", "hash": "779cc57e98eec524a4612720b9142159", "title": "Custom Smooth L1 Loss Function"}, "1467": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py:34-39", "hash": "7ecef77c2631770efa5225f0bb741628", "title": "Smooth L1 Loss Calculation"}, "1468": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py", "hash": "2239c3a5ce78d21d40203d506804d79e", "title": "Model Registry in PaddleVideo's Video Quality Assessment"}, "1469": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py", "hash": "f936096a9957982375e000db703092ec", "title": "Weight Initialization in PaddlePaddle"}, "1470": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py:1-36", 
"hash": "d3026a66e0467d65d1d51f0a52e2b1d7", "title": "Weight Initialization in PaddlePaddle Layers"}, "1471": {"path": "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py:37-55", "hash": "a75ee2a521bbdb217d0bcc73ee08bdeb", "title": "Neural Network Weight Initialization"}, "1472": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py", "hash": "ca932b3b14098f24391c0376697f2272", "title": "Video Quality Assessment Optimizer"}, "1473": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py", "hash": "63e95afb7c32f70b3905376de20f52b3", "title": "Custom Learning Rate Schedulers for PaddleVideo"}, "1474": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:1-33", "hash": "2ee153163b3a22639123e012730fb230", "title": "CustomWarmupCosineDecay Scheduler"}, "1475": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:34-55", "hash": "e58b82c334cca50977e08a743364c1df", "title": "Customizable Learning Rate Scheduler"}, "1476": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:56-81", "hash": "1f2402ff7597771c1c723f285cbfed57", "title": "CustomWarmupCosineDecay: Cosine Decay Learning Rate Optimizer"}, "1477": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:82-108", "hash": "af75182f1f399acaa299107cd3e3e234", "title": "Warmup-Cosine LR Scheduler for Video Quality"}, "1478": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:109-134", "hash": "05276312e44d8a5dc809800c13890c0c", "title": "Customizable Learning Rate Scheduler"}, "1479": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:135-160", "hash": "3971fde25375d3b95c1c22bd67d83e51", "title": "Custom Learning Rate Scheduler for Optimizers"}, "1480": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:161-196", "hash": "13e4c2d4850010851ac691fb3eb33aea", "title": "Custom Learning Rate Scheduler for PaddleVideo"}, "1481": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:197-201", "hash": "a4060b42ce0f1d678e644e0d28e41d5f", "title": "Custom Learning Rate Scheduler"}, "1482": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py", "hash": "1149ca3e17e9af94d457bc19cac3ef0a", "title": "VideoQualityAssessment LR Scheduler"}, "1483": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py:1-33", "hash": "f880fcd799f1820615ed596b48d61ac9", "title": "Learning Rate Scheduler Builder"}, "1484": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py:35-49", "hash": "43646cfc2599bf675f80b36277bd8779", "title": "Learning Rate Configurer"}, "1485": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py", "hash": "59472e93ced58a78ac0b8141922aec16", "title": "Weight Decay Optimizer Scheduler"}, "1486": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:1-36", "hash": "a71bd772c29ebb9b3a15b53b82a1d439", "title": "Configurable Optimizer Builder\n\nTitle within 3 to 7 words: Configurable Optimizer Builder"}, "1487": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:37-68", "hash": "c002bd389147b64f6486074e0ee6a7a7", "title": "Adam Optimizer with L2Decay and L1Decay Regularization"}, "1488": {"path": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:69-79", "hash": "6b5f1ef93f35a74006b352bb9d6a0b33", "title": "Weight Decay Optimizer 
Configurator"}, "1489": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py", "hash": "30445fa3b5e2b3ea1cebddbcec743d69", "title": "Training and Testing Functions"}, "1490": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py", "hash": "1980ce1b7e2d15f94bab6854e2add6df", "title": "Paddle Video Testing with Multi-Card Datasets"}, "1491": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:1-35", "hash": "4c690aaf19ac77c9310f28a166280cc0", "title": "Test Model Using Paddle Framework"}, "1492": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:36-66", "hash": "a6c3fd44b716042e12e5474c153d393a", "title": "Multi-Card GPU Model Training"}, "1493": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:67-78", "hash": "bd8b36851fc5935e13d81d288b57902d", "title": "Batch Size, Metric Building, Iteration & Accumulation"}, "1494": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py", "hash": "e0db0db8f80e051bb038fe5155ad327b", "title": "Efficient Video Quality Assessment Training with PaddleVideo"}, "1495": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:1-28", "hash": "fd2c94dad2538f61872e2df993678478", "title": "Video Quality Assessment with PaddleVideo"}, "1496": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:29-61", "hash": "91bec0655f7b97b7ab9aa38c2829f4f2", "title": "GPU-Accelerated Model Training"}, "1497": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:62-89", "hash": "5c5cff0b795476c5caebea4f42f8bda1", "title": "Efficient Model Training Setup"}, "1498": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:90-113", "hash": "8a86e4562395988e2db32b9aade6d546", "title": "Resume Training with Data Loader and Optimizer"}, "1499": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:114-147", "hash": "782922fb64dd8cf18c6a3798c705eb5a", "title": "Training Model Iteration"}, "1500": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:148-171", "hash": "cb7d3a5bb1b4aaa1ad36a75a98d9b9df", "title": "Model Training Step in Video Quality Assessment"}, "1501": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:173-198", "hash": "e25d32b55af7d086e2df5985f1ee2c0a", "title": "Backward Propagation Optimizer"}, "1502": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:199-229", "hash": "3f753dc07d986c56e6fdd56b183eebe3", "title": "Training Video Quality Model: Update Learning Rate"}, "1503": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:230-254", "hash": "a8a7945444fbd460ce62c210257cf134", "title": "Validation Metrics Updater"}, "1504": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:255-276", "hash": "41c15f3d83bc2f0ed29e7d108becf923", "title": "Optimizer and Model State Saver"}, "1505": {"path": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:278-295", "hash": "32f3da58eb7eb661ca7711aa95e32dcf", "title": "Model Validation and Saving in Training Process"}, "1506": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py", "hash": "89548a651b0a623fb6c8aa5fc54690c6", "title": "PaddleVideo Utilities Module"}, "1507": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py", "hash": "3c23259bbff56c40854c5425b7845402", "title": "Python Config Module Builder"}, "1508": {"path": 
"/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py:1-30", "hash": "1e2d80d516362e050f59245618913ea6", "title": "Build Module from Config"}, "1509": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py:32-36", "hash": "79fe22bf5c9b7bfeb4cc938b6364c21a", "title": "Retrieve and Instantiate Object Classes from Registry"}, "1510": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py", "hash": "6a38c1183db117403f01c4664518a54b", "title": "Config Management in PaddleVideo"}, "1511": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:1-35", "hash": "22d6c733e5f058eec90ce1aea770f855", "title": "PaddleVideo Config Class & Logger"}, "1512": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:38-71", "hash": "c76364d656b8a1d72c26abe604ae8e9b", "title": "Config Parser Functions"}, "1513": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:72-110", "hash": "98ef66a21b2a7e489234e21ae2e7896f", "title": "Config File Utilities"}, "1514": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:111-142", "hash": "5912b06a97e117f2a7befd0b98ca8cdc", "title": "Config Override Function"}, "1515": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:143-174", "hash": "a1cf7d107eabc62aa75b6beea96d93ef", "title": "Config Override Function"}, "1516": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:175-180", "hash": "8df73a0a96389348ae5a4618a79fa15d", "title": "Config Parser and Validator"}, "1517": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py", "hash": "e0cdddedb9226731605f52ed2cd5471c", "title": "Distributed Computation Utilities"}, "1518": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py:1-35", "hash": "63eb0ff6b978aa6becd2f040b980ca74", "title": "Distributed Computation Utilities"}, "1519": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py:36-36", "hash": "0066016f4fdbaa655a182e5e20bcc486", "title": "Returning Modified Objects"}, "1520": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py", "hash": "3ea64a782c24febea31215b3ebebc7ec", "title": "Distributed Logger for PaddleVideo"}, "1521": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:1-40", "hash": "aac067a414805c913dec4f49af497869", "title": "PaddleVideo's Logger Class"}, "1522": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:41-74", "hash": "c3b4f2ab38310801767f75057ffb1179", "title": "Initialize PaddleVideo Logger"}, "1523": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:75-103", "hash": "0bb4d1a06835c61f9817ed654fe47bb0", "title": "Distributed App Logging Config"}, "1524": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:104-117", "hash": "97fdeca9a1820437c900970ed33104de", "title": "Setup and Retrieve Logger"}, "1525": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py", "hash": "5ce109fc47bbbfaa109bdd6798af4c1c", "title": "Precise Batch Normalization Update"}, "1526": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:1-31", "hash": "7d272fc737f40bb801d1fcd09793a367", "title": "Precise Batch Normalization for PaddleVideo"}, "1527": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:32-55", "hash": "901cbb8af2cb1fdd7c1f380bf28b031f", "title": 
"Precise BN Stats for Improved Validation"}, "1528": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:56-82", "hash": "b2972dd7691b867d78733e12dddff771", "title": "Precise Batch Normalization Accumulation"}, "1529": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:83-84", "hash": "c89ad2834c8b121f0aace6971731d763", "title": "Dynamic Batch Normalization Update"}, "1530": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py", "hash": "c56fa3f4c0b16e2946c85537d0a64a5b", "title": "Training Metrics Logger"}, "1531": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:1-29", "hash": "0f90873be9196f562840b850ecfe0b6a", "title": "Building Record List for PaddleVideo"}, "1532": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:30-51", "hash": "9951931d31102c7281b0cb93cb154773", "title": "Building Record List for Metric Tracking"}, "1533": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:52-90", "hash": "aa2f6a4cad65f0b86129724c21cf67ce", "title": "Record List and Average Meter Definition"}, "1534": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:91-115", "hash": "6d576b4ccd9f3e631d0d6cc84a721680", "title": "Batch and Epoch Metric Logger"}, "1535": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:117-122", "hash": "9778ef7d726f3a7398ae952261d3263d", "title": "Epoch Logger with Color Coding"}, "1536": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py", "hash": "347c71f8273cfc36338fc2dd3d7ac8fc", "title": "Registry-Based Module Customization"}, "1537": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:1-35", "hash": "0b950c7011f77e0bb9101deedc14227a", "title": "Registry for Customizable Modules"}, "1538": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:37-72", "hash": "542715d8799b825d83813f274a200495", "title": "Registry Class for Building Modules"}, "1539": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:73-98", "hash": "b38e5b62d02dfa758856b3904f8b46b1", "title": "Registry Class and Function Registration"}, "1540": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py", "hash": "3ca01f4097422a347e1332844b4f7eb8", "title": "Save/Load Weights Utilities"}, "1541": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:1-37", "hash": "7cf10193c2440861e64380a62d080d8e", "title": "Load Checkpoint for Video Quality Assessment"}, "1542": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:38-63", "hash": "ddf7120aa09b20df86b93bd2d215f387", "title": "Loading Checkpoint Weights in Paddle"}, "1543": {"path": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:64-87", "hash": "4df47b949ade28584e84d3229bc617fb", "title": "Paddle Save, Load, and Create Directory Functions"}, "1544": {"path": "/applications/VideoQualityAssessment/paddlevideo/version.py", "hash": "95ac075b24d8b9990f61532a65a4a733", "title": "PaddleVideo Version Info"}, "1545": {"path": "/applications/VideoQualityAssessment/run.sh", "hash": "c6186ae37117ed34988d05dd4f1b5d31", "title": "TSM Model Training in PaddlePaddle"}, "1546": {"path": "/applications/VideoQualityAssessment/run.sh:1-19", "hash": "b7adf6b24dcb5098078eb0850ae634f6", "title": "CUDA-PaddlePaddle Shell Script"}, "1547": {"path": "/applications/VideoQualityAssessment/run.sh:20-20", 
"hash": "44a6089bdd9bd9ff7f8ee641f5c722aa", "title": "Custom Predict Model with Disabled Benchmarking"}, "1548": {"path": "/applications/VideoQualityAssessment/save_model.sh", "hash": "46996602fb17596940c2c78a93876a40", "title": "Export Best Model for Video Quality Assessment"}, "1549": {"path": "/applications/VideoQualityAssessment/setup.py", "hash": "d81b40bd9b3b882ae68b3e27e0ed077a", "title": "PaddleVideo: Video Analysis Toolkit"}, "1550": {"path": "/applications/VideoQualityAssessment/setup.py:1-34", "hash": "f3f41a07826de5c4f626256302ff01fa", "title": "Setting Up PaddleVideo Package"}, "1551": {"path": "/applications/VideoQualityAssessment/setup.py:35-56", "hash": "fe67522ff5cc9e941969a7f0d5e5a100", "title": "ppvideo: PaddlePaddle-Based Video Package Setup"}, "1552": {"path": "/applications/VideoQualityAssessment/setup.py:57-57", "hash": "4d9f2430b04cc15798a6e89028e52bfa", "title": "Creating Empty Tuple"}, "1553": {"path": "/applications/VideoTag/FineTune.md", "hash": "53c69f401eff3719b9cfa3325631f347", "title": "Fine-Tuning VideoTag: AttentionLSTM & TSN"}, "1554": {"path": "/applications/VideoTag/FineTune.md:1-32", "hash": "6cb43bd2846a2528a62490cc5ca0456c", "title": "Fine-Tuning VideoTag Models"}, "1555": {"path": "/applications/VideoTag/FineTune.md:34-81", "hash": "da4e2e2f5fb851708cf5b181ad5800d8", "title": "TSN Features Extraction and AttentionLSTM Fine-tuning"}, "1556": {"path": "/applications/VideoTag/FineTune.md:83-113", "hash": "f558887f1d2cb2329ec2dcd8141c8d5c", "title": "Fine-tuning AttentionLSTM in VideoTag"}, "1557": {"path": "/applications/VideoTag/FineTune.md:115-152", "hash": "35f2349f0e6bb7e36ae118d6cdead774", "title": "PaddleVideo Fine-Tuning Guide"}, "1558": {"path": "/applications/VideoTag/FineTune.md:153-188", "hash": "cfd693f6a8b34d4c96b105bb5a670559", "title": "TSN Model Training, Evaluation and Prediction"}, "1559": {"path": "/applications/VideoTag/FineTune.md:190-206", "hash": "0091aeddce8735626702687e8c386496", "title": "Preparing Data for TSN and AttentionLSTM"}, "1560": {"path": "/applications/VideoTag/README.md", "hash": "affe37ce7d65bfec8881a2e388f73817", "title": "Large-scale Video Classification with PaddlePaddle"}, "1561": {"path": "/applications/VideoTag/Run.md", "hash": "16f6ca4fc2c2427b1c07c868120672cc", "title": "VideoTag App Installation and Usage"}, "1562": {"path": "/applications/VideoTag/Run.md:1-54", "hash": "7b9fdc303dcfd2b32e0be23c1dd83c28", "title": "Install and Prepare Data for VideoTag"}, "1563": {"path": "/applications/VideoTag/Run.md:55-105", "hash": "098acda81a4232f8b75f10e7fbd6bbe6", "title": "Video Tag Testing Guide"}, "1564": {"path": "/applications/VideoTag/Run.md:106-109", "hash": "f5c9f264a7049c309da734c7a26487a9", "title": "Video Classification Dictionary"}, "1565": {"path": "/applications/VideoTag/Test.md", "hash": "134ad72acf76fe346ff0fd6435b25c52", "title": "VideoTag Testing Guide"}, "1566": {"path": "/applications/VideoTag/eval.py", "hash": "d716750360b7ba61b55fb51d87fae2e8", "title": "PaddlePaddle Evaluation Setup"}, "1567": {"path": "/applications/VideoTag/eval.py:1-33", "hash": "ec2d061046f2bd6f71351234870c848f", "title": "Setting Up PaddlePaddle Application Environment"}, "1568": {"path": "/applications/VideoTag/eval.py:34-64", "hash": "2c988cc4a458d34b58c49d381a3d5ebd", "title": "Command Line Argument Parser Function"}, "1569": {"path": "/applications/VideoTag/eval.py:65-95", "hash": "11663a7f4a610f55659db484cb33fcb3", "title": "Test Model Evaluation Function"}, "1570": {"path": "/applications/VideoTag/eval.py:96-122", 
"hash": "c2c3ef36416a1c958328e6c4cae752d0", "title": "Batch-by-batch Model Evaluation and Metrics"}, "1571": {"path": "/applications/VideoTag/eval.py:123-134", "hash": "c5768bd1fd4e616bd7fd675a7df58466", "title": "Automating Paddle Test Metrics and GPU Checks"}, "1572": {"path": "/applications/VideoTag/metrics/__init__.py", "hash": "72caeea884f99f4ccb4f5b1b35f6c4ff", "title": "Import Metrics Function for Video Analysis"}, "1573": {"path": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py", "hash": "677ea02c9627937841eeaa2073f86c50", "title": "AccuracyMetrics Calculator"}, "1574": {"path": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:1-34", "hash": "56c84620c2edf55bd93d1ce80e31e2fb", "title": "PaddleVideo MetricsCalculator Class"}, "1575": {"path": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:35-62", "hash": "ca89daa2cbbfc876f668b70d653bb8a1", "title": "Accuracy Metrics Computation"}, "1576": {"path": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:63-90", "hash": "12a799794e81385f27c1a481f6000d44", "title": "Average Loss and Accuracy Metrics"}, "1577": {"path": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:92-107", "hash": "92bea9f41accc85cc4804ec900d337c6", "title": "Top-K Accuracy Metric"}, "1578": {"path": "/applications/VideoTag/metrics/metrics_util.py", "hash": "98671b37115c597b8648fa8946aa8749", "title": "Video Metrics Evaluator Class"}, "1579": {"path": "/applications/VideoTag/metrics/metrics_util.py:1-33", "hash": "ff37a28056d8b536b7a384938c1114ef", "title": "Video Metrics Evaluation Utilities"}, "1580": {"path": "/applications/VideoTag/metrics/metrics_util.py:34-69", "hash": "65afe446a499a98d2440358f98ce0da2", "title": "Youtube8m Metrics Calculation"}, "1581": {"path": "/applications/VideoTag/metrics/metrics_util.py:70-90", "hash": "cd42ec15caf0140dfb4699f0a2a4fb47", "title": "Accumulating Metrics for Video Tagging"}, "1582": {"path": "/applications/VideoTag/metrics/metrics_util.py:92-113", "hash": "53b26f42aebc8c11dcd7973ed5d57d87", "title": "VideoTag: Logging Final Results for Each Video"}, "1583": {"path": "/applications/VideoTag/metrics/metrics_util.py:114-135", "hash": "932a19611d4f6d8411b0e769ffcb2c37", "title": "Video Tagging Metrics Calculator"}, "1584": {"path": "/applications/VideoTag/metrics/metrics_util.py:138-163", "hash": "664512b7281a6c996558b79d63a39906", "title": "Kinetics400 Metrics Calculator"}, "1585": {"path": "/applications/VideoTag/metrics/metrics_util.py:164-187", "hash": "f26bc5bcf650ef978704de6e46ccdaf4", "title": "Evaluate Video Predictions and Losses"}, "1586": {"path": "/applications/VideoTag/metrics/metrics_util.py:188-210", "hash": "62784b4d5d32f32731562f9771d72727", "title": "Infer Results Printer Function"}, "1587": {"path": "/applications/VideoTag/metrics/metrics_util.py:212-237", "hash": "91a786c642233a3b9ebe526dae10ac45", "title": "Metrics Utilities: Save, Calculate, and Log"}, "1588": {"path": "/applications/VideoTag/metrics/metrics_util.py:238-278", "hash": "20e1a9e27432ea9dd71f844054f7ec1b", "title": "MetricsZoo Class for Metrics Management"}, "1589": {"path": "/applications/VideoTag/metrics/metrics_util.py:279-279", "hash": "6c93cc00e676d9d4918cba3f4676af88", "title": "Registering TSN Metric"}, "1590": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py", "hash": "496e07c367316dea02129f6c9866d303", "title": "Interpolated Average Precision Calculator"}, "1591": {"path": 
"/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:1-23", "hash": "d6445d1b440eec05e6c95f741aa8e864", "title": "Interpolated Average Precision Calculator"}, "1592": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:25-55", "hash": "584a49984d26cb597b8affe133ba9aed", "title": "Interpolated Average Precision Calculator"}, "1593": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:57-86", "hash": "db7db67821fcde5ce348b3a546ab813a", "title": "Average Precision Calculator Class"}, "1594": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:87-108", "hash": "083d2314411f46f8b54b8489dc9ba991", "title": "Average Precision Calculator"}, "1595": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:109-134", "hash": "eb327d3285328733190d44849895a266", "title": "Average Precision Calculator"}, "1596": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:136-166", "hash": "69cfef1223ce2584bc56f8c0d3544686", "title": "Non-Interpolated Average Precision Calculator"}, "1597": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:168-192", "hash": "0a57aeaf7b7d5e30122989cc0f264029", "title": "Average Precision Calculator"}, "1598": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:193-220", "hash": "80167967805e2ddf7cbe272f92a18ef8", "title": "Average Precision Calculator"}, "1599": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:221-256", "hash": "232aa8f24bed0209f05a264b0ff3eb00", "title": "Average Precision Calculator"}, "1600": {"path": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:257-274", "hash": "7025780b7319af9be3a1a3fa3f0040ca", "title": "Normalized Predictions: Min-Max Scaling"}, "1601": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py", "hash": "ba8fecad55bd602f5fed2c7eb6b22a1a", "title": "PaddleVideo Metrics for Model Evaluation"}, "1602": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:1-28", "hash": "5acbe7806f88f63d044b4f7a6bab6281", "title": "YouTube8M Evaluation Utilities"}, "1603": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:30-59", "hash": "6b0bf3c68eecf1535e4c6d1dfcfe836b", "title": "Precision-Recall Average Hit at One"}, "1604": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:60-87", "hash": "44025cb46595e955e6f842fafcbd0a56", "title": "Global Average Precision Calculation"}, "1605": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:88-109", "hash": "2802c7967bdeea49b57825bf73ea45ed", "title": "Top K Video Predictions Evaluation"}, "1606": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:110-135", "hash": "1d0a3f8e68c6a10a0b89bfe41ecc850b", "title": "Top-K Video Classification Evaluation"}, "1607": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:136-164", "hash": "530f7781187acecbcb8d51fbee7c2801", "title": "Evaluation Metrics Class"}, "1608": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:165-190", "hash": "e5e14eeb99f2ac623bb78d9393e68ea4", "title": "Batch Metrics Calculation Function"}, "1609": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:191-219", "hash": "fee4b195e8b589e49fa77f562ead1c84", "title": "Epoch Metrics Calculator"}, "1610": {"path": "/applications/VideoTag/metrics/youtube8m/eval_util.py:220-244", "hash": 
"69bb49c9818a1b13399d0b900f9ee770", "title": "YouTube8m Metrics Evaluator"}, "1611": {"path": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py", "hash": "da75a3e928616fe8679559be26ee25c4", "title": "YouTube-8m Mean Average Precision Calculator"}, "1612": {"path": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:1-27", "hash": "37a562c586626d3d311800356cb3175e", "title": "Mean Average Precision Calculator"}, "1613": {"path": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:28-58", "hash": "c5e7b7fb3c0e72923a079a7eb39388f0", "title": "Mean Average Precision Calculation in YouTube8M"}, "1614": {"path": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:59-79", "hash": "a988dc888b2e2e1d3f7717e8a1cfba90", "title": "Mean Average Precision Calculator"}, "1615": {"path": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:80-111", "hash": "9ecf27e1664f066e9b3e4426bdcfcc89", "title": "Mean Average Precision Calculator"}, "1616": {"path": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:112-113", "hash": "6fbc03385d05d32795afd35c633bb841", "title": "Mean Average Precision Calculator"}, "1617": {"path": "/applications/VideoTag/models/__init__.py", "hash": "bba86ce05faed28c97fcd1e7f3647922", "title": "Model Registry in VideoTag"}, "1618": {"path": "/applications/VideoTag/models/attention_lstm/__init__.py", "hash": "153ddf4aa28b2b5e631fed1495fe1b02", "title": "Import Attention LSTM Functions"}, "1619": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py", "hash": "72f6ccd6c840a696635b1e2b1518c933", "title": "Attention LSTM Video Tagging Model"}, "1620": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:1-31", "hash": "3af41a7eb0183285e1a9aed459039f11", "title": "Attention LSTM Model Definition"}, "1621": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:33-54", "hash": "62809f066c4595a96fec8145c234de94", "title": "Attention LSTM Model Configuration"}, "1622": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:55-76", "hash": "38efa3a77a164008585b523fff21b796", "title": "Initializing Feature and Label Inputs"}, "1623": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:77-103", "hash": "c840dae7528a57f8172ea50427409bf7", "title": "LSTM Attention Model with Multi-Input Features"}, "1624": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:104-125", "hash": "fa862df27d886106e74dde62a49183ec", "title": "Attention LSTM Model for Video Tagging"}, "1625": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:126-151", "hash": "7660a05c71aba9cf98b616c5d775f857", "title": "Attention LSTM Model with Learning Rate Decay"}, "1626": {"path": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:152-180", "hash": "bb184c883d27ce4a9e5e5a873a55358a", "title": "Attention LSTM Model Class Definition"}, "1627": {"path": "/applications/VideoTag/models/attention_lstm/lstm_attention.py", "hash": "a7c51ffe17fccab9521050af8171af0c", "title": "Bidirectional LSTM Attention Model"}, "1628": {"path": "/applications/VideoTag/models/attention_lstm/lstm_attention.py:1-31", "hash": "3ab8c58bfaf2b9cadeb0cf9f482d2768", "title": "LSTM Attention Model Code"}, "1629": {"path": "/applications/VideoTag/models/attention_lstm/lstm_attention.py:32-58", "hash": "30da813ec0b14b0f562fd5ade9b8c022", "title": "Dynamic 
LSTM for Video Tagging"}, "1630": {"path": "/applications/VideoTag/models/attention_lstm/lstm_attention.py:60-83", "hash": "9cc043ca6d5549e02aa66c63f63c08df", "title": "Dynamic LSTM Model for Sequence Classification"}, "1631": {"path": "/applications/VideoTag/models/model.py", "hash": "e0cf1296f183c6e89f858a6052031544", "title": "Python Module: PaddleVideo's VideoTag Model Handler"}, "1632": {"path": "/applications/VideoTag/models/model.py:1-36", "hash": "3d55d5ae5389b6c7671d5b82eeda74dd", "title": "VideoTag Model Initialization"}, "1633": {"path": "/applications/VideoTag/models/model.py:37-69", "hash": "46eaff64d9ce7d4b1efaaa9cf9fa2bb3", "title": "Custom Exceptions and Model Base Class"}, "1634": {"path": "/applications/VideoTag/models/model.py:70-105", "hash": "77777583e300f8964ec0c37f9414c96d", "title": "Subclassing Model Class for Implementation"}, "1635": {"path": "/applications/VideoTag/models/model.py:107-139", "hash": "5c6bf60427650610934b34b256998789", "title": "Model Utilities: Dataset, Weights, and Pretraining"}, "1636": {"path": "/applications/VideoTag/models/model.py:140-167", "hash": "4814b694e6c59b940bf4d37664e136b7", "title": "Weight Handling Functions"}, "1637": {"path": "/applications/VideoTag/models/model.py:168-192", "hash": "93f088549c54f8158b5075fbb2cbbabe", "title": "ModelZoo: Managing and Retrieving Models"}, "1638": {"path": "/applications/VideoTag/models/tsn/__init__.py", "hash": "675a7ffc6f26a8a3a23e4cc4d2ea9a7b", "title": "Effortless TSN Import"}, "1639": {"path": "/applications/VideoTag/models/tsn/tsn.py", "hash": "7f7dbe649ea5de50a104ccef37e6dcf0", "title": "TSN Model Initialization"}, "1640": {"path": "/applications/VideoTag/models/tsn/tsn.py:1-34", "hash": "52d099369c342dbd3f60dc48ed9b4e20", "title": "TSN Model Class"}, "1641": {"path": "/applications/VideoTag/models/tsn/tsn.py:35-52", "hash": "be028f21e52fb65e5d39da739b89812d", "title": "TSN Model Initialization"}, "1642": {"path": "/applications/VideoTag/models/tsn/tsn.py:53-75", "hash": "63fecae7fc1f6f65a5bc89ecb29a57ad", "title": "TSN Model Input Generation"}, "1643": {"path": "/applications/VideoTag/models/tsn/tsn.py:77-101", "hash": "b2b96091e9b949beeeac72c9c40a6e9b", "title": "TSN Model Configurable Parameters"}, "1644": {"path": "/applications/VideoTag/models/tsn/tsn.py:102-129", "hash": "15abafe1c274a4e70ac8d4acfe8e9ee3", "title": "Piecewise Learning Rate Decay Optimizer"}, "1645": {"path": "/applications/VideoTag/models/tsn/tsn.py:130-159", "hash": "8fafe7609e5b9b8d6f30aa40c5752f07", "title": "Multi-Mode Model with Pre-Trained Weights"}, "1646": {"path": "/applications/VideoTag/models/tsn/tsn.py:160-165", "hash": "cb0dbb4a92917da1b7db509483b8bdb5", "title": "Prune Pretrained Parameters"}, "1647": {"path": "/applications/VideoTag/models/tsn/tsn_res_model.py", "hash": "1d9f65120a105308487d525db7bcd164", "title": "TSN ResNet Model in PaddlePaddle"}, "1648": {"path": "/applications/VideoTag/models/tsn/tsn_res_model.py:1-34", "hash": "ccf2d0a7da4e638966905f2ea03f7f7f", "title": "TSN ResNet Model Definition"}, "1649": {"path": "/applications/VideoTag/models/tsn/tsn_res_model.py:35-63", "hash": "bf138cb37216212bf2413251c0f929b4", "title": "Convolutional Layer with Batch Normalization"}, "1650": {"path": "/applications/VideoTag/models/tsn/tsn_res_model.py:65-86", "hash": "545b1750bc75fbcc1cdf430f603361c9", "title": "Bottleneck Block and Shortcut Functions"}, "1651": {"path": "/applications/VideoTag/models/tsn/tsn_res_model.py:87-118", "hash": "9580c8929a831fab32f2ea1e6d8c4ba3", "title": "TSN ResNet Model: 
Conv-Batch Normalization Layers"}, "1652": {"path": "/applications/VideoTag/models/tsn/tsn_res_model.py:119-142", "hash": "45685a61796394beb6f48245f271f627", "title": "ResNet Model Implementation with PaddlePaddle"}, "1653": {"path": "/applications/VideoTag/models/tsn/tsn_res_model.py:143-161", "hash": "66ba4bf0d074ee31770369ca76c44c63", "title": "Adaptive Average Pooling and Softmax Output"}, "1654": {"path": "/applications/VideoTag/models/utils.py", "hash": "ae047952ffb6194fc72c19b2be407cc8", "title": "Comprehensive File Operations Utility"}, "1655": {"path": "/applications/VideoTag/models/utils.py:1-36", "hash": "2222542d668f40d548584ed8b70abdfd", "title": "Decompress and Download Utilities"}, "1656": {"path": "/applications/VideoTag/models/utils.py:39-47", "hash": "b5e1e8bcf5ce69e9b75ebf7564204806", "title": "AttrDict Class: Access Attributes Easily"}, "1657": {"path": "/applications/VideoTag/predict.py", "hash": "fed05e80e9fbbd65d35a4d2bc784a354", "title": "PaddleVideo: Predicting Video Tags with AI"}, "1658": {"path": "/applications/VideoTag/predict.py:1-37", "hash": "b2dea5f024eef0e22acc829492b2e94f", "title": "Import and Initialization Script"}, "1659": {"path": "/applications/VideoTag/predict.py:38-64", "hash": "bbb03a9dc905282a8effdab272b0bebd", "title": "Setting Up Logger and Parsing Arguments"}, "1660": {"path": "/applications/VideoTag/predict.py:65-87", "hash": "325cd19a56d74de2c5d4d82f502db728", "title": "Video Tag Prediction Python Script"}, "1661": {"path": "/applications/VideoTag/predict.py:88-115", "hash": "671df5265c32d6302245063c3c9389bb", "title": "Building PaddleVideo Inference Model"}, "1662": {"path": "/applications/VideoTag/predict.py:117-141", "hash": "f6f2e992709ed5b8787902ac93b9b02e", "title": "Video Tag Prediction Model"}, "1663": {"path": "/applications/VideoTag/predict.py:143-171", "hash": "243d1dceca3ad2b8a07366af05955b62", "title": "Average Processing Time Logger"}, "1664": {"path": "/applications/VideoTag/reader/__init__.py", "hash": "a5c3c5d8894c9f098d531f02c5fa7ffd", "title": "Alphabetical Reader Registration"}, "1665": {"path": "/applications/VideoTag/reader/feature_reader.py", "hash": "282474179a67cfdf47e10ea9938bf82e", "title": "DataReader: LSTM-based YouTube Dataset Processing"}, "1666": {"path": "/applications/VideoTag/reader/feature_reader.py:1-34", "hash": "4ae4802aa988c532c71803c722a14946", "title": "Youtube-8M Dataset LSTM Feature Reader"}, "1667": {"path": "/applications/VideoTag/reader/feature_reader.py:35-64", "hash": "d58874f442af6a35bb11212913448db7", "title": "Feature Reader Initialization"}, "1668": {"path": "/applications/VideoTag/reader/feature_reader.py:65-80", "hash": "e28f27b80de7c15bac356aad4b02ee2f", "title": "One-Hot Video Frame Labeling"}, "1669": {"path": "/applications/VideoTag/reader/kinetics_reader.py", "hash": "040d30ed18827899c23f3cec8ea96ad1", "title": "Efficient Kinetics Dataset Reader"}, "1670": {"path": "/applications/VideoTag/reader/kinetics_reader.py:1-41", "hash": "0a76d0da43479bcceb86d8db90bc37d6", "title": "PaddleVideo's Kinetics Reader: Frame Data and License"}, "1671": {"path": "/applications/VideoTag/reader/kinetics_reader.py:42-79", "hash": "b45b31d4b22e9e0382b3eb211e673aab", "title": "Kinetics Reader: MP4/PKL Dataset Access"}, "1672": {"path": "/applications/VideoTag/reader/kinetics_reader.py:80-98", "hash": "3ac2850577c49a77ee9fe719c79d09dd", "title": "Kinetics Reader Initialization"}, "1673": {"path": "/applications/VideoTag/reader/kinetics_reader.py:99-121", "hash": "e98747160930df3f3c48a6320164b010", 
"title": "Configure Video Reader: Batch Size, File List, Random Seed"}, "1674": {"path": "/applications/VideoTag/reader/kinetics_reader.py:122-151", "hash": "f6264196146b8de8417f5822e3c93fbd", "title": "MP4 Reader Function"}, "1675": {"path": "/applications/VideoTag/reader/kinetics_reader.py:152-176", "hash": "cf0423543750b610a6e99bb05c328d11", "title": "Frames and Labels: Kinetics Reader"}, "1676": {"path": "/applications/VideoTag/reader/kinetics_reader.py:177-204", "hash": "56ae223d228651c5e7fa0dd0f1399ff8", "title": "Video Frame Loader and Error Handler"}, "1677": {"path": "/applications/VideoTag/reader/kinetics_reader.py:205-233", "hash": "d8f8f6a1d69bc5ff1f5ffddeeabe2e46", "title": "Video Decoder Selector"}, "1678": {"path": "/applications/VideoTag/reader/kinetics_reader.py:234-266", "hash": "cfd277c89177fdb4898a69b82ca94617", "title": "Data Augmentation for Image Processing"}, "1679": {"path": "/applications/VideoTag/reader/kinetics_reader.py:269-305", "hash": "23356222c75b7bb6558f32a9b27bcd1b", "title": "Versatile Image and Video Processing Functions"}, "1680": {"path": "/applications/VideoTag/reader/kinetics_reader.py:306-340", "hash": "27eefd17dae145c7a75d5a502b8abf66", "title": "Frame Subset Selector"}, "1681": {"path": "/applications/VideoTag/reader/kinetics_reader.py:341-367", "hash": "3375abb31265e55ef7756349a4954889", "title": "Video Frame Duration Analyzer"}, "1682": {"path": "/applications/VideoTag/reader/reader_utils.py", "hash": "96e5a241e0b2b1a901f7e896b8034edb", "title": "Reader Zoo Class and Utilities"}, "1683": {"path": "/applications/VideoTag/reader/reader_utils.py:1-31", "hash": "41ac5d6c5c797ecebf8e01d9f5b61ac7", "title": "Importing Libraries and Defining Reader Exceptions"}, "1684": {"path": "/applications/VideoTag/reader/reader_utils.py:32-70", "hash": "2eebf43092b9f925325d01f7c2763b0c", "title": "Video Reader Classes and Registry"}, "1685": {"path": "/applications/VideoTag/reader/reader_utils.py:71-80", "hash": "fa54334cfe369e581c2a00baf1735bde", "title": "Register and Retrieve Readers Class"}, "1686": {"path": "/applications/VideoTag/train.py", "hash": "a9cbad99274e254ae5335c74b1a5a0d2", "title": "VideoTag: CUDA-Powered Model Training and Saving"}, "1687": {"path": "/applications/VideoTag/train.py:1-32", "hash": "f3580ad99140f470e6d40a266b7b8321", "title": "VideoTag: Setting Up and Importing"}, "1688": {"path": "/applications/VideoTag/train.py:33-60", "hash": "b6377d4f28796dff69c844a50ab7d2a8", "title": "Argparse Configuration and Default Values"}, "1689": {"path": "/applications/VideoTag/train.py:61-82", "hash": "1b00ac68ffe7b2e8cb4613499150f8b6", "title": "Command Line Arguments for Training Program"}, "1690": {"path": "/applications/VideoTag/train.py:83-109", "hash": "878d22aa908ec2eab4413896510165e6", "title": "Command Line Argument Parsing"}, "1691": {"path": "/applications/VideoTag/train.py:110-134", "hash": "f53058fed436cd6cba43a45843e668c6", "title": "Training Model Initialization"}, "1692": {"path": "/applications/VideoTag/train.py:136-161", "hash": "44dd9e120328d834d3fa2045e37e9997", "title": "GPU-Aware Program Compilation"}, "1693": {"path": "/applications/VideoTag/train.py:162-181", "hash": "296f6d1a2e5339caf6a4edc4c9db79d3", "title": "Batch Size and Data Loading Setup"}, "1694": {"path": "/applications/VideoTag/train.py:182-205", "hash": "6d79f73551f92d520da8d2f573e79fce", "title": "Video Tagging Model Training with PaddlePaddle"}, "1695": {"path": "/applications/VideoTag/train.py:206-212", "hash": "c9be8dd4ca6eefafb9d207cb0e1dc747", "title": 
"Directory Check and Training Initiation"}, "1696": {"path": "/applications/VideoTag/tsn_extractor.py", "hash": "43f2f750921ad828a47d3314e2ec6cb6", "title": "Video Inference and Feature Extraction"}, "1697": {"path": "/applications/VideoTag/tsn_extractor.py:1-37", "hash": "1020a24b8f774df53c5b092128ff4c47", "title": "Python Script for PaddlePaddle Model Training"}, "1698": {"path": "/applications/VideoTag/tsn_extractor.py:38-66", "hash": "81c0250459c76c0d68cf6b95313da6c6", "title": "Command-Line Arguments for Model Training"}, "1699": {"path": "/applications/VideoTag/tsn_extractor.py:67-93", "hash": "d1f84b01b39a8ae3b38c76c64de8998d", "title": "Command Line Arguments and Parsing in TsnExtractor"}, "1700": {"path": "/applications/VideoTag/tsn_extractor.py:94-118", "hash": "f5293080ebd110612a69ab51cd63c206", "title": "Infer Model Initialization"}, "1701": {"path": "/applications/VideoTag/tsn_extractor.py:120-144", "hash": "150153126a0d38708fd2f9c78ca841e2", "title": "Model Weights Downloader and Inferencer"}, "1702": {"path": "/applications/VideoTag/tsn_extractor.py:145-158", "hash": "e94116cda27666fc0276a5fc39a504f2", "title": "Extract and Log Features for Inference"}, "1703": {"path": "/applications/VideoTag/utils/config_utils.py", "hash": "f18131592ad31cf79da3fec92fa93f71", "title": "Config Handler for VideoTag"}, "1704": {"path": "/applications/VideoTag/utils/config_utils.py:1-37", "hash": "809882757a8111e3274f41bea256e2dc", "title": "VideoTag Config Utils"}, "1705": {"path": "/applications/VideoTag/utils/config_utils.py:38-73", "hash": "59092fe8103e520fbbe4af0bb3378554", "title": "YAML Config Processing and Merging Utils"}, "1706": {"path": "/applications/VideoTag/utils/config_utils.py:74-75", "hash": "a155aaed7cdfaea866ff16b33515c82e", "title": "Config Log Separation"}, "1707": {"path": "/applications/VideoTag/utils/train_utils.py", "hash": "51c390e23dc5ad31a4e7399967a0821b", "title": "Train Utils with PaddlePaddle"}, "1708": {"path": "/applications/VideoTag/utils/train_utils.py:1-32", "hash": "cf0031b4ca4cfd4f1d7eb07f424eb91b", "title": "Logging Learning Rate in PaddlePaddle"}, "1709": {"path": "/applications/VideoTag/utils/train_utils.py:33-57", "hash": "f749c60d8bc719595b4e5515c2749bae", "title": "Retrieve and Print Learning Rate\n(or)\nLearning Rate Retrieval and Display"}, "1710": {"path": "/applications/VideoTag/utils/train_utils.py:58-80", "hash": "bd7952ddb63fc1f4351ce26fc6188a04", "title": "Train Model with Dataloader Function"}, "1711": {"path": "/applications/VideoTag/utils/train_utils.py:81-109", "hash": "88bd0ec991a75f19ecb58f751cf72b5f", "title": "Epoch Loop Initialization"}, "1712": {"path": "/applications/VideoTag/utils/train_utils.py:110-135", "hash": "d0fa34e7a631af133be1bfcf7bcd34de", "title": "Training Progress Tracker"}, "1713": {"path": "/applications/VideoTag/utils/train_utils.py:136-159", "hash": "fe8b40c470db9c253544b4afa1324a53", "title": "Model Saving and Testing Procedure"}, "1714": {"path": "/applications/VideoTag/utils/train_utils.py:161-161", "hash": "ef56ee7015191efc4d76aa94df585e26", "title": "Incomplete Code Snippet"}, "1715": {"path": "/applications/VideoTag/utils/utility.py", "hash": "e1841f8d7ee769a6fe725b41f997feec", "title": "Python Utility: PaddlePaddle Checker"}, "1716": {"path": "/applications/VideoTag/utils/utility.py:1-37", "hash": "9b483364c49237cdd68050cb63fd3648", "title": "Python Utility Script"}, "1717": {"path": "/applications/VideoTag/utils/utility.py:39-66", "hash": "af682fde4c8fe4b7c488ae461bd5054b", "title": "Compatibility and GPU 
Handling for PaddlePaddle"}, "1718": {"path": "/applications/VideoTag/utils/utility.py:67-70", "hash": "d07b5334ddccd6e5ec4d260a7924441f", "title": "Check Version Installation"}, "1719": {"path": "/applications/VideoTag/videotag_test.py", "hash": "5d0905c00faf919931ca229cf3fcf8ef", "title": "Efficient Video Tagging with PaddlePaddle"}, "1720": {"path": "/applications/VideoTag/videotag_test.py:1-34", "hash": "1f06306e3b7348bd1d0e6188fde7647b", "title": "VideoTag Test Log Config"}, "1721": {"path": "/applications/VideoTag/videotag_test.py:35-62", "hash": "b5162d95f820af6597d9be0ea830b69a", "title": "Command-Line Argument Parser for VideoTag"}, "1722": {"path": "/applications/VideoTag/videotag_test.py:63-86", "hash": "be79fd1199d58f7a734cbc6da171c1c8", "title": "Command-Line Arguments for Video Tagging"}, "1723": {"path": "/applications/VideoTag/videotag_test.py:87-111", "hash": "c3c52696214583c8bfa9aa7b6a4e1855", "title": "Video Classification Model with PaddlePaddle"}, "1724": {"path": "/applications/VideoTag/videotag_test.py:112-134", "hash": "da78cf0634b8cafcc0c8be976c8fbb37", "title": "Video Tagging Inference Model"}, "1725": {"path": "/applications/VideoTag/videotag_test.py:135-154", "hash": "28ad9acce37edbfc8c2586798032d48d", "title": "Extractor Setup and Timing in PaddleVideo"}, "1726": {"path": "/applications/VideoTag/videotag_test.py:156-177", "hash": "2238ce35f8201d3ce6788b155bb2b054", "title": "Configure and Prepare Input Data"}, "1727": {"path": "/applications/VideoTag/videotag_test.py:179-199", "hash": "0774cbe8eb1718e3e4c5f6de529159c6", "title": "Efficient Model Predictor Setup"}, "1728": {"path": "/applications/VideoTag/videotag_test.py:201-221", "hash": "6106c27cfcdc5f18d1d1d14f3e63dd0b", "title": "DataFeeder Initialization and Model Execution"}, "1729": {"path": "/applications/VideoTag/videotag_test.py:222-238", "hash": "7f81fcba1ba488e91a8ef463d73a7011", "title": "Inference Time Logger"}, "1730": {"path": "/benchmark/TimeSformer/README.md", "hash": "09c748fc99382f675ca3a370325409f0", "title": "TimeSformer Benchmarking Guide"}, "1731": {"path": "/benchmark/TimeSformer/run_all.sh", "hash": "0ff8a76d640c8e8e3dfa284b13ee0e47", "title": "TimeSformer Benchmarking Script"}, "1732": {"path": "/benchmark/TimeSformer/run_all.sh:1-20", "hash": "d7f5a67d615ec9b268eb4499c02c5200", "title": "TimeSformer Model Benchmark Setup"}, "1733": {"path": "/benchmark/TimeSformer/run_all.sh:20-47", "hash": "70717b1ee3155134b62224605c4ecfc6", "title": "TimeSformer Dataset Prep & Benchmark"}, "1734": {"path": "/benchmark/TimeSformer/run_all.sh:47-57", "hash": "ce8c2cf92873d887c4f02570e74e1105", "title": "Multi-GPU Performance Testing for TimeSformer"}, "1735": {"path": "/benchmark/TimeSformer/run_benchmark.sh", "hash": "658adb2db8269996dd9172b6cc6a1e07", "title": "TimeSformer Benchmarking"}, "1736": {"path": "/benchmark/TimeSformer/run_benchmark.sh:1-28", "hash": "1a7a88f78f71acacca00026edbd524fd", "title": "TimeSformer Benchmark Script"}, "1737": {"path": "/benchmark/TimeSformer/run_benchmark.sh:29-48", "hash": "418fef8fc6337b4b4e167535a3783d8a", "title": "Run TimeSformer Benchmark"}, "1738": {"path": "/benchmark/TimeSformer/run_benchmark.sh:49-77", "hash": "046fc124f07bad83398d22fe057c56ab", "title": "Run TimeSformer Benchmark"}, "1739": {"path": "/data/50salads/prepare_asrf_data.py", "hash": "858fe004037c6c05c0a9b035674c1724", "title": "Prepare ASRF Data for 50Salads"}, "1740": {"path": "/data/50salads/prepare_asrf_data.py:1-42", "hash": "211d25ad4089f60e5ca42fe730726b89", "title": "Dataset Class 
ID Mapping and Argument Parsing"}, "1741": {"path": "/data/50salads/prepare_asrf_data.py:43-74", "hash": "dbd2afa713d0ad31ada5cc8e0afab845", "title": "Setup Dataset Directory and Index Classes"}, "1742": {"path": "/data/50salads/prepare_asrf_data.py:76-106", "hash": "b644d7de14b70a1b0ffaf37911c56f37", "title": "Preparing ASRF Data for Salad Classification"}, "1743": {"path": "/data/50salads/prepare_asrf_data.py:107-113", "hash": "13ac1065185180c4257d8b44c6383342", "title": "Checks Direct Execution"}, "1744": {"path": "/data/50salads/transform_segmentation_label.py", "hash": "1afb823e1d8c8008c6d1ba77ff711c64", "title": "Video Data Labeling Tool"}, "1745": {"path": "/data/50salads/transform_segmentation_label.py:1-34", "hash": "8686527bbfd5cf81201e68dd70febeea", "title": "Label Conversion Tool"}, "1746": {"path": "/data/50salads/transform_segmentation_label.py:35-55", "hash": "6a2b7e0c6beb8c6382c271b0e02f2186", "title": "Action Detection and Labeling in Transform Segmentation"}, "1747": {"path": "/data/50salads/transform_segmentation_label.py:56-90", "hash": "43ff121cdfebbeca1bbde7fc0230529b", "title": "Video Segmentation Label Conversion"}, "1748": {"path": "/data/50salads/transform_segmentation_label.py:91-119", "hash": "6d7b3f793eb276934668e75b6765a5ef", "title": "Generate Action Labels from Segmentation"}, "1749": {"path": "/data/50salads/transform_segmentation_label.py:120-147", "hash": "1ff5e7a223b266c75ef39cb14375a257", "title": "Segmentation Label Writer"}, "1750": {"path": "/data/50salads/transform_segmentation_label.py:148-173", "hash": "029bfed3e9fd5e2c99e56dd7e57bbe09", "title": "Label File Processing and Conversion"}, "1751": {"path": "/data/50salads/transform_segmentation_label.py:174-195", "hash": "4e874d280c40ff77154c71ca1ad67688", "title": "Command Line Arguments Parser for Label Conversion"}, "1752": {"path": "/data/ntu-rgb-d/download_dataset.sh", "hash": "02bbc12719dccf7f2788d9f94f26f010", "title": "Download and Extract Skeleton Data"}, "1753": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py", "hash": "f4f1cff114e7953d8e6268a0b75adbb9", "title": "NTU Dataset Data Cleaning & Denoising"}, "1754": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:1-38", "hash": "35802a95df545a35575dfe8def193375", "title": "Setting Up Directories and Loggers"}, "1755": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:39-63", "hash": "8be1a38e8f458c0d4c76fed48816ac3d", "title": "Organized Logging in NTU RGB-D Data Processing"}, "1756": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:64-86", "hash": "acfcc5a88f559c502ee1a4f32bffc10f", "title": "Denoising Skeleton Frames: Multiple Loggers"}, "1757": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:87-116", "hash": "f959492281097e0df2ea12c1129463a7", "title": "Threshold-Based Body Filtration"}, "1758": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:117-147", "hash": "72b21756b65b660ef4a1cf04585f1f5e", "title": "Denoising Bodies by Spread Threshold"}, "1759": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:148-172", "hash": "4599758b8e937a1d61073beb01cdc2eb", "title": "Noisy Frame Filter"}, "1760": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:173-198", "hash": "25bfe119077b68d6d821a5e02df8bd94", "title": "Denoising Body Motion Data"}, "1761": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:200-225", "hash": "719fb02663ab342587841e62a55df1f0", "title": "Denoising Bodies Data by Frame Length and Spread"}, "1762": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:226-252", "hash": "8a11fc6e287cd6363657381ec6c0c082", 
"title": "Denoising NTU RGB-D Data with Motion Integration"}, "1763": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:253-280", "hash": "f7789870d1e0ae3911ac85c7dd2511e5", "title": "Extract Joints and Colors from Body Data"}, "1764": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:281-303", "hash": "78f1fe80eabf46dd6af653af67ba74b1", "title": "Missing Frame Detection and Update"}, "1765": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:305-329", "hash": "bdc028d3736c1a6f931f445cef733a5d", "title": "Extracting Bodies Data and Points"}, "1766": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:331-358", "hash": "e95ea0799ada4e15932a659d38f7f09a", "title": "Denoising and Extracting Data from NTU RGB-D Dataset"}, "1767": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:359-377", "hash": "1047522fb284320576feffbb5a1cb2ed", "title": "Extracting Actor Data from NTU-RGB-D"}, "1768": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:378-403", "hash": "fe6f3897656942994104591f0355e886", "title": "Extracting and Denoising Skeleton Data"}, "1769": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:405-419", "hash": "8df0c3b1f8f9fd5bb4ad6bfbf88f3b32", "title": "Raw Skeleton Data Processing"}, "1770": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:420-445", "hash": "2cb1a300c69b5b5436312ed191cd2a1a", "title": "Raw Skeleton Sequence Data Processing"}, "1771": {"path": "/data/ntu-rgb-d/get_raw_denoised_data.py:446-471", "hash": "a9abe992f4ca13ed85368258bbb7746a", "title": "Data Extraction and Analysis Tool"}, "1772": {"path": "/data/ntu-rgb-d/get_raw_skes_data.py", "hash": "ac41b02960935a8640ed390e4c1cdbf8", "title": "NTU Skeleton Data Extractor"}, "1773": {"path": "/data/ntu-rgb-d/get_raw_skes_data.py:1-28", "hash": "e31925071e166d8957f0f6e86cc7129d", "title": "Extracting Skeleton Data"}, "1774": {"path": "/data/ntu-rgb-d/get_raw_skes_data.py:29-58", "hash": "c50d647c672a118d8736df0bcbf79d10", "title": "Joint Counting from .skeleton Files"}, "1775": {"path": "/data/ntu-rgb-d/get_raw_skes_data.py:59-76", "hash": "be680cdab0456fc38f93ea9177707ec6", "title": "Extract and Update Body Data"}, "1776": {"path": "/data/ntu-rgb-d/get_raw_skes_data.py:77-100", "hash": "dc62c3304493edcb0c14fa39324b80c6", "title": "NTU Skeleton Data Retriever"}, "1777": {"path": "/data/ntu-rgb-d/get_raw_skes_data.py:101-130", "hash": "50cb1636548940511a774d1ea0f23d51", "title": "Combine Raw Skeleton Data Files"}, "1778": {"path": "/data/ntu-rgb-d/get_raw_skes_data.py:132-157", "hash": "c5935bf3c7d742aac76b069e595301f3", "title": "NTU Dataset Filter & Save"}, "1779": {"path": "/data/ntu-rgb-d/seq_transformation.py", "hash": "71904de86abccb870e504a82dc312bcb", "title": "NTU-RGB-D Dataset Transformation"}, "1780": {"path": "/data/ntu-rgb-d/seq_transformation.py:1-34", "hash": "14167af7fe38301eb88bb7a1ee1b0683", "title": "Directory Check and Frame Filtering"}, "1781": {"path": "/data/ntu-rgb-d/seq_transformation.py:35-63", "hash": "e729c22af21a80e98d9a2774ef64eea1", "title": "Seq Transformation: Filtering and Calculating Origin Points"}, "1782": {"path": "/data/ntu-rgb-d/seq_transformation.py:64-89", "hash": "d0f0661adda269586f2cd5a50d1922d3", "title": "Sequence Transformation for NTU RGB+D Dataset"}, "1783": {"path": "/data/ntu-rgb-d/seq_transformation.py:90-118", "hash": "dd230da585f67f213051bcdf527d1082", "title": "Skeleton Alignment and Frame Count Update"}, "1784": {"path": "/data/ntu-rgb-d/seq_transformation.py:119-150", "hash": "7852436edf73cd944c38f716bc86c5b6", "title": "Sequence Transformation and Encoding 
Functions"}, "1785": {"path": "/data/ntu-rgb-d/seq_transformation.py:151-176", "hash": "6814ac5e5952f161af23933e67fd799e", "title": "Train-Validation Split Function"}, "1786": {"path": "/data/ntu-rgb-d/seq_transformation.py:178-204", "hash": "26f1e72341577116d838592cafbda7a0", "title": "Evaluating and Initializing Data Paths"}, "1787": {"path": "/data/ntu-rgb-d/seq_transformation.py:205-235", "hash": "22e575a24cbdbd651dbbeabe4cfbc956", "title": "Get Indices for Cross-Subject or View Evaluation"}, "1788": {"path": "/data/ntu-rgb-d/seq_transformation.py:236-263", "hash": "670fae7515a8f6551504be6db29f340d", "title": "NTU Load and Preprocessing"}, "1789": {"path": "/data/ntu-rgb-d/seq_transformation.py:264-266", "hash": "dc4bc3a66946deaf767d5a82b2dfd1e3", "title": "Split and Process NTU Dataset"}, "1790": {"path": "/deploy/cpp_infer/external-cmake/auto-log.cmake", "hash": "b1c6a7df6492e87b0b4e8ac556b6d773", "title": "Including Git External Project with CMake"}, "1791": {"path": "/deploy/cpp_infer/include/postprocess_op.h", "hash": "ebb2e6da0b138a6ff956d64bdd527f8c", "title": "Softmax Inplace Run for PaddleVideo"}, "1792": {"path": "/deploy/cpp_infer/include/postprocess_op.h:1-39", "hash": "9f59c55d646a8a3d3e30628894bd3e70", "title": "Softmax In-Place Transformation"}, "1793": {"path": "/deploy/cpp_infer/include/postprocess_op.h:40-43", "hash": "a75aa6b5f9cf714f47d9ff360fb27c88", "title": "Postprocess Vector Float Iterators"}, "1794": {"path": "/deploy/cpp_infer/include/preprocess_op.h", "hash": "12114a6d00b2fbe27092d939fc305bbb", "title": "Image Preprocessing Operations"}, "1795": {"path": "/deploy/cpp_infer/include/preprocess_op.h:1-39", "hash": "2b80d8d5f8f5c1c6f87bb8142da1cbda", "title": "Normalize Class in PaddleVideo Library"}, "1796": {"path": "/deploy/cpp_infer/include/preprocess_op.h:40-74", "hash": "7f8b8bb1b758f880e71eec4c6a460ff5", "title": "Versatile Image Preprocessing for PaddleVideo"}, "1797": {"path": "/deploy/cpp_infer/include/utility.h", "hash": "90b7c5e0a8d6ece9ae7f31a843ec1028", "title": "Utility Functions for PaddleVideo"}, "1798": {"path": "/deploy/cpp_infer/include/utility.h:1-40", "hash": "5f003242bf21c7c35f2a0101e8c3ad36", "title": "Utility Class for PaddleVideo"}, "1799": {"path": "/deploy/cpp_infer/include/utility.h:42-54", "hash": "a62d54da182522c4a569360f8d9728fa", "title": "Utility Functions for PaddleVideo"}, "1800": {"path": "/deploy/cpp_infer/include/video_rec.h", "hash": "b59f2ef89e2101d745e04d76bc089826", "title": "VideoRecognizer: OpenCV-PaddlePaddle Integration"}, "1801": {"path": "/deploy/cpp_infer/include/video_rec.h:1-34", "hash": "2205daaa30173c141f4bdeaebb2226a7", "title": "OpenCV & PaddlePaddle Licensing and Video Recording API"}, "1802": {"path": "/deploy/cpp_infer/include/video_rec.h:36-57", "hash": "25695ecb26e64ce2c6f48ee95bb4d5e3", "title": "VideoRecognizer Object Creation and Configuration"}, "1803": {"path": "/deploy/cpp_infer/include/video_rec.h:58-86", "hash": "27caf78e53d3618bcf072381484c5cb8", "title": "Video Recognition Class Initialization"}, "1804": {"path": "/deploy/cpp_infer/include/video_rec.h:87-105", "hash": "a7d381bebe4fcde73e08bfc1fdb65aed", "title": "VideoRecognizer Initialization Code"}, "1805": {"path": "/deploy/cpp_infer/readme.md", "hash": "a21f98e099f7d9e8964cd0cdab891848", "title": "C++ PaddleVideo Deployment Error"}, "1806": {"path": "/deploy/cpp_infer/readme.md:1-45", "hash": "c797a4970d73cc44a73568b20c4d0dc3", "title": "Deploying PaddleVideo Models with C++"}, "1807": {"path": "/deploy/cpp_infer/readme.md:46-91", "hash": 
"b731c9ddb6eea858f127eb117071122e", "title": "Compiling OpenCV for C++ Video Inference"}, "1808": {"path": "/deploy/cpp_infer/readme.md:93-125", "hash": "82a1274485ed53d882cffd467b3b5006", "title": "Two Ways to Obtain Paddle Prediction Library"}, "1809": {"path": "/deploy/cpp_infer/readme.md:126-170", "hash": "41e98b65702408c4074817ba7f7bc27a", "title": "Compiling Paddle Inference API Library: Steps and Build Parameters"}, "1810": {"path": "/deploy/cpp_infer/readme.md:172-213", "hash": "72514677bbb2d8729cb8c623e4d2e9e0", "title": "Compiling PaddleVideo C++ Demo for Inference"}, "1811": {"path": "/deploy/cpp_infer/readme.md:214-259", "hash": "acb68f2225c283e4607831c1f3e1bfdd", "title": "C++ Prediction Demo Instructions"}, "1812": {"path": "/deploy/cpp_infer/readme.md:260-273", "hash": "60323553dc9fac916587c47c806d66c5", "title": "Video Recognition Model Execution Parameters"}, "1813": {"path": "/deploy/cpp_infer/readme.md:274-289", "hash": "8d21217a62b877963331a053fc33107f", "title": "Model Configuration and Detection Demo"}, "1814": {"path": "/deploy/cpp_infer/readme.md:290-304", "hash": "6027319fefcb287982193325ddba76ba", "title": "Optimizing Inference Engine Configuration"}, "1815": {"path": "/deploy/cpp_infer/readme.md:304-324", "hash": "da3ef92d009bd77ba7f32ef5ced83eb0", "title": "C++ Inference Time & Libcudnn Issue"}, "1816": {"path": "/deploy/cpp_infer/readme_en.md", "hash": "82920315b8bb9e67fe472c6684fac603", "title": "PaddleVideo Linux Deployment Guide"}, "1817": {"path": "/deploy/cpp_infer/readme_en.md:1-20", "hash": "4ad53256f941515db69fc528e5bd9c3d", "title": "C++ Deployment Guide for PaddleVideo"}, "1818": {"path": "/deploy/cpp_infer/readme_en.md:20-37", "hash": "53f9fddac15b57958b64af61897f4582", "title": "Linux Video Reading Setup"}, "1819": {"path": "/deploy/cpp_infer/readme_en.md:39-76", "hash": "082fb201f6939a29eec8bb20dc63543a", "title": "OpenCV Linux Compilation Guide"}, "1820": {"path": "/deploy/cpp_infer/readme_en.md:77-110", "hash": "040e6e20a4152b10539efd785ba88fde", "title": "OpenCV Library Setup and C++ Video Inference"}, "1821": {"path": "/deploy/cpp_infer/readme_en.md:110-123", "hash": "b93f83922fc91aeab1a07ccb85c9250d", "title": "Downloading and Unzipping Paddle Inference Library"}, "1822": {"path": "/deploy/cpp_infer/readme_en.md:123-150", "hash": "0d3545bc2905f69bcf611fb64b3948d2", "title": "Install Paddle Prediction Library: C++ Edition"}, "1823": {"path": "/deploy/cpp_infer/readme_en.md:150-173", "hash": "22feb5333a24d8e1e3ad5a793593acc8", "title": "Library Generation and Version Information"}, "1824": {"path": "/deploy/cpp_infer/readme_en.md:174-203", "hash": "56c452fd8fa91ef39bbdbc46f2599b51", "title": "Compiling PaddleVideo C++ Demo Instructions"}, "1825": {"path": "/deploy/cpp_infer/readme_en.md:204-231", "hash": "ed7c973130858310f6122a5fa5dfe8e7", "title": "TensorRT Deployment with PaddleVideo"}, "1826": {"path": "/deploy/cpp_infer/readme_en.md:232-258", "hash": "2e8c1bc0f5b6048e0ebcb3339954f822", "title": "Customize PaddleVideo Inference Parameters"}, "1827": {"path": "/deploy/cpp_infer/readme_en.md:259-271", "hash": "f24e1cae8586235fa1da3f354688a275", "title": "Video Recognition Model Configuration Parameters"}, "1828": {"path": "/deploy/cpp_infer/readme_en.md:273-289", "hash": "43623dfa15189f9c29df318ddf033987", "title": "TensorRT CPP Inference Code Snippet"}, "1829": {"path": "/deploy/cpp_infer/readme_en.md:290-308", "hash": "b9b7a62cf6cd15763316a1a323559ac7", "title": "Missing CUDA Library: Inference Details"}, "1830": {"path": 
"/deploy/cpp_infer/readme_en.md:309-316", "hash": "f2d41359132830a6289d85a9ef4bc1cf", "title": "CMake: Missing libcudnn.so Error"}, "1831": {"path": "/deploy/cpp_infer/src/main.cpp", "hash": "addfd1d04728c59846cf3f1d7f44600c", "title": "OpenCV Video Recognition in C++"}, "1832": {"path": "/deploy/cpp_infer/src/main.cpp:1-35", "hash": "8469c1d47f5afc720a126ea0a12e8868", "title": "OpenCV License and Headers"}, "1833": {"path": "/deploy/cpp_infer/src/main.cpp:37-54", "hash": "96d6d513b81a61d3f41eb133796aa136", "title": "Inference Parameters"}, "1834": {"path": "/deploy/cpp_infer/src/main.cpp:55-85", "hash": "f66914ecebec904c84906bf0d2a1a1a1", "title": "Batch Video Processing with Video Recognition"}, "1835": {"path": "/deploy/cpp_infer/src/main.cpp:86-109", "hash": "234f2282da3675e8688c789b4cb2b99e", "title": "Batch Video Frame Recognition with PaddleVideo"}, "1836": {"path": "/deploy/cpp_infer/src/main.cpp:110-138", "hash": "357a39f1d8d9ffbbf991685a3ea169a5", "title": "Video Inference Parameter Check"}, "1837": {"path": "/deploy/cpp_infer/src/main.cpp:139-170", "hash": "8a60fdb9a906b3147f6dfd167a39706c", "title": "Validate and Launch Recording Mode"}, "1838": {"path": "/deploy/cpp_infer/src/main.cpp:171-173", "hash": "45fdeb3c6bc2e95daf9f652e4085945e", "title": "Program Termination and Return Statement"}, "1839": {"path": "/deploy/cpp_infer/src/postprocess_op.cpp", "hash": "3fa0cbb18b9cd1e1dda68f75cf591f93", "title": "Softmax In-Place Normalization"}, "1840": {"path": "/deploy/cpp_infer/src/postprocess_op.cpp:1-26", "hash": "e02a8661b4c6119a6e676d6e7d9daa1d", "title": "Softmax In-place Implementation in PaddleVideo"}, "1841": {"path": "/deploy/cpp_infer/src/postprocess_op.cpp:27-50", "hash": "e8abf5fb64e1fd430a5c865adfa583d0", "title": "Softmax Implementation"}, "1842": {"path": "/deploy/cpp_infer/src/preprocess_op.cpp", "hash": "6c0649972821c6dc91fe14c97c4cc35b", "title": "Image Preprocessing for PaddleVideo Inference"}, "1843": {"path": "/deploy/cpp_infer/src/preprocess_op.cpp:1-36", "hash": "e654502d7d2a8e751bae0354ebfdfa58", "title": "Permute Class for OpenCV and Paddle API"}, "1844": {"path": "/deploy/cpp_infer/src/preprocess_op.cpp:37-66", "hash": "0e4e4c521ceaad1575e4202d19924e2a", "title": "Image Channel Preprocessing"}, "1845": {"path": "/deploy/cpp_infer/src/preprocess_op.cpp:67-104", "hash": "714894fc7a91f74e8c091b5db0fab33f", "title": "Resizable and Croppable Image Processing"}, "1846": {"path": "/deploy/cpp_infer/src/preprocess_op.cpp:105-132", "hash": "01c3b3dd6b76782b0b4691be2a57b3c3", "title": "Ten-Crop Image Preprocessing"}, "1847": {"path": "/deploy/cpp_infer/src/preprocess_op.cpp:133-135", "hash": "3a4d39478905ed4b7a90721fa22456e0", "title": "Pre-processing for PaddleVideo"}, "1848": {"path": "/deploy/cpp_infer/src/utility.cpp", "hash": "6a43516cc449447a8e67ec130c8e8b7c", "title": "Utility Functions in PaddleVideo Library"}, "1849": {"path": "/deploy/cpp_infer/src/utility.cpp:1-33", "hash": "c4ffc749704d52caec5c256b88d773a9", "title": "Utility Function in PaddleVideo Library"}, "1850": {"path": "/deploy/cpp_infer/src/utility.cpp:34-67", "hash": "a8ae6f84a6f8d46a61c73bff67528fc7", "title": "Reads Label File and Retrieves Directory Contents"}, "1851": {"path": "/deploy/cpp_infer/src/utility.cpp:68-93", "hash": "4befb993da6e4a402913653ee74cb4c4", "title": "Directory File Path Vectorization and Image Bounding Box"}, "1852": {"path": "/deploy/cpp_infer/src/utility.cpp:94-118", "hash": "e85b751693b9fa9c49a8872d3da9435f", "title": "Crop and Standardize Image Points"}, "1853": 
{"path": "/deploy/cpp_infer/src/utility.cpp:119-146", "hash": "f2a74201471c054b68d66c0c9b63e5b6", "title": "Perspective Image Transformation"}, "1854": {"path": "/deploy/cpp_infer/src/utility.cpp:147-181", "hash": "bd2e9b5665eafacec80a01a06d084fc0", "title": "Video Frame Sampler"}, "1855": {"path": "/deploy/cpp_infer/src/utility.cpp:182-192", "hash": "59ba5f8107851b2ff0a27a947476d1d9", "title": "Video Frame Sampling"}, "1856": {"path": "/deploy/cpp_infer/src/video_rec.cpp", "hash": "836250b8e4de4d1e82181e27301b9b36", "title": "AI-powered Video Processing"}, "1857": {"path": "/deploy/cpp_infer/src/video_rec.cpp:1-26", "hash": "88d3e44ad259d174e9212c5767b49196", "title": "Batch Size Operations"}, "1858": {"path": "/deploy/cpp_infer/src/video_rec.cpp:27-55", "hash": "59ccde7d84c616f8fdff890bb285a6bd", "title": "Video Frame Preprocessing for Inference"}, "1859": {"path": "/deploy/cpp_infer/src/video_rec.cpp:56-80", "hash": "97088254cd9b7bd9ddfe80e2640bc8c2", "title": "Video Frame Processing and Conversion"}, "1860": {"path": "/deploy/cpp_infer/src/video_rec.cpp:81-105", "hash": "25d2ab1e7c02729a7d29846bb7f158e8", "title": "Batch Segment Data Preprocessing"}, "1861": {"path": "/deploy/cpp_infer/src/video_rec.cpp:106-129", "hash": "a90a129a8a1d76831d45f8ad5d854bba", "title": "Image Preprocessing for Video Frames"}, "1862": {"path": "/deploy/cpp_infer/src/video_rec.cpp:130-152", "hash": "a867e2a034804cee42373d85bff596fb", "title": "Vector Initialization and Inference"}, "1863": {"path": "/deploy/cpp_infer/src/video_rec.cpp:153-175", "hash": "3f41c4909a877d9296865429c4f258cf", "title": "Softmax-Based AI Inference"}, "1864": {"path": "/deploy/cpp_infer/src/video_rec.cpp:177-198", "hash": "8dc2b9ff69baaee23ecfa2297a8c8dc3", "title": "Post-processing Object Detection Results"}, "1865": {"path": "/deploy/cpp_infer/src/video_rec.cpp:199-223", "hash": "58a029bb7828216d5d9edcc277dd94bd", "title": "Paddle Video Recognizer Initialization"}, "1866": {"path": "/deploy/cpp_infer/src/video_rec.cpp:225-247", "hash": "74b5b43198c63bd61742db96c3b3fdf4", "title": "Configure TensorRT Engine for Video Models"}, "1867": {"path": "/deploy/cpp_infer/src/video_rec.cpp:248-271", "hash": "73925307266dc4bea3f4a808c0580c24", "title": "Configure TensorRT Parameters"}, "1868": {"path": "/deploy/cpp_infer/src/video_rec.cpp:272-304", "hash": "ac6a0ea584d3fb1949bf75c9ebc06c58", "title": "Initialize PaddleVideo Predictor with TRT Options"}, "1869": {"path": "/deploy/cpp_infer/tools/build.sh", "hash": "f7c57092eba3704c78d033aa2d5269f7", "title": "Build C++ Inference Script"}, "1870": {"path": "/deploy/cpp_serving/paddle_env_install.sh", "hash": "9efe88629217769cf3ffab14c7ae7138", "title": "PaddleVideo C++ Serving Environment Setup"}, "1871": {"path": "/deploy/cpp_serving/paddle_env_install.sh:1-22", "hash": "9c84a788abbdc049e797c5311710d58c", "title": "Install TensorRT and PaddleVideo Dependencies"}, "1872": {"path": "/deploy/cpp_serving/paddle_env_install.sh:23-35", "hash": "06fae4d23e5a99228fcb440d6cf0af61", "title": "PaddleVideo C++ Serving Environment Setup"}, "1873": {"path": "/deploy/cpp_serving/preprocess_ops.py", "hash": "564d0f42205c6cb3b7a53a666ba103ac", "title": "Preprocessing Functions in CPP Serving"}, "1874": {"path": "/deploy/cpp_serving/preprocess_ops.py:1-34", "hash": "ef4ffd55cb3742448c3cf38602444776", "title": "Image Processing Composition"}, "1875": {"path": "/deploy/cpp_serving/preprocess_ops.py:35-76", "hash": "efe287bd94a14f1ac56335517289e00f", "title": "Video Preprocessing Function"}, "1876": {"path": 
"/deploy/cpp_serving/preprocess_ops.py:77-111", "hash": "8d60737e32245e0dbeb64975a5f85e94", "title": "Video Preprocessing Function"}, "1877": {"path": "/deploy/cpp_serving/preprocess_ops.py:113-126", "hash": "c6d325e4f89f75a6fde3ce25d1bba428", "title": "Model-Based Preprocess Function"}, "1878": {"path": "/deploy/cpp_serving/readme.md", "hash": "7f3497e8c4a08e9cf5fc6369679ce438", "title": "Deploy Paddle Serving with Docker"}, "1879": {"path": "/deploy/cpp_serving/readme.md:1-32", "hash": "d6c98e46a0c133f987e152af08bd7a80", "title": "Deploy Paddle Serving with Docker"}, "1880": {"path": "/deploy/cpp_serving/readme.md:34-64", "hash": "f08488a23b78a35a49bbaa44e3d08606", "title": "Speed Up PaddleServing Installation and Deployment"}, "1881": {"path": "/deploy/cpp_serving/readme.md:65-81", "hash": "375da90f6f09946e06e62c11f2367ae2", "title": "PaddleVideo Deployment Guide"}, "1882": {"path": "/deploy/cpp_serving/readme.md:82-118", "hash": "28a9b63eb2e8f68f5895c5fa7825a932", "title": "Rename Alias to 'outputs' for Fetch Variable"}, "1883": {"path": "/deploy/cpp_serving/readme.md:119-158", "hash": "9ce1226bb2543b2d801c0ecfaddcca9a", "title": "Deploy C++ Serving Server"}, "1884": {"path": "/deploy/cpp_serving/readme.md:160-164", "hash": "5971390198b5f347eb3dc902a4b4a68c", "title": "Disable Proxies Before Starting Service"}, "1885": {"path": "/deploy/cpp_serving/readme_en.md", "hash": "7f7055144e2f8aadf4b2a1734c8a093d", "title": "Accelerated Docker PaddleServing Deployment"}, "1886": {"path": "/deploy/cpp_serving/readme_en.md:1-17", "hash": "f664ae01ba03169c921e5b27e5f401e3", "title": "PaddleServing Docker Installation Guide"}, "1887": {"path": "/deploy/cpp_serving/readme_en.md:18-41", "hash": "e44e3df75e7c3c41d19f00874156be0a", "title": "Install Docker Container for PaddlePaddle Serving"}, "1888": {"path": "/deploy/cpp_serving/readme_en.md:42-65", "hash": "e58da088fa93839a6a81fca906b5e289", "title": "Speed Up PaddleServing Deployment with Action Recognition"}, "1889": {"path": "/deploy/cpp_serving/readme_en.md:66-79", "hash": "f87fa89e3eaf15e4c03d3dd537e4b1df", "title": "Directory and Model Specification for PaddleVideo Inference"}, "1890": {"path": "/deploy/cpp_serving/readme_en.md:79-94", "hash": "4a7299e4d5f42940233dae8c6a17b34c", "title": "Update Model Files and Configs"}, "1891": {"path": "/deploy/cpp_serving/readme_en.md:96-122", "hash": "4d9a3ea8b586062bf896d879c2faa7ea", "title": "Compatibility Rename Function for Model Deployment"}, "1892": {"path": "/deploy/cpp_serving/readme_en.md:123-152", "hash": "ee95483c9ce461694572f400365a8275", "title": "C++ PaddleVideo Serving Setup"}, "1893": {"path": "/deploy/cpp_serving/readme_en.md:152-165", "hash": "a9c377c9a51ffc8f4807210253d01e20", "title": "Proxy Settings in Cpp Serving Deployment"}, "1894": {"path": "/deploy/cpp_serving/run_cpp_serving.sh", "hash": "39f3673268def1a972681289ee3a3b89", "title": "Deploy PaddleVideo Server with PP-TSM/TSN"}, "1895": {"path": "/deploy/cpp_serving/serving_client.py", "hash": "bd57c4804e4e6c09c64ba8f0b34050a9", "title": "PaddleServing and PaddleVideo Integration"}, "1896": {"path": "/deploy/cpp_serving/serving_client.py:1-32", "hash": "24bc04ac19e4ce3240304205b1dcb3d3", "title": "Postprocess Paddle Serving Predictions"}, "1897": {"path": "/deploy/cpp_serving/serving_client.py:33-62", "hash": "ab5ad420bd1eb7c139278b7b4d04896c", "title": "CPP Serving Client Function"}, "1898": {"path": "/deploy/cpp_serving/serving_client.py:63-95", "hash": "e889b260019e1aea298e1608e4d02f33", "title": "Video Prediction Client in 
Python"}, "1899": {"path": "/deploy/paddle2onnx/predict_onnx.py", "hash": "d1dbdd2d6f221b9683de8a906073ade3", "title": "Paddle2ONNX Video Detection"}, "1900": {"path": "/deploy/paddle2onnx/predict_onnx.py:1-31", "hash": "9f278c45006c1a594c2bf49900be2614", "title": "PaddleVideo Inference Environment Setup"}, "1901": {"path": "/deploy/paddle2onnx/predict_onnx.py:32-54", "hash": "5e539a3eecedd04fb8004cf26b0466b9", "title": "Parse ONNX Prediction Parameters"}, "1902": {"path": "/deploy/paddle2onnx/predict_onnx.py:57-92", "hash": "ecd1f83a64047022513ead3dd9bc2b5f", "title": "Onnx Predictor Creation and Inference"}, "1903": {"path": "/deploy/paddle2onnx/predict_onnx.py:94-122", "hash": "5fcb0cb913ece2277cda93f31f12adfe", "title": "Building ONNX Inference Helper"}, "1904": {"path": "/deploy/paddle2onnx/predict_onnx.py:123-153", "hash": "44267a224897264892ec21a069421445", "title": "Batch Video Inference with Paddle2Onnx Predictor"}, "1905": {"path": "/deploy/paddle2onnx/predict_onnx.py:154-171", "hash": "27f726b4888b32e61670b0fd2dad351a", "title": "Benchmarked Predict: Autolog and Postprocess"}, "1906": {"path": "/deploy/paddle2onnx/readme.md", "hash": "e40bc2fcc416b538fe89d44c7fec43aa", "title": "Paddle to ONNX Conversion for Inference"}, "1907": {"path": "/deploy/paddle2onnx/readme.md:1-48", "hash": "edcc3627f0cce5a93babac904ed4dfea", "title": "Paddle2ONNX Model Conversion"}, "1908": {"path": "/deploy/paddle2onnx/readme.md:49-70", "hash": "7397a322904ab38fdcac29c0b1eb868e", "title": "ONNX-Paddle Inference Parity"}, "1909": {"path": "/deploy/paddle2onnx/readme_en.md", "hash": "b7f346c9d38e045a5d5dddfd76596040", "title": "Deploy Paddle2ONNX for PP-TSN Prediction"}, "1910": {"path": "/deploy/paddle2onnx/readme_en.md:1-28", "hash": "2595fb6814180d687e8f58dbd80c32b8", "title": "Paddle2ONNX Model Conversion"}, "1911": {"path": "/deploy/paddle2onnx/readme_en.md:29-61", "hash": "1fe060d5eaa43ac07aedb5e21d5247ad", "title": "Paddle2ONNX: Model Conversion and Prediction"}, "1912": {"path": "/deploy/paddle2onnx/readme_en.md:62-70", "hash": "d4b6bd175a5e36f142bdfcc8f2d5dbd6", "title": "Generate Output for Video File using PaddleVideo"}, "1913": {"path": "/deploy/python_serving/pipeline_http_client.py", "hash": "19cff51357732cb79ae0e6a63482f78f", "title": "Video Model Serving Pipeline with HTTP Client"}, "1914": {"path": "/deploy/python_serving/pipeline_http_client.py:1-30", "hash": "8d7759c8fbf5a667ea02612974beb2cb", "title": "Python PaddleVideo Serving Client"}, "1915": {"path": "/deploy/python_serving/pipeline_http_client.py:31-62", "hash": "3c5aa8fc31f4b81cf7abfd41c89a8cb9", "title": "Video HTTP Client"}, "1916": {"path": "/deploy/python_serving/pipeline_http_client.py:63-70", "hash": "44a81044b801fda605a7d67909579091", "title": "POST Request with JSON Data in Python"}, "1917": {"path": "/deploy/python_serving/pipeline_rpc_client.py", "hash": "3b5e6fdb0a4fa24c5836d0fb3b3f21f9", "title": "PaddleVideo Model Web Serving"}, "1918": {"path": "/deploy/python_serving/pipeline_rpc_client.py:1-29", "hash": "ee7370c60565c2368e6c3f05b9db048c", "title": "Handling PaddleVideo Model Execution"}, "1919": {"path": "/deploy/python_serving/pipeline_rpc_client.py:30-60", "hash": "0a2211bff9d692903f1cf69f17ff2d9e", "title": "Command Line RPC Client for Video Processing"}, "1920": {"path": "/deploy/python_serving/pipeline_rpc_client.py:61-68", "hash": "c3d95e1e743ce561f725cf3c5f082966", "title": "Video Prediction with PaddleVideo Client"}, "1921": {"path": "/deploy/python_serving/readme.md", "hash": 
"4479e38aa6703ff2aeeab898fc6656af", "title": "Deploy PaddlePaddle Model for Serving"}, "1922": {"path": "/deploy/python_serving/readme.md:1-32", "hash": "74ca79e2aa5a63df2e766c707aa65d24", "title": "Deploy PaddleServing Model on Linux"}, "1923": {"path": "/deploy/python_serving/readme.md:33-58", "hash": "bdd9612beac31c3f76c16b3e4bcd4ec0", "title": "Install PaddlePaddle for CPU and GPU"}, "1924": {"path": "/deploy/python_serving/readme.md:59-83", "hash": "ef020ed4f739914337dcdf8e676d9d28", "title": "Converting PaddlePaddle Model for Server Deployment"}, "1925": {"path": "/deploy/python_serving/readme.md:84-103", "hash": "8b02c578f52bec94666f19f9844dbba1", "title": "Configure PP-TSM Model Transformation Parameters"}, "1926": {"path": "/deploy/python_serving/readme.md:105-152", "hash": "79839fea34a2f52577a3ba6f5b900949", "title": "PaddleVideo Deployment: Input-Output Config"}, "1927": {"path": "/deploy/python_serving/readme.md:154-185", "hash": "75a89badba9b6cde948d0e19320760c0", "title": "RPC-based PaddleVideo Prediction"}, "1928": {"path": "/deploy/python_serving/readme_en.md", "hash": "73b11684ee82a88c4ccd4158a098d312", "title": "Deploying PaddleServing for Deep Learning via HTTP"}, "1929": {"path": "/deploy/python_serving/readme_en.md:1-16", "hash": "77c005f1501c5516693091b74e0b0424", "title": "Deploying Deep Learning Model with PaddleServing"}, "1930": {"path": "/deploy/python_serving/readme_en.md:17-41", "hash": "6214f5bcfbd297386e2a737a230825dc", "title": "Install PaddleServing for CPU and GPU"}, "1931": {"path": "/deploy/python_serving/readme_en.md:42-63", "hash": "dabedccebfb2fd677ca022509d56c7d1", "title": "Deploy Behavior Recognition Service with PaddleServing"}, "1932": {"path": "/deploy/python_serving/readme_en.md:64-83", "hash": "803912110e40dda062f844b4cdad3339", "title": "Model Conversion for Server Deployment"}, "1933": {"path": "/deploy/python_serving/readme_en.md:83-94", "hash": "d398c45bdc00fa22b903fb8dd1fdac26", "title": "PP-TSM Inference Model Conversion and Serving"}, "1934": {"path": "/deploy/python_serving/readme_en.md:95-119", "hash": "bead76939fe9f4043197424a956aa18b", "title": "Config File Alias Name Modification for Model Compatibility"}, "1935": {"path": "/deploy/python_serving/readme_en.md:120-145", "hash": "19583c30c092ef4693239f696111e55a", "title": "Start PaddleVideo Service with Python"}, "1936": {"path": "/deploy/python_serving/readme_en.md:146-175", "hash": "4c283e68598c4fe983d2b338c3874bdd", "title": "Python Web Service for Model Prediction"}, "1937": {"path": "/deploy/python_serving/readme_en.md:175-185", "hash": "147f8ab4a4e4d6c7411e3733813fa19b", "title": "Closing Proxy, Starting Service"}, "1938": {"path": "/deploy/python_serving/recognition_web_service.py", "hash": "4e7ebfecc9fd5abc2a4cc98cf11b784e", "title": "PaddleVideo Web Service Setup"}, "1939": {"path": "/deploy/python_serving/recognition_web_service.py:1-28", "hash": "4f51143a2d79e6a77c40e641434d3e18", "title": "Building Image Recognition Web Service Base"}, "1940": {"path": "/deploy/python_serving/recognition_web_service.py:29-62", "hash": "3a68268430c51ed3bf99416c127881bd", "title": "Preprocessing Function for Recognition Models"}, "1941": {"path": "/deploy/python_serving/recognition_web_service.py:63-102", "hash": "6259d22907ef23150f40f2239f70f38c", "title": "Video Processing Class in Recognition Web Service"}, "1942": {"path": "/deploy/python_serving/recognition_web_service.py:103-125", "hash": "14179bb8284d844289699cc91a0b2f6a", "title": "Decode and Reshape Frames Data"}, "1943": {"path": 
"/deploy/python_serving/recognition_web_service.py:126-149", "hash": "5785b05ac7cf485ed08a17d170e2fad1", "title": "Image Preprocessing and Post-Processing Methods"}, "1944": {"path": "/deploy/python_serving/recognition_web_service.py:150-182", "hash": "74e5d6b4af9115da400f9ecb1fac2fe3", "title": "Video Web Service Input Parser"}, "1945": {"path": "/deploy/python_serving/recognition_web_service.py:183-208", "hash": "a9a55be55faaaa31792629d69d53dc86", "title": "Command-Line Parsing for PaddleVideo Service"}, "1946": {"path": "/deploy/python_serving/utils.py", "hash": "d326a8520c1a4409e50713b2003dbf04", "title": "Video to Base64 Conversion Utils"}, "1947": {"path": "/deploy/python_serving/utils.py:1-37", "hash": "d6bc4eb7065f0181a1e393b02162f495", "title": "Video and Numpy Array Conversion Utilities"}, "1948": {"path": "/deploy/python_serving/utils.py:39-78", "hash": "e05e24efd2ebaab91e4ebbd17f3a1d8e", "title": "Video Frames Parser"}, "1949": {"path": "/deploy/python_serving/utils.py:79-81", "hash": "9e624fa4b8c7bb99038b41c9bf184f7c", "title": "Joining File Paths from List"}, "1950": {"path": "/deploy/slim/quant_post_static.py", "hash": "feeec92b531ce09f54dc31237256f5ef", "title": "Quantized Model for GPU Efficiency"}, "1951": {"path": "/deploy/slim/quant_post_static.py:1-32", "hash": "aa9419013b5b65f121b6c57029360f18", "title": "Python Licensing and Libraries"}, "1952": {"path": "/deploy/slim/quant_post_static.py:33-63", "hash": "52d39e746a3a842f6aa52f16e1d56fb4", "title": "Post-Training Quantization Function"}, "1953": {"path": "/deploy/slim/quant_post_static.py:65-84", "hash": "92392139e0dff5c8e3d6dac133d3d25b", "title": "Dynamic Dataset Loading for Quantization"}, "1954": {"path": "/deploy/slim/quant_post_static.py:86-114", "hash": "4f1a5c1ca7ae1720621a2888d966d740", "title": "Post-Training Quantization with Static Graph"}, "1955": {"path": "/deploy/slim/quant_post_static.py:117-120", "hash": "d9d1a0e9f9b9bd8db5045a418d007b00", "title": "Post-Training Quantization Function"}, "1956": {"path": "/deploy/slim/readme.md", "hash": "d1655321d1fd391e4eb3bc369190a0ab", "title": "Model Compression with PaddleSlim"}, "1957": {"path": "/deploy/slim/readme.md:2-44", "hash": "93124766dad8e81f16283d8036c8648d", "title": "PaddleSlim: Model Compression for PaddleVideo"}, "1958": {"path": "/deploy/slim/readme.md:46-91", "hash": "3b8b304f1337bb03b0f0a3e2ca8e56a7", "title": "Offline Quantization in PaddleVideo"}, "1959": {"path": "/deploy/slim/readme.md:93-133", "hash": "0516d1e6221e235169641048eb8135c9", "title": "PaddleVideo Quantized Model Deployment"}, "1960": {"path": "/deploy/slim/readme_en.md", "hash": "5311b38c10d20fdcd899e2d236deaf65", "title": "Efficient Model Compression for PaddleVideo"}, "1961": {"path": "/deploy/slim/readme_en.md:1-9", "hash": "9e5880df6e9cc72e6b510df801f25096", "title": "Efficient PaddleVideo Model Compression with PaddleSlim"}, "1962": {"path": "/deploy/slim/readme_en.md:10-30", "hash": "fdbaa6b8fb134834e2dc3e0c3e2564b0", "title": "PaddleSlim: Model Compression Tools"}, "1963": {"path": "/deploy/slim/readme_en.md:31-64", "hash": "2b1def45d4cc887c32c64bc525e70e63", "title": "Installing PaddleSlim, Model Preparation & Offline Quantization"}, "1964": {"path": "/deploy/slim/readme_en.md:64-87", "hash": "1d78546bab2c3bcb9b984010fd09025f", "title": "Offline Quantization in PaddleVideo"}, "1965": {"path": "/deploy/slim/readme_en.md:87-111", "hash": "dfbdf88b9850d56c3757df801f999662", "title": "Deploying PP-TSM Model for Prediction"}, "1966": {"path": 
"/deploy/slim/readme_en.md:112-132", "hash": "a21f042144b58ed96f9b120351934f57", "title": "Model Pruning and Deployment with PaddleLite"}, "1967": {"path": "/english_documents/benchmark.md", "hash": "bf41cfc8bcc6efdc937b5c1e82e0dcaa", "title": "PaddleVideo: Benchmarking Speed and Action Segmentation"}, "1968": {"path": "/english_documents/benchmark.md:1-27", "hash": "7323f41a7baf40867e970cec660e776b", "title": "PaddleVideo Speed Benchmark"}, "1969": {"path": "/english_documents/benchmark.md:29-45", "hash": "815e06e6cdd9960e0f8d49018e83715d", "title": "PaddleVideo Model Comparison"}, "1970": {"path": "/english_documents/benchmark.md:47-64", "hash": "185387b41fc079429e82f4d270c323c4", "title": "Sequential Action Segmentation Model Comparison"}, "1971": {"path": "/english_documents/benchmark.md:64-68", "hash": "95a9459d88f95d964adec3d50a0e78dc", "title": "PaddleVideo Benchmarking: Test Time & Parameters"}, "1972": {"path": "/english_documents/benchmark.md:68-69", "hash": "dfcf3fe37a1cbb6e4067990173fc945b", "title": "Reasoning Model Tested on GPU with Batch Size 2"}, "1973": {"path": "/english_documents/dataset/AVA.md", "hash": "c17aada34077f745f0ff9bd80e3ad105", "title": "AVA Dataset Preparation Process"}, "1974": {"path": "/english_documents/dataset/AVA.md:1-23", "hash": "fa739470479c9a88308ec6faa7a40e7c", "title": "AVA Dataset Preparation Process"}, "1975": {"path": "/english_documents/dataset/AVA.md:26-78", "hash": "e501e01c6f034b8f4197cb8b53793481", "title": "Preparing AVA Dataset for Action Recognition"}, "1976": {"path": "/english_documents/dataset/AVA.md:79-112", "hash": "d4eebe26a35757d005ad2d4e6da6ce92", "title": "Folder Structure for AVA Dataset in PaddleVideo"}, "1977": {"path": "/english_documents/dataset/AVA.md:113-113", "hash": "caad05b01c04fd43ec23aa1f0260ae0e", "title": "Video Frame Count Calculator"}, "1978": {"path": "/english_documents/dataset/ActivityNet.md", "hash": "e79c70ff0f94449e42859f48c97f18b2", "title": "ActivityNet Dataset Preparation"}, "1979": {"path": "/english_documents/dataset/ActivityNet.md:1-24", "hash": "661e9d51ee7cc7188b03e5b818cb9ec0", "title": "ActivityNet: Large-Scale Video Dataset for Understanding"}, "1980": {"path": "/english_documents/dataset/ActivityNet.md:25-45", "hash": "fc0e773742e749af47ef8afcdcf653cd", "title": "ActivityNet Dataset Video Feature Extraction"}, "1981": {"path": "/english_documents/dataset/ActivityNet.md:46-77", "hash": "4af9d8fe13bf8a7ebc4c14664b05fa73", "title": "ActivityNet Annotations Structure"}, "1982": {"path": "/english_documents/dataset/ActivityNet.md:78-80", "hash": "fc9f0d87ab9313a5ee23b7b8d4fecdda", "title": "Update Configuration Paths"}, "1983": {"path": "/english_documents/dataset/Oxford_RobotCar.md", "hash": "1ed8e2370dd95a3caa7f03ec04511a6c", "title": "Oxford-RobotCar Dataset Preparation"}, "1984": {"path": "/english_documents/dataset/Oxford_RobotCar.md:1-24", "hash": "f7cc365570118ccf13a3fd80d51176fa", "title": "Oxford-RobotCar Dataset Preparation"}, "1985": {"path": "/english_documents/dataset/Oxford_RobotCar.md:25-46", "hash": "7b5c10f822bf49244284d5eea80cbf12", "title": "BibTeX Citations for Datasets"}, "1986": {"path": "/english_documents/dataset/Oxford_RobotCar.md:47-64", "hash": "5c24e3da84c484bc9869cc752cdd21ec", "title": "Oxford RobotCar Dataset Download"}, "1987": {"path": "/english_documents/dataset/Oxford_RobotCar.md:65-80", "hash": "86ac248f3dc559cbfe31b844cf573fe4", "title": "RobotCar Dataset Training Links"}, "1988": {"path": "/english_documents/dataset/Oxford_RobotCar.md:81-98", "hash": 
"34de97189d77e54077790e8f98a624ed", "title": "Oxford RobotCar Dataset File URLs"}, "1989": {"path": "/english_documents/dataset/Oxford_RobotCar.md:101-114", "hash": "9a56ed532a6fc3eb203bb90ef6286b1c", "title": "Dynamic Frame Filtering and Timestamp Renaming"}, "1990": {"path": "/english_documents/dataset/Oxford_RobotCar.md:115-137", "hash": "e475accb2f6c5dfb644c68da4e8d42fd", "title": "RobotCar Dataset with CycleGAN"}, "1991": {"path": "/english_documents/dataset/Oxford_RobotCar.md:137-150", "hash": "45f7153c6ea075cdc56134e08fb2597e", "title": "Oxford-RobotCar Dataset Structure"}, "1992": {"path": "/english_documents/dataset/Oxford_RobotCar.md:151-162", "hash": "9c43a9e8efbe3a99190c318afae2e125", "title": "Directory Structure: Day/Night Training & Verification Images"}, "1993": {"path": "/english_documents/dataset/README.md", "hash": "7fdb17f2e3267d391ff31264bc34d657", "title": "Comprehensive Action Datasets Table"}, "1994": {"path": "/english_documents/dataset/README.md:1-28", "hash": "2d081a3219273c790577f710872098dd", "title": "Action Recognition Datasets Table"}, "1995": {"path": "/english_documents/dataset/README.md:29-58", "hash": "296c4853065a263acf5660ca70db8635", "title": "Dataset Table: Skeleton, Depth, Text"}, "1996": {"path": "/english_documents/dataset/README.md:58-73", "hash": "5731a12f712a8ee3f37124cb4e9282f4", "title": "HTML Table of Multimodal Datasets with Publication Years"}, "1997": {"path": "/english_documents/dataset/SegmentationDataset.md", "hash": "c8d70a1507664d3e13770dafb077884b", "title": "Video Action Segmentation Dataset"}, "1998": {"path": "/english_documents/dataset/fsd.md", "hash": "5f983d0a9030937fc201010fdb11a6f1", "title": "Figure Skating Dataset Overview"}, "1999": {"path": "/english_documents/dataset/fsd.md:1-26", "hash": "777541f4b28d3bdbfb9f97808f5309b5", "title": "Figure Skating OpenPose Dataset"}, "2000": {"path": "/english_documents/dataset/fsd.md:26-47", "hash": "411d1b714d46761a212576a941f84b3f", "title": "Tensor Structure and Joint Points in Dataset"}, "2001": {"path": "/english_documents/dataset/fsd.md:49-55", "hash": "942f93bf5819d9b6775199d1597e2974", "title": "Train Dataset Details"}, "2002": {"path": "/english_documents/dataset/k400.md", "hash": "052d763295b00bee6fd02739ce910bf6", "title": "Kinetics-400 Dataset Download and Extraction"}, "2003": {"path": "/english_documents/dataset/k400.md:1-27", "hash": "e8eae4bf4ae438ec3d8941704f6ca471", "title": "Kinetics-400 Dataset Download Options"}, "2004": {"path": "/english_documents/dataset/k400.md:29-65", "hash": "75c4dbe14d08a955bc0cb3e2f61e0cab", "title": "Accelerating Network Training with Videos"}, "2005": {"path": "/english_documents/dataset/k400.md:65-78", "hash": "11e937a2881fdb0a40e47e9236db5f93", "title": "Extracting K400 Video Frames"}, "2006": {"path": "/english_documents/dataset/msrvtt.md", "hash": "4986bf8f0441aad59069db265821a6fa", "title": "MSR-VTT: Video Transformers Dataset"}, "2007": {"path": "/english_documents/dataset/msrvtt.md:1-29", "hash": "01f9232dc380e4a38994e62336018b2b", "title": "MSR-VTT Dataset Overview"}, "2008": {"path": "/english_documents/dataset/msrvtt.md:31-73", "hash": "92d61f11e66409e1d69c8e73b33a6d6c", "title": "ActBERT MSR-VTT Dataset Download"}, "2009": {"path": "/english_documents/dataset/msrvtt.md:74-79", "hash": "37991f8ff98175ad22198ac8392331a5", "title": "Multi-Modal Transformer Database"}, "2010": {"path": "/english_documents/dataset/ntu-rgbd.md", "hash": "c252d72a2f3f63d22d10e3af8c4b94de", "title": "NTU RGB+D Dataset Preparation for CTR-GCN"}, 
"2011": {"path": "/english_documents/dataset/ntu-rgbd.md:1-23", "hash": "acf90dbe30a6556d825e64655d7b41f7", "title": "NTU-RGB+D Dataset Overview"}, "2012": {"path": "/english_documents/dataset/ntu-rgbd.md:23-59", "hash": "16d4c5d4913904c52d34a3b9975d4a6c", "title": "NTU-RGB-D Dataset Download and Unzipping"}, "2013": {"path": "/english_documents/dataset/ntu-rgbd.md:60-93", "hash": "e900c6a83d2b3df35a14d42d38476c64", "title": "Preparing NTU-RGBD Dataset for CTR-GCN"}, "2014": {"path": "/english_documents/dataset/ntu-rgbd.md:94-129", "hash": "cb7d7a67ba990e97c4ae91948d038b5f", "title": "NTU-RGBD Dataset Overview"}, "2015": {"path": "/english_documents/dataset/ntu-rgbd.md:130-158", "hash": "49fb5382e59800e458789addfc4cf029", "title": "NTU RGB+D Dataset Organization and Preprocessing"}, "2016": {"path": "/english_documents/dataset/ucf101.md", "hash": "e985fda6e01703ab898b04d9d09038ce", "title": "UCF101 Dataset Organization"}, "2017": {"path": "/english_documents/dataset/ucf101.md:1-40", "hash": "b366b6d73deec39d03815f3d90ac7b08", "title": "UCF101 Dataset Download and Extraction"}, "2018": {"path": "/english_documents/dataset/ucf101.md:41-81", "hash": "04c99f403e910289990d92416f06cacb", "title": "UCF101 Dataset File Organization"}, "2019": {"path": "/english_documents/dataset/ucf101.md:82-86", "hash": "bbbf4c992aae5180774bf9114cee9872", "title": "UCF101: Video Categories and Clips"}, "2020": {"path": "/english_documents/dataset/ucf24.md", "hash": "d1762ae31d0fad47b6f80b5969df4cce", "title": "UCF24 Dataset Preparation Guide"}, "2021": {"path": "/english_documents/dataset/ucf24.md:1-20", "hash": "3bfd7eb733494f9f6b95a2fa207e536d", "title": "UCF24 Dataset Preparation with PaddleVideo"}, "2022": {"path": "/english_documents/dataset/ucf24.md:22-60", "hash": "fa5de639d90d3f4f49cea303b50ddeec", "title": "UCF24 Dataset Preparation with PaddleVideo"}, "2023": {"path": "/english_documents/dataset/ucf24.md:61-73", "hash": "f1f983074de705e3b8c53c29a9cd0b86", "title": "UCF101 Dataset File Structure"}, "2024": {"path": "/english_documents/dataset/youtube8m.md", "hash": "805ffe470acd55f093fd12a0ed6da271", "title": "YouTube-8M: Massive Video Classification Dataset"}, "2025": {"path": "/english_documents/dataset/youtube8m.md:1-20", "hash": "65f9b29860c9d37eb4e6ddae2ea9a8ba", "title": "Large-scale Video Classification Data Set"}, "2026": {"path": "/english_documents/dataset/youtube8m.md:21-44", "hash": "2782aa5079d8b958fdc64a11ec96fa2f", "title": "Prepare Dataset for PaddlePaddle"}, "2027": {"path": "/english_documents/dataset/youtube8m.md:45-56", "hash": "01e912800778b1c31b6c3156d39c8093", "title": "Pkl File Splitting and List Generation"}, "2028": {"path": "/english_documents/install.md", "hash": "0abefc05fab4980f675fec18ab833319", "title": "PaddlePaddle & PaddleVideo Installation Guide"}, "2029": {"path": "/english_documents/install.md:1-41", "hash": "805a8829733b2411783e1c2e0dab0779", "title": "PaddlePaddle GPU Installation Guide"}, "2030": {"path": "/english_documents/install.md:42-72", "hash": "d53e092bc54bd9936a5140b8698ea6f0", "title": "Install and Configure PaddleVideo"}, "2031": {"path": "/english_documents/model_zoo/README.md", "hash": "90f4ad203dc48b8eab15ab39071080ff", "title": "Model Zoo: Action Recognition and Segmentation Models"}, "2032": {"path": "/english_documents/model_zoo/README.md:1-26", "hash": "c4619cfdedcf705594ef97467122ce9e", "title": "Action Recognition Model Zoo"}, "2033": {"path": "/english_documents/model_zoo/README.md:27-61", "hash": "5d366ccafeced071c34afdde9f898ac2", "title": 
"AI Model Zoo: Action Recognition & Segmentation Models"}, "2034": {"path": "/english_documents/model_zoo/README.md:62-100", "hash": "926e774f8688ec7f6aeb32dab2d1e529", "title": "PaddleVideo Model Zoo Table"}, "2035": {"path": "/english_documents/model_zoo/README.md:101-106", "hash": "f489551a8c1a11fb82882e4cfab963a9", "title": "Empty HTML Table Cell or Row"}, "2036": {"path": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md", "hash": "113247221fdbae7373b6bd89e816c242", "title": "SlowFast_FasterRCNN Action Detection Tutorial"}, "2037": {"path": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:1-24", "hash": "b2696453949f9c6e9a847ebceec87c50", "title": "SlowFast_FasterRCNN: Video Action Detection Model"}, "2038": {"path": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:26-64", "hash": "8bb24462dde388100a9f5d9f28ac2df5", "title": "AVA Dataset Video Processing Guide"}, "2039": {"path": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:65-90", "hash": "c7d435f1ac54f27bdf4c3115357f7cf6", "title": "Training and Testing SlowFast Faster RCNN on AVA Dataset"}, "2040": {"path": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:93-126", "hash": "a226d21aa586e160cee9d649379bb4e7", "title": "Action Detection with SlowFast+FasterRCNN"}, "2041": {"path": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:127-129", "hash": "2aef124654d86688521c8e470000d06b", "title": "GPU Acceleration, No TensorRT"}, "2042": {"path": "/english_documents/model_zoo/estimation/adds.md", "hash": "43e73a263e2e84709d97e287f9a66120", "title": "ADDS-DepthNet: Estimating Depth with Day & Night Images"}, "2043": {"path": "/english_documents/model_zoo/estimation/adds.md:1-23", "hash": "7ed4c86c4064ce57f59f377585935489", "title": "ADDS-DepthNet: Self-Supervised Monocular Depth Estimation"}, "2044": {"path": "/english_documents/model_zoo/estimation/adds.md:26-49", "hash": "a3f9a56648d5e9a3ac68e2091a0a28dc", "title": "Adding Pre-Trained Model to Oxford RobotCar Dataset"}, "2045": {"path": "/english_documents/model_zoo/estimation/adds.md:50-72", "hash": "178e03d6246cad68ec69a6da6e32cfc2", "title": "Train and Test ADDS-DepthNet with Oxford RobotCar Dataset"}, "2046": {"path": "/english_documents/model_zoo/estimation/adds.md:74-90", "hash": "9b1ccbdea207c51dc306053ea2663ea4", "title": "ADDS Model Testing on RobotCar Dataset"}, "2047": {"path": "/english_documents/model_zoo/estimation/adds.md:92-107", "hash": "361ebe390204d98af617368bc501d710", "title": "Model Performance Comparison Table"}, "2048": {"path": "/english_documents/model_zoo/estimation/adds.md:107-124", "hash": "79792d0a8db2e5456e67efd82bbf11f3", "title": "Predicting Depth Maps with PaddlePaddle's ADDS Model"}, "2049": {"path": "/english_documents/model_zoo/estimation/adds.md:126-133", "hash": "10dc87df8980c819b91d72211795f8de", "title": "Self-supervised Monocular Depth Estimation"}, "2050": {"path": "/english_documents/model_zoo/localization/bmn.md", "hash": "8dbac2263cb59166f57b4d9c44da9cdb", "title": "BMN Model for Action Proposal Generation"}, "2051": {"path": "/english_documents/model_zoo/localization/bmn.md:1-35", "hash": "235ff75f2431a2f4f762b1ec2f880d00", "title": "BMN Model: Training and Evaluation Modules"}, "2052": {"path": "/english_documents/model_zoo/localization/bmn.md:36-68", "hash": "a0e3e00fa04b93bd6df3e58cfd5f97f4", "title": "BMN Localization Model Deployment with PaddlePaddle"}, "2053": {"path": "/english_documents/model_zoo/localization/bmn.md:70-96", "hash": 
"f376ff1f413ce4be304bdb539755a2bd", "title": "BMN Model Inference: Export and Predict"}, "2054": {"path": "/english_documents/model_zoo/localization/bmn.md:97-104", "hash": "0fabfd11d9e1b0c87b623f357b6a891b", "title": "BMN: Temporal Action Proposal Inference Results"}, "2055": {"path": "/english_documents/model_zoo/localization/yowo.md", "hash": "1bbfe4d48c46609a58a22456fe6ee25e", "title": "YOWO: Efficient Feature Extraction Model"}, "2056": {"path": "/english_documents/model_zoo/localization/yowo.md:1-36", "hash": "0347574ac20dc55edac68a58ba2197c6", "title": "YOWO: Spatio-Temporal Feature Extraction for Localization"}, "2057": {"path": "/english_documents/model_zoo/localization/yowo.md:36-60", "hash": "d650f212059cec5dedc14576eefe7097", "title": "YOWO Localizer: Download and Train with PaddleVideo"}, "2058": {"path": "/english_documents/model_zoo/localization/yowo.md:61-80", "hash": "d24944892e5a2002e177e74f8214e778", "title": "Faster AMP Mixed-Precision Training for Yowo"}, "2059": {"path": "/english_documents/model_zoo/localization/yowo.md:80-101", "hash": "8756082eabb9d38d3ea1130ed33cbfe4", "title": "Evaluating YOWO Model Performance and Exporting Inference Model"}, "2060": {"path": "/english_documents/model_zoo/localization/yowo.md:102-122", "hash": "b22d9a6832681e97ff02a71b350590b3", "title": "Generate and Predict YOWO Model"}, "2061": {"path": "/english_documents/model_zoo/localization/yowo.md:124-138", "hash": "e0cf1ceaee934b6c80d2e7c4ef4b07c6", "title": "YOWO Model Predicts HorseRiding with 0.8 Confidence"}, "2062": {"path": "/english_documents/model_zoo/multimodal/actbert.md", "hash": "5301712b74e42792f3b6efe3294f7ee1", "title": "ActBERT: Multimodal Pretrain Task for Video-Language Tasks"}, "2063": {"path": "/english_documents/model_zoo/multimodal/actbert.md:1-25", "hash": "51fc68b26ce73f0f948daed91a79eddc", "title": "Introducing ActBERT: Multimodal Pretrain Task for Video-Language Tasks"}, "2064": {"path": "/english_documents/model_zoo/multimodal/actbert.md:26-65", "hash": "0994cf5052cbfe061873a20957d85fb5", "title": "Training ActBERT on HowTo100M Dataset"}, "2065": {"path": "/english_documents/model_zoo/multimodal/actbert.md:66-98", "hash": "e9c596bc029674d4716d21a0cfd716aa", "title": "Training ActBERT with AMP and MSR-VTT"}, "2066": {"path": "/english_documents/model_zoo/partition/transnetv2.md", "hash": "334177f207b1c9cd5d3f84a1a797182d", "title": "TransNetV2: Deep Learning Shot Transition Detection"}, "2067": {"path": "/english_documents/model_zoo/partition/transnetv2.md:1-28", "hash": "d827054ccfd1806be4b70ce1923577a7", "title": "TransNetV2: Video Segmentation with DDCNN V2"}, "2068": {"path": "/english_documents/model_zoo/partition/transnetv2.md:28-62", "hash": "386fc3d814ca0ace557b9027ff42d90a", "title": "TransNetV2 Inference Model Guide"}, "2069": {"path": "/english_documents/model_zoo/partition/transnetv2.md:64-80", "hash": "557cc79fb7635fd199ead25990ef6aaf", "title": "TransNetV2 Prediction Demo"}, "2070": {"path": "/english_documents/model_zoo/recognition/agcn.md", "hash": "b1d12a873aabb52e762218b7fc00f96d", "title": "AGCN: Enhanced Video Recognition via Multi-Stream Graph Convs"}, "2071": {"path": "/english_documents/model_zoo/recognition/agcn.md:1-46", "hash": "585e0020d872975bfe040db9ace21bc7", "title": "Adaptive Graph Convolution Network (AGCN) Implementation"}, "2072": {"path": "/english_documents/model_zoo/recognition/agcn.md:49-84", "hash": "ef1c893b923bebdb9dee457892213b52", "title": "AGCN Test Scripts and Results"}, "2073": {"path": 
"/english_documents/model_zoo/recognition/agcn.md:87-117", "hash": "b903b713b8f4d62aba1f73bf29f2f95f", "title": "AGCN Video Recognition Model Usage"}, "2074": {"path": "/english_documents/model_zoo/recognition/agcn.md:118-129", "hash": "7aceb3b2092e74c9c35b94035c27a18b", "title": "Multi-Stream Adaptive Graph Convolutional Network for Action Recognition"}, "2075": {"path": "/english_documents/model_zoo/recognition/agcn2s.md", "hash": "24d57571b5d1a79216a1d69dce80f486", "title": "AGCN2s: Enhanced Motion Recognition Model"}, "2076": {"path": "/english_documents/model_zoo/recognition/agcn2s.md:1-20", "hash": "f4e3fe815e94da959285bdc85e75b81c", "title": "Introducing 2s-AGCN for Bone Motion Recognition"}, "2077": {"path": "/english_documents/model_zoo/recognition/agcn2s.md:20-40", "hash": "00831997d10696210d7f7051c9fc8e1d", "title": "AGCN2S: Skeleton-based Gesture Recognition Network"}, "2078": {"path": "/english_documents/model_zoo/recognition/agcn2s.md:41-71", "hash": "aaae6079ebfef62c45e2e834bb7fd11e", "title": "2s-AGCN Test Scripts & Results"}, "2079": {"path": "/english_documents/model_zoo/recognition/agcn2s.md:73-79", "hash": "637764d1ca9d7dd1e7b1bb69cfcd803a", "title": "AGCN-2s Model Checkpoints"}, "2080": {"path": "/english_documents/model_zoo/recognition/agcn2s.md:81-103", "hash": "540727c9aa2a57dd5cae5d64edd40721", "title": "Exporting AGCN2s for Action Recognition"}, "2081": {"path": "/english_documents/model_zoo/recognition/agcn2s.md:104-112", "hash": "356f500296e7cc8722ca9e5b4c433520", "title": "AGCN2S Model Prediction Engine"}, "2082": {"path": "/english_documents/model_zoo/recognition/attention_lstm.md", "hash": "8c8f9c4f9302fc7e146325ec75c03d87", "title": "AttentionLSTM Model for YouTube-8M Classification"}, "2083": {"path": "/english_documents/model_zoo/recognition/attention_lstm.md:1-19", "hash": "ae0dd148e1e9429e22fd85792ea39913", "title": "Attention-LSTM for Video Recognition"}, "2084": {"path": "/english_documents/model_zoo/recognition/attention_lstm.md:21-45", "hash": "444c70f1b48e20d84dbbd0c866656131", "title": "Attention LSTM on Youtube-8M with 8 GPUs"}, "2085": {"path": "/english_documents/model_zoo/recognition/attention_lstm.md:47-68", "hash": "3725522451b59acc5930087ac44b7e2d", "title": "Export and Use AttentionLSTM Model"}, "2086": {"path": "/english_documents/model_zoo/recognition/attention_lstm.md:69-84", "hash": "7c545c1919d4267e112aadef3c6514b1", "title": "AttentionLSTM for Video Classification"}, "2087": {"path": "/english_documents/model_zoo/recognition/ctrgcn.md", "hash": "db76cd0c7c5db81d5df3a0e099a9e47b", "title": "Bone-Based Behavior Recognition with CTR-GCN"}, "2088": {"path": "/english_documents/model_zoo/recognition/ctrgcn.md:1-39", "hash": "6f22af889f16bac78141737bfbf8ba7c", "title": "Bone-Based Behavior Recognition with CTR-GCN"}, "2089": {"path": "/english_documents/model_zoo/recognition/ctrgcn.md:41-74", "hash": "5087204f4b1ee828a8ed3e168990ef87", "title": "PaddlePaddle CTR-GCN Model: NTU Dataset Training & Testing"}, "2090": {"path": "/english_documents/model_zoo/recognition/ctrgcn.md:74-90", "hash": "ab60e317dace634cd90247c4e9a0e7b8", "title": "CTRGCN Model Performance on NTU-RGB+D Dataset"}, "2091": {"path": "/english_documents/model_zoo/recognition/ctrgcn.md:93-121", "hash": "af916b213b5d9060fd152d54f2f1439e", "title": "PaddleVideo's CTRGCN Model for Action Recognition"}, "2092": {"path": "/english_documents/model_zoo/recognition/ctrgcn.md:122-128", "hash": "677b4e2846dc8d1637455cb06e7af622", "title": "Top-1 Action Recognition Scores"}, "2093": 
{"path": "/english_documents/model_zoo/recognition/movinet.md", "hash": "1099e089f1f00ee23ce88a83d79e4dc7", "title": "PaddleVideo's Efficient MoViNet Model"}, "2094": {"path": "/english_documents/model_zoo/recognition/movinet.md:1-40", "hash": "71f2e198740c74709c8438dda8aa5eaf", "title": "MoViNet: Efficient Video Reasoning Model"}, "2095": {"path": "/english_documents/model_zoo/recognition/movinet.md:41-73", "hash": "6dce496a1a1712934404c14778d1183a", "title": "MoViNet Testing and Inference Guide"}, "2096": {"path": "/english_documents/model_zoo/recognition/movinet.md:74-91", "hash": "bd55bbb9c0ce383a5922ac5325251102", "title": "MoViNet Model Configuration"}, "2097": {"path": "/english_documents/model_zoo/recognition/posec3d.md", "hash": "089b9fad898589f82e42d2a35c527ccc", "title": "PoseC3D: Skeleton-Based Action Recognition on UCF101"}, "2098": {"path": "/english_documents/model_zoo/recognition/posec3d.md:1-24", "hash": "c6a35ae2a54111574535a947c0c64342", "title": "PoseC3D: Skeleton-based Action Recognition"}, "2099": {"path": "/english_documents/model_zoo/recognition/posec3d.md:24-39", "hash": "891cfb695ce28459a2939e7860a53d11", "title": "Training PoseC3D on UCF101 with Pre-trained Weights"}, "2100": {"path": "/english_documents/model_zoo/recognition/posec3d.md:40-82", "hash": "fa4b406e059b6a172453a76d16d15b5a", "title": "PoseC3D Model Testing and Inference Guide"}, "2101": {"path": "/english_documents/model_zoo/recognition/posec3d.md:83-100", "hash": "918ed65c46d6268f10219e74e08ed2e8", "title": "Inferring PoseC3D without TensorRT and GPU Acceleration"}, "2102": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md", "hash": "9eb36dd5e37933b23fc89695fc395c98", "title": "Enhanced Video Recognition with PP-TimeSformer"}, "2103": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:1-29", "hash": "428f69e5dbf6df796ac307cbad609de4", "title": "PP-TimeSformer: Video Classification Model"}, "2104": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:31-58", "hash": "2a04e24c6afe51f26b6059e404e7c5fd", "title": "Download and Prepare Data for Video Recognition"}, "2105": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:60-75", "hash": "e5d69e738428377c94043c2bb32ddeed", "title": "Efficient Video Recognition with PaddlePaddle Timesformer"}, "2106": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:78-92", "hash": "edd72321e6814cb45d63f003b3a1cebe", "title": "PP-TimeSformer Test Accuracy"}, "2107": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:93-108", "hash": "2132c904123ea790a12b7e2d8cc2a4b7", "title": "Launching PaddleVideo with Vision Transformer and UniformCrop"}, "2108": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:108-120", "hash": "df00752ead46dada169e1e5dd313a222", "title": "Export PP-TimeSformer Model for Video Recognition"}, "2109": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:121-147", "hash": "1eecd509036ab18615cdb2cbd42796bc", "title": "PaddlePaddle's ppTimeSformer for Video Recognition"}, "2110": {"path": "/english_documents/model_zoo/recognition/pp-timesformer.md:147-156", "hash": "bff8e12dd0112b85bad6c2bb75df00ee", "title": "PP-Timesformer for Video Classification"}, "2111": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md", "hash": "cf84f7f4b5f0cd27a71a7927866dbc0b", "title": "Optimized PP-TSM for Action Recognition"}, "2112": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md:1-31", "hash": 
"836df62cc06d41e03bfd9e095e3d3a07", "title": "Optimized PP-TSM for Action Recognition"}, "2113": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md:31-64", "hash": "c964843b5932e512cfa776c917b49dac", "title": "Training TSM Models on Kinetics and UCF"}, "2114": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md:64-90", "hash": "fe72b73c28561b9a01c6f9317ceb8c17", "title": "Training PP-TSM with Pretrained Model"}, "2115": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md:92-122", "hash": "3bfac1352ab7584a0fae211a1c53136d", "title": "PP-TSM: Kinetics-400 Training & Testing"}, "2116": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md:122-127", "hash": "e7baac2d91794f2707bb6e02a92d72c8", "title": "Pre-trained PP-TSM Models"}, "2117": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md:129-159", "hash": "f22a086b838e0a62dc9b604dc5fe5fd8", "title": "Export and Use PPTSM Model for Video Classification"}, "2118": {"path": "/english_documents/model_zoo/recognition/pp-tsm.md:160-167", "hash": "41d9b4c4734c05cb4ec7ca765b61e264", "title": "Top1 Prediction: Archery"}, "2119": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md", "hash": "fdeb7d2bfe5264d1b9fe451f5cb982d0", "title": "PP-TSN: Enhanced TSN with Mixed-Precision"}, "2120": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md:1-30", "hash": "ad7f820beadf6ed0fea8abea4b12be07", "title": "PP-TSN Model Documentation"}, "2121": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md:33-61", "hash": "23d66fd79f879bac52e39af1ef7475a0", "title": "Training PP-TSN on Kinetics-400 with 8 GPUs"}, "2122": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md:63-81", "hash": "63c5a5dd50b33b024dbb159da8d44c1c", "title": "Accelerating PP-TSN Training with AMP"}, "2123": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md:81-95", "hash": "5d136aae7be31ac950d1d219d9d73e1b", "title": "Distinct Testing Method for PP-TSN Model"}, "2124": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md:96-105", "hash": "c9489594fdf769dc4616d0f81d3b7ed3", "title": "PP-TSN Model Test Results on Kinetics-400"}, "2125": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md:105-125", "hash": "b84361f5bd078c21243a263ecfeb1db4", "title": "PP-TSN Model Export and Inference"}, "2126": {"path": "/english_documents/model_zoo/recognition/pp-tsn.md:126-146", "hash": "c3ad6675bc72013fd73919fa7b9915f2", "title": "PP-TSN Video Recognition Inference"}, "2127": {"path": "/english_documents/model_zoo/recognition/slowfast.md", "hash": "4cc519177824719106bce060e4e36de2", "title": "SlowFast Model: Multigrid Training for Video Recognition"}, "2128": {"path": "/english_documents/model_zoo/recognition/slowfast.md:1-38", "hash": "cf1633c6dc21a3ff9feb74f5f44ebbe1", "title": "SlowFast: Video Recognition Model Docs"}, "2129": {"path": "/english_documents/model_zoo/recognition/slowfast.md:39-58", "hash": "f4ecc1e7a5e6db27afd58138cacd1726", "title": "Multigrid-Accelerated SlowFast Training"}, "2130": {"path": "/english_documents/model_zoo/recognition/slowfast.md:61-79", "hash": "bd13dc5429ba282cd07fd817a56cbad1", "title": "Testing SlowFast Model in PaddleVideo"}, "2131": {"path": "/english_documents/model_zoo/recognition/slowfast.md:82-112", "hash": "f6a05638a1e451b73b18e62d05f46684", "title": "SlowFast Model Export and Inference Guide"}, "2132": {"path": "/english_documents/model_zoo/recognition/slowfast.md:113-120", "hash": "31d8f233ca844d100563593512e58346", "title": "SlowFast Networks for Video 
Recognition Code"}, "2133": {"path": "/english_documents/model_zoo/recognition/stgcn.md", "hash": "592f050230fa363ca5cc91b417a44cbd", "title": "ST-GCN Action Recognition Model Training and Testing"}, "2134": {"path": "/english_documents/model_zoo/recognition/stgcn.md:1-49", "hash": "e21feb84b4dbc6d08840910e67fbf68c", "title": "Skeleton-based Action Recognition with ST-GCN"}, "2135": {"path": "/english_documents/model_zoo/recognition/stgcn.md:50-89", "hash": "260990439b016019664c7c02dce42a4b", "title": "Test ST-GCN Model on FSD and NTU-RGB+D Datasets"}, "2136": {"path": "/english_documents/model_zoo/recognition/stgcn.md:90-115", "hash": "5514eabc4aceaade7fb2a81ce9c20142", "title": "Export and Predict with STGCN Model"}, "2137": {"path": "/english_documents/model_zoo/recognition/stgcn.md:116-129", "hash": "f8320b26d30ab433c5a54546d21f414c", "title": "STGCN Recognition Model"}, "2138": {"path": "/english_documents/model_zoo/recognition/timesformer.md", "hash": "f0ad58c2c863fe5d762ab4f8b8c848f3", "title": "TimeSformer: Top Video Classifier"}, "2139": {"path": "/english_documents/model_zoo/recognition/timesformer.md:1-26", "hash": "a71855d03b584f47073830ec4929e5be", "title": "TimeSformer: Efficient Video Classification"}, "2140": {"path": "/english_documents/model_zoo/recognition/timesformer.md:28-57", "hash": "661f24eed9447fcde922cbe13d00acef", "title": "Train Timesformer on Kinetics-400 with 8 GPUs"}, "2141": {"path": "/english_documents/model_zoo/recognition/timesformer.md:58-72", "hash": "99d12fe6501137e4d78b9e81a6bfe34c", "title": "Training Timesformer on Multiple GPUs with AMP"}, "2142": {"path": "/english_documents/model_zoo/recognition/timesformer.md:75-90", "hash": "c1a943809c1b3e3af02c8e037a49f3c4", "title": "Optimizing TimeSformer Testing"}, "2143": {"path": "/english_documents/model_zoo/recognition/timesformer.md:93-107", "hash": "e758a1d2925b123bf639997963fd43db", "title": "Export TimeSformer Inference Model"}, "2144": {"path": "/english_documents/model_zoo/recognition/timesformer.md:108-133", "hash": "1b3bf297dc6789c8fb51e77f5ffc8f96", "title": "TimeSformer Predicts Video Class"}, "2145": {"path": "/english_documents/model_zoo/recognition/timesformer.md:133-137", "hash": "710a35447d13a1c3c7c27c6935d070e8", "title": "TimeSformer: Space-Time Attention for Video Recognition"}, "2146": {"path": "/english_documents/model_zoo/recognition/tokenshift_transformer.md", "hash": "774af1550180e994eae8f33b9ca3420d", "title": "TokenShift Transformer: Versatile Video Classifier"}, "2147": {"path": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:1-36", "hash": "a09895c4ad41066d8b3a0177c2c5d699", "title": "Token Shift Vision Transformer"}, "2148": {"path": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:36-63", "hash": "239c5a86dbf384a9720ff8539a9d627c", "title": "TokenShift Transformer: UCF-101 Training Guide"}, "2149": {"path": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:64-78", "hash": "c1d255e4ce66eba075bf0606afbc49ac", "title": "Token Shift Transformer Training on UCF101 Dataset"}, "2150": {"path": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:78-93", "hash": "eb140f62aa4563e8f0631bb59fef79f7", "title": "VisionTransformer Testing on UCF-101"}, "2151": {"path": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:93-116", "hash": "061416a8d3a2638aedeb7e1eec520f1f", "title": "TokenShift Vision Transformer Inference Guide"}, "2152": {"path": 
"/english_documents/model_zoo/recognition/tokenshift_transformer.md:117-125", "hash": "011d38a5bac31e2ca24ab07ef7eee365", "title": "Top-1 Prediction: Brushing Teeth, Confidence 0.99"}, "2153": {"path": "/english_documents/model_zoo/recognition/tsm.md", "hash": "a913de807fced367954e30ba622dcace", "title": "Training TSM: ResNet-50 PaddlePaddle AMP UCF-101 Kinetics-400"}, "2154": {"path": "/english_documents/model_zoo/recognition/tsm.md:1-33", "hash": "296d30a9d67bd770709926b168e428e4", "title": "TSM Video Understanding with ResNet-50"}, "2155": {"path": "/english_documents/model_zoo/recognition/tsm.md:35-62", "hash": "f9f3d2665e0ec59c9819d653ac7ade0e", "title": "Training TSM Model on Kinetics-400 with PaddleVideo"}, "2156": {"path": "/english_documents/model_zoo/recognition/tsm.md:64-91", "hash": "3029330bde30c2b8abbbb83645ec4805", "title": "Training TSM Model with PaddlePaddle and AMP"}, "2157": {"path": "/english_documents/model_zoo/recognition/tsm.md:91-118", "hash": "0511b357c5dd743c6f7c76697b1f9e12", "title": "TSM Model Training on UCF-101 Dataset"}, "2158": {"path": "/english_documents/model_zoo/recognition/tsm.md:118-144", "hash": "09ab259f1ef723bc39d3f350eb67a159", "title": "TSM Model Training with PaddleVideo"}, "2159": {"path": "/english_documents/model_zoo/recognition/tsm.md:145-166", "hash": "04898d8ed37b2625a6d7e8552978eed9", "title": "CUDNN Batch Normalization Testing Script"}, "2160": {"path": "/english_documents/model_zoo/recognition/tsm.md:168-181", "hash": "99a726e6ef4be2c38d41be9c60dedf3a", "title": "TSM Models with ResNet50 and Sampling Methods"}, "2161": {"path": "/english_documents/model_zoo/recognition/tsm.md:182-203", "hash": "89f479d906f7aaef36a4f104626316d3", "title": "TSM Model Inference with PaddlePaddle"}, "2162": {"path": "/english_documents/model_zoo/recognition/tsm.md:203-221", "hash": "d94763139ef322954ce7fb8f583173c2", "title": "TSM Training Strategy: Momentum, L2 Decay"}, "2163": {"path": "/english_documents/model_zoo/recognition/tsn.md", "hash": "030deaaa6c6fcf3c543cc962c35ceba2", "title": "TSN: 2D-CNN Video Classification with Sparse Sampling"}, "2164": {"path": "/english_documents/model_zoo/recognition/tsn.md:1-20", "hash": "966c7cf3958801ed9c09da25a13c92c5", "title": "Global TSN for Video Classification"}, "2165": {"path": "/english_documents/model_zoo/recognition/tsn.md:21-48", "hash": "2bc0b115d42a9ea68f15038dd598b692", "title": "Training TSN on Kinetics-400 Dataset"}, "2166": {"path": "/english_documents/model_zoo/recognition/tsn.md:49-65", "hash": "4822482170b75197d7046273da531e18", "title": "Start Training TSN Model with Kinetics-400 and 8 GPUs\"\n\"Test TSN Model in Test Mode: TenCrop vs. 
CenterCrop"}, "2167": {"path": "/english_documents/model_zoo/recognition/tsn.md:66-81", "hash": "26eb8cf9c2ebc345f56dbcce77e29e25", "title": "TSN Model Testing and Inference"}, "2168": {"path": "/english_documents/model_zoo/recognition/tsn.md:82-103", "hash": "67b27db7a0e431fca0ea56c019ed0eeb", "title": "GPU-Accelerated TSN Model for Video Recognition"}, "2169": {"path": "/english_documents/model_zoo/recognition/tsn.md:103-119", "hash": "c2900aa6bf29da53a687e82098da9266", "title": "Multi-Scale Random Cropping for Frame Enhancement"}, "2170": {"path": "/english_documents/model_zoo/recognition/tsn.md:121-123", "hash": "d3177e5f3471b96282a31067ae8f68c7", "title": "TSN Implementation in PaddleVideo"}, "2171": {"path": "/english_documents/model_zoo/recognition/tsn_dali.md", "hash": "0e972dbcb0d970636e1c4c9f743e552f", "title": "Accelerating TSN with DALI"}, "2172": {"path": "/english_documents/model_zoo/recognition/tsn_dali.md:1-45", "hash": "fcfe856e510096d5925bc23e78a27f7c", "title": "Accelerating TSN Training with DALI"}, "2173": {"path": "/english_documents/model_zoo/recognition/tsn_dali.md:45-82", "hash": "b86fcb06a5e894adf68df18cf75c14af", "title": "TSN-DALI Training with PaddleVideo"}, "2174": {"path": "/english_documents/model_zoo/recognition/tsn_dali.md:84-98", "hash": "33499eead026403ab3665e7eb101f586", "title": "TSN Action Recognition with DALI"}, "2175": {"path": "/english_documents/model_zoo/recognition/videoswin.md", "hash": "e0c3374499c81bdc6d1bb04860c95986", "title": "Swin-Transformer for Video Accuracy"}, "2176": {"path": "/english_documents/model_zoo/recognition/videoswin.md:1-33", "hash": "9c8d79e784b6bb6e5150f769768a7d79", "title": "Video-Swin Transformer Model Card"}, "2177": {"path": "/english_documents/model_zoo/recognition/videoswin.md:35-60", "hash": "9ec1400f8c0b3b3eedcf1156aa1d6cb4", "title": "Training VideoSwin on Kinetics400 with 8 GPUs"}, "2178": {"path": "/english_documents/model_zoo/recognition/videoswin.md:60-75", "hash": "1006875352c10dd70b88d58fe103053f", "title": "Faster Video-Swin-Transformer Training with Mixed Precision"}, "2179": {"path": "/english_documents/model_zoo/recognition/videoswin.md:77-89", "hash": "e67e945424e28cf30b849bf4733c5b4d", "title": "Optimized Video-Swin-Transformer Testing: UniformCrop for Accuracy"}, "2180": {"path": "/english_documents/model_zoo/recognition/videoswin.md:89-92", "hash": "9cdb2928ffc4d20f80d1b4828a63f700", "title": "Pre-Trained Swin-Transformer Checkpoints"}, "2181": {"path": "/english_documents/model_zoo/recognition/videoswin.md:94-117", "hash": "56c530057f84420b6d469207218f342d", "title": "Export and Predict in PaddleVideo"}, "2182": {"path": "/english_documents/model_zoo/recognition/videoswin.md:119-131", "hash": "0a7f9bd35cfa10f70efa6b383efe183a", "title": "VideoSwin-Transformer for Prediction"}, "2183": {"path": "/english_documents/model_zoo/segmentation/asrf.md", "hash": "59fb23e103bbd582b9c115a28262cbe5", "title": "ASRF: Enhanced Video Segmentation with PaddlePaddle"}, "2184": {"path": "/english_documents/model_zoo/segmentation/asrf.md:1-35", "hash": "c40326ed974490520de15f57722a9b4e", "title": "ASRF: Enhanced Video Segmentation Model with PaddlePaddle"}, "2185": {"path": "/english_documents/model_zoo/segmentation/asrf.md:37-55", "hash": "b31d2422ed4c073bce94eb21cf154b8c", "title": "Training ASRF, Testing MS-TCN with Pre-trained Model"}, "2186": {"path": "/english_documents/model_zoo/segmentation/asrf.md:57-80", "hash": "f87b0a18c7aad3d4f320257926c00b13", "title": "MS-TCN Accuracy and Edit Distance on Three 
Datasets"}, "2187": {"path": "/english_documents/model_zoo/segmentation/asrf.md:81-100", "hash": "a5cdb18ef31f877657d77a01c307b8b1", "title": "ASRF_gtea Model Weights & F1"}, "2188": {"path": "/english_documents/model_zoo/segmentation/asrf.md:102-131", "hash": "60ffdee8a77461b4d807fa542199d73d", "title": "ASRF Model Inference with PaddleVideo Example"}, "2189": {"path": "/english_documents/model_zoo/segmentation/asrf.md:132-139", "hash": "91aeb49638ca3209c7e87baf60dc1ce4", "title": "Write Inference Results to Separate Files"}, "2190": {"path": "/english_documents/model_zoo/segmentation/cfbi.md", "hash": "e64b08615f24a05b8eade670c5998cda", "title": "CFBI Video Object Segmentation Model"}, "2191": {"path": "/english_documents/model_zoo/segmentation/cfbi.md:1-29", "hash": "bf7858bfc9eb1db73016b8c896d83321", "title": "CFBI: Foreground-Background Collaborative Segmentation"}, "2192": {"path": "/english_documents/model_zoo/segmentation/cfbi.md:31-46", "hash": "684f546c42ec63d46a9eba48b8d60764", "title": "Training and Evaluating CFBIp Segmentation Model on DAVIS Dataset"}, "2193": {"path": "/english_documents/model_zoo/segmentation/mstcn.md", "hash": "f84601da7ebde670d4471c8806f2180d", "title": "MS-TCN Model Evaluation and Comparison"}, "2194": {"path": "/english_documents/model_zoo/segmentation/mstcn.md:1-35", "hash": "98fdc9cef6a76cdba9ae01b5a8266428", "title": "Optimized MS-TCN for Precise Video Segmentation"}, "2195": {"path": "/english_documents/model_zoo/segmentation/mstcn.md:36-52", "hash": "bb3a91801b38b50cfec7eb7064de6b6e", "title": "MSTCN Segmentation Training and Testing"}, "2196": {"path": "/english_documents/model_zoo/segmentation/mstcn.md:54-78", "hash": "d157b8a7f7ba51580227d572d82c788b", "title": "MSTCN vs Paper Model: Dataset Comparison"}, "2197": {"path": "/english_documents/model_zoo/segmentation/mstcn.md:79-108", "hash": "71314c1348e38e8b11b1299937a85a59", "title": "Export and Use MSTCN Inference Model"}, "2198": {"path": "/english_documents/model_zoo/segmentation/mstcn.md:109-130", "hash": "d94b46326705080ff380304310ba1684", "title": "Configuring MSTCN Segmentation Model"}, "2199": {"path": "/english_documents/quick_start.md", "hash": "b5befe7b83946fb59e4cac41c852a45c", "title": "PaddleVideo Quick Start Guide: Installation and Usage"}, "2200": {"path": "/english_documents/quick_start.md:1-36", "hash": "1dc43797a2fbc5af8c748295720c2f9e", "title": "Quick Start Guide: PaddleVideo Installation and Usage"}, "2201": {"path": "/english_documents/quick_start.md:38-76", "hash": "b5b2386e29a69f8cb5b648e38bfca34e", "title": "Install and Run PP-Video"}, "2202": {"path": "/english_documents/quick_start.md:78-107", "hash": "4888c122e22c92cfb3eae01787b715ba", "title": "Video Inference with PaddleVideo and PP-TSM_v2"}, "2203": {"path": "/english_documents/quick_start.md:107-122", "hash": "0e18a4945ec46fc1ccf8b538e22a1927", "title": "PaddleVideo Model Parameters"}, "2204": {"path": "/english_documents/quick_start.md:123-142", "hash": "4d0c04c6d646418d48b25d6f051ed5ac", "title": "Consistent Top-5 Classification Performance"}, "2205": {"path": "/english_documents/quick_start.md:143-157", "hash": "492ce8ef095b5c508a42386773ff5707", "title": "PaddleVideo's Action Recognition Model"}, "2206": {"path": "/english_documents/tools.md", "hash": "be983306a84628fb8bb662a45165c2c1", "title": "PaddleVideo Tools Guide"}, "2207": {"path": "/english_documents/tutorials/Action Recognition Datasets", "hash": "c2690ec45e937cabcd4f5825929413eb", "title": "Action Recognition Datasets: A Comprehensive List"}, 
"2208": {"path": "/english_documents/tutorials/Action Recognition Papers", "hash": "a9107861a65372870e4e7b6efbbc6065", "title": "Top Action Recognition Papers"}, "2209": {"path": "/english_documents/tutorials/Action Recognition Papers:1-16", "hash": "d7a994e5af0a0100a415c01bbeb3111f", "title": "Top Action Recognition Papers for AI"}, "2210": {"path": "/english_documents/tutorials/Action Recognition Papers:17-28", "hash": "0caa1d853c37c85654879f12828f9ef5", "title": "Action Recognition Papers: State-of-the-Art Models"}, "2211": {"path": "/english_documents/tutorials/Action Recognition Papers:29-29", "hash": "742c9419c1ca8d3c20323923cea9badd", "title": "Trajectory-Pooled Deep Convolutional Descriptors for Action Recognition"}, "2212": {"path": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers", "hash": "9d2d506c856530eb9a4dda9e4a3cbe2c", "title": "Spatio-Temporal Action Detection Papers (2015-2017)"}, "2213": {"path": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:1-13", "hash": "d080bd704a0cef7686b9cd293f998960", "title": "Spatio-Temporal Action Detection Papers List"}, "2214": {"path": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:14-24", "hash": "6950cf41cad1721ede14f75c4217587f", "title": "Spatio-Temporal Action Detection Papers 2015-2017"}, "2215": {"path": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:25-30", "hash": "60be8da6e8fff989bdf0a652211e9545", "title": "Spatio-Temporal Action Detection Papers Overview"}, "2216": {"path": "/english_documents/tutorials/TSM.md", "hash": "270270fcdd38094d6af5d3af783f300c", "title": "TSM: Video Understanding with Spatio-Temporal Balance"}, "2217": {"path": "/english_documents/tutorials/TSM.md:1-5", "hash": "51835d06b096f3b683bc80c38ca4285c", "title": "Introducing TSM: Efficient Video Understanding Model"}, "2218": {"path": "/english_documents/tutorials/TSM.md:6-10", "hash": "31b26b2b585eba92f19e9ea4ac047670", "title": "Efficient Video Understanding with Temporal Shift Module"}, "2219": {"path": "/english_documents/tutorials/TSM.md:11-21", "hash": "4f0186e13d840285ed2fa99d4105e881", "title": "TSM: Balancing 2D and 3D CNNs for Video Understanding"}, "2220": {"path": "/english_documents/tutorials/TSM.md:22-27", "hash": "f5054bbdbea1969e294beecd6c0e534e", "title": "TSM: 2D Conv for Spatial-Temporal Info"}, "2221": {"path": "/english_documents/tutorials/TSM.md:28-40", "hash": "50b60128c0be4efad7e3e2d68a890818", "title": "TSM: Bi-Direction, UNI-Direction, and Residual Variants"}, "2222": {"path": "/english_documents/tutorials/TSM.md:40-58", "hash": "eeccdf4e03531c8cb05d957e4d9e715f", "title": "Accelerating TSM Implementation in PaddlePaddle"}, "2223": {"path": "/english_documents/tutorials/TSM.md:60-73", "hash": "4b474181aa1f9195f64f844f5aec2a86", "title": "Implementing TSM in PaddlePaddle"}, "2224": {"path": "/english_documents/tutorials/Temporal Action Detection Papers", "hash": "29dc2144b510165e398eaf802c6d0928", "title": "Temporal Action Detection Papers"}, "2225": {"path": "/english_documents/tutorials/Temporal Action Detection Papers:1-12", "hash": "66536be3f26a66ac5385e7cc8511b788", "title": "Temporal Action Detection Papers: A Comprehensive List"}, "2226": {"path": "/english_documents/tutorials/Temporal Action Detection Papers:12-21", "hash": "0bca8d325a651af53e21a25c84c07537", "title": "Temporal Action Detection Papers Collection"}, "2227": {"path": "/english_documents/tutorials/Temporal Action Detection Papers:22-24", "hash": 
"16c1d297e27589bfdd02797104567933", "title": "Temporal Action Detection Papers: A Comprehensive Guide"}, "2228": {"path": "/english_documents/tutorials/accelerate.md", "hash": "5d860066d7076df5ae274abeab098bbf", "title": "Dual-Language Tutorial: Accelerate"}, "2229": {"path": "/english_documents/tutorials/config.md", "hash": "63184f63d3ca9ac4f25e8f295334aaac", "title": "Dependency Injection with PaddleVideo: Config-based Modularity"}, "2230": {"path": "/english_documents/tutorials/config.md:1-37", "hash": "bd9bb7657d3143cf6f58eeddf8692f87", "title": "IOC/DI for Modular PaddleVideo"}, "2231": {"path": "/english_documents/tutorials/config.md:39-89", "hash": "eff0aa359458584b92b69cebfe92c232", "title": "DI with Register and Builder: Module Mapping Tutorial"}, "2232": {"path": "/english_documents/tutorials/config.md:90-117", "hash": "47bb970366a7b7abd64293e1485b3254", "title": "Dependency Injection via Config-Driven Class Instantiation"}, "2233": {"path": "/english_documents/tutorials/config.md:118-131", "hash": "0b2fe065958b1c42c17d349fb40ffecd", "title": "Command-Line Arguments for Training Script"}, "2234": {"path": "/english_documents/tutorials/customized_usage.md", "hash": "c5d315001b04656e30c63ff90457c493", "title": "Customizing PaddleVideo Framework Tutorial"}, "2235": {"path": "/english_documents/tutorials/demos", "hash": "20cade6441a851551ca8536565bb8565", "title": "Multi-Task Action Recognition Demo"}, "2236": {"path": "/english_documents/tutorials/deployment.md", "hash": "a479eab2428b0ba55a35573c285a99b1", "title": "Converting Dygraph Models to Static for Deployment"}, "2237": {"path": "/english_documents/tutorials/deployment.md:1-24", "hash": "8412d694b66259fd31196d557e4c13b8", "title": "Dynamic to Static: Deploying Dygraph Models with PaddleVideo"}, "2238": {"path": "/english_documents/tutorials/deployment.md:24-48", "hash": "d5c9c46b119ddc443e5ba20ade2e0583", "title": "PaddleInference Video Inference Testing"}, "2239": {"path": "/english_documents/tutorials/modular_design.md", "hash": "586998d06534985ebc571ef150cbea04", "title": "Bilingual Modular Design Tutorial"}, "2240": {"path": "/english_documents/tutorials/pp-tsm.md", "hash": "d006d97f564de0acc37776727531cc78", "title": "Introducing PP-TSM: Enhanced Video Recognition Model"}, "2241": {"path": "/english_documents/tutorials/pp-tsm.md:1-22", "hash": "b72487a4dfd7eb0f513d590b4c94fca5", "title": "High-Performance Video Recognition with PP-TSM"}, "2242": {"path": "/english_documents/tutorials/pp-tsm.md:23-32", "hash": "ceadc0f168236c4d668fe6f0b4fcc0e6", "title": "Optimizing Model Performance with Enhanced Techniques"}, "2243": {"path": "/english_documents/tutorials/summarize.md", "hash": "12cba0e3c021be561d1b8e88b42e0ef2", "title": "Video Action Recognition: Deep Learning Techniques"}, "2244": {"path": "/english_documents/tutorials/summarize.md:1-18", "hash": "080e3819b483b9f5cc84eb53b2a79305", "title": "Action Recognition: Applications and Classification in Multiple Fields"}, "2245": {"path": "/english_documents/tutorials/summarize.md:18-32", "hash": "cb4151a8807d317eef4e3a4c940256d7", "title": "Multi-modal Video Classification Tasks"}, "2246": {"path": "/english_documents/tutorials/summarize.md:33-49", "hash": "e464f82299daf8014dd049812d3b6958", "title": "Temporal Action Classification and Dense Captioning"}, "2247": {"path": "/english_documents/tutorials/summarize.md:51-72", "hash": "85d04c3678afaac4922b18776449adbe", "title": "Popular Video Action Datasets: Overview and Challenges"}, "2248": {"path": 
"/english_documents/tutorials/summarize.md:73-87", "hash": "074d1f8851cd177714876324016c18e2", "title": "HMDB51 vs Kinetics: Action Recognition Datasets"}, "2249": {"path": "/english_documents/tutorials/summarize.md:88-104", "hash": "19bdeaf19c10080ea5ddd7f418e65697", "title": "Kinetics: Action Recognition Benchmark"}, "2250": {"path": "/english_documents/tutorials/summarize.md:105-121", "hash": "a5aff108bd10cf82fef1e818add334c1", "title": "Dataset Comparison: Mexaction2 and ActivityNet"}, "2251": {"path": "/english_documents/tutorials/summarize.md:122-138", "hash": "a97c5a724d5afc62f8a45c2a0f842a9b", "title": "Action Recognition with Manual and Deep Learning Methods"}, "2252": {"path": "/english_documents/tutorials/summarize.md:139-154", "hash": "1051b7eea9c3ec46dde34373d20dd00c", "title": "Deep Learning in Video Classification"}, "2253": {"path": "/english_documents/tutorials/summarize.md:156-173", "hash": "60c4f2bc91fce07f394721ed684b63d0", "title": "ActivityNet Competition: Large-Scale Action Recognition from YouTube Videos"}, "2254": {"path": "/english_documents/tutorials/summarize.md:174-193", "hash": "8af2df3589acfdd946eb12146d18b62f", "title": "Action Recognition and Video Classification References"}, "2255": {"path": "/english_documents/tutorials/summarize.md:194-206", "hash": "89f1ad6e1fda5cb68de69c05fb750126", "title": "Video Recognition Paper References List"}, "2256": {"path": "/english_documents/usage.md", "hash": "4286d2d8e49dde7eefefcc9afa1db745", "title": "Efficient PaddleVideo Training on Linux"}, "2257": {"path": "/english_documents/usage.md:1-28", "hash": "495da780efaf293100ba00eb8c825914", "title": "Setting Up PaddleVideo Environment"}, "2258": {"path": "/english_documents/usage.md:29-71", "hash": "bba9f4a37d2723f9ccd49a143395ad18", "title": "Train and Test Models with PaddlePaddle"}, "2259": {"path": "/english_documents/usage.md:71-89", "hash": "1acb0f0ab7deebbf660e7f0a306f5e3e", "title": "Training and Validation Log Format"}, "2260": {"path": "/english_documents/usage.md:91-132", "hash": "8982845dfa72b59a45fa30f414c56ab5", "title": "PaddleVideo: Resume, Finetune, Test Usage"}, "2261": {"path": "/english_documents/usage.md:134-174", "hash": "e04f5a5c51e0b473412e2c4f0e0ece44", "title": "Distributed PaddleVideo Testing and Inference"}, "2262": {"path": "/english_documents/usage.md:175-177", "hash": "d621f8496d3ef81c989ae7ff941babc9", "title": "Enabling/Disabling GPU in PaddleVideo"}, "2263": {"path": "/main.py", "hash": "44c28736406f18f314f6b1327ea13397", "title": "Distributed PaddleVideo Training"}, "2264": {"path": "/main.py:1-29", "hash": "606fc3acbfb34c8311513b679b9f9284", "title": "Train PaddleVideo Model with Argparse"}, "2265": {"path": "/main.py:30-52", "hash": "49a2808e3666c7990f90b6d0efbbeedf", "title": "Command Line Arguments for PaddleVideo"}, "2266": {"path": "/main.py:53-84", "hash": "da7ed2286aa3ddef875f55cbbce53ec3", "title": "Command-Line AMP Training Customization"}, "2267": {"path": "/main.py:87-118", "hash": "6514b1c513a1430a6dc485d317b0e4c9", "title": "Configure and Parse Arguments for Paddle's Main Function"}, "2268": {"path": "/main.py:120-141", "hash": "f303080180843bd489461b0d08b8fada", "title": "Command-Line Driven Model Training"}, "2269": {"path": "/paddlevideo/__init__.py", "hash": "ffb301b1e71c7ebff0b2564fb855f2ef", "title": "Initializing PaddleVideo"}, "2270": {"path": "/paddlevideo/loader/__init__.py", "hash": "12531d48e8da8904d3b6ab9510fa7eb3", "title": "PaddleVideo Dataset and Loader"}, "2271": {"path": "/paddlevideo/loader/builder.py", 
"hash": "b20086d1b53e4f19f1f5054ede75f88c", "title": "Distributed PaddleVideo Data Loader"}, "2272": {"path": "/paddlevideo/loader/builder.py:1-29", "hash": "5bd09ab778a17fc504594d31de57c0bf", "title": "PaddleVideo Pipeline Builder"}, "2273": {"path": "/paddlevideo/loader/builder.py:30-74", "hash": "9120b45bc11570b1efff48287b286a7a", "title": "Paddle Video Data Loader Builder"}, "2274": {"path": "/paddlevideo/loader/builder.py:75-96", "hash": "06f3674b9e3ba7bed247b0a9ea286a8b", "title": "Data Sampler for ML/DL Models"}, "2275": {"path": "/paddlevideo/loader/builder.py:97-132", "hash": "4da799b0633246eefc0b1cb88165327a", "title": "Variable-Length Batch Data Loader"}, "2276": {"path": "/paddlevideo/loader/dali_loader.py", "hash": "b1e4eb41722afe3a5073d91eba395762", "title": "Dali Loader: Video Processing with PaddleOps"}, "2277": {"path": "/paddlevideo/loader/dali_loader.py:1-32", "hash": "08c0dee99a4d5bd2f3fa02ea52234994", "title": "Dali Loader: Importing PaddlePaddle Iterator"}, "2278": {"path": "/paddlevideo/loader/dali_loader.py:35-65", "hash": "9a8bc59bfa2acd08c5fc74b15f562d67", "title": "Dali Loader: Class Initialization and DALI Reader Building"}, "2279": {"path": "/paddlevideo/loader/dali_loader.py:66-88", "hash": "3a83c296ba95be3596ff916fcb9a951b", "title": "Sharding Data Distribution Code Snippet"}, "2280": {"path": "/paddlevideo/loader/dali_loader.py:89-111", "hash": "8e1b65e2901bc994a136a63185138de0", "title": "Dali Loader: Parallel Video Preprocessing"}, "2281": {"path": "/paddlevideo/loader/dali_loader.py:112-142", "hash": "53c3a174b65e4d95f97c5ed11cd58207", "title": "DALI Video Iterator for PaddleVideo"}, "2282": {"path": "/paddlevideo/loader/dali_loader.py:143-160", "hash": "0583e56be573b628a35e1d5a45da6257", "title": "Dali Video Loader Initialization"}, "2283": {"path": "/paddlevideo/loader/dali_loader.py:161-176", "hash": "e929b3f3c25fe15d658699754996b4d0", "title": "DALI Loader for Image Processing"}, "2284": {"path": "/paddlevideo/loader/dali_loader.py:177-202", "hash": "b0dcc06ff159f61979c8de493704e08a", "title": "DALI Loader Operations"}, "2285": {"path": "/paddlevideo/loader/dali_loader.py:203-206", "hash": "0dbd31a933121e1ab0419f7f2ca00a4e", "title": "Loader Methods and Length Determination"}, "2286": {"path": "/paddlevideo/loader/dataset/MRI.py", "hash": "e2d9d55781c2b85fd6c4e763faea6cab", "title": "MRI Dataset Loader in PaddleVideo"}, "2287": {"path": "/paddlevideo/loader/dataset/MRI.py:1-31", "hash": "9574f6b0a8dfa7a9cc481edb686a892a", "title": "PaddleVideo: MRI Dataset Loader"}, "2288": {"path": "/paddlevideo/loader/dataset/MRI.py:31-61", "hash": "dd88fa69bd109c05919349384425844d", "title": "MRI Dataset Initialization"}, "2289": {"path": "/paddlevideo/loader/dataset/MRI.py:62-86", "hash": "30dc526faf4663e2f3c2561bcb9fbeb2", "title": "Loader: Handling Missing MRI Files"}, "2290": {"path": "/paddlevideo/loader/dataset/MRI.py:87-108", "hash": "d122b4129b5daf581be8daf4ca647b76", "title": "Retry Loading Frames: Exception Handling in MRI Dataset"}, "2291": {"path": "/paddlevideo/loader/dataset/MRI.py:109-109", "hash": "4622165fe52dcfbe2fd300bd2404f23f", "title": "MRI Dataset Loader: Numpy Arrays from Results"}, "2292": {"path": "/paddlevideo/loader/dataset/MRI_SlowFast.py", "hash": "0a519f4036c572e0ccefd8ca968e50c6", "title": "MRI SlowFast Dataset Loader"}, "2293": {"path": "/paddlevideo/loader/dataset/MRI_SlowFast.py:1-31", "hash": "f24cb32526c703363c74614385e926c6", "title": "MRI_SlowFast Dataset Loader"}, "2294": {"path": 
"/paddlevideo/loader/dataset/MRI_SlowFast.py:31-61", "hash": "fb1800c893ea60af49ff9c6a8140fa75", "title": "MRI Dataset Loader Class"}, "2295": {"path": "/paddlevideo/loader/dataset/MRI_SlowFast.py:62-86", "hash": "99f09c08c6fb9c1603fac738fc9b7b4c", "title": "Paddle Video: MRI Dataset Loading"}, "2296": {"path": "/paddlevideo/loader/dataset/MRI_SlowFast.py:87-108", "hash": "3daa26aa34997462c42a47ea0ef2f7fc", "title": "Retry Loader with Error Logging"}, "2297": {"path": "/paddlevideo/loader/dataset/MRI_SlowFast.py:109-111", "hash": "70763cda1b5486cb6cfb6ac3ad13fe67", "title": "MRI Dataset Loader"}, "2298": {"path": "/paddlevideo/loader/dataset/__init__.py", "hash": "c867b8d28a633d6c145c47b36f2553f1", "title": "PaddleVideo Datasets: Load and Understand"}, "2299": {"path": "/paddlevideo/loader/dataset/__init__.py:1-25", "hash": "26d6e1ceca3c3b1efd935d8471e7111a", "title": "PaddleVideo Dataset Importer"}, "2300": {"path": "/paddlevideo/loader/dataset/__init__.py:26-41", "hash": "5c03595c46ad8b68744f40ae42019bc8", "title": "Importing and Exporting Datasets"}, "2301": {"path": "/paddlevideo/loader/dataset/actbert_dataset.py", "hash": "e25cfa618651a0cc12e56af47e57fd8a", "title": "ActBERT Dataset Setup in PaddlePaddle"}, "2302": {"path": "/paddlevideo/loader/dataset/actbert_dataset.py:1-31", "hash": "00f873ddbe30353443da94f1c0190948", "title": "Setting Up ActBERT Dataset in PaddleVideo"}, "2303": {"path": "/paddlevideo/loader/dataset/actbert_dataset.py:32-66", "hash": "8d53b80c0eff6f65d483179fa926f589", "title": "Class ActBertDataset Loader"}, "2304": {"path": "/paddlevideo/loader/dataset/actbert_dataset.py:67-74", "hash": "a654df98d44cc9de34ff82b941b84b66", "title": "ActBERT Dataset Preparation"}, "2305": {"path": "/paddlevideo/loader/dataset/asrf_dataset.py", "hash": "83fd932b91e8640ad85024e07d9775b6", "title": "ASRF Dataset Loader"}, "2306": {"path": "/paddlevideo/loader/dataset/asrf_dataset.py:1-38", "hash": "e81f050bf59e4cfaa84162379a472324", "title": "ASRF Dataset: Action Segmentation Videos"}, "2307": {"path": "/paddlevideo/loader/dataset/asrf_dataset.py:39-68", "hash": "572e9a05548382c1d5c2bb636213ad25", "title": "ASRF Dataset Loader"}, "2308": {"path": "/paddlevideo/loader/dataset/asrf_dataset.py:69-92", "hash": "30ba0f80d7bbd5fed3ed210c4ed1f566", "title": "PaddleVideo Dataset Loader"}, "2309": {"path": "/paddlevideo/loader/dataset/asrf_dataset.py:94-104", "hash": "0a091d9cf1c707e21e40bdd31be44d0a", "title": "Boundary Data Loading and Processing"}, "2310": {"path": "/paddlevideo/loader/dataset/ava_dataset.py", "hash": "a3adc13af624ffd0f2f373603b3a331e", "title": "AVA Dataset Class in PaddleVideo"}, "2311": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:1-32", "hash": "fd5230ab257ec029514dd98c3b960d43", "title": "AVA Dataset Class in PaddleVideo"}, "2312": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:33-62", "hash": "97338e794f26b21180f8d2ddc86fe76b", "title": "AVA Dataset Initialization and Validation"}, "2313": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:63-93", "hash": "9e27aab0275590477e55a05f52408bce", "title": "AVA Dataset Initialization and Validation"}, "2314": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:94-122", "hash": "1cf7c15a9d21bed30975239d408676f9", "title": "Excluding Mismatched Entity Boxes"}, "2315": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:123-148", "hash": "852b86c06a488b930867cf23c74c2e52", "title": "AVA Dataset Loader"}, "2316": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:149-170", "hash": 
"ceb0429097b71fcdea8dddd8bb6833f6", "title": "AVA Dataset Video Processing"}, "2317": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:171-197", "hash": "f3f5b3d51140634b7f3a72df78ddf7c0", "title": "Initialize and Append Video Information"}, "2318": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:198-221", "hash": "137e37ac27d8aa2a227fd34d18a03bf7", "title": "Filtering and Padding AVA Dataset Proposals"}, "2319": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:222-240", "hash": "339a6a4f2f32f8439ef4eb58bfc64bb8", "title": "Feature Padding in Ava Dataset"}, "2320": {"path": "/paddlevideo/loader/dataset/ava_dataset.py:241-249", "hash": "549ff783af458b1ad991fab23d46ba4c", "title": "AVA Dataset Preparation and Evaluation"}, "2321": {"path": "/paddlevideo/loader/dataset/base.py", "hash": "39ad39c8b39a78966e690354999d1a8c", "title": "BaseDataset: Loading and Preparing PaddlePaddle Data"}, "2322": {"path": "/paddlevideo/loader/dataset/base.py:1-32", "hash": "61efd3ddf628049c96c4f2f020ae1817", "title": "BaseDataset: Python Class for PaddlePaddle Datasets"}, "2323": {"path": "/paddlevideo/loader/dataset/base.py:34-59", "hash": "20e09235bd284e6489db569e9eb6f254", "title": "Initializing Base Dataset Class"}, "2324": {"path": "/paddlevideo/loader/dataset/base.py:60-80", "hash": "2e6a4f416fb784c91e75ffac4e53f42e", "title": "Dataset Class for Paddle.io Video Loading"}, "2325": {"path": "/paddlevideo/loader/dataset/bmn_dataset.py", "hash": "c1bec29bf2ffe9c9d29510ac93a214a2", "title": "BMN Dataset Loader"}, "2326": {"path": "/paddlevideo/loader/dataset/bmn_dataset.py:1-36", "hash": "36a888afd139303e464a26379e0c57be", "title": "BMNDataset: Action Localization Videos"}, "2327": {"path": "/paddlevideo/loader/dataset/bmn_dataset.py:38-64", "hash": "98e62e11b6250e47dbfd1721a3e954e1", "title": "Video Index Loading and Sorting in BMN Dataset"}, "2328": {"path": "/paddlevideo/loader/dataset/bmn_dataset.py:65-72", "hash": "3c8d42adda7aa170bfc31a539e7f7534", "title": "Prepare Test Data with BMN Dataset"}, "2329": {"path": "/paddlevideo/loader/dataset/davis_dataset.py", "hash": "30416a71f8fe6561eb67e49ae2c68992", "title": "Davis Dataset for Video Segmentation"}, "2330": {"path": "/paddlevideo/loader/dataset/davis_dataset.py:1-37", "hash": "9f3d53d8b3ca5fa812e540706c336043", "title": "PaddleVideo's VOS Dataset Processing"}, "2331": {"path": "/paddlevideo/loader/dataset/davis_dataset.py:38-66", "hash": "4032590b84cad76572075e7e170712b4", "title": "Davis Dataset Initialization"}, "2332": {"path": "/paddlevideo/loader/dataset/davis_dataset.py:67-94", "hash": "4c8d5e85d065162d65efefb7680856c1", "title": "DAVIS Dataset Image Loader"}, "2333": {"path": "/paddlevideo/loader/dataset/davis_dataset.py:96-127", "hash": "a8b4107865ddec41e41daecc78dcca69", "title": "Dataset Sample Generation"}, "2334": {"path": "/paddlevideo/loader/dataset/davis_dataset.py:128-158", "hash": "e451f49ecc04265a41ca8251ae668139", "title": "Davis 2017 Dataset Initialization"}, "2335": {"path": "/paddlevideo/loader/dataset/davis_dataset.py:159-182", "hash": "a14ff96351e9d79706094f2007f5b1eb", "title": "VOS Test Dataset Preparation"}, "2336": {"path": "/paddlevideo/loader/dataset/davis_dataset.py:183-189", "hash": "a0b650c2d1275d396b79b66980619a85", "title": "Dataset Loading in PaddleVideo"}, "2337": {"path": "/paddlevideo/loader/dataset/feature.py", "hash": "94f71b85842b6af98ff57a7d8f24185b", "title": "FeatureDataset: PaddleVideo's Action Recognition Tool"}, "2338": {"path": "/paddlevideo/loader/dataset/feature.py:1-36", "hash": 
"11fe5e8bcd62cb4e1df972437ee53344", "title": "FeatureDataset: Action Recognition in PaddleVideo"}, "2339": {"path": "/paddlevideo/loader/dataset/feature.py:38-63", "hash": "923fab937361e43de03a0bcde5d486ce", "title": "Video Dataset Loader Methods"}, "2340": {"path": "/paddlevideo/loader/dataset/feature.py:64-80", "hash": "72c6bbec091c3933cce874ca2b604c3f", "title": "Preparing Test Data with Pipeline"}, "2341": {"path": "/paddlevideo/loader/dataset/frame.py", "hash": "46ce97ecc09d15b4847dda60a0b07b6a", "title": "PaddleVideo: Efficient Video Datasets"}, "2342": {"path": "/paddlevideo/loader/dataset/frame.py:1-31", "hash": "2e86c972a65af70952161bf29357d5b6", "title": "Frame Dataset Class in PaddleVideo"}, "2343": {"path": "/paddlevideo/loader/dataset/frame.py:31-61", "hash": "b58640212da7c3b8a7ee93ef52077d02", "title": "Video Index Loader Class"}, "2344": {"path": "/paddlevideo/loader/dataset/frame.py:62-86", "hash": "f71634b80bcb834aaea41580a4fd62f2", "title": "Frame Data Reader with Exception Handling"}, "2345": {"path": "/paddlevideo/loader/dataset/frame.py:87-108", "hash": "a99a5e2043f4198cb190c60bbc8e9790", "title": "Exception Handling for Loading Frames"}, "2346": {"path": "/paddlevideo/loader/dataset/frame.py:111-136", "hash": "33fe7236b34ed2e51c5d77ca8a7cc0b5", "title": "FrameDataset for Sports Videos"}, "2347": {"path": "/paddlevideo/loader/dataset/frame.py:137-158", "hash": "47bf4345faeb4e85ebcf19df8089699b", "title": "Frame Directory Data Processing"}, "2348": {"path": "/paddlevideo/loader/dataset/frame.py:159-177", "hash": "c084dba0e1af602b61693c9e98b02345", "title": "Retry Corrupted Video Data Preparation"}, "2349": {"path": "/paddlevideo/loader/dataset/ms_tcn_dataset.py", "hash": "859e25ba5373595c10f89bafee143957", "title": "MS-TCN Dataset Loader"}, "2350": {"path": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:1-38", "hash": "2f7f6c6af539436a5b04dee5f4a626df", "title": "MS-TCN Dataset Registration"}, "2351": {"path": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:39-68", "hash": "c31dee3baf78e3d4e59aac5ee2a6fdeb", "title": "MS-Tcn Dataset Initialization"}, "2352": {"path": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:69-95", "hash": "7315cce1a9a6fc1105b0f4e8fddac6ee", "title": "Video Feature and Label Dataset Loader"}, "2353": {"path": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:97-110", "hash": "efc502005b1116b23bce91112a4eb4c6", "title": "Video Dataset Label Loading Function"}, "2354": {"path": "/paddlevideo/loader/dataset/msrvtt.py", "hash": "9b1972e6ea256145cf949c47f8a0feb9", "title": "MSRVTT Dataset Preparation"}, "2355": {"path": "/paddlevideo/loader/dataset/msrvtt.py:1-31", "hash": "f253d07dabf5fa1bf5cffd078a5ba75d", "title": "Python Script: LMDB & PaddleLNPTok"}, "2356": {"path": "/paddlevideo/loader/dataset/msrvtt.py:32-67", "hash": "b2cb8f0b58ad15bc93c6b6da6ad02902", "title": "MSR-VTT Dataset Loader"}, "2357": {"path": "/paddlevideo/loader/dataset/msrvtt.py:68-93", "hash": "dfe6c17146aeb3860b5cdc53c5452eb4", "title": "Video Caption Tokenization with BertTokenizer"}, "2358": {"path": "/paddlevideo/loader/dataset/msrvtt.py:94-120", "hash": "7e9e5a63923dae146c5a7bacac768806", "title": "Video Data Processing and Loader"}, "2359": {"path": "/paddlevideo/loader/dataset/msrvtt.py:121-142", "hash": "0e7cf8deb980d74c2e67a03892731784", "title": "Image Box Resizing and Feature Concatenation"}, "2360": {"path": "/paddlevideo/loader/dataset/msrvtt.py:143-163", "hash": "8d7a26a8092e69f637bb582f7249e018", "title": "MSRVTT Dataset Loading and Feature Extraction"}, "2361": 
{"path": "/paddlevideo/loader/dataset/msrvtt.py:165-187", "hash": "15d5504809fc2382ef736d77bbda68ce", "title": "Padding and Conversion for Dataset"}, "2362": {"path": "/paddlevideo/loader/dataset/msrvtt.py:188-220", "hash": "2199d9239198064597257c3eb6c12618", "title": "Data Preparation in MSR-VTT Dataset"}, "2363": {"path": "/paddlevideo/loader/dataset/oxford.py", "hash": "43c790407dfc0d8e648cdff21955dd73", "title": "MonoDataset: PaddleVideo Oxford Dataset"}, "2364": {"path": "/paddlevideo/loader/dataset/oxford.py:1-37", "hash": "571d2cc2812df583ce321c65cfdcbb95", "title": "Creating MonoDataset for PaddleVideo"}, "2365": {"path": "/paddlevideo/loader/dataset/oxford.py:39-62", "hash": "6dfda28527b22fa26333e97240a24d6f", "title": "Oxford Dataset Loader"}, "2366": {"path": "/paddlevideo/loader/dataset/skeleton.py", "hash": "7af4cd74f60b5a2d874c3a8235f1327c", "title": "SkeletonDataset: Action Recognition Loader"}, "2367": {"path": "/paddlevideo/loader/dataset/skeleton.py:1-34", "hash": "6ab0aa395f41b2c6a9b5516b7fc14155", "title": "Skeleton Dataset: Action Recognition Loader"}, "2368": {"path": "/paddlevideo/loader/dataset/skeleton.py:35-55", "hash": "38544bbf9e24b971d3c7ff20020a0fa8", "title": "Skeleton Data Loader Class"}, "2369": {"path": "/paddlevideo/loader/dataset/skeleton.py:56-78", "hash": "b7ccecd3815d14e14637e67f340d2cfe", "title": "Skeleton DataLoader"}, "2370": {"path": "/paddlevideo/loader/dataset/slowfast_video.py", "hash": "cf3232c0c91a7234e000882913e17500", "title": "SF Video Dataset: PaddleVideo's Action Recognition"}, "2371": {"path": "/paddlevideo/loader/dataset/slowfast_video.py:1-31", "hash": "ba1e3266e04db515d50b62c50343bed0", "title": "SlowFast Video Dataset"}, "2372": {"path": "/paddlevideo/loader/dataset/slowfast_video.py:32-64", "hash": "7927952a5c093d4fa3aaf0a308814773", "title": "SlowFast Video Dataset"}, "2373": {"path": "/paddlevideo/loader/dataset/slowfast_video.py:65-87", "hash": "6b3c9799602ab4047ae07125918c7174", "title": "Random Seed and Index Loading"}, "2374": {"path": "/paddlevideo/loader/dataset/slowfast_video.py:88-112", "hash": "8c48ab964b59af1e7bfca52eca181414", "title": "Resilient Video Dataset Processing"}, "2375": {"path": "/paddlevideo/loader/dataset/slowfast_video.py:113-137", "hash": "ef42aa2939a20ea5da10575ba2d68ead", "title": "Retry and Logging Video Loader"}, "2376": {"path": "/paddlevideo/loader/dataset/slowfast_video.py:138-143", "hash": "8c259d9d49aff1cb34fe4b1f7746ff3d", "title": "Size of Dataset Calculator"}, "2377": {"path": "/paddlevideo/loader/dataset/ucf101_skeleton.py", "hash": "2b107e035abb4b7929cf3ef2a93e8cc1", "title": "UCF101 Skeleton Dataset PaddleVideo Loader"}, "2378": {"path": "/paddlevideo/loader/dataset/ucf101_skeleton.py:1-35", "hash": "98faf73c53c8f3e67f5593d77fabffef", "title": "UCF101 Skeleton Dataset Loader"}, "2379": {"path": "/paddlevideo/loader/dataset/ucf101_skeleton.py:36-66", "hash": "aa314b9a1fb4c9bbbdb105a75910c553", "title": "UCF101 Skeleton Annotation Loader"}, "2380": {"path": "/paddlevideo/loader/dataset/ucf101_skeleton.py:67-89", "hash": "9840cf1c2744fbf36e2125305efdd6ad", "title": "UCf101 Skeleton Dataset Preparation"}, "2381": {"path": "/paddlevideo/loader/dataset/ucf24_dataset.py", "hash": "396cbcb1e55c7a39a3c1794cac34ef6c", "title": "Ucf24Dataset Class for PaddleVideo"}, "2382": {"path": "/paddlevideo/loader/dataset/ucf24_dataset.py:1-30", "hash": "b33ece26e28f7bbe768c7e35b193afee", "title": "UCF24 Dataset Python Class"}, "2383": {"path": "/paddlevideo/loader/dataset/ucf24_dataset.py:31-59", "hash": 
"c7bb89a470032d476c2cd93a5e583e05", "title": "Ucf24Dataset: Video Data Loader"}, "2384": {"path": "/paddlevideo/loader/dataset/ucf24_dataset.py:60-76", "hash": "340143c019c5e7f7074182754eb376ef", "title": "UCF24 Dataset Preparation and Conversion"}, "2385": {"path": "/paddlevideo/loader/dataset/video.py", "hash": "afd63647d4fa7f3a725e388c2ede37b7", "title": "Video Dataset Loader"}, "2386": {"path": "/paddlevideo/loader/dataset/video.py:1-31", "hash": "55e40944fb9a83b272c3781434d90be0", "title": "VideoDataset: Loading and Transforming Raw Videos"}, "2387": {"path": "/paddlevideo/loader/dataset/video.py:32-57", "hash": "5291e4be761c70cb32cea2ad34ebbe95", "title": "Video Loader Class Initialization"}, "2388": {"path": "/paddlevideo/loader/dataset/video.py:58-80", "hash": "2914c458f2e0192a43d3902f39b38d33", "title": "Video Dataset Preparer"}, "2389": {"path": "/paddlevideo/loader/dataset/video.py:81-95", "hash": "d6f1a55e44be37ad2e7f262bec9e12b8", "title": "Robust Video Loading and Testing"}, "2390": {"path": "/paddlevideo/loader/pipelines/__init__.py", "hash": "c80a5807780d3dc5e01c5e4421fef368", "title": "Video Processing Pipelines"}, "2391": {"path": "/paddlevideo/loader/pipelines/__init__.py:1-20", "hash": "c79221f81d0429cc1e680a5ac376ac87", "title": "Pipeline Initialization in PaddleVideo"}, "2392": {"path": "/paddlevideo/loader/pipelines/__init__.py:21-38", "hash": "a7cf4082ac287c0fe68a11c9ddb0f04c", "title": "PaddleVideo Pipelines"}, "2393": {"path": "/paddlevideo/loader/pipelines/__init__.py:39-55", "hash": "1b8fc3f3f6a2853751c54292729d847f", "title": "Customizable PaddleVideo Pipelines"}, "2394": {"path": "/paddlevideo/loader/pipelines/__init__.py:56-56", "hash": "d9c239db287380391a4b887498fd023e", "title": "Empty Code Alert"}, "2395": {"path": "/paddlevideo/loader/pipelines/anet_pipeline.py", "hash": "6eafb8d603d1e9d1f6d526a99d32e957", "title": "PaddleVideo: IoU-based Feature Extraction"}, "2396": {"path": "/paddlevideo/loader/pipelines/anet_pipeline.py:1-32", "hash": "d2590645397863832521bae7f31d2d01", "title": "PaddleVideo Feature Data Loader"}, "2397": {"path": "/paddlevideo/loader/pipelines/anet_pipeline.py:33-62", "hash": "6580dd17e6dca23452dc27b777f4e576", "title": "Temporal Matching Windows Generator"}, "2398": {"path": "/paddlevideo/loader/pipelines/anet_pipeline.py:63-90", "hash": "54f9f2259969e68254abb74f70c89e8f", "title": "Anchors Intersection Calculator"}, "2399": {"path": "/paddlevideo/loader/pipelines/anet_pipeline.py:91-115", "hash": "794e498e1487a1a8892aa2dff955905e", "title": "Ground Truth Initialization in Video Pipeline"}, "2400": {"path": "/paddlevideo/loader/pipelines/anet_pipeline.py:116-140", "hash": "d0bd33bce6ee4e086a64420b43f6ef1c", "title": "Intersection Over Union Calculation for Anchor Boxes"}, "2401": {"path": "/paddlevideo/loader/pipelines/anet_pipeline.py:141-150", "hash": "ce01b5fb034f28ff2896bfe2a053431b", "title": "Annotating IOU Maps"}, "2402": {"path": "/paddlevideo/loader/pipelines/augmentations.py", "hash": "e7bfe2e6bb3e7939723463bdfcc5e3b5", "title": "Enhanced PaddleVideo Loader with Augmentation"}, "2403": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1-34", "hash": "548268937c767bfc2e40c17b99bed85e", "title": "Scaling Images with PaddleVideo Loader"}, "2404": {"path": "/paddlevideo/loader/pipelines/augmentations.py:35-61", "hash": "a707733c84ea8e6eca5ee3392a604598", "title": "Resize Image Class"}, "2405": {"path": "/paddlevideo/loader/pipelines/augmentations.py:62-88", "hash": "2994b8870e4a34c1515769aff2547ccd", "title": "Resizing 
Images in PaddleVideo Pipeline"}, "2406": {"path": "/paddlevideo/loader/pipelines/augmentations.py:89-108", "hash": "0066180b4d0b468132251e4423eb6251", "title": "Resizing Image with Aspect Ratio Preservation"}, "2407": {"path": "/paddlevideo/loader/pipelines/augmentations.py:109-138", "hash": "c08fdeed1adee6790ed7e77ce152003c", "title": "Image Augmentation Pipeline Defined"}, "2408": {"path": "/paddlevideo/loader/pipelines/augmentations.py:139-164", "hash": "435fef1c324c9de096762eff45b25075", "title": "Random Crop Augmentation in PaddleVideo"}, "2409": {"path": "/paddlevideo/loader/pipelines/augmentations.py:165-197", "hash": "3efce07732a6c7fac9f74d04a5ee16f9", "title": "Random Resizing and Cropping Pipeline"}, "2410": {"path": "/paddlevideo/loader/pipelines/augmentations.py:198-219", "hash": "81cc9d51681e68726058d9f5a0be24cf", "title": "Random Crop Generator"}, "2411": {"path": "/paddlevideo/loader/pipelines/augmentations.py:220-249", "hash": "f76d03acae5b820a821f8ee3cdec61d9", "title": "Image Cropper for PaddleVideo"}, "2412": {"path": "/paddlevideo/loader/pipelines/augmentations.py:250-276", "hash": "93bc321a8f4cdbc2a374e191a0507be1", "title": "Center Cropping Image Class"}, "2413": {"path": "/paddlevideo/loader/pipelines/augmentations.py:277-297", "hash": "515a05a43d5a7c733f9feb4fd9713d34", "title": "Center Crop Images in Augmentations"}, "2414": {"path": "/paddlevideo/loader/pipelines/augmentations.py:298-325", "hash": "03ba993de4c1f28ab3bc5b183bdac0bb", "title": "MultiScaleCrop: Flexible Image Augmentation"}, "2415": {"path": "/paddlevideo/loader/pipelines/augmentations.py:326-360", "hash": "eeb822872bcb49d00d8b07b191c5c85a", "title": "Multi-Scale Image Cropper"}, "2416": {"path": "/paddlevideo/loader/pipelines/augmentations.py:361-384", "hash": "5d1166a576fae5903857feea6a79cf79", "title": "Random Crop with Grid Offsets"}, "2417": {"path": "/paddlevideo/loader/pipelines/augmentations.py:385-405", "hash": "5ee665c3195516b9e67ab31fcabd9434", "title": "Random Cropping for Image Augmentation"}, "2418": {"path": "/paddlevideo/loader/pipelines/augmentations.py:406-440", "hash": "9b4e47b75bd37e36edc12aca02abc021", "title": "Random Flip Image Augmentation Pipeline"}, "2419": {"path": "/paddlevideo/loader/pipelines/augmentations.py:441-473", "hash": "a98ff29c4e69c81f9835cb0f71f3e55c", "title": "Random Image Flipping and Brightness Adjustment"}, "2420": {"path": "/paddlevideo/loader/pipelines/augmentations.py:474-508", "hash": "880f9b7c36c2c9def3744accd234a2e1", "title": "Random Image Augmentations in PaddleVideo"}, "2421": {"path": "/paddlevideo/loader/pipelines/augmentations.py:509-546", "hash": "ba3aa6f4acd340c3c2a24fba4a4a6ba3", "title": "Random Saturation and Hue Pipeline Transforms"}, "2422": {"path": "/paddlevideo/loader/pipelines/augmentations.py:547-577", "hash": "0fa32f56c4d9a116d39956afb082171b", "title": "Gamma and Color Jitter Augmentation in PaddleVideo"}, "2423": {"path": "/paddlevideo/loader/pipelines/augmentations.py:578-611", "hash": "ee21c79ad8f79736e7c9166da512a75a", "title": "Random Gamma Adjustment Pipeline"}, "2424": {"path": "/paddlevideo/loader/pipelines/augmentations.py:612-638", "hash": "d4275dd1a63b4e42040490418658b9cc", "title": "Image to NumpyArray Transpose Class"}, "2425": {"path": "/paddlevideo/loader/pipelines/augmentations.py:639-665", "hash": "179e981ff99c33fbd08b5cfbc22d764c", "title": "Normalization Class in PaddleVideo's Loader Pipelines"}, "2426": {"path": "/paddlevideo/loader/pipelines/augmentations.py:667-693", "hash": 
"b7c7bea6051bffc2cb6667f0f6a3bd54", "title": "Image Normalization Class"}, "2427": {"path": "/paddlevideo/loader/pipelines/augmentations.py:694-722", "hash": "a74f33922883d93c880fcb4dd5445dbf", "title": "Image Normalization and Scaling Pipeline"}, "2428": {"path": "/paddlevideo/loader/pipelines/augmentations.py:724-749", "hash": "0ad6de646f875c90c2483af010071c28", "title": "Jitter Resize Image Sequence Function"}, "2429": {"path": "/paddlevideo/loader/pipelines/augmentations.py:750-774", "hash": "fd0c2eb7782b8d894a3386388864efd8", "title": "Resize Image Pipeline"}, "2430": {"path": "/paddlevideo/loader/pipelines/augmentations.py:777-805", "hash": "8f5c83c9534177748e910f6502cf0456", "title": "MultiCenterCrop Class: Image Cropping Operations"}, "2431": {"path": "/paddlevideo/loader/pipelines/augmentations.py:807-834", "hash": "d5eef1f92d1ffd056f97e5ddf90f1f24", "title": "Image Cropping Augmentations in PyAV"}, "2432": {"path": "/paddlevideo/loader/pipelines/augmentations.py:835-865", "hash": "b7cef974bc83b5b043dd09fcd653d66e", "title": "MultiCrop Pipeline for Paddle Tensor"}, "2433": {"path": "/paddlevideo/loader/pipelines/augmentations.py:866-891", "hash": "aeffa5dae39978b7282ec7cc33d0e4ae", "title": "Random Crop Augmentation Class"}, "2434": {"path": "/paddlevideo/loader/pipelines/augmentations.py:892-914", "hash": "6af6d0bb75deb3b2a2f0d26f550dfae8", "title": "Image Size Check and Crop Generation"}, "2435": {"path": "/paddlevideo/loader/pipelines/augmentations.py:915-944", "hash": "4a8b646b7e95391e776b81aa5c5f34bb", "title": "Crop Offsets Calculator"}, "2436": {"path": "/paddlevideo/loader/pipelines/augmentations.py:946-975", "hash": "a983b804c5d461d1a7cbf9b0be4ab111", "title": "GroupFullResSample Pipeline"}, "2437": {"path": "/paddlevideo/loader/pipelines/augmentations.py:977-1007", "hash": "e47bab9612f462fd050f611c60009463", "title": "Image Augmentation via Crops and Flips"}, "2438": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1008-1037", "hash": "8c32da9316bc6a4bf146741828176332", "title": "10-Crop Image Class"}, "2439": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1038-1069", "hash": "579393f042aeceafad7c1ecd149a0c38", "title": "UniformCrop Pipeline for Image Sampling"}, "2440": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1070-1099", "hash": "0165d8fccc541480e24b4bcf9d1645f3", "title": "Image Offset Determination for Cropping"}, "2441": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1100-1127", "hash": "bb5855edbdfd390e5274d2b9b40bc686", "title": "Image Augmentation Pipeline"}, "2442": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1128-1156", "hash": "47655206156ee740c051f2add7cea72b", "title": "Image Augmentation for PaddleVideo"}, "2443": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1157-1193", "hash": "aceaf9db61d135386f81e9635c70b1b9", "title": "Color Jitter Augmentation"}, "2444": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1194-1227", "hash": "6f56dc5deb5331041d0da4b68c117e09", "title": "ColorFlipAugmenter"}, "2445": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1228-1257", "hash": "ea67e59be8aa02a41c3f6acef2aea4e2", "title": "Image Augmentation Pipeline"}, "2446": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1258-1294", "hash": "0aa07295e7c4f4c316d06685ed57e955", "title": "YowoAug: Versatile Image Augmentation"}, "2447": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1295-1322", "hash": "1b3764c65421e1428e8f7d0d1a796f2b", "title": "Image Augmentation and Detection 
Functions"}, "2448": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1323-1353", "hash": "c8ef3d47db1508951b91813a34e6de05", "title": "Resizing and Normalizing Bounding Boxes"}, "2449": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1354-1390", "hash": "bd3f3255e64e4fbc7009f8d2ebacb4cb", "title": "Bounding Box Jitter Augmentation"}, "2450": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1391-1419", "hash": "a5a469c2707a8b2ea831a57ac8d144fe", "title": "Image Augmentation and Label Manipulation"}, "2451": {"path": "/paddlevideo/loader/pipelines/augmentations.py:1420-1427", "hash": "505be0de691e21d66d680cd44b3028a6", "title": "Image Resizing and Conversion Augmentation"}, "2452": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py", "hash": "13797dcc144d7371e5950813faee4e15", "title": "AVA Dataset Image Augmentation and Resizing in PaddleVideo"}, "2453": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:1-34", "hash": "8724ded58914bfe587aed115595da8d3", "title": "AVA Image Augmentations in PaddleVideo"}, "2454": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:35-64", "hash": "412e094a80fa46e7fe5c0194ff30b976", "title": "Properly Initializing Lazy Operations"}, "2455": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:65-96", "hash": "629dc6bf06dca92d8851c47121d68062", "title": "AVA Image Augmentation Functions"}, "2456": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:97-130", "hash": "8f15054c32c4141ff89f5a22efe88312", "title": "Imresize: Scaling Images with Flexibility"}, "2457": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:131-160", "hash": "94149ecc6b87f4209a655c3425fd7691", "title": "Image Resizing Function for CV2 and Pillow"}, "2458": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:161-193", "hash": "5695a3872fdec6bd21912a9d7ea6755b", "title": "EntityBoxCrop Scale Class"}, "2459": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:194-224", "hash": "61ef8998345496ca6ae258c2272c3fac", "title": "Cropping Object Detection Proposals"}, "2460": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:225-249", "hash": "f14d96a624baf9a56384733f811b9bea", "title": "Horizontal Flipping of Entity Boxes"}, "2461": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:250-284", "hash": "14c59a656cdc6c03907c3d27d61e1e7b", "title": "Resizing Image Pipeline Augmentation"}, "2462": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:286-303", "hash": "20c5f1a6c2dd235104f689680f3fec77", "title": "Image Augmentation Function in PaddleVideo"}, "2463": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:304-334", "hash": "c45518869608b9a2173295ff6e434e26", "title": "Initialize Resize Augmentation Object"}, "2464": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:335-363", "hash": "ca78f1d4ef45c493cb77f0927e54d9ee", "title": "Image Resizing Augmentation"}, "2465": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:364-393", "hash": "ec76ca7bc9f97332294285c378fd7e0b", "title": "Random Rescaling Image Augmentation"}, "2466": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:395-423", "hash": "96a1c3aa55f14e344232e7bec3926f5a", "title": "Resize Augmentation Transform"}, "2467": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:424-455", "hash": "f0baf6b3df8717a8e9aba72d199448f7", "title": "Rescale Augmentation for PaddleVideo"}, "2468": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:456-487", "hash": 
"e44ee0f0b7e4eaf05eebc05336172972", "title": "Image Resizing and Cropping in PaddleVideo"}, "2469": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:488-517", "hash": "9a63ea0e20914d085dfcd4e6e69d9e38", "title": "Random Cropping Augmentation"}, "2470": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:519-544", "hash": "a1665cbc9f336448eadd01f2e33fc6a4", "title": "Crop Quadruple Adjustment"}, "2471": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:546-571", "hash": "8a2c61e8a507b875131f4988e2926d2d", "title": "Augmentations for Video Frames"}, "2472": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:572-609", "hash": "0b4ca83df92049a8ef21de905adbae12", "title": "In-Place Flipping and Image Negation"}, "2473": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:610-631", "hash": "0bec591115787cdcdb7892e1d17614e8", "title": "Flip Augmentation in PaddleVideo"}, "2474": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:632-660", "hash": "6100b357ac63aa08bdcd41ece518c054", "title": "Flip Augmentation for Images"}, "2475": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:661-693", "hash": "0868a0c37a316f677f4d57eab33b947f", "title": "Image Augmentation Class"}, "2476": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:696-720", "hash": "5f9dcaa6e7bfc66531adcc00378965f8", "title": "Normalize Image Augmentation"}, "2477": {"path": "/paddlevideo/loader/pipelines/augmentations_ava.py:722-749", "hash": "79ed8b8c45f0fc45781c48f711f14a4a", "title": "AVA Image Normalization Augmentation Pipeline"}, "2478": {"path": "/paddlevideo/loader/pipelines/compose.py", "hash": "d1dff462d80b23d18ca4ae63aba9e2fb", "title": "Flexible Transformation Composition"}, "2479": {"path": "/paddlevideo/loader/pipelines/decode.py", "hash": "07ed6a0da55fed7e1471a2b192f19a6b", "title": "TimeSformer Video Decoder Pipeline"}, "2480": {"path": "/paddlevideo/loader/pipelines/decode.py:1-32", "hash": "61dbba05035d57d7b4c466941646c030", "title": "Video Clip Processing Pipeline"}, "2481": {"path": "/paddlevideo/loader/pipelines/decode.py:33-69", "hash": "48cf456c5a394de0f675e3dccba9d222", "title": "MP4 Decoder Class for Frame Extraction"}, "2482": {"path": "/paddlevideo/loader/pipelines/decode.py:70-98", "hash": "77d50b933deaa25bc57f3cda92c3f9da", "title": "Video Decoder Pipeline"}, "2483": {"path": "/paddlevideo/loader/pipelines/decode.py:99-125", "hash": "34bfc65b5153d2b41bbc2389ad12844a", "title": "Video Duration Check and Decoding Indices"}, "2484": {"path": "/paddlevideo/loader/pipelines/decode.py:127-150", "hash": "f862d4a93e1926a3c28168de4f185434", "title": "Seek, Decode, Filter Frames"}, "2485": {"path": "/paddlevideo/loader/pipelines/decode.py:151-177", "hash": "4decf9f2490903afbbb63e46dc1d4869", "title": "Decode and Sort Video Frames"}, "2486": {"path": "/paddlevideo/loader/pipelines/decode.py:178-222", "hash": "bc0d6e593da87ad772c7b5dcc812b868", "title": "Pipeline Classes for Data Decoding"}, "2487": {"path": "/paddlevideo/loader/pipelines/decode.py:223-249", "hash": "bb17944ebfa567edab16227224d3b690", "title": "Preparing Data for Model in PaddleVideo Loader"}, "2488": {"path": "/paddlevideo/loader/pipelines/decode.py:250-275", "hash": "6472f0459e9efd50ff9cc3da51cc6ff0", "title": "Video Feature Pad and Dequantize"}, "2489": {"path": "/paddlevideo/loader/pipelines/decode.py:276-310", "hash": "b53d3230dcf932629982beb5a3133a85", "title": "ActionFeatureDecoder: Feature Decoding Class"}, "2490": {"path": 
"/paddlevideo/loader/pipelines/decode.py:311-338", "hash": "e1bba14a39dafb24921269b059e68875", "title": "Data Preprocessing for PaddlePaddle Video Pipeline"}, "2491": {"path": "/paddlevideo/loader/pipelines/decode.py:339-347", "hash": "1e606df02604ac9b410b53bf49c6447f", "title": "Pad and Concatenate Feature Data"}, "2492": {"path": "/paddlevideo/loader/pipelines/decode_image.py", "hash": "c1e64c0b8ce0651857922dc76fbeabb7", "title": "Decoding Images with PaddleVideo"}, "2493": {"path": "/paddlevideo/loader/pipelines/decode_image.py:1-37", "hash": "9af8e70277101796c94d44ca6b6e2792", "title": "PaddleVideo Image Decoder Pipeline"}, "2494": {"path": "/paddlevideo/loader/pipelines/decode_image.py:38-66", "hash": "ca4d677423f7f4adccf6ce5953a247e0", "title": "Image Decoding Pipeline Class"}, "2495": {"path": "/paddlevideo/loader/pipelines/decode_image.py:67-89", "hash": "002f9f3cfcdf9ba0534362a3df9d45f2", "title": "Decode Image Pipeline Methods"}, "2496": {"path": "/paddlevideo/loader/pipelines/decode_image.py:91-116", "hash": "3ff85b82ef0a445fce49f391c20b1e0d", "title": "Depth Image Resizer"}, "2497": {"path": "/paddlevideo/loader/pipelines/decode_image.py:117-149", "hash": "aa678514723b0d13ced6a8fac1c4ae17", "title": "Image Decoding and Organization"}, "2498": {"path": "/paddlevideo/loader/pipelines/decode_image.py:150-179", "hash": "097764e33d7423211913504a8526684b", "title": "Decode Image Pipeline: Setup and Side Detection"}, "2499": {"path": "/paddlevideo/loader/pipelines/decode_image.py:180-206", "hash": "201ec6955c04b90a662dcd2c456bd2e4", "title": "Pipeline for Decoding Images in PaddleVideo"}, "2500": {"path": "/paddlevideo/loader/pipelines/decode_sampler.py", "hash": "d0a710bc329d307946f9ff09de4b76cc", "title": "Video Decoder Pipeline: Load, Decode, Clip"}, "2501": {"path": "/paddlevideo/loader/pipelines/decode_sampler.py:1-30", "hash": "12cb99d2c716cc9af95bad759b6d5442", "title": "Fast Decoding and Sampling with DecodeSampler"}, "2502": {"path": "/paddlevideo/loader/pipelines/decode_sampler.py:31-55", "hash": "591bdffeaac5070bd3fc622dbfd4ebeb", "title": "Video Frame Sampler Class Initialization"}, "2503": {"path": "/paddlevideo/loader/pipelines/decode_sampler.py:57-81", "hash": "214f98c5539df68bd7ffe409a462d5fb", "title": "MP4 Decoder with Short Cycle Adjustment"}, "2504": {"path": "/paddlevideo/loader/pipelines/decode_sampler.py:82-93", "hash": "5ca6dc495cfb1b2e1b574f1c1194267b", "title": "Decode Image Frames Pipeline"}, "2505": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py", "hash": "6da97e8db8c8d73c0b339526dabb3fab", "title": "MRI Frame Decoder and Sampler"}, "2506": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:1-36", "hash": "cf2557dfad507d73940d1bd481aed31e", "title": "SFMRI Decoder and Sampler"}, "2507": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:37-64", "hash": "4c3cc29682bf527e863a5d71cde73d73", "title": "MRI Frame Segmenter"}, "2508": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:65-94", "hash": "2b7caf4c9b7a1ad5f58e73476c114f3e", "title": "MRI Sampler Decode Pipeline"}, "2509": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:95-115", "hash": "ca12508ce219016359b828322bd5a7f7", "title": "Video Sampling Frame Handler"}, "2510": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:116-135", "hash": "a168a9ff7e9d5898a741dee1100c54ec", "title": "Sampling Indices Calculator"}, "2511": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:136-157", "hash": 
"495464a919b785b231096526d93ab159", "title": "Randomly Selecting Frames Offsets"}, "2512": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:158-180", "hash": "908e779db21837ca4338b3cff28f3619", "title": "Frame Index Assignment in MRI Decode Pipeline"}, "2513": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:181-203", "hash": "0f4d265a2de1036abdc16cf591b72e4b", "title": "Generate Offsets for TSM"}, "2514": {"path": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:204-224", "hash": "06000da7f31c19c006bd0b08530b4164", "title": "Calculate Segment Offsets for 's' and 'f' Frames"}, "2515": {"path": "/paddlevideo/loader/pipelines/mix.py", "hash": "e57c0a9f56bb8e50edcd4cab5a8be81d", "title": "VideoMix: Augmented Image Classification with Controlled Mixup and Cutmix"}, "2516": {"path": "/paddlevideo/loader/pipelines/mix.py:1-34", "hash": "2ffecbbbbbe92b2d6c89d2ac29b05a05", "title": "Mixup Class for PaddleVideo"}, "2517": {"path": "/paddlevideo/loader/pipelines/mix.py:35-70", "hash": "efc498922847c8b23ff92a6e0006f155", "title": "Cutmix: A Mixup Operator for Images"}, "2518": {"path": "/paddlevideo/loader/pipelines/mix.py:72-103", "hash": "5645f8631db884a54bd36bf564e6bd5a", "title": "CutMix and MixUp Video Data Augmentation"}, "2519": {"path": "/paddlevideo/loader/pipelines/mix.py:104-116", "hash": "d0c60f3f99a019c605aaa86b00f48310", "title": "Random Mixup or Cutmix for Paddle Video"}, "2520": {"path": "/paddlevideo/loader/pipelines/multimodal.py", "hash": "66c2b3d3ace9c46f8e68eef2e4c40bc9", "title": "Multimodal PaddleVideo Pipeline Expansion"}, "2521": {"path": "/paddlevideo/loader/pipelines/multimodal.py:1-35", "hash": "8301a42b060eb3fce1bd629094ee3af7", "title": "Feature Padding Class in PaddlePaddle Video Analysis"}, "2522": {"path": "/paddlevideo/loader/pipelines/multimodal.py:36-59", "hash": "a626cf7ee547b44fa8314b1cb7338896", "title": "Multimodal Paddle Video Loader"}, "2523": {"path": "/paddlevideo/loader/pipelines/multimodal.py:60-81", "hash": "007b743573fb4d7da92597fd1dbe1071", "title": "Multimodal Data Preprocessing"}, "2524": {"path": "/paddlevideo/loader/pipelines/multimodal.py:82-113", "hash": "a8f7bd8ab5e3814569ded7f48757bde8", "title": "Random Caption Selector Pipeline"}, "2525": {"path": "/paddlevideo/loader/pipelines/multimodal.py:114-151", "hash": "34c1541fd0dcd5b0423c7c6d4197028b", "title": "Random Labeling and Masking in Multimodal Pipeline"}, "2526": {"path": "/paddlevideo/loader/pipelines/multimodal.py:152-175", "hash": "a3203991c4929a9a8cc7ca7ca57ade89", "title": "Multimodal Data Loader for TensorFlow"}, "2527": {"path": "/paddlevideo/loader/pipelines/multimodal.py:176-201", "hash": "e50d5922388859316ef5bbfab6c183ba", "title": "Masking Tokens for LM Tasks"}, "2528": {"path": "/paddlevideo/loader/pipelines/multimodal.py:202-225", "hash": "ba394a6cee3eca1c25f34ecece726d9f", "title": "Token Masking in PaddleVideo"}, "2529": {"path": "/paddlevideo/loader/pipelines/multimodal.py:226-255", "hash": "723c93b1b0046a6d8c8d20ef6981414c", "title": "Randomizing Functions in Multimodal Pipeline"}, "2530": {"path": "/paddlevideo/loader/pipelines/multimodal.py:256-285", "hash": "9e49257cacde5bf0dbdf2b3cf6f37a7b", "title": "Random Masking Function"}, "2531": {"path": "/paddlevideo/loader/pipelines/multimodal.py:286-308", "hash": "92f27f698dc17ea2e8408ca37c3bcb72", "title": "Multimodal Pipeline: BERT-based Feature Concatenation"}, "2532": {"path": "/paddlevideo/loader/pipelines/multimodal.py:309-333", "hash": "38257caa51ebdf842bab84b3104ba5d0", "title": "Preparing 
Input for Multimodal Pipeline"}, "2533": {"path": "/paddlevideo/loader/pipelines/multimodal.py:334-359", "hash": "ae330c6474504f623e1e48653325dc37", "title": "Zero-Padding Sequences to Max Length"}, "2534": {"path": "/paddlevideo/loader/pipelines/multimodal.py:360-380", "hash": "73e97bc99d5776fdaefdabbdd0e767ab", "title": "Multimodal Pipeline: Feature Extraction"}, "2535": {"path": "/paddlevideo/loader/pipelines/sample.py", "hash": "e96144cc0657890b2365cf992a6d39cd", "title": "Efficient Frame Sampling with PaddleVideo"}, "2536": {"path": "/paddlevideo/loader/pipelines/sample.py:1-38", "hash": "48223a254a88b5388887546203589a0d", "title": "Python PaddleVideo Image Processing Pipeline"}, "2537": {"path": "/paddlevideo/loader/pipelines/sample.py:39-66", "hash": "9cbf02849bff686f233720b89fe8b768", "title": "Sampler Frame Selection"}, "2538": {"path": "/paddlevideo/loader/pipelines/sample.py:67-96", "hash": "f6b0f8f53672cad51b325e23bd22c821", "title": "Image Format Converter Class"}, "2539": {"path": "/paddlevideo/loader/pipelines/sample.py:97-119", "hash": "355ce3f8662d36654a604bfa3d7c1b8c", "title": "Video Decoding Pipeline with Multiple Backends"}, "2540": {"path": "/paddlevideo/loader/pipelines/sample.py:120-144", "hash": "f2bae25aa356aafe47e43e9fb914c3ea", "title": "Video Frame Sampler: Handles Dense and Non-Dense Scenarios"}, "2541": {"path": "/paddlevideo/loader/pipelines/sample.py:145-171", "hash": "dc69f8a1532b2f48d8fa0eb8e1e2e22e", "title": "Clip Offsets Calculator"}, "2542": {"path": "/paddlevideo/loader/pipelines/sample.py:172-199", "hash": "92d28c1ace1b072a81bfb07deb4215f2", "title": "Video Frame Sampling Algorithm"}, "2543": {"path": "/paddlevideo/loader/pipelines/sample.py:200-223", "hash": "6d92ea5e1cb667cd84d312b3150dc652", "title": "Video Frame Sampler Algorithm"}, "2544": {"path": "/paddlevideo/loader/pipelines/sample.py:224-245", "hash": "6fa7b126e4e0935bdddcb6676a499d6f", "title": "Sampling Position Determination Algorithm"}, "2545": {"path": "/paddlevideo/loader/pipelines/sample.py:246-268", "hash": "247306142fc2442bc0b0cd661378fba8", "title": "Indexing Frames by Duration"}, "2546": {"path": "/paddlevideo/loader/pipelines/sample.py:270-292", "hash": "6ac440435b574f7daf99e14f3832a24b", "title": "Random Frame Sampler"}, "2547": {"path": "/paddlevideo/loader/pipelines/sample.py:293-327", "hash": "d41f74cf04e7c83ca5e4e73d101cfc7d", "title": "Video Sampler Class for PyTorch"}, "2548": {"path": "/paddlevideo/loader/pipelines/sample.py:328-358", "hash": "fe650a7f86f8a2bed44ef5946ea6cd8b", "title": "Video Image Sampler in Paddle Video Pipeline"}, "2549": {"path": "/paddlevideo/loader/pipelines/sample.py:359-382", "hash": "6d826ac04edf6dae978df15f52401874", "title": "Video Sample Indexing Algorithm"}, "2550": {"path": "/paddlevideo/loader/pipelines/sample_ava.py", "hash": "e91a11440a5fc241db709a490d3116e5", "title": "SampleAVA Pipeline for PaddleVideo"}, "2551": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:1-35", "hash": "775db9745a59205217a7c21bada98d04", "title": "PaddleVideo: Frame Sampler"}, "2552": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:36-61", "hash": "a803107a19fd623efe83353f2ac2f91e", "title": "AVA Frame Sampling Initialization"}, "2553": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:62-82", "hash": "e52c62c97ff51f68fb5b42ee5560c764", "title": "Clip Offset Calculator"}, "2554": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:83-107", "hash": "66efe74c8097be70c95a9dc93fb0f2b9", "title": "AVA Video Sampler"}, "2555": {"path": 
"/paddlevideo/loader/pipelines/sample_ava.py:108-129", "hash": "c7f717e3ac8d2db4a06e588be0e2b2dd", "title": "Wrap-Around Frame Index Handling"}, "2556": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:130-166", "hash": "9fc85f8e778242000bc81ae1881ae6e6", "title": "Abstract Class for Storage Backends"}, "2557": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:167-190", "hash": "644d69db3b17c0dc4f8a7d2b33728e27", "title": "Registering Backends: SampleAVA Pipeline Code"}, "2558": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:191-225", "hash": "7af58b2b803028af1c297025b2331800", "title": "FileClient: Handling File Operations and Pillow Image Conversion"}, "2559": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:226-247", "hash": "facbe8b65b7be6c98649661f3419ae56", "title": "Pillow Image to Numpy Array Converter"}, "2560": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:248-270", "hash": "f5f8e839f136099fb0ff4e53c3ad9c47", "title": "Image Conversion to Numpy Array"}, "2561": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:271-301", "hash": "7485fce1eaab46de1cae999100411b0b", "title": "Pipeline for Decoding Frames in Sample_AVA.py"}, "2562": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:303-326", "hash": "a560e86c6e9147fc2025fb24a5e57cf2", "title": "Resizing and Scaling Pipeline"}, "2563": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:327-354", "hash": "5255c6d99baa8e0de41736cdd51e02f1", "title": "SampleAVAFrames Class Overview"}, "2564": {"path": "/paddlevideo/loader/pipelines/sample_ava.py:355-374", "hash": "f7f56a23dd566af9d2dcdf27279124c0", "title": "AVA Sample Frame Indexer"}, "2565": {"path": "/paddlevideo/loader/pipelines/sample_ucf24.py", "hash": "2c4dfebf632d691b8d7b853a114a324e", "title": "UCF24 Frame Sampler"}, "2566": {"path": "/paddlevideo/loader/pipelines/sample_ucf24.py:1-33", "hash": "b1afe9d9b1291fb4881d1f23257a8a3a", "title": "Video Sampler Class: SamplerUCF24"}, "2567": {"path": "/paddlevideo/loader/pipelines/sample_ucf24.py:34-65", "hash": "cff97457829bd1b1e44f280cbe7a7dc0", "title": "Video Clip Pipeline Creation"}, "2568": {"path": "/paddlevideo/loader/pipelines/sample_ucf24.py:66-69", "hash": "138d2eeae7bb05f5f5bf4b03d9644ab4", "title": "Keyframe Indexer"}, "2569": {"path": "/paddlevideo/loader/pipelines/segmentation.py", "hash": "b4617809da5e4dfec194445c86c20e8f", "title": "PaddleVideo: Enhanced Segmentation and Transformation"}, "2570": {"path": "/paddlevideo/loader/pipelines/segmentation.py:1-32", "hash": "2efbbeed6024866f6cf1ed0bad579da8", "title": "PaddleVideo Segmentation Pipeline Code"}, "2571": {"path": "/paddlevideo/loader/pipelines/segmentation.py:33-65", "hash": "880ecf608c49d4c3902d65ae26c66be1", "title": "Multi-Scale Image Segmentation Function"}, "2572": {"path": "/paddlevideo/loader/pipelines/segmentation.py:67-92", "hash": "8b646c06f82bce95a2b3728ffa919449", "title": "Image Resizing and Flipping Pipeline"}, "2573": {"path": "/paddlevideo/loader/pipelines/segmentation.py:93-124", "hash": "a5e38a6bc0139ee8e9eba03a30390408", "title": "MultiNorm Image Preprocessing"}, "2574": {"path": "/paddlevideo/loader/pipelines/segmentation.py:125-130", "hash": "780e88e70888605b04e9a2757d6cdea9", "title": "Image Normalization and Transposition"}, "2575": {"path": "/paddlevideo/loader/pipelines/segmentation_pipline.py", "hash": "45f9ab88ea44e60757e7680dffb042c9", "title": "Segmentation Sampler Python Class"}, "2576": {"path": "/paddlevideo/loader/pipelines/segmentation_pipline.py:1-35", "hash": 
"c3b0fff5597e13470623b02b8c4368bd", "title": "Segmentation Sampler Python Pipeline"}, "2577": {"path": "/paddlevideo/loader/pipelines/segmentation_pipline.py:36-40", "hash": "e06cf98749d8f3c370f0296f4278949c", "title": "Segmentation Pipeline Code Snippet"}, "2578": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py", "hash": "0c6078a6f6584c6a9b93bd56364ce1c3", "title": "Skeleton Pipeline for PaddleVideo"}, "2579": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1-34", "hash": "cf1ce62eb6a3d0a4f30dbe7994575520", "title": "Skeleton Pipeline Registration"}, "2580": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:36-68", "hash": "cc10108b2989770eabb157d9b1addc8a", "title": "Lazy Operation Initialization"}, "2581": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:69-101", "hash": "0d98dcf84a2c2f7fe5a180056fcfd9ea", "title": "Skeleton Pipeline: Auto-Padding and Feature Extraction"}, "2582": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:102-133", "hash": "886d30a6f7f094ba7b86471c41b015c3", "title": "Skeleton Pipeline Data Padding"}, "2583": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:134-170", "hash": "debaec96d7168a9b5d89fcf7b0f88317", "title": "Skeleton Pipeline Classes"}, "2584": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:173-202", "hash": "4a89fb1c92a28089d718d4558a28d24e", "title": "Random Rotation Skeleton Class"}, "2585": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:204-229", "hash": "f53caf25ccc6e44e0124006f1a15feb9", "title": "Random Rotation Applier"}, "2586": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:230-265", "hash": "bc0899fedc2f147d54fd73742257e600", "title": "Skeleton Pipeline Class for Cropping"}, "2587": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:266-286", "hash": "a13ca9fc6ef9ea60c1b071de6e4357e7", "title": "Randomly Cropped and Biased Skeleton Data Processing"}, "2588": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:287-316", "hash": "03d78abdca7d11f96a7e04935584cfd1", "title": "Skeleton Data Transformation for Video Analysis"}, "2589": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:317-344", "hash": "34e80164a644a1bc1cc58d944c1cd4dc", "title": "Skeleton Pipeline for PaddleVideo"}, "2590": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:345-372", "hash": "c618b23f40b4c29a4b51055f0acd25f2", "title": "Training Clip Sampler"}, "2591": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:373-401", "hash": "57f9faee2cb1d60609c089d8fe6bfac6", "title": "Skeleton Clip Index Determination"}, "2592": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:402-427", "hash": "362f978bb6d2bf6fd20debb3cfcf2163", "title": "Random Frame Index Selection"}, "2593": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:428-459", "hash": "06b1a49a60b638d5f611a53a42a24af0", "title": "Skeleton Pipeline Class for PaddleVideo"}, "2594": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:460-493", "hash": "aee15f2de0e3280ad38b5c27bfe610fa", "title": "PoseDecode Class Loads Pose Keypoints"}, "2595": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:494-528", "hash": "521cff006703d2076b2ed01c630a2a92", "title": "PoseCompact: Compact Keypoint Representation"}, "2596": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:529-546", "hash": "e74b967f49d7f8079d12dc43ebc416a7", "title": "Expand Bounding Boxes in Skeleton Pipeline"}, "2597": {"path": 
"/paddlevideo/loader/pipelines/skeleton_pipeline.py:548-585", "hash": "f1e84a40f190de10f5d96ed5d5def92b", "title": "Skeleton Pipeline Class"}, "2598": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:587-611", "hash": "ffa733326c2aeac7a702a4be7dee2b47", "title": "Bounding Box Adjustment for Skeleton Pipeline"}, "2599": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:613-640", "hash": "b0bf72e2253e89a1a7dd1f3ba3d19f27", "title": "Crop-Based Skeleton Detection"}, "2600": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:641-669", "hash": "ee9fce904b6e2b862fe3e09331125e6c", "title": "Cropping Bounding Boxes in Skeleton Pipeline"}, "2601": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:670-693", "hash": "54288f10e51e69f8cbb54cd5d4177959", "title": "Random Resized Crop Pipeline V2"}, "2602": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:694-719", "hash": "2fef9ab9a0b797a21b77df39b84ac496", "title": "Initialize Class for Cropping Bounding Box"}, "2603": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:720-741", "hash": "6eae779f2359f685bcdc67349eb46cc3", "title": "Random Crop Bounding Box Generator"}, "2604": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:742-766", "hash": "aaa5fd2c845f3b27fa050c308d87dd1f", "title": "Random Crop with Aspect Ratios"}, "2605": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:767-791", "hash": "0b1078d15581f0324c38480d71e00f5b", "title": "Crop Quadruple Adjustment for Skeleton Pipeline"}, "2606": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:792-815", "hash": "2a2bd0485fc129fd9d6e64a7a19864c2", "title": "Image Cropping and Keypoints Extraction"}, "2607": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:816-843", "hash": "3b4d0252485182df66e360dac411722a", "title": "Skeleton Pipeline for PaddleVideo"}, "2608": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:844-880", "hash": "d27bcff26be0a60e1b6e6078d58748f5", "title": "CenterCrop_V2 Pipeline and is\\_seq\\_of Function"}, "2609": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:881-909", "hash": "27af2f4c09a583e4d32772777b392b61", "title": "CenterCrop Augmentation Class"}, "2610": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:910-936", "hash": "32ed08979bd1d9951cb8dcc32b9d0bb4", "title": "Update Image Shape and Crop Coordinates"}, "2611": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:937-957", "hash": "efa922b9645f9d5496a31a55d7cf2379", "title": "Handling Flip Operation in Skeleton Pipeline"}, "2612": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:958-984", "hash": "dfcb46b740835043e2c65094ac307b43", "title": "Flip_V2 Pipeline Registration and Functionality"}, "2613": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:985-1006", "hash": "e9308737203900f05aedd65cc7e58945", "title": "SkeletonPipeline: Direction-Based Keypoint Flipping"}, "2614": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1007-1034", "hash": "3f9b174776e404d07a15be2086fad7d7", "title": "Skeleton Pipeline Image and Keypoints Flipping"}, "2615": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1036-1065", "hash": "1d88d695b771c669773029d3e399f649", "title": "Horizontal Flip Augmentation in PaddleVideo"}, "2616": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1066-1090", "hash": "8d1bd9a3b563bea78206166f4a061a71", "title": "Flip and Flip Labels in Skeleton Pipeline"}, "2617": {"path": 
"/paddlevideo/loader/pipelines/skeleton_pipeline.py:1091-1117", "hash": "595e7be903800d12e5eefee0aa4785ee", "title": "SkeletonPipeline: Flip and Register"}, "2618": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1119-1144", "hash": "f9a31106731d99261eba19d47184cfa9", "title": "Image Data Formatting Class"}, "2619": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1145-1169", "hash": "9c07978e5b3a57ded21a0270f9885e25", "title": "SkeletonPipeline: Image Processing and Reshaping"}, "2620": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1170-1197", "hash": "d476b189108056a7725a342a2a2db39c", "title": "Image Data Converter: NHWC/NCHW/NPTCHW with Collapse Option"}, "2621": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1198-1220", "hash": "a8e3f022378c96fdf1b62c940ecb6e23", "title": "Collect Pipeline Class"}, "2622": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1221-1238", "hash": "628b2c9059ce564e5e1b1ff3235a7d5e", "title": "Default Image Data Loading Parameters"}, "2623": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1239-1275", "hash": "84136e9b5764375a566448c1acf926f2", "title": "Skeleton Pipeline Class"}, "2624": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1276-1294", "hash": "d68298b2fc172904ffe6bdec0deac53d", "title": "GeneratePoseTarget: Heatmap Generator"}, "2625": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1295-1318", "hash": "5a70d0eedf53b044f9e8202e77b7167b", "title": "Skeleton Pipeline Initialization"}, "2626": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1319-1349", "hash": "b41b98058845d2946d284915c61bc0d1", "title": "SkeletonPipeline Heatmap Generation"}, "2627": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1350-1375", "hash": "adf6a29dde94e2ab86ed4d2b58f09184", "title": "Gaussian Kernel Patch Heatmap Generation"}, "2628": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1377-1398", "hash": "64f10a4e281ad7c106ef82f13a138a11", "title": "Pseudo Heatmap Generation"}, "2629": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1399-1427", "hash": "c0c560754f899171faceb8f80ce9c9bc", "title": "Keypoint Distance Calculator"}, "2630": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1429-1453", "hash": "8ecabbaa09a15f6871aef3801aae3fc1", "title": "Gaussian Kernel Heatmap Dominant Point Calculator"}, "2631": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1455-1483", "hash": "8b4c1b312a91f207152c54df573a17ce", "title": "Generate Heatmaps for Keypoints and Limbs"}, "2632": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1484-1517", "hash": "2aaddd2227e46f1d2aa22cdc6da2ed79", "title": "Pseudo Heatmap Generator for Skeleton Sequences"}, "2633": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1518-1547", "hash": "75c14245313a5015744d4aa1247a09d6", "title": "Heatmap Generator from Keypoints"}, "2634": {"path": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1548-1554", "hash": "4dbdf3bebe73a6fb49780eaa395e50fa", "title": "Skeleton Pipeline Formatting"}, "2635": {"path": "/paddlevideo/loader/registry.py", "hash": "427b6d09fb7379ed2157da934047955e", "title": "PaddleVideo Registry Definition"}, "2636": {"path": "/paddlevideo/metrics/ActivityNet/__init__.py", "hash": "0270f824442a5b01503b600f1cd8ac40", "title": "Public API Added for ANETproposal"}, "2637": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py", "hash": "9e35f7d43b19c34993650ce1896effcc", "title": 
"ActivityNet Proposal Metrics"}, "2638": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:1-29", "hash": "3830cb8af9974328636764da8294b2ca", "title": "ActivityNet Metrics: AR@N & AUC"}, "2639": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:31-54", "hash": "cac2eb8d5061ec87b2fcd5f2510f8280", "title": "Initializing Class with Files and Defaults"}, "2640": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:55-77", "hash": "46fefb644a7385731bd8d55644735239", "title": "Blocked Video Metrics Script"}, "2641": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:79-102", "hash": "0674aca243e45b186266c8fba0c260e9", "title": "Read Ground Truth JSON and Return DataFrame"}, "2642": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:103-130", "hash": "4cd3cd74f5dd2b4e9ebb9bdab0c753b3", "title": "Proposal JSON Parser"}, "2643": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:132-158", "hash": "40b63921394aff41514882c244d6ce86", "title": "ActivityNet Proposal Evaluation Metrics"}, "2644": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:159-182", "hash": "c4f57a7c571e5ccaaf901fa55c474fc3", "title": "ActivityNet Proposal Metric"}, "2645": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:183-202", "hash": "f91461ffb1b5fc043abe3078ccef33a3", "title": "Average Recall Calculator"}, "2646": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:203-231", "hash": "7a97c81f8804afaa80c007cd46cb5292", "title": "Max Average Proposals Per Video"}, "2647": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:232-255", "hash": "f1d313e28cb1852d17d57ac752c5df6a", "title": "Exception Handling for Proposals and Ground Truth"}, "2648": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:257-278", "hash": "ecf3d48cd313f7c319ee09823820c74b", "title": "Average Recall Calculator"}, "2649": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:279-298", "hash": "a9e5c5242f1f4b153ad1e47cff359ac2", "title": "Threshold-Based True Positives Calculation"}, "2650": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:299-324", "hash": "2349a1e75e66794711d476d029d13e1c", "title": "Efficient Video Metrics Calculation"}, "2651": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:325-349", "hash": "9bf31be6e8cf325aea6e860db6067e44", "title": "TIOU Calculation Function"}, "2652": {"path": "/paddlevideo/metrics/ActivityNet/anet_prop.py:350-359", "hash": "6d5a9680ea911a7eb8cadc5ec89d786f", "title": "IoU Calculation for Segment Intersection"}, "2653": {"path": "/paddlevideo/metrics/__init__.py", "hash": "6a954e46472a837fd5dd1762ba8995e9", "title": "PaddleVideo Metrics Library"}, "2654": {"path": "/paddlevideo/metrics/__init__.py:1-25", "hash": "dc3f4e7557448feb9010b41deeea43ba", "title": "Video Metric Suite"}, "2655": {"path": "/paddlevideo/metrics/__init__.py:26-36", "hash": "f8b63dad51c7f6757c83185fd3c946be", "title": "Comprehensive Metric Import for PaddleVideo"}, "2656": {"path": "/paddlevideo/metrics/ava_evaluation/README.md", "hash": "f4d2521494de8a8b63ecfa0288fb9d12", "title": "AVA Evaluation Metrics in PaddleVideo"}, "2657": {"path": "/paddlevideo/metrics/ava_evaluation/metrics.py", "hash": "1f2182ce45889bb95e2c9e70e246b26a", "title": "AVA Metrics Calculation"}, "2658": {"path": "/paddlevideo/metrics/ava_evaluation/metrics.py:1-30", "hash": "9f11b1ce1e96d02156242490be812b5f", "title": "Precision and Recall Metrics Function"}, "2659": {"path": "/paddlevideo/metrics/ava_evaluation/metrics.py:32-58", "hash": "5231aee6105f555809a46accf60ea65e", "title": "AVA 
Evaluation Array Validation"}, "2660": {"path": "/paddlevideo/metrics/ava_evaluation/metrics.py:59-88", "hash": "5e6ea6a8ee6fa5fa3820411929ab8ba5", "title": "Average Precision and Recall Calculation"}, "2661": {"path": "/paddlevideo/metrics/ava_evaluation/metrics.py:89-111", "hash": "052a31eb858dfe3b49f8d6513cca6dba", "title": "Validate and Concatenate Precision-Recall Arrays"}, "2662": {"path": "/paddlevideo/metrics/ava_evaluation/metrics.py:112-137", "hash": "0bb057142e294672b0f5dc268a4536f8", "title": "Average Precision and CorLoc Metric"}, "2663": {"path": "/paddlevideo/metrics/ava_evaluation/metrics.py:138-143", "hash": "fc850b0166c6cd8a5c4d5da032e56555", "title": "Average Class Precision"}, "2664": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_list.py", "hash": "3596e3bea0593ddafd613ed41b8e50b3", "title": "Validating Bounding Box Coordinates"}, "2665": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:1-26", "hash": "6f51b11d05146524ad776691aa931f9e", "title": "BoxList Class for Bounding Boxes"}, "2666": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:28-52", "hash": "fbc489886a2c19f71ecaef661e39eda7", "title": "Numpy Box List Class"}, "2667": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:54-81", "hash": "b0491ae796be435f33099f2319abe8d9", "title": "Box Collection Manager: Count, Retrieve, and Add Data"}, "2668": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:82-117", "hash": "8245674d79ee5298fddcbadbdc1cd47a", "title": "Numpy Box List Class"}, "2669": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:118-138", "hash": "42c1dd5a90a5c166265719de810f0fcf", "title": "Valid Box Check"}, "2670": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py", "hash": "6a54e93b875a763b5cf4bda29dbaf1ba", "title": "Numpy Box Operations for IoU"}, "2671": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:1-29", "hash": "d2f9334e13ed2ac24e0594a11de319b3", "title": "Bounding Box Operations for Numpy Arrays"}, "2672": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:31-57", "hash": "db84fb56e0e85aa493929537b174c418", "title": "Box Area and Intersection Calculations"}, "2673": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:58-90", "hash": "d1e492195a83fd2f8ff9a06d1f36a357", "title": "Pairwise IOU/IOA Computation for Box Collections"}, "2674": {"path": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:91-98", "hash": "ef868be27e75d09b3c411bdee231b58c", "title": "Pairwise IoU Calculation for Bounding Boxes"}, "2675": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py", "hash": "3264d2f809b20af661f43df242c38021", "title": "AVA Object Detection Evaluation"}, "2676": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:1-21", "hash": "119877ede59583de119a05b0f366f6c6", "title": "PaddleVideo Object Detection Evaluation"}, "2677": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:22-58", "hash": "d10ef18168383130744beac0267f9b8f", "title": "Object Detection Evaluator Class"}, "2678": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:59-86", "hash": "e95f522aa341a832af1b29ff37914f5d", "title": "AVA Object Detection Evaluation Metric"}, "2679": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:87-120", "hash": "a8625f55fe89585dd0e195efcadfa03f", "title": "Object Detection Evaluator: Ava Metrics"}, "2680": {"path": 
"/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:121-139", "hash": "b5d11964d49233446e747fa5e11a438d", "title": "Object Detection Evaluation Class"}, "2681": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:140-160", "hash": "2b13ba11ef661bdadcfd7b0f77f46e9a", "title": "Object Detection Evaluation Module"}, "2682": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:161-179", "hash": "c7ce0e8dde7f5e57b614b438dc2d0431", "title": "Validating Groundtruth Image Addition"}, "2683": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:180-198", "hash": "3d05134d80f2d1bd924315223c46a5ef", "title": "Checking Existing Image in Object Detection Evaluation"}, "2684": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:199-219", "hash": "4d1722b93c4ad651f0e310bba7809986", "title": "Ground Truth Difficulty Check"}, "2685": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:220-236", "hash": "1d984dc9f6aa12e468e6873a690c7a42", "title": "Single Image Detection Evaluation"}, "2686": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:237-259", "hash": "71693f9330a32aafeca71a99e4afc522", "title": "Detection Class Retrieval and Mask Validation"}, "2687": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:260-290", "hash": "26f2f3c6cef5abccdb5b39e687b51b40", "title": "AVA Object Detection Evaluation"}, "2688": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:291-315", "hash": "a91b92af7cc1701f863ef2a4e5371460", "title": "Mean Average Precision Calculator"}, "2689": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:316-338", "hash": "946320c45521d5514ed3b01693ddfe60", "title": "Object Detection Metrics Calculation"}, "2690": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:339-375", "hash": "70f629e5b801fc0338a24cebfb871e9d", "title": "PaddleVideo Object Detection Evaluation"}, "2691": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:376-402", "hash": "42ac52b15276fe54ad0310f60b527531", "title": "Object Detection Evaluation Setup"}, "2692": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:404-430", "hash": "924057516908778baa78e861e07b01bb", "title": "Object Detection Evaluation Functions"}, "2693": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:432-447", "hash": "b86e7aa4dc8c1315838bf439651bde32", "title": "AVA Object Detection Evaluation"}, "2694": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:448-467", "hash": "f0f8ea09dbe40677e2870854eae0a776", "title": "Add Ground Truth Data to Database"}, "2695": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:468-493", "hash": "a65ba8f0e1b609f7eff48d262c78da13", "title": "Single Image Evaluation Function"}, "2696": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:494-516", "hash": "1d0384e216c79cce1fbaf43b7133ae32", "title": "Create Numpy Array from Detection Masks"}, "2697": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:518-536", "hash": "962c93358ccae61a33d25ea632e6c0ec", "title": "Ground Truth Init for Object Detection"}, "2698": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:537-561", "hash": "be36ef6babd8e45d91cd960d648df3b6", "title": "PaddleVideo: Object 
Detection Metrics Calculation"}, "2699": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:562-583", "hash": "976bc5a4022f1ec2c68eb6b358d02b2d", "title": "Ground Truth Update for Object Detection"}, "2700": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:584-605", "hash": "15f5cf82114fa12c4874e5bb6bf59aed", "title": "Object Detection Evaluation Metrics"}, "2701": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:606-629", "hash": "283c38db9c7e0dafeb11599397d94d6f", "title": "AVA-Based Object Detection Evaluation"}, "2702": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:630-651", "hash": "017d6553af81e89009acc2dd138f4682", "title": "Object Detection Evaluation Metrics"}, "2703": {"path": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:652-658", "hash": "2dd8b73625e85fe3825d034c568c4d33", "title": "Object Detection Metrics Calculator"}, "2704": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py", "hash": "1997401b8c0a7ffbf8d3760581e8c6a8", "title": "AVA Performance Metrics in PaddleVideo"}, "2705": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:1-20", "hash": "e4f7b4602590dac17ba3be4e8768e08f", "title": "Single Image AVA Evaluation"}, "2706": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:21-53", "hash": "23a3c0cb26b0a55a0c0bda5db3f00aae", "title": "Single Image Detection Metrics Evaluator"}, "2707": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:54-73", "hash": "d502d3818d8f686150bf5da03fe998bf", "title": "AVA Evaluation: True Positives, False Positives, and Ignored Detections"}, "2708": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:74-91", "hash": "612c979731201397a0f5a26908e94e16", "title": "AVA Metric Calculation"}, "2709": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:92-115", "hash": "4d251a5e2b18f0687d23b59456ef7947", "title": "Invalid Detection Box Removal for Object Evaluation"}, "2710": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:116-143", "hash": "ad1964a8c021d5906760cb257cef9fda", "title": "Per-Image AVA Evaluation"}, "2711": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:144-161", "hash": "6ca7396509a1e92b237fd4d57c123520", "title": "AVA Metrics: Per-Image Evaluation"}, "2712": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:162-183", "hash": "e0c2f822edf21bcc0192aea0eca7cddb", "title": "Checking and Storing Masks for AVA Evaluation"}, "2713": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:184-202", "hash": "4f163fbfd90c62393ee2e589cca16840", "title": "Per-Class Array Extraction"}, "2714": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:203-228", "hash": "26f609a06033269c662a04798ba4c0e4", "title": "Per-Image AVA Evaluation"}, "2715": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:229-249", "hash": "dd1806953f1450187381f4f69d5afd7f", "title": "Intersection Metrics: IoU and IoA"}, "2716": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:250-276", "hash": "aee100fed4d8244330fe22ba929c612a", "title": "Per-Image AVA Evaluation Labeling"}, "2717": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:277-295", "hash": "cca9f03519357fe4b5c1fdd02d1f879d", "title": "Difficult Box Evaluation"}, "2718": {"path": 
"/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:296-322", "hash": "e2ebcab5eaa91ac01a62bb42131c807f", "title": "True Positive Detection via IoU and Scores"}, "2719": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:323-344", "hash": "ef809d341a3fffa2bd6cd0c8b405e622", "title": "AVA Per-Image Evaluation"}, "2720": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:345-371", "hash": "b68c02569eb5aeec2d60ea2581cbd9eb", "title": "Class-Specific Array Retriever"}, "2721": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:372-392", "hash": "d9711057d2b9b381390bed9e2c2b7a7b", "title": "AVA Metrics Function"}, "2722": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:393-421", "hash": "3eda8828e90a291bdd2de4b6235810fd", "title": "Class-Specific Results Extraction and Invalid Box Removal"}, "2723": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:422-443", "hash": "d99beb397e3c0009cc4eae1a02ba3816", "title": "Filter and Slice Input Arrays"}, "2724": {"path": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:444-452", "hash": "01008177cbdb311510cd78b889098f38", "title": "Bounding Boxes and Scores"}, "2725": {"path": "/paddlevideo/metrics/ava_evaluation/standard_fields.py", "hash": "c495fddcb51e1dfe1876081872b8ef92", "title": "Standard Fields for AVA Evaluation"}, "2726": {"path": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:1-26", "hash": "e10be32f96e0250a7a9933d24371aa4f", "title": "Standard Object Detection Fields"}, "2727": {"path": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:27-46", "hash": "766a895b229c1530ee5f57084dad9e5a", "title": "Standard Fields for AVA Evaluation"}, "2728": {"path": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:47-66", "hash": "49c1db15dea29e1854841976599e8d2f", "title": "AVA Evaluation Dictionary Defined"}, "2729": {"path": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:67-86", "hash": "ef6bbc8af474726a6157e277c1050d0e", "title": "Standard AWA Field Definitions"}, "2730": {"path": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:89-113", "hash": "96c36ceaed1ef5a3160f9ddece712776", "title": "Standard Video Object Detector Naming Conventions"}, "2731": {"path": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:114-115", "hash": "93a1d41d0803b4aa247c17b45a03eda1", "title": "Standard Fields: Detection Metrics"}, "2732": {"path": "/paddlevideo/metrics/ava_metric.py", "hash": "f003e9d72cba4848b475f6a0edd07fcc", "title": "AVAMetric: PaddleVideo's Metric for Video Object Detection"}, "2733": {"path": "/paddlevideo/metrics/ava_metric.py:1-34", "hash": "cf35d782f526be7719cf6fab55e23b3f", "title": "AVAMetric: PaddleVideo Metric"}, "2734": {"path": "/paddlevideo/metrics/ava_metric.py:35-60", "hash": "d77a12377b4335c7aa061433b3ac431c", "title": "Video Metrics Initialization in PaddlePaddle"}, "2735": {"path": "/paddlevideo/metrics/ava_metric.py:61-90", "hash": "0760b8005ec394a8bd4122803b90a728", "title": "AVA Metrics Calculation and Logging Class"}, "2736": {"path": "/paddlevideo/metrics/ava_metric.py:92-92", "hash": "22be346acf5f8d463ed179b31d7eae6f", "title": "Class Method Returns Record List"}, "2737": {"path": "/paddlevideo/metrics/ava_utils.py", "hash": "3617204fb1ab28b433c6e173214bfe14", "title": "AVA Metrics Utilities for Video Object Detection"}, "2738": {"path": "/paddlevideo/metrics/ava_utils.py:1-31", "hash": "ab735f28535b4bfc057a2caaf8a03b04", "title": "AVA Metrics Evaluation Utilities"}, 
"2739": {"path": "/paddlevideo/metrics/ava_utils.py:32-64", "hash": "555db02ba40c2346588333d18563e69b", "title": "CSV Results Conversion Functions"}, "2740": {"path": "/paddlevideo/metrics/ava_utils.py:66-97", "hash": "3fd44ef430a94fd93b25123e094c15ad", "title": "Utility Functions for Video Analysis"}, "2741": {"path": "/paddlevideo/metrics/ava_utils.py:99-120", "hash": "3088bdcead2890ec1e82ed9ae8b2c33b", "title": "CSV to Dictionaries: AVA Metrics"}, "2742": {"path": "/paddlevideo/metrics/ava_utils.py:122-147", "hash": "d180a298640899bb3409181d60a10eb4", "title": "CSV Object Detection Results Merger"}, "2743": {"path": "/paddlevideo/metrics/ava_utils.py:149-181", "hash": "0f43571f4e7b0a9f5800a06b6f0e40ad", "title": "Excluding Images and Labelmap without Protocol Buffers"}, "2744": {"path": "/paddlevideo/metrics/ava_utils.py:182-210", "hash": "7df7e95ee5250090d39363f264f00686", "title": "Mean Average Precision for AVA Evaluation"}, "2745": {"path": "/paddlevideo/metrics/ava_utils.py:211-240", "hash": "e2a79632d12a4706ff561869e6563965", "title": "AVA Proposal Generation"}, "2746": {"path": "/paddlevideo/metrics/ava_utils.py:241-265", "hash": "2fe967d06cea07a8507520a3e43c4366", "title": "Average Recall and mAP Calculation"}, "2747": {"path": "/paddlevideo/metrics/ava_utils.py:266-286", "hash": "58b67a0887f57ac4d4e5d408e946526e", "title": "Single Image Pascal Evaluator Addition"}, "2748": {"path": "/paddlevideo/metrics/ava_utils.py:287-320", "hash": "508bdb76d31d4aed5cb529938898ffba", "title": "AVA Metrics Code Snippet"}, "2749": {"path": "/paddlevideo/metrics/ava_utils.py:323-357", "hash": "70eebbd7788eb01d382bfef501bf14e5", "title": "Collecting Results Across GPUs"}, "2750": {"path": "/paddlevideo/metrics/ava_utils.py:358-384", "hash": "e71802067a35cc49e0adeb9021fd3d10", "title": "AVA Evaluation Utils"}, "2751": {"path": "/paddlevideo/metrics/ava_utils.py:385-394", "hash": "f36b2a0750a2b99f5b176585f12968e5", "title": "Mean Average Precision Computation Code"}, "2752": {"path": "/paddlevideo/metrics/base.py", "hash": "1d9e08910c08f6a2a6a8f71e804e5b1d", "title": "PaddleVideo Metrics Base Class"}, "2753": {"path": "/paddlevideo/metrics/base.py:1-31", "hash": "9baa105e92338b3f573b74c1e5e19f4e", "title": "PaddleVideo Metrics Base Class Initialization"}, "2754": {"path": "/paddlevideo/metrics/base.py:33-52", "hash": "3c83471a1f603b972399f7f9ff439c7a", "title": "All-Gather and Concatenation Function"}, "2755": {"path": "/paddlevideo/metrics/bmn_metric.py", "hash": "85711ed6b2e42c607f5423085d1bc87f", "title": "BMN Metric for Paddle Video"}, "2756": {"path": "/paddlevideo/metrics/bmn_metric.py:1-32", "hash": "94b30cca892d0bef8e8e2ec8006885c8", "title": "Intersection over Union Calculation Code"}, "2757": {"path": "/paddlevideo/metrics/bmn_metric.py:33-63", "hash": "47b41a2b062fe7ee0bc5dbab893996de", "title": "Bounding Box Metrics Calculation"}, "2758": {"path": "/paddlevideo/metrics/bmn_metric.py:65-98", "hash": "6d249192a8a66cfec153ebabfec52568", "title": "BMN Metric: Object Detection Algorithm"}, "2759": {"path": "/paddlevideo/metrics/bmn_metric.py:99-127", "hash": "42f8f9840d29146a7a85c605f0d396b1", "title": "Initializing BMN Metric Class in PaddleVideo"}, "2760": {"path": "/paddlevideo/metrics/bmn_metric.py:128-156", "hash": "07b2a94594a0bf5168144bacf7f4e316", "title": "Class Variables and Metrics Initialization"}, "2761": {"path": "/paddlevideo/metrics/bmn_metric.py:157-182", "hash": "2b09ac4567a01518978e5a201469bc54", "title": "Boundary Detection Score Vector List Generation"}, "2762": {"path": 
"/paddlevideo/metrics/bmn_metric.py:183-206", "hash": "731e1f0bbc0ee8985e5d14bc9689b028", "title": "Post-Process Video Metrics Calculation"}, "2763": {"path": "/paddlevideo/metrics/bmn_metric.py:207-229", "hash": "16c4c4e8163fca2ed761d5ce17999537", "title": "Parallel Video Processing with bmn_post_Processing"}, "2764": {"path": "/paddlevideo/metrics/bmn_metric.py:230-256", "hash": "f508563769c5b97341ed9f022fcf6fdc", "title": "Parallel Video Processing with Multiprocessing"}, "2765": {"path": "/paddlevideo/metrics/bmn_metric.py:257-282", "hash": "0c06e10b8f22393d44730ea71344794b", "title": "Soft NMS Processing"}, "2766": {"path": "/paddlevideo/metrics/bmn_metric.py:283-304", "hash": "3d1d8a1a80a8626ffe241fd7e6d9f687", "title": "Calculate Metrics with ANETproposal"}, "2767": {"path": "/paddlevideo/metrics/build.py", "hash": "a358e2f5d18276139017e2df56bf08cb", "title": "Building Metrics with Apache License"}, "2768": {"path": "/paddlevideo/metrics/center_crop_metric.py", "hash": "3ea7e1390eac18ad559928d6cb1ad318", "title": "Center Crop Metric: PaddleVideo's Batch-Aware Class"}, "2769": {"path": "/paddlevideo/metrics/center_crop_metric.py:1-31", "hash": "a50c5c147e3896bdd9d7ba31385515d7", "title": "CenterCrop Metric Registration"}, "2770": {"path": "/paddlevideo/metrics/center_crop_metric.py:32-55", "hash": "9fca7238830c7c5f2917e9c0800ec57d", "title": "Batch-Initializing Metric for Multi-GPU Data"}, "2771": {"path": "/paddlevideo/metrics/center_crop_metric.py:56-79", "hash": "786f1356a03dc186ddaeb7f8dbadf446", "title": "Batch Processing Metric"}, "2772": {"path": "/paddlevideo/metrics/center_crop_metric_MRI.py", "hash": "961eaf76db5adc723e0f8f76ea3c60b0", "title": "Top-1/5 Accuracy Tracker"}, "2773": {"path": "/paddlevideo/metrics/center_crop_metric_MRI.py:1-33", "hash": "57ce59859113cd76cccf5c651db8611c", "title": "CenterCropMetric_MRI: Video Metric Class"}, "2774": {"path": "/paddlevideo/metrics/center_crop_metric_MRI.py:34-60", "hash": "d121ecf19865522c7454628bbdf961a8", "title": "Top-1/5 Accuracy Calculator"}, "2775": {"path": "/paddlevideo/metrics/center_crop_metric_MRI.py:61-61", "hash": "1b0a0709fdfa6c5881d12b814d0fab6c", "title": "Mean of Top-1 Accuracy"}, "2776": {"path": "/paddlevideo/metrics/depth_metric.py", "hash": "9a1b4f8d8e1441757a7b8268da8085b2", "title": "DepthMetric: Distributed Batch Processing"}, "2777": {"path": "/paddlevideo/metrics/depth_metric.py:1-34", "hash": "4bf0f160ce23b808075b501ee0ad5b43", "title": "Depth Metric: Distributed Computing"}, "2778": {"path": "/paddlevideo/metrics/depth_metric.py:35-57", "hash": "764ed528aaf28eea14df0710ca145988", "title": "Distributed All-Reduce Metrics Averaging"}, "2779": {"path": "/paddlevideo/metrics/depth_metric.py:58-77", "hash": "8bc2a5ca2b1436111f07420f96bdaeaf", "title": "Batch Processing and Metric Accumulation"}, "2780": {"path": "/paddlevideo/metrics/msrvtt_metric.py", "hash": "cd373e941ad00646bf78f8a8ba7bb818", "title": "MSR-VTT Metrics Computation"}, "2781": {"path": "/paddlevideo/metrics/msrvtt_metric.py:1-31", "hash": "789ac39b7dce364509b77bb878738f16", "title": "MSRVTT Metric Initialization"}, "2782": {"path": "/paddlevideo/metrics/msrvtt_metric.py:32-56", "hash": "0effc52f070411a97b93852cc5a04015", "title": "MSR-VTT Rank Metrics Calculator"}, "2783": {"path": "/paddlevideo/metrics/msrvtt_metric.py:57-62", "hash": "076fcf90b2e995befac29db89ba77eb9", "title": "MSRVTT Metric Accumulator"}, "2784": {"path": "/paddlevideo/metrics/multi_crop_metric.py", "hash": "e54bc81b2bbe99db2c87fb98e2e84171", "title": "Multi-Crop 
Metric in PaddleVideo"}, "2785": {"path": "/paddlevideo/metrics/multi_crop_metric.py:1-35", "hash": "68af44846195a615fe2102049edce01f", "title": "MultiCrop Metric: PaddleVideo Class"}, "2786": {"path": "/paddlevideo/metrics/multi_crop_metric.py:36-61", "hash": "241679c8a2aa523211583883125a8e5f", "title": "Multi-Crop Metric Initialization"}, "2787": {"path": "/paddlevideo/metrics/multi_crop_metric.py:62-83", "hash": "0ca2bb7a19e230dbeea0a4a9f8c26b15", "title": "Multi-Crop Ensemble Metric"}, "2788": {"path": "/paddlevideo/metrics/multi_crop_metric.py:84-104", "hash": "c4392530943d3e5587687ae58b0b692c", "title": "Multi-Crop Metric Calculation"}, "2789": {"path": "/paddlevideo/metrics/multi_crop_metric.py:105-108", "hash": "a6fd9fc7d83fac8cb8b398a7e967610a", "title": "Multi-Crop Metric Average Accuracy Logging"}, "2790": {"path": "/paddlevideo/metrics/recall.py", "hash": "dc0ebb076210ec5e15297eca4edade46", "title": "Paddle Video Recall Metrics Calculation"}, "2791": {"path": "/paddlevideo/metrics/recall.py:1-27", "hash": "c701156b0a43f569e3dac0cc37d72736", "title": "PaddleRecall: Object Detection Recall Calculator"}, "2792": {"path": "/paddlevideo/metrics/recall.py:29-62", "hash": "28fb53c75a39211b5f30fcba32b35d47", "title": "Precision-Recall Curve Calculation"}, "2793": {"path": "/paddlevideo/metrics/recall.py:64-84", "hash": "cb11171bef0fae4751bb44a71f6c64bc", "title": "Object Detection Recall Calculator"}, "2794": {"path": "/paddlevideo/metrics/registry.py", "hash": "df96b61f43a6063fd36b4ec66ff63de9", "title": "Registry-Based Metrics Management"}, "2795": {"path": "/paddlevideo/metrics/segmentation_metric.py", "hash": "cc3baa4548149e4a7979a7c2ae529943", "title": "Label Change Detection Metric"}, "2796": {"path": "/paddlevideo/metrics/segmentation_metric.py:1-35", "hash": "769e6aecd931f1358818625850caddac", "title": "Segmentation Metric Function"}, "2797": {"path": "/paddlevideo/metrics/segmentation_metric.py:36-57", "hash": "7b277b0d1dc994a3ac42bfb3359a59a1", "title": "Segmentation Score Calculator"}, "2798": {"path": "/paddlevideo/metrics/segmentation_metric.py:58-91", "hash": "ea10763a94b38cb3eb84a6ea7f981b9e", "title": "Segmentation Metric: Labeling and Distance Calculation"}, "2799": {"path": "/paddlevideo/metrics/segmentation_metric.py:92-126", "hash": "e1102702ca11126753ec11d1743a419e", "title": "Levenstein Distance for Video Segmentation"}, "2800": {"path": "/paddlevideo/metrics/segmentation_metric.py:128-161", "hash": "ec235f7ebab22cad166778066373b395", "title": "Segmentation Metric: Precision, Recall, F1"}, "2801": {"path": "/paddlevideo/metrics/segmentation_metric.py:162-191", "hash": "c1e4ca0fb704718ba88ec32bb443bf4e", "title": "Refining Object Detection Proposals"}, "2802": {"path": "/paddlevideo/metrics/segmentation_metric.py:192-230", "hash": "cf28309fa121daf755db64382e9af7b0", "title": "Average Recall Calculation for Video Segmentation"}, "2803": {"path": "/paddlevideo/metrics/segmentation_metric.py:231-264", "hash": "26ead2af1e4b4af9bb2a75e4026d7a50", "title": "Segmentation Metric Initialization"}, "2804": {"path": "/paddlevideo/metrics/segmentation_metric.py:265-295", "hash": "a2bb5163f5b654b10177fad5ffa5f7be", "title": "Accuracy Calculation via Segmentation"}, "2805": {"path": "/paddlevideo/metrics/segmentation_metric.py:296-330", "hash": "b48f7816e0d784c57dcb7fd78ef3710f", "title": "Segmentation Metrics Accumulation"}, "2806": {"path": "/paddlevideo/metrics/segmentation_metric.py:331-356", "hash": "9079930823437473b2514afba1a8b82a", "title": "Segmentation Metrics 
Calculator"}, "2807": {"path": "/paddlevideo/metrics/segmentation_metric.py:358-385", "hash": "fedd9d20baae84b1ff156f4cfbce746b", "title": "Segmentation Metric Calculator"}, "2808": {"path": "/paddlevideo/metrics/segmentation_metric.py:386-389", "hash": "e1e5363e9e346736b7493fead5ed22a2", "title": "Initialize Proposal Metrics List"}, "2809": {"path": "/paddlevideo/metrics/skeleton_metric.py", "hash": "36030e3b8d5b889bdc7ed01b4b45e618", "title": "SkeletonMetric: PaddleVideo's Skeleton-Based Metric Tool"}, "2810": {"path": "/paddlevideo/metrics/skeleton_metric.py:1-38", "hash": "f186f301073b5eb6920d1847c12687ff", "title": "Skeleton Metric Calculator"}, "2811": {"path": "/paddlevideo/metrics/skeleton_metric.py:39-65", "hash": "2a5ac263d90ff3ee29764899253c8f48", "title": "Metrics Tracking Class"}, "2812": {"path": "/paddlevideo/metrics/skeleton_metric.py:66-88", "hash": "76b8cdf90efdd64282869c8bb803fa1d", "title": "Accuracy Calculator"}, "2813": {"path": "/paddlevideo/metrics/skeleton_metric.py:89-96", "hash": "0c79bf0b7ff3bbdd9c0ea469ca1ffb46", "title": "Save Values to File: Skeleton Metric Logging"}, "2814": {"path": "/paddlevideo/metrics/transnetv2_metric.py", "hash": "c25f88b14f22c9dc6eebb75f2f77d5b4", "title": "TransNetV2 Metric Calculator"}, "2815": {"path": "/paddlevideo/metrics/transnetv2_metric.py:1-34", "hash": "b3ec9ed345f302a5daa7f87cd58bfbf8", "title": "Predictions to Scenes: Identifying Scene Changes"}, "2816": {"path": "/paddlevideo/metrics/transnetv2_metric.py:35-57", "hash": "1a81d85ea59bb73a3b34d9ef92d7d837", "title": "Transnet V2 Metric Conversion"}, "2817": {"path": "/paddlevideo/metrics/transnetv2_metric.py:58-80", "hash": "2ba3e9cb23aa5891afd03847277f64f0", "title": "TransNet V2 Metric Calculation"}, "2818": {"path": "/paddlevideo/metrics/transnetv2_metric.py:81-120", "hash": "29f867d8def7ff10ba52464fab4e3cfe", "title": "Transnetv2 Metric Calculator"}, "2819": {"path": "/paddlevideo/metrics/transnetv2_metric.py:121-152", "hash": "6474954f70fac29d28de0a85a249938f", "title": "TransNetV2 Metric Calculation"}, "2820": {"path": "/paddlevideo/metrics/transnetv2_metric.py:153-174", "hash": "64ecb25dfa39362a3a69500810d05c5a", "title": "Machine Learning Metric Calculator"}, "2821": {"path": "/paddlevideo/metrics/ucf24_utils.py", "hash": "2157170f291aa52e6d4853b80dca73b4", "title": "UCF24 Metrics: PaddleVideo Utility Functions"}, "2822": {"path": "/paddlevideo/metrics/ucf24_utils.py:1-33", "hash": "8f83d13f2d62d07bf3fdd4117da47480", "title": "Average Precision Metrics in UCF101 Dataset"}, "2823": {"path": "/paddlevideo/metrics/ucf24_utils.py:34-81", "hash": "56cdf272548b1f75432ee2807c74e198", "title": "Bounding Box Converter Utilities"}, "2824": {"path": "/paddlevideo/metrics/ucf24_utils.py:82-122", "hash": "ac3dea98ba843de71b1a709216743f82", "title": "Absolute Bounding Box Conversion and Visualization"}, "2825": {"path": "/paddlevideo/metrics/ucf24_utils.py:123-148", "hash": "fa8b0d8fb42db92ba237c57f9df8f418", "title": "Draw Text Box Around Rectangle"}, "2826": {"path": "/paddlevideo/metrics/ucf24_utils.py:149-165", "hash": "4de895be70e37614469bc971bbb7e88b", "title": "Ucf24Metrics Constructor"}, "2827": {"path": "/paddlevideo/metrics/ucf24_utils.py:166-181", "hash": "213e25c9bd8b71fc986db1f1e1149f58", "title": "BoundingBox Class Definition"}, "2828": {"path": "/paddlevideo/metrics/ucf24_utils.py:183-207", "hash": "b83d900e14098f6403bf30137d7c15eb", "title": "Relative to Absolute Bounding Box Conversion"}, "2829": {"path": "/paddlevideo/metrics/ucf24_utils.py:208-232", "hash": 
"41304ae0094f57f3d303728020d12ce2", "title": "Bounding Box Class for Image Formats"}, "2830": {"path": "/paddlevideo/metrics/ucf24_utils.py:233-266", "hash": "be2c8a317365d979ac4d82c54a957914", "title": "Detection Result Class with Compare Method"}, "2831": {"path": "/paddlevideo/metrics/ucf24_utils.py:268-294", "hash": "ac24812e6083d1467f47257736b8bac8", "title": "Bounding Box Comparison and Cloning"}, "2832": {"path": "/paddlevideo/metrics/ucf24_utils.py:297-332", "hash": "59e6afd2737173b80708494702ab59b1", "title": "Bounding Box Collection Class"}, "2833": {"path": "/paddlevideo/metrics/ucf24_utils.py:333-359", "hash": "cad22b93369cb4aaecad9932a7842728", "title": "Bounding Box Utilities"}, "2834": {"path": "/paddlevideo/metrics/ucf24_utils.py:360-380", "hash": "9d1ec6c440e702988eae342c2c600cb4", "title": "Pascal VOC Metrics Calculation"}, "2835": {"path": "/paddlevideo/metrics/ucf24_utils.py:381-397", "hash": "1f25f6730ff578263eb625d94c17c417", "title": "Class Metrics List"}, "2836": {"path": "/paddlevideo/metrics/ucf24_utils.py:398-422", "hash": "12b72114ab3dd25ea9b81532dd467432", "title": "Detection Metrics Initialization and Sorting"}, "2837": {"path": "/paddlevideo/metrics/ucf24_utils.py:423-445", "hash": "65182bdc7393a8e801474b0b51cf2aad", "title": "Detection Metrics Calculation"}, "2838": {"path": "/paddlevideo/metrics/ucf24_utils.py:446-465", "hash": "667fb1e97d7a481ab207a186d781dfa4", "title": "Precision, Recall, Average Precision Calculation"}, "2839": {"path": "/paddlevideo/metrics/ucf24_utils.py:466-495", "hash": "cfc804b7d4dada4a9c2deeb87d58c0dd", "title": "Calculate Average Precision for Classes"}, "2840": {"path": "/paddlevideo/metrics/ucf24_utils.py:496-523", "hash": "caa052cd196d1b8565a5eea5f75f8dcd", "title": "Interpolated Average Precision Calculation"}, "2841": {"path": "/paddlevideo/metrics/ucf24_utils.py:524-553", "hash": "641e154e93431fb71f847a1191545158", "title": "Calculating AP, AUC, and IoU in Video Metrics"}, "2842": {"path": "/paddlevideo/metrics/ucf24_utils.py:554-583", "hash": "e11d308d637b2c4c64bb32faa4bd56bf", "title": "Bounding Box Intersection Utility"}, "2843": {"path": "/paddlevideo/metrics/ucf24_utils.py:584-617", "hash": "b01d038dde77f5e284c1b52754d1296f", "title": "Bounding Box Intersection and Union Calculation"}, "2844": {"path": "/paddlevideo/metrics/ucf24_utils.py:618-648", "hash": "689bfc6c7bacd37750eca4b5c5316828", "title": "Validate Image Size and Coordinate Type Functions"}, "2845": {"path": "/paddlevideo/metrics/ucf24_utils.py:649-680", "hash": "2bc35449f2ceac4b8cb13230689083b6", "title": "Bounding Box Reader Function"}, "2846": {"path": "/paddlevideo/metrics/ucf24_utils.py:681-711", "hash": "828521da23a92b44cc6f81cf20752a6a", "title": "Bounding Box Analyzer"}, "2847": {"path": "/paddlevideo/metrics/ucf24_utils.py:712-743", "hash": "a0b46170c985b74b40a07d17963ae654", "title": "Mean Average Precision Calculator"}, "2848": {"path": "/paddlevideo/metrics/ucf24_utils.py:744-772", "hash": "53986650f001acb684a35d1d2a30559b", "title": "Average Precision Calculation"}, "2849": {"path": "/paddlevideo/metrics/ucf24_utils.py:773-783", "hash": "b45d64bedcd6d6b15c401d66096b9212", "title": "Mean Average Precision Calculator"}, "2850": {"path": "/paddlevideo/metrics/vos_metric.py", "hash": "915a8759484e2e5ef33f0869381073e5", "title": "VOS Metric: Video Object Segmentation"}, "2851": {"path": "/paddlevideo/metrics/vos_metric.py:1-38", "hash": "62eda92996aa477431934c9849c2bf53", "title": "VOS Metric: PaddleVideo Segmentation"}, "2852": {"path": 
"/paddlevideo/metrics/vos_metric.py:39-68", "hash": "200b31defdeca55bf28f4e16a0ba0ed4", "title": "VOS Metric Class Initialization"}, "2853": {"path": "/paddlevideo/metrics/vos_metric.py:69-91", "hash": "99ab5eec36253eac7628ecaff2875012", "title": "Data Loading and Processing Loop"}, "2854": {"path": "/paddlevideo/metrics/vos_metric.py:93-113", "hash": "3a2e0d9c42608d86e2ce09b8397a34d4", "title": "Prepare Data for Video Object Detection Model"}, "2855": {"path": "/paddlevideo/metrics/vos_metric.py:115-129", "hash": "f1be161f171716b96d7aecacb46b8eab", "title": "Introducing New Labels in VOS Metric"}, "2856": {"path": "/paddlevideo/metrics/vos_metric.py:131-147", "hash": "e8380516825c6b1c32f561e898a01711", "title": "Average Max Prediction"}, "2857": {"path": "/paddlevideo/metrics/vos_metric.py:149-168", "hash": "0cb7def920d404e362d76f262c64b2d1", "title": "Frame-wise Mask Updating and Timing"}, "2858": {"path": "/paddlevideo/metrics/vos_metric.py:169-191", "hash": "12ff7e0c1c3831f4bee116459e773a33", "title": "Average Time per Frame Calculator"}, "2859": {"path": "/paddlevideo/metrics/vos_metric.py:192-209", "hash": "ffda2c06adcac07ac0ecaca236fafa4e", "title": "Flip and Save Mask Tensor"}, "2860": {"path": "/paddlevideo/metrics/vos_metric.py:210-222", "hash": "d5b679eb56f5acc8bdb7e72e4a9d8e01", "title": "Unknown Variable Range Identification"}, "2861": {"path": "/paddlevideo/metrics/vos_metric.py:223-236", "hash": "cf513ebb9c2f6bedcb32d5f885507c61", "title": "Consecutive Integers Range"}, "2862": {"path": "/paddlevideo/metrics/vos_metric.py:237-250", "hash": "0a2d4661faa1494e3dae1431ca6859a5", "title": "Frame Metrics Analysis"}, "2863": {"path": "/paddlevideo/metrics/vos_metric.py:251-272", "hash": "ce5736294af41c423d5a5d0924b2671e", "title": "Masking and Saving Images in PaddleVideo Metrics"}, "2864": {"path": "/paddlevideo/metrics/vos_metric.py:273-276", "hash": "771f0b745f83fe03c9a184900e806ed8", "title": "Metrics Calculation Class Zip Savior"}, "2865": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py", "hash": "41aadc62c802f8fcdc4267be85ef557d", "title": "Average Precision Calculator for VOD"}, "2866": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:1-23", "hash": "2f02e25004c76d711a83db8d544f1f83", "title": "Interpolated Average Precision Calculator"}, "2867": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:25-55", "hash": "a35e75c835d69b4261b9b5f97f665ebe", "title": "Average Precision Calculator for Long Lists"}, "2868": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:57-86", "hash": "07ab196c211ee40a6b43b96f0626efcb", "title": "Average Precision Calculator Class"}, "2869": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:87-108", "hash": "fb9cab3f4d396b62ca789bce97a170ed", "title": "Average Precision Calculator"}, "2870": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:109-134", "hash": "f230c6ba4def426c0691906f666417a5", "title": "Average Precision Calculator"}, "2871": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:136-166", "hash": "6f58dc07af8cb38188b90ce5fc61ef44", "title": "Non-Interpolated Average Precision Calculator"}, "2872": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:168-192", "hash": "cc6778d11aea92ab1e3edf1381b90876", "title": "Non-Interpolated Average Precision Calculator"}, "2873": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:193-220", "hash": 
"32ff75ab9498acb3cb5ed8c5ce61e943", "title": "Non-Interpolated Average Precision Calculator"}, "2874": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:221-256", "hash": "506ea1b5420db66da24bff18b2ecb7a5", "title": "Average Precision Calculator"}, "2875": {"path": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:257-274", "hash": "cf50a45f26bc42e41054c14df54c7691", "title": "Normalized Predictions for Average Precision"}, "2876": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py", "hash": "02429df6c99f35dcd61f8564dfbcb3f4", "title": "Paddlevideo Metrics Evaluation Utility"}, "2877": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:1-29", "hash": "6ba64c18b0159b6415daec92ef65f97c", "title": "Eval Util for Model Metrics"}, "2878": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:32-60", "hash": "a749d663a39c044a19963e816a717668", "title": "Video-level Annotation Precision"}, "2879": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:61-90", "hash": "88ee4e53916e83bf6ea5aa7d17167814", "title": "Video Average Precision Calculator"}, "2880": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:91-116", "hash": "42b217af7d7b320cadfe13731c1e73d7", "title": "Global Average Precision Calculation"}, "2881": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:117-137", "hash": "259858e9971ced1b918f07f44441dca6", "title": "Top-k Triplet Prediction Evaluation"}, "2882": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:138-167", "hash": "3484d024d6ab08d3b961e0280c0bb6c3", "title": "Top-K Prediction Evaluation Metrics"}, "2883": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:169-193", "hash": "5812363db00851131bddfb5e14a2d737", "title": "HitOneMetric: Evaluating Metrics in Video Prediction Task"}, "2884": {"path": "/paddlevideo/metrics/youtube8m/eval_util.py:194-205", "hash": "aa6284061eaa228dd7b47c8fcd3a5ae1", "title": "Calculating Gap in YouTube8m Evaluation"}, "2885": {"path": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py", "hash": "3a6b52474e0f8fc1b0ef29272e8d9fc1", "title": "Mean Average Precision Calculator"}, "2886": {"path": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:1-27", "hash": "cc511a80842322b8b997ba14ce6fff30", "title": "Mean Average Precision Calculator"}, "2887": {"path": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:28-59", "hash": "a6761286bf12f710723f2473bc33f1e6", "title": "Binary Classification Dataset Generation"}, "2888": {"path": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:60-80", "hash": "70f1d4ba19f0ed433c69a2a07198faf3", "title": "Mean Average Precision Calculator for Video Classification"}, "2889": {"path": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:81-112", "hash": "3a58d9388b273aa2da74b4c2aa9c1fcb", "title": "Mean Average Precision Calculator"}, "2890": {"path": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:113-114", "hash": "a736d8805b15e5b715f26733c4294cdc", "title": "Mean Average Precision Calculator"}, "2891": {"path": "/paddlevideo/metrics/yowo_metric.py", "hash": "7c0e54f1c7d0481a9c7bff57bf53be76", "title": "YOWO Metric Integration in PaddleVideo"}, "2892": {"path": "/paddlevideo/metrics/yowo_metric.py:1-30", "hash": "d53bcb6c7ac6a890b3a6ff894dda3d2e", "title": "YOWOMetric: PaddleVideo Metrics"}, "2893": {"path": "/paddlevideo/metrics/yowo_metric.py:31-62", "hash": "1d53a2b1c96df648d435eda21733e10b", "title": "BMN Metrics Initialization and Update"}, 
"2894": {"path": "/paddlevideo/metrics/yowo_metric.py:63-82", "hash": "0f345694c671f8e25c72442650bb6718", "title": "YOLOv5 Box Metrics Accumulator"}, "2895": {"path": "/paddlevideo/modeling/__init__.py", "hash": "488966a787e392178e2b8543857b5772", "title": "Video Recognition Modeling in PaddleVideo"}, "2896": {"path": "/paddlevideo/modeling/__init__.py:1-22", "hash": "3d751e0507daeeefbf85a20d6bb13105", "title": "PaddleVideo Modeling Library"}, "2897": {"path": "/paddlevideo/modeling/__init__.py:23-37", "hash": "d90466a4e75e22f691d9df1d19e4ec38", "title": "Initializing PaddleVideo Models and Functions"}, "2898": {"path": "/paddlevideo/modeling/assigners/__init__.py", "hash": "b84c04f109097502374e3de072b6161f", "title": "Importing MaxIoUAssignerAVA in PaddleVideo"}, "2899": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py", "hash": "4bc679cc0efdb7df18b1405f90605397", "title": "MaxIOUAssignerAVA: Assigning Results Efficiently"}, "2900": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:1-27", "hash": "1646824dbf7c1b4c3e2e7338b9a0610f", "title": "Max IoU Assigner AVA Class"}, "2901": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:28-49", "hash": "7a8c5854ee2f3ea1352f9fc773ddb8f1", "title": "MaxIoUAssignerAVA Initialization"}, "2902": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:50-75", "hash": "8b9bd9c31f2ef8a68068ba0c931470ee", "title": "Max IOU Assigner: Assigning GT Boxes"}, "2903": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:76-93", "hash": "e8e26cce699dee0caf2d33f22aec707e", "title": "Max IoU Assigner"}, "2904": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:94-109", "hash": "d1977482b366ef6a5ecdcf327375519a", "title": "Max IOU Assigner Algorithm"}, "2905": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:110-126", "hash": "7bcf4f22ab1f730c928fbadc100a7ea7", "title": "Max IOU Assigner: AVA Dataset Handling"}, "2906": {"path": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:127-148", "hash": "cbad570eaa675dcecea0e27a7f26a71b", "title": "Max IOU Assigner Implementation"}, "2907": {"path": "/paddlevideo/modeling/backbones/__init__.py", "hash": "e23df3fde12adb8739fd070328e85b88", "title": "Versatile Backbone Models in PaddleVideo"}, "2908": {"path": "/paddlevideo/modeling/backbones/__init__.py:1-27", "hash": "bc17074d4cf04323d788071bc1bcb773", "title": "Backbone Models Initialization in PaddleVideo"}, "2909": {"path": "/paddlevideo/modeling/backbones/__init__.py:28-55", "hash": "85d40b2dbb2e4e247d3f9fe343c3c437", "title": "PaddleVideo Backbone Models"}, "2910": {"path": "/paddlevideo/modeling/backbones/__init__.py:56-60", "hash": "fbeb11233a6ed72a3f2f192782528d2e", "title": "PaddleVideo Backbones List"}, "2911": {"path": "/paddlevideo/modeling/backbones/actbert.py", "hash": "6868ab9c78271cb980934dc163c8d96b", "title": "Multimodal BERT Embeddings for Video Action Recognition"}, "2912": {"path": "/paddlevideo/modeling/backbones/actbert.py:1-32", "hash": "0da4c8a154f652e9d5e532ad4de17370", "title": "PaddlePaddle BertEmbeddings Class"}, "2913": {"path": "/paddlevideo/modeling/backbones/actbert.py:33-52", "hash": "f6167c5a10c1bf90e5fa85ba4d3639a1", "title": "ActBERT Embeddings Initialization"}, "2914": {"path": "/paddlevideo/modeling/backbones/actbert.py:53-75", "hash": "8508f305bf5abc4ad9ca7bcb8fa63b62", "title": "ActBert: Video Action Recognition Backbone"}, "2915": {"path": "/paddlevideo/modeling/backbones/actbert.py:76-100", "hash": 
"27045e1f4524435a9f11a3f9aa9972f2", "title": "Bert Self-Attention and Embedding Layers"}, "2916": {"path": "/paddlevideo/modeling/backbones/actbert.py:101-123", "hash": "4d2230a96135f66e2b7930f9b58db21e", "title": "BertSelfAttention Class Definition"}, "2917": {"path": "/paddlevideo/modeling/backbones/actbert.py:124-144", "hash": "401783c08483e2c4a9d4272086706a40", "title": "Multi-Head Attention in ACT-BERT"}, "2918": {"path": "/paddlevideo/modeling/backbones/actbert.py:146-169", "hash": "516a7b040a7649397d45ba0ef830bce2", "title": "BertSelfOutput Layer Implementation"}, "2919": {"path": "/paddlevideo/modeling/backbones/actbert.py:170-192", "hash": "3e66128fc157f6162117135e8daf4dc6", "title": "ActBert: Transformer Backbone Model"}, "2920": {"path": "/paddlevideo/modeling/backbones/actbert.py:193-219", "hash": "4ce208b9f79524877d15c74678f43c1c", "title": "Attention-Based Transformer with Dropout"}, "2921": {"path": "/paddlevideo/modeling/backbones/actbert.py:220-246", "hash": "b963a7a280dcfe446ecb66cbdcdb9351", "title": "BertEntAttention: Vision Attention Class"}, "2922": {"path": "/paddlevideo/modeling/backbones/actbert.py:247-267", "hash": "0af87eb02d10fa64c49a32321bdb9448", "title": "Self-Attention Layers in ACTBERT"}, "2923": {"path": "/paddlevideo/modeling/backbones/actbert.py:268-299", "hash": "91f6cae682855dde6c490a7466cfc01d", "title": "Attention Mechanism for Vision and Text"}, "2924": {"path": "/paddlevideo/modeling/backbones/actbert.py:300-321", "hash": "5fdef2334285520932dbe311eea0a0a0", "title": "Multi-Head Attention Operation in ActBERT"}, "2925": {"path": "/paddlevideo/modeling/backbones/actbert.py:322-342", "hash": "247cd7c7fcc31e9ce2e58f0d95b0d0ef", "title": "Dropout-based Attention Scoring in ActBERT"}, "2926": {"path": "/paddlevideo/modeling/backbones/actbert.py:343-361", "hash": "206d2f979f233a9f40e205802350ac8e", "title": "Multi-scale Context Fusion in ActBERT"}, "2927": {"path": "/paddlevideo/modeling/backbones/actbert.py:363-381", "hash": "6c0d16d57d500a4f38fba2cf55cf2e2b", "title": "Cross-Attention in Transformers"}, "2928": {"path": "/paddlevideo/modeling/backbones/actbert.py:382-409", "hash": "0324b9057e289628cf0d5e526d22c64c", "title": "BertEntOutput: Layer Normalization and Dropout"}, "2929": {"path": "/paddlevideo/modeling/backbones/actbert.py:410-440", "hash": "004a5a04d4724ab3ae3d8b830c8904dc", "title": "Attention-Based Bert Layer with Dropout"}, "2930": {"path": "/paddlevideo/modeling/backbones/actbert.py:441-461", "hash": "3053dc4dab5104d84ab0c5ae1046d4e9", "title": "Bert Layer and Connection Layer Classes"}, "2931": {"path": "/paddlevideo/modeling/backbones/actbert.py:462-487", "hash": "b318be98785bb70dfb5057e2b4984306", "title": "BertConnectionLayer Initialization"}, "2932": {"path": "/paddlevideo/modeling/backbones/actbert.py:488-512", "hash": "f7f3c5b7b5336ece47d32829d80253a2", "title": "ActBERT Input Streams Model"}, "2933": {"path": "/paddlevideo/modeling/backbones/actbert.py:513-539", "hash": "5ccbd2d95009116baf564dd1bbeb7f11", "title": "Compute Layer Outputs for ActBert Pathways"}, "2934": {"path": "/paddlevideo/modeling/backbones/actbert.py:540-576", "hash": "d3768e90a4d154ad869199e50399627d", "title": "BertEncoder: Initializing BERT Encoder Parameters"}, "2935": {"path": "/paddlevideo/modeling/backbones/actbert.py:577-594", "hash": "a11d0cf26247deb0f77595da57609992", "title": "ACT Bert Layer Initialization"}, "2936": {"path": "/paddlevideo/modeling/backbones/actbert.py:595-622", "hash": "390c0f1859130aab834a47746c442a27", "title": "ActBERT: 
Multimodal Model for Text, Vision, and Action Embeddings"}, "2937": {"path": "/paddlevideo/modeling/backbones/actbert.py:623-645", "hash": "701ae7d374bb6f89d0f4de985df60610", "title": "Initializing Encoder Layers in ActBERT Model"}, "2938": {"path": "/paddlevideo/modeling/backbones/actbert.py:647-669", "hash": "db74773ac87ad2310a98f6f23a50fac7", "title": "Multi-Modal Embedding with Attention Probs"}, "2939": {"path": "/paddlevideo/modeling/backbones/actbert.py:670-693", "hash": "f9f0921d3cfed8612001abe2903146be", "title": "ActBERT Encoder Layers"}, "2940": {"path": "/paddlevideo/modeling/backbones/actbert.py:695-729", "hash": "656c4876d61ec18972c255e67cbd4a90", "title": "ActBert Pooler Class and Model Initialization"}, "2941": {"path": "/paddlevideo/modeling/backbones/actbert.py:730-759", "hash": "a92fa2016e3abb4677a08855e2ed49f2", "title": "Customized Bert Model Initialization"}, "2942": {"path": "/paddlevideo/modeling/backbones/actbert.py:760-775", "hash": "80f970dd467326a29798858e9456b668", "title": "ACTBERT: Multi-modal Action Model Initiation"}, "2943": {"path": "/paddlevideo/modeling/backbones/actbert.py:776-800", "hash": "c7a01c9ce07bc213c28f164d9dea49e5", "title": "ActBERT Model: Encoding Text, Action, and Visual Features"}, "2944": {"path": "/paddlevideo/modeling/backbones/actbert.py:801-819", "hash": "48a0849453a67cf70935f208da4c8bb1", "title": "Mask Generation for ActBERT"}, "2945": {"path": "/paddlevideo/modeling/backbones/actbert.py:820-838", "hash": "439d0e21ccbb3bd418d0bdc8b6db5fb6", "title": "ACTBERT Extended Mask Creation"}, "2946": {"path": "/paddlevideo/modeling/backbones/actbert.py:839-865", "hash": "18247f82a70c14144562dbbdce7981bc", "title": "Multimodal ACTBERT Backbone Encoding"}, "2947": {"path": "/paddlevideo/modeling/backbones/actbert.py:866-892", "hash": "27fbe39f4a3a2ec18a7f4b709a597825", "title": "BERT Prediction Heads: Transform and Classes"}, "2948": {"path": "/paddlevideo/modeling/backbones/actbert.py:893-909", "hash": "47296f887c6b1cbac796cd29f4158c94", "title": "BertLMPredictionHead Initialization"}, "2949": {"path": "/paddlevideo/modeling/backbones/actbert.py:910-937", "hash": "2ea52f96071119c4e8d951ce2902b092", "title": "Attention and Feedforward in BERT"}, "2950": {"path": "/paddlevideo/modeling/backbones/actbert.py:938-956", "hash": "bc24dba75be517628bdb518d768b6cbc", "title": "BertPreTrainingHeads Class Initialization"}, "2951": {"path": "/paddlevideo/modeling/backbones/actbert.py:957-986", "hash": "07d102fb8cefd77770875a692ed1792f", "title": "Multi-Modal ACT-BERT Model"}, "2952": {"path": "/paddlevideo/modeling/backbones/actbert.py:987-1018", "hash": "e9b36157b48b1254e5ca3c79f9be1d70", "title": "Custom ACT-BERT Backbone Model for Multi-Modality"}, "2953": {"path": "/paddlevideo/modeling/backbones/actbert.py:1019-1034", "hash": "18964dbc2212b0fdc2adf937cdff7c36", "title": "ActBERT Input Parameters"}, "2954": {"path": "/paddlevideo/modeling/backbones/actbert.py:1035-1047", "hash": "43bf1835e40fbc7da8cec0055dab8ca4", "title": "Fixed BertLayer Parameters"}, "2955": {"path": "/paddlevideo/modeling/backbones/actbert.py:1048-1058", "hash": "a398bb6137bd244772b6d176ae4081f3", "title": "Transformer Model Default Parameters"}, "2956": {"path": "/paddlevideo/modeling/backbones/actbert.py:1059-1092", "hash": "9e7c573ffb8192880e187a80133e7937", "title": "Initialize ActBert Model"}, "2957": {"path": "/paddlevideo/modeling/backbones/actbert.py:1093-1116", "hash": "2019b29f5a0deee118c2f6d4bb1756f9", "title": "ActBERT Model Initialization"}, "2958": {"path": 
"/paddlevideo/modeling/backbones/actbert.py:1117-1137", "hash": "57bfd678e702fffda36aa6990f4584b1", "title": "ActBERT Input Layout"}, "2959": {"path": "/paddlevideo/modeling/backbones/actbert.py:1138-1158", "hash": "b59dea233adfd2192b6ae64ee289456a", "title": "ActBERT Function: Multimodal Prediction and Sequence Relationship"}, "2960": {"path": "/paddlevideo/modeling/backbones/adds.py", "hash": "37a2febfef9dfc56aef7fed1e0f389b0", "title": "PaddleVideo: Enhanced Modeling Backbones"}, "2961": {"path": "/paddlevideo/modeling/backbones/adds.py:1-30", "hash": "54f52d8306dc94f279ebc0155ace07ff", "title": "PaddlePaddle Backbones Registration"}, "2962": {"path": "/paddlevideo/modeling/backbones/adds.py:31-67", "hash": "b357e332304dc55ee759c91465389a57", "title": "Depth Prediction and Feature Extraction Functions"}, "2963": {"path": "/paddlevideo/modeling/backbones/adds.py:68-104", "hash": "526bf54fda523ad880d1244864553bf1", "title": "Transpose Conv with BatchNorm and Activation"}, "2964": {"path": "/paddlevideo/modeling/backbones/adds.py:105-151", "hash": "e7c2c24414313e018c7f03863edb5e30", "title": "Transformation Matrix Conversion"}, "2965": {"path": "/paddlevideo/modeling/backbones/adds.py:152-188", "hash": "16e6e974d0a2c7d1095be58785cc71c7", "title": "Rotation Operations on 3D Vectors"}, "2966": {"path": "/paddlevideo/modeling/backbones/adds.py:189-231", "hash": "3848705d7a802a58281f561dbb3ff202", "title": "Efficient Disparity Smoothness Loss Calculation"}, "2967": {"path": "/paddlevideo/modeling/backbones/adds.py:232-264", "hash": "d84c81d9ed3beb2fa69a2c480eb42697", "title": "ResNet Model with Multi-Input Images"}, "2968": {"path": "/paddlevideo/modeling/backbones/adds.py:265-294", "hash": "4c0fb660db2a370353f8150ecde52963", "title": "ResNet Model Creation Code"}, "2969": {"path": "/paddlevideo/modeling/backbones/adds.py:295-330", "hash": "b846b747d774fc5af344446efe3a10ff", "title": "Conv3x3 and Depth Backprojection"}, "2970": {"path": "/paddlevideo/modeling/backbones/adds.py:331-355", "hash": "c5b7b42e5b080a9757333d894dd15525", "title": "PaddleVideo Backbone Parameter Initialization"}, "2971": {"path": "/paddlevideo/modeling/backbones/adds.py:357-385", "hash": "e3d8b4a9ae284c1ccbd02b502b309519", "title": "Camera Projection in Project3D"}, "2972": {"path": "/paddlevideo/modeling/backbones/adds.py:386-417", "hash": "3f4ff089649ce7d16b25d703da2ff8fb", "title": "SSIM Loss Calculator from Pixel Coords"}, "2973": {"path": "/paddlevideo/modeling/backbones/adds.py:419-441", "hash": "14ad50be90168c9ea956687206763d6a", "title": "Multi-Input ResNet Model in PaddleVideo"}, "2974": {"path": "/paddlevideo/modeling/backbones/adds.py:442-466", "hash": "90b3e0f0db8302b93f9018180ae84014", "title": "ConvBN Layer Initialization"}, "2975": {"path": "/paddlevideo/modeling/backbones/adds.py:467-497", "hash": "4215d3cf191d5ae6fd104325ef74e7bf", "title": "ConvBN Layer Custom Class"}, "2976": {"path": "/paddlevideo/modeling/backbones/adds.py:498-528", "hash": "0d015a98a046d9468166df78f855c2c4", "title": "BasicBlock Class Definition"}, "2977": {"path": "/paddlevideo/modeling/backbones/adds.py:529-563", "hash": "13d18aa58b8278058673b03a96e55df5", "title": "ResNet V1.5 Bottleneck Layer Definition"}, "2978": {"path": "/paddlevideo/modeling/backbones/adds.py:564-597", "hash": "2a7d8491eadeb6c228fbe664a2dfb573", "title": "Bottleneck Convolutional Neural Network"}, "2979": {"path": "/paddlevideo/modeling/backbones/adds.py:599-631", "hash": "9bfbf5a82349c05eed460e19c6199e0d", "title": "DepthDecoder Class Definition"}, 
"2980": {"path": "/paddlevideo/modeling/backbones/adds.py:633-660", "hash": "0dc206bc2d110e3db4d85ae442a7e8d5", "title": "Decoder Convolutional Network Architecture"}, "2981": {"path": "/paddlevideo/modeling/backbones/adds.py:661-686", "hash": "283d92a398c0011e4a5b06b35bc19e37", "title": "Convolutional PoseDecoder Layer"}, "2982": {"path": "/paddlevideo/modeling/backbones/adds.py:688-725", "hash": "41a1063a56b8338f371f2f271c6b2425", "title": "ResNet Encoder with Adds Convolution"}, "2983": {"path": "/paddlevideo/modeling/backbones/adds.py:726-753", "hash": "cb5a462ed2e64f2411990b385e8167d6", "title": "ResNet Backbone Creation and Checks"}, "2984": {"path": "/paddlevideo/modeling/backbones/adds.py:754-776", "hash": "108895d00bd1965a66d9e53b609cf538", "title": "Shared Encoders and Decoder Backbone"}, "2985": {"path": "/paddlevideo/modeling/backbones/adds.py:777-797", "hash": "2057e3d08c1250dab9380506d18342a9", "title": "Convolutional Layers with Batch Normalization"}, "2986": {"path": "/paddlevideo/modeling/backbones/adds.py:798-817", "hash": "2ecb5a4722d9091c9f7b90d4c9b9efc9", "title": "Normalizing Image Input for Day Encoder"}, "2987": {"path": "/paddlevideo/modeling/backbones/adds.py:818-834", "hash": "a0147e1413c1e959697b860cfb7b063b", "title": "Day-Night Encoder Convolutions"}, "2988": {"path": "/paddlevideo/modeling/backbones/adds.py:835-861", "hash": "608cd0efb87b9bf7bdd38590e49793fd", "title": "Day-Night Model Features Extraction"}, "2989": {"path": "/paddlevideo/modeling/backbones/adds.py:862-889", "hash": "8edd777de943b977a71f51c2ff386250", "title": "Resnet Encoder Pypaddle Module"}, "2990": {"path": "/paddlevideo/modeling/backbones/adds.py:890-917", "hash": "571fa446968b68e091d4710d7826f6cc", "title": "ResNet Backbone Model with Multi-Image Inputs"}, "2991": {"path": "/paddlevideo/modeling/backbones/adds.py:918-949", "hash": "d0afb98cf1c72399756689f8d8da8c1e", "title": "ADDS Depth Estimation Network"}, "2992": {"path": "/paddlevideo/modeling/backbones/adds.py:950-972", "hash": "60a3e6b19ecb697223cea50b1a11d921", "title": "Model Initialization and Configuration"}, "2993": {"path": "/paddlevideo/modeling/backbones/adds.py:973-996", "hash": "382ab51041003217b59077d7b6abe267", "title": "Backbone Model for Pose Estimation Init"}, "2994": {"path": "/paddlevideo/modeling/backbones/adds.py:997-1019", "hash": "cc428eede3aa05f67c70bda302a7f1ce", "title": "Day-Night Backbone Model with Depth Feature Extraction"}, "2995": {"path": "/paddlevideo/modeling/backbones/adds.py:1020-1044", "hash": "7ee599a07d25d64b80e37b016de6a651", "title": "Handling Dict and Non-Dict Model Inputs"}, "2996": {"path": "/paddlevideo/modeling/backbones/adds.py:1046-1074", "hash": "ad49c8f9b8fcae7b7ce13c8d00be5c5d", "title": "Night/Day Pose Prediction Function"}, "2997": {"path": "/paddlevideo/modeling/backbones/adds.py:1075-1096", "hash": "7da278c6f7a74770b661892d63f94769", "title": "Calculates Camera Transformation Parameters"}, "2998": {"path": "/paddlevideo/modeling/backbones/adds.py:1097-1122", "hash": "c096162ac31211fcf153efcd4476044e", "title": "Depth Estimation with Displacement Interpolation"}, "2999": {"path": "/paddlevideo/modeling/backbones/adds.py:1123-1142", "hash": "caae6da4b2669a9a4ab0148a09e6ceb2", "title": "Grid Sampling and Masking for Night Scenes"}, "3000": {"path": "/paddlevideo/modeling/backbones/adds.py:1143-1146", "hash": "143d48df8a8fd2827a78374e748c6f43", "title": "Selecting Input Data from Dictionary"}, "3001": {"path": "/paddlevideo/modeling/backbones/agcn.py", "hash": 
"0b54af11c1f610dc2760f1d3b1fe6a8a", "title": "Adaptive Graph Convolutional Networks (AGCN) Backbone"}, "3002": {"path": "/paddlevideo/modeling/backbones/agcn.py:1-27", "hash": "557de042d709c02e868cdda422bced99", "title": "PaddlePaddle GCN Class Definition"}, "3003": {"path": "/paddlevideo/modeling/backbones/agcn.py:28-57", "hash": "6176ffe51f2db643bc2eac4ab0637a48", "title": "3D Spatio-Temporal Convolutional Block"}, "3004": {"path": "/paddlevideo/modeling/backbones/agcn.py:58-84", "hash": "c961c4ef76bdc214f7a008fb73ce66d0", "title": "GCN-TCN Residual Block Init."}, "3005": {"path": "/paddlevideo/modeling/backbones/agcn.py:87-110", "hash": "e0bc0e63fdce09ea465f61f8c9bfaa98", "title": "Adaptive Graph Convolutional Network (AGCN) Improvement"}, "3006": {"path": "/paddlevideo/modeling/backbones/agcn.py:111-128", "hash": "52d780627df6f634c342b78521a65529", "title": "AGCN Backbone: Custom Normalization and Pooling"}, "3007": {"path": "/paddlevideo/modeling/backbones/agcn2s.py", "hash": "f98dd76b978a59f05a7c78dd066c3974", "title": "AGCN2S Graph Convolutions in PaddlePaddle"}, "3008": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:1-32", "hash": "84d9835404acf4a0813b5b73694f17c5", "title": "Temporal Convolutional Network Layer in PaddlePaddle"}, "3009": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:33-65", "hash": "d384cb3dcf8c610e4c36e580832795ba", "title": "AGCN Unit: Learning Spatio-Temporal Features"}, "3010": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:66-91", "hash": "6cdd6e828bd983a8a8ad49bb157b6fa7", "title": "AGCN2S Neural Network Backbone Definition"}, "3011": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:92-121", "hash": "fedfd07ca74a92cf3a7a34859b7e95a7", "title": "AGCN-TS: Temporal Series Modeling with GCN and TCN"}, "3012": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:122-144", "hash": "7eaa96b064b7d5e7afc887037150b7c1", "title": "Graph Class for NTURGB+D Dataset"}, "3013": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:146-176", "hash": "4b4f0681a029ce9f324dcedf49dd07c2", "title": "Adjacency Matrix Conversion Functions"}, "3014": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:177-212", "hash": "a7cc97c44a2ec264e23a89785ced10f4", "title": "Graph Convolutional Neural Network Layer (GCNN)"}, "3015": {"path": "/paddlevideo/modeling/backbones/agcn2s.py:213-229", "hash": "0376efd3716dbc9bc090cef16517bd73", "title": "AGCN2S Transformation Layers"}, "3016": {"path": "/paddlevideo/modeling/backbones/asrf.py", "hash": "a6ac6d4ea530091babd6e7926db93a66", "title": "ASRF: PaddleVideo Backbone Initiation"}, "3017": {"path": "/paddlevideo/modeling/backbones/asrf.py:1-30", "hash": "10a23849463d44042eef2cfccebc29e5", "title": "Asrf Backbone Model Registration"}, "3018": {"path": "/paddlevideo/modeling/backbones/asrf.py:33-65", "hash": "f3b40ded48a39e51144827bfafaffbd4", "title": "ASRF: Customizable Convolutional Backbone for CV"}, "3019": {"path": "/paddlevideo/modeling/backbones/asrf.py:66-75", "hash": "df236668f0758c13b7c79742c1aa8c5e", "title": "ASRF Backbone Initialization and Forward Method"}, "3020": {"path": "/paddlevideo/modeling/backbones/bmn.py", "hash": "a4fcecf0864fa2a7522c54e021154e06", "title": "BMN Backbone for Paddle Video"}, "3021": {"path": "/paddlevideo/modeling/backbones/bmn.py:1-28", "hash": "b7604968084b7bf64eb305e7706ca415", "title": "Boundary-Matching Pair Mask Generator"}, "3022": {"path": "/paddlevideo/modeling/backbones/bmn.py:29-53", "hash": "7042f3bf6cd9dcf606e7c7e3bbbb695f", "title": "Generating Sample Masks for 
Boundary-Matching Maps"}, "3023": {"path": "/paddlevideo/modeling/backbones/bmn.py:54-77", "hash": "e60959713a58d0492f61f75fc70cfcb4", "title": "Video Frame Mask Generation Code"}, "3024": {"path": "/paddlevideo/modeling/backbones/bmn.py:78-103", "hash": "9b2426cd9be2608d066aa6d3b2555f50", "title": "BMN Layer for Temporal Action Proposal Generation"}, "3025": {"path": "/paddlevideo/modeling/backbones/bmn.py:104-137", "hash": "9c5f27497475b308bf24b2241e179a61", "title": "BMN: Backbone Model with ConvLayers"}, "3026": {"path": "/paddlevideo/modeling/backbones/bmn.py:138-163", "hash": "440629d4c8292ef85882bdbae17ed896", "title": "Conv1D Block for BMN Model"}, "3027": {"path": "/paddlevideo/modeling/backbones/bmn.py:164-189", "hash": "3d4933abd2c3a98ae244ca8f21e782f8", "title": "Initializing TEM and PEM Modules in Backbone Network"}, "3028": {"path": "/paddlevideo/modeling/backbones/bmn.py:190-215", "hash": "78ec95979958789f1b0282aab49b912a", "title": "BMN Backbone Model Initialization"}, "3029": {"path": "/paddlevideo/modeling/backbones/bmn.py:216-246", "hash": "f9818fcc7daf9d04ef24086b8cf22014", "title": "2D Conv Layers for BMSN Backbone"}, "3030": {"path": "/paddlevideo/modeling/backbones/bmn.py:247-283", "hash": "a89c4846306b97df44d5ebef96d5a908", "title": "Video Analysis Backbone Model: BMN"}, "3031": {"path": "/paddlevideo/modeling/backbones/bmn.py:284-290", "hash": "7adeb2f0caaf01b50f8f9da2e33a8811", "title": "Convolutional Neural Network Backbone"}, "3032": {"path": "/paddlevideo/modeling/backbones/cfbi.py", "hash": "84c90cd16331c8b71ceed8e73577c267", "title": "CFBI Model: FPN-DeepLab Backbone"}, "3033": {"path": "/paddlevideo/modeling/backbones/cfbi.py:1-28", "hash": "4a21cd57b7e93c73a12eb0dbda8bcc62", "title": "FPN Layer Definition"}, "3034": {"path": "/paddlevideo/modeling/backbones/cfbi.py:29-54", "hash": "aaaef5ff7967d1a7b51d9d1b4a8bafaa", "title": "CFBI Backbone Model Architecture"}, "3035": {"path": "/paddlevideo/modeling/backbones/cfbi.py:56-84", "hash": "8ba487a6e65fd33a91235454e5a03109", "title": "CFBI: DeepLab-FPN Backbone Model"}, "3036": {"path": "/paddlevideo/modeling/backbones/cfbi.py:85-88", "hash": "7a2820fdc97c10a36f54efddf7f98586", "title": "CFBI: Multi-scale Feature Extraction"}, "3037": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py", "hash": "b68c1109349ce09a20a84bb36db023e9", "title": "Introducing CTRGCN Backbone for Video Models"}, "3038": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:1-31", "hash": "a8421e8c3b26ab466801827268cc9434", "title": "CtrGCN Backbone Setup"}, "3039": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:32-66", "hash": "d45bc2277c55afbbe9a4be1d46c1e01e", "title": "Defining CTRGC: Convolutional Layer"}, "3040": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:67-93", "hash": "b174ec5fde322ed7d6b0f37095eba9a5", "title": "Convolutional Temporal RGPN Backbone"}, "3041": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:94-127", "hash": "4ae33295f2dd4f2311a30ea6c8e5d4a2", "title": "Temporal Convolution Backbone"}, "3042": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:128-155", "hash": "846834dad40ef9ab2c0d4b828ad3a7e6", "title": "MultiScale Temporal Conv Layer"}, "3043": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:156-182", "hash": "1cd25c9c14ee85cbae8548b0ba24cc8d", "title": "Conv-Temporal RGN Backbone: Video Analysis Model"}, "3044": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:183-211", "hash": "7c5d41cc7ee843633abb88f0c0c16495", "title": "Conv-Temporal Residual Group Convolutional Network Backbone"}, 
"3045": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:212-250", "hash": "c76eae6db565ab8e0199ebb4b32ce210", "title": "Temporal and Graph Convolutional Network Units"}, "3046": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:251-276", "hash": "7276eaa21d2ae732017b7c0dd7893ff7", "title": "CTRGC Model Initialization"}, "3047": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:277-306", "hash": "a9d6c6429842af5ea03692abde04b9c8", "title": "Adaptive CTR GCN Initialization"}, "3048": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:307-335", "hash": "928b69c27f0be9ddf63ea0097393440e", "title": "TCN-GCN Unit Definition"}, "3049": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:336-363", "hash": "feb494348367103baf0ad536625382a1", "title": "CTRGCN: Residual Graph Convolutional Network"}, "3050": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:364-397", "hash": "af76f6b7f946fa8619ba3cf805c037d5", "title": "Generating Adjacency Matrices for CTRGCN"}, "3051": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:398-426", "hash": "e5a21f04ca69da7da88a2999235678d1", "title": "CTRGCN: Skeleton Action Model"}, "3052": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:427-455", "hash": "618d77923142bca0b283412dfaa1b24f", "title": "CTRGCN: TCN-GCN Model Initialization"}, "3053": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:456-477", "hash": "c3bec9e7555de459dc04bd322b0f06c1", "title": "Deep TCN-GCN Architecture for CTRGCN Model"}, "3054": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:478-511", "hash": "28c7091e7c30fe9c87c9f73007a76fb3", "title": "TCN-GCN Neural Network Model"}, "3055": {"path": "/paddlevideo/modeling/backbones/ctrgcn.py:512-514", "hash": "f7179ec1a463768b18f189f553df4e66", "title": "Final Neural Network Layer 10 Application"}, "3056": {"path": "/paddlevideo/modeling/backbones/darknet.py", "hash": "306a8cd8fc576d602d1837e2810029e2", "title": "Darknet Backbone with ConvBNLayer"}, "3057": {"path": "/paddlevideo/modeling/backbones/darknet.py:1-32", "hash": "2996a471779b51ac4472914ac90a90f4", "title": "Darknet ConvBN Layer Definition"}, "3058": {"path": "/paddlevideo/modeling/backbones/darknet.py:33-61", "hash": "9ae3475fa49503d39dfc32304d2dd356", "title": "Darknet Convolutional Block with BN and Leaky ReLU"}, "3059": {"path": "/paddlevideo/modeling/backbones/darknet.py:62-92", "hash": "9a2721de8e247fccea94289cb3d0d490", "title": "Darknet Backbone with ConvBNLayer and MaxPooling"}, "3060": {"path": "/paddlevideo/modeling/backbones/darknet.py:93-115", "hash": "f7955b6f2e7fbccb98b95416544d457f", "title": "Darknet Layer Transpose Dimensions"}, "3061": {"path": "/paddlevideo/modeling/backbones/darknet.py:116-129", "hash": "0f07b10fa1cb3b02656a22711d5d4864", "title": "Darknet Backbone: ConvBNLayer Sequence"}, "3062": {"path": "/paddlevideo/modeling/backbones/darknet.py:130-150", "hash": "b5126b14fa295c902c32c19570c55b88", "title": "Darknet Neural Network Backbone Design"}, "3063": {"path": "/paddlevideo/modeling/backbones/darknet.py:151-165", "hash": "5cbcab83c0edb6cb905e5d693f397fab", "title": "Darknet Convolutional Branching"}, "3064": {"path": "/paddlevideo/modeling/backbones/deeplab.py", "hash": "93dbc83efd62dc5f1c026d8d500d49f9", "title": "DeepLab Network Construction"}, "3065": {"path": "/paddlevideo/modeling/backbones/deeplab.py:1-33", "hash": "3b645ceb7e148d1455afd9c795488f68", "title": "Fixed Batch Normalization Layer"}, "3066": {"path": "/paddlevideo/modeling/backbones/deeplab.py:34-59", "hash": "ba3187ee65a81a2b0ad90a06d0614552", "title": "DeepLab 
Bottleneck Layer Initialization"}, "3067": {"path": "/paddlevideo/modeling/backbones/deeplab.py:60-86", "hash": "2c0909b89d0b76fb0414eaa00931ead1", "title": "Bottleneck Conv Neuron Layer for DeepLab"}, "3068": {"path": "/paddlevideo/modeling/backbones/deeplab.py:87-130", "hash": "f5769a2ab73a8014995e860ce2e45a16", "title": "ResNet: Residual Blocks with Conv and BatchNorm"}, "3069": {"path": "/paddlevideo/modeling/backbones/deeplab.py:131-152", "hash": "911951e1e520bbf4384eeadebe11807b", "title": "DeepLab Model Creation with Conv Layers and BatchNorm"}, "3070": {"path": "/paddlevideo/modeling/backbones/deeplab.py:153-176", "hash": "1bea05aa65ab2cf8e502fac6b8751c49", "title": "DeepLab Backbone Classification Layer Design"}, "3071": {"path": "/paddlevideo/modeling/backbones/deeplab.py:177-207", "hash": "3058ac7b63898556c86be5c9a0bb1372", "title": "DeepLab Module Creation Function"}, "3072": {"path": "/paddlevideo/modeling/backbones/deeplab.py:208-240", "hash": "64ac081dc8580ed0bf07debf1966b647", "title": "DeepLab ConvNet Function"}, "3073": {"path": "/paddlevideo/modeling/backbones/deeplab.py:241-269", "hash": "ec86c2916e5caa90de47dd0ed8156937", "title": "DeepLab ASPP Model Extraction"}, "3074": {"path": "/paddlevideo/modeling/backbones/deeplab.py:270-307", "hash": "21a6ef848333d30346313e7b99365acc", "title": "DeepLab ASPP Module Initialization"}, "3075": {"path": "/paddlevideo/modeling/backbones/deeplab.py:308-330", "hash": "0a432fbbabd38ec6882aaaf0389c57c6", "title": "Dynamic ASPP Modules in DeepLab Backbone"}, "3076": {"path": "/paddlevideo/modeling/backbones/deeplab.py:332-363", "hash": "7da09401b65b148753681e6127abd6c4", "title": "DeepLab Backbone for Image Segmentation"}, "3077": {"path": "/paddlevideo/modeling/backbones/deeplab.py:364-395", "hash": "f50ebd3329a0beaf9a4d6d1a569ec4cd", "title": "DeepLab Decoder Class"}, "3078": {"path": "/paddlevideo/modeling/backbones/deeplab.py:396-426", "hash": "af3ee85468de0d0d6b8818ed00192b2d", "title": "DeepLab Model for Segmentation"}, "3079": {"path": "/paddlevideo/modeling/backbones/deeplab.py:427-454", "hash": "24e128e0bdd5344ef13ac5dcb2731d71", "title": "DeepLab Model Implementation"}, "3080": {"path": "/paddlevideo/modeling/backbones/movinet.py", "hash": "a302bfecfee2f2ecd3e604c080380d0d", "title": "MoViNet: Mobile Video Analysis Model"}, "3081": {"path": "/paddlevideo/modeling/backbones/movinet.py:1-27", "hash": "8b9cbe75f2d2e65a613bcb77e13335ad", "title": "MOViNet Configuration"}, "3082": {"path": "/paddlevideo/modeling/backbones/movinet.py:28-55", "hash": "dd33540272c41527352f7bf57762e06e", "title": "MobileNetV2 Architecture Defined"}, "3083": {"path": "/paddlevideo/modeling/backbones/movinet.py:56-94", "hash": "39705352fffe0a3b19a938abbfcab294", "title": "Conv2dBNActivation Layer for MoviNet"}, "3084": {"path": "/paddlevideo/modeling/backbones/movinet.py:95-121", "hash": "e7cc68013aca91f573e26c8c6bc59213", "title": "Convolutional Neural Network Layers with Batch Normalization"}, "3085": {"path": "/paddlevideo/modeling/backbones/movinet.py:122-147", "hash": "f4b0621c7111350fdbe82bc6f5ed6d90", "title": "Conv3D Layer Creation"}, "3086": {"path": "/paddlevideo/modeling/backbones/movinet.py:148-177", "hash": "a92cd17d060b0b723d3eb58f53af0ebd", "title": "ConvBlock3D: Causal Convolutional Module"}, "3087": {"path": "/paddlevideo/modeling/backbones/movinet.py:178-196", "hash": "408112b914bf28884f91a5467d3ec22e", "title": "Conv Type Check and Initialization"}, "3088": {"path": "/paddlevideo/modeling/backbones/movinet.py:197-216", "hash": 
"bbb4975ea6a56064dc9ede43575d3b10", "title": "Defining Conv Layers in Movinet Backbone"}, "3089": {"path": "/paddlevideo/modeling/backbones/movinet.py:217-238", "hash": "fb354d010d2a3138ff4591bf46d8aca6", "title": "Convolutional Video Backbone"}, "3090": {"path": "/paddlevideo/modeling/backbones/movinet.py:239-269", "hash": "58ec4e14fe27bcf5624398b2fa5f1f34", "title": "Temporal Causal Average Pooling 3D"}, "3091": {"path": "/paddlevideo/modeling/backbones/movinet.py:270-296", "hash": "fca960d149ec066fa3935c31891cae19", "title": "CausalModule: Cumulative Sum and Activation Control"}, "3092": {"path": "/paddlevideo/modeling/backbones/movinet.py:297-322", "hash": "316d42da22ac811dc22f5cf7d291725a", "title": "SqueezeExcitation Layer Class"}, "3093": {"path": "/paddlevideo/modeling/backbones/movinet.py:323-347", "hash": "988591b8f3b8c54e9240804dd398e5a5", "title": "Scale-Aware Spatial Pyramid Pooling"}, "3094": {"path": "/paddlevideo/modeling/backbones/movinet.py:350-382", "hash": "68faea4af3cdc76d8f6d6313a5cb770c", "title": "BasicBneck Neural Network Layer"}, "3095": {"path": "/paddlevideo/modeling/backbones/movinet.py:383-404", "hash": "97181d9978a3b91eb56a442a7486658c", "title": "3D ConvBlock for MoviNet Backbone"}, "3096": {"path": "/paddlevideo/modeling/backbones/movinet.py:405-427", "hash": "6114d1b97d9584ebc415a187d8928232", "title": "ConvBlock3D Creation: Stride, Channels and Causal Convolution"}, "3097": {"path": "/paddlevideo/modeling/backbones/movinet.py:428-464", "hash": "c45048016d029d5ada75c7fe639418e6", "title": "MoViNet: Video Backbone Model"}, "3098": {"path": "/paddlevideo/modeling/backbones/movinet.py:465-487", "hash": "7d796eb9a9e404d934bd32ce08f80588", "title": "MOViNet Model Definition"}, "3099": {"path": "/paddlevideo/modeling/backbones/movinet.py:488-510", "hash": "2c20e56cc6cf1802f3372a61d13fb423", "title": "MOViNet Customizable Model Creation"}, "3100": {"path": "/paddlevideo/modeling/backbones/movinet.py:511-539", "hash": "37197c7571972487a8d871063d9e7743", "title": "MoviNet 3D CNN Backbone"}, "3101": {"path": "/paddlevideo/modeling/backbones/movinet.py:541-572", "hash": "17d78086e0c95c271ea233ae202a775d", "title": "MoviNet Backbone Class"}, "3102": {"path": "/paddlevideo/modeling/backbones/movinet.py:573-574", "hash": "92b55d206590d1a1b81e94b3f61ae34f", "title": "Movinet 3D Causal Instance Generation"}, "3103": {"path": "/paddlevideo/modeling/backbones/ms_tcn.py", "hash": "2be430f8aab90780090ae870946e9d0a", "title": "Kaiming Uniform Initialization for MSTCN Backbone"}, "3104": {"path": "/paddlevideo/modeling/backbones/ms_tcn.py:1-32", "hash": "8e48f220f89e292b615af45b9e36aca2", "title": "MS TCN Initialization"}, "3105": {"path": "/paddlevideo/modeling/backbones/ms_tcn.py:34-68", "hash": "94b1cbe68270fbc75b2039edcecba130", "title": "Kaiming Uniform Initialization in MS-TCN"}, "3106": {"path": "/paddlevideo/modeling/backbones/ms_tcn.py:69-100", "hash": "9a7b4686b501d693a919410af1544b7b", "title": "SingleStage MS-TCN Model"}, "3107": {"path": "/paddlevideo/modeling/backbones/ms_tcn.py:101-132", "hash": "065eb70e24c8f0ead3e32b3d6190b1dc", "title": "Dilated Residual Layers in MSTCN Backbone"}, "3108": {"path": "/paddlevideo/modeling/backbones/ms_tcn.py:133-154", "hash": "25c9ba4d1825a7f638812b505c74f725", "title": "MS TCN Model Initialization"}, "3109": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py", "hash": "b619a08a8910a5bbe2b51642fc1785fb", "title": "MobileNetV2 Backbones for PaddlePaddle"}, "3110": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:1-30", 
"hash": "a6184ad59b1da63a24dc29b2b890d950", "title": "MobileNetV2 Backbone Code"}, "3111": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:32-58", "hash": "97b8470cf5c3812187a4ca9403bc941c", "title": "PaddlePaddle MobileNetV2: ConvBNLayer"}, "3112": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:59-85", "hash": "8c6079e3e79f7db1e18cd497ff7febfb", "title": "Inverted Residual Unit Class"}, "3113": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:86-103", "hash": "553eb9d8ee736c1aae931e1a958adbe5", "title": "Initializing and Defining Convolutional Layers in PPTSM-MV2 Backbone"}, "3114": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:105-132", "hash": "5b11cc0138b8b967ef6927f40b6eb9f4", "title": "Inverted Residual Blocks for PPTSM MV2"}, "3115": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:133-151", "hash": "4f12277b699b50b83cca69a54ed173fe", "title": "PPTSM_MV2 Residual Units Creation"}, "3116": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:152-187", "hash": "3f5c25af2b26fc415185310dd4bc64e7", "title": "PPTSM-MV2 and MobileNet Model"}, "3117": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:188-207", "hash": "e71f6b5556366690400370aa469e62f1", "title": "PPTSM-MV2 Backbone Initialization"}, "3118": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:208-232", "hash": "f16f33d73f184689874f7e02251865b7", "title": "PPTSM-MV2 Backbone Implementation"}, "3119": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:233-266", "hash": "a189744ec4b3c7388c6ae1be48a10ae5", "title": "PPTSM MobileNetV2 Model Initialization"}, "3120": {"path": "/paddlevideo/modeling/backbones/pptsm_mv2.py:267-282", "hash": "47da330f2155df56e2971632a9b49b7e", "title": "Scaled MobileNet Functions in PaddleVideo"}, "3121": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py", "hash": "c6e5fad6c599ff54486a4f4cac6730a1", "title": "PPTSM-Mv3 Backbone in PaddleVideo"}, "3122": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:1-28", "hash": "55a66790711897340dd9dff546463169", "title": "PaddleVideo: PPTSM-MV3 Backbone"}, "3123": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:30-52", "hash": "df20a3a621c4fd8718fb090311f23e86", "title": "MobileNetV3 Backbones: Stages and URLs"}, "3124": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:53-79", "hash": "71581279dd38439337dbcb5867e0c83e", "title": "PPTSM-Mv3 Backbone Versions"}, "3125": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:80-118", "hash": "bd48aa71ad51a0c6621a745ce16a72ca", "title": "MobileNetV3: Custom PyTorch Layer Definition"}, "3126": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:119-142", "hash": "fc0869b151975b92f71f0a94e43ba282", "title": "Configurable MobileNetV3 Model Function"}, "3127": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:143-168", "hash": "1f27175ece3fc1e9f764fb4e87146cf9", "title": "PPTSM-MV3 Backbone Architecture"}, "3128": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:169-194", "hash": "dc2606dfed9c34b02b203586208f82f2", "title": "PPTSM-MV3 Model Architecture"}, "3129": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:195-222", "hash": "398d930db9eb10f3a94085760215d965", "title": "PPTSM_MV3 Neural Network Model"}, "3130": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:223-258", "hash": "fb15e58f1a346619cd987d2fadf6df8a", "title": "ConvBNLayer: Video Classification Backbone"}, "3131": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:259-292", "hash": "5af576bda3bef2e1b3f79c1ac5354e06", "title": "ResidualUnit: Expand 
Conv Layer"}, "3132": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:293-312", "hash": "aa72f0cafbdf6e22a47bf2d2472bf55c", "title": "PPTSM_MV3 Block Design"}, "3133": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:314-348", "hash": "b7fa75f34540333339ed3ddafa28281c", "title": "PPTSM-MV3 Backbone: Temporal Shifting and SE Module"}, "3134": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:349-376", "hash": "6aa1d879936095c0ca510f6ee69d9c8c", "title": "Convolutional Neural Network Layer for PPTSM-MobileNetV3_small_x1_0"}, "3135": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:377-405", "hash": "08e8419605406c05cfd46380c1053fd2", "title": "Create MobileNetV3 Models via PaddlePaddle"}, "3136": {"path": "/paddlevideo/modeling/backbones/pptsm_mv3.py:406-408", "hash": "3b5f01d2aafab4c70f964b97ad7af48e", "title": "PPTSM-MV3 Backbone Instance Creation"}, "3137": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py", "hash": "b3cecb1c76ffe9b7b7320a2c84959f49", "title": "PPTSMv2 Video Backbone Python Module"}, "3138": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:1-27", "hash": "3e1b67e30314798bf7f101c986890306", "title": "PaddlePaddle Neural Network Backbone Module"}, "3139": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:29-64", "hash": "6b904833b4d4eebdad99c4c2a2a32ddb", "title": "PPLCNetV2 Backbone for Video Processing"}, "3140": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:65-96", "hash": "38cce59157e9164479cb0b4ec1f72f03", "title": "PPTSMV2: ConvBN Encoder with Attention"}, "3141": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:97-127", "hash": "fec8f26c22eccd90e5819abbc3c890a3", "title": "P3-SE Module: Conv2D, BatchNorm2D, ReLU, SEModule"}, "3142": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:128-161", "hash": "be91ce732a74dbf5daf28182da378b1c", "title": "Depthwise Separable Conv Layer Initialization"}, "3143": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:162-185", "hash": "d76e6b3cf8ec64bd2b43d7097a4c938e", "title": "PPTSM Backbone Model Initialization"}, "3144": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:186-209", "hash": "9cdff6b625075b1daf1475281e5a0633", "title": "Downsample Convolution Layer with SE Module"}, "3145": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:210-238", "hash": "c27b1c5dd0443acb2b514840cb509c04", "title": "PPTSM_v2 Backbone: Convolutional Deep Learning Model"}, "3146": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:239-269", "hash": "d3eefdeafa8cbd7683879e31b4cfd31d", "title": "PPTSM_V2 Backbone Implementation"}, "3147": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:270-303", "hash": "6390adc8b9e7b90b374066e5632abac5", "title": "PPTSM_v2_LCNet: A Backbone Neural Network"}, "3148": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:304-324", "hash": "fc79f52e1b50e9806f7043381ced1478", "title": "PPTSM-v2 Backbone Model: DepthwiseSeparable Stages"}, "3149": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:325-347", "hash": "93d29ef72beae8a88dbc1c96f9ff8709", "title": "PPTSM_V2 Backbone: PaddleVideo Model"}, "3150": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:348-372", "hash": "c723b7d83de5874e98ba9f7842570683", "title": "PPTSM_v2 Backbone: Weights and Efficiency"}, "3151": {"path": "/paddlevideo/modeling/backbones/pptsm_v2.py:373-405", "hash": "45207241f8894efdbeff4e55cee03c36", "title": "PPTSM_v2 Backbone: Video Analysis Model"}, "3152": {"path": "/paddlevideo/modeling/backbones/resnet.py", "hash": "2f26a0393ce97a8f274b46e0f7450d31", "title": 
"Dynamic ResNet Backbone Model"}, "3153": {"path": "/paddlevideo/modeling/backbones/resnet.py:1-34", "hash": "b572888cc977d360f81f2cc2eb573820", "title": "ConvBN Layer Class in ResNet"}, "3154": {"path": "/paddlevideo/modeling/backbones/resnet.py:35-58", "hash": "9c21811cdc30489005645fd30de15638", "title": "ConvBNLayer Custom Layer"}, "3155": {"path": "/paddlevideo/modeling/backbones/resnet.py:59-89", "hash": "3e16305b2805b104f8c77b7881f555c2", "title": "ResNet Module with BN and Activation"}, "3156": {"path": "/paddlevideo/modeling/backbones/resnet.py:90-111", "hash": "41354287fcb88509e9afad16350c67a6", "title": "ResNet Backbone: ConvBNLayer Creation"}, "3157": {"path": "/paddlevideo/modeling/backbones/resnet.py:112-143", "hash": "1eac3842e1f9895ffa70330bdda2f4d1", "title": "ResNet Block Creation in PyTorch"}, "3158": {"path": "/paddlevideo/modeling/backbones/resnet.py:144-178", "hash": "c9b218dcdf40e47d47f60eab827c5f84", "title": "ResNet Backbone Model Definition"}, "3159": {"path": "/paddlevideo/modeling/backbones/resnet.py:179-208", "hash": "2c96f267d3ae2467058e615f6e941ca0", "title": "ResNet Class Definition"}, "3160": {"path": "/paddlevideo/modeling/backbones/resnet.py:210-229", "hash": "beeebbcd94b2ed8b45e608fd73cffbd9", "title": "Dynamic ResNet Bottleneck Blocks"}, "3161": {"path": "/paddlevideo/modeling/backbones/resnet.py:230-252", "hash": "71e5dbe942f1a61d8eb55732cd760fce", "title": "Defining ResNet Model Layers"}, "3162": {"path": "/paddlevideo/modeling/backbones/resnet.py:253-268", "hash": "da9cc39e247eafd6b06d12f5293b9997", "title": "Pretrained ResNet Loading Path"}, "3163": {"path": "/paddlevideo/modeling/backbones/resnet.py:270-283", "hash": "e4a44513b9f7bdd645acae88ee7ea168", "title": "ResNet Forward Function Definition"}, "3164": {"path": "/paddlevideo/modeling/backbones/resnet3d.py", "hash": "295e9fdcfa348450a8b64f8b96f57dcf", "title": "3D ResNet Model in PaddleVideo"}, "3165": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:1-37", "hash": "71afc249d8df392d2dfa51a241c55694", "title": "Simplifying ConvNet Layers with ConvBNLayer"}, "3166": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:38-56", "hash": "43a105317fd5807d314801124ccb99d0", "title": "Extended Conv2D Layer for ResNet3D"}, "3167": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:57-89", "hash": "2dda73b02268924cf5289e056daba22f", "title": "3D ConvBN Layer Definition"}, "3168": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:90-115", "hash": "e63969981e0433b09559f6e2f66eaa24", "title": "Bottleneck3D Class Definition"}, "3169": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:116-140", "hash": "a223f417566331ef6a118bdc4b2e264e", "title": "ResNet3D Block Configurations"}, "3170": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:141-171", "hash": "28eef9576bb878cb38e7775807ef6b76", "title": "Initializing 3D ResNet Backbone Model"}, "3171": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:172-198", "hash": "36f2ea91399c3aa9bc08cc42f4675491", "title": "Dilated 3D ResNet Conv Layers"}, "3172": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:199-239", "hash": "6cc91ad1ec008d0da4d21afc2d7efeb0", "title": "ResNet3D Block Definition"}, "3173": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:242-263", "hash": "b740566d67abeb1afcc0fde5233d2add", "title": "Customizable ResNet 3D Backbone"}, "3174": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:264-282", "hash": "304ff74b54b6dd438fce2c242c765db6", "title": "ResNet3D Backbone Parameters"}, "3175": {"path": 
"/paddlevideo/modeling/backbones/resnet3d.py:283-302", "hash": "b88c224f2c6476484702685137777dd0", "title": "ResNet3D Parameters and Architecture Settings"}, "3176": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:303-331", "hash": "fdd0190e1f3696f6e49b33060510e4de", "title": "ResNet3D Backbone: 3D Deep Learning Model"}, "3177": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:332-357", "hash": "f89bf5fe1d03f80edc0a166e0754f102", "title": "ResNet3D Model Initializer"}, "3178": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:358-387", "hash": "53863e4358f9ea8393a10a32cbdebfd5", "title": "Configuring ResNet3D Model Attributes"}, "3179": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:388-412", "hash": "eb1d5c82f597ab513c6c3acb8ab7405b", "title": "ResNet3D Layer Function"}, "3180": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:414-440", "hash": "cb6f6b0a535094ccad9bfe98bdce3ba8", "title": "Creating ResNet3D Residual Layers"}, "3181": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:441-457", "hash": "ece5aca85f65d6175cf08b5d4614ffec", "title": "Customizable ResNet3D Backbone Model"}, "3182": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:458-481", "hash": "a50f507149f972cdeae3b9c11a8099bb", "title": "ResNet3D Residual Layer Creation"}, "3183": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:482-509", "hash": "ebc2e91a3a6886a6504e10062363e879", "title": "Customizable ResNet3D Architecture"}, "3184": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:510-537", "hash": "cb6b0d0f75e9bdef79cfadff47565990", "title": "3D Conv Resnet Inflation"}, "3185": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:538-561", "hash": "1722c487c3072bad9ba662840b8ede96", "title": "Inflating 2D ConvNet to 3D ResNet3D"}, "3186": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:562-586", "hash": "4c99e9e62d6398f1c1c4252c3f5c49ae", "title": "ResNet3D Param Loading & Stem Layer Creation"}, "3187": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:587-620", "hash": "6cf295c803457a4e2b55df196accce40", "title": "ResNet3D: Convolutional and Pooling Layers"}, "3188": {"path": "/paddlevideo/modeling/backbones/resnet3d.py:621-641", "hash": "f552cbf0a939457312bbb5e44971d5f1", "title": "ResNet-3D Backbone Model Training"}, "3189": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py", "hash": "410dc3eeb473b70d31aeb1f3daff9d2a", "title": "Slowfast ResNet3d Backbone"}, "3190": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:1-30", "hash": "ea3a89034fd73b161dc13d7ab598039f", "title": "Slowfast ResNet3d: Reduced Fast Pathway"}, "3191": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:31-60", "hash": "25150af0b60afe1eee0b1a7ba369a5d6", "title": "ResNet3D: Slowfast Residual Layer"}, "3192": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:62-79", "hash": "2b21a6e3153447dd9a78422534e5c626", "title": "Defining Residual Module with Parameters"}, "3193": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:80-98", "hash": "7cecba1e78d9d9288184bb45f6d31e92", "title": "Build Residual Layers"}, "3194": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:100-129", "hash": "685ae30ee2a4cc586da717785f5f83c2", "title": "Downsampling Block for ResNet3D"}, "3195": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:130-157", "hash": "cabaf2c3cad6d659db9309cb93ed1b62", "title": "Resnet3D Backbone Creation in PaddleVideo"}, "3196": {"path": 
"/paddlevideo/modeling/backbones/resnet3d_slowonly.py:158-180", "hash": "4d2c54b404048a38e93fca7b79b7e23a", "title": "Loading and Resizing 2D Model Parameters"}, "3197": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:181-210", "hash": "b1d8dec33a4796416a67151731ec2708", "title": "ResNet3D Slow Only Pad Extension"}, "3198": {"path": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:211-214", "hash": "6ffd2191f5b2589534d2ff7af82e52c8", "title": "Resnet3D Slow-Only Strides and Dilations"}, "3199": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py", "hash": "76c37b1130688600443295b080a853c5", "title": "SlowFast: Video Recognition Backbone"}, "3200": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:1-33", "hash": "f8a1e7e7ed7646603b2f872c67cb60c2", "title": "ResNet SlowFast Backbone Initiation"}, "3201": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:34-66", "hash": "cc779de0a2eac28a1d110cb2b3098968", "title": "BottleneckTransform: Tx1x1, 1x3x3, Variable Kernel Sizes"}, "3202": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:67-87", "hash": "d8c780f5eec69bfe4eded094f8d36529", "title": "BottleneckTransform Class Definition"}, "3203": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:88-113", "hash": "5fcee0b7059041f21f1ca0f3c56cb4b5", "title": "Conv3D Layer for ResNet_SlowFast Backbone"}, "3204": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:114-139", "hash": "43e484311b1e3bad43aef502d0633073", "title": "3D Convolutional Layer with Batch Normalization"}, "3205": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:140-180", "hash": "ff4a279a571c2697d6138ef6b724bb3f", "title": "SlowFast ResNet Blocks"}, "3206": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:181-198", "hash": "462deefe8effcdddd10633f96816c0c7", "title": "Defining ResNet Bottleneck Arguments"}, "3207": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:199-237", "hash": "68113f9e36b662a83ade48a29f3c43fa", "title": "ResBlock Class for Deep Neural Networks"}, "3208": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:238-260", "hash": "713dd3b84d75f55c764f06fd9724402c", "title": "ResNet SlowFast Backbone Model"}, "3209": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:261-294", "hash": "1fa470895861cf878b111d8806e54134", "title": "SlowFast ResNet Stage"}, "3210": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:295-312", "hash": "46a056c4878dbad9869e0d3ce512fa32", "title": "ResStage Class Constructor"}, "3211": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:313-330", "hash": "ae48aab8439ae24443970625bb139ac6", "title": "ResStage Class for Residual Block"}, "3212": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:331-372", "hash": "66c001f807055d847fac7d9930e1df2c", "title": "ResNet SlowFast Model Initialization"}, "3213": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:373-404", "hash": "9226bc9be204e7606b9e86e1c77043d6", "title": "Slow-Fast ResNet Backbone with Pathways"}, "3214": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:405-432", "hash": "1906c4d12e8dd29178a876b05f7896bc", "title": "ResNet Basic Stem Module Definition"}, "3215": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:433-466", "hash": "0c18017c67ff0b7492777e034827cc2b", "title": "SlowFast 3D Stem Module"}, "3216": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:467-492", "hash": "7dcd535fadba37d8356d47294694f60f", "title": 
"Resnet Slowfast Model Initialization"}, "3217": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:494-518", "hash": "68b7fb814a680b3d09bd549b24f94429", "title": "ResNet SlowFast Stem and Fusion"}, "3218": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:519-544", "hash": "ab19eadaaa2f971b089565aea22a2820", "title": "FuseFastToSlow Convolutional Layer"}, "3219": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:545-572", "hash": "a2f07245be6af7cdaf768a21d4467b6f", "title": "ResNetSlowFast Model in PaddlePaddle"}, "3220": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:574-607", "hash": "b641571102a81ad9b829cee061c0b409", "title": "ResNetSlowFast: Video Recognition Architecture"}, "3221": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:608-632", "hash": "7ae10a70ce42148f2a30eea05d96d056", "title": "SlowFast Model Construction"}, "3222": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:633-657", "hash": "30b6433e3b123f1db2e2d840390ca518", "title": "SlowFast ResNet Backbone Initialization"}, "3223": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:659-678", "hash": "1a38dd45254193be69982e83119304f4", "title": "ResStage Configuration in ResNet SlowFast"}, "3224": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:679-704", "hash": "079e644ed52e59a8110b949c570fca96", "title": "ResNet SlowFast Model Architecture"}, "3225": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:705-733", "hash": "3669b4a77022aa248754be03b4159c11", "title": "ResNet SlowFast Layer Definitions"}, "3226": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:734-762", "hash": "c87a465cca62b8c6fe17995d4fca50d8", "title": "SlowFast ResNet Feature Extraction"}, "3227": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:763-788", "hash": "fc88d7de3dab4905a5f383bdc81eba75", "title": "ResNet Slowfast Model: Initialization and Forwarding"}, "3228": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast.py:790-795", "hash": "87ca4374c685dc76a0c8b1a4fbd8dc9c", "title": "ResNet SlowFast Final Layer"}, "3229": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py", "hash": "b1c36e6e8ccfc3f2e1246a2c9e153707", "title": "ResNet SlowFast MRI Model"}, "3230": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:1-33", "hash": "a29162cca09d08d1c98a150447f69332", "title": "PaddleVideo Backbone Initialization"}, "3231": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:34-66", "hash": "2965f7bd419469367527eb4518212205", "title": "BottleneckTransform: Temporal Conv Layers"}, "3232": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:67-87", "hash": "ac01a4f8467ac421b1ef98222482e160", "title": "BottleneckTransform Class in ResNet SlowFast MRI Model"}, "3233": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:88-113", "hash": "2ab2a09349f840e4a0fca6620178067e", "title": "Initiating 3D ConvLayers with BatchNorm and Stride"}, "3234": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:114-139", "hash": "c5f1d489fa7c0613eeb1529f95e3b011", "title": "Conv3D Layer with BN in Resnet-Slowfast MRI"}, "3235": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:140-180", "hash": "7150b7b2bf5021e7fc14f92ea9d4e673", "title": "SlowFast MRI ResNet Model"}, "3236": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:181-198", "hash": "d3c8b4b774e95e02b44cb66b84b9c0a5", "title": "Define ResNet Bottleneck"}, "3237": {"path": 
"/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:199-237", "hash": "c1ed6033ff1840dcc9da3a89f35e9957", "title": "ResBlock with Skip Connection"}, "3238": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:238-260", "hash": "11024f390083c7e62a522597168a1822", "title": "ResNet SlowFast MRI Model with BN & BottleneckTransform"}, "3239": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:261-294", "hash": "531b8228768565204840acfa3b757168", "title": "ResStage for 3D ResNet SlowFast Networks"}, "3240": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:295-312", "hash": "f16a3445572e456fcd2dee597d729cc1", "title": "ResStage Initialization"}, "3241": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:313-330", "hash": "383945abf6564556a95bf8a2bcaf94a1", "title": "ResStage Initialization: Resnet Slowfast MRI Backbone"}, "3242": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:331-372", "hash": "a168a89ece0e9b8cfbf20c95ab2c11f4", "title": "ResNet-SlowFast MRI Construction"}, "3243": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:373-404", "hash": "724c9c69304d9c43c284db48ecd362ca", "title": "SlowFast ResNet Backbone Initiation"}, "3244": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:405-432", "hash": "55b23715450e0f9cb5f58e9b207b1dfe", "title": "ResNetBasicStem: Kernel, Stride, Padding"}, "3245": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:433-466", "hash": "b567f5bc5f9a46b03ee90e668f823427", "title": "SlowFast Video Stem Module"}, "3246": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:467-492", "hash": "bb6959c2dcc98909cd4445a8629da8d6", "title": "Resnet Slowfast MRI Backbone"}, "3247": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:494-518", "hash": "6926cb4b286809ce1c3fe91a4e1d900b", "title": "ResNet SlowFast MRI Fusion"}, "3248": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:519-544", "hash": "7952982f5a9afee1ea74b5b49278185d", "title": "FuseFastToSlow Initialization"}, "3249": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:545-572", "hash": "bd9012942a29cdedf39aef8190a73521", "title": "ResNetSlowFast_MRI: Fusion Conv Layer"}, "3250": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:574-607", "hash": "3183c0f524499ed9494e7ecd14408f5b", "title": "ResNetSlowFast_MRI Model Initialization"}, "3251": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:608-632", "hash": "0534a0e51e0e4d1c1ae5dec0f705ed47", "title": "SlowFast Model Architecture"}, "3252": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:633-657", "hash": "052a1453087e58ab92dab2664cc21e3c", "title": "ResNet SlowFast MRI Backbone"}, "3253": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:659-678", "hash": "e0fcfeae7461d84ad9183f045107d545", "title": "Defining ResStage Layer Parameters"}, "3254": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:679-704", "hash": "351a00b914e73b4fbf4443d67063e7f6", "title": "ResNet SlowFast Model Initialization"}, "3255": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:705-733", "hash": "ac0e2c75db1d6e5239c71654d1b6dfcc", "title": "ResNet SlowFast MRI Model Initialization"}, "3256": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:734-762", "hash": "8162ee531c3162e3a292a04a48162304", "title": "ResNet SlowFast MRI Video Analysis Model"}, "3257": {"path": 
"/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:763-793", "hash": "32e623172ac947470296d31d6a381489", "title": "SlowFast 3D ResNet Model Initialization"}, "3258": {"path": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:794-796", "hash": "5a7d58f4971b403060a810a79625913b", "title": "SlowFast ResNet Fusion Layer"}, "3259": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py", "hash": "df7a018f1375f44c2e4c9fee4f666457", "title": "ResNet-TSM Backbone Update"}, "3260": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:1-30", "hash": "9a169b61f713b7417bf53a682e9ff7dd", "title": "ConvBNLayer: Combining Conv2D and BatchNorm2D"}, "3261": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:31-53", "hash": "df6acc03ec4023c61e0560c41c2bb6a1", "title": "Custom ConvBNLayer Class Definition"}, "3262": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:54-84", "hash": "4acdd29f187fb086afa9d2f155d2aa49", "title": "BottleneckBlock: Conv, BatchNorm, Activation"}, "3263": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:85-108", "hash": "c1851a9e1800073a79390a5151563972", "title": "BottleneckBlock Class Definition"}, "3264": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:109-134", "hash": "735bb8638185e33e0f3ec188d02d5517", "title": "TSM Backbone: Layer Initialization"}, "3265": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:135-164", "hash": "9a67fe609aa52d2526b5d91451c67766", "title": "Temporal Shift ResNet Backbone"}, "3266": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:165-202", "hash": "f9989c26be1d901c46929ae79517c895", "title": "Residual TSM Block: Alleviating Vanishing Gradients"}, "3267": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:203-241", "hash": "f9b31503ac41862c9383609c9b9f5223", "title": "ResNet TSM Backbone Model: Flexible Depths"}, "3268": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:242-273", "hash": "f9d3639c0cd0499577dc739dc41dbb28", "title": "ResNet-TSM Backbone Configuration"}, "3269": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:274-293", "hash": "5aca1583caa34c3fe1dfc0ca49828d33", "title": "ResNet TSM Bottleneck Blocks Creation"}, "3270": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:294-316", "hash": "e58bafc7602c6e0bcdfa25ea59cf8ddd", "title": "ResNet TSM Backbone Initialization"}, "3271": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:317-332", "hash": "99906c7b9e5b16f668a6378d592b1ee9", "title": "Initializing ResNet TSM Backbone Parameters"}, "3272": {"path": "/paddlevideo/modeling/backbones/resnet_tsm.py:333-353", "hash": "ed95d1825c76306ab6ace510aea0b7ae", "title": "ResNet TSM Forward Function"}, "3273": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py", "hash": "c00200559d1e7db7c30351f6ead8e5e0", "title": "ResNet-TSM MRI Backbone"}, "3274": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:1-32", "hash": "d80fb03775940a4bffdbd1ef2d3e092a", "title": "Convolutional Batch Normalization Layer"}, "3275": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:33-58", "hash": "7269412985591c20082a1050ca1c400d", "title": "ConvBNLayer Class Definition"}, "3276": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:59-83", "hash": "577d08ed9c6786de8bca38aff93e0000", "title": "ResNet-D: Pooling and Convolution Initialization"}, "3277": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:84-112", "hash": "0fc947bd602d2e28e60d31cda5885303", "title": "ResNetTSM_MRI Backbone Design"}, "3278": {"path": 
"/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:113-134", "hash": "ed038f4e4b5d0e976ea1699682110369", "title": "ResNet-D Branch Creation"}, "3279": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:136-166", "hash": "ce155ee4ea52deac2dcb58f1e00de1ba", "title": "ResNet-TSM Backbone with Temporal Shifts"}, "3280": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:167-200", "hash": "b4a87ad67626cf635ebc0919e8f4fc0d", "title": "ResNet TSM Backbone with BN and Leaky ReLU"}, "3281": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:201-229", "hash": "528cfcfcb1adf5a02c731b78ca24e9ed", "title": "ResNetTSM_MRI Class Initialization"}, "3282": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:230-252", "hash": "35f0360cdfd0d88fffdef9f7eeba6266", "title": "ResNet-TSM Backbone in PaddleVideo"}, "3283": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:253-271", "hash": "a0b9b4e0acb2c8c0fdd94e1fe4a61a58", "title": "Dynamic BottleneckBlock Naming"}, "3284": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:272-294", "hash": "a7e8af26df2d96d4af312d811c68ccf4", "title": "Dynamic ResNet TSM Backbone Creation"}, "3285": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:295-311", "hash": "4e8a168cb399726e5990366053666861", "title": "Initializing Backbone Neural Network"}, "3286": {"path": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:313-327", "hash": "3bbb03125c5acc8a5a56023f643b2776", "title": "Convolutional Layer Iterations"}, "3287": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py", "hash": "e1017f39d02cdbb8829e92c3062ea769", "title": "ResNet-TSN Model for PaddlePaddle"}, "3288": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:1-29", "hash": "782b7c9700b57db24014c32aec6d443b", "title": "ResNet-TSN Backbone Model Initialization"}, "3289": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:31-58", "hash": "ea167b3a5ad2412bc719f5ab9b742649", "title": "ConvBNPoolingLayer Class Definition"}, "3290": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:59-89", "hash": "d917b851e25b4ee26da543c6d6b34c5a", "title": "Resnet_TSN Backbone Model Definition"}, "3291": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:90-111", "hash": "618268671dd14fc68e16e6ff753bc2c7", "title": "BottleneckBlock: ResNet's Core Layer"}, "3292": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:112-144", "hash": "4565859aa43e5748904266485e4144c8", "title": "ResNet TSN Backbone with Conv and Shortcut Connection"}, "3293": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:145-167", "hash": "5cb940aa6a90638126e0609e9dbec78f", "title": "Defining BasicBlock for ResNet TSN MRI Model"}, "3294": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:168-202", "hash": "cf240e13e8ab5877f14d96792a3abba3", "title": "ResNetTSN_MRI Backbone Definition"}, "3295": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:204-232", "hash": "73761d62be0420ce7c44eb5e65e21bfe", "title": "ResNet TSN Backbone Initialization"}, "3296": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:233-253", "hash": "cf9a7f589fd9f3b45bb0bafb9cf3a2c9", "title": "ResNet TSN Model with Multiple Branch Inputs"}, "3297": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:254-275", "hash": "8163c78e1c2c2fd5aab9e41441c286d4", "title": "ResNet-TSN Bottleneck Block Initialization"}, "3298": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:276-294", "hash": "5739ef37916729ac772bfa365206d732", "title": 
"ResNet TSN Model Creation"}, "3299": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:295-311", "hash": "ed948e335a362c75c1e256f9e9f040aa", "title": "ResNet TSN Backbone Weight Initialization"}, "3300": {"path": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:313-331", "hash": "bff3cc81ea81525cd1f6bb3cc3dd9fea", "title": "Initializing and Checking Model Path"}, "3301": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", "hash": "b1c1965060841beb75ba9c1b9623c33a", "title": "TSM ResNet Backbone for Temporal Segment Networks"}, "3302": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:1-31", "hash": "324bc8e91cc152daaab465c3bf4696f6", "title": "TSM ResNet Backbone Imports"}, "3303": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:32-54", "hash": "63f6a7f8da6dbb7b328dd6c00f0a2da8", "title": "ConvBNLayer: Combined Conv2D and BatchNorm2D Layers"}, "3304": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:55-78", "hash": "61416475f94a7b6901e302d3b928e0d0", "title": "ResNet-D Tweak ConvBN Layer"}, "3305": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:79-105", "hash": "cc1b6412a25bc6f6162a090f30a0af4f", "title": "Tweakable Convolutional Neural Network with Batch Normalization"}, "3306": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:106-132", "hash": "cca61b09bc716aee0988db3591cf6043", "title": "BottleneckBlock in ResNet-TSM"}, "3307": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:133-159", "hash": "602154f1982a3d9fba6ef88fd537c0a3", "title": "Resnet TSM Backbone: Forward Method"}, "3308": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:160-192", "hash": "9ba0d0c63bad533275b225b512a78547", "title": "Temporal Shifted ResNet Backbone"}, "3309": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:193-217", "hash": "1f9aa2d312ab5e1ceb2a97197bade0fe", "title": "ResNet TSM Block Definition"}, "3310": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:218-253", "hash": "2d989f2db9613c9e72a507541b62af20", "title": "ResNet TSM Backbone Model Init & Forward"}, "3311": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:254-279", "hash": "00e39f971dd6f95704f9327df9a4b55b", "title": "ResNet Model Depth Customization"}, "3312": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:280-301", "hash": "85d74f23dab55cb6eaa1d77436f306cb", "title": "TSM-ResNet Backbone with Tweaks"}, "3313": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:302-322", "hash": "f3dbae556575bccdf6cf993347500a50", "title": "ResNet Block Assignment Algorithm"}, "3314": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:323-339", "hash": "48f4ffe62901d60669c20cb894b71179", "title": "ResNet TSM Backbone Weights Init"}, "3315": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:340-362", "hash": "844fbf7cf3eb27202406abd4fe0b8b2f", "title": "Backbone Weights Initialization"}, "3316": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py", "hash": "f27398edca1352044711ba1e1def3961", "title": "ResNet TSN Model Backbones: PaddleVideo"}, "3317": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:1-29", "hash": "9ae6665e5256851f16c943f909e0ec08", "title": "ResNet Tweaks TSN Model Backbone"}, "3318": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:31-58", "hash": "32d20b89ba6e43b2dc71b58e020cfc88", "title": "Tailored ResNet Backbone with Pooling"}, "3319": {"path": 
"/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:59-89", "hash": "7f729f77290427564f6458c77a0b043a", "title": "ResNet-TSN Tweaks Backbone"}, "3320": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:90-111", "hash": "f4ad77f8136334bab88485ee5cec900f", "title": "Bottleneck Block: ResNet Parameter Reduction"}, "3321": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:112-144", "hash": "d9b0bdc7d70dd75139fa604f18e74263", "title": "ResNet Block Implementation"}, "3322": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:145-167", "hash": "f082a8720c0d058dc0dd00168b8d24af", "title": "BasicBlock Convolutional Layers and BatchNorm"}, "3323": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:168-203", "hash": "583278b3266af5064dd347b19f97d0bd", "title": "ResNetTweaksTSN Backbone: Convolution, Shortcut, and ReLU"}, "3324": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:204-232", "hash": "a8e738c65db825d78a4ce6b69a5840fe", "title": "ResNet Backbone Configurations"}, "3325": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:233-254", "hash": "9df72be5a8fc2f3d9b1146264f390ce8", "title": "First Layer ResNet Backbone Definition"}, "3326": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:255-276", "hash": "4b2a1ed083bed6b390c97719e8230271", "title": "ResNet-TSN Backbone Code"}, "3327": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:277-296", "hash": "d783dfb92fbe7be4b05c5722205e3d7a", "title": "Dynamic ResNet Tweaks for TSN"}, "3328": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:297-314", "hash": "7fb38e7b0a9b1b7ff08d6d0962cf7a89", "title": "Pre-Trained Weights Initialization in ResNet Backbone"}, "3329": {"path": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:315-328", "hash": "e0945651f520fdea379578d1659f93f4", "title": "ResNet Tweaks TSN Initialization"}, "3330": {"path": "/paddlevideo/modeling/backbones/resnext101.py", "hash": "0d7563caca06db964e53ce7a0fcdd480", "title": "ResNeXt-101 in PaddlePaddle"}, "3331": {"path": "/paddlevideo/modeling/backbones/resnext101.py:1-31", "hash": "6916baf0b107458101302ba2d89497a6", "title": "ConvBNLayer Class in PaddlePaddle"}, "3332": {"path": "/paddlevideo/modeling/backbones/resnext101.py:32-55", "hash": "75c93aebcfea165ccc58d5f7c18a14b8", "title": "ConvBNLayer Initialization"}, "3333": {"path": "/paddlevideo/modeling/backbones/resnext101.py:56-82", "hash": "356cdf29637ca2a5ca86127b27e5f571", "title": "ResNeXt101 Bottleneck Block and Downsampling"}, "3334": {"path": "/paddlevideo/modeling/backbones/resnext101.py:83-122", "hash": "c7fc05fa04552941b236a329aba7bdc8", "title": "ResNeXt Model Definition"}, "3335": {"path": "/paddlevideo/modeling/backbones/resnext101.py:123-148", "hash": "e35a35a62d5fbbca04b6a20ea36bbe0f", "title": "ResNext101 Backbone Architecture"}, "3336": {"path": "/paddlevideo/modeling/backbones/resnext101.py:149-176", "hash": "d1f3521390699ee4cfda1c947ae99a3a", "title": "ResNeXt-101 Model Implementation"}, "3337": {"path": "/paddlevideo/modeling/backbones/resnext101.py:177-187", "hash": "d04a5057bb510e6de0af0cf64bf21f8b", "title": "ResNext101: Constructing and Applying Layers"}, "3338": {"path": "/paddlevideo/modeling/backbones/stgcn.py", "hash": "f564cebceac268c66fb3977da4d482bd", "title": "STGCN: Skeleton-based Action Recognition Backbone"}, "3339": {"path": "/paddlevideo/modeling/backbones/stgcn.py:1-37", "hash": "43805073d93a8529c3d48c5bd9c68401", "title": "STGCN Backbone Definition"}, "3340": {"path": 
"/paddlevideo/modeling/backbones/stgcn.py:38-80", "hash": "89ece095569cf9c677af6b185fc72e77", "title": "Graph Hopping and Normalization"}, "3341": {"path": "/paddlevideo/modeling/backbones/stgcn.py:81-104", "hash": "fdb0f7da4e63ff4f5a59cbf5a87f3f78", "title": "Hop Distance Initialization in ST-GCN"}, "3342": {"path": "/paddlevideo/modeling/backbones/stgcn.py:105-120", "hash": "82e63e11b5506c2be6747b9fbb45beb4", "title": "Node Initialization in STGCN and COCO Keypoint Backbones"}, "3343": {"path": "/paddlevideo/modeling/backbones/stgcn.py:121-143", "hash": "7d16f48635f2c16ad5d5d83cdbd2bff8", "title": "Adjacency Matrix Initialization for STGCN"}, "3344": {"path": "/paddlevideo/modeling/backbones/stgcn.py:144-174", "hash": "703822fea14d8ae57d5b08c4cbca3667", "title": "ConvTemporalGraphical Layer Initialization in STGCN"}, "3345": {"path": "/paddlevideo/modeling/backbones/stgcn.py:175-209", "hash": "fbc469e4eaf9661c78ed0a26a5697da8", "title": "Temporal Graph Convolutions in STGCN"}, "3346": {"path": "/paddlevideo/modeling/backbones/stgcn.py:210-251", "hash": "c51c8efabb4eba54ab47a35742dae631", "title": "STGCN Model: Spatial and Temporal Processing"}, "3347": {"path": "/paddlevideo/modeling/backbones/stgcn.py:251-278", "hash": "07dc4b981729502cae16f9a00e4e7f43", "title": "Skeleton Action Recognition with STGCN"}, "3348": {"path": "/paddlevideo/modeling/backbones/stgcn.py:279-300", "hash": "6873eb5551ee14111d37dff15c743950", "title": "ST-GCN Block Initialization"}, "3349": {"path": "/paddlevideo/modeling/backbones/stgcn.py:301-327", "hash": "de921dd57ce01c14dfe572ffe559c05d", "title": "StGCN Edge Importance Initialization"}, "3350": {"path": "/paddlevideo/modeling/backbones/stgcn.py:328-343", "hash": "d1c88eca206df1f2d5fb54aa82e9ea4c", "title": "ST-GCN Pooling and Reshaping"}, "3351": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py", "hash": "b516fc69528e2b134eeb7b840a81eb5d", "title": "Swin Transformer 3D Backbone in PaddleVideo"}, "3352": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:1-33", "hash": "a1eb32272963f0b5e110eb0f6d1f8417", "title": "Swin Transformer Stochastic Depth"}, "3353": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:34-64", "hash": "232738576c71b8acafaa53b52de83c95", "title": "Stochastic Depth DropPath Layer"}, "3354": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:65-99", "hash": "174392b3293daaeb37bd16c22e544a04", "title": "Defining Swin Transformer Layer"}, "3355": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:100-137", "hash": "af53d162a1535c8085da766abc98e486", "title": "Swin Transformer Window Rearrangement"}, "3356": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:138-161", "hash": "4b15885733e802c6b90ce13212f513d4", "title": "Window-based Multi-Head Self Attention with Relative Position Bias"}, "3357": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:162-185", "hash": "7ec1f82c25cfbe31f9b5dcc647e77bf8", "title": "Swin Transformer Self-Attention Initialization"}, "3358": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:186-204", "hash": "0275bd3467f5abe5830c41e2560e688e", "title": "Swin Transformer: Relative Position Encoding"}, "3359": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:205-232", "hash": "4e32b34c957d81c4ac04a5dd0d4be8e3", "title": "Swin Transformer Backbone Initialization"}, "3360": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:233-261", "hash": "21d07bba4a7a9b43cbd1bab8034136f2", "title": "Swin Transformer 
Block 3D Implementation"}, "3361": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:262-282", "hash": "438a747abd84a81503589143c9ae9913", "title": "Swin Transformer Backbone Initialization"}, "3362": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:283-307", "hash": "46b06f39a563ae54bc20ef66b7098909", "title": "Swin Transformer Backbone Initialization"}, "3363": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:308-330", "hash": "0f04445a026a546034852e46e0830eaf", "title": "Swin Transformer Backbone Initialization"}, "3364": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:331-353", "hash": "283d9509465f1aa29e03ac7e77433763", "title": "Swin Transformer: Cyclic Shift and Windowed Self-Attention"}, "3365": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:354-390", "hash": "f71cc1322cf5837f6d2d27e7433105b2", "title": "Swin Transformer Forward Pass"}, "3366": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:391-426", "hash": "6fcbaed527f0519a157789e69aa6852c", "title": "Swin Transformer Image Backbone"}, "3367": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:427-443", "hash": "f86955aa2354d5bc840cec7a0e1156e8", "title": "Swin Transformer Attention Mask Generation"}, "3368": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:444-464", "hash": "aef4fddf6299aae015e683d401be169b", "title": "Customizable Swin Transformer Layer"}, "3369": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:464-493", "hash": "a714d26f8156733980ce51537c88a007", "title": "Swin Transformer 3D Block Definition"}, "3370": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:494-522", "hash": "89f56b7cab1d60c6a04897f04b7538a7", "title": "Swin Transformer Block for PaddleVideo"}, "3371": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:523-551", "hash": "a42c695b9e44366f7251c1bba6c857e9", "title": "Video Patch Embedding for Swin Transformer"}, "3372": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:553-581", "hash": "0358895b1c6be023d0b0a1e55c8f230b", "title": "Swin Transformer Backbone Initialization"}, "3373": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:582-604", "hash": "32503e43ac42ef30bf0b097b2241b04b", "title": "Swin Transformer 3D Backbone: Paddle Video"}, "3374": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:605-627", "hash": "aad844096d8ac1728893cdd18c4d0664", "title": "Swin Transformer Initialization Parameters"}, "3375": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:628-659", "hash": "a8a4da3d59d4cef4a9c8dac61e826b3e", "title": "Swin Transformer Model Initialization"}, "3376": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:660-687", "hash": "ee3634586cd5ffb03fb3270c90f4be05", "title": "Swin Transformer Backbone Initialization"}, "3377": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:689-720", "hash": "cf40c652cd018d3ed9ef42cc3f0c7eaa", "title": "Swin Transformer Initialization"}, "3378": {"path": "/paddlevideo/modeling/backbones/swin_transformer.py:721-742", "hash": "6c33f9aa3216fbf69cac4401c8aa512f", "title": "Swin Transformer Layer Processing"}, "3379": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py", "hash": "b438a9d1f817b38cc9308e17df0bc136", "title": "Shift-ViT: Versatile Image Processing Backbone"}, "3380": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:1-37", "hash": "39670b02fae73dee6d7d509d2f741cf8", "title": "Shifted Vision Transformer Backbone"}, "3381": 
{"path": "/paddlevideo/modeling/backbones/toshift_vit.py:38-65", "hash": "4ac59c9cea14eb517a69601f8afe4142", "title": "Stochastic Depth Drop Path Implementation"}, "3382": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:66-104", "hash": "acef93c443550140e63ed11579065fc0", "title": "Self-Attention Mechanism Implementation"}, "3383": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:105-138", "hash": "b33362025e710e066cdd046c15686b47", "title": "Self-Attention Module: TOShift-ViT Class"}, "3384": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:139-164", "hash": "a58da9ef4603dacc22a733ec27236025", "title": "Object Initialization in Toshift_VIT Model"}, "3385": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:165-186", "hash": "907137db33c26c1a044f8c5621c2eba2", "title": "Temporal Attention Initialization"}, "3386": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:187-213", "hash": "20fba74ec232916ea6dbe09181a56f79", "title": "Token-Shifting ViT Model Initialization"}, "3387": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:214-245", "hash": "2f021115cf6e846b0c50686e21333831", "title": "ToshiftVIT: Custom Backbone for Vision Transformer Model"}, "3388": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:246-278", "hash": "0cc6fd9582bbea1791656f200446459a", "title": "TokenShift Vision Transformer Class"}, "3389": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:279-305", "hash": "b6050c5828f6aaf28c213516fc5e859e", "title": "Toshift ViT Class Initialization"}, "3390": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:306-330", "hash": "abfee8739a36af65df8768616d820f97", "title": "Transformer Backbone Model Setup"}, "3391": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:331-360", "hash": "14db32bd076093b3f3eb6279636f91df", "title": "Toshift_VIT Model Initialization"}, "3392": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:361-386", "hash": "b59b77b4293bbf266de088c3207dad43", "title": "TOShiftViT: Initializing and Processing Features"}, "3393": {"path": "/paddlevideo/modeling/backbones/toshift_vit.py:387-413", "hash": "0dfda89d206e34a8cce52f8786dfd1f6", "title": "Positional Embedding and Attention Blocks"}, "3394": {"path": "/paddlevideo/modeling/backbones/transnetv2.py", "hash": "3e7d2ce68cd321e11745707718ccce63", "title": "OctConv3D Enhances TransNetV2 Backbone"}, "3395": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:1-28", "hash": "1849c11df70650ec580ba8d1708f7574", "title": "OctConv3D Layer Creation"}, "3396": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:30-43", "hash": "ff35a8cb4c2a4968c64acb84fd61aa21", "title": "Interleaved 3D Convolutional Paths for TransNetV2"}, "3397": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:44-58", "hash": "401d7db53a139d6dbb910eb89ed39e00", "title": "TransNetV2: Conv, Upsample, Downsample Backbone"}, "3398": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:60-90", "hash": "88d12da63c635f4fb9610390d2e5084e", "title": "TransNetV2: Versatile Transformation Functions"}, "3399": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:91-104", "hash": "86f2a8dc25cc65fef2931a4ca9cdf23c", "title": "TransNetV2 Backbone Conv Layers"}, "3400": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:105-131", "hash": "e3d3cdcde22dbcc269bd409cbbf0a0a3", "title": "TransnetV2: Configurable Conv3D Backbone"}, "3401": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:132-150", "hash": "aa8bedc88183a8579f2d4c8f5f8872d7", "title": "TransNetV2 Model 
Definition"}, "3402": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:151-179", "hash": "6433d11242a70b8b6b66bedbcf6b348b", "title": "Stacked DDCNNV2: Neural Network Layer"}, "3403": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:181-207", "hash": "cfe5e0e80982454d0528e0cc6be7bd82", "title": "TransNetV2 Backbone Initialization"}, "3404": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:208-232", "hash": "080854f6a821506864983f16a76428aa", "title": "Stochastic Depth ResNet Block"}, "3405": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:234-260", "hash": "cd8a328137db1212426e22f92c680751", "title": "TransNetV2 Layer Sequence"}, "3406": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:261-282", "hash": "d7a72f4b38075b5c15661c1908bf9abf", "title": "TransNetV2 Backbone Initialization"}, "3407": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:283-307", "hash": "52fb2d0d55f5c88e2397744895272121", "title": "FrameSimilarity Layer Initialization"}, "3408": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:309-330", "hash": "b12048f4a339fc5ae68f1617190d4ba6", "title": "TransNetV2 Model Initialization and Similarity Calculation"}, "3409": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:331-346", "hash": "2a90096dc59d7d2009e4d1c1ae635a44", "title": "Tensor Indices and Regression"}, "3410": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:347-364", "hash": "d20fcecfda35a2050afe0f277b3437be", "title": "TransNetV2 Conv3D Model"}, "3411": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:365-390", "hash": "0c1e9c14ed98fb8e06178afdef6ad09f", "title": "TransnetV2 Frame Concatenation"}, "3412": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:391-411", "hash": "36ec74cb9991ddb709333eeb700bd715", "title": "Color Histograms in TransnetV2"}, "3413": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:412-429", "hash": "392806c3e69e4fac69b9724a9c9dc4b2", "title": "Video Frame Histogram Comparison"}, "3414": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:430-446", "hash": "5c96067017fedd0db232034012518ed2", "title": "TransNetV2 Lookup Operation"}, "3415": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:449-473", "hash": "91d59d86aab44d36781b571728bbb939", "title": "TransNetV2: Shot Transition Detection Model"}, "3416": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:473-484", "hash": "ffa2108ee65aaf803b5ad1bfd4fc35bc", "title": "TransNetV2 ResNet Features Initialization"}, "3417": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:485-508", "hash": "9fcbecc776eb56706faa9135346ea395", "title": "TransNetv2 Backbone Initialization"}, "3418": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:510-526", "hash": "e298e21c299ae507e36acd02fc4cf1c0", "title": "TransNetV2 Neural Network Model Initialization"}, "3419": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:527-548", "hash": "a9e859e5595b9f65daa8d1b5168d87cd", "title": "TransNetV2 Model Architecture"}, "3420": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:549-571", "hash": "5172989db6b799a17edac1ccf6c26014", "title": "TransNetV2 Feature Extraction and Pooling"}, "3421": {"path": "/paddlevideo/modeling/backbones/transnetv2.py:572-581", "hash": "c0477aa5c70e6ef5259986b60107eee6", "title": "Transnetv2 Classification"}, "3422": {"path": "/paddlevideo/modeling/backbones/vit.py", "hash": "c76d4a70f0ca56ce2f979f3482eebe23", "title": "Vision Transformer Backbone in PaddleVideo"}, "3423": {"path": 
"/paddlevideo/modeling/backbones/vit.py:1-37", "hash": "408b0eb1f17f36a7341bbc268c73d71b", "title": "Drop Path Functions in PaddleVideo"}, "3424": {"path": "/paddlevideo/modeling/backbones/vit.py:38-65", "hash": "ff869156b066cd6c5f001503b93c5366", "title": "Stochastic Depth Dropout Paths for Vision Transformers"}, "3425": {"path": "/paddlevideo/modeling/backbones/vit.py:66-104", "hash": "6e874695a2ecb3800bf33d25718ad130", "title": "Vision Transformer Backbone: MLP & Attention"}, "3426": {"path": "/paddlevideo/modeling/backbones/vit.py:105-138", "hash": "1d13b50b60f0b6e09f1b731e4e5def86", "title": "Multi-Head Attention Layer Initialization"}, "3427": {"path": "/paddlevideo/modeling/backbones/vit.py:139-166", "hash": "bff04c7bfbb0d0fa8baf3d95379a0ec3", "title": "Vision Transformer Initializer"}, "3428": {"path": "/paddlevideo/modeling/backbones/vit.py:167-185", "hash": "4cb61d83cd9a5549bbdb78e9634d6f75", "title": "Vision Transformer Backbone Initialization"}, "3429": {"path": "/paddlevideo/modeling/backbones/vit.py:186-210", "hash": "64c21f79e69612dad4f2f8352536579a", "title": "VI Forward Method"}, "3430": {"path": "/paddlevideo/modeling/backbones/vit.py:211-235", "hash": "51283af92e4a8245b4152d36a988b56d", "title": "Spatial Attention in Vision Transformer Model"}, "3431": {"path": "/paddlevideo/modeling/backbones/vit.py:236-267", "hash": "fbc95e2d04846c209c67d153a989eec3", "title": "Vision Transformer Backbone: Averaging and Patch Embedding"}, "3432": {"path": "/paddlevideo/modeling/backbones/vit.py:268-298", "hash": "6eda2e04e8a97f799aa2f32b297dcee8", "title": "VisionTransformer: Paddle Video Backbone"}, "3433": {"path": "/paddlevideo/modeling/backbones/vit.py:299-327", "hash": "e920126bcde00906a3bb031d984e1571", "title": "Initialize ViT Backbone Model Parameters"}, "3434": {"path": "/paddlevideo/modeling/backbones/vit.py:328-350", "hash": "3aae59338b35bd29fcf9f1c61c2dbbe3", "title": "Initializing Vision Transformer Components"}, "3435": {"path": "/paddlevideo/modeling/backbones/vit.py:351-379", "hash": "2c0ce9caf48e93a75c4580ef3baf72cf", "title": "Vision Transformer Model Initialization"}, "3436": {"path": "/paddlevideo/modeling/backbones/vit.py:380-405", "hash": "4c7ef97b01afb5cc66e07c29579eb994", "title": "Vision Transformer Forward Initialization"}, "3437": {"path": "/paddlevideo/modeling/backbones/vit.py:406-430", "hash": "117a5075ec3714ab6a7a632f3c3853f2", "title": "Relative Position Embeddings in Vision Transformer Model"}, "3438": {"path": "/paddlevideo/modeling/backbones/vit.py:431-455", "hash": "67c775000dedca37bf529cb2f4effedc", "title": "VIT Time Embedding and Attention Processing"}, "3439": {"path": "/paddlevideo/modeling/backbones/vit.py:456-465", "hash": "6044521cdc323aa7d06df568ed79f510", "title": "Vit Model Frame Averaging Embeddings"}, "3440": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py", "hash": "a548272c7d6b08482a90b78c5d10df21", "title": "VisionTransformer_tweaks: Time-Based Feature Modification"}, "3441": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:1-32", "hash": "163a0037b25c37a6632cb1f8dcfda775", "title": "VisionTransformer_tweaks Model"}, "3442": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:35-69", "hash": "78352dc8e36e1346ef26cb25a3d04bc9", "title": "Dropout and Bounding Box Functions"}, "3443": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:70-107", "hash": "8ace0dc760c31b91ed452f0a3a2215af", "title": "Vit Tweak: Mlp Class with DropPath"}, "3444": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:108-144", "hash": 
"0401902aa6b16a647ab6f29097bbc013", "title": "Multi-Head Self-Attention Layer for Transformers"}, "3445": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:145-178", "hash": "191644ca105916df53648c86415e00fa", "title": "Attention Block Class with QKV Decomposition"}, "3446": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:179-202", "hash": "6b722b6f674b8db929e19361d8ea469e", "title": "Dynamic Norm Layer Instantiation"}, "3447": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:203-221", "hash": "1dd8e532402098aaf473db917d76023a", "title": "Temporal Attention Module Initialization"}, "3448": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:223-247", "hash": "afdd811a27dcf7e98c719f834abc51f9", "title": "Flexible ViT Backbone with MLP and Attention Types"}, "3449": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:248-271", "hash": "0086b93679364518612178f754ffddbc", "title": "Spatial Attention in ViT Models"}, "3450": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:273-302", "hash": "11c9d5308350dd7336516a92f9b3ba90", "title": "PatchEmbed Class in PaddleVideo Library"}, "3451": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:303-331", "hash": "63acd61c3f3ecb4c9eb0f76cddf1d5a8", "title": "Vision Transformer with Patch Input"}, "3452": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:332-360", "hash": "25b329afe0081ad6b0180838891a034b", "title": "Vit Model Initialization"}, "3453": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:361-384", "hash": "0e89cbbaa0a22e51f56928484a67e76f", "title": "Initializing Transformer Embeddings"}, "3454": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:385-414", "hash": "d3e20fec384349febb7c119899ebf086", "title": "Transformer Model Customization"}, "3455": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:415-441", "hash": "cb06972d145661ed99980ac66175e4ee", "title": "Initializing Backbone Network Parameters"}, "3456": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:442-464", "hash": "60855cda21ff7311d108d7fb6dfe8e17", "title": "Transformer Model Forward Pass: Positional Embedding Reshaping"}, "3457": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:465-487", "hash": "b7ca73dab6fe4bfcc7d31a35dd659447", "title": "Vit Tweaks: Position & Time Embeddings"}, "3458": {"path": "/paddlevideo/modeling/backbones/vit_tweaks.py:488-515", "hash": "875fdb7c9204895c5bb0fe1223694a0a", "title": "Dynamic Frame Averaging for Vision Transformers"}, "3459": {"path": "/paddlevideo/modeling/backbones/yowo.py", "hash": "6d813eb363bbe30b4d86f2d5f8860de7", "title": "YOWO Video Backbone for Paddle"}, "3460": {"path": "/paddlevideo/modeling/backbones/yowo.py:1-28", "hash": "ac8f79947a7d1b74b8f9eca5ce085236", "title": "CAM Module: PaddleVideo Backbone Custom Layer"}, "3461": {"path": "/paddlevideo/modeling/backbones/yowo.py:29-53", "hash": "0d1a96e0efd74d43e31eb4b4d8145603", "title": "Channel-wise Attention CFAMPBlock"}, "3462": {"path": "/paddlevideo/modeling/backbones/yowo.py:54-79", "hash": "b15f2e6b60845f1f5e55e0a7896a3772", "title": "YOWO Backbone: ConvLayers, BN, ReLU, CAM_Module"}, "3463": {"path": "/paddlevideo/modeling/backbones/yowo.py:80-108", "hash": "5c26f0be612429a40fa78e5f816ef574", "title": "YOWO Model Initialization"}, "3464": {"path": "/paddlevideo/modeling/backbones/yowo.py:109-129", "hash": "1c8e90f5d0b9a72fccdfbfa11df2c7e7", "title": "YOLOv5 Backbone Loading Weights"}, "3465": {"path": "/paddlevideo/modeling/backbones/yowo.py:130-150", "hash": 
"9fa2c93e462f148bb07d56e9b41acc19", "title": "YOWO Model Loading and Processing"}, "3466": {"path": "/paddlevideo/modeling/bbox_utils.py", "hash": "1eb491813fff5a32cb1eae9cbac785f7", "title": "Bounding Box Utilities"}, "3467": {"path": "/paddlevideo/modeling/bbox_utils.py:1-30", "hash": "2a73114bd0d84dfde2d5219f8eac6993", "title": "Bounding Box Delta Calculator"}, "3468": {"path": "/paddlevideo/modeling/bbox_utils.py:31-63", "hash": "7c1b9f6e26a9e5c855fa3f46557b737a", "title": "Weighted Bounding Box Differential Conversion"}, "3469": {"path": "/paddlevideo/modeling/bbox_utils.py:65-102", "hash": "22e54b8f788c0470e3fd4047a09b39cd", "title": "Bounding Box Utilities"}, "3470": {"path": "/paddlevideo/modeling/bbox_utils.py:103-139", "hash": "93d47f796fe21906e485498b12b3a16c", "title": "Bounding Boxes Filters and Overlaps Calculation"}, "3471": {"path": "/paddlevideo/modeling/bbox_utils.py:140-176", "hash": "22a59432756f7f35e6782e664f7a6b75", "title": "Grid and YOLO Box Utilities"}, "3472": {"path": "/paddlevideo/modeling/bbox_utils.py:177-204", "hash": "f8be8d60697747632f78c3351d40e48a", "title": "Bounding Box Utilities"}, "3473": {"path": "/paddlevideo/modeling/bbox_utils.py:205-237", "hash": "7bda48665388a7e7affdcb5cd5ca38b5", "title": "Bounding Box IoU Calculator"}, "3474": {"path": "/paddlevideo/modeling/bbox_utils.py:238-268", "hash": "510bc42ed7e2b00ef6d99ae4d3442190", "title": "Intersection Over Union Calculator for Bounding Boxes"}, "3475": {"path": "/paddlevideo/modeling/bbox_utils.py:269-304", "hash": "69eac5d5a3497a69cb90ee2547cd6db1", "title": "Rotated Bounding Box Conversion"}, "3476": {"path": "/paddlevideo/modeling/bbox_utils.py:305-339", "hash": "63d645cc81c60f97c93978715a50119a", "title": "Bounding Box Regression Computation"}, "3477": {"path": "/paddlevideo/modeling/bbox_utils.py:340-378", "hash": "d7325d30b52547b54f022d6d99e6da35", "title": "Delta BBox Calculator"}, "3478": {"path": "/paddlevideo/modeling/bbox_utils.py:379-417", "hash": "ff92e35e48ebc7662a7c0ed08a77ad22", "title": "Decoding Bounding Boxes in Paddle Video"}, "3479": {"path": "/paddlevideo/modeling/bbox_utils.py:418-447", "hash": "c602fe8c7dea13ca1fb1b98648254d99", "title": "Bounding Box Dimensions and Angle Calculator"}, "3480": {"path": "/paddlevideo/modeling/bbox_utils.py:448-475", "hash": "4a4f68045c9d4d210ed43f52440e014e", "title": "Find Best Begin Point in Coordinates"}, "3481": {"path": "/paddlevideo/modeling/bbox_utils.py:476-503", "hash": "1bc8dc77e5d310147d21104a3e9c60a7", "title": "Rotated Rectangle to Polygon Conversion"}, "3482": {"path": "/paddlevideo/modeling/bbox_utils.py:504-528", "hash": "9835ef8fc487bbf5a103b3c4465accb4", "title": "Rotating Rectangles to Polygons"}, "3483": {"path": "/paddlevideo/modeling/builder.py", "hash": "432511b5183f421d54cc0ecbdaa26b7e", "title": "Video Model Builder with Paddle"}, "3484": {"path": "/paddlevideo/modeling/builder.py:1-19", "hash": "1669a9d36b0d8fd7b265ef111097b3ab", "title": "Video Object Detection Model Builder"}, "3485": {"path": "/paddlevideo/modeling/builder.py:22-73", "hash": "8618aaa75ac267357e95266674533d2e", "title": "Model Builder Functions"}, "3486": {"path": "/paddlevideo/modeling/builder.py:74-116", "hash": "d439be6fb4327f79c3bac31be9fae7cd", "title": "Dynamically Building Paddle Video Components"}, "3487": {"path": "/paddlevideo/modeling/builder.py:117-127", "hash": "367a6276d74ebb3ebc37a50f08f976bd", "title": "Framework-Based Model Builder"}, "3488": {"path": "/paddlevideo/modeling/framework/__init__.py", "hash": 
"d49482c4dcead0a03cb3aefe8d39b3f6", "title": "PaddleVideo Framework Base Classes"}, "3489": {"path": "/paddlevideo/modeling/framework/__init__.py:1-24", "hash": "7e305ff962e1f8983d8d7c805ffff655", "title": "PaddleVideo Framework Base Classes"}, "3490": {"path": "/paddlevideo/modeling/framework/__init__.py:25-28", "hash": "97213dbae08d7f28284ca6f7f1fe2cf0", "title": "Model Classes in PaddleVideo Framework"}, "3491": {"path": "/paddlevideo/modeling/framework/detectors/__init__.py", "hash": "74d69ae58b26f238cf2cf4a52e30933d", "title": "Detector Imports in PaddleVideo"}, "3492": {"path": "/paddlevideo/modeling/framework/detectors/base.py", "hash": "cbdfde8ee2714d2022f05e905dad6857", "title": "Abstract BaseDetector Class"}, "3493": {"path": "/paddlevideo/modeling/framework/detectors/base.py:1-36", "hash": "1a1b462fae43ab6b14ce12fdc4165783", "title": "Base Detector Class: Foundation for Detection Models"}, "3494": {"path": "/paddlevideo/modeling/framework/detectors/base.py:37-51", "hash": "78af2c1b9ee31995ed603dd4a19a8f13", "title": "Abstract Base Classes for ML Training, Validation, and Testing"}, "3495": {"path": "/paddlevideo/modeling/framework/detectors/fast_rcnn.py", "hash": "995c0920fe164fdca7fbf01f5ac940e4", "title": "Fast R-CNN Detector"}, "3496": {"path": "/paddlevideo/modeling/framework/detectors/fast_rcnn.py:1-30", "hash": "fca462237f532a3cebc98d4f3dc5fa50", "title": "Fast R-CNN Detector"}, "3497": {"path": "/paddlevideo/modeling/framework/detectors/fast_rcnn.py:31-34", "hash": "5a95b7228206c205e13f8982a18f8829", "title": "Fast RCNN Detector Builder"}, "3498": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py", "hash": "4a436e3a42ab688dcb93bf11a51eb687", "title": "Two-Stage Slowfast Detector"}, "3499": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py:1-32", "hash": "995e8abd8351d5a038cd1b66642e5cc3", "title": "Two-Stage Detector Base Class"}, "3500": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py:33-64", "hash": "eb4a046d83796722acdec9b2b29b0577", "title": "Two-Stage Object Detector Class"}, "3501": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py:66-91", "hash": "ddef4a084e2def9405e69f65266996af", "title": "Two-Stage Detector Feature Extraction and Loss Computation"}, "3502": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py:92-124", "hash": "4a4f16803a1559b2b8f7689bd33fb7ae", "title": "SlowFast Model Detectors: Val, Test, Infer"}, "3503": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py:125-152", "hash": "5d52e1edda180f25665886c2a7895558", "title": "Two-Stage Detector Data Retrieval"}, "3504": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py:153-176", "hash": "0b899b997adff404d829a4f0925b1f37", "title": "Two-Stage Detector: GT Bounding Boxes Generation"}, "3505": {"path": "/paddlevideo/modeling/framework/detectors/two_stage.py:178-186", "hash": "8bd3f18946bfd6950df46e331a75871a", "title": "Selecting Entity IDs with Paddle's Index Select"}, "3506": {"path": "/paddlevideo/modeling/framework/estimators/__init__.py", "hash": "4d6dd35e01f8874e14c1ce9f539d10a7", "title": "Estimators Import and Definition"}, "3507": {"path": "/paddlevideo/modeling/framework/estimators/base.py", "hash": "542f6746ef6bcf445680ee009e28a944", "title": "PaddleVideo BaseEstimator Class"}, "3508": {"path": "/paddlevideo/modeling/framework/estimators/base.py:1-34", "hash": "9567eb47ee9c89ecbf73794c1f8865f8", "title": "BaseEstimator: PaddleVideo's Estimator Foundation"}, "3509": {"path": 
"/paddlevideo/modeling/framework/estimators/base.py:35-66", "hash": "bd9178c299fbd0c650e96fc7b6a487f2", "title": "Versatile Estimator Framework"}, "3510": {"path": "/paddlevideo/modeling/framework/estimators/base.py:67-82", "hash": "63038ac9856cabe6c130e5a2f83008c2", "title": "Abstract Methods for Model Validation"}, "3511": {"path": "/paddlevideo/modeling/framework/estimators/depth_estimator.py", "hash": "c5ab2f35918c01f76881ee9ee7afa13a", "title": "DepthEstimator: Feature Extraction and Loss Metrics"}, "3512": {"path": "/paddlevideo/modeling/framework/estimators/depth_estimator.py:1-31", "hash": "e9a1512a1fc55fe0e21cc9a52648b649", "title": "DepthEstimator: Framework for Feature Extraction"}, "3513": {"path": "/paddlevideo/modeling/framework/estimators/depth_estimator.py:32-58", "hash": "6b241c504e96a2e43926623cbed9125f", "title": "Depth Estimator Steps: Train, Validate, Test, Infer"}, "3514": {"path": "/paddlevideo/modeling/framework/estimators/depth_estimator.py:59-59", "hash": "b74c9a1fcc3531c44f51d48e92b7dec0", "title": "Depth Estimator Results"}, "3515": {"path": "/paddlevideo/modeling/framework/localizers/__init__.py", "hash": "2e9f2c432aca496aa9d1249d5bce9364", "title": "PaddleVideo Localizers: Handling Video Localization Tasks"}, "3516": {"path": "/paddlevideo/modeling/framework/localizers/base.py", "hash": "e5ec47c319c94a025edfb590989c5993", "title": "Localization Model Base Class (Python)"}, "3517": {"path": "/paddlevideo/modeling/framework/localizers/base.py:1-27", "hash": "04b5021c52bede6c08ba0fb219f7bf50", "title": "Base Class for Localization Models"}, "3518": {"path": "/paddlevideo/modeling/framework/localizers/base.py:28-56", "hash": "0719a456a7f6130fadd3aca7007d54ca", "title": "Localizer Model Initialization"}, "3519": {"path": "/paddlevideo/modeling/framework/localizers/base.py:58-74", "hash": "26d325c8400d5fa9f99effce415bb1d3", "title": "Abstract Classes for Model Steps"}, "3520": {"path": "/paddlevideo/modeling/framework/localizers/bmn_localizer.py", "hash": "4310de76da1da7ee89afcc44f8b3759c", "title": "BMN Localizer Model for PaddleVideo"}, "3521": {"path": "/paddlevideo/modeling/framework/localizers/bmn_localizer.py:1-36", "hash": "76eb9dca1647809b36f22b3ecef89d0e", "title": "BMNLocalizer: PaddleVideo's Localization Framework"}, "3522": {"path": "/paddlevideo/modeling/framework/localizers/bmn_localizer.py:37-69", "hash": "a282cc9666cff1dc7b4e10234b6b34ca", "title": "BMN Localizer: Training, Testing, Inferring"}, "3523": {"path": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py", "hash": "160109cbcd5687b25197f81fd3b0c6c9", "title": "YOWO Localizer Evaluation"}, "3524": {"path": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:1-33", "hash": "9a63e9a69e42867c7724c1aeab5d7110", "title": "YOWO Localizer Class: PaddleVideo"}, "3525": {"path": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:34-67", "hash": "8e43173b3be367041e8d059fd8125eb3", "title": "Model Training and Validation Process"}, "3526": {"path": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:68-90", "hash": "4dfcf237c9e7d2149de26e67c482f4db", "title": "Non-Maximum Suppression and Precision Calculation"}, "3527": {"path": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:91-125", "hash": "9549755e1497945f55e84cbb1be9f3c2", "title": "YOLOv3 Localizer Metrics Calculator"}, "3528": {"path": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:126-147", "hash": "8652bdc3ba23c929ba1371ea477b2987", "title": "YOWO Localizer Evaluation: 
Precision, Recall, F-Score"}, "3529": {"path": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:149-161", "hash": "fb118b9272a8a2654e7892ed14651a89", "title": "YOWO Localizer Functions"}, "3530": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py", "hash": "bfd609ca9c7b5e7486b5ccfea57d2a01", "title": "YOWO Utils: Non-Max Suppression and Tensor Movement"}, "3531": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:1-36", "hash": "81e65b633220ba59e7710cc486090d0e", "title": "YOWO Localizers: Truths Length & NMS"}, "3532": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:37-67", "hash": "8bb9b5d2a1cfbb681e39ded4a518b00d", "title": "Functional Definitions for Bounding Boxes"}, "3533": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:68-94", "hash": "44b5fd6b05da6ea2163aee830f09edeb", "title": "Box Regression via Reshaping and Sigmoid"}, "3534": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:96-122", "hash": "c999fd6586965e8fc54de0491e0a5ec5", "title": "YOLOv2 Bounding Box Processing"}, "3535": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:123-153", "hash": "4898fc30c21ee470c6257a68b23fd8fd", "title": "YOLO Bounding Box Extractor"}, "3536": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:154-178", "hash": "73bd6c07816f417b8e3889d8cd08cc73", "title": "YOLO Object Detection Utilities"}, "3537": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:179-213", "hash": "4fc87ce334a37ae4aa963b18e7234b66", "title": "Intersection-Over-Union Calculator"}, "3538": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:214-241", "hash": "9eae82f4b1cb058e894545e6c0707c9b", "title": "Intersection Over Union Code"}, "3539": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:244-268", "hash": "882e1193aae713b79068d0150da1ab5b", "title": "YOLO Localizer Ground Truth Builder"}, "3540": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:269-288", "hash": "f1f497f1f527b6dd2413c2d4d208b931", "title": "IoU Calculation for YoWo Localizers"}, "3541": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:289-315", "hash": "17f927d886761fb64547112fbff82521", "title": "IoU-Based Masking for Bounding Box Confidences"}, "3542": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:316-338", "hash": "3e2ad226e82d6fec29625fd8a264bd3f", "title": "YOLO Anchor Box Selection"}, "3543": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:340-357", "hash": "0e4d063f1ce69d716427323cb6442d81", "title": "Object Localization Metric"}, "3544": {"path": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:358-359", "hash": "0a56a06a90c434a7f18729605a7b8dd2", "title": "Localizers Framework Functions"}, "3545": {"path": "/paddlevideo/modeling/framework/multimodal/__init__.py", "hash": "16e7ede9cfcb091a61695a433e4542d1", "title": "Multimodal Model Initialization"}, "3546": {"path": "/paddlevideo/modeling/framework/multimodal/actbert.py", "hash": "0af35f36994c2a91a8dead5e3b0a16a3", "title": "Introducing ActBert: Multimodal Model Training"}, "3547": {"path": "/paddlevideo/modeling/framework/multimodal/actbert.py:1-27", "hash": "f5899f7de5a08fd93a845ca488a06c98", "title": "ActBert: Multimodal Model Framework"}, "3548": {"path": "/paddlevideo/modeling/framework/multimodal/actbert.py:28-46", "hash": "eb2ab6a430aef7ed347c6aacc366235e", "title": "ActBert Dataset Train and Val Steps"}, "3549": {"path": 
"/paddlevideo/modeling/framework/multimodal/actbert.py:47-64", "hash": "bc11af1fc51ee84d436ff6db24c9b75f", "title": "Multimodal ACT-BERT Model"}, "3550": {"path": "/paddlevideo/modeling/framework/multimodal/base.py", "hash": "1f8ab578f31cc44de34c93a94367910c", "title": "Multimodal Base Class for PaddleVideo"}, "3551": {"path": "/paddlevideo/modeling/framework/multimodal/base.py:1-32", "hash": "398ac04aae0d84d1b7576dd38037cc0a", "title": "Multimodal Base Class for PaddleVideo"}, "3552": {"path": "/paddlevideo/modeling/framework/multimodal/base.py:33-63", "hash": "f962038917c4f2f29b108d316c47e6c1", "title": "Multimodal Base Class with Selectable Step Functions"}, "3553": {"path": "/paddlevideo/modeling/framework/multimodal/base.py:65-81", "hash": "bc615f7c65319e237596891b9892e639", "title": "Abstract Methods for Validation, Testing, and Inference"}, "3554": {"path": "/paddlevideo/modeling/framework/partitioners/__init__.py", "hash": "9706e51cb51c8bbcaff3c7d46aad7ec8", "title": "PaddleVideo Partitioner Initialization"}, "3555": {"path": "/paddlevideo/modeling/framework/partitioners/base.py", "hash": "b12171257da0c516ebeafdc2f0a00b61", "title": "BaseModelPartitioner: PaddleVideo's Modeling Framework"}, "3556": {"path": "/paddlevideo/modeling/framework/partitioners/base.py:1-27", "hash": "891eeced2764f40573ef0d33d3502940", "title": "Python Base Partitioner Class for PaddleVideo"}, "3557": {"path": "/paddlevideo/modeling/framework/partitioners/base.py:28-55", "hash": "babffbaf165f5b72fe6d3b81aad8b244", "title": "Partitioned Model Initialization"}, "3558": {"path": "/paddlevideo/modeling/framework/partitioners/base.py:56-84", "hash": "ff61a4d3bb1badcbe950560d65189e4f", "title": "Base Model Partitioner Class"}, "3559": {"path": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py", "hash": "1b52da32dddedad198e15ef9e15a2bc5", "title": "TransNetV2 Partitioner in PaddleVideo"}, "3560": {"path": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:1-32", "hash": "ca7c9e4964ee0959310270f0f2bd54a6", "title": "TransNetV2 Partitioner for PaddleVideo"}, "3561": {"path": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:33-54", "hash": "001629ec2b99c9e826e0104f77a0cb80", "title": "TransNetV2 Partitioner Loss Metrics"}, "3562": {"path": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:55-68", "hash": "5be6e179da11bb81bc528d38fb9e9469", "title": "TransnetV2 Partitioner Methods"}, "3563": {"path": "/paddlevideo/modeling/framework/recognizers/__init__.py", "hash": "f00bf8d794a8c50aaf356e7716dcf297", "title": "PaddleVideo Recognizers: Action & Motion"}, "3564": {"path": "/paddlevideo/modeling/framework/recognizers/__init__.py:1-23", "hash": "88c96cc42fe9b23590c21b51d7cd61d0", "title": "PaddleVideo Recognizers: A Versatile Toolkit"}, "3565": {"path": "/paddlevideo/modeling/framework/recognizers/__init__.py:25-30", "hash": "6575e4b81f36d1dcff2b7d76c06824fe", "title": "PaddleVideo Recognizer Models"}, "3566": {"path": "/paddlevideo/modeling/framework/recognizers/base.py", "hash": "87bd61d7e664633212c7a6a9aecfa2cb", "title": "Base Recognizer Model in PaddleVideo"}, "3567": {"path": "/paddlevideo/modeling/framework/recognizers/base.py:1-33", "hash": "acf2a27498e11e141ce3c4605a980427", "title": "Base Recognizer Class: Override Train, Valid, Test Steps"}, "3568": {"path": "/paddlevideo/modeling/framework/recognizers/base.py:34-66", "hash": "17dd47ff9cff4a55cbddb92c41a09120", "title": "Initialize and Train Model's Head"}, "3569": {"path": 
"/paddlevideo/modeling/framework/recognizers/base.py:67-81", "hash": "8d9745a49b0cbb2c4c951998d78f76d9", "title": "Abstract Base Recognizer Steps in PaddleVideo"}, "3570": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py", "hash": "28efe264f754e49622d7b8ba41c53ebc", "title": "1D Recognizer Model in PaddleVideo"}, "3571": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:1-29", "hash": "fb6d0be018e373c206e280ddc8b50396", "title": "1D Recognizer Model Framework in PaddleVideo"}, "3572": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:30-61", "hash": "4ad7be8e2f9093dae0c2de154ab75cf8", "title": "1D Recognizer Model Processing Image and Audio Data"}, "3573": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:62-91", "hash": "fa24583216596690f9aec569f142d5ff", "title": "1D Recognizer Model Framework"}, "3574": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:93-111", "hash": "fe823c1191d815f90137b78bb01bdcf0", "title": "Shared Implementation in Validating, Testing, and Inferring Steps"}, "3575": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py", "hash": "e6cdba583a0043f48446e18ad905b818", "title": "2D Video Recognizer in PaddleVideo"}, "3576": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:1-27", "hash": "fec983d0706190c97bf9074358b5c119", "title": "2D Recognizer Model Framework in PaddleVideo"}, "3577": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:28-60", "hash": "cdeaa6c74cca5ce4e85fe9ca3cf5b5fa", "title": "Video Analysis Recognizer2D Model"}, "3578": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:60-69", "hash": "81949ad652723c8d49ffbbad137391ad", "title": "Recognizer2D Class and Methods: Forward Net and Infer Step"}, "3579": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py", "hash": "f18d123d3bfc63ba0d696b749da24354", "title": "Recognizer3D: 3D Object Recognition Framework"}, "3580": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:1-33", "hash": "691508594c5da66b4991d9c8892fc609", "title": "3D Recognizer Framework in PaddleVideo"}, "3581": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:34-64", "hash": "010581b320ebd5c756bad73528758b6f", "title": "Training and Validation Steps in 3D Recognizer Model"}, "3582": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:66-93", "hash": "2377a47bc3341f0043bf7a5ee2f73469", "title": "Reshape Input for ResNet3dSlowOnly"}, "3583": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py", "hash": "57b8b2e1589049872840cfb9fc302345", "title": "3D Recognizer Model in PaddleVideo"}, "3584": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:1-31", "hash": "a111a274dc01d959d30afab201a33309", "title": "3D MRI Recognizer Framework"}, "3585": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:32-65", "hash": "124260c888336561acd5fb509e9060ef", "title": "3D MRI Recognizer Model Training and Testing"}, "3586": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:66-81", "hash": "5cb52f5e0497ab27bf3bef6cb76e492a", "title": "Dual Test/Infer Steps in Recognizer3D MRI Model"}, "3587": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py", "hash": "5ea9dd786f7dca9ccebbb509bfdcc366", "title": "Recognizer Distillation in PaddleVideo"}, "3588": {"path": 
"/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:1-34", "hash": "7c943f774171e5e019db600cd30cedb5", "title": "Recognizer Distillation Layer for PaddleVideo"}, "3589": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:35-60", "hash": "f595a20998203c7d1a49618c94e002f7", "title": "Distillation Model Initialization"}, "3590": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:61-85", "hash": "f7fbc6c12861130fe6fe2b80061ca956", "title": "Distillation Recognizer Model Initialization"}, "3591": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:86-114", "hash": "842cd7f75c9acf70054a881301df2fc7", "title": "Distillation Recognizer Framework"}, "3592": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:116-136", "hash": "38b544effbcccc2331f384c76311b010", "title": "Distillation Recognizer Loss Calculation"}, "3593": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:137-165", "hash": "7a51104811e2fa84116f6f4d8493cf7e", "title": "Training Step and Recognizer Distillation"}, "3594": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:167-193", "hash": "e95bd3ca5d93db6cf2e45af44dbc61fd", "title": "Distillation Recognizer Class"}, "3595": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:195-224", "hash": "ddba3abca9223c0adeca36b00b59694e", "title": "Evaluating Student Model in Image Recognition"}, "3596": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:225-231", "hash": "7943604882b0857a01814cc9ba0d9bd7", "title": "Selecting and Applying Model"}, "3597": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py", "hash": "d694e080a32b5624e83c1a554c3ec535", "title": "2D Image Classifier with PaddleVideo's RecognizerMRI"}, "3598": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:1-27", "hash": "b474c3acb56c78223374723247eb69b5", "title": "PaddleVideo: RecognizerMRI Framework"}, "3599": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:28-59", "hash": "e32f873677f0e265d1294dc1c0aaab83", "title": "Image Classification Model Definition"}, "3600": {"path": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:60-76", "hash": "6065a096715c2b1e4e0fb0da6dd9adb5", "title": "Testing Steps without Head Loss"}, "3601": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py", "hash": "d9c7bf9bc838d87918547f0f3675f88b", "title": "GCN Recognizer Model Framework"}, "3602": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:1-33", "hash": "abe7c09dda29e16f30d1796a861fa1ea", "title": "GCN Recognizer Model Framework"}, "3603": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:34-66", "hash": "cfa10fac94afa8724a4e49bc9a9de7ab", "title": "RecognizerGCN: Image Classification Model"}, "3604": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:67-87", "hash": "71a5cbc82176b956cf9eeb3e655f5216", "title": "GCN Recognizer Model with Test and Infer Steps"}, "3605": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py", "hash": "e0b2dd9c4b3defc77a842ec5e6d73c78", "title": "MoViNet Recognizer Framework"}, "3606": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:1-33", "hash": "77ad568b5621c91737800dfa713a8d91", "title": "MoViNet Frame Recognizer Class"}, "3607": {"path": 
"/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:34-57", "hash": "fd0bb6cd2d9e5f2edf91cba32e608200", "title": "Training and Validation Steps in Recognizer"}, "3608": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:58-78", "hash": "95579ca288a54230ff94ab674fbf945b", "title": "Model's Forward, Test, and Infer Steps"}, "3609": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py", "hash": "f882ae128808665c2a12c89357697094", "title": "Transformer-Based Recognizer Model"}, "3610": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:1-31", "hash": "67ed64365bb234950eb25d6a471ba2c0", "title": "Transformer-Based Recognizer Framework"}, "3611": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:33-62", "hash": "097207192256865824a31d86ac09c24f", "title": "Training, Validation, Testing Steps in Recognizer Transformer"}, "3612": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:63-86", "hash": "b57d83a7799f255738c36403ea268c0c", "title": "Multi-View Image Inference Model"}, "3613": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:87-98", "hash": "4003abce501babe5332315a2604997bf", "title": "Averaging Method for Recognizer Transformer"}, "3614": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py", "hash": "7d982b2acc16e3ce873d6e6be5bcdfc3", "title": "Recognizer-Transformer MRI Model Code"}, "3615": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:1-32", "hash": "5ddcb09c062f38721723a1e440923c21", "title": "RecognizerTransformer_MRI Model Definition"}, "3616": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:33-63", "hash": "bb4f33c0e491ec4c211e6ce38bfac30a", "title": "Recognizer Transformer Image Classifier"}, "3617": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:65-89", "hash": "ab5eb1965ab7a7463cbcb2d15ccf0aaf", "title": "Average-View Model Inference"}, "3618": {"path": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:90-104", "hash": "f95b391fecbd9a96cbddfdffd609b377", "title": "Combining Scores in Recognizer Transformer"}, "3619": {"path": "/paddlevideo/modeling/framework/segment/__init__.py", "hash": "e8b235c791b00b096fb4ee3cc5bf81a2", "title": "PaddleVideo Segment Models"}, "3620": {"path": "/paddlevideo/modeling/framework/segment/base.py", "hash": "a0560f0c7726e7f962384b92e97af38a", "title": "Semi-Video Object Segmentation Base Class"}, "3621": {"path": "/paddlevideo/modeling/framework/segment/base.py:1-29", "hash": "ce321359973aac51bb5524e797e5c2c6", "title": "Abstract Base Class for Semi-Video Object Segmentation"}, "3622": {"path": "/paddlevideo/modeling/framework/segment/base.py:30-57", "hash": "7ddade8e35f3f9d58d537e2497bc54e7", "title": "Segment Model Initialization and Forward Pass"}, "3623": {"path": "/paddlevideo/modeling/framework/segment/base.py:58-90", "hash": "48aaeea3786c7f0439b85da4723e4024", "title": "Abstract Class for Model Training Phases"}, "3624": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py", "hash": "e88dfb54fd64b0dd9a32f286ffeda48c", "title": "CFBI Model for AI Segmentation"}, "3625": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:1-30", "hash": "6c2c6799efd19e4ee3ccddef66fee0d4", "title": "CFBI Model Python Class"}, "3626": {"path": 
"/paddlevideo/modeling/framework/segment/cfbi.py:31-56", "hash": "664c7a59e89d8f49face423863dc7384", "title": "CFBI Framework Testing"}, "3627": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:57-84", "hash": "57e8be59a380245223c5b29a5982dea5", "title": "PaddleVideo CFBI Else Block"}, "3628": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:85-108", "hash": "3d16e2663ef4a1583f29a11ad415d284", "title": "Segmentation Head Initialization"}, "3629": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:109-127", "hash": "6af60fde61459db547327599920c409f", "title": "Resizing CFBI in PaddleVideo"}, "3630": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:128-144", "hash": "ecd05783b0a8da6f7b4b997f2c196894", "title": "Preparing Frame Embeddings for Attention and Loss Calculation"}, "3631": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:145-164", "hash": "f5d13eab6e7b59d782a789d680bc995c", "title": "Distance Bias Assignment for Frame Sequences"}, "3632": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:165-184", "hash": "d2a195c752c1e2e8406b4499828a49c5", "title": "NotImplementedError Handler"}, "3633": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:185-200", "hash": "9991c900e946ed8cab50b6f10e60f1e0", "title": "Global Matching Evaluation in CFBI Model"}, "3634": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:201-216", "hash": "8b453f365f35b2578a3f9b58d72545d8", "title": "Preparing Input for Local Matching Function"}, "3635": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:217-237", "hash": "80074bea902c3fde52ce98ffe6bd6a8f", "title": "Global/Local Background Subtraction for Image Segmentation"}, "3636": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:238-256", "hash": "ba1d9a6d563f95b4215a739a544fba4e", "title": "Video Segmentation Model Code"}, "3637": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:257-279", "hash": "af0d2bebdefaf1f249c8f5b1a75c62be", "title": "CFBI Attention Calculation"}, "3638": {"path": "/paddlevideo/modeling/framework/segment/cfbi.py:280-286", "hash": "3e78e916165eeb1ae690b1ae248557d7", "title": "Append and Pass for Attention Model"}, "3639": {"path": "/paddlevideo/modeling/framework/segment/utils.py", "hash": "497e15db17786182646ee5ae62a6223d", "title": "PaddleVideo Framework for Object Segment Matching"}, "3640": {"path": "/paddlevideo/modeling/framework/segment/utils.py:1-31", "hash": "c0cc8d91db604453246ced342838737a", "title": "Foreground to Background Distance Conversion"}, "3641": {"path": "/paddlevideo/modeling/framework/segment/utils.py:32-68", "hash": "3f7dc16213e97778465ca73c95cad07e", "title": "Pairwise L2 Distance Matrix Calculator"}, "3642": {"path": "/paddlevideo/modeling/framework/segment/utils.py:69-92", "hash": "0dc1dfe8e6f3c9fad785c9992e45db32", "title": "Pairwise Distance Computation and Feature Extraction"}, "3643": {"path": "/paddlevideo/modeling/framework/segment/utils.py:93-113", "hash": "9d9ea24dbdc859f67fb669f0decbb6ae", "title": "Nearest Neighbor Features Calculator"}, "3644": {"path": "/paddlevideo/modeling/framework/segment/utils.py:114-138", "hash": "525fafd395f1a2f3cfb5209951fbdeff", "title": "Query-Reference Frame Feature Computation"}, "3645": {"path": "/paddlevideo/modeling/framework/segment/utils.py:139-167", "hash": "bfa506e19548fb3bac0b4df9129d5eb3", "title": "Global Matching with Embedding Chunks"}, "3646": {"path": "/paddlevideo/modeling/framework/segment/utils.py:168-186", "hash": "df49b94353a60cc8462281100848d967", "title": 
"Nearest Neighbor Distance Calculator"}, "3647": {"path": "/paddlevideo/modeling/framework/segment/utils.py:187-209", "hash": "2a781d43b4b01d2cfc6d5d504b1ad5ff", "title": "Spatial Pyramid Pooling Point Padding"}, "3648": {"path": "/paddlevideo/modeling/framework/segment/utils.py:210-230", "hash": "6f5196a57c504eb288c1f781f7848524", "title": "Segmentation Method in PaddleVideo Library"}, "3649": {"path": "/paddlevideo/modeling/framework/segment/utils.py:231-257", "hash": "3f182156b1f010e5b60ee5f4b4b6358d", "title": "Nearest Neighbor Video Segment Matching"}, "3650": {"path": "/paddlevideo/modeling/framework/segment/utils.py:258-277", "hash": "50b1a6ec33ad1720ed2f1589af07a974", "title": "Distance to Nearest Neighbor Calculator"}, "3651": {"path": "/paddlevideo/modeling/framework/segment/utils.py:278-299", "hash": "ce45741ef3b1393666fa33edfe4b54d0", "title": "Atrous Tensor Matching Function"}, "3652": {"path": "/paddlevideo/modeling/framework/segment/utils.py:300-318", "hash": "891402125ce07de2c526421bd24efe59", "title": "Image Segmentation Atrous Rate Code"}, "3653": {"path": "/paddlevideo/modeling/framework/segment/utils.py:320-338", "hash": "ebf38e5c8a2f6ed84bd6f26741d4737e", "title": "Concatenate and Pad Embeddings"}, "3654": {"path": "/paddlevideo/modeling/framework/segment/utils.py:339-356", "hash": "8d51670269b7ee742e055f2564590727", "title": "Embedding Reshaper"}, "3655": {"path": "/paddlevideo/modeling/framework/segment/utils.py:357-375", "hash": "3154c480d8c863951e25fe2a8bc5ada0", "title": "Atrous Spatial Pyramid Pooling Padder"}, "3656": {"path": "/paddlevideo/modeling/framework/segment/utils.py:376-394", "hash": "dc05a70d8a6977a27b8c96741513a269", "title": "Flattened Embeddings Conversion"}, "3657": {"path": "/paddlevideo/modeling/framework/segment/utils.py:395-415", "hash": "a57dbfe21dac58547a52967bb1068630", "title": "Feature Selection and Reshaping for Segment Matching"}, "3658": {"path": "/paddlevideo/modeling/framework/segment/utils.py:416-442", "hash": "5eef61da1b434782f5c9a8a9161226b7", "title": "Pairwise L2 Distance Nearest Neighbor Features"}, "3659": {"path": "/paddlevideo/modeling/framework/segment/utils.py:443-464", "hash": "b757c8522e7c829c0dc0faaad9c36d2c", "title": "Downsampling with Bilinear Interpolation"}, "3660": {"path": "/paddlevideo/modeling/framework/segment/utils.py:465-492", "hash": "2da581c56596518b1f1fc73eed414e5a", "title": "Atrous Dilation Pairwise Distance Calculation"}, "3661": {"path": "/paddlevideo/modeling/framework/segment/utils.py:493-513", "hash": "3902f2ec1b1d4adf794b691a45d1c668", "title": "Pairwise L2 Distance Compute Function"}, "3662": {"path": "/paddlevideo/modeling/framework/segment/utils.py:514-537", "hash": "b192741c54fce49a9180e0aa37dabb5f", "title": "Downsizing and Padding Tensors"}, "3663": {"path": "/paddlevideo/modeling/framework/segment/utils.py:539-566", "hash": "054b65ee6b2c2dd5080678abff41204b", "title": "Distance Calculator for Frame Embeddings"}, "3664": {"path": "/paddlevideo/modeling/framework/segment/utils.py:567-584", "hash": "a520cab335bbf72d0381d1ebc7e942f3", "title": "Nearest Neighbor Video Segmentation"}, "3665": {"path": "/paddlevideo/modeling/framework/segment/utils.py:585-609", "hash": "c9f1d434fa8d52a82b07a0e4258c504d", "title": "Parallel Nearest Neighbor Calculation"}, "3666": {"path": "/paddlevideo/modeling/framework/segment/utils.py:610-635", "hash": "4b00d078ed4df7d55eb3c5847c026635", "title": "Pairwise Distance Calculator"}, "3667": {"path": "/paddlevideo/modeling/framework/segment/utils.py:637-662", "hash": 
"6a1a0846e2d9c8fb76802a8dbd07c535", "title": "Atrous Spatial Pyramid Pooling in PaddlePaddle"}, "3668": {"path": "/paddlevideo/modeling/framework/segment/utils.py:663-684", "hash": "1f8e02a2559ebb6143657b4476cd5ab4", "title": "Image Segmentation with Distance Matrix"}, "3669": {"path": "/paddlevideo/modeling/framework/segment/utils.py:685-709", "hash": "c5f23d19670afcdaea34fdc7d2a757b9", "title": "Attention Heads Calculator"}, "3670": {"path": "/paddlevideo/modeling/framework/segment/utils.py:710-736", "hash": "012b4c92ac73b78d29efcf302a444700", "title": "Attention Head Evaluation"}, "3671": {"path": "/paddlevideo/modeling/framework/segment/utils.py:737-754", "hash": "4c20b3affc9ab8f56a1c2e92315325c0", "title": "Total Head Calculation with Stability"}, "3672": {"path": "/paddlevideo/modeling/framework/segmenters/__init__.py", "hash": "84720a22568b454d78570812ff4885cc", "title": "PaddleVideo Segmenter Modules"}, "3673": {"path": "/paddlevideo/modeling/framework/segmenters/asrf.py", "hash": "2c6c3ba13ba3c982786b73d20a1b9639", "title": "ASRF Segmentation Model in PaddleVideo"}, "3674": {"path": "/paddlevideo/modeling/framework/segmenters/asrf.py:1-33", "hash": "7e189d4dda23d8c064b10fb64e3d73b2", "title": "ASRF: PaddleVideo Segmenter Model"}, "3675": {"path": "/paddlevideo/modeling/framework/segmenters/asrf.py:34-67", "hash": "48bcbfb8f15be7695903268f2b20e564", "title": "Segmentation Model Training Code"}, "3676": {"path": "/paddlevideo/modeling/framework/segmenters/asrf.py:69-100", "hash": "0914e42ec1b2c02631d34a965d03eee4", "title": "ASRF Model Validation Step"}, "3677": {"path": "/paddlevideo/modeling/framework/segmenters/asrf.py:101-129", "hash": "407932551cf071e6ced64fd14e3bb490", "title": "ASRF Segmentation Model Inference"}, "3678": {"path": "/paddlevideo/modeling/framework/segmenters/asrf.py:130-143", "hash": "a4eb4042150f8dd54290d8f6f19c296d", "title": "Forward Pass and Sigmoid Application"}, "3679": {"path": "/paddlevideo/modeling/framework/segmenters/base.py", "hash": "ef455dfd5240fa7a2b29e96e46fa5d88", "title": "BaseSegmenter: Foundation for PaddleVideo Segmenters"}, "3680": {"path": "/paddlevideo/modeling/framework/segmenters/base.py:1-30", "hash": "c3d53cbed71982a888d363248b81ac1d", "title": "BaseSegmenter: Foundation for All Segmenters"}, "3681": {"path": "/paddlevideo/modeling/framework/segmenters/base.py:32-63", "hash": "12042e4c8d98b1693b0b15eb42ad1a4b", "title": "Segmenter Base Class Init"}, "3682": {"path": "/paddlevideo/modeling/framework/segmenters/base.py:64-99", "hash": "152307638c62e0f76ebf2b94bda2aca3", "title": "Trainable Segmenter Base Class"}, "3683": {"path": "/paddlevideo/modeling/framework/segmenters/base.py:100-100", "hash": "d23bac234f663a671d44802b96010000", "title": "NotImplementedError in Base Segmenter"}, "3684": {"path": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py", "hash": "ce778e99471f4b5431757041068f53ce", "title": "MS-TCN Video Segmentation Tool"}, "3685": {"path": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:1-33", "hash": "c67db889536cc475144e88679fe5860f", "title": "MS-TCN Video Segmenter"}, "3686": {"path": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:34-70", "hash": "595aa0fdb789fd0e6f3f1051a8d1a4d0", "title": "MS-TCN Segmenter Training and Validation"}, "3687": {"path": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:72-101", "hash": "03d30d1f9e40a7023468953618d7f1da", "title": "MS-TCN Model: Train, Test, and Infer Functions"}, "3688": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py", "hash": 
"189ea9ac00e66401b1aba144023078ec", "title": "Gaussian Smoothing in PaddlePaddle"}, "3689": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:1-30", "hash": "a48b826fb16b9aeff7931d38791a5a65", "title": "Gaussian Smoothing in PaddlePaddle"}, "3690": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:31-62", "hash": "64f1cb2f355206f59b94432f3fc5fa28", "title": "Gaussian Kernel Initialization and Application"}, "3691": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:63-95", "hash": "1d31cbc04070ae3396fbb36ce4e0bf6c", "title": "1D Convolution and Argrelmax Functions for Image Processing"}, "3692": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:97-146", "hash": "54d8103e4aaab63b98b8db1cd909e3df", "title": "Tensor Conversion Functions"}, "3693": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:149-176", "hash": "5eb52370ac1e5053ab45ee80f2e0f52b", "title": "Boundary-Based Action Segmentation"}, "3694": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:177-203", "hash": "cf0eac81fb2fde7b5d9dfc026dfac6d3", "title": "Majority Class Action Segmentation"}, "3695": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:204-242", "hash": "8a4d9ba05198313b24a6cfccc02efa80", "title": "Smoothing and Relabeling Functions"}, "3696": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:243-270", "hash": "413d8b8367b44e773d25e830da844889", "title": "ASRF Post-Processing for Action Segmentation"}, "3697": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:271-301", "hash": "78233ac8db297db218db7bcdd3a824d4", "title": "Tensor Fan-In/Out and Refinement Function"}, "3698": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:302-335", "hash": "d2897532974c2dc6898e136a354222b2", "title": "Neural Network Weight Initialization Code"}, "3699": {"path": "/paddlevideo/modeling/framework/segmenters/utils.py:336-343", "hash": "064edf5e1b1d7e970fbc3cd55023f4cc", "title": "Initialize Weights and Biases for Neural Network Layer"}, "3700": {"path": "/paddlevideo/modeling/heads/__init__.py", "hash": "7a0e36dc6b9dcff4a4dd4333d6eaf407", "title": "Importing Video Heads from PaddleVideo"}, "3701": {"path": "/paddlevideo/modeling/heads/__init__.py:1-25", "hash": "349392f1597ee247a5d3d66ec6ee03d5", "title": "Importing PaddleVideo Heads"}, "3702": {"path": "/paddlevideo/modeling/heads/__init__.py:26-49", "hash": "476acb972e05f6108931983936b6d01c", "title": "Versatile Video Heads Import"}, "3703": {"path": "/paddlevideo/modeling/heads/adds_head.py", "hash": "8ef3a0eda00742eaa08c953b245cc26f", "title": "AddsHead: Object Detection in PaddleVideo"}, "3704": {"path": "/paddlevideo/modeling/heads/adds_head.py:1-33", "hash": "e0b58fb5570c14b298febc678ea729b2", "title": "AddsHead Class Definition"}, "3705": {"path": "/paddlevideo/modeling/heads/adds_head.py:34-62", "hash": "aff9ccc22d3a0f8b92fdfdc28364a7e2", "title": "AddsHead: Initialization and Forward Pass"}, "3706": {"path": "/paddlevideo/modeling/heads/adds_head.py:63-95", "hash": "44b2902e96508beff09982ad5ad9411e", "title": "AddsHead: Compute Error Metrics"}, "3707": {"path": "/paddlevideo/modeling/heads/adds_head.py:96-117", "hash": "224adee361ffcb654f99baf6d9039f65", "title": "Multi-GPU Tensor Averaging"}, "3708": {"path": "/paddlevideo/modeling/heads/adds_head.py:118-144", "hash": "a604d2b45124844162f941e756c1f1ab", "title": "Error Metrics in Depth Prediction"}, "3709": {"path": "/paddlevideo/modeling/heads/adds_head.py:146-146", "hash": 
"7cd3ab5af0463b1117ddcf036b5ebbeb", "title": "Metrics for Regression Models"}, "3710": {"path": "/paddlevideo/modeling/heads/agcn2s_head.py", "hash": "8542a224b4593d99658eaac21e4a8761", "title": "AGCN2s Head: PaddleVideo's Versatile Model Component"}, "3711": {"path": "/paddlevideo/modeling/heads/agcn2s_head.py:1-32", "hash": "1e521fdb4e7f64dd25b1cfa3a8198655", "title": "AGCN2s Head Class in PaddleVideo"}, "3712": {"path": "/paddlevideo/modeling/heads/agcn2s_head.py:33-56", "hash": "c23ef34f9d51f84335372fee6e09774d", "title": "Agcn2sHead: Initialize Linear Layer and Reshape"}, "3713": {"path": "/paddlevideo/modeling/heads/agcn2s_head.py:57-59", "hash": "ad339d46ae161a8e6c883da80e0eba26", "title": "Average-Then-FC Aggregation"}, "3714": {"path": "/paddlevideo/modeling/heads/asrf_head.py", "hash": "df95e21baa757ae34d65dba8826f0066", "title": "ASRF Head: Action Recognition and Metrics"}, "3715": {"path": "/paddlevideo/modeling/heads/asrf_head.py:1-32", "hash": "49f61ca50d5b3d3a5f5cfd291f851ce3", "title": "ASRF Head: PaddleVideo Modeling"}, "3716": {"path": "/paddlevideo/modeling/heads/asrf_head.py:34-63", "hash": "45eb25b4cfed7be627d84915f3cbb5f1", "title": "ASRF Head Model Initialization"}, "3717": {"path": "/paddlevideo/modeling/heads/asrf_head.py:64-98", "hash": "fbb50ce57e1f173e0255d3972ebe1e50", "title": "ASRF Head Model: Forward Pass and Weights Init"}, "3718": {"path": "/paddlevideo/modeling/heads/asrf_head.py:100-136", "hash": "0912b504e69eaeda3bab8f092843a066", "title": "ASRF Head and F1 Score Calculation"}, "3719": {"path": "/paddlevideo/modeling/heads/asrf_head.py:137-170", "hash": "433903a8154e16171a8be4ef23565cda", "title": "ASRF Head: Labels and Levenshtein Distance"}, "3720": {"path": "/paddlevideo/modeling/heads/asrf_head.py:171-200", "hash": "fd20b004a3d9f12d7db0568af7da248f", "title": "Edit Score Calculation with Levenshtein Distance"}, "3721": {"path": "/paddlevideo/modeling/heads/asrf_head.py:201-212", "hash": "c4ff7771783aa50b091d27273b2d8565", "title": "ASRF Head: Calculating Metrics"}, "3722": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py", "hash": "2997780b8291d981c8fdb52e132ea5b1", "title": "LSTM Attention Mechanism for PaddleVideo"}, "3723": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:1-32", "hash": "0b3537589eacd92fc06a3fb74ee4732e", "title": "Attention LSTM Head: PaddleVideo's Neural Network Component"}, "3724": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:33-53", "hash": "add5aa2e2d67c22c54a731dafe6b267a", "title": "Bi-directional LSTM Attention Head for Video Classification"}, "3725": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:54-74", "hash": "47c26f0e1b2f42ccae3e62eb84dac556", "title": "Bidirectional LSTM Attention Head"}, "3726": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:75-95", "hash": "e6d68c45f6bd5ddb2eb92b144b2f9ae8", "title": "Attention LSTM Head in PaddleVideo"}, "3727": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:96-120", "hash": "0b1b85f759901357569614d5acd8ab6a", "title": "Attention LSTM Head"}, "3728": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:121-144", "hash": "44812590752788eaff419cf1bf7627a2", "title": "Attention LSTM Sequence Modeling Head"}, "3729": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:145-173", "hash": "e6097307d934cc41edad8fafd1fefab7", "title": "Attention LSTM Head Metrics"}, "3730": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:174-195", "hash": "17662b44d750ec1567db55aff04d5feb", 
"title": "Bidirectional LSTM Attention Mechanism for Multimodal Fusion"}, "3731": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:196-221", "hash": "9942f201e7b4980594e12059b3e97963", "title": "Attention-based LSTM Head in PaddleVideo"}, "3732": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:222-244", "hash": "b02af59130430792c510571c461eb5c3", "title": "Bi-directional LSTM Attention Head"}, "3733": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:245-267", "hash": "eae3c44c4ef69d188e86bcc414ba51fb", "title": "LSTM-based Attention Pooling for Neural Networks"}, "3734": {"path": "/paddlevideo/modeling/heads/attention_lstm_head.py:268-288", "hash": "c8201d17241b30ee21043e3b14ab4925", "title": "LSTM Attention Head with Loss and Metrics"}, "3735": {"path": "/paddlevideo/modeling/heads/base.py", "hash": "37064db3837c3920a65a7d12ca7ea477", "title": "PaddleVideo Classification Head: Versatile, Distributed"}, "3736": {"path": "/paddlevideo/modeling/heads/base.py:1-34", "hash": "5f8b63777c3c06713be561bf1d49de5a", "title": "Base Head Initializer: Initialize Weights for Subclasses"}, "3737": {"path": "/paddlevideo/modeling/heads/base.py:35-65", "hash": "7e6d40f745bf0fab1e5430b23d7fd7bf", "title": "PaddleVideo Classification Head Base"}, "3738": {"path": "/paddlevideo/modeling/heads/base.py:67-91", "hash": "cff327b7e16576f52fe150d2d2dd453b", "title": "Loss, Accuracy Calculator"}, "3739": {"path": "/paddlevideo/modeling/heads/base.py:92-113", "hash": "ed8dec5d7561bdd691673a9b6fd301d1", "title": "Mix-up Loss for MRI Classification"}, "3740": {"path": "/paddlevideo/modeling/heads/base.py:114-143", "hash": "816d6c2aedafa39ffea5677898fd64b1", "title": "Classification Loss Function"}, "3741": {"path": "/paddlevideo/modeling/heads/base.py:144-164", "hash": "83b222edbd2af11cdbb3a6d73201c6f1", "title": "Uniform Hard/Soft Loss Calculation"}, "3742": {"path": "/paddlevideo/modeling/heads/base.py:165-178", "hash": "62c5d3cd59b99a64f19c2474f1557239", "title": "Average Metrics Across Devices"}, "3743": {"path": "/paddlevideo/modeling/heads/bbox_head.py", "hash": "f9237353026c5a3e54e7289dc3868b48", "title": "BBoxHeadAVA: Box Detection and Evaluation"}, "3744": {"path": "/paddlevideo/modeling/heads/bbox_head.py:1-32", "hash": "3534cbdb22ad3d9c51027308116a7559", "title": "BBoxHeadAVA: Simple RoI Head with Pooling Options"}, "3745": {"path": "/paddlevideo/modeling/heads/bbox_head.py:33-61", "hash": "e54f2ca85e57e04bf5ba85751178d917", "title": "Class BBoxHeadAVA Initialization"}, "3746": {"path": "/paddlevideo/modeling/heads/bbox_head.py:62-83", "hash": "a1243715a75ebaadb34db02e7c75fe53", "title": "BBoxHead Model Initialization"}, "3747": {"path": "/paddlevideo/modeling/heads/bbox_head.py:85-106", "hash": "1d31ab148be850e10c0d3c87d5ba9855", "title": "Bbox Head Classification and Debug Image Init"}, "3748": {"path": "/paddlevideo/modeling/heads/bbox_head.py:107-126", "hash": "c238413e3b2c45dbff38ebd3914c0e6c", "title": "BBox Head Generator in PaddleVideo"}, "3749": {"path": "/paddlevideo/modeling/heads/bbox_head.py:128-152", "hash": "befe3713036d5ed1c5dacad18fe806ea", "title": "PaddleVideo Bbox Head Labeling and Comparison"}, "3750": {"path": "/paddlevideo/modeling/heads/bbox_head.py:153-171", "hash": "02716709c251af6be307a4b8827a2215", "title": "Multi-Label Recall and Precision Calculation"}, "3751": {"path": "/paddlevideo/modeling/heads/bbox_head.py:172-195", "hash": "2627fa986ea65b965a8692617b9d4620", "title": "BBox Head: Recall and Precision Calculation"}, "3752": {"path": 
"/paddlevideo/modeling/heads/bbox_head.py:196-218", "hash": "c17c37ca731bfd7dc36c6dd057879b5e", "title": "Bounding Box Heads: Accuracy and Loss Calculation"}, "3753": {"path": "/paddlevideo/modeling/heads/bbox_head.py:219-225", "hash": "5a41a1ac68021e4baa6876c4bc6a4a0f", "title": "Calculating BBox Scores in PaddleVideo Model"}, "3754": {"path": "/paddlevideo/modeling/heads/cfbi_head.py", "hash": "19295a2196d3be73fa69e8da45844e40", "title": "Multi-Input Collaborative Ensembler Network"}, "3755": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:1-32", "hash": "5cbbf0217b762a04d7b4c551ce1722e6", "title": "IA_Gate Layer Class Definition"}, "3756": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:33-65", "hash": "1564fbcbe8bdd339c3833b71a7125902", "title": "GCT Layer Definition and Initialization"}, "3757": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:66-95", "hash": "5ed32602fc417116d1df340592e34714", "title": "PaddleVideo's CFBI Head"}, "3758": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:96-119", "hash": "2351a8324a824e9e59421d551cd7f10a", "title": "CFBI Head: BatchNorm-ConvNet with ReLU"}, "3759": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:120-160", "hash": "2c729e70dd6b308e859f09eb484adc38", "title": "Convolutional Feature Fusion Head (CFBI)"}, "3760": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:161-193", "hash": "513bc98ed904a243f31c9a83891bc739", "title": "Convolutional Feature Fusion Block and ASPP Module"}, "3761": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:195-218", "hash": "28237bc7909541707db76ba1c5e8f6b1", "title": "ASPP Modules with Global Pooling in CFBI Head"}, "3762": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:220-251", "hash": "6114bb450f93cf4f259f9d64e0b8f86c", "title": "CFBI Head: Deep Feature Extraction and Aggregation"}, "3763": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:254-279", "hash": "a84fa2e26d49115f53326fdd703069d3", "title": "CollaborativeEnsemblerMS Class in PaddleVideo"}, "3764": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:281-306", "hash": "cd93887a252495daa16932626afba923", "title": "Multi-Stage Transformer Layer Initialization"}, "3765": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:308-332", "hash": "da9e845e49c863f8fa6d95dfd9b8aee2", "title": "Feature Extraction and Fusion Model Components"}, "3766": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:333-360", "hash": "90ffcce88afa2c3ae9c88d046f575ae2", "title": "Neural Network Architecture for CV Task"}, "3767": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:361-401", "hash": "422b34a5524a5f5e654c087eb0d5d471", "title": "Instance Segmentation Network Architecture with ASPP Module"}, "3768": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:402-433", "hash": "f3e94d097486c1fdc5b10bde90e09216", "title": "Convolutional Feature Binding IA Head"}, "3769": {"path": "/paddlevideo/modeling/heads/cfbi_head.py:435-448", "hash": "dad61005c9eceb4e1b1d30d74467874f", "title": "Augmented Background Logit Fusion"}, "3770": {"path": "/paddlevideo/modeling/heads/ctrgcn_head.py", "hash": "386f241bedb6eefd0e427e95e299be37", "title": "CTR-GCN Neural Network Head"}, "3771": {"path": "/paddlevideo/modeling/heads/ctrgcn_head.py:1-32", "hash": "8685a894e7d06ed864c235c0b7305456", "title": "CTR-GCN Head in PaddleVideo"}, "3772": {"path": "/paddlevideo/modeling/heads/ctrgcn_head.py:34-63", "hash": "45338577704a81b75c59a7d9e49dfe62", "title": "Neural Network Head Constructor with Dropout"}, "3773": {"path": "/paddlevideo/modeling/heads/ctrgcn_head.py:65-65", 
"hash": "469fa94828181561c4a21cf83c014cd1", "title": "FC Layer in CTRGCN Head"}, "3774": {"path": "/paddlevideo/modeling/heads/i3d_head.py", "hash": "787acd934c8e2485dfedbeaa39ecbb57", "title": "I3D Classification Head in PaddleVideo"}, "3775": {"path": "/paddlevideo/modeling/heads/i3d_head.py:1-31", "hash": "6de47f800601953380c89cc39bac31ec", "title": "I3D Head: Classification for PaddleVideo"}, "3776": {"path": "/paddlevideo/modeling/heads/i3d_head.py:32-59", "hash": "4cca39c61e0f1c6995e671e55224c837", "title": "I3D Head Class Constructor"}, "3777": {"path": "/paddlevideo/modeling/heads/i3d_head.py:60-91", "hash": "3c5da8ef43c89b581abe72175ea371ec", "title": "I3D Head: Feature Processing and Classification"}, "3778": {"path": "/paddlevideo/modeling/heads/i3d_head.py:92-95", "hash": "b3519cdaec142a96e175d22d5bf7e2af", "title": "Output Layer for PaddleVideo Classification"}, "3779": {"path": "/paddlevideo/modeling/heads/movinet_head.py", "hash": "fd4a4637458892a759578e567f7aa23e", "title": "MoViNetHead: Custom Head for Video Classification"}, "3780": {"path": "/paddlevideo/modeling/heads/ms_tcn_head.py", "hash": "ac4225305eff267ec80e86e29283bc0a", "title": "MS-TCN Head: Loss Calculation"}, "3781": {"path": "/paddlevideo/modeling/heads/ms_tcn_head.py:1-33", "hash": "2b38514f6bbe6a7832fa4068e1d940bb", "title": "MS-TCN Head: CrossEntropy and MSE Losses"}, "3782": {"path": "/paddlevideo/modeling/heads/ms_tcn_head.py:34-68", "hash": "f99a173d7ea5e36907871ab0f43e7eb0", "title": "MS-TCN Head: Loss and F1 Score Calculation"}, "3783": {"path": "/paddlevideo/modeling/heads/ms_tcn_head.py:69-105", "hash": "79a6b610568788891493fc01af8970aa", "title": "F1 Score Calculation and Label Extraction Algorithm"}, "3784": {"path": "/paddlevideo/modeling/heads/ms_tcn_head.py:106-137", "hash": "b952a96f66c0cae7ca3b195694d3e754", "title": "Edit Score Calculation Functions"}, "3785": {"path": "/paddlevideo/modeling/heads/ms_tcn_head.py:138-165", "hash": "c53db36308edb252f7893422f347c38d", "title": "F-score Calculator for Labeled Sequences"}, "3786": {"path": "/paddlevideo/modeling/heads/pptimesformer_head.py", "hash": "5db56e6fef45fa2021ca381e287711b0", "title": "PaddlePaddle TimeSformer Head"}, "3787": {"path": "/paddlevideo/modeling/heads/pptimesformer_head.py:1-30", "hash": "52f069299f69ebd485962307ca512d18", "title": "Introducing ppTimeSformerHead Class"}, "3788": {"path": "/paddlevideo/modeling/heads/pptimesformer_head.py:31-58", "hash": "f96ceedc5e775508a0888b7a80257aaf", "title": "PPTimesformerHead: Paddle Video Model Class"}, "3789": {"path": "/paddlevideo/modeling/heads/pptimesformer_head.py:59-74", "hash": "cdc21d18271d28737bfb0e5d3fdfe848", "title": "PPTimesformer Head Definition"}, "3790": {"path": "/paddlevideo/modeling/heads/pptsm_head.py", "hash": "d815fc9a525f12763be5f0d14bf9af3e", "title": "PaddlePaddle Video: PPTSMHead Initialization"}, "3791": {"path": "/paddlevideo/modeling/heads/pptsm_head.py:1-31", "hash": "e18be863a92905feb4ca09db6dbfc39a", "title": "ppTSMHead: PaddleVideo Registry Class"}, "3792": {"path": "/paddlevideo/modeling/heads/pptsm_head.py:32-58", "hash": "26d5a4e0ce88090cd215d89a2b1e70b0", "title": "PPTSM Head Initialization"}, "3793": {"path": "/paddlevideo/modeling/heads/pptsm_head.py:59-87", "hash": "ecc653049729258ad0ac89b2722fefad", "title": "PPTSM Head Initialization"}, "3794": {"path": "/paddlevideo/modeling/heads/pptsm_head.py:88-92", "hash": "530aa2b0ec8126772a39369909a9ba4c", "title": "PaddleVideo's PptsmHead FC Function"}, "3795": {"path": 
"/paddlevideo/modeling/heads/pptsn_head.py", "hash": "987c38f13c37dc0bbf27866190a270ef", "title": "PaddlePaddle PP-TSN Head Classification"}, "3796": {"path": "/paddlevideo/modeling/heads/pptsn_head.py:1-30", "hash": "23af7c74394c1e0910acdba8b7730179", "title": "Python PP-TSN Head Implementation"}, "3797": {"path": "/paddlevideo/modeling/heads/pptsn_head.py:31-54", "hash": "7818ff5c9f40719261c0c500b0909e6f", "title": "Adaptive Pooling PPTSN Head"}, "3798": {"path": "/paddlevideo/modeling/heads/pptsn_head.py:56-84", "hash": "9111a3c7bef36dc0b11da620a4350d2a", "title": "PaddlePaddle Classification Head Code"}, "3799": {"path": "/paddlevideo/modeling/heads/pptsn_head.py:85-103", "hash": "58f43449132426469bca8ce96c26a3ae", "title": "PPTSN Head Processing"}, "3800": {"path": "/paddlevideo/modeling/heads/roi_extractor.py", "hash": "33990fc44ccd2bcf8636b1c7bfaa24b6", "title": "RoIAlign: Region Feature Alignment"}, "3801": {"path": "/paddlevideo/modeling/heads/roi_extractor.py:1-31", "hash": "19b7ca545bf9fb355141b9598d9ea77f", "title": "RoIAlign: Feature Alignment Tool"}, "3802": {"path": "/paddlevideo/modeling/heads/roi_extractor.py:32-53", "hash": "b221d88f1a1d9b5330ea22ce8d8a5c6c", "title": "ROI Alignment with PaddlePaddle"}, "3803": {"path": "/paddlevideo/modeling/heads/roi_head.py", "hash": "4ad51b2ae316c0542fb62dac54795e6a", "title": "ROI Head for Object Detection"}, "3804": {"path": "/paddlevideo/modeling/heads/roi_head.py:1-29", "hash": "0edbd3f114dd6edccc3b3ac806890184", "title": "Bounding Box to Detection Results Converter"}, "3805": {"path": "/paddlevideo/modeling/heads/roi_head.py:30-59", "hash": "d5146cb636ece616d226c4c34496d4a7", "title": "NMS-Based Bounding Box Filtering"}, "3806": {"path": "/paddlevideo/modeling/heads/roi_head.py:60-93", "hash": "1ab9958834151b73f681ac553c73fa38", "title": "PaddlePaddle RoI Head Class"}, "3807": {"path": "/paddlevideo/modeling/heads/roi_head.py:94-114", "hash": "5b1f64b769bd9382a22ad59ce38304c5", "title": "Bbox Head Initialization and Feature Extraction"}, "3808": {"path": "/paddlevideo/modeling/heads/roi_head.py:115-134", "hash": "800707b11c5a44606274117ddf2d5070", "title": "ROI Head: Bbox Loss Calculation and Assignment"}, "3809": {"path": "/paddlevideo/modeling/heads/roi_head.py:135-158", "hash": "ecf4c9d7ce9fdcc15d55e040ed31b95d", "title": "RoI Head BBox Prediction Functions"}, "3810": {"path": "/paddlevideo/modeling/heads/roi_head.py:159-177", "hash": "26a2c768fbb45fa29ecd3ec409f9961c", "title": "Detect Bboxes Without Augmentation"}, "3811": {"path": "/paddlevideo/modeling/heads/single_straight3d.py", "hash": "9cbfb8cebb39b11227958311b2bf9975", "title": "Single Straight 3D ROI Extractor"}, "3812": {"path": "/paddlevideo/modeling/heads/single_straight3d.py:1-28", "hash": "eb478a5275e1a78926014308569b1fbf", "title": "SingleRoIExtractor3D: RoI Extractor for 3D Features"}, "3813": {"path": "/paddlevideo/modeling/heads/single_straight3d.py:29-55", "hash": "c889f95ba01d8fa56feb4258cd5cb4d7", "title": "3D Head Feature Extraction"}, "3814": {"path": "/paddlevideo/modeling/heads/single_straight3d.py:56-79", "hash": "23bfa60f0f7c998bff9525762ee45aca", "title": "Spatio-Temporal Feature Extraction and ROI Pooling"}, "3815": {"path": "/paddlevideo/modeling/heads/slowfast_head.py", "hash": "a89efdfa957537ca710abe439859fd74", "title": "SlowFast 3D Head Initialization"}, "3816": {"path": "/paddlevideo/modeling/heads/slowfast_head.py:1-30", "hash": "af7596ee359c82f614ed4b6d606033cf", "title": "SlowFast Head: PaddleVideo ResNeXt 3D Projection"}, "3817": 
{"path": "/paddlevideo/modeling/heads/slowfast_head.py:31-56", "hash": "7c72119b924e24a529cc3d67159bba80", "title": "SlowFast_Head: Concatenating Multi-Pathway Classifier"}, "3818": {"path": "/paddlevideo/modeling/heads/slowfast_head.py:57-83", "hash": "b68ae62cd818f669a9167102da6ae170", "title": "Initializing SlowFast Head Model Parameters"}, "3819": {"path": "/paddlevideo/modeling/heads/slowfast_head.py:84-113", "hash": "325e112fb20162f0ab952a2cee301ee1", "title": "SlowFast Head Model Initialization"}, "3820": {"path": "/paddlevideo/modeling/heads/slowfast_head.py:114-137", "hash": "80822321a533a4d2d921e6e50f723355", "title": "SlowFast Head: Pooling and Dropout Operations"}, "3821": {"path": "/paddlevideo/modeling/heads/stgcn_head.py", "hash": "e4df9fc01a835f963337097e37f7d39c", "title": "STGCN Head Initialization and Forward Pass"}, "3822": {"path": "/paddlevideo/modeling/heads/stgcn_head.py:1-32", "hash": "6ee61791a4679a3558c0458b1bc6569a", "title": "STGCN Head: PaddlePaddle's Video Modeling Class"}, "3823": {"path": "/paddlevideo/modeling/heads/stgcn_head.py:33-50", "hash": "b7485b7c63f9c81714097f0f9b77f879", "title": "Convolutional STGCN Head"}, "3824": {"path": "/paddlevideo/modeling/heads/timesformer_head.py", "hash": "49c9f82bda568463bb26f1a8f4b83c7e", "title": "TimeSformer Head: TimeSformer's Model Head"}, "3825": {"path": "/paddlevideo/modeling/heads/timesformer_head.py:1-29", "hash": "3da3b9642d00a4bd3a3c23d911f8b42b", "title": "TimeSformer Head Class"}, "3826": {"path": "/paddlevideo/modeling/heads/timesformer_head.py:30-60", "hash": "383b6239d6130a3fde64845574392bfd", "title": "TimeSformer Head: PaddlePaddle's Dynamic Initialization"}, "3827": {"path": "/paddlevideo/modeling/heads/timesformer_head.py:61-70", "hash": "acf77cc1674e99bf0422f8865fa5054e", "title": "Fully Connected Layer with Dropout Clarification"}, "3828": {"path": "/paddlevideo/modeling/heads/token_shift_head.py", "hash": "dd3c9a896ad1dd0e99bb38bd69affb6f", "title": "TokenShiftHead: Paddle's Classification Framework"}, "3829": {"path": "/paddlevideo/modeling/heads/token_shift_head.py:1-30", "hash": "6eff9dac6f9cbe9fa94a634e9a7dc48f", "title": "TokenShiftHead: Transformer Classification Task Head"}, "3830": {"path": "/paddlevideo/modeling/heads/token_shift_head.py:31-60", "hash": "eedb60d205d528c6d64af374b60ec073", "title": "Initializing Token Shift Head Parameters"}, "3831": {"path": "/paddlevideo/modeling/heads/token_shift_head.py:61-79", "hash": "bd02a8d3773e1eb8787a2375095e6c24", "title": "TokenShiftHead: Classification Scores for Each Frame"}, "3832": {"path": "/paddlevideo/modeling/heads/transnetv2_head.py", "hash": "0f5d585b500e1ed3bd876e9db74487f4", "title": "TransNetV2Head: Loss and F1 Score in Computer Vision"}, "3833": {"path": "/paddlevideo/modeling/heads/transnetv2_head.py:1-29", "hash": "dbbaedcf623519a8c344a70164cccee8", "title": "TransNetV2Head: CV Model Base Class"}, "3834": {"path": "/paddlevideo/modeling/heads/transnetv2_head.py:30-45", "hash": "48f28be9bf4d1e121ad81071e3084cb3", "title": "TransnetV2 Head: Loss and F1 Score Calculation"}, "3835": {"path": "/paddlevideo/modeling/heads/tsm_head.py", "hash": "3631d54cf25af541978aa6584cf4abde", "title": "TSM Head: PaddleVideo's Temporal Segment Network"}, "3836": {"path": "/paddlevideo/modeling/heads/tsm_head.py:1-33", "hash": "d9fee9bb3a31c95410e3499fbc2983e5", "title": "TSM Head Class"}, "3837": {"path": "/paddlevideo/modeling/heads/tsm_head.py:34-57", "hash": "e981e974e14750d4e0654f924db79982", "title": "TSM Head: PyTorch Class 
Initialization"}, "3838": {"path": "/paddlevideo/modeling/heads/tsm_head.py:58-89", "hash": "f0b494929e7bc13bf10614f13d36fa5d", "title": "TSM Head Initialization"}, "3839": {"path": "/paddlevideo/modeling/heads/tsm_head.py:90-99", "hash": "b0d5e36e0da4013c6f30008f90bafc3d", "title": "Temporal Segment Network Head Score Averaging"}, "3840": {"path": "/paddlevideo/modeling/heads/tsn_head.py", "hash": "ed0423612cb96dea2651accb3af11357", "title": "TSN Head: Image Classification in PaddleVideo"}, "3841": {"path": "/paddlevideo/modeling/heads/tsn_head.py:1-30", "hash": "77d4da21a18c3b4f20e275e57679b60c", "title": "TSN Head: Image Classification Model"}, "3842": {"path": "/paddlevideo/modeling/heads/tsn_head.py:31-63", "hash": "8d7e7aebb4aec836ecb9385316afb7e4", "title": "TSN Head Initialization"}, "3843": {"path": "/paddlevideo/modeling/heads/tsn_head.py:64-93", "hash": "192c53b686c9caeac3a86967bf897948", "title": "TSN Head: Average Pooling and Classification"}, "3844": {"path": "/paddlevideo/modeling/losses/__init__.py", "hash": "f61640bbd74cfe81d8569d855246f600", "title": "Comprehensive Losses for PaddleVideo"}, "3845": {"path": "/paddlevideo/modeling/losses/__init__.py:1-26", "hash": "f34b998f0c7bca3d7767589f4b63ec07", "title": "Extensive Loss Functions for PaddleVideo"}, "3846": {"path": "/paddlevideo/modeling/losses/__init__.py:27-29", "hash": "423d62feecd711ef9d9661062a07cf2a", "title": "Loss Functions for PaddleVideo"}, "3847": {"path": "/paddlevideo/modeling/losses/actbert_loss.py", "hash": "6df5e98f142dafac115756a39ac1604a", "title": "ActBert Loss Functions"}, "3848": {"path": "/paddlevideo/modeling/losses/actbert_loss.py:1-32", "hash": "b94cfb9dffdcf5cb201ae021812b38d6", "title": "ActBertLoss: Custom Loss for ActBert Model"}, "3849": {"path": "/paddlevideo/modeling/losses/actbert_loss.py:33-50", "hash": "aaf30a8d16328a89219949fdda85e186", "title": "ActBert Loss: Visual Classification with KLDivLoss"}, "3850": {"path": "/paddlevideo/modeling/losses/actbert_loss.py:51-75", "hash": "8cf69c4f9b970c6994880b7d6bc3f8b8", "title": "Multi-Loss Calculation in ActBERT Model"}, "3851": {"path": "/paddlevideo/modeling/losses/asrf_loss.py", "hash": "d33700d7ad66223e82feebc5c2cc8369", "title": "Custom Loss Functions for Video Modeling"}, "3852": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:1-32", "hash": "625cc67dd6b859ffecf3882aa15f668e", "title": "TMSE Loss: Temporal MSE for Action Segmentation"}, "3853": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:33-66", "hash": "452685c4044c89af4d0c659d1d032741", "title": "ASRF and Temporal MSE Loss Functions"}, "3854": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:67-92", "hash": "651f3d0254476926640215569401638f", "title": "Gaussian-weighted MSE Loss in Paddle"}, "3855": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:94-126", "hash": "0962cc3b8d1ba515701dbe8fb3f7724b", "title": "ASRF and Focal Loss Calculations"}, "3856": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:128-167", "hash": "dc2ed5da22c9d0bc7c967505c3c1f213", "title": "Action Segmentation Loss: Flexible Implementations"}, "3857": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:168-198", "hash": "493d84379741396286fb3ceaa091f2c5", "title": "Initialize Loss Functions and Weights"}, "3858": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:200-221", "hash": "12c7cc54485146b357c19ddcefe81ec0", "title": "ASRF Loss: CrossEntropy with Class Weights"}, "3859": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:222-248", "hash": "67fee7fc6d6b4886191f9fbe1cdc520b", 
"title": "Adjustable Sensitivity Ranking Fusion Loss"}, "3860": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:250-291", "hash": "2a86a423d54927263ebcecbce9bd613c", "title": "Boundary Regression Loss Function Combination"}, "3861": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:292-321", "hash": "489b41bbb0c022cc730e46a1730195f9", "title": "Positive Weight Calculator"}, "3862": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:322-359", "hash": "8551e37f098a314872414c247026d0f2", "title": "Multicriterion ASR Loss Function"}, "3863": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:360-373", "hash": "765c6e45b455ef565d96c60b9dfb6ef7", "title": "Initialize ActionSegmentationLoss Object"}, "3864": {"path": "/paddlevideo/modeling/losses/asrf_loss.py:374-401", "hash": "c32d531e658c441e6532d2488fca8f02", "title": "Custom Loss Function for Video Modeling Framework"}, "3865": {"path": "/paddlevideo/modeling/losses/base.py", "hash": "2db2a4be2ecb18fc8460926d6258f304", "title": "PaddlePaddle Loss Base Class"}, "3866": {"path": "/paddlevideo/modeling/losses/base.py:1-31", "hash": "adc80ba40f2a0eca6b8891b4c12e6d95", "title": "Base Loss Function in PaddlePaddle"}, "3867": {"path": "/paddlevideo/modeling/losses/base.py:32-49", "hash": "a15a273f35f2605c4ede1dae032efe62", "title": "Weighted Loss Initialization and Forward Pass"}, "3868": {"path": "/paddlevideo/modeling/losses/bmn_loss.py", "hash": "ff73d098f17f4851c450bb36764b6372", "title": "BMN Loss for PaddleVideo"}, "3869": {"path": "/paddlevideo/modeling/losses/bmn_loss.py:1-32", "hash": "aca797ce49351053042921a339d55d6b", "title": "BMN Loss Function for PaddleVideo"}, "3870": {"path": "/paddlevideo/modeling/losses/bmn_loss.py:33-55", "hash": "8059995d2425393437cd8b9ddba2df3a", "title": "Binary Mask Network Loss"}, "3871": {"path": "/paddlevideo/modeling/losses/bmn_loss.py:56-77", "hash": "bf02a920eefc3aac3333c505f134133e", "title": "Bi-directional Masked Object Detection Loss"}, "3872": {"path": "/paddlevideo/modeling/losses/bmn_loss.py:78-101", "hash": "9a9ada15455bcc24a76c697f1a134cfc", "title": "Uniform Mask Multiplication and Ratio Calculation"}, "3873": {"path": "/paddlevideo/modeling/losses/bmn_loss.py:102-126", "hash": "7b8acfcc30686af0577022b4e42ae27e", "title": "BMN Loss Calculation"}, "3874": {"path": "/paddlevideo/modeling/losses/bmn_loss.py:127-147", "hash": "c2e19224db25651cc03f528ac016d0e2", "title": "Forward Function: BMN Loss Calculation"}, "3875": {"path": "/paddlevideo/modeling/losses/bmn_loss.py:149-155", "hash": "d94dd662e2bbb79ec49661795c3045af", "title": "BMN Loss Calculation: PEM & TEAM Detection"}, "3876": {"path": "/paddlevideo/modeling/losses/cross_entropy_loss.py", "hash": "68bf4de77eff03950a8751c492f8738a", "title": "CrossEntropy Loss Function in PaddlePaddle"}, "3877": {"path": "/paddlevideo/modeling/losses/cross_entropy_loss.py:1-30", "hash": "9a1bb154ed274edb939c94662b16c67d", "title": "Custom Cross Entropy Loss in PaddlePaddle"}, "3878": {"path": "/paddlevideo/modeling/losses/cross_entropy_loss.py:31-36", "hash": "bc7f6c7f17042cac9df3d8919c2a560e", "title": "Calculate CrossEntropy Loss in Paddle"}, "3879": {"path": "/paddlevideo/modeling/losses/depth_loss.py", "hash": "94194edd127d30f4cd325266f2f96b33", "title": "Depth Loss Calculation for PaddleVideo"}, "3880": {"path": "/paddlevideo/modeling/losses/depth_loss.py:1-29", "hash": "8458f5a75cb4739dce9e614979b46cdb", "title": "Smoothness Loss Function"}, "3881": {"path": "/paddlevideo/modeling/losses/depth_loss.py:30-67", "hash": 
"4f0f3df184c134d3810091b4d168f60a", "title": "Depth Loss: DiffLoss and MSE for Disparity Estimation"}, "3882": {"path": "/paddlevideo/modeling/losses/depth_loss.py:68-104", "hash": "4f50cf05b517ef3cdcdc236ccd52cdd8", "title": "Structured Loss Functions for PaddlePaddle"}, "3883": {"path": "/paddlevideo/modeling/losses/depth_loss.py:106-137", "hash": "0f0abfbb3b9209f330868cb0c0c582b9", "title": "SSIM Loss Calculation in ADDSLoss"}, "3884": {"path": "/paddlevideo/modeling/losses/depth_loss.py:138-173", "hash": "7e19625ac425e8211893a864101befb5", "title": "Scale-Based Depth Loss Calculation"}, "3885": {"path": "/paddlevideo/modeling/losses/depth_loss.py:174-197", "hash": "f1504cc4e4cfbae5b26795d276bbbb42", "title": "Depth Loss Computation Algorithm"}, "3886": {"path": "/paddlevideo/modeling/losses/depth_loss.py:199-223", "hash": "271659957241546a16783bfb4b7d3dfb", "title": "Depth Loss Calculation"}, "3887": {"path": "/paddlevideo/modeling/losses/depth_loss.py:225-250", "hash": "98c67927da903e7836f9a2c586fd9bab", "title": "Computing Day-Night Losses for Video"}, "3888": {"path": "/paddlevideo/modeling/losses/depth_loss.py:251-276", "hash": "7bcb76e448f6c699dfadbe4f2d2b1fc1", "title": "Depth and Reconstruction Losses in Day-Night Scenes"}, "3889": {"path": "/paddlevideo/modeling/losses/depth_loss.py:278-290", "hash": "504522f06bd08d92a7d5f61b43e20f40", "title": "Depth Loss Update"}, "3890": {"path": "/paddlevideo/modeling/losses/distillation_loss.py", "hash": "3e56305826ba0d63f91b6ad3a3318a61", "title": "Distillation & KL Divergence Losses"}, "3891": {"path": "/paddlevideo/modeling/losses/distillation_loss.py:1-30", "hash": "d94481369c0a27fa95d06d4cdf5f036c", "title": "Distillation Entropy Loss Class"}, "3892": {"path": "/paddlevideo/modeling/losses/distillation_loss.py:31-60", "hash": "c1f88ee0a3f36da348be06542d6a9843", "title": "Distillation-Aware CrossEntropy Loss with Weighted Average"}, "3893": {"path": "/paddlevideo/modeling/losses/distillation_loss.py:61-79", "hash": "c0efe085d2a7e1bc02ef185efded4d6b", "title": "Kullback-Leibler Divergence Loss Class"}, "3894": {"path": "/paddlevideo/modeling/losses/transnetv2_loss.py", "hash": "8f1862da4f0453e0051af28d54862648", "title": "TransNetV2 Loss Calculation"}, "3895": {"path": "/paddlevideo/modeling/losses/transnetv2_loss.py:1-28", "hash": "ef943971c6dbde4166946012d3ba089b", "title": "TransNetV2 Loss Calculator"}, "3896": {"path": "/paddlevideo/modeling/losses/transnetv2_loss.py:30-54", "hash": "bda5798196c7c724c78c5f999a779c3e", "title": "TransNetV2 Loss Function"}, "3897": {"path": "/paddlevideo/modeling/losses/transnetv2_loss.py:56-56", "hash": "c06362f66c8157a0a6864689f7d9211c", "title": "TransNetV2 Total Loss Calculation"}, "3898": {"path": "/paddlevideo/modeling/losses/yowo_loss.py", "hash": "1c9fd70d7a5631d5a61c305a09dc8666", "title": "YOLO Loss Functions in PaddleVideo"}, "3899": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:1-31", "hash": "03fcb46914151ac11319969d3cdd010b", "title": "Focal Loss: Focusing on Hard Examples"}, "3900": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:33-55", "hash": "17375f075fdf6bd88abf6425a262801c", "title": "Focal Loss with Alpha, Gamma, and Size Average"}, "3901": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:56-87", "hash": "ccbb72dc9c456c4c0fe8e23717b588cc", "title": "Yowo Loss Function: GPU Optimized and Customizable"}, "3902": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:88-112", "hash": "86ec0f111e466048dddb8bd90c9962f1", "title": "Region Loss with Focal Loss and Threshold"}, 
"3903": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:113-137", "hash": "550636ebffdd5caad40180e620d6b85c", "title": "Sigmoid Transformation for YOLO Anchors"}, "3904": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:138-155", "hash": "f7cc9cbf5b8d076b2e314e05659d9397", "title": "YOLOv5 Loss Assignment"}, "3905": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:156-169", "hash": "9712965b8836d42507b345034a278d1e", "title": "Prepare Data for Object Detection Training"}, "3906": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:170-181", "hash": "a141d80ccbbe92f2f94c73fb77b3a4d7", "title": "Anchor Width-Height Assignments for YOWO Loss"}, "3907": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:183-199", "hash": "fec5ddbe85fcebcdbd5f0f1299d0abcf", "title": "YOLOv3 Loss Calculation in PaddleVideo"}, "3908": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:199-210", "hash": "2b11946efd7abb17bf25db8aa4a8f5fe", "title": "YOLO Loss Setup"}, "3909": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:211-237", "hash": "34a67a50ec21f4a1562d876e48d78914", "title": "GPU Variables Loss Calculation"}, "3910": {"path": "/paddlevideo/modeling/losses/yowo_loss.py:238-249", "hash": "c928b5e938a77a74db6570d31f7ae10c", "title": "YOWO Loss: Coordinate and Classification"}, "3911": {"path": "/paddlevideo/modeling/registry.py", "hash": "a4bdd3d8401ddf8b5a60c92bc22492da", "title": "Efficient Model Registry Organization"}, "3912": {"path": "/paddlevideo/modeling/registry.py:1-27", "hash": "ae9386e7a49df56a03ee08c5bf71e9bc", "title": "Model Registry Organization"}, "3913": {"path": "/paddlevideo/modeling/registry.py:28-31", "hash": "61db931447ef50230d42bd44c6fdda68", "title": "Model Registries for Paddle Video"}, "3914": {"path": "/paddlevideo/modeling/samplers/__init__.py", "hash": "f71e47fc8c9e9869c372a360431983bd", "title": "Importing RandomSampler Class and Licensing Information"}, "3915": {"path": "/paddlevideo/modeling/samplers/random_sampler.py", "hash": "ba79d15c6f2bb6948ebc136aacede585", "title": "Random Sampler for Bbox Sampling"}, "3916": {"path": "/paddlevideo/modeling/samplers/random_sampler.py:1-28", "hash": "e23700cd98671582d4a4b1e7340982e6", "title": "Random Sampling Class Definition"}, "3917": {"path": "/paddlevideo/modeling/samplers/random_sampler.py:29-55", "hash": "210c9338346983c783dc54a872d0c58f", "title": "Initializing Sampler Bounding Boxes"}, "3918": {"path": "/paddlevideo/modeling/samplers/random_sampler.py:56-92", "hash": "549fe38951bec250d526f8e8587ee5e2", "title": "RandomSampler: Randomly Sampling Bounding Boxes"}, "3919": {"path": "/paddlevideo/modeling/samplers/random_sampler.py:93-114", "hash": "fa2262bec46b4f314e90269ac4859bd6", "title": "Random Sampler for Imbalanced Classes"}, "3920": {"path": "/paddlevideo/modeling/samplers/random_sampler.py:115-139", "hash": "5f03a4fcbecbc29ea1b0b0007bf9afb9", "title": "Random Sampler: Positive and Negative Sample Selection"}, "3921": {"path": "/paddlevideo/modeling/samplers/random_sampler.py:140-146", "hash": "be5ab46709d9accf65fe071d25493f3b", "title": "Zero-Check Random Sampler"}, "3922": {"path": "/paddlevideo/modeling/weight_init.py", "hash": "228286f83524ae8e3e67808f5d744faf", "title": "Weight Initialization in PaddlePaddle"}, "3923": {"path": "/paddlevideo/modeling/weight_init.py:1-36", "hash": "12230ca790b64fcbfc5bc9149d76f85d", "title": "Weight Initialization for PaddlePaddle Layers"}, "3924": {"path": "/paddlevideo/modeling/weight_init.py:37-66", "hash": "6f4f37410b80207f3aa125c7d5f62bd9", "title": "Truncated 
Normal Weight Initialization"}, "3925": {"path": "/paddlevideo/modeling/weight_init.py:68-98", "hash": "8aaf662e76ad966b356f712de2bac498", "title": "Truncated Gaussian Tensor Weight Init"}, "3926": {"path": "/paddlevideo/modeling/weight_init.py:99-130", "hash": "6e8cb481f11c114f48bede00df617ac8", "title": "Convolutional Layer Weight Initialization"}, "3927": {"path": "/paddlevideo/modeling/weight_init.py:131-156", "hash": "ce1ebba0dc8551c8339cff6212cc95a9", "title": "Neural Network Weight Initialization"}, "3928": {"path": "/paddlevideo/modeling/weight_init.py:157-157", "hash": "a673ec052d80914639ecba2760975d77", "title": "Initialize Tensor with Values"}, "3929": {"path": "/paddlevideo/solver/__init__.py", "hash": "2d1b6e1c8e3481d93693e2f673e1fc8c", "title": "Solver Package Imports"}, "3930": {"path": "/paddlevideo/solver/custom_lr.py", "hash": "4f640bf9207ef966562c2f3d0c4bf430", "title": "Custom Learning Rate Schedulers for PaddleVideo"}, "3931": {"path": "/paddlevideo/solver/custom_lr.py:1-31", "hash": "686b84282932ffb07f680f8219c5dad6", "title": "Custom Warmup-Cosine Decay LR Scheduler"}, "3932": {"path": "/paddlevideo/solver/custom_lr.py:32-54", "hash": "cf46af48f50f2a31f782a40e4c3d502b", "title": "Cosine Annealing Learning Rate Scheduler"}, "3933": {"path": "/paddlevideo/solver/custom_lr.py:55-80", "hash": "086b6b40ce1ea25cfb77ecfac5db98e6", "title": "Custom Learning Rate Scheduler for PaddleVideo"}, "3934": {"path": "/paddlevideo/solver/custom_lr.py:81-106", "hash": "dcca060978c0d8a1f4998a7d6a31a719", "title": "Custom Learning Rate Scheduler with Warmup and Decay"}, "3935": {"path": "/paddlevideo/solver/custom_lr.py:107-133", "hash": "21325b18356de39934720c7896193fe5", "title": "Customizable Piecewise Decay Learning Rate Scheduler"}, "3936": {"path": "/paddlevideo/solver/custom_lr.py:134-158", "hash": "0ed1cf81201096fd592cb6eaccec3825", "title": "Custom Learning Rate Scheduler"}, "3937": {"path": "/paddlevideo/solver/custom_lr.py:159-188", "hash": "a6552ca371a951367850a4475d47ecca", "title": "Warmup Custom Learning Rate Policy"}, "3938": {"path": "/paddlevideo/solver/custom_lr.py:189-222", "hash": "7fa78593dbf67ad5822ae3f1e10dee7d", "title": "Customizable Warmup Cosine Decay Learning Rate Scheduler"}, "3939": {"path": "/paddlevideo/solver/custom_lr.py:223-249", "hash": "4245b2eacad56153e37a41b4ff287824", "title": "Custom Learning Rate Scheduler"}, "3940": {"path": "/paddlevideo/solver/custom_lr.py:251-282", "hash": "e309e4ad28f0560e04d2604a120b14e5", "title": "Custom Learning Rate Scheduler"}, "3941": {"path": "/paddlevideo/solver/custom_lr.py:283-305", "hash": "2c621c295ff96bd575678a3832cd884e", "title": "Custom Learning Rate Scheduler"}, "3942": {"path": "/paddlevideo/solver/custom_lr.py:306-332", "hash": "b5e104c85c5212105dc4f6599cbff87e", "title": "Custom Warmup Adjust Decay Scheduler"}, "3943": {"path": "/paddlevideo/solver/custom_lr.py:333-338", "hash": "dbfb50baffc9658e1ab898032d8eef23", "title": "Custom Warmup Learning Rate"}, "3944": {"path": "/paddlevideo/solver/lr.py", "hash": "4241bb08f9afbd56f380f0724d6d23fa", "title": "Learning Rate Scheduler Builder"}, "3945": {"path": "/paddlevideo/solver/lr.py:1-28", "hash": "9c0d92047f388c7204eed6460ce207e6", "title": "Learning Rate Scheduler Builder"}, "3946": {"path": "/paddlevideo/solver/lr.py:30-52", "hash": "4643f4c5d126b65f9a72f6742b3503cb", "title": "Custom Learning Rate Scheduler"}, "3947": {"path": "/paddlevideo/solver/optimizer.py", "hash": "762f7825b6d2acb832cb3bd56e5c2e3e", "title": "Python Optimizer Configurations"}, 
"3948": {"path": "/paddlevideo/solver/optimizer.py:1-31", "hash": "881b23097c1591c6ac710f277337ebb4", "title": "Building PaddleVideo's Optimizer"}, "3949": {"path": "/paddlevideo/solver/optimizer.py:32-63", "hash": "d8fa03bd06e71bca8e47928680b93975", "title": "Optimizer Configuration and Learning Rate Scheduler"}, "3950": {"path": "/paddlevideo/solver/optimizer.py:64-85", "hash": "3c59d078a1b3ef862c183535f0772063", "title": "AMP-Aware Optimizer Function"}, "3951": {"path": "/paddlevideo/solver/optimizer.py:86-109", "hash": "409522bce5a252b5db3628c80c292506", "title": "L1-L2 Weight Decay Optimizer Config"}, "3952": {"path": "/paddlevideo/solver/optimizer.py:110-133", "hash": "58a1390ccb4c6ab2bed48b05c2483ddf", "title": "Multi-Precision Learning Rate Scheduler"}, "3953": {"path": "/paddlevideo/solver/optimizer.py:134-136", "hash": "1812c20346fc34e19f0ac1fd0fe7e4f3", "title": "Optimizer Factory Function"}, "3954": {"path": "/paddlevideo/tasks/__init__.py", "hash": "329e7e606d1f77433b28213d93548337", "title": "PaddleVideo Tasks Initialization"}, "3955": {"path": "/paddlevideo/tasks/test.py", "hash": "857bf5660755efbab95696e2fba42300", "title": "Parallel Testing with PaddlePaddle"}, "3956": {"path": "/paddlevideo/tasks/test.py:1-32", "hash": "8395e2179a873b01a27e7e025025b314", "title": "Parallel PaddlePaddle Model Testing"}, "3957": {"path": "/paddlevideo/tasks/test.py:34-61", "hash": "d753f7a43c27abec9211d10d97ba5a7f", "title": "Model Initialization and Configuration"}, "3958": {"path": "/paddlevideo/tasks/test.py:62-90", "hash": "3df5791ddb8019f575605e9e7acedfb0", "title": "Model Evaluation Loop"}, "3959": {"path": "/paddlevideo/tasks/train.py", "hash": "9d53805884dfb657b53da1a3dc3fcde9", "title": "Distributed Training with PaddlePaddle Fleet API"}, "3960": {"path": "/paddlevideo/tasks/train.py:1-27", "hash": "cf36ab1402539e12740170fced0f0c5f", "title": "Video Task Training Framework"}, "3961": {"path": "/paddlevideo/tasks/train.py:28-51", "hash": "5d1e2a40e2a50d6be8a271b3f52dbc0d", "title": "Training Model with PaddleVideo"}, "3962": {"path": "/paddlevideo/tasks/train.py:52-75", "hash": "96683855e51a4a52d4fd8620a9f1c2ba", "title": "Gradient Accumulation for Distributed PaddlePaddle Training"}, "3963": {"path": "/paddlevideo/tasks/train.py:76-96", "hash": "5555b80473927042bd3b5b94a7828244", "title": "Global Batch Size Configuration"}, "3964": {"path": "/paddlevideo/tasks/train.py:97-124", "hash": "661ffc460bcc3a156811709a7ee25451", "title": "Static Model Conversion for Training and Validation"}, "3965": {"path": "/paddlevideo/tasks/train.py:125-150", "hash": "cb1495d28ba861f7dc15e6dc48d9746c", "title": "Training PaddleVideo with Datasets and Optimizers"}, "3966": {"path": "/paddlevideo/tasks/train.py:151-172", "hash": "f548bf11474f9f74973ed8ac87201ce3", "title": "Training Mode Checker and Handler"}, "3967": {"path": "/paddlevideo/tasks/train.py:173-204", "hash": "7b9d6dd792ec3f462dfbf9648cd9168b", "title": "Efficient Model Training with Paddle's DataParallel"}, "3968": {"path": "/paddlevideo/tasks/train.py:206-229", "hash": "672602f36e2a855189f129c8fc6caa08", "title": "Efficient AMP Training and Gradient Scaling"}, "3969": {"path": "/paddlevideo/tasks/train.py:230-253", "hash": "935d943f1a04ec0f9c75f132d8cd80b5", "title": "Gradient Descent and Backward Pass in Train.py"}, "3970": {"path": "/paddlevideo/tasks/train.py:254-277", "hash": "706376b3cb35db1c3c8b26a76a856ff3", "title": "Gradient Clearing & Optimizer Progress"}, "3971": {"path": "/paddlevideo/tasks/train.py:278-306", "hash": 
"b4aaba6ed977b63074dce84599c0f3b3", "title": "PaddleVideo Model Training and Evaluation"}, "3972": {"path": "/paddlevideo/tasks/train.py:307-330", "hash": "1f646f0c85d53e317a088ffaac47fafd", "title": "Training Model in PaddleVideo"}, "3973": {"path": "/paddlevideo/tasks/train.py:331-351", "hash": "cb9ad2409c6a6199eb16f9157795f184", "title": "Evaluate Dataset and Log Performance Metrics"}, "3974": {"path": "/paddlevideo/tasks/train.py:352-373", "hash": "10c6921f2c61776ee5e4f8fe40425add", "title": "Parallel Update: PreciseBN Accuracy Check"}, "3975": {"path": "/paddlevideo/tasks/train.py:374-395", "hash": "3820f2415da224fd9e6f9d362008bead", "title": "Precise Batch Normalization and Validation in Deep Learning"}, "3976": {"path": "/paddlevideo/tasks/train.py:396-417", "hash": "d826ca20169188b0009dae01d744b924", "title": "Saving Best Model and Metric Logging"}, "3977": {"path": "/paddlevideo/tasks/train.py:418-426", "hash": "45a6daf9bbe1413b8c89b1b1794244de", "title": "Periodic Model Saving"}, "3978": {"path": "/paddlevideo/tasks/train_dali.py", "hash": "23c7f192dacdeecd7765410ab90606cb", "title": "Train DALI with PaddleVideo"}, "3979": {"path": "/paddlevideo/tasks/train_dali.py:1-25", "hash": "20dd3826eb3e717c3d057a0f701ed1ae", "title": "PaddleVideo: TSN-Dali Dataset Loading and Preparation"}, "3980": {"path": "/paddlevideo/tasks/train_dali.py:26-63", "hash": "531bf4ed5bf29af9a79963fcdc1c1eb4", "title": "DALI Initialization and Training for TSN Model"}, "3981": {"path": "/paddlevideo/tasks/train_dali.py:64-88", "hash": "f4b78a3093149ac1d60b2a82dc72c754", "title": "Model Training Pipeline with Resume and Finetuning"}, "3982": {"path": "/paddlevideo/tasks/train_dali.py:89-116", "hash": "b1b9983cde6953cb302060102d9eca11", "title": "Training Model with Backpropagation"}, "3983": {"path": "/paddlevideo/tasks/train_dali.py:117-141", "hash": "67a29e5adecc73155d1d86f1775fb82d", "title": "Train DALI: Batch Normalization and Saving Progress"}, "3984": {"path": "/paddlevideo/tasks/train_dali.py:143-143", "hash": "3062cff0adc1046da7e3572938717bc8", "title": "Model Training Completion Logged"}, "3985": {"path": "/paddlevideo/tasks/train_multigrid.py", "hash": "3eaa4757a1b4e5682f42caa554028f0d", "title": "Training Multigrid Models in PaddleVideo"}, "3986": {"path": "/paddlevideo/tasks/train_multigrid.py:1-27", "hash": "0b37f54321906c2757b36b5d4cfa8b24", "title": "Setting Up PaddleVideo Environment"}, "3987": {"path": "/paddlevideo/tasks/train_multigrid.py:28-50", "hash": "23d9e6310435da93cfbc2f9d3c50541b", "title": "Multigrid Data Loader Construction"}, "3988": {"path": "/paddlevideo/tasks/train_multigrid.py:51-77", "hash": "b64afbffac8cf4e9680074e1b9665b98", "title": "Adjust Batch Size for Multigrid Training"}, "3989": {"path": "/paddlevideo/tasks/train_multigrid.py:78-110", "hash": "86653a60abc315ea318fa6aaf0b83086", "title": "Training PaddleVideo Model with DataLoaders and Parallelization"}, "3990": {"path": "/paddlevideo/tasks/train_multigrid.py:111-146", "hash": "8ab554c556bcc2702489442759bb1687", "title": "Multigrid Training Initialization"}, "3991": {"path": "/paddlevideo/tasks/train_multigrid.py:148-179", "hash": "685a9e041d389fde78008ee5272d1e72", "title": "Multigrid Model Training Setup"}, "3992": {"path": "/paddlevideo/tasks/train_multigrid.py:181-210", "hash": "b7f6481faf20e37400c72124b2c602b4", "title": "Multi-grid Training Optimizer Construction"}, "3993": {"path": "/paddlevideo/tasks/train_multigrid.py:211-235", "hash": "e5a80b2fbb9206064dd11ec470677650", "title": "Training Multigrid 
Models"}, "3994": {"path": "/paddlevideo/tasks/train_multigrid.py:236-262", "hash": "c5dc615569a2c7d8eda328c0c9044579", "title": "Adaptive Learning Rate Optimization"}, "3995": {"path": "/paddlevideo/tasks/train_multigrid.py:264-288", "hash": "8c44c2823b837cb420e79793328b2590", "title": "Batch-wise Evaluation and Logging"}, "3996": {"path": "/paddlevideo/tasks/train_multigrid.py:290-313", "hash": "fd268207e0cdde5bf175f00a8bbc4032", "title": "Batch Normalization & Performance Logging"}, "3997": {"path": "/paddlevideo/tasks/train_multigrid.py:314-335", "hash": "88f18dd9a636cbbd853a58d71609e837", "title": "Automatic Model Saving and Evaluation in PaddleVideo"}, "3998": {"path": "/paddlevideo/utils/__init__.py", "hash": "11a097247079a44b22b703aec2640924", "title": "PaddleVideo Utils: Imports, Build, Save & Load"}, "3999": {"path": "/paddlevideo/utils/build_utils.py", "hash": "f83a439f771717559e12a420144707b5", "title": "Build Utility Function"}, "4000": {"path": "/paddlevideo/utils/config.py", "hash": "506a810492d0ed734b0ae4f298f185c3", "title": "Config Management Utilities"}, "4001": {"path": "/paddlevideo/utils/config.py:1-34", "hash": "081a6b736cce947e6d1429e43321992e", "title": "Config Handling and Setup"}, "4002": {"path": "/paddlevideo/utils/config.py:35-67", "hash": "4022140bd21023dd723cb7f4de2de755", "title": "Config Parsing and Dict Visualization Functions"}, "4003": {"path": "/paddlevideo/utils/config.py:68-109", "hash": "bb6898eb4f3d327070968a9a0257cee8", "title": "Config Manipulation Functions"}, "4004": {"path": "/paddlevideo/utils/config.py:110-139", "hash": "f9f0c5f4a99dab4875f6f31790034f1f", "title": "Recursive Config Override Function"}, "4005": {"path": "/paddlevideo/utils/config.py:140-170", "hash": "c2ce69eac999f798ff005e7fae7959b6", "title": "Config Utilities: Load, Update and Display"}, "4006": {"path": "/paddlevideo/utils/config.py:171-174", "hash": "90381d7e083d83d183502c07e13e1429", "title": "Verify and Print Config"}, "4007": {"path": "/paddlevideo/utils/dist_utils.py", "hash": "e0dab607c4f876623c7a502c5b5bdf99", "title": "Distributed Computing Utilities"}, "4008": {"path": "/paddlevideo/utils/logger.py", "hash": "60ca93f2ed075ed2c8be2ee2f0a09be9", "title": "Colorful Logging for PaddleVideo"}, "4009": {"path": "/paddlevideo/utils/multigrid/__init__.py", "hash": "6bd1017c23601b2bedd5aa0ed1d2fb37", "title": "Multigrid Scheduler Imports"}, "4010": {"path": "/paddlevideo/utils/multigrid/batchnorm_helper.py", "hash": "c4b921d84e56deff8a46d4e475c22d8b", "title": "Batch Normalization for PyTorch Multigrid"}, "4011": {"path": "/paddlevideo/utils/multigrid/batchnorm_helper.py:1-36", "hash": "8a0c7742dc20618ed0e2db695dc283aa", "title": "Sub-BatchNorm Helper"}, "4012": {"path": "/paddlevideo/utils/multigrid/batchnorm_helper.py:37-64", "hash": "ff0578366203b7e9d585bf407c1773f4", "title": "Multi-Split Batch Normalization"}, "4013": {"path": "/paddlevideo/utils/multigrid/batchnorm_helper.py:65-85", "hash": "0970c30911caeba522749ef1f527fe9b", "title": "BatchNorm Layer Initialization"}, "4014": {"path": "/paddlevideo/utils/multigrid/batchnorm_helper.py:86-108", "hash": "efcb91d2540d04c52487d83e6964851c", "title": "BatchNorm3D Instantiation and Aggregation"}, "4015": {"path": "/paddlevideo/utils/multigrid/batchnorm_helper.py:109-135", "hash": "e46826068279ddb36edf33c32fcdaa7d", "title": "Batch Normalization Helper Class"}, "4016": {"path": "/paddlevideo/utils/multigrid/batchnorm_helper.py:136-142", "hash": "79a7bacc1e74d7ac165c1dfb91d6e533", "title": "BatchNorm Multiplication and 
Normalization"}, "4017": {"path": "/paddlevideo/utils/multigrid/interval_helper.py", "hash": "f7417c6c47a0b94e17eb6437ca8e4d83", "title": "Multigrid Evaluation Function"}, "4018": {"path": "/paddlevideo/utils/multigrid/multigrid.py", "hash": "774a1b519d4323554603eccb7ecb045b", "title": "Multigrid Schedule Management"}, "4019": {"path": "/paddlevideo/utils/multigrid/multigrid.py:1-25", "hash": "debf935fcb21ae49a1c64c14b36e07ca", "title": "Multigrid Scheduling Class Definition"}, "4020": {"path": "/paddlevideo/utils/multigrid/multigrid.py:26-50", "hash": "521c09731da649f24a468f66a68b6dc5", "title": "Multi-Grid Training Schedule Initialization"}, "4021": {"path": "/paddlevideo/utils/multigrid/multigrid.py:51-74", "hash": "87c22fc3b93aad620ba07c727e975853", "title": "Long Cycle Shape Update Function"}, "4022": {"path": "/paddlevideo/utils/multigrid/multigrid.py:75-94", "hash": "64840b86141cc3eaf324f2dbcda12f07", "title": "Multigrid Configuration and Update Settings"}, "4023": {"path": "/paddlevideo/utils/multigrid/multigrid.py:95-115", "hash": "7fb342ac0529e51e1ac0e150b6160509", "title": "Multigrid Configuration Checker"}, "4024": {"path": "/paddlevideo/utils/multigrid/multigrid.py:116-141", "hash": "7bfb45395989787469d83cb87a2a2a74", "title": "Multi-Grid Training Schedule Calculator"}, "4025": {"path": "/paddlevideo/utils/multigrid/multigrid.py:142-169", "hash": "6ef53bdcbbf5e8d0d1cba98d06653cf9", "title": "Multigrid Training Schedule in PaddleVideo"}, "4026": {"path": "/paddlevideo/utils/multigrid/multigrid.py:171-191", "hash": "480d50b79aa057b43433012888274fde", "title": "Multigrid Iteration Calculator"}, "4027": {"path": "/paddlevideo/utils/multigrid/multigrid.py:193-224", "hash": "32d792c440fbc55e35fce33ed2ed64cb", "title": "Multigrid Learning Rate Scheduler"}, "4028": {"path": "/paddlevideo/utils/multigrid/multigrid.py:225-233", "hash": "36fb1d3065fb0a7acd2d497aa5b3ba3a", "title": "Schedule-Based Shape Iterator"}, "4029": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py", "hash": "7e7911e27a34cd2515dc83e6d14163da", "title": "Ensuring State Dict Consistency in PaddleVideo"}, "4030": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:1-31", "hash": "e067ccbd2ddfaae72ad03b237ed47f44", "title": "Converting Sub-BN to Normal BN Parameters"}, "4031": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:32-58", "hash": "a2ebecc290c42d76873d77343ee50c96", "title": "Sub-BN Conversion for Checkpoint Loading"}, "4032": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:59-81", "hash": "cd83f56e0160e37af6770184dbdd6a30", "title": "Shape Comparison and Concatenation"}, "4033": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:82-103", "hash": "1d2276455240da6525bc4842e30f8bd0", "title": "Modify Optimizer State Dict Keys"}, "4034": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:104-135", "hash": "598034f34bec4e5594af2b05a4292fd4", "title": "Compare Optimizer and Model Parameters"}, "4035": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:136-163", "hash": "fcdc1ac8bd290c69660be432034c4eab", "title": "Update BN/Sub-BN Key Names"}, "4036": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:164-190", "hash": "72d25c1c3217ef4db50fce9c19164a62", "title": "Save and Load Helper Functions"}, "4037": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:191-216", "hash": "aadce11faa449cb23f46be4a31b8915e", "title": "Checkpoint Loader and Shape Comparison"}, "4038": {"path": "/paddlevideo/utils/multigrid/save_load_helper.py:217-237", 
"hash": "055c11a050505d1fc71d5b1fdc24ac85", "title": "Loading Weights and Optimizer State: SaveLoadHelper"}, "4039": {"path": "/paddlevideo/utils/multigrid/short_sampler.py", "hash": "55d4472ff6cf99d6e3d2985d936bf83b", "title": "Efficient Distributed Video Data Loading"}, "4040": {"path": "/paddlevideo/utils/multigrid/short_sampler.py:1-28", "hash": "0e8aed59128b16fa21e162d1b509c38b", "title": "Distributed ShortSampler for Dynamic Batch Sizing"}, "4041": {"path": "/paddlevideo/utils/multigrid/short_sampler.py:29-51", "hash": "733fb814c88e238cc39c75364c85b7fa", "title": "MultiGrid Initializer"}, "4042": {"path": "/paddlevideo/utils/multigrid/short_sampler.py:52-79", "hash": "5747af55730f75085350f3e223415544", "title": "Multigrid Sampler Initialization"}, "4043": {"path": "/paddlevideo/utils/multigrid/short_sampler.py:80-102", "hash": "0088c62515ecb39b1c1f7fb71727c47d", "title": "Balanced Subsampling with Modulo Handling"}, "4044": {"path": "/paddlevideo/utils/multigrid/short_sampler.py:103-130", "hash": "5d3e69e7dbdee27649557a2cb85b8b99", "title": "Dynamic Batch Sampler"}, "4045": {"path": "/paddlevideo/utils/multigrid/short_sampler.py:131-146", "hash": "412580f6e641b14d3dfcbff1ef864cfc", "title": "Efficient Video Sampler for PaddleVideo"}, "4046": {"path": "/paddlevideo/utils/precise_bn.py", "hash": "b862a992ff15e89aeec0112e55991b99", "title": "Precise Batch Normalization Acceleration"}, "4047": {"path": "/paddlevideo/utils/precise_bn.py:1-34", "hash": "1947e5c3b17259bac45fd6f86fa8b2b6", "title": "Precise Batch Normalization: Accuracy and Efficiency Boost"}, "4048": {"path": "/paddlevideo/utils/precise_bn.py:35-56", "hash": "efc2d580a4c68756a712b6fcdd9e7c6f", "title": "Precise BN Stats Recomputation"}, "4049": {"path": "/paddlevideo/utils/precise_bn.py:58-83", "hash": "789ee22f0724f880e1f884e702b99f31", "title": "Accurate Batch Normalization Update"}, "4050": {"path": "/paddlevideo/utils/precise_bn.py:84-94", "hash": "c6df7d9431553361fbf1bcb560397fb9", "title": "Accurate Batch Normalization Update"}, "4051": {"path": "/paddlevideo/utils/profiler.py", "hash": "54486a0418777b38ee758c526958dbd2", "title": "PaddleVideo Profiler: Performance Analysis and Optimization"}, "4052": {"path": "/paddlevideo/utils/profiler.py:1-29", "hash": "c584cb9c2164cfad6b7de74ec3f76280", "title": "PaddleVideo Profiler Module Init"}, "4053": {"path": "/paddlevideo/utils/profiler.py:30-53", "hash": "a86b0b1d419869daa79c27adaeddd117", "title": "Profiler Options Class"}, "4054": {"path": "/paddlevideo/utils/profiler.py:54-77", "hash": "4c6d9e1fa7d9e5893867dafa8282b298", "title": "Python Profiler: Option Parser and Batch Range"}, "4055": {"path": "/paddlevideo/utils/profiler.py:79-105", "hash": "7004dc7ea2ae9e15d4bcc9803ff673a3", "title": "Operator-Level Timing Profiler with PaddlePaddle"}, "4056": {"path": "/paddlevideo/utils/profiler.py:106-128", "hash": "4849e235de5a04ea59ff802482b05480", "title": "Profiler Object Initialization"}, "4057": {"path": "/paddlevideo/utils/record.py", "hash": "89d9c54c6141558972e55576091c0ff9", "title": "Efficient Training Metrics Recording"}, "4058": {"path": "/paddlevideo/utils/record.py:1-32", "hash": "561a1659f637bea7da2bacb51ba1ca1f", "title": "Record Builder and Logger Setup"}, "4059": {"path": "/paddlevideo/utils/record.py:33-48", "hash": "c7b973de2e925fb4393ad07a345e3626", "title": "Averaging Metrics in PaddleVideo Record"}, "4060": {"path": "/paddlevideo/utils/record.py:49-65", "hash": "f6b20ae30ae072ad5a43cafca3dbe014", "title": "Conditional Metric Addition"}, "4061": {"path": 
"/paddlevideo/utils/record.py:67-105", "hash": "7c55fc16a894d704cbb9139b23073315", "title": "Record Dictionary with AverageMeter Objects"}, "4062": {"path": "/paddlevideo/utils/record.py:106-136", "hash": "a81494e7362391ed4dd7ac1fee63ec43", "title": "Batch Logging and Metrics Calculation"}, "4063": {"path": "/paddlevideo/utils/record.py:137-155", "hash": "c70c6ef3e023b070ad47583dc796cd7c", "title": "Training Progress Logger"}, "4064": {"path": "/paddlevideo/utils/record.py:157-168", "hash": "cfa8224ed6475d43cf1e3e82d0f24474", "title": "Mean Metric String Calculation and Formatting"}, "4065": {"path": "/paddlevideo/utils/registry.py", "hash": "37f5a552289064dfc46330490dcf80ac", "title": "Registry Class for Object Mapping and Registration"}, "4066": {"path": "/paddlevideo/utils/save_load.py", "hash": "59c047b941ff6607e7522e136183b37e", "title": "Model Save and Load in PaddlePaddle"}, "4067": {"path": "/paddlevideo/utils/save_load.py:1-30", "hash": "37116fb1a09f01012e93a998394c83ee", "title": "Swin Model Transfer in PaddleVideo"}, "4068": {"path": "/paddlevideo/utils/save_load.py:31-61", "hash": "140d4956535b0425335a7053afb06eb6", "title": "Ensuring Model State Consistency"}, "4069": {"path": "/paddlevideo/utils/save_load.py:62-82", "hash": "23169d296a5b59e716276d071dd6591c", "title": "Loading Weights for Position Bias"}, "4070": {"path": "/paddlevideo/utils/save_load.py:83-105", "hash": "49e51abc0e01236c07df66e993c17d7b", "title": "Model Parameter Transformation for ViT Models"}, "4071": {"path": "/paddlevideo/utils/save_load.py:106-126", "hash": "2444073049a4a318e541f3482fa8f616", "title": "Adjusting Positional Embeddings for Patch Count"}, "4072": {"path": "/paddlevideo/utils/save_load.py:127-147", "hash": "cba65dccf2e0d87f62d7eee066d7f24a", "title": "Loading Weights: Model Shape Check and Progress Bar"}, "4073": {"path": "/paddlevideo/utils/save_load.py:148-172", "hash": "164181959af7aaf52dae2ebe28dbb62b", "title": "ResNet18 Weight Adaptation"}, "4074": {"path": "/paddlevideo/utils/save_load.py:173-197", "hash": "b1dc759c94f720a4f11cb1432a47b42a", "title": "Dynamic Weights Loading with Progress Updates"}, "4075": {"path": "/paddlevideo/utils/save_load.py:198-226", "hash": "edadf6c942c8c9723397e306bd556dee", "title": "Load Pre-trained Model Parameters"}, "4076": {"path": "/paddlevideo/utils/save_load.py:227-248", "hash": "f2733dfcb841f22a097ab13eacd2058e", "title": "Model Weights and Dictionary Loading"}, "4077": {"path": "/paddlevideo/utils/save_load.py:249-282", "hash": "e227ae94f75902b36862117cbd8c55c5", "title": "Save and Load Utilities"}, "4078": {"path": "/paddlevideo/utils/save_load.py:283-289", "hash": "75b9a3073324075c8e36ed75a41dd897", "title": "Save and Load Functions with Paddle"}, "4079": {"path": "/paddlevideo/version.py", "hash": "6ca7d799557f98e977461c1e4bfd1c3e", "title": "PaddleVideo Version Info"}, "4080": {"path": "/run.sh", "hash": "3d75e2aa31ca5441cc8ca54480a39215", "title": "PaddlePaddle: Train, Test, Export, Infer"}, "4081": {"path": "/run.sh:1-18", "hash": "25aef41f81dff63c299afd070f61dd8a", "title": "Distributed CUDA Training with 8 GPUs"}, "4082": {"path": "/run.sh:20-36", "hash": "e03ccab9a96164ec05eeb7eca65ffc2a", "title": "PaddlePaddle Video Recognition Training Script"}, "4083": {"path": "/run.sh:38-54", "hash": "d2da813149fd3a81ff9fe8f121449839", "title": "Distributed Deep Learning Training with PaddlePaddle"}, "4084": {"path": "/run.sh:54-74", "hash": "b5661db9a9f4810ee9679abf9a3dc57f", "title": "Distributed Deep Learning Training and Testing Script"}, 
"4085": {"path": "/run.sh:76-89", "hash": "0781e4d439704f76f0336f00ea3f8fad", "title": "PaddleVideo Test and Inference Guide"}, "4086": {"path": "/setup.py", "hash": "b27f50e69bc7496f9d22c965c7222578", "title": "PaddleVideo: Python Video Understanding Utility"}, "4087": {"path": "/setup.py:1-31", "hash": "69120aff15be69361914728a544acbb9", "title": "PaddleVideo Setup with Setuptools"}, "4088": {"path": "/setup.py:32-53", "hash": "6de2461bb05c009c2f03ba0d94284125", "title": "Setting Up PaddleVideo Package"}, "4089": {"path": "/setup.py:54-56", "hash": "8c7f15bf6206bfacc6f55a5a61fa892e", "title": "Python 3.7 Setup Metadata Classification"}, "4090": {"path": "/test_tipc/README.md", "hash": "b26f3a882d5ef5935d16edb3cc093f1b", "title": "TIPC-Enabled PaddleVideo Tutorial"}, "4091": {"path": "/test_tipc/README.md:2-30", "hash": "481458e0926b33cf1f298c7ecd06ce52", "title": "PaddleVideo TIPC Overview"}, "4092": {"path": "/test_tipc/README.md:31-55", "hash": "7ba83b1b7f5c5e2fbd89fee11f8e41c1", "title": "Test Tool for PaddleVideo: Supported Models and Configurations"}, "4093": {"path": "/test_tipc/README.md:56-76", "hash": "b579f10f7e7748f69288b4c8ffeae6a9", "title": "Directory Structure and Testing Scripts of PaddleVideo test_tipc Project"}, "4094": {"path": "/test_tipc/README.md:77-112", "hash": "e37811cf7cfff2fdfe183588fba1a0d2", "title": "Simplified TIPC Testing Process"}, "4095": {"path": "/test_tipc/README.md:114-126", "hash": "7681466dcbc76d838c627c8d01a9b2d9", "title": "Clear and Consistent Naming Conventions for PaddleVideo"}, "4096": {"path": "/test_tipc/README.md:127-133", "hash": "6fdb65f9fe4d3c65e169ff2e33e3f082", "title": "PaddleVideo Testing: Comprehensive Cases and Functionalities"}, "4097": {"path": "/test_tipc/benchmark_train.sh", "hash": "676dd55af820b6ca11b7f4553a461bf0", "title": "PaddleVideo Benchmark Training"}, "4098": {"path": "/test_tipc/benchmark_train.sh:1-42", "hash": "127b3c6ffb302d6307de31b64f850f1e", "title": "PaddlePaddle GPU Benchmark Training Script"}, "4099": {"path": "/test_tipc/benchmark_train.sh:43-86", "hash": "42b89c0307a6624111a05bcfcf42916a", "title": "Manipulating Configs for PaddleVideo"}, "4100": {"path": "/test_tipc/benchmark_train.sh:87-123", "hash": "9ac3e38ab7dc8cb6fddba34c2e633cc0", "title": "Training Model with Parameters"}, "4101": {"path": "/test_tipc/benchmark_train.sh:124-158", "hash": "e58f2d7959d8f76a875f622390a3b7e0", "title": "Benchmark/Train Environment Modification"}, "4102": {"path": "/test_tipc/benchmark_train.sh:159-197", "hash": "15969c8b3b628c9cd7e37e2b3d3ef8e1", "title": "Benchmark Configuration Code Snippet"}, "4103": {"path": "/test_tipc/benchmark_train.sh:198-220", "hash": "7b55ddc573c3274cadb0672cd23199f0", "title": "Batch Size and Precision Training: PaddleVideo Benchmark"}, "4104": {"path": "/test_tipc/benchmark_train.sh:221-234", "hash": "6892e758ef1703ffa2c45a3e0318fed9", "title": "Directory Creation and Logging Setup"}, "4105": {"path": "/test_tipc/benchmark_train.sh:235-253", "hash": "c3a04f99141ca12e1164d267ad7a3f12", "title": "Non-Profiled Script Execution"}, "4106": {"path": "/test_tipc/benchmark_train.sh:255-274", "hash": "8bc816ee649ce11a849e78f6914fd6b4", "title": "Python Log File Analysis Script"}, "4107": {"path": "/test_tipc/benchmark_train.sh:275-288", "hash": "ba860de6c5da340abbd47da2a161a0ac", "title": "Speeding Up TimeSformer Training"}, "4108": {"path": "/test_tipc/benchmark_train.sh:289-308", "hash": "a48acc439aa15e471a8e2b695a96b082", "title": "Benchmark Training Script"}, "4109": {"path": 
"/test_tipc/benchmark_train.sh:309-318", "hash": "718aa821101b1e883960ccc5f2fffce4", "title": "Benchmark Training Iteration"}, "4110": {"path": "/test_tipc/common_func.sh", "hash": "aa802229059de54f7ae8e5371c505f01", "title": "Common Functions for Parsing and Status Checks"}, "4111": {"path": "/test_tipc/common_func.sh:1-58", "hash": "ca557e6232da6490a3fe1971cf47a8fd", "title": "Parameter Parsing and Status Functions"}, "4112": {"path": "/test_tipc/common_func.sh:59-66", "hash": "71eea5b12cb3822076efd5475284b7a9", "title": "Status Logging Function"}, "4113": {"path": "/test_tipc/compare_results.py", "hash": "16e46ea53c41d054acf77e71f1f4325c", "title": "Log Parser and Comparer"}, "4114": {"path": "/test_tipc/compare_results.py:1-40", "hash": "ac3cd79d073180f8572211a851899826", "title": "Command-line Parser and Shell Executor"}, "4115": {"path": "/test_tipc/compare_results.py:42-64", "hash": "b27e9c5347b28181ccd023d4606c7ff7", "title": "Python/C++ Inference Result Parser"}, "4116": {"path": "/test_tipc/compare_results.py:65-89", "hash": "74d74e5946ff39655b0422d477e1c5d9", "title": "Parse Log File Function"}, "4117": {"path": "/test_tipc/compare_results.py:90-118", "hash": "a225c75073ff916a3e0ae67258707a6d", "title": "Three Functions for Ground Truth Data Processing"}, "4118": {"path": "/test_tipc/compare_results.py:119-146", "hash": "dbca1a96b4007da0cc196c2e9b6872c9", "title": "Validate Code Predictions with Ground Truth"}, "4119": {"path": "/test_tipc/compare_results.py:147-170", "hash": "2f6981aeede04715f23ef16bc8e5f72c", "title": "Compare and Validate Results"}, "4120": {"path": "/test_tipc/compare_results.py:171-171", "hash": "2815935ffc6c63a48a6cad48a5ad9032", "title": "Filename Formatting for Comparison"}, "4121": {"path": "/test_tipc/extract_loss.py", "hash": "ec06580b547bdbaa506468450a3078b6", "title": "Extract and Calculate Loss Expressions"}, "4122": {"path": "/test_tipc/extract_loss.py:1-28", "hash": "d4346c9c4c80498461ccb89ded94d95d", "title": "Loss Expression Parser"}, "4123": {"path": "/test_tipc/extract_loss.py:29-71", "hash": "c4d7b5f687fc3ce5e5ef708222dd60a7", "title": "Regular Expression Parsing and Validation Functions"}, "4124": {"path": "/test_tipc/extract_loss.py:74-102", "hash": "0a6276070770f721d91c94757640eafd", "title": "Function for Tuples Calculation and Printing"}, "4125": {"path": "/test_tipc/prepare.sh", "hash": "ddfa3049540baa3d78cfd8265e6a1a6f", "title": "Preparing Video Detection Models in PaddlePaddle"}, "4126": {"path": "/test_tipc/prepare.sh:1-44", "hash": "68debf86a8e755ddad47bf8796fc254c", "title": "Prepare Environment for PaddlePaddle Video Object Detection"}, "4127": {"path": "/test_tipc/prepare.sh:45-67", "hash": "3f7d5b52c7ab5251e7df0ed6d772fb65", "title": "Conditional Download Tasks for Models and Datasets"}, "4128": {"path": "/test_tipc/prepare.sh:68-87", "hash": "eccbb1a6918979c9a6a75245bf8f9e1b", "title": "Model-Specific Data Download Script"}, "4129": {"path": "/test_tipc/prepare.sh:88-105", "hash": "2cb3b4c3985f990f181c1d1108a13b32", "title": "Preparing Data and Weights for Models"}, "4130": {"path": "/test_tipc/prepare.sh:106-127", "hash": "8be682d0d70cf70c0a84243c364bba9b", "title": "Conditional Data Download and Extraction"}, "4131": {"path": "/test_tipc/prepare.sh:128-149", "hash": "8eeb287b9d60163884c7d3def9837db1", "title": "Model Weights and Data Preprocessing"}, "4132": {"path": "/test_tipc/prepare.sh:150-168", "hash": "052df1a31d0200a82f04728b9b4d36b0", "title": "Pretraining with Whole Data"}, "4133": {"path": 
"/test_tipc/prepare.sh:169-188", "hash": "8c1ebd0b83a9874234c36cc7967332b9", "title": "Download Model Weights and Data"}, "4134": {"path": "/test_tipc/prepare.sh:189-205", "hash": "5c6d21b40fb6d895f40f7f5b43c4296d", "title": "TSM Data Preparation Script"}, "4135": {"path": "/test_tipc/prepare.sh:206-223", "hash": "54c3d4f32ea8512c9cfcdb60938a8063", "title": "Model-Specific Data Preparation Script"}, "4136": {"path": "/test_tipc/prepare.sh:224-241", "hash": "cd4e3d4d94fa194e1b67d73377f14ff3", "title": "Preparing AttentionLSTM Model Environment"}, "4137": {"path": "/test_tipc/prepare.sh:242-261", "hash": "a3352396987efcf8bb2d30d1f08b7d5f", "title": "Preparing Kinetics400 for PaddleVideo"}, "4138": {"path": "/test_tipc/prepare.sh:262-285", "hash": "1e4dba7fecc18bdc57094f4848bd2475", "title": "Handling Model Pretraining Scenarios"}, "4139": {"path": "/test_tipc/prepare.sh:286-308", "hash": "d7a9408c83059130734ee9ad89ab8f3f", "title": "Model-Specific Data Download and Preparation"}, "4140": {"path": "/test_tipc/prepare.sh:309-329", "hash": "ddb3e8d94e4d904d37a76ee90e3c0345", "title": "Model-Specific Pretrained File Downloads"}, "4141": {"path": "/test_tipc/prepare.sh:329-345", "hash": "aced318c8263864ff65f39baa9d09799", "title": "Prepare Dataset for AttentionLSTM Model"}, "4142": {"path": "/test_tipc/prepare.sh:346-370", "hash": "c7ed561a7da559198e82b953e3114904", "title": "Model-Based Actions in TIPC Preparation"}, "4143": {"path": "/test_tipc/prepare.sh:371-385", "hash": "586879774a0792f4de3a092d198f7e7a", "title": "Model-Based Weights Download"}, "4144": {"path": "/test_tipc/prepare.sh:386-406", "hash": "325960f5091eff440c7749afed538e53", "title": "Script Downloads Pre-trained Model Weights"}, "4145": {"path": "/test_tipc/prepare.sh:407-427", "hash": "1f7cb9bcc2f82c2dc0fc143d1482de4c", "title": "Model Name Check and Download"}, "4146": {"path": "/test_tipc/prepare.sh:428-446", "hash": "0d6a8fc38429e2d98bf05a1a7e174044", "title": "PaddleVideo Model Weights Download"}, "4147": {"path": "/test_tipc/prepare.sh:447-468", "hash": "0f0ea3ec8ed8d2479ecad541252b412d", "title": "Model-Specific Data Downloads"}, "4148": {"path": "/test_tipc/prepare.sh:469-497", "hash": "f030b0d2310ab62f0a4a6b8cc001cd51", "title": "Model-Specific Data Preparation"}, "4149": {"path": "/test_tipc/prepare.sh:498-520", "hash": "6bf3b97f881b2d4842ca97332ca095ab", "title": "Prepare Inference Models"}, "4150": {"path": "/test_tipc/prepare.sh:521-552", "hash": "08b65176c167d85f9d37ea90ea3afe57", "title": "Model Check and Download for TIPC"}, "4151": {"path": "/test_tipc/prepare.sh:553-577", "hash": "c20268728fcd5c2ec85dea410faac0bc", "title": "Mode-Based Actions in TIPC Script"}, "4152": {"path": "/test_tipc/test_inference_cpp.sh", "hash": "d923868ce505066108dc35fff410ffe3", "title": "PaddleVideo Inference Testing"}, "4153": {"path": "/test_tipc/test_inference_cpp.sh:1-29", "hash": "9b0ed3be59f7a5d58a304b698c17093b", "title": "Bash Script for C++ Inference Parser"}, "4154": {"path": "/test_tipc/test_inference_cpp.sh:30-58", "hash": "82754e6ec777e19c8ed005a6e63692d0", "title": "PaddleVideo C++ Inference Setup"}, "4155": {"path": "/test_tipc/test_inference_cpp.sh:59-72", "hash": "692c330ff724fc3910cb4afab7362cdc", "title": "Skipping MKLDNN Quantized Tests"}, "4156": {"path": "/test_tipc/test_inference_cpp.sh:73-85", "hash": "2812e9a00914778e7e4395d43a434f94", "title": "Inference Script Configuration and Execution"}, "4157": {"path": "/test_tipc/test_inference_cpp.sh:86-101", "hash": "0129d9f6f39c2769670a5e76b045e2fe", "title": 
"TRT Precision Combinations Test"}, "4158": {"path": "/test_tipc/test_inference_cpp.sh:102-112", "hash": "1a65887ad8960141b38e5a0fbd9ae5c6", "title": "Inference CPP Script Execution"}, "4159": {"path": "/test_tipc/test_inference_cpp.sh:114-146", "hash": "0fe436cad971c10a5b3abe44e4f735ef", "title": "Hardware Support and OpenCV Setup"}, "4160": {"path": "/test_tipc/test_inference_cpp.sh:147-178", "hash": "ac80e48a800c8831a64703d9e5cbdabc", "title": "Building PaddleVideo Libraries and Demo"}, "4161": {"path": "/test_tipc/test_inference_cpp.sh:179-225", "hash": "dc3a5d4d111c083ca25ac6aa68619b93", "title": "Configuring PaddleVideo and Running Inference Tests"}, "4162": {"path": "/test_tipc/test_inference_cpp.sh:226-228", "hash": "e004e1b74b84a131b72461fbf15d795a", "title": "Executing C++ Inference Commands"}, "4163": {"path": "/test_tipc/test_paddle2onnx.sh", "hash": "e85b0b297d16d876f100d4f162ed5e35", "title": "Automating Paddle2ONNX Conversion in test_tipc/test_paddle2onnx.sh"}, "4164": {"path": "/test_tipc/test_paddle2onnx.sh:1-32", "hash": "12e61ed8600367ecbf6aac05410e9533", "title": "Paddle2Onnx: Extracting Model Details from Log Files"}, "4165": {"path": "/test_tipc/test_paddle2onnx.sh:33-58", "hash": "c049e83da4c036d7762e2b44846ebd4b", "title": "Setting Up Paddle2Onnx Inference"}, "4166": {"path": "/test_tipc/test_paddle2onnx.sh:59-73", "hash": "c91c304871bba1b3e5c196a117ea6603", "title": "Paddle2Onnx Conversion and Inference Logging"}, "4167": {"path": "/test_tipc/test_paddle2onnx.sh:74-81", "hash": "191569a6f35978a379d84ebbb7295318", "title": "Test: Export Count, IFS, and Echo Message"}, "4168": {"path": "/test_tipc/test_ptq_inference_python.sh", "hash": "61dcf7006e8e1ee9a254a1d739ada749", "title": "PaddleVideo GPU/CPU Inference Test"}, "4169": {"path": "/test_tipc/test_ptq_inference_python.sh:1-29", "hash": "26dd3f9ebda2934b2b48f677f3e0f743", "title": "Python Shell Script for Model Inference"}, "4170": {"path": "/test_tipc/test_ptq_inference_python.sh:30-52", "hash": "45c022fa59dcacbfb8593340ac6af1f0", "title": "Retrieving Config Values for Trainer and Inference"}, "4171": {"path": "/test_tipc/test_ptq_inference_python.sh:55-74", "hash": "7ce64950597872a130620a8b9efcbd32", "title": "Python-Powered GPU/CPU Inference Logging"}, "4172": {"path": "/test_tipc/test_ptq_inference_python.sh:74-88", "hash": "c4d876926be54ae5654d9be84f00c2a8", "title": "Looped GPU Inference Testing"}, "4173": {"path": "/test_tipc/test_ptq_inference_python.sh:89-112", "hash": "b86f37a284752fdd65039cde62d2f5a8", "title": "Hardware-Optimized PaddleVideo Inference"}, "4174": {"path": "/test_tipc/test_ptq_inference_python.sh:113-129", "hash": "fb98515f3512ec390e22765ab69a79a4", "title": "Model Export Preparation and Check"}, "4175": {"path": "/test_tipc/test_ptq_inference_python.sh:130-132", "hash": "89b379a2646fdc5b5d1d73695c5c814c", "title": "Python Inference Calling"}, "4176": {"path": "/test_tipc/test_serving_infer_cpp.sh", "hash": "ca1c74f247817532d51402f90f892958", "title": "Streamline Bash Model Serving with GPU"}, "4177": {"path": "/test_tipc/test_serving_infer_cpp.sh:1-28", "hash": "a586c1e50dcf27edf8f74e37015f1030", "title": "Custom Bash Script for Configuration and Image Classification"}, "4178": {"path": "/test_tipc/test_serving_infer_cpp.sh:29-54", "hash": "0b6d9f79d65cc7187b75cbdbdedec1da", "title": "Initialize Model and Config Files"}, "4179": {"path": "/test_tipc/test_serving_infer_cpp.sh:55-73", "hash": "c52b5a9c01a5b506dcd5f04f4c937016", "title": "Setup C++ Server and Client on GPU"}, "4180": 
{"path": "/test_tipc/test_serving_infer_cpp.sh:73-100", "hash": "978a719ddf231ae7cad8cb8d21a7d604", "title": "PaddlePaddle Serving Server Test"}, "4181": {"path": "/test_tipc/test_serving_infer_cpp.sh:103-107", "hash": "efc9f112c26719a3f18e1ec713ee5a56", "title": "Incrementing \"Count\" in Web Service Test"}, "4182": {"path": "/test_tipc/test_serving_infer_python.sh", "hash": "17562dd7d829c63330b67ecbbd90e37c", "title": "Automating Model Serving with Bash"}, "4183": {"path": "/test_tipc/test_serving_infer_python.sh:1-29", "hash": "ba65cba05e6787cd7ebd057d6abac8f6", "title": "Bash Script Configures Model Inference Environment"}, "4184": {"path": "/test_tipc/test_serving_infer_python.sh:30-54", "hash": "8d2ddb45efcff92e073ddd9a77112660", "title": "Model Serving Code Execution"}, "4185": {"path": "/test_tipc/test_serving_infer_python.sh:56-77", "hash": "b540ab7ed1e4f8a201b1e8035e6e0e4e", "title": "Automated Web Service Deployment with Python"}, "4186": {"path": "/test_tipc/test_serving_infer_python.sh:78-105", "hash": "4f9832f5f37bd961a0c26545d9e2f2f4", "title": "CUDA Test Environment Setup and Cleanup"}, "4187": {"path": "/test_tipc/test_train_dy2static_python.sh", "hash": "005ab5f02c7b5a1eb387c842a6f51158", "title": "Dygraph vs Dy2Static Model Comparison"}, "4188": {"path": "/test_tipc/test_train_dy2static_python.sh:1-30", "hash": "c77ac5277244c696e6a29adb9bd9e633", "title": "Configure and Initialize Environment"}, "4189": {"path": "/test_tipc/test_train_dy2static_python.sh:31-57", "hash": "9ddf5ab164f3fc06a23a9f1dcdf7d7a8", "title": "Configure, Run and Analyze Dygraph and Dy2Static Models"}, "4190": {"path": "/test_tipc/test_train_dy2static_python.sh:58-73", "hash": "6c881959c5d8f19f62190a3c48b12073", "title": "Diff and Log Comparison of Models"}, "4191": {"path": "/test_tipc/test_train_inference_python.sh", "hash": "689fc011d73300f662dc6d5c4d101248", "title": "PaddleVideo Model Optimizer"}, "4192": {"path": "/test_tipc/test_train_inference_python.sh:1-30", "hash": "86932e068fb74e763997d774b5b68d4f", "title": "Parse Training Parameters"}, "4193": {"path": "/test_tipc/test_train_inference_python.sh:31-56", "hash": "6ceebd38cbf7b06123b0ec8e48d52966", "title": "Parsing Key-Value Configurations"}, "4194": {"path": "/test_tipc/test_train_inference_python.sh:57-79", "hash": "364d284666e1df0b710af6c9662be7a7", "title": "Configuration Parser and Variable Assigner"}, "4195": {"path": "/test_tipc/test_train_inference_python.sh:80-104", "hash": "084e16c10c2fd97f30cce4ea662c3116", "title": "Config File Parsing for Inference Parameters"}, "4196": {"path": "/test_tipc/test_train_inference_python.sh:105-125", "hash": "a077d879780ad5232f863e6a01f77a9a", "title": "Configuration Extraction for Test and Train"}, "4197": {"path": "/test_tipc/test_train_inference_python.sh:126-157", "hash": "4a700de8d127a6f0212389c114c8481c", "title": "Inference Code Configuration & Logging"}, "4198": {"path": "/test_tipc/test_train_inference_python.sh:158-170", "hash": "edc035912719f70f4b86555bf8061d5a", "title": "Iterating Over Precision Values"}, "4199": {"path": "/test_tipc/test_train_inference_python.sh:171-181", "hash": "7dd34870c9ae3591419f0a34f5540ec1", "title": "Automating Test Loop with Python Script"}, "4200": {"path": "/test_tipc/test_train_inference_python.sh:182-198", "hash": "0474b2d3934d48c93fabc4e507f5af79", "title": "Optimizing Inference Parameters"}, "4201": {"path": "/test_tipc/test_train_inference_python.sh:200-212", "hash": "843ceef86da9ef6b6338fbb45cb0bca4", "title": "Inference Parameter Configuration"}, 
"4202": {"path": "/test_tipc/test_train_inference_python.sh:214-243", "hash": "07c31185ab4bfb120e8c96139fef1d5f", "title": "Inference Model Testing with PaddleVideo"}, "4203": {"path": "/test_tipc/test_train_inference_python.sh:244-274", "hash": "a08a0d31106ce243f151bf5129f04f5b", "title": "Multi-GPU Inference Loop"}, "4204": {"path": "/test_tipc/test_train_inference_python.sh:275-303", "hash": "8aafcd5a509d1d8c939600510c85dfb0", "title": "GPU Environment Variable Setup"}, "4205": {"path": "/test_tipc/test_train_inference_python.sh:304-325", "hash": "357114ad7c04365cd7eca6f6ce10bc86", "title": "Conditional Assignment of Train and Export Tasks"}, "4206": {"path": "/test_tipc/test_train_inference_python.sh:326-347", "hash": "5634a45760e712e4a516199874b75f25", "title": "Setting Model Training Parameters"}, "4207": {"path": "/test_tipc/test_train_inference_python.sh:348-367", "hash": "3f5dcb683cc2466b097d187eadf9c92a", "title": "Distributed PaddleVideo Training and Inference"}, "4208": {"path": "/test_tipc/test_train_inference_python.sh:368-378", "hash": "77059604b9813cf31bdc017efe660b1c", "title": "Multi-GPU/Machine Training with PaddlePaddle"}, "4209": {"path": "/test_tipc/test_train_inference_python.sh:378-395", "hash": "82ffec666068b56761076e72b5f03d11", "title": "Train PaddleVideo Model with Parameters"}, "4210": {"path": "/test_tipc/test_train_inference_python.sh:396-410", "hash": "ebaed66ff9c9dcb7810847c5759a4a8d", "title": "Evaluate Model Parameters and Commands"}, "4211": {"path": "/test_tipc/test_train_inference_python.sh:410-426", "hash": "ad031d6f0d46b7f218bf2dbe8aaedcf0", "title": "Setting up Variables for Inference"}, "4212": {"path": "/test_tipc/test_train_inference_python.sh:426-433", "hash": "8fe49fcfbae70857bee7bfc71ddc1fc0", "title": "Set CUDA Devices for Inference and Training"}, "4213": {"path": "/test_tipc/test_train_inference_python_npu.sh", "hash": "c37386bac832f03bfc6ff8eeb81cdb7b", "title": "NPU Script Updates and Config Changes"}, "4214": {"path": "/test_tipc/test_train_inference_python_npu.sh:1-39", "hash": "d3efd5ce0f96332b291375daf44e24b9", "title": "Switching to NPU Execution Script"}, "4215": {"path": "/test_tipc/test_train_inference_python_npu.sh:40-42", "hash": "2ded2853e55e8cc7f9c4c08657f9cf7a", "title": "Bash Script Execution"}, "4216": {"path": "/test_tipc/test_train_inference_python_xpu.sh", "hash": "500271acc9aabb6fffa58d587d1449c0", "title": "Update XPU Execution Script"}, "4217": {"path": "/test_tipc/test_train_inference_python_xpu.sh:1-39", "hash": "bf2d5a1d321c2777726ae0b351e900d1", "title": "PaddleVideo XPU Configuration Update"}, "4218": {"path": "/test_tipc/test_train_inference_python_xpu.sh:40-42", "hash": "23af3c2705952b485a3e5c60a900eddc", "title": "Bash Command Execution and Logging"}, "4219": {"path": "/tools/__init__.py", "hash": "4239003f054a409f26b826c8c9e993b9", "title": "Tools Package Initialization"}, "4220": {"path": "/tools/ava_predict.py", "hash": "b7a7935d51fcf24b5130f3d7b789ef65", "title": "AVA Model Inference and Action Detection"}, "4221": {"path": "/tools/ava_predict.py:1-32", "hash": "a1f24c113bf1f413d2374c749828ff33", "title": "AVA Action Unit Detection Python Script"}, "4222": {"path": "/tools/ava_predict.py:33-68", "hash": "7993f99151ac0b75fbf408830ac2d3ea", "title": "AVA Annotation Utilities"}, "4223": {"path": "/tools/ava_predict.py:69-98", "hash": "6c1158a712457d9c88fae698fe2a4082", "title": "Visualize Frames with Predicted Annotations"}, "4224": {"path": "/tools/ava_predict.py:99-125", "hash": 
"496ff8c1d132bec1eac74532aae588e6", "title": "Image Box Annotation Visualizer"}, "4225": {"path": "/tools/ava_predict.py:126-160", "hash": "3f5afe89575c969ecf5fb245f7b71f72", "title": "Video Frame Extractor"}, "4226": {"path": "/tools/ava_predict.py:161-191", "hash": "9f26304bd44a02c2e09669f8d54ecc4b", "title": "PaddleVideo Inference with AVA Predict"}, "4227": {"path": "/tools/ava_predict.py:192-222", "hash": "f6cbb61c58622743a5dc4cec6a734086", "title": "AVA Predict Function Arguments and Result Packaging"}, "4228": {"path": "/tools/ava_predict.py:223-264", "hash": "c5a5b1c0ae82e010d08e24e3cc9df786", "title": "Label Prediction Function"}, "4229": {"path": "/tools/ava_predict.py:267-294", "hash": "3814f7971e174f66a66bbc50296e0241", "title": "Human Detection via Frame Paths"}, "4230": {"path": "/tools/ava_predict.py:296-334", "hash": "84308b6fd183dbf8d27c84c10571ddc5", "title": "Reads Detection Results File for Bounding Box Proposals"}, "4231": {"path": "/tools/ava_predict.py:337-365", "hash": "1a396daee1246c2b0139e545b5d3447d", "title": "Extract Frames and Set Up Pipelines"}, "4232": {"path": "/tools/ava_predict.py:366-395", "hash": "915e761a7bb1cf4633086690709985e4", "title": "AVA Prediction Code Snippet"}, "4233": {"path": "/tools/ava_predict.py:396-421", "hash": "0ff08cd06459cf6d97a71477ffa679b2", "title": "SpatioTemporal Action Detection Code"}, "4234": {"path": "/tools/ava_predict.py:422-455", "hash": "ce5009a3685bc0ff1b965a50b50dcb4e", "title": "Tensorize and Predict"}, "4235": {"path": "/tools/ava_predict.py:456-481", "hash": "245fea629b9a7c5d7f9e1e492b539aa3", "title": "Action Score Thresholding in AVA Predict"}, "4236": {"path": "/tools/ava_predict.py:482-509", "hash": "7a47b3ef3a0b0202520cb22d41daff82", "title": "Video Frame Processing and Visualization Tool"}, "4237": {"path": "/tools/export_model.py", "hash": "470dd8ab24807413bc5e5191245b0efe", "title": "PaddleVideo Model Exporter"}, "4238": {"path": "/tools/export_model.py:1-32", "hash": "d84fe2c8871bed64511e5f31d6cadb84", "title": "PaddleVideo Model Export Tool"}, "4239": {"path": "/tools/export_model.py:33-57", "hash": "2d5b044c8222c90168bf3f258a658adb", "title": "Export Model Script"}, "4240": {"path": "/tools/export_model.py:58-87", "hash": "cba0b4b3a9ed545055a749bf737bbabd", "title": "Model Export and Config Trimming in PaddleVideo"}, "4241": {"path": "/tools/export_model.py:88-117", "hash": "fda2fa8a5f104e938882df2e0f5d7978", "title": "Model-Specific Input Shape Definition"}, "4242": {"path": "/tools/export_model.py:118-143", "hash": "d0e1f463bbf3a199aac953705fa3083a", "title": "Model Input Specification in PaddleVideo's Export Function"}, "4243": {"path": "/tools/export_model.py:144-172", "hash": "83820f8c1265c5b024e4164f396b150b", "title": "Input Specifications for PaddleVideo Models"}, "4244": {"path": "/tools/export_model.py:173-204", "hash": "cf8ec18bca11d170593ad88c92cc569d", "title": "Input Specifications for Various Model Names"}, "4245": {"path": "/tools/export_model.py:205-236", "hash": "dd2d1ef622e06ac9a9e7bd8547f9b6bb", "title": "Model Input Specification Generator"}, "4246": {"path": "/tools/export_model.py:237-267", "hash": "ab4cce8c1bbf4a1599c9e6b6fa9fde28", "title": "Export Model: Step-by-Step"}, "4247": {"path": "/tools/predict.py", "hash": "522fce27100f32ed99256dd075cb0e88", "title": "Paddle Video Tool: Command-Line Inference"}, "4248": {"path": "/tools/predict.py:1-32", "hash": "25c55f52523f791e967f195be211b47d", "title": "Import-Heavy Function Definition"}, "4249": {"path": "/tools/predict.py:33-59", 
"hash": "9faf9649001b118f02c05530ad7febc2", "title": "Command-Line Arguments for Paddle Video"}, "4250": {"path": "/tools/predict.py:60-84", "hash": "4dc334f48c9d7088311a451f77070ed8", "title": "Configuring Paddle Video Predictor Arguments"}, "4251": {"path": "/tools/predict.py:85-107", "hash": "346095242fc21c125a89c409078f9768", "title": "Optimizing PaddleVideo for Inference"}, "4252": {"path": "/tools/predict.py:108-134", "hash": "cc9aa9793cc24c73bc7a948659cca7e0", "title": "TensorRT Engine Setup for ST-GCN"}, "4253": {"path": "/tools/predict.py:136-173", "hash": "7b1c3b474eeac50408beb3d7da2ee81b", "title": "Building Paddle Predictor in Python"}, "4254": {"path": "/tools/predict.py:174-201", "hash": "ffa119223bd0b52e85cba29c6957c93e", "title": "Model Inference Processing"}, "4255": {"path": "/tools/predict.py:202-227", "hash": "6c48f1b6e04f09b9251edc4bbd7f030c", "title": "Video Prediction Pipeline"}, "4256": {"path": "/tools/predict.py:228-251", "hash": "ffcc6d55e86af57e6e08efac9e38c522", "title": "Directory Creation and Inference Processing"}, "4257": {"path": "/tools/predict.py:252-275", "hash": "b34add53fc4cbab8da9d14253673c28e", "title": "Installing auto_log and Configuring AutoLogger"}, "4258": {"path": "/tools/predict.py:276-306", "hash": "5c0baa3fc9204a18d1d7e0794dc10236", "title": "Batch Inference Tool"}, "4259": {"path": "/tools/predict.py:308-327", "hash": "7d0c3959f1dda330a84c0ab9b078aee4", "title": "Benchmarking Inference and Post-Processing Time"}, "4260": {"path": "/tools/summary.py", "hash": "f5f12a6a80e0e0c760e6c192a2c0094d", "title": "Model Summary and FLOPs Calculation"}, "4261": {"path": "/tools/summary.py:1-34", "hash": "6bb4f2234d9c73a7a4120611e3dc8194", "title": "Parsing Command Line Arguments in PaddleVideo"}, "4262": {"path": "/tools/summary.py:35-69", "hash": "7dbcbf051ae14e5539da37795e7834bb", "title": "Argument Parsing for Config File and Model Building"}, "4263": {"path": "/tools/summary.py:70-82", "hash": "5ac351fe65a4df9e85397a5c31498bdc", "title": "Model Summary and FLOPs Calculator"}, "4264": {"path": "/tools/utils.py", "hash": "cf6f260e979f2661a6a32b85666a64f1", "title": "PaddleVideo-based Action Recognition & Human Detection"}, "4265": {"path": "/tools/utils.py:1-34", "hash": "bc74bd256109c786defef4882c68cda4", "title": "Import, Error Handling and License Info"}, "4266": {"path": "/tools/utils.py:35-58", "hash": "85eadeb720de9e911e0e5291a6b865ac", "title": "Importing and Building PaddleVideo Models"}, "4267": {"path": "/tools/utils.py:60-86", "hash": "16c3e953fdeed2c0abd0489a406c96d6", "title": "Building Inference Helper with Registry"}, "4268": {"path": "/tools/utils.py:87-121", "hash": "3dcd81866e63dfb1f7266854c3383ac8", "title": "Abstract Class for Batch Preprocessing"}, "4269": {"path": "/tools/utils.py:123-147", "hash": "694c47cc5b3eac80ba5a9cabf918ade2", "title": "Softmax Postprocessing Function"}, "4270": {"path": "/tools/utils.py:148-176", "hash": "881c08ca0136a84f91d65c4ceb1a3012", "title": "Video Classifier Helper"}, "4271": {"path": "/tools/utils.py:177-211", "hash": "eedfd78877467073c8634240557e9512", "title": "Image Processing Class for PaddleVideo"}, "4272": {"path": "/tools/utils.py:212-245", "hash": "d047ccb2e00db719c2529951970cee67", "title": "Video Preprocessing Class"}, "4273": {"path": "/tools/utils.py:246-278", "hash": "a3a4ca481f28bc37efcfd29e6ac99b26", "title": "BMN Inference Helper Class and Postprocessing"}, "4274": {"path": "/tools/utils.py:279-302", "hash": "dca2029db93eaad15024d6a27e24f4fb", "title": "Calculates Snippet Xmin 
and Xmax Values"}, "4275": {"path": "/tools/utils.py:303-328", "hash": "2df732a70800128c27e7af197b3d9505", "title": "Non-Max Suppression for Bounding Boxes"}, "4276": {"path": "/tools/utils.py:329-362", "hash": "dd171b15a41d8d1d20530b05fe8aaf58", "title": "TokenShift Inference Helper Class"}, "4277": {"path": "/tools/utils.py:363-395", "hash": "95afe472138b676fd7bca4191ca1771e", "title": "Preprocessing for TimeSformer Inference"}, "4278": {"path": "/tools/utils.py:396-427", "hash": "8a9670e4a30870f7157389d10b8f5764", "title": "Video Processing Pipeline"}, "4279": {"path": "/tools/utils.py:428-458", "hash": "5d56db4d1f79618fcb0849166a0a59e3", "title": "Video Preprocessing Class"}, "4280": {"path": "/tools/utils.py:459-489", "hash": "5203f36d07a5d401b62827fd64dfa165", "title": "Image Preprocessing and Postprocessing Tool"}, "4281": {"path": "/tools/utils.py:490-516", "hash": "790b882f1a1ceafcdad9391cfb2c82d9", "title": "Extract Top K Classes from Tensor"}, "4282": {"path": "/tools/utils.py:517-542", "hash": "46183c597ee35452e833ad43e2e290d1", "title": "Video Frame Processing Function"}, "4283": {"path": "/tools/utils.py:543-573", "hash": "a72829b63692f2e2c17bc03466b6a9d9", "title": "Text Overlay on Video Frames"}, "4284": {"path": "/tools/utils.py:574-595", "hash": "3f33e19476c002be83beccb575e4aa18", "title": "Video Frame Processing and GIF Generation"}, "4285": {"path": "/tools/utils.py:596-620", "hash": "1a59156db17ecd823829985360a08729", "title": "Process and Save GIF with postprocess Function"}, "4286": {"path": "/tools/utils.py:621-651", "hash": "c656097f4a94b2c89f4438d2d2e8ae41", "title": "SlowFast Video Inference Helper"}, "4287": {"path": "/tools/utils.py:652-682", "hash": "148bd4fc351882d2d6d7460e1164a41e", "title": "Video Frame Preprocessing and Postprocessing Function"}, "4288": {"path": "/tools/utils.py:683-706", "hash": "3598cab474978512a2fc2de133669cee", "title": "Top Classes and Scores from STGCN Inference"}, "4289": {"path": "/tools/utils.py:707-740", "hash": "e5d0f0f996f8605cc9119ea3800d1f03", "title": "CTRGCN Inference Helper Class"}, "4290": {"path": "/tools/utils.py:741-775", "hash": "7e2cd2ee5c189825f139e25274d88988", "title": "Preprocessing Data Class"}, "4291": {"path": "/tools/utils.py:776-807", "hash": "51b1f4727c9d9094145fc6dd47cedc84", "title": "Preprocessing and MSTCN Inference Helper Classes"}, "4292": {"path": "/tools/utils.py:809-840", "hash": "07b6b9a5f05e596add3427706b41d71a", "title": "Video Feature File Handling Class"}, "4293": {"path": "/tools/utils.py:841-867", "hash": "2e53b4a2a4427f6391d8e6ff0e498eae", "title": "Video Feature Processing and Text File Generation"}, "4294": {"path": "/tools/utils.py:868-898", "hash": "72c3cfc8d7d46e6be664038022221195", "title": "Initializing ASRF Inference Helper"}, "4295": {"path": "/tools/utils.py:899-932", "hash": "7fe3d48f089063fbd03a2f82ba28f1bf", "title": "Feature Loading and Processing Class"}, "4296": {"path": "/tools/utils.py:933-959", "hash": "0315746ebfcb4fc8dac07e489893606c", "title": "Action-Labeled Video Processor"}, "4297": {"path": "/tools/utils.py:960-993", "hash": "68f2dcbbcbc20d4778420a3b0a3f554d", "title": "Attention LSTM Inference Preprocessor"}, "4298": {"path": "/tools/utils.py:994-1022", "hash": "7a2e3be7ad8235bf6f836d3f392eae00", "title": "Video Inference with TransNetV2 Model"}, "4299": {"path": "/tools/utils.py:1023-1051", "hash": "b9686a95e49e504416ac829417f0d8e8", "title": "FFmpeg Import and Frame Batching"}, "4300": {"path": "/tools/utils.py:1052-1074", "hash": 
"4403ff3aaff246c5598577f2185f3620", "title": "Video Frame Iterator: Converting Predictions to Scenes"}, "4301": {"path": "/tools/utils.py:1075-1103", "hash": "c2490e7df2170dbe16b3347471bae0b5", "title": "Video Scene List Processing Algorithm"}, "4302": {"path": "/tools/utils.py:1105-1133", "hash": "96334075c75a458608be6dcd65a10f19", "title": "Frame Visualization Tool"}, "4303": {"path": "/tools/utils.py:1134-1149", "hash": "e05b3f259440ce0a9fae1c2124430ed3", "title": "Single and All Frame Predictions"}, "4304": {"path": "/tools/utils.py:1150-1169", "hash": "cb4d618f806b9dd01341b5534adb320e", "title": "Shot Boundary Scene Converter"}, "4305": {"path": "/tools/utils.py:1171-1201", "hash": "9bdac883fd453a93477cb6cd6a8bc2fa", "title": "ADDS Inference Helper Initialization"}, "4306": {"path": "/tools/utils.py:1202-1237", "hash": "c4bdaf9b932388fa2cde29793473233b", "title": "Image Preprocessing Class and Method"}, "4307": {"path": "/tools/utils.py:1238-1264", "hash": "4e30f14d9794fe832259a6bbd4863e09", "title": "Post-Process Outputs and Save Depth Maps"}, "4308": {"path": "/tools/utils.py:1266-1291", "hash": "81aae45e3139064d41d4f1af4a43c6ae", "title": "Image Conversion Function and Class"}, "4309": {"path": "/tools/utils.py:1292-1321", "hash": "7988e9489cdc8dcf78513811893ecada", "title": "Init and Extract Frames for Video Analysis"}, "4310": {"path": "/tools/utils.py:1322-1344", "hash": "9112d6b71554e270483a608ed8bbc497", "title": "Preprocessing Frames and Labels"}, "4311": {"path": "/tools/utils.py:1345-1369", "hash": "350bdeef68a2ef1667be52fa12890a51", "title": "Object Detection Frame Processing"}, "4312": {"path": "/tools/utils.py:1370-1400", "hash": "5b291cb9b5781c2fe37c60f610ff1464", "title": "Data Pipeline: Append Proposals and Scores"}, "4313": {"path": "/tools/utils.py:1401-1433", "hash": "3104afcf0ca2f71cabf4b5011fd64747", "title": "Human Detection Class with Pre/Post-Processing"}, "4314": {"path": "/tools/utils.py:1434-1464", "hash": "8658c68a65b3004cccc2361b6f78485f", "title": "Iterating and Appending Predictions"}, "4315": {"path": "/tools/utils.py:1466-1490", "hash": "337931bd1a9f5a9413527792b26f17b7", "title": "Frame Sequence Visualizer"}, "4316": {"path": "/tools/utils.py:1492-1523", "hash": "c4efeab560a79895941568e51bc936c4", "title": "Pose Estimation Class for Image Processing"}, "4317": {"path": "/tools/utils.py:1524-1548", "hash": "026c860ac5f50c0095ecda510a0e99a5", "title": "PaddleVideo Image Processing"}, "4318": {"path": "/tools/utils.py:1549-1574", "hash": "584876e73f2d691646d96465a2df7839", "title": "YOWO Image Classification/Detection Initialization"}, "4319": {"path": "/tools/utils.py:1575-1606", "hash": "1b57de0a9dbd3436a66b84f490a179e3", "title": "Video Input Preprocessing: OpenCV Frame Reading and Resizing"}, "4320": {"path": "/tools/utils.py:1608-1638", "hash": "39379424242e3a3af9b6cf9c65ab5b97", "title": "Normalize and Reshape Images for Classification"}, "4321": {"path": "/tools/utils.py:1639-1660", "hash": "519b8da83cc91da8424e88f493f33f69", "title": "Object Detection and Recognition Algorithm"}, "4322": {"path": "/tools/utils.py:1661-1670", "hash": "09f64f74f5bcd79f7ec52ed509ad94d0", "title": "Video Object Detection System Algorithm"}, "4323": {"path": "/tools/wheel.py", "hash": "3aa3a7e8af74448b5a6a21ac3fe5b600", "title": "Video Classification Wheel Tool"}, "4324": {"path": "/tools/wheel.py:1-24", "hash": "367ec341c4bdab58ad852656f42f21ce", "title": "Apache License Notice"}, "4325": {"path": "/tools/wheel.py:25-64", "hash": "918ffd9f9a9b36a75a4888f848128f30", 
"title": "PaddleVideo Model Environment Setup"}, "4326": {"path": "/tools/wheel.py:65-93", "hash": "e8705890a76efa485d07718d3f3da972", "title": "Command Line Parser Function"}, "4327": {"path": "/tools/wheel.py:94-115", "hash": "8c13db1ccec633a0b625c0f3c4d0c93e", "title": "Initializing Argument Parser with Default Values"}, "4328": {"path": "/tools/wheel.py:116-145", "hash": "497181fe947b4ca04d37b4ae105c185b", "title": "Parse and Download with Progress"}, "4329": {"path": "/tools/wheel.py:146-168", "hash": "c53dc1da22d025529149523dfdae4919", "title": "Download and Save Inference Model"}, "4330": {"path": "/tools/wheel.py:169-197", "hash": "bcf9bed27efada6416580b25d953d62d", "title": "Initializing Paddle Predictor with Flags and Configs"}, "4331": {"path": "/tools/wheel.py:198-232", "hash": "be96333481a76367584a113fcd47c7f1", "title": "PaddleVideo Predictor Setup"}, "4332": {"path": "/tools/wheel.py:233-253", "hash": "96637bef113010199e8875c7e7ec96cd", "title": "Model Download and Configuration"}, "4333": {"path": "/tools/wheel.py:254-277", "hash": "d592df1458a8e5fe2621b58fdd0352d2", "title": "Video Label Prediction Code"}, "4334": {"path": "/tools/wheel.py:279-301", "hash": "d49bc6783f675bd26753ffc98c1770e5", "title": "URL Video Processing"}, "4335": {"path": "/tools/wheel.py:303-327", "hash": "734ec23aa6c6d01fdd6e7bd3534842ae", "title": "Batch Inference Looping"}, "4336": {"path": "/tools/wheel.py:328-353", "hash": "4f47b258f9c6bf2c473810207a55f5df", "title": "Iterating and Labeling Results in PaddleVideo"}, "4337": {"path": "/tools/wheel.py:354-354", "hash": "aea680926b3a51c05d08874baee8c9ed", "title": "Entry Point for Script Execution"}}} \ No newline at end of file diff --git a/docs/codeview.html b/docs/codeview.html new file mode 100755 index 000000000..761065f49 --- /dev/null +++ b/docs/codeview.html @@ -0,0 +1,669 @@ + + + + + + + + + Code View + + + + + + + + + + + + + + + + + + + + +
+ Code Preview
+ + + \ No newline at end of file diff --git a/docs/data/0.json b/docs/data/0.json new file mode 100644 index 000000000..1f640ef27 --- /dev/null +++ b/docs/data/0.json @@ -0,0 +1,541 @@ +{ + "0": { + "file_id": 0, + "content": "/MANIFEST.in", + "type": "filepath" + }, + "1": { + "file_id": 0, + "content": "This code is specifying the files and directories to include in the package distribution for PaddleVideo. It includes important documents like LICENSE and README, utilities scripts like utils.py and ava_predict.py, and key data file Kinetics-400_label_list.txt. Additionally, it uses a recursive-include to incorporate all .py and .txt files within the paddlevideo directory.", + "type": "summary" + }, + "2": { + "file_id": 0, + "content": "include LICENSE\ninclude README.md\ninclude tools/__init__.py\ninclude tools/utils.py\ninclude tools/ava_predict.py\ninclude tools/wheel.py\ninclude data/k400/Kinetics-400_label_list.txt\nrecursive-include paddlevideo/ *.py *.txt", + "type": "code", + "location": "/MANIFEST.in:1-9" + }, + "3": { + "file_id": 0, + "content": "This code is specifying the files and directories to include in the package distribution for PaddleVideo. It includes important documents like LICENSE and README, utilities scripts like utils.py and ava_predict.py, and key data file Kinetics-400_label_list.txt. Additionally, it uses a recursive-include to incorporate all .py and .txt files within the paddlevideo directory.", + "type": "comment" + }, + "4": { + "file_id": 1, + "content": "/README.md", + "type": "filepath" + }, + "5": { + "file_id": 1, + "content": "PaddleVideo is a Python library for advanced video processing, featuring industry-specific models and data production to deployment pipeline support. The documentation includes sections on distillation, inference deployment, datasets, application scenarios, and licensing information (Apache 2.0).", + "type": "summary" + }, + "6": { + "file_id": 1, + "content": "[English](README_en.md) | 中文\n# PaddleVideo\n![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![paddle version](https://img.shields.io/badge/PaddlePaddle-2.3.1-blue)\n## 简介\nPaddleVideo旨在打造一套丰富、领先且实用的Video工具库,旨在帮助开发者更好的进行视频领域的学术研究和产业实践。\n
\n
\n
\n## 近期更新\n- 开源视频标注工具🌟[BILS](./docs/zh-CN/annotation_tools.md),欢迎下载安装包体验~\n- 发布轻量化行为识别模型**🔥[PP-TSMv2](./docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md)**, Kinetics-400精度75.16%,25fps的10s视频cpu推理时间仅需456ms.各模型性能对比[benchmark](./docs/zh-CN/benchmark.md).\n- 新增[知识蒸馏](./docs/zh-CN/distillation.md)功能.\n- 新增基于transformer的行为识别模型[TokenShift](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md).\n- 新增基于骨骼点的行为识别模型[2s-ACGN](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/agcn2s.md)、[CTR-GCN](./docs/zh-CN/model_zoo/recognition/ctrgcn.md).\n- 新增单阶段时空动作检测模型[YOWO](./docs/zh-CN/model_zoo/localization/yowo.md).", + "type": "code", + "location": "/README.md:1-22" + }, + "7": { + "file_id": 1, + "content": "PaddleVideo is a Python library for advanced video processing, providing extensive and cutting-edge tools to assist researchers and industry professionals in the field of computer vision. The recent updates include an open-source video annotation tool (BILS), a lightweight action recognition model (PP-TSMv2), knowledge distillation functionality, transformer-based models, and single-stage action detection models (YOWO).", + "type": "comment" + }, + "8": { + "file_id": 1, + "content": "👀 🌟 **《产业级视频技术与应用案例》系列课程回放链接**: https://aistudio.baidu.com/aistudio/course/introduce/6742 🌟\n​\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 💖 **欢迎大家扫码入群讨论** 💖\n
\n
\n- 添加成功后回复【视频】加入交流群\n## 特性\n支持多种Video相关前沿算法,在此基础上打造产业级特色模型[PP-TSM](docs/zh-CN/model_zoo/recognition/pp-tsm.md)和[PP-TSMv2](docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md),并打通数据生产、模型训练、压缩、预测部署全流程。\n
\n \n
\n## 快速开始\n- 一行命令快速使用: [快速开始](./docs/zh-CN/quick_start.md)\n## 场景应用\nPaddleVideo场景应用覆盖体育、互联网、工业、医疗行业,在PP-TSM的基础能力之上,以案例的形式展示利用场景数据微调、模型优化方法、数据增广等内容,为开发者实际落地提供示范与启发。详情可查看[应用](./applications/)。\n## 文档教程\n- [快速开始](./docs/zh-CN/quick_start.md)\n- [安装说明](./docs/zh-CN/install.md)\n- [训练/测试/推理全流程使用指南](./docs/zh-CN/usage.md)\n- [PP-TSM行为识别🔥](./docs/zh-CN/model_zoo/recognition/pp-tsm.md)\n - [模型库](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#7)\n - [模型训练](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#4)\n - [模型压缩](./deploy/slim/)\n - [模型量化](./deploy/slim/readme.md)", + "type": "code", + "location": "/README.md:25-58" + }, + "9": { + "file_id": 1, + "content": "This code is for PaddleVideo, a series of industry-level video technology and application case courses. It supports various video cutting-edge algorithms, creates industry-specific models PP-TSM and PP-TSMv2, and covers the entire data production, model training, compression, and deployment pipeline. The code provides quick start instructions, scene application examples, and documentation for tutorials on different topics such as recognition, model library, and model compression. It also includes links to join discussion groups and course replay.", + "type": "comment" + }, + "10": { + "file_id": 1, + "content": " - [知识蒸馏](./docs/zh-CN/distillation.md)\n - [推理部署](./deploy/)\n - [基于Python预测引擎推理](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#62)\n - [基于C++预测引擎推理](./deploy/cpp_infer/readme.md)\n - [服务端部署](./deploy/python_serving/readme.md)\n - [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md)\n - [Benchmark](./docs/zh-CN/benchmark.md)\n- [前沿算法与模型](./docs/zh-CN/model_zoo/README.md)🚀\n- [数据集](./docs/zh-CN/dataset/README.md)\n- [场景应用](./applications/README.md)\n- [数据标注](./docs/zh-CN/annotation_tools.md)\n- [赛事支持](./docs/zh-CN/competition.md)\n- [贡献代码](./docs/zh-CN/contribute/README.md)\n## 许可证书\n本项目的发布受[Apache 2.0 license](LICENSE)许可认证。", + "type": "code", + "location": "/README.md:59-75" + }, + "11": { + "file_id": 1, + "content": "This code provides a table of contents for the PaddleVideo documentation, including sections on distillation, inference deployment using Python and C++ engines, server-side deployment, converting to ONNX models, state-of-the-art algorithms and models, datasets, application scenarios, data labeling tools, competition support, contributing code, and licensing information (Apache 2.0).", + "type": "comment" + }, + "12": { + "file_id": 2, + "content": "/README_en.md", + "type": "filepath" + }, + "13": { + "file_id": 2, + "content": "PaddleVideo is a deep learning library for video processing, offering pre-trained models, training, compression, inference, and deployment options, along with installation guides, datasets, and annotation tools under the Apache 2.0 license.", + "type": "summary" + }, + "14": { + "file_id": 2, + "content": "[简体中文](README.md) | English\n# PaddleVideo\n![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![paddle version](https://img.shields.io/badge/PaddlePaddle-2.0-blue)\n## Introduction\nPaddleVideo is a toolset for video tasks prepared for the industry and academia. This repository provides examples and best practice guildelines for exploring deep learning algorithm in the scene of video area.\n
\n
\n
\n## Update:\n- release **🔥[PP-TSMv2](./docs/zh-CN/model_zoo/recognition/pp-tsm.md)**, an lite action recognition model, top1_acc on Kinetics-400 is 74.38%,cpu inference time on 10s video with 25fps is only 433ms. [benchmark](./docs/zh-CN/benchmark.md).\n- add [Knowledge Distilltion](./docs/zh-CN/distillation.md) framework code.\n- add [TokenShift](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md), [2s-ACGN](https://github.com/PaddlePaddle/PaddleVideo/b", + "type": "code", + "location": "/README_en.md:1-20" + }, + "15": { + "file_id": 2, + "content": "This code is for PaddleVideo, a toolset for video tasks in industry and academia. It provides examples and best practice guidelines for deep learning algorithms in the video domain. Recent updates include the release of PP-TSMv2 (lite action recognition model), addition of Knowledge Distillation framework code, and TokenShift and 2s-ACGN models. Python version required is 3.7+, and it uses PaddlePaddle version 2.0.", + "type": "comment" + }, + "16": { + "file_id": 2, + "content": "lob/develop/docs/zh-CN/model_zoo/recognition/agcn2s.md) and [CTR-GCN](./docs/zh-CN/model_zoo/recognition/ctrgcn.md) model.\n​ 💖 **Welcome to scan the code and join the group discussion** 💖\n
\n
\n- Scan the QR code below with your Wechat and reply \"video\", you can access to official technical exchange group. Look forward to your participation.\n## Features\nPaddleVideo support a variety of cutting-edge algorithms related to video, and developed industrial featured models/solution [PP-TSM](docs/zh-CN/model_zoo/recognition/pp-tsm.md) and [PP-TSMv2](docs/zh-CN/model_zoo/recognition/pp-tsm.md) on this basis, and get through the whole process of data production, model training, compression, inference and deployment.\n
\n \n
\n## Quick Start\n- One line of code quick use: [Quick Start](./docs/zh-CN/quick_start.md)\n## Tutorials\n- [Quick Start](./docs/zh-CN/quick_start.md)", + "type": "code", + "location": "/README_en.md:20-43" + }, + "17": { + "file_id": 2, + "content": "This code is from the \"PaddleVideo\" project's README file. It introduces PaddleVideo as a platform that supports various cutting-edge video algorithms, developed industrial featured models like PP-TSM and PP-TSMv2, and provides a full process of data production, model training, compression, inference, and deployment. The code also mentions the availability of quick start guides and tutorials to make it easier for users to get started with PaddleVideo.", + "type": "comment" + }, + "18": { + "file_id": 2, + "content": "- [Installation](./docs/zh-CN/install.md)\n- [Usage](./docs/zh-CN/usage.md)\n- [PP-TSM🔥](./docs/zh-CN/model_zoo/recognition/pp-tsm.md)\n - [Model Zoo](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#7)\n - [Model training](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#4)\n - [Model Compression](./deploy/slim/)\n - [Model Quantization](./deploy/slim/readme.md)\n - [Knowledge Distillation](./docs/zh-CN/distillation.md)\n - [Inference and Deployment](./deploy/)\n - [Python Inference](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#62)\n - [C++ Inference](./deploy/cpp_infer/readme.md)\n - [Serving](./deploy/python_serving/readme.md)\n - [Paddle2ONNX](./deploy/paddle2onnx/readme.md)\n - [Benchmark](./docs/zh-CN/benchmark.md)\n- [Academic algorithms](./docs/en/model_zoo/README.md)🚀\n- [Datasets](./docs/en/dataset/README.md)\n- [Data Annotation](./applications/BILS)\n- [Contribute](./docs/zh-CN/contribute/README.md)\n## License\nPaddleVideo is released under the [Apache 2.0 license](LICENSE).", + "type": "code", + "location": "/README_en.md:44-65" + }, + "19": { + "file_id": 2, + "content": "This code outlines the main components of PaddleVideo, a deep learning library for video processing. It includes installation instructions, usage guidelines, model zoo (pre-trained models), model training, model compression techniques such as quantization and knowledge distillation, inference and deployment options including Python, C++, and serving, academic algorithms, datasets, data annotation tool (BILS), and licensing information under Apache 2.0 license.", + "type": "comment" + }, + "20": { + "file_id": 3, + "content": "/__init__.py", + "type": "filepath" + }, + "21": { + "file_id": 3, + "content": "This code is a Python module with the license and copyright information. It imports a class named PaddleVideo from the tools package, and defines its availability as part of the __all__ list.", + "type": "summary" + }, + "22": { + "file_id": 3, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n__all__ = ['PaddleVideo']\nfrom .tools import PaddleVideo", + "type": "code", + "location": "/__init__.py:1-16" + }, + "23": { + "file_id": 3, + "content": "This code is a Python module with the license and copyright information. It imports a class named PaddleVideo from the tools package, and defines its availability as part of the __all__ list.", + "type": "comment" + }, + "24": { + "file_id": 4, + "content": "/applications/AbnormalActionDetection/README.md", + "type": "filepath" + }, + "25": { + "file_id": 4, + "content": "This PaddleVideo code performs video action detection, using SlowFast+FasterRCNN model for abnormal behavior detection, with data preparation, training, evaluation, inference, and deployment. The code exports, deploys static models, and provides deployment instructions.", + "type": "summary" + }, + "26": { + "file_id": 4, + "content": "# 异常行为识别\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型评估](#模型评估)\n- [模型推理](#模型推理)\n- [模型部署](#模型部署)\n- [参考论文](#参考论文)\n## 模型简介\n该代码库用于异常行为检测, 基于paddle2.2版本开发,结合PaddleVideo中的SlowFast+FasterRCNN模型实现7个异常行为的检测。\n主要框架如下:\n
\n
\n
\nAIStudio项目: [基于时空信息的异常行为检测](https://aistudio.baidu.com/aistudio/projectdetail/3431613)\n## 数据准备\n### Step1 稀疏抽取视频帧\n首先稀疏抽取视频帧用于检测每帧中人的位置:\n```\ncd data/ava/script && bash extract_video_frames.sh abnormal_action_videos abnormal_action_frames 2\n```\n* 第一个参数abnormal_action_videos:被抽帧的视频根目录;\n* 第二个参数abnormal_action_frames:抽取的视频帧存放目录;\n* 第三个参数2:抽帧帧率。\n### Step2 目标检测\n用成熟的可检测人的目标检测模型检测上述步骤抽得的视频帧中的人。如PaddleDetection套件中的基于coco数据集训练得到的[PP-YOLOv2](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/ppyolo)模型。\n### Step3 生成pkl文件\n将上述步骤得到的每个视频帧的检测结果进行转化,得到SlowFast_FasterRCNN模型需要的输入格式。注意我们只需要人的检测结果,其他目标不需要。\nSlowFast_FasterRCNN模型需要的proposals是pkl格式文件,该文件以字典形式存储检测结果,字典的key是视频帧的索引(video_id+frame_id拼接得到),value是一个list,每个元素是检测得到的人的位置信息和置信度。", + "type": "code", + "location": "/applications/AbnormalActionDetection/README.md:1-40" + }, + "27": { + "file_id": 4, + "content": "This code is for abnormal behavior detection using the PaddleVideo framework with SlowFast+FasterRCNN model, consisting of 6 steps: data preparation (sparse frame extraction, target detection, generating pkl files), model training, evaluation, inference, and deployment.", + "type": "comment" + }, + "28": { + "file_id": 4, + "content": "```\n{\n 打架,0001:\n [[0.036 0.098 0.55 0.979 0.995518] # x1,y1,x2,y2,score\n [0.443 0.04 0.99 0.989 0.977824]]\n}\n```\n### Step4 密集抽取视频帧\n对视频数据进行密集抽帧。\nSlowFast_FasterRCNN输入的视频帧是密集帧,因此需要再次对视频进行抽帧。具体命令如下:\n```\ncd data/ava/script && bash extract_video_frames.sh abnormal_action_videos abnormal_action_frames_30fps 30\n```\n具体参数同步骤1,只不过次数抽帧率为30fps。\n### Step5 准备标签数据\n标签数据以pbtxt文件个数存储,本案例具体如下(注意行为标签id从1开始):\n```\nitem {\n name: \"挥棍\"\n id: 1\n}\nitem {\n name: \"打架\"\n id: 2\n}\nitem {\n name: \"踢东西\"\n id: 3\n}\nitem {\n name: \"追逐\"\n id: 4\n}\nitem {\n name: \"争吵\"\n id: 5\n}\nitem {\n name: \"快速奔跑\"\n id: 6\n}\nitem {\n name: \"摔倒\"\n id: 7\n}\n```\n## 模型训练\n异常行为检测模型基于在AVA数据集上训练得到模型进行迁移学习。具体训练命令如下:\n```\npython main.py --validate -w AVA_SlowFast_FastRcnn_best.pdparams \\\n -c configs/abnoraml_action.yaml\n```\n - w 预训练模型路径\n - c 配置文件路径\n## 模型评估\n```\npython main.py --test \\\n -w abnormal_action_SlowFast_FastRcnn.pdparams \\\n -c configs/abnoraml_action.yaml\n```\n## 模型推理\n基于动态图的推理:\n```\npython tools/ava_predict.py \\\n -c configs/abnoraml_action.yaml \\\n -w abnormal_action_SlowFast_FastRcnn.pdparams \\", + "type": "code", + "location": "/applications/AbnormalActionDetection/README.md:42-114" + }, + "29": { + "file_id": 4, + "content": "Step 4: Extracts video frames at a rate of 30fps for SlowFast_FasterRCNN input.\nStep 5: Stores label data as pbtxt files with action IDs starting from 1.\nModel training using pre-trained AVA model and config file.\nModel evaluation on abnormal action detection.\nModel inference using dynamic graph execution.", + "type": "comment" + }, + "30": { + "file_id": 4, + "content": " --video_path data/wave_9.mp4 \\\n --detection_model_name 'faster_rcnn/faster_rcnn_r50_fpn_1x_coco' \\\n --detection_model_weights 'faster_rcnn_r50_fpn_1x_coco.pdparams'\n```\n- video_path 视频路径\n- detection_model_name 检测模型名称\n- detection_model_weights 检测模型权重路径\n基于静态图模型进行推理:\n导出模型,动态图模型转换为静态图模型:\n```\npython tools/export_model.py \\\n -c configs/abnoraml_action.yaml \\\n -o inference_output \\\n -p abnormal_action_SlowFast_FastRcnn.pdparams\n```\n- o 导出模型存放文件夹\n- p 被导出模型路径\n基于导出的模型做推理:\n```\npython tools/predict.py \\\n -c configs/abnoraml_action.yaml \\\n --input_file \"data/wave_9.mp4\" \\\n --model_file \"inference_output/abnormal_action_SlowFast_FastRcnn.pdmodel\" \\\n 
--params_file \"inference_output/abnormal_action_SlowFast_FastRcnn.pdiparams\" \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n## 模型部署\n请参考[Paddle Inference示例](https://paddle-inference.readthedocs.io/en/latest/quick_start/python_demo.html)\n## 参考论文\n- [SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf), Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, Kaiming He", + "type": "code", + "location": "/applications/AbnormalActionDetection/README.md:115-153" + }, + "31": { + "file_id": 4, + "content": "This code is for video action detection using PaddleVideo. It exports a static model, converts dynamic to static model, performs inference, and deploys the model. The parameters include video path, detection model name, and weights path. Deployment instructions are provided with a reference to a relevant paper.", + "type": "comment" + }, + "32": { + "file_id": 5, + "content": "/applications/Anti-UAV/README.md", + "type": "filepath" + }, + "33": { + "file_id": 5, + "content": "This code provides instructions to detect UAVs in restricted areas using PaddleDetection, with data preparation and dependency installation steps. Users can customize the configuration file and trained model for specific use cases.", + "type": "summary" + }, + "34": { + "file_id": 5, + "content": "# Paddle-Anti-UAV\nAnti-UAV base on PaddleDetection\n## Background\nUAVs are very popular and we can see them in many public spaces, such as parks and playgrounds. Most people use UAVs for taking photos.\nHowever, many areas like airport forbiden UAVs since they are potentially dangerous. In this case, we need to detect the flying UAVs in\nthese areas.\nIn this repository, we show how to train a detection model using [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection).\n## Data preparation\nThe dataset can be found [here](https://anti-uav.github.io/dataset/). We direcly download the ```test-dev``` split composed of 140 videos\ntrain the detection model.\n* Download the ```test-dev``` dataset.\n* Run `unzip Anti_UAV_test_dev.zip -d Anti_UAV`.\n* Run `python get_image_label.py`. In this step, you may change the path to the videos and the value of `interval`.\nAfter the above steps, you will get a MSCOCO-style datasst for object detection.\n## Install PaddleDetection\nPlease refer to this [link](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/docs/tutorials/INSTALL.md).", + "type": "code", + "location": "/applications/Anti-UAV/README.md:1-21" + }, + "35": { + "file_id": 5, + "content": "This code is for the Paddle-Anti-UAV application that uses PaddleDetection to detect flying UAVs in restricted areas. It provides details on data preparation, where to download and unzip the dataset, and how to install PaddleDetection.", + "type": "comment" + }, + "36": { + "file_id": 5, + "content": "We use `python=3.7`, `Paddle=2.2.1`, `CUDA=10.2`.\n## Train PP-YOLO\nWe use [PP-YOLO](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.3/configs/ppyolo) as the detector.\n* Run `git clone https://github.com/PaddlePaddle/PaddleDetection.git`. 
Note that you should finish this step when you install PaddleDetection.\n* Move the anti-UAV dataset to `dataset`.\n* Move `anti_uav.yml` to `configs/datasets`, move `ppyolo_r50vd_dcn_1x_antiuav.yml` to `configs/ppyolo` and move `ppyolo_r50vd_dcn_antiuav.yml`\nto `configs/ppyolo/_base`.\n* Keep the value of `anchors` in `configs/ppyolo/_base/ppyolo_reader.yml` the same as `ppyolo_r50vd_dcn_antiuav.yml`.\n* Run `python -m paddle.distributed.launch --log_dir=./ppyolo_dygraph/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_antiuav.yml &>ppyolo_dygraph.log 2>&1 &`.\nNote that you may change the arguments, such as `batch_size` and `gups`.\n## Inference\nPlease refer to the infernce section on this [webpage](https://github.com/Paddle", + "type": "code", + "location": "/applications/Anti-UAV/README.md:23-36" + }, + "37": { + "file_id": 5, + "content": "The code outlines the process to train and use PP-YOLO for UAV detection using PaddleDetection in a specific environment. It involves cloning a repository, moving dataset files, adjusting configurations, and running training and inference commands with specific arguments.", + "type": "comment" + }, + "38": { + "file_id": 5, + "content": "Paddle/PaddleDetection/blob/release/2.3/docs/tutorials/GETTING_STARTED.md). You can just switch the configeration file and trained model to your own files.\n![](https://github.com/qingzwang/Paddle-Anti-UAV/blob/main/demo1.gif)\n![](https://github.com/qingzwang/Paddle-Anti-UAV/blob/main/demo.gif)", + "type": "code", + "location": "/applications/Anti-UAV/README.md:36-39" + }, + "39": { + "file_id": 5, + "content": "The code snippet is referring to the README file of an anti-UAV application based on PaddleVideo. It demonstrates two GIFs showing the demo in action and mentions that users can customize the configuration file and trained model for their own use cases.", + "type": "comment" + }, + "40": { + "file_id": 6, + "content": "/applications/Anti-UAV/get_image_label.py", + "type": "filepath" + }, + "41": { + "file_id": 6, + "content": "The code initializes paths, defines train_info and val_info dictionaries, sets interval variables, processes video frames, draws rectangles around objects based on labels, saves images with label information, and writes data to train.json and val.json files after processing all data from given folders.", + "type": "summary" + }, + "42": { + "file_id": 6, + "content": "import cv2\nimport os\nimport json\n# please change it to your path\npath = '/workspace/wangqingzhong/Anti_UAV'\nannotation_path = 'annotations'\ntrain_img_path = 'train_imgs'\nval_img_path = 'val_imgs'\nif not os.path.exists(annotation_path):\n os.makedirs(annotation_path)\nif not os.path.exists(train_img_path):\n os.makedirs(train_img_path)\nif not os.path.exists(val_img_path):\n os.makedirs(val_img_path)\ntrain_info = {\n 'images': [],\n 'type':\n 'instances',\n 'annotations': [],\n 'categories': [{\n \"supercategory\": \"none\",\n \"id\": 1,\n \"name\": \"drone\"\n }, {\n \"supercategory\": \"none\",\n \"id\": 2,\n \"name\": \"noise\"\n }]\n}\nval_info = {\n 'images': [],\n 'type':\n 'instances',\n 'annotations': [],\n 'categories': [{\n \"supercategory\": \"none\",\n \"id\": 1,\n \"name\": \"drone\"\n }, {\n \"supercategory\": \"none\",\n \"id\": 2,\n \"name\": \"noise\"\n }]\n}\n# you can change it\ninterval = 5\ndirs = os.listdir(path)\ntrain_img_id = 0\nval_img_id = 0\nfor d in dirs:", + "type": "code", + "location": "/applications/Anti-UAV/get_image_label.py:1-53" + }, + "43": { + 
"file_id": 6, + "content": "The code initializes necessary paths and directories for annotation, training, and validation image paths. It creates the required directories if they do not exist. The code defines train_info and val_info as dictionaries containing information about images, annotations, and categories. It sets the interval variable for file processing and then iterates through each directory in the given path.", + "type": "comment" + }, + "44": { + "file_id": 6, + "content": " if 'new' in d:\n video_file = os.path.join(path, d, 'IR.mp4')\n label_file = os.path.join(path, d, 'IR_label.json')\n labels = json.load(open(label_file, 'r'))\n exits = labels['exist']\n gt_bbox = labels['gt_rect']\n assert len(exits) == len(gt_bbox)\n videocap = cv2.VideoCapture(video_file)\n i = 0\n while True:\n success, frame = videocap.read()\n if success:\n if i % interval == 0:\n img_name = d + '_' + str(i) + '.jpg'\n cv2.imwrite(os.path.join(val_img_path, img_name), frame)\n height, width, depth = frame.shape\n x, y, w, h = gt_bbox[i]\n isexist = exits[i]\n if isexist:\n category_id = 1\n else:\n category_id = 2\n draw_frame = cv2.rectangle(frame, (x, y), (x + w, y + h),\n (0, 255, 0), 2)", + "type": "code", + "location": "/applications/Anti-UAV/get_image_label.py:54-77" + }, + "45": { + "file_id": 6, + "content": "This code reads an image file and its label from a specified path. It then processes each frame of the video, drawing a rectangle around the object in the frame based on the provided labels. If the object exists, category_id is set to 1; otherwise, it's set to 2. Each processed frame is saved as an image file with its corresponding label information.", + "type": "comment" + }, + "46": { + "file_id": 6, + "content": " img_name_draw = d + '_' + str(i) + 'draw.jpg'\n cv2.imwrite(os.path.join(val_img_path, img_name_draw),\n draw_frame)\n img_info = {\n 'file_name': img_name,\n 'height': float(height),\n 'width': float(width),\n 'id': val_img_id\n }\n ann_info = {\n 'area': float(w) * float(h),\n 'iscrowd': 0,\n 'bbox': [float(x),\n float(y),\n float(w),\n float(h)],\n 'category_id': category_id,\n 'ignore': 0,\n 'image_id': val_img_id,\n 'id': val_img_id + 1\n }\n val_info['images'].append(img_info)\n val_info['annotations'].append(ann_info)", + "type": "code", + "location": "/applications/Anti-UAV/get_image_label.py:78-101" + }, + "47": { + "file_id": 6, + "content": "This code writes an image, creates image information (file name, height, width, and id), and annotation information (area, iscrowd, bbox coordinates, category_id, ignore, image_id, and id). 
It then appends the image and annotation information to the existing val_info data structure.", + "type": "comment" + }, + "48": { + "file_id": 6, + "content": " val_img_id += 1\n i += 1\n else:\n print('finish {}'.format(d))\n break\n else:\n video_file = os.path.join(path, d, 'IR.mp4')\n label_file = os.path.join(path, d, 'IR_label.json')\n labels = json.load(open(label_file, 'r'))\n exits = labels['exist']\n gt_bbox = labels['gt_rect']\n assert len(exits) == len(gt_bbox)\n videocap = cv2.VideoCapture(video_file)\n i = 0\n while True:\n success, frame = videocap.read()\n if success:\n if i % interval == 0:\n img_name = d + '_' + str(i) + '.jpg'\n cv2.imwrite(os.path.join(train_img_path, img_name), frame)\n height, width, depth = frame.shape\n x, y, w, h = gt_bbox[i]\n isexist = exits[i]\n if isexist:\n category_id = 1\n else:\n category_id = 2", + "type": "code", + "location": "/applications/Anti-UAV/get_image_label.py:102-128" + }, + "49": { + "file_id": 6, + "content": "Code reads a video and its corresponding label file, then extracts frames based on labels and saves them. If the object exists in the frame, it is labeled as category_id 1, otherwise as category_id 2. The process continues until all frames have been processed or a \"finish\" message is encountered.", + "type": "comment" + }, + "50": { + "file_id": 6, + "content": " draw_frame = cv2.rectangle(frame, (x, y), (x + w, y + h),\n (0, 255, 0), 2)\n img_name_draw = d + '_' + str(i) + 'draw.jpg'\n cv2.imwrite(os.path.join(train_img_path, img_name_draw),\n draw_frame)\n img_info = {\n 'file_name': img_name,\n 'height': height,\n 'width': width,\n 'id': train_img_id\n }\n ann_info = {\n 'area': float(w) * float(h),\n 'iscrowd': 0,\n 'bbox': [float(x),\n float(y),\n float(w),\n float(h)],\n 'category_id': category_id,\n 'ignore': 0,\n 'image_id': train_img_id,\n 'id': train_img_id + 1", + "type": "code", + "location": "/applications/Anti-UAV/get_image_label.py:129-151" + }, + "51": { + "file_id": 6, + "content": "This code draws a rectangle around the detected object in an image, saves the image with the drawn rectangle, and creates two dictionaries (image and annotation information) to be used for training purposes.", + "type": "comment" + }, + "52": { + "file_id": 6, + "content": " }\n train_info['images'].append(img_info)\n train_info['annotations'].append(ann_info)\n train_img_id += 1\n i += 1\n else:\n print('finish {}'.format(d))\n break\nwith open('annotations/train.json', 'w') as f:\n json.dump(train_info, f)\nwith open('annotations/val.json', 'w') as f:\n json.dump(val_info, f)", + "type": "code", + "location": "/applications/Anti-UAV/get_image_label.py:152-164" + }, + "53": { + "file_id": 6, + "content": "Code writes image and annotation information to train.json and val.json files after processing all data from given folders, ending the loop when done.", + "type": "comment" + }, + "54": { + "file_id": 7, + "content": "/applications/BasketballAction/README.md", + "type": "filepath" + }, + "55": { + "file_id": 7, + "content": "PaddleVideo's BasketballAction app uses PaddlePaddle 2.0 and models for basketball action detection, achieving an F1-score of 80.14%. 
Developed by authors including hari and Joonseok Lee, it optimizes based on speed, time distribution, and feature fusion methods.", + "type": "summary" + }, + "56": { + "file_id": 7, + "content": "# 篮球动作检测模型\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型评估](#模型评估)\n- [模型推理](#模型推理)\n- [模型优化](#模型优化)\n- [模型部署](#模型部署)\n- [参考论文](#参考论文)\n## 模型简介\n该代码库用于篮球动作检测+识别, 基于paddle2.0版本开发,结合PaddleVideo中的ppTSM, BMN, attentionLSTM的多个视频模型进行视频时空二阶段检测算法。\n主要分为如下几步\n - 特征抽取\n - 图像特性,ppTSM\n - 音频特征,Vggsound\n - proposal提取,BMN\n - LSTM,动作分类 + 回归\n## 数据准备\n数据集处理代码\n```\n参考https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/datasets\n```\n- 数据集label格式\n```\n{\n \"0\": \"背景\",\n \"1\": \"回放\",\n \"2\": \"进球-三分球\",\n \"3\": \"进球-两分球\",\n \"4\": \"进球-扣篮\",\n \"5\": \"罚球\",\n \"6\": \"跳球\"\n}\n```\n- 数据集gts处理, 将原始标注数据处理成如下json格式\n```\n{\n 'fps': 5,\n 'gts': [\n {\n 'url': 'xxx.mp4',\n 'total_frames': 6341,\n 'actions': [\n {\n \"label_ids\": [6],\n \"label_names\": [\"跳球\"],\n \"start_id\": 395,\n \"end_id\": 399\n },\n ...\n ]\n },\n ...\n ]\n}\n```\n- 数据集抽帧, 由mp4, 得到frames和pcm, 这里需要添加ffmpeg环境\n```\ncd datasets/script && python get_frames_pcm.py", + "type": "code", + "location": "/applications/BasketballAction/README.md:1-69" + }, + "57": { + "file_id": 7, + "content": "This code is for basketball action detection, using PaddlePaddle 2.0 and incorporating various video models from PaddleVideo (ppTSM, BMN, attentionLSTM). The process includes image feature extraction with ppTSM, proposal extraction with BMN, and LSTM-based action classification and regression. Dataset preparation involves data handling, label format specification, gts processing to JSON format, and abstracting frames from mp4 files using ffmpeg.", + "type": "comment" + }, + "58": { + "file_id": 7, + "content": "```\n- 数据预处理后保存格式如下\n```\n |-- datasets # 训练数据集和处理脚本\n |-- basketball # xx数据集\n |-- mp4 # 原始视频.mp4\n |-- frames # 图像帧, fps=5, '.jpg'格式\n |-- pcm # 音频pcm, 音频采样率16000,采用通道数1\n |-- url.list # 视频列表\n |-- label_train.json # 训练集原始gts\n |-- label_val.json # 验证集原始gts\n```\n## 模型训练\n代码参考足球动作检测:https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction\n将该代码库的文件夹 [datasets](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/datasets),[extractor](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/extractor),[train_lstm](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/train_lstm), 拷贝到本代码库复用。\n - image 采样频率fps=5,如果有些动作时间较短,可以适当提高采样频率\n - BMN windows=200,即40s,所以测试自己的数据时,视频时长需大于40s\n### 基础镜像\n```\ndocker pull tmtalgo/paddleaction:action-detection-v2\n```\n### step1 ppTSM训练\n我们提供了篮球数据训练的模型,参考checkpoints_basketball。如果使用提供的pptsm模型,可直接跳过下边的pptsm训练数据处理和训练步骤。", + "type": "code", + "location": "/applications/BasketballAction/README.md:70-99" + }, + "59": { + "file_id": 7, + "content": "This code describes the storage location and structure of a basketball action dataset, including video files (mp4), image frames, audio files (pcm), and JSON files containing ground truth data. 
It also references the PaddleVideo FootballAction application, whose training code is reused here.", + "type": "comment" + }, + "60": { + "file_id": 7, + "content": "如果需要在自己的数据上训练,ppTSM训练代码为:https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\nppTSM文档参考:https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/pp-tsm.md\n#### step1.1 ppTSM 训练数据处理\n由frames结合gts生成训练所需要的正负样本\n```\ncd ${BasketballAction}\ncd datasets/script && python get_instance_for_tsn.py\n# 文件名按照如下格式\n'{}_{}_{}_{}'.format(video_basename, start_id, end_id, label)\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- basketball # xx数据集\n |-- input_for_tsn # tsn/tsm训练的数据\n```\n#### step1.2 ppTSM模型训练\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\ncd ${PaddleVideo}\n# 修改config.yaml参数修改为 ${BasketballAcation}/configs_train/pptsm_basketball.yaml\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=$save_dir/logs \\\n main.py \\\n --validate \\\n -c {BasketballAcation}/configs_train/pptsm_basketball.yaml \\\n -o output_dir=$save_dir\n```\n#### step1.3 ppTSM模型转为预测模式\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0", + "type": "code", + "location": "/applications/BasketballAction/README.md:100-135" + }, + "61": { + "file_id": 7, + "content": "Step 1.1: Prepare ppTSM training data by combining frames and gts to generate positive and negative samples, following the format '{}_{}_{}_{}'.format(video_basename, start_id, end_id, label).\n\nStep 1.2: Train ppTSM model using the prepared dataset by modifying config.yaml parameters and running main.py with distributed launch script.\n\nStep 1.3: Convert trained ppTSM model to prediction mode for inference.", + "type": "comment" + }, + "62": { + "file_id": 7, + "content": "$cd {PaddleVideo}\npython tools/export_model.py -c ${BasketballAcation}/configs_train/pptsm_basketball.yaml \\\n -p ${pptsm_train_dir}/checkpoints/models_pptsm/ppTSM_epoch_00057.pdparams \\\n -o {BasketballAcation}/checkpoints/ppTSM\n```\n#### step1.4 基于ppTSM视频特征提取\nimage and audio特征提取,保存到datasets features文件夹下\n```\ncd ${BasketballAcation}\ncd extractor && python extract_feat.py\n# 特征维度, image(2048) + audio(1024) + pcm(640)\n# 特征保存格式如下,将如下dict保存在pkl格式,用于接下来的BMN训练\nvideo_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features\n 'pcm_feature': np_pcm_features}\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- basketball # xx数据集\n |-- features # 视频的图像+音频特征\n```\n### step2 BMN训练\nBMN训练代码为:https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\nBMN文档参考:https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/localization/bmn.md", + "type": "code", + "location": "/applications/BasketballAction/README.md:136-163" + }, + "63": { + "file_id": 7, + "content": "The provided code is related to a PaddleVideo application called BasketballAction. It performs two steps - extracting features from images and audio, and training a BMN model. The extracted features are stored in the datasets/basketball/features directory. 
The BMN training code can be found at this GitHub link, and more information about BMN can be found in this documentation.", + "type": "comment" + }, + "64": { + "file_id": 7, + "content": "#### step2.1 BMN训练数据处理\n用于提取二分类的proposal,windows=40,根据gts和特征得到BMN训练所需要的数据集\n```\ncd ${BasketballAcation}\ncd datasets/script && python get_instance_for_bmn.py\n# 数据格式\n{\n \"719b0a4bcb1f461eabb152298406b861_753_793\": {\n \"duration_second\": 40.0,\n \"duration_frame\": 200,\n \"feature_frame\": 200,\n \"subset\": \"train\",\n \"annotations\": [\n {\n \"segment\": [\n 15.0,\n 22.0\n ],\n \"label\": \"6.0\",\n \"label_name\": \"跳球\"\n }\n ]\n },\n ...\n}\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- basketball # xx数据集\n |-- input_for_bmn # bmn训练的proposal \n```\n#### step2.2 BMN模型训练\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\ncd ${PaddleVideo}\n# 修改config.yaml参数修为${BasketballAcation}/configs_train/bmn_basketball.yaml\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1\" \\\n --log_dir=$out_dir/logs \\\n main.py \\", + "type": "code", + "location": "/applications/BasketballAction/README.md:165-206" + }, + "65": { + "file_id": 7, + "content": "Step 2.1 involves processing the Basketball Action dataset to generate binary proposals for BMN training, with a window size of 40. This is done using the get_instance_for_bmn.py script in the datasets/script directory. The resulting data format consists of instance identifiers, duration and feature frame numbers, subset information (train or test), and annotations containing segment locations and labels. Step 2.2 involves training the BMN model, requiring modification of the config.yaml file with Basketball Action-specific parameters and launching the main.py script using PaddlePaddle's distributed training functionality.", + "type": "comment" + }, + "66": { + "file_id": 7, + "content": " --validate \\\n -c ${BasketballAcation}/configs_train/bmn_basketball.yaml \\\n -o output_dir=$out_dir\n```\n#### step2.3 BMN模型转为预测模式\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\n${PaddleVideo}\npython tools/export_model.py -c $${BasketballAcation}/configs_train/bmn_basketball.yaml \\\n -p ${bmn_train_dir}/checkpoints/models_bmn/bmn_epoch16.pdparams \\\n -o {BasketballAcation}/checkpoints/BMN\n```\n#### step2.4 BMN模型预测\n得到动作proposal信息: start_id, end_id, score\n```\ncd ${BasketballAcation}\ncd extractor && python extract_bmn.py\n# 数据格式\n[\n {\n \"video_name\": \"c9516c903de3416c97dae91a59e968d7\",\n \"num_proposal\": 5534,\n \"bmn_results\": [\n {\n \"start\": 7850.0,\n \"end\": 7873.0,\n \"score\": 0.77194699622342\n },\n {\n \"start\": 4400.0,\n \"end\": 4443.0,\n \"score\": 0.7663803287641536\n },\n ...\n ]", + "type": "code", + "location": "/applications/BasketballAction/README.md:207-243" + }, + "67": { + "file_id": 7, + "content": "The code is used in the BasketballAction application of the PaddleVideo library. It converts the BMN model to prediction mode (step2.3), exports it, and then performs BMN-based predictions (step2.4) for obtaining action proposal information like start, end, and score. 
This helps identify basketball actions from given videos.", + "type": "comment" + }, + "68": { + "file_id": 7, + "content": " },\n ...\n]\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- basketball # xx数据集\n |-- feature_bmn\n |-- prop.json # bmn 预测结果\n```\n### step3 LSTM训练\nLSTM训练代码为:train_lstm\n#### step3.1 LSTM训练数据处理\n将BMN得到的proposal截断并处理成LSTM训练所需数据集\n```\ncd ${BasketballAcation}\ncd datasets/script && python get_instance_for_lstm.py\n# 数据格式1,label_info\n{\n \"fps\": 5,\n \"results\": [\n {\n \"url\": \"https://xxx.mp4\",\n \"mode\": \"train\", # train or validation\n \"total_frames\": 6128,\n \"num_gts\": 93,\n \"num_proposals\": 5043,\n \"proposal_actions\": [\n {\n \"label\": 6,\n \"norm_iou\": 0.7575757575757576,\n \"norm_ioa\": 0.7575757575757576,\n \"norm_start\": -0.32,\n \"proposal\": {\n \"start\": 5011,\n \"end\": 5036,\n \"score\": 0.7723643666324231\n },", + "type": "code", + "location": "/applications/BasketballAction/README.md:244-284" + }, + "69": { + "file_id": 7, + "content": "This code represents a JSON object containing information about a dataset for LSTM training. It includes the frame rate (fps), whether it's for training or validation, the total number of frames, the number of ground truth (gt) instances, and the number of proposals. The proposals contain details like label, normalized IOU, start time, end time, and score. This data is used to train a LSTM model in BasketballAction application.", + "type": "comment" + }, + "70": { + "file_id": 7, + "content": " \"hit_gts\": {\n \"label_ids\": [\n 6\n ],\n \"label_names\": [\n \"跳球\"\n ],\n \"start_id\": 5003,\n \"end_id\": 5036\n }\n },\n ...\n },\n ...\n}\n# 数据格式2,LSTM训练所需要的feature\n{\n 'features': np.array(feature_hit, dtype=np.float32), # TSM audio and pcm 特征, 可根据需求选择组合\n 'feature_fps': 5, # fps = 5\n 'label_info': {'norm_iou': 0.5, 'label': 3, ...}, # 数据格式1中的'proposal_actions'\n 'video_name': 'c9516c903de3416c97dae91a59e968d7' # video_name\n}\n# 数据格式3,LSTM训练所需label.txt\n'{} {}'.format(filename, label)\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- basketball # xx数据集\n |-- input_for_lstm # LSTM训练数据集\n```\n#### step3.2 LSTM训练\n```\n#conf.yaml修改为 ${BasketballAcation}/configs_train/lstm_basketball.yaml", + "type": "code", + "location": "/applications/BasketballAction/README.md:285-319" + }, + "71": { + "file_id": 7, + "content": "This code is from the PaddleVideo library's BasketballAction application and provides information on data formats for LSTM training. The first format contains label information, start and end IDs in a JSON object. The second format includes features like audio, pcm, fps, and label info in a NumPy array. The third format is the label.txt file for LSTM training. 
After completing these steps, the trained data will be stored in the \"input_for_lstm\" folder within the BasketballAction dataset folder.", + "type": "comment" + }, + "72": { + "file_id": 7, + "content": "cd ${BasketballAcation}\npython -u scenario_lib/train.py \\\n --model_name=ActionNet \\\n --config=${BasketballAcation}/configs_train/lstm_basketball.yaml \\\n --save_dir=${out_dir}\"/models_lstm/\" \\\n --log_interval=5 \\\n --valid_interval=1\n```\n#### step3.3 LSTM模型转为预测模式\n```\n${BasketballAcation}\npython tools/export_model.py -c ${BasketballAction}/train_lstm/conf/conf.yaml \\\n -p ${lstm_train_dir}/checkpoints/models_lstm/bmn_epoch29.pdparams \\\n -o {BasketballAcation}/checkpoints/LSTM\n```\n## 模型推理\n测试数据格式,可参考使用样例\n```\nwget https://videotag.bj.bcebos.com/Applications/basketball/datasets.tar.gz\n```\n测试模型,可使用我们提供的模型\n```\nwget https://videotag.bj.bcebos.com/Applications/basketball/checkpoints_basketball.tar.gz\n```\n运行预测代码\n```\ncd ${BasketballAction}\ncd predict\n# 如果使用自己训练的模型,请将各训练过程中转换的inference模型放到predict库\n# cp -rf ../checkpoints checkpoints_basketball\npython predict.py\n```\n产出文件\n```\n${BasketballAction}/predict/results.json\n```\n## 模型评估\n```\ncd ${BasketballAction}\ncd predict\npython eval.py results.json", + "type": "code", + "location": "/applications/BasketballAction/README.md:320-365" + }, + "73": { + "file_id": 7, + "content": "The code is converting the trained LSTM model to prediction mode, running model inference on testing data, and evaluating the results. This process involves using pre-prepared datasets and models from provided URLs for easy execution of prediction and evaluation.", + "type": "comment" + }, + "74": { + "file_id": 7, + "content": "```\n## 模型优化\n在实际使用场景中可根据视频内容尝试优化策略\n- 可根据动作运动速度,调整抽帧采样率,本代码默认为fps=5\n- 统计动作的时间分布,调整bmn采样窗口\n- 根据图像和音频的关联程度,调整图像和音频特征的融合方式:本代码将图像特征和音频在时间维度对齐,融合后再进入模型训练。也可尝试分别模型训练后,加权融合等\n- 本代码的解决方案也可用于其他动作检测。变换场景后,图像特征重新训练效果更好。音频特征采用的VGGSound训练,如果使用场景仍为生活场景,可直接复用。\n## 模型部署\n本代码解决方案在动作的检测和召回指标F1-score=80.14%\n
\n## 参考论文\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kot", + "type": "code", + "location": "/applications/BasketballAction/README.md:366-389" + }, + "75": { + "file_id": 7, + "content": "This code uses the TSM and BMN models for efficient video understanding, allowing for action detection with a F1-score of 80.14%. The code can optimize based on motion speed, time distribution, and feature fusion methods, and is applicable to other action detection scenarios.", + "type": "comment" + }, + "76": { + "file_id": 7, + "content": "hari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan", + "type": "code", + "location": "/applications/BasketballAction/README.md:389-389" + }, + "77": { + "file_id": 7, + "content": "Code represents authors of a paper or contributors to the project, including hari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan.", + "type": "comment" + }, + "78": { + "file_id": 8, + "content": "/applications/BasketballAction/predict/action_detect/action.py", + "type": "filepath" + }, + "79": { + "file_id": 8, + "content": "The Python code initializes a ModelPredict class for basketball action detection using image, audio, and property features. It infers actions by extracting features and utilizing prepared models for classification before saving the output in 'results.json'.", + "type": "summary" + }, + "80": { + "file_id": 8, + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport functools\nimport numpy as np\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport mfcc.feature_extractor as mfcc_extractor\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nimport models.bmn_infer as prop_model\nimport models.lstm_infer as classify_model\nimport logger\nlogger = logger.Logger()\ndef record_time_info(func):\n \"\"\"decorator func to log cost time for func\n \"\"\"\n @functools.wraps(func)\n def timer(*args):\n \"\"\"log cost time for func\n \"\"\"\n logger.info(\"function [{}] processing ...\".format(func.__name__))\n start_time = time.time()\n retval = func(*args)\n cost_time = round(time.time() - start_time, 5)\n logger.info(\"function [{}] run time: {:.2f} min\".format(func.__name__, cost_time / 60))\n return retval\n return timer\nclass ActionDetection(object):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/action.py:1-44" + }, + "81": { + "file_id": 8, + "content": "The code is a Python file containing a class for performing basketball action detection using a combination of image, audio, and property features. It uses various models for feature extraction and classification. 
The `record_time_info` function is a decorator to log the processing time of different functions.", + "type": "comment" + }, + "82": { + "file_id": 8, + "content": " \"\"\"ModelPredict\"\"\"\n def __init__(self, cfg_file=\"configs/configs.yaml\"):\n cfg = parse_config(cfg_file)\n self.configs = cfg\n print_configs(self.configs, \"Infer\")\n name = 'COMMON'\n self.DEBUG = cfg[name]['DEBUG']\n self.BMN_ONLY = cfg[name]['BMN_ONLY']\n self.LSTM_ONLY = cfg[name]['LSTM_ONLY']\n self.PCM_ONLY = cfg[name]['PCM_ONLY']\n if self.LSTM_ONLY:\n self.prop_dict = {}\n for dataset in ['EuroCup2016']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n self.prop_dict[basename] = item['bmn_results']\n @record_time_info\n def load_model(self):\n \"\"\"\n load_model\n \"\"\"", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/action.py:45-71" + }, + "83": { + "file_id": 8, + "content": "The code initializes a ModelPredict class by parsing a configuration file, setting instance variables based on the configurations, and loading a model. The configurations include settings for debugging, whether to use only BMN (Basketball Motion Network), LSTM (Long Short-Term Memory) or PCM (Prediction of Coming Movement) models, and a dictionary of properties for specific datasets. The load_model method is decorated with @record_time_info, which suggests it records the time taken to execute this function.", + "type": "comment" + }, + "84": { + "file_id": 8, + "content": " if not self.DEBUG:\n self.image_model = image_model.InferModel(self.configs)\n if not self.PCM_ONLY:\n self.audio_model = audio_model.InferModel(self.configs)\n if not self.LSTM_ONLY:\n self.prop_model = prop_model.InferModel(self.configs)\n if not self.BMN_ONLY:\n self.classify_model = classify_model.InferModel(self.configs)\n logger.info(\"==> Action Detection prepared.\")\n @record_time_info\n def infer(self, imgs_path, pcm_path, fps=5):\n \"\"\"\n extract_feature\n \"\"\"\n print(\"imgs_path = \", imgs_path)\n self.imgs_path = imgs_path\n self.pcm_path = pcm_path\n self.configs['COMMON']['fps'] = fps\n logger.info(\"==> input video {}\".format(os.path.basename(self.imgs_path)))\n # step 1: extract feature\n video_features = self.extract_feature()\n # step2: get proposal\n bmn_results = self.extract_proposal(video_features)\n # step3: classify \n material = {'feature': video_features, 'proposal': bmn_results}", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/action.py:72-104" + }, + "85": { + "file_id": 8, + "content": "Code creates InferModels for image, audio, and classification tasks depending on the configurations. 
The main function infers action by extracting features from images and audio, then classifying them using the prepared models.", + "type": "comment" + }, + "86": { + "file_id": 8, + "content": " action_results = self.video_classify(material)\n return bmn_results, action_results\n @record_time_info\n def video_classify(self, material):\n \"\"\"video classify\"\"\"\n if self.BMN_ONLY:\n return []\n action_results = self.classify_model.predict(self.configs, material=material) \n logger.info('action shape {}'.format(np.array(action_results).shape))\n return action_results\n @record_time_info\n def extract_proposal(self, video_features):\n \"\"\"extract proposal\"\"\"\n if self.LSTM_ONLY:\n basename = self.imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = self.prop_dict[basename]\n return bmn_results\n bmn_results = self.prop_model.predict(self.configs, material=video_features)\n logger.info('proposal shape {}'.format(np.array(bmn_results).shape))\n return bmn_results\n @record_time_info\n def extract_feature(self):\n \"\"\"extract feature\"\"\"\n if not self.DEBUG:\n image_path_list = get_images(self.imgs_path)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/action.py:105-133" + }, + "87": { + "file_id": 8, + "content": "This code defines classes for video feature extraction, proposal generation, and action detection. It utilizes model prediction with configured parameters and logs the shapes of results.", + "type": "comment" + }, + "88": { + "file_id": 8, + "content": " self.configs['PPTSM']['frame_list'] = image_path_list\n self.configs['AUDIO']['pcm_file'] = self.pcm_path\n image_features = self.image_model.predict(self.configs)\n if self.PCM_ONLY:\n sample_rate = self.configs['AUDIO']['sample_rate']\n pcm_features = mfcc_extractor.extract_pcm(self.pcm_path, sample_rate)\n audio_features = []\n else:\n audio_features, pcm_features = self.audio_model.predict(self.configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n video_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features,\n 'pcm_feature': np_pcm_features}\n else:\n feature_path = self.imgs_path.replace(\"frames\", \"features\") + '.pkl'", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/action.py:134-152" + }, + "89": { + "file_id": 8, + "content": "The code configures the model inputs, predicts image and audio features, and stores them in the video_features dictionary. If PCM_ONLY is true, it extracts pcm_features separately. Otherwise, it predicts audio_features along with image_features. 
If no features are available, it sets feature_path to the image path's corresponding features file.", + "type": "comment" + }, + "90": { + "file_id": 8, + "content": " video_features = pickle.load(open(feature_path, 'rb'))\n logger.info(\"feature shape {} {} {}\".format(video_features['image_feature'].shape,\n video_features['audio_feature'].shape,\n video_features['pcm_feature'].shape))\n return video_features\nif __name__ == '__main__':\n model_predict = ActionDetection(cfg_file=\"../configs/configs.yaml\")\n model_predict.load_model()\n imgs_path = \"/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895\"\n pcm_path = \"/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm\"\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results = {'bmn_results': bmn_results, 'action_results': action_results}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/action.py:153-174" + }, + "91": { + "file_id": 8, + "content": "This code loads video features from file, checks the shape of image_feature, audio_feature, and pcm_feature arrays, and returns the video features. It then calls the ActionDetection model to infer on given image and audio paths, storing the results in bmn_results and action_results variables. Finally, it saves these results in a 'results.json' file.", + "type": "comment" + }, + "92": { + "file_id": 9, + "content": "/applications/BasketballAction/predict/action_detect/logger.py", + "type": "filepath" + }, + "93": { + "file_id": 9, + "content": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler for the logger. The handler is configured to log INFO level messages and uses a specific log format and date format.", + "type": "summary" + }, + "94": { + "file_id": 9, + "content": "\"\"\"\nlogger\n\"\"\"\nimport os\nimport logging\nclass Logger(logging.Logger):\n \"\"\"Customized logger for news stripper\n \"\"\"\n def __init__(self):\n super(Logger, self).__init__(self)\n if not os.path.exists('logs'):\n os.mkdir('logs')\n handler = logging.FileHandler(\"logs/action_detect.log\")\n # handler.setLevel(logging.DEBUG)\n handler.setLevel(logging.INFO)\n format = \"%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s\"\n datefmt = \"%y-%m-%d %H:%M:%S\"\n formatter = logging.Formatter(format, datefmt)\n handler.setFormatter(formatter)\n self.addHandler(handler)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/logger.py:1-23" + }, + "95": { + "file_id": 9, + "content": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler for the logger. 
The handler is configured to log INFO level messages and uses a specific log format and date format.", + "type": "comment" + }, + "96": { + "file_id": 10, + "content": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py", + "type": "filepath" + }, + "97": { + "file_id": 10, + "content": "This code extracts audio features, converts data to [-1.0, +1.0] range, applies log mel spectrogram, frames into examples for further processing, reads pcm data as bytes, and prints the shape of resulting examples batch.", + "type": "summary" + }, + "98": { + "file_id": 10, + "content": "\"\"\"\naudio feature extract\n\"\"\"\n# coding: utf-8\nimport os\nimport numpy as np\nimport pickle\nimport mfcc.vgg_params as vgg_params\ndef frame(data, window_length, hop_length):\n \"\"\"\n frame\n \"\"\"\n num_samples = data.shape[0]\n num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))\n shape = (num_frames, window_length) + data.shape[1:]\n strides = (data.strides[0] * hop_length, ) + data.strides\n return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)\ndef periodic_hann(window_length):\n \"\"\"\n periodic_hann\n \"\"\"\n return 0.5 - (0.5 *\n np.cos(2 * np.pi / window_length * np.arange(window_length)))\ndef stft_magnitude(signal, fft_length, hop_length=None, window_length=None):\n \"\"\"\n stft_magnitude\n \"\"\"\n frames = frame(signal, window_length, hop_length)\n window = periodic_hann(window_length)\n windowed_frames = frames * window\n return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))\n_MEL_BREAK_FREQUENCY_HERTZ = 700.0\n_MEL_HIGH_FREQUENCY_Q = 1127.0", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:1-41" + }, + "99": { + "file_id": 10, + "content": "This code defines functions for audio feature extraction, including framing the audio data, applying a window function, and computing the short-time Fourier transform (STFT) magnitude. The _MEL_BREAK_FREQUENCY_HERTZ and _MEL_HIGH_FREQUENCY_Q variables are used for converting frequency values to Mel scale.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/1.json b/docs/data/1.json new file mode 100644 index 000000000..32ed2d532 --- /dev/null +++ b/docs/data/1.json @@ -0,0 +1,541 @@ +{ + "100": { + "file_id": 10, + "content": "def hertz_to_mel(frequencies_hertz):\n \"\"\"\n hertz_to_mel\n \"\"\"\n return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz /\n _MEL_BREAK_FREQUENCY_HERTZ))\ndef spectrogram_to_mel_matrix(num_mel_bins=20,\n num_spectrogram_bins=129,\n audio_sample_rate=8000,\n lower_edge_hertz=125.0,\n upper_edge_hertz=3800.0):\n \"\"\"\n spectrogram_to_mel_matrix\n \"\"\"\n nyquist_hertz = audio_sample_rate / 2.\n if lower_edge_hertz >= upper_edge_hertz:\n raise ValueError(\"lower_edge_hertz %.1f >= upper_edge_hertz %.1f\" %\n (lower_edge_hertz, upper_edge_hertz))\n spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz,\n num_spectrogram_bins)\n spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)\n band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),\n hertz_to_mel(upper_edge_hertz),", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:44-68" + }, + "101": { + "file_id": 10, + "content": "This code defines two functions: 'hertz_to_mel' and 'spectrogram_to_mel_matrix'. The 'hertz_to_mel' function converts frequencies in hertz to the Mel scale. 
The 'spectrogram_to_mel_matrix' function creates a mel spectrogram matrix from a given number of mel bins, spectrogram bins, audio sample rate, and frequency limits. It first calculates the spectrogram bins frequencies and then converts them to the Mel scale for each bin edge.", + "type": "comment" + }, + "102": { + "file_id": 10, + "content": " num_mel_bins + 2)\n mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))\n for i in range(num_mel_bins):\n lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]\n lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /\n (center_mel - lower_edge_mel))\n upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /\n (upper_edge_mel - center_mel))\n mel_weights_matrix[:,\n i] = np.maximum(0.0,\n np.minimum(lower_slope, upper_slope))\n mel_weights_matrix[0, :] = 0.0\n return mel_weights_matrix\ndef log_mel_spectrogram(data,\n audio_sample_rate=8000,\n log_offset=0.0,\n window_length_secs=0.025,\n hop_length_secs=0.010,\n **kwargs):\n \"\"\"\n log_mel_spectrogram\n \"\"\"\n window_length_samples = int(round(audio_sample_rate * window_length_secs))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:69-93" + }, + "103": { + "file_id": 10, + "content": "The code defines a function to calculate the mel-frequency cepstral coefficients (MFCC) for audio data. It initializes an empty matrix for storing the MFCCs, and then iterates through each frequency band. For each band, it calculates the lower and upper slopes of the triangular filter used in the MFCC calculation. The code ensures that the calculated values do not go below zero or exceed the maximum value. Finally, it sets the first row of the matrix to zeros before returning the resulting mel-frequency cepstral coefficients.", + "type": "comment" + }, + "104": { + "file_id": 10, + "content": " hop_length_samples = int(round(audio_sample_rate * hop_length_secs))\n fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0)))\n spectrogram = stft_magnitude(data,\n fft_length=fft_length,\n hop_length=hop_length_samples,\n window_length=window_length_samples)\n mel_spectrogram = np.dot(\n spectrogram,\n spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1],\n audio_sample_rate=audio_sample_rate,\n **kwargs))\n return np.log(mel_spectrogram + log_offset)\ndef wav_to_example(wav_data, sample_rate):\n \"\"\"\n wav_to_example\n \"\"\"\n assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype\n pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS -\n vgg_params.STFT_HOP_LENGTH_SECONDS))\n wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num)))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:94-116" + }, + "105": { + "file_id": 10, + "content": "Code extracts audio features like STFT spectrogram, converts to Mel scale, and returns log of Mel spectrogram after padding zeroes.", + "type": "comment" + }, + "106": { + "file_id": 10, + "content": " wav_data = wav_data_extend\n wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0]\n if len(wav_data.shape) > 1:\n wav_data = np.mean(wav_data, axis=1)\n log_mel = log_mel_spectrogram(\n wav_data,\n audio_sample_rate=vgg_params.SAMPLE_RATE,\n log_offset=vgg_params.LOG_OFFSET,\n window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS,\n hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS,\n num_mel_bins=vgg_params.NUM_MEL_BINS,\n 
lower_edge_hertz=vgg_params.MEL_MIN_HZ,\n upper_edge_hertz=vgg_params.MEL_MAX_HZ)\n # Frame features into examples.\n features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS\n example_window_length = int(\n round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))\n example_hop_length = int(\n round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate))\n log_mel_examples = frame(log_mel,\n window_length=example_window_length,\n hop_length=example_hop_length)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:117-139" + }, + "107": { + "file_id": 10, + "content": "This code extracts and preprocesses audio features from a wav file. It converts the wav data to [-1.0, +1.0] range, applies log mel spectrogram, frames into examples with specific window lengths and hop lengths for further processing.", + "type": "comment" + }, + "108": { + "file_id": 10, + "content": " return log_mel_examples\ndef extract_pcm(pcm_file, sample_rate):\n with open(pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = wav_to_example(audio_data, sample_rate)\n return examples\nif __name__ == \"__main__\":\n wav_file = sys.argv[1]\n print(\"wav_file = \", wav_file)\n with open(wav_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype = np.int16)\n examples_batch = wav_to_example(audio_data, 16000)\n print(\"examples_batch.shape\", examples_batch.shape) ", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:140-158" + }, + "109": { + "file_id": 10, + "content": "The code extracts audio features from a wav file using pcm data. It reads the pcm data as bytes, converts it to np.int16 array, applies the wav_to_example function with sample rate 16000 to convert audio data into examples batch, and prints the shape of the resulting examples_batch.", + "type": "comment" + }, + "110": { + "file_id": 11, + "content": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py", + "type": "filepath" + }, + "111": { + "file_id": 11, + "content": "The ModelAudio class extracts audio features using wav_to_example and slices the data into parts, calculating features for each part. 
The predict method appends these features to a list and returns the audio feature list after dividing by sample rate.", + "type": "summary" + }, + "112": { + "file_id": 11, + "content": "\"\"\"\naudio model config\n\"\"\"\nimport numpy as np\nimport mfcc.feature_extractor as feature_extractor\nclass ModelAudio(object):\n \"\"\"\n modelAudio\n \"\"\"\n def __init__(self, configs, use_gpu=1):\n self.use_gpu = use_gpu\n self.audio_fps = configs.COMMON.fps\n self.audio_feat_scale = configs.TSN.audio_scale\n self.sample_rate = 16000\n def predict_slice(self, wav_data, sample_rate):\n \"\"\"\n audio predict\n \"\"\"\n examples_batch = feature_extractor.wav_to_example(\n wav_data, sample_rate)[0]\n return examples_batch\n def predict_audio(self, audio_file):\n \"\"\"\n predict_audio\n \"\"\"\n audio_feature_list = []\n # read pcm\n sample_rate = self.sample_rate\n try:\n with open(audio_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n audio_status = \"audio load success\"\n except Exception as e:\n audio_data = []\n audio_status = \"audio load failed\"", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:1-42" + }, + "113": { + "file_id": 11, + "content": "The code defines a ModelAudio class which takes in audio-related configurations and performs audio feature extraction using the feature_extractor module's wav_to_example function. The class also predicts audio by converting PCM data to numpy array and handles audio file reading exceptions.", + "type": "comment" + }, + "114": { + "file_id": 11, + "content": " step = 1\n len_video = int(len(audio_data) / sample_rate)\n print(len_video)\n for i in range(0, len_video, step):\n audio_data_part = audio_data[i * sample_rate:(i + step) *\n sample_rate]\n feature_audio = self.predict_slice(audio_data_part, sample_rate)\n audio_feature_list.append(feature_audio)\n return audio_feature_list", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:43-51" + }, + "115": { + "file_id": 11, + "content": "The code slices the audio data into parts of size 'step' and calculates features for each part using a predict method, then appends the features to a list. The length of the entire audio data is divided by the sample rate to determine how many steps can fit in it. This function returns the audio feature list.", + "type": "comment" + }, + "116": { + "file_id": 12, + "content": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py", + "type": "filepath" + }, + "117": { + "file_id": 12, + "content": "The code defines global parameters for the VGGish model, including architectural constants, hyperparameters, and optimizer settings. 
It extracts audio features from spectrogram patches using PCA quantization and embedding processing, with options to adjust STFT window and hop lengths, mel frequency bins, and learning rate.", + "type": "summary" + }, + "118": { + "file_id": 12, + "content": "\"\"\"Global parameters for the VGGish model.\nSee vggish_slim.py for more information.\n\"\"\"\n# Architectural constants.\nNUM_FRAMES = 50 # Frames in input mel-spectrogram patch.\nNUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch.\nEMBEDDING_SIZE = 128 # Size of embedding layer.\n# Hyperparameters used in feature and example generation.\nSAMPLE_RATE = 16000\nSTFT_WINDOW_LENGTH_SECONDS = 0.040\nSTFT_HOP_LENGTH_SECONDS = 0.020\nNUM_MEL_BINS = NUM_BANDS\nMEL_MIN_HZ = 125\nMEL_MAX_HZ = 7500\nLOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram.\nEXAMPLE_WINDOW_SECONDS = 1.00 # Each example contains 96 10ms frames\nEXAMPLE_HOP_SECONDS = 1.00 # with zero overlap.\n# Parameters used for embedding postprocessing.\nPCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'\nPCA_MEANS_NAME = 'pca_means'\nQUANTIZE_MIN_VAL = -2.0\nQUANTIZE_MAX_VAL = +2.0\n# Hyperparameters used in training.\nINIT_STDDEV = 0.01 # Standard deviation used to initialize weights.\nLEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer.", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:1-29" + }, + "119": { + "file_id": 12, + "content": "This code sets global parameters for the VGGish model. It defines architectural constants, hyperparameters for feature and example generation, embedding postprocessing, and training. The VGGish model is used to extract audio features from spectrogram patches, with options for PCA-based quantization and embedding processing. Hyperparameters control the STFT window and hop lengths, mel frequency bins, and learning rate for Adam optimizer.", + "type": "comment" + }, + "120": { + "file_id": 12, + "content": "ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer.\n# Names of ops, tensors, and features.\nINPUT_OP_NAME = 'vggish/input_features'\nINPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'\nOUTPUT_OP_NAME = 'vggish/embedding'\nOUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'\nAUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:30-37" + }, + "121": { + "file_id": 12, + "content": "This code sets the Adam optimizer's epsilon value to 1e-8, defines names for input and output operations, tensors, and features. It also assigns the name \"audio_embedding\" to a feature.", + "type": "comment" + }, + "122": { + "file_id": 13, + "content": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py", + "type": "filepath" + }, + "123": { + "file_id": 13, + "content": "The \"InferModel\" class is for audio inference, initializing the model and creating a predictor object. It takes input, performs inference, returns output, and measures time taken. 
The code loads an audio file, sets path, performs prediction, prints shape, first output, and time.", + "type": "summary" + }, + "124": { + "file_id": 13, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"audio infer\"\"\"\n def __init__(self, cfg, name='AUDIO'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:1-37" + }, + "125": { + "file_id": 13, + "content": "This code defines a class named \"InferModel\" for audio inference. It initializes the model by reading configuration files, enabling GPU usage, and creating a predictor object. The input name and handle are stored for later use during inference.", + "type": "comment" + }, + "126": { + "file_id": 13, + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n pcm_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = np.array(data, dtype = 'float32')\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n pcm_list.append(inputs)\n feature_values = np.vstack(feature_list)\n pcm_values = np.vstack(pcm_list)\n return feature_values, pcm_values\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:39-69" + }, + "127": { + "file_id": 13, + "content": "The code defines a model that takes audio input, performs inference using the predictor, and returns output. 
The predict method reads data from infer_config and for each iteration, it prepares inputs, runs inference, collects feature lists and pcm lists, then combines them into feature_values and pcm_values before returning.", + "type": "comment" + }, + "128": { + "file_id": 13, + "content": " pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm'\n t0 = time.time()\n cfg['AUDIO']['pcm_file'] = pcm_path\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print(outputs[0])\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:71-80" + }, + "129": { + "file_id": 13, + "content": "This code loads an audio file, sets the path for it in the configuration file, performs prediction on the model, prints the shape and first output of the prediction, and calculates and prints the time taken in minutes.", + "type": "comment" + }, + "130": { + "file_id": 14, + "content": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py", + "type": "filepath" + }, + "131": { + "file_id": 14, + "content": "This code defines a class for bmn inferencing, initializes a PaddleVideo model using a config file, and detects basketball actions in videos through sliding window techniques. Results are stored and displayed along with inference time.", + "type": "summary" + }, + "132": { + "file_id": 14, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import process_proposal\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"bmn infer\"\"\"\n def __init__(self, cfg, name='BMN'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.nms_thread = cfg[name]['nms_thread']\n self.min_pred_score = cfg[name]['score_thread']\n self.min_frame_thread = cfg['COMMON']['fps']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:1-37" + }, + "133": { + "file_id": 14, + "content": "The code defines a class InferModel, which is used for bmn inferencing. 
It initializes the model using a configuration file and sets properties such as GPU memory, device ID, minimum prediction score threshold, and frame processing thread count.", + "type": "comment" + }, + "134": { + "file_id": 14, + "content": " config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n self.output3_tensor = self.predictor.get_output_handle(output_names[2])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n output3 = self.output3_tensor.copy_to_cpu()\n return output1, output2, output3\n def generate_props(self, pred_bmn, pred_start, pred_end, max_window=200, min_window=5):\n \"\"\"generate_props\"\"\"", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:38-63" + }, + "135": { + "file_id": 14, + "content": "This code is for a basketball action detection model using PaddleVideo. It creates a predictor, defines input and output tensors, runs inference, and generates properties based on predictions for start and end times of an action.", + "type": "comment" + }, + "136": { + "file_id": 14, + "content": " video_len = min(pred_bmn.shape[-1], min(pred_start.shape[-1], pred_end.shape[-1]))\n pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :]\n start_mask = self.boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = self.boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_results = []\n for idx in range(min_window, max_window):\n for jdx in range(video_len):\n start_index = jdx\n end_index = start_index + idx\n if end_index < video_len and start_mask[start_index] == 1 and end_mask[end_index] == 1:\n xmin = start_index\n xmax = end_index\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]\n bmn_score = pred_bmn[idx, jdx]\n conf_score = xmin_score * xmax_score * bmn_score\n score_results.append([xmin, xmax, conf_score])\n return score_results\n def boundary_choose(self, score_list):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:64-86" + }, + "137": { + "file_id": 14, + "content": "This code performs action detection by predicting start and end points, as well as the confidence score for a specific action within a video. It calculates the score_results based on valid start and end indices, taking into account the start and end masks. 
The boundary_choose function is used to choose the boundaries of the action from the given score list.", + "type": "comment" + }, + "138": { + "file_id": 14, + "content": " \"\"\"boundary_choose\"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[0] for items in data]\n winds = [items[1] for items in data]\n feat_info = [items[2] for items in data]\n feature_T = feat_info[0][0]\n feature_N = feat_info[0][1]\n inputs = np.array(inputs)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:87-111" + }, + "139": { + "file_id": 14, + "content": "This code defines two functions, \"boundary_choose\" and \"predict\". The \"boundary_choose\" function takes a score list as input and uses it to generate three different arrays for scoring in front, middle, and back positions. It then creates a mask for the highest peak by comparing these three score arrays. Finally, it returns a binary mask representing boundary locations. The \"predict\" function initializes an infer reader, iterates through data from this reader, processes inputs, and features information to generate feature_T and feature_N.", + "type": "comment" + }, + "140": { + "file_id": 14, + "content": " pred_bmn, pred_sta, pred_end = self.infer(inputs)\n if infer_iter == 0:\n sum_pred_bmn = np.zeros((2, feature_N, feature_T))\n sum_pred_sta = np.zeros((feature_T, ))\n sum_pred_end = np.zeros((feature_T, ))\n sum_pred_cnt = np.zeros((feature_T, ))\n for idx, sub_wind in enumerate(winds):\n sum_pred_bmn[:, :, sub_wind[0]: sub_wind[1]] += pred_bmn[idx]\n sum_pred_sta[sub_wind[0]: sub_wind[1]] += pred_sta[idx]\n sum_pred_end[sub_wind[0]: sub_wind[1]] += pred_end[idx]\n sum_pred_cnt[sub_wind[0]: sub_wind[1]] += np.ones((sub_wind[1] - sub_wind[0], ))\n pred_bmn = sum_pred_bmn / sum_pred_cnt\n pred_sta = sum_pred_sta / sum_pred_cnt\n pred_end = sum_pred_end / sum_pred_cnt\n score_result = self.generate_props(pred_bmn, pred_sta, pred_end)\n results = process_proposal(score_result, self.min_frame_thread, self.nms_thread, self.min_pred_score)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:112-131" + }, + "141": { + "file_id": 14, + "content": "The code is calculating the average of multiple model predictions for each sliding window and then dividing it by the total number of windows to get the final prediction. 
These predictions are used to generate proposals, which are further processed based on some parameters like minimum frame threshold, NMS thread, and minimum prediction score.", + "type": "comment" + }, + "142": { + "file_id": 14, + "content": " return results\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n t0 = time.time()\n outputs = model.predict(cfg, video_features)\n t1 = time.time()\n results = {'proposal': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:133-155" + }, + "143": { + "file_id": 14, + "content": "The code defines a model for action detection, loads configuration file and video features from file paths, predicts the actions using the model, stores results in a dictionary, writes the result to 'results.json', and finally prints the time taken for inference in minutes.", + "type": "comment" + }, + "144": { + "file_id": 15, + "content": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py", + "type": "filepath" + }, + "145": { + "file_id": 15, + "content": "The code initializes an LSTM-based model for basketball action detection using PaddlePaddle's inference API, with preprocessing and GPU memory optimization functions. It loads a pre-trained model, predicts actions in videos, and saves results in JSON format without ASCII conversion.", + "type": "summary" + }, + "146": { + "file_id": 15, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import get_action_result\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.topk = cfg[name]['topk']\n self.frame_offset = cfg[name]['nms_offset']\n self.nms_thread = cfg[name]['nms_thread']\n self.cls_thread = cfg[name]['classify_score_thread']\n self.iou_thread = cfg[name]['iou_score_thread']\n self.label_map_file = cfg['COMMON']['label_dic']", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:1-36" + }, + "147": { + "file_id": 15, + "content": "This code is for an LSTM-based inferencing model in the BasketballAction application. It includes functions for preprocessing, processing results, and using PaddlePaddle's inference API. The class InferModel initializes the model based on a configuration file that contains information such as model and parameter files, GPU memory, device ID, and thread settings for different tasks like NMS and classification scoring. 
It also includes a label mapping file for classification purposes.", + "type": "comment" + }, + "148": { + "file_id": 15, + "content": " self.fps = cfg['COMMON']['fps']\n self.nms_id = 5\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input1_tensor = self.predictor.get_input_handle(input_names[0])\n #self.input2_tensor = self.predictor.get_input_handle(input_names[1])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None):\n \"\"\"infer\"\"\"\n self.input1_tensor.copy_from_cpu(input1_arr)\n self.input1_tensor.set_lod(input1_lod)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:37-61" + }, + "149": { + "file_id": 15, + "content": "This code initializes an LSTM-based predictor model for action detection. It sets FPS, NMS ID, and configures the model to enable GPU usage and memory optimization. The code then creates a zero copy feed fetch operator and assigns input and output tensors for the infer method.", + "type": "comment" + }, + "150": { + "file_id": 15, + "content": " if not input2_arr is None:\n self.input2_tensor.copy_from_cpu(input2_arr)\n self.input2_tensor.set_lod(input2_lod)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n # print(output.shape)\n return output1, output2\n def pre_process(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n return input_arr, [input_lod]\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n results = []\n for infer_iter, data in enumerate(infer_reader()):\n video_id = [[items[-2], items[-1]] for items in data]", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:62-90" + }, + "151": { + "file_id": 15, + "content": "This code appears to be part of a Python class that uses LSTM models for action detection in basketball videos. It preprocesses input data, runs the predictor, and returns output1 and output2 as results. The pre_process function takes an input, creates lod (lengths of dimensions) and arranges sub-items in a specific order to prepare it for the model. 
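The pre_process summary above describes flattening variable-length feature sequences and recording cumulative offsets for set_lod. A hedged standalone sketch of that offset construction (the sequence values here are illustrative, not real features):

```python
import numpy as np

def flatten_with_lod(sequences):
    """Concatenate variable-length sequences and record cumulative offsets,
    the [0, len_1, len_1+len_2, ...] format passed to set_lod above."""
    flat, lod = [], [0]
    for seq in sequences:
        flat.extend(seq)
        lod.append(lod[-1] + len(seq))
    return np.array(flat), [lod]

arr, lod = flatten_with_lod([[1, 2, 3], [4, 5], [6]])
print(arr)   # [1 2 3 4 5 6]
print(lod)   # [[0, 3, 5, 6]]
```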
The predict function uses a reader to iterate through data, performing action detection on each video frame and returning the results.", + "type": "comment" + }, + "152": { + "file_id": 15, + "content": " input1 = [items[0] for items in data]\n input1_arr, input1_lod = self.pre_process(input1)\n output1, output2 = self.infer(input1_arr, input1_lod)\n predictions_id = output1 \n predictions_iou = output2\n for i in range(len(predictions_id)):\n topk_inds = predictions_id[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds_id = predictions_id[i][topk_inds]\n preds_iou = predictions_iou[i][0]\n results.append((video_id[i], preds_id.tolist(), topk_inds.tolist(), preds_iou.tolist()))\n predict_result = get_action_result(results, self.label_map_file, self.fps, \n self.cls_thread, self.iou_thread, \n self.nms_id, self.nms_thread, self.frame_offset)\n return predict_result\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:91-112" + }, + "153": { + "file_id": 15, + "content": "This code is a function that performs action detection on video frames using an LSTM model. It preprocesses the input data, infers predictions from the model, selects the top-k detections for each frame, and then combines these results to generate an action detection result. The results are returned after post-processing with additional functions. A main function is also provided that can be used to run inference on a video with specific configuration settings.", + "type": "comment" + }, + "154": { + "file_id": 15, + "content": " model = InferModel(cfg)\n # proposal total\n prop_dict = {}\n for dataset in ['EuroCup2016', 'WorldCup2018']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n prop_dict[basename] = item['bmn_results']\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n # proposal\n basename = imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = prop_dict[basename]\n material = {'feature': video_features, 'proposal': bmn_results}\n t0 = time.time()\n outputs = model.predict(cfg, material)\n t1 = time.time()\n results = {'actions': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:113-141" + }, + "155": { + "file_id": 15, + "content": "The code loads and initializes a pre-trained LSTM model for action detection. It then retrieves the video features and proposal information from JSON files. 
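The classification step summarized earlier on this entry keeps only the top-k class scores per proposal via argsort. A small self-contained sketch of that selection pattern (toy probabilities, not model output):

```python
import numpy as np

def topk_scores(probs, k=3):
    """Return the indices and scores of the k highest-probability classes,
    mirroring the argsort()[-k:][::-1] pattern in the cached predict() code."""
    topk_inds = probs.argsort()[-k:][::-1]   # highest score first
    return topk_inds.tolist(), probs[topk_inds].tolist()

probs = np.array([0.05, 0.40, 0.10, 0.30, 0.15])
print(topk_scores(probs, k=2))   # ([1, 3], [0.4, 0.3])
```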
Finally, it uses the loaded model to predict actions based on the given material (features and proposals) and saves the results in a json file named 'results.json'.", + "type": "comment" + }, + "156": { + "file_id": 15, + "content": " data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:142-145" + }, + "157": { + "file_id": 15, + "content": "The code dumps the results in JSON format with indentation and without converting special characters to ASCII. Then, it writes this data to a file and prints the time taken in minutes.", + "type": "comment" + }, + "158": { + "file_id": 16, + "content": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py", + "type": "filepath" + }, + "159": { + "file_id": 16, + "content": "This code defines a PaddleVideo-based `InferModel` class for action detection using PPTSM with inference and prediction methods. It loads model, config file, specifies image paths, predicts on images, prints output shape, and time taken for prediction.", + "type": "summary" + }, + "160": { + "file_id": 16, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"pptsm infer\"\"\"\n def __init__(self, cfg, name='PPTSM'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:1-38" + }, + "161": { + "file_id": 16, + "content": "This code defines a class `InferModel` that uses the PPTSM (Pose-aware Two-Stream Temporal Segmentation Model) for action detection. The model is initialized with a configuration file specifying the model and parameter files, as well as GPU memory and device ID settings. The configuration is optimized for efficient inference using feed fetch operations disabled and enabling memory optimization. 
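All three inference wrappers cached here share the same paddle.inference setup pattern (Config, GPU memory, IR and memory optimization, zero-copy handles). A minimal sketch of that pattern follows; the model and params file paths are placeholders, and an exported inference model is assumed to exist before it can actually run.

```python
# Minimal sketch of the Paddle Inference setup used throughout these files.
# "model.pdmodel" / "model.pdiparams" are placeholder paths, not real files.
import numpy as np
from paddle.inference import Config, create_predictor

def build_predictor(model_file, params_file, gpu_mem=2000, device_id=0):
    config = Config(model_file, params_file)
    config.enable_use_gpu(gpu_mem, device_id)   # GPU memory (MB) and device id
    config.switch_ir_optim(True)                # graph-level IR optimizations
    config.enable_memory_optim()
    config.switch_use_feed_fetch_ops(False)     # zero-copy input/output
    return create_predictor(config)

def run_once(predictor, batch):
    input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    input_handle.copy_from_cpu(batch)
    predictor.run()
    return predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()

# usage (assuming an exported inference model exists at these paths):
# predictor = build_predictor("model.pdmodel", "model.pdiparams")
# out = run_once(predictor, np.random.rand(1, 8, 3, 224, 224).astype("float32"))
```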
The input tensor handle for the model is also retrieved.", + "type": "comment" + }, + "162": { + "file_id": 16, + "content": " output_names = self.predictor.get_output_names()\n print(\"output_names = \", output_names)\n #self.output_tensor = self.predictor.get_output_handle(output_names[1])\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[:-1] for items in data]\n inputs = np.array(inputs)\n output = self.infer(inputs)\n #print(\"inputs\", inputs.shape)\n #print(\"outputs\", output.shape)\n feature_list.append(np.squeeze(output))\n feature_list = np.vstack(feature_list)\n return feature_list\nif __name__ == \"__main__\":", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:40-69" + }, + "163": { + "file_id": 16, + "content": "This code defines a class with methods for inferring and predicting actions from the PaddleVideo framework. It uses the PaddlePaddle library for inference and gets output names and handles to extract the results. The code also includes a main function that can be run if the file is executed directly.", + "type": "comment" + }, + "164": { + "file_id": 16, + "content": " cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/' \n imgs_list = get_images(imgs_path)\n t0 = time.time()\n cfg['PPTSM']['frame_list'] = imgs_list\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:70-83" + }, + "165": { + "file_id": 16, + "content": "This code loads a model, config file, and specifies image paths. It then predicts using the loaded model on images in the specified path and prints the shape of the output as well as the time taken to perform prediction. The comment is suitable for code chunks that explain what each section does, like loading a model, reading input files, or performing computations.", + "type": "comment" + }, + "166": { + "file_id": 17, + "content": "/applications/BasketballAction/predict/action_detect/reader/__init__.py", + "type": "filepath" + }, + "167": { + "file_id": 17, + "content": "This code imports and registers various readers for different formats (TSM, PPTSM, AUDIO, BMN, ACTION) to read map files for the model. 
The readers are registered in alphabetical order.", + "type": "summary" + }, + "168": { + "file_id": 17, + "content": "\"\"\"\nread map for model\n\"\"\"\nfrom reader.reader_utils import regist_reader, get_reader\nimport reader.tsminf_reader as tsminf_reader\nimport reader.audio_reader as audio_reader\nimport reader.bmninf_reader as bmninf_reader\nimport reader.feature_reader as feature_reader\n# regist reader, sort by alphabet\nregist_reader(\"TSM\", tsminf_reader.TSMINFReader)\nregist_reader(\"PPTSM\", tsminf_reader.TSMINFReader)\nregist_reader(\"AUDIO\", audio_reader.AudioReader)\nregist_reader(\"BMN\", bmninf_reader.BMNINFReader)\nregist_reader(\"ACTION\", feature_reader.FeatureReader)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/__init__.py:1-15" + }, + "169": { + "file_id": 17, + "content": "This code imports and registers various readers for different formats (TSM, PPTSM, AUDIO, BMN, ACTION) to read map files for the model. The readers are registered in alphabetical order.", + "type": "comment" + }, + "170": { + "file_id": 18, + "content": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py", + "type": "filepath" + }, + "171": { + "file_id": 18, + "content": "The code creates an AudioReader class for youtube-8M dataset, initializing audio readers and loading pcm data. It manages audio batches by appending audios to batch_out until reaching the specified batch size, then yields the batch. Any remaining audios are yielded upon completion.", + "type": "summary" + }, + "172": { + "file_id": 18, + "content": "\"\"\"\naudio reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport os\nimport _pickle as cPickle\n#from .reader_utils import DataReader\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nimport mfcc.feature_extractor as feature_extractor\nclass AudioReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:1-37" + }, + "173": { + "file_id": 18, + "content": "This code defines an AudioReader class for the youtube-8M dataset, which reads features extracted by prior networks. It imports necessary libraries and modules, such as numpy, random, code, DataReader from reader_utils, feature_extractor from mfcc, pickle for file input/output, and StringIO or BytesIO depending on the availability of cPickle. 
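The reader registration shown above is a simple name-to-class registry. The sketch below illustrates that pattern in isolation; DummyReader is a made-up class, and the assumption that get_reader hands back the instance's batch generator is inferred from how the cached callers invoke it, not confirmed by the source.

```python
# Hedged sketch of a regist_reader / get_reader style registry.
readers = {}

def regist_reader(name, reader_cls):
    readers[name.upper()] = reader_cls

def get_reader(name, mode, cfg, material=None):
    # assumption: the registry returns the instance's batch generator
    if name.upper() not in readers:
        raise KeyError("reader '{}' not found".format(name))
    return readers[name.upper()](name, mode, cfg, material=material).create_reader()

class DummyReader:
    def __init__(self, name, mode, cfg, material=None):
        self.cfg = cfg
    def create_reader(self):
        def reader():
            yield ["batch-0"]
        return reader

regist_reader("DUMMY", DummyReader)
for batch in get_reader("dummy", "infer", cfg={})():
    print(batch)   # ['batch-0']
```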
The class inherits from DataReader, indicating it follows a standard data reading structure, and uses a feature extractor to extract audio features.", + "type": "comment" + }, + "174": { + "file_id": 18, + "content": " This is for the three models: lstm, attention cluster, nextvlad\n dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n # set batch size and file list\n self.sample_rate = cfg[self.name.upper()]['sample_rate']\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.pcm_file = cfg[self.name.upper()]['pcm_file']\n self.material = material\n def create_reader(self):\n \"\"\"create_reader\"\"\"\n with open(self.pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = feature_extractor.wav_to_example(audio_data, self.sample_rate)\n # print(examples.shape)\n def reader():\n \"\"\"reader\"\"\"\n batch_out = []\n batch_out_pre = []\n for audio in examples:\n # batch_out.append([audio])", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:38-70" + }, + "175": { + "file_id": 18, + "content": "This code initializes an audio reader for three models (LSTM, Attention Cluster, NextVlad). It takes parameters such as name, mode, and configuration file. The batch size, sample rate, and file list are set according to the given configuration. The pcm data is loaded from a binary file and converted to numpy array. Finally, a reader function is defined that iterates through examples and appends them to batches.", + "type": "comment" + }, + "176": { + "file_id": 18, + "content": " batch_out.append(audio)\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:71-78" + }, + "177": { + "file_id": 18, + "content": "This code is creating and managing audio batches in the audio reader class. It appends each audio to batch_out until it reaches the specified batch size, then yields the batch and resets batch_out. 
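The batching behaviour described here (fill a list to batch_size, yield it, then flush the remainder) is the same generator pattern used by every reader in these files. A compact standalone sketch:

```python
def batched(items, batch_size):
    """Yield fixed-size batches and flush any remainder at the end,
    the same pattern as the cached reader() generators."""
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

print(list(batched(range(7), 3)))   # [[0, 1, 2], [3, 4, 5], [6]]
```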
If there are remaining audios in batch_out after the loop ends, it yields them before returning the reader object.", + "type": "comment" + }, + "178": { + "file_id": 19, + "content": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py", + "type": "filepath" + }, + "179": { + "file_id": 19, + "content": "The code defines BMNINFReader class for reading and processing BMN model data, includes get_sw_prop function generating proposals, filters less than one-second proposals, performs calculations, and creates a reader class to load video data for training or prediction.", + "type": "summary" + }, + "180": { + "file_id": 19, + "content": "\"\"\"\n# @File : bmninf_reader.py \n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport os\nimport random\nimport pickle\nimport json\nimport numpy as np\nimport multiprocessing\nimport numpy as np\nfrom .reader_utils import DataReader\ndef get_sw_prop(duration, window=200, step=10):\n \"\"\"\n get_sw_prop\n \"\"\"\n pr = []\n local_boxes = []\n for k in np.arange(0, duration - window + step, step):\n start_id = k\n end_id = min(duration, k + window)\n if end_id - start_id < window:\n start_id = end_id - window\n local_boxes = (start_id, end_id)\n pr.append(local_boxes)\n def valid_proposal(duration, span):\n \"\"\"\n valid_proposal\n \"\"\"\n # fileter proposals\n # a valid proposal should have at least one second in the video\n real_span = min(duration, span[1]) - span[0]\n return real_span >= 1\n pr = list(filter(lambda x: valid_proposal(duration, x), pr))\n return pr\nclass BMNINFReader(DataReader):\n \"\"\"\n Data reader for BMN model, which was stored as features extracted by prior networks", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:1-49" + }, + "181": { + "file_id": 19, + "content": "This code is defining a class BMNINFReader, which extends DataReader and provides functionality for reading data from BMN model. It includes a function get_sw_prop, which generates proposals of a specific window size and step over a given duration. The class also filters out any proposals that are less than one second long.", + "type": "comment" + }, + "182": { + "file_id": 19, + "content": " dataset cfg: feat_path, feature path,\n tscale, temporal length of BM map,\n dscale, duration scale of BM map,\n anchor_xmin, anchor_xmax, the range of each point in the feature sequence,\n batch_size, batch size of input data,\n num_threads, number of threads of data processing\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.tscale = cfg[self.name.upper()]['tscale'] # 200\n self.dscale = cfg[self.name.upper()]['dscale'] # 200\n self.tgap = 1. / self.tscale\n self.step = cfg[self.name.upper()]['window_step']\n self.material = material\n src_feature = self.material\n image_feature = src_feature['image_feature']\n pcm_feature = src_feature['pcm_feature']\n pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n image_feature = image_feature[:min_length, :]", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:50-73" + }, + "183": { + "file_id": 19, + "content": "This code initializes a class that reads BMNINF data. It takes arguments for name, mode, and configuration (cfg). The tscale and dscale are set from the config file. 
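The get_sw_prop function summarized above slides a fixed-size window over the feature sequence, clamps the final window to the end, and drops windows that cover less than one unit. A hedged, simplified sketch of that proposal enumeration (window and step values below are illustrative, not the configured defaults):

```python
import numpy as np

def sliding_windows(duration, window=200, step=10):
    """Simplified version of the get_sw_prop idea: enumerate fixed-size
    windows over the sequence, clamping the last one to the end."""
    proposals = []
    for start in np.arange(0, duration - window + step, step):
        end = min(duration, start + window)
        if end - start < window:          # clamp so every window is full size
            start = end - window
        proposals.append((int(start), int(end)))
    # keep only windows that cover at least one unit of the sequence
    return [(s, e) for s, e in proposals if min(duration, e) - s >= 1]

print(sliding_windows(25, window=10, step=5))
# [(0, 10), (5, 15), (10, 20), (15, 25)]
```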
The tgap, step, image_feature, and pcm_feature variables are calculated and reshaped accordingly. Minimum length is found to ensure both features have same length.", + "type": "comment" + }, + "184": { + "file_id": 19, + "content": " pcm_feature = pcm_feature[:min_length, :]\n self.features = np.concatenate((image_feature, pcm_feature), axis=1)\n self.duration = len(self.features)\n self.window = self.tscale\n self.get_dataset_dict()\n self.get_match_map()\n self.batch_size = cfg[self.name.upper()]['batch_size']\n if (mode == 'test') or (mode == 'infer'):\n self.num_threads = 1 # set num_threads as 1 for test and infer\n def get_dataset_dict(self):\n \"\"\"\n get_dataset_dict\n \"\"\"\n self.video_list = get_sw_prop(self.duration, self.window, self.step)\n def get_match_map(self):\n \"\"\"\n get_match_map\n \"\"\"\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx\n for jdx in range(1, self.tscale + 1):\n xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:74-105" + }, + "185": { + "file_id": 19, + "content": "This code defines a class with methods for getting dataset dictionary and match map. It takes configuration file as input, extracts relevant features from images and pcm data, sets batch size and number of threads based on mode, and creates video list and match map using duration, window size, step, and gap values.", + "type": "comment" + }, + "186": { + "file_id": 19, + "content": " match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n self.match_map = match_map\n self.anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n def load_file(self, video_wind):\n \"\"\"\n load_file\n \"\"\"\n start_feat_id = video_wind[0]\n end_feat_id = video_wind[1]\n video_feat = self.features[video_wind[0]: video_wind[1]]\n video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n return video_feat\n def create_reader(self):\n \"\"\"\n reader creator for ctcn model\n \"\"\"\n return self.make_infer_reader()\n def make_infer_reader(self):\n \"\"\"\n reader for inference\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n # for video_name in self.video_list:\n for video_wind in self.video_list:\n video_idx = self.video_list.index(video_wind)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:106-141" + }, + "187": { + "file_id": 19, + "content": "The code is a reader for BMNINF files. It transposes, reshapes, and stores match_map data. The load_file function loads video features based on start and end feature IDs. The create_reader function creates an inferencer reader. 
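The match map described above is a grid of normalized [xmin, xmax] anchors indexed by start position and duration, transposed and flattened before use. A small sketch of that construction, using a tiny tscale so the output is easy to inspect (the value 4 is purely illustrative):

```python
import numpy as np

def build_match_map(tscale):
    """Build the (duration, start) -> [xmin, xmax] anchor grid described
    above, then flatten it to shape (tscale * tscale, 2)."""
    tgap = 1.0 / tscale
    match_map = []
    for start_idx in range(tscale):
        xmin = tgap * start_idx
        row = [[xmin, xmin + tgap * d] for d in range(1, tscale + 1)]
        match_map.append(row)
    match_map = np.transpose(np.array(match_map), [1, 0, 2])
    return np.reshape(match_map, [-1, 2])

mm = build_match_map(4)
print(mm.shape)    # (16, 2)
print(mm[:4])      # duration-1 anchors: [[0., 0.25], [0.25, 0.5], ...]
```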
Finally, the make_infer_reader function returns a reader for inference tasks that iterates through video windows.", + "type": "comment" + }, + "188": { + "file_id": 19, + "content": " video_feat = self.load_file(video_wind)\n batch_out.append((video_feat, video_wind, [self.duration, self.dscale]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:142-151" + }, + "189": { + "file_id": 19, + "content": "This code defines a reader class that loads and processes video data, creating batches of features for model training or prediction. It uses the `load_file` method to read video files and appends them to the `batch_out` list. When the list reaches the specified `batch_size`, it yields the batch and resets the list. If there are remaining items in the list upon exiting the function, it yields those final batches.", + "type": "comment" + }, + "190": { + "file_id": 20, + "content": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py", + "type": "filepath" + }, + "191": { + "file_id": 20, + "content": "The FeatureReader class in Python reads data for YouTube-8M dataset, supports LSTM, Attention Cluster, and NextVLAD models, initializes feature reader with parameters, shuffles proposals, generates batches, and yields when batch size is reached.", + "type": "summary" + }, + "192": { + "file_id": 20, + "content": "\"\"\"\nattention-lstm feature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:1-33" + }, + "193": { + "file_id": 20, + "content": "This code is a Python class called FeatureReader, which inherits from DataReader. It serves as a data reader for the YouTube-8M dataset, using features extracted by prior networks. It supports three models: LSTM, Attention Cluster, and NextVLAD. 
The class imports necessary libraries and modules to read, parse, and manipulate the dataset efficiently.", + "type": "comment" + }, + "194": { + "file_id": 20, + "content": " dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.feature = material['feature']\n self.proposal = material['proposal']\n self.fps = 5\n def create_reader(self):\n \"\"\"\n create_reader\n \"\"\"\n image_feature_list = self.feature['image_feature']\n audio_feature_list = self.feature['audio_feature']\n pcm_feature_list = self.feature['pcm_feature']\n pcm_feature_list = pcm_feature_list.reshape((pcm_feature_list.shape[0] * 5, 640))\n fl = self.proposal\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n for prop_info in fl:\n start_id = int(prop_info['start'])\n end_id = int(prop_info['end'])", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:35-71" + }, + "195": { + "file_id": 20, + "content": "The code initializes a feature reader, takes in parameters such as name, mode, configuration, and material (featuring image, audio, and pcm features). It shuffles the proposals if in training mode. The reader function generates batches of data by iterating through the proposal list, extracting relevant features from specific ID ranges, and storing them in a batch_out list.", + "type": "comment" + }, + "196": { + "file_id": 20, + "content": " bmn_score = float(prop_info['score'])\n try:\n image_feature = image_feature_list[start_id: end_id]\n audio_feature = audio_feature_list[int(start_id / self.fps): int(end_id / self.fps)]\n pcm_feature = pcm_feature_list[start_id: end_id]\n image_feature = np.concatenate((image_feature, pcm_feature), axis=1)\n batch_out.append((image_feature, audio_feature, 0, prop_info))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n continue\n return reader", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:72-86" + }, + "197": { + "file_id": 20, + "content": "This code snippet is part of a feature reader for an action detection system. It reads image, audio, and pcm features from feature lists, concatenates them if needed, creates a batch, and yields the batch when it reaches the specified batch size.", + "type": "comment" + }, + "198": { + "file_id": 21, + "content": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py", + "type": "filepath" + }, + "199": { + "file_id": 21, + "content": "This code defines ReaderNotFoundError and ReaderZoo classes for video input data readers, offering a singleton reader_zoo and functions to register and get specific readers. 
The get_reader function returns the reader instance based on name, mode, configuration, and material, while raising ReaderNotFoundError if not found.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/10.json b/docs/data/10.json new file mode 100644 index 000000000..8edf0ede2 --- /dev/null +++ b/docs/data/10.json @@ -0,0 +1,542 @@ +{ + "1000": { + "file_id": 91, + "content": " {\n \"label\": 6,\n \"norm_iou\": 0.7575757575757576,\n \"norm_ioa\": 0.7575757575757576,\n \"norm_start\": -0.32,\n \"proposal\": {\n \"start\": 5011,\n \"end\": 5036,\n \"score\": 0.7723643666324231\n },\n \"hit_gts\": {\n \"label_ids\": [\n 6\n ],\n \"label_names\": [\n \"换人\"\n ],\n \"start_id\": 5003,\n \"end_id\": 5036\n }\n },\n ...\n },\n ...\n}\n```\n- LSTM训练所需要的feature数据格式如下:\n```\n{\n 'features': np.array(feature_hit, dtype=np.float32), # iamge和audio 特征\n 'feature_fps': 5, # fps = 5\n 'label_info': {'norm_iou': 0.5, 'label': 3, ...}, # 数据格式1中的'proposal_actions'", + "type": "code", + "location": "/applications/FootballAction/README.md:409-441" + }, + "1001": { + "file_id": 91, + "content": "This code snippet represents the structure of a single data sample used in training an LSTM model for action recognition. The data includes image and audio features, frame-level labels, and metrics like IOU (Intersection over Union) and IOA (Intersection over Area). The \"proposal\" field contains start, end, and score values to define the segment of interest. The \"hit_gts\" field provides ground truth information about the labeled action segments within the sample.", + "type": "comment" + }, + "1002": { + "file_id": 91, + "content": " 'video_name': 'c9516c903de3416c97dae91a59e968d7' # video_name\n}\n```\n- LSTM训练所需文件列表数据格式如下:\n```\n'{} {}'.format(filename, label)\n```\n##### step3.2 LSTM训练\n训练启动命令如下:\n```bash\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=./football/logs_lstm \\\n main.py \\\n --validate \\\n -c applications/FootballAction/train_proposal/configs/lstm_football.yaml \\\n -o output_dir=./football/lstm\n```\n##### step3.3 导出LSTM推理模型\n模型导出命令如下:\n```bash\npython tools/export_model.py -c applications/FootballAction/train_proposal/configs/lstm_football.yaml \\\n -p ./football/lstm/AttentionLSTM_best.pdparams \\\n -o ./football/inference_model\n```\n\n### 5.2 模型推理\n运行预测代码\n```\ncd predict && python predict.py\n```\n- 默认使用我们提供的于训练文件进行预测,如使用个人训练的模型文件,请对应修改[配置文件](./predict/configs/configs.yaml)中的参数路径\n- 产出文件:results.json\n\n### 5.3 模型评估\n```\n# 包括bmn proposal 评估和最终action评估\ncd predict && python eval.py results.json\n```\n", + "type": "code", + "location": "/applications/FootballAction/README.md:442-493" + }, + "1003": { + "file_id": 91, + "content": "This code represents the data format for listing necessary files for LSTM training, and provides commands for launching LSTM training, exporting the trained model for inference, running predictions with default or custom models, and evaluating the model's performance.", + "type": "comment" + }, + "1004": { + "file_id": 91, + "content": "### 5.4 模型优化\n- 基础特征模型(图像)替换为PP-TSM,准确率由84%提升到94%\n- 基础特征模型(音频)没变动\n- 准确率提升,precision和recall均有大幅提升,F1-score从0.57提升到0.82\n\n### 5.5 模型部署\n本代码解决方案在动作的检测和召回指标F1-score=82%\n\n### 6. 
参考论文\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan", + "type": "code", + "location": "/applications/FootballAction/README.md:494-513" + }, + "1005": { + "file_id": 91, + "content": "The code discusses model optimization, where the base feature model (image) is replaced with PP-TSM, resulting in a 94% accuracy improvement. The audio base feature remains unchanged. This leads to an F1-score increase from 0.57 to 0.82. It also mentions model deployment and provides references for related papers.", + "type": "comment" + }, + "1006": { + "file_id": 92, + "content": "/applications/FootballAction/checkpoints/download.sh", + "type": "filepath" + }, + "1007": { + "file_id": 92, + "content": "This script downloads and extracts four tar files (audio, pptsm, bmn, lstm) related to the FootballAction application within PaddleVideo. The tar files are then deleted after extraction.", + "type": "summary" + }, + "1008": { + "file_id": 92, + "content": "# audio\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/audio.tar\n# pptsm\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/pptsm.tar\n# bmn\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/bmn.tar\n# lstm\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/lstm.tar\ntar -xvf audio.tar\ntar -xvf pptsm.tar\ntar -xvf bmn.tar\ntar -xvf lstm.tar\nrm -f audio.tar\nrm -f pptsm.tar\nrm -f bmn.tar\nrm -f lstm.tar", + "type": "code", + "location": "/applications/FootballAction/checkpoints/download.sh:1-18" + }, + "1009": { + "file_id": 92, + "content": "This script downloads and extracts four tar files (audio, pptsm, bmn, lstm) related to the FootballAction application within PaddleVideo. 
The tar files are then deleted after extraction.", + "type": "comment" + }, + "1010": { + "file_id": 93, + "content": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list", + "type": "filepath" + }, + "1011": { + "file_id": 93, + "content": "This code provides URLs to download 13 EuroCup2016 video files from BCEBOS cloud storage for potential analysis or training data.", + "type": "summary" + }, + "1012": { + "file_id": 93, + "content": "https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/63e51df254d2402fac703b6c4fdb4ea9.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/76b5f7ee28d942988c6b224bfac136bd.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/250b88724acf40dbb6d7e8ccb400ef38.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/c9516c903de3416c97dae91a59e968d7.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/e1982c90cdd74abaacc4d0692070b400.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/1be705a8f67648da8ec4b4296fa80895.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/de23c0b2be3a4eb1990c5c657061fb29.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2754615de6e64c4fb95ce1a8095dc1c1.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/299fe30d8f3b4a45b89313fe31f9f3c0.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/22e89747689e4f7e83e3620620c93269.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:1-11" + }, + "1013": { + "file_id": 93, + "content": "List of EuroCup2016 dataset video URLs for download.", + "type": "comment" + }, + "1014": { + "file_id": 93, + "content": "https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2ceb6c549fc64305a06a75acb355642b.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/719b0a4bcb1f461eabb152298406b861.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/259856b769044b4d8dc94076deb356bf.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d0bd3eab1e794f0f9501c353a6d37827.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/19eb47cc736240d6b2dd930ab69da839.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4435b708af6d48519a6b726144147d51.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/ea16ad2a020643529e257bd6cb11b3c3.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/eeebffbd4ec74222a9c2d0775d79b689.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8cfb4e605af44055b1576c37eb0e3209.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6bca62b57cc449c6935f0b17f28d06be.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/70cfc31e520840b2afca458f93a01ce4.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:12-22" + }, + "1015": { + "file_id": 93, + "content": "This code provides URLs to download various video files from a specified location for the EuroCup2016 dataset.", + "type": "comment" + }, + "1016": { + "file_id": 93, + "content": 
"https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6496960935e845578e391a5916739752.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d6d25403a4bb4784aecff5f21fd00dc5.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/3e23d452a082403391f8abfb87bf2fb4.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4c5d9d9af4f044c4a68d134061dc264f.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6994844c64b44c26b935cee9604bef0a.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d6322cb95f6a4402ac80432b561abd5d.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2c8b5587083a4784a51622e4fec87ccd.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5faa60d70ed141de8560110e840f2048.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/45d08bc5cb0f424f9ed9d7874eb561cd.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6630aaf0e32146088d0b624e9288f071.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f2edbee29c1b4966b3a410260f78fbe3.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:23-33" + }, + "1017": { + "file_id": 93, + "content": "Lists URLs of 13 EuroCup2016 .mp4 video files hosted on BCEBOS cloud storage.", + "type": "comment" + }, + "1018": { + "file_id": 93, + "content": "https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f24116fdd6a54214991db32f7dddef67.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/0265731a0c6f4a9398c88db8e3d4a3bc.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/02d2de09997f4215b06e3b00ff0502a0.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/9c231896c56a43f291a5e190949f4333.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4afbbf9afcd44dfea45b044117cccb48.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/745db97a080d4f44b450dc17a2bcf069.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5933d0ce17854483b81a318d7d45a34e.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d2cfef2da9f84237a6950c7f6659655c.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5572686cb90f440988ded956a60e555d.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8962ac5a332346e180c79d701ae0a175.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f6e64ee9b13a4088b24c45c257894c1e.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:34-44" + }, + "1019": { + "file_id": 93, + "content": "This code lists URLs for video files belonging to the EuroCup2016 dataset, stored on a specific BCEBOS server.", + "type": "comment" + }, + "1020": { + "file_id": 93, + "content": "https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f6ed2b612b3d43baa0726be8b14ebe7c.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8ab7b0cba5744eb3b6fb10003dfda383.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/1f0a0698e38d493988fe42a50f7e8723.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/737fdb054ca141f2a45013c1740dd0a0.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/bab63a9bcf204e4b99c4a887a01bfd60.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:45-49" + }, + "1021": { + "file_id": 93, + "content": "This code contains a list of URLs for EuroCup2016 video files stored in 
\"paddle-model-ecology.bj.bcebos.com/data/EuroCup2016\". Each URL represents an MP4 file related to the event, potentially used for analysis or training data.", + "type": "comment" + }, + "1022": { + "file_id": 94, + "content": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh", + "type": "filepath" + }, + "1023": { + "file_id": 94, + "content": "The script downloads 12 EuroCup2016 dataset videos using 'wget' command, creating a \"mp4\" directory and accessing bj.bcebos.com server under tmt-pub/datasets/EuroCup2016 directory.", + "type": "summary" + }, + "1024": { + "file_id": 94, + "content": "mkdir mp4\ncd mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/63e51df254d2402fac703b6c4fdb4ea9.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/76b5f7ee28d942988c6b224bfac136bd.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/250b88724acf40dbb6d7e8ccb400ef38.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/c9516c903de3416c97dae91a59e968d7.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/e1982c90cdd74abaacc4d0692070b400.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/1be705a8f67648da8ec4b4296fa80895.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/de23c0b2be3a4eb1990c5c657061fb29.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2754615de6e64c4fb95ce1a8095dc1c1.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/299fe30d8f3b4a45b89313fe31f9f3c0.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/22e89747689e4f7e83e3620620c93269.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:1-13" + }, + "1025": { + "file_id": 94, + "content": "This script creates a new directory called \"mp4\" and then changes into it. It then uses the wget command to download 12 MP4 video files from a specified URL, one after another. The purpose of this script is likely to download all the videos in the EuroCup2016 dataset for further use or processing.", + "type": "comment" + }, + "1026": { + "file_id": 94, + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2ceb6c549fc64305a06a75acb355642b.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/719b0a4bcb1f461eabb152298406b861.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/259856b769044b4d8dc94076deb356bf.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d0bd3eab1e794f0f9501c353a6d37827.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/19eb47cc736240d6b2dd930ab69da839.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4435b708af6d48519a6b726144147d51.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/ea16ad2a020643529e257bd6cb11b3c3.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/eeebffbd4ec74222a9c2d0775d79b689.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8cfb4e605af44055b1576c37eb0e3209.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6bca62b57cc449c6935f0b17f28d06be.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/70cfc31e520840b2afca458f93a01ce4.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:14-24" + }, + "1027": { + "file_id": 94, + "content": "This code is using wget to download multiple video files from a specified URL. 
The videos are part of the EuroCup2016 dataset, and each file has a unique identifier.", + "type": "comment" + }, + "1028": { + "file_id": 94, + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6496960935e845578e391a5916739752.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d6d25403a4bb4784aecff5f21fd00dc5.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/3e23d452a082403391f8abfb87bf2fb4.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4c5d9d9af4f044c4a68d134061dc264f.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6994844c64b44c26b935cee9604bef0a.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d6322cb95f6a4402ac80432b561abd5d.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2c8b5587083a4784a51622e4fec87ccd.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5faa60d70ed141de8560110e840f2048.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/45d08bc5cb0f424f9ed9d7874eb561cd.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6630aaf0e32146088d0b624e9288f071.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f2edbee29c1b4966b3a410260f78fbe3.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:25-35" + }, + "1029": { + "file_id": 94, + "content": "This code is using the wget command to download multiple video files from a specific URL. The videos are part of the EuroCup2016 dataset.", + "type": "comment" + }, + "1030": { + "file_id": 94, + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f24116fdd6a54214991db32f7dddef67.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/0265731a0c6f4a9398c88db8e3d4a3bc.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/02d2de09997f4215b06e3b00ff0502a0.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/9c231896c56a43f291a5e190949f4333.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4afbbf9afcd44dfea45b044117cccb48.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/745db97a080d4f44b450dc17a2bcf069.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5933d0ce17854483b81a318d7d45a34e.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d2cfef2da9f84237a6950c7f6659655c.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5572686cb90f440988ded956a60e555d.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8962ac5a332346e180c79d701ae0a175.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f6e64ee9b13a4088b24c45c257894c1e.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:36-46" + }, + "1031": { + "file_id": 94, + "content": "The code is using the 'wget' command to download multiple MP4 video files from different URLs, presumably to create or expand a local dataset of EuroCup2016 videos.", + "type": "comment" + }, + "1032": { + "file_id": 94, + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f6ed2b612b3d43baa0726be8b14ebe7c.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8ab7b0cba5744eb3b6fb10003dfda383.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/1f0a0698e38d493988fe42a50f7e8723.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/737fdb054ca141f2a45013c1740dd0a0.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/bab63a9bcf204e4b99c4a887a01bfd60.mp4", + "type": 
"code", + "location": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:47-51" + }, + "1033": { + "file_id": 94, + "content": "This code uses the wget command to download multiple mp4 files from a specific URL related to EuroCup2016 dataset. It downloads each file one by one, indicated by the different file names. The files are being downloaded from the bj.bcebos.com server under tmt-pub/datasets/EuroCup2016 directory.", + "type": "comment" + }, + "1034": { + "file_id": 95, + "content": "/applications/FootballAction/datasets/EuroCup2016/url.list", + "type": "filepath" + }, + "1035": { + "file_id": 95, + "content": "The given code contains a list of unique URLs for MP4 video files from the \"EuroCup2016\" dataset in the \"FootballAction\" application, which can be used for training or testing purposes.", + "type": "summary" + }, + "1036": { + "file_id": 95, + "content": "mp4/63e51df254d2402fac703b6c4fdb4ea9.mp4\nmp4/76b5f7ee28d942988c6b224bfac136bd.mp4\nmp4/250b88724acf40dbb6d7e8ccb400ef38.mp4\nmp4/c9516c903de3416c97dae91a59e968d7.mp4\nmp4/e1982c90cdd74abaacc4d0692070b400.mp4\nmp4/1be705a8f67648da8ec4b4296fa80895.mp4\nmp4/de23c0b2be3a4eb1990c5c657061fb29.mp4\nmp4/2754615de6e64c4fb95ce1a8095dc1c1.mp4\nmp4/299fe30d8f3b4a45b89313fe31f9f3c0.mp4\nmp4/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4\nmp4/22e89747689e4f7e83e3620620c93269.mp4\nmp4/2ceb6c549fc64305a06a75acb355642b.mp4\nmp4/719b0a4bcb1f461eabb152298406b861.mp4\nmp4/259856b769044b4d8dc94076deb356bf.mp4\nmp4/d0bd3eab1e794f0f9501c353a6d37827.mp4\nmp4/19eb47cc736240d6b2dd930ab69da839.mp4\nmp4/4435b708af6d48519a6b726144147d51.mp4\nmp4/ea16ad2a020643529e257bd6cb11b3c3.mp4\nmp4/eeebffbd4ec74222a9c2d0775d79b689.mp4\nmp4/8cfb4e605af44055b1576c37eb0e3209.mp4\nmp4/6bca62b57cc449c6935f0b17f28d06be.mp4\nmp4/70cfc31e520840b2afca458f93a01ce4.mp4\nmp4/6496960935e845578e391a5916739752.mp4\nmp4/d6d25403a4bb4784aecff5f21fd00dc5.mp4\nmp4/3e23d452a082403391f8abfb87bf2fb4.mp4\nmp4/4c5d9d9af4f044c4a68d134061dc264f.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/url.list:1-26" + }, + "1037": { + "file_id": 95, + "content": "This code contains a list of URLs to mp4 video files from the \"EuroCup2016\" dataset in the \"FootballAction\" application. 
These videos may be used for training or testing purposes.", + "type": "comment" + }, + "1038": { + "file_id": 95, + "content": "mp4/6994844c64b44c26b935cee9604bef0a.mp4\nmp4/d6322cb95f6a4402ac80432b561abd5d.mp4\nmp4/2c8b5587083a4784a51622e4fec87ccd.mp4\nmp4/5faa60d70ed141de8560110e840f2048.mp4\nmp4/45d08bc5cb0f424f9ed9d7874eb561cd.mp4\nmp4/6630aaf0e32146088d0b624e9288f071.mp4\nmp4/f2edbee29c1b4966b3a410260f78fbe3.mp4\nmp4/f24116fdd6a54214991db32f7dddef67.mp4\nmp4/0265731a0c6f4a9398c88db8e3d4a3bc.mp4\nmp4/02d2de09997f4215b06e3b00ff0502a0.mp4\nmp4/9c231896c56a43f291a5e190949f4333.mp4\nmp4/4afbbf9afcd44dfea45b044117cccb48.mp4\nmp4/745db97a080d4f44b450dc17a2bcf069.mp4\nmp4/5933d0ce17854483b81a318d7d45a34e.mp4\nmp4/d2cfef2da9f84237a6950c7f6659655c.mp4\nmp4/5572686cb90f440988ded956a60e555d.mp4\nmp4/8962ac5a332346e180c79d701ae0a175.mp4\nmp4/f6e64ee9b13a4088b24c45c257894c1e.mp4\nmp4/f6ed2b612b3d43baa0726be8b14ebe7c.mp4\nmp4/8ab7b0cba5744eb3b6fb10003dfda383.mp4\nmp4/1f0a0698e38d493988fe42a50f7e8723.mp4\nmp4/737fdb054ca141f2a45013c1740dd0a0.mp4\nmp4/bab63a9bcf204e4b99c4a887a01bfd60.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/url.list:27-49" + }, + "1039": { + "file_id": 95, + "content": "This code contains a list of URLs pointing to MP4 video files from the \"EuroCup2016\" dataset in the \"FootballAction\" application of the PaddleVideo library. The URLs are unique identifiers for each video file, allowing for easy access and retrieval.", + "type": "comment" + }, + "1040": { + "file_id": 96, + "content": "/applications/FootballAction/datasets/EuroCup2016/url_val.list", + "type": "filepath" + }, + "1041": { + "file_id": 96, + "content": "This code snippet appears to be a list of URLs pointing to various MP4 video files. The file names are hashed strings, indicating that the videos may have been previously used for storage or identification purposes.", + "type": "summary" + }, + "1042": { + "file_id": 96, + "content": "mp4/5572686cb90f440988ded956a60e555d.mp4\nmp4/f6e64ee9b13a4088b24c45c257894c1e.mp4\nmp4/259856b769044b4d8dc94076deb356bf.mp4\nmp4/1f0a0698e38d493988fe42a50f7e8723.mp4\nmp4/8cfb4e605af44055b1576c37eb0e3209.mp4", + "type": "code", + "location": "/applications/FootballAction/datasets/EuroCup2016/url_val.list:1-5" + }, + "1043": { + "file_id": 96, + "content": "This code snippet appears to be a list of URLs pointing to various MP4 video files. The file names are hashed strings, indicating that the videos may have been previously used for storage or identification purposes.", + "type": "comment" + }, + "1044": { + "file_id": 97, + "content": "/applications/FootballAction/datasets/script/get_frames_pcm.py", + "type": "filepath" + }, + "1045": { + "file_id": 97, + "content": "This code utilizes ffmpeg to extract frames and PCM audio from video files, creating folders if necessary. 
It can process multiple MP4 files in parallel with up to 10 workers using the \"extract_frames\", \"extract_pcm\", and \"process\" functions.", + "type": "summary" + }, + "1046": { + "file_id": 97, + "content": "\"\"\"\nget frames and pcm from video\n\"\"\"\nimport os\nfrom concurrent import futures\ndataset = \"../EuroCup2016\"\nurl_list = os.path.join(dataset, 'url.list')\ndst_frames = os.path.join(dataset, 'frames')\ndst_pcm = os.path.join(dataset, 'pcm')\nif not os.path.exists(dst_frames):\n os.mkdir(dst_frames)\nif not os.path.exists(dst_pcm):\n os.mkdir(dst_pcm)\ndef extract_frames(video_name, out_folder, fps=5):\n if os.path.exists(out_folder):\n os.system('rm -rf ' + out_folder + '/*')\n os.system('rm -rf ' + out_folder)\n os.makedirs(out_folder)\n cmd = 'ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (video_name, fps,\n out_folder, '%08d')\n os.system(cmd)\ndef extract_pcm(video_name, file_name_pcm):\n cmd = 'ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' % (\n video_name, file_name_pcm)\n os.system(cmd)\ndef process(line):\n print(line)\n mp4_name = os.path.join(dataset, line)\n basename = os.path.basename(line).split('.')[0]\n folder_frame = os.path.join(dst_frames, basename)", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_frames_pcm.py:1-37" + }, + "1047": { + "file_id": 97, + "content": "This code retrieves frames and Pulse Code Modulation (PCM) audio from video files. It uses the ffmpeg tool for extraction, creating folders if they don't exist already, and removes existing files before processing new ones. The \"extract_frames\" function takes a video name and output folder to extract frames at a specified frame rate. The \"extract_pcm\" function converts audio from a video file to PCM format using ffmpeg. The \"process\" function prints each line, presumably for tracking progress or errors.", + "type": "comment" + }, + "1048": { + "file_id": 97, + "content": " filename_pcm = os.path.join(dst_pcm, basename + '.pcm')\n # extract\n extract_frames(mp4_name, folder_frame)\n extract_pcm(mp4_name, filename_pcm)\nif __name__ == \"__main__\":\n with open(url_list, 'r') as f:\n lines = f.readlines()\n lines = [k.strip() for k in lines]\n # multi thread\n with futures.ProcessPoolExecutor(max_workers=10) as executer:\n fs = [executer.submit(process, line) for line in lines]\n #for line in lines:\n # process(line)\n print(\"done\")", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_frames_pcm.py:38-54" + }, + "1049": { + "file_id": 97, + "content": "Code is reading a list of URLs, extracting frames and audio from each MP4 file, then executing the process in multiple threads with up to 10 workers.", + "type": "comment" + }, + "1050": { + "file_id": 98, + "content": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py", + "type": "filepath" + }, + "1051": { + "file_id": 98, + "content": "The code processes ground truth data, generates output for bmn, and combines GT data for each frame. 
It selects video segments, defines instance parameters, converts label data to BMN format, and saves as a numpy array and JSON labeled file.", + "type": "summary" + }, + "1052": { + "file_id": 98, + "content": "\"\"\"\nget instance for bmn\n使用winds=40的滑窗,将所有子窗口的长度之和小于winds的进行合并\n合并后,父窗口代表bmn训练数据,子窗口代表tsn训练数据\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nbmn_window = 40\ndataset = \"../EuroCup2016\"\nfeat_dir = dataset + '/features'\nout_dir = dataset + '/input_for_bmn'\nlabel_files = {\n 'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'\n}\nglobal fps\ndef gen_gts_for_bmn(gts_data):\n \"\"\"\n @param, gts_data, original gts for action detection\n @return, gts_bmn, output gts dict for bmn\n \"\"\"\n fps = gts_data['fps']\n gts_bmn = {'fps': fps, 'gts': []}\n for sub_item in gts_data['gts']:\n url = sub_item['url']\n max_length = sub_item['total_frames']\n # 特征提取没有获取所有帧特征,这里load feature获取准确max_length\n #feat_path = feat_dir + '/' + os.path.basename(url).replace('.mp4', '.pkl')\n #feature_video = pickle.load(open(feat_path, 'rb'))['features']\n #max_length = int(len(feature_video) * 1.0 / fps)\n gts_bmn['gts'].append({\n 'url': url,", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:1-42" + }, + "1053": { + "file_id": 98, + "content": "This code reads original ground truth (gts) data for action detection, sets the frame per second (fps), and generates output gts dict for bmn. It processes each sub-item in the gts_data['gts'], extracts the URL, maximum video length, and load features if not already present. The code then creates a new dictionary with fps and gts list as output gts data for bmn.", + "type": "comment" + }, + "1054": { + "file_id": 98, + "content": " 'total_frames': max_length,\n 'root_actions': []\n })\n sub_actions = sub_item['actions']\n # duration > bmn_window, 直接删除\n for idx, sub_action in enumerate(sub_actions):\n if sub_action['end_id'] - sub_action['start_id'] > bmn_window:\n sub_actions.pop(idx)\n root_actions = [sub_actions[0]]\n # before_id, 前一动作的最后一帧\n # after_id, 后一动作的第一帧\n before_id = 0\n for idx in range(1, len(sub_actions)):\n cur_action = sub_actions[idx]\n duration = (cur_action['end_id'] - root_actions[0]['start_id'])\n if duration > bmn_window:\n after_id = cur_action['start_id']\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n before_id = root_actions[-1]['end_id']", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:43-69" + }, + "1055": { + "file_id": 98, + "content": "This code is filtering out sub-actions that exceed a specified duration (bmn_window). It then creates a root_action list from the remaining sub-actions. 
The code also keeps track of the before_id and after_id to create 'gts' dictionary entries, which include the before_id, after_id, and root_actions for each group of actions that do not exceed the bmn_window duration.", + "type": "comment" + }, + "1056": { + "file_id": 98, + "content": " root_actions = [cur_action]\n else:\n root_actions.append(cur_action)\n if idx == len(sub_actions) - 1:\n after_id = max_length\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n return gts_bmn\ndef combile_gts(gts_bmn, gts_process, mode):\n \"\"\"\n 1、bmn_window 范围内只有一个动作,只取一个目标框\n 2、bmn_window 范围内有多个动作,取三个目标框(第一个动作、最后一个动作、所有动作)\n \"\"\"\n global fps\n fps = gts_process['fps']\n duration_second = bmn_window * 1.0\n duration_frame = bmn_window * fps\n feature_frame = duration_frame\n for item in gts_process['gts']:\n url = item['url']\n basename = os.path.basename(url).split('.')[0]\n root_actions = item['root_actions']\n for root_action in root_actions:\n segments = []\n # all actions", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:70-102" + }, + "1057": { + "file_id": 98, + "content": "This function combines ground truth (GT) data for each frame within a bmn_window range. If there is only one action in the bmn_window, it takes that action; otherwise, it considers three actions: first, last, and all. It then creates segments based on these actions and returns the combined GT data.", + "type": "comment" + }, + "1058": { + "file_id": 98, + "content": " segments.append({\n 'actions': root_action['actions'],\n 'before_id': root_action['before_id'],\n 'after_id': root_action['after_id']\n })\n if len(root_action['actions']) > 1:\n # first action\n segments.append({\n 'actions': [root_action['actions'][0]],\n 'before_id':\n root_action['before_id'],\n 'after_id':\n root_action['actions'][1]['start_id']\n })\n # last action\n segments.append({\n 'actions': [root_action['actions'][-1]],\n 'before_id':\n root_action['actions'][-2]['end_id'],\n 'after_id':\n root_action['after_id']\n })\n for segment in segments:\n before_id = segment['before_id']\n after_id = segment['after_id']\n actions = segment['actions']", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:103-128" + }, + "1059": { + "file_id": 98, + "content": "This code appends segments to a list based on the number of actions in root_action. If there is more than one action, it separates the first and last action into their own segments using before_id and after_id values. 
Finally, it loops through the segments list using a for loop to assign before_id, after_id, and actions to each segment.", + "type": "comment" + }, + "1060": { + "file_id": 98, + "content": " box0 = int(max(actions[-1]['end_id'] - bmn_window, before_id))\n box1 = int(min(actions[0]['start_id'], after_id - bmn_window))\n if box0 <= box1:\n cur_start = random.randint(box0, box1)\n cur_end = cur_start + bmn_window\n name = '{}_{}_{}'.format(basename, cur_start, cur_end)\n annotations = []\n for action in actions:\n label = str(1.0 * action['label_ids'][0])\n label_name = action['label_names'][0]\n seg0 = 1.0 * (action['start_id'] - cur_start)\n seg1 = 1.0 * (action['end_id'] - cur_start)\n annotations.append({\n 'segment': [seg0, seg1],\n 'label': label,\n 'label_name': label_name\n })\n gts_bmn[name] = {\n 'duration_second': duration_second,", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:129-147" + }, + "1061": { + "file_id": 98, + "content": "This code selects a random segment of video from a list of actions, assigns a label to it, and stores the segment information in a dictionary. It uses the start and end IDs of each action to determine the range for the random start point and calculates the segment's position relative to the cur_start value. The code also handles edge cases where the box0 is less than or equal to box1 and creates the annotation dictionary with label and label_name information, as well as the segment duration in seconds.", + "type": "comment" + }, + "1062": { + "file_id": 98, + "content": " 'duration_frame': duration_frame,\n 'feature_frame': feature_frame,\n 'subset': mode,\n 'annotations': annotations\n }\n return gts_bmn\ndef save_feature_to_numpy(gts_bmn, folder):\n global fps\n print('save feature for bmn ...')\n if not os.path.exists(folder):\n os.mkdir(folder)\n process_gts_bmn = {}\n for item, value in gts_bmn.items():\n basename, start_id, end_id = item.split('_')\n if not basename in process_gts_bmn:\n process_gts_bmn[basename] = []\n process_gts_bmn[basename].append({\n 'name': item,\n 'start': int(start_id),\n 'end': int(end_id)\n })\n for item, values in process_gts_bmn.items():\n feat_path = os.path.join(feat_dir, item + '.pkl')\n print(feat_path)\n feature = pickle.load(open(feat_path, 'rb'))\n image_feature = feature['image_feature']\n pcm_feature = feature['pcm_feature']", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:148-178" + }, + "1063": { + "file_id": 98, + "content": "The code defines a function `get_instance_for_bmn` that returns a dictionary containing various parameters for an instance, and another function `save_feature_to_numpy` which saves feature data to a file. The features are split into two types: image and pcm, stored in a dictionary named \"feature\" with keys 'image_feature' and 'pcm_feature'. 
The code then loops through the dictionaries, creating sub-dictionaries for each item with their corresponding start and end indexes, before saving them to a file.", + "type": "comment" + }, + "1064": { + "file_id": 98, + "content": " pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n if min_length == 0:\n continue\n image_feature = image_feature[:min_length, :]\n pcm_feature = pcm_feature[:min_length, :]\n feature_video = np.concatenate((image_feature, pcm_feature), axis=1)\n for value in values:\n save_cut_name = os.path.join(folder, value['name'])\n start_frame = (value['start']) * fps\n end_frame = (value['end']) * fps\n if end_frame > len(feature_video):\n del gts_bmn[value['name']]\n continue\n feature_cut = [\n feature_video[i] for i in range(start_frame, end_frame)\n ]\n np_feature_cut = np.array(feature_cut, dtype=np.float32)\n np.save(save_cut_name, np_feature_cut)\n return gts_bmn\nif __name__ == \"__main__\":\n if not os.path.exists(out_dir):\n os.mkdir(out_dir)\n gts_bmn = {}", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:180-205" + }, + "1065": { + "file_id": 98, + "content": "Reshapes pcm_feature for concatenation, sets min_length based on shorter of two feature arrays, continues if min_length is 0, slices image_feature and pcm_feature to match min_length, concatenates along axis 1 to create feature_video, iterates through values dictionary, creates save_cut_name path, calculates start and end frames in seconds, checks if end frame exceeds length of feature_video, removes key from gts_bmn if end_frame is greater than feature_video length, generates list of feature_video slices within range of start and end frames, converts to numpy array for floating point numbers, saves np_feature_cut as .npy file with name derived from value's 'name'. Returns gts_bmn dictionary.", + "type": "comment" + }, + "1066": { + "file_id": 98, + "content": " for item, value in label_files.items():\n label_file = os.path.join(dataset, value)\n gts_data = json.load(open(label_file, 'rb'))\n gts_process = gen_gts_for_bmn(gts_data)\n gts_bmn = combile_gts(gts_bmn, gts_process, item)\n gts_bmn = save_feature_to_numpy(gts_bmn, out_dir + '/feature')\n with open(out_dir + '/label.json', 'w', encoding='utf-8') as f:\n data = json.dumps(gts_bmn, indent=4, ensure_ascii=False)\n f.write(data)", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:206-216" + }, + "1067": { + "file_id": 98, + "content": "The code is iterating over the label_files, loading JSON data from each file, processing it for BMN format, combining it with existing gts_bmn, and saving the final result as a numpy array and JSON formatted label.", + "type": "comment" + }, + "1068": { + "file_id": 99, + "content": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py", + "type": "filepath" + }, + "1069": { + "file_id": 99, + "content": "This code calculates IoU and IOA, checks hits, stores relevant info in a dictionary, prints URLs for each video using gts data, and splits video features into training and validation datasets for handling football actions. 
It also separates video features and labels into training and validation sets, storing the data in .pkl files for later use.", + "type": "summary" + }, + "1070": { + "file_id": 99, + "content": "\"\"\"\nget instance for lstm\n根据gts计算每个proposal_bmn的iou、ioa、label等信息\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\ndataset = \"../EuroCup2016\"\nfeat_dir = dataset + '/features'\nprop_file = dataset + '/feature_bmn/prop.json'\nout_dir = dataset + '/input_for_lstm'\nlabel_files = {\n 'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'\n}\ndef IoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 -\n inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n ioa = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou, ioa\ndef clc_iou_of_proposal(proposal, gts):\n hit_gts = {}\n label = 0\n norm_start = 0.\n hit = False\n for gt in gts:\n e1 = {'start': proposal['start'], 'end': proposal['end']}\n e2 = {'start': gt['start_id'], 'end': gt['end_id']}", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:1-44" + }, + "1071": { + "file_id": 99, + "content": "This code computes the IoU (intersection over union) and IOA (intersection over area) for proposals and ground truths in a dataset. It takes proposal bounding boxes and ground truth bounding boxes as inputs, calculates their intersections and unions, and outputs the resulting IoUs and IOAs. The calculated IoU and IOA values will be used to determine the labels for the LSTM model's input data.", + "type": "comment" + }, + "1072": { + "file_id": 99, + "content": " iou, ioa = IoU(e1, e2)\n if iou > 0:\n hit = True\n hit_gts = gt\n label = hit_gts['label_ids'][0]\n norm_start = (gt['start_id'] - proposal['start']) * 1.0 / (\n proposal['end'] - proposal['start'])\n break\n res = {\n 'label': label,\n 'norm_iou': iou,\n 'norm_ioa': ioa,\n 'norm_start': norm_start,\n 'proposal': proposal,\n 'hit_gts': hit_gts\n }\n return res\ndef get_bmn_info(gts_data, proposal_data, res_bmn, mode, score_threshold=0.01):\n \"\"\"\n @param, gts_data, original gts for action detection\n @param, proposal_data, proposal actions from bmn\n @param, mode, train or validation\n @return, None.\n \"\"\"\n fps = gts_data['fps']\n res_bmn['fps'] = fps\n for gts_item in gts_data['gts']:\n url = gts_item['url']\n print(url)\n max_length = gts_item['total_frames']\n video_name = os.path.basename(url).split('.')[0]\n if not video_name in proposal_data:\n continue", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:45-80" + }, + "1073": { + "file_id": 99, + "content": "This code calculates IoU and IOA between two sets of data, then checks if there is a hit. It stores the label, normalized start, and other relevant information in a dictionary and returns it. 
The get_bmn_info function takes gts and proposal data and prints the URL for each video, iterating through the gts data.", + "type": "comment" + }, + "1074": { + "file_id": 99, + "content": " gts_actions = gts_item['actions']\n prop_actions = proposal_data[video_name]\n res_bmn['results'].append({\n 'url': url,\n 'mode': mode,\n 'total_frames': max_length,\n 'num_gts': len(gts_actions),\n 'num_proposals': len(prop_actions),\n 'proposal_actions': []\n })\n for proposal in prop_actions:\n if proposal['score'] < score_threshold:\n continue\n proposal['start'] = int(proposal['start'] * 1.0 / fps)\n proposal['end'] = int(proposal['end'] * 1.0 / fps)\n gts_info = clc_iou_of_proposal(proposal, gts_actions)\n res_bmn['results'][-1]['proposal_actions'].append(gts_info)\n return res_bmn\ndef save_feature(label_info, out_dir):\n print('save feature ...')\n fps = label_info['fps']\n out_feature_dir = out_dir + '/feature'\n out_feature_dir = os.path.abspath(out_feature_dir)\n if not os.path.exists(out_feature_dir):\n os.mkdir(out_feature_dir)", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:82-110" + }, + "1075": { + "file_id": 99, + "content": "The code retrieves ground truth (GT) actions and proposal actions from a dataset, then evaluates the Intersection over Union (IoU) of each proposal with the GT actions. If a proposal's score is below a threshold, it is skipped. The IoU values are appended to the 'results' list within a dictionary, along with other information such as URL, mode, total frames, number of GT and proposal actions. Finally, the function returns the dictionary. A separate function saves features in an output directory, creating one if necessary.", + "type": "comment" + }, + "1076": { + "file_id": 99, + "content": " fid_train = open(out_dir + '/train.txt', 'w')\n fid_val = open(out_dir + '/val.txt', 'w')\n for res in label_info['results']:\n basename = os.path.basename(res['url']).split('.')[0]\n print(basename, res['num_proposals'])\n mode = res['mode']\n fid = fid_train if mode == 'train' else fid_val\n feature_path = os.path.join(feat_dir, basename + '.pkl')\n feature_data = pickle.load(open(feature_path, 'rb'))\n image_feature = feature_data['image_feature']\n audio_feature = feature_data['audio_feature']\n max_len_audio = len(audio_feature)\n for proposal in res['proposal_actions']:\n label = proposal['label']\n start_id = proposal['proposal']['start']\n end_id = proposal['proposal']['end']\n # get hit feature\n image_feature_hit = image_feature[start_id * fps:end_id * fps]\n audio_feature_hit = audio_feature[min(start_id, max_len_audio\n ):min(end_id, max_len_audio)]", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:111-130" + }, + "1077": { + "file_id": 99, + "content": "This code is splitting video features into training and validation datasets, handling audio-visual data for football actions. It reads the results from label_info and writes image and audio feature segments to train.txt or val.txt files based on mode (train/val). 
The code iterates through proposal actions, extracting the corresponding image and audio features.", + "type": "comment" + }, + "1078": { + "file_id": 99, + "content": " # save\n anno_info = {\n 'image_feature': np.array(image_feature_hit, dtype=np.float32),\n 'audio_feature': np.array(audio_feature_hit, dtype=np.float32),\n 'feature_fps': fps,\n 'label_info': proposal,\n 'video_name': basename\n }\n save_name = '{}/{}_{}_{}.pkl'.format(out_feature_dir, basename,\n start_id, end_id)\n with open(save_name, 'wb') as f:\n pickle.dump(anno_info, f, protocol=pickle.HIGHEST_PROTOCOL)\n fid.write('{} {}\\n'.format(save_name, label))\n fid_train.close()\n fid_val.close()\n print('done!')\nif __name__ == \"__main__\":\n if not os.path.exists(out_dir):\n os.mkdir(out_dir)\n prop_data = json.load(open(prop_file, 'rb'))\n proposal_data = {}\n for item in prop_data:\n proposal_data[os.path.basename(\n item['video_name'])] = item['bmn_results']\n # get label info\n res_bmn = {'fps': 0, 'results': []}", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:132-161" + }, + "1079": { + "file_id": 99, + "content": "This code saves video features and labels into separate files for training and validation sets. It creates a dictionary of feature information and label, then dumps this data into a .pkl file with the appropriate naming format. Finally, it writes the file name and corresponding label into another file.", + "type": "comment" + }, + "1080": { + "file_id": 99, + "content": " for item, value in label_files.items():\n label_file = os.path.join(dataset, value)\n gts_data = json.load(open(label_file, 'rb'))\n res_bmn = get_bmn_info(gts_data, proposal_data, res_bmn, item)\n with open(out_dir + '/label_info.json', 'w', encoding='utf-8') as f:\n data = json.dumps(res_bmn, indent=4, ensure_ascii=False)\n f.write(data)\n # save feature\n save_feature(res_bmn, out_dir)", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:162-172" + }, + "1081": { + "file_id": 99, + "content": "This code reads label files, loads and processes the data, then saves the processed data (label information) and optional features to specific directories.", + "type": "comment" + }, + "1082": { + "file_id": 100, + "content": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py", + "type": "filepath" + }, + "1083": { + "file_id": 100, + "content": "The code processes video data, extracts action instances, creates frames and stores them as pickle files for a dataset, with potential data splitting for training and validation.", + "type": "summary" + }, + "1084": { + "file_id": 100, + "content": "\"\"\"\nget instance for tsn\npositive: 标注后的动作区间,一个区间所有frames生成一个pkl\nnegative: 标注后的非动作区间,随机取N个区间生成N个pkl,每个区间长度等于最近的前一个动作区间的长度\n\"\"\"\nimport os\nimport json\nimport numpy as np\nimport random\nimport pickle\nfrom concurrent import futures\ndataset = \"../EuroCup2016\"\nframes_dir = dataset + '/frames'\nlabel_files = {'train': 'label_cls8_train.json', 'val': 'label_cls8_val.json'}\ndef process(item, fps, save_folder):\n actions_pos = []\n actions_neg = []\n url = item['url']\n print(url)\n basename = os.path.basename(url).split('.')[0]\n actions = item['actions']\n # pos\n for action in actions:\n actions_pos.append({\n 'label': action['label_ids'],\n 'start': action['start_id'] * fps,\n 'end': action['end_id'] * fps\n })\n # neg\n for idx, pos in enumerate(actions_pos):\n if idx == len(actions_pos) - 1:\n break\n len_pos = 
pos['end'] - pos['start']\n duration_start = [pos['end'], actions_pos[idx + 1]['start'] - len_pos]\n if duration_start[1] - duration_start[0] < 3:", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:1-38" + }, + "1085": { + "file_id": 100, + "content": "This code is processing video data by extracting positive and negative action instances. Positive action instances are frames corresponding to annotated action intervals, while negative action instances are randomly selected frames from non-action intervals. The code reads JSON files containing labels and frame information, then processes each item by appending the start and end times of the action intervals. The length of positive action intervals is used to determine the start time for negative action intervals, with a minimum duration constraint between them.", + "type": "comment" + }, + "1086": { + "file_id": 100, + "content": " continue\n for k in range(1, 3):\n start_frame = random.randint(duration_start[0], duration_start[1])\n end_frame = start_frame + len_pos\n actions_neg.append({\n 'label': [0],\n 'start': start_frame,\n 'end': end_frame\n })\n # save pkl\n for item in np.concatenate((actions_pos, actions_neg), axis=0):\n start = item['start']\n end = item['end']\n label = item['label']\n label_str = str(label[0])\n if len(item['label']) == 2:\n label_str = label_str + '-' + str(label[1])\n frames = []\n for ii in range(start, end + 1):\n img = os.path.join(frames_dir, basename, '%08d.jpg' % ii)\n with open(img, 'rb') as f:\n data = f.read()\n frames.append(data)\n # print(label_str)\n outname = '%s/%s_%08d_%08d_%s.pkl' % (save_folder, basename, start, end,\n label_str)\n with open(outname, 'wb') as f:", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:39-65" + }, + "1087": { + "file_id": 100, + "content": "Code is iterating over frames and creating positive (label=1) and negative (label=0) action instances. It randomly sets the start frame, calculates end frame, appends to 'actions_pos' or 'actions_neg'. Then concatenates both lists, loops through items in the list, extracts start/end frames, label, and iterates over frames range to read images and store them in 'frames'. 
It forms an output file name using base name, start and end frame numbers, and labels, then writes frames to a .pkl file.", + "type": "comment" + }, + "1088": { + "file_id": 100, + "content": " pickle.dump((basename, label, frames), f, -1)\ndef gen_instance_pkl(label_data, save_folder):\n fps = label_data['fps']\n gts = label_data['gts']\n with futures.ProcessPoolExecutor(max_workers=10) as executer:\n fs = [executer.submit(process, gt, fps, save_folder) for gt in gts]\n #for gt in gts:\n # process(gt, fps, save_folder)\nif __name__ == \"__main__\":\n for item, value in label_files.items():\n save_folder = os.path.join(dataset, 'input_for_pptsm', item)\n if not os.path.exists(save_folder):\n os.makedirs(save_folder)\n label_file = os.path.join(dataset, value)\n label_data = json.load(open(label_file, 'rb'))\n gen_instance_pkl(label_data, save_folder)\n # gen train val list\n #data_dir = '../EuroCup2016/input_for_pptsm/'\n data_dir = os.path.abspath(os.path.join(dataset, 'input_for_pptsm'))\n os.system('find ' + data_dir + '/train -name \"*.pkl\" > ' + data_dir +\n '/train.list')\n os.system('find ' + data_dir + '/val -name \"*.pkl\" > ' + data_dir +", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:66-96" + }, + "1089": { + "file_id": 100, + "content": "The code is creating instances for a dataset, processing data using multiprocessing, and saving them as pickle files. It also generates train and val lists of pickle files for further usage.", + "type": "comment" + }, + "1090": { + "file_id": 100, + "content": " '/val.list')", + "type": "code", + "location": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:97-97" + }, + "1091": { + "file_id": 100, + "content": "This line of code is likely specifying a file path for a validation list ('val.list') which could be used in the context of data splitting or model evaluation on a separate dataset subset.", + "type": "comment" + }, + "1092": { + "file_id": 101, + "content": "/applications/FootballAction/extractor/extract_bmn.py", + "type": "filepath" + }, + "1093": { + "file_id": 101, + "content": "This code utilizes the BAIDU CLOUD action to classify videos, extracts features, and performs prediction using a pre-trained model. It saves proposal counts and bounding box results in a JSON file with UTF-8 encoding and indentation.", + "type": "summary" + }, + "1094": { + "file_id": 101, + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport shutil\nimport numpy as np\nsys.path.append(\"../predict/action_detect\")\nimport models.bmn_infer as prop_model\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport utils.config_utils as config_utils\nimport logger\nlogger = logger.Logger()\ndef load_model(cfg_file=\"configs/configs.yaml\"):\n \"\"\"\n load_model\n \"\"\"\n logger.info(\"load model ... \")\n global infer_configs\n infer_configs = parse_config(cfg_file)\n print_configs(infer_configs, \"Infer\")\n t0 = time.time()\n global prop_model\n prop_model = prop_model.InferModel(infer_configs)\n t1 = time.time()\n logger.info(\"step0: load model time: {} min\\n\".format((t1 - t0) * 1.0 / 60))\ndef video_classify(video_name):\n \"\"\"\n extract_feature\n \"\"\"\n logger.info('predict ... 
')\n logger.info(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")", + "type": "code", + "location": "/applications/FootballAction/extractor/extract_bmn.py:1-49" + }, + "1095": { + "file_id": 101, + "content": "This code is loading a model for video classification using the BAIDU CLOUD action. It first loads the configuration file, then prints the configurations, and finally initializes the InferModel class with the loaded configurations. The `video_classify` function takes a video name as input, likely to perform feature extraction or prediction on that video.", + "type": "comment" + }, + "1096": { + "file_id": 101, + "content": " pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n # step 1: extract feature\n feature_path = video_name.replace(\".mp4\", \".pkl\").replace(\"mp4\", \"features\")\n video_features = pickle.load(open(feature_path, 'rb'))\n # step2: get proposal\n t0 = time.time()\n bmn_results = prop_model.predict(infer_configs, material=video_features)\n t1 = time.time()\n logger.info(np.array(bmn_results).shape)\n logger.info(\"step2: proposal time: {} min\".format((t1 - t0) * 1.0 / 60))\n return bmn_results\nif __name__ == '__main__':\n dataset_dir = \"../datasets/EuroCup2016\"\n if not os.path.exists(dataset_dir + '/feature_bmn'):\n os.mkdir(dataset_dir + '/feature_bmn')\n results = []\n load_model()\n video_url = os.path.join(dataset_dir, 'url.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n for line in lines:\n bmn_results = video_classify(line)\n results.append({\n 'video_name': os.path.basename(line).split('.')[0],", + "type": "code", + "location": "/applications/FootballAction/extractor/extract_bmn.py:50-83" + }, + "1097": { + "file_id": 101, + "content": "This code extracts features from videos and predicts proposals using a pre-trained model. 
It loads the necessary configurations, creates feature directories if they don't exist, reads video URLs from a file, processes each video to obtain bounding box results, and saves these results into a list of dictionaries for further analysis or processing.", + "type": "comment" + }, + "1098": { + "file_id": 101, + "content": " 'num_proposal': len(bmn_results),\n 'bmn_results': bmn_results\n })\n with open(dataset_dir + '/feature_bmn/prop.json', 'w',\n encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)", + "type": "code", + "location": "/applications/FootballAction/extractor/extract_bmn.py:84-91" + }, + "1099": { + "file_id": 101, + "content": "This code saves the number of proposals and a list of bounding box results for each proposal in a JSON file, formatting it with indentation and using UTF-8 encoding.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/11.json b/docs/data/11.json new file mode 100644 index 000000000..d0f886a1b --- /dev/null +++ b/docs/data/11.json @@ -0,0 +1,541 @@ +{ + "1100": { + "file_id": 102, + "content": "/applications/FootballAction/extractor/extract_feat.py", + "type": "filepath" + }, + "1101": { + "file_id": 102, + "content": "This Python script loads Baidu Cloud models for video classification, extracts audio and pcm features, logs details, saves features in a pickle file, creates \"features\" directory if necessary, and classifies videos from a specified dataset directory.", + "type": "summary" + }, + "1102": { + "file_id": 102, + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport shutil\nimport numpy as np\nsys.path.append(\"../predict/action_detect\")\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport utils.config_utils as config_utils\nimport logger\nlogger = logger.Logger()\ndef load_model(cfg_file=\"configs/configs.yaml\"):\n \"\"\"\n load_model\n \"\"\"\n logger.info(\"load model ... \")\n global infer_configs\n infer_configs = parse_config(cfg_file)\n print_configs(infer_configs, \"Infer\")\n t0 = time.time()\n global image_model, audio_model\n image_model = image_model.InferModel(infer_configs)\n audio_model = audio_model.InferModel(infer_configs)\n t1 = time.time()\n logger.info(\"step0: load model time: {} min\\n\".format((t1 - t0) * 1.0 / 60))\ndef video_classify(video_name):\n \"\"\"\n extract_feature\n \"\"\"\n logger.info('predict ... ')", + "type": "code", + "location": "/applications/FootballAction/extractor/extract_feat.py:1-50" + }, + "1103": { + "file_id": 102, + "content": "This Python script is for the Baidu Cloud action, loading and initializing image and audio models according to a given configuration file. It also provides a function to classify videos. 
The script logs information about model loading time and the progress of video classification.", + "type": "comment" + }, + "1104": { + "file_id": 102, + "content": " logger.info(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n # step 1: extract feature\n t0 = time.time()\n image_path_list = get_images(imgs_path)\n infer_configs['PPTSM']['frame_list'] = image_path_list\n infer_configs['AUDIO']['pcm_file'] = pcm_path\n image_features = image_model.predict(infer_configs)\n audio_features, pcm_features = audio_model.predict(infer_configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n t1 = time.time()\n logger.info('{} {} {}'.format(np_image_features.shape,\n np_audio_features.shape,\n np_pcm_features.shape))\n logger.info(\"step1: feature extract time: {} min\".format(\n (t1 - t0) * 1.0 / 60))\n video_features = {\n 'image_feature': np_image_features,", + "type": "code", + "location": "/applications/FootballAction/extractor/extract_feat.py:51-74" + }, + "1105": { + "file_id": 102, + "content": "Extracting video features, specifically images and audio. Converting extracted features to numpy arrays. Logging shapes of the arrays and time taken for feature extraction.", + "type": "comment" + }, + "1106": { + "file_id": 102, + "content": " 'audio_feature': np_audio_features,\n 'pcm_feature': np_pcm_features\n }\n # save feature\n feature_path = video_name.replace(\".mp4\", \".pkl\").replace(\"mp4\", \"features\")\n feat_pkl_str = pickle.dumps(video_features,\n protocol=pickle.HIGHEST_PROTOCOL)\n with open(feature_path, 'wb') as fout:\n fout.write(feat_pkl_str)\nif __name__ == '__main__':\n dataset_dir = \"../datasets/EuroCup2016\"\n if not os.path.exists(dataset_dir + '/features'):\n os.mkdir(dataset_dir + '/features')\n load_model()\n video_url = os.path.join(dataset_dir, 'url.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n for line in lines:\n video_classify(line)", + "type": "code", + "location": "/applications/FootballAction/extractor/extract_feat.py:75-100" + }, + "1107": { + "file_id": 102, + "content": "The code extracts audio and pcm features from video files, saves them in a pickle file named after the original video, creates a \"features\" directory if it doesn't exist, then classifies each video based on its location in a specified dataset directory.", + "type": "comment" + }, + "1108": { + "file_id": 103, + "content": "/applications/FootballAction/predict/action_detect/action.py", + "type": "filepath" + }, + "1109": { + "file_id": 103, + "content": "This code employs machine learning and deep learning for Baidu Cloud's action detection system, including preprocessing, feature extraction, model application, and time-tracked execution. 
It configures PPTSM model, predicts features from data, creates a video feature dictionary, loads pre-existing features, checks shapes, and writes results to JSON file.", + "type": "summary" + }, + "1110": { + "file_id": 103, + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport functools\nimport numpy as np\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport mfcc.feature_extractor as mfcc_extractor\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nimport models.bmn_infer as prop_model\nimport models.lstm_infer as classify_model\nimport logger\nlogger = logger.Logger()\ndef record_time_info(func):\n \"\"\"decorator func to log cost time for func\n \"\"\"\n @functools.wraps(func)\n def timer(*args):\n \"\"\"log cost time for func\n \"\"\"\n logger.info(\"function [{}] processing ...\".format(func.__name__))\n start_time = time.time()\n retval = func(*args)\n cost_time = round(time.time() - start_time, 5)\n logger.info(\"function [{}] run time: {:.2f} min\".format(func.__name__, cost_time / 60))\n return retval\n return timer\nclass ActionDetection(object):", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/action.py:1-44" + }, + "1111": { + "file_id": 103, + "content": "This code is for the Baidu Cloud action detection system, which uses machine learning and deep learning models to classify actions from both audio and image inputs. It includes utilities for preprocessing data, extracting features, and applying various models including image, audio, and propensity models. The code also has a logger module to log processing time information. A class ActionDetection is defined which likely handles the overall action detection process.", + "type": "comment" + }, + "1112": { + "file_id": 103, + "content": " \"\"\"ModelPredict\"\"\"\n def __init__(self, cfg_file=\"configs/configs.yaml\"):\n cfg = parse_config(cfg_file)\n self.configs = cfg\n print_configs(self.configs, \"Infer\")\n name = 'COMMON'\n self.DEBUG = cfg[name]['DEBUG']\n self.BMN_ONLY = cfg[name]['BMN_ONLY']\n self.LSTM_ONLY = cfg[name]['LSTM_ONLY']\n self.PCM_ONLY = cfg[name]['PCM_ONLY']\n if self.LSTM_ONLY:\n self.prop_dict = {}\n for dataset in ['EuroCup2016']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n self.prop_dict[basename] = item['bmn_results']\n @record_time_info\n def load_model(self):\n \"\"\"\n load_model\n \"\"\"", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/action.py:45-71" + }, + "1113": { + "file_id": 103, + "content": "This code initializes a ModelPredict object with various configuration settings and properties. It reads configuration data from a specified file, prints relevant information for debugging, and sets attributes related to model components such as BMN_ONLY, LSTM_ONLY, and PCM_ONLY. 
If LSTM_ONLY is set to true, it populates a prop_dict with video names and their corresponding BMN results for later use in the load_model function.", + "type": "comment" + }, + "1114": { + "file_id": 103, + "content": " if not self.DEBUG:\n self.image_model = image_model.InferModel(self.configs)\n if not self.PCM_ONLY:\n self.audio_model = audio_model.InferModel(self.configs)\n if not self.LSTM_ONLY:\n self.prop_model = prop_model.InferModel(self.configs)\n if not self.BMN_ONLY:\n self.classify_model = classify_model.InferModel(self.configs)\n logger.info(\"==> Action Detection prepared.\")\n @record_time_info\n def infer(self, imgs_path, pcm_path, fps=5):\n \"\"\"\n extract_feature\n \"\"\"\n self.imgs_path = imgs_path\n self.pcm_path = pcm_path\n self.configs['COMMON']['fps'] = fps\n logger.info(\"==> input video {}\".format(os.path.basename(self.imgs_path)))\n # step 1: extract feature\n video_features = self.extract_feature()\n # step2: get proposal\n bmn_results = self.extract_proposal(video_features)\n # step3: classify \n material = {'feature': video_features, 'proposal': bmn_results}", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/action.py:72-103" + }, + "1115": { + "file_id": 103, + "content": "This code initializes models for image, audio, property prediction, and action classification. If DEBUG is not set, it creates InferModel instances for each model type. It then extracts features from input video, gets proposals using BMN (Bidirectional Mixture of Experts Network), and classifies the actions based on these features and proposals.", + "type": "comment" + }, + "1116": { + "file_id": 103, + "content": " action_results = self.video_classify(material)\n return bmn_results, action_results\n @record_time_info\n def video_classify(self, material):\n \"\"\"video classify\"\"\"\n if self.BMN_ONLY:\n return []\n action_results = self.classify_model.predict(self.configs, material=material) \n logger.info('action shape {}'.format(np.array(action_results).shape))\n return action_results\n @record_time_info\n def extract_proposal(self, video_features):\n \"\"\"extract proposal\"\"\"\n if self.LSTM_ONLY:\n basename = self.imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = self.prop_dict[basename]\n return bmn_results\n bmn_results = self.prop_model.predict(self.configs, material=video_features)\n logger.info('proposal shape {}'.format(np.array(bmn_results).shape))\n return bmn_results\n @record_time_info\n def extract_feature(self):\n \"\"\"extract feature\"\"\"\n if not self.DEBUG:\n image_path_list = get_images(self.imgs_path)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/action.py:104-132" + }, + "1117": { + "file_id": 103, + "content": "The code contains multiple methods: `video_classify`, `extract_proposal`, and `extract_feature`. The `video_classify` method predicts actions using a classification model, while the `extract_proposal` method extracts proposals (BMN results) for an input video. Both methods are decorated with the `@record_time_info` decorator to track execution time. 
The `extract_feature` method extracts features from images in the given path and is only executed if `DEBUG` flag is not set.", + "type": "comment" + }, + "1118": { + "file_id": 103, + "content": " self.configs['PPTSM']['frame_list'] = image_path_list\n self.configs['AUDIO']['pcm_file'] = self.pcm_path\n image_features = self.image_model.predict(self.configs)\n if self.PCM_ONLY:\n sample_rate = self.configs['AUDIO']['sample_rate']\n pcm_features = mfcc_extractor.extract_pcm(self.pcm_path, sample_rate)\n audio_features = []\n else:\n audio_features, pcm_features = self.audio_model.predict(self.configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n video_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features,\n 'pcm_feature': np_pcm_features}\n else:\n feature_path = self.imgs_path.replace(\"frames\", \"features\") + '.pkl'", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/action.py:133-151" + }, + "1119": { + "file_id": 103, + "content": "This code configures the PPTSM model with image and audio data, then predicts features for both. If PCM_ONLY is True, it extracts MFCC from pcm file. It creates a video feature dictionary containing the predicted image, audio, and (if applicable) pcm features. If no input images are given, it loads the corresponding features from the specified feature path.", + "type": "comment" + }, + "1120": { + "file_id": 103, + "content": " video_features = pickle.load(open(feature_path, 'rb'))\n logger.info(\"feature shape {} {} {}\".format(video_features['image_feature'].shape,\n video_features['audio_feature'].shape,\n video_features['pcm_feature'].shape))\n return video_features\nif __name__ == '__main__':\n model_predict = ActionDetection(cfg_file=\"../configs/configs.yaml\")\n model_predict.load_model()\n imgs_path = \"/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895\"\n pcm_path = \"/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm\"\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results = {'bmn_results': bmn_results, 'action_results': action_results}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/action.py:152-173" + }, + "1121": { + "file_id": 103, + "content": "Code loads pre-existing video features from a file, checks their shapes and returns them for further processing. It then creates an instance of the ActionDetection model, loads the model, and calls infer function with image and audio paths. Finally, it writes results to a JSON file.", + "type": "comment" + }, + "1122": { + "file_id": 104, + "content": "/applications/FootballAction/predict/action_detect/logger.py", + "type": "filepath" + }, + "1123": { + "file_id": 104, + "content": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler for the logger. 
The handler is configured to log INFO level messages and uses a specific log format and date format.", + "type": "summary" + }, + "1124": { + "file_id": 104, + "content": "\"\"\"\nlogger\n\"\"\"\nimport os\nimport logging\nclass Logger(logging.Logger):\n \"\"\"Customized logger for news stripper\n \"\"\"\n def __init__(self):\n super(Logger, self).__init__(self)\n if not os.path.exists('logs'):\n os.mkdir('logs')\n handler = logging.FileHandler(\"logs/action_detect.log\")\n # handler.setLevel(logging.DEBUG)\n handler.setLevel(logging.INFO)\n format = \"%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s\"\n datefmt = \"%y-%m-%d %H:%M:%S\"\n formatter = logging.Formatter(format, datefmt)\n handler.setFormatter(formatter)\n self.addHandler(handler)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/logger.py:1-23" + }, + "1125": { + "file_id": 104, + "content": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler for the logger. The handler is configured to log INFO level messages and uses a specific log format and date format.", + "type": "comment" + }, + "1126": { + "file_id": 105, + "content": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py", + "type": "filepath" + }, + "1127": { + "file_id": 105, + "content": "The code extracts audio features using MFCC and STFT for action detection in FootballAction. It includes spectrogram bins conversion, data normalization, and resampling with examples using a WAV file.", + "type": "summary" + }, + "1128": { + "file_id": 105, + "content": "\"\"\"\naudio feature extract\n\"\"\"\n# coding: utf-8\nimport os\nimport numpy as np\nimport pickle\nimport mfcc.vgg_params as vgg_params\ndef frame(data, window_length, hop_length):\n \"\"\"\n frame\n \"\"\"\n num_samples = data.shape[0]\n #print(\"window_length , hop_length\", window_length, hop_length)\n #print(\"num_sample = \", num_samples)\n num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))\n #print(\" num_frames = \", num_frames)\n shape = (num_frames, window_length) + data.shape[1:]\n #print(\" shape = \", shape)\n strides = (data.strides[0] * hop_length, ) + data.strides\n #print(\"data.strides = \", data.strides)\n #print(\"strides = \", strides)\n return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)\ndef periodic_hann(window_length):\n \"\"\"\n periodic_hann\n \"\"\"\n return 0.5 - (0.5 *\n np.cos(2 * np.pi / window_length * np.arange(window_length)))\ndef stft_magnitude(signal, fft_length, hop_length=None, window_length=None):\n \"\"\"\n stft_magnitude", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:1-38" + }, + "1129": { + "file_id": 105, + "content": "This code extracts audio features using the Mel-frequency cepstral coefficients (MFCC) method. It defines a function \"frame\" to slice data into frames, another function \"periodic_hann\" for windowing using periodic Hann window, and finally a function \"stft_magnitude\" for computing Short Time Fourier Transform (STFT) magnitude from signal. 
The code likely uses these functions in combination to extract MFCC features from audio data.", + "type": "comment" + }, + "1130": { + "file_id": 105, + "content": " \"\"\"\n frames = frame(signal, window_length, hop_length)\n window = periodic_hann(window_length)\n windowed_frames = frames * window\n return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))\n_MEL_BREAK_FREQUENCY_HERTZ = 700.0\n_MEL_HIGH_FREQUENCY_Q = 1127.0\ndef hertz_to_mel(frequencies_hertz):\n \"\"\"\n hertz_to_mel\n \"\"\"\n return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz /\n _MEL_BREAK_FREQUENCY_HERTZ))\ndef spectrogram_to_mel_matrix(num_mel_bins=20,\n num_spectrogram_bins=129,\n audio_sample_rate=8000,\n lower_edge_hertz=125.0,\n upper_edge_hertz=3800.0):\n \"\"\"\n spectrogram_to_mel_matrix\n \"\"\"\n nyquist_hertz = audio_sample_rate / 2.\n if lower_edge_hertz >= upper_edge_hertz:\n raise ValueError(\"lower_edge_hertz %.1f >= upper_edge_hertz %.1f\" %\n (lower_edge_hertz, upper_edge_hertz))", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:39-69" + }, + "1131": { + "file_id": 105, + "content": "This code defines functions for converting frequencies from Hertz to Mel scale, and creating a mel spectrum matrix from a spectrogram. It also includes validation checks to ensure lower edge frequency is less than the upper edge frequency. The Mel scale is used in audio processing for approximating human auditory perception of sound.", + "type": "comment" + }, + "1132": { + "file_id": 105, + "content": " spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz,\n num_spectrogram_bins)\n spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)\n band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),\n hertz_to_mel(upper_edge_hertz),\n num_mel_bins + 2)\n mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))\n for i in range(num_mel_bins):\n lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]\n lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /\n (center_mel - lower_edge_mel))\n upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /\n (upper_edge_mel - center_mel))\n mel_weights_matrix[:,\n i] = np.maximum(0.0,\n np.minimum(lower_slope, upper_slope))\n mel_weights_matrix[0, :] = 0.0\n return mel_weights_matrix\ndef log_mel_spectrogram(data,", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:70-90" + }, + "1133": { + "file_id": 105, + "content": "This function calculates mel-frequency cepstral coefficients (MFCC) from speech audio data. 
It converts spectrogram bins to hertz and mel scales, creates band edges for mel analysis, computes mel weights matrix using triangular interpolation, and sets the first row of the matrix to zero.", + "type": "comment" + }, + "1134": { + "file_id": 105, + "content": " audio_sample_rate=8000,\n log_offset=0.0,\n window_length_secs=0.025,\n hop_length_secs=0.010,\n **kwargs):\n \"\"\"\n log_mel_spectrogram\n \"\"\"\n window_length_samples = int(round(audio_sample_rate * window_length_secs))\n #print(\"audio_sample_rate = \", audio_sample_rate)\n #print(\"window_length_secs = \", window_length_secs)\n #print(\"window_length_sample \", window_length_samples)\n hop_length_samples = int(round(audio_sample_rate * hop_length_secs))\n #print(\"hop_length_samples \", hop_length_samples)\n fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0)))\n #print(\" fft_lengt = \", fft_length)\n spectrogram = stft_magnitude(data,\n fft_length=fft_length,\n hop_length=hop_length_samples,\n window_length=window_length_samples)\n #print(\" spectrogram.shape = \", spectrogram.shape)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:91-111" + }, + "1135": { + "file_id": 105, + "content": "This function takes in audio data and parameters such as audio sample rate, window length in seconds, hop length in seconds, and other optional keywords. It calculates the window length samples and hop length samples based on the provided audio sample rate. It then determines the FFT length by taking the next highest power of 2 from the window length samples. Finally, it computes the spectrogram using the STFT (Short-Time Fourier Transform) magnitude with the calculated parameters and returns the resulting spectrogram.", + "type": "comment" + }, + "1136": { + "file_id": 105, + "content": " mel_spectrogram = np.dot(\n spectrogram,\n spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1],\n audio_sample_rate=audio_sample_rate,\n **kwargs))\n return np.log(mel_spectrogram + log_offset)\ndef wav_to_example(wav_data, sample_rate):\n \"\"\"\n wav_to_example\n \"\"\"\n #sample_rate, wav_data = wavfile.read(wav_file)\n assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype\n #wav_data = wav_data[:16000*30]\n #print(\" wav_data \", wav_data.shape)\n #print(\" wav_data \", wav_data.shape)\n pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS -\n vgg_params.STFT_HOP_LENGTH_SECONDS))\n wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num)))\n wav_data = wav_data_extend\n #print(\" wav_data \", wav_data.shape)\n wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0]\n #print(\" wav_data after convert to -1 1\", wav_data)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:112-136" + }, + "1137": { + "file_id": 105, + "content": "The code extracts audio features from a WAV file using short-time Fourier transform (STFT) and applies Mel-frequency cepstral coefficients (MFCCs). It reads the WAV file, pads zeros if necessary to match desired window length, scales the data to be between -1 and 1, and then calculates STFT. 
Finally, it computes MFCCs from the spectrogram and returns the log of the result plus a small offset for numerical stability.", + "type": "comment" + }, + "1138": { + "file_id": 105, + "content": " #if wav_data.shape[0] > max_second * sample_rate:\n # wav_data = wav_data[:max_second * sample_rate, :]\n if len(wav_data.shape) > 1:\n wav_data = np.mean(wav_data, axis=1)\n #print(\" wav_data after mean\", wav_data.shape, len(wav_data.shape), wav_data)\n # Resample to the rate assumed by vgg.\n #if sample_rate != vgg_params.SAMPLE_RATE:\n # wav_data = resampy.resample(wav_data, sample_rate, vgg_params.SAMPLE_RATE)\n log_mel = log_mel_spectrogram(\n wav_data,\n audio_sample_rate=vgg_params.SAMPLE_RATE,\n log_offset=vgg_params.LOG_OFFSET,\n window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS,\n hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS,\n num_mel_bins=vgg_params.NUM_MEL_BINS,\n lower_edge_hertz=vgg_params.MEL_MIN_HZ,\n upper_edge_hertz=vgg_params.MEL_MAX_HZ)\n # Frame features into examples.\n features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS\n example_window_length = int(\n round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:137-157" + }, + "1139": { + "file_id": 105, + "content": "This code performs feature extraction on audio data for action detection in the FootballAction application. It ensures that the audio data is within specified bounds, applies mean normalization if necessary, resamples to a fixed rate, and then generates log mel spectrogram features. These features are framed into examples at a specific sample rate and window length for use by VGG model.", + "type": "comment" + }, + "1140": { + "file_id": 105, + "content": " example_hop_length = int(\n round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate))\n log_mel_examples = frame(log_mel,\n window_length=example_window_length,\n hop_length=example_hop_length)\n return log_mel_examples\ndef extract_pcm(pcm_file, sample_rate):\n with open(pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = wav_to_example(audio_data, sample_rate)\n return examples\nif __name__ == \"__main__\":\n wav_file = sys.argv[1]\n print(\"wav_file = \", wav_file)\n with open(wav_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype = np.int16)\n examples_batch = wav_to_example(audio_data, 16000)\n print(\"examples_batch.shape\", examples_batch.shape) ", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:159-182" + }, + "1141": { + "file_id": 105, + "content": "The code defines a function that extracts audio features from WAV files. It calculates the hop length based on the example window length and sample rate, and applies a log Mel spectrum to the audio data. It also includes a separate function for extracting examples from PCM files and converting them into examples at a given sample rate. 
The main part of the code demonstrates how to use the functions by reading a WAV file, printing its shape after processing with the feature extraction functions.", + "type": "comment" + }, + "1142": { + "file_id": 106, + "content": "/applications/FootballAction/predict/action_detect/mfcc/model_config.py", + "type": "filepath" + }, + "1143": { + "file_id": 106, + "content": "The ModelAudio class extracts audio features using wav_to_example and slices the data into parts, calculating features for each part. The predict method appends these features to a list and returns the audio feature list after dividing by sample rate.", + "type": "summary" + }, + "1144": { + "file_id": 106, + "content": "\"\"\"\naudio model config\n\"\"\"\nimport numpy as np\nimport mfcc.feature_extractor as feature_extractor\nclass ModelAudio(object):\n \"\"\"\n modelAudio\n \"\"\"\n def __init__(self, configs, use_gpu=1):\n self.use_gpu = use_gpu\n self.audio_fps = configs.COMMON.fps\n self.audio_feat_scale = configs.TSN.audio_scale\n self.sample_rate = 16000\n def predict_slice(self, wav_data, sample_rate):\n \"\"\"\n audio predict\n \"\"\"\n examples_batch = feature_extractor.wav_to_example(\n wav_data, sample_rate)[0]\n return examples_batch\n def predict_audio(self, audio_file):\n \"\"\"\n predict_audio\n \"\"\"\n audio_feature_list = []\n # read pcm\n sample_rate = self.sample_rate\n try:\n with open(audio_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n audio_status = \"audio load success\"\n except Exception as e:\n audio_data = []\n audio_status = \"audio load failed\"", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:1-42" + }, + "1145": { + "file_id": 106, + "content": "The code defines a ModelAudio class which takes in audio-related configurations and performs audio feature extraction using the feature_extractor module's wav_to_example function. The class also predicts audio by converting PCM data to numpy array and handles audio file reading exceptions.", + "type": "comment" + }, + "1146": { + "file_id": 106, + "content": " step = 1\n len_video = int(len(audio_data) / sample_rate)\n print(len_video)\n for i in range(0, len_video, step):\n audio_data_part = audio_data[i * sample_rate:(i + step) *\n sample_rate]\n feature_audio = self.predict_slice(audio_data_part, sample_rate)\n audio_feature_list.append(feature_audio)\n return audio_feature_list", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:43-51" + }, + "1147": { + "file_id": 106, + "content": "The code slices the audio data into parts of size 'step' and calculates features for each part using a predict method, then appends the features to a list. The length of the entire audio data is divided by the sample rate to determine how many steps can fit in it. This function returns the audio feature list.", + "type": "comment" + }, + "1148": { + "file_id": 107, + "content": "/applications/FootballAction/predict/action_detect/mfcc/vgg_params.py", + "type": "filepath" + }, + "1149": { + "file_id": 107, + "content": "The code defines global parameters for the VGGish model, including architectural constants, hyperparameters, and optimizer settings. 
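`ModelAudio.predict_audio` walks the raw int16 PCM one second at a time and extracts a feature vector per slice. A hedged sketch of that loop follows; `extract_slice_feature` is a placeholder for `feature_extractor.wav_to_example(...)[0]`, and `np.frombuffer` replaces the deprecated `np.fromstring` seen in the snippet.

```python
import numpy as np

def extract_slice_feature(wav_slice, sample_rate):
    """Stand-in for feature_extractor.wav_to_example(...)[0]."""
    return np.array([wav_slice.mean(), wav_slice.std()], dtype=np.float32)

def predict_audio_sketch(pcm_bytes, sample_rate=16000, step=1):
    audio = np.frombuffer(pcm_bytes, dtype=np.int16)
    seconds = int(len(audio) / sample_rate)
    features = []
    for i in range(0, seconds, step):
        part = audio[i * sample_rate:(i + step) * sample_rate]
        features.append(extract_slice_feature(part, sample_rate))
    return features

pcm = (np.random.randn(16000 * 3) * 1000).astype(np.int16).tobytes()
print(len(predict_audio_sketch(pcm)))   # 3 one-second slices
```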
It extracts audio features from spectrogram patches using PCA quantization and embedding processing, with options to adjust STFT window and hop lengths, mel frequency bins, and learning rate.", + "type": "summary" + }, + "1150": { + "file_id": 107, + "content": "\"\"\"Global parameters for the VGGish model.\nSee vggish_slim.py for more information.\n\"\"\"\n# Architectural constants.\nNUM_FRAMES = 50 # Frames in input mel-spectrogram patch.\nNUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch.\nEMBEDDING_SIZE = 128 # Size of embedding layer.\n# Hyperparameters used in feature and example generation.\nSAMPLE_RATE = 16000\nSTFT_WINDOW_LENGTH_SECONDS = 0.040\nSTFT_HOP_LENGTH_SECONDS = 0.020\nNUM_MEL_BINS = NUM_BANDS\nMEL_MIN_HZ = 125\nMEL_MAX_HZ = 7500\nLOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram.\nEXAMPLE_WINDOW_SECONDS = 1.00 # Each example contains 96 10ms frames\nEXAMPLE_HOP_SECONDS = 1.00 # with zero overlap.\n# Parameters used for embedding postprocessing.\nPCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'\nPCA_MEANS_NAME = 'pca_means'\nQUANTIZE_MIN_VAL = -2.0\nQUANTIZE_MAX_VAL = +2.0\n# Hyperparameters used in training.\nINIT_STDDEV = 0.01 # Standard deviation used to initialize weights.\nLEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer.", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:1-29" + }, + "1151": { + "file_id": 107, + "content": "This code sets global parameters for the VGGish model. It defines architectural constants, hyperparameters for feature and example generation, embedding postprocessing, and training. The VGGish model is used to extract audio features from spectrogram patches, with options for PCA-based quantization and embedding processing. Hyperparameters control the STFT window and hop lengths, mel frequency bins, and learning rate for Adam optimizer.", + "type": "comment" + }, + "1152": { + "file_id": 107, + "content": "ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer.\n# Names of ops, tensors, and features.\nINPUT_OP_NAME = 'vggish/input_features'\nINPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'\nOUTPUT_OP_NAME = 'vggish/embedding'\nOUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'\nAUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:30-37" + }, + "1153": { + "file_id": 107, + "content": "This code sets the Adam optimizer's epsilon value to 1e-8, defines names for input and output operations, tensors, and features. It also assigns the name \"audio_embedding\" to a feature.", + "type": "comment" + }, + "1154": { + "file_id": 108, + "content": "/applications/FootballAction/predict/action_detect/models/audio_infer.py", + "type": "filepath" + }, + "1155": { + "file_id": 108, + "content": "The \"InferModel\" class is for audio inference, initializing the model and creating a predictor object. It takes input, performs inference, returns output, and measures time taken. 
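The constants above are internally consistent: a 16 kHz sample rate with 40 ms windows and 20 ms hops gives 640-sample windows, 320-sample hops, and 50 feature frames per 1.0 s example, matching `NUM_FRAMES = 50` (the inline "96 10ms frames" note reflects the original VGGish defaults rather than these values). A quick arithmetic check:

```python
SAMPLE_RATE = 16000
STFT_WINDOW_LENGTH_SECONDS = 0.040
STFT_HOP_LENGTH_SECONDS = 0.020
EXAMPLE_WINDOW_SECONDS = 1.00

window_samples = int(round(SAMPLE_RATE * STFT_WINDOW_LENGTH_SECONDS))   # 640
hop_samples = int(round(SAMPLE_RATE * STFT_HOP_LENGTH_SECONDS))         # 320
frames_per_example = int(round(EXAMPLE_WINDOW_SECONDS / STFT_HOP_LENGTH_SECONDS))

print(window_samples, hop_samples, frames_per_example)   # 640 320 50
```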
The code loads an audio file, sets path, performs prediction, prints shape, first output, and time.", + "type": "summary" + }, + "1156": { + "file_id": 108, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"audio infer\"\"\"\n def __init__(self, cfg, name='AUDIO'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:1-37" + }, + "1157": { + "file_id": 108, + "content": "This code defines a class named \"InferModel\" for audio inference. It initializes the model by reading configuration files, enabling GPU usage, and creating a predictor object. The input name and handle are stored for later use during inference.", + "type": "comment" + }, + "1158": { + "file_id": 108, + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n pcm_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = np.array(data, dtype = 'float32')\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n pcm_list.append(inputs)\n feature_values = np.vstack(feature_list)\n pcm_values = np.vstack(pcm_list)\n return feature_values, pcm_values\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:39-69" + }, + "1159": { + "file_id": 108, + "content": "The code defines a model that takes audio input, performs inference using the predictor, and returns output. 
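The audio `InferModel` follows the usual Paddle Inference zero-copy pattern: build a `Config`, enable GPU use and memory optimisation, switch off feed/fetch ops, then copy inputs in, run, and copy outputs out. A condensed sketch under that assumption; the file paths and the single-input/single-output shape are placeholders.

```python
import numpy as np
from paddle.inference import Config, create_predictor

def build_predictor(model_file, params_file, gpu_mem=2000, device_id=0):
    config = Config(model_file, params_file)
    config.enable_use_gpu(gpu_mem, device_id)     # GPU memory pool (MB) + card id
    config.switch_ir_optim(True)
    config.enable_memory_optim()
    config.switch_use_feed_fetch_ops(False)       # required for zero-copy handles
    return create_predictor(config)

def run(predictor, batch):
    in_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    out_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    in_handle.copy_from_cpu(batch.astype(np.float32))
    predictor.run()
    return out_handle.copy_to_cpu()

# predictor = build_predictor('model.pdmodel', 'model.pdiparams')   # placeholder paths
# features = run(predictor, np.random.rand(4, 640).astype(np.float32))
```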
The predict method reads data from infer_config and for each iteration, it prepares inputs, runs inference, collects feature lists and pcm lists, then combines them into feature_values and pcm_values before returning.", + "type": "comment" + }, + "1160": { + "file_id": 108, + "content": " pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm'\n t0 = time.time()\n cfg['AUDIO']['pcm_file'] = pcm_path\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print(outputs[0])\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:71-80" + }, + "1161": { + "file_id": 108, + "content": "This code loads an audio file, sets the path for it in the configuration file, performs prediction on the model, prints the shape and first output of the prediction, and calculates and prints the time taken in minutes.", + "type": "comment" + }, + "1162": { + "file_id": 109, + "content": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py", + "type": "filepath" + }, + "1163": { + "file_id": 109, + "content": "This code defines a Paddle Inference engine class for the \"bmn infer\" application, performing action detection through averaging predictions, generating proposal results, and processing score outcomes. It initializes models, loads data, performs inference, saves results, outputs masks, and prints execution time.", + "type": "summary" + }, + "1164": { + "file_id": 109, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import process_proposal\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"bmn infer\"\"\"\n def __init__(self, cfg, name='BMN'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.nms_thread = cfg[name]['nms_thread']\n self.min_pred_score = cfg[name]['score_thread']\n self.min_frame_thread = cfg['COMMON']['fps']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:1-37" + }, + "1165": { + "file_id": 109, + "content": "This code defines a class `InferModel` for the \"bmn infer\" application. It imports necessary modules and utilities, sets up model configuration parameters from a JSON file, and initializes the Paddle Inference engine with specified model and parameter files. The GPU memory and device ID are also configured according to the input JSON file. 
Additionally, some threshold values for NMS (non-maximum suppression) and minimum prediction scores are set.", + "type": "comment" + }, + "1166": { + "file_id": 109, + "content": " config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n self.output3_tensor = self.predictor.get_output_handle(output_names[2])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n output3 = self.output3_tensor.copy_to_cpu()\n return output1, output2, output3\n def generate_props(self, pred_bmn, pred_start, pred_end, max_window=200, min_window=5):\n \"\"\"generate_props\"\"\"", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:38-63" + }, + "1167": { + "file_id": 109, + "content": "The code initializes a predictor and sets up input/output tensors for inferencing. It then runs the inference process, copying input data from CPU and output results to CPU, allowing for further processing or analysis. The generate_props function generates properties based on predictions, start, and end timestamps, with adjustable window size.", + "type": "comment" + }, + "1168": { + "file_id": 109, + "content": " video_len = min(pred_bmn.shape[-1], min(pred_start.shape[-1], pred_end.shape[-1]))\n pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :]\n start_mask = self.boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = self.boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_results = []\n for idx in range(min_window, max_window):\n for jdx in range(video_len):\n start_index = jdx\n end_index = start_index + idx\n if end_index < video_len and start_mask[start_index] == 1 and end_mask[end_index] == 1:\n xmin = start_index\n xmax = end_index\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]\n bmn_score = pred_bmn[idx, jdx]\n conf_score = xmin_score * xmax_score * bmn_score\n score_results.append([xmin, xmax, conf_score])\n return score_results\n def boundary_choose(self, score_list):", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:64-86" + }, + "1169": { + "file_id": 109, + "content": "This code is calculating action boundaries from predicted start and end frames, along with a binary mask network (BMN) score. It extracts relevant data from the input, loops through the range of potential window sizes, checks if start and end indices fall within the video length and if boundary masks are activated. If these conditions are met, it calculates the confidence score and appends to the results list. 
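`generate_props` enumerates (start, end) pairs for every allowed duration, keeps those whose start and end masks both fire, and scores each candidate as start probability x end probability x BMN map value. A simplified sketch of that triple loop with random inputs; the variable names are illustrative.

```python
import numpy as np

def generate_props_sketch(bmn_map, start_prob, end_prob,
                          start_mask, end_mask, min_window=5, max_window=20):
    """bmn_map[d, t]: confidence for a proposal of duration d starting at frame t."""
    video_len = len(start_prob)
    proposals = []
    for dur in range(min_window, max_window):
        for s in range(video_len):
            e = s + dur
            if e < video_len and start_mask[s] and end_mask[e]:
                score = start_prob[s] * end_prob[e] * bmn_map[dur, s]
                proposals.append((s, e, float(score)))
    return sorted(proposals, key=lambda p: p[2], reverse=True)

T = 50
props = generate_props_sketch(np.random.rand(25, T), np.random.rand(T),
                              np.random.rand(T), np.ones(T, bool), np.ones(T, bool))
print(len(props), props[0])
```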
Finally, it returns the list of action boundaries with their respective scores.", + "type": "comment" + }, + "1170": { + "file_id": 109, + "content": " \"\"\"boundary_choose\"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[0] for items in data]\n winds = [items[1] for items in data]\n feat_info = [items[2] for items in data]\n feature_T = feat_info[0][0]\n feature_N = feat_info[0][1]\n inputs = np.array(inputs)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:87-111" + }, + "1171": { + "file_id": 109, + "content": "This code defines a function that chooses the boundary based on score. The function then returns a mask representing the chosen boundary. The predict function reads data from an infer_reader and loops through each iteration, extracting inputs and feature information.", + "type": "comment" + }, + "1172": { + "file_id": 109, + "content": " pred_bmn, pred_sta, pred_end = self.infer(inputs)\n if infer_iter == 0:\n sum_pred_bmn = np.zeros((2, feature_N, feature_T))\n sum_pred_sta = np.zeros((feature_T, ))\n sum_pred_end = np.zeros((feature_T, ))\n sum_pred_cnt = np.zeros((feature_T, ))\n for idx, sub_wind in enumerate(winds):\n sum_pred_bmn[:, :, sub_wind[0]: sub_wind[1]] += pred_bmn[idx]\n sum_pred_sta[sub_wind[0]: sub_wind[1]] += pred_sta[idx]\n sum_pred_end[sub_wind[0]: sub_wind[1]] += pred_end[idx]\n sum_pred_cnt[sub_wind[0]: sub_wind[1]] += np.ones((sub_wind[1] - sub_wind[0], ))\n pred_bmn = sum_pred_bmn / sum_pred_cnt\n pred_sta = sum_pred_sta / sum_pred_cnt\n pred_end = sum_pred_end / sum_pred_cnt\n score_result = self.generate_props(pred_bmn, pred_sta, pred_end)\n results = process_proposal(score_result, self.min_frame_thread, self.nms_thread, self.min_pred_score)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:112-131" + }, + "1173": { + "file_id": 109, + "content": "The code performs action detection by averaging predictions from multiple windows and then generates proposal results. It takes input, infers predictions for each window, sums the predictions within their corresponding windows, divides them by the count of frames in the window to get average predictions, and passes these averages to generate_props function to produce score_result. 
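`boundary_choose` keeps a frame as a candidate boundary when its score is above half the global maximum or is a local peak relative to both neighbours. A vectorised sketch of the same mask:

```python
import numpy as np

def boundary_mask(scores):
    scores = np.asarray(scores, dtype=np.float32)
    high = scores > scores.max() * 0.5                       # above half the peak score
    padded = np.concatenate(([0.0], scores, [0.0]))
    peak = (padded[1:-1] > padded[:-2]) & (padded[1:-1] > padded[2:])  # local maxima
    return (high | peak).astype(np.float32)

print(boundary_mask([0.1, 0.9, 0.2, 0.4, 0.3]))   # [0. 1. 0. 1. 0.]
```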
The process_proposal function is then used to process the score result based on some parameters like minimum frame thread, nms thread, and minimum prediction score to obtain the final results.", + "type": "comment" + }, + "1174": { + "file_id": 109, + "content": " return results\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n t0 = time.time()\n outputs = model.predict(cfg, video_features)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n results = {'proposal': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:133-156" + }, + "1175": { + "file_id": 109, + "content": "This code initializes a model, loads data, and performs inference. It then saves the results to a JSON file and prints the time taken for execution.", + "type": "comment" + }, + "1176": { + "file_id": 110, + "content": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py", + "type": "filepath" + }, + "1177": { + "file_id": 110, + "content": "This code initializes an LSTM model in PaddlePaddle for football action prediction and predicts sequences using pre-trained models, measuring time efficiency.", + "type": "summary" + }, + "1178": { + "file_id": 110, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import get_action_result\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.topk = cfg[name]['topk']\n self.frame_offset = cfg[name]['nms_offset']\n self.nms_thread = cfg[name]['nms_thread']\n self.cls_thread = cfg[name]['classify_score_thread']\n self.iou_thread = cfg[name]['iou_score_thread']\n self.label_map_file = cfg['COMMON']['label_dic']", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:1-36" + }, + "1179": { + "file_id": 110, + "content": "The code defines a class \"InferModel\" that uses PaddlePaddle's inference API to predict football actions. It initializes with a configuration file specifying the model, parameters, and other settings for inference. 
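`process_proposal` (from `utils.process_result`) is not shown in this section, so the sketch below is a generic greedy temporal NMS under the assumption that it drops low-score segments and suppresses heavily overlapping ones; the thresholds are placeholders, not the repo's configured values.

```python
def temporal_iou(a, b):
    inter = max(0.0, min(a[1], b[1]) - max(a[0], b[0]))
    union = max(a[1], b[1]) - min(a[0], b[0])
    return inter / union if union > 0 else 0.0

def temporal_nms(proposals, iou_thresh=0.4, min_score=0.01):
    """proposals: list of (start, end, score); returns a filtered, sorted list."""
    kept = []
    for p in sorted(proposals, key=lambda x: x[2], reverse=True):
        if p[2] < min_score:
            continue
        if all(temporal_iou(p, q) < iou_thresh for q in kept):
            kept.append(p)
    return kept

print(temporal_nms([(0, 10, 0.9), (1, 11, 0.8), (30, 40, 0.7)]))
# [(0, 10, 0.9), (30, 40, 0.7)]
```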
The class contains variables such as topk, frame_offset, nms_thread, cls_thread, iou_score_thread, and label_map_file.", + "type": "comment" + }, + "1180": { + "file_id": 110, + "content": " self.fps = cfg['COMMON']['fps']\n self.nms_id = 5\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input1_tensor = self.predictor.get_input_handle(input_names[0])\n self.input2_tensor = self.predictor.get_input_handle(input_names[1])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None):\n \"\"\"infer\"\"\"\n self.input1_tensor.copy_from_cpu(input1_arr)\n self.input1_tensor.set_lod(input1_lod)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:37-61" + }, + "1181": { + "file_id": 110, + "content": "This code initializes a LSTM model for video action detection. It sets the FPS, NMS ID, and configures GPU memory optimization. It also sets up input/output tensors for inference on the model.", + "type": "comment" + }, + "1182": { + "file_id": 110, + "content": " if not input2_arr is None:\n self.input2_tensor.copy_from_cpu(input2_arr)\n self.input2_tensor.set_lod(input2_lod)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n # print(output.shape)\n return output1, output2\n def pre_process(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n # print(input_arr.shape)\n # print([input_lod])\n return input_arr, [input_lod]\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n results = []\n for infer_iter, data in enumerate(infer_reader()):", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:62-91" + }, + "1183": { + "file_id": 110, + "content": "This code defines a class with methods for pre-processing, predicting, and potentially post-processing data. The pre_process method takes in an input and converts it into a suitable format for the model. The predict method uses a reader to iterate over data and generates results. 
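`pre_process` flattens a batch of variable-length feature sequences into a single array plus LoD (level-of-detail) offsets, which is how the sequence inputs are handed to the predictor via `set_lod`. A NumPy-only sketch of that flattening:

```python
import numpy as np

def pre_process_sketch(batch):
    """batch: list of (len_i, feat_dim) arrays -> (sum(len_i), feat_dim) array + lod."""
    lod = [0]
    for seq in batch:
        lod.append(lod[-1] + len(seq))
    flat = np.concatenate(batch, axis=0).astype(np.float32)
    return flat, [lod]            # Paddle expects a list of lod levels

seqs = [np.random.rand(3, 8), np.random.rand(5, 8)]
arr, lod = pre_process_sketch(seqs)
print(arr.shape, lod)             # (8, 8) [[0, 3, 8]]
```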
The LSTM model is run after processing the input, returning the outputs.", + "type": "comment" + }, + "1184": { + "file_id": 110, + "content": " video_id = [[items[-2], items[-1]] for items in data]\n input1 = [items[0] for items in data]\n input2 = [items[1] for items in data]\n input1_arr, input1_lod = self.pre_process(input1)\n input2_arr, input2_lod = self.pre_process(input2)\n output1, output2 = self.infer(input1_arr, input1_lod, input2_arr, input2_lod)\n # output1, output2 = self.infer(input1_arr, input1_lod)\n predictions_id = output1 \n predictions_iou = output2\n for i in range(len(predictions_id)):\n topk_inds = predictions_id[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds_id = predictions_id[i][topk_inds]\n preds_iou = predictions_iou[i][0]\n results.append((video_id[i], preds_id.tolist(), topk_inds.tolist(), preds_iou.tolist()))\n predict_result = get_action_result(results, self.label_map_file, self.fps, \n self.cls_thread, self.iou_thread, ", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:92-110" + }, + "1185": { + "file_id": 110, + "content": "The code takes in data and preprocesses it into input1_arr, input1_lod, input2_arr, and input2_lod. It then runs an infer function on these inputs to get output1 and output2. The code then extracts predictions_id and predictions_iou from the outputs. It sorts topk_inds in reverse order and appends video_id, preds_id, topk_inds, and preds_iou to the results list. Finally, it calls get_action_result with the results, label_map_file, fps, cls_thread, and iou_thread as arguments.", + "type": "comment" + }, + "1186": { + "file_id": 110, + "content": " self.nms_id, self.nms_thread, self.frame_offset)\n return predict_result\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n # proposal total\n prop_dict = {}\n for dataset in ['EuroCup2016', 'WorldCup2018']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n prop_dict[basename] = item['bmn_results']\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n # proposal\n basename = imgs_path.replace('frames', 'mp4') + '.mp4'", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:111-137" + }, + "1187": { + "file_id": 110, + "content": "The code initializes an InferModel object with a given configuration file. It then loads proposal data from 'EuroCup2016' and 'WorldCup2018' datasets, storing them in a dictionary. 
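The prediction loop keeps the top-k class scores per proposal by argsorting the score vector and reversing the tail. The same indexing in isolation:

```python
import numpy as np

def topk(scores, k=3):
    inds = scores.argsort()[-k:][::-1]      # indices of the k largest, descending
    return inds, scores[inds]

scores = np.array([0.05, 0.7, 0.1, 0.9, 0.15])
inds, vals = topk(scores, k=2)
print(inds, vals)                           # [3 1] [0.9 0.7]
```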
The code also specifies the path for image frames and video features, which will be used for further processing.", + "type": "comment" + }, + "1188": { + "file_id": 110, + "content": " bmn_results = prop_dict[basename]\n material = {'feature': video_features, 'proposal': bmn_results}\n t0 = time.time()\n outputs = model.predict(cfg, material)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n # print(outputs.shape)\n t1 = time.time()\n results = {'actions': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:138-152" + }, + "1189": { + "file_id": 110, + "content": "The code predicts action sequences from video features using a pre-trained model, and saves the results in a JSON file. It measures and prints the time taken for prediction.", + "type": "comment" + }, + "1190": { + "file_id": 111, + "content": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py", + "type": "filepath" + }, + "1191": { + "file_id": 111, + "content": "This code defines the InferModel class for inference using a pre-trained PPTSM model, taking config files and performing inference on image data from a specified path, predicting football actions and printing output shape and time taken.", + "type": "summary" + }, + "1192": { + "file_id": 111, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"pptsm infer\"\"\"\n def __init__(self, cfg, name='PPTSM'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:1-38" + }, + "1193": { + "file_id": 111, + "content": "The code defines a class called InferModel that initializes and prepares the PPTSM model for inference. It takes a configuration file as input, which includes details such as model files, parameter files, GPU memory, and device ID. The code sets up configurations to optimize GPU memory usage and enable zero-copy operations. 
It then creates a predictor object with these configurations and retrieves the input tensor handle.", + "type": "comment" + }, + "1194": { + "file_id": 111, + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[1])\n #self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[:-1] for items in data]\n inputs = np.array(inputs)\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n feature_list = np.vstack(feature_list)\n return feature_list\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:40-67" + }, + "1195": { + "file_id": 111, + "content": "The code defines a class that performs inference using a pre-trained model. It gets the output names and handles from the predictor, runs inference on input data, and returns the output. The main function reads a configuration file and creates an instance of the InferModel class to perform inference based on the specified config.", + "type": "comment" + }, + "1196": { + "file_id": 111, + "content": " imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/'\n imgs_list = get_images(imgs_path)\n t0 = time.time()\n cfg['PPTSM']['frame_list'] = imgs_list\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:69-78" + }, + "1197": { + "file_id": 111, + "content": "This code loads image data from a specific path, uses the model to predict action for each frame, and prints the output shape and time taken for inference. It seems to be part of an application related to FootballAction.", + "type": "comment" + }, + "1198": { + "file_id": 112, + "content": "/applications/FootballAction/predict/action_detect/reader/__init__.py", + "type": "filepath" + }, + "1199": { + "file_id": 112, + "content": "This code imports and registers various readers for different formats (TSM, PPTSM, AUDIO, BMN, ACTION) to read map files for the model. 
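The reader package wires model names to reader classes through a small registry (`regist_reader` / `get_reader`). A minimal sketch of that pattern with toy classes standing in for the real readers:

```python
class DataReader:
    """Minimal stand-in for the base class in reader_utils."""
    def __init__(self, name, mode, cfg, material=None):
        self.name, self.mode, self.cfg, self.material = name, mode, cfg, material
    def create_reader(self):
        raise NotImplementedError

class AudioReader(DataReader):
    def create_reader(self):
        return lambda: iter([["fake audio batch"]])

_READER_ZOO = {}

def regist_reader(name, reader_cls):
    _READER_ZOO[name] = reader_cls

def get_reader(name, mode, cfg, material=None):
    if name not in _READER_ZOO:
        raise KeyError(f"Reader {name} not found; available: {list(_READER_ZOO)}")
    return _READER_ZOO[name](name, mode, cfg, material).create_reader()

regist_reader("AUDIO", AudioReader)
reader = get_reader("AUDIO", "infer", cfg={})
print(next(reader()))                        # ['fake audio batch']
```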
The readers are registered in alphabetical order.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/12.json b/docs/data/12.json new file mode 100644 index 000000000..9a515a7a5 --- /dev/null +++ b/docs/data/12.json @@ -0,0 +1,543 @@ +{ + "1200": { + "file_id": 112, + "content": "\"\"\"\nread map for model\n\"\"\"\nfrom reader.reader_utils import regist_reader, get_reader\nimport reader.tsminf_reader as tsminf_reader\nimport reader.audio_reader as audio_reader\nimport reader.bmninf_reader as bmninf_reader\nimport reader.feature_reader as feature_reader\n# regist reader, sort by alphabet\nregist_reader(\"TSM\", tsminf_reader.TSMINFReader)\nregist_reader(\"PPTSM\", tsminf_reader.TSMINFReader)\nregist_reader(\"AUDIO\", audio_reader.AudioReader)\nregist_reader(\"BMN\", bmninf_reader.BMNINFReader)\nregist_reader(\"ACTION\", feature_reader.FeatureReader)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/__init__.py:1-15" + }, + "1201": { + "file_id": 112, + "content": "This code imports and registers various readers for different formats (TSM, PPTSM, AUDIO, BMN, ACTION) to read map files for the model. The readers are registered in alphabetical order.", + "type": "comment" + }, + "1202": { + "file_id": 113, + "content": "/applications/FootballAction/predict/action_detect/reader/audio_reader.py", + "type": "filepath" + }, + "1203": { + "file_id": 113, + "content": "The code creates an AudioReader class for youtube-8M dataset, initializing audio readers and loading pcm data. It manages audio batches by appending audios to batch_out until reaching the specified batch size, then yields the batch. Any remaining audios are yielded upon completion.", + "type": "summary" + }, + "1204": { + "file_id": 113, + "content": "\"\"\"\naudio reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport os\nimport _pickle as cPickle\n#from .reader_utils import DataReader\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nimport mfcc.feature_extractor as feature_extractor\nclass AudioReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:1-37" + }, + "1205": { + "file_id": 113, + "content": "This code defines an AudioReader class for the youtube-8M dataset, which reads features extracted by prior networks. It imports necessary libraries and modules, such as numpy, random, code, DataReader from reader_utils, feature_extractor from mfcc, pickle for file input/output, and StringIO or BytesIO depending on the availability of cPickle. 
The class inherits from DataReader, indicating it follows a standard data reading structure, and uses a feature extractor to extract audio features.", + "type": "comment" + }, + "1206": { + "file_id": 113, + "content": " This is for the three models: lstm, attention cluster, nextvlad\n dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n # set batch size and file list\n self.sample_rate = cfg[self.name.upper()]['sample_rate']\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.pcm_file = cfg[self.name.upper()]['pcm_file']\n self.material = material\n def create_reader(self):\n \"\"\"create_reader\"\"\"\n with open(self.pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = feature_extractor.wav_to_example(audio_data, self.sample_rate)\n # print(examples.shape)\n def reader():\n \"\"\"reader\"\"\"\n batch_out = []\n batch_out_pre = []\n for audio in examples:\n # batch_out.append([audio])", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:38-70" + }, + "1207": { + "file_id": 113, + "content": "This code initializes an audio reader for three models (LSTM, Attention Cluster, NextVlad). It takes parameters such as name, mode, and configuration file. The batch size, sample rate, and file list are set according to the given configuration. The pcm data is loaded from a binary file and converted to numpy array. Finally, a reader function is defined that iterates through examples and appends them to batches.", + "type": "comment" + }, + "1208": { + "file_id": 113, + "content": " batch_out.append(audio)\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:71-78" + }, + "1209": { + "file_id": 113, + "content": "This code is creating and managing audio batches in the audio reader class. It appends each audio to batch_out until it reaches the specified batch size, then yields the batch and resets batch_out. If there are remaining audios in batch_out after the loop ends, it yields them before returning the reader object.", + "type": "comment" + }, + "1210": { + "file_id": 114, + "content": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py", + "type": "filepath" + }, + "1211": { + "file_id": 114, + "content": "The BMNINFReader class in PaddleVideo reads and processes data from the BMN model for football action detection, filtering invalid proposals and handling image/audio data. 
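Each reader follows the same generator pattern: accumulate items into `batch_out`, yield once it reaches `batch_size`, and yield the remainder at the end. In isolation:

```python
def batch_generator(items, batch_size):
    batch_out = []
    for item in items:
        batch_out.append(item)
        if len(batch_out) == batch_size:
            yield batch_out
            batch_out = []
    if batch_out:                 # trailing partial batch
        yield batch_out

print(list(batch_generator(range(7), batch_size=3)))
# [[0, 1, 2], [3, 4, 5], [6]]
```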
This code creates a batch reader that pairs video features with names and scales, yielding batches until completion.", + "type": "summary" + }, + "1212": { + "file_id": 114, + "content": "\"\"\"\n# @File : bmninf_reader.py \n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport os\nimport random\nimport pickle\nimport json\nimport numpy as np\nimport multiprocessing\nimport numpy as np\nfrom .reader_utils import DataReader\ndef get_sw_prop(duration, window=200, step=10):\n \"\"\"\n get_sw_prop\n \"\"\"\n pr = []\n local_boxes = []\n for k in np.arange(0, duration - window + step, step):\n start_id = k\n end_id = min(duration, k + window)\n if end_id - start_id < window:\n start_id = end_id - window\n local_boxes = (start_id, end_id)\n pr.append(local_boxes)\n def valid_proposal(duration, span):\n \"\"\"\n valid_proposal\n \"\"\"\n # fileter proposals\n # a valid proposal should have at least one second in the video\n real_span = min(duration, span[1]) - span[0]\n return real_span >= 1\n pr = list(filter(lambda x: valid_proposal(duration, x), pr))\n return pr\nclass BMNINFReader(DataReader):\n \"\"\"\n Data reader for BMN model, which was stored as features extracted by prior networks", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:1-49" + }, + "1213": { + "file_id": 114, + "content": "This code defines a class called BMNINFReader which is a data reader for the BMN model. It reads data that has been extracted by prior networks and uses the \"get_sw_prop\" function to filter out invalid proposals. The get_sw_prop function calculates proposal regions based on a given duration, window size, and step size. Proposals with less than one second in the video are filtered out. This data reader is part of the PaddleVideo package for FootballAction application.", + "type": "comment" + }, + "1214": { + "file_id": 114, + "content": " dataset cfg: feat_path, feature path,\n tscale, temporal length of BM map,\n dscale, duration scale of BM map,\n anchor_xmin, anchor_xmax, the range of each point in the feature sequence,\n batch_size, batch size of input data,\n num_threads, number of threads of data processing\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.tscale = cfg[self.name.upper()]['tscale'] # 200\n self.dscale = cfg[self.name.upper()]['dscale'] # 200\n # self.subset = cfg[self.name.upper()]['subset']\n self.tgap = 1. / self.tscale\n self.step = cfg[self.name.upper()]['window_step']\n self.material = material\n src_feature = self.material\n image_feature = src_feature['image_feature']\n pcm_feature = src_feature['pcm_feature']\n pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n # print(rgb_feature.shape, audio_feature.shape, pcm_feature.shape)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:50-73" + }, + "1215": { + "file_id": 114, + "content": "This code initializes a class, likely for data reading and processing. It takes parameters such as name, mode, configuration (cfg), and material. It sets attributes like temporal length (tscale) and duration scale (dscale) from the configuration. 
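`get_sw_prop` slides a fixed window over the feature timeline, snaps the final window back so it keeps its full length, and drops spans that cover less than one second of real video. A sketch under those assumptions:

```python
import numpy as np

def sliding_windows(duration, window=200, step=10):
    spans = []
    for start in np.arange(0, duration - window + step, step):
        end = min(duration, start + window)
        if end - start < window:              # snap the last window back
            start = end - window
        spans.append((int(start), int(end)))
    # keep only windows with at least one usable frame inside the video
    return [(s, e) for s, e in spans if min(duration, e) - s >= 1]

print(sliding_windows(30, window=20, step=10))   # [(0, 20), (10, 30)]
```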
The code reshapes pcm_feature to fit the needed shape.", + "type": "comment" + }, + "1216": { + "file_id": 114, + "content": " min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n #if min_length == 0:\n # continue\n image_feature = image_feature[:min_length, :]\n pcm_feature = pcm_feature[:min_length, :]\n self.features = np.concatenate((image_feature, pcm_feature), axis=1)\n self.duration = len(self.features)\n self.window = self.tscale\n self.get_dataset_dict()\n self.get_match_map()\n self.batch_size = cfg[self.name.upper()]['batch_size']\n if (mode == 'test') or (mode == 'infer'):\n self.num_threads = 1 # set num_threads as 1 for test and infer\n def get_dataset_dict(self):\n \"\"\"\n get_dataset_dict\n \"\"\"\n self.video_list = get_sw_prop(self.duration, self.window, self.step)\n def get_match_map(self):\n \"\"\"\n get_match_map\n \"\"\"\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx\n for jdx in range(1, self.tscale + 1):", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:74-105" + }, + "1217": { + "file_id": 114, + "content": "This code reads image and audio data for video analysis, concatenates them into a feature vector, sets the duration, window size, and batch size. It then retrieves the list of videos to process and creates a match map for analyzing video frames. The code is part of a machine learning model used in football action detection.", + "type": "comment" + }, + "1218": { + "file_id": 114, + "content": " xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)\n match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n self.match_map = match_map\n self.anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n def load_file(self, video_wind):\n \"\"\"\n load_file\n \"\"\"\n start_feat_id = video_wind[0]\n end_feat_id = video_wind[1]\n video_feat = self.features[video_wind[0]: video_wind[1]]\n video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n return video_feat\n def create_reader(self):\n \"\"\"\n reader creator for ctcn model\n \"\"\"\n return self.make_infer_reader()\n def make_infer_reader(self):\n \"\"\"\n reader for inference\n \"\"\"\n def reader():", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:106-138" + }, + "1219": { + "file_id": 114, + "content": "This code is for creating a reader function to handle BMNINF file loading and defining the match_map attribute. It defines the load_file function, create_reader function, and make_infer_reader function. The load_file function loads features from a given video window range, converts them to float32 type, and transposes the data. The create_reader function creates a reader for the CTCN model. 
The make_infer_reader function defines a reader for inference purposes.", + "type": "comment" + }, + "1220": { + "file_id": 114, + "content": " \"\"\"\n reader\n \"\"\"\n batch_out = []\n # for video_name in self.video_list:\n for video_wind in self.video_list:\n video_idx = self.video_list.index(video_wind)\n video_feat = self.load_file(video_wind)\n batch_out.append((video_feat, video_wind, [self.duration, self.dscale]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:139-155" + }, + "1221": { + "file_id": 114, + "content": "This code creates a batch reader for video data in a football action detection application. It loads features from videos, pairs them with their corresponding names and scales, and yields batches of this data until the batch size is reached or all videos are processed.", + "type": "comment" + }, + "1222": { + "file_id": 115, + "content": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py", + "type": "filepath" + }, + "1223": { + "file_id": 115, + "content": "The code defines an attention-based LSTM feature reader for the FootballAction application in PaddleVideo, handling data reading from the youtube-8M dataset. It reads features from image, audio, and pcm_lists, concatenates them, yields batches, and continues even if exceptions occur.", + "type": "summary" + }, + "1224": { + "file_id": 115, + "content": "\"\"\"\nattention-lstm feature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:1-33" + }, + "1225": { + "file_id": 115, + "content": "This code is for an attention-based LSTM feature reader, used in the FootballAction application of PaddleVideo. It imports necessary libraries, handles potential import errors, and defines a class called FeatureReader which inherits from DataReader to handle data reading specifically for the youtube-8M dataset that contains features extracted by prior networks. 
The code is licensed under Apache 2.0 license.", + "type": "comment" + }, + "1226": { + "file_id": 115, + "content": " dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.feature = material['feature']\n self.proposal = material['proposal']\n self.fps = 5\n def create_reader(self):\n \"\"\"\n create_reader\n \"\"\"\n image_feature_list = self.feature['image_feature']\n audio_feature_list = self.feature['audio_feature']\n pcm_feature_list = self.feature['pcm_feature']\n pcm_feature_list = pcm_feature_list.reshape((pcm_feature_list.shape[0] * 5, 640))\n fl = self.proposal\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n for prop_info in fl:\n start_id = int(prop_info['start'])\n end_id = int(prop_info['end'])", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:35-71" + }, + "1227": { + "file_id": 115, + "content": "This code initializes a feature reader object for the FootballAction application, taking in parameters such as name, mode, and configuration (cfg). It then creates a reader function that iterates through proposal features and extracts relevant data based on start and end IDs. The extracted features are stored in batch_out.", + "type": "comment" + }, + "1228": { + "file_id": 115, + "content": " bmn_score = float(prop_info['score'])\n try:\n image_feature = image_feature_list[start_id: end_id]\n audio_feature = audio_feature_list[int(start_id / self.fps): int(end_id / self.fps)]\n pcm_feature = pcm_feature_list[start_id: end_id]\n # image_feature = np.concatenate((image_feature, pcm_feature), axis=1)\n batch_out.append((image_feature, audio_feature, 0, prop_info))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n continue\n return reader", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:72-86" + }, + "1229": { + "file_id": 115, + "content": "This code reads features from image, audio, and pcm_feature lists based on start_id and end_id. It concatenates the image and pcm_features along axis=1. If batch_size is reached, it yields the batch and resets the batch_out list. The code continues even if an exception occurs.", + "type": "comment" + }, + "1230": { + "file_id": 116, + "content": "/applications/FootballAction/predict/action_detect/reader/reader_utils.py", + "type": "filepath" + }, + "1231": { + "file_id": 116, + "content": "This code defines ReaderNotFoundError and ReaderZoo classes for video input data readers, offering a singleton reader_zoo and functions to register and get specific readers. The get_reader function returns the reader instance based on name, mode, configuration, and material, while raising ReaderNotFoundError if not found.", + "type": "summary" + }, + "1232": { + "file_id": 116, + "content": "\"\"\"\nreader_util\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nclass ReaderNotFoundError(Exception):\n \"\"\"\n \"Error: reader not found\"\n \"\"\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:1-34" + }, + "1233": { + "file_id": 116, + "content": "This code defines a class \"ReaderNotFoundError\" for handling reader not found exceptions with the possibility to specify the unavailable reader name and available readers.", + "type": "comment" + }, + "1234": { + "file_id": 116, + "content": " for reader in self.avail_readers:\n msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"\n data reader for video input\n \"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"\n Not implemented\n \"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"\n get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n \"\"\"\n ReaderZoo\n \"\"\"\n def __init__(self):\n \"\"\"\n __init__\n \"\"\"\n self.reader_zoo = {}\n def regist(self, name, reader):\n \"\"\"\n regist\n \"\"\"\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg, material=None):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:35-83" + }, + "1235": { + "file_id": 116, + "content": "This code defines classes for video input data readers and a reader zoo. The DataReader class initializes with a model name, mode, and configuration. It has methods to create readers (not implemented) and get config from sections. 
The ReaderZoo class manages registered readers in a zoo, allowing easy access and usage of different reader types for video input data.", + "type": "comment" + }, + "1236": { + "file_id": 116, + "content": " \"\"\"\n get\n \"\"\"\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg, material)\n raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo\nreader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n \"\"\"\n regist_reader\n \"\"\"\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg, material=None):\n \"\"\"\n get_reader\n \"\"\"\n reader_model = reader_zoo.get(name, mode, cfg, material)\n return reader_model.create_reader()", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:84-109" + }, + "1237": { + "file_id": 116, + "content": "This code defines a singleton reader_zoo and provides functions for registering readers and getting a specific reader. The get_reader function returns the created reader instance based on the provided name, mode, configuration (cfg), and material (if any). If the reader is not found, it raises ReaderNotFoundError with available reader names as information.", + "type": "comment" + }, + "1238": { + "file_id": 117, + "content": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py", + "type": "filepath" + }, + "1239": { + "file_id": 117, + "content": "The TSMINFReader class is a specialized data reader for JPG video datasets, utilizing threading for image preprocessing and data augmentation. It improves action detection models by manipulating football game images via functions like \"crop_and_resize\", \"group_random_crop\", and \"group_random_flip\" to fit target size and apply random crop sizes for augmentation.", + "type": "summary" + }, + "1240": { + "file_id": 117, + "content": "\"\"\"\ntsn frame reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport random\nimport functools\nimport concurrent.futures\nimport multiprocessing\nimport numpy as np\nimport paddle\nfrom PIL import Image, ImageEnhance\nfrom .reader_utils import DataReader\nclass TSMINFReader(DataReader):\n \"\"\"\n Data reader for video dataset of jpg folder.\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n super(TSMINFReader, self).__init__(name, mode, cfg)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:1-38" + }, + "1241": { + "file_id": 117, + "content": "The code defines a class called TSMINFReader that inherits from DataReader. It is a data reader for video datasets in the JPG format and can be used in specific modes with different configurations. 
The class takes parameters such as name, mode, cfg, and material (optional) to initialize its instance.", + "type": "comment" + }, + "1242": { + "file_id": 117, + "content": " name = name.upper()\n self.seg_num = cfg[name]['seg_num']\n self.seglen = cfg[name]['seglen']\n self.short_size = cfg[name]['short_size']\n self.target_size = cfg[name]['target_size']\n self.batch_size = cfg[name]['batch_size']\n self.reader_threads = cfg[name]['reader_threads']\n self.buf_size = cfg[name]['buf_size']\n self.video_path = cfg[name]['frame_list']\n self.img_mean = np.array(cfg[name]['image_mean']).reshape([3, 1, 1]).astype(np.float32)\n self.img_std = np.array(cfg[name]['image_std']).reshape([3, 1, 1]).astype(np.float32)\n self.material = material\n def create_reader(self):\n \"\"\"\n batch loader for TSN\n \"\"\"\n _reader = self._inference_reader_creator_longvideo(\n self.video_path,\n self.mode,\n seg_num=self.seg_num,\n seglen=self.seglen,\n short_size=self.short_size,\n target_size=self.target_size,", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:39-64" + }, + "1243": { + "file_id": 117, + "content": "This code initializes the TSN video reader by setting various attributes based on provided configuration (name) and then calls a function to create the reader object with specified parameters. It also sets image mean and std values for normalization, and stores the material type.", + "type": "comment" + }, + "1244": { + "file_id": 117, + "content": " img_mean=self.img_mean,\n img_std=self.img_std,\n num_threads = self.reader_threads,\n buf_size = self.buf_size)\n def _batch_reader():\n batch_out = []\n for imgs, label in _reader():\n if imgs is None:\n continue\n batch_out.append((imgs, label))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 1:\n yield batch_out[:-1]\n return _batch_reader\n def _inference_reader_creator_longvideo(self, video_path, mode, seg_num, seglen,\n short_size, target_size, img_mean, img_std, num_threads, buf_size):\n \"\"\"\n inference reader for video\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n def image_buf(image_id_path_buf):\n \"\"\"\n image_buf reader\n \"\"\" ", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:65-97" + }, + "1245": { + "file_id": 117, + "content": "This code defines a class with an image batch reader for inference on video data. The reader function reads images from the specified video path, applying mean and standard deviation normalization. 
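The batching logic of `_batch_reader` can be summarised by this simplified sketch, which assumes an upstream `sample_reader` generator factory and is not the exact project code:

```python
# Simplified version of the batching pattern used by _batch_reader.
def make_batch_reader(sample_reader, batch_size):
    def _batch_reader():
        batch = []
        for imgs, label in sample_reader():
            if imgs is None:              # samples whose frames failed to load are skipped
                continue
            batch.append((imgs, label))
            if len(batch) == batch_size:
                yield batch
                batch = []
        if len(batch) > 1:                # mirrors the original tail handling, which drops the last leftover sample
            yield batch[:-1]
    return _batch_reader
```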
It also sets the number of threads, buffer size, and creates a batch generator using _batch_reader method.", + "type": "comment" + }, + "1246": { + "file_id": 117, + "content": " try:\n img_path = image_id_path_buf[1]\n img = Image.open(img_path).convert(\"RGB\")\n image_id_path_buf[2] = img\n except:\n image_id_path_buf[2] = None\n frame_len = len(video_path)\n read_thread_num = seg_num\n for i in range(0, frame_len, read_thread_num):\n image_list_part = video_path[i: i + read_thread_num]\n image_id_path_buf_list = []\n for k in range(len(image_list_part)):\n image_id_path_buf_list.append([k, image_list_part[k], None])\n with concurrent.futures.ThreadPoolExecutor(max_workers=read_thread_num) as executor:\n executor.map(lambda image_id_path_buf: image_buf(image_id_path_buf), image_id_path_buf_list)\n imgs_seg_list = [x[2] for x in image_id_path_buf_list]\n # add the fault-tolerant for bad image\n for k in range(len(image_id_path_buf_list)):", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:98-119" + }, + "1247": { + "file_id": 117, + "content": "The code segment is responsible for reading images from a video file in chunks using multiple threads. It opens each image, converts it to RGB format and stores them in an array. The code handles exceptions for opening bad or missing images and uses the ThreadPoolExecutor class from concurrent futures module to execute operations asynchronously with maximum worker threads specified. Finally, it creates a list of images from the segments and adds a fault-tolerant mechanism to handle bad images.", + "type": "comment" + }, + "1248": { + "file_id": 117, + "content": " img_buf = image_id_path_buf_list[k][2]\n pad_id = 1\n while pad_id < seg_num and img_buf is None:\n img_buf = imgs_seg_list[(k + pad_id)%seg_num][2]\n if img_buf is None:\n logger.info(\"read img erro from {} to {}\".format(i, i + read_thread_num))\n exit(0)\n else:\n imgs_seg_list[k] = img_buf\n for pad_id in range(len(imgs_seg_list), seg_num):\n imgs_seg_list.append(imgs_seg_list[-1])\n yield imgs_seg_list \n def inference_imgs_transform(imgs_list, mode, seg_num, seglen, short_size,\\\n target_size, img_mean, img_std):\n \"\"\"\n inference_imgs_transform\n \"\"\" \n imgs_ret = imgs_transform(imgs_list, mode, seg_num, seglen, short_size,\n target_size, img_mean, img_std)\n label_ret = 0", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:120-141" + }, + "1249": { + "file_id": 117, + "content": "This code aims to read image data and perform inference by transforming the images. It appends missing image buffers to imgs_seg_list, handles reading errors, and yields the complete list of transformed images. 
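The threaded loading with fault tolerance can be condensed into the following sketch (PIL and concurrent.futures as in the reader; the paths are placeholders, and unlike the original this version does not abort when every frame fails):

```python
# Condensed sketch of threaded frame loading with nearest-neighbour padding for bad frames.
import concurrent.futures
from PIL import Image

def load_segment(frame_paths, num_threads=8):
    def _open(path):
        try:
            return Image.open(path).convert("RGB")
        except Exception:
            return None                       # a bad or missing frame is tolerated here

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
        imgs = list(pool.map(_open, frame_paths))

    # pad failed reads with the nearest successfully decoded frame
    for i, img in enumerate(imgs):
        if img is None:
            for offset in range(1, len(imgs)):
                candidate = imgs[(i + offset) % len(imgs)]
                if candidate is not None:
                    imgs[i] = candidate
                    break
    return imgs
```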
The imgs_transform function performs further transformations on the input images based on provided parameters.", + "type": "comment" + }, + "1250": { + "file_id": 117, + "content": " return imgs_ret, label_ret\n mapper = functools.partial(\n inference_imgs_transform,\n mode=mode,\n seg_num=seg_num,\n seglen=seglen,\n short_size=short_size,\n target_size=target_size,\n img_mean=img_mean,\n img_std=img_std)\n return paddle.reader.xmap_readers(mapper, reader, num_threads, buf_size, order=True)\ndef imgs_transform(imgs,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n name=''):\n \"\"\"\n imgs_transform\n \"\"\"\n imgs = group_scale(imgs, short_size)\n if mode == 'train':\n if name == \"TSM\":\n imgs = group_multi_scale_crop(imgs, short_size)\n imgs = group_random_crop(imgs, target_size)\n imgs = group_random_flip(imgs)\n else:\n imgs = group_center_crop(imgs, target_size)\n np_imgs = (np.array(imgs[0]).astype('float32').transpose(", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:143-180" + }, + "1251": { + "file_id": 117, + "content": "This code defines a function `imgs_transform` which performs various image transformations on input images based on the given mode. It groups images by scale, crops them randomly if in training mode (using TSM or center crop otherwise), and applies horizontal flips. The function returns the transformed images as a numpy array.", + "type": "comment" + }, + "1252": { + "file_id": 117, + "content": " (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n for i in range(len(imgs) - 1):\n img = (np.array(imgs[i + 1]).astype('float32').transpose(\n (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n np_imgs = np.concatenate((np_imgs, img))\n imgs = np_imgs\n imgs -= img_mean\n imgs /= img_std\n imgs = np.reshape(imgs, (seg_num, seglen * 3, target_size, target_size))\n return imgs\ndef group_multi_scale_crop(img_group, target_size, scales=None, \\\n max_distort=1, fix_crop=True, more_fix_crop=True):\n \"\"\"\n group_multi_scale_crop\n \"\"\"\n scales = scales if scales is not None else [1, .875, .75, .66]\n input_size = [target_size, target_size]\n im_size = img_group[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n \"\"\"\n _sample_crop_size\n \"\"\"\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in scales]\n crop_h = [", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:181-212" + }, + "1253": { + "file_id": 117, + "content": "This code is responsible for loading and preprocessing images for an action detection model in a football game. It resizes, normalizes, and concatenates the images, then applies data augmentation techniques to create a more diverse dataset for training the model. 
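The normalisation performed at the end of `imgs_transform` amounts to the following NumPy-only sketch (assuming `imgs` is a list of `seg_num * seglen` equally sized RGB PIL images and that `img_mean` / `img_std` are arrays of shape `(3, 1, 1)`):

```python
import numpy as np

def normalise(imgs, img_mean, img_std, seg_num, seglen, target_size):
    # stack to (N, 3, H, W), scale to [0, 1], then standardise channel-wise
    arr = np.stack([np.asarray(im, dtype="float32").transpose(2, 0, 1) for im in imgs])
    arr /= 255.0
    arr -= img_mean
    arr /= img_std
    # regroup frames into (seg_num, seglen * 3, H, W) as the model expects
    return arr.reshape(seg_num, seglen * 3, target_size, target_size)
```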
The `group_multi_scale_crop` function generates crop offsets and resizes the images with different scales, providing a robust dataset for improving the model's performance in recognizing various actions.", + "type": "comment" + }, + "1254": { + "file_id": 117, + "content": " input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x\n for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])\n h_offset = random.randint(0, image_h - crop_pair[1])\n else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if h_step != 0 and w_step != 0:\n ret.append((4 * w_step, 4 * h_step)) # lower right", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:213-242" + }, + "1255": { + "file_id": 117, + "content": "This code calculates crop sizes, generates pairs of crop heights and widths, and randomly selects a pair to use for cropping an image. If the 'fix_crop' parameter is True, it also applies random offsets or steps to adjust the position of the cropped area in the image.", + "type": "comment" + }, + "1256": { + "file_id": 117, + "content": " if h_step != 0 or w_step != 0:\n ret.append((2 * w_step, 2 * h_step)) # center\n if more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter\n ret.append((3 * w_step, 1 * h_step)) # upper right quarter\n ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n crop_info = {\n 'crop_w': crop_pair[0],\n 'crop_h': crop_pair[1],\n 'offset_w': w_offset,\n 'offset_h': h_offset\n }\n return crop_info\n crop_info = _sample_crop_size(im_size)", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:243-267" + }, + "1257": { + "file_id": 117, + "content": "This code sample is from a video action detection application and generates random crop sizes for data augmentation. 
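The crop-size sampling can be reduced to this minimal reconstruction (same idea as `_sample_crop_size`, with the "snap to input size" detail omitted; scale values follow the defaults in the code):

```python
import random

def sample_crop_pair(image_w, image_h, scales=(1.0, 0.875, 0.75, 0.66), max_distort=1):
    base = min(image_w, image_h)
    sizes = [int(base * s) for s in scales]
    # only pair widths and heights whose scale indices differ by at most max_distort
    pairs = [(w, h) for i, h in enumerate(sizes)
                    for j, w in enumerate(sizes) if abs(i - j) <= max_distort]
    return random.choice(pairs)   # (crop_w, crop_h)
```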
It considers different crop positions based on the step values provided, then randomly selects one of them to create a dictionary of crop information including width, height, offset for width, and offset for height.", + "type": "comment" + }, + "1258": { + "file_id": 117, + "content": " crop_w = crop_info['crop_w']\n crop_h = crop_info['crop_h']\n offset_w = crop_info['offset_w']\n offset_h = crop_info['offset_h']\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))\n for img in img_group\n ]\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n return ret_img_group\ndef group_random_crop(img_group, target_size):\n \"\"\"\n group_random_crop\n \"\"\"\n w, h = img_group[0].size\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h)\n out_images = []\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)\n for img in img_group:\n if w == tw and h == th:\n out_images.append(img)\n else:\n out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return out_images\ndef group_random_flip(img_group):", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:268-307" + }, + "1259": { + "file_id": 117, + "content": "The code contains three functions: \"crop_and_resize\" which crops and resizes images based on provided crop information, \"group_random_crop\" which randomly crops a group of images to the target size, and \"group_random_flip\" which performs random horizontal flipping on a group of images.", + "type": "comment" + }, + "1260": { + "file_id": 117, + "content": " \"\"\"\n group_random_flip\n \"\"\"\n v = random.random()\n if v < 0.5:\n ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]\n return ret\n else:\n return img_group\ndef group_center_crop(img_group, target_size):\n \"\"\"\n group_center_crop\n \"\"\"\n img_crop = []\n for img in img_group:\n w, h = img.size\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h)\n x1 = int(round((w - tw) / 2.))\n y1 = int(round((h - th) / 2.))\n img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return img_crop\ndef group_scale(imgs, target_size):\n \"\"\"\n group_scale\n \"\"\"\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == target_size) or (h <= w and h == target_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = target_size", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:308-349" + }, + "1261": { + "file_id": 117, + "content": "The code defines three functions: `group_random_flip`, `group_center_crop`, and `group_scale`. 
These functions are used to manipulate image groups by flipping, cropping, or resizing them to fit a target size.", + "type": "comment" + }, + "1262": { + "file_id": 117, + "content": " oh = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = target_size\n ow = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n return resized_imgs", + "type": "code", + "location": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:350-357" + }, + "1263": { + "file_id": 117, + "content": "This code resizes images according to the aspect ratio. If the image's aspect ratio is 4:3, it resizes to target_size; otherwise, it resizes to target_size and then calculates a new height and width for the image. It appends these resized images to the 'resized_imgs' list and returns this list.", + "type": "comment" + }, + "1264": { + "file_id": 118, + "content": "/applications/FootballAction/predict/action_detect/utils/config_utils.py", + "type": "filepath" + }, + "1265": { + "file_id": 118, + "content": "The code is from PaddleVideo's BasketballAction application, importing modules and defining AttrDict class. It loads config file into an AttrDict object, processes nested dictionaries, prints configurations, and logs a separator line using the logger module for organization and readability purposes.", + "type": "summary" + }, + "1266": { + "file_id": 118, + "content": "\"\"\"\nconfig_utils\n\"\"\"\n# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport yaml\nimport ast\nimport logger\nlogger = logger.Logger()\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\nclass AttrDict(dict):\n \"\"\"\n AttrDict\n \"\"\"\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef parse_config(cfg_file):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:1-46" + }, + "1267": { + "file_id": 118, + "content": "This code is from the PaddleVideo library's BasketballAction application. It imports yaml and ast modules, as well as a logger class. The code defines a constant list of section names (train, valid, test, infer). It also defines an AttrDict class to handle dictionaries with attributes like getattr and setattr methods. 
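A standalone toy reproduction (slightly simplified relative to the original `__setattr__`) shows the attribute-style access this class provides:

```python
class AttrDict(dict):
    def __getattr__(self, key):
        return self[key]
    def __setattr__(self, key, value):
        self[key] = value

cfg = AttrDict({"MODEL": AttrDict({"name": "BMN", "batch_size": 4})})  # placeholder values
print(cfg.MODEL.name)        # "BMN" -- same as cfg["MODEL"]["name"]
cfg.MODEL.batch_size = 8     # attribute writes update the underlying dict
```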
The parse_config function is defined which takes a configuration file as input.", + "type": "comment" + }, + "1268": { + "file_id": 118, + "content": " \"\"\"Load a config file into AttrDict\"\"\"\n import yaml\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef create_attr_dict(yaml_config):\n \"\"\"create_attr_dict\"\"\"\n for key, value in yaml_config.items():\n if isinstance(value, dict):\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = ast.literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\n return\ndef print_configs(cfg, mode):\n \"\"\"print_configs\"\"\"\n logger.info(\"---------------- {:>5} Arguments ----------------\".format(\n mode))\n for sec, sec_items in cfg.items():\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():\n logger.info(\" {}:{}\".format(k, v))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:47-79" + }, + "1269": { + "file_id": 118, + "content": "This code is responsible for loading a configuration file into an AttrDict object, processing the nested dictionary structure, and printing the configurations. It uses the yaml library to load the file, and the create_attr_dict function to handle nested dictionaries and convert strings to appropriate data types. The print_configs function prints the configuration in a formatted manner for readability.", + "type": "comment" + }, + "1270": { + "file_id": 118, + "content": " logger.info(\"-------------------------------------------------\")", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:80-80" + }, + "1271": { + "file_id": 118, + "content": "This code snippet is logging a separator line using the logger module. The purpose of this logger statement might be to visually separate different sections or parts of the code for readability and organization purposes.", + "type": "comment" + }, + "1272": { + "file_id": 119, + "content": "/applications/FootballAction/predict/action_detect/utils/preprocess.py", + "type": "filepath" + }, + "1273": { + "file_id": 119, + "content": "This code contains four functions that utilize the FFmpeg tool for handling video and audio files. 
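A typical call sequence for these helpers might look like the sketch below; the paths are placeholders, the import assumes the module is importable as `preprocess`, and the functions shell out to the bundled ffmpeg binary under `./src/utils/`:

```python
from preprocess import ffmpeg_frames, ffmpeg_pcm, get_images  # assumed import path

mp4_file = "datasets/EuroCup2016/mp4/match_001.mp4"            # hypothetical input video

ffmpeg_frames(mp4_file, "datasets/EuroCup2016/frames/match_001", fps=5)  # JPEG frames at 5 fps
ffmpeg_pcm(mp4_file, "datasets/EuroCup2016/pcm/match_001.pcm")           # 16 kHz mono s16le audio
image_paths = get_images("datasets/EuroCup2016/frames/match_001")        # sorted frame paths
```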
\"ffmpeg_frames\" extracts frames from a given MP4 file, \"ffmpeg_pcm\" extracts audio in PCM format, \"ffmpeg_mp4\" downloads an MP4 file, and \"get_images\" lists the images inside a specified image directory.", + "type": "summary" + }, + "1274": { + "file_id": 119, + "content": "\"\"\" extract frames and pcm\"\"\"\nimport os\nimport sys\nimport shutil\ndef ffmpeg_frames(mp4_addr, frame_out_folder, fps=5):\n \"\"\"ffmpeg_frames\"\"\"\n if os.path.exists(frame_out_folder):\n shutil.rmtree(frame_out_folder)\n os.makedirs(frame_out_folder)\n cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (mp4_addr, fps, frame_out_folder, '%08d')\n os.system(cmd)\ndef ffmpeg_pcm(mp4_addr, save_file_name):\n \"\"\"ffmpeg_pcm\"\"\"\n cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \\\n % (mp4_addr, save_file_name)\n os.system(cmd)\ndef ffmpeg_mp4(mp4_url, mp4_addr):\n \"\"\"ffmpeg_mp4\"\"\"\n cmd = \"wget %s -O %s -q\" % (mp4_url, mp4_addr)\n print (\"cmd = \", cmd)\n os.system(cmd)\ndef get_images(image_path):\n \"\"\"get_images\"\"\"\n images = sorted(os.listdir(image_path))\n images = images\n images_path_list = [image_path + '/' + im for im in images]\n return images_path_list", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/preprocess.py:1-35" + }, + "1275": { + "file_id": 119, + "content": "This code contains four functions that utilize the FFmpeg tool for handling video and audio files. \"ffmpeg_frames\" extracts frames from a given MP4 file, \"ffmpeg_pcm\" extracts audio in PCM format, \"ffmpeg_mp4\" downloads an MP4 file, and \"get_images\" lists the images inside a specified image directory.", + "type": "comment" + }, + "1276": { + "file_id": 120, + "content": "/applications/FootballAction/predict/action_detect/utils/process_result.py", + "type": "filepath" + }, + "1277": { + "file_id": 120, + "content": "The code retrieves data, applies NMS to bounding box proposals, filters detected actions from videos using NMS, and stores relevant information in the \"video_results\" list. It defines a function `get_action_result` that takes inputs and performs NMS on processed results.", + "type": "summary" + }, + "1278": { + "file_id": 120, + "content": "\"\"\"\n# @File : process_result.py \n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport sys\nimport os\nimport re\nimport numpy as np\nimport pickle\nimport json\nimport logger\nlogger = logger.Logger()\ndef get_data_res(label_map, data, topk):\n \"\"\"get_data_res\"\"\"\n sum_vid = len(data)\n video_result = []\n for i in range(sum_vid):\n vid_name = data[i][0][0]\n # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa\n feature_start_id = float(data[i][0][1]['start'])\n feature_end_id = float(data[i][0][1]['end'])\n feature_stage1_score = data[i][0][1]['score']\n predict_res = []\n for k in range(topk):\n score_top = data[i][1][k]\n labelid_top = data[i][2][k]\n label_iou = data[i][3]\n labelname_top = label_map[str(labelid_top)]\n video_result.append([feature_start_id, feature_end_id, labelid_top, labelname_top, score_top, label_iou])\n return video_result\ndef base_nms(bboxes, thresh, delta=0, nms_id=2):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:1-39" + }, + "1279": { + "file_id": 120, + "content": "This code defines two functions: `get_data_res` and `base_nms`. 
The first function takes in a label map, data (a list of features), and a topk value. It iterates through each video in the data, extracts relevant information from the feature, and appends this information to a new list called `video_result`. Finally, it returns the `video_result` list. The second function is an incomplete definition for a non-maximum suppression algorithm used for bounding boxes. It takes in bboxes (bounding box coordinates), thresh (threshold value), delta (optional parameter with default value 0), and nms_id (an identifier for the NMS operation, with a default value of 2).", + "type": "comment" + }, + "1280": { + "file_id": 120, + "content": " \"\"\"\n One-dimensional non-maximal suppression\n :param bboxes: [[vid, label, st, ed, score, ...], ...]\n :param thresh:\n :return:\n \"\"\"\n \"\"\"\n t1 = bboxes[:, 0]\n t2 = bboxes[:, 1]\n scores = bboxes[:, nms_id]\n \"\"\"\n t1 = np.array([max(0, x[0] - delta) for x in bboxes])\n t2 = np.array([x[1] + delta for x in bboxes])\n scores = np.array([x[nms_id] for x in bboxes])\n durations = t2 - t1\n order = scores.argsort()[::-1]\n keep = []\n while order.size > 0:\n i = order[0]\n keep.append(i)\n tt1 = np.maximum(t1[i], t1[order[1:]])\n tt2 = np.minimum(t2[i], t2[order[1:]])\n intersection = tt2 - tt1\n IoU = intersection / (durations[i] + durations[order[1:]] - intersection).astype(float)\n inds = np.where(IoU <= thresh)[0]\n order = order[inds + 1]\n return [bboxes[i] for i in keep]\ndef process_proposal(source_prop_box, min_frame_thread=5, nms_thresh=0.7, score_thresh=0.01):\n \"\"\"process_video_prop\"\"\"\n prop_box = []\n for items in source_prop_box:", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:40-76" + }, + "1281": { + "file_id": 120, + "content": "This code performs non-maximal suppression on bounding box proposals. It filters out overlapping boxes by keeping only those with the highest scores and discarding the rest. The function process_proposal takes source bounding box proposals, applies non-maximal suppression with a threshold, and returns the filtered results.", + "type": "comment" + }, + "1282": { + "file_id": 120, + "content": " start_frame = float(items[0])\n end_frame = float(items[1])\n score = float(items[2])\n if end_frame - start_frame < min_frame_thread or score < score_thresh:\n continue\n prop_box.append([start_frame, end_frame, score])\n prop_box_keep = base_nms(prop_box, nms_thresh)\n prop_res = []\n for res in prop_box_keep:\n prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]})\n return prop_res\ndef process_video_classify(video_prop, fps, score_thread, iou_thread, \\\n nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0):\n \"\"\"process_video_classify\"\"\"\n prop_filter = []\n for item in video_prop:\n if item[2] == backgroundid:\n continue\n prop_filter.append(item)\n # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True)\n prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id)\n prop_filter = sorted(prop_filter, key=lambda x: x[0])\n video_results = []\n for item in prop_filter:", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:77-107" + }, + "1283": { + "file_id": 120, + "content": "This code is part of a video classification process. It filters and sorts the detected actions in a video, discarding background or weak detections. The results are stored in 'prop_res' and 'video_results'. 
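A toy run of the 1-D NMS helper makes the filtering concrete (segments are `[start, end, score]`; the numbers are made up, and the import assumes the module is importable as `process_result`):

```python
from process_result import base_nms  # assumed import path

segments = [
    [10, 50, 0.90],   # kept: highest score
    [12, 48, 0.80],   # dropped: IoU with the first segment is 0.9 > 0.7
    [60, 90, 0.70],   # kept: no overlap with the first segment
]
kept = base_nms(segments, thresh=0.7, delta=0, nms_id=2)
# kept == [[10, 50, 0.90], [60, 90, 0.70]]
```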
The code applies non-maximum suppression (NMS) to filter and sort the detections based on frame duration, score threshold, and other parameters like fps, nms_thread, and nms_delta.", + "type": "comment" + }, + "1284": { + "file_id": 120, + "content": " start_sec = item[0] / fps\n end_sec = item[1] / fps\n start_id_frame = item[0]\n end_id_frame = item[1]\n # start_time = \"%02d:%02d:%02d\" % ((start_id_frame / fps) / 3600, \\\n # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60)\n # end_time = \"%02d:%02d:%02d\" % ((end_id_frame / fps) / 3600, \\\n # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60)\n start_time = int(start_id_frame / fps)\n end_time = int(end_id_frame / fps)\n label_id = item[2]\n label_name = item[3]\n label_classify_score = item[4]\n label_iou_score = item[5]\n if label_classify_score > score_thread and label_iou_score > iou_thread:\n video_results.append({\"start_time\": start_time,\n \"end_time\": end_time,\n \"label_id\": label_id,\n \"label_name\": label_name,\n \"classify_score\": label_classify_score,", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:108-129" + }, + "1285": { + "file_id": 120, + "content": "This code calculates the start and end time in seconds, frame IDs, and other relevant details of detected actions from a video. It then appends these details as a dictionary to the \"video_results\" list if the classify score and IoU score exceed certain thresholds.", + "type": "comment" + }, + "1286": { + "file_id": 120, + "content": " \"iou_score\": label_iou_score})\n return video_results\ndef get_action_result(result_info, label_map_file, fps, score_thread=0, \\\n iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1):\n \"\"\"get_action_result\"\"\"\n label_map = json.load(open(label_map_file, 'r', encoding='utf-8'))\n org_result = get_data_res(label_map, result_info, topk)\n nms_result = process_video_classify(org_result, fps, score_thread, iou_thread, nms_id, nms_thread, frame_offset)\n return nms_result", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:130-144" + }, + "1287": { + "file_id": 120, + "content": "This code defines a function `get_action_result` that takes in `result_info`, `label_map_file`, `fps`, `score_thread`, `iou_thread`, `nms_id`, `nms_thread`, and `frame_offset` as inputs. It reads the label map from `label_map_file`, processes the result data using `get_data_res` function, performs non-maximum suppression (NMS) on the processed results with specified parameters, and returns the final NMS results.", + "type": "comment" + }, + "1288": { + "file_id": 121, + "content": "/applications/FootballAction/predict/eval.py", + "type": "filepath" + }, + "1289": { + "file_id": 121, + "content": "The code evaluates precision, recall, and F1 scores for a model's predictions using IoU thresholds and label ranges. 
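The F1 combination used by the script reduces to the usual harmonic mean, guarded against the 0/0 case:

```python
def f1_score(precision, recall):
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

f1_score(0.8, 0.6)   # ~0.686
```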
It iterates through score thresholds, selects the best F1 score, and saves the results.", + "type": "summary" + }, + "1290": { + "file_id": 121, + "content": "\"\"\"\nget instance for lstm\n根据gts计算每个proposal_bmn的iou、ioa、label等信息\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport io\nsys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding = 'utf-8')\ndataset = \"/home/work/datasets\"\nlabel_index_file = './configs/index_label_football_8.json'\neval_datasets = ['EuroCup2016']\nlabel_files = {'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'}\nglobal fps, mode\nlabel_index = json.load(open(label_index_file, 'rb'))\ndef load_gts():\n global fps\n gts_data = {'fps': 0, 'gts': {}}\n for eval_data in eval_datasets:\n for item, value in label_files.items():\n label_file = '{}/{}/{}'.format(dataset, eval_data, value)\n gts = json.load(open(label_file, 'rb'))\n gts_data['fps'] = gts['fps']\n fps = gts['fps']\n for gt in gts['gts']:\n gt['mode'] = item\n basename = '{}/{}/mp4/{}'.format(dataset, eval_data, os.path.basename(gt['url']))", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:1-36" + }, + "1291": { + "file_id": 121, + "content": "This code imports necessary libraries, defines global variables fps and mode, loads a JSON file containing indexed labels for 8 categories, and initializes a gts_data dictionary with the frame rate (fps) and an empty dictionary to store ground truth data. It also iterates over eval_datasets, label_files, and individual gt data to update fps, populate gts_data, and assign mode for each ground truth item.", + "type": "comment" + }, + "1292": { + "file_id": 121, + "content": " gts_data['gts'][basename] = gt\n return gts_data['gts']\ndef computeIoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']):\n return 0.\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 - inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n if not mode == 'proposal':\n iou = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou\ndef convert_proposal(boxes, basename, score_threshold=0.01):\n boxes = sorted(boxes, key=lambda x:float(x['score']), reverse=True)\n res = []\n for box in boxes:\n if not float(box['score']) >= score_threshold:\n continue\n res.append({'basename': basename,\n 'start': int(float(box['start']) / fps),\n 'end': int(float(box['end']) / fps),\n 'label': 0})", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:37-67" + }, + "1293": { + "file_id": 121, + "content": "The code defines functions for evaluating ground truth (GT) labels and computing Intersection over Union (IoU). It also includes a function to convert proposals with score threshold filtering. The IoU function calculates the area of intersection, the area of union, and returns the IoU value. The 'computeIoU' function can be used for both regular and proposal modes. The 'convert_proposal' function sorts boxes based on scores and selects those above a given threshold to generate new proposals. 
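To make the overlap measure concrete, here is a worked example of the temporal IoU that `computeIoU` implements (toy numbers, same basename and label assumed):

```python
e1 = {"start": 10, "end": 20}   # ground-truth segment, 10 s long
e2 = {"start": 15, "end": 30}   # predicted segment, 15 s long

inter = max(0, min(e1["end"], e2["end"]) - max(e1["start"], e2["start"]))        # 5
iou = inter / ((e1["end"] - e1["start"]) + (e2["end"] - e2["start"]) - inter)    # 5 / 20 = 0.25
# note: when mode is not 'proposal', the script uses inter / area2 (IoA) instead
```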
It assigns each proposal an ID and calculates their respective start and end times.", + "type": "comment" + }, + "1294": { + "file_id": 121, + "content": " return res\ndef convert_classify(boxes, basename, iou_threshold, score_threshold):\n boxes = sorted(boxes, key=lambda x:(float(x['classify_score']), float(x['iou_score'])), reverse=True)\n def convert_time_to_frame(time_type):\n return int(time_type)\n h, m, s = time_type.split(':')\n return int(h) * 3600 + int(m) * 60 + int(s)\n res = []\n for box in boxes:\n if not (box['iou_score'] >= iou_threshold and\n box['classify_score'] >= score_threshold):\n continue\n res.append({'basename': basename,\n 'start': convert_time_to_frame(box['start_time']),\n 'end': convert_time_to_frame(box['end_time']),\n 'label': box['label_id']})\n return res\ndef convert_groundtruth(boxes, basename, phase=None):\n res = []\n for box in boxes:\n for item in box['label_ids']:\n label = 0 if phase == 'proposal' else item\n res.append({'basename': basename,\n 'start': box['start_id'],", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:68-93" + }, + "1295": { + "file_id": 121, + "content": "This code is defining a function called convert_classify that takes in boxes, basename, iou_threshold and score_threshold as parameters. The function sorts the boxes based on their classify_score and iou_score in descending order. If both iou_score and classify_score meet the threshold values, it appends the box details to a list named res. It returns this list of results. \n\nThe code also defines another function called convert_groundtruth that takes in boxes, basename and phase as parameters. This function iterates through each box and its corresponding label IDs. If the phase is 'proposal', it assigns a value of 0 to the label variable; otherwise, it assigns the item from box['label_ids']. It appends the result to a list named res.", + "type": "comment" + }, + "1296": { + "file_id": 121, + "content": " 'end': box['end_id'],\n 'label': label})\n return res\ndef print_head(iou):\n print(\"\\nioa = {:.1f}\".format(iou))\n res_str = ''\n for item in ['label_name']:\n res_str += '{:<12s}'.format(item)\n for item in ['label_id', 'precision', 'recall', 'hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10s}'.format(item)\n print(res_str)\ndef print_result(res_dict, label='avg'):\n if label == 'avg':\n res_str = '{:<22s}'.format(str(label))\n else:\n res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)], str(label), chr(12288))\n for item in ['prec', 'recall']:\n res_str += '{:<10.4f}'.format(res_dict[item])\n for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10d}'.format(res_dict[item])\n print(res_str)\ndef evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = False):\n iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \\\n for gtsId in gts_boxes]", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:94-120" + }, + "1297": { + "file_id": 121, + "content": "This code defines four functions: \"evaluation\", \"print_result\", \"print_head\", and \"computeIoU\". The \"evaluation\" function computes the intersection over union (IoU) between predicted boxes and ground truth boxes. It then passes these results to \"print_head\" and \"print_result\" for displaying progress and final evaluation results, respectively. 
The other two functions are used internally by the main \"evaluation\" function.", + "type": "comment" + }, + "1298": { + "file_id": 121, + "content": " iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes)))\n hit_map_prop_total = np.max(iou_map, axis=1)\n hit_map_index_total = np.argmax(iou_map, axis=1)\n res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']\n for iou_threshold in iou_range:\n if show_sub:\n print_head(iou_threshold)\n iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total])\n average_results = {}\n for label_id in label_range:\n sub_results = {}\n label_prop = np.array([k['label'] == label_id for k in res_boxes])\n label_gts = np.array([k['label'] == label_id for k in gts_boxes])\n sub_results['num_prop'] = sum(label_prop)\n sub_results['num_gts'] = sum(label_gts)\n if sub_results['num_prop'] == 0:\n hit_prop_index = []\n else:\n hit_prop_index = label_prop & iou_prop\n sub_results['hit_prop'] = sum(hit_prop_index)\n sub_results['hit_gts'] = len(set(hit_map_index_total[hit_prop_index]))", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:121-144" + }, + "1299": { + "file_id": 121, + "content": "This code calculates the precision, recall, and F1 score for a set of predicted boxes and ground truth boxes. It iterates over a range of intersection over union (IOU) thresholds and label ranges to produce average results. The results are stored in dictionaries for each IOU threshold and label range combination.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/13.json b/docs/data/13.json new file mode 100644 index 000000000..86afb8880 --- /dev/null +++ b/docs/data/13.json @@ -0,0 +1,544 @@ +{ + "1300": { + "file_id": 121, + "content": " sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \\\n else sub_results['hit_prop'] * 1.0 / sub_results['num_prop']\n sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \\\n else sub_results['hit_gts'] * 1.0 / sub_results['num_gts']\n if show_sub:\n print_result(sub_results, label=label_id)\n for item in res_dict:\n if not item in average_results:\n average_results[item] = 0\n average_results[item] += sub_results[item]\n if len(label_range) == 1: # proposal 不需要输出average值\n continue\n average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \\\n else average_results['hit_prop'] * 1.0 / average_results['num_prop']\n average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \\\n else average_results['hit_gts'] * 1.0 / average_results['num_gts']", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:146-161" + }, + "1301": { + "file_id": 121, + "content": "This code calculates precision and recall values for various subtasks, averages them if applicable, and stores the results in a dictionary. 
It also prints the result for each subtask if show_sub is set to True.", + "type": "comment" + }, + "1302": { + "file_id": 121, + "content": " if show_sub:\n print_result(average_results)\n average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \\\n else 2 * average_results['prec'] * average_results['recall'] / \\\n (average_results['prec'] + average_results['recall'])\n return average_results\ndef get_eval_results(predicts, gts_data, phase, iou_threshold = 0.3, score_threshold = 0.3, show_sub = False):\n global mode\n mode = phase\n res_boxes = []\n gts_boxes = []\n for ped_data in predicts:\n basename = ped_data['video_name']\n # eval sub data\n such_eval = False\n for eval_name in eval_datasets:\n if eval_name in basename:\n such_eval = True\n break\n if not such_eval:\n continue\n gts = gts_data[basename]['actions']\n if phase == 'proposal':\n res_boxes.extend(convert_proposal(ped_data['bmn_results'], basename, score_threshold))", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:162-189" + }, + "1303": { + "file_id": 121, + "content": "This code calculates evaluation results for FootballAction model predictions. It checks if the prediction is for a specific evaluation dataset and then processes proposal phase results, extracting bounding box coordinates from predicted proposals, and appending them to res_boxes list. F1 score is calculated based on precision and recall values. The function returns the average evaluation results (F1, precision, recall, IoU) for each video in the predicts dataset.", + "type": "comment" + }, + "1304": { + "file_id": 121, + "content": " gts_boxes.extend(convert_groundtruth(gts, basename, phase='proposal'))\n label_range = [0]\n iou_range = np.arange(0.1, 1, 0.1)\n else:\n res_boxes.extend(convert_classify(ped_data['action_results'], basename, iou_threshold, score_threshold))\n gts_boxes.extend(convert_groundtruth(gts, basename))\n label_range = range(1, len(label_index))\n iou_range = np.arange(0.5, 0.6, 0.1)\n eval_results = evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = show_sub)\n return eval_results\nif __name__ == \"__main__\":\n result_file = sys.argv[1]\n predicts = json.load(open(result_file, 'r', encoding='utf-8'))\n gts_data = load_gts()\n get_eval_results(predicts, gts_data, 'proposal', \n score_threshold = 0.03,\n show_sub = True)\n #get_eval_results(predicts, gts_data, 'actions')\n best_F1 = -0.1\n best_res = {}\n best_iou_threshold = 0.\n best_score_threshold = 0.\n for iou_threshold in np.arange(0.1, 0.9, 0.1):", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:190-218" + }, + "1305": { + "file_id": 121, + "content": "The code handles the evaluation of football action predictions. If ground truth is given, it extends proposal boxes and sets label range and iou range accordingly; otherwise, it extends classify results, ground truth boxes, and sets label range and iou range. It then calculates evaluation results using the specified functions. 
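The threshold sweep in `__main__` follows this condensed pattern (`get_eval_results`, `predicts` and `gts_data` come from the surrounding script, so this is a fragment rather than a standalone program):

```python
import numpy as np

best_f1, best_cfg = -0.1, None
for iou_t in np.arange(0.1, 0.9, 0.1):
    for score_t in np.arange(0.1, 1.0, 0.1):
        res = get_eval_results(predicts, gts_data, 'actions',
                               iou_threshold=iou_t, score_threshold=score_t)
        if res['F1'] > best_f1:
            best_f1, best_cfg = res['F1'], (iou_t, score_t)   # remember the best pair
```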
The code also allows for testing different iou_threshold and score_threshold combinations to find the best ones.", + "type": "comment" + }, + "1306": { + "file_id": 121, + "content": " for score_threshold in np.arange(0.1, 1, 0.1):\n avg_res = get_eval_results(predicts, gts_data, 'actions', \n iou_threshold = iou_threshold,\n score_threshold = score_threshold,\n show_sub = False)\n if best_F1 < avg_res['F1']:\n best_F1 = avg_res['F1']\n best_res = avg_res\n best_iou_threshold = iou_threshold\n best_score_threshold = score_threshold\n print(\"best iou threshold = {:.1f}\".format(best_iou_threshold))\n print(\"best score threshold = {:.1f}\".format(best_score_threshold))\n print('best F1 score = {:.4f}'.format(best_F1))\n print_head(0.5)\n print_result(best_res)\n get_eval_results(predicts, gts_data, 'actions', iou_threshold = best_iou_threshold,\n score_threshold = best_score_threshold,\n show_sub = True)", + "type": "code", + "location": "/applications/FootballAction/predict/eval.py:219-237" + }, + "1307": { + "file_id": 121, + "content": "The code iterates through score thresholds and calculates the average results for each threshold. It selects the best F1 score, stores corresponding iou_threshold and score_threshold. Finally, it prints these values along with a headline and a detailed result, then saves the best results by running the get_eval_results function again.", + "type": "comment" + }, + "1308": { + "file_id": 122, + "content": "/applications/FootballAction/predict/predict.py", + "type": "filepath" + }, + "1309": { + "file_id": 122, + "content": "The code loads an action detection model, reads a list of videos from a URL file, and for each video, it prints its name, creates paths for image frames and audio, calls the infer function to get bone-coordinate matrices (bmn_results) and action results, stores them in a list named 'results', and writes JSON data to \"results.json\" file.", + "type": "summary" + }, + "1310": { + "file_id": 122, + "content": "import os\nimport sys\nimport json\nsys.path.append('action_detect')\nfrom action import ActionDetection\nif __name__ == '__main__':\n #dataset_dir = \"/workspace/PaddleVideo/applications/FootballAction/datasets/EuroCup2016\"\n dataset_dir = \"../datasets/EuroCup2016\"\n model_predict = ActionDetection(cfg_file=\"./configs/configs.yaml\")\n model_predict.load_model()\n video_url = os.path.join(dataset_dir, 'url_val.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n results = []\n for line in lines:\n video_name = line\n print(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results.append({\n 'video_name': line,\n 'bmn_results': bmn_results,\n 'action_results': action_results\n })", + "type": "code", + "location": "/applications/FootballAction/predict/predict.py:1-33" + }, + "1311": { + "file_id": 122, + "content": "This code is importing necessary libraries, setting the path to access an action detection model. The model is loaded and a list of videos are read from a URL file. For each video, its name is printed, the required paths for image frames and audio are created, and the action detection model's infer function is called to get bone-coordinate matrices (bmn_results) and action results. 
These results are then stored in a list named 'results'.", + "type": "comment" + }, + "1312": { + "file_id": 122, + "content": " with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)", + "type": "code", + "location": "/applications/FootballAction/predict/predict.py:35-37" + }, + "1313": { + "file_id": 122, + "content": "Writes JSON data to \"results.json\" file, ensuring UTF-8 encoding and readable indentation for improved readability.", + "type": "comment" + }, + "1314": { + "file_id": 123, + "content": "/applications/Ma-Net/README.md", + "type": "filepath" + }, + "1315": { + "file_id": 123, + "content": "The code is for the PaddleVideo application's MA-Net model, supporting testing and training on DAVIS dataset with pretrained models for stage1 and stage1+stage2. It runs \"run_local.sh\" script to execute local environment for the MA-Net model in PaddleVideo.", + "type": "summary" + }, + "1316": { + "file_id": 123, + "content": "[简体中文](README_cn.md) | English\n# Ma-Net\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n## Introduction\nThis is the paddle implementation of the CVPR2020 paper \"[Memory aggregation networks for efficient interactive video object segmentation](https://arxiv.org/abs/2003.13246)\".\n![avatar](images/1836-teaser.gif)\nThis code currently supports model test and model training on DAVIS dataset, and model inference on any given video will be provided in few days.\n## Data\nPlease refer to DAVIS data download and preparation doc [DAVIS-data](dataloaders/DAVIS2017.md)\n## Train and Test\n- You can download [pertained model for stage1](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/DeeplabV3_coco.pdparams) decompress it for stage1 training。\n- You can download [trained model of stage1](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MaNet_davis2017_stage1.pdparams) decompress it for stage2 training directly skipping stage1 training。\n```", + "type": "code", + "location": "/applications/Ma-Net/README.md:1-35" + }, + "1317": { + "file_id": 123, + "content": "This code is for the PaddleVideo application, specifically Ma-Net, a CVPR2020 paper implementation. It currently supports model testing and training on DAVIS dataset, with inference on any video coming soon. Two pretrained models are provided for stage1 and stage1+stage2 training.", + "type": "comment" + }, + "1318": { + "file_id": 123, + "content": "sh run_local.sh\n```\n- You can download [our model](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MANet_davis2017.pdparams) decompress it for testing.\nTest accuracy in DAVIS2017:\n| J@60 | AUC |\n| :---: | :---: |\n| 0.761 | 0.749 |", + "type": "code", + "location": "/applications/Ma-Net/README.md:36-47" + }, + "1319": { + "file_id": 123, + "content": "This code snippet executes the \"run_local.sh\" script, which is used to run the local environment for the MA-Net model in the PaddleVideo application.", + "type": "comment" + }, + "1320": { + "file_id": 124, + "content": "/applications/Ma-Net/README_cn.md", + "type": "filepath" + }, + "1321": { + "file_id": 124, + "content": "This is the Chinese version of README file for Ma-Net video segmentation model implementation in Paddle. It introduces a Paddle implementation of CVPR 2020 paper, \"Memory aggregation networks for efficient interactive video object segmentation\". 
The code currently supports model training and testing on DAVIS dataset, with future support for model inference on any given video. Download stage1 pre-trained model or trained model for direct training, and use the provided model for testing. Achieved J@60 and AUC scores of 0.761 and 0.749 respectively on DAVIS2017 test set.", + "type": "summary" + }, + "1322": { + "file_id": 124, + "content": "[English](README.md) | 简体中文\n# Ma-Net视频切分模型\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型测试](#模型测试)\n- [模型推理](#模型推理)\n## 模型简介\n这是CVPR2020论文\"[Memory aggregation networks for efficient interactive video object segmentation](https://arxiv.org/abs/2003.13246)\"的Paddle实现。\n![avatar](images/1836-teaser.gif)\n此代码目前支持在 DAVIS 数据集上进行模型测试和模型训练,并且将在之后提供对任何给定视频的模型推理。\n## 数据准备\nDAVIS数据下载及准备请参考[DAVIS2017数据准备](dataloaders/DAVIS2017_cn.md)\n## 模型训练与测试\n- 您可以下载[paddle版本的stage1预训练模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/DeeplabV3_coco.pdparams) 解压缩它以用于训练的第一阶段。\n- 您可以下载[stage1训练结果模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MaNet_davis2017_stage1.pdparams) 解压缩它以直接训练的第二阶段跳过第一阶段的训练。\n ```bash\n sh run.sh\n ```\n- 您可以下载[我们的模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MANet_davis2017.pdparams) 解压缩它以用于测试。\n在 DAVIS2017上的测试精度:\n| J@60 | AUC |\n| :---: | :---: |\n| 0.761 | 0.749 |", + "type": "code", + "location": "/applications/Ma-Net/README_cn.md:1-46" + }, + "1323": { + "file_id": 124, + "content": "This is the Chinese version of README file for Ma-Net video segmentation model implementation in Paddle. It introduces a Paddle implementation of CVPR 2020 paper, \"Memory aggregation networks for efficient interactive video object segmentation\". The code currently supports model training and testing on DAVIS dataset, with future support for model inference on any given video. Download stage1 pre-trained model or trained model for direct training, and use the provided model for testing. Achieved J@60 and AUC scores of 0.761 and 0.749 respectively on DAVIS2017 test set.", + "type": "comment" + }, + "1324": { + "file_id": 125, + "content": "/applications/Ma-Net/config.py", + "type": "filepath" + }, + "1325": { + "file_id": 125, + "content": "The code imports libraries and sets up an argument parser for PaddleVideo's Ma-Net application. It defines a function, configures model parameters, and trains video object detection tasks. 
The `--TEST_CHECKPOINT` and `--TEST_MODE` arguments are initialized with default values, and the training epoch count is calculated based on batch size and total steps.", + "type": "summary" + }, + "1326": { + "file_id": 125, + "content": "import paddle\nimport argparse\nimport os\nimport sys\nimport cv2\nimport time\ndef str2bool(v):\n if isinstance(v, bool):\n return v\n if v.lower() in ('yes', 'true', 't', 'y', '1'):\n return True\n elif v.lower() in ('no', 'false', 'f', 'n', '0'):\n return False\n else:\n raise argparse.ArgumentTypeError('Boolean value expected.')\nparser = argparse.ArgumentParser(description='intvos config')\nparser.add_argument('--ROOT_DIR',\n type=str,\n default=os.path.abspath(\n os.path.join(os.path.dirname(\"__file__\"))))\nparser.add_argument('--EXP_NAME', type=str, default='deeplabv3+coco')\nparser.add_argument('--SAVE_RESULT_DIR', type=str, default='../afs/result/')\nparser.add_argument('--SAVE_VOS_RESULT_DIR', type=str, default='')\nparser.add_argument('--NUM_WORKER', type=int, default=4)\nparser.add_argument('--KNNS', type=int, default=1)\nparser.add_argument('--PRETRAINED_MODEL',\n type=str,\n default='./model_best.pth.tar')", + "type": "code", + "location": "/applications/Ma-Net/config.py:1-32" + }, + "1327": { + "file_id": 125, + "content": "This code is importing necessary libraries and defining a function. It also sets up an argument parser, and provides default values for various parameters including the root directory, experiment name, save result directories, number of workers, KNNs, and pre-trained model path.", + "type": "comment" + }, + "1328": { + "file_id": 125, + "content": "parser.add_argument(\n '--RESULT_ROOT',\n type=str,\n default=os.path.join('../afs/vos_result/result_total_80000'))\n######DATA_CONFIG\nparser.add_argument('--DATA_NAME', type=str, default='COCO2017')\nparser.add_argument('--DATA_AUG', type=str2bool, default=True)\nparser.add_argument('--DATA_WORKERS', type=int, default=4)\nparser.add_argument('--DATA_RESCALE', type=int, default=416)\nparser.add_argument('--DATA_RANDOMCROP', type=int, default=416)\nparser.add_argument('--DATA_RANDOMROTATION', type=int, default=0)\nparser.add_argument('--DATA_RANDOM_H', type=int, default=10)\nparser.add_argument('--DATA_RANDOM_S', type=int, default=10)\nparser.add_argument('--DATA_RANDOM_V', type=int, default=10)\nparser.add_argument('--DATA_RANDOMFLIP', type=float, default=0.5)\nparser.add_argument('--DATA_ROOT', type=str, default='../data/DAVIS')\n######MODEL_CONFIG\nparser.add_argument('--MODEL_NAME', type=str, default='deeplabv3plus')\nparser.add_argument('--MODEL_BACKBONE', type=str, default='res101_atrous')\nparser.add_argument('--MODEL_OUTPUT_STRIDE', type=int, default=16)", + "type": "code", + "location": "/applications/Ma-Net/config.py:33-53" + }, + "1329": { + "file_id": 125, + "content": "This code snippet is from the 'config.py' file in the PaddleVideo/applications/Ma-Net directory, and it defines command line arguments for the application. It sets default values for parameters related to result storage location, data configuration, and model configuration. 
These arguments can be overridden when running the application by specifying them on the command line.", + "type": "comment" + }, + "1330": { + "file_id": 125, + "content": "parser.add_argument('--MODEL_ASPP_OUTDIM', type=int, default=256)\nparser.add_argument('--MODEL_SHORTCUT_DIM', type=int, default=48)\nparser.add_argument('--MODEL_SHORTCUT_KERNEL', type=int, default=1)\nparser.add_argument('--MODEL_NUM_CLASSES', type=int, default=21)\nparser.add_argument('--MODEL_SEMANTIC_EMBEDDING_DIM', type=int, default=100)\nparser.add_argument('--MODEL_HEAD_EMBEDDING_DIM', type=int, default=256)\nparser.add_argument('--MODEL_LOCAL_DOWNSAMPLE', type=str2bool, default=True)\nparser.add_argument('--MODEL_MAX_LOCAL_DISTANCE', type=int, default=12)\nparser.add_argument('--MODEL_SELECT_PERCENT', type=float, default=0.8)\nparser.add_argument('--MODEL_USEIntSeg', type=str2bool, default=False)\n######TRAIN_CONFIG\nparser.add_argument('--TRAIN_LR', type=float, default=0.0007)\nparser.add_argument('--TRAIN_LR_GAMMA', type=float, default=0.1)\nparser.add_argument('--TRAIN_MOMENTUM', type=float, default=0.9)\nparser.add_argument('--TRAIN_WEIGHT_DECAY', type=float, default=0.00004)\nparser.add_argument('--TRAIN_POWER', type=float, default=0.9)", + "type": "code", + "location": "/applications/Ma-Net/config.py:54-70" + }, + "1331": { + "file_id": 125, + "content": "This code snippet is from the \"config.py\" file in PaddleVideo's Ma-Net application, and it sets various model parameters like output dimension, shortcut dimensions, kernel size, number of classes, embedding dimensions, downsampling method, selection percentage, and training parameters such as learning rate, gamma, momentum, weight decay, and power. These parameters are used to configure and train the Ma-Net model for video object detection tasks.", + "type": "comment" + }, + "1332": { + "file_id": 125, + "content": "parser.add_argument('--TRAIN_BATCH_SIZE', type=int, default=2)\nparser.add_argument('--TRAIN_SHUFFLE', type=str2bool, default=True)\nparser.add_argument('--TRAIN_CLIP_GRAD_NORM', type=float, default=5.)\nparser.add_argument('--TRAIN_MINEPOCH', type=int, default=9)\nparser.add_argument('--TRAIN_TOTAL_STEPS', type=int, default=101000)\nparser.add_argument('--TRAIN_LOSS_LAMBDA', type=int, default=0)\nparser.add_argument('--TRAIN_TBLOG', type=str2bool, default=False)\nparser.add_argument('--TRAIN_BN_MOM', type=float,\n default=0.9997) # fixed. difs between paddle and torch.\nparser.add_argument('--TRAIN_TOP_K_PERCENT_PIXELS', type=float, default=0.15)\nparser.add_argument('--TRAIN_HARD_MINING_STEP', type=int, default=50000)\nparser.add_argument('--TRAIN_LR_STEPSIZE', type=int, default=2000)\nparser.add_argument('--TRAIN_INTER_USE_TRUE_RESULT',\n type=str2bool,\n default=True)\nparser.add_argument('--TRAIN_RESUME_DIR', type=str, default='')\nparser.add_argument('--LOG_DIR', type=str, default=os.path.join('./log'))", + "type": "code", + "location": "/applications/Ma-Net/config.py:71-88" + }, + "1333": { + "file_id": 125, + "content": "This code snippet is part of the configuration file for the Ma-Net application in PaddleVideo. 
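As an illustration, the defaults can also be overridden programmatically, which is equivalent to passing the same flags on the command line (this uses the `parser` built in this file; the values are arbitrary):

```python
cfg = parser.parse_args(['--TRAIN_BATCH_SIZE', '4', '--TRAIN_LR', '0.001'])
print(cfg.TRAIN_BATCH_SIZE, cfg.TRAIN_LR)   # 4 0.001
```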
It includes various arguments and their default values for training the model, such as batch size, shuffling, gradient norm, number of epochs, total steps, loss lambda, logging settings, BN momentum, top K percent pixels, hard mining step, LR step size, and resuming from a specific directory.", + "type": "comment" + }, + "1334": { + "file_id": 125, + "content": "parser.add_argument('--TEST_CHECKPOINT',\n type=str,\n default='save_step_100000.pth')\nparser.add_argument('--TEST_MODE', type=str2bool, default=False)\ncfg = parser.parse_args()\ncfg.TRAIN_EPOCHS = int(200000 * cfg.TRAIN_BATCH_SIZE / 60.)", + "type": "code", + "location": "/applications/Ma-Net/config.py:90-96" + }, + "1335": { + "file_id": 125, + "content": "This code snippet initializes the `--TEST_CHECKPOINT` and `--TEST_MODE` arguments using default values, then calculates the number of training epochs based on the batch size and the total number of steps.", + "type": "comment" + }, + "1336": { + "file_id": 126, + "content": "/applications/Ma-Net/dataloaders/DAVIS2017.md", + "type": "filepath" + }, + "1337": { + "file_id": 126, + "content": "This code snippet provides instructions for downloading the DAVIS2017 dataset and organizing its folder structure within the PaddleVideo project directory. It also provides a link to access the file \"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\" if needed.", + "type": "summary" + }, + "1338": { + "file_id": 126, + "content": "[简体中文](../../zh-CN/dataset/DAVIS2017.md) | English\n# DAVIS2017 Data Preparation\n## 1.Data Download\nDownload [DAVIS2017](https://data.vision.ee.ethz.ch/csergi/share/davis/DAVIS-2017-trainval-480p.zip) and [scribbles](https://data.vision.ee.ethz.ch/csergi/share/DAVIS-Interactive/DAVIS-2017-scribbles-trainval.zip) into one folder. Please refer to [DAVIS](https://davischallenge.org/davis2017/code.html).\nIf you need the file \"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\", please refer to the link [google]( https://drive.google.com/file/d/1aLPaQ_5lyAi3Lk3d2fOc_xewSrfcrQlc/view?usp=sharing)\n## 2.Folder Structure\nIn the context of the whole project (for Ma-Net only), the folder structure will look like:\n```shell\nPaddleVideo\n├── configs\n├── paddlevideo\n├── docs\n├── tools\n├── data\n│ \t└── DAVIS2017\n│ │ \t├── Annotations\n│ │ \t├── ImageSets\n│ │ \t├── JPEGImages\n│ │ \t└── Scribbles\n```", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/DAVIS2017.md:1-27" + }, + "1339": { + "file_id": 126, + "content": "This code snippet provides instructions for downloading the DAVIS2017 dataset and organizing its folder structure within the PaddleVideo project directory. It also provides a link to access the file \"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\" if needed.", + "type": "comment" + }, + "1340": { + "file_id": 127, + "content": "/applications/Ma-Net/dataloaders/DAVIS2017_cn.md", + "type": "filepath" + }, + "1341": { + "file_id": 127, + "content": "This code provides instructions on how to download and organize the DAVIS2017 dataset for use in the Ma-Net application. 
It includes links to the necessary datasets, such as the DAVIS-2017-trainval-480p.zip and DAVIS-2017-scribbles-trainval.zip files, and provides a template directory structure for organizing the data within the PaddleVideo project.", + "type": "summary" + }, + "1342": { + "file_id": 127, + "content": "[English](../../en/dataset/DAVIS2017.md) | 简体中文\n# DAVIS2017 数据集准备\n## 1.数据下载\n下载 [DAVIS2017](https://data.vision.ee.ethz.ch/csergi/share/davis/DAVIS-2017-trainval-480p.zip) 和 [scribbles](https://data.vision.ee.ethz.ch/csergi/share/DAVIS-Interactive/DAVIS-2017-scribbles-trainval.zip)到同一个文件夹中。请参阅[DAVIS](https://davischallenge.org/davis2017/code.html).\n如果您需要文件\"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\",请参阅[google](https://drive.google.com/file/d/1aLPaQ_5lyAi3Lk3d2fOc_xewSrfcrQlc/view?usp=sharing)链接\n## 2.目录结构\n整个项目(Ma-Net)的目录结构如下所示:\n```shell\nPaddleVideo\n├── configs\n├── paddlevideo\n├── docs\n├── tools\n├── data\n│ \t└── DAVIS2017\n│ │ \t├── Annotations\n│ │ \t├── ImageSets\n│ │ \t├── JPEGImages\n│ │ \t└── Scribbles\n```", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/DAVIS2017_cn.md:1-27" + }, + "1343": { + "file_id": 127, + "content": "This code provides instructions on how to download and organize the DAVIS2017 dataset for use in the Ma-Net application. It includes links to the necessary datasets, such as the DAVIS-2017-trainval-480p.zip and DAVIS-2017-scribbles-trainval.zip files, and provides a template directory structure for organizing the data within the PaddleVideo project.", + "type": "comment" + }, + "1344": { + "file_id": 128, + "content": "/applications/Ma-Net/dataloaders/custom_transforms_f.py", + "type": "filepath" + }, + "1345": { + "file_id": 128, + "content": "This code performs data augmentation using resizing, cropping, and scaling/rotating transformations with cv2 libraries, offering fixed or random options. It also initializes segmentation variables, computes dilated areas, generates edge masks, and handles various data types for PaddlePaddle's video object detection task.", + "type": "summary" + }, + "1346": { + "file_id": 128, + "content": "import os\nimport random\nimport cv2\nimport numpy as np\nimport paddle\nfrom PIL import Image\nimport dataloaders.helpers as helpers\nfrom davisinteractive.utils.operations import bresenham\nfrom paddle.vision.transforms import functional as F\ncv2.setNumThreads(0)\nNEW_BRANCH = True\nclass Resize(object):\n \"\"\"Rescale the image in a sample to a given size.\n Args:\n output_size (tuple or int): Desired output size. If tuple, output is\n matched to output_size. If int, smaller of image edges is matched\n to output_size keeping aspect ratio the same.\n \"\"\"\n def __init__(self, output_size):\n assert isinstance(output_size, (int, tuple))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)\n else:\n self.output_size = output_size\n # self.seg_interpolation = cv2.INTER_CUBIC if is_continuous else cv2.INTER_NEAREST\n # self.fix = fix\n def __call__(self, sample):\n img1 = sample['img1']\n # img2 = sample['img2']", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:1-35" + }, + "1347": { + "file_id": 128, + "content": "The code defines a Resize class that rescales images in a sample to the given output size. It accepts either an integer for uniform resizing or a tuple for specific dimensions. If the output size is not compatible with the image aspect ratio, it will be scaled proportionally. 
This class can process samples with one or more images (e.g., 'img1' and 'img2').", + "type": "comment" + }, + "1348": { + "file_id": 128, + "content": " # ref_img=sample['ref_img']\n h, w = img1.shape[:2]\n if self.output_size == (h, w):\n return sample\n else:\n new_h, new_w = self.output_size\n new_h, new_w = int(new_h), int(new_w)\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, dsize=(new_w, new_h), interpolation=flagval)\n sample[elem] = tmp\n return sample\nclass RandomCrop(object):\n \"\"\"Crop randomly the image in a sample.\n Args:\n output_size (tuple or int): Desired output size. If int, square crop\n is made.\n \"\"\"\n def __init__(self, output_size, step=None):\n assert isinstance(output_size, (int, tuple))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:36-69" + }, + "1349": { + "file_id": 128, + "content": "This code is a custom transform that resizes images in a sample to a specific output size. It checks if the current image size matches the desired output size, and if not, it uses cv2.resize() function to resize each image in the sample while maintaining aspect ratio for specified elements (img1, img2, ref_img) using INTER_CUBIC interpolation and others using INTER_NEAREST. It returns the modified sample with images resized according to the output size specified. The RandomCrop class is used to crop an image randomly to a given output size.", + "type": "comment" + }, + "1350": { + "file_id": 128, + "content": " else:\n assert len(output_size) == 2\n self.output_size = output_size\n self.step = step\n def __call__(self, sample):\n image = sample['img1']\n h, w = image.shape[:2]\n new_h, new_w = self.output_size\n new_h = h if new_h >= h else new_h\n new_w = w if new_w >= w else new_w\n is_contain_obj = False\n if self.step is None:\n while not is_contain_obj:\n # step += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = sample['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(new_ref_scribble_label)) == 1:\n continue\n else:\n for elem in sample.keys():\n if 'meta' in elem:", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:70-98" + }, + "1351": { + "file_id": 128, + "content": "This code is part of a custom transform for image cropping. It takes an input sample, selects a random top and left position to crop the image, and checks if the corresponding reference scribble label has enough unique elements to proceed. If not, it continues selecting new positions until it finds one with enough unique elements in the scribble label. 
The step variable keeps track of how many times this loop has been executed.", + "type": "comment" + }, + "1352": { + "file_id": 128, + "content": " continue\n tmp = sample[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]\n sample[elem] = tmp\n break\n else:\n st = 0\n while not is_contain_obj and st < self.step:\n st += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = sample['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(\n new_ref_scribble_label)) == 1 or st < self.step - 1:\n continue\n else:\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:99-124" + }, + "1353": { + "file_id": 128, + "content": "This code is randomly selecting a region in the image and adjusting it to the new size while ensuring that there is at least one object present in the cropped region. It then updates the corresponding image and label based on this new crop.", + "type": "comment" + }, + "1354": { + "file_id": 128, + "content": " sample[elem] = tmp\n break\n return sample\nclass ScaleNRotate(object):\n \"\"\"Scale (zoom-in, zoom-out) and Rotate the image and the ground truth.\n Args:\n two possibilities:\n 1. rots (tuple): (minimum, maximum) rotation angle\n scales (tuple): (minimum, maximum) scale\n 2. rots [list]: list of fixed possible rotation angles\n scales [list]: list of fixed possible scales\n \"\"\"\n def __init__(self, rots=(-30, 30), scales=(.75, 1.25)):\n assert (isinstance(rots, type(scales)))\n self.rots = rots\n self.scales = scales\n def __call__(self, sample):\n if type(self.rots) == tuple:\n # Continuous range of scales and rotations\n rot = (self.rots[1] - self.rots[0]) * random.random() - \\\n (self.rots[1] - self.rots[0]) / 2\n sc = (self.scales[1] - self.scales[0]) * random.random() - \\\n (self.scales[1] - self.scales[0]) / 2 + 1\n elif type(self.rots) == list:", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:125-154" + }, + "1355": { + "file_id": 128, + "content": "The code defines a class called \"ScaleNRotate\" which applies scaling and rotation transformations to images and their corresponding ground truth. It takes two possible arguments for rotations and scales, either as tuples or lists. If the argument is a tuple, it randomly selects a rotation and scale within the defined range. If the argument is a list, it applies one of the fixed possible rotations and scales from the provided list. 
The code also initializes the instance variables \"rots\" and \"scales\" based on the input arguments.", + "type": "comment" + }, + "1356": { + "file_id": 128, + "content": " # Fixed range of scales and rotations\n rot = self.rots[random.randint(0, len(self.rots))]\n sc = self.scales[random.randint(0, len(self.scales))]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n h, w = tmp.shape[:2]\n center = (w / 2, h / 2)\n assert (center != 0) # Strange behaviour warpAffine\n M = cv2.getRotationMatrix2D(center, rot, sc)\n if ((tmp == 0) | (tmp == 1)).all():\n flagval = cv2.INTER_NEAREST\n else:\n flagval = cv2.INTER_CUBIC\n tmp = cv2.warpAffine(tmp, M, (w, h), flags=flagval)\n sample[elem] = tmp\n return sample\nclass RandomScale(object):\n \"\"\"Randomly resize the image and the ground truth to specified scales.\n Args:\n scales (list): the list of scales\n \"\"\"\n def __init__(self, scales=[0.75, 1, 1.25]):\n self.scales = scales\n def __call__(self, sample):", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:155-189" + }, + "1357": { + "file_id": 128, + "content": "This code applies random scaling, rotation, and warping to an image and its corresponding metadata. It selects a random scale and rotation from predefined ranges for each element in the sample, adjusting the shape of the image and preserving its center point. The cv2.warpAffine function is used to perform the transformation, using interpolation flags based on whether the original image contains only 0s and 1s or not. Finally, it returns the transformed sample.", + "type": "comment" + }, + "1358": { + "file_id": 128, + "content": " # Fixed range of scales\n sc = self.scales[random.randint(0, len(self.scales) - 1)]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, None, fx=sc, fy=sc, interpolation=flagval)\n sample[elem] = tmp\n return sample\nclass RandomHorizontalFlip(object):\n \"\"\"Horizontally flip the given image and ground truth randomly with a probability of 0.5.\"\"\"\n def __init__(self, prob):\n self.p = prob\n def __call__(self, sample):\n if random.random() < self.p:\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n tmp = cv2.flip(tmp, flipCode=1)\n sample[elem] = tmp\n return sample\nclass SubtractMeanImage(object):", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:191-229" + }, + "1359": { + "file_id": 128, + "content": "The code includes classes for resizing, horizontally flipping, and subtracting the mean image from input data. The resizing function adjusts image size based on a randomly chosen scale from a fixed range. The RandomHorizontalFlip class flips images with a probability of 0.5. 
The SubtractMeanImage class subtracts a pre-calculated mean image from input images, presumably to normalize pixel values.", + "type": "comment" + }, + "1360": { + "file_id": 128, + "content": " def __init__(self, mean, change_channels=False):\n self.mean = mean\n self.change_channels = change_channels\n def __call__(self, sample):\n for elem in sample.keys():\n if 'image' in elem:\n if self.change_channels:\n sample[elem] = sample[elem][:, :, [2, 1, 0]]\n sample[elem] = np.subtract(\n sample[elem], np.array(self.mean, dtype=np.float32))\n return sample\n def __str__(self):\n return 'SubtractMeanImage' + str(self.mean)\nclass CustomScribbleInteractive(object):\n def __init__(self,\n scribbles,\n first_frame,\n dilation=9,\n nocare_area=None,\n bresenham=True,\n use_previous_mask=False,\n previous_mask_path=None):\n self.scribbles = scribbles\n self.dilation = dilation\n self.nocare_area = nocare_area\n self.bresenham = bresenham\n self.first_frame = first_frame", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:230-261" + }, + "1361": { + "file_id": 128, + "content": "This code defines two classes, 'SubtractMeanImage' and 'CustomScribbleInteractive'. The former subtracts the mean from each image in a sample to normalize them. The latter initializes an object for custom scribble interactive functionality with parameters like scribbles, first frame, dilation, nocare_area, bresenham, use_previous_mask, and previous_mask_path.", + "type": "comment" + }, + "1362": { + "file_id": 128, + "content": " self.use_previous_mask = use_previous_mask\n self.previous_mask_path = previous_mask_path\n def __call__(self, sample):\n meta = sample['meta']\n frame_num = int(meta['frame_id'])\n im_size = meta['im_size']\n # Initialize gt to zeros, no-care areas to ones\n scr_gt = np.zeros(im_size)\n scr_nocare = np.ones(im_size)\n mask = np.zeros(im_size)\n mask_neg = np.zeros(im_size)\n # Get all the scribbles for the current frame\n for scribble in self.scribbles[frame_num]:\n points_scribble = np.round(\n np.array(scribble['path']) * np.array(\n (im_size[1], im_size[0]))).astype(int)\n if self.bresenham and len(points_scribble) > 1:\n all_points = bresenham(points_scribble)\n else:\n all_points = points_scribble\n # Check if scribble is of same id to mark as foreground, otherwise as background\n if scribble['object_id'] == meta['obj_id']:", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:262-288" + }, + "1363": { + "file_id": 128, + "content": "This code initializes variables for segmentation mask, no-care area, and scribbles. It iterates over the scribbles of a specific frame and determines whether the scribble is foreground or background based on the object ID. 
The Bresenham algorithm is applied if specified in the configuration to generate all points for each scribble.", + "type": "comment" + }, + "1364": { + "file_id": 128, + "content": " mask[all_points[:, 1] - 1, all_points[:, 0] - 1] = 1\n else:\n mask_neg[all_points[:, 1] - 1, all_points[:, 0] - 1] = 1\n if self.nocare_area is None:\n nz = np.where(mask > 0)\n nocare_area = int(.5 * np.sqrt(\n (nz[0].max() - nz[0].min()) * (nz[1].max() - nz[1].min())))\n else:\n nocare_area = 100\n # In case we are reading the first human annotation round\n if frame_num == self.first_frame:\n # Compute dilated foreground, background, and no-care area\n scr_gt, scr_nocare = helpers.gt_from_scribble(\n mask, dilation=self.dilation, nocare_area=nocare_area)\n scr_gt_neg, _ = helpers.gt_from_scribble(mask_neg,\n dilation=self.dilation,\n nocare_area=None)\n # Negative examples included in the training\n scr_gt[scr_gt_neg > 0] = 0\n scr_nocare[scr_gt_neg > 0] = 0", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:289-310" + }, + "1365": { + "file_id": 128, + "content": "This code segment appears to be responsible for generating ground truth (GT) masks from human-drawn scribbles. If the first frame is encountered, it computes dilated foreground and background masks along with a no-care area. It also excludes negative examples from the training set. The mask and nocare_area are computed based on the conditions in the code snippet.", + "type": "comment" + }, + "1366": { + "file_id": 128, + "content": " # For annotation rounds generated by the robot\n else:\n # Compute dilated foreground, background, and no-care area\n scr_gt_extra, _ = helpers.gt_from_scribble(mask,\n dilation=self.dilation,\n nocare_area=None)\n scr_gt_neg, _ = helpers.gt_from_scribble(mask_neg,\n dilation=self.dilation,\n nocare_area=None)\n # Ignore pixels that are not foreground\n if not self.use_previous_mask:\n scr_nocare_extra = 1. - scr_gt_extra\n else:\n scr_nocare_extra = \\\n (cv2.imread(os.path.join(self.previous_mask_path, meta['seq_name'], str(meta['obj_id']),\n meta['frame_id'] + '.png'), 0) > 0.8 * 255).astype(np.float32)\n # Negative examples included in training", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:312-330" + }, + "1367": { + "file_id": 128, + "content": "This code computes dilated foreground, background, and no-care area for annotation rounds generated by the robot. It first generates scr_gt_extra and scr_gt_neg using the gt_from_scribble function from helpers module. Then it ignores pixels that are not foreground if use_previous_mask is False. Else, it reads a previous mask image, converts it into float32 format and assigns pixel values greater than 0.8*255 to 1. 
These computations will be used in the subsequent operations of the code.", + "type": "comment" + }, + "1368": { + "file_id": 128, + "content": " scr_gt_extra[scr_gt_neg > 0] = 0\n scr_nocare_extra[scr_gt_neg > 0] = 0\n scr_gt = np.maximum(scr_gt, scr_gt_extra)\n scr_nocare_extra[scr_gt > 0] = 0\n scr_nocare = np.minimum(scr_nocare, scr_nocare_extra)\n sample['scribble_gt'] = scr_gt\n sample['scribble_void_pixels'] = scr_nocare\n return sample\nclass ToTensor(object):\n \"\"\"Convert ndarrays in sample to Tensors.\"\"\"\n def __call__(self, sample):\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if tmp.ndim == 2:\n tmp = tmp[:, :, np.newaxis]\n else:\n tmp = tmp / 255.\n tmp -= (0.485, 0.456, 0.406)\n tmp /= (0.229, 0.224, 0.225)\n # swap color axis because\n # numpy image: H x W x C\n # paddle image: C X H X W\n tmp = tmp.transpose([2, 0, 1])\n sample[elem] = paddle.to_tensor(tmp)\n return sample", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:331-366" + }, + "1369": { + "file_id": 128, + "content": "This code is part of a data loader in the Ma-Net application. It transforms image and mask data for PaddlePaddle's video object detection task. The code handles scribble ground truth (scribble_gt) and scribble void pixels (scribble_void_pixels), applying necessary adjustments to ensure correct formatting and values. It then uses the ToTensor class to convert ndarrays in samples to tensors, handling color axis swapping due to differences between numpy and PaddlePaddle image formats.", + "type": "comment" + }, + "1370": { + "file_id": 128, + "content": "class GenerateEdge(object):\n \"\"\"\n \"\"\"\n def __init__(self, edgesize=1):\n self.edgesize = edgesize\n def __call__(self, sample):\n \"\"\"\n \"\"\"\n if \"label2\" in sample:\n label2 = sample['label2']\n kernel_size = 2 * self.edgesize + 1\n maskedge = np.zeros_like(label2)\n maskedge[np.where(label2[:, 1:] != label2[:, :-1])] = 1\n maskedge[np.where(label2[1:, :] != label2[:-1, :])] = 1\n maskedge = cv2.dilate(\n maskedge, np.ones((kernel_size, kernel_size), dtype=np.uint8))\n sample[\"edge_mask\"] = maskedge\n else:\n raise RuntimeError(\n \"We need parsing mask to generate the edge mask.\")\n return sample\nclass GenerateEdge_2(object):\n \"\"\"\n \"\"\"\n def __init__(self, edgesize=1):\n self.edgesize = edgesize\n def __call__(self, sample):\n \"\"\"\n \"\"\"\n if \"ref_frame_gt\" in sample:\n label2 = sample['ref_frame_gt']\n kernel_size = 2 * self.edgesize + 1", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:369-405" + }, + "1371": { + "file_id": 128, + "content": "The code defines two classes, `GenerateEdge` and `GenerateEdge_2`, which generate edge masks from the input samples. The edge masks are generated based on whether there is a \"label2\" or \"ref_frame_gt\" present in the sample. If these labels are present, a kernel is applied to create an edge mask, which is then added to the sample. 
If they are not present, a RuntimeError is raised.", + "type": "comment" + }, + "1372": { + "file_id": 128, + "content": " maskedge = np.zeros_like(label2)\n maskedge[np.where(label2[:, 1:] != label2[:, :-1])] = 1\n maskedge[np.where(label2[1:, :] != label2[:-1, :])] = 1\n maskedge = cv2.dilate(\n maskedge, np.ones((kernel_size, kernel_size), dtype=np.uint8))\n sample[\"edge_mask\"] = maskedge\n else:\n raise RuntimeError(\n \"We need parsing mask to generate the edge mask.\")\n return sample", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/custom_transforms_f.py:406-416" + }, + "1373": { + "file_id": 128, + "content": "This code checks if a parsing mask is provided. If it is, it creates an edge mask by comparing the labels horizontally and vertically. It then dilates the resulting mask using cv2.dilate and assigns it to sample[\"edge_mask\"]. If no parsing mask is provided, it raises a RuntimeError.", + "type": "comment" + }, + "1374": { + "file_id": 129, + "content": "/applications/Ma-Net/dataloaders/davis_2017_f.py", + "type": "filepath" + }, + "1375": { + "file_id": 129, + "content": "This code initializes a DAVIS 2017 dataset class for loading and preprocessing, creates a custom dataloader, prepares input for Ma-Net model, and outputs JSON files with sequence data.", + "type": "summary" + }, + "1376": { + "file_id": 129, + "content": "from __future__ import division\nimport json\nimport os\nimport shutil\nimport numpy as np\nimport paddle, cv2\nfrom random import choice\nfrom paddle.io import Dataset\nimport json\nfrom PIL import Image\nfrom davisinteractive.utils.scribbles import scribbles2mask, annotated_frames\nimport sys\nsys.path.append(\"..\")\nfrom config import cfg\nimport time\nclass DAVIS2017_Test_Manager():\n def __init__(self,\n split='val',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False,\n seq_name=None):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_name = seq_name\n def get_image(self, idx):\n frame_name = str(idx)\n while len(frame_name) != 5:\n frame_name = '0' + frame_name\n imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name), frame_name + '.jpg')\n img = cv2.imread(imgpath)\n img = np.array(img, dtype=np.float32)", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:1-40" + }, + "1377": { + "file_id": 129, + "content": "This code snippet defines a DAVIS2017_Test_Manager class for loading and managing test data from the DAVIS 2017 dataset. It accepts parameters such as split, root directory, transformations to apply, and sequence name. The get_image() method retrieves an image from the specified directory based on the index, reads it into a numpy array, and converts it into float32 data type. 
This class can be used for loading test images in the DAVIS 2017 dataset for further processing or analysis.", + "type": "comment" + }, + "1378": { + "file_id": 129, + "content": " sample = {'img': img}\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\nclass DAVIS2017_Feature_Extract(Dataset):\n def __init__(self,\n split='val',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False,\n seq_name=None):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_name = seq_name\n self.img_list = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(seq_name))))\n def __len__(self):\n return len(self.img_list)\n def __getitem__(self, idx):\n img = self.img_list[idx]\n imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name), img)\n current_img = cv2.imread(imgpath)\n current_img = np.array(current_img, dtype=np.float32)", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:41-73" + }, + "1379": { + "file_id": 129, + "content": "This code is initializing a dataset for DAVIS2017, which contains images and their features. It loads the image list from a specified directory, and applies optional transformations to the samples before returning them. The dataset supports different splits (e.g., training or validation) and allows for specifying an optional sequence name.", + "type": "comment" + }, + "1380": { + "file_id": 129, + "content": " h, w, _ = current_img.shape\n sample = {'img1': current_img}\n sample['meta'] = {\n 'seq_name': self.seq_name,\n 'h_w': (h, w),\n 'img_path': imgpath\n }\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\nclass DAVIS2017_VOS_Test(Dataset):\n \"\"\"\n \"\"\"\n def __init__(self,\n split='val',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False,\n result_root=None,\n seq_name=None):\n self.split = split\n self.db_root_dir = root\n self.result_root = result_root\n self.rgb = rgb\n self.transform = transform\n self.seq_name = seq_name\n self.seq_list_file = os.path.join(\n self.db_root_dir, 'ImageSets', '2017',\n '_'.join(self.split) + '_instances.txt')\n self.seqs = []\n for splt in self.split:\n with open(\n os.path.join(self.db_root_dir, 'ImageSets', '2017',", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:74-109" + }, + "1381": { + "file_id": 129, + "content": "The code defines a DAVIS2017_VOS_Test dataset class which loads data from the DAVIS 2017 dataset for semantic segmentation tasks. It takes various parameters such as split, root directory, transformation function, if RGB images are required, result root directory, and sequence name. 
It reads a list of sequences from a file and returns an image sample along with its metadata (sequence name, height, width, and image path).", + "type": "comment" + }, + "1382": { + "file_id": 129, + "content": " self.split + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n self.seqs.extend(seqs_tmp)\n if not self._check_preprocess():\n self._preprocess()\n assert self.seq_name in self.seq_dict.keys(\n ), '{} not in {} set.'.format(self.seq_name, '_'.join(self.split))\n names_img = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(seq_name))))\n img_list = list(\n map(lambda x: os.path.join('JPEGImages/480p/', str(seq_name), x),\n names_img))\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/',\n str(seq_name))))\n labels = list(\n map(lambda x: os.path.join('Annotations/480p/', str(seq_name), x),\n name_label))\n if not os.path.isfile(", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:110-135" + }, + "1383": { + "file_id": 129, + "content": "The code reads sequences from a file and extends the existing sequence list. It then checks if preprocessing is required and performs it if necessary. The code asserts that the sequence name exists in the dictionary of sequences. Next, it retrieves image names and label names, creating lists of image paths and label paths respectively. Finally, it ensures that a specific file exists.", + "type": "comment" + }, + "1384": { + "file_id": 129, + "content": " os.path.join(self.result_root, seq_name, name_label[0])):\n if not os.path.exists(os.path.join(self.result_root, seq_name)):\n os.makedirs(os.path.join(self.result_root, seq_name))\n shutil.copy(\n os.path.join(self.db_root_dir, labels[0]),\n os.path.join(self.result_root, seq_name, name_label[0]))\n else:\n shutil.copy(\n os.path.join(self.db_root_dir, labels[0]),\n os.path.join(self.result_root, seq_name, name_label[0]))\n self.first_img = names_img[0]\n self.first_label = name_label[0]\n self.img_list = names_img[1:]\n def __len__(self):\n return len(self.img_list)\n def __getitem__(self, idx):\n img = self.img_list[idx]\n imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name), img)\n num_frame = int(img.split('.')[0])\n ref_img = os.path.join(self.db_root_dir, 'JPEGImages/480p/',", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:136-161" + }, + "1385": { + "file_id": 129, + "content": "This code creates a data loader for the DAVIS 2017 dataset. It checks if the result directory exists, and if not, it creates it and copies the label file to the new directory. If the directory already exists, it simply copies the label file. The function then sets the first image and its label, as well as the remaining images in the list. Lastly, it defines the length of the dataset and a method for getting items from the dataset at specific indices.", + "type": "comment" + }, + "1386": { + "file_id": 129, + "content": " str(self.seq_name), self.first_img)\n prev_frame = num_frame - 1\n prev_frame = str(prev_frame)\n while len(prev_frame) != 5:\n prev_frame = '0' + prev_frame\n prev_img = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name),\n prev_frame + '.' 
+ img.split('.')[-1])\n current_img = cv2.imread(imgpath)\n current_img = np.array(current_img, dtype=np.float32)\n ref_img = cv2.imread(ref_img)\n ref_img = np.array(ref_img, dtype=np.float32)\n prev_img = cv2.imread(prev_img)\n prev_img = np.array(prev_img, dtype=np.float32)\n ref_label = os.path.join(self.db_root_dir, 'Annotations/480p/',\n str(self.seq_name), self.first_label)\n ref_label = Image.open(ref_label)\n ref_label = np.array(ref_label, dtype=np.uint8)\n prev_label = os.path.join(\n self.result_root, str(self.seq_name),", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:162-186" + }, + "1387": { + "file_id": 129, + "content": "The code snippet is responsible for loading images and labels from a specific path. It handles image path formatting, ensures all frames have 5 digits, reads images using cv2, converts them to numpy arrays with float32 dtype, and retrieves reference labels by opening and converting the label file to uint8 dtype.", + "type": "comment" + }, + "1388": { + "file_id": 129, + "content": " prev_frame + '.' + self.first_label.split('.')[-1])\n prev_label = Image.open(prev_label)\n prev_label = np.array(prev_label, dtype=np.uint8)\n obj_num = self.seq_dict[self.seq_name][-1]\n sample = {\n 'ref_img': ref_img,\n 'prev_img': prev_img,\n 'current_img': current_img,\n 'ref_label': ref_label,\n 'prev_label': prev_label\n }\n sample['meta'] = {\n 'seq_name': self.seq_name,\n 'frame_num': num_frame,\n 'obj_num': obj_num,\n 'current_name': img\n }\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\n def _check_preprocess(self):\n _seq_list_file = self.seq_list_file\n if not os.path.isfile(_seq_list_file):\n return False\n else:\n self.seq_dict = json.load(open(self.seq_list_file, 'r'))\n return True\n def _preprocess(self):\n self.seq_dict = {}\n for seq in self.seqs:", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:187-219" + }, + "1389": { + "file_id": 129, + "content": "This code appears to be part of a data loader for a video object detection task. It loads frames and labels from a specific dataset (DAVIS 2017) and creates samples for each frame. The _check_preprocess function checks if the sequence list file exists, and if so, it loads the dictionary of sequences. 
The _preprocess function initializes an empty dictionary for the sequence dictionary and iterates over the specified sequences to process them.", + "type": "comment" + }, + "1390": { + "file_id": 129, + "content": " # Read object masks and get number of objects\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/', seq)))\n label_path = os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq, name_label[0])\n _mask = np.array(Image.open(label_path))\n _mask_ids = np.unique(_mask)\n n_obj = _mask_ids[-1]\n self.seq_dict[seq] = list(range(1, n_obj + 1))\n with open(self.seq_list_file, 'w') as outfile:\n outfile.write('{{\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]])))\n for ii in range(1, len(self.seqs)):\n outfile.write(',\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]])))\n outfile.write('\\n}\\n')\n print('Preprocessing finished')\nclass DAVIS2017_VOS_Train(Dataset):\n \"\"\"DAVIS2017 dataset for training", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:220-244" + }, + "1391": { + "file_id": 129, + "content": "This code reads object masks from DAVIS 2017 dataset, obtains the number of objects, and creates a dictionary containing sequence names as keys and their corresponding unique object IDs as values. The dictionary is then saved to a file in JSON format for further use in the DAVIS2017_VOS_Train class, which serves as the training dataset for the DAVIS 2017 dataset.", + "type": "comment" + }, + "1392": { + "file_id": 129, + "content": " Return: imgs: N*2*3*H*W,label: N*2*1*H*W, seq-name: N, frame_num:N\n \"\"\"\n def __init__(self,\n split='train',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_list_file = os.path.join(\n self.db_root_dir, 'ImageSets', '2017',\n '_'.join(self.split) + '_instances.txt')\n self.seqs = []\n for splt in self.split:\n with open(\n os.path.join(self.db_root_dir, 'ImageSets', '2017',\n self.split + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n self.seqs.extend(seqs_tmp)\n self.imglistdic = {}\n if not self._check_preprocess():\n self._preprocess()\n self.sample_list = []\n for seq_name in self.seqs:\n images = np.sort(", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:246-273" + }, + "1393": { + "file_id": 129, + "content": "This code initializes a class for loading and preprocessing data from the DAVIS 2017 dataset. It takes parameters such as split, root directory, transformation functions, and RGB mode. 
The code reads sequence lists and checks if pre-processing is necessary before creating a list of samples to be loaded.", + "type": "comment" + }, + "1394": { + "file_id": 129, + "content": " os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n seq_name.strip())))\n images_path = list(\n map(\n lambda x: os.path.join('JPEGImages/480p/', seq_name.strip(),\n x), images))\n lab = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq_name.strip())))\n lab_path = list(\n map(\n lambda x: os.path.join('Annotations/480p/', seq_name.strip(\n ), x), lab))\n self.imglistdic[seq_name] = (images, lab)\n def __len__(self):\n return len(self.seqs)\n def __getitem__(self, idx):\n seqname = self.seqs[idx]\n imagelist, lablist = self.imglistdic[seqname]\n prev_img = np.random.choice(imagelist[:-1], 1)\n prev_img = prev_img[0]\n frame_num = int(prev_img.split('.')[0]) + 1", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:274-299" + }, + "1395": { + "file_id": 129, + "content": "The code defines a class for loading data from the DAVIS 2017 dataset, extracting image and annotation files based on the given sequence name. It also provides methods to get the length of the dataset and retrieve specific items by index. The dataset is organized into 'JPEGImages/480p' and 'Annotations/480p' directories with corresponding sequence names. It selects a random previous image from the list, increments its frame number by 1 to get the next image, and returns both the image and annotation files.", + "type": "comment" + }, + "1396": { + "file_id": 129, + "content": " next_frame = str(frame_num)\n while len(next_frame) != 5:\n next_frame = '0' + next_frame\n ###############################Processing two adjacent frames and labels\n img2path = os.path.join('JPEGImages/480p/', seqname,\n next_frame + '.' + prev_img.split('.')[-1])\n img2 = cv2.imread(os.path.join(self.db_root_dir, img2path))\n img2 = np.array(img2, dtype=np.float32)\n imgpath = os.path.join('JPEGImages/480p/', seqname, prev_img)\n img1 = cv2.imread(os.path.join(self.db_root_dir, imgpath))\n img1 = np.array(img1, dtype=np.float32)\n ###############\n labelpath = os.path.join(\n 'Annotations/480p/', seqname,\n prev_img.split('.')[0] + '.' + lablist[0].split('.')[-1])\n label1 = Image.open(os.path.join(self.db_root_dir, labelpath))\n label2path = os.path.join('Annotations/480p/', seqname,\n next_frame + '.' + lablist[0].split('.')[-1])\n label2 = Image.open(os.path.join(self.db_root_dir, label2path))", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:300-320" + }, + "1397": { + "file_id": 129, + "content": "Processing two adjacent frames and labels: Reads next image, prepares previous image and their corresponding labels from file paths.", + "type": "comment" + }, + "1398": { + "file_id": 129, + "content": " label1 = np.array(label1, dtype=np.uint8)\n label2 = np.array(label2, dtype=np.uint8)\n ###################\n ref_img = np.random.choice(imagelist, 1)\n ref_img = ref_img[0]\n ref_img_name = ref_img\n ref_scribble_label = Image.open(\n os.path.join(\n self.db_root_dir, 'Annotations/480p/', seqname,\n ref_img_name.split('.')[0] + '.' + lablist[0].split('.')[-1]))\n ref_scribble_label = np.array(ref_scribble_label, dtype=np.uint8)\n while len(np.unique(ref_scribble_label)) < self.seq_dict[seqname][\n -1] + 1 or ref_img == prev_img or ref_img == (\n next_frame + '.' 
+ prev_img.split('.')[-1]):\n ref_img = np.random.choice(imagelist, 1)\n ref_img = ref_img[0]\n ref_img_name = ref_img\n ref_scribble_label = Image.open(\n os.path.join(\n self.db_root_dir, 'Annotations/480p/', seqname,\n ref_img_name.split('.')[0] + '.' +", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:322-344" + }, + "1399": { + "file_id": 129, + "content": "This code randomly selects a reference image and associated scribble label for each video frame, ensuring the labels are unique and not from the same or consecutive frames. It also ensures that the selected images have corresponding annotations in the 480p folder.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/14.json b/docs/data/14.json new file mode 100644 index 000000000..d2f433ce9 --- /dev/null +++ b/docs/data/14.json @@ -0,0 +1,549 @@ +{ + "1400": { + "file_id": 129, + "content": " lablist[0].split('.')[-1]))\n ref_scribble_label = np.array(ref_scribble_label, dtype=np.int64)\n ref_img = os.path.join('JPEGImages/480p/', seqname, ref_img)\n ref_img = cv2.imread(os.path.join(self.db_root_dir, ref_img))\n ref_img = np.array(ref_img, dtype=np.float32)\n ####\n ###################\n if self.rgb:\n img1 = img1[:, :, [2, 1, 0]]\n img2 = img2[:, :, [2, 1, 0]]\n ref_img = ref_img[:, :, [2, 1, 0]]\n obj_num = self.seq_dict[seqname][-1]\n sample = {\n 'ref_img': ref_img,\n 'img1': img1,\n 'img2': img2,\n 'ref_scribble_label': ref_scribble_label,\n 'label1': label1,\n 'label2': label2\n }\n sample['meta'] = {\n 'seq_name': seqname,\n 'frame_num': frame_num,\n 'obj_num': obj_num\n }\n if self.transform is not None:\n sample = self.transform(sample)\n sample['ref_scribble_label'] = paddle.to_tensor(", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:345-374" + }, + "1401": { + "file_id": 129, + "content": "This code reads an image, splits it into RGB channels if required, and stores it in a dictionary along with other images and labels. It also assigns metadata to the sample. The transform is applied if not None.", + "type": "comment" + }, + "1402": { + "file_id": 129, + "content": " sample['ref_scribble_label'], dtype='int64')\n sample['label1'] = paddle.to_tensor(sample['label1'], dtype='int64')\n sample['label2'] = paddle.to_tensor(sample['label2'], dtype='int64')\n return sample\n ########################\n def _check_preprocess(self):\n _seq_list_file = self.seq_list_file\n if not os.path.isfile(_seq_list_file):\n return False\n else:\n self.seq_dict = json.load(open(self.seq_list_file, 'r'))\n return True\n def _preprocess(self):\n self.seq_dict = {}\n for seq in self.seqs:\n # Read object masks and get number of objects\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/', seq)))\n label_path = os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq, name_label[0])\n _mask = np.array(Image.open(label_path))\n _mask_ids = np.unique(_mask)", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:375-400" + }, + "1403": { + "file_id": 129, + "content": "The code defines a function that loads and preprocesses data from a specific source. 
It checks if the sequence list file exists and then proceeds to read object masks, getting the number of objects in each sequence.", + "type": "comment" + }, + "1404": { + "file_id": 129, + "content": " n_obj = _mask_ids[-1]\n self.seq_dict[seq] = list(range(1, n_obj + 1))\n with open(self.seq_list_file, 'w') as outfile:\n outfile.write('{{\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]])))\n for ii in range(1, len(self.seqs)):\n outfile.write(',\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]])))\n outfile.write('\\n}\\n')\n print('Preprocessing finished')\nclass DAVIS2017_Train(Dataset):\n \"\"\"DAVIS2017 dataset for training\n Return: imgs: N*2*3*H*W,label: N*2*1*H*W, seq-name: N, frame_num:N\n \"\"\"\n def __init__(self,\n split='train',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_list_file = os.path.join(\n self.db_root_dir, 'ImageSets', '2017',", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:401-431" + }, + "1405": { + "file_id": 129, + "content": "This code defines a class for the DAVIS2017 dataset used in training. It initializes the dataset object based on specified parameters, writes a sequence list file containing frame numbers for each sequence, and provides the functionality to load images, masks, and other data required for training.", + "type": "comment" + }, + "1406": { + "file_id": 129, + "content": " '_'.join(self.split) + '_instances.txt')\n self.seqs = []\n for splt in self.split:\n with open(\n os.path.join(self.db_root_dir, 'ImageSets', '2017',\n self.split + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n self.seqs.extend(seqs_tmp)\n if not self._check_preprocess():\n self._preprocess()\n self.sample_list = []\n for seq_name in self.seqs:\n images = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n seq_name.strip())))\n images_path = list(\n map(\n lambda x: os.path.join('JPEGImages/480p/', seq_name.strip(),\n x), images))\n lab = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/',", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:432-456" + }, + "1407": { + "file_id": 129, + "content": "This code is creating a custom dataloader for the DAVIS dataset. It loads the image and annotation files, sorts them by name, checks if preprocessing needs to be done, and then forms a sample list containing the image paths and labels. 
The result will be used for training or testing purposes in the Ma-Net application.", + "type": "comment" + }, + "1408": { + "file_id": 129, + "content": " seq_name.strip())))\n lab_path = list(\n map(\n lambda x: os.path.join('Annotations/480p/', seq_name.strip(\n ), x), lab))\n for img_path, label_path in zip(images_path[:-1], lab_path[:-1]):\n tmp_dic = {\n 'img': img_path,\n 'label': label_path,\n 'seq_name': seq_name,\n 'frame_num': img_path.split('/')[-1].split('.')[0]\n }\n self.sample_list.append(tmp_dic)\n def __len__(self):\n return len(self.sample_list)\n def __getitem__(self, idx):\n tmp_sample = self.sample_list[idx]\n imgpath = tmp_sample['img']\n labelpath = tmp_sample['label']\n seqname = tmp_sample['seq_name']\n frame_num = int(tmp_sample['frame_num']) + 1\n next_frame = str(frame_num)\n while len(next_frame) != 5:\n next_frame = '0' + next_frame\n ###############################Processing two adjacent frames and labels", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:457-485" + }, + "1409": { + "file_id": 129, + "content": "The code creates a custom dataloader for a dataset with two adjacent frames and their corresponding labels. It takes the images and label paths, appends them to a list of dictionaries, and handles any necessary padding to ensure frame numbers have 5 digits. The length of the dataloader is determined by the number of samples in the sample_list, and the __getitem__ method retrieves specific samples based on their index.", + "type": "comment" + }, + "1410": { + "file_id": 129, + "content": " img2path = os.path.join('JPEGImages/480p/', seqname,\n next_frame + '.' + imgpath.split('.')[-1])\n img2 = cv2.imread(os.path.join(self.db_root_dir, img2path))\n img2 = np.array(img2, dtype=np.float32)\n img1 = cv2.imread(os.path.join(self.db_root_dir, imgpath))\n img1 = np.array(img1, dtype=np.float32)\n ###############\n label1 = Image.open(os.path.join(self.db_root_dir, labelpath))\n label2path = os.path.join('Annotations/480p/', seqname,\n next_frame + '.' + labelpath.split('.')[-1])\n label2 = Image.open(os.path.join(self.db_root_dir, label2path))\n label1 = np.array(\n label1, dtype=np.int32\n ) # fixed, uint8->int32, because layers.stack does not support uint8\n label2 = np.array(\n label2, dtype=np.int32\n ) # fixed, uint8->int32, because layers.stack does not support uint8\n ###################\n ref_tmp_dic = self.ref_frame_dic[seqname]", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:486-506" + }, + "1411": { + "file_id": 129, + "content": "The code reads image and label files for a video sequence from their respective directories, converts them to numpy arrays of dtype float32 and int32 respectively for compatibility with the model's stack function. 
It also retrieves reference frame information from ref_frame_dic for the given sequence.", + "type": "comment" + }, + "1412": { + "file_id": 129, + "content": " ref_img = ref_tmp_dic['ref_frame']\n ref_scribble_label = ref_tmp_dic['scribble_label']\n ref_img = cv2.imread(os.path.join(self.db_root_dir, ref_img))\n ref_img = np.array(ref_img, dtype=np.float32)\n ref_frame_gt = ref_tmp_dic['ref_frame_gt']\n ref_frame_gt = Image.open(os.path.join(self.db_root_dir, ref_frame_gt))\n ref_frame_gt = np.array(\n ref_frame_gt, dtype=np.int32\n ) # fixed, uint8->int32, because layers.stack does not support uint8\n ref_frame_num = ref_tmp_dic['ref_frame_num']\n ###################\n if self.rgb:\n img1 = img1[:, :, [2, 1, 0]]\n img2 = img2[:, :, [2, 1, 0]]\n ref_img = ref_img[:, :, [2, 1, 0]]\n obj_num = self.seq_dict[seqname][-1]\n sample = {\n 'ref_img': ref_img,\n 'img1': img1,\n 'img2': img2,\n 'ref_scribble_label': ref_scribble_label,\n 'label1': label1,\n 'label2': label2,\n 'ref_frame_gt': ref_frame_gt", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:507-531" + }, + "1413": { + "file_id": 129, + "content": "This code reads the reference image, scribble label, and ground truth frame from a dictionary. It then converts them to appropriate data types for processing. If rgb=True, it changes the color order. It also gets the total number of objects in the sequence. Finally, it creates a sample dictionary with all these elements.", + "type": "comment" + }, + "1414": { + "file_id": 129, + "content": " }\n if 'prev_round_label' in ref_tmp_dic:\n prev_round_label = ref_tmp_dic['prev_round_label']\n prev_round_label = prev_round_label.squeeze()\n prev_round_label = prev_round_label.numpy()\n sample = {\n 'ref_img': ref_img,\n 'img1': img1,\n 'img2': img2,\n 'ref_scribble_label': ref_scribble_label,\n 'label1': label1,\n 'label2': label2,\n 'ref_frame_gt': ref_frame_gt,\n 'prev_round_label': prev_round_label\n }\n sample['meta'] = {\n 'seq_name': seqname,\n 'frame_num': frame_num,\n 'obj_num': obj_num,\n 'ref_frame_num': ref_frame_num\n }\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\n def update_ref_frame_and_label(self,\n round_scribble=None,\n frame_num=None,\n prev_round_label_dic=None):", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:532-562" + }, + "1415": { + "file_id": 129, + "content": "This code defines a function that creates a sample containing images, labels, and metadata for the Ma-Net model. It also includes a separate function to update the reference frame and label based on user input. 
The sample is then transformed using a specified transform if one is provided.", + "type": "comment" + }, + "1416": { + "file_id": 129, + "content": " ##########Update reference frame and scribbles\n for seq in self.seqs:\n scribble = round_scribble[seq]\n if frame_num is None:\n scr_frame = annotated_frames(scribble)[0]\n else:\n scr_frame = frame_num[seq]\n scr_frame = int(scr_frame)\n scr_f = str(scr_frame)\n while len(scr_f) != 5:\n scr_f = '0' + scr_f\n ref_frame_path = os.path.join('JPEGImages/480p', seq,\n scr_f + '.jpg')\n #######################\n ref_frame_gt = os.path.join('Annotations/480p/', seq,\n scr_f + '.png')\n #########################\n ref_tmp = cv2.imread(os.path.join(self.db_root_dir, ref_frame_path))\n h_, w_ = ref_tmp.shape[:2]\n scribble_masks = scribbles2mask(scribble, (h_, w_))\n if frame_num is None:\n scribble_label = scribble_masks[scr_frame]", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:563-585" + }, + "1417": { + "file_id": 129, + "content": "Updating the reference frame and scribbles for each sequence in the dataset. If no frame number is given, uses the first frame from annotated_frames list. Ensures frame number is 5 digits long. Retrieves the corresponding reference image path and ground truth mask path. Reads the reference image. Resizes the image based on its height and width. Generates scribble masks for each frame using the provided scribble. If no frame number given, selects the first frame's scribble mask.", + "type": "comment" + }, + "1418": { + "file_id": 129, + "content": " else:\n scribble_label = scribble_masks[0]\n self.ref_frame_dic[seq] = {\n 'ref_frame': ref_frame_path,\n 'scribble_label': scribble_label,\n 'ref_frame_gt': ref_frame_gt,\n 'ref_frame_num': scr_frame\n }\n if prev_round_label_dic is not None:\n self.ref_frame_dic[seq] = {\n 'ref_frame': ref_frame_path,\n 'scribble_label': scribble_label,\n 'ref_frame_gt': ref_frame_gt,\n 'ref_frame_num': scr_frame,\n 'prev_round_label': prev_round_label_dic[seq]\n }\n def init_ref_frame_dic(self):\n self.ref_frame_dic = {}\n scribbles_path = os.path.join(self.db_root_dir, 'Scribbles')\n for seq in self.seqs:\n selected_json = np.random.choice(\n ['001.json', '002.json', '003.json'], 1)\n selected_json = selected_json[0]\n scribble = os.path.join(self.db_root_dir, 'Scribbles', seq,", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:586-610" + }, + "1419": { + "file_id": 129, + "content": "The code initializes a dictionary for reference frames, storing information such as reference frame path, scribble label, and ground truth data. If there is a previous round's label dictionary, it also includes the previous round's label in the current dictionary entry. 
It uses the database root directory to find the Scribbles folder and selects a random JSON file for each sequence.", + "type": "comment" + }, + "1420": { + "file_id": 129, + "content": " selected_json)\n with open(scribble) as f:\n scribble = json.load(f)\n # print(scribble)\n scr_frame = annotated_frames(scribble)[0]\n scr_f = str(scr_frame)\n while len(scr_f) != 5:\n scr_f = '0' + scr_f\n ref_frame_path = os.path.join('JPEGImages/480p', seq,\n scr_f + '.jpg')\n ref_tmp = cv2.imread(\n os.path.join(self.db_root_dir, ref_frame_path))\n h_, w_ = ref_tmp.shape[:2]\n scribble_masks = scribbles2mask(scribble, (h_, w_))\n ########################\n ref_frame_gt = os.path.join('Annotations/480p/', seq,\n scr_f + '.png')\n ########################\n scribble_label = scribble_masks[scr_frame]\n self.ref_frame_dic[seq] = {\n 'ref_frame': ref_frame_path,", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:611-633" + }, + "1421": { + "file_id": 129, + "content": "Reading JSON file for annotated frame, extracting frame path and loading reference image using OpenCV, determining the shape of the reference image, extracting the mask from the scribble, storing reference frame path in ref_frame_dic.", + "type": "comment" + }, + "1422": { + "file_id": 129, + "content": " 'scribble_label': scribble_label,\n 'ref_frame_gt': ref_frame_gt,\n 'ref_frame_num': scr_frame\n }\n ########################\n def _check_preprocess(self):\n _seq_list_file = self.seq_list_file\n if not os.path.isfile(_seq_list_file):\n return False\n else:\n self.seq_dict = json.load(open(self.seq_list_file, 'r'))\n return True\n def _preprocess(self):\n self.seq_dict = {}\n for seq in self.seqs:\n # Read object masks and get number of objects\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/', seq)))\n label_path = os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq, name_label[0])\n _mask = np.array(Image.open(label_path))\n _mask_ids = np.unique(_mask)\n n_obj = _mask_ids[-1]\n self.seq_dict[seq] = list(range(1, n_obj + 1))", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:634-662" + }, + "1423": { + "file_id": 129, + "content": "The code reads a list of sequences from the sequence_list file and checks if it exists. If the file does not exist, it returns False; otherwise, it loads the sequence dictionary using json.load() and then proceeds to preprocess each sequence by reading object masks and finding the number of objects in the masks. The code stores this information in a dictionary format for later use.", + "type": "comment" + }, + "1424": { + "file_id": 129, + "content": " with open(self.seq_list_file, 'w') as outfile:\n outfile.write('{{\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]])))\n for ii in range(1, len(self.seqs)):\n outfile.write(',\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]])))\n outfile.write('\\n}\\n')\n print('Preprocessing finished')", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/davis_2017_f.py:664-672" + }, + "1425": { + "file_id": 129, + "content": "The code writes a JSON file containing video sequences and their corresponding dictionaries, which will be used for the dataset. 
It iterates over each sequence, formats the output as JSON strings in the file, and finishes by printing \"Preprocessing finished\".", + "type": "comment" + }, + "1426": { + "file_id": 130, + "content": "/applications/Ma-Net/dataloaders/helpers.py", + "type": "filepath" + }, + "1427": { + "file_id": 130, + "content": "The code defines functions for converting tensors to images, applying masks, normalizing images, and constructing model names. It also includes functions for computing foreground and nocare area using OpenCV's dilation operation with optional parameters, returning the 'nocare' along with the original foreground image.", + "type": "summary" + }, + "1428": { + "file_id": 130, + "content": "import numpy as np\nimport cv2\ndef tens2image(im):\n tmp = np.squeeze(im.numpy())\n if tmp.ndim == 2:\n return tmp\n else:\n return tmp.transpose((1, 2, 0))\ndef overlay_mask(im, ma, color=np.array([255, 0, 0]) / 255.0):\n assert np.max(im) <= 1.0\n ma = ma.astype(np.bool)\n im = im.astype(np.float32)\n alpha = 0.5\n fg = im * alpha + np.ones(\n im.shape) * (1 - alpha) * color # np.array([0,0,255])/255.0\n # Whiten background\n alpha = 1.0\n bg = im.copy()\n bg[ma == 0] = im[ma == 0] * alpha + np.ones(im[ma == 0].shape) * (1 - alpha)\n bg[ma == 1] = fg[ma == 1]\n # [-2:] is s trick to be compatible both with opencv 2 and 3\n contours = cv2.findContours(ma.copy().astype(np.uint8), cv2.RETR_TREE,\n cv2.CHAIN_APPROX_SIMPLE)[-2:]\n cv2.drawContours(bg, contours[0], -1, (0.0, 0.0, 0.0), 1)\n return bg\ndef im_normalize(im):\n \"\"\"\n Normalize image\n \"\"\"\n imn = (im - im.min()) / max((im.max() - im.min()), 1e-8)\n return imn\ndef construct_name(p, prefix):", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/helpers.py:1-46" + }, + "1429": { + "file_id": 130, + "content": "ens2image: Converts a tensor to an image by removing dimensions and transposing if necessary.\n\noverlay_mask: Applies a mask on top of an image, allowing for color overlay and background adjustment.\n\nim_normalize: Normalizes an image by scaling pixel values between 0 and 1 based on the range of values.", + "type": "comment" + }, + "1430": { + "file_id": 130, + "content": " \"\"\"\n Construct the name of the model\n p: dictionary of parameters\n prefix: the prefix\n name: the name of the model - manually add \".pth\" to follow the convention\n \"\"\"\n name = prefix\n for key in p.keys():\n if (type(p[key]) != tuple) and (type(p[key]) != list):\n name = name + '_' + str(key) + '-' + str(p[key])\n else:\n name = name + '_' + str(key) + '-' + str(p[key][0])\n return name\ndef gt_from_scribble(scr, dilation=11, nocare_area=21):\n # Compute foreground\n if scr.max() == 1:\n kernel_fg = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (dilation, dilation))\n fg = cv2.dilate(scr.astype(np.uint8),\n kernel=kernel_fg).astype(scr.dtype)\n else:\n fg = scr\n # Compute nocare area\n if nocare_area is None:\n nocare = None\n else:\n kernel_nc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (nocare_area, nocare_area))", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/helpers.py:47-78" + }, + "1431": { + "file_id": 130, + "content": "The code defines a function to construct the name of a model by concatenating keys and values from the input dictionary. 
It also includes two additional functions: one for computing foreground based on a given scribble image, and another for computing a nocare area with optional dilation and size parameters.", + "type": "comment" + }, + "1432": { + "file_id": 130, + "content": " nocare = cv2.dilate(fg, kernel=kernel_nc) - fg\n return fg, nocare", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/helpers.py:79-81" + }, + "1433": { + "file_id": 130, + "content": "The code uses OpenCV's dilation operation to enhance the background care region by dilating the foreground image with a given kernel. The resulting 'nocare' is then returned along with the original foreground image.", + "type": "comment" + }, + "1434": { + "file_id": 131, + "content": "/applications/Ma-Net/dataloaders/samplers.py", + "type": "filepath" + }, + "1435": { + "file_id": 131, + "content": "The code defines a RandomIdentitySampler class that randomly samples N identities and K instances from a dataset, generating a random sample of identities and instances with the ability to replace or not while selecting new instances.", + "type": "summary" + }, + "1436": { + "file_id": 131, + "content": "from __future__ import absolute_import\nfrom collections import defaultdict\nimport numpy as np\nimport paddle\nfrom paddle.io import Sampler\nclass RandomIdentitySampler(Sampler):\n \"\"\"\n Randomly sample N identities, then for each identity,\n randomly sample K instances, therefore batch size is N*K.\n Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/data/sampler.py.\n Args:\n data_source (Dataset): dataset to sample from.\n num_instances (int): number of instances per identity.\n \"\"\"\n def __init__(self, sample_list, num_instances=1):\n self.sample_list = sample_list\n self.num_instances = num_instances\n self.index_dic = defaultdict(list)\n for index, tmp_dic in enumerate(self.sample_list):\n pid = tmp_dic['seq_name']\n self.index_dic[pid].append(index)\n self.pids = list(self.index_dic.keys())\n self.num_identities = len(self.pids)\n def __iter__(self):\n indices = np.random.permutation(self.num_identities)", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/samplers.py:1-31" + }, + "1437": { + "file_id": 131, + "content": "This code defines a RandomIdentitySampler class that randomly samples N identities and then K instances from a given dataset, resulting in a batch size of N*K. It imports necessary libraries and initializes class variables. The __iter__ method generates a random permutation of the identities.", + "type": "comment" + }, + "1438": { + "file_id": 131, + "content": " ret = []\n for i in indices:\n pid = self.pids[i]\n t = self.index_dic[pid]\n replace = False if len(t) >= self.num_instances else True\n t = np.random.choice(t, size=self.num_instances, replace=replace)\n ret.extend(t)\n return iter(ret)\n def __len__(self):\n return self.num_identities * self.num_instances", + "type": "code", + "location": "/applications/Ma-Net/dataloaders/samplers.py:32-42" + }, + "1439": { + "file_id": 131, + "content": "This code generates a random sample of identities and instances from a given list of indices. It checks if the length of the current index is greater than or equal to the number of desired instances, and then chooses either to replace or not while selecting new instances. The selected instances are stored in a list and returned as an iterator. 
The method also provides the total number of samples by multiplying the number of identities with the number of instances.", + "type": "comment" + }, + "1440": { + "file_id": 132, + "content": "/applications/Ma-Net/networks/IntVOS.py", + "type": "filepath" + }, + "1441": { + "file_id": 132, + "content": "The model employs PaddlePaddle for object matching, k-nearest neighbor search, SpatialCorrelationSampler for pairwise distances, and a neural network with separable convolutional layers for semantic segmentation. The Ma-Net's int_seghead updates global and local maps for sequence processing and performs tensor operations for video object segmentation.", + "type": "summary" + }, + "1442": { + "file_id": 132, + "content": "import os\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport sys\nsys.path.append(\"..\")\nfrom config import cfg\nimport time\nimport paddle.nn.functional as F\nfrom utils.api import int_, float_, long_\nfrom utils.api import kaiming_normal_\n#############################################################GLOBAL_DIST_MAP\nMODEL_UNFOLD = True\nWRONG_LABEL_PADDING_DISTANCE = 1e20\ndef _pairwise_distances(x, y, ys=None):\n \"\"\"Computes pairwise squared l2 distances between tensors x and y.\n Args:\n x: Tensor of shape [n, feature_dim].\n y: Tensor of shape [m, feature_dim].\n Returns:\n Float32 distances tensor of shape [n, m].\n \"\"\"\n xs = paddle.sum(x * x, 1)\n xs = xs.unsqueeze(1)\n if ys is None:\n ys = paddle.sum(y * y, 1)\n ys = ys.unsqueeze(0)\n else:\n ys = ys\n d = xs + ys - 2. * paddle.matmul(x, paddle.t(y))\n return d, ys\n##################\ndef _flattened_pairwise_distances(reference_embeddings, query_embeddings, ys):\n \"\"\"Calculates flattened tensor of pairwise distances between ref and query.", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:1-42" + }, + "1443": { + "file_id": 132, + "content": "This code snippet is from a PaddlePaddle-based video object detection model. It defines functions for calculating pairwise distances between embeddings and initializes some global variables. The model is designed to take reference and query embeddings as input, compute pairwise squared L2 distances, and returns the flattened tensor of distances. 
This distance calculation is likely used in the matching process of objects in the video frames.", + "type": "comment" + }, + "1444": { + "file_id": 132, + "content": " Args:\n reference_embeddings: Tensor of shape [..., embedding_dim],\n the embedding vectors for the reference frame\n query_embeddings: Tensor of shape [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n Returns:\n A distance tensor of shape [reference_embeddings.size / embedding_dim,\n query_embeddings.size / embedding_dim]\n \"\"\"\n embedding_dim = query_embeddings.shape[-1]\n reference_embeddings = reference_embeddings.reshape([-1, embedding_dim])\n first_dim = -1\n query_embeddings = query_embeddings.reshape([first_dim, embedding_dim])\n dists, ys = _pairwise_distances(query_embeddings, reference_embeddings, ys)\n return dists, ys\ndef _nn_features_per_object_for_chunk(reference_embeddings, query_embeddings,\n wrong_label_mask, k_nearest_neighbors,\n ys):\n \"\"\"Extracts features for each object using nearest neighbor attention.\n Args:\n reference_embeddings: Tensor of shape [n_chunk, embedding_dim],", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:43-65" + }, + "1445": { + "file_id": 132, + "content": "The code calculates the distance between reference and query embeddings, performing pairwise distances calculations using the _pairwise_distances function. The result is a distance tensor with shape [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim]. This function also includes _nn_features_per_object_for_chunk which extracts features for each object using nearest neighbor attention.", + "type": "comment" + }, + "1446": { + "file_id": 132, + "content": " the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [m_chunk, embedding_dim], the embedding\n vectors for the query frames.\n wrong_label_mask:\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m_chunk, n_objects, feature_dim].\n \"\"\"\n # reference_embeddings_key = reference_embeddings\n # query_embeddings_key = query_embeddings\n dists, ys = _flattened_pairwise_distances(reference_embeddings,\n query_embeddings, ys)\n dists = (paddle.unsqueeze(dists, 1) +\n paddle.unsqueeze(float_(wrong_label_mask), 0) *\n WRONG_LABEL_PADDING_DISTANCE)\n if k_nearest_neighbors == 1:\n features = paddle.min(dists, 2, keepdim=True)\n else:\n dists, _ = paddle.topk(-dists, k=k_nearest_neighbors, axis=2)\n dists = -dists\n valid_mask = (dists < WRONG_LABEL_PADDING_DISTANCE)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:66-88" + }, + "1447": { + "file_id": 132, + "content": "This code calculates pairwise distances between reference and query embedding vectors, selects the k-nearest neighbors, and returns the nearest neighbor features. 
It takes into account a wrong_label_mask and padding distance, which helps handle incorrect labels and avoid noisy data.", + "type": "comment" + }, + "1448": { + "file_id": 132, + "content": " masked_dists = dists * valid_mask.float()\n pad_dist = paddle.max(masked_dists, axis=2, keepdim=True)[0].tile(\n (1, 1, masked_dists.shape[-1]))\n dists = paddle.where(valid_mask, dists, pad_dist)\n # take mean of distances\n features = paddle.mean(dists, axis=2, keepdim=True)\n return features, ys\n###\ndef _selected_pixel(ref_labels_flat, ref_emb_flat):\n index_list = paddle.arange(len(ref_labels_flat))\n index_list = index_list\n index_ = paddle.masked_select(index_list, ref_labels_flat != -1)\n index_ = long_(index_)\n ref_labels_flat = paddle.index_select(ref_labels_flat, index_, 0)\n ref_emb_flat = paddle.index_select(ref_emb_flat, index_, 0)\n return ref_labels_flat, ref_emb_flat\n###\ndef _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n ref_obj_ids, k_nearest_neighbors, n_chunks):\n \"\"\"Calculates the nearest neighbor features per object in chunks to save mem.", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:89-118" + }, + "1449": { + "file_id": 132, + "content": "The code calculates the nearest neighbor features for each object in chunks to save memory. It starts by masking and averaging distances between reference and query embeddings, then selects relevant indices from flattened arrays to calculate nearest neighbors for objects. The function takes `reference_embeddings_flat`, `query_embeddings_flat`, `reference_labels_flat`, `ref_obj_ids`, `k_nearest_neighbors`, and `n_chunks` as input and returns the features and labels.", + "type": "comment" + }, + "1450": { + "file_id": 132, + "content": " Uses chunking to bound the memory use.\n Args:\n reference_embeddings_flat: Tensor of shape [n, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings_flat: Tensor of shape [m, embedding_dim], the embedding\n vectors for the query frames.\n reference_labels_flat: Tensor of shape [n], the class labels of the\n reference frame.\n ref_obj_ids: int tensor of unique object ids in the reference labels.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m, n_objects, feature_dim].\n \"\"\"\n chunk_size = int_(\n np.ceil((float_(query_embeddings_flat.shape[0]) / n_chunks).numpy()))\n if cfg.TEST_MODE:\n reference_labels_flat, reference_embeddings_flat = _selected_pixel(\n reference_labels_flat, reference_embeddings_flat)\n wrong_label_mask = (reference_labels_flat != paddle.unsqueeze(", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:119-141" + }, + "1451": { + "file_id": 132, + "content": "This code performs k-nearest neighbor search using chunking to save memory. It takes embedding vectors for reference and query frames, their class labels, object ids, the number of nearest neighbors, and the number of chunks as input. It calculates the chunk size based on the number of query frames and the specified number of chunks. If TEST_MODE is enabled, it selects some pixels from the input. Then, it checks if the reference labels are equal to unsqueezed object ids for each query frame and creates a mask for wrong labels. 
It returns nearest neighbor features of shape [m, n_objects, feature_dim].", + "type": "comment" + }, + "1452": { + "file_id": 132, + "content": " ref_obj_ids, 1))\n all_features = []\n for n in range(n_chunks):\n if n == 0:\n ys = None\n if n_chunks == 1:\n query_embeddings_flat_chunk = query_embeddings_flat\n else:\n chunk_start = n * chunk_size\n chunk_end = (n + 1) * chunk_size\n query_embeddings_flat_chunk = query_embeddings_flat[\n chunk_start:chunk_end]\n features, ys = _nn_features_per_object_for_chunk(\n reference_embeddings_flat, query_embeddings_flat_chunk,\n wrong_label_mask, k_nearest_neighbors, ys)\n all_features.append(features)\n if n_chunks == 1:\n nn_features = all_features[0]\n else:\n nn_features = paddle.concat(all_features, axis=0)\n return nn_features\ndef nearest_neighbor_features_per_object(reference_embeddings,\n query_embeddings,\n reference_labels,\n k_nearest_neighbors,\n gt_ids=None,", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:142-169" + }, + "1453": { + "file_id": 132, + "content": "This code calculates nearest neighbor features for each object across multiple chunks. It splits the query embeddings into different chunks, then computes the features for each chunk individually. If there is only one chunk, it returns the features directly. Otherwise, it concatenates all the computed features along axis 0 and returns them as nearest neighbor features.", + "type": "comment" + }, + "1454": { + "file_id": 132, + "content": " n_chunks=100):\n \"\"\"Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n reference_embeddings: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n reference_labels: Tensor of shape [height, width, 1], the class labels of\n the reference frame.\n max_neighbors_per_object: Integer, the maximum number of candidates\n for the nearest neighbor query per object after subsampling,\n or 0 for no subsampling.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n gt_ids: Int tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame. If None, it will be derived from", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:170-186" + }, + "1455": { + "file_id": 132, + "content": "This function calculates the distance between nearest neighbors in reference_embeddings and query_embeddings for each object. It uses the provided reference_labels to determine objects, subsamples if max_neighbors_per_object is specified, and considers k_nearest_neighbors. 
The gt_ids are used for determining unique ground truth ids in the first frame.", + "type": "comment" + }, + "1456": { + "file_id": 132, + "content": " reference_labels.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [n_query_images, height, width, n_objects, feature_dim].\n gt_ids: An int32 tensor of the unique sorted object ids present\n in the reference labels.\n \"\"\"\n assert (reference_embeddings.shape[:2] == reference_labels.shape[:2])\n h, w, _ = query_embeddings.shape\n reference_labels_flat = reference_labels.reshape([-1])\n if gt_ids is None:\n ref_obj_ids = paddle.unique(reference_labels_flat)[-1]\n ref_obj_ids = np.arange(0, ref_obj_ids + 1)\n gt_ids = paddle.to_tensor(ref_obj_ids)\n gt_ids = int_(gt_ids)\n else:\n gt_ids = int_(paddle.arange(0, gt_ids + 1))\n embedding_dim = query_embeddings.shape[-1]\n query_embeddings_flat = query_embeddings.reshape([-1, embedding_dim])\n reference_embeddings_flat = reference_embeddings.reshape(\n [-1, embedding_dim])", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:187-211" + }, + "1457": { + "file_id": 132, + "content": "This function calculates the nearest neighbor features for query images using reference embeddings and labels. It first asserts that the shape of reference embeddings matches the shape of reference labels. Then, it flattens the reference labels and checks if gt_ids (ground truth ids) are provided. If not, it finds unique object ids in the reference labels, creates a tensor with those ids, and converts them to integer type. Else, it converts the given gt_ids to integers. The function reshapes the query and reference embeddings, calculates embedding dimensions, and returns the nearest neighbor features and gt_ids.", + "type": "comment" + }, + "1458": { + "file_id": 132, + "content": " nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n gt_ids, k_nearest_neighbors, n_chunks)\n nn_features_dim = nn_features.shape[-1]\n nn_features = nn_features.reshape(\n [1, h, w, gt_ids.shape[0], nn_features_dim])\n return nn_features.cuda(), gt_ids\n########################################################################LOCAL_DIST_MAP\ndef local_pairwise_distances(x, y, max_distance=9):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Optimized implementation using correlation_cost.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 distances tensor of shape\n [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n if cfg.MODEL_LOCAL_DOWNSAMPLE:", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:212-235" + }, + "1459": { + "file_id": 132, + "content": "This code chunk performs nearest neighbor feature extraction for each object in the image, reshapes it, and then returns it along with gt_ids. The local_pairwise_distances function computes pairwise squared l2 distances using a local search window. 
It is used to compare features between different points in an optimized manner, considering a maximum distance per dimension.", + "type": "comment" + }, + "1460": { + "file_id": 132, + "content": " #####\n ori_h, ori_w, _ = x.shape\n x = x.transpose([2, 0, 1]).unsqueeze(0)\n x = F.avg_pool2d(x, (2, 2), (2, 2))\n y = y.transpose([2, 0, 1]).unsqueeze(0)\n y = F.avg_pool2d(y, (2, 2), (2, 2))\n x = x.squeeze(0).transpose([1, 2, 0])\n y = y.squeeze(0).transpose([1, 2, 0])\n corr = cross_correlate(x, y, max_distance=max_distance)\n xs = paddle.sum(x * x, 2, keepdim=True)\n ys = paddle.sum(y * y, 2, keepdim=True)\n ones_ys = paddle.ones_like(ys)\n ys = cross_correlate(ones_ys, ys, max_distance=max_distance)\n d = xs + ys - 2 * corr\n # Boundary should be set to Inf.\n tmp = paddle.zeros_like(d)\n boundary = paddle.equal(\n cross_correlate(ones_ys, ones_ys, max_distance=max_distance), 0)\n d = paddle.where(boundary, tmp.fill_(float_('inf')), d)\n d = (paddle.nn.functional.sigmoid(d) - 0.5) * 2\n d = d.transpose([2, 0, 1]).unsqueeze(0)\n d = F.interpolate(d,\n size=(ori_h, ori_w),", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:236-261" + }, + "1461": { + "file_id": 132, + "content": "This code is performing cross-correlation between two input tensors and applying a boundary condition to the resulting tensor. It then applies sigmoid activation, resizes the tensor to original dimensions, and transposes it back to the original shape before unsqueezing the last dimension.", + "type": "comment" + }, + "1462": { + "file_id": 132, + "content": " mode='bilinear',\n align_corners=True)\n d = d.squeeze(0).transpose([1, 2, 0])\n else:\n corr = cross_correlate(x, y, max_distance=max_distance)\n xs = paddle.sum(x * x, 2, keepdim=True)\n ys = paddle.sum(y * y, 2, keepdim=True)\n ones_ys = paddle.ones_like(ys)\n ys = cross_correlate(ones_ys, ys, max_distance=max_distance)\n d = xs + ys - 2 * corr\n # Boundary should be set to Inf.\n tmp = paddle.zeros_like(d)\n boundary = paddle.equal(\n cross_correlate(ones_ys, ones_ys, max_distance=max_distance), 0)\n d = paddle.where(boundary, tmp.fill_(float_('inf')), d)\n return d\ndef local_pairwise_distances2(x, y, max_distance=9):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Naive implementation using map_fn.\n Used as a slow fallback for when correlation_cost is not available.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:262-287" + }, + "1463": { + "file_id": 132, + "content": "This code calculates the pairwise squared l2 distances between two tensors using either correlation or cross-correlation method, depending on the mode. In correlation mode, it uses bilinear interpolation and aligns corners. Otherwise, it uses cross-correlate function with a max_distance parameter. 
It also handles boundary cases by setting values to infinity where necessary.", + "type": "comment" + }, + "1464": { + "file_id": 132, + "content": " max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 distances tensor of shape\n [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n if cfg.MODEL_LOCAL_DOWNSAMPLE:\n ori_h, ori_w, _ = x.shape\n x = paddle.transpose(x, [2, 0, 1]).unsqueeze(0)\n x = F.avg_pool2d(x, (2, 2), (2, 2))\n y = paddle.transpose(y, [2, 0, 1]).unsqueeze(0)\n y = F.avg_pool2d(y, (2, 2), (2, 2))\n _, channels, height, width = x.shape\n padding_val = 1e20\n padded_y = F.pad(\n y, (max_distance, max_distance, max_distance, max_distance),\n mode='constant',\n value=padding_val)\n offset_y = F.unfold(padded_y, kernel_sizes=[height, width]).reshape(\n [1, channels, height, width, -1])\n x = x.reshape([1, channels, height, width, 1])\n minus = x - offset_y\n dists = paddle.sum(paddle.multiply(minus, minus),\n axis=1).reshape([1, height, width,", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:288-312" + }, + "1465": { + "file_id": 132, + "content": "This code section performs local downsampling on the input tensors x and y. It first transposes the tensors and applies average pooling with a 2x2 kernel to reduce their size. Then, it pads the result of y with a large value, calculates offsets using unfolding, subtracts them from x, and sums squared differences across channels. The result is a distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].", + "type": "comment" + }, + "1466": { + "file_id": 132, + "content": " -1]).transpose([0, 3, 1, 2])\n dists = (paddle.nn.functional.sigmoid(dists) - 0.5) * 2\n dists = F.interpolate(dists,\n size=[ori_h, ori_w],\n mode='bilinear',\n align_corners=True)\n dists = dists.squeeze(0).transpose([1, 2, 0])\n else:\n padding_val = 1e20\n padded_y = nn.functional.pad(\n y, (0, 0, max_distance, max_distance, max_distance, max_distance),\n mode='constant',\n value=padding_val)\n height, width, _ = x.shape\n dists = []\n for y_start in range(2 * max_distance + 1):\n y_end = y_start + height\n y_slice = padded_y[y_start:y_end]\n for x_start in range(2 * max_distance + 1):\n x_end = x_start + width\n offset_y = y_slice[:, x_start:x_end]\n dist = paddle.sum(paddle.pow((x - offset_y), 2), dim=2)\n dists.append(dist)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:313-336" + }, + "1467": { + "file_id": 132, + "content": "This code calculates the distance between a set of 2D points and another point, in a sliding window manner. It handles two cases: when the first point set has been divided into smaller blocks for faster computation, and when it hasn't. 
The result is stored in dists as a list of distance matrices.", + "type": "comment" + }, + "1468": { + "file_id": 132, + "content": " dists = paddle.stack(dists, dim=2)\n return dists\nclass SpatialCorrelationSampler:\n pass\ndef cross_correlate(x, y, max_distance=9):\n \"\"\"Efficiently computes the cross correlation of x and y.\n Optimized implementation using correlation_cost.\n Note that we do not normalize by the feature dimension.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 tensor of shape [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n corr_op = SpatialCorrelationSampler(kernel_size=1,\n patch_size=2 * max_distance + 1,\n stride=1,\n dilation_patch=1,\n padding=0)\n xs = x.transpose(2, 0, 1)\n xs = paddle.unsqueeze(xs, 0)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:337-365" + }, + "1469": { + "file_id": 132, + "content": "This code defines the SpatialCorrelationSampler class and a function called cross_correlate. The cross_correlate function takes two tensors, x and y, of shape [height, width, feature_dim] as inputs. It computes the cross correlation of these tensors using an optimized implementation from the SpatialCorrelationSampler class. The output tensor has a shape of [height, width, (2 * max_distance + 1) ** 2].", + "type": "comment" + }, + "1470": { + "file_id": 132, + "content": " ys = y.transpose(2, 0, 1)\n ys = paddle.unsqueeze(ys, 0)\n corr = corr_op(xs, ys)\n bs, _, _, hh, ww = corr.shape\n corr = corr.reshape([bs, -1, hh, ww])\n corr = paddle.squeeze(corr, 0)\n corr = corr.transpose(1, 2, 0)\n return corr\ndef local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding,\n query_embedding,\n prev_frame_labels,\n gt_ids,\n max_distance=12):\n \"\"\"Computes nearest neighbor features while only allowing local matches.\n Args:\n prev_frame_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the last frame.\n query_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the query frames.\n prev_frame_labels: Tensor of shape [height, width, 1], the class labels of\n the previous frame.\n gt_ids: Int Tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame.\n max_distance: Integer, the maximum distance allowed for local matching.", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:366-392" + }, + "1471": { + "file_id": 132, + "content": "This code is part of the IntVOS model and defines a function that computes nearest neighbor features, allowing only local matches. It takes previous frame embedding, query embedding, previous frame labels, ground truth IDs and maximum distance as input. 
It transposes and reshapes the tensors before returning the computed correlations.", + "type": "comment" + }, + "1472": { + "file_id": 132, + "content": " Returns:\n nn_features: A float32 np.array of nearest neighbor features of shape\n [1, height, width, n_objects, 1].\n \"\"\"\n d = local_pairwise_distances2(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance)\n height, width = prev_frame_embedding.shape[:2]\n if MODEL_UNFOLD:\n labels = float_(prev_frame_labels).transpose([2, 0, 1]).unsqueeze(0)\n padded_labels = F.pad(labels, (\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n ))\n offset_labels = F.unfold(padded_labels,\n kernel_sizes=[height, width],\n strides=[2, 2]).reshape([height, width, -1, 1])\n offset_masks = paddle.equal(\n offset_labels,\n float_(gt_ids).unsqueeze(0).unsqueeze(0).unsqueeze(0))\n else:\n masks = paddle.equal(prev_frame_labels,\n gt_ids.unsqueeze(0).unsqueeze(0))", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:393-421" + }, + "1473": { + "file_id": 132, + "content": "This function calculates the nearest neighbor features using local pairwise distances. If MODEL_UNFOLD is set, it pads and unfolds the labels for offset masks generation. Else, it directly creates masks by comparing prev_frame_labels to gt_ids.", + "type": "comment" + }, + "1474": { + "file_id": 132, + "content": " padded_masks = nn.functional.pad(masks, (\n 0,\n 0,\n max_distance,\n max_distance,\n max_distance,\n max_distance,\n ))\n offset_masks = []\n for y_start in range(2 * max_distance + 1):\n y_end = y_start + height\n masks_slice = padded_masks[y_start:y_end]\n for x_start in range(2 * max_distance + 1):\n x_end = x_start + width\n offset_mask = masks_slice[:, x_start:x_end]\n offset_masks.append(offset_mask)\n offset_masks = paddle.stack(offset_masks, axis=2)\n d_tiled = d.unsqueeze(-1).tile((1, 1, 1, gt_ids.shape[0]))\n pad = paddle.ones_like(d_tiled)\n d_masked = paddle.where(offset_masks, d_tiled, pad)\n dists = paddle.min(d_masked, axis=2)\n dists = dists.reshape([1, height, width, gt_ids.shape[0], 1])\n return dists\n##############################################################\n#################\nclass _res_block(nn.Layer):\n def __init__(self, in_dim, out_dim):", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:422-454" + }, + "1475": { + "file_id": 132, + "content": "This code applies padding to masks and creates offset masks by slicing the padded masks. It then constructs a 3D tensor of offset masks using Paddle's stack function. It also tiles the input data 'd' along the gt_ids dimension and creates a padding tensor. The code then computes the minimum distance between the tiled input and the masked data, resulting in distances tensor. 
Finally, it reshapes the distances tensor to have a specific shape and returns it.\nThe _res_block class is a layer that takes an input dimension (in_dim) and an output dimension (out_dim).", + "type": "comment" + }, + "1476": { + "file_id": 132, + "content": " super(_res_block, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu1 = nn.ReLU()\n self.bn1 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM)\n self.conv2 = nn.Conv2D(out_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu2 = nn.ReLU()\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM)\n def forward(self, x):\n res = x\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x += res\n return x\n####################\nclass IntSegHead(nn.Layer):\n def __init__(self,\n in_dim=(cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3),", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:455-486" + }, + "1477": { + "file_id": 132, + "content": "This code defines a Residual Block and an Instance Segmentation Head for the Ma-Net model. The Residual Block consists of two 3x3 convolutions, batch normalization, and ReLU activations, while the IntSegHead layer takes in a specific input dimension for instance segmentation tasks.", + "type": "comment" + }, + "1478": { + "file_id": 132, + "content": " emb_dim=cfg.MODEL_HEAD_EMBEDDING_DIM):\n super(IntSegHead, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n emb_dim,\n kernel_size=7,\n stride=1,\n padding=3)\n self.bn1 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg.TRAIN_BN_MOM)\n self.relu1 = nn.ReLU(True)\n self.res1 = _res_block(emb_dim, emb_dim)\n self.res2 = _res_block(emb_dim, emb_dim)\n self.conv2 = nn.Conv2D(256, emb_dim, kernel_size=3, stride=1, padding=1)\n self.bn2 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg.TRAIN_BN_MOM)\n self.relu2 = nn.ReLU(True)\n self.conv3 = nn.Conv2D(emb_dim, 1, 1, 1)\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.res1(x)\n x = self.res2(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x = self.conv3(x)\n return x", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:487-513" + }, + "1479": { + "file_id": 132, + "content": "This code defines a neural network class called \"IntSegHead\" for segmentation tasks. It consists of multiple convolutional and batch normalization layers, followed by ReLU activations. 
The output is passed through another convolutional layer before being fed into the final convolutional layer to produce the result.", + "type": "comment" + }, + "1480": { + "file_id": 132, + "content": "class _split_separable_conv2d(nn.Layer):\n def __init__(self, in_dim, out_dim, kernel_size=7):\n super(_split_separable_conv2d, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n in_dim,\n kernel_size=kernel_size,\n stride=1,\n padding=int((kernel_size - 1) / 2),\n groups=in_dim)\n self.relu1 = nn.ReLU(True)\n self.bn1 = paddle.nn.BatchNorm2D(in_dim, momentum=cfg.TRAIN_BN_MOM)\n self.conv2 = nn.Conv2D(in_dim, out_dim, kernel_size=1, stride=1)\n self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM)\n kaiming_normal_(self.conv1.weight, mode='fan_out', nonlinearity='relu')\n kaiming_normal_(self.conv2.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:516-537" + }, + "1481": { + "file_id": 132, + "content": "This code defines a custom layer \"_split_separable_conv2d\" that performs separable convolution using two consecutive 2D convolutions. It consists of two 2D convolutions separated by Batch Normalization and ReLU activation functions. The first convolution is followed by Batch Normalization and ReLU, while the second convolution is also followed by another Batch Normalization and ReLU. Weights are initialized using Kaiming Normal initialization for both convolutions.", + "type": "comment" + }, + "1482": { + "file_id": 132, + "content": " x = self.bn2(x)\n x = self.relu2(x)\n return x\nclass DynamicSegHead(nn.Layer):\n def __init__(self,\n in_dim=(cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3),\n embed_dim=cfg.MODEL_HEAD_EMBEDDING_DIM,\n kernel_size=1):\n super(DynamicSegHead, self).__init__()\n self.layer1 = _split_separable_conv2d(in_dim, embed_dim)\n self.layer2 = _split_separable_conv2d(embed_dim, embed_dim)\n self.layer3 = _split_separable_conv2d(embed_dim, embed_dim)\n self.layer4 = _split_separable_conv2d(embed_dim, embed_dim)\n self.conv = nn.Conv2D(embed_dim, 1, 1, 1)\n kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n x = self.conv(x)\n return x\n##################\n###############\nclass IntVOS(nn.Layer):\n def __init__(self, cfg, feature_extracter):\n super(IntVOS, self).__init__()", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:538-571" + }, + "1483": { + "file_id": 132, + "content": "The code defines a DynamicSegHead class with four split separable convolutional layers, followed by a 1x1 convolution. It also initializes an IntVOS class that takes in the configuration and feature extractor as parameters. 
The classes are used for semantic segmentation tasks.", + "type": "comment" + }, + "1484": { + "file_id": 132, + "content": " self.feature_extracter = feature_extracter ##embedding extractor\n self.feature_extracter.cls_conv = nn.Sequential()\n self.feature_extracter.upsample4 = nn.Sequential()\n self.semantic_embedding = None\n self.seperate_conv = nn.Conv2D(cfg.MODEL_ASPP_OUTDIM,\n cfg.MODEL_ASPP_OUTDIM,\n kernel_size=3,\n stride=1,\n padding=1,\n groups=cfg.MODEL_ASPP_OUTDIM)\n self.bn1 = paddle.nn.BatchNorm2D(cfg.MODEL_ASPP_OUTDIM,\n momentum=cfg.TRAIN_BN_MOM)\n self.relu1 = nn.ReLU(True)\n self.embedding_conv = nn.Conv2D(cfg.MODEL_ASPP_OUTDIM,\n cfg.MODEL_SEMANTIC_EMBEDDING_DIM, 1, 1)\n self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(cfg.MODEL_SEMANTIC_EMBEDDING_DIM,\n momentum=cfg.TRAIN_BN_MOM)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:572-589" + }, + "1485": { + "file_id": 132, + "content": "The code initializes components for a network architecture. It creates feature extractors, convolutional layers, batch normalization layers, and ReLU activation functions to process and extract semantic features from input data. These features will be used for tasks such as object detection or image classification.", + "type": "comment" + }, + "1486": { + "file_id": 132, + "content": " self.semantic_embedding = nn.Sequential(*[\n self.seperate_conv, self.bn1, self.relu1, self.embedding_conv,\n self.bn2, self.relu2\n ])\n for m in self.semantic_embedding:\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')\n self.dynamic_seghead = DynamicSegHead() # propagation segm head\n if cfg.MODEL_USEIntSeg:\n self.inter_seghead = IntSegHead(\n in_dim=cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3)\n else:\n self.inter_seghead = DynamicSegHead(\n in_dim=cfg.MODEL_SEMANTIC_EMBEDDING_DIM +\n 2) # interaction segm head\n def forward(self,\n x=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:590-616" + }, + "1487": { + "file_id": 132, + "content": "The code initializes the network's semantic embedding layer, consisting of a sequence of convolutional layers, and applies Kaiming initialization to the weights. It also creates a dynamic segmentation head (seghead) for propagation and an interaction segmentation head based on the configuration flag MODEL_USEIntSeg. 
The function defines the forward pass for the network, taking in various inputs such as image data, reference labels, and masks.", + "type": "comment" + }, + "1488": { + "file_id": 132, + "content": " global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None):\n x = self.extract_feature(x)\n # print('extract_feature:', x.mean().item())\n ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split(\n x, num_or_sections=3, axis=0)\n if global_map_tmp_dic is None:\n dic = self.prop_seghead(\n ref_frame_embedding, previous_frame_embedding,\n current_frame_embedding, ref_scribble_label,\n previous_frame_mask, normalize_nearest_neighbor_distances,\n use_local_map, seq_names, gt_ids, k_nearest_neighbors,\n global_map_tmp_dic, local_map_dics, interaction_num,\n start_annotated_frame, frame_num, self.dynamic_seghead)\n return dic\n else:\n dic, global_map_tmp_dic = self.prop_seghead(\n ref_frame_embedding, previous_frame_embedding,", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:617-640" + }, + "1489": { + "file_id": 132, + "content": "This code splits the input feature into three parts, then if `global_map_tmp_dic` is None, it passes these parts and other parameters to `prop_seghead()`, which returns a dictionary. If `global_map_tmp_dic` is not None, it also passes `global_map_tmp_dic` as an additional parameter before calling `prop_seghead()`. The function then returns the returned dictionary and updates `global_map_tmp_dic`.", + "type": "comment" + }, + "1490": { + "file_id": 132, + "content": " current_frame_embedding, ref_scribble_label,\n previous_frame_mask, normalize_nearest_neighbor_distances,\n use_local_map, seq_names, gt_ids, k_nearest_neighbors,\n global_map_tmp_dic, local_map_dics, interaction_num,\n start_annotated_frame, frame_num, self.dynamic_seghead)\n return dic, global_map_tmp_dic\n def extract_feature(self, x):\n x = self.feature_extracter(x)\n x = self.semantic_embedding(x)\n return x\n def prop_seghead(self,\n ref_frame_embedding=None,\n previous_frame_embedding=None,\n current_frame_embedding=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n global_map_tmp_dic=None,", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:641-664" + }, + "1491": { + "file_id": 132, + "content": "The code defines a function that takes various inputs including frame embeddings, scribble labels, and masked frames. It performs feature extraction using a predefined feature extractor and semantic embedding. 
The function then returns the extracted features and temporary global map dictionaries.", + "type": "comment" + }, + "1492": { + "file_id": 132, + "content": " local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None,\n dynamic_seghead=None):\n \"\"\"return: feature_embedding,global_match_map,local_match_map,previous_frame_mask\"\"\"\n ###############\n global_map_tmp_dic = global_map_tmp_dic\n dic_tmp = {}\n bs, c, h, w = current_frame_embedding.shape\n if cfg.TEST_MODE:\n scale_ref_scribble_label = float_(ref_scribble_label)\n else:\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n scale_previous_frame_label = paddle.nn.functional.interpolate(\n float_(previous_frame_mask), size=(h, w), mode='nearest')\n # print(scale_previous_frame_label.sum()) # xx\n # print(previous_frame_mask.sum().item()) # xx", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:665-685" + }, + "1493": { + "file_id": 132, + "content": "This code defines a function that takes various inputs and returns feature_embedding, global_match_map, local_match_map, and previous_frame_mask. It performs interpolation on the ref_scribble_label and previous_frame_mask using nearest mode to resize them to the same size as current_frame_embedding.", + "type": "comment" + }, + "1494": { + "file_id": 132, + "content": " scale_previous_frame_label = int_(scale_previous_frame_label)\n # print(scale_previous_frame_label.sum().item()) # xx\n for n in range(bs):\n seq_current_frame_embedding = current_frame_embedding[n]\n seq_ref_frame_embedding = ref_frame_embedding[n]\n seq_prev_frame_embedding = previous_frame_embedding[n]\n seq_ref_frame_embedding = seq_ref_frame_embedding.transpose(\n [1, 2, 0])\n seq_current_frame_embedding = seq_current_frame_embedding.transpose(\n [1, 2, 0])\n seq_ref_scribble_label = scale_ref_scribble_label[n].transpose(\n [1, 2, 0])\n #########Global Map\n nn_features_n, ref_obj_ids = nearest_neighbor_features_per_object(\n reference_embeddings=seq_ref_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_ref_scribble_label,\n k_nearest_neighbors=k_nearest_neighbors,", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:686-704" + }, + "1495": { + "file_id": 132, + "content": "In this code snippet, we see the process of extracting nearest neighbor features per object for each batch of frames. The frames are transposed and labeled before finding the k_nearest_neighbors. 
These operations are performed within a loop for every frame in the batch (bs).", + "type": "comment" + }, + "1496": { + "file_id": 132, + "content": " gt_ids=gt_ids[n],\n n_chunks=10)\n if normalize_nearest_neighbor_distances:\n nn_features_n = (paddle.nn.functional.sigmoid(nn_features_n) -\n 0.5) * 2\n if global_map_tmp_dic is not None: ###when testing, use global map memory\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([104, 1, 1, 1, 1])\n nn_features_n = paddle.where(\n nn_features_n <=\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0),\n nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0))\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n.detach()[0]\n #########################Local dist map\n seq_prev_frame_embedding = seq_prev_frame_embedding.transpose(", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:705-725" + }, + "1497": { + "file_id": 132, + "content": "This code segment checks if the current sequence name exists in the global map temporary dictionary. If it does not exist, a paddle.ones_like(nn_features_n) is created and assigned to the dictionary with shape [104, 1, 1, 1, 1]. Then, the code performs a where operation using nn_features_n, comparing it to the global map value for the current sequence name and frame number. If nn_features_n is less than or equal to the global map value, it remains unchanged; otherwise, the global map value overwrites nn_features_n. The last line transposes seq_prev_frame_embedding before continuing with the next chunk of code.", + "type": "comment" + }, + "1498": { + "file_id": 132, + "content": " [1, 2, 0])\n seq_previous_frame_label = scale_previous_frame_label[n].transpose(\n [1, 2, 0])\n if use_local_map:\n prev_frame_nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_prev_frame_embedding,\n query_embedding=seq_current_frame_embedding,\n prev_frame_labels=seq_previous_frame_label,\n gt_ids=ref_obj_ids,\n max_distance=cfg.MODEL_MAX_LOCAL_DISTANCE)\n else:\n prev_frame_nn_features_n, _ = nearest_neighbor_features_per_object(\n reference_embeddings=seq_prev_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_previous_frame_label,\n k_nearest_neighbors=k_nearest_neighbors,\n gt_ids=gt_ids[n],\n n_chunks=20)\n prev_frame_nn_features_n = (", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:726-745" + }, + "1499": { + "file_id": 132, + "content": "The code is performing nearest neighbor feature extraction for previous frames in a video sequence. It checks if the use_local_map flag is set, and depending on its value, either uses local_previous_frame_nearest_neighbor_features_per_object function or nearest_neighbor_features_per_object function to extract features. If use_local_map is true, it takes previous frame embedding, current frame embedding, previous frame labels, reference object IDs and max distance as inputs. Otherwise, it takes previous frame embeddings, current frame embeddings, previous frame labels, k-nearest neighbors, gt_ids (for current iteration), and number of chunks as inputs. 
The code then assigns the extracted features to prev_frame_nn_features_n variable.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/15.json b/docs/data/15.json new file mode 100644 index 000000000..d7775405c --- /dev/null +++ b/docs/data/15.json @@ -0,0 +1,546 @@ +{ + "1500": { + "file_id": 132, + "content": " paddle.nn.functional.sigmoid(prev_frame_nn_features_n) -\n 0.5) * 2\n# print(prev_frame_nn_features_n.mean().item(), prev_frame_nn_features_n.shape, interaction_num) # o\n#############\n if local_map_dics is not None: ##When testing, use local map memory\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if seq_names[n] not in local_map_dist_dic:\n print(seq_names[n], 'not in local_map_dist_dic')\n local_map_dist_dic[seq_names[n]] = paddle.zeros(104, 9)\n if seq_names[n] not in local_map_tmp_dic:\n print(seq_names[n], 'not in local_map_tmp_dic')\n local_map_tmp_dic[seq_names[n]] = paddle.zeros_like(\n prev_frame_nn_features_n).unsqueeze(0).tile(\n [104, 9, 1, 1, 1, 1])\n local_map_dist_dic[seq_names[n]][\n frame_num[n], interaction_num -\n 1] = 1.0 / (abs(frame_num[n] - start_annotated_frame)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:746-764" + }, + "1501": { + "file_id": 132, + "content": "This code segment is checking if the current sequence name is present in the local map dictionaries for distance and temporary maps. If it's not, it creates new entries with zeros initialized. The local map distance value is then updated based on the frame number and interaction number, using the absolute difference from a start annotated frame to determine the distance. This could be used in a video sequence processing context where local map dictionaries store temporary and distance maps for different sequences.", + "type": "comment" + }, + "1502": { + "file_id": 132, + "content": " ) # bugs fixed.\n local_map_tmp_dic[seq_names[n]][\n frame_num[n],\n interaction_num - 1] = prev_frame_nn_features_n.squeeze(\n 0).detach() # bugs fixed.\n if interaction_num == 1:\n prev_frame_nn_features_n = local_map_tmp_dic[seq_names[n]][\n frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n else:\n if local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 1] > \\\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 2]:\n prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:765-781" + }, + "1503": { + "file_id": 132, + "content": "This code block appears to be part of a larger function. It seems to store and update the features of previous frames for a given sequence, based on the interaction number and frame number. If the current interaction's distance is greater than the previous one, it updates the previous frame features. The code uses dictionaries to store these features, with the frame number and interaction number as keys. 
The detach() function seems to remove the feature tensor from the computation graph for memory efficiency, while unsqueeze(0) reshapes the tensor to have a batch dimension.", + "type": "comment" + }, + "1504": { + "file_id": 132, + "content": " else:\n prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 2]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)\n to_cat_previous_frame = (\n float_(seq_previous_frame_label) == float_(ref_obj_ids)\n ) # float comparision?\n to_cat_current_frame_embedding = current_frame_embedding[\n n].unsqueeze(0).tile((ref_obj_ids.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_previous_frame = float_(\n to_cat_previous_frame.unsqueeze(-1).transpose([2, 3, 0, 1]))\n to_cat_prev_frame_nn_feature_n = prev_frame_nn_features_n.squeeze(\n 0).transpose([2, 3, 0, 1])\n to_cat = paddle.concat(", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:782-803" + }, + "1505": { + "file_id": 132, + "content": "This code snippet is part of a neural network model for video object detection. It deals with handling previous frame features, categorizing frames based on the reference object IDs, and concatenating different tensor inputs together. The code checks if the current frame's label matches the reference object ID, unsqueezes and tiles the tensors accordingly, transposes them, and finally concatenates these transformed tensors using `paddle.concat`.", + "type": "comment" + }, + "1506": { + "file_id": 132, + "content": " (to_cat_current_frame_embedding, to_cat_nn_feature_n,\n to_cat_prev_frame_nn_feature_n, to_cat_previous_frame), 1)\n pred_ = dynamic_seghead(to_cat)\n pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if global_map_tmp_dic is None:\n return dic_tmp\n else:\n if local_map_dics is None:\n return dic_tmp, global_map_tmp_dic\n else:\n return dic_tmp, global_map_tmp_dic, local_map_dics\n def int_seghead(self,\n ref_frame_embedding=None,\n ref_scribble_label=None,\n prev_round_label=None,\n normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n frame_num=None,", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:804-829" + }, + "1507": { + "file_id": 132, + "content": "This function, int_seghead, takes various inputs such as reference frame embedding, scribble label, previous round label etc. It normalizes nearest neighbor distances if specified and returns the dictionary temporary (dic_tmp) containing predicted results for each sequence, along with optional global map temporary dictionary (global_map_tmp_dic) and local map dictionaries (local_map_dics). 
The interaction number (interaction_num), frame number (frame_num) and list of sequence names (seq_names) are also used.", + "type": "comment" + }, + "1508": { + "file_id": 132, + "content": " first_inter=True):\n dic_tmp = {}\n bs, c, h, w = ref_frame_embedding.shape\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n if not first_inter:\n scale_prev_round_label = paddle.nn.functional.interpolate(\n float_(prev_round_label), size=(h, w), mode='nearest')\n scale_prev_round_label = int_(scale_prev_round_label)\n n_chunks = 500\n for n in range(bs):\n gt_id = paddle.arange(0, gt_ids[n] + 1)\n gt_id = int_(gt_id)\n seq_ref_frame_embedding = ref_frame_embedding[n]\n ########################Local dist map\n seq_ref_frame_embedding = paddle.transpose(seq_ref_frame_embedding,\n [1, 2, 0])\n seq_ref_scribble_label = paddle.transpose(\n scale_ref_scribble_label[n], [1, 2, 0])", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:830-853" + }, + "1509": { + "file_id": 132, + "content": "This code segment is part of the Ma-Net network in PaddleVideo. It interpolates the reference scribble label and previous round label images, assigns ground truth IDs, and performs local distance map calculations on a sequence of frames for a batch of videos.", + "type": "comment" + }, + "1510": { + "file_id": 132, + "content": " nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_ref_frame_embedding,\n query_embedding=seq_ref_frame_embedding,\n prev_frame_labels=seq_ref_scribble_label,\n gt_ids=gt_id,\n max_distance=cfg.MODEL_MAX_LOCAL_DISTANCE)\n #######\n ######################Global map update\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([104, 1, 1, 1, 1])\n nn_features_n_ = paddle.where(\n nn_features_n <=\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0),\n nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0))\n ###\n ###\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n_.detach()[0]\n ##################Local map update", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:854-877" + }, + "1511": { + "file_id": 132, + "content": "This code segment is updating the global and local maps for a given sequence of frames. It first calculates the nearest neighbor features (nn_features_n) using the previous frame embedding, query embedding, previous frame labels, and ground truth IDs. Then it checks if this current sequence name exists in the global map temporary dictionary (global_map_tmp_dic). If not, it initializes a one-tensor for that sequence and tiles it to match the shape of nn_features_n. It then applies a where statement to compare nn_features_n with the global map tensor, selecting either nn_features_n or the global map tensor depending on which is smaller. 
Finally, it updates the global map temporary dictionary entry for this sequence at the current frame number with the selected tensor from the where statement.", + "type": "comment" + }, + "1512": { + "file_id": 132, + "content": " if local_map_dics is not None:\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if seq_names[n] not in local_map_dist_dic:\n local_map_dist_dic[seq_names[n]] = paddle.zeros([104, 9])\n if seq_names[n] not in local_map_tmp_dic:\n local_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).unsqueeze(0).tile([104, 9, 1, 1, 1, 1])\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num -\n 1] = 0\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)\n ##################\n to_cat_current_frame_embedding = ref_frame_embedding[n].unsqueeze(\n 0).tile((gt_id.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_scribble_mask_to_cat = (\n float_(seq_ref_scribble_label) == float_(gt_id)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:878-897" + }, + "1513": { + "file_id": 132, + "content": "The code checks if a dictionary of local maps is provided. If so, it retrieves the temporary and distance dictionaries from it. It then updates these dictionaries for the current sequence name (seq_names[n]), adding a 0 to a specific element if the sequence name is not already in the distance dictionary. Finally, it creates embedding tensors for frame and feature comparison and prepares them for concatenation.", + "type": "comment" + }, + "1514": { + "file_id": 132, + "content": " ) # float comparision?\n to_cat_scribble_mask_to_cat = float_(\n to_cat_scribble_mask_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n if not first_inter:\n seq_prev_round_label = scale_prev_round_label[n].transpose(\n [1, 2, 0])\n to_cat_prev_round_to_cat = (\n float_(seq_prev_round_label) == float_(gt_id)\n ) # float comparision?\n to_cat_prev_round_to_cat = float_(\n to_cat_prev_round_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n else:\n to_cat_prev_round_to_cat = paddle.zeros_like(\n to_cat_scribble_mask_to_cat)\n to_cat_prev_round_to_cat[0] = 1.\n to_cat = paddle.concat(\n (to_cat_current_frame_embedding, to_cat_scribble_mask_to_cat,\n to_cat_prev_round_to_cat), 1)\n pred_ = self.inter_seghead(to_cat)", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:898-921" + }, + "1515": { + "file_id": 132, + "content": "This code is performing a series of operations on tensors to create the 'to_cat' tensor for use in the model. It checks if it's the first iteration and adjusts the previous round label accordingly. Then, it concatenates three different tensor inputs along the 1st axis (channel dimension) and passes the result through a segmentation head network to get the final prediction 'pred_'. This code seems to be part of a larger neural network for video object segmentation.", + "type": "comment" + }, + "1516": { + "file_id": 132, + "content": " pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if local_map_dics is None:\n return dic_tmp\n else:\n return dic_tmp, local_map_dics", + "type": "code", + "location": "/applications/Ma-Net/networks/IntVOS.py:922-927" + }, + "1517": { + "file_id": 132, + "content": "This code is transposing the tensor 'pred_' and storing it in 'dic_tmp' with corresponding sequence name as key. 
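Circling back to the mask construction summarized a few entries above: a hedged sketch of turning an integer label map into per-object binary masks and stacking them with a tiled embedding. The shapes and names below are toy values chosen only to make the example runnable, not the repository's tensors.

```python
import paddle

# Illustrative 4x4 integer label map and three object ids (0 = background).
label = paddle.to_tensor([[0, 0, 1, 1],
                          [0, 2, 2, 1],
                          [0, 2, 2, 0],
                          [0, 0, 0, 0]])
gt_ids = paddle.arange(0, 3)                      # objects 0, 1, 2

# Broadcast comparison: (H, W, 1) against (num_obj,) gives (H, W, num_obj) booleans,
# cast to float so they can be concatenated with feature maps.
masks = (label.unsqueeze(-1) == gt_ids).astype('float32')
print(masks.shape)                                # [4, 4, 3]

# Move objects to the batch axis and add a channel axis: (num_obj, 1, H, W),
# so each object's mask lines up with its tiled frame embedding.
masks = masks.transpose([2, 0, 1]).unsqueeze(1)
embedding = paddle.rand([1, 8, 4, 4]).tile([3, 1, 1, 1])   # toy per-object embedding
seghead_input = paddle.concat([embedding, masks], axis=1)  # (3, 9, 4, 4)
print(seghead_input.shape)
```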
It then checks if 'local\\_map\\_dics' is None, and returns 'dic\\_tmp' or returns both 'dic\\_tmp' and 'local\\_map\\_dics'.", + "type": "comment" + }, + "1518": { + "file_id": 133, + "content": "/applications/Ma-Net/networks/aspp.py", + "type": "filepath" + }, + "1519": { + "file_id": 133, + "content": "ASPPModule is a CNN layer for ASPP modules in Ma-Net, implementing atrous spatial pyramid pooling with Conv2D, BatchNorm, and ReLU activation. The class initializes instance parameters and sets dilations for ASPP modules using _ASPPModule class.", + "type": "summary" + }, + "1520": { + "file_id": 133, + "content": "import math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom utils.api import kaiming_normal_\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation,\n BatchNorm):\n super(_ASPPModule, self).__init__()\n self.atrous_conv = nn.Conv2D(inplanes,\n planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation,\n bias_attr=False)\n self.bn = BatchNorm(planes)\n self.relu = nn.ReLU(True)\n self._init_weight()\n def forward(self, x):\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):", + "type": "code", + "location": "/applications/Ma-Net/networks/aspp.py:1-34" + }, + "1521": { + "file_id": 133, + "content": "ASPPModule is a convolutional neural network layer that performs atrous spatial pyramid pooling. It consists of a Conv2D layer, BatchNorm layer, and ReLU activation function for feature extraction and normalization in a hierarchical manner. The weight initialization follows the Kaiming normal distribution.", + "type": "comment" + }, + "1522": { + "file_id": 133, + "content": " from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)\nclass ASPP(nn.Layer):\n def __init__(self, backbone, output_stride, BatchNorm):\n super(ASPP, self).__init__()\n if backbone == 'drn':\n inplanes = 512\n elif backbone == 'mobilenet':\n inplanes = 320\n else:\n inplanes = 2048\n if output_stride == 16:\n dilations = [1, 6, 12, 18]\n elif output_stride == 8:\n dilations = [1, 12, 24, 36]\n else:\n raise NotImplementedError\n self.aspp1 = _ASPPModule(inplanes,\n 256,\n 1,\n padding=0,\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.aspp2 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[1],", + "type": "code", + "location": "/applications/Ma-Net/networks/aspp.py:35-66" + }, + "1523": { + "file_id": 133, + "content": "The code defines a class \"ASPP\" that inherits from \"nn.Layer\". It initializes the instance with parameters such as backbone, output_stride, and BatchNorm. Depending on these inputs, it sets the dilations for the ASPP modules. 
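As a self-contained illustration of one such dilated branch (the layer names and channel sizes below are assumptions, not the repository's), note that for a 3x3 kernel, setting the padding equal to the dilation keeps the spatial size unchanged:

```python
import paddle
import paddle.nn as nn

class TinyAtrousBlock(nn.Layer):
    """Conv -> BN -> ReLU with a configurable dilation, mirroring _ASPPModule."""
    def __init__(self, in_ch, out_ch, dilation):
        super().__init__()
        # For a 3x3 kernel, padding == dilation preserves H x W.
        self.conv = nn.Conv2D(in_ch, out_ch, kernel_size=3,
                              padding=dilation, dilation=dilation,
                              bias_attr=False)
        self.bn = nn.BatchNorm2D(out_ch)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

x = paddle.rand([1, 64, 33, 33])
for d in (1, 6, 12, 18):                      # the output_stride=16 dilation set
    y = TinyAtrousBlock(64, 32, d)(x)
    print(d, y.shape)                         # spatial size stays 33x33
```

With output_stride 8 the loop above would use the (1, 12, 24, 36) set instead, matching the other branch of the ASPP constructor.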
These ASPP modules are instances of _ASPPModule class with specified input size, output size, kernel size, and dilation rate.", + "type": "comment" + }, + "1524": { + "file_id": 133, + "content": " dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.aspp3 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.aspp4 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False),\n BatchNorm(256), nn.ReLU())\n self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False)\n self.bn1 = BatchNorm(256)\n self.relu = nn.ReLU(True)\n self.dropout = nn.Dropout(0.1)", + "type": "code", + "location": "/applications/Ma-Net/networks/aspp.py:67-89" + }, + "1525": { + "file_id": 133, + "content": "This code defines three ASPP modules and a global average pooling layer for a neural network, with batch normalization and ReLU activations applied. The convolutional layers have specific dilations and padding values.", + "type": "comment" + }, + "1526": { + "file_id": 133, + "content": " self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x1, x2, x3, x4, x5), axis=1)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return x\n return self.dropout(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)\ndef build_aspp(backbone, output_stride, BatchNorm):\n return ASPP(backbone, output_stride, BatchNorm)", + "type": "code", + "location": "/applications/Ma-Net/networks/aspp.py:90-123" + }, + "1527": { + "file_id": 133, + "content": "The code defines a convolutional neural network (CNN) for the ASPP (Aggregated Spatial Pyramid Pooling) module in PaddleVideo's Ma-Net. 
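A hedged sketch of how this atrous-spatial-pyramid fusion comes together in the forward pass: several dilated branches plus an image-level branch that is pooled, projected, upsampled with `F.interpolate`, and concatenated before a 1x1 projection. All shapes below are illustrative assumptions.

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

# Toy branch outputs standing in for aspp1..aspp4 (all share the spatial size).
x = paddle.rand([1, 256, 33, 33])
branches = [x, x, x, x]

# Image-level branch: pool to 1x1, project, then upsample back before fusing.
gap = nn.Sequential(nn.AdaptiveAvgPool2D((1, 1)),
                    nn.Conv2D(256, 256, 1, bias_attr=False),
                    nn.BatchNorm2D(256), nn.ReLU())
g = gap(x)                                                 # (1, 256, 1, 1)
g = F.interpolate(g, size=x.shape[2:], mode='bilinear',
                  align_corners=True)                      # back to (1, 256, 33, 33)

fused = paddle.concat(branches + [g], axis=1)              # (1, 1280, 33, 33)
fused = nn.Conv2D(1280, 256, 1, bias_attr=False)(fused)    # 1x1 projection, as in ASPP
print(fused.shape)                                         # [1, 256, 33, 33]
```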
It includes an initialization function, forward pass computation, and a builder function to create the ASPP module with specified parameters.", + "type": "comment" + }, + "1528": { + "file_id": 134, + "content": "/applications/Ma-Net/networks/backbone/__init__.py", + "type": "filepath" + }, + "1529": { + "file_id": 134, + "content": "This function builds a backbone network for the specified model (resnet, xception, drn, or mobilenet) with the given output stride and BatchNorm implementation.", + "type": "summary" + }, + "1530": { + "file_id": 134, + "content": "from networks.backbone import resnet, xception, drn, mobilenet\ndef build_backbone(backbone, output_stride, BatchNorm):\n if backbone == 'resnet':\n return resnet.ResNet101(output_stride, BatchNorm)\n elif backbone == 'xception':\n return xception.AlignedXception(output_stride, BatchNorm)\n elif backbone == 'drn':\n return drn.drn_d_54(BatchNorm)\n elif backbone == 'mobilenet':\n return mobilenet.MobileNetV2(output_stride, BatchNorm)\n else:\n raise NotImplementedError", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/__init__.py:1-14" + }, + "1531": { + "file_id": 134, + "content": "This function builds a backbone network for the specified model (resnet, xception, drn, or mobilenet) with the given output stride and BatchNorm implementation.", + "type": "comment" + }, + "1532": { + "file_id": 135, + "content": "/applications/Ma-Net/networks/backbone/drn.py", + "type": "filepath" + }, + "1533": { + "file_id": 135, + "content": "The code defines the Deep Residual Network (DRN) model and MA-Net architecture in PaddlePaddle, with various configurations and optional pre-trained weights. It also includes low-level feature retention through processing inputs and can be tested using examples.", + "type": "summary" + }, + "1534": { + "file_id": 135, + "content": "import paddle.nn as nn\nimport math\nwebroot = 'https://tigress-web.princeton.edu/~fy/drn/models/'\nmodel_urls = {\n 'resnet50': 'https://download.pypaddle.org/models/resnet50-19c8e357.pth',\n 'drn-c-26': webroot + 'drn_c_26-ddedf421.pth',\n 'drn-c-42': webroot + 'drn_c_42-9d336e8c.pth',\n 'drn-c-58': webroot + 'drn_c_58-0a53a92c.pth',\n 'drn-d-22': webroot + 'drn_d_22-4bd2f8ea.pth',\n 'drn-d-38': webroot + 'drn_d_38-eebb45f0.pth',\n 'drn-d-54': webroot + 'drn_d_54-0e0534ff.pth',\n 'drn-d-105': webroot + 'drn_d_105-12b40979.pth'\n}\ndef conv3x3(in_planes, out_planes, stride=1, padding=1, dilation=1):\n return nn.Conv2D(in_planes, out_planes, kernel_size=3, stride=stride,\n padding=padding, bias_attr=False, dilation=dilation)\nclass BasicBlock(nn.Layer):\n expansion = 1\n def __init__(self, inplanes, planes, stride=1, downsample=None,\n dilation=(1, 1), residual=True, BatchNorm=None):\n super(BasicBlock, self).__init__()\n self.conv1 = conv3x3(inplanes, planes, stride,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:1-29" + }, + "1535": { + "file_id": 135, + "content": "This code defines a class BasicBlock, which is an extension of the nn.Layer class in PaddlePaddle's library. It contains a convolution layer with 3x3 kernel size and optional downsampling using a stride greater than 1. The BasicBlock has an expansion parameter set to 1, indicating no change in the input and output channel dimensions. 
There are pre-trained models available for download from the specified URLs.", + "type": "comment" + }, + "1536": { + "file_id": 135, + "content": " padding=dilation[0], dilation=dilation[0])\n self.bn1 = BatchNorm(planes)\n self.relu = nn.ReLU()\n self.conv2 = conv3x3(planes, planes,\n padding=dilation[1], dilation=dilation[1])\n self.bn2 = BatchNorm(planes)\n self.downsample = downsample\n self.stride = stride\n self.residual = residual\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n if self.residual:\n out += residual\n out = self.relu(out)\n return out\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self, inplanes, planes, stride=1, downsample=None,\n dilation=(1, 1), residual=True, BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:30-65" + }, + "1537": { + "file_id": 135, + "content": "This code defines a residual block with BatchNormalization and ReLU activation, using convolutions and optional downsampling. The Bottleneck class also includes a 1x1 convolution and has an expansion factor of 4.", + "type": "comment" + }, + "1538": { + "file_id": 135, + "content": " self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes, planes, kernel_size=3, stride=stride,\n padding=dilation[1], bias_attr=False,\n dilation=dilation[1])\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes, planes * 4, kernel_size=1, bias_attr=False)\n self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass DRN(nn.Layer):\n def __init__(self, block, layers, arch='D',\n channels=(16, 32, 64, 128, 256, 512, 512, 512),", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:66-103" + }, + "1539": { + "file_id": 135, + "content": "This code defines a DRN (Deep Residual Network) model with residual blocks. It includes batch normalization, convolutional layers, and ReLU activation functions. 
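A compact, self-contained version of such a residual block (names and channel counts are illustrative assumptions), showing the shortcut that is projected with a 1x1 convolution only when the shape changes:

```python
import paddle
import paddle.nn as nn

class TinyBasicBlock(nn.Layer):
    """conv-bn-relu-conv-bn plus an identity (or 1x1-projected) shortcut."""
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2D(in_ch, out_ch, 3, stride=stride, padding=1,
                               bias_attr=False)
        self.bn1 = nn.BatchNorm2D(out_ch)
        self.conv2 = nn.Conv2D(out_ch, out_ch, 3, padding=1, bias_attr=False)
        self.bn2 = nn.BatchNorm2D(out_ch)
        self.relu = nn.ReLU()
        # Project the shortcut only when the shape changes, as in drn.py / resnet.py.
        self.down = None
        if stride != 1 or in_ch != out_ch:
            self.down = nn.Sequential(
                nn.Conv2D(in_ch, out_ch, 1, stride=stride, bias_attr=False),
                nn.BatchNorm2D(out_ch))

    def forward(self, x):
        identity = x if self.down is None else self.down(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + identity)

y = TinyBasicBlock(16, 32, stride=2)(paddle.rand([1, 16, 56, 56]))
print(y.shape)   # [1, 32, 28, 28]
```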
The forward method applies the layers sequentially and performs residual connections if necessary.", + "type": "comment" + }, + "1540": { + "file_id": 135, + "content": " BatchNorm=None):\n super(DRN, self).__init__()\n self.inplanes = channels[0]\n self.out_dim = channels[-1]\n self.arch = arch\n if arch == 'C':\n self.conv1 = nn.Conv2D(3, channels[0], kernel_size=7, stride=1,\n padding=3, bias_attr=False)\n self.bn1 = BatchNorm(channels[0])\n self.relu = nn.ReLU()\n self.layer1 = self._make_layer(\n BasicBlock, channels[0], layers[0], stride=1, BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(\n BasicBlock, channels[1], layers[1], stride=2, BatchNorm=BatchNorm)\n elif arch == 'D':\n self.layer0 = nn.Sequential(\n nn.Conv2D(3, channels[0], kernel_size=7, stride=1, padding=3,\n bias_attr=False),\n BatchNorm(channels[0]),\n nn.ReLU()\n )\n self.layer1 = self._make_conv_layers(\n channels[0], layers[0], stride=1, BatchNorm=BatchNorm)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:104-130" + }, + "1541": { + "file_id": 135, + "content": "This code defines a DRN class that inherits from an unknown base class. It initializes the object with specified number of channels, layers, and architecture type ('C' or 'D'). The constructor creates different layers depending on the architecture: for 'C', it includes convolutional and pooling layers with BatchNorm and ReLU activation; for 'D', it only includes a convolutional layer followed by BatchNorm and ReLU activation, then adds more convolutional layers.", + "type": "comment" + }, + "1542": { + "file_id": 135, + "content": " self.layer2 = self._make_conv_layers(\n channels[1], layers[1], stride=2, BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block, channels[2], layers[2], stride=2, BatchNorm=BatchNorm)\n self.layer4 = self._make_layer(block, channels[3], layers[3], stride=2, BatchNorm=BatchNorm)\n self.layer5 = self._make_layer(block, channels[4], layers[4],\n dilation=2, new_level=False, BatchNorm=BatchNorm)\n self.layer6 = None if layers[5] == 0 else \\\n self._make_layer(block, channels[5], layers[5], dilation=4,\n new_level=False, BatchNorm=BatchNorm)\n if arch == 'C':\n self.layer7 = None if layers[6] == 0 else \\\n self._make_layer(BasicBlock, channels[6], layers[6], dilation=2,\n new_level=False, residual=False, BatchNorm=BatchNorm)\n self.layer8 = None if layers[7] == 0 else \\\n self._make_layer(BasicBlock, channels[7], layers[7], dilation=1,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:131-147" + }, + "1543": { + "file_id": 135, + "content": "The code defines a network architecture with six potential layers (2-6) using the provided block, and two additional layers (7 & 8) if the architecture is 'C'. Each layer has a specific number of channels, layers, and dilation rate. The last three layers can be set to None if their corresponding number of layers is 0. 
Batch Normalization is applied to each layer.", + "type": "comment" + }, + "1544": { + "file_id": 135, + "content": " new_level=False, residual=False, BatchNorm=BatchNorm)\n elif arch == 'D':\n self.layer7 = None if layers[6] == 0 else \\\n self._make_conv_layers(channels[6], layers[6], dilation=2, BatchNorm=BatchNorm)\n self.layer8 = None if layers[7] == 0 else \\\n self._make_conv_layers(channels[7], layers[7], dilation=1, BatchNorm=BatchNorm)\n self._init_weight()\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n m.weight.normal_(0, math.sqrt(2. / n))\n elif isinstance(m, nn.BatchNorm2D):\n from manet_paddle.utils.api import fill_\n fill_(m.weight, 1)\n from manet_paddle.utils.api import zero_\n zero_(m.bias)\n def _make_layer(self, block, planes, blocks, stride=1, dilation=1,\n new_level=True, residual=True, BatchNorm=None):", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:148-170" + }, + "1545": { + "file_id": 135, + "content": "This code defines a network backbone for the MA-Net model in PaddleVideo. It includes layers 1 to 8 with optional activation, residual connections, and batch normalization. The `_init_weight` function initializes weights for convolutional and batch normalization layers, while `_make_layer` creates each layer of the backbone based on the specified parameters.", + "type": "comment" + }, + "1546": { + "file_id": 135, + "content": " assert dilation == 1 or dilation % 2 == 0\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes, planes * block.expansion,\n kernel_size=1, stride=stride, bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = list()\n layers.append(block(\n self.inplanes, planes, stride, downsample,\n dilation=(1, 1) if dilation == 1 else (\n dilation // 2 if new_level else dilation, dilation),\n residual=residual, BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(block(self.inplanes, planes, residual=residual,\n dilation=(dilation, dilation), BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_conv_layers(self, channels, convs, stride=1, dilation=1, BatchNorm=None):", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:171-193" + }, + "1547": { + "file_id": 135, + "content": "This code is creating a network layer with multiple blocks. It checks the stride and dilation to determine if downsampling is required, then constructs a Sequential module of convolutional layers using the provided number of blocks, channels, and convolutions. 
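The stacking pattern can be sketched in a few lines; the stand-in block below is an assumption used only to make the example runnable, not the repository's BasicBlock or Bottleneck:

```python
import paddle
import paddle.nn as nn

class ConvBlock(nn.Layer):
    """Stand-in block: conv-bn-relu (a real ResNet/DRN block would add a shortcut)."""
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2D(in_ch, out_ch, 3, stride=stride, padding=1, bias_attr=False),
            nn.BatchNorm2D(out_ch), nn.ReLU())

    def forward(self, x):
        return self.body(x)

def make_layer(block, in_ch, out_ch, num_blocks, stride=1):
    # Only the first block in a stage changes stride and channel count,
    # mirroring how _make_layer builds an nn.Sequential stage.
    layers = [block(in_ch, out_ch, stride=stride)]
    layers += [block(out_ch, out_ch) for _ in range(num_blocks - 1)]
    return nn.Sequential(*layers)

stage = make_layer(ConvBlock, 32, 64, num_blocks=3, stride=2)
print(stage(paddle.rand([1, 32, 28, 28])).shape)   # [1, 64, 14, 14]
```

Only the first block receives the stage's stride, which is why the quoted `_make_layer` passes `stride` to the first block and defaults for the rest.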
The BatchNorm function is an optional parameter.", + "type": "comment" + }, + "1548": { + "file_id": 135, + "content": " modules = []\n for i in range(convs):\n modules.extend([\n nn.Conv2D(self.inplanes, channels, kernel_size=3,\n stride=stride if i == 0 else 1,\n padding=dilation, bias_attr=False, dilation=dilation),\n BatchNorm(channels),\n nn.ReLU()])\n self.inplanes = channels\n return nn.Sequential(*modules)\n def forward(self, x):\n if self.arch == 'C':\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n elif self.arch == 'D':\n x = self.layer0(x)\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n low_level_feat = x\n x = self.layer4(x)\n x = self.layer5(x)\n if self.layer6 is not None:\n x = self.layer6(x)\n if self.layer7 is not None:\n x = self.layer7(x)\n if self.layer8 is not None:\n x = self.layer8(x)\n return x, low_level_feat\nclass DRN_A(nn.Layer):", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:194-234" + }, + "1549": { + "file_id": 135, + "content": "The code defines a DRN (Deep Residual Network) backbone with multiple layers. It first creates a list of modules containing convolutional layers, batch normalization, and ReLU activation. The `forward` function handles different architectures ('C' or 'D') and processes input through various layers while retaining low-level features. The DRN_A class extends the functionality with more layers.", + "type": "comment" + }, + "1550": { + "file_id": 135, + "content": " def __init__(self, block, layers, BatchNorm=None):\n self.inplanes = 64\n super(DRN_A, self).__init__()\n self.out_dim = 512 * block.expansion\n self.conv1 = nn.Conv2D(3, 64, kernel_size=7, stride=2, padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block, 64, layers[0], BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block, 128, layers[1], stride=2, BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block, 256, layers[2], stride=1,\n dilation=2, BatchNorm=BatchNorm)\n self.layer4 = self._make_layer(block, 512, layers[3], stride=1,\n dilation=4, BatchNorm=BatchNorm)\n self._init_weight()\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:236-257" + }, + "1551": { + "file_id": 135, + "content": "The code defines a DRN_A class that is a type of backbone network. It has an __init__ method initializing parameters, and includes a Conv2D layer, BatchNorm layer, ReLU activation, MaxPool2D layer, and several _make_layer methods for creating different layers with varying dimensions and strides. The _init_weight method is used to initialize the weights of the convolution layers.", + "type": "comment" + }, + "1552": { + "file_id": 135, + "content": " m.weight.normal_(0, math.sqrt(2. 
/ n))\n elif isinstance(m, nn.BatchNorm2D):\n from manet_paddle.utils.api import fill_\n fill_(m.weight, 1)\n from manet_paddle.utils.api import zero_\n zero_(m.bias)\n def _make_layer(self, block, planes, blocks, stride=1, dilation=1, BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes, planes * block.expansion,\n kernel_size=1, stride=stride, bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(block(self.inplanes, planes, stride, downsample, BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(block(self.inplanes, planes,\n dilation=(dilation, dilation, ), BatchNorm=BatchNorm))", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:258-279" + }, + "1553": { + "file_id": 135, + "content": "The code defines a function _make_layer that creates layers of a specified block with the given number of blocks, planes, and stride. It also handles downsampling if needed and initializes the weights for BatchNorm2D layers.", + "type": "comment" + }, + "1554": { + "file_id": 135, + "content": " return nn.Sequential(*layers)\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x\ndef drn_a_50(BatchNorm, pretrained=True):\n model = DRN_A(Bottleneck, [3, 4, 6, 3], BatchNorm=BatchNorm)\n if pretrained:\n import paddlehub as hub\n model.set_state_dict(hub.Module(name=\"resnet50_vd_animals\"))\n return model\ndef drn_c_26(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='C', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-c-26'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_c_42(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-c-42'])", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:281-318" + }, + "1555": { + "file_id": 135, + "content": "This code defines three functions: drn_a_50, drn_c_26, and drn_c_42. Each function takes a BatchNorm argument and an optional pretrained flag. The functions return different types of DRN models based on the input arguments. 
If pretrained is True, the code sets the model's state dictionary to a pre-trained model's weights.", + "type": "comment" + }, + "1556": { + "file_id": 135, + "content": " del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_c_58(BatchNorm, pretrained=True):\n model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-c-58'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_22(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-22'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_24(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 2, 2], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-24'])\n del pretrained['fc.weight']", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:319-349" + }, + "1557": { + "file_id": 135, + "content": "Code defines functions for initializing DRN models with different architectures (C, D) and sizes (58, 22, 24). If `pretrained` is True, it loads pre-trained model weights from a URL and removes the last fully connected layer's weight and bias before setting the state dictionary of the model. This allows for custom downstream tasks.", + "type": "comment" + }, + "1558": { + "file_id": 135, + "content": " del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_38(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-38'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_40(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-40'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_54(BatchNorm, pretrained=True):\n model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-54'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:350-380" + }, + "1559": { + "file_id": 135, + "content": "The code defines three functions, drn_d_38, drn_d_40, and drn_d_54, which return instances of the DRN model with different configurations and optional pre-trained weights. 
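That delete-then-load pattern is easy to reproduce on a toy model. Everything below (TinyNet and its layer names) is invented for illustration; in practice the checkpoint dict would come from `paddle.load` on a saved `.pdparams` file rather than from a freshly built model.

```python
import paddle
import paddle.nn as nn

class TinyNet(nn.Layer):
    def __init__(self, num_classes=10):
        super().__init__()
        self.conv = nn.Conv2D(3, 8, 3)
        self.fc = nn.Linear(8, num_classes)

    def forward(self, x):
        return self.fc(self.conv(x).mean(axis=[2, 3]))

# Stand-in for a loaded checkpoint: reuse a freshly built model's state_dict.
pretrained = TinyNet(num_classes=10).state_dict()

# Drop the classifier weights so a model with a different head can load the rest,
# the same idea as del pretrained['fc.weight'] / del pretrained['fc.bias'] above.
pretrained.pop('fc.weight', None)
pretrained.pop('fc.bias', None)

model = TinyNet(num_classes=5)      # different head size
model.set_state_dict(pretrained)    # backbone weights load; fc keeps its fresh init
```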
If pre-trained weights are specified, it loads them from a URL and deletes 'fc.weight' and 'fc.bias' keys before setting the state dictionary of the model.", + "type": "comment" + }, + "1560": { + "file_id": 135, + "content": " model.set_state_dict(pretrained)\n return model\ndef drn_d_105(BatchNorm, pretrained=True):\n model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-105'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\nif __name__ == \"__main__\":\n import paddle\n model = drn_a_50(BatchNorm=nn.BatchNorm2D, pretrained=True)\n input = paddle.rand([1, 3, 512, 512])\n output, low_level_feat = model(input)\n print(output.shape)\n print(low_level_feat.shape)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/drn.py:381-400" + }, + "1561": { + "file_id": 135, + "content": "This code defines a function 'drn_d_105' that returns an instance of the DRN model with specified parameters. It also loads pre-trained weights for the model if 'pretrained' flag is set to True. The example usage at the end creates and tests an instance of the DRN model with specific parameters, using PaddlePaddle library.", + "type": "comment" + }, + "1562": { + "file_id": 136, + "content": "/applications/Ma-Net/networks/backbone/mobilenet.py", + "type": "filepath" + }, + "1563": { + "file_id": 136, + "content": "The code defines a MobileNetV2 model with InvertedResidual blocks for Ma-Net application, initializing the backbone network and preparing it for forward propagation while applying Kaiming normal initialization to certain layers.", + "type": "summary" + }, + "1564": { + "file_id": 136, + "content": "import paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nimport math\nfrom utils.api import kaiming_normal_\ndef conv_bn(inp, oup, stride, BatchNorm):\n return nn.Sequential(nn.Conv2D(inp, oup, 3, stride, 1, bias_attr=False),\n BatchNorm(oup), nn.ReLU6())\ndef fixed_padding(inputs, kernel_size, dilation):\n kernel_size_effective = kernel_size + (kernel_size - 1) * (dilation - 1)\n pad_total = kernel_size_effective - 1\n pad_beg = pad_total // 2\n pad_end = pad_total - pad_beg\n padded_inputs = F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end))\n return padded_inputs\nclass InvertedResidual(nn.Layer):\n def __init__(self, inp, oup, stride, dilation, expand_ratio, BatchNorm):\n super(InvertedResidual, self).__init__()\n self.stride = stride\n assert stride in [1, 2]\n hidden_dim = round(inp * expand_ratio)\n self.use_res_connect = self.stride == 1 and inp == oup\n self.kernel_size = 3\n self.dilation = dilation\n if expand_ratio == 1:", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/mobilenet.py:1-33" + }, + "1565": { + "file_id": 136, + "content": "This code defines a network layer for MobileNet, including convolution-batch normalization-ReLU6 operations and an inverted residual block. It utilizes padding and dilation techniques to increase the effective receptive field size of the convolutions. 
The InvertedResidual class handles stride, dilation, and expand_ratio parameters for the network layer.", + "type": "comment" + }, + "1566": { + "file_id": 136, + "content": " self.conv = nn.Sequential(\n # dw\n nn.Conv2D(hidden_dim,\n hidden_dim,\n 3,\n stride,\n 0,\n dilation,\n groups=hidden_dim,\n bias_attr=False),\n BatchNorm(hidden_dim),\n nn.ReLU6(),\n # pw-linear\n nn.Conv2D(hidden_dim, oup, 1, 1, 0, 1, 1, bias_attr=False),\n BatchNorm(oup),\n )\n else:\n self.conv = nn.Sequential(\n # pw\n nn.Conv2D(inp, hidden_dim, 1, 1, 0, 1, bias_attr=False),\n BatchNorm(hidden_dim),\n nn.ReLU6(),\n # dw\n nn.Conv2D(hidden_dim,\n hidden_dim,\n 3,\n stride,\n 0,\n dilation,\n groups=hidden_dim,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/mobilenet.py:34-63" + }, + "1567": { + "file_id": 136, + "content": "This code defines a convolutional neural network layer for MobileNet backbone. It includes parameters such as input and output channels (inp, oup), hidden dimension (hidden_dim), stride, dilation, and whether to use pointwise (pw) or depthwise (dw) convolution. The layer is created using nn.Sequential module and includes BatchNorm and ReLU6 activation functions.", + "type": "comment" + }, + "1568": { + "file_id": 136, + "content": " bias_attr=False),\n BatchNorm(hidden_dim),\n nn.ReLU6(),\n # pw-linear\n nn.Conv2D(hidden_dim, oup, 1, 1, 0, 1, bias_attr=False),\n BatchNorm(oup),\n )\n def forward(self, x):\n x_pad = fixed_padding(x, self.kernel_size, dilation=self.dilation)\n if self.use_res_connect:\n x = x + self.conv(x_pad)\n else:\n x = self.conv(x_pad)\n return x\nclass MobileNetV2(nn.Layer):\n def __init__(self,\n output_stride=8,\n BatchNorm=None,\n width_mult=1.,\n pretrained=True):\n super(MobileNetV2, self).__init__()\n block = InvertedResidual\n input_channel = 32\n current_stride = 1\n rate = 1\n interverted_residual_setting = [\n # t, c, n, s\n [1, 16, 1, 1],\n [6, 24, 2, 2],\n [6, 32, 3, 2],\n [6, 64, 4, 2],\n [6, 96, 3, 1],\n [6, 160, 3, 2],", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/mobilenet.py:64-99" + }, + "1569": { + "file_id": 136, + "content": "This code defines a MobileNetV2 model with InvertedResidual blocks, including convolutional layers, batch normalization, ReLU activation, and optional residual connection. The model takes in an input image of size 3xHxW and outputs a feature map of size oup x (H/stride) x (W/stride). It also supports variable width multiplier to adjust the number of channels for each block.", + "type": "comment" + }, + "1570": { + "file_id": 136, + "content": " [6, 320, 1, 1],\n ]\n # building first layer\n input_channel = int(input_channel * width_mult)\n self.features = [conv_bn(3, input_channel, 2, BatchNorm)]\n current_stride *= 2\n # building inverted residual blocks\n for t, c, n, s in interverted_residual_setting:\n if current_stride == output_stride:\n stride = 1\n dilation = rate\n rate *= s\n else:\n stride = s\n dilation = 1\n current_stride *= s\n output_channel = int(c * width_mult)\n for i in range(n):\n if i == 0:\n self.features.append(\n block(input_channel, output_channel, stride, dilation,\n t, BatchNorm))\n else:\n self.features.append(\n block(input_channel, output_channel, 1, dilation, t,\n BatchNorm))\n input_channel = output_channel", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/mobilenet.py:100-127" + }, + "1571": { + "file_id": 136, + "content": "This code builds the MobileNet backbone for Ma-Net application. 
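The fixed padding described in the entries above keeps dilated depthwise convolutions size-preserving. A small sketch with made-up shapes (not the repository's feature maps):

```python
import paddle
import paddle.nn.functional as F

def same_pad(x, kernel_size, dilation):
    # The effective kernel grows with dilation; pad so the output keeps H x W,
    # which is what fixed_padding() computes in mobilenet.py / xception.py.
    k_eff = kernel_size + (kernel_size - 1) * (dilation - 1)
    pad_total = k_eff - 1
    beg, end = pad_total // 2, pad_total - pad_total // 2
    return F.pad(x, [beg, end, beg, end])           # (left, right, top, bottom) on NCHW

x = paddle.rand([1, 32, 17, 17])
for d in (1, 2, 4):
    padded = same_pad(x, kernel_size=3, dilation=d)
    # A valid (padding=0) 3x3 conv with this dilation now preserves 17x17.
    w = paddle.rand([32, 1, 3, 3])
    y = F.conv2d(padded, w, dilation=d, groups=32)  # depthwise, as in SeparableConv2d
    print(d, y.shape)                               # [1, 32, 17, 17] each time
```

Because the padding is applied explicitly, the convolution itself can keep padding=0, which is how the depthwise convolutions in these backbones are written.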
It initializes the first layer with a specific input channel and then iterates through inverted residual blocks, adjusting stride and dilation rate accordingly. The block function is used to build each block, and input channels are updated accordingly.", + "type": "comment" + }, + "1572": { + "file_id": 136, + "content": " self.features = nn.Sequential(*self.features)\n self._initialize_weights()\n if pretrained:\n self._load_pretrained_model()\n self.low_level_features = self.features[0:4]\n self.high_level_features = self.features[4:]\n def forward(self, x):\n low_level_feat = self.low_level_features(x)\n x = self.high_level_features(low_level_feat)\n return x, low_level_feat\n def _load_pretrained_model(self):\n import paddlehub as hub\n pretrain_dict = hub.Module(name=\"mobilenet_v2_imagenet\")\n model_dict = {}\n state_dict = self.state_dict()\n for k, v in pretrain_dict.items():\n if k in state_dict:\n model_dict[k] = v\n state_dict.update(model_dict)\n self.set_state_dict(state_dict)\n def _initialize_weights(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n # n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n # m.weight.normal_(0, math.sqrt(2. / n))", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/mobilenet.py:128-157" + }, + "1573": { + "file_id": 136, + "content": "Initializes and prepares the MobileNet backbone network for forward propagation. If pretrained model is specified, loads the pretrained weights from PaddleHub's MobileNet_v2_imagenet. Otherwise, initializes the weights according to the provided configuration. The forward function extracts low-level and high-level features by passing the input through separate subsections of the feature extraction network.", + "type": "comment" + }, + "1574": { + "file_id": 136, + "content": " kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/mobilenet.py:158-163" + }, + "1575": { + "file_id": 136, + "content": "Code applies Kaiming normal initialization to certain layers (m.weight) and performs batch normalization by filling layer weights with 1 and setting bias to zero for nn.BatchNorm2D instances.", + "type": "comment" + }, + "1576": { + "file_id": 137, + "content": "/applications/Ma-Net/networks/backbone/resnet.py", + "type": "filepath" + }, + "1577": { + "file_id": 137, + "content": "This code defines a ResNet architecture with batch normalization and ReLU activation functions, featuring output strides of 16 or 8, multiple blocks, residual connections through convolutional layers, a residual block for ResNet-101, and optional pretrained model loading on ImageNet.", + "type": "summary" + }, + "1578": { + "file_id": 137, + "content": "import math\nimport paddle.nn as nn\n# from reprod_log.utils import paddle2np\nimport paddle\nfrom utils.api import normal_, fill_, zero_\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self,\n inplanes,\n planes,\n stride=1,\n dilation=1,\n downsample=None,\n BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)\n self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes,\n planes * 4,\n kernel_size=1,\n 
bias_attr=False)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:1-33" + }, + "1579": { + "file_id": 137, + "content": "This code defines a Bottleneck class for ResNet backbone, which contains three 2D convolutional layers and two batch normalization layers. It has an expansion factor of 4. The convolutional layers have configurable parameters such as inplanes, planes, stride, dilation, and downsample.", + "type": "comment" + }, + "1580": { + "file_id": 137, + "content": " self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNet(nn.Layer):\n def __init__(self,\n block,\n layers,\n output_stride,\n BatchNorm,\n pretrained=False):\n self.inplanes = 64\n super(ResNet, self).__init__()\n blocks = [1, 2, 4]\n if output_stride == 16:\n strides = [1, 2, 2, 1]\n dilations = [1, 1, 1, 2]\n elif output_stride == 8:\n strides = [1, 2, 1, 1]", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:34-77" + }, + "1581": { + "file_id": 137, + "content": "This code defines a ResNet architecture with BatchNorm, ReLU activation functions, and downsample layers. It allows for different output strides (16 or 8) and has multiple blocks (1, 2, 4). The forward function performs residual connections and applies the appropriate number of convolutional layers based on block specifications.", + "type": "comment" + }, + "1582": { + "file_id": 137, + "content": " dilations = [1, 1, 2, 4]\n else:\n raise NotImplementedError\n # Modules\n self.conv1 = nn.Conv2D(3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block,\n 64,\n layers[0],\n stride=strides[0],\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block,\n 128,\n layers[1],\n stride=strides[1],\n dilation=dilations[1],", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:78-103" + }, + "1583": { + "file_id": 137, + "content": "This code is initializing a ResNet backbone. It defines convolutional layers, batch normalization, and pooling layers followed by multiple residual blocks. Dilation rates are implemented for the blocks. 
If an unsupported option is chosen, it raises a NotImplementedError.", + "type": "comment" + }, + "1584": { + "file_id": 137, + "content": " BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block,\n 256,\n layers[2],\n stride=strides[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.layer4 = self._make_MG_unit(block,\n 512,\n blocks=blocks,\n stride=strides[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n # self.layer4 = self._make_layer(block, 512, layers[3], stride=strides[3], dilation=dilations[3], BatchNorm=BatchNorm)\n self._init_weight()\n if pretrained:\n self._load_pretrained_model()\n def _make_layer(self,\n block,\n planes,\n blocks,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:104-126" + }, + "1585": { + "file_id": 137, + "content": "This code defines a ResNet network with multiple layers and blocks, using BatchNormalization for normalization. It also includes an optional pretrained model loading functionality.", + "type": "comment" + }, + "1586": { + "file_id": 137, + "content": " stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, stride, dilation, downsample,\n BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(self.inplanes,\n planes,\n dilation=dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_MG_unit(self,\n block,\n planes,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:127-157" + }, + "1587": { + "file_id": 137, + "content": "This code defines a function to create a residual block for a ResNet network with specific parameters such as number of blocks, stride, dilation rate, and BatchNorm layer. It returns a Sequential model containing the block layers.", + "type": "comment" + }, + "1588": { + "file_id": 137, + "content": " blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes,\n planes,\n stride,\n dilation=blocks[0] * dilation,\n downsample=downsample,\n BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, len(blocks)):\n layers.append(\n block(self.inplanes,\n planes,\n stride=1,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:158-186" + }, + "1589": { + "file_id": 137, + "content": "This code defines a residual block for ResNet using the input planes, number of blocks, and other parameters. 
It creates a downsampling layer if necessary and then appends multiple instances of the given block to form the final residual block.", + "type": "comment" + }, + "1590": { + "file_id": 137, + "content": " dilation=blocks[i] * dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def forward(self, input):\n # print('input:', input.mean().item())\n x = self.conv1(input)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)\n low_level_feat = x\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x, low_level_feat\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n fill_(m.weight, 1)\n # normal_(m.weight, 0, math.sqrt(2. / n))\n elif isinstance(m, nn.BatchNorm2D):\n fill_(m.weight, 1)\n zero_(m.bias)\n return self.sublayers()\n def _load_pretrained_model(self):\n # TODO\n pretrain_dict = paddle.load(\n '/home/lc/manet/manet_paddle/model_best.pdparams.tar')", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:187-220" + }, + "1591": { + "file_id": 137, + "content": "The code defines a ResNet network with multiple layers, including convolution, batch normalization, and pooling. The forward function performs inference by passing the input through each layer sequentially. The _init_weight function initializes the weights of the network using either Xavier or Gaussian distribution, depending on the type of the layer. The _load_pretrained_model function loads a pre-trained model from a specified file path, but it is currently empty and marked as TODO.", + "type": "comment" + }, + "1592": { + "file_id": 137, + "content": " model_dict = {}\n state_dict = self.state_dict()\n for k, v in pretrain_dict.items():\n if k in state_dict:\n model_dict[k] = v\n state_dict.update(model_dict)\n self.set_state_dict(state_dict)\ndef ResNet101(output_stride, BatchNorm, pretrained=False):\n \"\"\"Constructs a ResNet-101 model.\n Args:\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n \"\"\"\n model = ResNet(Bottleneck, [3, 4, 23, 3],\n output_stride,\n BatchNorm,\n pretrained=pretrained)\n return model", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/resnet.py:221-239" + }, + "1593": { + "file_id": 137, + "content": "This code defines a ResNet-101 model function that takes output stride, BatchNorm flag, and pretrained option as arguments. It creates a ResNet model with Bottleneck blocks, layers, output stride, and BatchNorm implementation. If pretrained is set to True, the function returns a pre-trained model on ImageNet. 
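The pretrained-weight handling summarized here and in the next sentence boils down to a filter-update-load sequence. A toy sketch, where the model, its layer names, and the extra checkpoint key are all invented for illustration:

```python
import paddle
import paddle.nn as nn

model = nn.Sequential(nn.Conv2D(3, 8, 3), nn.BatchNorm2D(8))
state_dict = model.state_dict()

# Pretend pretrain_dict came from a checkpoint with extra or mismatched entries.
pretrain_dict = dict(state_dict)
pretrain_dict['classifier.weight'] = paddle.zeros([10, 8])   # key the model lacks

# Keep only keys the current model actually has, then load: the same
# filter -> update -> set_state_dict pattern used in resnet.py and mobilenet.py.
filtered = {k: v for k, v in pretrain_dict.items() if k in state_dict}
state_dict.update(filtered)
model.set_state_dict(state_dict)
```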
The code also updates the model's state dictionary by merging pretrain_dict into the state dict.", + "type": "comment" + }, + "1594": { + "file_id": 138, + "content": "/applications/Ma-Net/networks/backbone/xception.py", + "type": "filepath" + }, + "1595": { + "file_id": 138, + "content": "The code defines a SeparableConv2d class and layers for convolutional layers, initializes an AlignedXception network with skip connections, ReLU activations, and separable convolutions for feature extraction in the backbone architecture, and utilizes pre-trained weights for image classification tasks.", + "type": "summary" + }, + "1596": { + "file_id": 138, + "content": "import math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\ndef fixed_padding(inputs, kernel_size, dilation):\n kernel_size_effective = kernel_size + (kernel_size - 1) * (dilation - 1)\n pad_total = kernel_size_effective - 1\n pad_beg = pad_total // 2\n pad_end = pad_total - pad_beg\n padded_inputs = F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end))\n return padded_inputs\nclass SeparableConv2d(nn.Layer):\n def __init__(self,\n inplanes,\n planes,\n kernel_size=3,\n stride=1,\n dilation=1,\n bias=False,\n BatchNorm=None):\n super(SeparableConv2d, self).__init__()\n self.conv1 = nn.Conv2D(inplanes,\n inplanes,\n kernel_size,\n stride,\n 0,\n dilation,\n groups=inplanes,\n bias=bias)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:1-34" + }, + "1597": { + "file_id": 138, + "content": "The code defines a `SeparableConv2d` class which extends the `nn.Layer` class and implements a separable convolutional layer with optional batch normalization (`BatchNorm`) and fixed padding applied using the `fixed_padding()` function. It has input channels (`inplanes`), output channels (`planes`), kernel size, stride, dilation rate, whether to use bias or not, and an optional BatchNorm layer as parameters.", + "type": "comment" + }, + "1598": { + "file_id": 138, + "content": " self.bn = BatchNorm(inplanes)\n self.pointwise = nn.Conv2D(inplanes, planes, 1, 1, 0, 1, 1, bias=bias)\n def forward(self, x):\n x = fixed_padding(x,\n self.conv1._kernel_size[0],\n dilation=self.conv1.dilation[0])\n x = self.conv1(x)\n x = self.bn(x)\n x = self.pointwise(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n inplanes,\n planes,\n reps,\n stride=1,\n dilation=1,\n BatchNorm=None,\n start_with_relu=True,\n grow_first=True,\n is_last=False):\n super(Block, self).__init__()\n if planes != inplanes or stride != 1:\n self.skip = nn.Conv2D(inplanes,\n planes,\n 1,\n stride=stride,\n bias_attr=False)\n self.skipbn = BatchNorm(planes)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:35-67" + }, + "1599": { + "file_id": 138, + "content": "The code defines a block layer that consists of convolutional layers, batch normalization, and optional skip connections. It initializes the block layer with specified parameters such as input planes, output planes, number of repetitions, stride, dilation rate, and whether it's the last block or not. 
The forward method performs fixed padding on the input, applies the convolution operation, batch normalization, and finally the pointwise convolution.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/16.json b/docs/data/16.json new file mode 100644 index 000000000..d3d78e50c --- /dev/null +++ b/docs/data/16.json @@ -0,0 +1,547 @@ +{ + "1600": { + "file_id": 138, + "content": " else:\n self.skip = None\n self.relu = nn.ReLU()\n rep = []\n filters = inplanes\n if grow_first:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(inplanes,\n planes,\n 3,\n 1,\n dilation,\n BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n filters = planes\n for i in range(reps - 1):\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(filters,\n filters,\n 3,\n 1,\n dilation,\n BatchNorm=BatchNorm))\n rep.append(BatchNorm(filters))\n if not grow_first:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(inplanes,\n planes,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:68-102" + }, + "1601": { + "file_id": 138, + "content": "The code creates a backbone network with xception structure. It initializes the skip connection, adds a ReLU activation function, and appends layers of separable convolutions with batch normalization. The number of reps determines the number of such layers. If grow_first is True, it starts with a growth block; otherwise, it ends with one.", + "type": "comment" + }, + "1602": { + "file_id": 138, + "content": " 3,\n 1,\n dilation,\n BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n if stride != 1:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(planes, planes, 3, 2, BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n if stride == 1 and is_last:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(planes, planes, 3, 1, BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n if not start_with_relu:\n rep = rep[1:]\n self.rep = nn.Sequential(*rep)\n def forward(self, inp):\n x = self.rep(inp)\n if self.skip is not None:\n skip = self.skip(inp)\n skip = self.skipbn(skip)\n else:\n skip = inp\n x = x + skip\n return x\nclass AlignedXception(nn.Layer):\n \"\"\"\n Modified Alighed Xception\n \"\"\"\n def __init__(self, output_stride, BatchNorm, pretrained=True):", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:103-144" + }, + "1603": { + "file_id": 138, + "content": "This code defines a class for the AlignedXception network. It uses separable convolutions with batch normalization and optionally applies ReLU activations at different stages. 
The function forward performs inference by adding input skip connections and applying batch normalization to skip connections if present.", + "type": "comment" + }, + "1604": { + "file_id": 138, + "content": " super(AlignedXception, self).__init__()\n if output_stride == 16:\n entry_block3_stride = 2\n middle_block_dilation = 1\n exit_block_dilations = (1, 2)\n elif output_stride == 8:\n entry_block3_stride = 1\n middle_block_dilation = 2\n exit_block_dilations = (2, 4)\n else:\n raise NotImplementedError\n # Entry flow\n self.conv1 = nn.Conv2D(3, 32, 3, stride=2, padding=1, bias_attr=False)\n self.bn1 = BatchNorm(32)\n self.relu = nn.ReLU()\n self.conv2 = nn.Conv2D(32, 64, 3, stride=1, padding=1, bias_attr=False)\n self.bn2 = BatchNorm(64)\n self.block1 = Block(64,\n 128,\n reps=2,\n stride=2,\n BatchNorm=BatchNorm,\n start_with_relu=False)\n self.block2 = Block(128,\n 256,\n reps=2,\n stride=2,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:145-175" + }, + "1605": { + "file_id": 138, + "content": "This code initializes an AlignedXception network. It sets parameters based on the output_stride, defines convolutional layers and batch normalization for entry flow, and instantiates two blocks with specified dimensions and repetitions.", + "type": "comment" + }, + "1606": { + "file_id": 138, + "content": " BatchNorm=BatchNorm,\n start_with_relu=False,\n grow_first=True)\n self.block3 = Block(256,\n 728,\n reps=2,\n stride=entry_block3_stride,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True,\n is_last=True)\n # Middle flow\n self.block4 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block5 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:176-201" + }, + "1607": { + "file_id": 138, + "content": "The code defines the Xception backbone network, consisting of blocks for feature extraction. The first block (entry_block) has 3x3 convolutions and BatchNorm. The block3 has two repetitions with a stride and is the last block. Middle blocks (block4 and block5) have three repetitions with dilation applied to the filter. All blocks use BatchNorm, start with ReLU activation, and grow first with subsequent layers.", + "type": "comment" + }, + "1608": { + "file_id": 138, + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block6 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block7 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block8 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:202-225" + }, + "1609": { + "file_id": 138, + "content": "The code defines several blocks (block4 to block8) using the Block class with specific parameters for number of input and output channels, repetitions, stride, dilation, BatchNorm implementation, starting with ReLU activation, and growing first. 
These blocks are used in a Xception network for image classification or detection tasks.", + "type": "comment" + }, + "1610": { + "file_id": 138, + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block9 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block10 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block11 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:226-249" + }, + "1611": { + "file_id": 138, + "content": "The code defines three consecutive blocks (block9, block10, and block11) in a neural network architecture. Each block takes input and output channels of 728, with 3 repetitions of convolution and batch normalization layers, and an increasing dilation factor (middle_block_dilation). All blocks start with ReLU activation and grow the number of filters first.", + "type": "comment" + }, + "1612": { + "file_id": 138, + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block12 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block13 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block14 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:250-273" + }, + "1613": { + "file_id": 138, + "content": "The code initializes four block instances, each with 728 input and output channels, performing a series of convolutions with 3 repetitions, a stride of 1, dilation determined by middle_block_dilation, using BatchNorm for normalization, starting with ReLU activation, and growing the first layer.", + "type": "comment" + }, + "1614": { + "file_id": 138, + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block15 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block16 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block17 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:274-297" + }, + "1615": { + "file_id": 138, + "content": "The code initializes three blocks, each with 728 input and output channels, repeating the process 3 times, and applying BatchNormalization, starting with ReLU activation, and growing first. 
These blocks are part of the Xception network in the Ma-Net application for image classification tasks.", + "type": "comment" + }, + "1616": { + "file_id": 138, + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block18 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block19 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n # Exit flow\n self.block20 = Block(728,\n 1024,\n reps=2,\n stride=1,\n dilation=exit_block_dilations[0],", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:298-323" + }, + "1617": { + "file_id": 138, + "content": "The code defines several blocks (block17 to block20) using the Block class. Each block has a specific number of input and output channels, repetitions, stride, dilation rate, BatchNorm type, start with relu, and grow first parameters. The last block (block20) connects its output to the next layer in the network.", + "type": "comment" + }, + "1618": { + "file_id": 138, + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=False,\n is_last=True)\n self.conv3 = SeparableConv2d(1024,\n 1536,\n 3,\n stride=1,\n dilation=exit_block_dilations[1],\n BatchNorm=BatchNorm)\n self.bn3 = BatchNorm(1536)\n self.conv4 = SeparableConv2d(1536,\n 1536,\n 3,\n stride=1,\n dilation=exit_block_dilations[1],\n BatchNorm=BatchNorm)\n self.bn4 = BatchNorm(1536)\n self.conv5 = SeparableConv2d(1536,\n 2048,\n 3,\n stride=1,", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:324-348" + }, + "1619": { + "file_id": 138, + "content": "This code defines a series of SeparableConv2d layers with associated BatchNorm layers in an Xception network. The layers have increasing output dimensions and are used for feature extraction and image classification tasks.", + "type": "comment" + }, + "1620": { + "file_id": 138, + "content": " dilation=exit_block_dilations[1],\n BatchNorm=BatchNorm)\n self.bn5 = BatchNorm(2048)\n # Init weights\n self._init_weight()\n # Load pretrained model\n if pretrained:\n self._load_pretrained_model()\n def forward(self, x):\n # Entry flow\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu(x)\n x = self.block1(x)\n # add relu here\n x = self.relu(x)\n low_level_feat = x\n x = self.block2(x)\n x = self.block3(x)\n # Middle flow\n x = self.block4(x)\n x = self.block5(x)\n x = self.block6(x)\n x = self.block7(x)\n x = self.block8(x)\n x = self.block9(x)\n x = self.block10(x)\n x = self.block11(x)\n x = self.block12(x)\n x = self.block13(x)\n x = self.block14(x)\n x = self.block15(x)\n x = self.block16(x)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:349-390" + }, + "1621": { + "file_id": 138, + "content": "The code defines a neural network model with 16 blocks and Batch Normalization. 
It initializes the weights, has an option to load pre-trained weights, and includes ReLU activation functions.", + "type": "comment" + }, + "1622": { + "file_id": 138, + "content": " x = self.block17(x)\n x = self.block18(x)\n x = self.block19(x)\n # Exit flow\n x = self.block20(x)\n x = self.relu(x)\n x = self.conv3(x)\n x = self.bn3(x)\n x = self.relu(x)\n x = self.conv4(x)\n x = self.bn4(x)\n x = self.relu(x)\n x = self.conv5(x)\n x = self.bn5(x)\n x = self.relu(x)\n return x, low_level_feat\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n m.weight.normal_(0, math.sqrt(2. / n))\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)\n def _load_pretrained_model(self):\n import paddlehub as hub\n pretrain_dict = hub.Module(name=\"xception71_imagenet\")\n model_dict = {}\n state_dict = self.state_dict()", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:391-427" + }, + "1623": { + "file_id": 138, + "content": "The code defines a neural network model, initializes its weights, and has methods for processing input and loading pre-trained models. The Xception model is used in image classification tasks. It consists of several convolutional layers with batch normalization and ReLU activation functions. The _init_weight method sets up the initial weights for the convolutional layers using a normal distribution. The _load_pretrained_model method allows loading a pre-trained Xception model from the PaddleHub library, which can be useful when transferring knowledge from an existing dataset to a new task.", + "type": "comment" + }, + "1624": { + "file_id": 138, + "content": " for k, v in pretrain_dict.items():\n if k in model_dict:\n if 'pointwise' in k:\n v = v.unsqueeze(-1).unsqueeze(-1)\n if k.startswith('block11'):\n model_dict[k] = v\n model_dict[k.replace('block11', 'block12')] = v\n model_dict[k.replace('block11', 'block13')] = v\n model_dict[k.replace('block11', 'block14')] = v\n model_dict[k.replace('block11', 'block15')] = v\n model_dict[k.replace('block11', 'block16')] = v\n model_dict[k.replace('block11', 'block17')] = v\n model_dict[k.replace('block11', 'block18')] = v\n model_dict[k.replace('block11', 'block19')] = v\n elif k.startswith('block12'):\n model_dict[k.replace('block12', 'block20')] = v\n elif k.startswith('bn3'):\n model_dict[k] = v\n model_dict[k.replace('bn3', 'bn4')] = v", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:429-447" + }, + "1625": { + "file_id": 138, + "content": "The code iterates through the pre-trained dictionary, updating specific keys in the model_dict. It handles 'pointwise' layers by unsqueezing the input, and adjusts keys starting with 'block11', 'block12', or 'bn3' by replacing their suffixes to match corresponding blocks. 
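A hedged sketch of that remapping idea (illustrative helper name, not the repository's code): the weights of one pretrained middle block are reused to initialise the extra middle blocks of this deeper backbone.

def remap_pretrained_keys(pretrain_dict, model_keys):
    # Copy one pretrained middle block into block11..block19 and shift the exit block.
    # (The 'pointwise' reshaping and the bn3/conv4/bn4 renames shown above are omitted.)
    remapped = {}
    for k, v in pretrain_dict.items():
        if k not in model_keys:
            continue
        if k.startswith('block11'):
            for i in range(11, 20):
                remapped[k.replace('block11', 'block%d' % i)] = v
        elif k.startswith('block12'):
            remapped[k.replace('block12', 'block20')] = v
        else:
            remapped[k] = v
    return remapped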
This is likely a method for adapting pre-trained weights to match the target network's structure.", + "type": "comment" + }, + "1626": { + "file_id": 138, + "content": " elif k.startswith('conv4'):\n model_dict[k.replace('conv4', 'conv5')] = v\n elif k.startswith('bn4'):\n model_dict[k.replace('bn4', 'bn5')] = v\n else:\n model_dict[k] = v\n state_dict.update(model_dict)\n self.set_state_dict(state_dict)", + "type": "code", + "location": "/applications/Ma-Net/networks/backbone/xception.py:448-455" + }, + "1627": { + "file_id": 138, + "content": "This code snippet renames 'conv4' and 'bn4' parameters to 'conv5' and 'bn5', respectively, before updating the model dictionary. The final state dictionary is then set as the model's state dictionary.", + "type": "comment" + }, + "1628": { + "file_id": 139, + "content": "/applications/Ma-Net/networks/decoder.py", + "type": "filepath" + }, + "1629": { + "file_id": 139, + "content": "Decoder neural network layer uses backbone features, 2D convolution, batch normalization, and ReLU activation for class prediction. The `build_decoder` function constructs a decoder network with specified number of classes, backbone architecture, and Batch Normalization implementation.", + "type": "summary" + }, + "1630": { + "file_id": 139, + "content": "import math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom utils.api import kaiming_normal_\nclass Decoder(nn.Layer):\n def __init__(self, num_classes, backbone, BatchNorm):\n super(Decoder, self).__init__()\n if backbone == 'resnet' or backbone == 'drn' or backbone == 'resnet_edge':\n low_level_inplanes = 256\n elif backbone == 'xception':\n low_level_inplanes = 128\n elif backbone == 'mobilenet':\n low_level_inplanes = 24\n else:\n raise NotImplementedError\n self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False)\n self.bn1 = BatchNorm(48)\n self.relu = nn.ReLU(True)\n self.last_conv = nn.Sequential(\n nn.Conv2D(304,\n 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential(),\n nn.Conv2D(256,\n 256,", + "type": "code", + "location": "/applications/Ma-Net/networks/decoder.py:1-32" + }, + "1631": { + "file_id": 139, + "content": "Decoder is a neural network layer that takes in features from backbone and outputs predicted classes. It initializes convolution layers with different input planes based on the specified backbone. 
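The per-backbone low-level channel widths selected in Decoder.__init__ can be summarised as a small lookup (illustrative constant, not repository code):

# Low-level feature channels expected by the decoder for each supported backbone.
LOW_LEVEL_INPLANES = {
    'resnet': 256,
    'drn': 256,
    'resnet_edge': 256,
    'xception': 128,
    'mobilenet': 24,
}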
It includes batch normalization, ReLU activation, and sequential convolutions for feature extraction.", + "type": "comment" + }, + "1632": { + "file_id": 139, + "content": " kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential())\n self._init_weight()\n def forward(self, x, low_level_feat):\n low_level_feat = self.conv1(low_level_feat)\n low_level_feat = self.bn1(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x, low_level_feat), axis=1)\n x = self.last_conv(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)", + "type": "code", + "location": "/applications/Ma-Net/networks/decoder.py:33-62" + }, + "1633": { + "file_id": 139, + "content": "Decoder network with 2D convolution, batch normalization, and ReLU activation. Initializes weight using Kaiming Normal distribution.", + "type": "comment" + }, + "1634": { + "file_id": 139, + "content": "def build_decoder(num_classes, backbone, BatchNorm):\n return Decoder(num_classes, backbone, BatchNorm)", + "type": "code", + "location": "/applications/Ma-Net/networks/decoder.py:65-66" + }, + "1635": { + "file_id": 139, + "content": "The function `build_decoder` takes parameters `num_classes`, `backbone`, and `BatchNorm` and returns an instance of the `Decoder` class. The purpose is to construct a decoder network for the specified number of classes, using the chosen backbone architecture and Batch Normalization implementation.", + "type": "comment" + }, + "1636": { + "file_id": 140, + "content": "/applications/Ma-Net/networks/deeplab.py", + "type": "filepath" + }, + "1637": { + "file_id": 140, + "content": "This code defines a `FrozenBatchNorm2d` class for batch normalization without updating statistics and a `DeepLab` class with backbone, ASPP module, decoder, and methods to freeze batch norm layers. 
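A minimal sketch of the arithmetic FrozenBatchNorm2d performs, using NumPy only for clarity: the stored statistics are folded into a fixed per-channel scale and shift and are never updated.

import numpy as np

def frozen_bn(x, weight, bias, running_mean, running_var):
    # weight * (x - mean) / sqrt(var) + bias, rewritten as x * scale + shift
    scale = weight / np.sqrt(running_var)
    shift = bias - running_mean * scale
    return x * scale.reshape(1, -1, 1, 1) + shift.reshape(1, -1, 1, 1)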
It also provides a function that iterates through certain modules, yielding parameters requiring gradient updates for potentially applying different learning rates.", + "type": "summary" + }, + "1638": { + "file_id": 140, + "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom networks.aspp import build_aspp\nfrom networks.decoder import build_decoder\nfrom networks.backbone import build_backbone\nclass FrozenBatchNorm2d(nn.Layer):\n def __init__(self, n):\n super(FrozenBatchNorm2d, self).__init__()\n self.register_buffer(\"weight\", paddle.ones(n))\n self.register_buffer(\"bias\", paddle.zeros(n))\n self.register_buffer(\"running_mean\", paddle.zeros(n))\n self.register_buffer(\"running_var\", paddle.ones(n))\n def forward(self, x):\n if x.dtype == paddle.float16:\n self.weight = self.weight.half()\n self.bias = self.bias.half()\n self.running_mean = self.running_mean.half()\n self.running_var = self.running_var.half()\n scale = self.weight * self.running_var.rsqrt()\n bias = self.bias - self.running_mean * scale\n scale = scale.reshape(1, -1, 1, 1)\n bias = bias.reshape(1, -1, 1, 1)\n return x * scale + bias\nclass DeepLab(nn.Layer):", + "type": "code", + "location": "/applications/Ma-Net/networks/deeplab.py:1-31" + }, + "1639": { + "file_id": 140, + "content": "The code defines a `FrozenBatchNorm2d` class that extends the `nn.Layer` and overrides the `forward()` function to perform batch normalization without updating statistics. The `DeepLab` class inherits from `nn.Layer` and serves as a backbone for the deeplab network architecture, incorporating a backbone network, ASPP module, and decoder.", + "type": "comment" + }, + "1640": { + "file_id": 140, + "content": " def __init__(self,\n backbone='resnet',\n output_stride=16,\n num_classes=21,\n sync_bn=True,\n freeze_bn=False):\n super(DeepLab, self).__init__()\n if backbone == 'drn':\n output_stride = 8\n if freeze_bn == True:\n print(\"Use frozen BN in DeepLab\")\n BatchNorm = FrozenBatchNorm2d\n else:\n BatchNorm = nn.BatchNorm2D\n self.backbone = build_backbone(backbone, output_stride, BatchNorm)\n self.aspp = build_aspp(backbone, output_stride, BatchNorm)\n self.decoder = build_decoder(num_classes, backbone, BatchNorm)\n def forward(self, input):\n x, low_level_feat = self.backbone(input)\n x = self.aspp(x)\n x = self.decoder(x, low_level_feat)\n return x\n def freeze_bn(self):\n for m in self.sublayers():\n if isinstance(m, nn.BatchNorm2D):\n m.eval()\n def get_1x_lr_params(self):\n modules = [self.backbone]", + "type": "code", + "location": "/applications/Ma-Net/networks/deeplab.py:32-64" + }, + "1641": { + "file_id": 140, + "content": "This code defines the DeepLab class with an initializer that takes arguments for backbone, output stride, number of classes, and whether to freeze batch normalization layers. 
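A hedged usage sketch for the class above; it assumes the Ma-Net application layout, its dependencies, and the constructor defaults shown in this file.

import paddle
from networks.deeplab import DeepLab

model = DeepLab(backbone='resnet', output_stride=16, num_classes=21, freeze_bn=False)
x = paddle.randn([1, 3, 416, 416])
logits = model(x)  # decoder logits; callers interpolate back to the input resolution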
It also includes methods to freeze batch norm layers, retrieve parameters for 1x learning rate, and a forward pass function.", + "type": "comment" + }, + "1642": { + "file_id": 140, + "content": " for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p\n def get_10x_lr_params(self):\n modules = [self.aspp, self.decoder]\n for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p", + "type": "code", + "location": "/applications/Ma-Net/networks/deeplab.py:65-81" + }, + "1643": { + "file_id": 140, + "content": "This code defines a function that iterates through certain modules of the network, specifically looking for convolution and batch normalization layers. It then yields the parameters of these layers that require gradient updates. This process is used in both the main body and the get_10x_lr_params method to potentially apply different learning rates to specific parts of the model.", + "type": "comment" + }, + "1644": { + "file_id": 141, + "content": "/applications/Ma-Net/networks/loss.py", + "type": "filepath" + }, + "1645": { + "file_id": 141, + "content": "The given code defines a custom loss function, Added_CrossEntropyLoss, which extends nn.Layer class and optionally uses hard example mining for better training by computing the loss for top k percent pixels. This loss function is designed to improve performance in image classification tasks using a weighted sum of binary cross-entropy and pixel loss with top-k pixel selection.", + "type": "summary" + }, + "1646": { + "file_id": 141, + "content": "import paddle\nimport paddle.nn as nn\nimport os\nclass Added_BCEWithLogitsLoss(nn.Layer):\n def __init__(self,\n top_k_percent_pixels=None,\n hard_example_mining_step=100000):\n super(Added_BCEWithLogitsLoss, self).__init__()\n self.top_k_percent_pixels = top_k_percent_pixels\n if top_k_percent_pixels is not None:\n assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1)\n self.hard_example_mining_step = hard_example_mining_step\n if self.top_k_percent_pixels == None:\n self.bceloss = nn.BCEWithLogitsLoss(reduction='mean')\n else:\n self.bceloss = nn.BCEWithLogitsLoss(reduction='none')\n def forward(self, dic_tmp, y, step):\n final_loss = 0\n for seq_name in dic_tmp.keys():\n pred_logits = dic_tmp[seq_name]\n gts = y[seq_name]\n if self.top_k_percent_pixels == None:\n final_loss += self.bceloss(pred_logits, gts)\n else:\n # Only compute the loss for top k percent pixels.", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:1-28" + }, + "1647": { + "file_id": 141, + "content": "This code defines a custom loss function that extends `nn.Layer` and uses BCEWithLogitsLoss from PaddlePaddle. It has an optional argument for top_k_percent_pixels to compute the loss only for the top k percent of pixels. If top_k_percent_pixels is None, it computes the mean loss for all pixels. The function also has a hard_example_mining_step parameter that may be used in future implementations but currently unused.", + "type": "comment" + }, + "1648": { + "file_id": 141, + "content": " # First, compute the loss for all pixels. 
Note we do not put the loss\n # to loss_collection and set reduction = None to keep the shape.\n num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3])\n pred_logits = pred_logits.view(\n -1, pred_logits.shape[1],\n pred_logits.shape[2] * pred_logits.shape[3])\n gts = gts.view(-1, gts.shape[1], gts.shape[2] * gts.shape[3])\n pixel_losses = self.bceloss(pred_logits, gts)\n if self.hard_example_mining_step == 0:\n top_k_pixels = int(self.top_k_percent_pixels * num_pixels)\n else:\n ratio = min(1.0,\n step / float(self.hard_example_mining_step))\n top_k_pixels = int((ratio * self.top_k_percent_pixels +\n (1.0 - ratio)) * num_pixels)\n _, top_k_indices = paddle.topk(pixel_losses,", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:29-44" + }, + "1649": { + "file_id": 141, + "content": "Computes the loss for all pixels, without adding to loss_collection and keeps the shape. Then, based on hard example mining step, determines the number of top K pixels to consider.", + "type": "comment" + }, + "1650": { + "file_id": 141, + "content": " k=top_k_pixels,\n axis=2)\n final_loss += nn.BCEWithLogitsLoss(weight=top_k_indices,\n reduction='mean')(\n pred_logits, gts)\n return final_loss\nclass Added_CrossEntropyLoss(nn.Layer):\n def __init__(self,\n top_k_percent_pixels=None,\n hard_example_mining_step=100000):\n super(Added_CrossEntropyLoss, self).__init__()\n self.top_k_percent_pixels = top_k_percent_pixels\n if top_k_percent_pixels is not None:\n assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1)\n self.hard_example_mining_step = hard_example_mining_step\n if self.top_k_percent_pixels == None:\n self.celoss = nn.CrossEntropyLoss(ignore_index=255,\n reduction='mean')\n else:\n self.celoss = nn.CrossEntropyLoss(ignore_index=255,", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:45-67" + }, + "1651": { + "file_id": 141, + "content": "This code defines a custom loss function, Added_CrossEntropyLoss, that extends nn.Layer class. It has an optional parameter, top_k_percent_pixels, which determines whether to use hard example mining for better training. If this parameter is None, it falls back to using nn.CrossEntropyLoss with mean reduction. The code also initializes other attributes like self.top_k_percent_pixels and self.hard_example_mining_step based on the provided values.", + "type": "comment" + }, + "1652": { + "file_id": 141, + "content": " reduction='none')\n def forward(self, dic_tmp, y, step):\n final_loss = 0\n for seq_name in dic_tmp.keys():\n pred_logits = dic_tmp[seq_name]\n gts = y[seq_name]\n if self.top_k_percent_pixels == None:\n final_loss += self.celoss(pred_logits, gts)\n else:\n # Only compute the loss for top k percent pixels.\n # First, compute the loss for all pixels. 
Note we do not put the loss\n # to loss_collection and set reduction = None to keep the shape.\n num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3])\n pred_logits = pred_logits.reshape([\n pred_logits.shape[1],\n pred_logits.shape[2] * pred_logits.shape[3]\n ]).transpose([1, 0])\n gts = gts.reshape([gts.shape[1] * gts.shape[2]])\n pixel_losses = self.celoss(pred_logits, gts).reshape([1, -1])", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:68-87" + }, + "1653": { + "file_id": 141, + "content": "Computes the loss for top k percent pixels by first computing the loss for all pixels, reshaping them, and then selecting only the top k percent.", + "type": "comment" + }, + "1654": { + "file_id": 141, + "content": " if self.hard_example_mining_step == 0:\n top_k_pixels = int(self.top_k_percent_pixels * num_pixels)\n else:\n ratio = min(1.0,\n step / float(self.hard_example_mining_step))\n top_k_pixels = int((ratio * self.top_k_percent_pixels +\n (1.0 - ratio)) * num_pixels)\n top_k_loss, top_k_indices = paddle.topk(pixel_losses,\n k=top_k_pixels,\n axis=1)\n final_loss += paddle.mean(top_k_loss)\n return final_loss\nclass AddedEdge_CrossEntropyLoss(nn.Layer):\n def __init__(self,\n top_k_percent_pixels=None,\n hard_example_mining_step=100000):\n super(AddedEdge_CrossEntropyLoss, self).__init__()\n self.top_k_percent_pixels = top_k_percent_pixels\n if top_k_percent_pixels is not None:", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:88-109" + }, + "1655": { + "file_id": 141, + "content": "The code defines a class called \"AddedEdge_CrossEntropyLoss\" which extends the base Layer class. It calculates the cross-entropy loss for a classification task while implementing hard example mining and top-k pixel selection strategies to improve performance. The top_k_percent_pixels and hard_example_mining_step parameters control these strategies, with different behavior depending on the current step value. The code block provided calculates the final loss by averaging over the top-k losses.", + "type": "comment" + }, + "1656": { + "file_id": 141, + "content": " assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1)\n self.hard_example_mining_step = hard_example_mining_step\n self.celoss = None\n def forward(self, pred_logits, gts, step):\n pos_num = paddle.sum(gts == 1, dtype='float32')\n neg_num = paddle.sum(gts == 0, dtype='float32')\n weight_pos = neg_num / (pos_num + neg_num)\n weight_neg = pos_num / (pos_num + neg_num)\n weights = paddle.to_tensor([weight_neg, weight_pos])\n if self.top_k_percent_pixels == None:\n sig_pred_logits = paddle.nn.functional.sigmoid(pred_logits)\n self.bceloss = nn.BCEWithLogitsLoss(pos_weight=weight_pos,\n reduction='mean')\n if paddle.sum(gts) == 0:\n dcloss = 0\n else:\n dcloss = (paddle.sum(sig_pred_logits * sig_pred_logits) +\n paddle.sum(gts * gts)) / (\n paddle.sum(2 * sig_pred_logits * gts) + 1e-5)", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:110-130" + }, + "1657": { + "file_id": 141, + "content": "This code defines a class for a loss function with hard example mining step, top_k_percent_pixels and forward method. It calculates weights based on positive and negative numbers, and applies them to the BCEWithLogitsLoss if top_k_percent_pixels is None. 
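A minimal sketch of that class-balancing step (NumPy for clarity): the positive-class weight is the fraction of negative pixels and vice versa, so sparse edge pixels are up-weighted.

import numpy as np

def edge_class_weights(gts):
    pos = float((gts == 1).sum())
    neg = float((gts == 0).sum())
    weight_pos = neg / (pos + neg)  # rare edge pixels receive the larger weight
    weight_neg = pos / (pos + neg)
    return weight_neg, weight_pos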
The code also calculates the dcloss for cases where gts sum is 0.", + "type": "comment" + }, + "1658": { + "file_id": 141, + "content": " final_loss = 0.1 * self.bceloss(pred_logits, gts) + dcloss\n else:\n self.celoss = nn.CrossEntropyLoss(weight=weights,\n ignore_index=255,\n reduction='none')\n num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3])\n pred_logits = pred_logits.view(\n -1, pred_logits.shape[1],\n pred_logits.shape[2] * pred_logits.shape[3])\n gts = gts.view(-1, gts.shape[2] * gts.shape[3])\n pixel_losses = self.celoss(pred_logits, gts)\n if self.hard_example_mining_step == 0:\n top_k_pixels = int(self.top_k_percent_pixels * num_pixels)\n else:\n ratio = min(1.0, step / float(self.hard_example_mining_step))\n top_k_pixels = int((ratio * self.top_k_percent_pixels +\n (1.0 - ratio)) * num_pixels)\n top_k_loss, top_k_indices = paddle.topk(pixel_losses,", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:131-148" + }, + "1659": { + "file_id": 141, + "content": "The code calculates the final loss for an image classification task. If the step is not zero, it uses hard example mining to calculate the pixel losses and select top K pixels based on a ratio of the current step. The final_loss is a weighted sum of binary cross-entropy (bceloss) and pixel loss.", + "type": "comment" + }, + "1660": { + "file_id": 141, + "content": " k=top_k_pixels,\n axis=1)\n final_loss = paddle.mean(top_k_loss)\n return final_loss", + "type": "code", + "location": "/applications/Ma-Net/networks/loss.py:149-153" + }, + "1661": { + "file_id": 141, + "content": "This code calculates the mean loss value by taking top-k pixel values from input images, and then averages them. This can be useful in image recognition tasks where some pixels have higher importance or relevance.", + "type": "comment" + }, + "1662": { + "file_id": 142, + "content": "/applications/Ma-Net/run.sh", + "type": "filepath" + }, + "1663": { + "file_id": 142, + "content": "The code prepares an environment for training and testing a computer vision model using the DeeplabV3_coco pre-trained model on the DAVIS dataset. 
It sets parameters, saves intermediate results, and runs test.py for actual testing without IntSeg enabled.", + "type": "summary" + }, + "1664": { + "file_id": 142, + "content": "PRETRAIN_MODEL='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/DeeplabV3_coco.pdparams'\nVOS_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/MaNet_davis2017_stage1.pdparams'\n#VOS_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/stage1'\nINT_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/MANet_davis2017.pdparams'\n#INT_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/stage2'\nINT_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/result'\nRESCALE=416\nRANDOMCROP=416\nDATA_ROOT='/home/lc/PaddleVideo/data/DAVIS'\necho 'Stage1 training'\nCUDA_VISIBLE_DEVICE=3 python train_stage1.py --SAVE_RESULT_DIR $VOS_SAVE_RESULT_DIR --PRETRAINED_MODEL $PRETRAIN_MODEL --DATA_ROOT $DATA_ROOT --TRAIN_BATCH_SIZE 2 --DATA_RESCALE $RESCALE --DATA_RANDOMCROP $RANDOMCROP --TRAIN_LR 0.0007 --MODEL_MAX_LOCAL_DISTANCE 12\necho 'Stage2 training'\npython train_stage2.py --SAVE_RESULT_DIR $INT_SAVE_RESULT_DIR --SAVE_VOS_RESULT_DIR $", + "type": "code", + "location": "/applications/Ma-Net/run.sh:1-13" + }, + "1665": { + "file_id": 142, + "content": "The code sets environment variables, then runs two training scripts for a computer vision model in stages. It uses pre-trained DeeplabV3_coco model, and saves intermediate and final results to specified directories. The model is trained with specific parameters on the DAVIS dataset.", + "type": "comment" + }, + "1666": { + "file_id": 142, + "content": "VOS_SAVE_RESULT_DIR --DATA_ROOT $DATA_ROOT --DATA_RESCALE $RESCALE --DATA_RANDOMCROP $RANDOMCROP --PRETRAINED_MODEL $PRETRAIN_MODEL\necho 'Testing'\npython test.py --DATA_ROOT $DATA_ROOT --SAVE_RESULT_DIR $INT_SAVE_RESULT_DIR --RESULT_ROOT $INT_RESULT_DIR --MODEL_USEIntSeg False --TEST_MODE True", + "type": "code", + "location": "/applications/Ma-Net/run.sh:13-15" + }, + "1667": { + "file_id": 142, + "content": "This code is setting up a testing environment for a video object segmentation task. It specifies the data root, save result directory, pre-trained model path and then executes a test.py script to perform the actual testing. The model is set to not use IntSeg and the TEST_MODE flag is set to True, indicating that this is indeed a testing run.", + "type": "comment" + }, + "1668": { + "file_id": 143, + "content": "/applications/Ma-Net/test.py", + "type": "filepath" + }, + "1669": { + "file_id": 143, + "content": "The code prepares for DAVIS2017 image processing, initializes variables, and utilizes PaddlePaddle for video object detection. 
It involves an interactive image classification system with 8 turns, optimizing scribble labels and filtering keys.", + "type": "summary" + }, + "1670": { + "file_id": 143, + "content": "import cv2\nimport os\nimport json\nimport paddle\nfrom PIL import Image\nimport timeit\nimport numpy as np\nfrom paddle.vision import transforms\nfrom dataloaders.davis_2017_f import DAVIS2017_Feature_Extract\nimport dataloaders.custom_transforms_f as tr\nfrom davisinteractive.session import DavisInteractiveSession\nfrom networks.deeplab import DeepLab\nfrom networks.IntVOS import IntVOS\nimport time\nfrom davisinteractive.utils.scribbles import scribbles2mask, annotated_frames\nfrom config import cfg\nfrom paddle import nn\nfrom paddle.io import DataLoader\nfrom utils.api import float_, byte_\n@paddle.no_grad()\ndef main():\n paddle.set_device(\"gpu:0\")\n total_frame_num_dic = {}\n #################\n seqs = []\n with open(os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017',\n 'val' + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n seqs.extend(seqs_tmp)\n h_w_dic = {}\n for seq_name in seqs:\n images = np.sort(\n os.listdir(", + "type": "code", + "location": "/applications/Ma-Net/test.py:1-39" + }, + "1671": { + "file_id": 143, + "content": "The code imports necessary libraries and defines functions for processing image data from the DAVIS2017 dataset. It sets up the required transforms, initializes the network models, and reads in the dataset sequences.", + "type": "comment" + }, + "1672": { + "file_id": 143, + "content": " os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/',\n seq_name.strip())))\n total_frame_num_dic[seq_name] = len(images)\n im_ = cv2.imread(\n os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/', seq_name,\n '00000.jpg'))\n im_ = np.array(im_, dtype=np.float32)\n hh_, ww_ = im_.shape[:2]\n h_w_dic[seq_name] = (hh_, ww_)\n _seq_list_file = os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017',\n 'v_a_l' + '_instances.txt')\n seq_dict = json.load(open(_seq_list_file, 'r'))\n ##################\n seq_imgnum_dict_ = {}\n seq_imgnum_dict = os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017',\n 'val_imgnum.txt')\n if os.path.isfile(seq_imgnum_dict):\n seq_imgnum_dict_ = json.load(open(seq_imgnum_dict, 'r'))\n else:\n for seq in os.listdir(os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/')):\n seq_imgnum_dict_[seq] = len(\n os.listdir(os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/',", + "type": "code", + "location": "/applications/Ma-Net/test.py:40-62" + }, + "1673": { + "file_id": 143, + "content": "This code reads a configuration file and initializes variables for video analysis. 
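A hedged sketch of the per-sequence frame-count cache built here (illustrative helper name): count the frames of every sequence once and store the result as JSON for later runs.

import json
import os

def build_imgnum_cache(data_root, cache_path):
    if os.path.isfile(cache_path):
        return json.load(open(cache_path, 'r'))
    counts = {}
    img_root = os.path.join(data_root, 'JPEGImages/480p/')
    for seq in os.listdir(img_root):
        counts[seq] = len(os.listdir(os.path.join(img_root, seq)))
    with open(cache_path, 'w') as f:
        json.dump(counts, f)
    return counts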
It loads image information from disk, checks if an existing imgnum dictionary is available, and if not, it populates it by iterating through directories and counting images.", + "type": "comment" + }, + "1674": { + "file_id": 143, + "content": " seq)))\n with open(seq_imgnum_dict, 'w') as f:\n json.dump(seq_imgnum_dict_, f)\n ##################\n is_save_image = False # Save the predicted masks\n report_save_dir = cfg.RESULT_ROOT\n save_res_dir = cfg.SAVE_RESULT_DIR # changed to path\n if not os.path.exists(cfg.RESULT_ROOT):\n os.makedirs(cfg.RESULT_ROOT)\n # Configuration used in the challenges\n max_nb_interactions = 8 # Maximum number of interactions\n max_time_per_interaction = 30 # Maximum time per interaction per object\n # Total time available to interact with a sequence and an initial set of scribbles\n max_time = max_nb_interactions * max_time_per_interaction # Maximum time per object\n # Interactive parameters\n subset = 'val'\n host = 'localhost' # 'localhost' for subsets train and val.\n feature_extracter = DeepLab(backbone='resnet', freeze_bn=False)\n model = IntVOS(cfg, feature_extracter)\n print('model loading...')\n saved_model_dict = save_res_dir", + "type": "code", + "location": "/applications/Ma-Net/test.py:63-87" + }, + "1675": { + "file_id": 143, + "content": "Creating a dictionary of image numbers and saving it, setting save flags for predicted masks, checking if results directory exists and creating it if not, defining maximum interactive parameters, importing DeepLab model and Instant VOS model, and loading the saved model from specified location.", + "type": "comment" + }, + "1676": { + "file_id": 143, + "content": " pretrained_dict = paddle.load(saved_model_dict)\n load_network(model, pretrained_dict)\n print(f'model loading from {saved_model_dict} finished!')\n model.eval()\n inter_file = open(os.path.join(cfg.RESULT_ROOT, 'inter_file.txt'), 'w')\n resized_h, resized_w = 480, 854\n ###############################\n composed_transforms = transforms.Compose(\n [tr.Resize((resized_h, resized_w)),\n tr.ToTensor()])\n ###############################\n seen_seq = []\n n = 0\n max_n = 1\n with DavisInteractiveSession(host=host,\n davis_root=cfg.DATA_ROOT,\n subset=subset,\n report_save_dir=report_save_dir,\n max_nb_interactions=max_nb_interactions,\n max_time=max_time,\n metric_to_optimize='J') as sess:\n while sess.next():\n t_total = timeit.default_timer()\n # Get the current iteration scribbles", + "type": "code", + "location": "/applications/Ma-Net/test.py:88-113" + }, + "1677": { + "file_id": 143, + "content": "This code loads a pre-trained model, evaluates it, and initializes variables for processing scribbles. The code also defines a transform to resize images to specific dimensions, opens a file for writing, and sets up a DavisInteractiveSession object for iterating over interaction data. 
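A hedged sketch of the evaluation loop this file drives; predict_masks is a placeholder callback standing in for the model logic, and the session arguments follow the calls shown above.

from davisinteractive.session import DavisInteractiveSession

def interactive_loop(cfg, predict_masks):
    with DavisInteractiveSession(host='localhost',
                                 davis_root=cfg.DATA_ROOT,
                                 subset='val',
                                 report_save_dir=cfg.RESULT_ROOT,
                                 max_nb_interactions=8,
                                 max_time=8 * 30) as sess:
        while sess.next():
            sequence, scribbles, first_scribble = sess.get_scribbles(only_last=True)
            masks = predict_masks(sequence, scribbles, first_scribble)
            sess.submit_masks(masks)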
The session will continue until there are no more interactions left in the dataset.", + "type": "comment" + }, + "1678": { + "file_id": 143, + "content": " sequence, scribbles, first_scribble = sess.get_scribbles(\n only_last=True)\n h, w = h_w_dic[sequence]\n if 'prev_label_storage' not in locals().keys():\n prev_label_storage = paddle.zeros(\n [104, h, w]) # because the maximum length of frames is 104.\n print(sequence)\n h, w = h_w_dic[sequence]\n if len(\n annotated_frames(scribbles)\n ) == 0: # if no scribbles return, keep masks in previous round\n final_masks = prev_label_storage[:seq_imgnum_dict_[sequence]]\n sess.submit_masks(final_masks.numpy())\n else:\n start_annotated_frame = annotated_frames(scribbles)[0]\n pred_masks = []\n pred_masks_reverse = []\n if first_scribble: # If in the first round, initialize memories\n n_interaction = 1\n eval_global_map_tmp_dic = {}\n local_map_dics = ({}, {})", + "type": "code", + "location": "/applications/Ma-Net/test.py:115-139" + }, + "1679": { + "file_id": 143, + "content": "The code is retrieving scribbles and their corresponding sequence, image dimensions are assigned based on the dictionary h_w_dic. If there are no annotated frames from the scribbles, it returns previous masks and submits them. Otherwise, it initializes memories for the first round.", + "type": "comment" + }, + "1680": { + "file_id": 143, + "content": " total_frame_num = total_frame_num_dic[sequence]\n obj_nums = seq_dict[sequence][-1]\n else:\n n_interaction += 1\n ##\n inter_file.write(sequence + ' ' + 'interaction' +\n str(n_interaction) + ' ' + 'frame' +\n str(start_annotated_frame) + '\\n')\n ##\n ##########################Reference image process\n if first_scribble: # if in the first round, extract pixel embbedings.\n if sequence not in seen_seq:\n inter_turn = 1\n seen_seq.append(sequence)\n embedding_memory = []\n test_dataset = DAVIS2017_Feature_Extract(\n root=cfg.DATA_ROOT,\n transform=composed_transforms,\n seq_name=sequence)\n testloader = DataLoader(test_dataset,", + "type": "code", + "location": "/applications/Ma-Net/test.py:140-163" + }, + "1681": { + "file_id": 143, + "content": "This code is part of an interaction detection process. It writes the interaction details to a file, including the sequence name, type, and frame number. It also checks if the sequence has been seen before and prepares embedding memory for reference image processing. The code uses DAVIS2017_Feature_Extract to extract pixel embeddings in the first round of annotations.", + "type": "comment" + }, + "1682": { + "file_id": 143, + "content": " batch_size=14,\n shuffle=False,\n num_workers=cfg.NUM_WORKER)\n for ii, sample in enumerate(testloader):\n imgs = sample['img1']\n frame_embedding = model.extract_feature(imgs)\n embedding_memory.append(frame_embedding)\n del frame_embedding\n embedding_memory = paddle.concat(embedding_memory, 0)\n _, _, emb_h, emb_w = embedding_memory.shape\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n inter_turn += 1\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]", + "type": "code", + "location": "/applications/Ma-Net/test.py:164-182" + }, + "1683": { + "file_id": 143, + "content": "This code is iterating through testloader and extracting frame embeddings for each image. The extracted embeddings are then concatenated to form a single embedding memory. 
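A minimal sketch of that memory-building step; it assumes a DataLoader yielding batches with an 'img1' key and a model exposing extract_feature, as in this file.

import paddle

def build_embedding_memory(model, loader):
    chunks = []
    for sample in loader:
        chunks.append(model.extract_feature(sample['img1']))
    return paddle.concat(chunks, 0)  # [num_frames, C, emb_h, emb_w]

# memory = build_embedding_memory(model, testloader)
# ref_frame_embedding = memory[start_annotated_frame].unsqueeze(0)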
If annotated frames are present, the reference frame embedding is extracted from the embedding memory.", + "type": "comment" + }, + "1684": { + "file_id": 143, + "content": " ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n ########\n scribble_masks = scribbles2mask(scribbles, (emb_h, emb_w))\n scribble_label = scribble_masks[start_annotated_frame]\n scribble_sample = {'scribble_label': scribble_label}\n scribble_sample = tr.ToTensor()(scribble_sample)\n # print(ref_frame_embedding, ref_frame_embedding.shape)\n scribble_label = scribble_sample['scribble_label']\n scribble_label = scribble_label.unsqueeze(0)\n ######\n if is_save_image:\n ref_scribble_to_show = scribble_label.squeeze().numpy()\n im_ = Image.fromarray(\n ref_scribble_to_show.astype('uint8')).convert('P', )", + "type": "code", + "location": "/applications/Ma-Net/test.py:183-203" + }, + "1685": { + "file_id": 143, + "content": "The code applies scribbles to an image using a mask and generates corresponding labels. It then creates a scribble sample and converts it into a tensor. If is_save_image is True, the scribble label image is saved as a PALETTE image.", + "type": "comment" + }, + "1686": { + "file_id": 143, + "content": " im_.putpalette(_palette)\n ref_img_name = str(start_annotated_frame)\n if not os.path.exists(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im_.save(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn),\n 'inter_' + ref_img_name + '.png'))\n scribble_label = scribble_label\n #######\n if first_scribble:", + "type": "code", + "location": "/applications/Ma-Net/test.py:204-224" + }, + "1687": { + "file_id": 143, + "content": "This code segment saves a scribble image with the palette applied to a specific directory path based on input parameters. It first checks if the necessary directory exists and creates it if it doesn't, then proceeds to save the image within this directory. The 'first_scribble' variable is used in decision making further down the code.", + "type": "comment" + }, + "1688": { + "file_id": 143, + "content": " prev_label = None\n prev_label_storage = paddle.zeros([104, h, w])\n prev_label_storage = prev_label_storage\n else:\n prev_label = prev_label_storage[start_annotated_frame]\n prev_label = prev_label.unsqueeze(0).unsqueeze(0)\n if not first_scribble and paddle.unique(\n scribble_label).shape[0] == 1:\n final_masks = prev_label_storage[:\n seq_imgnum_dict_[sequence]]\n sess.submit_masks(final_masks.numpy())\n else: ###inteaction segmentation head\n print('inteaction segmentation head')\n tmp_dic, local_map_dics = model.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=scribble_label,\n prev_round_label=prev_label,\n global_map_tmp_dic=eval_global_map_tmp_dic,", + "type": "code", + "location": "/applications/Ma-Net/test.py:226-244" + }, + "1689": { + "file_id": 143, + "content": "This code snippet initializes variables and checks for specific conditions in a segmentation model. It handles the previous label and label storage, updates them based on certain conditions, and submits the final masks to the session if necessary. 
The interaction segmentation head is printed as a comment.", + "type": "comment" + }, + "1690": { + "file_id": 143, + "content": " local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n frame_num=[start_annotated_frame],\n first_inter=first_scribble)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n prev_label_storage[start_annotated_frame] = float_(\n pred_label[0])\n if is_save_image: # save image\n pred_label_to_save = pred_label.squeeze(0).numpy()", + "type": "code", + "location": "/applications/Ma-Net/test.py:245-262" + }, + "1691": { + "file_id": 143, + "content": "This code snippet is part of a PaddleVideo application called Ma-Net, which seems to be related to object detection and video analysis. The code creates a dictionary for input data, retrieves the predicted label from a temporal dictionary, applies interpolation to resize the label, selects the maximum value along an axis, appends the mask to a list of masks, stores the previous label at a specific frame, and saves the predicted label as a numpy array if needed.", + "type": "comment" + }, + "1692": { + "file_id": 143, + "content": " im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)\n imgname = str(start_annotated_frame)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im.save(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),", + "type": "code", + "location": "/applications/Ma-Net/test.py:263-279" + }, + "1693": { + "file_id": 143, + "content": "This code snippet saves an interactive video frame as a labeled image. It first converts the predicted label to an array, then converts it into a format suitable for saving as an image with the 'P' mode and using a specific palette. The filename is created based on the current frame number and if the directory doesn't exist, it creates one. Finally, the labeled image is saved in the specified directory.", + "type": "comment" + }, + "1694": { + "file_id": 143, + "content": " 'turn' + str(inter_turn),\n imgname + '.png'))\n #######################################\n if first_scribble:\n scribble_label = rough_ROI(scribble_label)\n ##############################\n ref_prev_label = pred_label.unsqueeze(0)\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = ref_frame_embedding\n #### Propagation ->\n for ii in range(start_annotated_frame + 1, total_frame_num):\n current_embedding = embedding_memory[ii]\n current_embedding = current_embedding.unsqueeze(0)\n prev_label = prev_label\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding,", + "type": "code", + "location": "/applications/Ma-Net/test.py:280-298" + }, + "1695": { + "file_id": 143, + "content": "This code is part of a video object detection algorithm. It's using a pre-trained model to generate segmentation masks for each frame in the video. The 'turn' and 'imgname' are used as file names for saving images. 
It applies initial scribble_label if it's the first frame, then it updates previous label and embedding for propagating prediction to next frames. It uses the model's prop_seghead function to generate predictions for each frame's segmentation mask.", + "type": "comment" + }, + "1696": { + "file_id": 143, + "content": " scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg.KNNS,\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,\n frame_num=[ii],\n dynamic_seghead=model.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(\n pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)", + "type": "code", + "location": "/applications/Ma-Net/test.py:299-318" + }, + "1697": { + "file_id": 143, + "content": "Code snippet is calling a function, passing several arguments such as scribble_label, prev_label, and others. It assigns the returned value to pred_label after applying interpolation on it.", + "type": "comment" + }, + "1698": { + "file_id": 143, + "content": " pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n prev_label_storage[ii] = float_(pred_label[0])\n ####\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert(\n 'P', )\n im.putpalette(_palette)\n imgname = str(ii)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),", + "type": "code", + "location": "/applications/Ma-Net/test.py:320-338" + }, + "1699": { + "file_id": 143, + "content": "Code snippet handles image saving for each prediction label in a loop, storing the prediction label as a numpy array and converting it to an image using Pillow library. 
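A minimal sketch of that saving step (illustrative helper name): the label map becomes a palettised PNG with a zero-padded frame name, and the target directory is created on demand.

import os
from PIL import Image

def save_mask(pred_label, palette, out_dir, frame_idx):
    os.makedirs(out_dir, exist_ok=True)
    im = Image.fromarray(pred_label.astype('uint8')).convert('P')
    im.putpalette(palette)
    im.save(os.path.join(out_dir, str(frame_idx).zfill(5) + '.png'))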
It then sets the palette and saves the image with a sequence number and interaction number, creating directories if they don't exist.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/17.json b/docs/data/17.json new file mode 100644 index 000000000..f49df61af --- /dev/null +++ b/docs/data/17.json @@ -0,0 +1,550 @@ +{ + "1700": { + "file_id": 143, + "content": " 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im.save(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn),\n imgname + '.png'))\n #######################################\n prev_label = ref_prev_label\n prev_embedding = ref_frame_embedding\n #######\n # Propagation <-\n for ii in range(start_annotated_frame):\n current_frame_num = start_annotated_frame - 1 - ii", + "type": "code", + "location": "/applications/Ma-Net/test.py:339-356" + }, + "1701": { + "file_id": 143, + "content": "Code creates folders and saves image, then resets variables for frame propagation.", + "type": "comment" + }, + "1702": { + "file_id": 143, + "content": " current_embedding = embedding_memory[current_frame_num]\n current_embedding = current_embedding.unsqueeze(0)\n prev_label = prev_label\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg.KNNS,\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,", + "type": "code", + "location": "/applications/Ma-Net/test.py:357-374" + }, + "1703": { + "file_id": 143, + "content": "This code section is using PaddlePaddle, a machine learning framework. It seems to be part of an object detection model for video sequences. The function 'model.prop_seghead' is being called with multiple embeddings and labels, and it returns three outputs (tmp_dic, eval_global_map_tmp_dic, local_map_dics) based on the input parameters. Normalization and nearest neighbor distances are used in this process as well. The 'cfg.KNS' likely refers to a pre-defined constant or configuration related to k-nearest neighbors (kNN). Finally, 'n_interaction', 'start_annotated_frame' variables represent interaction numbers and starting frame for annotated frames, respectively.", + "type": "comment" + }, + "1704": { + "file_id": 143, + "content": " frame_num=[current_frame_num],\n dynamic_seghead=model.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(\n pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks_reverse.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n ####\n prev_label_storage[current_frame_num] = float_(\n pred_label[0])\n ###\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(", + "type": "code", + "location": "/applications/Ma-Net/test.py:375-394" + }, + "1705": { + "file_id": 143, + "content": "This code appears to be part of a larger function or script. It seems to involve image processing and potentially object detection or classification. 
The code is looping through frames of an input video, extracting features and predictions from a model, interpolating the predictions for size consistency, then appending the predicted labels (or masks) to a list. It also stores the last prediction for each frame and optionally saves one of those predictions as an image. The code appears to be part of an object detection or classification task where it is updating the output based on new frames and previous frames' outputs.", + "type": "comment" + }, + "1706": { + "file_id": 143, + "content": " pred_label_to_save.astype('uint8')).convert(\n 'P', )\n im.putpalette(_palette)\n imgname = str(current_frame_num)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im.save(\n os.path.join(cfg.RESULT_ROOT, sequence,", + "type": "code", + "location": "/applications/Ma-Net/test.py:395-412" + }, + "1707": { + "file_id": 143, + "content": "This code saves the predicted label as an image in a specific directory structure based on the current frame number, sequence name, and interaction turn. It ensures that the directory for the given combination of parameters exists before saving the image.", + "type": "comment" + }, + "1708": { + "file_id": 143, + "content": " 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn),\n imgname + '.png'))\n pred_masks_reverse.reverse()\n pred_masks_reverse.extend(pred_masks)\n final_masks = paddle.concat(pred_masks_reverse, 0)\n sess.submit_masks(final_masks.numpy())\n if inter_turn == 3 and n_interaction == 8:\n del eval_global_map_tmp_dic\n del local_map_dics\n del embedding_memory\n del prev_label_storage\n t_end = timeit.default_timer()\n print('Total time for single interaction: ' + str(t_end - t_total))\n report = sess.get_report()\n summary = sess.get_global_summary(\n save_file=os.path.join(report_save_dir, 'summary.json'))\n inter_file.close()\ndef rough_ROI(ref_scribble_labels):\n dist = 20\n b, _, h, w = ref_scribble_labels.shape", + "type": "code", + "location": "/applications/Ma-Net/test.py:413-436" + }, + "1709": { + "file_id": 143, + "content": "This code appears to be part of an interactive image classification system. The code is submitting masks for each turn and interacts up to 8 times, storing the results in memory and then clearing them after completion. At the end, it prints the total time taken for a single interaction and gets the report and summary from the session. The rough_ROI function seems to calculate distances based on input labels.", + "type": "comment" + }, + "1710": { + "file_id": 143, + "content": " filter_ = paddle.zeros_like(ref_scribble_labels)\n to_fill = paddle.zeros_like(ref_scribble_labels)\n for i in range(b):\n no_background = (ref_scribble_labels[i] != -1)\n no_background = no_background.squeeze(0)\n no_b = no_background.nonzero()\n (h_min, w_min) = paddle.min(no_b, 0)\n (h_max, w_max) = paddle.max(no_b, 0)\n filter_[i, 0,\n max(h_min - dist, 0):min(h_max + dist, h - 1),\n max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1\n final_scribble_labels = paddle.where(byte_(filter_), ref_scribble_labels,\n to_fill)\n return final_scribble_labels\ndef load_network(net, pretrained_dict):\n model_dict = net.state_dict()\n # 1. 
filter out unnecessary keys\n f_pretrained_dict = {}\n for k, v in pretrained_dict.items():\n if k in model_dict:\n f_pretrained_dict[k] = v\n else:\n print(k)\n print(len(model_dict.keys()), len(pretrained_dict.keys()))\n # 2. overwrite entries in the existing state dict", + "type": "code", + "location": "/applications/Ma-Net/test.py:437-468" + }, + "1711": { + "file_id": 143, + "content": "The code is applying a filter to refine the scribble labels, where it creates a filter based on the position of non-background pixels and then applies it to the original scribble labels. The function load_network filters out unnecessary keys from pretrained_dict and overwrites entries in the state dict of the network.", + "type": "comment" + }, + "1712": { + "file_id": 143, + "content": " model_dict.update(pretrained_dict)\n net.set_state_dict(model_dict)\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0,\n 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43,\n 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75,", + "type": "code", + "location": "/applications/Ma-Net/test.py:469-485" + }, + "1713": { + "file_id": 143, + "content": "This code defines a palette with RGB values for 75 different colors.", + "type": "comment" + }, + "1714": { + "file_id": 143, + "content": " 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81,\n 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94,\n 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120,\n 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145,", + "type": "code", + "location": "/applications/Ma-Net/test.py:486-498" + }, + "1715": { + "file_id": 143, + "content": "This code appears to be a sequence of numbers, which could potentially represent a list or array in the code.", + "type": "comment" + }, + "1716": { + "file_id": 143, + "content": " 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150,\n 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 
157, 157, 158, 158, 158, 159, 159, 159, 160,\n 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185,\n 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210,", + "type": "code", + "location": "/applications/Ma-Net/test.py:499-511" + }, + "1717": { + "file_id": 143, + "content": "This code contains 210 consecutive numbers, possibly representing the iteration or indexing in a loop.", + "type": "comment" + }, + "1718": { + "file_id": 143, + "content": " 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215,\n 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225,\n 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250,\n 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\nif __name__ == '__main__':\n main()", + "type": "code", + "location": "/applications/Ma-Net/test.py:512-525" + }, + "1719": { + "file_id": 143, + "content": "The code consists of a sequence of integers. It's not executable and doesn't have any apparent function or variable assignments. The specific use case or purpose of these numbers is unclear without further context.", + "type": "comment" + }, + "1720": { + "file_id": 144, + "content": "/applications/Ma-Net/train_stage1.py", + "type": "filepath" + }, + "1721": { + "file_id": 144, + "content": "The code trains and applies Ma-Net model for video object detection, preprocesses images, visualizes results, and uses neural network segmentation and classification. 
The \"train_stage1.py\" file in PaddleVideo/applications/Ma-Net project creates a manager object and trains it with numbers as arguments or configuration.", + "type": "summary" + }, + "1722": { + "file_id": 144, + "content": "import cv2\nimport paddle\nimport paddle.nn as nn\nimport os\nimport numpy as np\nfrom paddle.io import DataLoader\nimport paddle.optimizer as optim\nfrom paddle.vision import transforms\nfrom dataloaders.davis_2017_f import DAVIS2017_VOS_Train, DAVIS2017_VOS_Test\nimport dataloaders.custom_transforms_f as tr\nfrom dataloaders.samplers import RandomIdentitySampler\nfrom networks.deeplab import DeepLab\nfrom networks.IntVOS import IntVOS\nfrom networks.loss import Added_BCEWithLogitsLoss, Added_CrossEntropyLoss\nfrom config import cfg\nfrom utils.api import float_, clip_grad_norm_, int_, long_\nfrom utils.meters import AverageMeter\nfrom utils.mask_damaging import damage_masks\nfrom utils.utils import label2colormap\nfrom PIL import Image\nimport scipy.misc as sm\nimport time\n# import logging\npaddle.disable_static()\npaddle.device.set_device('gpu:0')\nclass Manager(object):\n def __init__(self,\n use_gpu=True,\n time_budget=None,\n save_result_dir=cfg.SAVE_RESULT_DIR,\n pretrained=True,", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:1-34" + }, + "1723": { + "file_id": 144, + "content": "The code imports necessary libraries, defines classes for data loaders and networks, sets up device and environment configurations, and initializes the Manager class with optional parameters.", + "type": "comment" + }, + "1724": { + "file_id": 144, + "content": " interactive_test=False,\n freeze_bn=False):\n self.save_res_dir = save_result_dir\n self.time_budget = time_budget\n self.feature_extracter = DeepLab(backbone='resnet', freeze_bn=freeze_bn)\n if pretrained:\n pretrained_dict = paddle.load(cfg.PRETRAINED_MODEL)\n # pretrained_dict = np.load(cfg.PRETRAINED_MODEL, allow_pickle=True).item()\n pretrained_dict = pretrained_dict['state_dict']\n self.load_network(self.feature_extracter, pretrained_dict)\n print('load pretrained model successfully.')\n self.model = IntVOS(cfg, self.feature_extracter)\n self.use_gpu = use_gpu\n if use_gpu:\n self.model = self.model\n def train(self,\n damage_initial_previous_frame_mask=True,\n lossfunc='cross_entropy',\n model_resume=False):\n ###################\n self.model.train()\n running_loss = AverageMeter()\n running_time = AverageMeter()", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:35-59" + }, + "1725": { + "file_id": 144, + "content": "The code initializes a model for stage 1 training in Ma-Net application. It loads pretrained model if specified and sets the model to train mode. 
The `train` method starts the actual training by setting the model to train mode, initializing loss meters, and looping over batches to compute loss and time metrics.", + "type": "comment" + }, + "1726": { + "file_id": 144, + "content": " param_list = [{\n 'params': self.model.feature_extracter.parameters()\n }, {\n 'params': self.model.semantic_embedding.parameters()\n }, {\n 'params': self.model.dynamic_seghead.parameters()\n }]\n ########\n clip = paddle.nn.ClipGradByGlobalNorm(\n clip_norm=cfg.TRAIN_CLIP_GRAD_NORM)\n # clip = None\n optimizer = optim.Momentum(parameters=param_list,\n learning_rate=cfg.TRAIN_LR,\n momentum=cfg.TRAIN_MOMENTUM,\n weight_decay=cfg.TRAIN_WEIGHT_DECAY,\n use_nesterov=True,\n grad_clip=clip)\n self.param_list = param_list\n ###################\n composed_transforms = transforms.Compose([\n tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP),\n tr.RandomScale(),\n tr.RandomCrop((cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP), 5),", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:61-87" + }, + "1727": { + "file_id": 144, + "content": "This code defines a training stage for the Ma-Net model. It initializes parameters for three parts of the model: feature extractor, semantic embedding, and dynamic segment head. An optimizer using momentum is set up with specified learning rate, momentum, weight decay, and gradient clipping. A series of data transformations are applied to input images.", + "type": "comment" + }, + "1728": { + "file_id": 144, + "content": " tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()\n ])\n print('dataset processing...')\n train_dataset = DAVIS2017_VOS_Train(root=cfg.DATA_ROOT,\n transform=composed_transforms)\n trainloader = DataLoader(\n train_dataset,\n collate_fn=None,\n batch_size=cfg.TRAIN_BATCH_SIZE,\n shuffle=True,\n num_workers=8,\n )\n print('dataset processing finished.')\n if lossfunc == 'bce':\n criterion = Added_BCEWithLogitsLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n elif lossfunc == 'cross_entropy':\n criterion = Added_CrossEntropyLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n else:\n print(\n 'unsupported loss funciton. Please choose from [cross_entropy,bce]'\n )\n max_itr = cfg.TRAIN_TOTAL_STEPS", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:88-114" + }, + "1729": { + "file_id": 144, + "content": "The code initializes the training dataset and creates a data loader for it. It also defines the loss function based on user input, either binary cross-entropy or cross-entropy, and sets the maximum number of iterations to run. The dataset processing includes resizing and converting images to tensors using specified transforms.", + "type": "comment" + }, + "1730": { + "file_id": 144, + "content": " step = 0\n if model_resume:\n saved_model_ = os.path.join(self.save_res_dir, cfg.TRAIN_RESUME_DIR)\n saved_model_ = paddle.load(saved_model_)\n self.model = self.load_network(self.model, saved_model_)\n step = int(cfg.RESUME_DIR.split('.')[0].split('_')[-1])\n print('resume from step {}'.format(step))\n while step < cfg.TRAIN_TOTAL_STEPS:\n if step > 100001:\n break\n t1 = time.time()\n if step > 0:\n running_time.update(time.time() - t1)\n print(\n f'{time.asctime()}: new epoch starts. 
last epoch time: {running_time.avg:.3f} s.',\n )\n for ii, sample in enumerate(trainloader):\n now_lr = self._adjust_lr(optimizer, step, max_itr)\n if step >= max_itr:\n step += 1\n break\n ref_imgs = sample['ref_img'] # batch_size * 3 * h * w\n img1s = sample['img1']", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:116-144" + }, + "1731": { + "file_id": 144, + "content": "This code initializes the model, loads previously saved data if specified and resumes training from a certain step. It keeps track of training time per epoch and adjusts learning rate as needed. The loop iterates through the dataset, performing operations on each sample until maximum number of iterations is reached.", + "type": "comment" + }, + "1732": { + "file_id": 144, + "content": " img2s = sample['img2']\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n label1s = sample['label1']\n label2s = sample['label2']\n seq_names = sample['meta']['seq_name']\n obj_nums = sample['meta']['obj_num']\n bs, _, h, w = img2s.shape\n inputs = paddle.concat((ref_imgs, img1s, img2s), 0)\n if damage_initial_previous_frame_mask:\n try:\n label1s = damage_masks(label1s)\n except:\n label1s = label1s\n print('damage_error')\n ##########\n if self.use_gpu:\n inputs = inputs\n ref_scribble_labels = ref_scribble_labels\n label1s = label1s\n label2s = label2s\n ##########\n tmp_dic = self.model(inputs,\n ref_scribble_labels,", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:145-172" + }, + "1733": { + "file_id": 144, + "content": "This code segment prepares the input data for a model by concatenating images and assigning labels. It also handles any potential errors with damage masks and adjusts the label1s accordingly. The GPU usage is conditionally set based on whether `self.use_gpu` is true or false.", + "type": "comment" + }, + "1734": { + "file_id": 144, + "content": " label1s,\n use_local_map=True,\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS)\n label_and_obj_dic = {}\n label_dic = {}\n for i, seq_ in enumerate(seq_names):\n label_and_obj_dic[seq_] = (label2s[i], obj_nums[i])\n for seq_ in tmp_dic.keys():\n tmp_pred_logits = tmp_dic[seq_]\n tmp_pred_logits = nn.functional.interpolate(\n tmp_pred_logits,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n tmp_dic[seq_] = tmp_pred_logits\n label_tmp, obj_num = label_and_obj_dic[seq_]\n obj_ids = np.arange(1, obj_num + 1)\n obj_ids = paddle.to_tensor(obj_ids)", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:173-194" + }, + "1735": { + "file_id": 144, + "content": "This code initializes label and object dictionaries using the provided label1s. It then iterates over sequence names, creating key-value pairs for the label_and_obj_dic dictionary. For each sequence, it interpolates the temporary prediction logits with bilinear mode, aligning corners. 
Lastly, it retrieves the label and object number from the label_and_obj_dic and generates a tensor of object ids.", + "type": "comment" + }, + "1736": { + "file_id": 144, + "content": " obj_ids = int_(obj_ids)\n if lossfunc == 'bce':\n label_tmp = label_tmp.transpose([1, 2, 0])\n label = (float_(label_tmp) == float_(obj_ids))\n label = label.unsqueeze(-1).transpose([3, 2, 0, 1])\n label_dic[seq_] = float_(label)\n elif lossfunc == 'cross_entropy':\n label_dic[seq_] = long_(label_tmp)\n loss = criterion(tmp_dic, label_dic, step)\n loss = loss / bs\n optimizer.clear_grad()\n loss.backward()\n optimizer.step()\n running_loss.update(loss.item(), bs)\n ##############Visulization during training\n if step % 50 == 0:\n print(time.asctime(), end='\\t')\n log = 'step:{},now_lr:{} ,loss:{:.4f}({:.4f})'.format(\n step, now_lr, running_loss.val, running_loss.avg)\n print(log)", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:195-217" + }, + "1737": { + "file_id": 144, + "content": "This code snippet is training a video object detection model in stages. It applies binary cross-entropy or cross-entropy loss depending on the specified loss function, updates the model's parameters using an optimizer, and tracks the average loss during training. The progress is logged every 50 steps along with the current learning rate.", + "type": "comment" + }, + "1738": { + "file_id": 144, + "content": " # logging.info(log)\n show_ref_img = ref_imgs.numpy()[0]\n show_img1 = img1s.numpy()[0]\n show_img2 = img2s.numpy()[0]\n mean = np.array([[[0.485]], [[0.456]], [[0.406]]])\n sigma = np.array([[[0.229]], [[0.224]], [[0.225]]])\n show_ref_img = show_ref_img * sigma + mean\n show_img1 = show_img1 * sigma + mean\n show_img2 = show_img2 * sigma + mean\n show_gt = label2s[0]\n show_gt = show_gt.squeeze(0).numpy()\n show_gtf = label2colormap(show_gt).transpose((2, 0, 1))\n show_preds = tmp_dic[seq_names[0]]\n show_preds = nn.functional.interpolate(show_preds,\n size=(h, w),\n mode='bilinear',\n align_corners=True)", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:218-240" + }, + "1739": { + "file_id": 144, + "content": "This code extracts images, applies normalization, and visualizes the reference image, two input images, and a ground truth label. The images are normalized by subtracting mean values and dividing by standard deviation. The ground truth label is converted to a color map for visualization. 
The predicted output is interpolated to match the size of the other images before visualizing them.", + "type": "comment" + }, + "1740": { + "file_id": 144, + "content": " show_preds = show_preds.squeeze(0)\n if lossfunc == 'bce':\n show_preds = (paddle.nn.functional.sigmoid(show_preds) >\n 0.5)\n show_preds_s = paddle.zeros((h, w))\n for i in range(show_preds.size(0)):\n show_preds_s[show_preds[i]] = i + 1\n elif lossfunc == 'cross_entropy':\n show_preds_s = paddle.argmax(show_preds, axis=0)\n show_preds_s = show_preds_s.numpy()\n show_preds_sf = label2colormap(show_preds_s).transpose(\n (2, 0, 1))\n pix_acc = np.sum(show_preds_s == show_gt) / (h * w)\n ###########TODO\n if step % 20000 == 0 and step != 0:\n self.save_network(self.model, step)\n step += 1\n def test_VOS(self, use_gpu=True):\n seqs = []\n with open(\n os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017',", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:241-266" + }, + "1741": { + "file_id": 144, + "content": "Applies sigmoid function to binary cross-entropy predictions, converts them to segmentation masks using argmax for cross entropy, calculates pixel accuracy, and saves network at specified intervals.", + "type": "comment" + }, + "1742": { + "file_id": 144, + "content": " 'val' + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n seqs.extend(seqs_tmp)\n print('model loading...')\n saved_model_dict = os.path.join(self.save_res_dir, cfg.TEST_CHECKPOINT)\n pretrained_dict = paddle.load(saved_model_dict)\n self.model = self.load_network(self.model, pretrained_dict)\n print('model load finished')\n self.model.eval()\n with paddle.no_grad():\n for seq_name in seqs:\n print('prcessing seq:{}'.format(seq_name))\n test_dataset = DAVIS2017_VOS_Test(root=cfg.DATA_ROOT,\n transform=tr.ToTensor(),\n result_root=cfg.RESULT_ROOT,\n seq_name=seq_name)\n test_dataloader = DataLoader(test_dataset,\n batch_size=1,", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:267-286" + }, + "1743": { + "file_id": 144, + "content": "Loading and preparing the test datasets for sequence processing.", + "type": "comment" + }, + "1744": { + "file_id": 144, + "content": " shuffle=False,\n num_workers=0)\n if not os.path.exists(os.path.join(cfg.RESULT_ROOT, seq_name)):\n os.makedirs(os.path.join(cfg.RESULT_ROOT, seq_name))\n time_start = time.time()\n for ii, sample in enumerate(test_dataloader):\n ref_img = sample['ref_img']\n prev_img = sample['prev_img']\n current_img = sample['current_img']\n ref_label = sample['ref_label']\n prev_label = sample['prev_label']\n obj_num = sample['meta']['obj_num']\n seqnames = sample['meta']['seq_name']\n imgname = sample['meta']['current_name']\n bs, _, h, w = current_img.shape\n inputs = paddle.concat((ref_img, prev_img, current_img), 0)\n if use_gpu:\n inputs = inputs\n ref_label = ref_label", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:287-306" + }, + "1745": { + "file_id": 144, + "content": "This code creates a Paddle data loader for the test dataset, ensuring it doesn't overwrite existing results. It then iterates through each sample in the test loader, extracting necessary images and labels, and concatenating them into an input array. 
If GPU is used, the inputs and labels are transferred to GPU memory.", + "type": "comment" + }, + "1746": { + "file_id": 144, + "content": " prev_label = prev_label\n ################\n t1 = time.time()\n tmp = self.model.extract_feature(inputs)\n ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split(\n tmp, num_or_sections=3, axis=0)\n t2 = time.time()\n print('feature_extracter time:{}'.format(t2 - t1))\n tmp_dic = self.model.prop_seghead(\n ref_frame_embedding, previous_frame_embedding,\n current_frame_embedding, ref_label, prev_label, True,\n seqnames, obj_num, cfg.KNNS, self.model.dynamic_seghead)\n t3 = time.time()\n print('after time:{}'.format(t3 - t2))\n #######################\n pred_label = tmp_dic[seq_name]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w),", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:307-326" + }, + "1747": { + "file_id": 144, + "content": "Feature extraction and model prediction for a video frame. Time measurement for feature extractor and model execution.", + "type": "comment" + }, + "1748": { + "file_id": 144, + "content": " mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_label = pred_label.squeeze(0)\n pred_label = pred_label.numpy()\n im = Image.fromarray(pred_label.astype('uint8')).convert(\n 'P', )\n im.putpalette(_palette)\n im.save(\n os.path.join(cfg.RESULT_ROOT, seq_name,\n imgname[0].split('.')[0] + '.png'))\n one_frametime = time.time()\n print('seq name:{} frame:{} time:{}'.format(\n seq_name, imgname[0], one_frametime - time_start))\n time_start = time.time()\n def load_network(self, net, pretrained_dict):\n # pretrained_dict = pretrained_dict\n model_dict = net.state_dict()\n # 1. filter out unnecessary keys", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:327-348" + }, + "1749": { + "file_id": 144, + "content": "This code segment is part of a function that saves the predicted labels for each frame as an image with a palette. It extracts the predictions from a pre-trained network, converts them to an image, and saves it in a specified directory. The function also prints the time taken for processing each frame.", + "type": "comment" + }, + "1750": { + "file_id": 144, + "content": " pretrained_dict = {\n k: v\n for k, v in pretrained_dict.items() if k in model_dict\n }\n # 2. overwrite entries in the existing state dict\n # for k in model_dict:\n # if k not in pretrained_dict:\n # print(k, 'not in loaded weights.')\n model_dict.update(pretrained_dict)\n net.set_state_dict(model_dict)\n return net\n def save_network(self, net, step):\n save_path = self.save_res_dir\n if not os.path.exists(save_path):\n os.makedirs(save_path)\n save_file = 'save_step_%s.pth' % (step)\n paddle.save(net.state_dict(), os.path.join(save_path, save_file))\n def _adjust_lr(self, optimizer, itr, max_itr):\n now_lr = cfg.TRAIN_LR * (1 - itr / (max_itr + 1))**cfg.TRAIN_POWER\n optimizer._param_groups[0]['lr'] = now_lr\n return now_lr\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0,", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:349-378" + }, + "1751": { + "file_id": 144, + "content": "This code snippet is part of a training process for a neural network called Ma-Net. It loads pretrained weights into the model and then saves the network at different steps. 
The learning rate is adjusted during the training process to improve performance. The _palette variable appears unrelated, as it stores RGB values for colors.", + "type": "comment" + }, + "1752": { + "file_id": 144, + "content": " 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43,\n 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75,\n 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81,\n 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94,", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:379-391" + }, + "1753": { + "file_id": 144, + "content": "This code is a list of RGB values for different objects in an image, possibly for object detection or classification.", + "type": "comment" + }, + "1754": { + "file_id": 144, + "content": " 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120,\n 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145,\n 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150,\n 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160,", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:392-404" + }, + "1755": { + "file_id": 144, + "content": "This code appears to contain a sequence of numbers, possibly representing some kind of iteration or indexing in the larger context of the script. 
The exact meaning would depend on the specifics of how and where this code is used within the \"train_stage1.py\" file of the PaddleVideo/applications/Ma-Net project.", + "type": "comment" + }, + "1756": { + "file_id": 144, + "content": " 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185,\n 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210,\n 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215,\n 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225,", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:405-417" + }, + "1757": { + "file_id": 144, + "content": "The code consists of a sequence of numbers, possibly representing frame indices or image IDs.", + "type": "comment" + }, + "1758": { + "file_id": 144, + "content": " 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250,\n 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\nmanager = Manager()\nmanager.train()", + "type": "code", + "location": "/applications/Ma-Net/train_stage1.py:418-429" + }, + "1759": { + "file_id": 144, + "content": "The code creates a manager object and calls its train function. The list of numbers appears to be arguments or configuration for the training process, but without context it's difficult to determine their exact purpose.", + "type": "comment" + }, + "1760": { + "file_id": 145, + "content": "/applications/Ma-Net/train_stage2.py", + "type": "filepath" + }, + "1761": { + "file_id": 145, + "content": "The code trains Ma-Net stage 2 models with adjustable learning rates, applies binary cross-entropy loss, and evaluates performance. 
It also performs image processing, ROI operations, and video analysis using pretrained network weights.", + "type": "summary" + }, + "1762": { + "file_id": 145, + "content": "import cv2\nimport paddle\nimport paddle.nn as nn\nimport os\nimport numpy as np\n# from paddle.io import DataLoader\nimport paddle.optimizer as optim\nfrom paddle.vision import transforms\nfrom dataloaders.davis_2017_f import DAVIS2017_Train\nimport dataloaders.custom_transforms_f as tr\nfrom dataloaders.samplers import RandomIdentitySampler\nfrom networks.deeplab import DeepLab\nfrom networks.IntVOS import IntVOS\nfrom networks.loss import Added_BCEWithLogitsLoss, Added_CrossEntropyLoss\nfrom config import cfg\nfrom utils.api import float_, long_, byte_\nfrom utils.meters import AverageMeter\nfrom utils.mask_damaging import damage_masks, mask_damager\nfrom utils.utils import label2colormap\nfrom PIL import Image\nimport random\nimport scipy.misc as sm\nimport time\nimport davisinteractive.robot.interactive_robot as interactive_robot\npaddle.disable_static()\npaddle.device.set_device(\"gpu:0\")\nclass DataLoader(paddle.io.DataLoader):\n def __init__(self,\n dataset,\n batch_size=1,\n shuffle=False,", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:1-34" + }, + "1763": { + "file_id": 145, + "content": "Import necessary libraries and modules, define custom data loader class, set device to GPU0, and disable static mode.", + "type": "comment" + }, + "1764": { + "file_id": 145, + "content": " sampler=None,\n batch_sampler=None,\n num_workers=0,\n collate_fn=None,\n pin_memory=False,\n drop_last=False,\n timeout=0,\n worker_init_fn=None,\n multiprocessing_context=None,\n generator=None):\n if isinstance(dataset[0], (tuple, list)):\n return_list = True\n else:\n return_list = False\n super().__init__(dataset,\n feed_list=None,\n places=None,\n return_list=return_list,\n batch_sampler=batch_sampler,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last,\n collate_fn=collate_fn,\n num_workers=num_workers,\n use_buffer_reader=True,\n use_shared_memory=False,", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:35-61" + }, + "1765": { + "file_id": 145, + "content": "The code initializes a DataLoader with parameters. It checks if the dataset contains tuples or lists, then sets return_list accordingly. 
It initializes the DataLoader using the dataset, batch size, shuffle, etc., and returns a DataLoader object for loading data efficiently.", + "type": "comment" + }, + "1766": { + "file_id": 145, + "content": " timeout=timeout,\n worker_init_fn=worker_init_fn)\n if sampler is not None:\n self.batch_sampler.sampler = sampler\nclass Manager(object):\n def __init__(self,\n use_gpu=True,\n time_budget=None,\n save_result_dir=cfg.SAVE_RESULT_DIR,\n pretrained=True,\n interactive_test=False):\n self.save_res_dir = save_result_dir\n self.time_budget = time_budget\n self.feature_extracter = DeepLab(backbone='resnet')\n if pretrained:\n pretrained_dict = paddle.load(cfg.PRETRAINED_MODEL)\n pretrained_dict = pretrained_dict['state_dict']\n self.load_network(self.feature_extracter, pretrained_dict)\n print('load pretrained model successfully.')\n self.model = IntVOS(cfg, self.feature_extracter)\n model_filename = cfg.SAVE_VOS_RESULT_DIR\n pd = paddle.load(model_filename)\n self.load_network(self.model, pd)", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:62-89" + }, + "1767": { + "file_id": 145, + "content": "This code initializes a Manager object with options for GPU usage, time budget, result directory, pretrained model, and interactive testing. It loads the feature extractor, DeepLab, and the VOS model using provided parameters.", + "type": "comment" + }, + "1768": { + "file_id": 145, + "content": " print('load stage 1 model from', model_filename)\n self.use_gpu = use_gpu\n if use_gpu:\n self.model = self.model\n ##################################\n def train(self,\n damage_initial_previous_frame_mask=True,\n lossfunc='cross_entropy',\n model_resume=False,\n eval_total=False,\n init_prev=False):\n ###################\n interactor = interactive_robot.InteractiveScribblesRobot()\n self.model.train()\n running_loss = AverageMeter()\n optimizer = optim.Momentum(parameters=[{\n 'params':\n self.model.inter_seghead.parameters()\n }],\n learning_rate=cfg.TRAIN_LR,\n momentum=cfg.TRAIN_MOMENTUM,\n weight_decay=cfg.TRAIN_WEIGHT_DECAY)\n ###################\n composed_transforms = transforms.Compose([\n tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP),\n tr.RandomScale(),", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:91-119" + }, + "1769": { + "file_id": 145, + "content": "This code initializes a model and optimizer for training stage 2 of the Ma-Net application. It uses a GPU if specified, sets up training parameters, and initializes transforms to apply data augmentation during training. The model's segment head is trained using Momentum optimization with specified learning rate, momentum, and weight decay values.", + "type": "comment" + }, + "1770": { + "file_id": 145, + "content": " tr.RandomCrop((cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP), 10),\n tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()\n ])\n print('dataset processing...')\n train_dataset = DAVIS2017_Train(root=cfg.DATA_ROOT,\n transform=composed_transforms)\n train_list = train_dataset.seqs\n print('dataset processing finished.')\n if lossfunc == 'bce':\n criterion = Added_BCEWithLogitsLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n elif lossfunc == 'cross_entropy':\n criterion = Added_CrossEntropyLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n else:\n print(\n 'unsupported loss funciton. 
Please choose from [cross_entropy,bce]'\n )\n max_itr = cfg.TRAIN_TOTAL_STEPS\n step = 0\n round_ = 3\n epoch_per_round = 30", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:120-145" + }, + "1771": { + "file_id": 145, + "content": "The code initializes a dataset, applies transformations to the images, and selects the loss function. It then sets up the maximum number of iterations, and keeps track of current iteration and round numbers.", + "type": "comment" + }, + "1772": { + "file_id": 145, + "content": " if model_resume:\n saved_model_ = os.path.join(self.save_res_dir,\n 'save_step_75000.pth')\n saved_model_ = paddle.load(saved_model_)\n self.model = self.load_network(self.model, saved_model_)\n step = 75000\n print('resume from step {}'.format(step))\n while step < cfg.TRAIN_TOTAL_STEPS:\n if step > 80001:\n break\n for r in range(round_):\n if r == 0: #### r==0: Train the interaction branch in the first round\n print('start new')\n global_map_tmp_dic = {}\n train_dataset.transform = transforms.Compose([\n tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP),\n tr.RandomScale(),\n tr.RandomCrop(\n (cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP)),\n tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:146-170" + }, + "1773": { + "file_id": 145, + "content": "The code checks if model resuming is enabled, and if so, loads a saved model from a specific step and updates the current model. It then enters a loop where it trains the interaction branch for each round, performing various data transformations like random flipping, scaling, cropping, resizing, and converting to tensor. The training stops after 80,001 steps or if r is not equal to 0 (first round).", + "type": "comment" + }, + "1774": { + "file_id": 145, + "content": " ])\n train_dataset.init_ref_frame_dic()\n trainloader = DataLoader(train_dataset,\n sampler=RandomIdentitySampler(\n train_dataset.sample_list),\n shuffle=False,\n batch_size=cfg.TRAIN_BATCH_SIZE,\n num_workers=0)\n print('round:{} start'.format(r))\n print(len(train_dataset))\n print(len(trainloader))\n for epoch in range(epoch_per_round):\n for ii, sample in enumerate(trainloader):\n now_lr = self._adjust_lr(optimizer, step, max_itr)\n ref_imgs = sample['ref_img'] # batch_size * 3 * h * w\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n seq_names = sample['meta']['seq_name']", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:171-191" + }, + "1775": { + "file_id": 145, + "content": "The code initializes a dataset, creates a data loader with a random sampler, and adjusts the learning rate. It then loops through the dataset for a specified number of epochs, accessing relevant sample features and labels. 
The length of the dataset and data loader are printed before training begins.", + "type": "comment" + }, + "1776": { + "file_id": 145, + "content": " obj_nums = sample['meta']['obj_num']\n ref_frame_nums = sample['meta']['ref_frame_num']\n ref_frame_gts = sample['ref_frame_gt']\n bs, _, h, w = ref_imgs.shape\n ##########\n if self.use_gpu:\n inputs = ref_imgs\n ref_scribble_labels = ref_scribble_labels\n ref_frame_gts = ref_frame_gts\n ##########\n with paddle.no_grad():\n self.model.feature_extracter.eval()\n self.model.semantic_embedding.eval()\n ref_frame_embedding = self.model.extract_feature(\n inputs)\n if r == 0:\n first_inter = True\n tmp_dic = self.model.int_seghead(\n ref_frame_embedding=ref_frame_embedding,", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:192-212" + }, + "1777": { + "file_id": 145, + "content": "The code initializes variables and sets up the model for training stage 2. It handles GPU usage, evaluates the model's feature extractor and semantic embedding, and extracts feature embeddings from reference frame images. It then checks if it's processing the first inter-frame instance and calls int_seghead function with reference frame embeddings as input.", + "type": "comment" + }, + "1778": { + "file_id": 145, + "content": " ref_scribble_label=ref_scribble_labels,\n prev_round_label=None,\n normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic={},\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS,\n frame_num=ref_frame_nums,\n first_inter=first_inter)\n else:\n first_inter = False\n prev_round_label = sample['prev_round_label']\n prev_round_label = prev_round_label\n tmp_dic = self.model.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=ref_scribble_labels,\n prev_round_label=prev_round_label,", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:213-229" + }, + "1779": { + "file_id": 145, + "content": "This code snippet seems to be a part of a larger function and appears to involve image classification tasks. The code initializes variables such as `ref_scribble_labels`, `prev_round_label`, `normalize_nearest_neighbor_distances`, `global_map_tmp_dic`, `seq_names`, `gt_ids`, `k_nearest_neighbors`, and `frame_num`. The code also checks if a variable named `first_inter` exists, and if not, initializes it as `False` along with `prev_round_label` and performs some operations using the `model.int_seghead()` method.", + "type": "comment" + }, + "1780": { + "file_id": 145, + "content": " normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic={},\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS,\n frame_num=ref_frame_nums,\n first_inter=first_inter)\n label_and_obj_dic = {}\n label_dic = {}\n for i, seq_ in enumerate(seq_names):\n label_and_obj_dic[seq_] = (ref_frame_gts[i],\n obj_nums[i])\n for seq_ in tmp_dic.keys():\n tmp_pred_logits = tmp_dic[seq_]\n tmp_pred_logits = nn.functional.interpolate(\n tmp_pred_logits,\n size=(h, w),\n mode='bilinear',", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:230-247" + }, + "1781": { + "file_id": 145, + "content": "The code initializes an empty dictionary for label and object dictionaries. It then iterates through the sequence names, assigning the corresponding ground truth frame and object number to each sequence in the label_and_obj_dic dictionary. 
Next, it iterates through the temporary dictionary keys, interpolating the predicted logits of each sequence to a fixed size (h, w) using bilinear interpolation.", + "type": "comment" + }, + "1782": { + "file_id": 145, + "content": " align_corners=True)\n tmp_dic[seq_] = tmp_pred_logits\n label_tmp, obj_num = label_and_obj_dic[seq_]\n obj_ids = np.arange(0, obj_num + 1)\n obj_ids = paddle.to_tensor(obj_ids)\n obj_ids = paddle.to_tensor(obj_ids, dtype='int64')\n if lossfunc == 'bce':\n label_tmp = label_tmp.permute(1, 2, 0)\n label = (float_(label_tmp) == float_(obj_ids))\n label = label.unsqueeze(-1).permute(3, 2, 0, 1)\n label_dic[seq_] = float_(label)\n elif lossfunc == 'cross_entropy':\n label_dic[seq_] = long_(label_tmp)\n loss = criterion(tmp_dic, label_dic, step)\n loss = loss / bs\n optimizer.clear_grad()", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:248-265" + }, + "1783": { + "file_id": 145, + "content": "This code section is responsible for handling label and object dictionaries, preparing the data for different loss functions, and calculating the loss based on the provided data. It also performs necessary tensor conversions and optimizer operations.", + "type": "comment" + }, + "1784": { + "file_id": 145, + "content": " loss.backward()\n optimizer.step()\n running_loss.update(loss.item(), bs)\n if step % 50 == 0:\n print(\n 'step:{},now_lr:{} ,loss:{:.4f}({:.4f})'.format(\n step, now_lr, running_loss.val,\n running_loss.avg))\n show_ref_img = ref_imgs.numpy()[0]\n mean = np.array([[[0.485]], [[0.456]], [[0.406]]])\n sigma = np.array([[[0.229]], [[0.224]], [[0.225]]])\n show_ref_img = show_ref_img * sigma + mean\n show_gt = ref_frame_gts[0].squeeze(0).numpy()\n show_gtf = label2colormap(show_gt).transpose(\n (2, 0, 1))\n show_scrbble = ref_scribble_labels[0].squeeze(\n 0).numpy()", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:266-287" + }, + "1785": { + "file_id": 145, + "content": "Updating the running loss and printing details, including step, current learning rate, and loss values. Visualizing reference image and ground truth frame, and converting scribble labels to color maps.", + "type": "comment" + }, + "1786": { + "file_id": 145, + "content": " show_scrbble = label2colormap(\n show_scrbble).transpose((2, 0, 1))\n if r != 0:\n show_prev_round_label = prev_round_label[\n 0].squeeze(0).numpy()\n show_prev_round_label = label2colormap(\n show_prev_round_label).transpose((2, 0, 1))\n else:\n show_prev_round_label = np.zeros_like(show_gt)\n show_prev_round_label = label2colormap(\n show_prev_round_label).transpose((2, 0, 1))\n ##########\n show_preds = tmp_dic[seq_names[0]]\n show_preds = nn.functional.interpolate(\n show_preds,\n size=(h, w),\n mode='bilinear',", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:288-306" + }, + "1787": { + "file_id": 145, + "content": "This code is handling the visualization of labels and predictions. If r is not zero, it retrieves the previous round label, maps it to a color map, and transposes it for visualization. If r is zero, it creates a zero-filled array for the previous round label. 
The final step is getting the predictions for the first sequence name, interpolating them to fit the image size, and preparing them for visualization.", + "type": "comment" + }, + "1788": { + "file_id": 145, + "content": " align_corners=True)\n show_preds = show_preds.squeeze(0)\n if lossfunc == 'bce':\n show_preds = show_preds[1:]\n show_preds = (\n paddle.nn.functional.sigmoid(show_preds) >\n 0.5)\n marker = paddle.argmax(show_preds, axis=0)\n show_preds_s = paddle.zeros((h, w))\n for i in range(show_preds.size(0)):\n tmp_mask = (marker\n == i) & (show_preds[i] > 0.5)\n show_preds_s[tmp_mask] = i + 1\n elif lossfunc == 'cross_entropy':\n show_preds_s = paddle.argmax(show_preds, axis=0)\n show_preds_s = show_preds_s.numpy()\n show_preds_sf = label2colormap(", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:307-324" + }, + "1789": { + "file_id": 145, + "content": "This code is segmenting an image by applying a binary cross-entropy or cross-entropy loss function to the output of a PaddlePaddle neural network. The resulting segmentation map is stored in 'show_preds_s' after being converted to a numpy array and then transformed using the 'label2colormap' function.", + "type": "comment" + }, + "1790": { + "file_id": 145, + "content": " show_preds_s).transpose((2, 0, 1))\n pix_acc = np.sum(show_preds_s == show_gt) / (h * w)\n ###########TODO\n if step % 20000 == 0 and step != 0:\n self.save_network(self.model, step)\n step += 1\n print('trainset evaluating...')\n print('*' * 100)\n if cfg.TRAIN_INTER_USE_TRUE_RESULT:\n if r != round_ - 1:\n if r == 0:\n prev_round_label_dic = {}\n self.model.eval()\n with paddle.no_grad():\n round_scribble = {}\n frame_num_dic = {}\n train_dataset.transform = transforms.Compose(\n [tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()])\n trainloader = DataLoader(", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:325-350" + }, + "1791": { + "file_id": 145, + "content": "This code block is responsible for saving the network at certain intervals during training. It checks if the current step is a multiple of 20,000 and not the first step, then calls save_network function to store the model's parameters. The model is also evaluated on the trainset and its performance might be influenced by the cfg.TRAIN_INTER_USE_TRUE_RESULT flag which determines whether to use the true result for evaluation. This block also resets transforms of the traindataset at specific rounds (r != round_-1).", + "type": "comment" + }, + "1792": { + "file_id": 145, + "content": " train_dataset,\n sampler=RandomIdentitySampler(\n train_dataset.sample_list),\n shuffle=False,\n batch_size=1,\n num_workers=0)\n for ii, sample in enumerate(trainloader):\n ref_imgs = sample[\n 'ref_img'] # batch_size * 3 * h * w\n img1s = sample['img1']\n img2s = sample['img2']\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n label1s = sample['label1']\n label2s = sample['label2']\n seq_names = sample['meta']['seq_name']\n obj_nums = sample['meta']['obj_num']", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:351-367" + }, + "1793": { + "file_id": 145, + "content": "The code is initializing a data loader for training stage 2. It uses the RandomIdentitySampler with the train_dataset, sets shuffle to False, batch size to 1, and num_workers to 0. 
Then it iterates through the trainloader, extracting ref_imgs, img1s, img2s, ref_scribble_labels, label1s, label2s, and seq_names from each sample.", + "type": "comment" + }, + "1794": { + "file_id": 145, + "content": " frame_nums = sample['meta']['frame_num']\n bs, _, h, w = img2s.shape\n inputs = paddle.concat((ref_imgs, img1s, img2s),\n 0)\n if r == 0:\n ref_scribble_labels = self.rough_ROI(\n ref_scribble_labels)\n print(seq_names[0])\n label1s_tocat = None\n for i in range(bs):\n l = label1s[i]\n l = l.unsqueeze(0)\n l = mask_damager(l, 0.0)\n l = paddle.to_tensor(l)\n l = l.unsqueeze(0).unsqueeze(0)\n if label1s_tocat is None:\n label1s_tocat = float_(l)", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:368-386" + }, + "1795": { + "file_id": 145, + "content": "This code segment is a part of an image processing and scribble labeling task. It extracts the frame numbers from sample metadata, concatenates reference images, img1, and img2. It applies rough ROI (region of interest) operation on ref_scribble_labels if r equals 0. Then, it processes the label1s, creating a tensor for scribble labels to be used in the model's input.", + "type": "comment" + }, + "1796": { + "file_id": 145, + "content": " else:\n label1s_tocat = paddle.concat(\n (label1s_tocat, float_(l)), 0)\n label1s = label1s_tocat\n if self.use_gpu:\n inputs = inputs\n ref_scribble_labels = ref_scribble_labels\n label1s = label1s\n tmp_dic, global_map_tmp_dic = self.model(\n inputs,\n ref_scribble_labels,\n label1s,\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS,\n global_map_tmp_dic=global_map_tmp_dic,\n frame_num=frame_nums)\n pred_label = tmp_dic[", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:387-406" + }, + "1797": { + "file_id": 145, + "content": "This code is part of a machine learning model training process. It appears to be concatenating label data (label1s_tocat) and checking if GPU usage is required. The model then processes input data, reference scribble labels, and labels (label1s) to produce outputs (tmp_dic). The specific output used is determined by the variable 'pred_label'.", + "type": "comment" + }, + "1798": { + "file_id": 145, + "content": " seq_names[0]].detach().cpu()\n pred_label = nn.functional.interpolate(\n pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_label = pred_label.unsqueeze(0)\n try:\n pred_label = damage_masks(pred_label)\n except:\n pred_label = pred_label\n pred_label = pred_label.squeeze(0)\n round_scribble[\n seq_names[0]] = interactor.interact(\n seq_names[0], pred_label.numpy(),\n float_(label2s).squeeze(0).numpy(),", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:407-423" + }, + "1799": { + "file_id": 145, + "content": "The code is performing inference for an image classification task. It detaches, interpolates and converts the predicted label tensor to obtain the final prediction. The try-except block handles potential errors when applying a function called \"damage_masks\" on the prediction label. 
Finally, it applies the \"interact\" function from a class called \"interactor\" on the sequence with name \"seq_names[0]\" using numpy arrays for prediction and ground truth labels.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/18.json b/docs/data/18.json new file mode 100644 index 000000000..24c675177 --- /dev/null +++ b/docs/data/18.json @@ -0,0 +1,548 @@ +{ + "1800": { + "file_id": 145, + "content": " obj_nums)\n frame_num_dic[seq_names[0]] = frame_nums[0]\n pred_label = pred_label.unsqueeze(0)\n img_ww = Image.open(\n os.path.join(cfg.DATA_ROOT,\n 'JPEGImages/480p/',\n seq_names[0], '00000.jpg'))\n img_ww = np.array(img_ww)\n or_h, or_w = img_ww.shape[:2]\n pred_label = paddle.nn.functional.interpolate(\n float_(pred_label), (or_h, or_w),\n mode='nearest')\n prev_round_label_dic[\n seq_names[0]] = pred_label.squeeze(0)\n train_dataset.update_ref_frame_and_label(\n round_scribble, frame_num_dic, prev_round_label_dic)", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:424-439" + }, + "1801": { + "file_id": 145, + "content": "This code opens an image and resizes the prediction label to match the image's height and width. Then, it updates the reference frame and label in the training dataset for a specific sequence name.", + "type": "comment" + }, + "1802": { + "file_id": 145, + "content": " print(f'round {r}', 'trainset evaluating finished!')\n print('*' * 100)\n self.model.train()\n print('updating ref frame and label')\n train_dataset.transform = composed_transforms\n print('updating ref frame and label finished!')\n else:\n if r != round_ - 1:\n round_scribble = {}\n if r == 0:\n prev_round_label_dic = {}\n frame_num_dic = {}\n train_dataset.transform = tr.ToTensor()\n trainloader = DataLoader(train_dataset,\n sampler=RandomIdentitySampler(\n train_dataset.sample_list),\n shuffle=False,\n batch_size=1,\n num_workers=0)", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:441-462" + }, + "1803": { + "file_id": 145, + "content": "This code segment appears to be a part of a training process for a video analysis model. It's updating the reference frame and label based on the current round, possibly in a round-based training loop. The `RandomIdentitySampler` seems to be used to load the data for this specific round.", + "type": "comment" + }, + "1804": { + "file_id": 145, + "content": " self.model.eval()\n with paddle.no_grad():\n for ii, sample in enumerate(trainloader):\n ref_imgs = sample[\n 'ref_img'] # batch_size * 3 * h * w\n img1s = sample['img1']\n img2s = sample['img2']\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n label1s = sample['label1']\n label2s = sample['label2']\n seq_names = sample['meta']['seq_name']\n obj_nums = sample['meta']['obj_num']\n frame_nums = sample['meta']['frame_num']\n bs, _, h, w = img2s.shape\n print(seq_names[0])\n label2s_ = mask_damager(label2s, 0.1)", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:464-481" + }, + "1805": { + "file_id": 145, + "content": "This code is in the \"train_stage2.py\" file of PaddleVideo's Ma-Net application, and it prepares for training by setting the model to evaluation mode, disabling gradient tracking with paddle.no_grad(), iterating over training data using trainloader, and extracting necessary samples. 
It also applies a mask_damager to label2s with 0.1 intensity to potentially improve model performance.", + "type": "comment" + }, + "1806": { + "file_id": 145, + "content": " round_scribble[\n seq_names[0]] = interactor.interact(\n seq_names[0],\n np.expand_dims(label2s_, axis=0),\n float_(label2s).squeeze(0).numpy(),\n obj_nums)\n label2s__ = paddle.to_tensor(label2s_)\n frame_num_dic[seq_names[0]] = frame_nums[0]\n prev_round_label_dic[seq_names[0]] = label2s__\n print(f'round {r}', 'trainset evaluating finished!')\n print('*' * 100)\n print('updating ref frame and label')\n train_dataset.update_ref_frame_and_label(\n round_scribble, frame_num_dic, prev_round_label_dic)\n self.model.train()\n train_dataset.transform = composed_transforms", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:482-500" + }, + "1807": { + "file_id": 145, + "content": "This code updates the reference frame and label for the train_dataset, sets the model to training mode, and prints progress messages. The interactor is used to interact with the first sequence's data and update the round_scribble variable. Label2s and frame_nums are used in this process, and prev_round_label_dic stores the previous round's label for future reference.", + "type": "comment" + }, + "1808": { + "file_id": 145, + "content": " print('updating ref frame and label finished!')\n #############################################\n def rough_ROI(self, ref_scribble_labels):\n #### b*1*h*w\n dist = 15\n b, _, h, w = ref_scribble_labels.shape\n filter_ = paddle.zeros_like(ref_scribble_labels)\n to_fill = paddle.zeros_like(ref_scribble_labels)\n for i in range(b):\n no_background = (ref_scribble_labels[i] != -1)\n no_background = no_background.squeeze(0)\n no_b = no_background.nonzero()\n h_min, w_min = paddle.min(no_b, 0) # fixed\n h_max, w_max = paddle.max(no_b, 0) # fixed\n filter_[i, 0,\n max(h_min - dist, 0):min(h_max + dist, h - 1),\n max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1\n final_scribble_labels = paddle.where(byte_(filter_),\n ref_scribble_labels,\n to_fill) # uint8_ fixed.", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:501-525" + }, + "1809": { + "file_id": 145, + "content": "This function rough_ROI takes ref_scribble_labels as input and performs a region of interest (ROI) operation. It iterates over each batch element, identifies the valid non-background regions, calculates the minimum and maximum coordinates within these regions, and then creates a filter mask. The filter mask is used to selectively copy the ref_scribble_labels into final_scribble_labels for further processing. The operation ensures that only relevant regions are considered, improving efficiency.", + "type": "comment" + }, + "1810": { + "file_id": 145, + "content": " return final_scribble_labels\n def load_network(self, net, pretrained_dict):\n # pretrained_dict = pretrained_dict\n model_dict = net.state_dict()\n # 1. filter out unnecessary keys\n pretrained_dict = {\n k: v\n for k, v in pretrained_dict.items() if k in model_dict\n }\n # 2. 
overwrite entries in the existing state dict\n # for k in model_dict:\n # if k not in pretrained_dict:\n # print(k, 'not in loaded weights.')\n model_dict.update(pretrained_dict)\n net.set_state_dict(model_dict)\n return net\n def save_network(self, net, step):\n save_path = self.save_res_dir\n if not os.path.exists(save_path):\n os.makedirs(save_path)\n save_file = 'save_step_%s.pth' % (step)\n paddle.save(net.state_dict(), os.path.join(save_path, save_file))\n def _adjust_lr(self, optimizer, itr, max_itr):\n now_lr = cfg.TRAIN_LR * (1 - itr / (max_itr + 1))**cfg.TRAIN_POWER\n optimizer._param_groups[0]['lr'] = now_lr", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:526-556" + }, + "1811": { + "file_id": 145, + "content": "The code defines three functions: \n1. `train_stage2.py:525-555`: The `return final_scribble_labels` statement concludes the function, returning the final scribble labels after some operations on them.\n2. `load_network`: Loads pretrained weights into a network by matching the keys in the pretrained dictionary with those in the model dictionary and updating the state dict accordingly.\n3. `save_network`: Saves the current state of the network at a specified step to a given directory, creating the directory if it doesn't exist.", + "type": "comment" + }, + "1812": { + "file_id": 145, + "content": " return now_lr\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0,\n 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43,\n 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75,\n 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81,", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:557-573" + }, + "1813": { + "file_id": 145, + "content": "The code defines a color palette with 81 RGB colors, ranging from black (0, 0, 0) to white (255, 255, 255). 
Each value represents the color's red, green, and blue components.", + "type": "comment" + }, + "1814": { + "file_id": 145, + "content": " 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94,\n 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120,\n 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145,\n 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150,", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:574-586" + }, + "1815": { + "file_id": 145, + "content": "This code represents a list of numbers ranging from 81 to 150.", + "type": "comment" + }, + "1816": { + "file_id": 145, + "content": " 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160,\n 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185,\n 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210,\n 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215,", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:587-599" + }, + "1817": { + "file_id": 145, + "content": "This code appears to be a sequence of numbers, and it is not clear what the purpose or functionality of this specific section of code is without further context.", + "type": "comment" + }, + "1818": { + "file_id": 145, + "content": " 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225,\n 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250,\n 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\nmanager = Manager()\nmanager.train()", + "type": "code", + "location": "/applications/Ma-Net/train_stage2.py:600-612" + }, + "1819": { + "file_id": 145, + 
"content": "The code snippet initializes a Manager object and calls its train() method. The list of numbers represents the dimensions of an image, potentially used for resizing or preprocessing during training.", + "type": "comment" + }, + "1820": { + "file_id": 146, + "content": "/applications/Ma-Net/utils/api.py", + "type": "filepath" + }, + "1821": { + "file_id": 146, + "content": "The code provides data conversion, tensor handling, input compatibility, gradient norm clipping, and gain calculations for PyTorch and PaddlePaddle. It also offers tensor initialization using various methods with backward compatibility.", + "type": "summary" + }, + "1822": { + "file_id": 146, + "content": "import math\nimport warnings\nimport numpy\nimport numpy as np\nfrom numpy import inf\nfrom paddle import Tensor, concat, reshape, nn\nimport paddle\nfrom typing import Union, Iterable\n_tensor_or_tensors = Union[paddle.Tensor, Iterable[paddle.Tensor]]\nimport paddle\nimport PIL\nimport numbers\nimport numpy as np\nfrom PIL import Image\nfrom paddle.vision.transforms import BaseTransform\nfrom paddle.vision.transforms import functional as F\ndef torch2paddle(data):\n try:\n import torch\n if isinstance(data, dict):\n np_data = {}\n for k, v in data.items():\n np_data[k] = paddle.to_tensor(v.detach().numpy())\n return np_data\n else:\n return paddle.to_tensor(data.detach().numpy())\n except:\n pass\ndef fill_(tensor: Tensor, value):\n return tensor.set_value(paddle.full_like(tensor, value))\ndef zero_(tensor: Tensor):\n return tensor.set_value(paddle.zeros_like(tensor))\ndef float_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='float32')\ndef long_(tensor: Tensor):", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:1-49" + }, + "1823": { + "file_id": 146, + "content": "This code file contains utility functions for converting data from PyTorch to Paddle, filling tensors with values, zeroing out tensor values, and changing the tensor's dtype. It also includes a function that attempts to import PyTorch and converts corresponding data types accordingly.", + "type": "comment" + }, + "1824": { + "file_id": 146, + "content": " return paddle.to_tensor(tensor, dtype='int64')\ndef int_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='int32')\ndef byte_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='bool')\nclass ToPILImage(BaseTransform):\n def __init__(self, mode=None, keys=None):\n super(ToPILImage, self).__init__(keys)\n def _apply_image(self, pic):\n \"\"\"\n Args:\n pic (Tensor|np.ndarray): Image to be converted to PIL Image.\n Returns:\n PIL: Converted image.\n \"\"\"\n if not (isinstance(pic, paddle.Tensor) or isinstance(pic, np.ndarray)):\n raise TypeError('pic should be Tensor or ndarray. Got {}.'.format(\n type(pic)))\n elif isinstance(pic, paddle.Tensor):\n if pic.ndimension() not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndimension()))\n elif pic.ndimension() == 2:\n # if 2D image, add channel dimension (CHW)", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:50-83" + }, + "1825": { + "file_id": 146, + "content": "The code provides three tensor conversion functions: `int64`, `int32`, and `byte` that convert tensors into specific data types. It also includes a class `ToPILImage` for converting images to PIL Image format with the option to specify mode and keys. 
The function `_apply_image` checks if the input is a tensor or numpy array, raises an error if not, and then proceeds to convert 2D or 3D images into PIL Image format by adding channel dimension if necessary.", + "type": "comment" + }, + "1826": { + "file_id": 146, + "content": " pic = pic.unsqueeze(0)\n elif isinstance(pic, np.ndarray):\n if pic.ndim not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndim))\n elif pic.ndim == 2:\n # if 2D image, add channel dimension (HWC)\n pic = np.expand_dims(pic, 2)\n npimg = pic\n if isinstance(pic, paddle.Tensor) and \"float\" in str(\n pic.numpy().dtype) and self.mode != 'F':\n pic = pic.mul(255).byte()\n if isinstance(pic, paddle.Tensor):\n npimg = np.transpose(pic.numpy(), (1, 2, 0))\n if not isinstance(npimg, np.ndarray):\n raise TypeError(\n 'Input pic must be a paddle.Tensor or NumPy ndarray, ' +\n 'not {}'.format(type(npimg)))\n if npimg.shape[2] == 1:\n expected_mode = None\n npimg = npimg[:, :, 0]\n if npimg.dtype == np.uint8:\n expected_mode = 'L'", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:84-112" + }, + "1827": { + "file_id": 146, + "content": "Code checks if the input 'pic' is a Paddle or numpy array, adjusts dimensions if necessary, and converts data types accordingly. If 'pic' is not compatible with the code, it raises an error. This code ensures that the input image is in the correct format for further processing.", + "type": "comment" + }, + "1828": { + "file_id": 146, + "content": " elif npimg.dtype == np.int16:\n expected_mode = 'I;16'\n elif npimg.dtype == np.int32:\n expected_mode = 'I'\n elif npimg.dtype == np.float32:\n expected_mode = 'F'\n if self.mode is not None and self.mode != expected_mode:\n raise ValueError(\n \"Incorrect self.mode ({}) supplied for input type {}. Should be {}\"\n .format(self.mode, np.dtype, expected_mode))\n self.mode = expected_mode\n elif npimg.shape[2] == 2:\n permitted_2_channel_modes = ['LA']\n if self.mode is not None and self.mode not in permitted_2_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 2D inputs\".format(\n permitted_2_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'LA'\n elif npimg.shape[2] == 4:\n permitted_4_channel_modes = ['RGBA', 'CMYK', 'RGBX']", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:113-136" + }, + "1829": { + "file_id": 146, + "content": "This code checks the data type of npimg and sets the expected mode accordingly. It then compares the input's mode to the expected mode, raising a ValueError if they don't match. For 2D inputs with 2 channels, only 'LA' mode is supported; it sets self.mode to 'LA' if necessary. 
For 4-channel inputs, the code supports modes like 'RGBA', 'CMYK', and 'RGBX'.", + "type": "comment" + }, + "1830": { + "file_id": 146, + "content": " if self.mode is not None and self.mode not in permitted_4_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 4D inputs\".format(\n permitted_4_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGBA'\n else:\n permitted_3_channel_modes = ['RGB', 'YCbCr', 'HSV']\n if self.mode is not None and self.mode not in permitted_3_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 3D inputs\".format(\n permitted_3_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGB'\n if self.mode is None:\n raise TypeError('Input type {} is not supported'.format(\n npimg.dtype))\n return Image.fromarray(npimg, mode=self.mode)\nclass Identity(nn.Layer):\n r\"\"\"A placeholder identity operator that is argument-insensitive.", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:137-161" + }, + "1831": { + "file_id": 146, + "content": "This code checks the input image mode and data type, ensuring it matches with the supported modes for 3D or 4D inputs. If the mode is not recognized, a ValueError is raised. If no mode is provided and the data type is np.uint8, it assigns the appropriate default mode (RGB or RGBA). Finally, if there is no mode specified and the input data type is unsupported, a TypeError is raised. The code is part of a class called Identity which seems to be an identity operator for neural networks.", + "type": "comment" + }, + "1832": { + "file_id": 146, + "content": " Args:\n args: any argument (unused)\n kwargs: any keyword argument (unused)\n \"\"\"\n def __init__(self, *args, **kwargs):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\ndef convert(data: dict, to, dtype=None):\n assert isinstance(data, dict)\n input = {}\n for k, v in data.items():\n if 'paddle' == to:\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = paddle.to_tensor(v.astype(dtype))\n else:\n input[k] = paddle.to_tensor(v)\n else:\n input[k] = v\n elif 'torch' == to:\n try:\n import torch\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = torch.tensor(v.astype(dtype))\n else:\n input[k] = torch.tensor(v)\n else:\n input[k] = v\n except:\n pass", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:163-198" + }, + "1833": { + "file_id": 146, + "content": "This code defines a class \"Identity\" that performs identity forwarding and a function \"convert\" to convert data between Paddle and Torch formats. It takes a dictionary of data, converts it into either Paddle or Torch format based on the specified type, and returns a new dictionary with the converted data. If the data is a numpy ndarray, it can also be casted to a specific dtype.", + "type": "comment" + }, + "1834": { + "file_id": 146, + "content": " else:\n if isinstance(v, np.ndarray):\n input[k] = v.astype(to)\n else:\n input[k] = v\n return input\ndef clip_grad_norm_(parameters: _tensor_or_tensors,\n max_norm: float,\n norm_type: float = 2.0,\n error_if_nonfinite: bool = False) -> paddle.Tensor:\n r\"\"\"Clips gradient norm of an iterable of parameters.\n The norm is computed over all gradients together, as if they were\n concatenated into a single vector. 
Gradients are modified in-place.\n Args:\n parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a\n single Tensor that will have gradients normalized\n max_norm (float or int): max norm of the gradients\n norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for\n infinity norm.\n error_if_nonfinite (bool): if True, an error is thrown if the total\n norm of the gradients from :attr:``parameters`` is ``nan``,", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:199-223" + }, + "1835": { + "file_id": 146, + "content": "This code defines a function that clips the gradient norm of an iterable of parameters. It takes in an iterable of Tensors (parameters) and a maximum norm value, computes the norm over all gradients, and modifies them in-place if necessary. The norm type can be specified as well, with options including 'inf' for infinity norm. If nonfinite norms are present, an error will occur unless error_if_nonfinite is set to False.", + "type": "comment" + }, + "1836": { + "file_id": 146, + "content": " ``inf``, or ``-inf``. Default: False (will switch to True in the future)\n Returns:\n Total norm of the parameters (viewed as a single vector).\n \"\"\"\n import time\n if isinstance(parameters, paddle.Tensor):\n parameters = [parameters]\n parameters = [p for p in parameters if p.grad is not None]\n detached_grads = [p.grad.detach() for p in parameters]\n max_norm = float(max_norm)\n norm_type = float(norm_type)\n if len(parameters) == 0:\n return paddle.to_tensor(0.)\n if norm_type == inf:\n norms = [p.abs().max() for p in parameters]\n total_norm = norms[0] if len(norms) == 1 else paddle.max(\n paddle.stack(norms))\n else:\n total_norm = paddle.norm(\n paddle.stack([paddle.norm(g, norm_type) for g in detached_grads]),\n norm_type)\n if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),\n total_norm.isinf()):\n raise RuntimeError(\n f'The total norm of order {norm_type} for gradients from '", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:224-250" + }, + "1837": { + "file_id": 146, + "content": "This function calculates the total norm of parameters viewed as a single vector. It takes parameters and optional arguments max_norm and norm_type for the maximum allowed norm value and type of norm (inf or other), respectively. If no parameters are provided, it returns 0. If max_norm is infinity, it finds the maximum absolute value among parameters. Otherwise, it calculates the norm of gradients using the provided norm_type. If error_if_nonfinite is True and the total norm is NaN or Inf, a RuntimeError is raised.", + "type": "comment" + }, + "1838": { + "file_id": 146, + "content": " '`parameters` is non-finite, so it cannot be clipped. 
To disable '\n 'this error and scale the gradients by the non-finite norm anyway, '\n 'set `error_if_nonfinite=False`')\n clip_coef = max_norm / (total_norm + 1e-6)\n # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so\n # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization\n # when the gradients do not reside in CPU memory.\n clip_coef_clamped = paddle.clip(clip_coef, max=1.0)\n for i, p in enumerate(parameters):\n p.grad.set_value(detached_grads[i] * clip_coef_clamped) # fixed\n return total_norm\ndef max(a: paddle.Tensor, axis=0, keepdim=True):\n \"\"\"ndarray=numpy.array([[1, 2, 3, 4],\n [4, 3, 2, 1],\n [5, 6, 7, 8],\n [8, 7, 6, 5]])\n np.where(ndarray == np.max(ndarray))\n (array([2, 3]), array([3, 0]))\n ndarray[np.where(ndarray == np.max(ndarray))]\n array([8, 8])\n \"\"\"\n max_ = a.max(axis).unsqueeze(-1)", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:251-274" + }, + "1839": { + "file_id": 146, + "content": "This code snippet is a part of the PaddleVideo framework's Ma-Net application. It checks if 'parameters' are non-finite and clips them if not. It also defines a function to find the maximum value in a tensor, similar to numpy's max() function.", + "type": "comment" + }, + "1840": { + "file_id": 146, + "content": " index = paddle.argmax(a, axis=axis, keepdim=keepdim)\n max_ = max_.numpy()\n index = index.numpy()\n # index = paddle.argmax(a, axis=axis, keepdim=keepdim)[-1].flatten()\n return max_, index\ndef gather(tmp: paddle.Tensor, ind: paddle.Tensor):\n shape = tmp.shape\n tmp = paddle.to_tensor(tmp)\n ind = paddle.to_tensor(ind)\n if len(shape) == 2:\n b = shape[0]\n return concat([\n reshape(paddle.gather(tmp[i, :], ind[i, :]), [1, -1])\n for i in range(b)\n ],\n axis=0)\n elif len(shape) == 3:\n out = []\n for i in range(tmp.shape[0]):\n _ = paddle.index_sample(tmp[i], ind[i])\n out.append(_)\n return paddle.to_tensor(out)\n elif len(shape) == 4:\n b, c, d = shape[:3]\n return concat([\n reshape(\n concat([\n reshape(\n concat([\n reshape(\n paddle.gather(tmp[i, j, k, :], ind[i, j, k, :]),", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:275-307" + }, + "1841": { + "file_id": 146, + "content": "This code calculates the maximum value in a tensor and returns the corresponding index for each dimension. It also provides functions to gather data along different dimensions using gather or index_sample operations, depending on the shape of the input tensor.", + "type": "comment" + }, + "1842": { + "file_id": 146, + "content": " [1, -1]) for k in range(d)\n ],\n axis=0), [1, d, -1]) for j in range(c)\n ],\n axis=0), [1, c, d, -1]) for i in range(b)\n ],\n axis=0)\n else:\n pass\n# These no_grad_* functions are necessary as wrappers around the parts of these\n# functions that use `with torch.no_grad()`. The JIT doesn't support context\n# managers, so these need to be implemented as builtins. 
Using these wrappers\n# lets us keep those builtins small and re-usable.\ndef _no_grad_uniform_(tensor, a, b):\n with paddle.no_grad():\n tensor.set_value(paddle.uniform(tensor.shape, min=a, max=b))\n return tensor\ndef _no_grad_normal_(tensor, mean, std):\n with paddle.no_grad():\n tensor.set_value(paddle.normal(shape=tensor.shape, mean=mean, std=std))\n return tensor\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n from scipy import special\n # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:308-338" + }, + "1843": { + "file_id": 146, + "content": "This code defines three functions (_no_grad_uniform_, _no_grad_normal_, and _no_grad_trunc_normal_) to initialize the weights of a tensor using different distributions while ensuring the computations are performed without gradient calculation. The main purpose is to prevent unnecessary memory usage and computation time for backpropagation in cases where gradients are not required.", + "type": "comment" + }, + "1844": { + "file_id": 146, + "content": " def norm_cdf(x):\n # Computes standard normal cumulative distribution function\n return (1. + math.erf(x / math.sqrt(2.))) / 2.\n if (mean < a - 2 * std) or (mean > b + 2 * std):\n warnings.warn(\n \"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. \"\n \"The distribution of values may be incorrect.\",\n stacklevel=2)\n with paddle.no_grad():\n # Values are generated by using a truncated uniform distribution and\n # then using the inverse CDF for the normal distribution.\n # Get upper and lower cdf values\n l = norm_cdf((a - mean) / std)\n u = norm_cdf((b - mean) / std)\n # Uniformly fill tensor with values from [l, u], then translate to\n # [2l-1, 2u-1].\n tensor.set_value(\n paddle.uniform(tensor.shape, min=2 * l - 1, max=2 * u - 1))\n # tensor.uniform_(2 * l - 1, 2 * u - 1)\n # Use inverse cdf transform for normal distribution to get truncated\n # standard normal\n tensor.set_value(special.erfinv(tensor))", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:339-364" + }, + "1845": { + "file_id": 146, + "content": "This function initializes the weights of a neural network using truncated normal distribution. It first computes the standard normal cumulative distribution function and checks if the mean is more than 2 std away from [a, b]. If so, it issues a warning. 
Then it generates uniform values in [l, u] and transforms them to truncated standard normal distribution.", + "type": "comment" + }, + "1846": { + "file_id": 146, + "content": " # Transform to proper mean, std\n tensor.set_value(tensor.multiply(paddle.to_tensor(std * math.sqrt(2.))))\n tensor.add_(mean)\n # Clamp to ensure it's in the proper range\n tensor.clip_(min=a, max=b)\n return tensor\ndef _no_grad_fill_(tensor, val):\n with paddle.no_grad():\n tensor.set_value(paddle.full_like(tensor, fill_value=val))\n return tensor\ndef _no_grad_zero_(tensor):\n with paddle.no_grad():\n tensor.set_value(paddle.zeros_like(tensor))\n return tensor\ndef calculate_gain(nonlinearity, param=None):\n r\"\"\"Return the recommended gain value for the given nonlinearity function.\n The values are as follows:\n ================= ====================================================\n nonlinearity gain\n ================= ====================================================\n Linear / Identity :math:`1`\n Conv{1,2,3}D :math:`1`\n Sigmoid :math:`1`\n Tanh :math:`\\frac{5}{3}`\n ReLU :math:`\\sqrt{2}`", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:366-398" + }, + "1847": { + "file_id": 146, + "content": "The provided code contains functions for transforming, filling, and zeroing tensors. It also includes a function that calculates the recommended gain value for different nonlinearity functions. The gain values are 1 for Linear/Identity, Conv{1,2,3}D, and Sigmoid; and 5/3 for Tanh, while for ReLU it is sqrt(2).", + "type": "comment" + }, + "1848": { + "file_id": 146, + "content": " Leaky Relu :math:`\\sqrt{\\frac{2}{1 + \\text{negative\\_slope}^2}}`\n SELU :math:`\\frac{3}{4}`\n ================= ====================================================\n Args:\n nonlinearity: the non-linear function (`nn.functional` name)\n param: optional parameter for the non-linear function\n Examples:\n >>> gain = nn.init.calculate_gain('leaky_relu', 0.2) # leaky_relu with negative_slope=0.2\n \"\"\"\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',\n 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n # True/False are instances of int, hence check above", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:399-425" + }, + "1849": { + "file_id": 146, + "content": "This function calculates the gain value for different non-linear functions used in neural networks, such as Leaky ReLU, SELU, and others. It returns appropriate gain values depending on the specified nonlinearity parameter, considering any optional parameters as well.", + "type": "comment" + }, + "1850": { + "file_id": 146, + "content": " negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4 # Value found empirically (https://github.com/pytorch/pytorch/pull/50664)\n else:\n raise ValueError(\"Unsupported nonlinearity {}\".format(nonlinearity))\ndef uniform_(tensor: Tensor, a: float = 0., b: float = 1.) 
-> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the uniform\n distribution :math:`\\mathcal{U}(a, b)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the lower bound of the uniform distribution\n b: the upper bound of the uniform distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.uniform_(w)\n \"\"\"\n return _no_grad_uniform_(tensor, a, b)\ndef normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the normal", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:426-454" + }, + "1851": { + "file_id": 146, + "content": "The code defines two functions, `uniform_` and `normal_`, used for initializing tensors with uniform or normal distribution respectively. The `_no_grad_uniform_` function is used internally by `uniform_`. The `else` and `if-elif` structures are used to handle nonlinearity cases in the `init_` function.", + "type": "comment" + }, + "1852": { + "file_id": 146, + "content": " distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution\n std: the standard deviation of the normal distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.normal_(w)\n \"\"\"\n return _no_grad_normal_(tensor, mean, std)\ndef trunc_normal_(tensor: Tensor,\n mean: float = 0.,\n std: float = 1.,\n a: float = -2.,\n b: float = 2.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from a truncated\n normal distribution. The values are effectively drawn from the\n normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`\n with values outside :math:`[a, b]` redrawn until they are within\n the bounds. The method used for generating the random values works\n best when :math:`a \\leq \\text{mean} \\leq b`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:455-483" + }, + "1853": { + "file_id": 146, + "content": "This code initializes or fills a tensor with values drawn from a normal distribution, truncating values outside the specified range [a, b]. The function `_no_grad_normal_` initializes a tensor with values from a normal distribution with given mean and standard deviation. 
The `trunc_normal_` function initializes a tensor with values from a truncated normal distribution within the specified bounds [a, b].", + "type": "comment" + }, + "1854": { + "file_id": 146, + "content": " std: the standard deviation of the normal distribution\n a: the minimum cutoff value\n b: the maximum cutoff value\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.trunc_normal_(w)\n \"\"\"\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef constant_(tensor: Tensor, val: float) -> Tensor:\n r\"\"\"Fills the input Tensor with the value :math:`\\text{val}`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n val: the value to fill the tensor with\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.constant_(w, 0.3)\n \"\"\"\n return _no_grad_fill_(tensor, val)\ndef ones_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `1`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.ones_(w)\n \"\"\"\n return _no_grad_fill_(tensor, 1.)\ndef zeros_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `0`.\n Args:\n tensor: an n-dimensional `torch.Tensor`", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:484-526" + }, + "1855": { + "file_id": 146, + "content": "This code defines several functions for initializing Tensor objects in PyTorch. These functions include `trunc_normal_`, `constant_`, `ones_`, and `zeros_`. The `trunc_normal_` function initializes a tensor with values drawn from a truncated normal distribution, while the other three functions fill the tensor with constant values (specified by the user), ones, or zeros respectively. These functions can be used to set the initial values of a tensor before training a neural network model.", + "type": "comment" + }, + "1856": { + "file_id": 146, + "content": " Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.zeros_(w)\n \"\"\"\n return _no_grad_zero_(tensor)\ndef eye_(tensor):\n r\"\"\"Fills the 2-dimensional input `Tensor` with the identity\n matrix. Preserves the identity of the inputs in `Linear` layers, where as\n many inputs are preserved as possible.\n Args:\n tensor: a 2-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.eye_(w)\n \"\"\"\n if tensor.ndimension() != 2:\n raise ValueError(\"Only tensors with 2 dimensions are supported\")\n with paddle.no_grad():\n tensor.set_value(paddle.eye(*tensor.shape))\n return tensor\ndef dirac_(tensor, groups=1):\n r\"\"\"Fills the {3, 4, 5}-dimensional input `Tensor` with the Dirac\n delta function. Preserves the identity of the inputs in `Convolutional`\n layers, where as many input channels are preserved as possible. In case\n of groups>1, each group of channels preserves identity\n Args:\n tensor: a {3, 4, 5}-dimensional `torch.Tensor`", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:528-562" + }, + "1857": { + "file_id": 146, + "content": "The code above contains three functions: _no_grad_zero_, eye_, and dirac_. The _no_grad_zero_ function returns a tensor with all elements set to zero while preserving the identity of inputs in Linear layers. The eye_ function fills a 2-dimensional input tensor with an identity matrix, preserving as many inputs as possible in Linear layers. 
Lastly, the dirac_ function fills a 3, 4, or 5-dimensional input tensor with Dirac delta functions while preserving the identity of inputs in Convolutional layers, considering groups if specified.", + "type": "comment" + }, + "1858": { + "file_id": 146, + "content": " groups (optional): number of groups in the conv layer (default: 1)\n Examples:\n >>> w = torch.empty(3, 16, 5, 5)\n >>> nn.init.dirac_(w)\n >>> w = torch.empty(3, 24, 5, 5)\n >>> nn.init.dirac_(w, 3)\n \"\"\"\n dimensions = tensor.ndimension()\n if dimensions not in [3, 4, 5]:\n raise ValueError(\n \"Only tensors with 3, 4, or 5 dimensions are supported\")\n sizes = tensor.shape\n if sizes[0] % groups != 0:\n raise ValueError('dim 0 must be divisible by groups')\n out_chans_per_grp = sizes[0] // groups\n min_dim = min(out_chans_per_grp, sizes[1])\n with paddle.no_grad():\n tensor.zero_()\n for g in range(groups):\n for d in range(min_dim):\n if dimensions == 3: # Temporal convolution\n tensor[g * out_chans_per_grp + d, d,\n tensor.shape[2] // 2] = 1\n elif dimensions == 4: # Spatial convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2,", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:563-592" + }, + "1859": { + "file_id": 146, + "content": "The code is a function that initializes the convolutional layer weights using Dirac delta distribution for 3, 4, or 5-dimensional tensors. It first checks if the tensor dimensions are supported and then raises an error if not. Then it calculates the number of output channels per group and minimum dimension. The code then zeroes out the tensor and initializes the weights with Dirac delta distribution for specified groups and dimensions, performing a temporal convolution in 3-dimensions or spatial convolution in 4-dimensions.", + "type": "comment" + }, + "1860": { + "file_id": 146, + "content": " tensor.shape[3] // 2] = 1\n else: # Volumetric convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2,\n tensor.shape[3] // 2, tensor.shape[4] // 2] = 1\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(\n \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"\n )\n num_input_fmaps = tensor.shape[1] # .size(1)\n num_output_fmaps = tensor.shape[0] # .size(0)\n receptive_field_size = 1\n if tensor.dim() > 2:\n for s in tensor.shape[2:]:\n receptive_field_size *= s # fixed\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef LongTensor(x):\n return paddle.to_tensor(x, dtype='int64')\ndef IntTensor(x):\n return paddle.to_tensor(x, dtype='int32')\ndef xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor:", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:593-627" + }, + "1861": { + "file_id": 146, + "content": "This code defines several utility functions related to tensors in PaddlePaddle. The `_calculate_fan_in_and_fan_out` function calculates the fan-in and fan-out of a tensor, while the `LongTensor` function converts an input to a long tensor (dtype: int64). The `IntTensor` function does the same but with an int32 dtype. 
Lastly, `xavier_uniform_` initializes a tensor's parameters using Xavier Uniform initialization.", + "type": "comment" + }, + "1862": { + "file_id": 146, + "content": " r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Understanding the difficulty of training deep feedforward\n neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform\n distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{U}(-a, a)` where\n .. math::\n a = \\text{gain} \\times \\sqrt{\\frac{6}{\\text{fan\\_in} + \\text{fan\\_out}}}\n Also known as Glorot initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n gain: an optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))\n \"\"\"\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation\n return _no_grad_uniform_(tensor, -a, a)\ndef xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor:\n r\"\"\"Fills the input `Tensor` with values according to the method", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:628-655" + }, + "1863": { + "file_id": 146, + "content": "This code initializes the input tensor with values following the Glorot initialization method. It uses a uniform distribution and calculates the scaling factor 'a' based on gain, fan_in, and fan_out dimensions of the tensor. The resulting tensor is sampled from a uniform distribution between -a and a. The function xavier_normal_ is a variation that also fills the input tensor but with values from a normal distribution instead of uniform.", + "type": "comment" + }, + "1864": { + "file_id": 146, + "content": " described in `Understanding the difficulty of training deep feedforward\n neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal\n distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{N}(0, \\text{std}^2)` where\n .. math::\n \\text{std} = \\text{gain} \\times \\sqrt{\\frac{2}{\\text{fan\\_in} + \\text{fan\\_out}}}\n Also known as Glorot initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n gain: an optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.xavier_normal_(w)\n \"\"\"\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n return _no_grad_normal_(tensor, 0., std)\ndef _calculate_correct_fan(tensor, mode):\n mode = mode.lower()\n valid_modes = ['fan_in', 'fan_out']\n if mode not in valid_modes:\n raise ValueError(\"Mode {} not supported, please use one of {}\".format(\n mode, valid_modes))\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:656-687" + }, + "1865": { + "file_id": 146, + "content": "This function initializes a tensor with Xavier/Glorot normal distribution, using a normal distribution and a scaling factor 'gain'. The resulting tensor values are sampled from the normal distribution N(0, std^2), where std = gain * sqrt(2 / (fan_in + fan_out)). 
It also includes a function _calculate_correct_fan that checks for valid modes 'fan_in' or 'fan_out'.", + "type": "comment" + }, + "1866": { + "file_id": 146, + "content": " return fan_in if mode == 'fan_in' else fan_out\ndef kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Delving deep into rectifiers: Surpassing human-level\n performance on ImageNet classification` - He, K. et al. (2015), using a\n uniform distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{U}(-\\text{bound}, \\text{bound})` where\n .. math::\n \\text{bound} = \\text{gain} \\times \\sqrt{\\frac{3}{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the\n forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:688-709" + }, + "1867": { + "file_id": 146, + "content": "This function fills a tensor with values from a uniform distribution according to the method described in He et al.'s (2015) paper, using either fan_in or fan_out mode. It also takes an optional argument for the negative slope of the rectifier used after this layer when 'leaky_relu' is specified.", + "type": "comment" + }, + "1868": { + "file_id": 146, + "content": " backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n bound = math.sqrt(\n 3.0) * std # Calculate uniform bounds from standard deviation\n with paddle.no_grad():\n tensor.set_value(paddle.uniform(tensor.shape, min=-bound, max=bound))\n return tensor\ndef kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Delving deep into rectifiers: Surpassing human-level\n performance on ImageNet classification` - He, K. et al. (2015), using a\n normal distribution. The resulting tensor will have values sampled from", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:710-732" + }, + "1869": { + "file_id": 146, + "content": "This code is a PyTorch implementation of the Kaiming Uniform initialization method, used for initializing weights in neural networks. The function takes in a tensor and sets its values according to a uniform distribution with bounds calculated based on the tensor shape and nonlinearity (default is 'leaky_relu'). It also calculates the fan and gain based on the mode and nonlinearity to determine the standard deviation for the uniform distribution. The function then uses Paddle's `uniform` method to set the values in the tensor with the calculated bounds.", + "type": "comment" + }, + "1870": { + "file_id": 146, + "content": " :math:`\\mathcal{N}(0, \\text{std}^2)` where\n .. 
math::\n \\text{std} = \\frac{\\text{gain}}{\\sqrt{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the\n forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the\n backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> kaiming_normal_(w, mode='fan_out', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:733-758" + }, + "1871": { + "file_id": 146, + "content": "Function initializes weights using Kaiming normal distribution. It takes in an n-dimensional tensor, a slope value for rectifier (optional), mode as 'fan_in' or 'fan_out', and nonlinearity function. It preserves weight variance in forward pass with 'fan_in' and backward pass with 'fan_out'. Recommended to use with 'relu' or 'leaky_relu'.", + "type": "comment" + }, + "1872": { + "file_id": 146, + "content": " tensor.set_value(paddle.normal(shape=tensor.shape, mean=0, std=std))\n return tensor\ndef orthogonal_(tensor, gain=1):\n r\"\"\"Fills the input `Tensor` with a (semi) orthogonal matrix, as\n described in `Exact solutions to the nonlinear dynamics of learning in deep\n linear neural networks` - Saxe, A. et al. (2013). The input tensor must have\n at least 2 dimensions, and for tensors with more than 2 dimensions the\n trailing dimensions are flattened.\n Args:\n tensor: an n-dimensional `torch.Tensor`, where :math:`n \\geq 2`\n gain: optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.orthogonal_(w)\n \"\"\"\n if tensor.ndimension() < 2:\n raise ValueError(\"Only tensors with 2 or more dimensions are supported\")\n rows = tensor.shape[0] # .size(0)\n cols = tensor.numel() // rows\n flattened = tensor.new(rows, cols).normal_(0, 1)\n if rows < cols:\n flattened.t_()\n # Compute the qr factorization\n q, r = paddle.to_tensor(np.linalg.qr(flattened.numpy()))", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:759-789" + }, + "1873": { + "file_id": 146, + "content": "The code initializes a tensor with a (semi) orthogonal matrix based on the input tensor. The tensor must have at least 2 dimensions, and the trailing dimensions are flattened. If rows are less than columns, transpose the tensor. It computes the QR factorization of the tensor.", + "type": "comment" + }, + "1874": { + "file_id": 146, + "content": " # q, r = torch.qr(flattened)\n # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf\n d = paddle.diag(r, 0)\n ph = d.sign()\n q *= ph\n if rows < cols:\n q.t_()\n with paddle.no_grad():\n tensor.view_as(q).copy_(q)\n tensor.mul_(gain)\n return tensor\ndef sparse_(tensor, sparsity, std=0.01):\n r\"\"\"Fills the 2D input `Tensor` as a sparse matrix, where the\n non-zero elements will be drawn from the normal distribution\n :math:`\\mathcal{N}(0, 0.01)`, as described in `Deep learning via\n Hessian-free optimization` - Martens, J. 
(2010).\n Args:\n tensor: an n-dimensional `torch.Tensor`\n sparsity: The fraction of elements in each column to be set to zero\n std: the standard deviation of the normal distribution used to generate\n the non-zero values\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.sparse_(w, sparsity=0.1)\n \"\"\"\n if tensor.ndimension() != 2:\n raise ValueError(\"Only tensors with 2 dimensions are supported\")", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:790-822" + }, + "1875": { + "file_id": 146, + "content": "Line 789: q, r = torch.qr(flattened) - Performs QR decomposition on flattened tensor\nLine 790-792: Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf - Applies diag function on r, multiplies q by ph, and transposes q if rows < cols\nLine 794-798: with paddle.no_grad(): tensor.view_as(q).copy_(q) - Uses no_grad context manager to prevent gradients from being recorded during the operation\nLine 799: tensor.mul_(gain) - Multiplies tensor by a gain factor\nLine 802: return tensor - Returns the modified tensor after applying QR decomposition and scaling", + "type": "comment" + }, + "1876": { + "file_id": 146, + "content": " rows, cols = tensor.shape\n num_zeros = int(math.ceil(sparsity * rows))\n with paddle.no_grad():\n tensor.normal_(0, std)\n for col_idx in range(cols):\n row_indices = paddle.randperm(rows)\n zero_indices = row_indices[:num_zeros]\n tensor[zero_indices, col_idx] = 0\n return tensor\n# for backward compatibility\ndef _make_deprecate(meth):\n new_name = meth.__name__\n old_name = new_name[:-1]\n def deprecated_init(*args, **kwargs):\n warnings.warn(\n \"nn.init.{} is now deprecated in favor of nn.init.{}.\".format(\n old_name, new_name),\n stacklevel=2)\n return meth(*args, **kwargs)\n deprecated_init.__doc__ = r\"\"\"\n {old_name}(...)\n .. warning::\n This method is now deprecated in favor of :func:`torch.nn.init.{new_name}`.\n See :func:`~torch.nn.init.{new_name}` for details.\"\"\".format(\n old_name=old_name, new_name=new_name)\n deprecated_init.__name__ = old_name\n return deprecated_init", + "type": "code", + "location": "/applications/Ma-Net/utils/api.py:824-857" + }, + "1877": { + "file_id": 146, + "content": "This code defines a function that initializes the values of a tensor using the Kaiming normal distribution. 
It also includes a deprecated method for backward compatibility, warning users to use the new method instead.", + "type": "comment" + }, + "1878": { + "file_id": 147, + "content": "/applications/Ma-Net/utils/mask_damaging.py", + "type": "filepath" + }, + "1879": { + "file_id": 147, + "content": "The `mask_damager` function from PaddleVideo library takes a mask and maximum rotation angle, damages and scales it by applying rotations and translations to create damaged labels.", + "type": "summary" + }, + "1880": { + "file_id": 147, + "content": "import numpy as np\nfrom scipy.ndimage import interpolation\ntry:\n from skimage import morphology, transform\nexcept ImportError as e:\n print(\n f\"{e}, [scikit-image] package and it's dependencies is required for MA-Net.\"\n )\nimport paddle\nimport cv2\nimport random\n####\ndef mask_damager(labels=None, p_black=0.2):\n scales = (0.8, 1.0, 1.2)\n kernel_size = random.randint(10, 15)\n kernel = np.ones((kernel_size, kernel_size), np.uint8)\n if random.random() < p_black:\n final_label = paddle.zeros_like(labels)\n final_label = final_label.squeeze().numpy()\n else:\n prot = random.randint(5, 15)\n nrot = random.randint(-15, -5)\n rots = [prot, nrot, 0]\n rot = rots[random.randint(0, 2)]\n sc = scales[random.randint(0, 2)]\n _, _, h, w = labels.shape\n tmp = labels.squeeze()\n tmp = tmp.unsqueeze(-1)\n tmp = tmp.numpy().astype(np.uint8)\n morph_p = random.random()\n if morph_p < 0.5:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_OPEN, kernel)", + "type": "code", + "location": "/applications/Ma-Net/utils/mask_damaging.py:1-36" + }, + "1881": { + "file_id": 147, + "content": "This code defines a function, `mask_damager`, which randomly applies transformations to the input labels. It can make the entire label black with 20% probability or rotate and scale it by random values from predefined ranges. If morphology transformation is applied with 50% probability, it uses an open operation to modify the labels using a randomly generated kernel.", + "type": "comment" + }, + "1882": { + "file_id": 147, + "content": " else:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_CLOSE, kernel)\n tmp = tmp.astype(np.uint8)\n center = (w / 2, h / 2)\n M = cv2.getRotationMatrix2D(center, rot, sc)\n final_label = cv2.warpAffine(tmp, M, (w, h), cv2.INTER_NEAREST)\n return final_label\n#####\ndef damage_masks(labels, shift=True, scale=True, rotate=True):\n \"\"\"\n Args:\n labels: numpy array (batch_size * 1 * h * w)\n \"\"\"\n bs, _, h, w = labels.shape\n labels = labels.transpose([0, 2, 3, 1])\n labels = labels.numpy()\n final_label = []\n for i in range(bs):\n label = labels[i]\n damaged_label = damage_masks_np(label, shift, scale, rotate)\n final_label.append(damaged_label)\n final_label = np.array(final_label)\n final_label = paddle.to_tensor(final_label)\n final_label = final_label.transpose([0, 3, 1, 2])\n return final_label\ndef damage_masks_np(labels, shift=True, scale=True, rotate=True):\n \"\"\"Performs the actual mask damaging in numpy.\n Args:\n labels: Int32 numpy array of shape (height, width, 1).", + "type": "code", + "location": "/applications/Ma-Net/utils/mask_damaging.py:37-73" + }, + "1883": { + "file_id": 147, + "content": "The code defines two functions, \"damage_masks\" and \"damage_masks_np\". The former applies mask damaging to a batch of input labels while the latter performs the actual mask damaging on individual numpy arrays. These functions can be used to alter the input masks by applying shifts, scales, and rotations. 
The output is then converted into a tensor for further processing.", + "type": "comment" + }, + "1884": { + "file_id": 147, + "content": " shift: Boolean, whether to damage the masks by shifting.\n scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of labels.\n \"\"\"\n unique_labels = np.unique(labels)\n unique_labels = np.setdiff1d(unique_labels, [0])\n # Shuffle to get random depth ordering when combining together.\n np.random.shuffle(unique_labels)\n damaged_labels = np.zeros_like(labels)\n for l in unique_labels:\n obj_mask = (labels == l)\n damaged_obj_mask = _damage_single_object_mask(obj_mask, shift, scale,\n rotate)\n damaged_labels[damaged_obj_mask] = l\n return damaged_labels\ndef _damage_single_object_mask(mask, shift, scale, rotate):\n \"\"\"Performs mask damaging in numpy for a single object.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n shift: Boolean, whether to damage the masks by shifting.", + "type": "code", + "location": "/applications/Ma-Net/utils/mask_damaging.py:74-98" + }, + "1885": { + "file_id": 147, + "content": "This function takes a mask and applies various damage operations such as shifting, scaling, rotation, or dilation to it. It first extracts unique labels from the input mask, shuffles them for random depth ordering, and creates an empty damaged labels array. Then, for each unique label, it applies the single object mask damaging function to the corresponding mask region. The damaged masks are then combined with their original labels to create the final damaged labels array.", + "type": "comment" + }, + "1886": { + "file_id": 147, + "content": " scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of mask.\n \"\"\"\n if shift:\n mask = _shift_mask(mask)\n if scale:\n mask = _scale_mask(mask)\n if rotate:\n mask = _rotate_mask(mask)\n return mask\ndef _shift_mask(mask, max_shift_factor=0.05):\n \"\"\"Damages a mask for a single object by randomly shifting it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_shift_factor: Float scalar, the maximum factor for random shifting.\n Returns:\n The shifted version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n h = nzy.max() - nzy.min()\n w = nzx.max() - nzx.min()\n size = np.sqrt(h * w)\n offset = np.random.uniform(-size * max_shift_factor,\n size * max_shift_factor, 2)\n shifted_mask = interpolation.shift(np.squeeze(mask, axis=2),\n offset,", + "type": "code", + "location": "/applications/Ma-Net/utils/mask_damaging.py:99-129" + }, + "1887": { + "file_id": 147, + "content": "This code is from the PaddleVideo library and performs mask damage on a given input mask. The mask can be damaged by shifting, scaling, and/or rotation depending on the provided boolean parameters. The function _shift_mask() shifts the mask randomly based on the maximum shift factor. 
The returned mask is the damaged version of the original input mask.", + "type": "comment" + }, + "1888": { + "file_id": 147, + "content": " order=0).astype('bool')[..., np.newaxis]\n return shifted_mask\ndef _scale_mask(mask, scale_amount=0.025):\n \"\"\"Damages a mask for a single object by randomly scaling it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n scale_amount: Float scalar, the maximum factor for random scaling.\n Returns:\n The scaled version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n cy = 0.5 * (nzy.max() - nzy.min())\n cx = 0.5 * (nzx.max() - nzx.min())\n scale_factor = np.random.uniform(1.0 - scale_amount, 1.0 + scale_amount)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n s = transform.SimilarityTransform(scale=[scale_factor, scale_factor])\n m = (shift + (s + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask\ndef _rotate_mask(mask, max_rot_degrees=3.0):\n \"\"\"Damages a mask for a single object by randomly rotating it in numpy.", + "type": "code", + "location": "/applications/Ma-Net/utils/mask_damaging.py:130-155" + }, + "1889": { + "file_id": 147, + "content": "The code contains three functions: _damage_mask, _scale_mask, and _rotate_mask. These functions are used to randomly damage a mask for a single object in the image. The _damage_mask function applies random noise to the mask by using random values from a uniform distribution and subtracting it from the original mask. The _scale_mask function scales the mask in numpy by applying a scale factor randomly generated within a specific range around 1.0. The _rotate_mask function rotates the mask by a random angle between -max_rot_degrees to max_rot_degrees degrees. These functions can be used together or separately depending on the desired damage to the mask.", + "type": "comment" + }, + "1890": { + "file_id": 147, + "content": " Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_rot_degrees: Float scalar, the maximum number of degrees to rotate.\n Returns:\n The scaled version of mask.\n \"\"\"\n cy = 0.5 * mask.shape[0]\n cx = 0.5 * mask.shape[1]\n rot_degrees = np.random.uniform(-max_rot_degrees, max_rot_degrees)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n r = transform.SimilarityTransform(rotation=np.deg2rad(rot_degrees))\n m = (shift + (r + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask", + "type": "code", + "location": "/applications/Ma-Net/utils/mask_damaging.py:156-170" + }, + "1891": { + "file_id": 147, + "content": "This function takes a boolean numpy array mask and maximum rotation angle, then returns a scaled version of the mask after applying random rotations and translations.", + "type": "comment" + }, + "1892": { + "file_id": 148, + "content": "/applications/Ma-Net/utils/meters.py", + "type": "filepath" + }, + "1893": { + "file_id": 148, + "content": "The code defines a class called AverageMeter which computes and stores the average and current value. It has four attributes: val, avg, sum, and count. 
The reset function resets all values to 0, while the update function takes a value (val) and updates the running average based on the number of samples (n).", + "type": "summary" + }, + "1894": { + "file_id": 148, + "content": "from __future__ import absolute_import\nclass AverageMeter(object):\n \"\"\"Computes and stores the average and current value\"\"\"\n def __init__(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def reset(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count", + "type": "code", + "location": "/applications/Ma-Net/utils/meters.py:1-22" + }, + "1895": { + "file_id": 148, + "content": "The code defines a class called AverageMeter which computes and stores the average and current value. It has four attributes: val, avg, sum, and count. The reset function resets all values to 0, while the update function takes a value (val) and updates the running average based on the number of samples (n).", + "type": "comment" + }, + "1896": { + "file_id": 149, + "content": "/applications/Ma-Net/utils/utils.py", + "type": "filepath" + }, + "1897": { + "file_id": 149, + "content": "This function takes a label input, converts it into an array and uses bitwise operations to create a color map representing the label values in RGB format.", + "type": "summary" + }, + "1898": { + "file_id": 149, + "content": "import numpy as np\ndef label2colormap(label):\n m = label.astype(np.uint8)\n r, c = m.shape\n cmap = np.zeros((r, c, 3), dtype=np.uint8)\n cmap[:, :, 0] = (m & 1) << 7 | (m & 8) << 3 | (m & 64) >> 1\n cmap[:, :, 1] = (m & 2) << 6 | (m & 16) << 2 | (m & 128) >> 2\n cmap[:, :, 2] = (m & 4) << 5 | (m & 32) << 1\n return cmap", + "type": "code", + "location": "/applications/Ma-Net/utils/utils.py:1-12" + }, + "1899": { + "file_id": 149, + "content": "This function takes a label input, converts it into an array and uses bitwise operations to create a color map representing the label values in RGB format.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/19.json b/docs/data/19.json new file mode 100644 index 000000000..2880958b6 --- /dev/null +++ b/docs/data/19.json @@ -0,0 +1,541 @@ +{ + "1900": { + "file_id": 150, + "content": "/applications/MultimodalVideoTag/README.md", + "type": "filepath" + }, + "1901": { + "file_id": 150, + "content": "The code trains a multimodal video classification model using PaddlePaddle 2.0, incorporating text, video image, and audio data for tagging in multimodal scenarios. It focuses on training, evaluation, optimization, and use, with performance improvements through post-processing networks, faster training speeds, and stability enhancements. Three related papers are referenced: Attention Clusters for video classification, YouTube-8M as a large-scale benchmark, and Ernie's knowledge integration for enhanced representation.", + "type": "summary" + }, + "1902": { + "file_id": 150, + "content": "# MutimodalVideoTag 多模态视频分类模型\n---\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型评估](#模型评估)\n- [模型推理](#模型推理)\n- [模型优化](#模型优化)\n- [模型部署](#模型部署)\n- [参考论文](#参考论文)\n## 模型简介\n该代码库用于多模态场景下视频分类任务,基于paddle2.0版本开发,模型基于真实短视频业务数据,融合文本、视频图像、音频三种模态进行视频多模标签分类,相比纯视频图像特征,显著提升高层语义标签效果。其原理示意如下图所示。\n

\n
\nMutimodalVideoTag 多模态视频分类模型示意图\n

\n- 数据处理:分别对视频三个模态的数据进行处理,对视频进行抽帧,获得图像序列;抽取视频的音频pcm 文件;收集视频标题,简单进行文本长度截断,一般取50个字。\n- 特征抽取:使用预训练的 ResNet 对图像抽取高层语义特征;使用预训练的VGGish网络抽取音频特征;文本方面使用[ERNIE 1.0](https://github.com/PaddlePaddle/ERNIE)抽取文本特征,无需预先抽取,支持视频分类模型finetune\n- 序列学习:分别使用独立的LSTM 对图像特征和音频特征进行序列学习,文本方面预训练模型对字符序列进行建模,在ernie 后接入一个textcnn 网络做下游任务的迁移学习。\n- 多模融合:文本具有显式的高层语义信息,将文本特征引入到LSTM pooling 过程指导图像和音频时序权重分配,进行交叉融合,最后将文本、音频、视频特征拼接。\n- 预测结果:分类器选用sigmoid 多标签分类器,支持视频多标签输出。\n## 数据准备\n数据方面提供已经抽取好图像、音频特征的特征文件,以及标题和标签信息,模型方面提供训练好checkpoint 文件,可进行finetune、模型评估、预测。\n```\nsh download.sh\n```\n数据文件包括抽取好特征的文件夹 `feature_files`,以及记录划分的txt 文件,格式如下\n```\n文件名 \\t 标题 \\t 标签\n18e9bf08a2fc7eaa4ee9215ab42ea827.mp4 叮叮来自肖宇梁肖宇梁rainco的特别起床铃声 拍人-帅哥,拍人-秀特效,明星周边-其他明星周边", + "type": "code", + "location": "/applications/MultimodalVideoTag/README.md:1-37" + }, + "1903": { + "file_id": 150, + "content": "This code is for training a multimodal video classification model using PaddlePaddle 2.0, which combines text, video image, and audio data for tagging in multimodal scenarios. The provided feature files and label information are used for training and prediction.", + "type": "comment" + }, + "1904": { + "file_id": 150, + "content": "```\n## 模型训练\n模型训练过程有如下可调模式,可在根据数据集情况进行调整,在`conf/conf.txt` 文件中\n- ernie_freeze: 用于控制文本提特征的ernie 网络是否进行finetune,因为ernie 复杂度远大于图像、视频序列学习网络,因此在某些数据集上不好训练。\n- lstm_pool_mode: 用于控制lstm 序列池化的方式,默认是\"text_guide\"表示利用文本加强池化注意力权重,如果设置为空,则默认为自注意力的权重。\n```\nsh train.sh \n```\n## 模型评估\n模型对测试集进行评估,同时支持将checkpoint 模型转为inference 模型, 可用参数'save_only' 选项控制,设置即只用于做模型转换,得到inference 模型\n```\nsh eval_and_save_model.sh\n```\n## 模型推理\n通过上一步得到的inference 模型进行预测,结果默认阈值为0.5,存储到json 文件中,在`conf/conf.txt` 文件 `threshold` 参数进行控制多标签输出的阈值。\n```\nsh inference.sh\n```\n## 模型优化\n模型方面,主要在文本分支进行了实验,实验结果显示ERNIE 在多分支下不微调,而是使用后置网络进行微调,训练速度快,且稳定,同时attention 方面使用文本信息增强图像、音频的attention 学习能一定程度提升模型效果。\n| 模型 | Hit@1 | Hit@2 |\n| ------------------------------------------------------------ | ----- | ----- |\n| 文本分支ERNIE 不finetune +self-attention | 71.07 | 83.72 |\n| 文本分支ERNIE 不finetune +textcnn finetune + self-attention | 72.66 | 85.01 |\n| 文本分支ERNIE 不finetune +extcnn finetune + text-guide-attention | 73.29 | 85.59 |", + "type": "code", + "location": "/applications/MultimodalVideoTag/README.md:38-65" + }, + "1905": { + "file_id": 150, + "content": "This code is related to the PaddleVideo/applications/MultimodalVideoTag project, which focuses on training, evaluating, optimizing, and using a multimodal model for video tagging. The code snippet provides an overview of the steps involved in this process.\n\nTraining involves adjustable parameters like 'ernie_freeze' (for controlling whether text feature extraction from Ernie network should be fine-tuned) and 'lstm_pool_mode' (for controlling LSTM sequence pooling method). The training script is executed with 'sh train.sh'.\n\nEvaluation of the model on a test set is performed using 'sh eval_and_save_model.sh', which also supports converting checkpoint models into inference models with a 'save\\_only' option.\n\nInference, executed by 'sh inference.sh', uses the previously obtained inference model to make predictions, storing results in JSON format. The threshold for multi-label output can be adjusted using the 'conf/conf.txt' file's 'threshold' parameter.\n\nFinally, the code snippet mentions that experimentation has shown better performance for certain models and techniques in the text branch of the model, specifically mentioning gains from utilizing a post-processing network, faster training speeds, and improved stability. 
The table also shows how different combinations of these changes affect Hit@1 and Hit@2 metrics.", + "type": "comment" + }, + "1906": { + "file_id": 150, + "content": "## 模型部署\n
\n
\n
\n## 参考论文\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan\n- [Ernie: Enhanced representation through knowledge integration](https://arxiv.org/abs/1904.09223), Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Chen, Xuyi and Zhang, Han and Tian, Xin and Zhu, Danxiang and Tian, Hao and Wu, Hua", + "type": "code", + "location": "/applications/MultimodalVideoTag/README.md:67-77" + }, + "1907": { + "file_id": 150, + "content": "The code is providing information about model deployment and referencing three related papers. The first paper introduces Attention Clusters for video classification, the second one presents YouTube-8M as a large-scale classification benchmark, and the third paper discusses Ernie's knowledge integration for enhanced representation.", + "type": "comment" + }, + "1908": { + "file_id": 151, + "content": "/applications/MultimodalVideoTag/download.sh", + "type": "filepath" + }, + "1909": { + "file_id": 151, + "content": "This script downloads the pre-trained ernie model, its corresponding checkpoints, and a test dataset using wget and tar commands for decompression.", + "type": "summary" + }, + "1910": { + "file_id": 151, + "content": "# download ernie 1.0 model\nwget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/model_pretrained_ernie.tar.gz\ntar -xzvf model_pretrained_ernie.tar.gz\n# download pretrain model\nwget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/checkpoints_save.tar.gz\ntar -xzvf checkpoints_save.tar.gz\n# download test dataset\nwget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/datasets.tar.gz\ntar -xzvf datasets.tar.gz", + "type": "code", + "location": "/applications/MultimodalVideoTag/download.sh:1-11" + }, + "1911": { + "file_id": 151, + "content": "This script downloads the pre-trained ernie model, its corresponding checkpoints, and a test dataset using wget and tar commands for decompression.", + "type": "comment" + }, + "1912": { + "file_id": 152, + "content": "/applications/MultimodalVideoTag/eval_and_save_model.sh", + "type": "filepath" + }, + "1913": { + "file_id": 152, + "content": "This code sets environment variables for GPU utilization and then runs the eval_and_save_model.py script in scenario_lib, evaluating a model named AttentionLstmErnie with provided configuration file and saving its parameters and inference models to specified directories. 
The \"--save_only\" flag is not used, so both evaluation and saving will occur.", + "type": "summary" + }, + "1914": { + "file_id": 152, + "content": "# eval sh \nexport CUDA_VISIBLE_DEVICES=0\nexport FLAGS_eager_delete_tensor_gb=0.0\nexport FLAGS_sync_nccl_allreduce=1\nexport FLAGS_fast_eager_deletion_mode=1\nexport FLAGS_fraction_of_gpu_memory_to_use=0.5\nexport FLAGS_reallocate_gpu_memory_in_mb=0\nexport FLAGS_memory_fraction_of_eager_deletion=1\npython scenario_lib/eval_and_save_model.py --model_name=AttentionLstmErnie \\\n--config=./conf/conf.txt \\\n--save_model_param_dir=checkpoints_save \\\n--save_inference_model=inference_models_save \\\n# --save_only", + "type": "code", + "location": "/applications/MultimodalVideoTag/eval_and_save_model.sh:1-13" + }, + "1915": { + "file_id": 152, + "content": "This code sets environment variables for GPU utilization and then runs the eval_and_save_model.py script in scenario_lib, evaluating a model named AttentionLstmErnie with provided configuration file and saving its parameters and inference models to specified directories. The \"--save_only\" flag is not used, so both evaluation and saving will occur.", + "type": "comment" + }, + "1916": { + "file_id": 153, + "content": "/applications/MultimodalVideoTag/inference.sh", + "type": "filepath" + }, + "1917": { + "file_id": 153, + "content": "This script sets environment variables for GPU usage and then runs the \"inference.py\" Python script from the \"scenario_lib\" directory, specifying a model name (AttentionLstmErnie), configuration file path (./conf/conf.txt), saving inference models path (inference_models_save), and output file for results (output.json).", + "type": "summary" + }, + "1918": { + "file_id": 153, + "content": "# inference sh \nexport CUDA_VISIBLE_DEVICES=0\nexport FLAGS_eager_delete_tensor_gb=0.0\nexport FLAGS_sync_nccl_allreduce=1\nexport FLAGS_fast_eager_deletion_mode=1\nexport FLAGS_fraction_of_gpu_memory_to_use=0.5\nexport FLAGS_reallocate_gpu_memory_in_mb=0\nexport FLAGS_memory_fraction_of_eager_deletion=1\npython scenario_lib/inference.py --model_name=AttentionLstmErnie \\\n--config=./conf/conf.txt \\\n--save_inference_model=inference_models_save \\\n--output='output.json'", + "type": "code", + "location": "/applications/MultimodalVideoTag/inference.sh:1-12" + }, + "1919": { + "file_id": 153, + "content": "This script sets environment variables for GPU usage and then runs the \"inference.py\" Python script from the \"scenario_lib\" directory, specifying a model name (AttentionLstmErnie), configuration file path (./conf/conf.txt), saving inference models path (inference_models_save), and output file for results (output.json).", + "type": "comment" + }, + "1920": { + "file_id": 154, + "content": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py", + "type": "filepath" + }, + "1921": { + "file_id": 154, + "content": "The MetricsCalculator class calculates accuracy, average loss, and mean loss for multimodal video tagging models with various top-k values. It compares predictions to actual labels in a multilabel classification and logs the metrics using a logger.", + "type": "summary" + }, + "1922": { + "file_id": 154, + "content": "# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import unicode_literals\nfrom __future__ import print_function\nfrom __future__ import division\nimport numpy as np\nimport logging\nlogger = logging.getLogger(__name__)\nclass MetricsCalculator():\n \"\"\"\n MetricsCalculator\n \"\"\"\n def __init__(self, name, mode, metrics_args):\n \"\"\"\n init\n \"\"\"\n self.name = name\n self.mode = mode # 'train', 'val', 'test'", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:1-35" + }, + "1923": { + "file_id": 154, + "content": "This code imports necessary libraries and defines a class for calculating metrics. The MetricsCalculator class initializes with name, mode ('train', 'val', or 'test'), and metrics_args.", + "type": "comment" + }, + "1924": { + "file_id": 154, + "content": " self.acc_dict = {}\n self.top_n_list = metrics_args.MODEL.top_n\n self.num_classes = metrics_args.MODEL.num_classes\n self.reset()\n def reset(self):\n \"\"\"\n reset\n \"\"\"\n logger.info('Resetting {} metrics...'.format(self.mode))\n for topk in self.top_n_list:\n self.acc_dict['avg_acc%d' % (topk)] = 0.0\n self.aggr_loss = 0.0\n self.aggr_batch_size = 0\n def finalize_metrics(self):\n \"\"\"finalize_metrics\n \"\"\"\n for key, value in self.acc_dict.items():\n self.acc_dict[key] = value / self.aggr_batch_size\n self.aggr_loss = self.aggr_loss / self.aggr_batch_size\n def get_computed_metrics(self):\n \"\"\"get_computed_metrics\n \"\"\"\n acc_dict = {}\n for key, value in self.acc_dict.items():\n acc_dict[key] = value / self.aggr_batch_size\n aggr_loss = self.aggr_loss / self.aggr_batch_size\n return acc_dict, aggr_loss\n def accumulate(self, loss, softmax, labels):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:36-68" + }, + "1925": { + "file_id": 154, + "content": "This code initializes an AccuracyMetrics class with metrics arguments, resets the metrics values, finalizes and calculates the accuracy and average loss for each top-n value, and returns the computed metrics. The purpose is to measure the performance of a multimodal video tagging model.", + "type": "comment" + }, + "1926": { + "file_id": 154, + "content": " \"\"\"accumulate\n \"\"\"\n cur_batch_size = softmax.shape[0]\n # if returned loss is None for e.g. 
test, just set loss to be 0.\n if loss is None:\n cur_loss = 0.\n else:\n cur_loss = np.mean(np.array(loss)) #\n self.aggr_batch_size += cur_batch_size\n self.aggr_loss += cur_loss * cur_batch_size\n for top_k in self.top_n_list:\n self.acc_dict['avg_acc%d' %\n (top_k)] += cur_batch_size * compute_topk_accuracy(\n softmax, labels, top_k=top_k) * 100.\n return\n def finalize_and_log_out(self, info=''):\n \"\"\"finalize_and_log_out\n \"\"\"\n metrics_dict, loss = self.get_computed_metrics()\n acc_str = []\n for name, value in metrics_dict.items():\n acc_str.append('{}:{},'.format('%s' % name, '%.2f' % value))\n acc_str = '\\t'.join(acc_str)\n logger.info(info +\n '\\tLoss: {},\\t{}'.format('%.6f' % loss, '%s' % acc_str))", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:69-95" + }, + "1927": { + "file_id": 154, + "content": "This code snippet is part of a class that accumulates metrics for video tagging. It computes the mean loss, average accuracy for different top k values, and then logs these metrics in an informative format using a logger.", + "type": "comment" + }, + "1928": { + "file_id": 154, + "content": " return\ndef compute_topk_correct_hits_multilabel(top_k, preds, labels):\n '''Compute the number of corret hits'''\n batch_size = preds.shape[0]\n top_k_preds = np.zeros((batch_size, 10), dtype=np.float32)\n for i in range(batch_size):\n top_k_preds[i, :] = np.argsort(-preds[i, :])[:10]\n correctness = np.zeros(batch_size, dtype=np.float32)\n for i in range(batch_size):\n correc_sum = 0\n for label_id in range(len(labels[i])):\n label_hit = labels[i][label_id]\n if label_hit == 0 or label_hit < 0.1:\n continue\n if label_id in top_k_preds[i, :top_k].astype(np.int32).tolist():\n # correc_sum += 1\n correc_sum = 1\n break\n correctness[i] = correc_sum\n correct_hits = sum(correctness)\n return correct_hits\ndef compute_topk_correct_hits(top_k, preds, labels):\n '''Compute the number of corret hits'''\n batch_size = preds.shape[0]\n top_k_preds = np.zeros((batch_size, top_k), dtype=np.float32)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:96-125" + }, + "1929": { + "file_id": 154, + "content": "This code computes the number of correct hits for a given top_k in multilabel classification, where it calculates the top_k predictions and checks if any of them match with the actual labels. 
It returns the total number of correct hits across all samples in the batch.", + "type": "comment" + }, + "1930": { + "file_id": 154, + "content": " for i in range(batch_size):\n top_k_preds[i, :] = np.argsort(-preds[i, :])[:top_k]\n correctness = np.zeros(batch_size, dtype=np.int32)\n for i in range(batch_size):\n if labels[i] in top_k_preds[i, :].astype(np.int32).tolist():\n correctness[i] = 1\n correct_hits = sum(correctness)\n return correct_hits\ndef compute_topk_accuracy(softmax, labels, top_k):\n \"\"\"compute_topk_accuracy\n \"\"\"\n computed_metrics = {}\n assert labels.shape[0] == softmax.shape[0], \"Batch size mismatch.\"\n aggr_batch_size = labels.shape[0]\n # aggr_top_k_correct_hits = compute_topk_correct_hits(top_k, softmax, labels)\n aggr_top_k_correct_hits = compute_topk_correct_hits_multilabel(\n top_k, softmax, labels)\n # normalize results\n computed_metrics = \\\n float(aggr_top_k_correct_hits) / aggr_batch_size\n return computed_metrics\nif __name__ == \"__main__\":\n pred = np.array([[0.5, 0.2, 0.3, 0, 0]])\n label = np.array([[0.5, 0.5, 0, 0, 0]])\n print('pred: ', pred)\n print('label: ', label)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:126-158" + }, + "1931": { + "file_id": 154, + "content": "This code calculates the top-k accuracy for a batch of predictions and labels. It first computes the top-k predictions and then checks if the ground truth label is within the top-k predictions. The function returns the number of correct hits divided by the batch size to obtain the accuracy. The main section demonstrates usage with example data.", + "type": "comment" + }, + "1932": { + "file_id": 154, + "content": " print('Top 1 hits', compute_topk_correct_hits_multilabel(1, pred, label))\n print('Top 5 hits', compute_topk_correct_hits_multilabel(5, pred, label))", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:159-160" + }, + "1933": { + "file_id": 154, + "content": "Computing top-1 and top-5 hits for multilabel prediction using compute_topk_correct_hits_multilabel function.", + "type": "comment" + }, + "1934": { + "file_id": 155, + "content": "/applications/MultimodalVideoTag/scenario_lib/config.py", + "type": "filepath" + }, + "1935": { + "file_id": 155, + "content": "This code defines a config parser and provides two functions for parsing, merging, setting, and printing configuration in different sections. 
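The top-k hit counting described for accuracy_metrics.py can be condensed into a short sketch. The snippet below is illustrative only (it is not part of the repository files); it assumes NumPy arrays of sigmoid scores and multi-hot labels, and mirrors the 0.1 positive-label threshold mentioned above.

```python
# Illustrative sketch (not repository code): count samples whose top-k
# predictions contain at least one positive label.
import numpy as np

def topk_multilabel_hits(preds, labels, top_k):
    """preds: (batch, num_classes) scores; labels: (batch, num_classes) multi-hot."""
    hits = 0
    for scores, gold in zip(preds, labels):
        top_k_idx = np.argsort(-scores)[:top_k]     # indices of the k highest scores
        positives = np.flatnonzero(gold > 0.1)      # active labels, same threshold as the source
        if np.intersect1d(top_k_idx, positives).size > 0:
            hits += 1
    return hits

if __name__ == "__main__":
    pred = np.array([[0.5, 0.2, 0.3, 0.0, 0.0]])
    label = np.array([[0.5, 0.5, 0.0, 0.0, 0.0]])
    print(topk_multilabel_hits(pred, label, top_k=1))   # 1: class 0 is both predicted and positive
    print(topk_multilabel_hits(pred, label, top_k=5))   # 1
```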
The code handles section validity and updates values when merging.", + "type": "summary" + }, + "1936": { + "file_id": 155, + "content": "\"\"\"\nconfig parser\n\"\"\"\ntry:\n from configparser import ConfigParser\nexcept BaseException:\n from ConfigParser import ConfigParser\nfrom utils import AttrDict\nimport logging\nlogger = logging.getLogger(__name__)\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\ndef parse_config(cfg_file):\n \"\"\"parse_config\n \"\"\"\n parser = ConfigParser()\n cfg = AttrDict()\n parser.read(cfg_file)\n for sec in parser.sections():\n sec_dict = AttrDict()\n for k, v in parser.items(sec):\n try:\n v = eval(v)\n except BaseException:\n pass\n setattr(sec_dict, k, v)\n setattr(cfg, sec.upper(), sec_dict)\n return cfg\ndef merge_configs(cfg, sec, args_dict):\n \"\"\"merge_configs\n \"\"\"\n assert sec in CONFIG_SECS, \"invalid config section {}\".format(sec)\n sec_dict = getattr(cfg, sec.upper())\n for k, v in args_dict.items():\n if v is None:\n continue\n # try:\n # if hasattr(sec_dict, k):\n # setattr(sec_dict, k, v)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/config.py:1-52" + }, + "1937": { + "file_id": 155, + "content": "This code defines a config parser and provides two functions: `parse_config` and `merge_configs`. The `parse_config` function reads a configuration file and returns an `AttrDict` object containing the parsed configurations for different sections ('train', 'valid', 'test', 'infer'). The `merge_configs` function takes an existing configuration object, a section name, and an optional dictionary of arguments to merge into the configuration. It checks if the section is valid before attempting to merge the new arguments. If a value is None, it is ignored during the merging process.", + "type": "comment" + }, + "1938": { + "file_id": 155, + "content": " # except BaseException:\n # pass\n if k in sec_dict:\n setattr(sec_dict, k, v)\n return cfg\ndef print_configs(cfg, mode):\n \"\"\"print_configs\n \"\"\"\n logger.info(\"---------------- {:>5} Arguments ----------------\".format(mode))\n for sec, sec_items in cfg.items():\n if isinstance(sec_items, dict) is True:\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():\n logger.info(\" {}:{}\".format(k, v))\n else:\n logger.info(\"{}:{}\".format(sec, sec_items))\n logger.info(\"-------------------------------------------------\")", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/config.py:53-71" + }, + "1939": { + "file_id": 155, + "content": "This code defines two functions, `set_config` and `print_configs`. The `set_config` function takes a dictionary (cfg) as input, iterates through its keys and values, and sets the value for each key in the configuration dictionary (sec_dict). If the key already exists in sec_dict, it updates its value. Finally, the function returns the updated configuration dictionary. The `print_configs` function prints out the configuration in a formatted way using the logger module. It iterates through the sections and their corresponding values in the configuration dictionary and logs them to the console with proper indentation and section names.", + "type": "comment" + }, + "1940": { + "file_id": 156, + "content": "/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py", + "type": "filepath" + }, + "1941": { + "file_id": 156, + "content": "This code imports and registers a reader named \"ATTENTIONLSTMERNIE\" from the FeatureReader class, following alphabetical order. 
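As a rough illustration of the parse/merge pattern described for config.py, the sketch below (not from the repository; it uses ast.literal_eval and SimpleNamespace instead of the project's eval-based AttrDict) reads an INI-style file into per-section attribute namespaces and overrides only keys that already exist and are not None.

```python
# Illustrative sketch only -- not repository code.
from configparser import ConfigParser
from types import SimpleNamespace
import ast

def parse_config(cfg_file):
    parser = ConfigParser()
    parser.read(cfg_file)
    cfg = SimpleNamespace()
    for sec in parser.sections():
        items = {}
        for key, raw in parser.items(sec):
            try:
                items[key] = ast.literal_eval(raw)   # numbers, lists, booleans ...
            except (ValueError, SyntaxError):
                items[key] = raw                     # keep plain strings as-is
        setattr(cfg, sec.upper(), SimpleNamespace(**items))
    return cfg

def merge_configs(cfg, sec, overrides):
    """Override existing keys of one section; None values are ignored."""
    sec_ns = getattr(cfg, sec.upper())
    for key, value in overrides.items():
        if value is not None and hasattr(sec_ns, key):
            setattr(sec_ns, key, value)
    return cfg
```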
It is part of the PaddleVideo MultimodalVideoTag project, likely for video feature extraction or analysis.", + "type": "summary" + }, + "1942": { + "file_id": 156, + "content": "#!/usr/bin/env python\n# coding=utf-8\n\"\"\"\nCopyright 2021 Baidu.com, Inc. All Rights Reserved\nDescription: \nAuthors: wanghewei(wanghewei@baidu.com)\nLastEditors: wanghewei(wanghewei@baidu.com)\nDate: 2021-11-26 16:31:59\n\"\"\"\nfrom .reader_utils import regist_reader, get_reader\nfrom .feature_reader import FeatureReader\n# regist reader, sort by alphabet\nregist_reader(\"ATTENTIONLSTMERNIE\", FeatureReader)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py:1-13" + }, + "1943": { + "file_id": 156, + "content": "This code imports and registers a reader named \"ATTENTIONLSTMERNIE\" from the FeatureReader class, following alphabetical order. It is part of the PaddleVideo MultimodalVideoTag project, likely for video feature extraction or analysis.", + "type": "comment" + }, + "1944": { + "file_id": 157, + "content": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py", + "type": "filepath" + }, + "1945": { + "file_id": 157, + "content": "This code initializes a reader class for PaddleVideo's MultimodalVideoTag application, preprocesses text, formats input sequences for BERT/ERNIE models, creates Record objects, generates batches with padding, and handles data generation for ERNIE models.", + "type": "summary" + }, + "1946": { + "file_id": 157, + "content": "\"\"\"\nernie reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nfrom __future__ import absolute_import\nimport sys\nimport os\nimport json\nimport random\nimport logging\nimport numpy as np\nimport six\nfrom io import open\nfrom collections import namedtuple\nfrom .tokenization import FullTokenizer, convert_to_unicode\nlog = logging.getLogger(__name__)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:1-35" + }, + "1947": { + "file_id": 157, + "content": "This code is for the \"ernie\" reader, a part of PaddleVideo's MultimodalVideoTag application. It includes licensing information and various import statements for different functionalities like file handling, JSON parsing, random number generation, logging, numpy operations, and namedtuple creation. 
The log variable is initialized for error reporting.", + "type": "comment" + }, + "1948": { + "file_id": 157, + "content": "if six.PY3:\n import io\n sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')\n sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')\ndef csv_reader(fd, delimiter='\\t'):\n \"\"\"csv_reader\n \"\"\"\n def gen():\n \"\"\"gen\n \"\"\"\n for i in fd:\n yield i.rstrip('\\n').split(delimiter)\n return gen()\nclass BaseReader(object):\n \"\"\"BaseReader\n \"\"\"\n def __init__(self,\n vocab_path,\n label_map_config=None,\n max_seq_len=512,\n do_lower_case=True,\n in_tokens=False,\n is_inference=False,\n random_seed=None,\n tokenizer=\"FullTokenizer\",\n is_classify=True,\n is_regression=False,\n for_cn=True,\n task_id=0):\n self.max_seq_len = max_seq_len\n self.tokenizer = FullTokenizer(vocab_file=vocab_path,\n do_lower_case=do_lower_case)\n self.vocab = self.tokenizer.vocab", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:37-74" + }, + "1949": { + "file_id": 157, + "content": "This code snippet defines a `BaseReader` class which initializes an object with various parameters related to text preprocessing, including maximum sequence length, tokenizer, and other properties. It also includes a utility function `csv_reader` that reads data from files in CSV format. The code adjusts the Python output stream encoding if running on Python 3, ensuring consistent text handling across all outputs.", + "type": "comment" + }, + "1950": { + "file_id": 157, + "content": " self.pad_id = self.vocab[\"[PAD]\"]\n self.cls_id = self.vocab[\"[CLS]\"]\n self.sep_id = self.vocab[\"[SEP]\"]\n self.in_tokens = in_tokens\n self.is_inference = is_inference\n self.for_cn = for_cn\n self.task_id = task_id\n np.random.seed(random_seed)\n self.is_classify = is_classify\n self.is_regression = is_regression\n self.current_example = 0\n self.current_epoch = 0\n self.num_examples = 0\n if label_map_config:\n with open(label_map_config, encoding='utf8') as f:\n self.label_map = json.load(f)\n else:\n self.label_map = None\n def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):\n \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n # This is a simple heuristic which will always truncate the longer sequence\n # one token at a time. This makes more sense than truncating an equal percent\n # of tokens from each, since if one sequence is very short then each token", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:75-102" + }, + "1951": { + "file_id": 157, + "content": "This code initializes various attributes of the class and sets up some configurations for tokenizing input data. It also loads a label map from a file if provided, or sets it to None otherwise. 
The \"_truncate_seq_pair\" function truncates sequence pairs in place to the maximum length specified.", + "type": "comment" + }, + "1952": { + "file_id": 157, + "content": " # that's truncated likely contains more information than a longer sequence.\n while True:\n total_length = len(tokens_a) + len(tokens_b)\n if total_length <= max_length:\n break\n if len(tokens_a) > len(tokens_b):\n tokens_a.pop()\n else:\n tokens_b.pop()\n def _convert_example_to_record(self, example, max_seq_length, tokenizer):\n \"\"\"Converts a single `Example` into a single `Record`.\"\"\"\n text_a = convert_to_unicode(example.text_a)\n tokens_a = tokenizer.tokenize(text_a)\n tokens_b = None\n has_text_b = False\n if isinstance(example, dict):\n has_text_b = \"text_b\" in example.keys()\n else:\n has_text_b = \"text_b\" in example._fields\n if has_text_b:\n text_b = convert_to_unicode(example.text_b)\n tokens_b = tokenizer.tokenize(text_b)\n if tokens_b:\n # Modifies `tokens_a` and `tokens_b` in place so that the total", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:103-131" + }, + "1953": { + "file_id": 157, + "content": "This function converts an example into a record. It tokenizes text_a and optionally text_b, then truncates the sequences if they exceed max_seq_length by popping tokens from either tokens_a or tokens_b.", + "type": "comment" + }, + "1954": { + "file_id": 157, + "content": " # length is less than the specified length.\n # Account for [CLS], [SEP], [SEP] with \"- 3\"\n self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n else:\n # Account for [CLS] and [SEP] with \"- 2\"\n if len(tokens_a) > max_seq_length - 2:\n tokens_a = tokens_a[0:(max_seq_length - 2)]\n # The convention in BERT/ERNIE is:\n # (a) For sequence pairs:\n # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n # (b) For single sequences:\n # tokens: [CLS] the dog is hairy . [SEP]\n # type_ids: 0 0 0 0 0 0 0\n #\n # Where \"type_ids\" are used to indicate whether this is the first\n # sequence or the second sequence. The embedding vectors for `type=0` and\n # `type=1` were learned during pre-training and are added to the wordpiece\n # embedding vector (and position vector). This is not *strictly* necessary", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:132-151" + }, + "1955": { + "file_id": 157, + "content": "The code ensures that the input sequences for BERT/ERNIE models are formatted correctly. If the sequence length is less than the specified maximum length, it accounts for [CLS], [SEP], and [SEP] tokens with adjustments. If the sequence length exceeds the limit, it truncates the longer token sequence accordingly. The code also assigns type_ids to indicate whether it's the first or second sequence, as these are used in the model's embedding vectors.", + "type": "comment" + }, + "1956": { + "file_id": 157, + "content": " # since the [SEP] token unambiguously separates the sequences, but it makes\n # it easier for the model to learn the concept of sequences.\n #\n # For classification tasks, the first vector (corresponding to [CLS]) is\n # used as as the \"sentence vector\". 
Note that this only makes sense because\n # the entire model is fine-tuned.\n tokens = []\n text_type_ids = []\n tokens.append(\"[CLS]\")\n text_type_ids.append(0)\n for token in tokens_a:\n tokens.append(token)\n text_type_ids.append(0)\n tokens.append(\"[SEP]\")\n text_type_ids.append(0)\n if tokens_b:\n for token in tokens_b:\n tokens.append(token)\n text_type_ids.append(1)\n tokens.append(\"[SEP]\")\n text_type_ids.append(1)\n token_ids = tokenizer.convert_tokens_to_ids(tokens)\n position_ids = list(range(len(token_ids)))\n if self.is_inference:\n Record = namedtuple('Record',", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:152-179" + }, + "1957": { + "file_id": 157, + "content": "This code prepares input data for the ERNIE model by combining tokens from two input sequences (tokens_a and tokens_b) into a single sequence. It appends \"[CLS]\" at the start, \"[SEP]\" to separate the sequences, and assigns text_type_id 0 or 1 based on the source sequence. The code also converts the tokens to token ids and generates position ids for the input data. This is specifically designed for classification tasks where the \"[CLS]\" vector represents the overall sentence vector after fine-tuning the entire model.", + "type": "comment" + }, + "1958": { + "file_id": 157, + "content": " ['token_ids', 'text_type_ids', 'position_ids'])\n record = Record(token_ids=token_ids,\n text_type_ids=text_type_ids,\n position_ids=position_ids)\n else:\n if self.label_map:\n label_id = self.label_map[example.label]\n else:\n label_id = example.label\n Record = namedtuple('Record', [\n 'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'\n ])\n qid = None\n if \"qid\" in example._fields:\n qid = example.qid\n record = Record(token_ids=token_ids,\n text_type_ids=text_type_ids,\n position_ids=position_ids,\n label_id=label_id,\n qid=qid)\n return record\n def _prepare_batch_data(self, examples, batch_size, phase=None):\n \"\"\"generate batch records\"\"\"\n batch_records, max_len = [], 0", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:180-207" + }, + "1959": { + "file_id": 157, + "content": "This code defines a function to create a \"Record\" object, which contains token_ids, text_type_ids, position_ids (possibly label_id and qid depending on the example). It also includes another function _prepare_batch_data that generates batch records from examples. 
The batch size and phase are also taken as parameters in this function.", + "type": "comment" + }, + "1960": { + "file_id": 157, + "content": " for index, example in enumerate(examples):\n if phase == \"train\":\n self.current_example = index\n record = self._convert_example_to_record(example, self.max_seq_len,\n self.tokenizer)\n max_len = max(max_len, len(record.token_ids))\n if self.in_tokens:\n to_append = (len(batch_records) + 1) * max_len <= batch_size\n else:\n to_append = len(batch_records) < batch_size\n if to_append:\n batch_records.append(record)\n else:\n yield self._pad_batch_records(batch_records)\n batch_records, max_len = [record], len(record.token_ids)\n if batch_records:\n yield self._pad_batch_records(batch_records)\nclass ExtractEmbeddingReader(BaseReader):\n \"\"\"\n data prepare for getting erine embedding \n \"\"\"\n def _pad_batch_records(self, batch_records):\n \"\"\"\n 对字标号,位置标号特征进行固定长度补全\n batch_records 包含多条文本的标号", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:208-235" + }, + "1961": { + "file_id": 157, + "content": "This code iterates through examples and converts them to records. It then appends the records to a batch and pads the batch with zeros if it reaches the maximum size. It yields batches of records, ensuring that each batch is padded to the same length before being passed to the next step in the process. This class inherits from BaseReader and is used for getting Ernie embedding. The method _pad_batch_records pads the batch with zeros if it exceeds the maximum size, ensuring all batches are of equal length.", + "type": "comment" + }, + "1962": { + "file_id": 157, + "content": " return [字标号列表,文本类型列表,位置特征列表,任务标号列表,掩码列表]\n \"\"\"\n batch_token_ids = [record.token_ids for record in batch_records]\n batch_text_type_ids = [\n record.text_type_ids for record in batch_records\n ]\n batch_position_ids = [record.position_ids for record in batch_records]\n # padding\n padded_token_ids, input_mask, seq_lens = pad_batch_data(\n batch_token_ids,\n pad_idx=self.pad_id,\n return_input_mask=True,\n return_seq_lens=True,\n max_len=self.max_seq_len)\n padded_text_type_ids = pad_batch_data(batch_text_type_ids,\n pad_idx=self.pad_id,\n max_len=self.max_seq_len)\n padded_position_ids = pad_batch_data(batch_position_ids,\n pad_idx=self.pad_id,\n max_len=self.max_seq_len)\n padded_task_ids = np.ones_like(padded_token_ids,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:236-257" + }, + "1963": { + "file_id": 157, + "content": "This code is processing a batch of records and padding token ids, text type ids, position ids, and task ids for an ERNIE (Enhanced Refined Network with Incremental Learning and Exploration) model. 
The processed data will be used as input for the model.", + "type": "comment" + }, + "1964": { + "file_id": 157, + "content": " dtype=\"int64\") * self.task_id\n return_list = [\n padded_token_ids, padded_text_type_ids, padded_position_ids,\n padded_task_ids, input_mask\n ]\n return return_list\n def data_generate_from_text(self, text):\n \"\"\"\n trans text to idx\n input single text\n return 5*maxlen*1\n \"\"\"\n Example = namedtuple('Example', ['text_a', 'label'])\n example = Example(text, 0)\n records = [\n self._convert_example_to_record(example, self.max_seq_len,\n self.tokenizer)\n ]\n pad_records = self._pad_batch_records(records)\n text_one_hot = np.concatenate(pad_records, axis=0).astype('int64')\n return text_one_hot\ndef pad_batch_data(insts,\n pad_idx=0,\n max_len=None,\n return_pos=False,\n return_input_mask=False,\n return_max_len=False,\n return_num_token=False,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:258-289" + }, + "1965": { + "file_id": 157, + "content": "This code is related to text processing and data generation for a specific task reader. It converts input texts into indexed representations and pads the data to ensure consistent sequence lengths. The function `data_generate_from_text` takes in a single text, converts it into a record, pads the batch of records, and returns the resulting one-hot encoded text representation. The `pad_batch_data` function is used for padding other types of data as well.", + "type": "comment" + }, + "1966": { + "file_id": 157, + "content": " return_seq_lens=False):\n \"\"\"\n Pad the instances to the max sequence length in batch, and generate the\n corresponding position data and attention bias.\n \"\"\"\n return_list = []\n if max_len is None:\n max_len = max(len(inst) for inst in insts)\n # Any token included in dict can be used to pad, since the paddings' loss\n # will be masked out by weights and make no effect on parameter gradients.\n inst_data = np.array(\n [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])\n return_list += [inst_data.astype(\"int64\").reshape([-1, max_len, 1])]\n # position data\n if return_pos:\n inst_pos = np.array([\n list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))\n for inst in insts\n ])\n return_list += [inst_pos.astype(\"int64\").reshape([-1, max_len, 1])]\n if return_input_mask:\n # This is used to avoid attention on paddings.\n input_mask_data = np.array(\n [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:290-317" + }, + "1967": { + "file_id": 157, + "content": "This function pads instances to the maximum sequence length in a batch. It first calculates the max_len based on instance lengths and then adds padding to shorter instances if necessary. It creates a 3D tensor of input data, position data (if required), and attention masks (if required). These tensors are added to a return list before being returned by the function. 
The padding is used to make no effect on parameter gradients by being masked out with weights.", + "type": "comment" + }, + "1968": { + "file_id": 157, + "content": " input_mask_data = np.expand_dims(input_mask_data, axis=-1)\n return_list += [input_mask_data.astype(\"float32\")]\n if return_max_len:\n return_list += [max_len]\n if return_num_token:\n num_token = 0\n for inst in insts:\n num_token += len(inst)\n return_list += [num_token]\n if return_seq_lens:\n seq_lens = np.array([len(inst) for inst in insts])\n return_list += [seq_lens.astype(\"int64\").reshape([-1])]\n return return_list if len(return_list) > 1 else return_list[0]", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:318-334" + }, + "1969": { + "file_id": 157, + "content": "This code prepares a return list by adding various elements like input_mask_data, max_len (if required), number of tokens (if required), and sequence lengths (if required) before returning the final list.", + "type": "comment" + }, + "1970": { + "file_id": 158, + "content": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py", + "type": "filepath" + }, + "1971": { + "file_id": 158, + "content": "The FeatureReader class, a DataReader subclass, reads video features using LSTM, attention cluster, and NextVlad models for YouTube-8M dataset. It handles multimodal data loading, exception handling, label manipulation, soft labels generation, and batch input feature creation. A function loads words and their indices from a file into a dictionary.", + "type": "summary" + }, + "1972": { + "file_id": 158, + "content": "\"\"\"\nfeature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport random\nimport os\nimport traceback\nimport pickle\npython_ver = sys.version_info\nfrom collections import defaultdict\nimport pandas as pd\nfrom .ernie_task_reader import ExtractEmbeddingReader\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:1-39" + }, + "1973": { + "file_id": 158, + "content": "FeatureReader class is a subclass of DataReader, which reads video features from files using Pickle. 
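The padding-plus-mask step that pad_batch_data performs can be shown with a compact NumPy sketch. This is illustrative only; pad_batch and the shapes used here are simplifications rather than the repository's exact API.

```python
# Illustrative sketch (not repository code): pad variable-length token-id lists
# to a common length and build an attention mask over the real tokens.
import numpy as np

def pad_batch(insts, pad_idx=0, max_len=None):
    if max_len is None:
        max_len = max(len(inst) for inst in insts)
    padded = np.array([inst + [pad_idx] * (max_len - len(inst)) for inst in insts],
                      dtype="int64")
    # 1 over real tokens, 0 over padding, so attention ignores the padded slots
    mask = np.array([[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts],
                    dtype="float32")
    seq_lens = np.array([len(inst) for inst in insts], dtype="int64")
    return padded, mask, seq_lens

tokens, mask, lens = pad_batch([[5, 7, 9], [3]], pad_idx=0)
print(tokens)   # [[5 7 9] [3 0 0]]
print(lens)     # [3 1]
```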
It uses pandas and includes ExtractEmbeddingReader to read Ernie tasks and provides data reader functions for train/test splits.", + "type": "comment" + }, + "1974": { + "file_id": 158, + "content": " \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad\n dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg):\n \"\"\"\n init\n \"\"\"\n self.name = name\n self.mode = mode\n self.num_classes = cfg.MODEL.num_classes\n # set batch size and file list\n self.batch_size = cfg[mode.upper()]['batch_size']\n self.filelist = cfg[mode.upper()]['filelist']\n self.eigen_file = cfg.MODEL.get('eigen_file', None)\n self.num_seg = cfg.MODEL.get('num_seg', None)\n self.loss_type = cfg.TRAIN['loss_type']\n vocab_file = os.path.join(cfg.TRAIN.ernie_pretrain_dict_path,\n 'vocab.txt')\n self.ernie_reader = ExtractEmbeddingReader(\n vocab_path=vocab_file,\n max_seq_len=cfg.MODEL.text_max_len,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:40-67" + }, + "1975": { + "file_id": 158, + "content": "The code initializes a data reader for YouTube-8M dataset, which contains features extracted by prior networks. It supports three models: LSTM, attention cluster, and NextVlad. The constructor takes the name, mode (train or test), and configuration parameters as inputs. It sets the batch size, file list, eigen_file (for NextVlad only), number of segments (num_seg), loss type, and initializes an ExtractEmbeddingReader using a vocab.txt file and maximum sequence length (text_max_len).", + "type": "comment" + }, + "1976": { + "file_id": 158, + "content": " do_lower_case=True)\n url_title_label_file = cfg[mode.upper()]['url_title_label_file']\n self.class_dict = load_class_file(cfg.MODEL.class_name_file)\n self.url_title_info = load_video_file(url_title_label_file,\n self.class_dict, mode)\n def create_reader(self):\n \"\"\"\n create reader\n \"\"\"\n url_list = list(self.url_title_info.keys())\n if self.mode == 'train':\n random.shuffle(url_list)\n def reader():\n \"\"\"reader\n \"\"\"\n batch_out = []\n for url in url_list:\n try:\n filepath = os.path.join(\n self.filelist,\n url.split('/')[-1].split('.')[0] + '.pkl')\n if os.path.exists(filepath) is False:\n continue\n if python_ver < (3, 0):\n record = pickle.load(open(filepath, 'rb'))\n else:", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:68-95" + }, + "1977": { + "file_id": 158, + "content": "The code loads a class dictionary and a video file information based on the given configuration. It then creates a reader function that iterates through the URLs, checks if a file exists for each URL, and skips if it doesn't. 
If the file exists, it loads the data (pickle format) using the appropriate pickle version for Python < 3.0 or >= 3.0.", + "type": "comment" + }, + "1978": { + "file_id": 158, + "content": " record = pickle.load(open(filepath, 'rb'),\n encoding='iso-8859-1')\n text_raw = self.url_title_info[url]['title']\n rgb = record['feature']['image_pkl'].astype(float)\n if record['feature']['audio_pkl'].shape[0] == 0:\n audio_pkl = np.zeros((10, 128))\n audio = audio_pkl.astype(float)\n else:\n audio = record['feature']['audio_pkl'].astype(float)\n text_one_hot = self.ernie_reader.data_generate_from_text(\n str(text_raw))\n video = record['video']\n if self.mode != 'infer':\n label = self.url_title_info[url]['label']\n label = [int(w) for w in label]\n if self.loss_type == 'sigmoid':\n label = make_one_hot(label, self.num_classes)\n elif self.loss_type == 'softmax':", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:96-113" + }, + "1979": { + "file_id": 158, + "content": "This code reads data from a file, prepares and processes it into various formats. It first loads the record from a file with pickle, then extracts text, RGB image data, and audio data (defaulting to zeroes if no audio is present). The code also generates one-hot encoding for the text using the ernie_reader. It obtains the video data and depending on the mode, assigns labels either as one-hot or softmax based on the loss type specified.", + "type": "comment" + }, + "1980": { + "file_id": 158, + "content": " label = make_one_soft_hot(label, self.num_classes,\n False)\n batch_out.append((rgb, audio, text_one_hot, label))\n else:\n batch_out.append((rgb, audio, text_one_hot, video))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n print(\"warning: load data {} failed, {}\".format(\n filepath, str(e)))\n traceback.print_exc()\n continue\n# if self.mode == 'infer' and len(batch_out) > 0:\n if len(batch_out) > 0:\n yield batch_out\n return reader\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:114-140" + }, + "1981": { + "file_id": 158, + "content": "This code is part of a data reader for multimodal video tagging. It reads in RGB images, audio clips, and text one-hot vectors, then appends them to a batch. If a label is available, it converts the label to a softmax output; otherwise, it yields the video itself. The code handles exceptions during data loading and allows for inferencing. 
Configuration values are retrieved using get_config_from_sec function.", + "type": "comment" + }, + "1982": { + "file_id": 158, + "content": "def load_video_file(label_file, class_dict, mode='train'):\n \"\"\"\n labelfile formate: URL \\t title \\t label1,label2\n return dict\n \"\"\"\n data = pd.read_csv(label_file, sep='\\t', header=None)\n url_info_dict = defaultdict(dict)\n for index, row in data.iterrows():\n url = row[0]\n if url in url_info_dict:\n continue\n if pd.isna(row[1]):\n title = \"\"\n else:\n title = str(row[1])\n if mode == 'infer':\n url_info_dict[url] = {'title': title}\n else:\n if pd.isna(row[2]):\n continue\n labels = row[2].split(',')\n labels_idx = [class_dict[w] for w in labels if w in class_dict]\n if len(labels_idx) < 1:\n continue\n if url not in url_info_dict:\n url_info_dict[url] = {'label': labels_idx, 'title': title}\n print('load video %d' % (len(url_info_dict)))\n return url_info_dict\ndef dequantize(feat_vector, max_quantized_value=2., min_quantized_value=-2.):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:143-173" + }, + "1983": { + "file_id": 158, + "content": "This code defines a function load_video_file() that reads a label file in tab-separated format and stores the URLs, titles, and labels into a dictionary called url_info_dict. It also contains another function dequantize(), but this one is not used in the current code block. The load_video_file() function checks for NA values and splits the labels by comma before processing. If 'mode' is set to 'infer', it only stores title information; otherwise, it processes the labels. Finally, it prints the number of processed videos and returns the url_info_dict dictionary.", + "type": "comment" + }, + "1984": { + "file_id": 158, + "content": " \"\"\"\n Dequantize the feature from the byte format to the float format\n \"\"\"\n assert max_quantized_value > min_quantized_value\n quantized_range = max_quantized_value - min_quantized_value\n scalar = quantized_range / 255.0\n bias = (quantized_range / 512.0) + min_quantized_value\n return feat_vector * scalar + bias\nepsilon = 0.1\nsmmoth_score = (1.0 / float(210)) * epsilon\ndef label_smmoth(label_one_hot_vector):\n \"\"\"\n label_smmoth\n \"\"\"\n global smmoth_score\n for i in range(len(label_one_hot_vector)):\n if label_one_hot_vector[i] == 0:\n label_one_hot_vector[i] = smmoth_score\n return label_one_hot_vector\ndef make_one_soft_hot(label, dim=15, label_smmoth=False):\n \"\"\"\n make_one_soft_hot\n \"\"\"\n one_hot_soft_label = np.zeros(dim)\n one_hot_soft_label = one_hot_soft_label.astype(float)\n # multi-labelis\n # label smmoth\n if label_smmoth:\n one_hot_soft_label = label_smmoth(one_hot_soft_label)\n label_len = len(label)\n prob = (1 - np.sum(one_hot_soft_label)) / float(label_len)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:174-212" + }, + "1985": { + "file_id": 158, + "content": "This code contains a series of functions for handling and manipulating label data. The 'feature_reader' function dequantizes feature values, while the 'label_smmoth' function modifies a one-hot label vector by replacing zeros with a specific smoothness value. 
The 'make_one_soft_hot' function creates a one-hot soft label based on the input label and applies label smoothing if specified.", + "type": "comment" + }, + "1986": { + "file_id": 158, + "content": " for ind in label:\n one_hot_soft_label[ind] += prob\n #one_hot_soft_label = label_smmoth(one_hot_soft_label)\n return one_hot_soft_label\ndef make_one_hot(label, dim=15):\n \"\"\"\n make_one_hot\n \"\"\"\n one_hot_soft_label = np.zeros(dim)\n one_hot_soft_label = one_hot_soft_label.astype(float)\n for ind in label:\n one_hot_soft_label[ind] = 1\n return one_hot_soft_label\ndef generate_random_idx(feature_len, num_seg):\n \"\"\"\n generate_random_idx\n \"\"\"\n idxs = []\n stride = float(feature_len) / num_seg\n for i in range(num_seg):\n pos = (i + np.random.random()) * stride\n idxs.append(min(feature_len - 1, int(pos)))\n return idxs\ndef get_batch_ernie_input_feature(reader, texts):\n \"\"\"\n get_batch_ernie_input_feature\n \"\"\"\n result_list = reader.data_generate_from_texts(texts)\n result_trans = []\n for i in range(len(texts)):\n result_trans.append([result_list[0][i],\\\n result_list[1][i],\n result_list[2][i],", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:213-251" + }, + "1987": { + "file_id": 158, + "content": "This code defines several functions for generating one-hot labels, creating random indices, and getting batch input features for a specific application. It uses numpy arrays for efficient operations and handling multidimensional data. The functions can be used in the context of multimodal video analysis, where labels, text inputs, and other data are processed for further processing or model training.", + "type": "comment" + }, + "1988": { + "file_id": 158, + "content": " result_list[3][i],\n result_list[4][i]])\n return np.array(result_trans)\ndef load_class_file(class_file):\n \"\"\"\n load_class_file\n \"\"\"\n class_lines = open(class_file, 'r', encoding='utf8').readlines()\n class_dict = {}\n for i, line in enumerate(class_lines):\n tmp = line.strip().split('\\t')\n word = tmp[0]\n index = str(i)\n if len(tmp) == 2:\n index = tmp[1]\n class_dict[word] = index\n return class_dict\nif __name__ == '__main__':\n pass", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:252-274" + }, + "1989": { + "file_id": 158, + "content": "The code contains a function that loads and returns a dictionary containing words and their corresponding indices from a class file. The function reads the lines of the file, removes any leading or trailing whitespace, splits the line based on tabs, assigns the first element as the word and the second element as the index (if available), then adds these key-value pairs to a dictionary. This dictionary is returned as the result.", + "type": "comment" + }, + "1990": { + "file_id": 159, + "content": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py", + "type": "filepath" + }, + "1991": { + "file_id": 159, + "content": "The code defines a custom exception \"ReaderNotFoundError\" and manages reader instances using a singleton ReaderZoo object. The `regist_reader` function registers new readers, while the `get_reader` function retrieves and returns an instance of the requested reader based on name, mode, and configuration.", + "type": "summary" + }, + "1992": { + "file_id": 159, + "content": "\"\"\"\nreader utils\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
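To make the label construction described above concrete, here is a hedged sketch (not repository code) of a hard multi-hot target and an evenly-spread soft target of the kind make_one_hot / make_one_soft_hot build; the label-smoothing branch is omitted for brevity.

```python
# Illustrative sketch (not repository code): hard and soft multi-label targets.
import numpy as np

def one_hot(label_ids, num_classes):
    vec = np.zeros(num_classes, dtype=np.float32)
    vec[label_ids] = 1.0
    return vec

def soft_hot(label_ids, num_classes):
    # spread a total mass of 1.0 evenly over the positive labels (no smoothing)
    vec = np.zeros(num_classes, dtype=np.float32)
    vec[label_ids] = 1.0 / len(label_ids)
    return vec

print(one_hot([1, 3], 5))   # [0. 1. 0. 1. 0.]
print(soft_hot([1, 3], 5))  # [0.  0.5 0.  0.5 0. ]
```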
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nclass ReaderNotFoundError(Exception):\n \"Error: reader not found\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)\n for reader in self.avail_readers:", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:1-30" + }, + "1993": { + "file_id": 159, + "content": "The code defines a class \"ReaderNotFoundError\" which is an exception to handle situations when a reader is not found. It takes the name of the missing reader and a list of available readers as arguments, and provides a formatted error message with the missing reader's name and a list of available readers. This can be useful for raising custom errors in cases where the required reader cannot be found or is not compatible with the provided options.", + "type": "comment" + }, + "1994": { + "file_id": 159, + "content": " msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"data reader for video input\"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"Not implemented\"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n \"\"\"ReaderZoo\n \"\"\"\n def __init__(self):\n self.reader_zoo = {}\n def regist(self, name, reader):\n \"\"\"regist\n \"\"\"\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg):\n \"\"\"get\n \"\"\"\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:31-73" + }, + "1995": { + "file_id": 159, + "content": "This code defines classes for data readers and a reader registry. The `DataReader` class is a base class for different video input data readers, while the `ReaderZoo` class manages a dictionary of registered readers. 
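The registry pattern described for reader_utils.py (a module-level ReaderZoo mapping names to DataReader subclasses, with regist/get helpers) can be sketched as follows. This is an illustrative reimplementation; FakeFeatureReader and the dummy batch are invented for the example.

```python
# Illustrative sketch (not repository code) of the name -> reader-class registry.
class DataReader:
    """Base class every registered reader is expected to subclass."""
    def __init__(self, name, mode, cfg):
        self.name, self.mode, self.cfg = name, mode, cfg
    def create_reader(self):
        raise NotImplementedError

class ReaderZoo:
    def __init__(self):
        self._zoo = {}
    def regist(self, name, reader_cls):
        assert issubclass(reader_cls, DataReader), "unknown reader type"
        self._zoo[name] = reader_cls
    def get(self, name, mode, cfg):
        if name not in self._zoo:
            raise KeyError("Reader %s not found, available: %s" % (name, list(self._zoo)))
        return self._zoo[name](name, mode, cfg)

reader_zoo = ReaderZoo()                      # module-level singleton

class FakeFeatureReader(DataReader):          # invented for the example
    def create_reader(self):
        def reader():
            yield [("rgb", "audio", "text", [0])]   # one dummy batch
        return reader

reader_zoo.regist("ATTENTIONLSTMERNIE", FakeFeatureReader)
batch_iter = reader_zoo.get("ATTENTIONLSTMERNIE", "train", cfg={}).create_reader()
print(next(batch_iter()))
```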
The code snippet includes methods to register readers and retrieve them by name, mode, and configuration.", + "type": "comment" + }, + "1996": { + "file_id": 159, + "content": " raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo\nreader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n \"\"\"regist_reader\n \"\"\"\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg):\n \"\"\"get_reader\n \"\"\"\n reader_model = reader_zoo.get(name, mode, cfg)\n return reader_model.create_reader()", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:74-91" + }, + "1997": { + "file_id": 159, + "content": "This code snippet is responsible for managing reader instances, using a singleton ReaderZoo object. The `regist_reader` function allows registration of new readers, while the `get_reader` function retrieves and returns an instance of the requested reader based on the provided name, mode, and configuration.", + "type": "comment" + }, + "1998": { + "file_id": 160, + "content": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py", + "type": "filepath" + }, + "1999": { + "file_id": 160, + "content": "This code provides text to Unicode conversion and printable encoding functions, with tokenization classes for Chinese characters, punctuation splitting, and WordpieceTokenizing, preparing the text for further processing.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/2.json b/docs/data/2.json new file mode 100644 index 000000000..dd43812d4 --- /dev/null +++ b/docs/data/2.json @@ -0,0 +1,543 @@ +{ + "200": { + "file_id": 21, + "content": "\"\"\"\nreader_util\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nclass ReaderNotFoundError(Exception):\n \"\"\"\n \"Error: reader not found\"\n \"\"\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:1-34" + }, + "201": { + "file_id": 21, + "content": "This code defines a class \"ReaderNotFoundError\" for handling reader not found exceptions with the possibility to specify the unavailable reader name and available readers.", + "type": "comment" + }, + "202": { + "file_id": 21, + "content": " for reader in self.avail_readers:\n msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"\n data reader for video input\n \"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"\n Not implemented\n \"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"\n 
get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n \"\"\"\n ReaderZoo\n \"\"\"\n def __init__(self):\n \"\"\"\n __init__\n \"\"\"\n self.reader_zoo = {}\n def regist(self, name, reader):\n \"\"\"\n regist\n \"\"\"\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg, material=None):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:35-83" + }, + "203": { + "file_id": 21, + "content": "This code defines classes for video input data readers and a reader zoo. The DataReader class initializes with a model name, mode, and configuration. It has methods to create readers (not implemented) and get config from sections. The ReaderZoo class manages registered readers in a zoo, allowing easy access and usage of different reader types for video input data.", + "type": "comment" + }, + "204": { + "file_id": 21, + "content": " \"\"\"\n get\n \"\"\"\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg, material)\n raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo\nreader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n \"\"\"\n regist_reader\n \"\"\"\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg, material=None):\n \"\"\"\n get_reader\n \"\"\"\n reader_model = reader_zoo.get(name, mode, cfg, material)\n return reader_model.create_reader()", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:84-109" + }, + "205": { + "file_id": 21, + "content": "This code defines a singleton reader_zoo and provides functions for registering readers and getting a specific reader. The get_reader function returns the created reader instance based on the provided name, mode, configuration (cfg), and material (if any). If the reader is not found, it raises ReaderNotFoundError with available reader names as information.", + "type": "comment" + }, + "206": { + "file_id": 22, + "content": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py", + "type": "filepath" + }, + "207": { + "file_id": 22, + "content": "TSMINFReader is a multiprocessing-enabled video reader in jpg format, applying transformations for machine learning models. It computes crop positions, performs random cropping, resizing, flipping, and centering on groups of images with fault-tolerant image reading.", + "type": "summary" + }, + "208": { + "file_id": 22, + "content": "\"\"\"\ntsn frame reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport random\nimport functools\nimport concurrent.futures\nimport multiprocessing\nimport numpy as np\nimport paddle\nfrom PIL import Image, ImageEnhance\nfrom .reader_utils import DataReader\nclass TSMINFReader(DataReader):\n \"\"\"\n Data reader for video dataset of jpg folder.\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n super(TSMINFReader, self).__init__(name, mode, cfg)", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:1-37" + }, + "209": { + "file_id": 22, + "content": "TSMINFReader is a data reader for video datasets in jpg format. It inherits from DataReader and takes parameters name, mode, and cfg. It supports multiprocessing for improved performance.", + "type": "comment" + }, + "210": { + "file_id": 22, + "content": " name = name.upper()\n self.num_seg = cfg[name]['num_seg']\n self.seglen = cfg[name]['seglen']\n self.short_size = cfg[name]['short_size']\n self.target_size = cfg[name]['target_size']\n self.batch_size = cfg[name]['batch_size']\n self.reader_threads = cfg[name]['reader_threads']\n self.buf_size = cfg[name]['buf_size']\n self.video_path = cfg[name]['frame_list']\n self.img_mean = np.array(cfg[name]['image_mean']).reshape(\n [3, 1, 1]).astype(np.float32)\n self.img_std = np.array(cfg[name]['image_std']).reshape(\n [3, 1, 1]).astype(np.float32)\n self.material = material\n def create_reader(self):\n \"\"\"\n batch loader for TSN\n \"\"\"\n _reader = self._inference_reader_creator_longvideo(\n self.video_path,\n self.mode,\n num_seg=self.num_seg,\n seglen=self.seglen,\n short_size=self.short_size,\n target_size=self.target_size,\n img_mean=self.img_mean,", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:38-66" + }, + "211": { + "file_id": 22, + "content": "The code sets various configuration parameters such as number of segments, segment length, short and target image sizes, batch size, reader threads, buffer size, video path, and image mean and standard deviation for a TSN inference reader. 
It then creates the TSN inference reader using these parameters.", + "type": "comment" + }, + "212": { + "file_id": 22, + "content": " img_std=self.img_std,\n num_threads=self.reader_threads,\n buf_size=self.buf_size)\n def _batch_reader():\n batch_out = []\n for imgs, label in _reader():\n if imgs is None:\n continue\n batch_out.append((imgs, label))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 1:\n yield batch_out[:-1]\n return _batch_reader\n def _inference_reader_creator_longvideo(self, video_path, mode, num_seg,\n seglen, short_size, target_size,\n img_mean, img_std, num_threads,\n buf_size):\n \"\"\"\n inference reader for video\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n def image_buf(image_id_path_buf):\n \"\"\"", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:67-97" + }, + "213": { + "file_id": 22, + "content": "This code defines a video inference reader for the PaddleVideo application's BasketballAction module. It creates a batch reader to process images and labels from the given video path, considering various parameters such as image mean, standard deviation, number of threads, and buffer size. The batch reader yields batches of images and labels until reaching the specified batch size or finishing processing all data.", + "type": "comment" + }, + "214": { + "file_id": 22, + "content": " image_buf reader\n \"\"\"\n try:\n img_path = image_id_path_buf[1]\n img = Image.open(img_path).convert(\"RGB\")\n image_id_path_buf[2] = img\n except:\n image_id_path_buf[2] = None\n frame_len = len(video_path)\n read_thread_num = num_seg\n for i in range(0, frame_len, read_thread_num):\n image_list_part = video_path[i:i + read_thread_num]\n image_id_path_buf_list = []\n for k in range(len(image_list_part)):\n image_id_path_buf_list.append([k, image_list_part[k], None])\n with concurrent.futures.ThreadPoolExecutor(\n max_workers=read_thread_num) as executor:\n executor.map(\n lambda image_id_path_buf: image_buf(image_id_path_buf),\n image_id_path_buf_list)\n imgs_seg_list = [x[2] for x in image_id_path_buf_list]", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:98-120" + }, + "215": { + "file_id": 22, + "content": "This code uses multithreading to process video frames into images. It opens image paths, converts them to RGB format, and stores them in a list for later use. The code then maps the image processing task onto multiple threads to increase efficiency. 
Finally, it collects the processed images from each thread and stores them in the imgs_seg_list variable.", + "type": "comment" + }, + "216": { + "file_id": 22, + "content": " # add the fault-tolerant for bad image\n for k in range(len(image_id_path_buf_list)):\n img_buf = image_id_path_buf_list[k][2]\n pad_id = 1\n while pad_id < num_seg and img_buf is None:\n img_buf = imgs_seg_list[(k + pad_id) % num_seg][2]\n if img_buf is None:\n print(\"read img erro from {} to {}\".format(\n i, i + read_thread_num))\n exit(0)\n else:\n imgs_seg_list[k] = img_buf\n for pad_id in range(len(imgs_seg_list), num_seg):\n imgs_seg_list.append(imgs_seg_list[-1])\n yield imgs_seg_list\n def inference_imgs_transform(imgs_list, mode, num_seg, seglen, short_size,\\\n target_size, img_mean, img_std):\n \"\"\"\n inference_imgs_transform\n \"\"\"\n imgs_ret = imgs_transform(imgs_list, mode, num_seg, seglen,", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:122-144" + }, + "217": { + "file_id": 22, + "content": "This code handles fault-tolerant reading of images for a specified range. It checks if the image buffer is None and if so, attempts to retrieve it from other segments. If an image cannot be retrieved, it prints an error message and exits. Additionally, it appends extra segments with the last image in case there are fewer than num_segments required. Finally, it yields the updated imgs_seg_list for further processing in the inference_imgs_transform function.", + "type": "comment" + }, + "218": { + "file_id": 22, + "content": " short_size, target_size, img_mean,\n img_std)\n label_ret = 0\n return imgs_ret, label_ret\n mapper = functools.partial(inference_imgs_transform,\n mode=mode,\n num_seg=num_seg,\n seglen=seglen,\n short_size=short_size,\n target_size=target_size,\n img_mean=img_mean,\n img_std=img_std)\n return paddle.reader.xmap_readers(mapper,\n reader,\n num_threads,\n buf_size,\n order=True)\ndef imgs_transform(imgs,\n mode,\n num_seg,\n seglen,\n short_size,\n target_size,", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:145-172" + }, + "219": { + "file_id": 22, + "content": "This code defines a function `imgs_transform` that takes in images, mode, number of segments, segment length, short size, target size as input. It applies transformations to the images based on the given parameters and returns the transformed images. The `mapper` is defined as a partial function of `inference_imgs_transform`, with parameters such as mode, num_seg, seglen, short_size, target_size, img_mean, and img_std. 
Finally, the code returns the result of `paddle.reader.xmap_readers` which applies the mapper function to the reader, with given parameters like num_threads and buf_size.", + "type": "comment" + }, + "220": { + "file_id": 22, + "content": " img_mean,\n img_std,\n name=''):\n \"\"\"\n imgs_transform\n \"\"\"\n imgs = group_scale(imgs, short_size)\n if mode == 'train':\n if name == \"TSM\":\n imgs = group_multi_scale_crop(imgs, short_size)\n imgs = group_random_crop(imgs, target_size)\n imgs = group_random_flip(imgs)\n else:\n imgs = group_center_crop(imgs, target_size)\n np_imgs = (np.array(imgs[0]).astype('float32').transpose(\n (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n for i in range(len(imgs) - 1):\n img = (np.array(imgs[i + 1]).astype('float32').transpose(\n (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n np_imgs = np.concatenate((np_imgs, img))\n imgs = np_imgs\n imgs -= img_mean\n imgs /= img_std\n imgs = np.reshape(imgs, (num_seg, seglen * 3, target_size, target_size))\n return imgs\ndef group_multi_scale_crop(img_group, target_size, scales=None, \\\n max_distort=1, fix_crop=True, more_fix_crop=True):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:173-203" + }, + "221": { + "file_id": 22, + "content": "The code defines a function \"imgs_transform\" that takes in images and applies various transformations depending on the mode ('train' or 'test'). For training, it performs multi-scale cropping, random cropping, and random flipping. For testing, it centers crops the images. It then normalizes the images by subtracting the mean and dividing by standard deviation. Finally, it reshapes the images into a specific format and returns them.", + "type": "comment" + }, + "222": { + "file_id": 22, + "content": " \"\"\"\n group_multi_scale_crop\n \"\"\"\n scales = scales if scales is not None else [1, .875, .75, .66]\n input_size = [target_size, target_size]\n im_size = img_group[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n \"\"\"\n _sample_crop_size\n \"\"\"\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in scales]\n crop_h = [\n input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x\n for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])\n h_offset = random.randint(0, image_h - crop_pair[1])", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:204-239" + }, + "223": { + "file_id": 22, + "content": "This code generates a random crop size based on predefined scales and applies it to an input image. It ensures the generated crop size is close to the target size and adjusts the offset to maintain aspect ratio if necessary. The function also handles cases where the maximum distance between width and height is specified by the max_distort parameter. 
If fix_crop is False, it further adds random offsets to the selected crop size.", + "type": "comment" + }, + "224": { + "file_id": 22, + "content": " else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if h_step != 0 and w_step != 0:\n ret.append((4 * w_step, 4 * h_step)) # lower right\n if h_step != 0 or w_step != 0:\n ret.append((2 * w_step, 2 * h_step)) # center\n if more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter\n ret.append((3 * w_step, 1 * h_step)) # upper right quarter", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:240-262" + }, + "225": { + "file_id": 22, + "content": "This code calculates crop positions for an image based on its width and height. It generates a list of potential cropping locations, including upper left/right, lower left/right, center, center left/right, upper left quarter, upper right quarter, and center. The calculations are done in case more_fix_crop is set to True, otherwise only the basic crop positions will be included.", + "type": "comment" + }, + "226": { + "file_id": 22, + "content": " ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n crop_info = {\n 'crop_w': crop_pair[0],\n 'crop_h': crop_pair[1],\n 'offset_w': w_offset,\n 'offset_h': h_offset\n }\n return crop_info\n crop_info = _sample_crop_size(im_size)\n crop_w = crop_info['crop_w']\n crop_h = crop_info['crop_h']\n offset_w = crop_info['offset_w']\n offset_h = crop_info['offset_h']\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))\n for img in img_group\n ]\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n return ret_img_group\ndef group_random_crop(img_group, target_size):\n \"\"\"\n group_random_crop\n \"\"\"\n w, h = img_group[0].size\n th, tw = target_size, target_size", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:263-298" + }, + "227": { + "file_id": 22, + "content": "The code defines a function `group_random_crop` that takes an image group and a target size as input, performs random cropping on each image in the group with different crop sizes, offsets, and resizes them to the specified target size. 
The cropped images are then returned in a group.", + "type": "comment" + }, + "228": { + "file_id": 22, + "content": " assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h)\n out_images = []\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)\n for img in img_group:\n if w == tw and h == th:\n out_images.append(img)\n else:\n out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return out_images\ndef group_random_flip(img_group):\n \"\"\"\n group_random_flip\n \"\"\"\n v = random.random()\n if v < 0.5:\n ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]\n return ret\n else:\n return img_group\ndef group_center_crop(img_group, target_size):\n \"\"\"\n group_center_crop\n \"\"\"\n img_crop = []\n for img in img_group:\n w, h = img.size\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h)\n x1 = int(round((w - tw) / 2.))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:300-338" + }, + "229": { + "file_id": 22, + "content": "This code is used to preprocess images by cropping, flipping, and centering them for machine learning models. The \"group_center_crop\" function crops images to a specific target size while ensuring the image dimensions are larger than the crop size. The \"group_random_flip\" function randomly flips the images horizontally with a 50% chance. The preprocessed images are returned in a list format.", + "type": "comment" + }, + "230": { + "file_id": 22, + "content": " y1 = int(round((h - th) / 2.))\n img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return img_crop\ndef group_scale(imgs, target_size):\n \"\"\"\n group_scale\n \"\"\"\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == target_size) or (h <= w and h == target_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = target_size\n oh = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = target_size\n ow = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n return resized_imgs", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:339-366" + }, + "231": { + "file_id": 22, + "content": "The code defines two functions: \"crop_imgs\" and \"group_scale\". The \"crop_imgs\" function takes an image and crops it based on the provided top-left x, y coordinates, width, and height. It then appends the cropped images to a list and returns that list. The \"group_scale\" function resizes a group of images to a target size by checking if each image's dimensions already match the target size, and if not, it adjusts the dimensions using a 4:3 aspect ratio. It then appends the resized images to a list and returns that list.", + "type": "comment" + }, + "232": { + "file_id": 23, + "content": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py", + "type": "filepath" + }, + "233": { + "file_id": 23, + "content": "The code is from PaddleVideo's BasketballAction application, importing modules and defining AttrDict class. 
It loads config file into an AttrDict object, processes nested dictionaries, prints configurations, and logs a separator line using the logger module for organization and readability purposes.", + "type": "summary" + }, + "234": { + "file_id": 23, + "content": "\"\"\"\nconfig_utils\n\"\"\"\n# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport yaml\nimport ast\nimport logger\nlogger = logger.Logger()\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\nclass AttrDict(dict):\n \"\"\"\n AttrDict\n \"\"\"\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef parse_config(cfg_file):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:1-46" + }, + "235": { + "file_id": 23, + "content": "This code is from the PaddleVideo library's BasketballAction application. It imports yaml and ast modules, as well as a logger class. The code defines a constant list of section names (train, valid, test, infer). It also defines an AttrDict class to handle dictionaries with attributes like getattr and setattr methods. The parse_config function is defined which takes a configuration file as input.", + "type": "comment" + }, + "236": { + "file_id": 23, + "content": " \"\"\"Load a config file into AttrDict\"\"\"\n import yaml\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef create_attr_dict(yaml_config):\n \"\"\"create_attr_dict\"\"\"\n for key, value in yaml_config.items():\n if isinstance(value, dict):\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = ast.literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\n return\ndef print_configs(cfg, mode):\n \"\"\"print_configs\"\"\"\n logger.info(\"---------------- {:>5} Arguments ----------------\".format(\n mode))\n for sec, sec_items in cfg.items():\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():\n logger.info(\" {}:{}\".format(k, v))", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:47-79" + }, + "237": { + "file_id": 23, + "content": "This code is responsible for loading a configuration file into an AttrDict object, processing the nested dictionary structure, and printing the configurations. It uses the yaml library to load the file, and the create_attr_dict function to handle nested dictionaries and convert strings to appropriate data types. 
The print_configs function prints the configuration in a formatted manner for readability.", + "type": "comment" + }, + "238": { + "file_id": 23, + "content": " logger.info(\"-------------------------------------------------\")", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:80-80" + }, + "239": { + "file_id": 23, + "content": "This code snippet is logging a separator line using the logger module. The purpose of this logger statement might be to visually separate different sections or parts of the code for readability and organization purposes.", + "type": "comment" + }, + "240": { + "file_id": 24, + "content": "/applications/BasketballAction/predict/action_detect/utils/preprocess.py", + "type": "filepath" + }, + "241": { + "file_id": 24, + "content": "This code contains four functions that utilize the FFmpeg tool for handling video and audio files. \"ffmpeg_frames\" extracts frames from a given MP4 file, \"ffmpeg_pcm\" extracts audio in PCM format, \"ffmpeg_mp4\" downloads an MP4 file, and \"get_images\" lists the images inside a specified image directory.", + "type": "summary" + }, + "242": { + "file_id": 24, + "content": "\"\"\" extract frames and pcm\"\"\"\nimport os\nimport sys\nimport shutil\ndef ffmpeg_frames(mp4_addr, frame_out_folder, fps=5):\n \"\"\"ffmpeg_frames\"\"\"\n if os.path.exists(frame_out_folder):\n shutil.rmtree(frame_out_folder)\n os.makedirs(frame_out_folder)\n cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (mp4_addr, fps, frame_out_folder, '%08d')\n os.system(cmd)\ndef ffmpeg_pcm(mp4_addr, save_file_name):\n \"\"\"ffmpeg_pcm\"\"\"\n cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \\\n % (mp4_addr, save_file_name)\n os.system(cmd)\ndef ffmpeg_mp4(mp4_url, mp4_addr):\n \"\"\"ffmpeg_mp4\"\"\"\n cmd = \"wget %s -O %s -q\" % (mp4_url, mp4_addr)\n print (\"cmd = \", cmd)\n os.system(cmd)\ndef get_images(image_path):\n \"\"\"get_images\"\"\"\n images = sorted(os.listdir(image_path))\n images = images\n images_path_list = [image_path + '/' + im for im in images]\n return images_path_list", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/preprocess.py:1-35" + }, + "243": { + "file_id": 24, + "content": "This code contains four functions that utilize the FFmpeg tool for handling video and audio files. \"ffmpeg_frames\" extracts frames from a given MP4 file, \"ffmpeg_pcm\" extracts audio in PCM format, \"ffmpeg_mp4\" downloads an MP4 file, and \"get_images\" lists the images inside a specified image directory.", + "type": "comment" + }, + "244": { + "file_id": 25, + "content": "/applications/BasketballAction/predict/action_detect/utils/process_result.py", + "type": "filepath" + }, + "245": { + "file_id": 25, + "content": "The code retrieves data, applies NMS to bounding box proposals, filters detected actions from videos using NMS, and stores relevant information in the \"video_results\" list. 
It defines a function `get_action_result` that takes inputs and performs NMS on processed results.", + "type": "summary" + }, + "246": { + "file_id": 25, + "content": "\"\"\"\n# @File : process_result.py \n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport sys\nimport os\nimport re\nimport numpy as np\nimport pickle\nimport json\nimport logger\nlogger = logger.Logger()\ndef get_data_res(label_map, data, topk):\n \"\"\"get_data_res\"\"\"\n sum_vid = len(data)\n video_result = []\n for i in range(sum_vid):\n vid_name = data[i][0][0]\n # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa\n feature_start_id = float(data[i][0][1]['start'])\n feature_end_id = float(data[i][0][1]['end'])\n feature_stage1_score = data[i][0][1]['score']\n predict_res = []\n for k in range(topk):\n score_top = data[i][1][k]\n labelid_top = data[i][2][k]\n label_iou = data[i][3]\n labelname_top = label_map[str(labelid_top)]\n video_result.append([feature_start_id, feature_end_id, labelid_top, labelname_top, score_top, label_iou])\n return video_result\ndef base_nms(bboxes, thresh, delta=0, nms_id=2):", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:1-39" + }, + "247": { + "file_id": 25, + "content": "This code defines two functions: `get_data_res` and `base_nms`. The first function takes in a label map, data (a list of features), and a topk value. It iterates through each video in the data, extracts relevant information from the feature, and appends this information to a new list called `video_result`. Finally, it returns the `video_result` list. The second function is an incomplete definition for a non-maximum suppression algorithm used for bounding boxes. It takes in bboxes (bounding box coordinates), thresh (threshold value), delta (optional parameter with default value 0), and nms_id (an identifier for the NMS operation, with a default value of 2).", + "type": "comment" + }, + "248": { + "file_id": 25, + "content": " \"\"\"\n One-dimensional non-maximal suppression\n :param bboxes: [[vid, label, st, ed, score, ...], ...]\n :param thresh:\n :return:\n \"\"\"\n \"\"\"\n t1 = bboxes[:, 0]\n t2 = bboxes[:, 1]\n scores = bboxes[:, nms_id]\n \"\"\"\n t1 = np.array([max(0, x[0] - delta) for x in bboxes])\n t2 = np.array([x[1] + delta for x in bboxes])\n scores = np.array([x[nms_id] for x in bboxes])\n durations = t2 - t1\n order = scores.argsort()[::-1]\n keep = []\n while order.size > 0:\n i = order[0]\n keep.append(i)\n tt1 = np.maximum(t1[i], t1[order[1:]])\n tt2 = np.minimum(t2[i], t2[order[1:]])\n intersection = tt2 - tt1\n IoU = intersection / (durations[i] + durations[order[1:]] - intersection).astype(float)\n inds = np.where(IoU <= thresh)[0]\n order = order[inds + 1]\n return [bboxes[i] for i in keep]\ndef process_proposal(source_prop_box, min_frame_thread=5, nms_thresh=0.7, score_thresh=0.01):\n \"\"\"process_video_prop\"\"\"\n prop_box = []\n for items in source_prop_box:", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:40-76" + }, + "249": { + "file_id": 25, + "content": "This code performs non-maximal suppression on bounding box proposals. It filters out overlapping boxes by keeping only those with the highest scores and discarding the rest. 
The function process_proposal takes source bounding box proposals, applies non-maximal suppression with a threshold, and returns the filtered results.", + "type": "comment" + }, + "250": { + "file_id": 25, + "content": " start_frame = float(items[0])\n end_frame = float(items[1])\n score = float(items[2])\n if end_frame - start_frame < min_frame_thread or score < score_thresh:\n continue\n prop_box.append([start_frame, end_frame, score])\n prop_box_keep = base_nms(prop_box, nms_thresh)\n prop_res = []\n for res in prop_box_keep:\n prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]})\n return prop_res\ndef process_video_classify(video_prop, fps, score_thread, iou_thread, \\\n nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0):\n \"\"\"process_video_classify\"\"\"\n prop_filter = []\n for item in video_prop:\n if item[2] == backgroundid:\n continue\n prop_filter.append(item)\n # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True)\n prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id)\n prop_filter = sorted(prop_filter, key=lambda x: x[0])\n video_results = []\n for item in prop_filter:", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:77-107" + }, + "251": { + "file_id": 25, + "content": "This code is part of a video classification process. It filters and sorts the detected actions in a video, discarding background or weak detections. The results are stored in 'prop_res' and 'video_results'. The code applies non-maximum suppression (NMS) to filter and sort the detections based on frame duration, score threshold, and other parameters like fps, nms_thread, and nms_delta.", + "type": "comment" + }, + "252": { + "file_id": 25, + "content": " start_sec = item[0] / fps\n end_sec = item[1] / fps\n start_id_frame = item[0]\n end_id_frame = item[1]\n # start_time = \"%02d:%02d:%02d\" % ((start_id_frame / fps) / 3600, \\\n # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60)\n # end_time = \"%02d:%02d:%02d\" % ((end_id_frame / fps) / 3600, \\\n # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60)\n start_time = int(start_id_frame / fps)\n end_time = int(end_id_frame / fps)\n label_id = item[2]\n label_name = item[3]\n label_classify_score = item[4]\n label_iou_score = item[5]\n if label_classify_score > score_thread and label_iou_score > iou_thread:\n video_results.append({\"start_time\": start_time,\n \"end_time\": end_time,\n \"label_id\": label_id,\n \"label_name\": label_name,\n \"classify_score\": label_classify_score,", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:108-129" + }, + "253": { + "file_id": 25, + "content": "This code calculates the start and end time in seconds, frame IDs, and other relevant details of detected actions from a video. 
It then appends these details as a dictionary to the \"video_results\" list if the classify score and IoU score exceed certain thresholds.", + "type": "comment" + }, + "254": { + "file_id": 25, + "content": " \"iou_score\": label_iou_score})\n return video_results\ndef get_action_result(result_info, label_map_file, fps, score_thread=0, \\\n iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1):\n \"\"\"get_action_result\"\"\"\n label_map = json.load(open(label_map_file, 'r', encoding='utf-8'))\n org_result = get_data_res(label_map, result_info, topk)\n nms_result = process_video_classify(org_result, fps, score_thread, iou_thread, nms_id, nms_thread, frame_offset)\n return nms_result", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/utils/process_result.py:130-144" + }, + "255": { + "file_id": 25, + "content": "This code defines a function `get_action_result` that takes in `result_info`, `label_map_file`, `fps`, `score_thread`, `iou_thread`, `nms_id`, `nms_thread`, and `frame_offset` as inputs. It reads the label map from `label_map_file`, processes the result data using `get_data_res` function, performs non-maximum suppression (NMS) on the processed results with specified parameters, and returns the final NMS results.", + "type": "comment" + }, + "256": { + "file_id": 26, + "content": "/applications/BasketballAction/predict/eval.py", + "type": "filepath" + }, + "257": { + "file_id": 26, + "content": "The code defines a function for loading annotations and includes helper functions, iterates through label ranges and thresholds to find the best combination of IOU and score threshold for evaluating basketball actions, calculates evaluation results, updates best scores, and prints these best scores along with the evaluation results.", + "type": "summary" + }, + "258": { + "file_id": 26, + "content": "\"\"\"\nget instance for lstm\n根据gts计算每个proposal_bmn的iou、ioa、label等信息\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport io\nsys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding = 'utf-8')\ndataset = \"datasets/\"\nlabel_index_file = './configs_basketball/index_label_basketball_6.json'\neval_datasets = ['EuroCup2016']\nlabel_files = {'train': 'label_cls6_train.json',\n 'validation': 'label_cls6_val.json'}\nglobal fps, mode\nlabel_index = json.load(open(label_index_file, 'rb'))\ndef load_gts():\n global fps\n gts_data = {'fps': 0, 'gts': {}}\n for eval_data in eval_datasets:\n for item, value in label_files.items():\n label_file = '{}/{}/{}'.format(dataset, eval_data, value)\n gts = json.load(open(label_file, 'rb'))\n gts_data['fps'] = gts['fps']\n fps = gts['fps']\n for gt in gts['gts']:\n gt['mode'] = item\n basename = '{}/{}/mp4/{}'.format(dataset, eval_data, os.path.basename(gt['url']))", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:1-36" + }, + "259": { + "file_id": 26, + "content": "This code defines a function called `load_gts()` which loads ground truth annotations (gts) for video evaluation. It imports necessary modules, sets up global variables like fps and mode, and utilizes a JSON file to map labels to their indices. The gts data is stored in a dictionary with 'fps' and 'gts' keys, where 'fps' stores the frame rate and 'gts' stores individual annotations for each video. 
Each annotation has a 'mode' key indicating whether it's from training or validation set.", + "type": "comment" + }, + "260": { + "file_id": 26, + "content": " gts_data['gts'][basename] = gt\n return gts_data['gts']\ndef computeIoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']):\n return 0.\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 - inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n if not mode == 'proposal':\n iou = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou\ndef convert_proposal(boxes, basename, score_threshold=0.01):\n boxes = sorted(boxes, key=lambda x:float(x['score']), reverse=True)\n res = []\n for box in boxes:\n if not float(box['score']) >= score_threshold:\n continue\n res.append({'basename': basename,\n 'start': int(float(box['start']) / fps),\n 'end': int(float(box['end']) / fps),\n 'label': 0})", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:37-67" + }, + "261": { + "file_id": 26, + "content": "This code snippet defines three functions: \"get_gt\", \"computeIoU\", and \"convert_proposal\". The \"get_gt\" function takes a baseline name and returns the ground truth (GT) for that specific baseline. The \"computeIoU\" function calculates the intersection over union (IoU) between two events. Lastly, the \"convert_proposal\" function converts event proposals into ground truths based on their scores, threshold, and frame rates.", + "type": "comment" + }, + "262": { + "file_id": 26, + "content": " return res\ndef convert_classify(boxes, basename, iou_threshold, score_threshold):\n boxes = sorted(boxes, key=lambda x:(float(x['classify_score']), float(x['iou_score'])), reverse=True)\n def convert_time_to_frame(time_type):\n return int(time_type)\n h, m, s = time_type.split(':')\n return int(h) * 3600 + int(m) * 60 + int(s)\n res = []\n for box in boxes:\n if not (box['iou_score'] >= iou_threshold and\n box['classify_score'] >= score_threshold):\n continue\n res.append({'basename': basename,\n 'start': convert_time_to_frame(box['start_time']),\n 'end': convert_time_to_frame(box['end_time']),\n 'label': box['label_id']})\n return res\ndef convert_groundtruth(boxes, basename, phase=None):\n res = []\n for box in boxes:\n for item in box['label_ids']:\n label = 0 if phase == 'proposal' else item\n res.append({'basename': basename,\n 'start': box['start_id'],", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:68-93" + }, + "263": { + "file_id": 26, + "content": "The code defines a function `convert_classify` that takes in boxes, base name, iou threshold, and score threshold. It sorts the boxes based on their classify score and iou score in descending order. The function then loops over each box, checks if the box meets the iou and score thresholds, and appends to a list named 'res' with necessary details such as basename, start time converted to frame number, end time converted to frame number, and label id. The code also has another function `convert_groundtruth` which takes in boxes, base name, and phase (optional). 
It iterates over the label ids of each box and appends a dictionary to the list 'res' with necessary details such as basename, start id, and label.", + "type": "comment" + }, + "264": { + "file_id": 26, + "content": " 'end': box['end_id'],\n 'label': label})\n return res\ndef print_head(iou):\n print(\"\\nioa = {:.1f}\".format(iou))\n res_str = ''\n for item in ['label_name']:\n res_str += '{:<12s}'.format(item)\n for item in ['label_id', 'precision', 'recall', 'hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10s}'.format(item)\n print(res_str)\ndef print_result(res_dict, label='avg'):\n if label == 'avg':\n res_str = '{:<22s}'.format(str(label))\n else:\n res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)], str(label), chr(12288))\n for item in ['prec', 'recall']:\n res_str += '{:<10.4f}'.format(res_dict[item])\n for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10d}'.format(res_dict[item])\n print(res_str)\ndef evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = False):\n iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \\\n for gtsId in gts_boxes]", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:94-120" + }, + "265": { + "file_id": 26, + "content": "This code contains four functions: `evaluation`, `print_result`, `print_head`, and `print_head`. These functions calculate and print the evaluation results for a set of detected boxes (res_boxes) against the ground truth boxes (gts_boxes). The code also calculates various metrics such as precision, recall, hit properties, and number of instances. It uses label ranges, IoU thresholds, and can show intermediate IoU values if specified.", + "type": "comment" + }, + "266": { + "file_id": 26, + "content": " iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes)))\n hit_map_prop_total = np.max(iou_map, axis=1)\n hit_map_index_total = np.argmax(iou_map, axis=1)\n res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']\n for iou_threshold in iou_range:\n if show_sub:\n print_head(iou_threshold)\n iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total])\n average_results = {}\n for label_id in label_range:\n sub_results = {}\n label_prop = np.array([k['label'] == label_id for k in res_boxes])\n label_gts = np.array([k['label'] == label_id for k in gts_boxes])\n sub_results['num_prop'] = sum(label_prop)\n sub_results['num_gts'] = sum(label_gts)\n if sub_results['num_prop'] == 0:\n hit_prop_index = []\n else:\n hit_prop_index = label_prop & iou_prop\n sub_results['hit_prop'] = sum(hit_prop_index)\n sub_results['hit_gts'] = len(set(hit_map_index_total[hit_prop_index]))", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:121-144" + }, + "267": { + "file_id": 26, + "content": "This code calculates evaluation metrics for detected objects based on their Intersection over Union (IoU) with ground truth objects. It iterates through different IoU thresholds and label ranges to compute hit proportion, number of propositions, hit GTs, and number of ground truth objects for each threshold and label. 
The results are stored in a dictionary.", + "type": "comment" + }, + "268": { + "file_id": 26, + "content": " sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \\\n else sub_results['hit_prop'] * 1.0 / sub_results['num_prop']\n sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \\\n else sub_results['hit_gts'] * 1.0 / sub_results['num_gts']\n if show_sub:\n print_result(sub_results, label=label_id)\n for item in res_dict:\n if not item in average_results:\n average_results[item] = 0\n average_results[item] += sub_results[item]\n if len(label_range) == 1: # proposal 不需要输出average值\n continue\n average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \\\n else average_results['hit_prop'] * 1.0 / average_results['num_prop']\n average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \\\n else average_results['hit_gts'] * 1.0 / average_results['num_gts']", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:146-161" + }, + "269": { + "file_id": 26, + "content": "This code calculates precision and recall values for sub-results and average results in a classification task. It handles cases where the number of true positives, true negatives, false positives or false negatives is zero by assigning precision and recall as 0. The code outputs average values only for labels with a range greater than one.", + "type": "comment" + }, + "270": { + "file_id": 26, + "content": " if show_sub:\n print_result(average_results)\n average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \\\n else 2 * average_results['prec'] * average_results['recall'] / \\\n (average_results['prec'] + average_results['recall'])\n return average_results\ndef get_eval_results(predicts, gts_data, phase, iou_threshold = 0.3, score_threshold = 0.3, show_sub = False):\n global mode\n mode = phase\n res_boxes = []\n gts_boxes = []\n for ped_data in predicts:\n basename = ped_data['video_name']\n # eval sub data\n such_eval = False\n for eval_name in eval_datasets:\n if eval_name in basename:\n such_eval = True\n break\n if not such_eval:\n continue\n gts = gts_data[basename]['actions']\n if phase == 'proposal':\n res_boxes.extend(convert_proposal(ped_data['bmn_results'], basename, score_threshold))", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:162-189" + }, + "271": { + "file_id": 26, + "content": "This code calculates the F1 score for a set of predictions and ground truth data. If 'show_sub' is True, it prints the average results. It then calculates the F1 score based on precision and recall values. 
The function returns the average results containing precision, recall, and F1 score.", + "type": "comment" + }, + "272": { + "file_id": 26, + "content": " gts_boxes.extend(convert_groundtruth(gts, basename, phase='proposal'))\n label_range = [0]\n iou_range = np.arange(0.1, 1, 0.1)\n else:\n res_boxes.extend(convert_classify(ped_data['action_results'], basename, iou_threshold, score_threshold))\n gts_boxes.extend(convert_groundtruth(gts, basename))\n label_range = range(1, len(label_index))\n iou_range = np.arange(0.5, 0.6, 0.1)\n eval_results = evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = show_sub)\n return eval_results\nif __name__ == \"__main__\":\n result_file = sys.argv[1]\n predicts = json.load(open(result_file, 'r', encoding='utf-8'))\n gts_data = load_gts()\n get_eval_results(predicts, gts_data, 'proposal', \n score_threshold = 0.03,\n show_sub = True)\n #get_eval_results(predicts, gts_data, 'actions')\n best_F1 = -0.1\n best_res = {}\n best_iou_threshold = 0.\n best_score_threshold = 0.\n for iou_threshold in np.arange(0.1, 0.9, 0.1):", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:190-218" + }, + "273": { + "file_id": 26, + "content": "The code is evaluating the performance of a video action detection model. It extends the ground truth boxes for proposals and classifies them based on IOU and score thresholds. It then performs evaluation using these results and displays the best F1 score.", + "type": "comment" + }, + "274": { + "file_id": 26, + "content": " for score_threshold in np.arange(0.1, 1, 0.1):\n avg_res = get_eval_results(predicts, gts_data, 'actions', \n iou_threshold = iou_threshold,\n score_threshold = score_threshold,\n show_sub = False)\n if best_F1 < avg_res['F1']:\n best_F1 = avg_res['F1']\n best_res = avg_res\n best_iou_threshold = iou_threshold\n best_score_threshold = score_threshold\n print(\"best iou threshold = {:.1f}\".format(best_iou_threshold))\n print(\"best score threshold = {:.1f}\".format(best_score_threshold))\n print('best F1 score = {:.4f}'.format(best_F1))\n print_head(0.5)\n print_result(best_res)\n get_eval_results(predicts, gts_data, 'actions', iou_threshold = best_iou_threshold,\n score_threshold = best_score_threshold,\n show_sub = True)", + "type": "code", + "location": "/applications/BasketballAction/predict/eval.py:219-237" + }, + "275": { + "file_id": 26, + "content": "This code is iterating through different score thresholds to find the best combination of IOU and score threshold for evaluating basketball actions. It calculates evaluation results for each threshold, updating the best scores accordingly. Finally, it prints these best scores and displays the evaluation results using a function called print_result().", + "type": "comment" + }, + "276": { + "file_id": 27, + "content": "/applications/BasketballAction/predict/predict.py", + "type": "filepath" + }, + "277": { + "file_id": 27, + "content": "The code initializes an ActionDetection instance, loads a model for prediction, processes video URLs by extracting frames and audio, makes predictions on these inputs, and stores the results in a JSON file. 
The JSON file is written without escaping special characters.", + "type": "summary" + }, + "278": { + "file_id": 27, + "content": "import os\nimport sys\nimport json\nsys.path.append('action_detect')\nfrom action import ActionDetection\nif __name__ == '__main__':\n dataset_dir = \"datasets/\"\n model_predict = ActionDetection(cfg_file=\"configs_basketball/configs_basketball.yaml\")\n model_predict.load_model()\n video_url = os.path.join(dataset_dir, 'mp4.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, \"mp4\", os.path.basename(k.strip())) for k in lines]\n results = []\n for line in lines:\n video_name = line\n print(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results.append({'video_name': line,\n 'bmn_results': bmn_results, \n 'action_results': action_results})\n with open('results.json', 'w', encoding='utf-8') as f:", + "type": "code", + "location": "/applications/BasketballAction/predict/predict.py:2-33" + }, + "279": { + "file_id": 27, + "content": "The code imports necessary libraries, appends the \"action_detect\" directory to the system path, and initializes an ActionDetection instance. It then loads the model for prediction using a specified configuration file. The code reads a list of video URLs from a file, processes each video by extracting image frames and audio, and makes predictions on these inputs. Finally, it stores the results in a JSON file.", + "type": "comment" + }, + "280": { + "file_id": 27, + "content": " data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) ", + "type": "code", + "location": "/applications/BasketballAction/predict/predict.py:34-35" + }, + "281": { + "file_id": 27, + "content": "Writes JSON-formatted 'results' to file using indentation and without escaping special characters.", + "type": "comment" + }, + "282": { + "file_id": 28, + "content": "/applications/EIVideo/EIVideo/README.MD", + "type": "filepath" + }, + "283": { + "file_id": 28, + "content": "This is a Chinese comment for an interactive video annotation tool's Command Line Interface (CLI). It mentions installing the \"scikit-image\" package, running the program in inference mode using a specific configuration and model file, and provides a reference document link.", + "type": "summary" + }, + "284": { + "file_id": 28, + "content": "# 交互式视频智能标注工具 - CLI(Command Line Interface)\n在开始使用之前,您需要按照以下命令安装额外的依赖包:\n```bash\npython -m pip install scikit-image\n```\n## 推理运行方式\n```shell\nC:\\Python\\Python37\\python.exe main.py --test -c E:/PaddlePaddle_Project/EIVideo/resources/backend/configs/manet.yaml -w E:/PaddlePaddle_Project/EIVideo/resources/backend/model/save_step_80000.pdparams\nC:\\Python\\Python37\\python.exe resources/backend/main.py --test -c E:/PaddlePaddle_Project/EIVideo/resources/backend/configs/manet.yaml -w E:/PaddlePaddle_Project/EIVideo/resources/backend/model/save_step_80000.pdparams\n```\n## 参考文档\n[manet](docs/zh-CN/manet.md)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/README.MD:1-15" + }, + "285": { + "file_id": 28, + "content": "This is a Chinese comment for an interactive video annotation tool's Command Line Interface (CLI). 
It mentions installing the \"scikit-image\" package, running the program in inference mode using a specific configuration and model file, and provides a reference document link.", + "type": "comment" + }, + "286": { + "file_id": 29, + "content": "/applications/EIVideo/EIVideo/__init__.py", + "type": "filepath" + }, + "287": { + "file_id": 29, + "content": "This code file is an __init__.py for EIVideo application, authored by Acer Zhang on Jan 6th. It sets root path and defines constants for temporary image and JSON file paths. The join_root_path function helps construct full paths from given partial paths.", + "type": "summary" + }, + "288": { + "file_id": 29, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport os\nfrom EIVideo.version import __version__\nEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__))\nTEMP_IMG_SAVE_PATH = \"./temp.png\"\nTEMP_JSON_SAVE_PATH = \"./save.json\"\nTEMP_JSON_FINAL_PATH = \"./final.json\"\ndef join_root_path(path: str):\n return os.path.join(EI_VIDEO_ROOT, path)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/__init__.py:1-16" + }, + "289": { + "file_id": 29, + "content": "This code file is an __init__.py for EIVideo application, authored by Acer Zhang on Jan 6th. It sets root path and defines constants for temporary image and JSON file paths. The join_root_path function helps construct full paths from given partial paths.", + "type": "comment" + }, + "290": { + "file_id": 30, + "content": "/applications/EIVideo/EIVideo/api.py", + "type": "filepath" + }, + "291": { + "file_id": 30, + "content": "This code retrieves images, converts them to JSON format and stores object locations. It also processes videos, creating JSON annotations on frames with functions for video loading/saving, image resizing, mask processing, and PNG saving.", + "type": "summary" + }, + "292": { + "file_id": 30, + "content": "# Author: AP-Kai\n# Datetime: 2022/1/10\n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport json\nimport os\nfrom collections import OrderedDict\nimport cv2\nimport numpy as np\nfrom PIL import Image\nfrom EIVideo.paddlevideo.utils.manet_utils import overlay_davis\nfrom EIVideo import TEMP_JSON_SAVE_PATH, TEMP_JSON_FINAL_PATH\ndef get_images(sequence='bike-packing'):\n img_path = os.path.join('data', sequence.strip(), 'frame')\n img_files = os.listdir(img_path)\n img_files.sort()\n files = []\n for img in img_files:\n img_file = np.array(Image.open(os.path.join(img_path, img)))\n files.append(img_file)\n return np.array(files)\ndef json2frame(path):\n print(\"now turn masks.json to frames\", path)\n with open(path, 'r', encoding='utf-8') as f:\n res = f.read()\n a = json.loads(res)\n b = a.get('overlays')\n b_array = np.array(b)\n frame_list = []\n for i in range(0, len(b_array)):\n im = Image.fromarray(np.uint8(b_array[i]))", + "type": "code", + "location": "/applications/EIVideo/EIVideo/api.py:1-39" + }, + "293": { + "file_id": 30, + "content": "Code imports necessary libraries and defines two functions. `get_images` retrieves image files from a specified sequence, sorts them, and returns as a numpy array. 
`json2frame` reads a JSON file and converts its overlays into Image objects in a list format.", + "type": "comment" + }, + "294": { + "file_id": 30, + "content": " im = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)\n im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)\n # im = np.array(b_array[i]).astype(\"uint8\")\n # im = im.transpose((2, 0, 1))\n # im = cv2.merge(im)\n frame_list.append(im)\n return frame_list\ndef png2json(image_path, sliderframenum, save_json_path):\n image = Image.open(image_path) # 用PIL中的Image.open打开图像\n image = image.convert('P')\n image_arr = np.array(image) # 转化成numpy数组\n image_arr = image_arr.astype(\"float32\")\n r1 = np.argwhere(image_arr == 1) # tuple\n pframes = []\n # i -> object id\n for i in range(1, len(np.unique(image_arr))):\n pframe = OrderedDict()\n pframe['path'] = []\n # Find object id in image_arr\n r1 = np.argwhere(image_arr == i) # tuple\n r1 = r1.astype(\"float32\")\n # Add path to pframe\n for j in range(0, len(r1)):\n r1[j][0] = r1[j][0] / 480.0\n r1[j][1] = r1[j][1] / 910.0\n # r1[j] = np.around(r1[j], decimals=16)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/api.py:40-67" + }, + "295": { + "file_id": 30, + "content": "The code converts a PNG image to JSON format. It opens the image using PIL, converts it to grayscale, and stores the unique object IDs found in the image. For each object ID, it finds its corresponding locations in the image and adds them as paths to the pframe (an OrderedDict). Finally, it appends the pframes to a list called pframes. The function returns this list of pframes when complete.", + "type": "comment" + }, + "296": { + "file_id": 30, + "content": " pframe['path'].append(r1[j].tolist())\n # Add object id, start_time, stop_time\n pframe['object_id'] = i\n pframe['start_time'] = sliderframenum\n pframe['stop_time'] = sliderframenum\n # Add pframe to pframes\n pframes.append(pframe)\n dic = OrderedDict()\n dic['scribbles'] = []\n for i in range(0, int(100)):\n if i == sliderframenum:\n # Add value to frame[]\n dic['scribbles'].append(pframes)\n else:\n dic['scribbles'].append([])\n json_str = json.dumps(dic)\n with open(save_json_path, 'w') as json_file:\n json_file.write(json_str)\ndef load_video(video_path, min_side=None):\n frame_list = []\n # ToDo To AP-kai: 是不是轻松干掉了m.video_path?\n cap = cv2.VideoCapture(video_path)\n # ToDo To AP-kai: while (cap.isOpened()): -> 不必多写个括号哈\n while cap.isOpened():\n _, frame = cap.read()\n if frame is None:\n break\n frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n if min_side:\n h, w = frame.shape[:2]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/api.py:68-101" + }, + "297": { + "file_id": 30, + "content": "This code is related to video processing, specifically for saving and loading videos. It creates a JSON file with scribble annotations on frames. The \"load_video\" function reads the video frames and converts them to RGB format if necessary. 
It also supports optional minimum side parameter for resizing frames.", + "type": "comment" + }, + "298": { + "file_id": 30, + "content": " new_w = (w * min_side // min(w, h))\n new_h = (h * min_side // min(w, h))\n frame = cv2.resize(frame, (new_w, new_h),\n interpolation=cv2.INTER_CUBIC)\n # .transpose([2, 0, 1])\n frame_list.append(frame)\n frames = np.stack(frame_list, axis=0)\n return frames, frame_list\ndef get_scribbles():\n # os.makedirs(TEMP_JSON_SAVE_PATH, exist_ok=True)\n with open(TEMP_JSON_SAVE_PATH) as f:\n print(\"load TEMP_JSON_SAVE_PATH success\")\n scribbles = json.load(f)\n first_scribble = True\n yield scribbles, first_scribble\ndef submit_masks(save_path, masks, images):\n overlays = []\n for img_name, (mask, image) in enumerate(zip(masks, images)):\n overlay = overlay_davis(image, mask)\n overlays.append(overlay.tolist())\n overlay = Image.fromarray(overlay)\n img_name = str(img_name)\n while len(img_name) < 5:\n img_name = '0' + img_name\n overlay.save(os.path.join(save_path, img_name + '.png'))", + "type": "code", + "location": "/applications/EIVideo/EIVideo/api.py:102-130" + }, + "299": { + "file_id": 30, + "content": "Code chunk resizes images, appends them to a list, stacks the frames into an array and returns both. It also handles loading data from TEMP_JSON_SAVE_PATH and yields scribbles with a boolean flag for the first one. The last function processes masks by overlaying them onto images, saves them as PNGs in the specified save path.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/20.json b/docs/data/20.json new file mode 100644 index 000000000..a64d4947d --- /dev/null +++ b/docs/data/20.json @@ -0,0 +1,548 @@ +{ + "2000": { + "file_id": 160, + "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization classes.\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nfrom __future__ import absolute_import\nfrom io import open\nimport collections\nimport unicodedata\nimport six\ndef convert_to_unicode(text):\n \"\"\"Converts `text` to Unicode (if it's not already), assuming utf-8 input.\"\"\"\n if six.PY3:", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:1-32" + }, + "2001": { + "file_id": 160, + "content": "This code block is the first 30 lines of a Python file and includes a comment with license information, a documentation string, and an import section. 
The function \"convert_to_unicode\" converts text to Unicode, assuming utf-8 input.", + "type": "comment" + }, + "2002": { + "file_id": 160, + "content": " if isinstance(text, str):\n return text\n elif isinstance(text, bytes):\n return text.decode(\"utf-8\", \"ignore\")\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n elif six.PY2:\n if isinstance(text, str):\n return text.decode(\"utf-8\", \"ignore\")\n elif isinstance(text, unicode):\n return text\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n else:\n raise ValueError(\"Not running on Python2 or Python 3?\")\ndef printable_text(text):\n \"\"\"Returns text encoded in a way suitable for print or `tf.logging`.\"\"\"\n # These functions want `str` for both Python2 and Python3, but in one case\n # it's a Unicode string and in the other it's a byte string.\n if six.PY3:\n if isinstance(text, str):\n return text\n elif isinstance(text, bytes):\n return text.decode(\"utf-8\", \"ignore\")\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:33-61" + }, + "2003": { + "file_id": 160, + "content": "This code is a function named \"printable_text\" that takes a text parameter and returns it encoded in a way suitable for print or tf.logging. It handles both Python 2 and Python 3 by checking the environment using six.PY2 and six.PY3, and converting strings to str format before returning them. The function checks the type of the input text (str or bytes) and decodes it accordingly (from utf-8 encoding \"ignore\"). If the input type is not supported, it raises a ValueError.", + "type": "comment" + }, + "2004": { + "file_id": 160, + "content": " elif six.PY2:\n if isinstance(text, str):\n return text\n elif isinstance(text, unicode):\n return text.encode(\"utf-8\")\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n else:\n raise ValueError(\"Not running on Python2 or Python 3?\")\ndef load_vocab(vocab_file):\n \"\"\"Loads a vocabulary file into a dictionary.\"\"\"\n vocab = collections.OrderedDict()\n with open(vocab_file, encoding='utf8') as fin:\n for num, line in enumerate(fin):\n items = convert_to_unicode(line.strip()).split(\"\\t\")\n if len(items) > 2:\n break\n token = items[0]\n index = items[1] if len(items) == 2 else num\n token = token.strip()\n vocab[token] = int(index)\n return vocab\ndef convert_by_vocab(vocab, items):\n \"\"\"Converts a sequence of [tokens|ids] using the vocab.\"\"\"\n output = []\n for item in items:\n output.append(vocab[item])\n return output\ndef convert_tokens_to_ids(vocab, tokens):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:62-96" + }, + "2005": { + "file_id": 160, + "content": "This code handles loading and converting vocabulary files. 
It checks the Python version, loads a vocabulary file into an ordered dictionary, and defines functions to convert sequences of tokens or IDs using the vocab.", + "type": "comment" + }, + "2006": { + "file_id": 160, + "content": " \"\"\"convert_tokens_to_ids\n \"\"\"\n return convert_by_vocab(vocab, tokens)\ndef convert_ids_to_tokens(inv_vocab, ids):\n \"\"\"convert_ids_to_tokens\n \"\"\"\n return convert_by_vocab(inv_vocab, ids)\ndef whitespace_tokenize(text):\n \"\"\"Runs basic whitespace cleaning and splitting on a peice of text.\"\"\"\n text = text.strip()\n if not text:\n return []\n tokens = text.split()\n return tokens\nclass FullTokenizer(object):\n \"\"\"Runs end-to-end tokenziation.\"\"\"\n def __init__(self, vocab_file, do_lower_case=True):\n \"\"\"init\n \"\"\"\n self.vocab = load_vocab(vocab_file)\n self.inv_vocab = {v: k for k, v in self.vocab.items()}\n self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)\n self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)\n def tokenize(self, text):\n \"\"\"tokenize\n \"\"\"\n split_tokens = []\n for token in self.basic_tokenizer.tokenize(text):\n for sub_token in self.wordpiece_tokenizer.tokenize(token):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:97-133" + }, + "2007": { + "file_id": 160, + "content": "This code defines a FullTokenizer class for end-to-end tokenization. It utilizes two other classes, BasicTokenizer and WordpieceTokenizer, to perform basic whitespace cleaning and splitting on text data. The FullTokenizer initializes with a vocab file, load_vocab function, and an optional flag for case sensitivity. The tokenize method processes the input text by iterating over each token produced from both BasicTokenizer and WordpieceTokenizer, resulting in split tokens for further processing or analysis.", + "type": "comment" + }, + "2008": { + "file_id": 160, + "content": " split_tokens.append(sub_token)\n return split_tokens\n def convert_tokens_to_ids(self, tokens):\n \"\"\"convert_tokens_to_ids\n \"\"\"\n return convert_by_vocab(self.vocab, tokens)\n def convert_ids_to_tokens(self, ids):\n \"\"\"convert_ids_to_tokens\n \"\"\"\n return convert_by_vocab(self.inv_vocab, ids)\nclass CharTokenizer(object):\n \"\"\"Runs end-to-end tokenziation.\"\"\"\n def __init__(self, vocab_file, do_lower_case=True):\n self.vocab = load_vocab(vocab_file)\n self.inv_vocab = {v: k for k, v in self.vocab.items()}\n self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)\n def tokenize(self, text):\n \"\"\"tokenize\n \"\"\"\n split_tokens = []\n for token in text.lower().split(\" \"):\n for sub_token in self.wordpiece_tokenizer.tokenize(token):\n split_tokens.append(sub_token)\n return split_tokens\n def convert_tokens_to_ids(self, tokens):\n \"\"\"convert_tokens_to_ids\n \"\"\"", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:134-168" + }, + "2009": { + "file_id": 160, + "content": "The code defines a CharTokenizer class for end-to-end tokenization. It initializes with a vocab_file and do_lower_case parameter. The class has methods to tokenize text, convert tokens to ids, and convert ids to tokens using the vocab file and inverse vocab file. 
The tokenization process involves lowercasing the input text, splitting it into words, and then tokenizing each word using a WordpieceTokenizer with the same vocab file.", + "type": "comment" + }, + "2010": { + "file_id": 160, + "content": " return convert_by_vocab(self.vocab, tokens)\n def convert_ids_to_tokens(self, ids):\n \"\"\"convert_ids_to_tokens\n \"\"\"\n return convert_by_vocab(self.inv_vocab, ids)\nclass BasicTokenizer(object):\n \"\"\"Runs basic tokenization (punctuation splitting, lower casing, etc.).\"\"\"\n def __init__(self, do_lower_case=True):\n \"\"\"Constructs a BasicTokenizer.\n Args:\n do_lower_case: Whether to lower case the input.\n \"\"\"\n self.do_lower_case = do_lower_case\n def tokenize(self, text):\n \"\"\"Tokenizes a piece of text.\"\"\"\n text = convert_to_unicode(text)\n text = self._clean_text(text)\n # This was added on November 1st, 2018 for the multilingual and Chinese\n # models. This is also applied to the English models now, but it doesn't\n # matter since the English models were not trained on any Chinese data\n # and generally don't have any Chinese data in them (there are Chinese\n # characters in the vocabulary because Wikipedia does have some Chinese", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:169-197" + }, + "2011": { + "file_id": 160, + "content": "This code defines a `BasicTokenizer` class that performs basic text tokenization, including punctuation splitting and lower casing. It also includes methods for converting tokens to IDs and vice versa using vocabularies. The class has an optional `do_lower_case` parameter controlling whether the input should be lowercased or not.", + "type": "comment" + }, + "2012": { + "file_id": 160, + "content": " # words in the English Wikipedia.).\n text = self._tokenize_chinese_chars(text)\n orig_tokens = whitespace_tokenize(text)\n split_tokens = []\n for token in orig_tokens:\n if self.do_lower_case:\n token = token.lower()\n token = self._run_strip_accents(token)\n split_tokens.extend(self._run_split_on_punc(token))\n output_tokens = whitespace_tokenize(\" \".join(split_tokens))\n return output_tokens\n def _run_strip_accents(self, text):\n \"\"\"Strips accents from a piece of text.\"\"\"\n text = unicodedata.normalize(\"NFD\", text)\n output = []\n for char in text:\n cat = unicodedata.category(char)\n if cat == \"Mn\":\n continue\n output.append(char)\n return \"\".join(output)\n def _run_split_on_punc(self, text):\n \"\"\"Splits punctuation on a piece of text.\"\"\"\n chars = list(text)\n i = 0\n start_new_word = True\n output = []\n while i < len(chars):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:198-229" + }, + "2013": { + "file_id": 160, + "content": "The code segment tokenizes Chinese characters, performs lower casing if needed, strips accents from text, and splits the punctuation on a given piece of text. 
This process is to prepare the text for further processing in the application.", + "type": "comment" + }, + "2014": { + "file_id": 160, + "content": " char = chars[i]\n if _is_punctuation(char):\n output.append([char])\n start_new_word = True\n else:\n if start_new_word:\n output.append([])\n start_new_word = False\n output[-1].append(char)\n i += 1\n return [\"\".join(x) for x in output]\n def _tokenize_chinese_chars(self, text):\n \"\"\"Adds whitespace around any CJK character.\"\"\"\n output = []\n for char in text:\n cp = ord(char)\n if self._is_chinese_char(cp):\n output.append(\" \")\n output.append(char)\n output.append(\" \")\n else:\n output.append(char)\n return \"\".join(output)\n def _is_chinese_char(self, cp):\n \"\"\"Checks whether CP is the codepoint of a CJK character.\"\"\"\n # This defines a \"chinese character\" as anything in the CJK Unicode block:\n # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:230-259" + }, + "2015": { + "file_id": 160, + "content": "This code defines functions for tokenizing and processing text data. The `_is_punctuation` function identifies punctuation characters, while the `tokenize_text` function separates words by detecting new word starts. The `_tokenize_chinese_chars` function adds whitespace around Chinese characters to separate them from surrounding text.", + "type": "comment" + }, + "2016": { + "file_id": 160, + "content": " #\n # Note that the CJK Unicode block is NOT all Japanese and Korean characters,\n # despite its name. The modern Korean Hangul alphabet is a different block,\n # as is Japanese Hiragana and Katakana. Those alphabets are used to write\n # space-separated words, so they are not treated specially and handled\n # like the all of the other languages.\n if ((cp >= 0x4E00 and cp <= 0x9FFF) or #\n (cp >= 0x3400 and cp <= 0x4DBF) or #\n (cp >= 0x20000 and cp <= 0x2A6DF) or #\n (cp >= 0x2A700 and cp <= 0x2B73F) or #\n (cp >= 0x2B740 and cp <= 0x2B81F) or #\n (cp >= 0x2B820 and cp <= 0x2CEAF) or\n (cp >= 0xF900 and cp <= 0xFAFF) or #\n (cp >= 0x2F800 and cp <= 0x2FA1F)): #\n return True\n return False\n def _clean_text(self, text):\n \"\"\"Performs invalid character removal and whitespace cleanup on text.\"\"\"\n output = []\n for char in text:\n cp = ord(char)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:260-282" + }, + "2017": { + "file_id": 160, + "content": "The code checks if a character falls within the CJK Unicode block, which includes Japanese and Korean characters. It returns True if any of these characters are found, indicating that the text is in one of these languages. 
The function also performs invalid character removal and whitespace cleanup on the given text.", + "type": "comment" + }, + "2018": { + "file_id": 160, + "content": " if cp == 0 or cp == 0xfffd or _is_control(char):\n continue\n if _is_whitespace(char):\n output.append(\" \")\n else:\n output.append(char)\n return \"\".join(output)\nclass WordpieceTokenizer(object):\n \"\"\"Runs WordPiece tokenziation.\"\"\"\n def __init__(self, vocab, unk_token=\"[UNK]\", max_input_chars_per_word=100):\n self.vocab = vocab\n self.unk_token = unk_token\n self.max_input_chars_per_word = max_input_chars_per_word\n def tokenize(self, text):\n \"\"\"Tokenizes a piece of text into its word pieces.\n This uses a greedy longest-match-first algorithm to perform tokenization\n using the given vocabulary.\n For example:\n input = \"unaffable\"\n output = [\"un\", \"##aff\", \"##able\"]\n Args:\n text: A single token or whitespace separated tokens. This should have\n already been passed through `BasicTokenizer.\n Returns:\n A list of wordpiece tokens.", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:283-315" + }, + "2019": { + "file_id": 160, + "content": "This code defines a WordpieceTokenizer class that tokenizes text into word pieces using a greedy longest-match-first algorithm and a given vocabulary. The tokenize method takes in a text input, performs tokenization by matching the longest possible substrings from the vocabulary, and returns a list of wordpiece tokens.", + "type": "comment" + }, + "2020": { + "file_id": 160, + "content": " \"\"\"\n text = convert_to_unicode(text)\n output_tokens = []\n for token in whitespace_tokenize(text):\n chars = list(token)\n if len(chars) > self.max_input_chars_per_word:\n output_tokens.append(self.unk_token)\n continue\n is_bad = False\n start = 0\n sub_tokens = []\n while start < len(chars):\n end = len(chars)\n cur_substr = None\n while start < end:\n substr = \"\".join(chars[start:end])\n if start > 0:\n substr = \"##\" + substr\n if substr in self.vocab:\n cur_substr = substr\n break\n end -= 1\n if cur_substr is None:\n is_bad = True\n break\n sub_tokens.append(cur_substr)\n start = end\n if is_bad:\n output_tokens.append(self.unk_token)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:316-348" + }, + "2021": { + "file_id": 160, + "content": "This code tokenizes text by splitting it into words, checks if each word is in the vocabulary. If not, it adds a special unknown token. 
It handles long words by splitting them into smaller parts and checking each part separately.", + "type": "comment" + }, + "2022": { + "file_id": 160, + "content": " else:\n output_tokens.extend(sub_tokens)\n return output_tokens\ndef _is_whitespace(char):\n \"\"\"Checks whether `chars` is a whitespace character.\"\"\"\n # \\t, \\n, and \\r are technically contorl characters but we treat them\n # as whitespace since they are generally considered as such.\n if char == \" \" or char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n return True\n cat = unicodedata.category(char)\n if cat == \"Zs\":\n return True\n return False\ndef _is_control(char):\n \"\"\"Checks whether `chars` is a control character.\"\"\"\n # These are technically control characters but we count them as whitespace\n # characters.\n if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n return False\n cat = unicodedata.category(char)\n if cat.startswith(\"C\"):\n return True\n return False\ndef _is_punctuation(char):\n \"\"\"Checks whether `chars` is a punctuation character.\"\"\"\n cp = ord(char)\n # We treat all non-letter/number ASCII as punctuation.\n # Characters such as \"^\", \"$\", and \"`\" are not in the Unicode", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:349-382" + }, + "2023": { + "file_id": 160, + "content": "The code defines several functions for tokenizing a string: _is_whitespace checks if the character is whitespace, _is_control identifies control characters, and _is_punctuation classifies punctuation. The main function extends output tokens based on these character types.", + "type": "comment" + }, + "2024": { + "file_id": 160, + "content": " # Punctuation class but we treat them as punctuation anyways, for\n # consistency.\n if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or\n (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):\n return True\n cat = unicodedata.category(char)\n if cat.startswith(\"P\"):\n return True\n return False\ndef tokenize_chinese_chars(text):\n \"\"\"Adds whitespace around any CJK character.\"\"\"\n def _is_chinese_char(cp):\n \"\"\"Checks whether CP is the codepoint of a CJK character.\"\"\"\n # This defines a \"chinese character\" as anything in the CJK Unicode block:\n # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)\n #\n # Note that the CJK Unicode block is NOT all Japanese and Korean characters,\n # despite its name. The modern Korean Hangul alphabet is a different block,\n # as is Japanese Hiragana and Katakana. Those alphabets are used to write\n # space-separated words, so they are not treated specially and handled", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:383-405" + }, + "2025": { + "file_id": 160, + "content": "This code checks if a given character is a punctuation or Chinese character by checking its Unicode category and code point range. It returns True if the character is a punctuation or Chinese character, and False otherwise. 
The function is used to tokenize Chinese characters in text by adding whitespace around them.", + "type": "comment" + }, + "2026": { + "file_id": 160, + "content": " # like the all of the other languages.\n if ((cp >= 0x4E00 and cp <= 0x9FFF) or #\n (cp >= 0x3400 and cp <= 0x4DBF) or #\n (cp >= 0x20000 and cp <= 0x2A6DF) or #\n (cp >= 0x2A700 and cp <= 0x2B73F) or #\n (cp >= 0x2B740 and cp <= 0x2B81F) or #\n (cp >= 0x2B820 and cp <= 0x2CEAF) or\n (cp >= 0xF900 and cp <= 0xFAFF) or #\n (cp >= 0x2F800 and cp <= 0x2FA1F)): #\n return True\n return False\n def _is_whitespace(c):\n \"\"\"_is_whitespace\n \"\"\"\n if c == \" \" or c == \"\\t\" or c == \"\\r\" or c == \"\\n\" or ord(c) == 0x202F:\n return True\n return False\n output = []\n buff = \"\"\n for char in text:\n cp = ord(char)\n if _is_chinese_char(cp) or _is_whitespace(char):\n if buff != \"\":\n output.append(buff)\n buff = \"\"\n output.append(char)\n else:\n buff += char\n if buff != \"\":\n output.append(buff)\n return output", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:406-441" + }, + "2027": { + "file_id": 160, + "content": "This function tokenizes text by detecting Chinese characters and whitespace, appending non-Chinese characters to a buffer and adding the buffer to the output when a space or Chinese character is found. Finally, it appends any remaining buffer content.", + "type": "comment" + }, + "2028": { + "file_id": 161, + "content": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py", + "type": "filepath" + }, + "2029": { + "file_id": 161, + "content": "This code uses PaddlePaddle and AttentionLstmErnie for multimodal video tagging, including model building, adjusting batch size, calculating metrics, testing, and saving parameters. The main function handles argument parsing and evaluation.", + "type": "summary" + }, + "2030": { + "file_id": 161, + "content": "\"\"\"\neval main\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport argparse\nimport logging\nimport pickle\nimport numpy as np\nimport paddle\npaddle.enable_static()\nimport paddle.static as static\nfrom accuracy_metrics import MetricsCalculator\nfrom datareader import get_reader\nfrom config import parse_config, merge_configs, print_configs\nfrom models.attention_lstm_ernie import AttentionLstmErnie\nfrom utils import test_with_pyreader\ndef parse_args():", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:1-37" + }, + "2031": { + "file_id": 161, + "content": "This code is an evaluation function for a multimodal video tagging application. It imports necessary libraries, disables dynamic memory allocation, sets up the PaddlePaddle environment, and includes functions for reading data, defining the model architecture, and calculating metrics. 
The code also defines a \"parse_args\" function to handle command line arguments.", + "type": "comment" + }, + "2032": { + "file_id": 161, + "content": " \"\"\"parse_args\n \"\"\"\n parser = argparse.ArgumentParser(\"Paddle Video evaluate script\")\n parser.add_argument('--model_name',\n type=str,\n default='BaiduNet',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/conf.txt',\n help='path to config file of model')\n parser.add_argument(\n '--pretrain',\n type=str,\n default=None,\n help=\n 'path to pretrain weights. None to use default weights path in ~/.paddle/weights.'\n )\n parser.add_argument('--output', type=str, default=None, help='output path')\n parser.add_argument('--use_gpu',\n type=bool,\n default=True,\n help='default use gpu.')\n parser.add_argument('--save_model_param_dir',\n type=str,\n default=None,\n help='checkpoint path')", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:38-64" + }, + "2033": { + "file_id": 161, + "content": "This code defines an argument parser for the Paddle Video evaluate script. It allows users to input a model name, config file path, pretrain weights path, output path, use_gpu flag, and save_model_param_dir. The default values are provided for each argument in case they aren't specified by the user.", + "type": "comment" + }, + "2034": { + "file_id": 161, + "content": " parser.add_argument('--save_inference_model',\n type=str,\n default=None,\n help='save inference path')\n parser.add_argument('--save_only',\n action='store_true',\n default=False,\n help='only save model, do not evaluate model')\n args = parser.parse_args()\n return args\ndef evaluate(args):\n \"\"\"evaluate\n \"\"\"\n # parse config\n config = parse_config(args.config)\n valid_config = merge_configs(config, 'valid', vars(args))\n print_configs(valid_config, 'Valid')\n # build model\n valid_model = AttentionLstmErnie(args.model_name,\n valid_config,\n mode='valid')\n startup = static.Program()\n valid_prog = static.default_main_program().clone(for_test=True)\n with static.program_guard(valid_prog, startup):\n paddle.disable_static()\n valid_model.build_input(True)\n valid_model.build_model()", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:65-94" + }, + "2035": { + "file_id": 161, + "content": "This code defines command-line arguments for saving and evaluating an inference model, parses the configuration file, builds a model using AttentionLstmErnie, and sets up static programs for evaluation.", + "type": "comment" + }, + "2036": { + "file_id": 161, + "content": " valid_feeds = valid_model.feeds()\n valid_outputs = valid_model.outputs()\n valid_loss = valid_model.loss()\n valid_pyreader = valid_model.pyreader()\n paddle.enable_static()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(startup)\n compiled_valid_prog = static.CompiledProgram(valid_prog)\n # load weights\n assert os.path.exists(args.save_model_param_dir), \\\n \"Given save weight dir {} not exist.\".format(args.save_model_param_dir)\n valid_model.load_test_weights_file(exe, args.save_model_param_dir,\n valid_prog, place)\n if args.save_inference_model:\n save_model_params(exe, valid_prog, valid_model,\n args.save_inference_model)\n if args.save_only is True:\n print('save model only, exit')\n return\n # get reader\n bs_denominator = 1\n valid_config.VALID.batch_size = int(valid_config.VALID.batch_size 
/\n bs_denominator)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:95-123" + }, + "2037": { + "file_id": 161, + "content": "This code is loading the model from a specified directory, compiling the program and running it. It checks if the save weight directory exists and loads the test weights into the model. If necessary, it saves the inference model and if only saving the model is required, it exits. The batch size is adjusted by dividing it by a denominator.", + "type": "comment" + }, + "2038": { + "file_id": 161, + "content": " valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)\n # get metrics\n valid_metrics = MetricsCalculator(args.model_name.upper(), 'valid',\n valid_config)\n valid_fetch_list = [valid_loss.name] + [x.name for x in valid_outputs\n ] + [valid_feeds[-1].name]\n # get reader\n exe_places = static.cuda_places() if args.use_gpu else static.cpu_places()\n valid_pyreader.decorate_sample_list_generator(valid_reader,\n places=exe_places)\n test_loss, metrics_dict_test = test_with_pyreader(exe, compiled_valid_prog,\n valid_pyreader,\n valid_fetch_list,\n valid_metrics)\n test_acc1 = metrics_dict_test['avg_acc1']\n print(test_loss)\n print(test_acc1)\ndef save_model_params(exe, program, model_object, save_dir):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:124-145" + }, + "2039": { + "file_id": 161, + "content": "This code retrieves a valid reader, calculates metrics, and decorates the sample list generator with specified execution places. It then tests the model using the reader, program, and fetch list to obtain test loss and accuracy, which are printed. The function `save_model_params` saves the model parameters in the provided directory.", + "type": "comment" + }, + "2040": { + "file_id": 161, + "content": " \"\"\"save_model_params\n \"\"\"\n feeded_var_names = [var.name for var in model_object.feeds()][:-1]\n static.save_inference_model(dirname=save_dir,\n feeded_var_names=feeded_var_names,\n main_program=program,\n target_vars=model_object.outputs(),\n executor=exe,\n model_filename='model',\n params_filename='params')\nif __name__ == \"__main__\":\n args = parse_args()\n evaluate(args)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:146-159" + }, + "2041": { + "file_id": 161, + "content": "This code defines a function \"save_model_params\" that takes the directory path, saves the inference model, and specifies the feeded variable names, main program, target variables, executor, and filenames for the model and parameters. It also includes a main function that parses arguments and evaluates them.", + "type": "comment" + }, + "2042": { + "file_id": 162, + "content": "/applications/MultimodalVideoTag/scenario_lib/inference.py", + "type": "filepath" + }, + "2043": { + "file_id": 162, + "content": "The code is a Paddle Video inference script with argparse handling and an \"InferModel\" class for model loading and prediction. It supports GPU usage, multimodal video tagging, and customizable parameters. 
The inference function takes videos, labels, predicts, and outputs results to a JSON file.", + "type": "summary" + }, + "2044": { + "file_id": 162, + "content": "#!/usr/bin/env python\n# coding=utf-8\n\"\"\"\ninfer model\n\"\"\"\nimport sys\nimport os\nimport numpy as np\nimport json\nimport pickle\nimport argparse\nimport time\nimport numpy as np\nimport paddle\nfrom datareader import get_reader\nfrom config import merge_configs, parse_config, print_configs\ndef parse_args():\n \"\"\"parse_args\n \"\"\"\n parser = argparse.ArgumentParser(\"Paddle Video infer script\")\n parser.add_argument('--model_name',\n type=str,\n default='BaiduNet',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/conf.txt',\n help='path to config file of model')\n parser.add_argument('--output', type=str, default=None, help='output path')\n parser.add_argument('--use_gpu',\n type=bool,\n default=True,\n help='default use gpu.')\n parser.add_argument('--save_inference_model',", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/inference.py:1-38" + }, + "2045": { + "file_id": 162, + "content": "This code is a Paddle Video inference script for a specific model. It parses arguments such as the model name, config file path, output path, use_gpu, and save_inference_model flag. The script utilizes argparse to handle these command-line arguments.", + "type": "comment" + }, + "2046": { + "file_id": 162, + "content": " type=str,\n default=None,\n help='save inference path')\n args = parser.parse_args()\n return args\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'): \n name = name.upper()\n self.name = name\n self.threshold = cfg.INFER.threshold\n self.cfg = cfg\n self.label_map = load_class_file(cfg.MODEL.class_name_file)\n def load_inference_model(self, model_dir, use_gpu=True):\n \"\"\"model_init\n \"\"\"\n model_file = os.path.join(model_dir, \"model\")\n params_file = os.path.join(model_dir, \"params\")\n config = paddle.inference.Config(model_file, params_file)\n if use_gpu:\n config.enable_use_gpu(1024)\n else:\n config.disable_gpu()\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = paddle.inference.create_predictor(config)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/inference.py:39-69" + }, + "2047": { + "file_id": 162, + "content": "The code is defining a class \"InferModel\" which initializes variables such as name, threshold, and label_map. It also defines a method \"load_inference_model\" that loads the model file, configures GPU usage, and creates a predictor for inference. 
The code takes input arguments like model_dir, use_gpu, and other configuration parameters.", + "type": "comment" + }, + "2048": { + "file_id": 162, + "content": " # build input tensor and output tensor\n self.build_input_output()\n def build_input_output(self):\n \"\"\"build_input_output\n \"\"\"\n input_names = self.predictor.get_input_names()\n # input\n self.input_rgb_tensor = self.predictor.get_input_handle(input_names[0])\n self.input_audio_tensor = self.predictor.get_input_handle(input_names[1])\n self.input_text_tensor = self.predictor.get_input_handle(input_names[2])\n # output\n output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def preprocess_for_lod_data(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n return input_arr, [input_lod]", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/inference.py:70-98" + }, + "2049": { + "file_id": 162, + "content": "This code initializes input and output tensors for a multimodal video tagging scenario. It builds the input tensor from RGB frames, audio data, and text data, and retrieves the output tensor. The `preprocess_for_lod_data` function converts input data into a list of arrays with length indicators (LOD) for efficient handling.", + "type": "comment" + }, + "2050": { + "file_id": 162, + "content": " def predict(self):\n \"\"\"predict\"\"\"\n infer_reader = get_reader(self.name, 'infer', self.cfg)\n probs = []\n video_ids = []\n label_map_inverse = {value: key for key, value in self.label_map.items()}\n for infer_iter, data in enumerate(infer_reader()):\n # video_id = [[items[-2], items[-1]] for items in data]\n rgb = [items[0] for items in data]\n audio = [items[1] for items in data]\n text = np.array([items[2] for items in data])\n videos = np.array([items[3] for items in data])\n rgb_arr, rgb_lod = self.preprocess_for_lod_data(rgb)\n audio_arr, audio_lod = self.preprocess_for_lod_data(audio)\n self.input_rgb_tensor.copy_from_cpu(rgb_arr.astype('float32'))\n self.input_rgb_tensor.set_lod(rgb_lod)\n self.input_audio_tensor.copy_from_cpu(audio_arr.astype('float32'))\n self.input_audio_tensor.set_lod(audio_lod)\n self.input_text_tensor.copy_from_cpu(text.astype('int64'))", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/inference.py:100-122" + }, + "2051": { + "file_id": 162, + "content": "In this code, a function named \"predict\" is defined. It uses a reader to process data for inference and iterates through each instance of the data. The instances contain RGB images, audio, text, and video IDs. These instances are preprocessed for LOD (Level Of Detail) data using the preprocess_for_lod_data method. 
The preprocessed data is then copied to corresponding input tensors (RGB, audio, and text) for inference.", + "type": "comment" + }, + "2052": { + "file_id": 162, + "content": " self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n probs.extend(list(output))\n video_ids.extend(videos)\n assert len(video_ids) == len(probs)\n result = []\n for video_id, prob in zip(video_ids, probs):\n label_idx = list(np.where(prob >= self.threshold)[0])\n result.append({\n \"video_id\": video_id,\n \"labels\": [\n (label_map_inverse[str(idx)], float(prob[idx])) for idx in label_idx\n ]\n })\n return result\ndef load_class_file(class_file):\n \"\"\"\n load_class_file\n \"\"\"\n class_lines = open(class_file, 'r', encoding='utf8').readlines()\n class_dict = {}\n for i, line in enumerate(class_lines):\n tmp = line.strip().split('\\t')\n word = tmp[0]\n index = str(i)\n if len(tmp) == 2:\n index = tmp[1]\n class_dict[word] = index\n return class_dict\ndef infer(args):\n \"\"\"\n infer main\n \"\"\"\n config = parse_config(args.config)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/inference.py:124-161" + }, + "2053": { + "file_id": 162, + "content": "The code defines an inference function that takes a set of videos and their associated labels, and returns the predicted labels for each video. It also includes functions to load class information from a file and parse command-line arguments. The inference function first runs the predictor on the input data, then extracts the output probabilities and corresponding video IDs. It checks that the number of video IDs matches the number of probabilities. Then, for each video-probability pair, it identifies the indices where the probability is above a certain threshold and uses these to determine the predicted labels. The resulting dictionary contains the video ID, as well as a list of tuples containing the label name and corresponding probability for each detected label.", + "type": "comment" + }, + "2054": { + "file_id": 162, + "content": " infer_config = merge_configs(config, 'infer', vars(args))\n print_configs(infer_config, 'infer')\n infer_obj = InferModel(infer_config, name=args.model_name)\n infer_obj.load_inference_model(args.save_inference_model, use_gpu=args.use_gpu)\n rt = infer_obj.predict()\n if args.output:\n with open(args.output, 'w') as f:\n json.dump(rt, f, ensure_ascii=False, indent=4)\nif __name__ == \"__main__\":\n args = parse_args()\n infer(args)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/inference.py:162-173" + }, + "2055": { + "file_id": 162, + "content": "This code snippet is from the \"inference.py\" file in the PaddleVideo MultimodalVideoTag application. It defines a function `infer` that performs model inference on input data and outputs the results. The code merges configs for the infer stage, prints them out, creates an InferModel object with those configs, loads the inference model from a given file (if provided), runs inference, and finally saves the results to a JSON file if requested.", + "type": "comment" + }, + "2056": { + "file_id": 163, + "content": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py", + "type": "filepath" + }, + "2057": { + "file_id": 163, + "content": "The code introduces an AttentionLstmErnie class for ERNIE-based scenario classification, implementing an LSTM-based attention model for video tagging using text and audio data. 
It employs dropout, batch normalization, and Neural Machine Translation approach.", + "type": "summary" + }, + "2058": { + "file_id": 163, + "content": "#!/usr/bin/env python\n# coding=utf-8\n\"\"\"\nattention lstm add ernie model\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport paddle\nimport paddle.static as static\nfrom .ernie import ErnieConfig, ErnieModel\nclass AttentionLstmErnie(object):\n \"\"\"\n Base on scenario-classify (image + audio), add text information\n use ERNIE to extract text feature\n \"\"\"\n def __init__(self, name, cfg, mode='train'):\n self.cfg = cfg\n self.name = name", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:1-34" + }, + "2059": { + "file_id": 163, + "content": "This code defines the AttentionLstmErnie class, which extends the functionality of scenario-classify by incorporating text information. It uses ERNIE to extract text features and operates in either 'train' or 'infer' mode.", + "type": "comment" + }, + "2060": { + "file_id": 163, + "content": " self.mode = mode\n self.py_reader = None\n self.get_config()\n def get_config(self):\n \"\"\"get_config\n \"\"\"\n # get model configs\n self.feature_num = self.cfg.MODEL.feature_num\n self.feature_names = self.cfg.MODEL.feature_names\n self.feature_dims = self.cfg.MODEL.feature_dims\n self.feature_dtypes = self.cfg.MODEL.feature_dtypes\n self.feature_lod_level = self.cfg.MODEL.feature_lod_level\n self.num_classes = self.cfg.MODEL.num_classes\n self.embedding_size = self.cfg.MODEL.embedding_size\n self.lstm_size_img = self.cfg.MODEL.lstm_size_img\n self.lstm_size_audio = self.cfg.MODEL.lstm_size_audio\n self.ernie_freeze = self.cfg.MODEL.ernie_freeze\n self.lstm_pool_mode = self.cfg.MODEL.lstm_pool_mode\n self.drop_rate = self.cfg.MODEL.drop_rate\n self.loss_type = self.cfg.TRAIN.loss_type\n self.ernie_pretrain_dict_path = self.cfg.TRAIN.ernie_pretrain_dict_path\n # get mode configs\n self.batch_size = self.get_config_from_sec(self.mode, 'batch_size', 1)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:35-59" + }, + "2061": { + "file_id": 163, + "content": "This code initializes a model by setting attributes from a configuration file and calling the `get_config` function. The `get_config` function retrieves the model's configurations, including feature numbers, dimensions, data types, and more. 
It also gets mode-specific settings like batch size.", + "type": "comment" + }, + "2062": { + "file_id": 163, + "content": " self.num_gpus = self.get_config_from_sec(self.mode, 'num_gpus', 1)\n if self.mode == 'train':\n self.learning_rate = self.get_config_from_sec(\n 'train', 'learning_rate', 1e-3)\n self.weight_decay = self.get_config_from_sec(\n 'train', 'weight_decay', 8e-4)\n self.num_samples = self.get_config_from_sec(\n 'train', 'num_samples', 5000000)\n self.decay_epochs = self.get_config_from_sec(\n 'train', 'decay_epochs', [5])\n self.decay_gamma = self.get_config_from_sec(\n 'train', 'decay_gamma', 0.1)\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"get_config_from_sec\"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\n def build_input(self, use_pyreader):\n \"\"\"\n build input\n \"\"\"\n self.feature_input = []\n for name, dim, dtype, lod_level in zip(self.feature_names,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:60-85" + }, + "2063": { + "file_id": 163, + "content": "This code is a part of the AttentionLSTMERNIE model. It initializes the number of GPUs, learning rate, weight decay, and other parameters for training mode. The function get_config_from_sec retrieves values from a configuration file using section and item names. The build_input function constructs input data by iterating over feature names, dimensions, data types, and lod levels.", + "type": "comment" + }, + "2064": { + "file_id": 163, + "content": " self.feature_dims,\n self.feature_dtypes,\n self.feature_lod_level):\n self.feature_input.append(\n static.data(shape=dim,\n lod_level=lod_level,\n dtype=dtype,\n name=name))\n self.label_input = static.data(shape=[self.num_classes],\n dtype='float32',\n name='label')\n self.py_reader = paddle.fluid.io.PyReader(feed_list=self.feature_input +\n [self.label_input],\n capacity=1024,\n iterable=True)\n def ernie_encoder(self):\n \"\"\"\n text feature extractor\n \"\"\"\n ernie_config = ErnieConfig(\n os.path.join(self.ernie_pretrain_dict_path, 'ernie_config.json'))", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:86-108" + }, + "2065": { + "file_id": 163, + "content": "Code initializes the reader for data feeding into the model, sets the label input, and defines a function \"ernie_encoder\" that extracts text features using the Ernie model.", + "type": "comment" + }, + "2066": { + "file_id": 163, + "content": " if self.mode != 'train':\n ernie_config['attention_probs_dropout_prob'] = 0.0\n ernie_config['hidden_dropout_prob'] = 0.0\n src_ids = self.feature_input[2][:, 0]\n sent_ids = self.feature_input[2][:, 1]\n position_ids = self.feature_input[2][:, 2]\n task_ids = self.feature_input[2][:, 3]\n input_mask = self.feature_input[2][:, 4].astype('float32')\n ernie = ErnieModel(src_ids=src_ids,\n position_ids=position_ids,\n sentence_ids=sent_ids,\n task_ids=task_ids,\n input_mask=input_mask,\n config=ernie_config)\n enc_out = ernie.get_sequence_output()\n # to Freeze ERNIE param\n if self.ernie_freeze is True:\n enc_out.stop_gradient = True\n # ernie cnn\n enc_out_cnn = ernie.get_sequence_textcnn_output(enc_out, input_mask)\n enc_out_cnn_drop = paddle.nn.functional.dropout(enc_out_cnn, p=self.drop_rate, training=(self.mode=='train'))", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:109-131" + }, + "2067": 
{ + "file_id": 163, + "content": "This code initializes an ErnieModel with features extracted from input data. If self.ernie_freeze is True, it freezes the ERNIE model's parameters to prevent further training. It then retrieves the sequence output and applies a dropout if in train mode.", + "type": "comment" + }, + "2068": { + "file_id": 163, + "content": " return enc_out_cnn_drop\n def build_model(self):\n \"\"\"build_model\n \"\"\"\n # ---------------- transfer from old paddle ---------------\n # get image,audio,text feature\n video_input_tensor = self.feature_input[0]\n audio_input_tensor = self.feature_input[1]\n self.ernie_feature = self.ernie_encoder()\n # ------image------\n lstm_forward_fc = static.nn.fc(x=video_input_tensor,\n size=self.lstm_size_img * 4,\n activation=None,\n bias_attr=False)\n lstm_forward, _ = paddle.fluid.layers.dynamic_lstm(input=lstm_forward_fc,\n size=self.lstm_size_img *\n 4,\n is_reverse=False,\n use_peepholes=True)\n lsmt_backward_fc = static.nn.fc(x=video_input_tensor,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:132-154" + }, + "2069": { + "file_id": 163, + "content": "This code defines a function called \"build_model\" that creates and returns the model for video tagging. The model takes image, audio, and text features as input to generate attention-based LSTM features from the image. It applies fully connected layers (fc) on the image features and then passes them through dynamic LSTMs to obtain the forward and backward LSTM outputs. These outputs are used in further processing for video tagging.", + "type": "comment" + }, + "2070": { + "file_id": 163, + "content": " size=self.lstm_size_img * 4,\n activation=None,\n bias_attr=None)\n lstm_backward, _ = paddle.fluid.layers.dynamic_lstm(input=lsmt_backward_fc,\n size=self.lstm_size_img *\n 4,\n is_reverse=True,\n use_peepholes=True)\n lstm_forward_img = paddle.concat(\n x=[lstm_forward, lstm_backward], axis=1)\n lstm_dropout = paddle.nn.functional.dropout(lstm_forward_img, p=self.drop_rate, training=(self.mode=='train'))\n if self.lstm_pool_mode == 'text_guide':\n lstm_weight = self.attention_weight_by_feature_seq2seq_attention(\n self.ernie_feature, lstm_dropout, self.lstm_size_img * 2)\n else:\n lstm_weight = static.nn.fc(x=lstm_dropout,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:155-172" + }, + "2071": { + "file_id": 163, + "content": "This code defines a dynamic LSTM layer for image features, concatenates it with the backward pass, and applies dropout if in training mode. If 'text_guide' pooling mode is selected, it computes attention weights between text features and LSTM output using seq2seq attention. 
Otherwise, it uses an FC layer to reduce the dimensions of the LSTM output.", + "type": "comment" + }, + "2072": { + "file_id": 163, + "content": " size=1,\n activation='sequence_softmax',\n bias_attr=None)\n scaled = paddle.multiply(x=lstm_dropout,\n y=lstm_weight)\n self.lstm_pool = paddle.static.nn.sequence_pool(input=scaled,\n pool_type='sum')\n # ------audio------\n lstm_forward_fc_audio = static.nn.fc(\n x=audio_input_tensor,\n size=self.lstm_size_audio * 4,\n activation=None,\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)))\n lstm_forward_audio, _ = paddle.fluid.layers.dynamic_lstm(\n input=lstm_forward_fc_audio,\n size=self.lstm_size_audio * 4,\n is_reverse=False,\n use_peepholes=True)\n lsmt_backward_fc_audio = static.nn.fc(x=audio_input_tensor,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:173-194" + }, + "2073": { + "file_id": 163, + "content": "This code snippet is defining a LSTM model for processing both visual and audio inputs. It initializes a LSTM layer with dropout, applies element-wise multiplication with weights, performs sequence pooling on the output, and defines FC layers followed by LSTM for processing audio input. Regularization is applied to the audio LSTM layer using an L2 decay regularizer.", + "type": "comment" + }, + "2074": { + "file_id": 163, + "content": " size=self.lstm_size_audio * 4,\n activation=None,\n bias_attr=False)\n lstm_backward_audio, _ = paddle.fluid.layers.dynamic_lstm(\n input=lsmt_backward_fc_audio,\n size=self.lstm_size_audio * 4,\n is_reverse=True,\n use_peepholes=True)\n lstm_forward_audio = paddle.concat(\n x=[lstm_forward_audio, lstm_backward_audio], axis=1)\n lstm_dropout_audio = paddle.nn.functional.dropout(lstm_forward_audio, p=self.drop_rate, training=(self.mode=='train'))\n if self.lstm_pool_mode == 'text_guide':\n lstm_weight_audio = self.attention_weight_by_feature_seq2seq_attention(\n self.ernie_feature, lstm_dropout_audio,\n self.lstm_size_audio * 2)\n else:\n lstm_weight_audio = static.nn.fc(x=lstm_dropout_audio,\n size=1,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:195-214" + }, + "2075": { + "file_id": 163, + "content": "This code is creating a dynamic LSTM for audio input, reversing it, concatenating the forward and backward outputs, applying dropout if in training mode, and then performing attention weight calculation based on the pooling mode.", + "type": "comment" + }, + "2076": { + "file_id": 163, + "content": " activation='sequence_softmax',\n bias_attr=None)\n scaled_audio = paddle.multiply(x=lstm_dropout_audio,\n y=lstm_weight_audio)\n self.lstm_pool_audio = paddle.static.nn.sequence_pool(input=scaled_audio,\n pool_type='sum')\n lstm_concat = paddle.concat(\n x=[self.lstm_pool, self.lstm_pool_audio, self.ernie_feature],\n axis=1,\n name='final_concat')\n # lstm_concat = self.add_bn(lstm_concat)\n if self.loss_type == 'softmax':\n self.fc = static.nn.fc(x=lstm_concat,\n size=self.num_classes,\n activation='softmax')\n elif self.loss_type == 'sigmoid':\n self.fc = static.nn.fc(x=lstm_concat,\n size=self.num_classes,\n activation=None)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:215-235" + }, + "2077": { + "file_id": 163, + "content": "This code implements a LSTM-based attention model that combines audio and text data for video 
tagging. It consists of three main parts: LSTM layers, attention mechanism, and fully connected (FC) layer. The LSTM layers process the text and audio inputs separately and then concatenate them with the ERNIE feature. The attention mechanism is applied to calculate the weight for each LSTM output sequence. The FC layer has a softmax activation function if loss type is set to 'softmax', otherwise, it uses no activation when loss type is 'sigmoid'.", + "type": "comment" + }, + "2078": { + "file_id": 163, + "content": " self.logit = self.fc\n self.fc = paddle.nn.functional.sigmoid(self.fc)\n self.network_outputs = [self.fc]\n def attention_weight_by_feature_seq2seq_attention(\n self,\n text_feature,\n sequence_feature,\n sequence_feature_dim,\n name_prefix=\"seq2seq_attention\"):\n \"\"\"\n caculate weight by feature\n Neural Machine Translation by Jointly Learning to Align and Translate\n \"\"\"\n text_feature_expand = paddle.static.nn.sequence_expand(text_feature,\n sequence_feature,\n ref_level=0)\n sequence_text_concat = paddle.concat(\n x=[sequence_feature, text_feature_expand],\n axis=-1,\n name='video_text_concat')\n energy = static.nn.fc(x=sequence_text_concat,\n size=sequence_feature_dim,\n activation='tanh',", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:236-260" + }, + "2079": { + "file_id": 163, + "content": "This code calculates attention weights for a feature sequence using a Neural Machine Translation approach. It expands the text feature across the sequence, concatenates it with the original sequence feature, and passes it through an FC layer with 'tanh' activation to calculate energy values. The calculated energy is then used to determine attention weights by feature.", + "type": "comment" + }, + "2080": { + "file_id": 163, + "content": " name=name_prefix + \"_tanh_fc\")\n weight_vector = static.nn.fc(x=energy,\n size=1,\n activation='sequence_softmax',\n bias_attr=None,\n name=name_prefix + \"_softmax_fc\")\n return weight_vector\n def add_bn(self, lstm_concat):\n \"\"\"\n v2.5 add drop out and batch norm\n \"\"\"\n input_fc_proj = static.nn.fc(\n x=lstm_concat,\n size=8192,\n activation=None,\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)))\n input_fc_proj_bn = paddle.static.nn.batch_norm(\n input=input_fc_proj,\n act=\"relu\",\n is_test=(not self.mode == 'train'))\n input_fc_proj_dropout = paddle.nn.functional.dropout(\n input_fc_proj_bn,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:261-285" + }, + "2081": { + "file_id": 163, + "content": "This function adds dropout and batch normalization to the LSTM concatenation. It projects the input to 8192 dimensions using an FC layer, applies batch normalization, and then applies a relu activation if in training mode. If not in training mode (is_test), it skips the batch normalization step. 
Finally, it applies dropout to the result.", + "type": "comment" + }, + "2082": { + "file_id": 163, + "content": " p=self.drop_rate,\n training=(self.mode=='train'))\n input_fc_hidden = static.nn.fc(\n x=input_fc_proj_dropout,\n size=4096,\n activation=None,\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)))\n input_fc_hidden_bn = paddle.static.nn.batch_norm(\n input=input_fc_hidden,\n act=\"relu\",\n is_test=(not self.mode == 'train'))\n input_fc_hidden_dropout = paddle.nn.functional.dropout(\n input_fc_hidden_bn,\n p=self.drop_rate,\n training=(self.mode=='train'))\n return input_fc_hidden_dropout\n def optimizer(self):\n \"\"\"\n optimizer\n \"\"\"\n assert self.mode == 'train', \"optimizer only can be get in train mode\"\n values = [\n self.learning_rate * (self.decay_gamma ** i)\n for i in range(len(self.decay_epochs) + 1)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:286-312" + }, + "2083": { + "file_id": 163, + "content": "This code defines an attention LSTM model using Ernie. It applies dropout, fully connected layers with batch normalization and dropout again. The optimizer function sets a learning rate that decays over specified epochs.", + "type": "comment" + }, + "2084": { + "file_id": 163, + "content": " ]\n iter_per_epoch = self.num_samples / self.batch_size\n boundaries = [e * iter_per_epoch for e in self.decay_epochs]\n return paddle.optimizer.RMSProp(\n learning_rate=paddle.optimizer.lr.PiecewiseDecay(values=values,\n boundaries=boundaries),\n centered=True,\n weight_decay=paddle.regularizer.L2Decay(\n coeff=self.weight_decay))\n def softlabel_cross_entropy_loss(self):\n \"\"\"\n softlabel_cross_entropy_loss\n \"\"\"\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n '''\n cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \\\n label=self.label_input)\n '''\n cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \\\n label=self.label_input,\n soft_label=True)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:313-334" + }, + "2085": { + "file_id": 163, + "content": "This code defines a class that contains two methods: one for initializing an optimizer with piecewise decay learning rate and another for calculating the softlabel cross-entropy loss. The optimizer uses RMSProp algorithm and decays the learning rate based on defined epochs. 
The loss is calculated with the soft-label version of cross entropy, which is appropriate when the training targets are probability distributions over labels.", + "type": "comment" + }, + "2086": { + "file_id": 163, + "content": " cost = paddle.sum(x=cost, axis=-1)\n sum_cost = paddle.sum(x=cost)\n self.loss_ = paddle.scale(sum_cost,\n scale=self.num_gpus,\n bias_after_scale=False)\n return self.loss_\n def sigmoid_cross_entropy_loss(self):\n \"\"\"\n sigmoid_cross_entropy_loss\n \"\"\"\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n cost = paddle.nn.functional.binary_cross_entropy(input=self.logit,\\\n label=self.label_input, reduction=None)\n cost = paddle.sum(x=cost, axis=-1)\n sum_cost = paddle.sum(x=cost)\n self.loss_ = paddle.scale(sum_cost,\n scale=self.num_gpus,\n bias_after_scale=False)\n return self.loss_\n def loss(self):\n \"\"\"\n loss\n \"\"\"\n if self.loss_type == 'sigmoid':\n return self.sigmoid_cross_entropy_loss()", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:336-365" + }, + "2087": { + "file_id": 163, + "content": "The code defines a loss function that dispatches on the configured loss type ('sigmoid' or soft-label) and returns the calculated loss value. It computes the sum of losses (sum_cost) and scales it by the number of GPUs (self.num_gpus). For the 'sigmoid' loss type, it uses binary cross-entropy with no reduction, then sums the loss over the label dimension and over the batch. The scale operation adjusts the loss value for distributed training across multiple GPUs.", + "type": "comment" + }, + "2088": { + "file_id": 163, + "content": " else:\n return self.softlabel_cross_entropy_loss()\n def outputs(self):\n \"\"\"\n get outputs\n \"\"\"\n return self.network_outputs\n def feeds(self):\n \"\"\"\n get feeds\n \"\"\"\n return self.feature_input if self.mode == 'infer' else self.feature_input + [\n self.label_input\n ]\n def pyreader(self):\n \"\"\"pyreader\"\"\"\n return self.py_reader\n def epoch_num(self):\n \"\"\"get train epoch num\"\"\"\n return self.cfg.TRAIN.epoch\n def load_test_weights_file(self, exe, weights, prog, place):\n \"\"\"\n load_test_weights_file\n \"\"\"\n load_vars = [x for x in prog.list_vars() \\\n if isinstance(x, paddle.framework.Parameter)]\n static.load_vars(exe,\n dirname=weights,\n vars=load_vars,\n filename=\"param\")", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:366-400" + }, + "2089": { + "file_id": 163, + "content": "This code defines several accessor methods for the model class. The `loss()` method falls back to `softlabel_cross_entropy_loss()` when the loss type is not 'sigmoid'. The `outputs()` method returns the network outputs. The `feeds()` method returns the feature inputs, plus the label input outside of infer mode. The `pyreader()` method returns the PyReader object, `epoch_num()` returns the number of training epochs, and `load_test_weights_file()` loads saved parameters into the given program.", + "type": "comment" + }, + "2090": { + "file_id": 164, + "content": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py", + "type": "filepath" + }, + "2091": { + "file_id": 164, + "content": "The code defines an Ernie model configuration and initializes the ERNIE model using Paddle's embedding layer. It also includes a multimodal video tagging model, embeddings, data pre-processing, attention mask creation, encoder usage, and TextCNN for sequence feature extraction. 
The code creates 1D convolutional layers with specified parameters and returns the output.", + "type": "summary" + }, + "2092": { + "file_id": 164, + "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Ernie model.\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nfrom __future__ import absolute_import\nimport json\nimport six\nimport logging\nimport paddle\nimport paddle.static as static\nfrom io import open\nfrom .transformer_encoder import encoder, pre_process_layer\nlog = logging.getLogger(__name__)\nclass ErnieConfig(object):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:1-33" + }, + "2093": { + "file_id": 164, + "content": "This code snippet contains the Ernie model class definition. It imports necessary modules, defines logging, and initializes a class for configuring the Ernie model. The class inherits from `object` and represents the configuration to be used in the Ernie model architecture.", + "type": "comment" + }, + "2094": { + "file_id": 164, + "content": " \"\"\"\n Erine model config\n \"\"\"\n def __init__(self, config_path):\n \"\"\"\n init\n \"\"\"\n self._config_dict = self._parse(config_path)\n def _parse(self, config_path):\n \"\"\"\n parse config\n \"\"\"\n try:\n with open(config_path, 'r', encoding='utf8') as json_file:\n config_dict = json.load(json_file)\n except Exception:\n raise IOError(\"Error in parsing Ernie model config file '%s'\" %\n config_path)\n else:\n return config_dict\n def __getitem__(self, key):\n \"\"\"\n get item\n \"\"\"\n return self._config_dict.get(key, None)\n def __setitem__(self, key, value):\n \"\"\"\n set item\n \"\"\"\n self._config_dict[key] = value\n def print_config(self):\n \"\"\"\n print config\n \"\"\"\n for arg, value in sorted(six.iteritems(self._config_dict)):\n log.info('%s: %s' % (arg, value))\n log.info('------------------------------------------------')", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:34-73" + }, + "2095": { + "file_id": 164, + "content": "This code defines a class for an Ernie model configuration. 
It initializes the config with a given path, parses the config file using JSON, allows getting and setting items from/to the configuration dictionary, and provides a print_config method to display the configuration in a readable format.", + "type": "comment" + }, + "2096": { + "file_id": 164, + "content": "class ErnieModel(object):\n \"\"\"\n ERINE Model\n \"\"\"\n def __init__(self,\n src_ids,\n position_ids,\n sentence_ids,\n task_ids,\n input_mask,\n config,\n weight_sharing=True,\n use_fp16=False):\n \"\"\"\n init model\n \"\"\"\n self._emb_size = config['hidden_size']\n self._n_layer = config['num_hidden_layers']\n self._n_head = config['num_attention_heads']\n self._voc_size = config['vocab_size']\n self._max_position_seq_len = config['max_position_embeddings']\n if config['sent_type_vocab_size']:\n self._sent_types = config['sent_type_vocab_size']\n else:\n self._sent_types = config['type_vocab_size']\n self._use_task_id = config['use_task_id']\n if self._use_task_id:\n self._task_types = config['task_type_vocab_size']\n self._hidden_act = config['hidden_act']\n self._prepostprocess_dropout = config['hidden_dropout_prob']", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:76-106" + }, + "2097": { + "file_id": 164, + "content": "The code defines the ErnieModel class, which initializes an ERINE model with parameters such as source ids, position ids, sentence ids, task ids, input mask, configuration, weight sharing, and use of fp16. The class attributes are initialized based on the provided configuration.", + "type": "comment" + }, + "2098": { + "file_id": 164, + "content": " self._attention_dropout = config['attention_probs_dropout_prob']\n self._weight_sharing = weight_sharing\n self._word_emb_name = \"word_embedding\"\n self._pos_emb_name = \"pos_embedding\"\n self._sent_emb_name = \"sent_embedding\"\n self._task_emb_name = \"task_embedding\"\n self._dtype = \"float16\" if use_fp16 else \"float32\"\n self._emb_dtype = \"float32\"\n # Initialize all weigths by truncated normal initializer, and all biases\n # will be initialized by constant zero by default.\n self._param_initializer = paddle.nn.initializer.TruncatedNormal(\n std=config['initializer_range'])\n self._build_model(src_ids, position_ids, sentence_ids, task_ids,\n input_mask)\n def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,\n input_mask):\n \"\"\"\n build model\n \"\"\"\n # padding id in vocabulary must be set to 0\n emb_out = static.nn.embedding(\n input=src_ids,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:107-132" + }, + "2099": { + "file_id": 164, + "content": "This code initializes the ERNIE model parameters and builds the model. It sets various attributes such as attention dropout probability, embedding names for word, position, sentence, and task, data types, and initializer range. 
The _build_model function is then called to create the model using Paddle's embedding layer.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/21.json b/docs/data/21.json new file mode 100644 index 000000000..c29f31fc0 --- /dev/null +++ b/docs/data/21.json @@ -0,0 +1,546 @@ +{ + "2100": { + "file_id": 164, + "content": " size=[self._voc_size, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._word_emb_name, initializer=self._param_initializer),\n is_sparse=False)\n position_emb_out = static.nn.embedding(\n input=position_ids,\n size=[self._max_position_seq_len, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._pos_emb_name, initializer=self._param_initializer))\n sent_emb_out = static.nn.embedding(\n sentence_ids,\n size=[self._sent_types, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._sent_emb_name, initializer=self._param_initializer))\n # emb_out = emb_out + position_emb_out\n # emb_out = emb_out + sent_emb_out\n emb_out = paddle.add(x=emb_out, y=position_emb_out)\n emb_out = paddle.add(x=emb_out, y=sent_emb_out)\n if self._use_task_id:", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:133-158" + }, + "2101": { + "file_id": 164, + "content": "This code initializes and concatenates three embeddings - word, position, and sentence - in a multimodal video tagging model. The embeddings are defined with specific sizes and data types. Two embeddings (position_emb_out and sent_emb_out) are added to the original embedding (emb_out), and then these combined embeddings are returned.", + "type": "comment" + }, + "2102": { + "file_id": 164, + "content": " task_emb_out = static.nn.embedding(\n task_ids,\n size=[self._task_types, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._task_emb_name,\n initializer=self._param_initializer))\n emb_out = emb_out + task_emb_out\n emb_out = pre_process_layer(\n emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')\n if self._dtype == \"float16\":\n emb_out = paddle.cast(x=emb_out, dtype=self._dtype)\n input_mask = paddle.cast(x=input_mask, dtype=self._dtype)\n self_attn_mask = paddle.matmul(\n x=input_mask, y=input_mask, transpose_y=True)\n self_attn_mask = paddle.scale(\n x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)\n n_head_self_attn_mask = paddle.stack(\n x=[self_attn_mask] * self._n_head, axis=1)\n n_head_self_attn_mask.stop_gradient = True\n self._enc_out = encoder(", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:159-184" + }, + "2103": { + "file_id": 164, + "content": "This code initializes an embedding layer for task types, adds it to the embeddings, applies pre-processing with dropout if necessary, and casts the embeddings to the desired dtype. 
It also creates a self-attention mask, stacks it for each attention head, sets its gradient to stop during backpropagation, and passes the embeddings through an encoder.", + "type": "comment" + }, + "2104": { + "file_id": 164, + "content": " enc_input=emb_out,\n attn_bias=n_head_self_attn_mask,\n n_layer=self._n_layer,\n n_head=self._n_head,\n d_key=self._emb_size // self._n_head,\n d_value=self._emb_size // self._n_head,\n d_model=self._emb_size,\n d_inner_hid=self._emb_size * 4,\n prepostprocess_dropout=self._prepostprocess_dropout,\n attention_dropout=self._attention_dropout,\n relu_dropout=0,\n hidden_act=self._hidden_act,\n preprocess_cmd=\"\",\n postprocess_cmd=\"dan\",\n param_initializer=self._param_initializer,\n name='encoder')\n if self._dtype == \"float16\":\n self._enc_out = paddle.cast(\n x=self._enc_out, dtype=self._emb_dtype)\n def get_sequence_output(self):\n \"\"\"\n get sequence output\n \"\"\"\n return self._enc_out\n def get_sequence_textcnn_output(self, sequence_feature, input_mask):\n \"\"\"\n get sequence output\n \"\"\"", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:185-215" + }, + "2105": { + "file_id": 164, + "content": "This code builds the transformer encoder over the combined embeddings. The encoder takes parameters such as embedding size, number of layers and heads, dropout rates, and activation function, and the output is cast back to float32 when fp16 is used. The `get_sequence_output` method returns the sequence output from the encoder, and `get_sequence_textcnn_output` takes a feature sequence and an input mask to generate the output.", + "type": "comment" + }, + "2106": { + "file_id": 164, + "content": " seq_len = paddle.sum(x=input_mask, axis=[1, 2])\n seq_len = paddle.cast(seq_len, 'int64')\n sequence_feature = paddle.static.nn.sequence_unpad(sequence_feature, seq_len)\n return self.textcnn(sequence_feature)\n def get_pooled_output(self):\n \"\"\"Get the first feature of each sequence for classification\"\"\"\n next_sent_feat = paddle.slice(\n input=self._enc_out, axes=[1], starts=[0], ends=[1])\n next_sent_feat = static.nn.fc(\n x=next_sent_feat,\n size=self._emb_size,\n activation=\"tanh\",\n weight_attr=paddle.ParamAttr(\n name=\"pooled_fc.w_0\", initializer=self._param_initializer),\n bias_attr=\"pooled_fc.b_0\")\n return next_sent_feat\n def textcnn(self, feature, name='text_cnn'):\n \"\"\"\n TextCNN sequence feature extraction\n \"\"\"\n win_sizes = [2, 3, 4]\n hid_dim = 256\n convs = []\n for win_size in win_sizes:\n conv_h = paddle.fluid.nets.sequence_conv_pool(input=feature,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:216-243" + }, + "2107": { + "file_id": 164, + "content": "This code defines a TextCNN module for sequence feature extraction. It computes the real sequence lengths from the input mask, removes the padding with sequence_unpad, applies convolutions with various window sizes, and pools the results. The get_pooled_output function extracts the first feature of each sequence for classification by applying an FC layer with tanh activation. 
The textcnn function initializes a TextCNN model with specified window sizes and hidden dimensions.", + "type": "comment" + }, + "2108": { + "file_id": 164, + "content": " num_filters=hid_dim,\n filter_size=win_size,\n act=\"tanh\",\n pool_type=\"max\")\n convs.append(conv_h)\n convs_out = paddle.concat(x=convs, axis=1)\n return convs_out", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:244-250" + }, + "2109": { + "file_id": 164, + "content": "This code is creating a 1D convolutional layer with specified parameters, including the number of filters, filter size, activation function, and pooling type. The resulting convolutional layers are appended to the `convs` list, and then concatenated along axis 1 to form `convs_out`. Finally, the function returns `convs_out`.", + "type": "comment" + }, + "2110": { + "file_id": 165, + "content": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py", + "type": "filepath" + }, + "2111": { + "file_id": 165, + "content": "This code defines a PaddlePaddle transformer encoder layer for normalization and training, including residual connections, dropout, self-attention mechanism, and position-wise feed-forward networks. It creates a Transformer Encoder with Scaled Dot-Product Attention for NLP tasks.", + "type": "summary" + }, + "2112": { + "file_id": 165, + "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Transformer encoder.\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom functools import partial\nimport paddle\nimport paddle.static as static\ndef multi_head_attention(queries,\n keys,\n values,\n attn_bias,\n d_key,\n d_value,\n d_model,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:1-32" + }, + "2113": { + "file_id": 165, + "content": "This code defines a function called \"multi_head_attention\" which performs multi-head attention operations on queries, keys, and values. The function takes in additional parameters such as attn_bias, d_key, d_value, d_model. This is part of the Transformer encoder model implementation in PaddlePaddle framework.", + "type": "comment" + }, + "2114": { + "file_id": 165, + "content": " n_head=1,\n dropout_rate=0.,\n cache=None,\n param_initializer=None,\n name='multi_head_att'):\n \"\"\"\n Multi-Head Attention. 
Note that attn_bias is added to the logit before\n computing softmax activiation to mask certain selected positions so that\n they will not considered in attention weights.\n \"\"\"\n keys = queries if keys is None else keys\n values = keys if values is None else values\n if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):\n raise ValueError(\n \"Inputs: quries, keys and values should all be 3-D tensors.\")\n def __compute_qkv(queries, keys, values, n_head, d_key, d_value):\n \"\"\"\n Add linear projection to queries, keys, and values.\n \"\"\"\n q = static.nn.fc(x=queries,\n size=d_key * n_head,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:33-57" + }, + "2115": { + "file_id": 165, + "content": "This code snippet defines a Multi-Head Attention layer. It takes in queries, keys (optional), and values (optional) as inputs, and performs linear projections on the queries before computing the attention weights. The function __compute_qkv also handles the case when keys or values are None by setting them to be equal to queries if needed. The inputs should all be 3-D tensors.", + "type": "comment" + }, + "2116": { + "file_id": 165, + "content": " name=name + '_query_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_query_fc.b_0')\n k = static.nn.fc(x=keys,\n size=d_key * n_head,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_key_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_key_fc.b_0')\n v = static.nn.fc(x=values,\n size=d_value * n_head,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_value_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_value_fc.b_0')\n return q, k, v\n def __split_heads(x, n_head):\n \"\"\"\n Reshape the last dimension of inpunt tensor x so that it becomes two\n dimensions and then transpose. Specifically, input a tensor with shape", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:58-80" + }, + "2117": { + "file_id": 165, + "content": "This code defines a function for the Transformer Encoder layer. It includes functions for multi-head attention, position-wise feed-forward network layers, and splits heads of input tensors. Parameters such as d_key, n_head, and param_initializer are used to define the dimensions and initialization methods for weights. 
The code uses Paddle's static nn library and defines the names for different FC layers within the function.", + "type": "comment" + }, + "2118": { + "file_id": 165, + "content": " [bs, max_sequence_length, n_head * hidden_dim] then output a tensor\n with shape [bs, n_head, max_sequence_length, hidden_dim].\n \"\"\"\n hidden_size = x.shape[-1]\n # The value 0 in shape attr means copying the corresponding dimension\n # size of the input as the output dimension size.\n reshaped = paddle.reshape(\n x=x, shape=[0, 0, n_head, hidden_size // n_head])\n # permuate the dimensions into:\n # [batch_size, n_head, max_sequence_len, hidden_size_per_head]\n return paddle.transpose(x=reshaped, perm=[0, 2, 1, 3])\n def __combine_heads(x):\n \"\"\"\n Transpose and then reshape the last two dimensions of inpunt tensor x\n so that it becomes one dimension, which is reverse to __split_heads.\n \"\"\"\n if len(x.shape) == 3: return x\n if len(x.shape) != 4:\n raise ValueError(\"Input(x) should be a 4-D Tensor.\")\n trans_x = paddle.transpose(x, perm=[0, 2, 1, 3])\n # The value 0 in shape attr means copying the corresponding dimension", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:81-104" + }, + "2119": { + "file_id": 165, + "content": "This code is performing tensor reshaping and transposing operations to split the input tensor into multiple smaller tensors, representing different attention heads. The `__split_heads` function splits the tensor into a shape of [bs, n_head, max_sequence_length, hidden_dim], while the `__combine_heads` function reverses this process by transposing and reshaping the last two dimensions to combine the attention heads back into one dimension.", + "type": "comment" + }, + "2120": { + "file_id": 165, + "content": " # size of the input as the output dimension size.\n return paddle.reshape(\n x=trans_x,\n shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]])\n def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):\n \"\"\"\n Scaled Dot-Product Attention\n \"\"\"\n scaled_q = paddle.scale(x=q, scale=d_key**-0.5)\n product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)\n if attn_bias:\n # product += attn_bias\n product = paddle.add(x=product, y=attn_bias)\n weights = paddle.nn.functional.softmax(x=product)\n if dropout_rate:\n weights = paddle.nn.functional.dropout(weights, p=dropout_rate, mode=\"upscale_in_train\", training=True)\n out = paddle.matmul(x=weights, y=v)\n return out\n q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)\n if cache is not None: # use cache and concat time steps\n # Since the inplace reshape in __split_heads changes the shape of k and", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:105-128" + }, + "2121": { + "file_id": 165, + "content": "This code defines a function that performs Scaled Dot-Product Attention. It first scales the query vector by dividing it with the square root of the key dimension, then takes the dot product between scaled query and key matrices after transposing the key matrix. If attention bias is provided, it adds it to the product. It applies softmax activation on the result to get weights, which are optionally dropout masked if a dropout rate is specified. Finally, it computes the output vector by taking the weighted sum of value vectors. 
This function is used in the context of Transformer Encoder layers.", + "type": "comment" + }, + "2122": { + "file_id": 165, + "content": " # v, which is the cache input for next time step, reshape the cache\n # input from the previous time step first.\n k = cache[\"k\"] = paddle.concat(\n x=[paddle.reshape(\n x=cache[\"k\"], shape=[0, 0, d_model]), k], axis=1)\n v = cache[\"v\"] = paddle.concat(\n x=[paddle.reshape(\n x=cache[\"v\"], shape=[0, 0, d_model]), v], axis=1)\n q = __split_heads(q, n_head)\n k = __split_heads(k, n_head)\n v = __split_heads(v, n_head)\n ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,\n dropout_rate)\n out = __combine_heads(ctx_multiheads)\n # Project back to the model size.\n proj_out = static.nn.fc(x=out,\n size=d_model,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_output_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_output_fc.b_0')", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:129-154" + }, + "2123": { + "file_id": 165, + "content": "This code is reshaping the cache input for the next time step and splitting the inputs into multiple heads. It performs scaled dot product attention, combines the outputs of each head, and projects the result back to the model size using a fully connected layer.", + "type": "comment" + }, + "2124": { + "file_id": 165, + "content": " return proj_out\ndef positionwise_feed_forward(x,\n d_inner_hid,\n d_hid,\n dropout_rate,\n hidden_act,\n param_initializer=None,\n name='ffn'):\n \"\"\"\n Position-wise Feed-Forward Networks.\n This module consists of two linear transformations with a ReLU activation\n in between, which is applied to each position separately and identically.\n \"\"\"\n hidden = static.nn.fc(x=x,\n size=d_inner_hid,\n num_flatten_dims=2,\n activation=hidden_act,\n weight_attr=paddle.ParamAttr(\n name=name + '_fc_0.w_0',\n initializer=param_initializer),\n bias_attr=name + '_fc_0.b_0')\n if dropout_rate:\n hidden = paddle.nn.functional.dropout(\n hidden,\n p=dropout_rate,\n mode=\"upscale_in_train\",", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:155-182" + }, + "2125": { + "file_id": 165, + "content": "This code defines the position-wise feed-forward network used in a transformer encoder. It consists of two linear transformations with a ReLU activation applied to each position separately and identically. 
The hidden layer is passed through a dropout if dropout_rate is specified.", + "type": "comment" + }, + "2126": { + "file_id": 165, + "content": " training=True)\n out = static.nn.fc(x=hidden,\n size=d_hid,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_fc_1.w_0', initializer=param_initializer),\n bias_attr=name + '_fc_1.b_0')\n return out\ndef pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,\n name=''):\n \"\"\"\n Add residual connection, layer normalization and droput to the out tensor\n optionally according to the value of process_cmd.\n This will be used before or after multi-head attention and position-wise\n feed-forward networks.\n \"\"\"\n for cmd in process_cmd:\n if cmd == \"a\": # add residual connection\n # out = out + prev_out if prev_out else out\n out = paddle.add(x=out, y=prev_out) if prev_out else out\n elif cmd == \"n\": # add layer normalization\n out_dtype = out.dtype\n if out_dtype == \"float16\":\n out = paddle.cast(x=out, dtype=\"float32\")", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:183-208" + }, + "2127": { + "file_id": 165, + "content": "This code defines a function for a transformer encoder layer in the PaddleVideo MultimodalVideoTag application. The layer includes a multi-head attention mechanism and a position-wise feed-forward network, with residual connections and layer normalization added before or after these operations, as specified by the process_cmd argument.", + "type": "comment" + }, + "2128": { + "file_id": 165, + "content": " out = static.nn.layer_norm(\n out,\n begin_norm_axis=len(out.shape) - 1,\n param_attr=paddle.ParamAttr(\n name=name + '_layer_norm_scale',\n initializer=paddle.nn.initializer.Constant(value=1.)),\n bias_attr=paddle.ParamAttr(\n name=name + '_layer_norm_bias',\n initializer=paddle.nn.initializer.Constant(value=0.)))\n if out_dtype == \"float16\":\n out = paddle.cast(x=out, dtype=\"float16\")\n elif cmd == \"d\": # add dropout\n if dropout_rate:\n out = paddle.nn.functional.dropout(\n out,\n p=dropout_rate,\n dropout_implementation=\"upscale_in_train\",\n training=True)\n return out\npre_process_layer = partial(pre_post_process_layer, None)\npost_process_layer = pre_post_process_layer\ndef encoder_layer(enc_input,\n attn_bias,\n n_head,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:209-236" + }, + "2129": { + "file_id": 165, + "content": "This code is part of a transformer encoder layer implementation in PaddlePaddle. It applies layer normalization, optional float16 casting, and optionally dropout for training. The pre_process_layer and post_process_layer are partial functions used for data pre-processing and post-processing respectively. 
The encoder_layer function takes input, attention bias, and number of heads as inputs to create a transformer encoder layer.", + "type": "comment" + }, + "2130": { + "file_id": 165, + "content": " d_key,\n d_value,\n d_model,\n d_inner_hid,\n prepostprocess_dropout,\n attention_dropout,\n relu_dropout,\n hidden_act,\n preprocess_cmd=\"n\",\n postprocess_cmd=\"da\",\n param_initializer=None,\n name=''):\n \"\"\"The encoder layers that can be stacked to form a deep encoder.\n This module consits of a multi-head (self) attention followed by\n position-wise feed-forward networks and both the two components companied\n with the post_process_layer to add residual connection, layer normalization\n and droput.\n \"\"\"\n attn_output = multi_head_attention(\n pre_process_layer(\n enc_input,\n preprocess_cmd,\n prepostprocess_dropout,\n name=name + '_pre_att'),\n None,\n None,\n attn_bias,\n d_key,\n d_value,\n d_model,\n n_head,\n attention_dropout,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:237-268" + }, + "2131": { + "file_id": 165, + "content": "This code defines a transformer encoder layer that stacks multiple layers to form a deep encoder. The encoder consists of a multi-head self-attention mechanism followed by position-wise feed-forward networks, all with residual connections and layer normalization to add dropout.", + "type": "comment" + }, + "2132": { + "file_id": 165, + "content": " param_initializer=param_initializer,\n name=name + '_multi_head_att')\n attn_output = post_process_layer(\n enc_input,\n attn_output,\n postprocess_cmd,\n prepostprocess_dropout,\n name=name + '_post_att')\n ffd_output = positionwise_feed_forward(\n pre_process_layer(\n attn_output,\n preprocess_cmd,\n prepostprocess_dropout,\n name=name + '_pre_ffn'),\n d_inner_hid,\n d_model,\n relu_dropout,\n hidden_act,\n param_initializer=param_initializer,\n name=name + '_ffn')\n return post_process_layer(\n attn_output,\n ffd_output,\n postprocess_cmd,\n prepostprocess_dropout,\n name=name + '_post_ffn')\ndef encoder(enc_input,\n attn_bias,\n n_layer,\n n_head,\n d_key,\n d_value,\n d_model,\n d_inner_hid,\n prepostprocess_dropout,\n attention_dropout,\n relu_dropout,\n hidden_act,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:269-308" + }, + "2133": { + "file_id": 165, + "content": "This code defines a transformer encoder model. It utilizes an attention mechanism to process input sequences, followed by position-wise feed forward layers. 
The function takes input sequences, attention bias, number of layers, number of heads, and other parameters as inputs and returns the processed output.", + "type": "comment" + }, + "2134": { + "file_id": 165, + "content": " preprocess_cmd=\"n\",\n postprocess_cmd=\"da\",\n param_initializer=None,\n name=''):\n \"\"\"\n The encoder is composed of a stack of identical layers returned by calling\n encoder_layer.\n \"\"\"\n for i in range(n_layer):\n enc_output = encoder_layer(\n enc_input,\n attn_bias,\n n_head,\n d_key,\n d_value,\n d_model,\n d_inner_hid,\n prepostprocess_dropout,\n attention_dropout,\n relu_dropout,\n hidden_act,\n preprocess_cmd,\n postprocess_cmd,\n param_initializer=param_initializer,\n name=name + '_layer_' + str(i))\n enc_input = enc_output\n enc_output = pre_process_layer(\n enc_output, preprocess_cmd, prepostprocess_dropout, name=\"post_encoder\")\n return enc_output", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:309-338" + }, + "2135": { + "file_id": 165, + "content": "This code defines a function to create an encoder consisting of multiple layers, where each layer is generated by calling the \"encoder_layer\" function. The encoder takes in input, attention bias, number of heads, dimensionality of keys and values, model dimensions, inner hidden dimensions, and dropout rates for preprocessing and postprocessing. The function applies each layer to the input sequentially, updating the input with each iteration. Finally, it applies a pre-processing layer to the output using specified preprocessing command and prepostprocess_dropout.", + "type": "comment" + }, + "2136": { + "file_id": 166, + "content": "/applications/MultimodalVideoTag/scenario_lib/train.py", + "type": "filepath" + }, + "2137": { + "file_id": 166, + "content": "This code initializes a PaddlePaddle model for video training, sets up feeds and outputs, configures loss and optimizer, builds the model, prepares programs, trains, logs, and saves it. The main function handles arguments, checks save directory, and executes the training process.", + "type": "summary" + }, + "2138": { + "file_id": 166, + "content": "\"\"\"\ntrain main\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport argparse\nimport logging\nimport numpy as np\nimport paddle\npaddle.enable_static()\nimport paddle.static as static\nfrom accuracy_metrics import MetricsCalculator\nfrom datareader import get_reader\nfrom config import print_configs, merge_configs, parse_config\nfrom models.attention_lstm_ernie import AttentionLstmErnie\nfrom utils import init_pretraining_params, train_with_pyreader", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:1-34" + }, + "2139": { + "file_id": 166, + "content": "The code imports necessary libraries and modules, enables static mode for PaddlePaddle, initializes a model (AttentionLstmErnie), defines train_with_pyreader function, and handles config file operations. It follows the Apache License 2.0 and provides information for obtaining the license.", + "type": "comment" + }, + "2140": { + "file_id": 166, + "content": "logging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'\nlogging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n \"\"\"parse_args\n \"\"\"\n parser = argparse.ArgumentParser(\"Paddle Video train script\")\n parser.add_argument(\n '--model_name',\n type=str,\n default='BaiduNet',\n help='name of model to train.')\n parser.add_argument(\n '--config',\n type=str,\n default='configs/conf.txt',\n help='path to config file of model')\n parser.add_argument(\n '--batch_size',\n type=int,\n default=None,\n help='training batch size. None to use config file setting.')\n parser.add_argument(\n '--learning_rate',\n type=float,\n default=None,\n help='learning rate use for training. None to use config file setting.')\n parser.add_argument(\n '--pretrain',\n type=str,\n default=None,\n help='path to pretrain weights. None to use default weights path in ~/.paddle/weights.'", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:37-71" + }, + "2141": { + "file_id": 166, + "content": "This code sets up the logging configuration and parses command-line arguments for training a video model using Paddle Video. The default model name is 'BaiduNet', and the config file path is 'configs/conf.txt'. It also allows setting the batch size, learning rate, and pretrain weights through command-line flags.", + "type": "comment" + }, + "2142": { + "file_id": 166, + "content": " )\n parser.add_argument(\n '--resume',\n type=str,\n default=None,\n help='path to resume training based on previous checkpoints. 
'\n 'None for not resuming any checkpoints.')\n parser.add_argument(\n '--use_gpu', type=bool, default=True, help='default use gpu.')\n parser.add_argument(\n '--no_use_pyreader',\n action='store_true',\n default=False,\n help='whether to use pyreader')\n parser.add_argument(\n '--no_memory_optimize',\n action='store_true',\n default=False,\n help='whether to use memory optimize in train')\n parser.add_argument(\n '--epoch_num',\n type=int,\n default=0,\n help='epoch number, 0 for read from config file')\n parser.add_argument(\n '--valid_interval',\n type=int,\n default=1,\n help='validation epoch interval, 0 for no validation.')\n parser.add_argument(\n '--save_dir',\n type=str,\n default='checkpoints',\n help='directory name to save train snapshoot')", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:72-105" + }, + "2143": { + "file_id": 166, + "content": "This code snippet from the PaddleVideo library's MultimodalVideoTag application defines command line argument options for training. Options include resuming training, GPU usage, disabling pyreader, memory optimization during training, epoch number, validation interval, and saving directory.", + "type": "comment" + }, + "2144": { + "file_id": 166, + "content": " parser.add_argument(\n '--log_interval',\n type=int,\n default=10,\n help='mini-batch interval to log.')\n parser.add_argument(\n '--save_log_name',\n type=str,\n default='train_val',\n help='save to tensorboard filename recommand model name.')\n args = parser.parse_args()\n return args\ndef train(args):\n \"\"\"train main\n \"\"\"\n # parse config\n config = parse_config(args.config)\n train_config = merge_configs(config, 'train', vars(args))\n valid_config = merge_configs(config, 'valid', vars(args))\n print_configs(train_config, 'Train')\n train_model = AttentionLstmErnie(args.model_name, train_config, mode='train')\n valid_model = AttentionLstmErnie(args.model_name, valid_config, mode='valid')\n max_train_steps = train_config.TRAIN.epoch * train_config.TRAIN.num_samples // train_config.TRAIN.batch_size\n print('max train steps %d' % (max_train_steps))\n # build model\n startup = static.Program()\n train_prog = static.Program()\n with static.program_guard(train_prog, startup):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:106-136" + }, + "2145": { + "file_id": 166, + "content": "This code defines command-line arguments for the mini-batch interval to log and the save filename, parses the configuration file, creates train and valid models based on the model name and configurations, sets the maximum number of training steps, and prepares static programs for building the model.", + "type": "comment" + }, + "2146": { + "file_id": 166, + "content": " paddle.disable_static()\n train_model.build_input(use_pyreader=True)\n train_model.build_model()\n # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label\n train_feeds = train_model.feeds()\n train_feeds[-1].persistable = True\n # for the output of classification model, has the form [pred]\n train_outputs = train_model.outputs()\n for output in train_outputs:\n output.persistable = True\n train_loss = train_model.loss()\n train_loss.persistable = True\n # outputs, loss, label should be fetched, so set persistable to be true\n optimizer = train_model.optimizer()\n optimizer.minimize(train_loss)\n train_pyreader = train_model.pyreader()\n paddle.enable_static()\n if not args.no_memory_optimize:\n 
paddle.distributed.transpiler.memory_optimize(train_prog)\n valid_prog = static.Program()\n with static.program_guard(valid_prog, startup):\n paddle.disable_static()", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:137-160" + }, + "2147": { + "file_id": 166, + "content": "This code snippet prepares the model for training by setting up feeds, outputs, loss, and optimizer. It also enables memory optimization if specified by arguments.", + "type": "comment" + }, + "2148": { + "file_id": 166, + "content": " valid_model.build_input(True)\n valid_model.build_model()\n valid_feeds = valid_model.feeds()\n valid_outputs = valid_model.outputs()\n valid_loss = valid_model.loss()\n valid_pyreader = valid_model.pyreader()\n paddle.enable_static()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(startup)\n if args.resume:\n # if resume weights is given, load resume weights directly\n assert os.path.exists(args.resume), \\\n \"Given resume weight dir {} not exist.\".format(args.resume)\n def if_exist(var):\n \"\"\"if_exist\n \"\"\"\n return os.path.exists(os.path.join(args.resume, var.name))\n print('resuming ,,,,,,,,,,,,,,')\n paddle.fluid.io.load_persistables(\n exe, '', main_program=train_prog, filename=args.resume)\n else:\n # load ernie pretrain model\n init_pretraining_params(exe,\n train_config.TRAIN.ernie_pretrain_dict_path,", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:161-190" + }, + "2149": { + "file_id": 166, + "content": "The code is building the model, setting up executor and place (CPU or GPU), checking if resume weights exist to load them if necessary, and initializing pre-trained parameters for Ernie model.", + "type": "comment" + }, + "2150": { + "file_id": 166, + "content": " main_program=train_prog)\n # if not in resume mode, load pretrain weights\n # this pretrain may be only audio or video\n if args.pretrain:\n assert os.path.exists(args.pretrain), \\\n \"Given pretrain weight dir {} not exist.\".format(args.pretrain)\n if args.pretrain:\n train_model.load_test_weights_file(exe, args.pretrain, train_prog, place)\n build_strategy = paddle.static.BuildStrategy()\n build_strategy.enable_inplace = True\n compiled_train_prog = static.CompiledProgram(\n train_prog).with_data_parallel(loss_name=train_loss.name,\n build_strategy=build_strategy)\n compiled_valid_prog = static.CompiledProgram(\n valid_prog).with_data_parallel(share_vars_from=compiled_train_prog,\n build_strategy=build_strategy)\n # get reader\n bs_denominator = 1\n if (not args.no_use_pyreader) and args.use_gpu:\n dev_list = static.cuda_places()", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:191-213" + }, + "2151": { + "file_id": 166, + "content": "Loading pre-trained weights if provided, enabling inplace for faster execution and creating compiled programs with data parallelism for both training and validation programs. 
When PyReader is in use and GPU is enabled, it sets the device list to the available CUDA places.", + "type": "comment" + }, + "2152": { + "file_id": 166, + "content": " bs_denominator = len(dev_list)\n train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /\n bs_denominator)\n valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /\n bs_denominator)\n train_reader = get_reader(args.model_name.upper(), 'train', train_config)\n valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)\n exe_places = static.cuda_places() if args.use_gpu else static.cpu_places()\n train_pyreader.decorate_sample_list_generator(train_reader,\n places=exe_places)\n valid_pyreader.decorate_sample_list_generator(valid_reader,\n places=exe_places)\n # get metrics\n train_metrics = MetricsCalculator(args.model_name.upper(), 'train', train_config)\n valid_metrics = MetricsCalculator(args.model_name.upper(), 'valid', valid_config)\n # print(\"****************************valid_metrics\", valid_metrics.get())", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:214-231" + }, + "2153": { + "file_id": 166, + "content": "This code divides the training and validation batch sizes by the number of devices in the device list. It then creates train and valid readers with these batch sizes, decorates the PyReaders with the chosen places, and creates MetricsCalculator objects to track metrics for training and validation.", + "type": "comment" + }, + "2154": { + "file_id": 166, + "content": " train_fetch_list = [train_loss.name] + [x.name for x in train_outputs\n ] + [train_feeds[-1].name]\n valid_fetch_list = [valid_loss.name] + [x.name for x in valid_outputs\n ] + [valid_feeds[-1].name]\n epochs = args.epoch_num or train_model.epoch_num()\n train_with_pyreader(\n exe,\n train_prog,\n compiled_train_prog,\n train_pyreader,\n train_fetch_list,\n train_metrics,\n epochs=epochs,\n log_interval=args.log_interval,\n valid_interval=args.valid_interval,\n save_dir=args.save_dir,\n save_model_name=args.model_name,\n test_exe=compiled_valid_prog,\n test_pyreader=valid_pyreader,\n test_fetch_list=valid_fetch_list,\n test_metrics=valid_metrics)\nif __name__ == \"__main__\":\n args = parse_args()\n logger.info(args)\n if not os.path.exists(args.save_dir):\n os.makedirs(args.save_dir)\n train(args)", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/train.py:232-263" + }, + "2155": { + "file_id": 166, + "content": "The code initializes training and validation fetch lists, sets the number of epochs based on argument or model default, then trains the model using the specified executor, programs, feeds, and fetch lists. It also handles logging intervals, valid intervals, save directory, and save model name. The main function parses arguments, checks if the save directory exists, and calls the train function to execute the training process.", + "type": "comment" + }, + "2156": { + "file_id": 167, + "content": "/applications/MultimodalVideoTag/scenario_lib/utils.py", + "type": "filepath" + }, + "2157": { + "file_id": 167, + "content": "The `test_with_pyreader` and `train_with_pyreader` functions are used in a framework to execute tests with `pyreader`, evaluate metrics, log intervals, train models, handle options like testing, saving, early stopping, measure processing time, and update metrics. 
The code snippet defines model saving functions, deletes directories, implements early stopping, initializes pre-trained parameters, and uses AttrDict for getter/setter functionality.", + "type": "summary" + }, + "2158": { + "file_id": 167, + "content": "\"\"\"\nutils\n\"\"\"\n# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport traceback\nimport logging\nimport shutil\nimport numpy as np\nimport paddle\nimport paddle.static as static\nimport static as static\nlogger = logging.getLogger(__name__)\ndef test_with_pyreader(exe,\n compiled_test_prog,\n test_pyreader,\n test_fetch_list,\n test_metrics,\n log_interval=0):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:1-39" + }, + "2159": { + "file_id": 167, + "content": "The code defines a function `test_with_pyreader` which takes several parameters like `exe`, `compiled_test_prog`, etc., and appears to be part of a larger framework. It seems to execute a test with the help of `pyreader` for input data, fetch list for outputs, and metrics for evaluation. The function runs on an interval specified by `log_interval`.", + "type": "comment" + }, + "2160": { + "file_id": 167, + "content": " \"\"\"test_with_pyreader\n \"\"\"\n if not test_pyreader:\n logger.error(\"[TEST] get pyreader failed.\")\n test_metrics.reset()\n test_iter = 0\n label_all = []\n pred_all = []\n try:\n for data in test_pyreader():\n test_outs = exe.run(compiled_test_prog,\n fetch_list=test_fetch_list,\n feed=data)\n loss = np.array(test_outs[0])\n pred = np.array(test_outs[1])\n label = np.array(test_outs[-1])\n pred_all.extend(pred)\n label_all.extend(label)\n test_metrics.accumulate(loss, pred, label)\n test_iter += 1\n test_metrics.finalize_and_log_out(\"[TEST] Finish\")\n except Exception as e:\n logger.warn(\n \"[TEST] fail to execute test or calculate metrics: {}\".format(e))\n traceback.print_exc()\n metrics_dict, test_loss = test_metrics.get_computed_metrics()\n metrics_dict['label_all'] = label_all\n metrics_dict['pred_all'] = pred_all", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:40-67" + }, + "2161": { + "file_id": 167, + "content": "The code tests a PaddleVideo application, using the \"test_pyreader\" to read data and runs it through a neural network. 
It accumulates and logs test metrics, handles exceptions and provides a final result with computed metrics.", + "type": "comment" + }, + "2162": { + "file_id": 167, + "content": " return test_loss, metrics_dict\ndef train_with_pyreader(exe, train_prog, compiled_train_prog, train_pyreader,\n train_fetch_list, train_metrics, epochs=10,\n log_interval=0, valid_interval=0,\n save_dir='./', save_model_name='model',\n test_exe=None, test_pyreader=None,\n test_fetch_list=None, test_metrics=None):\n \"\"\"train_with_pyreader\n \"\"\"\n if not train_pyreader:\n logger.error(\"[TRAIN] get pyreader failed.\")\n EARLY_STOP_NUM = 20\n early_stop = EARLY_STOP_NUM\n global_iter = 0\n train_iter = 0\n iter_all = 0\n best_test_acc1 = 0\n for epoch in range(epochs):\n lr = static.global_scope().find_var(\"learning_rate\").get_tensor()\n logger.info(\n \"------- learning rate {}, learning rate counter -----\".format(\n np.array(lr)))\n if early_stop < 0:\n logger.info('Earyly Stop !!!')\n break\n train_metrics.reset()", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:68-96" + }, + "2163": { + "file_id": 167, + "content": "The function `train_with_pyreader` trains a model for the specified number of epochs and returns the test loss and metrics dictionary. It also includes options for testing and saving the model, as well as early stopping based on a defined threshold. The code initializes variables and enters a loop over the number of epochs, resetting training metrics and checking if early stopping should occur before each iteration.", + "type": "comment" + }, + "2164": { + "file_id": 167, + "content": " global_iter += train_iter\n epoch_periods = []\n for data in train_pyreader():\n try:\n cur_time = time.time()\n train_outs = exe.run(compiled_train_prog,\n fetch_list=train_fetch_list,\n feed=data)\n iter_all += 1\n period = time.time() - cur_time\n epoch_periods.append(period)\n loss = np.array(train_outs[0])\n pred = np.array(train_outs[1])\n label = np.array(train_outs[-1])\n train_metrics.accumulate(loss, pred, label)\n if log_interval > 0 and (train_iter % log_interval == 0):\n # eval here\n train_metrics.finalize_and_log_out(\n info='[TRAIN] Epoch {} iter {} everage: '.format(epoch, train_iter))\n train_iter += 1\n except Exception as e:\n logger.info(\n \"[TRAIN] Epoch {}, iter {} data training failed: {}\".", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:97-119" + }, + "2165": { + "file_id": 167, + "content": "This code is part of a training loop for a machine learning model. It keeps track of the current iteration, measures the time taken for processing each data batch, and updates loss, prediction, and label metrics. If logging interval is met, it finalizes and logs the metrics for the current epoch's iteration.", + "type": "comment" + }, + "2166": { + "file_id": 167, + "content": " format(epoch, train_iter, str(e)))\n if len(epoch_periods) < 1:\n logger.info(\n 'No iteration was executed, please check the data reader')\n sys.exit(1)\n logger.info(\n '[TRAIN] Epoch {} training finished, average time: {}'.format(\n epoch, np.mean(epoch_periods)))\n train_metrics.finalize_and_log_out( \\\n info='[TRAIN] Finished ... 
Epoch {} all iters average: '.format(epoch))\n # save models of min loss in best acc epochs\n if test_exe and valid_interval > 0 and (epoch +\n 1) % valid_interval == 0:\n # metrics_dict,loss = train_metrics.calculator.get_computed_metrics()\n loss, metrics_dict_test = test_with_pyreader(\n exe, test_exe, test_pyreader, test_fetch_list, test_metrics,\n log_interval)\n test_acc1 = metrics_dict_test['avg_acc1']\n if test_acc1 > best_test_acc1:\n best_test_acc1 = test_acc1", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:120-141" + }, + "2167": { + "file_id": 167, + "content": "The code finishes an epoch of training, logs the average time taken, and finalizes training metrics. If testing is enabled and a valid interval is set, it performs testing after each valid interval iteration and saves models with the best test accuracy.", + "type": "comment" + }, + "2168": { + "file_id": 167, + "content": " save_model(exe, train_prog, save_dir, save_model_name,\n \"_epoch{}_acc{}\".format(epoch, best_test_acc1))\n early_stop = EARLY_STOP_NUM\n else:\n early_stop -= 1\ndef save_model(exe, program, save_dir, model_name, postfix=None):\n \"\"\"save_model\n \"\"\"\n model_path = os.path.join(save_dir, model_name + postfix)\n if os.path.isdir(model_path):\n shutil.rmtree(model_path)\n # fluid.io.save_persistables(exe, model_path, main_program=program)\n save_vars = [x for x in program.list_vars() \\\n if isinstance(x, paddle.framework.Parameter)]\n static.save_vars(exe,\n dirname=model_path,\n main_program=program,\n vars=save_vars,\n filename=\"param\")\ndef save_model_persist(exe, program, save_dir, model_name, postfix=None):\n \"\"\"save_model\"\"\"\n model_path = os.path.join(save_dir, model_name + postfix)\n if os.path.isdir(model_path):", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:142-169" + }, + "2169": { + "file_id": 167, + "content": "This code snippet defines functions for saving the model at specific epochs and after training has stopped. It checks if a directory with the model name exists, deletes it if necessary, and then saves the model using either fluid.io or static methods. The save_model function takes in execution context, program, save directory, model name, and optional postfix for the file name. The save_model_persist function is similar but uses the save_model method to save the model. 
The code also includes a check to stop training if the early stopping condition is met.", + "type": "comment" + }, + "2170": { + "file_id": 167, + "content": " shutil.rmtree(model_path)\n paddle.fluid.io.save_persistables(exe,\n save_dir,\n main_program=program,\n filename=model_path)\ndef init_pretraining_params(exe,\n pretraining_params_path,\n main_program,\n use_fp16=False):\n \"\"\"\n init pretrain_params\n \"\"\"\n assert os.path.exists(pretraining_params_path\n ), \"[%s] cann't be found.\" % pretraining_params_path\n def existed_params(var):\n \"\"\"\n Load existed params\n \"\"\"\n if not isinstance(var, paddle.framework.Parameter):\n return False\n flag = os.path.exists(os.path.join(pretraining_params_path, var.name))\n return flag\n static.load_vars(exe,\n pretraining_params_path,\n main_program=main_program,\n predicate=existed_params)\n logger.info(\n \"Load pretraining parameters from {}.\".format(pretraining_params_path))", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:170-201" + }, + "2171": { + "file_id": 167, + "content": "This function initializes the pre-trained parameters for a model. It first checks if the pretraining_params_path exists, and then loads any existing variables in the main program using static.load_vars(). If var is not a Parameter instance, it will return False. Finally, it logs that the pretraining parameters were loaded from the given path.", + "type": "comment" + }, + "2172": { + "file_id": 167, + "content": "class AttrDict(dict):\n \"\"\"AttrDict\n \"\"\"\n def __getattr__(self, key):\n \"\"\"getter\n \"\"\"\n return self[key]\n def __setattr__(self, key, value):\n \"\"\"setter\n \"\"\"\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value", + "type": "code", + "location": "/applications/MultimodalVideoTag/scenario_lib/utils.py:204-218" + }, + "2173": { + "file_id": 167, + "content": "The code defines a subclass of Python's dictionary, named AttrDict. It overrides the `__getattr__` and `__setattr__` methods to provide getter and setter functionality for dictionary keys as attributes, similar to regular class attributes.", + "type": "comment" + }, + "2174": { + "file_id": 168, + "content": "/applications/MultimodalVideoTag/train.sh", + "type": "filepath" + }, + "2175": { + "file_id": 168, + "content": "This script exports CUDA device settings and FLAGS for efficient GPU usage, then executes a Python file to train an Attention LSTM Ernie model using the specified configuration file. The logs are saved at specific intervals, with pre-trained checkpoints used as well.", + "type": "summary" + }, + "2176": { + "file_id": 168, + "content": "export CUDA_VISIBLE_DEVICES=0,1\nexport FLAGS_eager_delete_tensor_gb=0.0\nexport FLAGS_sync_nccl_allreduce=1\nexport FLAGS_fast_eager_deletion_mode=1\nexport FLAGS_fraction_of_gpu_memory_to_use=0.5\nexport FLAGS_reallocate_gpu_memory_in_mb=0\nexport FLAGS_memory_fraction_of_eager_deletion=1\npython scenario_lib/train.py --model_name=AttentionLstmErnie \\\n--config=./conf/conf.txt \\\n--log_interval=20 \\\n--valid_interval=1 \\\n--save_dir=checkpoints_save_new/ \\\n--pretrain=checkpoints_save/", + "type": "code", + "location": "/applications/MultimodalVideoTag/train.sh:1-13" + }, + "2177": { + "file_id": 168, + "content": "This script exports CUDA device settings and FLAGS for efficient GPU usage, then executes a Python file to train an Attention LSTM Ernie model using the specified configuration file. 
The logs are saved at specific intervals, with pre-trained checkpoints used as well.", + "type": "comment" + }, + "2178": { + "file_id": 169, + "content": "/applications/PP-Care/Readme.md", + "type": "filepath" + }, + "2179": { + "file_id": 169, + "content": "The code initializes a pre-trained PaddleVideo model for PP-Care using TSM and ResNet50 weights, executes the application, and provides accuracy metrics while referencing relevant research papers on video understanding.", + "type": "summary" + }, + "2180": { + "file_id": 169, + "content": "# Video models for 3DMRI\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型测试](#模型测试)\n- [模型推理](#模型推理)\n- [实现细节](#实现细节)\n- [参考论文](#参考论文)\n在开始使用之前,您需要按照以下命令安装额外的依赖包:\n```bash\npython -m pip install SimpleITK\n```\n## 模型简介\n目前对于医学3D数据如MRI,并无太好的处理手段,大多数2D模型无法获得3D空间层面的特征,而常用的3D模型又需要较大的计算成本。而同时,3D医学数据与常见的视频数据有一定相似之处,我们尝试了通过PaddleVideo中的常见模型解决医学3DMRI数据的分类问题,获得了较好的结果。目前支持PP-TSN、PP-TSM、Slowfast和Timesformer对3DMRI的直接训练。\n## 数据准备\n数据集包括帕金森患者(PD)与正常(Con)两种类型共378个case,训练集:测试集=300:78,使用数据均为公开数据集,包括*neurocon*, *taowu*, *PPMI*和*OASIS-1*(经过选取),并经过一定格式转换,数据最后的格式均为*name.nii*或*name.nii.gz*,路径与label信息通过txt文件保存,数据集可以通过百度网盘下载:[下载链接](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug)\n- 数据集label格式\n```\n{\n \"0\": \"Con\",\n \"1\": \"PD\"\n}\n```\n- 数据集信息文件格式\n```\n{\n path1 label1\n path2 label2\n ...\n}\n```\n- 数据保存格式\n```\n{\n |-- datasets\n |-- neurocon\n |-- taowu\n |-- PPMI\n |-- OASIS-1\n}\n```\n## 模型训练\n#### 下载并添加预训练模型\n1. 对于PP-TSN与PP-TSM,除了可以使用ImageNet1000上训练好的预训练模型(见[PP-TSN预训练模型](../../../docs/zh-CN/model_zoo/recognition/pp-tsn.md)与[PP-", + "type": "code", + "location": "/applications/PP-Care/Readme.md:1-55" + }, + "2181": { + "file_id": 169, + "content": "Introduction to video models for 3DMRI, data preparation, model training, testing, inference details, and references. Install SimpleITK dependency. Uses PaddleVideo models for 3DMRI classification. Dataset includes PD and Con cases; train/test split is 300:78. Format as *.nii or *.nii.gz. Downloaded from a Baidu link.", + "type": "comment" + }, + "2182": { + "file_id": 169, + "content": "TSM预训练模型](../../../docs/zh-CN/model_zoo/recognition/pp-tsm.md)),也可以使用在MRI数据集上预训练的ResNet50权重座位Backbone初始化参数,通过百度网盘下载: [下载链接](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug)。对于Slowfast与TimeSformer,目前只支持是使用自然数据集的预训练模型,见[Slowfast预训练模型](../../../docs/zh-CN/model_zoo/recognition/slowfast.md)与[Timesformer预训练模型](../../../docs/zh-CN/model_zoo/recognition/timesformer.md)\n2. 打开`PaddleVideo/applications/PP-Care/configs/XXX.yaml`,将下载好的权重路径填写到下方`pretrained:`之后,以pptsn_MRI为例\n ```yaml\n MODEL:\n framework: \"RecognizerMRI\"\n backbone:\n name: \"ResNetTSN_MRI\"\n pretrained: 将路径填写到此处\n ```\n#### 开始训练\n- 训练使用显卡数量与输出路径等信息均可以选择,以PP-TSN_MRI的4卡训练为例,训练启动命令如下\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_pptsn_MRI main.py --validate -c applications/PP-Care/configs/pptsn_MRI.yaml\n ```\n## 模型测试\n由于各模型均存在随机采样部分,且采样方式存在不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,以PP-TSN_MRI为例,命令如下:\n```bash\npython3.7 -B -m paddle.distributed.laun", + "type": "code", + "location": "/applications/PP-Care/Readme.md:55-81" + }, + "2183": { + "file_id": 169, + "content": "This code provides instructions on how to initialize a pre-trained model for PaddleVideo's PP-Care application. It mentions downloading a pre-trained TSM model, initializing the backbone with ResNet50 weights trained on MRI data, and filling in the weight path in the YAML configuration file. 
The code also explains how to train and test the model using specific commands for PP-TSN_MRI as an example.", + "type": "comment" + }, + "2184": { + "file_id": 169, + "content": "ch --gpus=\"0,1,2,3\" --log_dir=log_pptsn_MRI main.py --test -c applications/PP-Care/configs/pptsn_MRI.yaml -w \"output/ppTSN_MRI/ppTSN_MRI_best.pdparams\"\n```\n当测试配置采用.yaml中参数时,在3DMRI数据的validation数据集上的测试指标如下:\n| backbone | head | Acc |\n| :----------------: | :----------: | :---: |\n| ResNet50 | PP-TSN | 91.07 |\n| ResNet50 | PP-TSM | 90.83 |\n| 3DResNet50 | Slowfast | 91.07 |\n| Vision Transformer | Timesformer | 88.33 |\n训练好的模型可以通过百度网盘下载:[下载链接](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug)\n## 模型优化\n在实际使用中,可以尝试模型优化策略\n- 可以根据MRI数据分布,调整采样率\n- 本模型目前未加入过多的数据预处理策略,针对不同数据特性,在本模型基础上加入一定的预处理手段可能会使结果继续提升\n- 由于数据量与任务难度限制,本模型目前在准确率上的表现与3DResNet并无显著区别,但对于时间与空间的需求均远小于3D模型\n## 参考论文\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han", + "type": "code", + "location": "/applications/PP-Care/Readme.md:81-106" + }, + "2185": { + "file_id": 169, + "content": "The given code executes a PaddleVideo application, PP-Care, using specific configurations and trained model weights. It tests the ResNet50 backbone with PP-TSN and PP-TSM heads on 3DMRI validation data and reports accuracy metrics. The optimized models can be downloaded from a Baidu disk link provided.", + "type": "comment" + }, + "2186": { + "file_id": 169, + "content": "- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean\n- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al.\n- [A Multigrid Method for Efficiently Training Video Models](https://arxiv.org/abs/1912.00998), Chao-Yuan Wu, Ross Girshick, et al.\n- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani", + "type": "code", + "location": "/applications/PP-Care/Readme.md:107-110" + }, + "2187": { + "file_id": 169, + "content": "This code snippet contains the references for various important research papers related to video understanding using neural networks. 
These papers cover topics like knowledge distillation, slow-fast networks, and efficient training methods for video models.", + "type": "comment" + }, + "2188": { + "file_id": 170, + "content": "/applications/PPHuman/README.md", + "type": "filepath" + }, + "2189": { + "file_id": 170, + "content": "This code converts PaddleVideo's JSON files to training data, exports a model for PP-Human, and organizes it in directories suitable for behavior recognition inference.", + "type": "summary" + }, + "2190": { + "file_id": 170, + "content": "# PP-Human 行为识别模型\n实时行人分析工具[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)中集成了基于骨骼点的行为识别模块。本文档介绍如何基于[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo/),完成行为识别模型的训练流程。\n## 行为识别模型训练\n目前行为识别模型使用的是[ST-GCN](https://arxiv.org/abs/1801.07455),并在[PaddleVideo训练流程](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/stgcn.md)的基础上修改适配,完成模型训练。\n### 准备训练数据\nSTGCN是一个基于骨骼点坐标序列进行预测的模型。在PaddleVideo中,训练数据为采用`.npy`格式存储的`Numpy`数据,标签则可以是`.npy`或`.pkl`格式存储的文件。对于序列数据的维度要求为`(N,C,T,V,M)`。\n以我们在PPhuman中的模型为例,其中具体说明如下:\n| 维度 | 大小 | 说明 |\n| ---- | ---- | ---------- |\n| N | 不定 | 数据集序列个数 |\n| C | 2 | 关键点坐标维度,即(x, y) |\n| T | 50 | 动作序列的时序维度(即持续帧数)|\n| V | 17 | 每个人物关键点的个数,这里我们使用了`COCO`数据集的定义,具体可见[这里](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/docs/tutorials/PrepareKeypointDataSet_cn.md#COCO%E6%95%B0%E6%8D%AE%E9%9B%86) |\n| M | 1 | 人物个数,这里我们每个动作序列只针对单人预测 |\n#### 1. 获取序列的骨骼点坐标\n对于一个待标注的序列(这里序列指一个动作片段,可以是视频或有顺序的图片集合)。可以通过模型预测或人工标注的方式获取骨骼点(也称为关键点)坐标。", + "type": "code", + "location": "/applications/PPHuman/README.md:1-21" + }, + "2191": { + "file_id": 170, + "content": "Training behavior recognition model using ST-GCN on PaddleVideo.\nPrepare training data in Numpy format with dimensions (N,C,T,V,M).", + "type": "comment" + }, + "2192": { + "file_id": 170, + "content": "- 模型预测:可以直接选用[PaddleDetection KeyPoint模型系列](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/keypoint) 模型库中的模型,并根据`3、训练与测试 - 部署预测 - 检测+keypoint top-down模型联合部署`中的步骤获取目标序列的17个关键点坐标。\n- 人工标注:若对关键点的数量或是定义有其他需求,也可以直接人工标注各个关键点的坐标位置,注意对于被遮挡或较难标注的点,仍需要标注一个大致坐标,否则后续网络学习过程会受到影响。\n在完成骨骼点坐标的获取后,建议根据各人物的检测框进行归一化处理,以消除人物位置、尺度的差异给网络带来的收敛难度,这一步可以参考[这里](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/pphuman/pipe_utils.py#L352-L363)。\n#### 2. 统一序列的时序长度\n由于实际数据中每个动作的长度不一,首先需要根据您的数据和实际场景预定时序长度(在PP-Human中我们采用50帧为一个动作序列),并对数据做以下处理:\n- 实际长度超过预定长度的数据,随机截取一个50帧的片段\n- 实际长度不足预定长度的数据:补0,直到满足50帧\n- 恰好等于预定长度的数据: 无需处理\n注意:在这一步完成后,请严格确认处理后的数据仍然包含了一个完整的行为动作,不会产生预测上的歧义,建议通过可视化数据的方式进行确认。\n#### 3. 保存为PaddleVideo可用的文件格式\n在经过前两步处理后,我们得到了每个人物动作片段的标注,此时我们已有一个列表`all_kpts`,这个列表中包含多个关键点序列片段,其中每一个片段形状为(T, V, C) (在我们的例子中即(50, 17, 2)), 下面进一步将其转化为PaddleVideo可用的格式。\n- 调整维度顺序: 可通过`np.transpose`和`np.expand_dims`将每一个片段的维度转化为(C, T, V, M)的格式。\n- 将所有片段组合并保存为一个文件\n注意:这里的`class_id`是`int`类型,与其他分类任务类似。例如`0:摔倒, 1:其他`。\n至此,我们得到了可用的训练数据(`.npy`)和对应的标注文件(`.pkl`)。", + "type": "code", + "location": "/applications/PPHuman/README.md:22-42" + }, + "2193": { + "file_id": 170, + "content": "The code describes the process of preparing data for PP-Human, a human action detection model. 
It involves obtaining key points from pre-trained models or manual annotations, normalizing the coordinates, setting a uniform sequence length, and saving the data in PaddleVideo compatible format.", + "type": "comment" + }, + "2194": { + "file_id": 170, + "content": "#### 示例:基于UR Fall Detection Dataset的摔倒数据处理\n[UR Fall Detection Dataset](http://fenix.univ.rzeszow.pl/~mkepski/ds/uf.html)是一个包含了不同摄像机视角及不同传感器下的摔倒检测数据集。数据集本身并不包含关键点坐标标注,在这里我们使用平视视角(camera 0)的RGB图像数据,介绍如何依照上面展示的步骤完成数据准备工作。\n(1)使用[PaddleDetection关键点模型](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/keypoint)完成关键点坐标的检测\n```bash\n# current path is under root of PaddleDetection\n# Step 1: download pretrained inference models.\nwget https://bj.bcebos.com/v1/paddledet/models/pipeline/mot_ppyoloe_l_36e_pipeline.zip\nwget https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip\nunzip -d output_inference/ mot_ppyoloe_l_36e_pipeline.zip\nunzip -d output_inference/ dark_hrnet_w32_256x192.zip\n# Step 2: Get the keypoint coordinarys\n# if your data is image sequence\npython deploy/python/det_keypoint_unite_infer.py --det_model_dir=output_inference/mot_ppyoloe_l_36e_pipeline/ --keypoint_model_dir=output_inference/dark_hrnet_w32_256x192 --image_dir={your image directory path} --device=GPU --save_res=True", + "type": "code", + "location": "/applications/PPHuman/README.md:44-60" + }, + "2195": { + "file_id": 170, + "content": "This code is downloading pretrained models for keypoint detection using PaddleDetection and then using them to get the keypoint coordinates for an image sequence.", + "type": "comment" + }, + "2196": { + "file_id": 170, + "content": "# if your data is video\npython deploy/python/det_keypoint_unite_infer.py --det_model_dir=output_inference/mot_ppyoloe_l_36e_pipeline/ --keypoint_model_dir=output_inference/dark_hrnet_w32_256x192 --video_file={your video file path} --device=GPU --save_res=True\n```\n这样我们会得到一个`det_keypoint_unite_image_results.json`的检测结果文件。内容的具体含义请见[这里](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/python/det_keypoint_unite_infer.py#L108)。\n这里我们需要对UR Fall中的每一段数据执行上面介绍的步骤,在每一段执行完成后及时将检测结果文件妥善保存到一个文件夹中。\n```bash\nmkdir {root of PaddleVideo}/applications/PPHuman/datasets/annotations\nmv det_keypoint_unite_image_results.json {root of PaddleVideo}/applications/PPHuman/datasets/annotations/det_keypoint_unite_image_results_{video_id}_{camera_id}.json\n```\n(2)将关键点坐标转化为训练数据\n在完成上述步骤后,我们得到的骨骼点数据形式如下:\n```\nannotations/\n├── det_keypoint_unite_image_results_fall-01-cam0-rgb.json\n├── det_keypoint_unite_image_results_fall-02-cam0-rgb.json\n├── det_keypoint_unite_image_results_fall-03-cam0-rgb.json\n├── det_keypoint_unite_image_results_fall-04-cam0-rgb.json", + "type": "code", + "location": "/applications/PPHuman/README.md:62-83" + }, + "2197": { + "file_id": 170, + "content": "The provided code is a command line instruction for running the PaddleVideo's PPHuman application on video data. It uses pre-trained models to detect human keypoints in the video, resulting in a `det_keypoint_unite_image_results.json` file containing the detection results. These steps are repeated for each segment of UR Fall data. 
The JSON files are then saved into a specific directory structure with a naming convention based on video and camera IDs.", + "type": "comment" + }, + "2198": { + "file_id": 170, + "content": " ...\n├── det_keypoint_unite_image_results_fall-28-cam0-rgb.json\n├── det_keypoint_unite_image_results_fall-29-cam0-rgb.json\n└── det_keypoint_unite_image_results_fall-30-cam0-rgb.json\n```\n这里使用我们提供的脚本直接将数据转化为训练数据, 得到数据文件`train_data.npy`, 标签文件`train_label.pkl`。该脚本执行的内容包括解析json文件内容、前述步骤中介绍的整理训练数据及保存数据文件。\n```bash\n# current path is {root of PaddleVideo}/applications/PPHuman/datasets/\npython prepare_dataset.py\n```\n几点说明:\n- UR Fall的动作大多是100帧左右长度对应一个完整动作,个别视频包含一些无关动作,可以手工去除,也可以裁剪作为负样本\n- 统一将数据整理为100帧,再抽取为50帧,保证动作完整性\n- 上述包含摔倒的动作是正样本,在实际训练中也需要一些其他的动作或正常站立等作为负样本,步骤同上,但注意label的类型取1。\n这里我们提供了我们处理好的更全面的[数据](https://bj.bcebos.com/v1/paddledet/data/PPhuman/fall_data.zip),包括其他场景中的摔倒及非摔倒的动作场景。\n### 训练与测试\n在PaddleVideo中,使用以下命令即可开始训练:\n```bash\n# current path is under root of PaddleVideo\npython main.py -c applications/PPHuman/configs/stgcn_pphuman.yaml\n# 由于整个任务可能过拟合,建议同时开启验证以保存最佳模型\npython main.py --validate -c applications/PPHuman/configs/stgcn_pphuman.yaml\n```\n在训练完成后,采用以下命令进行预测:\n```bash\npython main.py --test -c applications/PPHuman/configs/stgcn_pphuman.yaml -w output/STGCN/STGCN_best.pdparams", + "type": "code", + "location": "/applications/PPHuman/README.md:84-114" + }, + "2199": { + "file_id": 170, + "content": "Code snippet represents a list of json files in a PaddleVideo application called \"PPHuman\". These JSON files contain image results for different actions. The code suggests using a provided script to convert these data into training data, resulting in two new files: \"train_data.npy\" and \"train_label.pkl\". It mentions that some data preparation steps include parsing the JSON content and organizing the training data. There is a link for more comprehensive data available for download. The code also provides instructions on how to train and test the model using PaddleVideo's main script with specific configurations.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/22.json b/docs/data/22.json new file mode 100644 index 000000000..469144acc --- /dev/null +++ b/docs/data/22.json @@ -0,0 +1,544 @@ +{ + "2200": { + "file_id": 170, + "content": "```\n### 导出模型推理\n- 在PaddleVideo中,通过以下命令实现模型的导出,得到模型结构文件`STGCN.pdmodel`和模型权重文件`STGCN.pdiparams`,并增加配置文件:\n```bash\n# current path is under root of PaddleVideo\npython tools/export_model.py -c applications/PPHuman/configs/stgcn_pphuman.yaml \\\n -p output/STGCN/STGCN_best.pdparams \\\n -o output_inference/STGCN\ncp applications/PPHuman/configs/infer_cfg.yml output_inference/STGCN\n# 重命名模型文件,适配PP-Human的调用\ncd output_inference/STGCN\nmv STGCN.pdiparams model.pdiparams\nmv STGCN.pdiparams.info model.pdiparams.info\nmv STGCN.pdmodel model.pdmodel\n```\n完成后的导出模型目录结构如下:\n```\nSTGCN\n├── infer_cfg.yml\n├── model.pdiparams\n├── model.pdiparams.info\n├── model.pdmodel\n```\n至此,就可以使用[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)进行行为识别的推理了。", + "type": "code", + "location": "/applications/PPHuman/README.md:115-143" + }, + "2201": { + "file_id": 170, + "content": "The provided code demonstrates the process of exporting a model in PaddleVideo for use in PP-Human. 
It creates the necessary files and renames them according to PP-Human's requirements, resulting in a structured directory that can be used for behavior recognition inference.", + "type": "comment" + }, + "2202": { + "file_id": 171, + "content": "/applications/PPHuman/datasets/prepare_dataset.py", + "type": "filepath" + }, + "2203": { + "file_id": 171, + "content": "The script prepares PaddleVideo's UR FALL dataset keypoints, normalizing them and handling inconsistencies for training. It also prepares a dataset for PPHuman, reading annotations, extracting data, and saving for training.", + "type": "summary" + }, + "2204": { + "file_id": 171, + "content": "import os\nimport json\nimport numpy as np\nimport pickle\n\"\"\"\n This python script is used to convert keypoint results of UR FALL dataset\n for training by PaddleVideo\n\"\"\"\ndef self_norm(kpt, bbox):\n # kpt: (2, T, 17, 1), bbox: (T, 4)\n tl = bbox[:, 0:2]\n wh = bbox[:, 2:]\n tl = np.expand_dims(np.transpose(tl, (1, 0)), (2, 3))\n wh = np.expand_dims(np.transpose(wh, (1, 0)), (2, 3))\n res = (kpt - tl) / wh\n res *= np.expand_dims(np.array([[384.], [512.]]), (2, 3))\n return res\ndef convert_to_ppvideo(all_kpts, all_scores, all_bbox):\n # shape of all_kpts is (T, 17, 2)\n keypoint = np.expand_dims(np.transpose(all_kpts, [2, 0, 1]),\n -1) #(2, T, 17, 1)\n keypoint = self_norm(keypoint, all_bbox)\n scores = all_scores\n if keypoint.shape[1] > 100:\n frame_start = (keypoint.shape[1] - 100) // 2\n keypoint = keypoint[:, frame_start:frame_start + 100:2, :, :]\n scores = all_scores[frame_start:frame_start + 100:2, :, :]\n elif keypoint.shape[1] < 100:", + "type": "code", + "location": "/applications/PPHuman/datasets/prepare_dataset.py:1-34" + }, + "2205": { + "file_id": 171, + "content": "This script converts keypoint results of UR FALL dataset into a format suitable for training by PaddleVideo. It normalizes keypoints using bounding boxes and adjusts the shape to be compatible with the PaddleVideo framework. The function also handles cases where the number of frames is more or less than 100.", + "type": "comment" + }, + "2206": { + "file_id": 171, + "content": " keypoint = np.concatenate([\n keypoint,\n np.zeros((2, 100 - keypoint.shape[1], 17, 1), dtype=keypoint.dtype)\n ], 1)[:, ::2, :, :]\n scores = np.concatenate([\n all_scores,\n np.zeros((100 - all_scores.shape[0], 17, 1), dtype=keypoint.dtype)\n ], 0)[::2, :, :]\n else:\n keypoint = keypoint[:, ::2, :, :]\n scores = scores[::2, :, :]\n return keypoint, scores\ndef decode_json_path(json_path):\n content = json.load(open(json_path))\n content = sorted(content, key=lambda x: x[0])\n all_kpts = []\n all_score = []\n all_bbox = []\n for annos in content:\n bboxes = annos[1]\n kpts = annos[2][0]\n frame_id = annos[0]\n if len(bboxes) != 1:\n continue\n kpt_res = []\n kpt_score = []\n for kpt in kpts[0]:\n x, y, score = kpt\n kpt_res.append([x, y])\n kpt_score.append([score])\n all_kpts.append(np.array(kpt_res))\n all_score.append(np.array(kpt_score))", + "type": "code", + "location": "/applications/PPHuman/datasets/prepare_dataset.py:35-69" + }, + "2207": { + "file_id": 171, + "content": "The function `prepare_dataset` receives keypoint and scores as inputs. If the length of either is not divisible by 2, it pads them with zeros to maintain consistency. The else block simply takes every other value in both arrays. 
The `decode_json_path` function loads a JSON file, sorts its contents, extracts bounding boxes, keypoints, and scores from each entry, ignoring cases where there is more than one bounding box, and appends the processed data to separate lists for further processing.", + "type": "comment" + }, + "2208": { + "file_id": 171, + "content": " all_bbox.append([\n bboxes[0][0], bboxes[0][1], bboxes[0][2] - bboxes[0][0],\n bboxes[0][3] - bboxes[0][1]\n ])\n all_kpts_np = np.array(all_kpts)\n all_score_np = np.array(all_score)\n all_bbox_np = np.array(all_bbox)\n video_anno, scores = convert_to_ppvideo(all_kpts_np, all_score_np,\n all_bbox_np)\n return video_anno, scores\nif __name__ == '__main__':\n all_keypoints = []\n all_labels = [[], []]\n all_scores = []\n for i, path in enumerate(os.listdir(\"annotations\")):\n video_anno, score = decode_json_path(os.path.join(\"annotations\", path))\n all_keypoints.append(video_anno)\n all_labels[0].append(str(i))\n all_labels[1].append(0) #label 0 means falling\n all_scores.append(score)\n all_data = np.stack(all_keypoints, 0)\n all_score_data = np.stack(all_scores, 0)\n np.save(f\"train_data.npy\", all_data)\n pickle.dump(all_labels, open(f\"train_label.pkl\", \"wb\"))\n np.save(\"kptscore_data.npy\", all_score_data)", + "type": "code", + "location": "/applications/PPHuman/datasets/prepare_dataset.py:70-98" + }, + "2209": { + "file_id": 171, + "content": "This code prepares a dataset for PaddleVideo's PPHuman application. It reads annotations from \"annotations\" folder, extracts keypoints, labels, and scores, then saves them into numpy arrays and pickle file for training.", + "type": "comment" + }, + "2210": { + "file_id": 172, + "content": "/applications/README.md", + "type": "filepath" + }, + "2211": { + "file_id": 172, + "content": "This code lists various application cases in PaddleVideo, including football action detection, basketball action detection, table tennis action recognition, figure skating action identification, video tagging, multimodal video classification, video quality assessment, 3DMRI medical image recognition, video interactive segmentation tool, UAV detection, abnormal behavior detection, and human analysis scenario action recognition.", + "type": "summary" + }, + "2212": { + "file_id": 172, + "content": "# 应用案例\n## 1. 
概览\n| Applications | Descriptions |\n| :--------------- | :-------- |\n| [FootballAction](./FootballAction) | 足球动作检测方案|\n| [BasketballAction](./BasketballAction) | 篮球动作检测方案 |\n| [TableTennis](./TableTennis) | 乒乓球动作识别方案|\n| [FigureSkating](./FigureSkating) | 花样滑冰动作识别方案|\n| [VideoTag](./VideoTag) | 3000类大规模视频分类方案 |\n| [MultimodalVideoTag](./MultimodalVideoTag) | 多模态视频分类方案|\n| [VideoQualityAssessment](.s/VideoQualityAssessment) | 视频质量评估方案|\n| [PP-Care](./PP-Care) | 3DMRI医疗图像识别方案 |\n| [EIVideo](./EIVideo) | 视频交互式分割工具|\n| [Anti-UAV](./Anti-UAV) |无人机检测方案|\n| [AbnormalActionDetection](./AbnormalActionDetection) |异常行为检测方案|\n| [PP-Human](./PPHuman) | 行人分析场景动作识别方案 |", + "type": "code", + "location": "/applications/README.md:1-18" + }, + "2213": { + "file_id": 172, + "content": "This code lists various application cases in PaddleVideo, including football action detection, basketball action detection, table tennis action recognition, figure skating action identification, video tagging, multimodal video classification, video quality assessment, 3DMRI medical image recognition, video interactive segmentation tool, UAV detection, abnormal behavior detection, and human analysis scenario action recognition.", + "type": "comment" + }, + "2214": { + "file_id": 173, + "content": "/applications/T2VLAD/README.md", + "type": "filepath" + }, + "2215": { + "file_id": 173, + "content": "This code introduces T2VLAD, a text video retrieval model by Baidu. It provides data preparation, training, and testing steps on MSR-VTT dataset, along with performance metrics and checkpoint information in PaddleVideo.", + "type": "summary" + }, + "2216": { + "file_id": 173, + "content": "[English](./README_en.md) | 简体中文\n# T2VLAD: 基于局部全局对齐的文本视频检索\n---\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型测试](#模型测试)\n- [参考论文](#参考论文)\n在开始使用之前,您需要按照以下命令安装额外的依赖包:\n```bash\npython -m pip install paddlenlp\n```\n同时确保paddle版本为2.2.2。\n## 模型简介\nT2VLAD是百度在CVPR2021提出的文本视频检索模型。文本视频检索是一项具有挑战的任务,旨在基于自然语言处理描述搜索相关视频内容。这个问题的关键是在联合嵌入空间中测量文本-视频的相似性。T2VLAD设计了一种有效的全局-局部对齐方法,在三个标准的文本视频检索基准上取得了一致的改进,并以明显的优势超越了最先进的技术。\n
\n## 数据准备\nMSR-VTT数据下载及准备请参考 [MSR-VTT数据准备](../../docs/zh-CN/dataset/msrvtt.md)\n## 模型训练\n### MSR-VTT数据集训练\n下载数据并添加到 `data/MSRVTT` 文件夹下。\n#### 开始训练\n- 训练启动命令如下:\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 train.py --config ./configs/msrvtt_transformers.json\n```\nT2VLAD在训练时使用了Ranger优化器,这里我们暂时没有支持Ranger优化器到的实现,目前可以使用AdamW优化器来完成训练。\n## 模型测试\n- 对下游任务:文本-视频检索,在MSR-VTT数据集上评估性能,评估脚本启动方式如下:\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 test.py --config ./configs/msrvtt_transformers.json --resume ./T2VLAD_msrvtt.pdparams\n```\nMSR-VTT数据集测试精度:", + "type": "code", + "location": "/applications/T2VLAD/README.md:1-60" + }, + "2217": { + "file_id": 173, + "content": "This code provides an introduction to the T2VLAD model, a text video retrieval model proposed by Baidu at CVPR 2021. It explains how to prepare data, train the model, and test it on the MSR-VTT dataset. The code also includes instructions for installing dependencies and running the necessary commands.", + "type": "comment" + }, + "2218": { + "file_id": 173, + "content": "Text $\\rightarrow$ Video\n| R@1 | R@5 | R@10 | Median R | checkpoints |\n| :--: | :--: | :--: | :------: | :----------------------------------------------------------: |\n| 29.5 | 59.0 | 70.1 | 4 | [T2VLAD.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/T2VLAD_msrvtt.pdparams) |\nVideo $\\rightarrow$ Text\n| R@1 | R@5 | R@10 | Median R |\n| :--: | :--: | :--: | :------: |\n| 26.1 | 54.7 | 68.1 | 4 |\n## 参考论文\n- [T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\n](https://arxiv.org/pdf/2104.10054.pdf), Xiaohan Wang, Linchao Zhu, Yi Yang", + "type": "code", + "location": "/applications/T2VLAD/README.md:61-75" + }, + "2219": { + "file_id": 173, + "content": "This code provides performance metrics and checkpoint information for a T2VLAD model in PaddleVideo. The Text-to-Video R@1, R@5, R@10, and Median R values are shown, along with the corresponding checkpoints' links. Video-to-Text R@1, R@5, R@10, and Median R values are also given. The reference paper for T2VLAD is provided.", + "type": "comment" + }, + "2220": { + "file_id": 174, + "content": "/applications/T2VLAD/README_en.md", + "type": "filepath" + }, + "2221": { + "file_id": 174, + "content": "The code installs PaddleNLP, trains T2VLAD on MSRVTT dataset, and demonstrates retrieval performance with metrics R@1, R@5, R@10, and median rank at 26.1, 54.7, 68.1, and 4 respectively. Based on the paper \"T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\" by Xiaohan Wang et al.", + "type": "summary" + }, + "2222": { + "file_id": 174, + "content": "[简体中文](./README.md) | English\n# T2VLAD\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install paddlenlp\n```\n## Introduction\nT2VLAD is proposed by Baidu in CVPR2021 for text-video retrieval. Text-video retrieval is a challenging task that aims to search relevant video contents based on natural language descriptions. The key to this problem is to measure text- video similarities in a joint embedding space. T2VLAD designs an efficient global-local alignment method. This model achieves consistent improvements on three standard text-video retrieval benchmarks and outperform the state- of-the-art by a clear margin.\n
\n## Data\nPlease refer to MSR-VTT data download and preparation doc [MSR-VTT data](../../docs/en/dataset/msrvtt.md)\n## Train\n### Train on MSR-VTT", + "type": "code", + "location": "/applications/T2VLAD/README_en.md:1-31" + }, + "2223": { + "file_id": 174, + "content": "Code snippet for installing additional dependencies:\n```bash\npython -m pip install paddlenlp\n```\nThis code is for installing Paddlepaddle Natural Language Processing (NLP) library, which is a required dependency for running T2VLAD application.", + "type": "comment" + }, + "2224": { + "file_id": 174, + "content": "Download data then move to `data/MSRVTT` folder.\n#### Start training\n- Train T2VLAD on MSRVTT scripts:\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 train.py --config ./configs/msrvtt_transformers.json\n```\nT2VLAD uses the Ranger optimizer during training. We haven't supported the implementation of Ranger optimizer, for now, the AdamW optimizer can be used to complete the training.\n## Test\n- Evaluation performs on downstream task, i.e. text-video clip retrieval on MSR-VTT dataset, test accuracy can be obtained using scripts:\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 test.py --config ./configs/msrvtt_transformers.json --resume ./T2VLAD_msrvtt.pdparams\n```\nAccuracy on MSR-VTT:\nText $\\rightarrow$ Video\n| R@1 | R@5 | R@10 | Median R | checkpoints |\n| :--: | :--: | :--: | :------: | :----------------------------------------------------------: |\n| 29.5 | 59.0 | 70.1 | 4 | [T2VLAD.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/T2VLAD_msrvtt.pdparams) |", + "type": "code", + "location": "/applications/T2VLAD/README_en.md:32-59" + }, + "2225": { + "file_id": 174, + "content": "This code outlines the steps to train and test T2VLAD on the MSRVTT dataset. It requires downloading data, moving it to a specified folder, and executing training and testing scripts with appropriate configuration files. The Ranger optimizer is not currently supported, so AdamW should be used instead. Test accuracy results are provided for text-to-video clip retrieval on the MSRVTT dataset.", + "type": "comment" + }, + "2226": { + "file_id": 174, + "content": "Video $\\rightarrow$ Text\n| R@1 | R@5 | R@10 | Median R |\n| :--: | :--: | :--: | :------: |\n| 26.1 | 54.7 | 68.1 | 4 |\n## Reference\n- [T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\n](https://arxiv.org/pdf/2104.10054.pdf), Xiaohan Wang, Linchao Zhu, Yi Yang", + "type": "code", + "location": "/applications/T2VLAD/README_en.md:61-69" + }, + "2227": { + "file_id": 174, + "content": "This code shows the retrieval performance of a Text-Video Retrieval model, with metrics R@1, R@5, R@10, and median rank at 26.1, 54.7, 68.1, and 4 respectively. The reference is the paper \"T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\" by Xiaohan Wang et al.", + "type": "comment" + }, + "2228": { + "file_id": 175, + "content": "/applications/T2VLAD/base/__init__.py", + "type": "filepath" + }, + "2229": { + "file_id": 175, + "content": "This code is importing two modules, 'base_model' and 'base_trainer', from the current package's subfolders. 
These modules likely contain the base model and trainer classes for further use in the application.", + "type": "summary" + }, + "2230": { + "file_id": 175, + "content": "from .base_model import *\nfrom .base_trainer import *", + "type": "code", + "location": "/applications/T2VLAD/base/__init__.py:1-2" + }, + "2231": { + "file_id": 175, + "content": "This code is importing two modules, 'base_model' and 'base_trainer', from the current package's subfolders. These modules likely contain the base model and trainer classes for further use in the application.", + "type": "comment" + }, + "2232": { + "file_id": 176, + "content": "/applications/T2VLAD/base/base_dataset.py", + "type": "filepath" + }, + "2233": { + "file_id": 176, + "content": "The `BaseDataset` class serves as a base for creating video feature datasets, handling missing values and encoding text while supporting efficient dataset partitioning.", + "type": "summary" + }, + "2234": { + "file_id": 176, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport time\nimport json\nimport random\nimport paddle\nimport inspect\nimport logging\nimport functools\nimport data_loader\nimport numpy as np\nimport pickle as pkl\nfrom pathlib import Path\nfrom abc import abstractmethod\nfrom typing import Dict, Union\nfrom numpy.random import randint\nfrom typeguard import typechecked\nfrom collections import OrderedDict\nfrom zsvision.zs_utils import memcache\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"{e}, [paddlenlp] package and it's dependencies is required for T2VLAD.\"", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:1-36" + }, + "2235": { + "file_id": 176, + "content": "Copyright and license information, importing necessary libraries, and type guarding.", + "type": "comment" + }, + "2236": { + "file_id": 176, + "content": " )\nfrom utils import ensure_tensor, expert_tensor_storage\n# For SLURM usage, buffering makes it difficult to see events as they happen, so we set\n# the global print statement to enforce flushing\nprint = functools.partial(print, flush=True)\nclass BaseDataset(paddle.io.Dataset):\n @staticmethod\n @abstractmethod\n @typechecked\n def dataset_paths() -> Dict[str, Union[Path, str]]:\n \"\"\"Generates a datastructure containing all the paths required to load features\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def sanity_checks(self):\n \"\"\"Run sanity checks on loaded data\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def load_features(self):\n \"\"\"Load features from disk\n \"\"\"\n raise NotImplementedError\n @typechecked\n def __init__(\n self,\n data_dir: Path,\n eval_only: bool,\n use_zeros_for_missing: bool,\n text_agg: str,\n text_feat: str,\n split_name: str,\n cls_partition: str,\n root_feat_folder: str,", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:37-76" + }, + "2237": { + "file_id": 176, + "content": "This code defines a base class `BaseDataset` for creating and loading video features dataset. 
It contains methods for generating required paths, performing sanity checks on loaded data, and loading features from disk. The class is abstract and requires subclass implementation of these methods. It also includes utility functions and settings like `dataset_paths`, `sanity_checks`, and `load_features`.", + "type": "comment" + }, + "2238": { + "file_id": 176, + "content": " text_dim: int,\n num_test_captions: int,\n restrict_train_captions: int,\n max_tokens: Dict[str, int],\n logger: logging.Logger,\n raw_input_dims: Dict[str, int],\n feat_aggregation: Dict[str, Dict],\n ):\n self.eval_only = eval_only\n self.logger = logger\n self.text_feat = text_feat\n self.data_dir = data_dir\n self.text_dim = text_dim\n self.restrict_train_captions = restrict_train_captions\n self.max_tokens = max_tokens\n self.cls_partition = cls_partition\n self.num_test_captions = num_test_captions\n self.feat_aggregation = feat_aggregation\n self.root_feat = data_dir / root_feat_folder\n self.experts = set(raw_input_dims.keys())\n self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n # This attributes can be overloaded by different datasets, so it must be set\n # before the `load_features() method call`\n self.restrict_test_captions = None\n self.text_features = None", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:77-101" + }, + "2239": { + "file_id": 176, + "content": "The code above defines a class for a dataset, with various parameters such as text_dim, num_test_captions, and max_tokens. It sets the necessary attributes including logger, text_feat, data_dir, and experts. The class also initializes the tokenizer and sets the restrict_test_captions and text_features attributes before calling load_features() method.", + "type": "comment" + }, + "2240": { + "file_id": 176, + "content": " self.label_features = None\n self.video_labels = None\n self.raw_captions = None\n self.features = None\n self.word2int = json.load(open('word2int.json'))\n # Use a single caption per video when forming training minibatches (different\n # captions from the same video may still be used across different minibatches)\n self.captions_per_video = 1\n self.ordered_experts = list(raw_input_dims.keys())\n # Training and test lists are set by dataset-specific subclasses\n self.partition_lists = {}\n self.configure_train_test_splits(split_name=split_name)\n # All retrieval-based tasks use a single dataloader (and handle the retrieval\n # data separately), whereas for classification we use one dataloader for\n # training and one for validation.\n self.logger.info(\"The current task is retrieval\")\n self.sample_list = self.partition_lists[\"train\"]\n self.num_samples = len(self.sample_list)\n num_val = len(self.partition_lists[\"val\"])", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:102-125" + }, + "2241": { + "file_id": 176, + "content": "This code initializes class variables for a dataset object. It sets the label features, video labels, raw captions, and features to None. It loads the word2int mapping from a JSON file. The code allows for one caption per video in training minibatches. It creates an ordered list of experts based on input dimensions. The training and test lists are set by dataset-specific subclasses. The code is for retrieval tasks and uses a single dataloader, handling retrieval data separately. 
It sets the sample list to the training partition and calculates the total number of samples.", + "type": "comment" + }, + "2242": { + "file_id": 176, + "content": " self.raw_input_dims = raw_input_dims\n # we store default paths to enable visualisations (this can be overloaded by\n # dataset-specific classes)\n self.video_path_retrieval = [\n f\"videos/{x}.mp4\" for x in self.partition_lists[\"val\"]\n ]\n # NOTE: We use nans rather than zeros to indicate missing faces, unless we wish\n # to test single modality strength, which requires passing zeroed features for\n # missing videos\n if use_zeros_for_missing:\n self.MISSING_VAL = 0\n else:\n self.MISSING_VAL = np.nan\n # load the dataset-specific features into memory\n self.load_features()\n if text_agg == \"avg\":\n self.logger.info(\"averaging the text features...\")\n for key, val in self.text_features.items():\n self.text_features[key] = [\n np.mean(x, 0, keepdims=1) for x in val\n ]\n self.logger.info(\"finished averaging the text features\")", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:127-152" + }, + "2243": { + "file_id": 176, + "content": "The code sets default paths for video retrieval, defines missing value strategy based on use_zeros_for_missing argument, loads dataset-specific features into memory and averages text features when text_agg is set to \"avg\".", + "type": "comment" + }, + "2244": { + "file_id": 176, + "content": " self.trn_config = {}\n self.raw_config = {}\n self.tensor_storage = expert_tensor_storage(self.experts,\n self.feat_aggregation)\n for static_expert in self.tensor_storage[\"fixed\"]:\n if static_expert in self.feat_aggregation:\n if \"trn_seg\" in self.feat_aggregation[static_expert].keys():\n self.trn_config[static_expert] = \\\n self.feat_aggregation[static_expert][\"trn_seg\"]\n if \"raw\" in self.feat_aggregation[static_expert][\"temporal\"]:\n self.raw_config[static_expert] = 1\n retrieval = {\n expert: np.zeros(\n (num_val, self.max_tokens[expert], raw_input_dims[expert]))\n for expert in self.tensor_storage[\"variable\"]\n }\n retrieval.update({\n expert: np.zeros((num_val, raw_input_dims[expert]))\n for expert in self.tensor_storage[\"fixed\"]\n })\n self.retrieval = retrieval", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:154-175" + }, + "2245": { + "file_id": 176, + "content": "This code initializes training and raw configuration dictionaries, creates a tensor storage object, iterates through static experts, adds their relevant configurations to the dictionaries, and then builds a retrieval dictionary for both fixed and variable experts.", + "type": "comment" + }, + "2246": { + "file_id": 176, + "content": " self.test_ind = {\n expert: paddle.ones([num_val])\n for expert in self.experts\n }\n self.raw_captions_retrieval = [None] * num_val\n # avoid evaluation on missing queries\n self.query_masks = np.zeros((num_val, num_test_captions))\n self.text_token_mask = np.zeros((num_val, num_test_captions))\n self.text_retrieval = np.zeros((num_val, self.num_test_captions,\n self.max_tokens[\"text\"], self.text_dim))\n self.cap_retrieval = paddle.zeros(\n [num_val, self.num_test_captions, self.max_tokens[\"text\"]],\n dtype='int64'\n ) #self.cap_retrieval = th.zeros((num_val, self.num_test_captions, self.max_tokens[\"text\"]))\n self.att_retrieval = paddle.zeros(\n [num_val, self.num_test_captions, self.max_tokens[\"text\"]],\n dtype='int64'\n ) #self.att_retrieval = th.zeros((num_val, self.num_test_captions, self.max_tokens[\"text\"]))\n 
save_cap = []\n for ii, video_name in enumerate(self.partition_lists[\"val\"]):", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:176-197" + }, + "2247": { + "file_id": 176, + "content": "The code is initializing various arrays and tensors for evaluating the model on validation data. It sets up masks, retrieval tensors for text, captions, and attention, and prepares an empty list for saving the validation captions. This code is part of a larger function that appears to be setting up a dataset for video captioning or related task.", + "type": "comment" + }, + "2248": { + "file_id": 176, + "content": " self.raw_captions_retrieval[ii] = self.raw_captions[video_name]\n for expert in self.tensor_storage[\"fixed\"].intersection(\n self.experts):\n feats = self.features[expert][video_name]\n drop = self.has_missing_values(feats)\n self.test_ind[expert][ii] = not drop\n self.retrieval[expert][ii] = feats\n if drop:\n self.retrieval[expert][ii][:] = self.MISSING_VAL\n if self.feat_aggregation[expert].get(\"binarise\", False):\n keep = np.logical_not(\n np.isnan(self.retrieval[expert][:, 0, 0]))\n marker = np.ones_like(self.retrieval[expert][keep])\n self.retrieval[expert][keep] = marker\n for expert in self.tensor_storage[\"variable\"].intersection(\n self.experts):\n feats = self.features[expert][video_name]\n drop = self.has_missing_values(feats)", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:199-217" + }, + "2249": { + "file_id": 176, + "content": "This code initializes the retrieval and test indices for each expert in both fixed and variable tensor storage. It handles missing values by replacing them with 'MISSING_VAL' and binarizing non-missing features using marker values if requested.", + "type": "comment" + }, + "2250": { + "file_id": 176, + "content": " self.test_ind[expert][ii] = not drop\n if drop:\n self.retrieval[expert][ii][:] = self.MISSING_VAL\n if self.feat_aggregation[expert].get(\"binarise\", False):\n keep = np.logical_not(\n np.isnan(self.retrieval[expert][:, 0, 0]))\n marker = np.ones_like(self.retrieval[expert][keep])\n self.retrieval[expert][keep] = marker\n if self.test_ind[expert][ii]:\n keep = min(self.max_tokens[expert], len(feats))\n self.retrieval[expert][ii, :keep, :] = feats[:keep]\n candidates_sentences = self.text_features[video_name]\n if self.restrict_test_captions is not None:\n keep_sent_idx = self.restrict_test_captions[video_name]\n candidates_sentences = [candidates_sentences[keep_sent_idx]]\n self.query_masks[ii, :len(candidates_sentences)] = 1\n for test_caption_idx in range(self.num_test_captions):", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:218-237" + }, + "2251": { + "file_id": 176, + "content": "The code is handling the process of selecting video features and test captions for a specific expert. It drops certain entries, sets missing values where needed, applies binarization if required, and limits the number of tokens based on maximum token limit. It also restricts test captions if specified by the user. 
Finally, it sets query masks to prepare for further processing.", + "type": "comment" + }, + "2252": { + "file_id": 176, + "content": " if len(candidates_sentences) <= test_caption_idx:\n break\n keep = min(len(candidates_sentences[test_caption_idx]),\n self.max_tokens[\"text\"])\n self.text_token_mask[ii, test_caption_idx] = keep\n sent = self.raw_captions_retrieval[ii][test_caption_idx]\n sent = \" \".join(sent)\n sent = sent.strip()\n encoded_dict = self.tokenizer.__call__(\n sent,\n max_seq_len=self.max_tokens[\"text\"],\n pad_to_max_seq_len=True,\n return_attention_mask=True,\n truncation_strategy='longest_first')\n cap_ids = paddle.to_tensor(encoded_dict['input_ids'])\n attention_mask = paddle.to_tensor(\n encoded_dict['attention_mask'])\n save_cap.append(sent)\n self.cap_retrieval[ii, test_caption_idx, :] = cap_ids\n self.att_retrieval[ii, test_caption_idx, :] = attention_mask", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:238-257" + }, + "2253": { + "file_id": 176, + "content": "This code is iterating over a list of candidate sentences, breaking when the index exceeds the list length. For each sentence, it sets the number of tokens to keep based on the maximum allowed and masks the corresponding tokens. It then encodes the sentence into tokenized input IDs and attention mask for PaddlePaddle's model, appending the original sentence to a save list, storing the tokenized inputs in 'cap_retrieval', and the attention masks in 'att_retrieval'.", + "type": "comment" + }, + "2254": { + "file_id": 176, + "content": " if ii % 500 == 0 and test_caption_idx == 0:\n msg = (\n f\"{ii}/{len(self.partition_lists['val'])} will evaluate \"\n f\"sentence {test_caption_idx} out of \"\n f\"{len(candidates_sentences)} (has {keep} words) \"\n f\"{video_name}\")\n self.logger.info(msg)\n text_feats = candidates_sentences[test_caption_idx][:keep]\n if text_feats.shape[0] == 0:\n text_feats = 0\n raise ValueError(\"empty text features!\")\n self.text_retrieval[ii, test_caption_idx, :keep, :] = text_feats\n with open('run_cap.pkl', 'wb') as f:\n pkl.dump(save_cap, f)\n self.sanity_checks()\n def configure_train_test_splits(self, split_name):\n \"\"\"Partition the datset into train/val/test splits.\n Args:\n split_name (str): the name of the split\n \"\"\"\n self.paths = type(self).dataset_paths()", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:258-280" + }, + "2255": { + "file_id": 176, + "content": "The code is checking the progress of a dataset evaluation, creating text features for each sentence in the list, storing them in an array and then dumping the saved captions into a file called 'run_cap.pkl'. 
It also includes a function to configure train/test splits of the dataset.", + "type": "comment" + }, + "2256": { + "file_id": 176, + "content": " print(\"loading training/val splits....\")\n tic = time.time()\n for subset, path in self.paths[\"subset_list_paths\"][split_name].items():\n root_feat = Path(self.root_feat)\n subset_list_path = root_feat / path\n if subset == \"train\" and self.eval_only:\n rows = []\n else:\n with open(subset_list_path) as f:\n rows = f.read().splitlines()\n self.partition_lists[subset] = rows\n print(\"done in {:.3f}s\".format(time.time() - tic))\n self.split_name = split_name\n def collate_data(self, data):\n batch_size = len(data)\n tensors = {}\n for expert in self.tensor_storage[\"fixed\"]:\n if expert in self.trn_config.keys():\n tensors[expert] = paddle.to_tensor(\n np.zeros((batch_size, self.trn_config[expert],\n self.raw_input_dims[expert])))\n else:\n tensors[expert] = paddle.to_tensor(", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:281-304" + }, + "2257": { + "file_id": 176, + "content": "The code loads training/validation splits, reads and stores them in partition lists for later use, and initializes tensor storage for the PaddleVideo application.", + "type": "comment" + }, + "2258": { + "file_id": 176, + "content": " np.zeros((batch_size, self.raw_input_dims[expert])))\n # Track which indices of each modality are available in the present batch\n ind = {\n expert: paddle.to_tensor(np.zeros(batch_size))\n for expert in self.experts\n }\n tensors.update({\n expert: paddle.to_tensor(\n np.zeros((batch_size, self.max_tokens[expert],\n self.raw_input_dims[expert])))\n for expert in self.tensor_storage[\"variable\"]\n })\n text_tensor = paddle.to_tensor(\n np.zeros((batch_size, self.captions_per_video,\n self.max_tokens[\"text\"], self.text_dim)))\n text_token_mask = paddle.to_tensor(\n np.zeros((batch_size, self.captions_per_video)))\n text_cap_id = paddle.zeros([batch_size, self.max_tokens[\"text\"]],\n dtype='int64')\n text_att_mask = paddle.zeros([batch_size, self.max_tokens[\"text\"]],\n dtype='int64')", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:305-327" + }, + "2259": { + "file_id": 176, + "content": "This code initializes tensors for a batch of data in a dataset. It creates zero-initialized tensors for each expert (modality), and separate tensors for text data including token masks, cap IDs, and attention mask. 
These will be filled with actual data as the batch is processed.", + "type": "comment" + }, + "2260": { + "file_id": 176, + "content": " for ii, _ in enumerate(data):\n datum = data[ii]\n for expert in self.experts:\n ind[expert][ii] = datum[f\"{expert}_ind\"]\n for expert in self.tensor_storage[\"fixed\"]:\n tensors[expert][ii] = datum[expert]\n for expert in self.tensor_storage[\"variable\"]:\n if ind[expert][ii]:\n keep = min(len(datum[expert]), self.max_tokens[expert])\n if keep:\n tensors[expert][ii, :keep, :] = datum[expert][:keep]\n else:\n tensors[expert][ii, :, :] = self.MISSING_VAL\n text = datum[\"text\"]\n cap_id = datum[\"cap_id\"]\n att_mask = datum[\"att_mask\"]\n text_cap_id[ii, :] = paddle.to_tensor(cap_id)\n text_att_mask[ii, :] = paddle.to_tensor(att_mask)\n for jj in range(self.captions_per_video):\n keep = min(len(text[jj]), self.max_tokens[\"text\"])\n text_tensor[ii, jj, :keep, :] = text[jj][:keep]", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:329-350" + }, + "2261": { + "file_id": 176, + "content": "This code iterates through a dataset, extracting data for various experts and creating tensors from it. It handles missing values and stores text and mask information in separate tensors.", + "type": "comment" + }, + "2262": { + "file_id": 176, + "content": " text_token_mask[ii, jj] = keep\n ind = {key: ensure_tensor(val) for key, val in ind.items()}\n experts = OrderedDict(\n (expert, paddle.to_tensor(tensors[expert], dtype='float32'))\n for expert in self.ordered_experts)\n for expert in self.experts:\n if self.feat_aggregation[expert].get(\"binarise\", False):\n replace = np.logical_not(paddle.isnan(experts[expert][:, 0, 0]))\n experts[expert][replace] = paddle.ones_like(\n experts[expert][replace])\n minibatch = {\"experts\": experts, \"ind\": ind}\n minibatch[\"text\"] = paddle.to_tensor(text_tensor, dtype='float32')\n minibatch[\"cap_id\"] = paddle.to_tensor(text_cap_id, dtype='int64')\n minibatch[\"att_mask\"] = paddle.to_tensor(text_att_mask, dtype='int64')\n minibatch[\"text_token_mask\"] = paddle.to_tensor(text_token_mask)\n return minibatch\n def process_sent(self, sent, max_words, EOS: int = 1, UNK: int = 2):\n # set EOS=1, UNK=2 by default, consistent with file 'word2int.json'.", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:351-372" + }, + "2263": { + "file_id": 176, + "content": "This code creates a minibatch for video features and text data. It applies binarization to some features, converts tensors, and prepares inputs for machine learning models. 
The process_sent function sets default values for EOS and UNK consistent with the word2int.json file.", + "type": "comment" + }, + "2264": { + "file_id": 176, + "content": " tokens = [self.word2int.get(w, UNK) for w in sent]\n tokens = tokens[:max_words]\n tokens_len = len(tokens)\n tokens = np.array(tokens + [EOS] * (max_words - tokens_len))\n return tokens, tokens_len\n def __len__(self):\n return self.num_samples\n def __getitem__(self, idx):\n if idx < self.num_samples:\n vid = self.sample_list[idx]\n features = {}\n for expert in self.experts:\n if expert not in self.trn_config.keys():\n if expert in self.raw_config.keys():\n features[expert] = np.mean(self.features[expert][vid],\n axis=0)\n else:\n features[expert] = self.features[expert][vid]\n else:\n raw_frame_feats = self.features[expert][vid]\n new_length = 1\n num_frames = raw_frame_feats.shape[0]\n avg_duration = ((num_frames - new_length + 1) //", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:373-397" + }, + "2265": { + "file_id": 176, + "content": "This code defines a dataset class that loads and processes video features for text-to-video retrieval. It takes a list of videos, extracts expert features, and pads them to a fixed length. The class also supports indexing and has methods for getting the number of samples in the dataset.", + "type": "comment" + }, + "2266": { + "file_id": 176, + "content": " self.trn_config[expert])\n assert avg_duration > 0, \"average duration must be positive\"\n if avg_duration > 0:\n # maybe we could change to use average for each tiny segment\n # seems like use everything per iter\n offsets = np.multiply(\n list(range(self.trn_config[expert])), avg_duration)\n offsets += randint(avg_duration,\n size=self.trn_config[expert])\n new_frame_feats = np.zeros(\n (self.trn_config[expert], raw_frame_feats.shape[1]))\n for idx, xx in enumerate(offsets):\n new_frame_feats[idx, :] = raw_frame_feats[xx, :]\n msg = \"returning a wrong feature != segment num\"\n assert new_frame_feats.shape[0] == self.trn_config[\n expert], msg", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:398-413" + }, + "2267": { + "file_id": 176, + "content": "The code segments video frame features into smaller segments with a specified average duration, accounts for the last segment if the duration is not divisible by the specified interval, and ensures the number of new feature segments matches the expected number.", + "type": "comment" + }, + "2268": { + "file_id": 176, + "content": " features[expert] = new_frame_feats\n ind = {}\n for expert in self.ordered_experts:\n if expert in self.tensor_storage[\"flaky\"]:\n ind[expert] = not self.has_missing_values(features[expert])\n else:\n ind[expert] = 1\n # Handle some inconsistencies between how the text features are stored\n text = self.text_features[vid]\n if isinstance(text, list):\n pick = np.random.choice(len(text), size=self.captions_per_video)\n sent = self.raw_captions[vid][pick[0]]\n sent = \" \".join(sent)\n sent = sent.strip()\n text = np.array(text)[pick]\n encoded_dict = self.tokenizer.__call__(\n sent,\n max_seq_len=self.max_tokens[\"text\"],\n pad_to_max_seq_len=True,\n return_attention_mask=True,\n truncation_strategy='longest_first')", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:414-437" + }, + "2269": { + "file_id": 176, + "content": "This code is responsible for handling inconsistencies in text features storage. 
It randomly selects a caption from a list of captions for a given video, applies tokenization, and ensures that the sequence length does not exceed a maximum threshold. The result is stored in the encoded_dict variable.", + "type": "comment" + }, + "2270": { + "file_id": 176, + "content": " cap_id = encoded_dict['input_ids']\n token_type_ids = encoded_dict['token_type_ids']\n attention_mask = encoded_dict['attention_mask']\n else:\n pick = None\n text = np.random.choice(text, size=self.captions_per_video)\n # Return both the missing indices as well as the tensors\n sample = {\"text\": text}\n sample.update({\"cap_id\": cap_id})\n sample.update({\"att_mask\": attention_mask})\n sample.update({f\"{key}_ind\": val for key, val in ind.items()})\n sample.update(features)\n return sample\n def get_retrieval_data(self):\n experts = OrderedDict(\n (expert, paddle.to_tensor(self.retrieval[expert], dtype='float32'))\n for expert in self.ordered_experts)\n retrieval_data = {\n \"text\":\n paddle.to_tensor(ensure_tensor(self.text_retrieval),\n dtype='float32'),\n \"experts\":\n experts,\n \"cap_id\":", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:438-463" + }, + "2271": { + "file_id": 176, + "content": "This code is initializing a sample for video dataset, using either given or randomly chosen text. It creates a dictionary with cap_id, attention mask, and other tensors as key-value pairs, and returns the sample. The get_retrieval_data function converts retrieval data to tensors and adds them to a dictionary containing text and experts keys before returning it.", + "type": "comment" + }, + "2272": { + "file_id": 176, + "content": " paddle.to_tensor(self.cap_retrieval, dtype='int64'),\n \"att_mask\":\n paddle.to_tensor(self.att_retrieval, dtype='int64'),\n \"ind\":\n self.test_ind,\n \"text_token_mask\":\n paddle.to_tensor(self.text_token_mask)\n }\n meta = {\n \"query_masks\": self.query_masks,\n \"raw_captions\": self.raw_captions_retrieval,\n \"paths\": self.video_path_retrieval,\n }\n return retrieval_data, meta\n def has_missing_values(self, x):\n return isinstance(x, float) and np.isnan(x)\n def visual_feat_paths(self, model_spec, tag=None):\n \"\"\"Canonical path lookup for visual features\n \"\"\"\n if model_spec not in self.ordered_experts:\n self.logger.info(\n f\"Skipping load for {model_spec} (feature not requested)\")\n return f\"SKIPPED-{model_spec}\"\n feat_type, model_name, _ = model_spec.split(\".\")\n aggs = self.feat_aggregation[model_spec]\n base = f\"aggregated_{feat_type.replace('-', '_')}\"", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:464-492" + }, + "2273": { + "file_id": 176, + "content": "The function defines a dictionary 'retrieval_data' containing cap_retrieval, att_mask, test_ind, and text_token_mask. It also defines the 'meta' dictionary containing query_masks, raw_captions, and paths. The function returns both 'retrieval_data' and 'meta'. 
The code provides a path lookup for visual features and skips loading if the feature is not requested.", + "type": "comment" + }, + "2274": { + "file_id": 176, + "content": " required = (\"fps\", \"pixel_dim\", \"stride\")\n fps, pixel_dim, stride = [aggs.get(x, None) for x in required]\n if feat_type in {\"facecrops\", \"faceboxes\"}:\n base = f\"{base}_{fps}fps_{pixel_dim}px_stride{stride}\"\n elif feat_type not in {\"ocr\", \"speech\", \"audio\"}:\n base = f\"{base}_{fps}fps_{pixel_dim}px_stride{stride}\"\n for option in \"offset\", \"inner_stride\":\n if aggs.get(option, None) is not None:\n base += f\"_{option}{aggs[option]}\"\n feat_paths = []\n for agg in aggs[\"temporal\"].split(\"-\"):\n fname = f\"{model_name}-{agg}\"\n if aggs[\"type\"] == \"logits\":\n fname = f\"{fname}-logits\"\n if tag is not None:\n fname += f\"-{tag}\"\n feat_paths.append(Path(base) / f\"{fname}.pickle\")\n return feat_paths\n def log_assert(self, bool_, msg=\"\", verbose=True):\n \"\"\"Use assertions that will be written to the logs. This is a recipe from:\n http://code.activestate.com/recipes/577074-logging-asserts/", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:493-516" + }, + "2275": { + "file_id": 176, + "content": "The code defines a function that generates feature paths based on the provided arguments. It assembles a base string with parameters like fps, pixel_dim, and stride. If the feature type is \"facecrops\" or \"faceboxes\", it includes those parameters in the base string. For other types except for \"ocr\", \"speech\", and \"audio\", it also includes those parameters in the base string. It then adds optional parameters like offset and inner_stride if present. Finally, it generates a feature path list with file names and appends the tag if provided. The function also defines a logging assertion function that writes assertions to logs using a recipe from an external link.", + "type": "comment" + }, + "2276": { + "file_id": 176, + "content": " \"\"\"\n try:\n assert bool_, msg\n except AssertionError:\n # construct an exception message from the code of the calling frame\n last_stackframe = inspect.stack()[-2]\n source_file, line_no, func = last_stackframe[1:4]\n source = f\"Traceback (most recent call last):\\n\" + \\\n f\" File {source_file}, line {line_no}, in {func}\\n\"\n if verbose:\n # include more lines than that where the statement was made\n source_code = open(source_file).readlines()\n source += \"\".join(source_code[line_no - 3:line_no + 1])\n else:\n source += last_stackframe[-2][0].strip()\n self.logger.debug(f\"{msg}\\n{source}\")\n raise AssertionError(f\"{msg}\\n{source}\")\n def summary_stats(self):\n \"\"\"Report basic statistics about feature availability and variable lengths\n across the different subsets of the data.\n \"\"\"\n self.logger.info(\"Computing feature stats...\")", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:517-539" + }, + "2277": { + "file_id": 176, + "content": "The code snippet is a function that checks an assertion. If the assertion fails, it constructs an exception message containing the traceback from the calling frame and raises an AssertionError with this message. 
Another function called \"summary_stats\" reports basic statistics about feature availability and variable lengths across different data subsets.", + "type": "comment" + }, + "2278": { + "file_id": 176, + "content": " queries = self.ordered_experts + [\"text\"]\n for subset, keep in self.partition_lists.items():\n keep = set(keep)\n print(f\"Summary for {subset}\")\n for expert in queries:\n if expert in self.features:\n feats = self.features[expert]\n else:\n feats = self.text_features\n vals = [feats[key] for key in keep]\n missing = 0\n sizes = []\n for val in vals:\n if self.has_missing_values(val):\n missing += 1\n else:\n sizes.append(len(val))\n if sizes:\n stat_str = (f\"min: {np.min(sizes):4}, \"\n f\"max: {np.max(sizes):4}, \"\n f\"mean: {np.mean(sizes):.1f}\")\n print(\n f\"{subset}: missing: {missing:4}, {stat_str} {expert}\")", + "type": "code", + "location": "/applications/T2VLAD/base/base_dataset.py:540-562" + }, + "2279": { + "file_id": 176, + "content": "This code partitions datasets based on predefined subsets and checks the sizes of the features. It prints a summary for each subset, counting missing values and displaying the minimum, maximum, and mean sizes of features. This ensures that the dataset is properly partitioned and allows for efficient analysis.", + "type": "comment" + }, + "2280": { + "file_id": 177, + "content": "/applications/T2VLAD/base/base_model.py", + "type": "filepath" + }, + "2281": { + "file_id": 177, + "content": "The code defines an abstract base class for all PaddleVideo models, requiring implementation of forward method and including a trainable parameter count in __str__ output. It also imports libraries, checks stop_gradient flag, and calculates parameter shapes.", + "type": "summary" + }, + "2282": { + "file_id": 177, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle.nn as nn\nfrom abc import abstractmethod\nclass BaseModel(nn.Layer):\n \"\"\"\n Base class for all models\n \"\"\"\n @abstractmethod\n def forward(self, *inputs):\n \"\"\"\n Forward pass logic\n :return: Model output\n \"\"\"\n raise NotImplementedError\n def __str__(self):\n \"\"\"\n Model prints with number of trainable parameters\n \"\"\"\n model_parameters = filter(lambda p: p.stop_gradient==False, self.parameters())\n params = sum([np.prod(p.shape) for p in model_parameters])", + "type": "code", + "location": "/applications/T2VLAD/base/base_model.py:1-36" + }, + "2283": { + "file_id": 177, + "content": "This code defines an abstract base class for all models in the PaddleVideo application. It requires derived classes to implement the `forward` method and includes a `__str__` method that prints the model with the number of trainable parameters. 
The code also imports necessary libraries, checks for stop_gradient flag on parameters, and calculates product of shape for each parameter.", + "type": "comment" + }, + "2284": { + "file_id": 177, + "content": " return super().__str__() + f\"\\nTrainable parameters: {params}\"", + "type": "code", + "location": "/applications/T2VLAD/base/base_model.py:37-37" + }, + "2285": { + "file_id": 177, + "content": "The code returns a string containing the superclass's __str__ method, followed by the number of trainable parameters in the current instance.", + "type": "comment" + }, + "2286": { + "file_id": 178, + "content": "/applications/T2VLAD/base/base_trainer.py", + "type": "filepath" + }, + "2287": { + "file_id": 178, + "content": "The code introduces a trainer class for PaddleVideo's T2VLAD application, managing features like multi-epoch training, monitoring performance metrics and model saving during training. It also manages model checkpoints to prevent storage overload.", + "type": "summary" + }, + "2288": { + "file_id": 178, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport re\nimport copy\nimport time\nimport paddle\nimport pickle\nimport numpy as np\nfrom pathlib import Path\nfrom abc import abstractmethod\nclass BaseTrainer:\n \"\"\" Base class for all trainers\n \"\"\"\n def __init__(self, model, loss, metrics, optimizer, config, mini_train,\n num_keep_ckpts, skip_tboard):\n self.config = config\n self.logger = config.get_logger(\n 'trainer', config['trainer']['verbosity'])\n self.model = model\n self.loss = loss\n self.metrics = metrics", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:1-33" + }, + "2289": { + "file_id": 178, + "content": "This code defines a base class for all trainers. It takes in parameters such as model, loss function, metrics to track, optimizer, and configuration. 
It also initializes the logger and sets up the necessary components for training.", + "type": "comment" + }, + "2290": { + "file_id": 178, + "content": " self.optimizer = optimizer\n self.num_keep_ckpts = num_keep_ckpts\n self.skip_tboard = skip_tboard or mini_train\n # This property can be overriden in the subclass\n self.skip_first_n_saves = 0\n cfg_trainer = config['trainer']\n self.epochs = cfg_trainer['epochs']\n self.save_period = cfg_trainer['save_period']\n self.monitor = cfg_trainer.get('monitor', 'off')\n self.save_only_best = cfg_trainer.get(\"save_only_best\", True)\n self.val_freq = cfg_trainer['val_freq']\n # configuration to monitor model performance and save best\n if self.monitor == 'off':\n self.mnt_mode = 'off'\n self.mnt_best = 0\n else:\n self.mnt_mode, self.mnt_metric = self.monitor.split()\n assert self.mnt_mode in ['min', 'max']\n self.mnt_best = np.inf if self.mnt_mode == 'min' else -np.inf\n self.early_stop = cfg_trainer.get('early_stop', np.inf)\n self.start_epoch = 1\n self.model_dir = config.save_dir", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:34-60" + }, + "2291": { + "file_id": 178, + "content": "This code is initializing the base trainer object with parameters from a configuration file. It sets optimizer, number of checkpoints to keep, whether to skip TensorBoard logging or not, and overridable properties like skipping the first N saves. It also assigns epochs, save period, monitor mode for model performance evaluation, best score to compare against, starts training from epoch 1, and sets the model directory.", + "type": "comment" + }, + "2292": { + "file_id": 178, + "content": " self.include_optim_in_save_model = config[\"trainer\"].get(\"include_optim_in_save_model\", 1)\n if config.resume is not None:\n self._resume_model(config.resume)\n @abstractmethod\n def _train_epoch(self, epoch):\n \"\"\"Training logic for an epoch\n :param epoch: Current epoch number\n \"\"\"\n raise NotImplementedError\n def train(self):\n \"\"\"Full training logic. Responsible for iterating over epochs, early stopping,\n modeling and logging metrics.\n \"\"\"\n for epoch in range(self.start_epoch, self.epochs + 1):\n result, cached_preds = self._train_epoch(epoch)\n if epoch % self.val_freq != 0:\n continue\n # save logged informations into log dict\n log = {'epoch': epoch}\n for key, value in result.items():\n if key == 'metrics':\n log.update({mtr.__name__: value[i]\n for i, mtr in enumerate(self.metrics)})\n elif key == 'val_metrics':", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:62-89" + }, + "2293": { + "file_id": 178, + "content": "This code defines a base trainer class for PaddleVideo's T2VLAD application. It includes methods to train for multiple epochs, handle resume from a saved state, and log training metrics. The trainer iterates over each epoch and calls the _train_epoch method to perform training logic. 
If validation frequency is set, it logs results at specified epochs.", + "type": "comment" + }, + "2294": { + "file_id": 178, + "content": " log.update({'val_' + mtr.__name__: value[i]\n for i, mtr in enumerate(self.metrics)})\n elif key == 'nested_val_metrics':\n # NOTE: currently only supports two layers of nesting\n for subkey, subval in value.items():\n for subsubkey, subsubval in subval.items():\n log[f\"val_{subkey}_{subsubkey}\"] = subsubval\n else:\n log[key] = value\n # print logged informations to the screen\n for key, value in log.items():\n self.logger.info(' {:15s}: {}'.format(str(key), value))\n # eval model according to configured metric, save best # ckpt as trained_model\n not_improved_count = 0\n best = False\n if self.mnt_mode != 'off':\n try:\n # check whether specified metric improved or not, according to\n # specified metric(mnt_metric)", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:90-110" + }, + "2295": { + "file_id": 178, + "content": "The code updates the log with metrics values, handles nested metrics, prints logged information to the screen, and checks if the metric improved for monitoring mode.", + "type": "comment" + }, + "2296": { + "file_id": 178, + "content": " lower = log[self.mnt_metric] <= self.mnt_best\n higher = log[self.mnt_metric] >= self.mnt_best\n improved = (self.mnt_mode == 'min' and lower) or \\\n (self.mnt_mode == 'max' and higher)\n except KeyError:\n msg = \"Warning: Metric '{}' not found, perf monitoring is disabled.\"\n self.logger.warning(msg.format(self.mnt_metric))\n self.mnt_mode = 'off'\n improved = False\n not_improved_count = 0\n raise ValueError(\"Pick a metric that will save models!!!!!!!!\")\n if improved:\n self.mnt_best = log[self.mnt_metric]\n # TODO(Samuel): refactor the code so that we don't move the model\n # off the GPU or duplicate on the GPU (we should be able to safely\n # copy the state dict directly to CPU)\n copy_model = copy.deepcopy(self.model)", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:111-128" + }, + "2297": { + "file_id": 178, + "content": "This code checks if the performance metric (mnt_metric) has improved and updates the best value accordingly. If the metric is not found, it disables performance monitoring and sets improved to False. It also raises a ValueError asking the user to choose a relevant metric.", + "type": "comment" + }, + "2298": { + "file_id": 178, + "content": " self.best_model = {\"epoch\": epoch, \"model\": copy_model}\n not_improved_count = 0\n best = True\n else:\n not_improved_count += 1\n if not_improved_count > self.early_stop:\n self.logger.info(\"Val performance didn\\'t improve for {} epochs. \"\n \"Training stops.\".format(self.early_stop))\n break\n if self.save_only_best:\n if epoch == self.epochs:\n best_model = self.best_model\n self.model = best_model[\"model\"]\n print(f\"saving the best model to disk (epoch {epoch})\")\n self._save_model(best_model[\"epoch\"], save_best=True)\n continue\n # If modeling is done intermittently, still save models that outperform\n # the best metric\n # save_best = best and not self.mnt_metric == \"epoch\"\n save_best = True", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:129-151" + }, + "2299": { + "file_id": 178, + "content": "This code snippet is responsible for early stopping and saving the best model. If validation performance does not improve after a certain number of epochs (early_stop), training stops. 
The best model is saved if save_only_best is True and only at the end of the epochs. Otherwise, any model that outperforms the current best metric will be saved intermittently.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/23.json b/docs/data/23.json new file mode 100644 index 000000000..fdcdb894c --- /dev/null +++ b/docs/data/23.json @@ -0,0 +1,543 @@ +{ + "2300": { + "file_id": 178, + "content": " # Due to the fast runtime/slow HDD combination, modeling can dominate\n # the total training time, so we optionally skip models for some of\n # the first epochs\n if epoch < self.skip_first_n_saves and not self.save_only_best:\n msg = f\"Skipping model save at epoch {epoch} <= {self.skip_first_n_saves}\"\n self.logger.info(msg)\n continue\n if epoch % self.save_period == 0 and save_best:\n self._save_model(epoch, save_best=best)\n print(\"This epoch, the save best :{}\".format(best))\n if best:\n for key, cached in cached_preds.items():\n log_dir = Path(self.config.log_dir)\n prediction_path = log_dir / f\"{key}_preds.txt\"\n prediction_logits_path = log_dir / f\"{key}_preds_logits.npy\"\n np.save(prediction_logits_path, cached[\"preds\"])\n gt_logits_path = log_dir / f\"{key}_gt_logits.npy\"", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:153-170" + }, + "2301": { + "file_id": 178, + "content": "This code snippet is used to control the frequency and conditions of model saving during training. It checks if the current epoch is less than a specified number (`self.skip_first_n_saves`) and if `self.save_only_best` is set to False. If either condition is true, it skips saving the model at that epoch. If both conditions are false or the first condition is false but the second one is true, it saves the model every `self.save_period` epochs when `save_best` is set to True. Additionally, if this epoch's save is considered the best (`best` is True), it logs all predictions for each key in `cached_preds`.", + "type": "comment" + }, + "2302": { + "file_id": 178, + "content": " np.save(gt_logits_path, cached[\"labels\"].cpu().numpy())\n vid_names = []\n sort_predict = np.argsort(cached[\"preds\"])[:, ::-1]\n with open(str(prediction_path), 'w') as f:\n for kk in range(cached[\"preds\"].shape[0]):\n pred_classes = [str(v) for v in sort_predict[kk, :]]\n vid_name = cached[\"vid_name\"][kk]\n if key == \"test\":\n vid_name = vid_name[kk].split('/')[-1] + '.mp4'\n row = f\"{vid_name} {' '.join(pred_classes)}\"\n print(row, file=f)\n vid_names.append(vid_name)\n save_name_path = log_dir / f\"{key}_vid_name.pkl\"\n with open(save_name_path, 'wb') as f:\n pickle.dump(vid_names, f)\n self.logger.info(f\"All {key} preds saved\")", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:171-186" + }, + "2303": { + "file_id": 178, + "content": "Saves the ground-truth labels and predicted classes for each video, writing them to disk in a specified format. 
It also saves the video names associated with these predictions and logs a message when all preds have been saved.", + "type": "comment" + }, + "2304": { + "file_id": 178, + "content": " self.logger.info(f\"Wrote result to: {str(prediction_path)}\")\n if epoch > self.num_keep_ckpts:\n self.purge_stale_models()\n def purge_stale_models(self):\n \"\"\"Remove models that are no longer neededself.\n NOTE: This function assumes that the `best` model has already been renamed\n to have a format that differs from `model-epoch.pth`\n \"\"\"\n all_ckpts = list(self.model_dir.glob(\"*.pdparams\"))\n found_epoch_ckpts = list(self.model_dir.glob(\"model-epoch*.pdparams\"))\n if len(all_ckpts) <= self.num_keep_ckpts:\n return\n msg = \"Expected at the best model to have been renamed to a different format\"\n if not len(all_ckpts) > len(found_epoch_ckpts):\n print(\"Warning, purging model, but the best epoch was not saved!\")\n # assert len(all_ckpts) > len(found_epoch_ckpts), msg\n # purge the oldest models\n regex = r\".*model-epoch(\\d+)[.pdparams$\"\n epochs = [int(re.search(regex, str(x)).groups()[0]) for x in found_epoch_ckpts]", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:187-210" + }, + "2305": { + "file_id": 178, + "content": "This code is responsible for managing the storage of model checkpoints and purging old or unnecessary models. It keeps track of the number of models to keep (`num_keep_ckpts`) and removes older ones if necessary. The `purge_stale_models()` function checks if all the checkpoints follow the expected format, then purges the oldest models by removing them from storage.", + "type": "comment" + }, + "2306": { + "file_id": 178, + "content": " sorted_ckpts = sorted(list(zip(epochs, found_epoch_ckpts)), key=lambda x: -x[0])\n for epoch, stale_ckpt in sorted_ckpts[self.num_keep_ckpts:]:\n tic = time.time()\n stale_ckpt.unlink()\n msg = f\"removing stale model [epoch {epoch}] [took {time.time() - tic:.2f}s]\"\n self.logger.info(msg)\n def _save_model(self, epoch, save_best=False):\n \"\"\"Saving models\n :param epoch: current epoch number\n :param log: logging information of the epoch\n :param save_best: if True, rename the saved model to 'trained_model.pdparams'\n \"\"\"\n arch = type(self.model).__name__\n state = {\n 'arch': arch,\n 'epoch': epoch,\n 'state_dict': self.model.state_dict(),\n 'monitor_best': self.mnt_best,\n 'config': self.config\n }\n if self.include_optim_in_save_model:\n state[\"optimizer\"] = self.optimizer.state_dict()\n filename = str(self.model_dir /\n 'model-epoch{}.pdparams'.format(epoch))", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:211-238" + }, + "2307": { + "file_id": 178, + "content": "This code snippet is responsible for saving and removing stale models during the training process. It saves model checkpoints at each epoch, keeps a specified number of the most recent ones, and deletes older checkpoints. 
The _save_model function saves the model state along with its architecture, current epoch, optimizer state if included, and configuration details into a .pdparams file in the specified directory.", + "type": "comment" + }, + "2308": { + "file_id": 178, + "content": " tic = time.time()\n self.logger.info(\"Saving model: {} ...\".format(filename))\n paddle.save(state, filename)\n self.logger.info(f\"Done in {time.time() - tic:.3f}s\")\n if save_best:\n self.logger.info(\"Updating 'best' model: {} ...\".format(filename))\n best_path = str(self.model_dir / 'trained_model.pdparams')\n paddle.save(state, best_path)\n self.logger.info(f\"Done in {time.time() - tic:.3f}s\")\n def _resume_model(self, resume_path):\n \"\"\" Resume from saved models\n :param resume_path: model path to be resumed\n \"\"\"\n resume_path = str(resume_path)\n self.logger.info(\"Loading model: {} ...\".format(resume_path))\n model = paddle.load(resume_path)\n self.model.load_dict(model)\n self.logger.info(f\"model loaded. Resume training from epoch {self.start_epoch}\")", + "type": "code", + "location": "/applications/T2VLAD/base/base_trainer.py:239-258" + }, + "2309": { + "file_id": 178, + "content": "Saves model with optional best model update after training completion. Allows resuming training from a previously saved state.", + "type": "comment" + }, + "2310": { + "file_id": 179, + "content": "/applications/T2VLAD/data/download_features.sh", + "type": "filepath" + }, + "2311": { + "file_id": 179, + "content": "Downloading and extracting datasets for MSRVTT dataset from remote server.", + "type": "summary" + }, + "2312": { + "file_id": 179, + "content": "mkdir MSRVTT\ncd MSRVTT\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/aggregated_text_feats.tar\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/mmt_feats.tar\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/raw-captions.pkl\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/train_list_jsfusion.txt\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/val_list_jsfusion.txt\ntar -xvf aggregated_text_feats.tar\ntar -xvf mmt_feats.tar", + "type": "code", + "location": "/applications/T2VLAD/data/download_features.sh:1-9" + }, + "2313": { + "file_id": 179, + "content": "Downloading and extracting datasets for MSRVTT dataset from remote server.", + "type": "comment" + }, + "2314": { + "file_id": 180, + "content": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py", + "type": "filepath" + }, + "2315": { + "file_id": 180, + "content": "This code defines MSR-Vtt dataset paths, performs type checking, and loads features for a specific expert, handling aggregation, caching, and saving raw captions. It also checks text features, verifies format, size, and number of test captions, calculates missing queries, and raises errors for incorrect query mask sum.", + "type": "summary" + }, + "2316": { + "file_id": 180, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport copy\nfrom pathlib import Path\nfrom utils import memory_summary\nfrom typeguard import typechecked\nfrom typing import Dict, Union, List\nfrom base.base_dataset import BaseDataset\nfrom zsvision.zs_utils import memcache, concat_features\nclass MSRVTT(BaseDataset):\n @staticmethod\n @typechecked\n def dataset_paths() -> Dict[str, Union[str, List[str], Path, Dict]]:\n subset_paths = {}\n split_name = \"jsfusion\"\n train_list_path = \"train_list_jsfusion.txt\"\n test_list_path = \"val_list_jsfusion.txt\"", + "type": "code", + "location": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:1-29" + }, + "2317": { + "file_id": 180, + "content": "The code snippet is part of the MSRVTT class in the PaddleVideo library. It defines a dataset for MSR-Vtt, a large-scale video description dataset. The dataset_paths method returns the paths to train and test data splits. This method is typechecked to ensure that input types match expected data structures.", + "type": "comment" + }, + "2318": { + "file_id": 180, + "content": " # NOTE: The JSFusion split (referred to as 1k-A in the paper) uses all\n # videos, but randomly samples a single caption per video from the test\n # set for evaluation. To reproduce this evaluation, we use the indices\n # of the test captions, and restrict to this subset during eval.\n js_test_cap_idx_path = \"jsfusion_val_caption_idx.pkl\"\n subset_paths[split_name] = {\"train\": train_list_path, \"val\": test_list_path}\n custom_paths = {\n \"features_audio\": [\"mmt_feats/features.audio.pkl\"],\n \"features_flow\": [\"mmt_feats/features.flow_agg.pkl\"],\n \"features_rgb\": [\"mmt_feats/features.rgb_agg.pkl\"],\n \"features_scene\": [\"mmt_feats/features.scene.pkl\"],\n \"features_face\": [\"mmt_feats/features.face_agg.pkl\"],\n \"features_ocr\": [\"mmt_feats/features.ocr.pkl\"],\n \"features_s3d\": [\"mmt_feats/features.s3d.pkl\"],\n \"features_speech\": [\"mmt_feats/features.speech.pkl\"],\n }\n text_feat_paths = {", + "type": "code", + "location": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:30-46" + }, + "2319": { + "file_id": 180, + "content": "This code defines the data split paths for training and validation sets, as well as custom feature paths for different types of features. 
The JSFusion test caption indices path is also specified to reproduce a specific evaluation subset.", + "type": "comment" + }, + "2320": { + "file_id": 180, + "content": " \"openai\": \"w2v_MSRVTT_openAIGPT.pickle\",\n }\n text_feat_paths = {key: Path(\"aggregated_text_feats\") / fname\n for key, fname in text_feat_paths.items()}\n feature_info = {\n \"custom_paths\": custom_paths,\n \"subset_list_paths\": subset_paths,\n \"text_feat_paths\": text_feat_paths,\n \"raw_captions_path\": \"raw-captions.pkl\",\n \"js_test_cap_idx_path\": js_test_cap_idx_path,\n }\n return feature_info\n def load_features(self):\n root_feat = Path(self.root_feat)\n feat_names = {}\n custom_path_key = \"custom_paths\"\n feat_names.update(self.paths[custom_path_key])\n features = {}\n for expert, rel_names in feat_names.items():\n if expert not in self.ordered_experts:\n continue\n feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])\n if len(feat_paths) == 1:\n features[expert] = memcache(feat_paths[0])", + "type": "code", + "location": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:47-71" + }, + "2321": { + "file_id": 180, + "content": "This code is loading features from the MSRVTT dataset. It defines paths for text features and raw captions, and then updates a dictionary with custom paths, subset list paths, text feature paths, raw caption path, and JS test caption index path. The load_features method retrieves these paths and loads the features accordingly.", + "type": "comment" + }, + "2322": { + "file_id": 180, + "content": " else:\n # support multiple forms of feature (e.g. max and avg pooling). For\n # now, we only support direct concatenation\n msg = f\"{expert}: Only direct concatenation of muliple feats is possible\"\n print(f\"Concatenating aggregates for {expert}....\")\n is_concat = self.feat_aggregation[expert][\"aggregate\"] == \"concat\"\n self.log_assert(is_concat, msg=msg)\n axis = self.feat_aggregation[expert][\"aggregate-axis\"]\n x = concat_features.cache_info() # pylint: disable=no-value-for-parameter\n print(f\"concat cache info: {x}\")\n features_ = concat_features(feat_paths, axis=axis)\n memory_summary()\n # Make separate feature copies for each split to allow in-place filtering\n features[expert] = copy.deepcopy(features_)\n self.features = features\n self.raw_captions = memcache(root_feat / self.paths[\"raw_captions_path\"])", + "type": "code", + "location": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:72-89" + }, + "2323": { + "file_id": 180, + "content": "This code is handling feature aggregation for a specific expert. It checks if the aggregation method is \"concat\" and then concatenates the features based on the given axis. If not, it throws an error message. The code also caches information about the concatenated features, copies the features for each split, and stores them in the 'features' dictionary. 
Finally, it saves raw captions using memcache and updates self.raw_captions.", + "type": "comment" + }, + "2324": { + "file_id": 180, + "content": " text_feat_path = root_feat / self.paths[\"text_feat_paths\"][self.text_feat]\n self.text_features = memcache(text_feat_path)\n if self.restrict_train_captions:\n # hash the video names to avoid O(n) lookups in long lists\n train_list = set(self.partition_lists[\"train\"])\n for key, val in self.text_features.items():\n if key not in train_list:\n continue\n if not self.split_name == \"full-test\":\n # Note that we do not perform this sanity check for the full-test\n # split, because the text features in the cached dataset will\n # already have been cropped to the specified\n # `resstrict_train_captions`\n expect = {19, 20}\n msg = f\"expected train text feats as lists with length {expect}\"\n has_expected_feats = isinstance(val, list) and len(val) in expect\n self.log_assert(has_expected_feats, msg=msg)", + "type": "code", + "location": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:90-108" + }, + "2325": { + "file_id": 180, + "content": "This code retrieves text features from the cache and checks if they belong to a specific training set. It also verifies that the train text features are in the expected format (a list with length 19 or 20).", + "type": "comment" + }, + "2326": { + "file_id": 180, + "content": " # restrict to the first N captions (deterministic)\n self.text_features[key] = val[:self.restrict_train_captions]\n self.summary_stats()\n def sanity_checks(self):\n if self.num_test_captions == 20:\n if len(self.partition_lists[\"val\"]) == 2990:\n missing = 6\n elif len(self.partition_lists[\"val\"]) == 1000:\n missing = 2\n elif len(self.partition_lists[\"val\"]) == 497:\n missing = 0\n else:\n raise ValueError(\"unrecognised test set\")\n msg = \"Expected to find two missing queries in MSRVTT for full eval\"\n correct_missing = self.query_masks.sum() == self.query_masks.size - missing\n self.log_assert(correct_missing, msg=msg)", + "type": "code", + "location": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:110-126" + }, + "2327": { + "file_id": 180, + "content": "The code checks if the number of test captions is set to 20 and verifies that the corresponding validation list size matches expected values. It calculates the missing queries based on the validation list size and raises a ValueError for unrecognized test sets. The code asserts that the difference between query mask sum and its size should be equal to the number of missing queries, with an error message if not correct.", + "type": "comment" + }, + "2328": { + "file_id": 181, + "content": "/applications/T2VLAD/data_loader/data_loaders.py", + "type": "filepath" + }, + "2329": { + "file_id": 181, + "content": "Creates a data loader class with LRU caching, PaddlePaddle library, and MSRVTT dataset for efficient training data access. Supports refreshing, clearing caches, setting args, creating datasets, printing cache info, and storing dataloader in an instance variable.", + "type": "summary" + }, + "2330": { + "file_id": 181, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport logging\nimport functools\nfrom pathlib import Path\nfrom typing import Dict, List\nfrom typeguard import typechecked\nfrom zsvision.zs_utils import memcache\nfrom data_loader.MSRVTT_dataset import MSRVTT\nfrom utils import HashableDict, HashableOrderedDict\n@functools.lru_cache(maxsize=64, typed=False)\ndef dataset_loader(\n use_zeros_for_missing: bool,\n eval_only: bool,\n data_dir: str,\n text_agg: str,\n text_feat: str,\n split_name: str,\n dataset_name: str,\n cls_partition: str,", + "type": "code", + "location": "/applications/T2VLAD/data_loader/data_loaders.py:1-36" + }, + "2331": { + "file_id": 181, + "content": "This code is a function for loading datasets, using LRU caching and PaddlePaddle library. It takes parameters like `use_zeros_for_missing`, `eval_only`, `data_dir`, `text_agg`, `text_feat`, `split_name`, `dataset_name`, and `cls_partition`. It imports MSRVTT dataset for loading specific datasets.", + "type": "comment" + }, + "2332": { + "file_id": 181, + "content": " root_feat_folder: str,\n text_dim: int,\n num_test_captions: int,\n restrict_train_captions: int,\n logger: logging.Logger,\n max_tokens: Dict[str, int],\n raw_input_dims: HashableOrderedDict,\n feat_aggregation: HashableDict,\n):\n print(f\"refreshing cache for {dataset_name} data loader [{split_name}]\")\n kwargs = dict(\n data_dir=Path(data_dir),\n text_dim=text_dim,\n logger=logger,\n eval_only=eval_only,\n text_agg=text_agg,\n text_feat=text_feat,\n max_tokens=max_tokens,\n split_name=split_name,\n cls_partition=cls_partition,\n raw_input_dims=raw_input_dims,\n root_feat_folder=root_feat_folder,\n feat_aggregation=feat_aggregation,\n num_test_captions=num_test_captions,\n use_zeros_for_missing=use_zeros_for_missing,\n restrict_train_captions=restrict_train_captions,\n )\n if dataset_name == \"MSRVTT\":\n dataset = MSRVTT(**kwargs)\n return dataset\nclass ExpertDataLoader:", + "type": "code", + "location": "/applications/T2VLAD/data_loader/data_loaders.py:37-69" + }, + "2333": { + "file_id": 181, + "content": "Function `create_dataset` takes parameters to create an instance of a specific dataset class (MSRVTT in this case) with specified options. 
The function returns the created dataset object.", + "type": "comment" + }, + "2334": { + "file_id": 181, + "content": " @typechecked\n def __init__(\n self,\n eval_only: bool,\n use_zeros_for_missing: bool,\n text_dim: int,\n batch_size: int,\n num_workers: int,\n num_test_captions: int,\n data_dir: str,\n text_agg: str,\n text_feat: str,\n split_name: str,\n dataset_name: str,\n root_feat_folder: str,\n max_tokens: Dict[str, int],\n raw_input_dims: Dict[str, int],\n feat_aggregation: Dict[str, Dict],\n logger: logging.Logger,\n restrict_train_captions: int = 0,\n drop_last: bool = False,\n refresh_lru_cache: bool = False,\n ):\n # Ensure that the dictionaries are hashable to allow use of caching\n raw_input_dims = HashableOrderedDict(raw_input_dims)\n feat_aggregation = HashableDict(feat_aggregation)\n max_tokens = HashableDict(max_tokens)\n if refresh_lru_cache:\n logger.info(\"Explicitly refreshing dataloader and cuda cache\")", + "type": "code", + "location": "/applications/T2VLAD/data_loader/data_loaders.py:71-101" + }, + "2335": { + "file_id": 181, + "content": "This code is a constructor for a data loader class that takes various parameters like eval_only, use_zeros_for_missing, text_dim, batch_size, etc. It initializes the object and ensures dictionaries are hashable to enable caching, and provides an optional refresh of dataloader and cuda cache.", + "type": "comment" + }, + "2336": { + "file_id": 181, + "content": " dataset_loader.cache_clear()\n memcache.cache_clear()\n common_kwargs = dict(\n logger=logger,\n data_dir=data_dir,\n text_dim=text_dim,\n text_agg=text_agg,\n eval_only=eval_only,\n text_feat=text_feat,\n max_tokens=max_tokens,\n dataset_name=dataset_name,\n split_name=split_name,\n root_feat_folder=root_feat_folder,\n use_zeros_for_missing=use_zeros_for_missing,\n num_test_captions=num_test_captions,\n raw_input_dims=raw_input_dims,\n feat_aggregation=feat_aggregation,\n restrict_train_captions=restrict_train_captions,\n )\n dataset = dataset_loader(cls_partition=\"train\", **common_kwargs)\n x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter\n logger.info(f\"cache info {x}\")\n self.dataloaders = {\"dataset\": dataset}\n self.dataloaders[\"retrieval\"] = dataset.get_retrieval_data()", + "type": "code", + "location": "/applications/T2VLAD/data_loader/data_loaders.py:102-127" + }, + "2337": { + "file_id": 181, + "content": "This code clears dataset and memory caches, sets common arguments for a specific dataset loader function, creates the dataset with these args, prints cache information, and stores the created dataloader in an instance variable.", + "type": "comment" + }, + "2338": { + "file_id": 181, + "content": " if not eval_only:\n train_loader = paddle.io.DataLoader(\n dataset=dataset,\n batch_size=batch_size,\n num_workers=num_workers,\n collate_fn=dataset.collate_data,\n drop_last=drop_last,\n shuffle=True,\n )\n self.dataloaders[\"train\"] = train_loader\n logger.info(f\"Loading data loaders with {num_workers} workers\")\n self.num_test_captions = num_test_captions\n self.dataset_name = dataset_name\n def __getitem__(self, key):\n return self.dataloaders[key]", + "type": "code", + "location": "/applications/T2VLAD/data_loader/data_loaders.py:129-145" + }, + "2339": { + "file_id": 181, + "content": "This function creates a DataLoader for training data with specified parameters and stores it in the self.dataloaders dictionary. It also logs the number of workers used and sets num_test_captions and dataset_name variables. 
The __getitem__ method returns the dataloader based on the provided key from the self.dataloaders dictionary.", + "type": "comment" + }, + "2340": { + "file_id": 182, + "content": "/applications/T2VLAD/logger/__init__.py", + "type": "filepath" + }, + "2341": { + "file_id": 182, + "content": "This code imports all functions and classes from the logger and log_parser modules in the T2VLAD application of PaddleVideo.", + "type": "summary" + }, + "2342": { + "file_id": 182, + "content": "from .logger import *\nfrom .log_parser import *", + "type": "code", + "location": "/applications/T2VLAD/logger/__init__.py:1-2" + }, + "2343": { + "file_id": 182, + "content": "This code imports all functions and classes from the logger and log_parser modules in the T2VLAD application of PaddleVideo.", + "type": "comment" + }, + "2344": { + "file_id": 183, + "content": "/applications/T2VLAD/logger/log_parser.py", + "type": "filepath" + }, + "2345": { + "file_id": 183, + "content": "The `log_summary` function gathers performance stats, identifies seeds, searches metrics, and calculates scores for epochs. If evaluation mode is \"fixed_num_epochs,\" it logs the fixed training length, then calculates mean and standard deviation for each metric in aggregated scores using numpy functions.", + "type": "summary" + }, + "2346": { + "file_id": 183, + "content": "import re\nimport scipy.stats\nimport logging\nimport numpy as np\nfrom collections import defaultdict\ndef log_summary(logger, log_path, eval_mode=\"test_run\", fixed_num_epochs=None):\n \"\"\"Extract performace statistics from experiment log files.\n Args:\n logger (logger): reference to primary logging instance\n log_path (Path): the path to the log file\n eval_mode (str): the method use to collect the statistics. Can be one of:\n `test_run`, `fixed_num_epochs` or `geometric_mean`\n NOTE: The `eval_mode` argument differs by dataset: for datasets which provide a\n validation set, we use validation set performance to complete a single test run. For\n datasets where no validation set is available, we aim to match prior work by either\n fixing the number of training epochs, or selecting directly from validation set\n performance (Details can be found in the supplementary material of the paper.)\n \"\"\"\n with open(str(log_path), \"r\") as f:\n log = f.read().splitlines()", + "type": "code", + "location": "/applications/T2VLAD/logger/log_parser.py:1-24" + }, + "2347": { + "file_id": 183, + "content": "The function `log_summary` extracts performance statistics from log files and takes arguments such as a logger reference, log file path, evaluation mode (test run, fixed number of epochs or geometric mean), and optional fixed number of epochs. 
The log is read, and the performance statistics are extracted based on the given evaluation mode.", + "type": "comment" + }, + "2348": { + "file_id": 183, + "content": " # keep track of the random seed used for the part of the logfile being processed\n current_seed = None\n # Regex tag for finding the seed\n seed_tag = \"Setting experiment random seed to\"\n if eval_mode == \"test_run\":\n subset = \"test\"\n else:\n subset = \"val\"\n for mode in \"t2v\", \"v2t\":\n logger.info(\"\")\n logger.info(\"----------------------------------------------------\")\n logger.info(f\"[{mode}] loaded log file with {len(log)} lines....\")\n logger.info(\"----------------------------------------------------\")\n # Search for the following metrics\n scores = {\n \"R1\": defaultdict(list),\n \"R5\": defaultdict(list),\n \"R10\": defaultdict(list),\n \"R50\": defaultdict(list),\n \"MedR\": defaultdict(list),\n \"MeanR\": defaultdict(list),\n }\n for row in log:\n if seed_tag in row:\n # Search for the log file entry describing the current random seed\n match = re.search(seed_tag + \" (\\d+)$\", row) # NOQA", + "type": "code", + "location": "/applications/T2VLAD/logger/log_parser.py:26-56" + }, + "2349": { + "file_id": 183, + "content": "This code is parsing a log file, identifying the random seed used for each part of the log. It searches for specific metrics and extracts information related to \"R1\", \"R5\", \"R10\", \"R50\", \"MedR\", and \"MeanR\" in two modes: \"t2v\" and \"v2t\". It also differentiates between evaluation modes like \"test_run\" and \"val\".", + "type": "comment" + }, + "2350": { + "file_id": 183, + "content": " assert len(match.groups()) == 1, \"expected a single regex match\"\n current_seed = match.groups()[0]\n if f\"{subset}_{mode}_metrics\" in row:\n tokens = row.split(\" \")\n for key in scores:\n tag = f\"{subset}_{mode}_metrics_{key}:\"\n if tag in tokens:\n pos = tokens.index(tag) + 1\n val = tokens[pos]\n val = float(val)\n assert current_seed is not None, \"failed to determine the seed\"\n scores[key][current_seed].append(val)\n agg_scores = {\"R1\": [], \"R5\": [], \"R10\": [], \"R50\": [], \"MedR\": [], \"MeanR\": []}\n # compute the best performance for a single epoch (i.e. sharing the same model\n # to compute all stats)\n geometric_stats = defaultdict(list)\n best_epochs = {}\n if eval_mode == \"geometric_mean\":\n raise NotImplementedError(\"Need to fix this for new log format\")", + "type": "code", + "location": "/applications/T2VLAD/logger/log_parser.py:57-78" + }, + "2351": { + "file_id": 183, + "content": "This code is parsing log data, extracting relevant metrics and scores for a specific seed. It asserts that the log matches the expected format and then populates a dictionary of scores for each seed. If the log contains a specific tag, it extracts the corresponding value and adds it to the appropriate score list. 
Finally, it defines an empty dictionary for aggregation and raises a NotImplementedError if evaluating in geometric mean mode as it needs to be fixed for new log format.", + "type": "comment" + }, + "2352": { + "file_id": 183, + "content": " consider = [\"R1\", \"R5\", \"R10\"]\n seeds = list(scores[\"R1\"].keys())\n for seed in seeds:\n for metric, subdict in scores.items():\n if metric in consider:\n geometric_stats[seed].append(subdict[seed])\n gms_raw = np.array(geometric_stats[seed])\n geo_means = scipy.stats.mstats.gmean(gms_raw, axis=0)\n best_epochs[seed] = np.argmax(geo_means)\n for metric, subdict in scores.items():\n for seed, values in subdict.items():\n if eval_mode == \"test_run\":\n stat = values[0]\n elif eval_mode == \"fixed_num_epochs\":\n stat = values[fixed_num_epochs - 1]\n elif \"LSMDC\" in log_path and eval_mode == \"geometric_mean\":\n stat = values[best_epochs[seed]]\n else:\n raise ValueError(f\"unrecognised eval_mode: {eval_mode}\")\n agg_scores[metric].append(stat)", + "type": "code", + "location": "/applications/T2VLAD/logger/log_parser.py:79-99" + }, + "2353": { + "file_id": 183, + "content": "Code calculates scores for different seeds and metrics, then selects the best epochs based on geometric means. It then determines the final score statistic for each metric depending on the eval_mode, and appends it to agg_scores.", + "type": "comment" + }, + "2354": { + "file_id": 183, + "content": " if eval_mode == \"fixed_num_epochs\":\n logger.info(f\"Reporting stats with fixed training length: {fixed_num_epochs}\")\n for metric, values in agg_scores.items():\n logger.info(f\"{metric}: {np.mean(values):.1f}, {np.std(values, ddof=1):.1f}\")", + "type": "code", + "location": "/applications/T2VLAD/logger/log_parser.py:101-104" + }, + "2355": { + "file_id": 183, + "content": "This code snippet checks if the evaluation mode is set to \"fixed_num_epochs\". If so, it logs a message indicating the fixed training length. Then, for each metric in the aggregated scores, it calculates the mean and standard deviation using numpy's `np.mean()` and `np.std()`, respectively, and logs the values.", + "type": "comment" + }, + "2356": { + "file_id": 184, + "content": "/applications/T2VLAD/logger/logger.py", + "type": "filepath" + }, + "2357": { + "file_id": 184, + "content": "This code sets up the logging configuration based on a provided JSON file. If the file is found, it modifies the filename paths in the configuration according to the save_dir and then uses logging.config.dictConfig() to configure the logging system. If the file is not found, it uses basicConfig() with default level to set up logging. 
The function returns the filename for the \"info_file_handler\".", + "type": "summary" + }, + "2358": { + "file_id": 184, + "content": "import os\nimport logging\nimport logging.config\nfrom pathlib import Path\nfrom utils import read_json\ndef setup_logging(save_dir, log_config='logger/logger_config.json',\n default_level=logging.INFO):\n \"\"\"Setup logging configuration.\"\"\"\n print(os.getcwd())\n log_config = Path(log_config)\n print(f\"log config: {log_config} exists: {log_config.exists()}\")\n if log_config.is_file():\n config = read_json(log_config)\n # modify logging paths based on run config\n for _, handler in config['handlers'].items():\n if 'filename' in handler:\n handler['filename'] = str(save_dir / handler['filename'])\n logging.config.dictConfig(config)\n else:\n print(f\"Warning: logging configuration file is not found in {log_config}.\")\n logging.basicConfig(level=default_level)\n return config[\"handlers\"][\"info_file_handler\"][\"filename\"]", + "type": "code", + "location": "/applications/T2VLAD/logger/logger.py:1-25" + }, + "2359": { + "file_id": 184, + "content": "This code sets up the logging configuration based on a provided JSON file. If the file is found, it modifies the filename paths in the configuration according to the save_dir and then uses logging.config.dictConfig() to configure the logging system. If the file is not found, it uses basicConfig() with default level to set up logging. The function returns the filename for the \"info_file_handler\".", + "type": "comment" + }, + "2360": { + "file_id": 185, + "content": "/applications/T2VLAD/model/loss.py", + "type": "filepath" + }, + "2361": { + "file_id": 185, + "content": "The code implements max margin ranking loss and calculates cosine similarity between images and sentences, including a ContrastiveLoss class for contrastive learning. It also computes the cost for contrastive learning and video-level loss in T2VLAD models with masks, comparisons, and scalings.", + "type": "summary" + }, + "2362": { + "file_id": 185, + "content": "\"\"\"This module contains an implementation of the max margin ranking loss, slightly\nmodified from this code:\nhttps://github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/loss.py\nThe modification is the `fix_norm` conditional, which removes zero terms from the\ndiagonal when performing the averaging calculation.\nOriginal licence below.\n\"\"\"\n# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\ndef cosine_sim(im, s):", + "type": "code", + "location": "/applications/T2VLAD/model/loss.py:1-28" + }, + "2363": { + "file_id": 185, + "content": "This code snippet contains an implementation of the max margin ranking loss, modified from a source code, and includes functions to calculate cosine similarity between images and sentences. 
The original code is licensed under the Apache License 2.0.", + "type": "comment" + }, + "2364": { + "file_id": 185, + "content": " '''cosine similarity between all the image and sentence pairs\n '''\n inner_prod = im.mm(s.t())\n im_norm = paddle.sqrt((im ** 2).sum(axis=1).reshape([-1, 1]) + 1e-18) \n s_norm = paddle.sqrt((s ** 2).sum(axis=1).reshape([-1, 1]) + 1e-18)\n sim = inner_prod / (im_norm * s_norm)\n return sim\nclass ContrastiveLoss(nn.Layer):\n '''compute contrastive loss\n '''\n def __init__(self, margin=0, max_violation=True, direction='bi', topk=1):\n '''Args:\n direction: i2t for negative sentence, t2i for negative image, bi for both\n '''\n super().__init__()\n self.margin = margin\n self.max_violation = max_violation\n self.direction = direction\n self.topk = topk\n def forward(self, scores, margin=None, average_batch=True):\n '''\n Args:\n scores: image-sentence score matrix, (batch, batch)\n the same row of im and s are positive pairs, different rows are negative pairs\n '''\n if margin is None:\n margin = self.margin\n batch_size = scores.shape[0] \n diagonal = paddle.diagonal(scores).reshape([batch_size, 1])", + "type": "code", + "location": "/applications/T2VLAD/model/loss.py:29-61" + }, + "2365": { + "file_id": 185, + "content": "This code calculates cosine similarity between image and sentence pairs, and defines a ContrastiveLoss class to compute contrastive loss for contrastive learning.", + "type": "comment" + }, + "2366": { + "file_id": 185, + "content": " # mask to clear diagonals which are positive pairs\n pos_masks = paddle.eye(batch_size).astype('bool') \n batch_topk = min(batch_size, self.topk)\n if self.direction == 'i2t' or self.direction == 'bi':\n d1 = diagonal.expand_as(scores) # same collumn for im2s (negative sentence)\n # compare every diagonal score to scores in its collumn\n # caption retrieval\n cost_s = (margin + scores - d1).clip(min=0)\n cost_s[pos_masks] = 0 \n if self.max_violation:\n cost_s, _ = paddle.topk(cost_s, batch_topk, axis=1)\n cost_s = cost_s / batch_topk\n if average_batch:\n cost_s = cost_s / batch_size\n else:\n if average_batch:\n cost_s = cost_s / (batch_size * (batch_size - 1))\n cost_s = paddle.sum(cost_s)\n if self.direction == 't2i' or self.direction == 'bi':\n d2 = diagonal.t().expand_as(scores) # same row for s2im (negative image)\n # compare every diagonal score to scores in its row\n cost_im = (margin + scores - d2).clip(min=0)", + "type": "code", + "location": "/applications/T2VLAD/model/loss.py:62-85" + }, + "2367": { + "file_id": 185, + "content": "This code segment calculates the cost for negative pairs in a contrastive learning task. It first creates masks to clear diagonal values, then compares each diagonal score with scores within its column or row (depending on direction), and applies a margin to create positive pairs. 
The cost is calculated based on the max violation method and averaged according to specific conditions.", + "type": "comment" + }, + "2368": { + "file_id": 185, + "content": " cost_im[pos_masks] = 0 \n if self.max_violation:\n cost_im, _ = paddle.topk(cost_im, batch_topk, axis=0)\n cost_im = cost_im / batch_topk\n if average_batch:\n cost_im = cost_im / batch_size\n else:\n if average_batch:\n cost_im = cost_im / (batch_size * (batch_size - 1))\n cost_im = paddle.sum(cost_im)\n if self.direction == 'i2t':\n return cost_s\n elif self.direction == 't2i':\n return cost_im\n else:\n return cost_s + cost_im", + "type": "code", + "location": "/applications/T2VLAD/model/loss.py:86-102" + }, + "2369": { + "file_id": 185, + "content": "This code calculates the video-level loss in a T2VLAD model. It first sets the positions of the correct matches to 0, then applies various scaling operations based on parameters. Finally, it sums the resulting cost and returns the appropriate value depending on the direction (i2t or t2i).", + "type": "comment" + }, + "2370": { + "file_id": 186, + "content": "/applications/T2VLAD/model/metric.py", + "type": "filepath" + }, + "2371": { + "file_id": 186, + "content": "This code calculates retrieval metrics, offers sorting and visualization options, handles tie-breaking efficiently, and computes ranking metrics for input data using NumPy, SciPy, and Matplotlib.", + "type": "summary" + }, + "2372": { + "file_id": 186, + "content": "# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport numbers\nimport scipy.stats\nimport numpy as np\nfrom pathlib import Path\nfrom sklearn.metrics import average_precision_score\ndef t2v_metrics(sims, query_masks=None):\n \"\"\"Compute retrieval metrics from a similiarity matrix.\n Args:\n sims (th.Tensor): N x M matrix of similarities between embeddings, where\n x_{i,j} = \n query_masks (th.Tensor): mask any missing queries from the dataset (two videos", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:1-30" + }, + "2373": { + "file_id": 186, + "content": "This code is computing retrieval metrics from a similarity matrix. It takes two tensors as inputs, sims and query_masks. The sims tensor contains NxM matrix of similarities between embeddings, where x_{i,j} = . The query_masks tensor is optional and is used to mask any missing queries from the dataset. 
It then calculates various retrieval metrics such as average precision score, mean average precision, and other related statistics.", + "type": "comment" + }, + "2374": { + "file_id": 186, + "content": " in MSRVTT only have 19, rather than 20 captions)\n Returns:\n (dict[str:float]): retrieval metrics\n \"\"\"\n assert sims.ndim == 2, \"expected a matrix\"\n num_queries, num_vids = sims.shape\n dists = -sims\n sorted_dists = np.sort(dists, axis=1)\n if False:\n import sys\n import matplotlib\n from pathlib import Path\n matplotlib.use(\"Agg\")\n import matplotlib.pyplot as plt\n sys.path.insert(0, str(Path.home() / \"coding/src/zsvision/python\"))\n from zsvision.zs_iterm import zs_dispFig # NOQA\n plt.matshow(dists)\n zs_dispFig()\n import ipdb; ipdb.set_trace()\n # The indices are computed such that they slice out the ground truth distances\n # from the psuedo-rectangular dist matrix\n queries_per_video = num_queries // num_vids\n gt_idx = [[np.ravel_multi_index([ii, jj], (num_queries, num_vids))\n for ii in range(jj * queries_per_video, (jj + 1) * queries_per_video)]\n for jj in range(num_vids)]", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:31-58" + }, + "2375": { + "file_id": 186, + "content": "This function calculates retrieval metrics for a given similarity matrix, and it ensures the matrix has two dimensions. It sorts the distances in the matrix and provides an option to visualize it using matplotlib. The code also computes the ground truth indices for each video, given the number of queries and videos.", + "type": "comment" + }, + "2376": { + "file_id": 186, + "content": " gt_idx = np.array(gt_idx)\n gt_dists = dists.reshape(-1)[gt_idx.reshape(-1)]\n gt_dists = gt_dists[:, np.newaxis]\n rows, cols = np.where((sorted_dists - gt_dists) == 0) # find column position of GT\n # --------------------------------\n # NOTE: Breaking ties\n # --------------------------------\n # We sometimes need to break ties (in general, these should occur extremely rarely,\n # but there are pathological cases when they can distort the scores, such as when\n # the similarity matrix is all zeros). Previous implementations (e.g. the t2i\n # evaluation function used\n # here: https://github.com/niluthpol/multimodal_vtt/blob/master/evaluation.py and\n # here: https://github.com/linxd5/VSE_Pytorch/blob/master/evaluation.py#L87) generally\n # break ties \"optimistically\". However, if the similarity matrix is constant this\n # can evaluate to a perfect ranking. A principled option is to average over all\n # possible partial orderings implied by the ties. See # this paper for a discussion:", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:59-75" + }, + "2377": { + "file_id": 186, + "content": "This section is handling tie-breaking in the similarity matrix, ensuring that it evaluates correctly even when there are ties. It averages over all possible partial orderings implied by the ties for a principled approach. This should occur extremely rarely but can distort scores if not handled properly.", + "type": "comment" + }, + "2378": { + "file_id": 186, + "content": " # McSherry, Frank, and Marc Najork,\n # \"Computing information retrieval performance measures efficiently in the presence\n # of tied scores.\" European conference on information retrieval. 
Springer, Berlin, \n # Heidelberg, 2008.\n # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.145.8892&rep=rep1&type=pdf\n # break_ties = \"optimistically\"\n break_ties = \"averaging\"\n if rows.size > num_queries:\n assert np.unique(rows).size == num_queries, \"issue in metric evaluation\"\n if break_ties == \"optimistically\":\n _, idx = np.unique(rows, return_index=True)\n cols = cols[idx]\n elif break_ties == \"averaging\":\n # fast implementation, based on this code:\n # https://stackoverflow.com/a/49239335\n locs = np.argwhere((sorted_dists - gt_dists) == 0)\n # Find the split indices\n steps = np.diff(locs[:, 0])\n splits = np.nonzero(steps)[0] + 1\n splits = np.insert(splits, 0, 0)", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:76-98" + }, + "2379": { + "file_id": 186, + "content": "This code is computing information retrieval performance measures efficiently in the presence of tied scores, following McSherry et al. (2008). It handles ties optimistically or by averaging, and checks if the number of unique rows matches the number of queries.", + "type": "comment" + }, + "2380": { + "file_id": 186, + "content": " # Compute the result columns\n summed_cols = np.add.reduceat(locs[:, 1], splits)\n counts = np.diff(np.append(splits, locs.shape[0]))\n avg_cols = summed_cols / counts\n if False:\n print(\"Running slower code to verify rank averaging across ties\")\n # slow, but more interpretable version, used for testing\n avg_cols_slow = [np.mean(cols[rows == idx]) for idx in range(num_queries)]\n assert np.array_equal(avg_cols, avg_cols_slow), \"slow vs fast difference\"\n print(\"passed num check\")\n cols = avg_cols\n msg = \"expected ranks to match queries ({} vs {}) \"\n if cols.size != num_queries:\n import ipdb; ipdb.set_trace()\n assert cols.size == num_queries, msg\n if False:\n # overload mask to check that we can recover the scores for single-query\n # retrieval\n print(\"DEBUGGING MODE\")\n query_masks = np.zeros_like(query_masks)\n query_masks[:, 0] = 1 # recover single query score", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:100-122" + }, + "2381": { + "file_id": 186, + "content": "This code calculates the average rank of each query by dividing the summed ranks by their respective counts. It also provides a slower, more interpretable version for testing and asserts that the size of the calculated results matches the expected number of queries. 
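The averaging strategy can be illustrated with a small self-contained sketch (function and variable names are illustrative):

import numpy as np

def averaged_rank(row_dists, gt_index):
    # Rank of the ground-truth item for one query, averaging over tied distances.
    sorted_dists = np.sort(row_dists)
    tied_positions = np.where(sorted_dists == row_dists[gt_index])[0]
    # "Optimistic" tie-breaking would return tied_positions[0]; averaging uses the mean.
    return tied_positions.mean()

# A constant distance row no longer evaluates to a perfect rank of 0.
print(averaged_rank(np.zeros(5), gt_index=2))  # 2.0

The np.add.reduceat call in the snippet above performs the same per-query averaging for all queries at once.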
The code includes optional debugging features to verify rank averaging across ties and recover single-query scores.", + "type": "comment" + }, + "2382": { + "file_id": 186, + "content": " if query_masks is not None:\n # remove invalid queries\n assert query_masks.size == num_queries, \"invalid query mask shape\"\n cols = cols[query_masks.reshape(-1).astype(np.bool)]\n assert cols.size == query_masks.sum(), \"masking was not applied correctly\"\n # update number of queries to account for those that were missing\n num_queries = query_masks.sum()\n if False:\n # sanity check against old logic for square matrices\n gt_dists_old = np.diag(dists)\n gt_dists_old = gt_dists_old[:, np.newaxis]\n _, cols_old = np.where((sorted_dists - gt_dists_old) == 0)\n assert np.array_equal(cols_old, cols), \"new metric doesn't match\"\n return cols2metrics(cols, num_queries)\ndef v2t_metrics(sims, query_masks=None):\n \"\"\"Compute retrieval metrics from a similiarity matrix.\n Args:\n sims (th.Tensor): N x M matrix of similarities between embeddings, where\n x_{i,j} = \n query_masks (th.Tensor): mask any missing captions from the dataset", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:124-148" + }, + "2383": { + "file_id": 186, + "content": "This function computes retrieval metrics from a similarity matrix and handles invalid queries by checking if query_masks are not None, removing invalid queries, updating the number of queries, and returning the results. It also includes a sanity check against old logic for square matrices.", + "type": "comment" + }, + "2384": { + "file_id": 186, + "content": " Returns:\n (dict[str:float]): retrieval metrics\n NOTES: We find the closest \"GT caption\" in the style of VSE, which corresponds\n to finding the rank of the closest relevant caption in embedding space:\n github.com/ryankiros/visual-semantic-embedding/blob/master/evaluation.py#L52-L56\n \"\"\"\n # switch axes of text and video\n sims = sims.T\n if False:\n # experiment with toy example\n sims = np.ones((3, 3))\n sims[0, 0] = 2\n sims[1, 1:2] = 2\n sims[2, :] = 2\n query_masks = None\n assert sims.ndim == 2, \"expected a matrix\"\n num_queries, num_caps = sims.shape\n dists = -sims\n caps_per_video = num_caps // num_queries\n break_ties = \"averaging\"\n MISSING_VAL = 1E8\n query_ranks = []\n for ii in range(num_queries):\n row_dists = dists[ii, :]\n if query_masks is not None:\n # Set missing queries to have a distance of infinity. A missing query\n # refers to a query position `n` for a video that had less than `n`", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:150-180" + }, + "2385": { + "file_id": 186, + "content": "This code calculates retrieval metrics for finding the closest \"GT caption\" in embedding space. It first switches axes of text and video, then applies various operations to compute distances between queries and captions. The code handles missing values by setting them to have a distance of infinity. The result is a dictionary of retrieval metrics.", + "type": "comment" + }, + "2386": { + "file_id": 186, + "content": " # captions (for example, a few MSRVTT videos only have 19 queries)\n row_dists[np.logical_not(query_masks.reshape(-1))] = MISSING_VAL\n # NOTE: Using distance subtraction to perform the ranking is easier to make\n # deterministic than using argsort, which suffers from the issue of defining\n # \"stability\" for equal distances. 
Example of distance subtraction code:\n # github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/train.py\n sorted_dists = np.sort(row_dists)\n min_rank = np.inf\n for jj in range(ii * caps_per_video, (ii + 1) * caps_per_video):\n if row_dists[jj] == MISSING_VAL:\n # skip rankings of missing captions\n continue\n ranks = np.where((sorted_dists - row_dists[jj]) == 0)[0]\n if break_ties == \"optimistically\":\n rank = ranks[0]\n elif break_ties == \"averaging\":\n # NOTE: If there is more than one caption per video, its possible for the", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:181-199" + }, + "2387": { + "file_id": 186, + "content": "The code performs ranking of captions based on distances and handles missing values. It uses distance subtraction instead of argsort for better deterministic results. The code skips rankings of missing captions, and when ties occur, it provides options to break them optimistically or by averaging.", + "type": "comment" + }, + "2388": { + "file_id": 186, + "content": " # method to do \"worse than chance\" in the degenerate case when all\n # similarities are tied. TODO(Samuel): Address this case.\n rank = ranks.mean()\n if rank < min_rank:\n min_rank = rank\n query_ranks.append(min_rank)\n query_ranks = np.array(query_ranks)\n # sanity check against old version of code\n if False:\n sorted_dists = np.sort(dists, axis=1)\n gt_dists_old = np.diag(dists)\n gt_dists_old = gt_dists_old[:, np.newaxis]\n rows_old, cols_old = np.where((sorted_dists - gt_dists_old) == 0)\n if rows_old.size > num_queries:\n _, idx = np.unique(rows_old, return_index=True)\n cols_old = cols_old[idx]\n num_diffs = (1 - (cols_old == query_ranks)).sum()\n msg = f\"new metric doesn't match in {num_diffs} places\"\n assert np.array_equal(cols_old, query_ranks), msg\n # visualise the distance matrix\n import sys\n import matplotlib\n matplotlib.use(\"Agg\")", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:200-224" + }, + "2389": { + "file_id": 186, + "content": "This code snippet calculates the average rank of similarities in a matrix and checks if it's lower than the minimum rank. It also includes a sanity check against an older version of the code by comparing the calculated ranks with the diagonal elements of the distance matrix and asserts that they are equal using NumPy's array_equal function. 
If the assertion fails, it prints a message with the number of differences and uses matplotlib to visualize the distance matrix for debugging purposes.", + "type": "comment" + }, + "2390": { + "file_id": 186, + "content": " import matplotlib.pyplot as plt\n sys.path.insert(0, str(Path.home() / \"coding/src/zsvision/python\"))\n from zsvision.zs_iterm import zs_dispFig # NOQA\n plt.matshow(dists)\n zs_dispFig()\n return cols2metrics(query_ranks, num_queries)\ndef cols2metrics(cols, num_queries):\n metrics = {}\n metrics[\"R1\"] = 100 * float(np.sum(cols == 0)) / num_queries\n metrics[\"R5\"] = 100 * float(np.sum(cols < 5)) / num_queries\n metrics[\"R10\"] = 100 * float(np.sum(cols < 10)) / num_queries\n metrics[\"R50\"] = 100 * float(np.sum(cols < 50)) / num_queries\n metrics[\"MedR\"] = np.median(cols) + 1\n metrics[\"MeanR\"] = np.mean(cols) + 1\n stats = [metrics[x] for x in (\"R1\", \"R5\", \"R10\")]\n metrics[\"geometric_mean_R1-R5-R10\"] = scipy.stats.mstats.gmean(stats)\n return metrics", + "type": "code", + "location": "/applications/T2VLAD/model/metric.py:225-243" + }, + "2391": { + "file_id": 186, + "content": "This code is using matplotlib to display a matrix of distances and then calculates various ranking metrics such as R1, R5, R10, MedR and MeanR for the input data. The function cols2metrics takes in two parameters: 'cols', which represents the input data, and 'num_queries', representing the total number of queries. It computes these ranking metrics using numpy and scipy libraries. Finally, it returns a dictionary containing all calculated metrics.", + "type": "comment" + }, + "2392": { + "file_id": 187, + "content": "/applications/T2VLAD/model/model.py", + "type": "filepath" + }, + "2393": { + "file_id": 187, + "content": "The code combines T2VLAD and BERT in a CENet model for video analysis, initializes MOE with Transformer layers, extracts visual features, and uses VLAD for cross-view localization. 
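To make the cols2metrics computation shown above concrete, here is a worked example on hypothetical zero-based ranks (values chosen purely for illustration):

import numpy as np

ranks = np.array([0, 3, 12, 1, 70])            # zero-based rank of the correct item per query
num_queries = len(ranks)

R1 = 100 * np.sum(ranks == 0) / num_queries    # 20.0
R5 = 100 * np.sum(ranks < 5) / num_queries     # 60.0
R10 = 100 * np.sum(ranks < 10) / num_queries   # 60.0
MedR = np.median(ranks) + 1                    # 4.0  (ranks are reported one-based)
MeanR = np.mean(ranks) + 1                     # 18.2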
The function calculates video-text similarity scores, includes batch normalization, global pooling, and availability masking, reshapes weights, normalizes embeddings, computes text-video similarity with weighting, checks for NaN values, and raises ValueError if found.", + "type": "summary" + }, + "2394": { + "file_id": 187, + "content": "# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport time\nimport itertools\nimport paddle\nimport numpy as np\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import Tensor\nfrom typing import Optional\nfrom collections import OrderedDict\nfrom base import BaseModel\nfrom model.net_vlad import NetVLAD\ntry:\n from paddlenlp.transformers import BertModel\nexcept ImportError as e:\n print(\n f\"{e}, [paddlenlp] package and it's dependencies is required for T2VLAD.\"", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:1-34" + }, + "2395": { + "file_id": 187, + "content": "This code snippet is importing necessary libraries and models for the T2VLAD model. It includes copyright and license information, as well as imports from base, net_vlad, paddlenlp, and various other modules. The code aims to create a T2VLAD model using PaddlePaddle framework with potential dependencies on BertModel and paddlenlp packages.", + "type": "comment" + }, + "2396": { + "file_id": 187, + "content": " )\nclass Mish(nn.Layer):\n '''\n Applies the mish function element-wise:\n mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))\n SRC: https://github.com/digantamisra98/Mish/blob/master/Mish/Torch/mish.py\n '''\n def forward(self, input):\n '''\n Forward pass of the function.\n '''\n return input * paddle.tanh(F.softplus(input))\ndef kronecker_prod(t1, t2):\n # kronecker is performed along the last dim\n kron = paddle.bmm(t1.reshape([-1, t1.size(-1)], 1),\n t2.reshape([-1, 1, t2.size(-1)]))\n return kron.reshape[(t1.shape[0], t1.shape[1], -1)]\ndef drop_nans(x, ind, validate_missing):\n \"\"\"Remove nans, which we expect to find at missing indices.\n Args:\n x (paddle.Tensor): features\n ind (paddle.Tensor): binary values denoting whether or not a given feature is present\n validate_missing (bool): whether to validate that the missing location contains a nan.\n Returns:\n (paddle.tensor): the features, with the missing values masked to zero.", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:35-66" + }, + "2397": { + "file_id": 187, + "content": "The code defines three functions: 'Mish', 'kronecker_prod', and 'drop_nans'. The 'Mish' function implements the mish activation function, which applies the mish formula element-wise. The 'kronecker_prod' function performs a Kronecker product of two tensors along the last dimension. 
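A quick numeric check of the mish formula quoted above, independent of the Paddle layer:

import numpy as np

def mish(x):
    # mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
    return x * np.tanh(np.log1p(np.exp(x)))

print(mish(np.array([-1.0, 0.0, 1.0])))  # approximately [-0.3034, 0.0, 0.8651]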
Finally, the 'drop_nans' function removes NaN values from input features, considering any missing indices as containing NaN.", + "type": "comment" + }, + "2398": { + "file_id": 187, + "content": " \"\"\"\n missing = paddle.nonzero(ind == 0).flatten()\n if missing.numel():\n if validate_missing:\n vals = x[missing[0]]\n assert paddle.isnan(vals.reshape(\n [-1])[0]), \"expected nans at missing locations\"\n #Prevent overwrite of the original tensor\n x_ = x\n x_[missing] = 0\n x = x_\n if paddle.isnan(x).sum() > 0:\n raise ValueError(\"Still find nans after removing it!\")\n return x\nclass CENet(BaseModel):\n def __init__(self, text_dim, expert_dims, vlad_clusters, ghost_clusters,\n feat_aggregation, ce_shared_dim, use_mish, mimic_ce_dims):\n super().__init__()\n self.expert_dims = expert_dims\n self.feat_aggregation = feat_aggregation\n vlad_feat_sizes = {key: val for key, val in vlad_clusters.items()}\n if vlad_clusters[\"text\"] == 0:\n self.text_pooling = nn.Sequential()\n else:\n self.text_pooling = NetVLAD(\n feature_size=text_dim,\n cluster_size=vlad_clusters[\"text\"],", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:67-98" + }, + "2399": { + "file_id": 187, + "content": "The code defines a CENet model and checks for any NaN values in the input tensor 'x'. It sets missing locations to 0 and raises a ValueError if there are still NaN values after removing them. The model consists of expert_dims, vlad_clusters, feat_aggregation, ce_shared_dim, use_mish, and mimic_ce_dims. The text_pooling layer is implemented as NetVLAD for feature extraction if the vlad_clusters[\"text\"] is non-zero.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/24.json b/docs/data/24.json new file mode 100644 index 000000000..75ecc74d0 --- /dev/null +++ b/docs/data/24.json @@ -0,0 +1,547 @@ +{ + "2400": { + "file_id": 187, + "content": " ghost_clusters=ghost_clusters[\"text\"],\n )\n self.text_bert = BertModel.from_pretrained('bert-base-uncased')\n text_dim = self.text_pooling.out_dim\n self.ce = CEModule(\n text_dim=text_dim,\n expert_dims=expert_dims,\n vlad_feat_sizes=vlad_feat_sizes,\n mimic_ce_dims=mimic_ce_dims,\n use_mish=use_mish,\n same_dim=ce_shared_dim,\n )\n def forward(self,\n experts,\n ind,\n cap_id=None,\n att_mask=None,\n text=None,\n raw_captions=None,\n text_token_mask=None):\n aggregated_experts = OrderedDict()\n # Handle all nan-checks\n for mod in self.expert_dims:\n experts[mod] = drop_nans(x=experts[mod],\n ind=ind[mod],\n validate_missing=True)\n aggregated_experts[mod] = experts[mod]\n start = time.time()", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:99-130" + }, + "2401": { + "file_id": 187, + "content": "The code initializes a model with specified expert dimensions, and handles nan-checks for the experts. 
It also creates a time estimation start point.", + "type": "comment" + }, + "2402": { + "file_id": 187, + "content": " # When pooling multiple captions for a single video, we treat them as separate\n # members of the minibatch, so the total pooling op does the following:\n # pooling: B x captions_per_video x max_sentence_length x text_feat_dim\n # -> B x captions_per_video (cluster_dim * text_feat_dim)\n B, captions_per_video, max_words, text_feat_dim = text.shape\n text = text.reshape([B * captions_per_video, max_words, text_feat_dim])\n if isinstance(self.text_pooling, NetVLAD):\n kwargs = {\"mask\": text_token_mask}\n else:\n kwargs = {}\n cap_id = cap_id.reshape([B * captions_per_video, -1])\n att_mask = att_mask.reshape([B * captions_per_video, -1])\n att_mask = att_mask.unsqueeze(axis=[1, 2])\n bert_out = self.text_bert(cap_id,\n token_type_ids=None,\n attention_mask=att_mask)\n text = bert_out[0]\n text, _, save_ass = self.text_pooling(text, **kwargs)", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:131-148" + }, + "2403": { + "file_id": 187, + "content": "This code is reshaping the input text tensor to account for multiple captions per video, applying a pooling operation specific to the chosen text_pooling method (NetVLAD in this case), and then passing the text through a BERT model before performing pooling again. The resulting output is shaped according to the required format for further processing.", + "type": "comment" + }, + "2404": { + "file_id": 187, + "content": " text = text.reshape([B, captions_per_video, -1])\n return self.ce(text, aggregated_experts, ind, raw_captions,\n self.text_pooling, start)\ndef _get_clones(module, N):\n return nn.LayerList([copy.deepcopy(module) for i in range(N)])\nclass TransformerLayer(nn.Layer):\n def __init__(self,\n d_model,\n nhead,\n dim_feedforward=2048,\n dropout=0.1,\n activation=\"relu\",\n normalize_before=True):\n super().__init__()\n self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout)\n # Implementation of Feedforward model\n self.linear1 = nn.Linear(d_model, dim_feedforward)\n self.dropout = nn.Dropout(dropout)\n self.linear2 = nn.Linear(dim_feedforward, d_model)\n self.norm1 = nn.LayerNorm(d_model)\n self.norm2 = nn.LayerNorm(d_model)\n self.dropout1 = nn.Dropout(dropout)\n self.dropout2 = nn.Dropout(dropout)\n self.activation = F.relu", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:149-179" + }, + "2405": { + "file_id": 187, + "content": "The given code contains a function that performs multi-head attention, feedforward model implementation, and LayerNorm normalization in Transformer layers. The `nn.MultiHeadAttention` applies the self-attention mechanism, while `nn.Linear` layers are used for linear transformations. 
Dropout and ReLU activations are also applied to prevent overfitting and introduce nonlinearity respectively.", + "type": "comment" + }, + "2406": { + "file_id": 187, + "content": " self.normalize_before = normalize_before\n def with_pos_embed(self, tensor, pos: Optional[Tensor]):\n return tensor if pos is None else tensor + pos\n def forward_post(self,\n src,\n src_mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n q = k = self.with_pos_embed(src, pos)\n q = q.transpose([1, 0, 2])\n k = k.transpose([1, 0, 2])\n src = src.transpose([1, 0, 2])\n src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)\n src2 = src2.transpose([1, 0, 2])\n src = src + self.dropout1(src2)\n src = self.norm1(src)\n src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))\n src = src + self.dropout2(src2)\n src = self.norm2(src)\n return src\n def forward_pre(self,\n src,\n src_mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n src2 = self.norm1(src)\n q = k = self.with_pos_embed(src2, pos)", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:180-207" + }, + "2407": { + "file_id": 187, + "content": "This code defines a class with three forward functions: `forward_post`, `forward_pre`, and an undefined `forward`. The `forward_post` function applies self-attention to the input source tensor, while the `forward_pre` function normalizes the input source tensor before applying self-attention. Both functions take an optional mask and position embedding for the input tensor. The code also includes a class attribute `normalize_before` that determines whether to normalize the input tensor or not.", + "type": "comment" + }, + "2408": { + "file_id": 187, + "content": " q = q.transpose([1, 0, 2])\n k = k.transpose([1, 0, 2])\n src2 = src2.transpose([1, 0, 2])\n src2 = self.self_attn(q, key=k, value=src2, attn_mask=src_mask)\n src2 = src2.transpose([1, 0, 2])\n src = src + self.dropout1(src2)\n src2 = self.norm2(src)\n src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))\n src = src + self.dropout2(src2)\n return src\n def forward(self,\n src,\n src_mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n if self.normalize_before:\n return self.forward_pre(src, src_mask, pos)\n return self.forward_post(src, src_mask, pos)\nclass Transformer(nn.Layer):\n def __init__(self, encoder_layer, num_layers, norm=None):\n super().__init__()\n self.layers = _get_clones(encoder_layer, num_layers)\n self.num_layers = num_layers\n self.norm = norm\n self._reset_parameters()\n def _reset_parameters(self):\n for p in self.parameters(): # may have a problem", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:208-237" + }, + "2409": { + "file_id": 187, + "content": "The code defines a Transformer class that performs multi-head self-attention and feedforward operations. The class takes an encoder_layer as input and num_layers as parameters, allowing for multiple layers of transformation. The Transformer class has a forward function that can perform the transformations before or after normalization depending on the value of normalize_before flag. 
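The difference between the two paths can be sketched for a single attention sub-layer; this is schematic only (dimensions are illustrative, dropout and the feed-forward sub-layer are omitted):

import paddle
import paddle.nn as nn

attn = nn.MultiHeadAttention(embed_dim=768, num_heads=4)
norm = nn.LayerNorm(768)
x = paddle.randn([8, 20, 768])                 # batch x tokens x dim

post = norm(x + attn(x, x, x))                 # post-norm: normalise after the residual add
pre = x + attn(norm(x), norm(x), norm(x))      # pre-norm: normalise inside the residual branch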
The _reset_parameters function is used to reset the parameters of the class.", + "type": "comment" + }, + "2410": { + "file_id": 187, + "content": " if p.dim() > 1:\n nn.initializer.XavierUniform(p)\n def forward(self,\n src,\n mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n output = src\n for layer in self.layers:\n output = layer(output)\n if self.norm is not None:\n output = self.norm(output)\n return output\nclass CEModule(nn.Layer):\n def __init__(self, expert_dims, text_dim, use_mish, mimic_ce_dims,\n vlad_feat_sizes, same_dim):\n super().__init__()\n modalities = list(expert_dims.keys())\n self.expert_dims = expert_dims\n self.modalities = modalities\n self.mimic_ce_dims = mimic_ce_dims\n self.same_dim = same_dim\n self.use_mish = use_mish\n self.vlad_feat_sizes = vlad_feat_sizes\n self.reduce_dim = 64\n self.moe_cg = ContextGating\n self.vis_transformer = True\n if self.use_mish:\n self.non_lin = Mish()\n else:\n self.non_lin = nn.ReLU()", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:238-275" + }, + "2411": { + "file_id": 187, + "content": "This code defines a CEModule class with expert_dims, modalities, mimic_ce_dims, vlad_feat_sizes, and same_dim parameters. It uses the Mish function for non-linear activation if use_mish is True, otherwise using ReLU. It also includes a ContextGating object and a VisTransformer boolean.", + "type": "comment" + }, + "2412": { + "file_id": 187, + "content": " num_mods = len(expert_dims)\n self.moe_fc = nn.Linear(text_dim, len(expert_dims))\n self.moe_weights = paddle.ones([1, num_mods]) / num_mods\n # The batch size of the face input can vary (due to missing inputs), so we\n # probably shouldn't use BN on this branch. It's probably fine to leave it\n # n for the corresponding text inputs, (but we should switch to GN)\n use_bns = [True for modality in self.modalities]\n # NOTE: When use_ce is not used, the text features are projected to\n # subspaces of different dimensions. When use_ce is used, they must all\n # be projected to `same_dim` (to allow fusion). The only excpetion is for an\n # ablation in which we mimic the `same_dim` reduction to measure whether this\n # projection influences overall performance.\n self.repeat_temporal = {}\n for mod in modalities:\n self.repeat_temporal[mod] = 1\n in_dims = [\n expert_dims[mod][0] * self.repeat_temporal[mod]", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:277-297" + }, + "2413": { + "file_id": 187, + "content": "This code is initializing a MOE (Multi-Output Expert) model with specified modalities and dimensions, setting up the linear layer, weights, and batch normalization flags. 
It also defines the temporal repeat for each modality and calculates the input dimensions based on expert dimensions and temporal repetitions.", + "type": "comment" + }, + "2414": { + "file_id": 187, + "content": " for mod in modalities\n ]\n agg_dims = [\n expert_dims[mod][1] * self.repeat_temporal[mod]\n for mod in modalities\n ]\n feat_dims = [\n expert_dims[mod][0] // self.vlad_feat_sizes[mod]\n for mod in modalities\n ]\n if self.vis_transformer:\n num_encoder_layers = 1\n d_model = 768\n nhead = 4\n dim_feedforward = 768\n dropout = 0 #dropout=0.1\n normalize_before = True\n encoder_layer = TransformerLayer(d_model, nhead, dim_feedforward,\n dropout)\n encoder_norm = nn.LayerNorm(d_model) if normalize_before else None\n self.transformers = Transformer(encoder_layer, num_encoder_layers,\n encoder_norm)\n if self.mimic_ce_dims:\n dim_reducers = [ReduceDim(in_dim, same_dim) for in_dim in feat_dims]\n self.video_dim_reduce = nn.LayerList(dim_reducers)", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:298-323" + }, + "2415": { + "file_id": 187, + "content": "The code initializes and prepares model components for modalities, including dimensions for expert features and feature sizes. It also creates a transformer if visual transformation is enabled, and sets up feature reducers if cross-entropy loss dims are mimicked.", + "type": "comment" + }, + "2416": { + "file_id": 187, + "content": " gated_vid_embds = [\n GatedEmbeddingUnit(in_dim, same_dim, use_bn=True)\n for in_dim in feat_dims\n ]\n text_out_dims = [same_dim for _ in agg_dims]\n self.video_GU = nn.LayerList(gated_vid_embds)\n gated_text_embds = [\n GatedEmbeddingUnit(text_dim, dim, use_bn=True)\n for dim in text_out_dims\n ]\n self.text_GU = nn.LayerList(gated_text_embds)\n def compute_moe_weights(self, text, ind):\n # compute weights for all captions (including when assigned K captions to\n # the same video)\n B, K, D = text.shape\n M = len(self.modalities)\n msg = f\"expected between 1 and 10 modalities, found {M} ({self.modalities})\"\n assert 1 <= M <= 10, msg\n # Treat each caption independently in the softmax (which runs over modalities)\n text = text.reshape([B * K, D])\n moe_weights = self.moe_fc(text) # BK x D -> BK x M\n moe_weights = F.softmax(moe_weights, axis=1)\n moe_weights = moe_weights.reshape([B, K, M])", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:325-350" + }, + "2417": { + "file_id": 187, + "content": "The code creates GatedEmbeddingUnit instances for both video and text features of different dimensions, initializes LayerLists to store them as video_GU and text_GU. The compute_moe_weights function calculates softmax weights for multiple captions (K) assigned to the same video, with an assertion for 1-10 modalities. 
Text is reshaped before applying MOE fully connected layer, then reshaped back to BxKxM shape.", + "type": "comment" + }, + "2418": { + "file_id": 187, + "content": " return moe_weights\n def forward(self, text, experts, ind, raw_captions, vis_vlad, stime):\n \"\"\"Compute joint embeddings and, if requested, a confusion matrix between\n video and text representations in the minibatch.\n Notation: B = batch size, M = number of modalities\n \"\"\"\n # Pass text embeddings through gated units\n text_embd = {}\n # Unroll repeated captions into present minibatch\n B, captions_per_video, feat_dim = text.shape\n text = text.reshape([B * captions_per_video, feat_dim])\n for modality, layer in zip(self.modalities, self.text_GU):\n # NOTE: Due to the batch norm, the gated units are sensitive to passing\n # in a lot of zeroes, so we do the masking step after the forwards pass\n text_ = layer(text)\n # We always assume that text is available for retrieval\n text_ = text_.reshape([B, captions_per_video, -1])\n text_embd[modality] = text_\n text = text.reshape([B, captions_per_video, -1])", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:351-374" + }, + "2419": { + "file_id": 187, + "content": "This code is implementing a method for passing text embeddings through gated units. It first reshapes the input text, then iterates over the modalities and gated units to compute the text embeddings, which are stored in a dictionary. Finally, it reshapes the result back to its original shape.", + "type": "comment" + }, + "2420": { + "file_id": 187, + "content": " # vladded nans are handled earlier (during pooling)\n # We also avoid zeroing random features, since this will leak information\n # exclude = list(self.vlad_feat_sizes.keys()) + list(self.random_feats)\n # experts = self.mask_missing_embeddings(experts, ind, exclude=exclude)\n # MOE weights computation + normalization - note that we use the first caption\n # sample to predict the weights\n moe_weights = self.compute_moe_weights(text, ind=ind)\n text_local = text.reshape([B * captions_per_video, -1])\n vis_local = {}\n for modality in self.modalities:\n vis_local[modality] = experts[modality]\n all_vis_feat = []\n if hasattr(self, \"video_dim_reduce\"):\n # Embed all features to a common dimension\n for modality, layer in zip(self.modalities, self.video_dim_reduce):\n all_vis_feat.append(layer(vis_local[modality]))\n all_vis_feat = paddle.concat(all_vis_feat, axis=1)\n if self.vis_transformer:", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:376-397" + }, + "2421": { + "file_id": 187, + "content": "This code section is performing MOE weights computation and feature extraction for a Multi-Modal Video Analysis task. It excludes specific features to handle NAN values, then computes the MOE weights using text data and reshapes it accordingly. The visual features are extracted for each modality, then all the visual features are concatenated along the dimension. 
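A toy sketch of that per-caption modality weighting (shapes and values are illustrative, not taken from the configuration):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

B, K, D, M = 2, 3, 16, 4                 # batch, captions per video, text dim, modalities
text = paddle.randn([B, K, D])
moe_fc = nn.Linear(D, M)                 # one logit per modality

weights = F.softmax(moe_fc(text.reshape([B * K, D])), axis=1)
weights = weights.reshape([B, K, M])     # per-caption weights that sum to 1 over the modalities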
Finally, if vis_transformer is present, it is applied on the extracted visual features.", + "type": "comment" + }, + "2422": { + "file_id": 187, + "content": " experts_tensor = all_vis_feat\n experts_tensor = experts_tensor.transpose([1, 0, 2])\n att_out = self.transformers(experts_tensor, mask=None, pos=None)\n all_vis_feat = att_out.transpose([1, 0, 2])\n vis_local, _, save_ass = vis_vlad(all_vis_feat, freeze=True)\n cross_view_conf_matrix_tv = paddle.matmul(text_local, vis_local.t())\n for modality in self.modalities:\n experts[modality] = experts[modality].max(axis=1)\n for modality, layer in zip(self.modalities, self.video_GU):\n experts[modality] = layer(experts[modality])\n cross_view_conf_matrix = sharded_cross_view_inner_product(\n ind=ind,\n vid_embds=experts,\n text_embds=text_embd,\n text_weights=moe_weights,\n subspaces=self.modalities,\n raw_captions=raw_captions,\n )\n cross_view_conf_matrix = 0.5 * cross_view_conf_matrix + 0.5 * cross_view_conf_matrix_tv\n return {\n \"modalities\": self.modalities,", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:398-422" + }, + "2423": { + "file_id": 187, + "content": "This code performs cross-view video localization by calculating the cross-view confidence matrix using VLAD and MOE weights. It also applies transformers, max pooling, and sharded inner products for each modality. The result is a dictionary containing the modalities used in the computation.", + "type": "comment" + }, + "2424": { + "file_id": 187, + "content": " \"cross_view_conf_matrix\": cross_view_conf_matrix,\n }\nclass GatedEmbeddingUnit(nn.Layer):\n def __init__(self, input_dimension, output_dimension, use_bn):\n super(GatedEmbeddingUnit, self).__init__()\n self.fc = nn.Linear(input_dimension, output_dimension)\n self.cg = ContextGating(output_dimension, add_batch_norm=use_bn)\n def forward(self, x):\n x = self.fc(x)\n x = self.cg(x)\n x = F.normalize(x)\n return x\nclass ReduceDim(nn.Layer):\n def __init__(self, input_dimension, output_dimension):\n super(ReduceDim, self).__init__()\n self.fc = nn.Linear(input_dimension, output_dimension)\n def forward(self, x):\n x = self.fc(x)\n x = F.normalize(x, axis=-1)\n return x\nclass ContextGating(nn.Layer):\n def __init__(self, dimension, add_batch_norm=True):\n super(ContextGating, self).__init__()\n self.fc = nn.Linear(dimension, dimension)\n self.add_batch_norm = add_batch_norm\n self.batch_norm = nn.BatchNorm1D(dimension)", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:423-456" + }, + "2425": { + "file_id": 187, + "content": "This code defines several neural network layers: \"GatedEmbeddingUnit\", \"ReduceDim\", and \"ContextGating\". These layers are used for feature extraction, normalization, and context gating in the T2VLAD model. The GatedEmbeddingUnit layer combines a linear transformation and context gating to produce normalized output. The ReduceDim layer reduces the dimension of input features through a linear transformation followed by normalization. 
The ContextGating layer performs a linear transformation and optional batch normalization for context gating.", + "type": "comment" + }, + "2426": { + "file_id": 187, + "content": " def forward(self, x):\n x1 = self.fc(x)\n if self.add_batch_norm:\n x1 = self.batch_norm(x1)\n x = paddle.concat([x, x1], axis=1)\n return F.glu(x, axis=1)\ndef sharded_cross_view_inner_product(vid_embds,\n text_embds,\n text_weights,\n subspaces,\n ind,\n tol=1E-5,\n raw_captions=None):\n \"\"\"Compute a similarity matrix from sharded vectors.\n Args:\n embds1 (dict[str:paddle.Tensor]): the set of sub-embeddings that, when\n concatenated, form the whole. The ith shard has shape `B x K x F_i`\n (i.e. they can differ in the last dimension).\n embds2 (dict[str:paddle.Tensor]): same format.\n weights2 (paddle.Tensor): weights for the shards in `embds2`.\n Returns:\n (paddle.tensor): similarity matrix of size `BK x BK`.\n NOTE: If multiple captions are provided, we can aggregate their similarities to", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:458-485" + }, + "2427": { + "file_id": 187, + "content": "This code defines a function for computing the similarity matrix between two sets of embeddings, which are divided into smaller shards. The function takes these sharded embeddings and weights for each set, and returns a similarity matrix of size BK x BK. The code includes batch normalization and global pooling operations in its forward pass.", + "type": "comment" + }, + "2428": { + "file_id": 187, + "content": " provide a single video-text similarity score.\n \"\"\"\n B = vid_embds[subspaces[0]].shape[0]\n T, num_caps, _ = text_embds[subspaces[0]].shape\n # unroll separate captions onto first dimension and treat them separately\n sims = paddle.zeros([T * num_caps, B])\n text_weights = text_weights.reshape([T * num_caps, -1])\n if True:\n mus = [round(x, 3) for x in text_weights.mean(0).numpy().tolist()]\n stds = [round(x, 3) for x in text_weights.std(0).numpy().tolist()]\n summary = \">>>\"\n for mod, mu, std in zip(subspaces, mus, stds):\n summary += f\"{mod}: {mu} +/- {std} \"\n # mark expert availabilities along the second axis\n available = paddle.ones([1, B, len(subspaces)], dtype=text_weights.dtype)\n for ii, modality in enumerate(subspaces):\n ind[modality] = paddle.to_tensor(ind[modality], dtype='float32')\n available[:, :, ii] = ind[modality]\n msg = \"expected `available` modality mask to only contain 0s or 1s\"\n assert set(paddle.unique(available).cpu().numpy()).issubset(set([0,", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:486-507" + }, + "2429": { + "file_id": 187, + "content": "This code calculates video-text similarity scores and handles the modalities of available experts. It initializes variables for storing similarity scores (sims) and text weights (text_weights). The code also calculates mean and standard deviation for text_weights, and stores these values as mus and stds respectively. 
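In isolation, the gating step in ContextGating.forward above (concatenate the input with its projection, then apply a GLU) amounts to the following sketch (batch norm omitted, sizes illustrative):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

x = paddle.randn([8, 512])
fc = nn.Linear(512, 512)

gates = fc(x)                                              # context projection
gated = F.glu(paddle.concat([x, gates], axis=1), axis=1)   # equals x * sigmoid(gates), shape [8, 512]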
Then it creates an availability mask for each modality, marking them either 0 or 1, with the assertion that the mask should only contain 0s or 1s.", + "type": "comment" + }, + "2430": { + "file_id": 187, + "content": " 1])), msg\n # set the text weights along the first axis and combine with availabilities to\n # produce a tensor\n text_weight_tensor = text_weights.reshape([T * num_caps, 1,\n len(subspaces)]) * available\n # normalise to account for missing experts\n normalising_weights = text_weight_tensor.sum(2).reshape(\n [T * num_caps, B, 1])\n text_weight_tensor = paddle.divide(text_weight_tensor, normalising_weights)\n l2_mass_text, l2_mass_vid = 1, 1\n for idx, modality in enumerate(subspaces):\n vid_embd_ = vid_embds[modality].reshape([B, -1]) / l2_mass_vid\n text_embd_ = text_embds[modality].reshape([T * num_caps, -1])\n msg = \"expected weights to be applied to text embeddings\"\n assert text_embd_.shape[0] == text_weights.shape[0], msg\n text_embd_ = text_embd_ / l2_mass_text\n weighting = text_weight_tensor[:, :, idx]", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:508-526" + }, + "2431": { + "file_id": 187, + "content": "This code reshapes the text_weights and combines them with availabilities to produce a tensor of size T x B x num_experts. It then normalizes these weights by accounting for missing experts. Lastly, it calculates the L2-masses for both video and text embeddings and applies the weights to the corresponding embeddings. The code also includes an assertion message to ensure correct shape matching between text_embd_ and text_weights.", + "type": "comment" + }, + "2432": { + "file_id": 187, + "content": " sims += weighting * paddle.matmul(text_embd_,\n vid_embd_.t()) # (T x num_caps) x (B)\n if paddle.isnan(sims).sum().item():\n raise ValueError(\"Found nans in similarity matrix!\")\n return sims", + "type": "code", + "location": "/applications/T2VLAD/model/model.py:527-533" + }, + "2433": { + "file_id": 187, + "content": "This code calculates the similarity between text and video embeddings, multiplying them together with a weighting factor. 
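The overall combination can be pictured with a small NumPy sketch (hypothetical shapes, one caption per query, weights already normalised over modalities):

import numpy as np

T, B, M, D = 6, 3, 2, 64
text_embds = [np.random.randn(T, D) for _ in range(M)]     # one embedding per modality
vid_embds = [np.random.randn(B, D) for _ in range(M)]
weights = np.random.dirichlet(np.ones(M), size=T)          # per-query weights over modalities

sims = np.zeros((T, B))
for m in range(M):
    sims += weights[:, m:m + 1] * (text_embds[m] @ vid_embds[m].T)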
It then checks for NaN values in the resulting similarity matrix and raises a ValueError if any are found.", + "type": "comment" + }, + "2434": { + "file_id": 188, + "content": "/applications/T2VLAD/model/net_vlad.py", + "type": "filepath" + }, + "2435": { + "file_id": 188, + "content": "The NetVLAD class in PaddleVideo's \"T2VLAD\" model initializes neural network parameters, performs checks and calculations for VLAD representations with batch size x dimension K.", + "type": "summary" + }, + "2436": { + "file_id": 188, + "content": "\"\"\"NetVLAD implementation.\n\"\"\"\n# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport numpy as np\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nclass NetVLAD(nn.Layer):\n def __init__(self, cluster_size, feature_size, ghost_clusters=0,\n add_batch_norm=True):\n super().__init__()\n self.feature_size = feature_size\n self.cluster_size = cluster_size\n self.ghost_clusters = ghost_clusters\n init_sc = (1 / math.sqrt(feature_size))", + "type": "code", + "location": "/applications/T2VLAD/model/net_vlad.py:1-33" + }, + "2437": { + "file_id": 188, + "content": "NetVLAD is a class for implementing the NetVLAD algorithm. It takes parameters such as cluster_size, feature_size, ghost_clusters, and add_batch_norm. The feature_size represents the size of each feature, while the cluster_size represents the number of clusters. Ghost_clusters determines whether to include extra clusters for better performance. Add_batch_norm is a boolean value that decides whether or not to use batch normalization in the network.", + "type": "comment" + }, + "2438": { + "file_id": 188, + "content": " init_sc = paddle.to_tensor(init_sc)\n clusters = cluster_size + ghost_clusters\n # The `clusters` weights are the `(w,b)` in the paper\n self.clusters = paddle.create_parameter([feature_size, clusters], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([feature_size, clusters]) * init_sc))\n self.batch_norm1 = nn.BatchNorm1D(clusters) if add_batch_norm else None\n self.batch_norm2 = nn.BatchNorm1D(clusters) if add_batch_norm else None\n # The `clusters2` weights are the visual words `c_k` in the paper\n self.clusters1 = paddle.create_parameter([1, feature_size, cluster_size], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([1, feature_size, cluster_size]) * init_sc))\n self.clusters2 = paddle.create_parameter([1, feature_size, cluster_size], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([1, feature_size, cluster_size]) * init_sc)) \n self.out_dim = self.cluster_size * feature_size", + "type": "code", + "location": "/applications/T2VLAD/model/net_vlad.py:34-44" + }, + "2439": { + "file_id": 188, + "content": "This code initializes the neural network parameters for a VLAD model. 
It creates two sets of cluster weights, and assigns random values within a certain range to these weights using Paddle's `paddle.randn` function with a specified initialization scale (`init_sc`). Additionally, it creates batch normalization layers (`BatchNorm1D`) for the clusters if `add_batch_norm` is True. The code also defines the output dimension as the product of cluster size and feature size.", + "type": "comment" + }, + "2440": { + "file_id": 188, + "content": " def sanity_checks(self, x):\n \"\"\"Catch any nans in the inputs/clusters\"\"\"\n if paddle.isnan(paddle.sum(x)):\n raise ValueError(\"nan inputs\")\n if paddle.isnan(self.clusters[0][0]): \n raise ValueError(\"nan clusters\")\n def forward(self, x, freeze=False, mask=None):\n \"\"\"Aggregates feature maps into a fixed size representation. In the following\n notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size.\n Args:\n x (th.Tensor): B x N x D\n Returns:\n (th.Tensor): B x DK\n \"\"\"\n self.sanity_checks(x)\n max_sample = x.shape[1] \n x = x.reshape([-1, self.feature_size]) # B x N x D -> BN x D\n if freeze == True:\n clusters = self.clusters.detach()\n clusters2 = self.clusters1\n batch_norm = self.batch_norm1\n else:\n clusters = self.clusters\n clusters2 = self.clusters2\n batch_norm = self.batch_norm2\n assignment = paddle.matmul(x, clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G)", + "type": "code", + "location": "/applications/T2VLAD/model/net_vlad.py:46-76" + }, + "2441": { + "file_id": 188, + "content": "The code snippet is a part of the \"T2VLAD\" model in PaddleVideo. It performs sanity checks to ensure there are no NaN inputs or clusters, and then proceeds with the forward pass. In the forward function, it reshapes input, applies batch normalization, and calculates the assignment between input features and clusters. 
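The aggregation itself reduces to the following NumPy sketch (soft-assignment VLAD for a single sample; batch norm and ghost clusters omitted, names illustrative):

import numpy as np

def softmax(z, axis):
    e = np.exp(z - z.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

N, D, K = 10, 8, 4                          # local descriptors, feature dim, clusters
x = np.random.randn(N, D)
clusters = np.random.randn(D, K)            # soft-assignment weights
centers = np.random.randn(D, K)             # visual words c_k

assign = softmax(x @ clusters, axis=1)                       # N x K soft assignments
vlad = x.T @ assign - centers * assign.sum(axis=0)           # D x K aggregated residuals
vlad /= np.linalg.norm(vlad, axis=0, keepdims=True)          # intra-normalisation per cluster
vlad = vlad.flatten()
vlad /= np.linalg.norm(vlad)                                 # final L2 norm, length D * K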
This is used for aggregating feature maps into a fixed-size representation.", + "type": "comment" + }, + "2442": { + "file_id": 188, + "content": " if batch_norm:\n assignment = batch_norm(assignment)\n assignment = F.softmax(assignment, axis=1) # BN x (K+G) -> BN x (K+G)\n save_ass = assignment.reshape([-1, max_sample, self.cluster_size+1])\n assignment = assignment[:, :self.cluster_size]\n assignment = assignment.reshape([-1, max_sample, self.cluster_size]) # -> B x N x K\n a_sum = paddle.sum(assignment, axis=1, keepdim=True) # B x N x K -> B x 1 x K\n a = a_sum * self.clusters2\n assignment = assignment.transpose([0, 2, 1]) # B x N x K -> B x K x N\n x = x.reshape([-1, max_sample, self.feature_size]) # BN x D -> B x N x D\n vlad = paddle.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D\n vlad = vlad.transpose([0, 2, 1]) # -> B x D x K\n vlad = vlad - a\n # L2 intra norm\n vlad_ = F.normalize(vlad)\n # flattening + L2 norm\n vlad = vlad_.reshape([-1, self.cluster_size * self.feature_size]) # -> B x DK\n vlad = F.normalize(vlad)", + "type": "code", + "location": "/applications/T2VLAD/model/net_vlad.py:77-99" + }, + "2443": { + "file_id": 188, + "content": "In this code snippet, it performs batch normalization on the assignment matrix, applies softmax for normalization, reshapes the assignment matrix multiple times, calculates a sum of clusters and multiplies by cluster centers, performs matrix multiplication to generate a VLAD representation, normalizes the intra-cluster L2 norm, and finally reshapes and applies normalization for the final VLAD representation.", + "type": "comment" + }, + "2444": { + "file_id": 188, + "content": " return vlad, vlad_, save_ass # B x DK", + "type": "code", + "location": "/applications/T2VLAD/model/net_vlad.py:100-100" + }, + "2445": { + "file_id": 188, + "content": "The code is returning the VLAD (Vector of Locally Aggregated Descriptors) feature representations and their respective variables for Batch size x Dimension K.", + "type": "comment" + }, + "2446": { + "file_id": 189, + "content": "/applications/T2VLAD/model/text.py", + "type": "filepath" + }, + "2447": { + "file_id": 189, + "content": "The TextEmbedding interface utilizes Word2Vec for embedding video descriptions and queries. It initializes with model, dimensionality, and optional parameters, providing methods for GPT or Word2Vec extraction while ensuring CPU-only execution. 
The code initializes an OpenAI GPT model, tokenizes input, converts to vocabulary indices, obtains embeddings from hidden states, and returns squeezed dimensions.", + "type": "summary" + }, + "2448": { + "file_id": 189, + "content": "\"\"\"This module defines the TextEmbedding interface for converting video descriptions and\nqueries into embeddings.\n\"\"\"\nimport zipfile\nimport functools\nfrom abc import abstractmethod\nfrom pathlib import Path\nimport numpy as np\nimport paddle\nimport gensim\nimport requests\nimport transformers\nfrom typeguard import typechecked\nfrom zsvision.zs_utils import BlockTimer\nfrom model.s3dg import S3D\nclass TextEmbedding:\n def __init__(self, model, dim: int):\n self.model = model\n self.dim = dim\n #self.device = None\n @abstractmethod\n def text2vec(self, text: str) -> np.ndarray:\n \"\"\"Convert a string of text into an embedding.\n Args:\n text: the content to be embedded\n Returns:\n (d x n) array, where d is the dimensionality of the embedding and `n` is the\n number of words that were successfully parsed from the text string.\n NOTE: For some text embedding models (such as word2vec), not all words are\n converted to vectors (e.g. certain kinds of stop words) - these are dropped from", + "type": "code", + "location": "/applications/T2VLAD/model/text.py:1-37" + }, + "2449": { + "file_id": 189, + "content": "This module defines the TextEmbedding interface for converting video descriptions and queries into embeddings. The class, TextEmbedding, initializes with a model and dimensionality of embedding. It has an abstract method, text2vec, that converts a string of text into an embedding, returning a (d x n) array, where d is the dimensionality of the embedding and `n` is the number of words successfully parsed from the text string. Some text embedding models may drop certain kinds of stop words.", + "type": "comment" + }, + "2450": { + "file_id": 189, + "content": " the output.\n \"\"\"\n raise NotImplementedError\n #@typechecked\n #def set_device(self, device: torch.device):\n # self.model = self.model.to(device)\n # self.device = device\n@functools.lru_cache(maxsize=64, typed=False)\ndef load_w2v_model_from_cache(\n w2v_weights: Path,\n) -> gensim.models.keyedvectors.Word2VecKeyedVectors:\n with BlockTimer(\"Loading w2v from disk\"):\n model = gensim.models.KeyedVectors.load_word2vec_format(\n fname=w2v_weights,\n binary=True,\n )\n return model\n@typechecked\ndef fetch_model(url: str, weights_path: Path):\n weights_path.parent.mkdir(exist_ok=True, parents=True)\n with BlockTimer(f\"Fetching weights {url} -> {weights_path}\"):\n resp = requests.get(url, verify=False)\n with open(weights_path, \"wb\") as f:\n f.write(resp.content)\nclass W2VEmbedding(TextEmbedding):\n \"\"\"This model embeds text using the google-released implementation of the word2vec\n model introduced in:\n Mikolov, T., Sutskever, I., Chen, K., Corrado, G. S., & Dean, J. (2013).", + "type": "code", + "location": "/applications/T2VLAD/model/text.py:38-73" + }, + "2451": { + "file_id": 189, + "content": "This code defines a class for the W2VEmbedding model, which embeds text using the Word2Vec algorithm. It has methods for loading pre-trained Word2Vec models from disk or fetching them online. The set_device method allows specifying the device to use (CPU or GPU). 
The load_w2v_model_from_cache function loads a Word2Vec model from disk, and the fetch_model function downloads it from a given URL.", + "type": "comment" + }, + "2452": { + "file_id": 189, + "content": " Distributed representations of words and phrases and their compositionality.\n In Advances in neural information processing systems (pp. 3111-3119).\n For words that are present in the w2v vocabulary, a 300-dimensional embedding is\n produced via a lookup table.\n \"\"\"\n @typechecked\n def __init__(\n self,\n dim: int,\n mirror: str,\n weights_path: Path,\n fetch_weights: bool = True,\n ):\n if not weights_path.exists():\n if fetch_weights:\n fetch_model(url=mirror, weights_path=weights_path)\n else:\n raise ValueError(f\"w2v weights missing at {weights_path}\")\n model = load_w2v_model_from_cache(weights_path)\n super().__init__(model=model, dim=dim)\n @typechecked\n def text2vec(self, text: str) -> np.ndarray:\n # convert the text string to tokens that can be processed by w2v. We handle\n # 'a' as a special case.\n tokens = [x for x in text.split(\" \") if x != \"a\" and x in self.model.vocab]", + "type": "code", + "location": "/applications/T2VLAD/model/text.py:74-101" + }, + "2453": { + "file_id": 189, + "content": "The code initializes a class with dimensions, mirror, and weights_path parameters. If the weights path doesn't exist, it fetches them or raises an error. It then loads the word2vec model from the cache and initializes the superclass. The text2vec method converts input text to tokens processed by w2v, excluding 'a' and tokens not in vocab.", + "type": "comment" + }, + "2454": { + "file_id": 189, + "content": " embeddings = []\n for token in tokens:\n embeddings.append(self.model.get_vector(token))\n embeddings = np.array(embeddings)\n # For empty sequences, we use zeros with the dimensionality of the features on\n # the second dimension (this is the format expected by the CE codebase)\n if embeddings.size == 0:\n embeddings = np.zeros((0, self.dim))\n return embeddings\n #@typechecked\n #def set_device(self, device: torch.device):\n # msg = f\"w2v only supports CPU-based execution found {device.type}\"\n # assert device.type == \"cpu\", msg\nclass OpenAI_GPT(TextEmbedding):\n \"\"\"This model produces 768-embeddings using a pretrained GPT model, introduced\n in the paper:\n Radford, A., Narasimhan, K., Salimans, T., & Sutskever, I. (2018).\n Improving language understanding by generative pre-training,\n https://cdn.openai.com/research-covers/language-unsupervised/language_understanding\n _paper.pdf\n \"\"\"\n def __init__(self):\n self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained(\"openai-gpt\")", + "type": "code", + "location": "/applications/T2VLAD/model/text.py:103-130" + }, + "2455": { + "file_id": 189, + "content": "The code defines a class \"TextEmbedding\" that provides methods to extract embeddings from text tokens using either the GPT model or Word2Vec. The \"get_vector\" method returns embeddings in the expected format for the CE codebase, and it handles empty sequences by returning zeros with the correct dimensionality. 
The class also includes a \"set_device\" method that asserts the device type is CPU-only, as GPT model only supports CPU execution.", + "type": "comment" + }, + "2456": { + "file_id": 189, + "content": " model = transformers.OpenAIGPTModel.from_pretrained(\"openai-gpt\")\n model.eval()\n super().__init__(model=model)\n @typechecked\n def text2vec(self, text: str) -> np.ndarray:\n tokenized_text = self.tokenizer.tokenize(text)\n # Convert token to vocabulary indices\n indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)\n tokens_tensor = paddle.to_tensor(indexed_tokens, dtype='int64') #tokens_tensor = torch.LongTensor([indexed_tokens]).to(self.model.device)\n with paddle.no_grad():\n hidden_states = self.model(tokens_tensor)\n embeddings = hidden_states[0].numpy()\n return embeddings.squeeze(0)", + "type": "code", + "location": "/applications/T2VLAD/model/text.py:131-146" + }, + "2457": { + "file_id": 189, + "content": "This code initializes an OpenAI GPT model, tokenizes text input, converts tokens to vocabulary indices, and obtains embeddings from the model's hidden states. The embeddings are then returned after squeezing dimensions.", + "type": "comment" + }, + "2458": { + "file_id": 190, + "content": "/applications/T2VLAD/parse_config.py", + "type": "filepath" + }, + "2459": { + "file_id": 190, + "content": "The \"ConfigParser\" initializes argument parsers, handles slave mode, and sets directories. It manages config parsing, experiment settings, and data loaders with logging capabilities. The code includes two functions for accessing and modifying values in a nested object using a sequence of keys.", + "type": "summary" + }, + "2460": { + "file_id": 190, + "content": "# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport time\nimport paddle\nimport pprint\nimport logging\nfrom typing import Dict\nfrom pathlib import Path\nfrom datetime import datetime\nfrom operator import getitem\nfrom functools import reduce\nfrom mergedeep import Strategy, merge\nfrom zsvision.zs_utils import set_nested_key_val\nfrom typeguard import typechecked\nfrom utils import read_json, write_json\nfrom logger import setup_logging\nclass ConfigParser:\n def __init__(self, args, options='', timestamp=True, slave_mode=False):", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:1-35" + }, + "2461": { + "file_id": 190, + "content": "This code snippet is the beginning of a Python class, \"ConfigParser,\" which appears to be part of a larger project. The class takes in an \"args\" parameter (possibly command line arguments) and two optional parameters: \"options\" and \"timestamp\". It also has a boolean parameter named \"slave_mode\". 
The code imports various modules and defines the class but no specific functionality is provided in this section.", + "type": "comment" + }, + "2462": { + "file_id": 190, + "content": " # slave_mode - when calling the config parser form an existing process, we\n # avoid reinitialising the logger and ignore sys.argv when argparsing.\n # parse default and custom cli options\n for opt in options:\n args.add_argument(*opt.flags, default=None, type=opt.type)\n if slave_mode:\n args = args.parse_args(args=[])\n else:\n args = args.parse_args()\n if args.resume and not slave_mode:\n self.resume = Path(args.resume)\n else:\n msg_no_cfg = \"Config file must be specified\"\n assert args.config is not None, msg_no_cfg\n self.resume = None\n self.cfg_fname = Path(args.config)\n config = self.load_config(self.cfg_fname)\n self._config = _update_config(config, options, args)\n if self._config.get(\"eval_config\", False):\n # validate path to evaluation file\n eval_cfg_path = self._config.get(\"eval_config\")\n msg = f\"eval_config was specified, but `{eval_cfg_path}` does not exist\"", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:36-62" + }, + "2463": { + "file_id": 190, + "content": "The code initializes the argument parser, adds options to parse default and custom CLI options, handles slave mode (avoiding reinitializing logger), parses arguments, checks for a config file, and loads the configuration. If an evaluation config is specified, it validates the path to the evaluation file.", + "type": "comment" + }, + "2464": { + "file_id": 190, + "content": " assert Path(self._config.get(\"eval_config\")).exists(), msg\n # set save_dir where trained model and log will be saved.\n if \"tester\" in self.config:\n save_dir = Path(self.config['tester']['save_dir'])\n else:\n save_dir = Path(self.config['trainer']['save_dir'])\n timestamp = datetime.now().strftime(r\"%Y-%m-%d_%H-%M-%S\") if timestamp else \"\"\n if slave_mode:\n timestamp = f\"{timestamp}-eval-worker\"\n exper_name = self.set_exper_name(args, config=config)\n if getattr(args, \"group_id\", False):\n subdir = Path(args.group_id) / f\"seed-{args.group_seed}\" / timestamp\n else:\n subdir = timestamp\n self._save_dir = save_dir / 'models' / exper_name / subdir\n self._log_dir = save_dir / 'log' / exper_name / subdir\n self._exper_name = exper_name\n self._args = args\n # if set, remove all previous experiments with the current config\n if vars(args).get(\"purge_exp_dir\", False):", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:63-88" + }, + "2465": { + "file_id": 190, + "content": "The code sets the save directory for the trained model and logs based on whether \"tester\" or \"trainer\" is specified in the configuration. It also creates a timestamp to differentiate experiments, handles slave mode, sets the experiment name using given arguments, and if group_id and group_seed are provided, it generates subdirectories accordingly. 
Additionally, it checks if the user wants to purge previous experiments with the current config and removes them if true.", + "type": "comment" + }, + "2466": { + "file_id": 190, + "content": " for dirpath in (self._save_dir, self._log_dir):\n config_dir = dirpath.parent\n existing = list(config_dir.glob(\"*\"))\n print(f\"purging {len(existing)} directories from config_dir...\")\n tic = time.time()\n os.system(f\"rm -rf {config_dir}\")\n print(f\"Finished purge in {time.time() - tic:.3f}s\")\n self.save_dir.mkdir(parents=True, exist_ok=True)\n self.log_dir.mkdir(parents=True, exist_ok=True)\n # save updated config file to the checkpoint dir\n write_json(self.config, self.save_dir / 'config.json')\n # configure logging module\n if not slave_mode:\n self.log_path = setup_logging(self.log_dir)\n self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}\n def set_exper_name(self, args, config):\n # We assume that the config files are organised into directories such that\n # each directory has the name of the dataset.\n dataset_name = self.cfg_fname.parent.stem", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:89-112" + }, + "2467": { + "file_id": 190, + "content": "The code is purging directories from a specified directory and then recreates the save_dir and log_dir directories. It writes the updated config file to the checkpoint dir, sets up logging if not in slave mode, and assumes that config files are organized into directories with the name of the dataset.", + "type": "comment" + }, + "2468": { + "file_id": 190, + "content": " exper_name = f\"{dataset_name}-{self.cfg_fname.stem}\"\n if args.custom_args:\n key_val_lists = args.custom_args.split(\"+\")\n for key_val_pair in key_val_lists:\n print(f\"parsing key-val pair : {key_val_pair}\")\n key, val = key_val_pair.split(\"@\")\n set_nested_key_val(key, val, self._config)\n # remove periods from key names\n key_ = key.replace(\"_.\", \"--\")\n # remove commas from value names\n val = val.replace(\",\", \"--\")\n custom_tag = \"-\".join(key_.split(\".\")[-2:])\n exper_name = f\"{exper_name}-{custom_tag}-{val}\"\n if getattr(args, \"disable_workers\", False):\n print(\"Disabling data loader workers....\")\n config[\"data_loader\"][\"args\"][\"num_workers\"] = 0\n if getattr(args, \"train_single_epoch\", False):\n print(\"Restricting training to a single epoch....\")\n config[\"trainer\"][\"epochs\"] = 1\n config[\"trainer\"][\"save_period\"] = 1", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:113-134" + }, + "2469": { + "file_id": 190, + "content": "This code block handles configuration parsing, custom arguments, and some optional settings. It sets the experiment name based on dataset and config file name, then processes custom arguments to set nested keys in the configuration dictionary. 
The code also checks for disabled data loader workers and restricts training to a single epoch if specified.", + "type": "comment" + }, + "2470": { + "file_id": 190, + "content": " config[\"trainer\"][\"skip_first_n_saves\"] = 0\n exper_name = f\"{exper_name}-train-single-epoch\"\n return exper_name\n @staticmethod\n @typechecked\n def load_config(cfg_fname: Path) -> Dict:\n config = read_json(cfg_fname)\n # apply inheritance through config hierarchy\n descendant, ancestors = config, []\n while \"inherit_from\" in descendant:\n parent_config = read_json(Path(descendant[\"inherit_from\"]))\n ancestors.append(parent_config)\n descendant = parent_config\n for ancestor in ancestors:\n merge(ancestor, config, strategy=Strategy.REPLACE)\n config = ancestor\n return config\n def init(self, name, module, *args, **kwargs):\n \"\"\"Finds a function handle with the name given as 'type' in config, and returns\n the instance initialized with corresponding keyword args given as 'args'.\n \"\"\"\n module_name = self[name]['type']\n module_args = dict(self[name]['args'])", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:135-159" + }, + "2471": { + "file_id": 190, + "content": "This code snippet defines a function that loads and processes a configuration file, initializes an instance of a class with a specific name and arguments, and returns the initialized instance. The configuration loading process applies inheritance through a config hierarchy and allows skipping the first saves in training.", + "type": "comment" + }, + "2472": { + "file_id": 190, + "content": " msg = (f\"Fail for {module_name}\\n\"\n f\"overwriting kwargs given in config file is not allowed\\n\"\n f\"passed kwargs: {kwargs}\\n\"\n f\"for module_args: {module_args})\")\n assert all([k not in module_args for k in kwargs]), msg\n module_args.update(kwargs)\n return getattr(module, module_name)(*args, **module_args)\n def __getitem__(self, name):\n return self.config[name]\n def __len__(self):\n # NOTE: This is used for boolean checking deep inside ray.tune, so we required it\n # to be defined.\n return len(self.config)\n def __setitem__(self, name, value):\n self.config[name] = value\n def __contains__(self, name):\n return name in self.config\n def get(self, name, default):\n return self.config.get(name, default)\n def keys(self):\n return self.config.keys()\n def get_logger(self, name, verbosity=2):\n msg_verbosity = \"verbosity option {} is invalid. Valid options are {}.\"\n msg_verbosity = msg_verbosity.format(verbosity, self.log_levels.keys())", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:160-190" + }, + "2473": { + "file_id": 190, + "content": "Function checks if any overwriting kwargs are present in the module_args and raises an error if so. It then updates the module_args with all kwargs, returns a function call using the updated args. 
The code also provides functionality to access, set, check containment, get keys of the config, and get values with default option.", + "type": "comment" + }, + "2474": { + "file_id": 190, + "content": " assert verbosity in self.log_levels, msg_verbosity\n logger = logging.getLogger(name)\n logger.setLevel(self.log_levels[verbosity])\n return logger\n # setting read-only attributes\n @property\n def config(self):\n return self._config\n @property\n def save_dir(self):\n return self._save_dir\n @property\n def log_dir(self):\n return self._log_dir\n def __repr__(self):\n return pprint.PrettyPrinter().pformat(self.__dict__)\n def items(self):\n return self._config.items()\n# helper functions used to update config dict with custom cli options\ndef _update_config(config, options, args):\n for opt in options:\n value = getattr(args, _get_opt_name(opt.flags))\n if value is not None:\n _set_by_path(config, opt.target, value)\n return config\ndef _get_opt_name(flags):\n for flg in flags:\n if flg.startswith('--'):\n return flg.replace('--', '')\n return flags[0].replace('--', '')\ndef _set_by_path(tree, keys, value):", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:191-232" + }, + "2475": { + "file_id": 190, + "content": "This code snippet defines a class with properties for config, save_dir, and log_dir. It also has methods to iterate over items in the config dictionary and helper functions to update the config with custom CLI options. The logger is set based on the verbosity level provided.", + "type": "comment" + }, + "2476": { + "file_id": 190, + "content": " \"\"\"Set a value in a nested object in tree by sequence of keys.\"\"\"\n _get_by_path(tree, keys[:-1])[keys[-1]] = value\ndef _get_by_path(tree, keys):\n \"\"\"Access a nested object in tree by sequence of keys.\"\"\"\n return reduce(getitem, keys, tree)", + "type": "code", + "location": "/applications/T2VLAD/parse_config.py:233-239" + }, + "2477": { + "file_id": 190, + "content": "This code snippet defines two functions, `_get_by_path` and `set_in_nested`, for accessing and modifying values in a nested object using a sequence of keys. The `_get_by_path` function uses the `reduce` function with `getitem` as the function argument to iterate through the keys and return the nested object's value, while `set_in_nested` sets a new value in a nested object by first accessing the nested object using the provided keys and then setting the final key's value.", + "type": "comment" + }, + "2478": { + "file_id": 191, + "content": "/applications/T2VLAD/test.py", + "type": "filepath" + }, + "2479": { + "file_id": 191, + "content": "The PaddleVideo library's function compresses predictions based on query masks and similarity scores. The code initializes a Paddle model, prepares data loaders, sets evaluation mode, processes samples, calculates metrics, evaluates models, and runs the \"evaluation\" function.", + "type": "summary" + }, + "2480": { + "file_id": 191, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport copy\nimport random\nimport paddle\nimport logging\nimport argparse\nimport numpy as np\nimport model.model as module_arch\nimport model.metric as module_metric\nimport data_loader.data_loaders as module_data\nfrom typing import Tuple\nfrom pathlib import Path\nfrom typeguard import typechecked\nfrom mergedeep import Strategy, merge\nfrom parse_config import ConfigParser\nfrom trainer.trainer import verbose, ctxt_mgr\nfrom utils.util import compute_dims, compute_trn_config\n@typechecked\ndef compress_predictions(query_masks: np.ndarray, sims: np.ndarray, topk: int = 10):", + "type": "code", + "location": "/applications/T2VLAD/test.py:1-33" + }, + "2481": { + "file_id": 191, + "content": "This code is part of the PaddleVideo library and contains a function named `compress_predictions`. It imports necessary libraries, defines function parameters, and utilizes various modules from the PaddleVideo library. The function compresses predictions based on query masks and similarity scores (`sims`) with optional top k values. It is type checked for data integrity.", + "type": "comment" + }, + "2482": { + "file_id": 191, + "content": " \"\"\"We store the indices of the top-k predictions, rather than the full similarity\n matrix, to reduce storage requirements.\n NOTE: The similarity matrix contains `num_queries x num_videos` elements, where\n `num_queries = num_videos x max_num_queries_per_video`. We first mask out\n locations in the similarity matrix that correspond to invalid queries (these are\n produced by videos with fewer than `max_num_queries_per_video` descriptions).\n \"\"\"\n # validate the input shapes\n assert query_masks.ndim == 2, \"Expected query_masks to be a matrix\"\n query_num_videos, query_max_per_video = query_masks.shape\n sims_queries, sims_num_videos = sims.shape\n msg = (f\"Expected sims and query masks to represent the same number of videos \"\n f\"(found {sims_num_videos} v {query_num_videos}\")\n assert query_num_videos == sims_num_videos, msg\n msg = (f\"Expected sims and query masks to represent the same number of queries \"\n f\"(found {sims_queries} v {query_num_videos * query_max_per_video}\")", + "type": "code", + "location": "/applications/T2VLAD/test.py:34-51" + }, + "2483": { + "file_id": 191, + "content": "Code validates input shapes, ensuring that sims and query_masks represent the same number of videos and queries. 
It asserts the correct dimensions for sims and query_masks to ensure compatibility in further computations, preventing potential errors.", + "type": "comment" + }, + "2484": { + "file_id": 191, + "content": " assert query_max_per_video * query_num_videos == sims_queries, msg\n valid_sims = sims[query_masks.flatten().astype(np.bool)]\n ranks = np.argsort(-valid_sims, axis=1)\n return ranks[:, :topk]\n@typechecked\ndef get_model_and_data_loaders(\n config: ConfigParser,\n logger: logging.Logger,\n model_path: Path,\n) -> Tuple[paddle.nn.Layer, module_data.ExpertDataLoader]:\n expert_dims, raw_input_dims = compute_dims(config)\n trn_config = compute_trn_config(config)\n data_loaders = config.init(\n name='data_loader',\n module=module_data,\n logger=logger,\n raw_input_dims=raw_input_dims,\n text_feat=config[\"experts\"][\"text_feat\"],\n text_dim=config[\"experts\"][\"text_dim\"],\n text_agg=config[\"experts\"][\"text_agg\"],\n use_zeros_for_missing=config[\"experts\"].get(\"use_zeros_for_missing\", False),\n eval_only=True,\n )\n model = config.init(\n name='arch',\n module=module_arch,\n expert_dims=expert_dims,\n text_dim=config[\"experts\"][\"text_dim\"],", + "type": "code", + "location": "/applications/T2VLAD/test.py:52-84" + }, + "2485": { + "file_id": 191, + "content": "This code defines a function that takes a configuration, logger, and model path as input, returns a tuple containing a Paddle.js layer model and an ExpertDataLoader object for training data. The function first computes the expert dimensions and raw input dimensions based on the provided config, then initializes the train data loaders using the same config and returns the model and data loader tuple. The code also handles cases where some features might be missing by allowing the use of zeros to fill in such gaps.", + "type": "comment" + }, + "2486": { + "file_id": 191, + "content": " ce_shared_dim=config[\"experts\"].get(\"ce_shared_dim\", None),\n feat_aggregation=config[\"data_loader\"][\"args\"][\"feat_aggregation\"],\n )\n model_path = config._args.resume\n logger.info(f\"Loading checkpoint: {model_path} ...\")\n checkpoint = paddle.load(model_path)\n state_dict = checkpoint\n if config['n_gpu'] > 1:\n model = paddle.DataParallel(model)\n model.load_dict(state_dict)\n return model, data_loaders\ndef evaluation(config, logger=None, trainer=None):\n if logger is None:\n logger = config.get_logger('test')\n if getattr(config._args, \"eval_from_training_config\", False):\n eval_conf = copy.deepcopy(config)\n merge(eval_conf._config, config[\"eval_settings\"], strategy=Strategy.REPLACE)\n config = eval_conf\n logger.info(\"Running evaluation with configuration:\")\n logger.info(config)\n # Set the random initial seeds\n seed = config[\"seed\"]\n logger.info(f\"Setting experiment random seed to {seed}\")\n random.seed(seed)\n np.random.seed(seed)", + "type": "code", + "location": "/applications/T2VLAD/test.py:85-116" + }, + "2487": { + "file_id": 191, + "content": "This code is initializing a model and preparing it for evaluation. It loads a checkpoint from the specified model path, creates a data loader, and performs an evaluation with the given configuration. 
The random seed is set to ensure reproducibility of results.", + "type": "comment" + }, + "2488": { + "file_id": 191, + "content": " paddle.seed(seed)\n model, data_loaders = get_model_and_data_loaders(\n config=config,\n logger=logger,\n model_path=Path(config._args.resume),\n )\n logger.info(model)\n metrics = [getattr(module_metric, met) for met in config['metrics']]\n # prepare model for testing. Note that some datasets fail to fit the retrieval\n # set on the GPU, so we run them on the CPU\n model.eval()\n with paddle.no_grad():\n samples, meta = data_loaders[\"retrieval\"]\n #import pdb; pdb.set_trace()\n # To use the nan-checks safely, we need make temporary copies of the data\n all_text_num = samples['text'].shape[0]\n text_keys = ['text', 'cap_id', 'att_mask', 'text_token_mask']\n chk = 100\n tck = 100 \n if samples['text'].shape[0] % chk == 0:\n vid_batch = samples['text'].shape[0] // chk\n else:\n vid_batch = samples['text'].shape[0] // chk + 1\n if samples['text'].shape[0] % tck == 0:\n text_batch = samples['text'].shape[0] // tck", + "type": "code", + "location": "/applications/T2VLAD/test.py:117-146" + }, + "2489": { + "file_id": 191, + "content": "The code snippet initializes the Paddle model, data loaders, and sets the model to evaluation mode. It also prepares the retrieval dataset by checking for nan values and making temporary copies of relevant data elements based on their shape. The code then determines the number of video batches and text batches based on the dataset size.", + "type": "comment" + }, + "2490": { + "file_id": 191, + "content": " else: \n text_batch = samples['text'].shape[0] // tck + 1\n sub_sims = []\n for idx in range(text_batch):\n if idx % 5 == 0:\n print(idx,'/',text_batch)\n sub_samples = {}\n for key in text_keys:\n sub_samples.update({key: samples[key][idx*tck:idx*tck+tck]})\n subsub_sims = []\n for vid in range(vid_batch):\n sub_samples['experts'] = {}\n sub_samples['ind'] = {}\n for expert in samples['experts'].keys():\n sub_samples['experts'][expert] = samples['experts'][expert][vid*chk:vid*chk+chk]\n sub_samples['ind'][expert] = samples['ind'][expert][vid*chk:vid*chk+chk]\n with ctxt_mgr(sub_samples) as valid:\n output = model(**valid)\n subsub_sims.append(output[\"cross_view_conf_matrix\"].cpu())\n subsub_sims = paddle.concat(subsub_sims, axis=1)\n sub_sims.append(subsub_sims)", + "type": "code", + "location": "/applications/T2VLAD/test.py:147-167" + }, + "2491": { + "file_id": 191, + "content": "This code slices samples into sub-samples and processes them for multiple videos. It then concatenates the processed results along axis 1, storing each result in the list \"sub_sims\". This process is repeated for a batch of text and video samples. 
The code also includes progress printing and utilizes context management to run model operations efficiently.", + "type": "comment" + }, + "2492": { + "file_id": 191, + "content": " sub_sims = paddle.concat(sub_sims, axis=0)\n sims = paddle.to_tensor(sub_sims, dtype='float32').numpy()\n dataset = data_loaders.dataset_name\n nested_metrics = {}\n for metric in metrics:\n metric_name = metric.__name__\n res = metric(sims, query_masks=meta[\"query_masks\"])\n verbose(epoch=0, metrics=res, name=dataset, mode=metric_name)\n if trainer is not None:\n if not trainer.mini_train:\n trainer.writer.set_step(step=0, mode=\"val\")\n # avoid tensboard folding by prefixing\n metric_name_ = f\"test_{metric_name}\"\n trainer.log_metrics(res, metric_name=metric_name_, mode=\"val\")\n nested_metrics[metric_name] = res\n log = {}\n for subkey, subval in nested_metrics.items():\n for subsubkey, subsubval in subval.items():\n log[f\"test_{subkey}_{subsubkey}\"] = subsubval\n for key, value in log.items():\n logger.info(\" {:15s}: {}\".format(str(key), value))", + "type": "code", + "location": "/applications/T2VLAD/test.py:168-190" + }, + "2493": { + "file_id": 191, + "content": "This code calculates metrics for a dataset, concatenates sub-similarities, converts to numpy array, iterates through metrics and computes results for each metric using sims and query_masks. The results are logged for further analysis and information display.", + "type": "comment" + }, + "2494": { + "file_id": 191, + "content": "if __name__ == '__main__':\n args = argparse.ArgumentParser(description='PyTorch Template')\n args.add_argument('--config', default=None, type=str, help=\"config file path\")\n args.add_argument('--resume', default=None, help='path to checkpoint for evaluation')\n args.add_argument('--eval_from_training_config', action=\"store_true\",\n help=\"if true, evaluate directly from a training config file.\")\n args.add_argument(\"--custom_args\", help=\"qualified key,val pairs\")\n eval_config = ConfigParser(args)\n cfg_msg = \"For evaluation, a model checkpoint must be specified via the --resume flag\"\n assert eval_config._args.resume, cfg_msg\n if eval_config._config.get(\"eval_settings\", False):\n merge(eval_config._config, eval_config[\"eval_settings\"], strategy=Strategy.REPLACE)\n evaluation(eval_config)", + "type": "code", + "location": "/applications/T2VLAD/test.py:193-206" + }, + "2495": { + "file_id": 191, + "content": "This code sets up argument parsing and configuration loading for evaluation. It checks if a model checkpoint is specified via --resume flag, then merges the main config file with eval_settings (if provided), finally calling the \"evaluation\" function.", + "type": "comment" + }, + "2496": { + "file_id": 192, + "content": "/applications/T2VLAD/train.py", + "type": "filepath" + }, + "2497": { + "file_id": 192, + "content": "This code imports libraries, initializes an experiment, defines functions for training a video analysis model, handles command-line arguments and ensures checkpoints are saved before running the training process.", + "type": "summary" + }, + "2498": { + "file_id": 192, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport time\nimport copy\nimport socket\nimport paddle\nimport argparse\nimport warnings\nimport numpy as np\nimport model.loss as module_loss\nimport model.model as module_arch\nimport model.metric as module_metric\nimport data_loader.data_loaders as module_data\nfrom pathlib import Path\nfrom utils import set_seeds\nfrom trainer import Trainer\nfrom test import evaluation\nfrom mergedeep import merge, Strategy\nfrom parse_config import ConfigParser\nfrom logger.log_parser import log_summary\nfrom utils import compute_dims, compute_trn_config", + "type": "code", + "location": "/applications/T2VLAD/train.py:1-35" + }, + "2499": { + "file_id": 192, + "content": "The code imports necessary libraries, modules and packages for the PaddleVideo project. It also handles copyright and license information, sets seeds to ensure reproducibility, and includes utility functions for logging, model training, and data loading. It defines a Trainer class and an evaluation function, as well as parsing configuration files.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/25.json b/docs/data/25.json new file mode 100644 index 000000000..9a7e2a554 --- /dev/null +++ b/docs/data/25.json @@ -0,0 +1,543 @@ +{ + "2500": { + "file_id": 192, + "content": "def run_exp(config):\n warnings.filterwarnings('ignore')\n logger = config.get_logger('train')\n expert_dims, raw_input_dims = compute_dims(config, logger)\n trn_config = compute_trn_config(config)\n if config._args.group_seed:\n seeds = [int(config._args.group_seed)]\n else:\n seeds = [int(x) for x in config._args.seeds.split(\",\")]\n for ii, seed in enumerate(seeds):\n tic = time.time()\n logger.info(f\"{ii + 1}/{len(seeds)} Setting experiment random seed to {seed}\")\n set_seeds(seed)\n config[\"seed\"] = seed\n model = config.init(\n name='arch',\n module=module_arch,\n expert_dims=expert_dims,\n text_dim=config[\"experts\"][\"text_dim\"],\n ce_shared_dim=config[\"experts\"].get(\"ce_shared_dim\", None),\n feat_aggregation=config[\"data_loader\"][\"args\"][\"feat_aggregation\"],\n )\n logger.info(model)\n data_loaders = config.init(\n name='data_loader',\n module=module_data,", + "type": "code", + "location": "/applications/T2VLAD/train.py:37-67" + }, + "2501": { + "file_id": 192, + "content": "This code snippet defines a function `run_exp()` that initializes an experiment. It sets the random seed, initializes the model (arch) and data loaders based on the given configuration. 
The seeds are obtained from command line arguments, and for each seed, it logs information about the setting and proceeds with the experiment initialization.", + "type": "comment" + }, + "2502": { + "file_id": 192, + "content": " logger=logger,\n raw_input_dims=raw_input_dims,\n text_feat=config[\"experts\"][\"text_feat\"],\n text_dim=config[\"experts\"][\"text_dim\"],\n text_agg=config[\"experts\"][\"text_agg\"],\n use_zeros_for_missing=config[\"experts\"].get(\"use_zeros_for_missing\", False),\n eval_only=False,\n )\n loss = config.init(name=\"loss\", module=module_loss)\n metrics = [getattr(module_metric, met) for met in config['metrics']]\n lr_scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.0001, step_size=5, gamma=0.9)\n optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler, weight_decay=1e-4, parameters=model.parameters(), grad_clip=paddle.nn.ClipGradByGlobalNorm(2))\n trainer = Trainer(\n model,\n loss,\n metrics,\n optimizer,\n config=config,\n data_loaders=data_loaders,\n lr_scheduler=lr_scheduler,\n mini_train=config._args.mini_train,\n visualizer=None,", + "type": "code", + "location": "/applications/T2VLAD/train.py:68-92" + }, + "2503": { + "file_id": 192, + "content": "Initializing a model with specific configurations and defining the loss function, metrics to track progress, learning rate scheduler for dynamic adjustments, and an optimizer (AdamW) to update model parameters. Also creating a Trainer instance which combines all these components for training the model on given data loaders.", + "type": "comment" + }, + "2504": { + "file_id": 192, + "content": " val_freq=config[\"trainer\"].get(\"val_freq\", 1),\n force_cpu_val=config.get(\"force_cpu_val\", False),\n skip_first_n_saves=config[\"trainer\"].get(\"skip_first_n_saves\", 0),\n include_optim_in_save_model=config[\"trainer\"].get(\"include_optim_in_save_model\", 1),\n cache_targets=set(config.get(\"cache_targets\", [])),\n )\n trainer.train()\n best_model_path = config.save_dir / \"trained_model.pdparams\"\n duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))\n logger.info(f\"Training took {duration}\")\n # If multiple runs were conducted, report relevant statistics\n if len(seeds) > 1:\n log_summary(\n logger=logger,\n log_path=config.log_path,\n eval_mode=config[\"eval_mode\"],\n fixed_num_epochs=config[\"trainer\"][\"epochs\"],\n )\n print(f\"Log file stored at {config.log_path}\")\n # Report the location of the \"best\" model of the final seeded run (here\n # \"best\" corresponds to the model with the highest geometric mean over the", + "type": "code", + "location": "/applications/T2VLAD/train.py:93-115" + }, + "2505": { + "file_id": 192, + "content": "This code sets up a trainer with specified configuration, trains the model, saves the best model at 'best_model_path', logs training duration, reports relevant statistics if multiple runs were conducted, and prints the log file location.", + "type": "comment" + }, + "2506": { + "file_id": 192, + "content": " # R@1, R@5 and R@10 metrics when a validation set is used, or simply the final\n # epoch of training for fixed-length schedules).\n print(f\"The best performing model can be found at {str(best_model_path)}\")\ndef main():\n args = argparse.ArgumentParser(description='Main entry point for training')\n args.add_argument('--config', help='config file path')\n args.add_argument('--resume', help='path to latest model (default: None)')\n args.add_argument('--mini_train', action=\"store_true\")\n args.add_argument('--group_id', 
help=\"if supplied, group these experiments\")\n args.add_argument('--disable_workers', action=\"store_true\")\n args.add_argument('--refresh_lru_cache', action=\"store_true\")\n args.add_argument('--train_single_epoch', action=\"store_true\")\n args.add_argument('--purge_exp_dir', action=\"store_true\",\n help=\"remove all previous experiments with the given config\")\n args.add_argument(\"--dbg\", default=\"ipdb.set_trace\")\n args.add_argument(\"--custom_args\", help=\"qualified key,val pairs\")", + "type": "code", + "location": "/applications/T2VLAD/train.py:116-133" + }, + "2507": { + "file_id": 192, + "content": "This code defines the command-line arguments for the training script of a video analysis application. The arguments include config file path, resuming from a previous model, mini-batch training option, grouping experiments by ID, disabling workers, refreshing LRU cache, training a single epoch, purging existing experiments, and debugging options.", + "type": "comment" + }, + "2508": { + "file_id": 192, + "content": " # Seeds can either be passed directly as a comma separated list at the command line,\n # or individually for separate experiments as a group (used for slurm experiments)\n seed_args = args.add_mutually_exclusive_group()\n seed_args.add_argument('--seeds', default=\"0\", help=\"comma separated list of seeds\")\n seed_args.add_argument('--group_seed', help=\"seed for group member\")\n args = ConfigParser(args)\n os.environ[\"PYTHONBREAKPOINT\"] = args._args.dbg\n args[\"data_loader\"][\"args\"][\"refresh_lru_cache\"] = args._args.refresh_lru_cache\n msg = (f\"Expected the number of training epochs ({args['trainer']['epochs']})\"\n f\"to exceed the save period ({args['trainer']['save_period']}), otherwise\"\n \" no checkpoints will be saved.\")\n assert args[\"trainer\"][\"epochs\"] >= args[\"trainer\"][\"save_period\"], msg\n run_exp(config=args)\nif __name__ == '__main__':\n main()", + "type": "code", + "location": "/applications/T2VLAD/train.py:135-151" + }, + "2509": { + "file_id": 192, + "content": "This code is parsing command-line arguments for seeds, setting environment variables, and asserting that the number of training epochs is greater than the save period to ensure checkpoints are saved. The function run_exp is then called with these configuration settings, and the main function is executed if the script is run directly.", + "type": "comment" + }, + "2510": { + "file_id": 193, + "content": "/applications/T2VLAD/trainer/__init__.py", + "type": "filepath" + }, + "2511": { + "file_id": 193, + "content": "This line imports all functions and classes from the \"trainer\" module in the same package.", + "type": "summary" + }, + "2512": { + "file_id": 193, + "content": "from .trainer import *", + "type": "code", + "location": "/applications/T2VLAD/trainer/__init__.py:1-1" + }, + "2513": { + "file_id": 193, + "content": "This line imports all functions and classes from the \"trainer\" module in the same package.", + "type": "comment" + }, + "2514": { + "file_id": 194, + "content": "/applications/T2VLAD/trainer/trainer.py", + "type": "filepath" + }, + "2515": { + "file_id": 194, + "content": "The code utilizes PaddlePaddle library for video retrieval, trains a model, handles memory-efficient sample copies, calculates metrics, logs progress, and visualizes ranking if available. 
Mean Average Precision is computed, results stored, and single test caption checked during each epoch.", + "type": "summary" + }, + "2516": { + "file_id": 194, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport numpy as np\nfrom base import BaseTrainer\nfrom utils import memory_summary\nfrom contextlib import contextmanager\ndef verbose(epoch, metrics, mode, name=\"TEST\"):\n r1, r5, r10, r50 = metrics[\"R1\"], metrics[\"R5\"], metrics[\"R10\"], metrics[\"R50\"]\n msg = f\"[{mode}]{name:s} epoch {epoch}, R@1: {r1:.1f}\"\n msg += f\", R@5: {r5:.1f}, R@10 {r10:.1f}, R@50 {r50:.1f}\"\n msg += f\"MedR: {metrics['MedR']:g}, MeanR: {metrics['MeanR']:.1f}\"\n print(msg)\n@contextmanager\ndef ctxt_mgr(samples):\n \"\"\"Provide a context for managing temporary, cloned copies of retrieval", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:1-31" + }, + "2517": { + "file_id": 194, + "content": "This code is part of a larger program using the PaddlePaddle library for video retrieval. It defines a verbose function to display training metrics and a context manager to handle temporary copies of retrieval samples.", + "type": "comment" + }, + "2518": { + "file_id": 194, + "content": " sample tensors.\n The rationale here is that to use nan-checking in the model (to validate the\n positions of missing experts), we need to modify the underlying tensors. This\n function lets the evaluation code run (and modify) temporary copies, without\n modifying the originals.\n \"\"\"\n exp_dict = samples[\"experts\"].items()\n experts = {key: val.clone() for key, val in exp_dict}\n samples_ = {\n \"experts\": experts,\n \"ind\": samples[\"ind\"],\n \"text\": samples[\"text\"],\n \"cap_id\": samples[\"cap_id\"],\n \"att_mask\": samples[\"att_mask\"],\n }\n if \"text_token_mask\" in samples:\n samples_[\"text_token_mask\"] = samples[\"text_token_mask\"]\n try:\n yield samples_\n finally:\n del samples_\nclass Trainer(BaseTrainer):\n \"\"\"\n Trainer class\n Note:\n Inherited from BaseTrainer.\n \"\"\"\n def __init__(self, model, loss, metrics, optimizer, config, data_loaders,\n lr_scheduler, visualizer, skip_first_n_saves,\n include_optim_in_save_model, force_cpu_val, cache_targets=set(),", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:32-66" + }, + "2519": { + "file_id": 194, + "content": "This function creates a copy of the \"experts\" tensor from the input samples and replaces it in the samples dictionary. It also includes other relevant tensors and allows for evaluation without modifying the original samples. 
The copied samples are yielded, and then deleted after use to avoid memory leaks.", + "type": "comment" + }, + "2520": { + "file_id": 194, + "content": " num_keep_ckpts=3, mini_train=False, val_freq=1, skip_tboard=False):\n super().__init__(model, loss, metrics, optimizer, config, mini_train=mini_train,\n skip_tboard=skip_tboard, num_keep_ckpts=num_keep_ckpts)\n self.config = config\n self.cache_targets = cache_targets\n self.data_loaders = data_loaders\n self.lr_scheduler = lr_scheduler\n self.mini_train = mini_train\n self.len_epoch = len(self.data_loaders[\"train\"])\n self.log_step = int(np.sqrt(data_loaders[\"train\"].batch_size))\n self.visualizer = visualizer\n self.force_cpu_val = force_cpu_val\n self.val_freq = val_freq\n self.skip_first_n_saves = skip_first_n_saves\n self.include_optim_in_save_model = include_optim_in_save_model\n self.seen = {\"train\": 0, \"val\": 0}\n def _train_epoch(self, epoch):\n \"\"\"\n Training logic for an epoch\n :param epoch: Current training epoch.\n :return: A log that contains all information you want to save.", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:67-89" + }, + "2521": { + "file_id": 194, + "content": "This code defines a class for training a model with specific configurations, data loaders, learning rate scheduler, and more. It initializes the necessary attributes and provides a method for performing training during an epoch. The `_train_epoch` method performs training logic for an epoch and returns a log containing all relevant information.", + "type": "comment" + }, + "2522": { + "file_id": 194, + "content": " Note:\n If you have additional information to record, for example:\n > additional_log = {\"x\": x, \"y\": y}\n merge it with log before return. i.e.\n > log = {**log, **additional_log}\n > return log\n The metrics in log must have the key 'metrics'.\n \"\"\"\n total_loss = 0\n self.model.train()\n memory_summary()\n for batch_idx, minibatch in enumerate(self.data_loaders[\"train\"]):\n output = self.model(**minibatch)\n if \"retrieval\" in self.data_loaders.dataloaders:\n loss = self.loss(output[\"cross_view_conf_matrix\"])\n else:\n loss = self.loss(x=output[\"class_preds\"], target=labels)\n loss.backward()\n self.optimizer.step()\n self.optimizer.clear_grad()\n sample_key = list(minibatch[\"experts\"].keys())[0]\n batch_size = minibatch[\"experts\"][sample_key].shape[0]\n self.seen[\"train\"] += batch_size", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:91-117" + }, + "2523": { + "file_id": 194, + "content": "This code trains a model and computes the loss for each batch of data in the train loader. The loss is then backpropagated, the optimizer steps, and gradients are cleared before moving on to the next batch. 
The batch size is also tracked as part of the seen data count.", + "type": "comment" + }, + "2524": { + "file_id": 194, + "content": " total_loss += loss.item()\n if batch_idx % self.log_step == 0:\n prog = self._progress(batch_idx)\n self.logger.info(f\"Train Epoch: {epoch} {prog} Loss: {loss.item():.6f}\")\n if batch_idx == self.len_epoch or (self.mini_train and batch_idx > 3):\n break\n log = {'loss': total_loss / self.len_epoch}\n if epoch % self.val_freq == 0:\n nested_log, cached_preds = self._valid_epoch(epoch)\n log.update(nested_log)\n else:\n nested_log, cached_preds = {}, None\n self.logger.info(f\"skipping val for epoch: {epoch}\")\n self.lr_scheduler.step()\n self.logger.info(f\"LR {self.lr_scheduler.get_lr()}\")\n return log, cached_preds\n def _valid_epoch(self, epoch):\n \"\"\"Validate model after an epoch of training and store results to disk.\n Args:\n epoch (int): the current epoch\n Returns:\n A log that contains information about validation\n NOTE: The validation metrics in log must have the key 'val_metrics'.", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:119-150" + }, + "2525": { + "file_id": 194, + "content": "Training loop for a machine learning model, logging progress and validating after certain epochs. Performs validation metrics calculation and updates learning rate with a scheduler.", + "type": "comment" + }, + "2526": { + "file_id": 194, + "content": " \"\"\"\n self.model.eval()\n cached_preds = {key: {\"vid_name\": [], \"preds\": [], \"labels\": []}\n for key in self.cache_targets}\n with paddle.no_grad():\n if \"retrieval\" in self.data_loaders.dataloaders:\n samples, meta = self.data_loaders[\"retrieval\"]\n sample_key = list(samples[\"experts\"].keys())[0]\n batch_size = samples[\"experts\"][sample_key].shape[0]\n self.seen[\"val\"] += batch_size\n num_queries = samples[\"text\"].shape[0] * samples[\"text\"].shape[1]\n safe_queries = 1\n text_keys = ['text', 'cap_id', 'att_mask', 'text_token_mask']\n if num_queries > safe_queries:\n chk = 50\n tck = 50\n if samples['text'].shape[0] % chk == 0:\n vid_batch = samples['text'].shape[0] // chk\n else:\n vid_batch = samples['text'].shape[0] // chk + 1", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:151-171" + }, + "2527": { + "file_id": 194, + "content": "This code is initializing the model in evaluation mode, creating a dictionary to store cached predictions, and retrieving data from dataloaders. It also checks if there are too many queries and adjusts batch size accordingly. 
The text_keys variable stores keys for text-related data.", + "type": "comment" + }, + "2528": { + "file_id": 194, + "content": " if samples['text'].shape[0] % tck == 0:\n text_batch = samples['text'].shape[0] // tck\n else:\n text_batch = samples['text'].shape[0] // tck + 1\n sub_sims = []\n for idx in range(text_batch):\n if idx % 5 == 0:\n print(idx,'/',text_batch)\n sub_samples = {}\n for key in text_keys:\n sub_samples.update({key: samples[key][idx*tck:idx*tck+tck]})\n subsub_sims = []\n for vid in range(vid_batch):\n sub_samples['experts'] = {}\n sub_samples['ind'] = {} \n for expert in samples['experts'].keys():\n sub_samples['experts'][expert] = samples['experts'][expert][vid*chk:vid*chk+chk]\n sub_samples['ind'][expert] = samples['ind'][expert][vid*chk:vid*chk+chk]", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:172-190" + }, + "2529": { + "file_id": 194, + "content": "This code segment calculates the number of batches for 'text' and iterates through each batch. It then creates sub-samples and subsub-sims for further processing. This seems to be part of a machine learning model training process, possibly using video data with experts and indicators as additional features. The progress is printed every 5 batches.", + "type": "comment" + }, + "2530": { + "file_id": 194, + "content": " with ctxt_mgr(sub_samples) as xx:\n output = self.model(**xx)\n subsub_sims.append(output[\"cross_view_conf_matrix\"].cpu())\n subsub_sims = paddle.concat(subsub_sims, axis=1)\n sub_sims.append(subsub_sims)\n sims = paddle.concat(sub_sims, axis=0)\n sims = paddle.to_tensor(sims, dtype='float32').cpu().numpy()\n else:\n with ctxt_mgr(samples) as xx:\n output = self.model(**xx)\n sims = paddle.to_tensor(output[\"cross_view_conf_matrix\"], dtype='float32').cpu().numpy()\n # sample the loss (using only the first query for each video)\n queries_per_vid = meta[\"query_masks\"].shape[1]\n sims_ = paddle.to_tensor(sims).reshape([-1, queries_per_vid, sims.shape[-1]])\n loss = self.loss(sims_[:, 0, :])\n dataset = self.data_loaders.dataset_name", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:191-209" + }, + "2531": { + "file_id": 194, + "content": "This code appears to be part of a machine learning model training process. It uses PaddlePaddle, a deep learning framework, to calculate similarity metrics (sims) between samples or sub-samples, then concatenates them based on the given condition (if sub_samples exists). If no sub_samples exist, it directly calculates sims from the samples. The code then samples the loss using only the first query for each video and reshapes the sims tensor accordingly before passing it to a loss function (self.loss). 
Finally, the dataset name is captured in the variable \"dataset\".", + "type": "comment" + }, + "2532": { + "file_id": 194, + "content": " nested_metrics = {}\n for metric in self.metrics:\n metric_name = metric.__name__\n res = metric(sims, query_masks=meta[\"query_masks\"])\n if metric_name == \"mean_average_precision\":\n print(f\"Epoch: {epoch}, mean AP: {res['mAP']}\")\n else:\n verbose(epoch=epoch, metrics=res, name=dataset, mode=metric_name)\n nested_metrics[metric_name] = res\n # TODO(Samuel) disabled visualisation for now, simple to add in later\n num_test_caps = self.data_loaders.num_test_captions\n if num_test_caps == 1 and meta[\"raw_captions\"] is not None:\n if self.visualizer is not None:\n self.visualizer.visualize_ranking(\n sims=sims,\n meta=meta,\n epoch=epoch,\n nested_metrics=nested_metrics,", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:210-228" + }, + "2533": { + "file_id": 194, + "content": "The code is calculating metrics such as Mean Average Precision (mAP) for each epoch and storing the results in a dictionary named nested_metrics. If mAP is calculated, it prints the value. It also calls a verbose function that takes the current epoch, metrics values, dataset name, and metric name as parameters. The code checks if there is only one test caption available (num_test_captions == 1) and if raw_captions exist. If so, it visualizes the ranking by calling a visualizer function passing simulation scores (sims), meta data, current epoch, and nested_metrics as arguments.", + "type": "comment" + }, + "2534": { + "file_id": 194, + "content": " )\n return {\"nested_val_metrics\": nested_metrics}, cached_preds\n elif \"val\" in self.data_loaders.dataloaders:\n metrics = [x() for x in self.metrics]\n for batch_idx, minibatch in enumerate(self.data_loaders[\"val\"]):\n labels = minibatch.pop(\"labels\")\n vid_name = minibatch.pop(\"vid_name\")\n output = self.model(**minibatch)\n if \"val\" in self.cache_targets:\n cached_preds[\"val\"][\"vid_name\"].append(vid_name)\n cached_preds[\"val\"][\"preds\"].append(output[\"class_preds\"])\n for metric in metrics:\n metric.add(output=output[\"class_preds\"], target=labels)\n if batch_idx % self.log_step == 0:\n prog = self._progress(batch_idx)\n self.logger.info(f\"Val Epoch: {epoch} {prog}\")\n nested_metrics = {}\n for metric in metrics:", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:229-249" + }, + "2535": { + "file_id": 194, + "content": "Iterating over validation data, calculates metrics for each batch, logs progress during iteration. 
If cache_targets includes \"val\", stores predictions and labels in cached_preds dictionary.", + "type": "comment" + }, + "2536": { + "file_id": 194, + "content": " if hasattr(metric, \"topk\"):\n res = {f\"top{key}\": val for key, val in\n zip(metric.topk, metric.value())}\n nested_metrics[\"accuracy\"] = res\n else:\n raise ValueError(f\"unsupported mettric: {type(metric)}\")\n nested = {\"nested_val_metrics\": nested_metrics}\n for target in self.cache_targets - {\"val\"}:\n for batch_idx, minibatch in enumerate(self.data_loaders[\"tiny\"]):\n if \"labels\" in minibatch:\n cached_preds[target][\"labels\"].append(minibatch.pop(\"labels\"))\n cached_preds[target][\"vid_name\"].append(minibatch.pop(\"vid_name\"))\n output = self.model(**minibatch)\n cached_preds[target][\"preds\"].append(output[\"class_preds\"])\n # aggregate all cached predictions\n for target in self.cache_targets:", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:250-267" + }, + "2537": { + "file_id": 194, + "content": "This code checks if the metric has a \"topk\" attribute, then creates a dictionary of top-k values and assigns it to \"res\". If not supported, raises a ValueError. It adds the accuracy metric to nested_metrics. The code then creates a nested dictionary for cache targets other than \"val\", and iterates through each data loader in self.data_loaders[\"tiny\"]. For each batch, it checks if labels are present, appends them to cached_preds with corresponding vid name and model predictions. Finally, it aggregates all cached predictions for the specified target(s).", + "type": "comment" + }, + "2538": { + "file_id": 194, + "content": " for key, val in cached_preds[target].items():\n cached_preds[key] = paddle.concat(val).cpu().numpy()\n return nested, cached_preds\n def _progress(self, batch_idx):\n base = '[{}/{} ({:.0f}%)]'\n if hasattr(self.data_loaders, 'n_samples'):\n current = batch_idx * self.data_loaders.batch_size\n total = self.data_loaders.n_samples\n else:\n current = batch_idx\n total = self.len_epoch\n return base.format(current, total, 100.0 * current / total)", + "type": "code", + "location": "/applications/T2VLAD/trainer/trainer.py:268-280" + }, + "2539": { + "file_id": 194, + "content": "The code defines two functions: _compute_nested_preds and _progress. The first function computes nested predictions from cached predictions for a given target, while the second one returns a progress message based on the current batch index and total number of samples or epoch length.", + "type": "comment" + }, + "2540": { + "file_id": 195, + "content": "/applications/T2VLAD/utils/__init__.py", + "type": "filepath" + }, + "2541": { + "file_id": 195, + "content": "This line imports all functions, classes, and variables from the \"util\" module located in the same directory as this file. It allows easy access to all utility functions defined in the \"util\" module without explicitly specifying each function or variable.", + "type": "summary" + }, + "2542": { + "file_id": 195, + "content": "from .util import *", + "type": "code", + "location": "/applications/T2VLAD/utils/__init__.py:1-1" + }, + "2543": { + "file_id": 195, + "content": "This line imports all functions, classes, and variables from the \"util\" module located in the same directory as this file. 
It allows easy access to all utility functions defined in the \"util\" module without explicitly specifying each function or variable.", + "type": "comment" + }, + "2544": { + "file_id": 196, + "content": "/applications/T2VLAD/utils/util.py", + "type": "filepath" + }, + "2545": { + "file_id": 196, + "content": "This code imports libraries, defines functions for data processing and categorizing experts, adjusts input features, ensures Tensor format, and includes utility functions.", + "type": "summary" + }, + "2546": { + "file_id": 196, + "content": "\"\"\"\nExclude from autoreload\n%aimport -util.utils\n\"\"\"\nimport os\nimport json\nimport random\nfrom pathlib import Path\nfrom datetime import datetime\nfrom typing import List\nfrom itertools import repeat\nfrom collections import OrderedDict\nimport numpy as np\nimport paddle\nimport psutil\nimport humanize\nfrom PIL import Image\nfrom typeguard import typechecked\n@typechecked\ndef filter_cmd_args(cmd_args: List[str], remove: List[str]) -> List[str]:\n drop = []\n for key in remove:\n if key not in cmd_args:\n continue\n pos = cmd_args.index(key)\n drop.append(pos)\n if len(cmd_args) > (pos + 1) and not cmd_args[pos + 1].startswith(\"--\"):\n drop.append(pos + 1)\n for pos in reversed(drop):\n cmd_args.pop(pos)\n return cmd_args\n@typechecked\ndef set_seeds(seed: int):\n \"\"\"Set seeds for randomisation libraries.\n Args:\n seed: the seed value\n \"\"\"\n random.seed(seed)\n np.random.seed(seed)\n paddle.seed(seed)\ndef memory_summary():\n vmem = psutil.virtual_memory()\n msg = (", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:1-50" + }, + "2547": { + "file_id": 196, + "content": "This code imports necessary libraries and defines several functions. The 'filter_cmd_args' function removes specified keys from a list of command arguments while preserving the order. The 'set_seeds' function sets seeds for randomization libraries, ensuring consistent results. The 'memory_summary' function provides a summary of virtual memory usage using the 'psutil' library.", + "type": "comment" + }, + "2548": { + "file_id": 196, + "content": " f\">>> Currently using {vmem.percent}% of system memory \"\n f\"{humanize.naturalsize(vmem.used)}/{humanize.naturalsize(vmem.available)}\"\n )\n print(msg)\ndef flatten_dict(x, keysep=\"-\"):\n flat_dict = {}\n for key, val in x.items():\n if isinstance(val, dict):\n flat_subdict = flatten_dict(val)\n flat_dict.update({f\"{key}{keysep}{subkey}\": subval\n for subkey, subval in flat_subdict.items()})\n else:\n flat_dict.update({key: val})\n return flat_dict\ndef expert_tensor_storage(experts, feat_aggregation):\n expert_storage = {\"fixed\": set(), \"variable\": set(), \"flaky\": set()}\n # fixed_sz_experts, variable_sz_experts, flaky_experts = set(), set(), set()\n for expert, config in feat_aggregation.items():\n if config[\"temporal\"] in {\"vlad\", \"fixed_seg\"}:\n expert_storage[\"variable\"].add(expert)\n elif config[\"temporal\"] in {\"avg\", \"max\", \"avg-max\", \"max-avg\", \"avg-max-ent\", \n \"max-avg-ent\"}:", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:51-76" + }, + "2549": { + "file_id": 196, + "content": "This code defines three functions. The first function, `print_memory`, prints the current system memory usage in a readable format. The second function, `flatten_dict`, recursively flattens nested dictionaries into a single-level dictionary. 
The third function, `expert_tensor_storage`, categorizes experts based on their temporal configurations into fixed, variable, and flaky sets.", + "type": "comment" + }, + "2550": { + "file_id": 196, + "content": " expert_storage[\"fixed\"].add(expert)\n else:\n raise ValueError(f\"unknown temporal strategy: {config['temporal']}\")\n # some \"flaky\" experts are only available for a fraction of videos - we need\n # to pass this information (in the form of indices) into the network for any\n # experts present in the current dataset\n if config.get(\"flaky\", False):\n expert_storage[\"flaky\"].add(expert)\n # we only allocate storage for experts used by the current dataset\n for key, value in expert_storage.items():\n expert_storage[key] = value.intersection(set(experts))\n return expert_storage\ndef read_json(fname):\n with fname.open('rt') as handle:\n return json.load(handle, object_hook=OrderedDict)\ndef path2str(x):\n \"\"\"Recursively convert pathlib objects to strings to enable serialization\"\"\"\n for key, val in x.items():\n if isinstance(val, dict):\n path2str(val)\n elif isinstance(val, Path):\n x[key] = str(val)", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:77-103" + }, + "2551": { + "file_id": 196, + "content": "This code snippet contains a function that takes in an expert and its configuration, adds it to the appropriate storage based on its temporal strategy, and handles flaky experts. It also defines two utility functions - read_json for parsing JSON files and path2str for converting pathlib objects to strings for serialization.", + "type": "comment" + }, + "2552": { + "file_id": 196, + "content": "def write_json(content, fname, paths2strs=False):\n if paths2strs:\n path2str(content)\n with fname.open('wt') as handle:\n json.dump(content, handle, indent=4, sort_keys=False)\ndef inf_loop(data_loader):\n ''' wrapper function for endless data loader. 
'''\n for loader in repeat(data_loader):\n yield from loader\nclass HashableDict(dict):\n def __hash__(self):\n return hash(frozenset(self))\nclass HashableOrderedDict(dict):\n def __hash__(self):\n return hash(frozenset(self))\ndef compute_trn_config(config, logger=None):\n trn_config = {}\n feat_agg = config[\"data_loader\"][\"args\"][\"feat_aggregation\"]\n for static_expert in feat_agg.keys():\n if static_expert in feat_agg:\n if \"trn_seg\" in feat_agg[static_expert].keys():\n trn_config[static_expert] = feat_agg[static_expert][\"trn_seg\"]\n return trn_config\ndef compute_dims(config, logger=None):\n if logger is None:\n logger = config.get_logger('utils')\n experts = config[\"experts\"]", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:106-143" + }, + "2553": { + "file_id": 196, + "content": "This code includes a function for writing JSON data, an infinite loop wrapper for data loaders, two classes for hashable dictionaries, a function to compute training configuration from a given config file, and a function to compute dimensions from the same config file.", + "type": "comment" + }, + "2554": { + "file_id": 196, + "content": " # TODO(Samuel): clean up the logic since it's a little convoluted\n ordered = sorted(config[\"experts\"][\"modalities\"])\n if experts[\"drop_feats\"]:\n to_drop = experts[\"drop_feats\"].split(\",\")\n logger.info(f\"dropping: {to_drop}\")\n ordered = [x for x in ordered if x not in to_drop]\n feat_agg = config[\"data_loader\"][\"args\"][\"feat_aggregation\"]\n dims = []\n arch_args = config[\"arch\"][\"args\"]\n vlad_clusters = arch_args[\"vlad_clusters\"]\n for expert in ordered:\n temporal = feat_agg[expert][\"temporal\"]\n if expert == \"face\":\n in_dim, out_dim = experts[\"face_dim\"], experts[\"face_dim\"]\n elif expert == \"features_scene\" and temporal == \"vlad\":\n in_dim, out_dim = 2208 * vlad_clusters[\"features_scene\"], 2208\n elif expert == \"features_s3d\" and temporal == \"vlad\":\n in_dim, out_dim = 1024 * vlad_clusters[\"features_s3d\"], 1024\n elif expert == \"features_flow\" and temporal == \"vlad\":\n in_dim, out_dim = 1024 * vlad_clusters[\"features_flow\"], 1024", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:144-165" + }, + "2555": { + "file_id": 196, + "content": "This code is organizing modalities, extracting expert settings and dimensions for different modalities like face, features_scene, features_s3d, and features_flow. It also checks if any feature should be dropped and sorts them accordingly. 
Finally, it assigns the input and output dimensions based on the modality and temporal aggregation.", + "type": "comment" + }, + "2556": { + "file_id": 196, + "content": " elif expert == \"features_rgb\" and temporal == \"vlad\":\n in_dim, out_dim = 2048 * vlad_clusters[\"features_rgb\"], 2048\n elif expert == \"features_ocr\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"features_ocr\"], 300\n elif expert == \"features_face\" and temporal == \"vlad\":\n in_dim, out_dim = 512 * vlad_clusters[\"features_face\"], 512\n elif expert == \"features_speech\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"features_speech\"], 300\n elif expert == \"features_audio\" and temporal == \"vlad\":\n in_dim, out_dim = 128 * vlad_clusters[\"features_audio\"], 128\n elif expert == \"audio\" and temporal == \"vlad\":\n in_dim, out_dim = 128 * vlad_clusters[\"audio\"], 128\n elif expert == \"audio\" and temporal == \"vlad\":\n in_dim, out_dim = 128 * vlad_clusters[\"audio\"], 128\n elif expert == \"speech\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"speech\"], 300", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:166-181" + }, + "2557": { + "file_id": 196, + "content": "This code snippet is determining the input and output dimensions based on the expert type and temporal method used. It sets the input dimension by multiplying vlad_clusters value with respective constants, and the output dimension remains constant for each expert type and temporal method combination.", + "type": "comment" + }, + "2558": { + "file_id": 196, + "content": " elif expert == \"ocr\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"ocr\"], 300\n elif expert == \"detection\":\n # allow for avg pooling\n det_clusters = arch_args[\"vlad_clusters\"].get(\"detection\", 1)\n in_dim, out_dim = 1541 * det_clusters, 1541\n elif expert == \"detection-sem\":\n if config[\"data_loader\"][\"args\"].get(\"spatial_feats\", False):\n base = 300 + 16\n else:\n base = 300 + 5\n det_clusters = arch_args[\"vlad_clusters\"].get(\"detection-sem\", 1)\n in_dim, out_dim = base * det_clusters, base\n elif expert == \"openpose\":\n base = 54\n det_clusters = arch_args[\"vlad_clusters\"].get(\"openpose\", 1)\n in_dim, out_dim = base * det_clusters, base\n else:\n common_dim = feat_agg[expert][\"feat_dims\"][feat_agg[expert][\"type\"]]\n # account for aggregation of multilpe forms (e.g. avg + max pooling)\n common_dim = common_dim * len(feat_agg[expert][\"temporal\"].split(\"-\"))", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:182-202" + }, + "2559": { + "file_id": 196, + "content": "This code block assigns the input and output dimensions for different experts (e.g., OCR, detection, detection-sem, openpose) based on their respective configurations and cluster settings. 
It also considers aggregation types like avg or max pooling.", + "type": "comment" + }, + "2560": { + "file_id": 196, + "content": " in_dim, out_dim = common_dim, common_dim\n # For the CE architecture, we need to project all features to a common\n # dimensionality\n if arch_args.get(\"mimic_ce_dims\", False):\n out_dim = experts[\"ce_shared_dim\"]\n dims.append((expert, (in_dim, out_dim)))\n expert_dims = OrderedDict(dims)\n if vlad_clusters[\"text\"] == 0:\n msg = \"vlad can only be disabled for text with single tokens\"\n assert config[\"data_loader\"][\"args\"][\"max_tokens\"][\"text\"] == 1, msg\n if config[\"experts\"][\"text_agg\"] == \"avg\":\n msg = \"averaging can only be performed with text using single tokens\"\n assert config[\"arch\"][\"args\"][\"vlad_clusters\"][\"text\"] == 0\n assert config[\"data_loader\"][\"args\"][\"max_tokens\"][\"text\"] == 1\n # To remove the dependency of dataloader on the model architecture, we create a\n # second copy of the expert dimensions which accounts for the number of vlad\n # clusters\n raw_input_dims = OrderedDict()\n for expert, dim_pair in expert_dims.items():", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:203-226" + }, + "2561": { + "file_id": 196, + "content": "This code configures the expert dimensions for a machine learning model. It checks if certain conditions are met, such as disabling VLAD for text with single tokens and using averaging only with text using single tokens. To avoid dependencies between dataloader and model architecture, it creates a second copy of expert dimensions accounting for the number of VLAD clusters.", + "type": "comment" + }, + "2562": { + "file_id": 196, + "content": " raw_dim = dim_pair[0]\n if expert in {\"audio\", \"speech\", \"ocr\", \"detection\", \"detection-sem\", \"openpose\", \"features_audio\", \"features_speech\", \"features_face\", \"features_ocr\", \"features_rgb\", \"features_flow\", \"features_s3d\", \"features_scene\",\n \"speech.mozilla.0\"}:\n if feat_agg[expert][\"temporal\"] == \"vlad\":\n raw_dim = raw_dim // vlad_clusters.get(expert, 1)\n raw_input_dims[expert] = raw_dim\n return expert_dims, raw_input_dims\ndef ensure_tensor(x):\n if not isinstance(x, paddle.Tensor): #if not isinstance(x, torch.Tensor):\n x = paddle.to_tensor(x) # x = torch.from_numpy(x)\n return x\nclass Timer:\n def __init__(self):\n self.cache = datetime.now()\n def check(self):\n now = datetime.now()\n duration = now - self.cache\n self.cache = now\n return duration.total_seconds()\n def reset(self):\n self.cache = datetime.now()\ndef tensor2im(input_image, imtype=np.uint8):\n \"\"\"\"Converts a Tensor array into a numpy image array.", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:227-258" + }, + "2563": { + "file_id": 196, + "content": "This code is adjusting the dimensionality of input features for different expert models and ensuring they are in Tensor format. 
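A minimal usage sketch for the Timer helper quoted just above (the sleep is only a stand-in workload; this re-states the pattern rather than reproducing the project's file):

```python
import time
from datetime import datetime

class Timer:
    """Same pattern as the Timer helper quoted above: check() returns the seconds
    elapsed since the previous check()/reset() call."""
    def __init__(self):
        self.cache = datetime.now()

    def check(self):
        now = datetime.now()
        duration = now - self.cache
        self.cache = now
        return duration.total_seconds()

timer = Timer()
time.sleep(0.2)                                  # stand-in for real work
print(f"step took {timer.check():.2f}s")         # ~0.20s
```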
It also provides utility functions like Timer for measuring time durations and tensor2im to convert Tensors into numpy images.", + "type": "comment" + }, + "2564": { + "file_id": 196, + "content": " Parameters:\n input_image (tensor) -- the input image tensor array\n imtype (type) -- the desired type of the converted numpy array\n \"\"\"\n if not isinstance(input_image, np.ndarray):\n if isinstance(input_image, paddle.Tensor): #if isinstance(input_image, torch.Tensor): # get the data from a variable\n image_tensor = input_image #image_tensor = input_image.data\n else:\n return input_image\n # convert it into a numpy array\n image_numpy = image_tensor[0].cpu().float().numpy()\n if image_numpy.shape[0] == 1: # grayscale to RGB\n image_numpy = np.tile(image_numpy, (3, 1, 1))\n # post-processing: tranpose and scaling\n image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0\n else: # if it is a numpy array, do nothing\n image_numpy = input_image\n return image_numpy.astype(imtype)\ndef save_image(image_numpy, image_path):\n \"\"\"Save a numpy image to the disk\n Parameters:\n image_numpy (numpy array) -- input numpy array", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:260-284" + }, + "2565": { + "file_id": 196, + "content": "The function normalizes and converts the input image tensor array to a numpy array. It also handles different data types and saves the numpy image to disk.", + "type": "comment" + }, + "2566": { + "file_id": 196, + "content": " image_path (str) -- the path of the image\n \"\"\"\n image_pil = Image.fromarray(image_numpy)\n image_pil.save(image_path)\ndef print_numpy(x, val=True, shp=False):\n \"\"\"Print the mean, min, max, median, std, and size of a numpy array\n Parameters:\n val (bool) -- if print the values of the numpy array\n shp (bool) -- if print the shape of the numpy array\n \"\"\"\n x = x.astype(np.float64)\n if shp:\n print('shape,', x.shape)\n if val:\n x = x.flatten()\n print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % (\n np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x)))\ndef mkdirs(paths):\n \"\"\"create empty directories if they don't exist\n Parameters:\n paths (str list) -- a list of directory paths\n \"\"\"\n if isinstance(paths, list) and not isinstance(paths, str):\n for path in paths:\n mkdir(path)\n else:\n mkdir(paths)\ndef mkdir(path):\n \"\"\"create a single empty directory if it didn't exist", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:285-321" + }, + "2567": { + "file_id": 196, + "content": "util.py contains several utility functions:\n1. \"image_to_path\" converts an image numpy array to a PIL Image and saves it at the given path.\n2. \"print_numpy\" prints statistics (mean, min, max, median, std) of a numpy array if specified.\n3. \"mkdirs\" creates empty directories if they don't exist, accepting either a list of paths or a single path.\n4. 
\"mkdir\" is a helper function for \"mkdirs,\" creating a single directory if it doesn't exist.", + "type": "comment" + }, + "2568": { + "file_id": 196, + "content": " Parameters:\n path (str) -- a single directory path\n \"\"\"\n if not os.path.exists(path):\n os.makedirs(path)", + "type": "code", + "location": "/applications/T2VLAD/utils/util.py:323-327" + }, + "2569": { + "file_id": 196, + "content": "This function creates a directory if it does not exist at the given path.", + "type": "comment" + }, + "2570": { + "file_id": 197, + "content": "/applications/TableTennis/ActionRecognition/README.md", + "type": "filepath" + }, + "2571": { + "file_id": 197, + "content": "The code repository features a PaddlePaddle implementation of Table Tennis action recognition with the VideoSwinTransformer model, supporting feature extraction, classification, single/multi-GPU training, and pre-trained models. Running prediction generates gif files overlaid with predictions, while optimization can be done by adjusting sampling parameters or hyperparameters.", + "type": "summary" + }, + "2572": { + "file_id": 197, + "content": "# 乒乓球动作识别模型\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型推理](#模型推理)\n- [模型优化](#模型优化)\n- [模型部署](#模型部署)\n- [参考论文](#参考论文)\n在开始使用之前,您需要按照以下命令安装额外的依赖包:\n```bash\npython -m pip install imageio\n```\n## 模型简介\n该代码库用于乒乓球动作识别, 基于paddle2.2版本开发,结合PaddleVideo中的VideoSwinTransformer模型,对给定的乒乓球视频进行动作分类。\n主要分为如下几步\n - 图像特征抽取,SwinTransformer3D\n - 动作分类,I3DHead\n## 数据准备\nTODO\n## 模型训练\n主要代码来自VideoSwin模型:[VideoSwin](../../../docs/zh-CN/model_zoo/recognition/videoswin.md)\n1. 使用VideoSwin在K400上的预训练模型基础上进行finetune,因此首先下载K400的预训练模型并放置到`data`目录下\n ```bash\n wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams\n ```\n2. 使用`TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml`配置文件进行训练\n 训练启动命令如下:\n ```bash\n # 单卡\n python3.7 -u main.py --amp --validate -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml\n # 多卡\n python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_tabletennis main.py --amp --validate -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml", + "type": "code", + "location": "/applications/TableTennis/ActionRecognition/README.md:1-43" + }, + "2573": { + "file_id": 197, + "content": "This code repository contains a PaddlePaddle implementation of Table Tennis action recognition using the VideoSwinTransformer model. The code is based on PaddlePaddle 2.2, and the training data should be placed in the \"data\" directory. It includes steps for image feature extraction and action classification using SwinTransformer3D and I3DHead respectively. 
Training can be done with single or multi-GPU configurations.", + "type": "comment" + }, + "2574": { + "file_id": 197, + "content": " ```\n## 模型评估\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_tabletennis main.py --test -c configs/recognition/video_swin_transformer/videoswin_tabletennis.yaml -w \"output/VideoSwin_TableTennis/VideoSwin_TableTennis_best.pdparams\"\n```\n## 模型推理\n我们提供了一个在乒乓球数据集上训练好的模型以及一个乒乓球样例的视频pkl文件,以供测试\n```\nwget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_tennis.pdparams # 下载乒乓球数据集上训练好的模型\nwget -P data/ https://videotag.bj.bcebos.com/Data/example_tennis.pkl # 下载乒乓球样例输入视频pkl文件\n```\n### 导出推理模型\n```\npython3.7 tools/export_model.py -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml \\\n -p output/VideoSwin_TableTennis/VideoSwin_TableTennis_best.pdparams \\\n -o inference/VideoSwin_TableTennis\n```\n上述命令会根据传入的`.pdparams`模型,在`inference/VideoSwin_TableTennis`文件夹下生成推理模型,主要包括3个文件:`VideoSwin_TableTennis.pdiparams`、`VideoSwin_TableTennis.pdmodel`、`VideoSwin_TableTennis.info`", + "type": "code", + "location": "/applications/TableTennis/ActionRecognition/README.md:44-66" + }, + "2575": { + "file_id": 197, + "content": "This code snippet is for training and testing the VideoSwin transformer model on the TableTennis dataset using PaddlePaddle. It provides instructions to download pre-trained models and example input video files, and demonstrates how to export an inference model using the provided config file and model parameters.", + "type": "comment" + }, + "2576": { + "file_id": 197, + "content": "### 使用推理模型\n测试文件使用`.pkl`文件,其包含了已抽取的用于预测的乒乓球视频帧。\n运行预测代码\n```bash\npython3.7 tools/predict.py --input_file data/example_tennis_7.pkl \\\n --config applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml \\\n --model_file inference/VideoSwin_TableTennis/VideoSwin_TableTennis.pdmodel \\\n --params_file inference/VideoSwin_TableTennis/VideoSwin_TableTennis.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n执行以上命令会产出一个原视频叠加预测结果文本(Top1类别+概率)的gif图片,保存在本目录的results文件夹下,gif文件名与输入的pkl文件名相同。\n效果如下图:\n![example_7.gif](results/example_tennis_7.gif)\n## 模型优化\n在实际使用场景中可根据视频内容尝试优化策略\n- 可根据动作持续时间的长短,调整采样的段数num_seg和段内采样的帧数seg_len\n- 可以根据数据集大小调整模型训练的超参数,包括权重衰减、DropOut概率、学习率、更换优化器等,以获得更优的结果。\n- 本代码的backbone部分可以作为视频特征提取模块,代替其它的动作识别backbone,以获得表征能力更强的视频特征,以提升整体任务的精度。\n## 模型部署\nTODO\n## 参考论文\n- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei", + "type": "code", + "location": "/applications/TableTennis/ActionRecognition/README.md:68-98" + }, + "2577": { + "file_id": 197, + "content": "Running prediction code with provided arguments will generate a gif file showing the video overlaid with predicted results (top 1 class and probability) in the results folder.\nThe model can be optimized based on video content, by adjusting sampling parameters like num_seg and seg_len or hyperparameters for training.", + "type": "comment" + }, + "2578": { + "file_id": 198, + "content": "/applications/TableTennis/datasets/script/submission_format_transfer.py", + "type": "filepath" + }, + "2579": { + "file_id": 198, + "content": "This code reads data from a JSON file, converts timestamps to 25 fps, and formats it according to the table tennis analysis submission format. 
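A hedged sketch of the conversion this transfer script performs (the proposal values below are made up; only the fixed 25 fps matches the quoted code): proposal boundaries given in frame indices are divided by the frame rate and rounded to two decimals to obtain segments in seconds.

```python
fps = 25  # fixed frame rate assumed by the quoted script
seg = {"score": 0.64, "start": 58, "end": 79}    # proposal in frame indices (made up)

entry = {
    "score": seg["score"],
    "segment": [round(seg["start"] / fps, 2), round(seg["end"] / fps, 2)],
}
print(entry)   # {'score': 0.64, 'segment': [2.32, 3.16]}
```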
The formatted data is written back to a new JSON file for further use or analysis.", + "type": "summary" + }, + "2580": { + "file_id": 198, + "content": "import json\nimport math\nwith open('/workspace/bianjiang03/DATA/Output_for_bmn/prop.json') as f:\n data = json.load(f)\nf.close()\ntransferred = dict()\n# 25 fps for all videos\nfps = 25\nfor item in data:\n temp = []\n for seg in item['bmn_results']:\n temp_dict = {\n 'score': seg['score'],\n 'segment':\n [round(seg['start'] / fps, 2),\n round(seg['end'] / fps, 2)]\n }\n temp.append(temp_dict)\n transferred[item['video_name']] = temp\ntarget_format = {\n 'version': 'A-test',\n 'results': transferred,\n 'external_data': {}\n}\njsonString = json.dumps(target_format, indent=4, ensure_ascii=False)\njsonFile = open('/workspace/bianjiang03/DATA/Output_for_bmn/submission.json',\n 'w')\njsonFile.write(jsonString)\njsonFile.close()\n# target format\n# {\n# \"version\": NA,\n# \"results\": {\n# \"name_of_clip_1\": [\n# {\n# \"score\": 0.64,\n# \"segment\": [2.33,3.15]\n# },\n# {\n# \"score\": 0.77,\n# \"segment\": [7.64, 7.84]\n# }", + "type": "code", + "location": "/applications/TableTennis/datasets/script/submission_format_transfer.py:1-49" + }, + "2581": { + "file_id": 198, + "content": "This code reads data from a JSON file, converts the segment timestamps to a specific frame rate (25 fps), and organizes it into the target submission format. It then writes the formatted data back to a new JSON file for further use or analysis.", + "type": "comment" + }, + "2582": { + "file_id": 198, + "content": "# ],\n# \t\"name_of_clip_2\": [\n# {\n# \"score\": 0.84,\n# \"segment\": [9.73,10.15]\n# },\n# {\n# \"score\": 0.87,\n# \"segment\": [17.11, 17.84]\n# }\n# ],\n# \t...\n# }\n# \"external_data\": {}\n# }", + "type": "code", + "location": "/applications/TableTennis/datasets/script/submission_format_transfer.py:50-64" + }, + "2583": { + "file_id": 198, + "content": "This code defines a dictionary structure representing a submission format for table tennis analysis, with \"name_of_clip\" keys holding lists of segments and scores. The \"external_data\" field is empty.", + "type": "comment" + }, + "2584": { + "file_id": 199, + "content": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py", + "type": "filepath" + }, + "2585": { + "file_id": 199, + "content": "This code uses PaddleVideo library to classify videos, perform feature extraction, and predict bounding box results. It logs information, saves outputs if needed, and writes inference results into a JSON file.", + "type": "summary" + }, + "2586": { + "file_id": 199, + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport shutil\nimport numpy as np\nsys.path.append(\n \"/workspace/bianjiang03/App_TableTennis/PaddleVideo/FootballAction/predict/action_detect\"\n)\nimport models.bmn_infer as prop_model\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport utils.config_utils as config_utils\nimport logger\nlogger = logger.Logger()\ndef load_model(cfg_file=\"configs/configs.yaml\"):\n \"\"\"\n load_model\n \"\"\"\n logger.info(\"load model ... 
\")\n global infer_configs\n infer_configs = parse_config(cfg_file)\n print_configs(infer_configs, \"Infer\")\n t0 = time.time()\n global prop_model\n prop_model = prop_model.InferModel(infer_configs)\n t1 = time.time()\n logger.info(\"step0: load model time: {} min\\n\".format((t1 - t0) * 1.0 / 60))\ndef video_classify(video_name, dataset_dir):\n \"\"\"\n extract_feature\n \"\"\"\n logger.info('predict ... ')\n logger.info(video_name)", + "type": "code", + "location": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:1-50" + }, + "2587": { + "file_id": 199, + "content": "This code is for Baidu Cloud action and loads the model using the BMN (Behaved Motion Network) model from a given configuration file. It also defines a function to classify videos by predicting their actions and prints the information about the video being processed. The code uses logger for logging the information.", + "type": "comment" + }, + "2588": { + "file_id": 199, + "content": " # step 1: extract feature\n feature_path = dataset_dir + video_name\n video_features = pickle.load(open(feature_path, 'rb'))\n print('===video_features===', video_name)\n # step2: get proposal\n t0 = time.time()\n bmn_results = prop_model.predict(infer_configs, material=video_features)\n t1 = time.time()\n logger.info(np.array(bmn_results).shape)\n logger.info(\"step2: proposal time: {} min\".format((t1 - t0) * 1.0 / 60))\n return bmn_results\nif __name__ == '__main__':\n dataset_dir = '/workspace/bianjiang03/DATA/Features_competition_test_A/'\n output_dir = '/workspace/bianjiang03/DATA'\n if not os.path.exists(output_dir + '/Output_for_bmn'):\n os.mkdir(output_dir + '/Output_for_bmn')\n results = []\n load_model()\n directory = os.fsencode(dataset_dir)\n for file in os.listdir(directory):\n filename = os.fsdecode(file)\n bmn_results = video_classify(filename, dataset_dir)\n results.append({\n 'video_name': filename.split('.pkl')[0],\n 'num_proposal': len(bmn_results),", + "type": "code", + "location": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:52-84" + }, + "2589": { + "file_id": 199, + "content": "This code performs video feature extraction and proposal generation using the PaddleVideo library. It first loads the video features from a pickle file, then predicts bounding box minimum notation (Bmn) results using a pre-trained model. Finally, it returns the Bmn results and saves them in an output directory if it doesn't already exist.", + "type": "comment" + }, + "2590": { + "file_id": 199, + "content": " 'bmn_results': bmn_results\n })\n with open(output_dir + '/Output_for_bmn/prop.json', 'w',\n encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)\n print('Done with the inference!')", + "type": "code", + "location": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:85-93" + }, + "2591": { + "file_id": 199, + "content": "This code segment writes the inference results into a JSON file. It first stores the 'bmn_results' dictionary and then writes it to a file named \"Output_for_bmn/prop.json\". The JSON data is formatted for readability with indentation and using UTF-8 encoding. 
Once writing is complete, it prints \"Done with the inference!\" indicating successful execution.", + "type": "comment" + }, + "2592": { + "file_id": 200, + "content": "/applications/TableTennis/fix_bad_label.py", + "type": "filepath" + }, + "2593": { + "file_id": 200, + "content": "The code reads files from a directory, retrieves their names without extensions, checks if the names exist in another set of labels, and deletes any mismatched labels. It then writes the fixed label file with updated sizes.", + "type": "summary" + }, + "2594": { + "file_id": 200, + "content": "import copy\nimport json\nimport re\nimport os\nurl = '/home/aistudio/work/BMN/Input_for_bmn/feature/'\ndirectory = os.fsencode(url)\ncount = 0\ntarget_set = []\nfor file in os.listdir(directory):\n filename = os.fsdecode(file)\n target_name = filename.split('.npy')[0]\n target_set.append(target_name)\n count += 1\nprint('Feature size:', len(target_set))\nwith open('/home/aistudio/work/BMN/Input_for_bmn/label.json') as f:\n data = json.load(f)\ndelet_set = []\nfor key in data.keys():\n if not key in target_set:\n delet_set.append(key)\nprint('(Label) Original size:', len(data))\nprint('(Label) Deleted size:', len(delet_set))\nfor item in delet_set:\n data.pop(item, None)\nprint('(Label) Fixed size:', len(data))\njsonString = json.dumps(data, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/work/BMN/Input_for_bmn/label_fixed.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()", + "type": "code", + "location": "/applications/TableTennis/fix_bad_label.py:1-37" + }, + "2595": { + "file_id": 200, + "content": "The code reads files from a directory, retrieves their names without extensions, checks if the names exist in another set of labels, and deletes any mismatched labels. It then writes the fixed label file with updated sizes.", + "type": "comment" + }, + "2596": { + "file_id": 201, + "content": "/applications/TableTennis/get_instance_for_bmn.py", + "type": "filepath" + }, + "2597": { + "file_id": 201, + "content": "This code generates ground truth data for the BMN model in table tennis applications, using the `combile_gts` function to extract action segments from root actions. 
It calculates video segments, appends annotations, and returns a dataset dictionary for TableTennis.", + "type": "summary" + }, + "2598": { + "file_id": 201, + "content": "\"\"\"\nget instance for bmn\n使用winds=8的滑窗,将所有子窗口的长度之和小于winds的进行合并\n合并后,父窗口代表bmn训练数据,子窗口代表tsn训练数据\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport math\n# for table tennis\nbmn_window = 8\ndataset = \"/home/aistudio/work/BMN/\"\nfeat_dir = dataset + '/Features_example'\nout_dir = dataset + '/Input_for_bmn'\nlabel_files = {\n 'train': 'label_cls14_small_train.json',\n 'validation': 'label_cls14_small_test.json'\n}\nglobal fps\ndef gen_gts_for_bmn(gts_data):\n \"\"\"\n @param, gts_data, original gts for action detection\n @return, gts_bmn, output gts dict for bmn\n \"\"\"\n fps = gts_data['fps']\n gts_bmn = {'fps': fps, 'gts': []}\n for sub_item in gts_data['gts']:\n url = sub_item['url']\n max_length = sub_item['total_frames']\n gts_bmn['gts'].append({\n 'url': url,\n 'total_frames': max_length,\n 'root_actions': []\n })\n sub_actions = sub_item['actions']\n # 跳过没有动作的片段\n if len(sub_actions) == 0:\n continue\n # duration > bmn_window, 动作持续时间大于bmn_windows,直接删除", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:1-48" + }, + "2599": { + "file_id": 201, + "content": "This code is responsible for generating ground truth data for the BMN model in a table tennis application. It takes in original gts (ground truth) data and outputs the modified gts_bmn dictionary. The code first sets the fps value from gts_data, then iterates over each sub-item in gts_data['gts']. If a sub-item has actions but its duration is less than bmn_window, it is skipped. Otherwise, the sub-item data gets stored in gts_bmn.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/26.json b/docs/data/26.json new file mode 100644 index 000000000..48245f72b --- /dev/null +++ b/docs/data/26.json @@ -0,0 +1,543 @@ +{ + "2600": { + "file_id": 201, + "content": " for idx, sub_action in enumerate(sub_actions):\n if sub_action['end_id'] - sub_action['start_id'] > bmn_window:\n sub_actions.pop(idx)\n # 【滑动窗口,把每一个视频里的动作片段提取出来】\n root_actions = [sub_actions[0]]\n # before_id, 前一动作的最后一帧\n # after_id, 后一动作的第一帧\n before_id = 0\n for idx in range(1, len(sub_actions)):\n cur_action = sub_actions[idx]\n duration = (cur_action['end_id'] - root_actions[0]['start_id'])\n if duration > bmn_window: # windows只能包住一个动作就包,包不住就包多个\n after_id = cur_action['start_id']\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n before_id = root_actions[-1]['end_id'] #更新滑窗\n root_actions = [cur_action]\n else:\n root_actions.append(cur_action)", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:49-74" + }, + "2601": { + "file_id": 201, + "content": "This code extracts video action segments using a sliding window and stores them in the \"root_actions\" list. If the duration of an action is too long, it splits it into multiple actions and appends them to the \"gts_bmn['gts'][-1]['root_actions']\". 
The \"before_id\" and \"after_id\" keep track of the first and last frame of each extracted action, while the \"bmn_window\" determines the maximum duration for a single action.", + "type": "comment" + }, + "2602": { + "file_id": 201, + "content": " if idx == len(sub_actions) - 1:\n after_id = max_length\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n return gts_bmn\ndef combile_gts(gts_bmn, gts_process, mode):\n \"\"\"\n 1、bmn_window 范围内只有一个动作,只取一个目标框\n 2、bmn_window 范围内有多个动作,取三个目标框(第一个动作、最后一个动作、所有动作)\n \"\"\"\n global fps\n fps = gts_process['fps']\n duration_second = bmn_window * 1.0\n duration_frame = bmn_window * fps\n feature_frame = duration_frame\n for item in gts_process['gts']:\n url = item['url']\n basename = os.path.basename(url).split('.')[0]\n root_actions = item['root_actions']\n # 把每一个视频里的动作片段提取出来\n for root_action in root_actions:\n segments = []\n # all actions\n segments.append({\n 'actions': root_action['actions'],", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:75-108" + }, + "2603": { + "file_id": 201, + "content": "The code is defining a function `combile_gts` that takes in `gts_bmn`, `gts_process`, and `mode` as parameters. It sets `fps` based on the `gts_process` data, calculates `duration_frame` and `feature_frame`. Then it iterates over the `gts_process['gts']` list to extract action segments from each item's root actions, appending them to the `segments` list. The function returns these segments.", + "type": "comment" + }, + "2604": { + "file_id": 201, + "content": " 'before_id': root_action['before_id'],\n 'after_id': root_action['after_id']\n })\n if len(root_action['actions']) > 1: #如果有多个动作,则第一个动作和最后一个动作,额外添加一次\n # first action\n segments.append({\n 'actions': [root_action['actions'][0]],\n 'before_id':\n root_action['before_id'],\n 'after_id':\n root_action['actions'][1]['start_id']\n })\n # last action\n segments.append({\n 'actions': [root_action['actions'][-1]],\n 'before_id':\n root_action['actions'][-2]['end_id'],\n 'after_id':\n root_action['after_id']\n })\n # 把动作片段处理成window size大小,以适配BMN输入\n for segment in segments:\n before_id = segment['before_id']\n after_id = segment['after_id']\n actions = segment['actions']", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:109-134" + }, + "2605": { + "file_id": 201, + "content": "This code processes a list of actions and splits them into segments based on the number of elements. It adds extra segments for the first and last actions if there are more than one. 
Then, it processes each segment to fit a window size for compatibility with BMN input.", + "type": "comment" + }, + "2606": { + "file_id": 201, + "content": " # before_id到after_id太长了,从里面取window_size帧,要先确定一个起始点,然后动作都要包住\n box0 = max(actions[-1]['end_id'] - bmn_window,\n before_id) #确定起始点\n box1 = min(actions[0]['start_id'],\n after_id - bmn_window) #确实起始点\n if box0 <= box1: # 一次检查\n if int(box0) - int(box1) == 0:\n cur_start = box0\n else:\n box0 = math.ceil(box0)\n box1 = int(box1)\n cur_start = random.randint(box0, box1)\n cur_end = cur_start + bmn_window\n cur_start = round(cur_start, 2)\n cur_end = round(cur_end, 2)\n name = '{}_{}_{}'.format(basename, cur_start, cur_end)\n annotations = []\n for action in actions:\n label = str(1.0 * action['label_ids'][0])\n label_name = action['label_names'][0]", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:135-154" + }, + "2607": { + "file_id": 201, + "content": "This code snippet is determining the start and end points for a segment of video data based on action IDs. It ensures that the segment contains the entire sequence of actions, with some randomness in selecting the starting point within the specified range. The selected segment will be used to create an instance of the TableTennis application.", + "type": "comment" + }, + "2608": { + "file_id": 201, + "content": " seg0 = 1.0 * round((action['start_id'] - cur_start),\n 2) #存储的是到开始位置(时间: s)的距离\n seg1 = 1.0 * round((action['end_id'] - cur_start), 2)\n annotations.append({\n 'segment': [seg0, seg1],\n 'label': label,\n 'label_name': label_name\n })\n gts_bmn[name] = {\n 'duration_second': duration_second,\n 'duration_frame': duration_frame,\n 'feature_frame': feature_frame,\n 'subset': mode,\n 'annotations': annotations\n }\n return gts_bmn\ndef save_feature_to_numpy(gts_bmn, folder):\n global fps\n print('save feature for bmn ...')\n if not os.path.exists(folder):\n os.mkdir(folder)\n process_gts_bmn = {}\n miss = 0\n for item, value in gts_bmn.items():\n # split to rsplit 针对文件命名修改", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:155-182" + }, + "2609": { + "file_id": 201, + "content": "The code segment defines a function that calculates segments of video data based on start and end IDs. It then appends the calculated segments, along with their corresponding labels and label names, to an 'annotations' list. The function returns a dictionary containing information about the duration, frame rate, feature frames, subset type, and annotations for a given dataset or model (in this case, named 'bmn'). 
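To make the annotation layout concrete, here is a small sketch of one annotation built the way the summary describes (window boundaries, times, and the label name are invented examples, not dataset values): each action's start and end are re-expressed relative to the sampled window start.

```python
cur_start, cur_end = 12.0, 20.0     # an 8-second BMN window (example values)
action = {"start_id": 13.4, "end_id": 15.1,
          "label_ids": [5.0], "label_names": ["hypothetical_action"]}

annotation = {
    "segment": [round(1.0 * (action["start_id"] - cur_start), 2),
                round(1.0 * (action["end_id"] - cur_start), 2)],
    "label": str(1.0 * action["label_ids"][0]),
    "label_name": action["label_names"][0],
}
print(annotation)   # {'segment': [1.4, 3.1], 'label': '5.0', 'label_name': 'hypothetical_action'}
```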
Additionally, the code defines another function that saves the calculated features to a specified folder if it doesn't exist, and handles any missing files.", + "type": "comment" + }, + "2610": { + "file_id": 201, + "content": " basename, start_id, end_id = item.rsplit('_', 2)\n if not basename in process_gts_bmn:\n process_gts_bmn[basename] = []\n process_gts_bmn[basename].append({\n 'name': item,\n 'start': float(start_id),\n 'end': float(end_id)\n })\n for item, values in process_gts_bmn.items():\n feat_path = os.path.join(feat_dir, item + '.pkl')\n feature_video = pickle.load(open(feat_path, 'rb'))['image_feature']\n for value in values:\n save_cut_name = os.path.join(folder, value['name'])\n a, b, c = save_cut_name.rsplit('_', 2)\n if float(b) > 360:\n print(b)\n start_frame = round(value['start'] * fps)\n end_frame = round(value['end'] * fps)\n if end_frame > len(feature_video):\n miss += 1\n continue\n feature_cut = [\n feature_video[i] for i in range(start_frame, end_frame)\n ]\n np_feature_cut = np.array(feature_cut, dtype=np.float32)", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:183-207" + }, + "2611": { + "file_id": 201, + "content": "The code is parsing video file names and extracting features from them. It then stores these features in a dictionary with corresponding start and end timestamps, and checks if any segments exceed the video length before saving the feature cut.", + "type": "comment" + }, + "2612": { + "file_id": 201, + "content": " np.save(save_cut_name, np_feature_cut)\n print('miss number (broken sample):', miss)\nif __name__ == \"__main__\":\n if not os.path.exists(out_dir):\n os.mkdir(out_dir)\n gts_bmn = {}\n for item, value in label_files.items():\n label_file = os.path.join(dataset, value)\n gts_data = json.load(open(label_file, 'rb'))\n gts_process = gen_gts_for_bmn(gts_data)\n gts_bmn = combile_gts(gts_bmn, gts_process, item)\n with open(out_dir + '/label.json', 'w', encoding='utf-8') as f:\n data = json.dumps(gts_bmn, indent=4, ensure_ascii=False)\n f.write(data)\n save_feature_to_numpy(gts_bmn, out_dir + '/feature')", + "type": "code", + "location": "/applications/TableTennis/get_instance_for_bmn.py:208-227" + }, + "2613": { + "file_id": 201, + "content": "The code is saving processed data for a table tennis dataset. It creates a dictionary 'gts_bmn' from json files, processes it using 'gen_gts_for_bmn', combines it with existing data in 'gts_bmn', and then saves it as 'label.json' and 'feature'. 
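The feature-cutting step quoted above reduces to converting the window's start and end times to frame indices at the clip fps and slicing the per-frame feature array. A hedged sketch with placeholder values (the fps, duration, and feature size here are examples, not the dataset's):

```python
import numpy as np

fps = 5                                                        # example frame rate
feature_video = np.random.rand(300, 2048).astype(np.float32)   # placeholder per-frame features

start_sec, end_sec = 12.0, 20.0                                # window boundaries in seconds
start_frame, end_frame = round(start_sec * fps), round(end_sec * fps)
if end_frame <= len(feature_video):                            # otherwise count it as a broken sample
    feature_cut = np.array(feature_video[start_frame:end_frame], dtype=np.float32)
    print(feature_cut.shape)                                   # (40, 2048)
```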
It also handles creating the output directory if necessary.", + "type": "comment" + }, + "2614": { + "file_id": 202, + "content": "/applications/TableTennis/gts_format_transfer.py", + "type": "filepath" + }, + "2615": { + "file_id": 202, + "content": "Code reads a JSON file, modifies its format, and writes it back as a new JSON file.", + "type": "summary" + }, + "2616": { + "file_id": 202, + "content": "import json\nwith open('/home/aistudio/work/BMN/Input_for_bmn/label_fixed.json') as f:\n data = json.load(f)\nf.close()\ntarget_format = {'taxonomy': None, 'database': data, 'version': None}\njsonString = json.dumps(target_format, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/work/BMN/Input_for_bmn/label_gts.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()", + "type": "code", + "location": "/applications/TableTennis/gts_format_transfer.py:1-12" + }, + "2617": { + "file_id": 202, + "content": "Code reads a JSON file, modifies its format, and writes it back as a new JSON file.", + "type": "comment" + }, + "2618": { + "file_id": 203, + "content": "/applications/TableTennis/predict/action_detect/action.py", + "type": "filepath" + }, + "2619": { + "file_id": 203, + "content": "The Python script uses Baidu Cloud for action detection and includes audio, image processing functions. It has classes like ActionDetection and ModelPredict to initialize models, extract features from video input, retrieve proposals using BMN, classify actions based on extracted features and proposals, and log debugging information. Results are stored in a JSON file.", + "type": "summary" + }, + "2620": { + "file_id": 203, + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport functools\nimport numpy as np\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport mfcc.feature_extractor as mfcc_extractor\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nimport models.bmn_infer as prop_model\nimport models.lstm_infer as classify_model\nimport logger\nlogger = logger.Logger()\ndef record_time_info(func):\n \"\"\"decorator func to log cost time for func\n \"\"\"\n @functools.wraps(func)\n def timer(*args):\n \"\"\"log cost time for func\n \"\"\"\n logger.info(\"function [{}] processing ...\".format(func.__name__))\n start_time = time.time()\n retval = func(*args)\n cost_time = round(time.time() - start_time, 5)\n logger.info(\"function [{}] run time: {:.2f} min\".format(\n func.__name__, cost_time / 60))\n return retval\n return timer\nclass ActionDetection(object):", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/action.py:1-48" + }, + "2621": { + "file_id": 203, + "content": "This code is a Python script for action detection using Baidu Cloud, which includes functions for processing audio and image data to predict actions. It utilizes various models such as mfcc_extractor, image_model, audio_model, prop_model, and classify_model. The ActionDetection class is defined, which likely contains the main logic of the action detection algorithm. 
The record_time_info function is a decorator used to log the time taken for executing specific functions.", + "type": "comment" + }, + "2622": { + "file_id": 203, + "content": " \"\"\"ModelPredict\"\"\"\n def __init__(self, cfg_file=\"configs/configs.yaml\"):\n cfg = parse_config(cfg_file)\n self.configs = cfg\n print_configs(self.configs, \"Infer\")\n name = 'COMMON'\n self.DEBUG = cfg[name]['DEBUG']\n self.BMN_ONLY = cfg[name]['BMN_ONLY']\n self.LSTM_ONLY = cfg[name]['LSTM_ONLY']\n self.PCM_ONLY = cfg[name]['PCM_ONLY']\n if self.LSTM_ONLY:\n self.prop_dict = {}\n for dataset in ['EuroCup2016']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(\n dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n self.prop_dict[basename] = item['bmn_results']\n @record_time_info\n def load_model(self):\n \"\"\"\n load_model\n \"\"\"\n if not self.DEBUG:", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/action.py:49-76" + }, + "2623": { + "file_id": 203, + "content": "This code defines a ModelPredict class with an initializer that reads configs from a specified file and prints them. It also checks certain conditions related to LSTM_ONLY, sets properties based on those conditions, and loads a model if not in DEBUG mode.", + "type": "comment" + }, + "2624": { + "file_id": 203, + "content": " self.image_model = image_model.InferModel(self.configs)\n if not self.PCM_ONLY:\n self.audio_model = audio_model.InferModel(self.configs)\n if not self.LSTM_ONLY:\n self.prop_model = prop_model.InferModel(self.configs)\n if not self.BMN_ONLY:\n self.classify_model = classify_model.InferModel(self.configs)\n logger.info(\"==> Action Detection prepared.\")\n @record_time_info\n def infer(self, imgs_path, pcm_path, fps=5):\n \"\"\"\n extract_feature\n \"\"\"\n self.imgs_path = imgs_path\n self.pcm_path = pcm_path\n self.configs['COMMON']['fps'] = fps\n logger.info(\"==> input video {}\".format(os.path.basename(\n self.imgs_path)))\n # step 1: extract feature\n video_features = self.extract_feature()\n # step2: get proposal\n bmn_results = self.extract_proposal(video_features)\n # step3: classify\n material = {'feature': video_features, 'proposal': bmn_results}", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/action.py:77-108" + }, + "2625": { + "file_id": 203, + "content": "The code initializes different models for image, audio, and proposal extraction, and a classifier. 
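A self-contained sketch of the timing-decorator pattern noted above (logger setup and the wrapped function are illustrative stand-ins, not the project's exact implementation):

```python
import functools
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("action_detect")

def record_time_info(func):
    """Log how long the wrapped function takes, as in the quoted decorator."""
    @functools.wraps(func)
    def timer(*args, **kwargs):
        logger.info("function [%s] processing ...", func.__name__)
        start = time.time()
        result = func(*args, **kwargs)
        logger.info("function [%s] run time: %.2f min", func.__name__,
                    (time.time() - start) / 60)
        return result
    return timer

@record_time_info
def slow_step():
    time.sleep(0.2)   # stand-in workload

slow_step()
```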
It then extracts features from the input video, retrieves proposals using BMN (Bidirectional Motion Model), and finally classifies the action based on these extracted features and proposals.", + "type": "comment" + }, + "2626": { + "file_id": 203, + "content": " action_results = self.video_classify(material)\n return bmn_results, action_results\n @record_time_info\n def video_classify(self, material):\n \"\"\"video classify\"\"\"\n if self.BMN_ONLY:\n return []\n action_results = self.classify_model.predict(self.configs,\n material=material)\n logger.info('action shape {}'.format(np.array(action_results).shape))\n return action_results\n @record_time_info\n def extract_proposal(self, video_features):\n \"\"\"extract proposal\"\"\"\n if self.LSTM_ONLY:\n basename = self.imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = self.prop_dict[basename]\n return bmn_results\n bmn_results = self.prop_model.predict(self.configs,\n material=video_features)\n logger.info('proposal shape {}'.format(np.array(bmn_results).shape))\n return bmn_results\n @record_time_info\n def extract_feature(self):", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/action.py:109-136" + }, + "2627": { + "file_id": 203, + "content": "This code defines several methods for video classification and feature extraction. It uses a model called \"classify_model\" to predict actions based on input material, and another model called \"prop_model\" to extract proposals. The BMN_ONLY and LSTM_ONLY flags determine if certain models are used or not. The code also includes logging for debugging purposes.", + "type": "comment" + }, + "2628": { + "file_id": 203, + "content": " \"\"\"extract feature\"\"\"\n if not self.DEBUG:\n image_path_list = get_images(self.imgs_path)\n self.configs['PPTSM']['frame_list'] = image_path_list\n self.configs['AUDIO']['pcm_file'] = self.pcm_path\n image_features = self.image_model.predict(self.configs)\n if self.PCM_ONLY:\n sample_rate = self.configs['AUDIO']['sample_rate']\n pcm_features = mfcc_extractor.extract_pcm(\n self.pcm_path, sample_rate)\n audio_features = []\n else:\n audio_features, pcm_features = self.audio_model.predict(\n self.configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n video_features = {\n 'image_feature': np_image_features,\n 'audio_feature': np_audio_features,", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/action.py:137-158" + }, + "2629": { + "file_id": 203, + "content": "Extracts features from images and audio in a video file for further processing. 
If PCM_ONLY is True, extracts only MFCC features from audio using mfcc_extractor.", + "type": "comment" + }, + "2630": { + "file_id": 203, + "content": " 'pcm_feature': np_pcm_features\n }\n else:\n feature_path = self.imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n logger.info(\"feature shape {} {} {}\".format(\n video_features['image_feature'].shape,\n video_features['audio_feature'].shape,\n video_features['pcm_feature'].shape))\n return video_features\nif __name__ == '__main__':\n model_predict = ActionDetection(cfg_file=\"../configs/configs.yaml\")\n model_predict.load_model()\n imgs_path = \"/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895\"\n pcm_path = \"/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm\"\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results = {'bmn_results': bmn_results, 'action_results': action_results}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/action.py:159-185" + }, + "2631": { + "file_id": 203, + "content": "The code loads video features from frames or pcm file and returns the features in the form of a dictionary. It then proceeds to initialize an instance of the ActionDetection class, load the model, define image and audio paths, and finally calls the infer function to generate bmn_results and action_results which are stored in the results dictionary and saved into a json file.", + "type": "comment" + }, + "2632": { + "file_id": 203, + "content": " f.write(data)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/action.py:186-186" + }, + "2633": { + "file_id": 203, + "content": "Writes data to file.", + "type": "comment" + }, + "2634": { + "file_id": 204, + "content": "/applications/TableTennis/predict/action_detect/logger.py", + "type": "filepath" + }, + "2635": { + "file_id": 204, + "content": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler to log action detection information into \"action_detect.log\" in the 'logs' directory. The logging level is set to INFO, which will log informational messages and above (DEBUG, WARNING, ERROR, CRITICAL).", + "type": "summary" + }, + "2636": { + "file_id": 204, + "content": "\"\"\"\nlogger\n\"\"\"\nimport os\nimport logging\nclass Logger(logging.Logger):\n \"\"\"Customized logger for news stripper\n \"\"\"\n def __init__(self):\n super(Logger, self).__init__(self)\n if not os.path.exists('logs'):\n os.mkdir('logs')\n handler = logging.FileHandler(\"logs/action_detect.log\")\n # handler.setLevel(logging.DEBUG)\n handler.setLevel(logging.INFO)\n format = \"%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s\"\n datefmt = \"%y-%m-%d %H:%M:%S\"\n formatter = logging.Formatter(format, datefmt)\n handler.setFormatter(formatter)\n self.addHandler(handler)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/logger.py:1-24" + }, + "2637": { + "file_id": 204, + "content": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler to log action detection information into \"action_detect.log\" in the 'logs' directory. 
The logging level is set to INFO, which will log informational messages and above (DEBUG, WARNING, ERROR, CRITICAL).", + "type": "comment" + }, + "2638": { + "file_id": 205, + "content": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py", + "type": "filepath" + }, + "2639": { + "file_id": 205, + "content": "The code extracts audio features for Table Tennis prediction, using spectrogram and Mel scale transformation, and reads WAV files with VGG-16 model for MFCC and STFT feature extraction.", + "type": "summary" + }, + "2640": { + "file_id": 205, + "content": "\"\"\"\naudio feature extract\n\"\"\"\n# coding: utf-8\nimport os\nimport numpy as np\nimport pickle\nimport mfcc.vgg_params as vgg_params\nimport sys\ndef frame(data, window_length, hop_length):\n \"\"\"\n frame\n \"\"\"\n num_samples = data.shape[0]\n #print(\"window_length , hop_length\", window_length, hop_length)\n #print(\"num_sample = \", num_samples)\n num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))\n #print(\" num_frames = \", num_frames)\n shape = (num_frames, window_length) + data.shape[1:]\n #print(\" shape = \", shape)\n strides = (data.strides[0] * hop_length, ) + data.strides\n #print(\"data.strides = \", data.strides)\n #print(\"strides = \", strides)\n return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)\ndef periodic_hann(window_length):\n \"\"\"\n periodic_hann\n \"\"\"\n return 0.5 - (0.5 *\n np.cos(2 * np.pi / window_length * np.arange(window_length)))\ndef stft_magnitude(signal, fft_length, hop_length=None, window_length=None):\n \"\"\"\n stft_magnitude", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:1-39" + }, + "2641": { + "file_id": 205, + "content": "This code is for audio feature extraction in TableTennis application. It defines functions `frame`, `periodic_hann` and `stft_magnitude`. The `frame` function resizes the data array into frames with specified window length and hop length. The `periodic_hann` function generates a periodic Hann window for the STFT operation. Finally, `stft_magnitude` calculates the magnitude of the Short-Time Fourier Transform (STFT) of an audio signal.", + "type": "comment" + }, + "2642": { + "file_id": 205, + "content": " \"\"\"\n frames = frame(signal, window_length, hop_length)\n window = periodic_hann(window_length)\n windowed_frames = frames * window\n return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))\n_MEL_BREAK_FREQUENCY_HERTZ = 700.0\n_MEL_HIGH_FREQUENCY_Q = 1127.0\ndef hertz_to_mel(frequencies_hertz):\n \"\"\"\n hertz_to_mel\n \"\"\"\n return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz /\n _MEL_BREAK_FREQUENCY_HERTZ))\ndef spectrogram_to_mel_matrix(num_mel_bins=20,\n num_spectrogram_bins=129,\n audio_sample_rate=8000,\n lower_edge_hertz=125.0,\n upper_edge_hertz=3800.0):\n \"\"\"\n spectrogram_to_mel_matrix\n \"\"\"\n nyquist_hertz = audio_sample_rate / 2.\n if lower_edge_hertz >= upper_edge_hertz:\n raise ValueError(\"lower_edge_hertz %.1f >= upper_edge_hertz %.1f\" %\n (lower_edge_hertz, upper_edge_hertz))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:40-70" + }, + "2643": { + "file_id": 205, + "content": "The code defines functions for feature extraction and conversion of audio signals. The \"hertz_to_mel\" function converts frequencies from Hertz to Mel scale, which is used in psychoacoustics. 
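For reference, the Hz-to-mel mapping quoted above is a one-line formula; the constants below are copied from the quoted extractor, and the printed values are only approximate anchors for the band edges that appear later in vgg_params.

```python
import numpy as np

_MEL_BREAK_FREQUENCY_HERTZ = 700.0   # constants copied from the quoted extractor
_MEL_HIGH_FREQUENCY_Q = 1127.0

def hertz_to_mel(frequencies_hertz):
    return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)

# 125 Hz and 7500 Hz map to roughly 185 and 2774 mel
print(hertz_to_mel(np.array([125.0, 1000.0, 7500.0])))
```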
The \"spectrogram_to_mel_matrix\" function creates a mel-frequency cepstral coefficients (MFCC) matrix for audio spectrograms. It checks for lower and upper frequency edge validity and calculates Mel frequencies based on the provided parameters.", + "type": "comment" + }, + "2644": { + "file_id": 205, + "content": " spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz,\n num_spectrogram_bins)\n spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)\n band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),\n hertz_to_mel(upper_edge_hertz),\n num_mel_bins + 2)\n mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))\n for i in range(num_mel_bins):\n lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]\n lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /\n (center_mel - lower_edge_mel))\n upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /\n (upper_edge_mel - center_mel))\n mel_weights_matrix[:,\n i] = np.maximum(0.0,\n np.minimum(lower_slope, upper_slope))\n mel_weights_matrix[0, :] = 0.0\n return mel_weights_matrix\ndef log_mel_spectrogram(data,", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:71-91" + }, + "2645": { + "file_id": 205, + "content": "This code is performing Mel frequency cepstral coefficients (MFCC) feature extraction on audio data. It creates spectrogram bins in Hz, converts them to the mel scale, defines mel band edges, and computes the corresponding mel weights matrix. The function returns this matrix after setting the first row to zero. This process is commonly used for speech processing and analysis.", + "type": "comment" + }, + "2646": { + "file_id": 205, + "content": " audio_sample_rate=8000,\n log_offset=0.0,\n window_length_secs=0.025,\n hop_length_secs=0.010,\n **kwargs):\n \"\"\"\n log_mel_spectrogram\n \"\"\"\n window_length_samples = int(round(audio_sample_rate * window_length_secs))\n #print(\"audio_sample_rate = \", audio_sample_rate)\n #print(\"window_length_secs = \", window_length_secs)\n #print(\"window_length_sample \", window_length_samples)\n hop_length_samples = int(round(audio_sample_rate * hop_length_secs))\n #print(\"hop_length_samples \", hop_length_samples)\n fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0)))\n #print(\" fft_lengt = \", fft_length)\n spectrogram = stft_magnitude(data,\n fft_length=fft_length,\n hop_length=hop_length_samples,\n window_length=window_length_samples)\n #print(\" spectrogram.shape = \", spectrogram.shape)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:92-112" + }, + "2647": { + "file_id": 205, + "content": "This code defines a function called `log_mel_spectrogram` which takes audio data, sample rate, and optional keyword arguments. It calculates window length in samples, hop length in samples, FFT length, and then uses the Short-Time Fourier Transform (STFT) to generate a spectrogram from the input audio data. 
The resulting spectrogram is stored in the `spectrogram` variable and its shape is printed for debugging or reference purposes.", + "type": "comment" + }, + "2648": { + "file_id": 205, + "content": " mel_spectrogram = np.dot(\n spectrogram,\n spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1],\n audio_sample_rate=audio_sample_rate,\n **kwargs))\n return np.log(mel_spectrogram + log_offset)\ndef wav_to_example(wav_data, sample_rate):\n \"\"\"\n wav_to_example\n \"\"\"\n #sample_rate, wav_data = wavfile.read(wav_file)\n assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype\n #wav_data = wav_data[:16000*30]\n #print(\" wav_data \", wav_data.shape)\n #print(\" wav_data \", wav_data.shape)\n pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS -\n vgg_params.STFT_HOP_LENGTH_SECONDS))\n wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num)))\n wav_data = wav_data_extend\n #print(\" wav_data \", wav_data.shape)\n wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0]\n #print(\" wav_data after convert to -1 1\", wav_data)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:113-137" + }, + "2649": { + "file_id": 205, + "content": "Function `spectrogram_to_mel_matrix` converts the spectrogram to Mel scale. Code calculates Mel spectrogram by taking dot product of spectrogram with `spectrogram_to_mel_matrix`. The result is then log transformed to avoid numerical underflow and returned.\nThe function `wav_to_example` takes wav file data, validates sample type, pads zeros to achieve desired window length, scales the wav data to range -1 to 1 by dividing by 32768.0. It is used for audio feature extraction in TableTennis application of PaddleVideo.", + "type": "comment" + }, + "2650": { + "file_id": 205, + "content": " #if wav_data.shape[0] > max_second * sample_rate:\n # wav_data = wav_data[:max_second * sample_rate, :]\n if len(wav_data.shape) > 1:\n wav_data = np.mean(wav_data, axis=1)\n #print(\" wav_data after mean\", wav_data.shape, len(wav_data.shape), wav_data)\n # Resample to the rate assumed by vgg.\n #if sample_rate != vgg_params.SAMPLE_RATE:\n # wav_data = resampy.resample(wav_data, sample_rate, vgg_params.SAMPLE_RATE)\n log_mel = log_mel_spectrogram(\n wav_data,\n audio_sample_rate=vgg_params.SAMPLE_RATE,\n log_offset=vgg_params.LOG_OFFSET,\n window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS,\n hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS,\n num_mel_bins=vgg_params.NUM_MEL_BINS,\n lower_edge_hertz=vgg_params.MEL_MIN_HZ,\n upper_edge_hertz=vgg_params.MEL_MAX_HZ)\n # Frame features into examples.\n features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS\n example_window_length = int(\n round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:138-158" + }, + "2651": { + "file_id": 205, + "content": "This code extracts audio features for Table Tennis prediction. It first reshapes and resamples the input wav_data if necessary, then calculates log mel spectrogram from wav_data using given parameters. 
Finally, it frames these features into examples with a specific window length.", + "type": "comment" + }, + "2652": { + "file_id": 205, + "content": " example_hop_length = int(\n round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate))\n log_mel_examples = frame(log_mel,\n window_length=example_window_length,\n hop_length=example_hop_length)\n return log_mel_examples\ndef extract_pcm(pcm_file, sample_rate):\n with open(pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = wav_to_example(audio_data, sample_rate)\n return examples\nif __name__ == \"__main__\":\n wav_file = sys.argv[1]\n print(\"wav_file = \", wav_file)\n with open(wav_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples_batch = wav_to_example(audio_data, 16000)\n print(\"examples_batch.shape\", examples_batch.shape)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:160-183" + }, + "2653": { + "file_id": 205, + "content": "This code extracts audio features from a WAV file using the VGG-16 model, specifically focusing on MFCC (Mel Frequency Cepstral Coefficients) and STFT (Short-Time Fourier Transform). The code also defines a function to convert PCM data into examples and another to extract MFCC features. Lastly, it demonstrates how to use the code by reading a WAV file and printing its shape.", + "type": "comment" + }, + "2654": { + "file_id": 206, + "content": "/applications/TableTennis/predict/action_detect/mfcc/model_config.py", + "type": "filepath" + }, + "2655": { + "file_id": 206, + "content": "The ModelAudio class extracts audio features using wav_to_example and slices the data into parts, calculating features for each part. The predict method appends these features to a list and returns the audio feature list after dividing by sample rate.", + "type": "summary" + }, + "2656": { + "file_id": 206, + "content": "\"\"\"\naudio model config\n\"\"\"\nimport numpy as np\nimport mfcc.feature_extractor as feature_extractor\nclass ModelAudio(object):\n \"\"\"\n modelAudio\n \"\"\"\n def __init__(self, configs, use_gpu=1):\n self.use_gpu = use_gpu\n self.audio_fps = configs.COMMON.fps\n self.audio_feat_scale = configs.TSN.audio_scale\n self.sample_rate = 16000\n def predict_slice(self, wav_data, sample_rate):\n \"\"\"\n audio predict\n \"\"\"\n examples_batch = feature_extractor.wav_to_example(\n wav_data, sample_rate)[0]\n return examples_batch\n def predict_audio(self, audio_file):\n \"\"\"\n predict_audio\n \"\"\"\n audio_feature_list = []\n # read pcm\n sample_rate = self.sample_rate\n try:\n with open(audio_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n audio_status = \"audio load success\"\n except Exception as e:\n audio_data = []\n audio_status = \"audio load failed\"", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:1-42" + }, + "2657": { + "file_id": 206, + "content": "The code defines a ModelAudio class which takes in audio-related configurations and performs audio feature extraction using the feature_extractor module's wav_to_example function. 
The class also predicts audio by converting PCM data to numpy array and handles audio file reading exceptions.", + "type": "comment" + }, + "2658": { + "file_id": 206, + "content": " step = 1\n len_video = int(len(audio_data) / sample_rate)\n print(len_video)\n for i in range(0, len_video, step):\n audio_data_part = audio_data[i * sample_rate:(i + step) *\n sample_rate]\n feature_audio = self.predict_slice(audio_data_part, sample_rate)\n audio_feature_list.append(feature_audio)\n return audio_feature_list", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:43-51" + }, + "2659": { + "file_id": 206, + "content": "The code slices the audio data into parts of size 'step' and calculates features for each part using a predict method, then appends the features to a list. The length of the entire audio data is divided by the sample rate to determine how many steps can fit in it. This function returns the audio feature list.", + "type": "comment" + }, + "2660": { + "file_id": 207, + "content": "/applications/TableTennis/predict/action_detect/mfcc/vgg_params.py", + "type": "filepath" + }, + "2661": { + "file_id": 207, + "content": "The code defines global parameters for the VGGish model, including architectural constants, hyperparameters, and optimizer settings. It extracts audio features from spectrogram patches using PCA quantization and embedding processing, with options to adjust STFT window and hop lengths, mel frequency bins, and learning rate.", + "type": "summary" + }, + "2662": { + "file_id": 207, + "content": "\"\"\"Global parameters for the VGGish model.\nSee vggish_slim.py for more information.\n\"\"\"\n# Architectural constants.\nNUM_FRAMES = 50 # Frames in input mel-spectrogram patch.\nNUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch.\nEMBEDDING_SIZE = 128 # Size of embedding layer.\n# Hyperparameters used in feature and example generation.\nSAMPLE_RATE = 16000\nSTFT_WINDOW_LENGTH_SECONDS = 0.040\nSTFT_HOP_LENGTH_SECONDS = 0.020\nNUM_MEL_BINS = NUM_BANDS\nMEL_MIN_HZ = 125\nMEL_MAX_HZ = 7500\nLOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram.\nEXAMPLE_WINDOW_SECONDS = 1.00 # Each example contains 96 10ms frames\nEXAMPLE_HOP_SECONDS = 1.00 # with zero overlap.\n# Parameters used for embedding postprocessing.\nPCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'\nPCA_MEANS_NAME = 'pca_means'\nQUANTIZE_MIN_VAL = -2.0\nQUANTIZE_MAX_VAL = +2.0\n# Hyperparameters used in training.\nINIT_STDDEV = 0.01 # Standard deviation used to initialize weights.\nLEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer.", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:1-29" + }, + "2663": { + "file_id": 207, + "content": "This code sets global parameters for the VGGish model. It defines architectural constants, hyperparameters for feature and example generation, embedding postprocessing, and training. The VGGish model is used to extract audio features from spectrogram patches, with options for PCA-based quantization and embedding processing. 
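A hedged sketch of the per-second slicing described for predict_audio earlier in this chunk (the PCM buffer is a zero-filled placeholder; only the 16 kHz sample rate and 1 s step mirror the quoted code): the trailing partial second is dropped because len_video is truncated to whole seconds.

```python
import numpy as np

sample_rate = 16000
audio_data = np.zeros(16000 * 3 + 5000, dtype=np.int16)    # ~3.3 s of placeholder PCM

step = 1
len_video = int(len(audio_data) / sample_rate)             # whole seconds only -> 3
chunks = [audio_data[i * sample_rate:(i + step) * sample_rate]
          for i in range(0, len_video, step)]
print(len_video, [len(c) for c in chunks])                 # 3 [16000, 16000, 16000]
```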
Hyperparameters control the STFT window and hop lengths, the mel frequency bins, and the learning rate for the Adam optimizer.", + "type": "comment" + }, + "2664": { + "file_id": 207, + "content": "ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer.\n# Names of ops, tensors, and features.\nINPUT_OP_NAME = 'vggish/input_features'\nINPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'\nOUTPUT_OP_NAME = 'vggish/embedding'\nOUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'\nAUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'", + "type": "code", + "location": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:30-37" + }, + "2665": { + "file_id": 207, + "content": "This code sets the Adam optimizer's epsilon value to 1e-8, defines names for input and output operations, tensors, and features. It also assigns the name \"audio_embedding\" to a feature.", + "type": "comment" + }, + "2666": { + "file_id": 208, + "content": "/applications/TableTennis/predict/action_detect/models/audio_infer.py", + "type": "filepath" + }, + "2667": { + "file_id": 208, + "content": "The code defines an \"InferModel\" class for audio inference in PaddleVideo, runs prediction on a PCM audio file, and prints the output shape, the first value, and the time taken. A direct infer call on random placeholder data is left commented out in the demo.", + "type": "summary" + }, + "2668": { + "file_id": 208, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"audio infer\"\"\"\n def __init__(self, cfg, name='AUDIO'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/audio_infer.py:1-37" + }, + "2669": { + "file_id": 208, + "content": "The code above defines a class \"InferModel\" for audio inference. It initializes the model by setting the model file, parameters file, GPU memory, and device ID from a configuration file. It enables IR optimization and memory optimization, and switches off the feed/fetch ops so that zero-copy input/output is used.
Finally, it creates a predictor object and retrieves the input handle for the first input name.", + "type": "comment" + }, + "2670": { + "file_id": 208, + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n pcm_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = np.array(data, dtype='float32')\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n pcm_list.append(inputs)\n feature_values = np.vstack(feature_list)\n pcm_values = np.vstack(pcm_list)\n return feature_values, pcm_values\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/audio_infer.py:39-67" + }, + "2671": { + "file_id": 208, + "content": "This code defines a class for audio inference built on the Paddle Inference API. It has an `infer` method that takes input data and returns the output after running inference, and a `predict` method that loops over the data yielded by the reader, performs inference for each batch, and returns the feature values and PCM values as stacked arrays. The main part of the code initializes an instance of this class from a configuration file (configs.yaml), which is then used for further processing.", + "type": "comment" + }, + "2672": { + "file_id": 208, + "content": " pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm'\n t0 = time.time()\n cfg['AUDIO']['pcm_file'] = pcm_path\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print(outputs[0])\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/audio_infer.py:69-78" + }, + "2673": { + "file_id": 208, + "content": "This code sets the PCM file path in the config, runs prediction, and prints the output shape, the first output value, and the time taken for the process. A direct infer call on a random input array is left commented out as a reference.", + "type": "comment" + }, + "2674": { + "file_id": 209, + "content": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py", + "type": "filepath" + }, + "2675": { + "file_id": 209, + "content": "The \"InferModel\" class runs GPU-accelerated BMN inference to generate action boundary proposals in videos and score them. 
It calculates running averages of predictions, predicts video features, and saves results as proposals in 'results.json'.", + "type": "summary" + }, + "2676": { + "file_id": 209, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import process_proposal\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"bmn infer\"\"\"\n def __init__(self, cfg, name='BMN'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.nms_thread = cfg[name]['nms_thread']\n self.min_pred_score = cfg[name]['score_thread']\n self.min_frame_thread = cfg['COMMON']['fps']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:1-39" + }, + "2677": { + "file_id": 209, + "content": "The code defines a class called \"InferModel\" which implements the bmn infer function. It initializes the model with specified configuration and enables GPU usage if available. The class takes in a config file that specifies model, parameters files, GPU memory, device ID, thread count for nms, minimum prediction score threshold, and frame thread count. The code also switches on IR optimizations and enables memory optimization for efficient execution of the model.", + "type": "comment" + }, + "2678": { + "file_id": 209, + "content": " config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n self.output3_tensor = self.predictor.get_output_handle(output_names[2])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n output3 = self.output3_tensor.copy_to_cpu()\n return output1, output2, output3\n def generate_props(self,\n pred_bmn,\n pred_start,\n pred_end,\n max_window=200,\n min_window=5):", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:40-65" + }, + "2679": { + "file_id": 209, + "content": "The code initializes a predictor, sets input and output tensors for inference, and defines an \"infer\" method to perform inference. 
The \"generate_props\" function takes predictions, start and end timestamps, and generates properties based on the given parameters.", + "type": "comment" + }, + "2680": { + "file_id": 209, + "content": " \"\"\"generate_props\"\"\"\n video_len = min(pred_bmn.shape[-1],\n min(pred_start.shape[-1], pred_end.shape[-1]))\n pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :]\n start_mask = self.boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = self.boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_results = []\n for idx in range(min_window, max_window):\n for jdx in range(video_len):\n start_index = jdx\n end_index = start_index + idx\n if end_index < video_len and start_mask[\n start_index] == 1 and end_mask[end_index] == 1:\n xmin = start_index\n xmax = end_index\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]\n bmn_score = pred_bmn[idx, jdx]\n conf_score = xmin_score * xmax_score * bmn_score\n score_results.append([xmin, xmax, conf_score])", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:66-87" + }, + "2681": { + "file_id": 209, + "content": "This code generates propositions for action boundaries in a video. It iterates through possible window sizes to find valid start and end indices, checks if start and end masks match, then computes the confidence score based on boundary scores and BNM score. The results are stored as a list of [xmin, xmax, confidence] values.", + "type": "comment" + }, + "2682": { + "file_id": 209, + "content": " return score_results\n def boundary_choose(self, score_list):\n \"\"\"boundary_choose\"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name,\n 'infer',\n infer_config,\n material=material)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[0] for items in data]\n winds = [items[1] for items in data]", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:88-112" + }, + "2683": { + "file_id": 209, + "content": "The code defines three functions: \"action_detect.models.bmn_infer.py\" contains the \"score_results\", \"boundary_choose\", and \"predict\" functions. The \"score_results\" function returns a list of scores for each action. The \"boundary_choose\" function determines boundary scores based on peak, front, and back scores. It uses masks to identify relevant positions in the score list. 
Finally, the \"predict\" function initializes an infer reader, iterates through data, and gathers input data for prediction.", + "type": "comment" + }, + "2684": { + "file_id": 209, + "content": " feat_info = [items[2] for items in data]\n feature_T = feat_info[0][0]\n feature_N = feat_info[0][1]\n inputs = np.array(inputs)\n pred_bmn, pred_sta, pred_end = self.infer(inputs)\n if infer_iter == 0:\n sum_pred_bmn = np.zeros((2, feature_N, feature_T))\n sum_pred_sta = np.zeros((feature_T, ))\n sum_pred_end = np.zeros((feature_T, ))\n sum_pred_cnt = np.zeros((feature_T, ))\n for idx, sub_wind in enumerate(winds):\n sum_pred_bmn[:, :, sub_wind[0]:sub_wind[1]] += pred_bmn[idx]\n sum_pred_sta[sub_wind[0]:sub_wind[1]] += pred_sta[idx]\n sum_pred_end[sub_wind[0]:sub_wind[1]] += pred_end[idx]\n sum_pred_cnt[sub_wind[0]:sub_wind[1]] += np.ones(\n (sub_wind[1] - sub_wind[0], ))\n pred_bmn = sum_pred_bmn / sum_pred_cnt\n pred_sta = sum_pred_sta / sum_pred_cnt\n pred_end = sum_pred_end / sum_pred_cnt", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:113-135" + }, + "2685": { + "file_id": 209, + "content": "This code performs a running average of predictions from a series of windows. It calculates the sum of each prediction for each window, divides it by the count of non-zero frames in that window, and stores the results in `sum_pred_bmn`, `sum_pred_sta`, and `sum_pred_end`. Finally, it divides these sums by the corresponding counts to get the final predictions.", + "type": "comment" + }, + "2686": { + "file_id": 209, + "content": " score_result = self.generate_props(pred_bmn, pred_sta, pred_end)\n results = process_proposal(score_result, self.min_frame_thread,\n self.nms_thread, self.min_pred_score)\n return results\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n t0 = time.time()\n outputs = model.predict(cfg, video_features)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n results = {'proposal': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:137-164" + }, + "2687": { + "file_id": 209, + "content": "The code initializes an instance of the InferModel class from the given configuration file. It then predicts the video features by calling the model's predict method, passing the video features as input and returns the results in the form of proposals. The output is then saved to a JSON file named 'results.json'.", + "type": "comment" + }, + "2688": { + "file_id": 210, + "content": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py", + "type": "filepath" + }, + "2689": { + "file_id": 210, + "content": "This code initializes a Table Tennis action detection model using LSTM, loads configurations, and processes proposals for multiple datasets. 
It applies inference, predicts actions on video features, sorts predictions, and saves results in JSON format.", + "type": "summary" + }, + "2690": { + "file_id": 210, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import get_action_result\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.topk = cfg[name]['topk']\n self.frame_offset = cfg[name]['nms_offset']\n self.nms_thread = cfg[name]['nms_thread']\n self.cls_thread = cfg[name]['classify_score_thread']\n self.iou_thread = cfg[name]['iou_score_thread']\n self.label_map_file = cfg['COMMON']['label_dic']\n self.fps = cfg['COMMON']['fps']\n self.nms_id = 5", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:1-38" + }, + "2691": { + "file_id": 210, + "content": "This code defines a class named InferModel that implements an LSTM model for action detection. The model is initialized with configuration parameters, including the path to the model and parameter files, GPU memory usage, and device ID. Additional configuration settings include topk, frame_offset, nms_thread, classify_score_thread, iou_score_thread, label_dic, fps, and nms_id. These parameters control various aspects of the action detection process. The code imports necessary libraries and modules for preprocessing, config utilities, result processing, and model loading from PaddlePaddle.", + "type": "comment" + }, + "2692": { + "file_id": 210, + "content": " # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input1_tensor = self.predictor.get_input_handle(input_names[0])\n self.input2_tensor = self.predictor.get_input_handle(input_names[1])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None):\n \"\"\"infer\"\"\"\n self.input1_tensor.copy_from_cpu(input1_arr)\n self.input1_tensor.set_lod(input1_lod)\n if not input2_arr is None:\n self.input2_tensor.copy_from_cpu(input2_arr)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:40-62" + }, + "2693": { + "file_id": 210, + "content": "Initializes model and sets up input/output tensors for inferencing.", + "type": "comment" + }, + "2694": { + "file_id": 210, + "content": " self.input2_tensor.set_lod(input2_lod)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n # print(output.shape)\n return output1, output2\n def pre_process(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n 
for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n # print(input_arr.shape)\n # print([input_lod])\n return input_arr, [input_lod]\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name,\n 'infer',\n infer_config,\n material=material)\n results = []", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:63-92" + }, + "2695": { + "file_id": 210, + "content": "The code is part of a model for action detection in Table Tennis. It sets the input's layout of dimension (LOD) and performs preprocessing, prediction, and returns output results. The LOD defines the shape of data along the spatial dimensions.", + "type": "comment" + }, + "2696": { + "file_id": 210, + "content": " for infer_iter, data in enumerate(infer_reader()):\n video_id = [[items[-2], items[-1]] for items in data]\n input1 = [items[0] for items in data]\n input2 = [items[1] for items in data]\n input1_arr, input1_lod = self.pre_process(input1)\n input2_arr, input2_lod = self.pre_process(input2)\n output1, output2 = self.infer(input1_arr, input1_lod, input2_arr,\n input2_lod)\n # output1, output2 = self.infer(input1_arr, input1_lod)\n predictions_id = output1\n predictions_iou = output2\n for i in range(len(predictions_id)):\n topk_inds = predictions_id[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds_id = predictions_id[i][topk_inds]\n preds_iou = predictions_iou[i][0]\n results.append((video_id[i], preds_id.tolist(),\n topk_inds.tolist(), preds_iou.tolist()))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:93-111" + }, + "2697": { + "file_id": 210, + "content": "This code iterates through a data source, preprocesses the input, and performs inference on it using a model. The resulting outputs are then sorted to obtain the top k predictions for each input. The video ID, predicted action IDs, sorted indices, and IOU scores are stored in a results list.", + "type": "comment" + }, + "2698": { + "file_id": 210, + "content": " predict_result = get_action_result(results, self.label_map_file,\n self.fps, self.cls_thread,\n self.iou_thread, self.nms_id,\n self.nms_thread, self.frame_offset)\n return predict_result\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n # proposal total\n prop_dict = {}\n for dataset in ['EuroCup2016', 'WorldCup2018']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(\n dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n prop_dict[basename] = item['bmn_results']\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:113-136" + }, + "2699": { + "file_id": 210, + "content": "The code is a part of a Table Tennis action detection model implemented using LSTM (Long Short-Term Memory). It loads configurations from a YAML file, initializes the model, and processes proposals for multiple datasets. 
The model takes results from previous processing steps, applies inference based on labels, frame rate, and other parameters, and returns the final prediction result.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/27.json b/docs/data/27.json new file mode 100644 index 000000000..3a5096194 --- /dev/null +++ b/docs/data/27.json @@ -0,0 +1,540 @@ +{ + "2700": { + "file_id": 210, + "content": " # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n # proposal\n basename = imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = prop_dict[basename]\n material = {'feature': video_features, 'proposal': bmn_results}\n t0 = time.time()\n outputs = model.predict(cfg, material)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n # print(outputs.shape)\n t1 = time.time()\n results = {'actions': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:138-158" + }, + "2701": { + "file_id": 210, + "content": "This code loads video features and proposals, then predicts action using the LSTM model. The results are saved in a JSON file and the time taken is printed.", + "type": "comment" + }, + "2702": { + "file_id": 211, + "content": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py", + "type": "filepath" + }, + "2703": { + "file_id": 211, + "content": "This code initializes an InferModel class for \"PPTSM\" model inference using PaddlePaddle and GPU, performs inference, predicts feature lists from inputs, retrieves image files, assigns them to the model, prints output shapes, calculates prediction time.", + "type": "summary" + }, + "2704": { + "file_id": 211, + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"pptsm infer\"\"\"\n def __init__(self, cfg, name='PPTSM'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:1-38" + }, + "2705": { + "file_id": 211, + "content": "This code initializes an instance of the InferModel class for a specific model named \"PPTSM\". It takes in a configuration file (cfg) and sets up the necessary parameters for the model's inference process. 
The class uses PaddlePaddle library to create a predictor, which handles input and output data processing and enables GPU memory optimization for faster computation.", + "type": "comment" + }, + "2706": { + "file_id": 211, + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[:-1] for items in data]\n inputs = np.array(inputs)\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n feature_list = np.vstack(feature_list)\n return feature_list\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/'", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:40-68" + }, + "2707": { + "file_id": 211, + "content": "This code is for an InferModel class that performs inference on video frames using a pre-trained model. It uses the get_output_names and get_output_handle methods from the predictor to specify the desired output tensor. The infer method takes input data, runs the inference, and returns the output tensor. The predict method reads input data from a specified directory or config file, applies inference on frames, and returns a feature list.", + "type": "comment" + }, + "2708": { + "file_id": 211, + "content": " imgs_list = get_images(imgs_path)\n t0 = time.time()\n cfg['PPTSM']['frame_list'] = imgs_list\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print('cost time = {} min'.format((t1 - t0) / 60.0))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:69-77" + }, + "2709": { + "file_id": 211, + "content": "This code retrieves image files from a specified path, assigns them to a model for inference and prints the resulting shape of the outputs. It also calculates and displays the time taken for the prediction process.", + "type": "comment" + }, + "2710": { + "file_id": 212, + "content": "/applications/TableTennis/predict/action_detect/reader/__init__.py", + "type": "filepath" + }, + "2711": { + "file_id": 212, + "content": "This code imports various reader classes and registers them using the regist_reader function. It sorts the registrations alphabetically. The BMNINFReader reads data from files with a \"BMN\" extension, while FeatureReader reads action data. 
No readers are registered for TSM or PPTSM in this version of the code.", + "type": "summary" + }, + "2712": { + "file_id": 212, + "content": "\"\"\"\nread map for model\n\"\"\"\nfrom reader.reader_utils import regist_reader, get_reader\n# import reader.tsminf_reader as tsminf_reader\n# import reader.audio_reader as audio_reader\nimport reader.bmninf_reader as bmninf_reader\nimport reader.feature_reader as feature_reader\n# regist reader, sort by alphabet\n# regist_reader(\"TSM\", tsminf_reader.TSMINFReader)\n# regist_reader(\"PPTSM\", tsminf_reader.TSMINFReader)\n# regist_reader(\"AUDIO\", audio_reader.AudioReader)\nregist_reader(\"BMN\", bmninf_reader.BMNINFReader)\nregist_reader(\"ACTION\", feature_reader.FeatureReader)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/__init__.py:1-15" + }, + "2713": { + "file_id": 212, + "content": "This code imports various reader classes and registers them using the regist_reader function. It sorts the registrations alphabetically. The BMNINFReader reads data from files with a \"BMN\" extension, while FeatureReader reads action data. No readers are registered for TSM or PPTSM in this version of the code.", + "type": "comment" + }, + "2714": { + "file_id": 213, + "content": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py", + "type": "filepath" + }, + "2715": { + "file_id": 213, + "content": "The code introduces the BMNINFReader class for data reading in PaddleVideo's TableTennis app, initializes a table tennis action detection class, creates a dataset, and defines an inference reader function.", + "type": "summary" + }, + "2716": { + "file_id": 213, + "content": "\"\"\"\n# @File : bmninf_reader.py\n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport os\nimport random\nimport pickle\nimport json\nimport numpy as np\nimport multiprocessing\nimport numpy as np\nfrom .reader_utils import DataReader\ndef get_sw_prop(duration, window=200, step=10):\n \"\"\"\n get_sw_prop\n \"\"\"\n pr = []\n local_boxes = []\n for k in np.arange(0, duration - window + step, step):\n start_id = k\n end_id = min(duration, k + window)\n if end_id - start_id < window:\n start_id = end_id - window\n local_boxes = (start_id, end_id)\n pr.append(local_boxes)\n def valid_proposal(duration, span):\n \"\"\"\n valid_proposal\n \"\"\"\n # fileter proposals\n # a valid proposal should have at least one second in the video\n real_span = min(duration, span[1]) - span[0]\n return real_span >= 1\n pr = list(filter(lambda x: valid_proposal(duration, x), pr))\n return pr\nclass BMNINFReader(DataReader):\n \"\"\"\n Data reader for BMN model, which was stored as features extracted by prior networks", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:1-49" + }, + "2717": { + "file_id": 213, + "content": "This code defines a class BMNINFReader for data reading in PaddleVideo's TableTennis application. 
It uses the get_sw_prop function to retrieve valid proposal spans, filters them based on having at least one second of video duration, and returns the filtered proposal list.", + "type": "comment" + }, + "2718": { + "file_id": 213, + "content": " dataset cfg: feat_path, feature path,\n tscale, temporal length of BM map,\n dscale, duration scale of BM map,\n anchor_xmin, anchor_xmax, the range of each point in the feature sequence,\n batch_size, batch size of input data,\n num_threads, number of threads of data processing\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.tscale = cfg[self.name.upper()]['tscale'] # 200\n self.dscale = cfg[self.name.upper()]['dscale'] # 200\n # self.subset = cfg[self.name.upper()]['subset']\n self.tgap = 1. / self.tscale\n self.step = cfg[self.name.upper()]['window_step']\n self.material = material\n src_feature = self.material\n image_feature = src_feature['image_feature']\n # pcm_feature = src_feature['pcm_feature']\n # pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n # print(rgb_feature.shape, audio_feature.shape, pcm_feature.shape)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:50-72" + }, + "2719": { + "file_id": 213, + "content": "This code is initializing a class that reads data from BMNINF files for table tennis action detection. The class takes in arguments like name, mode, configuration (cfg), and material. It sets the temporal length of BM map (tscale) and duration scale of BM map (dscale). It also calculates other values such as step size and uses them to reshape feature data.", + "type": "comment" + }, + "2720": { + "file_id": 213, + "content": " # min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n #if min_length == 0:\n # continue\n # image_feature = image_feature[:min_length, :]\n # pcm_feature = pcm_feature[:min_length, :]\n # self.features = np.concatenate((image_feature, pcm_feature), axis=1)\n self.features = image_feature\n self.duration = len(self.features)\n self.window = self.tscale\n self.get_dataset_dict()\n self.get_match_map()\n self.batch_size = cfg[self.name.upper()]['batch_size']\n if (mode == 'test') or (mode == 'infer'):\n self.num_threads = 1 # set num_threads as 1 for test and infer\n def get_dataset_dict(self):\n \"\"\"\n get_dataset_dict\n \"\"\"\n self.video_list = get_sw_prop(self.duration, self.window, self.step)\n def get_match_map(self):\n \"\"\"\n get_match_map\n \"\"\"\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:73-103" + }, + "2721": { + "file_id": 213, + "content": "This code is creating a dataset for video analysis. It concatenates image and audio features, sets the duration, window size, and retrieves the list of videos to be analyzed in the dataset. 
The code also handles test and infer modes by setting the number of threads accordingly.", + "type": "comment" + }, + "2722": { + "file_id": 213, + "content": " for jdx in range(1, self.tscale + 1):\n xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)\n match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n self.match_map = match_map\n self.anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n def load_file(self, video_wind):\n \"\"\"\n load_file\n \"\"\"\n start_feat_id = video_wind[0]\n end_feat_id = video_wind[1]\n video_feat = self.features[video_wind[0]:video_wind[1]]\n video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n return video_feat\n def create_reader(self):\n \"\"\"\n reader creator for ctcn model\n \"\"\"\n return self.make_infer_reader()\n def make_infer_reader(self):\n \"\"\"", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:104-133" + }, + "2723": { + "file_id": 213, + "content": "The code defines a class with methods to load video features, create a reader for ctcn model inference, and define match_map which seems to be related to table tennis action detection. The load_file method takes a window of feature ids and loads the corresponding video features. The create_reader method returns an inferential reader for the ctcn model. The make_infer_reader method is used to create the reader object.", + "type": "comment" + }, + "2724": { + "file_id": 213, + "content": " reader for inference\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n # for video_name in self.video_list:\n for video_wind in self.video_list:\n video_idx = self.video_list.index(video_wind)\n video_feat = self.load_file(video_wind)\n batch_out.append(\n (video_feat, video_wind, [self.duration, self.dscale]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:134-154" + }, + "2725": { + "file_id": 213, + "content": "This code defines a reader function for inference that iterates over video files and appends data to a batch. It yields the batches when their size reaches the specified batch_size, and at the end of iteration if there's still remaining data.", + "type": "comment" + }, + "2726": { + "file_id": 214, + "content": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py", + "type": "filepath" + }, + "2727": { + "file_id": 214, + "content": "This code reads the YouTube-8M dataset, featuring three models (LSTM, attention cluster, nextVlad), and is used for table tennis action detection. It uses cPickle and numpy, and a feature reader initializes for training or inference batches, extracting image, audio, and pcm features.", + "type": "summary" + }, + "2728": { + "file_id": 214, + "content": "\"\"\"\nattention-lstm feature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:1-34" + }, + "2729": { + "file_id": 214, + "content": "This code is a data reader for the YouTube-8M dataset, which contains features extracted by prior networks for three models: LSTM, attention cluster, and nextVlad. It uses cPickle to load data from storage and numpy for numerical operations.", + "type": "comment" + }, + "2730": { + "file_id": 214, + "content": " dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.feature = material['feature']\n self.proposal = material['proposal']\n self.fps = 5\n def create_reader(self):\n \"\"\"\n create_reader\n \"\"\"\n image_feature_list = self.feature['image_feature']\n audio_feature_list = self.feature['audio_feature']\n pcm_feature_list = self.feature['pcm_feature']\n pcm_feature_list = pcm_feature_list.reshape(\n (pcm_feature_list.shape[0] * 5, 640))\n fl = self.proposal\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n for prop_info in fl:\n start_id = int(prop_info['start'])", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:36-71" + }, + "2731": { + "file_id": 214, + "content": "This code initializes a feature reader for table tennis action detection. It takes in parameters such as name, mode, configuration, and material. The reader creates lists of image, audio, and pcm features, reshapes the pcm_feature_list, and shuffles proposal list if in train mode. 
It then defines a reader function that iterates through proposal list to create batches for training or inference.", + "type": "comment" + }, + "2732": { + "file_id": 214, + "content": " end_id = int(prop_info['end'])\n bmn_score = float(prop_info['score'])\n try:\n image_feature = image_feature_list[start_id:end_id]\n audio_feature = audio_feature_list[int(start_id / self.fps\n ):int(end_id /\n self.fps)]\n pcm_feature = pcm_feature_list[start_id:end_id]\n # image_feature = np.concatenate((image_feature, pcm_feature), axis=1)\n batch_out.append(\n (image_feature, audio_feature, 0, prop_info))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n continue\n return reader", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:72-91" + }, + "2733": { + "file_id": 214, + "content": "This code segment is part of a feature reader for Table Tennis action prediction. It extracts image, audio, and pcm features from their respective lists based on start and end IDs. If batch size is reached, it yields the batch and resets the batch.", + "type": "comment" + }, + "2734": { + "file_id": 215, + "content": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py", + "type": "filepath" + }, + "2735": { + "file_id": 215, + "content": "The code defines `ReaderZoo` class for handling errors in PaddleVideo TableTennis, allowing registration and retrieval of different types of readers using named parameters.", + "type": "summary" + }, + "2736": { + "file_id": 215, + "content": "\"\"\"\nreader_util\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nclass ReaderNotFoundError(Exception):\n \"\"\"\n \"Error: reader not found\"\n \"\"\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:1-33" + }, + "2737": { + "file_id": 215, + "content": "This Python script defines a ReaderNotFoundError exception and a class for handling reader-related errors in the PaddleVideo TableTennis application. 
It includes license information and allows for checking if a specified reader is available by comparing it to a list of available readers.", + "type": "comment" + }, + "2738": { + "file_id": 215, + "content": " for reader in self.avail_readers:\n msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"\n data reader for video input\n \"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"\n Not implemented\n \"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"\n get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n \"\"\"\n ReaderZoo\n \"\"\"\n def __init__(self):\n \"\"\"\n __init__\n \"\"\"\n self.reader_zoo = {}\n def regist(self, name, reader):\n \"\"\"\n regist\n \"\"\"\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg, material=None):", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:34-81" + }, + "2739": { + "file_id": 215, + "content": "This code defines a `DataReader` class and a `ReaderZoo` class. The `DataReader` class is a data reader for video input, with methods such as `create_reader`, which should be implemented but is currently empty, and `get_config_from_sec`, which gets configuration from a section in the given config file. The `ReaderZoo` class registers different types of readers based on their names and ensures they inherit from the `DataReader` class. The code also includes functionality to retrieve a reader given its name, mode, configuration, and optionally a material type.", + "type": "comment" + }, + "2740": { + "file_id": 215, + "content": " \"\"\"\n get\n \"\"\"\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg, material)\n raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo\nreader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n \"\"\"\n regist_reader\n \"\"\"\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg, material=None):\n \"\"\"\n get_reader\n \"\"\"\n reader_model = reader_zoo.get(name, mode, cfg, material)\n return reader_model.create_reader()", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:82-107" + }, + "2741": { + "file_id": 215, + "content": "This code defines a singleton reader_zoo, allows for registration of readers using the regist_reader() function, and retrieves the registered reader using get_reader() function. The reader instance is created by calling create_reader() on the retrieved reader model.", + "type": "comment" + }, + "2742": { + "file_id": 216, + "content": "/applications/TableTennis/predict/action_detect/utils/config_utils.py", + "type": "filepath" + }, + "2743": { + "file_id": 216, + "content": "The code is a part of PaddleVideo's TableTennis application, containing an AttrDict class and parse_config function for parsing YAML configuration files using yaml and ast libraries. It also imports the logger module for logging purposes, and logs a separator string to indicate context changes.", + "type": "summary" + }, + "2744": { + "file_id": 216, + "content": "\"\"\"\nconfig_utils\n\"\"\"\n# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport yaml\nimport ast\nimport logger\nlogger = logger.Logger()\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\nclass AttrDict(dict):\n \"\"\"\n AttrDict\n \"\"\"\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef parse_config(cfg_file):", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/config_utils.py:1-47" + }, + "2745": { + "file_id": 216, + "content": "The code is part of the PaddleVideo TableTennis application and contains a class called AttrDict that extends the Python dictionary functionality. The file also includes the parse_config function, which likely reads and parses configuration files. The code uses the yaml and ast libraries for processing configuration data in a format-agnostic manner. Additionally, it defines a list of configuration types (CONFIG_SECS) and utilizes the logger module for logging purposes.", + "type": "comment" + }, + "2746": { + "file_id": 216, + "content": " \"\"\"Load a config file into AttrDict\"\"\"\n import yaml\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef create_attr_dict(yaml_config):\n \"\"\"create_attr_dict\"\"\"\n for key, value in yaml_config.items():\n if isinstance(value, dict):\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = ast.literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\n return\ndef print_configs(cfg, mode):\n \"\"\"print_configs\"\"\"\n logger.info(\n \"---------------- {:>5} Arguments ----------------\".format(mode))\n for sec, sec_items in cfg.items():\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():\n logger.info(\" {}:{}\".format(k, v))", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/config_utils.py:48-80" + }, + "2747": { + "file_id": 216, + "content": "This code imports the yaml library and loads a configuration file into an AttrDict object, allowing for easier manipulation of nested dictionary data. 
It also includes functions to create an AttrDict from a string and print the configurations in a formatted manner.", + "type": "comment" + }, + "2748": { + "file_id": 216, + "content": " logger.info(\"-------------------------------------------------\")", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/config_utils.py:81-81" + }, + "2749": { + "file_id": 216, + "content": "This code snippet logs a separator string to the logger, indicating a change in context or section within the program.", + "type": "comment" + }, + "2750": { + "file_id": 217, + "content": "/applications/TableTennis/predict/action_detect/utils/preprocess.py", + "type": "filepath" + }, + "2751": { + "file_id": 217, + "content": "Code file \"preprocess.py\" contains four functions: \n1. ffmpeg_frames extracts frames from a video using ffmpeg and saves them as jpg files in a specified folder, at the specified frame rate.\n2. ffmpeg_pcm extracts audio from a video and saves it as a PCM file.\n3. ffmpeg_mp4 downloads a video file from a URL to the local machine.\n4. get_images retrieves all image files in a directory, sorts them, and stores their paths in a list.", + "type": "summary" + }, + "2752": { + "file_id": 217, + "content": "\"\"\" extract frames and pcm\"\"\"\nimport os\nimport sys\nimport shutil\ndef ffmpeg_frames(mp4_addr, frame_out_folder, fps=5):\n \"\"\"ffmpeg_frames\"\"\"\n if os.path.exists(frame_out_folder):\n shutil.rmtree(frame_out_folder)\n os.makedirs(frame_out_folder)\n cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (\n mp4_addr, fps, frame_out_folder, '%08d')\n os.system(cmd)\ndef ffmpeg_pcm(mp4_addr, save_file_name):\n \"\"\"ffmpeg_pcm\"\"\"\n cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \\\n % (mp4_addr, save_file_name)\n os.system(cmd)\ndef ffmpeg_mp4(mp4_url, mp4_addr):\n \"\"\"ffmpeg_mp4\"\"\"\n cmd = \"wget %s -O %s -q\" % (mp4_url, mp4_addr)\n print(\"cmd = \", cmd)\n os.system(cmd)\ndef get_images(image_path):\n \"\"\"get_images\"\"\"\n images = sorted(os.listdir(image_path))\n images = images\n images_path_list = [image_path + '/' + im for im in images]\n return images_path_list", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/preprocess.py:1-36" + }, + "2753": { + "file_id": 217, + "content": "Code file \"preprocess.py\" contains four functions: \n1. ffmpeg_frames extracts frames from a video using ffmpeg and saves them as jpg files in a specified folder, at the specified frame rate.\n2. ffmpeg_pcm extracts audio from a video and saves it as a PCM file.\n3. ffmpeg_mp4 downloads a video file from a URL to the local machine.\n4. get_images retrieves all image files in a directory, sorts them, and stores their paths in a list.", + "type": "comment" + }, + "2754": { + "file_id": 218, + "content": "/applications/TableTennis/predict/action_detect/utils/process_result.py", + "type": "filepath" + }, + "2755": { + "file_id": 218, + "content": "This function calculates video results by implementing one-dimensional non-maximal suppression, removes overlapping detections, and processes video properties. 
It takes in various parameters such as label map file, fps, score threshold, iou threshold, and frame offset.", + "type": "summary" + }, + "2756": { + "file_id": 218, + "content": "\"\"\"\n# @File : process_result.py\n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport sys\nimport os\nimport re\nimport numpy as np\nimport pickle\nimport json\nimport logger\nlogger = logger.Logger()\ndef get_data_res(label_map, data, topk):\n \"\"\"get_data_res\"\"\"\n sum_vid = len(data)\n video_result = []\n for i in range(sum_vid):\n vid_name = data[i][0][0]\n # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa\n feature_start_id = float(data[i][0][1]['start'])\n feature_end_id = float(data[i][0][1]['end'])\n feature_stage1_score = data[i][0][1]['score']\n predict_res = []\n for k in range(topk):\n score_top = data[i][1][k]\n labelid_top = data[i][2][k]\n label_iou = data[i][3]\n labelname_top = label_map[str(labelid_top)]\n video_result.append([\n feature_start_id, feature_end_id, labelid_top, labelname_top,\n score_top, label_iou\n ])\n return video_result", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/process_result.py:1-39" + }, + "2757": { + "file_id": 218, + "content": "This function takes in label_map, data, and topk as input arguments. It calculates the video result based on the given parameters and returns it. The video result is a list of lists where each sub-list contains the feature start ID, feature end ID, label ID, label name, score, and label IOU for each action detected in the video.", + "type": "comment" + }, + "2758": { + "file_id": 218, + "content": "def base_nms(bboxes, thresh, delta=0, nms_id=2):\n \"\"\"\n One-dimensional non-maximal suppression\n :param bboxes: [[vid, label, st, ed, score, ...], ...]\n :param thresh:\n :return:\n \"\"\"\n \"\"\"\n t1 = bboxes[:, 0]\n t2 = bboxes[:, 1]\n scores = bboxes[:, nms_id]\n \"\"\"\n t1 = np.array([max(0, x[0] - delta) for x in bboxes])\n t2 = np.array([x[1] + delta for x in bboxes])\n scores = np.array([x[nms_id] for x in bboxes])\n durations = t2 - t1\n order = scores.argsort()[::-1]\n keep = []\n while order.size > 0:\n i = order[0]\n keep.append(i)\n tt1 = np.maximum(t1[i], t1[order[1:]])\n tt2 = np.minimum(t2[i], t2[order[1:]])\n intersection = tt2 - tt1\n IoU = intersection / (durations[i] + durations[order[1:]] -\n intersection).astype(float)\n inds = np.where(IoU <= thresh)[0]\n order = order[inds + 1]\n return [bboxes[i] for i in keep]\ndef process_proposal(source_prop_box,\n min_frame_thread=5,", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/process_result.py:42-78" + }, + "2759": { + "file_id": 218, + "content": "This code implements one-dimensional non-maximal suppression, which performs non-overlapping detection on bounding boxes. The function takes in a list of bounding boxes and removes any overlapping detections with an Intersection over Union (IoU) threshold greater than the given threshold. 
The resulting list contains only the non-overlapping detections.", + "type": "comment" + }, + "2760": { + "file_id": 218, + "content": " nms_thresh=0.7,\n score_thresh=0.01):\n \"\"\"process_video_prop\"\"\"\n prop_box = []\n for items in source_prop_box:\n start_frame = float(items[0])\n end_frame = float(items[1])\n score = float(items[2])\n if end_frame - start_frame < min_frame_thread or score < score_thresh:\n continue\n prop_box.append([start_frame, end_frame, score])\n prop_box_keep = base_nms(prop_box, nms_thresh)\n prop_res = []\n for res in prop_box_keep:\n prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]})\n return prop_res\ndef process_video_classify(video_prop, fps, score_thread, iou_thread, \\\n nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0):\n \"\"\"process_video_classify\"\"\"\n prop_filter = []\n for item in video_prop:\n if item[2] == backgroundid:\n continue\n prop_filter.append(item)\n # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True)\n prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id)", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/process_result.py:79-110" + }, + "2761": { + "file_id": 218, + "content": "The code contains two functions: `process_video_prop` and `process_video_classify`. The first function processes video properties based on start frame, end frame, and score thresholds. It applies non-maximum suppression (NMS) to remove redundant or weak detections. The second function filters video properties based on background id and performs NMS for specific parameters.", + "type": "comment" + }, + "2762": { + "file_id": 218, + "content": " prop_filter = sorted(prop_filter, key=lambda x: x[0])\n video_results = []\n for item in prop_filter:\n start_sec = item[0] / fps\n end_sec = item[1] / fps\n start_id_frame = item[0]\n end_id_frame = item[1]\n # start_time = \"%02d:%02d:%02d\" % ((start_id_frame / fps) / 3600, \\\n # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60)\n # end_time = \"%02d:%02d:%02d\" % ((end_id_frame / fps) / 3600, \\\n # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60)\n start_time = int(start_id_frame / fps)\n end_time = int(end_id_frame / fps)\n label_id = item[2]\n label_name = item[3]\n label_classify_score = item[4]\n label_iou_score = item[5]\n if label_classify_score > score_thread and label_iou_score > iou_thread:\n video_results.append({\n \"start_time\": start_time,\n \"end_time\": end_time,\n \"label_id\": label_id,\n \"label_name\": label_name,", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/process_result.py:111-136" + }, + "2763": { + "file_id": 218, + "content": "This code sorts prop_filter based on timestamps, then iterates over the sorted list to extract start and end times, label IDs, and scores. 
It appends these details to video_results if the classify score is greater than a threshold and IOU score is also above the threshold.", + "type": "comment" + }, + "2764": { + "file_id": 218, + "content": " \"classify_score\": label_classify_score,\n \"iou_score\": label_iou_score\n })\n return video_results\ndef get_action_result(result_info, label_map_file, fps, score_thread=0, \\\n iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1):\n \"\"\"get_action_result\"\"\"\n label_map = json.load(open(label_map_file, 'r', encoding='utf-8'))\n org_result = get_data_res(label_map, result_info, topk)\n nms_result = process_video_classify(org_result, fps, score_thread,\n iou_thread, nms_id, nms_thread,\n frame_offset)\n return nms_result", + "type": "code", + "location": "/applications/TableTennis/predict/action_detect/utils/process_result.py:137-155" + }, + "2765": { + "file_id": 218, + "content": "This function, `get_action_result`, takes in `result_info`, `label_map_file`, `fps`, `score_thread`, `iou_thread`, `nms_id`, `nms_thread`, and `frame_offset` as parameters. It uses the `json.load()` method to load a label map from the file specified by `label_map_file`. The function then calls `get_data_res` with the loaded label map, `result_info`, and `topk` to obtain original results (`org_result`). Finally, it processes these original results using `process_video_classify()`, passing in additional parameters such as `fps`, `score_thread`, `iou_thread`, `nms_id`, `nms_thread`, and `frame_offset`. The function returns the non-maximum suppression (`nms`) result.", + "type": "comment" + }, + "2766": { + "file_id": 219, + "content": "/applications/TableTennis/predict/eval.py", + "type": "filepath" + }, + "2767": { + "file_id": 219, + "content": "This code performs object detection in computer vision tasks and evaluates table tennis action predictions, computing evaluation metrics to optimize F1 scores. The best performing combination is stored for future use.", + "type": "summary" + }, + "2768": { + "file_id": 219, + "content": "\"\"\"\nget instance for lstm\n根据gts计算每个proposal_bmn的iou、ioa、label等信息\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport io\nsys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')\ndataset = \"/home/work/datasets\"\nlabel_index_file = './configs/index_label_football_7.json'\neval_datasets = ['EuroCup2016']\nlabel_files = {\n 'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'\n}\nglobal fps, mode\nlabel_index = json.load(open(label_index_file, 'rb'))\ndef load_gts():\n global fps\n gts_data = {'fps': 0, 'gts': {}}\n for eval_data in eval_datasets:\n for item, value in label_files.items():\n label_file = '{}/{}/{}'.format(dataset, eval_data, value)\n gts = json.load(open(label_file, 'rb'))\n gts_data['fps'] = gts['fps']\n fps = gts['fps']\n for gt in gts['gts']:\n gt['mode'] = item\n basename = '{}/{}/mp4/{}'.format(dataset, eval_data,\n os.path.basename(gt['url']))", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:1-41" + }, + "2769": { + "file_id": 219, + "content": "This code is importing necessary libraries and defining global variables. It loads ground truth data from specified labels for different datasets and evaluation splits, setting frame per second (fps) values as well. 
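For orientation, the following is a hedged sketch of the label-file layout that `load_gts` and the converters appear to expect, inferred only from the fields read in this eval.py (`fps`, `gts`, `url`, `actions`, `start_id`, `end_id`, `label_ids`); the concrete values are invented.

```python
# Assumed label-file structure, inferred from the fields eval.py reads; values are invented.
example_label_file = {
    "fps": 25,
    "gts": [
        {
            "url": "mp4/match_001.mp4",
            "actions": [
                {"start_id": 120, "end_id": 180, "label_ids": [3]},
                {"start_id": 400, "end_id": 430, "label_ids": [1]},
            ],
        }
    ],
}
```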
It uses these loaded gts to calculate proposal-box related information based on ground truth sequence.", + "type": "comment" + }, + "2770": { + "file_id": 219, + "content": " gts_data['gts'][basename] = gt\n return gts_data['gts']\ndef computeIoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']):\n return 0.\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 -\n inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n if not mode == 'proposal':\n iou = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou\ndef convert_proposal(boxes, basename, score_threshold=0.01):\n boxes = sorted(boxes, key=lambda x: float(x['score']), reverse=True)\n res = []\n for box in boxes:\n if not float(box['score']) >= score_threshold:\n continue\n res.append({\n 'basename': basename,\n 'start': int(float(box['start']) / fps),\n 'end': int(float(box['end']) / fps),", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:42-73" + }, + "2771": { + "file_id": 219, + "content": "This code defines a function to compute the intersection of union (IoU) between two intervals, and another function that converts a list of proposals into final detections based on a score threshold. The computed IoU is used to filter out unwanted proposals, and only keep those with high confidence scores. This can be useful for object detection tasks in computer vision applications.", + "type": "comment" + }, + "2772": { + "file_id": 219, + "content": " 'label': 0\n })\n return res\ndef convert_classify(boxes, basename, iou_threshold, score_threshold):\n boxes = sorted(boxes,\n key=lambda x:\n (float(x['classify_score']), float(x['iou_score'])),\n reverse=True)\n def convert_time_to_frame(time_type):\n return int(time_type)\n h, m, s = time_type.split(':')\n return int(h) * 3600 + int(m) * 60 + int(s)\n res = []\n for box in boxes:\n if not (box['iou_score'] >= iou_threshold\n and box['classify_score'] >= score_threshold):\n continue\n res.append({\n 'basename': basename,\n 'start': convert_time_to_frame(box['start_time']),\n 'end': convert_time_to_frame(box['end_time']),\n 'label': box['label_id']\n })\n return res\ndef convert_groundtruth(boxes, basename, phase=None):\n res = []\n for box in boxes:\n for item in box['label_ids']:\n label = 0 if phase == 'proposal' else item", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:74-108" + }, + "2773": { + "file_id": 219, + "content": "The code contains three functions: 'convert_classify', 'convert_groundtruth', and 'convert_time_to_frame'. The 'convert_classify' function sorts boxes based on their classify and iou scores, then appends qualified boxes to a result list. 'convert_groundtruth' appends box labels to the result list based on the phase parameter. 
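A quick worked example of the interval IoU computed by `computeIoU` above, using invented segment boundaries:

```python
# Worked interval-IoU example matching the formula in computeIoU above
# (segments in seconds; the numbers are invented).
e1 = {"start": 10, "end": 20}
e2 = {"start": 15, "end": 30}
inter = max(0, min(e1["end"], e2["end"]) - max(e1["start"], e2["start"]))   # 5
union = (e1["end"] - e1["start"]) + (e2["end"] - e2["start"]) - inter       # 10 + 15 - 5 = 20
print(inter / union)   # 0.25
# In non-proposal mode the code divides by the second segment's length instead (IoA): 5 / 15 ≈ 0.33
```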
The 'convert_time_to_frame' function converts time strings to frames.", + "type": "comment" + }, + "2774": { + "file_id": 219, + "content": " res.append({\n 'basename': basename,\n 'start': box['start_id'],\n 'end': box['end_id'],\n 'label': label\n })\n return res\ndef print_head(iou):\n print(\"\\nioa = {:.1f}\".format(iou))\n res_str = ''\n for item in ['label_name']:\n res_str += '{:<12s}'.format(item)\n for item in [\n 'label_id', 'precision', 'recall', 'hit_prop', 'num_prop',\n 'hit_gts', 'num_gts'\n ]:\n res_str += '{:<10s}'.format(item)\n print(res_str)\ndef print_result(res_dict, label='avg'):\n if label == 'avg':\n res_str = '{:<22s}'.format(str(label))\n else:\n res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)],\n str(label), chr(12288))\n for item in ['prec', 'recall']:\n res_str += '{:<10.4f}'.format(res_dict[item])\n for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10d}'.format(res_dict[item])\n print(res_str)", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:109-142" + }, + "2775": { + "file_id": 219, + "content": "This code contains three functions: \"res.append\" appends a dictionary to a list with information about video frames, \"print_head\" prints headers for table output, and \"print_result\" prints the evaluation results of the model in a formatted way. The code is likely part of an image classification or object detection algorithm that evaluates the performance of the model on a set of video frames.", + "type": "comment" + }, + "2776": { + "file_id": 219, + "content": "def evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub=False):\n iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \\\n for gtsId in gts_boxes]\n iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes)))\n hit_map_prop_total = np.max(iou_map, axis=1)\n hit_map_index_total = np.argmax(iou_map, axis=1)\n res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']\n for iou_threshold in iou_range:\n if show_sub:\n print_head(iou_threshold)\n iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total])\n average_results = {}\n for label_id in label_range:\n sub_results = {}\n label_prop = np.array([k['label'] == label_id for k in res_boxes])\n label_gts = np.array([k['label'] == label_id for k in gts_boxes])\n sub_results['num_prop'] = sum(label_prop)\n sub_results['num_gts'] = sum(label_gts)\n if sub_results['num_prop'] == 0:", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:145-166" + }, + "2777": { + "file_id": 219, + "content": "Function `evaluation` takes in lists of predicted boxes (`res_boxes`) and ground truth boxes (`gts_boxes`), along with IOU and label ranges. It computes intersection over union (IoU) between each predicted box and ground truth box, creating a map of IoUs. The map is then reshaped into a 2D array for easier computation. The function calculates the maximum IoU per row in the map, and the index of this maximum value. It also loops through label and IOU ranges to calculate various statistics for subsets of labels and IOU thresholds. If `show_sub` is True, it prints a header indicating the current subset being evaluated. 
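The matching step inside `evaluation` can be summarized with a small NumPy sketch: build a predictions-by-ground-truths IoU matrix, then keep each prediction's best match. The matrix values below are invented.

```python
# Minimal sketch of the matching step in evaluation(): (num_pred x num_gt) IoU matrix,
# row-wise best match, then thresholding. Values are invented.
import numpy as np

iou_map = np.array([
    [0.72, 0.10],   # prediction 0 vs gt 0 / gt 1
    [0.05, 0.40],   # prediction 1
    [0.00, 0.00],   # prediction 2 matches nothing
])
best_iou = iou_map.max(axis=1)        # [0.72, 0.40, 0.00]
best_gt = iou_map.argmax(axis=1)      # [0, 1, 0]
hits = best_iou >= 0.5                # IoU threshold, as in iou_range
print(hits.sum(), len(set(best_gt[hits])))   # 1 hit proposal, 1 distinct gt hit
```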
If there are no predicted boxes for a particular label in the current iteration, the function skips that iteration without computing results.", + "type": "comment" + }, + "2778": { + "file_id": 219, + "content": " hit_prop_index = []\n else:\n hit_prop_index = label_prop & iou_prop\n sub_results['hit_prop'] = sum(hit_prop_index)\n sub_results['hit_gts'] = len(\n set(hit_map_index_total[hit_prop_index]))\n sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \\\n else sub_results['hit_prop'] * 1.0 / sub_results['num_prop']\n sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \\\n else sub_results['hit_gts'] * 1.0 / sub_results['num_gts']\n if show_sub:\n print_result(sub_results, label=label_id)\n for item in res_dict:\n if not item in average_results:\n average_results[item] = 0\n average_results[item] += sub_results[item]\n if len(label_range) == 1: # proposal 不需要输出average值\n continue\n average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \\", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:167-186" + }, + "2779": { + "file_id": 219, + "content": "The code calculates precision and recall scores for a set of results. It checks if there are any hits, then calculates the hit properties and ground truths. If show_sub is True, it prints the subresults for each label. The average results are also updated based on these calculations.", + "type": "comment" + }, + "2780": { + "file_id": 219, + "content": " else average_results['hit_prop'] * 1.0 / average_results['num_prop']\n average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \\\n else average_results['hit_gts'] * 1.0 / average_results['num_gts']\n if show_sub:\n print_result(average_results)\n average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \\\n else 2 * average_results['prec'] * average_results['recall'] / \\\n (average_results['prec'] + average_results['recall'])\n return average_results\ndef get_eval_results(predicts,\n gts_data,\n phase,\n iou_threshold=0.3,\n score_threshold=0.3,\n show_sub=False):\n global mode\n mode = phase\n res_boxes = []\n gts_boxes = []\n for ped_data in predicts:\n basename = ped_data['video_name']", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:187-210" + }, + "2781": { + "file_id": 219, + "content": "This code calculates average precision and recall for a table tennis prediction model. 
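For reference, the precision/recall/F1 arithmetic applied to the averaged counts works out as in this small worked example (hit and total counts are invented):

```python
# Worked example of the precision / recall / F1 arithmetic used for the averaged results.
hit_prop, num_prop = 40, 50      # matched predictions / all predictions
hit_gts, num_gts = 40, 80        # matched ground truths / all ground truths
prec = hit_prop / num_prop                     # 0.8
recall = hit_gts / num_gts                     # 0.5
f1 = 2 * prec * recall / (prec + recall)       # ~0.615
print(round(prec, 3), round(recall, 3), round(f1, 3))
```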
It returns an F1 score, considers IOU and score thresholds, handles different phases, and optionally prints the results.", + "type": "comment" + }, + "2782": { + "file_id": 219, + "content": " # eval sub data\n such_eval = False\n for eval_name in eval_datasets:\n if eval_name in basename:\n such_eval = True\n break\n if not such_eval:\n continue\n gts = gts_data[basename]['actions']\n if phase == 'proposal':\n res_boxes.extend(\n convert_proposal(ped_data['bmn_results'], basename,\n score_threshold))\n gts_boxes.extend(\n convert_groundtruth(gts, basename, phase='proposal'))\n label_range = [0]\n iou_range = np.arange(0.1, 1, 0.1)\n else:\n res_boxes.extend(\n convert_classify(ped_data['action_results'], basename,\n iou_threshold, score_threshold))\n gts_boxes.extend(convert_groundtruth(gts, basename))\n label_range = range(1, len(label_index))\n iou_range = np.arange(0.5, 0.6, 0.1)\n eval_results = evaluation(res_boxes,\n gts_boxes,", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:212-239" + }, + "2783": { + "file_id": 219, + "content": "This code evaluates the performance of a video analysis model for table tennis. It determines if the data is an evaluation dataset and then extends the results and ground truth boxes based on the phase (proposal or classification). It sets label and iou thresholds for evaluation and finally calculates the evaluation results.", + "type": "comment" + }, + "2784": { + "file_id": 219, + "content": " label_range,\n iou_range,\n show_sub=show_sub)\n return eval_results\nif __name__ == \"__main__\":\n result_file = sys.argv[1]\n predicts = json.load(open(result_file, 'r', encoding='utf-8'))\n gts_data = load_gts()\n get_eval_results(predicts,\n gts_data,\n 'proposal',\n score_threshold=0.03,\n show_sub=True)\n #get_eval_results(predicts, gts_data, 'actions')\n best_F1 = -0.1\n best_res = {}\n best_iou_threshold = 0.\n best_score_threshold = 0.\n for iou_threshold in np.arange(0.1, 0.9, 0.1):\n for score_threshold in np.arange(0.1, 1, 0.1):\n avg_res = get_eval_results(predicts,\n gts_data,\n 'actions',\n iou_threshold=iou_threshold,\n score_threshold=score_threshold,\n show_sub=False)", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:240-270" + }, + "2785": { + "file_id": 219, + "content": "This code evaluates the performance of table tennis action predictions. It takes in predicted results and ground truth data, then computes evaluation metrics for different IOU and score thresholds. 
The best performing combination is stored for future reference.", + "type": "comment" + }, + "2786": { + "file_id": 219, + "content": " if best_F1 < avg_res['F1']:\n best_F1 = avg_res['F1']\n best_res = avg_res\n best_iou_threshold = iou_threshold\n best_score_threshold = score_threshold\n print(\"best iou threshold = {:.1f}\".format(best_iou_threshold))\n print(\"best score threshold = {:.1f}\".format(best_score_threshold))\n print('best F1 score = {:.4f}'.format(best_F1))\n print_head(0.5)\n print_result(best_res)\n get_eval_results(predicts,\n gts_data,\n 'actions',\n iou_threshold=best_iou_threshold,\n score_threshold=best_score_threshold,\n show_sub=True)", + "type": "code", + "location": "/applications/TableTennis/predict/eval.py:271-287" + }, + "2787": { + "file_id": 219, + "content": "This code snippet is optimizing the iou and score thresholds for better F1 scores, then printing them and using the best values to get evaluation results.", + "type": "comment" + }, + "2788": { + "file_id": 220, + "content": "/applications/TableTennis/predict/predict.py", + "type": "filepath" + }, + "2789": { + "file_id": 220, + "content": "The code sets up an environment for video prediction using PaddleVideo's TableTennis application, initializing an ActionDetection object and loading a model to predict actions and body movements in each video, storing results in a JSON file.", + "type": "summary" + }, + "2790": { + "file_id": 220, + "content": "import os\nimport sys\nimport json\nsys.path.append('action_detect')\nfrom action import ActionDetection\nif __name__ == '__main__':\n dataset_dir = \"/home/work/datasets/EuroCup2016\"\n model_predict = ActionDetection(cfg_file=\"./configs/configs.yaml\")\n model_predict.load_model()\n video_url = os.path.join(dataset_dir, 'url_val.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n results = []\n for line in lines:\n video_name = line\n print(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results.append({\n 'video_name': line,\n 'bmn_results': bmn_results,\n 'action_results': action_results\n })\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)", + "type": "code", + "location": "/applications/TableTennis/predict/predict.py:1-35" + }, + "2791": { + "file_id": 220, + "content": "This code is setting up an environment for video prediction using the PaddleVideo's TableTennis application. It appends the \"action_detect\" directory to the Python path, initializes an ActionDetection object with a configuration file, loads the model, and then iterates through a list of video URLs to predict actions and body movements in each video, storing the results in a JSON file.", + "type": "comment" + }, + "2792": { + "file_id": 220, + "content": " f.write(data)", + "type": "code", + "location": "/applications/TableTennis/predict/predict.py:36-36" + }, + "2793": { + "file_id": 220, + "content": "Writes the data to file.", + "type": "comment" + }, + "2794": { + "file_id": 221, + "content": "/applications/TableTennis/val_split.py", + "type": "filepath" + }, + "2795": { + "file_id": 221, + "content": "This code loads a JSON file, splits the ground truth sequences (gts) into training and validation sets, and saves them as separate JSON files. 
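If a proportional split were preferred over taking a fixed first five clips, a hypothetical variant could look like the sketch below; `split_labels` and its parameters are assumptions for illustration and do not exist in the repository.

```python
# Hypothetical ratio-based alternative to val_split.py's fixed first-five split; sketch only.
import json

def split_labels(src_path, val_ratio=0.1, fps=25):
    with open(src_path, encoding='utf-8') as f:
        data = json.load(f)
    n_val = max(1, int(len(data['gts']) * val_ratio))
    val = {'gts': data['gts'][:n_val], 'fps': fps}
    train = {'gts': data['gts'][n_val:], 'fps': fps}
    return train, val
```

The fixed-size split in val_split.py is fine for a quick check; a ratio keeps the validation share stable as the label file grows.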
It uses the json module for reading and writing JSON data. The original file is labeled 'label_cls14_train.json' and has gts from index 0 to 4 in the validation set, and gts from index 5 onwards in the training set. The code also writes a new validation set in '/home/aistudio/data/label_cls14_val.json' with the same fps (25).", + "type": "summary" + }, + "2796": { + "file_id": 221, + "content": "import json\nwith open('/home/aistudio/data/label_cls14_train.json') as f:\n data = json.load(f)\nf.close()\nval = {'gts': data['gts'][0:5], 'fps': 25}\njsonString = json.dumps(val, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/data/label_cls14_val.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()\ntrain = {'gts': data['gts'][5:], 'fps': 25}\njsonString = json.dumps(train, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/data/label_cls14_train.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()", + "type": "code", + "location": "/applications/TableTennis/val_split.py:1-19" + }, + "2797": { + "file_id": 221, + "content": "This code loads a JSON file, splits the ground truth sequences (gts) into training and validation sets, and saves them as separate JSON files. It uses the json module for reading and writing JSON data. The original file is labeled 'label_cls14_train.json' and has gts from index 0 to 4 in the validation set, and gts from index 5 onwards in the training set. The code also writes a new validation set in '/home/aistudio/data/label_cls14_val.json' with the same fps (25).", + "type": "comment" + }, + "2798": { + "file_id": 222, + "content": "/applications/VideoQualityAssessment/README.md", + "type": "filepath" + }, + "2799": { + "file_id": 222, + "content": "This code develops a PaddlePaddle 2.1 video quality assessment model using ppTSM network on KonVid-150k dataset, supports multigpu distributed training and evaluation, and references two papers for improved user experience and SROCC/PLCC scores.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/28.json b/docs/data/28.json new file mode 100644 index 000000000..610bd4340 --- /dev/null +++ b/docs/data/28.json @@ -0,0 +1,542 @@ +{ + "2800": { + "file_id": 222, + "content": "# 视频质量评价模型\n---\n## 内容\n- [模型简介](#模型简介)\n- [数据准备](#数据准备)\n- [模型训练](#模型训练)\n- [模型测试](#模型测试)\n- [模型优化](#模型优化)\n- [模型部署](#模型部署)\n- [参考论文](#参考论文)\n## 模型简介\n该代码库主要基于paddle2.1版本开发,主要是在ppTSM网络模型的基础上修改的一种无参考视频质量评估方法,通过读入视频的视频帧来判断该视频的质量。\n针对视频内容的理解,可以自动分析视频内容的质量,帮助选出最优的关键帧或关键片段作为视频封面,提升视频的点击转换和用户体验。\n本项目目前支持Linux下的GPU单卡和多卡运行环境。\n## 数据准备\n```\n数据集来自公开数据集KonVid-150k,共153842个ugc视频,其中训练集(KonVid-150k-A)152265个,验证集(KonVid-150k-B)1577个\n示例数据集以及数据集官网地址: datasets/dataset_url.list\n数据集标注文件为dataset中的train.txt和eval.txt\n```\n## 模型训练\n环境安装:\n- PaddlePaddle >= 2.1.0\n- Python >= 3.7\n- PaddleX >= 2.0.0\n- CUDA >= 10.1\n- cuDNN >= 7.6.4\n- nccl >= 2.1.2\n安装Python依赖库:\nPython依赖库在[requirements.txt](https://github.com/PaddlePaddle/PaddleVideo/blob/master/requirements.txt)中给出,可通过如下命令安装:\n```\npython3.7 -m pip install --upgrade pip\npip3.7 install --upgrade -r requirements.txt\n```\n使用`paddle.distributed.launch`启动模型训练和测试脚本(`main.py`),可以更方便地启动多卡训练与测试,或直接运行(./run.sh)\n```shell\nsh run.sh\n```\n我们将所有标准的启动命令都放在了```run.sh```中,注意选择想要运行的脚本。\n参考如下方式启动模型训练,`paddle.distributed.launch`通过设置`gpus`指定GPU运行卡号,", + "type": "code", + "location": "/applications/VideoQualityAssessment/README.md:1-58" + }, + "2801": { + "file_id": 222, + "content": "This code is for a video quality assessment model developed using PaddlePaddle 2.1. 
It uses the ppTSM network and is trained on KonVid-150k dataset, which contains 153842 UGC videos. The model can analyze video content to determine its quality, improve video previews, and enhance user experience. Requires specific environment setup and dependencies like Python 3.7, CUDA 10.1, and cuDNN 7.6.4.", + "type": "comment" + }, + "2802": { + "file_id": 222, + "content": "指定`--validate`来启动训练时评估。\n```bash\n# PaddleVideo通过launch方式启动多卡多进程训练\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=log_pptsm \\\n main.py \\\n --amp \\\n --validate \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml\n```\n其中,`-c`用于指定配置文件的路径,可通过配置文件修改相关训练配置信息,也可以通过添加`-o`参数来更新配置:\n```bash\npython -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\\n --validate \\\n -o DATASET.batch_size=16\n```\n`-o`用于指定需要修改或者添加的参数,其中`-o DATASET.batch_size=16`表示更改batch_size大小为16。\n运行上述命令,将会输出运行日志,并默认保存在./log目录下,如:`worker.0` , `worker.1` ... , worker日志文件对应每张卡上的输出\n【train阶段】打印当前时间,当前epoch/epoch总数,当前batch id,评估指标,耗时,ips等信息:\n [11/16 04:40:37] epoch:[ 1/1 ] train step:100 loss: 5.31382 lr: 0.000250 batch_cost: 0.73082 sec, reader_cost: 0.38075 sec, ips: 5.47330 instance/sec.\n【eval阶段】打印当前时间,当前epoch/epoch总数,当前batch id,评估指标,耗时,ips等信息:\n [11/16 04:40:37] epoch:[ 1/1 ] val step:0 loss: 4.42741 batch_cost: 1.37882 sec, reader_cost: 0.00000 sec, ips: 2.90104 instance/sec.", + "type": "code", + "location": "/applications/VideoQualityAssessment/README.md:59-98" + }, + "2803": { + "file_id": 222, + "content": "This code is running PaddleVideo's multigpu distributed training in launch mode. It specifies the GPU devices to use, sets up the log directory, and uses AMP for mixed precision training. The `--validate` flag starts the evaluation during training and allows for updating configurations using the `-o` parameter. 
It also prints various metrics like loss, learning rate, batch cost, reader cost, and instances per second during train and eval phases.", + "type": "comment" + }, + "2804": { + "file_id": 222, + "content": "【epoch结束】打印当前时间,学习率,评估指标,耗时,ips等信息:\n [11/16 04:40:37] lr=0.00012487\n [11/16 04:40:37] train_SROCC=0.4456697876616565\n [11/16 04:40:37] train_PLCC=0.48071880604403616\n [11/16 04:40:37] END epoch:1 val loss_avg: 5.21620 avg_batch_cost: 0.04321 sec, avg_reader_cost: 0.00000 sec, batch_cost_sum: 112.69575 sec, avg_ips: 8.41203 instance/sec.\n当前为评估结果最好的epoch时,打印最优精度:\n [11/16 04:40:57] max_SROCC=0.7116468111328617\n [11/16 04:40:57] max_PLCC=0.733503995526737\n### 模型恢复训练\n如果训练任务终止,可以加载断点权重文件(优化器-学习率参数,断点文件)继续训练。\n需要指定`-o resume_epoch`参数,该参数表示从```resume_epoch```轮开始继续训练.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n --amp \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\\n --validate \\\n -o resume_epoch=5\n```\n### 模型微调\n进行模型微调(Finetune),对自定义数据集进行模型微调,需要指定 `--weights` 参数来加载预训练模型。\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n --amp \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\", + "type": "code", + "location": "/applications/VideoQualityAssessment/README.md:101-144" + }, + "2805": { + "file_id": 222, + "content": "Epoch completion: Prints current time, learning rate, evaluation metrics, training duration, and instances per second.\nBest epoch detection: Prints best precision achieved during training.\nResuming training: Loads checkpoint weights to continue training from a specified epoch.\nModel fine-tuning: Loads pre-trained model for custom dataset fine-tuning.", + "type": "comment" + }, + "2806": { + "file_id": 222, + "content": " --validate \\\n --weights=./output/model_name/ppTSM_best.pdparams\n```\nPaddleVideo会自动**不加载**shape不匹配的参数\n## 模型测试\n需要指定 `--test`来启动测试模式,并指定`--weights`来加载预训练模型。\n```bash\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\\n --test \\\n --weights=./output/model_name/ppTSM_best.pdparams\n```\n## 模型优化\n在实际使用场景中可根据视频质量以及尺寸尝试优化策略\n- 可通过原图输入来替换RandomCrop:224操作,准确率由SROCC=0.8176,PLCC=0.8361提升到SROCC=0.8617,PLCC=0.8910,不同模型以及特征增强操作的效果对比如下表所示\n | 模型 | 特征增强 | val_SROCC | val_PLCC |\n | :----: | :-----------------------------------------: | :-------: | :------: |\n | GSTVQA | 原图输入 | 0.7932 | 0.8006 |\n | ppTSM | train--RandomCrop=224 val--center_crop=224 | 0.8176 | 0.8361 |\n | ppTSM | train--RandomCrop=512 val--center_crop=512 | 0.8603 | 0.8822 |\n | ppTSM | 原图输入 | 0.8617 | 0.8910 |\n- 考虑应用场景视频的 aspect ratio 大都为 16:9 和 4:3 等,同时为了避免非均匀缩放拉伸带来的干扰 ,可以采用了(224x3)x(224x2)=672x448 的输入尺寸来更充分得利用有限的输入尺寸。 ", + "type": "code", + "location": "/applications/VideoQualityAssessment/README.md:145-179" + }, + "2807": { + "file_id": 222, + "content": "The code is launching PaddleVideo application for video quality assessment. It uses the TSM model with regression and loads the best trained weights from \"./output/model_name/ppTSM_best.pdparams\". The --test flag is used to run the model in test mode. 
The code also suggests optimizing strategies like using original input instead of RandomCrop, changing input size for better performance, and considering aspect ratios of 16:9 and 4:3 for improved results.", + "type": "comment" + }, + "2808": { + "file_id": 222, + "content": "## 模型部署\n本代码解决方案在官方验证集(KonVid-150k-B)上的指标效果为SROCC=0.8176,PLCC=0.8361。\n## 参考论文\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [Quality Assessment of In-the-Wild Videos](https://dl.acm.org/citation.cfm?doid=3343031.3351028), Dingquan Li, Tingting Jiang, and Ming Jiang", + "type": "code", + "location": "/applications/VideoQualityAssessment/README.md:181-189" + }, + "2809": { + "file_id": 222, + "content": "This code provides a solution for video quality assessment with SROCC and PLCC scores on official validation dataset. It references two papers: TSM: Temporal Shift Module for Efficient Video Understanding and Quality Assessment of In-the-Wild Videos.", + "type": "comment" + }, + "2810": { + "file_id": 223, + "content": "/applications/VideoQualityAssessment/main.py", + "type": "filepath" + }, + "2811": { + "file_id": 223, + "content": "This code trains PaddleVideo models, imports libraries, defines command line arguments, and supports distributed training/testing based on --test argument.", + "type": "summary" + }, + "2812": { + "file_id": 223, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport argparse\nfrom paddlevideo.utils import get_config\nfrom paddlevideo.tasks import train_model, test_model\nfrom paddlevideo.utils import get_dist_info\ndef parse_args():\n \"\"\"parse_args\"\"\"\n parser = argparse.ArgumentParser(\"PaddleVideo train script\")\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',", + "type": "code", + "location": "/applications/VideoQualityAssessment/main.py:1-30" + }, + "2813": { + "file_id": 223, + "content": "This code snippet is the beginning of a Python script for PaddleVideo, specifically for training models. 
It imports necessary libraries and modules, defines a function to parse command line arguments, and sets up the argument parser.", + "type": "comment" + }, + "2814": { + "file_id": 223, + "content": " help='config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('--test',\n action='store_true',\n help='whether to test a model')\n parser.add_argument('--train_dali',\n action='store_true',\n help='whether to use dali to speed up training')\n parser.add_argument('--multigrid',\n action='store_true',\n help='whether to use multigrid training')\n parser.add_argument('-w',\n '--weights',\n type=str,\n help='weights for finetuning or testing')\n parser.add_argument('--fleet',\n action='store_true',\n help='whether to use fleet run distributed training')", + "type": "code", + "location": "/applications/VideoQualityAssessment/main.py:31-52" + }, + "2815": { + "file_id": 223, + "content": "This code uses the ArgumentParser class to define and parse command-line arguments for a video quality assessment application. It allows specifying config file paths, overriding config options, testing a model, using DALI for training speedup, multigrid training, weights for finetuning or testing, and whether to use distributed training via fleet.", + "type": "comment" + }, + "2816": { + "file_id": 223, + "content": " parser.add_argument('--amp',\n action='store_true',\n help='whether to open amp training.')\n parser.add_argument(\n '--validate',\n action='store_true',\n help='whether to evaluate the checkpoint during training')\n args = parser.parse_args()\n return args\ndef main():\n \"\"\"main\"\"\"\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n _, world_size = get_dist_info()\n parallel = world_size != 1\n if parallel:\n paddle.distributed.init_parallel_env()\n if args.test:\n test_model(cfg, weights=args.weights, parallel=parallel)\n else:\n train_model(cfg,\n weights=args.weights,\n parallel=parallel,\n validate=args.validate,\n fleet=args.fleet,\n amp=args.amp)\nif __name__ == '__main__':\n main()", + "type": "code", + "location": "/applications/VideoQualityAssessment/main.py:53-88" + }, + "2817": { + "file_id": 223, + "content": "This code defines command line arguments for training and testing models, and initializes distributed parallel environment if necessary. Then it calls appropriate functions based on the --test argument value.", + "type": "comment" + }, + "2818": { + "file_id": 224, + "content": "/applications/VideoQualityAssessment/paddlevideo/__init__.py", + "type": "filepath" + }, + "2819": { + "file_id": 224, + "content": "This code snippet is a license notice and import statement for the PaddleVideo library in Python. It sets the copyright, licensing information, and imports the version module from the same directory.", + "type": "summary" + }, + "2820": { + "file_id": 224, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .version import paddlevideo_version", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/__init__.py:1-17" + }, + "2821": { + "file_id": 224, + "content": "This code snippet is a license notice and import statement for the PaddleVideo library in Python. It sets the copyright, licensing information, and imports the version module from the same directory.", + "type": "comment" + }, + "2822": { + "file_id": 225, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py", + "type": "filepath" + }, + "2823": { + "file_id": 225, + "content": "This file is a Python module for video dataset loading and processing in PaddleVideo. It contains functions to build datasets, data loaders, and batch pipelines, along with the VideoDataset class.", + "type": "summary" + }, + "2824": { + "file_id": 225, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .builder import build_dataset, build_dataloader, build_batch_pipeline\nfrom .dataset import VideoDataset\n__all__ = [\n 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset'\n]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py:1-21" + }, + "2825": { + "file_id": 225, + "content": "This file is a Python module for video dataset loading and processing in PaddleVideo. It contains functions to build datasets, data loaders, and batch pipelines, along with the VideoDataset class.", + "type": "comment" + }, + "2826": { + "file_id": 226, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py", + "type": "filepath" + }, + "2827": { + "file_id": 226, + "content": "This Python file utilizes PaddleVideo and PaddlePaddle library to construct video pipelines, defining functions for dataset, pipeline, and dataloader creation. It also includes signal handlers to terminate child processes upon receiving specific signals.", + "type": "summary" + }, + "2828": { + "file_id": 226, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport signal\nimport os\nimport paddle\nfrom paddle.io import DataLoader, DistributedBatchSampler\nfrom .registry import DATASETS, PIPELINES\nfrom ..utils.build_utils import build\nfrom .pipelines.compose import Compose\nfrom paddlevideo.utils import get_logger\nimport numpy as np\nlogger = get_logger(\"paddlevideo\")\ndef build_pipeline(cfg):\n \"\"\"Build pipeline.\n Args:\n cfg (dict): root config dict.\n \"\"\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:1-33" + }, + "2829": { + "file_id": 226, + "content": "This code is a Python file for building video pipeline in PaddleVideo, which uses PaddlePaddle library. It imports necessary modules and defines a function to build the pipeline according to the provided configuration. The logger is used for logging purposes, and numpy is imported for numerical operations.", + "type": "comment" + }, + "2830": { + "file_id": 226, + "content": " return Compose(cfg)\ndef build_dataset(cfg):\n \"\"\"Build dataset.\n Args:\n cfg (dict): root config dict.\n Returns:\n dataset: dataset.\n \"\"\"\n #XXX: ugly code here!\n cfg_dataset, cfg_pipeline = cfg\n cfg_dataset.pipeline = build_pipeline(cfg_pipeline)\n dataset = build(cfg_dataset, DATASETS, key=\"format\")\n return dataset\ndef build_batch_pipeline(cfg):\n \"\"\"build batch pipeline\"\"\"\n batch_pipeline = build(cfg, PIPELINES)\n return batch_pipeline\ndef build_dataloader(dataset,\n batch_size,\n num_workers,\n places,\n shuffle=True,\n drop_last=True,\n multigrid=False,\n collate_fn_cfg=None,\n **kwargs):\n \"\"\"Build Paddle Dataloader.\n XXX explain how the dataloader work!\n Args:\n dataset (paddle.dataset): A PaddlePaddle dataset object.\n batch_size (int): batch size on single card.\n num_worker (int): num_worker", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:34-74" + }, + "2831": { + "file_id": 226, + "content": "This code defines functions to build a dataset, batch pipeline, and dataloader for PaddleVideo's Video Quality Assessment application. The build_dataset function constructs the dataset using cfg config dictionary. The build_batch_pipeline function builds the batch pipeline. Lastly, the build_dataloader function creates a Paddle Dataloader using the constructed dataset and other parameters like batch size, num_workers, etc.", + "type": "comment" + }, + "2832": { + "file_id": 226, + "content": " shuffle(bool): whether to shuffle the data at every epoch.\n \"\"\"\n sampler = DistributedBatchSampler(dataset,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)\n #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix.\n # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to:\n # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] 
as using numpy.transpose.\n def mix_collate_fn(batch):\n \"\"\"mix collate fn\"\"\"\n pipeline = build_batch_pipeline(collate_fn_cfg)\n batch = pipeline(batch)\n slots = []\n for items in batch:\n for i, item in enumerate(items):\n if len(slots) < len(items):\n slots.append([item])\n else:\n slots[i].append(item)\n return [np.stack(slot, axis=0) for slot in slots]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:75-97" + }, + "2833": { + "file_id": 226, + "content": "The code creates a DistributedBatchSampler for dataset with optional shuffle and drop_last parameters, and defines a mix_collate_fn function that applies a predefined collate_fn_cfg to batch data and returns it in a specific format using build_batch_pipeline.", + "type": "comment" + }, + "2834": { + "file_id": 226, + "content": " #if collate_fn_cfg is not None:\n #ugly code here. collate_fn is mix op config\n # collate_fn = mix_collate_fn(collate_fn_cfg)\n data_loader = DataLoader(\n dataset,\n batch_sampler=sampler,\n places=places,\n num_workers=num_workers,\n collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,\n return_list=True,\n **kwargs)\n return data_loader\ndef term_mp(sig_num, frame):\n \"\"\" kill all child processes\n \"\"\"\n pid = os.getpid()\n pgid = os.getpgid(os.getpid())\n logger.info(\"main proc {} exit, kill process group \" \"{}\".format(pid, pgid))\n os.killpg(pgid, signal.SIGKILL)\n return\nsignal.signal(signal.SIGINT, term_mp)\nsignal.signal(signal.SIGTERM, term_mp)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:99-126" + }, + "2835": { + "file_id": 226, + "content": "The code defines a function that returns a DataLoader object. If collate_fn_cfg is not None, it creates a mix_collate_fn and assigns it to the collate_fn variable. The returned DataLoader has its collate_fn set according to the value of collate_fn_cfg. The code also sets up signal handlers for SIGINT and SIGTERM, calling the term_mp function on receipt of either signal. The term_mp function kills all child processes in the current process group.", + "type": "comment" + }, + "2836": { + "file_id": 227, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py", + "type": "filepath" + }, + "2837": { + "file_id": 227, + "content": "This code is a Python module for video and frame datasets in PaddleVideo. It includes the VideoDataset class, FrameRecDataset class (from frame_rec module), and defines __all__.", + "type": "summary" + }, + "2838": { + "file_id": 227, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .video import VideoDataset\n#from .frame import FrameDataset\nfrom .frame_rec import FrameRecDataset\n__all__ = ['VideoDataset', 'FrameRecDataset']", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py:1-21" + }, + "2839": { + "file_id": 227, + "content": "This code is a Python module for video and frame datasets in PaddleVideo. It includes the VideoDataset class, FrameRecDataset class (from frame_rec module), and defines __all__.", + "type": "comment" + }, + "2840": { + "file_id": 228, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py", + "type": "filepath" + }, + "2841": { + "file_id": 228, + "content": "This code defines a dataset class for loading video information, requires subclassing to define load_file and prepare_train/test methods, prepares data for training/testing, and addresses DataLoader's dict type handling limitation.", + "type": "summary" + }, + "2842": { + "file_id": 228, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os.path as osp\nimport copy\nimport numpy as np\nfrom abc import ABC, abstractmethod\nimport paddle\nfrom paddle.io import Dataset\nclass BaseDataset(Dataset, ABC):\n \"\"\"Base class for datasets\n All datasets should subclass it.\n All subclass should overwrite:\n - Method: `load_file`, load info from index file.\n - Method: `prepare_train`, providing train data.\n - Method: `prepare_test`, providing test data.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:1-34" + }, + "2843": { + "file_id": 228, + "content": "Base class for datasets, subclass it. Subclasses should overwrite load_file (load info from index file), prepare_train (provide train data), and prepare_test (provide test data).", + "type": "comment" + }, + "2844": { + "file_id": 228, + "content": " Args:\n file_path (str): index file path.\n pipeline (Sequence XXX)\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): whether to build test dataset. 
Default: False.\n \"\"\"\n def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):\n super().__init__()\n self.file_path = file_path\n self.data_prefix = osp.realpath(data_prefix) if \\\n data_prefix is not None and osp.isdir(data_prefix) else data_prefix\n self.test_mode = test_mode\n self.pipeline = pipeline\n self.info = self.load_file()\n @abstractmethod\n def load_file(self):\n \"\"\"load the video information from the index file path.\"\"\"\n pass\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:36-62" + }, + "2845": { + "file_id": 228, + "content": "This code defines a class for loading video information from an index file path. It takes arguments such as the file_path, pipeline, data_prefix, and test_mode. The load_file method abstractly loads the video information from the index file. The prepare_train method prepares data for training/valid given the index. Note: DataLoader cannot support dict type retval, so it converts to list.", + "type": "comment" + }, + "2846": { + "file_id": 228, + "content": " #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def __len__(self):\n \"\"\"get the size of the dataset.\"\"\"\n return len(self.info)\n def __getitem__(self, idx):\n \"\"\" Get the sample for either training or testing given index\"\"\"\n if self.test_mode:\n return self.prepare_test(idx)\n else:\n return self.prepare_train(idx)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:63-83" + }, + "2847": { + "file_id": 228, + "content": "The code defines a dataset class with methods for preparing data for training and testing, as well as returning the size of the dataset. The test_mode flag is used to determine whether to use the prepare_test or prepare_train method when accessing the dataset. Paddle.io.DataLoader cannot currently handle dict type return values, so they are converted to lists within these methods.", + "type": "comment" + }, + "2848": { + "file_id": 229, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py", + "type": "filepath" + }, + "2849": { + "file_id": 229, + "content": "The code introduces FrameRecDataset class for PaddleVideo, loading raw frames and applying transformations. Another class reads index files, initializes base class with parameters, and handles missing frame file exceptions during training/validation.", + "type": "summary" + }, + "2850": { + "file_id": 229, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass FrameRecDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The ind", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:1-32" + }, + "2851": { + "file_id": 229, + "content": "This code is part of the PaddleVideo library and defines a FrameRecDataset class for action recognition. It loads raw frames from frame files, applies specified transform operations to them, and registers the dataset with the DATASETS registry.", + "type": "comment" + }, + "2852": { + "file_id": 229, + "content": "ecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n suffix (str): suffix of file. Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:32-62" + }, + "2853": { + "file_id": 229, + "content": "This code defines a class that loads index files containing video information. The class takes an index file path, pipeline, data prefix (optional), test mode (optional) and suffix (optional) as arguments. It initializes the base class with these parameters and then has a method load_file() to read the index file and get the video information.", + "type": "comment" + }, + "2854": { + "file_id": 229, + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n mp4_path, frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(frame_dir=frame_dir,\n suffix=self.suffix,\n frames_len=frames_len,\n labels=float(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid given index. 
\"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:63-88" + }, + "2855": { + "file_id": 229, + "content": "This code reads a file and parses each line into frame path, directory, number of frames, and labels. It returns a list of dictionaries containing this information. The \"prepare_train\" function tries to prepare the frames for training/validation multiple times in case an exception occurs while reading the frames files.", + "type": "comment" + }, + "2856": { + "file_id": 229, + "content": " \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:89-110" + }, + "2857": { + "file_id": 229, + "content": "The code handles exceptions caused by reading missing frames files. It attempts to load the frames multiple times if there are errors, and keeps track of the number of retries. If an error occurs, it logs the error message and continues with a different index from the dataset until it successfully loads the frames.", + "type": "comment" + }, + "2858": { + "file_id": 230, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py", + "type": "filepath" + }, + "2859": { + "file_id": 230, + "content": "The code introduces a PaddleVideo class in Python for loading and processing video datasets, reading index files, applying transforms, handles corrupted files with retries, and provides error logging during training/validation.", + "type": "summary" + }, + "2860": { + "file_id": 230, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass VideoDataset(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:1-32" + }, + "2861": { + "file_id": 230, + "content": "This code is a Python class defining a video dataset for action recognition. It loads raw videos and applies specified transforms on them using an index file with multiple lines, each indicating the properties of a video. The code is part of the PaddleVideo library.", + "type": "comment" + }, + "2862": { + "file_id": 230, + "content": " a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, **kwargs):\n self.num_retries = num_retries\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n filename, labels = line_split\n #TODO(hj): Required suffix format: may mp4/avi/wmv\n filename = filename + '.avi'\n if self.data_prefix is not None:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:33-58" + }, + "2863": { + "file_id": 230, + "content": "This code initializes a dataset class, which reads an index file containing paths to video files and their labels. It loads the index file line by line and processes each line to append video information into a list called \"info\". The filename is assumed to have .avi suffix in this case. If there is a data_prefix assigned, it will be added to the filename.", + "type": "comment" + }, + "2864": { + "file_id": 230, + "content": " filename = osp.join(self.data_prefix, filename)\n info.append(dict(filename=filename, labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. 
Prepare the data for training/valid given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for test given the index.\"\"\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:59-81" + }, + "2865": { + "file_id": 230, + "content": "The code is a part of a video dataset loader. It handles preparing data for training, validation, and testing in a dataset with potential corrupted files. It joins filenames to the prefix, stores them along with labels in a list (info). For training/validation, it tries a set number of times to read each file due to possible corruption, applies a pipeline to the data, logs exceptions if they occur, and tries again with a random index if needed. In testing, it simply returns the prepared data without retries or error handling.", + "type": "comment" + }, + "2866": { + "file_id": 230, + "content": " #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:82-95" + }, + "2867": { + "file_id": 230, + "content": "This code attempts to read a video file and catch any exceptions caused by reading corrupted files. It uses a retry mechanism with a maximum number of retries (self.num_retries) to handle potential errors. If an exception occurs, the error is logged, and if there are more retries left, it tries again with a different random index from self.info. Once successful, it returns the images and labels as numpy arrays.", + "type": "comment" + }, + "2868": { + "file_id": 231, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py", + "type": "filepath" + }, + "2869": { + "file_id": 231, + "content": "The code imports PaddleVideo library functions for data augmentation, composition, decoding, and sampling in video analysis tasks, while using a list of pipeline modules to perform operations like mixing, cropping, and scaling.", + "type": "summary" + }, + "2870": { + "file_id": 231, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .augmentations import (\n Scale,\n RandomCrop,\n CenterCrop,\n RandomFlip,\n Image2Array,\n Normalization,\n JitterScale,\n MultiCrop,\n PackOutput,\n)\nfrom .compose import Compose\nfrom .decode import VideoDecoder, FrameDecoder\nfrom .sample import Sampler\nfrom .mix import Mixup, Cutmix\n__all__ = [\n 'Scale',\n 'RandomCrop',\n 'CenterCrop',\n 'RandomFlip',\n 'Image2Array',\n 'Normalization',", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py:1-40" + }, + "2871": { + "file_id": 231, + "content": "This code imports various functions and classes from different modules in the PaddleVideo library, which are used for data augmentation, composition, decoding, and sampling in video analysis tasks.", + "type": "comment" + }, + "2872": { + "file_id": 231, + "content": " 'Compose',\n 'VideoDecoder',\n 'FrameDecoder',\n 'Sampler',\n 'Mixup',\n 'Cutmix',\n 'JitterScale',\n 'MultiCrop',\n 'PackOutput',\n]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py:41-50" + }, + "2873": { + "file_id": 231, + "content": "The code above is a list of pipeline modules used in the PaddleVideo framework for video processing tasks. These modules perform various operations such as data augmentation, mixing, cropping, and scaling before feeding into the model for training or evaluation.", + "type": "comment" + }, + "2874": { + "file_id": 232, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py", + "type": "filepath" + }, + "2875": { + "file_id": 232, + "content": "The code introduces a \"Scale\" class for image scaling and a MultiScaleCrop pipeline in PaddleVideo. It supports random or multi-crop based on test mode, maintaining aspect ratio while resizing/cropping images. A slower pathway is created by selecting specific frames from the fast_pathway array, rearranging dimensions, and then combined with the original for a list of frames before adding to 'results' dictionary.", + "type": "summary" + }, + "2876": { + "file_id": 232, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport random\nimport numpy as np\nimport math\nfrom PIL import Image\nfrom ..registry import PIPELINES\nfrom collections.abc import Sequence\n@PIPELINES.register()\nclass Scale(object):\n \"\"\"\n Scale images.\n Args:\n short_size(float | int): Short size of an image will be scaled to the short_size.\n \"\"\"\n def __init__(self, short_size):\n self.short_size = short_size\n def __call__(self, results):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:1-35" + }, + "2877": { + "file_id": 232, + "content": "This code registers a new class \"Scale\" for image scaling in PaddleVideo's VideoQualityAssessment module. The Scale class takes a short_size parameter and scales the images accordingly. It is registered as part of the PIPELINES in the application.", + "type": "comment" + }, + "2878": { + "file_id": 232, + "content": " \"\"\"\n Performs resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n imgs = results['imgs']\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == self.short_size) or (h <= w\n and h == self.short_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = self.short_size\n oh = int(self.short_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = self.short_size\n ow = int(self.short_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:36-61" + }, + "2879": { + "file_id": 232, + "content": "The code defines a function that resizes PIL.Image objects in a list according to their aspect ratios and the short_size provided. If an image's width is less than or equal to its height, it is appended to resized_imgs without any modification. Otherwise, if the width is greater than the height, the image is scaled to fit within a square with the given short_size, maintaining aspect ratio using bilinear interpolation. 
If the height is greater than the width, the image is also scaled to fit within a square with the given short_size, again maintaining aspect ratio using bilinear interpolation.", + "type": "comment" + }, + "2880": { + "file_id": 232, + "content": " results['imgs'] = resized_imgs\n return results\n@PIPELINES.register()\nclass RandomCrop(object):\n \"\"\"\n Random crop images.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n w, h = imgs[0].size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size {}\".format(\n w, h, self.target_size)\n crop_images = []\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:62-95" + }, + "2881": { + "file_id": 232, + "content": "The code registers a custom pipeline for random cropping of images in PaddleVideo. It takes a target size as an argument and initializes the class with that target size. The __call__ method is used to perform random crop operations on a list of images. It first retrieves the original image sizes, ensures they are larger than the target size, then randomly selects x1 and y1 coordinates for the crop region, and appends the cropped image to a new list which is returned at the end.", + "type": "comment" + }, + "2882": { + "file_id": 232, + "content": " for img in imgs:\n if w == tw and h == th:\n crop_images.append(img)\n else:\n crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = crop_images\n return results\n@PIPELINES.register()\nclass CenterCrop(object):\n \"\"\"\n Center crop images.\n Args:\n target_size(int): Center crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs Center crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n ccrop_imgs: List where each item is a PIL.Image after Center crop.\n \"\"\"\n imgs = results['imgs']\n ccrop_imgs = []\n for img in imgs:\n w, h = img.size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:97-130" + }, + "2883": { + "file_id": 232, + "content": "This code performs center cropping of images to a specified target size. It iterates through the list of images, checks if they are already at the target size, and appends them to the crop_images list. If the image is not at the target size, it crops the image to the center square of the original image and adds it to the crop_images list. The final results dictionary contains the list of cropped images. 
The class CenterCrop initializes with a target_size parameter and defines a __call__ method for applying the center crop operation on input images.", + "type": "comment" + }, + "2884": { + "file_id": 232, + "content": " \"image width({}) and height({}) should be larger than crop size {}\".format(\n w, h, self.target_size)\n x1 = int(round((w - tw) / 2.))\n y1 = int(round((h - th) / 2.))\n ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = ccrop_imgs\n return results\n@PIPELINES.register()\nclass MultiScaleCrop(object):\n def __init__(\n self,\n target_size, #NOTE: named target size now, but still pass short size in it!\n scales=None,\n max_distort=1,\n fix_crop=True,\n more_fix_crop=True):\n self.target_size = target_size\n self.scales = scales if scales else [1, .875, .75, .66]\n self.max_distort = max_distort\n self.fix_crop = fix_crop\n self.more_fix_crop = more_fix_crop\n def __call__(self, results):\n \"\"\"\n Performs MultiScaleCrop operations.\n Args:\n imgs: List where wach item is a PIL.Image.\n XXX:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:131-160" + }, + "2885": { + "file_id": 232, + "content": "MultiScaleCrop applies image resizing and cropping to an input image. The target size, scales, max_distort, fix_crop, and more_fix_crop parameters are used for image manipulation. Images are cropped into smaller ones with varying sizes based on the defined scales.", + "type": "comment" + }, + "2886": { + "file_id": 232, + "content": " results:\n \"\"\"\n imgs = results['imgs']\n input_size = [self.target_size, self.target_size]\n im_size = imgs[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in self.scales]\n crop_h = [\n input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x\n for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= self.max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not self.fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:161-192" + }, + "2887": { + "file_id": 232, + "content": "This code defines a function to sample random crop sizes for image augmentation. It first calculates the possible crop sizes based on input size and scales, then filters pairs that have a difference within max_distort. 
Finally, it randomly chooses one of the filtered pairs for cropping and optionally adds a random offset if fix_crop is False.", + "type": "comment" + }, + "2888": { + "file_id": 232, + "content": " h_offset = random.randint(0, image_h - crop_pair[1])\n else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if h_step != 0 and w_step != 0:\n ret.append((4 * w_step, 4 * h_step)) # lower right\n if h_step != 0 or w_step != 0:\n ret.append((2 * w_step, 2 * h_step)) # center\n if self.more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:193-215" + }, + "2889": { + "file_id": 232, + "content": "This code generates a list of crop positions for an image. If the image height is greater than the second value in the crop pair, it randomly selects a horizontal offset. Otherwise, it calculates step sizes for width and height, and creates a list of crop positions using these steps. Additional crop positions are added if self.more_fix_crop is True.", + "type": "comment" + }, + "2890": { + "file_id": 232, + "content": " ret.append((3 * w_step, 1 * h_step)) # upper right quarter\n ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n return crop_pair[0], crop_pair[1], w_offset, h_offset\n crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size)\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))\n for img in imgs\n ]\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n results['imgs'] = ret_img_group\n return results\n@PIPELINES.register()\nclass RandomFlip(object):\n \"\"\"\n Random Flip images.\n Args:\n p(float): Random flip images with the probability p.\n \"\"\"\n def __init__(self, p=0.5):\n self.p = p\n def __call__(self, results):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:216-247" + }, + "2891": { + "file_id": 232, + "content": "The code randomly samples crop sizes from a set of predefined ratios, crops the input images accordingly, resizes them to the desired input size, and adds the flipped or cropped images to the results dictionary. 
It also includes an optional RandomFlip pipeline that randomly flips the image with a given probability.", + "type": "comment" + }, + "2892": { + "file_id": 232, + "content": " \"\"\"\n Performs random flip operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n flip_imgs: List where each item is a PIL.Image after random flip.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n results['imgs'] = [\n img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs\n ]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass Image2Array(object):\n \"\"\"\n transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'.\n Args:\n transpose: whether to transpose or not, default True, False for slowfast.\n \"\"\"\n def __init__(self, transpose=True):\n self.transpose = transpose\n def __call__(self, results):\n \"\"\"\n Performs Image to NumpyArray operations.\n Args:\n imgs: List where each item is a PIL.Image.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:248-281" + }, + "2893": { + "file_id": 232, + "content": "This code defines two classes: \"RandomFlip\" and \"Image2Array\". RandomFlip performs random flips on a list of PIL images, while Image2Array converts a PIL image to a numpy array with optional transpose. Both are registered as pipelines using @PIPELINES.", + "type": "comment" + }, + "2894": { + "file_id": 232, + "content": " For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n np_imgs: Numpy array.\n \"\"\"\n imgs = results['imgs']\n np_imgs = (np.stack(imgs)).astype('float32')\n if self.transpose:\n np_imgs = np_imgs.transpose(0, 3, 1, 2) #nchw\n results['imgs'] = np_imgs\n return results\n@PIPELINES.register()\nclass Normalization(object):\n \"\"\"\n Normalization.\n Args:\n mean(Sequence[float]): mean values of different channels.\n std(Sequence[float]): std values of different channels.\n tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3]\n \"\"\"\n def __init__(self, mean, std, tensor_shape=[3, 1, 1]):\n if not isinstance(mean, Sequence):\n raise TypeError(\n 'Mean must be list, tuple or np.ndarray, but got {type(mean)}')\n if not isinstance(std, Sequence):\n raise TypeError(\n 'Std must be list, tuple or np.ndarray, but got {type(std)}')\n self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:282-310" + }, + "2895": { + "file_id": 232, + "content": "This function converts a list of PIL images to a numpy array, optionally transposes it if needed, and stores the result in the 'imgs' key of the results dictionary. 
Additionally, the Normalization class initializes with mean and std values for normalization and reshapes them to fit the tensor shape.", + "type": "comment" + }, + "2896": { + "file_id": 232, + "content": " self.std = np.array(std).reshape(tensor_shape).astype(np.float32)\n def __call__(self, results):\n \"\"\"\n Performs normalization operations.\n Args:\n imgs: Numpy array.\n return:\n np_imgs: Numpy array after normalization.\n \"\"\"\n imgs = results['imgs']\n norm_imgs = imgs / 255.\n norm_imgs -= self.mean\n norm_imgs /= self.std\n results['imgs'] = norm_imgs\n return results\n@PIPELINES.register()\nclass JitterScale(object):\n \"\"\"\n Scale image, while the target short size is randomly select between min_size and max_size.\n Args:\n min_size: Lower bound for random sampler.\n max_size: Higher bound for random sampler.\n \"\"\"\n def __init__(self,\n min_size,\n max_size,\n short_cycle_factors=[0.5, 0.7071],\n default_min_size=256):\n self.default_min_size = default_min_size\n self.orig_min_size = self.min_size = min_size\n self.max_size = max_size", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:311-344" + }, + "2897": { + "file_id": 232, + "content": "The code is a part of the PaddleVideo library's VideoQualityAssessment module. It performs normalization operations on image arrays and registers a JitterScale class for image scaling with random short size selection between min_size and max_size, also including cycling factors for default minimum size functionality.", + "type": "comment" + }, + "2898": { + "file_id": 232, + "content": " self.short_cycle_factors = short_cycle_factors\n def __call__(self, results):\n \"\"\"\n Performs jitter resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.min_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_min_size))\n else:\n self.min_size = self.orig_min_size\n imgs = results['imgs']\n size = int(round(np.random.uniform(self.min_size, self.max_size)))\n assert (len(imgs) >= 1) , \\\n \"len(imgs):{} should be larger than 1\".format(len(imgs))\n width, height = imgs[0].size\n if (width <= height and width == size) or (height <= width\n and height == size):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:345-370" + }, + "2899": { + "file_id": 232, + "content": "This code performs jitter resize operations and applies random scaling. It takes a sequence of PIL.Image, scales each item based on min_size, max_size, and short_cycle_factors. If the number of images is less than 1, it throws an error. 
The size is determined by randomly selecting values between min_size and max_size.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/29.json b/docs/data/29.json new file mode 100644 index 000000000..e76f3362c --- /dev/null +++ b/docs/data/29.json @@ -0,0 +1,539 @@ +{ + "2900": { + "file_id": 232, + "content": " return results\n new_width = size\n new_height = size\n if width < height:\n new_height = int(math.floor((float(height) / width) * size))\n else:\n new_width = int(math.floor((float(width) / height) * size))\n frames_resize = []\n for j in range(len(imgs)):\n img = imgs[j]\n scale_img = img.resize((new_width, new_height), Image.BILINEAR)\n frames_resize.append(scale_img)\n results['imgs'] = frames_resize\n return results\n@PIPELINES.register()\nclass MultiCrop(object):\n \"\"\"\n Random crop image.\n This operation can perform multi-crop during multi-clip test, as in slowfast model.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self,\n target_size,\n default_crop_size=224,\n short_cycle_factors=[0.5, 0.7071],\n test_mode=False):\n self.orig_target_size = self.target_size = target_size", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:371-403" + }, + "2901": { + "file_id": 232, + "content": "This code resizes the input images to a specified size while maintaining aspect ratio, and then applies random crop for multi-clip testing in the MultiCrop class. The target_size parameter determines the output image's dimensions after resizing and cropping.", + "type": "comment" + }, + "2902": { + "file_id": 232, + "content": " self.short_cycle_factors = short_cycle_factors\n self.default_crop_size = default_crop_size\n self.test_mode = test_mode\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n spatial_sample_index = results['spatial_sample_index']\n spatial_num_clips = results['spatial_num_clips']\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.target_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_crop_size))\n else:\n self.target_size = self.orig_target_size # use saved value before call\n w, h = imgs[0].size\n if w == self.target_size and h == self.target_size:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:404-430" + }, + "2903": { + "file_id": 232, + "content": "The function performs random crop operations on images. It takes a list of PIL Images as input and returns the cropped images. The code checks if the current short cycle index is 0 or 1, in which case it adjusts the target size based on the short_cycle_factors variable. 
If the image size matches the target size, it skips the crop operation.", + "type": "comment" + }, + "2904": { + "file_id": 232, + "content": " return results\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size({},{})\".format(w, h, self.target_size, self.target_size)\n frames_crop = []\n if not self.test_mode:\n x_offset = random.randint(0, w - self.target_size)\n y_offset = random.randint(0, h - self.target_size)\n else: #multi-crop\n x_gap = int(\n math.ceil((w - self.target_size) / (spatial_num_clips - 1)))\n y_gap = int(\n math.ceil((h - self.target_size) / (spatial_num_clips - 1)))\n if h > w:\n x_offset = int(math.ceil((w - self.target_size) / 2))\n if spatial_sample_index == 0:\n y_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:\n y_offset = h - self.target_size\n else:\n y_offset = y_gap * spatial_sample_index\n else:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:431-452" + }, + "2905": { + "file_id": 232, + "content": "This function performs image cropping with or without random cropping. If not in test mode, it randomly selects x and y offsets within the image boundaries to crop an area of size self.target_size. In test mode, it performs multi-crop by dividing the image into equal parts based on spatial_num_clips, ensuring each part has a minimum size of self.target_size.", + "type": "comment" + }, + "2906": { + "file_id": 232, + "content": " y_offset = int(math.ceil((h - self.target_size) / 2))\n if spatial_sample_index == 0:\n x_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:\n x_offset = w - self.target_size\n else:\n x_offset = x_gap * spatial_sample_index\n for img in imgs:\n nimg = img.crop((x_offset, y_offset, x_offset + self.target_size,\n y_offset + self.target_size))\n frames_crop.append(nimg)\n results['imgs'] = frames_crop\n return results\n@PIPELINES.register()\nclass PackOutput(object):\n \"\"\"\n In slowfast model, we want to get slow pathway from fast pathway based on\n alpha factor.\n Args:\n alpha(int): temporal length of fast/slow\n \"\"\"\n def __init__(self, alpha):\n self.alpha = alpha\n def __call__(self, results):\n fast_pathway = results['imgs']\n # sample num points between start and end\n slow_idx_start = 0", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:453-484" + }, + "2907": { + "file_id": 232, + "content": "The code takes a list of images and crops them based on specified offset values to create new images with the desired target size. It then appends these cropped images to a list, stores them in 'frames_crop', and returns a dictionary containing 'imgs'. 
The function PackOutput is used for getting the slow pathway from the fast pathway based on the alpha factor in the SlowFast model.", + "type": "comment" + }, + "2908": { + "file_id": 232, + "content": " slow_idx_end = fast_pathway.shape[0] - 1\n slow_idx_num = fast_pathway.shape[0] // self.alpha\n slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end,\n slow_idx_num).astype(\"int64\")\n slow_pathway = fast_pathway[slow_idxs_select]\n # T H W C -> C T H W.\n slow_pathway = slow_pathway.transpose(3, 0, 1, 2)\n fast_pathway = fast_pathway.transpose(3, 0, 1, 2)\n # slow + fast\n frames_list = [slow_pathway, fast_pathway]\n results['imgs'] = frames_list\n return results", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:485-498" + }, + "2909": { + "file_id": 232, + "content": "This code is creating a slower pathway by selecting specific frames from the fast_pathway array and rearranging the dimensions. The slower pathway is then combined with the original fast_pathway to create a list of frames, which is added to the 'results' dictionary before returning it.", + "type": "comment" + }, + "2910": { + "file_id": 233, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py", + "type": "filepath" + }, + "2911": { + "file_id": 233, + "content": "The code defines the Compose class for video transformation pipelines, composing multiple pipeline elements and handling temporary list-type parameters while including a workaround for old format config files.", + "type": "summary" + }, + "2912": { + "file_id": 233, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom collections.abc import Sequence\nfrom ..registry import PIPELINES\nimport traceback\nfrom ...utils import build\nfrom ...utils import get_logger\n@PIPELINES.register()\nclass Compose(object):\n \"\"\"\n Composes several pipelines(include decode func, sample func, and transforms) together.\n Note: To deal with ```list``` type cfg temporaray, like:\n transform:\n - Crop: # A list\n attribute: 10", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:1-33" + }, + "2913": { + "file_id": 233, + "content": "This code defines the Compose class, which composes multiple pipelines (decode func, sample func, and transforms) together. It registers the class in the PIPELINES registry. 
The code also handles temporary list-type configuration parameters for flexibility.", + "type": "comment" + }, + "2914": { + "file_id": 233, + "content": " - Resize: # A list\n attribute: 20\n every key of list will pass as the key name to build a module.\n XXX: will be improved in the future.\n Args:\n pipelines (list): List of transforms to compose.\n Returns:\n A compose object which is callable, __call__ for this Compose\n object will call each given :attr:`transforms` sequencely.\n \"\"\"\n def __init__(self, pipelines):\n #assert isinstance(pipelines, Sequence)\n self.pipelines = []\n for p in pipelines.values():\n if isinstance(p, dict):\n p = build(p, PIPELINES)\n self.pipelines.append(p)\n elif isinstance(p, list):\n for t in p:\n #XXX: to deal with old format cfg, ugly code here!\n temp_dict = dict(name=list(t.keys())[0])\n for all_sub_t in t.values():\n if all_sub_t is not None:\n temp_dict.update(all_sub_t) \n t = build(temp_dict, PIPELINES)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:34-61" + }, + "2915": { + "file_id": 233, + "content": "This code is creating a Compose class which takes a list of transforms and composes them sequentially. It checks if the input is in the correct format, builds each transform using the build function from PIPELINES, and stores them in a list. The code also includes a workaround for handling old format config files that may have inconsistent key-value pairs in their lists.", + "type": "comment" + }, + "2916": { + "file_id": 233, + "content": " self.pipelines.append(t)\n elif callable(p):\n self.pipelines.append(p)\n else:\n raise TypeError('pipelines must be callable or a dict,'\n 'but got {type(p)}')\n def __call__(self, data):\n \"\"\"call\"\"\"\n for p in self.pipelines:\n try:\n data = p(data)\n except Exception as e:\n stack_info = traceback.format_exc()\n logger = get_logger(\"paddlevideo\")\n logger.info(\"fail to perform transform [{}] with error: \"\n \"{} and stack:\\n{}\".format(p, e, str(stack_info)))\n raise e\n return data", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:62-79" + }, + "2917": { + "file_id": 233, + "content": "This code is defining a class for video transformation pipelines. It appends callable functions or dictionaries to the pipeline list and has a __call__ method that applies each pipeline operation to data in sequence, handling exceptions and logging failures if they occur.", + "type": "comment" + }, + "2918": { + "file_id": 234, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py", + "type": "filepath" + }, + "2919": { + "file_id": 234, + "content": "The PaddleVideo library's VideoDecoder class decodes mp4 files into frames, handles RGB frames and audio, and provides data with masks. It includes functions for decoding, dequantizing feature vectors, and making one-hot labels.", + "type": "summary" + }, + "2920": { + "file_id": 234, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport sys\nfrom io import BytesIO\nimport os\nimport random\nimport numpy as np\nimport pickle\nimport cv2\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass VideoDecoder(object):\n \"\"\"\n Decode mp4 file to frames.\n Args:\n filepath: the file path of mp4 file\n \"\"\"\n def __init__(self):\n pass\n def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations.\n return:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:1-42" + }, + "2921": { + "file_id": 234, + "content": "This code is for a VideoDecoder class in the PaddleVideo library. It decodes mp4 files into frames as part of a pipeline. The class takes a file path argument and performs mp4 decode operations using the __call__ method, which processes results returned.", + "type": "comment" + }, + "2922": { + "file_id": 234, + "content": " List where each item is a numpy array after decoder.\n \"\"\"\n #XXX get info from results!!!\n file_path = results['filename']\n cap = cv2.VideoCapture(file_path)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n sampledFrames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty\n if ret == False:\n continue\n img = frame[:, :, ::-1]\n sampledFrames.append(img)\n results['frames'] = sampledFrames\n results['frames_len'] = len(sampledFrames)\n results['format'] = 'video'\n return results\n@PIPELINES.register()\nclass FrameDecoder(object):\n \"\"\"just parse results\n \"\"\"\n def __init__(self):\n pass\n def __call__(self, results):\n results['format'] = 'frame'\n return results\n@PIPELINES.register()\nclass FeatureDecoder(object):\n \"\"\"\n Perform feature decode operations.e.g.youtube8m\n \"\"\"\n def __init__(self, num_classes, max_len=512, has_label=True):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:43-80" + }, + "2923": { + "file_id": 234, + "content": "This code defines three classes for decoding different types of data: video, frames, and features. The VideoDecoder reads a video file frame by frame, the FrameDecoder parses results as individual frames, and the FeatureDecoder handles feature decode operations like YouTube8M. The results are stored in 'frames', 'frames_len', and 'format' fields respectively.", + "type": "comment" + }, + "2924": { + "file_id": 234, + "content": " self.max_len = max_len\n self.num_classes = num_classes\n self.has_label = has_label\n def __call__(self, results):\n \"\"\"\n Perform feature decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n #1. load pkl\n #2. parse to rgb/audio/\n #3. 
padding\n filepath = results['filename']\n data = pickle.load(open(filepath, 'rb'), encoding='bytes')\n record = data\n nframes = record[b'nframes']\n rgb = record[b'feature'].astype(float)\n audio = record[b'audio'].astype(float)\n if self.has_label:\n label = record[b'label']\n one_hot_label = self.make_one_hot(label, self.num_classes)\n rgb = rgb[0:nframes, :]\n audio = audio[0:nframes, :]\n rgb = self.dequantize(rgb,\n max_quantized_value=2.,\n min_quantized_value=-2.)\n audio = self.dequantize(audio,\n max_quantized_value=2,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:81-113" + }, + "2925": { + "file_id": 234, + "content": "This code is part of a decoding pipeline that loads and preprocesses data from a .pkl file. It extracts RGB frames, audio, and labels (if available), performs dequantization, and applies padding as needed. The results are returned as numpy arrays for further processing.", + "type": "comment" + }, + "2926": { + "file_id": 234, + "content": " min_quantized_value=-2)\n if self.has_label:\n results['labels'] = one_hot_label.astype(\"float32\")\n feat_pad_list = []\n feat_len_list = []\n mask_list = []\n vitem = [rgb, audio]\n for vi in range(2): #rgb and audio\n if vi == 0:\n prefix = \"rgb_\"\n else:\n prefix = \"audio_\"\n feat = vitem[vi]\n results[prefix + 'len'] = feat.shape[0]\n #feat pad step 1. padding\n feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),\n dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n results[prefix + 'data'] = feat_pad.astype(\"float32\")\n #feat pad step 2. mask\n feat_mask_origin = np.ones(feat.shape, dtype=np.float32)\n feat_mask_add = feat_add\n feat_mask = np.concatenate((feat_mask_origin, feat_mask_add),\n axis=0)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:114-139" + }, + "2927": { + "file_id": 234, + "content": "The code snippet initializes a list of feature paddings, lengths, and masks for both rgb and audio data. It iterates through the two types of data (rgb and audio) to populate the results dictionary with information about each type of data, including its length and padded feature data along with their respective masks.", + "type": "comment" + }, + "2928": { + "file_id": 234, + "content": " results[prefix + 'mask'] = feat_mask.astype(\"float32\")\n return results\n def dequantize(self,\n feat_vector,\n max_quantized_value=2.,\n min_quantized_value=-2.):\n \"\"\"\n Dequantize the feature from the byte format to the float format\n \"\"\"\n assert max_quantized_value > min_quantized_value\n quantized_range = max_quantized_value - min_quantized_value\n scalar = quantized_range / 255.0\n bias = (quantized_range / 512.0) + min_quantized_value\n return feat_vector * scalar + bias\n def make_one_hot(self, label, dim=3862):\n \"\"\"make one hot\"\"\"\n one_hot_label = np.zeros(dim)\n one_hot_label = one_hot_label.astype(float)\n for ind in label:\n one_hot_label[int(ind)] = 1\n return one_hot_label", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:140-165" + }, + "2929": { + "file_id": 234, + "content": "The code contains functions for decoding, dequantizing feature vectors, and making one-hot labels. 
The decode function stores the feature mask in a dictionary, the dequantize function scales and translates the quantized values back to float format, and the make_one_hot function creates one-hot encoded labels from given indices.", + "type": "comment" + }, + "2930": { + "file_id": 235, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py", + "type": "filepath" + }, + "2931": { + "file_id": 235, + "content": "Mixup class in PaddleVideo enhances video quality assessment by mixing images and labels from batches using adjustable alpha values, while Cutmix operator randomly selects boxes for mixing operations. Data augmentation is applied with random bounding boxes, and lambda is calculated for loss calculation.", + "type": "summary" + }, + "2932": { + "file_id": 235, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport random\nimport numpy as np\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass Mixup(object):\n \"\"\"\n Mixup operator.\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:1-36" + }, + "2933": { + "file_id": 235, + "content": "This code defines a Mixup class for video quality assessment using PaddleVideo. It is an operator that randomly mixes images and labels from batches to enhance the model's learning ability, with an adjustable alpha value.", + "type": "comment" + }, + "2934": { + "file_id": 235, + "content": " labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n lams = np.array([lam] * bs, dtype=np.float32)\n imgs = lam * imgs + (1 - lam) * imgs[idx]\n return list(zip(imgs, labels, labels[idx], lams))\n@PIPELINES.register()\nclass Cutmix(object):\n \"\"\" Cutmix operator\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def rand_bbox(self, size, lam):\n \"\"\" rand_bbox \"\"\"\n w = size[2]\n h = size[3]\n cut_rat = np.sqrt(1. - lam)\n cut_w = np.int(w * cut_rat)\n cut_h = np.int(h * cut_rat)\n # uniform\n cx = np.random.randint(w)\n cy = np.random.randint(h)\n bbx1 = np.clip(cx - cut_w // 2, 0, w)\n bby1 = np.clip(cy - cut_h // 2, 0, h)\n bbx2 = np.clip(cx + cut_w // 2, 0, w)\n bby2 = np.clip(cy + cut_h // 2, 0, h)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:37-72" + }, + "2935": { + "file_id": 235, + "content": "This code defines the Cutmix operator, which is used to mix images and their corresponding labels in a dataset. 
It takes an alpha parameter that determines the mixing ratio, and randomly selects a box to cut out from each image. It then applies a random mixing operation within this box to create augmented versions of both the image and label. The final output is a list containing the original image, its original label, its new mixed label, and an array of the lambda values used for the mixing process.", + "type": "comment" + }, + "2936": { + "file_id": 235, + "content": " return bbx1, bby1, bbx2, bby2\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)\n labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)\n imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]\n lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /\n (imgs.shape[-2] * imgs.shape[-1]))\n lams = np.array([lam] * bs, dtype=np.float32)\n return list(zip(imgs, labels, labels[idx], lams))", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:74-91" + }, + "2937": { + "file_id": 235, + "content": "This function generates random bounding boxes and applies data augmentation by replacing portions of images with random patches from the same image. It also calculates lambda, which is used for weighting the original and augmented samples in the loss calculation. The function returns the modified images, labels, original labels, and lambdas.", + "type": "comment" + }, + "2938": { + "file_id": 236, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py", + "type": "filepath" + }, + "2939": { + "file_id": 236, + "content": "This code defines a Sampler class for sampling frame IDs in video data, using PIL to read images instead of OpenCV, and returns the index of sampled frames. It can calculate indices randomly or by formula.", + "type": "summary" + }, + "2940": { + "file_id": 236, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport random\nfrom PIL import Image\nfrom ..registry import PIPELINES\nimport os\nimport numpy as np\n@PIPELINES.register()\nclass Sampler(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n mode(str): 'train', 'valid'\n Returns:\n frames_idx: the index of sampled #frames.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:1-32" + }, + "2941": { + "file_id": 236, + "content": "This code defines a Sampler class that samples frames IDs for video data. It takes arguments: num_seg (number of segments), seg_len (number of sampled frames in each segment), and mode ('train' or 'valid'). The class uses PIL to read images instead of OpenCV (cv2) for better compatibility. 
The sampler returns the index of sampled frames.", + "type": "comment" + }, + "2942": { + "file_id": 236, + "content": " \"\"\"\n def __init__(self, num_seg, seg_len, valid_mode=False):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.valid_mode = valid_mode\n def _get(self, frames_idx, results):\n data_format =results['format']\n if data_format == \"frame\":\n frame_dir = results['frame_dir']\n imgs = []\n for idx in frames_idx:\n img = Image.open(os.path.join(frame_dir, results['suffix'].format(idx))).convert('RGB')\n imgs.append(img)\n elif data_format == \"video\":\n frames = np.array(results['frames'])\n imgs = []\n for idx in frames_idx:\n imgbuf = frames[idx]\n img = Image.fromarray(imgbuf, mode='RGB')\n imgs.append(img)\n else:\n raise NotImplementedError\n results['imgs'] = imgs\n return results\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:33-70" + }, + "2943": { + "file_id": 236, + "content": "The code defines a class with an initialization function and two methods, \"_get\" and \"__call__\". The \"_get\" method takes frames_idx and results as arguments, and based on the data format (frame or video), it retrieves and appends images to imgs. If the format is not frame or video, it raises a NotImplementedError. The \"__call__\" method takes frames_len as an argument and returns a sampling id.", + "type": "comment" + }, + "2944": { + "file_id": 236, + "content": " \"\"\"\n frames_len = int(results['frames_len'])\n average_dur = int(int(frames_len) / self.num_seg)\n frames_idx = []\n for i in range(self.num_seg):\n idx = 0\n if not self.valid_mode:\n if average_dur >= self.seg_len:\n idx = random.randint(0, average_dur - self.seg_len)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else: # average_dur = 0\n idx = i % frames_len\n else:\n if average_dur >= self.seg_len:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i % frames_len\n for jj in range(idx, idx+self.seg_len):\n if results['format'] == 'video':\n frames_idx.append(int(jj%frames_len))\n elif results['format'] == 'frame':", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:71-96" + }, + "2945": { + "file_id": 236, + "content": "This code calculates frame indices for video or frame data. It takes in 'frames_len' and 'num_seg' as inputs, and if 'valid_mode' is False, it generates random frame indices within the valid frame range. If 'valid_mode' is True, it calculates frame indices based on specific formulas. The output is stored in 'frames_idx'.", + "type": "comment" + }, + "2946": { + "file_id": 236, + "content": " #frame from 000001\n frames_idx.append(jj+1)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:97-102" + }, + "2947": { + "file_id": 236, + "content": "This code snippet is part of a class method that retrieves frames from a video file based on their index. If the frame index (jj+1) is not equal to 0, it appends the index to the frames_idx list; otherwise, it raises a NotImplementedError. 
The method then returns the results using the _get method with the frames_idx and results as arguments.", + "type": "comment" + }, + "2948": { + "file_id": 237, + "content": "/applications/VideoQualityAssessment/paddlevideo/loader/registry.py", + "type": "filepath" + }, + "2949": { + "file_id": 237, + "content": "This code is importing modules and creating registries for pipelines and datasets in the PaddleVideo application. The registries allow easy management of different pipeline and dataset types, making it convenient to extend or customize them later on.", + "type": "summary" + }, + "2950": { + "file_id": 237, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom ..utils import Registry\nPIPELINES = Registry(\"pipeline\")\nDATASETS = Registry(\"datasets\")", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/loader/registry.py:1-20" + }, + "2951": { + "file_id": 237, + "content": "This code is importing modules and creating registries for pipelines and datasets in the PaddleVideo application. The registries allow easy management of different pipeline and dataset types, making it convenient to extend or customize them later on.", + "type": "comment" + }, + "2952": { + "file_id": 238, + "content": "/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py", + "type": "filepath" + }, + "2953": { + "file_id": 238, + "content": "This code file contains the initialization for a Video Quality Assessment application. It includes registrar for metrics, builder function for metrics and defines the QualityMetric class. The code is licensed under Apache License, Version 2.0 and distributed as-is without warranties or conditions.", + "type": "summary" + }, + "2954": { + "file_id": 238, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .registry import METRIC\nfrom .build import build_metric\nfrom .quality_metric import QuqlityMetric\n__all__ = [\n 'METRIC', 'build_metric', 'QuqlityMetric'\n]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py:1-23" + }, + "2955": { + "file_id": 238, + "content": "This code file contains the initialization for a Video Quality Assessment application. It includes registrar for metrics, builder function for metrics and defines the QualityMetric class. 
The code is licensed under Apache License, Version 2.0 and distributed as-is without warranties or conditions.", + "type": "comment" + }, + "2956": { + "file_id": 239, + "content": "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py", + "type": "filepath" + }, + "2957": { + "file_id": 239, + "content": "The BaseMetric class serves as a foundation for various video quality assessment metrics, requiring subclasses to implement the update and overridden methods. It utilizes numpy, paddle, and PaddleVideo's utils for data manipulation and distribution information.", + "type": "summary" + }, + "2958": { + "file_id": 239, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nfrom abc import abstractmethod\nimport numpy as np\nimport paddle\nfrom paddlevideo.utils import get_dist_info\nfrom .registry import METRIC\nclass BaseMetric(object):\n \"\"\"Base Metric\"\"\"\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n self.data_size = data_size\n self.batch_size = batch_size\n _, self.world_size = get_dist_info()\n self.log_interval = log_interval\n @abstractmethod\n def update(self):\n \"\"\"update\"\"\"\n raise NotImplementedError\n @abstractmethod", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py:1-36" + }, + "2959": { + "file_id": 239, + "content": "This Python class, named BaseMetric, is a base class for different video quality assessment metrics. It initializes with data size, batch size, log interval, and optional keyword arguments. The update method must be implemented by subclasses to update the metric values. The class also has abstract methods that must be overridden in subclasses for actual functionality. It utilizes numpy, paddle, and PaddleVideo's utils for data manipulation and distribution information.", + "type": "comment" + }, + "2960": { + "file_id": 239, + "content": " def accumulate(self):\n \"\"\"accumulate\"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py:37-39" + }, + "2961": { + "file_id": 239, + "content": "This code defines an \"accumulate\" method in a base class, but it raises a NotImplementedError to indicate that subclasses must override this method with their own implementation.", + "type": "comment" + }, + "2962": { + "file_id": 240, + "content": "/applications/VideoQualityAssessment/paddlevideo/metrics/build.py", + "type": "filepath" + }, + "2963": { + "file_id": 240, + "content": "This code file is a part of the PaddleVideo library and contains a function named \"build_metric\". It imports necessary modules, defines a metric registry, and provides a build function to construct metrics according to the specified configuration (cfg). The code is licensed under Apache License 2.0.", + "type": "summary" + }, + "2964": { + "file_id": 240, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .registry import METRIC\nfrom ..utils import build\ndef build_metric(cfg):\n \"\"\"build metric\"\"\"\n return build(cfg, METRIC)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/build.py:1-23" + }, + "2965": { + "file_id": 240, + "content": "This code file is a part of the PaddleVideo library and contains a function named \"build_metric\". It imports necessary modules, defines a metric registry, and provides a build function to construct metrics according to the specified configuration (cfg). The code is licensed under Apache License 2.0.", + "type": "comment" + }, + "2966": { + "file_id": 241, + "content": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py", + "type": "filepath" + }, + "2967": { + "file_id": 241, + "content": "This code calculates Pearson and Spearman correlation coefficients (PLCC & SROCC) for a given output and label pair using numpy arrays and scipy's stats functions.", + "type": "summary" + }, + "2968": { + "file_id": 241, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nimport numpy as np\nimport paddle\nfrom paddle.hapi.model import _all_gather\nfrom scipy import stats\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass QuqlityMetric(BaseMetric):\n \"\"\"CenterCropQualityMetric\"\"\"\n def __init__(self, data_size, batch_size, log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.output = []\n self.label = []", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:1-35" + }, + "2969": { + "file_id": 241, + "content": "This code defines the QuqlityMetric class for measuring video quality. It imports necessary libraries, registers it with METRIC, and initializes attributes including data_size, batch_size, and log_interval. 
The output and label lists are used to store data during processing.", + "type": "comment" + }, + "2970": { + "file_id": 241, + "content": " self.y_pred = np.zeros(data_size)\n self.y_test = np.zeros(data_size)\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n labels = data[1]\n predict_output = paddle.tolist(outputs)\n predict_label = paddle.tolist(labels)\n predict_output_len = len(predict_output)\n for i in range(predict_output_len):\n self.output.append(predict_output[i][0])\n self.label.append(predict_label[i][0])\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n test_output_np = np.array(self.output)\n test_label_np = np.array(self.label)\n PLCC = stats.pearsonr(test_output_np, test_label_np)[0]\n SROCC = stats.spearmanr(test_output_np, test_label_np)[0]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:36-62" + }, + "2971": { + "file_id": 241, + "content": "This code defines a class for calculating Pearson and Spearman correlation coefficients. The `update` method updates the metrics for each batch during training, while the `accumulate` method calculates the final Pearson (PLCC) and Spearman (SROCC) correlation coefficients after all iterations are finished.", + "type": "comment" + }, + "2972": { + "file_id": 241, + "content": " logger.info('[TEST] finished, PLCC= {}, SROCC= {} '.format(PLCC, SROCC))\n def accumulate_train(self, output, label):\n \"\"\"accumulate_train\"\"\"\n output_np = np.array(output)\n label_np = np.array(label)\n PLCC = stats.pearsonr(output_np, label_np)[0]\n SROCC = stats.spearmanr(output_np, label_np)[0]\n return PLCC, SROCC", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:64-72" + }, + "2973": { + "file_id": 241, + "content": "This code snippet calculates the Pearson and Spearman correlation coefficients (PLCC and SROCC) for a given output and label pair. It uses numpy arrays to convert the input into numeric data types, then calculates the correlation values using scipy's stats.pearsonr and stats.spearmanr functions respectively. Finally, it returns the calculated PLCC and SROCC values.", + "type": "comment" + }, + "2974": { + "file_id": 242, + "content": "/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py", + "type": "filepath" + }, + "2975": { + "file_id": 242, + "content": "This code is importing the \"Registry\" class from the \"utils\" module and initializing a new instance called \"METRIC\" that will store different types of metrics. The comment indicates it is part of the PaddleVideo library for Video Quality Assessment, licensed under the Apache License 2.0.", + "type": "summary" + }, + "2976": { + "file_id": 242, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom ..utils import Registry\nMETRIC = Registry('metric')", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py:1-19" + }, + "2977": { + "file_id": 242, + "content": "This code is importing the \"Registry\" class from the \"utils\" module and initializing a new instance called \"METRIC\" that will store different types of metrics. The comment indicates it is part of the PaddleVideo library for Video Quality Assessment, licensed under the Apache License 2.0.", + "type": "comment" + }, + "2978": { + "file_id": 243, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py", + "type": "filepath" + }, + "2979": { + "file_id": 243, + "content": "The code imports modules, registers them in the registry, and exports a list of model-building modules with functions to build these models and specific classes like ResNet and TSNHead.", + "type": "summary" + }, + "2980": { + "file_id": 243, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .backbones import ResNet\nfrom .builder import (build_backbone, build_head, build_recognizer,\n build_localizer, build_loss)\nfrom .heads import BaseHead, TSNHead, TSMRecHead\nfrom .losses import SmoothL1Loss, L1Loss\nfrom .framework.recognizers import BaseRecognizer, recognizer2d\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS\nfrom .weight_init import weight_init_", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py:1-24" + }, + "2981": { + "file_id": 243, + "content": "This code is importing various modules from different sub-directories and registers them in the registry. 
It also includes a license notice and a function for weight initialization.", + "type": "comment" + }, + "2982": { + "file_id": 243, + "content": "__all__ = [\n 'BACKBONES',\n 'HEADS',\n 'RECOGNIZERS',\n 'LOCALIZERS',\n 'LOSSES',\n 'build_recognizer',\n 'build_localizer',\n 'build_head',\n 'build_backbone',\n 'build_loss',\n 'ResNet',\n 'TSNHead',\n 'BaseHead',\n 'TSMRecHead',\n 'BaseRecognizer',\n 'Recognizer2d',\n 'SmoothL1Loss',\n 'L1Loss',\n]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py:26-45" + }, + "2983": { + "file_id": 243, + "content": "This code exports a list of modules for model building, including backbones, heads, recognizers, localizers, and losses. It also includes functions to build these models and specific model classes like ResNet and TSNHead.", + "type": "comment" + }, + "2984": { + "file_id": 244, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py", + "type": "filepath" + }, + "2985": { + "file_id": 244, + "content": "This code file imports two backbone models (ResNet and ResNetTweaksTSM) from their respective modules, and then defines the available models in this module as 'ResNet' and 'ResNetTweaksTSM'.", + "type": "summary" + }, + "2986": { + "file_id": 244, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .resnet import ResNet\nfrom .resnet_tweaks_tsm import ResNetTweaksTSM\n__all__ = ['ResNet', 'ResNetTweaksTSM']", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py:1-20" + }, + "2987": { + "file_id": 244, + "content": "This code file imports two backbone models (ResNet and ResNetTweaksTSM) from their respective modules, and then defines the available models in this module as 'ResNet' and 'ResNetTweaksTSM'.", + "type": "comment" + }, + "2988": { + "file_id": 245, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py", + "type": "filepath" + }, + "2989": { + "file_id": 245, + "content": "ConvBNLayer combines Conv2D and BatchNorm2D in PaddlePaddle's ResNet class, using BasicBlock and BottleneckBlock with optional shortcut connections. The code dynamically creates layers, initializes weights, performs convolution and pooling operations, for a customizable deep learning model backbone.", + "type": "summary" + }, + "2990": { + "file_id": 245, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:1-35" + }, + "2991": { + "file_id": 245, + "content": "This code defines the ConvBNLayer class, which is a combination of Conv2D and BatchNorm2D layers. It is part of a PaddlePaddle deep learning model backbone. The class takes arguments for its constructor, suggesting it is customizable or can be initialized with specific parameters. The weight initialization function is also imported to initialize the layer's weights. This could indicate that this class may involve complex neural network layers for image processing tasks like image classification or object detection.", + "type": "comment" + }, + "2992": { + "file_id": 245, + "content": " in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:36-58" + }, + "2993": { + "file_id": 245, + "content": "This code defines a ConvBNLayer class that takes parameters such as in_channels, out_channels, kernel_size, stride (default 1), groups (default 1), activation function (act) and name. It inherits from another class, super(ConvBNLayer, self). 
It then initializes the Conv2D layer with the provided parameters and is followed by an init_weights method for weight and bias initialization.", + "type": "comment" + }, + "2994": { + "file_id": 245, + "content": " stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(name=bn_name +\n \"_scale\"),\n bias_attr=ParamAttr(bn_name + \"_offset\"))\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n \"\"\"BottleneckBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:59-89" + }, + "2995": { + "file_id": 245, + "content": "This code defines a Convolutional Neural Network (CNN) layer with optional batch normalization and activation. It is initialized in the ResNet class, which also contains a forward function for feed-forward computation. The BottleneckBlock class extends this design to create a bottleneck block.", + "type": "comment" + }, + "2996": { + "file_id": 245, + "content": " stride,\n shortcut=True,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:90-113" + }, + "2997": { + "file_id": 245, + "content": "This code defines a BottleneckBlock class with multiple ConvBNLayer instances for the \"branch2a\", \"branch2b\", and \"branch2c\" layers. The BottleneckBlock class is a building block for ResNet architecture in PaddleVideo, used to perform convolutional operations with specific parameters.", + "type": "comment" + }, + "2998": { + "file_id": 245, + "content": " out_channels=out_channels * 4,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n \"\"\"BasicBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:114-146" + }, + "2999": { + "file_id": 245, + "content": "This code defines a class for a BasicBlock in a convolutional neural network. 
It contains a ConvBNLayer, another ConvBNLayer, and an optional shortcut connection. The forward function performs the operations within the block and returns the output after applying ReLU activation.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/3.json b/docs/data/3.json new file mode 100644 index 000000000..d1075e268 --- /dev/null +++ b/docs/data/3.json @@ -0,0 +1,539 @@ +{ + "300": { + "file_id": 30, + "content": " result = {'overlays': overlays}\n # result = {'masks': masks.tolist()}\n with open(TEMP_JSON_FINAL_PATH, 'w') as f:\n json.dump(result, f)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/api.py:131-134" + }, + "301": { + "file_id": 30, + "content": "This code is saving a dictionary of overlays to a JSON file. It was previously also saving a list of masks, but that functionality has been commented out. The dictionary contains the overlays and the resulting JSON will be written to the specified temporary path.", + "type": "comment" + }, + "302": { + "file_id": 31, + "content": "/applications/EIVideo/EIVideo/main.py", + "type": "filepath" + }, + "303": { + "file_id": 31, + "content": "This code trains the PaddleVideo model using command line arguments, initializes the environment, and performs operations with distributed training and automatic mixed precision support.", + "type": "summary" + }, + "304": { + "file_id": 31, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless requifFred by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport random\nimport numpy as np\nimport paddle\nfrom EIVideo.paddlevideo.tasks import (test_model)\nfrom EIVideo.paddlevideo.utils import get_config, get_dist_info\nfrom EIVideo import EI_VIDEO_ROOT, join_root_path\nDEF_CONFIG_FILE_PATH = join_root_path(\"configs/manet.yaml\")\nDEF_PARAMS_FILE_PATH = join_root_path(\"model/default_manet.pdparams\")\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo train script\")", + "type": "code", + "location": "/applications/EIVideo/EIVideo/main.py:1-29" + }, + "305": { + "file_id": 31, + "content": "This code is a Python script for training the PaddleVideo model. It imports necessary modules, defines functions to parse command line arguments and sets default configuration and parameter files. The script uses argparse to create an argument parser with a description \"PaddleVideo train script\". 
It also provides default paths for config file (\"configs/manet.yaml\") and parameter file (\"model/default_manet.pdparams\").", + "type": "comment" + }, + "306": { + "file_id": 31, + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default=DEF_CONFIG_FILE_PATH,\n help='config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('--test',\n action='store_true',\n help='whether to test a model')\n parser.add_argument('--train_dali',\n action='store_true',\n help='whether to use dali to speed up training')\n parser.add_argument('--multigrid',\n action='store_true',\n help='whether to use multigrid training')\n parser.add_argument('-w',\n '--weights',\n type=str,\n default=DEF_PARAMS_FILE_PATH,\n help='weights for finetuning or testing')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/main.py:30-53" + }, + "307": { + "file_id": 31, + "content": "This code defines command line arguments for the EIVideo application. It sets default values and provides help messages for config file path, overriding options, testing a model, using Dali for training speedup, multigrid training, and weights for finetuning or testing.", + "type": "comment" + }, + "308": { + "file_id": 31, + "content": " parser.add_argument('--fleet',\n action='store_true',\n help='whether to use fleet run distributed training')\n parser.add_argument('--amp',\n action='store_true',\n help='whether to open amp training.')\n parser.add_argument(\n '--validate',\n action='store_true',\n help='whether to evaluate the checkpoint during training')\n parser.add_argument(\n '--seed',\n type=int,\n default=None,\n help='fixed all random seeds when the program is running')\n parser.add_argument(\n '--max_iters',\n type=int,\n default=None,\n help='max iterations when training(this argonly used in test_tipc)')\n parser.add_argument(\n '-p',\n '--profiler_options',\n type=str,\n default=None,\n help='The option of profiler, which should be in format '\n '\\\"key1=value1;key2=value2;key3=value3\\\".')\n parser.add_argument('--use_npu',\n type=bool,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/main.py:54-82" + }, + "309": { + "file_id": 31, + "content": "This code snippet adds command-line arguments to a parser object. 
The \"--fleet\" argument enables distributed training using fleet, \"--amp\" enables automatic mixed precision training, \"--validate\" triggers checkpoint evaluation during training, \"--seed\" sets random seeds for deterministic behavior, \"--max_iters\" sets the maximum number of iterations, and \"--profiler_options\" sets profiler options in key-value pairs.", + "type": "comment" + }, + "310": { + "file_id": 31, + "content": " default=False,\n help='whether use npu.')\n args = parser.parse_args()\n return args\ndef main(**kwargs):\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n # ToDo To AP-kai: 下面这行代码目的是更新配置,这样的话我们调用main(use_npu = Ture),这时cfg.use_npu就是Ture了\n for key, value in kwargs.items():\n cfg.__setattr__(key, value)\n # set seed if specified\n seed = args.seed\n if seed is not None:\n assert isinstance(\n seed,\n int), f\"seed must be a integer when specified, but got {seed}\"\n paddle.seed(seed)\n np.random.seed(seed)\n random.seed(seed)\n _, world_size = get_dist_info()\n parallel = world_size != 1\n if parallel:\n paddle.distributed.init_parallel_env()\n final = test_model(cfg, weights=args.weights, parallel=parallel)\n return final\nif __name__ == '__main__':\n main(video_path='example/example1.mp4', save_path='./output')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/main.py:83-116" + }, + "311": { + "file_id": 31, + "content": "This code defines a `main` function that parses command-line arguments, updates the configuration with optional kwargs, sets the random seed if specified, initializes parallel environment if necessary, and then calls `test_model` to perform some operation. Finally, it returns the final result. It is called as `main(video_path='example/example1.mp4', save_path='./output')`.", + "type": "comment" + }, + "312": { + "file_id": 32, + "content": "/applications/EIVideo/EIVideo/paddlevideo/__init__.py", + "type": "filepath" + }, + "313": { + "file_id": 32, + "content": "This code snippet is importing the paddlevideo_version from the version module. This suggests that this file is serving as an initialization point for the PaddleVideo library, potentially setting up necessary imports or defining constants and functions to be used throughout the library.", + "type": "summary" + }, + "314": { + "file_id": 32, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .version import paddlevideo_version", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/__init__.py:1-15" + }, + "315": { + "file_id": 32, + "content": "This code snippet is importing the paddlevideo_version from the version module. 
This suggests that this file is serving as an initialization point for the PaddleVideo library, potentially setting up necessary imports or defining constants and functions to be used throughout the library.", + "type": "comment" + }, + "316": { + "file_id": 33, + "content": "/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py", + "type": "filepath" + }, + "317": { + "file_id": 33, + "content": "This code imports necessary functions and classes from other modules, defines the exported symbols (build_batch_pipeline and Compose), and sets license information.", + "type": "summary" + }, + "318": { + "file_id": 33, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .builder import build_batch_pipeline\nfrom .pipelines.compose import Compose\n__all__ = [\n 'build_batch_pipeline','Compose'\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py:1-20" + }, + "319": { + "file_id": 33, + "content": "This code imports necessary functions and classes from other modules, defines the exported symbols (build_batch_pipeline and Compose), and sets license information.", + "type": "comment" + }, + "320": { + "file_id": 34, + "content": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py", + "type": "filepath" + }, + "321": { + "file_id": 34, + "content": "This code creates PaddleVideo dataset loaders and sets up signal handlers for graceful termination of a process group upon receiving SIGINT or SIGTERM signals.", + "type": "summary" + }, + "322": { + "file_id": 34, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport signal\nimport os\nimport paddle\nfrom paddle.io import BatchSampler, DataLoader, DistributedBatchSampler\nfrom .pipelines.compose import Compose\nfrom .registry import DATASETS, PIPELINES, DATALOADERS, BATCH_SAMPLERS, SAMPLERS\nfrom ..utils import get_logger\nfrom ..utils.build_utils import build\nimport numpy as np\nlogger = get_logger(\"paddlevideo\")\ndef build_pipeline(cfg):\n \"\"\"Build pipeline.\n Args:\n cfg (dict): root config dict.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:1-31" + }, + "323": { + "file_id": 34, + "content": "This code snippet is a part of the PaddleVideo library and contains a function named build_pipeline. 
It imports various modules, defines a logger for logging purposes, and uses a function called build from utils. This function seems to be building some kind of pipeline based on the provided configuration (cfg). The purpose of this pipeline might be to process data or prepare it for model training in the context of PaddleVideo.", + "type": "comment" + }, + "324": { + "file_id": 34, + "content": " \"\"\"\n if cfg == None:\n return\n return Compose(cfg)\ndef build_dataset(cfg):\n \"\"\"Build dataset.\n Args:\n cfg (dict): root config dict.\n Returns:\n dataset: dataset.\n \"\"\"\n # XXX: ugly code here!\n cfg_dataset, cfg_pipeline = cfg\n cfg_dataset.pipeline = build_pipeline(cfg_pipeline)\n dataset = build(cfg_dataset, DATASETS, key=\"format\")\n return dataset\ndef build_sampler(cfg):\n \"\"\"Build batch_sampler.\n Args:\n cfg (dict): root config dict.\n Returns:\n batch_sampler: batch_sampler.\n \"\"\"\n sampler = build(cfg, SAMPLERS)\n return sampler\ndef build_batch_pipeline(cfg):\n batch_pipeline = build(cfg, PIPELINES)\n return batch_pipeline\ndef build_custom_dataloader(cfg):\n custom_dataloader = build(cfg, DATALOADERS, key='dataloader')\n return custom_dataloader\ndef build_dataloader(dataset,\n batch_size,\n num_workers,\n places=None,\n shuffle=True,\n drop_last=True,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:32-80" + }, + "325": { + "file_id": 34, + "content": "This code defines several functions to build different components for a dataset loader. The main function is `build_dataset` which takes a configuration dictionary and returns a dataset object after building the pipeline, dataset, sampler, and dataloader as per the given configuration. It uses other helper functions like `build_pipeline`, `build_sampler`, `build_batch_pipeline`, and `build_custom_dataloader` to build these components.", + "type": "comment" + }, + "326": { + "file_id": 34, + "content": " multigrid=False,\n collate_fn_cfg=None,\n **kwargs):\n \"\"\"Build Paddle Dataloader.\n XXX explain how the batch_sampler work!\n Args:\n dataset (paddle.dataset): A PaddlePaddle dataset object.\n batch_size (int): batch size on single card.\n num_worker (int): num_worker\n shuffle(bool): whether to shuffle the data at every epoch.\n \"\"\"\n if not kwargs.get('sampler'):\n batch_sampler = DistributedBatchSampler(dataset,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)\n else:\n sampler = build_sampler(kwargs['sampler'])\n batch_sampler = BatchSampler(dataset,\n sampler=sampler,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:81-106" + }, + "327": { + "file_id": 34, + "content": "The code builds a Paddle Dataloader with optional custom sampler, shuffles data if necessary, and handles distributed batch sampling. It takes dataset, batch size, number of workers, and shuffle settings as input arguments.", + "type": "comment" + }, + "328": { + "file_id": 34, + "content": " kwargs.update({'batch_sampler': batch_sampler})\n # NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix.\n # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to:\n # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] 
as using numpy.transpose.\n def mix_collate_fn(batch):\n pipeline = build_batch_pipeline(collate_fn_cfg)\n batch = pipeline(batch)\n slots = []\n for items in batch:\n for i, item in enumerate(items):\n if len(slots) < len(items):\n slots.append([item])\n else:\n slots[i].append(item)\n return [np.stack(slot, axis=0) for slot in slots]\n # if collate_fn_cfg is not None:\n # ugly code here. collate_fn is mix op config\n # collate_fn = mix_collate_fn(collate_fn_cfg)\n data_loader = DataLoader(\n dataset,\n places=places,\n num_workers=num_workers,\n collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:107-134" + }, + "329": { + "file_id": 34, + "content": "This code defines a mix_collate_fn for handling batches of data in a specific way. It first builds a batch pipeline and applies it to the input batch. Then, it collates the batch so that each item is stacked horizontally (axis=0) into a new batch. This function is used as the collate_fn if the collate_fn_cfg is not None.", + "type": "comment" + }, + "330": { + "file_id": 34, + "content": " **kwargs)\n return data_loader\ndef term_mp(sig_num, frame):\n \"\"\" kill all child processes\n \"\"\"\n pid = os.getpid()\n pgid = os.getpgid(os.getpid())\n logger.info(\"main proc {} exit, kill process group \" \"{}\".format(pid, pgid))\n os.killpg(pgid, signal.SIGKILL)\n return\nsignal.signal(signal.SIGINT, term_mp)\nsignal.signal(signal.SIGTERM, term_mp)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:135-151" + }, + "331": { + "file_id": 34, + "content": "This code is setting up signal handlers for SIGINT and SIGTERM signals. It retrieves the process ID (pid) and process group ID (pgid), logs a message, then sends a SIGKILL signal to all processes in the group upon receiving either of those signals.", + "type": "comment" + }, + "332": { + "file_id": 35, + "content": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py", + "type": "filepath" + }, + "333": { + "file_id": 35, + "content": "This code file contains import statements and a list of functions used for image preprocessing in PaddleVideo's EIVideo application. It includes Resize, RandomCrop, RandomHorizontalFlip, ToTensor, and RandomScale transformations specific to the \"manet\" model. These transformations are part of PaddlePaddle's video processing framework.", + "type": "summary" + }, + "334": { + "file_id": 35, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .custom_transforms_f import Resize_manet, RandomCrop_manet, RandomHorizontalFlip_manet, ToTensor_manet, \\\n RandomScale_manet\n__all__ = [\n 'Resize_manet', 'RandomCrop_manet',\n 'RandomHorizontalFlip_manet', 'ToTensor_manet', 'RandomScale_manet',\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py:1-21" + }, + "335": { + "file_id": 35, + "content": "This code file contains import statements and a list of functions used for image preprocessing in PaddleVideo's EIVideo application. It includes Resize, RandomCrop, RandomHorizontalFlip, ToTensor, and RandomScale transformations specific to the \"manet\" model. These transformations are part of PaddlePaddle's video processing framework.", + "type": "comment" + }, + "336": { + "file_id": 36, + "content": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py", + "type": "filepath" + }, + "337": { + "file_id": 36, + "content": "The Compose class combines registry-based pipeline components like decode functions, sample functions, and transforms to apply transformations flexibly on dictionary or list inputs. It includes a workaround for old format configuration files.", + "type": "summary" + }, + "338": { + "file_id": 36, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Sequence\nfrom ..registry import PIPELINES\nimport traceback\nfrom ...utils import build\nfrom ...utils import get_logger\n@PIPELINES.register()\nclass Compose(object):\n \"\"\"\n Composes several pipelines(include decode func, sample func, and transforms) together.\n Note: To deal with ```list``` type cfg temporaray, like:\n transform:\n - Crop: # A list\n attribute: 10", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:1-31" + }, + "339": { + "file_id": 36, + "content": "This code defines the Compose class, which composes multiple pipelines such as decode functions, sample functions, and transforms. It uses the PIPELINES registry for registration and builds pipelines based on input configurations. 
The code also handles temporary list-type configuration for flexibility.", + "type": "comment" + }, + "340": { + "file_id": 36, + "content": " - Resize: # A list\n attribute: 20\n every key of list will pass as the key name to build a module.\n XXX: will be improved in the future.\n Args:\n pipelines (list): List of transforms to compose.\n Returns:\n A compose object which is callable, __call__ for this Compose\n object will call each given :attr:`transforms` sequencely.\n \"\"\"\n def __init__(self, pipelines):\n #assert isinstance(pipelines, Sequence)\n self.pipelines = []\n for p in pipelines.values():\n if isinstance(p, dict):\n p = build(p, PIPELINES)\n self.pipelines.append(p)\n elif isinstance(p, list):\n for t in p:\n #XXX: to deal with old format cfg, ugly code here!\n temp_dict = dict(name=list(t.keys())[0])\n for all_sub_t in t.values():\n if all_sub_t is not None:\n temp_dict.update(all_sub_t) \n t = build(temp_dict, PIPELINES)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:32-59" + }, + "341": { + "file_id": 36, + "content": "The code defines a Compose class that takes a list of transforms and composes them sequentially. It checks if the input is a dictionary or a list, builds the transform modules using build function from PIPELINES, and appends them to the pipelines list. The code also includes an ugly workaround for dealing with old format configuration files.", + "type": "comment" + }, + "342": { + "file_id": 36, + "content": " self.pipelines.append(t)\n elif callable(p):\n self.pipelines.append(p)\n else:\n raise TypeError(f'pipelines must be callable or a dict,'\n f'but got {type(p)}')\n def __call__(self, data):\n for p in self.pipelines:\n try:\n data = p(data)\n except Exception as e:\n stack_info = traceback.format_exc()\n logger = get_logger(\"paddlevideo\")\n logger.info(\"fail to perform transform [{}] with error: \"\n \"{} and stack:\\n{}\".format(p, e, str(stack_info)))\n raise e\n return data", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:60-76" + }, + "343": { + "file_id": 36, + "content": "The code defines a class with a `__call__` method and an append function for adding pipelines. The `__call__` method applies transformations to input data by iterating over the pipelines. If any pipeline fails, it logs the error and raises an exception. Pipelines can be either callable or dictionaries, but if not, a TypeError is raised.", + "type": "comment" + }, + "344": { + "file_id": 37, + "content": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py", + "type": "filepath" + }, + "345": { + "file_id": 37, + "content": "This code defines Paddle Video's image preprocessing classes for resizing, aspect ratio adjustment, and custom cropping transforms. 
It performs horizontal flipping, object detection, and returns foreground/nocare masks from a scribble image.", + "type": "summary" + }, + "346": { + "file_id": 37, + "content": "import os\nimport random\nimport cv2\nimport numpy as np\nimport paddle\nfrom PIL import Image\nfrom davisinteractive.utils.operations import bresenham\nfrom ..registry import PIPELINES\ncv2.setNumThreads(0)\nNEW_BRANCH = True\n@PIPELINES.register()\nclass RandomScale_manet(object):\n \"\"\"Randomly resize the image and the ground truth to specified scales.\n Args:\n scales (list): the list of scales\n \"\"\"\n def __init__(self, scales=[0.75, 1, 1.25]):\n self.scales = scales\n def __call__(self, sample):\n # Fixed range of scales\n sc = self.scales[random.randint(0, len(self.scales) - 1)]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, None, fx=sc, fy=sc, interpolation=flagval)\n sample[elem] = tmp\n return sample", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:1-43" + }, + "347": { + "file_id": 37, + "content": "This code defines a Paddle Video pipeline class, \"RandomScale\\_manet,\" which resizes the input image and its corresponding ground truth to random scales. The allowed scales are [0.75, 1, 1.25]. For elements like 'img1', 'img2', or 'ref\\_img,' it uses cv2's INTER_CUBIC interpolation. For other elements, it utilizes cv2's INTER_NEAREST interpolation. The pipeline is registered at PIPELINES for further usage.", + "type": "comment" + }, + "348": { + "file_id": 37, + "content": "@PIPELINES.register()\nclass Resize_manet(object):\n \"\"\"Rescale the image in a results to a given size.\n Args:\n output_size (tuple or int): Desired output size. If tuple, output is\n matched to output_size. If int, smaller of image edges is matched\n to output_size keeping aspect ratio the same.\n \"\"\"\n def __init__(self, output_size):\n assert isinstance(output_size, (int, list))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)\n else:\n self.output_size = output_size\n # self.seg_interpolation = cv2.INTER_CUBIC if is_continuous else cv2.INTER_NEAREST\n # self.fix = fix\n def __call__(self, results):\n img1 = results['img1']\n h, w = img1.shape[:2]\n if self.output_size == (h, w):\n return results\n else:\n new_h, new_w = self.output_size\n new_h, new_w = int(new_h), int(new_w)\n for elem in results.keys():\n if 'meta' in elem:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:46-75" + }, + "349": { + "file_id": 37, + "content": "The code defines a Resize_manet class, which is a pipeline for resizing an image to a specified output size. The input could be either an integer or a tuple representing the desired output dimensions. If the input is an integer, it will use that as the smaller edge of the image and maintain aspect ratio. The code checks if the current image size matches the desired output size; if so, it returns the results without modification, otherwise it resizes the image to match the desired output size. 
This class is used in a computer vision context for preprocessing images.", + "type": "comment" + }, + "350": { + "file_id": 37, + "content": " continue\n tmp = results[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, dsize=(new_w, new_h), interpolation=flagval)\n results[elem] = tmp\n return results\n@PIPELINES.register()\nclass RandomCrop_manet(object):\n \"\"\"Crop randomly the image in a results.\n Args:\n output_size (tuple or int): Desired output size. If int, square crop\n is made.\n \"\"\"\n def __init__(self, output_size, step=None):\n assert isinstance(output_size, (int, list))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)\n else:\n assert len(output_size) == 2\n self.output_size = output_size\n self.step = step\n def __call__(self, results):\n image = results['img1']\n h, w = image.shape[:2]\n new_h, new_w = self.output_size", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:76-109" + }, + "351": { + "file_id": 37, + "content": "This code defines a custom transform for image processing, specifically random cropping. It takes an input image and crops it randomly to the specified output size. The interpolation method used during resizing is determined by the element type in the 'results' dictionary. If the element is 'img1', 'img2', or 'ref_img', cubic interpolation is used, otherwise nearest neighbor interpolation is used. The cropped image is then stored back into the results dictionary.", + "type": "comment" + }, + "352": { + "file_id": 37, + "content": " new_h = h if new_h >= h else new_h\n new_w = w if new_w >= w else new_w\n is_contain_obj = False\n # while (not is_contain_obj) and (step < 5):\n if self.step is None:\n while not is_contain_obj:\n # step += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = results['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(new_ref_scribble_label)) == 1:\n continue\n else:\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]\n results[elem] = tmp", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:111-134" + }, + "353": { + "file_id": 37, + "content": "This code randomly crops an image and its associated labels to the specified new height and width. 
It checks if the cropped reference scribble label contains only one class label before updating other labels accordingly.", + "type": "comment" + }, + "354": { + "file_id": 37, + "content": " break\n else:\n st = 0\n while not is_contain_obj and st < self.step:\n st += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = results['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(\n new_ref_scribble_label)) == 1 or st < self.step - 1:\n continue\n else:\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]\n results[elem] = tmp\n break\n return results\n@PIPELINES.register()\nclass RandomHorizontalFlip_manet(object):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:135-163" + }, + "355": { + "file_id": 37, + "content": "This code randomly selects a region within the original image and applies random horizontal flipping to it. It checks if the selected region contains an object (by checking if the number of unique labels in ref_scribble_label is greater than 1) and continues flipping until either an object is found or the maximum allowed steps are reached. The function then returns the modified dictionary with the updated data for each key, except 'meta'. This custom transform is registered as a pipeline module for use in image processing tasks.", + "type": "comment" + }, + "356": { + "file_id": 37, + "content": " \"\"\"Horizontally flip the given image and ground truth randomly with a probability of 0.5.\"\"\"\n def __init__(self, prob):\n self.p = prob\n def __call__(self, results):\n if random.random() < self.p:\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n tmp = cv2.flip(tmp, flipCode=1)\n results[elem] = tmp\n return results\n@PIPELINES.register()\nclass ToTensor_manet(object):\n \"\"\"Convert ndarrays in results to Tensors.\"\"\"\n def __call__(self, results):\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n if tmp.ndim == 2:\n tmp = tmp[:, :, np.newaxis]\n else:\n tmp = tmp / 255.\n tmp -= (0.485, 0.456, 0.406)\n tmp /= (0.229, 0.224, 0.225)\n tmp = tmp.transpose([2, 0, 1])\n results[elem] = paddle.to_tensor(tmp)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:164-198" + }, + "357": { + "file_id": 37, + "content": "This code snippet contains two custom transforms for image processing. The first one, HorizontalFlip, randomly flips the given image and ground truth horizontally with a probability of 0.5. The second one, ToTensor_manet, converts ndarrays in results to Tensors by normalizing the images and reshaping them as required. 
Both transforms are added to the PADDLEPIPELINES registry for later use in image processing pipelines.", + "type": "comment" + }, + "358": { + "file_id": 37, + "content": " return results\ndef gt_from_scribble(scr, dilation=11, nocare_area=21):\n # Compute foreground\n if scr.max() == 1:\n kernel_fg = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (dilation, dilation))\n fg = cv2.dilate(scr.astype(np.uint8),\n kernel=kernel_fg).astype(scr.dtype)\n else:\n fg = scr\n # Compute nocare area\n if nocare_area is None:\n nocare = None\n else:\n kernel_nc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (nocare_area, nocare_area))\n nocare = cv2.dilate(fg, kernel=kernel_nc) - fg\n return fg, nocare", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:199-220" + }, + "359": { + "file_id": 37, + "content": "This function takes in a scribble image and optionally dilation and nocare area values. It returns the foreground mask and nocare mask. If the maximum value of the scribble is 1, it computes the foreground by dilating the scribble using an ellipse kernel. Else, it assigns the scribble as the foreground. Then, if a nocare area is given, it computes the nocare mask by dilating the foreground with another ellipse kernel and subtracting the original foreground.", + "type": "comment" + }, + "360": { + "file_id": 38, + "content": "/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py", + "type": "filepath" + }, + "361": { + "file_id": 38, + "content": "This code is importing Registry classes from the \"utils\" module and creating four different registries named PIPELINES, DATASETS, SAMPLERS, BATCH_SAMPLERS, and DATALOADERS. These registries will be used for organizing and managing various functionalities in the PaddleVideo framework.", + "type": "summary" + }, + "362": { + "file_id": 38, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nPIPELINES = Registry(\"pipeline\")\nDATASETS = Registry(\"datasets\")\nSAMPLERS = Registry(\"sampler\")\nBATCH_SAMPLERS = Registry(\"batch_sampler\")\nDATALOADERS = Registry(\"dataloader\")", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py:1-21" + }, + "363": { + "file_id": 38, + "content": "This code is importing Registry classes from the \"utils\" module and creating four different registries named PIPELINES, DATASETS, SAMPLERS, BATCH_SAMPLERS, and DATALOADERS. These registries will be used for organizing and managing various functionalities in the PaddleVideo framework.", + "type": "comment" + }, + "364": { + "file_id": 39, + "content": "/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py", + "type": "filepath" + }, + "365": { + "file_id": 39, + "content": "This code is part of the PaddleVideo library and includes the necessary imports and declarations for the VOSMetric and build_metric functions. 
It also contains licensing information, specifying that it's under the Apache License, Version 2.0.", + "type": "summary" + }, + "366": { + "file_id": 39, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .vos_metric import VOSMetric\nfrom .build import build_metric\n__all__ = [\n 'VOSMetric', \"build_metric\"\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py:1-20" + }, + "367": { + "file_id": 39, + "content": "This code is part of the PaddleVideo library and includes the necessary imports and declarations for the VOSMetric and build_metric functions. It also contains licensing information, specifying that it's under the Apache License, Version 2.0.", + "type": "comment" + }, + "368": { + "file_id": 40, + "content": "/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py", + "type": "filepath" + }, + "369": { + "file_id": 40, + "content": "This code defines an abstract base class `BaseMetric` for metrics in PaddleVideo's EIVideo application. It initializes the metric object with data size, batch size, and world size from distributed environment. The abstract methods `update()` and `accumulate()` must be implemented by subclasses.", + "type": "summary" + }, + "370": { + "file_id": 40, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom EIVideo.paddlevideo.utils import get_dist_info\nclass BaseMetric(object):\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n self.data_size = data_size\n self.batch_size = batch_size\n _, self.world_size = get_dist_info()\n self.log_interval = log_interval\n @abstractmethod\n def update(self):\n raise NotImplemented\n @abstractmethod\n def accumulate(self):\n raise NotImplemented", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py:1-30" + }, + "371": { + "file_id": 40, + "content": "This code defines an abstract base class `BaseMetric` for metrics in PaddleVideo's EIVideo application. It initializes the metric object with data size, batch size, and world size from distributed environment. 
The abstract methods `update()` and `accumulate()` must be implemented by subclasses.", + "type": "comment" + }, + "372": { + "file_id": 41, + "content": "/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py", + "type": "filepath" + }, + "373": { + "file_id": 41, + "content": "Copyright notice, Apache License v2.0, software distributed as is without warranties or conditions. Imports registry and utils modules, defines build_metric function that builds metric using provided configuration.", + "type": "summary" + }, + "374": { + "file_id": 41, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import METRIC\nfrom ..utils import build\ndef build_metric(cfg):\n return build(cfg, METRIC)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py:1-20" + }, + "375": { + "file_id": 41, + "content": "Copyright notice, Apache License v2.0, software distributed as is without warranties or conditions. Imports registry and utils modules, defines build_metric function that builds metric using provided configuration.", + "type": "comment" + }, + "376": { + "file_id": 42, + "content": "/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py", + "type": "filepath" + }, + "377": { + "file_id": 42, + "content": "The code imports a Registry class from the utils module and initializes the METRIC as an instance of this Registry, designed to store and manage different types of metrics.", + "type": "summary" + }, + "378": { + "file_id": 42, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nMETRIC = Registry('metric')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py:1-17" + }, + "379": { + "file_id": 42, + "content": "The code imports a Registry class from the utils module and initializes the METRIC as an instance of this Registry, designed to store and manage different types of metrics.", + "type": "comment" + }, + "380": { + "file_id": 43, + "content": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py", + "type": "filepath" + }, + "381": { + "file_id": 43, + "content": "The VOSMetric class initializes attributes, performs data processing and augmentation for video object segmentation, measures frame rates, handles flipped labels, and frees memory. 
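The Registry and build utilities themselves live in paddlevideo's utils package and are not part of these files, but the pattern they implement is simple: register classes under a name, then instantiate one from a config dict whose name key selects the class. A minimal stand-alone sketch of that pattern (not the repo's actual implementation) follows.

class Registry:
    # Maps a class name to the class object so configs can refer to it by name.
    def __init__(self, name):
        self.name = name
        self._map = {}

    def register(self, cls):
        self._map[cls.__name__] = cls
        return cls

    def get(self, name):
        return self._map[name]


METRIC = Registry("metric")


def build_metric(cfg):
    # cfg is expected to look like dict(name="VOSMetric", data_size=..., ...).
    cfg = dict(cfg)
    cls = METRIC.get(cfg.pop("name"))
    return cls(**cfg)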
It also tracks sequences, compresses files into a zip, creates image masks, aggregates metrics, and logs results.", + "type": "summary" + }, + "382": { + "file_id": 43, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport paddle\nimport zipfile\nimport time\nfrom PIL import Image\nfrom paddle.io import DataLoader\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom EIVideo.paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass VOSMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n result_root,\n zip_dir,\n log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:1-38" + }, + "383": { + "file_id": 43, + "content": "This code defines a class VOSMetric that inherits from BaseMetric and is registered in the METRIC registry. It takes data_size, batch_size, result_root, zip_dir, and log_interval as parameters for metrics preparation.", + "type": "comment" + }, + "384": { + "file_id": 43, + "content": " self.video_num = 0\n self.total_time = 0\n self.total_frame = 0\n self.total_sfps = 0\n self.total_video_num = data_size\n self.count = 0\n self.result_root = result_root\n self.zip_dir = zip_dir\n def update(self, batch_id, data, model):\n \"\"\"update metrics during each iter\n \"\"\"\n self.video_num += 1\n seq_dataset = data\n seq_name = seq_dataset.seq_name\n logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name,\n self.video_num,\n self.total_video_num))\n seq_dataloader = DataLoader(seq_dataset,\n return_list=True,\n batch_size=1,\n shuffle=False,\n num_workers=0)\n seq_total_time = 0\n seq_total_frame = 0\n ref_embeddings = []\n ref_masks = []\n prev_embedding = []", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:39-67" + }, + "385": { + "file_id": 43, + "content": "The code initializes a class with attributes to store video processing information, such as the total number of videos, count, result root, and zip directory. The update method takes batch ID, data, and model as inputs and processes each sequence by incrementing the video number and logging the current processed sequence. 
It then creates a data loader, calculates the total time and frame count for the current sequence, and stores reference embeddings and masks.", + "type": "comment" + }, + "386": { + "file_id": 43, + "content": " prev_mask = []\n with paddle.no_grad():\n for frame_idx, samples in enumerate(seq_dataloader):\n time_start = time.time()\n all_preds = []\n join_label = None\n for aug_idx in range(len(samples)):\n if len(ref_embeddings) <= aug_idx:\n ref_embeddings.append([])\n ref_masks.append([])\n prev_embedding.append(None)\n prev_mask.append(None)\n sample = samples[aug_idx]\n ref_emb = ref_embeddings[aug_idx]\n ref_m = ref_masks[aug_idx]\n prev_emb = prev_embedding[aug_idx]\n prev_m = prev_mask[aug_idx]\n current_img = sample['current_img']\n if 'current_label' in sample.keys():\n current_label = sample['current_label']\n current_label = paddle.to_tensor(current_label)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:68-90" + }, + "387": { + "file_id": 43, + "content": "The code initializes empty lists for reference embeddings, reference masks, previous embeddings, and previous masks. It then uses Paddle's no_grad context to iterate over a data loader with multiple samples. For each sample, it checks if the corresponding reference embedding and mask lists are long enough, appending them if necessary. It assigns current image, label, previous embedding, and mask values from the sample and converts the label into a tensor for further processing.", + "type": "comment" + }, + "388": { + "file_id": 43, + "content": " else:\n current_label = None\n obj_num = sample['meta']['obj_num']\n imgname = sample['meta']['current_name']\n ori_height = sample['meta']['height']\n ori_width = sample['meta']['width']\n current_img = current_img\n obj_num = obj_num\n bs, _, h, w = current_img.shape\n data_batch = [\n ref_emb, ref_m, prev_emb, prev_m, current_img,\n [ori_height, ori_width], obj_num\n ]\n all_pred, current_embedding = model(data_batch,\n mode='test')\n if frame_idx == 0:\n if current_label is None:\n logger.info(\n \"No first frame label in Seq {}.\".format(\n seq_name))\n ref_embeddings[aug_idx].append(current_embedding)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:91-114" + }, + "389": { + "file_id": 43, + "content": "This code is a part of the PaddleVideo framework, specifically for the EIVideo application. It prepares the data for model input and then runs it through the model to generate predictions and embeddings. If the current label is None (first frame), it logs an information message. The code also keeps track of references embeddings based on augmentation index (aug_idx) in ref_embeddings list.", + "type": "comment" + }, + "390": { + "file_id": 43, + "content": " ref_masks[aug_idx].append(current_label)\n prev_embedding[aug_idx] = current_embedding\n prev_mask[aug_idx] = current_label\n else:\n if sample['meta']['flip']: #False\n all_pred = self.flip_tensor(all_pred, 3)\n # In YouTube-VOS, not all the objects appear in the first frame for the first time. 
Thus, we\n # have to introduce new labels for new objects, if necessary.\n if not sample['meta']['flip'] and not (\n current_label is None) and join_label is None:\n join_label = paddle.cast(current_label,\n dtype='int64')\n all_preds.append(all_pred)\n if current_label is not None:\n ref_embeddings[aug_idx].append(current_embedding)\n prev_embedding[aug_idx] = current_embedding", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:115-131" + }, + "391": { + "file_id": 43, + "content": "This code appears to be part of a video object segmentation model, specifically for the YouTube-VOS task. It checks if there are new objects and updates labels accordingly while maintaining reference masks, previous embeddings, and all predictions. The code also handles flipping based on the 'meta' information of each sample.", + "type": "comment" + }, + "392": { + "file_id": 43, + "content": " if frame_idx > 0:\n all_preds = paddle.concat(all_preds, axis=0)\n all_preds = paddle.mean(\n all_preds, axis=0) #average results if augmentation\n pred_label = paddle.argmax(all_preds, axis=0)\n if join_label is not None:\n join_label = paddle.squeeze(paddle.squeeze(join_label,\n axis=0),\n axis=0)\n keep = paddle.cast((join_label == 0), dtype=\"int64\")\n pred_label = pred_label * keep + join_label * (1 -\n keep)\n pred_label = pred_label\n current_label = paddle.reshape(\n pred_label, shape=[1, 1, ori_height, ori_width])\n flip_pred_label = self.flip_tensor(pred_label, 1)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:133-148" + }, + "393": { + "file_id": 43, + "content": "This code segment is performing data augmentation and label averaging. It first concatenates previous predictions, then calculates the mean to average the results from different augmentations. If a join_label exists, it performs element-wise multiplication with a keep mask to combine with current_label. Finally, it reshapes pred_label into a 1x1xori_heightxori_width tensor and flips it along the second dimension using self.flip_tensor function.", + "type": "comment" + }, + "394": { + "file_id": 43, + "content": " flip_current_label = paddle.reshape(\n flip_pred_label, shape=[1, 1, ori_height, ori_width])\n for aug_idx in range(len(samples)):\n if join_label is not None:\n if samples[aug_idx]['meta']['flip']:\n ref_masks[aug_idx].append(flip_current_label)\n else:\n ref_masks[aug_idx].append(current_label)\n if samples[aug_idx]['meta']['flip']:\n prev_mask[aug_idx] = flip_current_label\n else:\n prev_mask[\n aug_idx] = current_label #update prev_mask\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n seq_total_frame += 1\n obj_num = float(obj_num)\n logger.info('Frame: {}, Obj Num: {}, Time: {}'.format(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:149-168" + }, + "395": { + "file_id": 43, + "content": "This code generates flipped labels for each frame of a video sequence. It checks the 'flip' flag in the sample metadata and appends either the current or flipped label to the corresponding list. The code also calculates the time taken per frame and updates total sequence time and frame count. 
Finally, it logs the frame number, object count, and time taken.", + "type": "comment" + }, + "396": { + "file_id": 43, + "content": " imgname[0], obj_num, one_frametime))\n self.save_mask(\n pred_label,\n os.path.join(self.result_root, seq_name,\n imgname[0].split('.')[0] + '.png'))\n else:\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n logger.info('Ref Frame: {}, Time: {}'.format(\n imgname[0], one_frametime))\n del (ref_embeddings)\n del (ref_masks)\n del (prev_embedding)\n del (prev_mask)\n del (seq_dataset)\n del (seq_dataloader)\n seq_avg_time_per_frame = seq_total_time / seq_total_frame\n self.total_time += seq_total_time\n self.total_frame += seq_total_frame\n total_avg_time_per_frame = self.total_time / self.total_frame\n self.total_sfps += seq_avg_time_per_frame\n avg_sfps = self.total_sfps / (batch_id + 1)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:169-192" + }, + "397": { + "file_id": 43, + "content": "This code calculates the average time per frame for each sequence and updates the total time, total frames, and average speed. It then calculates the average speed across all sequences. The code uses a \"else\" statement to handle cases where no object is detected in an image and calculates the time taken for processing that image. It also deletes variables used within the loop to free up memory before moving on to the next sequence or batch of images.", + "type": "comment" + }, + "398": { + "file_id": 43, + "content": " logger.info(\"Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}\".format(\n seq_name, 1. / seq_avg_time_per_frame,\n 1. / total_avg_time_per_frame, 1. / avg_sfps))\n def flip_tensor(self, tensor, dim=0):\n inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1),\n dtype=\"int64\")\n tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim)\n return tensor\n def save_mask(self, mask_tensor, path):\n _palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128,\n 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191,\n 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0,\n 64, 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64,\n 128, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26,\n 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32,\n 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:193-211" + }, + "399": { + "file_id": 43, + "content": "Logger is reporting sequence frame rate, total frame rate and frame rate per sequence.\nFunction flips tensor along a specified dimension.\nFunction saves a mask tensor to a given path using specific palette colors.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/30.json b/docs/data/30.json new file mode 100644 index 000000000..7701dba2f --- /dev/null +++ b/docs/data/30.json @@ -0,0 +1,542 @@ +{ + "3000": { + "file_id": 245, + "content": " stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self.conv0(inputs)\n conv1 
= self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:147-174" + }, + "3001": { + "file_id": 245, + "content": "This code defines a class with forward function. It initializes layers such as ConvBNLayer, and the shortcut connection depends on the provided 'shortcut'. The forward function performs the computations by passing inputs through the conv0 layer, then the conv1 layer, and finally adds the results of the two operations to generate the output.", + "type": "comment" + }, + "3002": { + "file_id": 245, + "content": " y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNet(nn.Layer):\n \"\"\"ResNet backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, pretrained=None):\n super(ResNet, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = [64, 256, 512, 1024]\n out_channels = [64, 128, 256, 512]\n self.conv = ConvBNLayer(in_channels=3,\n out_channels=64,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:175-210" + }, + "3003": { + "file_id": 245, + "content": "ResNet is a backbone class for creating ResNet models with different depths. It initializes the layers based on the input depth and supports pretrained models. The code defines supported layer sizes, sets up the block size and number of output channels. It creates an instance of ConvBNLayer for the input channel size 3 and output channel size 64.", + "type": "comment" + }, + "3004": { + "file_id": 245, + "content": " kernel_size=7,\n stride=2,\n act=\"relu\",\n name=\"conv1\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n conv_name,\n BottleneckBlock(\n # NOTE: Be careful! Here is different from TSM model.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:211-232" + }, + "3005": { + "file_id": 245, + "content": "This code snippet defines a ResNet model. It includes a convolutional layer with specified parameters, a MaxPool2D layer, and dynamically generates BottleneckBlock layers based on the desired depth. 
The code also checks for specific layer counts (101 or 152) in certain blocks and sets the corresponding layer names accordingly to differentiate them from other blocks.", + "type": "comment" + }, + "3006": { + "file_id": 245, + "content": " in_channels=in_channels[block]\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:233-252" + }, + "3007": { + "file_id": 245, + "content": "This code defines a ResNet model architecture with Bottleneck and Basic blocks. It dynamically creates the layers based on the input channels, output channels, depth, and stride values defined in the respective lists. The shortcut connection is used to skip layers or not, depending on the block number. Each block is added as a sublayer to the model's layer list.", + "type": "comment" + }, + "3008": { + "file_id": 245, + "content": " shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:253-270" + }, + "3009": { + "file_id": 245, + "content": "The code defines a function for initializing the weights of a backbone model. If pretrained weights are specified, it loads them. Otherwise, it uses specific initialization methods for Conv2D and BatchNorm2d layers. 
It checks if pretrained weights are provided or not and acts accordingly.", + "type": "comment" + }, + "3010": { + "file_id": 245, + "content": " if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n y = self.conv(inputs)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:271-290" + }, + "3011": { + "file_id": 245, + "content": "The code is defining a forward pass function for the backbone, which performs convolution and pooling operations. It also initializes layer weights based on their type (Conv2D or BatchNorm2D). The comments indicate that the input shape has been merged beforehand and reshaping is not necessary.", + "type": "comment" + }, + "3012": { + "file_id": 246, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", + "type": "filepath" + }, + "3013": { + "file_id": 246, + "content": "This code defines a ResNet model with TSM backbone, implementing the ResNet-C architecture, consisting of multiple blocks and configurable layer numbers. It initializes weights based on pretrained values and returns output after processing through all blocks.", + "type": "summary" + }, + "3014": { + "file_id": 246, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nimport math\nimport sys\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils.save_load import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:1-34" + }, + "3015": { + "file_id": 246, + "content": "This code defines a class called ConvBNLayer which is a layer consisting of Conv2D (convolutional) and BatchNorm2D layers. It appears to be part of a larger neural network model, likely used for feature extraction or classification tasks. 
The class also imports other useful modules such as Linear, Dropout, MaxPool2D, AvgPool2D from paddle.nn, and uses weight_init_ function from utils.save_load module to initialize layer weights.", + "type": "comment" + }, + "3016": { + "file_id": 246, + "content": " Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n is_tweaks_mode (bool): switch for tweaks. Default: False.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n self._pool2d_avg = AvgPool2D(kernel_size=2,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:36-60" + }, + "3017": { + "file_id": 246, + "content": "This code defines a class for a ConvBNLayer, which includes a convolution layer followed by batch normalization and activation. It has optional parameters for tweaks mode, activation function, and name. Weight and bias initialization are handled in the init_weights method.", + "type": "comment" + }, + "3018": { + "file_id": 246, + "content": " stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(name=bn_name +\n \"_scale\"),\n bias_attr=ParamAttr(bn_name + \"_offset\"))\n def forward(self, inputs):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:61-85" + }, + "3019": { + "file_id": 246, + "content": "This code defines a ResNet TSM backbone with stride, padding, and ceil_mode. It also includes a convolution layer, batch normalization, and activation function. 
The forward function takes inputs and processes them through the defined layers.", + "type": "comment" + }, + "3020": { + "file_id": 246, + "content": " \"\"\"forward\"\"\"\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n \"\"\"BottleneckBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n if_first=False,\n num_seg=8,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:86-114" + }, + "3021": { + "file_id": 246, + "content": "The code defines a BottleneckBlock class which is a layer in the ResNet model. This block consists of two 3x3 convolutional layers, each followed by batch normalization and ReLU activation. The input channels, output channels, stride, and other parameters are defined for this block. This structure helps in reducing the number of parameters while preserving or even improving accuracy in deep neural networks like ResNet.", + "type": "comment" + }, + "3022": { + "file_id": 246, + "content": " stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(\n in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=\n 1, #ResNet-D 2/2:add a 2×2 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n is_tweaks_mode=False if if_first else True,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n \"\"\"forward\"\"\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:115-140" + }, + "3023": { + "file_id": 246, + "content": "This code defines a custom layer in the ResNet-D model, with optional 2x2 pooling before convolution. The layer includes several ConvBNLayer components and a shortcut connection. If `short` is True, it adds a 2x2 average pooling layer before the convolution, whose stride is changed to 1. 
This works well in practice for ResNet-D 2/2.", + "type": "comment" + }, + "3024": { + "file_id": 246, + "content": " shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n \"\"\"BasicBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:141-170" + }, + "3025": { + "file_id": 246, + "content": "This code defines a BasicBlock class with convolutional layers, Batch Normalization, and ReLU activation functions. It also includes an optional shortcut connection. The method within the class performs temporal shifting on inputs before passing through convolutional layers and adding to the shortcut connection if applicable. Finally, it applies ReLU activation.", + "type": "comment" + }, + "3026": { + "file_id": 246, + "content": " kernel_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTweaksTSM(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, pretrained=None):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:171-206" + }, + "3027": { + "file_id": 246, + "content": "This code defines a ResNet TSM backbone model with optional shortcut connections. It includes convolutional layers, batch normalization, and a forward function for computation. 
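The temporal shift applied at the top of the TSM bottleneck block is a stock Paddle op, paddle.nn.functional.temporal_shift. The toy call below (tensor sizes are arbitrary) shows the expected layout: frames of a clip are stacked along the batch axis as [N*T, C, H, W], and a fraction of channels is exchanged between neighbouring frames while the shape stays unchanged.

import paddle
import paddle.nn.functional as F

num_seg = 8                                   # frames per clip
x = paddle.rand([num_seg, 16, 7, 7])          # [N*T, C, H, W] with N = 1
y = F.temporal_shift(x, seg_num=num_seg, shift_ratio=1.0 / num_seg)
print(y.shape)                                # [8, 16, 7, 7], same as the input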
The depth of the resnet model is specified as an argument, along with optional pretrained weights.", + "type": "comment" + }, + "3028": { + "file_id": 246, + "content": " super(ResNetTweaksTSM, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n #ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n self.conv1_1 = ConvBNLayer(in_channels=3,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='relu',\n name=\"conv1_1\")", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:207-235" + }, + "3029": { + "file_id": 246, + "content": "This code initializes an instance of ResNetTweaksTSM and sets its parameters, including the layer depth and number of segments. It also checks if the input layer is supported and assigns the corresponding depth based on the specified layer type. The code defines a ConvBNLayer for the first convolutional layer with 3x3 kernel and relu activation.", + "type": "comment" + }, + "3030": { + "file_id": 246, + "content": " self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='relu',\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='relu',\n name=\"conv1_3\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:236-258" + }, + "3031": { + "file_id": 246, + "content": "This code defines a ResNet model with Temporal Segment Network (TSM) backbone. It includes convolutional layers, batch normalization, ReLU activation, max pooling, and multiple blocks for the ResNet structure. The number of layers can be configured as 50, 101, or 152, affecting the block's properties.", + "type": "comment" + }, + "3032": { + "file_id": 246, + "content": " conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' %\n (block, i), #same with PaddleClas, for loading pretrain\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name))\n in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:259-278" + }, + "3033": { + "file_id": 246, + "content": "This code defines a ResNet model with Temporal Segment Network (TSM) backbone. 
It creates blocks of BottleneckBlock layers and appends them to the block list based on input and output channel numbers, stride values, and other parameters. The code handles both bottleneck and standard blocks and keeps track of shortcut connections for each block.", + "type": "comment" + }, + "3034": { + "file_id": 246, + "content": " shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D lay", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:279-297" + }, + "3035": { + "file_id": 246, + "content": "The code defines a function to initialize the weights of the ResNet backbone. If a pretrained loading path is provided, it loads the weights from that path; otherwise, it follows specific weight initialization methods for Conv2D layers in the backbone.", + "type": "comment" + }, + "3036": { + "file_id": 246, + "content": "er will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:297-317" + }, + "3037": { + "file_id": 246, + "content": "This code initializes the backbone network for a video quality assessment model. It checks if pretrained weights are provided and loads them if available, or initializes the layers using Kaiming Normal for convolutional layers and constant value of 1 for batch normalization layers. 
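The weight_init_ helper referenced by these init_weights methods is defined elsewhere in the repo and is not shown here; one conventional way to get the same behaviour in Paddle is to attach initializers at construction time through ParamAttr, as in the hedged sketch below (layer sizes are arbitrary).

import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn.initializer import KaimingNormal, Constant

# Conv weights: Kaiming-normal, no bias (matching the "no bias" note above).
conv = nn.Conv2D(3, 64, kernel_size=3,
                 weight_attr=ParamAttr(initializer=KaimingNormal()),
                 bias_attr=False)

# BatchNorm scale initialised to a constant 1, offset to a constant 0.
bn = nn.BatchNorm2D(64,
                    weight_attr=ParamAttr(initializer=Constant(value=1.0)),
                    bias_attr=ParamAttr(initializer=Constant(value=0.0)))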
The forward function defines how the backbone is executed on inputs.", + "type": "comment" + }, + "3038": { + "file_id": 246, + "content": " #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:318-328" + }, + "3039": { + "file_id": 246, + "content": "This code implements the ResNet-C architecture, which uses three 3x3 convolutions and one 7x7 convolution in the first layer, followed by max pooling and multiple blocks. The output is returned after processing through all the blocks.", + "type": "comment" + }, + "3040": { + "file_id": 247, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py", + "type": "filepath" + }, + "3041": { + "file_id": 247, + "content": "This code defines functions to build components of a computer vision model, including backbone, head, loss, recognizer, and a model builder that selects the appropriate builder based on framework type.", + "type": "summary" + }, + "3042": { + "file_id": 247, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS\nfrom ..utils import build\ndef build_backbone(cfg):\n \"\"\"Build backbone.\"\"\"\n return build(cfg, BACKBONES)\ndef build_head(cfg):\n \"\"\"Build head.\"\"\"\n return build(cfg, HEADS)\ndef build_loss(cfg):\n \"\"\"Build loss.\"\"\"\n return build(cfg, LOSSES)\ndef build_recognizer(cfg):\n \"\"\"Build recognizer.\"\"\"\n return build(cfg, RECOGNIZERS, key='framework')", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py:1-36" + }, + "3043": { + "file_id": 247, + "content": "The code is defining functions to build different components of a computer vision model. The `build_backbone`, `build_head`, and `build_loss` functions are used to build the backbone, head, and loss for the model respectively using the `build` function from the `utils` module. 
The `build_recognizer` function is used to build a recognizer component for the model using the specified framework key.", + "type": "comment" + }, + "3044": { + "file_id": 247, + "content": "def build_localizer(cfg):\n \"\"\"Build localizer.\"\"\"\n return build(cfg, LOCALIZERS, key='framework')\ndef build_model(cfg):\n cfg_copy = cfg.copy()\n framework_type = cfg_copy.get('framework')\n if framework_type in RECOGNIZERS:\n return build_recognizer(cfg)\n elif framework_type in LOCALIZERS:\n return build_localizer(cfg)\n else:\n raise NotImplementedError", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py:39-52" + }, + "3045": { + "file_id": 247, + "content": "This code defines functions for building localizer and model, and a build_model function that selects the appropriate builder based on the framework type specified in the configuration.", + "type": "comment" + }, + "3046": { + "file_id": 248, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py", + "type": "filepath" + }, + "3047": { + "file_id": 248, + "content": "This code is importing the recognizers module from the paddlevideo.modeling.framework package, and defining the BaseRecognizer and Recognizer2D classes as part of its API. The __all__ variable lists these two classes as the public API elements of this package.", + "type": "summary" + }, + "3048": { + "file_id": 248, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .recognizers import BaseRecognizer, Recognizer2D\n__all__ = [\n 'BaseRecognizer',\n 'Recognizer2D',\n]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py:1-22" + }, + "3049": { + "file_id": 248, + "content": "This code is importing the recognizers module from the paddlevideo.modeling.framework package, and defining the BaseRecognizer and Recognizer2D classes as part of its API. The __all__ variable lists these two classes as the public API elements of this package.", + "type": "comment" + }, + "3050": { + "file_id": 249, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py", + "type": "filepath" + }, + "3051": { + "file_id": 249, + "content": "This code imports base and recognizer2d classes from the same directory and adds them to the __all__ list for access. It also includes a copyright notice, license information, and disclaimer.", + "type": "summary" + }, + "3052": { + "file_id": 249, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nfrom .base import BaseRecognizer\nfrom .recognizer2d import Recognizer2D\n__all__ = ['BaseRecognizer', 'Recognizer2D']", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py:1-19" + }, + "3053": { + "file_id": 249, + "content": "This code imports base and recognizer2d classes from the same directory and adds them to the __all__ list for access. It also includes a copyright notice, license information, and disclaimer.", + "type": "comment" + }, + "3054": { + "file_id": 250, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py", + "type": "filepath" + }, + "3055": { + "file_id": 250, + "content": "The code creates a base class for model recognizers in PaddleVideo. It initializes the backbone and head modules' weights, extracts features using the backbone, and performs optional classification. The class provides abstract methods for training, validating, and testing steps to be implemented by subclasses.", + "type": "summary" + }, + "3056": { + "file_id": 250, + "content": "\"\"\"\nstart\n\"\"\"\nfrom abc import abstractmethod\nfrom ... import builder\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nclass BaseRecognizer(nn.Layer):\n \"\"\"Base class for recognizers.\n All recognizers should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature.\n \"\"\"\n def __init__(self, backbone=None, head=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n self.head.init_weights()", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:1-38" + }, + "3057": { + "file_id": 250, + "content": "Base class for recognizers, subclass for train_step, valid_step, test_step. Initializes backbone and head modules with weights.", + "type": "comment" + }, + "3058": { + "file_id": 250, + "content": " else:\n self.head = None\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n self.backbone.init_weights(\n ) #TODO: required? 
while backbone without base class\n self.head.init_weights()\n def extract_feature(self, imgs):\n \"\"\"Extract features through a backbone.\n Args:\n imgs (paddle.Tensor) : The input images.\n Returns:\n feature (paddle.Tensor) : The extracted features.\n \"\"\"\n feature = self.backbone(imgs)\n return feature\n def forward(self, imgs, **kwargs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n batches = imgs.shape[0]\n num_segs = imgs.shape[1]\n imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:]))\n if self.backbone is not None:\n feature = self.extract_feature(imgs)\n else:\n feature = imgs\n if self.head is not None:\n cls_score = self.head(feature, num_segs)\n else:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:39-75" + }, + "3059": { + "file_id": 250, + "content": "The code defines a base class for model recognizers. It initializes the weights of both backbone and head, extracts features using the backbone, and optionally performs classification using the head if it exists. The method also handles reshaping inputs when necessary.", + "type": "comment" + }, + "3060": { + "file_id": 250, + "content": " cls_score = None\n return cls_score\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:76-97" + }, + "3061": { + "file_id": 250, + "content": "This code defines a base class for recognizer models in PaddleVideo. It provides abstract methods for training, validating, and testing steps, which must be implemented by any subclasses that inherit from this base class.", + "type": "comment" + }, + "3062": { + "file_id": 251, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py", + "type": "filepath" + }, + "3063": { + "file_id": 251, + "content": "This code defines the PaddleVideo framework's \"Recognizer2D\" class for 2D model training, with methods train_step(), recognizer2d.py, val_step, and test_step handling loss metrics calculation in different modes.", + "type": "summary" + }, + "3064": { + "file_id": 251, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer2D(BaseRecognizer):\n \"\"\"2D recognizer model framework.\"\"\"\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n #NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py:1-29" + }, + "3065": { + "file_id": 251, + "content": "This code snippet is part of the PaddleVideo framework and defines a class called \"Recognizer2D\" which is a 2D recognizer model for training. It inherits from \"BaseRecognizer\" and has a method \"train_step()\" that handles how the model trains, taking input data batch as argument and returning output. The \"recognizers\" registry is used to register this class.", + "type": "comment" + }, + "3066": { + "file_id": 251, + "content": " #labels = labels.squeeze()\n #XXX: unsqueeze label to [label] ?\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n #NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n imgs = data_batch[0]\n cls_score = self(imgs)\n return cls_score", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py:31-52" + }, + "3067": { + "file_id": 251, + "content": "These code snippets define three different methods: recognizer2d.py, val_step, and test_step. The first method appears to be a base for the other two and is used to calculate loss metrics from input images and labels using the head's loss function. The val_step also calculates loss metrics, but in valid mode only. Lastly, the test_step does not call the head's loss function and instead returns the class scores directly.", + "type": "comment" + }, + "3068": { + "file_id": 252, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py", + "type": "filepath" + }, + "3069": { + "file_id": 252, + "content": "This code file is a part of the PaddleVideo library and contains definitions for different head models (BaseHead, TSNHead, TSMRecHead) used in Video Quality Assessment. 
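The axis merge noted in the recognizers above (batch and segment axes folded together before the 2D backbone sees the frames) is a plain reshape; the toy shapes below are assumptions for illustration.

import paddle

imgs = paddle.rand([2, 8, 3, 224, 224])                # [N, T, C, H, W]: 2 clips x 8 segments
flat = paddle.reshape(imgs, [-1] + list(imgs.shape[2:]))
print(flat.shape)                                       # [16, 3, 224, 224], fed to the 2D backbone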
It imports these classes from other files within the modeling/heads directory and provides them to be used by other parts of the library.", + "type": "summary" + }, + "3070": { + "file_id": 252, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .base import BaseHead\nfrom .tsn_head import TSNHead\nfrom .tsm_rec_head import TSMRecHead\n__all__ = ['BaseHead', 'TSNHead', 'TSMRecHead']", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py:1-21" + }, + "3071": { + "file_id": 252, + "content": "This code file is a part of the PaddleVideo library and contains definitions for different head models (BaseHead, TSNHead, TSMRecHead) used in Video Quality Assessment. It imports these classes from other files within the modeling/heads directory and provides them to be used by other parts of the library.", + "type": "comment" + }, + "3072": { + "file_id": 253, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py", + "type": "filepath" + }, + "3073": { + "file_id": 253, + "content": "The code defines a BaseHead abstract class for PaddleVideo, introduces a VideoQualityAssessment model with forward function, loss calculation, and accuracy metrics. It also contains functions for label smooth loss and top1/top5 accuracy calculations. An unimplemented function needs to be added based on the comments in the codebase.", + "type": "summary" + }, + "3074": { + "file_id": 253, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..builder import build_loss\nfrom paddlevideo.utils import get_logger, get_dist_info\nlogger = get_logger(\"paddlevideo\")\nclass BaseHead(nn.Layer):\n \"\"\"Base class for head part.\n All head should subclass it.\n All subclass should overwrite:\n - Methods: ```init_weights```, initializing weights.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:1-36" + }, + "3075": { + "file_id": 253, + "content": "This code snippet is a part of PaddleVideo library. It imports necessary libraries and defines an abstract base class \"BaseHead\" for video head parts. 
This class should be subclassed by all heads, which must override the \"init_weights\" method for initializing weights. The class also utilizes logger from paddlevideo to log information.", + "type": "comment" + }, + "3076": { + "file_id": 253, + "content": " - Methods: ```forward```, forward function.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channels in input feature.\n loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss').\n ls_eps (float): label smoothing epsilon. Default: 0. .\n \"\"\"\n def __init__(\n self,\n num_classes,\n in_channels,\n loss_cfg=dict(\n name=\"CrossEntropyLoss\"\n ), #TODO(shipping): only pass a name or standard build cfg format.\n #multi_class=False, NOTE(shipping): not supported now.\n ls_eps=0.):\n super().__init__()\n self.num_classes = num_classes\n self.in_channels = in_channels\n self.loss_func = build_loss(loss_cfg)\n #self.multi_class = multi_class NOTE(shipping): not supported now\n self.ls_eps = ls_eps\n @abstractmethod\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:37-67" + }, + "3077": { + "file_id": 253, + "content": "This code is defining a base class for a head network in PaddleVideo. It has an initializer that takes the number of classes, input channels, loss configuration, and label smoothing epsilon as arguments. The loss function and other parameters are initialized inside the constructor. It also requires the implementation of an abstract method \"init_weights\" for parameter initialization.", + "type": "comment" + }, + "3078": { + "file_id": 253, + "content": " @abstractmethod\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n raise NotImplementedError\n def loss(self, scores, labels, valid_mode=False, **kwargs):\n \"\"\"Calculate the loss accroding to the model output ```scores```,\n and the target ```labels```.\n Args:\n scores (paddle.Tensor): The output of the model.\n labels (paddle.Tensor): The target output of the model.\n Returns:\n losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional).\n \"\"\"\n if len(labels) == 1: #commonly case\n labels = labels[0]\n losses = dict()\n if self.ls_eps != 0. and not valid_mode: # label_smooth\n loss = self.label_smooth_loss(scores, labels, **kwargs)\n else:\n loss = self.loss_func(scores, labels, **kwargs)\n top1, top5 = self.get_acc(scores, labels, valid_mode)\n losses['top1'] = top1", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:69-96" + }, + "3079": { + "file_id": 253, + "content": "This code defines a base head for the VideoQualityAssessment model. It includes a forward function that is expected to be overridden by subclasses, and a loss function that calculates the loss based on model output (scores) and target (labels). 
The loss function also returns top1 and top5 accuracy if not in validation mode.", + "type": "comment" + }, + "3080": { + "file_id": 253, + "content": " losses['top5'] = top5\n losses['loss'] = loss\n return losses\n elif len(labels) == 3: # mix_up\n labels_a, labels_b, lam = labels\n lam = lam[0] # get lam value\n losses = dict()\n if self.ls_eps != 0:\n loss_a = self.label_smooth_loss(scores, labels_a, **kwargs)\n loss_b = self.label_smooth_loss(scores, labels_b, **kwargs)\n else:\n loss_a = self.loss_func(scores, labels_a, **kwargs)\n loss_b = self.loss_func(scores, labels_a, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n top1a, top5a = self.get_acc(scores, labels_a, valid_mode)\n top1b, top5b = self.get_acc(scores, labels_b, valid_mode)\n top1 = lam * top1a + (1 - lam) * top1b\n top5 = lam * top5a + (1 - lam) * top5b\n losses['top1'] = top1\n losses['top5'] = top5\n losses['loss'] = loss\n return losses\n else:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:97-120" + }, + "3081": { + "file_id": 253, + "content": "This code segment handles mixed-up labels where there are 3 labels (labels_a, labels_b, and lam). It calculates the loss and accuracy for both label sets with or without label smoothing. The final result is stored in a dictionary including top1, top5, and total loss.", + "type": "comment" + }, + "3082": { + "file_id": 253, + "content": " raise NotImplementedError\n def label_smooth_loss(self, scores, labels, **kwargs):\n \"\"\"label smooth loss\"\"\"\n labels = F.one_hot(labels, self.num_classes)\n labels = F.label_smooth(labels, epsilon=self.ls_eps)\n labels = paddle.squeeze(labels, axis=1)\n loss = self.loss_func(scores, labels, soft_label=True, **kwargs)\n return loss\n def get_acc(self, scores, labels, valid_mode):\n \"\"\"get acc\"\"\"\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=scores, label=labels, k=5)\n _, world_size = get_dist_info()\n #NOTE(shipping): deal with multi cards validate\n if world_size > 1 and valid_mode: #reduce sum when valid\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / world_size\n top5 = paddle.distributed.all_reduce(\n top5, op=paddle.distributed.ReduceOp.SUM) / world_size\n return top1, top5", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:121-143" + }, + "3083": { + "file_id": 253, + "content": "This code contains three functions: \"label_smooth_loss\", \"get_acc\", and an unimplemented function. The \"label_smooth_loss\" calculates the label smooth loss using one-hot encoding, label smoothing, and applies a soft loss function with a specified epsilon value. It also handles the loss calculation for cases where soft labels are needed. The \"get_acc\" function calculates both top1 and top5 accuracy values from input scores and labels. It can handle multi-card validation by performing all-reduce when validating on multiple cards. 
Finally, there is an unimplemented function that should be implemented based on the comments in the codebase.", + "type": "comment" + }, + "3084": { + "file_id": 254, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py", + "type": "filepath" + }, + "3085": { + "file_id": 254, + "content": "TSMRecHead is a TSNHead-based classifier head for Temporal Segment Networks, performing average pooling, optional dropout, reshaping, mean operation, and applying a fully connected layer. It uses defined loss function to compare with labels, and calculates loss based on provided labels using label smoothing and weighted average.", + "type": "summary" + }, + "3086": { + "file_id": 254, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport math\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom .base import BaseHead\nfrom .tsn_head import TSNHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass TSMRecHead(TSNHead):\n \"\"\" TSM Rec Head\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:1-33" + }, + "3087": { + "file_id": 254, + "content": "TSMRecHead is a TSNHead-based classifier head for Temporal Segment Networks (TSMs) with specified number of classes, input channels and registered under PaddlePaddle's HEADS registry.", + "type": "comment" + }, + "3088": { + "file_id": 254, + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n drop_ratio(float): drop ratio. Default: 0.8.\n std(float): Std(Scale) value in normal initilizar. Default: 0.001.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='L1Loss'),\n drop_ratio=0.8,\n std=0.01,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes,\n in_channels,\n loss_cfg,\n drop_ratio=drop_ratio,\n std=std,\n data_format=data_format,\n **kwargs)\n self.stdv = 1.0 / math.sqrt(self.in_channels * 1.0)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Uniform',", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:34-62" + }, + "3089": { + "file_id": 254, + "content": "This function initializes the weights of the FC layer using a uniform distribution, and sets the standard deviation for normal initialization. The loss_cfg argument determines the type of loss function to use, and drop_ratio is the probability of dropping connections between layers during training. 
The stdv value is set based on the number of input channels.", + "type": "comment" + }, + "3090": { + "file_id": 254, + "content": " 'fc_0.w_0',\n 'fc_0.b_0',\n low=-self.stdv,\n high=self.stdv)\n self.fc.bias.learning_rate = 2.0\n self.fc.bias.regularizer = paddle.regularizer.L2Decay(0.)\n def forward(self, x, num_seg):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N * num_seg, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, 1, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])\n # [N, in_channels]\n score = self.fc(x)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:63-91" + }, + "3091": { + "file_id": 254, + "content": "This code defines a head for TSM (Temporal Shift Module) Recognition task. It includes initialization of weights, setting learning rate, and applying L2 decay regularizer. The forward method performs average pooling, optional dropout, reshaping, mean operation, and finally passes the result through fully connected layer to obtain classification scores.", + "type": "comment" + }, + "3092": { + "file_id": 254, + "content": " # [N, num_class]\n #m = paddle.nn.Sigmoid()\n #score = m(score)\n return score\n def loss(self, scores, labels, valid_mode=False, **kwargs):\n \"\"\"Calculate the loss accroding to the model output ```scores```,\n and the target ```labels```.\n Args:\n scores (paddle.Tensor): The output of the model.\n labels (paddle.Tensor): The target output of the model.\n Returns:\n losses (dict): A dict containing field 'loss'(mandatory).\n \"\"\"\n if len(labels) == 1: #commonly case\n output = []\n label = []\n labels = labels[0]\n losses = dict()\n loss = self.loss_func(scores, labels, **kwargs)\n score_list = paddle.tolist(scores)\n label_list = paddle.tolist(labels)\n score_list_len = len(score_list)\n for i in range(score_list_len):\n output.append(score_list[i][0])\n label.append(label_list[i][0])\n losses['loss'] = loss", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:92-122" + }, + "3093": { + "file_id": 254, + "content": "The code defines a loss function for a model that predicts scores and compares them with the given labels. It calculates the loss between the predicted scores and the target labels, considering cases where there is only one label. 
The losses are returned in a dictionary format with 'loss' as the mandatory field.", + "type": "comment" + }, + "3094": { + "file_id": 254, + "content": " losses['output'] = output\n losses['label'] = label\n return losses\n elif len(labels) == 3:\n labels_a, labels_b, lam = labels\n labels_a = paddle.cast(labels_a, dtype='float32')\n labels_b = paddle.cast(labels_b, dtype='float32')\n lam = lam[0] # get lam value\n losses = dict()\n if self.ls_eps != 0:\n loss_a = self.label_smooth_loss(scores, labels_a, **kwargs)\n loss_b = self.label_smooth_loss(scores, labels_b, **kwargs)\n else:\n loss_a = self.loss_func(scores, labels_a, **kwargs)\n loss_b = self.loss_func(scores, labels_a, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n losses['loss'] = loss\n losses['output'] = output\n losses['label'] = label\n return losses\n else:\n raise NotImplementedError\n def label_smooth_loss(self, scores, labels, **kwargs):\n \"\"\"label smooth loss\"\"\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:123-149" + }, + "3095": { + "file_id": 254, + "content": "This function calculates the loss based on the number of labels provided. If one label is given, it returns the output and label as losses. If three labels are given (a, b, lam), it casts the labels to float32, applies label smoothing or standard loss depending on epsilon, then calculates the weighted average loss for a and b. It returns the loss, output, and label in a dictionary.", + "type": "comment" + }, + "3096": { + "file_id": 254, + "content": " labels = F.label_smooth(labels, epsilon=self.ls_eps)\n labels = paddle.squeeze(labels, axis=1)\n loss = self.loss_func(scores, labels, **kwargs)\n return loss", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:150-153" + }, + "3097": { + "file_id": 254, + "content": "Applies label smoothing to the input labels, squeezes the labels along a specified axis, and calculates the loss using a provided loss function.", + "type": "comment" + }, + "3098": { + "file_id": 255, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py", + "type": "filepath" + }, + "3099": { + "file_id": 255, + "content": "A PaddlePaddle TSN head class for video quality assessment tasks is defined, implementing adaptive average pooling, linear transformation, dropout, and taking input number of classes and feature channels as arguments. The forward pass function applies these operations and produces classification scores, operating on tensors of dimensions N, num_seg, and num_class, with softmax activation.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/31.json b/docs/data/31.json new file mode 100644 index 000000000..bb91746e8 --- /dev/null +++ b/docs/data/31.json @@ -0,0 +1,539 @@ +{ + "3100": { + "file_id": 255, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nimport paddle.nn.functional as F\n@HEADS.register()\nclass TSNHead(BaseHead):\n \"\"\"TSN Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:1-31" + }, + "3101": { + "file_id": 255, + "content": "TSNHead: PaddlePaddle Temporal Segment Network head class for video quality assessment tasks. Implements adaptive average pooling, linear transformation, dropout, and takes input number of classes and input feature channels as arguments. Registered in the HEADS registry.", + "type": "comment" + }, + "3102": { + "file_id": 255, + "content": " drop_ratio(float): drop ratio. Default: 0.4.\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n drop_ratio=0.4,\n std=0.01,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.drop_ratio = drop_ratio\n self.std = std\n #NOTE: global pool performance\n self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)\n if self.drop_ratio != 0:\n self.dropout = Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Normal',", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:32-64" + }, + "3103": { + "file_id": 255, + "content": "This code defines a class for an image classification head with dropout and global average pooling. It initializes the class with specified parameters, such as number of classes, input channels, loss configuration, drop ratio, standard deviation for initialization, and data format. 
The class also includes methods for initializing weights in the fully connected (FC) layer using normal distribution.", + "type": "comment" + }, + "3104": { + "file_id": 255, + "content": " 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.,\n std=self.std)\n def forward(self, x, num_seg):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n #XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N * num_seg, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, 1, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])\n # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n #score = F.softmax(score) #NOTE remove", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:65-96" + }, + "3105": { + "file_id": 255, + "content": "This code defines a forward pass function for a neural network head. It applies average pooling, optionally applies dropout, and performs a series of reshaping and fully connected layer operations to produce classification scores. Dropout is applied if not None, and the softmax activation (NOTE: remove) is used in the original code. The code operates on tensors of various dimensions, with N representing the number of input samples, num_seg representing the number of segments or regions for each sample, and num_class representing the number of classes being classified.", + "type": "comment" + }, + "3106": { + "file_id": 255, + "content": " return score", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:97-97" + }, + "3107": { + "file_id": 255, + "content": "This line returns the calculated score as output from the function.", + "type": "comment" + }, + "3108": { + "file_id": 256, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py", + "type": "filepath" + }, + "3109": { + "file_id": 256, + "content": "This code snippet is a part of the PaddleVideo library and it defines various loss functions, including SmoothL1Loss and L1Loss. It also provides aliases for these losses in the __all__ list.", + "type": "summary" + }, + "3110": { + "file_id": 256, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .base import BaseWeightedLoss\nfrom .smooth_l1_loss import SmoothL1Loss\nfrom .l1_loss import L1Loss\n__all__ = ['SmoothL1Loss', 'L1Loss']", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py:1-21" + }, + "3111": { + "file_id": 256, + "content": "This code snippet is a part of the PaddleVideo library and it defines various loss functions, including SmoothL1Loss and L1Loss. It also provides aliases for these losses in the __all__ list.", + "type": "comment" + }, + "3112": { + "file_id": 257, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py", + "type": "filepath" + }, + "3113": { + "file_id": 257, + "content": "This class defines a base loss function in PaddleVideo, requires subclasses to implement _forward method and supports an optional loss_weight parameter.", + "type": "summary" + }, + "3114": { + "file_id": 257, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\n#XXX use _forward?? or forward??\nclass BaseWeightedLoss(nn.Layer):\n \"\"\"Base class for loss.\n All subclass should overwrite the ``_forward()`` method which returns the\n normal loss without loss weights.\n Args:\n loss_weight (float): Factor scalar multiplied on the loss.\n Default: 1.0.\n \"\"\"\n def __init__(self, loss_weight=1.0):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py:1-33" + }, + "3115": { + "file_id": 257, + "content": "Base class for loss functions in PaddleVideo, subclasses should override the _forward() method to return normal loss without weights. 
Contains an optional loss_weight parameter for scaling the final loss value.", + "type": "comment" + }, + "3116": { + "file_id": 257, + "content": " super().__init__()\n self.loss_weight = loss_weight\n @abstractmethod\n def _forward(self, *args, **kwargs):\n pass\n def forward(self, *args, **kwargs):\n \"\"\"Defines the computation performed at every call.\n Args:\n *args: The positional arguments for the corresponding\n loss.\n **kwargs: The keyword arguments for the corresponding\n loss.\n Returns:\n paddle.Tensor: The calculated loss.\n \"\"\"\n return self._forward(*args, **kwargs) * self.loss_weight", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py:34-51" + }, + "3117": { + "file_id": 257, + "content": "The code defines an abstract base class for a loss function. It initializes the loss weight, requires subclasses to implement the _forward method, and returns the forward pass result multiplied by the loss weight in the forward method.", + "type": "comment" + }, + "3118": { + "file_id": 258, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py", + "type": "filepath" + }, + "3119": { + "file_id": 258, + "content": "This code defines the L1Loss class for computing the L1 loss, commonly used in image and video quality assessment tasks. The code calculates the L1 loss between 'score' and 'labels', ensuring compatible data types, and returns the resulting loss.", + "type": "summary" + }, + "3120": { + "file_id": 258, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass L1Loss(BaseWeightedLoss):\n \"\"\"L1 Loss.\"\"\"\n def _forward(self, score, labels):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels.\n Returns:\n loss (paddle.Tensor): The returned L1 loss.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py:1-33" + }, + "3121": { + "file_id": 258, + "content": "This code defines a class called L1Loss that extends BaseWeightedLoss and implements the forward function for computing the L1 loss. 
The L1 loss is commonly used in image and video quality assessment tasks.", + "type": "comment" + }, + "3122": { + "file_id": 258, + "content": " \"\"\"\n labels = labels.astype(score.dtype)\n loss = F.l1_loss(score, labels)\n return loss", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py:34-38" + }, + "3123": { + "file_id": 258, + "content": "This code snippet calculates the L1 loss between 'score' and 'labels', ensuring they have compatible data types, and then returns the resulting loss.", + "type": "comment" + }, + "3124": { + "file_id": 259, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py", + "type": "filepath" + }, + "3125": { + "file_id": 259, + "content": "This code defines the SmoothL1Loss class as a custom loss function in PaddlePaddle's VideoQualityAssessment library, and implements _forward method to calculate smooth L1 loss between predicted scores and ground truth labels. It extends BaseWeightedLoss for handling outliers in computer vision tasks.", + "type": "summary" + }, + "3126": { + "file_id": 259, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass SmoothL1Loss(BaseWeightedLoss):\n \"\"\"smooth L1 Loss.\"\"\"\n def _forward(self, score, labels):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels.\n Returns:\n loss (paddle.Tensor): The returned smooth L1 Loss.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py:1-33" + }, + "3127": { + "file_id": 259, + "content": "This code defines the SmoothL1Loss class, a custom loss function in PaddlePaddle's VideoQualityAssessment library. It extends BaseWeightedLoss and implements a _forward method for calculating the smooth L1 loss between predicted scores and ground truth labels. The smooth L1 loss is used in computer vision tasks to handle outliers and improve robustness.", + "type": "comment" + }, + "3128": { + "file_id": 259, + "content": " \"\"\"\n labels = labels.astype(score.dtype)\n loss = F.smooth_l1_loss(score, labels)\n return loss", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py:34-39" + }, + "3129": { + "file_id": 259, + "content": "This code snippet defines a function that calculates the smooth L1 loss between two arrays, \"score\" and \"labels\". It converts labels to the data type of score, then applies F.smooth_l1_loss() to compute the loss. 
Finally, it returns the computed loss value.", + "type": "comment" + }, + "3130": { + "file_id": 260, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py", + "type": "filepath" + }, + "3131": { + "file_id": 260, + "content": "The code is a part of PaddleVideo's Video Quality Assessment application. It defines and registers several types of models including backbones, heads, recognizers, localizers, and losses using a registry system for easier management and organization.", + "type": "summary" + }, + "3132": { + "file_id": 260, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom ..utils import Registry\nBACKBONES = Registry('backbone')\nHEADS = Registry('head')\nRECOGNIZERS = Registry('recognizer')\nLOCALIZERS = Registry('localizer')\nLOSSES = Registry('loss')", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py:1-23" + }, + "3133": { + "file_id": 260, + "content": "The code is a part of PaddleVideo's Video Quality Assessment application. It defines and registers several types of models including backbones, heads, recognizers, localizers, and losses using a registry system for easier management and organization.", + "type": "comment" + }, + "3134": { + "file_id": 261, + "content": "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py", + "type": "filepath" + }, + "3135": { + "file_id": 261, + "content": "The `weight_init_` function initializes layer weights in PaddlePaddle with custom functions, supporting various initialization types such as Xavier and Uniform.", + "type": "summary" + }, + "3136": { + "file_id": 261, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nimport paddle.nn.initializer as init\ndef weight_init_(layer,\n func,\n weight_name=None,\n bias_name=None,\n bias_value=0.0,\n **kwargs):\n \"\"\"\n In-place params init function.\n Usage:\n .. code-block:: python\n import paddle\n import numpy as np\n data = np.ones([3, 4], dtype='float32')\n linear = paddle.nn.Linear(4, 4)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py:1-36" + }, + "3137": { + "file_id": 261, + "content": "This function, `weight_init_`, initializes the weights of a PaddlePaddle layer with user-defined functions. 
The function takes in a layer, an initialization function, optional weight and bias names, and additional keyword arguments. It performs an in-place parameter initialization and supports various types of initialization functions such as Xavier, Uniform, and others.", + "type": "comment" + }, + "3138": { + "file_id": 261, + "content": " input = paddle.to_tensor(data)\n print(linear.weight)\n linear(input)\n weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1)\n print(linear.weight)\n \"\"\"\n if hasattr(layer, 'weight') and layer.weight is not None:\n getattr(init, func)(**kwargs)(layer.weight)\n if weight_name is not None:\n # override weight name\n layer.weight.name = weight_name\n if hasattr(layer, 'bias') and layer.bias is not None:\n init.Constant(bias_value)(layer.bias)\n if bias_name is not None:\n # override bias name\n layer.bias.name = bias_name", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py:37-55" + }, + "3139": { + "file_id": 261, + "content": "This code initializes the weights and biases of a neural network layer using the PaddlePaddle framework. It checks if the layer has weight and bias attributes, then applies weight initialization functions and potentially overrides their names.", + "type": "comment" + }, + "3140": { + "file_id": 262, + "content": "/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py", + "type": "filepath" + }, + "3141": { + "file_id": 262, + "content": "This code imports functions from submodules \"optimizer\" and \"lr\" to build optimizer and learning rate for PaddleVideo's Video Quality Assessment application.", + "type": "summary" + }, + "3142": { + "file_id": 262, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .optimizer import build_optimizer\nfrom .lr import build_lr", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py:1-17" + }, + "3143": { + "file_id": 262, + "content": "This code imports functions from submodules \"optimizer\" and \"lr\" to build optimizer and learning rate for PaddleVideo's Video Quality Assessment application.", + "type": "comment" + }, + "3144": { + "file_id": 263, + "content": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py", + "type": "filepath" + }, + "3145": { + "file_id": 263, + "content": "Two learning rate scheduler classes, CustomWarmupCosineDecay and CosineAnnealingDecay, are provided for optimizing models with warm-up and stepwise cosine decay. The `CustomWarmupPiecewiseDecay` class is a custom scheduler for PaddleVideo, implementing piecewise function and warmup phase with linear decay.", + "type": "summary" + }, + "3146": { + "file_id": 263, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport math\nfrom paddle.optimizer.lr import *\n\"\"\"\nPaddleVideo Learning Rate Schedule:\nYou can use paddle.optimizer.lr\nor define your custom_lr in this file.\n\"\"\"\nclass CustomWarmupCosineDecay(LRScheduler):\n \"\"\"\n We combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:1-33" + }, + "3147": { + "file_id": 263, + "content": "This code defines a custom learning rate scheduler called CustomWarmupCosineDecay, which combines warm-up and stepwise cosine decay for optimizing models. It extends the LRScheduler class and allows users to define specific start learning rates and the number of epochs for warm-up before applying stepwise cosine decay.", + "type": "comment" + }, + "3148": { + "file_id": 263, + "content": " cosine_base_lr (float|int, optional): base learning rate in cosine schedule.\n max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .\n Returns:\n ``CosineAnnealingDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n cosine_base_lr,\n max_epoch,\n num_iters,\n last_epoch=-1,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.cosine_base_lr = cosine_base_lr\n self.max_epoch = max_epoch\n self.num_iters = num_iters\n #call step() in base class, last_lr/last_epoch/base_lr will be update", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:34-55" + }, + "3149": { + "file_id": 263, + "content": "This code defines a class \"CosineAnnealingDecay\" for scheduling the learning rate. It takes parameters such as base learning rate, total epochs, number of iterations per epoch, and initializes instance variables accordingly. The step() method will update the last_lr/last_epoch/base_lr based on the provided parameters.", + "type": "comment" + }, + "3150": { + "file_id": 263, + "content": " super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch,\n verbose=verbose)\n def step(self, epoch=None):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. 
Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if self.last_epoch == -1:\n self.last_epoch += 1\n else:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:56-81" + }, + "3151": { + "file_id": 263, + "content": "The code defines a CustomWarmupCosineDecay class that extends the Optimizer. It has an __init__ method to initialize last_epoch and verbose, and a step method to update learning rate based on current epoch. The step method also handles cases where epoch is None or provided manually. Additionally, there is a _lr_func_cosine method for calculating the learning rate using a cosine function.", + "type": "comment" + }, + "3152": { + "file_id": 263, + "content": " \"\"\"start to cosine\"\"\"\n return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) +\n 1.0) * 0.5\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr,\n self.max_epoch)\n lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr,\n self.max_epoch)\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n return lr\nclass CustomWarmupPiecewiseDecay(LRScheduler):\n \"\"\"\n This op combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup.\n step_base_lr (float|int, optional): base learning rate in step schedule.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:82-108" + }, + "3153": { + "file_id": 263, + "content": "This code defines a custom learning rate (LR) scheduler that combines warmup and stepwise-cosine decay. It starts with a warmup phase, then uses a cosine annealing LR schedule. The `get_lr` function calculates the current learning rate based on the current epoch, maximum epoch, warmup epochs, and other parameters. This scheduler is used in the \"slowfast\" model.", + "type": "comment" + }, + "3154": { + "file_id": 263, + "content": " max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. 
Default: ``False`` .\n Returns:\n ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n step_base_lr,\n lrs,\n gamma,\n steps,\n max_epoch,\n num_iters,\n last_epoch=0,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.step_base_lr = step_base_lr\n self.lrs = lrs\n self.gamma = gamma\n self.steps = steps\n self.max_epoch = max_epoch\n self.num_iters = num_iters", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:109-134" + }, + "3155": { + "file_id": 263, + "content": "This code defines a class `CustomWarmupPiecewiseDecay` for scheduling learning rates. It takes several parameters like warmup start lr, warmup epochs, step base lr, lrs (list of lr values), gamma, steps, max_epoch, num_iters, last_epoch and verbose. The constructor initializes the class with these parameters.", + "type": "comment" + }, + "3156": { + "file_id": 263, + "content": " self.last_epoch = last_epoch\n self.last_lr = self.warmup_start_lr # used in first iter\n self.verbose = verbose\n self._var_name = None\n def step(self, epoch=None, rebuild=False):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if not rebuild:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:135-160" + }, + "3157": { + "file_id": 263, + "content": "This code defines a custom learning rate scheduler for optimizers, allowing the learning rate to be updated based on epochs. The `step` function is used to update the learning rate, and the `_lr_func_steps_with_relative_lrs` function seems to set the learning rates for each parameter group.", + "type": "comment" + }, + "3158": { + "file_id": 263, + "content": " max_epoch):\n \"\"\"lr func steps with relative lrs\"\"\"\n # get step index\n steps = steps + [max_epoch]\n for ind, step in enumerate(steps):\n if cur_epoch < step:\n break\n return lrs[ind - 1] * base_lr\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_steps_with_relative_lrs(\n self.last_epoch,\n self.lrs,\n self.step_base_lr,\n self.steps,\n self.max_epoch,\n )\n lr_end = self._lr_func_steps_with_relative_lrs(\n self.warmup_epochs,\n self.lrs,\n self.step_base_lr,\n self.steps,\n self.max_epoch,\n )\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n return lr\nclass CustomPiecewiseDecay(PiecewiseDecay):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:161-196" + }, + "3159": { + "file_id": 263, + "content": "This code defines a custom learning rate (LR) scheduler for the PaddleVideo library. 
It uses a piecewise function to define different LRs at various epochs, and also implements a warmup phase with a linear decay from an initial LR to the first defined LR after the warmup period. The code provides functions to calculate the LR at each epoch based on the given steps, LR values, base LR, and maximum epoch.", + "type": "comment" + }, + "3160": { + "file_id": 263, + "content": " \"\"\"CustomPiecewiseDecay\"\"\"\n def __init__(self, **kargs):\n \"\"\"start\"\"\"\n kargs.pop('num_iters')\n super().__init__(**kargs)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:197-201" + }, + "3161": { + "file_id": 263, + "content": "This code defines a custom learning rate scheduler, which initializes an instance of the class and takes keyword arguments. The 'num_iters' argument is specifically excluded from being passed as a parameter.", + "type": "comment" + }, + "3162": { + "file_id": 264, + "content": "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py", + "type": "filepath" + }, + "3163": { + "file_id": 264, + "content": "The code constructs a learning rate scheduler for PaddleVideo's VideoQualityAssessment module, using the PiecewiseDecay method and handling learning rate configurations. It creates an LR scheduler instance based on name and updates num_iters if iter_step is present.", + "type": "summary" + }, + "3164": { + "file_id": 264, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport copy\nimport paddle\nfrom . import custom_lr\ndef build_lr(cfg, num_iters):\n \"\"\"\n Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer.\n In configuration:\n learning_rate:\n name: 'PiecewiseDecay'\n boundaries: [20, 60]\n values: [0.00025, 0.000025, 0.0000025]\n Returns:\n A paddle.optimizer.lr instance.\n \"\"\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py:1-33" + }, + "3165": { + "file_id": 264, + "content": "This code builds a learning rate scheduler according to the \"OPTIMIZER\" configuration. It uses the PiecewiseDecay method with specified boundaries and values. 
The learning rate scheduler is always passed into the optimizer.", + "type": "comment" + }, + "3166": { + "file_id": 264, + "content": " cfg_copy = cfg.copy()\n #when learning_rate is LRScheduler\n if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'],\n dict):\n cfg_copy['learning_rate'] = build_lr(\n cfg_copy['learning_rate'],\n num_iters) #not support only inner iter_step\n lr_name = cfg_copy.pop('name')\n if cfg_copy.get('iter_step'):\n cfg_copy['num_iters'] = num_iters\n cfg_copy.pop('iter_step')\n return getattr(custom_lr, lr_name)(**cfg_copy)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py:35-49" + }, + "3167": { + "file_id": 264, + "content": "This code handles learning rate configuration in PaddleVideo's VideoQualityAssessment module. It checks if the learning rate is a dictionary and modifies it accordingly, then creates an instance of the appropriate LR scheduler based on the specified name. If an iter_step is present, it updates num_iters before removing it.", + "type": "comment" + }, + "3168": { + "file_id": 265, + "content": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py", + "type": "filepath" + }, + "3169": { + "file_id": 265, + "content": "This code constructs an optimizer and learning rate scheduler for parameter optimization, adjustable parameters, and applies regularizers to prevent overfitting. It sets weight decay based on name and value from configuration and returns the optimizer with specified parameters.", + "type": "summary" + }, + "3170": { + "file_id": 265, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport copy\nimport paddle\ndef build_optimizer(cfg, lr_scheduler, parameter_list=None):\n \"\"\"\n Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration .\n In configuration:\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay: 0.001\n or\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:1-36" + }, + "3171": { + "file_id": 265, + "content": "This code builds an optimizer and learning rate scheduler for parameter optimization based on the given configuration file. 
It allows for different optimizer types (e.g., Momentum) with adjustable parameters like momentum and weight decay.", + "type": "comment" + }, + "3172": { + "file_id": 265, + "content": " name: \"L1\"\n value: 0.001\n Momentum optimizer will be applied to optimize network and L1Decay regularizer will be applied to avoid overfit.\n OPTIMIZER:\n name: Adam\n weight_decay:\n name: \"L2\"\n value: 0.001\n Adam optimizer will be applied to optimize network and L2Decay regularizer will applied to avoid overfit.\n Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details.\n Args:\n cfg (dict): optimizer configuration.\n lr_schduler: learning rate scheduler.\n parameter_list (list): parameters to be optimized.\n Returns:\n optimizer (paddle.optimizer): paddle optimizer.\n \"\"\"\n cfg_copy = cfg.copy()\n #XXX check none and illegal cfg!!!\n opt_name = cfg_copy.pop('name')\n # deal with weight decay\n if cfg_copy.get('weight_decay'):\n if isinstance(cfg_copy.get('weight_decay'), float) or 'L1' in cfg_copy.get('weight_decay').get('name').upper():", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:37-68" + }, + "3173": { + "file_id": 265, + "content": "This code defines an optimizer function that creates an optimizer based on the provided configuration. It uses an Adam optimizer to optimize a network and applies an L2Decay regularizer to avoid overfitting. The L1Decay regularizer can also be applied. The function takes an optimizer configuration dictionary, learning rate scheduler, and a list of parameters to be optimized as inputs and returns a paddle optimizer object. It checks for none and illegal configurations.", + "type": "comment" + }, + "3174": { + "file_id": 265, + "content": " cfg_copy['weight_decay'] = cfg_copy.get('weight_decay').get('value')\n elif 'L2' in cfg_copy.get('weight_decay').get('name').upper():\n cfg_copy['weight_decay'] = paddle.regularizer.L2Decay(cfg_copy.get('weight_decay').get('value'))\n else:\n raise ValueError\n cfg_copy.pop('learning_rate')\n return getattr(paddle.optimizer, opt_name)(lr_scheduler,\n parameters=parameter_list,\n **cfg_copy)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:69-79" + }, + "3175": { + "file_id": 265, + "content": "This code sets the weight decay based on its name and value from configuration. If 'L2' is in the name, it adds L2 Decay regularizer. Otherwise, it raises a ValueError. It then removes learning_rate from config and returns an optimizer with specified parameters and other configurations.", + "type": "comment" + }, + "3176": { + "file_id": 266, + "content": "/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py", + "type": "filepath" + }, + "3177": { + "file_id": 266, + "content": "This code is a part of PaddleVideo's VideoQualityAssessment module and it imports and defines functions for training and testing models.", + "type": "summary" + }, + "3178": { + "file_id": 266, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .train import train_model\nfrom .test import test_model\n__all__ = ['train_model', 'test_model']", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py:1-20" + }, + "3179": { + "file_id": 266, + "content": "This code is a part of PaddleVideo's VideoQualityAssessment module and it imports and defines functions for training and testing models.", + "type": "comment" + }, + "3180": { + "file_id": 267, + "content": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py", + "type": "filepath" + }, + "3181": { + "file_id": 267, + "content": "This code tests a model using Paddle framework, constructs multi-card datasets, enables parallel processing with DataParallel, updates state dictionary for evaluation. Batch size is set, metric object built based on configuration, and data iterated over from loader, using either parallel or sequential testing, updating metric per batch before accumulating results.", + "type": "summary" + }, + "3182": { + "file_id": 267, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics import build_metric\nfrom ..modeling.builder import build_model\nfrom paddlevideo.utils import load\nimport time\nlogger = get_logger(\"paddlevideo\")\n@paddle.no_grad()\ndef test_model(cfg, weights, parallel=True):\n \"\"\"Test model entry\n Args:\n cfg (dict): configuration.\n weights (str): weights path to load.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:1-35" + }, + "3183": { + "file_id": 267, + "content": "This code is a function named \"test_model\" which tests a given model using specified configuration and weights path. It uses Paddle framework and utilizes functions from paddlevideo.utils, loader.builder, metrics, and modeling.builder to perform the testing.", + "type": "comment" + }, + "3184": { + "file_id": 267, + "content": " parallel (bool): Whether to do multi-cards testing. Default: True.\n \"\"\"\n # 1. Construct model.\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. 
Construct dataset and dataloader.\n cfg.DATASET.test.test_mode = True\n dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test))\n batch_size = cfg.DATASET.get(\"test_batch_size\", 1)\n places = paddle.set_device('gpu')\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n places=places,\n drop_last=False,\n shuffle=False)\n data_loader = build_dataloader(dataset, **dataloader_setting)\n model.eval()\n state_dicts = load(weights)\n model.set_state_dict(state_dicts)\n # add params to metrics\n cfg.METRIC.data_size = len(dataset)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:36-66" + }, + "3185": { + "file_id": 267, + "content": "This code constructs a model and dataset for multi-card testing. It uses DataParallel to enable parallel processing on multiple GPUs, builds the dataloader with specified settings, sets the model to evaluation mode, loads state dictionaries from weights file, and updates the model's state dictionary. The metric data size is set to the length of the dataset.", + "type": "comment" + }, + "3186": { + "file_id": 267, + "content": " cfg.METRIC.batch_size = batch_size\n Metric = build_metric(cfg.METRIC)\n for batch_id, data in enumerate(data_loader):\n if parallel:\n outputs = model._layers.test_step(data)\n else:\n outputs = model.test_step(data)\n Metric.update(batch_id, data, outputs)\n Metric.accumulate()", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:67-78" + }, + "3187": { + "file_id": 267, + "content": "This code sets the batch size, builds a metric object based on configuration, and iterates over data from a loader. Inside the loop, it either uses parallel or sequential testing to get outputs, then updates the metric for each batch before accumulating results.", + "type": "comment" + }, + "3188": { + "file_id": 268, + "content": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py", + "type": "filepath" + }, + "3189": { + "file_id": 268, + "content": "This code utilizes PaddleVideo library to train a video quality assessment model with GPU support, parallel processing, and distributed training. It includes data loaders, solvers, optimization, logging, and validation for efficient model training.", + "type": "summary" + }, + "3190": { + "file_id": 268, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport time\nimport os.path as osp\nimport paddle\nimport paddle.distributed.fleet as fleet\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..modeling.builder import build_model\nfrom ..solver import build_lr, build_optimizer\nfrom ..metrics import build_metric\nfrom ..utils import do_preciseBN\nfrom paddlevideo.utils import get_logger, coloring\nfrom paddlevideo.utils import (AverageMeter, build_rec_record, log_batch,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:1-28" + }, + "3191": { + "file_id": 268, + "content": "This code is part of the PaddleVideo library for video quality assessment. It imports necessary modules, and uses builders to construct data loaders, datasets, models, solvers, and metrics. It also includes utilities for logging and batch processing.", + "type": "comment" + }, + "3192": { + "file_id": 268, + "content": " log_epoch, save, load, mkdir)\n#from paddlevideo.metrics import QualityMetric\nimport numpy as np\nfrom scipy import stats\ndef train_model(cfg,\n weights=None,\n parallel=True,\n validate=True,\n amp=False,\n fleet=False):\n \"\"\"Train model entry\n Args:\n \tcfg (dict): configuration.\n weights (str): weights path for finetuning.\n \tparallel (bool): Whether multi-cards training. Default: True.\n validate (bool): Whether to do evaluation. Default: False.\n \"\"\"\n if fleet:\n fleet.init(is_collective=True)\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DATASET.get('batch_size', 8)\n valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size)\n places = paddle.set_device('gpu')\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", \"./output/model_name/\")", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:29-61" + }, + "3193": { + "file_id": 268, + "content": "This function trains a model with specified configuration. It uses GPU for computation, and allows for parallel processing if multiple GPUs are available. Optionally, it performs validation during training and can also be used for fleet-based distributed training. The trained model's output directory is defined in the configuration file.", + "type": "comment" + }, + "3194": { + "file_id": 268, + "content": " mkdir(output_dir)\n # 1. Construct model\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n if fleet:\n model = paddle.distributed_model(model)\n # 2. 
Construct dataset and dataloader\n train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))\n train_dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n collate_fn_cfg=cfg.get('MIX', None),\n places=places)\n train_loader = build_dataloader(train_dataset, **train_dataloader_setting)\n if validate:\n valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))\n validate_dataloader_setting = dict(\n batch_size=valid_batch_size,\n num_workers=num_workers,\n places=places,\n drop_last=False,\n shuffle=cfg.DATASET.get(\n 'shuffle_valid',\n False) #NOTE: attention lstm need shuffle valid data.", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:62-89" + }, + "3195": { + "file_id": 268, + "content": "Code snippet creates a directory, builds the model based on configuration, and sets up data loaders for training and validation datasets. It also handles parallelization and distributed model usage if specified.", + "type": "comment" + }, + "3196": { + "file_id": 268, + "content": " )\n valid_loader = build_dataloader(valid_dataset,\n **validate_dataloader_setting)\n # 3. Construct solver.\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(cfg.OPTIMIZER,\n lr,\n parameter_list=model.parameters())\n if fleet:\n optimizer = fleet.distributed_optimizer(optimizer)\n # Resume\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(output_dir,\n model_name + \"_epoch_{}\".format(resume_epoch))\n resume_model_dict = load(filename + '.pdparams')\n resume_opt_dict = load(filename + '.pdopt')\n model.set_state_dict(resume_model_dict)\n optimizer.set_state_dict(resume_opt_dict)\n # Finetune:\n if weights:\n assert resume_epoch == 0, \"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it.\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:90-113" + }, + "3197": { + "file_id": 268, + "content": "Building a valid data loader, constructing a solver with specified optimizer and learning rate, resuming training from a previous epoch or finetuning the model.", + "type": "comment" + }, + "3198": { + "file_id": 268, + "content": " model_dict = load(weights)\n model.set_state_dict(model_dict)\n # 4. Train Model\n ###AMP###\n if amp:\n scaler = paddle.amp.GradScaler(init_loss_scaling=1024)\n best = 0.\n max_SROCC = 0\n max_PLCC = 0\n Metric = build_metric(cfg.METRIC)\n for epoch in range(0, cfg.epochs):\n if epoch < resume_epoch:\n logger.info(\n \"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... \"\n )\n continue\n model.train()\n record_list = build_rec_record(cfg.MODEL)\n tic = time.time()\n train_output = []\n train_label = []\n for i, data in enumerate(train_loader):\n record_list['reader_time'].update(time.time() - tic)\n # 4.1 forward\n ###AMP###\n if amp:\n with paddle.amp.auto_cast(\n custom_black_list={\"temporal_shift\", \"reduce_mean\"}):\n if parallel:\n outputs = model._layers.train_step(data)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:114-147" + }, + "3199": { + "file_id": 268, + "content": "The code loads a model and sets its state dict. Then, it proceeds to train the model if not in resume phase. It builds record_list for metrics calculation and iterates through data from train loader to forward pass and calculate metrics. 
If AMP is enabled, auto-casting is used during training steps.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/32.json b/docs/data/32.json new file mode 100644 index 000000000..c46d74707 --- /dev/null +++ b/docs/data/32.json @@ -0,0 +1,541 @@ +{ + "3200": { + "file_id": 268, + "content": " ## required for DataParallel, will remove in next version\n model._reducer.prepare_for_backward(\n list(model._find_varbase(outputs)))\n else:\n outputs = model.train_step(data)\n train_output.extend(outputs['output'])\n train_label.extend(outputs['label'])\n avg_loss = outputs['loss']\n scaled = scaler.scale(avg_loss)\n scaled.backward()\n # keep prior to 2.0 design\n scaler.minimize(optimizer, scaled)\n optimizer.clear_grad()\n else:\n if parallel:\n outputs = model._layers.train_step(data)\n ## required for DataParallel, will remove in next version\n model._reducer.prepare_for_backward(\n list(model._find_varbase(outputs)))\n else:\n outputs = model.train_step(data)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:148-171" + }, + "3201": { + "file_id": 268, + "content": "This code handles the model training step for video quality assessment. It uses the model's `train_step` function to calculate outputs and labels, then extends them to the train_output and train_label lists respectively. The average loss is calculated and scaled before its backward pass. Finally, it performs optimization by minimizing the scaler and clearing gradients.", + "type": "comment" + }, + "3202": { + "file_id": 268, + "content": " train_output.extend(outputs['output'])\n train_label.extend(outputs['label'])\n # 4.2 backward\n avg_loss = outputs['loss']\n avg_loss.backward()\n # 4.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(optimizer._global_learning_rate(),\n batch_size)\n for name, value in outputs.items():\n if name == 'output' or name == 'label':\n continue\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"train\", ips)\n # learning rate iter step", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:173-198" + }, + "3203": { + "file_id": 268, + "content": "Code snippet performs backward propagation, optimizes model parameters, logs training progress and learning rate, updates metrics, and logs information at specified intervals.", + "type": "comment" + }, + "3204": { + "file_id": 268, + "content": " if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step\n if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n train_PLCC, train_SROCC = Metric.accumulate_train(\n train_output, train_label)\n logger.info(\"train_SROCC={}\".format(train_SROCC))\n logger.info(\"train_PLCC={}\".format(train_PLCC))\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n eval_output = []\n eval_label = []\n def evaluate(best, max_SROCC, max_PLCC):\n \"\"\"evaluate\"\"\"\n model.eval()\n record_list = build_rec_record(cfg.MODEL)\n record_list.pop('lr')\n tic = time.time()\n for i, data in enumerate(valid_loader):\n if parallel:\n 
outputs = model._layers.val_step(data)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:199-229" + }, + "3205": { + "file_id": 268, + "content": "This code is part of a training process for a video quality assessment model. It checks if the learning rate should be updated by an iterative step, then updates it accordingly. The code calculates the train_SROCC and train_PLCC metrics to track progress, logs this information, and evaluates the model's performance on a separate validation dataset. A record of the training process is maintained to monitor batch time and other relevant statistics.", + "type": "comment" + }, + "3206": { + "file_id": 268, + "content": " else:\n outputs = model.val_step(data)\n eval_output.extend(outputs['output'])\n eval_label.extend(outputs['label'])\n # log_record\n for name, value in outputs.items():\n if name == 'output' or name == 'label':\n continue\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"val\", ips)\n eval_PLCC, eval_SROCC = Metric.accumulate_train(\n eval_output, eval_label)\n logger.info(\"val_SROCC={}\".format(eval_SROCC))\n logger.info(\"val_PLCC={}\".format(eval_PLCC))\n if max_SROCC <= eval_SROCC and max_PLCC <= eval_PLCC:", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:230-254" + }, + "3207": { + "file_id": 268, + "content": "This code is part of a model's validation step during training. It collects outputs and labels from the model, updates logging records, and logs validation metrics such as SROCC and PLCC. If these metrics are greater than the previous maximum values, it updates the max values.", + "type": "comment" + }, + "3208": { + "file_id": 268, + "content": " max_SROCC = eval_SROCC\n max_PLCC = eval_PLCC\n logger.info(\"max_SROCC={}\".format(max_SROCC))\n logger.info(\"max_PLCC={}\".format(max_PLCC))\n save(optimizer.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdopt\"))\n save(model.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdparams\"))\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"val\", ips)\n return best, max_SROCC, max_PLCC\n # use precise bn to improve acc\n if cfg.get(\"PRECISEBN\") and (epoch % cfg.PRECISEBN.preciseBN_interval\n == 0 or epoch == cfg.epochs - 1):\n do_preciseBN(\n model, train_loader, parallel,\n min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader)))", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:255-276" + }, + "3209": { + "file_id": 268, + "content": "This code snippet is responsible for storing the best optimizer and model states, logging instance per second (ips) during validation phase, and optionally performing precise batch normalization if configuration allows. It returns the best parameters, maximum SROCC, and maximum PLCC values.", + "type": "comment" + }, + "3210": { + "file_id": 268, + "content": " # 5. Validation\n if validate and (epoch % cfg.get(\"val_interval\", 1) == 0\n or epoch == cfg.epochs - 1):\n with paddle.no_grad():\n best, max_SROCC, max_PLCC = evaluate(best, max_SROCC, max_PLCC)\n # 6. 
Save model\n if epoch % cfg.get(\"save_interval\", 1) == 0 or epoch == cfg.epochs - 1:\n save(\n optimizer.state_dict(),\n osp.join(output_dir,\n model_name + \"_epoch_{}.pdopt\".format(epoch)))\n save(\n model.state_dict(),\n osp.join(output_dir,\n model_name + \"_epoch_{}.pdparams\".format(epoch)))\n logger.info('training {model_name} finished')", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:278-295" + }, + "3211": { + "file_id": 268, + "content": "This code block performs validation and model saving in a training process. It validates the model every 'val_interval' epochs or on the last epoch, and saves optimizer and model states every 'save_interval' epochs or on the last epoch. The logger then informs that training is finished.", + "type": "comment" + }, + "3212": { + "file_id": 269, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py", + "type": "filepath" + }, + "3213": { + "file_id": 269, + "content": "This code is a module for the PaddleVideo package that includes various utility functions and classes, such as Registry, build_utils, config, logger, record, dist_utils, save_load, and precise_bn. It also defines __all__ to include Registry and build.", + "type": "summary" + }, + "3214": { + "file_id": 269, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .registry import Registry\nfrom .build_utils import build\nfrom .config import *\nfrom .logger import setup_logger, coloring, get_logger\nfrom .record import AverageMeter, build_record, build_rec_record, log_batch, log_epoch\nfrom .dist_utils import get_dist_info, main_only\nfrom .save_load import save, load, load_ckpt, mkdir\nfrom .precise_bn import do_preciseBN\n__all__ = ['Registry', 'build']", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py:1-25" + }, + "3215": { + "file_id": 269, + "content": "This code is a module for the PaddleVideo package that includes various utility functions and classes, such as Registry, build_utils, config, logger, record, dist_utils, save_load, and precise_bn. It also defines __all__ to include Registry and build.", + "type": "comment" + }, + "3216": { + "file_id": 270, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py", + "type": "filepath" + }, + "3217": { + "file_id": 270, + "content": "This Python function builds a module from a config dictionary, checks its validity, retrieves an object class from a registry, and returns an instance with optional parameters.", + "type": "summary" + }, + "3218": { + "file_id": 270, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\ndef build(cfg, registry, key='name'):\n \"\"\"Build a module from config dict.\n Args:\n cfg (dict): Config dict. It should at least contain the key.\n registry (XXX): The registry to search the type from.\n key (str): the key.\n Returns:\n obj: The constructed object.\n \"\"\"\n assert isinstance(cfg, dict) and key in cfg\n cfg_copy = cfg.copy()\n obj_type = cfg_copy.pop(key)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py:1-30" + }, + "3219": { + "file_id": 270, + "content": "This code snippet is a Python function that builds a module from a config dictionary. It checks if the input is a dictionary and verifies if the specified key exists. Then it makes a copy of the dictionary and removes the specified key, returning the constructed object. The registry is used to search for the type of the module.", + "type": "comment" + }, + "3220": { + "file_id": 270, + "content": " obj_cls = registry.get(obj_type)\n if obj_cls is None:\n raise KeyError('{} is not in the {} registry'.format(\n obj_type, registry.name))\n return obj_cls(**cfg_copy)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py:32-36" + }, + "3221": { + "file_id": 270, + "content": "The code retrieves an object class from a registry based on the provided \"obj_type\", and if not found, raises a KeyError with an informative message. It then returns an instance of the retrieved class with optional configuration parameters.", + "type": "comment" + }, + "3222": { + "file_id": 271, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py", + "type": "filepath" + }, + "3223": { + "file_id": 271, + "content": "This code includes the AttrDict class and functions for managing config files in PaddleVideo. These functions handle dictionary creation, config file parsing, recursive printing, and value overriding.", + "type": "summary" + }, + "3224": { + "file_id": 271, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os\nimport yaml\nfrom paddlevideo.utils.logger import coloring, get_logger, setup_logger\n__all__ = ['get_config']\nlogger = setup_logger(\"./\", name=\"paddlevideo\", level=\"INFO\")\nclass AttrDict(dict):\n \"\"\"Attr Dict\"\"\"\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:1-35" + }, + "3225": { + "file_id": 271, + "content": "This code file contains the definition of a class AttrDict, which is used to handle configurations in the PaddleVideo library. It also sets up a logger for logging information related to PaddleVideo. The __all__ variable holds the list of functions/classes that are exported by this module. This file is part of PaddleVideo's utility package.", + "type": "comment" + }, + "3226": { + "file_id": 271, + "content": "def create_attr_dict(yaml_config):\n \"\"\"create attr dict\"\"\"\n from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef print_dict(d, delimiter=0):\n \"\"\"\n Recursively visualize a dict and\n indenting acrrording by the relationship of keys.\n \"\"\"\n placeholder = \"-\" * 60\n for k, v in sorted(d.items()):\n if isinstance(v, dict):\n logger.info(\"{}{} : \".format(delimiter * \" \", coloring(k,", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:38-71" + }, + "3227": { + "file_id": 271, + "content": "This code defines three functions: \"create_attr_dict\", \"parse_config\", and \"print_dict\". The \"create_attr_dict\" function converts specific values in a dictionary to AttrDict objects. The \"parse_config\" function loads a configuration file into an AttrDict object after applying the create_attr_dict function to it. 
Finally, the \"print_dict\" function recursively prints out the contents of a dictionary, indented based on their relationships.", + "type": "comment" + }, + "3228": { + "file_id": 271, + "content": " \"HEADER\")))\n print_dict(v, delimiter + 4)\n elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n logger.info(\"{}{} : \".format(delimiter * \" \",\n coloring(str(k), \"HEADER\")))\n for value in v:\n print_dict(value, delimiter + 4)\n else:\n logger.info(\"{}{} : {}\".format(delimiter * \" \",\n coloring(k, \"HEADER\"),\n coloring(v, \"OKGREEN\")))\n if k.isupper():\n logger.info(placeholder)\ndef print_config(config):\n \"\"\"\n visualize configs\n Arguments:\n config: configs\n \"\"\"\n print_dict(config)\ndef check_config(config):\n \"\"\"\n Check config\n \"\"\"\n pass\ndef override(dl, ks, v):\n \"\"\"\n Recursively replace dict of list\n Args:\n dl(dict or list): dict or list to be replaced\n ks(list): list of keys\n v(str): value to be replaced", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:72-110" + }, + "3229": { + "file_id": 271, + "content": "This code defines functions to print and check config files. The \"print_config\" function visualizes the config file by printing its content in a structured format, while the \"check_config\" function is currently a placeholder with no implementation. The \"override\" function allows recursive replacement of values within a dictionary or list.", + "type": "comment" + }, + "3230": { + "file_id": 271, + "content": " \"\"\"\n def str2num(v):\n \"\"\"str2num\"\"\"\n try:\n return eval(v)\n except Exception:\n return v\n assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n assert len(ks) > 0, ('lenght of keys should larger than 0')\n if isinstance(dl, list):\n k = str2num(ks[0])\n if len(ks) == 1:\n assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n dl[k] = str2num(v)\n else:\n override(dl[k], ks[1:], v)\n else:\n if len(ks) == 1:\n #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n if not ks[0] in dl:\n logger.warning('A new filed ({}, {}) detected!'.format(ks[0], dl))\n dl[ks[0]] = str2num(v)\n else:\n assert ks[0] in dl, (\n '({}) doesn\\'t exist in {}, a new dict field is invalid'.format(\n ks[0], dl))\n override(dl[ks[0]], ks[1:], v)\ndef override_config(config, options=None):\n \"\"\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:111-142" + }, + "3231": { + "file_id": 271, + "content": "This code defines a function `override_config` which takes in a config and optionally an options parameter. It checks if the config is either a list or dictionary, and ensures that there are keys in the config. If the config is a list, it uses the `str2num` function to convert the first key into a number and then uses this index to set the corresponding value. If there's only one key, it checks if the index is within range before setting the value. If there are multiple keys, it calls the `override` function with the first key, remaining keys, and value. If the config is a dictionary, it checks if the first key exists in the dictionary. If it doesn't, it logs a warning about a new field being detected. 
It then sets the value using the first key or calls `override` for subsequent keys.", + "type": "comment" + }, + "3232": { + "file_id": 271, + "content": " Recursively override the config\n Args:\n config(dict): dict to be replaced\n options(list): list of pairs(key0.key1.idx.key2=value)\n such as: [\n epochs=20',\n 'PIPELINE.train.transform.1.ResizeImage.resize_short=300'\n ]\n Returns:\n config(dict): replaced config\n \"\"\"\n if options is not None:\n for opt in options:\n assert isinstance(opt,\n str), (\"option({}) should be a str\".format(opt))\n assert \"=\" in opt, (\n \"option({}) should contain a =\"\n \"to distinguish between key and value\".format(opt))\n pair = opt.split('=')\n assert len(pair) == 2, (\"there can be only a = in the option\")\n key, value = pair\n keys = key.split('.')\n override(config, keys, value)\n return config\ndef get_config(fname, overrides=None, show=True):\n \"\"\"\n Read config from file\n \"\"\"\n assert os.path.exists(fname), ('config file({}) is not exist'.format(fname))", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:143-174" + }, + "3233": { + "file_id": 271, + "content": "This code defines a function that recursively overrides the config with given options. It takes a dictionary (config) and a list of key-value pairs (options) as arguments, and returns the updated config after overriding. The function checks if the options are provided and in the correct format. If so, it splits the key-value pair, extracts the keys and values, and recursively overrides the config with these values. Finally, it returns the updated config. The code also includes a separate function that reads the config from a file and has optional parameters for overrides and displaying information.", + "type": "comment" + }, + "3234": { + "file_id": 271, + "content": " config = parse_config(fname)\n override_config(config, overrides)\n if show:\n print_config(config)\n check_config(config)\n return config", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:175-180" + }, + "3235": { + "file_id": 271, + "content": "This function parses a configuration file, applies any overrides, displays the config if requested, and checks its validity before returning it.", + "type": "comment" + }, + "3236": { + "file_id": 272, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py", + "type": "filepath" + }, + "3237": { + "file_id": 272, + "content": "This code includes utility functions for managing distributed computation in PaddleVideo's Video Quality Assessment application, providing current rank and world size info, and a decorator to limit function execution to the main process.", + "type": "summary" + }, + "3238": { + "file_id": 272, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport functools\nimport paddle\nimport paddle.distributed as dist\ndef get_dist_info():\n \"\"\"get_dist_info\"\"\"\n world_size = dist.get_world_size()\n rank = dist.get_rank()\n return rank, world_size\ndef main_only(func):\n \"\"\"main_only\"\"\"\n @functools.wraps(func)\n def wrapper(*args, **kwargs):\n \"\"\"wrapper\"\"\"\n rank, _ = get_dist_info()\n if rank == 0:\n return func(*args, **kwargs)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py:1-35" + }, + "3239": { + "file_id": 272, + "content": "This code provides utility functions for handling distributed computation in PaddleVideo's Video Quality Assessment application. The `get_dist_info()` function returns the current rank and world size, while `main_only(func)` is a decorator that ensures a function only runs on the main process (rank 0).", + "type": "comment" + }, + "3240": { + "file_id": 272, + "content": " return wrapper", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py:36-36" + }, + "3241": { + "file_id": 272, + "content": "This function returns the modified or wrapped object, which can be a tensor, model, or other data structure.", + "type": "comment" + }, + "3242": { + "file_id": 273, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py", + "type": "filepath" + }, + "3243": { + "file_id": 273, + "content": "The code provides a logger class for PaddleVideo's Video Quality Assessment app, enabling logging for distributed apps with rank-based output to file or console. It initializes loggers and disables log event propagation when verbosity level is set to \"DEBUG\".", + "type": "summary" + }, + "3244": { + "file_id": 273, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport logging\nimport os\nimport sys\nimport datetime\nfrom paddle.distributed import ParallelEnv\nColor = {\n 'RED': '\\033[31m',\n 'HEADER': '\\033[35m', # deep purple\n 'PURPLE': '\\033[95m', # purple\n 'OKBLUE': '\\033[94m',\n 'OKGREEN': '\\033[92m',\n 'WARNING': '\\033[93m',\n 'FAIL': '\\033[91m',\n 'ENDC': '\\033[0m'\n}\ndef coloring(message, color=\"OKGREEN\"):\n \"\"\"coloring\"\"\"\n assert color in Color.keys()", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:1-40" + }, + "3245": { + "file_id": 273, + "content": "This code snippet is from PaddleVideo's Video Quality Assessment application, and it contains a logger class for logging messages. The logger imports necessary modules like logging, os, sys, datetime, and ParallelEnv, along with defining color codes and a function for colored output.", + "type": "comment" + }, + "3246": { + "file_id": 273, + "content": " if os.environ.get('COLORING', True):\n return Color[color] + str(message) + Color[\"ENDC\"]\n else:\n return message\nlogger_initialized = []\ndef setup_logger(output=None, name=\"paddlevideo\", level=\"INFO\"):\n \"\"\"\n Initialize the paddlevideo logger and set its verbosity level to \"INFO\".\n Args:\n output (str): a file name or a directory to save log. If None, will not save log file.\n If ends with \".txt\" or \".log\", assumed to be a file name.\n Otherwise, logs will be saved to `output/log.txt`.\n name (str): the root module name of this logger\n Returns:\n logging.Logger: a logger\n \"\"\"\n def time_zone(sec, fmt):\n real_time = datetime.datetime.now()\n return real_time.timetuple()\n logging.Formatter.converter = time_zone\n logger = logging.getLogger(name)\n if level == \"INFO\":\n logger.setLevel(logging.INFO)\n elif level==\"DEBUG\":\n logger.setLevel(logging.DEBUG)\n logger.propagate = False\n if level == \"DEBUG\":\n plain_formatter = logging.Formatter(", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:41-74" + }, + "3247": { + "file_id": 273, + "content": "Function `setup_logger` initializes the paddlevideo logger and sets its verbosity level to \"INFO\". It takes optional arguments for output file name or directory, and root module name. If the verbosity level is set to \"DEBUG\", the logger will have a lower threshold for logging messages. 
The function also disables propagation of log events to the root logger.", + "type": "comment" + }, + "3248": { + "file_id": 273, + "content": " \"[%(asctime)s] %(name)s %(levelname)s: %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n else:\n plain_formatter = logging.Formatter(\n \"[%(asctime)s] %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n # stdout logging: master only\n local_rank = ParallelEnv().local_rank\n if local_rank == 0:\n ch = logging.StreamHandler(stream=sys.stdout)\n ch.setLevel(logging.DEBUG)\n formatter = plain_formatter\n ch.setFormatter(formatter)\n logger.addHandler(ch)\n # file logging: all workers\n if output is not None:\n if output.endswith(\".txt\") or output.endswith(\".log\"):\n filename = output\n else:\n filename = os.path.join(output, \".log.txt\")\n if local_rank > 0:\n filename = filename + \".rank{}\".format(local_rank)\n # PathManager.mkdirs(os.path.dirname(filename))\n os.makedirs(os.path.dirname(filename), exist_ok=True)\n # fh = logging.StreamHandler(_cached_log_stream(filename)\n fh = logging.FileHandler(filename, mode='a')", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:75-103" + }, + "3249": { + "file_id": 273, + "content": "This code sets up logging configuration for a distributed application. It uses a logger to handle log messages, and based on the local rank of each process, it determines whether to output logs to standard out, standard err, or a file. If no output is provided, it defaults to a \".log.txt\" file in the specified directory. The log files for different ranks are distinguished by appending the rank number. If the directory doesn't exist, it creates one before writing the logs.", + "type": "comment" + }, + "3250": { + "file_id": 273, + "content": " fh.setLevel(logging.DEBUG)\n fh.setFormatter(plain_formatter)\n logger.addHandler(fh)\n logger_initialized.append(name)\n return logger\ndef get_logger(name, output=None):\n \"\"\"get logger\"\"\"\n logger = logging.getLogger(name)\n if name in logger_initialized:\n return logger\n return setup_logger(name=name, output=name)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:104-117" + }, + "3251": { + "file_id": 273, + "content": "This function `get_logger` sets up a logger with the given name. If the logger has already been initialized, it simply returns the existing logger. Otherwise, it calls `setup_logger` to initialize the logger with the given name and optional output. The logger is configured to handle debug level messages using plain formatter and this configuration is appended to the list of initialized loggers.", + "type": "comment" + }, + "3252": { + "file_id": 274, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py", + "type": "filepath" + }, + "3253": { + "file_id": 274, + "content": "This code updates batch normalization in PaddleVideo library, improving accuracy by using true mean and variance for validation during training.", + "type": "summary" + }, + "3254": { + "file_id": 274, + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport itertools\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n\"\"\"\nImplement precise bn, which is useful for improving accuracy.\n\"\"\"\ndef do_preciseBN(model, data_loader, parallel, num_iters=200):\n \"\"\"\n Recompute and update the batch norm stats to make them more precise. During\n training both BN stats and the weight are changing after every iteration, so\n the running average can not precisely reflect the actual stats of the", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:1-31" + }, + "3255": { + "file_id": 274, + "content": "This code snippet is part of the PaddleVideo library and aims to implement a precise batch normalization method. The function \"do_preciseBN\" takes in a model, data loader, parallel flag, and number of iterations as parameters. It updates the batch norm stats more precisely by recomputing them after every iteration during training. This improves accuracy by better reflecting the actual stats of the dataset.", + "type": "comment" + }, + "3256": { + "file_id": 274, + "content": " current model.\n In this function, the BN stats are recomputed with fixed weights, to make\n the running average more precise. Specifically, it computes the true average\n of per-batch mean/variance instead of the running average.\n This is useful to improve validation accuracy.\n Args:\n model: the model whose bn stats will be recomputed\n data_loader: an iterator. Produce data as input to the model\n num_iters: number of iterations to compute the stats.\n Return:\n the model with precise mean and variance in bn layers.\n \"\"\"\n bn_layers_list = [\n m for m in model.sublayers()\n if any((isinstance(m, bn_type)\n for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D,\n paddle.nn.BatchNorm3D))) and m.training\n ]\n if len(bn_layers_list) == 0:\n return\n # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum)\n # we set momentum=0. to get the true mean and variance during forward\n momentum_actual = [bn._momentum for bn in bn_layers_list]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:32-55" + }, + "3257": { + "file_id": 274, + "content": "The function precisely computes BN stats with fixed weights for a model using a data loader and a specified number of iterations. 
It replaces running averages in BN layers with true mean and variance to improve validation accuracy.", + "type": "comment" + }, + "3258": { + "file_id": 274, + "content": " for bn in bn_layers_list:\n bn._momentum = 0.\n running_mean = [paddle.zeros_like(bn._mean)\n for bn in bn_layers_list] #pre-ignore\n running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list]\n ind = -1\n for ind, data in enumerate(itertools.islice(data_loader, num_iters)):\n logger.info(\"doing precise BN {} / {}...\".format(ind + 1, num_iters))\n if parallel:\n model._layers.train_step(data)\n else:\n model.train_step(data)\n for i, bn in enumerate(bn_layers_list):\n # Accumulates the bn stats.\n running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1)\n running_var[i] += (bn._variance - running_var[i]) / (ind + 1)\n assert ind == num_iters - 1, (\n \"update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations.\"\n .format(num_iters, ind))\n # Sets the precise bn stats.\n for i, bn in enumerate(bn_layers_list):\n bn._mean.set_value(running_mean[i])", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:56-82" + }, + "3259": { + "file_id": 274, + "content": "This code initializes zeroed variables and then performs precise batch normalization (BN) by accumulating the BN statistics for a specified number of iterations. It updates the mean and variance values for each Batch Normalization layer in the model, ensuring accurate and precise normalization during training.", + "type": "comment" + }, + "3260": { + "file_id": 274, + "content": " bn._variance.set_value(running_var[i])\n bn._momentum = momentum_actual[i]", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:83-84" + }, + "3261": { + "file_id": 274, + "content": "These lines update the batch normalization layer's variance and momentum values with the corresponding values from the running average array. This helps maintain the normal distribution of activation values in the neural network, improving performance.", + "type": "comment" + }, + "3262": { + "file_id": 275, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py", + "type": "filepath" + }, + "3263": { + "file_id": 275, + "content": "This code from PaddleVideo's VideoQualityAssessment module builds a record list for tracking metrics during training, appends metric names and AverageMeter instances for various frameworks and models, formats and logs epoch, mode, metric average, and image processing speed information with color-coded visual distinction.", + "type": "summary" + }, + "3264": { + "file_id": 275, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom collections import OrderedDict\nfrom .logger import get_logger, coloring\nlogger = get_logger(\"paddlevideo\")\n__all__ = ['AverageMeter', 'build_record', 'build_rec_record', 'log_batch', 'log_epoch']\ndef build_record(cfg):\n framework_type = cfg.get('framework')\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:1-29" + }, + "3265": { + "file_id": 275, + "content": "This code is from the PaddleVideo library's VideoQualityAssessment module. It imports necessary classes and functions, defines logger variables, and provides a function to build a record list for loss and learning rate metrics. The framework type is specified, and if Recognizer1D is part of the specified framework, additional steps may be required.", + "type": "comment" + }, + "3266": { + "file_id": 275, + "content": " record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))\n record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n elif 'Recognizer' in cfg.framework:\n record_list.append((\"top1\", AverageMeter(\"top1\", '.5f')))\n record_list.append((\"top5\", AverageMeter(\"top5\", '.5f')))\n record_list.append((\"batch_time\", AverageMeter('elapse', '.3f')))\n record_list.append((\"reader_time\", AverageMeter('reader', '.3f')))\n record_list = OrderedDict(record_list)\n return record_list\ndef build_rec_record(cfg):\n \"\"\"build rec record\"\"\"\n framework_type = cfg.get('framework')\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework\n record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:30-51" + }, + "3267": { + "file_id": 275, + "content": "This code is building a record list for tracking metrics during the training process. It appends various metric names to the record list along with their corresponding AverageMeter instances for different frameworks and models. The AverageMeter keeps track of the average value over time, and each meter has its format specifier for displaying the values. 
The code also includes a function build_rec_record to create the record list based on the given configuration (cfg).", + "type": "comment" + }, + "3268": { + "file_id": 275, + "content": " record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n record_list.append((\"batch_time\", AverageMeter('elapse', '.3f')))\n record_list.append((\"reader_time\", AverageMeter('reader', '.3f')))\n record_list = OrderedDict(record_list)\n return record_list\nclass AverageMeter(object):\n \"\"\"\n Computes and stores the average and current value\n \"\"\"\n def __init__(self, name='', fmt='f', need_avg=True):\n self.name = name\n self.fmt = fmt\n self.need_avg = need_avg\n self.reset()\n def reset(self):\n \"\"\" reset \"\"\"\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n \"\"\" update \"\"\"\n if isinstance(val, paddle.Tensor):\n val = float(val)\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count\n @property\n def total(self):\n return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self)\n @property\n def total_minute(self):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:52-90" + }, + "3269": { + "file_id": 275, + "content": "This code defines a function `record_list` and a class `AverageMeter`. The function creates a list of record names and their corresponding AverageMeter objects, then converts the list to an OrderedDict. The AverageMeter class computes and stores average values, resets upon initialization, updates with new values, and provides properties for displaying total sum and total sum in minutes.", + "type": "comment" + }, + "3270": { + "file_id": 275, + "content": " return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60,\n self=self)\n @property\n def mean(self):\n return '{self.name}_avg: {self.avg:{self.fmt}}'.format(\n self=self) if self.need_avg else ''\n @property\n def value(self):\n return '{self.name}: {self.val:{self.fmt}}'.format(self=self)\ndef log_batch(metric_list, batch_id, epoch_id, total_epoch, mode, ips):\n metric_str = ' '.join([str(m.value) for m in metric_list.values()])\n epoch_str = \"epoch:[{:>3d}/{:<3d}]\".format(epoch_id, total_epoch)\n step_str = \"{:s} step:{:<4d}\".format(mode, batch_id)\n logger.info(\"{:s} {:s} {:s}s {}\".format(\n coloring(epoch_str, \"HEADER\") if batch_id == 0 else epoch_str,\n coloring(step_str, \"PURPLE\"), coloring(metric_str, 'OKGREEN'), ips))\ndef log_epoch(metric_list, epoch, mode, ips):\n metric_avg = ' '.join([str(m.mean) for m in metric_list.values()] +\n [metric_list['batch_time'].total])", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:91-115" + }, + "3271": { + "file_id": 275, + "content": "The code provides functions to log batch and epoch information for a video quality assessment task. The `log_batch` function takes in metric list, batch ID, epoch ID, total epochs, mode, and ips as input and logs the metrics, current epoch/total epochs, and step details. 
The `log_epoch` function calculates the mean of the metrics and logs the mean values along with the total batch time for an epoch.", + "type": "comment" + }, + "3272": { + "file_id": 275, + "content": " end_epoch_str = \"END epoch:{:<3d}\".format(epoch)\n logger.info(\"{:s} {:s} {:s}s {}\".format(coloring(end_epoch_str, \"RED\"),\n coloring(mode, \"PURPLE\"),\n coloring(metric_avg, \"OKGREEN\"),\n ips))", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:117-122" + }, + "3273": { + "file_id": 275, + "content": "This code snippet is formatting and logging information related to an epoch, mode, metric average, and image processing speed. It uses the \"coloring\" function to color certain parts of the log text (RED, PURPLE, OKGREEN) for better visual distinction. The logger then logs this information with time stamp.", + "type": "comment" + }, + "3274": { + "file_id": 276, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py", + "type": "filepath" + }, + "3275": { + "file_id": 276, + "content": "The Registry class supports module customization, allowing users to register objects and retrieve them via unique names in a name-to-object mapping system.", + "type": "summary" + }, + "3276": { + "file_id": 276, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nclass Registry(object):\n \"\"\"\n The registry that provides name -> object mapping, to support third-party users' custom modules.\n To register an object:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n @BACKBONES.register()\n class ResNet:\n pass\n Or:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n class ResNet:\n pass\n BACKBONES.register(ResNet)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:1-35" + }, + "3277": { + "file_id": 276, + "content": "The Registry class provides a name-to-object mapping, enabling third-party users to customize modules. To register an object, use @BACKBONES.register() or BACKBONES.register(ResNet).", + "type": "comment" + }, + "3278": { + "file_id": 276, + "content": " Usage: To build a module.\n .. code-block:: python\n backbone_name = \"ResNet\"\n b = BACKBONES.get(backbone_name)()\n \"\"\"\n def __init__(self, name):\n \"\"\"\n Args:\n name (str): the name of this registry\n \"\"\"\n self._name = name\n self._obj_map = {}\n def __contains__(self, key):\n return self._obj_map.get(key) is not None\n def _do_register(self, name, obj):\n \"\"\"do register\"\"\"\n assert (\n name not in self._obj_map\n ), \"An object named '{}' was already registered in '{}' registry!\".format(\n name, self._name)\n self._obj_map[name] = obj\n def register(self, obj=None, name=None):\n \"\"\"\n Register the given object under the the name `obj.__name__`.\n Can be used as either a decorator or not. 
See docstring of this class for usage.\n \"\"\"\n if obj is None:\n # used as a decorator\n def deco(func_or_class, name=name):\n if name is None:\n name = func_or_class.__name__", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:37-72" + }, + "3279": { + "file_id": 276, + "content": "The code provides a registry class for building modules based on their names. It allows registration of objects with unique names, and can be used as a decorator or without. The usage example demonstrates how to get a backbone module using its name from the registered objects map.", + "type": "comment" + }, + "3280": { + "file_id": 276, + "content": " self._do_register(name, func_or_class)\n return func_or_class\n return deco\n # used as a function call\n if name is None:\n name = obj.__name__\n self._do_register(name, obj)\n def get(self, name):\n \"\"\"Get the registry record.\n Args:\n name (str): The class name.\n Returns:\n ret: The class.\n \"\"\"\n ret = self._obj_map.get(name)\n if ret is None:\n raise KeyError(\n \"No object named '{}' found in '{}' registry!\".format(\n name, self._name))\n return ret", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:73-98" + }, + "3281": { + "file_id": 276, + "content": "This code registers and retrieves objects in a registry. It allows registering functions or classes with optional names, and can retrieve the registered object by its name. The `get` function returns the class if found in the registry, otherwise raises KeyError.", + "type": "comment" + }, + "3282": { + "file_id": 277, + "content": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py", + "type": "filepath" + }, + "3283": { + "file_id": 277, + "content": "This code loads weights from a checkpoint file into a model, defines functions for saving, loading, and creating directories, using Paddle's save and load methods.", + "type": "summary" + }, + "3284": { + "file_id": 277, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os\nimport os.path as osp\nimport time\nimport pickle\nfrom tqdm import tqdm\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom paddlevideo.utils import main_only\n#XXX(shipping): maybe need load N times because of different cards have different params.\n@main_only\ndef load_ckpt(model, weight_path):\n \"\"\"\n load_ckpt\n \"\"\"\n #model.set_state_dict(state_dict)\n if not osp.isfile(weight_path):", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:1-37" + }, + "3285": { + "file_id": 277, + "content": "This code snippet is part of PaddleVideo's Video Quality Assessment application. It loads a checkpoint file into the provided model. If the weight path does not point to a checkpoint file, an IOError is raised instead of loading anything. 
The function uses the \"os\" and \"tqdm\" libraries for file operations and progress bars, respectively. It also utilizes Paddle's \"set_state_dict\" method to load the model's parameters from the checkpoint file.", + "type": "comment" + }, + "3286": { + "file_id": 277, + "content": " raise IOError('{weight_path} is not a checkpoint file')\n #state_dicts = load(weight_path)\n logger = get_logger(\"paddlevideo\")\n state_dicts = paddle.load(weight_path)\n tmp = {}\n total_len = len(model.state_dict())\n localkeyname = [i for i in state_dicts]\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n #for item in tqdm(model.state_dict(), total=total_len, position=0):\n for i, item in enumerate(\n tqdm(model.state_dict(), total=total_len, position=0)):\n name = item\n desc.set_description('Loading %s' % name)\n print(\"model name is {}, correspoding local name is {}\".format(\n name, localkeyname[i]))\n #tmp[name] = state_dicts[name]\n tmp[name] = state_dicts[localkeyname[i]]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:38-63" + }, + "3287": { + "file_id": 277, + "content": "This code snippet is attempting to load weights from a checkpoint file into a model. It first raises an error if the provided path does not correspond to a valid checkpoint file. The logger variable is assigned for logging purposes. Then, it uses Paddle's paddle.load() function to load the state dictionaries from the specified weight_path.\n\nThe code then initializes an empty dictionary 'tmp' and calculates the total length of the model's state dictionary. It creates a local key name list by iterating through state_dicts. \n\nNext, it uses tqdm to create a progress bar for displaying the loading process. For each item in the model's state dictionary, it checks if it exists in the loaded state dictionaries and assigns the corresponding value to 'tmp'. It also updates the description of the progress bar with the current name being loaded. Finally, upon completion, it sets a final description indicating that all weights have been loaded successfully.", + "type": "comment" + }, + "3288": { + "file_id": 277, + "content": " model.set_state_dict(tmp)\ndef mkdir(dir):\n \"\"\"mkdir\"\"\"\n if not os.path.exists(dir):\n # avoid error when train with multiple gpus\n try:\n os.makedirs(dir)\n except:\n pass\n@main_only\ndef save(obj, path):\n \"\"\"save\"\"\"\n paddle.save(obj, path)\ndef load(file_name):\n \"\"\"load\"\"\"\n if not osp.isfile(file_name):\n raise IOError('{file_name} not exist')\n return paddle.load(file_name)", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:64-87" + }, + "3289": { + "file_id": 277, + "content": "This code defines functions for saving, loading, and creating directories. The \"save\" function uses Paddle's save method to store an object at a specified path. The \"load\" function checks if the file exists before returning its contents using Paddle's load method. 
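Assuming a working Paddle install, the round trip these helpers wrap (create a directory, save a state dict with paddle.save, reload it with paddle.load, and push it into the model with set_state_dict) can be sketched as follows; the toy layer and paths are illustrative only:

```python
# Minimal sketch of the save/load round trip, using public Paddle APIs.
import os
import paddle

model = paddle.nn.Linear(4, 2)                       # toy model standing in for a video network

ckpt_dir = "./output/example"                        # illustrative path
os.makedirs(ckpt_dir, exist_ok=True)                 # same intent as the mkdir() helper
ckpt_path = os.path.join(ckpt_dir, "model.pdparams")

paddle.save(model.state_dict(), ckpt_path)           # what save() delegates to
state_dict = paddle.load(ckpt_path)                  # what load()/load_ckpt() read back
model.set_state_dict(state_dict)                     # final step of load_ckpt()
```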
Lastly, the \"mkdir\" function creates a directory at the specified location, handling errors that may occur when training with multiple GPUs.", + "type": "comment" + }, + "3290": { + "file_id": 278, + "content": "/applications/VideoQualityAssessment/paddlevideo/version.py", + "type": "filepath" + }, + "3291": { + "file_id": 278, + "content": "This code snippet contains the version information for PaddleVideo library. The version is set to \"0.0.1\" and it includes a copyright notice, license details, and specifies that this file should be used only in compliance with the Apache License, Version 2.0.", + "type": "summary" + }, + "3292": { + "file_id": 278, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\n__all__ = [\"paddlevideo_version\"]\npaddlevideo_version = \"0.0.1\"", + "type": "code", + "location": "/applications/VideoQualityAssessment/paddlevideo/version.py:1-18" + }, + "3293": { + "file_id": 278, + "content": "This code snippet contains the version information for PaddleVideo library. The version is set to \"0.0.1\" and it includes a copyright notice, license details, and specifies that this file should be used only in compliance with the Apache License, Version 2.0.", + "type": "comment" + }, + "3294": { + "file_id": 279, + "content": "/applications/VideoQualityAssessment/run.sh", + "type": "filepath" + }, + "3295": { + "file_id": 279, + "content": "This is a shell script for PaddlePaddle, setting CUDA_VISIBLE_DEVICES and executing commands like training, testing, and exporting models using tsm architecture. 
It also mentions running predict.py on example.avi with model files and disabling benchmarking for \"example\" model with 8 segments.", + "type": "summary" + }, + "3296": { + "file_id": 279, + "content": "export CUDA_VISIBLE_DEVICES=0\n# run training\npython3.7 -B -m paddle.distributed.launch --gpus=\"0\" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/tsm/pptsm_regression.yaml\n# run testing\n#python3.7 -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_pptsm main.py -c configs/recognition/tsm/pptsm_regression.yaml --test --weights=output/model_name/ppTSM_best.pdparams\n#finetune\n#python3 -m paddle.distributed.launch --gpus=\"0,1,2,3\" main.py --amp -c ./configs/recognition/tsm/pptsm_regression.yaml --validate --weights=./output/model_name/ppTSM_best.pdparams\n#resume\n#python3 -m paddle.distributed.launch --gpus=\"0,1,2,3\" main.py --amp -c ./configs/recognition/tsm/pptsm_regression.yaml --validate -o resume_epoch=2\n# export_models script\n# just use `example` as example, please replace to real name.\n#python3.7 tools/export_model.py -c configs/example.yaml -p output/model_name/ppTSM_best.pdparams -o ./inference\n# predict script\n# just use `example` as example, please replace to real name.", + "type": "code", + "location": "/applications/VideoQualityAssessment/run.sh:1-19" + }, + "3297": { + "file_id": 279, + "content": "The code is a shell script that sets the CUDA_VISIBLE_DEVICES environment variable and executes PaddlePaddle commands for training, testing, finetuning, resuming and exporting/predicting models. The commands use specific configurations (yaml files) for recognition tasks using tsm architecture. It mentions the file paths where necessary, such as the log directory, model weights, and output directories.", + "type": "comment" + }, + "3298": { + "file_id": 279, + "content": "#python3.7 tools/predict.py -v example.avi --model_file \"./inference/example.pdmodel\" --params_file \"./inference/example.pdiparams\" --enable_benchmark=False --model=\"example\" --num_seg=8", + "type": "code", + "location": "/applications/VideoQualityAssessment/run.sh:20-20" + }, + "3299": { + "file_id": 279, + "content": "Running predict.py script on example.avi with specified model files and disabling benchmarking for the \"example\" model with 8 segments.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/33.json b/docs/data/33.json new file mode 100644 index 000000000..b40c0e582 --- /dev/null +++ b/docs/data/33.json @@ -0,0 +1,541 @@ +{ + "3300": { + "file_id": 280, + "content": "/applications/VideoQualityAssessment/save_model.sh", + "type": "filepath" + }, + "3301": { + "file_id": 280, + "content": "The code is executing a Python script (tools/export_model.py) with specific parameters to export the best model from ./configs/recognition/tsm/pptsm.yaml, save it as ./output/ppTSM/ppTSM\\_best.pdparams and store the inference files in ./inference/. 
It will use 32 segments for processing.", + "type": "summary" + }, + "3302": { + "file_id": 280, + "content": "python tools/export_model.py \\\n -c ./configs/recognition/tsm/pptsm.yaml \\\n -p ./output/ppTSM/ppTSM_best.pdparams \\\n -o ./inference/ \\\n --num_seg=32 ", + "type": "code", + "location": "/applications/VideoQualityAssessment/save_model.sh:1-5" + }, + "3303": { + "file_id": 280, + "content": "The code is executing a Python script (tools/export_model.py) with specific parameters to export the best model from ./configs/recognition/tsm/pptsm.yaml, save it as ./output/ppTSM/ppTSM\\_best.pdparams and store the inference files in ./inference/. It will use 32 segments for processing.", + "type": "comment" + }, + "3304": { + "file_id": 281, + "content": "/applications/VideoQualityAssessment/setup.py", + "type": "filepath" + }, + "3305": { + "file_id": 281, + "content": "This Python package, \"paddlevideo\", utilizes PaddlePaddle toolkits for video understanding and supports multiple Python versions. It is set up using setuptools and includes dependencies and documentation.", + "type": "summary" + }, + "3306": { + "file_id": 281, + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom setuptools import setup\nfrom io import open\nwith open('requirements.txt', encoding=\"utf-8-sig\") as f:\n requirements = f.readlines()\ndef readme():\n \"\"\"readme\"\"\"\n with open('docs/en/whl_en.md', encoding=\"utf-8-sig\") as f:\n README = f.read()\n return README\nsetup(\n name='paddlevideo', #name of .whl file\n packages=['ppvideo'], #install package name\n package_dir={'ppvideo': ''},\n include_package_data=True, #Accept all data files and directories matched by MANIFEST.in", + "type": "code", + "location": "/applications/VideoQualityAssessment/setup.py:1-34" + }, + "3307": { + "file_id": 281, + "content": "This code is setting up a Python package using setuptools for the PaddleVideo library, specifying its name as \"paddlevideo\". 
It includes the necessary dependencies listed in the \"requirements.txt\" file and provides a README file located at 'docs/en/whl_en.md' for documentation purposes.", + "type": "comment" + }, + "3308": { + "file_id": 281, + "content": " install_requires=requirements,\n entry_points={\"console_scripts\": [\"ppvideo= ppvideo.tools.paddlevideo_clas:main\"]},\n version='0.0.1',\n license='Apache License 2.0',\n description='Awesome Video toolkits based on PaddlePaddle ',\n long_description=readme(),\n long_description_content_type='text/markdown',\n url='https://github.com/PaddlePaddle/PaddleVideo',\n download_url='https://github.com/PaddlePaddle/PaddleVideo.git',\n keywords=[\n 'A treasure chest for video understanding powered by PaddlePaddle.'\n ],\n classifiers=[\n 'Intended Audience :: Developers', 'Operating System :: OS Independent',\n 'Natural Language :: Chinese (Simplified)',\n 'Programming Language :: Python :: 3',\n 'Programming Language :: Python :: 3.2',\n 'Programming Language :: Python :: 3.3',\n 'Programming Language :: Python :: 3.4',\n 'Programming Language :: Python :: 3.5',\n 'Programming Language :: Python :: 3.6',\n 'Programming Language :: Python :: 3.7', 'Topic :: Utilities'", + "type": "code", + "location": "/applications/VideoQualityAssessment/setup.py:35-56" + }, + "3309": { + "file_id": 281, + "content": "This code is a setup file for a Python package named \"ppvideo\" that utilizes PaddlePaddle toolkits for video understanding. It specifies installation requirements, entry points, version, license, description, URL, download link, keywords, and classifiers. The package supports multiple versions of Python and is categorized under the Utilities topic.", + "type": "comment" + }, + "3310": { + "file_id": 281, + "content": " ],)", + "type": "code", + "location": "/applications/VideoQualityAssessment/setup.py:57-57" + }, + "3311": { + "file_id": 281, + "content": "This code is creating a tuple with empty elements. The specific purpose or usage of this tuple in the context of the setup.py file might require more information to provide an accurate and relevant comment.", + "type": "comment" + }, + "3312": { + "file_id": 282, + "content": "/applications/VideoTag/FineTune.md", + "type": "filepath" + }, + "3313": { + "file_id": 282, + "content": "This guide details fine-tuning the VideoTag model using custom data, covering AttentionLSTM and TSN models, feature extraction, and multi/single GPU support. 
The code trains, evaluates, predicts with TSN, requires specific weight files, allows save directories, and preprocesses videos into images.", + "type": "summary" + }, + "3314": { + "file_id": 282, + "content": "# 模型微调指南\n---\n## 内容\n参考本文档,您可以使用自己的训练数据在VideoTag预训练模型上进行fine-tune,训练出自己的模型。\n文档内容包括:\n- [原理解析](#原理解析)\n- [对AttentionLSTM模型进行微调](#对AttentionLSTM模型进行微调)\n- [对TSN模型进行微调](#对TSN模型进行微调)\n- [扩展内容](#扩展内容)\n- [参考论文](#参考论文)\n## 原理解析\nVideoTag采用两阶段建模方式,由两个模型组成: TSN + AttentionLSTM。\nTemporal Segment Network (TSN) 是经典的基于2D-CNN的视频分类模型。该模型通过稀疏采样视频帧的方式,在捕获视频时序信息的同时降低了计算量。详细内容请参考论文[Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)\nAttentionLSTM以视频的特征向量作为输入,采用双向长短时记忆网络(LSTM)对所有帧特征进行编码,并增加Attention层,将每个时刻的隐状态输出与自适应权重线性加权得到最终分类向量。详细内容请参考论文[AttentionCluster](https://arxiv.org/abs/1711.09550)\nVideoTag训练时分两个阶段: 第一阶段使用少量视频样本(十万级别)训练大规模视频特征提取模型(TSN);第二阶段使用千万级数据训练预测器(AttentionLSTM)。\nVideoTag预测时也分两个阶段: 第一阶段以视频文件作为输入,经过去除了全连接层以及损失函数层的TSN网络后得到输出特征向量;第二阶段以TSN网络输出的特征向量作为输入,经过AttentionLSTM后得到最终的分类结果。\n基于我们的预模型,您可以使用自己的训练数据进行fine-tune:\n- [对AttentionLSTM模型进行微调](#对AttentionLSTM模型进行微调)\n- [对TSN模型进行微调](#对TSN模型进行微调)\n## 对AttentionLSTM模型进行微调\nAttentionLSTM以视频特征作为输入,显存占用少,训练速度较TSN更快,因此推荐优先对AttentionLSTM模型进行微调。输入视频首先经过TSN预训练模型提取特征向量,然后将特征向量作为训练输入数据,微调AttentionLSTM模型。", + "type": "code", + "location": "/applications/VideoTag/FineTune.md:1-32" + }, + "3315": { + "file_id": 282, + "content": "This is a guide for fine-tuning the VideoTag pre-trained model with custom training data, covering AttentionLSTM and TSN models, principle explanations, and reference papers.", + "type": "comment" + }, + "3316": { + "file_id": 282, + "content": "### TSN预模型提取特征向量\n#### 数据准备\n- 预训练权重下载: 参考[样例代码运行指南-数据准备-预训练权重下载](./Run.md)\n- 准备训练数据: 准备好待训练的视频数据,并在video\\_tag/data/TsnExtractor.list文件中指定待训练的文件路径,内容格式如下:\n```\nmy_video_path/my_video_file1.mp4\nmy_video_path/my_video_file2.mp4\n...\n```\n#### 特征提取\n特征提取脚本如下:\n```\npython tsn_extractor.py --model_name=TSN --config=./configs/tsn.yaml --weights=./weights/tsn.pdparams\n```\n- 通过--weights可指定TSN权重参数的存储路径,默认为video\\_tag/weights/tsn.pdparams\n- 通过--save\\_dir可指定特征向量保存路径,默认为video\\_tag/data/tsn\\_features,不同输入视频的特征向量提取结果分文件保存在不同的npy文件中,目录形式为:\n```\nvideo_tag\n ├──data\n ├──tsn_features\n ├── my_feature_file1.npy\n ├── my_feature_file2.npy\n ...\n```\n- tsn提取的特征向量维度为```帧数*特征维度```,默认为300 * 2048。\n### AttentionLSTM模型Fine-tune\n#### 数据准备\nVideoTag中的AttentionLSTM以TSN模型提取的特征向量作为输入。在video\\_tag/data/dataset/attention\\_lstm/train.list文件中指定待训练的文件路径和对应的标签,内容格式如下:\n```\nmy_feature_path/my_feature_file1.npy label1 label2\nmy_feature_path/my_feature_file2.npy label1\n...\n```\n- 一个输入视频可以有多个标签,标签索引为整型数据,文件名与标签之间、多个标签之间以一个空格分隔;\n- 标签索引与标签名称的之间的对应关系以list文件指定,可参考VideoTag用到的label_3396.txt文件构造,行索引对应标签索引;", + "type": "code", + "location": "/applications/VideoTag/FineTune.md:34-81" + }, + "3317": { + "file_id": 282, + "content": "Extract features from TSN pre-trained model, save the extracted features in specified directory. AttentionLSTM model fine-tuning requires TSN extracted features with corresponding labels in the train.list file. 
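As a rough illustration of those inputs (one .npy feature file per video, by default 300 x 2048 from the TSN extractor, plus a train.list line pairing the file path with integer label indices separated by single spaces), the sketch below writes both; every path and label index here is made up:

```python
# Illustrative preparation of AttentionLSTM fine-tuning inputs (all names are hypothetical).
import os
import numpy as np

os.makedirs("my_feature_path", exist_ok=True)
features = np.random.rand(300, 2048).astype("float32")     # frames x feature dim
np.save("my_feature_path/my_feature_file1.npy", features)

with open("train.list", "w", encoding="utf-8") as f:
    # "<feature_path> <label_idx> [<label_idx> ...]"
    f.write("my_feature_path/my_feature_file1.npy 12 301\n")
```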
Label indices are defined in a separate text file, e.g., label_3396.txt.", + "type": "comment" + }, + "3318": { + "file_id": 282, + "content": "- 验证集、测试集以及预测数据集的构造方式同训练集类似,仅需要在video\\_tag/data/attention\\_lstm/目录下对应的list文件中指定相关文件路径/标签即可。\n#### 模型训练\n使用VideoTag中的AttentionLSTM预模型进行fine-tune训练脚本如下:\n```\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython train.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --pretrain=./weights/attention_lstm\n```\n- AttentionLSTM模型默认使用8卡训练,总的batch size数是1024。若使用单卡训练,请修改环境变量,脚本如下:\n```\nexport CUDA_VISIBLE_DEVICES=0\npython train.py --model_name=AttentionLSTM --config=./configs/attention_lstm-single.yaml --pretrain=./weights/attention_lstm\n```\n- 请确保训练样本数大于batch_size数\n- 通过--pretrain参数可指定AttentionLSTM预训练模型的路径,默认为./weights/attention\\_lstm;\n- 模型相关配置写在video_tag/configs/attention\\_lstm.yaml文件中,可以方便的调节各项超参数;\n- 通过--save_dir参数可指定训练模型参数的保存路径,默认为./data/checkpoints;\n#### 模型评估\n可用如下方式进行模型评估:\n```\npython eval.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --weights=./data/checkpoints/AttentionLSTM_epoch9.pdparams\n```\n- 通过--weights参数可指定评估需要的权重,默认为./data/checkpoints/AttentionLSTM_epoch9.pdparams;\n- 评估结果以log的形式直接打印输出GAP、Hit@1等精度指标。", + "type": "code", + "location": "/applications/VideoTag/FineTune.md:83-113" + }, + "3319": { + "file_id": 282, + "content": "This code chunk is for fine-tuning the AttentionLSTM model in PaddleVideo's VideoTag application. It provides instructions for training the model with multiple GPUs or a single GPU, and specifies the configuration file and pretrained weights required. The code also demonstrates how to evaluate the trained model using eval.py script. The precision metrics printed include GAP and Hit@1.", + "type": "comment" + }, + "3320": { + "file_id": 282, + "content": "#### 模型推断\n可用如下方式进行模型推断:\n```\npython predict.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --weights=./data/checkpoints/AttentionLSTM_epoch9.pdparams\n```\n- 通过--weights参数可指定推断需要的权重,默认为./data/checkpoints/AttentionLSTM_epoch9.pdparams;\n- 通过--label_file参数指定标签文件,请根据自己的数据修改,默认为./label_3396.txt;\n- 预测结果会以日志形式打印出来,同时也保存在json文件中,通过--save_dir参数可指定预测结果保存路径,默认为./data/predict_results/attention_lstm。\n## 对TSN模型进行微调\nVideoTag中使用的TSN模型以mp4文件为输入,backbone为ResNet101。\n### 数据准备\n准备好训练视频文件后,在video\\_tag/data/dataset/tsn/train.list文件中指定待训练的文件路径和对应的标签即可,内容格式如下:\n```\nmy_video_path/my_video_file1.mp4 label1\nmy_video_path/my_video_file2.mp4 label2\n...\n```\n- 一个输入视频只能有一个标签,标签索引为整型数据,标签索引与文件名之间以一个空格分隔;\n- 验证集、测试集以及预测数据集的构造方式同训练集类似,仅需要在video\\_tag/data/dataset/tsn目录下对应的list文件中指定相关文件路径/标签即可。\n#### 模型训练\n使用VideoTag中的TSN预模型进行fine-tune训练脚本如下:\n```\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython train.py --model_name=TSN --config=./configs/tsn.yaml --pretrain=./weights/tsn\n```\n- TSN模型默认使用8卡训练,总的batch size数是256。若使用单卡训练,请修改环境变量,脚本如下:\n```", + "type": "code", + "location": "/applications/VideoTag/FineTune.md:115-152" + }, + "3321": { + "file_id": 282, + "content": "This code provides instructions for model inference and fine-tuning using the PaddleVideo framework's VideoTag application. It explains how to specify the model, configuration file, weights, label files, and save directory for prediction results. 
Additionally, it outlines the steps for preparing data, training, and executing a pre-trained TSN model in the VideoTag application.", + "type": "comment" + }, + "3322": { + "file_id": 282, + "content": "export CUDA_VISIBLE_DEVICES=0\npython train.py --model_name=TSN --config=./configs/tsn-single.yaml --pretrain=./weights/tsn\n```\n- 通过--pretrain参数可指定TSN预训练模型的路径,示例为./weights/tsn;\n- 模型相关配置写在video_tag/configs/tsn.yaml文件中,可以方便的调节各项超参数;\n- 通过--save_dir参数可指定训练模型参数的保存路径,默认为./data/checkpoints;\n#### 模型评估\n可用如下方式进行模型评估:\n```\npython eval.py --model_name=TSN --config=./configs/tsn.yaml --weights=./data/checkpoints/TSN_epoch44.pdparams\n```\n- 通过--weights参数可指定评估需要的权重,示例为./data/checkpoints/TSN_epoch44.pdparams;\n- 评估结果以log的形式直接打印输出TOP1_ACC、TOP5_ACC等精度指标。\n#### 模型推断\n可用如下方式进行模型推断:\n```\npython predict.py --model_name=TSN --config=./configs/tsn.yaml --weights=./data/checkpoints/TSN_epoch44.pdparams --save_dir=./data/predict_results/tsn/\n```\n- 通过--weights参数可指定推断需要的权重,示例为./data/checkpoints/TSN_epoch44.pdparams;\n- 通过--label_file参数指定标签文件,请根据自己的数据修改,默认为./label_3396.txt;\n- 预测结果会以日志形式打印出来,同时也保存在json文件中,通过--save_dir参数可指定预测结果保存路径,示例为./data/predict_results/tsn。\n### 训练加速\nTSN模型默认以mp4的视频文件作为输入,训练时需要先对视频文件解码,再将解码后的数据送入网络进行训练,如果视频文件很大,这个过程将会很耗时。\n为加速训练,可以先将视频解码成图片,然后保存下来,训练时直接根据索引读取帧图片作为输入,加快训练过程。", + "type": "code", + "location": "/applications/VideoTag/FineTune.md:153-188" + }, + "3323": { + "file_id": 282, + "content": "This code is for training, evaluating and predicting with the TSN model. It uses different Python scripts (train.py, eval.py, and predict.py) along with a configuration file (tsn.yaml). The TSN model requires specific weight files saved at certain locations. It also has options to specify save directories for checkpoints, evaluation results, and prediction outputs. To speed up the training process, videos can be preprocessed into images before training.", + "type": "comment" + }, + "3324": { + "file_id": 282, + "content": "- 数据准备: 首先将视频解码,存成帧图片;然后生成帧图片的文件路径列表。实现过程可参考[ucf-101数据准备](../../../../dygraph/tsn/data/dataset/ucf101/README.md)\n- 修改配置文件: 修改配置文件./config/tsn.yaml,其中MODEL.format值改为\"frames\",不同模式下的filelist值改为对应的帧图片文件list。\n## 扩展内容\n- 更多关于TSN模型的内容可参考PaddleCV视频库[TSN视频分类模型](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/video/models/tsn/README.md)。\n- 更多关于AttentionLSTM模型的内容可参考PaddleCV视频库[AttentionLSTM视频分类模型](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/video/models/attention_lstm)。\n## 参考论文\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool\n- [Beyond Short Snippets: Deep Networks for Video Classification](https://arxiv.org/abs/1503.08909) Joe Yue-Hei Ng, Matthew Hausknecht, Sudheendra Vijayanarasimhan, Oriol Vinyals, Rajat Monga, George Toderici", + "type": "code", + "location": "/applications/VideoTag/FineTune.md:190-206" + }, + "3325": { + "file_id": 282, + "content": "The code is preparing the data by decoding videos into frames and generating a file path list for these frames. It modifies the configuration file, changing the model format to \"frames\" and updating the filelist accordingly. 
Additional information about TSN and AttentionLSTM models can be found in their respective PaddleCV repositories, with references provided to the original papers as well.", + "type": "comment" + }, + "3326": { + "file_id": 283, + "content": "/applications/VideoTag/README.md", + "type": "filepath" + }, + "3327": { + "file_id": 283, + "content": "This code is for the VideoTag, a large-scale video classification model developed by PaddlePaddle. It uses two stages of modeling - image modeling and sequence learning - to classify videos in large scale scenarios. The model involves data processing, TSN network training for feature extraction, and attention clusters, LSTM, Nextvlad for sequence learning. Results are predicted by combining multiple models, leading to increased accuracy.", + "type": "summary" + }, + "3328": { + "file_id": 283, + "content": "# VideoTag 飞桨大规模视频分类模型\n---\n## 内容\n- [模型简介](#模型简介)\n- [使用方法](#使用方法)\n## 模型简介\n飞桨大规模视频分类模型VideoTag基于百度短视频业务千万级数据,支持3000个源于产业实践的实用标签,具有良好的泛化能力,非常适用于国内大规模(千万/亿/十亿级别)短视频分类场景的应用。VideoTag采用两阶段建模方式,即图像建模和序列学习。第一阶段,使用少量视频样本(十万级别)训练大规模视频特征提取模型(Extractor);第二阶段,使用千万级数据训练预测器(Predictor),最终实现在超大规模(千万/亿/十亿级别)短视频上产业应用,其原理示意如下图所示。\n

\nVideoTag模型示意图\n
\n- 数据处理:视频是按特定顺序排列的一组图像的集合,这些图像也称为帧。视频分类任务需要先对短视频进行解码,然后再将输出的图像帧序列灌入到VideoTag中进行训练和预测。\n- 图像建模:先从训练数据中,对每个类别均匀采样少量样本数据,构成十万量级的训练视频。然后使用TSN网络进行训练,提取所有视频帧的TSN模型分类层前一层的特征数据。在这个过程中,每一帧都被转化成相应的特征向量,一段视频被转化成一个特征序列。\n- 序列学习:采用Attention clusters、LSTM和Nextvlad对特征序列进行建模,学习各个特征之间的组合方式,进一步提高模型准确率。由于序列学习相比于图像建模耗时更短,因此可以融合多个具有互补性的序列模型。示例代码仅使用Attention\\_LSTM网络进行序列特征预测。\n- 预测结果:融合多个模型结果实现视频分类,进一步提高分类准确率。\n## 使用方法\n- [1. 如何运行样例代码](./Run.md)\n- [2. 如何使用自己的数据进行测试](./Test.md)\n- [3. 如何进行模型fine-tune](./FineTune.md)", + "type": "code", + "location": "/applications/VideoTag/README.md:1-31" + }, + "3329": { + "file_id": 283, + "content": "This code is for the VideoTag, a large-scale video classification model developed by PaddlePaddle. It uses two stages of modeling - image modeling and sequence learning - to classify videos in large scale scenarios. The model involves data processing, TSN network training for feature extraction, and attention clusters, LSTM, Nextvlad for sequence learning. Results are predicted by combining multiple models, leading to increased accuracy.", + "type": "comment" + }, + "3330": { + "file_id": 284, + "content": "/applications/VideoTag/Run.md", + "type": "filepath" + }, + "3331": { + "file_id": 284, + "content": "This code installs PaddleVideo's VideoTag app, provides instructions for data preparation and model inference, and represents a dictionary containing information about classified video objects.", + "type": "summary" + }, + "3332": { + "file_id": 284, + "content": "# 样例代码运行指南\n---\n## 内容\n参考本文档,您可以快速熟悉VideoTag的使用方法,观察VideoTag的预训练模型在示例视频上的预测结果。\n文档内容包括:\n- [安装说明](#安装说明)\n- [数据准备](#数据准备)\n- [模型推断](#模型推断)\n## 安装说明\n### 环境依赖:\n```\n CUDA >= 9.0\n cudnn >= 7.5\n```\n### 依赖安装:\n- 1.7.0 <= PaddlePaddle版本 <= 2.0.0: pip install paddlepaddle-gpu==1.8.4.post97 -i https://mirror.baidu.com/pypi/simple\n- opencv版本 >= 4.1.0: pip install opencv-python==4.2.0.32\n## 数据准备\n### 预训练权重下载\n我们提供了[TSN](https://videotag.bj.bcebos.com/video_tag_tsn.tar)和[AttentionLSTM](https://videotag.bj.bcebos.com/video_tag_lstm.tar)预训练权重,请在video\\_tag目录下新建weights目录,并将下载解压后的参数文件放在weights目录下:\n```\n mkdir weights\n cd weights\n wget https://videotag.bj.bcebos.com/video_tag_tsn.tar\n wget https://videotag.bj.bcebos.com/video_tag_lstm.tar\n tar -zxvf video_tag_tsn.tar\n tar -zxvf video_tag_lstm.tar\n rm video_tag_tsn.tar -rf\n rm video_tag_lstm.tar -rf\n mv video_tag_tsn/* .\n mv attention_lstm/* .\n rm video_tag_tsn/ -rf\n rm attention_lstm -rf\n```\n所得目录结构如下:\n```\nvideo_tag\n ├──weights\n ├── attention_lstm.pdmodel\n ├── attention_lstm.pdopt ", + "type": "code", + "location": "/applications/VideoTag/Run.md:1-54" + }, + "3333": { + "file_id": 284, + "content": "This code provides installation instructions for PaddleVideo's VideoTag application, including dependencies and downloading pre-trained weights. 
It also outlines the data preparation process.", + "type": "comment" + }, + "3334": { + "file_id": 284, + "content": " ├── attention_lstm.pdparams\n ├── tsn.pdmodel\n ├── tsn.pdopt\n └── tsn.pdparams\n```\n### 示例视频下载\n我们提供了[样例视频](https://videotag.bj.bcebos.com/mp4.tar)方便用户测试,请下载后解压,并将视频文件放置在video\\_tag/data/mp4目录下:\n```\n cd data/\n wget https://videotag.bj.bcebos.com/mp4.tar\n tar -zxvf mp4.tar\n rm mp4.tar -rf\n```\n所得目录结构如下:\n```\nvideo_tag\n ├──data\n ├── mp4\n ├── 1.mp4\n ├── 2.mp4\n └── ...\n```\n## 模型推断\n模型推断的启动方式如下:\n python videotag_test.py\n- 预测结果会以日志方式打印,示例如下:\n```\n[========video_id [ data/mp4/1.mp4 ] , topk(20) preds: ========]\nclass_id: 3110, class_name: 训练 , probability: 0.97730666399\nclass_id: 2159, class_name: 蹲 , probability: 0.945082366467\n...\n[========video_id [ data/mp4/2.mp4 ] , topk(20) preds: ========]\nclass_id: 2773, class_name: 舞蹈 , probability: 0.850423932076\nclass_id: 1128, class_name: 表演艺术 , probability: 0.0446354188025\n...\n```\n- 通过--save\\_dir可指定预测结果存储路径,默认为video\\_tag/data/VideoTag\\_results,不同输入视频的预测结果分文件保存在不同的json文件中,文件的内容格式为:\n```\n [file_path,\n {\"class_name\": class_name1, \"probability\": probability1, \"class_id\": class_id1},", + "type": "code", + "location": "/applications/VideoTag/Run.md:55-105" + }, + "3335": { + "file_id": 284, + "content": "This code provides instructions on how to download an example video for testing, how to run model inference, and how to save the results. The example video can be downloaded from a provided link and should be extracted into a specific directory structure. The model inference script is named videotag_test.py and prints prediction probabilities. Users can specify a different output directory using the --save\\_dir parameter. The predictions are saved as JSON files in the specified directory, with each file corresponding to a video.", + "type": "comment" + }, + "3336": { + "file_id": 284, + "content": " {\"class_name\": class_name2, \"probability\": probability2, \"class_id\": class_id2},\n ...\n ]\n```", + "type": "code", + "location": "/applications/VideoTag/Run.md:106-109" + }, + "3337": { + "file_id": 284, + "content": "This code represents a dictionary containing information about a classified video object. The 'class_name' key holds the name of the class, 'probability' stores the confidence level of the classification, and 'class_id' contains the identifier of the recognized class. These dictionaries are stored in an array, potentially for multiple classifications within the same video or different videos.", + "type": "comment" + }, + "3338": { + "file_id": 285, + "content": "/applications/VideoTag/Test.md", + "type": "filepath" + }, + "3339": { + "file_id": 285, + "content": "This code provides a guide for testing the pre-trained VideoTag model on custom data. It covers preparing test data and running inference using Python's videotag_test.py script. The video file input formats supported are mp4, mkv, and webm. Inference is performed on 300 uniformly sampled frames per video. 
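One simple way to pick 300 evenly spaced frame indices is shown below; this is only an illustration of uniform sampling, and the decoder actually used by VideoTag may choose frames differently:

```python
# Sketch of uniform frame-index sampling (frame counts below are illustrative).
import numpy as np

def uniform_sample_indices(num_frames_in_video, num_samples=300):
    """Return num_samples frame indices spread evenly over the whole video."""
    return np.linspace(0, num_frames_in_video - 1, num_samples).astype(int)

print(uniform_sample_indices(9000)[:5])   # e.g. a 5-minute clip at 30 fps
```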
GPU acceleration can be enabled with the --use\\_gpu flag.", + "type": "summary" + }, + "3340": { + "file_id": 285, + "content": "# 预训练模型自测指南\n## 内容\n参考本文档,您可以快速测试VideoTag的预训练模型在自己业务数据上的预测效果。\n主要内容包括:\n- [数据准备](#数据准备)\n- [模型推断](#模型推断)\n## 数据准备\n在数据准备阶段,您需要准备好自己的测试数据,并在video\\_tag/data/VideoTag\\_test.list文件中指定待推断的测试文件路径,内容格式如下:\n```\nmy_video_path/my_video_file1.mp4\nmy_video_path/my_video_file2.mp4\n...\n```\n## 模型推断\n模型推断的启动方式如下:\n python videotag_test.py\n- 目前支持的视频文件输入格式为:mp4、mkv和webm格式;\n- 模型会从输入的视频文件中*均匀抽取300帧*用于预测。对于较长的视频文件,建议先截取有效部分输入模型以提高预测速度;\n- 通过--use\\_gpu参数可指定是否使用gpu进行推断,默认使用gpu。对于10s左右的短视频文件,gpu推断时间约为4s;\n- 通过--filelist可指定输入list文件路径,默认为video\\_tag/data/VideoTag\\_test.list。", + "type": "code", + "location": "/applications/VideoTag/Test.md:1-31" + }, + "3341": { + "file_id": 285, + "content": "This code provides a guide for testing the pre-trained VideoTag model on custom data. It covers preparing test data and running inference using Python's videotag_test.py script. The video file input formats supported are mp4, mkv, and webm. Inference is performed on 300 uniformly sampled frames per video. GPU acceleration can be enabled with the --use\\_gpu flag.", + "type": "comment" + }, + "3342": { + "file_id": 286, + "content": "/applications/VideoTag/eval.py", + "type": "filepath" + }, + "3343": { + "file_id": 286, + "content": "This code prepares the PaddlePaddle app environment, imports necessary libraries, handles config, defines model functions, loads test weights, logs metrics, and checks save directory. It also creates directories, logs arguments, verifies Paddle version, and runs a test function.", + "type": "summary" + }, + "3344": { + "file_id": 286, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport paddle\nimport paddle.static as static\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'\nlogging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)", + "type": "code", + "location": "/applications/VideoTag/eval.py:1-33" + }, + "3345": { + "file_id": 286, + "content": "This code snippet sets up the environment and logging for a PaddlePaddle application. It imports necessary libraries, handles basic configuration, and sets up logging output to the console. 
This script seems to be part of an AI model's evaluation process, as it also includes references to reader, metrics, and model files.", + "type": "comment" + }, + "3346": { + "file_id": 286, + "content": "logger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument(\n '--batch_size',\n type=int,\n default=None,\n help='test batch size. None to use config file setting.')\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument(\n '--weights',\n type=str,\n default='./data/checkpoints/AttentionLSTM_epoch9.pdparams',\n help='weight path.')\n parser.add_argument(\n '--save_dir',\n type=str,\n default=os.path.join('data', 'evaluate_results'),", + "type": "code", + "location": "/applications/VideoTag/eval.py:34-64" + }, + "3347": { + "file_id": 286, + "content": "This code defines a function `parse_args()` that creates an ArgumentParser to parse command line arguments. It sets defaults for model name, config file path, batch size, GPU usage, and weight path. The parser also adds help messages for each argument.", + "type": "comment" + }, + "3348": { + "file_id": 286, + "content": " help='output dir path, default to use ./data/evaluate_results')\n parser.add_argument('--log_interval',\n type=int,\n default=1,\n help='mini-batch interval to log.')\n args = parser.parse_args()\n return args\ndef test(args):\n # parse config\n config = parse_config(args.config)\n test_config = merge_configs(config, 'test', vars(args))\n print_configs(test_config, \"Test\")\n use_dali = test_config['TEST'].get('use_dali', False)\n # build model\n test_model = models.get_model(args.model_name, test_config, mode='test')\n test_model.build_input(use_dataloader=False)\n test_model.build_model()\n test_feeds = test_model.feeds()\n test_fetch_list = test_model.fetches()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(static.default_startup_program())\n if args.weights:\n assert os.path.exists(\n args.weights), \"Given weight dir {} not exist.\".format(args.weights)", + "type": "code", + "location": "/applications/VideoTag/eval.py:65-95" + }, + "3349": { + "file_id": 286, + "content": "This code defines a function `test` that takes in arguments, parses a config file, merges it with test configuration, prints the configurations, builds a model using the provided model name and configurations, feeds the model, fetches the model outputs, creates an executor based on whether to use GPU or CPU, and checks if the weight directory exists.", + "type": "comment" + }, + "3350": { + "file_id": 286, + "content": " weights = args.weights or test_model.get_weights()\n logger.info('load test weights from {}'.format(weights))\n test_model.load_test_weights(exe, weights, static.default_main_program())\n # get reader and metrics\n test_reader = get_reader(args.model_name.upper(), 'test', test_config)\n test_metrics = get_metrics(args.model_name.upper(), 'test', test_config)\n test_feeder = paddle.fluid.DataFeeder(place=place, feed_list=test_feeds)\n epoch_period = []\n for test_iter, data in enumerate(test_reader()):\n cur_time = time.time()\n test_outs = exe.run(fetch_list=test_fetch_list,\n feed=test_feeder.feed(data))\n 
period = time.time() - cur_time\n epoch_period.append(period)\n test_metrics.accumulate(test_outs)\n # metric here\n if args.log_interval > 0 and test_iter % args.log_interval == 0:\n info_str = '[EVAL] Batch {}'.format(test_iter)\n test_metrics.calculate_and_log_out(test_outs, info_str)\n if not os.path.isdir(args.save_dir):", + "type": "code", + "location": "/applications/VideoTag/eval.py:96-122" + }, + "3351": { + "file_id": 286, + "content": "This code loads test weights, creates a reader and metrics for testing, runs the model with the data, calculates and logs the evaluation metrics for each batch, and checks if the save directory exists.", + "type": "comment" + }, + "3352": { + "file_id": 286, + "content": " os.makedirs(args.save_dir)\n test_metrics.finalize_and_log_out(\"[EVAL] eval finished. \", args.save_dir)\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n test(args)", + "type": "code", + "location": "/applications/VideoTag/eval.py:123-134" + }, + "3353": { + "file_id": 286, + "content": "This code creates directories, finalizes and logs test metrics, checks if installed Paddle is compiled with GPU, verifies Paddle version, logs arguments, and runs a test function.", + "type": "comment" + }, + "3354": { + "file_id": 287, + "content": "/applications/VideoTag/metrics/__init__.py", + "type": "filepath" + }, + "3355": { + "file_id": 287, + "content": "This code imports the function 'get_metrics' from the 'metrics_util' module in the same application directory. This function is likely used to calculate and retrieve various metrics related to video processing or analysis.", + "type": "summary" + }, + "3356": { + "file_id": 287, + "content": "from .metrics_util import get_metrics", + "type": "code", + "location": "/applications/VideoTag/metrics/__init__.py:1-1" + }, + "3357": { + "file_id": 287, + "content": "This code imports the function 'get_metrics' from the 'metrics_util' module in the same application directory. This function is likely used to calculate and retrieve various metrics related to video processing or analysis.", + "type": "comment" + }, + "3358": { + "file_id": 288, + "content": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py", + "type": "filepath" + }, + "3359": { + "file_id": 288, + "content": "The MetricsCalculator class in PaddleVideo's VideoTag application handles metric calculation, providing methods for finalizing, computing, and accumulating metrics. It calculates average loss and accuracy over multiple batches using a top-k accuracy function, accumulating per batch size before returning the final result.", + "type": "summary" + }, + "3360": { + "file_id": 288, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import unicode_literals\nfrom __future__ import print_function\nfrom __future__ import division\nimport numpy as np\nimport datetime\nimport logging\nlogger = logging.getLogger(__name__)\nclass MetricsCalculator():\n def __init__(self, name, mode):\n self.name = name\n self.mode = mode # 'train', 'val', 'test'\n self.reset()\n def reset(self):\n logger.info('Resetting {} metrics...'.format(self.mode))", + "type": "code", + "location": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:1-34" + }, + "3361": { + "file_id": 288, + "content": "This code is part of the PaddleVideo project's VideoTag application, and it defines a class called MetricsCalculator. It handles calculating various metrics for different modes such as train, val, or test. The code imports necessary libraries, initializes logger, and sets up the MetricsCalculator class with an initialization method (__init__) and a reset method to reset the metrics values.", + "type": "comment" + }, + "3362": { + "file_id": 288, + "content": " self.aggr_acc1 = 0.0\n self.aggr_acc5 = 0.0\n self.aggr_loss = 0.0\n self.aggr_batch_size = 0\n def finalize_metrics(self):\n self.avg_acc1 = self.aggr_acc1 / self.aggr_batch_size\n self.avg_acc5 = self.aggr_acc5 / self.aggr_batch_size\n self.avg_loss = self.aggr_loss / self.aggr_batch_size\n def get_computed_metrics(self):\n json_stats = {}\n json_stats['avg_loss'] = self.avg_loss\n json_stats['avg_acc1'] = self.avg_acc1\n json_stats['avg_acc5'] = self.avg_acc5\n return json_stats\n def calculate_metrics(self, loss, softmax, labels):\n accuracy1 = compute_topk_accuracy(softmax, labels, top_k=1) * 100.\n accuracy5 = compute_topk_accuracy(softmax, labels, top_k=5) * 100.\n return accuracy1, accuracy5\n def accumulate(self, loss, softmax, labels):\n cur_batch_size = softmax.shape[0]\n # if returned loss is None for e.g. test, just set loss to be 0.\n if loss is None:\n cur_loss = 0.\n else:", + "type": "code", + "location": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:35-62" + }, + "3363": { + "file_id": 288, + "content": "The class initializes variables for accumulating aggregated accuracy, loss, and batch size. The `finalize_metrics` method calculates average metrics by dividing the accumulated values by the total batch size. The `get_computed_metrics` returns a JSON object containing the average loss and accuracy for top 1 and top 5 predictions. 
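A minimal numpy sketch of the top-k accuracy being accumulated here, with toy inputs rather than the project's own implementation; per-batch values like these are then weighted by batch size before the epoch averages are reported:

```python
# Toy top-k accuracy: `softmax` is (batch, num_classes), `labels` is (batch,).
import numpy as np

def topk_accuracy(softmax, labels, top_k):
    topk_preds = np.argsort(-softmax, axis=1)[:, :top_k]   # best k class ids per sample
    hits = [int(label in row) for row, label in zip(topk_preds, labels)]
    return sum(hits) / len(labels)

softmax = np.array([[0.1, 0.7, 0.2],
                    [0.5, 0.3, 0.2]])
labels = np.array([1, 1])
print(topk_accuracy(softmax, labels, top_k=1))   # 0.5
print(topk_accuracy(softmax, labels, top_k=2))   # 1.0
```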
The `calculate_metrics` computes the accuracy for top 1 and top 5 predictions, and the `accumulate` method accumulates the loss and updates the batch size if the returned loss is not None.", + "type": "comment" + }, + "3364": { + "file_id": 288, + "content": " cur_loss = np.mean(np.array(loss)) #\n self.aggr_batch_size += cur_batch_size\n self.aggr_loss += cur_loss * cur_batch_size\n accuracy1 = compute_topk_accuracy(softmax, labels, top_k=1) * 100.\n accuracy5 = compute_topk_accuracy(softmax, labels, top_k=5) * 100.\n self.aggr_acc1 += accuracy1 * cur_batch_size\n self.aggr_acc5 += accuracy5 * cur_batch_size\n return\n# ----------------------------------------------\n# other utils\n# ----------------------------------------------\ndef compute_topk_correct_hits(top_k, preds, labels):\n '''Compute the number of corret hits'''\n batch_size = preds.shape[0]\n top_k_preds = np.zeros((batch_size, top_k), dtype=np.float32)\n for i in range(batch_size):\n top_k_preds[i, :] = np.argsort(-preds[i, :])[:top_k]\n correctness = np.zeros(batch_size, dtype=np.int32)\n for i in range(batch_size):\n if labels[i] in top_k_preds[i, :].astype(np.int32).tolist():\n correctness[i] = 1\n correct_hits = sum(correctness)", + "type": "code", + "location": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:63-90" + }, + "3365": { + "file_id": 288, + "content": "This code calculates the average loss and accuracy over multiple batches. It uses a function called \"compute_topk_accuracy\" to calculate the accuracy for top 1 and top 5 predictions. The computed values are then accumulated per batch size, with the final result being returned.", + "type": "comment" + }, + "3366": { + "file_id": 288, + "content": " return correct_hits\ndef compute_topk_accuracy(softmax, labels, top_k):\n computed_metrics = {}\n assert labels.shape[0] == softmax.shape[0], \"Batch size mismatch.\"\n aggr_batch_size = labels.shape[0]\n aggr_top_k_correct_hits = compute_topk_correct_hits(top_k, softmax, labels)\n # normalize results\n computed_metrics = \\\n float(aggr_top_k_correct_hits) / aggr_batch_size\n return computed_metrics", + "type": "code", + "location": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:92-107" + }, + "3367": { + "file_id": 288, + "content": "The function `compute_topk_accuracy` computes the top-k accuracy by first asserting that the batch size of labels and softmax are equal, then it computes the correct hits for each batch element using the `compute_topk_correct_hits` function. Finally, it normalizes the results and returns the computed metric as a float value representing accuracy.", + "type": "comment" + }, + "3368": { + "file_id": 289, + "content": "/applications/VideoTag/metrics/metrics_util.py", + "type": "filepath" + }, + "3369": { + "file_id": 289, + "content": "This code defines a class for evaluating metrics in video analysis tasks, handling inference mode and performing tagging/classification using a model with functions for metrics update, calculator reset, logging results, and saving/retrieving metrics.", + "type": "summary" + }, + "3370": { + "file_id": 289, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import unicode_literals\nfrom __future__ import print_function\nfrom __future__ import division\nimport logging\nimport os\nimport io\nimport numpy as np\nimport json\nfrom metrics.youtube8m import eval_util as youtube8m_metrics\nfrom metrics.kinetics import accuracy_metrics as kinetics_metrics\nlogger = logging.getLogger(__name__)\nclass Metrics(object):\n def __init__(self, name, mode, metrics_args):", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:1-33" + }, + "3371": { + "file_id": 289, + "content": "This code is importing necessary libraries and initializing a class called Metrics. It appears to be part of a larger module for evaluating metrics, possibly in the context of video analysis or recognition tasks. The code defines an object-oriented structure with methods that will likely handle different types of evaluation tasks based on the input name, mode, and metrics_args parameters.", + "type": "comment" + }, + "3372": { + "file_id": 289, + "content": " \"\"\"Not implemented\"\"\"\n pass\n def calculate_and_log_out(self, fetch_list, info=''):\n \"\"\"Not implemented\"\"\"\n pass\n def accumulate(self, fetch_list, info=''):\n \"\"\"Not implemented\"\"\"\n pass\n def finalize_and_log_out(self, info='', savedir='./'):\n \"\"\"Not implemented\"\"\"\n pass\n def reset(self):\n \"\"\"Not implemented\"\"\"\n pass\nclass Youtube8mMetrics(Metrics):\n def __init__(self, name, mode, metrics_args):\n self.name = name\n self.mode = mode\n self.num_classes = metrics_args['MODEL']['num_classes']\n self.topk = metrics_args['MODEL']['topk']\n self.calculator = youtube8m_metrics.EvaluationMetrics(\n self.num_classes, self.topk)\n if self.mode == 'infer':\n self.infer_results = []\n def calculate_and_log_out(self, fetch_list, info=''):\n loss = np.mean(np.array(fetch_list[0]))\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label)", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:34-69" + }, + "3373": { + "file_id": 289, + "content": "The code defines a class named Youtube8mMetrics that inherits from the Metrics base class. It has methods for calculating and logging metrics, accumulating results, finalizing and logging output, and resetting variables. The Youtube8mMetrics class is initialized with a name, mode, and metrics_args. 
The calculate_and_log_out method calculates loss, prediction, and ground truth labels, then calls the youtube8m_metrics.calculate_hit_at_one function to compute the hit at one metric.", + "type": "comment" + }, + "3374": { + "file_id": 289, + "content": " perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(\n pred, label)\n gap = youtube8m_metrics.calculate_gap(pred, label)\n logger.info(info + ' , loss = {0}, Hit@1 = {1}, PERR = {2}, GAP = {3}'.format(\\\n '%.6f' % loss, '%.2f' % hit_at_one, '%.2f' % perr, '%.2f' % gap))\n def accumulate(self, fetch_list, info=''):\n if self.mode == 'infer':\n predictions = np.array(fetch_list[0])\n video_id = fetch_list[1]\n for i in range(len(predictions)):\n topk_inds = predictions[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds = predictions[i][topk_inds]\n self.infer_results.append(\n (video_id[i], topk_inds.tolist(), preds.tolist()))\n else:\n loss = np.array(fetch_list[0])\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n self.calculator.accumulate(loss, pred, label)", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:70-90" + }, + "3375": { + "file_id": 289, + "content": "This function accumulates metrics for a video tagging application. It handles two modes: 'infer' and others. For the 'infer' mode, it gathers predictions for each video, calculates top-k indices, and appends them to a list. For other modes, it takes in loss, prediction, and label arrays, and accumulates metrics using the calculator object. It logs information including loss, Hit@1, precision at equal recall rate (PERR), and gap.", + "type": "comment" + }, + "3376": { + "file_id": 289, + "content": " def finalize_and_log_out(self,\n info='',\n savedir='./data/results',\n label_file='./label_3396.txt'):\n if self.mode == 'infer':\n for index, item in enumerate(self.infer_results):\n video_id = item[0]\n print('[========video_id [ {} ] , topk({}) preds: ========]\\n'.\n format(video_id, self.topk))\n f = io.open(label_file, \"r\", encoding=\"utf-8\")\n fl = f.readlines()\n res_list = []\n res_list.append(video_id)\n for i in range(len(item[1])):\n class_id = item[1][i]\n class_prob = item[2][i]\n class_name = fl[class_id].split('\\n')[0]\n print('class_id: {},'.format(class_id), 'class_name:',\n class_name,\n ', probability: {} \\n'.format(class_prob))\n save_dict = {", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:92-113" + }, + "3377": { + "file_id": 289, + "content": "This code snippet is part of the VideoTag application and it logs out the final results for each video. It reads a label file, iterates through each item's predictions, matches class ID to class name and probability, and then prints them out. The function can be called with optional parameters to specify the output directory (default: `./data/results`) and the label file path (default: `./label_3396.txt`). 
It is designed to run in 'infer' mode only.", + "type": "comment" + }, + "3378": { + "file_id": 289, + "content": " \"'class_id\": class_id,\n \"class_name\": class_name,\n \"probability\": class_prob\n }\n res_list.append(save_dict)\n # save infer result into output dir\n with io.open(os.path.join(savedir,\n 'result' + str(index) + '.json'),\n 'w',\n encoding='utf-8') as f:\n f.write(json.dumps(res_list, ensure_ascii=False))\n else:\n epoch_info_dict = self.calculator.get()\n logger.info(info + '\\tavg_hit_at_one: {0},\\tavg_perr: {1},\\tavg_loss :{2},\\taps: {3},\\tgap:{4}'\\\n .format(epoch_info_dict['avg_hit_at_one'], epoch_info_dict['avg_perr'], \\\n epoch_info_dict['avg_loss'], epoch_info_dict['aps'], epoch_info_dict['gap']))\n def reset(self):\n self.calculator.clear()\n if self.mode == 'infer':\n self.infer_results = []", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:114-135" + }, + "3379": { + "file_id": 289, + "content": "This code snippet appears to be part of a larger program that performs some sort of video tagging or classification. It includes functions to save the result of an inference operation and update metrics after each epoch, as well as a reset function for the calculator. The \"calculator\" object seems to keep track of average hit rate at one, perr, loss, aps, and gap for some type of learning algorithm or model.", + "type": "comment" + }, + "3380": { + "file_id": 289, + "content": "class Kinetics400Metrics(Metrics):\n def __init__(self, name, mode, metrics_args):\n self.name = name\n self.mode = mode\n self.topk = metrics_args['MODEL']['topk']\n self.calculator = kinetics_metrics.MetricsCalculator(name, mode.lower())\n if self.mode == 'infer':\n self.infer_results = []\n def calculate_and_log_out(self, fetch_list, info=''):\n if len(fetch_list) == 3:\n loss = fetch_list[0]\n loss = np.mean(np.array(loss))\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n else:\n loss = 0.\n pred = np.array(fetch_list[0])\n label = np.array(fetch_list[1])\n acc1, acc5 = self.calculator.calculate_metrics(loss, pred, label)\n logger.info(info + '\\tLoss: {},\\ttop1_acc: {}, \\ttop5_acc: {}'.format('%.6f' % loss, \\\n '%.2f' % acc1, '%.2f' % acc5))\n return loss\n def accumulate(self, fetch_list, info=''):\n if self.mode == 'infer':", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:138-163" + }, + "3381": { + "file_id": 289, + "content": "Class Kinetics400Metrics is used for calculating and logging metrics, accepting a name, mode, and metrics_args. It stores the topk value from metrics_args, initializes a MetricsCalculator instance with the given name and mode, and maintains an infer_results list if in inference mode. The calculate_and_log_out method takes a fetch_list as input and calculates the mean loss, accuracy for top-1 and top-5 predictions, and logs the information. 
It can be used to accumulate results during inference.", + "type": "comment" + }, + "3382": { + "file_id": 289, + "content": " predictions = np.array(fetch_list[0])\n video_id = fetch_list[1]\n for i in range(len(predictions)):\n topk_inds = predictions[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds = predictions[i][topk_inds]\n self.infer_results.append(\n (video_id[i], topk_inds.tolist(), preds.tolist()))\n else:\n if len(fetch_list) == 3:\n loss = fetch_list[0]\n loss = np.mean(np.array(loss))\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n else:\n loss = 0.\n pred = np.array(fetch_list[0])\n label = np.array(fetch_list[1])\n self.calculator.accumulate(loss, pred, label)\n def finalize_and_log_out(self,\n info='',\n savedir='./data/results',\n label_file='./label_3396.txt'):", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:164-187" + }, + "3383": { + "file_id": 289, + "content": "This code appears to be a part of a machine learning model's evaluation process. It calculates top predictions and loss values for each video, accumulates them, and then logs out the results. The method \"finalize_and_log_out\" likely concludes the evaluation process and saves or outputs the final results. The code seems to handle both cases where results are available for each video (predictions and losses) and when only predictions and labels are given.", + "type": "comment" + }, + "3384": { + "file_id": 289, + "content": " if self.mode == 'infer':\n for index, item in enumerate(self.infer_results):\n video_id = item[0]\n print('[========video_id [ {} ] , topk({}) preds: ========]\\n'.\n format(video_id, self.topk))\n f = io.open(label_file, \"r\", encoding=\"utf-8\")\n fl = f.readlines()\n res_list = []\n res_list.append(video_id)\n for i in range(len(item[1])):\n class_id = item[1][i]\n class_prob = item[2][i]\n class_name = fl[class_id].split('\\n')[0]\n print('class_id: {},'.format(class_id), 'class_name:',\n class_name,\n ', probability: {} \\n'.format(class_prob))\n save_dict = {\n \"'class_id\": class_id,\n \"class_name\": class_name,\n \"probability\": class_prob\n }\n res_list.append(save_dict)", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:188-210" + }, + "3385": { + "file_id": 289, + "content": "This code is part of a function that iterates over the 'infer_results' list and prints out the video ID, topk predictions for each class, along with their respective probabilities. It reads labels from the 'label_file', appends each prediction to a 'res_list' as a dictionary containing class_id, class_name, and probability, and then continues to the next iteration. 
The label file is read once per video.", + "type": "comment" + }, + "3386": { + "file_id": 289, + "content": " # save infer result into output dir\n with io.open(os.path.join(savedir,\n 'result' + str(index) + '.json'),\n 'w',\n encoding='utf-8') as f:\n f.write(json.dumps(res_list, ensure_ascii=False))\n else:\n self.calculator.finalize_metrics()\n metrics_dict = self.calculator.get_computed_metrics()\n loss = metrics_dict['avg_loss']\n acc1 = metrics_dict['avg_acc1']\n acc5 = metrics_dict['avg_acc5']\n logger.info(info + '\\tLoss: {},\\ttop1_acc: {}, \\ttop5_acc: {}'.format('%.6f' % loss, \\\n '%.2f' % acc1, '%.2f' % acc5))\n def reset(self):\n self.calculator.reset()\n if self.mode == 'infer':\n self.infer_results = []\nclass MetricsNotFoundError(Exception):\n \"Error: metrics not found\"\n def __init__(self, metrics_name, avail_metrics):\n super(MetricsNotFoundError, self).__init__()", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:212-237" + }, + "3387": { + "file_id": 289, + "content": "The code saves the infer results into the specified output directory, finalizes and retrieves computed metrics from a calculator, logs the loss, top1_acc, and top5_acc if in 'train' mode, resets the calculator and list of infer results when resetting, and defines a MetricsNotFoundError exception for missing metrics.", + "type": "comment" + }, + "3388": { + "file_id": 289, + "content": " self.metrics_name = metrics_name\n self.avail_metrics = avail_metrics\n def __str__(self):\n msg = \"Metrics {} Not Found.\\nAvailiable metrics:\\n\".format(\n self.metrics_name)\n for metric in self.avail_metrics:\n msg += \" {}\\n\".format(metric)\n return msg\nclass MetricsZoo(object):\n def __init__(self):\n self.metrics_zoo = {}\n def regist(self, name, metrics):\n assert metrics.__base__ == Metrics, \"Unknow model type {}\".format(\n type(metrics))\n self.metrics_zoo[name] = metrics\n def get(self, name, mode, cfg):\n for k, v in self.metrics_zoo.items():\n if k == name:\n return v(name, mode, cfg)\n raise MetricsNotFoundError(name, self.metrics_zoo.keys())\n# singleton metrics_zoo\nmetrics_zoo = MetricsZoo()\ndef regist_metrics(name, metrics):\n metrics_zoo.regist(name, metrics)\ndef get_metrics(name, mode, cfg):\n return metrics_zoo.get(name, mode, cfg)\n# sort by alphabet\nregist_metrics(\"ATTENTIONLSTM\", Youtube8mMetrics)", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:238-278" + }, + "3389": { + "file_id": 289, + "content": "This code defines a MetricsZoo class to manage and retrieve metrics. It provides regist() and get() methods for registering and retrieving metrics by name, respectively. The MetricsZoo instance is made singleton via global variable metrics_zoo. 
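The register/get pattern described here can be sketched in a few lines. The class and names below are illustrative stand-ins, not the exact MetricsZoo code quoted above.
```
class Registry:
    def __init__(self):
        self._entries = {}

    def regist(self, name, cls):
        self._entries[name] = cls

    def get(self, name, *args, **kwargs):
        if name not in self._entries:
            raise KeyError("{} not found, available: {}".format(
                name, list(self._entries)))
        return self._entries[name](*args, **kwargs)

metrics_zoo = Registry()            # module-level singleton
metrics_zoo.regist("DUMMY", dict)   # any class or callable can be registered
instance = metrics_zoo.get("DUMMY")
```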
Youtube8mMetrics are registered under the name \"ATTENTIONLSTM\".", + "type": "comment" + }, + "3390": { + "file_id": 289, + "content": "regist_metrics(\"TSN\", Kinetics400Metrics)", + "type": "code", + "location": "/applications/VideoTag/metrics/metrics_util.py:279-279" + }, + "3391": { + "file_id": 289, + "content": "The code registers the \"TSN\" metric with the Kinetics400Metrics class.", + "type": "comment" + }, + "3392": { + "file_id": 290, + "content": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py", + "type": "filepath" + }, + "3393": { + "file_id": 290, + "content": "The Python code's `AveragePrecisionCalculator` class calculates interpolated average precision, supports large datasets, and handles sparse prediction scores and ground truth labels for classification tasks.", + "type": "summary" + }, + "3394": { + "file_id": 290, + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate or keep track of the interpolated average precision.\nIt provides an interface for calculating interpolated average precision for an\nentire list or the top-n ranked items. For the definition of the\n(non-)interpolated average precision:\nhttp://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf\nExample usages:\n1) Use it as a static function call to directly calculate average precision for\na short ranked list in the memory.", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:1-23" + }, + "3395": { + "file_id": 290, + "content": "This code is for a Python class that calculates the interpolated average precision (IAP) of ranked items in a list. It follows the definition provided in the given link and can be used as a static function to directly calculate IAP from a short ranked list.", + "type": "comment" + }, + "3396": { + "file_id": 290, + "content": "```\nimport random\np = np.array([random.random() for _ in xrange(10)])\na = np.array([random.choice([0, 1]) for _ in xrange(10)])\nap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)\n```\n2) Use it as an object for long ranked list that cannot be stored in memory or\nthe case where partial predictions can be observed at a time (Tensorflow\npredictions). In this case, we first call the function accumulate many times\nto process parts of the ranked list. 
After processing all the parts, we call\npeek_interpolated_ap_at_n.\n```\np1 = np.array([random.random() for _ in xrange(5)])\na1 = np.array([random.choice([0, 1]) for _ in xrange(5)])\np2 = np.array([random.random() for _ in xrange(5)])\na2 = np.array([random.choice([0, 1]) for _ in xrange(5)])\n# interpolated average precision at 10 using 1000 break points\ncalculator = average_precision_calculator.AveragePrecisionCalculator(10)\ncalculator.accumulate(p1, a1)\ncalculator.accumulate(p2, a2)\nap3 = calculator.peek_ap_at_n()\n```\n\"\"\"\nimport heapq\nimport random\nimport numbers", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:25-55" + }, + "3397": { + "file_id": 290, + "content": "The code creates an instance of the AveragePrecisionCalculator class and uses its accumulate method to process parts of a ranked list that cannot be stored in memory or observed at once. After processing all parts, it uses the peek_interpolated_ap_at_n method to get the interpolated average precision at a given number of elements. The code also imports heapq and random modules for priority queue and random number generation respectively.", + "type": "comment" + }, + "3398": { + "file_id": 290, + "content": "import numpy\nclass AveragePrecisionCalculator(object):\n \"\"\"Calculate the average precision and average precision at n.\"\"\"\n def __init__(self, top_n=None):\n \"\"\"Construct an AveragePrecisionCalculator to calculate average precision.\n This class is used to calculate the average precision for a single label.\n Args:\n top_n: A positive Integer specifying the average precision at n, or\n None to use all provided data points.\n Raises:\n ValueError: An error occurred when the top_n is not a positive integer.\n \"\"\"\n if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None):\n raise ValueError(\"top_n must be a positive integer or None.\")\n self._top_n = top_n # average precision at n\n self._total_positives = 0 # total number of positives have seen\n self._heap = [] # max heap of (prediction, actual)\n @property\n def heap_size(self):\n \"\"\"Gets the heap size maintained in the class.\"\"\"\n return len(self._heap)\n @property", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:57-86" + }, + "3399": { + "file_id": 290, + "content": "This code defines a class `AveragePrecisionCalculator` that calculates average precision and average precision at n for a single label. It takes a `top_n` argument to specify the average precision at n or uses all provided data points if None. The class maintains a max heap of (prediction, actual) and provides a `heap_size` property to get the heap size.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/34.json b/docs/data/34.json new file mode 100644 index 000000000..c717bbd04 --- /dev/null +++ b/docs/data/34.json @@ -0,0 +1,543 @@ +{ + "3400": { + "file_id": 290, + "content": " def num_accumulated_positives(self):\n \"\"\"Gets the number of positive samples that have been accumulated.\"\"\"\n return self._total_positives\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n After the function call, we may call peek_ap_at_n to actually calculate\n the average precision.\n Note predictions and actuals must have the same shape.\n Args:\n predictions: a list storing the prediction scores.\n actuals: a list storing the ground truth labels. 
Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives = If the 'predictions' and 'actuals' inputs aren't complete,\n then it's possible some true positives were missed in them. In that case,\n you can provide 'num_positives' in order to accurately track recall.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match.", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:87-108" + }, + "3401": { + "file_id": 290, + "content": "This function accumulates prediction scores and ground truth labels, allowing for the calculation of average precision after the call. The function requires both predictions and actuals to have the same shape. If inputs are incomplete, you can provide 'num_positives' to accurately track recall.", + "type": "comment" + }, + "3402": { + "file_id": 290, + "content": " \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if not num_positives is None:\n if not isinstance(num_positives,\n numbers.Number) or num_positives < 0:\n raise ValueError(\n \"'num_positives' was provided but it wan't a nonzero number.\"\n )\n if not num_positives is None:\n self._total_positives += num_positives\n else:\n self._total_positives += numpy.size(numpy.where(actuals > 0))\n topk = self._top_n\n heap = self._heap\n for i in range(numpy.size(predictions)):\n if topk is None or len(heap) < topk:\n heapq.heappush(heap, (predictions[i], actuals[i]))\n else:\n if predictions[i] > heap[0][0]: # heap[0] is the smallest\n heapq.heappop(heap)\n heapq.heappush(heap, (predictions[i], actuals[i]))", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:109-134" + }, + "3403": { + "file_id": 290, + "content": "This code snippet is a class method that checks the shape compatibility of 'predictions' and 'actuals', verifies if 'num_positives' is a nonzero number, calculates the total positives, and populates a heap. It also ensures the correctness of the predictions by comparing them to the actuals and updating the heap accordingly.", + "type": "comment" + }, + "3404": { + "file_id": 290, + "content": " def clear(self):\n \"\"\"Clear the accumulated predictions.\"\"\"\n self._heap = []\n self._total_positives = 0\n def peek_ap_at_n(self):\n \"\"\"Peek the non-interpolated average precision at n.\n Returns:\n The non-interpolated average precision at n (default 0).\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n \"\"\"\n if self.heap_size <= 0:\n return 0\n predlists = numpy.array(list(zip(*self._heap)))\n ap = self.ap_at_n(predlists[0],\n predlists[1],\n n=self._top_n,\n total_num_positives=self._total_positives)\n return ap\n @staticmethod\n def ap(predictions, actuals):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:136-166" + }, + "3405": { + "file_id": 290, + "content": "This code is part of a class that calculates average precision for video tagging. 
It includes methods to clear accumulated predictions, peek the non-interpolated average precision at a specific point (n), and calculate the non-interpolated average precision using prediction and actual scores arrays.", + "type": "comment" + }, + "3406": { + "file_id": 290, + "content": " Returns:\n The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match.\n \"\"\"\n return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None)\n @staticmethod\n def ap_at_n(predictions, actuals, n=20, total_num_positives=None):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n n: the top n items to be considered in ap@n.\n total_num_positives : (optionally) you can specify the number of total\n positive\n in the list. If specified, it will be used in calculation.\n Returns:", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:168-192" + }, + "3407": { + "file_id": 290, + "content": "This code calculates the non-interpolated average precision at 'n' in a list. It takes sparse prediction scores and ground truth labels as input, with any value larger than 0 treated as positives. It also allows specifying the total number of positive items in the list, which can be used for calculation if provided.", + "type": "comment" + }, + "3408": { + "file_id": 290, + "content": " The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when\n 1) the format of the input is not the numpy 1-D array;\n 2) the shape of predictions and actuals does not match;\n 3) the input n is not a positive integer.\n \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if n is not None:\n if not isinstance(n, int) or n <= 0:\n raise ValueError(\"n must be 'None' or a positive integer.\"\n \" It was '%s'.\" % n)\n ap = 0.0\n predictions = numpy.array(predictions)\n actuals = numpy.array(actuals)\n # add a shuffler to avoid overestimating the ap\n predictions, actuals = AveragePrecisionCalculator._shuffle(\n predictions, actuals)\n sortidx = sorted(range(len(predictions)),", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:193-220" + }, + "3409": { + "file_id": 290, + "content": "The code defines a function that calculates the average precision at a specific rank, n. It checks the shape of predictions and actuals arrays and if n is positive integer or None. If any error occurs, it raises ValueError. 
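A hedged sketch of the ranking step behind this calculation, for the full-list case only: sort by descending score and average the precision at each positive. The repository version additionally shuffles to break ties and supports a top-n cutoff, both of which this toy omits.
```
import numpy as np

def average_precision(predictions, actuals):
    order = np.argsort(-predictions)                 # rank by descending score
    positives = actuals[order] > 0
    num_pos = int(positives.sum())
    if num_pos == 0:
        return 0.0
    precision_at_i = np.cumsum(positives) / (np.arange(len(order)) + 1)
    return float((precision_at_i * positives).sum() / num_pos)

p = np.array([0.9, 0.1, 0.8, 0.4])
a = np.array([1, 0, 0, 1])
print(average_precision(p, a))                       # (1 + 2/3) / 2 ≈ 0.833
```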
The code also shuffles the predictions and actuals to avoid overestimating the average precision.", + "type": "comment" + }, + "3410": { + "file_id": 290, + "content": " key=lambda k: predictions[k],\n reverse=True)\n if total_num_positives is None:\n numpos = numpy.size(numpy.where(actuals > 0))\n else:\n numpos = total_num_positives\n if numpos == 0:\n return 0\n if n is not None:\n numpos = min(numpos, n)\n delta_recall = 1.0 / numpos\n poscount = 0.0\n # calculate the ap\n r = len(sortidx)\n if n is not None:\n r = min(r, n)\n for i in range(r):\n if actuals[sortidx[i]] > 0:\n poscount += 1\n ap += poscount / (i + 1) * delta_recall\n return ap\n @staticmethod\n def _shuffle(predictions, actuals):\n random.seed(0)\n suffidx = random.sample(range(len(predictions)), len(predictions))\n predictions = predictions[suffidx]\n actuals = actuals[suffidx]\n return predictions, actuals\n @staticmethod\n def _zero_one_normalize(predictions, epsilon=1e-7):", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:221-256" + }, + "3411": { + "file_id": 290, + "content": "This code calculates the average precision of a classification task by first shuffling the predictions and actuals, then iterating through the sorted list to calculate the precision at each recall step. It handles cases where total_num_positives is given or automatically calculated.", + "type": "comment" + }, + "3412": { + "file_id": 290, + "content": " \"\"\"Normalize the predictions to the range between 0.0 and 1.0.\n For some predictions like SVM predictions, we need to normalize them before\n calculate the interpolated average precision. The normalization will not\n change the rank in the original list and thus won't change the average\n precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n epsilon: a small constant to avoid denominator being zero.\n Returns:\n The normalized prediction.\n \"\"\"\n denominator = numpy.max(predictions) - numpy.min(predictions)\n ret = (predictions - numpy.min(predictions)) / numpy.max(\n denominator, epsilon)\n return ret", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:257-274" + }, + "3413": { + "file_id": 290, + "content": "This function normalizes the predictions to a range of 0.0 to 1.0 by subtracting the minimum prediction and dividing by the maximum denominator (prediction difference) with an optional epsilon value to prevent division by zero.", + "type": "comment" + }, + "3414": { + "file_id": 291, + "content": "/applications/VideoTag/metrics/youtube8m/eval_util.py", + "type": "filepath" + }, + "3415": { + "file_id": 291, + "content": "The PaddleVideo library evaluates video classification models using GAP, hit@one, precision error, and loss metrics. The `EvaluationMetrics` class accumulates these metrics per mini-batch or epoch using AveragePrecisionCalculator for GAP calculation.", + "type": "summary" + }, + "3416": { + "file_id": 291, + "content": "# Copyright 2016 Google Inc. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Provides functions to help with evaluating models.\"\"\"\nimport datetime\nimport numpy\nfrom . import mean_average_precision_calculator as map_calculator\nfrom . import average_precision_calculator as ap_calculator\ndef flatten(l):\n \"\"\" Merges a list of lists into a single list. \"\"\"\n return [item for sublist in l for item in sublist]\ndef calculate_hit_at_one(predictions, actuals):\n \"\"\"Performs a local (numpy) calculation of the hit at one.", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:1-28" + }, + "3417": { + "file_id": 291, + "content": "Code snippet is a part of the PaddleVideo library, providing functions to help evaluate video classification models. It includes flattening list functionality and calculates hit at one for predictions and actuals using numpy operations.", + "type": "comment" + }, + "3418": { + "file_id": 291, + "content": " Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average hit at one across the entire batch.\n \"\"\"\n top_prediction = numpy.argmax(predictions, 1)\n hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]\n return numpy.average(hits)\ndef calculate_precision_at_equal_recall_rate(predictions, actuals):\n \"\"\"Performs a local (numpy) calculation of the PERR.\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average precision at equal recall rate across the entire batch.\n \"\"\"\n aggregated_precision = 0.0\n num_videos = actuals.shape[0]\n for row in numpy.arange(num_videos):\n num_labels = int(numpy.sum(actuals[row]))", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:30-59" + }, + "3419": { + "file_id": 291, + "content": "This code calculates the average hit at one and precision at equal recall rate for a batch of predictions and corresponding actuals. These are metrics commonly used in evaluation of machine learning models, particularly in video classification tasks. The functions take as input two matrices: 'predictions' containing model outputs and 'actuals' with ground truth labels. 
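As a rough illustration of the two batch metrics just described, a simplified numpy version might look as follows; the repository implementation also guards on positive prediction scores inside PERR, which is skipped here.
```
import numpy as np

def hit_at_one(predictions, actuals):
    # both arrays are (batch, num_classes); actuals is a 0/1 multi-label matrix
    top = np.argmax(predictions, axis=1)
    return float(actuals[np.arange(actuals.shape[0]), top].mean())

def precision_at_equal_recall_rate(predictions, actuals):
    total = 0.0
    for row in range(actuals.shape[0]):
        num_labels = int(actuals[row].sum())
        if num_labels > 0:
            top = np.argpartition(predictions[row], -num_labels)[-num_labels:]
            total += actuals[row][top].mean()
    return total / actuals.shape[0]
```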
They return the average hit at one and precision at equal recall rate across the entire batch respectively.", + "type": "comment" + }, + "3420": { + "file_id": 291, + "content": " top_indices = numpy.argpartition(predictions[row],\n -num_labels)[-num_labels:]\n item_precision = 0.0\n for label_index in top_indices:\n if predictions[row][label_index] > 0:\n item_precision += actuals[row][label_index]\n item_precision /= top_indices.size\n aggregated_precision += item_precision\n aggregated_precision /= num_videos\n return aggregated_precision\ndef calculate_gap(predictions, actuals, top_k=20):\n \"\"\"Performs a local (numpy) calculation of the global average precision.\n Only the top_k predictions are taken for each of the videos.\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n top_k: How many predictions to use per video.\n Returns:\n float: The global average precision.\n \"\"\"\n gap_calculator = ap_calculator.AveragePrecisionCalculator()", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:60-87" + }, + "3421": { + "file_id": 291, + "content": "The code calculates the global average precision (GAP) using the top_k predictions and actuals for each video. It uses a function called AveragePrecisionCalculator to calculate the metric. The function first partitions the predictions based on their values, then iterates through the top indices, adding up the correct ones to calculate item precision. Finally, it averages the item precisions across all videos to get the GAP.", + "type": "comment" + }, + "3422": { + "file_id": 291, + "content": " sparse_predictions, sparse_labels, num_positives = top_k_by_class(\n predictions, actuals, top_k)\n gap_calculator.accumulate(flatten(sparse_predictions),\n flatten(sparse_labels), sum(num_positives))\n return gap_calculator.peek_ap_at_n()\ndef top_k_by_class(predictions, labels, k=20):\n \"\"\"Extracts the top k predictions for each video, sorted by class.\n Args:\n predictions: A numpy matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n k: the top k non-zero entries to preserve in each prediction.\n Returns:\n A tuple (predictions,labels, true_positives). 'predictions' and 'labels'\n are lists of lists of floats. 'true_positives' is a list of scalars. The\n length of the lists are equal to the number of classes. The entries in the\n predictions variable are probability predictions, and\n the corresponding entries in the labels variable are the ground truth for\n those predictions. The entries in 'true_positives' are the number of true", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:88-109" + }, + "3423": { + "file_id": 291, + "content": "This code extracts the top k predictions for each video, sorted by class. It returns a tuple containing the sparse_predictions, sparse_labels, and num_positives. The gap_calculator accumulates the flattened sparse_predictions, flattened sparse_labels, and sum of num_positives. 
Finally, it returns the average precision at n using peek_ap_at_n() from the gap_calculator.", + "type": "comment" + }, + "3424": { + "file_id": 291, + "content": " positives for each class in the ground truth.\n Raises:\n ValueError: An error occurred when the k is not a positive integer.\n \"\"\"\n if k <= 0:\n raise ValueError(\"k must be a positive integer.\")\n k = min(k, predictions.shape[1])\n num_classes = predictions.shape[1]\n prediction_triplets = []\n for video_index in range(predictions.shape[0]):\n prediction_triplets.extend(\n top_k_triplets(predictions[video_index], labels[video_index], k))\n out_predictions = [[] for v in range(num_classes)]\n out_labels = [[] for v in range(num_classes)]\n for triplet in prediction_triplets:\n out_predictions[triplet[0]].append(triplet[1])\n out_labels[triplet[0]].append(triplet[2])\n out_true_positives = [numpy.sum(labels[:, i]) for i in range(num_classes)]\n return out_predictions, out_labels, out_true_positives\ndef top_k_triplets(predictions, labels, k=20):\n \"\"\"Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in\n (prediction, class) format\"\"\"", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:110-135" + }, + "3425": { + "file_id": 291, + "content": "Function evaluates predictions and labels for each video and calculates top-k triplets (prediction, class) for each class. If k is not a positive integer, it raises ValueError. It returns out_predictions, out_labels, and out_true_positives for further analysis.", + "type": "comment" + }, + "3426": { + "file_id": 291, + "content": " m = len(predictions)\n k = min(k, m)\n indices = numpy.argpartition(predictions, -k)[-k:]\n return [(index, predictions[index], labels[index]) for index in indices]\nclass EvaluationMetrics(object):\n \"\"\"A class to store the evaluation metrics.\"\"\"\n def __init__(self, num_class, top_k):\n \"\"\"Construct an EvaluationMetrics object to store the evaluation metrics.\n Args:\n num_class: A positive integer specifying the number of classes.\n top_k: A positive integer specifying how many predictions are considered per video.\n Raises:\n ValueError: An error occurred when MeanAveragePrecisionCalculator cannot\n not be constructed.\n \"\"\"\n self.sum_hit_at_one = 0.0\n self.sum_perr = 0.0\n self.sum_loss = 0.0\n self.map_calculator = map_calculator.MeanAveragePrecisionCalculator(\n num_class)\n self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator()\n self.top_k = top_k\n self.num_examples = 0\n #def accumulate(self, predictions, labels, loss):", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:136-164" + }, + "3427": { + "file_id": 291, + "content": "The code defines a class `EvaluationMetrics` to store various evaluation metrics for video classification. The `__init__` method initializes the metrics such as hit@one, precision error (perr), and loss. It also initializes two calculators: MeanAveragePrecisionCalculator and AveragePrecisionCalculator. 
The `accumulate` method updates these metrics based on predictions, labels, and loss values.", + "type": "comment" + }, + "3428": { + "file_id": 291, + "content": " def accumulate(self, loss, predictions, labels):\n \"\"\"Accumulate the metrics calculated locally for this mini-batch.\n Args:\n predictions: A numpy matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n labels: A numpy matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n loss: A numpy array containing the loss for each sample.\n Returns:\n dictionary: A dictionary storing the metrics for the mini-batch.\n Raises:\n ValueError: An error occurred when the shape of predictions and actuals\n does not match.\n \"\"\"\n batch_size = labels.shape[0]\n mean_hit_at_one = calculate_hit_at_one(predictions, labels)\n mean_perr = calculate_precision_at_equal_recall_rate(\n predictions, labels)\n mean_loss = numpy.mean(loss)\n # Take the top 20 predictions.\n sparse_predictions, sparse_labels, num_positives = top_k_by_class(\n predictions, labels, self.top_k)", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:165-190" + }, + "3429": { + "file_id": 291, + "content": "The code defines a function \"accumulate\" that takes in predictions, labels and loss from a mini-batch. It calculates three metrics: mean_hit_at_one, mean_perr, and mean_loss. The function then returns a dictionary containing these metrics for the batch.", + "type": "comment" + }, + "3430": { + "file_id": 291, + "content": " self.map_calculator.accumulate(sparse_predictions, sparse_labels,\n num_positives)\n self.global_ap_calculator.accumulate(flatten(sparse_predictions),\n flatten(sparse_labels),\n sum(num_positives))\n self.num_examples += batch_size\n self.sum_hit_at_one += mean_hit_at_one * batch_size\n self.sum_perr += mean_perr * batch_size\n self.sum_loss += mean_loss * batch_size\n return {\n \"hit_at_one\": mean_hit_at_one,\n \"perr\": mean_perr,\n \"loss\": mean_loss\n }\n def get(self):\n \"\"\"Calculate the evaluation metrics for the whole epoch.\n Raises:\n ValueError: If no examples were accumulated.\n Returns:\n dictionary: a dictionary storing the evaluation metrics for the epoch. The\n dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss, and\n aps (default nan).\n \"\"\"\n if self.num_examples <= 0:", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:191-219" + }, + "3431": { + "file_id": 291, + "content": "This code calculates and accumulates various evaluation metrics during the epoch, including hit_at_one, perr, and loss. It then returns a dictionary with these metrics after an entire epoch of training. 
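The per-batch bookkeeping described above boils down to a batch-size-weighted running average; a minimal sketch (names are illustrative, not the repository API):
```
class RunningAverage:
    def __init__(self):
        self.total, self.count = 0.0, 0

    def accumulate(self, batch_mean, batch_size):
        self.total += batch_mean * batch_size   # weight each batch by its size
        self.count += batch_size

    def get(self):
        if self.count <= 0:
            raise ValueError("no examples were accumulated")
        return self.total / self.count          # epoch-level average
```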
If no examples were accumulated during the epoch, it raises a ValueError.", + "type": "comment" + }, + "3432": { + "file_id": 291, + "content": " raise ValueError(\"total_sample must be positive.\")\n avg_hit_at_one = self.sum_hit_at_one / self.num_examples\n avg_perr = self.sum_perr / self.num_examples\n avg_loss = self.sum_loss / self.num_examples\n aps = self.map_calculator.peek_map_at_n()\n gap = self.global_ap_calculator.peek_ap_at_n()\n epoch_info_dict = {}\n return {\n \"avg_hit_at_one\": avg_hit_at_one,\n \"avg_perr\": avg_perr,\n \"avg_loss\": avg_loss,\n \"aps\": aps,\n \"gap\": gap\n }\n def clear(self):\n \"\"\"Clear the evaluation metrics and reset the EvaluationMetrics object.\"\"\"\n self.sum_hit_at_one = 0.0\n self.sum_perr = 0.0\n self.sum_loss = 0.0\n self.map_calculator.clear()\n self.global_ap_calculator.clear()\n self.num_examples = 0", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/eval_util.py:220-244" + }, + "3433": { + "file_id": 291, + "content": "This code defines a class for evaluating metrics in video tagging. It calculates average hit at one, perr, and loss, as well as maps and global APs. The clear method resets the metrics to zero.", + "type": "comment" + }, + "3434": { + "file_id": 292, + "content": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py", + "type": "filepath" + }, + "3435": { + "file_id": 292, + "content": "This code snippet calculates mean average precision for video tagging in the Youtube-8m dataset using numpy, with functions for accumulation and processing. It allows for clearing the calculator, checking if empty, and retrieving non-interpolated average precision at n for each class.", + "type": "summary" + }, + "3436": { + "file_id": 292, + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate the mean average precision.\nIt provides an interface for calculating mean average precision\nfor an entire list or the top-n ranked items.\nExample usages:\nWe first call the function accumulate many times to process parts of the ranked\nlist. After processing all the parts, we call peek_map_at_n\nto calculate the mean average precision.\n```\nimport random\np = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:1-27" + }, + "3437": { + "file_id": 292, + "content": "This code calculates the mean average precision for an entire list or top-n ranked items in a video tag application. It imports numpy and provides functions to accumulate, process parts of the ranked list, and finally calculate the mean average precision using peek_map_at_n function. 
The example usage demonstrates how to use this code with a sample array.", + "type": "comment" + }, + "3438": { + "file_id": 292, + "content": "a = np.array([[random.choice([0, 1]) for _ in xrange(50)]\n for _ in xrange(1000)])\n# mean average precision for 50 classes.\ncalculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(\n num_class=50)\ncalculator.accumulate(p, a)\naps = calculator.peek_map_at_n()\n```\n\"\"\"\nimport numpy\nfrom . import average_precision_calculator\nclass MeanAveragePrecisionCalculator(object):\n \"\"\"This class is to calculate mean average precision.\n \"\"\"\n def __init__(self, num_class):\n \"\"\"Construct a calculator to calculate the (macro) average precision.\n Args:\n num_class: A positive Integer specifying the number of classes.\n top_n_array: A list of positive integers specifying the top n for each\n class. The top n in each class will be used to calculate its average\n precision at n.\n The size of the array must be num_class.\n Raises:\n ValueError: An error occurred when num_class is not a positive integer;\n or the top_n_array is not a list of positive integers.", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:28-58" + }, + "3439": { + "file_id": 292, + "content": "The code initializes a numpy array 'a' with random values (0 or 1) for each of the 50 classes. It then creates an instance of MeanAveragePrecisionCalculator class and accumulates the predictions and actuals using the 'accumulate' method. Finally, it retrieves the average precision at different recall levels using the 'peek_map_at_n' method.", + "type": "comment" + }, + "3440": { + "file_id": 292, + "content": " \"\"\"\n if not isinstance(num_class, int) or num_class <= 1:\n raise ValueError(\"num_class must be a positive integer.\")\n self._ap_calculators = [] # member of AveragePrecisionCalculator\n self._num_class = num_class # total number of classes\n for i in range(num_class):\n self._ap_calculators.append(\n average_precision_calculator.AveragePrecisionCalculator())\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n Args:\n predictions: A list of lists storing the prediction scores. The outer\n dimension corresponds to classes.\n actuals: A list of lists storing the ground truth labels. The dimensions\n should correspond to the predictions input. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives: If provided, it is a list of numbers representing the\n number of true positives for each class. If not provided, the number of", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:59-79" + }, + "3441": { + "file_id": 292, + "content": "This code initializes an instance of AveragePrecisionCalculator with a specified number of classes. It appends an instance of AveragePrecisionCalculator to the class member _ap_calculators for each class. 
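A compact sketch of the macro (per-class) mean-average-precision flow that this class implements with one calculator per class. The helper below is a toy AP, not the repository's AveragePrecisionCalculator.
```
import numpy as np

def _ap(scores, labels):
    order = np.argsort(-scores)
    positives = labels[order] > 0
    if positives.sum() == 0:
        return 0.0
    precision = np.cumsum(positives) / (np.arange(len(scores)) + 1)
    return float((precision * positives).sum() / positives.sum())

def mean_average_precision(predictions, actuals):
    # predictions/actuals: (num_examples, num_classes); macro average over classes
    aps = [_ap(predictions[:, c], actuals[:, c]) for c in range(predictions.shape[1])]
    return float(np.mean(aps)), aps
```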
The accumulate method takes predictions and actuals as arguments, accumulates prediction scores and ground truth labels, treats any value greater than 0 as positives and negatives otherwise, and optionally takes num_positives as an argument if provided.", + "type": "comment" + }, + "3442": { + "file_id": 292, + "content": " true positives will be inferred from the 'actuals' array.\n Raises:\n ValueError: An error occurred when the shape of predictions and actuals\n does not match.\n \"\"\"\n if not num_positives:\n num_positives = [None for i in predictions.shape[1]]\n calculators = self._ap_calculators\n for i in range(len(predictions)):\n calculators[i].accumulate(predictions[i], actuals[i],\n num_positives[i])\n def clear(self):\n for calculator in self._ap_calculators:\n calculator.clear()\n def is_empty(self):\n return ([calculator.heap_size for calculator in self._ap_calculators\n ] == [0 for _ in range(self._num_class)])\n def peek_map_at_n(self):\n \"\"\"Peek the non-interpolated mean average precision at n.\n Returns:\n An array of non-interpolated average precision at n (default 0) for each\n class.\n \"\"\"\n aps = [\n self._ap_calculators[i].peek_ap_at_n()\n for i in range(self._num_class)", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:80-111" + }, + "3443": { + "file_id": 292, + "content": "This code initializes a mean average precision calculator, handles accumulating predictions and actuals, allows for clearing the calculator, checks if it's empty, and retrieves non-interpolated average precision at n for each class.", + "type": "comment" + }, + "3444": { + "file_id": 292, + "content": " ]\n return aps", + "type": "code", + "location": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:112-113" + }, + "3445": { + "file_id": 292, + "content": "This code snippet calculates mean average precision for video tagging in the Youtube-8m dataset. It returns a list of average precisions (aps) after processing each chunk of data.", + "type": "comment" + }, + "3446": { + "file_id": 293, + "content": "/applications/VideoTag/models/__init__.py", + "type": "filepath" + }, + "3447": { + "file_id": 293, + "content": "Code snippet registers two models, AttentionLSTM and TSN, in the application's model registry using the functions regist_model() and get_model(). Models are sorted alphabetically for easy retrieval.", + "type": "summary" + }, + "3448": { + "file_id": 293, + "content": "from .model import regist_model, get_model\nfrom .attention_lstm import AttentionLSTM\nfrom .tsn import TSN\n# regist models, sort by alphabet\nregist_model(\"AttentionLSTM\", AttentionLSTM)\nregist_model(\"TSN\", TSN)", + "type": "code", + "location": "/applications/VideoTag/models/__init__.py:1-7" + }, + "3449": { + "file_id": 293, + "content": "Code snippet registers two models, AttentionLSTM and TSN, in the application's model registry using the functions regist_model() and get_model(). 
Models are sorted alphabetically for easy retrieval.", + "type": "comment" + }, + "3450": { + "file_id": 294, + "content": "/applications/VideoTag/models/attention_lstm/__init__.py", + "type": "filepath" + }, + "3451": { + "file_id": 294, + "content": "This line imports the functions and classes from the \"attention_lstm.py\" file in the same directory, allowing for easy access to those components within this module.", + "type": "summary" + }, + "3452": { + "file_id": 294, + "content": "from .attention_lstm import *", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/__init__.py:1-1" + }, + "3453": { + "file_id": 294, + "content": "This line imports the functions and classes from the \"attention_lstm.py\" file in the same directory, allowing for easy access to those components within this module.", + "type": "comment" + }, + "3454": { + "file_id": 295, + "content": "/applications/VideoTag/models/attention_lstm/attention_lstm.py", + "type": "filepath" + }, + "3455": { + "file_id": 295, + "content": "The code defines an AttentionLSTM class for a video tagging model, extending ModelBase. It initializes properties, retrieves configurations and dimensions, builds the LSTM attention model, applies fully connected layers, and uses an optimizer with piecewise learning rate decay and L2 regularization.", + "type": "summary" + }, + "3456": { + "file_id": 295, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom ..model import ModelBase\nfrom .lstm_attention import LSTMAttentionModel\nimport logging\nimport paddle\nimport paddle.static as static\nlogger = logging.getLogger(__name__)\n__all__ = [\"AttentionLSTM\"]\nclass AttentionLSTM(ModelBase):\n def __init__(self, name, cfg, mode='train', is_videotag=False):\n super(AttentionLSTM, self).__init__(name, cfg, mode)\n self.is_videotag = is_videotag\n self.get_config()", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:1-31" + }, + "3457": { + "file_id": 295, + "content": "The code is importing necessary modules and defining a class called AttentionLSTM. It extends the ModelBase class and has an __init__ method to initialize its properties such as name, configuration, mode, and is_videotag flag. 
The get_config method is also defined for retrieving configuration from a file.", + "type": "comment" + }, + "3458": { + "file_id": 295, + "content": " def get_config(self):\n # get model configs\n self.feature_names = self.cfg.MODEL.feature_names\n self.feature_dims = self.cfg.MODEL.feature_dims\n self.num_classes = self.cfg.MODEL.num_classes\n self.embedding_size = self.cfg.MODEL.embedding_size\n self.lstm_size = self.cfg.MODEL.lstm_size\n self.drop_rate = self.cfg.MODEL.drop_rate\n # get mode configs\n self.batch_size = self.get_config_from_sec(self.mode, 'batch_size', 1)\n self.num_gpus = self.get_config_from_sec(self.mode, 'num_gpus', 1)\n if self.mode == 'train':\n self.learning_rate = self.get_config_from_sec(\n 'train', 'learning_rate', 1e-3)\n self.weight_decay = self.get_config_from_sec(\n 'train', 'weight_decay', 8e-4)\n self.num_samples = self.get_config_from_sec('train', 'num_samples',\n 5000000)\n self.decay_epochs = self.get_config_from_sec(\n 'train', 'decay_epochs', [5])", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:33-54" + }, + "3459": { + "file_id": 295, + "content": "The code defines a model's configuration method, retrieving feature names, dimensions, number of classes, embedding size, LSTM size, and drop rate. It also gets mode-specific configurations such as batch size, number of GPUs, learning rate, weight decay, total training samples, and epochs for learning rate decay.", + "type": "comment" + }, + "3460": { + "file_id": 295, + "content": " self.decay_gamma = self.get_config_from_sec('train', 'decay_gamma',\n 0.1)\n def build_input(self, use_dataloader):\n self.feature_input = []\n for name, dim in zip(self.feature_names, self.feature_dims):\n self.feature_input.append(\n static.data(shape=[None, dim],\n lod_level=1,\n dtype='float32',\n name=name))\n if self.mode != 'infer':\n self.label_input = static.data(shape=[None, self.num_classes],\n dtype='float32',\n name='label')\n else:\n self.label_input = None\n if use_dataloader:\n assert self.mode != 'infer', \\\n 'dataloader is not recommendated when infer, please set use_dataloader to be false.'\n self.dataloader = paddle.io.DataLoader.from_generator(\n feed_list=self.feature_input + [self.label_input],", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:55-76" + }, + "3461": { + "file_id": 295, + "content": "The code initializes feature and label inputs for the model, depending on the mode. 
It also builds a dataloader if use_dataloader is True, but not recommended in infer mode.", + "type": "comment" + }, + "3462": { + "file_id": 295, + "content": " capacity=8,\n iterable=True)\n def build_model(self):\n att_outs = []\n for i, (input_dim,\n feature) in enumerate(zip(self.feature_dims,\n self.feature_input)):\n att = LSTMAttentionModel(input_dim, self.embedding_size,\n self.lstm_size, self.drop_rate)\n att_out = att.forward(feature, is_training=(self.mode == 'train'))\n att_outs.append(att_out)\n if len(att_outs) > 1:\n out = paddle.concat(x=att_outs, axis=1)\n else:\n out = att_outs[0] # video only, without audio in videoTag\n fc1 = static.nn.fc(\n x=out,\n size=8192,\n activation='relu',\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)),\n name='fc1')\n fc2 = static.nn.fc(\n x=fc1,", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:77-103" + }, + "3463": { + "file_id": 295, + "content": "This code defines a LSTM attention model with multiple input features. It concatenates output of each feature, applies fully connected layers, and returns the result.", + "type": "comment" + }, + "3464": { + "file_id": 295, + "content": " size=4096,\n activation='tanh',\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)),\n name='fc2')\n self.logit = static.nn.fc(x=fc2, size=self.num_classes, activation=None, \\\n bias_attr=paddle.ParamAttr(regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)), name='output')\n self.output = paddle.nn.functional.sigmoid(self.logit)\n def optimizer(self):\n assert self.mode == 'train', \"optimizer only can be get in train mode\"\n values = [\n self.learning_rate * (self.decay_gamma**i)\n for i in range(len(self.decay_epochs) + 1)\n ]\n iter_per_epoch = self.num_samples / self.batch_size\n boundaries = [e * iter_per_epoch for e in self.decay_epochs]\n return paddle.optimizer.RMSProp(", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:104-125" + }, + "3465": { + "file_id": 295, + "content": "This code defines an attention LSTM model for video tagging. It uses a tanh activation function, L2 decay regularizer, and normal initializer for the fully connected layers. The logit layer applies sigmoid activation to output probabilities for each class. 
The optimizer function sets up a learning rate schedule using RMSProp optimizer with decay epochs and boundaries.", + "type": "comment" + }, + "3466": { + "file_id": 295, + "content": " learning_rate=paddle.optimizer.lr.PiecewiseDecay(values=values,\n boundaries=boundaries),\n centered=True,\n weight_decay=paddle.regularizer.L2Decay(coeff=self.weight_decay))\n def loss(self):\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n cost = paddle.nn.functional.binary_cross_entropy(\n input=self.logit, label=self.label_input, reduction=None)\n cost = paddle.sum(x=cost, axis=-1)\n sum_cost = paddle.sum(x=cost)\n self.loss_ = paddle.scale(sum_cost,\n scale=self.num_gpus,\n bias_after_scale=False)\n return self.loss_\n def outputs(self):\n return [self.output, self.logit]\n def feeds(self):\n return self.feature_input if self.mode == 'infer' else self.feature_input + [\n self.label_input\n ]\n def fetches(self):\n if self.mode == 'train' or self.mode == 'valid':", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:126-151" + }, + "3467": { + "file_id": 295, + "content": "This code defines a model with an LSTM layer and attention mechanism. It uses piecewise learning rate decay, L2 weight decay regularization, calculates binary cross-entropy loss for classification tasks, and supports both training, validation, and inference modes.", + "type": "comment" + }, + "3468": { + "file_id": 295, + "content": " losses = self.loss()\n fetch_list = [losses, self.output, self.label_input]\n elif self.mode == 'test':\n losses = self.loss()\n fetch_list = [losses, self.output, self.label_input]\n elif self.mode == 'infer':\n fetch_list = [self.output]\n else:\n raise NotImplementedError('mode {} not implemented'.format(\n self.mode))\n return fetch_list\n def weights_info(self):\n return None, None\n def load_pretrain_params(self, exe, pretrain, prog):\n logger.info(\n \"Load pretrain weights from {}, exclude fc layer.\".format(pretrain))\n state_dict = paddle.static.load_program_state(pretrain)\n dict_keys = list(state_dict.keys())\n for name in dict_keys:\n if \"fc_0\" in name:\n del state_dict[name]\n logger.info(\n 'Delete {} from pretrained parameters. Do not load it'.\n format(name))\n paddle.static.set_program_state(prog, state_dict)", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/attention_lstm.py:152-180" + }, + "3469": { + "file_id": 295, + "content": "This code defines a class with three methods. The first method, `fetch_list()`, returns the fetch list for different modes ('train', 'test', or 'infer'). In 'train' mode, it calculates losses and includes them in the fetch list. In 'test' mode, it does the same. In 'infer' mode, only the output is included in the fetch list. If an unrecognized mode is provided, a `NotImplementedError` is raised. The `weights_info()` method returns no information as it is not implemented yet. Lastly, the `load_pretrain_params()` method loads pretrained weights from a given file, excluding any \"fc_0\" layer parameters, and logs a message confirming this action.", + "type": "comment" + }, + "3470": { + "file_id": 296, + "content": "/applications/VideoTag/models/attention_lstm/lstm_attention.py", + "type": "filepath" + }, + "3471": { + "file_id": 296, + "content": "This code defines an LSTM Attention Model class with parameters and a forward method for computation, applying LSTM layers in both directions and performing dynamic LSTM on input tensor. 
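The schedule construction mentioned above (one learning-rate value per decay segment, one iteration boundary per decay epoch) can be sketched as follows; the numeric defaults are made up, not the repository configuration.
```
def piecewise_schedule(base_lr, decay_gamma, decay_epochs, num_samples, batch_size):
    values = [base_lr * decay_gamma ** i for i in range(len(decay_epochs) + 1)]
    iters_per_epoch = num_samples / batch_size
    boundaries = [epoch * iters_per_epoch for epoch in decay_epochs]
    return values, boundaries

values, boundaries = piecewise_schedule(1e-3, 0.1, [5], 5_000_000, 1024)
# values -> [0.001, 0.0001]; boundaries -> [24414.0625]
```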
It uses dropout, FC layer, sequence_softmax, scaling, and sum pooling to obtain the final output.", + "type": "summary" + }, + "3472": { + "file_id": 296, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.static as static\nclass LSTMAttentionModel(object):\n \"\"\"LSTM Attention Model\"\"\"\n def __init__(self,\n bias_attr,\n embedding_size=512,\n lstm_size=1024,\n drop_rate=0.5):\n self.lstm_size = lstm_size\n self.embedding_size = embedding_size\n self.drop_rate = drop_rate\n def forward(self, input, is_training):", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/lstm_attention.py:1-31" + }, + "3473": { + "file_id": 296, + "content": "This code is for a class called LSTMAttentionModel, which represents an LSTM Attention Model. It has three parameters: bias_attr, embedding_size (default 512), lstm_size (default 1024), and drop_rate (default 0.5). The class has an __init__ method to initialize these parameters and a forward method for performing the model's computation.", + "type": "comment" + }, + "3474": { + "file_id": 296, + "content": " input_fc = static.nn.fc(\n x=input,\n size=self.embedding_size,\n activation='tanh',\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)),\n name='rgb_fc')\n lstm_forward_fc = static.nn.fc(\n x=input_fc,\n size=self.lstm_size * 4,\n activation=None,\n bias_attr=False, # video_tag\n name='rgb_fc_forward')\n lstm_forward, _ = paddle.fluid.layers.dynamic_lstm(input=lstm_forward_fc,\n size=self.lstm_size * 4,\n is_reverse=False,\n name='rgb_lstm_forward')\n lsmt_backward_fc = static.nn.fc(\n x=input_fc,\n size=self.lstm_size * 4,\n activation=None,\n bias_attr=False, #video_tag\n name='rgb_fc_backward')", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/lstm_attention.py:32-58" + }, + "3475": { + "file_id": 296, + "content": "This code initializes an LSTM layer for video tagging. It applies two fully connected layers (fc) to the input, one for forward and one for backward direction. The forward LSTM layer is created using dynamic_lstm function with size 4 times the lstm_size attribute and no reverse operation. 
The backward LSTM layer is also created similarly.", + "type": "comment" + }, + "3476": { + "file_id": 296, + "content": " lstm_backward, _ = paddle.fluid.layers.dynamic_lstm(input=lsmt_backward_fc,\n size=self.lstm_size * 4,\n is_reverse=True,\n name='rgb_lstm_backward')\n lstm_concat = paddle.concat(x=[lstm_forward, lstm_backward],\n axis=1)\n lstm_dropout = paddle.nn.functional.dropout2d(x=lstm_concat,\n p=self.drop_rate,\n training=is_training)\n lstm_weight = static.nn.fc(\n x=lstm_dropout,\n size=1,\n activation='sequence_softmax',\n bias_attr=False, #video_tag\n name='rgb_weight')\n scaled = paddle.multiply(x=lstm_dropout,\n y=lstm_weight)\n lstm_pool = paddle.static.nn.sequence_pool(input=scaled, pool_type='sum')\n return lstm_pool", + "type": "code", + "location": "/applications/VideoTag/models/attention_lstm/lstm_attention.py:60-83" + }, + "3477": { + "file_id": 296, + "content": "This code performs dynamic LSTM on input tensor with forward and backward directions, concatenates the results, applies dropout, then feeds the result into an FC layer for weight assignment using sequence_softmax. The final output is obtained by scaling the previous result with the weights and applying a sum pooling.", + "type": "comment" + }, + "3478": { + "file_id": 297, + "content": "/applications/VideoTag/models/model.py", + "type": "filepath" + }, + "3479": { + "file_id": 297, + "content": "The Python module supports PaddleVideo's VideoTag app, includes a model class for subclassing with base methods, and handles weights, dataloader, pre-trained models, weight file paths, and downloads. It also provides a ModelZoo class for managing models and functions to get/register models.", + "type": "summary" + }, + "3480": { + "file_id": 297, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport wget\nimport logging\nimport paddle\nimport paddle.static as static\ntry:\n from configparser import ConfigParser\nexcept:\n from ConfigParser import ConfigParser\nfrom .utils import download, AttrDict\nWEIGHT_DIR = os.path.join(os.path.expanduser('~'), '.paddle', 'weights')\nlogger = logging.getLogger(__name__)\ndef is_parameter(var):\n return isinstance(var, paddle.framework.Parameter)\nclass NotImplementError(Exception):", + "type": "code", + "location": "/applications/VideoTag/models/model.py:1-36" + }, + "3481": { + "file_id": 297, + "content": "This code is a Python module for the PaddleVideo project's VideoTag application. 
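A hedged numpy sketch of the attention-pooling idea used here: score each timestep, softmax over the sequence, then take a weighted sum of the hidden states. It mirrors the concept only, not the paddle.static graph built above.
```
import numpy as np

def attention_pool(hidden, w):
    # hidden: (seq_len, dim) states, e.g. concatenated forward/backward outputs
    # w: (dim,) scoring vector standing in for the rgb_weight fc layer
    scores = hidden @ w                              # one scalar per timestep
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                         # softmax over the sequence
    return (hidden * weights[:, None]).sum(axis=0)   # pooled (dim,) vector

hidden = np.random.randn(12, 2048)
pooled = attention_pool(hidden, np.random.randn(2048))
```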
It imports necessary libraries, sets the storage location for weights, and defines functions for parameter checking and handling exceptions.", + "type": "comment" + }, + "3482": { + "file_id": 297, + "content": " \"Error: model function not implement\"\n def __init__(self, model, function):\n super(NotImplementError, self).__init__()\n self.model = model.__class__.__name__\n self.function = function.__name__\n def __str__(self):\n return \"Function {}() is not implemented in model {}\".format(\n self.function, self.model)\nclass ModelNotFoundError(Exception):\n \"Error: model not found\"\n def __init__(self, model_name, avail_models):\n super(ModelNotFoundError, self).__init__()\n self.model_name = model_name\n self.avail_models = avail_models\n def __str__(self):\n msg = \"Model {} Not Found.\\nAvailiable models:\\n\".format(\n self.model_name)\n for model in self.avail_models:\n msg += \" {}\\n\".format(model)\n return msg\nclass ModelBase(object):\n def __init__(self, name, cfg, mode='train'):\n assert mode in ['train', 'valid', 'test', 'infer'], \\\n \"Unknown mode type {}\".format(mode)\n self.name = name", + "type": "code", + "location": "/applications/VideoTag/models/model.py:37-69" + }, + "3483": { + "file_id": 297, + "content": "This code defines two custom exceptions, \"NotImplementError\" and \"ModelNotFoundError\", to handle specific model-related issues. The \"ModelBase\" class serves as a base for creating different models with different modes (train, valid, test, infer). The code also checks if the mode input is valid.", + "type": "comment" + }, + "3484": { + "file_id": 297, + "content": " self.is_training = (mode == 'train')\n self.mode = mode\n self.cfg = cfg\n self.dataloader = None\n def build_model(self):\n \"build model struct\"\n raise NotImplementError(self, self.build_model)\n def build_input(self, use_dataloader):\n \"build input Variable\"\n raise NotImplementError(self, self.build_input)\n def optimizer(self):\n \"get model optimizer\"\n raise NotImplementError(self, self.optimizer)\n def outputs(self):\n \"get output variable\"\n raise NotImplementError(self, self.outputs)\n def loss(self):\n \"get loss variable\"\n raise NotImplementError(self, self.loss)\n def feeds(self):\n \"get feed inputs list\"\n raise NotImplementError(self, self.feeds)\n def fetches(self):\n \"get fetch list of model\"\n raise NotImplementError(self, self.fetches)\n def weights_info(self):\n \"get model weight default path and download url\"\n raise NotImplementError(self, self.weights_info)", + "type": "code", + "location": "/applications/VideoTag/models/model.py:70-105" + }, + "3485": { + "file_id": 297, + "content": "The code is a model class that requires subclassing for implementation. It defines various methods such as build_model, build_input, optimizer, outputs, loss, feeds, and fetches. However, the actual implementation of these methods should be provided in the subclass since they are all raising NotImplementedError. 
The weights_info method returns model weight default path and download URL.", + "type": "comment" + }, + "3486": { + "file_id": 297, + "content": " def get_weights(self):\n \"get model weight file path, download weight from Paddle if not exist\"\n path, url = self.weights_info()\n path = os.path.join(WEIGHT_DIR, path)\n if not os.path.isdir(WEIGHT_DIR):\n logger.info('{} not exists, will be created automatically.'.format(\n WEIGHT_DIR))\n os.makedirs(WEIGHT_DIR)\n if os.path.exists(path):\n return path\n logger.info(\"Download weights of {} from {}\".format(self.name, url))\n wget.download(url, path)\n return path\n def dataloader(self):\n return self.dataloader\n def epoch_num(self):\n \"get train epoch num\"\n return self.cfg.TRAIN.epoch\n def pretrain_info(self):\n \"get pretrain base model directory\"\n return (None, None)\n def get_pretrain_weights(self):\n \"get model weight file path, download weight from Paddle if not exist\"\n path, url = self.pretrain_info()\n if not path:\n return None\n path = os.path.join(WEIGHT_DIR, path)", + "type": "code", + "location": "/applications/VideoTag/models/model.py:107-139" + }, + "3487": { + "file_id": 297, + "content": "This code defines several methods for a model class. The `get_weights` method returns the weight file path, downloading it from Paddle if it doesn't exist. The `dataloader` method returns the dataloader object. The `epoch_num` method returns the train epoch number. The `pretrain_info` method returns the pre-trained base model directory. The `get_pretrain_weights` method returns the weight file path, downloading it from Paddle if necessary.", + "type": "comment" + }, + "3488": { + "file_id": 297, + "content": " if not os.path.isdir(WEIGHT_DIR):\n logger.info('{} not exists, will be created automatically.'.format(\n WEIGHT_DIR))\n os.makedirs(WEIGHT_DIR)\n if os.path.exists(path):\n return path\n logger.info(\"Download pretrain weights of {} from {}\".format(\n self.name, url))\n download(url, path)\n return path\n def load_pretrain_params(self, exe, pretrain, prog):\n logger.info(\"Load pretrain weights from {}\".format(pretrain))\n state_dict = paddle.static.load_program_state(pretrain)\n paddle.static.set_program_state(prog, state_dict)\n def load_test_weights(self, exe, weights, prog):\n params_list = list(filter(is_parameter, prog.list_vars()))\n static.load(prog, weights, executor=exe, var_list=params_list)\n def get_config_from_sec(self, sec, item, default=None):\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ModelZoo(object):", + "type": "code", + "location": "/applications/VideoTag/models/model.py:140-167" + }, + "3489": { + "file_id": 297, + "content": "The code includes functions for handling model weights. 
It checks if a directory exists, downloads pretrain weights if necessary, loads pretrain and test weights into programs, and retrieves configuration from a config file.", + "type": "comment" + }, + "3490": { + "file_id": 297, + "content": " def __init__(self):\n self.model_zoo = {}\n def regist(self, name, model):\n assert model.__base__ == ModelBase, \"Unknow model type {}\".format(\n type(model))\n self.model_zoo[name] = model\n def get(self, name, cfg, mode='train', is_videotag=False):\n for k, v in self.model_zoo.items():\n if k.upper() == name.upper():\n return v(name, cfg, mode, is_videotag)\n raise ModelNotFoundError(name, self.model_zoo.keys())\n# singleton model_zoo\nmodel_zoo = ModelZoo()\ndef regist_model(name, model):\n model_zoo.regist(name, model)\ndef get_model(name, cfg, mode='train', is_videotag=False):\n return model_zoo.get(name, cfg, mode, is_videotag)", + "type": "code", + "location": "/applications/VideoTag/models/model.py:168-192" + }, + "3491": { + "file_id": 297, + "content": "This code defines a ModelZoo class for managing different models, allowing registration and retrieval of models based on their names. The get() function returns the model instance with the specified name, while regist() registers new model classes to the ModelZoo. The get_model() and regist_model() functions provide convenient methods to interact with the singleton ModelZoo instance.", + "type": "comment" + }, + "3492": { + "file_id": 298, + "content": "/applications/VideoTag/models/tsn/__init__.py", + "type": "filepath" + }, + "3493": { + "file_id": 298, + "content": "This code imports all modules and functions from the \"tsn\" subdirectory within the current package, allowing easy access to those components in other parts of the code. This is commonly used for modularity and organization in larger projects.", + "type": "summary" + }, + "3494": { + "file_id": 298, + "content": "from .tsn import *", + "type": "code", + "location": "/applications/VideoTag/models/tsn/__init__.py:1-1" + }, + "3495": { + "file_id": 298, + "content": "This code imports all modules and functions from the \"tsn\" subdirectory within the current package, allowing easy access to those components in other parts of the code. This is commonly used for modularity and organization in larger projects.", + "type": "comment" + }, + "3496": { + "file_id": 299, + "content": "/applications/VideoTag/models/tsn/tsn.py", + "type": "filepath" + }, + "3497": { + "file_id": 299, + "content": "This code initializes a TSN model class and sets parameters for segmentation, training, image statistics, layers, epochs, video data, and optimizer. It defines a VideoTag model with train, test, and infer modes, updating parameters and excluding the final layer for pre-trained weights.", + "type": "summary" + }, + "3498": { + "file_id": 299, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom ..model import ModelBase\nfrom .tsn_res_model import TSN_ResNet\nimport logging\nimport paddle\nimport paddle.static as static\nlogger = logging.getLogger(__name__)\n__all__ = [\"TSN\"]\nclass TSN(ModelBase):\n def __init__(self, name, cfg, mode='train', is_videotag=False):\n super(TSN, self).__init__(name, cfg, mode=mode)\n self.is_videotag = is_videotag\n self.get_config()\n def get_config(self):\n self.num_classes = self.get_config_from_sec('model', 'num_classes')", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn.py:1-34" + }, + "3499": { + "file_id": 299, + "content": "This code imports necessary modules and defines a class TSN that extends the ModelBase class. The class takes parameters such as name, configuration, mode and is_videotag. It also has a method get_config that fetches the model configuration from the given section.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/35.json b/docs/data/35.json new file mode 100644 index 000000000..d9afe0651 --- /dev/null +++ b/docs/data/35.json @@ -0,0 +1,544 @@ +{ + "3500": { + "file_id": 299, + "content": " self.seg_num = self.get_config_from_sec('model', 'seg_num')\n self.seglen = self.get_config_from_sec('model', 'seglen')\n self.image_mean = self.get_config_from_sec('model', 'image_mean')\n self.image_std = self.get_config_from_sec('model', 'image_std')\n self.num_layers = self.get_config_from_sec('model', 'num_layers')\n self.num_epochs = self.get_config_from_sec('train', 'epoch')\n self.total_videos = self.get_config_from_sec('train', 'total_videos')\n self.base_learning_rate = self.get_config_from_sec(\n 'train', 'learning_rate')\n self.learning_rate_decay = self.get_config_from_sec(\n 'train', 'learning_rate_decay')\n self.l2_weight_decay = self.get_config_from_sec('train',\n 'l2_weight_decay')\n self.momentum = self.get_config_from_sec('train', 'momentum')\n self.seg_num = self.get_config_from_sec(self.mode, 'seg_num',\n self.seg_num)", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn.py:35-52" + }, + "3501": { + "file_id": 299, + "content": "This code initializes various parameters for the TSN model. 
It sets segment number, segment length, image mean and standard deviation, number of layers, training epochs, total videos, learning rate, learning rate decay, L2 weight decay, and momentum using get_config_from_sec method.", + "type": "comment" + }, + "3502": { + "file_id": 299, + "content": " self.target_size = self.get_config_from_sec(self.mode, 'target_size')\n self.batch_size = self.get_config_from_sec(self.mode, 'batch_size')\n def build_input(self, use_dataloader=True):\n image_shape = [3, self.target_size, self.target_size]\n image_shape[0] = image_shape[0] * self.seglen\n image_shape = [None, self.seg_num] + image_shape\n self.use_dataloader = use_dataloader\n image = static.data(name='image', shape=image_shape, dtype='float32')\n if self.mode != 'infer':\n label = static.data(name='label', shape=[None, 1], dtype='int64')\n else:\n label = None\n if use_dataloader:\n assert self.mode != 'infer', \\\n 'dataloader is not recommendated when infer, please set use_dataloader to be false.'\n self.dataloader = paddle.io.DataLoader.from_generator(\n feed_list=[image, label], capacity=4, iterable=True)\n self.feature_input = [image]\n self.label_input = label", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn.py:53-75" + }, + "3503": { + "file_id": 299, + "content": "The code initializes the target size and batch size, then defines a `build_input` function to create data tensors for the model's inputs. It generates image and label tensors with specified shapes and data types, and optionally creates a DataLoader for handling data if not in infer mode. The feature and label inputs are stored as separate lists.", + "type": "comment" + }, + "3504": { + "file_id": 299, + "content": " def create_model_args(self):\n cfg = {}\n cfg['layers'] = self.num_layers\n cfg['class_dim'] = self.num_classes\n cfg['seg_num'] = self.seg_num\n return cfg\n def build_model(self):\n cfg = self.create_model_args()\n videomodel = TSN_ResNet(layers=cfg['layers'],\n seg_num=cfg['seg_num'],\n is_training=(self.mode == 'train'),\n is_extractor=self.is_videotag)\n out = videomodel.net(input=self.feature_input[0],\n class_dim=cfg['class_dim'])\n self.network_outputs = [out]\n def optimizer(self):\n assert self.mode == 'train', \"optimizer only can be get in train mode\"\n epoch_points = [self.num_epochs / 3, self.num_epochs * 2 / 3]\n total_videos = self.total_videos\n step = int(total_videos / self.batch_size + 1)\n bd = [e * step for e in epoch_points]\n base_lr = self.base_learning_rate\n lr_decay = self.learning_rate_decay", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn.py:77-101" + }, + "3505": { + "file_id": 299, + "content": "The code defines a model with configurable parameters and builds the model instance. 
It also includes an optimizer function that adjusts learning rate based on epoch points and total videos.", + "type": "comment" + }, + "3506": { + "file_id": 299, + "content": " lr = [base_lr, base_lr * lr_decay, base_lr * lr_decay * lr_decay]\n l2_weight_decay = self.l2_weight_decay\n momentum = self.momentum\n optimizer = paddle.optimizer.Momentum(\n learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,\n values=lr),\n momentum=momentum,\n weight_decay=paddle.regularizer.L2Decay(coeff=l2_weight_decay))\n return optimizer\n def loss(self):\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \\\n label=self.label_input, ignore_index=-1)\n self.loss_ = paddle.mean(x=cost)\n return self.loss_\n def outputs(self):\n return self.network_outputs\n def feeds(self):\n return self.feature_input if self.mode == 'infer' else self.feature_input + [\n self.label_input\n ]\n def fetches(self):\n if self.mode == 'train' or self.mode == 'valid':", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn.py:102-129" + }, + "3507": { + "file_id": 299, + "content": "This code defines a model for the VideoTag application. It creates an optimizer with a piecewise learning rate decay and L2 weight decay, calculates the loss using cross entropy, updates the loss value, returns the network outputs, and handles feeds and fetches based on the mode (train, valid or infer).", + "type": "comment" + }, + "3508": { + "file_id": 299, + "content": " losses = self.loss()\n fetch_list = [losses, self.network_outputs[0], self.label_input]\n elif self.mode == 'test':\n losses = self.loss()\n fetch_list = [losses, self.network_outputs[0], self.label_input]\n elif self.mode == 'infer':\n fetch_list = self.network_outputs\n else:\n raise NotImplementedError('mode {} not implemented'.format(\n self.mode))\n return fetch_list\n def pretrain_info(self):\n return None, None\n def weights_info(self):\n return None\n def load_pretrain_params(self, exe, pretrain, prog):\n def is_parameter(var):\n return isinstance(var, paddle.framework.Parameter)\n logger.info(\n \"Load pretrain weights from {}, exclude fc layer.\".format(pretrain))\n print(\"===pretrain===\", pretrain)\n state_dict = paddle.static.load_program_state(pretrain)\n dict_keys = list(state_dict.keys())\n # remove fc layer when pretrain, because the number of classes in final fc may not match", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn.py:130-159" + }, + "3509": { + "file_id": 299, + "content": "This code defines a model with three modes: train, test, and infer. It returns the losses, network outputs, and label inputs in train and test modes, while only returning network outputs in infer mode. The function pretrain_info() returns no information, weights_info() also returns no info, and load_pretrain_params() loads pre-trained weights from a specific file while excluding the final fully connected (fc) layer.", + "type": "comment" + }, + "3510": { + "file_id": 299, + "content": " for name in dict_keys:\n if \"fc_0\" in name:\n del state_dict[name]\n print('Delete {} from pretrained parameters. 
Do not load it'.\n format(name))\n paddle.static.set_program_state(prog, state_dict)", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn.py:160-165" + }, + "3511": { + "file_id": 299, + "content": "The code is deleting specific keys from the pretrained parameters and then setting the program state with the updated dictionary. This could be done to avoid loading unnecessary or conflicting parameters during the model's execution.", + "type": "comment" + }, + "3512": { + "file_id": 300, + "content": "/applications/VideoTag/models/tsn/tsn_res_model.py", + "type": "filepath" + }, + "3513": { + "file_id": 300, + "content": "This code defines a `TSN_ResNet` class for creating Temporal Segment Network ResNet models in PaddlePaddle, using bottleneck_block function and performs adaptive average pooling, reshaping, and activation functions.", + "type": "summary" + }, + "3514": { + "file_id": 300, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport time\nimport sys\nimport paddle\nimport paddle.static as static\nimport math\nclass TSN_ResNet():\n def __init__(self,\n layers=50,\n seg_num=7,\n is_training=True,\n is_extractor=False):\n self.layers = layers\n self.seg_num = seg_num\n self.is_training = is_training\n self.is_extractor = is_extractor\n def conv_bn_layer(self,", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn_res_model.py:1-34" + }, + "3515": { + "file_id": 300, + "content": "This code defines a class `TSN_ResNet` for creating a Temporal Segment Network ResNet model. It has parameters such as layers, segment number, training flag and extractor flag. The class contains a method `conv_bn_layer()` to create a convolution-batch normalization layer.", + "type": "comment" + }, + "3516": { + "file_id": 300, + "content": " input,\n num_filters,\n filter_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n conv = paddle.static.nn.conv2d(\n input=input,\n num_filters=num_filters,\n filter_size=filter_size,\n stride=stride,\n padding=(filter_size - 1) // 2,\n groups=groups,\n param_attr=paddle.ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n return paddle.static.nn.batch_norm(\n input=conv,\n act=act,\n is_test=(not self.is_training),\n param_attr=paddle.ParamAttr(name=bn_name + \"_scale\"),\n bias_attr=paddle.ParamAttr(bn_name + '_offset'),\n moving_mean_name=bn_name + \"_mean\",\n moving_variance_name=bn_name + '_variance')", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn_res_model.py:35-63" + }, + "3517": { + "file_id": 300, + "content": "This function defines a convolutional layer and returns it after passing through a batch normalization layer. It takes input, number of filters, filter size, stride, groups (number of groups in the layers), activation function if any, and name as arguments. 
If the name is \"conv1\", the bn_name would be \"bn_conv1\" else, it would be \"bn0\", followed by the original name. The batch normalization layer takes input, activation function if any, whether it's in test mode or not, scale and offset attribute names for parameters, and names for moving mean and variance.", + "type": "comment" + }, + "3518": { + "file_id": 300, + "content": " def shortcut(self, input, ch_out, stride, name):\n ch_in = input.shape[1]\n if ch_in != ch_out or stride != 1:\n return self.conv_bn_layer(input, ch_out, 1, stride, name=name)\n else:\n return input\n def bottleneck_block(self, input, num_filters, stride, name):\n conv0 = self.conv_bn_layer(input=input,\n num_filters=num_filters,\n filter_size=1,\n act='relu',\n name=name + \"_branch2a\")\n conv1 = self.conv_bn_layer(input=conv0,\n num_filters=num_filters,\n filter_size=3,\n stride=stride,\n act='relu',\n name=name + \"_branch2b\")\n conv2 = self.conv_bn_layer(input=conv1,\n num_filters=num_filters * 4,\n filter_size=1,", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn_res_model.py:65-86" + }, + "3519": { + "file_id": 300, + "content": "This code defines two functions: 'shortcut' and 'bottleneck_block'. The shortcut function determines if input dimensions match the desired output, and returns either a convolution-batch normalization layer or the input itself. The bottleneck_block function applies two consecutive 1x1 and 3x3 convolutions with batch normalization and ReLU activations in between.", + "type": "comment" + }, + "3520": { + "file_id": 300, + "content": " act=None,\n name=name + \"_branch2c\")\n short = self.shortcut(input,\n num_filters * 4,\n stride,\n name=name + \"_branch1\")\n return paddle.add(x=short, y=conv2)\n def net(self, input, class_dim=101):\n layers = self.layers\n seg_num = self.seg_num\n supported_layers = [50, 101, 152]\n assert layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(supported_layers, layers)\n # reshape input\n channels = input.shape[2]\n short_size = input.shape[3]\n input = paddle.reshape(\n x=input, shape=[-1, channels, short_size, short_size])\n if layers == 50:\n depth = [3, 4, 6, 3]\n elif layers == 101:\n depth = [3, 4, 23, 3]\n elif layers == 152:\n depth = [3, 8, 36, 3]\n num_filters = [64, 128, 256, 512]\n conv = self.conv_bn_layer(input=input,", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn_res_model.py:87-118" + }, + "3521": { + "file_id": 300, + "content": "The code defines a function `net` that takes an input, performs operations based on the specified number of layers (50, 101 or 152), and reshapes the input. It then applies different configurations of convolutional and batch normalization layers to the input for each specified layer. 
The final output is the sum of two previous calculations (conv and short).", + "type": "comment" + }, + "3522": { + "file_id": 300, + "content": " num_filters=64,\n filter_size=7,\n stride=2,\n act='relu',\n name='conv1')\n conv = paddle.nn.functional.max_pool2d(x=conv,\n kernel_size=3,\n stride=2,\n padding=1)\n for block in range(len(depth)):\n for i in range(depth[block]):\n if layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n conv = self.bottleneck_block(\n input=conv,\n num_filters=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1,", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn_res_model.py:119-142" + }, + "3523": { + "file_id": 300, + "content": "This code defines a ResNet model with multiple convolutional layers and pooling operations. It uses the PaddlePaddle library and includes a bottleneck_block function for the residual blocks. The number of filters, filter size, and stride are defined based on the layer and depth.", + "type": "comment" + }, + "3524": { + "file_id": 300, + "content": " name=conv_name)\n pool = paddle.nn.functional.adaptive_avg_pool2d(x=conv, output_size=1)\n feature = paddle.reshape(x=pool,\n shape=[-1, seg_num, pool.shape[1]])\n if self.is_extractor:\n out = feature\n else:\n out = paddle.mean(x=feature, axis=1)\n stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)\n out = static.nn.fc(\n x=out,\n size=class_dim,\n activation='softmax',\n weight_attr=paddle.ParamAttr(\n initializer=paddle.nn.initializer.Uniform(low=-stdv, high=stdv)))\n return out", + "type": "code", + "location": "/applications/VideoTag/models/tsn/tsn_res_model.py:143-161" + }, + "3525": { + "file_id": 300, + "content": "This code performs adaptive average pooling, reshapes the feature map, and if not an extractor, calculates the mean along axis 1. Then, it applies a softmax activation function and returns the output.", + "type": "comment" + }, + "3526": { + "file_id": 301, + "content": "/applications/VideoTag/models/utils.py", + "type": "filepath" + }, + "3527": { + "file_id": 301, + "content": "This code imports modules, defines decompressing and downloading functions, ensures directory existence, deletes downloaded files post-decompression, and includes an AttrDict class for attribute access.", + "type": "summary" + }, + "3528": { + "file_id": 301, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport wget\nimport tarfile\n__all__ = ['decompress', 'download', 'AttrDict']\ndef decompress(path):\n t = tarfile.open(path)\n t.extractall(path=os.path.split(path)[0])\n t.close()\n os.remove(path)\ndef download(url, path):\n weight_dir = os.path.split(path)[0]\n if not os.path.exists(weight_dir):\n os.makedirs(weight_dir)\n path = path + \".tar.gz\"\n wget.download(url, path)\n decompress(path)", + "type": "code", + "location": "/applications/VideoTag/models/utils.py:1-36" + }, + "3529": { + "file_id": 301, + "content": "The code imports necessary modules and defines functions for decompressing and downloading files. It also ensures a directory exists before attempting to download a file, then deletes the downloaded file after decompression.", + "type": "comment" + }, + "3530": { + "file_id": 301, + "content": "class AttrDict(dict):\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value", + "type": "code", + "location": "/applications/VideoTag/models/utils.py:39-47" + }, + "3531": { + "file_id": 301, + "content": "This code defines an AttrDict class, which is a subclass of dict with additional getattr and setattr methods for accessing and modifying its elements as attributes.", + "type": "comment" + }, + "3532": { + "file_id": 302, + "content": "/applications/VideoTag/predict.py", + "type": "filepath" + }, + "3533": { + "file_id": 302, + "content": "The script initializes a larger application, sets up logging and imports modules, predicts video tags using PaddleVideo's models, configures parameters, builds the model, prepares inputs/outputs, runs inference, checks file existence, retrieves infer reader, sets up data feeder, fetches model outputs, collects results with video IDs, logs/saves average processing time, and checks GPU availability and version compatibility before running.", + "type": "summary" + }, + "3534": { + "file_id": 302, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport numpy as np\nimport paddle\nimport paddle.static as static\ntry:\n import cPickle as pickle\nexcept:\n import pickle\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'", + "type": "code", + "location": "/applications/VideoTag/predict.py:1-37" + }, + "3535": { + "file_id": 302, + "content": "This code appears to be an import and initialization script for a larger application. It sets up logging, imports various modules and libraries, checks the CUDA availability, and performs version checking. The code also includes licensing information and copyright notices.", + "type": "comment" + }, + "3536": { + "file_id": 302, + "content": "logging.basicConfig(level=logging.DEBUG, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument(\n '--weights',\n type=str,\n default='./data/checkpoints/AttentionLSTM_epoch9.pdparams',\n help='weight path.')\n parser.add_argument('--batch_size',\n type=int,\n default=1,\n help='sample number in a batch for inference.')", + "type": "code", + "location": "/applications/VideoTag/predict.py:38-64" + }, + "3537": { + "file_id": 302, + "content": "The code imports logging, sets up a logger with debug level and configures the format. 
It then defines a function 'parse_args' that uses argparse to set default values for model name, config file path, whether to use GPU or not, weight path, and batch size for inference.", + "type": "comment" + }, + "3538": { + "file_id": 302, + "content": " parser.add_argument('--filelist',\n type=str,\n default=None,\n help='path to inferenece data file lists file.')\n parser.add_argument('--log_interval',\n type=int,\n default=1,\n help='mini-batch interval to log.')\n parser.add_argument('--infer_topk',\n type=int,\n default=20,\n help='topk predictions to restore.')\n parser.add_argument('--save_dir',\n type=str,\n default=os.path.join('data', 'predict_results',\n 'attention_lstm'),\n help='directory to store results')\n parser.add_argument('--video_path',\n type=str,\n default=None,\n help='directory to store results')\n parser.add_argument('--label_file',\n type=str,", + "type": "code", + "location": "/applications/VideoTag/predict.py:65-87" + }, + "3539": { + "file_id": 302, + "content": "This code snippet is part of a Python script for video tag prediction. It uses an argument parser to specify input files, log intervals, top k predictions and output directory. The default directories and paths are provided if no arguments are specified by the user.", + "type": "comment" + }, + "3540": { + "file_id": 302, + "content": " default='label_3396.txt',\n help='chinese label file path')\n args = parser.parse_args()\n return args\ndef infer(args):\n # parse config\n config = parse_config(args.config)\n infer_config = merge_configs(config, 'infer', vars(args))\n print_configs(infer_config, \"Infer\")\n infer_model = models.get_model(args.model_name, infer_config, mode='infer')\n infer_model.build_input(use_dataloader=False)\n infer_model.build_model()\n infer_feeds = infer_model.feeds()\n infer_outputs = infer_model.outputs()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(static.default_startup_program())\n filelist = args.filelist or infer_config.INFER.filelist\n filepath = args.video_path or infer_config.INFER.get('filepath', '')\n if filepath != '':\n assert os.path.exists(filepath), \"{} not exist.\".format(filepath)\n else:\n assert os.path.exists(filelist), \"{} not exist.\".format(filelist)", + "type": "code", + "location": "/applications/VideoTag/predict.py:88-115" + }, + "3541": { + "file_id": 302, + "content": "The code defines a function that takes arguments, parses the config file, and builds an inference model using PaddleVideo's models. It then builds the inputs and outputs of the model, sets up the Executor based on GPU availability, and runs the startup program. 
Finally, it checks if the video or filelist path exists before proceeding with the inference process.", + "type": "comment" + }, + "3542": { + "file_id": 302, + "content": " # get infer reader\n infer_reader = get_reader(args.model_name.upper(), 'infer', infer_config)\n if args.weights:\n assert os.path.exists(\n args.weights), \"Given weight dir {} not exist.\".format(args.weights)\n # if no weight files specified, download weights from paddle\n weights = args.weights or infer_model.get_weights()\n infer_model.load_test_weights(exe, weights, static.default_main_program())\n infer_feeder = paddle.fluid.DataFeeder(place=place, feed_list=infer_feeds)\n fetch_list = infer_model.fetches()\n infer_metrics = get_metrics(args.model_name.upper(), 'infer', infer_config)\n infer_metrics.reset()\n periods = []\n cur_time = time.time()\n for infer_iter, data in enumerate(infer_reader()):\n data_feed_in = [items[:-1] for items in data]\n video_id = [items[-1] for items in data]\n infer_outs = exe.run(fetch_list=fetch_list,\n feed=infer_feeder.feed(data_feed_in))\n infer_result_list = [item for item in infer_outs] + [video_id]", + "type": "code", + "location": "/applications/VideoTag/predict.py:117-141" + }, + "3543": { + "file_id": 302, + "content": "This code retrieves an infer reader, checks and loads weights for the model, sets up a data feeder, fetches outputs from the model, and collects results with video IDs.", + "type": "comment" + }, + "3544": { + "file_id": 302, + "content": " prev_time = cur_time\n cur_time = time.time()\n period = cur_time - prev_time\n periods.append(period)\n infer_metrics.accumulate(infer_result_list)\n if args.log_interval > 0 and infer_iter % args.log_interval == 0:\n logger.info('Processed {} samples'.format(\n (infer_iter + 1) * len(video_id)))\n logger.info('[INFER] infer finished. average time: {}'.format(\n np.mean(periods)))\n if not os.path.isdir(args.save_dir):\n os.makedirs(args.save_dir)\n infer_metrics.finalize_and_log_out(savedir=args.save_dir,\n label_file=args.label_file)\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n infer(args)", + "type": "code", + "location": "/applications/VideoTag/predict.py:143-171" + }, + "3545": { + "file_id": 302, + "content": "The code calculates the average processing time for each sample, logs the information, and saves the final output. It uses a log interval to report progress, and the `infer_metrics` object accumulates data for logging and saving. The code also checks for GPU availability and version compatibility before running.", + "type": "comment" + }, + "3546": { + "file_id": 303, + "content": "/applications/VideoTag/reader/__init__.py", + "type": "filepath" + }, + "3547": { + "file_id": 303, + "content": "This code imports and registers two reader classes, FeatureReader and KineticsReader, with the names \"ATTENTIONLSTM\" and \"TSN\", respectively. 
The registration occurs in alphabetical order.", + "type": "summary" + }, + "3548": { + "file_id": 303, + "content": "from .reader_utils import regist_reader, get_reader\nfrom .feature_reader import FeatureReader\nfrom .kinetics_reader import KineticsReader\n# regist reader, sort by alphabet\nregist_reader(\"ATTENTIONLSTM\", FeatureReader)\nregist_reader(\"TSN\", KineticsReader)", + "type": "code", + "location": "/applications/VideoTag/reader/__init__.py:1-7" + }, + "3549": { + "file_id": 303, + "content": "This code imports and registers two reader classes, FeatureReader and KineticsReader, with the names \"ATTENTIONLSTM\" and \"TSN\", respectively. The registration occurs in alphabetical order.", + "type": "comment" + }, + "3550": { + "file_id": 304, + "content": "/applications/VideoTag/reader/feature_reader.py", + "type": "filepath" + }, + "3551": { + "file_id": 304, + "content": "The DataReader class handles YouTube-8M dataset using LSTM models, pickle for data loading, and supports various Python versions. It sets batch size, shuffles files if training mode is on, and reads video frames with labels/filenames into batches using one-hot encoding.", + "type": "summary" + }, + "3552": { + "file_id": 304, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport sys\nfrom .reader_utils import DataReader\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport random\npython_ver = sys.version_info\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm\n dataset cfg: num_classes", + "type": "code", + "location": "/applications/VideoTag/reader/feature_reader.py:1-34" + }, + "3553": { + "file_id": 304, + "content": "This code is for a DataReader class that handles the youtube-8M dataset. The features are extracted by prior networks, specifically for LSTM models. 
It uses pickle to load data and BytesIO for compatibility with different python versions.", + "type": "comment" + }, + "3554": { + "file_id": 304, + "content": " batch_size\n list\n \"\"\"\n def __init__(self, name, mode, cfg):\n self.name = name\n self.mode = mode\n self.num_classes = cfg.MODEL.num_classes\n # set batch size and file list\n self.batch_size = cfg[mode.upper()]['batch_size']\n self.filelist = cfg[mode.upper()]['filelist']\n self.seg_num = cfg.MODEL.get('seg_num', None)\n def create_reader(self):\n fl = open(self.filelist).readlines()\n fl = [line.strip() for line in fl if line.strip() != '']\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n batch_out = []\n for item in fl:\n fileinfo = item.split(' ')\n filepath = fileinfo[0]\n rgb = np.load(filepath, allow_pickle=True)\n nframes = rgb.shape[0]\n label = [int(i) for i in fileinfo[1:]]\n one_hot_label = make_one_hot(label, self.num_classes)\n if self.mode != 'infer':", + "type": "code", + "location": "/applications/VideoTag/reader/feature_reader.py:35-64" + }, + "3555": { + "file_id": 304, + "content": "Initializes a feature reader object with specified name, mode and configuration. Sets batch size and file list from the configuration. Reads the file list, removes empty lines and shuffles if in training mode. Defines a nested function reader that iterates over each item in the file list, loads corresponding RGB data and labels, converts labels to one-hot format if not in inference mode, and returns them as batch outputs.", + "type": "comment" + }, + "3556": { + "file_id": 304, + "content": " batch_out.append((rgb, one_hot_label))\n else:\n batch_out.append((rgb, filepath.split('/')[-1]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n return reader\ndef make_one_hot(label, dim=3862):\n one_hot_label = np.zeros(dim)\n one_hot_label = one_hot_label.astype(float)\n for ind in label:\n one_hot_label[int(ind)] = 1\n return one_hot_label", + "type": "code", + "location": "/applications/VideoTag/reader/feature_reader.py:65-80" + }, + "3557": { + "file_id": 304, + "content": "This code reads video frames and their labels/filenames into batches, using one-hot encoding for label conversion. The make_one_hot function creates a one-hot encoded vector from the given label.", + "type": "comment" + }, + "3558": { + "file_id": 305, + "content": "/applications/VideoTag/reader/kinetics_reader.py", + "type": "filepath" + }, + "3559": { + "file_id": 305, + "content": "The code introduces a \"KineticsReader\" class to efficiently read Kinetics dataset in mp4 and pkl formats, applying data augmentation for image/video classification tasks. It generates images for multi-threaded processing, and selects frames based on parameters for training or testing mode.", + "type": "summary" + }, + "3560": { + "file_id": 305, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport cv2\nimport math\nimport random\nimport functools\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport paddle\nfrom PIL import Image, ImageEnhance\nimport logging\nfrom .reader_utils import DataReader\nlogger = logging.getLogger(__name__)\npython_ver = sys.version_info\nclass VideoRecord(object):\n '''\n define a class method which used to describe the frames information of videos", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:1-41" + }, + "3561": { + "file_id": 305, + "content": "This code is from the PaddleVideo library's VideoTag application, specifically the kinetics_reader.py file. It imports necessary modules, defines a VideoRecord class to describe frames information of videos, and includes license and version details. The code seems to be part of a video processing framework for machine learning tasks, potentially in image or video classification.", + "type": "comment" + }, + "3562": { + "file_id": 305, + "content": " 1. self._data[0] is the frames' path\n 2. self._data[1] is the number of frames\n 3. self._data[2] is the label of frames\n '''\n def __init__(self, row):\n self._data = row\n @property\n def path(self):\n return self._data[0]\n @property\n def num_frames(self):\n return int(self._data[1])\n @property\n def label(self):\n return int(self._data[2])\nclass KineticsReader(DataReader):\n \"\"\"\n Data reader for kinetics dataset of two format mp4 and pkl.\n 1. mp4, the original format of kinetics400\n 2. pkl, the mp4 was decoded previously and stored as pkl\n In both case, load the data, and then get the frame data in the form of numpy and label as an integer.\n dataset cfg: format\n num_classes\n seg_num\n short_size\n target_size\n num_reader_threads\n buf_size\n image_mean\n image_std\n batch_size\n list\n \"\"\"", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:42-79" + }, + "3563": { + "file_id": 305, + "content": "This code defines a class \"KineticsReader\" for reading the Kinetics dataset in two formats: mp4 and pkl. It initializes with a row of data containing the frames' path, number of frames, and label. The class has properties for accessing these data elements. 
The code also specifies dataset configuration options.", + "type": "comment" + }, + "3564": { + "file_id": 305, + "content": " def __init__(self, name, mode, cfg):\n super(KineticsReader, self).__init__(name, mode, cfg)\n self.format = cfg.MODEL.format\n self.num_classes = self.get_config_from_sec('model', 'num_classes')\n self.seg_num = self.get_config_from_sec('model', 'seg_num')\n self.seglen = self.get_config_from_sec('model', 'seglen')\n self.seg_num = self.get_config_from_sec(mode, 'seg_num', self.seg_num)\n self.short_size = self.get_config_from_sec(mode, 'short_size')\n self.target_size = self.get_config_from_sec(mode, 'target_size')\n self.num_reader_threads = self.get_config_from_sec(\n mode, 'num_reader_threads')\n self.buf_size = self.get_config_from_sec(mode, 'buf_size')\n self.fix_random_seed = self.get_config_from_sec(mode, 'fix_random_seed')\n self.img_mean = np.array(cfg.MODEL.image_mean).reshape(\n [3, 1, 1]).astype(np.float32)\n self.img_std = np.array(cfg.MODEL.image_std).reshape([3, 1, 1]).astype(\n np.float32)", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:80-98" + }, + "3565": { + "file_id": 305, + "content": "This code initializes an object of the KineticsReader class, which takes in parameters like name, mode, and configuration (cfg). It retrieves various attributes from the configuration, such as number of classes, segmentation information, image sizes, reader threads, buffer size, and random seed. It also sets the mean and standard deviation values for image normalization.", + "type": "comment" + }, + "3566": { + "file_id": 305, + "content": " # set batch size and file list\n self.batch_size = cfg[mode.upper()]['batch_size']\n self.filelist = cfg[mode.upper()]['filelist']\n if self.fix_random_seed:\n random.seed(0)\n np.random.seed(0)\n self.num_reader_threads = 1\n def create_reader(self):\n assert os.path.exists(self.filelist), \\\n '{} not exist, please check the data list'.format(self.filelist)\n _reader = self._reader_creator(self.filelist, self.mode, seg_num=self.seg_num, seglen = self.seglen, \\\n short_size = self.short_size, target_size = self.target_size, \\\n img_mean = self.img_mean, img_std = self.img_std, \\\n shuffle = (self.mode == 'train'), \\\n num_threads = self.num_reader_threads, \\\n buf_size = self.buf_size, format = self.format)\n def _batch_reader():\n batch_out = []\n for imgs, label in _reader():\n if imgs is None:", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:99-121" + }, + "3567": { + "file_id": 305, + "content": "This code sets the batch size and file list for a video reader. 
It also ensures random seeds are set, limits the number of reader threads to 1 if fixing random seed, asserts that the filelist exists, creates a video reader object using a provided creator function, and defines a batch_reader generator function to iterate over the reader's output.", + "type": "comment" + }, + "3568": { + "file_id": 305, + "content": " continue\n batch_out.append((imgs, label))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n return _batch_reader\n def _reader_creator(self,\n file_list,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n shuffle=False,\n num_threads=1,\n buf_size=1024,\n format='frames'):\n def decode_mp4(sample, mode, seg_num, seglen, short_size, target_size,\n img_mean, img_std):\n sample = sample[0].split(' ')\n mp4_path = sample[0]\n if mode == \"infer\":\n label = mp4_path.split('/')[-1]\n else:\n label = int(sample[1])\n try:", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:122-151" + }, + "3569": { + "file_id": 305, + "content": "This code defines a function `_reader_creator` that takes in various parameters and returns another function `decode_mp4`. The returned function reads video frames from MP4 files, extracts labels if necessary, and yields batches of images and labels based on batch size and other specified parameters.", + "type": "comment" + }, + "3570": { + "file_id": 305, + "content": " imgs = mp4_loader(mp4_path, seg_num, seglen, mode)\n if len(imgs) < 1:\n logger.error('{} frame length {} less than 1.'.format(\n mp4_path, len(imgs)))\n return None, None\n except:\n logger.error('Error when loading {}'.format(mp4_path))\n return None, None\n return imgs_transform(imgs, mode, seg_num, seglen, \\\n short_size, target_size, img_mean, img_std, name = self.name), label\n def decode_frames(sample, mode, seg_num, seglen, short_size,\n target_size, img_mean, img_std):\n recode = VideoRecord(sample[0].split(' '))\n frames_dir_path = recode.path\n if mode == \"infer\":\n label = frames_dir_path\n else:\n label = recode.label\n try:\n imgs = frames_loader(recode, seg_num, seglen, mode)\n if len(imgs) < 1:\n logger.error('{} frame length {} less than 1.'.format(", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:152-176" + }, + "3571": { + "file_id": 305, + "content": "This code is defining two functions: `kinetics_reader` and `decode_frames`. The `kinetics_reader` function loads frames from a given MP4 file using `mp4_loader`, applies transformations if necessary, and returns the frames along with their corresponding labels. It also logs an error if the number of frames is less than 1. If an exception occurs during the process, it logs an error message as well. The `decode_frames` function loads frames from a specified directory (specified by the `recode` object) using the `frames_loader` function and returns the frames along with their labels. 
If the number of frames is less than 1, it logs an error; if an exception occurs, it also logs an error.", + "type": "comment" + }, + "3572": { + "file_id": 305, + "content": " frames_dir_path, len(imgs)))\n return None, None\n except:\n logger.error('Error when loading {}'.format(frames_dir_path))\n return None, None\n return imgs_transform(imgs,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n name=self.name), label\n def reader_():\n with open(file_list) as flist:\n lines = [line.strip() for line in flist]\n if shuffle:\n random.shuffle(lines)\n for line in lines:\n file_path = line.strip()\n yield [file_path]\n if format == 'frames':\n decode_func = decode_frames\n elif format == 'video':", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:177-204" + }, + "3573": { + "file_id": 305, + "content": "The code snippet is responsible for loading video frames from a specified directory and handling any errors that may occur during the process. It takes the frames directory path, image format (frames or video), segment number, sequence length, short size, target size, image mean, image standard deviation, and name as input parameters. The code also defines a function reader() to read the file list and shuffle its lines if necessary. Based on the specified format (frames or video), it calls the appropriate decoding function (decode_frames or decode_video).", + "type": "comment" + }, + "3574": { + "file_id": 305, + "content": " decode_func = decode_mp4\n else:\n raise (\"Not implemented format {}\".format(format))\n mapper = functools.partial(decode_func,\n mode=mode,\n seg_num=seg_num,\n seglen=seglen,\n short_size=short_size,\n target_size=target_size,\n img_mean=img_mean,\n img_std=img_std)\n return paddle.reader.decorator.xmap_readers(mapper,\n reader_,\n num_threads,\n buf_size,\n order=True)\ndef imgs_transform(imgs,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n name=''):", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:205-233" + }, + "3575": { + "file_id": 305, + "content": "This code selects a specific video format decoder function based on the input format. If the format is not recognized, it raises an error. 
It then applies transformations to the images using the selected function and returns them with additional functionality for efficient processing with multiple threads.", + "type": "comment" + }, + "3576": { + "file_id": 305, + "content": " imgs = group_scale(imgs, short_size)\n np_imgs = np.array([np.array(img).astype('float32') for img in imgs]) #dhwc\n if mode == 'train':\n np_imgs = group_crop(np_imgs, target_size)\n np_imgs = group_random_flip(np_imgs)\n else:\n np_imgs = group_crop(np_imgs, target_size, is_center=True)\n np_imgs = np_imgs.transpose(0, 3, 1, 2) / 255 #dchw\n np_imgs -= img_mean\n np_imgs /= img_std\n return np_imgs\ndef group_crop(np_imgs, target_size, is_center=True):\n d, h, w, c = np_imgs.shape\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h, target_size)\n if is_center:\n h_off = int(round((h - th) / 2.))\n w_off = int(round((w - tw) / 2.))\n else:\n w_off = random.randint(0, w - tw)\n h_off = random.randint(0, h - th)\n img_crop = np_imgs[:, h_off:h_off + target_size,\n w_off:w_off + target_size, :]\n return img_crop", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:234-266" + }, + "3577": { + "file_id": 305, + "content": "This code reads images from a dataset and performs data augmentation by cropping, flipping, and normalization. It also checks if the image dimensions are larger than the target crop size before applying the crop operation. If in 'train' mode, it randomly crops the image. Otherwise, it centers the crop. The resulting images are then normalized by subtracting the mean pixel values and dividing by standard deviation for feature extraction.", + "type": "comment" + }, + "3578": { + "file_id": 305, + "content": "def group_random_flip(np_imgs):\n prob = random.random()\n if prob < 0.5:\n ret = np_imgs[:, :, ::-1, :]\n return ret\n else:\n return np_imgs\ndef group_scale(imgs, target_size):\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == target_size) or (h <= w and h == target_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = target_size\n oh = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = target_size\n ow = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n return resized_imgs\ndef mp4_loader(filepath, nsample, seglen, mode):\n cap = cv2.VideoCapture(filepath)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n sampledFrames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:269-305" + }, + "3579": { + "file_id": 305, + "content": "The code defines three functions: \"group_random_flip\" flips the image horizontally with 50% probability, \"group_scale\" resizes images to a specified target size while maintaining aspect ratio, and \"mp4_loader\" loads frames from a video file for further processing.", + "type": "comment" + }, + "3580": { + "file_id": 305, + "content": " if ret == False:\n continue\n img = frame[:, :, ::-1]\n sampledFrames.append(img)\n average_dur = int(len(sampledFrames) / nsample)\n imgs = []\n for i in range(nsample):\n idx = 0\n if mode == 'train':\n if average_dur >= seglen:\n idx = random.randint(0, average_dur - seglen)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * 
average_dur\n else:\n idx = i\n else:\n if average_dur >= seglen:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + seglen):\n imgbuf = sampledFrames[int(jj % len(sampledFrames))]\n img = Image.fromarray(imgbuf, mode='RGB')\n imgs.append(img)\n return imgs\ndef frames_loader(recode, nsample, seglen, mode):\n imgpath, num_frames = recode.path, recode.num_frames", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:306-340" + }, + "3581": { + "file_id": 305, + "content": "This code reads video frames and selects a subset of them based on the provided parameters. It appends each frame in the specified sequence to sampledFrames, calculates average duration, then extracts the required number of frames with a given segment length from the list. The extracted frames are returned at the end.", + "type": "comment" + }, + "3582": { + "file_id": 305, + "content": " average_dur = int(num_frames / nsample)\n imgs = []\n for i in range(nsample):\n idx = 0\n if mode == 'train':\n if average_dur >= seglen:\n idx = random.randint(0, average_dur - seglen)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n else:\n if average_dur >= seglen:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + seglen):\n img = Image.open(\n os.path.join(imgpath,\n 'img_{:05d}.jpg'.format(jj + 1))).convert('RGB')\n imgs.append(img)\n return imgs", + "type": "code", + "location": "/applications/VideoTag/reader/kinetics_reader.py:341-367" + }, + "3583": { + "file_id": 305, + "content": "This code calculates the average duration of video frames and then generates a set of images by randomly selecting start points based on the mode (train or test) and segment length. It opens each image file in RGB format, converts it, and adds it to the list of images returned at the end.", + "type": "comment" + }, + "3584": { + "file_id": 306, + "content": "/applications/VideoTag/reader/reader_utils.py", + "type": "filepath" + }, + "3585": { + "file_id": 306, + "content": "The code defines a `ReaderZoo` class with functions for registering and retrieving readers based on their name, mode, and configuration. A custom exception class is defined for reader not found errors.", + "type": "summary" + }, + "3586": { + "file_id": 306, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport pickle\nimport cv2\nimport numpy as np\nimport random\nclass ReaderNotFoundError(Exception):\n \"Error: reader not found\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)", + "type": "code", + "location": "/applications/VideoTag/reader/reader_utils.py:1-31" + }, + "3587": { + "file_id": 306, + "content": "Importing necessary libraries, defining custom exception class for reader not found error.", + "type": "comment" + }, + "3588": { + "file_id": 306, + "content": " for reader in self.avail_readers:\n msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"data reader for video input\"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"Not implemented\"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n def __init__(self):\n self.reader_zoo = {}\n def regist(self, name, reader):\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg):\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg)\n raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo", + "type": "code", + "location": "/applications/VideoTag/reader/reader_utils.py:32-70" + }, + "3589": { + "file_id": 306, + "content": "The code defines a DataReader class for video input and a ReaderZoo class for registering and retrieving readers. The DataReader class has an init method for setting the model name, mode, and configuration, as well as a create_reader method that must be implemented by subclasses. The ReaderZoo class registers readers using the regist method and retrieves them based on name, mode, and configuration with the get method.", + "type": "comment" + }, + "3590": { + "file_id": 306, + "content": "reader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg):\n reader_model = reader_zoo.get(name, mode, cfg)\n return reader_model.create_reader()", + "type": "code", + "location": "/applications/VideoTag/reader/reader_utils.py:71-80" + }, + "3591": { + "file_id": 306, + "content": "This code defines a class `ReaderZoo` and provides two functions `regist_reader` and `get_reader`. 
The `ReaderZoo` is used to register different types of readers and retrieve them based on their name, mode, and configuration.", + "type": "comment" + }, + "3592": { + "file_id": 307, + "content": "/applications/VideoTag/train.py", + "type": "filepath" + }, + "3593": { + "file_id": 307, + "content": "The code initializes models, checks CUDA availability and version, parses command line arguments, trains a video tagging model using data parallelism and saves if needed. The code also checks the version, logs arguments, creates a directory, and proceeds to train using those arguments.", + "type": "summary" + }, + "3594": { + "file_id": 307, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport argparse\nimport ast\nimport logging\nimport paddle\nimport paddle.static as static\nfrom utils.train_utils import train_with_dataloader\nimport models\nfrom utils.config_utils import *\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'", + "type": "code", + "location": "/applications/VideoTag/train.py:1-32" + }, + "3595": { + "file_id": 307, + "content": "This code snippet contains the necessary import statements and license information for the VideoTag application in PaddleVideo. It also sets up the logging format and includes utility functions from other modules such as train_utils, config_utils, reader, metrics, and utility. The code checks if CUDA is available and verifies the PaddlePaddle version before proceeding with the training process.", + "type": "comment" + }, + "3596": { + "file_id": 307, + "content": "logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser(\"Paddle Video train script\")\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument(\n '--batch_size',\n type=int,\n default=None,\n help='training batch size. None to use config file setting.')\n parser.add_argument(\n '--learning_rate',\n type=float,\n default=None,\n help='learning rate use for training. None to use config file setting.')\n parser.add_argument('--pretrain',\n type=str,\n default=None,\n help='path to pretrain weights.')", + "type": "code", + "location": "/applications/VideoTag/train.py:33-60" + }, + "3597": { + "file_id": 307, + "content": "This code block sets up logging, defines a function parse_args which uses argparse to create an argument parser for specifying model name, config file path, batch size, learning rate and pretrain weights. 
It provides default values for these arguments in case they are not specified by the user.", + "type": "comment" + }, + "3598": { + "file_id": 307, + "content": " parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument('--no_memory_optimize',\n action='store_true',\n default=False,\n help='whether to use memory optimize in train')\n parser.add_argument('--epoch',\n type=int,\n default=None,\n help='epoch number, 0 for read from config file')\n parser.add_argument('--valid_interval',\n type=int,\n default=1,\n help='validation epoch interval, 0 for no validation.')\n parser.add_argument('--save_dir',\n type=str,\n default=os.path.join('data', 'checkpoints'),\n help='directory name to save train snapshoot')\n parser.add_argument('--log_interval',\n type=int,", + "type": "code", + "location": "/applications/VideoTag/train.py:61-82" + }, + "3599": { + "file_id": 307, + "content": "The code snippet is parsing command line arguments for a training program. The options include whether to use GPU, disable memory optimization, specify the epoch number, set validation interval, and provide a directory to save training snapshots.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/36.json b/docs/data/36.json new file mode 100644 index 000000000..06a832fe0 --- /dev/null +++ b/docs/data/36.json @@ -0,0 +1,544 @@ +{ + "3600": { + "file_id": 307, + "content": " default=1,\n help='mini-batch interval to log.')\n parser.add_argument('--fix_random_seed',\n type=ast.literal_eval,\n default=False,\n help='If set True, enable continuous evaluation job.')\n args = parser.parse_args()\n return args\ndef train(args):\n # parse config\n config = parse_config(args.config)\n train_config = merge_configs(config, 'train', vars(args))\n valid_config = merge_configs(config, 'valid', vars(args))\n print_configs(train_config, 'Train')\n train_model = models.get_model(args.model_name, train_config, mode='train')\n valid_model = models.get_model(args.model_name, valid_config, mode='valid')\n # build model\n startup = static.Program()\n train_prog = static.Program()\n if args.fix_random_seed:\n startup.random_seed = 1000\n train_prog.random_seed = 1000\n with static.program_guard(train_prog, startup):\n with paddle.utils.unique_name.guard():", + "type": "code", + "location": "/applications/VideoTag/train.py:83-109" + }, + "3601": { + "file_id": 307, + "content": "This code is parsing command line arguments, loading and merging configuration files, initializing models in training and validation modes, and setting up a static program guard for building the model. 
It also allows the option to fix random seeds for reproducibility.", + "type": "comment" + }, + "3602": { + "file_id": 307, + "content": " train_model.build_input(use_dataloader=True)\n train_model.build_model()\n # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label\n train_feeds = train_model.feeds()\n train_fetch_list = train_model.fetches()\n train_loss = train_fetch_list[0]\n optimizer = train_model.optimizer()\n optimizer.minimize(train_loss)\n train_dataloader = train_model.dataloader()\n valid_prog = static.Program()\n with static.program_guard(valid_prog, startup):\n with paddle.utils.unique_name.guard():\n valid_model.build_input(use_dataloader=True)\n valid_model.build_model()\n valid_feeds = valid_model.feeds()\n valid_fetch_list = valid_model.fetches()\n valid_dataloader = valid_model.dataloader()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(startup)\n if args.pretrain:\n train_model.load_pretrain_params(exe, args.pretrain, train_prog)", + "type": "code", + "location": "/applications/VideoTag/train.py:110-134" + }, + "3603": { + "file_id": 307, + "content": "This code initializes the training and validation models, builds their inputs, models, and feeds. It also sets up the dataloaders, optimizer, and executor for the training phase. If pre-trained parameters are specified, they will be loaded before training starts.", + "type": "comment" + }, + "3604": { + "file_id": 307, + "content": " build_strategy = static.BuildStrategy()\n build_strategy.enable_inplace = True\n exec_strategy = static.ExecutionStrategy()\n compiled_train_prog = static.CompiledProgram(\n train_prog).with_data_parallel(loss_name=train_loss.name,\n build_strategy=build_strategy,\n exec_strategy=exec_strategy)\n compiled_valid_prog = static.CompiledProgram(\n valid_prog).with_data_parallel(share_vars_from=compiled_train_prog,\n build_strategy=build_strategy,\n exec_strategy=exec_strategy)\n # get reader\n bs_denominator = 1\n if args.use_gpu:\n # check number of GPUs\n gpus = os.getenv(\"CUDA_VISIBLE_DEVICES\", \"\")\n if gpus == \"\":\n pass\n else:\n gpus = gpus.split(\",\")\n num_gpus = len(gpus)\n assert num_gpus == train_config.TRAIN.num_gpus, \\\n \"num_gpus({}) set by CUDA_VISIBLE_DEVICES \" \\", + "type": "code", + "location": "/applications/VideoTag/train.py:136-161" + }, + "3605": { + "file_id": 307, + "content": "This code initializes a BuildStrategy and an ExecutionStrategy. It then creates two CompiledPrograms, one for training and one for validation, with data parallelism enabled. 
The number of GPUs is checked using CUDA_VISIBLE_DEVICES environment variable, and the number of GPUs must match what was set in the train configuration file.", + "type": "comment" + }, + "3606": { + "file_id": 307, + "content": " \"shoud be the same as that \" \\\n \"set in {}({})\".format(\n num_gpus, args.config, train_config.TRAIN.num_gpus)\n bs_denominator = train_config.TRAIN.num_gpus\n train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /\n bs_denominator)\n valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /\n bs_denominator)\n train_reader = get_reader(args.model_name.upper(), 'train', train_config)\n valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)\n # get metrics\n train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)\n valid_metrics = get_metrics(args.model_name.upper(), 'valid', valid_config)\n epochs = args.epoch or train_model.epoch_num()\n exe_places = static.cuda_places() if args.use_gpu else static.cpu_places()\n train_dataloader.set_sample_list_generator(train_reader, places=exe_places)", + "type": "code", + "location": "/applications/VideoTag/train.py:162-181" + }, + "3607": { + "file_id": 307, + "content": "Sets batch size based on number of GPUs, initializes train and valid readers, gets metrics for training and validation, sets the sample list generator for dataloader.", + "type": "comment" + }, + "3608": { + "file_id": 307, + "content": " valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places)\n train_with_dataloader(exe,\n train_prog,\n compiled_train_prog,\n train_dataloader,\n train_fetch_list,\n train_metrics,\n epochs=epochs,\n log_interval=args.log_interval,\n valid_interval=args.valid_interval,\n save_dir=args.save_dir,\n save_model_name=args.model_name,\n fix_random_seed=args.fix_random_seed,\n compiled_test_prog=compiled_valid_prog,\n test_dataloader=valid_dataloader,\n test_fetch_list=valid_fetch_list,\n test_metrics=valid_metrics)\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)", + "type": "code", + "location": "/applications/VideoTag/train.py:182-205" + }, + "3609": { + "file_id": 307, + "content": "The code trains a video tagging model using PaddlePaddle framework. It sets the sample list generator for valid data and then calls the train_with_dataloader function with various parameters such as number of epochs, log and validation intervals, and data loaders for training and testing. The function trains and tests the model, saving it if necessary. 
The code also checks whether the installed PaddlePaddle is compiled with GPU support based on the argument provided.", + "type": "comment" + }, + "3610": { + "file_id": 307, + "content": " check_version()\n logger.info(args)\n if not os.path.exists(args.save_dir):\n os.makedirs(args.save_dir)\n train(args)", + "type": "code", + "location": "/applications/VideoTag/train.py:206-212" + }, + "3611": { + "file_id": 307, + "content": "This code snippet checks the version, logs the arguments, creates a directory if it doesn't exist, and then proceeds to train using those arguments.", + "type": "comment" + }, + "3612": { + "file_id": 308, + "content": "/applications/VideoTag/tsn_extractor.py", + "type": "filepath" + }, + "3613": { + "file_id": 308, + "content": "This script sets up a model environment, downloads weights if needed, initializes the Infer model, and runs inference on input videos while saving results and features.", + "type": "summary" + }, + "3614": { + "file_id": 308, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport numpy as np\nimport paddle\nimport paddle.static as static\ntry:\n import cPickle as pickle\nexcept:\n import pickle\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'", + "type": "code", + "location": "/applications/VideoTag/tsn_extractor.py:1-37" + }, + "3615": { + "file_id": 308, + "content": "This code is a Python script with licensing information and import statements. It imports necessary libraries like numpy, paddle, and others for data processing, model training, and evaluation. 
The code also sets up the logging format, and checks for CUDA availability and PaddlePaddle version.", + "type": "comment" + }, + "3616": { + "file_id": 308, + "content": "logging.basicConfig(level=logging.DEBUG, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument(\n '--weights',\n type=str,\n default=None,\n help=\n 'weight path, None to automatically download weights provided by Paddle.'\n )\n parser.add_argument('--batch_size',\n type=int,\n default=1,\n help='sample number in a batch for inference.')", + "type": "code", + "location": "/applications/VideoTag/tsn_extractor.py:38-66" + }, + "3617": { + "file_id": 308, + "content": "This code defines a function `parse_args()` to parse command-line arguments for training a model. The arguments include model name, config file path, whether to use GPU, weight path, and batch size. It uses argparse module for easy argument handling. By default, it sets the model name to 'AttentionCluster', config file path to 'configs/attention_cluster.txt', uses GPU if not specified otherwise, automatically downloads weights from Paddle if no specific path is provided, and sets batch size to 1.", + "type": "comment" + }, + "3618": { + "file_id": 308, + "content": " parser.add_argument('--filelist',\n type=str,\n default='./data/TsnExtractor.list',\n help='path to inferenece data file lists file.')\n parser.add_argument('--log_interval',\n type=int,\n default=1,\n help='mini-batch interval to log.')\n parser.add_argument('--infer_topk',\n type=int,\n default=20,\n help='topk predictions to restore.')\n parser.add_argument('--save_dir',\n type=str,\n default=os.path.join('data', 'tsn_features'),\n help='directory to store tsn feature results')\n parser.add_argument('--video_path',\n type=str,\n default=None,\n help='directory to store results')\n args = parser.parse_args()\n return args\ndef infer(args):\n # parse config\n config = parse_config(args.config)", + "type": "code", + "location": "/applications/VideoTag/tsn_extractor.py:67-93" + }, + "3619": { + "file_id": 308, + "content": "The code defines command line arguments for the TsnExtractor. It sets default values and provides help messages for each argument. The function then parses these arguments to create an 'args' object, which can be used throughout the program. 
Additionally, the 'infer' function is defined; its implementation begins by parsing the model configuration and continues in the next chunk.", + "type": "comment" + }, + "3620": { + "file_id": 308, + "content": " infer_config = merge_configs(config, 'infer', vars(args))\n print_configs(infer_config, \"Infer\")\n infer_model = models.get_model(args.model_name,\n infer_config,\n mode='infer',\n is_videotag=True)\n infer_model.build_input(use_dataloader=False)\n infer_model.build_model()\n infer_feeds = infer_model.feeds()\n infer_outputs = infer_model.outputs()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(static.default_startup_program())\n filelist = args.filelist or infer_config.INFER.filelist\n filepath = args.video_path or infer_config.INFER.get('filepath', '')\n if filepath != '':\n assert os.path.exists(filepath), \"{} not exist.\".format(filepath)\n else:\n assert os.path.exists(filelist), \"{} not exist.\".format(filelist)\n # get infer reader\n infer_reader = get_reader(args.model_name.upper(), 'infer', infer_config)", + "type": "code", + "location": "/applications/VideoTag/tsn_extractor.py:94-118" + }, + "3621": { + "file_id": 308, + "content": "The code initializes the Infer model with provided configurations and merges them to create the infer_config. It then builds the input, model, and gets feeds and outputs for inference. The place and executor are set based on whether or not GPU is used. The filelist and video path are checked for existence before initializing the infer reader with the model name, mode (infer), and configurations.", + "type": "comment" + }, + "3622": { + "file_id": 308, + "content": " if args.weights:\n assert os.path.exists(\n args.weights), \"Given weight dir {} not exist.\".format(args.weights)\n # if no weight files specified, download weights from paddle\n weights = args.weights or infer_model.get_weights()\n infer_model.load_test_weights(exe, weights, static.default_main_program())\n infer_feeder = paddle.fluid.DataFeeder(place=place, feed_list=infer_feeds)\n fetch_list = infer_model.fetches()\n infer_metrics = get_metrics(args.model_name.upper(), 'infer', infer_config)\n infer_metrics.reset()\n if not os.path.isdir(args.save_dir):\n os.makedirs(args.save_dir)\n for infer_iter, data in enumerate(infer_reader()):\n data_feed_in = [items[:-1] for items in data]\n video_id = [items[-1] for items in data]\n bs = len(video_id)\n feature_outs = exe.run(fetch_list=fetch_list,\n feed=infer_feeder.feed(data_feed_in))\n for i in range(bs):\n filename = video_id[i].split('/')[-1][:-4]", + "type": "code", + "location": "/applications/VideoTag/tsn_extractor.py:120-144" + }, + "3623": { + "file_id": 308, + "content": "This code snippet checks if the weights (model parameters) are provided as an argument. If not, it downloads them from Paddle's servers. Then, it loads the weights into the model and creates a DataFeeder for feeding data during inference. It also initializes metrics to measure inference performance. 
The code then iterates over each input video, running inference with the loaded model, and saving the results for each frame.", + "type": "comment" + }, + "3624": { + "file_id": 308, + "content": " np.save(os.path.join(args.save_dir, filename + '.npy'),\n feature_outs[0][i]) #shape: seg_num*feature_dim\n logger.info(\"Feature extraction End~\")\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n infer(args)", + "type": "code", + "location": "/applications/VideoTag/tsn_extractor.py:145-158" + }, + "3625": { + "file_id": 308, + "content": "Saves extracted features from the PaddleVideo/applications/VideoTag/tsn_extractor.py module using numpy's save function, then logs the end of feature extraction and calls infer function with argument args.", + "type": "comment" + }, + "3626": { + "file_id": 309, + "content": "/applications/VideoTag/utils/config_utils.py", + "type": "filepath" + }, + "3627": { + "file_id": 309, + "content": "The code is part of the PaddleVideo framework's VideoTag application, which imports libraries, sets up a logger, and handles YAML configuration. It logs each key-value pair in the config file, separated by dashed lines.", + "type": "summary" + }, + "3628": { + "file_id": 309, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport yaml\nfrom .utility import AttrDict\nimport logging\nlogger = logging.getLogger(__name__)\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n import yaml\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef create_attr_dict(yaml_config):", + "type": "code", + "location": "/applications/VideoTag/utils/config_utils.py:1-37" + }, + "3629": { + "file_id": 309, + "content": "This code snippet is part of the PaddleVideo framework's VideoTag application. 
It imports yaml and AttrDict from utility, sets up a logger for logging messages, defines four configuration section strings, and provides two functions: parse_config() to load config files into an AttrDict object and create_attr_dict() to create an AttrDict object with the specified attributes.", + "type": "comment" + }, + "3630": { + "file_id": 309, + "content": " from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\n return\ndef merge_configs(cfg, sec, args_dict):\n assert sec in CONFIG_SECS, \"invalid config section {}\".format(sec)\n sec_dict = getattr(cfg, sec.upper())\n for k, v in args_dict.items():\n if v is None:\n continue\n try:\n if hasattr(sec_dict, k):\n setattr(sec_dict, k, v)\n except:\n pass\n return cfg\ndef print_configs(cfg, mode):\n logger.info(\n \"---------------- {:>5} Arguments ----------------\".format(mode))\n for sec, sec_items in cfg.items():\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():", + "type": "code", + "location": "/applications/VideoTag/utils/config_utils.py:38-73" + }, + "3631": { + "file_id": 309, + "content": "This code includes three functions. The first function, `config_utils.py`, is for processing the yaml configuration by converting certain types into AttrDicts and evaluating string values. The second function, `merge_configs()`, merges argument dictionaries with pre-existing config section dictionaries. It skips None values and attempts to set new attributes. Finally, the third function, `print_configs()`, prints configuration arguments in a formatted manner.", + "type": "comment" + }, + "3632": { + "file_id": 309, + "content": " logger.info(\" {}:{}\".format(k, v))\n logger.info(\"-------------------------------------------------\")", + "type": "code", + "location": "/applications/VideoTag/utils/config_utils.py:74-75" + }, + "3633": { + "file_id": 309, + "content": "The code is logging information for each key-value pair in the configuration file, and then separating each set of logs with a dashed line.", + "type": "comment" + }, + "3634": { + "file_id": 310, + "content": "/applications/VideoTag/utils/train_utils.py", + "type": "filepath" + }, + "3635": { + "file_id": 310, + "content": "This code uses PaddlePaddle for training, imports modules, defines logging functions, and trains with a dataloader, iterating over batches to track progress, log metrics, profile performance, handle errors, and save model progress.", + "type": "summary" + }, + "3636": { + "file_id": 310, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport numpy as np\nimport paddle\nimport paddle.static as static\nimport paddle.profiler as profiler\nimport logging\nimport shutil\nlogger = logging.getLogger(__name__)\ndef log_lr_and_step():\n try:\n # In optimizers, if learning_rate is set as constant, lr_var\n # name is 'learning_rate_0', and iteration counter is not\n # recorded. If learning_rate is set as decayed values from", + "type": "code", + "location": "/applications/VideoTag/utils/train_utils.py:1-32" + }, + "3637": { + "file_id": 310, + "content": "This code snippet is likely part of a larger program that uses the PaddlePaddle framework. It imports several modules, defines a function called `log_lr_and_step()`, and sets up a logger object. The purpose of this particular block may be to handle logging learning rate values and tracking the training step during the model's optimization process.", + "type": "comment" + }, + "3638": { + "file_id": 310, + "content": " # learning_rate_scheduler, lr_var name is 'learning_rate',\n # and iteration counter is recorded with name '@LR_DECAY_COUNTER@',\n # better impliment is required here\n lr_var = static.global_scope().find_var(\"learning_rate\")\n if not lr_var:\n lr_var = static.global_scope().find_var(\"learning_rate_0\")\n lr = np.array(lr_var.get_tensor())\n lr_count = '[-]'\n lr_count_var = static.global_scope().find_var(\"@LR_DECAY_COUNTER@\")\n if lr_count_var:\n lr_count = np.array(lr_count_var.get_tensor())\n logger.info(\n \"------- learning rate {}, learning rate counter {} -----\".format(\n np.array(lr), np.array(lr_count)))\n except:\n logger.warn(\"Unable to get learning_rate and LR_DECAY_COUNTER.\")\ndef test_with_dataloader(exe,\n compiled_test_prog,\n test_dataloader,\n test_fetch_list,\n test_metrics,\n log_interval=0,", + "type": "code", + "location": "/applications/VideoTag/utils/train_utils.py:33-57" + }, + "3639": { + "file_id": 310, + "content": "This code retrieves the learning rate and learning rate counter from the global scope. 
It prints their values, but handles potential exceptions if they cannot be found or accessed.", + "type": "comment" + }, + "3640": { + "file_id": 310, + "content": " save_model_name=''):\n if not test_dataloader:\n logger.error(\"[TEST] get dataloader failed.\")\n test_metrics.reset()\n test_iter = 0\n for data in test_dataloader():\n test_outs = exe.run(compiled_test_prog,\n fetch_list=test_fetch_list,\n feed=data)\n test_metrics.accumulate(test_outs)\n if log_interval > 0 and test_iter % log_interval == 0:\n test_metrics.calculate_and_log_out(test_outs, \\\n info = '[TEST] test_iter {} '.format(test_iter))\n test_iter += 1\n test_metrics.finalize_and_log_out(\"[TEST] Finish\")\ndef train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \\\n train_fetch_list, train_metrics, epochs = 10, \\\n log_interval = 0, valid_interval = 0, save_dir = './', \\\n num_trainers = 1, trainer_id = 0, \\\n save_model_name = 'model', fix_random_seed = False, \\", + "type": "code", + "location": "/applications/VideoTag/utils/train_utils.py:58-80" + }, + "3641": { + "file_id": 310, + "content": "This code defines a function to train a model with a dataloader. It takes an executor, training program, compiled training program, train dataloader, train fetch list, and train metrics as inputs. The function iterates over the dataloader, runs the training program for each data batch, accumulates metrics, logs intermediate results if specified, and finalizes and logs out when finished.", + "type": "comment" + }, + "3642": { + "file_id": 310, + "content": " compiled_test_prog = None, test_dataloader = None, \\\n test_fetch_list = None, test_metrics = None, \\\n is_profiler = None, profiler_path = None):\n if not train_dataloader:\n logger.error(\"[TRAIN] get dataloader failed.\")\n epoch_periods = []\n train_loss = 0\n # NOTE: profiler tools, used for benchmark\n if is_profiler:\n prof = profiler.Profiler()\n for epoch in range(epochs):\n log_lr_and_step()\n train_iter = 0\n epoch_periods = []\n cur_time = time.time()\n for data in train_dataloader():\n if is_profiler and train_iter == log_interval:\n prof.start()\n train_outs = exe.run(compiled_train_prog,\n fetch_list=train_fetch_list,\n feed=data)\n period = time.time() - cur_time\n epoch_periods.append(period)\n timeStamp = time.time()\n localTime = time.localtime(timeStamp)", + "type": "code", + "location": "/applications/VideoTag/utils/train_utils.py:81-109" + }, + "3643": { + "file_id": 310, + "content": "This code initializes variables and starts a loop over epochs. 
Inside the loop, it logs the learning rate and step, iterates through the training dataloader, runs the compiled program with fetched data, records the time taken for each iteration, and optionally uses profiler tools for benchmarking.", + "type": "comment" + }, + "3644": { + "file_id": 310, + "content": " strTime = time.strftime(\"%Y-%m-%d %H:%M:%S\", localTime)\n if log_interval > 0 and (train_iter % log_interval == 0):\n train_metrics.calculate_and_log_out(train_outs, \\\n info = '[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime, epoch, train_iter, period))\n train_iter += 1\n cur_time = time.time()\n if is_profiler:\n prof.step()\n if train_iter == log_interval + 5:\n prof.stop()\n prof.export(path=profiler_path, format=\"json\")\n return\n if len(epoch_periods) < 1:\n logger.info(\n 'No iteration was executed, please check the data reader')\n sys.exit(1)\n logger.info(\n '[TRAIN] Epoch {} training finished, average time: {}'.format(\n epoch, np.mean(epoch_periods[1:])))\n if trainer_id == 0:\n save_model(exe, train_prog, save_dir, save_model_name,\n \"_epoch{}\".format(epoch))", + "type": "code", + "location": "/applications/VideoTag/utils/train_utils.py:110-135" + }, + "3645": { + "file_id": 310, + "content": "This code segment tracks the training progress of a video analysis algorithm. It logs and calculates metrics at specified intervals, profiles performance if desired, and saves model progress after each epoch. If no iterations are executed, it alerts and exits with an error message to check the data reader.", + "type": "comment" + }, + "3646": { + "file_id": 310, + "content": " if compiled_test_prog and valid_interval > 0 and (\n epoch + 1) % valid_interval == 0:\n test_with_dataloader(exe, compiled_test_prog, test_dataloader,\n test_fetch_list, test_metrics, log_interval,\n save_model_name)\n if trainer_id == 0:\n save_model(exe, train_prog, save_dir, save_model_name)\n #when fix_random seed for debug\n if fix_random_seed:\n cards = os.environ.get('CUDA_VISIBLE_DEVICES')\n gpu_num = len(cards.split(\",\"))\n print(\"kpis\\ttrain_cost_card{}\\t{}\".format(gpu_num, train_loss))\n print(\"kpis\\ttrain_speed_card{}\\t{}\".format(gpu_num,\n np.mean(epoch_periods)))\ndef save_model(exe, program, save_dir, model_name, postfix=''):\n \"\"\"save paramters and optimizer related varaibles\"\"\"\n if not os.path.isdir(save_dir):\n os.makedirs(save_dir)\n saved_model_name = model_name + postfix\n paddle.static.save(program, os.path.join(save_dir, saved_model_name))", + "type": "code", + "location": "/applications/VideoTag/utils/train_utils.py:136-159" + }, + "3647": { + "file_id": 310, + "content": "The code runs validation at the configured interval, saves the model on trainer 0, and prints benchmark KPIs when a fixed random seed is used for debugging. The save_model helper writes the parameters under the given directory with the specified model name.", + "type": "comment" + }, + "3648": { + "file_id": 310, + "content": " return", + "type": "code", + "location": "/applications/VideoTag/utils/train_utils.py:161-161" + }, + "3649": { + "file_id": 310, + "content": "This chunk contains only the bare return statement that closes the save_model function; after the parameters have been written to disk, control simply returns to the caller.", + "type": "comment" + }, + "3650": { + "file_id": 311, + "content": "/applications/VideoTag/utils/utility.py", + "type": "filepath" + }, + "3651": { + "file_id": 311, + "content": "This Python file imports necessary modules, defines the AttrDict class, checks PaddlePaddle version compatibility, and handles GPU usage based on CUDA availability. It uses paddle.utils.require_version to ensure the required version '1.6.0' is installed, logging errors and exiting with status code 1 if not.", + "type": "summary" + }, + "3652": { + "file_id": 311, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport signal\nimport logging\nimport paddle\n__all__ = ['AttrDict']\nlogger = logging.getLogger(__name__)\ndef _term(sig_num, addition):\n print('current pid is %s, group id is %s' % (os.getpid(), os.getpgrp()))\n os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)\nsignal.signal(signal.SIGTERM, _term)\nsignal.signal(signal.SIGINT, _term)\nclass AttrDict(dict):\n def __getattr__(self, key):\n return self[key]", + "type": "code", + "location": "/applications/VideoTag/utils/utility.py:1-37" + }, + "3653": { + "file_id": 311, + "content": "This file starts with the license header and imports necessary modules such as os, sys, signal and logging. It defines the AttrDict class, which extends the dictionary functionality, and registers signal handlers for SIGTERM and SIGINT to terminate the process group.", + "type": "comment" + }, + "3654": { + "file_id": 311, + "content": " def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef check_cuda(use_cuda, err = \\\n \"\\nYou can not set use_gpu = True in the model because you are using paddlepaddle-cpu.\\n \\\n Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_gpu = False to run models on CPU.\\n\"\n ):\n try:\n if use_cuda == True and paddle.is_compiled_with_cuda() == False:\n print(err)\n sys.exit(1)\n except Exception as e:\n pass\ndef check_version():\n \"\"\"\n Log error and exit when the installed version of paddlepaddle is\n not satisfied.\n \"\"\"\n err = \"PaddlePaddle version 1.6 or higher is required, \" \\\n \"or a suitable develop version is satisfied as well. \\n\" \\\n \"Please make sure the version is good with your code.\" \\\n try:", + "type": "code", + "location": "/applications/VideoTag/utils/utility.py:39-66" + }, + "3655": { + "file_id": 311, + "content": "This code checks if the installed PaddlePaddle version is 1.6 or higher and ensures that the user's code is compatible with the installed version. 
It also handles GPU usage by checking if the code should run on GPU or CPU based on the availability of CUDA in the system.", + "type": "comment" + }, + "3656": { + "file_id": 311, + "content": " paddle.utils.require_version('1.6.0')\n except Exception as e:\n logger.error(err)\n sys.exit(1)", + "type": "code", + "location": "/applications/VideoTag/utils/utility.py:67-70" + }, + "3657": { + "file_id": 311, + "content": "This code is using the paddle.utils.require_version function to check if the required version '1.6.0' is installed. If there is an exception, it logs the error and exits the program with a status code of 1.", + "type": "comment" + }, + "3658": { + "file_id": 312, + "content": "/applications/VideoTag/videotag_test.py", + "type": "filepath" + }, + "3659": { + "file_id": 312, + "content": "The code configures logging, imports libraries, initializes a video tagging model using PaddlePaddle and PaddleVideo. It sets up input data with efficient execution on GPU/CPU resources, measures predictor model's execution time for performance analysis or optimization within the main script function.", + "type": "summary" + }, + "3660": { + "file_id": 312, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport numpy as np\nimport paddle\nimport paddle.static as static\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'\nlogging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:1-34" + }, + "3661": { + "file_id": 312, + "content": "Code sets the logging configuration for INFO level, defines the log format, and redirects the logs to stdout. 
It also imports necessary libraries and modules, and configures logging handlers.", + "type": "comment" + }, + "3662": { + "file_id": 312, + "content": "logger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--extractor_config',\n type=str,\n default='configs/tsn.yaml',\n help='path to config file of model')\n parser.add_argument('--extractor_name',\n type=str,\n default='TSN',\n help='extractor model name, default TSN')\n parser.add_argument('--predictor_config',\n '--pconfig',\n type=str,\n default='configs/attention_lstm.yaml',\n help='path to config file of model')\n parser.add_argument(\n '--predictor_name',\n '--pname',\n type=str,\n default='AttentionLSTM',\n help='predictor model name, as AttentionLSTM, AttentionCluster, NEXTVLAD'\n )\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:35-62" + }, + "3663": { + "file_id": 312, + "content": "This code snippet defines a function \"parse_args()\" which uses the argparse module to parse command-line arguments. It sets default values for extractor and predictor model configurations, names, and enables GPU usage by default.", + "type": "comment" + }, + "3664": { + "file_id": 312, + "content": " help='default use gpu.')\n parser.add_argument('--extractor_weights',\n type=str,\n default='weights/tsn',\n help='extractor weight path')\n parser.add_argument('--predictor_weights',\n '--pweights',\n type=str,\n default='weights/attention_lstm',\n help='predictor weight path')\n parser.add_argument('--filelist',\n type=str,\n default='./data/VideoTag_test.list',\n help='path of video data, multiple video')\n parser.add_argument('--save_dir',\n type=str,\n default='data/VideoTag_results',\n help='output file path')\n parser.add_argument('--label_file',\n type=str,\n default='label_3396.txt',\n help='chinese label file path')\n args = parser.parse_args()", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:63-86" + }, + "3665": { + "file_id": 312, + "content": "This code snippet uses the argparse module to define command-line arguments for a video tagging application. These arguments include GPU usage, extractor and predictor weight paths, input file list, output directory, and Chinese label file path. The function `parser.parse_args()` is called at the end to return these arguments.", + "type": "comment" + }, + "3666": { + "file_id": 312, + "content": " return args\ndef main():\n \"\"\"\n Video classification model of 3000 Chinese tags.\n videotag_extractor_prdictor (as videotag_TSN_AttentionLSTM)\n two stages in our model:\n 1. extract feature from input video(mp4 format) using extractor\n 2. predict classification results from extracted feature using predictor\n we implement this using two name scopes, ie. 
extractor_scope and predictor_scope.\n \"\"\"\n if not os.path.isdir(args.save_dir):\n os.makedirs(args.save_dir)\n extractor_config = parse_config(args.extractor_config)\n extractor_infer_config = merge_configs(extractor_config, 'infer',\n vars(args))\n extractor_start_time = time.time()\n extractor_scope = paddle.static.Scope()\n with static.scope_guard(extractor_scope):\n extractor_startup_prog = static.Program()\n extractor_main_prog = static.Program()\n with static.program_guard(extractor_main_prog, extractor_startup_prog):\n paddle.disable_static()", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:87-111" + }, + "3667": { + "file_id": 312, + "content": "This code defines a video classification model with two stages: extracting features from the input video using an extractor and predicting the classification results based on those extracted features. It uses PaddlePaddle's static graph mode for performance improvement and organizes the code within name scopes \"extractor_scope\" and \"predictor_scope\". The code also checks if the save directory exists, creates it if not, and measures time taken by the extractor stage.", + "type": "comment" + }, + "3668": { + "file_id": 312, + "content": " # build model\n extractor_model = models.get_model(args.extractor_name,\n extractor_infer_config,\n mode='infer',\n is_videotag=True)\n extractor_model.build_input(use_dataloader=False)\n extractor_model.build_model()\n extractor_feeds = extractor_model.feeds()\n extractor_fetch_list = extractor_model.fetches()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(extractor_startup_prog)\n logger.info('load extractor weights from {}'.format(\n args.extractor_weights))\n extractor_model.load_pretrain_params(exe,\n args.extractor_weights,\n extractor_main_prog)\n # get reader and metrics", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:112-134" + }, + "3669": { + "file_id": 312, + "content": "This code builds a model, sets up the necessary parameters for execution, and loads pre-trained weights from a specified location. The model is built in inferencing mode for video tagging tasks, and it utilizes GPU or CPU resources based on the provided arguments.", + "type": "comment" + }, + "3670": { + "file_id": 312, + "content": " extractor_reader = get_reader(args.extractor_name, 'infer',\n extractor_infer_config)\n extractor_feeder = paddle.fluid.DataFeeder(place=place,\n feed_list=extractor_feeds)\n feature_list = []\n file_list = []\n for idx, data in enumerate(extractor_reader()):\n file_id = [item[-1] for item in data]\n feed_data = [item[:-1] for item in data]\n feature_out = exe.run(fetch_list=extractor_fetch_list,\n feed=extractor_feeder.feed(feed_data))\n feature_list.append(feature_out[0]) #get out from list\n file_list.append(file_id)\n logger.info(\n '========[Stage 1 Sample {} ] Extractor finished======'.\n format(idx))\n paddle.enable_static()\n extractor_end_time = time.time()\n print('extractor_time', extractor_end_time - extractor_start_time)", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:135-154" + }, + "3671": { + "file_id": 312, + "content": "The code is setting up a reader and feeder for an extractor in PaddleVideo, iterating through data from the reader, running the extractor using static mode, and logging progress. 
It also measures and prints the time taken for extraction.", + "type": "comment" + }, + "3672": { + "file_id": 312, + "content": " predictor_config = parse_config(args.predictor_config)\n predictor_infer_config = merge_configs(predictor_config, 'infer',\n vars(args))\n # get Predictor input from Extractor output\n predictor_feed_list = []\n for i in range(len(feature_list)):\n feature_out = feature_list[i]\n if args.predictor_name == \"AttentionCluster\":\n extractor_seg_num = extractor_infer_config.INFER.seg_num\n predictor_seg_num = predictor_infer_config.MODEL.seg_num\n idxs = []\n stride = float(extractor_seg_num) / predictor_seg_num\n for j in range(predictor_seg_num):\n pos = (j + np.random.random()) * stride\n idxs.append(min(extractor_seg_num - 1, int(pos)))\n extractor_feature = feature_out[:, idxs, :].astype(\n float) # get from bs dim\n else:\n extractor_feature = feature_out.astype(float)\n predictor_feed_data = [extractor_feature]\n predictor_feed_list.append((predictor_feed_data, file_list[i]))", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:156-177" + }, + "3673": { + "file_id": 312, + "content": "This code configures and prepares input data for a predictor model. It first parses the predictor configuration file, then merges it with command line arguments to create an inferencing configuration. Depending on the specified predictor model, it either extracts relevant segments from feature lists or uses the entire feature list. The resulting data is added to a list of inputs for the predictor model.", + "type": "comment" + }, + "3674": { + "file_id": 312, + "content": " predictor_start_time = time.time()\n predictor_scope = paddle.static.Scope()\n with static.scope_guard(predictor_scope):\n predictor_startup_prog = static.Program()\n predictor_main_prog = static.Program()\n with static.program_guard(predictor_main_prog, predictor_startup_prog):\n paddle.disable_static()\n # parse config\n predictor_model = models.get_model(args.predictor_name,\n predictor_infer_config,\n mode='infer')\n predictor_model.build_input(use_dataloader=False)\n predictor_model.build_model()\n predictor_feeds = predictor_model.feeds()\n exe.run(predictor_startup_prog)\n logger.info('load predictor weights from {}'.format(\n args.predictor_weights))\n predictor_model.load_test_weights(exe, args.predictor_weights,\n predictor_main_prog)", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:179-199" + }, + "3675": { + "file_id": 312, + "content": "This code sets up a predictor model, builds its inputs, builds the model itself, initializes feeds, runs a startup program, loads test weights from a specified location, and performs these actions within scopes and programs for efficient execution.", + "type": "comment" + }, + "3676": { + "file_id": 312, + "content": " predictor_feeder = paddle.fluid.DataFeeder(place=place,\n feed_list=predictor_feeds)\n predictor_fetch_list = predictor_model.fetches()\n predictor_metrics = get_metrics(args.predictor_name.upper(),\n 'infer', predictor_infer_config)\n predictor_metrics.reset()\n for idx, data in enumerate(predictor_feed_list):\n file_id = data[1]\n predictor_feed_data = data[0]\n final_outs = exe.run(\n fetch_list=predictor_fetch_list,\n feed=predictor_feeder.feed(predictor_feed_data))\n logger.info(\n '=======[Stage 2 Sample {} ] Predictor finished========'\n .format(idx))\n final_result_list = [item\n for item in final_outs] + [file_id]\n predictor_metrics.accumulate(final_result_list)\n 
predictor_metrics.finalize_and_log_out(", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:201-221" + }, + "3677": { + "file_id": 312, + "content": "This code snippet is initializing a DataFeeder for predictor model, fetching the list of metrics for predictor model and resetting them. It then iterates over the feed data, runs the model with each data instance, accumulates the final results in the metrics object, and finally logs the output.", + "type": "comment" + }, + "3678": { + "file_id": 312, + "content": " savedir=args.save_dir, label_file=args.label_file)\n paddle.enable_static()\n predictor_end_time = time.time()\n print('predictor_time', predictor_end_time - predictor_start_time)\nif __name__ == '__main__':\n start_time = time.time()\n args = parse_args()\n print(args)\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n main()\n end_time = time.time()\n period = end_time - start_time\n print('[INFER] infer finished. cost time: {}'.format(period))", + "type": "code", + "location": "/applications/VideoTag/videotag_test.py:222-238" + }, + "3679": { + "file_id": 312, + "content": "The code measures the time taken for a predictor to run and outputs it. It also records the total time taken for inferencing and displays the result, indicating when the inference is finished. This code snippet appears within the main function of the script, suggesting that this timing information is used for performance analysis or optimization purposes.", + "type": "comment" + }, + "3680": { + "file_id": 313, + "content": "/benchmark/TimeSformer/README.md", + "type": "filepath" + }, + "3681": { + "file_id": 313, + "content": "This code provides instructions to run a script that executes a series of benchmark tests for the TimeSformer model in PaddleVideo. The provided script, \"run_all.sh\", performs several steps including switching to a specific branch (benchmark_dev), installing dependencies, downloading and uncompressing data, and then running various benchmarks with different parameters using another script named \"run_benchmark.sh\".", + "type": "summary" + }, + "3682": { + "file_id": 313, + "content": "执行\n```bash\nbash ./run_all.sh down_data\n```\n即可运行.\nrun_all.sh内部的执行步骤:\n1. cd 到 ../../ (也就是 PaddleVideo 目录)\n2. 切换到benchmark_dev分支\n3. 安装 PaddleVideo 所需依赖\n4. cd 回PaddleVideo/data/ucf101\n5. wget下载数据集并解压缩,并下载预训练权重放到data目录下\n6. 再次cd 回到 ../../ (也就是 PaddleVideo 目录)\n8. 按照不同的参数执行 run_benchmark.sh 脚本", + "type": "code", + "location": "/benchmark/TimeSformer/README.md:1-14" + }, + "3683": { + "file_id": 313, + "content": "This code provides instructions to run a script that executes a series of benchmark tests for the TimeSformer model in PaddleVideo. 
The provided script, \"run_all.sh\", performs several steps including switching to a specific branch (benchmark_dev), installing dependencies, downloading and uncompressing data, and then running various benchmarks with different parameters using another script named \"run_benchmark.sh\".", + "type": "comment" + }, + "3684": { + "file_id": 314, + "content": "/benchmark/TimeSformer/run_all.sh", + "type": "filepath" + }, + "3685": { + "file_id": 314, + "content": "The script prepares PaddleVideo for TimeSformer benchmarking, downloads UCF101 dataset, and performs batch experiments with various configurations on one or eight GPUs.", + "type": "summary" + }, + "3686": { + "file_id": 314, + "content": "# 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.2-cudnn7 paddle=2.1.2 py=37\n# 执行目录:需说明\nsed -i '/set\\ -xe/d' run_benchmark.sh\ncd ../../ # cd到PaddleVideo项目根目录下\ngit checkout benchmark_dev\nlog_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # benchmark系统指定该参数,不需要跑profile时,log_path指向存speed的目录\n# 1 安装该模型需要的依赖 (如需开启优化策略请注明)\npython -m pip install -r requirements.txt\n# 2 拷贝该模型需要数据、预训练模型\nunalias cp\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs1.yaml configs/recognition/timesformer/\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs1_mp.yaml configs/recognition/timesformer/\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs14.yaml configs/recognition/timesformer/\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs14_mp.yaml configs/recognition/timesformer/\nif [ ! -f \"data/ucf101/trainlist_benchmark_mp.txt\" ]; then\n wget -P data/ucf101/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/trainlist_benchmark_mp.txt\nfi\nwget -P ", + "type": "code", + "location": "/benchmark/TimeSformer/run_all.sh:1-20" + }, + "3687": { + "file_id": 314, + "content": "This code snippet is part of a script for running the TimeSformer model benchmark in PaddleVideo. It sets up the environment, installs required dependencies, and copies necessary configuration files and data to ensure stable performance.", + "type": "comment" + }, + "3688": { + "file_id": 314, + "content": "data/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\nalias cp='cp -i'\ncd data/ucf101 # 进入PaddleVideo/data/ucf101\nif [ $1 = \"down_data\" ];then\n wget --no-check-certificate \"https://www.crcv.ucf.edu/data/UCF101/UCF101.rar\" # 下载训练数据\n unrar x UCF101.rar # 解压\n mv ./UCF-101 ./videos # 重命名文件夹为./videos\n rm -rf ./UCF101.rar\nelse # 使用本地数据\n rm -rf videos\n ln -s ${data_path}/dygraph_data/TSM/ucf101/videos ./videos\nfi\ncd ../../ # 返回PaddleVideo\n# 3 批量运行(如不方便批量,1,2需放到单个模型中)\nmodel_mode_list=(TimeSformer)\nfp_item_list=(fp32 fp16)\nbs_item_list=(1) # 14\nfor model_mode in ${model_mode_list[@]}; do\n for fp_item in ${fp_item_list[@]}; do\n for bs_item in ${bs_item_list[@]}\n do\n run_mode=sp\n log_name=video_${model_mode}_${run_mode}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8\n echo \"index is speed, 1gpus, begin, ${log_name}\"\n CUDA_VISIBLE_DEVICES=0 bash benchmark/${model_m", + "type": "code", + "location": "/benchmark/TimeSformer/run_all.sh:20-47" + }, + "3689": { + "file_id": 314, + "content": "This script downloads and prepares the UCF101 dataset for the TimeSformer model in PaddleVideo. It checks if the user wants to download data or use local data, then proceeds accordingly. 
The script also sets up a loop to run batch experiments with different model modes (TimeSformer), floating point items (fp32, fp16), and batch sizes (1). The log name is based on the model mode, run mode (speed), batch size, and floating point item. It uses CUDA_VISIBLE_DEVICES=0 to specify the GPU for execution.", + "type": "comment" + }, + "3690": { + "file_id": 314, + "content": "ode}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1\n sleep 60\n run_mode=mp\n log_name=video_${model_mode}_${run_mode}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8\n echo \"index is speed, 8gpus, run_mode is multi_process, begin, ${log_name}\"\n CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/${model_mode}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1\n sleep 60\n done\n done\ndone", + "type": "code", + "location": "/benchmark/TimeSformer/run_all.sh:47-57" + }, + "3691": { + "file_id": 314, + "content": "The script iterates over different model modes, batch sizes, and floating-point types. It first runs a benchmark with one GPU and logs the results, then sleeps for 60 seconds. Next, it repeats the process but uses eight GPUs in parallel. The script aims to test various configurations and collect performance data.", + "type": "comment" + }, + "3692": { + "file_id": 315, + "content": "/benchmark/TimeSformer/run_benchmark.sh", + "type": "filepath" + }, + "3693": { + "file_id": 315, + "content": "This code sets up benchmark tests for TimeSformer video classification models in PaddleVideo, allowing users to customize parameters and analyze logs. The train() function is used for model training with specified parameters.", + "type": "summary" + }, + "3694": { + "file_id": 315, + "content": "#!/usr/bin/env bash\nset -xe\n# 运行示例:CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}\n# 参数说明\nfunction _set_params(){\n run_mode=${1:-\"sp\"} # 单卡sp|多卡mp\n batch_size=${2:-\"1\"}\n fp_item=${3:-\"fp32\"} # fp32|fp16\n model_item=${4:-\"model_item\"}\n run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数\n# 添加benchmark日志解析所需参数\n base_batch_size=${batch_size}\n mission_name=\"视频分类\"\n direction_id=\"0\"\n ips_unit=\"instance/sec\"\n skip_steps=10 # 解析日志,有些模型前几个step耗时长,需要跳过 (必填)\n keyword=\"ips:\" # 解析日志,筛选出数据所在行的关键字 (必填)\n index=\"1\"\n model_name=${model_item}_bs${batch_size}_${fp_item}\n# 以下不用修改 \n device=${CUDA_VISIBLE_DEVICES//,/ }\n arr=(${device})\n num_gpu_devices=${#arr[*]}\n log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}\n}\nfunction _train(){\n echo \"Train on ${num_gpu_devices} GPUs\"", + "type": "code", + "location": "/benchmark/TimeSformer/run_benchmark.sh:1-28" + }, + "3695": { + "file_id": 315, + "content": "This script is a bash file for running benchmark tests on TimeSformer video classification models. It sets parameters such as single or multi-GPU mode, batch size, floating point precision, and model item. 
The function _train() will be used to train the model with specified parameters.", + "type": "comment" + }, + "3696": { + "file_id": 315, + "content": " echo \"current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size\"\n case ${run_mode} in\n sp) \n if [ ${fp_item} == 'fp32' ]; then\n train_cmd=\"python -u main.py -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}.yaml\"\n elif [ ${fp_item} == 'fp16' ]; then\n train_cmd=\"python -u main.py --amp -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}.yaml\"\n else\n echo \"choose fp_item(fp32 or fp16)\"\n exit 1\n fi;;\n mp)\n rm -rf ./mylog\n if [ ${fp_item} == 'fp32' ]; then\n train_cmd=\"python -u -B -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES --log_dir=./mylog main.py \\\n -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}_mp.yaml\"\n log_parse_file=\"mylog/workerlog.0\"\n elif [ ${fp_item} == 'fp16' ]; then\n train_cmd=\"python -u -B -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES --log_dir=./mylog main.py --amp \\", + "type": "code", + "location": "/benchmark/TimeSformer/run_benchmark.sh:29-48" + }, + "3697": { + "file_id": 315, + "content": "This code is running a benchmark for the TimeSformer model in PaddleVideo. It checks if fp_item is either 'fp32' or 'fp16', and then calls the main script with the appropriate configuration file, based on the mode (sp or mp) and the chosen precision. The output logs are directed to a specified directory for analysis.", + "type": "comment" + }, + "3698": { + "file_id": 315, + "content": " -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}_mp.yaml\"\n log_parse_file=\"mylog/workerlog.0\"\n else\n echo \"choose fp_item(fp32 or fp16)\"\n exit 1\n fi;;\n *) echo \"choose run_mode(sp or mp)\"; exit 1;\n esac\n# 以下不用修改\n timeout 15m ${train_cmd} > ${log_file} 2>&1\n if [ $? -ne 0 ];then\n echo -e \"${model_name}, FAIL\"\n export job_fail_flag=1\n else\n echo -e \"${model_name}, SUCCESS\"\n export job_fail_flag=0\n fi\n kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n if [ $run_mode = \"mp\" -a -d mylog ]; then\n rm ${log_file}\n cp mylog/workerlog.0 ${log_file}\n fi\n}\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n# _train # 如果只想产出训练log,不解析,可取消注释\n_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开", + "type": "code", + "location": "/benchmark/TimeSformer/run_benchmark.sh:49-77" + }, + "3699": { + "file_id": 315, + "content": "This code is part of a shell script for benchmarking the TimeSformer model. It sets up the command to run the training and checks for specific parameters like batch size and precision. It then executes the command with timeout, logs the result as success or failure, and removes intermediate log files if running in multi-process mode. 
The script also sources a separate file for further analysis of the log data.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/37.json b/docs/data/37.json new file mode 100644 index 000000000..af8b979db --- /dev/null +++ b/docs/data/37.json @@ -0,0 +1,546 @@ +{ + "3700": { + "file_id": 316, + "content": "/data/50salads/prepare_asrf_data.py", + "type": "filepath" + }, + "3701": { + "file_id": 316, + "content": "The code defines two functions, get_class2id_map and get_arguments. It reads ground truth text files, splits them by class labels, saves as .npy files, defines boundary frames for new actions, and saves these as separate .npy files. Assumes input files preprocessed and split by lines.", + "type": "summary" + }, + "3702": { + "file_id": 316, + "content": "import argparse\nimport glob\nimport os\nimport sys\nfrom typing import Dict\nimport numpy as np\nsys.path.append(os.path.join(os.path.dirname(__file__), \"..\"))\ndataset_names = [\"50salads\", \"breakfast\", \"gtea\"]\ndef get_class2id_map(dataset: str,\n dataset_dir: str = \"./dataset\") -> Dict[str, int]:\n \"\"\"\n Args:\n dataset: 50salads, gtea, breakfast\n dataset_dir: the path to the datset directory\n \"\"\"\n assert (dataset in dataset_names\n ), \"You have to choose 50salads, gtea or breakfast as dataset.\"\n with open(os.path.join(dataset_dir, \"{}/mapping.txt\".format(dataset)),\n \"r\") as f:\n actions = f.read().split(\"\\n\")[:-1]\n class2id_map = dict()\n for a in actions:\n class2id_map[a.split()[1]] = int(a.split()[0])\n return class2id_map\ndef get_arguments() -> argparse.Namespace:\n \"\"\"\n parse all the arguments from command line inteface\n return a list of parsed arguments\n \"\"\"\n parser = argparse.ArgumentParser(\n description=\"convert ground truth txt files to numpy array\")", + "type": "code", + "location": "/data/50salads/prepare_asrf_data.py:1-42" + }, + "3703": { + "file_id": 316, + "content": "This code defines two functions: get_class2id_map and get_arguments. The get_class2id_map function takes a dataset name (50salads, gtea, or breakfast) and the path to the dataset directory, and returns a dictionary mapping class names to their respective IDs by reading the \"mapping.txt\" file in the specified dataset directory. 
The get_arguments function parses all arguments from the command line interface for converting ground truth txt files to numpy arrays.", + "type": "comment" + }, + "3704": { + "file_id": 316, + "content": " parser.add_argument(\n \"--dataset_dir\",\n type=str,\n default=\"./dataset\",\n help=\"path to a dataset directory (default: ./dataset)\",\n )\n return parser.parse_args()\ndef main() -> None:\n args = get_arguments()\n datasets = [\"50salads\", \"gtea\", \"breakfast\", \"baseball\"]\n for dataset in datasets:\n # make directory for saving ground truth numpy arrays\n cls_save_dir = os.path.join(args.dataset_dir, dataset, \"gt_arr\")\n if not os.path.exists(cls_save_dir):\n os.mkdir(cls_save_dir)\n # make directory for saving ground truth numpy arrays\n boundary_save_dir = os.path.join(args.dataset_dir, dataset,\n \"gt_boundary_arr\")\n if not os.path.exists(boundary_save_dir):\n os.mkdir(boundary_save_dir)\n # class to index mapping\n class2id_map = get_class2id_map(dataset, dataset_dir=args.dataset_dir)\n gt_dir = os.path.join(args.dataset_dir, dataset, \"groundTruth\")\n gt_paths = glob.glob(os.path.join(gt_dir, \"*.txt\"))", + "type": "code", + "location": "/data/50salads/prepare_asrf_data.py:43-74" + }, + "3705": { + "file_id": 316, + "content": "This code sets up the dataset directory path and creates directories for saving ground truth numpy arrays. It also creates a class to index mapping using get_class2id_map function, and retrieves all groundTruth text files' paths in the specified dataset directory.", + "type": "comment" + }, + "3706": { + "file_id": 316, + "content": " for gt_path in gt_paths:\n # the name of ground truth text file\n gt_name = os.path.relpath(gt_path, gt_dir)\n with open(gt_path, \"r\") as f:\n gt = f.read().split(\"\\n\")[:-1]\n gt_array = np.zeros(len(gt))\n for i in range(len(gt)):\n gt_array[i] = class2id_map[gt[i]]\n # save array\n np.save(os.path.join(cls_save_dir, gt_name[:-4] + \".npy\"), gt_array)\n # the name of ground truth text file\n gt_name = os.path.relpath(gt_path, gt_dir)\n with open(gt_path, \"r\") as f:\n gt = f.read().split(\"\\n\")[:-1]\n # define the frame where new action starts as boundary frame\n boundary = np.zeros(len(gt))\n last = gt[0]\n boundary[0] = 1\n for i in range(1, len(gt)):\n if last != gt[i]:\n boundary[i] = 1\n last = gt[i]\n # save array\n np.save(os.path.join(boundary_save_dir, gt_name[:-4] + \".npy\"),", + "type": "code", + "location": "/data/50salads/prepare_asrf_data.py:76-106" + }, + "3707": { + "file_id": 316, + "content": "This code is reading ground truth text files, splitting them into arrays based on class labels, and saving these arrays as .npy files. It also defines boundary frames for new actions and saves these as separate .npy files. The code assumes that the input files are already processed and split by lines.", + "type": "comment" + }, + "3708": { + "file_id": 316, + "content": " boundary)\n print(\"Done\")\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/data/50salads/prepare_asrf_data.py:107-113" + }, + "3709": { + "file_id": 316, + "content": "This code snippet defines a function named \"main\" and checks if the script is being run directly. If it is, the \"main\" function is called to execute the desired task. 
The code prints \"Done\" after completing the specified operation.", + "type": "comment" + }, + "3710": { + "file_id": 317, + "content": "/data/50salads/transform_segmentation_label.py", + "type": "filepath" + }, + "3711": { + "file_id": 317, + "content": "The code processes video data, creating labels, organizing files, and parsing command line arguments. It allows for segmentation or localization labeling with features such as label conversion and ground truth processing.", + "type": "summary" + }, + "3712": { + "file_id": 317, + "content": "import json\nimport numpy as np\nimport argparse\nimport os\nfrom tqdm import tqdm\ndef generate_mapping_list_txt(action_dict, out_path):\n out_txt_file_path = os.path.join(out_path, \"mapping.txt\")\n f = open(out_txt_file_path, \"w\", encoding='utf-8')\n for key, action_name in action_dict.items():\n str_str = str(key) + \" \" + action_name + \"\\n\"\n f.write(str_str)\n # add None\n str_str = str(len(action_dict)) + \" None\" + \"\\n\"\n f.write(str_str)\n f.close()\ndef segmentation_convert_localization_label(prefix_data_path, out_path,\n action_dict, fps):\n label_path = os.path.join(prefix_data_path)\n label_txt_name_list = os.listdir(label_path)\n labels_dict = {}\n labels_dict[\"fps\"] = fps\n labels_list = []\n for label_name in tqdm(label_txt_name_list, desc='label convert:'):\n label_dict = {}\n label_dict[\"url\"] = label_name.split(\".\")[0] + \".mp4\"\n label_txt_path = os.path.join(prefix_data_path, label_name)\n with open(label_txt_path, \"r\", encoding='utf-8') as f:", + "type": "code", + "location": "/data/50salads/transform_segmentation_label.py:1-34" + }, + "3713": { + "file_id": 317, + "content": "This code reads label files from a specified path, converts the labels to localization format and writes them into another specified output path. It also generates mapping information between the localization format and original format. The function takes prefix_data_path (path to read data), out_path (output path for results), action_dict (dictionary of action mappings) and fps (frames per second) as input parameters. It processes each label file in the prefix_data_path, updating labels_list with converted labels, and writes them to the output path. Finally, it generates mapping information in \"mapping.txt\" format.", + "type": "comment" + }, + "3714": { + "file_id": 317, + "content": " gt = f.read().split(\"\\n\")[:-1]\n label_dict[\"total_frames\"] = len(gt)\n boundary_index_list = [0]\n before_action_name = gt[0]\n for index in range(1, len(gt)):\n if before_action_name != gt[index]:\n boundary_index_list.append(index)\n before_action_name = gt[index]\n actions_list = []\n for index in range(len(boundary_index_list) - 1):\n if gt[boundary_index_list[index]] != \"None\":\n action_name = gt[boundary_index_list[index]]\n start_sec = float(boundary_index_list[index]) / float(fps)\n end_sec = float(boundary_index_list[index + 1] - 1) / float(fps)\n action_id = action_dict[action_name]\n label_action_dict = {}\n label_action_dict[\"label_names\"] = action_name\n label_action_dict[\"start_id\"] = start_sec\n label_action_dict[\"end_id\"] = end_sec\n label_action_dict[\"label_ids\"] = [action_id]", + "type": "code", + "location": "/data/50salads/transform_segmentation_label.py:35-55" + }, + "3715": { + "file_id": 317, + "content": "This code segment reads a ground truth file line by line and counts the total frames. 
It then identifies action boundaries, creates an actions list, and for each action, it extracts action name, start and end time (in seconds), and the corresponding label ID from the action dictionary to create a label_action_dict. This information will be useful in transforming segmentation labels.", + "type": "comment" + }, + "3716": { + "file_id": 317, + "content": " actions_list.append(label_action_dict)\n label_dict[\"actions\"] = actions_list\n labels_list.append(label_dict)\n labels_dict[\"gts\"] = labels_list\n output_path = os.path.join(out_path, \"output.json\")\n f = open(output_path, \"w\", encoding='utf-8')\n f.write(json.dumps(labels_dict, indent=4))\n f.close()\ndef generate_action_dict(label):\n action_dict = {}\n for gt in label[\"gts\"]:\n for action in gt[\"actions\"]:\n label_id = action[\"label_ids\"][0]\n label_name = action[\"label_names\"][0]\n action_dict[label_id] = label_name\n return action_dict\ndef load_action_dict(data_path):\n mapping_txt_path = os.path.join(data_path, \"mapping.txt\")\n with open(mapping_txt_path, \"r\", encoding='utf-8') as f:\n actions = f.read().split(\"\\n\")[:-1]\n class2id_map = dict()\n for a in actions:\n class2id_map[a.split()[1]] = int(a.split()[0])\n return class2id_map\ndef localization_convert_segmentation_label(label, prefix_data_path, out_path):", + "type": "code", + "location": "/data/50salads/transform_segmentation_label.py:56-90" + }, + "3717": { + "file_id": 317, + "content": "This code appears to be part of a larger program that performs video segmentation and labeling. It generates a dictionary containing action labels based on provided ground truth segmentation data, converts segmentation labels into localization format, and saves the results in JSON format for further use. The function `generate_action_dict()` generates an action dictionary, `load_action_dict()` loads an action dictionary from a file, and `localization_convert_segmentation_label()` converts segmentation labels into localization format.", + "type": "comment" + }, + "3718": { + "file_id": 317, + "content": " path = os.path.join(out_path, \"groundTruth\")\n isExists = os.path.exists(path)\n if not isExists:\n os.makedirs(path)\n print(path + ' 创建成功')\n else:\n print(path + ' 目录已存在')\n fps = float(label[\"fps\"])\n video_list = []\n for gt in tqdm(label[\"gts\"], desc='label convert:'):\n video_name = gt[\"url\"].split(\".\")[0]\n data_path = os.path.join(prefix_data_path, video_name + \".pkl\")\n video_list.append(video_name + \".txt\")\n feature = np.load(data_path, allow_pickle=True)[\"image_feature\"]\n num_feture = feature.shape[0]\n seg_label = [\"None\"] * (num_feture)\n for action in gt[\"actions\"]:\n start_id = action[\"start_id\"]\n end_id = action[\"end_id\"]\n label_name = action[\"label_names\"]\n start_index = int(np.floor(start_id * fps))\n end_index = int(np.floor(end_id * fps)) + 1\n if end_index < num_feture - 1:\n seg_label[start_index:end_index] = label_name * (end_index -", + "type": "code", + "location": "/data/50salads/transform_segmentation_label.py:91-119" + }, + "3719": { + "file_id": 317, + "content": "The code checks if a directory exists and creates it if not. 
It then loops through each ground truth segmentation in the label, retrieves the corresponding video data, extracts relevant information like feature, action labels, start and end indices, and populates seg_label array with action labels for the specified time range.", + "type": "comment" + }, + "3720": { + "file_id": 317, + "content": " start_index)\n elif start_index < num_feture - 1:\n seg_label[start_index:] = label_name * (num_feture -\n start_index)\n else:\n pass\n if len(seg_label) != num_feture:\n seg_label = seg_label[:num_feture]\n out_txt_file_path = os.path.join(out_path, \"groundTruth\",\n video_name + \".txt\")\n str = '\\n'\n f = open(out_txt_file_path, \"w\", encoding='utf-8')\n f.write(str.join(seg_label) + str)\n f.close()\n out_txt_file_path = os.path.join(out_path, \"train_list.txt\")\n str = '\\n'\n f = open(out_txt_file_path, \"w\", encoding='utf-8')\n f.write(str.join(video_list) + str)\n f.close()\ndef main():\n args = get_arguments()\n if args.mode in [\"segmentation\", \"localization\"]:\n if args.mode == \"segmentation\":\n with open(args.label_path, 'r', encoding='utf-8') as json_file:", + "type": "code", + "location": "/data/50salads/transform_segmentation_label.py:120-147" + }, + "3721": { + "file_id": 317, + "content": "This code segment is part of a larger program that appears to be related to video data processing. The function is setting up a segmentation label and writing it to a file, as well as creating another list for training purposes. It determines the starting index based on the number of features, and fills in the label accordingly. The code then writes the label and video list to separate text files in the specified output path. This process is controlled by the \"args\" variable which contains command line arguments like \"mode\", \"label_path\", and \"out_path\".", + "type": "comment" + }, + "3722": { + "file_id": 317, + "content": " label = json.load(json_file)\n action_dict = generate_action_dict(label)\n generate_mapping_list_txt(action_dict, args.out_path)\n localization_convert_segmentation_label(label, args.data_path,\n args.out_path)\n elif args.mode == \"localization\":\n action_dict = load_action_dict(args.label_path)\n segmentation_convert_localization_label(args.data_path,\n args.out_path,\n action_dict,\n fps=25.0)\n else:\n raise NotImplementedError\ndef get_arguments():\n \"\"\"\n parse all the arguments from command line inteface\n return a list of parsed arguments\n \"\"\"\n parser = argparse.ArgumentParser(\n description=\"convert segmentation and localization label\")\n parser.add_argument(\"label_path\", type=str, help=\"path of a label file\")", + "type": "code", + "location": "/data/50salads/transform_segmentation_label.py:148-173" + }, + "3723": { + "file_id": 317, + "content": "The code reads a label file, determines the mode (segmentation or localization), and performs corresponding operations. It uses function calls like generate_action_dict, load_action_dict, segmentation_convert_localization_label. 
The get_arguments function parses command line arguments.", + "type": "comment" + }, + "3724": { + "file_id": 317, + "content": " parser.add_argument(\n \"data_path\",\n type=str,\n help=\"path of video feature or segmentation label txt.\",\n )\n parser.add_argument(\n \"out_path\",\n type=str,\n help=\"path of output file.\",\n )\n parser.add_argument(\n \"--mode\",\n type=str,\n default=\"segmentation\",\n help=\"Convert segmentation label or localization label.\",\n )\n return parser.parse_args()\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/data/50salads/transform_segmentation_label.py:174-195" + }, + "3725": { + "file_id": 317, + "content": "This code snippet defines command line arguments for the input data path, output path, and mode. It then parses these arguments and returns them, allowing the program to convert segmentation or localization labels as specified by the user.", + "type": "comment" + }, + "3726": { + "file_id": 318, + "content": "/data/ntu-rgb-d/download_dataset.sh", + "type": "filepath" + }, + "3727": { + "file_id": 318, + "content": "This script changes directory to \"data/ntu-rgb-d\" and downloads a zip file containing skeleton data for frames 1-17. It then unzips the file, deletes the original, and downloads another zip file named \"statistics.zip\". The script creates a new folder named \"statistics\", extracts its contents into it, and removes the downloaded zip file.", + "type": "summary" + }, + "3728": { + "file_id": 318, + "content": "cd data/ntu-rgb-d\n# download\nwget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1CUZnBtYwifVXS21yVg62T-vrPVayso5H' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id=1CUZnBtYwifVXS21yVg62T-vrPVayso5H\" -O nturgbd_skeletons_s001_to_s017.zip && rm -rf /tmp/cookies.txt\nunzip nturgbd_skeletons_s001_to_s017.zip && rm -rf nturgbd_skeletons_s001_to_s017.zip\nwget https://videotag.bj.bcebos.com/Data/statistics.zip\nmkdir statistics\nunzip statistics.zip -d statistics/ && rm -rf statistics.zip", + "type": "code", + "location": "/data/ntu-rgb-d/download_dataset.sh:1-12" + }, + "3729": { + "file_id": 318, + "content": "This script changes directory to \"data/ntu-rgb-d\" and downloads a zip file containing skeleton data for frames 1-17. It then unzips the file, deletes the original, and downloads another zip file named \"statistics.zip\". The script creates a new folder named \"statistics\", extracts its contents into it, and removes the downloaded zip file.", + "type": "comment" + }, + "3730": { + "file_id": 319, + "content": "/data/ntu-rgb-d/get_raw_denoised_data.py", + "type": "filepath" + }, + "3731": { + "file_id": 319, + "content": "The code cleans and processes NTU RGB-D dataset data by removing noisy frames, handling missing values, updating arrays, logging counts, denoising raw skeleton data, and generating log files for sequences with multiple actors. 
It reads sequence data, extracts joints and color data, handles multiple actors and missing frames, and stores the processed data for further processing while counting missing data.", + "type": "summary" + }, + "3732": { + "file_id": 319, + "content": "# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/get_raw_denoised_data.py\nimport os\nimport os.path as osp\nimport numpy as np\nimport pickle\nimport logging\nroot_path = './'\nraw_data_file = osp.join(root_path, 'raw_data', 'raw_skes_data.pkl')\nsave_path = osp.join(root_path, 'denoised_data')\nif not osp.exists(save_path):\n os.mkdir(save_path)\nrgb_ske_path = osp.join(save_path, 'rgb+ske')\nif not osp.exists(rgb_ske_path):\n os.mkdir(rgb_ske_path)\nactors_info_dir = osp.join(save_path, 'actors_info')\nif not osp.exists(actors_info_dir):\n os.mkdir(actors_info_dir)\nmissing_count = 0\nnoise_len_thres = 11\nnoise_spr_thres1 = 0.8\nnoise_spr_thres2 = 0.69754\nnoise_mot_thres_lo = 0.089925\nnoise_mot_thres_hi = 2\nnoise_len_logger = logging.getLogger('noise_length')\nnoise_len_logger.setLevel(logging.INFO)\nnoise_len_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'noise_length.log')))\nnoise_len_logger.info('{:^20}\\t{:^17}\\t{:^8}\\t{}'.format(\n 'Skeleton', 'bodyID', 'Motion', 'Length'))\nnoise_spr_logger = logging.getLogger('noise_spread')", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:1-38" + }, + "3733": { + "file_id": 319, + "content": "This code is setting up directories and loggers for processing raw data and identifying noisy sequences. It checks if the required folders exist, creates them if not, sets up loggers to track noise length and spread thresholds, and initializes variables for the process.", + "type": "comment" + }, + "3734": { + "file_id": 319, + "content": "noise_spr_logger.setLevel(logging.INFO)\nnoise_spr_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'noise_spread.log')))\nnoise_spr_logger.info('{:^20}\\t{:^17}\\t{:^8}\\t{:^8}'.format(\n 'Skeleton', 'bodyID', 'Motion', 'Rate'))\nnoise_mot_logger = logging.getLogger('noise_motion')\nnoise_mot_logger.setLevel(logging.INFO)\nnoise_mot_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'noise_motion.log')))\nnoise_mot_logger.info('{:^20}\\t{:^17}\\t{:^8}'.format('Skeleton', 'bodyID',\n 'Motion'))\nfail_logger_1 = logging.getLogger('noise_outliers_1')\nfail_logger_1.setLevel(logging.INFO)\nfail_logger_1.addHandler(\n logging.FileHandler(osp.join(save_path, 'denoised_failed_1.log')))\nfail_logger_2 = logging.getLogger('noise_outliers_2')\nfail_logger_2.setLevel(logging.INFO)\nfail_logger_2.addHandler(\n logging.FileHandler(osp.join(save_path, 'denoised_failed_2.log')))\nmissing_skes_logger = logging.getLogger('missing_frames')\nmissing_skes_logger.setLevel(logging.INFO)", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:39-63" + }, + "3735": { + "file_id": 319, + "content": "The code sets up multiple loggers for different types of output: 'noise_spread.log', 'noise_motion.log', 'denoised_failed_1.log', and 'denoised_failed_2.log'. It also creates a logger for missing frames named 'missing_frames'. Each logger is configured with a specific level of logging (INFO) and a file handler to store the logs in designated files within the specified save path. 
This allows for organized and easily accessible logging during program execution.", + "type": "comment" + }, + "3736": { + "file_id": 319, + "content": "missing_skes_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'missing_skes.log')))\nmissing_skes_logger.info('{:^20}\\t{}\\t{}'.format('Skeleton', 'num_frames',\n 'num_missing'))\nmissing_skes_logger1 = logging.getLogger('missing_frames_1')\nmissing_skes_logger1.setLevel(logging.INFO)\nmissing_skes_logger1.addHandler(\n logging.FileHandler(osp.join(save_path, 'missing_skes_1.log')))\nmissing_skes_logger1.info('{:^20}\\t{}\\t{}\\t{}\\t{}\\t{}'.format(\n 'Skeleton', 'num_frames', 'Actor1', 'Actor2', 'Start', 'End'))\nmissing_skes_logger2 = logging.getLogger('missing_frames_2')\nmissing_skes_logger2.setLevel(logging.INFO)\nmissing_skes_logger2.addHandler(\n logging.FileHandler(osp.join(save_path, 'missing_skes_2.log')))\nmissing_skes_logger2.info('{:^20}\\t{}\\t{}\\t{}'.format('Skeleton', 'num_frames',\n 'Actor1', 'Actor2'))\ndef denoising_by_length(ske_name, bodies_data):\n \"\"\"\n Denoising data based on the frame length for each bodyID.", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:64-86" + }, + "3737": { + "file_id": 319, + "content": "Creates multiple loggers for tracking missing skeleton frames, with different handlers and levels of information. Function denoising_by_length takes a skeleton name and bodies_data as input to perform data denoising based on frame length for each bodyID.", + "type": "comment" + }, + "3738": { + "file_id": 319, + "content": " Filter out the bodyID which length is less or equal than the predefined threshold.\n \"\"\"\n noise_info = str()\n new_bodies_data = bodies_data.copy()\n for (bodyID, body_data) in new_bodies_data.items():\n length = len(body_data['interval'])\n if length <= noise_len_thres:\n noise_info += 'Filter out: %s, %d (length).\\n' % (bodyID, length)\n noise_len_logger.info('{}\\t{}\\t{:.6f}\\t{:^6d}'.format(\n ske_name, bodyID, body_data['motion'], length))\n del bodies_data[bodyID]\n if noise_info != '':\n noise_info += '\\n'\n return bodies_data, noise_info\ndef get_valid_frames_by_spread(points):\n \"\"\"\n Find the valid (or reasonable) frames (index) based on the spread of X and Y.\n :param points: joints or colors\n \"\"\"\n num_frames = points.shape[0]\n valid_frames = []\n for i in range(num_frames):\n x = points[i, :, 0]\n y = points[i, :, 1]\n if (x.max() - x.min()) <= noise_spr_thres1 * (y.max() - y.min()): # 0.8", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:87-116" + }, + "3739": { + "file_id": 319, + "content": "Code snippet filters out bodies with a length less than or equal to the predefined threshold and finds valid frames based on the spread of X and Y. 
It also logs the filtered body information and returns the updated bodies data along with filter information.", + "type": "comment" + }, + "3740": { + "file_id": 319, + "content": " valid_frames.append(i)\n return valid_frames\ndef denoising_by_spread(ske_name, bodies_data):\n \"\"\"\n Denoising data based on the spread of Y value and X value.\n Filter out the bodyID which the ratio of noisy frames is higher than the predefined\n threshold.\n bodies_data: contains at least 2 bodyIDs\n \"\"\"\n noise_info = str()\n denoised_by_spr = False # mark if this sequence has been processed by spread.\n new_bodies_data = bodies_data.copy()\n # for (bodyID, body_data) in bodies_data.items():\n for (bodyID, body_data) in new_bodies_data.items():\n if len(bodies_data) == 1:\n break\n valid_frames = get_valid_frames_by_spread(body_data['joints'].reshape(\n -1, 25, 3))\n num_frames = len(body_data['interval'])\n num_noise = num_frames - len(valid_frames)\n if num_noise == 0:\n continue\n ratio = num_noise / float(num_frames)\n motion = body_data['motion']\n if ratio >= noise_spr_thres2: # 0.69754\n del bodies_data[bodyID]", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:117-147" + }, + "3741": { + "file_id": 319, + "content": "The function \"denoising_by_spread\" takes a sequence of body data and filters out any bodies with a high ratio of noisy frames. It uses the spread of Y and X values to determine if a frame is valid or not. If the ratio of noisy frames exceeds a predefined threshold, the corresponding body is removed from the data. This function ensures that only clean data is used for further processing.", + "type": "comment" + }, + "3742": { + "file_id": 319, + "content": " denoised_by_spr = True\n noise_info += 'Filter out: %s (spread rate >= %.2f).\\n' % (\n bodyID, noise_spr_thres2)\n noise_spr_logger.info('%s\\t%s\\t%.6f\\t%.6f' %\n (ske_name, bodyID, motion, ratio))\n else: # Update motion\n joints = body_data['joints'].reshape(-1, 25, 3)[valid_frames]\n body_data['motion'] = min(\n motion, np.sum(np.var(joints.reshape(-1, 3), axis=0)))\n noise_info += '%s: motion %.6f -> %.6f\\n' % (bodyID, motion,\n body_data['motion'])\n # TODO: Consider removing noisy frames for each bodyID\n if noise_info != '':\n noise_info += '\\n'\n return bodies_data, noise_info, denoised_by_spr\ndef denoising_by_motion(ske_name, bodies_data, bodies_motion):\n \"\"\"\n Filter out the bodyID which motion is out of the range of predefined interval\n \"\"\"\n # Sort bodies based on the motion, return a list of tuples", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:148-172" + }, + "3743": { + "file_id": 319, + "content": "This function filters out frames with high noise (spread rate) and updates the motion values for each bodyID. 
It also returns a list of tuples sorted by motion, potentially removing noisy frames for each bodyID.", + "type": "comment" + }, + "3744": { + "file_id": 319, + "content": " # bodies_motion = sorted(bodies_motion.items(), key=lambda x, y: cmp(x[1], y[1]), reverse=True)\n bodies_motion = sorted(bodies_motion.items(),\n key=lambda x: x[1],\n reverse=True)\n # Reserve the body data with the largest motion\n denoised_bodies_data = [(bodies_motion[0][0],\n bodies_data[bodies_motion[0][0]])]\n noise_info = str()\n for (bodyID, motion) in bodies_motion[1:]:\n if (motion < noise_mot_thres_lo) or (motion > noise_mot_thres_hi):\n noise_info += 'Filter out: %s, %.6f (motion).\\n' % (bodyID, motion)\n noise_mot_logger.info('{}\\t{}\\t{:.6f}'.format(\n ske_name, bodyID, motion))\n else:\n denoised_bodies_data.append((bodyID, bodies_data[bodyID]))\n if noise_info != '':\n noise_info += '\\n'\n return denoised_bodies_data, noise_info\ndef denoising_bodies_data(bodies_data):\n \"\"\"\n Denoising data based on some heuristic methods, not necessarily correct for all samples.", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:173-198" + }, + "3745": { + "file_id": 319, + "content": "This code sorts the motion data for each body, discards data with low or high motion values, and returns denoised body data along with information about filtered out bodies. The denoising process is based on heuristic methods that may not be correct for all samples.", + "type": "comment" + }, + "3746": { + "file_id": 319, + "content": " Return:\n denoised_bodies_data (list): tuple: (bodyID, body_data).\n \"\"\"\n ske_name = bodies_data['name']\n bodies_data = bodies_data['data']\n # Step 1: Denoising based on frame length.\n bodies_data, noise_info_len = denoising_by_length(ske_name, bodies_data)\n if len(bodies_data) == 1: # only has one bodyID left after step 1\n return bodies_data.items(), noise_info_len\n # Step 2: Denoising based on spread.\n bodies_data, noise_info_spr, denoised_by_spr = denoising_by_spread(\n ske_name, bodies_data)\n if len(bodies_data) == 1:\n return bodies_data.items(), noise_info_len + noise_info_spr\n bodies_motion = dict() # get body motion\n for (bodyID, body_data) in bodies_data.items():\n bodies_motion[bodyID] = body_data['motion']\n # Sort bodies based on the motion\n # bodies_motion = sorted(bodies_motion.items(), key=lambda x, y: cmp(x[1], y[1]), reverse=True)\n bodies_motion = sorted(bodies_motion.items(),\n key=lambda x: x[1],", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:200-225" + }, + "3747": { + "file_id": 319, + "content": "This code performs denoising on bodies data based on frame length and spread. It first denoises the data by frame length and then by spread, if necessary. The function returns a tuple containing the denoised bodies data and the noise information for each step. 
The code also sorts the bodies based on their motion and returns it in a sorted manner.", + "type": "comment" + }, + "3748": { + "file_id": 319, + "content": " reverse=True)\n denoised_bodies_data = list()\n for (bodyID, _) in bodies_motion:\n denoised_bodies_data.append((bodyID, bodies_data[bodyID]))\n return denoised_bodies_data, noise_info_len + noise_info_spr\n # TODO: Consider denoising further by integrating motion method\n # if denoised_by_spr: # this sequence has been denoised by spread\n # bodies_motion = sorted(bodies_motion.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)\n # denoised_bodies_data = list()\n # for (bodyID, _) in bodies_motion:\n # denoised_bodies_data.append((bodyID, bodies_data[bodyID]))\n # return denoised_bodies_data, noise_info\n # Step 3: Denoising based on motion\n # bodies_data, noise_info = denoising_by_motion(ske_name, bodies_data, bodies_motion)\n # return bodies_data, noise_info\ndef get_one_actor_points(body_data, num_frames):\n \"\"\"\n Get joints and colors for only one actor.\n For joints, each frame contains 75 X-Y-Z coordinates.\n For colors, each frame contains 25 x 2 (X, Y) coordinates.", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:226-252" + }, + "3749": { + "file_id": 319, + "content": "This code retrieves denoised data from the NTU RGB-D dataset, and considers further denoising by integrating motion. It also defines a function to get joints and colors for only one actor.", + "type": "comment" + }, + "3750": { + "file_id": 319, + "content": " \"\"\"\n joints = np.zeros((num_frames, 75), dtype=np.float32)\n colors = np.ones((num_frames, 1, 25, 2), dtype=np.float32) * np.nan\n start, end = body_data['interval'][0], body_data['interval'][-1]\n joints[start:end + 1] = body_data['joints'].reshape(-1, 75)\n colors[start:end + 1, 0] = body_data['colors']\n return joints, colors\ndef remove_missing_frames(ske_name, joints, colors):\n \"\"\"\n Cut off missing frames which all joints positions are 0s\n For the sequence with 2 actors' data, also record the number of missing frames for\n actor1 and actor2, respectively (for debug).\n \"\"\"\n num_frames = joints.shape[0]\n num_bodies = colors.shape[1] # 1 or 2\n if num_bodies == 2: # DEBUG\n missing_indices_1 = np.where(joints[:, :75].sum(axis=1) == 0)[0]\n missing_indices_2 = np.where(joints[:, 75:].sum(axis=1) == 0)[0]\n cnt1 = len(missing_indices_1)\n cnt2 = len(missing_indices_2)\n start = 1 if 0 in missing_indices_1 else 0\n end = 1 if num_frames - 1 in missing_indices_1 else 0", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:253-280" + }, + "3751": { + "file_id": 319, + "content": "This code segment defines a function to get raw denoised data from body_data and another function to remove missing frames in the sequence. The first function initializes joints and colors arrays, extracts relevant data from body_data, and returns the joints and colors. 
The second function cuts off missing frames when all joint positions are 0s and records the number of missing frames for each actor if there are two actors' data.", + "type": "comment" + }, + "3752": { + "file_id": 319, + "content": " if max(cnt1, cnt2) > 0:\n if cnt1 > cnt2:\n info = '{}\\t{:^10d}\\t{:^6d}\\t{:^6d}\\t{:^5d}\\t{:^3d}'.format(\n ske_name, num_frames, cnt1, cnt2, start, end)\n missing_skes_logger1.info(info)\n else:\n info = '{}\\t{:^10d}\\t{:^6d}\\t{:^6d}'.format(\n ske_name, num_frames, cnt1, cnt2)\n missing_skes_logger2.info(info)\n # Find valid frame indices that the data is not missing or lost\n # For two-subjects action, this means both data of actor1 and actor2 is missing.\n valid_indices = np.where(joints.sum(axis=1) != 0)[0] # 0-based index\n missing_indices = np.where(joints.sum(axis=1) == 0)[0]\n num_missing = len(missing_indices)\n if num_missing > 0: # Update joints and colors\n joints = joints[valid_indices]\n colors[missing_indices] = np.nan\n global missing_count\n missing_count += 1\n missing_skes_logger.info('{}\\t{:^10d}\\t{:^11d}'.format(\n ske_name, num_frames, num_missing))", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:281-303" + }, + "3753": { + "file_id": 319, + "content": "This code checks if any data is missing or lost for two subjects in a video. If there are missing frames, it updates the joints and colors arrays, marks missing indices with NaN, and logs the number of missing frames and total missing counts.", + "type": "comment" + }, + "3754": { + "file_id": 319, + "content": " return joints, colors\ndef get_bodies_info(bodies_data):\n bodies_info = '{:^17}\\t{}\\t{:^8}\\n'.format('bodyID', 'Interval', 'Motion')\n for (bodyID, body_data) in bodies_data.items():\n start, end = body_data['interval'][0], body_data['interval'][-1]\n bodies_info += '{}\\t{:^8}\\t{:f}\\n'.format(bodyID, str([start, end]),\n body_data['motion'])\n return bodies_info + '\\n'\ndef get_two_actors_points(bodies_data):\n \"\"\"\n Get the first and second actor's joints positions and colors locations.\n # Arguments:\n bodies_data (dict): 3 key-value pairs: 'name', 'data', 'num_frames'.\n bodies_data['data'] is also a dict, while the key is bodyID, the value is\n the corresponding body_data which is also a dict with 4 keys:\n - joints: raw 3D joints positions. Shape: (num_frames x 25, 3)\n - colors: raw 2D color locations. 
Shape: (num_frames, 25, 2)\n - interval: a list which records the frame indices.\n - motion: motion amount", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:305-329" + }, + "3755": { + "file_id": 319, + "content": "Function get_bodies_info formats the bodies' data into a string with bodyID, interval (start and end frame), and motion amount.\nFunction get_two_actors_points retrieves the first and second actor's joints positions and colors locations from given data.", + "type": "comment" + }, + "3756": { + "file_id": 319, + "content": " # Return:\n joints, colors.\n \"\"\"\n ske_name = bodies_data['name']\n label = int(ske_name[-2:])\n num_frames = bodies_data['num_frames']\n bodies_info = get_bodies_info(bodies_data['data'])\n bodies_data, noise_info = denoising_bodies_data(\n bodies_data) # Denoising data\n bodies_info += noise_info\n bodies_data = list(bodies_data)\n if len(bodies_data) == 1: # Only left one actor after denoising\n if label >= 50: # DEBUG: Denoising failed for two-subjects action\n fail_logger_2.info(ske_name)\n bodyID, body_data = bodies_data[0]\n joints, colors = get_one_actor_points(body_data, num_frames)\n bodies_info += 'Main actor: %s' % bodyID\n else:\n if label < 50: # DEBUG: Denoising failed for one-subject action\n fail_logger_1.info(ske_name)\n joints = np.zeros((num_frames, 150), dtype=np.float32)\n colors = np.ones((num_frames, 2, 25, 2), dtype=np.float32) * np.nan\n bodyID, actor1 = bodies_data[0] # the 1st actor with largest motion", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:331-358" + }, + "3757": { + "file_id": 319, + "content": "This function denoises bodies data and extracts joints and colors information for each frame. If only one actor remains after denoising, it checks if the action is for two subjects (label >= 50) and retrieves joints and colors from the remaining actor. If there are still multiple actors but the action is for one subject (label < 50), it initializes joints as zeros and colors as nans.", + "type": "comment" + }, + "3758": { + "file_id": 319, + "content": " start1, end1 = actor1['interval'][0], actor1['interval'][-1]\n joints[start1:end1 + 1, :75] = actor1['joints'].reshape(-1, 75)\n colors[start1:end1 + 1, 0] = actor1['colors']\n actor1_info = '{:^17}\\t{}\\t{:^8}\\n'.format('Actor1', 'Interval', 'Motion') + \\\n '{}\\t{:^8}\\t{:f}\\n'.format(bodyID, str([start1, end1]), actor1['motion'])\n del bodies_data[0]\n actor2_info = '{:^17}\\t{}\\t{:^8}\\n'.format('Actor2', 'Interval',\n 'Motion')\n start2, end2 = [0, 0] # initial interval for actor2 (virtual)\n while len(bodies_data) > 0:\n bodyID, actor = bodies_data[0]\n start, end = actor['interval'][0], actor['interval'][-1]\n if min(end1, end) - max(start1,\n start) <= 0: # no overlap with actor1\n joints[start:end + 1, :75] = actor['joints'].reshape(-1, 75)\n colors[start:end + 1, 0] = actor['colors']\n actor1_info += '{}\\t{:^8}\\t{:f}\\n'.format(", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:359-377" + }, + "3759": { + "file_id": 319, + "content": "Code snippet extracts joints, colors and other information from actors' data and assigns them to relevant arrays. It also generates formatted information about each actor including their interval and motion. 
The while loop iterates through the bodies_data list, considering only those actors whose intervals do not overlap with Actor1, appending their joints and colors to the respective arrays.", + "type": "comment" + }, + "3760": { + "file_id": 319, + "content": " bodyID, str([start, end]), actor['motion'])\n # Update the interval of actor1\n start1 = min(start, start1)\n end1 = max(end, end1)\n elif min(end2, end) - max(start2,\n start) <= 0: # no overlap with actor2\n joints[start:end + 1, 75:] = actor['joints'].reshape(-1, 75)\n colors[start:end + 1, 1] = actor['colors']\n actor2_info += '{}\\t{:^8}\\t{:f}\\n'.format(\n bodyID, str([start, end]), actor['motion'])\n # Update the interval of actor2\n start2 = min(start, start2)\n end2 = max(end, end2)\n del bodies_data[0]\n bodies_info += ('\\n' + actor1_info + '\\n' + actor2_info)\n with open(osp.join(actors_info_dir, ske_name + '.txt'), 'w') as fw:\n fw.write(bodies_info + '\\n')\n return joints, colors\ndef get_raw_denoised_data():\n \"\"\"\n Get denoised data (joints positions and color locations) from raw skeleton sequences.", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:378-403" + }, + "3761": { + "file_id": 319, + "content": "This function extracts and denoises joint positions and color locations from raw skeleton sequences. It takes intervals of actor1 and actor2, updates their intervals if there's no overlap, and then stores the information in separate variables. Finally, it writes the information to a text file.", + "type": "comment" + }, + "3762": { + "file_id": 319, + "content": " For each frame of a skeleton sequence, an actor's 3D positions of 25 joints represented\n by an 2D array (shape: 25 x 3) is reshaped into a 75-dim vector by concatenating each\n 3-dim (x, y, z) coordinates along the row dimension in joint order. Each frame contains\n two actor's joints positions constituting a 150-dim vector. If there is only one actor,\n then the last 75 values are filled with zeros. Otherwise, select the main actor and the\n second actor based on the motion amount. Each 150-dim vector as a row vector is put into\n a 2D numpy array where the number of rows equals the number of valid frames. All such\n 2D arrays are put into a list and finally the list is serialized into a cPickle file.\n For the skeleton sequence which contains two or more actors (mostly corresponds to the\n last 11 classes), the filename and actors' information are recorded into log files.\n For better understanding, also generate RGB+skeleton videos for visualization.\n \"\"\"\n with open(raw_data_file, 'rb') as fr: # load raw skeletons data", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:405-419" + }, + "3763": { + "file_id": 319, + "content": "This code reads raw skeleton data from a file, then processes and reshapes the 3D positions of each joint into a 75-dimensional vector for each frame. If there's only one actor, it fills zeros to complete the 150-dimensional vector. It selects the main and second actors based on motion amount. The resulting 2D arrays are stored in a list and serialized into a cPickle file. Additionally, log files record the filename and actors' information for skeleton sequences with two or more actors. The code also generates RGB+skeleton videos for better visualization.", + "type": "comment" + }, + "3764": { + "file_id": 319, + "content": " raw_skes_data = pickle.load(fr)\n num_skes = len(raw_skes_data)\n print('Found %d available skeleton sequences.' 
% num_skes)\n raw_denoised_joints = []\n raw_denoised_colors = []\n frames_cnt = []\n for (idx, bodies_data) in enumerate(raw_skes_data):\n ske_name = bodies_data['name']\n print('Processing %s' % ske_name)\n num_bodies = len(bodies_data['data'])\n if num_bodies == 1: # only 1 actor\n num_frames = bodies_data['num_frames']\n body_data = list(bodies_data['data'].values())[0]\n joints, colors = get_one_actor_points(body_data, num_frames)\n else: # more than 1 actor, select two main actors\n joints, colors = get_two_actors_points(bodies_data)\n # Remove missing frames\n joints, colors = remove_missing_frames(ske_name, joints, colors)\n num_frames = joints.shape[0] # Update\n # Visualize selected actors' skeletons on RGB videos.\n raw_denoised_joints.append(joints)", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:420-445" + }, + "3765": { + "file_id": 319, + "content": "Code reads raw skeleton sequence data from file, counts the number of sequences, and processes each sequence by extracting joints and color data. It handles single or multiple actors in a sequence, removes missing frames if necessary, and stores processed data into separate lists for further processing.", + "type": "comment" + }, + "3766": { + "file_id": 319, + "content": " raw_denoised_colors.append(colors)\n frames_cnt.append(num_frames)\n if (idx + 1) % 1000 == 0:\n print('Processed: %.2f%% (%d / %d), ' % \\\n (100.0 * (idx + 1) / num_skes, idx + 1, num_skes) + \\\n 'Missing count: %d' % missing_count)\n raw_skes_joints_pkl = osp.join(save_path, 'raw_denoised_joints.pkl')\n with open(raw_skes_joints_pkl, 'wb') as f:\n pickle.dump(raw_denoised_joints, f, pickle.HIGHEST_PROTOCOL)\n raw_skes_colors_pkl = osp.join(save_path, 'raw_denoised_colors.pkl')\n with open(raw_skes_colors_pkl, 'wb') as f:\n pickle.dump(raw_denoised_colors, f, pickle.HIGHEST_PROTOCOL)\n frames_cnt = np.array(frames_cnt, dtype=np.int)\n np.savetxt(osp.join(save_path, 'frames_cnt.txt'), frames_cnt, fmt='%d')\n print('Saved raw denoised positions of {} frames into {}'.format(\n np.sum(frames_cnt), raw_skes_joints_pkl))\n print('Found %d files that have missing data' % missing_count)\nif __name__ == '__main__':\n get_raw_denoised_data()", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_denoised_data.py:446-471" + }, + "3767": { + "file_id": 319, + "content": "The code iterates over a set of skes, appends raw denoised joints and colors to lists, prints progress, and saves the data into pickle files and text file. It also counts missing data and reports it at the end.", + "type": "comment" + }, + "3768": { + "file_id": 320, + "content": "/data/ntu-rgb-d/get_raw_skes_data.py", + "type": "filepath" + }, + "3769": { + "file_id": 320, + "content": "This function processes skeleton data, extracts body information, updates a dictionary with the data and returns the skeleton name, body data, and frame count. It handles missing frames and calculates motion using NTU RGB-D dataset data. 
Additionally, it combines and processes raw skeleton data from multiple files, updating progress, filters out missing frames, logs events, and saves the filtered data into pickle files.", + "type": "summary" + }, + "3770": { + "file_id": 320, + "content": "# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/get_raw_skes_data.py\nimport os.path as osp\nimport os\nimport numpy as np\nimport pickle\nimport logging\ndef get_raw_bodies_data(skes_path, ske_name, frames_drop_skes,\n frames_drop_logger):\n \"\"\"\n Get raw bodies data from a skeleton sequence.\n Each body's data is a dict that contains the following keys:\n - joints: raw 3D joints positions. Shape: (num_frames x 25, 3)\n - colors: raw 2D color locations. Shape: (num_frames, 25, 2)\n - interval: a list which stores the frame indices of this body.\n - motion: motion amount (only for the sequence with 2 or more bodyIDs).\n Return:\n a dict for a skeleton sequence with 3 key-value pairs:\n - name: the skeleton filename.\n - data: a dict which stores raw data of each body.\n - num_frames: the number of valid frames.\n \"\"\"\n ske_file = osp.join(skes_path, ske_name + '.skeleton')\n assert osp.exists(ske_file), 'Error: Skeleton file %s not found' % ske_file", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_skes_data.py:1-28" + }, + "3771": { + "file_id": 320, + "content": "This function gets raw bodies data from a skeleton sequence by loading the file and checking its existence. It returns a dictionary with three key-value pairs: name (skeleton filename), data (raw data of each body), and num_frames (number of valid frames).", + "type": "comment" + }, + "3772": { + "file_id": 320, + "content": " # Read all data from .skeleton file into a list (in string format)\n print('Reading data from %s' % ske_file[-29:])\n with open(ske_file, 'r') as fr:\n str_data = fr.readlines()\n num_frames = int(str_data[0].strip('\\r\\n'))\n frames_drop = []\n bodies_data = dict()\n valid_frames = -1 # 0-based index\n current_line = 1\n for f in range(num_frames):\n num_bodies = int(str_data[current_line].strip('\\r\\n'))\n current_line += 1\n if num_bodies == 0: # no data in this frame, drop it\n frames_drop.append(f) # 0-based index\n continue\n valid_frames += 1\n joints = np.zeros((num_bodies, 25, 3), dtype=np.float32)\n colors = np.zeros((num_bodies, 25, 2), dtype=np.float32)\n for b in range(num_bodies):\n bodyID = str_data[current_line].strip('\\r\\n').split()[0]\n current_line += 1\n num_joints = int(str_data[current_line].strip('\\r\\n')) # 25 joints\n current_line += 1\n for j in range(num_joints):", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_skes_data.py:29-58" + }, + "3773": { + "file_id": 320, + "content": "Reading and processing .skeleton file data into a list, storing number of frames, ignoring frames with no bodies, extracting body IDs, and counting the number of joints for each body.", + "type": "comment" + }, + "3774": { + "file_id": 320, + "content": " temp_str = str_data[current_line].strip('\\r\\n').split()\n joints[b, j, :] = np.array(temp_str[:3], dtype=np.float32)\n colors[b, j, :] = np.array(temp_str[5:7], dtype=np.float32)\n current_line += 1\n if bodyID not in bodies_data: # Add a new body's data\n body_data = dict()\n body_data['joints'] = joints[b] # ndarray: (25, 3)\n body_data['colors'] = colors[b,\n np.newaxis] # ndarray: (1, 25, 2)\n body_data['interval'] = [valid_frames\n ] # the index of the first frame\n else: # Update an already existed body's data\n body_data = bodies_data[bodyID]\n # 
Stack each body's data of each frame along the frame order\n body_data['joints'] = np.vstack(\n (body_data['joints'], joints[b]))\n body_data['colors'] = np.vstack(", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_skes_data.py:59-76" + }, + "3775": { + "file_id": 320, + "content": "This code reads data from a file, extracts joint and color information for each body, and updates or adds body data to a dictionary based on the body ID. The joint and color arrays are created using numpy functions, and the joints array is stacked along the frame order if the body's data already exists in the dictionary.", + "type": "comment" + }, + "3776": { + "file_id": 320, + "content": " (body_data['colors'], colors[b, np.newaxis]))\n pre_frame_idx = body_data['interval'][-1]\n body_data['interval'].append(pre_frame_idx +\n 1) # add a new frame index\n bodies_data[bodyID] = body_data # Update bodies_data\n num_frames_drop = len(frames_drop)\n assert num_frames_drop < num_frames, \\\n 'Error: All frames data (%d) of %s is missing or lost' % (num_frames, ske_name)\n if num_frames_drop > 0:\n frames_drop_skes[ske_name] = np.array(frames_drop, dtype=np.int)\n frames_drop_logger.info('{}: {} frames missed: {}\\n'.format(\n ske_name, num_frames_drop, frames_drop))\n # Calculate motion (only for the sequence with 2 or more bodyIDs)\n if len(bodies_data) > 1:\n for body_data in bodies_data.values():\n body_data['motion'] = np.sum(np.var(body_data['joints'], axis=0))\n return {\n 'name': ske_name,\n 'data': bodies_data,\n 'num_frames': num_frames - num_frames_drop", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_skes_data.py:77-100" + }, + "3777": { + "file_id": 320, + "content": "This code retrieves raw data for a specific subject's skeleton (ske_name) from the NTU RGB-D dataset. It handles missing frames, calculates motion based on body data with multiple bodyIDs and returns the skeleton name, body data and updated frame count.", + "type": "comment" + }, + "3778": { + "file_id": 320, + "content": " }\ndef get_raw_skes_data():\n skes_name = np.loadtxt(skes_name_file, dtype=str)\n num_files = skes_name.size\n print('Found %d available skeleton files.' % num_files)\n raw_skes_data = []\n frames_cnt = np.zeros(num_files, dtype=np.int)\n for (idx, ske_name) in enumerate(skes_name):\n bodies_data = get_raw_bodies_data(skes_path, ske_name, frames_drop_skes,\n frames_drop_logger)\n raw_skes_data.append(bodies_data)\n frames_cnt[idx] = bodies_data['num_frames']\n if (idx + 1) % 1000 == 0:\n print('Processed: %.2f%% (%d / %d)' % \\\n (100.0 * (idx + 1) / num_files, idx + 1, num_files))\n with open(save_data_pkl, 'wb') as fw:\n pickle.dump(raw_skes_data, fw, pickle.HIGHEST_PROTOCOL)\n np.savetxt(osp.join(save_path, 'raw_data', 'frames_cnt.txt'),\n frames_cnt,\n fmt='%d')\n print('Saved raw bodies data into %s' % save_data_pkl)\n print('Total frames: %d' % np.sum(frames_cnt))", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_skes_data.py:101-130" + }, + "3779": { + "file_id": 320, + "content": "This function retrieves raw skeleton data from multiple files, processes it, and saves the combined data in a file. 
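The per-body accumulation pattern described above amounts to growing two arrays and a frame-index list keyed by bodyID; a condensed sketch of that bookkeeping (the `add_frame` helper and the toy data are hypothetical, not part of the script):

```python
import numpy as np

bodies_data = {}

def add_frame(body_id, frame_joints, frame_colors, frame_idx):
    """Append one frame's (25, 3) joints and (25, 2) colors to a body's running record."""
    if body_id not in bodies_data:                       # first time this bodyID appears
        bodies_data[body_id] = {
            'joints': frame_joints,                      # (25, 3)
            'colors': frame_colors[np.newaxis],          # (1, 25, 2)
            'interval': [frame_idx],                     # frame indices this body appears in
        }
    else:                                                # grow the existing record frame by frame
        rec = bodies_data[body_id]
        rec['joints'] = np.vstack((rec['joints'], frame_joints))
        rec['colors'] = np.vstack((rec['colors'], frame_colors[np.newaxis]))
        rec['interval'].append(frame_idx)

add_frame('72057594037931101', np.zeros((25, 3), np.float32), np.zeros((25, 2), np.float32), 0)
add_frame('72057594037931101', np.ones((25, 3), np.float32), np.ones((25, 2), np.float32), 1)
print(bodies_data['72057594037931101']['joints'].shape)  # (50, 3)
```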
It keeps track of the number of frames for each file and prints progress updates every 1000 files processed.", + "type": "comment" + }, + "3780": { + "file_id": 320, + "content": " with open(frames_drop_pkl, 'wb') as fw:\n pickle.dump(frames_drop_skes, fw, pickle.HIGHEST_PROTOCOL)\nif __name__ == '__main__':\n save_path = './'\n skes_path = '../ntu-rgb-d/nturgb+d_skeletons/'\n stat_path = osp.join(save_path, 'statistics')\n if not osp.exists('./raw_data'):\n os.makedirs('./raw_data')\n skes_name_file = osp.join(stat_path, 'skes_available_name.txt')\n save_data_pkl = osp.join(save_path, 'raw_data', 'raw_skes_data.pkl')\n frames_drop_pkl = osp.join(save_path, 'raw_data', 'frames_drop_skes.pkl')\n frames_drop_logger = logging.getLogger('frames_drop')\n frames_drop_logger.setLevel(logging.INFO)\n frames_drop_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'raw_data', 'frames_drop.log')))\n frames_drop_skes = dict()\n get_raw_skes_data()\n with open(frames_drop_pkl, 'wb') as fw:\n pickle.dump(frames_drop_skes, fw, pickle.HIGHEST_PROTOCOL)", + "type": "code", + "location": "/data/ntu-rgb-d/get_raw_skes_data.py:132-157" + }, + "3781": { + "file_id": 320, + "content": "This code reads data from the NTU-RGB+D dataset, filters out frames with missing skeleton data, and saves it into two pickle files. The data is read from a specific path, and if the raw_data directory does not exist, it creates one. A logger for frames drop events is also set up and logs to a file. Finally, the code dumps the filtered frames data into another pickle file.", + "type": "comment" + }, + "3782": { + "file_id": 321, + "content": "/data/ntu-rgb-d/seq_transformation.py", + "type": "filepath" + }, + "3783": { + "file_id": 321, + "content": "The code imports modules, defines functions for data processing and splitting, handles evaluation cases, transforms joints, encodes labels, and saves the training/testing sets in suitable formats. It applies translation, alignment, and uses \"split_dataset\" function to create train/test indices before printing 'Done!'.", + "type": "summary" + }, + "3784": { + "file_id": 321, + "content": "# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/seq_transformation.py\nimport os\nimport os.path as osp\nimport numpy as np\nimport pickle\nimport logging\nfrom sklearn.model_selection import train_test_split\nroot_path = './'\nstat_path = osp.join(root_path, 'statistics')\nsetup_file = osp.join(stat_path, 'setup.txt')\ncamera_file = osp.join(stat_path, 'camera.txt')\nperformer_file = osp.join(stat_path, 'performer.txt')\nreplication_file = osp.join(stat_path, 'replication.txt')\nlabel_file = osp.join(stat_path, 'label.txt')\nskes_name_file = osp.join(stat_path, 'skes_available_name.txt')\ndenoised_path = osp.join(root_path, 'denoised_data')\nraw_skes_joints_pkl = osp.join(denoised_path, 'raw_denoised_joints.pkl')\nframes_file = osp.join(denoised_path, 'frames_cnt.txt')\nsave_path = './'\nif not osp.exists(save_path):\n os.mkdir(save_path)\ndef remove_nan_frames(ske_name, ske_joints, nan_logger):\n num_frames = ske_joints.shape[0]\n valid_frames = []\n for f in range(num_frames):\n if not np.any(np.isnan(ske_joints[f])):", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:1-34" + }, + "3785": { + "file_id": 321, + "content": "This code imports necessary modules and defines constants for file paths. 
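The NaN filtering that `remove_nan_frames` performs keeps only frames whose joint vector contains no NaN; an equivalent vectorised sketch on toy data (shapes are hypothetical):

```python
import numpy as np

ske_joints = np.random.rand(10, 75).astype(np.float32)  # hypothetical (num_frames, 25 joints * 3) sequence
ske_joints[3, 5] = np.nan                                # simulate one corrupted frame

valid = ~np.isnan(ske_joints).any(axis=1)                # frames with no NaN anywhere in their joint vector
ske_joints = ske_joints[valid]
print(ske_joints.shape)                                  # (9, 75)
```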
It checks if a directory exists, creates it if not, and defines a function to remove frames with NaN values while logging such occurrences.", + "type": "comment" + }, + "3786": { + "file_id": 321, + "content": " valid_frames.append(f)\n else:\n nan_indices = np.where(np.isnan(ske_joints[f]))[0]\n nan_logger.info('{}\\t{:^5}\\t{}'.format(ske_name, f + 1,\n nan_indices))\n return ske_joints[valid_frames]\ndef seq_translation(skes_joints):\n for idx, ske_joints in enumerate(skes_joints):\n num_frames = ske_joints.shape[0]\n num_bodies = 1 if ske_joints.shape[1] == 75 else 2\n if num_bodies == 2:\n missing_frames_1 = np.where(ske_joints[:, :75].sum(axis=1) == 0)[0]\n missing_frames_2 = np.where(ske_joints[:, 75:].sum(axis=1) == 0)[0]\n cnt1 = len(missing_frames_1)\n cnt2 = len(missing_frames_2)\n i = 0 # get the \"real\" first frame of actor1\n while i < num_frames:\n if np.any(ske_joints[i, :75] != 0):\n break\n i += 1\n origin = np.copy(ske_joints[i, 3:6]) # new origin: joint-2\n for f in range(num_frames):\n if num_bodies == 1:", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:35-63" + }, + "3787": { + "file_id": 321, + "content": "The code defines a function \"seq_translation\" that iterates through multiple skeleton joints sequences. It checks for missing frames and calculates the origin point. It returns valid frames only if any are found, or logs nan indices otherwise. The code also handles cases with one or two bodies in the sequence.", + "type": "comment" + }, + "3788": { + "file_id": 321, + "content": " ske_joints[f] -= np.tile(origin, 25)\n else: # for 2 actors\n ske_joints[f] -= np.tile(origin, 50)\n if (num_bodies == 2) and (cnt1 > 0):\n ske_joints[missing_frames_1, :75] = np.zeros((cnt1, 75),\n dtype=np.float32)\n if (num_bodies == 2) and (cnt2 > 0):\n ske_joints[missing_frames_2, 75:] = np.zeros((cnt2, 75),\n dtype=np.float32)\n skes_joints[idx] = ske_joints # Update\n return skes_joints\ndef frame_translation(skes_joints, skes_name, frames_cnt):\n nan_logger = logging.getLogger('nan_skes')\n nan_logger.setLevel(logging.INFO)\n nan_logger.addHandler(logging.FileHandler(\"./nan_frames.log\"))\n nan_logger.info('{}\\t{}\\t{}'.format('Skeleton', 'Frame', 'Joints'))\n for idx, ske_joints in enumerate(skes_joints):\n num_frames = ske_joints.shape[0]\n # Calculate the distance between spine base (joint-1) and spine (joint-21)", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:64-89" + }, + "3789": { + "file_id": 321, + "content": "This code is performing sequence transformation for NTU RGB+D dataset. It subtracts origin from joint coordinates and handles missing frames by setting them to zero if there are only two actors. 
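The re-centring step in `seq_translation` subtracts joint-2 of the first non-empty frame from every joint of every frame via `np.tile`; a minimal single-actor sketch (toy data, hypothetical shapes):

```python
import numpy as np

ske_joints = np.random.rand(8, 75).astype(np.float32)  # hypothetical single-actor (frames, 25 * 3) sequence

origin = ske_joints[0, 3:6].copy()      # joint-2 (middle of the spine) of the first frame
ske_joints -= np.tile(origin, 25)       # tile to (75,) so the same offset hits every joint in every frame

print(np.allclose(ske_joints[0, 3:6], 0.0))  # True: joint-2 of frame 0 is now the origin
```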
It also logs information about skeletons, frames, and joints using a logger.", + "type": "comment" + }, + "3790": { + "file_id": 321, + "content": " j1 = ske_joints[:, 0:3]\n j21 = ske_joints[:, 60:63]\n dist = np.sqrt(((j1 - j21)**2).sum(axis=1))\n for f in range(num_frames):\n origin = ske_joints[f, 3:\n 6] # new origin: middle of the spine (joint-2)\n if (ske_joints[f, 75:] == 0).all():\n ske_joints[f, :75] = (ske_joints[f, :75] - np.tile(origin, 25)) / \\\n dist[f] + np.tile(origin, 25)\n else:\n ske_joints[f] = (ske_joints[f] - np.tile(origin, 50)) / \\\n dist[f] + np.tile(origin, 50)\n ske_name = skes_name[idx]\n ske_joints = remove_nan_frames(ske_name, ske_joints, nan_logger)\n frames_cnt[idx] = num_frames # update valid number of frames\n skes_joints[idx] = ske_joints\n return skes_joints, frames_cnt\ndef align_frames(skes_joints, frames_cnt):\n \"\"\"\n Align all sequences with the same frame length.\n \"\"\"\n num_skes = len(skes_joints)\n max_num_frames = frames_cnt.max() # 300", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:90-118" + }, + "3791": { + "file_id": 321, + "content": "This code aligns all sequences to the same frame length by subtracting the origin (middle of spine joint) from each skeleton joint, normalizing the resulting coordinates based on the distance between the new origin and original origin. It updates the number of valid frames for each sequence and returns the aligned skeleton joints and updated frame counts.", + "type": "comment" + }, + "3792": { + "file_id": 321, + "content": " aligned_skes_joints = np.zeros((num_skes, max_num_frames, 150),\n dtype=np.float32)\n for idx, ske_joints in enumerate(skes_joints):\n num_frames = ske_joints.shape[0]\n num_bodies = 1 if ske_joints.shape[1] == 75 else 2\n if num_bodies == 1:\n aligned_skes_joints[idx, :num_frames] = np.hstack(\n (ske_joints, np.zeros_like(ske_joints)))\n else:\n aligned_skes_joints[idx, :num_frames] = ske_joints\n return aligned_skes_joints\ndef one_hot_vector(labels):\n num_skes = len(labels)\n labels_vector = np.zeros((num_skes, 60))\n for idx, l in enumerate(labels):\n labels_vector[idx, l] = 1\n return labels_vector\ndef split_train_val(train_indices, method='sklearn', ratio=0.05):\n \"\"\"\n Get validation set by splitting data randomly from training set with two methods.\n In fact, I thought these two methods are equal as they got the same performance.\n \"\"\"\n if method == 'sklearn':\n return train_test_split(train_indices,", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:119-150" + }, + "3793": { + "file_id": 321, + "content": "This code is part of the PaddleVideo library and contains three functions. The first function, `seq_transformation`, takes a list of skeleton joints and transforms them into aligned positions for all frames. It handles cases where there are either one or two bodies. The second function, `one_hot_vector`, converts a list of labels into a one-hot encoded vector. 
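`one_hot_vector` maps integer action labels in [0, 60) to 60-dimensional indicator rows; a compact sketch that uses fancy indexing instead of the script's Python loop (the toy labels are hypothetical):

```python
import numpy as np

labels = np.array([0, 5, 59, 5])                           # hypothetical action labels in [0, 60)

labels_vector = np.zeros((len(labels), 60), dtype=np.float32)
labels_vector[np.arange(len(labels)), labels] = 1.0        # one 1 per row, at the label's column

print(labels_vector.sum(axis=1))                           # [1. 1. 1. 1.]
```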
Lastly, the third function, `split_train_val`, splits the training set into train and validation sets using a specified method (either 'sklearn' or user-defined) and ratio.", + "type": "comment" + }, + "3794": { + "file_id": 321, + "content": " test_size=ratio,\n random_state=10000)\n else:\n np.random.seed(10000)\n np.random.shuffle(train_indices)\n val_num_skes = int(np.ceil(0.05 * len(train_indices)))\n val_indices = train_indices[:val_num_skes]\n train_indices = train_indices[val_num_skes:]\n return train_indices, val_indices\ndef split_dataset(skes_name, skes_joints, label, performer, camera, evaluation,\n save_path):\n train_indices, test_indices = get_indices(performer, camera, evaluation)\n m = 'sklearn' # 'sklearn' or 'numpy'\n # Select validation set from training set\n # train_indices, val_indices = split_train_val(train_indices, m)\n # Save labels and num_frames for each sequence of each data set\n train_labels = label[train_indices]\n test_labels = label[test_indices]\n train_x = skes_joints[train_indices]\n # train_y = one_hot_vector(train_labels)\n test_x = skes_joints[test_indices]\n # test_y = one_hot_vector(test_labels)", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:151-176" + }, + "3795": { + "file_id": 321, + "content": "This code defines a function to split a dataset into training and validation sets based on the input parameters. It also includes functionality for selecting the validation set from the training set using either sklearn or numpy methods, and saving labels and features (joints positions) for each sequence of each dataset.", + "type": "comment" + }, + "3796": { + "file_id": 321, + "content": " evaluation_path = osp.join(save_path, evaluation)\n isExists = osp.exists(evaluation_path)\n if not isExists:\n os.makedirs(evaluation_path)\n train_data_save_path = osp.join(evaluation_path, 'train_data.npy')\n train_label_save_path = osp.join(evaluation_path, 'train_label.pkl')\n val_data_save_path = osp.join(evaluation_path, 'val_data.npy')\n val_label_save_path = osp.join(evaluation_path, 'val_label.pkl')\n # reshape data\n N, T, VC = train_x.shape\n train_x = np.reshape(train_x, (N, T, 2, 25, 3))\n train_x = np.transpose(train_x, (0, 4, 1, 3, 2))\n N, T, VC = test_x.shape\n test_x = np.reshape(test_x, (N, T, 2, 25, 3))\n test_x = np.transpose(test_x, (0, 4, 1, 3, 2))\n # save train\n np.save(train_data_save_path, train_x)\n out = [skes_name[train_indices], train_labels]\n with open(train_label_save_path, 'wb') as f:\n pickle.dump(out, f)\n # save test\n np.save(val_data_save_path, test_x)\n out = [skes_name[test_indices], test_labels]\n with open(val_label_save_path, 'wb') as f:", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:178-204" + }, + "3797": { + "file_id": 321, + "content": "The code is creating evaluation paths, checking if they exist, and then initializing the paths for train_data.npy, train_label.pkl, val_data.npy, and val_label.pkl files. It reshapes the train and test data and saves them using np.save() function. 
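The reshape/transpose in `split_dataset` converts the flattened (N, T, 150) joint tensor into the (N, C, T, V, M) layout consumed by skeleton models; a shape-only sketch with random data and hypothetical sizes:

```python
import numpy as np

N, T = 4, 300                                        # hypothetical sequence count and aligned frame length
x = np.random.rand(N, T, 150).astype(np.float32)     # 2 bodies * 25 joints * 3 coords = 150 values per frame

x = np.reshape(x, (N, T, 2, 25, 3))                  # split the flat axis into (M=2, V=25, C=3)
x = np.transpose(x, (0, 4, 1, 3, 2))                 # reorder to (N, C, T, V, M)
print(x.shape)                                       # (4, 3, 300, 25, 2)
```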
The train and test labels are also saved separately in pickle format.", + "type": "comment" + }, + "3798": { + "file_id": 321, + "content": " pickle.dump(out, f)\ndef get_indices(performer, camera, evaluation='xsub'):\n test_indices = np.empty(0)\n train_indices = np.empty(0)\n if evaluation == 'xsub': # Cross Subject (Subject IDs)\n train_ids = [\n 1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 25, 27, 28, 31, 34,\n 35, 38\n ]\n test_ids = [\n 3, 6, 7, 10, 11, 12, 20, 21, 22, 23, 24, 26, 29, 30, 32, 33, 36, 37,\n 39, 40\n ]\n # Get indices of test data\n for idx in test_ids:\n temp = np.where(performer == idx)[0] # 0-based index\n test_indices = np.hstack((test_indices, temp)).astype(np.int)\n # Get indices of training data\n for train_id in train_ids:\n temp = np.where(performer == train_id)[0] # 0-based index\n train_indices = np.hstack((train_indices, temp)).astype(np.int)\n else: # Cross View (Camera IDs)\n train_ids = [2, 3]\n test_ids = 1\n # Get indices of test data\n temp = np.where(camera == test_ids)[0] # 0-based index", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:205-235" + }, + "3799": { + "file_id": 321, + "content": "This function, `get_indices`, takes performer and camera as inputs and returns the indices of training and test data based on either cross-subject or cross-view evaluation. For cross-subject, it selects train/test IDs, then finds their respective indices in the performer array. Similarly, for cross-view, it selects train/test camera IDs and finds their indices. The code handles both cases and returns the training and test indices separately.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/38.json b/docs/data/38.json new file mode 100644 index 000000000..8ad1a141e --- /dev/null +++ b/docs/data/38.json @@ -0,0 +1,544 @@ +{ + "3800": { + "file_id": 321, + "content": " test_indices = np.hstack((test_indices, temp)).astype(np.int)\n # Get indices of training data\n for train_id in train_ids:\n temp = np.where(camera == train_id)[0] # 0-based index\n train_indices = np.hstack((train_indices, temp)).astype(np.int)\n return train_indices, test_indices\nif __name__ == '__main__':\n camera = np.loadtxt(camera_file, dtype=np.int) # camera id: 1, 2, 3\n performer = np.loadtxt(performer_file, dtype=np.int) # subject id: 1~40\n label = np.loadtxt(label_file, dtype=np.int) - 1 # action label: 0~59\n frames_cnt = np.loadtxt(frames_file, dtype=np.int) # frames_cnt\n skes_name = np.loadtxt(skes_name_file, dtype=np.string_)\n with open(raw_skes_joints_pkl, 'rb') as fr:\n skes_joints = pickle.load(fr) # a list\n skes_joints = seq_translation(skes_joints)\n skes_joints = align_frames(skes_joints,\n frames_cnt) # aligned to the same frame length\n evaluations = ['xview', 'xsub']\n for evaluation in evaluations:", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:236-263" + }, + "3801": { + "file_id": 321, + "content": "Code reads camera, performer, label, and frames_cnt from respective files. It loads skes_name and skes_joints data from file using pickle. Applies seq_translation and align_frames to skes_joints. Creates test_indices and train_ids by filtering camera ids. 
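The cross-subject index selection is plain membership filtering on the performer array; the sketch below mirrors the script's `np.where` + `np.hstack` pattern on toy IDs and adds an equivalent `np.isin` one-liner for comparison:

```python
import numpy as np

performer = np.array([1, 3, 2, 6, 4, 7, 1])   # hypothetical subject ID of each sequence
train_ids = [1, 2, 4]                          # a few of the 20 xsub training subjects

train_indices = np.empty(0, dtype=np.int64)
for train_id in train_ids:                     # same np.where + np.hstack pattern as the script
    temp = np.where(performer == train_id)[0]
    train_indices = np.hstack((train_indices, temp)).astype(np.int64)

print(train_indices)                                   # [0 6 2 4], grouped by subject ID
print(np.where(np.isin(performer, train_ids))[0])      # [0 2 4 6], same index set in sorted order
```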
Returns train_indices and test_indices.", + "type": "comment" + }, + "3802": { + "file_id": 321, + "content": " split_dataset(skes_name, skes_joints, label, performer, camera,\n evaluation, save_path)\n print('Done!')", + "type": "code", + "location": "/data/ntu-rgb-d/seq_transformation.py:264-266" + }, + "3803": { + "file_id": 321, + "content": "The code is calling the \"split_dataset\" function to process data, likely dividing it into training and testing sets. The input parameters include various file names, labels, performers, cameras, evaluation criteria, and a save path. Once completed, it prints 'Done!'.", + "type": "comment" + }, + "3804": { + "file_id": 322, + "content": "/deploy/cpp_infer/external-cmake/auto-log.cmake", + "type": "filepath" + }, + "3805": { + "file_id": 322, + "content": "This code is used to find and include the Git package, declare an external project named \"extern_Autolog\" using FetchContent, set its base directory, specify the repository URL and tag, and finally make the external project available for use.", + "type": "summary" + }, + "3806": { + "file_id": 322, + "content": "find_package(Git REQUIRED)\ninclude(FetchContent)\nset(FETCHCONTENT_BASE_DIR \"${CMAKE_CURRENT_BINARY_DIR}/third-party\")\nFetchContent_Declare(\n extern_Autolog\n PREFIX autolog\n GIT_REPOSITORY https://github.com/LDOUBLEV/AutoLog.git\n GIT_TAG main\n)\nFetchContent_MakeAvailable(extern_Autolog)", + "type": "code", + "location": "/deploy/cpp_infer/external-cmake/auto-log.cmake:1-12" + }, + "3807": { + "file_id": 322, + "content": "This code is used to find and include the Git package, declare an external project named \"extern_Autolog\" using FetchContent, set its base directory, specify the repository URL and tag, and finally make the external project available for use.", + "type": "comment" + }, + "3808": { + "file_id": 323, + "content": "/deploy/cpp_infer/include/postprocess_op.h", + "type": "filepath" + }, + "3809": { + "file_id": 323, + "content": "This code defines a Softmax class with an Inplace_Run method that applies softmax function in-place to iterator ranges of float vectors. The code also includes a virtual function for postprocessing operations in the PaddleVideo library.", + "type": "summary" + }, + "3810": { + "file_id": 323, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \"include/utility.h\"\nnamespace PaddleVideo\n{\n class Softmax\n {\n public:\n virtual void Inplace_Run(const std::vector::iterator &_begin, const std::vector::iterator &_end);", + "type": "code", + "location": "/deploy/cpp_infer/include/postprocess_op.h:1-39" + }, + "3811": { + "file_id": 323, + "content": "This code defines a class Softmax that contains a method Inplace_Run. 
The method takes an iterator range of a vector of floats and applies softmax function in-place to the values within this range.", + "type": "comment" + }, + "3812": { + "file_id": 323, + "content": " virtual std::vector Run(const std::vector::iterator &_begin, const std::vector::iterator &_end);\n };\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/include/postprocess_op.h:40-43" + }, + "3813": { + "file_id": 323, + "content": "This code defines a virtual function that takes in two iterators to a vector of floats and returns a vector of floats as output. It is part of the PaddleVideo library's postprocessing operation namespace.", + "type": "comment" + }, + "3814": { + "file_id": 324, + "content": "/deploy/cpp_infer/include/preprocess_op.h", + "type": "filepath" + }, + "3815": { + "file_id": 324, + "content": "This code defines the Normalize class for image normalization, along with several preprocessing operation classes like Permute, Scale, CenterCrop, and TenCrop, which can be used in PaddleVideo library for preparing images before inference.", + "type": "summary" + }, + "3816": { + "file_id": 324, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \nusing namespace std;\nusing namespace paddle;\nnamespace PaddleVideo\n{\n class Normalize\n {\n public:\n virtual void Run(cv::Mat *im, const std::vector &mean,", + "type": "code", + "location": "/deploy/cpp_infer/include/preprocess_op.h:1-39" + }, + "3817": { + "file_id": 324, + "content": "This code defines a class called Normalize with a Run method that takes in an input image and a vector of means for normalization. 
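For reference, the softmax applied by this post-process op is the usual exponentiate-and-normalise over the class scores; the NumPy analogue below shows what such a step computes (with a max-shift for numerical stability) and is only a sketch, not the C++ implementation itself:

```python
import numpy as np

scores = np.array([2.0, 1.0, 0.1, 4.5], dtype=np.float32)  # hypothetical class logits for one clip

shifted = scores - scores.max()    # subtract the max so exp() cannot overflow
probs = np.exp(shifted)
probs /= probs.sum()               # normalise so the outputs sum to 1

print(probs, probs.sum())
```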
It is part of the PaddleVideo library, which likely uses OpenCV for image processing tasks.", + "type": "comment" + }, + "3818": { + "file_id": 324, + "content": " const std::vector &scale, const bool is_scale = true);\n };\n // RGB -> CHW\n class Permute\n {\n public:\n virtual void Run(const cv::Mat *img, float *data);\n };\n class Scale\n {\n public:\n virtual void Run(const cv::Mat &img, cv::Mat &resize_img,\n bool use_tensorrt = false,\n const int &short_size = 256);\n };\n class CenterCrop\n {\n public:\n virtual void Run(const cv::Mat &img, cv::Mat &crop_img,\n bool use_tensorrt = false,\n const int &target_size = 224);\n };\n class TenCrop\n {\n public:\n virtual void Run(const cv::Mat &img, std::vector &crop_frames,\n const int &begin_index,\n bool use_tensorrt = false,\n const int &target_size = 224);\n };\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/include/preprocess_op.h:40-74" + }, + "3819": { + "file_id": 324, + "content": "The code defines several classes representing image preprocessing operations, including Permute for changing RGB to CHW format, Scale for resizing images, CenterCrop for cropping images to a specific size, and TenCrop for splitting an image into multiple crops. These classes can be used in the PaddleVideo library for preparing images before running inference with deep learning models.", + "type": "comment" + }, + "3820": { + "file_id": 325, + "content": "/deploy/cpp_infer/include/utility.h", + "type": "filepath" + }, + "3821": { + "file_id": 325, + "content": "This class offers static methods to read dictionary files and perform utility operations related to PaddleVideo, including functions for file handling, image manipulation, value indexing, and frame sampling.", + "type": "summary" + }, + "3822": { + "file_id": 325, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \"opencv2/opencv.hpp\"\nnamespace PaddleVideo\n{\n class Utility\n {\n public:\n static std::vector ReadDict(const std::string &path);", + "type": "code", + "location": "/deploy/cpp_infer/include/utility.h:1-40" + }, + "3823": { + "file_id": 325, + "content": "Utility class for PaddleVideo containing static methods to read dictionary files and perform various utility operations.", + "type": "comment" + }, + "3824": { + "file_id": 325, + "content": " static void GetAllFiles(const char *dir_name, std::vector &all_inputs);\n static cv::Mat GetRotateCropImage(const cv::Mat &srcimage, std::vector> box);\n template inline static size_t argmax(ForwardIterator first, ForwardIterator last)\n {\n return std::distance(first, std::max_element(first, last));\n }\n static std::vector SampleFramesFromVideo(const std::string &VideoPath, const int &num_seg, const int &seg_len);\n };\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/include/utility.h:42-54" + }, + "3825": { + "file_id": 325, + "content": "The code contains several utility functions. It has a function to get all files in a directory, another for rotating and cropping images based on bounding boxes, a template function for finding the index of maximum value in a range, and one for sampling frames from a video file. All these belong to the PaddleVideo namespace.", + "type": "comment" + }, + "3826": { + "file_id": 326, + "content": "/deploy/cpp_infer/include/video_rec.h", + "type": "filepath" + }, + "3827": { + "file_id": 326, + "content": "This code includes necessary headers for OpenCV and PaddlePaddle integration, defines operations like pre-processing, post-processing, and utility functions. The class creates a VideoRecognizer object with initialization variables, initializing the model and operation objects for inference steps.", + "type": "summary" + }, + "3828": { + "file_id": 326, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \"paddle_api.h\"\n#include \"paddle_inference_api.h\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include ", + "type": "code", + "location": "/deploy/cpp_infer/include/video_rec.h:1-34" + }, + "3829": { + "file_id": 326, + "content": "This code is licensing information and includes necessary headers for OpenCV and PaddlePaddle API integration. It defines various operations such as pre-processing, post-processing, utility functions, and possibly some video recording functionality using the PaddlePaddle library.", + "type": "comment" + }, + "3830": { + "file_id": 326, + "content": "using namespace paddle_infer;\nnamespace PaddleVideo\n{\n class VideoRecognizer\n {\n public:\n explicit VideoRecognizer(const std::string &model_dir, const std::string &inference_model_name, const bool &use_gpu, const int &num_seg,\n const int &rec_batch_num, const int &gpu_id,\n const int &gpu_mem, const int &cpu_math_library_num_threads,\n const bool &use_mkldnn, const std::string &label_path,\n const bool &use_tensorrt, const std::string &precision, const std::vector &_mean = {0.406, 0.456, 0.485},\n const std::vector &_scale = {0.225, 0.224, 0.229})\n {\n this->inference_model_name = inference_model_name;\n this->use_gpu_ = use_gpu;\n this->num_seg = num_seg;\n this->rec_batch_num = rec_batch_num;\n this->gpu_id_ = gpu_id;\n this->gpu_mem_ = gpu_mem;\n this->cpu_math_library_num_threads_ = cpu_math_library_num_threads;", + "type": "code", + "location": "/deploy/cpp_infer/include/video_rec.h:36-57" + }, + "3831": { + "file_id": 326, + "content": "This class is for creating a VideoRecognizer object, which initializes variables such as the model directory, inference model name, use of GPU, number of segments, recording batch number, GPU ID, GPU memory, CPU math library threads, use of MKLDNN, label path, and optionally sets mean and scale values for image preprocessing.", + "type": "comment" + }, + "3832": { + "file_id": 326, + "content": " this->use_mkldnn_ = use_mkldnn;\n this->use_tensorrt_ = use_tensorrt;\n this->precision_ = precision;\n this->mean_ = _mean;\n this->scale_ = _scale;\n this->label_list_ = Utility::ReadDict(label_path);\n LoadModel(model_dir);\n }\n // Load Paddle inference model\n void LoadModel(const std::string &model_dir);\n void Run(const std::vector &frames_batch_path, const std::vector > &frames_batch, std::vector *times);\n private:\n std::string inference_model_name;\n std::shared_ptr predictor_;\n bool use_gpu_ = false;\n int gpu_id_ = 0;\n int rec_batch_num = 1;\n int gpu_mem_ = 4000;\n int cpu_math_library_num_threads_ = 4;\n bool use_mkldnn_ = false;\n int num_seg = 8;\n std::vector label_list_;\n std::vector mean_ = {0.406, 0.456, 0.485};\n std::vector scale_ = {0.225, 0.224, 0.229};", + "type": "code", + 
"location": "/deploy/cpp_infer/include/video_rec.h:58-86" + }, + "3833": { + "file_id": 326, + "content": "This function initializes the video recognition class, sets member variables for model type (use_mkldnn_, use_tensorrt_), precision, mean values, scale values, and loads the label list from a given path. It also calls the LoadModel() function to load the inference model.", + "type": "comment" + }, + "3834": { + "file_id": 326, + "content": " bool is_scale_ = true;\n bool use_tensorrt_ = false;\n std::string precision_ = \"fp32\";\n // Instantiate pre-process operation object(s)\n Scale scale_op_;\n CenterCrop centercrop_op_;\n TenCrop tencrop_op_;\n Normalize normalize_op_;\n Permute permute_op_;\n // Instantiate post-process operation object(s)\n Softmax softmax_op_;\n }; // class VideoRecognizer\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/include/video_rec.h:87-105" + }, + "3835": { + "file_id": 326, + "content": "This code initializes various operation objects for pre-processing and post-processing steps in the VideoRecognizer class of PaddleVideo library. It also sets default values for scale, precision, and use_tensorrt.", + "type": "comment" + }, + "3836": { + "file_id": 327, + "content": "/deploy/cpp_infer/readme.md", + "type": "filepath" + }, + "3837": { + "file_id": 327, + "content": "This code deploys PaddleVideo models with C++, supports optional settings and displays inference results, but encounters an error searching for 'libcudnn.so' due to incorrect/missing CUDNN_LIB_DIR setting.", + "type": "summary" + }, + "3838": { + "file_id": 327, + "content": "[English](./readme_en.md) | 简体中文\n# 服务器端C++预测\n本章节介绍PaddleVideo模型的的C++部署方法,python预测部署方法请参考各自模型的**模型推理**章节。\nC++在性能计算上优于python,因此,在大多数CPU、GPU部署场景,多采用C++的部署方式,本节将介绍如何在Linux(CPU/GPU)环境下配置C++环境并完成\nPaddleVideo模型部署。\n在开始使用之前,您需要按照以下命令安装额外的依赖包:\n```bash\npython -m pip install git+https://github.com/LDOUBLEV/AutoLog\n```\n## 1. 准备环境\n- Linux环境,推荐使用docker。\n- Windows环境,目前支持基于`Visual Studio 2019 Community`进行编译(TODO)\n* 该文档主要介绍基于Linux环境的PaddleVideo C++预测流程,如果需要在Windows下基于预测库进行C++预测,具体编译方法请参考[Windows下编译教程](./docs/windows_vs2019_build.md)(TODO)\n* **准备环境的目的是得到编译好的opencv库与paddle预测库**。\n### 1.1 编译opencv库\n* 首先需要从opencv官网上下载在Linux环境下源码编译的压缩包,并解压成文件夹。以opencv3.4.7为例,下载命令如下:\n ```bash\n cd deploy/cpp_infer\n wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz\n tar -xf 3.4.7.tar.gz\n ```\n 解压完毕后在`deploy/cpp_infer`目录下可以得到解压出的`opencv-3.4.7`的文件夹。\n* 安装ffmpeg\n opencv配合ffmpeg才能在linux下正常读取视频,否则可能遇到视频帧数返回为0或无法读取任何视频帧的情况\n 采用较为简单的apt安装,安装命令如下:\n ```bash\n apt-get update\n apt install libavformat-dev\n apt install libavcodec-dev", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:1-45" + }, + "3839": { + "file_id": 327, + "content": "Explanation of the code: This is an introduction to deploying PaddleVideo models using C++. It provides instructions on setting up a Linux environment and compiling OpenCV and PaddlePaddle libraries for model prediction. The code also mentions the need to install additional dependencies and provides commands for downloading, extracting, and compiling the OpenCV library. 
Additionally, it notes that Windows support is currently under development (TODO) and requires Visual Studio 2019 Community for compilation (TODO).", + "type": "comment" + }, + "3840": { + "file_id": 327, + "content": " apt install libswresample-dev\n apt install libswscale-dev\n apt install libavutil-dev\n apt install libsdl1.2-dev\n apt-get install ffmpeg\n ```\n* 准备编译opencv,首先进入`opencv-3.4.7`的文件夹,然后设置opencv源码路径`root_path`以及安装路径`install_path`。执行命令如下:\n ```bash\n cd opencv-3.4.7\n root_path=$PWD # 当前所在路径即为opencv-3.4.7的绝对路径\n install_path=${root_path}/opencv3\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DCMAKE_INSTALL_PREFIX=${install_path} \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DBUILD_SHARED_LIBS=OFF \\\n -DWITH_IPP=OFF \\\n -DBUILD_IPP_IW=OFF \\\n -DWITH_LAPACK=OFF \\\n -DWITH_EIGEN=OFF \\\n -DCMAKE_INSTALL_LIBDIR=lib64 \\\n -DWITH_ZLIB=ON \\\n -DBUILD_ZLIB=ON \\\n -DWITH_JPEG=ON \\\n -DBUILD_JPEG=ON \\\n -DWITH_PNG=ON \\\n -DBUILD_PNG=ON \\\n -DWITH_TIFF=ON \\\n -DBUILD_TIFF=ON \\\n -DWITH_FFMPEG=ON\n make -j\n make install\n ```\n `make install`完成之后,会在该文件夹下生成opencv头文件和库文件,用于后面的Video推理C++代码编译。\n 最终会以安装路径`install_path`为指定路径,得到一个`opencv3`的文件夹,其文件结构如下所示。", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:46-91" + }, + "3841": { + "file_id": 327, + "content": "Preparing to compile OpenCV, enter the `opencv-3.4.7` directory and set `root_path` and `install_path`. Remove existing `build` folder, create a new one, navigate into it, run cmake commands with specified options, make and install. Results in an `opencv3` folder with header files and libraries for C++ video inference code compilation.", + "type": "comment" + }, + "3842": { + "file_id": 327, + "content": " ```shell\n opencv-3.4.7/\n ├── opencv3/ # 安装在opencv3目录下\n │ ├── bin/\n │ ├── include/\n │ ├── lib/\n │ ├── lib64/\n │ └── share/\n ```\n### 1.2 下载或者编译Paddle预测库\n有2种方式获取Paddle预测库,下面进行详细介绍。\n#### 1.2.1 直接下载安装\n* [Paddle预测库官网](https://paddleinference.paddlepaddle.org.cn/v2.2/user_guides/download_lib.html) 上提供了不同cuda版本的Linux预测库,可以在官网查看并**选择合适的预测库版本**(建议选择paddle版本>=2.0.1版本的预测库,推荐使用2.2.2的预测库)。\n* 下载得到一个`paddle_inference.tgz`压缩包,然后将它解压成文件夹,命令如下(以机器环境为gcc8.2为例):\n ```bash\n wget https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz\n tar -xf paddle_inference.tgz\n ```\n 最终会在当前的文件夹中生成`paddle_inference/`的子文件夹。\n#### 1.2.2 预测库源码编译\n* 如果希望获取最新预测库特性,可以从Paddle github上克隆最新代码,源码编译预测库。\n* 可以参考[Paddle预测库安装编译说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi) 的说明,从github上获取Paddle代码,然后进行编译,生成最新的预测库。使用git获取代码方法如下。\n ```shell", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:93-125" + }, + "3843": { + "file_id": 327, + "content": "In this code snippet, the user is provided with two methods to obtain Paddle prediction library. The first method involves directly downloading a pre-compiled version of the library from the official website based on the desired CUDA version and OS architecture. The second method involves cloning the latest source code from Paddle's GitHub repository and compiling it manually for the most recent features. 
The code also provides sample commands to download and extract a pre-compiled library or clone the Paddle source code using 'wget' and 'tar' commands.", + "type": "comment" + }, + "3844": { + "file_id": 327, + "content": " git clone https://github.com/PaddlePaddle/Paddle.git\n git checkout release/2.2\n ```\n* 进入Paddle目录后,编译方法如下。\n ```shell\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DWITH_CONTRIB=OFF \\\n -DWITH_MKL=ON \\\n -DWITH_MKLDNN=ON \\\n -DWITH_TESTING=OFF \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DWITH_INFERENCE_API_TEST=OFF \\\n -DON_INFER=ON \\\n -DWITH_PYTHON=ON\n make -j4\n make inference_lib_dist -j4 # 4为编译时使用核数,可根据机器情况自行修改\n ```\n 更多编译参数选项介绍可以参考[文档说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi)。\n* 编译完成之后,可以在`build/paddle_inference_install_dir/`文件下看到生成了以下文件及文件夹。\n ```bash\n build/\n └── paddle_inference_install_dir/\n ├── CMakeCache.txt\n ├── paddle/\n ├── third_party/\n └── version.txt\n ```\n 其中`paddle`就是C++预测所需的Paddle库,`version.txt`中包含当前预测库的版本信息。\n## 2. 编译并运行预测demo\n### 2.1 将模型导出为inference model\n* 该步骤与python部署方式下的导出预测模型相同,可以参考各自模型的模型预测章节。导出的几个相关inference model文件用于模型预测。**以PP-TSM为例**,导出预测模型的目录结构如下。", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:126-170" + }, + "3845": { + "file_id": 327, + "content": "The provided code demonstrates how to compile the Paddle inference API library from the source code. It explains the steps for cloning and entering the Paddle repository, setting build parameters, compiling the library using make, and creating a build directory. The comments also mention where to find more information about build parameter options and what files are generated after compilation.", + "type": "comment" + }, + "3846": { + "file_id": 327, + "content": " ```\n inference/\n └── ppTSM/\n ├── ppTSM.pdiparams\n ├── ppTSM.pdiparamsinfo\n └── ppTSM.pdmodel\n ```\n### 2.2 编译PaddleVideo C++预测demo\n* 进入到`deploy/cpp_infer`目录下,执行以下编译命令\n ```shell\n bash tools/build.sh\n ```\n `tools/build.sh`中的Paddle C++预测库、opencv等其他依赖库的地址需要换成自己机器上的实际地址。\n* 具体地,需要修改`tools/build.sh`中的环境路径,相关内容如下:\n ```shell\n OPENCV_DIR=your_opencv_dir\n LIB_DIR=your_paddle_inference_dir\n CUDA_LIB_DIR=your_cuda_lib_dir\n CUDNN_LIB_DIR=your_cudnn_lib_dir\n ```\n 上述参数如下(以下路径用户可根据自己机器的情况对应修改)\n ```bash\n OPENCV_DIR=/path/to/opencv3\n LIB_DIR=/path/to/paddle_inference\n CUDA_LIB_DIR=/usr/local/cuda/lib64\n CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/\n ```\n `OPENCV_DIR`为opencv编译安装的地址\n `LIB_DIR`为下载(`paddle_inference`文件夹)或者编译生成的Paddle预测库地址(`build/paddle_inference_install_dir`文件夹)\n `CUDA_LIB_DIR`为cuda库文件地址,在docker中为`/usr/local/cuda/lib64`\n `CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。\n **如果希望预测时开启TensorRT加速功能,那么还需要修改`tools/build.sh`3处代码**", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:172-213" + }, + "3847": { + "file_id": 327, + "content": "This code is providing instructions to compile the PaddleVideo C++ prediction demo for an inference model. Users need to navigate to the \"deploy/cpp_infer\" directory and execute the `bash tools/build.sh` command. They must also modify the `tools/build.sh` script with their specific openCV, Paddle Inference, CUDA library, and CUDNN library directories before running the build script.", + "type": "comment" + }, + "3848": { + "file_id": 327, + "content": " 1. 设置`DWITH_GPU=ON`\n 2. 设置`DWITH_TENSORRT=ON`\n 3. 
设置`TENSORRT_DIR=/path/to/TensorRT-x.x.x.x`\n **以上路径都写绝对路径,不要写相对路径**\n* 编译完成之后,会在`cpp_infer/build`文件夹下生成一个名为`ppvideo`的可执行文件。\n### 2.3 运行PaddleVideo C++预测demo\n运行方式:\n```bash\n./build/ppvideo [--param1] [--param2] [...]\n```\n其中,`mode`为必选参数,表示选择的功能,取值范围['rec'],表示**视频识别**(更多功能会陆续加入)。\n##### 1. 调用视频识别:\n```bash\n# 调用PP-TSM识别\n./build/ppvideo rec \\\n--rec_model_dir=../../inference/ppTSM \\\n--inference_model_name=ppTSM \\\n--video_dir=./example_video_dir \\\n--num_seg=8 \\\n--seg_len=1\n# 调用PP-TSN识别\n./build/ppvideo rec \\\n--rec_model_dir=../../inference/ppTSN \\\n--inference_model_name=ppTSN \\\n--video_dir=./example_video_dir \\\n--num_seg=25 \\\n--seg_len=1\n```\n更多参数如下:\n- 通用参数\n | 参数名称 | 类型 | 默认参数 | 意义 |\n | ------------- | ---- | --------------- | ------------------------------------------------------------ |\n | use_gpu | bool | false | 是否使用GPU |", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:214-259" + }, + "3849": { + "file_id": 327, + "content": "This code sets the necessary environment variables and provides instructions for running PaddleVideo's C++ prediction demo. It supports video recognition mode with optional parameters such as model directory, inference model name, video directory, number of segments, and segment length. Users can choose from PP-TSM or PP-TSN models.", + "type": "comment" + }, + "3850": { + "file_id": 327, + "content": " | gpu_id | int | 0 | GPU id,使用GPU时有效 |\n | gpu_mem | int | 4000 | 申请的GPU内存 |\n | cpu_threads | int | 10 | CPU预测时的线程数,在机器核数充足的情况下,该值越大,预测速度越快 |\n | enable_mkldnn | bool | false | 是否使用mkldnn库 |\n | use_tensorrt | bool | false | 是否使用tensorrt库 |\n | precision | str | \"fp32\" | 使用fp32/fp16/uint8精度来预测 |\n | benchmark | bool | true | 预测时是否开启benchmark,开启后会在最后输出配置、模型、耗时等信息。 |\n- 视频识别模型相关\n | 参数名称 | 类型 | 默认参数 | 意义 |\n | -------------- | ------ | --------------------------------------------- | ------------------------------------ |\n | video_dir | string | \"../example_video_dir\" | 存放将要识别的视频的文件夹路径 |", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:260-273" + }, + "3851": { + "file_id": 327, + "content": "This code snippet defines various parameters for video recognition model execution. It specifies GPU ID, requested GPU memory, CPU thread count for faster predictions on machines with sufficient cores, boolean values to enable mkldnn and tensorrt libraries, precision type for predictions (fp32/fp16/uint8), and a flag to start benchmarking during prediction. 
The video recognition model parameters include the path to the folder containing videos to be recognized.", + "type": "comment" + }, + "3852": { + "file_id": 327, + "content": " | rec_model_dir | string | \"\" | 存放导出的预测模型的文件夹路径 |\n | inference_model_name | string | \"ppTSM\" | 预测模型的名称 |\n | num_seg | int | 8 | 视频分段的段数 |\n | seg_len | int | 1 | 视频每段抽取的帧数 |\n | rec_batch_num | int | 1 | 模型预测时的batch size |\n | char_list_file | str | \"../../data/k400/Kinetics-400_label_list.txt\" | 存放所有类别标号和对应名字的文本路径 |\n​\t以example_video_dir下的样例视频`example01.avi`为输入视频为例,最终屏幕上会输出检测结果如下。\n```bash\n[./inference/ppTSM]\n[./deploy/cpp_infer/example_video_dir]\ntotal videos num: 1\n./example_video_dir/example01.avi class: 5 archery score: 0.999556\nI1125 08:10:45.834288 13955 autolog.h:50] ----------------------- Config info -----------------------\nI1125 08:10:45.834458 13955 autolog.h:51] runtime_device: cpu", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:274-289" + }, + "3853": { + "file_id": 327, + "content": "The code is configuring the model directory path, inference model name, number of video segments, length of each segment, batch size for prediction, and the file path containing class labels and names. An example input video is used to demonstrate how the code outputs the detected results on the screen, including video file, classification, and confidence score.", + "type": "comment" + }, + "3854": { + "file_id": 327, + "content": "I1125 08:10:45.834467 13955 autolog.h:52] ir_optim: True\nI1125 08:10:45.834475 13955 autolog.h:53] enable_memory_optim: True\nI1125 08:10:45.834483 13955 autolog.h:54] enable_tensorrt: 0\nI1125 08:10:45.834518 13955 autolog.h:55] enable_mkldnn: False\nI1125 08:10:45.834525 13955 autolog.h:56] cpu_math_library_num_threads: 10\nI1125 08:10:45.834532 13955 autolog.h:57] ----------------------- Data info -----------------------\nI1125 08:10:45.834540 13955 autolog.h:58] batch_size: 1\nI1125 08:10:45.834547 13955 autolog.h:59] input_shape: dynamic\nI1125 08:10:45.834556 13955 autolog.h:60] data_num: 1\nI1125 08:10:45.834564 13955 autolog.h:61] ----------------------- Model info -----------------------\nI1125 08:10:45.834573 13955 autolog.h:62] model_name: rec\nI1125 08:10:45.834579 13955 autolog.h:63] precision: fp32\nI1125 08:10:45.834586 13955 autolog.h:64] ----------------------- Perf info ------------------------\nI1125 08:10:45.834594 13955 autolog.h:65] Total time spent(ms): 2739\nI1125 08:10:45.834602 13955 au", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:290-304" + }, + "3855": { + "file_id": 327, + "content": "This code configures the inference engine with options for optimizing IR, memory optimization, TensorRT and MKLDNN support. It also sets the number of CPU threads, displays data information (batch size, input shape, data count), model name and precision, and logs the total time spent for inference.", + "type": "comment" + }, + "3856": { + "file_id": 327, + "content": "tolog.h:67] preprocess_time(ms): 10.6524, inference_time(ms): 1269.55, postprocess_time(ms): 0.009118\n```\n### 3 FAQ\n1. 编译demo过程中出现以下错误\n ```shell\n make[2]: *** No rule to make target '/usr/lib/x86_64-linux-gn/libcudnn.so', needed by 'ppvideo'. 
Stop.\n make[2]: *** Waiting for unfinished jobs....\n [ 16%] Building CXX object CMakeFiles/ppvideo.dir/src/main.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/preprocess_op.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/postprocess_op.cpp.o\n [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/utility.cpp.o\n [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/video_rec.cpp.o\n CMakeFiles/Makefile2:95: recipe for target 'CMakeFiles/ppvideo.dir/all' failed\n make[1]: *** [CMakeFiles/ppvideo.dir/all] Error 2\n Makefile:83: recipe for target 'all' failed\n make: *** [all] Error 2\n ```\n 可能是`CUDNN_LIB_DIR`设置的不对,导致找不到该目录下的`libcudnn.so`。", + "type": "code", + "location": "/deploy/cpp_infer/readme.md:304-324" + }, + "3857": { + "file_id": 327, + "content": "This code snippet displays the preprocess time, inference time, and postprocess time for a certain task. It shows that the inference time is 1269.55ms and the postprocess time is 0.009118ms. The error message indicates a problem with finding the 'libcudnn.so' library due to an incorrect or missing CUDNN_LIB_DIR setting.", + "type": "comment" + }, + "3858": { + "file_id": 328, + "content": "/deploy/cpp_infer/readme_en.md", + "type": "filepath" + }, + "3859": { + "file_id": 328, + "content": "This section provides Linux setup for deploying PaddleVideo models, offers Windows support, and recommends Docker. It installs OpenCV 3.4.7, sets paths, compiles Video inference code, builds prediction library with simple commands, and defines model parameters/configuration options but may encounter errors due to missing libcudnn or incorrect CUDNN_LIB_DIR setting.", + "type": "summary" + }, + "3860": { + "file_id": 328, + "content": "English | [简体中文](./readme.md)\n# Server-side C++ prediction\nThis chapter introduces the C++ deployment method of the PaddleVideo model. For the python prediction deployment method, please refer to the **Model Reasoning** chapter of the respective model.\nC++ is better than python in terms of performance calculation. Therefore, in most CPU and GPU deployment scenarios, C++ deployment methods are mostly used. This section will introduce how to configure the C++ environment in the Linux (CPU/GPU) environment and complete it.\nPaddleVideo model deployment.\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install [paddledet](git+https://github.com/LDOUBLEV/AutoLog)\n```\n## 1. Prepare the environment\n- For Linux environment, docker is recommended.\n- Windows environment, currently supports compilation based on `Visual Studio 2019 Community` (TODO)\n* This document mainly introduces the PaddleVideo C++ prediction process based on the Linux environment. If yo", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:1-20" + }, + "3861": { + "file_id": 328, + "content": "This section introduces the C++ deployment method of PaddleVideo model, which offers better performance compared to Python. It provides instructions for setting up a Linux environment with docker recommendation and mentions that Windows support is under development. 
Additionally, it requires installing extra dependencies like paddledet using pip.", + "type": "comment" + }, + "3862": { + "file_id": 328, + "content": "u need to perform C++ prediction based on the prediction library under Windows, please refer to [Windows Compilation Tutorial](./docs/windows_vs2019_build.md)(TODO) for the specific compilation method\n* **The purpose of preparing the environment is to get the compiled opencv library and paddle prediction library**.\n### 1.1 Compile opencv library\n* First, you need to download the compressed package compiled from the source code in the Linux environment from the opencv official website, and unzip it into a folder. Take opencv3.4.7 as an example, the download command is as follows:\n ```bash\n cd deploy/cpp_infer\n wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz\n tar -xf 3.4.7.tar.gz\n ```\n After decompression, you can get the decompressed folder of `opencv-3.4.7` in the `deploy/cpp_infer` directory.\n* Install ffmpeg\n Opencv and ffmpeg can read the video normally under linux, otherwise it may encounter the situation that the number of video frames returns to 0 or no video frame can be read", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:20-37" + }, + "3863": { + "file_id": 328, + "content": "This code provides instructions for compiling the OpenCV library and installing FFmpeg to enable normal video reading under Linux. It also mentions a Windows Compilation Tutorial that needs to be completed (TODO).", + "type": "comment" + }, + "3864": { + "file_id": 328, + "content": " Using a relatively simple apt installation, the installation command is as follows:\n ```bash\n apt-get update\n apt install libavformat-dev\n apt install libavcodec-dev\n apt install libswresample-dev\n apt install libswscale-dev\n apt install libavutil-dev\n apt install libsdl1.2-dev\n apt-get install ffmpeg\n ```\n* To prepare to compile opencv, first enter the `opencv-3.4.7` folder, and then set the opencv source path `root_path` and the installation path `install_path`. The execution command is as follows:\n ```bash\n cd opencv-3.4.7\n root_path=$PWD # That is the absolute path of opencv-3.4.7\n install_path=${root_path}/opencv3\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DCMAKE_INSTALL_PREFIX=${install_path} \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DBUILD_SHARED_LIBS=OFF \\\n -DWITH_IPP=OFF \\\n -DBUILD_IPP_IW=OFF \\\n -DWITH_LAPACK=OFF \\\n -DWITH_EIGEN=OFF \\\n -DCMAKE_INSTALL_LIBDIR=lib64 \\\n -DWITH_ZLIB=ON \\\n -DBUILD_ZLIB=ON \\", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:39-76" + }, + "3865": { + "file_id": 328, + "content": "This code installs necessary libraries for compiling OpenCV 3.4.7 on Linux, sets the source and installation paths, removes existing build folder, creates a new one, runs cmake to configure build options and libraries to use, and then proceeds with the compilation process.", + "type": "comment" + }, + "3866": { + "file_id": 328, + "content": " -DWITH_JPEG=ON \\\n -DBUILD_JPEG=ON \\\n -DWITH_PNG=ON \\\n -DBUILD_PNG=ON \\\n -DWITH_TIFF=ON \\\n -DBUILD_TIFF=ON \\\n -DWITH_FFMPEG=ON\n make -j\n make install\n ```\n After the completion of `make install`, opencv header files and library files will be generated in this folder, which will be used to compile the Video inference C++ code later.\n Finally, the installation path `install_path` will be used as the specified path, and a folder of `opencv3` will be obtained. 
The file structure is shown below.\n ```shell\n opencv-3.4.7/\n ├── opencv3/\n │ ├── bin/\n │ ├── include/\n │ ├── lib/\n │ ├── lib64/\n │ └── share/\n ```\n### 1.2 Download or compile Paddle prediction library\nThere are two ways to obtain the Paddle prediction library, which will be described in detail below.\n#### 1.2.1 Download and install directly\n* [Paddle prediction library official website](https://paddleinference.paddlepaddle.org.cn/v2.2/user_guides/download_li", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:77-110" + }, + "3867": { + "file_id": 328, + "content": "Configuring and installing OpenCV library with specified options and building the Video inference C++ code using it.", + "type": "comment" + }, + "3868": { + "file_id": 328, + "content": "b.html) provides different cuda versions of Linux prediction libraries, you can Check and **select the appropriate prediction library version** on the official website (it is recommended to select the prediction library with paddle version>=2.0.1, and the prediction library of 2.2.2 is recommended).\n* Download and get a `paddle_inference.tgz` compressed package, and then unzip it into a folder, the command is as follows (taking the machine environment as gcc8.2 as an example):\n ```bash\n wget https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz\n tar -xf paddle_inference.tgz\n ```\n Eventually, a subfolder of `paddle_inference/` will be generated in the current folder.\n#### 1.2.2 Prediction library source code compilation\n* If you want to get the latest prediction library features, you can clone the latest code from Paddle github and compile the prediction library from source code.\n* You can refer t", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:110-123" + }, + "3869": { + "file_id": 328, + "content": "This code provides instructions for downloading and unzipping the prediction library, or compiling it from source code if you want the latest features. It specifies the appropriate version selection on the official website (paddle version>=2.0.1, 2.2.2 recommended) and the required environment (gcc8.2). The wget command downloads the tgz package, tar extracts it into a subfolder of paddle_inference in the current folder. Alternatively, cloning the latest code from Paddle GitHub and compiling from source can be done for accessing the latest prediction library features.", + "type": "comment" + }, + "3870": { + "file_id": 328, + "content": "o [Paddle prediction library installation and compilation instructions](https://paddleinference.paddlepaddle.org.cn/user_guides/source_compile.html) instructions from github Obtain the Paddle code, and then compile it to generate the latest prediction library. The method of using git to get the code is as follows.\n ```shell\n git clone https://github.com/PaddlePaddle/Paddle.git\n git checkout release/2.2\n ```\n* After entering the Paddle directory, the compilation method is as follows.\n ```shell\n rm -rf build\n mkdir build\n cd build\n cmake .. 
\\\n -DWITH_CONTRIB=OFF \\\n -DWITH_MKL=ON \\\n -DWITH_MKLDNN=ON \\\n -DWITH_TESTING=OFF \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DWITH_INFERENCE_API_TEST=OFF \\\n -DON_INFER=ON \\\n -DWITH_PYTHON=ON\n make -j\n make inference_lib_dist -j4 # 4为编译时使用核数,可根据机器情况自行修改\n ```\n You can refer to [documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/b", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:123-150" + }, + "3871": { + "file_id": 328, + "content": "This code provides the installation and compilation instructions for Paddle prediction library. The steps involve cloning the Paddle repository, checking out a specific release branch, configuring and building the project with CMake, and finally generating the prediction library by making and making inference_lib_dist. This process is done to ensure that users can obtain the latest and most optimized version of the prediction library for their needs.", + "type": "comment" + }, + "3872": { + "file_id": 328, + "content": "uild_and_install_lib_cn.html#congyuanmabianyi) for more introduction of compilation parameter options.\n* After the compilation is complete, you can see the following files and folders are generated under the file `build/paddle_inference_install_dir/`.\n ```\n build/\n └── paddle_inference_install_dir/\n ├── CMakeCache.txt\n ├── paddle/\n ├── third_party/\n └── version.txt\n ```\n Among them, `paddle` is the Paddle library required for C++ prediction, and `version.txt` contains the version information of the current prediction library.\n## 2. Compile and run the prediction demo\n### 2.1 Export the model as an inference model\n* This step is the same as the export prediction model under the python deployment mode. You can refer to the model prediction chapter of the respective model. Several related inference model files exported are used for model prediction. **Taking PP-TSM as an example**, the directory structure of the derived prediction model is as follows.\n ```\n inference/", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:150-173" + }, + "3873": { + "file_id": 328, + "content": "Step 1: The code describes the generation of several files and folders after a successful compilation process. 
These include `CMakeCache.txt`, `paddle/`, `third_party/`, and `version.txt`.\n\nStep 2: Explains that among these, `paddle` is the C++ library required for prediction, while `version.txt` contains version information of the current prediction library.", + "type": "comment" + }, + "3874": { + "file_id": 328, + "content": " └── ppTSM/\n ├── ppTSM.pdiparams\n ├── ppTSM.pdiparamsinfo\n └── ppTSM.pdmodel\n ```\n### 2.2 Compile PaddleVideo C++ prediction demo\n* Enter the `deploy/cpp_infer` directory and execute the following compile command\n ```shell\n bash tools/build.sh\n ```\n The addresses of the Paddle C++ prediction library, opencv and other dependent libraries in `tools/build.sh` need to be replaced with the actual addresses on your own machine.\n* Specifically, you need to modify the environment path in `tools/build.sh`, the relevant content is as follows:\n ```shell\n OPENCV_DIR=your_opencv_dir\n LIB_DIR=your_paddle_inference_dir\n CUDA_LIB_DIR=/usr/local/cuda/lib64\n CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/\n ```\n The above parameters are as follows (the following path users can modify according to their own machine conditions)\n `OPENCV_DIR` is the address where opencv is compiled and installed\n `LIB_DIR` is the download (`paddle_inference` folder) or the generated Paddle prediction library address (`build/paddle_inference_install_dir` folder)", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:174-203" + }, + "3875": { + "file_id": 328, + "content": "This code snippet provides instructions for compiling the PaddleVideo C++ prediction demo. First, navigate to the `deploy/cpp_infer` directory. Then, execute the compile command `bash tools/build.sh`. Modify environment paths in `tools/build.sh`, such as OPENCV_DIR, LIB_DIR, CUDA_LIB_DIR, and CUDNN_LIB_DIR to match your system's configuration.", + "type": "comment" + }, + "3876": { + "file_id": 328, + "content": " `CUDA_LIB_DIR` is the address of the cuda library file, which is `/usr/local/cuda/lib64` in docker\n `CUDNN_LIB_DIR` is the cudnn library file address, which is `/usr/lib/x86_64-linux-gnu/` in docker.\n **If you want to enable TensorRT acceleration during prediction, you need to modify the code at `tools/build.sh`3**\n 1. Set `DWITH_GPU=ON`\n 2. Set `DWITH_TENSORRT=ON`\n 3. Set `TENSORRT_DIR=/path/to/TensorRT-x.x.x.x`\n **The above paths are all absolute paths, do not use relative paths**\n* After the compilation is complete, an executable file named `ppvideo` will be generated in the `cpp_infer/build` folder.\n### 2.3 Run PaddleVideo C++ prediction demo\nOperation mode:\n```bash\n./build/ppvideo [--param1] [--param2] [...]\n```\nAmong them, `mode` is a required parameter, which means the selected function, and the value range is ['rec'], which means **video recognition** (more functions will be added in succession).\n##### 1. Call video recognition:\n```bash\n# run PP-TSM inference\n./build/ppvideo rec \\", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:204-231" + }, + "3877": { + "file_id": 328, + "content": "To enable TensorRT acceleration during prediction, modify the code in `tools/build.sh` by setting `DWITH_GPU=ON`, `DWITH_TENSORRT=ON`, and providing the absolute path to TensorRT using `TENSORRT_DIR`. 
This allows for GPU-accelerated predictions with PaddleVideo's C++ implementation.", + "type": "comment" + }, + "3878": { + "file_id": 328, + "content": "--rec_model_dir=../../inference/ppTSM \\\n--inference_model_name=ppTSM \\\n--video_dir=./example_video_dir \\\n--num_seg=8 \\\n--seg_len=1\n# run PP-TSN inference\n./build/ppvideo rec \\\n--rec_model_dir=../../inference/ppTSN \\\n--inference_model_name=ppTSN \\\n--video_dir=./example_video_dir \\\n--num_seg=25 \\\n--seg_len=1\n```\nMore parameters are as follows:\n- General parameters\n | Parameter name | Type | Default parameter | Meaning |\n | ------------- | ---- | --------------- | ------------------------------------------------------------ |\n | use_gpu | bool | false | Whether to use GPU |\n | gpu_id | int | 0 | GPU id, valid when using GPU |\n | gpu_mem | int | 4000 | GPU memory requested |\n | cpu_threads | int | 10 | The number of threads for CPU prediction. When the number of machine cores is sufficient, the larger the value, the faster the prediction speed |\n | enable_mkldnn | bool | false | Whether to use mkldnn library |\n | use_tensorrt | bool | false | Whether to use the tensorrt library |\n | precision | str | \"fp32\" | Use fp32/fp16/uint8 precision to predict |", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:232-258" + }, + "3879": { + "file_id": 328, + "content": "This code sets the model directory, inference model name, video directory, number of segments, and segment length for PaddleVideo's PP-TSM and PP-TSN inference. Additional parameters include use_gpu, gpu_id, gpu_mem, cpu_threads, enable_mkldnn, use_tensorrt, and precision for customizing the inference process.", + "type": "comment" + }, + "3880": { + "file_id": 328, + "content": " | benchmark | bool | true | Whether to enable benchmark during prediction, after enabling it, the configuration, model, time-consuming and other information will be output at the end. |\n- Video recognition model related\n | Parameter name | Type | Default parameter | Meaning |\n | -------------- | ------ | --------------------------------------------- | ------------------------------------ |\n | video_dir | string | \"../example_video_dir\" | The path of the folder where the video to be recognized is stored |\n | rec_model_dir | string | \"\" | The folder path where the exported prediction model is stored |\n | inference_model_name | string | \"ppTSM\" | The name of the model used in the prediction |\n | num_seg | int | 8 | Number of video segments |\n | seg_len | int | 1 | The number of frames extracted in each segment of the video |\n | rec_batch_num | int | 1 | Batch size during model prediction |\n | char_list_file | str | \"../../data/k400/Kinetics-400_label_list.txt\" | The text path for storing all category labels and corresponding names |", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:259-271" + }, + "3881": { + "file_id": 328, + "content": "This code provides configuration parameters for video recognition models. The `video_dir` specifies the folder path containing the video to be recognized, while `rec_model_dir` points to the exported prediction model's location. The `inference_model_name` refers to the name of the model used in prediction. `num_seg` and `seg_len` determine the number of video segments and frames per segment respectively. 
`rec_batch_num` indicates the batch size during model prediction, and `char_list_file` stores category labels and names.", + "type": "comment" + }, + "3882": { + "file_id": 328, + "content": "​\tTake the sample video `example01.avi` under example_video_dir as the input video as an example, the final \tscreen will output the detection results as follows.\n```bash\n[./inference/ppTSM]\n[./deploy/cpp_infer/example_video_dir]\ntotal videos num: 1\n./example_video_dir/example01.avi class: 5 archery score: 0.999556\nI1125 08:10:45.834288 13955 autolog.h:50] ----------------------- Config info -----------------------\nI1125 08:10:45.834458 13955 autolog.h:51] runtime_device: cpu\nI1125 08:10:45.834467 13955 autolog.h:52] ir_optim: True\nI1125 08:10:45.834475 13955 autolog.h:53] enable_memory_optim: True\nI1125 08:10:45.834483 13955 autolog.h:54] enable_tensorrt: 0\nI1125 08:10:45.834518 13955 autolog.h:55] enable_mkldnn: False\nI1125 08:10:45.834525 13955 autolog.h:56] cpu_math_library_num_threads: 10\nI1125 08:10:45.834532 13955 autolog.h:57] ----------------------- Data info -----------------------\nI1125 08:10:45.834540 13955 autolog.h:58] batch_size: 1\nI1125 08:10:45.834547 13955 autolog.h:59] input_shape: dynamic", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:273-289" + }, + "3883": { + "file_id": 328, + "content": "This code snippet demonstrates the output of the inference process for a sample video. It shows the input video, its class and score. Additionally, it provides information about the runtime device, IR optimization, memory optimization, whether TensorRT is enabled or not, the number of CPU math library threads, and data information such as batch size and input shape.", + "type": "comment" + }, + "3884": { + "file_id": 328, + "content": "I1125 08:10:45.834556 13955 autolog.h:60] data_num: 1\nI1125 08:10:45.834564 13955 autolog.h:61] ----------------------- Model info -----------------------\nI1125 08:10:45.834573 13955 autolog.h:62] model_name: rec\nI1125 08:10:45.834579 13955 autolog.h:63] precision: fp32\nI1125 08:10:45.834586 13955 autolog.h:64] ----------------------- Perf info ------------------------\nI1125 08:10:45.834594 13955 autolog.h:65] Total time spent(ms): 2739\nI1125 08:10:45.834602 13955 autolog.h:67] preprocess_time(ms): 10.6524, inference_time(ms): 1269.55, postprocess_time(ms): 0.009118\n```\n### 3 FAQ\n1. The following error occurred during the compilation of the demo\n ```shell\n make[2]: *** No rule to make target '/usr/lib/x86_64-linux-gn/libcudnn.so', needed by 'ppvideo'. Stop.\n make[2]: *** Waiting for unfinished jobs....\n [ 16%] Building CXX object CMakeFiles/ppvideo.dir/src/main.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/preprocess_op.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/postprocess_op.cpp.o", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:290-308" + }, + "3885": { + "file_id": 328, + "content": "The code is displaying information about the model used for inference. It mentions the model name, precision type, and total time spent on inference. Additionally, it provides a breakdown of preprocessing, inference, and post-processing times. 
The error message indicates that the CUDA Deep Neural Network library (libcudnn) is missing or not found during compilation.", + "type": "comment" + }, + "3886": { + "file_id": 328, + "content": " [83%] Building CXX object CMakeFiles/ppvideo.dir/src/utility.cpp.o\n [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/video_rec.cpp.o\n CMakeFiles/Makefile2:95: recipe for target 'CMakeFiles/ppvideo.dir/all' failed\n make[1]: *** [CMakeFiles/ppvideo.dir/all] Error 2\n Makefile:83: recipe for target 'all' failed\n make: *** [all] Error 2\n ````\n It may be that `CUDNN_LIB_DIR` is set incorrectly, resulting in that `libcudnn.so` in this directory cannot be found.", + "type": "code", + "location": "/deploy/cpp_infer/readme_en.md:309-316" + }, + "3887": { + "file_id": 328, + "content": "Error: CMakeFiles/ppvideo.dir/all and all target failed with Error 2 due to missing libcudnn.so, possibly caused by incorrect CUDNN_LIB_DIR setting.", + "type": "comment" + }, + "3888": { + "file_id": 329, + "content": "/deploy/cpp_infer/src/main.cpp", + "type": "filepath" + }, + "3889": { + "file_id": 329, + "content": "This code uses OpenCV and other libraries, processes video frames in batches with PaddleVideo's Recognition class, enables benchmarking if set, and handles main function execution and program termination.", + "type": "summary" + }, + "3890": { + "file_id": 329, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \"glog/logging.h\"\n#include \"omp.h\"\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \"auto_log/autolog.h\"", + "type": "code", + "location": "/deploy/cpp_infer/src/main.cpp:1-35" + }, + "3891": { + "file_id": 329, + "content": "This code file contains copyright information, license details, and includes necessary header files for OpenCV, Google Logging, GFlags, and other utilities. It also includes the header file for video_rec and utility functions. 
It belongs to the C++ inference deployment (deploy/cpp_infer) of PaddleVideo.", + "type": "comment" + }, + "3892": { + "file_id": 329, + "content": "// general parameters\nDEFINE_bool(use_gpu, false, \"Infering with GPU or CPU.\");\nDEFINE_int32(gpu_id, 0, \"Device id of GPU to execute.\");\nDEFINE_int32(gpu_mem, 4000, \"GPU id when infering with GPU.\");\nDEFINE_int32(cpu_threads, 10, \"Num of threads with CPU.\");\nDEFINE_bool(enable_mkldnn, false, \"Whether use mkldnn with CPU.\");\nDEFINE_bool(use_tensorrt, false, \"Whether use tensorrt.\");\nDEFINE_string(precision, \"fp32\", \"Precision be one of fp32/fp16/int8.\");\nDEFINE_bool(benchmark, true, \"Whether to log and report benchmark information during inference.\");\n// video recognition related\nDEFINE_string(video_dir, \"\", \"Dir of input video(s).\");\nDEFINE_string(rec_model_dir, \"../example_video_dir\", \"Path of video rec inference model.\");\nDEFINE_string(inference_model_name, \"ppTSM\", \"The name of the model used in the prediction.\");\nDEFINE_int32(num_seg, 8, \"number of frames input to model, which are extracted from a video.\");\nDEFINE_int32(seg_len, 1, \"number of frames from a segment.\");\nDEFINE_int32(rec_batch_num, 1, \"rec_batch_num.\");", + "type": "code", + "location": "/deploy/cpp_infer/src/main.cpp:37-54" + }, + "3893": { + "file_id": 329, + "content": "This code defines the command-line parameters for the inference process. The use_gpu flag determines whether GPU or CPU is used, gpu_id specifies the device id of the GPU, gpu_mem sets the amount of GPU memory to request, cpu_threads indicates the number of threads for CPU prediction, enable_mkldnn enables MKL-DNN for CPU operations, use_tensorrt enables TensorRT, precision selects the desired precision format (fp32/fp16/int8), and benchmark enables timing reports. The video recognition parameters cover the input video directory, model path, model name, the number of segments and frames per segment, and the batch size.", + "type": "comment" + }, + "3894": { + "file_id": 329, + "content": "DEFINE_string(char_list_file, \"../../data/k400/Kinetics-400_label_list.txt\", \"Path of dictionary.\");\nusing namespace std;\nusing namespace cv;\nusing namespace PaddleVideo;\nstatic bool PathExists(const std::string& path)\n{\n#ifdef _WIN32\n struct _stat buffer;\n return (_stat(path.c_str(), &buffer) == 0);\n#else\n struct stat buffer;\n return (stat(path.c_str(), &buffer) == 0);\n#endif // !_WIN32\n}\nint main_rec(std::vector &cv_all_video_names)\n{\n std::vector time_info = {0, 0, 0}; // Statement time statistics vector\n VideoRecognizer rec(FLAGS_rec_model_dir, FLAGS_inference_model_name, FLAGS_use_gpu, FLAGS_num_seg,\n FLAGS_rec_batch_num, FLAGS_gpu_id,\n FLAGS_gpu_mem, FLAGS_cpu_threads,\n FLAGS_enable_mkldnn, FLAGS_char_list_file,\n FLAGS_use_tensorrt, FLAGS_precision); // Instantiate a video recognition object\n int batch_num = FLAGS_rec_batch_num;\n for (int i = 0, n = cv_all_video_names.size(); i < n; i += batch_num) // Process each video", + "type": "code", + "location": "/deploy/cpp_infer/src/main.cpp:55-85" + }, + "3895": { + "file_id": 329, + "content": "Initializing a video recognition object and processing each video in batches.", + "type": "comment" + }, + "3896": { + "file_id": 329, + "content": " {\n int start_idx = i;\n int end_idx = min(i + batch_num, n);\n std::vector > frames_batch;\n for (int j = start_idx; j < end_idx; ++j)\n {\n std::vector frames = Utility::SampleFramesFromVideo(cv_all_video_names[i], FLAGS_num_seg, FLAGS_seg_len);\n frames_batch.emplace_back(frames);\n }\n 
std::vector rec_times; // Initialization time consumption statistics\n // Take the read several video frames and send them to the run method of the recognition class to predict\n rec.Run(std::vector(cv_all_video_names.begin() + start_idx, cv_all_video_names.begin() + end_idx), frames_batch, &rec_times);\n time_info[0] += rec_times[0];\n time_info[1] += rec_times[1];\n time_info[2] += rec_times[2];\n }\n if (FLAGS_benchmark)\n {\n AutoLogger autolog(\"rec\",\n FLAGS_use_gpu,\n FLAGS_use_tensorrt,\n FLAGS_enable_mkldnn,", + "type": "code", + "location": "/deploy/cpp_infer/src/main.cpp:86-109" + }, + "3897": { + "file_id": 329, + "content": "This code is processing a batch of video frames using PaddleVideo's Recognition class. It initializes time consumption statistics, then runs the recognition method on each frame within the specified batch and stores the results in `time_info`. Additionally, it enables benchmarking if FLAGS_benchmark flag is set.", + "type": "comment" + }, + "3898": { + "file_id": 329, + "content": " FLAGS_cpu_threads,\n FLAGS_rec_batch_num,\n \"dynamic\",\n FLAGS_precision,\n time_info,\n cv_all_video_names.size()); // Generate detailed information on the run\n autolog.report(); // Print running details\n }\n return 0;\n}\nvoid check_params(char* mode)\n{\n if (strcmp(mode, \"rec\") == 0)\n {\n std::cout << \"[\" << FLAGS_rec_model_dir << \"]\" << std::endl;\n std::cout << \"[\" << FLAGS_video_dir << \"]\" << std::endl;\n if (FLAGS_rec_model_dir.empty() || FLAGS_video_dir.empty())\n {\n std::cout << \"Usage[rec]: ./ppvideo --rec_model_dir=/PATH/TO/REC_INFERENCE_MODEL/ \"\n << \"--video_dir=/PATH/TO/INPUT/VIDEO/\" << std::endl;\n exit(1);\n }\n }\n if (FLAGS_precision != \"fp32\" && FLAGS_precision != \"fp16\" && FLAGS_precision != \"int8\")\n {\n cout << \"precison should be 'fp32'(default), 'fp16' or 'int8'. \" << endl;", + "type": "code", + "location": "/deploy/cpp_infer/src/main.cpp:110-138" + }, + "3899": { + "file_id": 329, + "content": "This code segment is checking the parameters for running the video inference. If it's in recording mode, it ensures that both rec_model_dir and video_dir are not empty. It also checks if the precision specified (fp32, fp16, or int8) is valid. If any error is found, it displays an appropriate usage message and exits with an error code.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/39.json b/docs/data/39.json new file mode 100644 index 000000000..4ee87b8a6 --- /dev/null +++ b/docs/data/39.json @@ -0,0 +1,543 @@ +{ + "3900": { + "file_id": 329, + "content": " exit(1);\n }\n}\nint main(int argc, char **argv)\n{\n if (argc <= 1 || (strcmp(argv[1], \"rec\") != 0)) //Get user input and check\n {\n std::cout << \"Please choose one mode of [rec] !\" << std::endl;\n return -1;\n }\n std::cout << \"mode: \" << argv[1] << endl; // Type of inference task required for output\n // Parsing command-line\n google::ParseCommandLineFlags(&argc, &argv, true);\n check_params(argv[1]);\n if (!PathExists(FLAGS_video_dir)) // Determine whether the directory where the video exists\n {\n std::cerr << \"[ERROR] video path not exist! 
video_dir: \" << FLAGS_video_dir << endl;\n exit(1);\n }\n std::vector cv_all_video_names; // Store all video paths\n cv::glob(FLAGS_video_dir, cv_all_video_names); // Search all videos under FLAGS_video_dir, save in cv_all_video_names\n std::cout << \"total videos num: \" << cv_all_video_names.size() << endl; // 输出搜索到的视频个数\n if (strcmp(argv[1], \"rec\") == 0)\n {\n return main_rec(cv_all_video_names); // Output the number of videos searched", + "type": "code", + "location": "/deploy/cpp_infer/src/main.cpp:139-170" + }, + "3901": { + "file_id": 329, + "content": "The code checks the user input and ensures the correct mode (\"rec\") is chosen. If not, it outputs an error message and returns -1. It also validates if the video directory exists and displays the total number of videos found. Finally, it calls the main_rec function for recording mode.", + "type": "comment" + }, + "3902": { + "file_id": 329, + "content": " }\n return 0;\n}", + "type": "code", + "location": "/deploy/cpp_infer/src/main.cpp:171-173" + }, + "3903": { + "file_id": 329, + "content": "The code snippet represents the end of the main function where a closing curly brace is followed by a return statement, indicating successful execution and termination of the program.", + "type": "comment" + }, + "3904": { + "file_id": 330, + "content": "/deploy/cpp_infer/src/postprocess_op.cpp", + "type": "filepath" + }, + "3905": { + "file_id": 330, + "content": "The given code implements the Softmax function in-place, calculating exponential elements and normalizing them for PaddleVideo library tasks. The class defines a Run method that performs softmax normalization on vector elements by iteratively computing exponential values and accumulating them for normalization.", + "type": "summary" + }, + "3906": { + "file_id": 330, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \nnamespace PaddleVideo\n{\n void Softmax::Inplace_Run(const std::vector::iterator &_begin, const std::vector::iterator &_end)\n {\n const float max_value = *std::max_element(_begin, _end);\n float denominator = 0.0f;\n for (auto it = _begin; it != _end; ++it)\n {\n *it = std::exp((*it) - max_value);\n denominator += (*it);", + "type": "code", + "location": "/deploy/cpp_infer/src/postprocess_op.cpp:1-26" + }, + "3907": { + "file_id": 330, + "content": "This code is implementing the Softmax function in-place, calculating the exponential of elements and normalizing them by summing up all the elements. 
This is part of the PaddleVideo library for video analysis tasks.", + "type": "comment" + }, + "3908": { + "file_id": 330, + "content": " }\n for (auto it = _begin; it != _end; ++it)\n {\n *it /= denominator;\n }\n }\n std::vector Softmax::Run(const std::vector::iterator &_begin, const std::vector::iterator &_end)\n {\n std::vector prob(_begin, _end);\n const float max_value = *std::max_element(prob.begin(), prob.end());\n float denominator = 0.0f;\n for (auto it = _begin, it_p = prob.begin(); it != _end; ++it, ++it_p)\n {\n (*it_p) = std::exp((*it) - max_value);\n denominator += (*it_p);\n }\n for (auto it = prob.begin(); it != prob.end(); ++it)\n {\n (*it) /= denominator;\n }\n return prob;\n }\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/src/postprocess_op.cpp:27-50" + }, + "3909": { + "file_id": 330, + "content": "This code defines a Softmax class with a Run method that performs softmax normalization on a given range of vector elements. It first calculates the maximum value in the range, then iteratively computes the exponential of each element minus the maximum value and accumulates them into a denominator for normalization. Finally, it returns the normalized probability vector.", + "type": "comment" + }, + "3910": { + "file_id": 331, + "content": "/deploy/cpp_infer/src/preprocess_op.cpp", + "type": "filepath" + }, + "3911": { + "file_id": 331, + "content": "This code normalizes, scales, and converts images for inference using a ten-crop technique within the PaddleVideo library's implementation of pre-processing operations.", + "type": "summary" + }, + "3912": { + "file_id": 331, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \"paddle_api.h\"\n#include \"paddle_inference_api.h\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \nnamespace PaddleVideo\n{\n void Permute::Run(const cv::Mat *im, float *data)", + "type": "code", + "location": "/deploy/cpp_infer/src/preprocess_op.cpp:1-36" + }, + "3913": { + "file_id": 331, + "content": "This code includes necessary headers for OpenCV and Paddle API libraries, defines the Permute class which runs a permutation operation on input images and outputs data in float format.", + "type": "comment" + }, + "3914": { + "file_id": 331, + "content": " {\n int rh = im->rows;\n int rw = im->cols;\n int rc = im->channels();\n for (int i = 0; i < rc; ++i)\n {\n // Extract the i-th channel of im and write it into the array with (data + i * rh * rw) as the starting address\n cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), rc - 1 - i);\n }\n }\n void Normalize::Run(cv::Mat *im, const std::vector &mean,\n const std::vector &scale, const bool is_scale)\n {\n double e = 1.0;\n if (is_scale)\n {\n e /= 255.0;\n }\n (*im).convertTo(*im, CV_32FC3, e);\n 
std::vector bgr_channels(3);\n cv::split(*im, bgr_channels);\n for (auto i = 0; i < bgr_channels.size(); i++)\n {\n bgr_channels[i].convertTo(bgr_channels[i], CV_32FC1, 1.0 / scale[i], (0.0 - mean[i]) / scale[i]);\n }\n cv::merge(bgr_channels, *im);\n }\n void Scale::Run(const cv::Mat &img, cv::Mat &resize_img, bool use_tensorrt, const int &short_size)", + "type": "code", + "location": "/deploy/cpp_infer/src/preprocess_op.cpp:37-66" + }, + "3915": { + "file_id": 331, + "content": "This code block is for preprocessing images before inference. It extracts each channel from the image and performs normalization on them separately, then scales the values and merges them back into a single image. The normalization is done by subtracting the mean and dividing by the scale factor for each channel. If scaling is required, it also converts the image data type to float. Afterwards, it resizes the image if necessary.", + "type": "comment" + }, + "3916": { + "file_id": 331, + "content": " {\n int h = img.rows;\n int w = img.cols;\n if ((w <= h && w == short_size) || (h <= w && h == short_size))\n {\n img.copyTo(resize_img);\n }\n else\n {\n int oh, ow;\n if (w < h)\n {\n ow = short_size;\n oh = h * ow / w;\n }\n else\n {\n oh = short_size;\n ow = w * oh / h;\n }\n cv::resize(img, resize_img, cv::Size(ow, oh), 0.0f, 0.0f, cv::INTER_LINEAR);\n }\n }\n void CenterCrop::Run(const cv::Mat &img, cv::Mat &crop_img, bool use_tensorrt, const int &target_size)\n {\n int h = img.rows;\n int w = img.cols;\n int crop_h = target_size;\n int crop_w = target_size;\n if (w < crop_w || h < crop_h)\n {\n printf(\"[Error] image width (%d) and height (%d) should be larger than crop size (%d)\",\n w, h, target_size);\n }\n else\n {\n int x1 = (w - crop_w) / 2;", + "type": "code", + "location": "/deploy/cpp_infer/src/preprocess_op.cpp:67-104" + }, + "3917": { + "file_id": 331, + "content": "This function performs image resizing and cropping operations based on the input image size and target crop size. If the image size is larger than or equal to the target crop size, it resizes the image to fit within the specified bounds and crops the center of the resized image with dimensions equal to the target crop size. If the image size is smaller than the target crop size, it prints an error message stating that the image width and height should be larger than the crop size.", + "type": "comment" + }, + "3918": { + "file_id": 331, + "content": " int y1 = (h - crop_h) / 2;\n crop_img = img(cv::Rect(x1, y1, crop_w, crop_h));\n }\n }\n void TenCrop::Run(const cv::Mat &img, std::vector &crop_imgs, const int &begin_index, bool use_tensorrt, const int &target_size)\n {\n int h = img.rows;\n int w = img.cols;\n int crop_h = target_size;\n int crop_w = target_size;\n int w_step = (w - crop_w) / 4;\n int h_step = (h - crop_h) / 4;\n pairoffsets[5] =\n {\n {0, 0},\n {4 * w_step, 0},\n {0, 4 * h_step},\n {4 * w_step, 4 * h_step},\n {2 * w_step, 2 * h_step}\n };\n for (int i = 0; i < 5; ++i)\n {\n const int &j = i * 2;\n const int &x1 = offsets[i].first;\n const int &y1 = offsets[i].second;\n crop_imgs[begin_index + j] = img(cv::Rect(x1, y1, crop_w, crop_h)); // cropped\n cv::flip(img(cv::Rect(x1, y1, crop_w, crop_h)), crop_imgs[begin_index + j + 1], 0); // cropped", + "type": "code", + "location": "/deploy/cpp_infer/src/preprocess_op.cpp:105-132" + }, + "3919": { + "file_id": 331, + "content": "This code applies a ten-crop technique to input image by extracting 5 pairs of horizontally and vertically cropped images from the original one. 
These cropped images are stored in a vector for further processing.", + "type": "comment" + }, + "3920": { + "file_id": 331, + "content": " }\n }\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/src/preprocess_op.cpp:133-135" + }, + "3921": { + "file_id": 331, + "content": "These lines close the TenCrop::Run function and the PaddleVideo namespace in preprocess_op.cpp. The namespace groups the pre-processing operators (scale, crop, normalize, permute) that prepare video frames before they are sent to the inference model.", + "type": "comment" + }, + "3922": { + "file_id": 332, + "content": "/deploy/cpp_infer/src/utility.cpp", + "type": "filepath" + }, + "3923": { + "file_id": 332, + "content": "This file collects PaddleVideo utility functions: ReadDict reads a label dictionary file, GetAllFiles lists the files in a directory, GetRotateCropImage crops and perspective-corrects an image region from four corner points, and SampleFramesFromVideo captures frames from a video at computed indices and releases the video object afterwards.", + "type": "summary" + }, + "3924": { + "file_id": 332, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \n#include \n#include \n#include \n#include \n#include \n#include \nnamespace PaddleVideo\n{\n std::vector Utility::ReadDict(const std::string &path)\n {\n std::ifstream in(path);\n std::string line;\n std::vector m_vec;\n if (in)\n {\n while (getline(in, line))", + "type": "code", + "location": "/deploy/cpp_infer/src/utility.cpp:1-33" + }, + "3925": { + "file_id": 332, + "content": "This code is part of the PaddleVideo library, specifically in the cpp_infer/src/utility.cpp file. It includes necessary headers for utility functions, and defines the ReadDict function within the PaddleVideo namespace. The function reads a dictionary file located at the given path, and stores each line into a vector of strings named m_vec.", + "type": "comment" + }, + "3926": { + "file_id": 332, + "content": " {\n m_vec.push_back(line);\n }\n }\n else\n {\n std::cout << \"no such label file: \" << path << \", exit the program...\"\n << std::endl;\n exit(1);\n }\n return m_vec; // Use fstream to read the category list and return with vector\n }\n void Utility::GetAllFiles(const char *dir_name, std::vector &all_inputs)\n {\n if (NULL == dir_name)\n {\n std::cout << \" dir_name is null ! 
\" << std::endl;\n return;\n }\n struct stat s;\n lstat(dir_name, &s);\n if (!S_ISDIR(s.st_mode))\n {\n std::cout << \"dir_name is not a valid directory !\" << std::endl;\n all_inputs.push_back(dir_name);\n return;\n }\n else\n {\n struct dirent *filename; // return value for readdir()\n DIR *dir; // return value for opendir()\n dir = opendir(dir_name);\n if (NULL == dir)", + "type": "code", + "location": "/deploy/cpp_infer/src/utility.cpp:34-67" + }, + "3927": { + "file_id": 332, + "content": "The code reads a label file and returns its contents as a vector. It also retrieves all files in a directory, adding them to a vector if the directory is valid.", + "type": "comment" + }, + "3928": { + "file_id": 332, + "content": " {\n std::cout << \"Can not open dir \" << dir_name << std::endl;\n return;\n }\n std::cout << \"Successfully opened the dir !\" << std::endl;\n while ((filename = readdir(dir)) != NULL)\n {\n if (strcmp(filename->d_name, \".\") == 0 ||\n strcmp(filename->d_name, \"..\") == 0)\n continue;\n // img_dir + std::string(\"/\") + all_inputs[0];\n all_inputs.push_back(dir_name + std::string(\"/\") +\n std::string(filename->d_name));\n }\n }\n }\n cv::Mat Utility::GetRotateCropImage(const cv::Mat &srcimage, std::vector> box)\n {\n cv::Mat image;\n srcimage.copyTo(image);\n std::vector> points = box;\n int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]};\n int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]};\n int left = int(*std::min_element(x_collect, x_collect + 4));", + "type": "code", + "location": "/deploy/cpp_infer/src/utility.cpp:68-93" + }, + "3929": { + "file_id": 332, + "content": "The code snippet opens a directory, reads all files except \".\" and \"..\", and adds the file paths to a vector. The GetRotateCropImage function takes an image and a bounding box as input, copies the source image, and stores x and y coordinates of the bounding box in separate arrays.", + "type": "comment" + }, + "3930": { + "file_id": 332, + "content": " int right = int(*std::max_element(x_collect, x_collect + 4));\n int top = int(*std::min_element(y_collect, y_collect + 4));\n int bottom = int(*std::max_element(y_collect, y_collect + 4));\n cv::Mat img_crop;\n image(cv::Rect(left, top, right - left, bottom - top)).copyTo(img_crop);\n for (int i = 0; i < points.size(); i++)\n {\n points[i][0] -= left;\n points[i][1] -= top;\n }\n int img_crop_width = int(sqrt(pow(points[0][0] - points[1][0], 2) +\n pow(points[0][1] - points[1][1], 2)));\n int img_crop_height = int(sqrt(pow(points[0][0] - points[3][0], 2) +\n pow(points[0][1] - points[3][1], 2)));\n cv::Point2f pts_std[4];\n pts_std[0] = cv::Point2f(0., 0.);\n pts_std[1] = cv::Point2f(img_crop_width, 0.);\n pts_std[2] = cv::Point2f(img_crop_width, img_crop_height);\n pts_std[3] = cv::Point2f(0.f, img_crop_height);\n cv::Point2f pointsf[4];", + "type": "code", + "location": "/deploy/cpp_infer/src/utility.cpp:94-118" + }, + "3931": { + "file_id": 332, + "content": "This code crops an image based on the x and y coordinates of its bounding box, then adjusts the input points accordingly. 
It calculates the width and height of the cropped image using the Euclidean distance formula, and converts the original input points to a standard format for further processing.", + "type": "comment" + }, + "3932": { + "file_id": 332, + "content": " pointsf[0] = cv::Point2f(points[0][0], points[0][1]);\n pointsf[1] = cv::Point2f(points[1][0], points[1][1]);\n pointsf[2] = cv::Point2f(points[2][0], points[2][1]);\n pointsf[3] = cv::Point2f(points[3][0], points[3][1]);\n cv::Mat M = cv::getPerspectiveTransform(pointsf, pts_std);\n cv::Mat dst_img;\n cv::warpPerspective(img_crop, dst_img, M,\n cv::Size(img_crop_width, img_crop_height),\n cv::BORDER_REPLICATE);\n if (float(dst_img.rows) >= float(dst_img.cols) * 1.5)\n {\n cv::Mat srcCopy = cv::Mat(dst_img.rows, dst_img.cols, dst_img.depth());\n cv::transpose(dst_img, srcCopy);\n cv::flip(srcCopy, srcCopy, 0);\n return srcCopy;\n }\n else\n {\n return dst_img;\n }\n }\n std::vector Utility::SampleFramesFromVideo(const std::string &VideoPath, const int &num_seg, const int &seg_len)\n {\n cv::VideoCapture capture(VideoPath); // Create a video object", + "type": "code", + "location": "/deploy/cpp_infer/src/utility.cpp:119-146" + }, + "3933": { + "file_id": 332, + "content": "This code initializes four points using cv::Point2f, gets a perspective transform matrix M using getPerspectiveTransform, warps the image using warpPerspective, checks if the resized image's rows exceed 1.5 times its columns, and if so, transposes and flips the image before returning it; otherwise, returns the resized image directly. This is part of a function that samples frames from a video file.", + "type": "comment" + }, + "3934": { + "file_id": 332, + "content": " if (!capture.isOpened())\n {\n printf(\"[Error] video cannot be opened, please check the video [%s]\\n\", VideoPath.c_str());\n capture.release();\n exit(1);\n }\n int frames_len = capture.get(cv::CAP_PROP_FRAME_COUNT); // Get the total number of video frames\n int average_dur = int(frames_len / num_seg);\n std::vector frames_idx;\n for (int i = 0; i < num_seg; ++i)\n {\n int idx = 0;\n if (average_dur >= seg_len)\n {\n idx = (average_dur - 1) / 2;\n idx += i * average_dur;\n }\n else if (average_dur >= 1)\n {\n idx += i * average_dur;\n }\n else\n {\n idx = i;\n }\n for (int j = idx; j < idx + seg_len; ++j)\n {\n frames_idx.emplace_back(j % frames_len);\n }\n }\n std::vector sampled_frames;\n cv::Mat frame; // Create an object for storing sampled frames", + "type": "code", + "location": "/deploy/cpp_infer/src/utility.cpp:147-181" + }, + "3935": { + "file_id": 332, + "content": "This code snippet checks if the video can be opened and exits if it cannot. 
It then calculates the number of frames in the video, determines the frame indices to sample for each segment based on the length of the segment and average duration between frames, and stores the sampled frames in a vector.", + "type": "comment" + }, + "3936": { + "file_id": 332, + "content": " for (int i = 0; i < num_seg; ++i)\n {\n const int &frame_idx = frames_idx[i];\n capture.set(cv::CAP_PROP_POS_FRAMES, frame_idx); // Set to frame_idx frame\n capture >> frame;\n sampled_frames.push_back(frame);\n }\n capture.release(); // Release the video object\n return sampled_frames;\n }\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/src/utility.cpp:182-192" + }, + "3937": { + "file_id": 332, + "content": "This function captures frames from a video at specific indices, stores them in sampled_frames vector, and releases the video object after capture.", + "type": "comment" + }, + "3938": { + "file_id": 333, + "content": "/deploy/cpp_infer/src/video_rec.cpp", + "type": "filepath" + }, + "3939": { + "file_id": 333, + "content": "The code processes video frames, performs inference using an AI model and measures processing times. It preprocesses data in batches and utilizes TensorRT with GPU optimizations and MKLDNN support for efficiency.", + "type": "summary" + }, + "3940": { + "file_id": 333, + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \nnamespace PaddleVideo\n{\n void VideoRecognizer::Run(const std::vector &frames_batch_path, const std::vector > &frames_batch, std::vector *times)\n {\n // Copy parameters to the function\n int real_batch_num = frames_batch.size();\n std::vector srcframes(real_batch_num * this->num_seg, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:1-26" + }, + "3941": { + "file_id": 333, + "content": "The code initializes variables and performs batch size operations. It copies the batch of frames and resizes it to accommodate for multiple segments per frame batch. The times vector will store execution time values.", + "type": "comment" + }, + "3942": { + "file_id": 333, + "content": " {\n for (int j = 0; j < this->num_seg; ++j)\n {\n frames_batch[i][j].copyTo(srcframes[i * this->num_seg + j]);\n }\n }\n auto preprocess_start = std::chrono::steady_clock::now();\n /* Preprocess */\n std::vector resize_frames;\n std::vector crop_frames;\n std::vector input;\n int num_views = 1;\n if (this->inference_model_name == \"ppTSM\")\n {\n num_views = 1;\n // 1. Scale\n resize_frames = std::vector(real_batch_num * this->num_seg, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->scale_op_.Run(srcframes[i * this->num_seg + j], resize_frames[i * this->num_seg + j], this->use_tensorrt_, 256);\n }\n }\n // 2. 
CenterCrop\n crop_frames = std::vector(real_batch_num * num_views * this->num_seg, cv::Mat());", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:27-55" + }, + "3943": { + "file_id": 333, + "content": "This code preprocesses video frames for model inference. It copies frames from source to destination, resizes them using a scale operation, and performs center cropping. The number of views is set to 1 if the model name is \"ppTSM\". The preprocessing steps include scaling and centering cropping to ensure the frames are properly formatted for inference.", + "type": "comment" + }, + "3944": { + "file_id": 333, + "content": " for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->centercrop_op_.Run(resize_frames[i * this->num_seg + j], crop_frames[i * this->num_seg + j], this->use_tensorrt_, 224);\n }\n }\n // 3. Normalization(inplace operation)\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->normalize_op_.Run(&crop_frames[i * num_views * this->num_seg + j * num_views + k], this->mean_, this->scale_, this->is_scale_);\n }\n }\n }\n // 4. Image2Array\n int rh = crop_frames[0].rows;\n int rw = crop_frames[0].cols;\n int rc = crop_frames[0].channels();\n input = std::vector(real_batch_num * num_views * this->num_seg * crop_frames[0].rows * crop_frames[0].cols * rc, 0.0f);", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:56-80" + }, + "3945": { + "file_id": 333, + "content": "This code performs image preprocessing and conversion on video frames before feeding them into a neural network. It first resizes, centers, and crops the video frames using `centercrop_op_.Run()`. Then it normalizes the frames in an in-place operation using `normalize_op_.Run()`, with the mean and scale values provided. Finally, it converts the normalized frames into a single array using the dimensions from the first frame, and stores them in the 'input' vector.", + "type": "comment" + }, + "3946": { + "file_id": 333, + "content": " for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->permute_op_.Run(&crop_frames[i * num_views * this->num_seg + j * num_views + k], input.data() + (i * num_views * this->num_seg + j * num_views + k) * (rh * rw * rc));\n }\n }\n }\n }\n else if(this->inference_model_name == \"ppTSN\")\n {\n num_views = 10;\n // 1. Scale\n resize_frames = std::vector(real_batch_num * this->num_seg, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->scale_op_.Run(srcframes[i * this->num_seg + j], resize_frames[i * this->num_seg + j], this->use_tensorrt_, 256);\n }\n }\n // 2. TenCrop", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:81-105" + }, + "3947": { + "file_id": 333, + "content": "The code is iterating over real_batch_num number of batches and num_seg segments within each batch. For each segment, it's applying a set of operations (permute, scale, TenCrop) to a series of frames. 
These operations are used for data preprocessing before inputting into an inference model.", + "type": "comment" + }, + "3948": { + "file_id": 333, + "content": " crop_frames = std::vector(real_batch_num * this->num_seg * num_views, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->tencrop_op_.Run(resize_frames[i * this->num_seg + j], crop_frames, (i * this->num_seg + j) * num_views, this->use_tensorrt_, 224);\n }\n }\n // 3. Normalization(inplace operation)\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->normalize_op_.Run(&crop_frames[i * this->num_seg * num_views + j * num_views + k], this->mean_, this->scale_, this->is_scale_);\n }\n }\n }\n // 4. Image2Array\n int rh = crop_frames[0].rows;\n int rw = crop_frames[0].cols;", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:106-129" + }, + "3949": { + "file_id": 333, + "content": "This code performs image preprocessing for video frames. It initializes a vector of crop_frames, iterates through real_batch_num and num_seg to run resizing and cropping operations on each frame using tencrop_op_. Next, it applies normalization inplace operation on each frame using normalize_op_. Finally, it converts the processed frames into an array by extracting rows and columns size from the first crop_frame.", + "type": "comment" + }, + "3950": { + "file_id": 333, + "content": " int rc = crop_frames[0].channels();\n input = std::vector(real_batch_num * this->num_seg * num_views * crop_frames[0].rows * crop_frames[0].cols * rc, 0.0f);\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->permute_op_.Run(&crop_frames[i * this->num_seg * num_views + j * num_views + k], input.data() + (i * this->num_seg * num_views + j * num_views + k) * (rh * rw * rc));\n }\n }\n }\n }\n else\n {\n throw \"[Error] Not implemented yet\";\n }\n auto preprocess_end = std::chrono::steady_clock::now();\n /* Inference */\n auto input_names = this->predictor_->GetInputNames();\n auto input_t = this->predictor_->GetInputHandle(input_names[0]);\n input_t->Reshape({real_batch_num * num_views * this->num_seg, 3, crop_frames[0].rows, crop_frames[0].cols});", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:130-152" + }, + "3951": { + "file_id": 333, + "content": "The code initializes a vector with zeros based on the number of frames, segments, views, and channels. It then iterates over the real batch number, segments, and views to permute and populate the input vector. 
Finally, it performs inference by reshaping the input tensor for prediction.", + "type": "comment" + }, + "3952": { + "file_id": 333, + "content": " auto inference_start = std::chrono::steady_clock::now();\n input_t->CopyFromCpu(input.data());\n this->predictor_->Run(); // Use the inference library to predict\n std::vector predict_batch;\n auto output_names = this->predictor_->GetOutputNames();\n auto output_t = this->predictor_->GetOutputHandle(output_names[0]);\n auto predict_shape = output_t->shape();\n // Get the number of class\n int class_num = predict_shape[1];\n int out_numel = std::accumulate(predict_shape.begin(), predict_shape.end(), 1, std::multiplies());\n predict_batch.resize(out_numel); // NxC\n output_t->CopyToCpu(predict_batch.data()); // Copy the model output to predict_batch\n // Convert output (logits) into probabilities\n for (int i = 0; i < real_batch_num; ++i)\n {\n this->softmax_op_.Inplace_Run(predict_batch.begin() + i * class_num, predict_batch.begin() + (i + 1) * class_num);\n }\n auto inference_end = std::chrono::steady_clock::now();", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:153-175" + }, + "3953": { + "file_id": 333, + "content": "This code segment runs inference with the Paddle predictor, copies the output logits back to the CPU, and applies the softmax operation to convert them into per-class probabilities.", + "type": "comment" + }, + "3954": { + "file_id": 333, + "content": " // output decode\n auto postprocess_start = std::chrono::steady_clock::now();\n std::vector str_res;\n std::vectorscores;\n for (int i = 0; i < real_batch_num; ++i)\n {\n int argmax_idx = int(Utility::argmax(predict_batch.begin() + i * class_num, predict_batch.begin() + (i + 1) * class_num));\n float score = predict_batch[argmax_idx];\n scores.push_back(score);\n str_res.push_back(this->label_list_[argmax_idx]);\n }\n auto postprocess_end = std::chrono::steady_clock::now();\n for (int i = 0; i < str_res.size(); i++)\n {\n std::cout << frames_batch_path[i] << \"\\tclass: \" << str_res[i] << \"\\tscore: \" << scores[i] << endl;\n }\n std::chrono::duration preprocess_diff = preprocess_end - preprocess_start;\n times->push_back(double(preprocess_diff.count() * 1000));\n std::chrono::duration inference_diff = inference_end - inference_start;\n times->push_back(double(inference_diff.count() * 1000));", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:177-198" + }, + "3955": { + "file_id": 333, + "content": "This code snippet is responsible for post-processing the video recognition results after model inference. 
It calculates the class and score for each frame, outputs it, and stores the processing times.", + "type": "comment" + }, + "3956": { + "file_id": 333, + "content": " std::chrono::duration postprocess_diff = postprocess_end - postprocess_start;\n times->push_back(double(postprocess_diff.count() * 1000));\n }\n void VideoRecognizer::LoadModel(const std::string &model_dir)\n {\n // AnalysisConfig config;\n paddle_infer::Config config;\n config.SetModel(model_dir + \"/\" + this->inference_model_name + \".pdmodel\",\n model_dir + \"/\" + this->inference_model_name + \".pdiparams\");\n if (this->use_gpu_)\n {\n config.EnableUseGpu(this->gpu_mem_, this->gpu_id_);\n if (this->use_tensorrt_)\n {\n auto precision = paddle_infer::Config::Precision::kFloat32;\n if (this->precision_ == \"fp16\")\n {\n precision = paddle_infer::Config::Precision::kHalf;\n }\n else if (this->precision_ == \"int8\")\n {\n precision = paddle_infer::Config::Precision::kInt8;\n }", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:199-223" + }, + "3957": { + "file_id": 333, + "content": "This code initializes a Paddle Video recognizer by loading the model from a given directory. It also sets up GPU and TensorRT configurations if needed, and specifies precision based on the provided string value.", + "type": "comment" + }, + "3958": { + "file_id": 333, + "content": " if (this->inference_model_name == \"ppTSM\" || this->inference_model_name == \"TSM\")\n {\n config.EnableTensorRtEngine(\n 1 << 30, // workspaceSize\n this->rec_batch_num * this->num_seg * 1, // maxBatchSize\n 3, // minSubgraphSize\n precision, // precision\n false,// useStatic\n false //useCalibMode\n );\n }\n else if(this->inference_model_name == \"ppTSN\" || this->inference_model_name == \"TSN\")\n {\n config.EnableTensorRtEngine(\n 1 << 30,\n this->rec_batch_num * this->num_seg * 10,\n 3, // minSubgraphSize\n precision,// precision\n false,// useStatic\n false //useCalibMode\n );\n }\n else", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:225-247" + }, + "3959": { + "file_id": 333, + "content": "This code checks the inference model name and configures TensorRT engine accordingly for different models like ppTSM, TSM, ppTSN, or TSN. It sets workspace size to a large value, maxBatchSize based on number of segments, minSubgraphSize to 3, precision, and disables useStatic and useCalibMode.", + "type": "comment" + }, + "3960": { + "file_id": 333, + "content": " {\n config.EnableTensorRtEngine(\n 1 << 30, // workspaceSize\n this->rec_batch_num, // maxBatchSize\n 3, // minSubgraphSize\n precision,// precision\n false,// useStatic\n false //useCalibMode\n );\n }\n std::cout << \"Enable TensorRT is: \" << config.tensorrt_engine_enabled() << std::endl;\n /* some model dose not suppport dynamic shape with TRT, deactivate it by default */\n // std::map > min_input_shape =\n // {\n // {\"data_batch_0\", {1, this->num_seg, 3, 1, 1}}\n // };\n // std::map > max_input_shape =\n // {\n // {\"data_batch_0\", {1, this->num_seg, 3, 256, 256}}\n // };\n // std::map > opt_input_shape =", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:248-271" + }, + "3961": { + "file_id": 333, + "content": "The code enables the TensorRT engine with specific parameters, such as workspace size, max batch size, minimum subgraph size, and precision. It checks if TensorRT is enabled and deactivates it by default for models that do not support dynamic shape. 
The code also defines input shape ranges (min, opt) for a particular key (\"data_batch_0\").", + "type": "comment" + }, + "3962": { + "file_id": 333, + "content": " // {\n // {\"data_batch_0\", {this->rec_batch_num, this->num_seg, 3, 224, 224}}\n // };\n // config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,\n // opt_input_shape);\n }\n }\n else\n {\n config.DisableGpu();\n if (this->use_mkldnn_)\n {\n config.EnableMKLDNN();\n // cache 10 different shapes for mkldnn to avoid memory leak\n config.SetMkldnnCacheCapacity(10);\n }\n config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);\n }\n config.SwitchUseFeedFetchOps(false);\n // true for multiple input\n config.SwitchSpecifyInputNames(true);\n config.SwitchIrOptim(true);\n config.EnableMemoryOptim();\n config.DisableGlogInfo();\n this->predictor_ = CreatePredictor(config);\n }\n} // namespace PaddleVideo", + "type": "code", + "location": "/deploy/cpp_infer/src/video_rec.cpp:272-304" + }, + "3963": { + "file_id": 333, + "content": "This code initializes a PaddleVideo predictor with TensorRT configuration options. It sets the GPU usage, enables MKLDNN (if needed), specifies input names and optimizations, disables INFO log messages, and creates the predictor.", + "type": "comment" + }, + "3964": { + "file_id": 334, + "content": "/deploy/cpp_infer/tools/build.sh", + "type": "filepath" + }, + "3965": { + "file_id": 334, + "content": "This script sets the paths for OpenCV, PaddlePaddle inference, CUDA, cuDNN, and TensorRT directories. It clears existing build directory, creates a new one, navigates to it, runs cmake with specified options, and then compiles the project using 'make -j' command.", + "type": "summary" + }, + "3966": { + "file_id": 334, + "content": "OPENCV_DIR=your_opencv_dir\nLIB_DIR=your_paddle_inference_dir\nCUDA_LIB_DIR=your_cuda_lib_dir\nCUDNN_LIB_DIR=your_cudnn_lib_dir\nTENSORRT_DIR=your_tensorRT_dir\nBUILD_DIR=build\nrm -rf ${BUILD_DIR}\nmkdir ${BUILD_DIR}\ncd ${BUILD_DIR}\ncmake .. \\\n -DPADDLE_LIB=${LIB_DIR} \\\n -DWITH_MKL=ON \\\n -DWITH_GPU=OFF \\\n -DWITH_STATIC_LIB=OFF \\\n -DWITH_TENSORRT=OFF \\\n -DOPENCV_DIR=${OPENCV_DIR} \\\n -DCUDNN_LIB=${CUDNN_LIB_DIR} \\\n -DCUDA_LIB=${CUDA_LIB_DIR} \\\n -DTENSORRT_DIR=${TENSORRT_DIR} \\\nmake -j", + "type": "code", + "location": "/deploy/cpp_infer/tools/build.sh:1-22" + }, + "3967": { + "file_id": 334, + "content": "This script sets the paths for OpenCV, PaddlePaddle inference, CUDA, cuDNN, and TensorRT directories. 
It clears existing build directory, creates a new one, navigates to it, runs cmake with specified options, and then compiles the project using 'make -j' command.", + "type": "comment" + }, + "3968": { + "file_id": 335, + "content": "/deploy/cpp_serving/paddle_env_install.sh", + "type": "filepath" + }, + "3969": { + "file_id": 335, + "content": "The code installs TensorRT and sets up Go environment for PaddleVideo C++ serving, checking CUDA version, downloading SSL libraries, and installing necessary packages.", + "type": "summary" + }, + "3970": { + "file_id": 335, + "content": "unset GREP_OPTIONS\nfunction install_trt(){\n CUDA_VERSION=$(nvcc --version | egrep -o \"V[0-9]+.[0-9]+\" | cut -c2-)\n if [ $CUDA_VERSION == \"10.2\" ]; then\n wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT6-cuda10.2-cudnn7.tar.gz --no-check-certificate\n tar -zxf TensorRT6-cuda10.2-cudnn7.tar.gz -C /usr/local\n cp -rf /usr/local/TensorRT-6.0.1.8/include/* /usr/include/ && cp -rf /usr/local/TensorRT-6.0.1.8/lib/* /usr/lib/\n rm -rf TensorRT6-cuda10.2-cudnn7.tar.gz\n elif [ $CUDA_VERSION == \"11.2\" ]; then\n wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz --no-check-certificate\n tar -zxf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz -C /usr/local\n cp -rf /usr/local/TensorRT-8.0.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.0.3.4/lib/* /usr/lib/\n rm -rf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz\n else\n echo \"No Cuda Found, no need to install TensorRT\"\n fi\n}\nfunction env_install()\n{\n apt install -y libcurl4-openssl-dev libbz2-dev", + "type": "code", + "location": "/deploy/cpp_serving/paddle_env_install.sh:1-22" + }, + "3971": { + "file_id": 335, + "content": "This script installs TensorRT based on the detected CUDA version, and installs necessary libraries for PaddleVideo. It checks the CUDA version, downloads the corresponding TensorRT package, extracts it to /usr/local, and copies relevant include and lib files to their respective directories. If no CUDA version is found, it displays a message stating no Cuda Found and no need to install TensorRT.", + "type": "comment" + }, + "3972": { + "file_id": 335, + "content": " wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && tar xf centos_ssl.tar && rm -rf centos_ssl.tar && mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so\n rm -rf /usr/local/go && wget -qO- https://paddle-ci.gz.bcebos.com/go1.15.12.linux-amd64.tar.gz | \\\n tar -xz -C /usr/local && \\\n mkdir /root/go && \\\n mkdir /root/go/bin && \\\n mkdir /root/go/src && \\\n echo \"GOROOT=/usr/local/go\" >> /root/.bashrc && \\\n echo \"GOPATH=/root/go\" >> /root/.bashrc && \\\n echo \"PATH=/usr/local/go/bin:/root/go/bin:$PATH\" >> /root/.bashrc\n install_trt\n}\nenv_install", + "type": "code", + "location": "/deploy/cpp_serving/paddle_env_install.sh:23-35" + }, + "3973": { + "file_id": 335, + "content": "This code is installing necessary packages and setting up Go environment for PaddleVideo C++ serving. 
It downloads SSL libraries, installs Go 1.15.12, sets GOROOT and GOPATH variables, and installs the trt package.", + "type": "comment" + }, + "3974": { + "file_id": 336, + "content": "/deploy/cpp_serving/preprocess_ops.py", + "type": "filepath" + }, + "3975": { + "file_id": 336, + "content": "The code defines a Compose class for image processing steps and functions to preprocess video frames, returning input/output variables. The get_preprocess_func function selects the correct preprocessing function based on the model name. Invalid names raise ValueError.", + "type": "summary" + }, + "3976": { + "file_id": 336, + "content": "import os\nimport sys\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, \"../../\")))\nfrom paddlevideo.loader.pipelines import (CenterCrop, Image2Array,\n Normalization, Sampler, Scale,\n VideoDecoder, TenCrop)\nimport numpy as np\nfrom typing import Dict, Tuple, List, Callable\nVALID_MODELS = [\"PPTSM\", \"PPTSN\"]\nimport os\nimport sys\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, \"../../\")))\nfrom paddlevideo.loader.pipelines import (CenterCrop, Image2Array,\n Normalization, Sampler, Scale,\n VideoDecoder, TenCrop)\nimport numpy as np\nfrom typing import Dict, Tuple, List, Callable\nVALID_MODELS = [\"PPTSM\", \"PPTSN\"]\nclass Compose:\n def __init__(self, transforms):\n self.transforms = transforms\n def __call__(self, img):\n for t in self.transforms:", + "type": "code", + "location": "/deploy/cpp_serving/preprocess_ops.py:1-34" + }, + "3977": { + "file_id": 336, + "content": "This code imports necessary libraries and defines the VALID_MODELS variable. It then creates a Compose class that takes in a list of transforms, allowing for composition of multiple image processing steps to be applied sequentially.", + "type": "comment" + }, + "3978": { + "file_id": 336, + "content": " img = t(img)\n return img\ndef np_softmax(x: np.ndarray, axis: int = 0) -> np.ndarray:\n \"\"\"softmax function\n Args:\n x (np.ndarray): logits\n axis (int): axis\n Returns:\n np.ndarray: probs\n \"\"\"\n x -= np.max(x, axis=axis, keepdims=True)\n x = np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)\n return x\ndef preprocess_PPTSM(video_path: str) -> Tuple[Dict[str, np.ndarray], List]:\n \"\"\"preprocess\n Args:\n video_path (str): input video path\n Returns:\n Tuple[Dict[str, np.ndarray], List]: feed and fetch\n \"\"\"\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n seq = Compose([\n VideoDecoder(),\n Sampler(8, 1, valid_mode=True),\n Scale(256),\n CenterCrop(224),\n Image2Array(),\n Normalization(img_mean, img_std)\n ])\n results = {\"filename\": video_path}\n results = seq(results)\n tmp_inp = np.expand_dims(results[\"imgs\"], axis=0) # [b,t,c,h,w]\n tmp_inp = np.expand_dims(tmp_inp, axis=0) # [1,b,t,c,h,w]", + "type": "code", + "location": "/deploy/cpp_serving/preprocess_ops.py:35-76" + }, + "3979": { + "file_id": 336, + "content": "The code is defining a function `preprocess_PPTSM` that takes a video path as input, and applies several image preprocessing steps before returning the feed and fetch data. These steps include decoding the video frames, sampling, scaling, cropping, converting to array format, and normalization using specific mean and standard deviation values. 
The resulting processed data is stored in the `results` dictionary, which contains the images and metadata.", + "type": "comment" + }, + "3980": { + "file_id": 336, + "content": " feed = {\"data_batch_0\": tmp_inp}\n fetch = [\"outputs\"]\n return feed, fetch\ndef preprocess_PPTSN(video_path: str) -> Tuple[Dict[str, np.ndarray], List]:\n \"\"\"preprocess\n Args:\n video_path (str): input video path\n Returns:\n Tuple[Dict[str, np.ndarray], List]: feed and fetch\n \"\"\"\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n seq = Compose([\n VideoDecoder(),\n Sampler(25, 1, valid_mode=True, select_left=True),\n Scale(256, fixed_ratio=True, do_round=True, backend='cv2'),\n TenCrop(224),\n Image2Array(),\n Normalization(img_mean, img_std)\n ])\n results = {\"filename\": video_path}\n results = seq(results)\n tmp_inp = np.expand_dims(results[\"imgs\"], axis=0) # [b,t,c,h,w]\n tmp_inp = np.expand_dims(tmp_inp, axis=0) # [1,b,t,c,h,w]\n feed = {\"data_batch_0\": tmp_inp}\n fetch = [\"outputs\"]\n return feed, fetch\ndef get_preprocess_func(model_name: str) -> Callable:\n \"\"\"get preprocess function by model_name", + "type": "code", + "location": "/deploy/cpp_serving/preprocess_ops.py:77-111" + }, + "3981": { + "file_id": 336, + "content": "The function preprocess_PPTSN takes in a video path, applies a series of image processing steps to the video frames, and returns feed and fetch variables for input and output respectively. The get_preprocess_func function returns a preprocessing function based on the given model name.", + "type": "comment" + }, + "3982": { + "file_id": 336, + "content": " Args:\n model_name (str): model's name, must in `VALID_MODELS`\n Returns:\n Callable: preprocess function corresponding to model name\n \"\"\"\n if model_name == \"PPTSM\":\n return preprocess_PPTSM\n elif model_name == \"PPTSN\":\n return preprocess_PPTSN\n else:\n raise ValueError(\n f\"model_name must in {VALID_MODELS}, but got model_name\")", + "type": "code", + "location": "/deploy/cpp_serving/preprocess_ops.py:113-126" + }, + "3983": { + "file_id": 336, + "content": "This function takes a model name as input and returns the corresponding preprocess function based on the conditionals provided. If the model name is \"PPTSM\", it will return the preprocess_PPTSM function, if the model name is \"PPTSN\" it will return preprocess_PPTSN, otherwise it raises a ValueError with an error message stating that the model name must be in VALID_MODELS.", + "type": "comment" + }, + "3984": { + "file_id": 337, + "content": "/deploy/cpp_serving/readme.md", + "type": "filepath" + }, + "3985": { + "file_id": 337, + "content": "The code deploys Paddle Serving, a model serving framework in PaddleVideo, through Docker on Linux with GPU and CPU options. 
It uses the paddle_serving_client.convert tool for model conversion and includes client scripts, environment setup, and troubleshooting for missing libraries.", + "type": "summary" + }, + "3986": { + "file_id": 337, + "content": "简体中文 | [English](./readme_en.md)\n# 模型服务化部署\n## 简介\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) 旨在帮助深度学习开发者轻松部署在线预测服务,支持一键部署工业级的服务能力、客户端和服务端之间高并发和高效通信、并支持多种编程语言开发客户端。\n该部分以 HTTP 预测服务部署为例,介绍怎样在 PaddleVideo 中使用 PaddleServing 部署模型服务。目前只支持 Linux 平台部署,暂不支持 Windows 平台。\n## Serving 安装\nServing 官网推荐使用 docker 安装并部署 Serving 环境。首先需要拉取 docker 环境并创建基于 Serving 的 docker。\n```bash\n# 启动GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash\nnvidia-docker exec -it test bash\n# 启动CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\n进入 docker 后,需要安装 Serving 相关的 python 包。\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\n#若为CPU部署环境:\npython3.7 -m pip install paddle-serving-server==0.7.0 # CPU\npython3.7 -m pip install paddlepaddle==2.2.0 # CPU", + "type": "code", + "location": "/deploy/cpp_serving/readme.md:1-32" + }, + "3987": { + "file_id": 337, + "content": "This code snippet provides instructions for deploying Paddle Serving, a model serving framework, as part of the PaddleVideo codebase. It explains that this is done through Docker and covers GPU-accelerated and CPU-based installations, including the necessary package installation commands. Deployment assumes a Linux platform and does not support Windows at present.", + "type": "comment" + }, + "3988": { + "file_id": 337, + "content": "#若为GPU部署环境\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#其他GPU环境需要确认环境再选择执行哪一条\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8\n```\n* 如果安装速度太慢,可以通过 `-i https://pypi.tuna.tsinghua.edu.cn/simple` 更换源,加速安装过程。\n* 更多环境和对应的安装包详见:https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## 行为识别服务部署\n### 模型转换\n使用 PaddleServing 做服务化部署时,需要将保存的 inference 模型转换为 Serving 模型。下面以 PP-TSM 模型为例,介绍如何部署行为识别服务。\n- 下载 PP-TSM 推理模型并转换为 Serving 模型:\n  ```bash\n  # 进入PaddleVideo目录\n  cd PaddleVideo\n  # 下载推理模型并解压到./inference下\n  mkdir ./inference\n  pushd ./inference\n  wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip\n  unzip ppTSM.zip\n  popd\n  # 转换成 Serving 模型\n  pushd deploy/cpp_serving\n  python3.7 -m paddle_serving_client.convert \\", + "type": "code", + "location": "/deploy/cpp_serving/readme.md:34-64" + }, + "3989": { + "file_id": 337, + "content": "This code provides instructions for installing different versions of PaddleServing Server with various GPU environments and specifies the required pip commands. It also mentions an alternative method to speed up the installation process by changing the pip source. 
Furthermore, it highlights how to convert a PP-TSM inference model into Serving format for deploying the behavior recognition service.", + "type": "comment" + }, + "3990": { + "file_id": 337, + "content": " --dirname ../../inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./ppTSM_serving_server \\\n --serving_client ./ppTSM_serving_client\n popd\n ```\n | 参数 | 类型 | 默认值 | 描述 |\n | ----------------- | ---- | ------------------ | ------------------------------------------------------------ |\n | `dirname` | str | - | 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。 |\n | `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 |\n | `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保>存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None |\n | `serving_server` | str | `\"serving_server\"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server |\n | `serving_client` | str | `\"serving_client\"` | 转换后的客户端配置文件存储路径。默认值为serving_client |\n- 推理模型转换完成后,会在`deploy/cpp_serving`文件夹下生成 `ppTSM_serving_client` 和 `ppTSM_serving_server` 两个文件夹,具备如下格式:", + "type": "code", + "location": "/deploy/cpp_serving/readme.md:65-81" + }, + "3991": { + "file_id": 337, + "content": "These are the remaining arguments to paddle_serving_client.convert: the model to convert is read from the \"inference/ppTSM\" directory, with \"ppTSM.pdmodel\" as the model filename and \"ppTSM.pdiparams\" as the params filename. The converted model is written under \"deploy/cpp_serving\", with the client configuration stored in \"ppTSM_serving_client\" and the server configuration stored in \"ppTSM_serving_server\".", + "type": "comment" + }, + "3992": { + "file_id": 337, + "content": " ```bash\n PaddleVideo/deploy/cpp_serving\n ├── ppTSM_serving_client\n │ ├── serving_client_conf.prototxt\n │ └── serving_client_conf.stream.prototxt\n └── ppTSM_serving_server\n ├── ppTSM.pdiparams\n ├── ppTSM.pdmodel\n ├── serving_server_conf.prototxt\n └── serving_server_conf.stream.prototxt\n ```\n 得到模型文件之后,需要分别修改 `ppTSM_serving_client` 下的 `serving_client_conf.prototxt` 和 `ppTSM_serving_server` 下的 `serving_server_conf.prototxt`,将两份文件中`fetch_var` 下的 `alias_name` 均改为 `outputs`\n **备注**: Serving 为了兼容不同模型的部署,提供了输入输出重命名的功能。这样,不同的模型在推理部署时,只需要修改配置文件的`alias_name`即可,无需修改代码即可完成推理部署。\n 修改后的`serving_server_conf.prototxt`如下所示:\n ```yaml\n feed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n }\n fetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false\n fetch_type: 1\n shape: 400\n }\n ```\n### 服务部署和请求\n`cpp_serving` 目录包含了启动 pipeline 服务、C++ serving服务和发送预测请求的代码,具体包括:", + "type": "code", + "location": "/deploy/cpp_serving/readme.md:82-118" + }, + "3993": { + "file_id": 337, + "content": "This code snippet shows the changes required in the `serving_client_conf.prototxt` and `serving_server_conf.prototxt` files after obtaining the model files: the `alias_name` under `fetch_var` must be renamed to 'outputs' in both. 
This allows Serving to support different models without modifying code during deployment.\n\nThe modified `serving_server_conf.prototxt` file shows a feed variable and a fetch variable with their respective names, alias names, shapes, and types.", + "type": "comment" + }, + "3994": { + "file_id": 337, + "content": " ```bash\n run_cpp_serving.sh # 启动C++ serving server端的脚本\n pipeline_http_client.py # client端发送数据并获取预测结果的脚本\n paddle_env_install.sh # 安装C++ serving环境脚本\n preprocess_ops.py # 存放预处理函数的文件\n ```\n#### C++ Serving\n- 进入工作目录:\n ```bash\n cd deploy/cpp_serving\n ```\n- 启动服务:\n ```bash\n # 在后台启动,过程中打印输出的日志会重定向保存到nohup.txt中,可以使用tailf nohup.txt查看输出\n bash run_cpp_serving.sh\n ```\n- 发送请求并获取结果:\n ```bash\n python3.7 serving_client.py \\\n -n PPTSM \\\n -c ./ppTSM_serving_client/serving_client_conf.prototxt \\\n --input_file=../../data/example.avi\n ```\n成功运行后,模型预测的结果会打印在 cmd 窗口中,结果如下:\n ```bash\n I0510 04:33:00.110025 37097 naming_service_thread.cpp:202] brpc::policy::ListNamingService(\"127.0.0.1:9993\"): added 1\n I0510 04:33:01.904764 37097 general_model.cpp:490] [client]logid=0,client_cost=1640.96ms,server_cost=1623.21ms.\n {'class_id': '[5]', 'prob': '[0.9907387495040894]'}\n ```\n**如果过程中报错显示找不到libnvinfer.so.6,可以执行脚本`paddle_env_install.sh`安装相关环境**\n ```bash\n bash paddle_env_install.sh\n ```\n## FAQ\n**Q1**: 发送请求后没有结果返回或者提示输出解码报错", + "type": "code", + "location": "/deploy/cpp_serving/readme.md:119-158" + }, + "3995": { + "file_id": 337, + "content": "This code provides the instructions to set up and run a C++ serving server, send requests from a client script, and interpret the results. It also includes a script to install the required environment and troubleshoot common issues such as missing libraries.", + "type": "comment" + }, + "3996": { + "file_id": 337, + "content": "**A1**: 启动服务和发送请求时不要设置代理,可以在启动服务前和发送请求前关闭代理,关闭代理的命令是:\n```\nunset https_proxy\nunset http_proxy\n```", + "type": "code", + "location": "/deploy/cpp_serving/readme.md:160-164" + }, + "3997": { + "file_id": 337, + "content": "Avoid setting proxies when starting the service and sending requests. 
Disable proxies by using \"unset https_proxy\" and \"unset http_proxy\" commands beforehand.", + "type": "comment" + }, + "3998": { + "file_id": 338, + "content": "/deploy/cpp_serving/readme_en.md", + "type": "filepath" + }, + "3999": { + "file_id": 338, + "content": "This code accelerates PaddleServing installation with Docker, supports Linux and GPU, simplifies action recognition service deployment, and provides a C++ serving environment setup guide.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/4.json b/docs/data/4.json new file mode 100644 index 000000000..b02f3858b --- /dev/null +++ b/docs/data/4.json @@ -0,0 +1,541 @@ +{ + "400": { + "file_id": 43, + "content": " 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43,\n 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49,\n 49, 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55,\n 55, 55, 56, 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60,\n 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66,\n 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72,\n 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77,\n 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83,\n 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89,\n 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94,\n 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104,\n 104, 105, 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:212-224" + }, + "401": { + "file_id": 43, + "content": "This code represents a sequence of numbers, potentially used for various purposes within the codebase such as tracking or counting.", + "type": "comment" + }, + "402": { + "file_id": 43, + "content": " 109, 109, 109, 110, 110, 110, 111, 111, 111, 112, 112, 112, 113,\n 113, 113, 114, 114, 114, 115, 115, 115, 116, 116, 116, 117, 117,\n 117, 118, 118, 118, 119, 119, 119, 120, 120, 120, 121, 121, 121,\n 122, 122, 122, 123, 123, 123, 124, 124, 124, 125, 125, 125, 126,\n 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130, 130,\n 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134,\n 135, 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139,\n 139, 139, 140, 140, 140, 141, 141, 141, 142, 142, 142, 143, 143,\n 143, 144, 144, 144, 145, 145, 145, 146, 146, 146, 147, 147, 147,\n 148, 148, 148, 149, 149, 149, 150, 150, 150, 151, 151, 151, 152,\n 152, 152, 153, 153, 153, 154, 154, 154, 155, 155, 155, 156, 156,\n 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160, 160, 160,\n 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:225-238" + }, + "403": { + "file_id": 43, + "content": "The code contains a sequence of numbers ranging from 109 to 169, which could be used as array indices or other numeric identifiers in the following lines. 
Without further context, it's difficult to determine the exact purpose of these numbers.", + "type": "comment" + }, + "404": { + "file_id": 43, + "content": " 169, 170, 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173,\n 174, 174, 174, 175, 175, 175, 176, 176, 176, 177, 177, 177, 178,\n 178, 178, 179, 179, 179, 180, 180, 180, 181, 181, 181, 182, 182,\n 182, 183, 183, 183, 184, 184, 184, 185, 185, 185, 186, 186, 186,\n 187, 187, 187, 188, 188, 188, 189, 189, 189, 190, 190, 190, 191,\n 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195, 195,\n 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199,\n 200, 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204,\n 204, 204, 205, 205, 205, 206, 206, 206, 207, 207, 207, 208, 208,\n 208, 209, 209, 209, 210, 210, 210, 211, 211, 211, 212, 212, 212,\n 213, 213, 213, 214, 214, 214, 215, 215, 215, 216, 216, 216, 217,\n 217, 217, 218, 218, 218, 219, 219, 219, 220, 220, 220, 221, 221,\n 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225, 225, 225,\n 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:239-252" + }, + "405": { + "file_id": 43, + "content": "This code snippet appears to be a list or sequence of numbers, possibly representing frame coordinates, timestamps, or some other numerical data used in video processing or analysis. The specific application and purpose would require further context from the surrounding code and documentation.", + "type": "comment" + }, + "406": { + "file_id": 43, + "content": " 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234,\n 234, 235, 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238,\n 239, 239, 239, 240, 240, 240, 241, 241, 241, 242, 242, 242, 243,\n 243, 243, 244, 244, 244, 245, 245, 245, 246, 246, 246, 247, 247,\n 247, 248, 248, 248, 249, 249, 249, 250, 250, 250, 251, 251, 251,\n 252, 252, 252, 253, 253, 253, 254, 254, 254, 255, 255, 255\n ]\n mask = mask_tensor.cpu().numpy().astype('uint8')\n mask = Image.fromarray(mask).convert('P')\n mask.putpalette(_palette)\n mask.save(path)\n def zip_folder(self, source_folder, zip_dir):\n f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED)\n pre_len = len(os.path.dirname(source_folder))\n for dirpath, dirnames, filenames in os.walk(source_folder):\n for filename in filenames:\n pathfile = os.path.join(dirpath, filename)\n arcname = pathfile[pre_len:].strip(os.path.sep)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:253-271" + }, + "407": { + "file_id": 43, + "content": "This code snippet seems to define a function `zip_folder` that compresses files within a source folder into a zip file. It also includes a nested function that creates and saves an image mask, potentially for visualization purposes. However, the context or specific functionality of these functions is not clear without additional information about the larger codebase.", + "type": "comment" + }, + "408": { + "file_id": 43, + "content": " f.write(pathfile, arcname)\n f.close()\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n self.zip_folder(self.result_root, self.zip_dir)\n logger.info('Save result to {}.'.format(self.zip_dir))", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:272-279" + }, + "409": { + "file_id": 43, + "content": "This code writes data to a file and then closes it. 
It defines a function called accumulate that aggregates metrics when all iterations are complete. This function zips the folder, creates a zip directory, and logs a message indicating that the result is saved in the specified directory.", + "type": "comment" + }, + "410": { + "file_id": 44, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py", + "type": "filepath" + }, + "411": { + "file_id": 44, + "content": "This code is an import script for PaddleVideo, registering various modules like backbones, heads, recognizers, localizers, and losses in relevant registries. It defines key function names and includes popular models like 'DeepLab' and 'IntVOS'.", + "type": "summary" + }, + "412": { + "file_id": 44, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .backbones import DeepLab\nfrom .builder import (build_backbone, build_head, build_localizer, build_loss,\n build_recognizer)\nfrom .heads import IntVOS\nfrom .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES,\n PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)\nfrom .weight_init import kaiming_normal_, trunc_normal_, weight_init_\n__all__ = [\n 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS',", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py:1-23" + }, + "413": { + "file_id": 44, + "content": "This code appears to be an import script for PaddleVideo, importing various modules such as backbones, heads, recognizers, localizers, and losses from different parts of the codebase. It also registers these items in relevant registries (e.g., BACKBONES, HEADS) and defines __all__ to include those registered items.", + "type": "comment" + }, + "414": { + "file_id": 44, + "content": " 'LOSSES', 'build_recognizer', 'build_localizer', 'build_head',\n 'build_backbone', 'build_loss', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_',\n 'weight_init_', 'DeepLab', 'IntVOS'\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py:24-27" + }, + "415": { + "file_id": 44, + "content": "This code defines several variables and function names used in the PaddleVideo library, including loss functions ('build_loss'), backbone building ('build_backbone'), detectors ('DETECTORS'), and initialization methods ('weight_init_'). It also includes references to popular models such as 'DeepLab' and 'IntVOS'.", + "type": "comment" + }, + "416": { + "file_id": 45, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py", + "type": "filepath" + }, + "417": { + "file_id": 45, + "content": "This code is an import statement for the DeepLab class from the deeplab_manet module and specifies that it should be included in the __all__ list.", + "type": "summary" + }, + "418": { + "file_id": 45, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .deeplab_manet import DeepLab\n__all__ = ['DeepLab']", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py:1-16" + }, + "419": { + "file_id": 45, + "content": "This code is an import statement for the DeepLab class from the deeplab_manet module and specifies that it should be included in the __all__ list.", + "type": "comment" + }, + "420": { + "file_id": 46, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py", + "type": "filepath" + }, + "421": { + "file_id": 46, + "content": "The code initializes an ASPP network using Paddle, defines its architecture, sets parameters for BatchNorm layers and global average pooling operation, creates the ASPP-MANET backbone model class, initializes layers, and applies weight initialization.", + "type": "summary" + }, + "422": { + "file_id": 46, + "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation,\n BatchNorm):\n super(_ASPPModule, self).__init__()\n self.atrous_conv = nn.Conv2D(inplanes,\n planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation,\n bias_attr=False)\n self.bn = BatchNorm(planes)\n self.relu = nn.ReLU(True)\n self._init_weight()\n def forward(self, x):\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:1-32" + }, + "423": { + "file_id": 46, + "content": "Imports Paddle, nn, and functional modules for creating a module that implements the ASPP layer with Conv2D, BatchNorm, and ReLU layers. 
Initializes weights using Kaiming normal distribution.", + "type": "comment" + }, + "424": { + "file_id": 46, + "content": " elif isinstance(m, nn.BatchNorm2D):\n from EIVideo.paddlevideo.utils.manet_utils import fill_\n fill_(m.weight, 1)\n from EIVideo.paddlevideo.utils.manet_utils import zero_\n zero_(m.bias)\nclass ASPP(nn.Layer):\n def __init__(self, backbone, output_stride, BatchNorm):\n super(ASPP, self).__init__()\n if backbone == 'drn':\n inplanes = 512\n elif backbone == 'mobilenet':\n inplanes = 320\n else:\n inplanes = 2048\n if output_stride == 16:\n dilations = [1, 6, 12, 18]\n elif output_stride == 8:\n dilations = [1, 12, 24, 36]\n else:\n raise NotImplementedError\n self.aspp1 = _ASPPModule(inplanes,\n 256,\n 1,\n padding=0,\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.aspp2 = _ASPPModule(inplanes,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:33-62" + }, + "425": { + "file_id": 46, + "content": "This code initializes an ASPP (Atrous Spatial Pyramid Pooling) network. The network architecture is defined based on the selected backbone and output stride. The BatchNorm layer weights are set to 1, and its biases are set to zero using functions from the manet_utils module. Dilation rates for each ASPPModule are determined based on the chosen output stride.", + "type": "comment" + }, + "426": { + "file_id": 46, + "content": " 256,\n 3,\n padding=dilations[1],\n dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.aspp3 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.aspp4 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False),\n BatchNorm(256), nn.ReLU())\n self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:63-85" + }, + "427": { + "file_id": 46, + "content": "This code defines an ASPP module with three branches, each having different dilation rates. It also includes a global average pooling operation and subsequent convolution layers to extract features from the input planes.", + "type": "comment" + }, + "428": { + "file_id": 46, + "content": " self.bn1 = BatchNorm(256)\n self.relu = nn.ReLU(True)\n self.dropout = nn.Dropout(0.1)\n self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x1, x2, x3, x4, x5), axis=1)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return x\n return self.dropout(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n # n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n # m.weight.normal_(0, math.sqrt(2. / n))\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from EIVideo.paddlevideo.utils.manet_utils import fill_", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:86-117" + }, + "429": { + "file_id": 46, + "content": "This code defines a class for ASPP-MANET backbone model in the ASPP-MANET network architecture. 
It initializes batch normalization, ReLU activation and dropout layers, with the weight initialization defined in a separate method. The forward pass runs the four atrous branches and the global-average-pooling branch on x, upsamples the pooled branch, and concatenates the five outputs along axis=1; a 1x1 Conv2D is then applied, followed by BatchNorm2D and ReLU. Note that the method returns right after the ReLU, so the trailing `return self.dropout(x)` is unreachable and the dropout layer is effectively unused. The weight initialization uses the Kaiming Normal distribution for convolutional layers and the fill function for batch normalization layers.", + "type": "comment" + }, + "430": { + "file_id": 46, + "content": " fill_(m.weight, 1)\n from EIVideo.paddlevideo.utils.manet_utils import zero_\n zero_(m.bias)\ndef build_aspp(backbone, output_stride, BatchNorm):\n return ASPP(backbone, output_stride, BatchNorm)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:118-124" + }, + "431": { + "file_id": 46, + "content": "The first lines finish _init_weight for BatchNorm2D layers, filling their weights with 1 and zeroing their biases via the zero_ function from the manet_utils module; build_aspp itself simply constructs and returns an ASPP instance.", + "type": "comment" + }, + "432": { + "file_id": 47, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py", + "type": "filepath" + }, + "433": { + "file_id": 47, + "content": "This code defines a Paddle nn.Layer Decoder class with convolutional layers, BatchNorm, and ReLU activation functions for Manet architecture decoding. It imports the 'zero_' function to initialize all model biases to 0.", + "type": "summary" + }, + "434": { + "file_id": 47, + "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_\nclass Decoder(nn.Layer):\n def __init__(self, num_classes, backbone, BatchNorm):\n super(Decoder, self).__init__()\n if backbone == 'resnet' or backbone == 'drn' or backbone == 'resnet_edge':\n low_level_inplanes = 256\n elif backbone == 'xception':\n low_level_inplanes = 128\n elif backbone == 'mobilenet':\n low_level_inplanes = 24\n else:\n raise NotImplementedError\n self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False)\n self.bn1 = BatchNorm(48)\n self.relu = nn.ReLU(True)\n self.last_conv = nn.Sequential(\n nn.Conv2D(304,\n 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential(),\n nn.Conv2D(256,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:1-30" + }, + "435": { + "file_id": 47, + "content": "This code defines a Decoder class using Paddle's nn.Layer, which takes in the number of classes and backbone type as parameters. It initializes the convolutional layers for feature extraction, BatchNorm layers for normalization, and ReLU activation functions. 
The last_conv sequence contains multiple Conv2D, BatchNorm, and ReLU layers for further processing.", + "type": "comment" + }, + "436": { + "file_id": 47, + "content": " 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential())\n self._init_weight()\n def forward(self, x, low_level_feat):\n low_level_feat = self.conv1(low_level_feat)\n low_level_feat = self.bn1(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x, low_level_feat), axis=1)\n x = self.last_conv(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from EIVideo.paddlevideo.utils.manet_utils import fill_\n fill_(m.weight, 1)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:31-59" + }, + "437": { + "file_id": 47, + "content": "This code defines a decoder block for the Manet architecture. It includes a 2D convolution layer, batch normalization, and ReLU activation. The forward function performs interpolation on input feature maps and concatenates them with low-level features before passing through a final convolution. The _init_weight function initializes the weights of the block using Kaiming initialization for convolutions and fills batch norm with a constant value.", + "type": "comment" + }, + "438": { + "file_id": 47, + "content": " from EIVideo.paddlevideo.utils.manet_utils import zero_\n zero_(m.bias)\ndef build_decoder(num_classes, backbone, BatchNorm):\n return Decoder(num_classes, backbone, BatchNorm)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:60-65" + }, + "439": { + "file_id": 47, + "content": "This code imports the function 'zero_' from EIVideo.paddlevideo.utils.manet_utils and then defines a build_decoder function that returns an instance of Decoder class with provided parameters (num_classes, backbone, BatchNorm). The zero_(m.bias) line initializes all the bias in the model (m) to 0 using the imported 'zero_' function.", + "type": "comment" + }, + "440": { + "file_id": 48, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py", + "type": "filepath" + }, + "441": { + "file_id": 48, + "content": "The code introduces a FrozenBatchNorm2d class for static batch normalization and DeepLab class as a neural network backbone, allowing freezing of BatchNorm layers. 
The code also provides methods to get parameters with different learning rates and creates an instance of the model, evaluates it, generates input data, and outputs its shape.", + "type": "summary" + }, + "442": { + "file_id": 48, + "content": "import paddle\nimport paddle.nn as nn\nfrom ..registry import BACKBONES\nfrom EIVideo.paddlevideo.modeling.backbones.aspp_manet import build_aspp\nfrom EIVideo.paddlevideo.modeling.backbones.decoder_manet import build_decoder\nfrom EIVideo.paddlevideo.modeling.backbones.resnet_manet import build_backbone\nclass FrozenBatchNorm2d(nn.Layer):\n def __init__(self, n):\n super(FrozenBatchNorm2d, self).__init__()\n self.register_buffer(\"weight\", paddle.ones(n))\n self.register_buffer(\"bias\", paddle.zeros(n))\n self.register_buffer(\"running_mean\", paddle.zeros(n))\n self.register_buffer(\"running_var\", paddle.ones(n))\n def forward(self, x):\n if x.dtype == paddle.float16:\n self.weight = self.weight.half()\n self.bias = self.bias.half()\n self.running_mean = self.running_mean.half()\n self.running_var = self.running_var.half()\n scale = self.weight * self.running_var.rsqrt()\n bias = self.bias - self.running_mean * scale\n scale = scale.reshape(1, -1, 1, 1)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:1-26" + }, + "443": { + "file_id": 48, + "content": "This code defines a FrozenBatchNorm2d class and imports necessary modules. The class is used to create a batch normalization layer where the parameters are frozen, meaning they will not be updated during training.", + "type": "comment" + }, + "444": { + "file_id": 48, + "content": " bias = bias.reshape(1, -1, 1, 1)\n return x * scale + bias\n@BACKBONES.register()\nclass DeepLab(nn.Layer):\n def __init__(self,\n backbone='resnet',\n output_stride=16,\n num_classes=21,\n freeze_bn=False,\n pretrained=None):\n super(DeepLab, self).__init__()\n if backbone == 'drn':\n output_stride = 8\n if freeze_bn == True:\n print(\"Use frozen BN in DeepLab\")\n BatchNorm = FrozenBatchNorm2d\n else:\n BatchNorm = nn.BatchNorm2D\n self.backbone = build_backbone(output_stride, BatchNorm, pretrained)\n self.aspp = build_aspp(backbone, output_stride, BatchNorm)\n self.decoder = build_decoder(num_classes, backbone, BatchNorm)\n def forward(self, input):\n x, low_level_feat = self.backbone(input)\n x = self.aspp(x)\n x = self.decoder(x, low_level_feat)\n return x\n def freeze_bn(self):\n for m in self.sublayers():\n if isinstance(m, nn.BatchNorm2D):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:27-61" + }, + "445": { + "file_id": 48, + "content": "The code defines a DeepLab class as a neural network backbone, which uses other modules (backbone, ASPP, and decoder) for feature extraction and classification. It takes input and returns output after passing through these modules. 
The freeze_bn method can be called to freeze the BatchNorm layers if needed.", + "type": "comment" + }, + "446": { + "file_id": 48, + "content": " m.eval()\n def get_1x_lr_params(self):\n modules = [self.backbone]\n for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p\n def get_10x_lr_params(self):\n modules = [self.aspp, self.decoder]\n for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p\nif __name__ == \"__main__\":\n model = DeepLab(backbone='resnet', output_stride=16)\n model.eval()\n input = paddle.rand([2, 3, 513, 513])\n output = model(input)\n print(output.shape)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:62-90" + }, + "447": { + "file_id": 48, + "content": "This code defines a DeepLab model with backbone options, sets the model to evaluation mode, and provides two methods for getting parameters with different learning rates. The main part of the code creates an instance of the model, evaluates it, generates random input data, and outputs its shape.", + "type": "comment" + }, + "448": { + "file_id": 49, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py", + "type": "filepath" + }, + "449": { + "file_id": 49, + "content": "The code defines a ResNet-MANET model with BatchNorm, ReLU activation, and residual blocks using convolution, batch normalization, and max pooling layers. The model is initialized and processes input to obtain output and low-level features as JSON files.", + "type": "summary" + }, + "450": { + "file_id": 49, + "content": "import paddle.nn as nn\n# from reprod_log.utils import paddle2np\nfrom EIVideo.paddlevideo.utils.manet_utils import fill_, zero_\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self,\n inplanes,\n planes,\n stride=1,\n dilation=1,\n downsample=None,\n BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)\n self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes,\n planes * 4,\n kernel_size=1,\n bias_attr=False)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:1-31" + }, + "451": { + "file_id": 49, + "content": "This code defines the Bottleneck class for ResNet architecture, consisting of convolutional layers and batch normalization layers. 
It takes parameters such as inplanes, planes, stride, dilation, downsample, and BatchNorm for initialization.", + "type": "comment" + }, + "452": { + "file_id": 49, + "content": " self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNet(nn.Layer):\n def __init__(self,\n block,\n layers,\n output_stride,\n BatchNorm,\n pretrained=None):\n self.inplanes = 64\n super(ResNet, self).__init__()\n blocks = [1, 2, 4]\n if output_stride == 16:\n strides = [1, 2, 2, 1]\n dilations = [1, 1, 1, 2]\n elif output_stride == 8:\n strides = [1, 2, 1, 1]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:32-75" + }, + "453": { + "file_id": 49, + "content": "Class \"ResNet\" is a Residual Network backbone with multiple blocks and layers. It utilizes BatchNorm for normalization, ReLU as the activation function, and supports different output strides.", + "type": "comment" + }, + "454": { + "file_id": 49, + "content": " dilations = [1, 1, 2, 4]\n else:\n raise NotImplementedError\n # Modules\n self.conv1 = nn.Conv2D(3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block,\n 64,\n layers[0],\n stride=strides[0],\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block,\n 128,\n layers[1],\n stride=strides[1],\n dilation=dilations[1],", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:76-101" + }, + "455": { + "file_id": 49, + "content": "The code defines a ResNet-MANET backbone model with BatchNorm and ReLU activation functions. It initializes convolution, batch normalization, ReLU, max pooling layers along with the first two residual blocks based on input parameters such as block type, number of channels, number of layers, and strides. Dilations are assigned based on the provided conditions.", + "type": "comment" + }, + "456": { + "file_id": 49, + "content": " BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block,\n 256,\n layers[2],\n stride=strides[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.layer4 = self._make_MG_unit(block,\n 512,\n blocks=blocks,\n stride=strides[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self.init_weight()\n def _make_layer(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:102-127" + }, + "457": { + "file_id": 49, + "content": "This code defines a ResNet-MANET model, creating layers and functions for the network architecture. It includes the creation of three main layers (layer1, layer2, and layer3), using blocks and specific parameters such as stride and dilation. The _make_MG_unit function is used to create an additional MG unit in the layer4. 
Finally, the init_weight method initializes the weight for the network.", + "type": "comment" + }, + "458": { + "file_id": 49, + "content": " downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, stride, dilation, downsample,\n BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(self.inplanes,\n planes,\n dilation=dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_MG_unit(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:128-159" + }, + "459": { + "file_id": 49, + "content": "This code defines a function _make_MG_unit that creates a residual block with downsampling for a ResNet model. The downsample operation is determined based on stride and inplanes values, and BatchNorm layer is optional.", + "type": "comment" + }, + "460": { + "file_id": 49, + "content": " downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes,\n planes,\n stride,\n dilation=blocks[0] * dilation,\n downsample=downsample,\n BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, len(blocks)):\n layers.append(\n block(self.inplanes,\n planes,\n stride=1,\n dilation=blocks[i] * dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def forward(self, input):\n x = self.conv1(input)\n x = self.bn1(x)\n x = self.relu(x)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:160-191" + }, + "461": { + "file_id": 49, + "content": "This code defines a ResNet-MANET backbone model. It uses BatchNorm layers and block functions to create multiple convolutional layers with different dilation rates. The forward function applies the first layer, batch normalization, and ReLU activation before returning the sequence of layers.", + "type": "comment" + }, + "462": { + "file_id": 49, + "content": " x = self.maxpool(x)\n x = self.layer1(x)\n low_level_feat = x\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x, low_level_feat\n def init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n fill_(m.weight, 1)\n elif isinstance(m, nn.BatchNorm2D):\n fill_(m.weight, 1)\n zero_(m.bias)\n return self.sublayers()\ndef ResNet101(output_stride, BatchNorm, pretrained=None):\n \"\"\"Constructs a ResNet-101 model.\n Args:\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n \"\"\"\n model = ResNet(Bottleneck, [3, 4, 23, 3],\n output_stride,\n BatchNorm,\n pretrained=pretrained)\n return model\ndef build_backbone(output_stride, BatchNorm, pretrained):\n return ResNet101(output_stride, BatchNorm, pretrained)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:192-227" + }, + "463": { + "file_id": 49, + "content": "This code defines a ResNet101 model with BatchNorm and outputs the features at different stages. 
It initializes the weights of convolutional layers, and builds a backbone based on output stride and pretrained parameters.", + "type": "comment" + }, + "464": { + "file_id": 49, + "content": "if __name__ == \"__main__\":\n import paddle\n model = ResNet101(BatchNorm=nn.BatchNorm2D,\n pretrained=True,\n output_stride=8)\n input = paddle.rand([1, 3, 512, 512])\n output, low_level_feat = model(input)\n print(output.shape)\n print(low_level_feat.shape)\n import json\n with open('output.txt', 'w') as f:\n json.dump(output.tolist(), f)\n with open('low_level_feat.txt', 'w') as f:\n json.dump(low_level_feat.tolist(), f)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:230-245" + }, + "465": { + "file_id": 49, + "content": "The code initializes a ResNet101 model, generates random input, passes it through the model to obtain output and low-level features, and saves them as JSON files.", + "type": "comment" + }, + "466": { + "file_id": 50, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py", + "type": "filepath" + }, + "467": { + "file_id": 50, + "content": "This code imports and registers various models for computer vision, defines functions to build these components based on configuration, and uses a \"build\" function to determine the model type.", + "type": "summary" + }, + "468": { + "file_id": 50, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT\nfrom ..utils import build\nfrom .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS,\n DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES,\n MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:1-19" + }, + "469": { + "file_id": 50, + "content": "This code imports necessary modules and registers various types of models for a computer vision application. 
It also includes licensing information and provides utility functions for model building.", + "type": "comment" + }, + "470": { + "file_id": 50, + "content": "def build_backbone(cfg):\n \"\"\"Build backbone.\"\"\"\n return build(cfg, BACKBONES)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box batch_sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box batch_sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_head(cfg):\n \"\"\"Build head.\"\"\"\n return build(cfg, HEADS)\ndef build_loss(cfg):\n \"\"\"Build loss.\"\"\"\n return build(cfg, LOSSES)\ndef build_recognizer(cfg):\n \"\"\"Build recognizer.\"\"\"\n return build(cfg, RECOGNIZERS, key='framework')\ndef build_localizer(cfg):\n \"\"\"Build localizer.\"\"\"", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:22-73" + }, + "471": { + "file_id": 50, + "content": "The code defines functions for building various components of a video processing model, including backbone, roi extractor, assigner, sampler, head, loss, recognizer, and localizer. These functions use the `build()` method to construct the components based on the given configuration (cfg). BACKBONES, ROI_EXTRACTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, HEADS, LOSSES, RECOGNIZERS, and framework are used as parameters in the `build()` method. The functions repeat twice for each component, which could be a code formatting issue or redundancy.", + "type": "comment" + }, + "472": { + "file_id": 50, + "content": " return build(cfg, LOCALIZERS, key='framework')\ndef build_segmentationer(cfg):\n \"\"\"Build detector.\"\"\"\n return build(cfg, SEGMENT, key='framework')\ndef build_partitioner(cfg):\n \"\"\"Build partitioner.\"\"\"\n return build(cfg, PARTITIONERS, key='framework')\ndef build_estimator(cfg):\n \"\"\"Build estimator.\"\"\"\n return build(cfg, ESTIMATORS, key='framework')\ndef build_multimodal(cfg):\n \"\"\"Build multimodal.\"\"\"\n return build(cfg, MULTIMODAL, key='framework')\ndef build_detector(cfg):\n \"\"\"Build multimodal.\"\"\"\n return build(cfg, DETECTORS, key='framework')\ndef build_segment(cfg):\n \"\"\"Build segment.\"\"\"\n return build(cfg, SEGMENT, key='framework')\ndef build_model(cfg, key='framework'):\n cfg_copy = cfg.copy()\n framework_type = cfg_copy.get(key)\n if framework_type in RECOGNIZERS:\n return build_recognizer(cfg)\n elif framework_type in LOCALIZERS:\n return build_localizer(cfg)\n elif framework_type in PARTITIONERS:\n return build_partitioner(cfg)\n elif framework_type in DETECTORS:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:74-116" + }, + "473": { + "file_id": 50, + "content": "The code defines several functions that build different models such as recognizer, localizer, partitioner, estimator, and segment. It uses a \"build\" function to determine which model to create based on the provided configuration (cfg). 
The model is built by copying the cfg and checking its value for the key 'framework', then calling the appropriate function to build the desired model.", + "type": "comment" + }, + "474": { + "file_id": 50, + "content": " return build_detector(cfg)\n elif framework_type in ESTIMATORS:\n return build_estimator(cfg)\n elif framework_type in MULTIMODAL:\n return build_multimodal(cfg)\n elif framework_type in SEGMENT:\n return build_segment(cfg)\n else:\n raise NotImplementedError", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:117-125" + }, + "475": { + "file_id": 50, + "content": "This code is selecting a function to build a video analysis framework based on the given configuration (cfg) and framework type. If the type matches any of the predefined categories, it returns the corresponding function result. Otherwise, it raises a NotImplementedError.", + "type": "comment" + }, + "476": { + "file_id": 51, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py", + "type": "filepath" + }, + "477": { + "file_id": 51, + "content": "This file contains the definitions for BaseSegment and Manet classes, both part of PaddleVideo framework. These classes are likely used in video modeling or segmentation tasks. The code is licensed under Apache License 2.0 and distributed as-is without warranties or conditions.", + "type": "summary" + }, + "478": { + "file_id": 51, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .segment import BaseSegment, Manet\n__all__ = ['BaseSegment',\n 'Manet'\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py:1-19" + }, + "479": { + "file_id": 51, + "content": "This file contains the definitions for BaseSegment and Manet classes, both part of PaddleVideo framework. These classes are likely used in video modeling or segmentation tasks. The code is licensed under Apache License 2.0 and distributed as-is without warranties or conditions.", + "type": "comment" + }, + "480": { + "file_id": 52, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py", + "type": "filepath" + }, + "481": { + "file_id": 52, + "content": "This code snippet contains copyright and license information, imports necessary modules from the PaddlePaddle framework, and defines two classes 'BaseSegment' and 'Manet'. It also specifies that these are the components included in the current module.", + "type": "summary" + }, + "482": { + "file_id": 52, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseSegment\nfrom .manet_stage1 import Manet\n__all__ = [\n 'BaseSegment',\n 'Manet',\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py:1-19" + }, + "483": { + "file_id": 52, + "content": "This code snippet contains copyright and license information, imports necessary modules from the PaddlePaddle framework, and defines two classes 'BaseSegment' and 'Manet'. It also specifies that these are the components included in the current module.", + "type": "comment" + }, + "484": { + "file_id": 53, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py", + "type": "filepath" + }, + "485": { + "file_id": 53, + "content": "This code defines a PaddlePaddle base class for semi-Video Object Segmentation, with methods for training, validating, testing, and inference, determined by the mode argument.", + "type": "summary" + }, + "486": { + "file_id": 53, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseSegment(nn.Layer):\n \"\"\"Base class for semi-Video Object Segmentation.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Head to process feature.\n loss(dict): Loss function.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:2-30" + }, + "487": { + "file_id": 53, + "content": "This code is a base class for semi-Video Object Segmentation in PaddlePaddle. It requires subclasses to overwrite training, validation, and testing forward methods. 
The class also includes backbone modules for feature extraction and head modules for processing features, with specified loss functions.", + "type": "comment" + }, + "488": { + "file_id": 53, + "content": " \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n if backbone != None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head != None:\n self.head_name = head.name\n if head.name == 'IntVOS':\n head.update({'feature_extracter': self.backbone})\n self.head = builder.build_head(head)\n else:\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n if loss != None:\n self.loss = builder.build_loss(loss)\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer', **kwargs):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:31-59" + }, + "489": { + "file_id": 53, + "content": "The code initializes a model by building the backbone, head, and loss components. In the forward function, it defines how the model processes input data for training or inference.", + "type": "comment" + }, + "490": { + "file_id": 53, + "content": " 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch, **kwargs)\n elif mode == 'valid':\n return self.val_step(data_batch, **kwargs)\n elif mode == 'test':\n return self.test_step(data_batch, **kwargs)\n elif mode == 'infer':\n return self.infer_step(data_batch, **kwargs)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:60-95" + }, + "491": { + "file_id": 53, + "content": "This code defines a class with different step methods for training, validating, testing, and inference. The mode argument determines which method to execute based on the current task. Each step method is marked as an abstractmethod requiring subclass implementation.", + "type": "comment" + }, + "492": { + "file_id": 54, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py", + "type": "filepath" + }, + "493": { + "file_id": 54, + "content": "The code imports modules, defines a Manet class for video segmentation using PaddleVideo's Manet_Stage1 model, and implements training, inference, mask generation, parallel processing, and frame saving steps. It is for deep learning models, visualizations, and measuring time efficiency.", + "type": "summary" + }, + "494": { + "file_id": 54, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom EIVideo.paddlevideo.loader.builder import build_pipeline\nfrom EIVideo.paddlevideo.loader.pipelines import ToTensor_manet\nimport os\nimport timeit\nimport paddle\nfrom PIL import Image\nfrom davisinteractive.utils.scribbles import scribbles2mask, annotated_frames\nfrom paddle import nn\nfrom EIVideo.paddlevideo.utils import load\nfrom EIVideo.paddlevideo.utils.manet_utils import float_, _palette, damage_masks, long_, write_dict, rough_ROI", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:1-26" + }, + "495": { + "file_id": 54, + "content": "This code is importing necessary modules and functions from different locations, including image processing utilities and machine learning libraries. It also imports some utility functions specific to the MANET model in PaddlePaddle. The code is part of a larger framework for video modeling and image segmentation tasks.", + "type": "comment" + }, + "496": { + "file_id": 54, + "content": "from EIVideo.api import load_video, get_scribbles, submit_masks\nfrom ...builder import build_model\nfrom ...registry import SEGMENT\nfrom .base import BaseSegment\n# if cfg.MODEL.framework == \"Manet\":\n# cfg_helper = {\"knns\": 1,\n# \"is_save_image\": True}\n# cfg.update(cfg_helper)\n# build_model(cfg['MODEL']).test_step(**cfg,\n# weights=weights,\n# parallel=False)\n# return\n@SEGMENT.register()\nclass Manet(BaseSegment):\n def __init__(self, backbone=None, head=None, **cfg):\n super().__init__(backbone, head, **cfg)\n def train_step(self, data_batch, step, **cfg):\n pass\n def val_step(self, data_batch, **kwargs):\n pass\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n pass\n def test_step(self, weights, parallel=True, is_save_image=True, **cfg):\n # 1. Construct model.\n cfg['MODEL'].head.pretrained = ''", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:27-61" + }, + "497": { + "file_id": 54, + "content": "The code defines a class \"Manet\", registered under SEGMENT, that inherits from the BaseSegment class for video segmentation. Its train_step, val_step and infer_step methods are declared but left empty, while test_step is implemented: it clears the head's pretrained weights path and begins constructing the model for testing. A commented-out snippet above the class shows how build_model could be called with extra arguments such as weights and parallel=False.", + "type": "comment" + }, + "498": { + "file_id": 54, + "content": " cfg['MODEL'].head.test_mode = True\n model = build_model(cfg['MODEL'])\n if parallel:\n model = paddle.DataParallel(model)\n # 2.
Construct data.\n sequence = cfg[\"video_path\"].split('/')[-1].split('.')[0]\n obj_nums = 1\n images, _ = load_video(cfg[\"video_path\"], 480)\n print(\"stage1 load_video success\")\n # [195, 389, 238, 47, 244, 374, 175, 399]\n # .shape: (502, 480, 600, 3)\n report_save_dir = cfg.get(\"output_dir\",\n f\"./output/{cfg['model_name']}\")\n if not os.path.exists(report_save_dir):\n os.makedirs(report_save_dir)\n # Configuration used in the challenges\n max_nb_interactions = 8 # Maximum number of interactions\n # Interactive parameters\n model.eval()\n state_dicts_ = load(weights)['state_dict']\n state_dicts = {}\n for k, v in state_dicts_.items():\n if 'num_batches_tracked' not in k:\n state_dicts['head.' + k] = v", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:62-87" + }, + "499": { + "file_id": 54, + "content": "This code initializes the model with test mode enabled, builds it using a function, potentially makes it parallel, loads a video for data, prints \"stage1 load_video success\" message, creates a report save directory if it doesn't exist, sets the maximum number of interactions to 8, and evaluates the model.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/40.json b/docs/data/40.json new file mode 100644 index 000000000..9302761b7 --- /dev/null +++ b/docs/data/40.json @@ -0,0 +1,543 @@ +{ + "4000": { + "file_id": 338, + "content": "English | [简体中文](./readme.md)\n# Model service deployment\n## Introduction\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) aims to help deep learning developers easily deploy online prediction services, support one-click deployment of industrial-grade service capabilities, high concurrency between client and server Efficient communication and support for developing clients in multiple programming languages.\nThis section takes the HTTP prediction service deployment as an example to introduce how to use PaddleServing to deploy the model service in PaddleVideo. Currently, only Linux platform deployment is supported, and Windows platform is not currently supported.\n## Serving installation\nThe Serving official website recommends using docker to install and deploy the Serving environment. First, you need to pull the docker environment and create a Serving-based docker.\n```bash\n# start GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:1-17" + }, + "4001": { + "file_id": 338, + "content": "This code introduces the installation process for PaddleServing. It uses Docker to pull a GPU-based docker environment and creates a Serving-based Docker named \"test\". 
The port 9292 is mapped to access the serving environment, and this setup supports Linux platforms, with Windows currently unsupported.", + "type": "comment" + }, + "4002": { + "file_id": 338, + "content": "nvidia-docker exec -it test bash\n# start CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\nAfter entering docker, you need to install Serving-related python packages.\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\n#If it is a CPU deployment environment:\npython3.7 -m pip install paddle-serving-server==0.7.0 #CPU\npython3.7 -m pip install paddlepaddle==2.2.0 # CPU\n#If it is a GPU deployment environment\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#Other GPU environments need to confirm the environment and then choose which one to execute\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:18-41" + }, + "4003": { + "file_id": 338, + "content": "Code installs necessary packages for PaddlePaddle serving client, app, server (CPU/GPU) and PaddlePaddle (CPU/GPU) in a Docker container using pip. The GPU versions are specified with different CUDA and TensorRT versions.", + "type": "comment" + }, + "4004": { + "file_id": 338, + "content": "```\n* If the installation speed is too slow, you can change the source through `-i https://pypi.tuna.tsinghua.edu.cn/simple` to speed up the installation process.\n* For more environment and corresponding installation packages, see: https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## Action recognition service deployment\n### Model conversion\nWhen using PaddleServing for service deployment, you need to convert the saved inference model into a Serving model. The following uses the PP-TSM model as an example to introduce how to deploy the action recognition service.\n- Download PP-TSM inference model and convert to Serving model:\n ```bash\n # Enter PaddleVideo directory\n cd PaddleVideo\n # Download the inference model and extract it to ./inference\n mkdir ./inference\n pushd ./inference\n wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip\n unzip ppTSM.zip\n popd\n # Convert to Serving model\n pushd deploy/cpp_serving\n python3.7 -m paddle_serving_client.convert \\", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:42-65" + }, + "4005": { + "file_id": 338, + "content": "This code snippet provides instructions for speeding up the installation process and deploying an action recognition service using PaddleServing. 
It explains how to convert a saved inference model into a Serving model, using PP-TSM as an example.", + "type": "comment" + }, + "4006": { + "file_id": 338, + "content": " --dirname ../../inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./ppTSM_serving_server \\\n --serving_client ./ppTSM_serving_client\n popd\n ```\n | parameter | type | default value | description |\n | ----------------- | ---- | ------------------ | ------- -------------------------------------------------- --- |\n | `dirname` | str | - | The storage path of the model file to be converted. The program structure file and parameter file are saved in this directory. |\n | `model_filename` | str | None | The name of the file storing the model Inference Program structure that needs to be converted. If set to None, use `__model__` as the default filename |\n | `params_filename` | str | None | File name where all parameters of the model to be converted are stored. It needs to be specified if and only if all model parameters are stored in a single binary file. If the model parameters are stored in separate files, set it to None |\n | `serving_", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:66-79" + }, + "4007": { + "file_id": 338, + "content": "The command specifies the directory, model filename and parameters filename of the inference model to be converted. The `dirname` parameter holds the storage path of the model files to convert; if `model_filename` or `params_filename` is not given, the defaults (`__model__` and None, respectively) are used. The `serving_server` and `serving_client` arguments set the output directories where the converted server-side and client-side model and configuration files are written, so that the model can then be served for inference.", + "type": "comment" + }, + "4008": { + "file_id": 338, + "content": "server` | str | `\"serving_server\"` | The storage path of the converted model files and configuration files. Default is serving_server |\n | `serving_client` | str | `\"serving_client\"` | The converted client configuration file storage path. Default is serving_client |\n- After the inference model conversion is completed, two folders, `ppTSM_serving_client` and `ppTSM_serving_server` will be generated under the `deploy/cpp_serving` folder, with the following formats:\n ```bash\n PaddleVideo/deploy/cpp_serving\n ├── ppTSM_serving_client\n │ ├── serving_client_conf.prototxt\n │ └── serving_client_conf.stream.prototxt\n └── ppTSM_serving_server\n ├── ppTSM.pdiparams\n ├── ppTSM.pdmodel\n ├── serving_server_conf.prototxt\n └── serving_server_conf.stream.prototxt\n ```\n After getting the model file, you need to modify `serving_client_conf.prototxt` under `ppTSM_serving_client` and `serving_server_conf.prototxt` under `ppTSM_serving_server` respectively, and change `alias_name` under `fetch_var` in both files to `outputs`", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:79-94" + }, + "4009": { + "file_id": 338, + "content": "The code specifies two paths, \"serving_server\" and \"serving_client\", representing the storage locations for model files and configuration files. After model conversion, it generates two folders with associated file formats in the specified folder.
Upon obtaining the model files, modify two specific text files to change `alias_name` under `fetch_var`.", + "type": "comment" + }, + "4010": { + "file_id": 338, + "content": " **Remarks**: In order to be compatible with the deployment of different models, Serving provides the function of input and output renaming. In this way, when different models are inferred and deployed, they only need to modify the `alias_name` of the configuration file, and the inference deployment can be completed without modifying the code.\n The modified `serving_server_conf.prototxt` looks like this:\n ```yaml\n feed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n }\n fetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false\n fetch_type: 1\n shape: 400\n }\n ```\n### Service deployment and requests\nThe `cpp_serving` directory contains the code for starting the pipeline service, the C++ serving service and sending the prediction request, including:\n ```bash\n run_cpp_serving.sh # Start the script on the C++ serving server side\n pipeline_http_client.py # The script on the client side to send data and get the prediction results", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:96-122" + }, + "4011": { + "file_id": 338, + "content": "This code demonstrates a rename function for compatibility in model deployment. The modified `serving_server_conf.prototxt` shows how to alias the input and output names in the configuration file. This allows different models to be inferred and deployed without modifying the code, only by altering the `alias_name`. The `cpp_serving` directory contains scripts for starting the pipeline service, C++ serving service, and sending prediction requests.", + "type": "comment" + }, + "4012": { + "file_id": 338, + "content": " paddle_env_install.sh # Install C++ serving environment script\n preprocess_ops.py # file to store preprocessing functions\n ```\n#### C++ Serving\n- Go to the working directory:\n ```bash\n cd deploy/cpp_serving\n ```\n- Start the service:\n ```bash\n # Start in the background, the logs printed during the process will be redirected and saved to nohup.txt\n bash run_cpp_serving.sh\n ```\n- Send the request and get the result:\n```bash\npython3.7 serving_client.py \\\n-n PPTSM \\\n-c ./ppTSM_serving_client/serving_client_conf.prototxt \\\n--input_file=../../data/example.avi\n```\nAfter a successful run, the results of the model prediction will be printed in the cmd window, and the results are as follows:\n ```bash\n I0510 04:33:00.110025 37097 naming_service_thread.cpp:202] brpc::policy::ListNamingService(\"127.0.0.1:9993\"): added 1\n I0510 04:33:01.904764 37097 general_model.cpp:490] [client]logid=0,client_cost=1640.96ms,server_cost=1623.21ms.\n {'class_id': '[5]', 'prob': '[0.9907387495040894]'}\n ```\n**If an error is re", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:123-152" + }, + "4013": { + "file_id": 338, + "content": "This code provides instructions for setting up and running a C++ serving environment for PaddleVideo. It explains how to navigate to the deployment directory, start the service, send requests using serving_client.py, and obtain the model prediction results. 
If an error occurs during execution, it will display the corresponding log information.", + "type": "comment" + }, + "4014": { + "file_id": 338, + "content": "ported during the process and it shows that libnvinfer.so.6 cannot be found, you can execute the script `paddle_env_install.sh` to install the relevant environment**\n ```bash\n bash paddle_env_install.sh\n ```\n## FAQ\n**Q1**: No result is returned after the request is sent or an output decoding error is prompted\n**A1**: Do not set the proxy when starting the service and sending the request. You can close the proxy before starting the service and sending the request. The command to close the proxy is:\n```\nunset https_proxy\nunset http_proxy\n```", + "type": "code", + "location": "/deploy/cpp_serving/readme_en.md:152-165" + }, + "4015": { + "file_id": 338, + "content": "This code provides instructions for installing the necessary environment and resolving an issue where no result is returned or a decoding error occurs due to proxy settings. Users are advised not to set proxies when starting the service and sending requests, and should use the provided commands to close proxies beforehand. The script `paddle_env_install.sh` can be executed to install relevant environment requirements.", + "type": "comment" + }, + "4016": { + "file_id": 339, + "content": "/deploy/cpp_serving/run_cpp_serving.sh", + "type": "filepath" + }, + "4017": { + "file_id": 339, + "content": "This script runs PaddleVideo server using either PP-TSM or PP-TSN models on different ports. It uses the paddle_serving_server module and is executed as a background process with nohup command.", + "type": "summary" + }, + "4018": { + "file_id": 339, + "content": "## sample script\n# run paddlevideo server with PP-TSM:\nnohup python3.7 -m paddle_serving_server.serve \\\n--model ./ppTSM_serving_server \\\n--port 9993 &\n## run paddlevideo server with PP-TSN:\n# nohup python3.7 -m paddle_serving_server.serve \\\n# --model ./ppTSN_serving_server \\\n# --port 9993 &", + "type": "code", + "location": "/deploy/cpp_serving/run_cpp_serving.sh:1-10" + }, + "4019": { + "file_id": 339, + "content": "This script runs PaddleVideo server using either PP-TSM or PP-TSN models on different ports. It uses the paddle_serving_server module and is executed as a background process with nohup command.", + "type": "comment" + }, + "4020": { + "file_id": 340, + "content": "/deploy/cpp_serving/serving_client.py", + "type": "filepath" + }, + "4021": { + "file_id": 340, + "content": "This code uses Paddle Serving for postprocessing and PaddleVideo framework for video processing, initializing client, preprocessing input, sending data to server, receiving prediction, and printing output.", + "type": "summary" + }, + "4022": { + "file_id": 340, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nfrom typing import Any, Dict\nimport numpy as np\nfrom paddle_serving_client import Client\nfrom preprocess_ops import get_preprocess_func, np_softmax\ndef postprocess(fetch_map: Dict[str, np.ndarray]) -> Dict[str, Any]:\n \"\"\"postprocess\n Args:\n fetch_map (Dict[str, np.ndarray]): raw prediction\n Returns:\n Dict[str, Any]: postprocessed prediction\n \"\"\"\n score_list = fetch_map[\"outputs\"] # [b,num_classes]", + "type": "code", + "location": "/deploy/cpp_serving/serving_client.py:1-32" + }, + "4023": { + "file_id": 340, + "content": "This code snippet is importing necessary libraries and defining a function for postprocessing prediction outputs from a Paddle Serving client. The function takes raw predictions in the form of a numpy array and returns the postprocessed prediction as a dictionary containing any desired data.", + "type": "comment" + }, + "4024": { + "file_id": 340, + "content": " fetch_dict = {\"class_id\": [], \"prob\": []}\n for score in score_list:\n score = np_softmax(score, axis=0)\n score = score.tolist()\n max_score = max(score)\n fetch_dict[\"class_id\"].append(score.index(max_score))\n fetch_dict[\"prob\"].append(max_score)\n fetch_dict[\"class_id\"] = str(fetch_dict[\"class_id\"])\n fetch_dict[\"prob\"] = str(fetch_dict[\"prob\"])\n return fetch_dict\ndef parse_args():\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo CPP Serving model script\")\n parser.add_argument(\"-n\",\n \"--name\",\n type=str,\n default=\"PPTSM\",\n help=\"model's name, such as PPTSM, PPTSN...\")\n parser.add_argument(\n \"-c\",\n \"--config\",\n type=str,\n help=\"serving client config file(serving_client_conf.prototxt) path\")\n parser.add_argument(\"--url\",\n type=str,\n default=\"127.0.0.1:9993\",\n help=\"url to access cpp serving\")", + "type": "code", + "location": "/deploy/cpp_serving/serving_client.py:33-62" + }, + "4025": { + "file_id": 340, + "content": "The code defines a function that calculates the class id and probability based on scores, converts them to strings, and returns a dictionary with these values. 
It also includes a function for parsing arguments such as model name, serving client config file path, and URL to access the CPP serving.", + "type": "comment" + }, + "4026": { + "file_id": 340, + "content": " parser.add_argument(\"--logid\", type=int, default=\"10000\", help=\"log id\")\n parser.add_argument(\"--input_file\",\n type=str,\n default=\"../../data/example.avi\",\n help=\"input video file\")\n return parser.parse_args()\nif __name__ == \"__main__\":\n # parse args\n args = parse_args()\n url = args.url\n logid = args.logid\n input_file_path = args.input_file\n model_name = args.name\n # get preprocess by model name\n preprocess = get_preprocess_func(model_name)\n # initialize client object & connect\n client = Client()\n client.load_client_config(args.config)\n client.connect([url])\n # preprocess\n feed, fetch = preprocess(input_file_path)\n # send data & get prediction from server\n fetch_map = client.predict(feed=feed, fetch=fetch)\n # postprocess & output\n result = postprocess(fetch_map)\n print(result)", + "type": "code", + "location": "/deploy/cpp_serving/serving_client.py:63-95" + }, + "4027": { + "file_id": 340, + "content": "This code is a Python function that parses command line arguments, initializes a client object for video processing, preprocesses input video file, sends data to server, receives prediction, post-processes results and prints output. It uses the PaddleVideo framework with specific model configuration file and preprocessing function based on input name.", + "type": "comment" + }, + "4028": { + "file_id": 341, + "content": "/deploy/paddle2onnx/predict_onnx.py", + "type": "filepath" + }, + "4029": { + "file_id": 341, + "content": "The code imports modules, sets up environment, creates an ONNX predictor for video object detection, and performs inference on batches of input files while supporting benchmarking if enabled.", + "type": "summary" + }, + "4030": { + "file_id": 341, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport sys\nfrom os import path as osp\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../tools')))\nfrom utils import build_inference_helper, get_config\ndef parse_args():\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")", + "type": "code", + "location": "/deploy/paddle2onnx/predict_onnx.py:1-31" + }, + "4031": { + "file_id": 341, + "content": "This code imports necessary modules and defines a function for parsing command-line arguments. It sets up the environment to execute PaddleVideo Inference model scripts. 
The code also includes license information, ensuring compliance with the Apache License, Version 2.0.", + "type": "comment" + }, + "4032": { + "file_id": 341, + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument(\"-i\", \"--input_file\", type=str, help=\"input file path\")\n parser.add_argument(\"--onnx_file\", type=str, help=\"onnx model file path\")\n # params for onnx predict\n parser.add_argument(\"-b\", \"--batch_size\", type=int, default=1)\n parser.add_argument(\"--use_gpu\",\n type=str2bool,\n default=False,\n help=\"set to False when using onnx\")\n parser.add_argument(\"--precision\", type=str, default=\"fp32\")\n parser.add_argument(\"--ir_optim\", type=str2bool, default=True)\n parser.add_argument(\"--enable_benchmark\",\n type=str2bool,\n default=False,\n help=\"set to False when using onnx\")\n parser.add_argument(\"--cpu_threads\", type=int, default=4)\n return parser.parse_args()", + "type": "code", + "location": "/deploy/paddle2onnx/predict_onnx.py:32-54" + }, + "4033": { + "file_id": 341, + "content": "This code snippet is parsing command line arguments for config file, input file path, and ONNX model file path. It also includes parameters for ONNX prediction like batch size, use of GPU, precision, IR optimization, enable benchmark, and CPU threads.", + "type": "comment" + }, + "4034": { + "file_id": 341, + "content": "def create_onnx_predictor(args, cfg=None):\n import onnxruntime as ort\n onnx_file = args.onnx_file\n config = ort.SessionOptions()\n if args.use_gpu:\n raise ValueError(\n \"onnx inference now only supports cpu! please set `use_gpu` to False.\"\n )\n else:\n config.intra_op_num_threads = args.cpu_threads\n if args.ir_optim:\n config.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL\n predictor = ort.InferenceSession(onnx_file, sess_options=config)\n return config, predictor\ndef parse_file_paths(input_path: str) -> list:\n if osp.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))\n ]\n files = [osp.join(input_path, file) for file in files]\n return files\ndef main():\n \"\"\"predict using onnx model\n \"\"\"\n args = parse_args()\n cfg = get_config(args.config, show=False)", + "type": "code", + "location": "/deploy/paddle2onnx/predict_onnx.py:57-92" + }, + "4035": { + "file_id": 341, + "content": "The code defines a function to create an ONNX predictor by loading an ONNX file and setting configuration options. It also includes functions for parsing file paths and handling command-line arguments. This code is used for onnx model inference, specifically for video object detection tasks. 
The main function calls other utility functions to parse the input file path and load configuration settings before executing the actual prediction using the created ONNX predictor.", + "type": "comment" + }, + "4036": { + "file_id": 341, + "content": " model_name = cfg.model_name\n print(f\"Inference model({model_name})...\")\n InferenceHelper = build_inference_helper(cfg.INFERENCE)\n inference_config, predictor = create_onnx_predictor(args)\n # get input_tensor and output_tensor\n input_names = predictor.get_inputs()[0].name\n output_names = predictor.get_outputs()[0].name\n # get the absolute file path(s) to be processed\n files = parse_file_paths(args.input_file)\n if args.enable_benchmark:\n test_video_num = 12\n num_warmup = 3\n # instantiate auto log\n try:\n import auto_log\n except ImportError as e:\n print(f\"{e}, [git+https://github.com/LDOUBLEV/AutoLog] \"\n f\"package and it's dependencies is required for \"\n f\"python-inference when enable_benchmark=True.\")\n pid = os.getpid()\n autolog = auto_log.AutoLogger(\n model_name=cfg.model_name,\n model_precision=args.precision,\n batch_size=args.batch_size,\n data_shape=\"dynamic\",", + "type": "code", + "location": "/deploy/paddle2onnx/predict_onnx.py:94-122" + }, + "4037": { + "file_id": 341, + "content": "This code builds an inference helper, creates an ONNX predictor, gets input and output names, processes file paths, performs benchmarking, and initializes an auto log for the given model.", + "type": "comment" + }, + "4038": { + "file_id": 341, + "content": " save_path=\"./output/auto_log.lpg\",\n inference_config=inference_config,\n pids=pid,\n process_name=None,\n gpu_ids=None,\n time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],\n warmup=num_warmup)\n files = [args.input_file for _ in range(test_video_num + num_warmup)]\n # Inferencing process\n batch_num = args.batch_size\n for st_idx in range(0, len(files), batch_num):\n ed_idx = min(st_idx + batch_num, len(files))\n # auto log start\n if args.enable_benchmark:\n autolog.times.start()\n # Pre process batched input\n batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx])\n # get pre process time cost\n if args.enable_benchmark:\n autolog.times.stamp()\n # run inference\n batched_outputs = predictor.run(\n output_names=[output_names],\n input_feed={input_names: batched_inputs[0]})\n # get inference process time cost", + "type": "code", + "location": "/deploy/paddle2onnx/predict_onnx.py:123-153" + }, + "4039": { + "file_id": 341, + "content": "Code snippet performs video inference on batches of input files using a predictor. It preprocesses the batch inputs, runs inference for each batch, and records pre-processing and inference time costs if benchmarking is enabled.", + "type": "comment" + }, + "4040": { + "file_id": 341, + "content": " if args.enable_benchmark:\n autolog.times.stamp()\n InferenceHelper.postprocess(batched_outputs, not args.enable_benchmark)\n # get post process time cost\n if args.enable_benchmark:\n autolog.times.end(stamp=True)\n # time.sleep(0.01) # sleep for T4 GPU\n # report benchmark log if enabled\n if args.enable_benchmark:\n autolog.report()\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/deploy/paddle2onnx/predict_onnx.py:154-171" + }, + "4041": { + "file_id": 341, + "content": "The code segment is controlling the benchmark execution. 
If `args.enable_benchmark` is True, it stamps the current time using autolog, then calls postprocess function on batched outputs with `not args.enable_benchmark`. After that, it ends the timer using autolog and reports the benchmark log if `args.enable_benchmark` is still True.", + "type": "comment" + }, + "4042": { + "file_id": 342, + "content": "/deploy/paddle2onnx/readme.md", + "type": "filepath" + }, + "4043": { + "file_id": 342, + "content": "This code converts a PaddlePaddle model to ONNX for inference using paddle2onnx and ONNXRuntime. The ONNX format enables similar usage to Paddle, with results matching Paddle predictions.", + "type": "summary" + }, + "4044": { + "file_id": 342, + "content": "# paddle2onnx 模型转化与预测\n本章节介绍 PP-TSN 模型如何转化为 ONNX 模型,并基于 ONNX 引擎预测。\n## 1. 环境准备\n需要准备 Paddle2ONNX 模型转化环境,和 ONNX 模型预测环境。\nPaddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式,算子目前稳定支持导出 ONNX Opset 9~11,部分Paddle算子支持更低的ONNX Opset转换。\n更多细节可参考 [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md)\n- 安装 Paddle2ONNX\n```bash\npython3.7 -m pip install paddle2onnx\n```\n- 安装 ONNXRuntime\n```bash\n# 建议安装 1.9.0 版本,可根据环境更换版本号\npython3.7 -m pip install onnxruntime==1.9.0\n```\n## 2. 模型转换\n- PP-TSN inference模型下载\n ```bash\n # 下载inference模型到PaddleVideo/inference/ppTSN/ 目录下\n mkdir -p ./inference\n wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip\n # 解压inference模型\n pushd ./inference\n unzip ppTSN.zip\n popd\n ```\n- 模型转换\n 使用 Paddle2ONNX 将 Paddle inference模型转换为 ONNX 格式模型:\n ```bash\n paddle2onnx \\\n --model_dir=./inference/ppTSN \\\n --model_filename=ppTSN.pdmodel \\\n --params_filename=ppTSN.pdiparams \\\n --save_file=./inference/ppTSN/ppTSN.onnx \\\n --opset_version=10 \\", + "type": "code", + "location": "/deploy/paddle2onnx/readme.md:1-48" + }, + "4045": { + "file_id": 342, + "content": "This code demonstrates the process of converting a PaddlePaddle model to an ONNX model for inference using Paddle2ONNX and ONNXRuntime. It first installs the necessary packages, downloads the PP-TSN inference model, and then uses paddle2onnx to convert the model to the ONNX format while specifying the opset version.", + "type": "comment" + }, + "4046": { + "file_id": 342, + "content": " --enable_onnx_checker=True\n ```\n执行完毕后,可以发现 `./inference/ppTSN` 目录下生成了一个 ONNX 格式的模型文件 `ppTSN.onnx`\n## 3. onnx 预测\n接下来就可以用 ONNX 格式模型进行预测,其用法与paddle 预测模型类似\n执行如下命令:\n```bash\npython3.7 deploy/paddle2onnx/predict_onnx.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsn/pptsn_k400_videos.yaml \\\n--onnx_file=./inference/ppTSN/ppTSN.onnx\n```\n结果如下:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9998553991317749\n```\n可以验证该结果与Paddle inference的预测结果完全一致", + "type": "code", + "location": "/deploy/paddle2onnx/readme.md:49-70" + }, + "4047": { + "file_id": 342, + "content": "Enables ONNX checker to generate ONNX format model file for inference. Usage of ONNX model is similar to Paddle, and results match with Paddle inference predictions.", + "type": "comment" + }, + "4048": { + "file_id": 343, + "content": "/deploy/paddle2onnx/readme_en.md", + "type": "filepath" + }, + "4049": { + "file_id": 343, + "content": "This code utilizes Paddle2ONNX to convert PP-TSN model, and demonstrates prediction using ONNX engine. Environment setup involves installing necessary packages and downloading the inference model for conversion & prediction. 
The code generates output for video files with top-1 class and score.", + "type": "summary" + }, + "4050": { + "file_id": 343, + "content": "# paddle2onnx model conversion and prediction\nThis chapter describes how the PP-TSN model is transformed into an ONNX model and predicted based on the ONNX engine.\n## 1. Environment preparation\nNeed to prepare Paddle2ONNX model conversion environment, and ONNX model prediction environment.\nPaddle2ONNX supports converting the PaddlePaddle model format to the ONNX model format. The operator currently supports exporting ONNX Opset 9~11 stably, and some Paddle operators support lower ONNX Opset conversion.\nFor more details, please refer to [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md)\n- Install Paddle2ONNX\n```bash\npython3.7 -m pip install paddle2onnx\n```\n- Install ONNXRuntime\n```bash\n# It is recommended to install version 1.9.0, and the version number can be changed according to the environment\npython3.7 -m pip install onnxruntime==1.9.0\n```\n## 2. Model conversion\n- PP-TSN inference model download\n ```bash\n # Download the inference model to the PaddleVideo/inference/ppTSN/ directory", + "type": "code", + "location": "/deploy/paddle2onnx/readme_en.md:1-28" + }, + "4051": { + "file_id": 343, + "content": "This code describes how to convert a PaddlePaddle (PP-TSN) model into an ONNX model and predict using the ONNX engine. It requires environment preparation by installing Paddle2ONNX and ONNXRuntime. Afterward, PP-TSN inference model should be downloaded for conversion and prediction.", + "type": "comment" + }, + "4052": { + "file_id": 343, + "content": " mkdir -p ./inference\n wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip\n # Decompress the inference model\n pushd ./inference\n unzip ppTSN.zip\n popd\n ```\n- Model conversion\n Convert Paddle inference models to ONNX format models using Paddle2ONNX:\n ```bash\n paddle2onnx \\\n --model_dir=./inference/ppTSN \\\n --model_filename=ppTSN.pdmodel \\\n --params_filename=ppTSN.pdiparams \\\n --save_file=./inference/ppTSN/ppTSN.onnx \\\n --opset_version=10 \\\n --enable_onnx_checker=True\n ```\nAfter execution, you can find that a model file `ppTSN.onnx` in ONNX format is generated in the `./inference/ppTSN` directory\n## 3. onnx prediction\nNext, you can use the ONNX format model for prediction, which is similar to the paddle prediction model\nExecute the following command:\n```bash\npython3.7 deploy/paddle2onnx/predict_onnx.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsn/pptsn_k400_videos.yaml \\\n--onnx_file=./inference/ppTSN/ppTSN.onnx", + "type": "code", + "location": "/deploy/paddle2onnx/readme_en.md:29-61" + }, + "4053": { + "file_id": 343, + "content": "The provided code is for model conversion and prediction using Paddle2ONNX. First, it downloads an inference model from a URL, decompresses it, and then converts the Paddle inference model to ONNX format. 
Finally, it executes an example prediction using the converted ONNX model.", + "type": "comment" + }, + "4054": { + "file_id": 343, + "content": "```\nThe result is as follows:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9998553991317749\n```\nIt can be verified that the result is completely consistent with the prediction result of Paddle inference", + "type": "code", + "location": "/deploy/paddle2onnx/readme_en.md:62-70" + }, + "4055": { + "file_id": 343, + "content": "This code demonstrates how to generate an output for a video file using PaddleVideo. The top-1 class and score are displayed, which can be verified with the result of Paddle inference.", + "type": "comment" + }, + "4056": { + "file_id": 344, + "content": "/deploy/python_serving/pipeline_http_client.py", + "type": "filepath" + }, + "4057": { + "file_id": 344, + "content": "This Python script serves models in PaddleVideo framework, parses command-line arguments, and sends video data via HTTP requests using argparse, json, and requests libraries. It converts videos to numpy arrays, encodes as base64 strings, and sends to specific URL endpoints.", + "type": "summary" + }, + "4058": { + "file_id": 344, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport json\nimport requests\nfrom utils import numpy_to_base64, parse_file_paths, video_to_numpy\ndef parse_args():\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Web Serving model script\")\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/PP-TSM.yaml',\n help='serving config file path')", + "type": "code", + "location": "/deploy/python_serving/pipeline_http_client.py:1-30" + }, + "4059": { + "file_id": 344, + "content": "The code is a Python script that parses command-line arguments and defines functions for model serving in the PaddleVideo framework. 
It imports necessary libraries, including argparse for argument handling, json for data manipulation, requests for HTTP communication, and utils module for converting video to numpy format.", + "type": "comment" + }, + "4060": { + "file_id": 344, + "content": " parser.add_argument('-ptn',\n '--port_number',\n type=int,\n default=18080,\n help='http port number')\n parser.add_argument('-i',\n '--input_file',\n type=str,\n help='input file path or directory path')\n return parser.parse_args()\nif __name__ == \"__main__\":\n args = parse_args()\n url = f\"http://127.0.0.1:{args.port_number}/video/prediction\"\n files_list = parse_file_paths(args.input_file)\n for file_path in files_list:\n # decoding video and get stacked frames as ndarray\n decoded_frames = video_to_numpy(file_path=file_path)\n # encode ndarray to base64 string for transportation.\n decoded_frames_base64 = numpy_to_base64(decoded_frames)\n # generate dict & convert to json.\n data = {\n \"key\": [\"frames\", \"frames_shape\"],\n \"value\": [decoded_frames_base64,\n str(decoded_frames.shape)]", + "type": "code", + "location": "/deploy/python_serving/pipeline_http_client.py:31-62" + }, + "4061": { + "file_id": 344, + "content": "This code defines command-line arguments for port number and input file path or directory, parses the arguments, and uses them to send video data to a server via HTTP requests. It decodes videos into frames as numpy arrays, encodes them to base64 strings, generates dictionaries with keys \"key\" and \"value\", and sends the data to a specific URL endpoint.", + "type": "comment" + }, + "4062": { + "file_id": 344, + "content": " }\n data = json.dumps(data)\n # transport to server & get get results.\n r = requests.post(url=url, data=data, timeout=100)\n # print result\n print(r.json())", + "type": "code", + "location": "/deploy/python_serving/pipeline_http_client.py:63-70" + }, + "4063": { + "file_id": 344, + "content": "This code snippet sends a POST request to the specified URL with the data in JSON format. It uses Python's requests library to transport the data and waits for 100 seconds for a response. The response is then printed as JSON.", + "type": "comment" + }, + "4064": { + "file_id": 345, + "content": "/deploy/python_serving/pipeline_rpc_client.py", + "type": "filepath" + }, + "4065": { + "file_id": 345, + "content": "This code imports required modules, handles web serving for PaddleVideo models and includes a client to make predictions by passing encoded frames and shape to the predict method. It outputs labels and probabilities as results.", + "type": "summary" + }, + "4066": { + "file_id": 345, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\ntry:\n from paddle_serving_server_gpu.pipeline import PipelineClient\nexcept ImportError:\n from paddle_serving_server.pipeline import PipelineClient\nimport argparse\nfrom utils import numpy_to_base64, parse_file_paths, video_to_numpy\ndef parse_args():\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Web Serving model script\")\n parser.add_argument('-c',\n '--config',", + "type": "code", + "location": "/deploy/python_serving/pipeline_rpc_client.py:1-29" + }, + "4067": { + "file_id": 345, + "content": "This code is importing necessary modules, defining a function to parse command line arguments, and setting up a parser for those arguments. The main purpose of this file seems to be handling the execution of the PaddleVideo model in a web serving environment.", + "type": "comment" + }, + "4068": { + "file_id": 345, + "content": " type=str,\n default='configs/PP-TSM.yaml',\n help='serving config file path')\n parser.add_argument('-ptn',\n '--port_number',\n type=int,\n default=9993,\n help='rpc port number')\n parser.add_argument('-i',\n '--input_file',\n type=str,\n help='input file path or directory path')\n return parser.parse_args()\nif __name__ == \"__main__\":\n args = parse_args()\n client = PipelineClient()\n client.connect([f'127.0.0.1:{args.port_number}'])\n files_list = parse_file_paths(args.input_file)\n for file_path in files_list:\n # decoding video and get stacked frames as ndarray\n decoded_frames = video_to_numpy(file_path=file_path)\n # encode ndarray to base64 string for transportation.\n decoded_frames_base64 = numpy_to_base64(decoded_frames)\n # transport to server & get get results.", + "type": "code", + "location": "/deploy/python_serving/pipeline_rpc_client.py:30-60" + }, + "4069": { + "file_id": 345, + "content": "This code sets up command line arguments for the serving config file path, RPC port number, and input file/directory path. It then connects to a server at the specified port, processes the input files (decoding videos and converting ndarrays to base64 strings), and transports the data to the server for processing.", + "type": "comment" + }, + "4070": { + "file_id": 345, + "content": " ret = client.predict(feed_dict={\n \"frames\": decoded_frames_base64,\n \"frames_shape\": str(decoded_frames.shape)\n },\n fetch=[\"label\", \"prob\"])\n # print result\n print(ret)", + "type": "code", + "location": "/deploy/python_serving/pipeline_rpc_client.py:61-68" + }, + "4071": { + "file_id": 345, + "content": "This code snippet uses a PaddleVideo client to make a prediction. It passes base64 encoded frames and their shape to the client's predict method, fetching both labels and probabilities as results. 
The print statement outputs these results.", + "type": "comment" + }, + "4072": { + "file_id": 346, + "content": "/deploy/python_serving/readme.md", + "type": "filepath" + }, + "4073": { + "file_id": 346, + "content": "This code deploys a PaddlePaddle model for serving using PaddleServing in PaddleVideo, supporting GPU and CPU installations on Linux platforms. Input/output variables are set for Python serving, and the RPC method is used for prediction. Results are displayed in cmd window.", + "type": "summary" + }, + "4074": { + "file_id": 346, + "content": "简体中文 | [English](./readme_en.md)\n# 模型服务化部署\n## 简介\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) 旨在帮助深度学习开发者轻松部署在线预测服务,支持一键部署工业级的服务能力、客户端和服务端之间高并发和高效通信、并支持多种编程语言开发客户端。\n该部分以 HTTP 预测服务部署为例,介绍怎样在 PaddleVideo 中使用 PaddleServing 部署模型服务。目前只支持 Linux 平台部署,暂不支持 Windows 平台。\n## Serving 安装\nServing 官网推荐使用 docker 安装并部署 Serving 环境。首先需要拉取 docker 环境并创建基于 Serving 的 docker。\n```bash\n# 启动GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash\nnvidia-docker exec -it test bash\n# 启动CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\n进入 docker 后,需要安装 Serving 相关的 python 包。\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\npython3.7 -m pip install faiss-cpu==1.7.1post2\n#若为CPU部署环境:\npython3.7 -m pip install paddle-serving-server==0.7.0 # CPU", + "type": "code", + "location": "/deploy/python_serving/readme.md:1-32" + }, + "4075": { + "file_id": 346, + "content": "This code provides instructions on how to deploy a model service using PaddleServing in the PaddleVideo platform. It starts by explaining that this deployment example uses an HTTP prediction server and is currently only supported on Linux platforms. 
The instructions then cover how to install Serving, specifying steps for both GPU-accelerated docker installation and CPU-only docker installation, as well as installing the necessary Python packages.", + "type": "comment" + }, + "4076": { + "file_id": 346, + "content": "python3.7 -m pip install paddlepaddle==2.2.0 # CPU\n#若为GPU部署环境\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#其他GPU环境需要确认环境再选择执行哪一条\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8\n```\n* 如果安装速度太慢,可以通过 `-i https://pypi.tuna.tsinghua.edu.cn/simple` 更换源,加速安装过程\n* 更多环境和对应的安装包详见:https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## 行为识别服务部署\n### 模型转换\n使用 PaddleServing 做服务化部署时,需要将保存的 inference 模型转换为 Serving 模型。下面以 PP-TSM 模型为例,介绍如何部署行为识别服务。\n- 下载训练好的 PP-TSM 的模型,并转化为推理模型:\n ```bash\n # 进入PaddleVideo目录\n cd PaddleVideo\n wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams\n python3.7 tools/export_model.py \\\n -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\", + "type": "code", + "location": "/deploy/python_serving/readme.md:33-58" + }, + "4077": { + "file_id": 346, + "content": "Install PaddlePaddle for CPU and GPU environments.", + "type": "comment" + }, + "4078": { + "file_id": 346, + "content": " -p data/ppTSM_k400_uniform.pdparams \\\n -o inference/ppTSM\n ```\n- 我们也提供了转换好的推理模型,按以下命令下载并解压\n ```bash\n mkdir ./inference\n wget -nc -P ./inference https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n pushd ./inference\n unzip ppTSM.zip\n popd\n ```\n- 用 paddle_serving_client 把转换好的推理模型再转换成易于 Server 部署的模型格式:\n ```bash\n python3.7 -m paddle_serving_client.convert \\\n --dirname inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./deploy/python_serving/ppTSM_serving_server/ \\\n --serving_client ./deploy/python_serving/ppTSM_serving_client/\n ```\n | 参数 | 类型 | 默认值 | 描述 |\n | ----------------- | ---- | ------------------ | ------------------------------------------------------------ |\n | `dirname` | str | - | 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。 |\n | `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 |", + "type": "code", + "location": "/deploy/python_serving/readme.md:59-83" + }, + "4079": { + "file_id": 346, + "content": "This code is converting a pre-trained PaddlePaddle model to a format suitable for serving on the server. It downloads and unzips the pre-trained model, then uses paddle_serving_client to convert it into the correct format for deployment with specified directories for serving server and client. 
The `dirname` specifies where the pre-trained model files are stored, while `model_filename` names the Inference Program structure file, defaulting to \"__model__\" if not specified.", + "type": "comment" + }, + "4080": { + "file_id": 346, + "content": " | `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保>存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None |\n | `serving_server` | str | `\"serving_server\"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server |\n | `serving_client` | str | `\"serving_client\"` | 转换后的客户端配置文件存储路径。默认值为serving_client |\nPP-TSM 推理模型转换完成后,会在当前文件夹多出 `ppTSM_serving_server` 和 `ppTSM_serving_client` 的文件夹,具备如下格式:\n ```bash\n PaddleVideo/deploy/python_serving\n ├── ppTSM_serving_server\n ├── ppTSM.pdiparams\n ├── ppTSM.pdmodel\n ├── serving_server_conf.prototxt\n └── serving_server_conf.stream.prototxt\n ├── ppTSM_serving_client\n ├── serving_client_conf.prototxt\n └── serving_client_conf.stream.prototxt\n ```\n得到模型文件之后,需要分别修改 `ppTSM_serving_server` 和 `ppTSM_serving_client` 下的文件 `serving_server_conf.prototxt`,将 两份文件中`fetch_var` 下的 `alias_name` 均改为 `outputs`\n**备注**: Serving 为了兼容不同模型的部署,提供了输入输出重命名的功能。这样,不同的模型在推理部署时,只需要修改配置文件的`alias_name`即可,无需修改代码即可完成推理部署。\n修改后的`serving_server_conf.prototxt`如下所示:", + "type": "code", + "location": "/deploy/python_serving/readme.md:84-103" + }, + "4081": { + "file_id": 346, + "content": "The code provides parameters for the PP-TSM model transformation, including a parameter file name (params_filename), and paths to store the converted model files (serving_server) and client configuration files (serving_client). The resulting files will be organized in separate folders (ppTSM_serving_server and ppTSM_serving_client), with specific formats. The alias names 'outputs' must be set for both fetch_var in serving_server_conf.prototxt to ensure compatibility and easy deployment of different models without modifying the code.", + "type": "comment" + }, + "4082": { + "file_id": 346, + "content": "```yaml\nfeed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n}\nfetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false\n fetch_type: 1\n shape: 400\n}\n```\n### 服务部署和请求\n`python_serving` 目录包含了启动 pipeline 服务、C++ serving服务(TODO)和发送预测请求的代码,具体包括:\n```bash\n__init__.py\nconfigs/xxx.yaml # 启动pipeline服务的配置文件\npipeline_http_client.py # http方式发送pipeline预测请求的python脚本\npipeline_rpc_client.py # rpc方式发送pipeline预测请求的python脚本\nrecognition_web_service.py # 启动pipeline服务端的python脚本\nutils.py # 储存预测过程中常用的函数,如parse_file_paths, numpy_to_base64, video_to_numpy\n```\n#### Python Serving\n- 进入工作目录:\n```bash\ncd deploy/python_serving\n```\n- 启动服务:\n```bash\n# 在当前命令行窗口启动并保持在前端\npython3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml\n# 在后台启动,过程中打印输出的日志会重定向保存到log.txt中\npython3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml &>log.txt &\n```\n- 发送请求:\n```bash\n# 以http方式的发送预测请求并接受结果\npython3.7 pipeline_http_client.py -i ../../data/example.avi", + "type": "code", + "location": "/deploy/python_serving/readme.md:105-152" + }, + "4083": { + "file_id": 346, + "content": "The code represents the configuration for input (\"feed_var\") and output (\"fetch_var\") variables in the PaddleVideo deployment's Python serving. The input variable has a shape of 8,3,224,224 and the output variable has a shape of 400. 
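The 400-way output corresponds to the Kinetics-400 label list loaded by the web service; a minimal, synthetic sketch of how such a vector is reduced to the label/prob pair seen in the responses (mirroring the np_softmax plus argmax post-processing quoted later) is:

```python
# Synthetic illustration: reduce a 400-way logits vector (the "outputs" fetch_var)
# to a (label_index, probability) pair via softmax + argmax.
import numpy as np

logits = np.random.randn(400).astype(np.float32)   # stand-in for the model output
probs = np.exp(logits - logits.max())
probs /= probs.sum()
top1 = int(probs.argmax())
print(top1, float(probs[top1]))                    # index maps into Kinetics-400_label_list.txt
```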
This code is part of the setup process for sending prediction requests to the PaddleVideo pipeline service using either HTTP or RPC methods.", + "type": "comment" + }, + "4084": { + "file_id": 346, + "content": "# 以rpc方式的发送预测请求并接受结果\npython3.7 pipeline_rpc_client.py -i ../../data/example.avi\n```\n成功运行后,模型预测的结果会打印在 cmd 窗口中,结果如下:\n```bash\n# http方式打印的结果\n{'err_no': 0, 'err_msg': '', 'key': ['label', 'prob'], 'value': [\"['archery']\", '[0.9907388687133789]'], 'tensors': []}\n# rpc方式打印的结果\nPipelineClient::predict pack_data time:1645631086.764019\nPipelineClient::predict before time:1645631086.8485317\nkey: \"label\"\nkey: \"prob\"\nvalue: \"[\\'archery\\']\"\nvalue: \"[0.9907388687133789]\"\n```\n## FAQ\n**Q1**: 发送请求后没有结果返回或者提示输出解码报错\n**A1**: 启动服务和发送请求时不要设置代理,可以在启动服务前和发送请求前关闭代理,关闭代理的命令是:\n```\nunset https_proxy\nunset http_proxy\n```\n**Q2**: 服务端启动后没有反应,一直停在`start proxy service`不动\n**A2**: 很可能是启动过程中遇到了问题,可以在`./deploy/python_serving/PipelineServingLogs/pipeline.log`日志文件中查看详细报错信息\n更多的服务部署类型,如 `RPC 预测服务` 等,可以参考 Serving 的[github 官网](https://github.com/PaddlePaddle/Serving/tree/v0.7.0/examples)", + "type": "code", + "location": "/deploy/python_serving/readme.md:154-185" + }, + "4085": { + "file_id": 346, + "content": "This code demonstrates running the PaddleVideo model for prediction using the RPC (Remote Procedure Call) method. The command \"python3.7 pipeline_rpc_client.py -i ../../data/example.avi\" is used to execute the prediction, and the results are printed in the cmd window.", + "type": "comment" + }, + "4086": { + "file_id": 347, + "content": "/deploy/python_serving/readme_en.md", + "type": "filepath" + }, + "4087": { + "file_id": 347, + "content": "This code shows deploying PaddleServing for deep learning model prediction via HTTP using PP-TSM models and Docker on Linux. Issues with proxy, no response; check log file for errors at \"./deploy/python_serving/PipelineServingLogs/pipeline.log\". Refer to Serving's GitHub for more deployment types like RPC prediction service.", + "type": "summary" + }, + "4088": { + "file_id": 347, + "content": "English | [简体中文](./readme.md)\n# Model service deployment\n## Introduction\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) aims to help deep learning developers easily deploy online prediction services, support one-click deployment of industrial-grade service capabilities, high concurrency between client and server Efficient communication and support for developing clients in multiple programming languages.\nThis section takes the HTTP prediction service deployment as an example to introduce how to use PaddleServing to deploy the model service in PaddleVideo. Currently, only Linux platform deployment is supported, and Windows platform is not currently supported.\n## Serving installation\nThe Serving official website recommends using docker to install and deploy the Serving environment. First, you need to pull the docker environment and create a Serving-based docker.\n```bash\n# start GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:1-16" + }, + "4089": { + "file_id": 347, + "content": "This code provides an overview of deploying a model service using PaddleServing for deep learning predictions. 
It uses HTTP prediction service deployment as an example and suggests installing Serving through Docker on Linux platforms, while Windows is currently not supported.", + "type": "comment" + }, + "4090": { + "file_id": 347, + "content": "nvidia-docker exec -it test bash\n# start CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\nAfter entering docker, you need to install Serving-related python packages.\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\npython3.7 -m pip install faiss-cpu==1.7.1post2\n#If it is a CPU deployment environment:\npython3.7 -m pip install paddle-serving-server==0.7.0 #CPU\npython3.7 -m pip install paddlepaddle==2.2.0 # CPU\n#If it is a GPU deployment environment\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#Other GPU environments need to confirm the environment and then choose which one to execute\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:17-41" + }, + "4091": { + "file_id": 347, + "content": "Install PaddleServing server and client packages for CPU and GPU environments, depending on the deployment type.", + "type": "comment" + }, + "4092": { + "file_id": 347, + "content": "```\n* If the installation speed is too slow, you can change the source through `-i https://pypi.tuna.tsinghua.edu.cn/simple` to speed up the installation process\n* For more environment and corresponding installation packages, see: https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## Behavior recognition service deployment\n### Model conversion\nWhen using PaddleServing for service deployment, you need to convert the saved inference model into a Serving model. The following uses the PP-TSM model as an example to introduce how to deploy the behavior recognition service.\n- Download the trained PP-TSM model and convert it into an inference model:\n ```bash\n # Enter PaddleVideo directory\n cd PaddleVideo\n wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams\n python3.7 tools/export_model.py \\\n -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n -p data/ppTSM_k400_uniform.pdparams \\\n -o inference/ppTSM\n ```\n- We also provide the converted inference model, download and unzip by the following command", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:42-63" + }, + "4093": { + "file_id": 347, + "content": "This code snippet provides instructions on how to deploy a behavior recognition service using PaddleServing. It explains that the model must be converted into a Serving model and provides an example of PP-TSM model conversion process. 
The user is guided to enter the PaddleVideo directory, download the trained PP-TSM model, convert it into an inference model, and finally, provide an option to download a pre-converted inference model if desired.", + "type": "comment" + }, + "4094": { + "file_id": 347, + "content": " ```bash\n mkdir ./inference\n wget -nc -P ./inference https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n pushd ./inference\n unzip ppTSM.zip\n popd\n ```\n- Use paddle_serving_client to convert the converted inference model into a model format that is easy for server deployment:\n ```bash\n python3.7 -m paddle_serving_client.convert \\\n --dirname inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./deploy/python_serving/ppTSM_serving_server/ \\\n --serving_client ./deploy/python_serving/ppTSM_serving_client/\n ```\n | parameter | type | default value | description |\n | ----------------- | ---- | ------------------ | ------- -------------------------------------------------- --- |\n | `dirname` | str | - | The storage path of the model file to be converted. The program structure file and parameter file are saved in this directory. |\n | `model_filename` | str | None | The name of the file storing the model In", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:64-83" + }, + "4095": { + "file_id": 347, + "content": "This code downloads a pre-trained model and converts it into a format suitable for server deployment using paddle_serving_client. The converted model is saved in the specified directory with the corresponding program and parameter files.", + "type": "comment" + }, + "4096": { + "file_id": 347, + "content": "ference Program structure that needs to be converted. If set to None, use `__model__` as the default filename |\n | `params_filename` | str | None | File name where all parameters of the model to be converted are stored. It needs to be specified if and only if all model parameters are stored in a single binary file. If the model parameters are stored in separate files, set it to None |\n | `serving_server` | str | `\"serving_server\"` | The storage path of the converted model files and configuration files. Default is serving_server |\n | `serving_client` | str | `\"serving_client\"` | The converted client configuration file storage path. Default is serving_client |\nAfter the PP-TSM inference model is converted, there will be additional folders of `ppTSM_serving_server` and `ppTSM_serving_client` in the current folder, with the following formats:\n ```bash\n PaddleVideo/deploy/python_serving\n ├── ppTSM_serving_server\n ├── ppTSM.pdiparams\n ├── ppTSM.pdmodel\n ├── serving_server_conf.prototxt", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:83-94" + }, + "4097": { + "file_id": 347, + "content": "This code defines the required parameters for converting a PaddleVideo PP-TSM inference model. 
Upon successful conversion, it creates `ppTSM_serving_server` and `ppTSM_serving_client` folders with necessary files for the converted model's serving.", + "type": "comment" + }, + "4098": { + "file_id": 347, + "content": " └── serving_server_conf.stream.prototxt\n ├── ppTSM_serving_client\n ├── serving_client_conf.prototxt\n └── serving_client_conf.stream.prototxt\n ```\nAfter getting the model files, you need to modify the files `serving_server_conf.prototxt` under `ppTSM_serving_server` and `ppTSM_serving_client` respectively, and change `alias_name` under `fetch_var` in both files to `outputs`\n**Remarks**: In order to be compatible with the deployment of different models, Serving provides the function of input and output renaming. In this way, when different models are inferred and deployed, they only need to modify the `alias_name` of the configuration file, and the inference deployment can be completed without modifying the code.\nThe modified `serving_server_conf.prototxt` looks like this:\n```yaml\nfeed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n}\nfetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:95-119" + }, + "4099": { + "file_id": 347, + "content": "This code snippet is modifying the model configuration files `serving_server_conf.prototxt` and `serving_client_conf.stream.prototxt`. It changes the `alias_name` under `fetch_var` to \"outputs\" in both files for compatibility with different models during deployment. This allows the inference and deployment of various models without modifying the code, simply by updating the configuration file's alias names.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/41.json b/docs/data/41.json new file mode 100644 index 000000000..bdebd64b9 --- /dev/null +++ b/docs/data/41.json @@ -0,0 +1,543 @@ +{ + "4100": { + "file_id": 347, + "content": " fetch_type: 1\n shape: 400\n}\n```\n### Service deployment and requests\nThe `python_serving` directory contains the code for starting the pipeline service, C++ serving service (TODO) and sending prediction requests, including:\n```bash\n__init__.py\nconfigs/xxx.yaml # start the configuration file of the pipeline service\npipeline_http_client.py # python script for sending pipeline prediction request via http\npipeline_rpc_client.py # python script for sending pipeline prediction request in rpc mode\nrecognition_web_service.py # python script that starts the pipeline server\nutils.py # common functions used in inference, such as parse_file_paths, numpy_to_base64, video_to_numpy\n```\n#### Python Serving\n- Go to the working directory:\n```bash\ncd deploy/python_serving\n```\n- Start the service:\n```bash\n# Start in the current command line window and stay in front\npython3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml\n# Start in the background, the logs printed during the process will be redirected and saved to log.txt", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:120-145" + }, + "4101": { + "file_id": 347, + "content": "This code snippet is for starting the PaddleVideo pipeline service in Python using the recognition_web_service.py script. 
The `-n` flag specifies the name of the model, and the `-c` flag points to the configuration file for the pipeline service.", + "type": "comment" + }, + "4102": { + "file_id": 347, + "content": "python3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml &>log.txt &\n```\n- send request:\n```bash\n# Send a prediction request in http and receive the result\npython3.7 pipeline_http_client.py -i ../../data/example.avi\n# Send a prediction request in rpc and receive the result\npython3.7 pipeline_rpc_client.py -i ../../data/example.avi\n```\nAfter a successful run, the results of the model prediction will be printed in the cmd window, and the results are as follows:\n```bash\n# http method print result\n{'err_no': 0, 'err_msg': '', 'key': ['label', 'prob'], 'value': [\"['archery']\", '[0.9907388687133789]'], 'tensors ': []}\n# The result of printing in rpc mode\nPipelineClient::predict pack_data time:1645631086.764019\nPipelineClient::predict before time:1645631086.8485317\nkey: \"label\"\nkey: \"prob\"\nvalue: \"[\\'archery\\']\"\nvalue: \"[0.9907388687133789]\"\n```\n## FAQ\n**Q1**: No result is returned after the request is sent or an output decoding error is prompted\n**A1**: Do not set the proxy when starting the service an", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:146-175" + }, + "4103": { + "file_id": 347, + "content": "This code is running a web service for model prediction and two client scripts to send prediction requests via HTTP and RPC, printing the results in the command line. The result shows an example output with probabilities and labels for a given input video file. If no result is returned or there's an output decoding error, it might be related to the proxy setting when starting the service.", + "type": "comment" + }, + "4104": { + "file_id": 347, + "content": "d sending the request. You can close the proxy before starting the service and sending the request. The command to close the proxy is:\n```\nunset https_proxy\nunset http_proxy\n```\n**Q2**: There is no response after the server is started, and it has been stopped at `start proxy service`\n**A2**: It is likely that a problem was encountered during the startup process. You can view the detailed error message in the `./deploy/python_serving/PipelineServingLogs/pipeline.log` log file\nFor more service deployment types, such as `RPC prediction service`, you can refer to Serving's [github official website](https://github.com/PaddlePaddle/Serving/tree/v0.7.0/examples)", + "type": "code", + "location": "/deploy/python_serving/readme_en.md:175-185" + }, + "4105": { + "file_id": 347, + "content": "Closing the proxy before starting the service and sending request using \"unset https_proxy; unset http_proxy\". No response after server started, check log file for error message at \"./deploy/python_serving/PipelineServingLogs/pipeline.log\". For more deployment types like RPC prediction service, refer to Serving's GitHub official website.", + "type": "comment" + }, + "4106": { + "file_id": 348, + "content": "/deploy/python_serving/recognition_web_service.py", + "type": "filepath" + }, + "4107": { + "file_id": 348, + "content": "This code sets up PaddleVideo, imports libraries, and defines preprocessing pipelines for image recognition web services using PaddlePaddle. 
It includes a `VideoOp` class for video operations and a \"VideoService\" class for preprocessing and post-processing methods.", + "type": "summary" + }, + "4108": { + "file_id": 348, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport base64\nimport os\nimport sys\nfrom typing import Callable, Dict, List\nimport numpy as np\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\nfrom paddle_serving_app.reader import Sequential\nfrom paddlevideo.loader.pipelines import (CenterCrop, Image2Array,\n Normalization, Sampler, Scale,", + "type": "code", + "location": "/deploy/python_serving/recognition_web_service.py:1-28" + }, + "4109": { + "file_id": 348, + "content": "This code is importing necessary libraries and modules, setting up the path for the PaddleVideo project, and defining several image processing pipelines including CenterCrop, Image2Array, Normalization, Sampler, and Scale. The purpose of this code is to provide a base for building an image recognition web service using PaddlePaddle.", + "type": "comment" + }, + "4110": { + "file_id": 348, + "content": " TenCrop)\ntry:\n from paddle_serving_server_gpu.web_service import Op, WebService\nexcept ImportError:\n from paddle_serving_server.web_service import Op, WebService\nVALID_MODELS = [\"PPTSM\", \"PPTSN\"]\ndef get_preprocess_seq(model_name: str) -> List[Callable]:\n \"\"\"get preprocess sequence by model name\n Args:\n model_name (str): model name for web serving, such as 'PPTSM', 'PPTSN'\n Returns:\n List[Callable]: preprocess operators in list.\n \"\"\"\n if model_name == 'PPTSM':\n preprocess_seq = [\n Sampler(8, 1, valid_mode=True),\n Scale(256),\n CenterCrop(224),\n Image2Array(),\n Normalization([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n ]\n elif model_name == 'PPTSN':\n preprocess_seq = [\n Sampler(25, 1, valid_mode=True, select_left=True),\n Scale(256, fixed_ratio=True, do_round=True, backend='cv2'),\n TenCrop(224),\n Image2Array(),\n Normalization([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])", + "type": "code", + "location": "/deploy/python_serving/recognition_web_service.py:29-62" + }, + "4111": { + "file_id": 348, + "content": "This code defines a function called get_preprocess_seq that returns a list of preprocessing operators based on the model name passed as an argument. The model names accepted are \"PPTSM\" and \"PPTSN\". The function checks the model name, and depending on its value, it constructs and returns a sequence of preprocess operators including Sampler, Scale, CenterCrop, Image2Array, and Normalization. 
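As an illustration of the chaining pattern only (the stubs below are stand-ins, not the real PaddleVideo operators), each operator takes and returns a results dict, so the whole list can be applied in order, which is what Sequential does:

```python
# Stand-in sketch of the operator-chaining pattern (not the real PaddleVideo ops):
# every operator accepts a `results` dict and returns it, so a list of operators
# can be applied in sequence, as paddle_serving_app.reader.Sequential does.
import numpy as np

def scale_stub(results):
    results['short_side'] = 256            # placeholder for Scale(256)
    return results

def center_crop_stub(results):
    results['crop_size'] = 224             # placeholder for CenterCrop(224)
    return results

def apply_sequence(operators, results):
    for op in operators:
        results = op(results)
    return results

out = apply_sequence([scale_stub, center_crop_stub],
                     {'imgs': np.zeros((8, 256, 340, 3), dtype=np.uint8)})
print(out['short_side'], out['crop_size'])  # 256 224
```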
These operations prepare the input data for a specific model before feeding into the model for prediction or inference.", + "type": "comment" + }, + "4112": { + "file_id": 348, + "content": " ]\n else:\n raise ValueError(\n f\"model_name must in {VALID_MODELS}, but got {model_name}\")\n return preprocess_seq\ndef np_softmax(x: np.ndarray, axis=0) -> np.ndarray:\n \"\"\"softmax function\n Args:\n x (np.ndarray): logits.\n Returns:\n np.ndarray: probs.\n \"\"\"\n x -= np.max(x, axis=axis, keepdims=True)\n x = np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)\n return x\nclass VideoOp(Op):\n def init_op(self):\n \"\"\"init_op\n \"\"\"\n self.seq = Sequential(get_preprocess_seq(args.name))\n self.label_dict = {}\n with open(\"../../data/k400/Kinetics-400_label_list.txt\", \"r\") as fin:\n for line in fin:\n label_ind, label_name = line.strip().split(' ')\n label_ind = int(label_ind)\n self.label_dict[label_ind] = label_name.strip()\n def preprocess(self, input_dicts: Dict, data_id: int, log_id: int):\n \"\"\"preprocess\n Args:\n input_dicts (Dict): input_dicts.\n data_id (int): data_id.", + "type": "code", + "location": "/deploy/python_serving/recognition_web_service.py:63-102" + }, + "4113": { + "file_id": 348, + "content": "This code snippet defines a class `VideoOp` that initializes an object with a preprocessing sequence and a dictionary of labels. The `preprocess()` method takes input dictionaries, data ID, and log ID as arguments to perform some operation on video data. The `init_op()` method is responsible for setting up the preprocessing sequence and label dictionary.", + "type": "comment" + }, + "4114": { + "file_id": 348, + "content": " log_id (int): log_id.\n Returns:\n output_data: data for process stage.\n is_skip_process: skip process stage or not, False default\n prod_errcode: None default, otherwise, product errores occured.\n It is handled in the same way as exception.\n prod_errinfo: \"\" default.\n \"\"\"\n (_, input_dict), = input_dicts.items()\n for key in input_dict.keys():\n if key == \"frames\":\n frame_data = base64.b64decode(input_dict[key].encode('utf8'))\n frame_data = np.fromstring(frame_data, np.uint8)\n elif key == 'frames_shape':\n shape_data = eval(input_dict[key])\n else:\n raise ValueError(f\"unexpected key received: {key}\")\n frame_data = frame_data.reshape(shape_data)\n frame_len = frame_data.shape[0]\n frame_data = np.split(frame_data, frame_len, axis=0)\n frame_data = [frame.squeeze(0) for frame in frame_data]\n results = {", + "type": "code", + "location": "/deploy/python_serving/recognition_web_service.py:103-125" + }, + "4115": { + "file_id": 348, + "content": "This code function takes input_dicts, decodes and reshapes the 'frames' data into numpy array, splits it based on frame length, then squeezes the dimensions and stores the result in results dictionary. 
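A self-contained sketch of that decode path, using a zero-filled payload purely for illustration (np.frombuffer is used here as the non-deprecated equivalent of the np.fromstring call in the original), is:

```python
# Standalone sketch of the base64 -> ndarray -> per-frame list decode path.
import base64
import numpy as np

shape = (8, 224, 224, 3)                                        # example frames_shape
payload = base64.b64encode(np.zeros(shape, np.uint8)).decode('utf8')

frame_bytes = base64.b64decode(payload.encode('utf8'))
frame_data = np.frombuffer(frame_bytes, np.uint8).reshape(shape)
frames = [f.squeeze(0) for f in np.split(frame_data, frame_data.shape[0], axis=0)]
print(len(frames), frames[0].shape)                             # 8 (224, 224, 3)
```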
It also handles unexpected keys by raising ValueError.", + "type": "comment" + }, + "4116": { + "file_id": 348, + "content": " 'frames': frame_data,\n 'frames_len': frame_len,\n 'format': 'video',\n 'backend': 'cv2'\n }\n results = self.seq(results)\n tmp_inp = np.expand_dims(results['imgs'], axis=0) # [b,t,c,h,w]\n # The input for the network is input_data[0], so need to add 1 dimension at the beginning\n tmp_inp = np.expand_dims(tmp_inp, axis=0).copy() # [1,b,t,c,h,w]\n return {\"data_batch_0\": tmp_inp}, False, None, \"\"\n def postprocess(self, input_dicts: Dict, fetch_dict: Dict, data_id: int,\n log_id: int):\n \"\"\"postprocess\n Args:\n input_dicts (Dict): data returned in preprocess stage, dict(for single predict) or list(for batch predict).\n fetch_dict (Dict): data returned in process stage, dict(for single predict) or list(for batch predict).\n data_id (int): inner unique id, increase auto.\n log_id (int): logid, 0 default.\n Returns:\n fetch_dict: fetch result must be dict type.", + "type": "code", + "location": "/deploy/python_serving/recognition_web_service.py:126-149" + }, + "4117": { + "file_id": 348, + "content": "This code defines two methods: 'preprocess' and 'postprocess'. The 'preprocess' method takes input data in frames, sets the backend as cv2, expands dimensions for input to the network, and returns tmp_inp with a shape of [1,b,t,c,h,w]. The 'postprocess' method receives input_dicts from preprocess stage, fetch_dict from process stage, data_id, and log_id. It then returns the fetch result as a dictionary type.", + "type": "comment" + }, + "4118": { + "file_id": 348, + "content": " prod_errcode: None default, otherwise, product errores occured.\n It is handled in the same way as exception.\n prod_errinfo: \"\" default.\n \"\"\"\n score_list = fetch_dict[\"outputs\"]\n result = {\"label\": [], \"prob\": []}\n for score in score_list:\n score = np_softmax(score)\n score = score.tolist()\n max_score = max(score)\n max_index = score.index(max_score)\n result[\"label\"].append(self.label_dict[max_index])\n result[\"prob\"].append(max_score)\n result[\"label\"] = str(result[\"label\"])\n result[\"prob\"] = str(result[\"prob\"])\n return result, None, \"\"\nclass VideoService(WebService):\n def get_pipeline_response(self, read_op):\n \"\"\"get_pipeline_response\n Args:\n read_op ([type]): [description]\n Returns:\n [type]: [description]\n \"\"\"\n video_op = VideoOp(name=\"video\", input_ops=[read_op])\n return video_op\ndef parse_args():", + "type": "code", + "location": "/deploy/python_serving/recognition_web_service.py:150-182" + }, + "4119": { + "file_id": 348, + "content": "This code defines a class and a function. The class, \"VideoService\", extends the \"WebService\" class and has a method called \"get_pipeline_response\". The method takes an input operation (read_op) as its argument and returns a VideoOp object with the given read_op as its input. The function \"parse_args\" is used to parse command line arguments. 
It seems that this code is related to video processing and handling inputs/outputs in some kind of pipeline or web service.", + "type": "comment" + }, + "4120": { + "file_id": 348, + "content": " # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Web Serving model script\")\n parser.add_argument(\n '-n',\n '--name',\n type=str,\n default='PPTSM',\n help='model name used in web serving, such as PPTSM, PPTSN...')\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/PP-TSM.yaml',\n help='serving config file path')\n return parser.parse_args()\nif __name__ == '__main__':\n # get args such as serving config yaml path.\n args = parse_args()\n # start serving\n uci_service = VideoService(name=\"video\")\n uci_service.prepare_pipeline_config(yaml_file=args.config)\n uci_service.run_service()", + "type": "code", + "location": "/deploy/python_serving/recognition_web_service.py:183-208" + }, + "4121": { + "file_id": 348, + "content": "This code parses command-line arguments, initializes a PaddleVideo VideoService object with the provided configuration file and runs the service. The name of the model used in web serving is \"PPTSM\".", + "type": "comment" + }, + "4122": { + "file_id": 349, + "content": "/deploy/python_serving/utils.py", + "type": "filepath" + }, + "4123": { + "file_id": 349, + "content": "This code utilizes two functions: \"numpy_to_base64\" converts numpy arrays to base64 strings and \"video_to_numpy\" reads video frames with OpenCV, returning a stack of frames as a numpy array. The parse_file_paths function retrieves file paths or directories containing .avi/.mp4 files and joins them.", + "type": "summary" + }, + "4124": { + "file_id": 349, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport base64\nimport os\nimport os.path as osp\nimport cv2\nimport numpy as np\ndef numpy_to_base64(array: np.ndarray) -> str:\n \"\"\"numpy_to_base64\n Args:\n array (np.ndarray): input ndarray.\n Returns:\n bytes object: encoded str.\n \"\"\"\n return base64.b64encode(array).decode('utf8')\ndef video_to_numpy(file_path: str) -> np.ndarray:\n \"\"\"decode video with cv2 and return stacked frames\n as numpy.", + "type": "code", + "location": "/deploy/python_serving/utils.py:1-37" + }, + "4125": { + "file_id": 349, + "content": "This code contains two functions: \"numpy_to_base64\" and \"video_to_numpy\". The first function converts a numpy array to a base64 encoded string. 
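A round-trip sketch of that encoding is shown below; note that the dtype and shape are not part of the base64 string, which is why the serving scripts send frames_shape alongside it.

```python
# Round-trip sketch for the base64 encoding used by numpy_to_base64():
# encode the raw buffer, then restore it with the known dtype and shape.
import base64
import numpy as np

array = np.random.randint(0, 255, size=(2, 4, 4, 3), dtype=np.uint8)
encoded = base64.b64encode(array).decode('utf8')
restored = np.frombuffer(base64.b64decode(encoded), dtype=array.dtype).reshape(array.shape)
assert np.array_equal(array, restored)
```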
The second function decodes a video file using OpenCV (cv2) and returns a stack of frames as a numpy array.", + "type": "comment" + }, + "4126": { + "file_id": 349, + "content": " Args:\n file_path (str): video file path.\n Returns:\n np.ndarray: [T,H,W,C] in uint8.\n \"\"\"\n cap = cv2.VideoCapture(file_path)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n decoded_frames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty\n if ret is False:\n continue\n img = frame[:, :, ::-1]\n decoded_frames.append(img)\n decoded_frames = np.stack(decoded_frames, axis=0)\n return decoded_frames\ndef parse_file_paths(input_path: str) -> list:\n \"\"\"get data pathes from input_path\n Args:\n input_path (str): input file path or directory which contains input file(s).\n Returns:\n list: path(es) of input file(s)\n \"\"\"\n assert osp.exists(input_path), \\\n f\"{input_path} did not exists!\"\n if osp.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))", + "type": "code", + "location": "/deploy/python_serving/utils.py:39-78" + }, + "4127": { + "file_id": 349, + "content": "The code reads video frames from a file path and returns them as numpy array. The parse_file_paths function retrieves either the file path or directory containing .avi/.mp4 files.", + "type": "comment" + }, + "4128": { + "file_id": 349, + "content": " ]\n files = [osp.join(input_path, file) for file in files]\n return files", + "type": "code", + "location": "/deploy/python_serving/utils.py:79-81" + }, + "4129": { + "file_id": 349, + "content": "This code is joining the input_path with each file in the files list and returning the resulting list of file paths.", + "type": "comment" + }, + "4130": { + "file_id": 350, + "content": "/deploy/slim/quant_post_static.py", + "type": "filepath" + }, + "4131": { + "file_id": 350, + "content": "The code introduces a quantization function in PaddleVideo for GPU utilization and performs post-training quantization in static graph mode, writing the quantized model for execution on specified placement. It checks if executed directly, parses command-line arguments, and calls appropriate functions based on GPU usage flag.", + "type": "summary" + }, + "4132": { + "file_id": 350, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport os.path as osp\nimport sys\nimport numpy as np\nimport paddle\nfrom paddleslim.quant import quant_post_static\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\nfrom paddlevideo.loader.builder import build_dataloader, build_dataset\nfrom paddlevideo.utils import get_config, get_logger\ndef parse_args():\n def str2bool(v):", + "type": "code", + "location": "/deploy/slim/quant_post_static.py:1-32" + }, + "4133": { + "file_id": 350, + "content": "This code is likely part of a larger program and it begins by defining the licensing information, then imports necessary libraries for the function. It also includes the path to other related files and defines a function parse_args(). This suggests that the function will be used later to parse command line arguments or configuration file data.", + "type": "comment" + }, + "4134": { + "file_id": 350, + "content": " return v.lower() in (\"true\", \"t\", \"1\")\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")\n parser.add_argument(\n '-c',\n '--config',\n type=str,\n default=\n '../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml',\n help='quantization config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument(\"--use_gpu\",\n type=str2bool,\n default=True,\n help=\"whether use gpui during quantization\")\n return parser.parse_args()\ndef post_training_quantization(cfg, use_gpu: bool = True):\n \"\"\"Quantization entry\n Args:\n cfg (dict): quntization configuration.\n use_gpu (bool, optional): whether to use gpu during quantization. Defaults to True.\n \"\"\"\n logger = get_logger(\"paddlevideo\")", + "type": "code", + "location": "/deploy/slim/quant_post_static.py:33-63" + }, + "4135": { + "file_id": 350, + "content": "This code defines a function for post-training quantization in PaddleVideo. It includes an argument parser to specify the configuration file path and optionally override config options. 
The function also takes a boolean parameter for whether to use GPU during quantization, and logs messages using get_logger(\"paddlevideo\").", + "type": "comment" + }, + "4136": { + "file_id": 350, + "content": " place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()\n # get defined params\n batch_nums = cfg.DATASET.pop('batch_nums')\n batch_size = cfg.DATASET.get('batch_size', 1)\n num_workers = cfg.DATASET.get('num_workers', 0)\n inference_file_name = cfg.get('model_name', 'inference')\n inference_model_dir = cfg.get('inference_model_dir',\n f'./inference/{inference_file_name}')\n quant_output_dir = cfg.get('quant_output_dir',\n osp.join(inference_model_dir, 'quant_model'))\n # build dataloader for quantization, lite data is enough\n slim_dataset = build_dataset((cfg.DATASET.quant, cfg.PIPELINE.quant))\n slim_dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n places=place,\n drop_last=False,\n shuffle=False)\n slim_loader = build_dataloader(slim_dataset, **slim_dataloader_setting)", + "type": "code", + "location": "/deploy/slim/quant_post_static.py:65-84" + }, + "4137": { + "file_id": 350, + "content": "This code configures the placement (CPU or GPU) based on use_gpu flag, retrieves defined parameters from cfg, builds a dataloader for quantization with specified dataset and settings.", + "type": "comment" + }, + "4138": { + "file_id": 350, + "content": " logger.info(\"Build slim_loader finished\")\n def sample_generator(loader):\n def __reader__():\n for indx, data in enumerate(loader):\n # must return np.ndarray, not paddle.Tensor\n videos = np.array(data[0])\n yield videos\n return __reader__\n # execute quantization in static graph mode\n paddle.enable_static()\n exe = paddle.static.Executor(place)\n logger.info(\"Staring Post-Training Quantization...\")\n quant_post_static(executor=exe,\n model_dir=inference_model_dir,\n quantize_model_path=quant_output_dir,\n sample_generator=sample_generator(slim_loader),\n model_filename=f'{inference_file_name}.pdmodel',\n params_filename=f'{inference_file_name}.pdiparams',\n batch_size=batch_size,\n batch_nums=batch_nums,\n algo='KL')\n logger.info(\"Post-Training Quantization finished...\")", + "type": "code", + "location": "/deploy/slim/quant_post_static.py:86-114" + }, + "4139": { + "file_id": 350, + "content": "This code performs post-training quantization for a model, enabling static graph mode in PaddlePaddle and using the specified sample generator for data processing. It also utilizes a specific algorithm (KL) for quantization and writes the quantized model to disk. The execution is done with an executor on the given place.", + "type": "comment" + }, + "4140": { + "file_id": 350, + "content": "if __name__ == '__main__':\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n post_training_quantization(cfg, args.use_gpu)", + "type": "code", + "location": "/deploy/slim/quant_post_static.py:117-120" + }, + "4141": { + "file_id": 350, + "content": "The code checks if the script is being executed directly, then parses command-line arguments and gets a configuration file. 
It then calls a function for post-training quantization based on GPU usage flag.", + "type": "comment" + }, + "4142": { + "file_id": 351, + "content": "/deploy/slim/readme.md", + "type": "filepath" + }, + "4143": { + "file_id": 351, + "content": "This code introduces PaddleVideo's model compression using PaddleSlim, demonstrates PP-TSM quantized model prediction and pruning methods, providing recommendations for hyperparameters when using quantized training with pre-trained models.", + "type": "summary" + }, + "4144": { + "file_id": 351, + "content": "## Slim功能介绍\n复杂的模型有利于提高模型的性能,但也导致模型中存在一定冗余。此部分提供精简模型的功能,包括两部分:模型量化(量化训练、离线量化)、模型剪枝。\n其中模型量化将全精度缩减到定点数减少这种冗余,达到减少模型计算复杂度,提高模型推理性能的目的。\n模型量化可以在基本不损失模型的精度的情况下,将FP32精度的模型参数转换为Int8精度,减小模型参数大小并加速计算,使用量化后的模型在移动端等部署时更具备速度优势。\n模型剪枝将CNN中不重要的卷积核裁剪掉,减少模型参数量,从而降低模型计算复杂度。\n本教程将介绍如何使用飞桨模型压缩库PaddleSlim做PaddleVideo模型的压缩。\n[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) 集成了模型剪枝、量化(包括量化训练和离线量化)、蒸馏和神经网络搜索等多种业界常用且领先的模型压缩功能,如果您感兴趣,可以关注并了解。\n在开始本教程之前,建议先了解[PaddleVideo模型的训练方法](../../docs/zh-CN/usage.md)以及[PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/latest/index.html)\n## 快速开始\n当训练出一个模型后,如果希望进一步的压缩模型大小并加速预测,可使用量化或者剪枝的方法压缩模型。\n模型压缩主要包括五个步骤:\n1. 安装 PaddleSlim\n2. 准备训练好的模型\n3. 模型压缩\n4. 导出量化推理模型\n5. 量化模型预测部署\n### 1. 安装PaddleSlim\n* 可以通过pip install的方式进行安装。\n```bash\npython3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple\n```\n* 如果获取PaddleSlim的最新特性,可以从源码安装。\n```bash\ngit clone https://github.com/PaddlePaddle/PaddleSlim.git\ncd Paddleslim\npython3.7 setup.py install\n```\n### 2. 准备训练好的模型\nPaddleVideo提供了一系列训练好的[模型](../../docs/zh-CN/model_zoo/README.md),如果待量化的模型不在列表中,需要按照[常规训练](../../docs/zh-CN/usage.md)方法得到训练好的模型。", + "type": "code", + "location": "/deploy/slim/readme.md:2-44" + }, + "4145": { + "file_id": 351, + "content": "This code provides an introduction to the slim functionality of PaddleVideo's model compression using PaddleSlim. It explains the purpose and benefits of model quantization and pruning, and how to use PaddleSlim for PaddleVideo model compression.", + "type": "comment" + }, + "4146": { + "file_id": 351, + "content": "### 3. 模型压缩\n进入PaddleVideo根目录\n```bash\ncd PaddleVideo\n```\n离线量化代码位于`deploy/slim/quant_post_static.py`。\n#### 3.1 模型量化\n量化训练包括离线量化训练和在线量化训练(TODO),在线量化训练效果更好,需加载预训练模型,在定义好量化策略后即可对模型进行量化。\n##### 3.1.1 在线量化训练\nTODO\n##### 3.1.2 离线量化\n**注意**:目前离线量化,必须使用已经训练好的模型导出的`inference model`进行量化。一般模型导出`inference model`可参考[教程](../../docs/zh-CN/usage.md#5-模型推理).\n一般来说,离线量化损失模型精度较多。\n以PP-TSM模型为例,生成`inference model`后,离线量化运行方式如下\n```bash\n# 下载并解压出少量数据用于离线量化的校准\npushd ./data/k400\nwget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\ntar -xf k400_rawframes_small.tar\npopd\n# 然后进入deploy/slim目录下\ncd deploy/slim\n# 执行离线量化命令\npython3.7 quant_post_static.py \\\n-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml \\\n--use_gpu=True\n```\n除`use_gpu`外,所有的量化环境参数都在`pptsm_k400_frames_uniform_quantization.yaml`文件中进行配置\n其中`inference_model_dir`表示上一步导出的`inference model`目录路径,`quant_output_dir`表示量化模型的输出目录路径\n执行成功后,在`quant_output_dir`的目录下生成了`__model__`文件和`__params__`文件,这二者用于存储生成的离线量化模型\n类似`inference model`的使用方法,接下来可以直接用这两个文件进行预测部署,无需再重新导出模型。", + "type": "code", + "location": "/deploy/slim/readme.md:46-91" + }, + "4147": { + "file_id": 351, + "content": "This code snippet explains the process of offline quantization in PaddleVideo for model compression. It mentions that the code is located in `deploy/slim/quant_post_static.py`. 
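The `__model__`/`__params__` pair that this process eventually produces can also be loaded directly with the Paddle inference API; a rough sketch follows, where the file paths and input shape are assumptions based on the quant_output_dir layout described here.

```python
# Rough sketch: load the offline-quantized program with the Paddle inference API.
# Paths and the (1, 8, 3, 224, 224) input shape are assumptions for illustration.
import numpy as np
import paddle.inference as paddle_infer

config = paddle_infer.Config("inference/ppTSM/quant_model/__model__",
                             "inference/ppTSM/quant_model/__params__")
predictor = paddle_infer.create_predictor(config)

input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(np.zeros((1, 8, 3, 224, 224), dtype=np.float32))
predictor.run()

output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
print(output_handle.copy_to_cpu().shape)   # (1, 400) if the model keeps the 400-way head
```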
The snippet also details the steps involved in offline quantization, including using a pre-trained model and specifying the quantization strategy in a configuration file. The process generates an output directory with `__model__` and `__params__` files that can be used for deployment without re-exporting the model.", + "type": "comment" + }, + "4148": { + "file_id": 351, + "content": "```bash\n# 使用PP-TSM离线量化模型进行预测\n# 回到PaddleVideo目录下\ncd ../../\n# 使用量化模型进行预测\npython3.7 tools/predict.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n--model_file ./inference/ppTSM/quant_model/__model__ \\\n--params_file ./inference/ppTSM/quant_model/__params__ \\\n--use_gpu=True \\\n--use_tensorrt=False\n```\n输出如下:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9997928738594055\n```\n#### 3.2 模型剪枝\nTODO\n### 4. 导出模型\nTODO\n### 5. 模型部署\n上述步骤导出的模型可以通过PaddleLite的opt模型转换工具完成模型转换。\n模型部署的可参考\n[Serving Python部署](../python_serving/readme.md)\n[Serving C++部署](../cpp_serving/readme.md)\n## 训练超参数建议\n* 量化训练时,建议加载常规训练得到的预训练模型,加速量化训练收敛。\n* 量化训练时,建议初始学习率修改为常规训练的`1/20~1/10`,同时将训练epoch数修改为常规训练的`1/5~1/2`,学习率策略方面,加上Warmup,其他配置信息不建议修改。", + "type": "code", + "location": "/deploy/slim/readme.md:93-133" + }, + "4149": { + "file_id": 351, + "content": "This code snippet demonstrates the usage of PP-TSM quantized model for prediction in PaddleVideo. It directs the user to navigate into the PaddleVideo directory and then executes a python script with specific parameters such as input file, configuration file, model files, and flags for GPU and TensorRT utilization. The output shows the recognized top-1 class and score. The code also mentions additional information on how to prune models, export them, and deploy them in Python or C++ settings by referring to separate documentation sections. It provides recommendations for training hyperparameters when using quantized training with pre-trained models.", + "type": "comment" + }, + "4150": { + "file_id": 352, + "content": "/deploy/slim/readme_en.md", + "type": "filepath" + }, + "4151": { + "file_id": 352, + "content": "PaddleSlim is a library for model compression in PaddleVideo, offering quantization, pruning, distillation, and search for enhanced inference performance and reduced computational complexity. It can be installed via pip install and demonstrates PP-TSM offline quantization with deployment options in Python and C++ using PaddleLite's opt tool.", + "type": "summary" + }, + "4152": { + "file_id": 352, + "content": "## Slim function introduction\nA complex model is beneficial to improve the performance of the model, but it also leads to some redundancy in the model. This part provides the function of reducing the model, including two parts: model quantization (quantization training, offline quantization), model pruning.\nAmong them, model quantization reduces the full precision to fixed-point numbers to reduce this redundancy, so as to reduce the computational complexity of the model and improve the inference performance of the model.\nModel quantization can convert FP32-precision model parameters to Int8-precision without losing the accuracy of the model, reducing the size of model parameters and speeding up the calculation. 
Using the quantized model has a speed advantage when deploying on mobile terminals.\nModel pruning cuts out the unimportant convolution kernels in the CNN, reduces the amount of model parameters, and thus reduces the computational complexity of the model.\nThis tutorial will introduce how to use PaddleSlim, a paddle model compression library, to compress PaddleVideo models.", + "type": "code", + "location": "/deploy/slim/readme_en.md:1-9" + }, + "4153": { + "file_id": 352, + "content": "This code introduces PaddleSlim, a model compression library for compressing PaddleVideo models. It includes functions for model quantization (reducing full precision to fixed-point numbers) and model pruning (cutting unimportant convolution kernels). This improves inference performance and reduces computational complexity while preserving accuracy.", + "type": "comment" + }, + "4154": { + "file_id": 352, + "content": "[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) integrates model pruning, quantization (including quantization training and offline quantization), distillation and neural network search and other commonly used and leading model compression functions in the industry. If you are interested, you can follow and understand.\nBefore starting this tutorial, it is recommended to understand [PaddleVideo model training method](../../docs/zh-CN/usage.md) and [PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/ latest/index.html)\n## quick start\nAfter training a model, if you want to further compress the model size and speed up prediction, you can use quantization or pruning to compress the model.\nModel compression mainly includes five steps:\n1. Install PaddleSlim\n2. Prepare the trained model\n3. Model Compression\n4. Export the quantitative inference model\n5. Quantitative Model Prediction Deployment\n### 1. Install PaddleSlim\n* It can be installed by pip install.\n```bash\npython3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple", + "type": "code", + "location": "/deploy/slim/readme_en.md:10-30" + }, + "4155": { + "file_id": 352, + "content": "This code snippet provides a brief introduction to PaddleSlim, which offers model pruning, quantization, distillation, and neural network search for model compression. It highlights the quick start process, explaining that after training a model, quantization or pruning can be used to further compress it while speeding up predictions. The code then provides instructions on how to install PaddleSlim via pip install.", + "type": "comment" + }, + "4156": { + "file_id": 352, + "content": "```\n* If you get the latest features of PaddleSlim, you can install it from source.\n```bash\ngit clone https://github.com/PaddlePaddle/PaddleSlim.git\ncd Paddleslim\npython3.7 setup.py install\n```\n### 2. Prepare the trained model\nPaddleVideo provides a series of trained [models](../../docs/zh-CN/model_zoo/README.md). If the model to be quantized is not in the list, you need to follow the [regular training](../ ../docs/zh-CN/usage.md) method to get the trained model.\n### 3. Model Compression\nGo to PaddleVideo root directory\n```bash\ncd PaddleVideo\n```\nThe offline quantization code is located in `deploy/slim/quant_post_static.py`.\n#### 3.1 Model Quantization\nQuantization training includes offline quantization training and online quantization training (TODO). The effect of online quantization training is better. 
The pre-training model needs to be loaded, and the model can be quantized after the quantization strategy is defined.\n##### 3.1.1 Online quantitative training\nTODO\n##### 3.1.2 Offline Quantization\n**Note", + "type": "code", + "location": "/deploy/slim/readme_en.md:31-64" + }, + "4157": { + "file_id": 352, + "content": "In this code, it explains how to install the latest features of PaddleSlim, prepare a trained model for quantization (either using provided models or regular training), and perform model compression including offline quantization. The offline quantization process requires pre-training model loading and defining the quantization strategy.", + "type": "comment" + }, + "4158": { + "file_id": 352, + "content": "**: For offline quantization, you must use the `inference model` exported from the trained model for quantization. For general model export `inference model`, please refer to [Tutorial](../../docs/zh-CN/usage.md#5-Model Inference).\nGenerally speaking, the offline quantization loss model has more accuracy.\nTaking the PP-TSM model as an example, after generating the `inference model`, the offline quantization operation is as follows\n```bash\n# download a small amount of data for calibration\npushd ./data/k400\nwget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\ntar -xf k400_rawframes_small.tar\npopd\n# then switch to deploy/slim\ncd deploy/slim\n# execute quantization script\npython3.7 quant_post_static.py \\\n-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml \\\n--use_gpu=True\n```\nAll quantization environment parameters except `use_gpu` are configured in `pptsm_k400_frames_uniform_quantization.yaml` file\nWhere `inference_model_dir` represents the directory path of the ", + "type": "code", + "location": "/deploy/slim/readme_en.md:64-87" + }, + "4159": { + "file_id": 352, + "content": "The code explains the process of offline quantization for a trained model using the PaddleVideo framework. The user must first export an inference model from the trained model and download calibration data before executing the quantization script with specific parameters. 
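Those settings live in the YAML file itself; a small sketch of inspecting them is shown below (the key names follow the ones quant_post_static.py reads, and the relative path assumes the command is run from deploy/slim as in the readme).

```python
# Peek at the offline-quantization settings used by quant_post_static.py.
# Key names (inference_model_dir, quant_output_dir) follow the script; the path
# is an assumption based on running from deploy/slim.
import yaml

with open('../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml') as f:
    cfg = yaml.safe_load(f)

print(cfg.get('inference_model_dir'), cfg.get('quant_output_dir'))
```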
The configuration file, `pptsm_k400_frames_uniform_quantization.yaml`, contains all quantization environment parameters except for `use_gpu`.", + "type": "comment" + }, + "4160": { + "file_id": 352, + "content": "`inference model` exported in the previous step, and `quant_output_dir` represents the output directory path of the quantization model\nAfter successful execution, the `__model__` file and the `__params__` file are generated in the `quant_output_dir` directory, which are used to store the generated offline quantization model\nSimilar to the usage of `inference model`, you can directly use these two files for prediction deployment without re-exporting the model.\n```bash\n# Use PP-TSM offline quantization model for prediction\n# Go back to the PaddleVideo directory\ncd ../../\n# Use the quantized model to make predictions\npython3.7 tools/predict.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n--model_file ./inference/ppTSM/quant_model/__model__ \\\n--params_file ./inference/ppTSM/quant_model/__params__ \\\n--use_gpu=True \\\n--use_tensorrt=False\n```\nThe output is as follows:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9997928738594055", + "type": "code", + "location": "/deploy/slim/readme_en.md:87-111" + }, + "4161": { + "file_id": 352, + "content": "This code demonstrates how to use the PP-TSM offline quantization model for prediction. After exporting the inference model, the __model__ and __params__ files are generated in the specified output directory (quant_output_dir). These files can be used directly for prediction deployment without re-exporting the model. The provided example uses Python's tools/predict.py script to make predictions on a video file (data/example.avi), using the specified configuration (configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml). The results include the top-1 class and score.", + "type": "comment" + }, + "4162": { + "file_id": 352, + "content": "```\n#### 3.2 Model pruning\nTODO\n### 4. Export the model\nTODO\n### 5. Model Deployment\nThe model exported in the above steps can be converted through the opt model conversion tool of PaddleLite.\nReference for model deployment\n[Serving Python Deployment](../python_serving/readme.md)\n[Serving C++ Deployment](../cpp_serving/readme.md)\n## Training hyperparameter suggestions\n* During quantitative training, it is recommended to load the pre-trained model obtained from regular training to accelerate the convergence of quantitative training.\n* During quantitative training, it is recommended to modify the initial learning rate to `1/20~1/10` of conventional training, and modify the number of training epochs to `1/5~1/2` of conventional training. In terms of learning rate strategy, add On Warmup, other configuration information is not recommended to be modified.", + "type": "code", + "location": "/deploy/slim/readme_en.md:112-132" + }, + "4163": { + "file_id": 352, + "content": "This code provides an overview of model pruning, exporting the model, and deployment. It mentions using PaddleLite's opt model conversion tool for deployment and refers to two serving deployments: Python and C++. 
For quantitative training, it suggests loading pre-trained models, adjusting learning rates, and modifying the number of epochs while maintaining other configuration settings unchanged.", + "type": "comment" + }, + "4164": { + "file_id": 353, + "content": "/english_documents/benchmark.md", + "type": "filepath" + }, + "4165": { + "file_id": 353, + "content": "This code compares PaddleVideo's speed with popular frameworks, highlighting Slowfast's 2x faster speed and evaluates action segmentation model performance on Breakfast dataset. Tested on V100 GPU with batch size 2.", + "type": "summary" + }, + "4166": { + "file_id": 353, + "content": "[简体中文](../zh-CN/benchmark.md) | English\n# Benchmark\nWe compare our results with some popular frameworks and official releases in terms of speed.\n## Environment\n### Hardware\n- 8 NVIDIA Tesla V100 (16G) GPUs\n- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz\n### Software\n- Python 3.7\n- PaddlePaddle2.0\n- CUDA 10.1\n- CUDNN 7.6.3\n- NCCL 2.1.15\n- GCC 8.2.0\n## Experiments and Statistics\nThe statistic is the average training time, including data processing and model training time, and the training speed is measured with ips(instance per second). Note that we skip the first 50 iters as they may contain the device warmup time.\nHere we compare PaddleVideo with the other video understanding toolkits in the same data and model settings.\nTo ensure the fairness of the comparison, the comparison experiments were conducted under the same hardware environment and using the same dataset. The dataset we used is generated by the [data preparation](dataset/k400.md), and in each model setting, the same data preprocessing methods are applied to make sure the same feature input.", + "type": "code", + "location": "/english_documents/benchmark.md:1-27" + }, + "4167": { + "file_id": 353, + "content": "This code provides a benchmark comparison of PaddleVideo with other popular frameworks and official releases in terms of speed. It specifies the environment, hardware, and software used for the experiments. The statistics include average training time and training speed measured in instances per second (ips). 
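For reference, the ips figure used in this benchmark is simply the number of samples processed divided by wall-clock time, with the first 50 iterations excluded because they may contain device warm-up. The sketch below only illustrates that bookkeeping; `run_one_iter` and `batch_size` are placeholders, not PaddleVideo APIs.

```python
import time

def measure_ips(run_one_iter, batch_size, num_iters=500, warmup_iters=50):
    """Average instances-per-second, skipping the first `warmup_iters` iterations
    (they typically include device warm-up and are excluded from the statistic)."""
    samples, elapsed = 0, 0.0
    for i in range(num_iters):
        start = time.perf_counter()
        run_one_iter()                 # placeholder for one data-loading + training step
        step_time = time.perf_counter() - start
        if i >= warmup_iters:          # only count post-warm-up iterations
            samples += batch_size
            elapsed += step_time
    return samples / elapsed if elapsed > 0 else 0.0

# Example with a dummy step function:
print(f"{measure_ips(lambda: time.sleep(0.001), batch_size=16, num_iters=100):.1f} ips")
```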
The dataset is prepared according to a specific method to ensure fairness in the comparison.", + "type": "comment" + }, + "4168": { + "file_id": 353, + "content": "Significant improvement can be observed when comparing with other video understanding framework as shown in the table below, Especially the [Slowfast](../../configs/recognition/slowfast/slowfast.yaml) model is nearly 2x faster than the counterparts.\n## Results\n### Recognizers\n| Model | batch size x gpus | PaddleVideo(ips) | Reference(ips) | MMAction2 (ips) | PySlowFast (ips)|\n| :------: | :-------------------:|:---------------:|:---------------: | :---------------: |:---------------: |\n| [TSM](../../configs/recognition/tsm/tsm.yaml) | 16x8 | 58.1 | 46.04(temporal-shift-module) | To do | X |\n| [PPTSM](../../configs/recognition/tsm/pptsm.yaml) | 16x8 | 57.6 | X | X | X |\n| [TSN](../../configs/recognition/tsn/tsn.yaml) | 16x8 | 841.1 | To do (tsn-pytorch) | To do | X |\n| [Slowfast](../../configs/recognition/slowfast/slowfast.yaml)| 16x8 | 99.5 | X | To do | 43.2 |\n| [Attention_LSTM](../../configs/recognition/attention_lstm/attention_lstm.yaml) | 128x8 | 112.6 | X | X | X |\n### Localizers", + "type": "code", + "location": "/english_documents/benchmark.md:29-45" + }, + "4169": { + "file_id": 353, + "content": "This table compares the inference performance (ips) of various video understanding models using PaddleVideo. It shows the batch size, number of GPUs used, and ips for each model. Slowfast model stands out for its 2x faster speed compared to counterparts. TSM and TSN have higher ips than others, but the reference implementation is not available.", + "type": "comment" + }, + "4170": { + "file_id": 353, + "content": "| Model | PaddleVideo(ips) |MMAction2 (ips) |BMN(boundary matching network) (ips)|\n| :--- | :---------------: | :-------------------------------------: | :-------------------------------------: |\n| [BMN](../../configs/localization/bmn.yaml) | 43.84 | x | x |\n### Segmenters\nThis repo provides performance and accuracy comparison between classical and popular sequential action segmentation models\n| Model | Metrics | Value | Flops(M) |Params(M) | test time(ms) bs=1 | test time(ms) bs=2 | inference time(ms) bs=1 | inference time(ms) bs=2 |\n| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |\n| MS-TCN | F1@0.5 | 38.8% | 791.360 | 0.8 | 170 | - | 10.68 | - |\n| ASRF | F1@0.5 | 55.7% | 1,283.328 | 1.3 | 190 | - | 16.34 | - |\n* Model: model name, for example: PP-TSM\n* Metrics: Fill in the indicators used in the model test, and the data set used is **breakfast**\n* Value: Fill in the value corresponding to the metrics index, and generally keep two decimal places\n* Flops(M): The floating-", + "type": "code", + "location": "/english_documents/benchmark.md:47-64" + }, + "4171": { + "file_id": 353, + "content": "This code provides a comparison of performance and accuracy between classical and popular sequential action segmentation models, with metrics such as F1@0.5, model names, Flops(M), Params(M), and test/inference times for different batch sizes. 
It is part of a repository that aims to compare these models using the Breakfast dataset.", + "type": "comment" + }, + "4172": { + "file_id": 353, + "content": "point computation required for one forward operation of the model can be called `paddlevideo/tools/summary.py`script calculation (different models may need to be modified slightly), keep one decimal place, and measure it with data **input tensor with shape of (1, 2048, 1000)**\n* Params(M): The model parameter quantity, together with flops, will be calculated by the script, and one decimal place will be reserved\n* test time(ms) bs=1: When the python script starts the batchsize = 1 test, the time required for a sample is kept to two decimal places. The data set used in the test is **breakfast**.\n* test time(ms) bs=2: When the python script starts the batchsize = 2 test, the time required for a sample is kept to two decimal places. The sequential action segmentation model is generally a full convolution network, so the batch of training, testing and reasoning_ Size is 1. The data set used in the test is **breakfast**.\n* inference time(ms) bs=1: When the reasoning model is tested with GPU (def", + "type": "code", + "location": "/english_documents/benchmark.md:64-68" + }, + "4173": { + "file_id": 353, + "content": "This code is describing the performance measurements for a PaddleVideo model. It calculates the model parameters (M), test time, and inference time with specific batch sizes and input tensor shapes. The test data used is \"breakfast\".", + "type": "comment" + }, + "4174": { + "file_id": 353, + "content": "ault V100) with batchsize = 1, the time required for a sample is reserved to two decimal places. The dataset used for reasoning is **breakfast**.\n* inference time(ms) bs=2: When the reasoning model is tested with GPU (default V100) with batchsize = 1, the time required for a sample is reserved to two decimal places. The sequential action segmentation model is generally a full convolution network, so the batch of training, testing and reasoning_ Size is 1. The dataset used for reasoning is **breakfast**.", + "type": "code", + "location": "/english_documents/benchmark.md:68-69" + }, + "4175": { + "file_id": 353, + "content": "The code states that the reasoning model is tested on a GPU (default V100) with batch size 2. The time required for a sample is reserved to two decimal places, and the dataset used for this particular reasoning process is \"breakfast\". Additionally, it mentions that the sequential action segmentation model is generally a full convolution network, which typically has a batch size of 1 during training, testing, and reasoning.", + "type": "comment" + }, + "4176": { + "file_id": 354, + "content": "/english_documents/dataset/AVA.md", + "type": "filepath" + }, + "4177": { + "file_id": 354, + "content": "The AVA dataset preparation includes downloading and cutting videos, extracting frames, and organizing into PaddleVideo's rawframes, videos, and annotation folders. 
The code provides a function to count video frames for processing and analysis purposes.", + "type": "summary" + }, + "4178": { + "file_id": 354, + "content": "[简体中文](../../zh-CN/dataset/k400.md) | English\n# AVA Data Preparation\nThis document mainly introduces the preparation process of AVA dataset.\nIt mainly includes five parts: Video Data Download, Prepare Annotations, Cut video files,\nExtract the RGB frames, Pulling Proposal Files,et al.\nBefore we start, please make sure that the directory is located at `$PaddleVideo/data/ava/script`.\n---\n## 1. Video data Download\nFor basic dataset information, you can refer to the official website [AVA](https://research.google.com/ava/index.html).\nFor the dataset download, you can refer to the [AVA Download](https://github.com/cvdfoundation/ava-dataset) ,\nwhich introduce the way to download the dataset. We also provide the shell script for downloading the video files\n```shell\nbash download_videos.sh\n```\nFurthermore,considering the difficulty in downloading,\nwe upload the video files to Baidu cloud disk in the form of zip packages, and users can download it by themselves according to their needs.\n[Link]() coming soon.", + "type": "code", + "location": "/english_documents/dataset/AVA.md:1-23" + }, + "4179": { + "file_id": 354, + "content": "This document introduces the AVA dataset preparation process, including video data download, annotation preparation, cutting video files, extracting RGB frames, and pulling proposal files. Ensure the directory is at `$PaddleVideo/data/ava/script`. Users can download the dataset from its official site or through a provided script. Video files are also available for download via Baidu cloud disk.", + "type": "comment" + }, + "4180": { + "file_id": 354, + "content": "**Note: the video files should be placed in `data/ava/videos`**\n---\n## 2.Prepare Annotations\nNext, you can run the following script to prepare annotations.\n```shell\nbash download_annotations.sh\n```\nThis command will download `ava_v2.1.zip` for AVA `v2.1` annotation. If you need the AVA `v2.2` annotation, you can try the following script.\n```shell\nVERSION=2.2 bash download_annotations.sh\n```\n**Note: In fact,we will also provide the annotation zip files in Baidu cloud disk**\n---\n## 3. cut video files\nCut each video from its 15th to 30th minute and make them at 30 fps.\n```shell\nbash cut_videos.sh\n```\n---\n## 4. Extract RGB Frames\nyou can use the ffmpeg to extract RGB frames by the following script.\n```shell\nbash extract_rgb_frames.sh\n```\n---\n## 5.Pulling Proposal Files\nThe scripts are adapted from FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks).\nRun the follow scripts to fetch pre-computed proposal list.\n```shell\nbash fetch_ava_proposals.sh\n```\n---\n## 6.Folder Structure\nAfter the whole data pipeline for AVA preparation.", + "type": "code", + "location": "/english_documents/dataset/AVA.md:26-78" + }, + "4181": { + "file_id": 354, + "content": "This code outlines the steps to prepare a dataset for AVA, a video action recognition task. It involves downloading and extracting annotations, cutting videos to specific time ranges, extracting RGB frames with ffmpeg, and fetching pre-computed proposal lists. 
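Since a frame-counting helper is mentioned later in this file, here is one way such a check could look after the cutting and frame-extraction steps. It is only a sketch and assumes OpenCV (`opencv-python`) is installed, which the AVA scripts themselves do not require; the sample path is taken from the folder layout shown below.

```python
import cv2  # assumption: opencv-python is available; not required by the AVA scripts

def count_frames(video_path: str) -> int:
    """Return the number of frames reported for a video file (0 if it cannot be opened)."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return 0
    n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    return n

# A 15-minute clip re-encoded at 30 fps should have roughly 15 * 60 * 30 = 27000 frames.
n = count_frames("data/ava/videos_15min/053oq2xB3oU.mkv")
print(n, "frames; expected about", 15 * 60 * 30)
```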
The final step shows the expected folder structure of the prepared dataset.", + "type": "comment" + }, + "4182": { + "file_id": 354, + "content": "you can get the rawframes (RGB), videos and annotation files for AVA.\nIn the context of the whole project (for AVA only), the folder structure will look like:\n```\nPaddleVideo\n├── configs\n├── paddlevideo\n├── docs\n├── tools\n├── data\n│ ├── ava\n│ │ ├── annotations\n│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl\n│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl\n│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl\n│ │ | ├── ava_train_v2.1.csv\n│ │ | ├── ava_val_v2.1.csv\n│ │ | ├── ava_train_excluded_timestamps_v2.1.csv\n│ │ | ├── ava_val_excluded_timestamps_v2.1.csv\n│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt\n│ │ ├── videos\n│ │ │ ├── 053oq2xB3oU.mkv\n│ │ │ ├── 0f39OWEqJ24.mp4\n│ │ │ ├── ...\n│ │ ├── videos_15min\n│ │ │ ├── 053oq2xB3oU.mkv\n│ │ │ ├── 0f39OWEqJ24.mp4\n│ │ │ ├── ...\n│ │ ├── rawframes\n│ │ │ ├── 053oq2xB3oU\n| │ │ │ ├── img_00001.jpg\n| │ │ │ ├── img_00002.jpg\n| │ │ │ ├── ...", + "type": "code", + "location": "/english_documents/dataset/AVA.md:79-112" + }, + "4183": { + "file_id": 354, + "content": "The code represents the folder structure for AVA dataset in PaddleVideo, including rawframes (RGB), videos, and annotation files.", + "type": "comment" + }, + "4184": { + "file_id": 354, + "content": "```", + "type": "code", + "location": "/english_documents/dataset/AVA.md:113-113" + }, + "4185": { + "file_id": 354, + "content": "The code snippet defines a function that calculates the total number of frames in a given video. This can be useful for processing or analyzing videos, as it provides the necessary information about the duration of the video and how many frames to expect.", + "type": "comment" + }, + "4186": { + "file_id": 355, + "content": "/english_documents/dataset/ActivityNet.md", + "type": "filepath" + }, + "4187": { + "file_id": 355, + "content": "The code downloads, extracts and provides label information for 19228 videos' feature frames in \"activitynet_1.3_annotations.json\" for PaddleVideo model pre-training, using decompressed data from \"bmn_feat.tar.gz\". Users need to modify `feat_path` and `file_path` in the configuration file.", + "type": "summary" + }, + "4188": { + "file_id": 355, + "content": "[简体中文](../../zh-CN/dataset/ActivityNet.md) | English\n# ActivityNet data preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n## Introduction\nActivityNet is a dataset for large-scale video understanding tasks, which can be used for tasks such as action localization, action recognition, etc.\n## Download\n1. The BMN model uses the processed ActivityNet 1.3 dataset. 
There are two ways to use it:\n - Using our processed ActivityNet 1.3 dataset (compressed package is about 5.5G), each video has corresponding action labels, duration intervals, duration frames, duration seconds and other information\n Download with the following command:\n ```bash\n wget https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz # Download the processed video feature data\n wget https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json # Download the processed label data\n ```\n Or click the following hyperlinks to download:\n [Video feature data](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz)", + "type": "code", + "location": "/english_documents/dataset/ActivityNet.md:1-24" + }, + "4189": { + "file_id": 355, + "content": "ActivityNet is a large-scale dataset for video understanding tasks like action localization and recognition. The code provides instructions on how to download the processed ActivityNet 1.3 dataset, consisting of videos with corresponding labels, durations, and frames. Users can choose between two methods: downloading precompressed packages or clicking provided hyperlinks.", + "type": "comment" + }, + "4190": { + "file_id": 355, + "content": " [Video feature data](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json)\n then decompression `bmn_feat.tar.gz`\n ```bash\n tar -xf bmn_feat.tar.gz\n ```\n - Extract features by yourself\n First refer to [Download Instructions](https://github.com/activitynet/ActivityNet/tree/master/Crawler) to download the original dataset. When training this model, you need to use TSN to extract features from the source files first. You can [self-extract](https://github.com/yjxiong/temporal-segment-networks) video frame and optical flow information, and the pre-trained TSN model can be downloaded from [here](https://github.com/ yjxiong/anet2016-cuhk) download.\n The information in the `activitynet_1.3_annotations.json` tag file is as follows:\n ```json\n {\n \"v_QOlSCBRmfWY\": {\n \"duration_second\": 82.73,\n \"subset\": \"training\",\n \"duration_frame\": 2067,\n \"annotations\": [{\n \"segment\": [6.195294851794072, 77.73085420904837],", + "type": "code", + "location": "/english_documents/dataset/ActivityNet.md:25-45" + }, + "4191": { + "file_id": 355, + "content": "The code is explaining how to download and extract video feature data from the \"activitynet_1.3_annotations.json\" file for a model in PaddleVideo. It mentions decompressing the \"bmn_feat.tar.gz\" file, extracting features by yourself using TSN, and providing the necessary files and instructions to download and pre-train the TSN model. The \"activitynet_1.3_annotations.json\" file contains information about video annotations for training purposes.", + "type": "comment" + }, + "4192": { + "file_id": 355, + "content": " \"label\": \"Ballet\"\n }],\n \"feature_frame\": 2064\n },\n \"v_ehGHCYKzyZ8\": {\n \"duration_second\": 61.7189999999999994,\n \"subset\": \"training\",\n \"duration_frame\": 1822,\n \"annotations\": [{\n \"segment\": [43.95990729267573, 45.401932082395355],\n \"label\": \"Doing crunches\"\n }],\n \"feature_frame\": 1808\n },\n ...,\n ...\n }\n ```\n In the end, `19228` video feature npy files are obtained, corresponding to the `19228` label information in the `activitynet_1.3_annotations.json` file.\n2. 
Create a new `data/bmn_data` folder, and then unzip the video feature data after downloading and put it in this folder, and finally it should be organized into the following form:\n ```\n PaddleVideo\n ├── data\n │ ├── bmn_data\n │ │ ├── fix_feat_100\n │ │ │ ├── v___c8enCfzqw.npy\n │ │ │ ├── v___dXUJsj3yo.npy\n │ │ │ ├── ...\n │ │ │\n │ │ └── activitynet_1.3_annotations.json", + "type": "code", + "location": "/english_documents/dataset/ActivityNet.md:46-77" + }, + "4193": { + "file_id": 355, + "content": "The code represents a dictionary containing label information and video feature frame data for 19228 videos. Each key represents a video, and the corresponding value is another dictionary with 'duration_second', 'subset', 'duration_frame', 'feature_frame' keys, and an array of 'annotations' which includes 'segment' (time range) and 'label' information. The code also mentions that there will be 19228 video feature npy files obtained from the activitynet_1.3_annotations.json file.", + "type": "comment" + }, + "4194": { + "file_id": 355, + "content": " ```\n3. Finally, modify the `feat_path` field in the configuration file configs/localization/bmn.yaml to specify the feature directory path, and the `file_path` field to specify the label file path.", + "type": "code", + "location": "/english_documents/dataset/ActivityNet.md:78-80" + }, + "4195": { + "file_id": 355, + "content": "In the code, it is instructing to modify two fields in the configuration file. The `feat_path` field needs updating with the feature directory path, and the `file_path` should be specified for the label file path. This ensures proper data access during program execution.", + "type": "comment" + }, + "4196": { + "file_id": 356, + "content": "/english_documents/dataset/Oxford_RobotCar.md", + "type": "filepath" + }, + "4197": { + "file_id": 356, + "content": "The document explains the Oxford-RobotCar data preparation process for day-night depth estimation and provides related file download links. It outlines dataset preprocessing steps for ADDS-DepthNet training, including filtering, renaming, and image processing. The code showcases a directory structure with consistent training/verification sequences for day and night images.", + "type": "summary" + }, + "4198": { + "file_id": 356, + "content": "[简体中文](../../zh-CN/dataset/Oxford_RobotCar.md) | English\n# Oxford-RobotCar-for-ADDS data preparation\n- [Introduction](#Introduction)\n- [Data Set Download](#Download)\n- [Preprocessing](#Preprocessing)\n- [1. Image De-distortion](#1-Image-de-distortion)\n- [2. Dynamic frame filter](#2-Dynamic-frame-filter)\n- [3. Image Rename](#3-Image-Rename)\n- [4. 
Preparation for Day-Pseudo Night Image Pair](#4-Day-Pseudo-Night-Image-Pair-Preparation)\n## Introduction\n[Oxford RobotCar Dataset](https://robotcar-dataset.robots.ox.ac.uk/) is a large-scale autonomous driving data set that contains a large amount of data in different autonomous driving scenarios.\nWhat is used here is to filter a part of the data used for day-night depth estimation from the original Oxford RobotCar data set, namely Oxford-RobotCar-for-ADDS.\nIf you want to use Oxford-RobotCar-for-ADDS, please cite the following papers:\n```latex\n@article{maddern20171,\n title={1 year, 1000 km: The oxford robotcar dataset},\n author={Maddern, Will and Pascoe, Geoffrey and Linegar, Chris and Newman, Paul},", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:1-24" + }, + "4199": { + "file_id": 356, + "content": "This is a brief introduction to the Oxford-RobotCar-for-ADDS data preparation document. It provides information on downloading and preprocessing the dataset for autonomous driving tasks, specifically day-night depth estimation. The original dataset can be found at the link provided in the text, and any use of this modified version should cite the referenced paper.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/42.json b/docs/data/42.json new file mode 100644 index 000000000..565e78bbf --- /dev/null +++ b/docs/data/42.json @@ -0,0 +1,541 @@ +{ + "4200": { + "file_id": 356, + "content": " journal={The International Journal of Robotics Research},\n volume={36},\n number={1},\n pages={3--15},\n year={2017},\n publisher={SAGE Publications Sage UK: London, England}\n}\n```\n```latex\n@inproceedings{liu2021self,\n title={Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation},\n author={Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun},\n booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},\n pages={12737--12746},\n year={2021}\n}\n```\n## Download\n1. Download the left eye image of Bumblebee XB3 in the sequence [2014-12-09](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-09-13-21-02/) as For the training set of the daytime scene, the downloaded images are decompressed in the same folder.\n2. Download the left eye image of Bumblebee XB3 in the sequence [2014-12-16](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-16-18-44-24/) as The training set of the night scene, the downloaded images are unzipped in the same folder.", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:25-46" + }, + "4201": { + "file_id": 356, + "content": "This code represents the citation for two research papers in the format of BibTeX. The first paper is titled \"Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation\" and was published in 2021 at the IEEE/CVF International Conference on Computer Vision. The second paper is an Oxford RobotCar dataset study published in The International Journal of Robotics Research in 2017. Both papers are cited within a broader document, likely discussing the use or application of these datasets for computer vision tasks.", + "type": "comment" + }, + "4202": { + "file_id": 356, + "content": "3. The images and depth truth values ​​of the validation set are filtered from the original data set and downloaded from the link we gave. 
(The data download links are below)\n ```shell\n https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt\n https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.001\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.002\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.001\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.002\n https://videotag.bj.bcebos.com/Data/ADDS/day_val_451.7z\n https://videotag.bj.bcebos.com/Data/ADDS/day_val_451_gt.7z\n https://videotag.bj.bcebos.com/Data/ADDS/night_val_411.7z\n https://videotag.bj.bcebos.com/Data/ADDS/night_val_411_gt.7z\n ```\n the original raw data download links:\n ```shell\n # data in day\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.001\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.002", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:47-64" + }, + "4203": { + "file_id": 356, + "content": "The code provides download links for validation and original raw data sets in Oxford RobotCar dataset, used for image and depth truth values.", + "type": "comment" + }, + "4204": { + "file_id": 356, + "content": " https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.003\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.004\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.005\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.006\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.007\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.008\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.009\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.010\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.011\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.012\n # data in night\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.001\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.002\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.003\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.004", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:65-80" + }, + "4205": { + "file_id": 356, + "content": "Links to parts of a 7z compressed file containing training data for day and night scenes from the Oxford RobotCar dataset.", + "type": "comment" + }, + "4206": { + "file_id": 356, + "content": " https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.005\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.006\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.007\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.008\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.009\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.010\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.011\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.012\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.013\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.014\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.015\n ```\n## Preprocessing\n### 1-Image-de-distortion\nUse the official toolbox 
[robotcar-dataset-sdk](https://github.com/ori-mrg/robotcar-dataset-sdk/tree/master/python) to pair the sequence 2014-12-09 and 2014-12- The image of 16 is de-distorted.", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:81-98" + }, + "4207": { + "file_id": 356, + "content": "This code provides a list of URLs for various file segments (005 to 015) related to the \"night_train_all.7z\" file. These files are likely part of the Oxford RobotCar dataset and are used in preprocessing steps, such as image de-distortion, which pairs sequences from specific dates. The official toolbox mentioned is necessary for this process.", + "type": "comment" + }, + "4208": { + "file_id": 356, + "content": "### 2-Dynamic-frame-filter\nSince we use the self-supervised method, we need to filter out dynamic frames for training. The filtering principle is that the inter-frame pose change is greater than 0.1m and it is considered a dynamic frame. After filtering, the sequence of the training set is obtained.\n### 3-Image-Rename\nRename the original image timestamp to a continuous number sequence. For daytime scene correspondence, see [1209_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt), for night scene correspondence, see [1216_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt). The renamed data format is as follows:\n```\n├── oxford_processing\n ├── day_train_all #Day training image folder (day_train_all.7z.001 ~ day_train_all.7z.012)\n ├── night_train_all #Night training image folder (night_train_all.7z.001 ~ day_train_all.7z.015)\n ├── day_val_451 #Daytime verification image folder (day_val_451.7z)\n ├── day_val_451_gt #Daytime verification depth truth value folder (day_val_451_gt.7z)", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:101-114" + }, + "4209": { + "file_id": 356, + "content": "This code segment discusses two main components: 1) dynamic frame filtering for self-supervised training, and 2) renaming of original image timestamps to create continuous number sequences. 
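A minimal sketch of the 0.1 m rule described here, assuming the inter-frame camera positions are available as an (N, 3) array of translations in metres; the actual ADDS preprocessing works from the RobotCar pose files and is not reproduced here, so this only shows how frames could be classified against the threshold.

```python
import numpy as np

def split_by_motion(translations: np.ndarray, threshold_m: float = 0.1):
    """Classify frames by inter-frame motion.
    `translations` is an (N, 3) array of camera positions in metres; a frame whose
    displacement from the previous frame exceeds `threshold_m` counts as dynamic."""
    disp = np.linalg.norm(np.diff(translations, axis=0), axis=1)  # (N-1,) displacements
    dynamic = np.flatnonzero(disp > threshold_m) + 1   # frame indices with enough motion
    static = np.flatnonzero(disp <= threshold_m) + 1
    return dynamic, static

# Toy example: the third frame barely moves relative to the second one.
poses = np.array([[0.0, 0, 0], [0.5, 0, 0], [0.52, 0, 0], [1.2, 0, 0]])
dyn, stat = split_by_motion(poses)
print("dynamic frames:", dyn, "static frames:", stat)
```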
The dataset contains daytime and nighttime training images, as well as daytime verification images with corresponding depth truth values.", + "type": "comment" + }, + "4210": { + "file_id": 356, + "content": " ├── night_val_411 #night verification image folder (night_val_411.7z)\n └── night_val_411_gt #Night verification depth truth value folder (night_val_411_gt.7z)\n```\nannotation files download links are below:\n```shell\nhttps://videotag.bj.bcebos.com/Data/ADDS/train_files.txt\nhttps://videotag.bj.bcebos.com/Data/ADDS/val_day_files.txt\nhttps://videotag.bj.bcebos.com/Data/ADDS/val_night_files.txt\n```\nThe sequence used for training and verification is as follows:\n```\nsplits/oxford_day/train_files.txt # training sequence during the day\nsplits/oxford_night/train_files.txt # training sequence at night\nsplits/oxford_day_451/val_files.txt # verification sequence during the day\nsplits/oxford_night_411/val_files.txt # night verification sequence\n```\n### 4-Day-Pseudo-Night-Image-Pair-Preparation\nIn order to use our framework to extract the common information of day and night images, we use [CycleGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix) to generate day-pseudo-night image pairs, where pseudo-n", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:115-137" + }, + "4211": { + "file_id": 356, + "content": "This code provides the location of image folders and annotation files for a robot car dataset, as well as the sequence used for training and verification. It also mentions the usage of CycleGAN to generate day-pseudo-night image pairs.", + "type": "comment" + }, + "4212": { + "file_id": 356, + "content": "ight The night images corresponding to the daytime generated for CycleGAN, all images are scaled to 192x640, the night images are enhanced with histogram equalization, 75 epochs are trained, and the Oxford-RobotCar-for-ADDS is finally obtained. The generated day-pseudo-night image pair The data format is as follows, which can be directly used for training and verification of ADDS-DepthNet:\n```\n├── oxford_processing_forADDS\n ├── day_train_all #Day training image folder (day_train_all.7z.001 ~ day_train_all.7z.002)\n ├── night_train_all #Night training image folder (night_train_all.7z.001 ~ day_train_all.7z.002)\n ├── day_val_451 #Daytime verification image folder (day_val_451.7z)\n ├── day_val_451_gt #Daytime verification depth truth value folder (day_val_451_gt.7z)\n ├── night_val_411 #night verification image folder (night_val_411.7z)\n └── night_val_411_gt #Night verification depth truth value folder (night_val_411_gt.7z)\ndata\n└── oxford\n ├── splits\n ├── train_files.txt\n ├── val_day_files.txt", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:137-150" + }, + "4213": { + "file_id": 356, + "content": "This code describes the file structure and data format of the Oxford-RobotCar dataset for ADDS-DepthNet training and verification. It includes daytime and nighttime images, as well as their ground truth depth values, organized into separate folders for training and validation purposes. 
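After downloading the split files listed above (`train_files.txt`, `val_day_files.txt`, `val_night_files.txt`), a few lines of Python are enough to confirm how many samples each split contains. This is only a sketch: it assumes each non-empty line names one sample and that the files sit under `data/oxford/splits` as in the layout shown below; the exact per-line format is defined by the ADDS code.

```python
from pathlib import Path

# Assumed location of the downloaded split files; adjust to where they were saved.
splits_dir = Path("data/oxford/splits")

for name in ("train_files.txt", "val_day_files.txt", "val_night_files.txt"):
    path = splits_dir / name
    if not path.exists():
        print(f"{name}: not found")
        continue
    lines = [l for l in path.read_text().splitlines() if l.strip()]
    example = f", e.g. {lines[0]!r}" if lines else ""
    print(f"{name}: {len(lines)} samples{example}")
```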
The data has been preprocessed and scaled, with corresponding pseudo-night images generated using CycleGAN and histogram equalization applied to night images.", + "type": "comment" + }, + "4214": { + "file_id": 356, + "content": " └── val_night_files.txt\n └── oxford_processing_forADDS\n ├── day_train_all/ #Day training image folder (from day_train_all.7z.001 ~ day_train_all.7z.002)\n ├── night_train_all/ #Night training image folder (from night_train_all.7z.001 ~ day_train_all.7z.002)\n ├── day_val_451/ #Daytime verification image folder (from day_val_451.7z)\n ├── day_val_451_gt/ #Daytime verification depth truth value folder (from day_val_451_gt.7z)\n ├── night_val_411/ #night verification image folder (from night_val_411.7z)\n └── night_val_411_gt/ #Night verification depth truth value folder (from night_val_411_gt.7z)\n```\nThe sequences used for training and verification are consistent with the foregoing.", + "type": "code", + "location": "/english_documents/dataset/Oxford_RobotCar.md:151-162" + }, + "4215": { + "file_id": 356, + "content": "The code represents a directory structure containing day and night training and verification image folders, along with their respective depth truth value folders. The sequences used for both training and verification are consistent.", + "type": "comment" + }, + "4216": { + "file_id": 357, + "content": "/english_documents/dataset/README.md", + "type": "filepath" + }, + "4217": { + "file_id": 357, + "content": "This code presents a comprehensive table of datasets for action recognition, localization, and spatio-temporal action detection, covering various categories like Skeleton-based Action Recognition and Text-Video Retrieval. The table includes dataset names, homepages, and publication years from different conferences like CVPR and ICCV.", + "type": "summary" + }, + "4218": { + "file_id": 357, + "content": "English | [简体中文](../../zh_CN/dataset/README.md)\n# Dataset\n## 1. Dataset List\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ", + "type": "code", + "location": "/english_documents/dataset/README.md:1-28" + }, + "4219": { + "file_id": 357, + "content": "The code provides a table listing various datasets for action recognition, action localization, and spatio-temporal action detection. It includes links to dataset homepages and their respective publication years.", + "type": "comment" + }, + "4220": { + "file_id": 357, + "content": " \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n
Action Recognition
Kinetics-400 (Homepage) (CVPR'2017)UCF101 (Homepage) (CRCV-IR-12-01)ActivityNet (Homepage) (CVPR'2015)YouTube-8M (Homepage) (CVPR'2017)
Action Localization
ActivityNet (Homepage) (CVPR'2015)
Spatio-Temporal Action Detection
AVA (Homepage) (CVPR'2018)
Skeleton-based Action Recognition
NTURGB+D (Homepage) (IEEE CS'2016)FSD (Homepage)
Depth Estimation
Oxford-RobotCar (Homepage) (IJRR'2017)
Text-Video Retrieval
MSR-VTT (<", + "type": "code", + "location": "/english_documents/dataset/README.md:29-58" + }, + "4221": { + "file_id": 357, + "content": "This code snippet is a table of datasets. It mentions dataset names, their corresponding homepages, and the year they were published in. The categories include Skeleton-based Action Recognition, Depth Estimation, and Text-Video Retrieval.", + "type": "comment" + }, + "4222": { + "file_id": 357, + "content": "a href=\"https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/\" rel=\"nofollow\">Homepage) (CVPR'2016)
Text-Video Pretrained Model
HowTo100M (Homepage) (ICCV'2019)
", + "type": "code", + "location": "/english_documents/dataset/README.md:58-73" + }, + "4223": { + "file_id": 357, + "content": "This code appears to be part of a table within an HTML file. The table lists different datasets related to video and text, with their respective names, descriptions, and links to their homepages or documentation. It also provides information on the year of publication for each dataset, which seems to be from various conferences such as CVPR and ICCV.", + "type": "comment" + }, + "4224": { + "file_id": 358, + "content": "/english_documents/dataset/SegmentationDataset.md", + "type": "filepath" + }, + "4225": { + "file_id": 358, + "content": "The code introduces a video action segmentation dataset that utilizes breakfast, 50salads, and gtea datasets. The pre-training model's extracted features are used for the dataset. The dataset tree and data tree structure are provided, along with details of their folder contents.", + "type": "summary" + }, + "4226": { + "file_id": 358, + "content": "English | [简体中文](../../zh-CN/dataset/SegmentationDataset.md)\n# Video Action Segmentation Dataset\nThe video motion segmentation model uses breakfast, 50salads and gtea data sets. The use method is to use the features extracted by the pre training model, which can be obtained from the ms-tcn official code base.[feat](https://zenodo.org/record/3625992#.Xiv9jGhKhPY)\n- Dataset tree\n```txt\n─── gtea\n ├── features\n │ ├── S1_Cheese_C1.npy\n │ ├── S1_Coffee_C1.npy\n │ ├── S1_CofHoney_C1.npy\n │ └── ...\n ├── groundTruth\n │ ├── S1_Cheese_C1.txt\n │ ├── S1_Coffee_C1.txt\n │ ├── S1_CofHoney_C1.txt\n │ └── ...\n ├── splits\n │ ├── test.split1.bundle\n │ ├── test.split2.bundle\n │ ├── test.split3.bundle\n │ └── ...\n └── mapping.txt\n```\n- data tree\n```txt\n─── data\n ├── 50salads\n ├── breakfast\n ├── gtea\n └── ...\n```", + "type": "code", + "location": "/english_documents/dataset/SegmentationDataset.md:1-35" + }, + "4227": { + "file_id": 358, + "content": "The code introduces a video action segmentation dataset that utilizes breakfast, 50salads, and gtea datasets. The pre-training model's extracted features are used for the dataset. The dataset tree and data tree structure are provided, along with details of their folder contents.", + "type": "comment" + }, + "4228": { + "file_id": 359, + "content": "/english_documents/dataset/fsd.md", + "type": "filepath" + }, + "4229": { + "file_id": 359, + "content": "The Figure Skating Dataset offers 30 fps competition videos with Open Pose key points, and includes train_data, train_label, test_A_data, and test_B_data, downloadable from the competition's homepage. RGB datasets unavailable due to copyright reasons.", + "type": "summary" + }, + "4230": { + "file_id": 359, + "content": "[简体中文](../../zh-CN/dataset/fsd.md) | English\n# Figure Skating Dataset\n- [Introduction](#Introduction)\n- [Download](#Download)\n---\n## Introduction\nIn figure skating, compared with other sports, human posture and trajectory show the characteristics of strong complexity, which is helpful to the research of fine-grained action recognition tasks.\nFor FSD Dataset, all video materials are collected from the Figure Skating Championships from 2017 to 2018. The frame rate of the video is uniformly standardized to 30 frames per second, and the image size is 1080 * 720 to ensure the relative consistency of the dataset. 
After that, we use the 2D pose estimation algorithm Open Pose to extract frame by frame key points from the video, and finally save the data in `.npy` format.\nThe directory structure of training dataset and test dataset is as follows:\n```txt\ntrain_data.npy # 2922\ntrain_label.npy # 2922\ntest_A_data.npy # 628\ntest_B_data.npy # 634\n```\n`train_label.npy` can be read using `np.", + "type": "code", + "location": "/english_documents/dataset/fsd.md:1-26" + }, + "4231": { + "file_id": 359, + "content": "Figure Skating Dataset provides video materials from Figure Skating Championships (2017-2018) standardized to 30 frames per second and 1080 * 720 image size. It uses Open Pose for key points extraction and saves data in .npy format. The dataset includes train_data, train_label, test_A_data, and test_B_data with respective counts. Train_label can be read using np.", + "type": "comment" + }, + "4232": { + "file_id": 359, + "content": "load()`, each element is an integer variable with a value between 0-29, representing the label of the action. `data.npy` can be read using `np.load()`, return a tensor with the shape of `N×C×T×V×M`, the specific meaning of each dimension is as follows:\n| Dimension | Size | Meaning\t| Notes |\n| :---- | :----: | :----: | :---- |\n| N\t| N\t| Number of samples | - |\n| C | 3\t| The coordinates and confidence of each joint point respectively |\trescale to -1~1 |\n| T\t| 1500 |\t The duration of the action\t| The actual length of some actions may be less than 1500, in such case we will pad 0 to ensure the unity of T dimension. |\n| V |\t25 | Number of joint points |\tSee the skeleton example below for the meaning of specific joint points. |\n| M |\t1\t| Number of athletes\t| - |\nskeleton example:\n
\n
\n
\n## Download\nYou can get the download link after registering on the [competition homepage](https://www.datafountain.cn/competitions/519).", + "type": "code", + "location": "/english_documents/dataset/fsd.md:26-47" + }, + "4233": { + "file_id": 359, + "content": "This code describes the structure and meaning of a tensor in the dataset, with dimensions N (number of samples), C (coordinates and confidence of joint points), T (duration of action), V (number of joint points), and M (number of athletes). It also includes an example image of a skeleton to illustrate the joint points' positions. The data can be downloaded from the competition homepage after registration.", + "type": "comment" + }, + "4234": { + "file_id": 359, + "content": "| Set | Data | Label\t|\n| :---- | :----: | :----: |\n| Train\t| [train_data.npy](https://videotag.bj.bcebos.com/Data/FSD_train_data.npy)\t| [train_label.npy](https://videotag.bj.bcebos.com/Data/FSD_train_label.npy) |\n| TestA\t| comming soon\t| comming soon |\n> RGB datasets would not be provided for copyright reasons.", + "type": "code", + "location": "/english_documents/dataset/fsd.md:49-55" + }, + "4235": { + "file_id": 359, + "content": "This code provides dataset information for a PaddleVideo English document. It mentions the Train set with its data and label file URLs, while noting that RGB datasets are not available due to copyright reasons. TestA's data and labels will be provided soon.", + "type": "comment" + }, + "4236": { + "file_id": 360, + "content": "/english_documents/dataset/k400.md", + "type": "filepath" + }, + "4237": { + "file_id": 360, + "content": "This code allows users to download the Kinetics-400 dataset using two methods and provides a script for extracting frames. The training and validation sets require around 135GB and 2TB of storage space, respectively. Two data categories are included with list file links.", + "type": "summary" + }, + "4238": { + "file_id": 360, + "content": "[简体中文](../../zh-CN/dataset/k400.md) | English\n# Kinetics-400 Preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n- [Frames](#Frames)\n---\n## Introduction\nKinetics-400 is a commonly used benchmark dataset in the video field. Please refer to its official website [Kinetics](https://deepmind.com/research/open-source/kinetics) for details. You can refer to the official address [ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics), and use the download script provided to download the dataset.\n## Download\nConsidering the difficulty of downloading the K400 data set, we provide two download methods: (1) Baidu network disk download (2) Script download\n### Baidu SkyDrive Download\nNetdisk link: https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg\nExtraction code: `ppvi`\n### Script download\n- Download the training set link list file [train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list) and the validation set link list file [val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list).", + "type": "code", + "location": "/english_documents/dataset/k400.md:1-27" + }, + "4239": { + "file_id": 360, + "content": "This code provides information on downloading the Kinetics-400 dataset, which is commonly used in video tasks. It offers two methods for download: Baidu network disk and script download. 
The code also directs to official resources like the Kinetics website and ActivityNet repository for further details.", + "type": "comment" + }, + "4240": { + "file_id": 360, + "content": "Write the download script `download.sh` as follows:\n```bash\nfile=$1\nwhile read line \ndo\n wget \"$line\"\ndone <$file\n```\nDownload training set command:\n```bash\nbash download.sh train_link.list\n```\nDownload verification set command:\n```bash\nbash download.sh val_link.list\n```\n---\n|category | Number of data | list file |\n| :------: | :----------: | :----: |\n|Training set | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)|\n|Validation set | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)|\n- After downloading, unzip and add the data path to list file.\n- Due to the failure of some video link, part of original data is missing. This copies need about 135G of storage space.\n> This copies is only used for academic research. If it is helpful to you, welcome to star [our project](https://github.com/PaddlePaddle/PaddleVideo)\n## Frames\nIn order to speed up the training process of the network, we first extract frames from the video file (K4", + "type": "code", + "location": "/english_documents/dataset/k400.md:29-65" + }, + "4241": { + "file_id": 360, + "content": "This script downloads training and validation sets from provided links, unzips them, and adds the data paths to respective list files. Due to broken video links, approximately 135GB of storage space is required. The frames extracted from videos help in accelerating network training.", + "type": "comment" + }, + "4242": { + "file_id": 360, + "content": "00 video file is in mp4 format). Compared with the method of network training directly through video files, the method of frames can greatly accelerate the speed of network training。\nEnter the following command to extract the frames of the K400 video file\n```python\npython extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4\n```\nAfter the video file frames are extracted, they will be stored in the specified `./rawframes` path, and the size is about 2T.\n|category | Number of data | list file |\n| :------: | :----------: | :----: |\n|Training set | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)|\n|Validation set | 19761 | [val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)|", + "type": "code", + "location": "/english_documents/dataset/k400.md:65-78" + }, + "4243": { + "file_id": 360, + "content": "This code explains how to extract frames from the K400 video file in mp4 format using the \"extract_rawframes.py\" script, and provides the command to execute it with specified paths for videos and raw frames folders, along with level and ext parameters. The extracted frames will be stored in the ./rawframes path, occupying around 2TB of space. The code also mentions two data categories - training set (234619 files) and validation set (19761 files), along with their respective list file links for easy reference.", + "type": "comment" + }, + "4244": { + "file_id": 361, + "content": "/english_documents/dataset/msrvtt.md", + "type": "filepath" + }, + "4245": { + "file_id": 361, + "content": "The MSR-VTT dataset contains 10K videos available on its website, organized in a \"data\" directory for ActBERT model use. 
The lock.mdb file is a database used for storing and managing data related to multi-modal transformers for video retrieval as described in a 2020 ECCV paper.", + "type": "summary" + }, + "4246": { + "file_id": 361, + "content": "[简体中文](../../zh-CN/dataset/msrvtt.md) | English\n# MSR-VTT Preparation\n- [Introduction](#1.1)\n- [Download for T2VLAD](#1.2)\n- [Download for ActBERT](#1.3)\n- [Reference](#1.4)\n\n## Introduction\nMSR-VTT(Microsoft Research Video to Text) is a large-scale dataset containing videos and subtitles, which is composed of 10000 video clips from 20 categories, and each video clip is annotated with 20 English sentences. We used 9000 video clips for training and 1000 for testing. For more details, please refer to the website: [MSRVTT](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/)\n\n## Download for T2VLAD\n[T2VLAD doc](../../../applications/T2VLAD/README_en.md)\nFor ease of use, we provided extracted features of video.\nFirst, make sure to enter the following command in the `applications/T2VLAD/data` directory to download the dataset.\n```bash\nbash download_features.sh\n```\nAfter downloading, the files in the data directory are organized as follows:", + "type": "code", + "location": "/english_documents/dataset/msrvtt.md:1-29" + }, + "4247": { + "file_id": 361, + "content": "This code provides an overview of the MSR-VTT dataset, its download process for T2VLAD and ActBERT applications, and references for more information. It consists of 10K video clips from 20 categories, each with 20 English sentences, and is available on the MSRVTT website.", + "type": "comment" + }, + "4248": { + "file_id": 361, + "content": "```\n├── data\n| ├── MSR-VTT\n| │ ├── raw-captions.pkl\n| │ ├── train_list_jsfusion.txt\n| │ ├── val_list_jsfusion.txt\n| │ ├── aggregated_text_feats\n| | | ├── w2v_MSRVTT_openAIGPT.pickle\n| | ├── mmt_feats\n| │ │ ├── features.audio.pkl\n| │ │ ├── features.face_agg.pkl\n| │ │ ├── features.flos_agg.pkl\n| │ │ ├── features.ocr.pkl\n| │ │ ├── features.rgb_agg.pkl\n| │ │ ├── features.s3d.pkl\n| │ │ ├── features.scene.pkl\n| │ │ ├── features.speech.pkl\n```\n\n## Download for ActBERT\n[ActBERT doc](../model_zoo/multimodal/actbert.md)\nDownload data features:\n```\nwget https://videotag.bj.bcebos.com/Data/ActBERT/msrvtt_test.lmdb.tar\nwget https://videotag.bj.bcebos.com/Data/ActBERT/MSRVTT_JSFUSION_test.csv\n```\nDecompress the `msrvtt_test.lmdb.tar`:\n```\ntar -zxvf msrvtt_test.lmdb.tar\n```\nThe files in the data directory are organized as follows:\n```\n├── data\n| ├── MSR-VTT\n| │ ├── MSRVTT_JSFUSION_test.csv\n| │ ├── msrvtt_test.lmdb\n| │ ├── data.mdb", + "type": "code", + "location": "/english_documents/dataset/msrvtt.md:31-73" + }, + "4249": { + "file_id": 361, + "content": "Code provides the instructions to download and decompress data features required for ActBERT model, specifically for MSR-VTT dataset. The data is organized in the \"data\" directory with a .lmdb file and a CSV file containing JSFusion test data.", + "type": "comment" + }, + "4250": { + "file_id": 361, + "content": "| │ ├── lock.mdb\n```\n\n## Reference\n- Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. Multi-modal transformer for video retrieval. 
In ECCV, 2020.", + "type": "code", + "location": "/english_documents/dataset/msrvtt.md:74-79" + }, + "4251": { + "file_id": 361, + "content": "lock.mdb: Database file used for storing and managing data in codebase related to multi-modal transformer for video retrieval as described in the 2020 ECCV paper by Valentin Gabeur et al.", + "type": "comment" + }, + "4252": { + "file_id": 362, + "content": "/english_documents/dataset/ntu-rgbd.md", + "type": "filepath" + }, + "4253": { + "file_id": 362, + "content": "The code prepares the NTU RGB+D dataset for CTR-GCN through data organization and cleaning, involving obtaining, denoising, and transforming skeleton data using three scripts. The dataset consists of 60 action classes with two splits: Cross-subject and Cross-view.", + "type": "summary" + }, + "4254": { + "file_id": 362, + "content": "[简体中文](../../zh-CN/dataset/ntu-rgbd.md) | English\n# NTU-RGB+D Preparation\n- [Introduction](#Introduction)\n- [ST-GCN Data Prepare](#ST-GCN_Data_Prepare)\n- [CTR-GTCN Data Prepare](#CTR-GCN_Data_Prepare)\n---\n## Introduction\nNTU-RGB+D contains 60 action classes and 56,880 video samples for skeleton-based action recognition. Please refer to its official website[NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) for more details.\nThe dataset contains two splits when dividing the training set and test set. For Cross-subject, the dataset is divided according to character id, with 40320 samples in training set and 16560 samples in test set. For Cross-view, the dataset is divided according to camera division. The samples collected by cameras 2 and 3 are training sets, including 37930 samples, and the samples collected by camera 1 are test sets, including 18960 samples.\n## ST-GCN_Data_Prepare\nST-GCN data prepare preceduce are introducted follow.\n### Download\nWe provide the download link of the p", + "type": "code", + "location": "/english_documents/dataset/ntu-rgbd.md:1-23" + }, + "4255": { + "file_id": 362, + "content": "NTU-RGB+D dataset contains 60 action classes and 56,880 video samples for skeleton-based action recognition. It has two splits: Cross-subject and Cross-view. ST-GCN data preparation process introduced in the following sections.", + "type": "comment" + }, + "4256": { + "file_id": 362, + "content": "rocessed dataset [NTU-RGB-D.tar](https://videotag.bj.bcebos.com/Data/NTU-RGB-D.tar)(~3.1G). 
Please download and unzip with ```tar -zxvf NTU-RGB-D.tar ``` , the directory structure is as follows:\n```txt\n─── NTU-RGB-D\n ├── xsub\n │ ├── train_data.npy\n │ ├── train_label.pkl\n │ ├── val_data.npy\n │ └── val_label.pkl\n └── xview\n ├── train_data.npy\n ├── train_label.pkl\n ├── val_data.npy\n └── val_label.pkl\n```\n> This is a copies from [st-gcn](https://github.com/open-mmlab/mmskeleton/blob/master/doc/SKELETON_DATA.md).\n## CTR-GCN_Data_Prepare\nCTR-GCN data prepare preceduce are introducted follow.\n### Download\nThere is script `download_dataset.sh` to download the dataset from official website [NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) in dictory `data\\ntu-rgb-d`.\n```bash\nsh data/ntu-rgb-d/download_dataset.sh\n```\nFile tree:\n```txt\n─── ntu-rgb-d\n ├── download_dataset.sh\n ├── nturgb+d_skeletons\n │   ├── S001C001P001R001A001.skeleton\n │   ├── S001C001P001R001A002.skeleton", + "type": "code", + "location": "/english_documents/dataset/ntu-rgbd.md:23-59" + }, + "4257": { + "file_id": 362, + "content": "This code describes a processed dataset called NTU-RGB-D, which is approximately 3.1GB in size and requires downloading and unzipping using the command \"tar -zxvf NTU-RGB-D.tar\". The resulting directory structure contains train and val data for both xsub and xview. The code also provides a script called download_dataset.sh to facilitate downloading the dataset from the official website, and shows the file tree structure after successful download.", + "type": "comment" + }, + "4258": { + "file_id": 362, + "content": " │   ├── S001C001P001R001A003.skeleton\n │   ├── S001C001P001R001A004.skeleton\n │   ├── S001C001P001R001A005.skeleton\n │   ├── S001C001P001R001A006.skeleton\n │   ├── S001C001P001R001A007.skeleton\n │ ├── ....\n │   └── S017C003P020R002A060.skeleton\n ├── get_raw_denoised_data.py\n ├── get_raw_skes_data.py\n ├── seq_transformation.py\n └── statistics\n    ├── camera.txt\n    ├── label.txt\n    ├── performer.txt\n    ├── replication.txt\n    ├── setup.txt\n    └── skes_available_name.txt\n```\n### Prepare\nrun follow script, then data will be precessed to the data format need by CTR-GCN.\n> Note:if make dataset by yourself, please prepare `data/ntu-rgb-d/statistics/skes_available_name.txt`, which is the list of skeletons files that will be precessed.\n```bash\ncd ./data/ntu-rgb-d\n# Get skeleton of each performer\npython get_raw_skes_data.py\n# Remove the bad skeleton\npython get_raw_denoised_data.py\n# Transform the skeleton to the center of the first frame\npython seq_transformation.py", + "type": "code", + "location": "/english_documents/dataset/ntu-rgbd.md:60-93" + }, + "4259": { + "file_id": 362, + "content": "The provided code describes the preparation steps for processing the NTU-RGBD dataset to be used by CTR-GCN. It involves running three separate Python scripts in order:\n1. `get_raw_skes_data.py` is responsible for obtaining the skeleton of each performer from the data folders.\n2. `get_raw_denoised_data.py` removes any bad or corrupted skeletons from the dataset.\n3. 
`seq_transformation.py` transforms the remaining skeletons to the center of the first frame.\nTo follow these steps, navigate to the NTU-RGBD dataset folder and run each script sequentially in your command line interface.", + "type": "comment" + }, + "4260": { + "file_id": 362, + "content": "```\nFile tree:\n```txt\n─── ntu-rgb-d\n ├── download_dataset.sh\n ├── nturgb+d_skeletons\n │   ├── S001C001P001R001A001.skeleton\n │   ├── S001C001P001R001A002.skeleton\n │   ├── S001C001P001R001A003.skeleton\n │   ├── S001C001P001R001A004.skeleton\n │   ├── S001C001P001R001A005.skeleton\n │   ├── S001C001P001R001A006.skeleton\n │   ├── S001C001P001R001A007.skeleton\n │ ├── ....\n │   └── S017C003P020R002A060.skeleton\n ├── denoised_data\n │   ├── actors_info\n │   │   ├── S001C001P001R001A024.txt\n │   │   ├── S001C001P001R001A025.txt\n │   │   ├── S001C001P001R001A026.txt\n │   │   ├── ....\n │   │   ├── S017C003P020R002A059.txt\n │   │   └── S017C003P020R002A060.txt\n │   ├── denoised_failed_1.log\n │   ├── denoised_failed_2.log\n │   ├── frames_cnt.txt\n │   ├── missing_skes_1.log\n │   ├── missing_skes_2.log\n │   ├── missing_skes.log\n │   ├── noise_length.log\n │   ├── noise_motion.log\n │   ├── noise_spread.log\n │   ├── raw_denoised_colors.pkl\n │   ├── raw_denoised_joints.pkl", + "type": "code", + "location": "/english_documents/dataset/ntu-rgbd.md:94-129" + }, + "4261": { + "file_id": 362, + "content": "The code represents a dataset called \"ntu-rgb-d\" containing skeleton data and associated files for denoising, logging missing skeletons, and tracking frames. The dataset is organized into folders including 'nturgb+d_skeletons' containing skeleton files per actor and 'denoised_data' with various log and pickle files related to the denoising process.", + "type": "comment" + }, + "4262": { + "file_id": 362, + "content": " │   └── rgb+ske\n ├── raw_data\n │   ├── frames_cnt.txt\n │   ├── frames_drop.log\n │   ├── frames_drop_skes.pkl\n │   └── raw_skes_data.pkl\n ├── get_raw_denoised_data.py\n ├── get_raw_skes_data.py\n ├── seq_transformation.py\n ├── statistics\n │   ├── camera.txt\n │   ├── label.txt\n │   ├── performer.txt\n │   ├── replication.txt\n │   ├── setup.txt\n │   └── skes_available_name.txt\n ├── xview\n │ ├── train_data.npy\n │ ├── train_label.pkl\n │ ├── val_data.npy\n │ └── val_label.pkl\n └── xsub\n ├── train_data.npy\n ├── train_label.pkl\n ├── val_data.npy\n └── val_label.pkl\n```\n> Note:dictory `denoised_data`、`raw_data`and`nturgb+d_skeletons`, that are temporal files, can be deleted, if extracted `xview` and `xsub`.", + "type": "code", + "location": "/english_documents/dataset/ntu-rgbd.md:130-158" + }, + "4263": { + "file_id": 362, + "content": "This code appears to organize various data files related to a dataset, likely for the NTU RGB+D action recognition benchmark. The directory structure includes raw data, denoised data, and preprocessed data in separate folders (xview and xsub). There are also statistics files and Python scripts for getting raw and denoised data. 
The notes suggest that some of the temporal files can be deleted if the extracted xview and xsub files are available.", + "type": "comment" + }, + "4264": { + "file_id": 363, + "content": "/english_documents/dataset/ucf101.md", + "type": "filepath" + }, + "4265": { + "file_id": 363, + "content": "This code downloads, extracts, and organizes UCF101 dataset into separate folders with training and validation sets, representing a file hierarchy for easy access.", + "type": "summary" + }, + "4266": { + "file_id": 363, + "content": "# UCF101数据准备\nUCF101数据的相关准备。主要包括UCF101的video文件下载,video文件提取frames,以及生成文件的路径list。\n---\n## 1. 数据下载\nUCF101数据的详细信息可以参考网站[UCF101](https://www.crcv.ucf.edu/data/UCF101.php)。 为了方便用户使用,我们提供了UCF101数据的annotations文件和videos文件的下载脚本。\n### 下载annotations文件\n首先,请确保在`./data/dataset/ucf101/`目录下,输入如下UCF101数据集的标注文件的命令。\n```shell\nbash download_annotations.sh\n```\n### 下载UCF101的视频文件\n同样需要确保在`./data/dataset/ucf101/`目录下,输入下述命令下载视频文件\n```shell\nbash download_videos.sh\n```\n下载完成后视频文件会存储在`./data/dataset/ucf101/videos/`文件夹下,视频文件大小为6.8G。\n---\n## 2. 提取视频文件的frames\n为了加速网络的训练过程,我们首先对视频文件(ucf101视频文件为avi格式)提取帧 (frames)。相对于直接通过视频文件进行网络训练的方式,frames的方式能够加快网络训练的速度。\n直接输入如下命令,即可提取ucf101视频文件的frames\n``` python\npython extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext avi\n```\n视频文件frames提取完成后,会存储在`./rawframes`文件夹下,大小为56G。\n---\n## 3. 生成frames文件和视频文件的路径list\n生成视频文件的路径list,输入如下命令\n```python\npython build_ucf101_file_list.py videos/ --level 2 --format videos --out_list_path ./\n```\n生成frames文件的路径list,输入如下命令:\n```python\npython build_ucf101_file_list.py rawframes/ --level 2 --format rawframes --out_list_path ./", + "type": "code", + "location": "/english_documents/dataset/ucf101.md:1-40" + }, + "4267": { + "file_id": 363, + "content": "This code provides instructions on how to download UCF101 dataset annotations and videos, extract frames from the video files, and generate file path lists for both the original videos and extracted frames. The code also includes commands to execute these steps using provided Python scripts.", + "type": "comment" + }, + "4268": { + "file_id": 363, + "content": "```\n**参数说明**\n`videos/` 或者 `rawframes/` : 表示视频或者frames文件的存储路径\n`--level 2` : 表示文件的存储结构\n`--format`: 表示是针对视频还是frames生成路径list\n`--out_list_path `: 表示生的路径list文件存储位置\n# 以上步骤完成后,文件组织形式如下所示\n```\n├── data\n| ├── dataset\n| │ ├── ucf101\n| │ │ ├── ucf101_{train,val}_split_{1,2,3}_rawframes.txt\n| │ │ ├── ucf101_{train,val}_split_{1,2,3}_videos.txt\n| │ │ ├── annotations\n| │ │ ├── videos\n| │ │ │ ├── ApplyEyeMakeup\n| │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01.avi\n| \n| │ │ │ ├── YoYo\n| │ │ │ │ ├── v_YoYo_g25_c05.avi\n| │ │ ├── rawframes\n| │ │ │ ├── ApplyEyeMakeup\n| │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01\n| │ │ │ │ │ ├── img_00001.jpg\n| │ │ │ │ │ ├── img_00002.jpg\n| │ │ │ │ │ ├── ...\n| │ │ │ │ │ ├── flow_x_00001.jpg\n| │ │ │ │ │ ├── flow_x_00002.jpg\n| │ │ │ │ │ ├── ...\n| │ │ │ │ │ ├── flow_y_00001.jpg\n| │ │ │ │ │ ├── flow_y_00002.jpg\n| │ │ │ ├── ...\n| │ │ │ ├── YoYo", + "type": "code", + "location": "/english_documents/dataset/ucf101.md:41-81" + }, + "4269": { + "file_id": 363, + "content": "This code is describing the organization of files for UCF101 dataset, specifying paths and formats. It organizes videos or frames into separate folders based on their categories and splits them into training and validation sets. 
The annotations folder contains information about each video or frame, while the dataset folder stores the generated path lists.", + "type": "comment" + }, + "4270": { + "file_id": 363, + "content": "| │ │ │ │ ├── v_YoYo_g01_c01\n| │ │ │ │ ├── ...\n| │ │ │ │ ├── v_YoYo_g25_c05\n```", + "type": "code", + "location": "/english_documents/dataset/ucf101.md:82-86" + }, + "4271": { + "file_id": 363, + "content": "Code represents a file hierarchy in the UCF101 dataset, where each folder inside \"dataset\" corresponds to a video category and contains clips (e.g., v_YoYo_g01_c01).", + "type": "comment" + }, + "4272": { + "file_id": 364, + "content": "/english_documents/dataset/ucf24.md", + "type": "filepath" + }, + "4273": { + "file_id": 364, + "content": "The text outlines the process of preparing UCF24 dataset with PaddleVideo's build_split.py command, installing unrar tool and providing a download script for ease of access. The file structure contains video sequences and split files for training and testing.", + "type": "summary" + }, + "4274": { + "file_id": 364, + "content": "English | [简体中文](../../zh-CN/dataset/ucf24.md)\n# UCF24 Data Preparation\nThis document mainly introduces the preparation process of UCF24 dataset. It mainly includes the download of the RGB frame files, the annotation files and the pathlist of the generated file.\n---\n## 1. Data Download\nDetailed information on UCF24 data can be found on the website [UCF24](http://www.thumos.info/download.html). For ease of use, PaddleVideo provides a download script for the RGB frame, annotation file of the UCF24 data.\nFirst, please ensure access to the [data/ucf24/ directory](../../../data/ucf24) and enter the following command for downloading the RGB frame, annotation file of the UCF24 dataset.\n```shell\nbash download_frames_annotations.sh\n```\n- To run this command you need to install the unrar decompression tool, which can be installed using the pip method.\n- The RGB frame files will be stored in the [data/ucf24/rgb-images/ directory](../../../data/ucf24/rgb-images)\n- The annotation files will be stored in the [data/ucf24/lables/ directory](../../../data/ucf24/labels)", + "type": "code", + "location": "/english_documents/dataset/ucf24.md:1-20" + }, + "4275": { + "file_id": 364, + "content": "Introduction to UCF24 dataset preparation process, including download of RGB frame and annotation files. PaddleVideo provides a download script for easier access. Requires unrar tool installation. RGB frames stored in rgb-images directory, annotations in labels directory.", + "type": "comment" + }, + "4276": { + "file_id": 364, + "content": "---\n## 2. 
File Pathlist Generation\nTo specify the format for dividing the file, enter the following command\n```python\npython build_split.py --raw_path ./splitfiles\n```\n**Description of parameters**\n`--raw_path`: indicates the storage path of the original division file\n# Folder Structure\nAfter the whole data pipeline for UCF24 preparation, the folder structure will look like:\n```\n├── data\n│ ├── ucf24\n│ | ├── groundtruths_ucf\n│ | ├── labels\n│ | | ├── Basketball\n│ | | | ├── v_Basketball_g01_c01\n│ | | | | ├── 00009.txt\n│ | | | | ├── 00010.txt\n│ | | | | ├── ...\n│ | | | | ├── 00050.txt\n│ | | | | ├── 00051.txt\n│ | | ├── ...\n│ | | ├── WalkingWithDog\n│ | | | ├── v_WalkingWithDog_g01_c01\n│ | | | ├── ...\n│ | | | ├── v_WalkingWithDog_g25_c04\n│ | ├── rgb-images\n│ | | ├── Basketball\n│ | | | ├── v_Basketball_g01_c01\n│ | | | | ├── 00001.jpg\n│ | | | | ├── 00002.jpg\n│ | | | | ├── ...", + "type": "code", + "location": "/english_documents/dataset/ucf24.md:22-60" + }, + "4277": { + "file_id": 364, + "content": "This code describes the process of generating file path lists and the resulting folder structure for UCF24 dataset preparation using PaddleVideo's build_split.py command with the raw_path parameter, dividing data into groundtruths_ucf, labels, and rgb-images subfolders containing video clips and corresponding files.", + "type": "comment" + }, + "4278": { + "file_id": 364, + "content": "│ | | | | ├── 00140.jpg\n│ | | | | ├── 00141.jpg\n│ | | ├── ...\n│ | | ├── WalkingWithDog\n│ | | | ├── v_WalkingWithDog_g01_c01\n│ | | | ├── ...\n│ | | | ├── v_WalkingWithDog_g25_c04\n│ | ├── splitfiles\n│ | | ├── trainlist01.txt\n│ | | |── testlist01.txt \n│ | ├── trainlist.txt\n│ | |── testlist.txt \n```", + "type": "code", + "location": "/english_documents/dataset/ucf24.md:61-73" + }, + "4279": { + "file_id": 364, + "content": "The code represents a file structure of the UCF101 dataset, containing various video sequences and split files for training and testing purposes.", + "type": "comment" + }, + "4280": { + "file_id": 365, + "content": "/english_documents/dataset/youtube8m.md", + "type": "filepath" + }, + "4281": { + "file_id": 365, + "content": "YouTube-8M is a large video classification dataset containing over 8 million URLs and covers more than 3800 knowledge graph entities. The code splits the pkl files into smaller files for easier processing.", + "type": "summary" + }, + "4282": { + "file_id": 365, + "content": "English | [简体中文](../../zh-CN/dataset/youtube8m.md)\n# YouTube-8M Data Preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n- [Conversion](#Conversion)\n## Introduction\nYouTube-8M is a large-scale video classification data set, containing more than 8 million video URLs. The tag system covers more than 3800 knowledge graph entities. One video corresponds to multiple tags (3-4 on average) and is labeled by machine.\n**The length of each video is between 120s and 500s\nDue to the large amount of video data, the image classification model was used to extract frame-level features in advance, and PCA was used to reduce the dimensionality of the features to obtain multi-frame 1024-dimensional features. Similarly, the audio model was used to obtain multi-frame 128-dimensional features. Audio characteristics. **\n> The dataset used here is the updated YouTube-8M data set in 2018 (May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features).\n## Download\n1. 
Create a new directory for storing features (take the PaddleVideo directory as an example)", + "type": "code", + "location": "/english_documents/dataset/youtube8m.md:1-20" + }, + "4283": { + "file_id": 365, + "content": "English | [简体中文](../../zh-CN/dataset/youtube8m.md)\n# YouTube-8M Data Preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n- [Conversion](#Conversion)\n## Introduction\nYouTube-8M is a large-scale video classification data set, containing more than 8 million video URLs. The tag system covers more than 3800 knowledge graph entities. One video corresponds to multiple tags (3-4 on average) and is labeled by machine.\n**The length of each video is between 120s and 500s\nDue to the large amount of video data, the image classification model was used to extract frame-level features in advance, and PCA was used to reduce the dimensionality of the features to obtain multi-frame 1024-dimensional features. Similarly, the audio model was used to obtain multi-frame 128-dimensional features. Audio characteristics. **\n> The dataset used here is the updated YouTube-8M data set in 2018 (May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features).\n## Download\n1. Create a new directory for storing features (take the PaddleVideo directory as an example)", + "type": "comment" + }, + "4284": { + "file_id": 365, + "content": " ```bash\n cd data/yt8m\n mkdir frame\n cd frame\n ```\n2. Download the training and validation set to the frame folder\n ```bash\n curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python\n curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python\n ```\n The download process is shown in the figure\n ![image](https://user-images.githubusercontent.com/23737287/140709613-1e2d6ec0-a82e-474d-b220-7803065b0153.png)\n After the data download is complete, you will get 3844 training data files and 3844 verification data files (TFRecord format)\n## Conversion\n1. Install tensorflow to read tfrecord data\n ```bash\n python3.7 -m pip install tensorflow-gpu==1.14.0\n ```\n2. Convert the downloaded TFRecord file into a pickle file for PaddlePaddle to use\n ```bash\n cd .. # From the frame directory back to the yt8m directory\n python3.7 tf2pkl.py ./frame ./pkl_frame/ # Convert train*.tfrecord and validate*.tfrecord in the frame folder to pkl format", + "type": "code", + "location": "/english_documents/dataset/youtube8m.md:21-44" + }, + "4285": { + "file_id": 365, + "content": "Creates a frame directory, downloads the training and validation sets to it using curl, installs TensorFlow for reading TFRecord data, then converts the TFRecord files to pickle format for PaddlePaddle usage.", + "type": "comment" + }, + "4286": { + "file_id": 365, + "content": " ```\n3. 
Generate a single pkl file path set, and split pkl into multiple small pkl files based on this file, and generate the final split pkl file path required\n ```bash\n ls pkl_frame/train*.pkl> train.list # Write the path of train*.pkl to train.list\n ls pkl_frame/validate*.pkl> val.list # Write the path of validate*.pkl into val.list\n python3.7 split_yt8m.py train.list # Split each train*.pkl into multiple train*_split*.pkl\n python3.7 split_yt8m.py val.list # Split each validate*.pkl into multiple validate*_split*.pkl\n ls pkl_frame/train*_split*.pkl> train.list # Rewrite the path of train*_split*.pkl into train.list\n ls pkl_frame/validate*_split*.pkl> val.list # Rewrite the path of validate*_split*.pkl into val.list\n ``` ", + "type": "code", + "location": "/english_documents/dataset/youtube8m.md:45-56" + }, + "4287": { + "file_id": 365, + "content": "This code generates a single pkl file path set and splits the pkl files into smaller files based on given file lists. It first writes the paths of \"train*.pkl\" and \"validate*.pkl\" to \"train.list\" and \"val.list\" respectively. Then, it uses the \"split_yt8m.py\" script to split each \"train*.pkl\" into multiple \"train*_split*.pkl\" files and each \"validate*.pkl\" into multiple \"validate*_split*.pkl\" files. Finally, it rewrites the paths of the smaller pkl files back into \"train.list\" and \"val.list\".", + "type": "comment" + }, + "4288": { + "file_id": 366, + "content": "/english_documents/install.md", + "type": "filepath" + }, + "4289": { + "file_id": 366, + "content": "This introduction explains how to install PaddlePaddle and PaddleVideo, their requirements like Python 3.7 and CUDA 10.1, enabling distribution feature, setting shared memory in Docker, cloning PaddleVideo repo, upgrading pip and requirements, installing ppvideo package, and usage example specifying model, disabling GPU, and input video file.", + "type": "summary" + }, + "4290": { + "file_id": 366, + "content": "[简体中文](../zh-CN/install.md) | English\n# Installation\n---\n- [Introduction](#Introduction)\n- [Install PaddlePaddle](#Install-PaddlePaddle)\n- [Install PaddleVideo](#Install-PaddleVideo)\n## Introduction\nThis document introduces how to install PaddlePaddle、PaddleVideo and its requirements.\n## Install PaddlePaddle\nPython 3.7, CUDA 10.1, CUDNN7.6.4 nccl2.1.2 and later version are required at first, For now, PaddleVideo only support training on the GPU device. Please follow the instructions in the [Installation](http://www.paddlepaddle.org.cn/install/quick) if the PaddlePaddle on the device is lower than v2.0\n**Install PaddlePaddle**\n```bash\npip3 install paddlepaddle-gpu --upgrade\n```\nor compile from source code, please refer to [Installation](http://www.paddlepaddle.org.cn/install/quick).\nVerify Installation\n```python\nimport paddle\npaddle.utils.run_check()\n```\nCheck PaddlePaddle version:\n```bash\npython3 -c \"import paddle; print(paddle.__version__)\"\n```\nNote:\n- Make sure the compiled version is later than PaddlePaddle2.0.", + "type": "code", + "location": "/english_documents/install.md:1-41" + }, + "4291": { + "file_id": 366, + "content": "Introduction: Describes how to install PaddlePaddle, PaddleVideo, and their requirements.\nInstall PaddlePaddle: Requires Python 3.7, CUDA 10.1, CUDNN7.6.4 nccl2.1.2 and supports GPU training only. 
Follow the instructions on the website if PaddlePaddle on the device is lower than v2.0.\nInstallation commands: Use pip3 to install paddlepaddle-gpu or compile from source code, following instructions on the website.", + "type": "comment" + }, + "4292": { + "file_id": 366, + "content": "- Indicate **WITH_DISTRIBUTE=ON** when compiling, Please refer to [Instruction](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3) for more details.\n- When running in the docker, in order to ensure that the container has enough shared memory for data read acceleration of Paddle, please set the parameter `--shm_size=32g` at creating a docker container, if conditions permit, you can set it to a larger value.\n---\n## Install PaddleVideo\n**Clone PaddleVideo:**\n```bash\ncd path_to_clone_PaddleVideo\ngit clone https://github.com/PaddlePaddle/PaddleVideo.git\n```\n**Install requirements**\n```bash\npython3.7 -m pip install --upgrade pip\npip3.7 install --upgrade -r requirements.txt\n```\n**Install python package**\n```bash\npip3.7 install ppvideo==2.3.0\n```\nuse scripts:\n```bash\nppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi'\n```", + "type": "code", + "location": "/english_documents/install.md:42-72" + }, + "4293": { + "file_id": 366, + "content": "WITH_DISTRIBUTE=ON: Enables the distribution feature in PaddleVideo, refer to Instruction for more details.\nDocker shm_size: Set --shm_size=32g when creating a docker container for enough shared memory.\nClone PaddleVideo: Navigate to desired path and clone the repository from GitHub.\nRequirements upgrade: Ensure pip is up-to-date before installing requirements.txt.\nInstall python package: Use pip3.7 to install specific version of ppvideo package.\nUsage example: Specify model, disable GPU usage, and input video file when running ppvideo script.", + "type": "comment" + }, + "4294": { + "file_id": 367, + "content": "/english_documents/model_zoo/README.md", + "type": "filepath" + }, + "4295": { + "file_id": 367, + "content": "The code offers a comprehensive table of action recognition and segmentation models with corresponding links for further details, classified by adaptability and network type. It includes various models like AttentionLSTM, MoViNet, ST-GCN, AGCN, 2s-AGCN, CTR-GCN, BMN, MS-TCN, and ASRF, and serves as a table of contents for PaddleVideo model zoo in HTML format.", + "type": "summary" + }, + "4296": { + "file_id": 367, + "content": "[简体中文](../../zh-CN/model_zoo/README.md) | English\n# Academic algorithms\n## 1. Introduction\nWe implemented action recgonition model and action localization model in this repo.\n## 2. Model list\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n ", + "type": "code", + "location": "/english_documents/model_zoo/README.md:1-26" + }, + "4297": { + "file_id": 367, + "content": "This code provides a table listing action recognition models and their corresponding links for further details. The models listed include PP-TSM, PP-TSN, PP-TimeSformer, TSN, TSM, SlowFast, TimeSformer, VideoSwin, and TokenShift.", + "type": "comment" + }, + "4298": { + "file_id": 367, + "content": " \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ", + "type": "code", + "location": "/english_documents/model_zoo/README.md:27-61" + }, + "4299": { + "file_id": 367, + "content": "This code is part of a table of contents for an AI model repository. 
It lists various action recognition and segmentation models, categorized by their features like adaptability, customization, and network type (RNN, Lite, etc.). The models include AttentionLSTM, MoViNet, ST-GCN, AGCN, 2s-AGCN, CTR-GCN, BMN, MS-TCN, and ASRF. Each model is linked to its corresponding documentation file in the repository.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/43.json b/docs/data/43.json new file mode 100644 index 000000000..a2cbece7c --- /dev/null +++ b/docs/data/43.json @@ -0,0 +1,543 @@ +{ + "4300": { + "file_id": 367, + "content": " \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ", + "type": "code", + "location": "/english_documents/model_zoo/README.md:62-100" + }, + "4301": { + "file_id": 367, + "content": "This code represents a table of contents with hyperlinks for different models and methods in the PaddleVideo model zoo. It includes categories such as Spatio-temporal motion detection, Multimodal, Video target segmentation, and Monocular depth estimation. Each category has a brief description of its subcategories or models, indicated by hyperlinks.", + "type": "comment" + }, + "4302": { + "file_id": 367, + "content": " \n \n \n \n \n
Action recognition method
PP-TSM (PP series) | PP-TSN (PP series) | PP-TimeSformer (PP series) | TSN (2D') | TSM (2D')
SlowFast (3D') | TimeSformer (Transformer') | VideoSwin (Transformer') | TokenShift (3D') | AttentionLSTM (RNN')
MoViNet (Lite')
Skeleton based action recognition
ST-GCN (Custom') | AGCN (Adaptive') | 2s-AGCN (Adaptive') | CTR-GCN (GCN')
Sequence action detection method
BMN (One-stage')
Temporal segment
MS-TCN | ASRF
Spatio-temporal motion detection method
SlowFast+Fast R-CNN
Multimodal
ActBERT (Learning') | T2VLAD (Retrieval')
Video target segmentation
CFBI (Semi') | MA-Net (Supervised')
Monocular depth estimation
ADDS (Unsupervised')
", + "type": "code", + "location": "/english_documents/model_zoo/README.md:101-106" + }, + "4303": { + "file_id": 367, + "content": "This code represents an empty table cell or row, likely within a HTML table structure.", + "type": "comment" + }, + "4304": { + "file_id": 368, + "content": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md", + "type": "filepath" + }, + "4305": { + "file_id": 368, + "content": "The code documents a SlowFast_FasterRCNN model for action detection tasks, providing installation and processing instructions. It trains and tests the model with PaddleDetection and exports it for inference, using GPU acceleration and disabling TensorRT optimization.", + "type": "summary" + }, + "4306": { + "file_id": 368, + "content": "[简体中文](../../../zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md) | English\n# SlowFast_FasterRCNN\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install moviepy\npython -m pip install et_xmlfile\npython -m pip install paddledet\n```\n## Introduction\nThe [SlowFast](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md) model is one of the high-precision models in the video field. For action detection task, it is also neccessary to detect the person in current frame. Therefore, the SlowFast_FasterRCNN model takes human detection results and video frames as input, extracts spatiotemporal features through the SlowFast model, and then uses FasterRCNN's head gets the actions and positions of humans in the frame.\nThe corresponding AI Studio Notebook Link:[基于SlowFast+FasterRCNN的动作识别](https://aistudio.baidu.com/aistudio/projectdetail/3267637?contributionType=1)", + "type": "code", + "location": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:1-24" + }, + "4307": { + "file_id": 368, + "content": "This code provides documentation for the SlowFast_FasterRCNN model, a high-precision video model used for action detection tasks. It takes human detection results and video frames as input, uses the SlowFast model to extract spatiotemporal features, and employs FasterRCNN's head to obtain the actions and positions of humans in the frame. Users need to install additional dependencies before getting started.", + "type": "comment" + }, + "4308": { + "file_id": 368, + "content": "For details, please refer to the paper [SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf).\n## Data\nWe use [AVA dataset](https://research.google.com/ava/download.html) for action detection. The AVA v2.2 dataset contains 430 videos split into 235 for training, 64 for validation, and 131 for test. 
Each video has 15 minutes annotated in 1 second intervals.\n### 1 Dowload Videos\n```\nbash download_videos.sh\n```\n### 2 Download Annotations\n```\nbash download_annotations.sh\n```\n### 3 Download Proposals\n```\nbash fetch_ava_proposals.sh\n```\n### 4 Cut Videos\n```\nbash cut_videos.sh\n```\n### 5 Extract Frames\n```\nbash extract_rgb_frames.sh\n```\nFor AVA v2.1, there is a simple introduction to some key files:\n* 'ava_videos_15min_frames' dir stores video frames extracted with FPS as the frame rate;\n* 'ava_train_v2.1.csv' file stores the trainning annotations;\n* 'ava_train_excluded_timestamps_v2.1.csv' file stores excluded timestamps;\n* 'ava_dense_proposals_train.FAIR.recall_93.9.pkl' file stores humans' bboxes and scores of key frames;", + "type": "code", + "location": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:26-64" + }, + "4309": { + "file_id": 368, + "content": "The provided code is a set of instructions for downloading videos, annotations, and proposals as well as cutting videos and extracting frames from the AVA dataset. This dataset contains 430 videos annotated in 1 second intervals for action detection using SlowFast Networks.", + "type": "comment" + }, + "4310": { + "file_id": 368, + "content": "* 'ava_action_list_v2.1_for_activitynet_2018.pbtxt' file stores为 action list.\n## Train\n* `-c`: config file path;\n* `-w`: weights of model. The pretrained model can be downloaded from the table below;\n* `--validate`: evaluate model during training.\n```\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=logdir.ava main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava.yaml\n```\n## Test\nTest model based on the best model:\n```\npython main.py --test \\\n -w output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams \\\n -c configs/detection/ava/ava.yaml\n```\n| architecture | depth | Pretrain Model | frame length x sample rate | MAP | AVA version | model |\n| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |------------- |\n| SlowFast | R50 | [Kinetics 400](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | 8 x 8 | 23.2 | 2.1 | [`link`](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SlowFastRCNN_AVA.pdparams) |", + "type": "code", + "location": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:65-90" + }, + "4311": { + "file_id": 368, + "content": "This code describes the training and testing procedures for a SlowFast model using Faster RCNN on AVA dataset. The training process requires a config file, pre-trained model weights, and evaluates the model during training with the --validate flag. Testing is done based on the best model provided with specifications like architecture, depth, pretrain model, frame length, sample rate, MAP, AVA version, and a link to the trained model.", + "type": "comment" + }, + "4312": { + "file_id": 368, + "content": "## Inference\nThe action detection of this project is divided into two stages. 
In the first stage, humans' proposals are obtained, and then input into the SlowFast+FasterRCNN model for action recognition.\nFor human detection,you can use the trained model in [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection).\nInstall PaddleDetection:\n```\ncd PaddleDetection/\npip install -r requirements.txt\n!python setup.py install\n```\nDownload detection model:\n```\n# faster_rcnn_r50_fpn_1x_coco as an example\nwget https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams\n```\nexport model:\n```\npython tools/export_model.py \\\n -c configs/detection/ava/ava.yaml \\\n -o inference_output \\\n -p output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams\n```\ninference based on the exported model:\n```\npython tools/predict.py \\\n -c configs/detection/ava/ava.yaml \\\n --input_file \"data/-IELREHXDEMO.mp4\" \\\n --model_file \"inference_output/AVA_SlowFast_FastRcnn.pdmodel\" \\\n --params_file \"inference_output/AVA_SlowFast_FastRcnn.pdiparams\" \\", + "type": "code", + "location": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:93-126" + }, + "4313": { + "file_id": 368, + "content": "In this code, the inference process is outlined for an action detection project using SlowFast+FasterRCNN model. It requires installing PaddleDetection and downloading a detection model from provided URL. The \"export_model\" script prepares the model for inference, while the \"predict.py\" script performs inference based on the exported model.", + "type": "comment" + }, + "4314": { + "file_id": 368, + "content": " --use_gpu=True \\\n --use_tensorrt=False\n```", + "type": "code", + "location": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:127-129" + }, + "4315": { + "file_id": 368, + "content": "The code sets `use_gpu` to True and `use_tensorrt` to False. This means the model will use GPU acceleration and not utilize TensorRT for optimizing performance.", + "type": "comment" + }, + "4316": { + "file_id": 369, + "content": "/english_documents/model_zoo/estimation/adds.md", + "type": "filepath" + }, + "4317": { + "file_id": 369, + "content": "ADDS-DepthNet code estimates depth using day and night images, requiring scikit-image and matplotlib, utilizes Oxford RobotCar dataset, offers Resnet18_Imagenet pre-trained model addition, and provides training, testing instructions, and download URL. 
It demonstrates PaddlePaddle's predict.py tool for inference and saves results as pseudo-colored depth maps with two input images (RGB and depth estimation).", + "type": "summary" + }, + "4318": { + "file_id": 369, + "content": "[Simplified Chinese](../../../zh-CN/model_zoo/estimation/adds.md) | English\n# ADDS-DepthNet model\n## content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install scikit-image\npython -m pip install matplotlib\n```\n## Introduction\nThis model is based on the ICCV 2021 paper **[Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628)** of Baidu Robotics and Autonomous Driving Laboratory,\nThe self-supervised monocular depth estimation model based on day and night images is reproduced, which utilizes the complementary nature of day and night image data, and slows down the large domain shift of day and night images and the accuracy of depth estimation caused by lighting changes. Impact, the most advanced depth estimation results of all-sky images have been achieved on the challenging Oxford RobotCar data set.", + "type": "code", + "location": "/english_documents/model_zoo/estimation/adds.md:1-23" + }, + "4319": { + "file_id": 369, + "content": "This code is for the ADDS-DepthNet model, which is based on a self-supervised monocular depth estimation paper by Baidu Robotics and Autonomous Driving Laboratory. The code utilizes day and night images to reproduce the model and achieve advanced depth estimation results on the Oxford RobotCar dataset, mitigating the impact of lighting changes between day and night images. Additional dependencies like scikit-image and matplotlib are required before using the model.", + "type": "comment" + }, + "4320": { + "file_id": 369, + "content": "## Data\nFor data download and preparation of Oxford RobotCar dataset, please refer to [Oxford RobotCar dataset data preparation](../../dataset/Oxford_RobotCar.md)\n## Train\n### Oxford RobotCar dataset training\n#### Download and add pre-trained models\n1. Download the image pre-training model [resnet18.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams\n ```\n2. Open `PaddleVideo/configs/estimation/adds/adds.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL: #MODEL field\n framework: \"DepthEstimator\" #Mandatory, indicate the type of network, associate to the'paddlevideo/modeling/framework/'.\n backbone: #Mandatory, indicate the type of backbone, associate to the'paddlevideo/modeling/backbones/'.\n name: 'ADDS_DepthNet'", + "type": "code", + "location": "/english_documents/model_zoo/estimation/adds.md:26-49" + }, + "4321": { + "file_id": 369, + "content": "This code provides instructions for downloading and adding a pre-trained model to the Oxford RobotCar dataset. It mentions the necessary steps to download the pre-training model, Resnet18_Imagenet.pdparams, using the wget command and specifying its path in the adds.yaml file. 
The code also highlights the importance of filling in the correct fields in the configuration file for proper association with the relevant model types and frameworks.", + "type": "comment" + }, + "4322": { + "file_id": 369, + "content": " pretrained: fill in the path here\n ```\n#### Start training\n- The Oxford RobotCar dataset uses a single card for training, and the starting command for the training method is as follows:\n ```bash\n python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20\n ```\n## Test\n- The ADDS-DepthNet model is verified synchronously during training (only the day or night data is verified). You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:\n ```bash\n Already save the best model (rmse)8.5531\n ```\n- Because the model can only test one day or night data set at a given path in the yaml file at a time, to get the complete test score at the beginning of this document, you need to run 4 test commands and record their indicators ( 40m during the day, 60m during the day, 40m at night, 60m at night)\n- Download URL of the trained model: [ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams)", + "type": "code", + "location": "/english_documents/model_zoo/estimation/adds.md:50-72" + }, + "4323": { + "file_id": 369, + "content": "This code snippet provides instructions for training and testing the ADDS-DepthNet model using the Oxford RobotCar dataset. The provided commands initiate the training process with specific configuration file (`configs/estimation/adds/adds.yaml`) and seed value (20). Testing involves running separate commands to test day and night data sets, then recording their respective indicators. A download URL for a pre-trained model is also provided.", + "type": "comment" + }, + "4324": { + "file_id": 369, + "content": "- The test commands are as follows:\n ```bash\n # Night 40m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_night_files.txt\" -o MODEL.head.max_gt_depth=40\n # Night 60m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_night_files.txt\" -o MODEL.head.max_gt_depth=60\n # Daytime 40m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_day_files.txt\" -o MODEL.head.max_gt_depth=40\n # Daytime 60m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_day_files.txt\" -o MODEL.head.max_gt_depth=60\n ```\n The test indicators on the validation dataset of Oxford RobotCar dataset are as follows:", + "type": "code", + "location": "/english_documents/model_zoo/estimation/adds.md:74-90" + }, + "4325": { + "file_id": 369, + "content": "The code provides test commands for running the ADDS model on the Oxford RobotCar dataset with varying maximum ground truth depth limits and different light conditions (night and daytime). 
It uses Python 3.7 to execute the main.py file from the PaddleVideo library, configs/estimation/adds/adds.yaml configuration, and specific dataset files for testing.", + "type": "comment" + }, + "4326": { + "file_id": 369, + "content": " | version | Max Depth | Abs Rel | Sq Rel | RMSE | RMSE log | | | |\n | ----------- | --------- | ------- | ------ | ----- | ------- | ----------------- |------------------- | ------------------- |\n | ours(night) | 40 | 0.209 | 1.741 | 6.031 | 0.243 | 0.708 | 0.923 | 0.975 |\n | ours(night) | 60 | 0.207 | 2.052 | 7.888 | 0.258 | 0.686 | 0.909 | 0.970 |\n | ours(day) | 40 | 0.114 | 0.574 | 3.411 | 0.157 | 0.860 | 0.977 | 0.993 |\n | ours(day) | 60 | 0.119 | 0.793 | 4.842 | 0.173 | 0.838 | 0.967 | 0.991 |\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/estimation/adds/adds.yaml -p data/ADDS_car.pdparams -o inference/ADDS\n```\nThe above command will", + "type": "code", + "location": "/english_documents/model_zoo/estimation/adds.md:92-107" + }, + "4327": { + "file_id": 369, + "content": "The code presents a table comparing performance metrics of different models under various conditions. It shows the version, maximum depth, and several error measures for each model. The table also includes whether or not the delta value is less than 1.25 raised to different powers. The text describes how to run a command to export an inference model using Python script with specific configuration file, pre-trained parameters, and output directory.", + "type": "comment" + }, + "4328": { + "file_id": 369, + "content": " generate the model structure file `ADDS.pdmodel` and model weight files `ADDS.pdiparams` and `ADDS.pdiparams.info` files needed for prediction, all of which are stored in the `inference/ADDS/` directory\nFor the meaning of each parameter in the above bash command, please refer to [Model Inference Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/en/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)\n### Use predictive engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.png \\\n --config configs/estimation/adds/adds.yaml \\\n --model_file inference/ADDS/ADDS.pdmodel \\\n --params_file inference/ADDS/ADDS.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nAt the end of the inference, the depth map estimated by the model will be saved in pseudo-color by default.\nThe following is a sample picture and the corresponding predicted depth map:", + "type": "code", + "location": "/english_documents/model_zoo/estimation/adds.md:107-124" + }, + "4329": { + "file_id": 369, + "content": "This code snippet demonstrates the usage of PaddlePaddle's predict.py tool for model inference. It uses a pre-trained model, ADDS, to estimate depth maps from input images. The model files are specified using the --model_file and --params_files parameters, while the input image file is provided with --input_file. The command also includes options for GPU usage (--use_gpu) and TensorRT acceleration (--use_tensorrt). 
The inference results will be saved as pseudo-colored depth maps by default.", + "type": "comment" + }, + "4330": { + "file_id": 369, + "content": "\"image\"\n\"depth\"\n## Reference\n- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun", + "type": "code", + "location": "/english_documents/model_zoo/estimation/adds.md:126-133" + }, + "4331": { + "file_id": 369, + "content": "The code includes two images, one for regular RGB image and the other for depth estimation from the paper \"Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation\" by Liu et al.", + "type": "comment" + }, + "4332": { + "file_id": 370, + "content": "/english_documents/model_zoo/localization/bmn.md", + "type": "filepath" + }, + "4333": { + "file_id": 370, + "content": "The BMN model, using three modules and the ActivityNet dataset, is trained and inferred for temporal action proposal generation with given commands. The export_model script and predict script are utilized to perform inference, providing logs as examples.", + "type": "summary" + }, + "4334": { + "file_id": 370, + "content": "[简体中文 ](../../../zh-CN/model_zoo/localization/bmn.md) | English\n# BMN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nBMN model contains three modules: Base Module handles the input feature sequence, and out- puts feature sequence shared by the following two modules; Temporal Evaluation Module evaluates starting and ending probabilities of each location in video to generate boundary probability sequences; Proposal Evaluation Module con- tains the BM layer to transfer feature sequence to BM fea- ture map, and contains a series of 3D and 2D convolutional layers to generate BM confidence map.\n

[Figure: BMN Overview]
\n## Data\nWe use ActivityNet dataset to train this model,data preparation please refer to [ActivityNet dataset](../../dataset/ActivityNet.md).\n## Train\nYou can start training by such command:\n```bash", + "type": "code", + "location": "/english_documents/model_zoo/localization/bmn.md:1-35" + }, + "4335": { + "file_id": 370, + "content": "The code describes the BMN model, which consists of three modules: Base Module, Temporal Evaluation Module, and Proposal Evaluation Module. It uses the ActivityNet dataset for training and provides instructions on how to start the training process using a command.", + "type": "comment" + }, + "4336": { + "file_id": 370, + "content": "export CUDA_VISIBLE_DEVICES=0,1,2,3\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml\n```\n## Test\nYou can start testing by such command:\n```bash\npython main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00009.pdparams -o DATASET.test_batch_size=1\n```\n- For now, we only support testing with **single card** and `batch_size=1`.\n- Please download [activity\\_net\\_1\\_3\\_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json) label file and specify the path to `METRIC.ground_truth_filename` in config file.\n- Args `-w` is used to specifiy the model path,you can download our model in [BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams)\nTest accuracy in ActivityNet1.3:\n| AR@1 | AR@5 | AR@10 | AR@100 | AUC |\n| :---: | :---: | :---: | :---: | :---: |\n| 33.26 | 49.48 | 56.86 | 75.19 | 67.23% |\n## Inference\n### export inference model\n To get model architecture file `BMN.pdmodel` and parameters file `BMN.pdiparams`, use: ", + "type": "code", + "location": "/english_documents/model_zoo/localization/bmn.md:36-68" + }, + "4337": { + "file_id": 370, + "content": "This code is launching a PaddlePaddle distributed localization model named BMN using 4 GPUs and running it on the provided configuration file. It also provides instructions for testing, specifying the required label file and model path, as well as inference commands to export the architecture and parameters files.", + "type": "comment" + }, + "4338": { + "file_id": 370, + "content": "```bash\npython3.7 tools/export_model.py -c configs/localization/bmn.yaml \\\n -p data/BMN.pdparams \\\n -o inference/BMN\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_feat.list \\\n --config configs/localization/bmn.yaml \\\n --model_file inference/BMN/BMN.pdmodel \\\n --params_file inference/BMN/BMN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nBMN Inference results of data/example_feat.npy :\n{'score': 0.7968077063560486, 'segment': [0.0, 122.9877]}\n{'score': 0.49097609519958496, 'segment': [12.423000000000002, 124.23]}\n{'score': 0.21395835280418396, 'segment': [39.7536, 122.9877]}\n{'score': 0.2106524258852005, 'segment': [0.0, 109.3224]}", + "type": "code", + "location": "/english_documents/model_zoo/localization/bmn.md:70-96" + }, + "4339": { + "file_id": 370, + "content": "The code exports the BMN model and runs inference on a set of feature files, producing output segments and scores. 
The export_model script requires the configuration file, the PDParams file, and outputs an inference folder. The predict script uses the configuration file, two model files, and a list of input feature files to perform inference. It prints the score and segment for each input, with example logs provided.", + "type": "comment" + }, + "4340": { + "file_id": 370, + "content": "{'score': 0.06876271963119507, 'segment': [23.6037, 114.2916]}\n```\nInference results are saved in `data/bmn/BMN_INFERENCE_results`. \n## Reference\n- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.", + "type": "code", + "location": "/english_documents/model_zoo/localization/bmn.md:97-104" + }, + "4341": { + "file_id": 370, + "content": "The code snippet represents the inference results of BMN (Boundary-Matching Network) for temporal action proposal generation. These results, containing a score and segment information, are saved in the specified directory. The BMN paper reference is provided for further information.", + "type": "comment" + }, + "4342": { + "file_id": 371, + "content": "/english_documents/model_zoo/localization/yowo.md", + "type": "filepath" + }, + "4343": { + "file_id": 371, + "content": "YOWO is a single-stage feature extraction network with channel fusion and attention. Pre-trained on UCF101-24, it provides model structure and weight files for prediction with high confidence.", + "type": "summary" + }, + "4344": { + "file_id": 371, + "content": "[简体中文](../../../zh-CN/model_zoo/localization/yowo.md) | English\n# YOWO\n## Content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nYOWO is a single-stage network with two branches. One branch extracts spatial features of key frames (i.e., the current frame) via 2D-CNN, while the other branch acquires spatio-temporal features of clips consisting of previous frames via 3D-CNN. To accurately aggregate these features, YOWO uses a channel fusion and attention mechanism that maximizes the inter-channel dependencies. Finally, the fused features are subjected to frame-level detection.\n
\n## Data\nUCF101-24 data download and preparation please refer to [UCF101-24 data preparation](../../dataset/ucf24.md)\n## Train\n### UCF101-24 data set training\n#### Download and add pre-trained models\n1. Download the pre-training model [resnext-101-kinetics](https://vide", + "type": "code", + "location": "/english_documents/model_zoo/localization/yowo.md:1-36" + }, + "4345": { + "file_id": 371, + "content": "YOWO is a single-stage network with 2 branches for spatial and spatio-temporal feature extraction. It uses channel fusion and attention mechanism to aggregate features, then performs frame-level detection. UCF101-24 data preparation instructions provided. Pre-trained models like resnext-101-kinetics are needed for training.", + "type": "comment" + }, + "4346": { + "file_id": 371, + "content": "otag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams) 和 [darknet](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams\n ```\n2. Open `PaddleVideo/configs/localization/yowo.yaml`, and fill in the downloaded weight storage path below `pretrained_2d:` and `pretrained_3d:` respectively\n ```yaml\n MODEL:\n framework: \"YOWOLocalizer\"\n backbone:\n name: \"YOWO\"\n num_class: 24\n pretrained_2d: fill in the path of 2D pre-training model here\n pretrained_3d: fill in the path of 3D pre-training model here\n ```\n#### Start training\n- The UCF101-24 data set uses 1 card for training, and the start command of the training method is as follows:\n ```bash\n python3 main.py -c configs/localization/yowo.yaml --validate --seed=1", + "type": "code", + "location": "/english_documents/model_zoo/localization/yowo.md:36-60" + }, + "4347": { + "file_id": 371, + "content": "This code provides instructions for downloading and configuring pre-trained models (`darknet.pdparam` and `resnext101_kinetics.pdparams`) for the YOWOLocalizer model in PaddleVideo. The models need to be added under `pretrained_2d:` and `pretrained_3d:` respectively in the `yowo.yaml` file. After that, use the command `python3 main.py -c configs/localization/yowo.yaml --validate --seed=1` to start training on the UCF101-24 dataset using 1 card.", + "type": "comment" + }, + "4348": { + "file_id": 371, + "content": " ```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n python3 main.py --amp -c configs/localization/yowo.yaml --validate --seed=1\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.\n## Test\n- The YOWO model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. 
The log example is as follows:\n ```\n Already save the best model (fsocre)0.8779\n ```\n- Since the verification index of the YOWO model test mode is **Frame-mAP (@ IoU 0.5)**, which is different from the **fscore** used in the verification mode during the training process, so the v", + "type": "code", + "location": "/english_documents/model_zoo/localization/yowo.md:61-80" + }, + "4349": { + "file_id": 371, + "content": "Enables AMP mixed-precision for faster training, using the command 'python3 main.py --amp -c configs/localization/yowo.yaml --validate --seed=1'. Customize parameters to train or test on different datasets, following the naming format 'model_dataset name_file format_data format_sampling method.yaml'. During training, find 'best' in logs to obtain model test accuracy using Frame-mAP (@ IoU 0.5), which differs from verification fscore used during training.", + "type": "comment" + }, + "4350": { + "file_id": 371, + "content": "erification index recorded in the training log, called `fscore `, does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n ```bash\n python3 main.py -c configs/localization/yowo.yaml --test --seed=1 -w 'output/YOWO/YOWO_epoch_00005.pdparams'\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of UCF101-24 are as follows:\n | Model | 3D-CNN backbone | 2D-CNN backbone | Dataset |Input | Frame-mAP
(@ IoU 0.5) | checkpoints |\n | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: |\n | YOWO | 3D-ResNext-101 | Darknet-19 | UCF101-24 | 16-frames, d=1 | 80.94 | [YOWO.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams) |\n## Inference\n### Export inference model\n```bash\npython3 tools/export_model.py -c configs/localization/yowo.yaml -p 'output/YOWO/YOWO_epoch_00005.pdparams'", + "type": "code", + "location": "/english_documents/model_zoo/localization/yowo.md:80-101" + }, + "4351": { + "file_id": 371, + "content": "The code snippet shows how to evaluate the YOWO model's performance using a test mode and provides information about the input size, frame-mAP with IoU 0.5, and the checkpoint used for testing on UCF101-24 dataset. Additionally, it demonstrates how to export the inference model for future use.", + "type": "comment" + }, + "4352": { + "file_id": 371, + "content": "```\nThe above command will generate the model structure file `YOWO.pdmodel` and the model weight file `YOWO.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../usage.md#2-infer)\n### Use prediction engine inference\n- Download the test video [HorseRiding.avi](https://videotag.bj.bcebos.com/Data/HorseRiding.avi) for a quick experience, or via the wget command. The downloaded video should be placed in the `data/ucf24` directory:\n```bash\nwget -nc https://videotag.bj.bcebos.com/Data/HorseRiding.avi\n```\n- Run the following command for inference:\n```bash\npython3 tools/predict.py -c configs/localization/yowo.yaml -i 'data/ucf24/HorseRiding.avi' --model_file ./inference/YOWO.pdmodel --params_file ./inference/YOWO.pdiparams\n```\n- When inference is over, the prediction results in image form will be saved in the `inference/YOWO_infer` directory. The image sequence can be converted to a gif by running the following command to complete the final visualisation.", + "type": "code", + "location": "/english_documents/model_zoo/localization/yowo.md:102-122" + }, + "4353": { + "file_id": 371, + "content": "This code explains how to generate the YOWO model structure file and weight file for prediction. It also provides instructions on how to use the prediction engine for inference using a test video, downloading it if necessary, and saving the results as an image sequence that can be converted into a gif for visualization.", + "type": "comment" + }, + "4354": { + "file_id": 371, + "content": "```\npython3 data/ucf24/visualization.py --frames_dir ./inference/YOWO_infer/HorseRiding --duration 0.04\n```\nThe resulting visualization is as follows:\n
\n \"Horse\n
\nIt can be seen that using the YOWO model trained on UCF101-24 to predict `data/ucf24/HorseRiding.avi`, the category of each frame output is HorseRiding with a confidence level of about 0.80.\n## Reference\n- [You Only Watch Once: A Unified CNN Architecture for Real-Time Spatiotemporal Action Localization](https://arxiv.org/pdf/1911.06644.pdf), Köpüklü O, Wei X, Rigoll G.", + "type": "code", + "location": "/english_documents/model_zoo/localization/yowo.md:124-138" + }, + "4355": { + "file_id": 371, + "content": "This code is running a visualization script for the YOWO model trained on UCF101-24. It predicts the category of frames in \"data/ucf24/HorseRiding.avi\" as HorseRiding with high confidence (about 0.80).", + "type": "comment" + }, + "4356": { + "file_id": 372, + "content": "/english_documents/model_zoo/multimodal/actbert.md", + "type": "filepath" + }, + "4357": { + "file_id": 372, + "content": "ActBERT is a multimodal pretrain task using global action info and TaNgled Transformer block (TNT) for text-object interactions. It outperforms state-of-the-art in video-language tasks and can be trained on HowTo100M dataset with AMP for faster training, evaluated on MSR-VTT, and found at the provided link.", + "type": "summary" + }, + "4358": { + "file_id": 372, + "content": "[简体中文](../../../zh-CN/model_zoo/multimodal/actbert.md) | English\n# ActBERT\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install paddlenlp\npython -m pip install lmdb\n```\n## Introduction\nActbert is proposed by Baidu in CVPR2020 for multimodal pretrain task. It leverage global action information to cat- alyze mutual interactions between linguistic texts and local regional objects. This method introduce a TaNgled Transformer block (TNT) to encode three sources of information, i.e., global actions, local regional objects, and linguistic descriptions. ActBERT significantly outperforms the state- of-the-art in five downstream video-and-language tasks, i.e., text-video clip retrieval, video captioning, video question answering, action segmentation, and action step localization.\n
", + "type": "code", + "location": "/english_documents/model_zoo/multimodal/actbert.md:1-25" + }, + "4359": { + "file_id": 372, + "content": "This is an introduction to ActBERT, a multimodal pretrain task proposed by Baidu in CVPR2020. It uses global action information to analyze mutual interactions between linguistic texts and local regional objects. The method introduces TaNgled Transformer block (TNT) to encode three sources of information. ActBERT outperforms state-of-the-art in five video-and-language tasks, including text-video clip retrieval, video captioning, and action segmentation.", + "type": "comment" + }, + "4360": { + "file_id": 372, + "content": "
\n## Data\nPlease refer to Kinetics400 data download and preparation doc [HowTo100M-data](../../dataset/howto100m.md)\nPlease refer to MSR-VTT data download and preparation doc [MSR-VTT-data](../../dataset/umsrvtt.md)\n## Train\n### Train on HowTo100M\n#### download pretrain-model\nPlease download [bert-base-uncased](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams) as pretraind model:\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams\n```\nand add path to `MODEL.framework.backbone.pretrained` in config file as:\n```yaml\nMODEL:\n framework: \"ActBert\"\n backbone:\n name: \"BertForMultiModalPreTraining\"\n pretrained: your weight path\n```\n- We provide training option on small data, config file is for reference only.\n#### Start training\n- Train ActBERT on HowTo100M scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_actbert main.py --validate -c configs/multimodal/actbert/actbert.yaml", + "type": "code", + "location": "/english_documents/model_zoo/multimodal/actbert.md:26-65" + }, + "4361": { + "file_id": 372, + "content": "This code describes how to train ActBERT on HowTo100M dataset. It first requires downloading the pretrain-model \"bert-base-uncased\" from a specified URL and adding its path to the config file. Then, it provides the command to start training using the provided script with specific configuration and GPU allocation.", + "type": "comment" + }, + "4362": { + "file_id": 372, + "content": "```\n- AMP is useful for speeding up training:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_actbert main.py --amp --validate -c configs/multimodal/actbert/actbert.yaml\n```\n## Test\n- Evaluation performs on downstream task, i.e. text-video clip retrieval on MSR-VTT dataset, test accuracy can be obtained using scripts:\n```bash\npython3.7 main.py --test -c configs/multimodal/actbert/actbert_msrvtt.yaml -w Actbert.pdparams\n```\nMetrics on MSR-VTT:\n| R@1 | R@5 | R@10 | Median R | Mean R | checkpoints |\n| :------: | :----------: | :----: | :----: | :----: | :----: |\n| 8.6 | 31.2 | 45.5 | 13.0 | 28.5 | [ActBERT.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ActBERT.pdparams) |\n## Reference\n- [ActBERT: Learning Global-Local Video-Text Representations\n](https://arxiv.org/abs/2011.07231), Linchao Zhu, Yi Yang", + "type": "code", + "location": "/english_documents/model_zoo/multimodal/actbert.md:66-98" + }, + "4363": { + "file_id": 372, + "content": "This code shows how to train a model using PaddlePaddle with Automatic Mixed Precision (AMP) for faster training, evaluate it on the MSR-VTT dataset, and provides metrics such as R@1, R@5, R@10, and Median R. The ActBERT model can be found at the provided link in the reference section.", + "type": "comment" + }, + "4364": { + "file_id": 373, + "content": "/english_documents/model_zoo/partition/transnetv2.md", + "type": "filepath" + }, + "4365": { + "file_id": 373, + "content": "TransNetV2 is a deep learning-based video segmentation model for shot transition detection, using DDCNN V2 structure, RGB color histograms, and frame similarity. 
The provided code demonstrates usage of predict.py to infer predictions on input files, with output probabilities and lens boundaries.", + "type": "summary" + }, + "4366": { + "file_id": 373, + "content": "[简体中文](../../../zh-CN/model_zoo/partition/transnetv2.md) | English\n# TransNetV2\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Details](#Details)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install ffmpeg-python==0.2.0\n```\n## Introduction\nTransNetV2 is a video segmentation model based on deep learning. It performs feature learning through the DDCNN V2 structure, and adds RGB color histograms and video frame similarity for more effective feature extraction, and finally obtains whether each frame is a shot boundary frame Probability, thereby completing the video segmentation. The algorithm has good effect and efficient calculation, which is very suitable for industrial landing.\n![](../../../images/transnetv2.png)\nThis code currently only supports model inference, and model training and testing will be provided in the future.\nPlease refer to the pap", + "type": "code", + "location": "/english_documents/model_zoo/partition/transnetv2.md:1-28" + }, + "4367": { + "file_id": 373, + "content": "TransNetV2 is a video segmentation model based on deep learning using DDCNN V2 structure for feature learning, RGB color histograms, and video frame similarity for effective feature extraction. This code supports inference only, with training and testing to be provided later. Suitable for industrial applications, more details are available in the paper.", + "type": "comment" + }, + "4368": { + "file_id": 373, + "content": "er for details. [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838)\n## Data\ncoming soon\n## Train\ncoming soon\n## Test\ncoming soon\n## Inference\nLoad the TransNetV2 weights trained on ClipShots and TRECVID IACC.3 dataset [TransNetV2_shots.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams), or download through the command line\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams\n```\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/partitioners/transnetv2/transnetv2.yaml -p data/TransNetV2_shots.pdparams -o inference/TransNetV2\n```\nThe above command will generate the model structure file`TransNetV2.pdmodel`and the model weight file`TransNetV2.pdiparams`required for prediction.\nFor the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-Model Reasoning)", + "type": "code", + "location": "/english_documents/model_zoo/partition/transnetv2.md:28-62" + }, + "4369": { + "file_id": 373, + "content": "This code provides instructions to load and export the TransNet V2 inference model for shot transition detection. It mentions the required weights trained on ClipShots and TRECVID IACC.3 dataset, as well as the URL to download them using wget command. 
The script also outlines how to use `export_model.py` tool to generate the `TransNetV2.pdmodel` and `TransNetV2.pdiparams` files for prediction purposes.", + "type": "comment" + }, + "4370": { + "file_id": 373, + "content": "### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/partitioners/transnetv2/transnetv2.yaml \\\n --model_file inference/TransNetV2/TransNetV2.pdmodel \\\n --params_file inference/TransNetV2/TransNetV2.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nBy defining the `output_path` parameters in `transnetv2.yaml`, the prediction probability of each frame can be output to `{output_path}/example_predictions.txt`, and the predicted lens boundary is output to `{output_path}/example_scenes.txt`.\nBy defining the `visualize` parameter in `transnetv2.yaml`, the predicted results can be visualized, and the visual results are saved to `{output_path}/example_vis.png`.\n## Reference\n- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tomáš Souček, Jakub Lokoč", + "type": "code", + "location": "/english_documents/model_zoo/partition/transnetv2.md:64-80" + }, + "4371": { + "file_id": 373, + "content": "This code snippet demonstrates the usage of predict.py to infer TransNetV2 model predictions for a given input file (example.avi). The model configuration is specified in transnetv2.yaml, and the trained model files are provided as inputs. Prediction probability per frame is output to example_predictions.txt and lens boundary is output to example_scenes.txt. Visualization can be enabled for better interpretation of results.", + "type": "comment" + }, + "4372": { + "file_id": 374, + "content": "/english_documents/model_zoo/recognition/agcn.md", + "type": "filepath" + }, + "4373": { + "file_id": 374, + "content": "This code implements AGCN model for improved ST-GCN accuracy on FSD-10 and NTU-RGBD datasets, achieving high Top-1 accuracies. 
It provides instructions for data preparation, training, testing, inference, evaluation, and exports an AGCN model for video recognition using Multi-stream Adaptive Graph Convolutional Networks.", + "type": "summary" + }, + "4374": { + "file_id": 374, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/agcn.md) | English\n# AGCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe implemented Adaptive Graph Convolution Network to improve the accuracy of [ST-GCN](./stgcn.md).\n## Data\nPlease refer to FSD-10 data download and preparation doc [FSD](../../dataset/fsd.md)\nPlease refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)\n## Train\n### Train on FSD\n- Train AGCN on FSD scripts:\n```bash\npython3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml\n```\n- Turn off `valid` when training, as validation dataset is not available for the competition.\n### Train on NTU-RGBD\n- Train AGCN on NTU-RGBD scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_agcn main.py --validate -c configs/recognition/agcn/agcn_ntucs.yaml\n```\n- config file `agcn_ntucs.yaml` corresponding to the config of AGCN on NTU-RGB+D dataset with cross-subject splits.", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn.md:1-46" + }, + "4375": { + "file_id": 374, + "content": "This code describes the Adaptive Graph Convolution Network (AGCN) implementation for improving the accuracy of ST-GCN, trained on FSD-10 and NTU-RGBD datasets. It provides instructions for data preparation, training, testing, and inference.", + "type": "comment" + }, + "4376": { + "file_id": 374, + "content": "## Test\n### Test onf FSD\n- Test scripts:\n```bash\npython3.7 main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -w output/AGCN/AGCN_epoch_00100.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\n- Evaluation results will be saved in `submission.csv` file, final score can be obtained in [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115).\nAccuracy on FSD dataset:\n| Test_Data | Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| Test_A | 62.29 | [AGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams)|\n### Test on NTU-RGB+D\n- Test scripts:\n```bash\npython3.7 main.py --test -c configs/recognition/agcn/agcn_ntucs.yaml -w output/AGCN/AGCN_best.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| split | Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| cross-subject | 83.27 | [AGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_ntucs.pdparams)|", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn.md:49-84" + }, + "4377": { + "file_id": 374, + "content": "This code provides test scripts to evaluate the performance of the AGCN model on two datasets: FSD and NTU-RGB+D. The test scripts require specifying a configuration file (-c) and a weight path (-w). Evaluation results are saved in submission.csv, with final scores available on the competition website. Testing on FSD dataset returns a Top-1 accuracy of 62.29, while testing on NTU-RGB+D dataset (cross-subject split) returns a Top-1 accuracy of 83.27. 
The respective model checkpoints are also provided as links for further exploration.", + "type": "comment" + }, + "4378": { + "file_id": 374, + "content": "## Inference\n### export inference model\n To get model architecture file `AGCN.pdmodel` and parameters file `AGCN.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml \\\n -p data/AGCN_fsd.pdparams \\\n -o inference/AGCN\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \\\n --config configs/recognition/agcn/agcn_fsd.yaml \\\n --model_file inference/AGCN/AGCN.pdmodel \\\n --params_file inference/AGCN/AGCN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/fsd10/example_skeleton.npy\n top-1 class: 27\n top-1 score: 0.8965644240379333", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn.md:87-117" + }, + "4379": { + "file_id": 374, + "content": "This code provides instructions on how to export and use an inference model called AGCN for video recognition. It shows the command to obtain the architecture file (AGCN.pdmodel) and parameter file (AGCN.pdiparams), as well as an example of how to run prediction using the provided files, specifying input data, configuration, and whether to use GPU or not. The output includes the top-1 class and its corresponding score.", + "type": "comment" + }, + "4380": { + "file_id": 374, + "content": "```\n## Reference\n- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin\n- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu\n- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu\n- Many thanks to [li7819559](https://github.com/li7819559) and [ZhaoJingjing713](https://github.com/ZhaoJingjing713) for contributing the code.", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn.md:118-129" + }, + "4381": { + "file_id": 374, + "content": "This code snippet implements a Multi-stream Adaptive Graph Convolutional Network for skeleton-based action recognition. It utilizes two input streams (spatial and temporal) to process the data and applies adaptive graph convolution on each stream separately, followed by concatenation of the two streams before being passed through MLP and softmax layers.", + "type": "comment" + }, + "4382": { + "file_id": 375, + "content": "/english_documents/model_zoo/recognition/agcn2s.md", + "type": "filepath" + }, + "4383": { + "file_id": 375, + "content": "The 2s-AGCN model, an enhanced ST-GCN version for motion recognition, utilizes dual-flow adaptive convolutional networks and focuses on second-order bone data. 
Code offers test scripts, accuracy results, and download links for models trained on different datasets, with PaddleVideo exporting an action recognition model using AGCN2s.", + "type": "summary" + }, + "4384": { + "file_id": 375, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/2sAGCN.md) | English\n# CTR-GCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\n![模型结构图](../../../images/agcn2s.png)\n[2s-AGCN](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf) is an improved article on ST-GCN published in CVPR2019. It proposes a dual-flow adaptive convolutional network, which improves the shortcomings of the original ST-GCN. In the existing GCN based approach, the topology of the graph is set manually and fixed to all layers and input samples. In addition, the second-order information of bone data (bone length and orientation) is naturally more beneficial and discriminating for motion recognition, which was rarely studied in the methods at that time. Therefore, this paper puts forward a node and bones of tw", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn2s.md:1-20" + }, + "4385": { + "file_id": 375, + "content": "This code provides an introduction to the 2s-AGCN model, an improved version of ST-GCN published in CVPR2019. It uses a dual-flow adaptive convolutional network and focuses on the second-order information of bone data for motion recognition.", + "type": "comment" + }, + "4386": { + "file_id": 375, + "content": "o kinds of information fusion based on skeleton shuangliu network, and join in figure convolution adjacency matrix adaptive matrix, a sharp rise in the bones of gesture recognition accuracy, also has laid the foundation for subsequent work (the subsequent basic skeleton gesture recognition are based on the flow of network framework).\n## Data\nData download and processing are consistent with CTR-GCN. For details, please refer to [NTU-RGBD Data Preparation](../../dataset/ntu-rgbd.md)\n## Train\n### Train on NTU-RGBD\nTrain CTR-GCN on NTU-RGBD scripts using single gpu:\n```bash\n# train cross subject with bone data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_bone.yaml --seed 1\n# train cross subject with joint data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_joint.yaml --seed 1\n# train cross view with bone data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_bone.yaml --seed 1\n# train cross view with joint data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_joint.yaml --seed 1", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn2s.md:20-40" + }, + "4387": { + "file_id": 375, + "content": "This code provides information on the AGCN2S model, a skeleton-based gesture recognition network. It uses data from NTU-RGBD, with details of its preparation found in another file. The code also outlines how to train the CTR-GCN model on various configurations such as cross-subject and cross-view training scenarios, using bone or joint data. 
This serves as a guide for running the model's training scripts.", + "type": "comment" + }, + "4388": { + "file_id": 375, + "content": "```\nconfig file `agcn2s_ntucs_joint.yaml` corresponding to the config of 2s-AGCN on NTU-RGB+D dataset with cross-subject splits.\n## Test\n### Test on NTU-RGB+D\nTest scripts:\n```bash\n# test cross subject with bone data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_bone.yaml -w data/2SAGCN_ntucs_bone.pdparams\n# test cross subject with joint data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_joint.yaml -w data/2SAGCN_ntucs_joint.pdparams\n# test cross view with bone data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_bone.yaml -w data/2SAGCN_ntucv_bone.pdparams\n# test cross view with joint data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_joint.yaml -w data/2SAGCN_ntucv_joint.pdparams\n```\n* Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| | CS | CV |\n| :------------: | :---: | :----: |\n| Js-AGCN(joint) | 85.8% | 94.13% |\n| Bs-AGCN(bone) | 86.7% | 93.9% |\nTrain log:[download](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/work_dir/ntu)", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn2s.md:41-71" + }, + "4389": { + "file_id": 375, + "content": "The code provides test scripts for the 2s-AGCN model on the NTU-RGB+D dataset, both with cross-subject and cross-view splits. The accuracy results for joint and bone data are given, along with a download link to the training log.", + "type": "comment" + }, + "4390": { + "file_id": 375, + "content": "VisualDL log:[download](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/runs)\ncheckpoints:\n| CS-Js | CS-Bs | CV-Js | CV-Bs |\n| :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| [ntu_cs_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_joint-48-30674.pdparams) | [ntu_cs_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_bone-44-28170.pdparams) | [ntu_cv_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_joint-38-22932.pdparams) | [ntu_cv_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_bone-49-29400.pdparams) |", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn2s.md:73-79" + }, + "4391": { + "file_id": 375, + "content": "Code snippet contains download links for different checkpoints of the AGCN-2s model trained on various datasets:\n1. ntu_cs_agcn_joint\n2. ntu_cs_agcn_bone\n3. ntu_cv_agcn_joint\n4. 
ntu_cv_agcn_bone", + "type": "comment" + }, + "4392": { + "file_id": 375, + "content": "## Inference\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/agcn2s/2sagcn_ntucs_joint.yaml \\\n -p data/AGCN2s_ntucs_joint.pdparams \\\n -o inference/AGCN2s_ntucs_joint\n```\nTo get model architecture file `AGCN2s_ntucs_joint.pdmodel` and parameters file `AGCN2s_ntucs_joint.pdiparams`.\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \\\n --config configs/recognition/agcn2s/2sagcn_ntucs_joint.yaml \\\n --model_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdmodel \\\n --params_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn2s.md:81-103" + }, + "4393": { + "file_id": 375, + "content": "This code is exporting and inferring a model for action recognition using PaddleVideo's AGCN2s. It uses the `export_model.py` script to generate an inference model archive, which includes the model architecture file (AGCN2s_ntucs_joint.pdmodel) and parameters file (AGCN2s_ntucs_joint.pdiparams). The `predict.py` script is then used to perform inference on input data with the specified configuration and model files, using GPU if available and disabling TensorRT.", + "type": "comment" + }, + "4394": { + "file_id": 375, + "content": "```\n### infer result\n![预测引擎推理结果图](../../../images/agcn2s_result.png)\n## Reference\n- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf), Lei Shi and Yifan Zhang and Jian Cheng and Hanqing Lu", + "type": "code", + "location": "/english_documents/model_zoo/recognition/agcn2s.md:104-112" + }, + "4395": { + "file_id": 375, + "content": "This code block shows the prediction engine result for the AGCN2S model. It displays an image of the prediction results and references the original paper on Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition.", + "type": "comment" + }, + "4396": { + "file_id": 376, + "content": "/english_documents/model_zoo/recognition/attention_lstm.md", + "type": "filepath" + }, + "4397": { + "file_id": 376, + "content": "The AttentionLSTM model is presented, using LSTMs and an Attention layer to weigh frame features. The code trains and tests on YouTube-8M with PaddleVideo, exporting the model for classification. 
It accurately predicts top-1 class 11 with 0.9841 confidence.", + "type": "summary" + }, + "4398": { + "file_id": 376, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/attention_lstm.md) | English\n# AttentionLSTM\n## content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nRecurrent Neural Networks (RNN) are often used in the processing of sequence data, which can model the sequence information of multiple consecutive frames of video, and are commonly used methods in the field of video classification.\nThis model uses a two-way long and short-term memory network (LSTM) to encode all the frame features of the video in sequence. Unlike the traditional method that directly uses the output of the last moment of LSTM, this model adds an Attention layer, and the hidden state output at each moment has an adaptive weight, and then linearly weights the final feature vector. The reference paper implements a two-layer LSTM structure, while **this model implements a two-way LSTM with Attention**.\nThe Attention layer can refer to the paper [AttentionCluster](https://arxiv.org/abs/1711.09550)", + "type": "code", + "location": "/english_documents/model_zoo/recognition/attention_lstm.md:1-19" + }, + "4399": { + "file_id": 376, + "content": "This code introduces the AttentionLSTM model, which utilizes two-way LSTMs to encode all video frame features and adds an Attention layer for adaptive weighting. This improves upon traditional methods by linearly weighing final feature vectors based on hidden state outputs at each moment.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/44.json b/docs/data/44.json new file mode 100644 index 000000000..90e55e76c --- /dev/null +++ b/docs/data/44.json @@ -0,0 +1,544 @@ +{ + "4400": { + "file_id": 376, + "content": "## Data\nPaddleVide provides training and testing scripts on the Youtube-8M dataset. Youtube-8M data download and preparation please refer to [YouTube-8M data preparation](../../dataset/youtube8m.md)\n## Train\n### Youtube-8M data set training\n#### Start training\n- The Youtube-8M data set uses 8 cards for training. In the feature format, video and audio features will be used as input. The training start command of the data is as follows\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml\n ```\n## Test\nThe command is as follows:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_attetion_lstm main.py --test -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml -w \"output/AttentionLSTM/AttentionLSTM_best.pdparams\"\n```\nWhen the test configuration uses the following parameters, the test indicators on the validation data set of Youtube-8M are as follows:", + "type": "code", + "location": "/english_documents/model_zoo/recognition/attention_lstm.md:21-45" + }, + "4401": { + "file_id": 376, + "content": "This code provides instructions on how to train and test a model using PaddleVideo's attention LSTM on the Youtube-8M dataset. 
It mentions the required commands for training and testing, and also states that 8 GPUs are used during the process.", + "type": "comment" + }, + "4402": { + "file_id": 376, + "content": "| Hit@1 | PERR | GAP | checkpoints |\n| :-----: | :---------: | :---: | ----- |\n| 89.05 | 80.49 | 86.30 | [AttentionLSTM_yt8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams) |\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \\\n -p data/AttentionLSTM_yt8.pdparams \\\n -o inference/AttentionLSTM\n```\nThe above command will generate the model structure file `AttentionLSTM.pdmodel` and the model weight file `AttentionLSTM.pdiparams` required for prediction.\nFor the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0.0/docs/en/start.md#2-infer)\n### Use prediction engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.pkl \\\n --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \\", + "type": "code", + "location": "/english_documents/model_zoo/recognition/attention_lstm.md:47-68" + }, + "4403": { + "file_id": 376, + "content": "This code provides instructions to export an inference model and use the prediction engine for it. The exported model will be stored as AttentionLSTM.pdmodel and AttentionLSTM.pdiparams files, which are necessary for making predictions. Users can use the tools/predict.py script with the input file data/example.pkl and the configuration file configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml to perform inference using the prediction engine.", + "type": "comment" + }, + "4404": { + "file_id": 376, + "content": " --model_file inference/AttentionLSTM/AttentionLSTM.pdmodel \\\n --params_file inference/AttentionLSTM/AttentionLSTM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nAn example of the output is as follows:\n```bash\nCurrent video file: data/example.pkl\n top-1 class: 11\n top-1 score: 0.9841002225875854\n```\nIt can be seen that using the AttentionLSTM model trained on Youtube-8M to predict data/example.pkl, the output top1 category id is 11, and the confidence is 0.98.\n## Reference paper\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan", + "type": "code", + "location": "/english_documents/model_zoo/recognition/attention_lstm.md:69-84" + }, + "4405": { + "file_id": 376, + "content": "This code executes the AttentionLSTM model for video classification on a specific file (data/example.pkl). The predicted top-1 class is 11, and the confidence is 0.9841002225875854. 
This result utilizes the model trained on YouTube-8M dataset, indicating its accuracy in video classification tasks.", + "type": "comment" + }, + "4406": { + "file_id": 377, + "content": "/english_documents/model_zoo/recognition/ctrgcn.md", + "type": "filepath" + }, + "4407": { + "file_id": 377, + "content": "CTR-GCN is a bone-based behavior recognition model using graph convolution, trained and tested with PaddlePaddle framework on the NTU-RGB+D dataset. The code snippet represents top-1 action classification with 99.9988% accuracy.", + "type": "summary" + }, + "4408": { + "file_id": 377, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/ctrgcn.md) | English\n# CTR-GCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\n[CTRGCN](https://github.com/Uason-Chen/CTR-GCN.git) is a bone based behavior recognition model proposed by iccv 2021. By applying the changes to the graph convolution of human bone data with topological structure, and using spatio-temporal graph convolution to extract spatio-temporal features for behavior recognition, the accuracy of bone based behavior recognition task is greatly improved.\n
\n## Data\nPlease refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)\n## Train\n### Train on NTU-RGBD\n- Train CTR-GCN on NTU-RGBD scripts using single gpu:\n```bash\n# joint modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml --seed 1", + "type": "code", + "location": "/english_documents/model_zoo/recognition/ctrgcn.md:1-39" + }, + "4409": { + "file_id": 377, + "content": "CTR-GCN is a bone-based behavior recognition model using graph convolution on human bone data. It improves accuracy for the task by extracting spatio-temporal features with spatio-temporal graph convolution. Train CTR-GCN on NTU-RGBD data with single GPU and joint modality.", + "type": "comment" + }, + "4410": { + "file_id": 377, + "content": "# bone modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml --seed 1\n# motion modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml --seed 1\n# bone motion modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml --seed 1\n```\n- Train CTR-GCN on NTU-RGBD scriptsusing multi gpus:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_ctrgcn main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml\n```\n- config file `ctrgcn_ntucs_joint.yaml` corresponding to the config of CTR-GCN on NTU-RGB+D dataset with cross-subject splits.\n## Test\n### Test on NTU-RGB+D\n- Test scripts:\n```bash\n# joint modality\npython3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml -w data/CTRGCN_ntucs_joint.pdparams\n# bone modality\npython3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml -w data/CTRGCN_ntucs_bone.pdparams\n# motion modality\npython", + "type": "code", + "location": "/english_documents/model_zoo/recognition/ctrgcn.md:41-74" + }, + "4411": { + "file_id": 377, + "content": "This code snippet runs the CTR-GCN model on different modalities and datasets, performs training with multiple GPUs, and tests the trained models. It uses the PaddlePaddle framework and provides configurations for the NTU-RGB+D dataset, including joint, bone, and motion modalities. 
The code can be executed by providing the appropriate command line arguments to specify the model, dataset, and mode (train or test).", + "type": "comment" + }, + "4412": { + "file_id": 377, + "content": "3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml -w data/CTRGCN_ntucs_motion.pdparams\n# bone motion modality\npython3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml -w data/CTRGCN_ntucs_bone_motion.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| split | modality | Top-1 | checkpoints |\n| :----: | :----: | :----: | :----: |\n| cross-subject | joint | 89.93 | [CTRGCN_ntucs_joint.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_joint.pdparams) |\n| cross-subject | bone | 85.24 | [CTRGCN_ntucs_bone.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone.pdparams) |\n| cross-subject | motion | 85.33 | [CTRGCN_ntucs_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_motion.pdparams) |\n| cross-subject | bone motion | 84.53 | [CTRGCN_ntucs_bone_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone_motion.pdparams) |", + "type": "code", + "location": "/english_documents/model_zoo/recognition/ctrgcn.md:74-90" + }, + "4413": { + "file_id": 377, + "content": "This code is executing Python scripts for the CTRGCN model, which utilizes config files (-c) and pre-trained weight paths (-w). The accuracy table showcases performance on NTU-RGB+D dataset across different modalities.", + "type": "comment" + }, + "4414": { + "file_id": 377, + "content": "## Inference\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \\\n -p data/CTRGCN_ntucs_joint.pdparams \\\n -o inference/CTRGCN\n```\n To get model architecture file `CTRGCN.pdmodel` and parameters file `CTRGCN.pdiparams`, use:\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \\\n --config configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \\\n --model_file inference/CTRGCN_joint/CTRGCN_joint.pdmodel \\\n --params_file inference/CTRGCN_joint/CTRGCN_joint.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example_NTU-RGB-D_sketeton.npy", + "type": "code", + "location": "/english_documents/model_zoo/recognition/ctrgcn.md:93-121" + }, + "4415": { + "file_id": 377, + "content": "This code exports the inference model and performs inference using PaddleVideo's CTRGCN model for action recognition. The `export_model.py` script creates the architecture file (CTRGCN.pdmodel) and parameters file (CTRGCN.pdiparams). The `predict.py` script uses these files to perform inference on a given video file, specifying the configuration file for the CTRGCN model. 
It runs with GPU acceleration (use_gpu=True) and without TensorRT optimization (use_tensorrt=False).", + "type": "comment" + }, + "4416": { + "file_id": 377, + "content": " top-1 class: 4\n top-1 score: 0.999988317489624\n```\n## Reference\n- [Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213), Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming", + "type": "code", + "location": "/english_documents/model_zoo/recognition/ctrgcn.md:122-128" + }, + "4417": { + "file_id": 377, + "content": "The code snippet represents the top-1 class and its corresponding score in a model's prediction for skeleton-based action recognition. The top-1 class is 4, with a top-1 score of 0.999988317489624. This information can be used to identify the recognized action from multiple options.", + "type": "comment" + }, + "4418": { + "file_id": 378, + "content": "/english_documents/model_zoo/recognition/movinet.md", + "type": "filepath" + }, + "4419": { + "file_id": 378, + "content": "MoViNet, a lightweight Google Research video model, improves accuracy using causal convolution and temporal ensembles. PaddleVideo's code includes training/testing info, Kinetics-400 data, inference tools, configuration file, model file, parameter file, GPU usage, TensorRT settings, and example logs for processing videos.", + "type": "summary" + }, + "4420": { + "file_id": 378, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/movinet.md) | English\n# MoViNet\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nMovinet is a mobile video network developed by Google research. It uses causal convolution operator with stream buffer and temporal ensembles to improve accuracy. It is a lightweight and efficient video model that can be used for online reasoning video stream.\n## Data\nPlease refer to Kinetics400 data download and preparation doc [k400-data](../../dataset/K400.md)\n## Train\n- Train MoViNet on kinetics-400 scripts:\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_movinet main.py --validate -c configs/recognition/movinet/movinet_k400_frame.yaml\n```\n## Test\n- For uniform sampling, test accuracy can be found in training-logs by search key word `best`, such as:\n```txt\nAlready save the best model (top1 acc)0.6489", + "type": "code", + "location": "/english_documents/model_zoo/recognition/movinet.md:1-40" + }, + "4421": { + "file_id": 378, + "content": "MoViNet is a lightweight, efficient video model developed by Google research for online reasoning on video streams. It utilizes causal convolution operator with stream buffer and temporal ensembles to improve accuracy. 
The code provides details on how to train and test MoViNet using Kinetics-400 data, along with instructions for accessing the training logs to find test accuracy results.", + "type": "comment" + }, + "4422": { + "file_id": 378, + "content": "```\n- Test scripts:\n```bash\npython3.7 main.py --test -c configs/recognition/movinet/movinet_k400_frame.yaml -w output/MoViNet/MoViNet_best.pdparams\n```\nAccuracy on Kinetics400:\n| Config | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :--------: | :-------: | :-------: | :-----: | :-----: |\n| A0 | Uniform | 50 | 172 | 66.62 | [MoViNetA0_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/MoViNetA0_k400.pdparams) |\n## Inference\n### export inference model\n To get model architecture file `MoViNetA0.pdmodel` and parameters file `MoViNetA0.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/movinet/movinet_k400_frame.yaml \\\n -p data/MoViNetA0_k400.pdparams \\\n -o inference/MoViNetA0\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\", + "type": "code", + "location": "/english_documents/model_zoo/recognition/movinet.md:41-73" + }, + "4423": { + "file_id": 378, + "content": "This code provides information on testing and inference for the MoViNet model. It includes commands for running tests, accessing accuracy results on Kinetics400, exporting inference models, and using the predict tool with example input files.", + "type": "comment" + }, + "4424": { + "file_id": 378, + "content": " --config configs/recognition/movinet/movinet_k400_frame.yaml \\\n --model_file inference/MoViNetA0/MoViNet.pdmodel \\\n --params_file inference/MoViNetA0/MoViNet.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.7667049765586853\n```\n## Reference\n- [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511)", + "type": "code", + "location": "/english_documents/model_zoo/recognition/movinet.md:74-91" + }, + "4425": { + "file_id": 378, + "content": "The code specifies the configuration file, model file, and parameter file for the MoViNet model in PaddleVideo. It also sets the use of GPU as True and TensorRT as False. The example logs show the video file being processed and the top-1 class and score for video recognition.", + "type": "comment" + }, + "4426": { + "file_id": 379, + "content": "/english_documents/model_zoo/recognition/posec3d.md", + "type": "filepath" + }, + "4427": { + "file_id": 379, + "content": "The code trains and validates PoseC3D, a skeleton-based action recognition model, on the UCF101 dataset, using pre-trained weights. 
It details testing and inference processes without GPU acceleration or TensorRT.", + "type": "summary" + }, + "4428": { + "file_id": 379, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/posec3d.md) | English\n# PoseC3D\n---\n## Contents\n- [PoseC3D](#PoseC3D)\n - [Contents](#contents)\n - [Introduction](#introduction)\n - [Data](#data)\n - [Train](#train)\n - [Train on UCF101.](#train-on-ucf101)\n - [Test](#test)\n - [Test onf UCF101](#test-onf-ucf101)\n - [Inference](#inference)\n - [export inference model](#export-inference-model)\n - [infer](#infer)\n - [Reference](#reference)\n## Introduction\nHuman skeleton, as a compact representation of hu-man action, has received increasing attention in recentyears. Many skeleton-based action recognition methodsadopt graph convolutional networks (GCN) to extract fea-tures on top of human skeletons. Despite the positive re-sults shown in previous works, GCN-based methods aresubject to limitations in robustness, interoperability, andscalability. In this work, we propose PoseC3D, a new ap-proach to skeleton-based action recognition, which relieson a 3D hea", + "type": "code", + "location": "/english_documents/model_zoo/recognition/posec3d.md:1-24" + }, + "4429": { + "file_id": 379, + "content": "PoseC3D is a skeleton-based action recognition approach that utilizes 3D head pose features and aims to overcome the limitations of GCN-based methods in terms of robustness, interoperability, and scalability. It involves training on UCF101, testing on UCF101, exporting an inference model, and inferring using the model.", + "type": "comment" + }, + "4430": { + "file_id": 379, + "content": "tmap stack instead of a graph sequence asthe base representation of human skeletons. Compared toGCN-based methods, PoseC3D is more effective in learningspatiotemporal features, more robust against pose estima-tion noises, and generalizes better in cross-dataset settings.Also, PoseC3D can handle multiple-person scenarios with-out additional computation cost, and its features can be eas-ily integrated with other modalities at early fusion stages,which provides a great design space to further boost theperformance. On four challenging datasets, PoseC3D con-sistently obtains superior performance, when used alone onskeletons and in combination with the RGB modality.\n## Data\nPlease download UCF101 skeletons datasets and pretraind model weights.\n[https://aistudio.baidu.com/aistudio/datasetdetail/140593](https://aistudio.baidu.com/aistudio/datasetdetail/140593)\n## Train\n### Train on UCF101.\n- Train PoseC3D model:\n```bash\npython3.7 main.py --validate -c configs/recognition/posec3d/posec3d.yaml --weights res3d_k400.pdparams", + "type": "code", + "location": "/english_documents/model_zoo/recognition/posec3d.md:24-39" + }, + "4431": { + "file_id": 379, + "content": "This code is for training the PoseC3D model on UCF101 dataset. It requires downloading pre-trained model weights from a specific URL. The command \"python3.7 main.py --validate -c configs/recognition/posec3d/posec3d.yaml --weights res3d_k400.pdparams\" is used to train the PoseC3D model using a provided configuration file and pre-trained weights. 
The trained model will be validated, likely to assess its performance.", + "type": "comment" + }, + "4432": { + "file_id": 379, + "content": "```\n## Test\n### Test onf UCF101\n- Test scripts:\n```bash\npython3.7 main.py --test -c configs/recognition/posec3d/posec3d.yaml -w output/PoseC3D/PoseC3D_epoch_0012.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on UCF101 dataset:\n| Test_Data | Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| UCF101 test1 | 87.05 | [PoseC3D_ucf101.pdparams]() |\n## Inference\n### export inference model\n To get model architecture file `PoseC3D.pdmodel` and parameters file `PoseC3D.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/posec3d/posec3d.yaml \\\n -p data/PoseC3D_ucf101.pdparams \\\n -o inference/PoseC3D\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_UCF101_skeleton.pkl\\\n --config configs/recognition/posec3d/posec3d.yaml \\", + "type": "code", + "location": "/english_documents/model_zoo/recognition/posec3d.md:40-82" + }, + "4433": { + "file_id": 379, + "content": "This code provides instructions for testing and inference of the PoseC3D model on UCF101 dataset. The test script specifies the config file and weight path, while the inference steps explain how to export the model architecture and parameters for further usage. The link leads to additional information on model inference.", + "type": "comment" + }, + "4434": { + "file_id": 379, + "content": " --model_file inference/PoseC3D/PoseC3D.pdmodel \\\n --params_file inference/PoseC3D/PoseC3D.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example_UCF101_skeleton.pkl\n\ttop-1 class: 0\n\ttop-1 score: 0.6731489896774292\n```\n## Reference\n- [Revisiting Skeleton-based Action Recognition](https://arxiv.org/pdf/2104.13586v1.pdf), Haodong Duan, Yue Zhao, Kai Chen, Dian Shao, Dahua Lin, Bo Dai", + "type": "code", + "location": "/english_documents/model_zoo/recognition/posec3d.md:83-100" + }, + "4435": { + "file_id": 379, + "content": "Running PoseC3D model for inference with GPU acceleration and without TensorRT.", + "type": "comment" + }, + "4436": { + "file_id": 380, + "content": "/english_documents/model_zoo/recognition/pp-timesformer.md", + "type": "filepath" + }, + "4437": { + "file_id": 380, + "content": "The PP-TimeSformer model is an enhanced version of TimeSformer for video recognition tasks, trained on Kinetics-400 dataset and supports multi-GPU. It uses PaddleVideo with Vision Transformer backbone for testing and exports PP-TimeSformer for prediction using a specific config file.", + "type": "summary" + }, + "4438": { + "file_id": 380, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/pp-timesformer.md) | English\n# TimeSformer Video Classification Model\n## Content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe have improved the [TimeSformer model](./timesformer.md) and obtained a more accurate 2D practical video classification model **PP-TimeSformer**. Without increasing the amount of parameters and calculations, the accuracy on the UCF-101, Kinetics-400 and other data sets significantly exceeds the original version. 
The accuracy on the Kinetics-400 data set is shown in the table below.\n| Version | Top1 |\n| :------ | :----: |\n| Ours ([swa](#refer-anchor-1)+distill+16frame) | 79.44 |\n| Ours ([swa](#refer-anchor-1)+distill) | 78.87 |\n| Ours ([swa](#refer-anchor-1)) | **78.61** |\n| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/timesformer#kinetics-400) | 77.92 |\n## Data\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:1-29" + }, + "4439": { + "file_id": 380, + "content": "This code describes the PP-TimeSformer video classification model, an improved version of the TimeSformer model. It outlines the training, testing, and inference processes, as well as providing data preparation instructions for Kinetics-400 dataset. The table shows the accuracy of different versions of the model on Kinetics-400 dataset.", + "type": "comment" + }, + "4440": { + "file_id": 380, + "content": "UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [ViT_base_patch16_224_miil_21k.pdparams](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through wget command\n ```bash\n wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"VisionTransformer_tweaks\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:31-58" + }, + "4441": { + "file_id": 380, + "content": "This code snippet explains how to download and prepare data for training a video recognition model. It mentions the required data sets, pre-trained models, and the specific commands to download and configure them.", + "type": "comment" + }, + "4442": { + "file_id": 380, + "content": " ```bash\n # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/ pptimesformer/pptimesformer_k400_videos.yaml\n ```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 # MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --amp --validate -c configs /recognition/pptimesformer/pptimesformer_k400_videos.yaml\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. 
It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:60-75" + }, + "4443": { + "file_id": 380, + "content": "This code runs PaddlePaddle's Timesformer model for video recognition using a specific configuration file. It uses multiple GPUs and supports AMP mixed-precision training for faster processing. The script is customizable, allowing you to train or test on different datasets by modifying the configuration file's name.", + "type": "comment" + }, + "4444": { + "file_id": 380, + "content": "## Test\n- The PP-TimeSformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:\n ```\n Already save the best model (top1 acc)0.7258\n ```\n- Because the sampling method of the PP-TimeSformer model test mode is a slightly slower but higher accuracy **UniformCrop**, which is different from the **RandomCrop** used in the verification mode during the training process, so the verification index recorded in the training log` topk Acc` does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. The command is as follows:\n ```bash\n # 8-frames testing script\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --test -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml -w \"output/ppTimeSformer/ppTimeSformer_best.pdparams\"\n # 16-frames testing script", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:78-92" + }, + "4445": { + "file_id": 380, + "content": "The PP-TimeSformer model is tested during training, and the best test accuracy can be found in the log with keyword \"best\". However, the verification index recorded in the log may not represent the final test score, so a separate testing script should be used to obtain the accurate result. 
Two such scripts are provided for 8-frames and 16-frames testing.", + "type": "comment" + }, + "4446": { + "file_id": 380, + "content": " python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --test \\\n -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \\\n -o MODEL.backbone.num_seg=16 \\\n -o MODEL.runtime_cfg.test.num_seg=16 \\\n -o PIPELINE.test.decode.num_seg=16 \\\n -o PIPELINE.test.sample.num_seg=16 \\\n -w \"data/ppTimeSformer_k400_16f_distill.pdparams\"\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: |\n | Vision Transformer | UniformCrop | 8 | 224 | 78.61 | [ppTimeSformer_k400_8f.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f.pdparams) |\n | Vision Transformer | UniformCrop | 8 | 224 | ", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:93-108" + }, + "4447": { + "file_id": 380, + "content": "This code is launching the PaddleVideo model for testing using Vision Transformer backbone with UniformCrop sampling method and 8 segments. It's running on multiple GPUs and using a specific configuration file, yaml, to set parameters like backbone, sampling method, number of segments, target size, and checkpoint file. The resulting test indicators are presented in tabular format for Kinetics-400 validation dataset.", + "type": "comment" + }, + "4448": { + "file_id": 380, + "content": "78.87 | [ppTimeSformer_k400_8f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f_distill.pdparams) |\n | Vision Transformer | UniformCrop | 16 | 224 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) |\n- During the test, the PP-TimeSformer video sampling strategy is to use linspace sampling: in time sequence, from the first frame to the last frame of the video sequence to be sampled, `num_seg` sparse sampling points (including endpoints) are uniformly generated; spatially , Select 3 areas to sample at both ends of the long side and the middle position (left, middle, right or top, middle, and bottom). A total of 1 clip is sampled for 1 video.\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \\\n -p data/ppTimeSformer_k400_8f.pdparams \\", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:108-120" + }, + "4449": { + "file_id": 380, + "content": "The code snippet is exporting the PP-TimeSformer model for video recognition. The model uses linspace sampling strategy, uniformly generating sparse sampling points in time and space to create one clip from a single video. 
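To make the sampling description above concrete, here is a minimal NumPy sketch of linspace-style test-time sampling: `num_seg` temporal indices are spread evenly from the first to the last frame (endpoints included), and three crop offsets are taken along the long spatial side. The frame count, side length and crop size are made-up values, and this is only an illustration of the described strategy, not the repository's implementation.

```python
import numpy as np

def linspace_sample(total_frames, long_side, crop_size, num_seg=8):
    # num_seg evenly spaced temporal indices, first and last frame included
    frame_ids = np.linspace(0, total_frames - 1, num_seg).round().astype(int)
    # three crop offsets along the long spatial side: start, centre, end
    offsets = [0, (long_side - crop_size) // 2, long_side - crop_size]
    return frame_ids, offsets

frame_ids, offsets = linspace_sample(total_frames=250, long_side=320, crop_size=224)
# frame_ids -> 0, 36, 71, 107, 142, 178, 213, 249; offsets -> [0, 48, 96]
```

Sampling one clip per video this way keeps test-time cost low while still covering the whole temporal extent of the video.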
The command uses Python script `export_model.py`, with config file `configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml` and model parameters file `data/ppTimeSformer_k400_8f.pdparams`.", + "type": "comment" + }, + "4450": { + "file_id": 380, + "content": " -o inference/ppTimeSformer\n```\nThe above command will generate the model structure file `ppTimeSformer.pdmodel` and the model weight file `ppTimeSformer.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../start.md#2-Model Reasoning)\n### Use predictive engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \\\n --model_file inference/ppTimeSformer/ppTimeSformer.pdmodel \\\n --params_file inference/ppTimeSformer/ppTimeSformer.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nThe output example is as follows:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9997474551200867\n```\nIt can be seen that using the ppTimeSformer model trained on Ki", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:121-147" + }, + "4451": { + "file_id": 380, + "content": "This code is for inference using PaddlePaddle's ppTimeSformer model. The command generates the required model structure and weight files for prediction and then executes the predict.py script with the given input file, configuration, model files, and parameters. It displays the top-1 class and score for the video file provided, trained on Kinetics 400 dataset.", + "type": "comment" + }, + "4452": { + "file_id": 380, + "content": "netics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By referring to the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.\n## Reference\n- [Is Space-TimeAttention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani\n- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean\n
\n- [Averaging Weights Leads to Wider Optima and Better Generalization](https://arxiv.org/abs/1803.05407v3), Pavel Izmailov, Dmitrii Podoprikhin, Timur Garipov\n- [ImageNet-21K Pretraining for the Masses](https://arxiv.org/pdf/2104.10972v4.pdf), Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-timesformer.md:147-156" + }, + "4453": { + "file_id": 380, + "content": "This code snippet is discussing the prediction of a category name using the PP-Timesformer model, specifically for predicting the content of `data/example.avi`. The predicted category id is 5 and its corresponding category name is \"archery\". This information is derived from the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`. The code provides references to several related papers which have influenced or been used in this model's development.", + "type": "comment" + }, + "4454": { + "file_id": 381, + "content": "/english_documents/model_zoo/recognition/pp-tsm.md", + "type": "filepath" + }, + "4455": { + "file_id": 381, + "content": "This code presents PP-TSM, an optimized TSM model for action recognition on UCF101 and Kinetics-400 datasets using PaddlePaddle and ResNet101 as backbone. It offers pre-trained models for video classification inference and predicts 'archery' as top1 class for 'example.avi'.", + "type": "summary" + }, + "4456": { + "file_id": 381, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/pp-tsm.md) | English\n# PP-TSM\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe optimized TSM model and proposed **PP-TSM** in this repo. Without increasing the number of parameters, the accuracy of TSM was significantly improved in UCF101 and Kinetics-400 datasets. Please refer to [**Tricks on PP-TSM**](https://zhuanlan.zhihu.com/p/382134297) for more details.\n| Version | Sampling method | Top1 |\n| :------ | :----------: | :----: |\n| Ours (distill) | Dense | **76.16** |\n| Ours | Dense | 75.69 |\n| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Dense | 74.55 |\n| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Dense | 74.1 |\n| Version | Sampling method | Top1 |\n| :------ | :----------: | :----: |\n| Ours (distill) | Uniform | **75.11** |\n| Ours | Uniform | 74.54 |\n| [mmaction", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsm.md:1-31" + }, + "4457": { + "file_id": 381, + "content": "This code describes the PP-TSM model, an optimized version of TSM for action recognition. It significantly improves accuracy in UCF101 and Kinetics-400 datasets without increasing parameters. 
Two sampling methods are used, Dense and Uniform, with respective top1 accuracies shown.", + "type": "comment" + }, + "4458": { + "file_id": 381, + "content": "2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Uniform | 71.90 |\n| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Uniform | 71.16 |\n## Data\nPlease refer to Kinetics400 data download and preparation doc [k400-data](../../dataset/K400.md)\nPlease refer to UCF101 data download and preparation doc [ucf101-data](../../dataset/ucf101.md)\n## Train\n### Train on kinetics-400\n#### download pretrain-model\nPlease download [ResNet50_vd_ssld_v2](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as pretraind model:\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\n```\nand add path to `MODEL.framework.backbone.pretrained` in config file as:\n```yaml\nMODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNetTweaksTSM\"\n pretrained: your weight path\n```\n- If use ResNet101 as backbone, please download [ResNet101_vd_ssld_pretrained.", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsm.md:31-64" + }, + "4459": { + "file_id": 381, + "content": "Code snippet provides a guide for training TSM model on Kinetics-400 and UCF101 datasets. It explains how to download the pre-trained ResNet50_vd_ssld_v2 model, specifies the configuration file modification required, and provides links to related data preparation documents.", + "type": "comment" + }, + "4460": { + "file_id": 381, + "content": "pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams) as pretraind model.\n#### Start training\n- Train PP-TSM on kinetics-400 scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml\n```\n- Train PP-TSM on kinetics-400 video data using scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml\n```\n- AMP is useful for speeding up training:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml\n```\n- Train PP-TSM on kinetics-400 with dense sampling:", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsm.md:64-90" + }, + "4461": { + "file_id": 381, + "content": "Loading pretrained model \"pdparams\" from the provided link.\nStarting training for PP-TSM on kinetics-400 using specified scripts and configurations.\nUsing AMP to speed up training.\nTraining with dense sampling also available.", + "type": "comment" + }, + "4462": { + "file_id": 381, + "content": "```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml\n```\n- Train PP-TSM on kinetics-400 with ResNet101 as backbone using dense sampling:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c 
configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml\n```\n## Test\n- For uniform sampling, test accuracy can be found in training-logs by search key word `best`, such as:\n```txt\nAlready save the best model (top1 acc)0.7454\n```\n- For dense sampling, test accuracy can be obtained using scripts:\n```bash\npython3 main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml -w output/ppTSM/ppTSM_best.pdparams\n```\nAccuracy on Kinetics400:\n| backbone | distill | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :----------: | :----: | :----: | :----: | :----: | :---- |\n| ResNet50 | False | Uniform", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsm.md:92-122" + }, + "4463": { + "file_id": 381, + "content": "This code is used to train and test the PP-TSM model on Kinetics-400 dataset. The training process utilizes PaddlePaddle distributed launch, with GPUs 0-7 for execution. It uses ResNet101 as backbone and dense sampling method for training. To obtain test accuracy, a separate script is used, specifying the configuration file and weight file path. The code also displays accuracy metrics in terms of backbone, distillation, sampling method, number of segments, target size, and top-1 accuracy for the Kinetics400 dataset.", + "type": "comment" + }, + "4464": { + "file_id": 381, + "content": " | 8 | 224 | 74.54 | [ppTSM_k400_uniform.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams) |\n| ResNet50 | False | Dense | 8 | 224 | 75.69 | [ppTSM_k400_dense.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense.pdparams) |\n| ResNet50 | True | Uniform | 8 | 224 | 75.11 | [ppTSM_k400_uniform_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) |\n| ResNet50 | True | Dense | 8 | 224 | 76.16 | [ppTSM_k400_dense_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) |\n| ResNet101 | True | Uniform | 8 | 224 | 76.35 | [ppTSM_k400_uniform_distill_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_uniform_distill_r101.pdparams) |\n| ResNet101 | False | Dense | 8 | 224 | 77.15 | [ppTSM_k400_dense_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_dense_r101.pdparams) |", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsm.md:122-127" + }, + "4465": { + "file_id": 381, + "content": "This code is a table of pre-trained models for PaddlePaddle Temporal Shift Module (ppTSM) with different configurations. Models are based on ResNet50 and ResNet101 architectures, using both uniform and dense distillation methods. They have different parameters, input sizes, and accuracy levels. 
The pdparams files are the pre-trained model weights available for download from specified URLs.", + "type": "comment" + }, + "4466": { + "file_id": 381, + "content": "## Inference\n### export inference model\n To get model architecture file `ppTSM.pdmodel` and parameters file `ppTSM.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n -p data/ppTSM_k400_uniform.pdparams \\\n -o inference/ppTSM\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n --model_file inference/ppTSM/ppTSM.pdmodel \\\n --params_file inference/ppTSM/ppTSM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example.avi\n\ttop-1 class: 5\n\ttop-1 score: 0.9907386302947998", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsm.md:129-159" + }, + "4467": { + "file_id": 381, + "content": "This code exports the PPTSM model for inference and demonstrates how to use it for video classification. It requires the user to run two commands: one to export the model architecture file (ppTSM.pdmodel) and parameters file (ppTSM.pdiparams), and another to use the model for prediction on a video file (example.avi). The predicted output includes the top-1 class and its corresponding score.", + "type": "comment" + }, + "4468": { + "file_id": 381, + "content": "```\nwe can get the class name using class id and map file `data/k400/Kinetics-400_label_list.txt`. The top1 prediction of `data/example.avi` is `archery`.\n## Reference\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsm.md:160-167" + }, + "4469": { + "file_id": 381, + "content": "The code retrieves the class name from class id and a map file, then shows that the top1 prediction of 'data/example.avi' is 'archery'.", + "type": "comment" + }, + "4470": { + "file_id": 382, + "content": "/english_documents/model_zoo/recognition/pp-tsn.md", + "type": "filepath" + }, + "4471": { + "file_id": 382, + "content": "This code introduces the PP-TSN model, an enhanced version of TSN. It describes implementation, data preparation and training processes, using mixed-precision training for speed. The PP-TSN model can be customized and tested on Kinetics-400, providing models for video file inference.", + "type": "summary" + }, + "4472": { + "file_id": 382, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/pp-tsn.md) | English\n# PP-TSN\n## Content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe have improved the [TSN model](./tsn.md) and obtained a more accurate 2D practical video classification model **PP-TSN**. Without increasing the amount of parameters and calculations, the accuracy on the UCF-101, Kinetics-400 and other data sets significantly exceeds the original version. 
The accuracy on the Kinetics-400 data set is shown in the following table.\n| Version | Top1 |\n| :------ | :----: |\n| Ours (distill) | 75.06 |\n| Ours | **73.68** |\n| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn#kinetics-400) | 71.80 |\n## Data\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)\nUCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsn.md:1-30" + }, + "4473": { + "file_id": 382, + "content": "This code is a documentation for the PP-TSN model, which is an improved version of the TSN model. The documentation includes sections on introduction, data, train, test, inference, and reference. It also provides accuracy information and guidance on how to download and prepare K400 and UCF101 data.", + "type": "comment" + }, + "4474": { + "file_id": 382, + "content": "## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image distillation pre-training model [ResNet50_vd_ssld_v2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as the Backbone initialization parameter, or download it through wget\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/pptsn/pptsn_k400_frames.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNetTweaksTSN\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:\n ```bash\n # frames data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --validate -c configs/recognition/ pptsn/pptsn_k400_frames.yaml", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsn.md:33-61" + }, + "4475": { + "file_id": 382, + "content": "This code describes how to train the \"PPTSN\" model on the Kinetics-400 dataset using 8 GPUs. It first requires downloading a pre-trained ResNet50_vd_ssld_v2 model, then configuring its path in the yaml file, and finally running training with the provided command.", + "type": "comment" + }, + "4476": { + "file_id": 382, + "content": " # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --validate -c configs/recognition/ pptsn/pptsn_k400_videos.yaml\n ```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 # MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n # frames data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --amp --validate -c configs /recognition/pptsn/pptsn_k400_frames.yaml\n # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --amp --validate -c configs /recognition/pptsn/pptsn_k400_videos.yaml\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. 
It is ", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsn.md:63-81" + }, + "4477": { + "file_id": 382, + "content": "This code demonstrates how to run PaddleVideo's pp-tsn model with amp mixed-precision training for faster processing. It supports both videos and frames data formats, and allows customization of parameter configurations for different datasets.", + "type": "comment" + }, + "4478": { + "file_id": 382, + "content": "recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.\n## Test\n- The PP-TSN model is verified during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:\n\t```\n Already save the best model (top1 acc)0.7004\n\t```\n- Since the sampling method of the PP-TSN model test mode is **TenCrop**, which is slightly slower but more accurate, it is different from the **CenterCrop** used in the verification mode during the training process, so the verification index recorded in the training log is `topk Acc `Does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n\t```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --test -c configs/recognition/ pptsn/pptsn_k400_frames.yaml -w \"output/ppTSN/ppTSN_best.pdparams\"", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsn.md:81-95" + }, + "4479": { + "file_id": 382, + "content": "The PP-TSN model's testing process is different from training verification due to the sampling method used. The final test score should be obtained after testing the best model in test mode, as opposed to using the top-k accuracy recorded during training.", + "type": "comment" + }, + "4480": { + "file_id": 382, + "content": "\t```\n\tWhen the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n\t| backbone | Sampling method | distill | num_seg | target_size | Top-1 | checkpoints |\n\t| :------: | :-------------: | :-----: | :-----: | :---------: | :---- | :---------------------: |\n\t| ResNet50 | TenCrop | False | 3 | 224 | 73.68 | [ppTSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams) |\n\t| ResNet50 | TenCrop | True | 8 | 224 | 75.06 | [ppTSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) |\n- The PP-TSN video sampling strategy is TenCrop sampling: in time sequence, the input video is evenly divided into num_seg segments, and the middle position of each segment is sampled 1 frame; spatially, from the upper left corner, upper right corner, center point, lower left corner, and lower right corner Each", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsn.md:96-105" + }, + "4481": { + "file_id": 382, + "content": "This code outlines the test results of PP-TSN model using different configurations on the validation dataset of Kinetics-400. The table presents backbone, sampling method, distillation method, number of segments, target image size, and Top-1 accuracy for each configuration. Checkpoints are also provided for each configuration. 
The PP-TSN video sampling strategy is TenCrop sampling, which samples frames from different positions in the video sequence and spatial areas.", + "type": "comment" + }, + "4482": { + "file_id": 382, + "content": " of the 5 sub-regions sampled an area of 224x224, and the horizontal flip was added to obtain a total of 10 sampling results. A total of 1 clip is sampled for 1 video.\n- Distill is `True`, which means that the pre-trained model obtained by distillation is used. For the specific distillation scheme, please refer to [ppTSM Distillation Scheme]().\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_frames.yaml -p data/ppTSN_k400.pdparams -o inference/ppTSN\n```\nThe above command will generate the model structure file `ppTSN.pdmodel` and model weight files `ppTSN.pdiparams` and `ppTSN.pdiparams.info` files required for prediction, all of which are stored in the `inference/ppTSN/` directory\nFor the meaning of each parameter in the above bash command, please refer to [Model Reasoning Method](https://github.com/HydrogenSulfate/PaddleVideo/blob/PPTSN-v1/docs/en/start.md#2-infer)\n### Use prediction engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsn.md:105-125" + }, + "4483": { + "file_id": 382, + "content": "The code exports the pre-trained model for inference and uses the prediction engine to perform predictions on input video files. Distillation is used for obtaining the pre-trained model, and the generated model structure file and weight files are stored in the `inference/ppTSN/` directory. The provided bash commands assist in exporting and predicting with the model respectively.", + "type": "comment" + }, + "4484": { + "file_id": 382, + "content": " --config configs/recognition/pptsn/pptsn_k400_frames.yaml \\\n --model_file inference/ppTSN/ppTSN.pdmodel \\\n --params_file inference/ppTSN/ppTSN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nThe output example is as follows:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.998979389667511\n```\nIt can be seen that using the PP-TSN model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.\n## Reference\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang\n- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean", + "type": "code", + "location": "/english_documents/model_zoo/recognition/pp-tsn.md:126-146" + }, + "4485": { + "file_id": 382, + "content": "This code is running an inference on a video file using the PP-TSN model trained on Kinetics-400. The top-1 category and its corresponding confidence are being outputted for the given video file.", + "type": "comment" + }, + "4486": { + "file_id": 383, + "content": "/english_documents/model_zoo/recognition/slowfast.md", + "type": "filepath" + }, + "4487": { + "file_id": 383, + "content": "The SlowFast model, designed for video recognition, utilizes a Multigrid training strategy to speed up training and provides English documentation. 
It offers testing instructions using PaddleVideo with GPU usage details, retrieves class name from ID, predicts top1 result for \"example.avi\", and is explained in detail in the reference paper.", + "type": "summary" + }, + "4488": { + "file_id": 383, + "content": "[简体中文 ](../../../zh-CN/model_zoo/recognition/slowfast.md) | English\n# SlowFast\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nSlowFast involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast path-way, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition.\n
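As a rough illustration of the two-pathway idea described above, the sketch below builds the frame-index sets for one clip: the Slow pathway takes a few frames at a large temporal stride, while the Fast pathway takes α times more frames at a proportionally smaller stride over the same clip. The values T=4, τ=16, α=8 follow the common 4x16 setting from the SlowFast paper and are assumptions for illustration only, not values read from this config.

```python
import numpy as np

def slowfast_frame_indices(start=0, T=4, tau=16, alpha=8):
    """One clip of T*tau frames: the Slow pathway samples T frames at stride tau,
    the Fast pathway samples alpha*T frames at stride tau/alpha."""
    slow = start + np.arange(T) * tau                      # e.g. 0, 16, 32, 48
    fast = start + np.arange(alpha * T) * (tau // alpha)   # e.g. 0, 2, 4, ..., 62
    return slow, fast

slow, fast = slowfast_frame_indices()
print(len(slow), len(fast))  # 4 32
```

In the paper the Fast pathway also uses only a small fraction (β, e.g. 1/8) of the channels, which is what keeps it lightweight despite its higher frame rate.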

\nFigure: SlowFast Overview\n
\n## Data\nWe use Kinetics-400 to train this model,data preparation please refer to [Kinetics-400 dataset](../../dataset/k400.md).\n## Train\nYou can start training by:\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml", + "type": "code", + "location": "/english_documents/model_zoo/recognition/slowfast.md:1-38" + }, + "4489": { + "file_id": 383, + "content": "This code is the English version of SlowFast model documentation from PaddleVideo's model_zoo. It introduces SlowFast, a video recognition model that combines low and high frame rates for spatial semantic and motion information capture. The training script and data preparation are provided.", + "type": "comment" + }, + "4490": { + "file_id": 383, + "content": "```\n- Training would be efficent using our code. The training speed is 2x faster than the original implementation. Details can refer to [benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/en/benchmark.md).\n### Speed up training\nIt's time consuming to train SlowFast model. So we implement [Multigrid training stragety](https://arxiv.org/abs/1912.00998) to speed up training. Training script:\n```bash\npython -B -m paddle.distributed.launch --selected_gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml\n```\nPerformance evaluation:\n| training stragety | time cost of one epoch/min | total training time/min | speed-up |\n| :------ | :-----: | :------: |:------: |\n| Multigrid | 27.25 | 9758 (6.7 days) | 2.89x |\n| Normal | 78.76 | 15438 (10.7days) | base |\nFor more details, please refer to [accelerate doc](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/tutorials/accelerate.md#%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5%E5%8A%A0%E9%80%9F).", + "type": "code", + "location": "/english_documents/model_zoo/recognition/slowfast.md:39-58" + }, + "4491": { + "file_id": 383, + "content": "This code implements Multigrid training strategy to speed up SlowFast model training, which is time-consuming. The provided training script and performance evaluation show that using the multigrid method reduces the training time by 2.89x compared to normal training. For more details, refer to the accelerate documentation.", + "type": "comment" + }, + "4492": { + "file_id": 383, + "content": "## Test\nYou can start testing by:\n```bash\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_slowfast_test main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams\n```\n- Args `-w` is used to specifiy the model path,you can download our model in [SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams).\nTest accuracy in Kinetics-400:\n| Configs | Acc1 | Acc5 | Weights |\n| :---: | :---: | :---: | :---: |\n| [slowfast.yaml](../../../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 91.33 | [slowfast_4x16.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams) |\n| [slowfast_multigrid.yaml](../../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | 75.84 | 92.33 | [slowfast_8x8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) |\n- Acc1 may be lower than that released in papaer, as ~5% data of kinetics-400 is missing. 
Experiments have verified that if training with the same data, we can get the same accuracy.", + "type": "code", + "location": "/english_documents/model_zoo/recognition/slowfast.md:61-79" + }, + "4493": { + "file_id": 383, + "content": "This code provides instructions for testing the SlowFast model in PaddleVideo. It uses the distributed launch command to run on multiple GPUs, specifying the log directory and the model configuration file slowfast.yaml. The test accuracy for two configurations is also shown, with a note that Acc1 may be lower due to missing data.", + "type": "comment" + }, + "4494": { + "file_id": 383, + "content": "## Inference\n### export inference model\n To get model architecture file `SlowFast.pdmodel` and parameters file `SlowFast.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml \\\n -p data/SlowFast.pdparams \\\n -o inference/SlowFast\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/slowfast/slowfast.yaml \\\n --model_file inference/SlowFast/SlowFast.pdmodel \\\n --params_file inference/SlowFast/SlowFast.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 1.0", + "type": "code", + "location": "/english_documents/model_zoo/recognition/slowfast.md:82-112" + }, + "4495": { + "file_id": 383, + "content": "This code provides instructions for exporting and using the SlowFast model in PaddleVideo. The first command generates the architecture file (SlowFast.pdmodel) and parameter file (SlowFast.pdiparams). The second command demonstrates how to run inference with these files on an input video, specifying the model configuration and enabling GPU usage if available. It outputs the top-1 class and score for the predicted results.", + "type": "comment" + }, + "4496": { + "file_id": 383, + "content": "```\nwe can get the class name using class id and map file `data/k400/Kinetics-400_label_list.txt`. The top1 prediction of `data/example.avi` is `archery`.\n## Reference\n- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al.", + "type": "code", + "location": "/english_documents/model_zoo/recognition/slowfast.md:113-120" + }, + "4497": { + "file_id": 383, + "content": "This code retrieves the class name from a given class ID using a map file and predicts the top1 result for a video named \"example.avi\". The reference provided is related to the SlowFast Networks for Video Recognition paper, which likely explains how this functionality works in detail.", + "type": "comment" + }, + "4498": { + "file_id": 384, + "content": "/english_documents/model_zoo/recognition/stgcn.md", + "type": "filepath" + }, + "4499": { + "file_id": 384, + "content": "The code provides instructions for training and testing ST-GCN, a skeleton-based action recognition model, on FSD and NTU-RGB+D datasets, with accuracy results given. 
It exports the model's architecture and parameters using `export_model.py` and allows inference with optional GPU usage via `predict.py`.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/45.json b/docs/data/45.json new file mode 100644 index 000000000..3b3302be5 --- /dev/null +++ b/docs/data/45.json @@ -0,0 +1,545 @@ +{ + "4500": { + "file_id": 384, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/stgcn.md) | English\n# ST-GCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nST-GCN is skeleton-based action recognition model proposed in AAAI 2018.\n
\n## Data\nPlease refer to FSD data download and preparation doc [FSD](../../dataset/fsd.md)\nPlease refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)\n## Train\n### Train on FSD\n- Train ST-GCN on FSD scripts:\n```bash\npython3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml\n```\n- Turn off `valid` when training, as validation dataset is not available for the competition.\n### Train on NTU-RGBD\n- Train ST-GCN on NTU-RGBD scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_stgcn main.py --validate -c configs/recognition/stgcn/stgcn_ntucs.yaml", + "type": "code", + "location": "/english_documents/model_zoo/recognition/stgcn.md:1-49" + }, + "4501": { + "file_id": 384, + "content": "This code is a documentation for ST-GCN, a skeleton-based action recognition model. It explains the model's introduction, data requirements (FSD and NTU-RGBD), training instructions on both datasets, and how to perform inference.", + "type": "comment" + }, + "4502": { + "file_id": 384, + "content": "```\n- config file `stgcn_ntucs.yaml` corresponding to the config of ST-GCN on NTU-RGB+D dataset with cross-subject splits.\n## Test\n### Test on FSD\n- Test scripts:\n```bash\npython3.7 main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -w output/STGCN/STGCN_epoch_00090.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\n- Evaluation results will be saved in `submission.csv` file, final score can be obtained in [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115).\nAccuracy on FSD-10 dataset:\nTest_Data| Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| Test_A | 59.07 | [STGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) |\n### Test on NTU-RGB+D\n- Test scripts:\n```bash\npython3.7 main.py --test -c configs/recognition/stgcn/stgcn_ntucs.yaml -w output/STGCN/STGCN_best.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| split | Top-1 | checkpoints |", + "type": "code", + "location": "/english_documents/model_zoo/recognition/stgcn.md:50-89" + }, + "4503": { + "file_id": 384, + "content": "This code provides instructions for testing the ST-GCN model on two datasets: FSD and NTU-RGB+D. The user is directed to run specific test scripts with provided command lines, specifying the configuration file and weight path. Results are saved in a submission.csv file and the final scores can be obtained from the competition website. 
Accuracy results for both datasets are also included.", + "type": "comment" + }, + "4504": { + "file_id": 384, + "content": "| :----: | :----: | :---- |\n| cross-subject | 82.28 | [STGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_ntucs.pdparams) |\n## Inference\n### export inference model\n To get model architecture file `STGCN.pdmodel` and parameters file `STGCN.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml \\\n -p data/STGCN_fsd.pdparams \\\n -o inference/STGCN\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \\\n --config configs/recognition/stgcn/stgcn_fsd.yaml \\\n --model_file inference/STGCN/STGCN.pdmodel \\\n --params_file inference/STGCN/STGCN.pdiparams \\\n --use_gpu=True \\", + "type": "code", + "location": "/english_documents/model_zoo/recognition/stgcn.md:90-115" + }, + "4505": { + "file_id": 384, + "content": "This code provides the commands to export the model architecture and parameters for a STGCN model, as well as how to use the model to make inferences. The `export_model.py` script is used to generate the `STGCN.pdmodel` and `STGCN.pdiparams` files. The `predict.py` script is then used for making predictions using the exported model with optional GPU usage.", + "type": "comment" + }, + "4506": { + "file_id": 384, + "content": " --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/fsd10/example_skeleton.npy\n top-1 class: 27\n top-1 score: 0.9912770986557007\n```\n## Reference\n- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin", + "type": "code", + "location": "/english_documents/model_zoo/recognition/stgcn.md:116-129" + }, + "4507": { + "file_id": 384, + "content": "False", + "type": "comment" + }, + "4508": { + "file_id": 385, + "content": "/english_documents/model_zoo/recognition/timesformer.md", + "type": "filepath" + }, + "4509": { + "file_id": 385, + "content": "The TimeSformer model is a top-performing video classifier that uses time series modeling and space-time attention, trained on Kinetics-400 using 8 GPUs with mixed-precision training.", + "type": "summary" + }, + "4510": { + "file_id": 385, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/timesformer.md) | English\n# TimeSformer\n## Content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nTimeSformer is a video classification model based on vision transformer, which has the characteristics of no convolution, global receptive field, and strong time series modeling ability. At present, it has achieved SOTA accuracy on the Kinetics-400 data set, surpassing the classic CNN-based video classification models TSN, TSM and Slowfast, and has a shorter training time (the Kinetics-400 data set training time is 39 hourss). **This code implements the time-space separated attention cascade network in the paper**.\n
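To illustrate what the time-space separated (divided) attention cascade means, the NumPy sketch below first applies attention along the temporal axis (each patch position attends across the frames) and then along the spatial axis (the patches of each frame attend to each other). Projections, multiple heads and the class token are omitted and the shapes are made up, so this is a conceptual sketch rather than the model's actual implementation.

```python
import numpy as np

B, T, N, D = 2, 8, 196, 768  # assumed: batch, frames, patches per frame, embed dim
tokens = np.random.rand(B, T, N, D).astype("float32")

def attention(x):
    """Toy self-attention over the sequence axis (no projections, single head)."""
    scores = x @ x.swapaxes(-1, -2) / np.sqrt(x.shape[-1])
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ x

# 1) temporal attention: for every patch position, attend over the T frames
t_in = tokens.transpose(0, 2, 1, 3).reshape(B * N, T, D)
t_out = attention(t_in).reshape(B, N, T, D).transpose(0, 2, 1, 3)

# 2) spatial attention: within each frame, the N patches attend to each other
s_in = t_out.reshape(B * T, N, D)
s_out = attention(s_in).reshape(B, T, N, D)
```

Cascading the two cheaper attentions (over T and over N) avoids full joint attention over T·N tokens, which is the efficiency argument made in the paper.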
\n## Data\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)", + "type": "code", + "location": "/english_documents/model_zoo/recognition/timesformer.md:1-26" + }, + "4511": { + "file_id": 385, + "content": "This code implements the TimeSformer, a video classification model based on vision transformer with global receptive field and strong time series modeling ability. It achieves SOTA accuracy on Kinetics-400 dataset and has shorter training time compared to other models. The code showcases the time-space separated attention cascade network architecture, and requires data preparation from Kinetics-400 dataset.", + "type": "comment" + }, + "4512": { + "file_id": 385, + "content": "UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/timesformer/timesformer_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"VisionTransformer\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:\n```bash", + "type": "code", + "location": "/english_documents/model_zoo/recognition/timesformer.md:28-57" + }, + "4513": { + "file_id": 385, + "content": "Download and prepare UCF101 data, then download the ViT_base_patch16_224 pre-trained model. Update the config file with the model's path and train the Kinetics-400 dataset using 8 GPUs.", + "type": "comment" + }, + "4514": { + "file_id": 385, + "content": "# videos data format\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --validate -c configs/recognition/ timesformer/timesformer_k400_videos.yaml\n```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 # MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\n# videos data format\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --amp --validate -c configs/recognition/ timesformer/timesformer_k400_videos.yaml\n```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.", + "type": "code", + "location": "/english_documents/model_zoo/recognition/timesformer.md:58-72" + }, + "4515": { + "file_id": 385, + "content": "This code executes the training of a Paddle Video model called \"timesformer\" on 8 GPUs for video data. 
The command is to be run in a Linux terminal, and it uses mixed-precision training with AMP (Automatic Mixed Precision) to speed up the process. The command also sets some environment variables to configure CUDA behavior. The configuration file name includes the model and dataset names as well as data format and sampling method. For more details on configuring parameters, refer to the provided link.", + "type": "comment" + }, + "4516": { + "file_id": 385, + "content": "## Test\n- The TimeSformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:\n ```\n Already save the best model (top1 acc)0.7258\n ```\n- Since the sampling method of the TimeSformer model test mode is **UniformCrop** with a slower speed but higher accuracy, which is different from the **RandomCrop** used in the verification mode during the training process, so the verification index recorded in the training log is `topk Acc `Does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --test -c configs/recognition/ timesformer/timesformer_k400_videos.yaml -w \"output/TimeSformer/TimeSformer_best.pdparams\"\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:", + "type": "code", + "location": "/english_documents/model_zoo/recognition/timesformer.md:75-90" + }, + "4517": { + "file_id": 385, + "content": "The code provides instructions for testing the TimeSformer model, using a different sampling method in test mode for higher accuracy. The best model is identified by the log's \"best\" keyword, and final test scores are obtained after training by using the provided command.", + "type": "comment" + }, + "4518": { + "file_id": 385, + "content": " | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n | :----------------: | :-----: | :-----: | :---------: | :----: | :----------------------------------------------------------: |\n | Vision Transformer | UniformCrop | 8 | 224 | 77.29 | [TimeSformer_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) |\n- During the test, the TimeSformer video sampling strategy is to use Linspace sampling: in time sequence, num_seg sparse sampling points are uniformly generated from the video sequence to be sampled; in space, select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions are sampled. A total of 1 clip is sampled for 1 video.\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml \\\n -p data/TimeSformer_k400.pdparams \\\n -o inference/TimeSformer", + "type": "code", + "location": "/english_documents/model_zoo/recognition/timesformer.md:93-107" + }, + "4519": { + "file_id": 385, + "content": "This code snippet is for exporting the TimeSformer inference model. It uses the PaddlePaddle framework and requires a configuration file, a pre-trained model file, and an output directory. 
The TimeSformer is a video recognition model that utilizes the Vision Transformer architecture and Linspace sampling strategy for its inference process.", + "type": "comment" + }, + "4520": { + "file_id": 385, + "content": "```\nThe above command will generate the model structure file `TimeSformer.pdmodel` and the model weight file `TimeSformer.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../start.md#2-infer)\n### Use prediction engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/timesformer/timesformer_k400_videos.yaml \\\n --model_file inference/TimeSformer/TimeSformer.pdmodel \\\n --params_file inference/TimeSformer/TimeSformer.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nThe output example is as follows:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9999722242355347\n```\nIt can be seen that using the TimeSformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confiden", + "type": "code", + "location": "/english_documents/model_zoo/recognition/timesformer.md:108-133" + }, + "4521": { + "file_id": 385, + "content": "This code snippet demonstrates the process of using the TimeSformer model to predict the video file 'data/example.avi'. The model is trained on Kinetics-400 and the prediction command uses python3.7 to run 'tools/predict.py' with relevant parameters such as input_file, config, model_file, params_file, use_gpu, and use_tensorrt. The output shows the top-1 class and its corresponding score for the video file.", + "type": "comment" + }, + "4522": { + "file_id": 385, + "content": "ce is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be seen that the predicted category name is `archery`.\n## Reference\n- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani", + "type": "code", + "location": "/english_documents/model_zoo/recognition/timesformer.md:133-137" + }, + "4523": { + "file_id": 385, + "content": "Code comments: The code calculates the category id and name from a table, which is used to determine the predicted category name. It references a paper on space-time attention for video understanding.", + "type": "comment" + }, + "4524": { + "file_id": 386, + "content": "/english_documents/model_zoo/recognition/tokenshift_transformer.md", + "type": "filepath" + }, + "4525": { + "file_id": 386, + "content": "The Token Shift Transformer is a versatile video classification model utilizing vision transformer and Token Shift Module, trained on UCF-101 dataset with mixed-precision AMP acceleration, and achieves high accuracy with \"BrushingTeeth.avi\".", + "type": "summary" + }, + "4526": { + "file_id": 386, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/tokenshift_transformer.md) | English\n# Token Shift Transformer\n## Content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nToken Shift Transformer is a video classification model based on vision transformer, which shares merits of strong interpretability, high discriminative power on hyper-scale data, and flexibility in processing varying length inputs. 
Token Shift Module is a novel, zero-parameter, zero-FLOPs operator for modeling temporal relations within each transformer encoder.\n
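As a rough sketch of that zero-parameter idea, the code below shifts two small groups of channels of the per-frame [class] token one step forward and one step backward in time, zero-filling the temporal borders. The shapes and the fold_div of 8 are assumptions for illustration; the exact operator is defined in the TokShift paper rather than here.

```python
import numpy as np

B, T, N, C = 2, 8, 197, 768  # assumed: batch, frames, tokens (class + patches), channels
x = np.random.rand(B, T, N, C).astype("float32")

def token_shift(x, fold_div=8):
    """Shift parts of the [class] token's channels across adjacent frames."""
    out = x.copy()
    fold = x.shape[-1] // fold_div
    # forward in time: frame t receives channels [:fold] from frame t-1
    out[:, 1:, 0, :fold] = x[:, :-1, 0, :fold]
    out[:, 0, 0, :fold] = 0.0
    # backward in time: frame t receives channels [fold:2*fold] from frame t+1
    out[:, :-1, 0, fold:2 * fold] = x[:, 1:, 0, fold:2 * fold]
    out[:, -1, 0, fold:2 * fold] = 0.0
    return out

shifted = token_shift(x)  # same shape as x; no weights, only memory movement
```

Because the module only moves existing values, it adds no parameters and essentially no FLOPs, which is the property highlighted above.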
\n## Data\nUCF-101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)\n## Train\n### UCF-101 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.c", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:1-36" + }, + "4527": { + "file_id": 386, + "content": "Token Shift Transformer is a video classification model using vision transformer, with a novel Token Shift Module for modeling temporal relations. It offers strong interpretability and flexibility, while being zero-parameter and zero-FLOPs. UCF-101 data preparation guide provided.", + "type": "comment" + }, + "4528": { + "file_id": 386, + "content": "om/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"TokenShiftVisionTransformer\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The UCF-101 data set uses 1 card for training, and the start command of the training method is as follows:\n```bash\n# videos data format\npython3 main.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234\n```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n```bash", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:36-63" + }, + "4529": { + "file_id": 386, + "content": "This code provides instructions on how to download a pre-trained model and modify a configuration file for training the TokenShift Transformer model on the UCF-101 dataset using PaddlePaddle. It also highlights the need for using mixed-precision training with AMP to accelerate the training process.", + "type": "comment" + }, + "4530": { + "file_id": 386, + "content": "python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234\n```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.\n## Test\n- The Token Shift Transformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. 
The log example is as follows:\n ```\n Already save the best model (top1 acc)0.9201\n ```\n- Since the sampling method of the Token Shift Transformer model test mode is **uniform** sampling, which is different from the **dense** sampling used in the verification mode during the training process, so the verification index recorded in the training log", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:64-78" + }, + "4531": { + "file_id": 386, + "content": "This code snippet is used to train a Token Shift Transformer model on the UCF101 dataset with a video size of 256. The model configuration file is tokShift_transformer_ucf101_256_videos.yaml, and the training is performed using automatic mixed precision (--amp flag). The model will be validated during training, and the best model's test accuracy can be found in the training log using the keyword \"best\". The test mode sampling method is uniform sampling, which differs from the dense sampling used in verification mode during training.", + "type": "comment" + }, + "4532": { + "file_id": 386, + "content": ", called `topk Acc `, does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n ```bash\n python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --test --seed=1234 -w 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams'\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of UCF-101 are as follows:\n | backbone | sampling method | num_seg | target_size | Top-1 | checkpoints |\n | :----------------: | :-----: | :-----: | :---------: | :----: | :----------------------------------------------------------: |\n | Vision Transformer | Uniform | 8 | 256 | 92.81 | [TokenShiftTransformer.pdparams](https://drive.google.com/drive/folders/1k_TpAqaJZYJE8C5g5pT9phdyk9DrY_XL?usp=sharing) |\n- Uniform sampling: Timing-wise, equal division into `num_seg", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:78-93" + }, + "4533": { + "file_id": 386, + "content": "This code describes a command for testing the best model after training is complete using the TokenShift VisionTransformer on the UCF-101 dataset. The test configuration parameters include backbone, sampling method, num_seg, and target_size to obtain Top-1 accuracy. The checkpoints are available in a shared Google Drive link. Uniform sampling divides timing equally into `num_seg`.", + "type": "comment" + }, + "4534": { + "file_id": 386, + "content": "` segments, 1 frame sampled at the middle of each segment; spatially, sampling at the center. 
1 video sampled 1 clip in total.\n## Inference\n### Export inference model\n```bash\npython3 tools/export_model.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -p 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams'\n```\nThe above command will generate the model structure file `TokenShiftVisionTransformer.pdmodel` and the model weight file `TokenShiftVisionTransformer.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../usage.md#2-infer)\n### Use prediction engine inference\n```bash\npython3 tools/predict.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -i 'data/BrushingTeeth.avi' --model_file ./inference/TokenShiftVisionTransformer.pdmodel --params_file ./inference/TokenShiftVisionTransformer.pdiparams\n```\nThe output example is as follows:\n```\nCurrent video file: data/BrushingTeeth.avi", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:93-116" + }, + "4535": { + "file_id": 386, + "content": "This code provides instructions for exporting an inference model and using the prediction engine in PaddleVideo's TokenShift Vision Transformer. The first command exports the model structure file (TokenShiftVisionTransformer.pdmodel) and the model weight file (TokenShiftVisionTransformer.pdiparams). The second command uses these files to perform inference on a specific video file (e.g., 'data/BrushingTeeth.avi').", + "type": "comment" + }, + "4536": { + "file_id": 386, + "content": "\ttop-1 class: 19\n\ttop-1 score: 0.9959074258804321\n```\nIt can be seen that using the Token Shift Transformer model trained on UCF-101 to predict `data/BrushingTeeth.avi`, the output top1 category id is `19`, and the confidence is 0.99. By consulting the category id and name correspondence table, it can be seen that the predicted category name is `brushing_teeth`.\n## Reference\n- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tokenshift_transformer.md:117-125" + }, + "4537": { + "file_id": 386, + "content": "This code snippet is displaying the top-1 category prediction and confidence score for a given video file \"BrushingTeeth.avi\" using Token Shift Transformer model trained on UCF-101 dataset. The predicted top-1 category id is 19, and its corresponding category name is \"brushing_teeth\", with a high confidence of 0.99.", + "type": "comment" + }, + "4538": { + "file_id": 387, + "content": "/english_documents/model_zoo/recognition/tsm.md", + "type": "filepath" + }, + "4539": { + "file_id": 387, + "content": "This code trains TSM model using ResNet-50, PaddlePaddle, and AMP on UCF-101 and Kinetics-400 datasets with Momentum optimization and L2_Decay. 
It supports three sampling methods, provides training details, and gives inference instructions.", + "type": "summary" + }, + "4540": { + "file_id": 387, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/tsm.md) | English\n# TSM\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Details](#Details)\n- [Reference](#Reference)\n## Introduction\nTemporal Shift Module (TSM) is a popular video understanding model that has attracted wide attention.\nIts channel-shifting mechanism greatly improves the use of temporal information without adding any\nextra parameters or computation.\nMoreover, because it is lightweight and efficient, it is well suited to industrial deployment.\n
\nThis code implemented **single RGB stream** of TSM networks. Backbone is ResNet-50.\nPlease refer to the ICCV 2019 paper for details [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf)\n## Data\nPlease refer to Kinetics-400 data download and preparation [k400 data preparation](../../dataset/k400.md)", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:1-33" + }, + "4541": { + "file_id": 387, + "content": "This code implements the TSM (Temporal Shift Module) model for video understanding using a single RGB stream and ResNet-50 as the backbone. It follows the ICCV 2019 paper for details, and requires data from Kinetics-400 which can be downloaded and prepared according to the provided instructions.", + "type": "comment" + }, + "4542": { + "file_id": 387, + "content": "Please refer to UCF101 data download and preparation [ucf101 data preparation](../../dataset/ucf101.md)\n## Train\n### Train on the Kinetics-400 dataset\n#### download pretrain-model\n1. Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as pretraind model:\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/tsm/tsm_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:`\n ```bash\n MODEL:\n \tframework: \"Recognizer2D\"\n \t\tbackbone:\n \t\tname: \"ResNetTSM\"\n \t\tpretrained: your weight path\n ```\n#### Start training\n- By specifying different configuration files, different data formats/data sets can be used for training. Taking the training configuration of Kinetics-400 data set + 8 cards + frames format as an example, the startup command is as follows (more training commands can be viewed in `PaddleVideo/run.sh`).", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:35-62" + }, + "4543": { + "file_id": 387, + "content": "This code explains how to train a TSM (Temporal Shift Module) model on the Kinetics-400 dataset using the PaddleVideo framework. The user needs to download and replace the pretrained ResNet50_pretrain.pdparams model, then specify the new weight path in the tsm_k400_frames.yaml configuration file. 
Training can be started by running a specific command based on the desired configuration.", + "type": "comment" + }, + "4544": { + "file_id": 387, + "content": " ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n ```\n- Training Kinetics-400 dataset of videos format using scripts.\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_videos.yaml\n ```\n- AMP is useful for speeding up training, scripts as follows:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n```\n- AMP works better with `NHWC` data format, scripts as follows:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.l", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:64-91" + }, + "4545": { + "file_id": 387, + "content": "This code snippet is running a PaddlePaddle (a deep learning framework) script to train the TSM (Temporal Shift Module) model on the Kinetics-400 dataset. The model is trained for videos and frames formats separately, utilizing Automatic Mixed Precision (AMP) for faster training with some environment variable settings. AMP works better with the NHWC data format and needs specific environment variable configurations as well.", + "type": "comment" + }, + "4546": { + "file_id": 387, + "content": "aunch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml\n```\n- For the config file usage,please refer to [config](../../tutorials/config.md).\n### Train on UCF-101 dataset\n#### download pretrain-model\n- Load the TSM model we trained on Kinetics-400 [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams), or download it through the command line\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams\n ```\n- Open `PaddleVideo/configs/recognition/tsm/tsm_ucf101_frames.yaml`, and fill in the downloaded weight path below `pretrained:`\n ```bash\n MODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNetTSM\"\n pretrained: your weight path\n ```\n#### Start training\n- By specifying different configuration files, different data formats/data sets can be used for training. Taking the training configuration of Kinetics-400 data set + 8 cards", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:91-118" + }, + "4547": { + "file_id": 387, + "content": "This code snippet is for training the TSM (Temporal Shift Module) model on the UCF-101 dataset. It involves loading a pre-trained model, specifying the configuration file, and using 8 GPUs for training. The command launches the model with amp (automatic mixed precision) and validation mode. 
The provided link shows how to download the pre-trained TSM_k400 model.", + "type": "comment" + }, + "4548": { + "file_id": 387, + "content": " + frames format as an example, the startup command is as follows (more training commands can be viewed in `PaddleVideo/run.sh`).\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml\n ```\n- Training UCF-101 dataset of videos format using scripts.\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_videos.yaml\n ```\n- AMP is useful for speeding up training, scripts as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 #MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml\n ```\n- AMP works better with `NHWC` data format, scripts as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 #MB\n export FLAGS_cudnn_exhaustive_search=1", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:118-144" + }, + "4549": { + "file_id": 387, + "content": "This code snippet provides commands to train the TSM (Temporal Shift Module) model on the UCF-101 dataset using PaddleVideo. It also demonstrates how to use AMP (Automatic Mixed Precision) for faster training and shows that it works better with `NHWC` data format. The provided commands can be executed in a terminal, specifying the required arguments like GPUs, log directory, and configuration file.", + "type": "comment" + }, + "4550": { + "file_id": 387, + "content": " export FLAGS_cudnn_batchnorm_spatial_persistent=1\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames_nhwc.yaml\n ```\n## Test\nPut the weight of the model to be tested into the `output/TSM/` directory, the test command is as follows\n```bash\npython3 main.py --test -c configs/recognition/tsm/tsm.yaml -w output/TSM/TSM_best.pdparams\n```\n---\nWhen the test configuration uses the following parameters, the evaluation accuracy on the validation data set of Kinetics-400 is as follows:\n| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |\n| :--------: | :---------------: | :-------: | :-----------: | :-----: | :-----------: | :-----------: |\n| ResNet50 | Uniform | NCHW | 8 | 224 | 71.06 | [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) |\nWhen the test configuration uses the following parameters, the evaluation accuracy on the validation data set of UCF-101 is as follows:", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:145-166" + }, + "4551": { + "file_id": 387, + "content": "This code exports the flag for CUDNN batch normalization spatial persistent and runs a Python script to test the model with specified configuration files. 
The testing command takes the best model weights from a directory and evaluates the accuracy on validation datasets of Kinetics-400 and UCF-101.", + "type": "comment" + }, + "4552": { + "file_id": 387, + "content": "| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :-------------: | :-----------------: | :-----: | :---------: | :---: | :---------: |\n| ResNet50 | Uniform | NCHW | 8 | 224 | 94.42 | [TSM_ucf101_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_nchw.pdparams) |\n| ResNet50 | Uniform | NCHW+AMP | 8 | 224 | 94.40 | [TSM_ucf101_amp_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nchw.pdparams) |\n| ResNet50 | Uniform | NHWC+AMP | 8 | 224 | 94.55 | [TSM_ucf101_amp_nhwc.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nhwc.pdparams) |\n## Inference\n### export inference model\nTo get model architecture file `TSM.pdmodel` and parameters file `TSM.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml \\", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:168-181" + }, + "4553": { + "file_id": 387, + "content": "This code provides information about different TSM (Temporal Shift Module) models trained using ResNet50 backbone with three sampling methods: Uniform, NCHW, NHWC+AMP. It shows the training strategy, number of segments, target size, and Top-1 accuracy for each model. It also mentions where to find the corresponding checkpoints and provides instructions on how to export the inference model using Python script.", + "type": "comment" + }, + "4554": { + "file_id": 387, + "content": " -p data/TSM_k400.pdparams \\\n -o inference/TSM\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/tsm/tsm_k400_frames.yaml \\\n --model_file inference/TSM/TSM.pdmodel \\\n --params_file inference/TSM/TSM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n## Implementation details\n### data processing\n- The model reads the `mp4` data in the Kinetics-400 data set, first divides each piece of video data into `num_seg` segments, and then uniformly extracts 1 frame of image from each segment to obtain sparsely sampled `num_seg` video frames. Then do the same random data enhancement to this `n", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:182-203" + }, + "4555": { + "file_id": 387, + "content": "This code is running a model inference for TSM (Temporal Shift Module) on an input video file using PaddlePaddle framework. It specifies the necessary arguments including the input file, configuration file, and model files. The --use_gpu and --use_tensorrt options are set to True and False respectively. 
The data processing step involves dividing the video into segments, extracting frames randomly, and applying random data enhancement.", + "type": "comment" + }, + "4556": { + "file_id": 387, + "content": "um_seg` frame image, including multi-scale random cropping, random left and right flips, data normalization, etc., and finally zoom to `target_size`.\n### Training strategy\n* Use Momentum optimization algorithm training, momentum=0.9\n* Using L2_Decay, the weight attenuation coefficient is 1e-4\n* Using global gradient clipping, the clipping factor is 20.0\n* The total number of epochs is 50, and the learning rate will be attenuated by 0.1 times when the epoch reaches 20 and 40\n* The learning rate of the weight and bias of the FC layer are respectively 5 times and 10 times the overall learning rate, and the bias does not set L2_Decay\n* Dropout_ratio=0.5\n### Parameter initialization\n- Initialize the weight of the FC layer with the normal distribution of Normal(mean=0, std=0.001), and initialize the bias of the FC layer with a constant of 0\n## Reference\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsm.md:203-221" + }, + "4557": { + "file_id": 387, + "content": "The code outlines the training strategy for TSM (Temporal Shift Module) model, which includes using Momentum optimization algorithm with L2_Decay, global gradient clipping, and attenuating the learning rate at certain epochs. It also specifies the total number of epochs, learning rates for FC layer weights and biases, Dropout ratio, and parameter initialization methods.", + "type": "comment" + }, + "4558": { + "file_id": 388, + "content": "/english_documents/model_zoo/recognition/tsn.md", + "type": "filepath" + }, + "4559": { + "file_id": 388, + "content": "This code introduces TSN, a 2D-CNN-based video classification solution that utilizes sparse sampling and ResNet-50 as its backbone. It trains on Kinetics-400 dataset with pre-trained weights, provides data preparation/model config details, tests different methods/backbones, and exports an \"TSN\" inference model.", + "type": "summary" + }, + "4560": { + "file_id": 388, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/tsn.md) | English\n# TSN\n## Content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Details](#Details)\n- [Reference](#Reference)\n## Introduction\nTemporal Segment Network (TSN) is a classic 2D-CNN-based solution in the field of video classification. This method mainly solves the problem of long-term behavior recognition of video, and replaces dense sampling by sparsely sampling video frames, which can not only capture the global information of the video, but also remove redundancy and reduce the amount of calculation. The core idea is to average the features of each frame as the overall feature of the video, and then enter the classifier for classification. The model implemented by this code is a TSN network based on a single-channel RGB image, and Backbone uses the ResNet-50 structure.\n
", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn.md:1-20" + }, + "4561": { + "file_id": 388, + "content": "This code introduces TSN (Temporal Segment Network), a 2D-CNN-based solution for video classification. It uses sparse sampling to capture global information, reduce redundancy, and decrease computational burden. The model is based on single-channel RGB images and utilizes ResNet-50 as the backbone.", + "type": "comment" + }, + "4562": { + "file_id": 388, + "content": "
\nFor details, please refer to the ECCV 2016 paper [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)\n## Data\nPaddleVide provides training and testing scripts on the Kinetics-400 dataset. Kinetics-400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Load the ResNet50 weights trained on ImageNet1000 as Backbone initialization parameters [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams), or download through the command line\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/tsn/tsn_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNet\"", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn.md:21-48" + }, + "4563": { + "file_id": 388, + "content": "This code provides instructions for training the Temporal Segment Networks model on the Kinetics-400 dataset. It explains how to download and add pre-trained ResNet50 weights as initialization parameters, and specifies where to find more information about data preparation and model configuration.", + "type": "comment" + }, + "4564": { + "file_id": 388, + "content": " pretrained: fill in the path here\n ```\n#### Start training\n- Kinetics-400 data set uses 8 cards for training, the training start command for frames format data is as follows\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsn main.py --validate -c configs/recognition/ tsn/tsn_k400_frames.yaml\n ```\n## Test\nSince the sampling method of the TSN model test mode is **TenCrop** with a slower speed but higher accuracy, which is different from the **CenterCrop** used in the verification mode during the training process, the verification index `topk Acc` recorded in the training log It does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. 
The command is as follows:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsn main.py --test -c configs/recognition/ tsn/tsn_k400_frames.yaml -w \"output/TSN/TSN_best.pdparams\"", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn.md:49-65" + }, + "4565": { + "file_id": 388, + "content": "Start training: Use Kinetics-400 dataset and 8 GPUs for training, command to start the training process.\nTest: TSN model test mode uses TenCrop method for better accuracy, different from training's CenterCrop; obtain final index by testing best model after training completes.", + "type": "comment" + }, + "4566": { + "file_id": 388, + "content": "```\nWhen the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :-------------: | :---------------: | :-----: | :---------: | :---: | :----------------------------------------------------------: |\n| ResNet50 | TenCrop | NCHW | 3 | 224 | 69.81 | [TSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) |\n| ResNet50 | TenCrop | NCHW | 8 | 224 | 71.70 | [TSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400_8.pdparams) |\n## Inference\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml \\\n -p data/TSN_k400.pdparams \\\n -o inference/TSN", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn.md:66-81" + }, + "4567": { + "file_id": 388, + "content": "The code is providing test indicator results for TSN model on the validation dataset of Kinetics-400 using different backbone, sampling methods, and training strategies. It also shows the checkpoints' URLs. Additionally, it exports an inference model named \"TSN\" into a folder called \"inference/TSN\" from the specified configuration file, model parameters, and output directory.", + "type": "comment" + }, + "4568": { + "file_id": 388, + "content": "```\nThe above command will generate the model structure file `TSN.pdmodel` and the model weight file `TSN.pdiparams` required for prediction.\nFor the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-Model Reasoning)\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/tsn/tsn_k400_frames.yaml \\\n --model_file inference/TSN/TSN.pdmodel \\\n --params_file inference/TSN/TSN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n## Details\n**data processing:**\n- The model reads the `mp4` data in the Kinetics-400 data set, first divides each piece of video data into `num_seg` segments, and then evenly extracts 1 frame of image from each segment to obtain sparsely sampled `num_seg` video frames , And then do the same random da", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn.md:82-103" + }, + "4569": { + "file_id": 388, + "content": "This code is for generating and using the TSN model in PaddlePaddle for video recognition. It generates a model structure file (TSN.pdmodel) and weight file (TSN.pdiparams), and then uses predict.py to predict the labels of frames from a video file (example.avi) using the generated files, with GPU acceleration enabled. 
The model reads frames sparsely sampled from videos in the Kinetics-400 dataset, divides them into segments, extracts one frame per segment, and applies random data augmentation.", + "type": "comment" + }, + "4570": { + "file_id": 388, + "content": "ta enhancement to this `num_seg` frame image, including multi-scale random cropping, random left and right flips, data normalization, etc., and finally zoom to `target_size`\n**training strategy:**\n- Use Momentum optimization algorithm for training, momentum=0.9\n- Using L2_Decay, the weight attenuation coefficient is 1e-4\n- Use global gradient clipping, with a clipping factor of 40.0\n- The total number of epochs is 100, and the learning rate will be attenuated by 0.1 times when the epoch reaches 40 and 80\n- Dropout_ratio=0.4\n**parameter initialization**\n- The convolutional layer of the TSN model uses Paddle's default [KaimingNormal](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/KaimingNormal_cn.html#kaimingnormal) and [Constant](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/Constant_cn.html#constant) initialization method, with Normal(mean=0, std= 0.01) normal distribution to initialize the weight of the FC layer, and a constant 0 to initialize the bias of the FC layer", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn.md:103-119" + }, + "4571": { + "file_id": 388, + "content": "Enhances `num_seg` frame image with multi-scale random cropping, flips, normalization, and zooms to `target_size`. Momentum optimization is used for training, L2 decay with 1e-4 attenuation coefficient, global gradient clipping with a factor of 40.0. Total epochs are 100, learning rate decreases at epochs 40 and 80, dropout_ratio=0.4. KaimingNormal and Constant initializers used for convolutional layers and FC layer weights, respectively.", + "type": "comment" + }, + "4572": { + "file_id": 388, + "content": "## Reference\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn.md:121-123" + }, + "4573": { + "file_id": 388, + "content": "The code contains a reference to the paper \"Temporal Segment Networks: Towards Good Practices for Deep Action Recognition\" by Limin Wang et al., which provides information on the implementation of TSN model in PaddleVideo.", + "type": "comment" + }, + "4574": { + "file_id": 389, + "content": "/english_documents/model_zoo/recognition/tsn_dali.md", + "type": "filepath" + }, + "4575": { + "file_id": 389, + "content": "The code improves TSN model training speed with DALI in PaddleVideo, using Kinetics400/UCF101 datasets and ResNet50 pretrained models. It provides detailed guidelines for action recognition tasks, including model download, config file usage, and separate sections for tests and inferences.", + "type": "summary" + }, + "4576": { + "file_id": 389, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/tsn_dali.md) | English\n# TSN DALI\n- [Introduction](#Introduction)\n- [Requirement](#Requirement)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe aims to speed up TSN model training using DALI in this code. 
As [nvidia DALI](https://github.com/NVIDIA/DALI) not support TSN sampling way, we reimplemented segment sampling in VideoReader.\n### Performance\nTest Environment: \n```\nCard: Tesla v100\nMemory: 4 * 16G\nCuda: 9.0\nbatch_size of single card: 32\n```\n| Training way | batch cost/s | reader cost/s | ips:instance/sec | Speed up |\n| :--------------- | :--------: | :------------: | :------------: | :------------: |\n| DALI | 2.083 | 1.804 | 15.36597 | 1.41x |\n| Dataloader: num_workers=4 | 2.943 | 2.649 | 10.87460| base |\n| pytorch实现 | TODO | TODO | TODO | TODO | \n## Requirement\ndocker image:\n```\n huangjun12/paddlevideo:tsn_dali_cuda9_0\n```\nTo build container, you can use:\n```bash\nnvidia-docker run --name t", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn_dali.md:1-45" + }, + "4577": { + "file_id": 389, + "content": "This code aims to speed up the TSN (Two-Stream Networks) model training using DALI (Data Augmentation Library for Images and Videos) in PaddleVideo. The author reimplemented segment sampling in VideoReader as NVIDIA DALI does not support TSN sampling way. They tested the performance with a Tesla v100 GPU and reported improvements in batch cost/s, reader cost/s, and instance/sec compared to Dataloader and base implementation. The docker image for this implementation is huangjun12/paddlevideo:tsn_dali_cuda9_0.", + "type": "comment" + }, + "4578": { + "file_id": 389, + "content": "sn-DALI -v /home:/workspace --network=host -it --shm-size 64g -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video huangjun12/paddlevideo:tsn_dali_cuda9_0 /bin/bash\n```\n## Data\n- Kinetics400 dataset please refer to [K400 data](../../dataset/k400.md)\n- UCF101 dataset please refer to [UCF101 data](../../dataset/ucf101.md)\n## Train\n### download pretrain-model\n- Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as pretraind model:\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams\n```\nand add path to MODEL.framework.backbone.pretrained in config file as:\n```yaml\nMODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNet\"\n pretrained: your weight path\n```\n### Start training\nYou can start training by: \n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml -o log_level=\"INFO\"\n```\n- Args -c is used to specify config file,default is ```configs/recognition/tsn/tsn_dali.yaml```。", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn_dali.md:45-82" + }, + "4579": { + "file_id": 389, + "content": "This code snippet is a command for running TSN (Two-Stream Network) with DALI (Data Augmentation and Layout Innovation) on PaddleVideo. It utilizes the Kinetics400 and UCF101 datasets, downloads the ResNet50 pretrained model, and starts the training process using Python and PaddlePaddle framework. The command also specifies the GPU usage and log directory for tracking progress.", + "type": "comment" + }, + "4580": { + "file_id": 389, + "content": "- For finetune please download our trained model [TSN.pdparams]()coming soon,and specify file path with --weights. 
\n- For the config file usage,please refer to [config](../../tutorials/config.md).\n## Test\nPlease refer to [TSN Test](./tsn.md)\n## Inference\nPlease refer to [TSN Inference](./tsn.md)\n## Reference\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool", + "type": "code", + "location": "/english_documents/model_zoo/recognition/tsn_dali.md:84-98" + }, + "4581": { + "file_id": 389, + "content": "This code is providing information on how to use the TSN model for action recognition. It mentions downloading the trained model file, using a config file, and refers users to separate sections for test and inference processes. The reference section includes the original paper link.", + "type": "comment" + }, + "4582": { + "file_id": 390, + "content": "/english_documents/model_zoo/recognition/videoswin.md", + "type": "filepath" + }, + "4583": { + "file_id": 390, + "content": "The Video-Swin-Transformer model achieves SOTA accuracy on Kinetics-400, offering multi-scale modeling, efficient local attention features, and mixed-precision training. Code provides data prep, training, testing, and inference instructions for 8 GPUs, with pre-trained Swin-Transformer models available in PaddleVideo.", + "type": "summary" + }, + "4584": { + "file_id": 390, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/videoswin.md) | English\n# Video-Swin-Transformer Video Classification Model\n## content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nVideo-Swin-Transformer is a video classification model based on Swin Transformer. It utilizes Swin Transformer's multi-scale modeling and efficient local attention characteristics. It currently achieves SOTA accuracy on the Kinetics-400 data set, surpassing the same transformer structure. The TimeSformer model.\n![VideoSwin](../../../images/videoswin.jpg)\n## DATA\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams) as the Backbone initialization parameter, or download it through the wget command", + "type": "code", + "location": "/english_documents/model_zoo/recognition/videoswin.md:1-33" + }, + "4585": { + "file_id": 390, + "content": "This is a model card for the Video-Swin-Transformer video classification model, based on Swin Transformer. It utilizes multi-scale modeling and efficient local attention features to achieve SOTA accuracy on Kinetics-400 dataset. The code provides information about data preparation, training, testing, and inference.", + "type": "comment" + }, + "4586": { + "file_id": 390, + "content": " ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams # ImageNet pretrained model for VideoSwin_base\n # wget https://videotag.bj.bcebos.com/PaddleVideorelease2.2/swin_small_patch4_window7_224.pdparams # Imagenet pretrained model for VideoSwin_small\n ```\n2. 
Open `configs/recognition/videoswin/videoswin_base_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"SwinTransformer3D\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:\n ```bash\n # videos data format\n python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_base main.py --validate -c configs/recognition/video_swin_transformer/videoswin_base_k400_videos.yaml\n ```\n- Turn o", + "type": "code", + "location": "/english_documents/model_zoo/recognition/videoswin.md:35-60" + }, + "4587": { + "file_id": 390, + "content": "This code provides the steps to download a pretrained VideoSwin model, update its configuration file with the downloaded path, and finally start training it on the Kinetics400 dataset using 8 GPUs.", + "type": "comment" + }, + "4588": { + "file_id": 390, + "content": "n amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 # MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n # videos data format\n python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_base main.py --amp --validate -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../../zh-CN/contribute/config.md) for parameter usage.\n## Test\n- The Video-Swin-Transformer model is verified during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:", + "type": "code", + "location": "/english_documents/model_zoo/recognition/videoswin.md:60-75" + }, + "4589": { + "file_id": 390, + "content": "The code sets up mixed-precision training with specific flags for faster processing. It also provides command for running the PaddleVideo model, specifically Video-Swin-Transformer, on GPUs and customizable configuration files. The accuracy is verified during training by checking for the \"best\" keyword in the log.", + "type": "comment" + }, + "4590": { + "file_id": 390, + "content": " ```log\n Already save the best model (top1 acc)0.7258\n ```\n- Since the sampling method of the Video-Swin-Transformer model test mode is a bit slower but more accurate **UniformCrop**, which is different from the **CenterCrop** used in the verification mode during the training process, so the verification recorded in the training log The index `topk Acc` does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. 
The command is as follows:\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_base main.py --test -c configs/recognition/video_swin_transformer/videoswin_base_k400_videos.yaml -w \"output/VideoSwin_base/VideoSwin_base_best.pdparams\"\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n | backbone | Sampling method | num_seg | target_s", + "type": "code", + "location": "/english_documents/model_zoo/recognition/videoswin.md:77-89" + }, + "4591": { + "file_id": 390, + "content": "Code snippet shows how to test the best Video-Swin-Transformer model after training, using a different sampling method (UniformCrop) for improved accuracy. The command provided demonstrates how to execute the test with specific configuration settings and input files, resulting in evaluation metrics on the Kinetics-400 validation dataset.", + "type": "comment" + }, + "4592": { + "file_id": 390, + "content": "ize | Top-1 | checkpoints | pretrain model |\n | :--------------------: | :-------------: | :-----: | :---------: | :---- | :------------------------------------------------------------------------------------------------------------------------: | :----: |\n | Swin-Transformer_base | UniformCrop | 32 | 224 | 82.40 | [SwinTransformer_k400_base.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_base_k400.pdparams) | [swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams) |\n | Swin-Transformer_small | UniformCrop | 32 | 224 | 80.18 | [SwinTransformer_k400_small.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_small_k400.pdparams) | [swin_small_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams) |", + "type": "code", + "location": "/english_documents/model_zoo/recognition/videoswin.md:89-92" + }, + "4593": { + "file_id": 390, + "content": "The table displays pre-trained model checkpoints for Swin-Transformer in PaddleVideo's model zoo, including the model size, input image size, top-1 accuracy, and corresponding URLs for downloading the pdparams files.", + "type": "comment" + }, + "4594": { + "file_id": 390, + "content": "## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml \\\n -p data/VideoSwin_base_k400.pdparams \\\n -o inference/VideoSwin_base\n```\nThe above command will generate the model structure file `VideoSwin_base.pdmodel` and the model weight file `VideoSwin_base.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Inference](../../usage.md#2-infer)\n### Use predictive engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/videoswin/videoswin_base_k400_videos.yaml \\\n --model_file inference/VideoSwin_base/VideoSwin_base.pdmodel \\\n --params_file inference/VideoSwin_base/VideoSwin_base.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```", + "type": "code", + "location": "/english_documents/model_zoo/recognition/videoswin.md:94-117" + }, + "4595": { + "file_id": 390, + "content": "This code snippet provides instructions for exporting an inference model and using the predictive engine inference in PaddleVideo. 
The first command generates the necessary files (`.pdmodel` and `.pdiparams`) required for prediction, while the second command performs the actual prediction on a given input video file with specified configuration and model files.", + "type": "comment" + }, + "4596": { + "file_id": 390, + "content": "The output example is as follows:\n```log\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9999829530715942\n```\nIt can be seen that using the Video-Swin-Transformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By referring to the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.\n## Reference\n- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei", + "type": "code", + "location": "/english_documents/model_zoo/recognition/videoswin.md:119-131" + }, + "4597": { + "file_id": 390, + "content": "This code showcases an example of using the Video-Swin-Transformer model trained on Kinetics-400 to predict a video file. The output includes the top-1 class and score, and referring to the category id and name correspondence table allows for identifying the predicted category name.", + "type": "comment" + }, + "4598": { + "file_id": 391, + "content": "/english_documents/model_zoo/segmentation/asrf.md", + "type": "filepath" + }, + "4599": { + "file_id": 391, + "content": "ASRF is an improved video action segmentation model built upon ms-tcn, utilizing PaddlePaddle framework for training and exporting inference models. It provides accuracy results and performance metrics, with examples for running inference on PaddleVideo.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/46.json b/docs/data/46.json new file mode 100644 index 000000000..c0d8d0767 --- /dev/null +++ b/docs/data/46.json @@ -0,0 +1,541 @@ +{ + "4600": { + "file_id": 391, + "content": "[简体中文](../../../zh-CN/model_zoo/segmentation/asrf.md) | English\n# ASRF : Video Action Segmentation Model\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nASRF model is an improvement on the video motion segmentation model ms-tcn, which was published on WACV in 2021. We reproduce the officially implemented pytorch code and obtain approximate results in paddlevideo.\n

\nMS-TCN Overview
\n## Data\nASRF can choose 50salads, breakfast, gtea as trianing set. Please refer to Video Action Segmentation dataset download and preparation doc [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md)\nUnlike MS-TCN, ASRF model requires additional data construction. The script process is as follows\n```bash\npython data/50salads/prepare_asrf_data.py --dataset_dir data/\n```\n## Train\nAfter prepare dataset, we can run sprits.", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/asrf.md:1-35" + }, + "4601": { + "file_id": 391, + "content": "ASRF is an improved video action segmentation model built upon ms-tcn, which was published in 2021. It utilizes the PaddlePaddle framework and can be trained on datasets such as 50salads, breakfast, or gtea. The model requires additional data construction using a specific script for preparation.", + "type": "comment" + }, + "4602": { + "file_id": 391, + "content": "```bash\n# gtea dataset\nexport CUDA_VISIBLE_DEVICES=3\npython3.7 main.py --validate -c configs/segmentation/asrf/asrf_gtea.yaml\n```\n- Start the training by using the above command line or script program. There is no need to use the pre training model. The video action segmentation model is usually a full convolution network. Due to the different lengths of videos, the `DATASET.batch_size` of the video action segmentation model is usually set to `1`, that is, batch training is not required. At present, only **single sample** training is supported.\n## Test\nTest MS-TCN on dataset scripts:\n```bash\npython main.py --test -c configs/segmentation/asrf/asrf_gtea.yaml --weights=./output/ASRF/ASRF_split_1.pdparams\n```\n- The specific implementation of the index is to calculate ACC, edit and F1 scores by referring to the test script[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the author of ms-tcn.\nThe reproduction of pytorch comes from the official [code base](https://github.com/yiskw713/asrf)", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/asrf.md:37-55" + }, + "4603": { + "file_id": 391, + "content": "This code is running a training command for an action segmentation model (ASRF) on the GTEA dataset, specifically without using pre-training. It uses CUDA device 3 and a provided configuration file. The test command tests MS-TCN on a dataset using a previously trained model's weights. The index calculation in the test refers to an evaluation script provided by the original author of ms-tcn. 
The codebase is from the official ASRF repository in PyTorch.", + "type": "comment" + }, + "4604": { + "file_id": 391, + "content": "- The evaluation method of data set adopts the folding verification method in ms-tcn paper, and the division method of folding is the same as that in ms-tcn paper.\nAccuracy on Breakfast dataset(4 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 67.6% | 72.4% | 74.3% | 68.9% | 56.1% |\n| pytorch | 65.8% | 71.0% | 72.3% | 66.5% | 54.9% |\n| paddle | 66.1% | 71.9% | 73.3% | 67.9% | 55.7% |\nAccuracy on 50salads dataset(5 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 84.5% | 79.3% | 82.9% | 83.5% | 77.3% |\n| pytorch | 81.4% | 75.6% | 82.7% | 81.2% | 77.2% |\n| paddle | 81.6% | 75.8% | 83.0% | 81.5% | 74.8% |\nAccuracy on gtea dataset(4 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 77.3% | 83.7% | 89.4% | 87.8% | 79.8% |\n| pytorch | 76.3% | 79.6% | 87.3% | 85.8% | 74.9% |", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/asrf.md:57-80" + }, + "4605": { + "file_id": 391, + "content": "The code provides accuracy results for different models on three datasets, Breakfast, 50salads, and GTEA, using a 4 or 5-fold validation method as per the MS-TCN paper. The performance metrics include Accuracy (Acc), Edit Distance (Edit), and F1 scores at different thresholds (F1@0.1, F1@0.25, F1@0.5).", + "type": "comment" + }, + "4606": { + "file_id": 391, + "content": "| paddle | 77.1% | 83.3% | 88.9% | 87.5% | 79.1% |\nModel weight for gtea\nTest_Data| F1@0.5 | checkpoints |\n| :----: | :----: | :---- |\n| gtea_split1 | 72.4409 | [ASRF_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_1.pdparams) |\n| gtea_split2 | 76.6666 | [ASRF_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_2.pdparams) |\n| gtea_split3 | 84.5528 | [ASRF_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_3.pdparams) |\n| gtea_split4 | 82.6771 | [ASRF_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_4.pdparams) |\n## Infer\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/segmentation/asrf/asrf_gtea.yaml \\\n -p data/ASRF_gtea_split_1.pdparams \\\n -o inference/ASRF\n```\nTo get model architecture file `ASRF.pdmodel` and parameters file `ASRF.pdiparams`, use:", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/asrf.md:81-100" + }, + "4607": { + "file_id": 391, + "content": "Table showing model weight for gtea with corresponding F1@0.5 and checkpoint links, followed by command to export inference model for ASRF_gtea using given parameters.", + "type": "comment" + }, + "4608": { + "file_id": 391, + "content": "- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\nInput file are the file list for infering, for example:\n```\nS1_Cheese_C1.npy\nS1_CofHoney_C1.npy\nS1_Coffee_C1.npy\nS1_Hotdog_C1.npy\n...\n```\n```bash\npython3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \\\n --config configs/segmentation/asrf/asrf_gtea.yaml \\\n --model_file 
inference/ASRF/ASRF.pdmodel \\\n --params_file inference/ASRF/ASRF.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```bash\nresult write in : ./inference/infer_results/S1_Cheese_C1.txt\nresult write in : ./inference/infer_results/S1_CofHoney_C1.txt\nresult write in : ./inference/infer_results/S1_Coffee_C1.txt\nresult write in : ./inference/infer_results/S1_Hotdog_C1.txt\nresult write in : ./inference/infer_results/S1_Pealate_C1.txt", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/asrf.md:102-131" + }, + "4609": { + "file_id": 391, + "content": "This code provides an example of how to run model inference using the ASRF segmentation model from PaddleVideo. The input file should contain a list of .npy files, and the code demonstrates how to execute it with specific configuration, model, and parameter files. It also shows the location where the results will be written after inference is complete.", + "type": "comment" + }, + "4610": { + "file_id": 391, + "content": "result write in : ./inference/infer_results/S1_Peanut_C1.txt\nresult write in : ./inference/infer_results/S1_Tea_C1.txt\n```\n## Reference\n- [Alleviating Over-segmentation Errors by Detecting Action Boundaries](https://arxiv.org/pdf/2007.06866v1.pdf), Yuchi Ishikawa, Seito Kasai, Yoshimitsu Aoki, Hirokatsu Kataoka", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/asrf.md:132-139" + }, + "4611": { + "file_id": 391, + "content": "Writes the inference results to separate text files for \"Peanut\" and \"Tea\" scenes.", + "type": "comment" + }, + "4612": { + "file_id": 392, + "content": "/english_documents/model_zoo/segmentation/cfbi.md", + "type": "filepath" + }, + "4613": { + "file_id": 392, + "content": "The code implements the CFBI Video Object Segmentation model proposed by Baidu in ECCV 2020, considering background and foreground for segmentation, predicting on current frames given reference frame and previous frame. It follows DAVIS guidelines, uses \"cfbip_davis.yaml\" configuration file, pretrained weights \"CFBIp_davis.pdparams\", saves predictions to \"result_root\", provides evaluation metrics including J&F-Mean, and references checkpoint file \"CFBIp_r101_davis.pdparams\".", + "type": "summary" + }, + "4614": { + "file_id": 392, + "content": "[简体中文](../../../zh-CN/model_zoo/recognition/cfbi.md) | English\n# CFBI\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Test](#Test)\n- [Reference](#Reference)\n## Introduction\nCFBI is a Video Object Segmentation model proposed by Baidu in ECCV 2020. This method consider background should be equally treated and thus propose Collaborative video object segmentation by Foreground-Background Integration (CFBI) approach. Our CFBI implicitly imposes the feature embedding from the target foreground object and its corresponding background to be contrastive, promoting the segmentation results accordingly. Given the image and target segmentation of the reference frame (the first frame) and the previous frame, the model will predict the segmentation of the current frame.\n
\n## Data\nPlease refer to DAVIS data download and preparation doc [DAVIS-data](../../dataset/davis.md)\n## Test\n- Test scripts:", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/cfbi.md:1-29" + }, + "4615": { + "file_id": 392, + "content": "This code describes the CFBI Video Object Segmentation model, proposed by Baidu in ECCV 2020. It considers background as important as foreground and uses collaborative integration for segmentation. The model predicts segmentation of current frames given reference frame and previous frame. Data preparation follows DAVIS guidelines.", + "type": "comment" + }, + "4616": { + "file_id": 392, + "content": "```bash\npython3.7 main.py --test -c configs/segmentation/cfbip_davis.yaml -w CFBIp_davis.pdparams\n```\n- Predicted results will be saved in `result_root`. To get evaluation metrics, please use [davis2017-evaluation tools](https://github.com/davisvideochallenge/davis2017-evaluation).\nMetrics on DAVIS:\n| J&F-Mean | J-Mean | J-Recall | J-Decay | F-Mean | F-Recall | F-Decay | checkpoints |\n| :------: | :-----: | :----: | :----: | :----: | :----: | :----: | :----: |\n| 0.823 | 0.793 | 0.885 | 0.083 | 0.852 | 0.932 | 0.100 | [CFBIp_r101_davis.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/CFBIp_r101_davis.pdparams) |\n## Reference\n- [Collaborative Video Object Segmentation by Foreground-Background Integration](https://arxiv.org/abs/2003.08333), Zongxin Yang, Yunchao Wei, Yi Yang", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/cfbi.md:31-46" + }, + "4617": { + "file_id": 392, + "content": "This code is running a segmentation model trained using the \"cfbip_davis.yaml\" configuration file, and testing it with pretrained weights stored in \"CFBIp_davis.pdparams\". The predicted results will be saved to the \"result_root\" directory. Evaluation metrics for this model on DAVIS dataset are provided, including J&F-Mean, J-Mean, J-Recall, J-Decay, F-Mean, F-Recall and F-Decay. The checkpoint file is referenced as \"CFBIp_r101_davis.pdparams\" which can be found at the provided URL.", + "type": "comment" + }, + "4618": { + "file_id": 393, + "content": "/english_documents/model_zoo/segmentation/mstcn.md", + "type": "filepath" + }, + "4619": { + "file_id": 393, + "content": "This code trains and evaluates the MS-TCN video action segmentation model using provided datasets, compares performance with PaddleVideo's MSTCN, exports inference models, uses metrics like accuracy and F1 score, and runs with GPU usage enabled.", + "type": "summary" + }, + "4620": { + "file_id": 393, + "content": "[简体中文](../../../zh-CN/model_zoo/segmentation/mstcn.md) | English\n# MS-TCN : Video Action Segmentation Model\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nMs-tcn model is a classic model of video motion segmentation model, which was published on CVPR in 2019. We optimized the officially implemented pytorch code and obtained higher precision results in paddlevideo.\n

MS-TCN Overview
\n## Data\nMS-TCN can choose 50salads, breakfast, gtea as trianing set. Please refer to Video Action Segmentation dataset download and preparation doc [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md)\n## Train\nAfter prepare dataset, we can run sprits.\n```bash\n# gtea dataset\nexport CUDA_VISIBLE_DEVICES=3\npython3.7 main.py --validate -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --seed 1538574472", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/mstcn.md:1-35" + }, + "4621": { + "file_id": 393, + "content": "Introduction: MS-TCN model for video motion segmentation was published in 2019 and optimized for higher precision results in PaddleVideo.\nData: Choose from 50salads, breakfast, gtea datasets for training. Refer to Video Action Segmentation dataset download and preparation doc.\nTrain: After preparing the dataset, run scripts with provided command example.", + "type": "comment" + }, + "4622": { + "file_id": 393, + "content": "```\n- Start the training by using the above command line or script program. There is no need to use the pre training model. The video action segmentation model is usually a full convolution network. Due to the different lengths of videos, the `DATASET.batch_size` of the video action segmentation model is usually set to `1`, that is, batch training is not required. At present, only **single sample** training is supported.\n## Test\nTest MS-TCN on dataset scripts:\n```bash\npython main.py --test -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --weights=./output/MSTCN/MSTCN_split_1.pdparams\n```\n- The specific implementation of the index is to calculate ACC, edit and F1 scores by referring to the test script[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the author of ms-tcn.\n- The evaluation method of data set adopts the folding verification method in ms-tcn paper, and the division method of folding is the same as that in ms-tcn paper.\nAccuracy on Breakfast dataset(4 folding verification):", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/mstcn.md:36-52" + }, + "4623": { + "file_id": 393, + "content": "The code snippet provides instructions for training and testing the video action segmentation model, specifically MSTCN. It mentions that single sample training is supported, and demonstrates how to test MSTCN on a dataset using the provided command line or script program. 
Additionally, it explains the evaluation method used for datasets and refers to the author's provided evaluation script.", + "type": "comment" + }, + "4624": { + "file_id": 393, + "content": "| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 66.3% | 61.7% | 48.1% | 48.1% | 37.9% |\n| paddle | 65.2% | 61.5% | 53.7% | 49.2% | 38.8% |\nAccuracy on 50salads dataset(5 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 80.7% | 67.9% | 76.3% | 74.0% | 64.5% |\n| paddle | 81.1% | 71.5% | 77.9% | 75.5% | 66.5% |\nAccuracy on gtea dataset(4 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 79.2% | 81.4% | 87.5% | 85.4% | 74.6% |\n| paddle | 76.9% | 81.8% | 86.4% | 84.7% | 74.8% |\nModel weight for gtea\nTest_Data| F1@0.5 | checkpoints |\n| :----: | :----: | :---- |\n| gtea_split1 | 70.2509 | [MSTCN_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_1.pdparams) |\n| gtea_split2 | 70.7224 | [MSTCN_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_2.pdparams) |", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/mstcn.md:54-78" + }, + "4625": { + "file_id": 393, + "content": "This table compares the performance of a paper model and PaddleVideo's MSTCN model on different datasets. The metrics include accuracy (Acc), edit distance, and F1 score (F1@0.1, F1@0.25, F1@0.5). The models are validated with 5-fold cross-validation on the 50salads dataset and 4-fold on the gtea dataset. The provided checkpoints are for gtea dataset splits.", + "type": "comment" + }, + "4626": { + "file_id": 393, + "content": "| gtea_split3 | 80.0 | [MSTCN_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_3.pdparams) |\n| gtea_split4 | 78.1609 | [MSTCN_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_4.pdparams) |\n## Infer\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \\\n -p data/MSTCN_gtea_split_1.pdparams \\\n -o inference/MSTCN\n```\nTo get model architecture file `MSTCN.pdmodel` and parameters file `MSTCN.pdiparams`, use:\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\nInput file are the file list for infering, for example:\n```\nS1_Cheese_C1.npy\nS1_CofHoney_C1.npy\nS1_Coffee_C1.npy\nS1_Hotdog_C1.npy\n...\n```\n```bash\npython3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \\", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/mstcn.md:79-108" + }, + "4627": { + "file_id": 393, + "content": "This code provides instructions for exporting and using an inference model. The `export_model.py` script is used to create the architecture file (`MSTCN.pdmodel`) and parameters file (`MSTCN.pdiparams`). These files can be obtained by running the script with the given configuration file, pre-trained parameters file path, and output directory. The inference process involves providing a list of input files in the format `S1__C1.npy`. 
To execute the inference, run the `predict.py` script with the input file list as an argument.", + "type": "comment" + }, + "4628": { + "file_id": 393, + "content": " --config configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \\\n --model_file inference/MSTCN/MSTCN.pdmodel \\\n --params_file inference/MSTCN/MSTCN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```bash\nresult write in : ./inference/infer_results/S1_Cheese_C1.txt\nresult write in : ./inference/infer_results/S1_CofHoney_C1.txt\nresult write in : ./inference/infer_results/S1_Coffee_C1.txt\nresult write in : ./inference/infer_results/S1_Hotdog_C1.txt\nresult write in : ./inference/infer_results/S1_Pealate_C1.txt\nresult write in : ./inference/infer_results/S1_Peanut_C1.txt\nresult write in : ./inference/infer_results/S1_Tea_C1.txt\n```\n## Reference\n- [MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation](https://arxiv.org/pdf/1903.01945.pdf), Y. Abu Farha and J. Gall.", + "type": "code", + "location": "/english_documents/model_zoo/segmentation/mstcn.md:109-130" + }, + "4629": { + "file_id": 393, + "content": "The code is specifying the configuration file, model file, and parameter file for running the MSTCN (Multi-Stage Temporal Convolutional Network) segmentation model. It also sets the GPU usage to True and TensorRT to False.\nExample logs show the results being written into respective text files in the inference/infer_results folder.", + "type": "comment" + }, + "4630": { + "file_id": 394, + "content": "/english_documents/quick_start.md", + "type": "filepath" + }, + "4631": { + "file_id": 394, + "content": "PaddleVideo Quick Start guide covers installation, usage details, and action recognition model for classifying video files. Highlights top-5 classes with high confidence using an example command, also suggesting alternative OpenCV installation method.", + "type": "summary" + }, + "4632": { + "file_id": 394, + "content": "English | [简体中文](../zh-CN/quick_start.md)\n# PaddleVide Quick Start\n- [1. Installation](#1)\n - [1.1 Install PaddlePaddle](#11)\n - [1.2 Install PaddleVideo Whl Package](#12)\n- [2. Easy-to-Use](#2)\n - [2.1 Use by Command Line](#21)\n - [2.2 Use by Python Code](#22)\n- [3. Arguments description](#3)\n- [4.QA](#4)\n## 1. Installation\n\n### 1.1 Install PaddlePaddle\n- If you have CUDA 9 or CUDA 10 installed on your machine, please run the following command to install\n ```bash\n python3.7 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple\n ```\n- If you have no available GPU on your machine, please run the following command to install the CPU version\n ```bash\n python3.7 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple\n ```\nFor more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation.\n\n### 1.2 Install PaddleVideo Whl Package\n- option1: use pypi(recommand)", + "type": "code", + "location": "/english_documents/quick_start.md:1-36" + }, + "4633": { + "file_id": 394, + "content": "Code is an English version of the Quick Start guide for PaddleVideo. It provides information on how to install the necessary packages, use PaddleVideo by command line and Python code, describes arguments, and answers frequently asked questions. 
The code also includes instructions for installing PaddlePaddle with or without a GPU, as well as the option to install the PaddleVideo Whl Package from pypi.", + "type": "comment" + }, + "4634": { + "file_id": 394, + "content": "```bash\npip3.7 install ppvideo==2.3.0\n```\n- option2: build and install locally\n```bash\npython3.7 setup.py bdist_wheel\npython3.7 -m pip install dist/ppvideo-2.3.0-py3-none-any.whl\n```\n## 2. Easy-to-Use\n\n### 2.1 Use by Command Line\nRun shell command:\n```bash\nppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi'\n```\n- This command use `PP-TSM_v2` model to infer `data/example.avi` file in `CPU`.\n- The length of the example video is about 10s. When inference, the video is first divided into 16 segments according to the time axis, then extract one frame from each segment. Finally all frames are combined and feeded into the network.\nResults:\n```\nCurrent video file: data/example.avi\n top-1 classes: [5]\n top-1 scores: [1.]\n top-1 label names: ['archery']\n```\nAs you can see, use `PP-TSM_v2` trained on Kinetics-400 to predict `data/example.avi` video,top1 prediction class_id is `5`, scores is `1.0`, class name is `archery`.\n\n### 2.2 Use by Python Code", + "type": "code", + "location": "/english_documents/quick_start.md:38-76" + }, + "4635": { + "file_id": 394, + "content": "Install package using pip:\n```bash\npip3.7 install ppvideo==2.3.0\n```\nAlternatively, build and install locally:\n```bash\npython3.7 setup.py bdist_wheel\npython3.7 -m pip install dist/ppvideo-2.3.0-py3-none-any.whl\n```\nCommand to use by command line:\n```bash\nppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi'\n```\nThis command uses PP-TSM_v2 model on CPU for inference on data/example.avi file, divided into 16 segments and frames combined before feeding into network. Results show top-1 prediction class_id as 5, scores as 1.0, and class name as 'archery'.", + "type": "comment" + }, + "4636": { + "file_id": 394, + "content": "Run python code:\n```python\nfrom ppvideo import PaddleVideo\nclas = PaddleVideo(model_name='ppTSM_v2', use_gpu=False)\nvideo_file='data/example.avi'\nclas.predict(video_file)\n```\n- This code use `PP-TSM_v2` model to infer `data/example.avi` file in `CPU`.\nResults:\n```\nCurrent video file: data/example.avi\n top-1 classes: [5]\n top-1 scores: [1.]\n top-1 label names: ['archery']\n```\nAs you can see, use `PP-TSM_v2` trained on Kinetics-400 to predict `data/example.avi` video,top1 prediction class_id is `5`, scores is `1.0`, class name is `archery`.\n\n## 3. Arguments description\n| name | type | description |\n| :---: | :---: | :--- |\n| model_name | str | optional, model name, `'ppTSM'` or `'ppTSM_v2'`. If None, please specify the path of your inference model by args `model_file` and `params_file`. |\n| video_file | str | required, Video file path, supported format: single video file path, or folder containing multiple videos. |\n| use_gpu | bool | whether to use GPU,default True。 |\n| nu", + "type": "code", + "location": "/english_documents/quick_start.md:78-107" + }, + "4637": { + "file_id": 394, + "content": "This code uses the PaddleVideo library with the PP-TSM_v2 model for video inference on a CPU. It predicts the top-1 class, score, and label name of the provided 'data/example.avi' video file. The model is trained on Kinetics-400 dataset. 
Arguments include model name (PP-TSM or PP-TSM_v2), video file path, GPU usage, and other optional parameters.", + "type": "comment" + }, + "4638": { + "file_id": 394, + "content": "m_seg | int | The number of segments used in the TSM model, which is also the number of frames extracted from the video. 8 for `ppTSM`, 16 for `ppTSM_v2`, default 16. |\n| short_size | int | short size of frame, default 256.|\n| target_size | int | target size of frame, default 224.|\n| model_file | str | optional,inference model(`.pdmodel`)path. |\n| params_file | str | optional, inference modle(`.pdiparams`) path. |\n| batch_size | int | Batch size, default 1.|\n| use_fp16 | bool | whether to use float16,default False.|\n| use_tensorrt | bool| whether to use Tensorrt, default False.|\n| gpu_mem | int | use GPU memory, default 8000.|\n| enable_mkldnn | bool | whether to use MKLDNN, default False.|\n| top_k | int | top_k, default 1. |\n| label_name_path | str | This file consists the relation of class_id and class_name. Default use `data/k400/Kinetics-400_label_list.txt` of Kinetics-400. You can replace it with your own label file. |\ncommand example1:\n```bash\nppvideo --model_name='ppTSM_v2' --num_seg=16 --video_file=\"data/mp4\" --batch_size=2 --top_k=5", + "type": "code", + "location": "/english_documents/quick_start.md:107-122" + }, + "4639": { + "file_id": 394, + "content": "The code defines several parameters for the PaddleVideo model including the number of segments, short and target frame sizes, model file paths, batch size, use of float16, TensorRT, MKLDNN, top_k, and label name path. It also provides a command example usage of the model with specified parameters.", + "type": "comment" + }, + "4640": { + "file_id": 394, + "content": "```\nResults:\n```txt\nCurrent video file: data/mp4/example3.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]\n top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\nCurrent video file: data/mp4/example2.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]\n top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\nCurrent video file: data/mp4/example.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]\n top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\nCurrent video file: data/mp4/example1.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]", + "type": "code", + "location": "/english_documents/quick_start.md:123-142" + }, + "4641": { + "file_id": 394, + "content": "The code displays the top-5 classes, scores, and label names for five different video files. It shows that the classifier consistently identifies the same top-5 classes with high confidence for each video file, indicating a reliable classification performance.", + "type": "comment" + }, + "4642": { + "file_id": 394, + "content": " top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\n```\ncommand example1:\n```bash\nppvideo --model_name='ppTSM' --num_seg=8 --video_file=\"data/mp4\" --batch_size=2 --top_k=5\n```\n\n## 4. QA\n1. 
opecv-python Installation maybe slow, you can try:\n```\npython3.7 -m pip install opencv-python==4.2.0.32 -i https://pypi.doubanio.com/simple\n```", + "type": "code", + "location": "/english_documents/quick_start.md:143-157" + }, + "4643": { + "file_id": 394, + "content": "The code provides a list of top-5 label names for PaddleVideo's action recognition model. The command example demonstrates how to run the model with specific parameters, such as the model name, number of video segments, input video file, and batch size. Additionally, it suggests an alternative installation method for OpenCV-python if the regular installation is slow.", + "type": "comment" + }, + "4644": { + "file_id": 395, + "content": "/english_documents/tools.md", + "type": "filepath" + }, + "4645": { + "file_id": 395, + "content": "This code provides usage instructions for various tools in PaddleVideo. It shows how to retrieve model parameters, calculate FLOPs, and test an exported model (coming soon). The code examples use Python 3.7 and require specific configuration files.", + "type": "summary" + }, + "4646": { + "file_id": 395, + "content": "[简体中文](../zh-CN/tools.md) | English\n# Tools\nThis page includes the usage of some useful tools in PaddleVideo.\n## Params\nTo get the params of a model.\n```shell\npython3.7 tools/summary.py -c configs/recognization/tsm/tsm.yaml\n```\n## FLOPS\nto print FLOPs.\n```shell\npython3.7 tools/summary.py -c configs/recognization/tsm/tsm.yaml --FLOPs\n```\n## Test the export model coming soon", + "type": "code", + "location": "/english_documents/tools.md:1-22" + }, + "4647": { + "file_id": 395, + "content": "This code provides usage instructions for various tools in PaddleVideo. It shows how to retrieve model parameters, calculate FLOPs, and test an exported model (coming soon). The code examples use Python 3.7 and require specific configuration files.", + "type": "comment" + }, + "4648": { + "file_id": 396, + "content": "/english_documents/tutorials/Action Recognition Datasets", + "type": "filepath" + }, + "4649": { + "file_id": 396, + "content": "This code provides a list of useful action recognition datasets along with their respective links for further reference. These datasets are essential for training and evaluating action recognition models, each serving its specific purpose in the field of computer vision.", + "type": "summary" + }, + "4650": { + "file_id": 396, + "content": "Usefull Action Recognition Datasets.\n AVA, https://arxiv.org/abs/1705.08421\n Kinetics, https://arxiv.org/abs/1705.06950\n YouTube-8M, https://arxiv.org/abs/1609.08675\n ActivityNet, http://www.cv-foundation.org/openaccess/content_cvpr_2015/html/Heilbron_ActivityNet_A_Large-Scale_2015_CVPR_paper.html\n Moments in Time, https://arxiv.org/pdf/1801.03150.pdf\n Charades, https://arxiv.org/abs/1604.01753\n EPIC-Kitchens, https://arxiv.org/abs/1804.02748\n THUMOS, https://arxiv.org/abs/1604.06182\n UCF-101, http://crcv.ucf.edu/papers/UCF101_CRCV-TR-12-01.pdf\n HMDB51, http://serre-lab.clps.brown.edu/wp-content/uploads/2012/08/Kuehne_etal_iccv11.pdf", + "type": "code", + "location": "/english_documents/tutorials/Action Recognition Datasets:1-12" + }, + "4651": { + "file_id": 396, + "content": "This code provides a list of useful action recognition datasets along with their respective links for further reference. 
These datasets are essential for training and evaluating action recognition models, each serving its specific purpose in the field of computer vision.", + "type": "comment" + }, + "4652": { + "file_id": 397, + "content": "/english_documents/tutorials/Action Recognition Papers", + "type": "filepath" + }, + "4653": { + "file_id": 397, + "content": "This code provides a list of papers on action recognition and video classification, including TSN, SlowFast Networks, X3D, ECO, 3D ResNet, etc. The paper \"Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors\" presents an efficient method for recognizing actions from video sequences using deep convolutional descriptors and trajectory pooling.", + "type": "summary" + }, + "4654": { + "file_id": 397, + "content": "Useful Papers on Action Recognition and Video Classification.\nTSN: Temporal Segment Networks: Towards Good Practices for Deep Action Recognition, ECCV 2016\nTSM: Temporal Shift Module for Efficient Video Understanding, ICCV 2019\nSlowFast Networks for Video Recognition, ICCV 2019\nNon-local Neural Networks, CVPR 2018\nA Multigrid Method for Efficiently Training Video Models, CVPR2020\nX3D: Progressive Network Expansion for Efficient Video Recognition, CVPR2020\nECO: Efficient Convolutional Network for Online Video Understanding, ECCV 2018\n3D Resnet: Would Mega-scale Datasets Further Enhance Spatiotemporal 3D CNNs, CVPR 2018\nTPN: Temporal Pyramid Network for Action Recognition, CVPR 2020\nEvaNet: Evolving Space-Time Neural Architectures for Videos, ICCV 2019\nRepFlow: Representation Flow for Action Recognition, CVPR 2019\nMARS: Motion-Augmented RGB Stream for Action Recognition, CVPR 2019\nStNet: Local and Global Spatial-Temporal Modeling for Human Action Recognition, AAAI 2019\nAttention Cluster: Purely Attention Based Local Feature Integration for Video Classification", + "type": "code", + "location": "/english_documents/tutorials/Action Recognition Papers:1-16" + }, + "4655": { + "file_id": 397, + "content": "This code contains a list of useful papers related to action recognition and video classification, including TSN, TSM, SlowFast Networks, Non-local Neural Networks, X3D, ECO, 3D ResNet, TPN, EvaNet, RepFlow, MARS, StNet, and Attention Cluster.", + "type": "comment" + }, + "4656": { + "file_id": 397, + "content": "NeXtVLAD: An Efficient Neural Network to Aggregate Frame-level Features for Large-scale Video Classification\nC-TCN: Action localization Model by Baidu, the Champion model of ActivityNet 2018\nNeural Graph Matching Networks for Fewshot 3D Action Recognition - M. Guo et al., ECCV2018. \nTemporal 3D ConvNets using Temporal Transition Layer - A. Diba et al., CVPRW2018. \nTemporal 3D ConvNets: New Architecture and Transfer Learning for Video Classification - A. Diba et al., arXiv2017. \nAttentional Pooling for Action Recognition - R. Girdhar and D. Ramanan, NIPS2017. \nFully Context-Aware Video Prediction - Byeon et al, arXiv2017. \nHidden Two-Stream Convolutional Networks for Action Recognition - Y. Zhu et al, arXiv2017. \nDynamic Image Networks for Action Recognition - H. Bilen et al, CVPR2016. \nLong-term Recurrent Convolutional Networks for Visual Recognition and Description - J. Donahue et al, CVPR2015. \nDescribing Videos by Exploiting Temporal Structure - L. Yao et al, ICCV2015. \nReal-time Action Recognition with Enhanced Motion Vector CNNs - B. Zhang et al, CVPR2016. 
", + "type": "code", + "location": "/english_documents/tutorials/Action Recognition Papers:17-28" + }, + "4657": { + "file_id": 397, + "content": "This code contains references to various research papers in the field of action recognition and video classification, highlighting different models and architectures for these tasks.", + "type": "comment" + }, + "4658": { + "file_id": 397, + "content": "Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors - L. Wang et al, CVPR2015. ", + "type": "code", + "location": "/english_documents/tutorials/Action Recognition Papers:29-29" + }, + "4659": { + "file_id": 397, + "content": "This code refers to a paper titled \"Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors\" published in CVPR 2015 by authors L. Wang et al. This paper presents a method for action recognition using deep convolutional descriptors and trajectory pooling, offering an efficient approach to analyze and recognize actions from video sequences.", + "type": "comment" + }, + "4660": { + "file_id": 398, + "content": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers", + "type": "filepath" + }, + "4661": { + "file_id": 398, + "content": "Summary: A comprehensive list of notable Spatio-Temporal Action Detection papers and authors from 2015-2017, covering major conferences like ICCV, BMVC, ECCV, and arXiv.", + "type": "summary" + }, + "4662": { + "file_id": 398, + "content": "Usefull Spatio-Temporal Action Detection Papers.\n A Better Baseline for AVA - R. Girdhar et al., ActivityNet Workshop, CVPR2018.\n Real-Time End-to-End Action Detection with Two-Stream Networks - A. El-Nouby and G. Taylor, arXiv2018.\n Human Action Localization with Sparse Spatial Supervision - P. Weinzaepfel et al., arXiv2017.\n Unsupervised Action Discovery and Localization in Videos - K. Soomro and M. Shah, ICCV2017.\n Spatial-Aware Object Embeddings for Zero-Shot Localization and Classification of Actions - P. Mettes and C. G. M. Snoek, ICCV2017.\n Action Tubelet Detector for Spatio-Temporal Action Localization - V. Kalogeiton et al, ICCV2017. \n Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos - R. Hou et al, ICCV2017. \n Chained Multi-stream Networks Exploiting Pose, Motion, and Appearance for Action Classification and Detection - M. Zolfaghari et al, ICCV2017. \n TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal - H. Zhu et al., ICCV2017.", + "type": "code", + "location": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:1-13" + }, + "4663": { + "file_id": 398, + "content": "The code provides a list of useful Spatio-Temporal Action Detection papers and their corresponding authors, year of publication, and conference or journal.", + "type": "comment" + }, + "4664": { + "file_id": 398, + "content": " Online Real time Multiple Spatiotemporal Action Localisation and Prediction - G. Singh et al, ICCV2017. \n AMTnet: Action-Micro-Tube regression by end-to-end trainable deep architecture - S. Saha et al, ICCV2017.\n Am I Done? Predicting Action Progress in Videos - F. Becattini et al, BMVC2017.\n Generic Tubelet Proposals for Action Localization - J. He et al, arXiv2017.\n Incremental Tube Construction for Human Action Detection - H. S. Behl et al, arXiv2017.\n Multi-region two-stream R-CNN for action detection - X. Peng and C. Schmid. ECCV2016. \n Spot On: Action Localization from Pointly-Supervised Proposals - P. 
Mettes et al, ECCV2016.\n Deep Learning for Detecting Multiple Space-Time Action Tubes in Videos - S. Saha et al, BMVC2016. \n Learning to track for spatio-temporal action localization - P. Weinzaepfel et al. ICCV2015.\n Action detection by implicit intentional motion clustering - W. Chen and J. Corso, ICCV2015.\n Finding Action Tubes - G. Gkioxari and J. Malik CVPR2015. ", + "type": "code", + "location": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:14-24" + }, + "4665": { + "file_id": 398, + "content": "This code provides a list of notable research papers related to action detection and localization in videos, published between 2015 and 2017. The papers are from various conferences such as ICCV, BMVC, ECCV, and arXiv, demonstrating the advancements made in this field during that period.", + "type": "comment" + }, + "4666": { + "file_id": 398, + "content": " APT: Action localization proposals from dense trajectories - J. Gemert et al, BMVC2015. \n Spatio-Temporal Object Detection Proposals - D. Oneata et al, ECCV2014.\n Action localization with tubelets from motion - M. Jain et al, CVPR2014.\n Spatiotemporal deformable part models for action detection - Y. Tian et al, CVPR2013. \n Action localization in videos through context walk - K. Soomro et al, ICCV2015.\n Fast Action Proposals for Human Action Detection and Search - G. Yu and J. Yuan, CVPR2015. ", + "type": "code", + "location": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:25-30" + }, + "4667": { + "file_id": 398, + "content": "List of papers on action localization and detection in videos.", + "type": "comment" + }, + "4668": { + "file_id": 399, + "content": "/english_documents/tutorials/TSM.md", + "type": "filepath" + }, + "4669": { + "file_id": 399, + "content": "The Temporal Shift Module (TSM) is an efficient video understanding technique that balances performance and efficiency, capturing spatial-temporal features. Suitable for both online and offline videos, it focuses on temporal information and has a simple 2-line implementation.", + "type": "summary" + }, + "4670": { + "file_id": 399, + "content": "# 1. Background&Motivation\nAt present, the video data on the Internet is increasing rapidly, and the time users spend watching short videos and small videos is also increasing rapidly. How to analyze, process and classify the massive video resources quickly and accurately is an urgent problem to be solved. The video understanding technology can analyze the video content in multiple dimensions, understand the video semantics, and automatically classify and label the video, which greatly saves the efficiency of manual audit and costs. At the same time, accurate user recommendation is realized to improve the experience effect.\nIn this paper, we will introduce the classic model **TSM (Temporal Shift Module)** in the field of video understanding, which is proposed by **MIT** and **IBM Watson AI Lab** `Ji Lin, Chuang Gan and Songhan, etc`, to achieve the balance between effeiciency and performance and improve video understanding ability.\nThe most relevant video understanding model to TSM is the **Temporal Segment Network (TSN)** published by Limin Wang", + "type": "code", + "location": "/english_documents/tutorials/TSM.md:1-5" + }, + "4671": { + "file_id": 399, + "content": "This code snippet provides background and motivation for TSM (Temporal Shift Module), a classic model in video understanding proposed by MIT and IBM Watson AI Lab. 
The TSM aims to balance efficiency and performance while improving the ability to analyze video content in various dimensions. It is related to the Temporal Segment Network (TSN) published by Limin Wang.", + "type": "comment" + }, + "4672": { + "file_id": 399, + "content": "a series of works represented such as I3D, S3D and P3D, which carry out end-to-end joint spatial-temporal modeling through 3D convolution. Although this series of works can capture spatial-temporal features, compared with TSN, the transition from 2D convolution to 3D convolution inevitably introduces extra computation. TSM cleverly uses the idea of temporal dimension feature map shift, theoretically achieving the purpose of feature fusion and joint modeling among different frames with zero extra computing overhead compared with TSN.\n**Paper Address:** [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383v2.pdf)\nLet's have a look at the following example: if the video is played from left to right and then from right to left respectively, the subjects will give different but correct interpretation of the video, indicating that the understanding of the video is strongly dependent on the temporal information of the video. Yes !, It is the motivation why TSM is proposed.", + "type": "code", + "location": "/english_documents/tutorials/TSM.md:6-10" + }, + "4673": { + "file_id": 399, + "content": "This code describes the Temporal Shift Module (TSM), which is a method for efficient video understanding that avoids extra computation by using temporal dimension feature map shift. It is based on the concept of capturing spatial-temporal features, with a focus on temporal information in videos. This approach aims to achieve feature fusion and joint modeling among different frames without adding extra computational overhead compared to TSN.", + "type": "comment" + }, + "4674": { + "file_id": 399, + "content": "

\nIt looks interesting. Next, let's dive into the core modules of TSM.\n# 2. Key techniques used in TSM\nOn top of traditional image analysis, video analysis requires researchers to additionally model temporal information. At present, 2D CNNs and 3D CNNs are the two most commonly used approaches in video understanding: a 2D CNN requires less computation but loses part of the temporal information, while a 3D CNN models time well at a much larger computational cost. Facing this trade-off, Ji Lin, Chuang Gan and Song Han et al. from MIT and IBM Watson AI Lab proposed the Temporal Shift Module (TSM). By embedding the temporal shift module into a 2D CNN, they achieve the same video understanding ability as a 3D CNN without adding any extra computation or parameters.\n
", + "type": "code", + "location": "/english_documents/tutorials/TSM.md:11-21" + }, + "4675": { + "file_id": 399, + "content": "The code is an introduction to Temporal Shift Module (TSM) in video understanding, highlighting the trade-offs between 2D and 3D CNN methods, and how TSM embeds time displacement into 2D CNN for equivalent performance without additional computation or parameters.", + "type": "comment" + }, + "4676": { + "file_id": 399, + "content": "

\nThe rows and columns of the matrix in the figure above represent the temporal and channel dimensions of the feature map, respectively. In the TSM module, some channels are shifted forward by one step along the temporal dimension, some channels are shifted backward by one step, and the gaps left by the shift are filled with zeros. In this way, context interaction along the temporal dimension is introduced into the feature map: after the shift, the current frame contains channel information from its two adjacent frames, so a plain 2D convolution can extract spatio-temporal information from the video just like a 3D convolution.\nThis improves the model's ability to capture the temporal dimension. On this basis, the researchers further specialized the module into a TSM variant suitable for online video and a TSM variant suitable for offline video.\n
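The figures that originally illustrated this shift are not preserved in this dump. As a rough stand-in (not taken from the original page), the toy example below shows the idea on a tiny `[T, C]` feature matrix: one channel is shifted forward in time, one backward, the vacated positions are zero-filled, and the rest is left untouched.

```python
import numpy as np

# toy feature map: 4 time steps (rows) x 4 channels (columns)
x = np.arange(16, dtype=np.float32).reshape(4, 4)

shifted = np.zeros_like(x)
shifted[1:, 0] = x[:-1, 0]   # channel 0: shifted +1 in time (each frame sees the previous frame)
shifted[:-1, 1] = x[1:, 1]   # channel 1: shifted -1 in time (each frame sees the next frame)
shifted[:, 2:] = x[:, 2:]    # remaining channels: unchanged
print(shifted)               # each row (frame) now mixes information from its neighbours at zero extra FLOPs
```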
", + "type": "code", + "location": "/english_documents/tutorials/TSM.md:22-27" + }, + "4677": { + "file_id": 399, + "content": "This code describes the TSM (Temporal Segment Networks) module, which introduces context interaction on the temporal dimension in feature graphs. It does this by moving some channels forward and backward one step in the temporal dimension, filling gaps with zeros. The channel movement allows 2D convolution to extract spatial-temporal information like 3D convolution. This improves model ability in time dimension and has TSM modules suitable for online and offline videos.", + "type": "comment" + }, + "4678": { + "file_id": 399, + "content": "

\nThe bi-directional TSM module mixes both past and future spatio-temporal information, which makes it suitable for high-throughput offline video. The uni-directional TSM module, in contrast, only mixes present and past information, so it is the right choice for low-latency online video recognition.\nIn addition, the authors also considered where to insert the TSM module and compared two insertion schemes: **residual TSM** and **in-place TSM**. They found that **residual TSM** achieves better performance than **in-place TSM**, and explain that **in-place TSM** may harm the extraction of spatial information.\n
\nThe TSM module looks **so easy!** The next question is how to implement it.\n# 3. The core code of TSM\nNow that the principle is clear, let's look at how the code works. First, let's have a look at the Torch version of TSM. Unfortunately, the Torch fr", + "type": "code", + "location": "/english_documents/tutorials/TSM.md:28-40" + }, + "4679": { + "file_id": 399, + "content": "The bi-directional TSM module handles past and future spatial and temporal information and suits high-throughput offline videos, while uni-directional TSM is more appropriate for low-delay online video recognition. Residual TSM performs better than in-place TSM, which may affect spatial information extraction. The Torch version of the TSM implementation follows.", + "type": "comment" + }, + "4680": { + "file_id": 399, + "content": "amework does not provide an API for TSM, so we will have to do it by ourselves. The code is shown below:\n
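The original page embeds the PyTorch snippet as an image, which is not preserved here. The sketch below is a minimal reconstruction in the spirit of the publicly released TSM code; the function name, `fold_div` and the `[N*T, C, H, W]` layout are illustrative assumptions rather than code copied from this repository.

```python
import torch

def temporal_shift(x, n_segment, fold_div=8):
    # x: [N*T, C, H, W]; reshape to [N, T, C, H, W] so we can shift along the T axis
    nt, c, h, w = x.size()
    x = x.view(nt // n_segment, n_segment, c, h, w)
    fold = c // fold_div
    out = torch.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                  # first group of channels: pull from the next frame
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # second group: pull from the previous frame
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # remaining channels stay in place
    return out.view(nt, c, h, w)
```

In the released implementation the shift is applied on the residual branch of each block before its first convolution (the residual TSM variant discussed above), which is why only a few extra lines are needed on top of TSN.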
\nThis means that you only need to add four lines of code to TSN's codebase and you can **double the accuracy on the Something-Something dataset!!** What a simple and efficient model!\nBut...\nthe **PaddlePaddle** framework takes the needs of the majority of users into account and already provides a TSM OP, so users can call it easily.\n
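The screenshot of the PaddlePaddle call is likewise missing from this dump; a self-contained usage sketch of the built-in op is given below (the tensor shape and `num_seg` value are made up for illustration, the call itself follows the two-line example quoted later on this page).

```python
import paddle
import paddle.nn.functional as F

num_seg = 8
# [N*T, C, H, W] input: 2 clips of 8 segments each, 32 channels, 7x7 feature maps
inputs = paddle.rand([2 * num_seg, 32, 7, 7])

# shift 1/num_seg of the channels one step backward in time and another 1/num_seg forward
shifts = F.temporal_shift(inputs, num_seg, 1.0 / num_seg)
print(shifts.shape)  # [16, 32, 7, 7] -- same shape as the input
```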
\nSo you no longer have to achieve it by yourself, **it cab be called directly!!! , it can be called directly!!! , it can be called directly!!!** The important thing must say three times.\nDo you think that it is the end of the this topic? **Too young Too simple !!!**\nWe have also optimized it to increase speed by 5 times while reducing memory consumption. See the acceleration documentation [accelerate.md](./accelerate.md) for more information.", + "type": "code", + "location": "/english_documents/tutorials/TSM.md:40-58" + }, + "4681": { + "file_id": 399, + "content": "The code demonstrates a TSM model implementation in PaddlePaddle framework, allowing users to achieve Temporal Shift Module (TSM) operations without writing additional code. It significantly improves accuracy and efficiency on Something-Something datasets. The provided images visually explain the TSM implementation and the optimized version (TSM OP). Additionally, the documentation refers users to the acceleration documentation for further information on speeding up the model while reducing memory consumption.", + "type": "comment" + }, + "4682": { + "file_id": 399, + "content": "Let's have a look at how TSM is implemented using **paddlepaddle**:\n`import paddle.nn.functional as F`\n`shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)`\n**Only two lines codes !!!**, isn't it easy ?\n# Reference\n[1] [Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018](https://arxiv.org/pdf/1811.08383v2.pdf).\n[2] [Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoo Tang,and Luc Van Gool. Temporal segment networks for action recognition in videos? In Proceedings of the European Conference on Computer Vision,pages 20–36. Springer, 2016](https://arxiv.org/abs/1608.00859).", + "type": "code", + "location": "/english_documents/tutorials/TSM.md:60-73" + }, + "4683": { + "file_id": 399, + "content": "The provided code demonstrates an implementation of the Temporal Shift Module (TSM) in PaddlePaddle. It only requires two lines of code and uses the `temporal_shift` function from `paddle.nn.functional`. This makes it easy to implement the TSM for efficient video understanding, as referenced by Lin Ji et al. and Limin Wang et al.", + "type": "comment" + }, + "4684": { + "file_id": 400, + "content": "/english_documents/tutorials/Temporal Action Detection Papers", + "type": "filepath" + }, + "4685": { + "file_id": 400, + "content": "This code lists three research papers on action detection and localization in videos: Fast Action Proposals, Bag-of-fragments, and Action Localization through Context Walk.", + "type": "summary" + }, + "4686": { + "file_id": 400, + "content": "Usefull Temporal Action Detection Papers. \n Rethinking the Faster R-CNN Architecture for Temporal Action Localization - Yu-Wei Chao et al., CVPR2018\n Weakly Supervised Action Localization by Sparse Temporal Pooling Network - Phuc Nguyen et al., CVPR 2018\n Temporal Deformable Residual Networks for Action Segmentation in Videos - P. Lei and S. Todrovic., CVPR2018.\n End-to-End, Single-Stream Temporal Action Detection in Untrimmed Videos - Shayamal Buch et al., BMVC 2017\n Cascaded Boundary Regression for Temporal Action Detection - Jiyang Gao et al., BMVC 2017\n Temporal Tessellation: A Unified Approach for Video Analysis - Kaufman et al., ICCV2017. \n Temporal Action Detection with Structured Segment Networks - Y. Zhao et al., ICCV2017. 
\n Temporal Context Network for Activity Localization in Videos - X. Dai et al., ICCV2017.\n Detecting the Moment of Completion: Temporal Models for Localising Action Completion - F. Heidarivincheh et al., arXiv2017.\n CDC: Convolutional-De-", + "type": "code", + "location": "/english_documents/tutorials/Temporal Action Detection Papers:1-12" + }, + "4687": { + "file_id": 400, + "content": "This code is a list of useful temporal action detection papers, each with their respective authors and conference/journal they were published in.", + "type": "comment" + }, + "4688": { + "file_id": 400, + "content": "Convolutional Networks for Precise Temporal Action Localization in Untrimmed Videos - Z. Shou et al, CVPR2017.\n SST: Single-Stream Temporal Action Proposals - S. Buch et al, CVPR2017.\n R-C3D: Region Convolutional 3D Network for Temporal Activity Detection - H. Xu et al, arXiv2017. [code] [project web] [PyTorch]\n DAPs: Deep Action Proposals for Action Understanding - V. Escorcia et al, ECCV2016. \n Online Action Detection using Joint Classification-Regression Recurrent Neural Networks - Y. Li et al, ECCV2016. \n Temporal Action Localization in Untrimmed Videos via Multi-stage CNNs - Z. Shou et al, CVPR2016. \n Fast Temporal Activity Proposals for Efficient Detection of Human Actions in Untrimmed Videos - F. Heilbron et al, CVPR2016. \n Actionness Estimation Using Hybrid Fully Convolutional Networks - L. Wang et al, CVPR2016. \n Learning Activity Progression in LSTMs for Activity Detection and Early Detection - S. Ma et al, CVPR2016.\n End-to-end Learning of Action Detection from Frame Glimpses in Videos - S. Yeung et al, CVPR2016. ", + "type": "code", + "location": "/english_documents/tutorials/Temporal Action Detection Papers:12-21" + }, + "4689": { + "file_id": 400, + "content": "This code contains references to various papers on temporal action detection, localization, and understanding in untrimmed videos. It includes papers from different authors and years, with some including PyTorch implementation and project web links.", + "type": "comment" + }, + "4690": { + "file_id": 400, + "content": " Fast Action Proposals for Human Action Detection and Search - G. Yu and J. Yuan, CVPR2015. \n Bag-of-fragments: Selecting and encoding video fragments for event detection and recounting - P. Mettes et al, ICMR2015.\n Action localization in videos through context walk - K. Soomro et al, ICCV2015.", + "type": "code", + "location": "/english_documents/tutorials/Temporal Action Detection Papers:22-24" + }, + "4691": { + "file_id": 400, + "content": "This code provides references to three research papers related to action detection and localization in videos: \n1. Fast Action Proposals by Yu & Yuan (CVPR2015), \n2. Bag-of-fragments by Mettes et al. (ICMR2015), and \n3. Action Localization through Context Walk by Soomro et al. (ICCV2015).", + "type": "comment" + }, + "4692": { + "file_id": 401, + "content": "/english_documents/tutorials/accelerate.md", + "type": "filepath" + }, + "4693": { + "file_id": 401, + "content": "This code snippet is providing two links to tutorials, one in English and the other in Simplified Chinese (简体中文). 
The English tutorial can be accessed at \"../../zh-CN/tutorials/accelerate.md\" and the Chinese one at the current location \"PaddleVideo/english_documents/tutorials/accelerate.md\".", + "type": "summary" + }, + "4694": { + "file_id": 401, + "content": "[简体中文](../../zh-CN/tutorials/accelerate.md) | English", + "type": "code", + "location": "/english_documents/tutorials/accelerate.md:1-1" + }, + "4695": { + "file_id": 401, + "content": "This code snippet is providing two links to tutorials, one in English and the other in Simplified Chinese (简体中文). The English tutorial can be accessed at \"../../zh-CN/tutorials/accelerate.md\" and the Chinese one at the current location \"PaddleVideo/english_documents/tutorials/accelerate.md\".", + "type": "comment" + }, + "4696": { + "file_id": 402, + "content": "/english_documents/tutorials/config.md", + "type": "filepath" + }, + "4697": { + "file_id": 402, + "content": "The code demonstrates how PaddleVideo uses Inversion of Control and Dependency Injection for improved modularity, resolving coupling issues through factory classes and configuration files. It creates class instances based on configs and applies design patterns for dependency injection, using a config file for architecture, dataset, pipeline, and optimizer configurations.", + "type": "summary" + }, + "4698": { + "file_id": 402, + "content": "# Configs design\n---\nThis page shows how PaddleVideo use the basic IOC/DI technology to decouple and control the whole framework. It is flexible to increase modularity of this system and make it extensible. At last, we will explain the details of config yaml and script args.\n## Design\nFirst, when we create a new class, it is common to new a instance like:\n```python\nclass TSM():\n pass\nmodel = TSM(init_attributes)\n```\nwhen more classes are created, the coupling relationship between the calling and called method will increase sharply, obviously, we can create a factory class to solve it, like that:\n```python\nif model_name == \"TSM\":\n model = TSM()\nelif model_name == \"TSN\":\n model = TSN()\nelif ...\n```\nand\n```python\noptimizer_cfg = dict(name:\"MOMENTUM\", params: XXX)\nif optimizer_cfg.name = \"MOMENTUM\":\n optimizer = MOMENTUM(optimizer_cfg.pop(name))\nelif:\n ...\n```\nmore and more conditions have to be created though. like widly used in the Java or other platforms, we apply ```inversion of control``` and ```Dependency Inversion``` to decuople.", + "type": "code", + "location": "/english_documents/tutorials/config.md:1-37" + }, + "4699": { + "file_id": 402, + "content": "This code discusses the use of Inversion of Control (IOC) and Dependency Injection (DI) in PaddleVideo, a framework for video processing. It explains how these techniques help decouple and control the framework, increasing modularity and extensibility. The code demonstrates how traditional class instantiation can lead to coupling issues, and how IOC/DI can solve them by creating factory classes and using configuration files.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/47.json b/docs/data/47.json new file mode 100644 index 000000000..46365e1a5 --- /dev/null +++ b/docs/data/47.json @@ -0,0 +1,541 @@ +{ + "4700": { + "file_id": 402, + "content": "Second, to implenment DI, we build two components:\n- Register, to regist a class\n- Builder, to new an instance\n1. 
Register\nWe implenment a getter and a setter function to map string to an instance.\n[source code](../../paddlevideo/utils/registry.py)\n```python\n#excerpt from source code.\nclass Registry():\n def __init__(self, name):\n self._name = name\n self._obj_map = {}\n #mapping name -> object\n def register(self, obj, name):\n self._obj_map[name] = obj\n #get object\n def get(self, name):\n ret = self._obj_map.get(name)\n return ret\n```\nIt provides name -> object mapping. For example, To register an object:\n```python\n BACKBONES = Registry('backbone')\n class ResNet:\n pass\n BACKBONES.register(ResNet)\n```\nOr, use a decorator\n```python\n BACKBONES = Registry('backbone') #new a Register\n @BACKBONES.register() #regist resnet as a backbone.\n class ResNet:\n pass\n```\n2. Builder\nTo obtain a registed module.\n```python\n # Usage: To build a module.\n backbone_name = \"ResNet\"", + "type": "code", + "location": "/english_documents/tutorials/config.md:39-89" + }, + "4701": { + "file_id": 402, + "content": "This code demonstrates the implementation of Dependency Injection (DI) using a Register and Builder. The Register provides name-to-object mapping, allowing objects to be registered with a specific name. The Builder facilitates obtaining registered modules by accepting a module's name and returning the corresponding instance.", + "type": "comment" + }, + "4702": { + "file_id": 402, + "content": " b = BACKBONES.get(backbone_name)()\n```\nso that we can new(register) an instance in **where it declared**, not **where it called**, a basic DI sub-system has been created now.\nWe apply this design on many places, such as: PIPELINE, BACKBONE, HEAD, LOSS, METRIC and so on.\nFinally, We build all of the framework components from config yaml which matches the source code one by one, **It means the attributes in a configuration field is same as the init atrributes of the mathced class**, and to indicate a specified class, we always use ```name``` to mark it. like:\n```yaml\nhead:\n name: \"TSMHead\" # class name\n num_classes: 400 # TSMHead class init attributes\n ...\n```\n---\n## config yaml details\nWe separate the config to several parts, in high level:\n- **MODEL:** Architecture configuration, such as HEAD module, BACKBONE module.\n- **DATASET:** DATASET and dataloader configuration.\n- **PIPELINE:** pipeline of processing configuration.\n- **OPTIMIZER:** Optimizer configuration.\nand some unique global configurations, like", + "type": "code", + "location": "/english_documents/tutorials/config.md:90-117" + }, + "4703": { + "file_id": 402, + "content": "This code snippet is creating an instance of a class based on its name specified in the configuration file. It applies this design to various components like PIPELINE, BACKBONE, HEAD, LOSS, and METRIC for dependency injection. The attributes in the configuration field match the initialization attributes of the corresponding class. 
The config file separates architecture, dataset, pipeline, and optimizer configurations, along with global settings.", + "type": "comment" + }, + "4704": { + "file_id": 402, + "content": "- model_name\n- log_interval\n- epochs\n- resume_epoch\n- log_level\n...\nTraining script args\n- **--validate**: switch validate mode on or not\n- **--test**: switch test mode on or not\n- **--weights**: weights path\n- **-c**: config yaml path\n- **-o**: override args, one can use it like: -o DATASET.batch_size=16", + "type": "code", + "location": "/english_documents/tutorials/config.md:118-131" + }, + "4705": { + "file_id": 402, + "content": "This code snippet is describing command-line arguments for a training script. The user can switch validate or test mode on/off, provide weights and config paths, and override specific args using \"-o\" option. It also mentions the available commands for each argument.", + "type": "comment" + }, + "4706": { + "file_id": 403, + "content": "/english_documents/tutorials/customized_usage.md", + "type": "filepath" + }, + "4707": { + "file_id": 403, + "content": "This code provides information about customizing different aspects of the PaddleVideo framework, including dataset, network, solvers, metrics, and debug tools. It discusses finetuning, adding new augmentations and batch augments in the pipeline, modular design, changing frameworks, initializing functions, loss functions, step/epoch decay, creating customized solvers, adding new data processing, records, and metrics, as well as using debug levels and FAQ.", + "type": "summary" + }, + "4708": { + "file_id": 403, + "content": "[简体中文](../../zh-CN/tutorials/customized_usage.md) | English\n# Customized Usage\n## Customized Dataset\n1. finetune\nPlease refer to [finetune](../start.md#model_finetune) if only change a \"regular\" dataset.\n2. customized pipeline\n - add new augments\n - add new batch augments\n **Note**: Be care of checking the difference of different modes.\n## Customized Network\n1. module function\nPlease refer to [modular desigh](modular_design.md) for more information.\n2. customized framework\n - change framework\n - change initialized function\n - customized loss\n## Customized Solvers\n1. step decay and epoch decay\n2. customized solvers\n## Customized metrics\n - add new data processing\n - add new record\n - add new metrics\n## Debug tools\n1. Debug level\n2. FAQ", + "type": "code", + "location": "/english_documents/tutorials/customized_usage.md:1-44" + }, + "4709": { + "file_id": 403, + "content": "This code provides information about customizing different aspects of the PaddleVideo framework, including dataset, network, solvers, metrics, and debug tools. It discusses finetuning, adding new augmentations and batch augments in the pipeline, modular design, changing frameworks, initializing functions, loss functions, step/epoch decay, creating customized solvers, adding new data processing, records, and metrics, as well as using debug levels and FAQ.", + "type": "comment" + }, + "4710": { + "file_id": 404, + "content": "/english_documents/tutorials/demos", + "type": "filepath" + }, + "4711": { + "file_id": 404, + "content": "The code outlines six different tasks in action recognition and detection using various algorithms, including TSN (Two-Stream Convolutional Networks), TSM (Temporal Shift Module), SlowFast Networks, LSTM (Long Short-Term Memory), and BNM (Boundary-aware Multi-scale Network). 
The tasks include single-class action recognition, multi-class action recognition, action localization, spatio-temporal action detection, 3000-class tagging application, and highlights detection application.", + "type": "summary" + }, + "4712": { + "file_id": 404, + "content": "some useful demo todo. \n1、single-class action recognition, tsn/tsm/slowfast \n2、multi-class action recognition,lstm \n3、action localization,bmn \n4、spatio temporal action detection,todo \n5、3000-class tagging application(videotag):tsn+lstm \n6、Highlights detection application:bmn+tsn+lstm ", + "type": "code", + "location": "/english_documents/tutorials/demos:1-8" + }, + "4713": { + "file_id": 404, + "content": "The code outlines six different tasks in action recognition and detection using various algorithms, including TSN (Two-Stream Convolutional Networks), TSM (Temporal Shift Module), SlowFast Networks, LSTM (Long Short-Term Memory), and BNM (Boundary-aware Multi-scale Network). The tasks include single-class action recognition, multi-class action recognition, action localization, spatio-temporal action detection, 3000-class tagging application, and highlights detection application.", + "type": "comment" + }, + "4714": { + "file_id": 405, + "content": "/english_documents/tutorials/deployment.md", + "type": "filepath" + }, + "4715": { + "file_id": 405, + "content": "This code explains how to convert dygraph models to static models for inference and deployment using PaddleInference, and provides examples on video inference testing with predict.py and benchmarking. Support for C++ infer and PaddleHub Serving deploy are coming soon.", + "type": "summary" + }, + "4716": { + "file_id": 405, + "content": "[简体中文](../../zh-CN/tutorials/deployment.md) | English\n# Inference\n## How to convert dygraph model to static model?\nTo infer and deploy a model, we need export an inference model, or called to_static: `convert dygraph model to static model`, at first.\n```python\npython3.7 tools/export_model.py -c config_file -o output_path -p params_file\n```\nNote: In `export_model.py`, It will build a model again, and then loading the prarams. But some init params in the infer phase is different from the train phase.\nwe add `num_seg` for TSM in advanced, please add more params or modify them if it is necessary.\nplease refer to [official documents](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/04_dygraph_to_static/index_cn.html) for more information.\n## How to test the export model?\nPaddleVideo supports a test script to test the exported model.\n```python\npython3.7 tools/test_export_model.py -p params_file -i inference_folder -c config_file\n```\nWe just print the output shape, please feel free to ex", + "type": "code", + "location": "/english_documents/tutorials/deployment.md:1-24" + }, + "4717": { + "file_id": 405, + "content": "The code provides instructions on how to convert a dygraph model to a static model for inference and deployment, as well as testing the exported model using PaddleVideo's test script. The conversion is done using the \"export_model.py\" script with appropriate arguments, and some additional parameters are added for TSM. Refer to official documents for more information.", + "type": "comment" + }, + "4718": { + "file_id": 405, + "content": "tend it. 
Avtually, only test a video file by PaddleInference can make sure the exported model is right.\n## How to use PaddleInference?\nPaddleVideo supports ```tools/predict.py``` to infer\n```python\npython3.7 tools/predict.py -v example.avi --model_file \"./inference/example.pdmodel\" --params_file \"./inference/example.pdiparams\" --enable_benchmark=False --model=\"example\" --num_seg=8\n ```\n## How to test inference speed?\nPaddleVideo support a script to test inference speed\n```python\npython3.7 tools/predict.py --enable_benchmark=True --model_file=模型文件 --params_file=参数文件\n```\n## How to use C++ infer?\n coming soon\n# Deployment\n## How to use PaddleHub Serving deploy?\n coming soon\n## How to use PaddleLite deploy?\n coming soon", + "type": "code", + "location": "/english_documents/tutorials/deployment.md:24-48" + }, + "4719": { + "file_id": 405, + "content": "This code explains how to use the PaddleInference tool for testing video inference, providing examples on using predict.py and enabling benchmarking. It also mentions that support for C++ infer is coming soon, as well as instructions on using PaddleHub Serving deploy and PaddleLite deploy, which will be added later.", + "type": "comment" + }, + "4720": { + "file_id": 406, + "content": "/english_documents/tutorials/modular_design.md", + "type": "filepath" + }, + "4721": { + "file_id": 406, + "content": "This code provides a link to the Chinese version of the modular design tutorial and the English version, allowing users to switch between languages based on their preference.", + "type": "summary" + }, + "4722": { + "file_id": 406, + "content": "[简体中文](../../zh-CN/tutorials/modular_design.md) | English", + "type": "code", + "location": "/english_documents/tutorials/modular_design.md:1-1" + }, + "4723": { + "file_id": 406, + "content": "This code provides a link to the Chinese version of the modular design tutorial and the English version, allowing users to switch between languages based on their preference.", + "type": "comment" + }, + "4724": { + "file_id": 407, + "content": "/english_documents/tutorials/pp-tsm.md", + "type": "filepath" + }, + "4725": { + "file_id": 407, + "content": "This code introduces the PP-TSM, a high-performance and efficient video recognition model optimized based on TSM in PaddleVideo. It outlines various strategies like ImageNet pretraining, data augmentation, and optimizer improvements to enhance performance and achieve fast inference speed on V101 GPU with top-1 accuracy on UCF101 and Kinetics400 datasets.", + "type": "summary" + }, + "4726": { + "file_id": 407, + "content": "# High performance recognition 2D architecture PP-TSM\nPP-TSM:An Effective and Efficient video-recognition model \nPP-TSM is an optimized model based on TSM in PaddleVideo, \nwhose performance (top-1 on UCF101 and Kinetics400) and inference spped \nare better than TSM paper(https://arxiv.org/abs/1811.08383 ) and \nother open source TSM,PaddlePaddle2.0(available on pip now) or \nDaily Version( https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev ) \nis required to run PP-TSM. \nWhen only use ImageNet for pretrain and only use 8X1 sample, \nPP-TSM’s top1 reached to 89.5% and 73.5% on UCF101 and Kinetics400, \nand inference speed of FP32 on single V100 is 147 VPS on Kinectics400 dataset. \ninference speed of FP16 with TensorRT on single V100 isTODO. \nAs far as we know, under the same conditions, \ntop1=73.5% on Kinetics400 is the best performance for 2D video model until now. 
\nPP-TSM improved performance and speed of TSM with following methods: \n1、Model Tweaks: ResNet50vd ,+2.5% ", + "type": "code", + "location": "/english_documents/tutorials/pp-tsm.md:1-22" + }, + "4727": { + "file_id": 407, + "content": "This code describes the PP-TSM, a high-performance and efficient video recognition model optimized based on TSM in PaddleVideo. It mentions its better performance and inference speed compared to TSM paper and other open source TSM models. Requires PaddlePaddle2.0 for execution. When using ImageNet for pretrain and 8X1 sample, it achieves high top-1 accuracy on UCF101 and Kinetics400 datasets with fast inference speed on V100 GPU.", + "type": "comment" + }, + "4728": { + "file_id": 407, + "content": "2、ImageNet pretrain weights based on Knowledge Distillation , +1.3% \n3、beter batch size ,+0.2% \n4、beter L2 ,+0.3% \n5、label_smoothing ,+0.2% \n6、beter lr decay ,+0.15% \n7、Data augmentation ,+0.3% \n8、beter epoch num ,+0.15% \n9、bn strategy ,+0.4% \n10、integrated PaddleInference \n11、more strategies todo: Knowledge Distillation、optimizer and so on. ", + "type": "code", + "location": "/english_documents/tutorials/pp-tsm.md:23-32" + }, + "4729": { + "file_id": 407, + "content": "This code outlines several strategies implemented to improve the performance of a model, including ImageNet pretraining, better batch size and L2 values, label smoothing, better learning rate decay, data augmentation, and updated epoch numbers. The code also mentions using Knowledge Distillation, optimizer improvements, and plans for integrating PaddleInference.", + "type": "comment" + }, + "4730": { + "file_id": 408, + "content": "/english_documents/tutorials/summarize.md", + "type": "filepath" + }, + "4731": { + "file_id": 408, + "content": "Video classification tasks involve recognizing actions through RGB images and skeleton data. Concepts include temporal action localization, dense-captioning events, popular datasets, feature extraction, motion representation, and classification using deep learning methods since 2014.", + "type": "summary" + }, + "4732": { + "file_id": 408, + "content": "[简体中文](../../zh-CN/tutorials/summarize.md) | English\n# Introduction for video classification(action recognition)\n## Wide range of application scenarios\nVideo classification has a wide range of applications in many fields, such as online video platforms such as short videos, offline such as security, transportation, quality inspection and other fields。\n## Multiple subtasks\nSimilar to image tasks, video tasks can also be divided into two categories: **classification (recognition) and detection**, and these two types of tasks can be specifically subdivided by combining different scenes:\n+ Task1:Trimmed Action Recognition. Users input a trimmed video,which contains only single action,then a video tag will be output by model as depicted in fig below:\n

\n Action Classification\n
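To make Task1 concrete, the whole task boils down to producing one tag per trimmed clip. The snippet below is only an illustrative sketch (the class names and scores are invented, and no PaddleVideo API is used): it picks the top-1 class from a model's per-class scores.

```python
import numpy as np

# Hypothetical per-class scores for one trimmed clip; the names and values
# are invented for this sketch, not taken from PaddleVideo.
class_names = ["walking", "running", "jumping"]
scores = np.array([0.12, 0.81, 0.07])

top1 = int(np.argmax(scores))
print(f"video tag: {class_names[top1]} (score={scores[top1]:.2f})")
```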
\n In terms of the data modality used, classification tasks can be further subdivided into classification based on si", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:1-18" + }, + "4733": { + "file_id": 408, + "content": "Introduction to video classification (action recognition) with various applications in different fields, including online platforms and offline sectors like security, transportation, and quality inspection. Tasks include classification/recognition and detection, further subdivided by combining different scenes.", + "type": "comment" + }, + "4734": { + "file_id": 408, + "content": "ngle modality data, classification based on multi-modality data, classification based on RGB images and classification based on human skeleton, etc, as shown in the figure below:\n

\n multi-modality\n
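One simple way the modalities above can be combined (shown here only as a hedged sketch, not as anything the tutorial prescribes) is late fusion: score the clip with one classifier per modality and average the score vectors.

```python
import numpy as np

# Invented per-class score vectors from two modality-specific classifiers
# (e.g. one trained on RGB frames, one on skeleton sequences).
rgb_scores = np.array([0.62, 0.30, 0.08])
skeleton_scores = np.array([0.45, 0.47, 0.08])

# Simple late fusion: average the two score vectors and take the argmax.
fused_scores = (rgb_scores + skeleton_scores) / 2
print("fused prediction:", int(np.argmax(fused_scores)))
```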
\nIn terms of the perspective of video, it can also be divided into first-person action recognition, \nthird-person action recognition, single perspective action recognition and multi-perspective fusion action recognition. \nUsers who are interested in these fields can refer to relevant literatures.\n+ Task2:Untrimmed Video Classification. \nUnlike trimmed videos, untrimmed videos often contain multiple actions and have a long time span. \nThere are a lot of movements that we may need not paying attention to. Through the global analysis of the input long video, and then make a soft classify to mutiple categories.\n+ Task3:Temporal Action Proposal. It is similar to the ROI extraction in the image detection task. ", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:18-32" + }, + "4735": { + "file_id": 408, + "content": "This code is describing different types of classification tasks in video analysis. It covers multi-modality data, RGB images, human skeleton data, and various perspectives such as first-person, third-person, and multiple perspectives. Additionally, it mentions untrimmed videos, temporal action proposals, and ROI extraction in image detection tasks.", + "type": "comment" + }, + "4736": { + "file_id": 408, + "content": "The task is to find the video clips that may contain action in a long video with a lot of actions.\n+ Task4:Temporal Action Localization. Compared with the temporal action proposal task as mentioned above, \ntemporal action localization task is more consistent with detection task in the field of imgae, \nit requires not only to find the video segments with possible actions from the video but also to classify them,\nas shown in the figure below\n

\n Action Detection\n
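Predictions for this task are usually represented as (start, end, label, score) segments and scored against the annotations with temporal IoU. The helper below is a minimal sketch of that metric on made-up timestamps; it is not code from PaddleVideo.

```python
def temporal_iou(pred, gt):
    """IoU of two (start, end) segments given in seconds."""
    inter = max(0.0, min(pred[1], gt[1]) - max(pred[0], gt[0]))
    union = (pred[1] - pred[0]) + (gt[1] - gt[0]) - inter
    return inter / union if union > 0 else 0.0

# Invented example: a predicted segment vs. an annotated ground-truth segment.
print(temporal_iou((12.0, 19.5), (13.0, 20.0)))  # ~0.81
```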
\n+ Task5:Dense-Captioning Events. The reason why it is called dense captioning events is mainly \nbecause that this task requires video action description on the basis of temporal action localization \n(detection). That is to say, the task needs to locate the actions in a **untrimmed** video,in **temporal \ndimension** and describe the behavior of the **whole video** after obtaining many video segments which contain actions.\n## Introduction of datasets", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:33-49" + }, + "4737": { + "file_id": 408, + "content": "Task 4: Temporal Action Localization - find video segments with possible actions, classify them.\nTask 5: Dense-Captioning Events - describe untrimmed videos' actions in temporal dimension.", + "type": "comment" + }, + "4738": { + "file_id": 408, + "content": "### Classification datasets\nThe training and validation of the model cannot be done without comprehensive, \nlarge and well annotated datasets. With the deepening of research on video action recognition, \nmore and more datasets are applied to the research in this field. \nTypical datasets are as follows:\n+ KTH[1](#1)\nKTH dataset is an early small action recognition dataset, \nincluding 599 videos of 6 types of actions (walking, jumping, running, punching, waving and clapping). \nThe background is relatively still, except for the zoom in and out of the camera, \nthe camera movement is relatively slight. Since this data set is relatively small, \nit is easy to overfit when training heavy 3D networks, \nso most current researches are not based on this it.\n+ UCF10[2](#2)\nUCF101 is a medium-size dataset in which most videos are from YouTube. \nIt contains 13,320 videos with 101 types of actions. \nEach type of action is performed by 25 people, each of whom performs 4-7 sets of actions. \nThe UCF101 and HMDB51 datasets used to be the benchmarks to evaluate the effectiveness of action ", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:51-72" + }, + "4739": { + "file_id": 408, + "content": "The code provides a brief overview of popular video action recognition datasets, such as KTH and UCF101. It mentions that the datasets are essential for training and validating models, but overfitting may occur with larger 3D networks on smaller datasets like KTH.", + "type": "comment" + }, + "4740": { + "file_id": 408, + "content": "recognition model for a long time before the Kinetics dataset was released.\n+ HMDB51[3](#3)\nBrown University's proposed dataset named HMDB51 was released in 2011. \nMost of the videos come from movies, \nbut some come from public databases and online video libraries such as YouTube. \nThe datasets contains 6849 samples divided into 51 classes, \neach of which contains at least 101 samples.\n+ Kinetics[4](#4)\nKinetics is the most important large-scale action recognition dataset, which was proposed by Google's DeepMind team in 2017. The video data also comes from YouTube, with 400 categories (now expanded to 700 categories) and more than 300,000 videos (now expanded to 600,000 videos), each lasting about 10 seconds. \nThe action categories are mainly divided into three categories: \"human\", \"human and animal\", \"human and human interaction\". Kinetics can train 3D-RESNET up to 152 layers without over-fitting, \nwhich solves the problem that the previous training dataset is too small to train deep 3D network. 
", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:73-87" + }, + "4741": { + "file_id": 408, + "content": "HMDB51 is a dataset proposed by Brown University in 2011, consisting of movie and online video sources. It contains 6849 samples across 51 classes with at least 101 samples each. Kinetics is the largest action recognition dataset, created by Google's DeepMind team in 2017. It uses YouTube videos, now expanded to 600k videos in 700 categories. The categories are divided into human, human and animal, and human and human interaction. Kinetics can train deep networks like 3D-RESNET up to 152 layers without overfitting, solving the issue of small training datasets.", + "type": "comment" + }, + "4742": { + "file_id": 408, + "content": "Kinetics has replaced UCF101 and HMDB51 as the benchmark in the field of action recognition. \nAt present, most studies use this dataset for evaluation and pre-training.\n+ Something-Something[5](#5)\nSomethingV1 contains 108,499 annotated videos (V2 has expanded to 220,847), each of which last two to six seconds. These videos contain 174 kinds of actions. Different from the previous dataset, \nthe identification of this data set requires stronger time information, \nso this dataset has a very important reference value in testing the temporal modeling ability of the model.\nIn addition to the above datasets, there are Charades[6](#6) dataset for complex Action recognition, Breakfast Action[7](#7), and Sports 1M[8](#8).\n### Detection datasets\n+ THUMOS 2014\nThis dataset is from THUMOS Challenge 2014, Its training set is UCF101, validation set and test set include 1010 and 1574 undivided video clips respectively. In the action detection task, only 20 kinds of unsegmented videos of actions were labeled with sequential action fragments, ", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:88-104" + }, + "4743": { + "file_id": 408, + "content": "Kinetics is the benchmark for action recognition, replacing UCF101 and HMDB51. Most studies use this dataset for evaluation and pre-training. SomethingV1 has 108,499 annotated videos with 174 kinds of actions, requiring strong temporal modeling ability. Other datasets include Charades (complex action recognition), Breakfast Action, Sports 1M, THUMOS 2014 (action detection).", + "type": "comment" + }, + "4744": { + "file_id": 408, + "content": "including 200 validation sets (3007 action fragments) and 213 test sets (3358 action fragments).\n+ MEXaction2\nThe Mexaction2 dataset contains two types of action: horse riding and bullfighting. \nThe dataset consists of three parts: YouTube videos, horseback riding videos in UCF101, and INA videos. \nYouTube clips and horseback riding videos in UCF101 are short segmented video clips that are used as training sets. \nThe INA video is a long unsegmented video with a total length of 77 hours, \nand it is divided into three parts: training, validation and test. \nThere are 1336 action segments in the training set, 310 in the validation set and 329 in the test set. \nMoreover, the Mexaction2 dataset is characterized by very long unsegmented video lengths, \nand marked action segments only account for a very low proportion of the total video length.\n+ ActivityNet\nAt present the largest database, also contains two tasks of classification and detection. 
\nThis dataset only provides a YouTube link to the video, not a direct download of the video, ", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:105-121" + }, + "4745": { + "file_id": 408, + "content": "The code describes two datasets, Mexaction2 and ActivityNet. Mexaction2 has horse riding and bullfighting actions, split into training, validation, and test sets. It includes YouTube clips, UCF101 horseback riding videos, and an unsegmented 77-hour INA video with low marked action proportions. ActivityNet is the largest database, including classification and detection tasks, but only provides YouTube links without direct downloads.", + "type": "comment" + }, + "4746": { + "file_id": 408, + "content": "so you also need to use the YouTube download tool in Python to automatically download the videos. \nThe dataset contains 200 action categories, 20,000 (training + verification + test set) videos, \nand a total of about 700 hours of video.\n## Introduction of classic models\nAs shown in the figure, \nthe action recognition framework mainly includes three steps: \nfeature extraction, motion representation and classification. \nHow to extract spatiotemporal features of video is the core problem of action recognition and video classification.\n

\nFramework of action recognition\n
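To make the three steps in the framework concrete, here is a bare skeleton of my own (every function is a placeholder, not PaddleVideo code) showing how feature extraction, motion representation and classification compose into a clip classifier.

```python
import numpy as np

def extract_features(frames):
    # Placeholder feature extractor: one scalar per frame
    # (real systems use CNN features, IDT descriptors, etc.).
    return np.array([frame.mean() for frame in frames], dtype=np.float32)

def represent_motion(per_frame_features):
    # Placeholder temporal aggregation into a clip-level vector.
    return np.array([per_frame_features.mean(), per_frame_features.std()])

def classify(clip_vector, weights, bias):
    # Placeholder linear classifier over the clip-level representation.
    return int(np.argmax(clip_vector @ weights + bias))

frames = [np.random.rand(8, 8) for _ in range(16)]   # fake 16-frame clip
weights, bias = np.random.rand(2, 3), np.zeros(3)    # fake 3-class head
print("class:", classify(represent_motion(extract_features(frames)), weights, bias))
```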
\nAccording to different methods, action recognition (video classification) methods can be generally summarized into two stages: \nmanual feature-based method and deep learning-based method. \nTypical motion descriptors in the manual feature-based method stage include DTP and IDT, \nwhich are also the most excellent motion descriptors accepted by most researchers before deep-learning is applied in this field. ", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:122-138" + }, + "4747": { + "file_id": 408, + "content": "The code discusses the process of feature extraction, motion representation and classification in action recognition. It highlights two stages - manual feature-based method and deep learning-based method. It mentions DTP and IDT as typical motion descriptors used before deep-learning was applied. The code also shows a framework diagram for action recognition.", + "type": "comment" + }, + "4748": { + "file_id": 408, + "content": "Interinterested readers may refer to the relevant references at the end of this paper. \nSince 2014, deep learning methods have been gradually applied to the field of video classification. \nAt present, deep learning-based methods have become a hotspot of research in both academic and the practice, and the effect is far beyond the motion features of manual design. \nSince 2014, many classic network structures have been put forward by the researchers regarding the problem of how to represent motion characteristics, \nas shown in the figure below:\n

\nClassic Models\n
\nAt present,Paddlevideo has contained several classic models such as:TSN[9](#9),TSM[10](#10),slowfast[11](#11),et al.In the future,\nwe will analyze the classic models and papers in these fields. Please look forward to it\n## Introduction of competetion\n+ [ActivityNet](http://activity-net.org/challenges/2020/challenge.html)", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:139-154" + }, + "4749": { + "file_id": 408, + "content": "The code discusses the application of deep learning methods in video classification since 2014, highlighting their effectiveness beyond manual motion design. It mentions various classic network structures proposed by researchers for representing motion characteristics and includes images to illustrate these models. The code also introduces PaddleVideo's inclusion of such models like TSN, TSM, slowfast, etc., and anticipates future analysis of these classical models and papers in the field. Additionally, it references an ActivityNet competition for further context.", + "type": "comment" + }, + "4750": { + "file_id": 408, + "content": "ActivityNet is a large-scale action recognition competition. Since 2016, \nit has been held simultaneously with CVPR every year. Up to this year, \nit has been held for 4 consecutive sessions. It focuses on identifying everyday, high-level, goal-oriented activities from \nuser-generated videos taken from the Internet video portal YouTube. \nAt present, ActivityNet competition has become the most influential competition in the field of action recognition.\n## Reference\n
\n[1] Schuldt C, Laptev I, Caputo B. Recognizing Human Actions: A Local SVM Approach. Proceedings of International Conference on Pattern Recognition. Piscataway, NJ: IEEE, 2004: 23-26.\n
\n[2] Soomro K, Zamir A R, Shah M. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. arXiv:1212.0402, 2012.\n
\n[3] Kuehne H, Jhuang H, Garrote E, et al. HMDB: a large video database for human motion recognition Proceedings of IEEE International Conference on Computer Vision. Piscataway, NJ: IEEE, 2011:2556-2563.", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:156-173" + }, + "4751": { + "file_id": 408, + "content": "This code snippet provides information about the ActivityNet competition, which is a large-scale action recognition event held annually since 2016. It focuses on identifying everyday activities from user-generated YouTube videos and has become the most influential in the field of action recognition. The code also includes references to relevant research papers for further reading.", + "type": "comment" + }, + "4752": { + "file_id": 408, + "content": "
\n[4] Carreira J, Zisserman A. Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2017: 6299-6308.\n
\n[5] Goyal R, Kahou S E, Michalski V. The “something something” video database for learning and evaluating visual common sense. arXiv:1706.04261, 2017.\n
\n[6] Sigurdsson G A, Varol Gül, Wang Xiaolong, et al. Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. arXiv:1604.01753, 2016.\n
\n[7] Kuehne H, Arslan A, Serre T. The Language of Actions: Recovering the Syntax and Semantics of Goal-Directed Human Activities. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014.\n
\n[8] Karpathy A, Toderici G, Shetty S, et al. Large-Scale Video Classification with Convolutional Neural Networks. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014: 1725-1732.\n
\n[9] Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. Temporal segment networks for action recognition in videos. In Proceedings of the European Conference on Computer Vision, pages 20–36. Springer, 2016.\n
\n[10] Lin Ji, Gan Chuang, Han Song. TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383, 2018.\n
\n[11] Feichtenhofer C, Fan Haoqi, Malik J, et al. SlowFast Networks for Video Recognition. arXiv:1812.03982, 2018.\n
", + "type": "code", + "location": "/english_documents/tutorials/summarize.md:194-206" + }, + "4755": { + "file_id": 408, + "content": "The code represents a list of references for papers related to video recognition. Each reference has an identifier (id) and the corresponding paper details like authors, title, and publication information.", + "type": "comment" + }, + "4756": { + "file_id": 409, + "content": "/english_documents/usage.md", + "type": "filepath" + }, + "4757": { + "file_id": 409, + "content": "This code sets up PaddleVideo on Linux, supports multi-card training and testing with PaddlePaddle. It provides log format for phases, resumes sessions, fine-tunes with pretrained params and best accuracy achieved. The code launches PaddleVideo in distributed mode with 4 GPUs, tests, exports model, introduces `use_gpu` parameter, and benchmark results are available in benchmark document.", + "type": "summary" + }, + "4758": { + "file_id": 409, + "content": "[简体中文](../zh-CN/usage.md) | English\n# Usage\n---\nPlease refer to [installation documents](./install.md) to prepare the enviroment, and follow the steps mentioned in the [data preparation documents](./dataset/) to construct dataset, we will take you through the basic functions supported by PaddleVideo, all of it takes the ucf101 dataset with frame format as example.\nPaddleVideo only support linux operation system and GPU running time environment now.\nDefault detination folder of PaddleVideo files. running the [example config](../../configs/example.yaml) as example.\n```\nPaddleVideo\n ├── paddlevideo\n ├── ... #other source codes\n ├── output #ouput destination\n | ├── example\n | | ├── example_best.pdparams #path_to_weights\n | | └── ... \n | └── ... \n ├── log #log file destination.\n | ├── worker.0\n | ├── worker.1\n | └── ... \n └── inference #inference files destination.\n ├── .pdiparams file\n ├── .pdimodel file\n └── .pdiparmas.info file", + "type": "code", + "location": "/english_documents/usage.md:1-28" + }, + "4759": { + "file_id": 409, + "content": "This code provides instructions for setting up the environment, preparing data using the PaddleVideo library, and explains its supported functions. It also mentions that it only supports Linux operation systems with GPU environments and gives an example of how to run the library. The code outlines the default destination folders for output, log files, and inference files.", + "type": "comment" + }, + "4760": { + "file_id": 409, + "content": "```\n\n## 1. 
Train and Test\nStart running multi-cards training scripts or test scripts by `paddle.distributed.launch`, or run the `run.sh` directly.\n```bash\nsh run.sh\n```\nWe put all the start commands in advanced in the ```run.sh```, please uncomment the selected one to run.\n\n### 1.1 Train\nSwitch `--validate` on to validating while training.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n --validate \\\n -c ./configs/example.yaml\n```\nIndicating `-c` to set configuration, and one can flexible add `-o` in the script to update it.\n```bash\npython -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --validate \\\n -o DATASET.batch_size=16\n```\nIndicating `-o DATASET.batch_size=16` can update batch size to 16, please refer to [configuration](tutorials/config.md#config-yaml-details) for more information.\nAfter starting training, log files will generated, ", + "type": "code", + "location": "/english_documents/usage.md:29-71" + }, + "4761": { + "file_id": 409, + "content": "This code demonstrates how to train and test a model using PaddlePaddle, a popular deep learning framework. The training process involves running multi-card training scripts or tests by executing the `paddle.distributed.launch` command with appropriate arguments such as GPU selection, script path, and optional configuration file. The configuration file allows for flexible updates like changing batch sizes on the fly. After starting the training, log files are generated for tracking progress and analysis.", + "type": "comment" + }, + "4762": { + "file_id": 409, + "content": "and its format is shown as below, it will output to both the screen and files. Default destination of log is under the `.log/` folder, and stored in the files named like `worker.0`, `worker.1` ...\n[train phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.:\n [12/28 17:31:26] epoch:[ 1/80 ] train step:0 loss: 0.04656 lr: 0.000100 top1: 1.00000 top5: 1.00000 elapse: 0.326 reader: 0.001s ips: 98.22489 instance/sec.\n[eval phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.:\n [12/28 17:31:32] epoch:[ 80/80 ] val step:0 loss: 0.20538 top1: 0.88281 top5: 0.99219 elapse: 1.589 reader: 0.000s ips: 20.14003 instance/sec.\n[epoch end] current time, metrics, elapse time, ips, etc.\n [12/28 17:31:38] END epoch:80 val loss_avg: 0.52208 top1_avg: 0.84398 top5_avg: 0.97393 elapse_avg: 0.234 reader_avg: 0.000 elapse_sum: 7.021s ips: 136.73686 instance/sec.\n[the best Acc] \n [12/28 17:28:42] Already save the best model (top1 acc)0.8494", + "type": "code", + "location": "/english_documents/usage.md:71-89" + }, + "4763": { + "file_id": 409, + "content": "The code shows log output format for training and validation phases, including time, epoch, batch ID, metrics, elapse time (execution time), and ips (instances per second). 
It also displays the best accuracy achieved during training.", + "type": "comment" + }, + "4764": { + "file_id": 409, + "content": "\n### 1.2 Resume\nIndicate `-o resume_epoch` to resume, It will training from ```resume_epoch``` epoch, PaddleVideo will auto load optimizers parameters and checkpoints from `./output` folder, as it is the default output destination.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --validate \\\n -o resume_epoch=5\n```\n\n### 1.3 Finetune\nIndicate `--weights` to load pretrained parameters, PaddleVideo will auto treat it as a finetune mission.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --validate \\\n --weights=./outputs/example/path_to_weights\n```\nNote: PaddleVideo will NOT load shape unmatched parameters.\n\n### 1.4 Test\nSwitch `--test` on to start test mode, and indicate `--weights` to load pretrained model.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3", + "type": "code", + "location": "/english_documents/usage.md:91-132" + }, + "4765": { + "file_id": 409, + "content": "The code provides instructions on how to use PaddleVideo for three different tasks: resuming a training session, finetuning with pretrained parameters, and testing. In the resume task, the user should indicate \"-o resume_epoch\" to continue from a specific epoch, while in finetuning, \"--weights\" is used to load pretrained parameters. The test mode is activated using \"--test\". PaddleVideo will not load unmatched parameters.", + "type": "comment" + }, + "4766": { + "file_id": 409, + "content": "python3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --test \\\n --weights=./output/example/path_to_weights\n```\n\n## 2. Infer\nFirst, export model.\nIndicate `-c` to set configuration, `-p` to load pretrained model, `-o` to set inference files destination.\n```bash\npython tools/export_model.py \\\n -c ./configs/example.yaml \\\n -p ./output/example/path_to_weights \\\n -o ./inference\n```\nIt will generate `model_name.pdmodel` , `model_name.pdiparams` and `model_name.pdiparames.info`.\nSecond, start PaddleInference engine to infer a video.\n```bash\npython tools/predict.py \\\n --input_file \"data/example.avi\" \\\n --model_file \"./inference/example.pdmodel\" \\\n --params_file \"./inference/example.pdiparams\" \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nAttributes:\n+ `input_file`: input file path or input directory, which contains input files(s).\n+ `model_file`: pdmodel file path.\n+ `params_file`: pdiparams file path.\n+ `use_tensorrt`: use tensorrt to acclerate or not, default: False.", + "type": "code", + "location": "/english_documents/usage.md:134-174" + }, + "4767": { + "file_id": 409, + "content": "This code is launching PaddleVideo in distributed mode with four GPUs, running the main.py script with a specified configuration file, and performing testing using weights from a particular path. Then it exports the model for inference by specifying the configuration file, pretrained weights, and output directory. 
Lastly, it uses the PaddleInference engine to infer a video using the exported model files, input video file, and optional TensorRT acceleration.", + "type": "comment" + }, + "4768": { + "file_id": 409, + "content": "+ `use_gpu`: use gpu to infer or not, default: True.\nbenchmark results are shown in th [benchmark](./benchmark.md).", + "type": "code", + "location": "/english_documents/usage.md:175-177" + }, + "4769": { + "file_id": 409, + "content": "This code snippet is referring to the `use_gpu` parameter in PaddleVideo, which enables or disables GPU usage for inferencing. The default setting is set to True and benchmark results are available in the [benchmark](./benchmark.md) document.", + "type": "comment" + }, + "4770": { + "file_id": 410, + "content": "/main.py", + "type": "filepath" + }, + "4771": { + "file_id": 410, + "content": "This code imports libraries, defines functions for PaddleVideo model training, and allows users to specify command-line arguments. It uses the Apache License 2.0 and enables parallel execution with distributed environments. The method to be executed is determined by the command line arguments.", + "type": "summary" + }, + "4772": { + "file_id": 410, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport random\nimport numpy as np\nimport paddle\nfrom paddlevideo.tasks import (test_model, train_dali, train_model,\n train_model_multigrid)\nfrom paddlevideo.utils import get_config, get_dist_info\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo train script\")\n parser.add_argument('-c',\n '--config',\n type=str,", + "type": "code", + "location": "/main.py:1-29" + }, + "4773": { + "file_id": 410, + "content": "This code imports necessary libraries and defines functions for training a PaddleVideo model. It also handles command line arguments using argparse. The script is licensed under the Apache License, Version 2.0.", + "type": "comment" + }, + "4774": { + "file_id": 410, + "content": " default='configs/example.yaml',\n help='config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('--test',\n action='store_true',\n help='whether to test a model')\n parser.add_argument('--train_dali',\n action='store_true',\n help='whether to use dali to speed up training')\n parser.add_argument('--multigrid',\n action='store_true',\n help='whether to use multigrid training')\n parser.add_argument('-w',\n '--weights',\n type=str,\n help='weights for finetuning or testing')\n parser.add_argument('--fleet',\n action='store_true',\n help='whether to use fleet run distributed training')", + "type": "code", + "location": "/main.py:30-52" + }, + "4775": { + "file_id": 410, + "content": "This code segment is defining command line arguments using the 'argparse' library for a PaddleVideo program. 
It allows users to set configuration file paths, override config options, test models, enable DALI for training, use multigrid training, specify weights for finetuning or testing, and utilize fleet run distributed training.", + "type": "comment" + }, + "4776": { + "file_id": 410, + "content": " parser.add_argument('--amp',\n action='store_true',\n help='whether to open amp training.')\n parser.add_argument(\n '--amp_level',\n type=str,\n default=None,\n help=\"optimize level when open amp training, can only be 'O1' or 'O2'.\")\n parser.add_argument(\n '--validate',\n action='store_true',\n help='whether to evaluate the checkpoint during training')\n parser.add_argument(\n '--seed',\n type=int,\n default=1234,\n help='fixed all random seeds when the program is running')\n parser.add_argument(\n '--max_iters',\n type=int,\n default=None,\n help='max iterations when training(this arg only used in test_tipc)')\n parser.add_argument(\n '-p',\n '--profiler_options',\n type=str,\n default=None,\n help='The option of profiler, which should be in format '\n '\\\"key1=value1;key2=value2;key3=value3\\\".')\n args = parser.parse_args()\n return args", + "type": "code", + "location": "/main.py:53-84" + }, + "4777": { + "file_id": 410, + "content": "The code adds command-line arguments for AMP (automatic mixed precision) training, validation, random seed, maximum iterations, and profiler options. It then parses these arguments to customize the program's behavior during training.", + "type": "comment" + }, + "4778": { + "file_id": 410, + "content": "def main():\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n # enable to use npu if paddle is built with npu\n if paddle.is_compiled_with_custom_device('npu') :\n cfg.__setattr__(\"use_npu\", True)\n elif paddle.device.is_compiled_with_xpu():\n cfg.__setattr__(\"use_xpu\", True)\n # set seed if specified\n seed = args.seed\n if seed is not None:\n assert isinstance(\n seed, int), f\"seed must be a integer when specified, but got {seed}\"\n random.seed(seed)\n np.random.seed(seed)\n paddle.seed(seed)\n # set amp_level if amp is enabled\n if args.amp:\n if args.amp_level is None:\n args.amp_level = 'O1' # set defaualt amp_level to 'O1'\n else:\n assert args.amp_level in [\n 'O1', 'O2'\n ], f\"amp_level must be 'O1' or 'O2' when amp enabled, but got {args.amp_level}.\"\n _, world_size = get_dist_info()\n parallel = world_size != 1\n if parallel:\n paddle.distributed.init_parallel_env()", + "type": "code", + "location": "/main.py:87-118" + }, + "4779": { + "file_id": 410, + "content": "This code snippet defines a `main` function that parses arguments, configures settings based on provided overrides and device availability (NPU or XPU), sets seed for random number generation if specified, and enables parallel execution using Paddle's distributed environment.", + "type": "comment" + }, + "4780": { + "file_id": 410, + "content": " if args.test:\n test_model(cfg, weights=args.weights, parallel=parallel)\n elif args.train_dali:\n train_dali(cfg, weights=args.weights, parallel=parallel)\n elif args.multigrid:\n train_model_multigrid(cfg,\n world_size=world_size,\n validate=args.validate)\n else:\n train_model(cfg,\n weights=args.weights,\n parallel=parallel,\n validate=args.validate,\n use_fleet=args.fleet,\n use_amp=args.amp,\n amp_level=args.amp_level,\n max_iters=args.max_iters,\n profiler_options=args.profiler_options)\nif __name__ == '__main__':\n main()", + "type": "code", + "location": "/main.py:120-141" + }, + "4781": { + 
"file_id": 410, + "content": "This code determines the method to be executed based on command line arguments. If '--test' is given, it executes 'test_model'. If '--train_dali' is given, it executes 'train_dali'. If '--multigrid' is given, it executes 'train_model_multigrid'. Otherwise, it executes 'train_model', passing the necessary parameters to perform model training or validation.", + "type": "comment" + }, + "4782": { + "file_id": 411, + "content": "/paddlevideo/__init__.py", + "type": "filepath" + }, + "4783": { + "file_id": 411, + "content": "This code snippet is importing the paddlevideo_version from the version module. This suggests that this file is serving as an initialization point for the PaddleVideo library, potentially setting up necessary imports or defining constants and functions to be used throughout the library.", + "type": "summary" + }, + "4784": { + "file_id": 411, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .version import paddlevideo_version", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/__init__.py:1-15" + }, + "4785": { + "file_id": 411, + "content": "This code snippet is importing the paddlevideo_version from the version module. This suggests that this file is serving as an initialization point for the PaddleVideo library, potentially setting up necessary imports or defining constants and functions to be used throughout the library.", + "type": "comment" + }, + "4786": { + "file_id": 412, + "content": "/paddlevideo/loader/__init__.py", + "type": "filepath" + }, + "4787": { + "file_id": 412, + "content": "This code is part of the PaddleVideo library, containing imports and definitions for functions related to dataset building and data loading. It allows users to build datasets, dataloaders, batch pipelines, and utilize the TSN_Dali_loader and get_input_data functions.", + "type": "summary" + }, + "4788": { + "file_id": 412, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .builder import build_dataset, build_dataloader, build_batch_pipeline\nfrom .dataset import VideoDataset\nfrom .dali_loader import TSN_Dali_loader, get_input_data\n__all__ = [\n 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset',\n 'TSN_Dali_loader', 'get_input_data'\n]", + "type": "code", + "location": "/paddlevideo/loader/__init__.py:1-22" + }, + "4789": { + "file_id": 412, + "content": "This code is part of the PaddleVideo library, containing imports and definitions for functions related to dataset building and data loading. It allows users to build datasets, dataloaders, batch pipelines, and utilize the TSN_Dali_loader and get_input_data functions.", + "type": "comment" + }, + "4790": { + "file_id": 413, + "content": "/paddlevideo/loader/builder.py", + "type": "filepath" + }, + "4791": { + "file_id": 413, + "content": "The code constructs a PaddleVideo pipeline for preprocessing data and builds a data loader for distributed model training, handling variable batch sizes and using mix_collate_fn to collate data.", + "type": "summary" + }, + "4792": { + "file_id": 413, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport signal\nimport os\nimport paddle\nfrom paddle.io import DataLoader, DistributedBatchSampler\nfrom .registry import DATASETS, PIPELINES\nfrom ..utils.build_utils import build\nfrom .pipelines.compose import Compose\nfrom paddlevideo.utils import get_logger\nfrom paddlevideo.utils.multigrid import DistributedShortSampler\nimport numpy as np\nlogger = get_logger(\"paddlevideo\")\ndef build_pipeline(cfg):\n \"\"\"Build pipeline.", + "type": "code", + "location": "/paddlevideo/loader/builder.py:1-29" + }, + "4793": { + "file_id": 413, + "content": "The code is building a pipeline for PaddleVideo. It imports necessary libraries and classes, utilizes function to build the pipeline, logs information using get_logger from paddlevideo.utils, and adheres to Apache License 2.0. 
The purpose of this code seems to be related to data preprocessing and possibly model training in a distributed environment.", + "type": "comment" + }, + "4794": { + "file_id": 413, + "content": " Args:\n cfg (dict): root config dict.\n \"\"\"\n if cfg == None:\n return\n return Compose(cfg)\ndef build_dataset(cfg):\n \"\"\"Build dataset.\n Args:\n cfg (dict): root config dict.\n Returns:\n dataset: dataset.\n \"\"\"\n #XXX: ugly code here!\n cfg_dataset, cfg_pipeline = cfg\n cfg_dataset.pipeline = build_pipeline(cfg_pipeline)\n dataset = build(cfg_dataset, DATASETS, key=\"format\")\n return dataset\ndef build_batch_pipeline(cfg):\n batch_pipeline = build(cfg, PIPELINES)\n return batch_pipeline\ndef build_dataloader(dataset,\n batch_size,\n num_workers,\n places,\n shuffle=True,\n drop_last=True,\n multigrid=False,\n collate_fn_cfg=None,\n **kwargs):\n \"\"\"Build Paddle Dataloader.\n XXX explain how the dataloader work!\n Args:\n dataset (paddle.dataset): A PaddlePaddle dataset object.\n batch_size (int): batch size on single card.", + "type": "code", + "location": "/paddlevideo/loader/builder.py:30-74" + }, + "4795": { + "file_id": 413, + "content": "build_dataset: Builds a dataset using provided config dictionary, building pipeline first.\nbuild_batch_pipeline: Constructs the batch pipeline using config from the PIPELINES module.\nbuild_dataloader: Creates Paddle Dataloader object using specified parameters and dataset.", + "type": "comment" + }, + "4796": { + "file_id": 413, + "content": " num_worker (int): num_worker\n shuffle(bool): whether to shuffle the data at every epoch.\n \"\"\"\n if multigrid:\n sampler = DistributedShortSampler(dataset,\n batch_sizes=batch_size,\n shuffle=True,\n drop_last=True)\n else:\n sampler = DistributedBatchSampler(dataset,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)\n #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix.\n # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to:\n # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose.\n def mix_collate_fn(batch):\n pipeline = build_batch_pipeline(collate_fn_cfg)\n batch = pipeline(batch)\n slots = []", + "type": "code", + "location": "/paddlevideo/loader/builder.py:75-96" + }, + "4797": { + "file_id": 413, + "content": "This code appears to be part of a data loading and processing function for a machine learning or deep learning model. It uses a sampler to manage the data, with options for shuffling and dropping the last batch if needed. The mix_collate_fn function is defined to collate the data in a specific way using a pipeline built from collate_fn_cfg.", + "type": "comment" + }, + "4798": { + "file_id": 413, + "content": " for items in batch:\n for i, item in enumerate(items):\n if len(slots) < len(items):\n slots.append([item])\n else:\n slots[i].append(item)\n return [np.stack(slot, axis=0) for slot in slots]\n #if collate_fn_cfg is not None:\n #ugly code here. 
collate_fn is mix op config\n # collate_fn = mix_collate_fn(collate_fn_cfg)\n data_loader = DataLoader(\n dataset,\n batch_sampler=sampler,\n places=places,\n num_workers=num_workers,\n collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,\n return_list=True,\n **kwargs)\n return data_loader\ndef term_mp(sig_num, frame):\n \"\"\" kill all child processes\n \"\"\"\n pid = os.getpid()\n pgid = os.getpgid(os.getpid())\n logger.info(\"main proc {} exit, kill process group \" \"{}\".format(pid, pgid))\n os.killpg(pgid, signal.SIGKILL)\n return\nsignal.signal(signal.SIGINT, term_mp)\nsignal.signal(signal.SIGTERM, term_mp)", + "type": "code", + "location": "/paddlevideo/loader/builder.py:97-132" + }, + "4799": { + "file_id": 413, + "content": "This code appears to create a data loader that can handle batches of varying lengths. It iterates through each batch and organizes the items into slots based on their length, either creating a new slot for longer items or appending them to existing ones. The DataLoader class is then instantiated with this collate_fn for processing the dataset, using the provided parameters. Additionally, signal handlers are set up to handle SIGINT and SIGTERM signals to terminate the process group if needed.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/48.json b/docs/data/48.json new file mode 100644 index 000000000..189ad2c00 --- /dev/null +++ b/docs/data/48.json @@ -0,0 +1,543 @@ +{ + "4800": { + "file_id": 414, + "content": "/paddlevideo/loader/dali_loader.py", + "type": "filepath" + }, + "4801": { + "file_id": 414, + "content": "The code imports necessary libraries, sets up a DALI reader, defines TSN_Dali_loader class, initializes parallel video preprocessing, handles potential import errors, and returns output and label using PaddleOps for normalization.", + "type": "summary" + }, + "4802": { + "file_id": 414, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport math\nimport paddle\nfrom paddle.distributed import ParallelEnv\nimport paddle.distributed as dist\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ntry:\n from nvidia.dali.pipeline import Pipeline\n import nvidia.dali.ops as ops\n import nvidia.dali.types as types\n import tempfile\n from nvidia.dali.plugin.paddle import DALIGenericIterator\nexcept:\n Pipeline = object", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:1-32" + }, + "4803": { + "file_id": 414, + "content": "This code imports necessary libraries, sets up logger, and attempts to import DALI pipeline and related functions for creating a generic iterator for PaddlePaddle. 
If any of these imports fail, it falls back by setting the respective variable as an object.", + "type": "comment" + }, + "4804": { + "file_id": 414, + "content": "def get_input_data(data):\n return paddle.to_tensor(data[0]['image']), paddle.to_tensor(\n data[0]['label'])\nclass TSN_Dali_loader(object):\n def __init__(self, cfg):\n self.batch_size = cfg.batch_size\n self.file_path = cfg.file_path\n self.num_seg = cfg.num_seg\n self.seglen = cfg.seglen\n self.short_size = cfg.short_size\n self.target_size = cfg.target_size\n # set num_shards and shard_id when distributed training is implemented\n self.num_shards = dist.get_world_size()\n self.shard_id = ParallelEnv().local_rank\n self.dali_mean = cfg.mean * (self.num_seg * self.seglen)\n self.dali_std = cfg.std * (self.num_seg * self.seglen)\n def build_dali_reader(self):\n \"\"\"\n build dali training reader\n \"\"\"\n def reader_():\n with open(self.file_path) as flist:\n full_lines = [line for line in flist]\n if (not hasattr(reader_, 'seed')):\n reader_.seed = 0\n random.Random(reader_.seed).shuffle(full_lines)", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:35-65" + }, + "4805": { + "file_id": 414, + "content": "The code defines a class `TSN_Dali_loader` that initializes attributes related to batch size, file path, number of segments, segment length, input and target image sizes. It also sets variables for distributed training, data normalization, and builds a DALI reader for training data using shuffled full lines from the file path.", + "type": "comment" + }, + "4806": { + "file_id": 414, + "content": " logger.info(f\"reader shuffle seed: {reader_.seed}.\")\n if reader_.seed is not None:\n reader_.seed += 1\n per_node_lines = int(\n math.ceil(len(full_lines) * 1.0 / self.num_shards))\n total_lines = per_node_lines * self.num_shards\n # aligned full_lines so that it can evenly divisible\n full_lines += full_lines[:(total_lines - len(full_lines))]\n assert len(full_lines) == total_lines\n # trainer get own sample\n lines = full_lines[self.shard_id:total_lines:self.num_shards]\n assert len(lines) == per_node_lines\n logger.info(\n f\"shard_id: {self.shard_id}, trainer_count: {self.num_shards}\"\n )\n logger.info(\n f\"read videos from {self.shard_id * per_node_lines}, \"\n f\"length: {per_node_lines}, \"\n f\"lines length: {len(lines)}, \"", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:66-88" + }, + "4807": { + "file_id": 414, + "content": "This code snippet initializes a reader and distributes the data evenly across multiple shards. It calculates the number of lines to be assigned to each shard based on the total number of lines and the number of shards. It then ensures that the full_lines list is an even multiple of the total_lines by appending additional items if necessary. 
The snippet asserts that the length of full_lines equals total_lines, assigns lines to trainers based on their shard ID, and logs information about the distribution of data among shards.", + "type": "comment" + }, + "4808": { + "file_id": 414, + "content": " f\"total: {len(full_lines)}\")\n video_files = ''.join([item for item in lines])\n tf = tempfile.NamedTemporaryFile()\n tf.write(str.encode(video_files))\n tf.flush()\n video_files = tf.name\n device_id = ParallelEnv().local_rank\n logger.info(f'---------- device_id: {device_id} -----------')\n pipe = VideoPipe(batch_size=self.batch_size,\n num_threads=1,\n device_id=device_id,\n file_list=video_files,\n sequence_length=self.num_seg * self.seglen,\n num_seg=self.num_seg,\n seg_length=self.seglen,\n resize_shorter_scale=self.short_size,\n crop_target_size=self.target_size,\n is_training=True,\n num_shards=self.num_shards,\n shard_id=self.shard_id,", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:89-111" + }, + "4809": { + "file_id": 414, + "content": "This code initializes a PaddlePaddle VideoPipe instance, loading and preprocessing video files in parallel for training. It sets the batch size, number of threads, device ID, file list, sequence length, number of segments, segment length, resize shorter scale, crop target size, whether it's in training mode, and the number of shards and shard ID.", + "type": "comment" + }, + "4810": { + "file_id": 414, + "content": " dali_mean=self.dali_mean,\n dali_std=self.dali_std)\n logger.info(\n 'initializing dataset, it will take several minutes if it is too large .... '\n )\n video_loader = DALIGenericIterator([pipe], ['image', 'label'],\n len(lines),\n dynamic_shape=True,\n auto_reset=True)\n return video_loader\n dali_reader = reader_()\n return dali_reader\nclass VideoPipe(Pipeline):\n def __init__(self,\n batch_size,\n num_threads,\n device_id,\n file_list,\n sequence_length,\n num_seg,\n seg_length,\n resize_shorter_scale,\n crop_target_size,\n is_training=False,\n initial_prefetch_size=20,\n num_shards=1,", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:112-142" + }, + "4811": { + "file_id": 414, + "content": "This code initializes a DALI (Data Augmentation Library for Images) generic iterator to load video data from a file list, and returns it. It uses a VideoPipe class to define the pipeline configuration, including parameters such as batch size, number of threads, device ID, sequence length, and more.", + "type": "comment" + }, + "4812": { + "file_id": 414, + "content": " shard_id=0,\n dali_mean=0.,\n dali_std=1.0):\n super(VideoPipe, self).__init__(batch_size, num_threads, device_id)\n self.input = ops.VideoReader(device=\"gpu\",\n file_list=file_list,\n sequence_length=sequence_length,\n num_seg=num_seg,\n seg_length=seg_length,\n is_training=is_training,\n num_shards=num_shards,\n shard_id=shard_id,\n random_shuffle=is_training,\n initial_fill=initial_prefetch_size)\n # the sequece data read by ops.VideoReader is of shape [F, H, W, C]\n # Because the ops.Resize does not support sequence data,\n # it will be transposed into [H, W, F, C],\n # then reshaped to [H, W, FC], and then resized like a 2-D image.", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:143-160" + }, + "4813": { + "file_id": 414, + "content": "This code initializes a VideoPipe object with the given parameters, including file list, sequence length, and number of segments. It uses ops.VideoReader to read video data from the file list in the specified format. 
Due to the limitations of resize function, it transposes and reshapes the data before performing resizing operation on the 2-D image.", + "type": "comment" + }, + "4814": { + "file_id": 414, + "content": " self.transpose = ops.Transpose(device=\"gpu\", perm=[1, 2, 0, 3])\n self.reshape = ops.Reshape(device=\"gpu\",\n rel_shape=[1.0, 1.0, -1],\n layout='HWC')\n self.resize = ops.Resize(device=\"gpu\",\n resize_shorter=resize_shorter_scale)\n # crops and mirror are applied by ops.CropMirrorNormalize.\n # Normalization will be implemented in paddle due to the difficulty of dimension broadcast,\n # It is not sure whether dimension broadcast can be implemented correctly by dali, just take the Paddle Op instead.\n self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))\n self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))\n self.mirror_generator = ops.Uniform(range=(0.0, 1.0))\n self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)\n self.crop_mirror_norm = ops.CropMirrorNormalize(\n device=\"gpu\",\n crop=[crop_target_size, crop_target_size],", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:161-176" + }, + "4815": { + "file_id": 414, + "content": "The code creates a DALI loader for image processing, with transpose, reshape, resize operations, and implements crop and mirror normalization. It also includes uniform distribution generators for position and mirror. The normalization will be implemented using PaddleOps due to the difficulty of dimension broadcasting in DALI.", + "type": "comment" + }, + "4816": { + "file_id": 414, + "content": " mean=dali_mean,\n std=dali_std)\n self.reshape_back = ops.Reshape(\n device=\"gpu\",\n shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size],\n layout='FCHW')\n self.cast_label = ops.Cast(device=\"gpu\", dtype=types.DALIDataType.INT64)\n def define_graph(self):\n output, label = self.input(name=\"Reader\")\n output = self.transpose(output)\n output = self.reshape(output)\n output = self.resize(output)\n output = output / 255.\n pos_x = self.pos_rng_x()\n pos_y = self.pos_rng_y()\n mirror_flag = self.mirror_generator()\n mirror_flag = (mirror_flag > 0.5)\n mirror_flag = self.cast_mirror(mirror_flag)\n output = self.crop_mirror_norm(output,\n crop_pos_x=pos_x,\n crop_pos_y=pos_y,\n mirror=mirror_flag)\n output = self.reshape_back(output)\n label = self.cast_label(label)", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:177-202" + }, + "4817": { + "file_id": 414, + "content": "The code defines a DALI loader and its associated operations for image processing. 
It includes mean and std for normalization, reshaping, casting to int64, transpose, resize, normalization by dividing by 255, generating positional information, cropping with mirror flag, and finally reshaping the output.", + "type": "comment" + }, + "4818": { + "file_id": 414, + "content": " return output, label\n def __len__(self):\n return self.epoch_size()", + "type": "code", + "location": "/paddlevideo/loader/dali_loader.py:203-206" + }, + "4819": { + "file_id": 414, + "content": "The code defines a method that returns an output and label, and another method for determining the length of the loader.", + "type": "comment" + }, + "4820": { + "file_id": 415, + "content": "/paddlevideo/loader/dataset/MRI.py", + "type": "filepath" + }, + "4821": { + "file_id": 415, + "content": "The MRI.py file in PaddleVideo library provides an action recognition dataset loader, utilizing a MRIDataset class for transform operations on raw frames and includes license information, copyright notices, and data structure registration. It reads data, stores components in a list, handles missing files through retry and exception handling, and logs errors. The code snippet returns a numpy array for images and another for labels from the 'results' dictionary, likely used in a function that processes data from MRI datasets where 'imgs' contains image data and 'labels' stores their corresponding labels or annotations.", + "type": "summary" + }, + "4822": { + "file_id": 415, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass MRIDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The indecx file is", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI.py:1-31" + }, + "4823": { + "file_id": 415, + "content": "The code snippet is from the MRI.py file within the PaddleVideo library, which appears to be a loader dataset for action recognition tasks. It imports necessary libraries and defines the MRIDataset class that inherits from BaseDataset. This class loads raw frames from frame files and applies specified transform operations on them. The index file is used by the dataset loader. The code also includes license information, copyright notices, and data structure registration.", + "type": "comment" + }, + "4824": { + "file_id": 415, + "content": " a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. 
code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n suffix (str): suffix of file. Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI.py:31-61" + }, + "4825": { + "file_id": 415, + "content": "This function initializes the MRI dataset object, taking the file path to the index file as well as other optional arguments. The load_file method is used to load the index file and retrieve video information.", + "type": "comment" + }, + "4826": { + "file_id": 415, + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(\n frame_dir=frame_dir,\n #suffix=self.suffix,\n frames_len=frames_len,\n labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid gisven index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI.py:62-86" + }, + "4827": { + "file_id": 415, + "content": "This code reads data from a file, splits it into different components like frame directory, frames length, and labels, and stores it in a list. It also handles missing files by retrying multiple times using exception handling.", + "type": "comment" + }, + "4828": { + "file_id": 415, + "content": " logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return np.array(results['imgs']), np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI.py:87-108" + }, + "4829": { + "file_id": 415, + "content": "The code is attempting to load frames for testing by trying multiple times in case of an exception caused by missing frames. 
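The index file format shown above (one `<frame_dir> <total_frames> <label>` triple per line, whitespace-separated) can be parsed in a few lines. The sketch below restates the `load_file` logic in isolation and assumes every line is well formed; `parse_index_file` is an illustrative name, not a PaddleVideo API.

```python
import os.path as osp

def parse_index_file(file_path, data_prefix=None):
    """Return a list of dicts with frame_dir, frames_len and labels per video."""
    info = []
    with open(file_path) as fin:
        for line in fin:
            frame_dir, frames_len, label = line.strip().split()
            if data_prefix is not None:
                frame_dir = osp.join(data_prefix, frame_dir)
            info.append({"frame_dir": frame_dir,
                         "frames_len": int(frames_len),
                         "labels": int(label)})
    return info
```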
It uses a logger to inform about the error and tries again with different frames until successful or reaching the maximum retries.", + "type": "comment" + }, + "4830": { + "file_id": 415, + "content": " return np.array(results['imgs']), np.array([results['labels']])", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI.py:109-109" + }, + "4831": { + "file_id": 415, + "content": "The code snippet returns a numpy array for images and another for labels from the 'results' dictionary. It is likely used in a function that processes data from MRI datasets, where 'imgs' contains image data and 'labels' stores their corresponding labels or annotations.", + "type": "comment" + }, + "4832": { + "file_id": 416, + "content": "/paddlevideo/loader/dataset/MRI_SlowFast.py", + "type": "filepath" + }, + "4833": { + "file_id": 416, + "content": "This code imports libraries, creates action recognition and data loading classes, and processes video data for training or validation using a pipeline, handling exceptions through retries and logging. It is part of a function that returns arrays of images and labels.", + "type": "summary" + }, + "4834": { + "file_id": 416, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass SFMRIDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The indecx file ", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI_SlowFast.py:1-31" + }, + "4835": { + "file_id": 416, + "content": "This code snippet is importing necessary libraries and registering a new dataset class named SFMRIDataset for action recognition. It uses raw frames from frame files, applies specified transform operations, and loads an index file. The copyright and license information are also included in the beginning of the file.", + "type": "comment" + }, + "4836": { + "file_id": 416, + "content": "is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n suffix (str): suffix of file. 
Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI_SlowFast.py:31-61" + }, + "4837": { + "file_id": 416, + "content": "This code is creating a class for loading an index file containing video information, including the directory of frames, total frames, and label. The constructor takes arguments like the file path, pipeline, data prefix, test mode, and suffix. The load_file function loads the index file to retrieve the video details.", + "type": "comment" + }, + "4838": { + "file_id": 416, + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(\n frame_dir=frame_dir,\n #suffix=self.suffix,\n frames_len=frames_len,\n labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid gisven index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI_SlowFast.py:62-86" + }, + "4839": { + "file_id": 416, + "content": "The code reads information from a file and stores it in a list of dictionaries. It then attempts to prepare the frames for training or validation by applying a pipeline, handling potential exceptions within a specified number of retries.", + "type": "comment" + }, + "4840": { + "file_id": 416, + "content": " logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return np.array(results['imgs'][0]), np.array(\n results['imgs'][1]), np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI_SlowFast.py:87-108" + }, + "4841": { + "file_id": 416, + "content": "This code handles error cases when loading data by retrying the operation if an exception occurs. It uses a logger to provide information on the error, the number of retries, and whether or not to try again with a different index. 
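The retry logic used by `prepare_train`/`prepare_test` in both MRI datasets (catch a decoding failure, log it, re-sample a random index, try again) can be summarized as below. This is a hedged sketch, not the original method: unlike the source, it raises explicitly once the retry budget is exhausted.

```python
import copy
import logging
import random

logger = logging.getLogger("paddlevideo")

def load_with_retries(info, pipeline, idx, num_retries=5):
    """Re-sample a random index whenever the pipeline fails, up to num_retries attempts."""
    for attempt in range(num_retries):
        try:
            return pipeline(copy.deepcopy(info[idx]))
        except Exception:
            if attempt < num_retries - 1:
                logger.info("Error when loading %s, will try again",
                            info[idx].get("frame_dir"))
                idx = random.randint(0, len(info) - 1)
    raise RuntimeError(f"failed to load a sample after {num_retries} tries")
```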
The 'prepare_test' function is responsible for preparing frames for testing.", + "type": "comment" + }, + "4842": { + "file_id": 416, + "content": " continue\n return np.array(results['imgs'][0]), np.array(\n results['imgs'][1]), np.array([results['labels']])", + "type": "code", + "location": "/paddlevideo/loader/dataset/MRI_SlowFast.py:109-111" + }, + "4843": { + "file_id": 416, + "content": "The code is part of a function that returns three arrays: the first image from the 'imgs' list, the second image, and the labels. If there are more images available, the function continues processing them; if not, it returns the stored images and labels.", + "type": "comment" + }, + "4844": { + "file_id": 417, + "content": "/paddlevideo/loader/dataset/__init__.py", + "type": "filepath" + }, + "4845": { + "file_id": 417, + "content": "The code imports various dataset classes from PaddleVideo library for video understanding tasks, and adds them to the `__all__` list for accessibility. These datasets include VideoDataset, FrameDataset, and more, with licensing information provided.", + "type": "summary" + }, + "4846": { + "file_id": 417, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .actbert_dataset import ActBertDataset\nfrom .ava_dataset import AVADataset\nfrom .bmn_dataset import BMNDataset\nfrom .davis_dataset import DavisDataset\nfrom .feature import FeatureDataset\nfrom .frame import FrameDataset, FrameDataset_Sport\nfrom .MRI import MRIDataset\nfrom .MRI_SlowFast import SFMRIDataset\nfrom .msrvtt import MSRVTTDataset\nfrom .actbert_dataset import ActBertDataset\nfrom .asrf_dataset import ASRFDataset", + "type": "code", + "location": "/paddlevideo/loader/dataset/__init__.py:1-25" + }, + "4847": { + "file_id": 417, + "content": "This code is importing various dataset classes from different modules in the PaddleVideo library. These datasets are used for video understanding tasks, such as action recognition, activity classification, and video captioning. 
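The `@DATASETS.register()` decorator that appears on every dataset class above follows a plain name-to-class registry pattern. The sketch below shows the general idea with a hypothetical `Registry` class; it is not the actual `paddlevideo.loader.registry` implementation.

```python
class Registry:
    """Minimal name -> class mapping in the spirit of the DATASETS registry."""
    def __init__(self, name):
        self.name = name
        self._map = {}

    def register(self):
        def _decorate(cls):
            self._map[cls.__name__] = cls
            return cls
        return _decorate

    def get(self, name):
        return self._map[name]

DATASETS = Registry("datasets")

@DATASETS.register()
class DummyDataset:
    pass

print(DATASETS.get("DummyDataset"))  # <class '__main__.DummyDataset'>
```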
The code also includes licensing information and mentions that these datasets can be accessed on an \"AS IS\" basis.", + "type": "comment" + }, + "4848": { + "file_id": 417, + "content": "from .ms_tcn_dataset import MSTCNDataset\nfrom .oxford import MonoDataset\nfrom .skeleton import SkeletonDataset\nfrom .slowfast_video import SFVideoDataset\nfrom .video import VideoDataset\nfrom .ucf101_skeleton import UCF101SkeletonDataset\nfrom .ucf24_dataset import UCF24Dataset\n__all__ = [\n 'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset',\n 'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset',\n 'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset',\n 'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset',\n 'UCF101SkeletonDataset', 'UCF24Dataset'\n]", + "type": "code", + "location": "/paddlevideo/loader/dataset/__init__.py:26-41" + }, + "4849": { + "file_id": 417, + "content": "This code imports several dataset classes and adds them to the `__all__` list, making them accessible within this module. The datasets include VideoDataset, FrameDataset, SFVideoDataset, BMNDataset, FeatureDataset, SkeletonDataset, AVADataset, MonoDataset, MSRVTTDataset, ActBertDataset, DavisDataset, MRIDataset, SFMRIDataset, FrameDataset_Sport, MSTCNDataset, ASRFDataset, UCF101SkeletonDataset, and UCF24Dataset.", + "type": "comment" + }, + "4850": { + "file_id": 418, + "content": "/paddlevideo/loader/dataset/actbert_dataset.py", + "type": "filepath" + }, + "4851": { + "file_id": 418, + "content": "The code sets up ActBERT dataset in PaddlePaddle's video processing library, initializing the dataset with necessary libraries and packages. It defines two methods: \"prepare_train\" for preparing frames for training and a placeholder \"prepare_test\".", + "type": "summary" + }, + "4852": { + "file_id": 418, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\ntry:\n import lmdb\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT.\"\n )\nimport pickle\nimport json\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"Warning! 
{e}, [paddlenlp] package and it's dependencies is required for ActBERT.\"", + "type": "code", + "location": "/paddlevideo/loader/dataset/actbert_dataset.py:1-31" + }, + "4853": { + "file_id": 418, + "content": "This code is importing necessary libraries and packages, checking for missing dependencies, and setting up the ActBERT dataset in PaddlePaddle's video processing library.", + "type": "comment" + }, + "4854": { + "file_id": 418, + "content": " )\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass ActBertDataset(BaseDataset):\n \"\"\"ActBert dataset.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n bert_model=\"bert-base-uncased\",\n data_prefix=None,\n test_mode=False,\n ):\n self.bert_model = bert_model\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n feature_data = np.load(self.file_path, allow_pickle=True)\n self.tokenizer = BertTokenizer.from_pretrained(self.bert_model,\n do_lower_case=True)\n self.info = []\n for item in feature_data:\n self.info.append(dict(feature=item, tokenizer=self.tokenizer))\n return self.info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid given index. \"\"\"", + "type": "code", + "location": "/paddlevideo/loader/dataset/actbert_dataset.py:32-66" + }, + "4855": { + "file_id": 418, + "content": "Class ActBertDataset is a dataset for PaddleVideo, initialized with file path, pipeline, bert_model, data_prefix and test mode. It loads the index file to get video information, uses the tokenizer from pre-trained bert model, and stores information in the info list. The load_file method is used to load the feature data and prepare the dataset for training or validation.", + "type": "comment" + }, + "4856": { + "file_id": 418, + "content": " results = copy.deepcopy(self.info[idx])\n #print('==results==', results)\n results = self.pipeline(results)\n return results['features']\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n pass", + "type": "code", + "location": "/paddlevideo/loader/dataset/actbert_dataset.py:67-74" + }, + "4857": { + "file_id": 418, + "content": "This code defines two methods: \"prepare_train\" and \"prepare_test\". The former prepares the frames for training given an index by creating a deep copy of info at that index, applies the pipeline to it, and returns the features from the result. The latter is a placeholder method with no implementation.", + "type": "comment" + }, + "4858": { + "file_id": 419, + "content": "/paddlevideo/loader/dataset/asrf_dataset.py", + "type": "filepath" + }, + "4859": { + "file_id": 419, + "content": "This PaddleVideo library code initializes a dataset class for action segmentation videos, includes methods to load data for training/validation, and loads video features, labels, and boundaries using a pipeline.", + "type": "summary" + }, + "4860": { + "file_id": 419, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass ASRFDataset(BaseDataset):\n \"\"\"Video dataset for action segmentation.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n feature_path,\n label_path,\n boundary_path,\n **kwargs,", + "type": "code", + "location": "/paddlevideo/loader/dataset/asrf_dataset.py:1-38" + }, + "4861": { + "file_id": 419, + "content": "The code imports necessary libraries and defines the ASRFDataset class for action segmentation video datasets. It registers this dataset with the DATASETS registry and initializes the dataset with specified file paths and pipeline parameters.", + "type": "comment" + }, + "4862": { + "file_id": 419, + "content": " ):\n super().__init__(file_path, pipeline, **kwargs)\n self.label_path = label_path\n self.boundary_path = boundary_path\n self.feature_path = feature_path\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n file_ptr = open(self.file_path, 'r')\n info = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID: Prepare data for training/valid given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)\n #TODO: check path\n video_feat = np.load(feat_file_path)\n # load label\n file_name = video_name.split('.')[0] + \".npy\"\n label_file_path = os.path.join(self.label_path, file_name)\n label = np.load(label_file_path).astype(np.int64)\n # load boundary\n file_name = video_name.split('.')[0] + \".npy\"", + "type": "code", + "location": "/paddlevideo/loader/dataset/asrf_dataset.py:39-68" + }, + "4863": { + "file_id": 419, + "content": "The code initializes an instance of a dataset class with file paths for labels, boundaries, and features. 
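ASRFDataset resolves three sibling `.npy` files (feature, frame-level label, boundary) from the same video name. Below is a self-contained sketch of that path handling with an illustrative function name, not the class method itself.

```python
import os
import numpy as np

def load_asrf_sample(video_name, feature_path, label_path, boundary_path):
    """Load '<stem>.npy' from each root: features, int64 labels, float32 boundary with a leading axis."""
    stem = video_name.split('.')[0] + ".npy"
    feat = np.load(os.path.join(feature_path, stem))
    label = np.load(os.path.join(label_path, stem)).astype(np.int64)
    boundary = np.expand_dims(np.load(os.path.join(boundary_path, stem)),
                              axis=0).astype(np.float32)
    return feat, label, boundary
```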
It defines methods to load index files containing video information and prepare data for training/validation, including loading video features, labels, and boundaries based on the given index.", + "type": "comment" + }, + "4864": { + "file_id": 419, + "content": " boundary_file_path = os.path.join(self.boundary_path, file_name)\n boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_label'] = copy.deepcopy(label)\n results['video_boundary'] = copy.deepcopy(boundary)\n results = self.pipeline(results)\n return results['video_feat'], results['video_label'], results['video_boundary']\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)\n #TODO: check path\n video_feat = np.load(feat_file_path)\n # load label\n file_name = video_name.split('.')[0] + \".npy\"\n label_file_path = os.path.join(self.label_path, file_name)\n label = np.load(label_file_path).astype(np.int64)", + "type": "code", + "location": "/paddlevideo/loader/dataset/asrf_dataset.py:69-92" + }, + "4865": { + "file_id": 419, + "content": "The code above is from a dataset loader class in the PaddleVideo library. It loads video features, labels, and boundaries for either training or testing data. The prepare_test function loads video features and labels given an index. The code uses numpy to load data from specified file paths and deepcopy the results for further processing by the pipeline function.", + "type": "comment" + }, + "4866": { + "file_id": 419, + "content": " # load boundary\n file_name = video_name.split('.')[0] + \".npy\"\n boundary_file_path = os.path.join(self.boundary_path, file_name)\n boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_label'] = copy.deepcopy(label)\n results['video_boundary'] = copy.deepcopy(boundary)\n results = self.pipeline(results)\n return results['video_feat'], results['video_label'], results['video_boundary']", + "type": "code", + "location": "/paddlevideo/loader/dataset/asrf_dataset.py:94-104" + }, + "4867": { + "file_id": 419, + "content": "This code snippet loads the boundary data for a video, reads it from a file using numpy's load function, and assigns it to a variable named 'boundary'. The code then creates a results dictionary, copies video features, labels, and boundaries into their respective keys in the results dictionary. Finally, it passes this dictionary through a pipeline and returns the video features, labels, and boundaries.", + "type": "comment" + }, + "4868": { + "file_id": 420, + "content": "/paddlevideo/loader/dataset/ava_dataset.py", + "type": "filepath" + }, + "4869": { + "file_id": 420, + "content": "The code introduces a spatial-temporal detection dataset class in PaddleVideo, initializes attributes and evaluation functions, loads records from paths, prepares training data by filtering proposals and annotations, pads elements to fixed lengths, and defines methods for padding 2D/1D features.", + "type": "summary" + }, + "4870": { + "file_id": 420, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport sys\nimport os\nimport pickle\nfrom datetime import datetime\nfrom ...metrics.ava_utils import ava_evaluate_results\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom collections import defaultdict\n@DATASETS.register()\nclass AVADataset(BaseDataset):\n \"\"\"AVA dataset for spatial temporal detection.\n the dataset loads raw frames, bounding boxes, proposals and applies", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:1-32" + }, + "4871": { + "file_id": 420, + "content": "This code snippet is the AVA dataset class for spatial-temporal detection, which is part of PaddleVideo. It imports necessary modules and registers the dataset in the DATASETS registry. The class inherits from BaseDataset and includes a function ava_evaluate_results for evaluation.", + "type": "comment" + }, + "4872": { + "file_id": 420, + "content": " transformations to return the frame tensors and other information.\n \"\"\"\n _FPS = 30\n def __init__(self,\n pipeline,\n file_path=None,\n exclude_file=None,\n label_file=None,\n suffix='{:05}.jpg',\n proposal_file=None,\n person_det_score_thr=0.9,\n num_classes=81,\n data_prefix=None,\n test_mode=False,\n num_max_proposals=1000,\n timestamp_start=900,\n timestamp_end=1800):\n self.custom_classes = None\n self.exclude_file = exclude_file\n self.label_file = label_file\n self.proposal_file = proposal_file\n assert 0 <= person_det_score_thr <= 1, (\n 'The value of '\n 'person_det_score_thr should in [0, 1]. ')\n self.person_det_score_thr = person_det_score_thr\n self.num_classes = num_classes\n self.suffix = suffix\n self.num_max_proposals = num_max_proposals", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:33-62" + }, + "4873": { + "file_id": 420, + "content": "This code is initializing a class with various parameters for the AvaDataset. It sets default values and performs checks on input values, such as ensuring 'person_det_score_thr' falls within 0 to 1 range. 
The code also initializes instance variables, including custom classes, exclude file path, label file path, proposal file path, and more.", + "type": "comment" + }, + "4874": { + "file_id": 420, + "content": " self.timestamp_start = timestamp_start\n self.timestamp_end = timestamp_end\n super().__init__(\n file_path,\n pipeline,\n data_prefix,\n test_mode,\n )\n if self.proposal_file is not None:\n self.proposals = self._load(self.proposal_file)\n else:\n self.proposals = None\n if not test_mode:\n valid_indexes = self.filter_exclude_file()\n self.info = self.info = [self.info[i] for i in valid_indexes]\n def _load(self, path):\n f = open(path, 'rb')\n res = pickle.load(f)\n f.close()\n return res\n def parse_img_record(self, img_records):\n bboxes, labels, entity_ids = [], [], []\n while len(img_records) > 0:\n img_record = img_records[0]\n num_img_records = len(img_records)\n selected_records = list(\n filter(\n lambda x: np.array_equal(x['entity_box'], img_record[\n 'entity_box']), img_records))", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:63-93" + }, + "4875": { + "file_id": 420, + "content": "The code snippet initializes class attributes and checks for proposal file. If the proposal file exists, it loads the proposals; otherwise, it sets them as None. It then filters out invalid indexes if not in test mode. The code also includes a method to load data from a given path using pickle and close the file afterward. Another method parses img_records by extracting bounding boxes, labels, and entity IDs.", + "type": "comment" + }, + "4876": { + "file_id": 420, + "content": " num_selected_records = len(selected_records)\n img_records = list(\n filter(\n lambda x: not np.array_equal(x['entity_box'], img_record[\n 'entity_box']), img_records))\n assert len(img_records) + num_selected_records == num_img_records\n bboxes.append(img_record['entity_box'])\n valid_labels = np.array([\n selected_record['label'] for selected_record in selected_records\n ])\n label = np.zeros(self.num_classes, dtype=np.float32)\n label[valid_labels] = 1.\n labels.append(label)\n entity_ids.append(img_record['entity_id'])\n bboxes = np.stack(bboxes)\n labels = np.stack(labels)\n entity_ids = np.stack(entity_ids)\n return bboxes, labels, entity_ids\n def filter_exclude_file(self):\n valid_indexes = []\n if self.exclude_file is None:\n valid_indexes = list(range(len(self.info)))\n else:\n exclude_video_infos = [", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:94-122" + }, + "4877": { + "file_id": 420, + "content": "This code is filtering out specific records from the dataset. It checks if the entity box of each record matches with a given img_record's entity box, excluding them if they do. If there are no exclude file information, it includes all the records in valid_indexes. 
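`parse_img_record` collapses every record that shares the same entity box into one multi-hot label vector over `num_classes`. A tiny sketch of that label construction (hypothetical helper, using the dataset's default of 81 classes):

```python
import numpy as np

def one_hot_labels(label_ids, num_classes=81):
    """Turn all action ids attached to one entity box into a multi-hot float vector."""
    vec = np.zeros(num_classes, dtype=np.float32)
    vec[np.asarray(label_ids, dtype=np.int64)] = 1.0
    return vec

print(one_hot_labels([3, 7, 7]).nonzero()[0])  # [3 7]
```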
Finally, it stacks and returns bboxes, labels, and entity_ids for further processing.", + "type": "comment" + }, + "4878": { + "file_id": 420, + "content": " x.strip().split(',') for x in open(self.exclude_file)\n ]\n for i, video_info in enumerate(self.info):\n valid_indexes.append(i)\n for video_id, timestamp in exclude_video_infos:\n if (video_info['video_id'] == video_id\n and video_info['timestamp'] == int(timestamp)):\n valid_indexes.pop()\n break\n return valid_indexes\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n records_dict_by_img = defaultdict(list)\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split(',')\n video_id = line_split[0]\n timestamp = int(line_split[1])\n img_key = f'{video_id},{timestamp:04d}'\n entity_box = np.array(list(map(float, line_split[2:6])))\n label = int(line_split[6])\n entity_id = int(line_split[7])", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:123-148" + }, + "4879": { + "file_id": 420, + "content": "The code reads a file, splits each line into video ID, timestamp, and other data. It then checks for any exclusion videos based on the ID and timestamp. If found, it removes that index from the valid_indexes list. Finally, it returns the updated valid_indexes list. The load_file method reads the file, extracts information including video ID, timestamp, entity box, label, and entity ID for each line.", + "type": "comment" + }, + "4880": { + "file_id": 420, + "content": " shot_info = (0, (self.timestamp_end - self.timestamp_start) *\n self._FPS)\n video_info = dict(video_id=video_id,\n timestamp=timestamp,\n entity_box=entity_box,\n label=label,\n entity_id=entity_id,\n shot_info=shot_info)\n records_dict_by_img[img_key].append(video_info)\n for img_key in records_dict_by_img:\n video_id, timestamp = img_key.split(',')\n bboxes, labels, entity_ids = self.parse_img_record(\n records_dict_by_img[img_key])\n ann = dict(gt_bboxes=bboxes,\n gt_labels=labels,\n entity_ids=entity_ids)\n frame_dir = video_id\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n video_info = dict(frame_dir=frame_dir,", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:149-170" + }, + "4881": { + "file_id": 420, + "content": "The code initializes `shot_info` based on the timestamp range and FPS, then creates a `video_info` dictionary containing various video details. It appends this information to the `records_dict_by_img` for each `img_key`. Next, it extracts video ID and timestamp from `img_key`, calls `parse_img_record()`, and stores the resulting bounding boxes, labels, and entity IDs in an `ann` dictionary. 
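Each line of the AVA index csv carries `video_id, timestamp, x1, y1, x2, y2, action_label, entity_id`. The sketch below shows how one such row maps to the per-image key and record dict built in `load_file`; `parse_ava_line` is an illustrative helper, not part of the class.

```python
import numpy as np

def parse_ava_line(line):
    """Split one AVA csv row into an image key and a record dict."""
    parts = line.strip().split(',')
    video_id, timestamp = parts[0], int(parts[1])
    record = dict(video_id=video_id,
                  timestamp=timestamp,
                  entity_box=np.array(list(map(float, parts[2:6])), dtype=np.float32),
                  label=int(parts[6]),
                  entity_id=int(parts[7]))
    return f"{video_id},{timestamp:04d}", record

img_key, rec = parse_ava_line("vid001,0902,0.1,0.2,0.8,0.9,12,5")
print(img_key, rec["label"])  # vid001,0902 12
```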
Finally, it sets the frame directory path and adds a new `video_info` dictionary for each video.", + "type": "comment" + }, + "4882": { + "file_id": 420, + "content": " video_id=video_id,\n timestamp=int(timestamp),\n img_key=img_key,\n shot_info=shot_info,\n fps=self._FPS,\n ann=ann)\n info.append(video_info)\n return info\n def prepare_train(self, idx):\n results = copy.deepcopy(self.info[idx])\n img_key = results['img_key']\n results['suffix'] = self.suffix\n results['timestamp_start'] = self.timestamp_start\n results['timestamp_end'] = self.timestamp_end\n if self.proposals is not None:\n if img_key not in self.proposals:\n results['proposals'] = np.array([[0, 0, 1, 1]])\n results['scores'] = np.array([1])\n else:\n proposals = self.proposals[img_key]\n assert proposals.shape[-1] in [4, 5]\n if proposals.shape[-1] == 5:\n thr = min(self.person_det_score_thr, max(proposals[:, 4]))", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:171-197" + }, + "4883": { + "file_id": 420, + "content": "The code initializes a video information object with the provided parameters, including the video ID, timestamp, image key, shot info, FPS, and annotations. It then appends this object to a list of video information. The prepare_train method takes an index, creates a copy of the corresponding video information from the list, adds suffix and timestamp information if applicable, and populates proposals with default values if not present in self.proposals.", + "type": "comment" + }, + "4884": { + "file_id": 420, + "content": " positive_inds = (proposals[:, 4] >= thr)\n proposals = proposals[positive_inds]\n proposals = proposals[:self.num_max_proposals]\n results['proposals'] = proposals[:, :4]\n results['scores'] = proposals[:, 4]\n else:\n proposals = proposals[:self.num_max_proposals]\n results['proposals'] = proposals\n ann = results.pop('ann')\n results['gt_bboxes'] = ann['gt_bboxes']\n results['gt_labels'] = ann['gt_labels']\n results['entity_ids'] = ann['entity_ids']\n #ret = self.pipeline(results, \"\")\n ret = self.pipeline(results)\n #padding for dataloader\n len_proposals = ret['proposals'].shape[0]\n len_gt_bboxes = ret['gt_bboxes'].shape[0]\n len_gt_labels = ret['gt_labels'].shape[0]\n len_scores = ret['scores'].shape[0]\n len_entity_ids = ret['entity_ids'].shape[0]\n padding_len = 128\n ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len)", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:198-221" + }, + "4885": { + "file_id": 420, + "content": "This code is filtering and padding proposals and annotations for a dataset. It selects positive proposals based on a threshold, limits the number of proposals to the maximum allowed, and assigns the results to different categories. If there are no positive proposals, it simply limits the number and assigns them. After that, it retrieves ground truth bounding boxes, labels, and entity IDs from the 'ann' dictionary. 
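The proposal handling in `prepare_train` has two steps: keep detections whose score clears `person_det_score_thr` (capped at the best available score), then zero-pad to a fixed row count so the dataloader can batch them. A sketch under the assumption that the kept rows never exceed the pad length:

```python
import numpy as np

def filter_and_pad(proposals, score_thr=0.9, max_num=1000, pad_len=128):
    """Threshold 5-column [x1, y1, x2, y2, score] proposals, truncate, zero-pad boxes to pad_len rows."""
    thr = min(score_thr, proposals[:, 4].max())
    kept = proposals[proposals[:, 4] >= thr][:max_num]
    boxes, scores = kept[:, :4], kept[:, 4]
    pad = np.zeros((pad_len - boxes.shape[0], 4), dtype=np.float32)
    return np.concatenate([boxes, pad], axis=0), scores

props = np.array([[0, 0, 1, 1, 0.95],
                  [0, 0, 1, 1, 0.30]], dtype=np.float32)
boxes, scores = filter_and_pad(props)
print(boxes.shape, scores)  # (128, 4) [0.95]
```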
Finally, the code pads the proposals, scores, and other elements with zeros to reach a fixed length of 128 using a custom padding function.", + "type": "comment" + }, + "4886": { + "file_id": 420, + "content": " ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len)\n ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], padding_len)\n ret['scores'] = self.my_padding_1d(ret['scores'], padding_len)\n ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len)\n return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[\n 'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[\n 'entity_ids'], np.array(\n ret['img_shape'], dtype=int\n ), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids\n def my_padding_2d(self, feat, max_len):\n feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]),\n dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n return feat_pad\n def my_padding_1d(self, feat, max_len):\n feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:222-240" + }, + "4887": { + "file_id": 420, + "content": "This code snippet defines a class with methods for padding 2D and 1D features. The 'my_padding_2d' method takes a feature matrix and pads it with zeros to the maximum length specified, while the 'my_padding_1d' method does the same but for 1D features. These methods are then called in another function to pad various feature matrices before returning them along with other variables.", + "type": "comment" + }, + "4888": { + "file_id": 420, + "content": " return feat_pad\n def prepare_test(self, idx):\n return self.prepare_train(idx)\n def evaluate(self, results):\n return ava_evaluate_results(self.info, len(self), results,\n self.custom_classes, self.label_file,\n self.file_path, self.exclude_file)", + "type": "code", + "location": "/paddlevideo/loader/dataset/ava_dataset.py:241-249" + }, + "4889": { + "file_id": 420, + "content": "The code defines three functions: 'prepare_train', 'prepare_test', and 'evaluate'. The 'prepare_train' function is used to prepare training data given an index, while the 'prepare_test' function returns the same as 'prepare_train'. The 'evaluate' function evaluates the results using 'ava_evaluate_results' by passing various arguments.", + "type": "comment" + }, + "4890": { + "file_id": 421, + "content": "/paddlevideo/loader/dataset/base.py", + "type": "filepath" + }, + "4891": { + "file_id": 421, + "content": "This class defines the BaseDataset for PaddlePaddle, with methods for loading data, preparing training and testing sets, and retrieving samples. It supports list format results due to limitations in Paddle.io.DataLoader.", + "type": "summary" + }, + "4892": { + "file_id": 421, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport numpy as np\nfrom abc import ABC, abstractmethod\nimport paddle\nfrom paddle.io import Dataset\nclass BaseDataset(Dataset, ABC):\n \"\"\"Base class for datasets\n All datasets should subclass it.\n All subclass should overwrite:\n - Method: `load_file`, load info from index file.\n - Method: `prepare_train`, providing train data.\n - Method: `prepare_test`, providing test data.", + "type": "code", + "location": "/paddlevideo/loader/dataset/base.py:1-32" + }, + "4893": { + "file_id": 421, + "content": "This code is a Python class definition for the BaseDataset, which serves as the base class for all dataset types in PaddlePaddle. It requires subclasses to define load_file method for loading info from index files and provide train and test data using prepare_train and prepare_test methods respectively.", + "type": "comment" + }, + "4894": { + "file_id": 421, + "content": " Args:\n file_path (str): index file path.\n pipeline (Sequence XXX)\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): whether to build test dataset. Default: False.\n \"\"\"\n def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):\n super().__init__()\n self.file_path = file_path\n self.data_prefix = osp.realpath(data_prefix) if \\\n data_prefix is not None and osp.isdir(data_prefix) else data_prefix\n self.test_mode = test_mode\n self.pipeline = pipeline\n self.info = self.load_file()\n @abstractmethod\n def load_file(self):\n \"\"\"load the video information from the index file path.\"\"\"\n pass\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)", + "type": "code", + "location": "/paddlevideo/loader/dataset/base.py:34-59" + }, + "4895": { + "file_id": 421, + "content": "This code initializes a base dataset class with file path, pipeline, data prefix, and test mode as arguments. 
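BaseDataset is an abstract `paddle.io.Dataset`: subclasses supply `load_file`, and `__getitem__` routes to train or test preparation depending on `test_mode`. The standalone sketch below reproduces only that dispatch shape (no Paddle dependency) with hypothetical class names.

```python
from abc import ABC, abstractmethod

class SketchBaseDataset(ABC):
    """Stand-in for the BaseDataset dispatch pattern."""
    def __init__(self, test_mode=False):
        self.test_mode = test_mode
        self.info = self.load_file()

    @abstractmethod
    def load_file(self):
        """Subclasses return a list of sample dicts."""

    def prepare_train(self, idx):
        return self.info[idx]

    def prepare_test(self, idx):
        return self.info[idx]

    def __len__(self):
        return len(self.info)

    def __getitem__(self, idx):
        return self.prepare_test(idx) if self.test_mode else self.prepare_train(idx)

class TinyDataset(SketchBaseDataset):
    def load_file(self):
        return [{"labels": i} for i in range(3)]

print(len(TinyDataset()), TinyDataset(test_mode=True)[0])  # 3 {'labels': 0}
```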
It loads video information from the index file using load_file() method, supports training and validation, but cannot handle dict type results due to Paddle.io limitations.", + "type": "comment" + }, + "4896": { + "file_id": 421, + "content": " #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def __len__(self):\n \"\"\"get the size of the dataset.\"\"\"\n return len(self.info)\n def __getitem__(self, idx):\n \"\"\" Get the sample for either training or testing given index\"\"\"\n if self.test_mode:\n return self.prepare_test(idx)\n else:\n return self.prepare_train(idx)", + "type": "code", + "location": "/paddlevideo/loader/dataset/base.py:60-80" + }, + "4897": { + "file_id": 421, + "content": "This code defines a dataset class with methods for preparing data for training and testing. The `prepare_train` method returns the input images and labels for training, while the `prepare_test` method does the same for testing. The `__len__` method returns the size of the dataset, and the `__getitem__` method retrieves either a training or testing sample based on the mode. Due to an issue with Paddle.io.DataLoader not supporting dict type retval, the results are converted to list format.", + "type": "comment" + }, + "4898": { + "file_id": 422, + "content": "/paddlevideo/loader/dataset/bmn_dataset.py", + "type": "filepath" + }, + "4899": { + "file_id": 422, + "content": "The BMNDataset class handles video datasets for action localization, initializing with file path, pipeline, and subset information. It loads data, sorts by name, and returns features, ground truth IOU map, and start frame indices for training. The function also prepares test data given an index by processing through the pipeline and returning selected results.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/49.json b/docs/data/49.json new file mode 100644 index 000000000..0dde73320 --- /dev/null +++ b/docs/data/49.json @@ -0,0 +1,544 @@ +{ + "4900": { + "file_id": 422, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport json\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass BMNDataset(BaseDataset):\n \"\"\"Video dataset for action localization.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n subset,\n **kwargs,\n ):\n self.subset = subset\n super().__init__(file_path, pipeline, **kwargs)", + "type": "code", + "location": "/paddlevideo/loader/dataset/bmn_dataset.py:1-36" + }, + "4901": { + "file_id": 422, + "content": "This code snippet defines the BMNDataset class for video datasets used in action localization. It imports necessary modules, registers the class with the DATASETS registry, and initializes the dataset with file path, pipeline, and subset information. Logger is also defined for logging purposes.", + "type": "comment" + }, + "4902": { + "file_id": 422, + "content": " def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n annos = json.load(open(self.file_path))\n for video_name in annos.keys():\n video_subset = annos[video_name][\"subset\"]\n if self.subset in video_subset:\n info.append(\n dict(\n video_name=video_name,\n video_info=annos[video_name],\n ))\n #sort by video_name\n sort_f = lambda elem: elem['video_name']\n info.sort(key=sort_f)\n #add video_idx to info\n for idx, elem in enumerate(info):\n info[idx]['video_idx'] = idx\n logger.info(\"{} subset video numbers: {}\".format(\n self.subset, len(info)))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID: Prepare data for training/valid given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n return results['video_feat'], results['gt_iou_map'], results['gt_start'],\\", + "type": "code", + "location": "/paddlevideo/loader/dataset/bmn_dataset.py:38-64" + }, + "4903": { + "file_id": 422, + "content": "This function is loading an index file to get video information and then sorts the data by video name. 
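BMNDataset's `load_file` filters the annotation JSON by subset, sorts entries by video name, and attaches a stable `video_idx`. A compact restatement of that flow, assuming the annotation layout used in the source (`{video_name: {"subset": ...}}`); `load_bmn_index` is an illustrative name.

```python
import json

def load_bmn_index(anno_path, subset="train"):
    """Filter by subset, sort by video name, then number the entries."""
    with open(anno_path) as f:
        annos = json.load(f)
    info = [dict(video_name=name, video_info=meta)
            for name, meta in annos.items() if subset in meta["subset"]]
    info.sort(key=lambda item: item["video_name"])
    for idx, item in enumerate(info):
        item["video_idx"] = idx
    return info
```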
It also adds a video_idx attribute to each element in the list and returns the video features, ground truth IOU map, and start frame indices for training purposes.", + "type": "comment" + }, + "4904": { + "file_id": 422, + "content": " results['gt_end']\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n return results['video_feat'], results['gt_iou_map'], results['gt_start'], \\\n results['gt_end'], results['video_idx']", + "type": "code", + "location": "/paddlevideo/loader/dataset/bmn_dataset.py:65-72" + }, + "4905": { + "file_id": 422, + "content": "This function prepares test data given an index by copying the dataset info, processing it through the pipeline, and returning selected results (video_feat, gt_iou_map, gt_start, gt_end, video_idx).", + "type": "comment" + }, + "4906": { + "file_id": 423, + "content": "/paddlevideo/loader/dataset/davis_dataset.py", + "type": "filepath" + }, + "4907": { + "file_id": 423, + "content": "The Python class VOS_Test extends BaseDataset for video object segmentation tasks and supports pipeline mode, color type options, and resizing. The Davis 2017 dataset is initialized in PaddleVideo and returns a sequence dataset with images, labels, and fixed resolution of 480 pixels.", + "type": "summary" + }, + "4908": { + "file_id": 423, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport shutil\nfrom PIL import Image\nimport cv2\nfrom paddle.io import Dataset\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\nclass VOS_Test(Dataset):\n \"\"\"process frames in each video\n \"\"\"\n def __init__(self,\n image_root,\n label_root,", + "type": "code", + "location": "/paddlevideo/loader/dataset/davis_dataset.py:1-37" + }, + "4909": { + "file_id": 423, + "content": "This code snippet is from PaddleVideo's davis_dataset.py file and it appears to be a Python class named VOS_Test, which extends the BaseDataset class from the same module. The class is used for processing frames in each video of a dataset. It takes image_root and label_root as input parameters for accessing the required data. The logger is imported from paddle.utils to log any relevant information during execution. 
This dataset seems to be designed for video object segmentation (VOS) tasks, commonly used in computer vision applications.", + "type": "comment" + }, + "4910": { + "file_id": 423, + "content": " seq_name,\n images,\n labels,\n pipeline=None,\n rgb=False,\n resolution=None):\n self.image_root = image_root\n self.label_root = label_root\n self.seq_name = seq_name\n self.images = images # image file list\n self.labels = labels\n self.obj_num = 1\n self.num_frame = len(self.images)\n self.pipeline = pipeline\n self.rgb = rgb\n self.resolution = resolution\n self.obj_nums = []\n temp_obj_num = 0\n for img_name in self.images:\n self.obj_nums.append(temp_obj_num)\n current_label_name = img_name.split('.')[0] + '.png'\n if current_label_name in self.labels:\n current_label = self.read_label(current_label_name)\n if temp_obj_num < np.unique(\n current_label)[-1]: #get object number from label_id\n temp_obj_num = np.unique(current_label)[-1]\n def __len__(self):", + "type": "code", + "location": "/paddlevideo/loader/dataset/davis_dataset.py:38-66" + }, + "4911": { + "file_id": 423, + "content": "This code initializes the dataset with image and label file lists, image root and label root paths. It sets object number, total frames, pipeline mode, color type, resolution, creates an object numbers list, and assigns object numbers from labels.", + "type": "comment" + }, + "4912": { + "file_id": 423, + "content": " return len(self.images)\n def read_image(self, idx):\n img_name = self.images[idx]\n img_path = os.path.join(self.image_root, self.seq_name, img_name)\n img = cv2.imread(img_path)\n img = np.array(img, dtype=np.float32)\n if self.rgb:\n img = img[:, :, [2, 1, 0]]\n return img\n def read_label(self, label_name):\n label_path = os.path.join(self.label_root, self.seq_name, label_name)\n label = Image.open(label_path)\n label = np.array(label, dtype=np.uint8)\n return label\n def __getitem__(self, idx):\n img_name = self.images[idx]\n current_img = self.read_image(idx)\n current_img = np.array(current_img)\n height, width, channels = current_img.shape\n if self.resolution is not None:\n width = int(np.ceil(float(width) * self.resolution / float(height)))\n height = int(self.resolution)\n current_label_name = img_name.split('.')[0] + '.png'\n obj_num = self.obj_nums[idx]", + "type": "code", + "location": "/paddlevideo/loader/dataset/davis_dataset.py:67-94" + }, + "4913": { + "file_id": 423, + "content": "This code defines a class that loads data from the DAVIS dataset. It first returns the number of images in the dataset, then reads an image at a given index, and finally reads a corresponding label for the image. 
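When a target resolution is set, `__getitem__` rescales each frame so the output height equals the resolution while the width keeps the aspect ratio (rounded up). That arithmetic in isolation, as a small illustrative helper:

```python
import numpy as np

def target_size(height, width, resolution=480):
    """New (height, width) with height pinned to `resolution` and width scaled by the same ratio."""
    new_width = int(np.ceil(float(width) * resolution / float(height)))
    return resolution, new_width

print(target_size(720, 1280))  # (480, 854)
```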
The class also allows resizing the images to a specified resolution if needed.", + "type": "comment" + }, + "4914": { + "file_id": 423, + "content": " if current_label_name in self.labels:\n current_label = self.read_label(current_label_name)\n current_label = np.array(current_label)\n sample = {\n 'current_img': current_img,\n 'current_label': current_label\n }\n else:\n sample = {\n 'current_img': current_img\n } #only the first frame contains label\n sample['meta'] = {\n 'seq_name': self.seq_name,\n 'frame_num': self.num_frame,\n 'obj_num': obj_num,\n 'current_name': img_name,\n 'height': height,\n 'width': width,\n 'flip': False\n }\n if self.pipeline is not None:\n sample = self.pipeline(sample)\n for s in sample:\n s['current_img'] = np.array(s['current_img'])\n if 'current_label' in s.keys():\n s['current_label'] = s['current_label']\n return sample\n@DATASETS.register()\nclass DavisDataset(BaseDataset):", + "type": "code", + "location": "/paddlevideo/loader/dataset/davis_dataset.py:96-127" + }, + "4915": { + "file_id": 423, + "content": "The function generates a sample for a dataset, including image and label data. It checks if the current_label_name is in labels, reads the label if present, creates a sample dictionary, adds metadata to the sample, applies a pipeline if one is specified, and converts 'current_img' to numpy array format.", + "type": "comment" + }, + "4916": { + "file_id": 423, + "content": " \"\"\"Davis 2017 dataset.\n \"\"\"\n def __init__(\n self,\n file_path,\n result_root,\n pipeline,\n data_prefix=None,\n test_mode=False,\n year=2017,\n rgb=False,\n resolution='480p',\n ):\n self.rgb = rgb\n self.result_root = result_root\n self.resolution = resolution\n self.year = year\n self.spt = 'val' if test_mode else 'train'\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n self.image_root = os.path.join(self.file_path, 'JPEGImages',\n self.resolution)\n self.label_root = os.path.join(self.file_path, 'Annotations',\n self.resolution)\n seq_names = []\n with open(\n os.path.join(self.file_path, 'ImageSets', str(self.year),\n self.spt + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))", + "type": "code", + "location": "/paddlevideo/loader/dataset/davis_dataset.py:128-158" + }, + "4917": { + "file_id": 423, + "content": "The code represents the initialization and file loading process for the Davis 2017 dataset in PaddleVideo. The constructor takes various parameters like file path, result root, pipeline, data prefix, test mode, year, rgb, and resolution to initialize the class attributes. 
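DavisDataset locates its split list under `ImageSets/<year>/<split>.txt` in the DAVIS root ('val' in test mode, 'train' otherwise) and de-duplicates the sequence names. A hedged sketch of that lookup with an illustrative function name:

```python
import os

def read_davis_split(davis_root, year=2017, test_mode=False):
    """Return the sorted, de-duplicated sequence names for the chosen split."""
    split = "val" if test_mode else "train"
    split_file = os.path.join(davis_root, "ImageSets", str(year), split + ".txt")
    with open(split_file) as f:
        return sorted({line.strip() for line in f if line.strip()})
```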
The load_file() method sets image and label roots based on the provided resolution and reads the sequence names from a specified file.", + "type": "comment" + }, + "4918": { + "file_id": 423, + "content": " seq_names.extend(seqs_tmp)\n self.info = list(np.unique(seq_names))\n return self.info\n def prepare_test(self, idx):\n seq_name = self.info[idx] #video name\n images = list(\n np.sort(os.listdir(os.path.join(self.image_root, seq_name))))\n labels = [images[0].replace('jpg', 'png')] #we have first frame target\n # copy first frame target\n if not os.path.isfile(\n os.path.join(self.result_root, seq_name, labels[0])):\n if not os.path.exists(os.path.join(self.result_root, seq_name)):\n os.makedirs(os.path.join(self.result_root, seq_name))\n source_label_path = os.path.join(self.label_root, seq_name,\n labels[0])\n result_label_path = os.path.join(self.result_root, seq_name,\n labels[0])\n shutil.copy(source_label_path, result_label_path)\n seq_dataset = VOS_Test(self.image_root,\n self.label_root,", + "type": "code", + "location": "/paddlevideo/loader/dataset/davis_dataset.py:159-182" + }, + "4919": { + "file_id": 423, + "content": "This function prepares a test dataset for the VOS task. It retrieves the video name from the info list, then lists all image files in the corresponding directory and adds the first frame as the target label. If the target label does not exist in the result directory, it creates the necessary directories and copies the target label file to the correct location.", + "type": "comment" + }, + "4920": { + "file_id": 423, + "content": " seq_name,\n images,\n labels,\n self.pipeline,\n rgb=self.rgb,\n resolution=480)\n return seq_dataset", + "type": "code", + "location": "/paddlevideo/loader/dataset/davis_dataset.py:183-189" + }, + "4921": { + "file_id": 423, + "content": "This code is returning a sequence dataset named seq_name with associated images and labels, processed by the pipeline function specified, potentially using RGB format if self.rgb is True, and a fixed resolution of 480 pixels.", + "type": "comment" + }, + "4922": { + "file_id": 424, + "content": "/paddlevideo/loader/dataset/feature.py", + "type": "filepath" + }, + "4923": { + "file_id": 424, + "content": "The Python class FeatureDataset, part of the PaddleVideo library, initializes attributes and provides methods for action recognition tasks. The code also includes a prepare_test function to prepare data for testing by applying a pipeline and checking 'iou_norm' results.", + "type": "summary" + }, + "4924": { + "file_id": 424, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os.path as osp\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\n@DATASETS.register()\nclass FeatureDataset(BaseDataset):\n \"\"\"Feature dataset for action recognition\n Example:(TODO)\n Args:(TODO)\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n data_prefix=None,\n test_mode=False,\n suffix=None,\n ):\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)", + "type": "code", + "location": "/paddlevideo/loader/dataset/feature.py:1-36" + }, + "4925": { + "file_id": 424, + "content": "This code is a Python class named FeatureDataset, which is a subclass of BaseDataset. It appears to be part of the PaddleVideo library and is used for action recognition tasks. The class has an __init__ method that initializes various attributes such as file_path, pipeline, data_prefix, test_mode, and suffix. The class is registered in the DATASETS registry.", + "type": "comment" + }, + "4926": { + "file_id": 424, + "content": " def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n filename = line.strip().split()[0]\n if self.data_prefix is not None:\n filename = osp.join(self.data_prefix, filename)\n if self.suffix is not None:\n filename = filename + self.suffix\n info.append(dict(filename=filename))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n if 'iou_norm' in results:\n return results['rgb_data'], results['rgb_len'], results[\n 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results[\n 'labels'], results['iou_norm']\n else:\n return results['rgb_data'], results['rgb_len'], results[", + "type": "code", + "location": "/paddlevideo/loader/dataset/feature.py:38-63" + }, + "4927": { + "file_id": 424, + "content": "The code defines two methods. The `load_file` method reads an index file and retrieves video information by parsing each line, stripping whitespace, splitting the filename, and optionally appending a specified suffix or joining with a data prefix. It returns a list of dictionaries containing the filenames. The `prepare_train` method takes an index and prepares training/validation data using a specified pipeline function. If 'iou_norm' is present in the results, it returns multiple data types (e.g., rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask) along with labels.", + "type": "comment" + }, + "4928": { + "file_id": 424, + "content": " 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results['labels']\n def prepare_test(self, idx):\n \"\"\"TEST. 
Prepare the data for testing given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n if 'iou_norm' in results:\n return results['rgb_data'], results['rgb_len'], results[\n 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results[\n 'labels'], results['iou_norm']\n else:\n return results['rgb_data'], results['rgb_len'], results[\n 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results['labels']", + "type": "code", + "location": "/paddlevideo/loader/dataset/feature.py:64-80" + }, + "4929": { + "file_id": 424, + "content": "The code defines a function prepare_test that prepares data for testing. It creates a deep copy of the dataset information at the given index, applies a pipeline to it and then checks if 'iou_norm' is in the results. If it is, it returns 7 elements including 'rgb_data', 'audio_data', 'labels' and 'iou_norm'. Otherwise, it returns 6 elements without 'iou_norm'.", + "type": "comment" + }, + "4930": { + "file_id": 425, + "content": "/paddlevideo/loader/dataset/frame.py", + "type": "filepath" + }, + "4931": { + "file_id": 425, + "content": "The PaddleVideo library's FrameDataset and FrameDataset_Sport classes load, transform, and process video data with error handling for missing or corrupted files under Apache License 2.0.", + "type": "summary" + }, + "4932": { + "file_id": 425, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass FrameDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The indecx file ", + "type": "code", + "location": "/paddlevideo/loader/dataset/frame.py:1-31" + }, + "4933": { + "file_id": 425, + "content": "This code is a Python class for a FrameDataset, which loads raw frames from frame files and applies specified transform operations. It is part of the PaddleVideo library and follows Apache License 2.0. The dataset index file is used to organize the loaded data. This class inherits from BaseDataset, suggesting it has some common functionalities.", + "type": "comment" + }, + "4934": { + "file_id": 425, + "content": "is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. 
Default: False.\n suffix (str): suffix of file. Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"", + "type": "code", + "location": "/paddlevideo/loader/dataset/frame.py:31-61" + }, + "4935": { + "file_id": 425, + "content": "This code initializes a class for loading video information from an index file. The index file contains the directory of frames, total frames, and label for each video. It supports pipeline and data_prefix, and has options for test mode and suffix format.", + "type": "comment" + }, + "4936": { + "file_id": 425, + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(frame_dir=frame_dir,\n suffix=self.suffix,\n frames_len=frames_len,\n labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(", + "type": "code", + "location": "/paddlevideo/loader/dataset/frame.py:62-86" + }, + "4937": { + "file_id": 425, + "content": "This code reads data from a file and returns information related to frames, such as the frame directory, suffix, number of frames, and labels. It also includes a try-catch block that attempts to prepare the frames for training or validation multiple times if an exception occurs while reading the frames files.", + "type": "comment" + }, + "4938": { + "file_id": 425, + "content": " \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])", + "type": "code", + "location": "/paddlevideo/loader/dataset/frame.py:87-108" + }, + "4939": { + "file_id": 425, + "content": "The code handles exceptions for loading missing frames in the dataset. It tries to load frames multiple times within a specified range and logs errors when needed. 
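To make the index-file format above concrete, here is a minimal parser for lines of the form `<frame_dir> <total_frames> <label>` separated by whitespace. It mirrors the behaviour described, but it is an independent sketch rather than the library's own `load_file`; the file name in the usage comment is illustrative.

```python
import os.path as osp

def parse_rawframe_index(index_path, data_prefix=None, suffix='img_{:05}.jpg'):
    """Each line: '<frame_dir> <total_frames> <label>' separated by whitespace."""
    info = []
    with open(index_path, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            frame_dir, frames_len, label = line.split()
            if data_prefix is not None:
                frame_dir = osp.join(data_prefix, frame_dir)
            info.append(dict(frame_dir=frame_dir,
                             suffix=suffix,
                             frames_len=int(frames_len),
                             labels=int(label)))
    return info

# e.g. parse_rawframe_index('train_list.txt', data_prefix='/data/rawframes')
```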
If an exception occurs, it randomly selects another index and continues the process until successful.", + "type": "comment" + }, + "4940": { + "file_id": 425, + "content": "@DATASETS.register()\nclass FrameDataset_Sport(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates\n a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:", + "type": "code", + "location": "/paddlevideo/loader/dataset/frame.py:111-136" + }, + "4941": { + "file_id": 425, + "content": "The code defines a FrameDataset_Sport class for loading raw videos and applying specified transforms. It uses an index file containing video file paths and labels, and takes arguments for file path, data transforms pipeline, retry attempts, and other BaseDataset kwargs. The load_file function reads the index file to obtain video information.", + "type": "comment" + }, + "4942": { + "file_id": 425, + "content": " for line in fin:\n line_split = line.strip().split()\n frame_dir = line_split[0]\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(dict(frame_dir=frame_dir, suffix=self.suffix))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)", + "type": "code", + "location": "/paddlevideo/loader/dataset/frame.py:137-158" + }, + "4943": { + "file_id": 425, + "content": "This code reads lines from a file, each representing a frame directory and associated information. It then splits the line into components and appends the frame directory and suffix to the info list. The prepare_train function attempts to process data for training or validation, handling exceptions by retrying up to a specified number of times before selecting another index at random.", + "type": "comment" + }, + "4944": { + "file_id": 425, + "content": " continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST. 
Prepare the data for test given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])", + "type": "code", + "location": "/paddlevideo/loader/dataset/frame.py:159-177" + }, + "4945": { + "file_id": 425, + "content": "The function `prepare_test` is attempting to prepare data for testing by iterating through a certain number of retries in case of exceptions caused by corrupted video files. If an exception occurs, it logs the error and tries again with a different random index from the list of info. Once successful, it returns the images and labels as arrays.", + "type": "comment" + }, + "4946": { + "file_id": 426, + "content": "/paddlevideo/loader/dataset/ms_tcn_dataset.py", + "type": "filepath" + }, + "4947": { + "file_id": 426, + "content": "The code initializes a class for MS-TCN dataset, loads video features and labels for training or testing, and converts label data to integers using a dictionary mapping.", + "type": "summary" + }, + "4948": { + "file_id": 426, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass MSTCNDataset(BaseDataset):\n \"\"\"Video dataset for action segmentation.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n feature_path,\n gt_path,\n actions_map_file_path,\n **kwargs,", + "type": "code", + "location": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:1-38" + }, + "4949": { + "file_id": 426, + "content": "Imports required modules and registers a class for the MS-TCN dataset, a video dataset for action segmentation. 
The class initializes with file paths and other parameters.", + "type": "comment" + }, + "4950": { + "file_id": 426, + "content": " ):\n super().__init__(file_path, pipeline, **kwargs)\n self.gt_path = gt_path\n self.actions_map_file_path = actions_map_file_path\n self.feature_path = feature_path\n # actions dict generate\n file_ptr = open(self.actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])\n self.num_classes = len(self.actions_dict.keys())\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n file_ptr = open(self.file_path, 'r')\n info = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID: Prepare data for training/valid given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)", + "type": "code", + "location": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:39-68" + }, + "4951": { + "file_id": 426, + "content": "This code initializes a class, likely for data loading in a video dataset. It takes file paths as parameters, reads an actions map file to create a dictionary of action classes and their corresponding labels. The class also has a method load_file() to read the index file, and a method prepare_train() to prepare training data given an index.", + "type": "comment" + }, + "4952": { + "file_id": 426, + "content": " #TODO: check path\n video_feat = np.load(feat_file_path)\n # load label\n target_file_path = os.path.join(self.gt_path, video_name)\n file_ptr = open(target_file_path, 'r')\n content = file_ptr.read().split('\\n')[:-1]\n classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64')\n for i in range(len(classes)):\n classes[i] = self.actions_dict[content[i]]\n # classes = classes * (-100)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_gt'] = copy.deepcopy(classes)\n results = self.pipeline(results)\n return results['video_feat'], results['video_gt']\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)\n #TODO: check path\n video_feat = np.load(feat_file_path)", + "type": "code", + "location": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:69-95" + }, + "4953": { + "file_id": 426, + "content": "This code is loading video features and labels from a dataset, likely for training or testing purposes. It first checks the path of the feature file, then loads both the video feature and label data from specified paths. The code converts the label data into integer format using a dictionary mapping and performs some potential preprocessing steps (not shown here). 
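A small sketch of the two steps just summarized, assuming (as in the quoted code) that the actions map file holds `"<id> <action_name>"` per line and the ground-truth file holds one action name per frame. File names and function names here are illustrative only.

```python
import numpy as np

def load_actions_dict(actions_map_file):
    """'<id> <action_name>' per line  ->  {action_name: id}."""
    with open(actions_map_file, 'r') as f:
        lines = [ln for ln in f.read().split('\n') if ln]
    return {ln.split()[1]: int(ln.split()[0]) for ln in lines}

def frame_labels_to_ids(gt_file, actions_dict, num_feat_frames):
    """One action name per frame; clip to the shorter of labels vs. features."""
    with open(gt_file, 'r') as f:
        names = [ln for ln in f.read().split('\n') if ln]
    length = min(num_feat_frames, len(names))
    classes = np.zeros(length, dtype='int64')
    for i in range(length):
        classes[i] = actions_dict[names[i]]
    return classes
```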
Finally, it returns the video feature and label data for further processing.", + "type": "comment" + }, + "4954": { + "file_id": 426, + "content": " # load label\n target_file_path = os.path.join(self.gt_path, video_name)\n file_ptr = open(target_file_path, 'r')\n content = file_ptr.read().split('\\n')[:-1]\n classes = np.zeros(min(np.shape(video_feat)[1], len(content)))\n for i in range(len(classes)):\n classes[i] = self.actions_dict[content[i]]\n # classes = classes * (-100)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_gt'] = copy.deepcopy(classes)\n results = self.pipeline(results)\n return results['video_feat'], results['video_gt']", + "type": "code", + "location": "/paddlevideo/loader/dataset/ms_tcn_dataset.py:97-110" + }, + "4955": { + "file_id": 426, + "content": "This function loads labels for a video dataset. It reads the label file, converts content to class numbers using actions_dict, assigns class values to classes array, scales the classes, and returns the feature and ground truth data for the video.", + "type": "comment" + }, + "4956": { + "file_id": 427, + "content": "/paddlevideo/loader/dataset/msrvtt.py", + "type": "filepath" + }, + "4957": { + "file_id": 427, + "content": "The Python script prepares MSRVTTDataset by importing libraries, creating a class, tokenizing captions, retrieving features, and preparing sequences for processing. It processes image data, performs array operations, pads, resizes, calculates features, and converts to float32 for training/testing.", + "type": "summary" + }, + "4958": { + "file_id": 427, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\ntry:\n import lmdb\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT.\"\n )\nimport pickle\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT.\"\n )", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:1-31" + }, + "4959": { + "file_id": 427, + "content": "The code is a Python script that imports various libraries and packages, checks for the availability of 'lmdb' library, and tries to import 'BertTokenizer' from 'paddlenlp'. 
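The optional-dependency pattern just described (warn instead of fail when `lmdb` or `paddlenlp` is missing) looks roughly like the following. This is a generic sketch of the guard, not the exact wording used in the file.

```python
# Soft imports: the module can still be imported for inspection even when
# the optional ActBERT dependencies are absent; use will fail later instead.
try:
    import lmdb
except ImportError as e:
    lmdb = None
    print(f"Warning! {e}, [lmdb] is required for this dataset.")

try:
    from paddlenlp.transformers import BertTokenizer
except ImportError as e:
    BertTokenizer = None
    print(f"Warning! {e}, [paddlenlp] is required for this dataset.")
```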
It also includes license information and copyright notice.", + "type": "comment" + }, + "4960": { + "file_id": 427, + "content": "from ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass MSRVTTDataset(BaseDataset):\n \"\"\"MSR-VTT dataset for text-video clip retrieval.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n features_path,\n bert_model=\"bert-base-uncased\",\n padding_index=0,\n max_seq_length=36,\n max_region_num=36,\n max_action_num=5,\n vision_feature_dim=2048,\n action_feature_dim=2048,\n spatials_dim=5,\n data_prefix=None,\n test_mode=False,\n ):\n self.features_path = features_path\n self.bert_model = bert_model\n self.padding_index = padding_index\n self.max_seq_length = max_seq_length\n self.max_region_num = max_region_num\n self._max_action_num = max_action_num\n self.vision_feature_dim = vision_feature_dim\n self.action_feature_dim = action_feature_dim\n self.spatials_dim = spatials_dim", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:32-67" + }, + "4961": { + "file_id": 427, + "content": "The code defines the `MSRVTTDataset` class for text-video clip retrieval from MSR-VTT dataset, registering it in the registry. It takes parameters such as file path, pipeline, and maximum sequence length for initializing the dataset, and provides attributes like bert model, padding index, and other dimensions for processing the dataset.", + "type": "comment" + }, + "4962": { + "file_id": 427, + "content": " self._tokenizer = BertTokenizer.from_pretrained(bert_model,\n do_lower_case=True)\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n self.tokenize()\n self.gen_feature()\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n with open(self.file_path) as fin:\n self.image_entries = []\n self.caption_entries = []\n for line in fin.readlines():\n line = line.strip()\n vid_id = line.split(',')[0]\n self.image_entries.append(vid_id)\n self.caption_entries.append({\n \"caption\": line.split(',')[1],\n \"vid_id\": vid_id\n })\n self.env = lmdb.open(self.features_path)\n def tokenize(self):\n for entry in self.caption_entries:\n tokens = []\n tokens.append(\"[CLS]\")\n for token in self._tokenizer.tokenize(entry[\"caption\"]):", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:68-93" + }, + "4963": { + "file_id": 427, + "content": "The code snippet initializes a BertTokenizer object, loads file containing video information, tokenizes each entry's caption using the initialized tokenizer.", + "type": "comment" + }, + "4964": { + "file_id": 427, + "content": " tokens.append(token)\n tokens.append(\"[SEP]\")\n tokens = self._tokenizer.convert_tokens_to_ids(tokens)\n segment_ids = [0] * len(tokens)\n input_mask = [1] * len(tokens)\n if len(tokens) < self.max_seq_length:\n padding = [self.padding_index\n ] * (self.max_seq_length - len(tokens))\n tokens = tokens + padding\n input_mask += padding\n segment_ids += padding\n entry[\"token\"] = np.array(tokens).astype('int64')\n entry[\"input_mask\"] = np.array(input_mask)\n entry[\"segment_ids\"] = np.array(segment_ids).astype('int64')\n def get_image_feature(self, video_id):\n video_id = str(video_id).encode()\n with self.env.begin(write=False) as txn:\n item = pickle.loads(txn.get(video_id))\n video_id = item[\"video_id\"]\n image_h = int(item[\"image_h\"])\n image_w = int(item[\"image_w\"])\n features = item[\"features\"].reshape(-1, 
self.vision_feature_dim)", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:94-120" + }, + "4965": { + "file_id": 427, + "content": "This code is part of a class that processes video data. It appends tokens to an entry, converts tokens to ids, creates segment and input masks, and pads the sequence if necessary. The \"get_image_feature\" function retrieves image features from a database for a given video ID.", + "type": "comment" + }, + "4966": { + "file_id": 427, + "content": " boxes = item[\"boxes\"].reshape(-1, 4)\n num_boxes = features.shape[0]\n g_feat = np.sum(features, axis=0) / num_boxes\n num_boxes = num_boxes + 1\n features = np.concatenate(\n [np.expand_dims(g_feat, axis=0), features], axis=0)\n action_features = item[\"action_features\"].reshape(\n -1, self.action_feature_dim)\n image_location = np.zeros((boxes.shape[0], self.spatials_dim),\n dtype=np.float32)\n image_location[:, :4] = boxes\n image_location[:,\n 4] = ((image_location[:, 3] - image_location[:, 1]) *\n (image_location[:, 2] - image_location[:, 0]) /\n (float(image_w) * float(image_h)))\n image_location[:, 0] = image_location[:, 0] / float(image_w)\n image_location[:, 1] = image_location[:, 1] / float(image_h)\n image_location[:, 2] = image_location[:, 2] / float(image_w)", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:121-142" + }, + "4967": { + "file_id": 427, + "content": "This code is resizing and calculating the image location for each box in a dataset. It also concatenates the average feature to the start of the features array, and handles reshaping action_features. The code uses numpy functions extensively for array operations.", + "type": "comment" + }, + "4968": { + "file_id": 427, + "content": " image_location[:, 3] = image_location[:, 3] / float(image_h)\n g_location = np.array([0, 0, 1, 1, 1])\n image_location = np.concatenate(\n [np.expand_dims(g_location, axis=0), image_location], axis=0)\n return features, num_boxes, image_location, action_features\n def gen_feature(self):\n num_inst = len(self.image_entries) #1000\n self.features_all = np.zeros(\n (num_inst, self.max_region_num, self.vision_feature_dim))\n self.action_features_all = np.zeros(\n (num_inst, self._max_action_num, self.action_feature_dim))\n self.spatials_all = np.zeros(\n (num_inst, self.max_region_num, self.spatials_dim))\n self.image_mask_all = np.zeros((num_inst, self.max_region_num))\n self.action_mask_all = np.zeros((num_inst, self._max_action_num))\n for i, image_id in enumerate(self.image_entries):\n features, num_boxes, boxes, action_features = self.get_image_feature(\n image_id)", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:143-163" + }, + "4969": { + "file_id": 427, + "content": "The code defines a function that returns features, number of boxes, image location, and action features after processing an input image. It also initializes arrays for all instances of features, action features, spatial locations, and masks. 
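The box post-processing summarized above (prepend a mean "global" region feature and turn each box into a 5-dim location of normalized coordinates plus relative area) can be sketched with plain numpy. This is a standalone illustration of the arithmetic, not the library function itself.

```python
import numpy as np

def spatial_features(boxes, features, image_w, image_h):
    """boxes: (N, 4) as [x1, y1, x2, y2]; features: (N, D) region features."""
    g_feat = features.mean(axis=0, keepdims=True)            # global feature
    features = np.concatenate([g_feat, features], axis=0)    # (N + 1, D)

    loc = np.zeros((boxes.shape[0], 5), dtype=np.float32)
    loc[:, :4] = boxes
    loc[:, 4] = ((boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
                 / (float(image_w) * float(image_h)))         # relative area
    loc[:, 0] /= float(image_w)
    loc[:, 1] /= float(image_h)
    loc[:, 2] /= float(image_w)
    loc[:, 3] /= float(image_h)

    g_loc = np.array([[0, 0, 1, 1, 1]], dtype=np.float32)     # whole-image box
    loc = np.concatenate([g_loc, loc], axis=0)
    return features, loc

# boxes = np.array([[10, 20, 110, 220]], dtype=np.float32)
# feats = np.random.rand(1, 2048).astype(np.float32)
# f, l = spatial_features(boxes, feats, image_w=640, image_h=480)
```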
The code then iterates over each image ID and calls another function to get the respective features, num_boxes, boxes, and action_features.", + "type": "comment" + }, + "4970": { + "file_id": 427, + "content": " mix_num_boxes = min(int(num_boxes), self.max_region_num)\n mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim))\n mix_features_pad = np.zeros(\n (self.max_region_num, self.vision_feature_dim))\n image_mask = [1] * (int(mix_num_boxes))\n while len(image_mask) < self.max_region_num:\n image_mask.append(0)\n action_mask = [1] * (self._max_action_num)\n while len(action_mask) < self._max_action_num:\n action_mask.append(0)\n mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]\n mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]\n self.features_all[i] = mix_features_pad\n x = action_features.shape[0]\n self.action_features_all[i][:x] = action_features[:]\n self.image_mask_all[i] = np.array(image_mask)\n self.action_mask_all[i] = np.array(action_mask)\n self.spatials_all[i] = mix_boxes_pad\n self.features_all = self.features_all.astype(\"float32\")", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:165-187" + }, + "4971": { + "file_id": 427, + "content": "The code handles the padding of features, boxes and masks for a dataset. It ensures that all sequences have the same length by padding them with zeros if necessary. The mixed features (maximum region number), boxes, and action features are assigned to respective lists. These lists will be used later in the program. The code also converts the features list to float32 data type.", + "type": "comment" + }, + "4972": { + "file_id": 427, + "content": " self.action_features_all = self.action_features_all.astype(\"float32\")\n self.image_mask_all = self.image_mask_all.astype(\"int64\")\n self.action_mask_all = self.action_mask_all.astype(\"int64\")\n self.spatials_all = self.spatials_all.astype(\"float32\")\n def prepare_train(self, idx):\n pass\n def prepare_test(self, idx):\n entry = self.caption_entries[idx]\n caption = entry[\"token\"]\n input_mask = entry[\"input_mask\"]\n segment_ids = entry[\"segment_ids\"]\n target_all = np.zeros(1000)\n for i, image_id in enumerate(self.image_entries):\n if image_id == entry[\"vid_id\"]:\n target_all[i] = 1\n return (\n caption,\n self.action_features_all,\n self.features_all,\n self.spatials_all,\n segment_ids,\n input_mask,\n self.image_mask_all,\n self.action_mask_all,\n target_all,\n )\n def __len__(self):\n return len(self.caption_entries)", + "type": "code", + "location": "/paddlevideo/loader/dataset/msrvtt.py:188-220" + }, + "4973": { + "file_id": 427, + "content": "This code initializes data types and provides methods for preparing training and testing data. The `prepare_train` method is left empty, while `prepare_test` takes an index, retrieves the corresponding entry, creates a target array, and returns various data arrays to be used in testing. The length of the dataset is determined by the number of caption entries.", + "type": "comment" + }, + "4974": { + "file_id": 428, + "content": "/paddlevideo/loader/dataset/oxford.py", + "type": "filepath" + }, + "4975": { + "file_id": 428, + "content": "This Python class defines the \"MonoDataset\" for PaddleVideo, initializes with file path, data prefix, and pipeline support. The code contains `load_file`, `prepare_train`, and `prepare_test` methods for dataset preparation and information retrieval.", + "type": "summary" + }, + "4976": { + "file_id": 428, + "content": "# Copyright Niantic 2019. 
Patent Pending. All rights reserved.\n#\n# This software is licensed under the terms of the Monodepth2 licence\n# which allows for non-commercial use only, the full terms of which are made\n# available in the LICENSE file.\nfrom __future__ import absolute_import, division, print_function\nimport copy\nfrom os import path as osp\nfrom PIL import Image\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\ndef pil_loader(path):\n # open path as file to avoid ResourceWarning\n # (https://github.com/python-pillow/Pillow/issues/835)\n with open(path, 'rb') as f:\n with Image.open(f) as img:\n return img.convert('RGB')\n@DATASETS.register()\nclass MonoDataset(BaseDataset):\n def __init__(self,\n file_path,\n data_prefix,\n pipeline,\n num_retries=0,\n suffix='.png',\n **kwargs):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, **kwargs)", + "type": "code", + "location": "/paddlevideo/loader/dataset/oxford.py:1-37" + }, + "4977": { + "file_id": 428, + "content": "This code is a Python class defining the \"MonoDataset\" dataset for PaddleVideo. It requires file path, data prefix, and pipeline for initialization, supports retries when accessing files, and utilizes the pil_loader function for loading RGB images from file paths with specified suffixes. The code also registers MonoDataset with DATASETS registry.", + "type": "comment" + }, + "4978": { + "file_id": 428, + "content": " def load_file(self):\n info = []\n with open(self.file_path, 'r') as f:\n for line in f:\n filename = line.strip() + self.suffix\n folder = osp.dirname(filename)\n frame_index = line.strip().split('/')[1]\n info.append(\n dict(data_path=self.data_prefix,\n filename=filename,\n folder=folder,\n frame_index=int(frame_index)))\n return info\n def prepare_train(self, idx):\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n results['imgs']['idx'] = idx\n return results['imgs'], results['day_or_night']\n def prepare_test(self, idx):\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n return results['imgs'], results['day_or_night']", + "type": "code", + "location": "/paddlevideo/loader/dataset/oxford.py:39-62" + }, + "4979": { + "file_id": 428, + "content": "The code defines three methods: `load_file`, `prepare_train`, and `prepare_test`. The `load_file` method reads a file containing information about image files, stripping off newline characters, and appending the necessary file suffix. It then appends a dictionary to the `info` list with data path, filename, folder location, and frame index. The `prepare_train` and `prepare_test` methods copy an entry from `self.info` and apply a pipeline before returning relevant information (e.g., images, labels).", + "type": "comment" + }, + "4980": { + "file_id": 429, + "content": "/paddlevideo/loader/dataset/skeleton.py", + "type": "filepath" + }, + "4981": { + "file_id": 429, + "content": "The code defines a SkeletonDataset class for action recognition, loading skeleton features and applying normalization operations. It imports libraries, registers the dataset, includes a logger, and has a class for loading skeleton data with optional label path and test mode parameter. The class loads and returns data for training or testing, preparing features based on training/testing needs and considering labels if available.", + "type": "summary" + }, + "4982": { + "file_id": 429, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport pickle\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass SkeletonDataset(BaseDataset):\n \"\"\"\n Skeleton dataset for action recognition.\n The dataset loads skeleton feature, and apply norm operatations.\n Args:\n file_path (str): Path to the index file.", + "type": "code", + "location": "/paddlevideo/loader/dataset/skeleton.py:1-34" + }, + "4983": { + "file_id": 429, + "content": "This code defines a SkeletonDataset class for action recognition. It loads skeleton features and applies normalization operations. It also imports necessary libraries, registers the dataset with DATASETS, and includes a logger for logging purposes.", + "type": "comment" + }, + "4984": { + "file_id": 429, + "content": " pipeline(obj): Define the pipeline of data preprocessing.\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n \"\"\"\n def __init__(self, file_path, pipeline, label_path=None, test_mode=False):\n self.label_path = label_path\n super().__init__(file_path, pipeline, test_mode=test_mode)\n def load_file(self):\n \"\"\"Load feature file to get skeleton information.\"\"\"\n logger.info(\"Loading data, it will take some moment...\")\n self.data = np.load(self.file_path)\n if self.label_path:\n if self.label_path.endswith('npy'):\n self.label = np.load(self.label_path)\n elif self.label_path.endswith('pkl'):\n with open(self.label_path, 'rb') as f:\n sample_name, self.label = pickle.load(f)\n else:\n logger.info(\n \"Label path not provided when test_mode={}, here just output predictions.\"", + "type": "code", + "location": "/paddlevideo/loader/dataset/skeleton.py:35-55" + }, + "4985": { + "file_id": 429, + "content": "This code defines a class for loading skeleton data. It takes file path, pipeline, and label path (optional) as input parameters. It also has an optional test mode parameter. The `__init__` method initializes the class with provided parameters. The `load_file` method loads feature files to get skeleton information and handles different file types for labels. If a label path is given and it ends with 'npy' or 'pkl', it will load the label; otherwise, it just outputs predictions.", + "type": "comment" + }, + "4986": { + "file_id": 429, + "content": " .format(self.test_mode))\n logger.info(\"Data Loaded!\")\n return self.data # used for __len__\n def prepare_train(self, idx):\n \"\"\"Prepare the feature for training/valid given index. \"\"\"\n results = dict()\n results['data'] = copy.deepcopy(self.data[idx])\n results['label'] = copy.deepcopy(self.label[idx])\n results = self.pipeline(results)\n return results['data'], results['label']\n def prepare_test(self, idx):\n \"\"\"Prepare the feature for test given index. 
\"\"\"\n results = dict()\n results['data'] = copy.deepcopy(self.data[idx])\n if self.label_path:\n results['label'] = copy.deepcopy(self.label[idx])\n results = self.pipeline(results)\n return results['data'], results['label']\n else:\n results = self.pipeline(results)\n return [results['data']]", + "type": "code", + "location": "/paddlevideo/loader/dataset/skeleton.py:56-78" + }, + "4987": { + "file_id": 429, + "content": "The code defines a class for loading, preparing, and returning data for training or testing. The `__getitem__` method loads the data and returns it when accessed by index. The `prepare_train` method prepares the feature for training/validation given an index. The `prepare_test` method prepares the feature for testing given an index, considering label if available.", + "type": "comment" + }, + "4988": { + "file_id": 430, + "content": "/paddlevideo/loader/dataset/slowfast_video.py", + "type": "filepath" + }, + "4989": { + "file_id": 430, + "content": "PaddleVideo's SFVideoDataset is a video dataset for action recognition, which extends BaseDataset with index file information and optional parameters. It prepares data for training by setting random seeds, loading index files, and appending entries before handling corrupted videos through retry mechanisms and calculating dataset size.", + "type": "summary" + }, + "4990": { + "file_id": 430, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass SFVideoDataset(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates", + "type": "code", + "location": "/paddlevideo/loader/dataset/slowfast_video.py:1-31" + }, + "4991": { + "file_id": 430, + "content": "This code snippet is from the PaddleVideo module and defines a class called SFVideoDataset. It is a video dataset for action recognition, loading raw videos and applying specified transforms on them. The index file contains multiple lines with information about each video. The class extends BaseDataset and registers it in the DATASETS registry. The code also includes license and copyright information.", + "type": "comment" + }, + "4992": { + "file_id": 430, + "content": " a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. 
code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n num_ensemble_views(int): temporal segment when multi-crop test\n num_spatial_crops(int): spatial crop number when multi-crop test\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n num_ensemble_views=1,\n num_spatial_crops=1,\n num_retries=5,\n num_samples_precise_bn=None,\n **kwargs,\n ):\n self.num_ensemble_views = num_ensemble_views\n self.num_spatial_crops = num_spatial_crops\n self.num_retries = num_retries\n self.num_samples_precise_bn = num_samples_precise_bn\n super().__init__(file_path, pipeline, **kwargs)", + "type": "code", + "location": "/paddlevideo/loader/dataset/slowfast_video.py:32-64" + }, + "4993": { + "file_id": 430, + "content": "The code defines a class that represents an index file containing paths to video files and their corresponding labels. It takes arguments such as the path to the index file, data transforms pipeline, and optional parameters for ensemble views and spatial crops. It also includes keyword arguments for the BaseDataset class. The super() function is used to call the parent class's constructor.", + "type": "comment" + }, + "4994": { + "file_id": 430, + "content": " #set random seed\n random.seed(0)\n np.random.seed(0)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n filename, labels = line_split\n if self.data_prefix is not None:\n filename = osp.join(self.data_prefix, filename)\n for tidx in range(self.num_ensemble_views):\n for sidx in range(self.num_spatial_crops):\n info.append(\n dict(\n filename=filename,\n labels=int(labels),\n temporal_sample_index=tidx,\n spatial_sample_index=sidx,\n temporal_num_clips=self.num_ensemble_views,\n spatial_num_clips=self.num_spatial_crops,", + "type": "code", + "location": "/paddlevideo/loader/dataset/slowfast_video.py:65-87" + }, + "4995": { + "file_id": 430, + "content": "Sets random seed for reproducibility, loads index file to get video information, and appends dictionary entries containing filename, labels, temporal, and spatial sample indices.", + "type": "comment" + }, + "4996": { + "file_id": 430, + "content": " ))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n short_cycle = False\n if isinstance(idx, tuple):\n idx, short_cycle_idx = idx\n short_cycle = True\n for ir in range(self.num_retries):\n try:\n #Multi-grid short cycle\n if short_cycle:\n results = copy.deepcopy(self.info[idx])\n results['short_cycle_idx'] = short_cycle_idx\n else:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))", + "type": "code", + "location": "/paddlevideo/loader/dataset/slowfast_video.py:88-112" + }, + "4997": { + "file_id": 430, + "content": "The code is responsible for preparing data for training in the context of a video dataset. It handles potential exceptions caused by reading corrupted video files and allows retries to avoid failures. 
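The retry-on-failure pattern described above (give a sample several chances, then fall back to a random other index so one corrupted video does not stall an epoch) can be written as a small helper. This is a hedged, cleaned-up sketch with a hypothetical `build_sample` callable, not the dataset's own method.

```python
import random

def load_with_retries(info, idx, build_sample, num_retries=5):
    """Try to build the sample at `idx`; on failure, retry with a random
    other index drawn from the same index list."""
    last_err = None
    for attempt in range(num_retries):
        try:
            return build_sample(info[idx])
        except Exception as err:            # e.g. corrupted or missing file
            last_err = err
            if attempt < num_retries - 1:
                idx = random.randint(0, len(info) - 1)
    raise RuntimeError(f"Failed after {num_retries} retries") from last_err
```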
The function takes an index as input, checks if it's a tuple or not, iterates over a specified number of retries, performs data processing using a pipeline, and handles any exceptions that occur during the process. If there are no exceptions, the results are returned; otherwise, the code logs an error message and tries again.", + "type": "comment" + }, + "4998": { + "file_id": 430, + "content": " idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'][0], results['imgs'][1], np.array(\n [results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for test given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'][0], results['imgs'][1], np.array(\n [results['labels']]), np.array([idx])\n def __len__(self):", + "type": "code", + "location": "/paddlevideo/loader/dataset/slowfast_video.py:113-137" + }, + "4999": { + "file_id": 430, + "content": "The code is implementing a retry mechanism for loading video files. If a corrupted file is encountered, it will attempt to load another random file up to the specified number of retries. If still unsuccessful, it will return an error. The function also includes a logging system to report exceptions and progress in retry attempts.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/5.json b/docs/data/5.json new file mode 100644 index 000000000..af2c9f392 --- /dev/null +++ b/docs/data/5.json @@ -0,0 +1,551 @@ +{ + "500": { + "file_id": 54, + "content": " if ('head.' + k) not in model.state_dict().keys():\n print(f'pretrained -----{k} -------is not in model')\n write_dict(state_dicts, 'model_for_infer.txt', **cfg)\n model.set_state_dict(state_dicts)\n inter_file = open(\n os.path.join(\n cfg.get(\"output_dir\", f\"./output/{cfg['model_name']}\"),\n 'inter_file.txt'), 'w')\n seen_seq = False\n with paddle.no_grad():\n # Get the current iteration scribbles\n for scribbles, first_scribble in get_scribbles():\n t_total = timeit.default_timer()\n f, h, w = images.shape[:3]\n if 'prev_label_storage' not in locals().keys():\n prev_label_storage = paddle.zeros([f, h, w])\n if len(annotated_frames(scribbles)) == 0:\n final_masks = prev_label_storage\n # ToDo To AP-kai: save_path传过来了\n submit_masks(cfg[\"save_path\"], final_masks.numpy(), images)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:88-109" + }, + "501": { + "file_id": 54, + "content": "This code segment checks if certain keys are present in the model's state dictionary. If not, it prints a message and writes the state dictionaries to a file named 'model_for_infer.txt'. It then sets the model's state dict with the state dictionaries, opens an inter_file.txt for writing, and initializes a variable 'seen_seq' as False. Inside a no_grad context, it retrieves scribbles and iterates over them, calculating total time, image shape, and checks if there are any annotated frames. 
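The weight-loading step described above boils down to comparing the keys of a pretrained state dict against the model's own `state_dict()` and reporting anything that will not be loaded. A minimal sketch follows; `paddle.load` and `set_state_dict` are standard Paddle calls, while the `'head.'` prefix handling and the variable names simply follow the summary above and are illustrative.

```python
import paddle

def load_pretrained(model, weight_path):
    """Load pretrained weights and report keys that have no match in the model."""
    state_dicts = paddle.load(weight_path)
    model_keys = set(model.state_dict().keys())
    for k in state_dicts.keys():
        # Weights saved from a bare head may need the 'head.' prefix to match.
        if k not in model_keys and ('head.' + k) not in model_keys:
            print(f'pretrained -----{k} -------is not in model')
    model.set_state_dict(state_dicts)
    return model
```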
If not, it assigns the previous label storage as final masks and submits those masks to the specified save path with corresponding images.", + "type": "comment" + }, + "502": { + "file_id": 54, + "content": " continue\n # if no scribbles return, keep masks in previous round\n start_annotated_frame = annotated_frames(scribbles)[0]\n pred_masks = []\n pred_masks_reverse = []\n if first_scribble: # If in the first round, initialize memories\n n_interaction = 1\n eval_global_map_tmp_dic = {}\n local_map_dics = ({}, {})\n total_frame_num = f\n else:\n n_interaction += 1\n inter_file.write(sequence + ' ' + 'interaction' +\n str(n_interaction) + ' ' + 'frame' +\n str(start_annotated_frame) + '\\n')\n if first_scribble: # if in the first round, extract pixel embbedings.\n if not seen_seq:\n seen_seq = True\n inter_turn = 1\n embedding_memory = []\n places = paddle.set_device('cpu')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:110-134" + }, + "503": { + "file_id": 54, + "content": "The code handles the first round of scribbles and initializes memory for future interactions. It writes information to an inter_file, extracts pixel embeddings if it's the first round, and sets up variables for tracking interactions and embedding memories.", + "type": "comment" + }, + "504": { + "file_id": 54, + "content": " for imgs in images:\n if cfg['PIPELINE'].get('test'):\n imgs = paddle.to_tensor([\n build_pipeline(cfg['PIPELINE'].test)({\n 'img1':\n imgs\n })['img1']\n ])\n else:\n imgs = paddle.to_tensor([imgs])\n if parallel:\n for c in model.children():\n frame_embedding = c.head.extract_feature(\n imgs)\n else:\n frame_embedding = model.head.extract_feature(\n imgs)\n embedding_memory.append(frame_embedding)\n del frame_embedding\n embedding_memory = paddle.concat(embedding_memory, 0)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:136-157" + }, + "505": { + "file_id": 54, + "content": "This code is iterating through each image in a batch and applying a pipeline transformation if testing mode is enabled. It then creates frame embeddings either by looping over model children or directly from the model head. The frame embeddings are appended to a list, concatenated, and stored as embedding_memory.", + "type": "comment" + }, + "506": { + "file_id": 54, + "content": " _, _, emb_h, emb_w = embedding_memory.shape\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n inter_turn += 1\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n ########\n scribble_masks = scribbles2mask(scribbles, (emb_h, emb_w))\n scribble_label = scribble_masks[start_annotated_frame]\n scribble_sample = {'scribble_label': scribble_label}\n scribble_sample = ToTensor_manet()(scribble_sample)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:158-176" + }, + "507": { + "file_id": 54, + "content": "The code initializes the reference frame embedding and handles cases where the annotation is present or not. 
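The "embedding memory" step summarized above amounts to running the feature extractor once per frame, concatenating the per-frame features along the batch axis, and later indexing the annotated frame when an interaction arrives. A hedged Paddle sketch, with `extract_feature` as a hypothetical callable standing in for the model head:

```python
import paddle

def build_embedding_memory(frames, extract_feature):
    """frames: iterable of per-frame arrays; extract_feature: callable mapping
    a batched frame tensor to a (1, C', H', W') feature tensor."""
    memory = []
    for img in frames:
        img_t = paddle.to_tensor(img, dtype='float32').unsqueeze(0)  # add batch dim
        memory.append(extract_feature(img_t))
    return paddle.concat(memory, axis=0)      # (num_frames, C', H', W')

# ref_frame_embedding = memory[start_annotated_frame].unsqueeze(0)
```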
It extracts the reference frame embedding from the embedding memory, reshapes it, and then creates a scribble sample with the scribble label for further processing.", + "type": "comment" + }, + "508": { + "file_id": 54, + "content": " # print(ref_frame_embedding, ref_frame_embedding.shape)\n scribble_label = scribble_sample['scribble_label']\n scribble_label = scribble_label.unsqueeze(0)\n model_name = cfg['model_name']\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n inter_file_path = os.path.join(\n output_dir, sequence, 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))\n if is_save_image:\n ref_scribble_to_show = scribble_label.squeeze().numpy()\n im_ = Image.fromarray(\n ref_scribble_to_show.astype('uint8')).convert('P', )\n im_.putpalette(_palette)\n ref_img_name = str(start_annotated_frame)\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im_.save(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:177-195" + }, + "509": { + "file_id": 54, + "content": "This code snippet is responsible for saving an interactive scribble image. It first retrieves the scribble label, then constructs the file path to save the image based on configuration settings and iteration parameters. If the directory doesn't exist, it creates one. Finally, it saves the scribble image using a specific palette.", + "type": "comment" + }, + "510": { + "file_id": 54, + "content": " os.path.join(inter_file_path,\n 'inter_' + ref_img_name + '.png'))\n if first_scribble:\n prev_label = None\n prev_label_storage = paddle.zeros([f, h, w])\n else:\n prev_label = prev_label_storage[start_annotated_frame]\n prev_label = prev_label.unsqueeze(0).unsqueeze(0)\n # check if no scribbles.\n if not first_scribble and paddle.unique(\n scribble_label).shape[0] == 1:\n print(\n 'not first_scribble and paddle.unique(scribble_label).shape[0] == 1'\n )\n print(paddle.unique(scribble_label))\n final_masks = prev_label_storage\n submit_masks(cfg[\"save_path\"], final_masks.numpy(), images)\n continue\n ###inteaction segmentation head\n if parallel:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:196-216" + }, + "511": { + "file_id": 54, + "content": "This code segment is part of a video modeling framework. It deals with handling scribbles and generating masks based on them. If there are no scribbles after the first one, it prints a message and continues execution by submitting the previous label storage as final masks. 
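The early-exit just described checks whether the scribble map actually contains any annotated pixels: if `paddle.unique` returns a single value (background only), the previous round's masks are reused. A minimal sketch of that check:

```python
import paddle

def has_new_scribbles(scribble_label):
    """scribble_label: integer tensor of per-pixel scribble ids; a single
    unique value means background only, i.e. no new user input this round."""
    return paddle.unique(scribble_label).shape[0] > 1

# lbl = paddle.zeros([1, 1, 64, 64], dtype='int64')
# has_new_scribbles(lbl)   # -> False: keep the previous round's masks
```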
This code also checks for parallel processing and seems to be part of an interaction segmentation head.", + "type": "comment" + }, + "512": { + "file_id": 54, + "content": " for c in model.children():\n tmp_dic, local_map_dics = c.head.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=scribble_label,\n prev_round_label=prev_label,\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n frame_num=[start_annotated_frame],\n first_inter=first_scribble)\n else:\n tmp_dic, local_map_dics = model.head.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=scribble_label,\n prev_round_label=prev_label,\n global_map_tmp_dic=eval_global_map_tmp_dic,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:217-234" + }, + "513": { + "file_id": 54, + "content": "This code is part of the Manet_Stage1 segmentation model in PaddleVideo. It iterates through the children of the model and calls the 'int_seghead' function to generate temporary dictionaries and local map dictionaries for each child. The 'int_seghead' function takes various parameters such as reference frame embedding, previous round label, global map temporary dictionary, etc., and returns a tuple containing the temporary dictionary and local map dictionaries. If there are no children in the model, it directly calls the 'int_seghead' function on the model's head for the same set of parameters.", + "type": "comment" + }, + "514": { + "file_id": 54, + "content": " local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n frame_num=[start_annotated_frame],\n first_inter=first_scribble)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n # np.unique(pred_label)\n # array([0], dtype=int64)\n prev_label_storage[start_annotated_frame] = float_(\n pred_label[0])\n if is_save_image: # save image\n pred_label_to_save = pred_label.squeeze(0).numpy()", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:235-254" + }, + "515": { + "file_id": 54, + "content": "Creates a temporary dictionary with local maps and other parameters. 
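The decode step in the code above (resize the per-class logits back to frame resolution, then take the per-pixel argmax) is the usual way to turn a low-resolution prediction into a full-size label map. A small Paddle sketch of just that step:

```python
import paddle
import paddle.nn as nn

def logits_to_label_map(logits, height, width):
    """logits: (N, num_classes, h, w) tensor of raw scores."""
    logits = nn.functional.interpolate(logits,
                                       size=(height, width),
                                       mode='bilinear',
                                       align_corners=True)
    return paddle.argmax(logits, axis=1)      # (N, height, width), int64

# x = paddle.rand([1, 3, 120, 214])
# mask = logits_to_label_map(x, 480, 854)     # e.g. a DAVIS 480p frame
```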
Obtains the predicted label for the sequence, interpolates it to original size, gets the argument of maximum value along axis 1, adds it to prediction masks list, stores the first predicted label for current frame in prev_label_storage if saving images, converts pred_label to numpy array and displays unique elements.", + "type": "comment" + }, + "516": { + "file_id": 54, + "content": " im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)\n imgname = str(start_annotated_frame)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im.save(os.path.join(inter_file_path, imgname + '.png'))\n #######################################\n if first_scribble:\n scribble_label = rough_ROI(scribble_label)\n ##############################\n ref_prev_label = pred_label.unsqueeze(0)\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = ref_frame_embedding\n for ii in range(start_annotated_frame + 1, total_frame_num):\n current_embedding = embedding_memory[ii]\n current_embedding = current_embedding.unsqueeze(0)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:255-274" + }, + "517": { + "file_id": 54, + "content": "The code segment is generating annotated images from predicted labels and creating scribble-based reference labels. It saves the images in a specified folder path, and initializes variables for iterating through the frames of the video.", + "type": "comment" + }, + "518": { + "file_id": 54, + "content": " prev_label = prev_label\n if parallel:\n for c in model.children():\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = c.head.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:275-292" + }, + "519": { + "file_id": 54, + "content": "The code iterates over the model's children and calls `prop_seghead` on each child, passing relevant embeddings and labels to calculate local maps and global maps for segmentation. It also takes into account nearest neighbors, interaction numbers, and annotated frame start.", + "type": "comment" + }, + "520": { + "file_id": 54, + "content": " frame_num=[ii],\n dynamic_seghead=c.head.dynamic_seghead)\n else:\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.head.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:293-310" + }, + "521": { + "file_id": 54, + "content": "Code segment is part of a larger function in PaddleVideo library. 
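The interpolate-then-argmax step summarised above can be sketched on its own (toy shapes; illustrative, not the repository's code):

import paddle
import paddle.nn.functional as F

# Per-object logits at feature resolution -> full-resolution integer label map.
pred_logits = paddle.rand([1, 3, 30, 52])                   # [1, n_objects, h', w']
pred = F.interpolate(pred_logits, size=(480, 854),
                     mode='bilinear', align_corners=True)   # [1, 3, 480, 854]
mask = paddle.argmax(pred, axis=1)                          # [1, 480, 854] object ids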
It checks if frame number is the start_annotated_frame, if so, it extracts the current embedding, else it calls head.prop_seghead to get temporary dictionary, global map temporary dictionary and local maps based on reference frame embedding, previous embedding, current embedding, scribble label and previous label using Paddle (a deep learning framework). It also considers K nearest neighbors and interaction number while performing its operation.", + "type": "comment" + }, + "522": { + "file_id": 54, + "content": " frame_num=[ii],\n dynamic_seghead=model.head.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n prev_label_storage[ii] = float_(pred_label[0])\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:311-327" + }, + "523": { + "file_id": 54, + "content": "This code segment is responsible for predicting the labels, creating masks and storing them in a list, and possibly saving an image. The predicted label is interpolated to match the frame size, converted to mask and added to the list of masks. This process continues for each frame. If saving images, the predicted labels are converted to an image format and saved as a grayscale PALETTE image.", + "type": "comment" + }, + "524": { + "file_id": 54, + "content": " imgname = str(ii)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im.save(os.path.join(inter_file_path,\n imgname + '.png'))\n #######################################\n prev_label = ref_prev_label\n prev_embedding = ref_frame_embedding\n #######\n # Propagation <-\n for ii in range(start_annotated_frame):\n current_frame_num = start_annotated_frame - 1 - ii\n current_embedding = embedding_memory[current_frame_num]\n current_embedding = current_embedding.unsqueeze(0)\n prev_label = prev_label\n if parallel:\n for c in model.children():\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = c.head.prop_seghead(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:328-347" + }, + "525": { + "file_id": 54, + "content": "Code snippet saves frames to disk, initializes variables for propagation loop, and begins the propagation process by iterating through frames from start_annotated_frame down to 0. 
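The zero-padded file naming and indexed-colour saving used in these snippets can be condensed into a small helper (a sketch that assumes a flat RGB palette list is available; the function name is hypothetical):

import os
from PIL import Image

def save_mask_png(mask_np, frame_idx, out_dir, palette):
    # Frame index 7 becomes '00007.png'; the mask is written as a
    # 'P'-mode (indexed colour) PNG so each object id maps to a palette colour.
    os.makedirs(out_dir, exist_ok=True)
    im = Image.fromarray(mask_np.astype('uint8')).convert('P')
    im.putpalette(palette)                       # palette: flat [r, g, b, ...] list
    im.save(os.path.join(out_dir, f"{frame_idx:05d}.png"))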
The model's children are then processed in parallel for segmentation head propagation.", + "type": "comment" + }, + "526": { + "file_id": 54, + "content": " ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,\n frame_num=[current_frame_num],\n dynamic_seghead=c.head.dynamic_seghead)\n else:\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.head.prop_seghead(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:348-365" + }, + "527": { + "file_id": 54, + "content": "This code appears to be part of a deep learning model for video segmentation. It is calling the \"prop_seghead\" function from the \"model.head\" object with specific parameters including reference frame embedding, previous and current embeddings, scribble label, and previous label. If certain conditions are met, additional parameters such as normalize nearest neighbor distances, use local map, sequence names, ground truth IDs, number of nearest neighbors, start annotated frame, and dynamic seghead are passed. The function returns a temporary dictionary, evaluation global map temporary dictionary, and local map dictionaries.", + "type": "comment" + }, + "528": { + "file_id": 54, + "content": " ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,\n frame_num=[current_frame_num],\n dynamic_seghead=model.head.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:366-383" + }, + "529": { + "file_id": 54, + "content": "This code is calculating the predictions for a specific sequence by using various embeddings, labels, and configurations. It involves interacting with multiple dictionaries, tensor operations, and a dynamic seghead model. The predicted label is then interpolated to match the resolution of the original frame.", + "type": "comment" + }, + "530": { + "file_id": 54, + "content": " size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks_reverse.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n ####\n prev_label_storage[current_frame_num] = float_(\n pred_label[0])\n ###\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)\n imgname = str(current_frame_num)\n while len(imgname) < 5:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:384-402" + }, + "531": { + "file_id": 54, + "content": "This code snippet is part of an image segmentation model. 
It extracts predictions from the model, converts them to masks, and stores previous label information for each frame. Additionally, it saves visualizations of these predictions as palette-colored images.", + "type": "comment" + }, + "532": { + "file_id": 54, + "content": " imgname = '0' + imgname\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im.save(os.path.join(inter_file_path,\n imgname + '.png'))\n pred_masks_reverse.reverse()\n pred_masks_reverse.extend(pred_masks)\n final_masks = paddle.concat(pred_masks_reverse, 0)\n submit_masks(cfg[\"save_path\"], final_masks.numpy(), images)\n t_end = timeit.default_timer()\n print('Total time for single interaction: ' +\n str(t_end - t_total))\n inter_file.close()\n return None", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:403-417" + }, + "533": { + "file_id": 54, + "content": "This code saves images and their corresponding masks, creates final masks, and writes the total time for a single interaction. It handles non-existent folders by creating them before saving images.", + "type": "comment" + }, + "534": { + "file_id": 55, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py", + "type": "filepath" + }, + "535": { + "file_id": 55, + "content": "This function computes L2 distances, applies nearest neighbor attention and feature extraction, considers padding, uses local search windows and average pooling. It introduces a custom layer, calculates nearest neighbor features with embeddings, updates global map dictionaries, and processes inputs to return output dictionaries after calculations on local distance maps for each frame. The code segment updates global and local map dictionaries, calculates frame embeddings and masks, obtains segmentation predictions, and processes data for improved video processing accuracy.", + "type": "summary" + }, + "536": { + "file_id": 55, + "content": "import numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils.manet_utils import int_, float_, long_, load\nfrom EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_\n#############################################################GLOBAL_DIST_MAP\nMODEL_UNFOLD = True\nWRONG_LABEL_PADDING_DISTANCE = 1e20\ndef _pairwise_distances(x, y, ys=None):\n \"\"\"Computes pairwise squared l2 distances between tensors x and y.\n Args:\n x: Tensor of shape [n, feature_dim].\n y: Tensor of shape [m, feature_dim].\n Returns:\n Float32 distances tensor of shape [n, m].\n \"\"\"\n xs = paddle.sum(x * x, 1)\n xs = xs.unsqueeze(1)\n if ys is None:\n ys = paddle.sum(y * y, 1)\n ys = ys.unsqueeze(0)\n else:\n ys = ys\n d = xs + ys - 2. * paddle.matmul(x, paddle.t(y))\n return d, ys\n##################\ndef _flattened_pairwise_distances(reference_embeddings, query_embeddings, ys):\n \"\"\"Calculates flattened tensor of pairwise distances between ref and query.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:1-37" + }, + "537": { + "file_id": 55, + "content": "This code defines a function that calculates pairwise squared L2 distances between two tensors. It takes in two tensors, x and y, and optionally a third tensor ys. The function first computes the sum of squares for each row in tensor x and stores them in xs. If ys is None, it then computes the sum of squares for each row in tensor y and stores them in ys. 
Otherwise, it uses the provided ys. Finally, the function calculates the pairwise distances using the formula xs + ys - 2 * paddle.matmul(x, paddle.t(y)).", + "type": "comment" + }, + "538": { + "file_id": 55, + "content": " Args:\n reference_embeddings: Tensor of shape [..., embedding_dim],\n the embedding vectors for the reference frame\n query_embeddings: Tensor of shape [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n Returns:\n A distance tensor of shape [reference_embeddings.size / embedding_dim,\n query_embeddings.size / embedding_dim]\n \"\"\"\n embedding_dim = query_embeddings.shape[-1]\n reference_embeddings = reference_embeddings.reshape([-1, embedding_dim])\n first_dim = -1\n query_embeddings = query_embeddings.reshape([first_dim, embedding_dim])\n dists, ys = _pairwise_distances(query_embeddings, reference_embeddings, ys)\n return dists, ys\ndef _nn_features_per_object_for_chunk(reference_embeddings, query_embeddings,\n wrong_label_mask, k_nearest_neighbors,\n ys):\n \"\"\"Extracts features for each object using nearest neighbor attention.\n Args:\n reference_embeddings: Tensor of shape [n_chunk, embedding_dim],", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:38-60" + }, + "539": { + "file_id": 55, + "content": "This function takes reference and query embeddings as input, calculates pairwise distances between them using the _pairwise_distances function, and returns the distances in dists and ys. The _nn_features_per_object_for_chunk function extracts features for each object using nearest neighbor attention, taking reference embeddings, query embeddings, wrong_label_mask, k_nearest_neighbors, and ys as input.", + "type": "comment" + }, + "540": { + "file_id": 55, + "content": " the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [m_chunk, embedding_dim], the embedding\n vectors for the query frames.\n wrong_label_mask:\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m_chunk, n_objects, feature_dim].\n \"\"\"\n # reference_embeddings_key = reference_embeddings\n # query_embeddings_key = query_embeddings\n dists, ys = _flattened_pairwise_distances(reference_embeddings,\n query_embeddings, ys)\n dists = (paddle.unsqueeze(dists, 1) +\n paddle.unsqueeze(float_(wrong_label_mask), 0) *\n WRONG_LABEL_PADDING_DISTANCE)\n if k_nearest_neighbors == 1:\n features = paddle.min(dists, 2, keepdim=True)\n else:\n dists, _ = paddle.topk(-dists, k=k_nearest_neighbors, axis=2)\n dists = -dists\n valid_mask = (dists < WRONG_LABEL_PADDING_DISTANCE)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:61-83" + }, + "541": { + "file_id": 55, + "content": "This function calculates the pairwise distances between reference and query embeddings, selects the nearest neighbors based on those distances, and returns the nearest neighbor features. 
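The formula restated here is the usual expansion ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2<x_i, y_j>; a minimal self-contained sketch of the same computation (not the repository's function, which also caches ys between chunks):

import paddle

def pairwise_sq_l2(x, y):
    # x: [n, d], y: [m, d]  ->  [n, m] squared euclidean distances.
    xs = paddle.sum(x * x, axis=1).unsqueeze(1)          # [n, 1]
    ys = paddle.sum(y * y, axis=1).unsqueeze(0)          # [1, m]
    return xs + ys - 2.0 * paddle.matmul(x, y, transpose_y=True)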
It handles cases with different numbers of reference and query embeddings by padding with a specified distance value for missing embeddings.", + "type": "comment" + }, + "542": { + "file_id": 55, + "content": " masked_dists = dists * valid_mask.float()\n pad_dist = paddle.max(masked_dists, axis=2, keepdim=True)[0].tile(\n (1, 1, masked_dists.shape[-1]))\n dists = paddle.where(valid_mask, dists, pad_dist)\n # take mean of distances\n features = paddle.mean(dists, axis=2, keepdim=True)\n return features, ys\n###\ndef _selected_pixel(ref_labels_flat, ref_emb_flat):\n index_list = paddle.arange(len(ref_labels_flat))\n index_list = index_list\n index_ = paddle.masked_select(index_list, ref_labels_flat != -1)\n index_ = long_(index_)\n ref_labels_flat = paddle.index_select(ref_labels_flat, index_, 0)\n ref_emb_flat = paddle.index_select(ref_emb_flat, index_, 0)\n return ref_labels_flat, ref_emb_flat\n###\ndef _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat,\n query_embeddings_flat,\n reference_labels_flat,\n ref_obj_ids,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:84-113" + }, + "543": { + "file_id": 55, + "content": "The code calculates the mean of distances between valid points and assigns the result to \"features\". The function _selected_pixel() selects pixels from flattened arrays where reference labels are not -1. The function _nearest_neighbor_features_per_object_in_chunks() operates on flattened embeddings, labels, and object ids to compute nearest neighbor features per object in chunks.", + "type": "comment" + }, + "544": { + "file_id": 55, + "content": " k_nearest_neighbors,\n n_chunks, **cfg):\n \"\"\"Calculates the nearest neighbor features per object in chunks to save mem.\n Uses chunking to bound the memory use.\n Args:\n reference_embeddings_flat: Tensor of shape [n, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings_flat: Tensor of shape [m, embedding_dim], the embedding\n vectors for the query frames.\n reference_labels_flat: Tensor of shape [n], the class labels of the\n reference frame.\n ref_obj_ids: int tensor of unique object ids in the reference labels.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m, n_objects, feature_dim].\n \"\"\"\n # reference_embeddings_flat = reference_embeddings_flat.cpu()", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:114-134" + }, + "545": { + "file_id": 55, + "content": "This function calculates the nearest neighbor features per object in chunks to save memory, using chunking for bounding memory usage. It takes embedding vectors for reference and query frames, their class labels, unique object IDs, number of nearest neighbors, and number of chunks as input. 
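A hedged sketch of the k-nearest selection described above: the k smallest distances are obtained by taking topk of the negated tensor and then averaged (toy shapes; the original additionally masks out padded invalid entries before averaging):

import paddle

def knn_feature(dists, k):
    # dists: [m_query, n_objects, n_reference]; keep the k smallest per (pixel, object).
    if k == 1:
        return paddle.min(dists, axis=2, keepdim=True)
    top_neg, _ = paddle.topk(-dists, k=k, axis=2)        # largest of -d == smallest of d
    return paddle.mean(-top_neg, axis=2, keepdim=True)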
The function returns a tensor of nearest neighbor features for the query frames.", + "type": "comment" + }, + "546": { + "file_id": 55, + "content": " # query_embeddings_flat = query_embeddings_flat.cpu()\n # reference_labels_flat = reference_labels_flat.cpu()\n # ref_obj_ids = ref_obj_ids.cpu()\n chunk_size = int_(\n np.ceil((float_(query_embeddings_flat.shape[0]) / n_chunks).numpy()))\n if cfg.get('test_mode'):\n reference_labels_flat, reference_embeddings_flat = _selected_pixel(\n reference_labels_flat, reference_embeddings_flat)\n wrong_label_mask = (reference_labels_flat != paddle.unsqueeze(\n ref_obj_ids, 1))\n all_features = []\n for n in range(n_chunks):\n if n == 0:\n ys = None\n if n_chunks == 1:\n query_embeddings_flat_chunk = query_embeddings_flat\n else:\n chunk_start = n * chunk_size\n chunk_end = (n + 1) * chunk_size\n query_embeddings_flat_chunk = query_embeddings_flat[\n chunk_start:chunk_end]\n features, ys = _nn_features_per_object_for_chunk(\n reference_embeddings_flat, query_embeddings_flat_chunk,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:135-158" + }, + "547": { + "file_id": 55, + "content": "This code splits the query embeddings into multiple chunks, depending on the number of chunks specified. It then applies a function to each chunk and appends the results to the all_features list. If in test mode, it selects pixels from the reference and query embeddings. It also creates a wrong label mask for the reference labels and query embeddings.", + "type": "comment" + }, + "548": { + "file_id": 55, + "content": " wrong_label_mask, k_nearest_neighbors, ys)\n all_features.append(features)\n if n_chunks == 1:\n nn_features = all_features[0]\n else:\n nn_features = paddle.concat(all_features, axis=0)\n return nn_features\ndef nearest_neighbor_features_per_object(reference_embeddings,\n query_embeddings,\n reference_labels,\n k_nearest_neighbors,\n gt_ids=None,\n n_chunks=100,\n **cfg):\n \"\"\"Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n reference_embeddings: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [n_query_images, height, width,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:159-181" + }, + "549": { + "file_id": 55, + "content": "This code calculates the nearest neighbor features per object using reference embeddings, query embeddings, and reference labels. It takes into account k-nearest neighbors and can handle a specified number of chunks for subsampling. The function returns the nearest neighbor features in the form of a tensor.", + "type": "comment" + }, + "550": { + "file_id": 55, + "content": " embedding_dim], the embedding vectors for the query frames.\n reference_labels: Tensor of shape [height, width, 1], the class labels of\n the reference frame.\n max_neighbors_per_object: Integer, the maximum number of candidates\n for the nearest neighbor query per object after subsampling,\n or 0 for no subsampling.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n gt_ids: Int tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame. 
If None, it will be derived from\n reference_labels.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [n_query_images, height, width, n_objects, feature_dim].\n gt_ids: An int32 tensor of the unique sorted object ids present\n in the reference labels.\n \"\"\"\n # reference_embeddings = reference_embeddings.detach().cpu()\n # query_embeddings = query_embeddings.detach().cpu()", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:182-201" + }, + "551": { + "file_id": 55, + "content": "This code calculates nearest neighbors for query frames based on the given embedding vectors. It takes input parameters like reference frame class labels, maximum number of candidates, and number of nearest neighbors to use. The function returns nearest neighbor features, unique sorted object ids present in the reference labels, and potentially gt_ids if provided.", + "type": "comment" + }, + "552": { + "file_id": 55, + "content": " # reference_labels = reference_labels.detach().cpu()\n assert (reference_embeddings.shape[:2] == reference_labels.shape[:2])\n h, w, _ = query_embeddings.shape\n reference_labels_flat = reference_labels.reshape([-1])\n if gt_ids is None:\n ref_obj_ids = paddle.unique(reference_labels_flat)[-1]\n ref_obj_ids = np.arange(0, ref_obj_ids + 1)\n gt_ids = paddle.to_tensor(ref_obj_ids)\n gt_ids = int_(gt_ids)\n else:\n gt_ids = int_(paddle.arange(0, gt_ids + 1))\n embedding_dim = query_embeddings.shape[-1]\n query_embeddings_flat = query_embeddings.reshape([-1, embedding_dim])\n reference_embeddings_flat = reference_embeddings.reshape(\n [-1, embedding_dim])\n nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat,\n reference_labels_flat, gt_ids, k_nearest_neighbors, n_chunks, **cfg)\n nn_features_dim = nn_features.shape[-1]\n nn_features = nn_features.reshape(\n [1, h, w, gt_ids.shape[0], nn_features_dim])", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:202-224" + }, + "553": { + "file_id": 55, + "content": "This code is reshaping tensors and calculating nearest neighbor features for each object in chunks. It first reshapes the embeddings, then applies a function to find the closest neighbors and returns a tensor of these features. 
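The chunking idea used here can be illustrated with a self-contained toy (hypothetical function name and sizes; not the repository's implementation):

import numpy as np
import paddle

def chunked_min_dist(ref, query, n_chunks=4):
    # Process query rows chunk by chunk so the full [m, n] distance
    # matrix never has to be materialised at once.
    step = int(np.ceil(query.shape[0] / n_chunks))
    outs = []
    for i in range(0, query.shape[0], step):
        q = query[i:i + step]                                              # [c, d]
        d = paddle.sum((q.unsqueeze(1) - ref.unsqueeze(0)) ** 2, axis=-1)  # [c, n]
        outs.append(paddle.min(d, axis=1, keepdim=True))
    return paddle.concat(outs, axis=0)                                     # [m, 1]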
This process is done in chunks for efficiency and memory management.", + "type": "comment" + }, + "554": { + "file_id": 55, + "content": " return nn_features.cuda(), gt_ids\n########################################################################LOCAL_DIST_MAP\ndef local_pairwise_distances2(x, y, max_distance=9):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Naive implementation using map_fn.\n Used as a slow fallback for when correlation_cost is not available.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 distances tensor of shape\n [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n ori_h, ori_w, _ = x.shape\n x = paddle.transpose(x, [2, 0, 1]).unsqueeze(0)\n x = F.avg_pool2d(x, (2, 2), (2, 2))\n y = paddle.transpose(y, [2, 0, 1]).unsqueeze(0)\n y = F.avg_pool2d(y, (2, 2), (2, 2))\n _, channels, height, width = x.shape\n padding_val = 1e20\n padded_y = F.pad(y,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:225-252" + }, + "555": { + "file_id": 55, + "content": "This function calculates pairwise squared L2 distances using a local search window, with naive implementation using map_fn. It is used as a fallback when correlation_cost is not available. Inputs are tensors x and y of shape [height, width, feature\\_dim]. It returns a tensor of squared distance values shaped [height, width, (2 * max\\_distance + 1) ** 2], where max\\_distance is an integer representing the maximum distance in pixel coordinates per dimension. The function also applies average pooling with a 2x2 filter and pads the tensors x and y before calculating the distances.", + "type": "comment" + }, + "556": { + "file_id": 55, + "content": " (max_distance, max_distance, max_distance, max_distance),\n mode='constant',\n value=padding_val)\n offset_y = F.unfold(padded_y, kernel_sizes=[height, width]).reshape(\n [1, channels, height, width, -1])\n x = x.reshape([1, channels, height, width, 1])\n minus = x - offset_y\n dists = paddle.sum(paddle.multiply(minus, minus),\n axis=1).reshape([1, height, width,\n -1]).transpose([0, 3, 1, 2])\n dists = (paddle.nn.functional.sigmoid(dists) - 0.5) * 2\n dists = F.interpolate(dists,\n size=[ori_h, ori_w],\n mode='bilinear',\n align_corners=True)\n dists = dists.squeeze(0).transpose([1, 2, 0])\n return dists\ndef local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding,\n query_embedding,\n prev_frame_labels,\n gt_ids,\n max_distance=12):\n \"\"\"Computes nearest neighbor features while only allowing local matches.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:253-278" + }, + "557": { + "file_id": 55, + "content": "This code calculates the nearest neighbor features for local matches in a video. It takes in parameters like previous frame embedding, query embedding, previous frame labels, and ground truth IDs. The function computes distances between frames using Sigmoid activation and bilinear interpolation. 
Max distance determines the maximum allowed distance for a match to be considered valid.", + "type": "comment" + }, + "558": { + "file_id": 55, + "content": " Args:\n prev_frame_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the last frame.\n query_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the query frames.\n prev_frame_labels: Tensor of shape [height, width, 1], the class labels of\n the previous frame.\n gt_ids: Int Tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame.\n max_distance: Integer, the maximum distance allowed for local matching.\n Returns:\n nn_features: A float32 np.array of nearest neighbor features of shape\n [1, height, width, n_objects, 1].\n \"\"\"\n # print(query_embedding.shape, prev_frame_embedding.shape)\n # print(query_embedding.place, prev_frame_embedding.place)\n # query_embedding = query_embedding.cpu()\n # prev_frame_embedding = prev_frame_embedding.cpu()\n # prev_frame_labels = prev_frame_labels.cpu()\n # print(prev_frame_labels.place, prev_frame_embedding.place, query_embedding.place)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:279-298" + }, + "559": { + "file_id": 55, + "content": "This code calculates the nearest neighbor features by comparing embedding vectors of query frames with the last frame. It takes input tensors for embedding vectors, previous frame labels, and ground truth IDs along with a maximum distance limit. The function returns the nearest neighbor features in a specific shape.", + "type": "comment" + }, + "560": { + "file_id": 55, + "content": " d = local_pairwise_distances2(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance)\n height, width = prev_frame_embedding.shape[:2]\n if MODEL_UNFOLD:\n labels = float_(prev_frame_labels).transpose([2, 0, 1]).unsqueeze(0)\n padded_labels = F.pad(labels, (\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n ))\n offset_labels = F.unfold(padded_labels,\n kernel_sizes=[height, width],\n strides=[2,\n 2]).reshape([height, width, -1, 1])\n offset_masks = paddle.equal(\n offset_labels,\n float_(gt_ids).unsqueeze(0).unsqueeze(0).unsqueeze(0))\n else:\n masks = paddle.equal(prev_frame_labels,\n gt_ids.unsqueeze(0).unsqueeze(0))\n padded_masks = nn.functional.pad(masks, (", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:300-325" + }, + "561": { + "file_id": 55, + "content": "Code snippet performs local pairwise distance calculation between query and previous frame embeddings. If MODEL_UNFOLD is enabled, it generates offset labels by unfolding padded labels with kernel sizes matching height and width of the previous frame embedding. It then creates offset masks by checking equality between offset labels and gt_ids. If MODEL_UNFOLD is not enabled, it directly creates masks by comparing previous frame labels and gt_ids. 
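The per-object mask construction mentioned here is simply a broadcast equality test of the label map against the object ids; a minimal sketch with toy values:

import paddle

labels = paddle.to_tensor([[0., 1.], [2., 1.]]).unsqueeze(-1)      # [h, w, 1]
obj_ids = paddle.to_tensor([0., 1., 2.])                           # [n_objs]
masks = paddle.equal(labels, obj_ids.unsqueeze(0).unsqueeze(0))    # [h, w, n_objs], bool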
Finally, it pads the masks using nn.functional.pad with specified padding values.", + "type": "comment" + }, + "562": { + "file_id": 55, + "content": " 0,\n 0,\n max_distance,\n max_distance,\n max_distance,\n max_distance,\n ))\n offset_masks = []\n for y_start in range(2 * max_distance + 1):\n y_end = y_start + height\n masks_slice = padded_masks[y_start:y_end]\n for x_start in range(2 * max_distance + 1):\n x_end = x_start + width\n offset_mask = masks_slice[:, x_start:x_end]\n offset_masks.append(offset_mask)\n offset_masks = paddle.stack(offset_masks, axis=2)\n d_tiled = d.unsqueeze(-1).tile((1, 1, 1, gt_ids.shape[0]))\n pad = paddle.ones_like(d_tiled)\n d_masked = paddle.where(offset_masks, d_tiled, pad)\n dists = paddle.min(d_masked, axis=2)\n dists = dists.reshape([1, height, width, gt_ids.shape[0], 1])\n return dists\n##############################################################\n#################\nclass _res_block(nn.Layer):\n def __init__(self, in_dim, out_dim, **cfg):\n super(_res_block, self).__init__()", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:326-358" + }, + "563": { + "file_id": 55, + "content": "The code is performing feature extraction and masking for a specific model. It first tiles input data, then applies offset masks to selected regions, and finally extracts minimum distances using the tiled and masked data. The result is a new set of distances which are then reshaped for further processing.", + "type": "comment" + }, + "564": { + "file_id": 55, + "content": " self.conv1 = nn.Conv2D(in_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu1 = nn.ReLU()\n self.bn1 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom'])\n self.conv2 = nn.Conv2D(out_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu2 = nn.ReLU()\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom'])\n def forward(self, x):\n res = x\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x += res\n return x\n####################\nclass IntSegHead(nn.Layer):\n def __init__(self, in_dim, emb_dim, **cfg):\n super(IntSegHead, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:359-390" + }, + "565": { + "file_id": 55, + "content": "This code defines a convolutional neural network (CNN) architecture for image processing tasks. The class `IntVOS` contains two 2D convolutions, batch normalization, and ReLU activations in its forward pass. The `IntSegHead` class initializes another CNN with different parameters, which seems to be a part of the overall model. 
Both classes extend `nn.Layer`, indicating they are PaddlePaddle's version of PyTorch layers.", + "type": "comment" + }, + "566": { + "file_id": 55, + "content": " emb_dim,\n kernel_size=7,\n stride=1,\n padding=3)\n self.bn1 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg['train_bn_mom'])\n self.relu1 = nn.ReLU(True)\n self.res1 = _res_block(emb_dim, emb_dim, **cfg)\n self.res2 = _res_block(emb_dim, emb_dim, **cfg)\n self.conv2 = nn.Conv2D(256,\n emb_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.bn2 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg['train_bn_mom'])\n self.relu2 = nn.ReLU(True)\n self.conv3 = nn.Conv2D(emb_dim, 1, 1, 1)\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.res1(x)\n x = self.res2(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x = self.conv3(x)\n return x", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:391-418" + }, + "567": { + "file_id": 55, + "content": "This code defines a custom Convolutional Neural Network (CNN) layer for extracting features from input images. It consists of multiple convolutions, batch normalizations, and ReLU activations. The input image is first passed through several convolution layers with different configurations, followed by batch normalization and ReLU activation functions to improve model performance. Finally, the output is returned after passing it through a single convolution layer and another batch normalization and ReLU activation.", + "type": "comment" + }, + "568": { + "file_id": 55, + "content": "class _split_separable_conv2d(nn.Layer):\n def __init__(self, in_dim, out_dim, kernel_size=7, **cfg):\n super(_split_separable_conv2d, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n in_dim,\n kernel_size=kernel_size,\n stride=1,\n padding=int((kernel_size - 1) / 2),\n groups=in_dim)\n self.relu1 = nn.ReLU(True)\n self.bn1 = paddle.nn.BatchNorm2D(in_dim, momentum=cfg['train_bn_mom'])\n self.conv2 = nn.Conv2D(in_dim, out_dim, kernel_size=1, stride=1)\n self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom'])\n kaiming_normal_(self.conv1.weight, mode='fan_out', nonlinearity='relu')\n kaiming_normal_(self.conv2.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:421-442" + }, + "569": { + "file_id": 55, + "content": "This code defines a custom layer _split_separable_conv2d, which consists of two convolutional layers followed by ReLU and batch normalization. The first convolution is performed with the same number of input and output channels, while the second has fewer output channels than input dimensions. 
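The "split separable" pattern discussed here is a depthwise convolution (groups equal to the channel count) followed by a 1x1 pointwise convolution; a sketch with assumed channel sizes:

import paddle.nn as nn

in_dim, out_dim = 103, 100           # assumed sizes, for illustration only
depthwise = nn.Conv2D(in_dim, in_dim, kernel_size=7, padding=3, groups=in_dim)
pointwise = nn.Conv2D(in_dim, out_dim, kernel_size=1)
# A dense 7x7 conv would need in_dim * out_dim * 49 weights; the pair above needs
# only in_dim * 49 + in_dim * out_dim, which is the parameter saving described here.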
This architecture helps to reduce parameters and computational cost in a deep learning model for image processing tasks.", + "type": "comment" + }, + "570": { + "file_id": 55, + "content": " x = self.bn2(x)\n x = self.relu2(x)\n return x\nclass DynamicSegHead(nn.Layer):\n def __init__(self, in_dim, embed_dim, **cfg):\n super(DynamicSegHead, self).__init__()\n self.layer1 = _split_separable_conv2d(in_dim, embed_dim, **cfg)\n self.layer2 = _split_separable_conv2d(embed_dim, embed_dim, **cfg)\n self.layer3 = _split_separable_conv2d(embed_dim, embed_dim, **cfg)\n self.layer4 = _split_separable_conv2d(embed_dim, embed_dim, **cfg)\n self.conv = nn.Conv2D(embed_dim, 1, 1, 1)\n kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n x = self.conv(x)\n return x\nfrom ..registry import HEADS\n\"\"\"\n覆盖原理\nclass c1:\n def __init__(self):\n self.a = 1\nclass c2(c1):\n def __init__(self):\n super(c2, self).__init__()\n self.a = 2\nc = c2()\nprint(c.a)\n\"\"\"\n@HEADS.register()\nclass IntVOS(nn.Layer):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:443-488" + }, + "571": { + "file_id": 55, + "content": "The code defines two classes: IntVOS and DynamicSegHead. IntVOS is a subclass of nn.Layer and utilizes the DynamicSegHead class as its segmentation head. DynamicSegHead is also a subclass of nn.Layer and consists of several layers (layer1, layer2, layer3, layer4) that apply separable convolutions to the input. Finally, there's a nn.Conv2D layer with Kaiming initialization for the output. This architecture can be used for segmentation tasks in computer vision applications.", + "type": "comment" + }, + "572": { + "file_id": 55, + "content": " def __init__(self, feature_extracter, **cfg):\n super(IntVOS, self).__init__()\n self.feature_extracter = feature_extracter ##embedding extractor\n self.feature_extracter.cls_conv = nn.Sequential()\n self.feature_extracter.upsample4 = nn.Sequential()\n self.semantic_embedding = None\n self.seperate_conv = nn.Conv2D(cfg['model_aspp_outdim'],\n cfg['model_aspp_outdim'],\n kernel_size=3,\n stride=1,\n padding=1,\n groups=cfg['model_aspp_outdim'])\n self.bn1 = paddle.nn.BatchNorm2D(cfg['model_aspp_outdim'],\n momentum=cfg['train_bn_mom'])\n self.relu1 = nn.ReLU(True)\n self.embedding_conv = nn.Conv2D(cfg['model_aspp_outdim'],\n cfg['model_semantic_embedding_dim'], 1,\n 1)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:489-506" + }, + "573": { + "file_id": 55, + "content": "This code defines a class called IntVOS. The constructor takes in a feature_extracter and **cfg parameters, initializes the instance variables, and adds layers to the feature_extracter if required. 
It also initializes the embedding convolution layer for semantic embedding extraction.", + "type": "comment" + }, + "574": { + "file_id": 55, + "content": " self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(cfg['model_semantic_embedding_dim'],\n momentum=cfg['train_bn_mom'])\n self.semantic_embedding = nn.Sequential(*[\n self.seperate_conv, self.bn1, self.relu1, self.embedding_conv,\n self.bn2, self.relu2\n ])\n for m in self.semantic_embedding:\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')\n self.dynamic_seghead = DynamicSegHead(\n in_dim=cfg['model_semantic_embedding_dim'] + 3,\n embed_dim=cfg['model_head_embedding_dim'],\n **cfg) # propagation segm head\n if cfg['model_useintseg']:\n self.inter_seghead = IntSegHead(\n in_dim=cfg['model_semantic_embedding_dim'] + 3,\n emb_dim=cfg['model_head_embedding_dim'],\n **cfg)\n else:\n self.inter_seghead = DynamicSegHead(\n in_dim=cfg['model_semantic_embedding_dim'] + 2,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:507-530" + }, + "575": { + "file_id": 55, + "content": "The code initializes and configures the layers for semantic segmentation. It creates a ReLU activation function, a batch normalization layer with specified parameters, and a sequential neural network containing the separate convolution, first batch norm, first ReLU, embedding convolution, second batch norm, and second ReLU. The code also initializes the dynamic segmentation head and (optionally) an inter-segmentation head depending on the config's 'model_useintseg' flag.", + "type": "comment" + }, + "576": { + "file_id": 55, + "content": " embed_dim=cfg['model_head_embedding_dim'],\n **cfg) # interaction segm head\n self.pretrained = cfg.get('pretrained', None)\n self.cfg = cfg\n def init_weights(self):\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n self.set_state_dict(load(self.pretrained, self.state_dict()))\n print('loaded pretrained model')\n def loss(self, **kwargs):\n return self.loss_func(**kwargs)\n def forward(self,\n x=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None):\n x = self.extract_feature(x)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:531-559" + }, + "577": { + "file_id": 55, + "content": "This code defines a class for a model head that takes input, initializes weights (loading pretrained if available), and calculates the loss during forward pass. It uses various input parameters such as x, ref_scribble_label, previous_frame_mask, etc. 
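For reference, the same Kaiming-normal initialisation can also be expressed through Paddle's ParamAttr at construction time (a sketch with toy channel sizes; the repository instead applies an in-place kaiming_normal_ helper to already-built layers):

import paddle
import paddle.nn as nn

conv = nn.Conv2D(
    in_channels=256, out_channels=100, kernel_size=1,
    weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingNormal()))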
The forward function extracts features from the input, and is responsible for losses related to the model head.", + "type": "comment" + }, + "578": { + "file_id": 55, + "content": " # print('extract_feature:', x.mean().item())\n ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split(\n x, num_or_sections=3, axis=0)\n if global_map_tmp_dic is None:\n dic = self.prop_seghead(\n ref_frame_embedding,\n previous_frame_embedding,\n current_frame_embedding,\n ref_scribble_label,\n previous_frame_mask,\n normalize_nearest_neighbor_distances,\n use_local_map,\n seq_names,\n gt_ids,\n k_nearest_neighbors,\n global_map_tmp_dic,\n local_map_dics,\n interaction_num,\n start_annotated_frame,\n frame_num,\n self.dynamic_seghead,\n )\n return dic\n else:\n dic, global_map_tmp_dic = self.prop_seghead(\n ref_frame_embedding,\n previous_frame_embedding,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:560-588" + }, + "579": { + "file_id": 55, + "content": "This code is splitting input feature x into three parts (ref, previous, current frame embeddings), then calling the prop_seghead function to compute a dictionary of results. If global_map_tmp_dic is None, it returns only the dictionary; otherwise, it also updates global_map_tmp_dic and returns both.", + "type": "comment" + }, + "580": { + "file_id": 55, + "content": " current_frame_embedding,\n ref_scribble_label,\n previous_frame_mask,\n normalize_nearest_neighbor_distances,\n use_local_map,\n seq_names,\n gt_ids,\n k_nearest_neighbors,\n global_map_tmp_dic,\n local_map_dics,\n interaction_num,\n start_annotated_frame,\n frame_num,\n self.dynamic_seghead,\n )\n return dic, global_map_tmp_dic\n def extract_feature(self, x):\n x = self.feature_extracter(x)\n x = self.semantic_embedding(x)\n return x\n def prop_seghead(\n self,\n ref_frame_embedding=None,\n previous_frame_embedding=None,\n current_frame_embedding=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:589-622" + }, + "581": { + "file_id": 55, + "content": "This code defines a class with three methods: \"IntVOS\", \"extract_feature\", and \"prop_seghead\". The \"IntVOS\" function returns two dictionaries after performing some operations. The \"extract_feature\" method extracts features from input image using feature extracter and semantic embedding. 
The \"prop_seghead\" method takes various inputs, including frame embeddings, scribble label, and mask, and performs propagation segmentation head task with optional normalization and local map usage.", + "type": "comment" + }, + "582": { + "file_id": 55, + "content": " global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None,\n dynamic_seghead=None,\n ):\n \"\"\"return: feature_embedding,global_match_map,local_match_map,previous_frame_mask\"\"\"\n ###############\n cfg = self.cfg\n global_map_tmp_dic = global_map_tmp_dic\n dic_tmp = {}\n bs, c, h, w = current_frame_embedding.shape\n if cfg.get('test_mode'):\n scale_ref_scribble_label = float_(ref_scribble_label)\n else:\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n scale_previous_frame_label = paddle.nn.functional.interpolate(\n float_(previous_frame_mask), size=(h, w), mode='nearest')\n scale_previous_frame_label = int_(scale_previous_frame_label)\n for n in range(bs):\n seq_current_frame_embedding = current_frame_embedding[n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:623-646" + }, + "583": { + "file_id": 55, + "content": "This function takes in various parameters and returns feature_embedding, global_match_map, local_match_map, and previous_frame_mask. It initializes global_map_tmp_dic, dic_tmp, bs, c, h, w from current_frame_embedding, checks if it is in test mode, scales ref_scribble_label and previous_frame_mask using interpolation for matching dimensions, and then iterates through a range of bs, performing operations on seq_current_frame_embedding.", + "type": "comment" + }, + "584": { + "file_id": 55, + "content": " seq_ref_frame_embedding = ref_frame_embedding[n]\n seq_prev_frame_embedding = previous_frame_embedding[n]\n seq_ref_frame_embedding = seq_ref_frame_embedding.transpose(\n [1, 2, 0])\n seq_current_frame_embedding = seq_current_frame_embedding.transpose(\n [1, 2, 0])\n seq_ref_scribble_label = scale_ref_scribble_label[n].transpose(\n [1, 2, 0])\n #########Global Map\n nn_features_n, ref_obj_ids = nearest_neighbor_features_per_object(\n reference_embeddings=seq_ref_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_ref_scribble_label,\n k_nearest_neighbors=k_nearest_neighbors,\n gt_ids=gt_ids[n],\n n_chunks=10)\n if normalize_nearest_neighbor_distances:\n nn_features_n = (paddle.nn.functional.sigmoid(nn_features_n) -\n 0.5) * 2", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:647-665" + }, + "585": { + "file_id": 55, + "content": "This code calculates nearest neighbor features for each object using reference and current frame embeddings, and scribble labels. It transposes the embeddings and label to match the global map format and uses k-nearest neighbors to find the corresponding features. 
If normalization is enabled, it applies a sigmoid function to normalize the distances.", + "type": "comment" + }, + "586": { + "file_id": 55, + "content": " # print(nn_features_n)\n ###\n if global_map_tmp_dic is not None: ###when testing, use global map memory\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([1000, 1, 1, 1, 1])\n nn_features_n = paddle.where(\n nn_features_n <= global_map_tmp_dic[seq_names[n]][\n frame_num[n]].unsqueeze(0), nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(\n 0))\n # print('detach 1')\n # print(nn_features_n.shape)\n # nn_features_n = nn_features_n.detach()\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n.detach()[0]\n #########################Local dist map\n seq_prev_frame_embedding = seq_prev_frame_embedding.transpose(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:667-687" + }, + "587": { + "file_id": 55, + "content": "This code section checks if a sequence name exists in the global map dictionary, and if not, creates an entry for it. It then compares the current frame's features to the corresponding value in the global map for that sequence. If the current frame's features are less than or equal to the stored value, they remain unchanged; otherwise, they get updated with the stored value. Finally, it updates the global map entry with the new frame's features.", + "type": "comment" + }, + "588": { + "file_id": 55, + "content": " [1, 2, 0])\n seq_previous_frame_label = scale_previous_frame_label[n].transpose(\n [1, 2, 0])\n if use_local_map:\n prev_frame_nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_prev_frame_embedding,\n query_embedding=seq_current_frame_embedding,\n prev_frame_labels=seq_previous_frame_label,\n gt_ids=ref_obj_ids,\n max_distance=cfg['model_max_local_distance'])\n else:\n prev_frame_nn_features_n, _ = nearest_neighbor_features_per_object(\n reference_embeddings=seq_prev_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_previous_frame_label,\n k_nearest_neighbors=k_nearest_neighbors,\n gt_ids=gt_ids[n],\n n_chunks=20)\n prev_frame_nn_features_n = (", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:688-707" + }, + "589": { + "file_id": 55, + "content": "This code is finding the nearest neighbor features for the previous frame's embedding, based on whether local mapping is used or not. If local mapping is used, it calls a separate function `local_previous_frame_nearest_neighbor_features_per_object` to get the features and labels. Otherwise, it uses the `nearest_neighbor_features_per_object` function with specified parameters to find the nearest neighbors. 
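The global-map memory update summarised here amounts to an element-wise minimum of the new match map against the stored one, implemented with paddle.where; a toy sketch:

import paddle

stored = paddle.to_tensor([0.3, 0.9, 0.5])            # previously stored distances
new = paddle.to_tensor([0.4, 0.2, 0.5])               # distances from the current pass
updated = paddle.where(new <= stored, new, stored)    # element-wise minimum -> [0.3, 0.2, 0.5]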
The resulting features are stored in `prev_frame_nn_features_n`.", + "type": "comment" + }, + "590": { + "file_id": 55, + "content": " paddle.nn.functional.sigmoid(prev_frame_nn_features_n) -\n 0.5) * 2\n # print(prev_frame_nn_features_n.mean().item(), prev_frame_nn_features_n.shape, interaction_num) # o\n #############\n if local_map_dics is not None: ##When testing, use local map memory\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if seq_names[n] not in local_map_dist_dic:\n print(seq_names[n], 'not in local_map_dist_dic')\n local_map_dist_dic[seq_names[n]] = paddle.zeros(1000, 9)\n if seq_names[n] not in local_map_tmp_dic:\n print(seq_names[n], 'not in local_map_tmp_dic')\n local_map_tmp_dic[seq_names[n]] = paddle.zeros_like(\n prev_frame_nn_features_n).unsqueeze(0).tile(\n [1000, 9, 1, 1, 1, 1])\n # print(local_map_dist_dic[seq_names[n]].shape)\n # print('detach 2')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:708-724" + }, + "591": { + "file_id": 55, + "content": "This code is checking if the local map dictionaries are not None, indicating testing with local map memory. If a specific sequence name isn't in the local map distance dictionary or temporary map dictionary, it prints an error message and creates a new zero tensor to store the data.", + "type": "comment" + }, + "592": { + "file_id": 55, + "content": " # prev_frame_nn_features_n = prev_frame_nn_features_n.detach()\n local_map_dist_dic[seq_names[n]][\n frame_num[n], interaction_num -\n 1] = 1.0 / (abs(frame_num[n] - start_annotated_frame)\n ) # bugs fixed.\n local_map_tmp_dic[seq_names[n]][\n frame_num[n],\n interaction_num - 1] = prev_frame_nn_features_n.squeeze(\n 0).detach() # bugs fixed.\n if interaction_num == 1:\n prev_frame_nn_features_n = local_map_tmp_dic[seq_names[n]][\n frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n else:\n if local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 1] > \\\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 2]:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:725-741" + }, + "593": { + "file_id": 55, + "content": "This code segment appears to be part of a larger function that processes video frames and interactions. It stores the distance from the current frame to the first annotated frame in the local_map_dist_dic dictionary, as well as the corresponding previous frame features in the local_map_tmp_dic. 
The code also updates the value of prev_frame_nn_features_n based on certain conditions involving interaction numbers and distances between frames.", + "type": "comment" + }, + "594": { + "file_id": 55, + "content": " prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n else:\n prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 2]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)\n to_cat_previous_frame = (\n float_(seq_previous_frame_label) == float_(ref_obj_ids)\n ) # float comparision?\n to_cat_current_frame_embedding = current_frame_embedding[\n n].unsqueeze(0).tile((ref_obj_ids.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_previous_frame = float_(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:742-763" + }, + "595": { + "file_id": 55, + "content": "This code appears to be part of a video modeling process. It seems to involve local map dictionaries and interaction numbers, comparing previous frames with current ones for float comparisons, unsqueezing and reshaping features and labels, and potentially using these operations in some video modeling or analysis task.", + "type": "comment" + }, + "596": { + "file_id": 55, + "content": " to_cat_previous_frame.unsqueeze(-1).transpose([2, 3, 0, 1]))\n to_cat_prev_frame_nn_feature_n = prev_frame_nn_features_n.squeeze(\n 0).transpose([2, 3, 0, 1])\n to_cat = paddle.concat(\n (to_cat_current_frame_embedding, to_cat_nn_feature_n,\n to_cat_prev_frame_nn_feature_n, to_cat_previous_frame), 1)\n pred_ = dynamic_seghead(to_cat)\n pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if global_map_tmp_dic is None:\n return dic_tmp\n else:\n if local_map_dics is None:\n return dic_tmp, global_map_tmp_dic\n else:\n return dic_tmp, global_map_tmp_dic, local_map_dics\n def int_seghead(self,\n ref_frame_embedding=None,\n ref_scribble_label=None,\n prev_round_label=None,\n normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic=None,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:764-787" + }, + "597": { + "file_id": 55, + "content": "This code is defining a function \"int_seghead\" that takes in various inputs and returns output dictionaries. It concatenates embeddings and features, passes them to the dynamic_seghead function, transposes the result, and stores it in a dictionary. 
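The concatenation feeding the segmentation head, as described above, stacks the per-object inputs along the channel axis; a sketch with assumed toy shapes (illustrative only):

import paddle

emb   = paddle.rand([3, 100, 30, 52])    # current-frame embedding tiled per object
g_map = paddle.rand([3, 1, 30, 52])      # global match map
l_map = paddle.rand([3, 1, 30, 52])      # local match map
prev  = paddle.rand([3, 1, 30, 52])      # previous-frame mask channel
head_in = paddle.concat([emb, g_map, l_map, prev], axis=1)   # [3, 103, 30, 52]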
If global_map_tmp_dic is not None, the function also returns other dictionaries.", + "type": "comment" + }, + "598": { + "file_id": 55, + "content": " local_map_dics=None,\n interaction_num=None,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n frame_num=None,\n first_inter=True):\n dic_tmp = {}\n bs, c, h, w = ref_frame_embedding.shape\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n if not first_inter:\n scale_prev_round_label = paddle.nn.functional.interpolate(\n float_(prev_round_label), size=(h, w), mode='nearest')\n scale_prev_round_label = int_(scale_prev_round_label)\n n_chunks = 500\n for n in range(bs):\n gt_id = paddle.arange(0, gt_ids[n] + 1)\n gt_id = int_(gt_id)\n seq_ref_frame_embedding = ref_frame_embedding[n]\n ########################Local dist map", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:788-813" + }, + "599": { + "file_id": 55, + "content": "This code snippet calculates the local distance map for each frame in the batch and possibly a previous round if it's not the first interaction. The function takes in various parameters such as ref_frame_embedding, prev_round_label, gt_ids, etc., and performs interpolation to resize the reference scribble label and previous round label. It then iterates over each frame in the batch, creating a gt_id array, and calculating the local distance map for the current frame's embedding. This process may involve interpolation and integer conversion of the resized labels.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/50.json b/docs/data/50.json new file mode 100644 index 000000000..ca44518a0 --- /dev/null +++ b/docs/data/50.json @@ -0,0 +1,546 @@ +{ + "5000": { + "file_id": 430, + "content": " \"\"\"get the size of the dataset.\"\"\"\n if self.num_samples_precise_bn is None:\n return len(self.info)\n else:\n random.shuffle(self.info)\n return min(self.num_samples_precise_bn, len(self.info))", + "type": "code", + "location": "/paddlevideo/loader/dataset/slowfast_video.py:138-143" + }, + "5001": { + "file_id": 430, + "content": "This code calculates the size of the dataset. If num_samples_precise_bn is None, it returns the length of self.info. Otherwise, shuffles self.info and returns the minimum value between num_samples_precise_bn and the length of self.info.", + "type": "comment" + }, + "5002": { + "file_id": 431, + "content": "/paddlevideo/loader/dataset/ucf101_skeleton.py", + "type": "filepath" + }, + "5003": { + "file_id": 431, + "content": "This code defines a Python class for the UCF101 Skeleton Dataset in PaddleVideo, loading skeleton features and normalizing data for action recognition tasks. The dataset includes train and test methods for preparing frames with `prepare_train` and `prepare_test` functions.", + "type": "summary" + }, + "5004": { + "file_id": 431, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport pickle\nimport paddle\nfrom paddle.io import Dataset\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass UCF101SkeletonDataset(BaseDataset):\n \"\"\"\n Skeleton dataset for action recognition.\n The dataset loads skeleton feature, and apply norm operatations.", + "type": "code", + "location": "/paddlevideo/loader/dataset/ucf101_skeleton.py:1-35" + }, + "5005": { + "file_id": 431, + "content": "This code snippet is a Python class for UCF101 Skeleton Dataset in PaddleVideo. It loads skeleton features and applies normalization operations, registering the dataset for action recognition tasks.", + "type": "comment" + }, + "5006": { + "file_id": 431, + "content": " Args:\n file_path (str): Path to the index file.\n pipeline(obj): Define the pipeline of data preprocessing.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n split,\n repeat_times,\n test_mode=False):\n self.split = split\n self.repeat_times = repeat_times\n super().__init__(file_path, pipeline, test_mode=test_mode)\n self._ori_len = len(self.info)\n self.start_index = 0\n self.modality = \"Pose\"\n def load_file(self):\n \"\"\"Load annotation file to get video information.\"\"\"\n assert self.file_path.endswith('.pkl')\n return self.load_pkl_annotations()\n def load_pkl_annotations(self):\n with open(self.file_path, \"rb\") as f:\n data = pickle.load(f)\n if self.split:\n split, data = data['split'], data['annotations']\n identifier = 'filename' if 'filename' in data[0] else 'frame_dir'", + "type": "code", + "location": "/paddlevideo/loader/dataset/ucf101_skeleton.py:36-66" + }, + "5007": { + "file_id": 431, + "content": "This code defines a class that loads annotation data from a file, specifically for the UCF101 dataset's skeleton information. It takes arguments such as the file path, pipeline object, and whether it's building a test dataset. The load_file method checks if the file is a .pkl file and calls load_pkl_annotations to get video information. 
If the split argument is provided, it only uses the specified part of the data.", + "type": "comment" + }, + "5008": { + "file_id": 431, + "content": " data = [x for x in data if x[identifier] in split[self.split]]\n return data\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training given the index.\"\"\"\n results = copy.deepcopy(self.info[idx % self._ori_len])\n results['modality'] = self.modality\n results['start_index'] = self.start_index\n return self.pipeline(results)\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for testing given the index.\"\"\"\n results = copy.deepcopy(self.info[idx % self._ori_len])\n results['modality'] = self.modality\n results['start_index'] = self.start_index\n return self.pipeline(results)\n def __len__(self):\n \"\"\"get the size of the dataset.\"\"\"\n return len(self.info) * self.repeat_times", + "type": "code", + "location": "/paddlevideo/loader/dataset/ucf101_skeleton.py:67-89" + }, + "5009": { + "file_id": 431, + "content": "This code defines a dataset for PaddleVideo, containing train and test methods for preparing frames. The `prepare_train` and `prepare_test` functions create new results by copying the original information, setting modality and start index based on the given index. The `__len__` function returns the size of the dataset by multiplying the number of info items with repeat times.", + "type": "comment" + }, + "5010": { + "file_id": 432, + "content": "/paddlevideo/loader/dataset/ucf24_dataset.py", + "type": "filepath" + }, + "5011": { + "file_id": 432, + "content": "The Python code defines a \"Ucf24Dataset\" class for loading and transforming UCF24 dataset in PaddleVideo, with methods to prepare data for training/validation and testing. It extracts relevant information like image paths, labels, and frame indices, and converts image path names from 'jpg' to 'txt'.", + "type": "summary" + }, + "5012": { + "file_id": 432, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass UCF24Dataset(BaseDataset):\n \"\"\"Dataset for YOWO\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates\n a sample video with the filepath and label, which are split with a whitesapce.", + "type": "code", + "location": "/paddlevideo/loader/dataset/ucf24_dataset.py:1-30" + }, + "5013": { + "file_id": 432, + "content": "This code is a Python class for the UCF24 dataset used in PaddleVideo, which loads raw videos and applies specified transformations on them. It is registered within the registry module and utilizes other modules such as BaseDataset and gets logger from utils. 
The license information and import statements are also included.", + "type": "comment" + }, + "5014": { + "file_id": 432, + "content": " Example of a inde file:\n .. code-block:: txt\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, **kwargs):\n self.num_retries = num_retries\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n lines = fin.readlines()\n for line in lines:\n line = line.strip() # 'data/ucf24/labels/class_name/video_name/key_frame.txt'\n filename = line.replace('txt', 'jpg').replace(\n 'labels', 'rgb-images') # key frame path\n info.append(dict(filename=filename))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])", + "type": "code", + "location": "/paddlevideo/loader/dataset/ucf24_dataset.py:31-59" + }, + "5015": { + "file_id": 432, + "content": "This code defines a dataset class, \"Ucf24Dataset\", which loads video information from an index file and prepares data for training or validation. It takes a file path, pipeline, and additional keyword arguments. The load_file method reads the index file to extract video information, such as filenames, while the prepare_train method prepares data for training/validation given an index.", + "type": "comment" + }, + "5016": { + "file_id": 432, + "content": " results = self.pipeline(results)\n im_path = results['filename']\n im_path = im_path.replace('jpg', 'txt')\n im_split = im_path.split('/')\n frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]\n return results['imgs'], np.array([results['labels']]), frame_index\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for test given the index.\"\"\"\n # Try to catch Exception caused by reading corrupted video file\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n im_path = results['filename']\n im_path = im_path.replace('jpg', 'txt')\n im_split = im_path.split('/')\n frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]\n return results['imgs'], np.array([results['labels']]), frame_index", + "type": "code", + "location": "/paddlevideo/loader/dataset/ucf24_dataset.py:60-76" + }, + "5017": { + "file_id": 432, + "content": "Code from \"PaddleVideo/paddlevideo/loader/dataset/ucf24_dataset.py\" prepares data for testing by copying the info at index idx, applying a pipeline function to it and extracting relevant information like image paths and labels. The code also converts image path names from 'jpg' to 'txt'. Finally, it returns images, labels, and frame indices.", + "type": "comment" + }, + "5018": { + "file_id": 433, + "content": "/paddlevideo/loader/dataset/video.py", + "type": "filepath" + }, + "5019": { + "file_id": 433, + "content": "VideoDataset is a subclass of BaseDataset that loads and processes raw videos, using an index file containing video information. It handles corrupted files with retries and error logging. The `prepare_train` and `prepare_test` methods return image data and labels for training and testing respectively.", + "type": "summary" + }, + "5020": { + "file_id": 433, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass VideoDataset(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates", + "type": "code", + "location": "/paddlevideo/loader/dataset/video.py:1-31" + }, + "5021": { + "file_id": 433, + "content": "This code is for VideoDataset class, a subclass of BaseDataset, that loads raw videos and applies specified transforms. It uses index file with multiple lines where each line indicates information about videos in the dataset. The class is registered within the DATASETS registry and logger is initialized.", + "type": "comment" + }, + "5022": { + "file_id": 433, + "content": " a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n filename, labels = line_split\n #TODO(hj): Required suffix format: may mp4/avi/wmv\n filename = filename + self.suffix", + "type": "code", + "location": "/paddlevideo/loader/dataset/video.py:32-57" + }, + "5023": { + "file_id": 433, + "content": "This code initializes a new class for loading index file data, which contains video information. The index file has path and label entries separated by whitespace. The load_file method reads the index file to retrieve filename and labels for each video.", + "type": "comment" + }, + "5024": { + "file_id": 433, + "content": " if self.data_prefix is not None:\n filename = osp.join(self.data_prefix, filename)\n info.append(dict(filename=filename, labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. 
Prepare the data for training/valid given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):", + "type": "code", + "location": "/paddlevideo/loader/dataset/video.py:58-80" + }, + "5025": { + "file_id": 433, + "content": "This code defines a class with methods to prepare data for training and testing. It handles potential corrupted video files by retrying if an exception occurs, and logs the error message along with the number of retries. The `prepare_train` method returns image data (`imgs`) and corresponding labels from the given index in the dataset. Similarly, the `prepare_test` method returns image data and labels for testing.", + "type": "comment" + }, + "5026": { + "file_id": 433, + "content": " \"\"\"TEST. Prepare the data for test given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])", + "type": "code", + "location": "/paddlevideo/loader/dataset/video.py:81-95" + }, + "5027": { + "file_id": 433, + "content": "This code attempts to load a video file and prepare the data for testing. It handles potential exceptions caused by corrupted files by retrying multiple times. If an exception occurs, it logs an error message and retries with another randomly selected file index. The function returns the images and labels from the successfully loaded video file.", + "type": "comment" + }, + "5028": { + "file_id": 434, + "content": "/paddlevideo/loader/pipelines/__init__.py", + "type": "filepath" + }, + "5029": { + "file_id": 434, + "content": "This file imports necessary modules for PaddleVideo data preprocessing and model training/testing, including annotations, video labels, augmentation, decoding, mixing, segmentation, and sampling. It also defines a list of pipeline components for custom video processing pipelines.", + "type": "summary" + }, + "5030": { + "file_id": 434, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat\nfrom .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip,\n GroupResize, Image2Array, JitterScale, MultiCrop,\n Normalization, PackOutput, RandomCrop, RandomFlip,\n RandomResizedCrop, Scale, TenCrop, ToArray,\n UniformCrop, RandomGamma, MultiCenterCrop,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/__init__.py:1-20" + }, + "5031": { + "file_id": 434, + "content": "This file contains the initialization and imports from different pipeline classes in PaddleVideo. It includes functions for loading annotations, getting video labels, and various augmentation techniques. The license and copyright information are also present.", + "type": "comment" + }, + "5032": { + "file_id": 434, + "content": " RandomBrightness, RandomHue, RandomSaturation, YowoAug)\nfrom .augmentations_ava import *\nfrom .compose import Compose\nfrom .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder\nfrom .decode_image import ImageDecoder\nfrom .decode_sampler import DecodeSampler\nfrom .mix import Cutmix, Mixup, VideoMix\nfrom .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize\nfrom .sample import Sampler, SamplerPkl\nfrom .sample_ava import *\nfrom .segmentation import MultiNorm, MultiRestrictSize\nfrom .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm\nfrom .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation\nfrom .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact,\n RandomResizedCrop_V2, Flip_V2, CenterCrop_V2,\n GeneratePoseTarget, FormatShape, Collect)\nfrom .decode_sampler_MRI import SFMRI_DecodeSampler\nfrom .segmentation_pipline import SegmentationSampler", + "type": "code", + "location": "/paddlevideo/loader/pipelines/__init__.py:21-38" + }, + "5033": { + "file_id": 434, + "content": "The code imports various classes and functions for different image, video, and skeleton-related pipelines. It includes modules for augmentations, decoding, mixing, segmentation, sampling, and more, used in the PaddleVideo library. 
These pipelines are used to preprocess, decode, and sample data for training and testing models.", + "type": "comment" + }, + "5034": { + "file_id": 434, + "content": "from .sample_ucf24 import SamplerUCF24\n__all__ = [\n 'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize',\n 'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose',\n 'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale',\n 'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput',\n 'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop',\n 'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix',\n 'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap',\n 'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize',\n 'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler',\n 'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation',\n 'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue',\n 'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact',\n 'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'GeneratePoseTarget',\n 'FormatShape', 'Collect', 'RandomSaturation', 'SamplerUCF24', 'YowoAug'", + "type": "code", + "location": "/paddlevideo/loader/pipelines/__init__.py:39-55" + }, + "5035": { + "file_id": 434, + "content": "This code imports the \"SamplerUCF24\" class from the \"sample_ucf24\" module and defines the list of available pipeline components for PaddleVideo, including image and feature decoders, transforms, samplers, and more. These components can be used to build custom video processing pipelines for various tasks.", + "type": "comment" + }, + "5036": { + "file_id": 434, + "content": "]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/__init__.py:56-56" + }, + "5037": { + "file_id": 434, + "content": "This chunk contains only the closing bracket of the __all__ list defined in the previous snippet; it introduces no additional operations or definitions.", + "type": "comment" + }, + "5038": { + "file_id": 435, + "content": "/paddlevideo/loader/pipelines/anet_pipeline.py", + "type": "filepath" + }, + "5039": { + "file_id": 435, + "content": "PaddleVideo library enables feature extraction and map creation, while GetVideoLabel class calculates IoU for object detection tasks. The code stores max IoU values and prepares data for evaluation or processing.", + "type": "summary" + }, + "5040": { + "file_id": 435, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport numpy as np\nfrom ..registry import PIPELINES\n\"\"\"pipeline ops for Activity Net.\n\"\"\"\n@PIPELINES.register()\nclass LoadFeat(object):\n def __init__(self, feat_path):\n self.feat_path = feat_path\n def __call__(self, results):\n video_name = results['video_name']\n file_name = video_name + \".npy\"\n file_path = os.path.join(self.feat_path, file_name)\n #TODO: check path\n video_feat = np.load(file_path)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/anet_pipeline.py:1-32" + }, + "5041": { + "file_id": 435, + "content": "This code is part of PaddleVideo library, specifically for loading feature data from a given path. It defines a class \"LoadFeat\" and uses the numpy library to load .npy files based on the video name provided in the results dictionary. The file path is constructed using the specified feat_path and video name.", + "type": "comment" + }, + "5042": { + "file_id": 435, + "content": " video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n results['video_feat'] = video_feat\n return results\n@PIPELINES.register()\nclass GetMatchMap(object):\n def __init__(self, tscale):\n self.tscale = tscale\n self.tgap = 1. / self.tscale\n def __call__(self, results):\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx\n for jdx in range(1, self.tscale + 1):\n xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)\n match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n results['match_map'] = match_map\n results['anchor_xmin'] = anchor_xmin", + "type": "code", + "location": "/paddlevideo/loader/pipelines/anet_pipeline.py:33-62" + }, + "5043": { + "file_id": 435, + "content": "This code defines a pipeline function that generates matching maps for an input video. It creates temporal matching windows of varying sizes and reshapes the result into a specific format. The anchor positions are also extracted for later use.", + "type": "comment" + }, + "5044": { + "file_id": 435, + "content": " results['anchor_xmax'] = anchor_xmax\n return results\n@PIPELINES.register()\nclass GetVideoLabel(object):\n def __init__(self, tscale, dscale, datatype=\"float32\"):\n self.tscale = tscale\n self.dscale = dscale\n self.tgap = 1. 
/ self.tscale\n self.datatype = datatype\n def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max):\n \"\"\"Compute jaccard score between a box and the anchors.\n \"\"\"\n len_anchors = anchors_max - anchors_min\n int_xmin = np.maximum(anchors_min, box_min)\n int_xmax = np.minimum(anchors_max, box_max)\n inter_len = np.maximum(int_xmax - int_xmin, 0.)\n union_len = len_anchors - inter_len + box_max - box_min\n jaccard = np.divide(inter_len, union_len)\n return jaccard\n def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max):\n \"\"\"Compute intersection between score a box and the anchors.\n \"\"\"\n len_anchors = anchors_max - anchors_min\n int_xmin = np.maximum(anchors_min, box_min)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/anet_pipeline.py:63-90" + }, + "5045": { + "file_id": 435, + "content": "This code defines a class called \"GetVideoLabel\" which calculates the Intersection over Union (IOU) and intersection scores between a box and the anchors. It also initializes variables for time and distance scaling, and box type data types. The \"iou_with_anchors\" method calculates the Jaccard score and the \"ioa_with_anchors\" method computes the intersection. These methods can be used to determine the best match between an anchor box and a target box in object detection tasks.", + "type": "comment" + }, + "5046": { + "file_id": 435, + "content": " int_xmax = np.minimum(anchors_max, box_max)\n inter_len = np.maximum(int_xmax - int_xmin, 0.)\n scores = np.divide(inter_len, len_anchors)\n return scores\n def __call__(self, results):\n video_info = results['video_info']\n match_map = results['match_map']\n anchor_xmin = results['anchor_xmin']\n anchor_xmax = results['anchor_xmax']\n video_second = video_info['duration_second']\n video_labels = video_info['annotations']\n gt_bbox = []\n gt_iou_map = []\n for gt in video_labels:\n tmp_start = max(min(1, gt[\"segment\"][0] / video_second), 0)\n tmp_end = max(min(1, gt[\"segment\"][1] / video_second), 0)\n gt_bbox.append([tmp_start, tmp_end])\n tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0],\n match_map[:, 1], tmp_start,\n tmp_end)\n tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,\n [self.dscale, self.tscale])", + "type": "code", + "location": "/paddlevideo/loader/pipelines/anet_pipeline.py:91-115" + }, + "5047": { + "file_id": 435, + "content": "The function initializes gt_bbox and gt_iou_map variables to store ground truth bounding box coordinates and their IoU with anchor boxes. It then iterates through video labels, calculating the start and end timestamps in video seconds for each ground truth box. 
The IoU between match map and the current ground truth is computed using the iou_with_anchors function and stored in gt_iou_map, reshaped to match the dimensions of dscale and tscale.", + "type": "comment" + }, + "5048": { + "file_id": 435, + "content": " gt_iou_map.append(tmp_gt_iou_map)\n gt_iou_map = np.array(gt_iou_map)\n gt_iou_map = np.max(gt_iou_map, axis=0)\n gt_bbox = np.array(gt_bbox)\n gt_xmins = gt_bbox[:, 0]\n gt_xmaxs = gt_bbox[:, 1]\n gt_len_small = 3 * self.tgap\n gt_start_bboxs = np.stack(\n (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)\n gt_end_bboxs = np.stack(\n (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)\n match_score_start = []\n for jdx in range(len(anchor_xmin)):\n match_score_start.append(\n np.max(\n self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],\n gt_start_bboxs[:, 0],\n gt_start_bboxs[:, 1])))\n match_score_end = []\n for jdx in range(len(anchor_xmin)):\n match_score_end.append(\n np.max(\n self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],", + "type": "code", + "location": "/paddlevideo/loader/pipelines/anet_pipeline.py:116-140" + }, + "5049": { + "file_id": 435, + "content": "This code calculates the intersection over union (IoU) between ground truth bounding boxes and anchor boxes. It stores the maximum IoU values for each ground truth box and anchor pair, then calculates the maximum IoU values for start and end positions of anchor boxes. This information will be used to determine if a prediction matches with a ground truth box and assign appropriate scores.", + "type": "comment" + }, + "5050": { + "file_id": 435, + "content": " gt_end_bboxs[:, 0], gt_end_bboxs[:,\n 1])))\n gt_start = np.array(match_score_start)\n gt_end = np.array(match_score_end)\n results['gt_iou_map'] = gt_iou_map.astype(self.datatype)\n results['gt_start'] = gt_start.astype(self.datatype)\n results['gt_end'] = gt_end.astype(self.datatype)\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/anet_pipeline.py:141-150" + }, + "5051": { + "file_id": 435, + "content": "This code is storing ground truth (gt) IOU map, start and end indices for the annotations into the 'results' dictionary. The IOU map is converted to specified datatype before storage. These values will be used later for evaluation or further processing.", + "type": "comment" + }, + "5052": { + "file_id": 436, + "content": "/paddlevideo/loader/pipelines/augmentations.py", + "type": "filepath" + }, + "5053": { + "file_id": 436, + "content": "This code enhances PaddleVideo's loader with resize operation and augmentation pipeline, enabling diverse data preprocessing. It calculates crop offsets and performs object detection image augmentation using uniform sampling, resizing, and flipping techniques, resizes images, scales by 255.0, concatenates frames, transposes array dimensions, stores results in 'results', and returns arrays.", + "type": "summary" + }, + "5054": { + "file_id": 436, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport random\nfrom collections.abc import Sequence\nimport cv2\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nfrom PIL import Image\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass Scale(object):\n \"\"\"\n Scale images.\n Args:\n short_size(float | int): Short size of an image will be scaled to the short_size.\n fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1-34" + }, + "5055": { + "file_id": 436, + "content": "This code is from the PaddleVideo library, specifically the loader module's augmentations pipeline. It defines a Scale class that scales images based on their short side to the given short_size parameter. The fixed_ratio parameter determines whether or not the image should be resized while maintaining its aspect ratio. This class is then registered in PIPELINES for later use.", + "type": "comment" + }, + "5056": { + "file_id": 436, + "content": " do_round(bool): Whether to round up when calculating the zoom ratio. default: False\n backend(str): Choose pillow or cv2 as the graphics processing backend. default: 'pillow'\n \"\"\"\n def __init__(self,\n short_size,\n fixed_ratio=True,\n keep_ratio=None,\n do_round=False,\n backend='pillow'):\n self.short_size = short_size\n assert (fixed_ratio and not keep_ratio) or (not fixed_ratio), \\\n f\"fixed_ratio and keep_ratio cannot be true at the same time\"\n self.fixed_ratio = fixed_ratio\n self.keep_ratio = keep_ratio\n self.do_round = do_round\n assert backend in [\n 'pillow', 'cv2'\n ], f\"Scale's backend must be pillow or cv2, but get {backend}\"\n self.backend = backend\n def __call__(self, results):\n \"\"\"\n Performs resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:35-61" + }, + "5057": { + "file_id": 436, + "content": "The code defines a class for resize operations. It takes parameters for short size, fixed ratio (defaults to True), keep ratio, do_round (default False), and backend (default 'pillow'). The class checks if fixed_ratio and keep_ratio can't be true at the same time. It also ensures the backend is either 'pillow' or 'cv2'. 
The __call__ method performs resize operations on images, taking a Sequence of PIL.Image as input.", + "type": "comment" + }, + "5058": { + "file_id": 436, + "content": " return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n imgs = results['imgs']\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n if isinstance(img, np.ndarray):\n h, w, _ = img.shape\n elif isinstance(img, Image.Image):\n w, h = img.size\n else:\n raise NotImplementedError\n if (w <= h and w == self.short_size) or (h <= w\n and h == self.short_size):\n if self.backend == 'pillow' and not isinstance(\n img, Image.Image):\n img = Image.fromarray(img)\n resized_imgs.append(img)\n continue\n if w <= h:\n ow = self.short_size\n if self.fixed_ratio:\n oh = int(self.short_size * 4.0 / 3.0)\n elif self.keep_ratio is False:\n oh = self.short_size", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:62-88" + }, + "5059": { + "file_id": 436, + "content": "This code is responsible for resizing images to a specified size in a PaddleVideo pipeline. It iterates through each image, checks the aspect ratio, and resizes them accordingly before appending to the resized_imgs list. If the image is already the correct size, it is directly added to the list without further processing.", + "type": "comment" + }, + "5060": { + "file_id": 436, + "content": " else:\n scale_factor = self.short_size / w\n oh = int(h * float(scale_factor) +\n 0.5) if self.do_round else int(h *\n self.short_size / w)\n ow = int(w * float(scale_factor) +\n 0.5) if self.do_round else self.short_size\n else:\n oh = self.short_size\n if self.fixed_ratio:\n ow = int(self.short_size * 4.0 / 3.0)\n elif self.keep_ratio is False:\n ow = self.short_size\n else:\n scale_factor = self.short_size / h\n oh = int(h * float(scale_factor) +\n 0.5) if self.do_round else self.short_size\n ow = int(w * float(scale_factor) +\n 0.5) if self.do_round else int(w *\n self.short_size / h)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:89-108" + }, + "5061": { + "file_id": 436, + "content": "This code calculates the output image size for resizing and maintains aspect ratio if specified. It uses scale_factor to calculate the output height (oh) and width (ow), considering do_round, fixed_ratio, keep_ratio flags and short_size.", + "type": "comment" + }, + "5062": { + "file_id": 436, + "content": " if self.backend == 'pillow':\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n elif self.backend == 'cv2' and (self.keep_ratio is not None):\n resized_imgs.append(\n cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR))\n else:\n resized_imgs.append(\n Image.fromarray(\n cv2.resize(np.asarray(img), (ow, oh),\n interpolation=cv2.INTER_LINEAR)))\n results['imgs'] = resized_imgs\n return results\n@PIPELINES.register()\nclass RandomCrop(object):\n \"\"\"\n Random crop images.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:109-138" + }, + "5063": { + "file_id": 436, + "content": "This code defines an augmentation pipeline for image processing. 
It resizes images using different backends based on the backend specified and whether the ratio should be preserved or not. The results are then returned as a dictionary with 'imgs' key containing the resized images. Additionally, there is a RandomCrop class which performs random crop operations on images of the specified target size.", + "type": "comment" + }, + "5064": { + "file_id": 436, + "content": " return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n h, w = imgs.shape[2:]\n else:\n w, h = imgs[0].size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(\n w, h, self.target_size)\n crop_images = []\n if 'backend' in results and results['backend'] == 'pyav':\n x1 = np.random.randint(0, w - tw)\n y1 = np.random.randint(0, h - th)\n crop_images = imgs[:, :, y1:y1 + th, x1:x1 + tw] # [C, T, th, tw]\n else:\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)\n for img in imgs:\n if w == tw and h == th:\n crop_images.append(img)\n else:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:139-164" + }, + "5065": { + "file_id": 436, + "content": "This code is a part of PaddleVideo's augmentations.py, which applies random cropping to images. It checks if the backend used is 'pyav', and if so, extracts the image dimensions. If not, it gets the image size from the first image in the list. Then, it asserts that the image dimensions are larger than the target size. Finally, it generates a random crop position and crops each image in the list using these positions. The cropped images are stored in the 'crop_images' list which is returned at the end.", + "type": "comment" + }, + "5066": { + "file_id": 436, + "content": " crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = crop_images\n return results\n@PIPELINES.register()\nclass RandomResizedCrop(RandomCrop):\n def __init__(self,\n area_range=(0.08, 1.0),\n aspect_ratio_range=(3 / 4, 4 / 3),\n target_size=224,\n backend='cv2'):\n self.area_range = area_range\n self.aspect_ratio_range = aspect_ratio_range\n self.target_size = target_size\n self.backend = backend\n @staticmethod\n def get_crop_bbox(img_shape,\n area_range,\n aspect_ratio_range,\n max_attempts=10):\n assert 0 < area_range[0] <= area_range[1] <= 1\n assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1]\n img_h, img_w = img_shape\n area = img_h * img_w\n min_ar, max_ar = aspect_ratio_range\n aspect_ratios = np.exp(\n np.random.uniform(np.log(min_ar), np.log(max_ar),", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:165-197" + }, + "5067": { + "file_id": 436, + "content": "RandomResizedCrop is a pipeline that resizes and crops images randomly with specified area, aspect ratio range, target size, and backend. 
The method get_crop_bbox takes image shape, area and aspect ratio ranges as input and returns the crop bounding box within the specified range of area and aspect ratio.", + "type": "comment" + }, + "5068": { + "file_id": 436, + "content": " size=max_attempts))\n target_areas = np.random.uniform(*area_range, size=max_attempts) * area\n candidate_crop_w = np.round(np.sqrt(target_areas *\n aspect_ratios)).astype(np.int32)\n candidate_crop_h = np.round(np.sqrt(target_areas /\n aspect_ratios)).astype(np.int32)\n for i in range(max_attempts):\n crop_w = candidate_crop_w[i]\n crop_h = candidate_crop_h[i]\n if crop_h <= img_h and crop_w <= img_w:\n x_offset = random.randint(0, img_w - crop_w)\n y_offset = random.randint(0, img_h - crop_h)\n return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h\n # Fallback\n crop_size = min(img_h, img_w)\n x_offset = (img_w - crop_size) // 2\n y_offset = (img_h - crop_size) // 2\n return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size\n def __call__(self, results):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:198-219" + }, + "5069": { + "file_id": 436, + "content": "This function generates a random crop size based on the aspect ratios and target areas. It then iterates through candidate crop sizes, selecting one that fits within the image bounds. If no suitable crop is found, it falls back to centering a smaller crop. The function returns the offset coordinates and crop dimensions for the selected crop.", + "type": "comment" + }, + "5070": { + "file_id": 436, + "content": " imgs = results['imgs']\n if self.backend == 'pillow':\n img_w, img_h = imgs[0].size\n elif self.backend == 'cv2':\n img_h, img_w, _ = imgs[0].shape\n elif self.backend == 'pyav':\n img_h, img_w = imgs.shape[2:] # [cthw]\n else:\n raise NotImplementedError\n left, top, right, bottom = self.get_crop_bbox(\n (img_h, img_w), self.area_range, self.aspect_ratio_range)\n if self.backend == 'pillow':\n img_w, img_h = imgs[0].size\n imgs = [img.crop(left, top, right, bottom) for img in imgs]\n elif self.backend == 'cv2':\n img_h, img_w, _ = imgs[0].shape\n imgs = [img[top:bottom, left:right] for img in imgs]\n elif self.backend == 'pyav':\n img_h, img_w = imgs.shape[2:] # [cthw]\n imgs = imgs[:, :, top:bottom, left:right]\n else:\n raise NotImplementedError\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass CenterCrop(object):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:220-249" + }, + "5071": { + "file_id": 436, + "content": "This code is a part of PaddleVideo library and performs image cropping based on the specified backend. It first retrieves the image dimensions, then applies a crop box to each image according to the defined area range and aspect ratio range. The code handles different backends such as Pillow, OpenCV (cv2), and PyAV. If an unsupported backend is encountered, it raises a NotImplementedError.", + "type": "comment" + }, + "5072": { + "file_id": 436, + "content": " \"\"\"\n Center crop images.\n Args:\n target_size(int): Center crop a square with the target_size from an image.\n do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area. 
default: True\n \"\"\"\n def __init__(self, target_size, do_round=True, backend='pillow'):\n self.target_size = target_size\n self.do_round = do_round\n self.backend = backend\n def __call__(self, results):\n \"\"\"\n Performs Center crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n ccrop_imgs: List where each item is a PIL.Image after Center crop.\n \"\"\"\n imgs = results['imgs']\n ccrop_imgs = []\n th, tw = self.target_size, self.target_size\n if isinstance(imgs, paddle.Tensor):\n h, w = imgs.shape[-2:]\n x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2\n y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:250-276" + }, + "5073": { + "file_id": 436, + "content": "This code defines a class for center cropping images. The constructor takes the target size, whether to round the coordinates (True by default), and the backend (default is Pillow). The `__call__` method applies the center crop operation on a list of PIL Image objects, returning a new list with the cropped images.", + "type": "comment" + }, + "5074": { + "file_id": 436, + "content": " ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw]\n else:\n for img in imgs:\n if self.backend == 'pillow':\n w, h = img.size\n elif self.backend == 'cv2':\n h, w, _ = img.shape\n else:\n raise NotImplementedError\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(\n w, h, self.target_size)\n x1 = int(round(\n (w - tw) / 2.0)) if self.do_round else (w - tw) // 2\n y1 = int(round(\n (h - th) / 2.0)) if self.do_round else (h - th) // 2\n if self.backend == 'cv2':\n ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw])\n elif self.backend == 'pillow':\n ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = ccrop_imgs", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:277-297" + }, + "5075": { + "file_id": 436, + "content": "This function performs center crop on images based on the given target size. It first checks if the image dimensions are larger than the crop size, and then calculates the starting coordinates for cropping. If the backend is Pillow, it uses the crop() method to perform the cropping operation; if the backend is OpenCV (cv2), it slices the image array accordingly. 
The resulting cropped images are stored in 'ccrop_imgs' list and returned in the 'results' dictionary under the key 'imgs'.", + "type": "comment" + }, + "5076": { + "file_id": 436, + "content": " return results\n@PIPELINES.register()\nclass MultiScaleCrop(object):\n \"\"\"\n Random crop images in with multiscale sizes\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n scales(int): List of candidate cropping scales.\n max_distort(int): Maximum allowable deformation combination distance.\n fix_crop(int): Whether to fix the cutting start point.\n allow_duplication(int): Whether to allow duplicate candidate crop starting points.\n more_fix_crop(int): Whether to allow more cutting starting points.\n \"\"\"\n def __init__(\n self,\n target_size, # NOTE: named target size now, but still pass short size in it!\n scales=None,\n max_distort=1,\n fix_crop=True,\n allow_duplication=False,\n more_fix_crop=True,\n backend='pillow'):\n self.target_size = target_size\n self.scales = scales if scales else [1, .875, .75, .66]\n self.max_distort = max_distort", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:298-325" + }, + "5077": { + "file_id": 436, + "content": "The MultiScaleCrop class is a pipeline module that randomly crops images with multiple scales, targeting a specific size. It allows adjustable parameters like maximum distortion, fix crop start point, and duplicate candidate crop points for flexibility. This module is useful in image processing tasks where random cropping can provide more data augmentation and improve model performance.", + "type": "comment" + }, + "5078": { + "file_id": 436, + "content": " self.fix_crop = fix_crop\n self.allow_duplication = allow_duplication\n self.more_fix_crop = more_fix_crop\n assert backend in [\n 'pillow', 'cv2'\n ], f\"MultiScaleCrop's backend must be pillow or cv2, but get {backend}\"\n self.backend = backend\n def __call__(self, results):\n \"\"\"\n Performs MultiScaleCrop operations.\n Args:\n imgs: List where wach item is a PIL.Image.\n XXX:\n results:\n \"\"\"\n imgs = results['imgs']\n input_size = [self.target_size, self.target_size]\n im_size = imgs[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in self.scales]\n crop_h = [\n input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:326-360" + }, + "5079": { + "file_id": 436, + "content": "This code defines a class for multi-scale cropping of images with Pillow or OpenCV backend. The `__init__` method initializes the instance variables and checks if the provided backend is either 'pillow' or 'cv2'. 
The `__call__` method performs the actual multi-scale cropping operation on a given list of images, applying random crop offsets to each image with the specified target size.", + "type": "comment" + }, + "5080": { + "file_id": 436, + "content": " for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= self.max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not self.fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])\n h_offset = random.randint(0, image_h - crop_pair[1])\n else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if self.allow_duplication or w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if self.allow_duplication or h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if self.allow_duplication or (h_step != 0 and w_step != 0):\n ret.append((4 * w_step, 4 * h_step)) # lower right", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:361-384" + }, + "5081": { + "file_id": 436, + "content": "This code generates a random crop pair from provided sizes, and then applies different cropping locations to the image. If fix_crop is False, it randomly selects an offset for the crop pair within the image boundaries. If fix_crop is True, it calculates four different offsets in a grid pattern using step values based on the image size. The resulting crops are stored in 'ret'.", + "type": "comment" + }, + "5082": { + "file_id": 436, + "content": " if self.allow_duplication or (h_step != 0 or w_step != 0):\n ret.append((2 * w_step, 2 * h_step)) # center\n if self.more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter\n ret.append((3 * w_step, 1 * h_step)) # upper right quarter\n ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n return crop_pair[0], crop_pair[1], w_offset, h_offset\n crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size)\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:385-405" + }, + "5083": { + "file_id": 436, + "content": "This code samples random crop sizes and offsets for image augmentation. It appends different cropping positions based on user allowance or specific flag settings, then randomly selects one of these positions. 
Finally, it crops the image using the selected position and size.", + "type": "comment" + }, + "5084": { + "file_id": 436, + "content": " for img in imgs\n ]\n if self.backend == 'pillow':\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n else:\n ret_img_group = [\n Image.fromarray(\n cv2.resize(np.asarray(img),\n dsize=(input_size[0], input_size[1]),\n interpolation=cv2.INTER_LINEAR))\n for img in crop_img_group\n ]\n results['imgs'] = ret_img_group\n return results\n@PIPELINES.register()\nclass RandomFlip(object):\n \"\"\"\n Random Flip images.\n Args:\n p(float): Random flip images with the probability p.\n \"\"\"\n def __init__(self, p=0.5):\n self.p = p\n def __call__(self, results):\n \"\"\"\n Performs random flip operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:406-440" + }, + "5085": { + "file_id": 436, + "content": "This code is a PaddleVideo pipeline for image augmentation, specifically performing random flips with a given probability. It resizes and crops images according to the provided input size. If the backend is set to 'pillow', it uses PIL library's resize function; otherwise, it uses OpenCV's resize function. The results are stored in the 'imgs' key of the 'results' dictionary, which is then returned.", + "type": "comment" + }, + "5086": { + "file_id": 436, + "content": " return:\n flip_imgs: List where each item is a PIL.Image after random flip.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n if isinstance(imgs, paddle.Tensor):\n results['imgs'] = paddle.flip(imgs, axis=[3])\n elif isinstance(imgs[0], np.ndarray):\n results['imgs'] = [cv2.flip(img, 1, img) for img in imgs\n ] # [[h,w,c], [h,w,c], ..., [h,w,c]]\n else:\n results['imgs'] = [\n img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs\n ]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomBrightness(object):\n \"\"\"\n Random Brightness images.\n Args:\n p(float): Random brightness images with the probability p.\n \"\"\"\n def __init__(self, p=0.1, brightness=1):\n self.p = p\n self.brightness = brightness\n def __call__(self, results):\n \"\"\"", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:441-473" + }, + "5087": { + "file_id": 436, + "content": "This code implements a random image flipping and brightness adjustment in PaddleVideo's pipeline. It takes an image as input, randomly decides whether to flip or keep it intact with probability 'p', and adjusts the brightness if applied. 
The result is then returned.", + "type": "comment" + }, + "5088": { + "file_id": 436, + "content": " Performs random brightness operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n brightness_imgs: List where each item is a PIL.Image after random brightness.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n transform = ColorJitter(brightness=self.brightness)\n results['imgs'] = [transform(img) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomSaturation(object):\n \"\"\"\n Random Saturation images.\n Args:\n p(float): Random saturation images with the probability p.\n \"\"\"\n def __init__(self, p=0.1, saturation=2):\n self.p = p\n self.saturation = saturation\n def __call__(self, results):\n \"\"\"\n Performs random saturation operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:474-508" + }, + "5089": { + "file_id": 436, + "content": "The code defines two classes, RandomBrightness and RandomSaturation, which perform random operations on image brightness and saturation respectively. The RandomBrightness class applies ColorJitter with a specified brightness level to each image in the list with a certain probability, while the RandomSaturation class adjusts the saturation of images with another probability. Both classes are registered as Pipelines for data augmentation in PaddleVideo.", + "type": "comment" + }, + "5090": { + "file_id": 436, + "content": " return:\n saturation_imgs: List where each item is a PIL.Image after random saturation.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n transform = ColorJitter(saturation=self.saturation)\n results['imgs'] = [transform(img) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomHue(object):\n \"\"\"\n Random Hue images.\n Args:\n p(float): Random hue images with the probability p.\n \"\"\"\n def __init__(self, p=0.1, hue=0.5):\n self.p = p\n self.hue = hue\n def __call__(self, results):\n \"\"\"\n Performs random hue operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n hue_imgs: List where each item is a PIL.Image after random hue.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:509-546" + }, + "5091": { + "file_id": 436, + "content": "This code snippet contains two classes: RandomSaturation and RandomHue. Both classes are pipeline transforms for image processing in the PaddleVideo framework. The RandomSaturation class applies random saturation adjustments to images with a certain probability, while the RandomHue class randomly alters hue values of images with another probability. 
These transforms can be used to augment and enhance the dataset for better model training.", + "type": "comment" + }, + "5092": { + "file_id": 436, + "content": " transform = ColorJitter(hue=self.hue)\n results['imgs'] = [transform(img) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomGamma(object):\n \"\"\"\n Random Gamma images.\n Args:\n p(float): Random gamma images with the probability p.\n gamma (float): Non negative real number, same as `\\\\gamma` in the equation.\n gamma larger than 1 make the shadows darker,\n while gamma smaller than 1 make dark regions lighter.\n \"\"\"\n def __init__(self, p=0.1, gamma=0.2):\n self.p = p\n self.value = [1 - gamma, 1 + gamma]\n self.value[0] = max(self.value[0], 0)\n def _adust_gamma(self, img, gamma, gain=1.0):\n flag = False\n if isinstance(img, np.ndarray):\n flag = True\n img = Image.fromarray(img)\n input_mode = img.mode\n img = img.convert(\"RGB\")\n gamma_map = [\n int((255 + 1 - 1e-3) * gain * pow(ele / 255.0, gamma))", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:547-577" + }, + "5093": { + "file_id": 436, + "content": "The code defines a pipeline module for data augmentation in PaddleVideo. It includes a RandomGamma class that randomly applies gamma correction to images with a specified probability and gamma value range. The ColorJitter transform is used to apply random changes to the hue of images. The results are stored in a dictionary under the 'imgs' key, either after applying transformations or as is if no transformation is needed. The code also handles adjusting gamma for both numpy arrays and PIL Image objects.", + "type": "comment" + }, + "5094": { + "file_id": 436, + "content": " for ele in range(256)\n ] * 3\n img = img.point(\n gamma_map) # use PIL's point-function to accelerate this part\n img = img.convert(input_mode)\n if flag:\n img = np.array(img)\n return img\n def __call__(self, results):\n \"\"\"\n Performs random gamma operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n gamma_imgs: List where each item is a PIL.Image after random gamma.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n gamma = random.uniform(self.value[0], self.value[1])\n results['imgs'] = [self._adust_gamma(img, gamma) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass Image2Array(object):\n \"\"\"\n transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'.\n Args:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:578-611" + }, + "5095": { + "file_id": 436, + "content": "This code is defining a pipeline for image augmentation, specifically adjusting gamma values randomly. It checks if a random number falls below the threshold and applies a random gamma adjustment to each image in the input list. If not, it leaves the images unchanged. 
Finally, it registers an Image2Array class that converts PIL.Image to Numpy array with transposed dimensions from 'dhwc' to 'dchw'.", + "type": "comment" + }, + "5096": { + "file_id": 436, + "content": " transpose: whether to transpose or not, default True, False for slowfast.\n \"\"\"\n def __init__(self, transpose=True, data_format='tchw'):\n assert data_format in [\n 'tchw', 'cthw'\n ], f\"Target format must in ['tchw', 'cthw'], but got {data_format}\"\n self.transpose = transpose\n self.data_format = data_format\n def __call__(self, results):\n \"\"\"\n Performs Image to NumpyArray operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n np_imgs: Numpy array.\n \"\"\"\n imgs = results['imgs']\n if 'backend' in results and results[\n 'backend'] == 'pyav': # [T,H,W,C] in [0, 1]\n if self.transpose:\n if self.data_format == 'tchw':\n t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw\n else:\n t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw\n results['imgs'] = t_imgs", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:612-638" + }, + "5097": { + "file_id": 436, + "content": "This code is part of the Image2Array class, which converts images to NumPy arrays. It initializes with the option to transpose or not, and a data format of either 'tchw' or 'cthw'. When the backend is 'pyav', the input is already a [T, H, W, C] array in [0, 1]: for 'tchw' it is transposed with axes (0, 3, 1, 2) to [T, C, H, W], and for 'cthw' with axes (3, 0, 1, 2) to [C, T, H, W]. The transposed images are then stored back into 'imgs'.", + "type": "comment" + }, + "5098": { + "file_id": 436, + "content": " else:\n t_imgs = np.stack(imgs).astype('float32')\n if self.transpose:\n if self.data_format == 'tchw':\n t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw\n else:\n t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw\n results['imgs'] = t_imgs\n return results\n@PIPELINES.register()\nclass Normalization(object):\n \"\"\"\n Normalization.\n Args:\n mean(Sequence[float]): mean values of different channels.\n std(Sequence[float]): std values of different channels.\n tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3]\n \"\"\"\n def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False):\n if not isinstance(mean, Sequence):\n raise TypeError(\n f'Mean must be list, tuple or np.ndarray, but got {type(mean)}')\n if not isinstance(std, Sequence):\n raise TypeError(\n f'Std must be list, tuple or np.ndarray, but got {type(std)}')", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:639-665" + }, + "5099": { + "file_id": 436, + "content": "This code defines a class for normalization in PaddleVideo's loader pipelines. It takes mean and std values as arguments to normalize the image data, raising a TypeError if either is not a Sequence. The tensor_shape parameter is optional with default value [3,1,1] for standard usage or [1,1,1,3] for slowfast support. 
Inplace flag can be set to True to perform in-place operations if desired.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/51.json b/docs/data/51.json new file mode 100644 index 000000000..5fa2eba68 --- /dev/null +++ b/docs/data/51.json @@ -0,0 +1,551 @@ +{ + "5100": { + "file_id": 436, + "content": " self.inplace = inplace\n if not inplace:\n self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)\n self.std = np.array(std).reshape(tensor_shape).astype(np.float32)\n else:\n self.mean = np.array(mean, dtype=np.float32)\n self.std = np.array(std, dtype=np.float32)\n def __call__(self, results):\n \"\"\"\n Performs normalization operations.\n Args:\n imgs: Numpy array.\n return:\n np_imgs: Numpy array after normalization.\n \"\"\"\n if self.inplace:\n n = len(results['imgs'])\n h, w, c = results['imgs'][0].shape\n norm_imgs = np.empty((n, h, w, c), dtype=np.float32)\n for i, img in enumerate(results['imgs']):\n norm_imgs[i] = img\n for img in norm_imgs: # [n,h,w,c]\n mean = np.float64(self.mean.reshape(1, -1)) # [1, 3]\n stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3]\n cv2.subtract(img, mean, img)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:667-693" + }, + "5101": { + "file_id": 436, + "content": "This code defines a class for normalizing images. It takes mean and std values as inputs, which are used for image normalization. If inplace is set to False, it converts the input into numpy arrays with appropriate shapes and data types. The __call__ method performs normalization on the given results. If inplace is True, it uses the existing array and avoids making copies. The method calculates mean and std values for normalization and applies them to each image in the results using cv2.subtract.", + "type": "comment" + }, + "5102": { + "file_id": 436, + "content": " cv2.multiply(img, stdinv, img)\n else:\n imgs = results['imgs']\n norm_imgs = imgs / 255.0\n norm_imgs -= self.mean\n norm_imgs /= self.std\n if 'backend' in results and results['backend'] == 'pyav':\n norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32)\n results['imgs'] = norm_imgs\n return results\n@PIPELINES.register()\nclass JitterScale(object):\n \"\"\"\n Scale image, while the target short size is randomly select between min_size and max_size.\n Args:\n min_size: Lower bound for random sampler.\n max_size: Higher bound for random sampler.\n \"\"\"\n def __init__(self,\n min_size,\n max_size,\n short_cycle_factors=[0.5, 0.7071],\n default_min_size=256):\n self.default_min_size = default_min_size\n self.orig_min_size = self.min_size = min_size\n self.max_size = max_size\n self.short_cycle_factors = short_cycle_factors", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:694-722" + }, + "5103": { + "file_id": 436, + "content": "This code applies image normalization and potentially scales the images while preserving aspect ratio, with options for random scaling. This is part of a PaddleVideo pipeline, likely for preprocessing input data before feeding it to a model for training or inference. 
It can be used with different backends such as \"cv2\" or \"pyav\", and returns the processed image results.", + "type": "comment" + }, + "5104": { + "file_id": 436, + "content": " def __call__(self, results):\n \"\"\"\n Performs jitter resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.min_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_min_size))\n else:\n self.min_size = self.orig_min_size\n imgs = results['imgs']\n size = int(round(np.random.uniform(self.min_size, self.max_size)))\n assert (len(imgs) >= 1), \\\n \"len(imgs):{} should be larger than 1\".format(len(imgs))\n if 'backend' in results and results['backend'] == 'pyav':\n height, width = imgs.shape[2:]\n else:\n width, height = imgs[0].size", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:724-749" + }, + "5105": { + "file_id": 436, + "content": "This code defines a function that performs jitter resize operations. It takes in an image sequence and scales each image based on a random size between min_size and max_size, considering short cycle factors and asserting the minimum length of images. If the backend is pyav, it retrieves height and width separately; otherwise, it gets the size from the first image.", + "type": "comment" + }, + "5106": { + "file_id": 436, + "content": " if (width <= height and width == size) or (height <= width\n and height == size):\n return results\n new_width = size\n new_height = size\n if width < height:\n new_height = int(math.floor((float(height) / width) * size))\n else:\n new_width = int(math.floor((float(width) / height) * size))\n if 'backend' in results and results['backend'] == 'pyav':\n frames_resize = F.interpolate(imgs,\n size=(new_height, new_width),\n mode=\"bilinear\",\n align_corners=False) # [c,t,h,w]\n else:\n frames_resize = []\n for j in range(len(imgs)):\n img = imgs[j]\n scale_img = img.resize((new_width, new_height), Image.BILINEAR)\n frames_resize.append(scale_img)\n results['imgs'] = frames_resize\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:750-774" + }, + "5107": { + "file_id": 436, + "content": "This code resizes images to a specified size (width or height equals size). It checks if the image is loaded by PyAV and performs the resize operation using F.interpolate for PyAV-loaded images, otherwise it uses PIL's Image.resize function for other images. 
The resized images are added to 'imgs' in the results dictionary and returned.", + "type": "comment" + }, + "5108": { + "file_id": 436, + "content": "@PIPELINES.register()\nclass MultiCenterCrop(object):\n \"\"\"\n center crop, left center crop right center crop\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n h, w = imgs.shape[2:]\n else:\n w, h = imgs[0].size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(\n w, h, self.target_size)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:777-805" + }, + "5109": { + "file_id": 436, + "content": "This code defines a MultiCenterCrop class that performs center crop, left center crop, and right center crop operations on images. It takes a target size as input and returns the cropped images. The function checks if the image size is larger than the target size before performing the operation. If the image size is smaller, it throws an assertion error.", + "type": "comment" + }, + "5110": { + "file_id": 436, + "content": " crop_images = []\n #just for tensor\n crop_imgs_center = []\n crop_imgs_left = []\n crop_imgs_right = []\n if 'backend' in results and results['backend'] == 'pyav':\n #center_corp\n x1 = 0\n if w > self.target_size:\n x1 = int((w - self.target_size) / 2.0)\n y1 = 0\n if h > self.target_size:\n y1 = int((h - self.target_size) / 2.0)\n crop_imgs_center = imgs[:, :, y1:y1 + th,\n x1:x1 + tw].numpy() # [C, T, th, tw]\n #left_crop\n x1 = 0\n y1 = 0\n if h > self.target_size:\n y1 = int((h - self.target_size) / 2.0)\n crop_imgs_left = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy()\n #right_crop\n x1 = 0\n y1 = 0\n if w > self.target_size:\n x1 = w - self.target_size\n if h > self.target_size:\n y1 = int((h - self.target_size) / 2.0)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:807-834" + }, + "5111": { + "file_id": 436, + "content": "This code is performing image cropping for a specific backend (pyav) and storing the results in three separate lists: crop_imgs_center, crop_imgs_left, and crop_imgs_right. 
The cropping is done based on the size of the original image compared to the target size, with different crops for center, left, and right areas.", + "type": "comment" + }, + "5112": { + "file_id": 436, + "content": " crop_imgs_right = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy()\n crop_imgs = np.concatenate(\n (crop_imgs_center, crop_imgs_left, crop_imgs_right), axis=1)\n crop_images = paddle.to_tensor(crop_imgs)\n else:\n x1 = 0\n if w > self.target_size:\n x1 = random.randint(0, w - tw)\n y1 = 0\n if h > self.target_size:\n y1 = random.randint(0, h - th)\n for img in imgs:\n if w == tw and h == th:\n crop_images.append(img)\n else:\n crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = crop_images\n return results\n@PIPELINES.register()\nclass MultiCrop(object):\n \"\"\"\n Random crop image.\n This operation can perform multi-crop during multi-clip test, as in slowfast model.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self,\n target_size,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:835-865" + }, + "5113": { + "file_id": 436, + "content": "This code finishes the MultiCenterCrop pipeline: for the pyav backend the center, left, and right crops are concatenated along the temporal axis and converted to a Paddle tensor, while for other backends a single random crop is taken from each PIL image. It then begins defining the MultiCrop pipeline, which supports multi-crop during multi-clip testing as in the SlowFast model.", + "type": "comment" + }, + "5114": { + "file_id": 436, + "content": " default_crop_size=224,\n short_cycle_factors=[0.5, 0.7071],\n test_mode=False):\n self.orig_target_size = self.target_size = target_size\n self.short_cycle_factors = short_cycle_factors\n self.default_crop_size = default_crop_size\n self.test_mode = test_mode\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n spatial_sample_index = results['spatial_sample_index']\n spatial_num_clips = results['spatial_num_clips']\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.target_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_crop_size))", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:866-891" + }, + "5115": { + "file_id": 436, + "content": "The code initializes an augmentation class with parameters for target size, short cycle factors, default crop size, and test mode. 
It then defines a __call__ method that performs random cropping operations on images based on the provided parameters.", + "type": "comment" + }, + "5116": { + "file_id": 436, + "content": " else:\n self.target_size = self.orig_target_size # use saved value before call\n w, h = imgs[0].size\n if w == self.target_size and h == self.target_size:\n return results\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size({},{})\".format(w, h, self.target_size, self.target_size)\n frames_crop = []\n if not self.test_mode:\n x_offset = random.randint(0, w - self.target_size)\n y_offset = random.randint(0, h - self.target_size)\n else: # multi-crop\n x_gap = int(\n math.ceil((w - self.target_size) / (spatial_num_clips - 1)))\n y_gap = int(\n math.ceil((h - self.target_size) / (spatial_num_clips - 1)))\n if h > w:\n x_offset = int(math.ceil((w - self.target_size) / 2))\n if spatial_sample_index == 0:\n y_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:892-914" + }, + "5117": { + "file_id": 436, + "content": "This code checks if the image size matches the target size. If it does, it returns the results. If not, it generates crops for multi-crop testing mode or a single crop for non-testing mode based on random offsets. The code also handles the case where the target size is determined from a saved value before the call.", + "type": "comment" + }, + "5118": { + "file_id": 436, + "content": " y_offset = h - self.target_size\n else:\n y_offset = y_gap * spatial_sample_index\n else:\n y_offset = int(math.ceil((h - self.target_size) / 2))\n if spatial_sample_index == 0:\n x_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:\n x_offset = w - self.target_size\n else:\n x_offset = x_gap * spatial_sample_index\n for img in imgs:\n nimg = img.crop((x_offset, y_offset, x_offset + self.target_size,\n y_offset + self.target_size))\n frames_crop.append(nimg)\n results['imgs'] = frames_crop\n return results\n@PIPELINES.register()\nclass PackOutput(object):\n \"\"\"\n In slowfast model, we want to get slow pathway from fast pathway based on\n alpha factor.\n Args:\n alpha(int): temporal length of fast/slow\n \"\"\"\n def __init__(self, alpha):\n self.alpha = alpha", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:915-944" + }, + "5119": { + "file_id": 436, + "content": "This code calculates the crop offsets for a set of images based on their size and target size. If the aspect ratio is preserved, it determines the y_offset, otherwise, it calculates the x and y offsets separately for each spatial sample index. The resulting cropped images are stored in frames\\_crop and added to results['imgs']. 
PackOutput is a pipeline register that takes an alpha argument and is used in slowfast model to get slow pathway from fast pathway based on alpha factor.", + "type": "comment" + }, + "5120": { + "file_id": 436, + "content": " def __call__(self, results):\n fast_pathway = results['imgs']\n # sample num points between start and end\n slow_idx_start = 0\n slow_idx_end = fast_pathway.shape[0] - 1\n slow_idx_num = fast_pathway.shape[0] // self.alpha\n slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end,\n slow_idx_num).astype(\"int64\")\n slow_pathway = fast_pathway[slow_idxs_select]\n # T H W C -> C T H W.\n slow_pathway = slow_pathway.transpose(3, 0, 1, 2)\n fast_pathway = fast_pathway.transpose(3, 0, 1, 2)\n # slow + fast\n frames_list = [slow_pathway, fast_pathway]\n results['imgs'] = frames_list\n return results\n@PIPELINES.register()\nclass GroupFullResSample(object):\n def __init__(self, crop_size, flip=False):\n self.crop_size = crop_size if not isinstance(crop_size, int) else (\n crop_size, crop_size)\n self.flip = flip\n def __call__(self, results):\n img_group = results['imgs']", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:946-975" + }, + "5121": { + "file_id": 436, + "content": "The code implements PackOutput's __call__, which subsamples the fast pathway by the alpha factor to build the slow pathway, transposes both from T H W C to C T H W, and stores [slow_pathway, fast_pathway] in results['imgs']. It then begins the GroupFullResSample pipeline, which is initialized with a crop_size and an optional horizontal flip flag.", + "type": "comment" + }, + "5122": { + "file_id": 436, + "content": " image_w, image_h = img_group[0].size\n crop_w, crop_h = self.crop_size\n w_step = (image_w - crop_w) // 4\n h_step = (image_h - crop_h) // 4\n offsets = list()\n offsets.append((0 * w_step, 2 * h_step)) # left\n offsets.append((4 * w_step, 2 * h_step)) # right\n offsets.append((2 * w_step, 2 * h_step)) # center\n oversample_group = list()\n for o_w, o_h in offsets:\n normal_group = list()\n flip_group = list()\n for i, img in enumerate(img_group):\n crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))\n normal_group.append(crop)\n if self.flip:\n flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)\n flip_group.append(flip_crop)\n oversample_group.extend(normal_group)\n if self.flip:\n oversample_group.extend(flip_group)\n results['imgs'] = oversample_group\n return results\n@PIPELINES.register()", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:977-1007" + }, + "5123": { + "file_id": 436, + "content": "This code performs image augmentation by creating a list of different crops and flips from the input image group. It calculates the crop size and step sizes, creates offsets for each crop position, iterates over the input images to create normal and flipped crops, and stores them in separate groups before combining them into the oversample_group. 
Finally, it adds the oversample_group to the results dictionary and returns the results.", + "type": "comment" + }, + "5124": { + "file_id": 436, + "content": "class TenCrop:\n \"\"\"\n Crop out 5 regions (4 corner points + 1 center point) from the picture,\n and then flip the cropping result to get 10 cropped images, which can make the prediction result more robust.\n Args:\n target_size(int | tuple[int]): (w, h) of target size for crop.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = (target_size, target_size)\n def __call__(self, results):\n imgs = results['imgs']\n img_w, img_h = imgs[0].size\n crop_w, crop_h = self.target_size\n w_step = (img_w - crop_w) // 4\n h_step = (img_h - crop_h) // 4\n offsets = [\n (0, 0),\n (4 * w_step, 0),\n (0, 4 * h_step),\n (4 * w_step, 4 * h_step),\n (2 * w_step, 2 * h_step),\n ]\n img_crops = list()\n for x_offset, y_offset in offsets:\n crop = [\n img.crop(\n (x_offset, y_offset, x_offset + crop_w, y_offset + crop_h))\n for img in imgs\n ]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1008-1037" + }, + "5125": { + "file_id": 436, + "content": "This code defines a class \"TenCrop\" which crops a given image into 10 cropped images, taking the top-left corner, bottom-left corner, top-right corner, bottom-right corner, and center of the image. It achieves this by using the target size for crop and calculating the width and height steps based on the original image's dimensions. The class also includes a __call__ method which takes a results dictionary as input and returns an array of cropped images.", + "type": "comment" + }, + "5126": { + "file_id": 436, + "content": " crop_fliped = [\n timg.transpose(Image.FLIP_LEFT_RIGHT) for timg in crop\n ]\n img_crops.extend(crop)\n img_crops.extend(crop_fliped)\n results['imgs'] = img_crops\n return results\n@PIPELINES.register()\nclass UniformCrop:\n \"\"\"\n Perform uniform spatial sampling on the images,\n select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions.\n Args:\n target_size(int | tuple[int]): (w, h) of target size for crop.\n \"\"\"\n def __init__(self, target_size, backend='cv2'):\n if isinstance(target_size, tuple):\n self.target_size = target_size\n elif isinstance(target_size, int):\n self.target_size = (target_size, target_size)\n else:\n raise TypeError(\n f'target_size must be int or tuple[int], but got {type(target_size)}'\n )\n self.backend = backend\n def __call__(self, results):\n imgs = results['imgs']", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1038-1069" + }, + "5127": { + "file_id": 436, + "content": "This code is for the \"UniformCrop\" pipeline, which performs uniform spatial sampling on images by selecting three regions: two ends of the long side and the middle position (left/right or top/bottom). The target size can be provided as an integer for square crop or a tuple for specific width and height. 
It uses either OpenCV or PIL for image manipulation based on the 'backend' argument, which defaults to OpenCV.", + "type": "comment" + }, + "5128": { + "file_id": 436, + "content": " if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n img_h, img_w = imgs.shape[2:]\n elif self.backend == 'pillow':\n img_w, img_h = imgs[0].size\n else:\n img_h, img_w = imgs[0].shape[:2]\n crop_w, crop_h = self.target_size\n if crop_h == img_h:\n w_step = (img_w - crop_w) // 2\n offsets = [\n (0, 0),\n (w_step * 2, 0),\n (w_step, 0),\n ]\n elif crop_w == img_w:\n h_step = (img_h - crop_h) // 2\n offsets = [\n (0, 0),\n (0, h_step * 2),\n (0, h_step),\n ]\n else:\n raise ValueError(\n f\"img_w({img_w}) == crop_w({crop_w}) or img_h({img_h}) == crop_h({crop_h})\"\n )\n img_crops = []\n if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n for x_offset, y_offset in offsets:\n crop = imgs[:, :, y_offset:y_offset + crop_h,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1070-1099" + }, + "5129": { + "file_id": 436, + "content": "This code is determining the image offsets for cropping based on the target size and the original image dimensions. It supports two backends: 'pyav' and 'pillow'. If the backend is 'pyav', it extracts the height and width of the image. If the backend is 'pillow', it retrieves the width and height from the first image. The code then calculates the step size for cropping based on whether the target size matches the image dimensions or not, and finally constructs a list of offsets to crop the images.", + "type": "comment" + }, + "5130": { + "file_id": 436, + "content": " x_offset:x_offset + crop_w]\n img_crops.append(crop)\n img_crops = paddle.concat(img_crops, axis=1)\n else:\n if self.backend == 'pillow':\n for x_offset, y_offset in offsets:\n crop = [\n img.crop((x_offset, y_offset, x_offset + crop_w,\n y_offset + crop_h)) for img in imgs\n ]\n img_crops.extend(crop)\n else:\n for x_offset, y_offset in offsets:\n crop = [\n img[y_offset:y_offset + crop_h,\n x_offset:x_offset + crop_w] for img in imgs\n ]\n img_crops.extend(crop)\n results['imgs'] = img_crops\n return results\n@PIPELINES.register()\nclass GroupResize(object):\n def __init__(self, height, width, scale, K, mode='train'):\n self.height = height\n self.width = width\n self.scale = scale", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1100-1127" + }, + "5131": { + "file_id": 436, + "content": "The code is defining a pipeline for image augmentation, including cropping and resizing operations. If the backend is 'pillow', it performs cropping using pixel coordinates; otherwise, it uses slice notation to crop images. 
The results are stored in 'img_crops' and returned as 'results['imgs']'.", + "type": "comment" + }, + "5132": { + "file_id": 436, + "content": " self.resize = {}\n self.K = np.array(K, dtype=np.float32)\n self.mode = mode\n for i in range(self.scale):\n s = 2**i\n self.resize[i] = paddle.vision.transforms.Resize(\n (self.height // s, self.width // s), interpolation='lanczos')\n def __call__(self, results):\n if self.mode == 'infer':\n imgs = results['imgs']\n for k in list(imgs): # (\"color\", 0, -1)\n if \"color\" in k or \"color_n\" in k:\n n, im, _ = k\n for i in range(self.scale):\n imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)])\n else:\n imgs = results['imgs']\n for scale in range(self.scale):\n K = self.K.copy()\n K[0, :] *= self.width // (2**scale)\n K[1, :] *= self.height // (2**scale)\n inv_K = np.linalg.pinv(K)\n imgs[(\"K\", scale)] = K\n imgs[(\"inv_K\", scale)] = inv_K\n for k in list(imgs):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1128-1156" + }, + "5133": { + "file_id": 436, + "content": "This code initializes a resize transformation for image augmentation in PaddleVideo. The transformations are applied based on the scale and mode ('infer' or 'train') specified. For infer mode, it processes color images by applying resizing to each scale level. In train mode, it calculates the K matrix and its inverse for each scale level and stores them in the results dictionary.", + "type": "comment" + }, + "5134": { + "file_id": 436, + "content": " if \"color\" in k or \"color_n\" in k:\n n, im, i = k\n for i in range(self.scale):\n imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)])\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass ColorJitter(object):\n \"\"\"Randomly change the brightness, contrast, saturation and hue of an image.\n \"\"\"\n def __init__(self,\n brightness=0,\n contrast=0,\n saturation=0,\n hue=0,\n mode='train',\n p=0.5,\n keys=None):\n self.mode = mode\n self.colorjitter = paddle.vision.transforms.ColorJitter(\n brightness, contrast, saturation, hue)\n self.p = p\n def __call__(self, results):\n \"\"\"\n Args:\n results (PIL Image): Input image.\n Returns:\n PIL Image: Color jittered image.\n \"\"\"\n do_color_aug = random.random() > self.p\n imgs = results['imgs']", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1157-1193" + }, + "5135": { + "file_id": 436, + "content": "This code applies color jitter augmentation to the images by randomly adjusting brightness, contrast, saturation, and hue. The ColorJitter class initializes a colorjitter transform with specified parameters for train mode or test mode. The __call__ method is called on each image in the results dictionary and checks if the color augmentation should be applied. 
If true, the color jittered image is returned.", + "type": "comment" + }, + "5136": { + "file_id": 436, + "content": " for k in list(imgs):\n f = imgs[k]\n if \"color\" in k or \"color_n\" in k:\n n, im, i = k\n imgs[(n, im, i)] = f\n if do_color_aug:\n imgs[(n + \"_aug\", im, i)] = self.colorjitter(f)\n else:\n imgs[(n + \"_aug\", im, i)] = f\n if self.mode == \"train\":\n for i in results['frame_idxs']:\n del imgs[(\"color\", i, -1)]\n del imgs[(\"color_aug\", i, -1)]\n del imgs[(\"color_n\", i, -1)]\n del imgs[(\"color_n_aug\", i, -1)]\n else:\n for i in results['frame_idxs']:\n del imgs[(\"color\", i, -1)]\n del imgs[(\"color_aug\", i, -1)]\n results['img'] = imgs\n return results\n@PIPELINES.register()\nclass GroupRandomFlip(object):\n def __init__(self, p=0.5):\n self.p = p\n def __call__(self, results):\n imgs = results['imgs']\n do_flip = random.random() > self.p\n if do_flip:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1194-1227" + }, + "5137": { + "file_id": 436, + "content": "This code is part of a data augmentation pipeline, specifically handling color and flip transformations for images. It iterates over the 'imgs' dictionary to find and organize color images, applying color jitter if required. Then it removes specific color images based on the mode (\"train\" or \"test\"). Finally, it returns the updated results dictionary with the modified image groupings. The GroupRandomFlip class performs random flipping of images with a specified probability.", + "type": "comment" + }, + "5138": { + "file_id": 436, + "content": " for k in list(imgs):\n if \"color\" in k or \"color_n\" in k:\n n, im, i = k\n imgs[(n, im,\n i)] = imgs[(n, im,\n i)].transpose(Image.FLIP_LEFT_RIGHT)\n if \"depth_gt\" in imgs:\n imgs['depth_gt'] = np.array(np.fliplr(imgs['depth_gt']))\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass ToArray(object):\n def __init__(self):\n pass\n def __call__(self, results):\n imgs = results['imgs']\n for k in list(imgs):\n if \"color\" in k or \"color_n\" in k or \"color_aug\" in k or \"color_n_aug\" in k:\n n, im, i = k\n imgs[(n, im,\n i)] = np.array(imgs[(n, im, i)]).astype('float32') / 255.0\n imgs[(n, im, i)] = imgs[(n, im, i)].transpose((2, 0, 1))\n if \"depth_gt\" in imgs:\n imgs['depth_gt'] = np.array(imgs['depth_gt']).astype('float32')\n results['imgs'] = imgs", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1228-1257" + }, + "5139": { + "file_id": 436, + "content": "This code is part of a machine learning pipeline that processes image data. It first flips left-right some images marked with \"color\" or \"color_n\". Then, it converts certain color and depth images to floats and normalizes them to [0,1] for training. Finally, it returns the updated image dictionary as part of the results.", + "type": "comment" + }, + "5140": { + "file_id": 436, + "content": " return results\n@PIPELINES.register()\nclass YowoAug(object):\n def __init__(self, target_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5, valid_mode=False):\n self.shape = (target_size, target_size)\n self.jitter = jitter\n self.hue = hue\n self.saturation = saturation\n self.exposure = exposure\n self.valid_mode = valid_mode\n def _rand_scale(self, s):\n scale = random.uniform(1, s)\n if (random.randint(1, 10000) % 2):\n return scale\n return 1. 
/ scale\n def _distort_image(self, im, hue, sat, val):\n im = im.convert('HSV')\n cs = list(im.split())\n cs[1] = cs[1].point(lambda i: i * sat)\n cs[2] = cs[2].point(lambda i: i * val)\n def _change_hue(x):\n x += hue * 255\n if x > 255:\n x -= 255\n if x < 0:\n x += 255\n return x\n cs[0] = cs[0].point(_change_hue)\n im = Image.merge(im.mode, tuple(cs))\n im = im.convert('RGB')", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1258-1294" + }, + "5141": { + "file_id": 436, + "content": "This code defines a class called YowoAug for image augmentation. It takes in parameters such as target size, jitter, hue, saturation, exposure, and valid mode. The class has methods to randomly scale the image, distort the image by changing hue, saturation, and exposure levels, and returns the augmented results.", + "type": "comment" + }, + "5142": { + "file_id": 436, + "content": " # constrain_image(im)\n return im\n def _random_distort_image(self, im, dhue, dsat, dexp):\n res = self._distort_image(im, dhue, dsat, dexp)\n return res\n def _read_truths_args(self, lab_path, min_box_scale):\n truths = np.loadtxt(lab_path)\n truths = np.reshape(truths, (truths.size // 5, 5))\n new_truths = []\n for i in range(truths.shape[0]):\n cx = (truths[i][1] + truths[i][3]) / (2 * 320)\n cy = (truths[i][2] + truths[i][4]) / (2 * 240)\n imgw = (truths[i][3] - truths[i][1]) / 320\n imgh = (truths[i][4] - truths[i][2]) / 240\n truths[i][0] = truths[i][0] - 1\n truths[i][1] = cx\n truths[i][2] = cy\n truths[i][3] = imgw\n truths[i][4] = imgh\n if truths[i][3] < min_box_scale:\n continue\n new_truths.append([truths[i][0], truths[i][1], truths[i][2], truths[i][3], truths[i][4]])\n return np.array(new_truths)\n def _fill_truth_detection(self, labpath, flip, dx, dy, sx, sy):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1295-1322" + }, + "5143": { + "file_id": 436, + "content": "The code snippet defines several functions related to image augmentation and truth detection in an object detection task. The \"constrain_image\" function ensures the image is within a specific range of values. \"random_distort_image\" applies distortion to the input image randomly. \"read_truths_args\" reads the ground truth boxes from a file, scales and transforms them accordingly, and checks if the box scale is smaller than the minimum required scale before adding it to new_truths. 
Lastly, \"_fill_truth_detection\" fills in the ground truth detection with additional parameters like flip, dx, dy, sx, and sy.", + "type": "comment" + }, + "5144": { + "file_id": 436, + "content": " max_boxes = 50\n label = np.zeros((max_boxes, 5))\n bs = np.loadtxt(labpath)\n bs = np.reshape(bs, (-1, 5))\n for i in range(bs.shape[0]):\n cx = (bs[i][1] + bs[i][3]) / (2 * 320)\n cy = (bs[i][2] + bs[i][4]) / (2 * 240)\n imgw = (bs[i][3] - bs[i][1]) / 320\n imgh = (bs[i][4] - bs[i][2]) / 240\n bs[i][0] = bs[i][0] - 1\n bs[i][1] = cx\n bs[i][2] = cy\n bs[i][3] = imgw\n bs[i][4] = imgh\n cc = 0\n for i in range(bs.shape[0]):\n x1 = bs[i][1] - bs[i][3] / 2\n y1 = bs[i][2] - bs[i][4] / 2\n x2 = bs[i][1] + bs[i][3] / 2\n y2 = bs[i][2] + bs[i][4] / 2\n x1 = min(0.999, max(0, x1 * sx - dx))\n y1 = min(0.999, max(0, y1 * sy - dy))\n x2 = min(0.999, max(0, x2 * sx - dx))\n y2 = min(0.999, max(0, y2 * sy - dy))\n bs[i][1] = (x1 + x2) / 2\n bs[i][2] = (y1 + y2) / 2\n bs[i][3] = (x2 - x1)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1323-1353" + }, + "5145": { + "file_id": 436, + "content": "The code resizes and normalizes bounding box coordinates from a loaded label file, adjusts them based on image size scaling factors and offsets, and updates the bounding boxes accordingly.", + "type": "comment" + }, + "5146": { + "file_id": 436, + "content": " bs[i][4] = (y2 - y1)\n if flip:\n bs[i][1] = 0.999 - bs[i][1]\n if bs[i][3] < 0.001 or bs[i][4] < 0.001:\n continue\n label[cc] = bs[i]\n cc += 1\n if cc >= 50:\n break\n label = np.reshape(label, (-1))\n return label\n def __call__(self, results):\n clip = results['imgs']\n frame_num = len(clip)\n oh = clip[0].height\n ow = clip[0].width\n labpath = results['filename'].replace('jpg', 'txt').replace('rgb-images', 'labels')\n if not self.valid_mode:\n dw = int(ow * self.jitter)\n dh = int(oh * self.jitter)\n pleft = random.randint(-dw, dw)\n pright = random.randint(-dw, dw)\n ptop = random.randint(-dh, dh)\n pbot = random.randint(-dh, dh)\n swidth = ow - pleft - pright\n sheight = oh - ptop - pbot\n sx = float(swidth) / ow\n sy = float(sheight) / oh\n dx = (float(pleft) / ow) / sx", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1354-1390" + }, + "5147": { + "file_id": 436, + "content": "This code initializes a list of bounding boxes, applies jitter if not in valid mode (randomly adjusts image size and position), reshapes the list into a single array of bounding boxes, and returns it.", + "type": "comment" + }, + "5148": { + "file_id": 436, + "content": " dy = (float(ptop) / oh) / sy\n flip = random.randint(1, 10000) % 2\n dhue = random.uniform(-self.hue, self.hue)\n dsat = self._rand_scale(self.saturation)\n dexp = self._rand_scale(self.exposure)\n # Augment\n cropped = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in clip]\n sized = [img.resize(self.shape) for img in cropped]\n if flip:\n sized = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in sized]\n clip = [self._random_distort_image(img, dhue, dsat, dexp) for img in sized]\n label = self._fill_truth_detection(labpath, flip, dx, dy, 1. / sx, 1. 
/ sy)\n else:\n label = np.zeros([50 * 5])\n tmp = self._read_truths_args(labpath, 8.0 / clip[0].width).astype('float32')\n tmp = np.reshape(tmp, [-1])\n tsz = tmp.size\n if tsz > 50 * 5:\n label = tmp[0:50 * 5]\n elif tsz > 0:\n label[0:tsz] = tmp", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1391-1419" + }, + "5149": { + "file_id": 436, + "content": "This code performs image augmentations and label manipulations. It applies random crop, resize, flip (horizontally), and distortion to the image(s) with a certain probability. The label is either filled from the truth detection or set to zero vector depending on the size of the extracted truth data.", + "type": "comment" + }, + "5150": { + "file_id": 436, + "content": " clip = [img.resize(self.shape) for img in clip]\n clip = [np.asarray(img).astype('float32') / 255.0 for img in clip]\n clip = np.concatenate(clip, 0).reshape([frame_num, 224, 224, 3])\n clip = np.transpose(clip, [3, 0, 1, 2])\n results['imgs'] = clip\n results['labels'] = label\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations.py:1420-1427" + }, + "5151": { + "file_id": 436, + "content": "Resizes images to a specific shape, converts them to float32 type and scales by 255.0, concatenates frames into a single array, transposes array dimensions, stores image and label arrays in 'results', returns results", + "type": "comment" + }, + "5152": { + "file_id": 437, + "content": "/paddlevideo/loader/pipelines/augmentations_ava.py", + "type": "filepath" + }, + "5153": { + "file_id": 437, + "content": "This code defines image augmentation and resizing functions for AVA dataset in PaddleVideo library using operations like resizing, lazy initialization, and RandomRescale/Resize transforms. It creates classes for ground truth bounding boxes and proposals, cropping and flipping entity boxes, and includes Flip and Normalize classes for image processing and normalization.", + "type": "summary" + }, + "5154": { + "file_id": 437, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nimport math\nfrom PIL import Image\nfrom ..registry import PIPELINES\nfrom collections.abc import Sequence\nimport cv2\npillow_interp_codes = {\n 'nearest': Image.NEAREST,\n 'bilinear': Image.BILINEAR,\n 'bicubic': Image.BICUBIC,\n 'box': Image.BOX,\n 'lanczos': Image.LANCZOS,\n 'hamming': Image.HAMMING\n}\ncv2_interp_codes = {\n 'nearest': cv2.INTER_NEAREST,\n 'bilinear': cv2.INTER_LINEAR,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:1-34" + }, + "5155": { + "file_id": 437, + "content": "This code is part of the PaddleVideo library and contains a module for image augmentations in the AVA dataset. 
It imports necessary libraries, defines conversion dictionaries for interpolation methods between PIL and OpenCV, and sets up registry entries for the PIPELINES module.", + "type": "comment" + }, + "5156": { + "file_id": 437, + "content": " 'bicubic': cv2.INTER_CUBIC,\n 'area': cv2.INTER_AREA,\n 'lanczos': cv2.INTER_LANCZOS4\n}\ndef _init_lazy_if_proper(results, lazy):\n \"\"\"Initialize lazy operation properly.\n Make sure that a lazy operation is properly initialized,\n and avoid a non-lazy operation accidentally getting mixed in.\n Required keys in results are \"imgs\" if \"img_shape\" not in results,\n otherwise, Required keys in results are \"img_shape\", add or modified keys\n are \"img_shape\", \"lazy\".\n Add or modified keys in \"lazy\" are \"original_shape\", \"crop_bbox\", \"flip\",\n \"flip_direction\", \"interpolation\".\n Args:\n results (dict): A dict stores data pipeline result.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n if 'img_shape' not in results:\n results['img_shape'] = results['imgs'][0].shape[:2]\n if lazy:\n if 'lazy' not in results:\n img_h, img_w = results['img_shape']\n lazyop = dict()\n lazyop['original_shape'] = results['img_shape']", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:35-64" + }, + "5157": { + "file_id": 437, + "content": "This function initializes the lazy operation properly, ensuring a non-lazy operation is not accidentally mixed in. If 'img_shape' is not in results, it adds 'img_shape'. If 'lazy' is set to True and 'lazy' does not exist in results, it creates a new dictionary for lazy operation containing 'original_shape', 'img_shape', 'crop_bbox', 'flip', 'flip_direction', and 'interpolation'.", + "type": "comment" + }, + "5158": { + "file_id": 437, + "content": " lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],\n dtype=np.float32)\n lazyop['flip'] = False\n lazyop['flip_direction'] = None\n lazyop['interpolation'] = None\n results['lazy'] = lazyop\n else:\n assert 'lazy' not in results, 'Use Fuse after lazy operations'\ndef _scale_size(size, scale):\n \"\"\"Rescale a size by a ratio.\n Args:\n size (tuple[int]): (w, h).\n scale (float): Scaling factor.\n Returns:\n tuple[int]: scaled size.\n \"\"\"\n w, h = size\n return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)\ndef rescale_size(old_size, scale, return_scale=False):\n \"\"\"Calculate the new size to be rescaled to.\n Args:\n old_size (tuple[int]): The old size (w, h) of image.\n scale (float | tuple[int]): The scaling factor or maximum size.\n If it is a float number, then the image will be rescaled by this\n factor, else if it is a tuple of 2 integers, then the image will", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:65-96" + }, + "5159": { + "file_id": 437, + "content": "This code defines functions for image augmentations in the AVA dataset pipeline. The \"_scale_size\" function scales a size by a ratio, while \"rescale_size\" calculates the new size to be rescaled based on an input scale factor or maximum size. 
The code also includes the initialization of crop parameters and flipping options for lazy operations in the results dictionary.", + "type": "comment" + }, + "5160": { + "file_id": 437, + "content": " be rescaled as large as possible within the scale.\n return_scale (bool): Whether to return the scaling factor besides the\n rescaled image size.\n Returns:\n tuple[int]: The new rescaled image size.\n \"\"\"\n w, h = old_size\n if isinstance(scale, (float, int)):\n if scale <= 0:\n raise ValueError(f'Invalid scale {scale}, must be positive.')\n scale_factor = scale\n elif isinstance(scale, tuple):\n max_long_edge = max(scale)\n max_short_edge = min(scale)\n scale_factor = min(max_long_edge / max(h, w),\n max_short_edge / min(h, w))\n else:\n raise TypeError(\n f'Scale must be a number or tuple of int, but got {type(scale)}')\n new_size = _scale_size((w, h), scale_factor)\n if return_scale:\n return new_size, scale_factor\n else:\n return new_size\ndef imresize(img,\n size,\n return_scale=False,\n interpolation='bilinear',\n out=None,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:97-130" + }, + "5161": { + "file_id": 437, + "content": "Function \"imresize\" resizes an image based on the provided scale factor. If the scale is a number, it's used directly as the scaling factor. If it's a tuple of ints, it sets max and min edge sizes for resizing. Returns new resized size or both size and scaling factor if requested.", + "type": "comment" + }, + "5162": { + "file_id": 437, + "content": " backend=None):\n \"\"\"Resize image to a given size. \"\"\"\n h, w = img.shape[:2]\n if backend is None:\n backend = 'cv2'\n if backend not in ['cv2', 'pillow']:\n raise ValueError(f'backend: {backend} is not supported for resize.'\n f\"Supported backends are 'cv2', 'pillow'\")\n if backend == 'pillow':\n assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'\n pil_image = Image.fromarray(img)\n pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])\n resized_img = np.array(pil_image)\n else:\n resized_img = cv2.resize(\n img, size, dst=out, interpolation=cv2_interp_codes[interpolation])\n if not return_scale:\n return resized_img\n else:\n w_scale = size[0] / w\n h_scale = size[1] / h\n return resized_img, w_scale, h_scale\n@PIPELINES.register()\nclass EntityBoxRescale:\n \"\"\"Rescale the entity box and proposals according to the image shape.\n Required keys are \"proposals\", \"gt_bboxes\", added or modified keys are", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:131-160" + }, + "5163": { + "file_id": 437, + "content": "This code defines a function for resizing an image to a given size. It supports two backends: 'cv2' and 'pillow'. If the backend is not specified, it defaults to 'cv2'. The function first gets the original image's height and width, checks if the backend is valid, handles unsupported backends, asserts the image type for 'pillow' backend, resizes the image using either OpenCV or Pillow library based on the backend, and finally returns the resized image along with scale factors if return_scale is True. The EntityBoxRescale class registers a pipeline to rescale entity boxes and proposals according to the image shape.", + "type": "comment" + }, + "5164": { + "file_id": 437, + "content": " \"gt_bboxes\". 
If original \"proposals\" is not None, \"proposals\" and\n will be added or modified.\n Args:\n scale_factor (np.ndarray): The scale factor used entity_box rescaling.\n \"\"\"\n def __init__(self, scale_factor):\n self.scale_factor = scale_factor\n def __call__(self, results):\n scale_factor = np.concatenate([self.scale_factor, self.scale_factor])\n if 'gt_bboxes' in results:\n gt_bboxes = results['gt_bboxes']\n results['gt_bboxes'] = gt_bboxes * scale_factor\n if 'proposals' in results:\n proposals = results['proposals']\n if proposals is not None:\n assert proposals.shape[1] == 4, (\n 'proposals shape should be in '\n f'(n, 4), but got {proposals.shape}')\n results['proposals'] = proposals * scale_factor\n return results\n def __repr__(self):\n return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'\n@PIPELINES.register()\nclass EntityBoxCrop:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:161-193" + }, + "5165": { + "file_id": 437, + "content": "The code defines a class called EntityBoxCrop that scales the ground truth bounding boxes (gt_bboxes) and proposals, if present, by a given scale factor. It ensures that the number of columns in the proposals is 4. This class can be registered as a pipeline for video augmentation.", + "type": "comment" + }, + "5166": { + "file_id": 437, + "content": " \"\"\"Crop the entity boxes and proposals according to the cropped images.\n Required keys are \"proposals\", \"gt_bboxes\", added or modified keys are\n \"gt_bboxes\". If original \"proposals\" is not None, \"proposals\" will be\n modified.\n Args:\n crop_bbox(np.ndarray | None): The bbox used to crop the original image.\n \"\"\"\n def __init__(self, crop_bbox):\n self.crop_bbox = crop_bbox\n def __call__(self, results):\n proposals = results['proposals']\n gt_bboxes = results['gt_bboxes']\n if self.crop_bbox is None:\n return results\n x1, y1, x2, y2 = self.crop_bbox\n img_w, img_h = x2 - x1, y2 - y1\n assert gt_bboxes.shape[-1] == 4\n gt_bboxes_ = gt_bboxes.copy()\n gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1)\n gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1)\n results['gt_bboxes'] = gt_bboxes_\n if proposals is not None:\n assert proposals.shape[-1] == 4", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:194-224" + }, + "5167": { + "file_id": 437, + "content": "This code initializes an object that crops the entity boxes and proposals according to the cropped images. The required keys are \"proposals\" and \"gt_bboxes\", while \"gt_bboxes\" is added or modified. If original \"proposals\" is not None, \"proposals\" will be modified. The crop_bbox argument specifies the bbox used to crop the original image.", + "type": "comment" + }, + "5168": { + "file_id": 437, + "content": " proposals_ = proposals.copy()\n proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0,\n img_w - 1)\n proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0,\n img_h - 1)\n results['proposals'] = proposals_\n return results\n def __repr__(self):\n return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})'\n@PIPELINES.register()\nclass EntityBoxFlip:\n \"\"\"Flip the entity boxes and proposals with a probability.\n Reverse the order of elements in the given bounding boxes and proposals\n with a specific direction. The shape of them are preserved, but the\n elements are reordered. Only the horizontal flip is supported (seems\n vertical flipping makes no sense). 
Required keys are \"proposals\",\n \"gt_bboxes\", added or modified keys are \"gt_bboxes\". If \"proposals\"\n is not None, it will also be modified.\n Args:\n img_shape (tuple[int]): The img shape.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:225-249" + }, + "5169": { + "file_id": 437, + "content": "This code defines two classes, \"EntityBoxFlip\" and a nameless class. The nameless class performs cropping operations on proposals based on given coordinates (x1, y1). It also updates the results['proposals'] with the modified proposals. The EntityBoxFlip class flips the entity boxes and proposals horizontally with a certain probability. It adds or modifies keys \"gt_bboxes\" in the results dictionary. If \"proposals\" is not None, it will also modify them. The img_shape tuple represents the image shape.", + "type": "comment" + }, + "5170": { + "file_id": 437, + "content": " \"\"\"\n def __init__(self, img_shape):\n self.img_shape = img_shape\n def __call__(self, results):\n proposals = results['proposals']\n gt_bboxes = results['gt_bboxes']\n img_h, img_w = self.img_shape\n assert gt_bboxes.shape[-1] == 4\n gt_bboxes_ = gt_bboxes.copy()\n gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1\n gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1\n if proposals is not None:\n assert proposals.shape[-1] == 4\n proposals_ = proposals.copy()\n proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1\n proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1\n else:\n proposals_ = None\n results['proposals'] = proposals_\n results['gt_bboxes'] = gt_bboxes_\n return results\n def __repr__(self):\n repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})'\n return repr_str\n@PIPELINES.register()\nclass Resize:\n \"\"\"Resize images to a specific size.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:250-284" + }, + "5171": { + "file_id": 437, + "content": "This code defines a pipeline for resizing images to a specific size. It first initializes the pipeline with an image shape and then, in the __call__ method, it adjusts the ground truth bounding boxes and proposal bounding boxes by subtracting their width values from the total image width minus 1. If there are no proposals, it sets proposals_ to None. The __repr__ method provides a string representation of the pipeline.", + "type": "comment" + }, + "5172": { + "file_id": 437, + "content": " Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified\n keys are \"imgs\", \"img_shape\", \"keep_ratio\", \"scale_factor\", \"lazy\",\n \"resize_size\". Required keys in \"lazy\" is None, added or modified key is\n \"interpolation\".\n Args:\n scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling\n factor or maximum size:\n If it is a float number, the image will be rescaled by this\n factor, else if it is a tuple of 2 integers, the image will\n be rescaled as large as possible within the scale.\n Otherwise, it serves as (w, h) of output size.\n keep_ratio (bool): If set to True, Images will be resized without\n changing the aspect ratio. Otherwise, it will resize images to a\n given size. Default: True.\n interpolation (str): Algorithm used for interpolation:\n \"nearest\" | \"bilinear\". Default: \"bilinear\".\n lazy (bool): Determine whether to apply lazy operation. 
Default: False.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:286-303" + }, + "5173": { + "file_id": 437, + "content": "This code defines a function for image augmentation in PaddleVideo. The function takes arguments like scale, keep_ratio, interpolation, and lazy. If keep_ratio is True, it scales the image by the given factor or resizes to the maximum size specified by the tuple. It uses bilinear interpolation by default. Lazy operation can be determined if lazy argument is set to True.", + "type": "comment" + }, + "5174": { + "file_id": 437, + "content": " \"\"\"\n def __init__(self,\n scale,\n keep_ratio=True,\n interpolation='bilinear',\n lazy=False):\n if isinstance(scale, str):\n scale = eval(scale)\n if isinstance(scale, float):\n if scale <= 0:\n raise ValueError(f'Invalid scale {scale}, must be positive.')\n elif isinstance(scale, tuple):\n max_long_edge = max(scale)\n max_short_edge = min(scale)\n if max_short_edge == -1:\n # assign np.inf to long edge for rescaling short edge later.\n scale = (np.inf, max_long_edge)\n else:\n raise TypeError(\n f'Scale must be float or tuple of int, but got {type(scale)}')\n self.scale = scale\n self.keep_ratio = keep_ratio\n self.interpolation = interpolation\n self.lazy = lazy\n def __call__(self, results):\n \"\"\"Performs the Resize augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:304-334" + }, + "5175": { + "file_id": 437, + "content": "Initializes a resize augmentation object with scale, keeps ratio (True or False), interpolation method ('bilinear' default), and lazy flag (False).", + "type": "comment" + }, + "5176": { + "file_id": 437, + "content": " to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'scale_factor' not in results:\n results['scale_factor'] = np.array([1, 1], dtype=np.float32)\n img_h, img_w = results['img_shape']\n if self.keep_ratio:\n new_w, new_h = rescale_size((img_w, img_h), self.scale)\n else:\n new_w, new_h = self.scale\n self.scale_factor = np.array([new_w / img_w, new_h / img_h],\n dtype=np.float32)\n results['img_shape'] = (new_h, new_w)\n results['keep_ratio'] = self.keep_ratio\n results['scale_factor'] = results['scale_factor'] * self.scale_factor\n if not self.lazy:\n if 'imgs' in results:\n results['imgs'] = [\n imresize(\n img, (new_w, new_h), interpolation=self.interpolation)\n for img in results['imgs']\n ]\n if 'keypoint' in results:\n results['keypoint'] = results['keypoint'] * self.scale_factor", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:335-363" + }, + "5177": { + "file_id": 437, + "content": "This code resizes images and keypoints based on the 'scale_factor' or scale provided. If 'scale_factor' is not already in the results, it initializes it with a default value of [1, 1]. The code then calculates new image width (new_w) and height (new_h) based on the scale or keep_ratio setting. 
It updates 'img_shape', 'keep_ratio', and 'scale_factor' in the results dictionary, and if not lazy, it resizes images and keypoints accordingly using the imresize function.", + "type": "comment" + }, + "5178": { + "file_id": 437, + "content": " else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')\n lazyop['interpolation'] = self.interpolation\n #if 'gt_bboxes' in results:\n assert not self.lazy\n entity_box_rescale = EntityBoxRescale(self.scale_factor)\n results = entity_box_rescale(results)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'scale={self.scale}, keep_ratio={self.keep_ratio}, '\n f'interpolation={self.interpolation}, '\n f'lazy={self.lazy})')\n return repr_str\n@PIPELINES.register()\nclass RandomRescale:\n \"\"\"Randomly resize images so that the short_edge is resized to a specific\n size in a given range. The scale ratio is unchanged after resizing.\n \"\"\"\n def __init__(self, scale_range, interpolation='bilinear'):\n scale_range = eval(scale_range)\n self.scale_range = scale_range", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:364-393" + }, + "5179": { + "file_id": 437, + "content": "This code defines a class RandomRescale that performs random rescaling on images, maintaining the aspect ratio. It takes in a range for the short edge size, and an optional interpolation method. The class also has a lazy attribute to control whether the transformation is applied lazily or not. The class also includes an EntityBoxRescale function to rescale bounding boxes. The code ends with registering RandomRescale as a pipeline module.", + "type": "comment" + }, + "5180": { + "file_id": 437, + "content": " assert len(scale_range) == 2\n assert scale_range[0] < scale_range[1]\n assert np.all([x > 0 for x in scale_range])\n self.keep_ratio = True\n self.interpolation = interpolation\n def __call__(self, results):\n \"\"\"Performs the Resize augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n short_edge = np.random.randint(self.scale_range[0],\n self.scale_range[1] + 1)\n resize = Resize((-1, short_edge),\n keep_ratio=True,\n interpolation=self.interpolation,\n lazy=False)\n results = resize(results)\n results['short_edge'] = short_edge\n return results\n def __repr__(self):\n scale_range = self.scale_range\n repr_str = (f'{self.__class__.__name__}('\n f'scale_range=({scale_range[0]}, {scale_range[1]}), '", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:395-423" + }, + "5181": { + "file_id": 437, + "content": "This code defines a Resize augmentation transform with random scaling range, keeps aspect ratio and applies specified interpolation. It also includes a __repr__ method to provide class name, scale range and short edge value for debugging purposes.", + "type": "comment" + }, + "5182": { + "file_id": 437, + "content": " f'interpolation={self.interpolation})')\n return repr_str\n@PIPELINES.register()\nclass Rescale:\n \"\"\"resize images so that the short_edge is resized to a specific\n size in a given range. The scale ratio is unchanged after resizing.\n Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified\n keys are \"imgs\", \"img_shape\", \"keep_ratio\", \"scale_factor\", \"resize_size\",\n \"short_edge\".\n Args:\n scale_range (tuple[int]): The range of short edge length. 
A closed\n interval.\n interpolation (str): Algorithm used for interpolation:\n \"nearest\" | \"bilinear\". Default: \"bilinear\".\n \"\"\"\n def __init__(self, scale_range, interpolation='bilinear'):\n scale_range = eval(scale_range)\n self.scale_range = scale_range\n self.keep_ratio = True\n self.interpolation = interpolation\n def __call__(self, results):\n \"\"\"Performs the Resize augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:424-455" + }, + "5183": { + "file_id": 437, + "content": "This code defines a Rescale augmentation class for image processing in the PaddleVideo framework. It resizes images so that the short edge length is within a specified range, while maintaining the aspect ratio. The interpolation method can be set to 'nearest' or 'bilinear'. This augmentation modifies the 'imgs', 'img_shape', 'keep_ratio', 'scale_factor', 'resize_size', and 'short_edge' keys in the results dictionary.", + "type": "comment" + }, + "5184": { + "file_id": 437, + "content": " to the next transform in pipeline.\n \"\"\"\n resize = Resize(\n self.scale_range,\n keep_ratio=True,\n interpolation=self.interpolation,\n lazy=False)\n results = resize(results)\n return results\n def __repr__(self):\n scale_range = self.scale_range\n repr_str = (f'{self.__class__.__name__}('\n f'scale_range=({scale_range[0]}, {scale_range[1]}), '\n f'interpolation={self.interpolation})')\n return repr_str\n@PIPELINES.register()\nclass RandomCrop_v2:\n \"\"\"Vanilla square random crop that specifics the output size.\n Required keys in results are \"imgs\" and \"img_shape\", added or\n modified keys are \"imgs\", \"lazy\"; Required keys in \"lazy\" are \"flip\",\n \"crop_bbox\", added or modified key is \"crop_bbox\".\n Args:\n size (int): The output size of the images.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n def __init__(self, size, lazy=False):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:456-487" + }, + "5185": { + "file_id": 437, + "content": "This code defines a Resize transform and a RandomCrop_v2 class for image processing pipelines in PaddleVideo. 
The Resize transform scales images within a specified range and with optional interpolation, while the RandomCrop_v2 performs square random cropping on images to a specific output size.", + "type": "comment" + }, + "5186": { + "file_id": 437, + "content": " if not isinstance(size, int):\n raise TypeError(f'Size must be an int, but got {type(size)}')\n self.size = size\n self.lazy = lazy\n def __call__(self, results):\n \"\"\"Performs the RandomCrop augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n img_h, img_w = results['img_shape']\n assert self.size <= img_h and self.size <= img_w\n y_offset = 0\n x_offset = 0\n if img_h > self.size:\n y_offset = int(np.random.randint(0, img_h - self.size))\n if img_w > self.size:\n x_offset = int(np.random.randint(0, img_w - self.size))\n if 'crop_quadruple' not in results:\n results['crop_quadruple'] = np.array(\n [0, 0, 1, 1], # x, y, w, h\n dtype=np.float32)\n x_ratio, y_ratio = x_offset / img_w, y_offset / img_h\n w_ratio, h_ratio = self.size / img_w, self.size / img_h", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:488-517" + }, + "5187": { + "file_id": 437, + "content": "This code defines a class with an __init__ method that checks if the input 'size' is an integer, and a __call__ method to perform random cropping. The __call__ method takes a dictionary of results and performs random cropping based on the size attribute of the class instance. It asserts that the size is less than or equal to the image height and width, then randomly selects y and x offsets for cropping. If 'crop_quadruple' is not in the results dictionary, it adds a new entry with initial values. Finally, it calculates ratios for cropping based on the input size and image dimensions.", + "type": "comment" + }, + "5188": { + "file_id": 437, + "content": " old_crop_quadruple = results['crop_quadruple']\n old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]\n old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]\n new_crop_quadruple = [\n old_x_ratio + x_ratio * old_w_ratio,\n old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,\n h_ratio * old_x_ratio\n ]\n results['crop_quadruple'] = np.array(\n new_crop_quadruple, dtype=np.float32)\n new_h, new_w = self.size, self.size\n results['crop_bbox'] = np.array(\n [x_offset, y_offset, x_offset + new_w, y_offset + new_h])\n results['img_shape'] = (new_h, new_w)\n if not self.lazy:\n results['imgs'] = [\n img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]\n for img in results['imgs']\n ]\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:519-544" + }, + "5189": { + "file_id": 437, + "content": "This code segment is adjusting the crop quadruple, calculating a new crop bounding box, and updating the image shape based on provided offsets. 
It also handles lazy loading if enabled.", + "type": "comment" + }, + "5190": { + "file_id": 437, + "content": " # record crop_bbox in lazyop dict to ensure only crop once in Fuse\n lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']\n left = x_offset * (lazy_right - lazy_left) / img_w\n right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w\n top = y_offset * (lazy_bottom - lazy_top) / img_h\n bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h\n lazyop['crop_bbox'] = np.array(\n [(lazy_left + left), (lazy_top + top), (lazy_left + right),\n (lazy_top + bottom)],\n dtype=np.float32)\n # Process entity boxes\n if 'gt_bboxes' in results:\n assert not self.lazy\n entity_box_crop = EntityBoxCrop(results['crop_bbox'])\n results = entity_box_crop(results)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}(size={self.size}, '\n f'lazy={self.lazy})')\n return repr_str\ndef imflip_(img, direction='horizontal'):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:546-571" + }, + "5191": { + "file_id": 437, + "content": "This code section is responsible for applying augmentations to video frames, specifically crop and flip operations. It takes input parameters such as image size, whether it should be applied lazily or not, and the direction of flipping (if applicable). The code adjusts the crop region based on the specified offset values and stores them in the 'crop_bbox' field of the lazy operation dictionary. Additionally, if there are entity boxes present, they will also be processed according to the applied crop and flip operations.", + "type": "comment" + }, + "5192": { + "file_id": 437, + "content": " \"\"\"Inplace flip an image horizontally or vertically.\n Args:\n img (ndarray): Image to be flipped.\n direction (str): The flip direction, either \"horizontal\" or\n \"vertical\" or \"diagonal\".\n Returns:\n ndarray: The flipped image (inplace).\n \"\"\"\n assert direction in ['horizontal', 'vertical', 'diagonal']\n if direction == 'horizontal':\n return cv2.flip(img, 1, img)\n elif direction == 'vertical':\n return cv2.flip(img, 0, img)\n else:\n return cv2.flip(img, -1, img)\ndef iminvert(img):\n \"\"\"Invert (negate) an image.\n Args:\n img (ndarray): Image to be inverted.\n Returns:\n ndarray: The inverted image.\n \"\"\"\n return np.full_like(img, 255) - img\n@PIPELINES.register()\nclass Flip:\n \"\"\"Flip the input images with a probability.\n Reverse the order of elements in the given imgs with a specific direction.\n The shape of the imgs is preserved, but the elements are reordered.\n Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:572-609" + }, + "5193": { + "file_id": 437, + "content": "The provided code contains three functions: `inplace_flip`, `iminvert`, and a pipeline class called `Flip`. \n\n`inplace_flip` takes an image (`ndarray`) and the direction for flipping (horizontal, vertical or diagonal), asserts that the direction is valid, and returns the flipped image in-place. 
If the direction is horizontal, it uses `cv2.flip()` with parameter 1 to flip horizontally; if the direction is vertical, it uses `cv2.flip()` with parameter 0 to flip vertically; if the direction is diagonal, it uses `cv2.flip()` with parameter -1 for diagonal flipping.\n\n`iminvert` takes an image (`ndarray`) and returns its negative (inverted) version by subtracting the original image from a numpy array of full value 255 (the maximum possible value for an 8-bit image). This effectively reverses all pixel intensities in the image.\n\nThe `Flip` class is a pipeline module that flips the input images with a certain probability. It requires keys \"imgs\", \"img_shape\", and \"modality\" (although it does not modify them) and adds no new keys.", + "type": "comment" + }, + "5194": { + "file_id": 437, + "content": " keys are \"imgs\", \"lazy\" and \"flip_direction\". Required keys in \"lazy\" is\n None, added or modified key are \"flip\" and \"flip_direction\". The Flip\n augmentation should be placed after any cropping / reshaping augmentations,\n to make sure crop_quadruple is calculated properly.\n Args:\n flip_ratio (float): Probability of implementing flip. Default: 0.5.\n direction (str): Flip imgs horizontally or vertically. Options are\n \"horizontal\" | \"vertical\". Default: \"horizontal\".\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n _directions = ['horizontal', 'vertical']\n def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False):\n if direction not in self._directions:\n raise ValueError(f'Direction {direction} is not supported. '\n f'Currently support ones are {self._directions}')\n self.flip_ratio = flip_ratio\n self.direction = direction\n self.lazy = lazy\n def __call__(self, results):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:610-631" + }, + "5195": { + "file_id": 437, + "content": "This code defines a Flip augmentation class for image processing in PaddleVideo. It takes flip ratio, direction (horizontal or vertical), and lazy operation as parameters. The flip_ratio determines the probability of applying the flip transformation, while direction specifies whether to flip horizontally or vertically. If the 'lazy' parameter is True, the transformation will be applied lazily. 
This augmentation should be placed after cropping/reshaping transformations for proper crop_quadruple calculation.", + "type": "comment" + }, + "5196": { + "file_id": 437, + "content": " \"\"\"Performs the Flip augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n flip = np.random.rand() < self.flip_ratio\n results['flip'] = flip\n results['flip_direction'] = self.direction\n if not self.lazy:\n if flip:\n for i, img in enumerate(results['imgs']):\n imflip_(img, self.direction)\n lt = len(results['imgs'])\n else:\n results['imgs'] = list(results['imgs'])\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Use one Flip please')\n lazyop['flip'] = flip\n lazyop['flip_direction'] = self.direction\n if 'gt_bboxes' in results and flip:\n assert not self.lazy and self.direction == 'horizontal'\n entity_box_flip = EntityBoxFlip(results['img_shape'])", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:632-660" + }, + "5197": { + "file_id": 437, + "content": "The code snippet performs a Flip augmentation on images, randomly flipping them horizontally based on a given flip ratio. It also sets the 'flip' and 'flip_direction' keys in the results dictionary. If the 'lazy' option is not used (self.lazy), it iterates through the images, applying the flip transformation if necessary. It also handles 'gt_bboxes' if they exist in the results dictionary, ensuring horizontal flips are applied correctly without any issues.", + "type": "comment" + }, + "5198": { + "file_id": 437, + "content": " results = entity_box_flip(results)\n return results\n def __repr__(self):\n repr_str = (\n f'{self.__class__.__name__}('\n f'flip_ratio={self.flip_ratio}, direction={self.direction}, '\n f'lazy={self.lazy})')\n return repr_str\ndef imnormalize_(img, mean, std, to_rgb=True):\n \"\"\"Inplace normalize an image with mean and std.\n Args:\n img (ndarray): Image to be normalized.\n mean (ndarray): The mean to be used for normalize.\n std (ndarray): The std to be used for normalize.\n to_rgb (bool): Whether to convert to rgb.\n Returns:\n ndarray: The normalized image.\n \"\"\"\n # cv2 inplace normalization does not accept uint8\n assert img.dtype != np.uint8\n mean = np.float64(mean.reshape(1, -1))\n stdinv = 1 / np.float64(std.reshape(1, -1))\n if to_rgb:\n cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace\n cv2.subtract(img, mean, img) # inplace\n cv2.multiply(img, stdinv, img) # inplace\n return img", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:661-693" + }, + "5199": { + "file_id": 437, + "content": "This code contains a class for image augmentation, including flip ratio and direction, with a method to normalize an image using mean and std values. It also includes an inplace normalization function that converts BGR to RGB if necessary. The `__repr__` method returns a string representation of the class attributes.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/52.json b/docs/data/52.json new file mode 100644 index 000000000..95c1f11cb --- /dev/null +++ b/docs/data/52.json @@ -0,0 +1,545 @@ +{ + "5200": { + "file_id": 437, + "content": "@PIPELINES.register()\nclass Normalize:\n \"\"\"Normalize images with the given mean and std value.\n Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified\n keys are \"imgs\" and \"img_norm_cfg\". 
If modality is 'Flow', additional\n keys \"scale_factor\" is required\n Args:\n mean (Sequence[float]): Mean values of different channels.\n std (Sequence[float]): Std values of different channels.\n to_bgr (bool): Whether to convert channels from RGB to BGR.\n Default: False.\n adjust_magnitude (bool): Indicate whether to adjust the flow magnitude\n on 'scale_factor' when modality is 'Flow'. Default: False.\n \"\"\"\n def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False):\n if not isinstance(mean, Sequence):\n raise TypeError(\n f'Mean must be list, tuple or np.ndarray, but got {type(mean)}')\n if not isinstance(std, Sequence):\n raise TypeError(\n f'Std must be list, tuple or np.ndarray, but got {type(std)}')", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:696-720" + }, + "5201": { + "file_id": 437, + "content": "This code defines a class called \"Normalize\" that normalizes images based on given mean and std values. It can also convert channels from RGB to BGR if necessary. Additionally, it adjusts flow magnitude when modality is 'Flow' with an optional adjust_magnitude parameter. The class requires keys \"imgs\", \"img_shape\", \"modality\" with additional keys \"imgs\" and \"img_norm_cfg\" being added or modified.", + "type": "comment" + }, + "5202": { + "file_id": 437, + "content": " self.mean = np.array(mean, dtype=np.float32)\n self.std = np.array(std, dtype=np.float32)\n self.to_bgr = to_bgr\n self.adjust_magnitude = adjust_magnitude\n def __call__(self, results):\n n = len(results['imgs'])\n h, w, c = results['imgs'][0].shape\n imgs = np.empty((n, h, w, c), dtype=np.float32)\n for i, img in enumerate(results['imgs']):\n imgs[i] = img\n for img in imgs:\n imnormalize_(img, self.mean, self.std, self.to_bgr)\n results['imgs'] = imgs\n results['img_norm_cfg'] = dict(\n mean=self.mean, std=self.std, to_bgr=self.to_bgr)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'mean={self.mean}, '\n f'std={self.std}, '\n f'to_bgr={self.to_bgr}, '\n f'adjust_magnitude={self.adjust_magnitude})')\n return repr_str", + "type": "code", + "location": "/paddlevideo/loader/pipelines/augmentations_ava.py:722-749" + }, + "5203": { + "file_id": 437, + "content": "This code defines an augmentation pipeline for image normalization in AVA. It initializes mean, std, and to_bgr values, and then applies the normalization transformation to each input image. The normalized images are stored in 'imgs' and the configuration is saved in 'img_norm_cfg'. The __repr__ method provides a string representation of the object's state.", + "type": "comment" + }, + "5204": { + "file_id": 438, + "content": "/paddlevideo/loader/pipelines/compose.py", + "type": "filepath" + }, + "5205": { + "file_id": 438, + "content": "The Compose class combines registry-based pipeline components like decode functions, sample functions, and transforms to apply transformations flexibly on dictionary or list inputs. It includes a workaround for old format configuration files.", + "type": "summary" + }, + "5206": { + "file_id": 438, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Sequence\nfrom ..registry import PIPELINES\nimport traceback\nfrom ...utils import build\nfrom ...utils import get_logger\n@PIPELINES.register()\nclass Compose(object):\n \"\"\"\n Composes several pipelines(include decode func, sample func, and transforms) together.\n Note: To deal with ```list``` type cfg temporaray, like:\n transform:\n - Crop: # A list\n attribute: 10", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:1-31" + }, + "5207": { + "file_id": 438, + "content": "This code defines the Compose class, which composes multiple pipelines such as decode functions, sample functions, and transforms. It uses the PIPELINES registry for registration and builds pipelines based on input configurations. The code also handles temporary list-type configuration for flexibility.", + "type": "comment" + }, + "5208": { + "file_id": 438, + "content": " - Resize: # A list\n attribute: 20\n every key of list will pass as the key name to build a module.\n XXX: will be improved in the future.\n Args:\n pipelines (list): List of transforms to compose.\n Returns:\n A compose object which is callable, __call__ for this Compose\n object will call each given :attr:`transforms` sequencely.\n \"\"\"\n def __init__(self, pipelines):\n #assert isinstance(pipelines, Sequence)\n self.pipelines = []\n for p in pipelines.values():\n if isinstance(p, dict):\n p = build(p, PIPELINES)\n self.pipelines.append(p)\n elif isinstance(p, list):\n for t in p:\n #XXX: to deal with old format cfg, ugly code here!\n temp_dict = dict(name=list(t.keys())[0])\n for all_sub_t in t.values():\n if all_sub_t is not None:\n temp_dict.update(all_sub_t) \n t = build(temp_dict, PIPELINES)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:32-59" + }, + "5209": { + "file_id": 438, + "content": "The code defines a Compose class that takes a list of transforms and composes them sequentially. It checks if the input is a dictionary or a list, builds the transform modules using build function from PIPELINES, and appends them to the pipelines list. 
The code also includes an ugly workaround for dealing with old format configuration files.", + "type": "comment" + }, + "5210": { + "file_id": 438, + "content": " self.pipelines.append(t)\n elif callable(p):\n self.pipelines.append(p)\n else:\n raise TypeError(f'pipelines must be callable or a dict,'\n f'but got {type(p)}')\n def __call__(self, data):\n for p in self.pipelines:\n try:\n data = p(data)\n except Exception as e:\n stack_info = traceback.format_exc()\n logger = get_logger(\"paddlevideo\")\n logger.info(\"fail to perform transform [{}] with error: \"\n \"{} and stack:\\n{}\".format(p, e, str(stack_info)))\n raise e\n return data", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:60-76" + }, + "5211": { + "file_id": 438, + "content": "The code defines a class with a `__call__` method and an append function for adding pipelines. The `__call__` method applies transformations to input data by iterating over the pipelines. If any pipeline fails, it logs the error and raises an exception. Pipelines can be either callable or dictionaries, but if not, a TypeError is raised.", + "type": "comment" + }, + "5212": { + "file_id": 439, + "content": "/paddlevideo/loader/pipelines/decode.py", + "type": "filepath" + }, + "5213": { + "file_id": 439, + "content": "This code utilizes PaddleVideo's TimeSformer model for video processing, including a VideoDecoder class to decode mp4 files and handle varying durations. The \"ActionFeatureDecoder\" class handles feature decoding, while the function prepares data for model input and normalizes inputs for PaddlePaddle's video pipeline.", + "type": "summary" + }, + "5214": { + "file_id": 439, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\ntry:\n import av\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models.\"\n )\nimport cv2\nimport pickle\nimport decord as de\nimport math\nimport random\nfrom ..registry import PIPELINES\ndef get_start_end_idx(video_size, clip_size, clip_idx, num_clips):\n delta = max(video_size - clip_size, 0)\n if clip_idx == -1: # here", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:1-32" + }, + "5215": { + "file_id": 439, + "content": "This code snippet is part of the PaddleVideo library and it appears to import various packages, define a function \"get_start_end_idx\", and register something into the PIPELINES registry. It seems to handle video clip processing for TimeSformer and other models. 
The function calculates start and end indices for video clips based on video size, clip size, clip index, and the total number of clips.", + "type": "comment" + }, + "5216": { + "file_id": 439, + "content": " # Random temporal sampling.\n start_idx = random.uniform(0, delta)\n else: # ignore\n # Uniformly sample the clip with the given index.\n start_idx = delta * clip_idx / num_clips\n end_idx = start_idx + clip_size - 1\n return start_idx, end_idx\n@PIPELINES.register()\nclass VideoDecoder(object):\n \"\"\"\n Decode mp4 file to frames.\n Args:\n filepath: the file path of mp4 file\n \"\"\"\n def __init__(self,\n backend='cv2',\n mode='train',\n sampling_rate=32,\n num_seg=8,\n num_clips=1,\n target_fps=30):\n self.backend = backend\n # params below only for TimeSformer\n self.mode = mode\n self.sampling_rate = sampling_rate\n self.num_seg = num_seg\n self.num_clips = num_clips\n self.target_fps = target_fps\n def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations.\n return:\n List where each item is a numpy array after decoder.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:33-69" + }, + "5217": { + "file_id": 439, + "content": "This code defines a VideoDecoder class for decoding mp4 files to frames. It takes the file path as input and has additional parameters for time-series applications like TimeSformer. The __call__ method performs the decoding operation, returning a list of numpy arrays representing the decoded frames.", + "type": "comment" + }, + "5218": { + "file_id": 439, + "content": " \"\"\"\n file_path = results['filename']\n results['format'] = 'video'\n results['backend'] = self.backend\n if self.backend == 'cv2':\n cap = cv2.VideoCapture(file_path)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n sampledFrames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty\n if ret == False:\n continue\n img = frame[:, :, ::-1]\n sampledFrames.append(img)\n results['frames'] = sampledFrames\n results['frames_len'] = len(sampledFrames)\n elif self.backend == 'decord':\n container = de.VideoReader(file_path)\n frames_len = len(container)\n results['frames'] = container\n results['frames_len'] = frames_len\n elif self.backend == 'pyav': # for TimeSformer\n if self.mode in [\"train\", \"valid\"]:\n clip_idx = -1\n elif self.mode in [\"test\"]:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:70-98" + }, + "5219": { + "file_id": 439, + "content": "This code is part of a video decoding pipeline. It checks the backend and decodes videos using either cv2, decord or pyav depending on the backend specified. 
It reads frames from the video and stores them in 'results' dictionary for further processing.", + "type": "comment" + }, + "5220": { + "file_id": 439, + "content": " clip_idx = 0\n else:\n raise NotImplementedError\n container = av.open(file_path)\n num_clips = 1 # always be 1\n # decode process\n fps = float(container.streams.video[0].average_rate)\n frames_length = container.streams.video[0].frames\n duration = container.streams.video[0].duration\n if duration is None:\n # If failed to fetch the decoding information, decode the entire video.\n decode_all_video = True\n video_start_pts, video_end_pts = 0, math.inf\n else:\n decode_all_video = False\n start_idx, end_idx = get_start_end_idx(\n frames_length,\n self.sampling_rate * self.num_seg / self.target_fps * fps,\n clip_idx, num_clips)\n timebase = duration / frames_length\n video_start_pts = int(start_idx * timebase)\n video_end_pts = int(end_idx * timebase)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:99-125" + }, + "5221": { + "file_id": 439, + "content": "This code checks if the duration of a video file is None. If it is, it sets decode_all_video to True and calculates video_start_pts and video_end_pts as 0 and infinity respectively. If the duration is not None, it calculates start and end indices for decoding specific clips from the video file.", + "type": "comment" + }, + "5222": { + "file_id": 439, + "content": " frames = None\n # If video stream was found, fetch video frames from the video.\n if container.streams.video:\n margin = 1024\n seek_offset = max(video_start_pts - margin, 0)\n container.seek(seek_offset,\n any_frame=False,\n backward=True,\n stream=container.streams.video[0])\n tmp_frames = {}\n buffer_count = 0\n max_pts = 0\n for frame in container.decode(**{\"video\": 0}):\n max_pts = max(max_pts, frame.pts)\n if frame.pts < video_start_pts:\n continue\n if frame.pts <= video_end_pts:\n tmp_frames[frame.pts] = frame\n else:\n buffer_count += 1\n tmp_frames[frame.pts] = frame\n if buffer_count >= 0:\n break", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:127-150" + }, + "5223": { + "file_id": 439, + "content": "This code snippet is part of a video decoding pipeline in PaddleVideo. It seeks to a specific start time of the video stream, then decodes and filters frames based on their start and end points. Frames before the start point are skipped, while frames after the end point are buffered. 
Finally, it stores the relevant frames in the \"tmp\\_frames\" dictionary.", + "type": "comment" + }, + "5224": { + "file_id": 439, + "content": " video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]\n container.close()\n frames = [frame.to_rgb().to_ndarray() for frame in video_frames]\n clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps\n start_idx, end_idx = get_start_end_idx(\n len(frames), # frame_len\n clip_sz,\n clip_idx if decode_all_video else\n 0, # If decode all video, -1 in train and valid, 0 in test;\n # else, always 0 in train, valid and test, as we has selected clip size frames when decode.\n 1)\n results['frames'] = frames\n results['frames_len'] = len(frames)\n results['start_idx'] = start_idx\n results['end_idx'] = end_idx\n else:\n raise NotImplementedError\n return results\n@PIPELINES.register()\nclass FrameDecoder(object):\n \"\"\"just parse results\n \"\"\"", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:151-177" + }, + "5225": { + "file_id": 439, + "content": "This code extracts video frames, sorts them by timestamp, and then converts the frames to RGB format. It calculates the start and end indices for a given clip size based on the number of frames and the selected clip size. The results are stored in a dictionary along with additional information such as frame length and indices. If no code is provided for the \"else\" condition, a NotImplementedError will be raised. This class is registered as a pipeline using @PIPELINES.register().", + "type": "comment" + }, + "5226": { + "file_id": 439, + "content": " def __init__(self):\n pass\n def __call__(self, results):\n results['format'] = 'frame'\n return results\n@PIPELINES.register()\nclass MRIDecoder(object):\n \"\"\"just parse results\n \"\"\"\n def __init__(self):\n pass\n def __call__(self, results):\n results['format'] = 'MRI'\n return results\n@PIPELINES.register()\nclass FeatureDecoder(object):\n \"\"\"\n Perform feature decode operations.e.g.youtube8m\n \"\"\"\n def __init__(self, num_classes, max_len=512, has_label=True):\n self.max_len = max_len\n self.num_classes = num_classes\n self.has_label = has_label\n def __call__(self, results):\n \"\"\"\n Perform feature decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n #1. load pkl\n #2. parse to rgb/audio/\n #3. padding\n filepath = results['filename']\n data = pickle.load(open(filepath, 'rb'), encoding='bytes')\n record = data\n nframes = record['nframes'] if 'nframes' in record else record[", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:178-222" + }, + "5227": { + "file_id": 439, + "content": "The code defines three pipeline classes for decoding different types of data. The MRIDecoder class sets the format to 'MRI'. 
The FeatureDecoder class initializes with parameters num_classes, max_len and has_label, then performs feature decode operations on loaded pkl files, parsing them into RGB/audio format, padding as necessary, and returning a list of numpy arrays.", + "type": "comment" + }, + "5228": { + "file_id": 439, + "content": " b'nframes']\n rgb = record['feature'].astype(\n float) if 'feature' in record else record[b'feature'].astype(float)\n audio = record['audio'].astype(\n float) if 'audio' in record else record[b'audio'].astype(float)\n if self.has_label:\n label = record['label'] if 'label' in record else record[b'label']\n one_hot_label = self.make_one_hot(label, self.num_classes)\n rgb = rgb[0:nframes, :]\n audio = audio[0:nframes, :]\n rgb = self.dequantize(rgb,\n max_quantized_value=2.,\n min_quantized_value=-2.)\n audio = self.dequantize(audio,\n max_quantized_value=2,\n min_quantized_value=-2)\n if self.has_label:\n results['labels'] = one_hot_label.astype(\"float32\")\n feat_pad_list = []\n feat_len_list = []\n mask_list = []\n vitem = [rgb, audio]\n for vi in range(2): #rgb and audio", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:223-249" + }, + "5229": { + "file_id": 439, + "content": "This code is preparing data for a model. It loads the 'feature' and 'audio' from the record if available, converts them to float type, and cuts them up to the specified number of frames (nframes). If labels are present in the record, it makes one-hot encoding out of them. The data is then dequantized using a method, and results are stored into the 'labels' variable. Finally, three lists (feat_pad_list, feat_len_list, mask_list) are initialized for further data processing. The code handles two types of data: 'feature' and 'audio', iterating over them in a range loop.", + "type": "comment" + }, + "5230": { + "file_id": 439, + "content": " if vi == 0:\n prefix = \"rgb_\"\n else:\n prefix = \"audio_\"\n feat = vitem[vi]\n results[prefix + 'len'] = feat.shape[0]\n #feat pad step 1. padding\n feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),\n dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n results[prefix + 'data'] = feat_pad.astype(\"float32\")\n #feat pad step 2. mask\n feat_mask_origin = np.ones(feat.shape, dtype=np.float32)\n feat_mask_add = feat_add\n feat_mask = np.concatenate((feat_mask_origin, feat_mask_add),\n axis=0)\n results[prefix + 'mask'] = feat_mask.astype(\"float32\")\n return results\n def dequantize(self,\n feat_vector,\n max_quantized_value=2.,\n min_quantized_value=-2.):\n \"\"\"\n Dequantize the feature from the byte format to the float format", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:250-275" + }, + "5231": { + "file_id": 439, + "content": "This function pads and dequantizes video features for model input. It first checks the type of feature (video or audio) and prepends 'rgb_' or 'audio_' to the result keys accordingly. 
Then it pads the feature with zeros to match the max length, creates a mask for the padded feature, and dequantizes the feature from byte format to float format.", + "type": "comment" + }, + "5232": { + "file_id": 439, + "content": " \"\"\"\n assert max_quantized_value > min_quantized_value\n quantized_range = max_quantized_value - min_quantized_value\n scalar = quantized_range / 255.0\n bias = (quantized_range / 512.0) + min_quantized_value\n return feat_vector * scalar + bias\n def make_one_hot(self, label, dim=3862):\n one_hot_label = np.zeros(dim)\n one_hot_label = one_hot_label.astype(float)\n for ind in label:\n one_hot_label[int(ind)] = 1\n return one_hot_label\n@PIPELINES.register()\nclass ActionFeatureDecoder(object):\n \"\"\"\n Perform feature decode operations on footballaction\n \"\"\"\n def __init__(self, num_classes, max_len=512, has_label=True):\n self.max_len = max_len\n self.num_classes = num_classes\n self.has_label = has_label\n def __call__(self, results):\n \"\"\"\n Perform feature decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n #1. load pkl\n #2. parse to rgb/audio/", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:276-310" + }, + "5233": { + "file_id": 439, + "content": "The code defines a class called \"ActionFeatureDecoder\" for feature decoding operations in football actions. It initializes with parameters for the maximum length, number of classes, and whether or not it should handle labels. The __call__ method performs the decoding operation on input results and returns a list of numpy arrays after decoding.", + "type": "comment" + }, + "5234": { + "file_id": 439, + "content": " #3. padding\n filepath = results['filename']\n data = pickle.load(open(filepath, 'rb'), encoding='bytes')\n pkl_data = data\n rgb = pkl_data['image_feature'].astype(float)\n audio = pkl_data['audio_feature'].astype(float)\n label_id_info = pkl_data['label_info']\n label_cls = [label_id_info['label']]\n label_one = int(label_cls[0])\n if len(label_cls) > 1:\n label_index = random.randint(0, 1)\n label_one = int(label_cls[label_index])\n iou_norm = float(label_id_info['norm_iou'])\n results['labels'] = np.array([label_one])\n results['iou_norm'] = float(iou_norm)\n vitem = [rgb, audio]\n for vi in range(2): #rgb and audio\n if vi == 0:\n prefix = \"rgb_\"\n else:\n prefix = \"audio_\"\n feat = vitem[vi]\n results[prefix + 'len'] = feat.shape[0]\n #feat pad step 1. padding\n feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:311-338" + }, + "5235": { + "file_id": 439, + "content": "The code is reading a pickle file, extracting the rgb image and audio features, label information, and performing some data manipulations. It sets the label to either 0 or 1 randomly if there's more than one in the data, normalizes iou values, and adds padding to the data for further processing. This is used in a video processing pipeline for PaddlePaddle.", + "type": "comment" + }, + "5236": { + "file_id": 439, + "content": " dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n results[prefix + 'data'] = feat_pad.astype(\"float32\")\n #feat pad step 2. 
mask\n feat_mask_origin = np.ones(feat.shape, dtype=np.float32)\n feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0)\n results[prefix + 'mask'] = feat_mask.astype(\"float32\")\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode.py:339-347" + }, + "5237": { + "file_id": 439, + "content": "This code pads the feature data and its corresponding mask for a PaddleVideo pipeline, concatenating them and casting the results to float32 type before storing in the 'results' dictionary.", + "type": "comment" + }, + "5238": { + "file_id": 440, + "content": "/paddlevideo/loader/pipelines/decode_image.py", + "type": "filepath" + }, + "5239": { + "file_id": 440, + "content": "The PaddleVideo class in PaddlePipelines uses PIL and skimage for image decoding operations. It accepts parameters such as scales, side map, and backend. This class can be used with datasets like KITTI and KITTI ODOM, supporting the retrieval of image paths and resizing of depth images. The code organizes results into a dictionary structure, processes image data based on 'train' or 'val', retrieves color images, adjusts intrinsics for depth estimation, stores results in the 'imgs' dictionary, and adds processed 'imgs' to 'results'.", + "type": "summary" + }, + "5240": { + "file_id": 440, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport numpy as np\nimport PIL.Image as pil\ntry:\n import skimage.transform\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS.\"\n )\nfrom PIL import Image\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass ImageDecoder(object):\n \"\"\"Decode Image\n \"\"\"\n def __init__(self,\n dataset,\n frame_idxs,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_image.py:1-37" + }, + "5241": { + "file_id": 440, + "content": "This code is a Python class for decoding images, registered with the PADDLEPIPELINES module. It uses the PIL and skimage libraries, and is part of the PaddleVideo package in PaddlePaddle. 
The class takes in a dataset and frame_idxs as parameters for image decoding operations.", + "type": "comment" + }, + "5242": { + "file_id": 440, + "content": " num_scales,\n side_map,\n full_res_shape,\n img_ext,\n backend='cv2'):\n self.backend = backend\n self.dataset = dataset\n self.frame_idxs = frame_idxs\n self.num_scales = num_scales\n self.side_map = side_map\n self.full_res_shape = full_res_shape\n self.img_ext = img_ext\n def _pil_loader(self, path):\n with open(path, 'rb') as f:\n with Image.open(f) as img:\n return img.convert('RGB')\n def get_color(self, folder, frame_index, side):\n color = self._pil_loader(\n self.get_image_path(self.dataset, folder, frame_index, side))\n return color\n def get_image_path(self, dataset, folder, frame_index, side):\n if dataset == \"kitti\":\n f_str = \"{:010d}{}\".format(frame_index, self.img_ext)\n image_path = os.path.join(self.data_path, folder, f_str)\n elif dataset == \"kitti_odom\":\n f_str = \"{:06d}{}\".format(frame_index, self.img_ext)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_image.py:38-66" + }, + "5243": { + "file_id": 440, + "content": "This code defines a class for image decoding pipelines, accepting parameters such as number of scales, side map, full resolution shape, image extension, and backend. It also includes methods for loading images using the PIL library and retrieving image paths based on the dataset. The class is intended to be used for decoding images from specific datasets like KITTI and KITTI ODOM.", + "type": "comment" + }, + "5244": { + "file_id": 440, + "content": " image_path = os.path.join(self.data_path,\n \"sequences/{:02d}\".format(int(folder)),\n \"image_{}\".format(self.side_map[side]),\n f_str)\n elif dataset == \"kitti_depth\":\n f_str = \"{:010d}{}\".format(frame_index, self.img_ext)\n image_path = os.path.join(\n self.data_path, folder,\n \"image_0{}/data\".format(self.side_map[side]), f_str)\n return image_path\n def get_depth(self, dataset, folder, frame_index, side):\n if dataset == \"kitii_depth\":\n f_str = \"{:010d}.png\".format(frame_index)\n depth_path = os.path.join(\n self.data_path, folder,\n \"proj_depth/groundtruth/image_0{}\".format(self.side_map[side]),\n f_str)\n depth_gt = pil.open(depth_path)\n depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST)\n depth_gt = np.array(depth_gt).astype(np.float32) / 256", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_image.py:67-89" + }, + "5245": { + "file_id": 440, + "content": "This code defines a class with two methods: `get_image_path` and `get_depth`. The first method returns the path of an image based on the dataset, folder, frame index, and side. If the dataset is \"kitti_depth\", it constructs the path using frame index and extension. The second method retrieves depth data for a given dataset, folder, frame index, and side. It uses the \"kitii_depth\" dataset, constructs the path to the depth file, opens the image, resizes it, and converts it into a float32 array divided by 256.", + "type": "comment" + }, + "5246": { + "file_id": 440, + "content": " else:\n f_str = \"{:010d}{}\".format(frame_index, self.img_ext)\n depth_path = os.path.join(self.data_path, folder + '_gt', f_str)\n img_file = Image.open(depth_path)\n depth_png = np.array(img_file, dtype=int)\n img_file.close()\n # make sure we have a proper 16bit depth map here.. 
not 8bit!\n assert np.max(depth_png) > 255, \\\n \"np.max(depth_png)={}, path={}\".format(np.max(depth_png), depth_path)\n depth_gt = depth_png.astype(np.float) / 256.\n depth_gt = depth_gt[160:960 - 160, :]\n depth_gt = skimage.transform.resize(depth_gt,\n self.full_res_shape[::-1],\n order=0,\n preserve_range=True,\n mode='constant')\n return depth_gt\n def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_image.py:91-116" + }, + "5247": { + "file_id": 440, + "content": "This function reads the depth image from file and resizes it to the desired shape, ensuring that it is a 16-bit depth map. It also checks if the maximum value exceeds 255, asserting that this is not an 8-bit image. The final output is a resized depth_gt with dimensions self.full_res_shape[::-1]. The function returns this resized depth_gt after performing any necessary operations for mp4 decode operations.", + "type": "comment" + }, + "5248": { + "file_id": 440, + "content": " return:\n List where each item is a numpy array after decoder.\n \"\"\"\n if results.get('mode', None) == 'infer':\n imgs = {}\n imgs[(\"color\", 0,\n -1)] = Image.open(results[\"filename\"]).convert(\"RGB\")\n results['imgs'] = imgs\n return results\n self.data_path = results['data_path']\n results['backend'] = self.backend\n imgs = {}\n results['frame_idxs'] = self.frame_idxs\n results['num_scales'] = self.num_scales\n file_name = results['filename']\n folder = results['folder']\n frame_index = results['frame_index']\n line = file_name.split('/')\n istrain = folder.split('_')[1]\n if 'mode' not in results:\n results['mode'] = istrain\n results['day_or_night'] = folder.split('_')[0]\n if istrain == \"train\":\n if folder[0] == 'd':\n folder2 = folder + '_fake_night'\n flag = 0\n else:\n folder2 = folder + '_fake_day'", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_image.py:117-149" + }, + "5249": { + "file_id": 440, + "content": "This code handles the decoding of images and organizes the results into a dictionary structure. It checks if the mode is set to 'infer', where it opens an image in RGB format, stores it in the results dictionary under 'imgs' key, and returns the results. If the mode is not set or is 'train', it sets up necessary variables for organizing data based on day or night folders and whether the folder is real or fake.", + "type": "comment" + }, + "5250": { + "file_id": 440, + "content": " tmp = folder\n folder = folder2\n folder2 = tmp\n flag = 1\n if len(line) == 3:\n side = line[2]\n else:\n side = None\n results['side'] = side\n for i in self.frame_idxs:\n if i == \"s\":\n other_side = {\"r\": \"l\", \"l\": \"r\"}[side]\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index, other_side)\n imgs[(\"color_n\", i,\n -1)] = self.get_color(folder2, frame_index,\n other_side)\n else:\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index + i, side)\n imgs[(\"color_n\", i,\n -1)] = self.get_color(folder2, frame_index + i, side)\n istrain = folder.split('_')[1]\n if istrain != 'train':\n if flag:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_image.py:150-179" + }, + "5251": { + "file_id": 440, + "content": "This code is setting up image files for decoding from a given folder and folder2 based on the frame indexes. It also considers whether the images are for the left or right side, identified by 'r' and 'l'. The flag variable is used to check if there's a change in side. 
If the folder name does not contain 'train', it executes something else (not shown in this code snippet).", + "type": "comment" + }, + "5252": { + "file_id": 440, + "content": " depth_gt = self.get_depth(folder2, frame_index, side)\n else:\n depth_gt = self.get_depth(folder, frame_index, side)\n imgs[\"depth_gt\"] = np.expand_dims(depth_gt, 0)\n elif istrain == 'val':\n if len(line) == 3:\n side = line[2]\n else:\n side = None\n for i in self.frame_idxs:\n if i == \"s\":\n other_side = {\"r\": \"l\", \"l\": \"r\"}[side]\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index, other_side)\n else:\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index + i, side)\n # adjusting intrinsics to match each scale in the pyramid\n depth_gt = self.get_depth(self.dataset, folder, frame_index, side)\n imgs[\"depth_gt\"] = np.expand_dims(depth_gt, 0)\n results['imgs'] = imgs\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_image.py:180-206" + }, + "5253": { + "file_id": 440, + "content": "The code checks the 'train' or 'val' flag and processes image data accordingly. It retrieves color images based on 'frame_idxs', adjusts intrinsics for depth estimation, and stores results in 'imgs' dictionary. The processed 'imgs' is then added to 'results' before returning it.", + "type": "comment" + }, + "5254": { + "file_id": 441, + "content": "/paddlevideo/loader/pipelines/decode_sampler.py", + "type": "filepath" + }, + "5255": { + "file_id": 441, + "content": "The code imports libraries, defines a DecodeSampler class for video decoding, and initializes parameters. It then decodes video frames, clips the index, retrieves frames, converts them to images using PIL library, and stores the images in a list.", + "type": "summary" + }, + "5256": { + "file_id": 441, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nfrom PIL import Image\nimport decord as de\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass DecodeSampler(object):\n \"\"\"\n We use 'decord' for decode and sampling, which is faster than opencv.\n This is used in slowfast model.\n Args:\n num_frames(int): the number of frames we want to sample.\n sampling_rate(int): sampling rate for video data.\n target_fps(int): desired fps, default 30", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler.py:1-30" + }, + "5257": { + "file_id": 441, + "content": "This code imports necessary libraries, defines the DecodeSampler class for faster decoding and sampling of video data using 'decord', and registers it with the PIPELINES registry. It is used in the slowfast model and takes arguments such as num_frames, sampling_rate, and target_fps.", + "type": "comment" + }, + "5258": { + "file_id": 441, + "content": " test_mode(bool): whether test or train/valid. 
In slowfast, we use multicrop when test.\n \"\"\"\n def __init__(self,\n num_frames,\n sampling_rate,\n default_sampling_rate=2,\n target_fps=30,\n test_mode=False):\n self.num_frames = num_frames\n self.orig_sampling_rate = self.sampling_rate = sampling_rate\n self.default_sampling_rate = default_sampling_rate\n self.target_fps = target_fps\n self.test_mode = test_mode\n def get_start_end_idx(self, video_size, clip_size, clip_idx,\n temporal_num_clips):\n delta = max(video_size - clip_size, 0)\n if not self.test_mode:\n # Random temporal sampling.\n start_idx = random.uniform(0, delta)\n else:\n # Uniformly sample the clip with the given index.\n start_idx = delta * clip_idx / temporal_num_clips\n end_idx = start_idx + clip_size - 1\n return start_idx, end_idx", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler.py:31-55" + }, + "5259": { + "file_id": 441, + "content": "This code initializes a class with parameters for sampling video frames, and determines the start and end indices for each clip based on test mode (random or uniform).", + "type": "comment" + }, + "5260": { + "file_id": 441, + "content": " def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx:\n self.sampling_rate = random.randint(self.default_sampling_rate,\n self.orig_sampling_rate)\n filepath = results['filename']\n temporal_sample_index = results['temporal_sample_index']\n temporal_num_clips = results['temporal_num_clips']\n vr = de.VideoReader(filepath)\n videolen = len(vr)\n fps = vr.get_avg_fps()\n clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps\n start_idx, end_idx = self.get_start_end_idx(videolen, clip_size,\n temporal_sample_index,\n temporal_num_clips)\n index = np.linspace(start_idx, end_idx, self.num_frames).astype(\"int64\")", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler.py:57-81" + }, + "5261": { + "file_id": 441, + "content": "This function performs mp4 decode operations and returns a list of numpy arrays after decoding. It considers the short_cycle_idx to adjust the sampling rate, takes the filepath and temporal parameters from results, initializes a VideoReader object, calculates clip size, gets start and end indices for video clipping based on these values, and finally creates an index list for the decoded frames.", + "type": "comment" + }, + "5262": { + "file_id": 441, + "content": " index = np.clip(index, 0, videolen)\n frames_select = vr.get_batch(index) #1 for buffer\n # dearray_to_img\n np_frames = frames_select.asnumpy()\n frames_select_list = []\n for i in range(np_frames.shape[0]):\n imgbuf = np_frames[i]\n frames_select_list.append(Image.fromarray(imgbuf, mode='RGB'))\n results['imgs'] = frames_select_list\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler.py:82-93" + }, + "5263": { + "file_id": 441, + "content": "This code segment is responsible for decoding and preparing image frames from a video. It clips the index value to ensure it falls within the valid range, retrieves the corresponding batch of frames using get_batch function, converts these frames into an array, and then loops through the array to convert each frame into an image using the PIL library's Image.fromarray method. 
The resulting images are stored in a list which is then assigned to 'results'['imgs'] before the function returns the results.", + "type": "comment" + }, + "5264": { + "file_id": 442, + "content": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py", + "type": "filepath" + }, + "5265": { + "file_id": 442, + "content": "The SFMRI_DecodeSampler class is a tool that decodes and samples MRI frames, creating segments based on sampling indices and handling video length constraints. It calculates offsets for 's' and 'f' frame types, determines average durations per segment, and returns an object containing the frame indices.", + "type": "summary" + }, + "5266": { + "file_id": 442, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport random\nimport numpy as np\nfrom PIL import Image\ntry:\n import SimpleITK as sitk\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care.\"\n )\nimport cv2\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass SFMRI_DecodeSampler(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:1-36" + }, + "5267": { + "file_id": 442, + "content": "This code snippet is a Python class for the SFMRI_DecodeSampler pipeline, which decodes and samples MRI frames. It uses PIL and SimpleITK packages to read images and relies on OpenCV for image processing. The class is registered in the PIPELINES module.", + "type": "comment" + }, + "5268": { + "file_id": 442, + "content": " num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n valid_mode(bool): True or False.\n select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode.\n Returns:\n frames_idx: the index of sampled #frames.\n \"\"\"\n def __init__(self,\n num_seg,\n seg_len,\n valid_mode=False,\n select_left=False,\n dense_sample=False,\n linspace_sample=False):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.valid_mode = valid_mode\n self.select_left = select_left\n self.dense_sample = dense_sample\n self.linspace_sample = linspace_sample\n def _get(self, frames_idx_s, frames_idx_f, results):\n frame_dir = results['frame_dir']\n imgs_s = []\n imgs_f = []\n MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))\n for idx in frames_idx_s:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:37-64" + }, + "5269": { + "file_id": 442, + "content": "This code defines a class with methods for creating segments of frames from an MRI image. The constructor takes arguments for the number of segments, length of each segment, and optional parameters for sampling mode. It returns the indexes of sampled frames in each segment. 
The class also includes a method for getting images from the MRI and storing them.", + "type": "comment" + }, + "5270": { + "file_id": 442, + "content": " item = MRI[idx]\n item = cv2.resize(item, (224, 224))\n imgs_s.append(item)\n for idx in frames_idx_f:\n item = MRI[idx]\n item = cv2.resize(item, (224, 224))\n imgs_f.append(item)\n results['imgs'] = [imgs_s, imgs_f]\n return results\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id.\n \"\"\"\n frames_len = int(results['frames_len'])\n average_dur1 = int(frames_len / self.num_seg[0])\n average_dur2 = int(frames_len / self.num_seg[1])\n frames_idx_s = []\n frames_idx_f = []\n if self.linspace_sample:\n if 'start_idx' in results and 'end_idx' in results:\n offsets_s = np.linspace(results['start_idx'],\n results['end_idx'], self.num_seg[0])\n offsets_f = np.linspace(results['start_idx'],\n results['end_idx'], self.num_seg[1])", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:65-94" + }, + "5271": { + "file_id": 442, + "content": "This code defines a class that takes in MRI data and returns resized images for sampling. It creates two lists, imgs_s and imgs_f, which contain the resized MRI frames. The results dictionary contains these lists under the 'imgs' key. The __call__ method calculates the average duration of each segment based on frames_len, and generates frame indices for each segment using linspace_sample. It does not return any value in this context.", + "type": "comment" + }, + "5272": { + "file_id": 442, + "content": " else:\n offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0])\n offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1])\n offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64)\n offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64)\n frames_idx_s = list(offsets_s)\n frames_idx_f = list(offsets_f)\n return self._get(frames_idx_s, frames_idx_f, results)\n if not self.select_left:\n if self.dense_sample: # For ppTSM\n if not self.valid_mode: # train\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride1 = 64 // self.num_seg[0]\n t_stride2 = 64 // self.num_seg[1]\n start_idx = 0 if sample_pos == 1 else np.random.randint(\n 0, sample_pos - 1)\n offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[0])]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:95-115" + }, + "5273": { + "file_id": 442, + "content": "This code segment handles sampling of frames for video decoding. It sets the offsets for sample positions based on the number of segments specified and ensures they are within the valid frame range. If `select_left` is not set, it further checks if `dense_sample` is enabled in dense sampling mode. 
For ppTSM, it selects a sample position and calculates the corresponding offsets for each segment using the given formulas.", + "type": "comment" + }, + "5274": { + "file_id": 442, + "content": " offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[1])]\n frames_idx_s = offsets_s\n frames_idx_f = offsets_f\n else:\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride1 = 64 // self.num_seg[0]\n t_stride2 = 64 // self.num_seg[1]\n start_list = np.linspace(0,\n sample_pos - 1,\n num=10,\n dtype=int)\n offsets_s = []\n offsets_f = []\n for start_idx in start_list.tolist():\n offsets_s += [\n (idx * t_stride1 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[0])\n ]\n for start_idx in start_list.tolist():", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:116-135" + }, + "5275": { + "file_id": 442, + "content": "This code calculates the sampling indices for both spatial and frequency domains. It creates two lists, frames_idx_s and frames_idx_f, based on the number of segments in each dimension (self.num_seg[0] and self.num_seg[1]). If the video length is less than 64 frames, it sets a smaller sampling range for both domains. The code also includes a backup strategy that uses a list of starting points for sampling if the video length is longer but still shorter than 64 frames.", + "type": "comment" + }, + "5276": { + "file_id": 442, + "content": " offsets_f += [\n (idx * t_stride2 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[1])\n ]\n frames_idx_s = offsets_s\n frames_idx_f = offsets_f\n else:\n for i in range(self.num_seg[0]):\n idx = 0\n if not self.valid_mode:\n if average_dur1 >= self.seg_len:\n idx = random.randint(0, average_dur1 - self.seg_len)\n idx += i * average_dur1\n elif average_dur1 >= 1:\n idx += i * average_dur1\n else:\n idx = i\n else:\n if average_dur1 >= self.seg_len:\n idx = (average_dur1 - 1) // 2\n idx += i * average_dur1\n elif average_dur1 >= 1:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:136-157" + }, + "5277": { + "file_id": 442, + "content": "This code calculates the offsets for segmenting frames and storing them in two lists, `frames_idx_s` and `frames_idx_f`. If `valid_mode` is set, it randomly selects the indices within the constraints of `average_dur1`, otherwise it uses sequential indexing. It also handles cases where `average_dur1` is less than 1 by setting the index to i.", + "type": "comment" + }, + "5278": { + "file_id": 442, + "content": " idx += i * average_dur1\n else:\n idx = i\n for jj in range(idx, idx + self.seg_len):\n frames_idx_s.append(jj)\n for i in range(self.num_seg[1]):\n idx = 0\n if not self.valid_mode:\n if average_dur2 >= self.seg_len:\n idx = random.randint(0, average_dur2 - self.seg_len)\n idx += i * average_dur2\n elif average_dur2 >= 1:\n idx += i * average_dur2\n else:\n idx = i\n else:\n if average_dur2 >= self.seg_len:\n idx = (average_dur2 - 1) // 2\n idx += i * average_dur2\n elif average_dur2 >= 1:\n idx += i * average_dur2\n else:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:158-180" + }, + "5279": { + "file_id": 442, + "content": "Code iterates over frames and segments, assigning frame indices based on valid mode and average durations. If valid mode is off, it determines the idx based on average duration 1 and 2, or if in valid mode, it sets the idx to half the remaining average duration 2 minus 1. 
Finally, it appends frame indices to frames_idx_s list for each segment length.", + "type": "comment" + }, + "5280": { + "file_id": 442, + "content": " idx = i\n for jj in range(idx, idx + self.seg_len):\n frames_idx_f.append(jj)\n return self._get(frames_idx_s, frames_idx_f, results)\n else: # for TSM\n if not self.valid_mode:\n if average_dur2 > 0:\n offsets_s = np.multiply(list(range(\n self.num_seg[0])), average_dur1) + np.random.randint(\n average_dur1, size=self.num_seg[0])\n offsets_f = np.multiply(list(range(\n self.num_seg[1])), average_dur2) + np.random.randint(\n average_dur2, size=self.num_seg[1])\n elif frames_len > self.num_seg[1]:\n offsets_s = np.sort(\n np.random.randint(frames_len, size=self.num_seg[0]))\n offsets_f = np.sort(\n np.random.randint(frames_len, size=self.num_seg[1]))\n else:\n offsets_s = np.zeros(shape=(self.num_seg[0], ))", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:181-203" + }, + "5281": { + "file_id": 442, + "content": "If not in valid mode, if average duration 2 > 0, generate offsets_s and offsets_f for TSM. If frames_len is greater than num_seg[1], randomly select offsets_s and offsets_f. Otherwise, set offsets_s to zeros.", + "type": "comment" + }, + "5282": { + "file_id": 442, + "content": " offsets_f = np.zeros(shape=(self.num_seg[1], ))\n else:\n if frames_len > self.num_seg[1]:\n average_dur_float_s = frames_len / self.num_seg[0]\n offsets_s = np.array([\n int(average_dur_float_s / 2.0 + average_dur_float_s * x)\n for x in range(self.num_seg[0])\n ])\n average_dur_float_f = frames_len / self.num_seg[1]\n offsets_f = np.array([\n int(average_dur_float_f / 2.0 + average_dur_float_f * x)\n for x in range(self.num_seg[1])\n ])\n else:\n offsets_s = np.zeros(shape=(self.num_seg[0], ))\n offsets_f = np.zeros(shape=(self.num_seg[1], ))\n frames_idx_s = list(offsets_s)\n frames_idx_f = list(offsets_f)\n return self._get(frames_idx_s, frames_idx_f, results)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:204-224" + }, + "5283": { + "file_id": 442, + "content": "This code calculates the offsets for segmenting frames into 's' and 'f' types based on the number of segments specified. If the total number of frames is greater than the specified number of segments, it calculates the average duration per segment for both types ('s' and 'f'). It then creates arrays of frame indices for 's' and 'f' frames using these calculated offsets. Finally, it returns an object by calling a method '_get'.", + "type": "comment" + }, + "5284": { + "file_id": 443, + "content": "/paddlevideo/loader/pipelines/mix.py", + "type": "filepath" + }, + "5285": { + "file_id": 443, + "content": "The code introduces a VideoMix operator for data augmentation in image classification tasks, using mixup and cutmix operations with controllable parameters.", + "type": "summary" + }, + "5286": { + "file_id": 443, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass Mixup(object):\n \"\"\"\n Mixup operator.\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/mix.py:1-34" + }, + "5287": { + "file_id": 443, + "content": "Mixup class implements a mixup operator for PaddleVideo. It takes an alpha value as input and ensures it is greater than 0. The __call__ method takes a batch of images and labels, combines them with random weights determined by the alpha value, and returns the mixed up image batch and label batch.", + "type": "comment" + }, + "5288": { + "file_id": 443, + "content": " labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n lams = np.array([lam] * bs, dtype=np.float32)\n imgs = lam * imgs + (1 - lam) * imgs[idx]\n return list(zip(imgs, labels, labels[idx], lams))\n@PIPELINES.register()\nclass Cutmix(object):\n \"\"\" Cutmix operator\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def rand_bbox(self, size, lam):\n \"\"\" rand_bbox \"\"\"\n w = size[2]\n h = size[3]\n cut_rat = np.sqrt(1. - lam)\n cut_w = np.int(w * cut_rat)\n cut_h = np.int(h * cut_rat)\n # uniform\n cx = np.random.randint(w)\n cy = np.random.randint(h)\n bbx1 = np.clip(cx - cut_w // 2, 0, w)\n bby1 = np.clip(cy - cut_h // 2, 0, h)\n bbx2 = np.clip(cx + cut_w // 2, 0, w)\n bby2 = np.clip(cy + cut_h // 2, 0, h)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/mix.py:35-70" + }, + "5289": { + "file_id": 443, + "content": "The code defines a Cutmix class for a mixup operator. It takes an alpha value as input, and randomly generates new images by cutting out a part of the original image and pasting it on top of another image, with alpha value determining the ratio of the two. 
The function rand_bbox is used to determine the dimensions and location of the cutout box.", + "type": "comment" + }, + "5290": { + "file_id": 443, + "content": " return bbx1, bby1, bbx2, bby2\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)\n labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)\n imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]\n lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /\n (imgs.shape[-2] * imgs.shape[-1]))\n lams = np.array([lam] * bs, dtype=np.float32)\n return list(zip(imgs, labels, labels[idx], lams))\n@PIPELINES.register()\nclass VideoMix(object):\n \"\"\"\n VideoMix operator.\n Args:\n cutmix_prob(float): prob choose cutmix\n mixup_alpha(float): alpha for mixup aug\n cutmix_alpha(float): alpha for cutmix aug\n \"\"\"\n def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0):\n assert cutmix_prob > 0., \\\n 'parameter cutmix_prob[%f] should > 0.0' % (cutmix_prob)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/mix.py:72-103" + }, + "5291": { + "file_id": 443, + "content": "This code defines a VideoMix operator that performs data augmentation by either mixing or cutting images from different samples in the batch. The mixup_alpha and cutmix_alpha parameters control the degree of blending between samples, while the cutmix_prob parameter determines the probability of applying the cutmix operation.", + "type": "comment" + }, + "5292": { + "file_id": 443, + "content": " assert mixup_alpha > 0., \\\n 'parameter mixup_alpha[%f] should > 0.0' % (mixup_alpha)\n assert cutmix_alpha > 0., \\\n 'parameter cutmix_alpha[%f] should > 0.0' % (cutmix_alpha)\n self.cutmix_prob = cutmix_prob\n self.mixup = Mixup(mixup_alpha)\n self.cutmix = Cutmix(cutmix_alpha)\n def __call__(self, batch):\n if np.random.random() < self.cutmix_prob:\n return self.cutmix(batch)\n else:\n return self.mixup(batch)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/mix.py:104-116" + }, + "5293": { + "file_id": 443, + "content": "This code asserts that mixup_alpha and cutmix_alpha are greater than 0.0, sets the cutmix_prob, creates Mixup and Cutmix objects with the provided alphas, and defines a __call__ method to randomly choose between applying either Mixup or Cutmix to the batch.", + "type": "comment" + }, + "5294": { + "file_id": 444, + "content": "/paddlevideo/loader/pipelines/multimodal.py", + "type": "filepath" + }, + "5295": { + "file_id": 444, + "content": "The code introduces a new \"FeaturePadding\" class to PaddlePaddle library, handles data preprocessing for multimodal tasks, and provides masking, region selection, and action perturbation functions for PaddleVideo.", + "type": "summary" + }, + "5296": { + "file_id": 444, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nfrom PIL import Image\nimport decord as de\nimport copy\nimport json\nfrom ..registry import PIPELINES\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT.\"\n )\n@PIPELINES.register()\nclass FeaturePadding(object):\n \"\"\"\n Padding feature to target shape.\n \"\"\"", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:1-35" + }, + "5297": { + "file_id": 444, + "content": "This code is part of a PaddlePaddle video analysis library. It registers a new class called \"FeaturePadding\" which performs feature padding to target shape. It imports necessary libraries and packages including decord, PIL, numpy, json, paddlenlp for ActBERT, and the PIPELINES registry.", + "type": "comment" + }, + "5298": { + "file_id": 444, + "content": " def __init__(self, max_region_num=36, max_action_num=5):\n self.max_region_num = max_region_num\n self.max_action_num = max_action_num\n def __call__(self, results):\n \"\"\"\n Padding feature.\n \"\"\"\n pack_feature = results['feature']\n tokenizer = results['tokenizer']\n image_feature_wp, image_target_wp, image_location_wp, \\\n num_boxes, image_h, image_w, image_id, caption, \\\n action_feature_wp, action_target_wp, num_actions = pack_feature\n image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32)\n image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)\n image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)\n action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32)\n action_target = np.zeros((self.max_action_num, ), dtype=np.int64)\n num_boxes = int(num_boxes)\n image_feature[:num_boxes] = image_feature_wp\n image_target[:num_boxes] = image_target_wp", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:36-59" + }, + "5299": { + "file_id": 444, + "content": "This code defines a class with an __init__ method and a __call__ method. The __init__ method initializes the maximum number of regions (36) and actions (5). The __call__ method takes in results as input, including feature packs for image and action data. It pads the features to their maximum allowed dimensions with zeroes if there are less than the specified maximum. 
This is useful for maintaining consistent input sizes in machine learning models.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/53.json b/docs/data/53.json new file mode 100644 index 000000000..97f3120c2 --- /dev/null +++ b/docs/data/53.json @@ -0,0 +1,548 @@ +{ + "5300": { + "file_id": 444, + "content": " image_location[:num_boxes, :4] = image_location_wp\n image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * (\n image_location[:, 2] - image_location[:, 0]) / (float(image_w) *\n float(image_h))\n image_location[:, 0] = image_location[:, 0] / float(image_w)\n image_location[:, 1] = image_location[:, 1] / float(image_h)\n image_location[:, 2] = image_location[:, 2] / float(image_w)\n image_location[:, 3] = image_location[:, 3] / float(image_h)\n image_feature = copy.deepcopy(image_feature)\n image_target = copy.deepcopy(image_target)\n num_actions = int(num_actions)\n action_feature[:num_actions] = action_feature_wp\n action_target[:num_actions] = action_target_wp\n action_feature = copy.deepcopy(action_feature)\n action_target = copy.deepcopy(action_target)\n results = dict(image_feat=image_feature,\n image_target=image_target,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:60-81" + }, + "5301": { + "file_id": 444, + "content": "This code segment is responsible for resizing and normalizing the image and action feature coordinates, as well as deep copying the features. It also initializes the results dictionary with keys for image_feat and image_target. This appears to be part of a data preprocessing step in a multimodal pipeline.", + "type": "comment" + }, + "5302": { + "file_id": 444, + "content": " caption=caption,\n image_loc=image_location,\n num_boxes=int(num_boxes),\n action_feat=action_feature,\n action_target=action_target,\n num_actions=int(num_actions),\n tokenizer=tokenizer)\n return results\n@PIPELINES.register()\nclass RandomCap(object):\n def __init__(self, caption_path):\n \"\"\"\n Random Caption for NSP task\n \"\"\"\n self.caption_path = caption_path\n def select_caption(self, caption):\n captions = caption.split('!')\n rind = random.randint(0, len(captions) - 1)\n caption = captions[rind]\n return caption\n def get_random_caption(self, all_captions):\n num_caps = len(all_captions)\n rand_doc_idx = random.randint(0, num_caps - 1)\n caption = all_captions[rand_doc_idx]\n caption = self.select_caption(caption)\n return caption\n def random_cap(self, caption, all_captions):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:82-113" + }, + "5303": { + "file_id": 444, + "content": "The code defines a pipeline that randomly selects captions for the NSP task. It takes caption paths as input and returns random captions. 
The class has an `__init__` method to initialize the caption path, a `select_caption` method to randomly choose one from multiple captions, a `get_random_caption` method to select a random caption from all provided captions, and finally a `random_cap` method that combines these functionalities.", + "type": "comment" + }, + "5304": { + "file_id": 444, + "content": " if random.random() > 0.5:\n label = 0\n else:\n caption = self.get_random_caption(all_captions)\n label = 1\n return caption, label\n def __call__(self, results):\n caption = results['caption']\n all_captions = list(json.load(open(self.caption_path, 'r')))\n caption = self.select_caption(caption)\n caption, label = self.random_cap(caption, all_captions)\n results['caption'] = caption\n results['is_next'] = label\n return results\n@PIPELINES.register()\nclass Tokenize(object):\n def __init__(self, ):\n \"\"\"\n Tokenize caption\n \"\"\"\n pass\n def __call__(self, results):\n caption = results['caption']\n tokenizer = results['tokenizer']\n tokens_caption = tokenizer.tokenize(caption)\n results['caption'] = tokens_caption\n return results\n@PIPELINES.register()\nclass RandomMask(object):\n def __init__(self,\n max_seq_length=36,\n max_action_length=5,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:114-151" + }, + "5305": { + "file_id": 444, + "content": "The code is part of a multi-modal pipeline, where it randomly generates labels (0 or 1) and selects captions from a list. It also includes classes for tokenizing captions and applying random masks on the text data.", + "type": "comment" + }, + "5306": { + "file_id": 444, + "content": " max_region_length=36):\n self.max_seq_length = max_seq_length\n self.max_action_length = max_action_length\n self.max_region_length = max_region_length\n def get_image_global_feature(self, image_feat, image_loc, image_mask):\n g_image_feat = np.sum(image_feat, axis=0) / np.sum(\n image_mask, axis=0, keepdims=True)\n image_feat = np.concatenate(\n [np.expand_dims(g_image_feat, axis=0), image_feat],\n axis=0).astype(\"float32\")\n g_image_loc = np.array([0, 0, 1, 1, 1]).astype(\"float32\")\n image_loc = np.concatenate(\n [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)\n g_image_mask = np.array([1])\n image_mask = np.concatenate([g_image_mask, image_mask], axis=0)\n return image_feat, image_loc, image_mask\n def _truncate_seq_pair(self, tokens_b, max_length):\n \"\"\"Truncates a sequence pair in place to the maximum length.\n This is a simple heuristic which will always truncate the longer sequence", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:152-175" + }, + "5307": { + "file_id": 444, + "content": "This code defines a class for loading multimodal data, including images and text, into TensorFlow datasets. The constructor takes the maximum sequence length, action length, and region length as arguments. It also includes functions to generate global image features and truncate a sequence pair if they exceed the maximum length.", + "type": "comment" + }, + "5308": { + "file_id": 444, + "content": " one token at a time. 
This makes more sense than truncating an equal percent\n of tokens from each, since if one sequence is very short then each token\n that's truncated likely contains more information than a longer sequence.\n \"\"\"\n while True:\n total_length = len(tokens_b)\n if total_length <= max_length:\n break\n tokens_b.pop()\n def random_word(self, tokens, tokenizer):\n \"\"\"\n Masking some random tokens for Language Model task with probabilities as in the original BERT paper.\n Args:\n tokens: list of str, tokenized sentence.\n tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)\n Return:\n (list of str, list of int), masked tokens and related labels for LM prediction\n \"\"\"\n output_label = []\n for i, token in enumerate(tokens):\n prob = random.random()\n # mask token with 15% probability\n if prob < 0.15:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:176-201" + }, + "5309": { + "file_id": 444, + "content": "The code is implementing a method to mask random tokens in a sentence for Language Model (LM) tasks. It first ensures that all sequences have equal length by truncating one token at a time from the longer sequence, then randomly masks 15% of the tokens in each sequence. The method also includes logic to handle tokenizer and produces masked tokens along with their related labels for LM prediction.", + "type": "comment" + }, + "5310": { + "file_id": 444, + "content": " prob /= 0.15\n # 80% randomly change token to mask token\n if prob < 0.8:\n tokens[i] = \"[MASK]\"\n # 10% randomly change token to random token\n elif prob < 0.9:\n #tok = random.choice(list(tokenizer.vocab.items()))[0]\n tok = tokenizer.vocab.idx_to_token[random.randint(\n 0,\n tokenizer.vocab_size,\n )]\n tokens[i] = tok\n # rest 10% randomly keep current token\n # append current token to output (we will predict these later)\n try:\n output_label.append(tokenizer.vocab[token])\n except KeyError:\n # For unknown words (should not occur with BPE vocab)\n output_label.append(tokenizer.vocab[\"[UNK]\"])\n print(\n \"Cannot find token '{}' in vocab. Using [UNK] insetad\".", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:202-225" + }, + "5311": { + "file_id": 444, + "content": "This code modifies tokens in a given input sequence by randomly replacing them with mask tokens, random tokens from the vocabulary, or keeping them unchanged. The probability of each action is controlled by a variable 'prob', which is normalized to ensure the total probability sums up to 1.0. The resulting modified sequence is appended to 'output_label' for further prediction. 
Additionally, it handles unknown words by replacing them with '[UNK]'.", + "type": "comment" + }, + "5312": { + "file_id": 444, + "content": " format(token))\n else:\n # no masking token (will be ignored by loss function later)\n output_label.append(-1)\n return tokens, output_label\n def random_region(self, image_feat, image_loc, num_boxes):\n output_label = []\n for i in range(num_boxes):\n prob = random.random()\n # mask token with 15% probability\n if prob < 0.15:\n prob /= 0.15\n # 80% randomly change token to mask token\n if prob < 0.9:\n image_feat[i] = 0\n # rest 20% randomly keep current token\n # append current token to output (we will predict these later)\n output_label.append(1)\n else:\n # no masking token (will be ignored by loss function later)\n output_label.append(-1)\n return image_feat, image_loc, output_label\n def random_action(self, action_feat, action_target, num_actions):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:226-255" + }, + "5313": { + "file_id": 444, + "content": "The code defines three functions: \"random_region\", \"mask_token\", and \"random_action\". These functions are responsible for randomly masking tokens, selecting a random region from an image feature map, and randomly perturbing action features respectively. The random_region function masks 15% of the tokens in the input, while the random_action function perturbs 20% of the action features.", + "type": "comment" + }, + "5314": { + "file_id": 444, + "content": " output_label = []\n for i in range(num_actions):\n prob = random.random()\n # mask token with 15% probability\n if prob < 0.15:\n prob /= 0.15\n # 90% randomly change token to mask token\n if prob < 0.9:\n action_feat[i] = 0\n # rest 10% randomly keep current token\n # append current token to output (we will predict these later)\n output_label.append(action_target[i])\n else:\n # no masking token (will be ignored by loss function later)\n output_label.append(-1)\n return action_feat, output_label\n def __call__(self, results):\n caption = results['caption']\n tokenizer = results['tokenizer']\n image_feat = results['image_feat']\n image_loc = results['image_loc']\n num_boxes = results['num_boxes']\n action_feat = results['action_feat']\n action_target = results['action_target']\n num_actions = results['num_actions']", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:256-285" + }, + "5315": { + "file_id": 444, + "content": "This code defines a function that applies random masking to an input sequence of actions. It randomly chooses to either replace 90% of the tokens with mask tokens, keep them unchanged (10%), or ignore them for loss calculation by setting their value to -1. The function takes as input various results from a pipeline and returns the masked action features and labels.", + "type": "comment" + }, + "5316": { + "file_id": 444, + "content": " is_next = results['is_next']\n image_target = results['image_target']\n self._truncate_seq_pair(caption, self.max_seq_length - 2)\n caption, caption_label = self.random_word(caption, tokenizer)\n image_feat, image_loc, image_label = self.random_region(\n image_feat, image_loc, num_boxes)\n action_feat, action_label = self.random_action(action_feat,\n action_target,\n num_actions)\n # concatenate lm labels and account for CLS, SEP, SEP\n lm_label_ids = [-1] + caption_label + [-1]\n # The convention in BERT is:\n # (a) For sequence pairs:\n # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . 
[SEP]\n # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n # (b) For single sequences:\n # tokens: [CLS] the dog is hairy . [SEP]\n # type_ids: 0 0 0 0 0 0 0\n #", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:286-308" + }, + "5317": { + "file_id": 444, + "content": "This code is part of a multimodal pipeline that randomly selects words from the caption, regions from an image, and actions, then concatenates them using BERT's convention for sequence pairs. It also handles truncating the caption and assigning labels to the input features.", + "type": "comment" + }, + "5318": { + "file_id": 444, + "content": " # Where \"type_ids\" are used to indicate whether this is the first\n # sequence or the second sequence. The embedding vectors for `type=0` and\n # `type=1` were learned during pre-training and are added to the wordpiece\n # embedding vector (and position vector). This is not *strictly* necessary\n # since the [SEP] token unambigiously separates the sequences, but it makes\n # it easier for the model to learn the concept of sequences.\n #\n # For classification tasks, the first vector (corresponding to [CLS]) is\n # used as as the \"sentence vector\". Note that this only makes sense because\n # the entire model is fine-tuned.\n tokens = []\n segment_ids = []\n tokens.append(\"[CLS]\")\n segment_ids.append(0)\n for token in caption:\n tokens.append(token)\n segment_ids.append(0)\n tokens.append(\"[SEP]\")\n segment_ids.append(0)\n input_ids = tokenizer.convert_tokens_to_ids(tokens)\n # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:309-333" + }, + "5319": { + "file_id": 444, + "content": "This code prepares input data for a multimodal pipeline in PaddleVideo. It appends special tokens \"[CLS]\" and \"[SEP]\" to the token list, assigns segment ID 0 to all tokens (indicating first sequence), converts tokens to input IDs using tokenizer, and creates a mask with 1 for real tokens and 0 for padding tokens. This allows the model to learn sequences and use the [CLS] vector as a \"sentence vector\" for classification tasks.", + "type": "comment" + }, + "5320": { + "file_id": 444, + "content": " input_mask = [1] * (len(input_ids))\n image_mask = [1] * (num_boxes)\n action_mask = [1] * (num_actions)\n # Zero-pad up to the visual sequence length.\n while len(image_mask) < self.max_region_length:\n image_mask.append(0)\n image_label.append(-1)\n while len(action_mask) < self.max_action_length:\n action_mask.append(0)\n action_label.append(-1)\n # Zero-pad up to the sequence length.\n while len(input_ids) < self.max_seq_length:\n input_ids.append(0)\n input_mask.append(0)\n segment_ids.append(0)\n lm_label_ids.append(-1)\n assert len(input_ids) == self.max_seq_length\n assert len(input_mask) == self.max_seq_length\n assert len(segment_ids) == self.max_seq_length\n assert len(lm_label_ids) == self.max_seq_length\n assert len(image_mask) == self.max_region_length\n assert len(image_label) == self.max_region_length\n assert len(action_mask) == self.max_action_length", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:334-359" + }, + "5321": { + "file_id": 444, + "content": "Zero-padding visual, action, and input sequences to the maximum lengths. 
Asserting that all lists are of equal length after padding and match their respective max lengths.", + "type": "comment" + }, + "5322": { + "file_id": 444, + "content": " assert len(action_label) == self.max_action_length\n image_feat, image_loc, image_mask = self.get_image_global_feature(\n image_feat, image_loc, np.array(image_mask))\n features = [\n np.array(input_ids),\n action_feat,\n image_feat,\n image_loc,\n np.array(segment_ids),\n np.array(input_mask),\n image_mask,\n np.array(action_mask),\n np.array(lm_label_ids),\n np.array(action_label),\n np.array(is_next),\n np.array(image_label),\n image_target,\n ]\n results['features'] = features\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/multimodal.py:360-380" + }, + "5323": { + "file_id": 444, + "content": "This code snippet is part of a pipeline function that asserts the length of 'action_label' matches the maximum allowed action length. It then calls another function to get global image features, and forms a list of feature arrays including input ids, action feature, image feature, location, segment ids, input mask, image mask, action label, lm_label_ids, is_next, image label, and image target. The results dictionary is updated with these features before the function returns.", + "type": "comment" + }, + "5324": { + "file_id": 445, + "content": "/paddlevideo/loader/pipelines/sample.py", + "type": "filepath" + }, + "5325": { + "file_id": 445, + "content": "The code utilizes PaddleVideo's image processing pipeline for efficient frame sampling, defines a sampler class for video decoding and data conversion, and calculates sampling positions, offsets, and generates frame indices for video sequences.", + "type": "summary" + }, + "5326": { + "file_id": 445, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport random\nimport numpy as np\nfrom PIL import Image\ntry:\n import SimpleITK as sitk\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care.\"\n )\nimport cv2\nfrom ..registry import PIPELINES\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\n@PIPELINES.register()", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:1-38" + }, + "5327": { + "file_id": 445, + "content": "This code is a Python module that imports various libraries and defines an image processing pipeline for PaddleVideo. 
It checks if SimpleITK is installed, handles pickling, and registers the pipeline using PaddleVideo's registry.", + "type": "comment" + }, + "5328": { + "file_id": 445, + "content": "class Sampler(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n valid_mode(bool): True or False.\n select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode.\n Returns:\n frames_idx: the index of sampled #frames.\n \"\"\"\n def __init__(self,\n num_seg,\n seg_len,\n frame_interval=None,\n valid_mode=False,\n select_left=False,\n dense_sample=False,\n linspace_sample=False,\n use_pil=True):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.frame_interval = frame_interval\n self.valid_mode = valid_mode\n self.select_left = select_left\n self.dense_sample = dense_sample\n self.linspace_sample = linspace_sample", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:39-66" + }, + "5329": { + "file_id": 445, + "content": "The `Sampler` class is used to sample frames based on various parameters such as number of segments, length of each segment, frame interval, valid mode, select left flag and whether to use PIL for reading images. It returns the index of sampled frames.", + "type": "comment" + }, + "5330": { + "file_id": 445, + "content": " self.use_pil = use_pil\n def _get(self, frames_idx, results):\n data_format = results['format']\n if data_format == \"frame\":\n frame_dir = results['frame_dir']\n imgs = []\n for idx in frames_idx:\n img = Image.open(\n os.path.join(frame_dir,\n results['suffix'].format(idx))).convert('RGB')\n imgs.append(img)\n elif data_format == \"MRI\":\n frame_dir = results['frame_dir']\n imgs = []\n MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))\n for idx in frames_idx:\n item = MRI[idx]\n item = cv2.resize(item, (224, 224))\n imgs.append(item)\n elif data_format == \"video\":\n if results['backend'] == 'cv2':\n frames = np.array(results['frames'])\n imgs = []\n for idx in frames_idx:\n imgbuf = frames[idx]\n img = Image.fromarray(imgbuf, mode='RGB')", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:67-96" + }, + "5331": { + "file_id": 445, + "content": "The code defines a class with an attribute 'use_pil' that determines the image format. The '_get' method retrieves frames based on data format (frame, MRI, or video), applies necessary conversions and resizing, and stores them in 'imgs'. 
It uses different libraries such as Image, sitk, and cv2 for different formats.", + "type": "comment" + }, + "5332": { + "file_id": 445, + "content": " imgs.append(img)\n elif results['backend'] == 'decord':\n container = results['frames']\n if self.use_pil:\n frames_select = container.get_batch(frames_idx)\n # dearray_to_img\n np_frames = frames_select.asnumpy()\n imgs = []\n for i in range(np_frames.shape[0]):\n imgbuf = np_frames[i]\n imgs.append(Image.fromarray(imgbuf, mode='RGB'))\n else:\n if frames_idx.ndim != 1:\n frames_idx = np.squeeze(frames_idx)\n frame_dict = {\n idx: container[idx].asnumpy()\n for idx in np.unique(frames_idx)\n }\n imgs = [frame_dict[idx] for idx in frames_idx]\n elif results['backend'] == 'pyav':\n imgs = []\n frames = np.array(results['frames'])\n for idx in frames_idx:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:97-119" + }, + "5333": { + "file_id": 445, + "content": "Code is handling video decoding using different backends such as 'opencv', 'decord', and 'pyav'. It appends the frames to imgs list, converts numpy array to image using Image.fromarray method for 'decord' backend, and handles frame indexing and data structures based on backend used.", + "type": "comment" + }, + "5334": { + "file_id": 445, + "content": " if self.dense_sample:\n idx = idx - 1\n imgbuf = frames[idx]\n imgs.append(imgbuf)\n imgs = np.stack(imgs) # thwc\n else:\n raise NotImplementedError\n else:\n raise NotImplementedError\n results['imgs'] = imgs\n return results\n def _get_train_clips(self, num_frames):\n ori_seg_len = self.seg_len * self.frame_interval\n avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg\n if avg_interval > 0:\n base_offsets = np.arange(self.num_seg) * avg_interval\n clip_offsets = base_offsets + np.random.randint(avg_interval,\n size=self.num_seg)\n elif num_frames > max(self.num_seg, ori_seg_len):\n clip_offsets = np.sort(\n np.random.randint(num_frames - ori_seg_len + 1,\n size=self.num_seg))\n elif avg_interval == 0:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:120-144" + }, + "5335": { + "file_id": 445, + "content": "This code snippet is responsible for sampling frames from a video sequence, and it handles different scenarios based on the input parameters. If `dense_sample` is True, it adjusts the index before accessing the frame. The frames are then appended to a list called `imgs`. If neither of the else conditions are met, it raises a `NotImplementedError`. 
The function also includes another method, `_get_train_clips`, which calculates clip offsets for training purposes based on the number of frames and other parameters.", + "type": "comment" + }, + "5336": { + "file_id": 445, + "content": " ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg\n clip_offsets = np.around(np.arange(self.num_seg) * ratio)\n else:\n clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)\n return clip_offsets\n def _get_test_clips(self, num_frames):\n ori_seg_len = self.seg_len * self.frame_interval\n avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)\n if num_frames > ori_seg_len - 1:\n base_offsets = np.arange(self.num_seg) * avg_interval\n clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int)\n else:\n clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)\n return clip_offsets\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id.\n \"\"\"\n frames_len = int(results['frames_len'])\n frames_idx = []\n if self.frame_interval is not None:\n assert isinstance(self.frame_interval, int)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:145-171" + }, + "5337": { + "file_id": 445, + "content": "The code defines a class with methods to determine clip offsets based on the number of frames and segment length. If the number of frames exceeds the original segment length, it calculates clip offsets for each segment. Otherwise, it sets all clip offsets to zero. The class also has a __call__ method that takes frames length as input and returns sampling indices.", + "type": "comment" + }, + "5338": { + "file_id": 445, + "content": " if not self.valid_mode:\n offsets = self._get_train_clips(frames_len)\n else:\n offsets = self._get_test_clips(frames_len)\n offsets = offsets[:, None] + np.arange(\n self.seg_len)[None, :] * self.frame_interval\n offsets = np.concatenate(offsets)\n offsets = offsets.reshape((-1, self.seg_len))\n offsets = np.mod(offsets, frames_len)\n offsets = np.concatenate(offsets)\n if results['format'] == 'video':\n frames_idx = offsets\n elif results['format'] == 'frame':\n frames_idx = list(offsets + 1)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)\n if self.linspace_sample:\n if 'start_idx' in results and 'end_idx' in results:\n offsets = np.linspace(results['start_idx'], results['end_idx'],\n self.num_seg)\n else:\n offsets = np.linspace(0, frames_len - 1, self.num_seg)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:172-199" + }, + "5339": { + "file_id": 445, + "content": "This code determines the sampling method for frames based on the mode (valid or train) and format ('video' or 'frame'). 
It calculates offsets, handles different formats, and if linspace_sample is True, it generates offsets using linear spacing.", + "type": "comment" + }, + "5340": { + "file_id": 445, + "content": " offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)\n if results['format'] == 'video':\n frames_idx = list(offsets)\n frames_idx = [x % frames_len for x in frames_idx]\n elif results['format'] == 'frame':\n frames_idx = list(offsets + 1)\n elif results['format'] == 'MRI':\n frames_idx = list(offsets)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)\n average_dur = int(frames_len / self.num_seg)\n if not self.select_left:\n if self.dense_sample: # For ppTSM\n if not self.valid_mode: # train\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride = 64 // self.num_seg\n start_idx = 0 if sample_pos == 1 else np.random.randint(\n 0, sample_pos - 1)\n offsets = [(idx * t_stride + start_idx) % frames_len + 1\n for idx in range(self.num_seg)]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:200-223" + }, + "5341": { + "file_id": 445, + "content": "This code segment calculates the frames to sample from a video, based on its format (video/frame/MRI). It also handles dense sampling for ppTSM. In non-dense mode, it selects random positions for each segment within the range of 1 to frames_len. For dense sampling in train mode, it generates a set of evenly spaced frame indices between start_idx and sample_pos, which is calculated based on frames_len and 64 (to ensure at least one frame within the window). The offsets are then used to fetch corresponding data using the _get method.", + "type": "comment" + }, + "5342": { + "file_id": 445, + "content": " frames_idx = offsets\n else:\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride = 64 // self.num_seg\n start_list = np.linspace(0,\n sample_pos - 1,\n num=10,\n dtype=int)\n offsets = []\n for start_idx in start_list.tolist():\n offsets += [\n (idx * t_stride + start_idx) % frames_len + 1\n for idx in range(self.num_seg)\n ]\n frames_idx = offsets\n else:\n for i in range(self.num_seg):\n idx = 0\n if not self.valid_mode:\n if average_dur >= self.seg_len:\n idx = random.randint(0, average_dur - self.seg_len)\n idx += i * average_dur", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:224-245" + }, + "5343": { + "file_id": 445, + "content": "The code determines the sampling position based on frames length, number of segments, and valid mode. If no offsets are provided, it calculates the starting positions for each segment using a linear space. Then, it generates the offsets by multiplying the stride with the current segment index and adding the start index. 
Finally, if in valid mode, it randomly selects indices within the average duration per segment and adds them to the offsets list.", + "type": "comment" + }, + "5344": { + "file_id": 445, + "content": " elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n else:\n if average_dur >= self.seg_len:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + self.seg_len):\n if results['format'] == 'video':\n frames_idx.append(int(jj % frames_len))\n elif results['format'] == 'frame':\n frames_idx.append(jj + 1)\n elif results['format'] == 'MRI':\n frames_idx.append(jj)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:246-268" + }, + "5345": { + "file_id": 445, + "content": "Code calculates index based on average duration, then appends corresponding frame indices to frames_idx list based on the format specified in results. If the format is not recognized, it raises NotImplementedError. Finally, it returns the frames_idx and results to an unknown method.", + "type": "comment" + }, + "5346": { + "file_id": 445, + "content": " else: # for TSM\n if not self.valid_mode:\n if average_dur > 0:\n offsets = np.multiply(list(range(self.num_seg)),\n average_dur) + np.random.randint(\n average_dur, size=self.num_seg)\n elif frames_len > self.num_seg:\n offsets = np.sort(\n np.random.randint(frames_len, size=self.num_seg))\n else:\n offsets = np.zeros(shape=(self.num_seg, ))\n else:\n if frames_len > self.num_seg:\n average_dur_float = frames_len / self.num_seg\n offsets = np.array([\n int(average_dur_float / 2.0 + average_dur_float * x)\n for x in range(self.num_seg)\n ])\n else:\n offsets = np.zeros(shape=(self.num_seg, ))\n if results['format'] == 'video':\n frames_idx = list(offsets)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:270-292" + }, + "5347": { + "file_id": 445, + "content": "This code generates random offsets for selecting frames from a video. If the valid mode is not enabled, it randomly selects frame offsets within the available duration or number of frames. If the valid mode is enabled, it evenly distributes the frames across the video duration. 
The 'format' variable determines if the selected frames are in video format.", + "type": "comment" + }, + "5348": { + "file_id": 445, + "content": " frames_idx = [x % frames_len for x in frames_idx]\n elif results['format'] == 'frame':\n frames_idx = list(offsets + 1)\n elif results['format'] == 'MRI':\n frames_idx = list(offsets)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)\n@PIPELINES.register()\nclass SamplerPkl(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n mode(str): 'train', 'valid'\n Returns:\n frames_idx: the index of sampled #frames.\n \"\"\"\n def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.valid_mode = valid_mode\n self.backend = backend\n def _get(self, buf):\n if isinstance(buf, str):\n img = Image.open(StringIO(buf))\n else:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:293-327" + }, + "5349": { + "file_id": 445, + "content": "This code snippet defines a SamplerPkl class that samples frames' indices for video loading. It takes arguments num_seg, seg_len, and backend and returns the index of sampled frames. Depending on the results format ('frame', 'MRI', or others), it sets the frames_idx accordingly before returning it.", + "type": "comment" + }, + "5350": { + "file_id": 445, + "content": " img = Image.open(BytesIO(buf))\n img = img.convert('RGB')\n if self.backend != 'pillow':\n img = np.array(img)\n return img\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id.\n \"\"\"\n filename = results['frame_dir']\n data_loaded = pickle.load(open(filename, 'rb'), encoding='bytes')\n video_name, label, frames = data_loaded\n if isinstance(label, dict):\n label = label['动作类型']\n results['labels'] = label\n elif len(label) == 1:\n results['labels'] = int(label[0])\n else:\n results['labels'] = int(label[0]) if random.random() < 0.5 else int(\n label[1])\n results['frames_len'] = len(frames)\n frames_len = results['frames_len']\n average_dur = int(int(frames_len) / self.num_seg)\n imgs = []\n for i in range(self.num_seg):\n idx = 0\n if not self.valid_mode:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:328-358" + }, + "5351": { + "file_id": 445, + "content": "This code is part of a pipeline for image sampling in video processing. It loads data from disk, converts images to RGB format, and handles labels. The `__call__` method takes results as input, retrieves the video name, label, and frames from the loaded data. If the label is a dictionary or has multiple elements, it assigns the label to '动作类型' or randomly chooses between the first two elements. It sets the 'frames_len' based on the length of frames and calculates the average duration per segment. Then, it initializes an empty list for the images and loops through the segments to create image samples. 
If valid mode is not enabled, it also resets the index variable.", + "type": "comment" + }, + "5352": { + "file_id": 445, + "content": " if average_dur >= self.seg_len:\n idx = random.randint(0, average_dur - self.seg_len)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n else:\n if average_dur >= self.seg_len:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + self.seg_len):\n imgbuf = frames[int(jj % results['frames_len'])]\n img = self._get(imgbuf)\n imgs.append(img)\n results['backend'] = self.backend\n results['imgs'] = imgs\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample.py:359-382" + }, + "5353": { + "file_id": 445, + "content": "The code calculates the index for a segment of frames based on average duration and frame length. It then retrieves images from the frames list, appends them to imgs, sets backend type, and returns the results including the imgs and backend information.", + "type": "comment" + }, + "5354": { + "file_id": 446, + "content": "/paddlevideo/loader/pipelines/sample_ava.py", + "type": "filepath" + }, + "5355": { + "file_id": 446, + "content": "This code introduces SampleFrames class for PaddleVideo's loader pipelines using OpenCV, supports various modes and training, includes storage backend classes, converts images to numpy arrays, defines pipeline, and provides SampleAVAFrames class for sampling video frames.", + "type": "summary" + }, + "5356": { + "file_id": 446, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nfrom PIL import Image\nfrom ..registry import PIPELINES\nimport os\nimport numpy as np\nimport io\nimport os.path as osp\nfrom abc import ABCMeta, abstractmethod\nimport cv2\nfrom cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED\nimport inspect\nimread_backend = 'cv2'\nimread_flags = {\n 'color': IMREAD_COLOR,\n 'grayscale': IMREAD_GRAYSCALE,\n 'unchanged': IMREAD_UNCHANGED\n}\n@PIPELINES.register()\nclass SampleFrames:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:1-35" + }, + "5357": { + "file_id": 446, + "content": "This code is importing necessary libraries and registering a class SampleFrames under PaddleVideo's loader pipelines. The class appears to sample frames from video, supporting different reading modes (color/grayscale/unchanged). It uses OpenCV (cv2) as the image processing backend.", + "type": "comment" + }, + "5358": { + "file_id": 446, + "content": " \"\"\"Sample frames from the video. 
\"\"\"\n def __init__(self,\n clip_len,\n frame_interval=1,\n num_clips=1,\n temporal_jitter=False,\n twice_sample=False,\n out_of_bound_opt='loop',\n test_mode=False):\n self.clip_len = clip_len\n self.frame_interval = frame_interval\n self.num_clips = num_clips\n self.temporal_jitter = temporal_jitter\n self.twice_sample = twice_sample\n self.out_of_bound_opt = out_of_bound_opt\n self.test_mode = test_mode\n assert self.out_of_bound_opt in ['loop', 'repeat_last']\n def _get_train_clips(self, num_frames):\n \"\"\"Get clip offsets in train mode. \"\"\"\n ori_clip_len = self.clip_len * self.frame_interval\n avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips\n if avg_interval > 0:\n base_offsets = np.arange(self.num_clips) * avg_interval\n clip_offsets = base_offsets + np.random.randint(", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:36-61" + }, + "5359": { + "file_id": 446, + "content": "The function `__init__` initializes the parameters for sampling frames from a video, including clip length, frame interval, number of clips, temporal jittering options, and out-of-bound handling. The `_get_train_clips` function calculates the clip offsets in training mode by determining the average interval between clips based on the total number of frames. It then generates random base offsets and adds random offsets to create the final clip offsets.", + "type": "comment" + }, + "5360": { + "file_id": 446, + "content": " avg_interval, size=self.num_clips)\n elif num_frames > max(self.num_clips, ori_clip_len):\n clip_offsets = np.sort(\n np.random.randint(\n num_frames - ori_clip_len + 1, size=self.num_clips))\n elif avg_interval == 0:\n ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips\n clip_offsets = np.around(np.arange(self.num_clips) * ratio)\n else:\n clip_offsets = np.zeros((self.num_clips, ), dtype=np.int)\n return clip_offsets\n def _get_test_clips(self, num_frames):\n \"\"\"Get clip offsets in test mode. \"\"\"\n ori_clip_len = self.clip_len * self.frame_interval\n avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)\n if num_frames > ori_clip_len - 1:\n base_offsets = np.arange(self.num_clips) * avg_interval\n clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int)\n if self.twice_sample:\n clip_offsets = np.concatenate([clip_offsets, base_offsets])", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:62-82" + }, + "5361": { + "file_id": 446, + "content": "This code calculates clip offsets for video sampling based on the number of frames and other parameters. It handles different scenarios, such as when the number of frames exceeds or equals the original clip length, when average interval is 0, and in test mode. The clip_offsets are returned at the end.", + "type": "comment" + }, + "5362": { + "file_id": 446, + "content": " else:\n clip_offsets = np.zeros((self.num_clips, ), dtype=np.int)\n return clip_offsets\n def _sample_clips(self, num_frames):\n \"\"\"Choose clip offsets for the video in a given mode. \"\"\"\n if self.test_mode:\n clip_offsets = self._get_test_clips(num_frames)\n else:\n clip_offsets = self._get_train_clips(num_frames)\n return clip_offsets\n def __call__(self, results):\n \"\"\"Perform the SampleFrames loading. 
\"\"\"\n total_frames = results['total_frames']\n clip_offsets = self._sample_clips(total_frames)\n frame_inds = clip_offsets[:, None] + np.arange(\n self.clip_len)[None, :] * self.frame_interval\n frame_inds = np.concatenate(frame_inds)\n if self.temporal_jitter:\n perframe_offsets = np.random.randint(\n self.frame_interval, size=len(frame_inds))\n frame_inds += perframe_offsets\n frame_inds = frame_inds.reshape((-1, self.clip_len))\n if self.out_of_bound_opt == 'loop':", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:83-107" + }, + "5363": { + "file_id": 446, + "content": "This code defines a class that samples video clips and loads frames based on different modes, such as testing or training. It takes the total number of frames in a video and returns the corresponding clip offsets and frame indices for loading. The sampling mode, temporal jitter, and out-of-bound options can be specified to customize the sampling process.", + "type": "comment" + }, + "5364": { + "file_id": 446, + "content": " frame_inds = np.mod(frame_inds, total_frames)\n elif self.out_of_bound_opt == 'repeat_last':\n safe_inds = frame_inds < total_frames\n unsafe_inds = 1 - safe_inds\n last_ind = np.max(safe_inds * frame_inds, axis=1)\n new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)\n frame_inds = new_inds\n else:\n raise ValueError('Illegal out_of_bound option.')\n start_index = results['start_index']\n frame_inds = np.concatenate(frame_inds) + start_index\n results['frame_inds'] = frame_inds.astype(np.int)\n results['clip_len'] = self.clip_len\n results['frame_interval'] = self.frame_interval\n results['num_clips'] = self.num_clips\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'clip_len={self.clip_len}, '\n f'frame_interval={self.frame_interval}, '\n f'num_clips={self.num_clips}, '", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:108-129" + }, + "5365": { + "file_id": 446, + "content": "Code handles out-of-bound frame indices by wrapping them around, repeating the last frame, or throwing an error. It then updates results with frame indices, clip length, and number of clips.", + "type": "comment" + }, + "5366": { + "file_id": 446, + "content": " f'temporal_jitter={self.temporal_jitter}, '\n f'twice_sample={self.twice_sample}, '\n f'out_of_bound_opt={self.out_of_bound_opt}, '\n f'test_mode={self.test_mode})')\n return repr_str\nclass BaseStorageBackend(metaclass=ABCMeta):\n \"\"\"Abstract class of storage backends. \"\"\"\n @abstractmethod\n def get(self, filepath):\n pass\n @abstractmethod\n def get_text(self, filepath):\n pass\nclass HardDiskBackend(BaseStorageBackend):\n \"\"\"Raw hard disks storage backend.\"\"\"\n def get(self, filepath):\n filepath = str(filepath)\n with open(filepath, 'rb') as f:\n value_buf = f.read()\n return value_buf\n def get_text(self, filepath):\n filepath = str(filepath)\n with open(filepath, 'r') as f:\n value_buf = f.read()\n return value_buf\nclass FileClient:\n \"\"\"A general file client to access files in different backend. \"\"\"\n _backends = {\n 'disk': HardDiskBackend,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:130-166" + }, + "5367": { + "file_id": 446, + "content": "This code defines three classes: `BaseStorageBackend` (abstract class), `HardDiskBackend`, and a generic file client called `FileClient`. 
The `BaseStorageBackend` is an abstract class that provides two methods, `get()` and `get_text()`, which are expected to be implemented by subclasses. The `HardDiskBackend` implements these methods for handling files stored on the hard disk. Finally, the `FileClient` serves as a generic file client to access files in different backends.", + "type": "comment" + }, + "5368": { + "file_id": 446, + "content": " }\n def __init__(self, backend='disk', **kwargs):\n if backend not in self._backends:\n raise ValueError(\n f'Backend {backend} is not supported. Currently supported ones'\n f' are {list(self._backends.keys())}')\n self.backend = backend\n self.client = self._backends[backend](**kwargs)\n @classmethod\n def _register_backend(cls, name, backend, force=False):\n if not isinstance(name, str):\n raise TypeError('the backend name should be a string, '\n f'but got {type(name)}')\n if not inspect.isclass(backend):\n raise TypeError(\n f'backend should be a class but got {type(backend)}')\n if not issubclass(backend, BaseStorageBackend):\n raise TypeError(\n f'backend {backend} is not a subclass of BaseStorageBackend')\n if not force and name in cls._backends:\n raise KeyError(\n f'{name} is already registered as a storage backend, '", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:167-190" + }, + "5369": { + "file_id": 446, + "content": "This code defines a class with an initializer and a class method for registering backends. The initializer takes a backend argument, checks if it is supported, and initializes the client object with that backend. If the name or backend type is incorrect, it raises TypeError. The _register_backend method allows for backend registration by name, checking if it is a string and if the backend is a subclass of BaseStorageBackend. Raises KeyError if already registered.", + "type": "comment" + }, + "5370": { + "file_id": 446, + "content": " 'add \"force=True\" if you want to override it')\n cls._backends[name] = backend\n @classmethod\n def register_backend(cls, name, backend=None, force=False):\n \"\"\"Register a backend to FileClient. \"\"\"\n if backend is not None:\n cls._register_backend(name, backend, force=force)\n return\n def _register(backend_cls):\n cls._register_backend(name, backend_cls, force=force)\n return backend_cls\n return _register\n def get(self, filepath):\n return self.client.get(filepath)\n def get_text(self, filepath):\n return self.client.get_text(filepath)\n@PIPELINES.register()\nclass RawFrameDecode:\n \"\"\"Load and decode frames with given indices. \"\"\"\n def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs):\n self.io_backend = io_backend\n self.decoding_backend = decoding_backend\n self.kwargs = kwargs\n self.file_client = None\n def _pillow2array(self,img, flag='color', channel_order='bgr'):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:191-225" + }, + "5371": { + "file_id": 446, + "content": "This code defines a class called FileClient, which handles file operations like registration and retrieval. It also registers a pipeline named RawFrameDecode for loading and decoding frames using specified backends for I/O and decoding. The class has an _pillow2array method to convert PIL image to numpy array in specific channel order.", + "type": "comment" + }, + "5372": { + "file_id": 446, + "content": " \"\"\"Convert a pillow image to numpy array. 
\"\"\"\n channel_order = channel_order.lower()\n if channel_order not in ['rgb', 'bgr']:\n raise ValueError('channel order must be either \"rgb\" or \"bgr\"')\n if flag == 'unchanged':\n array = np.array(img)\n if array.ndim >= 3 and array.shape[2] >= 3: # color image\n array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR\n else:\n # If the image mode is not 'RGB', convert it to 'RGB' first.\n if img.mode != 'RGB':\n if img.mode != 'LA':\n # Most formats except 'LA' can be directly converted to RGB\n img = img.convert('RGB')\n else:\n # When the mode is 'LA', the default conversion will fill in\n # the canvas with black, which sometimes shadows black objects\n # in the foreground.\n #\n # Therefore, a random color (124, 117, 104) is used for canvas", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:226-247" + }, + "5373": { + "file_id": 446, + "content": "This code converts a Pillow image to a numpy array. It checks the channel order and flag, then either keeps the array unchanged or converts it from RGB to BGR if necessary. If the image mode is not RGB, it converts it to RGB first using convert('RGB'). If the mode is LA, a random color is used for the canvas to avoid shadowing black objects in the foreground.", + "type": "comment" + }, + "5374": { + "file_id": 446, + "content": " img_rgba = img.convert('RGBA')\n img = Image.new('RGB', img_rgba.size, (124, 117, 104))\n img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha\n if flag == 'color':\n array = np.array(img)\n if channel_order != 'rgb':\n array = array[:, :, ::-1] # RGB to BGR\n elif flag == 'grayscale':\n img = img.convert('L')\n array = np.array(img)\n else:\n raise ValueError(\n 'flag must be \"color\", \"grayscale\" or \"unchanged\", '\n f'but got {flag}')\n return array\n def _imfrombytes(self,content, flag='color', channel_order='bgr'):#, backend=None):\n \"\"\"Read an image from bytes. \"\"\"\n img_np = np.frombuffer(content, np.uint8)\n flag = imread_flags[flag] if isinstance(flag, str) else flag\n img = cv2.imdecode(img_np, flag)\n if flag == IMREAD_COLOR and channel_order == 'rgb':", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:248-270" + }, + "5375": { + "file_id": 446, + "content": "The code reads an image from bytes and converts it into a numpy array based on the provided flag (color or grayscale) and channel order. It first checks if the flag is valid, then decodes the image using OpenCV's imdecode function. If the flag is color and channel order is rgb, it returns the image as is. 
For other combinations, it converts the image to RGB or grayscale before returning the numpy array.", + "type": "comment" + }, + "5376": { + "file_id": 446, + "content": " cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)\n return img\n def __call__(self, results):\n \"\"\"Perform the ``RawFrameDecode`` to pick frames given indices.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n # mmcv.use_backend(self.decoding_backend)\n directory = results['frame_dir']\n suffix = results['suffix']\n #modality = results['modality']\n if self.file_client is None:\n self.file_client = FileClient(self.io_backend, **self.kwargs)\n imgs = list()\n if results['frame_inds'].ndim != 1:\n results['frame_inds'] = np.squeeze(results['frame_inds'])\n offset = results.get('offset', 0)\n for frame_idx in results['frame_inds']:\n frame_idx += offset\n filepath = osp.join(directory, suffix.format(frame_idx))\n img_bytes = self.file_client.get(filepath) #以二进制方式读取图片\n # Get frame with channel order RGB directly.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:271-301" + }, + "5377": { + "file_id": 446, + "content": "This code defines a pipeline for decoding frames using the RawFrameDecode transform. It reads image files from the specified directory and suffix, handles different frame indices, and utilizes a file client to retrieve images in binary format. The cv2.cvtColor function is used to convert the color of images from BGR to RGB. The code also checks if the frame indices have the correct dimensions and squeezes them if necessary.", + "type": "comment" + }, + "5378": { + "file_id": 446, + "content": " cur_frame = self._imfrombytes(img_bytes, channel_order='rgb')\n imgs.append(cur_frame)\n results['imgs'] = imgs\n results['original_shape'] = imgs[0].shape[:2]\n results['img_shape'] = imgs[0].shape[:2]\n # we resize the gt_bboxes and proposals to their real scale\n h, w = results['img_shape']\n scale_factor = np.array([w, h, w, h])\n if 'gt_bboxes' in results:\n gt_bboxes = results['gt_bboxes']\n gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32)\n results['gt_bboxes'] = gt_bboxes_new\n if 'proposals' in results and results['proposals'] is not None:\n proposals = results['proposals']\n proposals = (proposals * scale_factor).astype(np.float32)\n results['proposals'] = proposals\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'io_backend={self.io_backend}, '\n f'decoding_backend={self.decoding_backend})')", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:303-326" + }, + "5379": { + "file_id": 446, + "content": "Function applies image processing and resizing to input, appends frames to a list, and scales gt_bboxes and proposals accordingly. It then returns the results. 
The __repr__ function provides a string representation of the object's class and arguments.", + "type": "comment" + }, + "5380": { + "file_id": 446, + "content": " return repr_str\n@PIPELINES.register()\nclass SampleAVAFrames(SampleFrames):\n def __init__(self, clip_len, frame_interval=2, test_mode=False):\n super().__init__(clip_len, frame_interval, test_mode=test_mode)\n def _get_clips(self, center_index, skip_offsets, shot_info):\n start = center_index - (self.clip_len // 2) * self.frame_interval\n end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval\n frame_inds = list(range(start, end, self.frame_interval))\n frame_inds = frame_inds + skip_offsets\n frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1)\n return frame_inds\n def __call__(self, results):\n fps = results['fps']\n timestamp = results['timestamp']\n timestamp_start = results['timestamp_start']\n shot_info = results['shot_info']\n #delta=(timestamp - timestamp_start) 为该帧距离15min视频开头有几秒\n #center_index=fps*delta为该帧距离15min视频开头有几帧\n #center_index+1是为了避免后续采样时出现负数? \n #后续需要以center_index为中心前后采样视频帧片段", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:327-354" + }, + "5381": { + "file_id": 446, + "content": "The code defines a class called SampleAVAFrames, which inherits from SampleFrames. It takes clip length, frame interval, and test mode as arguments during initialization. The _get_clips method calculates the start and end indices for a given center index, taking into account skip offsets and shot information. The __call__ method retrieves fps, timestamp, timestamp_start, and shot_info from the results dictionary, and then calculates the center index to sample video frames around that index.", + "type": "comment" + }, + "5382": { + "file_id": 446, + "content": " center_index = fps * (timestamp - timestamp_start) + 1\n skip_offsets = np.random.randint(\n -self.frame_interval // 2, (self.frame_interval + 1) // 2,\n size=self.clip_len)\n frame_inds = self._get_clips(center_index, skip_offsets, shot_info)\n results['frame_inds'] = np.array(frame_inds, dtype=np.int)\n results['clip_len'] = self.clip_len\n results['frame_interval'] = self.frame_interval\n results['num_clips'] = 1\n results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'clip_len={self.clip_len}, '\n f'frame_interval={self.frame_interval}, '\n f'test_mode={self.test_mode})')\n return repr_str", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ava.py:355-374" + }, + "5383": { + "file_id": 446, + "content": "This function samples a video clip by calculating the center index and generating random skip offsets to select frames. It returns frame indices, clip length, frame interval, number of clips, and crop quadruple in a dictionary format for further processing. The `__repr__` method provides a concise string representation of the object's attributes.", + "type": "comment" + }, + "5384": { + "file_id": 447, + "content": "/paddlevideo/loader/pipelines/sample_ucf24.py", + "type": "filepath" + }, + "5385": { + "file_id": 447, + "content": "The \"SamplerUCF24\" class samples frames from videos using parameters like frame count and interval, utilizes PIL library, initializes pipeline, generates frame indices, returns sampled frames.", + "type": "summary" + }, + "5386": { + "file_id": 447, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport random\nfrom PIL import Image\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass SamplerUCF24(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_frames(int): The amount of frames used in a video\n frame_interval(int): Sampling rate\n valid_mode(bool): True or False.\n Returns:\n frames_idx: the index of sampled #frames.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ucf24.py:1-33" + }, + "5387": { + "file_id": 447, + "content": "This code defines a class \"SamplerUCF24\" for sampling frames in videos, taking parameters such as num_frames and frame_interval. It uses PIL instead of OpenCV to read images and returns the index of sampled frames.", + "type": "comment" + }, + "5388": { + "file_id": 447, + "content": " \"\"\"\n def __init__(self,\n num_frames=16,\n frame_interval=1,\n valid_mode=False):\n self.num_frames = num_frames\n self.frame_interval = frame_interval if valid_mode else random.randint(1, 2)\n self.valid_mode = valid_mode\n def _get(self, frames_idxs, img_folder, results):\n imgs = []\n for idx in frames_idxs:\n img = Image.open(\n os.path.join(img_folder, '{:05d}.jpg'.format(idx))).convert('RGB')\n imgs.append(img)\n results['imgs'] = imgs\n return results\n def _make_clip(self, im_ind, max_num):\n frame_idxs = []\n for i in reversed(range(self.num_frames)):\n # make it as a loop\n i_temp = im_ind - i * self.frame_interval\n if i_temp < 1:\n i_temp = 1\n elif i_temp > max_num:\n i_temp = max_num\n frame_idxs.append(i_temp)\n return frame_idxs\n def __call__(self, results):\n img_folder, key_frame = os.path.split(results['filename'])", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ucf24.py:34-65" + }, + "5389": { + "file_id": 447, + "content": "This code defines a pipeline for loading and creating clips from video files. The `__init__` method initializes the number of frames, frame interval (randomly determined if valid mode is False), and valid mode flag. The `_get` method retrieves images in order, converts them to RGB, and appends them to a list. The `_make_clip` method generates a set of frame indices that create a looped clip. 
The pipeline is called with the results as input, extracting the image folder and filename for further processing.", + "type": "comment" + }, + "5390": { + "file_id": 447, + "content": " frame_len = len(os.listdir(img_folder))\n key_idx = int(key_frame[0:5])\n frame_idxs = self._make_clip(key_idx, frame_len)\n return self._get(frame_idxs, img_folder, results)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/sample_ucf24.py:66-69" + }, + "5391": { + "file_id": 447, + "content": "This code retrieves the number of frames in a folder, assigns a key frame index based on the input, generates frame indices for a video clip, and returns the requested frames from their folder.", + "type": "comment" + }, + "5392": { + "file_id": 448, + "content": "/paddlevideo/loader/pipelines/segmentation.py", + "type": "filepath" + }, + "5393": { + "file_id": 448, + "content": "The code enables image resizing, flipping, multi-scale segmentation in PaddleVideo's pipeline, with metadata addition and normalization. It performs image normalization and transposition before storing the result in a samples data structure.", + "type": "summary" + }, + "5394": { + "file_id": 448, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nfrom PIL import Image\nimport copy\nimport cv2\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass MultiRestrictSize(object):\n def __init__(self,\n min_size=None,\n max_size=800,\n flip=False,\n multi_scale=[1.3]):\n self.min_size = min_size\n self.max_size = max_size\n self.multi_scale = multi_scale\n self.flip = flip", + "type": "code", + "location": "/paddlevideo/loader/pipelines/segmentation.py:1-32" + }, + "5395": { + "file_id": 448, + "content": "This code is for PaddleVideo's segmentation pipeline. It includes the class definition MultiRestrictSize, which can be used with minimum and maximum size limits, flipping option, and multiple scales for image resizing.", + "type": "comment" + }, + "5396": { + "file_id": 448, + "content": " assert ((min_size is None)) or ((max_size is None))\n def __call__(self, sample):\n samples = []\n image = sample['current_img']\n h, w = image.shape[:2]\n for scale in self.multi_scale:\n # Fixed range of scales\n sc = None\n # Align short edge\n if not (self.min_size is None):\n if h > w:\n short_edge = w\n else:\n short_edge = h\n if short_edge > self.min_size:\n sc = float(self.min_size) / short_edge\n else:\n if h > w:\n long_edge = h\n else:\n long_edge = w\n if long_edge > self.max_size:\n sc = float(self.max_size) / long_edge\n if sc is None:\n new_h = h\n new_w = w\n else:\n new_h = sc * h\n new_w = sc * w\n new_h = int(new_h * scale)\n new_w = int(new_w * scale)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/segmentation.py:33-65" + }, + "5397": { + "file_id": 448, + "content": "This code is a function that applies image segmentation using multi-scale technique. 
It scales the input image based on a fixed range of scales and aligns short or long edges to meet minimum or maximum size requirements, respectively. The scaled images are stored in a list for further processing.", + "type": "comment" + }, + "5398": { + "file_id": 448, + "content": " if (new_h - 1) % 16 != 0:\n new_h = int(np.around((new_h - 1) / 16.) * 16 + 1)\n if (new_w - 1) % 16 != 0:\n new_w = int(np.around((new_w - 1) / 16.) * 16 + 1)\n if new_h == h and new_w == w:\n samples.append(sample)\n else:\n new_sample = {}\n for elem in sample.keys():\n if 'meta' in elem:\n new_sample[elem] = sample[elem]\n continue\n tmp = sample[elem]\n if 'label' in elem:\n new_sample[elem] = sample[elem]\n continue\n else:\n flagval = cv2.INTER_CUBIC\n tmp = cv2.resize(tmp,\n dsize=(new_w, new_h),\n interpolation=flagval)\n new_sample[elem] = tmp\n samples.append(new_sample)\n if self.flip:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/segmentation.py:67-92" + }, + "5399": { + "file_id": 448, + "content": "Code resizes input images to a multiple of 16x16, appends samples with matching metadata, and optionally flips the image if enabled.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/54.json b/docs/data/54.json new file mode 100644 index 000000000..21c8d4bc5 --- /dev/null +++ b/docs/data/54.json @@ -0,0 +1,550 @@ +{ + "5400": { + "file_id": 448, + "content": " now_sample = samples[-1]\n new_sample = {}\n for elem in now_sample.keys():\n if 'meta' in elem:\n new_sample[elem] = now_sample[elem].copy()\n new_sample[elem]['flip'] = True\n continue\n tmp = now_sample[elem]\n tmp = tmp[:, ::-1].copy()\n new_sample[elem] = tmp\n samples.append(new_sample)\n return samples\n@PIPELINES.register()\nclass MultiNorm(object):\n def __call__(self, samples):\n for idx in range(len(samples)):\n sample = samples[idx]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if tmp is None:\n continue\n if tmp.ndim == 2:\n tmp = tmp[:, :, np.newaxis]\n else:\n tmp = tmp / 255.\n tmp -= (0.485, 0.456, 0.406)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/segmentation.py:93-124" + }, + "5401": { + "file_id": 448, + "content": "This code segment is from the PaddleVideo library, specifically in the loader/pipelines/segmentation.py file. It appears to be a function that adds flipped image data to a list of samples, after normalizing each image by dividing it by 255 and subtracting (0.485, 0.456, 0.406). This function is part of the MultiNorm pipeline registered in the PIPELINES module.", + "type": "comment" + }, + "5402": { + "file_id": 448, + "content": " tmp /= (0.229, 0.224, 0.225)\n tmp = tmp.transpose((2, 0, 1))\n samples[idx][elem] = tmp\n return samples", + "type": "code", + "location": "/paddlevideo/loader/pipelines/segmentation.py:125-130" + }, + "5403": { + "file_id": 448, + "content": "This code segment performs image normalization and transposition before storing the result in a dictionary-like samples data structure. It divides each RGB channel value by the average RGB values, then transposes the image channels. 
Finally, it adds the transformed image to the samples dictionary for the given index and element.", + "type": "comment" + }, + "5404": { + "file_id": 449, + "content": "/paddlevideo/loader/pipelines/segmentation_pipline.py", + "type": "filepath" + }, + "5405": { + "file_id": 449, + "content": "This Python code defines a SegmentationSampler class in PaddleVideo, which samples data at a specified rate and registers it for the Action Segmentation Dataset. It is part of a video processing library's pipeline, likely for segmentation purposes.", + "type": "summary" + }, + "5406": { + "file_id": 449, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os\nimport numpy as np\nimport random\nimport paddle\nfrom ..registry import PIPELINES\n\"\"\"\npipeline ops for Action Segmentation Dataset.\n\"\"\"\n@PIPELINES.register()\nclass SegmentationSampler(object):\n def __init__(self, sample_rate):\n self.sample_rate = sample_rate\n def __call__(self, results):\n for key, data in results.items():\n if len(data.shape) == 1:\n data = data[::self.sample_rate]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/segmentation_pipline.py:1-35" + }, + "5407": { + "file_id": 449, + "content": "This Python code is from the PaddleVideo library and defines a SegmentationSampler class. It samples data at a specified rate, only keeps every nth element in a 1D array, and registers this pipeline operation for the Action Segmentation Dataset.", + "type": "comment" + }, + "5408": { + "file_id": 449, + "content": " results[key] = copy.deepcopy(data)\n else:\n data = data[:, ::self.sample_rate]\n results[key] = copy.deepcopy(data)\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/segmentation_pipline.py:36-40" + }, + "5409": { + "file_id": 449, + "content": "This code segment appears to be part of a pipeline in a video processing library, possibly for segmentation. It selects specific data based on the sample rate and stores it in a results dictionary with deep copy.", + "type": "comment" + }, + "5410": { + "file_id": 450, + "content": "/paddlevideo/loader/pipelines/skeleton_pipeline.py", + "type": "filepath" + }, + "5411": { + "file_id": 450, + "content": "The code develops efficient data processing classes for PaddleVideo, including interpolation, cropping, and pipeline optimization. It performs image flipping operations, augments 'Flow' modality images, transforms data formats, collapses dimensions, includes ML data pipeline support, generates heatmaps for keypoints and limbs in image sequences using input parameters, applies data augmentation, and uses Gaussian filtering with specified sigma value.", + "type": "summary" + }, + "5412": { + "file_id": 450, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport collections\nfrom itertools import repeat\nimport copy as cp\nfrom collections import abc\nimport numpy as np\nimport paddle.nn.functional as F\nimport random\nimport paddle\nfrom ..registry import PIPELINES\nfrom .augmentations_ava import iminvert, imflip_\n\"\"\"pipeline ops for Activity Net.\n\"\"\"\ndef _ntuple(n):\n def parse(x):\n if isinstance(x, collections.abc.Iterable):\n return tuple(x)\n return tuple(repeat(x, n))", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1-34" + }, + "5413": { + "file_id": 450, + "content": "This code is importing necessary libraries and defining a function to create ActivityNet-style pipeline operations. It also registers these pipelines for future use.", + "type": "comment" + }, + "5414": { + "file_id": 450, + "content": " return parse\n_single = _ntuple(1)\n_pair = _ntuple(2)\n_triple = _ntuple(3)\n_quadruple = _ntuple(4)\ndef _init_lazy_if_proper(results, lazy):\n \"\"\"Initialize lazy operation properly.\n Make sure that a lazy operation is properly initialized,\n and avoid a non-lazy operation accidentally getting mixed in.\n Required keys in results are \"imgs\" if \"img_shape\" not in results,\n otherwise, Required keys in results are \"img_shape\", add or modified keys\n are \"img_shape\", \"lazy\".\n Add or modified keys in \"lazy\" are \"original_shape\", \"crop_bbox\", \"flip\",\n \"flip_direction\", \"interpolation\".\n Args:\n results (dict): A dict stores data pipeline result.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n if 'img_shape' not in results:\n results['img_shape'] = results['imgs'][0].shape[:2]\n if lazy:\n if 'lazy' not in results:\n img_h, img_w = results['img_shape']\n lazyop = dict()\n lazyop['original_shape'] = results['img_shape']", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:36-68" + }, + "5415": { + "file_id": 450, + "content": "This function initializes the lazy operation properly by checking if \"img_shape\" is in results, and adds or modifies keys \"lazy\", \"original_shape\", \"crop_bbox\", \"flip\", \"flip_direction\", and \"interpolation\" based on whether \"lazy\" is True.", + "type": "comment" + }, + "5416": { + "file_id": 450, + "content": " lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],\n dtype=np.float32)\n lazyop['flip'] = False\n lazyop['flip_direction'] = None\n lazyop['interpolation'] = None\n results['lazy'] = lazyop\n else:\n assert 'lazy' not in results, 'Use Fuse after lazy operations'\n@PIPELINES.register()\nclass AutoPadding(object):\n \"\"\"\n Sample or Padding frame skeleton feature.\n Args:\n window_size: int, temporal size of skeleton feature.\n random_pad: bool, whether do random padding when frame length < window size. 
Default: False.\n \"\"\"\n def __init__(self, window_size, random_pad=False):\n self.window_size = window_size\n self.random_pad = random_pad\n def get_frame_num(self, data):\n C, T, V, M = data.shape\n for i in range(T - 1, -1, -1):\n tmp = np.sum(data[:, i, :, :])\n if tmp > 0:\n T = i + 1\n break\n return T\n def __call__(self, results):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:69-101" + }, + "5417": { + "file_id": 450, + "content": "The code is defining a Pipeline class, specifically for auto-padding and skeleton feature extraction from image data. It first checks if the 'lazy' operation has been performed, then initializes necessary parameters for padding or sampling frames based on window size and random_pad setting. The get_frame_num function calculates the number of frames containing valid data, and the __call__ method applies the pipeline to the results.", + "type": "comment" + }, + "5418": { + "file_id": 450, + "content": " data = results['data']\n C, T, V, M = data.shape\n T = self.get_frame_num(data)\n if T == self.window_size:\n data_pad = data[:, :self.window_size, :, :]\n elif T < self.window_size:\n begin = random.randint(\n 0, self.window_size - T) if self.random_pad else 0\n data_pad = np.zeros((C, self.window_size, V, M))\n data_pad[:, begin:begin + T, :, :] = data[:, :T, :, :]\n else:\n if self.random_pad:\n index = np.random.choice(\n T, self.window_size, replace=False).astype('int64')\n else:\n index = np.linspace(0, T, self.window_size).astype(\"int64\")\n data_pad = data[:, index, :, :]\n results['data'] = data_pad\n return results\n@PIPELINES.register()\nclass SkeletonNorm(object):\n \"\"\"\n Normalize skeleton feature.\n Args:\n aixs: dimensions of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default: 2.\n \"\"\"\n def __init__(self, axis=2, squeeze=False):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:102-133" + }, + "5419": { + "file_id": 450, + "content": "Code snippet performs data padding to ensure consistent frame size for skeleton data in the Skeleton Pipeline. It checks the current frame number (T) and pads it with zeroes if T is smaller than the window size, or selects a subset of frames from the original data if T is larger than the window size. 
The result is then returned as 'results'.", + "type": "comment" + }, + "5420": { + "file_id": 450, + "content": " self.axis = axis\n self.squeeze = squeeze\n def __call__(self, results):\n data = results['data']\n # Centralization\n data = data - data[:, :, 8:9, :]\n data = data[:self.axis, :, :, :] # get (x,y) from (x,y, acc)\n C, T, V, M = data.shape\n if self.squeeze:\n data = data.reshape((C, T, V)) # M = 1\n results['data'] = data.astype('float32')\n if 'label' in results:\n label = results['label']\n results['label'] = np.expand_dims(label, 0).astype('int64')\n return results\n@PIPELINES.register()\nclass Iden(object):\n \"\"\"\n Wrapper Pipeline\n \"\"\"\n def __init__(self, label_expand=True):\n self.label_expand = label_expand\n def __call__(self, results):\n data = results['data']\n results['data'] = data.astype('float32')\n if 'label' in results and self.label_expand:\n label = results['label']\n results['label'] = np.expand_dims(label, 0).astype('int64')\n return results", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:134-170" + }, + "5421": { + "file_id": 450, + "content": "This code defines two classes, \"SkeletonPipeline\" and \"Iden\", which are used as PaddleVideo pipeline components. The SkeletonPipeline class is responsible for centralizing the data along a specified axis and reshaping it if squeeze is True. The Iden class simply converts the 'data' to float32 type and expands the shape of 'label' if it exists and label_expand is set to True. Both classes return updated results after processing.", + "type": "comment" + }, + "5422": { + "file_id": 450, + "content": "@PIPELINES.register()\nclass RandomRotation(object):\n \"\"\"\n Random rotation sketeton.\n Args:\n argument: bool, if rotation.\n theta: float, rotation rate.\n \"\"\"\n def __init__(self, argument, theta=0.3):\n self.theta = theta\n self.argument = argument\n def _rot(self, rot):\n \"\"\"\n rot: T,3\n \"\"\"\n cos_r, sin_r = np.cos(rot), np.sin(rot) # T,3\n zeros = np.zeros((rot.shape[0], 1)) # T,1\n ones = np.ones((rot.shape[0], 1)) # T,1\n r1 = np.stack((ones, zeros, zeros), axis=-1) # T,1,3\n rx2 = np.stack((zeros, cos_r[:, 0:1], sin_r[:, 0:1]), axis=-1) # T,1,3\n rx3 = np.stack((zeros, -sin_r[:, 0:1], cos_r[:, 0:1]), axis=-1) # T,1,3\n rx = np.concatenate((r1, rx2, rx3), axis=1) # T,3,3\n ry1 = np.stack((cos_r[:, 1:2], zeros, -sin_r[:, 1:2]), axis=-1)\n r2 = np.stack((zeros, ones, zeros), axis=-1)\n ry3 = np.stack((sin_r[:, 1:2], zeros, cos_r[:, 1:2]), axis=-1)\n ry = np.concatenate((ry1, r2, ry3), axis=1)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:173-202" + }, + "5423": { + "file_id": 450, + "content": "The code defines a random rotation skeleton class for applying random rotations to input data. It takes arguments for rotation and rotation rate, and has a method for performing the rotation operation on the input data. 
The method calculates rotation matrices for rotation around the x, y, and z axes using the given rotation rate and applies them to the input data.", + "type": "comment" + }, + "5424": { + "file_id": 450, + "content": " rz1 = np.stack((cos_r[:, 2:3], sin_r[:, 2:3], zeros), axis=-1)\n r3 = np.stack((zeros, zeros, ones), axis=-1)\n rz2 = np.stack((-sin_r[:, 2:3], cos_r[:, 2:3], zeros), axis=-1)\n rz = np.concatenate((rz1, rz2, r3), axis=1)\n rot = np.matmul(np.matmul(rz, ry), rx)\n return rot\n def __call__(self, results):\n # C,T,V,M\n data = results['data']\n if self.argument:\n C, T, V, M = data.shape\n data_numpy = np.transpose(data, (1, 0, 2, 3)).conjugate().reshape(\n T, C, V * M) # T,3,V*M\n rot = np.random.uniform(-self.theta, self.theta, 3)\n rot = np.stack(\n [\n rot,\n ] * T, axis=0)\n rot = self._rot(rot) # T,3,3\n data_numpy = np.matmul(rot, data_numpy)\n data_numpy = data_numpy.reshape(T, C, V, M)\n data_numpy = np.transpose(data_numpy, (1, 0, 2, 3))\n data = data_numpy\n results['data'] = data.astype(np.float32)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:204-229" + }, + "5425": { + "file_id": 450, + "content": "This code defines a class with two methods: `_rot` and `__call__`. The `_rot` method takes rotation angles and returns the rotation matrix. The `__call__` method applies random rotations to input data, performs the rotations using `_rot`, and adjusts the shape of the data before returning it.", + "type": "comment" + }, + "5426": { + "file_id": 450, + "content": " return results\n@PIPELINES.register()\nclass SketeonCropSample(object):\n \"\"\"\n Sketeon Crop Sampler.\n Args:\n crop_model: str, crop model, support: ['center'].\n p_interval: list, crop len\n window_size: int, sample windows size.\n \"\"\"\n def __init__(self, window_size, crop_model='center', p_interval=1):\n assert crop_model in ['center'], \"Don't support :\" + crop_model\n self.crop_model = crop_model\n self.window_size = window_size\n self.p_interval = p_interval\n def __call__(self, results):\n if self.crop_model == 'center':\n # input: C,T,V,M\n data = results['data']\n valid_frame_num = np.sum(data.sum(0).sum(-1).sum(-1) != 0)\n C, T, V, M = data.shape\n begin = 0\n end = valid_frame_num\n valid_size = end - begin\n #crop\n if len(self.p_interval) == 1:\n p = self.p_interval[0]\n bias = int((1 - p) * valid_size / 2)\n data = data[:, begin + bias:end - bias, :, :] # center_crop", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:230-265" + }, + "5427": { + "file_id": 450, + "content": "This code defines a Pipeline class for cropping sample data using the Sketeon crop model. It supports only the 'center' crop model and takes window size, crop model (default 'center'), and p_interval (default 1) as arguments. 
The __call__ method is used to apply the crop operation on the input results by selecting a center crop based on the crop model and p_interval values.", + "type": "comment" + }, + "5428": { + "file_id": 450, + "content": " cropped_length = data.shape[1]\n else:\n p = np.random.rand(1) * (self.p_interval[1] - self.p_interval[0]\n ) + self.p_interval[0]\n # constraint cropped_length lower bound as 64\n cropped_length = np.minimum(\n np.maximum(int(np.floor(valid_size * p)), 64), valid_size)\n bias = np.random.randint(0, valid_size - cropped_length + 1)\n data = data[:, begin + bias:begin + bias + cropped_length, :, :]\n # resize\n data = np.transpose(data, (0, 2, 3, 1)).conjugate().reshape(\n C * V * M, cropped_length)\n data = data[None, None, :, :]\n # could perform both up sample and down sample\n data_tensor = paddle.to_tensor(data)\n data_tensor = F.interpolate(\n data_tensor,\n size=(C * V * M, self.window_size),\n mode='bilinear',\n align_corners=False).squeeze()", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:266-286" + }, + "5429": { + "file_id": 450, + "content": "This code randomly selects a cropped length within a specified interval, then applies random bias to the cropped length. It reshapes and transposes the data before performing interpolation on the tensor for up or down sampling using bilinear mode.", + "type": "comment" + }, + "5430": { + "file_id": 450, + "content": " data = paddle.transpose(\n paddle.reshape(data_tensor, (C, V, M, self.window_size)),\n (0, 3, 1, 2)).numpy()\n else:\n raise NotImplementedError\n results['data'] = data\n return results\n@PIPELINES.register()\nclass SketeonModalityTransform(object):\n \"\"\"\n Sketeon Crop Sampler.\n Args:\n crop_model: str, crop model, support: ['center'].\n p_interval: list, crop len\n window_size: int, sample windows size.\n \"\"\"\n def __init__(self, bone, motion, joint=True, graph='ntu_rgb_d'):\n self.joint = joint\n self.bone = bone\n self.motion = motion\n self.graph = graph\n if self.graph == \"ntu_rgb_d\":\n self.bone_pairs = ((1, 2), (2, 21), (3, 21), (4, 3), (5, 21),\n (6, 5), (7, 6), (8, 7), (9, 21), (10, 9),\n (11, 10), (12, 11), (13, 1), (14, 13), (15, 14),\n (16, 15), (17, 1), (18, 17), (19, 18), (20, 19),", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:287-316" + }, + "5431": { + "file_id": 450, + "content": "This code is part of the PaddleVideo library and appears to be a function or class related to skeleton data transformation for video analysis tasks. The code seems to handle reshaping and transposing data based on certain parameters, such as window size, crop model, and more. 
This could potentially be used for video processing in computer vision applications like action recognition or pose estimation.", + "type": "comment" + }, + "5432": { + "file_id": 450, + "content": " (22, 23), (21, 21), (23, 8), (24, 25), (25, 12))\n else:\n raise NotImplementedError\n def __call__(self, results):\n if self.joint:\n return results\n data_numpy = results['data']\n if self.bone:\n bone_data_numpy = np.zeros_like(data_numpy)\n for v1, v2 in self.bone_pairs:\n bone_data_numpy[:, :, v1 -\n 1] = data_numpy[:, :, v1 -\n 1] - data_numpy[:, :, v2 - 1]\n data_numpy = bone_data_numpy\n if self.motion:\n data_numpy[:, :-1] = data_numpy[:, 1:] - data_numpy[:, :-1]\n data_numpy[:, -1] = 0\n results['data'] = data_numpy\n return results\n@PIPELINES.register()\nclass UniformSampleFrames:\n \"\"\"Uniformly sample frames from the video.\n To sample an n-frame clip from the video. UniformSampleFrames basically\n divide the video into n segments of equal length and randomly sample one", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:317-344" + }, + "5433": { + "file_id": 450, + "content": "This code defines a class for skeleton processing in PaddleVideo. If joints are enabled, it returns the results as is. If bones are enabled, it calculates bone data by subtracting corresponding bone vertices from each other. If motion is enabled, it sets the last frame's coordinates to 0. The UniformSampleFrames pipeline uniformly samples frames from a video by dividing it into equal segments and randomly selecting one from each segment.", + "type": "comment" + }, + "5434": { + "file_id": 450, + "content": " frame from each segment. To make the testing results reproducible, a\n random seed is set during testing, to make the sampling results\n deterministic.\n Required keys are \"total_frames\", \"start_index\" , added or modified keys\n are \"frame_inds\", \"clip_len\", \"frame_interval\" and \"num_clips\".\n Args:\n clip_len (int): Frames of each sampled output clip.\n num_clips (int): Number of clips to be sampled. Default: 1.\n test_mode (bool): Store True when building test or validation dataset.\n Default: False.\n seed (int): The random seed used during test time. Default: 255.\n \"\"\"\n def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255):\n self.clip_len = clip_len\n self.num_clips = num_clips\n self.test_mode = test_mode\n self.seed = seed\n def _get_train_clips(self, num_frames, clip_len):\n \"\"\"Uniformly sample indices for training clips.\n Args:\n num_frames (int): The number of frames.\n clip_len (int): The length of the clip.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:345-372" + }, + "5435": { + "file_id": 450, + "content": "This code snippet defines a class with an __init__ method that initializes the clip_len, num_clips, test_mode, and seed. The _get_train_clips method uniformly samples indices for training clips based on the given number of frames and clip length. 
This is used in PaddleVideo for loading and processing video data.", + "type": "comment" + }, + "5436": { + "file_id": 450, + "content": " \"\"\"\n assert self.num_clips == 1\n if num_frames < clip_len:\n start = np.random.randint(0, num_frames)\n inds = np.arange(start, start + clip_len)\n elif clip_len <= num_frames < 2 * clip_len:\n basic = np.arange(clip_len)\n inds = np.random.choice(\n clip_len + 1, num_frames - clip_len, replace=False)\n offset = np.zeros(clip_len + 1, dtype=np.int64)\n offset[inds] = 1\n offset = np.cumsum(offset)\n inds = basic + offset[:-1]\n else:\n bids = np.array(\n [i * num_frames // clip_len for i in range(clip_len + 1)])\n bsize = np.diff(bids)\n bst = bids[:clip_len]\n offset = np.random.randint(bsize)\n inds = bst + offset\n return inds\n def _get_test_clips(self, num_frames, clip_len):\n \"\"\"Uniformly sample indices for testing clips.\n Args:\n num_frames (int): The number of frames.\n clip_len (int): The length of the clip.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:373-401" + }, + "5437": { + "file_id": 450, + "content": "This code determines the indices for a skeleton clip from a given number of frames and clip length. It handles three scenarios: when the number of frames is less than the clip length, between the clip length and twice the clip length, or more than twice the clip length. The function returns the sampled indices accordingly.", + "type": "comment" + }, + "5438": { + "file_id": 450, + "content": " \"\"\"\n np.random.seed(self.seed)\n if num_frames < clip_len:\n # Then we use a simple strategy\n if num_frames < self.num_clips:\n start_inds = list(range(self.num_clips))\n else:\n start_inds = [\n i * num_frames // self.num_clips\n for i in range(self.num_clips)\n ]\n inds = np.concatenate(\n [np.arange(i, i + clip_len) for i in start_inds])\n elif clip_len <= num_frames < clip_len * 2:\n all_inds = []\n for i in range(self.num_clips):\n basic = np.arange(clip_len)\n inds = np.random.choice(\n clip_len + 1, num_frames - clip_len, replace=False)\n offset = np.zeros(clip_len + 1, dtype=np.int64)\n offset[inds] = 1\n offset = np.cumsum(offset)\n inds = basic + offset[:-1]\n all_inds.append(inds)\n inds = np.concatenate(all_inds)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:402-427" + }, + "5439": { + "file_id": 450, + "content": "The code handles the random selection of frame indices for a given clip length and total number of frames. It considers three scenarios: when there are fewer frames than the clip length, exactly equal to the clip length, or between the clip length and twice the clip length. 
It uses list comprehension and numpy functions to generate the desired indices for each case.", + "type": "comment" + }, + "5440": { + "file_id": 450, + "content": " else:\n bids = np.array(\n [i * num_frames // clip_len for i in range(clip_len + 1)])\n bsize = np.diff(bids)\n bst = bids[:clip_len]\n all_inds = []\n for i in range(self.num_clips):\n offset = np.random.randint(bsize)\n all_inds.append(bst + offset)\n inds = np.concatenate(all_inds)\n return inds\n def __call__(self, results):\n num_frames = results['total_frames']\n if self.test_mode:\n inds = self._get_test_clips(num_frames, self.clip_len)\n else:\n inds = self._get_train_clips(num_frames, self.clip_len)\n inds = np.mod(inds, num_frames)\n start_index = results['start_index']\n inds = inds + start_index\n results['frame_inds'] = inds.astype(np.int)\n results['clip_len'] = self.clip_len\n results['frame_interval'] = None\n results['num_clips'] = self.num_clips\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:428-459" + }, + "5441": { + "file_id": 450, + "content": "This code defines a class for generating frame indices for skeleton data in PaddleVideo. It has methods to generate clips for training or testing, and returns the generated clips as results. The class takes parameters such as clip length, number of clips, total frames, etc. It ensures that the returned frame indices are within the range of total frames and converts them to integers.", + "type": "comment" + }, + "5442": { + "file_id": 450, + "content": " f'clip_len={self.clip_len}, '\n f'num_clips={self.num_clips}, '\n f'test_mode={self.test_mode}, '\n f'seed={self.seed})')\n return repr_str\n@PIPELINES.register()\nclass PoseDecode:\n \"\"\"Load and decode pose with given indices.\n Required keys are \"keypoint\", \"frame_inds\" (optional), \"keypoint_score\"\n (optional), added or modified keys are \"keypoint\", \"keypoint_score\" (if\n applicable).\n \"\"\"\n @staticmethod\n def _load_kp(kp, frame_inds):\n \"\"\"Load keypoints given frame indices.\n Args:\n kp (np.ndarray): The keypoint coordinates.\n frame_inds (np.ndarray): The frame indices.\n \"\"\"\n return [x[frame_inds].astype(np.float32) for x in kp]\n @staticmethod\n def _load_kpscore(kpscore, frame_inds):\n \"\"\"Load keypoint scores given frame indices.\n Args:\n kpscore (np.ndarray): The confidence scores of keypoints.\n frame_inds (np.ndarray): The frame indices.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:460-493" + }, + "5443": { + "file_id": 450, + "content": "This code defines a PoseDecode class that loads and decodes pose with given indices. It requires \"keypoint\" and \"frame_inds\" keys, and optionally \"keypoint_score\". The _load_kp static method loads keypoint coordinates based on frame indices, while the _load_kpscore method loads keypoint scores with frame indices. Both methods return arrays of float32 values for keypoint coordinates or scores respectively. 
The class is registered in the PIPELINES registry for later use.", + "type": "comment" + }, + "5444": { + "file_id": 450, + "content": "        \"\"\"\n        return [x[frame_inds].astype(np.float32) for x in kpscore]\n    def __call__(self, results):\n        if 'frame_inds' not in results:\n            results['frame_inds'] = np.arange(results['total_frames'])\n        if results['frame_inds'].ndim != 1:\n            results['frame_inds'] = np.squeeze(results['frame_inds'])\n        offset = results.get('offset', 0)\n        frame_inds = results['frame_inds'] + offset\n        if 'keypoint_score' in results:\n            kpscore = results['keypoint_score']\n            results['keypoint_score'] = kpscore[:, frame_inds].astype(\n                np.float32)\n        if 'keypoint' in results:\n            results['keypoint'] = results['keypoint'][:, frame_inds].astype(\n                np.float32)\n        return results\n    def __repr__(self):\n        repr_str = f'{self.__class__.__name__}()'\n        return repr_str\n@PIPELINES.register()\nclass PoseCompact:\n    \"\"\"Convert the coordinates of keypoints to make it more compact.\n    Specifically, it first find a tight bounding box that surrounds all joints", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:494-528" + }, + "5445": { + "file_id": 450, + "content": "This code defines a PoseCompact class, which is a pipeline for converting keypoint coordinates into a more compact representation. It takes results from previous steps and processes 'keypoint_score' and 'keypoint' keys based on frame indices. If present, it extracts the keypoint scores and keypoint coordinates for the specified frames and converts them to float32 type.", + "type": "comment" + }, + "5446": { + "file_id": 450, + "content": " in each frame, then we expand the tight box by a given padding ratio. For\n    example, if 'padding == 0.25', then the expanded box has unchanged center,\n    and 1.25x width and height.\n    Required keys in results are \"img_shape\", \"keypoint\", add or modified keys\n    are \"img_shape\", \"keypoint\", \"crop_quadruple\".\n    Args:\n        padding (float): The padding size. Default: 0.25.\n        threshold (int): The threshold for the tight bounding box. If the width\n            or height of the tight bounding box is smaller than the threshold,\n            we do not perform the compact operation. Default: 10.\n        hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded\n            box. Float indicates the specific ratio and tuple indicates a\n            ratio range. If set as None, it means there is no requirement on\n            hw_ratio. Default: None.\n        allow_imgpad (bool): Whether to allow expanding the box outside the\n            image to meet the hw_ratio requirement. Default: True.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:529-546" + }, + "5447": { + "file_id": 450, + "content": "This function expands tight bounding boxes by a given padding ratio and adds new key \"crop_quadruple\". It requires keys \"img_shape\", \"keypoint\" and may modify them. The threshold determines if the box is too small to expand, hw_ratio sets the box aspect ratio (optional), and allow_imgpad allows expanding outside image for hw_ratio (optional). 
Default values are padding=0.25, threshold=10, hw_ratio=None, allow_imgpad=True.", + "type": "comment" + }, + "5448": { + "file_id": 450, + "content": " Returns:\n type: Description of returned object.\n \"\"\"\n def __init__(self,\n padding=0.25,\n threshold=10,\n hw_ratio=None,\n allow_imgpad=True):\n self.padding = padding\n self.threshold = threshold\n if hw_ratio is not None:\n hw_ratio = _pair(hw_ratio)\n self.hw_ratio = hw_ratio\n self.allow_imgpad = allow_imgpad\n assert self.padding >= 0\n def _combine_quadruple(self, a, b):\n return (a[0] + a[2] * b[0], a[1] + a[3] * b[1], a[2] * b[2],\n a[3] * b[3])\n def __call__(self, results):\n img_shape = results['img_shape']\n h, w = img_shape\n kp = results['keypoint']\n # Make NaN zero\n kp[np.isnan(kp)] = 0.\n kp_x = kp[..., 0]\n kp_y = kp[..., 1]\n min_x = np.min(kp_x[kp_x != 0], initial=np.Inf)\n min_y = np.min(kp_y[kp_y != 0], initial=np.Inf)\n max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf)\n max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:548-585" + }, + "5449": { + "file_id": 450, + "content": "This code is initializing a class for skeleton pipeline. It takes parameters such as padding, threshold, hw_ratio, and allow_imgpad. The class also has methods to combine quadruples, apply transformations, and handle keypoints in the image. The code performs various operations like making NaN values zero, finding minimum and maximum keypoint coordinates, and applying padding if needed.", + "type": "comment" + }, + "5450": { + "file_id": 450, + "content": " # The compact area is too small\n if max_x - min_x < self.threshold or max_y - min_y < self.threshold:\n return results\n center = ((max_x + min_x) / 2, (max_y + min_y) / 2)\n half_width = (max_x - min_x) / 2 * (1 + self.padding)\n half_height = (max_y - min_y) / 2 * (1 + self.padding)\n if self.hw_ratio is not None:\n half_height = max(self.hw_ratio[0] * half_width, half_height)\n half_width = max(1 / self.hw_ratio[1] * half_height, half_width)\n min_x, max_x = center[0] - half_width, center[0] + half_width\n min_y, max_y = center[1] - half_height, center[1] + half_height\n # hot update\n if not self.allow_imgpad:\n min_x, min_y = int(max(0, min_x)), int(max(0, min_y))\n max_x, max_y = int(min(w, max_x)), int(min(h, max_y))\n else:\n min_x, min_y = int(min_x), int(min_y)\n max_x, max_y = int(max_x), int(max_y)\n kp_x[kp_x != 0] -= min_x\n kp_y[kp_y != 0] -= min_y", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:587-611" + }, + "5451": { + "file_id": 450, + "content": "This code checks if the compact area is too small and adjusts the bounding box parameters accordingly. It calculates the center, half-width, and half-height of the bounding box. If the aspect ratio should be maintained (hw_ratio), it ensures that by adjusting half_height based on half_width. The code then updates the minimum and maximum x and y values within the constraints of the image's width and height, unless allow_imgpad is True, in which case it doesn't limit the bounding box size. 
Finally, it adjusts the x and y coordinates of the keypoints by subtracting the new min_x and min_y to maintain their relative positions within the adjusted bounding box.", + "type": "comment" + }, + "5452": { + "file_id": 450, + "content": " new_shape = (max_y - min_y, max_x - min_x)\n results['img_shape'] = new_shape\n # the order is x, y, w, h (in [0, 1]), a tuple\n crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.))\n new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w,\n (max_y - min_y) / h)\n crop_quadruple = self._combine_quadruple(crop_quadruple,\n new_crop_quadruple)\n results['crop_quadruple'] = crop_quadruple\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}(padding={self.padding}, '\n f'threshold={self.threshold}, '\n f'hw_ratio={self.hw_ratio}, '\n f'allow_imgpad={self.allow_imgpad})')\n return repr_str\nclass CropBase:\n @staticmethod\n def _crop_kps(kps, crop_bbox):\n return kps - crop_bbox[:2]\n @staticmethod\n def _crop_imgs(imgs, crop_bbox):\n x1, y1, x2, y2 = crop_bbox", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:613-640" + }, + "5453": { + "file_id": 450, + "content": "This code segment is part of a pipeline for skeleton detection in images. It calculates the new image shape based on the cropping region, updates the 'crop_quadruple' in the results dictionary, and defines two static methods for cropping keypoints (_crop_kps) and cropped images (_crop_imgs). The CropBase class provides functionality to crop keypoints based on the provided crop region.", + "type": "comment" + }, + "5454": { + "file_id": 450, + "content": " return [img[y1:y2, x1:x2] for img in imgs]\n @staticmethod\n def _box_crop(box, crop_bbox):\n \"\"\"Crop the bounding boxes according to the crop_bbox.\n Args:\n box (np.ndarray): The bounding boxes.\n crop_bbox(np.ndarray): The bbox used to crop the original image.\n \"\"\"\n x1, y1, x2, y2 = crop_bbox\n img_w, img_h = x2 - x1, y2 - y1\n box_ = box.copy()\n box_[..., 0::2] = np.clip(box[..., 0::2] - x1, 0, img_w - 1)\n box_[..., 1::2] = np.clip(box[..., 1::2] - y1, 0, img_h - 1)\n return box_\n def _all_box_crop(self, results, crop_bbox):\n \"\"\"Crop the gt_bboxes and proposals in results according to crop_bbox.\n Args:\n results (dict): All information about the sample, which contain\n 'gt_bboxes' and 'proposals' (optional).\n crop_bbox(np.ndarray): The bbox used to crop the original image.\n \"\"\"\n results['gt_bboxes'] = self._box_crop(results['gt_bboxes'], crop_bbox)\n if 'proposals' in results and results['proposals'] is not None:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:641-669" + }, + "5455": { + "file_id": 450, + "content": "This code defines a function `_all_box_crop` that crops the gt_bboxes and proposals in results according to the crop_bbox. It first applies the `_box_crop` function to 'gt_bboxes', then if 'proposals' are present and not None, it also applies the `_box_crop` function to them. 
The `_box_crop` function crops bounding boxes by subtracting the x1, y1 coordinates from their x and y values respectively, ensuring they fall within the new image dimensions.", + "type": "comment" + }, + "5456": { + "file_id": 450, + "content": " assert results['proposals'].shape[1] == 4\n results['proposals'] = self._box_crop(results['proposals'],\n crop_bbox)\n return results\n def __call__(self, results):\n raise NotImplementedError\n@PIPELINES.register()\nclass RandomResizedCrop_V2(CropBase):\n \"\"\"Random crop that specifics the area and height-weight ratio range.\n Required keys in results are \"img_shape\", \"crop_bbox\", \"imgs\" (optional),\n \"keypoint\" (optional), added or modified keys are \"imgs\", \"keypoint\",\n \"crop_bbox\" and \"lazy\"; Required keys in \"lazy\" are \"flip\", \"crop_bbox\",\n added or modified key is \"crop_bbox\".\n Args:\n area_range (Tuple[float]): The candidate area scales range of\n output cropped images. Default: (0.08, 1.0).\n aspect_ratio_range (Tuple[float]): The candidate aspect ratio range of\n output cropped images. Default: (3 / 4, 4 / 3).\n lazy (bool): Determine whether to apply lazy operation. Default: False.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:670-693" + }, + "5457": { + "file_id": 450, + "content": "This code defines a RandomResizedCrop_V2 pipeline that randomly crops an image to a specified area and height-weight ratio range. The required keys in results are \"img_shape\", \"crop_bbox\", and \"imgs\" (optional). The modified keys are \"imgs\", \"keypoint\", \"crop_bbox\", and \"lazy\". The required keys in \"lazy\" are \"flip\", \"crop_bbox\". It provides an area range and aspect ratio range for the output cropped images.", + "type": "comment" + }, + "5458": { + "file_id": 450, + "content": " \"\"\"\n def __init__(self,\n area_range=(0.08, 1.0),\n aspect_ratio_range=(3 / 4, 4 / 3),\n lazy=False):\n self.area_range = eval(area_range)\n self.aspect_ratio_range = aspect_ratio_range\n self.lazy = lazy\n if not is_tuple_of(self.area_range, float):\n raise TypeError(f'Area_range must be a tuple of float, '\n f'but got {type(area_range)}')\n if not is_tuple_of(self.aspect_ratio_range, float):\n raise TypeError(f'Aspect_ratio_range must be a tuple of float, '\n f'but got {type(aspect_ratio_range)}')\n @staticmethod\n def get_crop_bbox(img_shape,\n area_range,\n aspect_ratio_range,\n max_attempts=10):\n \"\"\"Get a crop bbox given the area range and aspect ratio range.\n Args:\n img_shape (Tuple[int]): Image shape\n area_range (Tuple[float]): The candidate area scales range of", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:694-719" + }, + "5459": { + "file_id": 450, + "content": "This code initializes a class with area_range, aspect_ratio_range, and lazy parameters. It checks if the ranges are tuples of floats, and raises TypeError if not. The get_crop_bbox static method takes image shape, area range, aspect ratio range, and max attempts as arguments to return a crop bounding box.", + "type": "comment" + }, + "5460": { + "file_id": 450, + "content": " output cropped images. Default: (0.08, 1.0).\n aspect_ratio_range (Tuple[float]): The candidate aspect\n ratio range of output cropped images. Default: (3 / 4, 4 / 3).\n max_attempts (int): The maximum of attempts. Default: 10.\n max_attempts (int): Max attempts times to generate random candidate\n bounding box. 
If it doesn't qualified one, the center bounding\n box will be used.\n Returns:\n (list[int]) A random crop bbox within the area range and aspect\n ratio range.\n \"\"\"\n assert 0 < area_range[0] <= area_range[1] <= 1\n assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1]\n img_h, img_w = img_shape\n area = img_h * img_w\n min_ar, max_ar = aspect_ratio_range\n aspect_ratios = np.exp(\n np.random.uniform(\n np.log(min_ar), np.log(max_ar), size=max_attempts))\n target_areas = np.random.uniform(*area_range, size=max_attempts) * area", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:720-741" + }, + "5461": { + "file_id": 450, + "content": "This function generates a random crop bounding box within a specified area range and aspect ratio range. It takes image shape, area range, and aspect ratio range as input parameters. The function first checks the validity of the ranges, then calculates the image's total area, minimum and maximum aspect ratios from the aspect ratio range. It uses numpy to generate a list of random candidate bounding box aspect ratios and target areas within the specified ranges.", + "type": "comment" + }, + "5462": { + "file_id": 450, + "content": " candidate_crop_w = np.round(np.sqrt(\n target_areas * aspect_ratios)).astype(np.int32)\n candidate_crop_h = np.round(np.sqrt(\n target_areas / aspect_ratios)).astype(np.int32)\n for i in range(max_attempts):\n crop_w = candidate_crop_w[i]\n crop_h = candidate_crop_h[i]\n if crop_h <= img_h and crop_w <= img_w:\n x_offset = random.randint(0, img_w - crop_w)\n y_offset = random.randint(0, img_h - crop_h)\n return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h\n # Fallback\n crop_size = min(img_h, img_w)\n x_offset = (img_w - crop_size) // 2\n y_offset = (img_h - crop_size) // 2\n return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size\n def __call__(self, results):\n \"\"\"Performs the RandomResizeCrop augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:742-766" + }, + "5463": { + "file_id": 450, + "content": "This code calculates random crop sizes based on the aspect ratios and target areas, then attempts to find a suitable crop region within the image. If a suitable crop is found, it returns the offsets and dimensions of that crop. If not, it falls back to a centered crop with minimum size. 
This function is called as part of a pipeline for image augmentation.", + "type": "comment" + }, + "5464": { + "file_id": 450, + "content": " \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'keypoint' in results:\n assert not self.lazy, ('Keypoint Augmentations are not compatible '\n 'with lazy == True')\n img_h, img_w = results['img_shape']\n left, top, right, bottom = self.get_crop_bbox(\n (img_h, img_w), self.area_range, self.aspect_ratio_range)\n new_h, new_w = bottom - top, right - left\n if 'crop_quadruple' not in results:\n results['crop_quadruple'] = np.array(\n [0, 0, 1, 1], # x, y, w, h\n dtype=np.float32)\n x_ratio, y_ratio = left / img_w, top / img_h\n w_ratio, h_ratio = new_w / img_w, new_h / img_h\n old_crop_quadruple = results['crop_quadruple']\n old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]\n old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]\n new_crop_quadruple = [\n old_x_ratio + x_ratio * old_w_ratio,", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:767-791" + }, + "5465": { + "file_id": 450, + "content": "This code initializes and adjusts the crop quadruple of an image based on its aspect ratio, area range, and size. It ensures that the 'keypoint' is not applied if lazy augmentation is enabled. The crop quadruple contains x, y, width, and height values representing the image's cropping region, which are updated according to the original image's dimensions and the desired aspect ratio range.", + "type": "comment" + }, + "5466": { + "file_id": 450, + "content": " old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,\n h_ratio * old_h_ratio\n ]\n results['crop_quadruple'] = np.array(\n new_crop_quadruple, dtype=np.float32)\n crop_bbox = np.array([left, top, right, bottom])\n results['crop_bbox'] = crop_bbox\n results['img_shape'] = (new_h, new_w)\n if not self.lazy:\n if 'keypoint' in results:\n results['keypoint'] = self._crop_kps(results['keypoint'],\n crop_bbox)\n if 'imgs' in results:\n results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox)\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')\n # record crop_bbox in lazyop dict to ensure only crop once in Fuse\n lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']\n left = left * (lazy_right - lazy_left) / img_w", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:792-815" + }, + "5467": { + "file_id": 450, + "content": "This code performs cropping on images and bboxes based on given ratios. It updates the results dictionary with cropped quadruple, crop bbox, and new image shape. If not in lazy mode, it crops keypoints and images using these values. 
If in lazy mode, it stores the left, top, right, and bottom crop positions for later fusion.", + "type": "comment" + }, + "5468": { + "file_id": 450, + "content": " right = right * (lazy_right - lazy_left) / img_w\n top = top * (lazy_bottom - lazy_top) / img_h\n bottom = bottom * (lazy_bottom - lazy_top) / img_h\n lazyop['crop_bbox'] = np.array(\n [(lazy_left + left), (lazy_top + top), (lazy_left + right),\n (lazy_top + bottom)],\n dtype=np.float32)\n if 'gt_bboxes' in results:\n assert not self.lazy\n results = self._all_box_crop(results, results['crop_bbox'])\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'area_range={self.area_range}, '\n f'aspect_ratio_range={self.aspect_ratio_range}, '\n f'lazy={self.lazy})')\n return repr_str\ndef is_seq_of(seq, expected_type, seq_type=None):\n \"\"\"Check whether it is a sequence of some type.\n Args:\n seq (Sequence): The sequence to be checked.\n expected_type (type): Expected type of sequence items.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:816-843" + }, + "5469": { + "file_id": 450, + "content": "This code is a pipeline for skeleton processing in PaddleVideo. It scales the bounding box and applies it to the lazy operation, performs cropping based on the new bounding box, and if 'gt_bboxes' is present in results, it crops all other boxes accordingly. The class also has a __repr__ method for string representation. There is also a helper function, is_seq_of, which checks whether a sequence contains items of a specific type.", + "type": "comment" + }, + "5470": { + "file_id": 450, + "content": " seq_type (type, optional): Expected sequence type.\n Returns:\n bool: Whether the sequence is valid.\n \"\"\"\n if seq_type is None:\n exp_seq_type = abc.Sequence\n else:\n assert isinstance(seq_type, type)\n exp_seq_type = seq_type\n if not isinstance(seq, exp_seq_type):\n return False\n for item in seq:\n if not isinstance(item, expected_type):\n return False\n return True\ndef is_tuple_of(seq, expected_type):\n \"\"\"Check whether it is a tuple of some type.\n A partial method of :func:`is_seq_of`.\n \"\"\"\n return is_seq_of(seq, expected_type, seq_type=tuple)\n@PIPELINES.register()\nclass CenterCrop_V2(CropBase):\n \"\"\"Crop the center area from images.\n Required keys are \"img_shape\", \"imgs\" (optional), \"keypoint\" (optional),\n added or modified keys are \"imgs\", \"keypoint\", \"crop_bbox\", \"lazy\" and\n \"img_shape\". Required keys in \"lazy\" is \"crop_bbox\", added or modified key\n is \"crop_bbox\".\n Args:\n crop_size (int | tuple[int]): (w, h) of crop size.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:844-880" + }, + "5471": { + "file_id": 450, + "content": "The code defines a function `is_seq_of` that checks if a given sequence is of the expected type. It also defines two partial methods, `is_tuple_of`, which uses `is_seq_of` to check if a sequence is a tuple of a certain type. Lastly, it registers a new pipeline `CenterCrop_V2` for cropping the center area from images with required keys \"img_shape\", \"imgs\" (optional), and modified or added keys as mentioned in the function description.", + "type": "comment" + }, + "5472": { + "file_id": 450, + "content": " lazy (bool): Determine whether to apply lazy operation. 
Default: False.\n \"\"\"\n def __init__(self, crop_size, lazy=False):\n self.crop_size = _pair(crop_size)\n self.lazy = lazy\n if not is_tuple_of(self.crop_size, int):\n raise TypeError(f'Crop_size must be int or tuple of int, '\n f'but got {type(crop_size)}')\n def __call__(self, results):\n \"\"\"Performs the CenterCrop augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'keypoint' in results:\n assert not self.lazy, ('Keypoint Augmentations are not compatible '\n 'with lazy == True')\n img_h, img_w = results['img_shape']\n crop_w, crop_h = self.crop_size\n left = (img_w - crop_w) // 2\n top = (img_h - crop_h) // 2\n right = left + crop_w\n bottom = top + crop_h", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:881-909" + }, + "5473": { + "file_id": 450, + "content": "This code defines a class for CenterCrop augmentation. It initializes with crop size and lazy operation flag, checks the validity of input parameters, performs CenterCrop operation on images, and handles keypoint augmentations if present in results dictionary.", + "type": "comment" + }, + "5474": { + "file_id": 450, + "content": " new_h, new_w = bottom - top, right - left\n crop_bbox = np.array([left, top, right, bottom])\n results['crop_bbox'] = crop_bbox\n results['img_shape'] = (new_h, new_w)\n if 'crop_quadruple' not in results:\n results['crop_quadruple'] = np.array(\n [0, 0, 1, 1], # x, y, w, h\n dtype=np.float32)\n x_ratio, y_ratio = left / img_w, top / img_h\n w_ratio, h_ratio = new_w / img_w, new_h / img_h\n old_crop_quadruple = results['crop_quadruple']\n old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]\n old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]\n new_crop_quadruple = [\n old_x_ratio + x_ratio * old_w_ratio,\n old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,\n h_ratio * old_h_ratio\n ]\n results['crop_quadruple'] = np.array(\n new_crop_quadruple, dtype=np.float32)\n if not self.lazy:\n if 'keypoint' in results:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:910-936" + }, + "5475": { + "file_id": 450, + "content": "This code calculates the new image shape and crop box coordinates based on the provided top, left, right, and bottom values. It then updates the 'crop_bbox' and 'img_shape' in the results dictionary. If 'crop_quadruple' is not already present in the results, it creates and appends it. The code then calculates new crop quadruple coordinates by adjusting the old ones with the ratios of the original and new image widths and heights. 
Finally, if 'keypoint' is present in the results, the code proceeds further (presumably for lazy mode).", + "type": "comment" + }, + "5476": { + "file_id": 450, + "content": " results['keypoint'] = self._crop_kps(results['keypoint'],\n crop_bbox)\n if 'imgs' in results:\n results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox)\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')\n # record crop_bbox in lazyop dict to ensure only crop once in Fuse\n lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']\n left = left * (lazy_right - lazy_left) / img_w\n right = right * (lazy_right - lazy_left) / img_w\n top = top * (lazy_bottom - lazy_top) / img_h\n bottom = bottom * (lazy_bottom - lazy_top) / img_h\n lazyop['crop_bbox'] = np.array(\n [(lazy_left + left), (lazy_top + top), (lazy_left + right),\n (lazy_top + bottom)],\n dtype=np.float32)\n if 'gt_bboxes' in results:", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:937-957" + }, + "5477": { + "file_id": 450, + "content": "This code is handling the case where 'lazyop' contains a flip operation. It records crop_bbox in lazyop to ensure only one crop operation is performed in Fuse. If 'gt_bboxes' is present in results, it indicates ground truth bbox information.", + "type": "comment" + }, + "5478": { + "file_id": 450, + "content": " assert not self.lazy\n results = self._all_box_crop(results, results['crop_bbox'])\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}(crop_size={self.crop_size}, '\n f'lazy={self.lazy})')\n return repr_str\n@PIPELINES.register()\nclass Flip_V2:\n \"\"\"Flip the input images with a probability.\n Reverse the order of elements in the given imgs with a specific direction.\n The shape of the imgs is preserved, but the elements are reordered.\n Required keys are \"img_shape\", \"modality\", \"imgs\" (optional), \"keypoint\"\n (optional), added or modified keys are \"imgs\", \"keypoint\", \"lazy\" and\n \"flip_direction\". Required keys in \"lazy\" is None, added or modified key\n are \"flip\" and \"flip_direction\". The Flip augmentation should be placed\n after any cropping / reshaping augmentations, to make sure crop_quadruple\n is calculated properly.\n Args:\n flip_ratio (float): Probability of implementing flip. Default: 0.5.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:958-984" + }, + "5479": { + "file_id": 450, + "content": "This code snippet is registering a pipeline class called \"Flip_V2\". The Flip_V2 class flips the input images with a probability and reverses the order of elements in the given imgs with a specific direction. It requires keys such as \"img_shape\", \"modality\", and \"imgs\" while adding or modifying keys like \"imgs\", \"keypoint\", \"lazy\", and \"flip_direction\". Flip_V2 should be placed after cropping/reshaping augmentations to ensure crop_quadruple is calculated properly. The flip ratio, which determines the probability of implementing flip, is set to 0.5 by default.", + "type": "comment" + }, + "5480": { + "file_id": 450, + "content": " direction (str): Flip imgs horizontally or vertically. Options are\n \"horizontal\" | \"vertical\". Default: \"horizontal\".\n flip_label_map (Dict[int, int] | None): Transform the label of the\n flipped image with the specific label. 
Default: None.\n left_kp (list[int]): Indexes of left keypoints, used to flip keypoints.\n Default: None.\n right_kp (list[ind]): Indexes of right keypoints, used to flip\n keypoints. Default: None.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n _directions = ['horizontal', 'vertical']\n def __init__(self,\n flip_ratio=0.5,\n direction='horizontal',\n flip_label_map=None,\n left_kp=None,\n right_kp=None,\n lazy=False):\n if direction not in self._directions:\n raise ValueError(f'Direction {direction} is not supported. '\n f'Currently support ones are {self._directions}')", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:985-1006" + }, + "5481": { + "file_id": 450, + "content": "This code snippet belongs to the Flip_V2 pipeline class. It has parameters for direction, flip_label_map, left_kp, right_kp, and lazy. The direction parameter can be either 'horizontal' or 'vertical'. flip_label_map is a dictionary used to transform the label of flipped images. left_kp and right_kp are indexes used for flipping keypoints. lazy determines whether to apply lazy operations, default set to False. The constructor checks that the given direction is within ['horizontal', 'vertical'].", + "type": "comment" + }, + "5482": { + "file_id": 450, + "content": " self.flip_ratio = flip_ratio\n self.direction = direction\n self.flip_label_map = flip_label_map\n self.left_kp = left_kp\n self.right_kp = right_kp\n self.lazy = lazy\n def _flip_imgs(self, imgs, modality):\n _ = [imflip_(img, self.direction) for img in imgs]\n lt = len(imgs)\n if modality == 'Flow':\n # The 1st frame of each 2 frames is flow-x\n for i in range(0, lt, 2):\n imgs[i] = iminvert(imgs[i])\n return imgs\n def _flip_kps(self, kps, kpscores, img_width):\n kp_x = kps[..., 0]\n kp_x[kp_x != 0] = img_width - kp_x[kp_x != 0]\n new_order = list(range(kps.shape[2]))\n if self.left_kp is not None and self.right_kp is not None:\n for left, right in zip(self.left_kp, self.right_kp):\n new_order[left] = right\n new_order[right] = left\n kps = kps[:, :, new_order]\n if kpscores is not None:\n kpscores = kpscores[:, :, new_order]\n return kps, kpscores", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1007-1034" + }, + "5483": { + "file_id": 450, + "content": "This code is a part of the skeleton_pipeline module in PaddleVideo. It initializes parameters such as flip_ratio, direction, and flip_label_map, and defines a function _flip_imgs for image flipping and another function _flip_kps for keypoint flipping based on the direction provided. 
The code also includes conditions to handle flow images specifically by inverting the first frame of each two frames.", + "type": "comment" + }, + "5484": { + "file_id": 450, + "content": " @staticmethod\n def _box_flip(box, img_width):\n \"\"\"Flip the bounding boxes given the width of the image.\n Args:\n box (np.ndarray): The bounding boxes.\n img_width (int): The img width.\n \"\"\"\n box_ = box.copy()\n box_[..., 0::4] = img_width - box[..., 2::4]\n box_[..., 2::4] = img_width - box[..., 0::4]\n return box_\n def __call__(self, results):\n \"\"\"Performs the Flip augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'keypoint' in results:\n assert not self.lazy, ('Keypoint Augmentations are not compatible '\n 'with lazy == True')\n assert self.direction == 'horizontal', (\n 'Only horizontal flips are'\n 'supported for human keypoints')\n modality = results['modality']\n if modality == 'Flow':", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1036-1065" + }, + "5485": { + "file_id": 450, + "content": "This code snippet is from the PaddleVideo library's skeleton pipeline. It defines a function for flipping bounding boxes and a method that applies horizontal flip augmentation to images, unless the image contains keypoints where only horizontal flip is supported. The code checks if the modality of the image is 'Flow'.", + "type": "comment" + }, + "5486": { + "file_id": 450, + "content": " assert self.direction == 'horizontal'\n flip = np.random.rand() < self.flip_ratio\n results['flip'] = flip\n results['flip_direction'] = self.direction\n img_width = results['img_shape'][1]\n if self.flip_label_map is not None and flip:\n results['label'] = self.flip_label_map.get(results['label'],\n results['label'])\n if not self.lazy:\n if flip:\n if 'imgs' in results:\n results['imgs'] = self._flip_imgs(results['imgs'], modality)\n if 'keypoint' in results:\n kp = results['keypoint']\n kpscore = results.get('keypoint_score', None)\n kp, kpscore = self._flip_kps(kp, kpscore, img_width)\n results['keypoint'] = kp\n if 'keypoint_score' in results:\n results['keypoint_score'] = kpscore\n else:\n lazyop = results['lazy']", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1066-1090" + }, + "5487": { + "file_id": 450, + "content": "The code checks if the direction is horizontal, flips the image randomly based on a flip ratio, and updates results accordingly. If the flip label map is not None and flip occurs, it updates the label in the results. If lazy is not set, it flips images and keypoints if necessary, updating results. 
Otherwise, it stores the operation for later execution.", + "type": "comment" + }, + "5488": { + "file_id": 450, + "content": " if lazyop['flip']:\n raise NotImplementedError('Use one Flip please')\n lazyop['flip'] = flip\n lazyop['flip_direction'] = self.direction\n if 'gt_bboxes' in results and flip:\n assert not self.lazy and self.direction == 'horizontal'\n width = results['img_shape'][1]\n results['gt_bboxes'] = self._box_flip(results['gt_bboxes'], width)\n if 'proposals' in results and results['proposals'] is not None:\n assert results['proposals'].shape[1] == 4\n results['proposals'] = self._box_flip(results['proposals'],\n width)\n return results\n def __repr__(self):\n repr_str = (\n f'{self.__class__.__name__}('\n f'flip_ratio={self.flip_ratio}, direction={self.direction}, '\n f'flip_label_map={self.flip_label_map}, lazy={self.lazy})')\n return repr_str\n@PIPELINES.register()\nclass FormatShape:\n \"\"\"Format final imgs shape to the given input_format.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1091-1117" + }, + "5489": { + "file_id": 450, + "content": "This code snippet is part of the \"SkeletonPipeline\" class in PaddleVideo. It checks if the 'flip' parameter is set and applies horizontal flipping to the 'gt_bboxes' and 'proposals' (if present) based on the direction specified. It also defines a __repr__ method for the class, providing information about its attributes, and registers a new pipeline called \"FormatShape\" for use in the system.", + "type": "comment" + }, + "5490": { + "file_id": 450, + "content": " Required keys are \"imgs\", \"num_clips\" and \"clip_len\", added or modified\n keys are \"imgs\" and \"input_shape\".\n Args:\n input_format (str): Define the final imgs format.\n collapse (bool): To collpase input_format N... to ... (NCTHW to CTHW,\n etc.) if N is 1. Should be set as True when training and testing\n detectors. Default: False.\n \"\"\"\n def __init__(self, input_format, collapse=False):\n self.input_format = input_format\n self.collapse = collapse\n if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']:\n raise ValueError(\n f'The input format {self.input_format} is invalid.')\n def __call__(self, results):\n \"\"\"Performs the FormatShape formating.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n if not isinstance(results['imgs'], np.ndarray):\n results['imgs'] = np.array(results['imgs'])", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1119-1144" + }, + "5491": { + "file_id": 450, + "content": "The code defines a class for formatting image data in a specific format based on the input_format parameter. The class takes an input_format and a collapse boolean argument, and checks if the input_format is valid (options are 'NCTHW', 'NCHW', 'NCHW_Flow', or 'NPTCHW'). 
If results['imgs'] is not of type np.ndarray, it converts it to np.ndarray.", + "type": "comment" + }, + "5492": { + "file_id": 450, + "content": " imgs = results['imgs']\n # [M x H x W x C]\n # M = 1 * N_crops * N_clips * L\n if self.collapse:\n assert results['num_clips'] == 1\n if self.input_format == 'NCTHW':\n num_clips = results['num_clips']\n clip_len = results['clip_len']\n imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])\n # N_crops x N_clips x L x H x W x C\n imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))\n # N_crops x N_clips x C x L x H x W\n imgs = imgs.reshape((-1, ) + imgs.shape[2:])\n # M' x C x L x H x W\n # M' = N_crops x N_clips\n elif self.input_format == 'NCHW':\n imgs = np.transpose(imgs, (0, 3, 1, 2))\n # M x C x H x W\n elif self.input_format == 'NCHW_Flow':\n num_clips = results['num_clips']\n clip_len = results['clip_len']\n imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])\n # N_crops x N_clips x L x H x W x C", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1145-1169" + }, + "5493": { + "file_id": 450, + "content": "This code is a part of the SkeletonPipeline class in PaddleVideo. It processes images from results and reshapes them based on the input format specified. If input_format is 'NCTHW', it transposes and reshapes the images accordingly, if it's 'NCHW', it only transposes, and if it's 'NCHW_Flow', it also performs reshaping similar to 'NCTHW'. The 'collapse' check ensures that if results have multiple clips, it won't collapse them.", + "type": "comment" + }, + "5494": { + "file_id": 450, + "content": " imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))\n # N_crops x N_clips x L x C x H x W\n imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +\n imgs.shape[4:])\n # M' x C' x H x W\n # M' = N_crops x N_clips\n # C' = L x C\n elif self.input_format == 'NPTCHW':\n num_proposals = results['num_proposals']\n num_clips = results['num_clips']\n clip_len = results['clip_len']\n imgs = imgs.reshape((num_proposals, num_clips * clip_len) +\n imgs.shape[1:])\n # P x M x H x W x C\n # M = N_clips x L\n imgs = np.transpose(imgs, (0, 1, 4, 2, 3))\n # P x M x C x H x W\n if self.collapse:\n assert imgs.shape[0] == 1\n imgs = imgs.squeeze(0)\n results['imgs'] = imgs\n results['input_shape'] = imgs.shape\n return results\n def __repr__(self):\n repr_str = self.__class__.__name__", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1170-1197" + }, + "5495": { + "file_id": 450, + "content": "This code transforms image data into various formats depending on the input format specified. It supports 'NHWC', 'NCHW', and 'NPTCHW' formats. If the collapse parameter is True, it squeezes the first dimension of the images array. 
The results dictionary is updated with the transformed images and their shape.", + "type": "comment" + }, + "5496": { + "file_id": 450, + "content": " repr_str += f\"(input_format='{self.input_format}')\"\n return repr_str\n@PIPELINES.register()\nclass Collect:\n \"\"\"Collect data from the loader relevant to the specific task.\n This keeps the items in ``keys`` as it is, and collect items in\n ``meta_keys`` into a meta item called ``meta_name``.This is usually\n the last stage of the data loader pipeline.\n For example, when keys='imgs', meta_keys=('filename', 'label',\n 'original_shape'), meta_name='img_metas', the results will be a dict with\n keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of\n another dict with keys 'filename', 'label', 'original_shape'.\n Args:\n keys (Sequence[str]): Required keys to be collected.\n meta_name (str): The name of the key that contains meta infomation.\n This key is always populated. Default: \"img_metas\".\n meta_keys (Sequence[str]): Keys that are collected under meta_name.\n The contents of the ``meta_name`` dictionary depends on\n ``meta_keys``.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1198-1220" + }, + "5497": { + "file_id": 450, + "content": "The code defines a Pipeline class called \"Collect\" that collects specific data from the loader relevant to the task. It keeps keys as is and gathers items in meta_keys into a meta item called meta_name, typically used as the last stage of the data loader pipeline. The Collect class takes keys, meta_name, and meta_keys as arguments, with default values provided for meta_name.", + "type": "comment" + }, + "5498": { + "file_id": 450, + "content": " By default this includes:\n - \"filename\": path to the image file\n - \"label\": label of the image file\n - \"original_shape\": original shape of the image as a tuple\n (h, w, c)\n - \"img_shape\": shape of the image input to the network as a tuple\n (h, w, c). Note that images may be zero padded on the\n bottom/right, if the batch tensor is larger than this shape.\n - \"pad_shape\": image shape after padding\n - \"flip_direction\": a str in (\"horiziontal\", \"vertival\") to\n indicate if the image is fliped horizontally or vertically.\n - \"img_norm_cfg\": a dict of normalization information:\n - mean - per channel mean subtraction\n - std - per channel std divisor\n - to_rgb - bool indicating if bgr was converted to rgb\n nested (bool): If set as True, will apply data[x] = [data[x]] to all\n items in data. The arg is added for compatibility. Default: False.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1221-1238" + }, + "5499": { + "file_id": 450, + "content": "This code defines a dictionary containing default parameters for image data loading. It includes fields such as \"filename\", \"label\", \"original_shape\", \"img_shape\", \"pad_shape\", \"flip_direction\", and \"img_norm_cfg\". 
The \"nested\" argument determines whether these parameters should be applied recursively to all items within the data dictionary.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/55.json b/docs/data/55.json new file mode 100644 index 000000000..4e1f533af --- /dev/null +++ b/docs/data/55.json @@ -0,0 +1,545 @@ +{ + "5500": { + "file_id": 450, + "content": " \"\"\"\n def __init__(self,\n keys,\n meta_keys=('filename', 'label', 'original_shape', 'img_shape',\n 'pad_shape', 'flip_direction', 'img_norm_cfg'),\n meta_name='img_metas'):\n self.keys = keys\n self.meta_keys = meta_keys\n self.meta_name = meta_name\n def __call__(self, results):\n \"\"\"Performs the Collect formating.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n data = []\n for key in self.keys:\n data.append(results[key])\n if len(self.meta_keys) != 0:\n meta = {}\n for key in self.meta_keys:\n meta[key] = results[key]\n data.append(meta)\n return data\n def __repr__(self):\n return (f'{self.__class__.__name__}('\n f'keys={self.keys}, meta_keys={self.meta_keys}, '\n f'nested={self.nested})')\n@PIPELINES.register()", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1239-1275" + }, + "5501": { + "file_id": 450, + "content": "The code defines a class that initializes with specified keys and optional metadata keys. It executes a call method to perform Collect formating on input results, appending each key's data into a list and adding any specified metadata as well. The __repr__ method provides a string representation of the object including its attributes. This pipeline component is registered in @PIPELINES using decorator.", + "type": "comment" + }, + "5502": { + "file_id": 450, + "content": "class GeneratePoseTarget:\n \"\"\"Generate pseudo heatmaps based on joint coordinates and confidence.\n Required keys are \"keypoint\", \"img_shape\", \"keypoint_score\" (optional),\n added or modified keys are \"imgs\".\n Args:\n sigma (float): The sigma of the generated gaussian map. Default: 0.6.\n use_score (bool): Use the confidence score of keypoints as the maximum\n of the gaussian maps. Default: True.\n with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True.\n with_limb (bool): Generate pseudo heatmaps for limbs. At least one of\n 'with_kp' and 'with_limb' should be True. Default: False.\n skeletons (tuple[tuple]): The definition of human skeletons.\n Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9),\n (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15),\n (6, 12), (12, 14), (14, 16), (11, 12)),\n which is the definition of COCO-17p skeletons.\n double (bool): Output both original heatmaps and flipped heatmaps.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1276-1294" + }, + "5503": { + "file_id": 450, + "content": "This code defines a class `GeneratePoseTarget` that generates pseudo heatmaps based on joint coordinates and confidence, with optional use of score, limbs, or skeletons. It takes in required keys \"keypoint\", \"img_shape\", and \"keypoint_score\" (optional). It adds or modifies keys as \"imgs\". The class has parameters like sigma, use_score, with_kp, with_limb, skeletons, and double.", + "type": "comment" + }, + "5504": { + "file_id": 450, + "content": " Default: False.\n left_kp (tuple[int]): Indexes of left keypoints, which is used when\n flipping heatmaps. 
Default: (1, 3, 5, 7, 9, 11, 13, 15),\n which is left keypoints in COCO-17p.\n right_kp (tuple[int]): Indexes of right keypoints, which is used when\n flipping heatmaps. Default: (2, 4, 6, 8, 10, 12, 14, 16),\n which is right keypoints in COCO-17p.\n \"\"\"\n def __init__(self,\n sigma=0.6,\n use_score=True,\n with_kp=True,\n with_limb=False,\n skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7),\n (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13),\n (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)),\n double=False,\n left_kp=(1, 3, 5, 7, 9, 11, 13, 15),\n right_kp=(2, 4, 6, 8, 10, 12, 14, 16)):\n self.sigma = sigma\n self.use_score = use_score\n self.with_kp = with_kp", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1295-1318" + }, + "5505": { + "file_id": 450, + "content": "The function initializes skeleton parameters such as sigma, use_score, with_kp, with_limb, skeletons, double, left_kp, and right_kp. It sets default values for these parameters to be used in the skeleton pipeline process.", + "type": "comment" + }, + "5506": { + "file_id": 450, + "content": " self.with_limb = with_limb\n self.double = double\n # an auxiliary const\n self.eps = 1e-4\n assert self.with_kp or self.with_limb, (\n 'At least one of \"with_limb\" '\n 'and \"with_kp\" should be set as True.')\n self.left_kp = left_kp\n self.right_kp = right_kp\n self.skeletons = skeletons\n def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values):\n \"\"\"Generate pseudo heatmap for one keypoint in one frame.\n Args:\n img_h (int): The height of the heatmap.\n img_w (int): The width of the heatmap.\n centers (np.ndarray): The coordinates of corresponding keypoints\n (of multiple persons).\n sigma (float): The sigma of generated gaussian.\n max_values (np.ndarray): The max values of each keypoint.\n Returns:\n np.ndarray: The generated pseudo heatmap.\n \"\"\"\n heatmap = np.zeros([img_h, img_w], dtype=np.float32)\n for center, max_value in zip(centers, max_values):", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1319-1349" + }, + "5507": { + "file_id": 450, + "content": "This code is part of the SkeletonPipeline class, which appears to be related to skeleton detection or tracking. The class takes in parameters such as with_limb, double, eps, left_kp, right_kp, and skeletons. It generates a heatmap for one keypoint in one frame using the generate_a_heatmap method. 
This method takes in img_h, img_w, centers, sigma, and max_values as parameters to create a pseudo heatmap with a zero initial state, iterates through each center-max_value pair, and fills the heatmap accordingly.", + "type": "comment" + }, + "5508": { + "file_id": 450, + "content": " mu_x, mu_y = center[0], center[1]\n if max_value < self.eps:\n continue\n st_x = max(int(mu_x - 3 * sigma), 0)\n ed_x = min(int(mu_x + 3 * sigma) + 1, img_w)\n st_y = max(int(mu_y - 3 * sigma), 0)\n ed_y = min(int(mu_y + 3 * sigma) + 1, img_h)\n x = np.arange(st_x, ed_x, 1, np.float32)\n y = np.arange(st_y, ed_y, 1, np.float32)\n # if the keypoint not in the heatmap coordinate system\n if not (len(x) and len(y)):\n continue\n y = y[:, None]\n patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2)\n patch = patch * max_value\n heatmap[st_y:ed_y, st_x:ed_x] = np.maximum(\n heatmap[st_y:ed_y, st_x:ed_x], patch)\n return heatmap\n def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma,\n start_values, end_values):\n \"\"\"Generate pseudo heatmap for one limb in one frame.", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1350-1375" + }, + "5509": { + "file_id": 450, + "content": "This function generates a heatmap for a limb in a frame by calculating the Gaussian kernel patch, and updating the heatmap with it. It checks if the keypoint is within the image boundaries before processing. The keypoint positions are calculated based on center coordinates and sigma values. If the keypoints are not within the image bounds, the function continues to the next iteration without updating the heatmap.", + "type": "comment" + }, + "5510": { + "file_id": 450, + "content": " Args:\n img_h (int): The height of the heatmap.\n img_w (int): The width of the heatmap.\n starts (np.ndarray): The coordinates of one keypoint in the\n corresponding limbs (of multiple persons).\n ends (np.ndarray): The coordinates of the other keypoint in the\n corresponding limbs (of multiple persons).\n sigma (float): The sigma of generated gaussian.\n start_values (np.ndarray): The max values of one keypoint in the\n corresponding limbs.\n end_values (np.ndarray): The max values of the other keypoint in\n the corresponding limbs.\n Returns:\n np.ndarray: The generated pseudo heatmap.\n \"\"\"\n heatmap = np.zeros([img_h, img_w], dtype=np.float32)\n for start, end, start_value, end_value in zip(starts, ends,\n start_values, end_values):\n value_coeff = min(start_value, end_value)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1377-1398" + }, + "5511": { + "file_id": 450, + "content": "This function takes in parameters such as image height, width, keypoint coordinates, and values for each limb. 
It then generates a pseudo heatmap by iterating through the inputs, calculates a value coefficient for each limb based on their start and end values, and returns a numpy array representing the generated heatmap.", + "type": "comment" + }, + "5512": { + "file_id": 450, + "content": " if value_coeff < self.eps:\n continue\n min_x, max_x = min(start[0], end[0]), max(start[0], end[0])\n min_y, max_y = min(start[1], end[1]), max(start[1], end[1])\n min_x = max(int(min_x - 3 * sigma), 0)\n max_x = min(int(max_x + 3 * sigma) + 1, img_w)\n min_y = max(int(min_y - 3 * sigma), 0)\n max_y = min(int(max_y + 3 * sigma) + 1, img_h)\n x = np.arange(min_x, max_x, 1, np.float32)\n y = np.arange(min_y, max_y, 1, np.float32)\n if not (len(x) and len(y)):\n continue\n y = y[:, None]\n x_0 = np.zeros_like(x)\n y_0 = np.zeros_like(y)\n # distance to start keypoints\n d2_start = ((x - start[0])**2 + (y - start[1])**2)\n # distance to end keypoints\n d2_end = ((x - end[0])**2 + (y - end[1])**2)\n # the distance between start and end keypoints.\n d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1399-1427" + }, + "5513": { + "file_id": 450, + "content": "This code calculates the distance between a pair of keypoints (start and end) for every pixel in the image, based on certain conditions like value_coeff and sigma. It also adjusts the x and y coordinates to avoid out-of-bounds errors. If the resulting arrays of x and y are empty, it skips processing this pair of keypoints. The calculated distances are used for further processing in the codebase.", + "type": "comment" + }, + "5514": { + "file_id": 450, + "content": " if d2_ab < 1:\n full_map = self.generate_a_heatmap(img_h, img_w, [start], sigma,\n [start_value])\n heatmap = np.maximum(heatmap, full_map)\n continue\n coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab\n a_dominate = coeff <= 0\n b_dominate = coeff >= 1\n seg_dominate = 1 - a_dominate - b_dominate\n position = np.stack([x + y_0, y + x_0], axis=-1)\n projection = start + np.stack([coeff, coeff],\n axis=-1) * (end - start)\n d2_line = position - projection\n d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2\n d2_seg = (a_dominate * d2_start + b_dominate * d2_end +\n seg_dominate * d2_line)\n patch = np.exp(-d2_seg / 2. / sigma**2)\n patch = patch * value_coeff\n heatmap[min_y:max_y, min_x:max_x] = np.maximum(\n heatmap[min_y:max_y, min_x:max_x], patch)", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1429-1453" + }, + "5515": { + "file_id": 450, + "content": "This code calculates the dominant points and updates the heatmap by applying a Gaussian kernel. It checks if a point is within the start or end of a line segment, and if not, it computes the distance between the point and the line segment. It then uses this distance to compute a weight for each dominant point (start, end) and updates the heatmap using these weights. 
This ensures that the dominant points have more influence on the heatmap than the less dominant ones.", + "type": "comment" + }, + "5516": { + "file_id": 450, + "content": " return heatmap\n def generate_heatmap(self, img_h, img_w, kps, sigma, max_values):\n \"\"\"Generate pseudo heatmap for all keypoints and limbs in one frame (if\n needed).\n Args:\n img_h (int): The height of the heatmap.\n img_w (int): The width of the heatmap.\n kps (np.ndarray): The coordinates of keypoints in this frame.\n sigma (float): The sigma of generated gaussian.\n max_values (np.ndarray): The confidence score of each keypoint.\n Returns:\n np.ndarray: The generated pseudo heatmap.\n \"\"\"\n heatmaps = []\n if self.with_kp:\n num_kp = kps.shape[1]\n for i in range(num_kp):\n heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i],\n sigma, max_values[:, i])\n heatmaps.append(heatmap)\n if self.with_limb:\n for limb in self.skeletons:\n start_idx, end_idx = limb\n starts = kps[:, start_idx]", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1455-1483" + }, + "5517": { + "file_id": 450, + "content": "This code generates a heatmap for all keypoints and limbs in one frame. It takes the height, width, coordinates of keypoints, sigma value, and confidence scores as input. The function iterates over each keypoint to generate separate heatmaps if 'with_kp' is enabled, then appends these individual heatmaps to a list called 'heatmaps'. If 'with_limb' is also enabled, the code generates additional heatmaps for each limb defined in 'self.skeletons' by iterating over them and finding corresponding start and end indices.", + "type": "comment" + }, + "5518": { + "file_id": 450, + "content": " ends = kps[:, end_idx]\n start_values = max_values[:, start_idx]\n end_values = max_values[:, end_idx]\n heatmap = self.generate_a_limb_heatmap(\n img_h, img_w, starts, ends, sigma, start_values, end_values)\n heatmaps.append(heatmap)\n return np.stack(heatmaps, axis=-1)\n def gen_an_aug(self, results):\n \"\"\"Generate pseudo heatmaps for all frames.\n Args:\n results (dict): The dictionary that contains all info of a sample.\n Returns:\n list[np.ndarray]: The generated pseudo heatmaps.\n \"\"\"\n all_kps = results['keypoint']\n kp_shape = all_kps.shape\n if 'keypoint_score' in results:\n all_kpscores = results['keypoint_score']\n else:\n all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32)\n img_h, img_w = results['img_shape']\n num_frame = kp_shape[1]\n imgs = []\n for i in range(num_frame):\n sigma = self.sigma", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1484-1517" + }, + "5519": { + "file_id": 450, + "content": "The code defines a function that generates pseudo heatmaps for all frames in an image sequence. It extracts keypoint coordinates and scores from the input results, and then creates heatmaps by calling another function to generate limb heatmaps for each frame. 
These heatmaps are appended into a list and finally stacked along a specific axis to form the output.", + "type": "comment" + }, + "5520": { + "file_id": 450, + "content": " kps = all_kps[:, i]\n kpscores = all_kpscores[:, i]\n max_values = np.ones(kpscores.shape, dtype=np.float32)\n if self.use_score:\n max_values = kpscores\n hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values)\n imgs.append(hmap)\n return imgs\n def __call__(self, results):\n if not self.double:\n results['imgs'] = np.stack(self.gen_an_aug(results))\n else:\n results_ = cp.deepcopy(results)\n flip = Flip_V2(\n flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp)\n results_ = flip(results_)\n results['imgs'] = np.concatenate(\n [self.gen_an_aug(results),\n self.gen_an_aug(results_)])\n results['label'] = np.array([results['label']])\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'sigma={self.sigma}, '\n f'use_score={self.use_score}, '", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1518-1547" + }, + "5521": { + "file_id": 450, + "content": "This code defines a class that generates heatmaps from keypoints and applies data augmentation to images. It takes in results as input, generates image heatmaps based on the keypoints, and optionally doubles the output by applying horizontal flipping with a specified left and right keypoint. The sigma value is used for Gaussian filtering, and the use_score flag determines whether scores are considered for generating heatmaps.", + "type": "comment" + }, + "5522": { + "file_id": 450, + "content": " f'with_kp={self.with_kp}, '\n f'with_limb={self.with_limb}, '\n f'skeletons={self.skeletons}, '\n f'double={self.double}, '\n f'left_kp={self.left_kp}, '\n f'right_kp={self.right_kp})')\n return repr_str", + "type": "code", + "location": "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1548-1554" + }, + "5523": { + "file_id": 450, + "content": "This code is formatting a string that represents the parameters for a skeleton pipeline. The parameters include whether to output keypoints and limbs, the number of skeletons, and left/right keypoint options.", + "type": "comment" + }, + "5524": { + "file_id": 451, + "content": "/paddlevideo/loader/registry.py", + "type": "filepath" + }, + "5525": { + "file_id": 451, + "content": "This code snippet is a part of the PaddleVideo framework and contains copyright information, license details, and registry definitions for pipelines and datasets. It defines two registries, \"pipeline\" and \"datasets\", using the Registry class from the utils module, allowing the creation and management of custom pipeline and dataset classes.", + "type": "summary" + }, + "5526": { + "file_id": 451, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nPIPELINES = Registry(\"pipeline\")\nDATASETS = Registry(\"datasets\")", + "type": "code", + "location": "/paddlevideo/loader/registry.py:1-18" + }, + "5527": { + "file_id": 451, + "content": "This code snippet is a part of the PaddleVideo framework and contains copyright information, license details, and registry definitions for pipelines and datasets. It defines two registries, \"pipeline\" and \"datasets\", using the Registry class from the utils module, allowing the creation and management of custom pipeline and dataset classes.", + "type": "comment" + }, + "5528": { + "file_id": 452, + "content": "/paddlevideo/metrics/ActivityNet/__init__.py", + "type": "filepath" + }, + "5529": { + "file_id": 452, + "content": "The code imports the ANETproposal class from the \"anet_prop\" module and adds it to the __all__ list, making it a public API in the package. This allows other modules to import and use this class directly.", + "type": "summary" + }, + "5530": { + "file_id": 452, + "content": "from .anet_prop import ANETproposal\n__all__ = ['ANETproposal']", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/__init__.py:1-3" + }, + "5531": { + "file_id": 452, + "content": "The code imports the ANETproposal class from the \"anet_prop\" module and adds it to the __all__ list, making it a public API in the package. This allows other modules to import and use this class directly.", + "type": "comment" + }, + "5532": { + "file_id": 453, + "content": "/paddlevideo/metrics/ActivityNet/anet_prop.py", + "type": "filepath" + }, + "5533": { + "file_id": 453, + "content": "This code imports libraries, defines a class for metrics calculation, retrieves data from ActivityNet API, compares results, creates DataFrames, evaluates proposals using AUC-RC, and calculates average recall. It extracts videos, computes proposal scores, IOU scores, handles exceptions, determines recall with thresholds, and efficiently computes IoU for target and candidate segments.", + "type": "summary" + }, + "5534": { + "file_id": 453, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport json\nimport numpy as np\nimport pandas as pd\nimport urllib.request as urllib2\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\nclass ANETproposal(object):\n \"\"\"\n This class is used for calculating AR@N and AUC;\n Code transfer from ActivityNet Gitub repository](https://github.com/activitynet/ActivityNet.git)\n \"\"\"\n GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']\n PROPOSAL_FIELDS = ['results', 'version', 'external_data']\n API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py'", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:1-29" + }, + "5535": { + "file_id": 453, + "content": "This code imports necessary libraries, defines a class for calculating AR@N and AUC, and sets the API URL for accessing ActivityNet data. The class uses ground truth fields and proposal fields to compare results. Code is transferred from the ActivityNet GitHub repository.", + "type": "comment" + }, + "5536": { + "file_id": 453, + "content": " def __init__(self,\n ground_truth_filename=None,\n proposal_filename=None,\n ground_truth_fields=GROUND_TRUTH_FIELDS,\n proposal_fields=PROPOSAL_FIELDS,\n tiou_thresholds=np.linspace(0.5, 0.95, 10),\n max_avg_nr_proposals=None,\n subset='validation',\n verbose=False,\n check_status=True):\n if not ground_truth_filename:\n raise IOError('Please input a valid ground truth file.')\n if not proposal_filename:\n raise IOError('Please input a valid proposal file.')\n self.subset = subset\n self.tiou_thresholds = tiou_thresholds\n self.max_avg_nr_proposals = max_avg_nr_proposals\n self.verbose = verbose\n self.gt_fields = ground_truth_fields\n self.pred_fields = proposal_fields\n self.recall = None\n self.avg_recall = None\n self.proposals_per_video = None\n self.check_status = check_status", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:31-54" + }, + "5537": { + "file_id": 453, + "content": "Initializing the class with ground truth and proposal filenames as required, setting default parameters, and checking if both files exist.", + "type": "comment" + }, + "5538": { + "file_id": 453, + "content": " # Retrieve blocked videos from server.\n if self.check_status:\n self.blocked_videos = self.get_blocked_videos()\n else:\n self.blocked_videos = list()\n # Import ground truth and proposals.\n self.ground_truth, self.activity_index = self._import_ground_truth(\n ground_truth_filename)\n self.proposal = self._import_proposal(proposal_filename)\n if self.verbose:\n print('[INIT] Loaded annotations from {} subset.'.format(subset))\n nr_gt = len(self.ground_truth)\n print('\\tNumber of ground truth instances: {}'.format(nr_gt))\n nr_pred = len(self.proposal)\n print('\\tNumber of proposals: {}'.format(nr_pred))\n print('\\tFixed threshold for tiou score: {}'.format(\n self.tiou_thresholds))\n def _import_ground_truth(self, ground_truth_filename):\n \"\"\"\n Reads ground truth file, checks if it is well formatted, and returns\n the ground truth instances and the activity 
classes.", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:55-77" + }, + "5539": { + "file_id": 453, + "content": "This code retrieves blocked videos from a server, imports ground truth and proposals, and checks if the ground truth file is well formatted. It also prints information about the number of ground truth instances and proposals, as well as the fixed threshold for tiou score.", + "type": "comment" + }, + "5540": { + "file_id": 453, + "content": " Parameters:\n ground_truth_filename (str): full path to the ground truth json file.\n Returns:\n ground_truth (df): Data frame containing the ground truth instances.\n activity_index (dict): Dictionary containing class index.\n \"\"\"\n with open(ground_truth_filename, 'r') as fobj:\n data = json.load(fobj)\n # Checking format\n if not all([field in data.keys() for field in self.gt_fields]):\n raise IOError('Please input a valid ground truth file.')\n # Read ground truth data.\n activity_index, cidx = {}, 0\n video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], []\n for videoid, v in data['database'].items():\n if self.subset != v['subset']:\n continue\n if videoid in self.blocked_videos:\n continue\n for ann in v['annotations']:\n if ann['label'] not in activity_index:\n activity_index[ann['label']] = cidx\n cidx += 1", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:79-102" + }, + "5541": { + "file_id": 453, + "content": "This function reads a ground truth JSON file and returns a DataFrame containing the instances. It also returns a dictionary of class indices. The function checks if the input file has the required fields, skips videos not in the specified subset, and ignores blocked videos. If an activity label is not found in the activity_index, it adds it to the index and increments the counter.", + "type": "comment" + }, + "5542": { + "file_id": 453, + "content": " video_lst.append(videoid)\n t_start_lst.append(float(ann['segment'][0]))\n t_end_lst.append(float(ann['segment'][1]))\n label_lst.append(activity_index[ann['label']])\n ground_truth = pd.DataFrame({\n 'video-id': video_lst,\n 't-start': t_start_lst,\n 't-end': t_end_lst,\n 'label': label_lst\n })\n return ground_truth, activity_index\n def _import_proposal(self, proposal_filename):\n \"\"\"\n Reads proposal file, checks if it is well formatted, and returns\n the proposal instances.\n Parameters:\n proposal_filename (str): Full path to the proposal json file.\n Returns:\n proposal (df): Data frame containing the proposal instances.\n \"\"\"\n with open(proposal_filename, 'r') as fobj:\n data = json.load(fobj)\n # Checking format...\n if not all([field in data.keys() for field in self.pred_fields]):\n raise IOError('Please input a valid proposal file.')", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:103-130" + }, + "5543": { + "file_id": 453, + "content": "The code reads a proposal file, checks its format and returns proposal instances in the form of a data frame. It also generates ground truth data by appending video IDs, start and end times, and labels to lists before creating a DataFrame. 
The function takes a string as input for the full path to the proposal JSON file and returns a data frame containing the proposal instances.", + "type": "comment" + }, + "5544": { + "file_id": 453, + "content": " # Read predictions.\n video_lst, t_start_lst, t_end_lst = [], [], []\n score_lst = []\n for videoid, v in data['results'].items():\n if videoid in self.blocked_videos:\n continue\n for result in v:\n video_lst.append(videoid)\n t_start_lst.append(float(result['segment'][0]))\n t_end_lst.append(float(result['segment'][1]))\n score_lst.append(result['score'])\n proposal = pd.DataFrame({\n 'video-id': video_lst,\n 't-start': t_start_lst,\n 't-end': t_end_lst,\n 'score': score_lst\n })\n return proposal\n def evaluate(self):\n \"\"\"\n Evaluates a proposal file. To measure the performance of a\n method for the proposal task, we computes the area under the\n average recall vs average number of proposals per video curve.\n \"\"\"\n recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals(\n self.ground_truth,", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:132-158" + }, + "5545": { + "file_id": 453, + "content": "The code reads predictions from a data source, extracts relevant information (video IDs, start and end timestamps, scores), stores them in a DataFrame, and defines two functions: one for evaluating proposal files by computing area under the average recall vs average number of proposals per video curve. The evaluation function calls another function to compute this metric using ground truth data and the stored proposal data.", + "type": "comment" + }, + "5546": { + "file_id": 453, + "content": " self.proposal,\n max_avg_nr_proposals=self.max_avg_nr_proposals,\n tiou_thresholds=self.tiou_thresholds)\n area_under_curve = np.trapz(avg_recall, proposals_per_video)\n if self.verbose:\n print('[RESULTS] Performance on ActivityNet proposal task.')\n with open(\"data/bmn/BMN_Test_results/auc_result.txt\",\n \"a\") as text_file:\n text_file.write(\n '\\tArea Under the AR vs AN curve: {}% \\n'.format(\n 100. * float(area_under_curve) /\n proposals_per_video[-1]))\n print('\\tArea Under the AR vs AN curve: {}%'.format(\n 100. 
* float(area_under_curve) / proposals_per_video[-1]))\n self.recall = recall\n self.avg_recall = avg_recall\n self.proposals_per_video = proposals_per_video\n def average_recall_vs_avg_nr_proposals(self,\n ground_truth,\n proposals,", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:159-182" + }, + "5547": { + "file_id": 453, + "content": "Calculates the area under the curve of recall vs average number of proposals for ActivityNet proposal task, writes result to file and stores recall, average recall, and proposals per video in class attributes.", + "type": "comment" + }, + "5548": { + "file_id": 453, + "content": " max_avg_nr_proposals=None,\n tiou_thresholds=np.linspace(\n 0.5, 0.95, 10)):\n \"\"\"\n Computes the average recall given an average number of\n proposals per video.\n Parameters:\n ground_truth(df): Data frame containing the ground truth instances.\n Required fields: ['video-id', 't-start', 't-end']\n proposal(df): Data frame containing the proposal instances.\n Required fields: ['video-id, 't-start', 't-end', 'score']\n tiou_thresholds(1d-array | optional): array with tiou thresholds.\n Returns:\n recall(2d-array): recall[i,j] is recall at ith tiou threshold at the jth\n average number of average number of proposals per video.\n average_recall(1d-array): recall averaged over a list of tiou threshold.\n This is equivalent to recall.mean(axis=0).\n proposals_per_video(1d-array): average number of proposals per video.", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:183-202" + }, + "5549": { + "file_id": 453, + "content": "This code defines a function that computes average recall for given average number of proposals per video. It takes ground truth and proposal data frames as input, along with optional tiou_thresholds. 
It returns recall and average_recall arrays.", + "type": "comment" + }, + "5550": { + "file_id": 453, + "content": " \"\"\"\n # Get list of videos.\n video_lst = ground_truth['video-id'].unique()\n if not max_avg_nr_proposals:\n max_avg_nr_proposals = float(\n proposals.shape[0]) / video_lst.shape[0]\n ratio = max_avg_nr_proposals * float(\n video_lst.shape[0]) / proposals.shape[0]\n # Adaptation to query faster\n ground_truth_gbvn = ground_truth.groupby('video-id')\n proposals_gbvn = proposals.groupby('video-id')\n # For each video, computes tiou scores among the retrieved proposals.\n score_lst = []\n total_nr_proposals = 0\n for videoid in video_lst:\n # Get ground-truth instances associated to this video.\n ground_truth_videoid = ground_truth_gbvn.get_group(videoid)\n this_video_ground_truth = ground_truth_videoid.loc[:, [\n 't-start', 't-end'\n ]].values\n # Get proposals for this video.\n try:\n proposals_videoid = proposals_gbvn.get_group(videoid)", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:203-231" + }, + "5551": { + "file_id": 453, + "content": "This code retrieves a list of videos, calculates the maximum average number of proposals per video, groups proposals and ground truth by video ID, and then computes Tiou scores between ground-truth instances and retrieved proposals for each video.", + "type": "comment" + }, + "5552": { + "file_id": 453, + "content": " except:\n n = this_video_ground_truth.shape[0]\n score_lst.append(np.zeros((n, 1)))\n continue\n this_video_proposals = proposals_videoid.loc[:,\n ['t-start', 't-end'\n ]].values\n if this_video_proposals.shape[0] == 0:\n n = this_video_ground_truth.shape[0]\n score_lst.append(np.zeros((n, 1)))\n continue\n # Sort proposals by score.\n sort_idx = proposals_videoid['score'].argsort()[::-1]\n this_video_proposals = this_video_proposals[sort_idx, :]\n if this_video_proposals.ndim != 2:\n this_video_proposals = np.expand_dims(this_video_proposals,\n axis=0)\n if this_video_ground_truth.ndim != 2:\n this_video_ground_truth = np.expand_dims(\n this_video_ground_truth, axis=0)", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:232-255" + }, + "5553": { + "file_id": 453, + "content": "This code block is part of a function that handles exceptions when dealing with video proposals and ground truth. It appends a zero matrix to the score list if there are no video proposals or ground truth data for the current video. If there are proposals, it sorts them by score in descending order and expands dimensions as necessary before proceeding.", + "type": "comment" + }, + "5554": { + "file_id": 453, + "content": " nr_proposals = np.minimum(\n int(this_video_proposals.shape[0] * ratio),\n this_video_proposals.shape[0])\n total_nr_proposals += nr_proposals\n this_video_proposals = this_video_proposals[:nr_proposals, :]\n # Compute tiou scores.\n tiou = self.wrapper_segment_iou(this_video_proposals,\n this_video_ground_truth)\n score_lst.append(tiou)\n # Given that the length of the videos is really varied, we\n # compute the number of proposals in terms of a ratio of the total\n # proposals retrieved, i.e. 
average recall at a percentage of proposals\n # retrieved per video.\n # Computes average recall.\n pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float(\n video_lst.shape[0]) / total_nr_proposals)\n matches = np.empty((video_lst.shape[0], pcn_lst.shape[0]))\n positives = np.empty(video_lst.shape[0])\n recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0]))", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:257-278" + }, + "5555": { + "file_id": 453, + "content": "This code calculates average recall for a set of video proposals. It sets the number of proposals based on a ratio, computes IOU scores, and stores the results in lists. The average recall is computed using a predetermined maximum number of proposals and the total number of proposals retrieved, considering the variable length of videos.", + "type": "comment" + }, + "5556": { + "file_id": 453, + "content": " # Iterates over each tiou threshold.\n for ridx, tiou in enumerate(tiou_thresholds):\n # Inspect positives retrieved per video at different\n # number of proposals (percentage of the total retrieved).\n for i, score in enumerate(score_lst):\n # Total positives per video.\n positives[i] = score.shape[0]\n # Find proposals that satisfies minimum tiou threshold.\n true_positives_tiou = score >= tiou\n # Get number of proposals as a percentage of total retrieved.\n pcn_proposals = np.minimum(\n (score.shape[1] * pcn_lst).astype(int), score.shape[1])\n for j, nr_proposals in enumerate(pcn_proposals):\n # Compute the number of matches for each percentage of the proposals\n matches[i, j] = np.count_nonzero(\n (true_positives_tiou[:, :nr_proposals]).sum(axis=1))\n # Computes recall given the set of matches per video.", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:279-298" + }, + "5557": { + "file_id": 453, + "content": "Code iterates over different tiou thresholds and positive scores, computing the number of true positives based on threshold and percentage of proposals. It calculates matches per video and computes recall for each set of matches.", + "type": "comment" + }, + "5558": { + "file_id": 453, + "content": " recall[ridx, :] = matches.sum(axis=0) / positives.sum()\n # Recall is averaged.\n avg_recall = recall.mean(axis=0)\n # Get the average number of proposals per video.\n proposals_per_video = pcn_lst * (float(total_nr_proposals) /\n video_lst.shape[0])\n return recall, avg_recall, proposals_per_video\n def get_blocked_videos(self, api=API):\n api_url = '{}?action=get_blocked'.format(api)\n req = urllib2.Request(api_url)\n response = urllib2.urlopen(req)\n return json.loads(response.read())\n def wrapper_segment_iou(self, target_segments, candidate_segments):\n \"\"\"\n Compute intersection over union btw segments\n Parameters:\n target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]]\n candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]]\n Returns:\n tiou(nd-array): 2-dim array [n x m] with IOU ratio.\n Note: It assumes that candidate-segments are more scarce that target-segments", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:299-324" + }, + "5559": { + "file_id": 453, + "content": "The function calculates recall and average recall for detected objects in videos, based on the number of true positives and total proposals. It also returns the average number of proposals per video. The second function retrieves a list of blocked videos from an API. 
The third function computes intersection over union between target and candidate segments efficiently.", + "type": "comment" + }, + "5560": { + "file_id": 453, + "content": " \"\"\"\n if candidate_segments.ndim != 2 or target_segments.ndim != 2:\n raise ValueError('Dimension of arguments is incorrect')\n n, m = candidate_segments.shape[0], target_segments.shape[0]\n tiou = np.empty((n, m))\n for i in range(m):\n tiou[:, i] = self.segment_iou(target_segments[i, :],\n candidate_segments)\n return tiou\n def segment_iou(self, target_segment, candidate_segments):\n \"\"\"\n Compute the temporal intersection over union between a\n target segment and all the test segments.\n Parameters:\n target_segment(1d-array): Temporal target segment containing [starting, ending] times.\n candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times.\n Returns:\n tiou(1d-array): Temporal intersection over union score of the N's candidate segments.\n \"\"\"\n tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:325-349" + }, + "5561": { + "file_id": 453, + "content": "This function calculates the temporal intersection over union (TIOU) between a target segment and multiple candidate segments. If the dimensions of arguments are not 2, it raises a ValueError. It loops through each candidate segment, compares their starting and ending times with the target segment's times using np.maximum, and stores the TIOU results in a 2D array.", + "type": "comment" + }, + "5562": { + "file_id": 453, + "content": " tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])\n # Intersection including Non-negative overlap score.\n segments_intersection = (tt2 - tt1).clip(0)\n # Segment union.\n segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \\\n + (target_segment[1] - target_segment[0]) - segments_intersection\n # Compute overlap as the ratio of the intersection\n # over union of two segments.\n tIoU = segments_intersection.astype(float) / segments_union\n return tIoU", + "type": "code", + "location": "/paddlevideo/metrics/ActivityNet/anet_prop.py:350-359" + }, + "5563": { + "file_id": 453, + "content": "Computes intersection over union (IoU) of two segments by finding the minimum endpoints, calculating intersection and union, and dividing intersection by union.", + "type": "comment" + }, + "5564": { + "file_id": 454, + "content": "/paddlevideo/metrics/__init__.py", + "type": "filepath" + }, + "5565": { + "file_id": 454, + "content": "The code imports various metrics from different modules for video analysis and evaluation, including AVAMetric, VOSMetric, BMNMetric, MSRVTTMetric, SkeletonMetric, TransNetV2Metric, DepthMetric, CenterCropMetric, MultiCropMetric, HitOneMetric, and SegmentationMetric. It also imports the METRIC registry for managing these metrics.", + "type": "summary" + }, + "5566": { + "file_id": 454, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .bmn_metric import BMNMetric\nfrom .build import build_metric\nfrom .center_crop_metric import CenterCropMetric\nfrom .depth_metric import DepthMetric\nfrom .msrvtt_metric import MSRVTTMetric\nfrom .multi_crop_metric import MultiCropMetric\nfrom .registry import METRIC\nfrom .skeleton_metric import SkeletonMetric\nfrom .transnetv2_metric import TransNetV2Metric\nfrom .youtube8m.eval_util import HitOneMetric\nfrom .segmentation_metric import SegmentationMetric", + "type": "code", + "location": "/paddlevideo/metrics/__init__.py:1-25" + }, + "5567": { + "file_id": 454, + "content": "The code is importing various metrics from different metric classes for video analysis and evaluation. It includes BMNMetric, MSRVTTMetric, SkeletonMetric, TransNetV2Metric, DepthMetric, CenterCropMetric, MultiCropMetric, HitOneMetric, and SegmentationMetric. The METRIC registry is also imported for managing these metrics.", + "type": "comment" + }, + "5568": { + "file_id": 454, + "content": "from .ava_metric import AVAMetric\nfrom .vos_metric import VOSMetric\nfrom .center_crop_metric_MRI import CenterCropMetric_MRI\nfrom .yowo_metric import YOWOMetric\n__all__ = [\n 'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric',\n 'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric',\n 'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI','AVAMetric',\n 'SegmentationMetric', 'YOWOMetric'\n]", + "type": "code", + "location": "/paddlevideo/metrics/__init__.py:26-36" + }, + "5569": { + "file_id": 454, + "content": "This code imports various metrics from different modules and adds them to the __all__ list for easy access, including AVAMetric, VOSMetric, CenterCropMetric_MRI, YOWOMetric, METRIC, build_metric, MultiCropMetric, BMNMetric, CenterCropMetric, SkeletonMetric, HitOneMetric, TransNetV2Metric, DepthMetric, MSRVTTMetric.", + "type": "comment" + }, + "5570": { + "file_id": 455, + "content": "/paddlevideo/metrics/ava_evaluation/README.md", + "type": "filepath" + }, + "5571": { + "file_id": 455, + "content": "This code is from the ActivityNet repo and has been modified to reduce length, possibly for efficiency or better readability. It uses metrics and evaluation methods for action recognition tasks, likely in video analysis applications.", + "type": "summary" + }, + "5572": { + "file_id": 455, + "content": "The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet).\nSome unused codes are removed to minimize the length of codes added.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/README.md:1-2" + }, + "5573": { + "file_id": 455, + "content": "This code is from the ActivityNet repo and has been modified to reduce length, possibly for efficiency or better readability. 
It uses metrics and evaluation methods for action recognition tasks, likely in video analysis applications.", + "type": "comment" + }, + "5574": { + "file_id": 456, + "content": "/paddlevideo/metrics/ava_evaluation/metrics.py", + "type": "filepath" + }, + "5575": { + "file_id": 456, + "content": "This function calculates precision and recall metrics from scores, labels, and ground truth instances, raising ValueError for incorrect inputs. It computes average precision using valid precision and recall arrays and calculates CorLoc performance metrics for object detection with given ground truth and detected images per class.", + "type": "summary" + }, + "5576": { + "file_id": 456, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Functions for computing metrics like precision, recall, CorLoc and etc.\"\"\"\nimport numpy as np\ndef compute_precision_recall(scores, labels, num_gt):\n \"\"\"Compute precision and recall.\n Args:\n scores: A float numpy array representing detection score\n labels: A boolean numpy array representing true/false positive labels\n num_gt: Number of ground truth instances\n Raises:\n ValueError: if the input is not of the correct format", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/metrics.py:1-30" + }, + "5577": { + "file_id": 456, + "content": "This code defines a function to compute precision and recall metrics based on input scores, labels, and the number of ground truth instances. It also raises a ValueError if the input is in the incorrect format.", + "type": "comment" + }, + "5578": { + "file_id": 456, + "content": " Returns:\n precision: Fraction of positive instances over detected ones. This\n value is None if no ground truth labels are present.\n recall: Fraction of detected positive instance over all positive\n instances. This value is None if no ground truth labels are\n present.\n \"\"\"\n if (not isinstance(labels, np.ndarray) or labels.dtype != np.bool\n or len(labels.shape) != 1):\n raise ValueError('labels must be single dimension bool numpy array')\n if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:\n raise ValueError('scores must be single dimension numpy array')\n if num_gt < np.sum(labels):\n raise ValueError(\n 'Number of true positives must be smaller than num_gt.')\n if len(scores) != len(labels):\n raise ValueError('scores and labels must be of the same size.')\n if num_gt == 0:\n return None, None\n sorted_indices = np.argsort(scores)\n sorted_indices = sorted_indices[::-1]\n labels = labels.astype(int)", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/metrics.py:32-58" + }, + "5579": { + "file_id": 456, + "content": "This code checks if input 'labels' and 'scores' are valid arrays. 
It verifies that 'labels' is a one-dimensional boolean numpy array, 'scores' is a one-dimensional numpy array, the number of true positives is less than num_gt (number of ground truth labels), and the lengths of 'scores' and 'labels' are equal. If any conditions are not met, it raises a ValueError with an appropriate error message. If all checks pass and there are no ground truth labels, the function returns None for both precision and recall.", + "type": "comment" + }, + "5580": { + "file_id": 456, + "content": " true_positive_labels = labels[sorted_indices]\n false_positive_labels = 1 - true_positive_labels\n cum_true_positives = np.cumsum(true_positive_labels)\n cum_false_positives = np.cumsum(false_positive_labels)\n precision = cum_true_positives.astype(float) / (\n cum_true_positives + cum_false_positives)\n recall = cum_true_positives.astype(float) / num_gt\n return precision, recall\ndef compute_average_precision(precision, recall):\n \"\"\"Compute Average Precision according to the definition in VOCdevkit.\n Precision is modified to ensure that it does not decrease as recall\n decrease.\n Args:\n precision: A float [N, 1] numpy array of precisions\n recall: A float [N, 1] numpy array of recalls\n Raises:\n ValueError: if the input is not of the correct format\n Returns:\n average_precison: The area under the precision recall curve. NaN if\n precision and recall are None.\n \"\"\"\n if precision is None:\n if recall is not None:\n raise ValueError('If precision is None, recall must also be None')", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/metrics.py:59-88" + }, + "5581": { + "file_id": 456, + "content": "Computes precision and recall from sorted labels, returns both values.\nDefines a function to compute average precision using precision and recall arrays.", + "type": "comment" + }, + "5582": { + "file_id": 456, + "content": " return np.NAN\n if not isinstance(precision, np.ndarray) or not isinstance(\n recall, np.ndarray):\n raise ValueError('precision and recall must be numpy array')\n if precision.dtype != np.float or recall.dtype != np.float:\n raise ValueError('input must be float numpy array.')\n if len(precision) != len(recall):\n raise ValueError('precision and recall must be of the same size.')\n if not precision.size:\n return 0.0\n if np.amin(precision) < 0 or np.amax(precision) > 1:\n raise ValueError('Precision must be in the range of [0, 1].')\n if np.amin(recall) < 0 or np.amax(recall) > 1:\n raise ValueError('recall must be in the range of [0, 1].')\n if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)):\n raise ValueError('recall must be a non-decreasing array')\n recall = np.concatenate([[0], recall, [1]])\n precision = np.concatenate([[0], precision, [0]])\n # Preprocess precision to be a non-decreasing array\n for i in range(len(precision) - 2, -1, -1):", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/metrics.py:89-111" + }, + "5583": { + "file_id": 456, + "content": "This function checks the data types and ranges of precision and recall arrays, ensuring they are numpy float arrays within the range [0,1] and have the same size. 
If all conditions pass, it then concatenates recall and precision arrays with 0 and 1 at the end respectively before preprocessing precision to be a non-decreasing array.", + "type": "comment" + }, + "5584": { + "file_id": 456, + "content": " precision[i] = np.maximum(precision[i], precision[i + 1])\n indices = np.where(recall[1:] != recall[:-1])[0] + 1\n average_precision = np.sum(\n (recall[indices] - recall[indices - 1]) * precision[indices])\n return average_precision\ndef compute_cor_loc(num_gt_imgs_per_class,\n num_images_correctly_detected_per_class):\n \"\"\"Compute CorLoc according to the definition in the following paper.\n https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf\n Returns nans if there are no ground truth images for a class.\n Args:\n num_gt_imgs_per_class: 1D array, representing number of images\n containing at least one object instance of a particular class\n num_images_correctly_detected_per_class: 1D array, representing number\n of images that are correctly detected at least one object instance\n of a particular class\n Returns:\n corloc_per_class: A float numpy array represents the corloc score of\n each class", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/metrics.py:112-137" + }, + "5585": { + "file_id": 456, + "content": "This code computes the average precision and CorLoc, which is a metric used to evaluate object detection performance. It takes in arrays of ground truth images per class and correctly detected images per class. The average precision function calculates the average precision by comparing recall values, while the compute_cor_loc function calculates the CorLoc score for each class based on these inputs. If there are no ground truth images for a class, it returns NaN.", + "type": "comment" + }, + "5586": { + "file_id": 456, + "content": " \"\"\"\n # Divide by zero expected for classes with no gt examples.\n with np.errstate(divide='ignore', invalid='ignore'):\n return np.where(\n num_gt_imgs_per_class == 0, np.nan,\n num_images_correctly_detected_per_class / num_gt_imgs_per_class)", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/metrics.py:138-143" + }, + "5587": { + "file_id": 456, + "content": "Divides the number of images correctly detected by the number of ground truth images per class, ignoring division by zero for classes with no examples.", + "type": "comment" + }, + "5588": { + "file_id": 457, + "content": "/paddlevideo/metrics/ava_evaluation/np_box_list.py", + "type": "filepath" + }, + "5589": { + "file_id": 457, + "content": "The \"BoxList\" class manages bounding boxes, and the _is_valid_boxes function checks if data array of shape [N, 4] representing box coordinates adheres to the correct format. The function returns a boolean indicating whether all ymax are greater than or equal to ymin, all xmax are greater than or equal to xmin, and the data is not empty.", + "type": "summary" + }, + "5590": { + "file_id": 457, + "content": "# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"Numpy BoxList classes and functions.\"\"\"\nimport numpy as np\nclass BoxList:\n \"\"\"Box collection.\n BoxList represents a list of bounding boxes as numpy array, where each\n bounding box is represented as a row of 4 numbers,\n [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within\n a given list correspond to a single image.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:1-26" + }, + "5591": { + "file_id": 457, + "content": "The code defines a class called \"BoxList\" that represents a collection of bounding boxes as a numpy array. Each box is represented by 4 numbers: y_min, x_min, y_max, and x_max. It assumes all boxes in the list correspond to a single image.", + "type": "comment" + }, + "5592": { + "file_id": 457, + "content": " Optionally, users can add additional related fields (such as\n objectness/classification scores).\n \"\"\"\n def __init__(self, data):\n \"\"\"Constructs box collection.\n Args:\n data: a numpy array of shape [N, 4] representing box coordinates\n Raises:\n ValueError: if bbox data is not a numpy array\n ValueError: if invalid dimensions for bbox data\n \"\"\"\n if not isinstance(data, np.ndarray):\n raise ValueError('data must be a numpy array.')\n if len(data.shape) != 2 or data.shape[1] != 4:\n raise ValueError('Invalid dimensions for box data.')\n if data.dtype != np.float32 and data.dtype != np.float64:\n raise ValueError(\n 'Invalid data type for box data: float is required.')\n if not self._is_valid_boxes(data):\n raise ValueError('Invalid box data. data must be a numpy array of '\n 'N*[y_min, x_min, y_max, x_max]')\n self.data = {'boxes': data}", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:28-52" + }, + "5593": { + "file_id": 457, + "content": "This code defines a class for box collections, where users can optionally add additional related fields such as objectness or classification scores. The `__init__` method checks if the input data is a numpy array, has valid dimensions and data type (float), and raises a ValueError if any of these conditions are not met. It then stores the data in a dictionary with key \"boxes\".", + "type": "comment" + }, + "5594": { + "file_id": 457, + "content": " def num_boxes(self):\n \"\"\"Return number of boxes held in collections.\"\"\"\n return self.data['boxes'].shape[0]\n def get_extra_fields(self):\n \"\"\"Return all non-box fields.\"\"\"\n return [k for k in self.data if k != 'boxes']\n def has_field(self, field):\n return field in self.data\n def add_field(self, field, field_data):\n \"\"\"Add data to a specified field.\n Args:\n field: a string parameter used to speficy a related field to be\n accessed.\n field_data: a numpy array of [N, ...] 
representing the data\n associated with the field.\n Raises:\n ValueError: if the field is already exist or the dimension of the\n field data does not matches the number of boxes.\n \"\"\"\n if self.has_field(field):\n raise ValueError('Field ' + field + 'already exists')\n if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(\n ):\n raise ValueError('Invalid dimensions for field data')", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:54-81" + }, + "5595": { + "file_id": 457, + "content": "This code defines a class with methods to handle box collections. It provides functionality for counting the number of boxes, retrieving non-box fields, checking if a specific field exists, and adding data to an existing or new field while handling errors related to field existence and data dimensions.", + "type": "comment" + }, + "5596": { + "file_id": 457, + "content": " self.data[field] = field_data\n def get(self):\n \"\"\"Convenience function for accesssing box coordinates.\n Returns:\n a numpy array of shape [N, 4] representing box corners\n \"\"\"\n return self.get_field('boxes')\n def get_field(self, field):\n \"\"\"Accesses data associated with the specified field in the box\n collection.\n Args:\n field: a string parameter used to speficy a related field to be\n accessed.\n Returns:\n a numpy 1-d array representing data of an associated field\n Raises:\n ValueError: if invalid field\n \"\"\"\n if not self.has_field(field):\n raise ValueError(f'field {field} does not exist')\n return self.data[field]\n def get_coordinates(self):\n \"\"\"Get corner coordinates of boxes.\n Returns:\n a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max]\n \"\"\"\n box_coordinates = self.get()\n y_min = box_coordinates[:, 0]", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:82-117" + }, + "5597": { + "file_id": 457, + "content": "The code defines a class with methods to access box coordinates from a stored dataset. The \"get\" method returns a numpy array of shape [N, 4] representing box corners. The \"get_field\" method is used to access data related to a specific field in the box collection. If an invalid field is provided, it raises a ValueError. The \"get_coordinates\" method returns a list of 4 1-d numpy arrays containing y_min, x_min, y_max, and x_max values for each box.", + "type": "comment" + }, + "5598": { + "file_id": 457, + "content": " x_min = box_coordinates[:, 1]\n y_max = box_coordinates[:, 2]\n x_max = box_coordinates[:, 3]\n return [y_min, x_min, y_max, x_max]\n def _is_valid_boxes(self, data):\n \"\"\"Check whether data fullfills the format of N*[ymin, xmin, ymax,\n xmin].\n Args:\n data: a numpy array of shape [N, 4] representing box coordinates\n Returns:\n a boolean indicating whether all ymax of boxes are equal or greater\n than ymin, and all xmax of boxes are equal or greater than xmin.\n \"\"\"\n if len(data):\n for v in data:\n if v[0] > v[2] or v[1] > v[3]:\n return False\n return True", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_list.py:118-138" + }, + "5599": { + "file_id": 457, + "content": "This code defines a function `_is_valid_boxes` which checks if the data array of shape [N, 4] representing box coordinates fulfills the format N*[ymin, xmin, ymax, xmax]. It returns a boolean indicating whether all ymax of boxes are equal or greater than ymin and all xmax of boxes are equal or greater than xmin. 
The function also checks if the data is not empty.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/56.json b/docs/data/56.json new file mode 100644 index 000000000..6b24e5532 --- /dev/null +++ b/docs/data/56.json @@ -0,0 +1,549 @@ +{ + "5600": { + "file_id": 458, + "content": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py", + "type": "filepath" + }, + "5601": { + "file_id": 458, + "content": "The code defines functions for numpy array operations on bounding boxes, including area calculation and intersection-over-union scores useful in computer vision tasks. It computes pairwise IoU scores using numpy arrays by dividing the intersection by the second set's box areas.", + "type": "summary" + }, + "5602": { + "file_id": 458, + "content": "# Copyright 2017 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# ==============================================================================\n\"\"\"Operations for [N, 4] numpy arrays representing bounding boxes.\nExample box operations that are supported:\n * Areas: compute bounding box areas\n * IOU: pairwise intersection-over-union scores\n\"\"\"\nimport numpy as np\ndef area(boxes):\n \"\"\"Computes area of boxes.\n Args:\n boxes: Numpy array with shape [N, 4] holding N boxes", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:1-29" + }, + "5603": { + "file_id": 458, + "content": "The code defines functions for performing operations on numpy arrays of bounding boxes. It includes functionality to compute areas and intersection-over-union scores between pairs of boxes. The array holds N boxes and is expected to have shape [N, 4]. 
These operations are useful in computer vision tasks like object detection and tracking.", + "type": "comment" + }, + "5604": { + "file_id": 458, + "content": " Returns:\n a numpy array with shape [N*1] representing box areas\n \"\"\"\n return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])\ndef intersection(boxes1, boxes2):\n \"\"\"Compute pairwise intersection areas between boxes.\n Args:\n boxes1: a numpy array with shape [N, 4] holding N boxes\n boxes2: a numpy array with shape [M, 4] holding M boxes\n Returns:\n a numpy array with shape [N*M] representing pairwise intersection area\n \"\"\"\n [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)\n [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)\n all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))\n all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))\n intersect_heights = np.maximum(\n np.zeros(all_pairs_max_ymin.shape),\n all_pairs_min_ymax - all_pairs_max_ymin)\n all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))\n all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))\n intersect_widths = np.maximum(", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:31-57" + }, + "5605": { + "file_id": 458, + "content": "Computes box areas by multiplying width and height (lines 30-34).\nCalculates pairwise intersection areas between boxes (lines 36-51).", + "type": "comment" + }, + "5606": { + "file_id": 458, + "content": " np.zeros(all_pairs_max_xmin.shape),\n all_pairs_min_xmax - all_pairs_max_xmin)\n return intersect_heights * intersect_widths\ndef iou(boxes1, boxes2):\n \"\"\"Computes pairwise intersection-over-union between box collections.\n Args:\n boxes1: a numpy array with shape [N, 4] holding N boxes.\n boxes2: a numpy array with shape [M, 4] holding N boxes.\n Returns:\n a numpy array with shape [N, M] representing pairwise iou scores.\n \"\"\"\n intersect = intersection(boxes1, boxes2)\n area1 = area(boxes1)\n area2 = area(boxes2)\n union = (\n np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) -\n intersect)\n return intersect / union\ndef ioa(boxes1, boxes2):\n \"\"\"Computes pairwise intersection-over-area between box collections.\n Intersection-over-area (ioa) between two boxes box1 and box2 is defined as\n their intersection area over box2's area. Note that ioa is not symmetric,\n that is, IOA(box1, box2) != IOA(box2, box1).\n Args:\n boxes1: a numpy array with shape [N, 4] holding N boxes.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:58-90" + }, + "5607": { + "file_id": 458, + "content": "The code defines functions for computing pairwise intersection-over-union (iou) and intersection-over-area (ioa) between box collections. The iou function takes two numpy arrays of boxes, computes their intersection using the intersection function, calculates the union by adding the areas of both boxes and subtracting the intersection, and finally returns the pairwise iou scores. 
The ioa function also takes two numpy arrays of boxes, defines intersection-over-area as the intersection area divided by box2's area, and does not consider symmetry between box1 and box2.", + "type": "comment" + }, + "5608": { + "file_id": 458, + "content": " boxes2: a numpy array with shape [M, 4] holding N boxes.\n Returns:\n a numpy array with shape [N, M] representing pairwise ioa scores.\n \"\"\"\n intersect = intersection(boxes1, boxes2)\n areas = np.expand_dims(area(boxes2), axis=0)\n return intersect / areas", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:91-98" + }, + "5609": { + "file_id": 458, + "content": "This code calculates pairwise Intersection over Union (IoU) scores between two sets of bounding boxes represented by numpy arrays. It first computes the intersection of the two sets, then calculates the area of each box in the second set, and finally divides the intersection by the areas to obtain the IoU scores.", + "type": "comment" + }, + "5610": { + "file_id": 459, + "content": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py", + "type": "filepath" + }, + "5611": { + "file_id": 459, + "content": "PaddleVideo's \"object_detection_evaluation\" module offers `ObjectDetectionEvaluator` for evaluating object detection outcomes, including metrics like mAP and mean correct localization, considering IOU threshold. It handles AVA dataset and computes AVA metrics for average precision and mean average precision.", + "type": "summary" + }, + "5612": { + "file_id": 459, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"object_detection_evaluation module.\nObjectDetectionEvaluation is a class which manages ground truth information of\na object detection dataset, and computes frequently used detection metrics such\nas Precision, Recall, CorLoc of the provided detection results.\nIt supports the following operations:\n1) Add ground truth information of images sequentially.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:1-21" + }, + "5613": { + "file_id": 459, + "content": "The provided code is a part of the \"object_detection_evaluation\" module in the PaddleVideo library. This module provides a class called \"ObjectDetectionEvaluation\" that manages ground truth information for object detection datasets, computes frequently used metrics like Precision, Recall, and CorLoc from detection results. 
The class supports adding ground truth information sequentially and various operations for evaluation.", + "type": "comment" + }, + "5614": { + "file_id": 459, + "content": "2) Add detection result of images sequentially.\n3) Evaluate detection metrics on already inserted detection results.\n4) Write evaluation result into a pickle file for future processing or\n visualization.\nNote: This module operates on numpy boxes and box lists.\n\"\"\"\nimport collections\nimport logging\nfrom abc import ABCMeta, abstractmethod\nimport numpy as np\nfrom . import metrics, per_image_evaluation, standard_fields\nclass DetectionEvaluator:\n \"\"\"Interface for object detection evalution classes.\n Example usage of the Evaluator:\n ------------------------------\n evaluator = DetectionEvaluator(categories)\n # Detections and groundtruth for image 1.\n evaluator.add_single_groundtruth_image_info(...)\n evaluator.add_single_detected_image_info(...)\n # Detections and groundtruth for image 2.\n evaluator.add_single_groundtruth_image_info(...)\n evaluator.add_single_detected_image_info(...)\n metrics_dict = evaluator.evaluate()\n \"\"\"\n __metaclass__ = ABCMeta\n def __init__(self, categories):", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:22-58" + }, + "5615": { + "file_id": 459, + "content": "This code defines an abstract class `DetectionEvaluator` for evaluating object detection results. It takes categories as input and allows adding single ground truth and detected image information. After adding all the data, it can be evaluated to get a metrics dictionary. This evaluation is done on numpy boxes and box lists.", + "type": "comment" + }, + "5616": { + "file_id": 459, + "content": " \"\"\"Constructor.\n Args:\n categories: A list of dicts, each of which has the following keys -\n 'id': (required) an integer id uniquely identifying this\n category.\n 'name': (required) string representing category name e.g.,\n 'cat', 'dog'.\n \"\"\"\n self._categories = categories\n @abstractmethod\n def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):\n \"\"\"Adds groundtruth for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n groundtruth_dict: A dictionary of groundtruth numpy arrays required\n for evaluations.\n \"\"\"\n @abstractmethod\n def add_single_detected_image_info(self, image_id, detections_dict):\n \"\"\"Adds detections for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n detections_dict: A dictionary of detection numpy arrays required", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:59-86" + }, + "5617": { + "file_id": 459, + "content": "This code defines a class constructor that takes categories as input and provides two abstract methods for adding ground truth and detected image information for evaluation. 
The categories are used to uniquely identify different objects in the images.", + "type": "comment" + }, + "5618": { + "file_id": 459, + "content": " for evaluation.\n \"\"\"\n @abstractmethod\n def evaluate(self):\n \"\"\"Evaluates detections and returns a dictionary of metrics.\"\"\"\n @abstractmethod\n def clear(self):\n \"\"\"Clears the state to prepare for a fresh evaluation.\"\"\"\nclass ObjectDetectionEvaluator(DetectionEvaluator):\n \"\"\"A class to evaluate detections.\"\"\"\n def __init__(\n self,\n categories,\n matching_iou_threshold=0.5,\n evaluate_corlocs=False,\n metric_prefix=None,\n use_weighted_mean_ap=False,\n evaluate_masks=False,\n ):\n \"\"\"Constructor.\n Args:\n categories: A list of dicts, each of which has the following keys -\n 'id': (required) an integer id uniquely identifying this\n category.\n 'name': (required) string representing category name e.g.,\n 'cat', 'dog'.\n matching_iou_threshold: IOU threshold to use for matching\n groundtruth boxes to detection boxes.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:87-120" + }, + "5619": { + "file_id": 459, + "content": "This code defines an `ObjectDetectionEvaluator` class that evaluates object detection results. It takes categories, IOU threshold, options for evaluating corner localizations and masks. The `evaluate()` method returns a dictionary of metrics, while the `clear()` method clears the state for a new evaluation.", + "type": "comment" + }, + "5620": { + "file_id": 459, + "content": " evaluate_corlocs: (optional) boolean which determines if corloc\n scores are to be returned or not.\n metric_prefix: (optional) string prefix for metric name; if None,\n no prefix is used.\n use_weighted_mean_ap: (optional) boolean which determines if the\n mean average precision is computed directly from the scores and\n tp_fp_labels of all classes.\n evaluate_masks: If False, evaluation will be performed based on\n boxes. If True, mask evaluation will be performed instead.\n Raises:\n ValueError: If the category ids are not 1-indexed.\n \"\"\"\n super(ObjectDetectionEvaluator, self).__init__(categories)\n self._num_classes = max([cat['id'] for cat in categories])\n if min(cat['id'] for cat in categories) < 1:\n raise ValueError('Classes should be 1-indexed.')\n self._matching_iou_threshold = matching_iou_threshold\n self._use_weighted_mean_ap = use_weighted_mean_ap", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:121-139" + }, + "5621": { + "file_id": 459, + "content": "This code is initializing the ObjectDetectionEvaluator class, which evaluates object detection performance. It takes in optional parameters for corloc scores, metric prefix, and weighted mean AP computation. 
It checks if category IDs are 1-indexed and raises a ValueError if not.", + "type": "comment" + }, + "5622": { + "file_id": 459, + "content": " self._label_id_offset = 1\n self._evaluate_masks = evaluate_masks\n self._evaluation = ObjectDetectionEvaluation(\n num_groundtruth_classes=self._num_classes,\n matching_iou_threshold=self._matching_iou_threshold,\n use_weighted_mean_ap=self._use_weighted_mean_ap,\n label_id_offset=self._label_id_offset,\n )\n self._image_ids = set([])\n self._evaluate_corlocs = evaluate_corlocs\n self._metric_prefix = (metric_prefix + '_') if metric_prefix else ''\n def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):\n \"\"\"Adds groundtruth for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n groundtruth_dict: A dictionary containing -\n standard_fields.InputDataFields.groundtruth_boxes: float32\n numpy array of shape [num_boxes, 4] containing `num_boxes`\n groundtruth boxes of the format [ymin, xmin, ymax, xmax] in", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:140-160" + }, + "5623": { + "file_id": 459, + "content": "This code is initializing an object detection evaluation module, specifically for the Aggregated Average Precision metric. The module takes in parameters such as the number of ground truth classes, matching IOU threshold, and a label offset. It also adds a single image's ground truth information for evaluation purposes. The function expects an image ID and a dictionary containing ground truth boxes information.", + "type": "comment" + }, + "5624": { + "file_id": 459, + "content": " absolute image coordinates.\n standard_fields.InputDataFields.groundtruth_classes: integer\n numpy array of shape [num_boxes] containing 1-indexed\n groundtruth classes for the boxes.\n standard_fields.InputDataFields.groundtruth_difficult: Optional\n length M numpy boolean array denoting whether a ground\n truth box is a difficult instance or not. This field is\n optional to support the case that no boxes are difficult.\n standard_fields.InputDataFields.groundtruth_instance_masks:\n Optional numpy array of shape [num_boxes, height, width]\n with values in {0, 1}.\n Raises:\n ValueError: On adding groundtruth for an image more than once. Will\n also raise error if instance masks are not in groundtruth\n dictionary.\n \"\"\"\n if image_id in self._image_ids:\n raise ValueError(", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:161-179" + }, + "5625": { + "file_id": 459, + "content": "The code is defining the `add_groundtruth` function that takes in an image ID and a groundtruth dictionary. It checks if the image ID has been added before, and raises a ValueError if it has. The groundtruth dictionary should contain 'boxes', 'groundtruth_classes', 'groundtruth_difficult' (optional), and 'groundtruth_instance_masks' (if difficult instances). If the groundtruth is valid, it adds the information to the _image_ids set and initializes corresponding arrays for that image ID. 
If instance masks are not in the groundtruth dictionary, it raises a ValueError.", + "type": "comment" + }, + "5626": { + "file_id": 459, + "content": " 'Image with id {} already added.'.format(image_id))\n groundtruth_classes = (\n groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_classes] -\n self._label_id_offset)\n # If the key is not present in the groundtruth_dict or the array is\n # empty (unless there are no annotations for the groundtruth on this\n # image) use values from the dictionary or insert None otherwise.\n if (standard_fields.InputDataFields.groundtruth_difficult\n in groundtruth_dict.keys()) and (groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_difficult].size\n or\n not groundtruth_classes.size):\n groundtruth_difficult = groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_difficult]\n else:\n groundtruth_difficult = None\n if not len(self._image_ids) % 1000:", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:180-198" + }, + "5627": { + "file_id": 459, + "content": "This code is checking if an image with a specific id already exists. If not, it retrieves the groundtruth classes and difficult labels from the dictionary, either from existing keys or by setting them to None if not present or empty. It also checks if the image_id is already added to avoid duplicates.", + "type": "comment" + }, + "5628": { + "file_id": 459, + "content": " logging.warn(('image %s does not have groundtruth difficult '\n 'flag specified'), image_id)\n groundtruth_masks = None\n if self._evaluate_masks:\n if (standard_fields.InputDataFields.groundtruth_instance_masks\n not in groundtruth_dict):\n raise ValueError(\n 'Instance masks not in groundtruth dictionary.')\n groundtruth_masks = groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_instance_masks]\n self._evaluation.add_single_ground_truth_image_info(\n image_key=image_id,\n groundtruth_boxes=groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_boxes],\n groundtruth_class_labels=groundtruth_classes,\n groundtruth_is_difficult_list=groundtruth_difficult,\n groundtruth_masks=groundtruth_masks,\n )\n self._image_ids.update([image_id])\n def add_single_detected_image_info(self, image_id, detections_dict):", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:199-219" + }, + "5629": { + "file_id": 459, + "content": "This code block checks if the ground truth difficult flag is specified for an image and raises a warning if not. It then adds single ground truth image information, including bounding boxes, class labels, and mask (if available), to the evaluation object. 
This allows for evaluating the performance of the object detection model on the given image.", + "type": "comment" + }, + "5630": { + "file_id": 459, + "content": " \"\"\"Adds detections for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n detections_dict: A dictionary containing -\n standard_fields.DetectionResultFields.detection_boxes: float32\n numpy array of shape [num_boxes, 4] containing `num_boxes`\n detection boxes of the format [ymin, xmin, ymax, xmax] in\n absolute image coordinates.\n standard_fields.DetectionResultFields.detection_scores: float32\n numpy array of shape [num_boxes] containing detection\n scores for the boxes.\n standard_fields.DetectionResultFields.detection_classes:\n integer numpy array of shape [num_boxes] containing\n 1-indexed detection classes for the boxes.\n standard_fields.DetectionResultFields.detection_masks: uint8\n numpy array of shape [num_boxes, height, width] containing", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:220-236" + }, + "5631": { + "file_id": 459, + "content": "This code snippet adds detections for a single image to be used in evaluation. It takes in image_id and a dictionary containing detection boxes, scores, classes, and masks as input. The detection boxes are represented by a float32 numpy array of shape [num_boxes, 4] with the format [ymin, xmin, ymax, xmax] in absolute image coordinates. Detection scores and classes are integer numpy arrays representing the scores and classes for each box respectively, while detection masks are represented by a uint8 numpy array of shape [num_boxes, height, width].", + "type": "comment" + }, + "5632": { + "file_id": 459, + "content": " `num_boxes` masks of values ranging between 0 and 1.\n Raises:\n ValueError: If detection masks are not in detections dictionary.\n \"\"\"\n detection_classes = (\n detections_dict[\n standard_fields.DetectionResultFields.detection_classes] -\n self._label_id_offset)\n detection_masks = None\n if self._evaluate_masks:\n if (standard_fields.DetectionResultFields.detection_masks\n not in detections_dict):\n raise ValueError(\n 'Detection masks not in detections dictionary.')\n detection_masks = detections_dict[\n standard_fields.DetectionResultFields.detection_masks]\n self._evaluation.add_single_detected_image_info(\n image_key=image_id,\n detected_boxes=detections_dict[\n standard_fields.DetectionResultFields.detection_boxes],\n detected_scores=detections_dict[\n standard_fields.DetectionResultFields.detection_scores],", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:237-259" + }, + "5633": { + "file_id": 459, + "content": "This code block retrieves detection classes and masks from the \"detections_dict\" dictionary. If _evaluate_Masks is True, it checks if detection masks are present in detections_dict. If not, it raises a ValueError. 
Then, it adds single detected image information to _evaluation using detected boxes, scores, and (optionally) detection masks.", + "type": "comment" + }, + "5634": { + "file_id": 459, + "content": " detected_class_labels=detection_classes,\n detected_masks=detection_masks,\n )\n def create_category_index(self, categories):\n \"\"\"Creates dictionary of COCO compatible categories keyed by category\n id.\n Args:\n categories: a list of dicts, each of which has the following keys:\n 'id': (required) an integer id uniquely identifying this\n category.\n 'name': (required) string representing category name\n e.g., 'cat', 'dog', 'pizza'.\n Returns:\n category_index: a dict containing the same entries as categories,\n but keyed by the 'id' field of each category.\n \"\"\"\n category_index = {}\n for cat in categories:\n category_index[cat['id']] = cat\n return category_index\n def evaluate(self):\n \"\"\"Compute evaluation result.\n Returns:\n A dictionary of metrics with the following fields -\n 1. summary_metrics:", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:260-290" + }, + "5635": { + "file_id": 459, + "content": "This code is related to object detection evaluation in the AVA dataset. The `create_category_index` function creates a dictionary of COCO compatible categories, keyed by category id. The `evaluate` function computes the evaluation results, returning a dictionary of metrics including summary_metrics. The code also uses `detection_classes` and `detection_masks` for evaluation.", + "type": "comment" + }, + "5636": { + "file_id": 459, + "content": " 'Precision/mAP@IOU': mean average\n precision at the specified IOU threshold\n 2. per_category_ap: category specific results with keys of the form\n 'PerformanceByCategory/mAP@IOU/category'\n \"\"\"\n (\n per_class_ap,\n mean_ap,\n _,\n _,\n per_class_corloc,\n mean_corloc,\n ) = self._evaluation.evaluate()\n metric = f'mAP@{self._matching_iou_threshold}IOU'\n pascal_metrics = {self._metric_prefix + metric: mean_ap}\n if self._evaluate_corlocs:\n pascal_metrics[self._metric_prefix +\n 'Precision/meanCorLoc@{}IOU'.format(\n self._matching_iou_threshold)] = mean_corloc\n category_index = self.create_category_index(self._categories)\n for idx in range(per_class_ap.size):\n if idx + self._label_id_offset in category_index:\n display_name = (", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:291-315" + }, + "5637": { + "file_id": 459, + "content": "This code calculates the mean average precision (mAP) and optionally, the mean correct localization score (meanCorLoc), at a specified IOU threshold for object detection evaluation. It creates metrics under different categories using category-specific results. 
The calculated values are then stored in the pascal_metrics dictionary.", + "type": "comment" + }, + "5638": { + "file_id": 459, + "content": " self._metric_prefix +\n 'PerformanceByCategory/AP@{}IOU/{}'.format(\n self._matching_iou_threshold,\n category_index[idx + self._label_id_offset]['name'],\n ))\n pascal_metrics[display_name] = per_class_ap[idx]\n # Optionally add CorLoc metrics.classes\n if self._evaluate_corlocs: #False\n display_name = (\n self._metric_prefix +\n 'PerformanceByCategory/CorLoc@{}IOU/{}'.format(\n self._matching_iou_threshold,\n category_index[idx +\n self._label_id_offset]['name'],\n ))\n pascal_metrics[display_name] = per_class_corloc[idx]\n return pascal_metrics\n def clear(self):\n \"\"\"Clears the state to prepare for a fresh evaluation.\"\"\"\n self._evaluation = ObjectDetectionEvaluation(", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:316-338" + }, + "5639": { + "file_id": 459, + "content": "This code calculates average precision (AP) and optional correct localization (CorLoc) metrics for object detection by category. It appends these metrics to the pascal_metrics dictionary based on the matching IOU threshold and category names from the category_index. The clear() function resets the evaluation state for a new evaluation.", + "type": "comment" + }, + "5640": { + "file_id": 459, + "content": " num_groundtruth_classes=self._num_classes,\n matching_iou_threshold=self._matching_iou_threshold,\n use_weighted_mean_ap=self._use_weighted_mean_ap,\n label_id_offset=self._label_id_offset,\n )\n self._image_ids.clear()\nclass PascalDetectionEvaluator(ObjectDetectionEvaluator):\n \"\"\"A class to evaluate detections using PASCAL metrics.\"\"\"\n def __init__(self, categories, matching_iou_threshold=0.5):\n super(PascalDetectionEvaluator, self).__init__(\n categories,\n matching_iou_threshold=matching_iou_threshold,\n evaluate_corlocs=False,\n use_weighted_mean_ap=False,\n )\nObjectDetectionEvalMetrics = collections.namedtuple(\n 'ObjectDetectionEvalMetrics',\n [\n 'average_precisions',\n 'mean_ap',\n 'precisions',\n 'recalls',\n 'corlocs',\n 'mean_corloc',\n ],\n)\nclass ObjectDetectionEvaluation:\n \"\"\"Internal implementation of Pascal object detection metrics.\"\"\"\n def __init__(", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:339-375" + }, + "5641": { + "file_id": 459, + "content": "The given code is a part of an object detection evaluation module in PaddleVideo. It defines classes and functions to evaluate detections using PASCAL metrics. The ObjectDetectionEvaluator class initializes with categories, matching IoU threshold, evaluating corlocs flag, and use weighted mean AP flag. PascalDetectionEvaluator is a subclass of ObjectDetectionEvaluator specifically for PASCAL evaluation. 
The code also defines the ObjectDetectionEvalMetrics namedtuple which includes average_precisions, mean_ap, precisions, recalls, corlocs, and mean_corloc attributes.", + "type": "comment" + }, + "5642": { + "file_id": 459, + "content": " self,\n num_groundtruth_classes,\n matching_iou_threshold=0.5,\n nms_iou_threshold=1.0,\n nms_max_output_boxes=10000,\n use_weighted_mean_ap=False,\n label_id_offset=0,\n ):\n if num_groundtruth_classes < 1:\n raise ValueError(\n 'Need at least 1 groundtruth class for evaluation.')\n self.per_image_eval = per_image_evaluation.PerImageEvaluation(\n num_groundtruth_classes=num_groundtruth_classes,\n matching_iou_threshold=matching_iou_threshold,\n )\n self.num_class = num_groundtruth_classes\n self.use_weighted_mean_ap = use_weighted_mean_ap\n self.label_id_offset = label_id_offset\n self.groundtruth_boxes = {}\n self.groundtruth_class_labels = {}\n self.groundtruth_masks = {}\n self.groundtruth_is_difficult_list = {}\n self.groundtruth_is_group_of_list = {}\n self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int)\n self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int)", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:376-402" + }, + "5643": { + "file_id": 459, + "content": "This function initializes the necessary attributes for object detection evaluation. It requires 'self', number of ground truth classes, matching and nms iou thresholds, maximum output boxes, whether to use weighted mean AP, label offset, and sets up dictionaries to store ground truth information. It also initializes counters for the number of instances and images per class.", + "type": "comment" + }, + "5644": { + "file_id": 459, + "content": " self._initialize_detections()\n def _initialize_detections(self):\n self.detection_keys = set()\n self.scores_per_class = [[] for _ in range(self.num_class)]\n self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)]\n self.num_images_correctly_detected_per_class = np.zeros(self.num_class)\n self.average_precision_per_class = np.empty(\n self.num_class, dtype=float)\n self.average_precision_per_class.fill(np.nan)\n self.precisions_per_class = []\n self.recalls_per_class = []\n self.corloc_per_class = np.ones(self.num_class, dtype=float)\n def clear_detections(self):\n self._initialize_detections()\n def add_single_ground_truth_image_info(\n self,\n image_key,\n groundtruth_boxes,\n groundtruth_class_labels,\n groundtruth_is_difficult_list=None,\n groundtruth_is_group_of_list=None,\n groundtruth_masks=None,\n ):\n \"\"\"Adds groundtruth for a single image to be used for evaluation.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:404-430" + }, + "5645": { + "file_id": 459, + "content": "This code initializes the detection variables and provides functions to clear detections, add single ground truth image info, and perform evaluation. 
The average precision per class is initialized with nan values, and these functions manage the data for object detection evaluation.", + "type": "comment" + }, + "5646": { + "file_id": 459, + "content": " Args:\n image_key: A unique string/integer identifier for the image.\n groundtruth_boxes: float32 numpy array of shape [num_boxes, 4]\n containing `num_boxes` groundtruth boxes of the format\n [ymin, xmin, ymax, xmax] in absolute image coordinates.\n groundtruth_class_labels: integer numpy array of shape [num_boxes]\n containing 0-indexed groundtruth classes for the boxes.\n groundtruth_is_difficult_list: A length M numpy boolean array\n denoting whether a ground truth box is a difficult instance or\n not. To support the case that no boxes are difficult, it is by\n default set as None.\n groundtruth_is_group_of_list: A length M numpy boolean array\n denoting whether a ground truth box is a group-of box or not.\n To support the case that no boxes are groups-of, it is by\n default set as None.\n groundtruth_masks: uint8 numpy array of shape", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:432-447" + }, + "5647": { + "file_id": 459, + "content": "The function takes in an image identifier, ground truth boxes coordinates, class labels for the boxes, a boolean array denoting difficult instances, and another boolean array for group-of boxes. It calculates average precision and recall for object detection using these inputs. The function also supports cases where no boxes are difficult or groups-of.", + "type": "comment" + }, + "5648": { + "file_id": 459, + "content": " [num_boxes, height, width] containing `num_boxes` groundtruth\n masks. The mask values range from 0 to 1.\n \"\"\"\n if image_key in self.groundtruth_boxes:\n logging.warn(('image %s has already been added to the ground '\n 'truth database.'), image_key)\n return\n self.groundtruth_boxes[image_key] = groundtruth_boxes\n self.groundtruth_class_labels[image_key] = groundtruth_class_labels\n self.groundtruth_masks[image_key] = groundtruth_masks\n if groundtruth_is_difficult_list is None:\n num_boxes = groundtruth_boxes.shape[0]\n groundtruth_is_difficult_list = np.zeros(num_boxes, dtype=bool)\n self.groundtruth_is_difficult_list[\n image_key] = groundtruth_is_difficult_list.astype(dtype=bool)\n if groundtruth_is_group_of_list is None:\n num_boxes = groundtruth_boxes.shape[0]\n groundtruth_is_group_of_list = np.zeros(num_boxes, dtype=bool)\n self.groundtruth_is_group_of_list[", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:448-467" + }, + "5649": { + "file_id": 459, + "content": "This function adds the ground truth boxes, class labels, and masks to the database for a given image key. If the groundtruth_is_difficult_list or groundtruth_is_group_of_list are None, it creates them with default values. 
It stores these lists as well in the database for the specified image key.", + "type": "comment" + }, + "5650": { + "file_id": 459, + "content": " image_key] = groundtruth_is_group_of_list.astype(dtype=bool)\n self._update_ground_truth_statistics(\n groundtruth_class_labels,\n groundtruth_is_difficult_list.astype(dtype=bool),\n groundtruth_is_group_of_list.astype(dtype=bool),\n )\n def add_single_detected_image_info(\n self,\n image_key,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks=None,\n ):\n \"\"\"Adds detections for a single image to be used for evaluation.\n Args:\n image_key: A unique string/integer identifier for the image.\n detected_boxes: float32 numpy array of shape [num_boxes, 4]\n containing `num_boxes` detection boxes of the format\n [ymin, xmin, ymax, xmax] in absolute image coordinates.\n detected_scores: float32 numpy array of shape [num_boxes]\n containing detection scores for the boxes.\n detected_class_labels: integer numpy array of shape [num_boxes]", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:468-493" + }, + "5651": { + "file_id": 459, + "content": "This function adds detections for a single image to be used for evaluation. It requires an image key, detected boxes, detected scores, and detected class labels as input. The detected boxes should be in the format [ymin, xmin, ymax, xmax] and the detected scores and detected class labels should be numpy arrays of the specified shapes. The function calls a _update_ground_truth_statistics method with groundtruth class labels, difficult list, and group of list as input. This method updates the ground truth statistics for evaluation.", + "type": "comment" + }, + "5652": { + "file_id": 459, + "content": " containing 0-indexed detection classes for the boxes.\n detected_masks: np.uint8 numpy array of shape\n [num_boxes, height, width] containing `num_boxes` detection\n masks with values ranging between 0 and 1.\n Raises:\n ValueError: if the number of boxes, scores and class labels differ\n in length.\n \"\"\"\n if len(detected_boxes) != len(detected_scores) or len(\n detected_boxes) != len(detected_class_labels):\n raise ValueError(\n 'detected_boxes, detected_scores and '\n 'detected_class_labels should all have same lengths. Got'\n '[%d, %d, %d]' % len(detected_boxes),\n len(detected_scores),\n len(detected_class_labels),\n )\n if image_key in self.detection_keys:\n logging.warn(('image %s has already been added to the ground '\n 'truth database.'), image_key)\n return", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:494-516" + }, + "5653": { + "file_id": 459, + "content": "This code validates the detection inputs: it raises a ValueError if detected_boxes, detected_scores and detected_class_labels do not all have the same length. If an image key already exists in the detection keys list, it logs a warning message and returns without adding the image to the database.", + "type": "comment" + }, + "5654": { + "file_id": 459, + "content": " self.detection_keys.add(image_key)\n if image_key in self.groundtruth_boxes:\n groundtruth_boxes = self.groundtruth_boxes[image_key]\n groundtruth_class_labels = self.groundtruth_class_labels[image_key]\n # Masks are popped instead of look up. 
The reason is that we do not\n # want to keep all masks in memory which can cause memory overflow.\n groundtruth_masks = self.groundtruth_masks.pop(image_key)\n groundtruth_is_difficult_list = self.groundtruth_is_difficult_list[\n image_key]\n groundtruth_is_group_of_list = self.groundtruth_is_group_of_list[\n image_key]\n else:\n groundtruth_boxes = np.empty(shape=[0, 4], dtype=float)\n groundtruth_class_labels = np.array([], dtype=int)\n if detected_masks is None:\n groundtruth_masks = None\n else:\n groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float)\n groundtruth_is_difficult_list = np.array([], dtype=bool)", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:518-536" + }, + "5655": { + "file_id": 459, + "content": "This code is initializing ground truth values for object detection evaluation. If an image key exists in the ground truth boxes dictionary, it retrieves the corresponding ground truth values (boxes, class labels, masks) and removes them from their respective dictionaries to avoid memory overflow. If no image key exists, it initializes empty arrays or None values for the ground truth values.", + "type": "comment" + }, + "5656": { + "file_id": 459, + "content": " groundtruth_is_group_of_list = np.array([], dtype=bool)\n (\n scores,\n tp_fp_labels,\n ) = self.per_image_eval.compute_object_detection_metrics(\n detected_boxes=detected_boxes,\n detected_scores=detected_scores,\n detected_class_labels=detected_class_labels,\n groundtruth_boxes=groundtruth_boxes,\n groundtruth_class_labels=groundtruth_class_labels,\n groundtruth_is_difficult_list=groundtruth_is_difficult_list,\n groundtruth_is_group_of_list=groundtruth_is_group_of_list,\n detected_masks=detected_masks,\n groundtruth_masks=groundtruth_masks,\n )\n for i in range(self.num_class):\n if scores[i].shape[0] > 0:\n self.scores_per_class[i].append(scores[i])\n self.tp_fp_labels_per_class[i].append(tp_fp_labels[i])\n def _update_ground_truth_statistics(\n self,\n groundtruth_class_labels,\n groundtruth_is_difficult_list,", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:537-561" + }, + "5657": { + "file_id": 459, + "content": "This code is part of the PaddleVideo library and it computes object detection metrics. It takes in detected boxes, scores, class labels, ground truth boxes, class labels, masks etc., and calculates true positive and false positive labels for each image. The computed values are then stored per class in separate lists (scores_per_class and tp_fp_labels_per_class). Additionally, the function updates ground truth statistics by appending new ground truth class labels and difficult list to existing ones.", + "type": "comment" + }, + "5658": { + "file_id": 459, + "content": " groundtruth_is_group_of_list,\n ):\n \"\"\"Update grouth truth statitistics.\n 1. Difficult boxes are ignored when counting the number of ground truth\n instances as done in Pascal VOC devkit.\n 2. 
Difficult boxes are treated as normal boxes when computing CorLoc\n related statitistics.\n Args:\n groundtruth_class_labels: An integer numpy array of length M,\n representing M class labels of object instances in ground truth\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or\n not\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box is a group-of box or not\n \"\"\"\n for class_index in range(self.num_class):\n num_gt_instances = np.sum(groundtruth_class_labels[\n ~groundtruth_is_difficult_list\n & ~groundtruth_is_group_of_list] == class_index)", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:562-583" + }, + "5659": { + "file_id": 459, + "content": "This function updates ground truth statistics for object detection by counting instances, excluding difficult boxes and treating them as normal ones for CorLoc computations. It iterates through class indices to determine the number of instances for each class label, excluding difficult or group-of boxes.", + "type": "comment" + }, + "5660": { + "file_id": 459, + "content": " self.num_gt_instances_per_class[class_index] += num_gt_instances\n if np.any(groundtruth_class_labels == class_index):\n self.num_gt_imgs_per_class[class_index] += 1\n def evaluate(self):\n \"\"\"Compute evaluation result.\n Returns:\n A named tuple with the following fields -\n average_precision: float numpy array of average precision for\n each class.\n mean_ap: mean average precision of all classes, float scalar\n precisions: List of precisions, each precision is a float numpy\n array\n recalls: List of recalls, each recall is a float numpy array\n corloc: numpy float array\n mean_corloc: Mean CorLoc score for each class, float scalar\n \"\"\"\n if (self.num_gt_instances_per_class == 0).any():\n print(\n 'The following classes have no ground truth examples: %s',\n np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) +", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:584-605" + }, + "5661": { + "file_id": 459, + "content": "The code calculates average precision, mean average precision, precisions, recalls, and CorLoc scores for object detection evaluation. It checks if any ground truth instances exist for each class and returns a named tuple with evaluation results. 
If there are classes with no ground truth examples, it prints a warning message.", + "type": "comment" + }, + "5662": { + "file_id": 459, + "content": " self.label_id_offset, \"self.detection_keys:\",self.detection_keys\n )\n if self.use_weighted_mean_ap:\n all_scores = np.array([], dtype=float)\n all_tp_fp_labels = np.array([], dtype=bool)\n for class_index in range(self.num_class):\n if self.num_gt_instances_per_class[class_index] == 0:\n continue\n if not self.scores_per_class[class_index]:\n scores = np.array([], dtype=float)\n tp_fp_labels = np.array([], dtype=bool)\n else:\n scores = np.concatenate(self.scores_per_class[class_index])\n tp_fp_labels = np.concatenate(\n self.tp_fp_labels_per_class[class_index])\n if self.use_weighted_mean_ap:\n all_scores = np.append(all_scores, scores)\n all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels)\n precision, recall = metrics.compute_precision_recall(\n scores,\n tp_fp_labels,", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:606-629" + }, + "5663": { + "file_id": 459, + "content": "This code is part of a class that performs object detection evaluation for the AVA (Atomic Visual Actions) benchmark. It checks for the number of ground truth instances per class and concatenates scores and true positive/false positive labels per class. If weighted mean average precision (AP) calculation is enabled, it appends the scores and labels to the total arrays. The code uses the compute_precision_recall function from the metrics module to calculate precision and recall values.", + "type": "comment" + }, + "5664": { + "file_id": 459, + "content": " self.num_gt_instances_per_class[class_index],\n )\n self.precisions_per_class.append(precision)\n self.recalls_per_class.append(recall)\n average_precision = metrics.compute_average_precision(\n precision, recall)\n self.average_precision_per_class[class_index] = average_precision\n self.corloc_per_class = metrics.compute_cor_loc(\n self.num_gt_imgs_per_class,\n self.num_images_correctly_detected_per_class,\n )\n if self.use_weighted_mean_ap:\n num_gt_instances = np.sum(self.num_gt_instances_per_class)\n precision, recall = metrics.compute_precision_recall(\n all_scores, all_tp_fp_labels, num_gt_instances)\n mean_ap = metrics.compute_average_precision(precision, recall)\n else:\n mean_ap = np.nanmean(self.average_precision_per_class)\n mean_corloc = np.nanmean(self.corloc_per_class)\n return ObjectDetectionEvalMetrics(", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:630-651" + }, + "5665": { + "file_id": 459, + "content": "This function calculates average precision and CorLoc (correct localization) for object detection evaluation. It stores the precision, recall, average precision per class, and CorLoc per class. If weighted mean AP is enabled, it computes precision, recall, mean AP, and mean CorLoc.", + "type": "comment" + }, + "5666": { + "file_id": 459, + "content": " self.average_precision_per_class,\n mean_ap,\n self.precisions_per_class,\n self.recalls_per_class,\n self.corloc_per_class,\n mean_corloc,\n )", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:652-658" + }, + "5667": { + "file_id": 459, + "content": "This code snippet appears to be part of a class function that returns several evaluation metrics for object detection. 
The metrics include average precision per class, mean average precision, precisions and recalls per class, and mean corloc values. These metrics are commonly used in evaluating object detection models' performance.", + "type": "comment" + }, + "5668": { + "file_id": 460, + "content": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py", + "type": "filepath" + }, + "5669": { + "file_id": 460, + "content": "The code measures object detection performance, handles class labels and non-maximum suppression, and calculates true/false positives using an IoU threshold. It is used for AVA evaluation and contains functions to select class-specific data, remove invalid boxes, and filter input arrays.", + "type": "summary" + }, + "5670": { + "file_id": 460, + "content": "# Copyright 2017 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"Evaluate Object Detection result on a single image.\nAnnotate each detected result as true positives or false positive according to\na predefined IOU ratio. Non Maximum Supression is used by default. Multi class\ndetection is supported by default. Based on the settings, per image evaluation\nis either performed on boxes or on object masks.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:1-20" + }, + "5671": { + "file_id": 460, + "content": "This code file is for evaluating object detection results on a single image. It determines true positives or false positives based on a predefined IOU ratio. Non Maximum Supression and multi-class detection are supported. The evaluation can be performed either on boxes or object masks, depending on the settings.", + "type": "comment" + }, + "5672": { + "file_id": 460, + "content": "\"\"\"\nimport numpy as np\nfrom . import np_box_list, np_box_ops\nclass PerImageEvaluation:\n \"\"\"Evaluate detection result of a single image.\"\"\"\n def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5):\n \"\"\"Initialized PerImageEvaluation by evaluation parameters.\n Args:\n num_groundtruth_classes: Number of ground truth object classes\n matching_iou_threshold: A ratio of area intersection to union,\n which is the threshold to consider whether a detection is true\n positive or not\n \"\"\"\n self.matching_iou_threshold = matching_iou_threshold\n self.num_groundtruth_classes = num_groundtruth_classes\n def compute_object_detection_metrics(\n self,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_class_labels,\n groundtruth_is_difficult_list,\n groundtruth_is_group_of_list,\n detected_masks=None,\n groundtruth_masks=None,", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:21-53" + }, + "5673": { + "file_id": 460, + "content": "This code initializes a class for evaluating detection results of a single image. 
It takes in parameters such as the number of ground truth classes and matching IOU threshold, and computes object detection metrics using detected boxes, scores, class labels, etc.", + "type": "comment" + }, + "5674": { + "file_id": 460, + "content": " ):\n \"\"\"Evaluates detections as being tp, fp or ignored from a single image.\n The evaluation is done in two stages:\n 1. All detections are matched to non group-of boxes; true positives\n are determined and detections matched to difficult boxes are\n ignored.\n 2. Detections that are determined as false positives are matched\n against group-of boxes and ignored if matched.\n Args:\n detected_boxes: A float numpy array of shape [N, 4], representing N\n regions of detected object regions.\n Each row is of the format [y_min, x_min, y_max, x_max]\n detected_scores: A float numpy array of shape [N, 1], representing\n the confidence scores of the detected N object instances.\n detected_class_labels: A integer numpy array of shape [N, 1],\n repreneting the class labels of the detected N object\n instances.\n groundtruth_boxes: A float numpy array of shape [M, 4],", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:54-73" + }, + "5675": { + "file_id": 460, + "content": "This function evaluates detections as true positives, false positives or ignored based on the detected and ground truth boxes. It works in two stages: 1) matching all detections to non group-of boxes for true positives, ignoring difficult ones; and 2) ignoring detections matched to group-of boxes. The inputs are numpy arrays of detected boxes, scores, class labels, and ground truth boxes.", + "type": "comment" + }, + "5676": { + "file_id": 460, + "content": " representing M regions of object instances in ground truth\n groundtruth_class_labels: An integer numpy array of shape [M, 1],\n representing M class labels of object instances in ground truth\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or\n not\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag\n detected_masks: (optional) A uint8 numpy array of shape\n [N, height, width]. If not None, the metrics will be computed\n based on masks.\n groundtruth_masks: (optional) A uint8 numpy array of shape\n [M, height, width].\n Returns:\n scores: A list of C float numpy arrays. Each numpy array is of\n shape [K, 1], representing K scores detected with object class\n label c", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:74-91" + }, + "5677": { + "file_id": 460, + "content": "This code function accepts multiple parameters including ground truth regions, class labels, difficult instances, group-of tags, optional detected masks and ground truth masks. It returns a list of scores representing K scores detected with object class label c.", + "type": "comment" + }, + "5678": { + "file_id": 460, + "content": " tp_fp_labels: A list of C boolean numpy arrays. 
Each numpy array\n is of shape [K, 1], representing K True/False positive label of\n object instances detected with class label c\n \"\"\"\n (\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks,\n ) = self._remove_invalid_boxes(\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks,\n )\n scores, tp_fp_labels = self._compute_tp_fp(\n detected_boxes=detected_boxes,\n detected_scores=detected_scores,\n detected_class_labels=detected_class_labels,\n groundtruth_boxes=groundtruth_boxes,\n groundtruth_class_labels=groundtruth_class_labels,\n groundtruth_is_difficult_list=groundtruth_is_difficult_list,\n groundtruth_is_group_of_list=groundtruth_is_group_of_list,\n detected_masks=detected_masks,", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:92-115" + }, + "5679": { + "file_id": 460, + "content": "This function is part of the AvaEvaluation class, which evaluates object detection performance in videos. It computes true positive and false positive labels for detected object instances based on ground truth information. The function removes invalid detection boxes before computing the tp_fp_labels. This helps in evaluating the accuracy of detected objects.", + "type": "comment" + }, + "5680": { + "file_id": 460, + "content": " groundtruth_masks=groundtruth_masks,\n )\n return scores, tp_fp_labels\n def _compute_tp_fp(\n self,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_class_labels,\n groundtruth_is_difficult_list,\n groundtruth_is_group_of_list,\n detected_masks=None,\n groundtruth_masks=None,\n ):\n \"\"\"Labels true/false positives of detections of an image across all\n classes.\n Args:\n detected_boxes: A float numpy array of shape [N, 4], representing N\n regions of detected object regions.\n Each row is of the format [y_min, x_min, y_max, x_max]\n detected_scores: A float numpy array of shape [N, 1], representing\n the confidence scores of the detected N object instances.\n detected_class_labels: A integer numpy array of shape [N, 1],\n repreneting the class labels of the detected N object", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:116-143" + }, + "5681": { + "file_id": 460, + "content": "This code calculates true/false positives for object detection in an image across all classes. It takes detected boxes, scores, class labels, ground truth boxes, class labels, and optional masks as input, returning the computed scores and tp_fp_labels. The separate function computes tp_fp for a single image given the above inputs.", + "type": "comment" + }, + "5682": { + "file_id": 460, + "content": " instances.\n groundtruth_boxes: A float numpy array of shape [M, 4],\n representing M regions of object instances in ground truth\n groundtruth_class_labels: An integer numpy array of shape [M, 1],\n representing M class labels of object instances in ground truth\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or\n not\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag\n detected_masks: (optional) A np.uint8 numpy array of shape\n [N, height, width]. If not None, the scores will be computed\n based on masks.\n groundtruth_masks: (optional) A np.uint8 numpy array of shape\n [M, height, width].\n Returns:\n result_scores: A list of float numpy arrays. 
Each numpy array is of", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:144-161" + }, + "5683": { + "file_id": 460, + "content": "The function takes input parameters like instances, groundtruth_boxes, groundtruth_class_labels, groundtruth_is_difficult_list, groundtruth_is_group_of_list, detected_masks and groundtruth_masks. It returns a list of float numpy arrays representing result scores based on these inputs. The function computes scores considering masks if detected_masks is not None and groundtruth_masks if it's not None.", + "type": "comment" + }, + "5684": { + "file_id": 460, + "content": " shape [K, 1], representing K scores detected with object class\n label c\n result_tp_fp_labels: A list of boolean numpy array. Each numpy\n array is of shape [K, 1], representing K True/False positive\n label of object instances detected with class label c\n Raises:\n ValueError: If detected masks is not None but groundtruth masks are\n None, or the other way around.\n \"\"\"\n if detected_masks is not None and groundtruth_masks is None:\n raise ValueError(\n 'Detected masks is available but groundtruth masks is not.')\n if detected_masks is None and groundtruth_masks is not None:\n raise ValueError(\n 'Groundtruth masks is available but detected masks is not.')\n result_scores = []\n result_tp_fp_labels = []\n for i in range(self.num_groundtruth_classes):\n groundtruth_is_difficult_list_at_ith_class = (\n groundtruth_is_difficult_list[groundtruth_class_labels == i])", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:162-183" + }, + "5685": { + "file_id": 460, + "content": "This function checks if both detected_masks and groundtruth_masks are not null. It then initializes result_scores and result_tp_fp_labels lists for storing scores and True/False positive labels of object instances respectively. If only one of the masks is available, it raises a ValueError. This function seems to be part of an AVA evaluation process where it deals with class label c, groundtruth_class_labels, detected_masks, and groundtruth_masks.", + "type": "comment" + }, + "5686": { + "file_id": 460, + "content": " groundtruth_is_group_of_list_at_ith_class = (\n groundtruth_is_group_of_list[groundtruth_class_labels == i])\n (\n gt_boxes_at_ith_class,\n gt_masks_at_ith_class,\n detected_boxes_at_ith_class,\n detected_scores_at_ith_class,\n detected_masks_at_ith_class,\n ) = self._get_ith_class_arrays(detected_boxes, detected_scores,\n detected_masks,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_masks,\n groundtruth_class_labels, i)\n scores, tp_fp_labels = self._compute_tp_fp_for_single_class(\n detected_boxes=detected_boxes_at_ith_class,\n detected_scores=detected_scores_at_ith_class,\n groundtruth_boxes=gt_boxes_at_ith_class,\n groundtruth_is_difficult_list=(", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:184-202" + }, + "5687": { + "file_id": 460, + "content": "The code is extracting per-class arrays for detected and ground truth objects. 
It separates the data into specific classes, computes true positive and false positive labels using a single class function, and assigns them to their respective variables.", + "type": "comment" + }, + "5688": { + "file_id": 460, + "content": " groundtruth_is_difficult_list_at_ith_class),\n groundtruth_is_group_of_list=(\n groundtruth_is_group_of_list_at_ith_class),\n detected_masks=detected_masks_at_ith_class,\n groundtruth_masks=gt_masks_at_ith_class,\n )\n result_scores.append(scores)\n result_tp_fp_labels.append(tp_fp_labels)\n return result_scores, result_tp_fp_labels\n def _get_overlaps_and_scores_box_mode(\n self,\n detected_boxes,\n detected_scores,\n groundtruth_boxes,\n groundtruth_is_group_of_list,\n ):\n \"\"\"Computes overlaps and scores between detected and groudntruth boxes.\n Args:\n detected_boxes: A numpy array of shape [N, 4] representing detected\n box coordinates\n detected_scores: A 1-d numpy array of length N representing\n classification score\n groundtruth_boxes: A numpy array of shape [M, 4] representing\n ground truth box coordinates", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:203-228" + }, + "5689": { + "file_id": 460, + "content": "This code is calculating overlapping regions and scores between detected and ground truth boxes. It's taking in arrays of detected box coordinates, classification scores, ground truth box coordinates, and ground truth group indicators. The code then returns the resultant scores and true positive/false positive labels for each image. This seems to be part of an object detection or instance segmentation evaluation metric.", + "type": "comment" + }, + "5690": { + "file_id": 460, + "content": " groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag. If a\n groundtruth box is group-of box, every detection matching this\n box is ignored.\n Returns:\n iou: A float numpy array of size [num_detected_boxes,\n num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it\n will be None.\n ioa: A float numpy array of size [num_detected_boxes,\n num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will\n be None.\n scores: The score of the detected boxlist.\n num_boxes: Number of non-maximum suppressed detected boxes.\n \"\"\"\n detected_boxlist = np_box_list.BoxList(detected_boxes)\n detected_boxlist.add_field('scores', detected_scores)\n gt_non_group_of_boxlist = np_box_list.BoxList(\n groundtruth_boxes[~groundtruth_is_group_of_list])\n iou = np_box_ops.iou(detected_boxlist.get(),", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:229-249" + }, + "5691": { + "file_id": 460, + "content": "Code computes intersection over union (IoU) and intersection over area (IoA) between detected boxlists and ground truth boxlists. It also returns the scores of the detected boxes and the number of non-maximum suppressed detected boxes. 
The ground truth is_group_of_list is used to ignore group-of boxes during calculation.", + "type": "comment" + }, + "5692": { + "file_id": 460, + "content": " gt_non_group_of_boxlist.get())\n scores = detected_boxlist.get_field('scores')\n num_boxes = detected_boxlist.num_boxes()\n return iou, None, scores, num_boxes\n def _compute_tp_fp_for_single_class(\n self,\n detected_boxes,\n detected_scores,\n groundtruth_boxes,\n groundtruth_is_difficult_list,\n groundtruth_is_group_of_list,\n detected_masks=None,\n groundtruth_masks=None,\n ):\n \"\"\"Labels boxes detected with the same class from the same image as\n tp/fp.\n Args:\n detected_boxes: A numpy array of shape [N, 4] representing detected\n box coordinates\n detected_scores: A 1-d numpy array of length N representing\n classification score\n groundtruth_boxes: A numpy array of shape [M, 4] representing\n groundtruth box coordinates\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:250-276" + }, + "5693": { + "file_id": 460, + "content": "This function labels boxes detected with the same class from the same image as true positives or false positives. It takes in the detected boxes, scores, ground truth boxes, and other relevant information to perform this labeling task. The output is determined based on the intersection-over-union (IoU) threshold between detected and ground truth boxes. If a detected box has an IoU greater than 0.5 with any ground truth box in the same class and image, it is considered a true positive (tp). Otherwise, it's considered a false positive (fp). The function also computes the number of detected boxes.", + "type": "comment" + }, + "5694": { + "file_id": 460, + "content": " not. If a groundtruth box is difficult, every detection\n matching this box is ignored.\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag. If a\n groundtruth box is group-of box, every detection matching this\n box is ignored.\n detected_masks: (optional) A uint8 numpy array of shape\n [N, height, width]. If not None, the scores will be computed\n based on masks.\n groundtruth_masks: (optional) A uint8 numpy array of shape\n [M, height, width].\n Returns:\n Two arrays of the same size, containing all boxes that were\n evaluated as being true positives or false positives; if a box\n matched to a difficult box or to a group-of box, it is ignored.\n scores: A numpy array representing the detection scores.\n tp_fp_labels: a boolean numpy array indicating whether a detection", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:277-295" + }, + "5695": { + "file_id": 460, + "content": "This function computes true positive (TP) and false positive (FP) labels for detected boxes based on whether they match difficult or group-of ground truth boxes. It returns scores and TP/FP labels, ignoring any detections that match these challenging boxes. 
Optional mask inputs are also supported to compute scores based on pixel-wise comparisons instead of bounding box overlaps.", + "type": "comment" + }, + "5696": { + "file_id": 460, + "content": " is a true positive.\n \"\"\"\n if detected_boxes.size == 0:\n return np.array([], dtype=float), np.array([], dtype=bool)\n (\n iou,\n _,\n scores,\n num_detected_boxes,\n ) = self._get_overlaps_and_scores_box_mode(\n detected_boxes=detected_boxes,\n detected_scores=detected_scores,\n groundtruth_boxes=groundtruth_boxes,\n groundtruth_is_group_of_list=groundtruth_is_group_of_list,\n )\n if groundtruth_boxes.size == 0:\n return scores, np.zeros(num_detected_boxes, dtype=bool)\n tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool)\n is_matched_to_difficult_box = np.zeros(num_detected_boxes, dtype=bool)\n is_matched_to_group_of_box = np.zeros(num_detected_boxes, dtype=bool)\n # The evaluation is done in two stages:\n # 1. All detections are matched to non group-of boxes; true positives\n # are determined and detections matched to difficult boxes are", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:296-322" + }, + "5697": { + "file_id": 460, + "content": "This code checks for true positive detections by first obtaining the Intersection over Union (IoU) and scores between detected boxes and ground truth boxes. If there are no detected or ground truth boxes, it returns empty arrays. Then, it initializes variables to keep track of whether a detection is matched to a difficult box or a group-of box. The code proceeds in two stages: first, all detections are matched to non-group-of boxes, determining true positives, and then detections matched to difficult boxes are identified.", + "type": "comment" + }, + "5698": { + "file_id": 460, + "content": " # ignored.\n # 2. Detections that are determined as false positives are matched\n # against group-of boxes and ignored if matched.\n # Tp-fp evaluation for non-group of boxes (if any).\n if iou.shape[1] > 0:\n groundtruth_nongroup_of_is_difficult_list = (\n groundtruth_is_difficult_list[~groundtruth_is_group_of_list])\n max_overlap_gt_ids = np.argmax(iou, axis=1)\n is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool)\n for i in range(num_detected_boxes):\n gt_id = max_overlap_gt_ids[i]\n if iou[i, gt_id] >= self.matching_iou_threshold:\n if not groundtruth_nongroup_of_is_difficult_list[gt_id]:\n if not is_gt_box_detected[gt_id]:\n tp_fp_labels[i] = True\n is_gt_box_detected[gt_id] = True\n else:\n is_matched_to_difficult_box[i] = True\n return (\n scores[~is_matched_to_difficult_box & ~is_matched_to_group_of_box],", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:323-344" + }, + "5699": { + "file_id": 460, + "content": "This code performs a TP-FP evaluation for non-group of boxes, ignoring difficult ground truth boxes and false positives matched to group-of boxes. 
It checks the IOU between detected boxes and ground truth boxes, and assigns labels accordingly.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/57.json b/docs/data/57.json new file mode 100644 index 000000000..b7745e73b --- /dev/null +++ b/docs/data/57.json @@ -0,0 +1,545 @@ +{ + "5700": { + "file_id": 460, + "content": " tp_fp_labels[~is_matched_to_difficult_box\n & ~is_matched_to_group_of_box],\n )\n def _get_ith_class_arrays(\n self,\n detected_boxes,\n detected_scores,\n detected_masks,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_masks,\n groundtruth_class_labels,\n class_index,\n ):\n \"\"\"Returns numpy arrays belonging to class with index `class_index`.\n Args:\n detected_boxes: A numpy array containing detected boxes.\n detected_scores: A numpy array containing detected scores.\n detected_masks: A numpy array containing detected masks.\n detected_class_labels: A numpy array containing detected class\n labels.\n groundtruth_boxes: A numpy array containing groundtruth boxes.\n groundtruth_masks: A numpy array containing groundtruth masks.\n groundtruth_class_labels: A numpy array containing groundtruth\n class labels.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:345-371" + }, + "5701": { + "file_id": 460, + "content": "This function, _get_ith_class_arrays, takes in various numpy arrays of detected and ground truth boxes, masks, and class labels. It then returns the corresponding numpy arrays for a specific class index.", + "type": "comment" + }, + "5702": { + "file_id": 460, + "content": " class_index: An integer index.\n Returns:\n gt_boxes_at_ith_class: A numpy array containing groundtruth boxes\n labeled as ith class.\n gt_masks_at_ith_class: A numpy array containing groundtruth masks\n labeled as ith class.\n detected_boxes_at_ith_class: A numpy array containing detected\n boxes corresponding to the ith class.\n detected_scores_at_ith_class: A numpy array containing detected\n scores corresponding to the ith class.\n detected_masks_at_ith_class: A numpy array containing detected\n masks corresponding to the ith class.\n \"\"\"\n selected_groundtruth = groundtruth_class_labels == class_index\n gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth]\n if groundtruth_masks is not None:\n gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth]\n else:\n gt_masks_at_ith_class = None\n selected_detections = detected_class_labels == class_index", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:372-392" + }, + "5703": { + "file_id": 460, + "content": "This function returns ground truth boxes, masks (if provided), detected boxes, scores, and masks for a given class index. 
It selects the data corresponding to the class index from input arrays and returns them in separate numpy arrays.", + "type": "comment" + }, + "5704": { + "file_id": 460, + "content": " detected_boxes_at_ith_class = detected_boxes[selected_detections]\n detected_scores_at_ith_class = detected_scores[selected_detections]\n if detected_masks is not None:\n detected_masks_at_ith_class = detected_masks[selected_detections]\n else:\n detected_masks_at_ith_class = None\n return (\n gt_boxes_at_ith_class,\n gt_masks_at_ith_class,\n detected_boxes_at_ith_class,\n detected_scores_at_ith_class,\n detected_masks_at_ith_class,\n )\n def _remove_invalid_boxes(\n self,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks=None,\n ):\n \"\"\"Removes entries with invalid boxes.\n A box is invalid if either its xmax is smaller than its xmin, or its\n ymax is smaller than its ymin.\n Args:\n detected_boxes: A float numpy array of size [num_boxes, 4]\n containing box coordinates in [ymin, xmin, ymax, xmax] format.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:393-421" + }, + "5705": { + "file_id": 460, + "content": "This code defines two functions: \n1. _get_ith_class_arrays: Extracts the arrays belonging to a given class index and returns them in a tuple.\n2. _remove_invalid_boxes: Removes entries with invalid boxes from the given data. An invalid box is one where xmax < xmin or ymax < ymin.", + "type": "comment" + }, + "5706": { + "file_id": 460, + "content": " detected_scores: A float numpy array of size [num_boxes].\n detected_class_labels: A int32 numpy array of size [num_boxes].\n detected_masks: A uint8 numpy array of size\n [num_boxes, height, width].\n Returns:\n valid_detected_boxes: A float numpy array of size\n [num_valid_boxes, 4] containing box coordinates in\n [ymin, xmin, ymax, xmax] format.\n valid_detected_scores: A float numpy array of size\n [num_valid_boxes].\n valid_detected_class_labels: A int32 numpy array of size\n [num_valid_boxes].\n valid_detected_masks: A uint8 numpy array of size\n [num_valid_boxes, height, width].\n \"\"\"\n valid_indices = np.logical_and(\n detected_boxes[:, 0] < detected_boxes[:, 2],\n detected_boxes[:, 1] < detected_boxes[:, 3],\n )\n detected_boxes = detected_boxes[valid_indices]\n detected_scores = detected_scores[valid_indices]", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:422-443" + }, + "5707": { + "file_id": 460, + "content": "This function performs a filtering operation on the input arrays (detected_boxes, detected_scores, detected_class_labels, and detected_masks). It keeps only those boxes whose first coordinate (ymin) is less than the third (ymax) and whose second coordinate (xmin) is less than the fourth (xmax). 
The resulting valid indices are used to slice the input arrays into their valid subsets (valid_detected_boxes, valid_detected_scores, valid_detected_class_labels, and valid_detected_masks).", + "type": "comment" + }, + "5708": { + "file_id": 460, + "content": " detected_class_labels = detected_class_labels[valid_indices]\n if detected_masks is not None:\n detected_masks = detected_masks[valid_indices]\n return [\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks,\n ]", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:444-452" + }, + "5709": { + "file_id": 460, + "content": "Function returns detected bounding boxes, scores, class labels, and masks (if available) for valid indices only.", + "type": "comment" + }, + "5710": { + "file_id": 461, + "content": "/paddlevideo/metrics/ava_evaluation/standard_fields.py", + "type": "filepath" + }, + "5711": { + "file_id": 461, + "content": "This code sets naming conventions for object detection, defines fields and variables for efficient communication between decoder and model, and improves dataset evaluation. It also establishes conventions for video object detector output storage and standard metrics for field evaluation.", + "type": "summary" + }, + "5712": { + "file_id": 461, + "content": "# Copyright 2017 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"Contains classes specifying naming conventions used for object detection.\nSpecifies:\n InputDataFields: standard fields used by reader/preprocessor/batcher.\n DetectionResultFields: standard fields returned by object detector.\n\"\"\"\nclass InputDataFields:\n \"\"\"Names for the input tensors.\n Holds the standard data field names to use for identifying input tensors.", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:1-26" + }, + "5713": { + "file_id": 461, + "content": "This code is defining classes for standard naming conventions in object detection. It provides InputDataFields for input tensors and DetectionResultFields for results returned by the object detector.", + "type": "comment" + }, + "5714": { + "file_id": 461, + "content": " This should be used by the decoder to identify keys for the returned\n tensor_dict containing input tensors. And it should be used by the model to\n identify the tensors it needs.\n Attributes:\n image: image.\n original_image: image in the original input size.\n key: unique key corresponding to image.\n source_id: source of the original image.\n filename: original filename of the dataset (without common path).\n groundtruth_image_classes: image-level class labels.\n groundtruth_boxes: coordinates of the ground truth boxes in the image.\n groundtruth_classes: box-level class labels.\n groundtruth_label_types: box-level label types (e.g. 
explicit\n negative).\n groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead]\n is the groundtruth a single object or a crowd.\n groundtruth_area: area of a groundtruth segment.\n groundtruth_difficult: is a `difficult` object\n groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:27-46" + }, + "5715": { + "file_id": 461, + "content": "This code defines standard fields used by the decoder and model for identifying keys in returned tensor_dict. Fields include image, original_image, source_id, filename, groundtruth_image_classes, groundtruth_boxes, groundtruth_classes, groundtruth_label_types, groundtruth_is_crowd, groundtruth_area, and groundtruth_difficult. It is used by the decoder to identify keys for returned tensor_dict and by model to identify necessary tensors.", + "type": "comment" + }, + "5716": { + "file_id": 461, + "content": " the same class, forming a connected group, where instances are\n heavily occluding each other.\n proposal_boxes: coordinates of object proposal boxes.\n proposal_objectness: objectness score of each proposal.\n groundtruth_instance_masks: ground truth instance masks.\n groundtruth_instance_boundaries: ground truth instance boundaries.\n groundtruth_instance_classes: instance mask-level class labels.\n groundtruth_keypoints: ground truth keypoints.\n groundtruth_keypoint_visibilities: ground truth keypoint visibilities.\n groundtruth_label_scores: groundtruth label scores.\n groundtruth_weights: groundtruth weight factor for bounding boxes.\n num_groundtruth_boxes: number of groundtruth boxes.\n true_image_shapes: true shapes of images in the resized images, as\n resized images can be padded with zeros.\n \"\"\"\n image = 'image'\n original_image = 'original_image'\n key = 'key'\n source_id = 'source_id'", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:47-66" + }, + "5717": { + "file_id": 461, + "content": "This code defines a dictionary of variables used for AVA evaluation, including image and original image keys, source IDs, and other metrics such as proposal boxes, ground truth instance masks, and more. 
These variables are necessary for accurately evaluating the performance of video object detection models.", + "type": "comment" + }, + "5718": { + "file_id": 461, + "content": " filename = 'filename'\n groundtruth_image_classes = 'groundtruth_image_classes'\n groundtruth_boxes = 'groundtruth_boxes'\n groundtruth_classes = 'groundtruth_classes'\n groundtruth_label_types = 'groundtruth_label_types'\n groundtruth_is_crowd = 'groundtruth_is_crowd'\n groundtruth_area = 'groundtruth_area'\n groundtruth_difficult = 'groundtruth_difficult'\n groundtruth_group_of = 'groundtruth_group_of'\n proposal_boxes = 'proposal_boxes'\n proposal_objectness = 'proposal_objectness'\n groundtruth_instance_masks = 'groundtruth_instance_masks'\n groundtruth_instance_boundaries = 'groundtruth_instance_boundaries'\n groundtruth_instance_classes = 'groundtruth_instance_classes'\n groundtruth_keypoints = 'groundtruth_keypoints'\n groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'\n groundtruth_label_scores = 'groundtruth_label_scores'\n groundtruth_weights = 'groundtruth_weights'\n num_groundtruth_boxes = 'num_groundtruth_boxes'\n true_image_shape = 'true_image_shape'", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:67-86" + }, + "5719": { + "file_id": 461, + "content": "This code defines various field names for the AWA dataset, which includes attributes like ground truth image classes, bounding boxes, class labels, label types, object crowding status, and more. The fields cover aspects such as instance masks, boundaries, keypoints, visibilities, label scores, and weights. These field definitions are likely used to organize and manage data in the dataset for further processing or evaluation tasks.", + "type": "comment" + }, + "5720": { + "file_id": 461, + "content": "class DetectionResultFields:\n \"\"\"Naming conventions for storing the output of the detector.\n Attributes:\n source_id: source of the original image.\n key: unique key corresponding to image.\n detection_boxes: coordinates of the detection boxes in the image.\n detection_scores: detection scores for the detection boxes in the\n image.\n detection_classes: detection-level class labels.\n detection_masks: contains a segmentation mask for each detection box.\n detection_boundaries: contains an object boundary for each detection\n box.\n detection_keypoints: contains detection keypoints for each detection\n box.\n num_detections: number of detections in the batch.\n \"\"\"\n source_id = 'source_id'\n key = 'key'\n detection_boxes = 'detection_boxes'\n detection_scores = 'detection_scores'\n detection_classes = 'detection_classes'\n detection_masks = 'detection_masks'\n detection_boundaries = 'detection_boundaries'", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:89-113" + }, + "5721": { + "file_id": 461, + "content": "This class defines the standard naming conventions for storing the output of a video object detector. 
It includes attributes like source_id, key, detection boxes coordinates, scores, classes, masks, boundaries, keypoints, and number of detections in a batch.", + "type": "comment" + }, + "5722": { + "file_id": 461, + "content": " detection_keypoints = 'detection_keypoints'\n num_detections = 'num_detections'", + "type": "code", + "location": "/paddlevideo/metrics/ava_evaluation/standard_fields.py:114-115" + }, + "5723": { + "file_id": 461, + "content": "These two variables, detection_keypoints and num_detections, represent metrics for storing the keypoints of detected objects and the number of detections respectively in the standard fields evaluation.", + "type": "comment" + }, + "5724": { + "file_id": 462, + "content": "/paddlevideo/metrics/ava_metric.py", + "type": "filepath" + }, + "5725": { + "file_id": 462, + "content": "The code imports libraries, defines AVAMetric class for PaddleVideo and prepares metrics for video object detection. It also includes methods for logging during iterations, setting dataset info, and calculating final results.", + "type": "summary" + }, + "5726": { + "file_id": 462, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nfrom collections import OrderedDict\nfrom paddlevideo.utils import get_logger, load, log_batch, AverageMeter\nfrom .registry import METRIC\nfrom .base import BaseMetric\nimport time\nfrom datetime import datetime\nfrom .ava_utils import ava_evaluate_results\nlogger = get_logger(\"paddlevideo\")\n\"\"\" An example for metrics class.\n MultiCropMetric for slowfast.\n\"\"\"\n@METRIC.register\nclass AVAMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n file_path,", + "type": "code", + "location": "/paddlevideo/metrics/ava_metric.py:1-34" + }, + "5727": { + "file_id": 462, + "content": "This code imports necessary libraries and registers a class called AVAMetric as a metric for PaddleVideo. It initializes the AVAMetric with specified data size, batch size, and file path. 
The class inherits from BaseMetric and has an __init__ method which sets instance variables for data_size, batch_size, file_path, result_filename, and other properties.", + "type": "comment" + }, + "5728": { + "file_id": 462, + "content": " exclude_file,\n label_file,\n custom_classes,\n log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.file_path = file_path\n self.exclude_file = exclude_file\n self.label_file = label_file\n self.custom_classes = custom_classes\n self.results = []\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"recall@thr=0.5\", AverageMeter(\"recall@thr=0.5\", '.5f')),\n (\"prec@thr=0.5\", AverageMeter(\"prec@thr=0.5\", '.5f')),\n (\"recall@top3\", AverageMeter(\"recall@top3\", '.5f')),\n (\"prec@top3\", AverageMeter(\"prec@top3\", '.5f')),\n (\"recall@top5\", AverageMeter(\"recall@top5\", '.5f')),\n (\"prec@top5\", AverageMeter(\"prec@top5\", '.5f')),\n (\"mAP@0.5IOU\", AverageMeter(\"mAP@0.5IOU\", '.5f')),\n (\"batch_time\", AverageMeter('batch_cost', '.5f')),\n (\"reader_time\", AverageMeter('reader_cost', '.5f')),", + "type": "code", + "location": "/paddlevideo/metrics/ava_metric.py:35-60" + }, + "5729": { + "file_id": 462, + "content": "The code initializes a class for preparing metrics in video object detection. It takes various parameters like file path, exclude file, label file, custom classes, and log interval for initialization. The class uses AverageMeter to store metrics such as loss, recall@thr=0.5, prec@thr=0.5, recall@top3, prec@top3, recall@top5, prec@top5, mAP@0.5IOU, batch time, and reader time.", + "type": "comment" + }, + "5730": { + "file_id": 462, + "content": " ]\n self.record_list = OrderedDict(record_list)\n self.tic = time.time()\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n self.results.extend(outputs)\n self.record_list['batch_time'].update(time.time() - self.tic)\n tic = time.time()\n ips = \"ips: {:.5f} instance/sec.\".format(\n self.batch_size / self.record_list[\"batch_time\"].val)\n log_batch(self.record_list, batch_id, 0, 0, \"test\", ips)\n def set_dataset_info(self, info, dataset_len):\n self.info = info\n self.dataset_len = dataset_len\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n test_res = ava_evaluate_results(self.info, self.dataset_len,\n self.results, None, self.label_file,\n self.file_path, self.exclude_file)\n for name, value in test_res.items():\n self.record_list[name].update(value, self.batch_size)", + "type": "code", + "location": "/paddlevideo/metrics/ava_metric.py:61-90" + }, + "5731": { + "file_id": 462, + "content": "This code defines a class for metrics calculation and logging, with methods for updating metrics during iterations, setting dataset information, and accumulating final results. The update method extends the results list, updates batch time, logs batch time, and logs instance per second (ips). The accumulate method calculates final test results using ava_evaluate_results function and updates the record list with the final values.", + "type": "comment" + }, + "5732": { + "file_id": 462, + "content": " return self.record_list", + "type": "code", + "location": "/paddlevideo/metrics/ava_metric.py:92-92" + }, + "5733": { + "file_id": 462, + "content": "The code snippet is returning the record list from a class method. 
It seems that this method might have been responsible for recording or storing some data in the `record_list` attribute of the class instance, and now it's returning that data.", + "type": "comment" + }, + "5734": { + "file_id": 463, + "content": "/paddlevideo/metrics/ava_utils.py", + "type": "filepath" + }, + "5735": { + "file_id": 463, + "content": "This code handles video sequence object detection, converting results to CSV format and evaluating AVA metrics using error handling, utility functions, and GPU-based processing.", + "type": "summary" + }, + "5736": { + "file_id": 463, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport csv\nimport heapq\nimport logging\nimport time\nfrom collections import defaultdict\nfrom .ava_evaluation import object_detection_evaluation as det_eval\nfrom .ava_evaluation import standard_fields\nfrom .recall import eval_recalls\nimport shutil\nimport pickle\nimport time\nimport os\nimport os.path as osp\nfrom paddlevideo.utils import get_logger, get_dist_info\nimport paddle.distributed as dist\nimport sys\nimport numpy as np", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:1-31" + }, + "5737": { + "file_id": 463, + "content": "This code imports necessary libraries and modules for evaluating AVA (Activity-driven Visual Attention) metrics. It also includes a license notice, time management functions, and error handling measures. The code uses defaultdict from collections and eval_recalls function from the same repository to perform evaluation tasks related to object detection in video sequences. Additionally, it incorporates paddlevideo's get_logger() function for logging, dist library for distributed processing, and numpy for numerical operations.", + "type": "comment" + }, + "5738": { + "file_id": 463, + "content": "from pathlib import Path\nfrom datetime import datetime\nimport paddle\ndef det2csv(info, dataset_len, results, custom_classes):\n csv_results = []\n for idx in range(dataset_len):\n video_id = info[idx]['video_id']\n timestamp = info[idx]['timestamp']\n result = results[idx]\n for label, _ in enumerate(result):\n for bbox in result[label]:\n if type(bbox) == paddle.Tensor:\n bbox = bbox.numpy()\n bbox_ = tuple(bbox.tolist())\n if custom_classes is not None:\n actual_label = custom_classes[label + 1]\n else:\n actual_label = label + 1\n csv_results.append((\n video_id,\n timestamp,\n ) + bbox_[:4] + (actual_label, ) + bbox_[4:])\n return csv_results\n# results is organized by class\ndef results2csv(info, dataset_len, results, out_file, custom_classes=None):\n if isinstance(results[0], list):\n csv_results = det2csv(info, dataset_len, results, custom_classes)", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:32-64" + }, + "5739": { + "file_id": 463, + "content": "The code defines two functions: \"det2csv\" and \"results2csv\". 
\"det2csv\" takes in information, dataset length, results, and custom classes (if any), and returns a list of tuples representing the results in CSV format. It loops through each entry, extracts relevant data, converts tensors to numpy arrays if needed, and appends the information to the csv_results list. \"results2csv\" checks if the results are organized by class or not, then calls either \"det2csv\" or performs CSV conversion directly using it.", + "type": "comment" + }, + "5740": { + "file_id": 463, + "content": " # save space for float\n def tostr(item):\n if isinstance(item, float):\n return f'{item:.3f}'\n return str(item)\n with open(out_file, 'w') as f:\n for csv_result in csv_results:\n f.write(','.join(map(lambda x: tostr(x), csv_result)))\n f.write('\\n')\ndef print_time(message, start):\n print('==> %g seconds to %s' % (time.time() - start, message))\ndef make_image_key(video_id, timestamp):\n \"\"\"Returns a unique identifier for a video id & timestamp.\"\"\"\n return f'{video_id},{int(timestamp):04d}'\ndef read_csv(csv_file, class_whitelist=None, capacity=0):\n \"\"\"Loads boxes and class labels from a CSV file in the AVA format.\n CSV file format described at https://research.google.com/ava/download.html.\n Args:\n csv_file: A file object.\n class_whitelist: If provided, boxes corresponding to (integer) class\n labels not in this set are skipped.\n capacity: Maximum number of labeled boxes allowed for each example.\n Default is 0 where there is no limit.", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:66-97" + }, + "5741": { + "file_id": 463, + "content": "This code snippet contains several utility functions used for video analysis. The \"tostr\" function converts a float to a string representation with 3 decimal places, while the \"print_time\" function calculates and prints the time elapsed since a given start point. The \"make_image_key\" function generates a unique identifier for a video ID and timestamp, and \"read_csv\" function loads boxes and class labels from a CSV file in AVA format.", + "type": "comment" + }, + "5742": { + "file_id": 463, + "content": " Returns:\n boxes: A dictionary mapping each unique image key (string) to a list of\n boxes, given as coordinates [y1, x1, y2, x2].\n labels: A dictionary mapping each unique image key (string) to a list\n of integer class lables, matching the corresponding box in `boxes`.\n scores: A dictionary mapping each unique image key (string) to a list\n of score values lables, matching the corresponding label in `labels`.\n If scores are not provided in the csv, then they will default to 1.0.\n \"\"\"\n start = time.time()\n entries = defaultdict(list)\n boxes = defaultdict(list)\n labels = defaultdict(list)\n scores = defaultdict(list)\n reader = csv.reader(csv_file)\n for row in reader:\n assert len(row) in [7, 8], 'Wrong number of columns: ' + row\n image_key = make_image_key(row[0], row[1])\n x1, y1, x2, y2 = [float(n) for n in row[2:6]]\n action_id = int(row[6])\n if class_whitelist and action_id not in class_whitelist:\n continue", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:99-120" + }, + "5743": { + "file_id": 463, + "content": "This code reads a CSV file with video frame data, and for each row, it creates dictionaries for boxes, labels, and scores. It checks the class whitelist before adding the data to the respective lists. If scores are not provided in the CSV, they default to 1.0. 
The time taken for this process is measured at the beginning with start = time.time().", + "type": "comment" + }, + "5744": { + "file_id": 463, + "content": " score = 1.0\n if len(row) == 8:\n score = float(row[7])\n if capacity < 1 or len(entries[image_key]) < capacity:\n heapq.heappush(entries[image_key],\n (score, action_id, y1, x1, y2, x2))\n elif score > entries[image_key][0][0]:\n heapq.heapreplace(entries[image_key],\n (score, action_id, y1, x1, y2, x2))\n for image_key in entries:\n # Evaluation API assumes boxes with descending scores\n entry = sorted(entries[image_key], key=lambda tup: -tup[0])\n for item in entry:\n score, action_id, y1, x1, y2, x2 = item\n boxes[image_key].append([y1, x1, y2, x2])\n labels[image_key].append(action_id)\n scores[image_key].append(score)\n print_time('read file ' + csv_file.name, start)\n return boxes, labels, scores\ndef read_exclusions(exclusions_file):\n \"\"\"Reads a CSV file of excluded timestamps.\n Args:\n exclusions_file: A file object containing a csv of video-id,timestamp.", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:122-147" + }, + "5745": { + "file_id": 463, + "content": "This code reads a CSV file containing object detection results and stores them in three lists: boxes, labels, and scores. The code also handles exclusions by reading a separate CSV file that contains excluded timestamps. The score is determined based on the length of each row in the CSV file and added to the corresponding image key's entry in the entries dictionary if the capacity allows or if the score is higher than the current highest score for that image key. The code then sorts the entries by descending scores and appends them to the boxes, labels, and scores lists for each image key.", + "type": "comment" + }, + "5746": { + "file_id": 463, + "content": " Returns:\n A set of strings containing excluded image keys, e.g.\n \"aaaaaaaaaaa,0904\",\n or an empty set if exclusions file is None.\n \"\"\"\n excluded = set()\n if exclusions_file:\n reader = csv.reader(exclusions_file)\n for row in reader:\n assert len(row) == 2, 'Expected only 2 columns, got: ' + row\n excluded.add(make_image_key(row[0], row[1]))\n return excluded\ndef read_labelmap(labelmap_file):\n \"\"\"Reads a labelmap without the dependency on protocol buffers.\n Args:\n labelmap_file: A file object containing a label map protocol buffer.\n Returns:\n labelmap: The label map in the form used by the\n object_detection_evaluation\n module - a list of {\"id\": integer, \"name\": classname } dicts.\n class_ids: A set containing all of the valid class id integers.\n \"\"\"\n labelmap = []\n class_ids = set()\n name = ''\n class_id = ''\n for line in labelmap_file:\n if line.startswith(' name:'):\n name = line.split('\"')[1]", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:149-181" + }, + "5747": { + "file_id": 463, + "content": "Function `read_excluded_images` reads an exclusions file and returns a set of image keys to exclude. The input file is read row by row, and for each row the function checks that there are exactly two columns and adds the image key (combination of column 1 and column 2) to the excluded set.\n\nFunction `read_labelmap` reads a labelmap file without using protocol buffers. It iterates over the file line by line. When it encounters a line starting with 'name:', it extracts the class name, and when it encounters a line starting with 'id:' it extracts the class id. 
The function then appends a dictionary containing the id and name to the labelmap list and adds the id to the set of valid class ids.", + "type": "comment" + }, + "5748": { + "file_id": 463, + "content": " elif line.startswith(' id:') or line.startswith(' label_id:'):\n class_id = int(line.strip().split(' ')[-1])\n labelmap.append({'id': class_id, 'name': name})\n class_ids.add(class_id)\n return labelmap, class_ids\n# Seems there is at most 100 detections for each image\ndef ava_eval(result_file,\n result_type,\n label_file,\n ann_file,\n exclude_file,\n max_dets=(100, ),\n verbose=True,\n custom_classes=None):\n assert result_type in ['mAP']\n start = time.time()\n categories, class_whitelist = read_labelmap(open(label_file))\n if custom_classes is not None:\n custom_classes = custom_classes[1:]\n assert set(custom_classes).issubset(set(class_whitelist))\n class_whitelist = custom_classes\n categories = [cat for cat in categories if cat['id'] in custom_classes]\n # loading gt, do not need gt score\n gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist, 0)", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:182-210" + }, + "5749": { + "file_id": 463, + "content": "This function ava_eval() takes several file paths as input and evaluates the results using mean average precision (mAP). It uses a label map to convert class labels from detections to their corresponding IDs. The code checks for 'id' or 'label_id' in each line of the label file, appends the ID and name to the label map, and adds the ID to a set of class_ids. The function also handles custom classes by excluding any category whose ID is not in the custom_classes list. The gt_boxes, gt_labels, and _ are loaded from the ann_file for evaluation.", + "type": "comment" + }, + "5750": { + "file_id": 463, + "content": " if verbose:\n print_time('Reading detection results', start)\n if exclude_file is not None:\n excluded_keys = read_exclusions(open(exclude_file))\n else:\n excluded_keys = list()\n start = time.time()\n boxes, labels, scores = read_csv(open(result_file), class_whitelist, 0)\n if verbose:\n print_time('Reading detection results', start)\n if result_type == 'proposal':\n gts = [\n np.array(gt_boxes[image_key], dtype=float) for image_key in gt_boxes\n ]\n proposals = []\n for image_key in gt_boxes:\n if image_key in boxes:\n proposals.append(\n np.concatenate(\n (np.array(boxes[image_key], dtype=float),\n np.array(scores[image_key], dtype=float)[:, None]),\n axis=1))\n else:\n # if no corresponding proposal, add a fake one\n proposals.append(np.array([0, 0, 1, 1, 1]))\n # Proposals used here are with scores", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:211-240" + }, + "5751": { + "file_id": 463, + "content": "The code reads detection results from a file, excludes certain keys if specified in an exclude file, and measures the time taken to read the results. It then checks if the result type is 'proposal' and creates proposals based on the gt_boxes for each image key present in boxes or adds a fake one if no corresponding proposal exists. 
Proposals include scores.", + "type": "comment" + }, + "5752": { + "file_id": 463, + "content": " recalls = eval_recalls(gts, proposals, np.array(max_dets),\n np.arange(0.5, 0.96, 0.05))\n ar = recalls.mean(axis=1)\n ret = {}\n for i, num in enumerate(max_dets):\n print(f'Recall@0.5@{num}\\t={recalls[i, 0]:.4f}')\n print(f'AR@{num}\\t={ar[i]:.4f}')\n ret[f'Recall@0.5@{num}'] = recalls[i, 0]\n ret[f'AR@{num}'] = ar[i]\n return ret\n if result_type == 'mAP':\n pascal_evaluator = det_eval.PascalDetectionEvaluator(categories)\n start = time.time()\n for image_key in gt_boxes:\n if verbose and image_key in excluded_keys:\n logging.info(\n 'Found excluded timestamp in detections: %s.'\n 'It will be ignored.', image_key)\n continue\n pascal_evaluator.add_single_ground_truth_image_info(\n image_key, {\n standard_fields.InputDataFields.groundtruth_boxes:\n np.array(gt_boxes[image_key], dtype=float),", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:241-265" + }, + "5753": { + "file_id": 463, + "content": "This code calculates the Average Recall (AR) and Recall@0.5 (R@0.5) for different detection numbers using the eval_recalls function. It then prints the results and stores them in a dictionary. If the result type is 'mAP', it initializes a PascalDetectionEvaluator, adds ground truth information for each image key, and calculates the mean average precision (mAP).", + "type": "comment" + }, + "5754": { + "file_id": 463, + "content": " standard_fields.InputDataFields.groundtruth_classes:\n np.array(gt_labels[image_key], dtype=int),\n standard_fields.InputDataFields.groundtruth_difficult:\n np.zeros(len(gt_boxes[image_key]), dtype=bool)\n })\n if verbose:\n print_time('Convert groundtruth', start)\n start = time.time()\n for image_key in boxes:\n if verbose and image_key in excluded_keys:\n logging.info(\n 'Found excluded timestamp in detections: %s.'\n 'It will be ignored.', image_key)\n continue\n pascal_evaluator.add_single_detected_image_info(\n image_key, {\n standard_fields.DetectionResultFields.detection_boxes:\n np.array(boxes[image_key], dtype=float),\n standard_fields.DetectionResultFields.detection_classes:\n np.array(labels[image_key], dtype=int),", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:266-286" + }, + "5755": { + "file_id": 463, + "content": "This code adds single detected image information to a Pascal evaluator. 
It converts groundtruth labels and boxes into appropriate data structures, handles excluded timestamps, and processes detection boxes and classes for evaluation.", + "type": "comment" + }, + "5756": { + "file_id": 463, + "content": " standard_fields.DetectionResultFields.detection_scores:\n np.array(scores[image_key], dtype=float)\n })\n if verbose:\n print_time('convert detections', start)\n start = time.time()\n metrics = pascal_evaluator.evaluate()\n if verbose:\n print_time('run_evaluator', start)\n for display_name in metrics:\n print(f'{display_name}=\\t{metrics[display_name]}')\n ret = {\n display_name: metrics[display_name]\n for display_name in metrics if 'ByCategory' not in display_name\n }\n return ret\ndef mkdir_or_exist(dir_name, mode=0o777):\n if dir_name == '':\n return\n dir_name = osp.expanduser(dir_name)\n os.makedirs(dir_name, mode=mode, exist_ok=True)\ndef dump_to_fileobj(obj, file, **kwargs):\n kwargs.setdefault('protocol', 2)\n pickle.dump(obj, file, **kwargs)\ndef dump_to_path(obj, filepath, mode='wb'):\n with open(filepath, mode) as f:\n dump_to_fileobj(obj, f)", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:287-320" + }, + "5757": { + "file_id": 463, + "content": "Code snippet performs AVA evaluation for detection results, and prints or returns specific metrics. It also includes functions for creating directories, dumping objects to files using pickle library, and has a function for time measurement called print_time.", + "type": "comment" + }, + "5758": { + "file_id": 463, + "content": "def load_from_fileobj(file, **kwargs):\n return pickle.load(file, **kwargs)\ndef load_from_path(filepath, mode='rb'):\n with open(filepath, mode) as f:\n return load_from_fileobj(f)\ndef collect_results_cpu(result_part, size):\n \"\"\"Collect results in cpu mode.\n It saves the results on different gpus to 'tmpdir' and collects\n them by the rank 0 worker.\n \"\"\"\n tmpdir = osp.join('./', 'collect_results_cpu')\n #1. load results of all parts from tmp dir\n mkdir_or_exist(tmpdir)\n rank, world_size = get_dist_info()\n dump_to_path(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))\n dist.barrier()\n if rank != 0:\n return None\n #2. collect all parts\n while 1:\n all_exist = True\n for i in range(world_size):\n part_file = osp.join(tmpdir, f'part_{i}.pkl')\n if not Path(part_file).exists():\n all_exist = False\n if all_exist:\n break\n else:\n time.sleep(60)\n time.sleep(120)\n #3. load results of all parts from tmp dir", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:323-357" + }, + "5759": { + "file_id": 463, + "content": "This code defines three functions: `load_from_fileobj`, `load_from_path`, and `collect_results_cpu`. The first two are used to load data from files or file paths, respectively. The third function, `collect_results_cpu`, is a CPU-based method for collecting results across multiple GPUs by saving them in a temporary directory ('tmpdir') and having the rank 0 worker collect them. It checks if all parts exist, waits if not, then loads and returns the collected results once they are all available.", + "type": "comment" + }, + "5760": { + "file_id": 463, + "content": " part_list = []\n for i in range(world_size):\n part_file = osp.join(tmpdir, f'part_{i}.pkl')\n part_list.append(load_from_path(part_file))\n #4. sort the results\n ordered_results = []\n for res in zip(*part_list):\n ordered_results.extend(list(res))\n ordered_results = ordered_results[:\n size] #the dataloader may pad some samples\n #5. 
remove results of all parts from tmp dir, avoid dump_file fail to tmp dir when dir not exists.\n for i in range(world_size):\n part_file = osp.join(tmpdir, f'part_{i}.pkl')\n os.remove(part_file)\n return ordered_results\ndef ava_evaluate_results(info, dataset_len, results, custom_classes, label_file,\n file_path, exclude_file):\n # need to create a temp result file\n time_now = datetime.now().strftime('%Y%m%d_%H%M%S')\n temp_file = f'AVA_{time_now}_result.csv'\n results2csv(info, dataset_len, results, temp_file)\n ret = {}\n eval_result = ava_eval(\n temp_file,", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:358-384" + }, + "5761": { + "file_id": 463, + "content": "This code is used for evaluating AVA results by splitting the computation across multiple processes, then combining and ordering the partial results before deleting temporary files. It takes in information about the dataset, the evaluation results, custom class labels, and file paths for input and exclusion lists. The code creates a temporary result file, converts the results to a CSV format, performs AVA evaluation on the temporary file, and returns an evaluation result.", + "type": "comment" + }, + "5762": { + "file_id": 463, + "content": " 'mAP',\n label_file,\n file_path, #ann_file,\n exclude_file,\n custom_classes=custom_classes)\n ret.update(eval_result)\n os.remove(temp_file)\n return ret", + "type": "code", + "location": "/paddlevideo/metrics/ava_utils.py:385-394" + }, + "5763": { + "file_id": 463, + "content": "This code is computing the mean average precision (mAP) for object detection metrics. It reads from a label file, file path, and excludes certain classes as specified. The results are stored in the 'ret' dictionary before removing a temporary file and returning the final results.", + "type": "comment" + }, + "5764": { + "file_id": 464, + "content": "/paddlevideo/metrics/base.py", + "type": "filepath" + }, + "5765": { + "file_id": 464, + "content": "This code defines a base class for metrics in the PaddleVideo library, initializing with data size, batch size, world size, and log interval. It includes all-gather and concatenation methods, along with abstract update and accumulate functions to be implemented by subclasses.", + "type": "summary" + }, + "5766": { + "file_id": 464, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nimport paddle\nfrom paddlevideo.utils import get_dist_info\nfrom .registry import METRIC\nclass BaseMetric(object):\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n self.data_size = data_size\n self.batch_size = batch_size\n _, self.world_size = get_dist_info()\n self.log_interval = log_interval\n def gather_from_gpu(self,\n gather_object: paddle.Tensor,\n concat_axis=0) -> paddle.Tensor:\n \"\"\"gather Tensor from all gpus into a list and concatenate them on `concat_axis`.", + "type": "code", + "location": "/paddlevideo/metrics/base.py:1-31" + }, + "5767": { + "file_id": 464, + "content": "This code is part of the PaddleVideo library and defines a base class for metrics. It initializes the metric with data size, batch size, world size, and log interval. The gather_from_gpu method gathers Tensors from all GPUs into a list and concatenates them on a specified axis.", + "type": "comment" + }, + "5768": { + "file_id": 464, + "content": " Args:\n gather_object (paddle.Tensor): gather object Tensor\n concat_axis (int, optional): axis for concatenation. Defaults to 0.\n Returns:\n paddle.Tensor: gatherd & concatenated Tensor\n \"\"\"\n gather_object_list = []\n paddle.distributed.all_gather(gather_object_list, gather_object.cuda())\n return paddle.concat(gather_object_list, axis=concat_axis)\n @abstractmethod\n def update(self):\n raise NotImplementedError(\n \"'update' method must be implemented in subclass\")\n @abstractmethod\n def accumulate(self):\n raise NotImplementedError(\n \"'accumulate' method must be implemented in subclass\")", + "type": "code", + "location": "/paddlevideo/metrics/base.py:33-52" + }, + "5769": { + "file_id": 464, + "content": "Function that performs all-gather and concatenation on the gather object Tensor. Abstract methods for update and accumulate that must be implemented in subclasses.", + "type": "comment" + }, + "5770": { + "file_id": 465, + "content": "/paddlevideo/metrics/bmn_metric.py", + "type": "filepath" + }, + "5771": { + "file_id": 465, + "content": "This code calculates BMN metric for object detection in computer vision frameworks, supports batch_size and world_size as 1, initializes class variables, processes video data, logs progress, saves results, performs soft NMS, calculates proposal lists, evaluates performance using the \"cal_metrics\" function.", + "type": "summary" + }, + "5772": { + "file_id": 465, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport json\nimport numpy as np\nimport pandas as pd\nimport multiprocessing as mp\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom .ActivityNet import ANETproposal\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ndef iou_with_anchors(anchors_min, anchors_max, box_min, box_max):\n \"\"\"Compute jaccard score between a box and the anchors.\n \"\"\"\n len_anchors = anchors_max - anchors_min\n int_xmin = np.maximum(anchors_min, box_min)\n int_xmax = np.minimum(anchors_max, box_max)", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:1-32" + }, + "5773": { + "file_id": 465, + "content": "This code imports necessary libraries and defines a function to compute the Intersection over Union (IoU) between a box and anchors. It appears to be related to object detection or proposal generation within a computer vision framework.", + "type": "comment" + }, + "5774": { + "file_id": 465, + "content": " inter_len = np.maximum(int_xmax - int_xmin, 0.)\n union_len = len_anchors - inter_len + box_max - box_min\n jaccard = np.divide(inter_len, union_len)\n return jaccard\ndef boundary_choose(score_list):\n \"\"\"Choose start and end boundary from score.\n \"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\ndef soft_nms(df, alpha, t1, t2):\n '''\n df: proposals generated by network;\n alpha: alpha value of Gaussian decaying function;\n t1, t2: threshold for soft nms.\n '''\n df = df.sort_values(by=\"score\", ascending=False)\n tstart = list(df.xmin.values[:])\n tend = list(df.xmax.values[:])\n tscore = list(df.score.values[:])", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:33-63" + }, + "5775": { + "file_id": 465, + "content": "inter_len calculates the intersection length between two bounding boxes. union_len computes the total length of both boxes and jaccard index is calculated by dividing inter_len with union_len. This function returns the Jaccard index.\nThe boundary_choose() function selects start and end boundaries based on a given score list. It identifies the highest score and creates three arrays - score_list, score_front, score_back for comparison. mask_peak is created by comparing these arrays, followed by generating a binary mask of True values.\nSoft_nms function sorts proposals generated by network based on scores in descending order. 
It takes alpha value (Gaussian decaying function), and two threshold values t1, t2.", + "type": "comment" + }, + "5776": { + "file_id": 465, + "content": " rstart = []\n rend = []\n rscore = []\n while len(tscore) > 1 and len(rscore) < 101:\n max_index = tscore.index(max(tscore))\n tmp_iou_list = iou_with_anchors(np.array(tstart), np.array(tend),\n tstart[max_index], tend[max_index])\n for idx in range(0, len(tscore)):\n if idx != max_index:\n tmp_iou = tmp_iou_list[idx]\n tmp_width = tend[max_index] - tstart[max_index]\n if tmp_iou > t1 + (t2 - t1) * tmp_width:\n tscore[idx] = tscore[idx] * np.exp(\n -np.square(tmp_iou) / alpha)\n rstart.append(tstart[max_index])\n rend.append(tend[max_index])\n rscore.append(tscore[max_index])\n tstart.pop(max_index)\n tend.pop(max_index)\n tscore.pop(max_index)\n newDf = pd.DataFrame()\n newDf['score'] = rscore\n newDf['xmin'] = rstart\n newDf['xmax'] = rend\n return newDf\n@METRIC.register\nclass BMNMetric(BaseMetric):\n \"\"\"\n Metrics for BMN. Two Stages in this metric:", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:65-98" + }, + "5777": { + "file_id": 465, + "content": "The code calculates BMN metric for object detection by iterating through a list of scores and appending the maximum score, along with its corresponding start and end positions, to new lists. It then creates a new DataFrame using these lists before returning it as the final output.", + "type": "comment" + }, + "5778": { + "file_id": 465, + "content": " (1) Get test results using trained model, results will be saved in BMNMetric.result_path;\n (2) Calculate metrics using results file from stage (1).\n \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n tscale,\n dscale,\n file_path,\n ground_truth_filename,\n subset,\n output_path,\n result_path,\n get_metrics=True,\n log_interval=1):\n \"\"\"\n Init for BMN metrics.\n Params:\n get_metrics: whether to calculate AR@N and AUC metrics or not, default True.\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n assert self.batch_size == 1, \" Now we just support batch_size==1 test\"\n assert self.world_size == 1, \" Now we just support single-card test\"\n self.tscale = tscale\n self.dscale = dscale\n self.file_path = file_path\n self.ground_truth_filename = ground_truth_filename", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:99-127" + }, + "5779": { + "file_id": 465, + "content": "This code initializes an instance of BMNMetric class with various parameters such as data_size, batch_size, tscale, dscale, file_path, ground_truth_filename, subset, output_path, result_path, get_metrics, and log_interval. It also performs assertions to ensure batch_size is 1 and world_size is 1, as the code currently supports only these conditions. 
The class is a part of PaddleVideo library for video analysis tasks.", + "type": "comment" + }, + "5780": { + "file_id": 465, + "content": " self.subset = subset\n self.output_path = output_path\n self.result_path = result_path\n self.get_metrics = get_metrics\n if not os.path.isdir(self.output_path):\n os.makedirs(self.output_path)\n if not os.path.isdir(self.result_path):\n os.makedirs(self.result_path)\n self.video_dict, self.video_list = self.get_dataset_dict(\n self.file_path, self.subset)\n def get_dataset_dict(self, file_path, subset):\n annos = json.load(open(file_path))\n video_dict = {}\n for video_name in annos.keys():\n video_subset = annos[video_name][\"subset\"]\n if subset in video_subset:\n video_dict[video_name] = annos[video_name]\n video_list = list(video_dict.keys())\n video_list.sort()\n return video_dict, video_list\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n fid = data[4].numpy()\n pred_bm, pred_start, pred_end = outputs", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:128-156" + }, + "5781": { + "file_id": 465, + "content": "The code initializes the class variables and checks if the output and result directories exist, creating them if not. It then calls a method to get the dataset dictionary and list based on the provided file path and subset. The update method takes batch ID, data, and outputs as inputs to update metrics during each iteration.", + "type": "comment" + }, + "5782": { + "file_id": 465, + "content": " pred_bm = pred_bm.numpy()\n pred_start = pred_start[0].numpy()\n pred_end = pred_end[0].numpy()\n snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)]\n snippet_xmaxs = [\n 1.0 / self.tscale * i for i in range(1, self.tscale + 1)\n ]\n cols = [\"xmin\", \"xmax\", \"score\"]\n video_name = self.video_list[fid[0]]\n pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :]\n start_mask = boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_vector_list = []\n for idx in range(self.dscale):\n for jdx in range(self.tscale):\n start_index = jdx\n end_index = start_index + idx\n if end_index < self.tscale and start_mask[\n start_index] == 1 and end_mask[end_index] == 1:\n xmin = snippet_xmins[start_index]\n xmax = snippet_xmaxs[end_index]\n xmin_score = pred_start[start_index]", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:157-182" + }, + "5783": { + "file_id": 465, + "content": "Code snippet performs boundary detection and creates a score vector list for each detection. It uses the provided prediction, start and end values to calculate the xmin and xmax values within the defined time scale. Then it checks if the start and end mask conditions are met and adds the corresponding score value to the score vector list. This information is used for further analysis or evaluation of video frames.", + "type": "comment" + }, + "5784": { + "file_id": 465, + "content": " xmax_score = pred_end[end_index]\n bm_score = pred_bm[idx, jdx]\n conf_score = xmin_score * xmax_score * bm_score\n score_vector_list.append([xmin, xmax, conf_score])\n score_vector_list = np.stack(score_vector_list)\n video_df = pd.DataFrame(score_vector_list, columns=cols)\n video_df.to_csv(os.path.join(self.output_path, \"%s.csv\" % video_name),\n index=False)\n if batch_id % self.log_interval == 0:\n logger.info(\"Processing................ 
batch {}\".format(batch_id))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n # check clip index of each video\n #Stage1\n self.bmn_post_processing(self.video_dict, self.subset, self.output_path,\n self.result_path)\n if self.get_metrics:\n logger.info(\"[TEST] calculate metrics...\")\n #Stage2\n uniform_average_nr_proposals_valid, uniform_average_recall_valid, uniform_recall_valid = self.cal_metrics(", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:183-206" + }, + "5785": { + "file_id": 465, + "content": "This code snippet performs post-processing on video data and calculates metrics. It first processes the video data, then accumulates the metrics for each batch during processing. The code uses numpy arrays to handle score vectors, Pandas DataFrame to store and manipulate data, and logging to provide progress updates. The results are saved in a CSV file at the specified output path.", + "type": "comment" + }, + "5786": { + "file_id": 465, + "content": " self.ground_truth_filename,\n os.path.join(self.result_path, \"bmn_results_validation.json\"),\n max_avg_nr_proposals=100,\n tiou_thresholds=np.linspace(0.5, 0.95, 10),\n subset='validation')\n logger.info(\"AR@1; AR@5; AR@10; AR@100\")\n logger.info(\"%.02f %.02f %.02f %.02f\" %\n (100 * np.mean(uniform_recall_valid[:, 0]),\n 100 * np.mean(uniform_recall_valid[:, 4]),\n 100 * np.mean(uniform_recall_valid[:, 9]),\n 100 * np.mean(uniform_recall_valid[:, -1])))\n def bmn_post_processing(self, video_dict, subset, output_path, result_path):\n video_list = list(video_dict.keys())\n global result_dict\n result_dict = mp.Manager().dict()\n pp_num = 12\n num_videos = len(video_list)\n num_videos_per_thread = int(num_videos / pp_num)\n processes = []\n for tid in range(pp_num - 1):\n tmp_video_list = video_list[tid * num_videos_per_thread:(tid + 1) *", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:207-229" + }, + "5787": { + "file_id": 465, + "content": "This code is initializing a bmn_post_processing function that will process multiple videos in parallel using multiple processes. It creates a result dictionary and divides the video list into equal parts to assign each part to a separate process. It also logs the average recall at different thresholds for different numbers of detections.", + "type": "comment" + }, + "5788": { + "file_id": 465, + "content": " num_videos_per_thread]\n p = mp.Process(target=self.video_process,\n args=(tmp_video_list, video_dict, output_path,\n result_dict))\n p.start()\n processes.append(p)\n tmp_video_list = video_list[(pp_num - 1) * num_videos_per_thread:]\n p = mp.Process(target=self.video_process,\n args=(tmp_video_list, video_dict, output_path,\n result_dict))\n p.start()\n processes.append(p)\n for p in processes:\n p.join()\n result_dict = dict(result_dict)\n output_dict = {\n \"version\": \"VERSION 1.3\",\n \"results\": result_dict,\n \"external_data\": {}\n }\n outfile = open(\n os.path.join(result_path, \"bmn_results_%s.json\" % subset), \"w\")\n # json.dump(output_dict, outfile)\n # in case of file name in chinese\n json.dump(output_dict, outfile, ensure_ascii=False)", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:230-256" + }, + "5789": { + "file_id": 465, + "content": "The code creates multiple processes to handle video processing tasks in parallel, using multiprocessing. It then joins all the results together into a single output dictionary before writing it to a JSON file. 
This approach allows for efficient and concurrent processing of large numbers of videos.", + "type": "comment" + }, + "5790": { + "file_id": 465, + "content": " outfile.close()\n def video_process(self,\n video_list,\n video_dict,\n output_path,\n result_dict,\n snms_alpha=0.4,\n snms_t1=0.55,\n snms_t2=0.9):\n for video_name in video_list:\n logger.info(\"Processing video........\" + video_name)\n df = pd.read_csv(os.path.join(output_path, video_name + \".csv\"))\n if len(df) > 1:\n df = soft_nms(df, snms_alpha, snms_t1, snms_t2)\n video_duration = video_dict[video_name][\"duration_second\"]\n proposal_list = []\n for idx in range(min(100, len(df))):\n tmp_prop={\"score\":df.score.values[idx], \\\n \"segment\":[max(0,df.xmin.values[idx])*video_duration, \\\n min(1,df.xmax.values[idx])*video_duration]}\n proposal_list.append(tmp_prop)\n video_name = video_name[2:] if video_name[:2] == 'v_' else video_name", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:257-282" + }, + "5791": { + "file_id": 465, + "content": "This function takes a list of video names, corresponding metadata dictionaries, output path, and result dictionary. It processes each video by reading its CSV file, performs soft NMS if the dataframe has more than one row, calculates proposal list for each video, and appends them to the result dictionary.", + "type": "comment" + }, + "5792": { + "file_id": 465, + "content": " result_dict[video_name] = proposal_list\n def cal_metrics(self,\n ground_truth_filename,\n proposal_filename,\n max_avg_nr_proposals=100,\n tiou_thresholds=np.linspace(0.5, 0.95, 10),\n subset='validation'):\n anet_proposal = ANETproposal(ground_truth_filename,\n proposal_filename,\n tiou_thresholds=tiou_thresholds,\n max_avg_nr_proposals=max_avg_nr_proposals,\n subset=subset,\n verbose=True,\n check_status=False)\n anet_proposal.evaluate()\n recall = anet_proposal.recall\n average_recall = anet_proposal.avg_recall\n average_nr_proposals = anet_proposal.proposals_per_video\n return (average_nr_proposals, average_recall, recall)", + "type": "code", + "location": "/paddlevideo/metrics/bmn_metric.py:283-304" + }, + "5793": { + "file_id": 465, + "content": "The code defines a function \"cal_metrics\" that takes in ground truth and proposal filenames, calculates the average recall, average proposals per video, and overall recall using ANETproposal class. This function is used to evaluate performance based on given thresholds and subsets of data.", + "type": "comment" + }, + "5794": { + "file_id": 466, + "content": "/paddlevideo/metrics/build.py", + "type": "filepath" + }, + "5795": { + "file_id": 466, + "content": "Copyright notice, Apache License v2.0, software distributed as is without warranties or conditions. Imports registry and utils modules, defines build_metric function that builds metric using provided configuration.", + "type": "summary" + }, + "5796": { + "file_id": 466, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import METRIC\nfrom ..utils import build\ndef build_metric(cfg):\n return build(cfg, METRIC)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py:1-20" + }, + "5797": { + "file_id": 466, + "content": "Copyright notice, Apache License v2.0, software distributed as is without warranties or conditions. Imports registry and utils modules, defines build_metric function that builds metric using provided configuration.", + "type": "comment" + }, + "5798": { + "file_id": 467, + "content": "/paddlevideo/metrics/center_crop_metric.py", + "type": "filepath" + }, + "5799": { + "file_id": 467, + "content": "The code initializes a metric class for PaddleVideo, handling batch updates and GPU data to mitigate resampling effects while managing output accumulation, concatenation, top-k accuracy, and logging.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/58.json b/docs/data/58.json new file mode 100644 index 000000000..7f538fc7e --- /dev/null +++ b/docs/data/58.json @@ -0,0 +1,543 @@ +{ + "5800": { + "file_id": 467, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom typing import List\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom .base import BaseMetric\nfrom .registry import METRIC\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass CenterCropMetric(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval, **kwargs)\n self.rest_data_size = data_size # Number of samples remaining to be tested\n self.all_outputs = []", + "type": "code", + "location": "/paddlevideo/metrics/center_crop_metric.py:1-31" + }, + "5801": { + "file_id": 467, + "content": "This code registers a class called CenterCropMetric as a metric in the PaddleVideo library. It initializes the metric with data_size, batch_size, and log_interval parameters. 
The rest_data_size is also stored to keep track of remaining samples to be tested.", + "type": "comment" + }, + "5802": { + "file_id": 467, + "content": " self.all_labels = []\n self.topk = kwargs.get(\"topk\", [1, 5])\n def update(self, batch_id: int, data: List, outputs: paddle.Tensor) -> None:\n \"\"\"update metrics during each iter\n Args:\n batch_id (int): iter id of current batch.\n data (List): list of batched data, such as [inputs, labels]\n outputs (paddle.Tensor): batched outputs from model\n \"\"\"\n labels = data[1]\n if self.world_size > 1:\n labels_gathered = self.gather_from_gpu(labels, concat_axis=0)\n outpus_gathered = self.gather_from_gpu(outputs, concat_axis=0)\n else:\n labels_gathered = labels\n outpus_gathered = outputs\n # Avoid resampling effects when testing with multiple cards\n labels_gathered = labels_gathered[0:min(len(labels_gathered), self.\n rest_data_size)]\n outpus_gathered = outpus_gathered[0:min(len(outpus_gathered), self.\n rest_data_size)]", + "type": "code", + "location": "/paddlevideo/metrics/center_crop_metric.py:32-55" + }, + "5803": { + "file_id": 467, + "content": "This code is initializing a metric object, allowing for batch updates, and handling data from multiple GPUs to avoid resampling effects when testing with multiple cards.", + "type": "comment" + }, + "5804": { + "file_id": 467, + "content": " self.all_labels.append(labels_gathered)\n self.all_outputs.append(outpus_gathered)\n self.rest_data_size -= outpus_gathered.shape[0]\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate, compute, and show metrics when finished all iters.\n \"\"\"\n self.all_outputs = paddle.concat(self.all_outputs, axis=0)\n self.all_labels = paddle.concat(self.all_labels, axis=0)\n result_str = []\n for _k in self.topk:\n topk_val = paddle.metric.accuracy(input=self.all_outputs,\n label=self.all_labels,\n k=_k).item()\n result_str.append(f\"avg_acc{_k}={topk_val}\")\n result_str = \", \".join(result_str)\n logger.info(f\"[TEST] finished, {result_str}\")", + "type": "code", + "location": "/paddlevideo/metrics/center_crop_metric.py:56-79" + }, + "5805": { + "file_id": 467, + "content": "The code is part of a class that seems to be handling batch processing in a machine learning application. It accumulates and concatenates outputs and labels from multiple batches, performs top-k accuracy calculations, and logs the results. The log_interval variable controls when progress updates are displayed.", + "type": "comment" + }, + "5806": { + "file_id": 468, + "content": "/paddlevideo/metrics/center_crop_metric_MRI.py", + "type": "filepath" + }, + "5807": { + "file_id": 468, + "content": "The code creates a class to calculate top-1 and possibly top-5 accuracy metrics in image classification tasks, tracking and averaging them during iteration, with support for multi-GPU scenarios using all-reduce operations.", + "type": "summary" + }, + "5808": { + "file_id": 468, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass CenterCropMetric_MRI(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1, if_slowfast=0):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.top1 = []\n self.if_slowfast = if_slowfast\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter", + "type": "code", + "location": "/paddlevideo/metrics/center_crop_metric_MRI.py:1-33" + }, + "5809": { + "file_id": 468, + "content": "This code defines the class CenterCropMetric_MRI, a metric for a video processing framework. It initializes variables and tracks top1 accuracy during iteration.", + "type": "comment" + }, + "5810": { + "file_id": 468, + "content": " \"\"\"\n labels = data[1]\n if self.if_slowfast:\n labels = data[2]\n top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)\n #top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)\n #NOTE(shipping): deal with multi cards validate\n if self.world_size > 1:\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n # top5 = paddle.distributed.all_reduce(\n # top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.top1.append(top1.numpy())\n #self.top5.append(top5.numpy())\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n logger.info('[TEST] finished, avg_acc1= {}'.format(", + "type": "code", + "location": "/paddlevideo/metrics/center_crop_metric_MRI.py:34-60" + }, + "5811": { + "file_id": 468, + "content": "This code snippet defines a class for calculating top-1 and possibly top-5 accuracy metrics in an image classification task. It collects the metrics for each batch during testing, then averages them at the end of all iterations. 
The code handles multi-GPU scenarios by performing all-reduce operations on the metric values.", + "type": "comment" + }, + "5812": { + "file_id": 468, + "content": " np.mean(np.array(self.top1))))", + "type": "code", + "location": "/paddlevideo/metrics/center_crop_metric_MRI.py:61-61" + }, + "5813": { + "file_id": 468, + "content": "Calculates mean of top-1 accuracy across all samples in the batch.", + "type": "comment" + }, + "5814": { + "file_id": 469, + "content": "/paddlevideo/metrics/depth_metric.py", + "type": "filepath" + }, + "5815": { + "file_id": 469, + "content": "The `DepthMetric` class inherits from `BaseMetric`, processes batches, accumulates metrics and performs distributed all-reduce operations before averaging metric values.", + "type": "summary" + }, + "5816": { + "file_id": 469, + "content": "import numpy as np\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom .base import BaseMetric\nfrom .registry import METRIC\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass DepthMetric(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.abs_rel = []\n self.sq_rel = []\n self.rmse = []\n self.rmse_log = []\n self.a1 = []\n self.a2 = []\n self.a3 = []\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = outputs['abs_rel'], outputs['sq_rel'], outputs['rmse'], \\\n outputs['rmse_log'], outputs['a1'], outputs['a2'],outputs['a3']\n # preds ensemble\n if self.world_size > 1:\n abs_rel = paddle.distributed.all_reduce(\n outputs['abs_rel'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size", + "type": "code", + "location": "/paddlevideo/metrics/depth_metric.py:1-34" + }, + "5817": { + "file_id": 469, + "content": "This code defines a class `DepthMetric` that inherits from `BaseMetric`. It initializes lists for various metric values and then updates these metrics during each iteration. The code also includes logic to handle distributed computing, using all-reduce operation to average the results across different processes in the same training job.", + "type": "comment" + }, + "5818": { + "file_id": 469, + "content": " sq_rel = paddle.distributed.all_reduce(\n outputs['sq_rel'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n rmse = paddle.distributed.all_reduce(\n outputs['rmse'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n rmse_log = paddle.distributed.all_reduce(\n outputs['rmse_log'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n a1 = paddle.distributed.all_reduce(\n outputs['a1'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n a2 = paddle.distributed.all_reduce(\n outputs['a2'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n a3 = paddle.distributed.all_reduce(\n outputs['a3'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.abs_rel.append(abs_rel)\n self.sq_rel.append(sq_rel)\n self.rmse.append(rmse)\n self.rmse_log.append(rmse_log)", + "type": "code", + "location": "/paddlevideo/metrics/depth_metric.py:35-57" + }, + "5819": { + "file_id": 469, + "content": "This code performs distributed all-reduce operations on several metrics (sq_rel, rmse, rmse\\_log, a1, a2, a3) and calculates their average values by dividing by the world size. 
These averaged metric values are then appended to corresponding lists (abs_rel, sq_rel, rmse, rmse_log).", + "type": "comment" + }, + "5820": { + "file_id": 469, + "content": " self.a1.append(a1)\n self.a2.append(a2)\n self.a3.append(a3)\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n logger.info(\n '[TEST] finished, abs_rel= {}, sq_rel= {} , rmse= {}, rmse_log= {},'\n 'a1= {}, a2= {}, a3= {}'.format(np.mean(np.array(self.abs_rel)),\n np.mean(np.array(self.sq_rel)),\n np.mean(np.array(self.rmse)),\n np.mean(np.array(self.rmse_log)),\n np.mean(np.array(self.a1)),\n np.mean(np.array(self.a2)),\n np.mean(np.array(self.a3))))", + "type": "code", + "location": "/paddlevideo/metrics/depth_metric.py:58-77" + }, + "5821": { + "file_id": 469, + "content": "This code defines a class with methods for processing batches and accumulating metrics. The `process_batch` method appends data to lists, logs progress if the batch ID is divisible by log_interval, and handles the next batch. The `accumulate` method calculates mean values for each metric list and logs them using a logger with the corresponding metric values.", + "type": "comment" + }, + "5822": { + "file_id": 470, + "content": "/paddlevideo/metrics/msrvtt_metric.py", + "type": "filepath" + }, + "5823": { + "file_id": 470, + "content": "The code initializes and defines a class for MS-RNN/VTT model metrics computation, updates metrics using input data, calculates rank metrics (r1, r5, r10, medr, mean), logs these metrics, and signals the end of iterations.", + "type": "summary" + }, + "5824": { + "file_id": 470, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass MSRVTTMetric(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.score_matrix = np.zeros((data_size, data_size))\n self.target_matrix = np.zeros((data_size, data_size))", + "type": "code", + "location": "/paddlevideo/metrics/msrvtt_metric.py:1-31" + }, + "5825": { + "file_id": 470, + "content": "The code is from the MSRVTTMetric class in paddlevideo's metrics module. It initializes an instance of the class, prepares for metrics computation, and creates score_matrix and target_matrix using numpy with zeroes. These matrices will be used to store results during metric calculations. 
The class also registers with the base BaseMetric class.", + "type": "comment" + }, + "5826": { + "file_id": 470, + "content": " self.rank_matrix = np.ones((data_size)) * data_size\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n target = data[-1]\n cm_logit = outputs[-1]\n self.score_matrix[batch_id, :] = F.softmax(\n cm_logit, axis=1)[:, 0].reshape([-1]).numpy()\n self.target_matrix[batch_id, :] = target.reshape([-1]).numpy()\n rank = np.where((np.argsort(-self.score_matrix[batch_id]) == np.where(\n self.target_matrix[batch_id] == 1)[0][0]) == 1)[0][0]\n self.rank_matrix[batch_id] = rank\n rank_matrix_tmp = self.rank_matrix[:batch_id + 1]\n r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp)\n r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp)\n r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp)\n medr = np.floor(np.median(rank_matrix_tmp) + 1)\n meanr = np.mean(rank_matrix_tmp) + 1\n logger.info(\n \"[{}] Final r1:{:.3f}, r5:{:.3f}, r10:{:.3f}, mder:{:.3f}, meanr:{:.3f}\"", + "type": "code", + "location": "/paddlevideo/metrics/msrvtt_metric.py:32-56" + }, + "5827": { + "file_id": 470, + "content": "This code initializes a rank matrix, updates score and target matrices based on input data, calculates r1, r5, r10 rank metrics and median rank (medr) and mean rank, then logs these metrics.", + "type": "comment" + }, + "5828": { + "file_id": 470, + "content": " .format(batch_id, r1, r5, r10, medr, meanr))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n logger.info(\"Eval Finished!\")", + "type": "code", + "location": "/paddlevideo/metrics/msrvtt_metric.py:57-62" + }, + "5829": { + "file_id": 470, + "content": "This code defines a class for accumulating metrics related to the MS-RNN/VTT model. It seems to have methods for updating and finalizing the metric calculations. The update method takes in values r1, r5, r10, medr, and meanr, which are likely performance scores. The accumulate method signals the end of iterations by logging a message saying \"Eval Finished!\".", + "type": "comment" + }, + "5830": { + "file_id": 471, + "content": "/paddlevideo/metrics/multi_crop_metric.py", + "type": "filepath" + }, + "5831": { + "file_id": 471, + "content": "The MultiCropMetric class in PaddleVideo computes top-1/5 accuracy using multi-crop metrics for each video label and logs average values.", + "type": "summary" + }, + "5832": { + "file_id": 471, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nfrom paddle.hapi.model import _all_gather\nfrom paddlevideo.utils import get_logger\nfrom .registry import METRIC\nfrom .base import BaseMetric\nlogger = get_logger(\"paddlevideo\")\n\"\"\" An example for metrics class.\n MultiCropMetric for slowfast.\n\"\"\"\n@METRIC.register\nclass MultiCropMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n num_ensemble_views,\n num_spatial_crops,\n num_classes,\n log_interval=1):", + "type": "code", + "location": "/paddlevideo/metrics/multi_crop_metric.py:1-35" + }, + "5833": { + "file_id": 471, + "content": "This code snippet defines a class for MultiCropMetric, which is part of the PaddleVideo library. It initializes instances with various parameters such as data_size, batch_size, num_ensemble_views, num_spatial_crops, and num_classes. The log_interval parameter determines how often to update logs during training. This metric appears to be related to slowfast video processing, as specified in the comment.", + "type": "comment" + }, + "5834": { + "file_id": 471, + "content": " \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.num_ensemble_views = num_ensemble_views\n self.num_spatial_crops = num_spatial_crops\n self.num_classes = num_classes\n self.num_clips = self.num_ensemble_views * self.num_spatial_crops\n num_videos = self.data_size // self.num_clips\n self.video_preds = np.zeros((num_videos, self.num_classes))\n self.video_labels = np.zeros((num_videos, 1), dtype=\"int64\")\n self.clip_count = {}\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n labels = data[2]\n clip_ids = data[3]\n # gather mulit card, results of following process in each card is the same.\n if self.world_size > 1:\n outputs = _all_gather(outputs, self.world_size)\n labels = _all_gather(labels.cuda(), self.world_size)\n clip_ids = _all_gather(clip_ids.cuda(), self.world_size)\n # to numpy", + "type": "code", + "location": "/paddlevideo/metrics/multi_crop_metric.py:36-61" + }, + "5835": { + "file_id": 471, + "content": "This code initializes a multi-crop metric class, which takes data size, batch size, log interval, number of ensemble views, and number of spatial crops as parameters. It calculates the number of videos and clips, creates arrays to store video predictions and labels, and initializes a clip_count dictionary. The update method is used to update metrics during each iteration by gathering data across multiple cards if needed, converting outputs to numpy arrays.", + "type": "comment" + }, + "5836": { + "file_id": 471, + "content": " preds = outputs.numpy()\n labels = labels.numpy().astype(\"int64\")\n clip_ids = clip_ids.numpy()\n # preds ensemble\n for ind in range(preds.shape[0]):\n vid_id = int(clip_ids[ind]) // self.num_clips\n ts_idx = int(clip_ids[ind]) % self.num_clips\n if vid_id not in self.clip_count:\n self.clip_count[vid_id] = []\n if ts_idx in self.clip_count[vid_id]:\n logger.info(\n \"[TEST] Passed!! 
read video {} clip index {} / {} repeatedly.\"\n .format(vid_id, ts_idx, clip_ids[ind]))\n else:\n self.clip_count[vid_id].append(ts_idx)\n self.video_preds[vid_id] += preds[ind] # ensemble method: sum\n if self.video_labels[vid_id].sum() > 0:\n assert self.video_labels[vid_id] == labels[ind]\n self.video_labels[vid_id] = labels[ind]\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(", + "type": "code", + "location": "/paddlevideo/metrics/multi_crop_metric.py:62-83" + }, + "5837": { + "file_id": 471, + "content": "The code loops through each prediction and label for video clips in a batch. It checks if the clip index has been encountered before for a particular video ID, and updates the count if it's a new clip or performs ensemble by summing predictions if it's not. If there are labels for a video, it asserts they match and updates the label accordingly. The code also logs processing information at log intervals.", + "type": "comment" + }, + "5838": { + "file_id": 471, + "content": " batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n # check clip index of each video\n for key in self.clip_count.keys():\n if len(self.clip_count[key]) != self.num_clips or sum(\n self.clip_count[key]) != self.num_clips * (self.num_clips -\n 1) / 2:\n logger.info(\n \"[TEST] Count Error!! video [{}] clip count [{}] not match number clips {}\"\n .format(key, self.clip_count[key], self.num_clips))\n video_preds = paddle.to_tensor(self.video_preds)\n video_labels = paddle.to_tensor(self.video_labels)\n acc_top1 = paddle.metric.accuracy(input=video_preds,\n label=video_labels,\n k=1)\n acc_top5 = paddle.metric.accuracy(input=video_preds,", + "type": "code", + "location": "/paddlevideo/metrics/multi_crop_metric.py:84-104" + }, + "5839": { + "file_id": 471, + "content": "This code defines a class that accumulates metrics when all iterations are finished. It checks if the number of clips and their counts match, logging an error if not. Then, it converts video predictions and labels to Paddle tensors and calculates top-1 and top-5 accuracy using Paddle's metric library.", + "type": "comment" + }, + "5840": { + "file_id": 471, + "content": " label=video_labels,\n k=5)\n logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {} '.format(\n acc_top1.numpy(), acc_top5.numpy()))", + "type": "code", + "location": "/paddlevideo/metrics/multi_crop_metric.py:105-108" + }, + "5841": { + "file_id": 471, + "content": "Calculates top-1 and top-5 accuracy using multi-crop metric for each video label, then logs the average accuracy values.", + "type": "comment" + }, + "5842": { + "file_id": 472, + "content": "/paddlevideo/metrics/recall.py", + "type": "filepath" + }, + "5843": { + "file_id": 472, + "content": "This code calculates recall for object detection by iterating through images, selecting proposals based on scores, and calculating IoU between these proposals and ground truth boxes. 
It defines `recalls()` and `eval_recalls()`, with the latter computing recalls for each image.", + "type": "summary" + }, + "5844": { + "file_id": 472, + "content": "import numpy as np\nimport paddle \ndef _recalls(all_ious, proposal_nums, thrs):\n img_num = all_ious.shape[0]\n total_gt_num = sum([ious.shape[0] for ious in all_ious])\n ious_ = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)\n for k, proposal_num in enumerate(proposal_nums):\n tmp_ious = np.zeros(0)\n for i in range(img_num):\n ious = all_ious[i][:, :proposal_num].copy()\n gt_ious = np.zeros(ious.shape[0])\n if ious.size == 0:\n tmp_ious = np.hstack((tmp_ious, gt_ious))\n continue\n for j in range(ious.shape[0]):\n gt_max_overlaps = ious.argmax(axis=1)\n max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]\n gt_idx = max_ious.argmax()\n gt_ious[j] = max_ious[gt_idx]\n box_idx = gt_max_overlaps[gt_idx]\n ious[gt_idx, :] = -1\n ious[:, box_idx] = -1\n tmp_ious = np.hstack((tmp_ious, gt_ious))\n ious_[k, :] = tmp_ious", + "type": "code", + "location": "/paddlevideo/metrics/recall.py:1-27" + }, + "5845": { + "file_id": 472, + "content": "This code calculates recall metric for object detection. It takes in all IoUs (intersection over union), proposal numbers, and thresholds as input. It iterates through images and gt (ground truth) boxes, and then computes the recall scores for each image and stores them in an array. The recall is computed by finding the maximum IOU between ground truth and proposals for each image and storing it in a temporary array, then concatenating these values into the final result.", + "type": "comment" + }, + "5846": { + "file_id": 472, + "content": " ious_ = np.fliplr(np.sort(ious_, axis=1))\n recalls = np.zeros((proposal_nums.size, thrs.size))\n for i, thr in enumerate(thrs):\n recalls[:, i] = (ious_ >= thr).sum(axis=1) / float(total_gt_num)\n return recalls\ndef set_recall_param(proposal_nums, iou_thrs):\n if isinstance(proposal_nums, list):\n proposal_nums_ = np.array(proposal_nums)\n elif isinstance(proposal_nums, int):\n proposal_nums_ = np.array([proposal_nums])\n else:\n proposal_nums_ = proposal_nums\n if iou_thrs is None:\n _iou_thrs = np.array([0.5])\n elif isinstance(iou_thrs, list):\n _iou_thrs = np.array(iou_thrs)\n elif isinstance(iou_thrs, float):\n _iou_thrs = np.array([iou_thrs])\n else:\n _iou_thrs = iou_thrs\n return proposal_nums_, _iou_thrs\ndef eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None):\n \"\"\"Calculate recalls. \"\"\"\n img_num = len(gts)\n assert img_num == len(proposals)\n proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)", + "type": "code", + "location": "/paddlevideo/metrics/recall.py:29-62" + }, + "5847": { + "file_id": 472, + "content": "The code defines two functions: `recalls()` and `eval_recalls()`. \n\n`recalls()` calculates the average precision-recall curve by comparing predicted bounding boxes with ground truth ones. It does this by sorting intersection over union (IOU) values, creating a recall matrix based on IOU thresholds, and averaging recall across images.\n\n`eval_recalls()` is a wrapper function that calls `recalls()`. It calculates recalls for each image given ground truths and proposals. 
It also checks input types and sets default parameters if necessary.", + "type": "comment" + }, + "5848": { + "file_id": 472, + "content": " all_ious = []\n for i in range(img_num):\n if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:\n scores = proposals[i][:, 4]\n sort_idx = np.argsort(scores)[::-1]\n img_proposal = proposals[i][sort_idx, :]\n else:\n img_proposal = proposals[i]\n prop_num = min(img_proposal.shape[0], proposal_nums[-1])\n if gts[i] is None or gts[i].shape[0] == 0:\n ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)\n else:\n ious = bbox_overlaps(\n torch.tensor(gts[i]),\n torch.tensor(img_proposal[:prop_num, :4]))\n ious = ious.data.numpy()\n all_ious.append(ious)\n all_ious = np.array(all_ious)\n recalls = _recalls(all_ious, proposal_nums, iou_thrs)\n return recalls", + "type": "code", + "location": "/paddlevideo/metrics/recall.py:64-84" + }, + "5849": { + "file_id": 472, + "content": "This code calculates recall for object detection. It iterates through images, sorts and selects proposals based on scores, and then calculates IoU (intersection over union) between these proposals and ground truth boxes. If no ground truth is found or it has zero boxes, all_ious is filled with zeros. The function returns recalls for each image number.", + "type": "comment" + }, + "5850": { + "file_id": 473, + "content": "/paddlevideo/metrics/registry.py", + "type": "filepath" + }, + "5851": { + "file_id": 473, + "content": "The code imports a Registry class from the utils module and initializes the METRIC as an instance of this Registry, designed to store and manage different types of metrics.", + "type": "summary" + }, + "5852": { + "file_id": 473, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nMETRIC = Registry('metric')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py:1-17" + }, + "5853": { + "file_id": 473, + "content": "The code imports a Registry class from the utils module and initializes the METRIC as an instance of this Registry, designed to store and manage different types of metrics.", + "type": "comment" + }, + "5854": { + "file_id": 474, + "content": "/paddlevideo/metrics/segmentation_metric.py", + "type": "filepath" + }, + "5855": { + "file_id": 474, + "content": "The code creates a function for label change detection in video segmentation, computes precision, recall, and F1 score, uses Levenstein distance, and evaluates ground truth and predicted actions.", + "type": "summary" + }, + "5856": { + "file_id": 474, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport argparse\nimport pandas as pd\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ndef get_labels_scores_start_end_time(input_np,\n frame_wise_labels,\n actions_dict,\n bg_class=[\"background\", \"None\"]):\n labels = []\n starts = []\n ends = []\n scores = []\n boundary_score_ptr = 0\n last_label = frame_wise_labels[0]", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:1-35" + }, + "5857": { + "file_id": 474, + "content": "This code is a part of PaddleVideo library and defines a function get_labels_scores_start_end_time that takes input, frame-wise labels, actions dictionary, and optional background class. It returns labels, starts, ends, and scores based on the input and labels. The function also keeps track of the boundary score pointer and the last label.", + "type": "comment" + }, + "5858": { + "file_id": 474, + "content": " if frame_wise_labels[0] not in bg_class:\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n if frame_wise_labels[i] not in bg_class:\n labels.append(frame_wise_labels[i])\n starts.append(i)\n if last_label not in bg_class:\n ends.append(i)\n score = np.mean(\n input_np[actions_dict[labels[boundary_score_ptr]], \\\n starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)]\n )\n scores.append(score)\n boundary_score_ptr = boundary_score_ptr + 1\n last_label = frame_wise_labels[i]\n if last_label not in bg_class:\n ends.append(i + 1)\n score = np.mean(\n input_np[actions_dict[labels[boundary_score_ptr]], \\\n starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)]", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:36-57" + }, + "5859": { + "file_id": 474, + "content": "This code segment is a part of a larger video analysis algorithm. It identifies changes in frame-wise labels and calculates scores for those changes based on input_np data associated with the actions_dict[labels]. The scores are then appended to the 'scores' list, while starts and ends lists keep track of the start and end indices for each identified change. Finally, if the last label is not in bg_class, it adds an ending index to the 'ends' list and calculates a score using input_np data associated with starts and ends indices. 
The code then updates boundary_score_ptr and proceeds to the next iteration.", + "type": "comment" + }, + "5860": { + "file_id": 474, + "content": " )\n scores.append(score)\n boundary_score_ptr = boundary_score_ptr + 1\n return labels, starts, ends, scores\ndef get_labels_start_end_time(frame_wise_labels,\n bg_class=[\"background\", \"None\"]):\n labels = []\n starts = []\n ends = []\n last_label = frame_wise_labels[0]\n if frame_wise_labels[0] not in bg_class:\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n if frame_wise_labels[i] not in bg_class:\n labels.append(frame_wise_labels[i])\n starts.append(i)\n if last_label not in bg_class:\n ends.append(i)\n last_label = frame_wise_labels[i]\n if last_label not in bg_class:\n ends.append(i + 1)\n return labels, starts, ends\ndef levenstein(p, y, norm=False):\n m_row = len(p)\n n_col = len(y)\n D = np.zeros([m_row + 1, n_col + 1], np.float)\n for i in range(m_row + 1):", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:58-91" + }, + "5861": { + "file_id": 474, + "content": "The first code chunk is a function that takes a list of frame-wise labels, iterates over the frames, and returns lists for the label names, starting indices, and ending indices. It appends new label names to the labels list and new starting indices to the starts list when a new label appears, and adds corresponding ending indices to the ends list if the previous label was not \"background\" or \"None\". The last label's ending index is added after the loop.\n\nThe second code chunk defines a function that calculates the Levenshtein distance between two strings (p and y) using dynamic programming, which measures the minimum number of operations required to transform one string into another (insertion, deletion, or substitution). The function creates a 2D array D of size (m_row + 1) x (n_col + 1), where m_row is the length of p and n_col is the length of y. It then fills the array using dynamic programming, considering different operations at each step to calculate the minimum distance. This function likely uses the D array for further calculations or returns it as a result.", + "type": "comment" + }, + "5862": { + "file_id": 474, + "content": " D[i, 0] = i\n for i in range(n_col + 1):\n D[0, i] = i\n for j in range(1, n_col + 1):\n for i in range(1, m_row + 1):\n if y[j - 1] == p[i - 1]:\n D[i, j] = D[i - 1, j - 1]\n else:\n D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,\n D[i - 1, j - 1] + 1)\n if norm:\n score = (1 - D[-1, -1] / max(m_row, n_col)) * 100\n else:\n score = D[-1, -1]\n return score\ndef edit_score(recognized,\n ground_truth,\n norm=True,\n bg_class=[\"background\", \"None\"]):\n P, _, _ = get_labels_start_end_time(recognized, bg_class)\n Y, _, _ = get_labels_start_end_time(ground_truth, bg_class)\n return levenstein(P, Y, norm)\ndef f_score(recognized, ground_truth, overlap, bg_class=[\"background\", \"None\"]):\n p_label, p_start, p_end = get_labels_start_end_time(recognized, bg_class)\n y_label, y_start, y_end = get_labels_start_end_time(ground_truth, bg_class)\n tp = 0\n fp = 0", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:92-126" + }, + "5863": { + "file_id": 474, + "content": "The code contains a function called \"levenstein\" that calculates the Levenstein distance between two sequences. 
The Levenstein distance is a metric used to measure the difference between two strings of characters, such as words or labels. In this case, it's used to compare the recognized and ground truth labels for video segmentation. The function takes in a pair of lists P and Y representing recognized and ground truth labels respectively, and an optional norm parameter which normalizes the score if True. The output is a single numeric value representing the distance between the two lists of labels. This score can be used to evaluate the accuracy of the recognition algorithm in comparison with the ground truth data.", + "type": "comment" + }, + "5864": { + "file_id": 474, + "content": " hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(\n p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(p_start[j], y_start)\n IoU = (1.0 * intersection / union) * (\n [p_label[j] == y_label[x] for x in range(len(y_label))])\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n return float(tp), float(fp), float(fn)\ndef boundary_AR(pred_boundary, gt_boundary, overlap_list, max_proposal):\n p_label, p_start, p_end, p_scores = pred_boundary\n y_label, y_start, y_end, _ = gt_boundary\n # sort proposal\n pred_dict = {\n \"label\": p_label,\n \"start\": p_start,\n \"end\": p_end,\n \"scores\": p_scores\n }\n pdf = pd.DataFrame(pred_dict)\n pdf = pdf.sort_values(by=\"scores\", ascending=False)", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:128-161" + }, + "5865": { + "file_id": 474, + "content": "The code calculates the precision, recall, and F1 score for image segmentation by iterating through predicted and ground truth labels. It assigns hits when there is a match between predictions and ground truth, and counts true positives (TP), false positives (FP), and false negatives (FN). The boundary_AR function takes in predicted and ground truth boundaries, sorts them based on scores, and calculates various metrics for image segmentation.", + "type": "comment" + }, + "5866": { + "file_id": 474, + "content": " p_label = list(pdf[\"label\"])\n p_start = list(pdf[\"start\"])\n p_end = list(pdf[\"end\"])\n p_scores = list(pdf[\"scores\"])\n # refine AN\n if len(p_label) < max_proposal and len(p_label) > 0:\n p_label = p_label + [p_label[-1]] * (max_proposal - len(p_label))\n p_start = p_start + [p_start[-1]] * (max_proposal - len(p_start))\n p_start = p_start + p_start[len(p_start) -\n (max_proposal - len(p_start)):]\n p_end = p_end + [p_end[-1]] * (max_proposal - len(p_end))\n p_scores = p_scores + [p_scores[-1]] * (max_proposal - len(p_scores))\n elif len(p_label) > max_proposal:\n p_label[max_proposal:] = []\n p_start[max_proposal:] = []\n p_end[max_proposal:] = []\n p_scores[max_proposal:] = []\n t_AR = np.zeros(len(overlap_list))\n for i in range(len(overlap_list)):\n overlap = overlap_list[i]\n tp = 0\n fp = 0\n hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:162-191" + }, + "5867": { + "file_id": 474, + "content": "This code segment handles the refinement of proposals in an object detection model. If the number of proposals is less than the maximum allowed, it repeats the last proposal to meet the requirement. 
If there are more proposals than the maximum allowed, it discards extra proposals. The code then calculates the average recall (AR) by iterating over the overlap list and counting true positives (tp) and false positives (fp). It also initializes hits for each proposal in the ground truth labels.", + "type": "comment" + }, + "5868": { + "file_id": 474, + "content": " p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(\n p_start[j], y_start)\n IoU = (1.0 * intersection / union)\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n recall = float(tp) / (float(tp) + float(fn))\n t_AR[i] = recall\n AR = np.mean(t_AR)\n return AR\n@METRIC.register\nclass SegmentationMetric(BaseMetric):\n \"\"\"\n Test for Video Segmentation based model.\n \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n overlap,\n actions_map_file_path,\n log_interval=1,\n tolerance=5,\n boundary_threshold=0.7,\n max_proposal=100):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:192-230" + }, + "5869": { + "file_id": 474, + "content": "This code calculates the Average Recall (AR) for video segmentation models. It iterates through ground truth and predicted labels, calculates intersection over union (IoU), counts true positives (tp), false positives (fp), and false negatives (fn). Then it computes recall and averages them to obtain AR. The SegmentationMetric class initializes with various parameters for the metric calculation.", + "type": "comment" + }, + "5870": { + "file_id": 474, + "content": " # actions dict generate\n file_ptr = open(actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])\n # cls score\n self.overlap = overlap\n self.overlap_len = len(overlap)\n self.cls_tp = np.zeros(self.overlap_len)\n self.cls_fp = np.zeros(self.overlap_len)\n self.cls_fn = np.zeros(self.overlap_len)\n self.total_correct = 0\n self.total_edit = 0\n self.total_frame = 0\n self.total_video = 0\n # boundary score\n self.max_proposal = max_proposal\n self.AR_at_AN = [[] for _ in range(max_proposal)]\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n groundTruth = data[1]\n predicted = outputs['predict']\n output_np = outputs['output_np']\n outputs_np = predicted.numpy()\n outputs_arr = output_np.numpy()[0, :]", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:231-264" + }, + "5871": { + "file_id": 474, + "content": "This code initializes a SegmentationMetric object, reads an actions map file, and prepares to update metrics during each iteration. It calculates true positives (cls_tp), false positives (cls_fp), and false negatives (cls_fn) for each frame's overlap. 
The AR_at_AN is also initialized with empty lists for each max_proposal value.", + "type": "comment" + }, + "5872": { + "file_id": 474, + "content": " gt_np = groundTruth.numpy()[0, :]\n recognition = []\n for i in range(outputs_np.shape[0]):\n recognition = np.concatenate((recognition, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(outputs_np[i])]\n ]))\n recog_content = list(recognition)\n gt_content = []\n for i in range(gt_np.shape[0]):\n gt_content = np.concatenate((gt_content, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(gt_np[i])]\n ]))\n gt_content = list(gt_content)\n pred_boundary = get_labels_scores_start_end_time(\n outputs_arr, recog_content, self.actions_dict)\n gt_boundary = get_labels_scores_start_end_time(\n np.ones(outputs_arr.shape), gt_content, self.actions_dict)\n # cls score\n correct = 0\n total = 0\n edit = 0\n for i in range(len(gt_content)):\n total += 1\n #accumulate", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:265-295" + }, + "5873": { + "file_id": 474, + "content": "This code segment compares ground truth and predicted actions for a video. It converts the numpy arrays to lists, generates predicted and ground truth boundaries using the `get_labels_scores_start_end_time` function, and then initializes variables for accuracy calculation.", + "type": "comment" + }, + "5874": { + "file_id": 474, + "content": " self.total_frame += 1\n if gt_content[i] == recog_content[i]:\n correct += 1\n #accumulate\n self.total_correct += 1\n edit_num = edit_score(recog_content, gt_content)\n edit += edit_num\n self.total_edit += edit_num\n for s in range(self.overlap_len):\n tp1, fp1, fn1 = f_score(recog_content, gt_content, self.overlap[s])\n # accumulate\n self.cls_tp[s] += tp1\n self.cls_fp[s] += fp1\n self.cls_fn[s] += fn1\n # accumulate\n self.total_video += 1\n # proposal score\n for AN in range(self.max_proposal):\n AR = boundary_AR(pred_boundary,\n gt_boundary,\n self.overlap,\n max_proposal=(AN + 1))\n self.AR_at_AN[AN].append(AR)\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n # cls metric\n Acc = 100 * float(self.total_correct) / self.total_frame", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:296-330" + }, + "5875": { + "file_id": 474, + "content": "This code calculates segmentation metrics, including accuracy, false positives, and false negatives for video frames. It also keeps track of total correct predictions, edit distances, proposal scores, and accumulates these metrics per video. 
The `accumulate` function is used to calculate classification accuracy when all iterations are finished.", + "type": "comment" + }, + "5876": { + "file_id": 474, + "content": " Edit = (1.0 * self.total_edit) / self.total_video\n Fscore = dict()\n for s in range(self.overlap_len):\n precision = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fp[s])\n recall = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fn[s])\n f1 = 2.0 * (precision * recall) / (precision + recall)\n f1 = np.nan_to_num(f1) * 100\n Fscore[self.overlap[s]] = f1\n # proposal metric\n proposal_AUC = np.array(self.AR_at_AN) * 100\n AUC = np.mean(proposal_AUC)\n AR_at_AN1 = np.mean(proposal_AUC[0, :])\n AR_at_AN5 = np.mean(proposal_AUC[4, :])\n AR_at_AN15 = np.mean(proposal_AUC[14, :])\n # log metric\n log_mertic_info = \"dataset model performence: \"\n # preds ensemble\n log_mertic_info += \"Acc: {:.4f}, \".format(Acc)\n log_mertic_info += 'Edit: {:.4f}, '.format(Edit)\n for s in range(len(self.overlap)):\n log_mertic_info += 'F1@{:0.2f}: {:.4f}, '.format(\n self.overlap[s], Fscore[self.overlap[s]])", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:331-356" + }, + "5877": { + "file_id": 474, + "content": "This code calculates segmentation metrics, including Edit distance, F1 score at different overlap levels, and proposal area under the curve (AUC). It then stores these values in a dictionary and computes average AUCs for different overlap thresholds. The code also calculates an ensemble metric based on accuracy (Acc) and Edit distance. Finally, it logs this information as a string.", + "type": "comment" + }, + "5878": { + "file_id": 474, + "content": " # boundary metric\n log_mertic_info += \"Auc: {:.4f}, \".format(AUC)\n log_mertic_info += \"AR@AN1: {:.4f}, \".format(AR_at_AN1)\n log_mertic_info += \"AR@AN5: {:.4f}, \".format(AR_at_AN5)\n log_mertic_info += \"AR@AN15: {:.4f}, \".format(AR_at_AN15)\n logger.info(log_mertic_info)\n # log metric\n metric_dict = dict()\n metric_dict['Acc'] = Acc\n metric_dict['Edit'] = Edit\n for s in range(len(self.overlap)):\n metric_dict['F1@{:0.2f}'.format(\n self.overlap[s])] = Fscore[self.overlap[s]]\n metric_dict['Auc'] = AUC\n metric_dict['AR@AN1'] = AR_at_AN1\n metric_dict['AR@AN5'] = AR_at_AN5\n metric_dict['AR@AN15'] = AR_at_AN15\n # clear for next epoch\n # cls\n self.cls_tp = np.zeros(self.overlap_len)\n self.cls_fp = np.zeros(self.overlap_len)\n self.cls_fn = np.zeros(self.overlap_len)\n self.total_correct = 0\n self.total_edit = 0\n self.total_frame = 0\n self.total_video = 0", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:358-385" + }, + "5879": { + "file_id": 474, + "content": "This code calculates and logs various segmentation metrics, including AUC, AR@AN1, AR@AN5, and AR@AN15. It also updates the metric dictionary with F1 scores for different overlap thresholds and clears the classifier statistics for the next epoch.", + "type": "comment" + }, + "5880": { + "file_id": 474, + "content": " # proposal\n self.AR_at_AN = [[] for _ in range(self.max_proposal)]\n return metric_dict", + "type": "code", + "location": "/paddlevideo/metrics/segmentation_metric.py:386-389" + }, + "5881": { + "file_id": 474, + "content": "This code initializes the attribute \"AR_at_AN\" as a list of empty lists, with the length equal to the maximum number of proposals. This is done within the context of a class method and the list will likely be used for storing evaluation metrics related to these proposals. 
The method then returns a dictionary (metric_dict) containing other potentially calculated metrics.", + "type": "comment" + }, + "5882": { + "file_id": 475, + "content": "/paddlevideo/metrics/skeleton_metric.py", + "type": "filepath" + }, + "5883": { + "file_id": 475, + "content": "The SkeletonMetric class in PaddleVideo measures skeleton-based model performance metrics, supports batch size 1 and single card testing, and calculates top1 and top5 accuracy for batches with labels. It logs processing info, updates progress, and accumulates metrics while saving results to 'submission.csv'.", + "type": "summary" + }, + "5884": { + "file_id": 475, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nimport csv\nimport paddle.nn.functional as F\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass SkeletonMetric(BaseMetric):\n \"\"\"\n Test for Skeleton based model.\n note: only support batch size = 1, single card test.\n Args:\n out_file: str, file to save test results.\n \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n out_file='submission.csv',", + "type": "code", + "location": "/paddlevideo/metrics/skeleton_metric.py:1-38" + }, + "5885": { + "file_id": 475, + "content": "This code is for the SkeletonMetric class in PaddleVideo, a machine learning framework. The class measures performance metrics for skeleton-based models. It supports batch size 1 and single card testing. Results can be saved to a file named 'submission.csv'.", + "type": "comment" + }, + "5886": { + "file_id": 475, + "content": " log_interval=1,\n top_k=5):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.top1 = []\n self.top5 = []\n self.values = []\n self.out_file = out_file\n self.k = top_k\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n if data[0].shape[0] != outputs.shape[0]:\n num_segs = data[0].shape[1]\n batch_size = outputs.shape[0]\n outputs = outputs.reshape(\n [batch_size // num_segs, num_segs, outputs.shape[-1]])\n outputs = outputs.mean(axis=1)\n if len(data) == 2: # data with label\n labels = data[1]\n top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=outputs, label=labels, k=self.k)\n if self.world_size > 1:\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size", + "type": "code", + "location": "/paddlevideo/metrics/skeleton_metric.py:39-65" + }, + "5887": { + "file_id": 475, + "content": "This code initializes a metrics class for tracking accuracy metrics during training. The `__init__` function sets up the top1, top5, and values lists to store metric results, as well as an output file path and the desired top k value. The `update` method processes data from each batch iteration, updating the metrics based on whether the input data contains labels or not. 
It also handles distributed training by averaging across multiple workers.", + "type": "comment" + }, + "5888": { + "file_id": 475, + "content": " top5 = paddle.distributed.all_reduce(\n top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.top1.append(top1.numpy())\n self.top5.append(top5.numpy())\n else: # data without label, only support batch_size=1. Used for fsd-10.\n prob = F.softmax(outputs)\n clas = paddle.argmax(prob, axis=1).numpy()[0]\n self.values.append((batch_id, clas))\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n if self.top1: # data with label\n logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {}'.format(\n np.mean(np.array(self.top1)), np.mean(np.array(self.top5))))\n else:\n headers = ['sample_index', 'predict_category']", + "type": "code", + "location": "/paddlevideo/metrics/skeleton_metric.py:66-88" + }, + "5889": { + "file_id": 475, + "content": "This code segment is part of a class that handles metrics for a testing process. It calculates top1 and top5 accuracy for batches with labels and stores them. For batches without labels, it performs softmax on outputs and gets the class with highest probability. It logs processing information and updates progress. Finally, it accumulates metrics when all iterations are done.", + "type": "comment" + }, + "5890": { + "file_id": 475, + "content": " with open(\n self.out_file,\n 'w',\n ) as fp:\n writer = csv.writer(fp)\n writer.writerow(headers)\n writer.writerows(self.values)\n logger.info(\"Results saved in {} !\".format(self.out_file))", + "type": "code", + "location": "/paddlevideo/metrics/skeleton_metric.py:89-96" + }, + "5891": { + "file_id": 475, + "content": "Writes headers and values from self.values to file, saves results in out_file and logs success.", + "type": "comment" + }, + "5892": { + "file_id": 476, + "content": "/paddlevideo/metrics/transnetv2_metric.py", + "type": "filepath" + }, + "5893": { + "file_id": 476, + "content": "This code calculates precision, recall, and F1-score for a TransNetV2 metric machine learning model by handling scene location transformations and errors. It iterates through different thresholds before logging the results.", + "type": "summary" + }, + "5894": { + "file_id": 476, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ndef predictions_to_scenes(predictions):\n scenes = []\n t, t_prev, start = -1, 0, 0\n for i, t in enumerate(predictions):\n if t_prev == 1 and t == 0:\n start = i\n if t_prev == 0 and t == 1 and i != 0:\n scenes.append([start, i])\n t_prev = t\n if t == 0:\n scenes.append([start, i])\n # just fix if all predictions are 1", + "type": "code", + "location": "/paddlevideo/metrics/transnetv2_metric.py:1-34" + }, + "5895": { + "file_id": 476, + "content": "This function, named \"predictions_to_scenes\", takes in a list of predictions and outputs a list of scene locations. The scenes are determined by identifying changes from 0 to 1 and vice versa. If all predictions are 1, the function adds a final scene ending at the last index. The code also includes error checking for cases where all predictions are 1 or when there is a disruption in prediction data flow.", + "type": "comment" + }, + "5896": { + "file_id": 476, + "content": " if len(scenes) == 0:\n return np.array([[0, len(predictions) - 1]], dtype=np.int32)\n return np.array(scenes, dtype=np.int32)\ndef evaluate_scenes(gt_scenes, pred_scenes, n_frames_miss_tolerance=2):\n \"\"\"\n Adapted from: https://github.com/gyglim/shot-detection-evaluation\n The original based on: http://imagelab.ing.unimore.it/imagelab/researchActivity.asp?idActivity=19\n n_frames_miss_tolerance:\n Number of frames it is possible to miss ground truth by, and still being counted as a correct detection.\n Examples of computation with different tolerance margin:\n n_frames_miss_tolerance = 0\n pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.5, 5.5]]\n gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.5, 5.5]] -> HIT\n gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.5, 4.5]] -> MISS\n n_frames_miss_tolerance = 1\n pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.0, 6.0]]\n gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.0, 6.0]] -> HIT\n gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.0, 5.0]] -> HIT", + "type": "code", + "location": "/paddlevideo/metrics/transnetv2_metric.py:35-57" + }, + "5897": { + "file_id": 476, + "content": "This function converts scene lists to transition lists. If there are no scenes, it returns a transition list with one element. The function is based on an external source and adapted for specific use cases. It can handle different tolerance margins, which affects how the pred_scenes and gt_scenes are transformed into prediction transitions (pred_trans) and ground truth transitions (gt_trans), respectively. 
A \"HIT\" or \"MISS\" status is determined based on these converted lists.", + "type": "comment" + }, + "5898": { + "file_id": 476, + "content": " gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[3.0, 4.0]] -> MISS\n n_frames_miss_tolerance = 2\n pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[4.5, 6.5]]\n gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[4.5, 6.5]] -> HIT\n gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[3.5, 5.5]] -> HIT\n gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[2.5, 4.5]] -> HIT\n gt_scenes: [[0, 2], [3, 9]] -> gt_trans: [[1.5, 3.5]] -> MISS\n Users should be careful about adopting these functions in any commercial matters.\n \"\"\"\n shift = n_frames_miss_tolerance / 2\n gt_scenes = gt_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]])\n pred_scenes = pred_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]])\n gt_trans = np.stack([gt_scenes[:-1, 1], gt_scenes[1:, 0]], 1)\n pred_trans = np.stack([pred_scenes[:-1, 1], pred_scenes[1:, 0]], 1)\n i, j = 0, 0\n tp, fp, fn = 0, 0, 0\n while i < len(gt_trans) or j < len(pred_trans):\n if j == len(pred_trans) or pred_trans[j, 0] > gt_trans[i, 1]:", + "type": "code", + "location": "/paddlevideo/metrics/transnetv2_metric.py:58-80" + }, + "5899": { + "file_id": 476, + "content": "This code adjusts and transforms input frame scene and transition data, and then iterates through both to calculate true positives (TP), false positives (FP), and false negatives (FN) for evaluation.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/59.json b/docs/data/59.json new file mode 100644 index 000000000..99f444294 --- /dev/null +++ b/docs/data/59.json @@ -0,0 +1,549 @@ +{ + "5900": { + "file_id": 476, + "content": " fn += 1\n i += 1\n elif i == len(gt_trans) or pred_trans[j, 1] < gt_trans[i, 0]:\n fp += 1\n j += 1\n else:\n i += 1\n j += 1\n tp += 1\n if tp + fp != 0:\n p = tp / (tp + fp)\n else:\n p = 0\n if tp + fn != 0:\n r = tp / (tp + fn)\n else:\n r = 0\n if p + r != 0:\n f1 = (p * r * 2) / (p + r)\n else:\n f1 = 0\n assert tp + fn == len(gt_trans)\n assert tp + fp == len(pred_trans)\n return p, r, f1, (tp, fp, fn)\ndef create_scene_based_summaries(one_hot_pred, one_hot_gt):\n thresholds = np.array([\n 0.02, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9\n ])\n precision, recall, f1, tp, fp, fn = np.zeros_like(thresholds), np.zeros_like(thresholds),\\\n np.zeros_like(thresholds), np.zeros_like(thresholds),\\\n np.zeros_like(thresholds), np.zeros_like(thresholds)\n gt_scenes = predictions_to_scenes(one_hot_gt)", + "type": "code", + "location": "/paddlevideo/metrics/transnetv2_metric.py:81-120" + }, + "5901": { + "file_id": 476, + "content": "This function calculates precision, recall, and F1-score for transnetv2 metric given ground truth (gt) and predicted (pred) transcript sequences. It iterates through the sequences to count true positives (tp), false negatives (fn), and false positives (fp). Afterwards, it computes precision, recall, and F1-score based on these counts. The function also asserts that the total number of true positives matches the length of gt_trans and the total number of false positives matches the length of pred_trans. It then returns the calculated metrics and the count of tp, fp, and fn. The create_scene_based_summaries function generates precision, recall, and F1-score for different thresholds using a numpy array. 
It initializes these metrics as well as the counts of true positives, false positives, and false negatives to zero, then iterates over the thresholds to calculate the metric values for each threshold.", + "type": "comment" + }, + "5902": { + "file_id": 476, + "content": " for i in range(len(thresholds)):\n pred_scenes = predictions_to_scenes(\n (one_hot_pred > thresholds[i]).astype(np.uint8)\n )\n precision[i], recall[i], f1[i], (tp[i], fp[i], fn[i]) = evaluate_scenes(gt_scenes, pred_scenes)\n best_idx = np.argmax(f1)\n return f1[best_idx]\n@METRIC.register\nclass TransNetV2Metric(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.predictions = []\n self.total_stats = {\"tp\": 0, \"fp\": 0, \"fn\": 0}\n def update(self, batch_id, data, one_hot):\n \"\"\"update metrics during each iter\n \"\"\"\n if isinstance(one_hot, tuple):\n one_hot = one_hot[0]\n one_hot = paddle.nn.functional.sigmoid(one_hot)[0]\n self.predictions.append(one_hot.numpy()[25:75])\n gt_scenes = data[1]\n is_new_file = data[2]\n if is_new_file:\n self.compute(gt_scenes)\n # preds ensemble", + "type": "code", + "location": "/paddlevideo/metrics/transnetv2_metric.py:121-152" + }, + "5903": { + "file_id": 476, + "content": "This code is from the TransNetV2Metric class, which calculates metrics for a model's predictions. It iterates through different thresholds to compute precision, recall, F1 score, and true positive, false positive, and false negative counts. The update method appends predictions and computes metrics when a new file is encountered.", + "type": "comment" + }, + "5904": { + "file_id": 476, + "content": " if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def compute(self, gt_scenes):\n predictions = np.concatenate(self.predictions, 0)[:len(frames)]\n _, _, _, (tp, fp, fn), fp_mistakes, fn_mistakes = evaluate_scenes(\n gt_scenes, predictions_to_scenes((predictions >= args.thr).astype(np.uint8)))\n self.total_stats[\"tp\"] += tp\n self.total_stats[\"fp\"] += fp\n self.total_stats[\"fn\"] += fn\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n p = self.total_stats[\"tp\"] / (self.total_stats[\"tp\"] + self.total_stats[\"fp\"])\n r = self.total_stats[\"tp\"] / (self.total_stats[\"tp\"] + self.total_stats[\"fn\"])\n f1 = (p * r * 2) / (p + r)\n logger.info('[TEST] finished, Precision= {:5.2f}, Recall= {:5.2f} , F1 Score= {:5.2f} '.format(\n p * 100, r * 100, f1 * 100))", + "type": "code", + "location": "/paddlevideo/metrics/transnetv2_metric.py:153-174" + }, + "5905": { + "file_id": 476, + "content": "The code calculates precision, recall, and F1 score for a machine learning model. It accumulates the metrics after processing all batches and logs the results using logger. It also displays the Precision, Recall, and F1 Score at the end of computation.", + "type": "comment" + }, + "5906": { + "file_id": 477, + "content": "/paddlevideo/metrics/ucf24_utils.py", + "type": "filepath" + }, + "5907": { + "file_id": 477, + "content": "The PaddleVideo library's UCF24 utilities provide functions and classes for metric computation, bounding box handling, and precision/recall calculation. 
It computes mAP for image classification tasks and stores results per class using utility methods to read bounding box text files.", + "type": "summary" + }, + "5908": { + "file_id": 477, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Forked from: https://github.com/rafaelpadilla/Object-Detection-Metrics\n# Developed by: Rafael Padilla (rafael.padilla@smt.ufrj.br)\nimport glob\nimport os\nimport shutil\nimport sys\nfrom collections import Counter\nimport numpy as np\nfrom enum import Enum\nimport cv2\nclass MethodAveragePrecision(Enum):\n \"\"\"\n Class representing if the coordinates are relative to the\n image size or are absolute values.\n Developed by: Rafael Padilla", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:1-33" + }, + "5909": { + "file_id": 477, + "content": "This code snippet is from the UCF101 dataset utility functions in the PaddleVideo library. It contains an enum class representing average precision metrics and a copyright notice with license information, original source link, and developer contact details.", + "type": "comment" + }, + "5910": { + "file_id": 477, + "content": " Last modification: Apr 28 2018\n \"\"\"\n EveryPointInterpolation = 1\n ElevenPointInterpolation = 2\nclass CoordinatesType(Enum):\n \"\"\"\n Class representing if the coordinates are relative to the\n image size or are absolute values.\n Developed by: Rafael Padilla\n Last modification: Apr 28 2018\n \"\"\"\n Relative = 1\n Absolute = 2\nclass BBType(Enum):\n \"\"\"\n Class representing if the bounding box is groundtruth or not.\n Developed by: Rafael Padilla\n Last modification: May 24 2018\n \"\"\"\n GroundTruth = 1\n Detected = 2\nclass BBFormat(Enum):\n \"\"\"\n Class representing the format of a bounding box.\n It can be (X,Y,width,height) => XYWH\n or (X1,Y1,X2,Y2) => XYX2Y2\n Developed by: Rafael Padilla\n Last modification: May 24 2018\n \"\"\"\n XYWH = 1\n XYX2Y2 = 2\ndef convertToRelativeValues(size, box):\n dw = 1. / (size[0])\n dh = 1. / (size[1])\n cx = (box[1] + box[0]) / 2.0\n cy = (box[3] + box[2]) / 2.0\n w = box[1] - box[0]\n h = box[3] - box[2]", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:34-81" + }, + "5911": { + "file_id": 477, + "content": "This code defines three enumerations (CoordinatesType, BBType, and BBFormat) to represent different types of coordinates and bounding boxes. It also includes a function convertToRelativeValues that takes a size and a box as input and returns the box in relative values. 
The code was developed by Rafael Padilla with last modifications on April 28th for CoordinatesType, May 24th for BBType and format, and the function convertToRelativeValues is defined as well.", + "type": "comment" + }, + "5912": { + "file_id": 477, + "content": " x = cx * dw\n y = cy * dh\n w = w * dw\n h = h * dh\n return x, y, w, h\ndef convertToAbsoluteValues(size, box):\n xIn = round(((2 * float(box[0]) - float(box[2])) * size[0] / 2))\n yIn = round(((2 * float(box[1]) - float(box[3])) * size[1] / 2))\n xEnd = xIn + round(float(box[2]) * size[0])\n yEnd = yIn + round(float(box[3]) * size[1])\n if xIn < 0:\n xIn = 0\n if yIn < 0:\n yIn = 0\n if xEnd >= size[0]:\n xEnd = size[0] - 1\n if yEnd >= size[1]:\n yEnd = size[1] - 1\n return xIn, yIn, xEnd, yEnd\ndef add_bb_into_image(image, bb, color=(255, 0, 0), thickness=2, label=None):\n r = int(color[0])\n g = int(color[1])\n b = int(color[2])\n font = cv2.FONT_HERSHEY_SIMPLEX\n fontScale = 0.5\n fontThickness = 1\n x1, y1, x2, y2 = bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n x1 = int(x1)\n y1 = int(y1)\n x2 = int(x2)\n y2 = int(y2)\n cv2.rectangle(image, (x1, y1), (x2, y2), (b, g, r), thickness)\n # Add label\n if label is not None:\n # Get size of the text box", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:82-122" + }, + "5913": { + "file_id": 477, + "content": "Function `ucf24_utils.py:81-121` defines a function `convertToAbsoluteValues` which takes in the size and bounding box coordinates (x, y, w, h) as input and returns absolute values for xIn, yIn, xEnd, yEnd considering the image size. If any of these values fall outside the image boundaries, they are adjusted to the last valid pixel within the image.\nThis code also includes a function `add_bb_into_image` which adds a bounding box with given coordinates (x1, y1, x2, y2) and label on the image using OpenCV's rectangle() function and font() function for adding labels to the bounding boxes.", + "type": "comment" + }, + "5914": { + "file_id": 477, + "content": " (tw, th) = cv2.getTextSize(label, font, fontScale, fontThickness)[0]\n # Top-left coord of the textbox\n (xin_bb, yin_bb) = (x1 + thickness, y1 - th + int(12.5 * fontScale))\n # Checking position of the text top-left (outside or inside the bb)\n if yin_bb - th <= 0: # if outside the image\n yin_bb = y1 + th # put it inside the bb\n r_Xin = x1 - int(thickness / 2)\n r_Yin = y1 - th - int(thickness / 2)\n # Draw filled rectangle to put the text in it\n cv2.rectangle(image, (r_Xin, r_Yin - thickness),\n (r_Xin + tw + thickness * 3, r_Yin + th + int(12.5 * fontScale)), (b, g, r),\n -1)\n cv2.putText(image, label, (xin_bb, yin_bb), font, fontScale, (0, 0, 0), fontThickness,\n cv2.LINE_AA)\n return image\nclass BoundingBox:\n def __init__(self,\n imageName,\n classId,\n x,\n y,\n w,\n h,\n typeCoordinates=None,", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:123-148" + }, + "5915": { + "file_id": 477, + "content": "This function calculates the text box coordinates and draws a rectangle around it, then adds text within the rectangle. The text position is adjusted if it's outside the image area. 
It also initializes a class for bounding boxes with properties like image name, class ID, and coordinates.", + "type": "comment" + }, + "5916": { + "file_id": 477, + "content": " imgSize=None,\n bbType=None,\n classConfidence=None,\n format=None):\n \"\"\"Constructor.\n Args:\n imageName: String representing the image name.\n classId: String value representing class id.\n x: Float value representing the X upper-left coordinate of the bounding box.\n y: Float value representing the Y upper-left coordinate of the bounding box.\n w: Float value representing the width bounding box.\n h: Float value representing the height bounding box.\n typeCoordinates: (optional) Enum (Relative or Absolute) represents if the bounding box\n coordinates (x,y,w,h) are absolute or relative to size of the image. Default:'Absolute'.\n imgSize: (optional) 2D vector (width, height)=>(int, int) represents the size of the\n image of the bounding box. If typeCoordinates is 'Relative', imgSize is required.\n bbType: (optional) Enum (Groundtruth or Detection) identifies if the bounding box", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:149-165" + }, + "5917": { + "file_id": 477, + "content": "This code snippet defines the constructor of the BoundingBox class, which takes parameters such as the image name, class id, bounding box coordinates (x, y, w, h), and the type of bounding box coordinates. It also accepts optional arguments such as imgSize, bbType, classConfidence, and format. If typeCoordinates is 'Relative', then imgSize is required. The constructor initializes a bounding box object used in UCF24 evaluation.", + "type": "comment" + }, + "5918": { + "file_id": 477, + "content": " represents a ground truth or a detection. If it is a detection, the classConfidence has\n to be informed.\n classConfidence: (optional) Float value representing the confidence of the detected\n class. If detectionType is Detection, classConfidence needs to be informed.\n format: (optional) Enum (BBFormat.XYWH or BBFormat.XYX2Y2) indicating the format of the\n coordinates of the bounding boxes. BBFormat.XYWH: \n BBFormat.XYX2Y2: .\n \"\"\"\n self._imageName = imageName\n self._typeCoordinates = typeCoordinates\n if typeCoordinates == CoordinatesType.Relative and imgSize is None:\n raise IOError(\n 'Parameter \\'imgSize\\' is required. It is necessary to inform the image size.')\n if bbType == BBType.Detected and classConfidence is None:\n raise IOError(\n 'For bbType=\\'Detection\\', it is necessary to inform the classConfidence value.')", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:166-181" + }, + "5919": { + "file_id": 477, + "content": "This code defines a class with properties: imageName, typeCoordinates (Relative or Absolute), imgSize (image size required if typeCoordinates is Relative), bbType (Ground Truth or Detection), and classConfidence (optional for Detection). 
It also includes error checks for mandatory parameters (imgSize for Relative typeCoordinates and classConfidence for Detection bbType).", + "type": "comment" + }, + "5920": { + "file_id": 477, + "content": " self._classConfidence = classConfidence\n self._bbType = bbType\n self._classId = classId\n self._format = format\n # If relative coordinates, convert to absolute values\n # For relative coords: (x,y,w,h)=(X_center/img_width , Y_center/img_height)\n if typeCoordinates == CoordinatesType.Relative:\n (self._x, self._y, self._w, self._h) = convertToAbsoluteValues(imgSize, (x, y, w, h))\n self._width_img = imgSize[0]\n self._height_img = imgSize[1]\n if format == BBFormat.XYWH:\n self._x2 = self._w\n self._y2 = self._h\n self._w = self._x2 - self._x\n self._h = self._y2 - self._y\n else:\n raise IOError(\n 'For relative coordinates, the format must be XYWH (x,y,width,height)')\n # For absolute coords: (x,y,w,h)=real bb coords\n else:\n self._x = x\n self._y = y\n if format == BBFormat.XYWH:\n self._w = w", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:183-207" + }, + "5921": { + "file_id": 477, + "content": "This function converts relative bounding box coordinates to absolute values and assigns them to the object. If the given format is XYWH, it adjusts the width and height accordingly. For absolute coordinates, it directly assigns the provided values. If the format does not match XYWH for relative coordinates, an IOError is raised.", + "type": "comment" + }, + "5922": { + "file_id": 477, + "content": " self._h = h\n self._x2 = self._x + self._w\n self._y2 = self._y + self._h\n else: # format == BBFormat.XYX2Y2: .\n self._x2 = w\n self._y2 = h\n self._w = self._x2 - self._x\n self._h = self._y2 - self._y\n if imgSize is None:\n self._width_img = None\n self._height_img = None\n else:\n self._width_img = imgSize[0]\n self._height_img = imgSize[1]\n def getAbsoluteBoundingBox(self, format=None):\n if format == BBFormat.XYWH:\n return self._x, self._y, self._w, self._h\n elif format == BBFormat.XYX2Y2:\n return self._x, self._y, self._x2, self._y2\n def getRelativeBoundingBox(self, imgSize=None):\n if imgSize is None and self._width_img is None and self._height_img is None:\n raise IOError(\n 'Parameter \\'imgSize\\' is required. It is necessary to inform the image size.')", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:208-232" + }, + "5923": { + "file_id": 477, + "content": "The code defines a class with methods to handle bounding box formats. It supports two formats: XYWH and XYX2Y2. The constructor initializes the bounding box dimensions and image size if provided. The getAbsoluteBoundingBox method returns the bounding box coordinates based on the format specified. 
If no image size is available, getRelativeBoundingBox requires the imgSize parameter to determine the absolute position of the bounding box.", + "type": "comment" + }, + "5924": { + "file_id": 477, + "content": " if imgSize is None:\n return convertToRelativeValues((imgSize[0], imgSize[1]),\n (self._x, self._y, self._w, self._h))\n else:\n return convertToRelativeValues((self._width_img, self._height_img),\n (self._x, self._y, self._w, self._h))\n def getImageName(self):\n return self._imageName\n def getConfidence(self):\n return self._classConfidence\n def getFormat(self):\n return self._format\n def getClassId(self):\n return self._classId\n def getImageSize(self):\n return self._width_img, self._height_img\n def getCoordinatesType(self):\n return self._typeCoordinates\n def getBBType(self):\n return self._bbType\n @staticmethod\n def compare(det1, det2):\n det1BB = det1.getAbsoluteBoundingBox(format=BBFormat.XYWH)\n det1ImgSize = det1.getImageSize()\n det2BB = det2.getAbsoluteBoundingBox(format=BBFormat.XYWH)\n det2ImgSize = det2.getImageSize()", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:233-266" + }, + "5925": { + "file_id": 477, + "content": "This code defines a class with various getter methods to access different attributes of the detection result. The class also contains a static method compare() that takes two detections as input and compares them using absolute bounding boxes and image sizes.", + "type": "comment" + }, + "5926": { + "file_id": 477, + "content": " if det1.getClassId() == det2.getClassId() and \\\n det1.classConfidence == det2.classConfidenc() and \\\n det1BB[0] == det2BB[0] and \\\n det1BB[1] == det2BB[1] and \\\n det1BB[2] == det2BB[2] and \\\n det1BB[3] == det2BB[3] and \\\n det1ImgSize[0] == det1ImgSize[0] and \\\n det2ImgSize[1] == det2ImgSize[1]:\n return True\n return False\n @staticmethod\n def clone(boundingBox):\n absBB = boundingBox.getAbsoluteBoundingBox(format=BBFormat.XYWH)\n newBoundingBox = BoundingBox(\n boundingBox.getImageName(),\n boundingBox.getClassId(),\n absBB[0],\n absBB[1],\n absBB[2],\n absBB[3],\n typeCoordinates=boundingBox.getCoordinatesType(),\n imgSize=boundingBox.getImageSize(),\n bbType=boundingBox.getBBType(),\n classConfidence=boundingBox.getConfidence(),\n format=BBFormat.XYWH)\n return newBoundingBox", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:268-294" + }, + "5927": { + "file_id": 477, + "content": "The code snippet compares two bounding boxes to check if they match by comparing their class IDs, coordinates, and image sizes. If the conditions are met, it returns True; otherwise, False. 
The static method `clone` creates a new bounding box with the same properties as an existing one, allowing for easy cloning of bounding boxes.", + "type": "comment" + }, + "5928": { + "file_id": 477, + "content": "class BoundingBoxes:\n def __init__(self):\n self._boundingBoxes = []\n def addBoundingBox(self, bb):\n self._boundingBoxes.append(bb)\n def removeBoundingBox(self, _boundingBox):\n for d in self._boundingBoxes:\n if BoundingBox.compare(d, _boundingBox):\n del self._boundingBoxes[d]\n return\n def removeAllBoundingBoxes(self):\n self._boundingBoxes = []\n def getBoundingBoxes(self):\n return self._boundingBoxes\n def getBoundingBoxByClass(self, classId):\n boundingBoxes = []\n for d in self._boundingBoxes:\n if d.getClassId() == classId: # get only specified bounding box type\n boundingBoxes.append(d)\n return boundingBoxes\n def getClasses(self):\n classes = []\n for d in self._boundingBoxes:\n c = d.getClassId()\n if c not in classes:\n classes.append(c)\n return classes\n def getBoundingBoxesByType(self, bbType):\n # get only specified bb type", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:297-332" + }, + "5929": { + "file_id": 477, + "content": "This class represents a collection of bounding boxes with methods to add, remove, and retrieve bounding boxes based on their type or class. It also provides functionality to retrieve all classes present in the bounding boxes.", + "type": "comment" + }, + "5930": { + "file_id": 477, + "content": " return [d for d in self._boundingBoxes if d.getBBType() == bbType]\n def getBoundingBoxesByImageName(self, imageName):\n # get only specified bb type\n return [d for d in self._boundingBoxes if d.getImageName() == imageName]\n def count(self, bbType=None):\n if bbType is None: # Return all bounding boxes\n return len(self._boundingBoxes)\n count = 0\n for d in self._boundingBoxes:\n if d.getBBType() == bbType: # get only specified bb type\n count += 1\n return count\n def clone(self):\n newBoundingBoxes = BoundingBoxes()\n for d in self._boundingBoxes:\n det = BoundingBox.clone(d)\n newBoundingBoxes.addBoundingBox(det)\n return newBoundingBoxes\n def drawAllBoundingBoxes(self, image, imageName):\n bbxes = self.getBoundingBoxesByImageName(imageName)\n for bb in bbxes:\n if bb.getBBType() == BBType.GroundTruth: # if ground truth\n image = add_bb_into_image(image, bb, color=(0, 255, 0)) # green", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:333-359" + }, + "5931": { + "file_id": 477, + "content": "Function `getBoundingBoxesByBBType` returns a list of bounding boxes with the specified BB type.\nFunction `getBoundingBoxesByImageName` returns a list of bounding boxes for the given image name.\nMethod `count` counts and returns the number of bounding boxes with the specified BB type, or all bounding boxes if no type is provided.\nMethod `clone` creates a new instance of BoundingBoxes and adds clones of each bounding box from the original instance.\nFunction `drawAllBoundingBoxes` draws all bounding boxes for the given image name on the specified image, only ground truth bounding boxes are drawn in green color.", + "type": "comment" + }, + "5932": { + "file_id": 477, + "content": " else: # if detection\n image = add_bb_into_image(image, bb, color=(255, 0, 0)) # red\n return image\nclass Evaluator:\n def GetPascalVOCMetrics(self,\n boundingboxes,\n IOUThreshold=0.5,\n method=None):\n \"\"\"Get the metrics used by the VOC Pascal 2012 challenge.\n Get\n Args:\n boundingboxes: Object of the class 
BoundingBoxes representing ground truth and detected\n bounding boxes;\n IOUThreshold: IOU threshold indicating which detections will be considered TP or FP\n (default value = 0.5);\n method (default = EveryPointInterpolation): It can be calculated as the implementation\n in the official PASCAL VOC toolkit (EveryPointInterpolation), or applying the 11-point\n interpolatio as described in the paper \"The PASCAL Visual Object Classes(VOC) Challenge\"\n or EveryPointInterpolation\" (ElevenPointInterpolation);", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:360-380" + }, + "5933": { + "file_id": 477, + "content": "The code defines a function `GetPascalVOCMetrics` within the `Evaluator` class to calculate metrics for Pascal VOC Challenge. It takes `boundingboxes`, `IOUThreshold`, and `method` as input parameters. The method can be set as `EveryPointInterpolation` or `ElevenPointInterpolation`. This function calculates precision, recall, F1 score, and AP metric using the provided parameters for Pascal VOC Challenge evaluation.", + "type": "comment" + }, + "5934": { + "file_id": 477, + "content": " Returns:\n A list of dictionaries. Each dictionary contains information and metrics of each class.\n The keys of each dictionary are:\n dict['class']: class representing the current dictionary;\n dict['precision']: array with the precision values;\n dict['recall']: array with the recall values;\n dict['AP']: average precision;\n dict['interpolated precision']: interpolated precision values;\n dict['interpolated recall']: interpolated recall values;\n dict['total positives']: total number of ground truth positives;\n dict['total TP']: total number of True Positive detections;\n dict['total FP']: total number of False Negative detections;\n \"\"\"\n ret = [] # list containing metrics (precision, recall, average precision) of each class\n # List with all ground truths (Ex: [imageName,class,confidence=1, (bb coordinates XYX2Y2)])\n groundTruths = []\n # List with all detections (Ex: [imageName,class,confidence,(bb coordinates XYX2Y2)])", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:381-397" + }, + "5935": { + "file_id": 477, + "content": "The function returns a list of dictionaries, each containing information and metrics of each class. The keys include class representation, precision values, recall values, average precision, interpolated precision, interpolated recall, total positives, total true positives, and total false positives. 
It initializes an empty list \"ret\" to store the metrics for each class, as well as groundTruths and detection lists.", + "type": "comment" + }, + "5936": { + "file_id": 477, + "content": " detections = []\n # Get all classes\n classes = []\n # Loop through all bounding boxes and separate them into GTs and detections\n for bb in boundingboxes.getBoundingBoxes():\n # [imageName, class, confidence, (bb coordinates XYX2Y2)]\n if bb.getBBType() == BBType.GroundTruth:\n groundTruths.append([\n bb.getImageName(),\n bb.getClassId(), 1,\n bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n ])\n else:\n detections.append([\n bb.getImageName(),\n bb.getClassId(),\n bb.getConfidence(),\n bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n ])\n # get class\n if bb.getClassId() not in classes:\n classes.append(bb.getClassId())\n classes = sorted(classes)\n # Precision x Recall is obtained individually by each class\n # Loop through by classes", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:398-422" + }, + "5937": { + "file_id": 477, + "content": "The code initializes empty lists for detections and classes, then iterates through all bounding boxes. It separates ground truth (GT) bounding boxes from detections, appending them to their respective lists with additional information such as image name, class ID, confidence, and bounding box coordinates. It also keeps track of unique classes and sorts them. The code will then use these lists and sorted classes for precision-recall calculations by individual classes.", + "type": "comment" + }, + "5938": { + "file_id": 477, + "content": " for c in classes:\n # Get only detection of class c\n dects = []\n [dects.append(d) for d in detections if d[1] == c]\n # Get only ground truths of class c\n gts = []\n [gts.append(g) for g in groundTruths if g[1] == c]\n npos = len(gts)\n # sort detections by decreasing confidence\n dects = sorted(dects, key=lambda conf: conf[2], reverse=True)\n TP = np.zeros(len(dects))\n FP = np.zeros(len(dects))\n # create dictionary with amount of gts for each image\n det = Counter([cc[0] for cc in gts])\n for key, val in det.items():\n det[key] = np.zeros(val)\n # Loop through detections\n for d in range(len(dects)):\n # Find ground truth image\n gt = [gt for gt in gts if gt[0] == dects[d][0]]\n iouMax = sys.float_info.min\n for j in range(len(gt)):\n iou = Evaluator.iou(dects[d][3], gt[j][3])", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:423-445" + }, + "5939": { + "file_id": 477, + "content": "Iterating through classes, the code collects detections and ground truths for each class. It then calculates the number of positive ground truths (npos), sorts detections by confidence level, and initializes True Positive (TP) and False Positive (FP) arrays. 
The code creates a dictionary to store the amount of ground truths per image, and iterates through detections to find corresponding ground truth images, calculating Intersection over Union (IoU) between detection and ground truth bounding boxes.", + "type": "comment" + }, + "5940": { + "file_id": 477, + "content": " if iou > iouMax:\n iouMax = iou\n jmax = j\n # Assign detection as true positive/don't care/false positive\n if iouMax >= IOUThreshold:\n if det[dects[d][0]][jmax] == 0:\n TP[d] = 1 # count as true positive\n det[dects[d][0]][jmax] = 1 # flag as already 'seen'\n else:\n FP[d] = 1 # count as false positive\n # - A detected \"cat\" is overlaped with a GT \"cat\" with IOU >= IOUThreshold.\n else:\n FP[d] = 1 # count as false positive\n # compute precision, recall and average precision\n acc_FP = np.cumsum(FP)\n acc_TP = np.cumsum(TP)\n rec = acc_TP / npos\n prec = np.divide(acc_TP, (acc_FP + acc_TP))\n # Depending on the method, call the right implementation\n if method == MethodAveragePrecision.EveryPointInterpolation:", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:446-465" + }, + "5941": { + "file_id": 477, + "content": "This code calculates true positives, false positives, and computes precision, recall, and average precision. It checks if a detected object overlaps with ground truth objects using IOU threshold. If the overlap is within the threshold, it counts as a true positive or false positive depending on whether the object has already been marked 'seen'. Finally, based on the method chosen (EveryPointInterpolation in this case), it calls the appropriate average precision calculation function.", + "type": "comment" + }, + "5942": { + "file_id": 477, + "content": " [ap, mpre, mrec, ii] = Evaluator.CalculateAveragePrecision(rec, prec)\n else:\n [ap, mpre, mrec, _] = Evaluator.ElevenPointInterpolatedAP(rec, prec)\n # add class result in the dictionary to be returned\n r = {\n 'class': c,\n 'precision': prec,\n 'recall': rec,\n 'AP': ap,\n 'interpolated precision': mpre,\n 'interpolated recall': mrec,\n 'total positives': npos,\n 'total TP': np.sum(TP),\n 'total FP': np.sum(FP)\n }\n ret.append(r)\n return ret\n @staticmethod\n def CalculateAveragePrecision(rec, prec):\n mrec = [0]\n [mrec.append(e) for e in rec]\n mrec.append(1)\n mpre = [0]\n [mpre.append(e) for e in prec]\n mpre.append(0)\n for i in range(len(mpre) - 1, 0, -1):\n mpre[i - 1] = max(mpre[i - 1], mpre[i])\n ii = []\n for i in range(len(mrec) - 1):", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:466-495" + }, + "5943": { + "file_id": 477, + "content": "Calculates average precision for each class using CalculateAveragePrecision or ElevenPointInterpolatedAP depending on the input. 
Appends the results to a dictionary, then adds the dictionary to a list and returns it.", + "type": "comment" + }, + "5944": { + "file_id": 477, + "content": " if mrec[1:][i] != mrec[0:-1][i]:\n ii.append(i + 1)\n ap = 0\n for i in ii:\n ap = ap + np.sum((mrec[i] - mrec[i - 1]) * mpre[i])\n return [ap, mpre[0:len(mpre) - 1], mrec[0:len(mpre) - 1], ii]\n @staticmethod\n # 11-point interpolated average precision\n def ElevenPointInterpolatedAP(rec, prec):\n mrec = []\n [mrec.append(e) for e in rec]\n mpre = []\n [mpre.append(e) for e in prec]\n recallValues = np.linspace(0, 1, 11)\n recallValues = list(recallValues[::-1])\n rhoInterp = []\n recallValid = []\n for r in recallValues:\n # Obtain all recall values higher or equal than r\n argGreaterRecalls = np.argwhere(mrec[:] >= r)\n pmax = 0\n # If there are recalls above r\n if argGreaterRecalls.size != 0:\n pmax = max(mpre[argGreaterRecalls.min():])\n recallValid.append(r)\n rhoInterp.append(pmax)\n # By definition AP = sum(max(precision whose recall is above r))/11", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:496-523" + }, + "5945": { + "file_id": 477, + "content": "The code calculates the 11-point interpolated average precision (AP) between recall and precision values. It first appends recall and precision lists in reverse order, then creates a list of recall values from 0 to 1 in reverse order. Next, it iterates over these recall values, finding all recall values greater than or equal to the current value and selecting the maximum precision at that index. Finally, it returns the interpolated AP by summing the maximum precisions for each recall value and dividing by 11. The resulting AP values are stored in a list along with the original recall and precision lists, as well as an indicator list of indices where the recall values were greater than or equal to the current recall value.", + "type": "comment" + }, + "5946": { + "file_id": 477, + "content": " ap = sum(rhoInterp) / 11\n # Generating values for the plot\n rvals = [recallValid[0]]\n [rvals.append(e) for e in recallValid]\n rvals.append(0)\n pvals = [0]\n [pvals.append(e) for e in rhoInterp]\n pvals.append(0)\n # rhoInterp = rhoInterp[::-1]\n cc = []\n for i in range(len(rvals)):\n p = (rvals[i], pvals[i - 1])\n if p not in cc:\n cc.append(p)\n p = (rvals[i], pvals[i])\n if p not in cc:\n cc.append(p)\n recallValues = [i[0] for i in cc]\n rhoInterp = [i[1] for i in cc]\n return [ap, rhoInterp, recallValues, None]\n # For each detections, calculate IOU with reference\n @staticmethod\n def _getAllIOUs(reference, detections):\n ret = []\n bbReference = reference.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n # img = np.zeros((200,200,3), np.uint8)\n for d in detections:\n bb = d.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n iou = Evaluator.iou(bbReference, bb)", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:524-553" + }, + "5947": { + "file_id": 477, + "content": "The code calculates average precision (AP) and Area Under Curve (AUC), then generates recall and precision values for a plot. 
It also defines a method to calculate the Intersection over Union (IoU) between reference and detection bounding boxes.", + "type": "comment" + }, + "5948": { + "file_id": 477, + "content": " ret.append((iou, reference, d)) # iou, reference, detection\n return sorted(ret, key=lambda i: i[0], reverse=True) # sort by iou (from highest to lowest)\n @staticmethod\n def iou(boxA, boxB):\n # if boxes dont intersect\n if Evaluator._boxesIntersect(boxA, boxB) is False:\n return 0\n interArea = Evaluator._getIntersectionArea(boxA, boxB)\n union = Evaluator._getUnionAreas(boxA, boxB, interArea=interArea)\n # intersection over union\n iou = interArea / union\n assert iou >= 0\n return iou\n @staticmethod\n def _boxesIntersect(boxA, boxB):\n if boxA[0] > boxB[2]:\n return False # boxA is right of boxB\n if boxB[0] > boxA[2]:\n return False # boxA is left of boxB\n if boxA[3] < boxB[1]:\n return False # boxA is above boxB\n if boxA[1] > boxB[3]:\n return False # boxA is below boxB\n return True\n @staticmethod\n def _getIntersectionArea(boxA, boxB):\n xA = max(boxA[0], boxB[0])", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:554-583" + }, + "5949": { + "file_id": 477, + "content": "The code calculates the IoU (intersection over union) between two bounding boxes, and returns a list of detection results sorted by IoU in descending order. It also includes utility methods to check if two boxes intersect and calculate the intersection area.", + "type": "comment" + }, + "5950": { + "file_id": 477, + "content": " yA = max(boxA[1], boxB[1])\n xB = min(boxA[2], boxB[2])\n yB = min(boxA[3], boxB[3])\n # intersection area\n return (xB - xA + 1) * (yB - yA + 1)\n @staticmethod\n def _getUnionAreas(boxA, boxB, interArea=None):\n area_A = Evaluator._getArea(boxA)\n area_B = Evaluator._getArea(boxB)\n if interArea is None:\n interArea = Evaluator._getIntersectionArea(boxA, boxB)\n return float(area_A + area_B - interArea)\n @staticmethod\n def _getArea(box):\n return (box[2] - box[0] + 1) * (box[3] - box[1] + 1)\n# Validate formats\ndef ValidateFormats(argFormat, argName, errors):\n if argFormat == 'xywh':\n return BBFormat.XYWH\n elif argFormat == 'xyrb':\n return BBFormat.XYX2Y2\n elif argFormat is None:\n return BBFormat.XYWH # default when nothing is passed\n else:\n errors.append(\n 'argument %s: invalid value. It must be either \\'xywh\\' or \\'xyrb\\'' % argName)\n# Validate mandatory args\ndef ValidateMandatoryArgs(arg, argName, errors):", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:584-617" + }, + "5951": { + "file_id": 477, + "content": "This code contains functions to calculate intersection and union areas of two bounding boxes, and two validation functions for argument formats and mandatory arguments. The ValidateFormats function checks if the format is 'xywh', 'xyrb' or None (default) and returns a corresponding BBFormat type. The ValidateMandatoryArgs function checks if an argument exists and appends an error message to 'errors' if it doesn't meet the requirements.", + "type": "comment" + }, + "5952": { + "file_id": 477, + "content": " if arg is None:\n errors.append('argument %s: required argument' % argName)\n else:\n return True\ndef ValidateImageSize(arg, argName, argInformed, errors):\n errorMsg = 'argument %s: required argument if %s is relative' % (argName, argInformed)\n ret = None\n if arg is None:\n errors.append(errorMsg)\n else:\n arg = arg.replace('(', '').replace(')', '')\n args = arg.split(',')\n if len(args) != 2:\n errors.append(\n '%s. 
It must be in the format \\'width,height\\' (e.g. \\'600,400\\')' % errorMsg)\n else:\n if not args[0].isdigit() or not args[1].isdigit():\n errors.append(\n '%s. It must be in INdiaTEGER the format \\'width,height\\' (e.g. \\'600,400\\')' %\n errorMsg)\n else:\n ret = (int(args[0]), int(args[1]))\n return ret\n# Validate coordinate types\ndef ValidateCoordinatesTypes(arg, argName, errors):\n if arg == 'abs':\n return CoordinatesType.Absolute", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:618-648" + }, + "5953": { + "file_id": 477, + "content": "This code defines a function ValidateImageSize that checks if the image size argument is valid. It appends error messages to the errors list if the argument is missing or not in the correct format 'width,height'. The function also handles the case where the argument is relative and requires both width and height to be integers. Finally, it returns a tuple of (width, height) if valid. Additionally, there's a ValidateCoordinatesTypes function that checks if the coordinate type argument is valid and returns the CoordinatesType.Absolute if 'abs'.", + "type": "comment" + }, + "5954": { + "file_id": 477, + "content": " elif arg == 'rel':\n return CoordinatesType.Relative\n elif arg is None:\n return CoordinatesType.Absolute # default when nothing is passed\n errors.append('argument %s: invalid value. It must be either \\'rel\\' or \\'abs\\'' % argName)\ndef getBoundingBoxes(directory,\n isGT,\n bbFormat,\n coordType,\n allBoundingBoxes=None,\n allClasses=None,\n imgSize=(0, 0)):\n \"\"\"Read txt files containing bounding boxes (ground truth and detections).\"\"\"\n print(directory)\n if allBoundingBoxes is None:\n allBoundingBoxes = BoundingBoxes()\n if allClasses is None:\n allClasses = []\n # Read ground truths\n os.chdir(directory)\n files = glob.glob(\"*.txt\")\n files.sort()\n for f in files:\n nameOfImage = f.replace(\".txt\", \"\")\n fh1 = open(f, \"r\")\n for line in fh1:\n line = line.replace(\"\\n\", \"\")\n if line.replace(' ', '') == '':\n continue", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:649-680" + }, + "5955": { + "file_id": 477, + "content": "This code reads text files containing bounding boxes (ground truth and detections). It handles 'relative' or 'absolute' coordinates, and checks for invalid arguments. The function takes a directory, image format, and coordinate type as inputs, and returns bounding boxes and classes. If allBoundingBoxes or allClasses are None, it initializes them. It changes the working directory to the specified directory and reads all files in alphabetical order.", + "type": "comment" + }, + "5956": { + "file_id": 477, + "content": " splitLine = line.split(\" \")\n if isGT:\n idClass = (splitLine[0]) # class\n x = float(splitLine[1])\n y = float(splitLine[2])\n w = float(splitLine[3])\n h = float(splitLine[4])\n bb = BoundingBox(\n nameOfImage,\n idClass,\n x,\n y,\n w,\n h,\n coordType,\n imgSize,\n BBType.GroundTruth,\n format=bbFormat)\n else:\n idClass = (splitLine[0]) # class\n confidence = float(splitLine[1])\n x = float(splitLine[2])\n y = float(splitLine[3])\n w = float(splitLine[4])\n h = float(splitLine[5])\n bb = BoundingBox(\n nameOfImage,\n idClass,\n x,\n y,\n w,", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:681-711" + }, + "5957": { + "file_id": 477, + "content": "This code reads a line of text and determines whether it represents ground truth or predicted bounding boxes. 
It then initializes BoundingBox objects with the appropriate attributes based on the type (ground truth or prediction) and stores them accordingly.", + "type": "comment" + }, + "5958": { + "file_id": 477, + "content": " h,\n coordType,\n imgSize,\n BBType.Detected,\n confidence,\n format=bbFormat)\n allBoundingBoxes.addBoundingBox(bb)\n if idClass not in allClasses:\n allClasses.append(idClass)\n fh1.close()\n return allBoundingBoxes, allClasses\ndef get_mAP(gtFolder, detFolder, threshold=0.5, savePath=None):\n gtFormat = 'xyrb'\n detFormat = 'xyrb'\n gtCoordinates = 'abs'\n detCoordinates = 'abs'\n gtFolder = os.path.join(os.path.abspath('.'), gtFolder)\n detFolder = os.path.join(os.path.abspath('.'), detFolder)\n iouThreshold = threshold\n # Arguments validation\n errors = []\n # Validate formats\n gtFormat = ValidateFormats(gtFormat, 'gtFormat', errors)\n detFormat = ValidateFormats(detFormat, '-detformat', errors)\n # Coordinates types\n gtCoordType = ValidateCoordinatesTypes(gtCoordinates, '-gtCoordinates', errors)\n detCoordType = ValidateCoordinatesTypes(detCoordinates, '-detCoordinates', errors)", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:712-743" + }, + "5959": { + "file_id": 477, + "content": "The code defines a function to calculate the mean average precision (mAP) between ground truth and detected objects in image classification tasks. It takes input folders containing ground truth and detection results, adjustable threshold for determining true positives, and an optional save path for the output file. The code performs argument validation to ensure correct formats and coordinate types.", + "type": "comment" + }, + "5960": { + "file_id": 477, + "content": " imgSize = (0, 0)\n # Create directory to save results\n shutil.rmtree(savePath, ignore_errors=True) # Clear folder\n if savePath is not None:\n os.makedirs(savePath)\n # Get groundtruth boxes\n allBoundingBoxes, allClasses = getBoundingBoxes(\n gtFolder, True, gtFormat, gtCoordType, imgSize=imgSize)\n # Get detected boxes\n allBoundingBoxes, allClasses = getBoundingBoxes(\n detFolder, False, detFormat, detCoordType, allBoundingBoxes, allClasses, imgSize=imgSize)\n allClasses.sort()\n evaluator = Evaluator()\n acc_AP = 0\n validClasses = 0\n # Plot Precision x Recall curve\n detections = evaluator.GetPascalVOCMetrics(allBoundingBoxes, iouThreshold,\n method=MethodAveragePrecision.EveryPointInterpolation)\n # each detection is a class and store AP and mAP results in AP_res list\n AP_res = []\n for metricsPerClass in detections:\n # Get metric values per each class\n cl = metricsPerClass['class']\n ap = metricsPerClass['AP']", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:744-772" + }, + "5961": { + "file_id": 477, + "content": "This code is creating a directory to save results, clearing any previous content in the folder. 
It then retrieves ground truth and detected bounding boxes, sorts classes, initializes an evaluator object, and calculates average precision (AP) for each class, storing the AP and mean AP results in the AP_res list.", + "type": "comment" + }, + "5962": { + "file_id": 477, + "content": " totalPositives = metricsPerClass['total positives']\n if totalPositives > 0:\n validClasses = validClasses + 1\n acc_AP = acc_AP + ap\n ap_str = \"{0:.2f}%\".format(ap * 100)\n AP_res.append('AP: %s (%s)' % (ap_str, cl))\n mAP = acc_AP / validClasses\n mAP_str = \"{0:.2f}%\".format(mAP * 100)\n AP_res.append('mAP: %s' % mAP_str)\n return AP_res", + "type": "code", + "location": "/paddlevideo/metrics/ucf24_utils.py:773-783" + }, + "5963": { + "file_id": 477, + "content": "This code calculates mean Average Precision (mAP) for each class and returns it as a list. It iterates through valid classes, calculates Average Precision (AP) for each class if there are positive samples, updates mAP by averaging APs of all valid classes, and appends AP and class labels to the result list. The final mAP value is also formatted and added to the result list before returning it.", + "type": "comment" + }, + "5964": { + "file_id": 478, + "content": "/paddlevideo/metrics/vos_metric.py", + "type": "filepath" + }, + "5965": { + "file_id": 478, + "content": "The PaddleVideo framework's VOSMetric class is responsible for video object segmentation tasks, data processing, and model preparation. It includes methods 'flip_tensor', 'save_mask', and manages various operations such as handling failures and logging data.", + "type": "summary" + }, + "5966": { + "file_id": 478, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport paddle\nimport zipfile\nimport time\nfrom PIL import Image\nfrom paddle.io import DataLoader\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass VOSMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n result_root,\n zip_dir,\n log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:1-38" + }, + "5967": { + "file_id": 478, + "content": "This code is part of the PaddleVideo framework, implementing the VOSMetric class. It registers a metric for video object segmentation tasks using the PaddlePaddle library. 
The class takes parameters such as data size, batch size, result root directory, and zip directory for results storage.", + "type": "comment" + }, + "5968": { + "file_id": 478, + "content": " self.video_num = 0\n self.total_time = 0\n self.total_frame = 0\n self.total_sfps = 0\n self.total_video_num = data_size\n self.count = 0\n self.result_root = result_root\n self.zip_dir = zip_dir\n def update(self, batch_id, data, model):\n \"\"\"update metrics during each iter\n \"\"\"\n self.video_num += 1\n seq_dataset = data\n seq_name = seq_dataset.seq_name\n logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name, self.video_num,\n self.total_video_num))\n seq_dataloader = DataLoader(seq_dataset,\n return_list=True,\n batch_size=1,\n shuffle=False,\n num_workers=0)\n seq_total_time = 0\n seq_total_frame = 0\n ref_embeddings = []\n ref_masks = []\n prev_embedding = []\n prev_mask = []\n with paddle.no_grad():", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:39-68" + }, + "5969": { + "file_id": 478, + "content": "This code initializes a VOS metric class with parameters such as total_video_num, result_root and zip_dir. The update method processes each video in the dataset, updating metrics like seq_total_time and seq_total_frame. It also prepares variables for reference embeddings and masks for the Video Object Segmentation task using PaddlePaddle framework.", + "type": "comment" + }, + "5970": { + "file_id": 478, + "content": " for frame_idx, samples in enumerate(seq_dataloader):\n time_start = time.time()\n all_preds = []\n join_label = None\n for aug_idx in range(len(samples)):\n if len(ref_embeddings) <= aug_idx:\n ref_embeddings.append([])\n ref_masks.append([])\n prev_embedding.append(None)\n prev_mask.append(None)\n sample = samples[aug_idx]\n ref_emb = ref_embeddings[aug_idx]\n ref_m = ref_masks[aug_idx]\n prev_emb = prev_embedding[aug_idx]\n prev_m = prev_mask[aug_idx]\n current_img = sample['current_img']\n if 'current_label' in sample.keys():\n current_label = sample['current_label']\n current_label = paddle.to_tensor(current_label)\n else:\n current_label = None", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:69-91" + }, + "5971": { + "file_id": 478, + "content": "This code appears to be part of a data loading and processing loop for a video object detection model. It loads samples from a sequential dataloader, processes each augmented image, and appends their embeddings and masks to the corresponding lists. The labels are also loaded if available. This process is repeated for all augmented images in the sequence.", + "type": "comment" + }, + "5972": { + "file_id": 478, + "content": " obj_num = sample['meta']['obj_num']\n imgname = sample['meta']['current_name']\n ori_height = sample['meta']['height']\n ori_width = sample['meta']['width']\n current_img = current_img\n obj_num = obj_num\n bs, _, h, w = current_img.shape\n data_batch = [\n ref_emb, ref_m, prev_emb, prev_m, current_img,\n [ori_height, ori_width], obj_num\n ]\n all_pred, current_embedding = model(data_batch, mode='test')\n if frame_idx == 0:\n if current_label is None:\n logger.info(\n \"No first frame label in Seq {}.\".format(\n seq_name))\n ref_embeddings[aug_idx].append(current_embedding)\n ref_masks[aug_idx].append(current_label)", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:93-113" + }, + "5973": { + "file_id": 478, + "content": "This code prepares data for a video object detection model. 
It extracts necessary information from the sample such as obj_num, imgname, ori_height and ori_width. The current image shape is also obtained. A list of data is created including reference embedding, reference mask, previous embedding, previous mask, the current image, image dimensions, and object number. The model is then used to generate all predictions and current embedding. If it's the first frame, if no label exists, an info message is logged. Reference embeddings and masks are appended accordingly.", + "type": "comment" + }, + "5974": { + "file_id": 478, + "content": " prev_embedding[aug_idx] = current_embedding\n prev_mask[aug_idx] = current_label\n else:\n if sample['meta']['flip']: #False\n all_pred = self.flip_tensor(all_pred, 3)\n # In YouTube-VOS, not all the objects appear in the first frame for the first time. Thus, we\n # have to introduce new labels for new objects, if necessary.\n if not sample['meta']['flip'] and not (\n current_label is None) and join_label is None:\n join_label = paddle.cast(current_label,\n dtype='int64')\n all_preds.append(all_pred)\n if current_label is not None:\n ref_embeddings[aug_idx].append(current_embedding)\n prev_embedding[aug_idx] = current_embedding", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:115-129" + }, + "5975": { + "file_id": 478, + "content": "In this code, it checks if the sample has a 'meta' field with 'flip' set to True. If not, it checks if there are new labels for new objects. If necessary, it introduces a new label and adds the current prediction and embedding to their respective lists. The prev_embedding is also updated.", + "type": "comment" + }, + "5976": { + "file_id": 478, + "content": " if frame_idx > 0:\n all_preds = paddle.concat(all_preds, axis=0)\n all_preds = paddle.mean(\n all_preds, axis=0) #average results if augmentation\n pred_label = paddle.argmax(all_preds, axis=0)\n if join_label is not None:\n join_label = paddle.squeeze(paddle.squeeze(join_label,\n axis=0),\n axis=0)\n keep = paddle.cast((join_label == 0), dtype=\"int64\")\n pred_label = pred_label * keep + join_label * (1 - keep)\n pred_label = pred_label\n current_label = paddle.reshape(\n pred_label, shape=[1, 1, ori_height, ori_width])\n flip_pred_label = self.flip_tensor(pred_label, 1)\n flip_current_label = paddle.reshape(\n flip_pred_label, shape=[1, 1, ori_height, ori_width])", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:131-147" + }, + "5977": { + "file_id": 478, + "content": "This code calculates the mean of previous predictions, then finds the maximum value from these averaged results. 
It handles joining labels if present and reshapes the final prediction to match the original image dimensions.", + "type": "comment" + }, + "5978": { + "file_id": 478, + "content": " for aug_idx in range(len(samples)):\n if join_label is not None:\n if samples[aug_idx]['meta']['flip']:\n ref_masks[aug_idx].append(flip_current_label)\n else:\n ref_masks[aug_idx].append(current_label)\n if samples[aug_idx]['meta']['flip']:\n prev_mask[aug_idx] = flip_current_label\n else:\n prev_mask[\n aug_idx] = current_label #update prev_mask\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n seq_total_frame += 1\n obj_num = float(obj_num)\n logger.info('Frame: {}, Obj Num: {}, Time: {}'.format(\n imgname[0], obj_num, one_frametime))\n self.save_mask(\n pred_label,", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:149-168" + }, + "5979": { + "file_id": 478, + "content": "The code iterates over a list of samples, updating reference and previous masks based on whether the sample is flipped or not. It then calculates the time taken for one frame, adds it to total sequence time, increments the total frame count, logs frame information including object number, and saves the predicted label mask.", + "type": "comment" + }, + "5980": { + "file_id": 478, + "content": " os.path.join(self.result_root, seq_name,\n imgname[0].split('.')[0] + '.png'))\n else:\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n logger.info('Ref Frame: {}, Time: {}'.format(\n imgname[0], one_frametime))\n del (ref_embeddings)\n del (ref_masks)\n del (prev_embedding)\n del (prev_mask)\n del (seq_dataset)\n del (seq_dataloader)\n seq_avg_time_per_frame = seq_total_time / seq_total_frame\n self.total_time += seq_total_time\n self.total_frame += seq_total_frame\n total_avg_time_per_frame = self.total_time / self.total_frame\n self.total_sfps += seq_avg_time_per_frame\n avg_sfps = self.total_sfps / (batch_id + 1)\n logger.info(\"Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}\".format(\n seq_name, 1. / seq_avg_time_per_frame,", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:169-191" + }, + "5981": { + "file_id": 478, + "content": "This code calculates the average time per frame for a video sequence and reports it. It also keeps track of total time, total frames, average frames per second (FPS) for each sequence, and overall FPS. It logs this information for debugging or analysis purposes. The code handles both cases where all frames are successfully processed and when some frames fail processing. It then deletes unnecessary variables to free up memory.", + "type": "comment" + }, + "5982": { + "file_id": 478, + "content": " 1. / total_avg_time_per_frame, 1. 
/ avg_sfps))\n def flip_tensor(self, tensor, dim=0):\n inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1),\n dtype=\"int64\")\n tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim)\n return tensor\n def save_mask(self, mask_tensor, path):\n _palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128,\n 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191,\n 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64,\n 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22,\n 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27,\n 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33,\n 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39,\n 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44, 44,", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:192-209" + }, + "5983": { + "file_id": 478, + "content": "This code defines a class with two methods: 'flip_tensor' and 'save_mask'. The 'flip_tensor' method flips the tensor along a specified dimension by inverting the indices. The 'save_mask' method saves a mask tensor to a specified file path using a provided palette.", + "type": "comment" + }, + "5984": { + "file_id": 478, + "content": " 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50,\n 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61,\n 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67,\n 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73,\n 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78,\n 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, 83, 84, 84,\n 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 90,\n 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94, 95, 95, 95,\n 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, 100, 100, 101,\n 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, 105,\n 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109,\n 110, 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114,", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:210-222" + }, + "5985": { + "file_id": 478, + "content": "This code appears to be a list of numbers, each representing a potential value for an unknown variable. It spans from 45 to 114 and includes each number exactly once. 
Without context or additional information, it's difficult to determine the purpose or meaning behind these values.", + "type": "comment" + }, + "5986": { + "file_id": 478, + "content": " 114, 114, 115, 115, 115, 116, 116, 116, 117, 117, 117, 118, 118,\n 118, 119, 119, 119, 120, 120, 120, 121, 121, 121, 122, 122, 122,\n 123, 123, 123, 124, 124, 124, 125, 125, 125, 126, 126, 126, 127,\n 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, 130, 131, 131,\n 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, 135, 135,\n 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144,\n 144, 145, 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148,\n 149, 149, 149, 150, 150, 150, 151, 151, 151, 152, 152, 152, 153,\n 153, 153, 154, 154, 154, 155, 155, 155, 156, 156, 156, 157, 157,\n 157, 158, 158, 158, 159, 159, 159, 160, 160, 160, 161, 161, 161,\n 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, 165, 165, 166,\n 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, 170,\n 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174,", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:223-236" + }, + "5987": { + "file_id": 478, + "content": "This code snippet contains a series of consecutive integers from 114 to 174.", + "type": "comment" + }, + "5988": { + "file_id": 478, + "content": " 175, 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179,\n 179, 179, 180, 180, 180, 181, 181, 181, 182, 182, 182, 183, 183,\n 183, 184, 184, 184, 185, 185, 185, 186, 186, 186, 187, 187, 187,\n 188, 188, 188, 189, 189, 189, 190, 190, 190, 191, 191, 191, 192,\n 192, 192, 193, 193, 193, 194, 194, 194, 195, 195, 195, 196, 196,\n 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, 200, 200,\n 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209,\n 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213,\n 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, 217, 217, 218,\n 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, 221, 222, 222,\n 222, 223, 223, 223, 224, 224, 224, 225, 225, 225, 226, 226, 226,\n 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, 230, 230, 231,\n 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, 235,", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:237-250" + }, + "5989": { + "file_id": 478, + "content": "This code snippet is likely representing a list of numbers, potentially related to frame or timestamp values in the video processing context.", + "type": "comment" + }, + "5990": { + "file_id": 478, + "content": " 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239,\n 240, 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244,\n 244, 244, 245, 245, 245, 246, 246, 246, 247, 247, 247, 248, 248,\n 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252,\n 253, 253, 253, 254, 254, 254, 255, 255, 255\n ]\n mask = mask_tensor.cpu().numpy().astype('uint8')\n mask = Image.fromarray(mask).convert('P')\n mask.putpalette(_palette)\n mask.save(path)\n def zip_folder(self, source_folder, zip_dir):\n f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED)\n pre_len = len(os.path.dirname(source_folder))\n for dirpath, dirnames, filenames in os.walk(source_folder):\n for filename in filenames:\n pathfile = os.path.join(dirpath, filename)\n arcname = pathfile[pre_len:].strip(os.path.sep)\n f.write(pathfile, arcname)\n f.close()\n def accumulate(self):", + "type": 
"code", + "location": "/paddlevideo/metrics/vos_metric.py:251-272" + }, + "5991": { + "file_id": 478, + "content": "Code snippet creates a mask from tensor data, converts it to an image and saves it with specified palette.\nThe 'zip_folder' function compresses the contents of a source folder into a zip file, preserving directory structure.\nThe 'accumulate' function is not defined in this code chunk.", + "type": "comment" + }, + "5992": { + "file_id": 478, + "content": " \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n self.zip_folder(self.result_root, self.zip_dir)\n logger.info('Save result to {}.'.format(self.zip_dir))", + "type": "code", + "location": "/paddlevideo/metrics/vos_metric.py:273-276" + }, + "5993": { + "file_id": 478, + "content": "This code snippet is part of a class that handles metrics calculation. It accumulates metrics once all iterations are complete, then zips the results and saves them to a specified directory (zip_dir) using self.zip_folder method from the parent class. The logger.info statement displays an informational message confirming the save location and name of the zip file in the zip_dir.", + "type": "comment" + }, + "5994": { + "file_id": 479, + "content": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py", + "type": "filepath" + }, + "5995": { + "file_id": 479, + "content": "The AveragePrecisionCalculator calculates average precision in video object detection tasks, using a priority queue and providing methods for non-interpolated average precision. It also includes sorting, recall & precision computation, data shuffling, and prediction normalization.", + "type": "summary" + }, + "5996": { + "file_id": 479, + "content": "# Copyright 2020 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate or keep track of the interpolated average precision.\nIt provides an interface for calculating interpolated average precision for an\nentire list or the top-n ranked items. For the definition of the\n(non-)interpolated average precision:\nhttp://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf\nExample usages:\n1) Use it as a static function call to directly calculate average precision for\na short ranked list in the memory.", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:1-23" + }, + "5997": { + "file_id": 479, + "content": "This code calculates the interpolated average precision for an entire list or top-n ranked items, following the definition provided in the given reference. 
It can be used as a static function call to calculate average precision for short ranked lists in memory.", + "type": "comment" + }, + "5998": { + "file_id": 479, + "content": "```\nimport random\np = np.array([random.random() for _ in xrange(10)])\na = np.array([random.choice([0, 1]) for _ in xrange(10)])\nap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)\n```\n2) Use it as an object for long ranked list that cannot be stored in memory or\nthe case where partial predictions can be observed at a time (Tensorflow\npredictions). In this case, we first call the function accumulate many times\nto process parts of the ranked list. After processing all the parts, we call\npeek_interpolated_ap_at_n.\n```\np1 = np.array([random.random() for _ in xrange(5)])\na1 = np.array([random.choice([0, 1]) for _ in xrange(5)])\np2 = np.array([random.random() for _ in xrange(5)])\na2 = np.array([random.choice([0, 1]) for _ in xrange(5)])\n# interpolated average precision at 10 using 1000 break points\ncalculator = average_precision_calculator.AveragePrecisionCalculator(10)\ncalculator.accumulate(p1, a1)\ncalculator.accumulate(p2, a2)\nap3 = calculator.peek_ap_at_n()\n```\n\"\"\"\nimport heapq\nimport random\nimport numbers", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:25-55" + }, + "5999": { + "file_id": 479, + "content": "The code defines an AveragePrecisionCalculator class that calculates average precision based on a ranked list. The calculator can handle long lists that cannot fit in memory or partial predictions observed over time (such as from Tensorflow). It uses the accumulate method to process parts of the ranked list and peek_interpolated_ap_at_n to calculate the interpolated average precision at a specific recall level.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/6.json b/docs/data/6.json new file mode 100644 index 000000000..24027edfa --- /dev/null +++ b/docs/data/6.json @@ -0,0 +1,541 @@ +{ + "600": { + "file_id": 55, + "content": " seq_ref_frame_embedding = paddle.transpose(seq_ref_frame_embedding,\n [1, 2, 0])\n seq_ref_scribble_label = paddle.transpose(\n scale_ref_scribble_label[n], [1, 2, 0])\n nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_ref_frame_embedding,\n query_embedding=seq_ref_frame_embedding,\n prev_frame_labels=seq_ref_scribble_label,\n gt_ids=gt_id,\n max_distance=self.cfg['model_max_local_distance'])\n #######\n ######################Global map update\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([1000, 1, 1, 1, 1])\n nn_features_n_ = paddle.where(\n nn_features_n <=\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0),", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:814-832" + }, + "601": { + "file_id": 55, + "content": "Updating the global map with the nearest neighbor features for each sequence, only if it's not already in the global_map_tmp_dic.", + "type": "comment" + }, + "602": { + "file_id": 55, + "content": " nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0))\n ###\n ###\n # print('detach 3')\n # nn_features_n_ = nn_features_n_.detach()\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n_.detach()[0]\n ##################Local map update\n if local_map_dics is not None:\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if 
seq_names[n] not in local_map_dist_dic:\n local_map_dist_dic[seq_names[n]] = paddle.zeros([1000, 9])\n if seq_names[n] not in local_map_tmp_dic:\n local_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).unsqueeze(0).tile([1000, 9, 1, 1, 1, 1])\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num\n - 1] = 0\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:833-854" + }, + "603": { + "file_id": 55, + "content": "This code segment appears to be updating the global and local map dictionaries in a video processing model. The global_map_tmp_dic is being updated with nn_features_n_.detach()[0] at the current frame. Additionally, if the sequence name exists in the local_map_dist_dic or local_map_tmp_dic it is being modified accordingly.", + "type": "comment" + }, + "604": { + "file_id": 55, + "content": " ##################\n to_cat_current_frame_embedding = ref_frame_embedding[n].unsqueeze(\n 0).tile((gt_id.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_scribble_mask_to_cat = (\n float_(seq_ref_scribble_label) == float_(gt_id)\n ) # float comparision?\n to_cat_scribble_mask_to_cat = float_(\n to_cat_scribble_mask_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n if not first_inter:\n seq_prev_round_label = scale_prev_round_label[n].transpose(\n [1, 2, 0])\n to_cat_prev_round_to_cat = (\n float_(seq_prev_round_label) == float_(gt_id)\n ) # float comparision?\n to_cat_prev_round_to_cat = float_(\n to_cat_prev_round_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n else:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:856-878" + }, + "605": { + "file_id": 55, + "content": "This code calculates the current frame embedding and nn_feature_n for each object instance in the scene. It then creates a scribble mask for each object and, if not the first iteration, also creates a previous round mask. The code uses transpose and unsqueeze functions for tensor manipulation and float comparisons to create binary masks.", + "type": "comment" + }, + "606": { + "file_id": 55, + "content": " to_cat_prev_round_to_cat = paddle.zeros_like(\n to_cat_scribble_mask_to_cat)\n to_cat_prev_round_to_cat[0] = 1.\n to_cat = paddle.concat(\n (to_cat_current_frame_embedding, to_cat_scribble_mask_to_cat,\n to_cat_prev_round_to_cat), 1)\n pred_ = self.inter_seghead(to_cat)\n pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if local_map_dics is None:\n return dic_tmp\n else:\n return dic_tmp, local_map_dics", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:879-893" + }, + "607": { + "file_id": 55, + "content": "In this code, a concatenation of current frame embedding, scribble mask, and previous round information is passed to inter_seghead for segmentation prediction. The predictions are then transposed before being added to dic_tmp for further processing. If local_map_dics is None, the function returns dic_tmp; otherwise, it returns both dic_tmp and local_map_dics.", + "type": "comment" + }, + "608": { + "file_id": 56, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py", + "type": "filepath" + }, + "609": { + "file_id": 56, + "content": "Copyright notice and license information for the module. 
Imports IntVOS from the same directory and adds it to __all__.", + "type": "summary" + }, + "610": { + "file_id": 56, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .IntVOS import IntVOS\n__all__ = ['IntVOS'\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py:1-17" + }, + "611": { + "file_id": 56, + "content": "Copyright notice and license information for the module. Imports IntVOS from the same directory and adds it to __all__.", + "type": "comment" + }, + "612": { + "file_id": 57, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py", + "type": "filepath" + }, + "613": { + "file_id": 57, + "content": "The \"registry.py\" file in PaddleVideo's EIVideo application defines Registry classes for components of the video processing pipeline, and organizes them into four registries for different functionalities: bbox_coder, estimator, multimodal, and segment.", + "type": "summary" + }, + "614": { + "file_id": 57, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nBACKBONES = Registry('backbone')\nHEADS = Registry('head')\nRECOGNIZERS = Registry('recognizer')\nLOCALIZERS = Registry('localizer')\nPARTITIONERS = Registry('partitioner')\nSEGMENT = Registry('segmentation')\nLOSSES = Registry('loss')\nROI_EXTRACTORS = Registry('roi_extractor')\nDETECTORS = Registry('detectors')\nBBOX_ASSIGNERS = Registry('bbox_assigner')\nBBOX_SAMPLERS = Registry('bbox_sampler')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py:1-27" + }, + "615": { + "file_id": 57, + "content": "The code snippet is from the \"registry.py\" file in PaddleVideo's EIVideo application. It defines several Registry classes, each representing a different component of the video processing pipeline: BACKBONES, HEADS, RECOGNIZERS, LOCALIZERS, PARTITIONERS, SEGMENT, LOSSES, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, and BBOX_SAMPLERS. 
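The registry pattern referred to here can be illustrated with a small stand-in; the real `paddlevideo.utils.Registry` is not shown in this chunk and may expose a different decorator interface, so treat the sketch below as an assumption-laden illustration. The lookup error message mirrors the one raised by the `build` helper elsewhere in this diff.

```python
class Registry:
    """Minimal name -> class table illustrating the registry pattern."""

    def __init__(self, name):
        self.name = name
        self._obj_map = {}

    def register(self, name=None):
        """Usable as a decorator: @HEADS.register() or @HEADS.register(name='X')."""
        def _do_register(cls):
            key = name or cls.__name__
            if key in self._obj_map:
                raise KeyError(f"{key} already registered in {self.name}")
            self._obj_map[key] = cls
            return cls
        return _do_register

    def get(self, key):
        if key not in self._obj_map:
            raise KeyError('{} is not in the {} registry'.format(key, self.name))
        return self._obj_map[key]


HEADS = Registry('head')

@HEADS.register()
class DummyHead:
    """Placeholder component; real heads would subclass a framework base class."""
    pass

cfg = {'name': 'DummyHead'}              # what a YAML config would provide
head_cls = HEADS.get(cfg.pop('name'))    # builder-style lookup, as in build()
print(head_cls().__class__.__name__)     # DummyHead
```

A config file can then name a component by string and the builder resolves the class at run time instead of importing it directly.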
These Registry classes will be used to register and manage different instances of these components in the video processing pipeline.", + "type": "comment" + }, + "616": { + "file_id": 57, + "content": "BBOX_CODERS = Registry('bbox_coder')\nESTIMATORS = Registry('estimator')\nMULTIMODAL = Registry('multimodal')\nSEGMENT = Registry('segment')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py:28-31" + }, + "617": { + "file_id": 57, + "content": "This code defines four registries for different functionalities: bbox_coder, estimator, multimodal, and segment. These registries will be used to organize and manage different types of models or algorithms in the codebase.", + "type": "comment" + }, + "618": { + "file_id": 58, + "content": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py", + "type": "filepath" + }, + "619": { + "file_id": 58, + "content": "This code initializes weights for a PaddlePaddle layer with options for customization and truncated normal distribution, offering proper initialization for deep learning models using normal distribution and PaddlePaddle's Normal initializer.", + "type": "summary" + }, + "620": { + "file_id": 58, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn.initializer as init\nimport numpy as np\nfrom scipy import special\ndef weight_init_(layer,\n func,\n weight_name=None,\n bias_name=None,\n bias_value=0.0,\n **kwargs):\n \"\"\"\n In-place params init function.\n Usage:\n .. code-block:: python\n import paddle\n import numpy as np\n data = np.ones([3, 4], dtype='float32')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:1-36" + }, + "621": { + "file_id": 58, + "content": "The code is a function for initializing the weights of a given layer using different functions. It supports in-place parameter initialization and can be used with PaddlePaddle framework. The function accepts various arguments to customize the weight initialization process.", + "type": "comment" + }, + "622": { + "file_id": 58, + "content": " linear = paddle.nn.Linear(4, 4)\n input = paddle.to_tensor(data)\n print(linear.weight)\n linear(input)\n weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1)\n print(linear.weight)\n \"\"\"\n if hasattr(layer, 'weight') and layer.weight is not None:\n getattr(init, func)(**kwargs)(layer.weight)\n if weight_name is not None:\n # override weight name\n layer.weight.name = weight_name\n if hasattr(layer, 'bias') and layer.bias is not None:\n init.Constant(bias_value)(layer.bias)\n if bias_name is not None:\n # override bias name\n layer.bias.name = bias_name\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n def norm_cdf(x):\n # Computes standard normal cumulative distribution function\n return (1. 
+ math.erf(x / math.sqrt(2.))) / 2.\n if (mean < a - 2 * std) or (mean > b + 2 * std):\n print(\"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. \"\n \"The distribution of values may be incorrect.\")", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:37-66" + }, + "623": { + "file_id": 58, + "content": "Code initializes a linear layer, prints its weight, applies weight initialization with normal distribution, and prints the weight again. The _no_grad_trunc_normal_ function sets tensor values to be truncated normal with specified mean, std, a, and b parameters.", + "type": "comment" + }, + "624": { + "file_id": 58, + "content": " with paddle.no_grad():\n # Values are generated by using a truncated uniform distribution and\n # then using the inverse CDF for the normal distribution.\n # Get upper and lower cdf values\n l = norm_cdf((a - mean) / std)\n u = norm_cdf((b - mean) / std)\n # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1].\n tmp = np.random.uniform(2 * l - 1, 2 * u - 1,\n size=list(tensor.shape)).astype(np.float32)\n # Use inverse cdf transform for normal distribution to get truncated\n # standard normal\n tmp = special.erfinv(tmp)\n # Transform to proper mean, std\n tmp *= (std * math.sqrt(2.0))\n tmp += mean\n # Clamp to ensure it's in the proper range\n tmp = np.clip(tmp, a, b)\n tensor.set_value(paddle.to_tensor(tmp))\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(\n \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:68-98" + }, + "625": { + "file_id": 58, + "content": "This code initializes the weights of a tensor by generating values from a truncated normal distribution, with the lower and upper bounds defined by 'a' and 'b'. It then transforms these values to ensure they are within the desired range, mean, and standard deviation. The resulting tensor is set as the new value for the original tensor. This process ensures proper initialization for deep learning models.", + "type": "comment" + }, + "626": { + "file_id": 58, + "content": " )\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:\n receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'):\n def _calculate_correct_fan(tensor, mode):\n mode = mode.lower()\n valid_modes = ['fan_in', 'fan_out']\n if mode not in valid_modes:\n raise ValueError(\n \"Mode {} not supported, please use one of {}\".format(\n mode, valid_modes))\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n return fan_in if mode == 'fan_in' else fan_out\n def calculate_gain(nonlinearity, param=None):\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:99-130" + }, + "627": { + "file_id": 58, + "content": "The code defines a function for initializing the weight of a neural network. 
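A standalone numpy/scipy rendition of the inverse-CDF trick used by `_no_grad_trunc_normal_` above may help. It reproduces the quoted math (uniform sample between the CDF bounds, `erfinv`, rescale, clip) but returns a plain array rather than writing into a Paddle tensor, so treat it as a sketch, not the framework API.

```python
import math
import numpy as np
from scipy import special

def trunc_normal(shape, mean=0.0, std=1.0, a=-2.0, b=2.0):
    """Sample a truncated normal via the inverse-CDF method shown above."""
    norm_cdf = lambda x: (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
    l, u = norm_cdf((a - mean) / std), norm_cdf((b - mean) / std)
    # uniform on [2l-1, 2u-1], then map through the inverse error function
    tmp = np.random.uniform(2 * l - 1, 2 * u - 1, size=shape).astype(np.float32)
    tmp = special.erfinv(tmp) * std * math.sqrt(2.0) + mean
    return np.clip(tmp, a, b)       # clamp to keep values strictly inside [a, b]

w = trunc_normal((3, 4), mean=0.0, std=0.02)
print(w.shape, float(w.min()), float(w.max()))
```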
It first calculates the fan-in and fan-out based on the shape and dimensions of the tensor. Then, it provides options to initialize weights with truncated normal or Kaiming normal distributions. The trunc_normal_ and kaiming_normal_ functions are also defined to handle different initialization methods with optional parameters for mean, std, a, b, mode, and nonlinearity.", + "type": "comment" + }, + "628": { + "file_id": 58, + "content": " 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n else:\n raise ValueError(\n \"Unsupported nonlinearity {}\".format(nonlinearity))\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():\n paddle.nn.initializer.Normal(0, std)(tensor)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:131-157" + }, + "629": { + "file_id": 58, + "content": "This function initializes the weights of a tensor with a normal distribution. It checks the type of nonlinearity function and returns an appropriate gain factor, then calculates the standard deviation for weight initialization using fan inversion formula. The final step is to initialize the tensor with Normal initializer from PaddlePaddle library.", + "type": "comment" + }, + "630": { + "file_id": 58, + "content": " return tensor", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:158-158" + }, + "631": { + "file_id": 58, + "content": "Initializing weights for a neural network model.\nThis function returns the initialized tensor with random values.", + "type": "comment" + }, + "632": { + "file_id": 59, + "content": "/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py", + "type": "filepath" + }, + "633": { + "file_id": 59, + "content": "This code snippet is importing the \"test_model\" function from the \"test.py\" module in the same directory and adding it to the __all__ list. The text at the beginning of the file contains license information and copyright notice.", + "type": "summary" + }, + "634": { + "file_id": 59, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .test import test_model\n__all__ = [\n 'test_model',\n]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py:1-19" + }, + "635": { + "file_id": 59, + "content": "This code snippet is importing the \"test_model\" function from the \"test.py\" module in the same directory and adding it to the __all__ list. The text at the beginning of the file contains license information and copyright notice.", + "type": "comment" + }, + "636": { + "file_id": 60, + "content": "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py", + "type": "filepath" + }, + "637": { + "file_id": 60, + "content": "This code imports libraries and defines a test_model function for testing a model without gradient calculation. It sets configuration, updates model's test_step function with updated parameters, and performs multi-card testing.", + "type": "summary" + }, + "638": { + "file_id": 60, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom EIVideo.paddlevideo.utils import get_logger, load\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics import build_metric\nfrom ..modeling.builder import build_model\nfrom ..modeling.framework import Manet\nlogger = get_logger(\"paddlevideo\")\n@paddle.no_grad()\ndef test_model(cfg, weights, parallel=True):\n \"\"\"Test model entry\n Args:\n cfg (dict): configuration.\n weights (str): weights path to load.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py:1-31" + }, + "639": { + "file_id": 60, + "content": "This code imports necessary libraries and functions for testing a model. It defines a function called test_model that takes configuration (cfg), weights path (weights), and parallel flag as arguments. The function performs model testing without gradient calculation (paddle.no_grad()) to save computation resources.", + "type": "comment" + }, + "640": { + "file_id": 60, + "content": " parallel (bool): Whether to do multi-cards testing. 
Default: True.\n \"\"\"\n if cfg.MODEL.framework == \"Manet\":\n cfg_helper = {\"knns\": 1, \"is_save_image\": True}\n cfg.update(cfg_helper)\n final = Manet().test_step(**cfg, weights=weights, parallel=False)\n return final", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py:32-39" + }, + "641": { + "file_id": 60, + "content": "This code sets the configuration for multi-card testing and then calls the Manet model's test_step function with updated configuration, weights, and parallel set to False.", + "type": "comment" + }, + "642": { + "file_id": 61, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py", + "type": "filepath" + }, + "643": { + "file_id": 61, + "content": "This code imports various functions and classes from different modules within the PaddleVideo library. It also sets up logger and profiler functionality, provides a build function for creating objects, and handles saving and loading data.", + "type": "summary" + }, + "644": { + "file_id": 61, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import Registry\nfrom .build_utils import build\nfrom .config import *\nfrom .logger import setup_logger, coloring, get_logger\nfrom .record import AverageMeter, build_record, log_batch, log_epoch\nfrom .dist_utils import get_dist_info, main_only\nfrom .save_load import save, load, load_ckpt, mkdir\nfrom .precise_bn import do_preciseBN\nfrom .profiler import add_profiler_step\n__all__ = ['Registry', 'build']", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py:1-24" + }, + "645": { + "file_id": 61, + "content": "This code imports various functions and classes from different modules within the PaddleVideo library. It also sets up logger and profiler functionality, provides a build function for creating objects, and handles saving and loading data.", + "type": "comment" + }, + "646": { + "file_id": 62, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py", + "type": "filepath" + }, + "647": { + "file_id": 62, + "content": "The \"build\" function takes a config dictionary and registry, constructs an object from the configuration, checks for required keys, retrieves class from the registry, and returns the instance.", + "type": "summary" + }, + "648": { + "file_id": 62, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\ndef build(cfg, registry, key='name'):\n \"\"\"Build a module from config dict.\n Args:\n cfg (dict): Config dict. It should at least contain the key.\n registry (XXX): The registry to search the type from.\n key (str): the key.\n Returns:\n obj: The constructed object.\n \"\"\"\n assert isinstance(cfg, dict) and key in cfg\n cfg_copy = cfg.copy()\n obj_type = cfg_copy.pop(key)\n obj_cls = registry.get(obj_type)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:1-31" + }, + "649": { + "file_id": 62, + "content": "This code defines a function named \"build\" that takes a config dictionary and a registry, builds an object from the given configuration dictionary, and returns it. The function asserts that the input is a valid dictionary and checks if the required key exists. It then retrieves the object type from the dictionary and gets the corresponding class from the registry before returning the constructed object.", + "type": "comment" + }, + "650": { + "file_id": 62, + "content": " if obj_cls is None:\n raise KeyError('{} is not in the {} registry'.format(\n obj_type, registry.name))\n return obj_cls(**cfg_copy)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:32-35" + }, + "651": { + "file_id": 62, + "content": "Checks if an object class is provided, raises a KeyError if not found in the registry, and returns an instance of the found class with provided configuration.", + "type": "comment" + }, + "652": { + "file_id": 63, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py", + "type": "filepath" + }, + "653": { + "file_id": 63, + "content": "The code imports libraries, sets up logger, and defines AttrDict class for config files. It includes functions to parse, print, visualize, check, and replace configurations using 'override' function. The code parses a config file, applies overrides, checks if the input option is a string, separates key-value pairs, splits keys by dots, calls `print_config()` and `check_config()`, and returns the updated config object.", + "type": "summary" + }, + "654": { + "file_id": 63, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport yaml\nfrom EIVideo.paddlevideo.utils.logger import coloring, setup_logger\n__all__ = ['get_config']\nlogger = setup_logger(\"./\", name=\"paddlevideo\", level=\"INFO\")\nclass AttrDict(dict):\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef create_attr_dict(yaml_config):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:1-34" + }, + "655": { + "file_id": 63, + "content": "This code block is importing necessary libraries, setting up logger, and defining an AttrDict class for handling config files. It also defines a function create_attr_dict that takes in a yaml configuration file.", + "type": "comment" + }, + "656": { + "file_id": 63, + "content": " from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef print_dict(d, delimiter=0):\n \"\"\"\n Recursively visualize a dict and\n indenting acrrording by the relationship of keys.\n \"\"\"\n placeholder = \"-\" * 60\n for k, v in sorted(d.items()):\n if isinstance(v, dict):\n logger.info(\"{}{} : \".format(delimiter * \" \", coloring(k,\n \"HEADER\")))", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:35-67" + }, + "657": { + "file_id": 63, + "content": "This code defines functions for parsing and printing config files. The `parse_config` function loads a config file into an AttrDict object, handling nested dictionaries and string values. 
The `print_dict` function recursively visualizes a dictionary, indented based on the relationship of keys.", + "type": "comment" + }, + "658": { + "file_id": 63, + "content": " print_dict(v, delimiter + 4)\n elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n logger.info(\"{}{} : \".format(delimiter * \" \",\n coloring(str(k), \"HEADER\")))\n for value in v:\n print_dict(value, delimiter + 4)\n else:\n logger.info(\"{}{} : {}\".format(delimiter * \" \",\n coloring(k, \"HEADER\"),\n coloring(v, \"OKGREEN\")))\n if k.isupper():\n logger.info(placeholder)\ndef print_config(config):\n \"\"\"\n visualize configs\n Arguments:\n config: configs\n \"\"\"\n print_dict(config)\ndef check_config(config):\n \"\"\"\n Check config\n \"\"\"\n pass\ndef override(dl, ks, v):\n \"\"\"\n Recursively replace dict of list\n Args:\n dl(dict or list): dict or list to be replaced\n ks(list): list of keys\n v(str): value to be replaced\n \"\"\"\n def str2num(v):\n try:\n return eval(v)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:68-109" + }, + "659": { + "file_id": 63, + "content": "This code defines functions for visualizing and checking configurations, as well as recursively replacing dictionary or list values. It includes functions to print a configuration, check the configuration (currently empty), and override values in a dictionary or list. The print function formats output with coloring and delimiters, and the override function handles both dictionaries and lists for value replacement.", + "type": "comment" + }, + "660": { + "file_id": 63, + "content": " except Exception:\n return v\n assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n assert len(ks) > 0, ('lenght of keys should larger than 0')\n if isinstance(dl, list):\n k = str2num(ks[0])\n if len(ks) == 1:\n assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n dl[k] = str2num(v)\n else:\n override(dl[k], ks[1:], v)\n else:\n if len(ks) == 1:\n #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n if not ks[0] in dl:\n logger.warning('A new filed ({}) detected!'.format(ks[0], dl))\n dl[ks[0]] = str2num(v)\n else:\n assert ks[0] in dl, (\n '({}) doesn\\'t exist in {}, a new dict field is invalid'.format(\n ks[0], dl))\n override(dl[ks[0]], ks[1:], v)\ndef override_config(config, options=None):\n \"\"\"\n Recursively override the config\n Args:\n config(dict): dict to be replaced", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:110-139" + }, + "661": { + "file_id": 63, + "content": "This function overrides the config, recursively replacing values in the dictionary or list. It requires the configuration and optional options, both as dictionaries. If the key exists, it updates the value, otherwise a warning is issued for new fields, and a new field is created if the key is present in the options. If the key does not exist, an error is thrown.\n\nIn other words, this function allows you to update your configuration by replacing values with new ones. 
It also helps to identify and handle newly-appearing fields.", + "type": "comment" + }, + "662": { + "file_id": 63, + "content": " options(list): list of pairs(key0.key1.idx.key2=value)\n such as: [\n epochs=20',\n 'PIPELINE.train.transform.1.ResizeImage.resize_short=300'\n ]\n Returns:\n config(dict): replaced config\n \"\"\"\n if options is not None:\n for opt in options:\n assert isinstance(opt,\n str), (\"option({}) should be a str\".format(opt))\n assert \"=\" in opt, (\n \"option({}) should contain a =\"\n \"to distinguish between key and value\".format(opt))\n pair = opt.split('=')\n assert len(pair) == 2, (\"there can be only a = in the option\")\n key, value = pair\n keys = key.split('.')\n override(config, keys, value)\n return config\ndef get_config(fname, overrides=None, show=True):\n \"\"\"\n Read config from file\n \"\"\"\n assert os.path.exists(fname), ('config file({}) is not exist'.format(fname))\n config = parse_config(fname)\n override_config(config, overrides)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:140-170" + }, + "663": { + "file_id": 63, + "content": "The code parses a config file and applies overrides. It checks if the input option is a string and if it contains an equal sign to separate key-value pairs. It then splits the option into key and value, further splitting keys by dots. The function overrides the configuration file with these options, returning the updated configuration.", + "type": "comment" + }, + "664": { + "file_id": 63, + "content": " if show:\n print_config(config)\n check_config(config)\n return config", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:171-174" + }, + "665": { + "file_id": 63, + "content": "This code checks if `show` is True, and if so, it calls the function `print_config(config)`. It then always calls another function `check_config(config)`, before finally returning the `config` object. This implies that `print_config()` prints out configuration details, while `check_config()` checks for correctness or validity of the configuration. The config is returned regardless to ensure it's available to the rest of the codebase.", + "type": "comment" + }, + "666": { + "file_id": 64, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py", + "type": "filepath" + }, + "667": { + "file_id": 64, + "content": "This code is from PaddleVideo's EIVideo module and includes util functions for distributed computing. It defines a function get_dist_info() to retrieve the current rank and world size, and main_only() is a decorator that only runs the wrapped function if the rank is 0 (used in distributed environments).", + "type": "summary" + }, + "668": { + "file_id": 64, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport functools\nimport paddle\nimport paddle.distributed as dist\ndef get_dist_info():\n world_size = dist.get_world_size()\n rank = dist.get_rank()\n return rank, world_size\ndef main_only(func):\n @functools.wraps(func)\n def wrapper(*args, **kwargs):\n rank, _ = get_dist_info()\n if rank == 0:\n return func(*args, **kwargs)\n return wrapper", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py:1-30" + }, + "669": { + "file_id": 64, + "content": "This code is from PaddleVideo's EIVideo module and includes util functions for distributed computing. It defines a function get_dist_info() to retrieve the current rank and world size, and main_only() is a decorator that only runs the wrapped function if the rank is 0 (used in distributed environments).", + "type": "comment" + }, + "670": { + "file_id": 65, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py", + "type": "filepath" + }, + "671": { + "file_id": 65, + "content": "This code sets up a colorful logging function for PaddleVideo, initializes logger with verbosity levels, and ensures non-propagation of logs. It configures logger for Python's logging module using different formats and handlers based on local rank.", + "type": "summary" + }, + "672": { + "file_id": 65, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport logging\nimport os\nimport sys\nimport datetime\nfrom paddle.distributed import ParallelEnv\nColor = {\n 'RED': '\\033[31m',\n 'HEADER': '\\033[35m', # deep purple\n 'PURPLE': '\\033[95m', # purple\n 'OKBLUE': '\\033[94m',\n 'OKGREEN': '\\033[92m',\n 'WARNING': '\\033[93m',\n 'FAIL': '\\033[91m',\n 'ENDC': '\\033[0m'\n}\ndef coloring(message, color=\"OKGREEN\"):\n assert color in Color.keys()\n if os.environ.get('COLORING', True):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:1-38" + }, + "673": { + "file_id": 65, + "content": "This code is from the \"logger.py\" file in the PaddleVideo project, and it sets up a coloring function for logging messages with optional colors using ANSI escape sequences. The function takes a message and an optional color parameter, which should be one of the defined colors in the Color dictionary. It asserts that the provided color is indeed a key in the dictionary, and then returns the message with the specified color applied. 
The function also checks the environment variable \"COLORING\" to determine whether coloring should be enabled or not (default is True).", + "type": "comment" + }, + "674": { + "file_id": 65, + "content": " return Color[color] + str(message) + Color[\"ENDC\"]\n else:\n return message\nlogger_initialized = []\ndef setup_logger(output=None, name=\"paddlevideo\", level=\"INFO\"):\n \"\"\"\n Initialize the paddlevideo logger and set its verbosity level to \"INFO\".\n Args:\n output (str): a file name or a directory to save log. If None, will not save log file.\n If ends with \".txt\" or \".log\", assumed to be a file name.\n Otherwise, logs will be saved to `output/log.txt`.\n name (str): the root module name of this logger\n Returns:\n logging.Logger: a logger\n \"\"\"\n def time_zone(sec, fmt):\n real_time = datetime.datetime.now()\n return real_time.timetuple()\n logging.Formatter.converter = time_zone\n logger = logging.getLogger(name)\n if level == \"INFO\":\n logger.setLevel(logging.INFO)\n elif level==\"DEBUG\":\n logger.setLevel(logging.DEBUG)\n logger.propagate = False\n if level == \"DEBUG\":\n plain_formatter = logging.Formatter(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:39-71" + }, + "675": { + "file_id": 65, + "content": "This code initializes the PaddleVideo logger and sets its verbosity level to \"INFO\" or \"DEBUG\", depending on the input argument. It also defines a custom time zone converter for logging, and ensures that the logger does not propagate logs to its parent loggers.", + "type": "comment" + }, + "676": { + "file_id": 65, + "content": " \"[%(asctime)s] %(name)s %(levelname)s: %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n else:\n plain_formatter = logging.Formatter(\n \"[%(asctime)s] %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n # stdout logging: master only\n local_rank = ParallelEnv().local_rank\n if local_rank == 0:\n ch = logging.StreamHandler(stream=sys.stdout)\n ch.setLevel(logging.DEBUG)\n formatter = plain_formatter\n ch.setFormatter(formatter)\n logger.addHandler(ch)\n # file logging: all workers\n if output is not None:\n if output.endswith(\".txt\") or output.endswith(\".log\"):\n filename = output\n else:\n filename = os.path.join(output, \".log.txt\")\n if local_rank > 0:\n filename = filename + \".rank{}\".format(local_rank)\n # PathManager.mkdirs(os.path.dirname(filename))\n os.makedirs(os.path.dirname(filename), exist_ok=True)\n # fh = logging.StreamHandler(_cached_log_stream(filename)\n fh = logging.FileHandler(filename, mode='a')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:72-100" + }, + "677": { + "file_id": 65, + "content": "This code configures a logger for Python's logging module. It uses different formats and handlers (stdout, file) based on the local rank of the process, creating separate log files for each worker ranked greater than 0. If the output is a .txt or .log file, it will be used as-is; otherwise, a .log.txt file with optional rank appended will be created. 
The code also ensures that missing directories for the log file are created beforehand.", + "type": "comment" + }, + "678": { + "file_id": 65, + "content": " fh.setLevel(logging.DEBUG)\n fh.setFormatter(plain_formatter)\n logger.addHandler(fh)\n logger_initialized.append(name)\n return logger\ndef get_logger(name, output=None):\n logger = logging.getLogger(name)\n if name in logger_initialized:\n return logger\n return setup_logger(name=name, output=name)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:101-113" + }, + "679": { + "file_id": 65, + "content": "This code initializes a logger object and sets its level to DEBUG, adds a file handler with a plain formatter, and appends the logger's name to an initialized list. The function returns the logger if it has been previously initialized for the given name; otherwise, it sets up the logger using the provided name and optional output.", + "type": "comment" + }, + "680": { + "file_id": 66, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py", + "type": "filepath" + }, + "681": { + "file_id": 66, + "content": "This PyTorch code uses OpenCV for image processing, offers conversion functions and error handling with PaddleVideo. It initializes tensors using Xavier/Glorot or Kaiming normal distribution, favoring Torch.nn.init methods over older ones.", + "type": "summary" + }, + "682": { + "file_id": 66, + "content": "from __future__ import absolute_import\nimport json\nimport math\nimport os\nimport pickle\nimport warnings\nimport numpy\nimport numpy as np\nfrom numpy import inf\nfrom paddle import Tensor, concat, reshape, nn\nimport paddle\nfrom typing import Union, Iterable\n# from reprod_log.compare import compute_diff\n# from reprod_log.utils import check_print_diff, np2torch, np2paddle, torch2np, paddle2np\n_tensor_or_tensors = Union[paddle.Tensor, Iterable[paddle.Tensor]]\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0,\n 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1-28" + }, + "683": { + "file_id": 66, + "content": "Imports various modules and defines a type hint for paddle tensor or iterable of tensors.", + "type": "comment" + }, + "684": { + "file_id": 66, + "content": " 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75,\n 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81,\n 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94,\n 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 
104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:29-41" + }, + "685": { + "file_id": 66, + "content": "This code consists of a long sequence of integers with no apparent functionality or structure. It may represent an array, list, or range of values used in various parts of the codebase, but without further context, it is impossible to determine the specific purpose or usage for these numbers.", + "type": "comment" + }, + "686": { + "file_id": 66, + "content": " 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145,\n 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150,\n 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160,\n 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:42-54" + }, + "687": { + "file_id": 66, + "content": "This code likely contains a list of integer values, potentially representing coordinates or other numerical data.", + "type": "comment" + }, + "688": { + "file_id": 66, + "content": " 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210,\n 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215,\n 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225,\n 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:55-67" + }, + "689": { + "file_id": 66, + "content": "This code appears to be a list of integers. 
It is difficult to provide a brief comment as there seems to be no clear context or purpose for these numbers in this specific location.", + "type": "comment" + }, + "690": { + "file_id": 66, + "content": " 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\n# paddle.set_device('gpu') if paddle.is_compiled_with_cuda() else paddle.set_device('cpu')\nimport paddle\nimport PIL\nimport numbers\nimport numpy as np\nfrom PIL import Image\nfrom paddle.vision.transforms import BaseTransform\nfrom paddle.vision.transforms import functional as F\nimport numpy as np\nfrom scipy.ndimage import interpolation, binary_dilation\ntry:\n from skimage import morphology, transform\nexcept ImportError as e:\n print(\n f\"{e}, [scikit-image] package and it's dependencies is required for EIVideo.\"\n )\nimport paddle\nimport cv2\nimport random\n####\ndef mask_damager(labels=None, p_black=0.2):\n scales = (0.8, 1.0, 1.2)\n kernel_size = random.randint(10, 15)\n kernel = np.ones((kernel_size, kernel_size), np.uint8)\n if random.random() < p_black:\n final_label = paddle.zeros_like(labels)\n final_label = final_label.squeeze().numpy()\n else:\n prot = random.randint(5, 15)\n nrot = random.randint(-15, -5)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:68-105" + }, + "691": { + "file_id": 66, + "content": "This code defines a function called \"mask\\_damager\" which takes in labels and a probability of blacking out as input. It randomly scales the image using a scale range of (0.8, 1.0, 1.2), generates a random kernel size between 10 to 15, and applies random rotation to the image. If a random number is less than the given probability, it sets the final label as black; otherwise, it performs random rotations and scaling on the input labels.", + "type": "comment" + }, + "692": { + "file_id": 66, + "content": " rots = [prot, nrot, 0]\n rot = rots[random.randint(0, 2)]\n sc = scales[random.randint(0, 2)]\n _, _, h, w = labels.shape\n tmp = labels.squeeze()\n tmp = tmp.unsqueeze(-1)\n tmp = tmp.numpy().astype(np.uint8)\n morph_p = random.random()\n if morph_p < 0.5:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_OPEN, kernel)\n else:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_CLOSE, kernel)\n tmp = tmp.astype(np.uint8)\n center = (w / 2, h / 2)\n M = cv2.getRotationMatrix2D(center, rot, sc)\n final_label = cv2.warpAffine(tmp, M, (w, h), cv2.INTER_NEAREST)\n return final_label\ncolor_map = [\n [0, 0, 0],\n [255, 127, 0],\n [30, 144, 255],\n [186, 85, 211],\n [255, 105, 180],\n [192, 255, 62],\n [255, 105, 180],\n [50, 255, 255],\n]\ncolor_map_np = np.array(color_map)\ndef overlay_davis(image, mask, alpha=0.5):\n \"\"\" Overlay segmentation on top of RGB image. from davis official\"\"\"\n im_overlay = image.copy()\n mask = mask.astype('uint8')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:106-146" + }, + "693": { + "file_id": 66, + "content": "The code performs morphological operations on an image using OpenCV and then applies a rotation transformation to overlay the segmentation mask onto the RGB image. 
It uses different colors for different classes in the segmentation mask.", + "type": "comment" + }, + "694": { + "file_id": 66, + "content": " colored_mask = color_map_np[mask]\n foreground = image * alpha + (1 - alpha) * colored_mask\n binary_mask = (mask > 0)\n # Compose image\n im_overlay[binary_mask] = foreground[binary_mask]\n countours = binary_dilation(binary_mask) ^ binary_mask\n im_overlay[countours, :] = 0\n return im_overlay.astype(image.dtype)\n# TODO\ndef submit_masks(masks, images, inter_file_path):\n overlays = []\n save_result_path = os.path.join(inter_file_path, 'result')\n os.makedirs(save_result_path, exist_ok=True)\n for imgname, (mask, image) in enumerate(zip(masks, images)):\n overlay = overlay_davis(image, mask)\n overlays.append(overlay.tolist())\n overlay = Image.fromarray(overlay)\n imgname = str(imgname)\n while len(imgname) < 5:\n imgname = '0' + imgname\n overlay.save(os.path.join(save_result_path, imgname + '.png'))\n result = {'overlays': overlays}\n # result = {'masks': masks.tolist()}\n with open(os.path.join(save_result_path, 'masks.json'), 'w') as f:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:147-172" + }, + "695": { + "file_id": 66, + "content": "This function takes a list of masks and images, and for each pair, it applies an overlay function to generate an overlay image. It saves these overlay images in the specified directory with filenames corresponding to their original image names. Additionally, it stores the list of overlays as JSON in a file named \"masks.json\". The comments suggest that there might be another function to store masks as a list instead of overlays.", + "type": "comment" + }, + "696": { + "file_id": 66, + "content": " json.dump(result, f)\ndef load_video(path, min_side=None):\n frame_list = []\n cap = cv2.VideoCapture(path)\n while (cap.isOpened()):\n _, frame = cap.read()\n if frame is None:\n break\n frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n if min_side:\n h, w = frame.shape[:2]\n new_w = (w * min_side // min(w, h))\n new_h = (h * min_side // min(w, h))\n frame = cv2.resize(frame, (new_w, new_h),\n interpolation=cv2.INTER_CUBIC)\n # .transpose([2, 0, 1])\n frame_list.append(frame)\n frames = np.stack(frame_list, axis=0)\n return frames\ndef get_scribbles():\n for i in range(8):\n with open(f'/home/lc/paddlevideo/data/bike-packing/lable/{i + 1}.json'\n ) as f:\n scribbles = json.load(f)\n first_scribble = not i\n yield scribbles, first_scribble\ndef get_images(sequence='bike-packing'):\n img_path = os.path.join('/home/lc/paddlevideo/data', sequence.strip(),", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:173-206" + }, + "697": { + "file_id": 66, + "content": "load_video function reads frames from a video file and optionally resizes the frame to match a minimum side length, appending each frame to a list. The function then stacks the frames in the list into a single numpy array and returns it. get_scribbles generates scribble data for 8 labels by iterating through corresponding JSON files and yields the data along with a flag indicating if it is the first label or not. 
get_images retrieves video images from a specified sequence directory.", + "type": "comment" + }, + "698": { + "file_id": 66, + "content": " 'frame')\n img_files = os.listdir(img_path)\n img_files.sort()\n files = []\n for img in img_files:\n img_file = np.array(Image.open(os.path.join(img_path, img)))\n files.append(img_file)\n return np.array(files)\ndef rough_ROI(ref_scribble_labels):\n #### b*1*h*w\n dist = 20\n b, _, h, w = ref_scribble_labels.shape\n filter_ = paddle.zeros_like(ref_scribble_labels)\n to_fill = paddle.zeros_like(ref_scribble_labels)\n for i in range(b):\n no_background = (ref_scribble_labels[i] != -1)\n no_background = no_background.squeeze(0)\n no_b = no_background.nonzero()\n (h_min, w_min) = paddle.min(no_b, 0)\n (h_max, w_max) = paddle.max(no_b, 0)\n filter_[i, 0,\n max(h_min - dist, 0):min(h_max + dist, h - 1),\n max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1\n final_scribble_labels = paddle.where(byte_(filter_), ref_scribble_labels,\n to_fill)\n return final_scribble_labels", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:207-236" + }, + "699": { + "file_id": 66, + "content": "The code defines two functions: \"load_image\" and \"rough_ROI\". The \"load_image\" function loads an image from a specified directory, sorts the images by file name, reads each image using PIL library, and returns the images as a numpy array. The \"rough_ROI\" function receives scribble labels as input, determines the bounding box around each scribble in the batch, applies this bounding box to another mask, and returns the final scribble labels after filtering.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/60.json b/docs/data/60.json new file mode 100644 index 000000000..883ca2800 --- /dev/null +++ b/docs/data/60.json @@ -0,0 +1,544 @@ +{ + "6000": { + "file_id": 479, + "content": "import numpy\nclass AveragePrecisionCalculator(object):\n \"\"\"Calculate the average precision and average precision at n.\"\"\"\n def __init__(self, top_n=None):\n \"\"\"Construct an AveragePrecisionCalculator to calculate average precision.\n This class is used to calculate the average precision for a single label.\n Args:\n top_n: A positive Integer specifying the average precision at n, or\n None to use all provided data points.\n Raises:\n ValueError: An error occurred when the top_n is not a positive integer.\n \"\"\"\n if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None):\n raise ValueError(\"top_n must be a positive integer or None.\")\n self._top_n = top_n # average precision at n\n self._total_positives = 0 # total number of positives have seen\n self._heap = [] # max heap of (prediction, actual)\n @property\n def heap_size(self):\n \"\"\"Gets the heap size maintained in the class.\"\"\"\n return len(self._heap)\n @property", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:57-86" + }, + "6001": { + "file_id": 479, + "content": "AveragePrecisionCalculator is a class used for calculating the average precision and average precision at n. It constructs an object to calculate average precision for single label, with optional top_n parameter for average precision at n. If top_n is not positive integer or None, a ValueError is raised. The class maintains heap of (prediction, actual) pairs and total positives seen. 
Heap size can be queried using the heap_size property.", + "type": "comment" + }, + "6002": { + "file_id": 479, + "content": " def num_accumulated_positives(self):\n \"\"\"Gets the number of positive samples that have been accumulated.\"\"\"\n return self._total_positives\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n After the function call, we may call peek_ap_at_n to actually calculate\n the average precision.\n Note predictions and actuals must have the same shape.\n Args:\n predictions: a list storing the prediction scores.\n actuals: a list storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives = If the 'predictions' and 'actuals' inputs aren't complete,\n then it's possible some true positives were missed in them. In that case,\n you can provide 'num_positives' in order to accurately track recall.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match.", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:87-108" + }, + "6003": { + "file_id": 479, + "content": "This code defines a class that calculates the average precision in video object detection tasks. It provides methods to accumulate positive samples and return the total number of positives. The accumulate method takes prediction scores, ground truth labels, and optional num_positives parameter for accurate tracking when inputs are incomplete.", + "type": "comment" + }, + "6004": { + "file_id": 479, + "content": " \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if not num_positives is None:\n if not isinstance(num_positives,\n numbers.Number) or num_positives < 0:\n raise ValueError(\n \"'num_positives' was provided but it wan't a nonzero number.\"\n )\n if not num_positives is None:\n self._total_positives += num_positives\n else:\n self._total_positives += numpy.size(numpy.where(actuals > 0))\n topk = self._top_n\n heap = self._heap\n for i in range(numpy.size(predictions)):\n if topk is None or len(heap) < topk:\n heapq.heappush(heap, (predictions[i], actuals[i]))\n else:\n if predictions[i] > heap[0][0]: # heap[0] is the smallest\n heapq.heappop(heap)\n heapq.heappush(heap, (predictions[i], actuals[i]))", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:109-134" + }, + "6005": { + "file_id": 479, + "content": "This code checks if the length of predictions and actuals match. It also ensures that num_positives is a nonzero number, then adds positives to total_positives. 
The code uses heapq to push and pop elements in a priority queue based on top_n, ensuring correctness and efficiency.", + "type": "comment" + }, + "6006": { + "file_id": 479, + "content": " def clear(self):\n \"\"\"Clear the accumulated predictions.\"\"\"\n self._heap = []\n self._total_positives = 0\n def peek_ap_at_n(self):\n \"\"\"Peek the non-interpolated average precision at n.\n Returns:\n The non-interpolated average precision at n (default 0).\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n \"\"\"\n if self.heap_size <= 0:\n return 0\n predlists = numpy.array(list(zip(*self._heap)))\n ap = self.ap_at_n(predlists[0],\n predlists[1],\n n=self._top_n,\n total_num_positives=self._total_positives)\n return ap\n @staticmethod\n def ap(predictions, actuals):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:136-166" + }, + "6007": { + "file_id": 479, + "content": "This code defines a class that calculates non-interpolated average precision. It has methods to clear accumulated predictions, peek the non-interpolated average precision at n, and calculate non-interpolated average precision from prediction and actual scores. The class uses numpy arrays and requires positive labels to be greater than 0 and negative labels as 0.", + "type": "comment" + }, + "6008": { + "file_id": 479, + "content": " Returns:\n The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match.\n \"\"\"\n return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None)\n @staticmethod\n def ap_at_n(predictions, actuals, n=20, total_num_positives=None):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n n: the top n items to be considered in ap@n.\n total_num_positives : (optionally) you can specify the number of total\n positive\n in the list. If specified, it will be used in calculation.\n Returns:", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:168-192" + }, + "6009": { + "file_id": 479, + "content": "This code calculates the non-interpolated average precision at a specified number 'n' from given predictions and actuals. 
It raises a ValueError if the input format is not a numpy 1D array, or the shape of predictions and actuals does not match.", + "type": "comment" + }, + "6010": { + "file_id": 479, + "content": " The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when\n 1) the format of the input is not the numpy 1-D array;\n 2) the shape of predictions and actuals does not match;\n 3) the input n is not a positive integer.\n \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if n is not None:\n if not isinstance(n, int) or n <= 0:\n raise ValueError(\"n must be 'None' or a positive integer.\"\n \" It was '%s'.\" % n)\n ap = 0.0\n predictions = numpy.array(predictions)\n actuals = numpy.array(actuals)\n # add a shuffler to avoid overestimating the ap\n predictions, actuals = AveragePrecisionCalculator._shuffle(\n predictions, actuals)\n sortidx = sorted(range(len(predictions)),", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:193-220" + }, + "6011": { + "file_id": 479, + "content": "This function calculates the non-interpolated average precision at a given rank 'n'. It checks if the lengths of predictions and actuals match, ensures 'n' is an integer greater than zero, and shuffles the lists to avoid overestimation. If any errors occur, it raises a ValueError.", + "type": "comment" + }, + "6012": { + "file_id": 479, + "content": " key=lambda k: predictions[k],\n reverse=True)\n if total_num_positives is None:\n numpos = numpy.size(numpy.where(actuals > 0))\n else:\n numpos = total_num_positives\n if numpos == 0:\n return 0\n if n is not None:\n numpos = min(numpos, n)\n delta_recall = 1.0 / numpos\n poscount = 0.0\n # calculate the ap\n r = len(sortidx)\n if n is not None:\n r = min(r, n)\n for i in range(r):\n if actuals[sortidx[i]] > 0:\n poscount += 1\n ap += poscount / (i + 1) * delta_recall\n return ap\n @staticmethod\n def _shuffle(predictions, actuals):\n random.seed(0)\n suffidx = random.sample(range(len(predictions)), len(predictions))\n predictions = predictions[suffidx]\n actuals = actuals[suffidx]\n return predictions, actuals\n @staticmethod\n def _zero_one_normalize(predictions, epsilon=1e-7):", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:221-256" + }, + "6013": { + "file_id": 479, + "content": "This function calculates the average precision (AP) of a set of predictions and actuals. It first sorts the predictions by value, then calculates recall and precision to compute AP. If a total number of positives is provided, it uses that instead of counting non-zero actuals. The function also includes helper methods for shuffling the data and normalizing predictions with an epsilon value.", + "type": "comment" + }, + "6014": { + "file_id": 479, + "content": " \"\"\"Normalize the predictions to the range between 0.0 and 1.0.\n For some predictions like SVM predictions, we need to normalize them before\n calculate the interpolated average precision. 
The normalization will not\n change the rank in the original list and thus won't change the average\n precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n epsilon: a small constant to avoid denominator being zero.\n Returns:\n The normalized prediction.\n \"\"\"\n denominator = numpy.max(predictions) - numpy.min(predictions)\n ret = (predictions - numpy.min(predictions)) / numpy.max(\n denominator, epsilon)\n return ret", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:257-274" + }, + "6015": { + "file_id": 479, + "content": "This function normalizes the predictions to a range of 0.0-1.0, ensuring that the rank in the original list remains unchanged and does not affect the average precision calculation. It prevents division by zero using a small epsilon value.", + "type": "comment" + }, + "6016": { + "file_id": 480, + "content": "/paddlevideo/metrics/youtube8m/eval_util.py", + "type": "filepath" + }, + "6017": { + "file_id": 480, + "content": "This code imports libraries, defines a merging function and uses Paddlevideo classes for evaluation. It calculates Hit@1, measures video-level precision, averages results to assess model performance. The function computes top-k triplet predictions, raises ValueError if k is not a positive integer, and initializes HitOneMetric class for evaluation metrics in Youtube8m's PaddleVideo module.", + "type": "summary" + }, + "6018": { + "file_id": 480, + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Provides functions to help with evaluating models.\"\"\"\nimport numpy as np\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom ..base import BaseMetric\nfrom ..registry import METRIC\nfrom . import average_precision_calculator as ap_calculator\nfrom . import mean_average_precision_calculator as map_calculator\nlogger = get_logger(\"paddlevideo\")\ndef flatten(l):\n \"\"\" Merges a list of lists into a single list. \"\"\"\n return [item for sublist in l for item in sublist]", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:1-29" + }, + "6019": { + "file_id": 480, + "content": "The code provides functions for evaluating models. 
It imports necessary libraries, defines a function to merge multiple lists into one, and includes classes for Average Precision Calculator and Mean Average Precision Calculator from the paddlevideo module.", + "type": "comment" + }, + "6020": { + "file_id": 480, + "content": "def calculate_hit_at_one(predictions, actuals):\n \"\"\"\n Hit@k: indicates the fraction of test samples that contain at least\n one of the ground truth labels in the top k predictions,\n i.e topk.\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average hit at one across the entire batch.\n \"\"\"\n top_prediction = np.argmax(predictions, 1)\n hits = actuals[np.arange(actuals.shape[0]), top_prediction]\n return np.mean(hits)\ndef calculate_precision_at_equal_recall_rate(predictions, actuals):\n \"\"\"\n PERR: measures the video-level annotation precision when we retrieve the same number\n of entities per video as there are in the ground-truth.\n More details please refer to: https://arxiv.org/abs/1609.08675\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:32-60" + }, + "6021": { + "file_id": 480, + "content": "Calculates Hit@1, the fraction of samples with at least one ground truth label in top predictions.\nMeasures video-level annotation precision when retrieving the same number of entities as ground truth.", + "type": "comment" + }, + "6022": { + "file_id": 480, + "content": " actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average precision at equal recall rate across the entire batch.\n \"\"\"\n aggregated_precision = 0.0\n num_videos = actuals.shape[0]\n for row in np.arange(num_videos):\n num_labels = int(np.sum(actuals[row]))\n top_indices = np.argpartition(predictions[row],\n -num_labels)[-num_labels:]\n item_precision = 0.0\n for label_index in top_indices:\n if predictions[row][label_index] > 0:\n item_precision += actuals[row][label_index]\n item_precision /= top_indices.size\n aggregated_precision += item_precision\n aggregated_precision /= num_videos\n return aggregated_precision\ndef calculate_gap(predictions, actuals, top_k=20):\n \"\"\"\n GAP: the global average precision.\n Only the top_k predictions are taken for each of the videos.\n Args:\n predictions: Matrix containing the outputs of the model.", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:61-90" + }, + "6023": { + "file_id": 480, + "content": "The code calculates the average precision at equal recall rate and global average precision for a batch of videos. 
It iterates over each video, determines the number of labels, finds the top indices based on predictions, calculates item-wise precision, aggregates these precisions for all videos, and returns the averaged precision as well as the gap score.", + "type": "comment" + }, + "6024": { + "file_id": 480, + "content": " Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n top_k: How many predictions to use per video.\n Returns:\n float: The global average precision.\n \"\"\"\n gap_calculator = ap_calculator.AveragePrecisionCalculator()\n sparse_predictions, sparse_labels, num_positives = top_k_by_class(\n predictions, actuals, top_k)\n gap_calculator.accumulate(flatten(sparse_predictions),\n flatten(sparse_labels), sum(num_positives))\n return gap_calculator.peek_ap_at_n()\ndef top_k_by_class(predictions, labels, k=20):\n \"\"\"Extracts the top k predictions for each video, sorted by class.\n Args:\n predictions: A numpy matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n k: the top k non-zero entries to preserve in each prediction.\n Returns:\n A tuple (predictions,labels, true_positives). 'predictions' and 'labels'", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:91-116" + }, + "6025": { + "file_id": 480, + "content": "This code calculates the global average precision by first extracting the top k predictions for each video, sorted by class. It then accumulates these results using an AveragePrecisionCalculator and returns the global average precision.", + "type": "comment" + }, + "6026": { + "file_id": 480, + "content": " are lists of lists of floats. 'true_positives' is a list of scalars. The\n length of the lists are equal to the number of classes. The entries in the\n predictions variable are probability predictions, and\n the corresponding entries in the labels variable are the ground truth for\n those predictions. The entries in 'true_positives' are the number of true\n positives for each class in the ground truth.\n Raises:\n ValueError: An error occurred when the k is not a positive integer.\n \"\"\"\n if k <= 0:\n raise ValueError(\"k must be a positive integer.\")\n k = min(k, predictions.shape[1])\n num_classes = predictions.shape[1]\n prediction_triplets = []\n for video_index in range(predictions.shape[0]):\n prediction_triplets.extend(\n top_k_triplets(predictions[video_index], labels[video_index], k))\n out_predictions = [[] for v in range(num_classes)]\n out_labels = [[] for v in range(num_classes)]\n for triplet in prediction_triplets:", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:117-137" + }, + "6027": { + "file_id": 480, + "content": "This function takes in a list of lists containing probability predictions and ground truth labels for multiple classes, and calculates top-k triplet predictions based on the given k value. It raises a ValueError if k is not a positive integer. The function then creates empty lists to store output predictions and labels for each class.", + "type": "comment" + }, + "6028": { + "file_id": 480, + "content": " out_predictions[triplet[0]].append(triplet[1])\n out_labels[triplet[0]].append(triplet[2])\n out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)]\n return out_predictions, out_labels, out_true_positives\ndef top_k_triplets(predictions, labels, k=20):\n \"\"\"Get the top_k for a 1-d numpy array. 
Returns a sparse list of tuples in\n (prediction, class) format\"\"\"\n m = len(predictions)\n k = min(k, m)\n indices = np.argpartition(predictions, -k)[-k:]\n return [(index, predictions[index], labels[index]) for index in indices]\n@METRIC.register\nclass HitOneMetric(BaseMetric):\n \"\"\"A class to store the evaluation metrics.\"\"\"\n def __init__(self,\n num_class,\n top_k,\n data_size,\n batch_size,\n log_interval=20):\n \"\"\"Construct an HitOneMetric object to store the evaluation metrics.\"\"\"\n self.hit_at_one = []\n self.perr = []\n self.gap = []\n super().__init__(data_size, batch_size, log_interval)", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:138-167" + }, + "6029": { + "file_id": 480, + "content": "This code calculates top-k predictions and labels from given predictions and labels arrays, and then initializes HitOneMetric class to store the evaluation metrics.", + "type": "comment" + }, + "6030": { + "file_id": 480, + "content": " def accumulate(self):\n logger.info(\n '[TEST] finished, hit_at_one = {:.5f}, perr = {:.5f}, gap = {:.5f}'.\n format(np.mean(np.array(self.hit_at_one)),\n np.mean(np.array(self.perr)), np.mean(np.array(self.gap))))\n def clear(self):\n \"\"\"Clear the evaluation metrics and reset the HitOneMetric object.\"\"\"\n self.hit_at_one = []\n self.perr = []\n self.gap = []\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n hit_at_one = paddle.to_tensor(outputs['hit_at_one'])\n perr = paddle.to_tensor(outputs['perr'])\n gap = paddle.to_tensor(outputs['gap'])\n # NOTE(shipping): deal with multi cards validate\n if self.world_size > 1:\n hit_at_one = paddle.distributed.all_reduce(\n hit_at_one,\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n perr = paddle.distributed.all_reduce(\n perr, op=paddle.distributed.ReduceOp.SUM) / self.world_size", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:169-193" + }, + "6031": { + "file_id": 480, + "content": "The code defines a HitOneMetric class for evaluating metrics in a video prediction task. The accumulate method calculates mean values of hit_at_one, perr, and gap, and logs the results as information. The clear method resets all metrics to an empty list. The update method updates the metric with each iteration, taking into account multi-card validation using PaddlePaddle's distributed functions.", + "type": "comment" + }, + "6032": { + "file_id": 480, + "content": " gap = paddle.distributed.all_reduce(\n gap, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.hit_at_one.append(hit_at_one.numpy())\n self.perr.append(perr.numpy())\n self.gap.append(gap.numpy())\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{}...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size),\n ))", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/eval_util.py:194-205" + }, + "6033": { + "file_id": 480, + "content": "This code snippet is a part of the Youtube8m evaluation module in PaddleVideo. It calculates the gap between ground truth and prediction for each batch, performs all-reduce on the gap, appends it to the corresponding list. 
Also, logs information about processing batches during testing.", + "type": "comment" + }, + "6034": { + "file_id": 481, + "content": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py", + "type": "filepath" + }, + "6035": { + "file_id": 481, + "content": "This code uses MeanAveragePrecisionCalculator to calculate mAP for ranked lists, initializes AveragePrecisionCalculator objects, supports interpolated precisions, and ensures shape compatibility. It averages average precisions of each class to provide the final result as mAP.", + "type": "summary" + }, + "6036": { + "file_id": 481, + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate the mean average precision.\nIt provides an interface for calculating mean average precision\nfor an entire list or the top-n ranked items.\nExample usages:\nWe first call the function accumulate many times to process parts of the ranked\nlist. After processing all the parts, we call peek_map_at_n\nto calculate the mean average precision.\n```\nimport random\np = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:1-27" + }, + "6037": { + "file_id": 481, + "content": "This code calculates the mean average precision for a ranked list of items. It provides an interface to calculate this metric for the entire list or top-n ranked items. The example usage demonstrates accumulating data in parts and then using peek_map_at_n function to calculate the final result. The provided numpy array is used for demonstration purposes, representing a ranked list of values.", + "type": "comment" + }, + "6038": { + "file_id": 481, + "content": "a = np.array([[random.choice([0, 1]) for _ in xrange(50)]\n for _ in xrange(1000)])\n# mean average precision for 50 classes.\ncalculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(\n num_class=50)\ncalculator.accumulate(p, a)\naps = calculator.peek_map_at_n()\n```\n\"\"\"\nimport numpy\nfrom . import average_precision_calculator\nclass MeanAveragePrecisionCalculator(object):\n \"\"\"This class is to calculate mean average precision.\n \"\"\"\n def __init__(self, num_class):\n \"\"\"Construct a calculator to calculate the (macro) average precision.\n Args:\n num_class: A positive Integer specifying the number of classes.\n top_n_array: A list of positive integers specifying the top n for each\n class. The top n in each class will be used to calculate its average\n precision at n.\n The size of the array must be num_class.\n Raises:\n ValueError: An error occurred when num_class is not a positive integer;\n or the top_n_array is not a list of positive integers.", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:28-59" + }, + "6039": { + "file_id": 481, + "content": "Creates a numpy array with 1000 samples, each containing 50 binary random choices. 
Initializes MeanAveragePrecisionCalculator object with specified number of classes (in this case, 50). Accumulates predictions and ground truth for calculating average precision. Retrieves the average precision map at a given point in time.", + "type": "comment" + }, + "6040": { + "file_id": 481, + "content": " \"\"\"\n if not isinstance(num_class, int) or num_class <= 1:\n raise ValueError(\"num_class must be a positive integer.\")\n self._ap_calculators = [] # member of AveragePrecisionCalculator\n self._num_class = num_class # total number of classes\n for i in range(num_class):\n self._ap_calculators.append(\n average_precision_calculator.AveragePrecisionCalculator())\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n Args:\n predictions: A list of lists storing the prediction scores. The outer\n dimension corresponds to classes.\n actuals: A list of lists storing the ground truth labels. The dimensions\n should correspond to the predictions input. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives: If provided, it is a list of numbers representing the\n number of true positives for each class. If not provided, the number of", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:60-80" + }, + "6041": { + "file_id": 481, + "content": "This code defines a class for calculating Mean Average Precision (mAP) in the context of video classification. The constructor checks if num_class is a positive integer and initializes a list to store AveragePrecisionCalculator objects. The accumulate method takes predictions and actuals as input, accumulating prediction scores with their corresponding ground truth labels. If num_positives is provided, it represents the number of true positives for each class; otherwise, it defaults to no value.", + "type": "comment" + }, + "6042": { + "file_id": 481, + "content": " true positives will be inferred from the 'actuals' array.\n Raises:\n ValueError: An error occurred when the shape of predictions and actuals\n does not match.\n \"\"\"\n if not num_positives:\n num_positives = [None for i in predictions.shape[1]]\n calculators = self._ap_calculators\n for i in range(len(predictions)):\n calculators[i].accumulate(predictions[i], actuals[i],\n num_positives[i])\n def clear(self):\n for calculator in self._ap_calculators:\n calculator.clear()\n def is_empty(self):\n return ([calculator.heap_size for calculator in self._ap_calculators] ==\n [0 for _ in range(self._num_class)])\n def peek_map_at_n(self):\n \"\"\"Peek the non-interpolated mean average precision at n.\n Returns:\n An array of non-interpolated average precision at n (default 0) for each\n class.\n \"\"\"\n aps = [\n self._ap_calculators[i].peek_ap_at_n()\n for i in range(self._num_class)", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:81-112" + }, + "6043": { + "file_id": 481, + "content": "This code calculates the mean average precision for each class in a dataset, and provides methods to clear and check if the calculators are empty. The peek_map_at_n function returns an array of non-interpolated average precisions at n for each class. 
It also checks for shape compatibility between predictions and actuals arrays.", + "type": "comment" + }, + "6044": { + "file_id": 481, + "content": " ]\n return aps", + "type": "code", + "location": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:113-114" + }, + "6045": { + "file_id": 481, + "content": "This code calculates the mean average precision (mAP) by averaging the average precisions of each class. It returns the mAP value as a result.", + "type": "comment" + }, + "6046": { + "file_id": 482, + "content": "/paddlevideo/metrics/yowo_metric.py", + "type": "filepath" + }, + "6047": { + "file_id": 482, + "content": "The code adds a YOWOMetric class to the PaddleVideo framework for measuring YOWO metrics in two stages: saving test results and calculating metrics from saved results files. The code also handles batch processing, logging progress, and evaluates mAP metrics.", + "type": "summary" + }, + "6048": { + "file_id": 482, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport os\nfrom paddlevideo.utils import get_logger\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom .ucf24_utils import get_mAP\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass YOWOMetric(BaseMetric):\n \"\"\"\n Metrics for YOWO. Two Stages in this metric:\n (1) Get test results using trained model, results will be saved in YOWOMetric.result_path;\n (2) Calculate metrics using results file from stage (1).", + "type": "code", + "location": "/paddlevideo/metrics/yowo_metric.py:1-30" + }, + "6049": { + "file_id": 482, + "content": "This code defines a YOWOMetric class within the PaddleVideo framework. The class measures metrics for YOWO in two stages: first, it saves test results using a trained model, and then calculates metrics from the saved results file.", + "type": "comment" + }, + "6050": { + "file_id": 482, + "content": " \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n gt_folder,\n result_path,\n threshold=0.5,\n save_path=None,\n log_interval=1):\n \"\"\"\n Init for BMN metrics.\n Params:\n gtfolder:groundtruth folder path for ucf24\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.result_path = result_path\n self.gt_folder = gt_folder\n self.threshold = threshold\n self.save_path = save_path\n if not osp.isdir(self.result_path):\n os.makedirs(self.result_path)\n def update(self, batch_id, data, outputs):\n frame_idx = outputs['frame_idx']\n boxes = outputs[\"boxes\"]\n for j in range(len(frame_idx)):\n detection_path = osp.join(self.result_path, frame_idx[j])\n with open(detection_path, 'w+') as f_detect:\n for box in boxes[j]:\n x1 = round(float(box[0] - box[2] / 2.0) * 320.0)", + "type": "code", + "location": "/paddlevideo/metrics/yowo_metric.py:31-62" + }, + "6051": { + "file_id": 482, + "content": "The code initializes an instance of a BMN metrics class with specified parameters. 
It checks if the result path exists and creates it if not, then updates the metric by writing detection results to corresponding files in the result path for each batch.", + "type": "comment" + }, + "6052": { + "file_id": 482, + "content": " y1 = round(float(box[1] - box[3] / 2.0) * 240.0)\n x2 = round(float(box[0] + box[2] / 2.0) * 320.0)\n y2 = round(float(box[1] + box[3] / 2.0) * 240.0)\n det_conf = float(box[4])\n for j in range((len(box) - 5) // 2):\n cls_conf = float(box[5 + 2 * j].item())\n prob = det_conf * cls_conf\n f_detect.write(\n str(int(box[6]) + 1) + ' ' + str(prob) + ' ' + str(x1) + ' ' + str(y1) + ' ' + str(\n x2) + ' ' + str(y2) + '\\n')\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n metric_list = get_mAP(self.gt_folder, self.result_path, self.threshold, self.save_path)\n for info in metric_list:\n logger.info(info)", + "type": "code", + "location": "/paddlevideo/metrics/yowo_metric.py:63-82" + }, + "6053": { + "file_id": 482, + "content": "This code snippet is part of the PaddleVideo library. It calculates and writes yolo v5 box information into a file, handling batch processing and logging progress with an interval. The accumulate function collects mAP metrics for evaluation.", + "type": "comment" + }, + "6054": { + "file_id": 483, + "content": "/paddlevideo/modeling/__init__.py", + "type": "filepath" + }, + "6055": { + "file_id": 483, + "content": "The code imports modules from PaddleVideo library, initializes a model registry, and provides functions for building video recognition models and defining loss functions.", + "type": "summary" + }, + "6056": { + "file_id": 483, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .assigners import MaxIoUAssignerAVA\nfrom .backbones import ResNet\nfrom .builder import (build_backbone, build_head, build_localizer, build_loss,\n build_recognizer)\nfrom .framework.detectors import BaseDetector, FastRCNN, TwoStageDetector\nfrom .framework.recognizers import BaseRecognizer, Recognizer2D\nfrom .heads import (AVARoIHead, BaseHead, BBoxHeadAVA, SingleRoIExtractor3D,\n TSNHead)", + "type": "code", + "location": "/paddlevideo/modeling/__init__.py:1-22" + }, + "6057": { + "file_id": 483, + "content": "This code is an import statement from the PaddleVideo library, including various modules for backbones, builders, detectors, recognizers, and heads. It also includes license information and copyright details. 
The code allows users to access and build models using these imported modules.", + "type": "comment" + }, + "6058": { + "file_id": 483, + "content": "from .losses import CrossEntropyLoss\nfrom .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES,\n PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)\nfrom .samplers import RandomSampler\nfrom .weight_init import kaiming_normal_, trunc_normal_, weight_init_\n__all__ = [\n 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS', 'LOSSES',\n 'build_recognizer', 'build_localizer', 'build_head', 'build_backbone',\n 'build_loss', 'ResNet', 'TSNHead', 'BaseHead', 'BaseRecognizer',\n 'Recognizer2d', 'CrossEntropyLoss', 'ROI_EXTRACTORS',\n 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'MaxIoUAssignerAVA',\n 'RandomSampler', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_',\n 'weight_init_'\n]", + "type": "code", + "location": "/paddlevideo/modeling/__init__.py:23-37" + }, + "6059": { + "file_id": 483, + "content": "This code imports various modules, initializes a registry of models and functions, and lists all the available ones. It also defines a few key functions like `build_recognizer` and `build_localizer`, as well as some important loss functions such as `CrossEntropyLoss`. The code is part of PaddleVideo's modeling package and seems to be involved in building different parts of a video recognition model.", + "type": "comment" + }, + "6060": { + "file_id": 484, + "content": "/paddlevideo/modeling/assigners/__init__.py", + "type": "filepath" + }, + "6061": { + "file_id": 484, + "content": "This code imports the MaxIoUAssignerAVA class from the max_iou_assigner_ava module and adds it to the __all__ list, making it importable by default. The comment at the top of the file contains license information and copyright notices.", + "type": "summary" + }, + "6062": { + "file_id": 484, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .max_iou_assigner_ava import MaxIoUAssignerAVA\n__all__ = ['MaxIoUAssignerAVA']", + "type": "code", + "location": "/paddlevideo/modeling/assigners/__init__.py:1-17" + }, + "6063": { + "file_id": 484, + "content": "This code imports the MaxIoUAssignerAVA class from the max_iou_assigner_ava module and adds it to the __all__ list, making it importable by default. The comment at the top of the file contains license information and copyright notices.", + "type": "comment" + }, + "6064": { + "file_id": 485, + "content": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py", + "type": "filepath" + }, + "6065": { + "file_id": 485, + "content": "The code defines AssignResult class, initializes MaxIoUAssignerAVA, assigns GT boxes to bboxes using max IOU method and handles multi-class cases. It's registered at BBOX_ASSIGNERS.", + "type": "summary" + }, + "6066": { + "file_id": 485, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport numpy as np\nfrom ..registry import BBOX_ASSIGNERS\nfrom ..bbox_utils import bbox_overlaps\nclass AssignResult():\n def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):\n self.num_gts = num_gts\n self.gt_inds = gt_inds\n self.max_overlaps = max_overlaps\n self.labels = labels\n def add_gt_(self, gt_labels):\n \"\"\"Add ground truth as assigned results. \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:1-27" + }, + "6067": { + "file_id": 485, + "content": "This code defines a class called \"AssignResult\" for storing the assigned results, including number of gts, ground truth indexes, maximum overlaps, and labels if available. It also includes a method called \"add_gt_\" to add ground truth as assigned results.", + "type": "comment" + }, + "6068": { + "file_id": 485, + "content": " self_inds = paddle.arange(1, len(gt_labels) + 1, dtype=\"int32\")\n gt_inds_squeeze = paddle.squeeze(self.gt_inds, axis=0)\n self.gt_inds = paddle.concat([self_inds, gt_inds_squeeze])\n gt_label_ones = paddle.full((len(gt_labels), ), 1, dtype='float32')\n max_overlaps_squeeze = paddle.squeeze(self.max_overlaps, axis=0)\n self.max_overlaps = paddle.concat([gt_label_ones, max_overlaps_squeeze])\n if self.labels is not None:\n self.labels = paddle.concat([gt_labels, self.labels])\n@BBOX_ASSIGNERS.register()\nclass MaxIoUAssignerAVA():\n \"\"\"Assign a corresponding gt bbox or background to each bbox. \"\"\"\n def __init__(self,\n pos_iou_thr,\n neg_iou_thr,\n min_pos_iou=.0,\n gt_max_assign_all=True,\n ignore_iof_thr=-1,\n ignore_wrt_candidates=True,\n match_low_quality=True,\n gpu_assign_thr=-1,\n iou_calculator=dict(type='BboxOverlaps2D')):", + "type": "code", + "location": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:28-49" + }, + "6069": { + "file_id": 485, + "content": "This code initializes a MaxIoUAssignerAVA object by setting the self_inds and gt_inds attributes using paddle.arange and paddle.squeeze functions, concatenating them with paddle.concat function. It also sets max_overlaps attribute by concatenating gt_label_ones and max_overlaps_squeeze, and updates labels attribute if not None. The class is then registered at BBOX_ASSIGNERS with the decorator @BBOX_ASSIGNERS.register().", + "type": "comment" + }, + "6070": { + "file_id": 485, + "content": " self.pos_iou_thr = pos_iou_thr\n self.neg_iou_thr = neg_iou_thr\n self.min_pos_iou = min_pos_iou\n self.gt_max_assign_all = gt_max_assign_all\n self.ignore_iof_thr = ignore_iof_thr\n self.ignore_wrt_candidates = ignore_wrt_candidates\n self.gpu_assign_thr = gpu_assign_thr\n self.match_low_quality = match_low_quality\n def assign(self, \n bboxes, \n gt_bboxes, \n gt_labels=None):\n \"\"\"Assign gt to bboxes. 
\"\"\"\n overlaps = bbox_overlaps(gt_bboxes, bboxes)\n assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)\n return assign_result\n def assign_wrt_overlaps(self, overlaps, gt_labels=None):\n \"\"\"Assign w.r.t. the overlaps of bboxes with gts. \"\"\"\n num_gts, num_bboxes = overlaps.shape[0], overlaps.shape[1]\n # 1. assign -1\n assigned_gt_inds = paddle.full((num_bboxes, ), -1, dtype='int32')\n # for each anchor, which gt best overlaps with it\n # for each anchor, the max iou of all gts", + "type": "code", + "location": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:50-75" + }, + "6071": { + "file_id": 485, + "content": "The code defines a class that assigns ground truth (GT) boxes to bboxes. It takes in bboxes and GT bboxes as input, and returns the assignment result. The function assign_wrt_overlaps calculates assigned_gt_inds based on the overlaps of bboxes with gts.", + "type": "comment" + }, + "6072": { + "file_id": 485, + "content": " max_overlaps, argmax_overlaps = paddle.topk(overlaps, k=1, axis=0)\n # for each gt, which anchor best overlaps with it\n # for each gt, the max iou of all proposals\n gt_max_overlaps, gt_argmax_overlaps = paddle.topk(overlaps, k=1, axis=1) \n # 2. assign negative: below the negative inds are set to be 0\n match_labels = paddle.full(argmax_overlaps.shape, -1, dtype='int32')\n match_labels = paddle.where(max_overlaps < self.neg_iou_thr,\n paddle.zeros_like(match_labels), match_labels)\n # 3. assign positive: above positive IoU threshold\n argmax_overlaps_int32 = paddle.cast(argmax_overlaps, 'int32')\n match_labels = paddle.where(max_overlaps >= self.pos_iou_thr,\n argmax_overlaps_int32 + 1, match_labels)\n assigned_gt_inds = match_labels\n if self.match_low_quality:\n # Low-quality matching will overwirte the assigned_gt_inds\n # assigned in Step 3. Thus, the assigned gt might not be the", + "type": "code", + "location": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:76-93" + }, + "6073": { + "file_id": 485, + "content": "This code assigns positive and negative labels to anchors based on their IoU with ground truth boxes. If the max IoU is above a certain threshold, it's considered positive. If it's below another threshold, it's negative. This process helps determine which anchor best overlaps with each ground truth box.", + "type": "comment" + }, + "6074": { + "file_id": 485, + "content": " # best one for prediction.\n # For example, if bbox A has 0.9 and 0.8 iou with GT bbox\n # 1 & 2, bbox 1 will be assigned as the best target for bbox A\n # in step 3. 
However, if GT bbox 2's gt_argmax_overlaps = A,\n # bbox A's assigned_gt_inds will be overwritten to be bbox B.\n # This might be the reason that it is not used in ROI Heads.\n for i in range(num_gts):\n if gt_max_overlaps.numpy()[i] >= self.min_pos_iou:\n if self.gt_max_assign_all:\n equal_x_np = overlaps[i, :].numpy()\n equal_y_np = gt_max_overlaps[i].numpy()\n max_iou_inds = np.equal(equal_x_np, equal_y_np)\n max_iou_inds = paddle.to_tensor(max_iou_inds)\n max_iou_inds = paddle.reshape( max_iou_inds, [1,max_iou_inds.shape[0]] )\n match_labels_gts = paddle.full(max_iou_inds.shape, i+1, dtype='int32')\n match_labels = paddle.where(max_iou_inds, match_labels_gts, match_labels)", + "type": "code", + "location": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:94-109" + }, + "6075": { + "file_id": 485, + "content": "This code iterates over each ground truth (GT) bounding box, and if the IOU with a detection is above the minimum allowed position IOU, it checks whether all overlapping detections should be assigned to this GT. It creates a tensor of boolean values representing the assignment for each detection and GT pair. This is done by comparing the overlaps matrix and gt_max_overlaps, then reshaping and replacing match labels accordingly.", + "type": "comment" + }, + "6076": { + "file_id": 485, + "content": " assigned_gt_inds = match_labels\n else:\n assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1\n if gt_labels is not None:\n # consider multi-class case (AVA)\n assert len(gt_labels[0]) > 1\n assigned_labels = paddle.full([num_bboxes, len(gt_labels[0])], 0, dtype='float32')\n assigned_gt_inds_reshape = assigned_gt_inds.reshape([assigned_gt_inds.shape[1]])\n pos_inds = paddle.nonzero( assigned_gt_inds_reshape , as_tuple=False)\n pos_inds_num = float(paddle.numel(pos_inds))\n if pos_inds_num > 0:\n pos_inds = paddle.squeeze(pos_inds, axis = 1 )\n assigned_gt_inds_squeeze = paddle.squeeze(assigned_gt_inds, axis=0)\n assigned_gt_inds_select = paddle.index_select(assigned_gt_inds_squeeze, pos_inds) - 1\n gt_labels_select = paddle.index_select(gt_labels, assigned_gt_inds_select)\n A = assigned_gt_inds_squeeze", + "type": "code", + "location": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:110-126" + }, + "6077": { + "file_id": 485, + "content": "This code assigns ground truth (GT) labels and indices for maximum IOU Assigner in the AVA dataset. It handles both multi-class cases with multiple classes per label. If there is a match, GT indices are assigned, otherwise, it assigns index + 1. Finally, it considers the multi-class case by asserting the existence of more than one class and assigns zeros to the initial labels array before updating them based on the selected gt_labels.", + "type": "comment" + }, + "6078": { + "file_id": 485, + "content": " X = assigned_gt_inds_squeeze - 1\n Y = paddle.zeros_like(X)\n if A.shape[0]==1:\n if float(A) > 0:\n T=X\n else:\n T=Y\n else:\n T = paddle.where(A>0, X, Y)\n S = paddle.index_select(gt_labels, T)\n AE = paddle.expand(A, [S.shape[1], A.shape[0]]) \n AET = paddle.transpose(AE, perm=[1, 0])\n R = paddle.where(AET>0, S, assigned_labels) \n assigned_labels = R\n else:\n assigned_labels = None\n ret = AssignResult(\n num_gts,\n assigned_gt_inds,\n max_overlaps,\n labels=assigned_labels)\n return ret", + "type": "code", + "location": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:127-148" + }, + "6079": { + "file_id": 485, + "content": "This code snippet is part of a max IOU assigner implementation in PaddleVideo. 
It assigns labels to objects based on the maximum IoU (intersection over union) threshold. If there's only one object, it assigns the ground truth index if the overlap is greater than 0, otherwise sets it to 0. For multiple objects, it uses a where statement to select the max IOU assignment. The assigned labels are then returned as part of the AssignResult.", + "type": "comment" + }, + "6080": { + "file_id": 486, + "content": "/paddlevideo/modeling/backbones/__init__.py", + "type": "filepath" + }, + "6081": { + "file_id": 486, + "content": "This code initializes and defines various backbone models for video analysis tasks in PaddleVideo, including ResNet, Vision Transformer, AGCN, and popular models such as ResNetTSN_MRI, ResNetTSM_MRI, and SwinTransformer3D. These models form the foundation for object detection, segmentation, motion estimation, and various computer vision applications in PaddlePaddle framework.", + "type": "summary" + }, + "6082": { + "file_id": 486, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .actbert import BertForMultiModalPreTraining\nfrom .adds import ADDS_DepthNet\nfrom .agcn import AGCN\nfrom .asrf import ASRF\nfrom .bmn import BMN\nfrom .cfbi import CFBI\nfrom .movinet import MoViNet\nfrom .ms_tcn import MSTCN\nfrom .resnet import ResNet\nfrom .resnet_slowfast import ResNetSlowFast\nfrom .resnet_slowfast_MRI import ResNetSlowFast_MRI\nfrom .resnet_tsm import ResNetTSM\nfrom .resnet_tsm_MRI import ResNetTSM_MRI", + "type": "code", + "location": "/paddlevideo/modeling/backbones/__init__.py:1-27" + }, + "6083": { + "file_id": 486, + "content": "This code is an initialization file for backbone models in PaddleVideo. It imports various model classes from submodules, including BertForMultiModalPreTraining, ADDS_DepthNet, AGCN, ASRF, BMN, CFBI, MoViNet, MSTCN, ResNet, ResNetSlowFast, ResNetSlowFast_MRI, and ResNetTSM, ResNetTSM_MRI. 
These models can be used for video analysis tasks in the PaddlePaddle framework.", + "type": "comment" + }, + "6084": { + "file_id": 486, + "content": "from .resnet_tsn_MRI import ResNetTSN_MRI\nfrom .resnet_tweaks_tsm import ResNetTweaksTSM\nfrom .resnet_tweaks_tsn import ResNetTweaksTSN\nfrom .stgcn import STGCN\nfrom .swin_transformer import SwinTransformer3D\nfrom .transnetv2 import TransNetV2\nfrom .vit import VisionTransformer\nfrom .vit_tweaks import VisionTransformer_tweaks\nfrom .ms_tcn import MSTCN\nfrom .asrf import ASRF\nfrom .resnet_tsn_MRI import ResNetTSN_MRI\nfrom .resnet_tsm_MRI import ResNetTSM_MRI\nfrom .resnet_slowfast_MRI import ResNetSlowFast_MRI\nfrom .cfbi import CFBI\nfrom .ctrgcn import CTRGCN\nfrom .agcn2s import AGCN2s\nfrom .movinet import MoViNet\nfrom .resnet3d_slowonly import ResNet3dSlowOnly\nfrom .toshift_vit import TokenShiftVisionTransformer\nfrom .pptsm_mv2 import PPTSM_MobileNetV2\nfrom .pptsm_mv3 import PPTSM_MobileNetV3\nfrom .pptsm_v2 import PPTSM_v2\nfrom .yowo import YOWO\n__all__ = [\n 'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',\n 'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',\n 'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining',", + "type": "code", + "location": "/paddlevideo/modeling/backbones/__init__.py:28-55" + }, + "6085": { + "file_id": 486, + "content": "The code imports various backbone models for video analysis from different modules within the PaddleVideo library, including ResNet, Vision Transformer, STGCN, AGCN, and more. The models are used for tasks like object detection, segmentation, and motion estimation in video processing.", + "type": "comment" + }, + "6086": { + "file_id": 486, + "content": " 'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',\n 'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN',\n 'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2',\n 'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO'\n]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/__init__.py:56-60" + }, + "6087": { + "file_id": 486, + "content": "This code defines a list of available backbones for video processing tasks, including popular models such as ResNetTSN_MRI, ResNetTSM_MRI, and SwinTransformer3D. These backbones serve as the foundation for various computer vision applications in PaddleVideo.", + "type": "comment" + }, + "6088": { + "file_id": 487, + "content": "/paddlevideo/modeling/backbones/actbert.py", + "type": "filepath" + }, + "6089": { + "file_id": 487, + "content": "The code presents a PaddlePaddle BertEmbeddings class for BERT model embeddings in video action recognition, utilizing self-attention and ACTBERT's backbone for multimodal inputs including text, video, and action data.", + "type": "summary" + }, + "6090": { + "file_id": 487, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport numpy as np\nimport math\nimport copy\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout)\nfrom paddle.nn.initializer import Constant, Normal\nfrom ...utils.save_load import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nACT2FN = {\"gelu\": F.gelu, \"relu\": F.relu, \"swish\": F.swish}\nclass BertEmbeddings(nn.Layer):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1-32" + }, + "6091": { + "file_id": 487, + "content": "This code is a Python file containing a class named \"BertEmbeddings\" within the PaddlePaddle framework. The class inherits from nn.Layer and appears to contain embeddings for the BERT model. This code also includes comments with copyright information, license details, and an import section with necessary libraries. It introduces a dictionary, ACT2FN, that maps activation functions for use in the BertEmbeddings class.", + "type": "comment" + }, + "6092": { + "file_id": 487, + "content": " \"\"\"Construct the embeddings from word, position and token_type embeddings.\n \"\"\"\n def __init__(self, vocab_size, max_position_embeddings, type_vocab_size,\n hidden_size, hidden_dropout_prob):\n super(BertEmbeddings, self).__init__()\n self.word_embeddings = nn.Embedding(vocab_size,\n hidden_size,\n padding_idx=0)\n self.position_embeddings = nn.Embedding(max_position_embeddings,\n hidden_size)\n self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(hidden_dropout_prob)\n def forward(self, input_ids, token_type_ids=None):\n seq_length = input_ids.shape[1]\n position_ids = paddle.arange(end=seq_length, dtype=\"int64\")\n position_ids = position_ids.unsqueeze(0).expand_as(input_ids)\n if token_type_ids is None:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:33-52" + }, + "6093": { + "file_id": 487, + "content": "BertEmbeddings initializes word, position, and token_type embeddings for a given vocabulary size, maximum position embedding size, type vocab size, hidden size, and hidden dropout probability. 
Forward function uses input ids and token type ids to generate position ids and then combines the different embeddings with layer normalization and dropout.", + "type": "comment" + }, + "6094": { + "file_id": 487, + "content": " token_type_ids = paddle.zeros_like(input_ids)\n words_embeddings = self.word_embeddings(input_ids) #8,36 -> 8,36,768\n position_embeddings = self.position_embeddings(\n position_ids) #8,36 -> 8,36,768\n token_type_embeddings = self.token_type_embeddings(\n token_type_ids) #8,36 -> 8,36,768\n embeddings = words_embeddings + position_embeddings + token_type_embeddings\n embeddings = self.LayerNorm(embeddings)\n embeddings = self.dropout(embeddings)\n return embeddings\nclass BertImageEmbeddings(nn.Layer):\n def __init__(self, v_feature_size, v_hidden_size, v_hidden_dropout_prob):\n super(BertImageEmbeddings, self).__init__()\n self.image_embeddings = nn.Linear(v_feature_size, v_hidden_size)\n self.image_location_embeddings = nn.Linear(5, v_hidden_size)\n self.LayerNorm = nn.LayerNorm(v_hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(v_hidden_dropout_prob)\n def forward(self, input_ids, input_loc):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:53-75" + }, + "6095": { + "file_id": 487, + "content": "This code defines the `ActBert` class, which is a backbone model for video action recognition. It initializes word embeddings, position embeddings, and token type embeddings. The class also includes a forward function that combines these embeddings, applies layer normalization and dropout, and returns the result. Additionally, there's the `BertImageEmbeddings` class which takes image features and their locations as input and uses linear layers to generate embeddings for both, followed by layer normalization and dropout.", + "type": "comment" + }, + "6096": { + "file_id": 487, + "content": " img_embeddings = self.image_embeddings(\n input_ids) #8,37,2048 -> 8,37,1024\n loc_embeddings = self.image_location_embeddings(\n input_loc) #8,37,5 -> 8,37,1024\n embeddings = self.LayerNorm(img_embeddings + loc_embeddings)\n embeddings = self.dropout(embeddings)\n return embeddings # shape: bs*seq_len*hs\nclass BertActionEmbeddings(nn.Layer):\n def __init__(self, a_feature_size, a_hidden_size, a_hidden_dropout_prob):\n super(BertActionEmbeddings, self).__init__()\n self.action_embeddings = nn.Linear(a_feature_size, a_hidden_size)\n self.LayerNorm = nn.LayerNorm(a_hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(a_hidden_dropout_prob)\n def forward(self, input_ids):\n action_embeddings = self.action_embeddings(\n input_ids) #8,5,2048 -> 8,5,768\n embeddings = self.LayerNorm(action_embeddings)\n embeddings = self.dropout(embeddings)\n return embeddings\nclass BertSelfAttention(nn.Layer):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:76-100" + }, + "6097": { + "file_id": 487, + "content": "This code defines two classes: `BertActionEmbeddings` and `BertSelfAttention`. The former takes action features as input, linearly projects them into hidden states, normalizes these using LayerNorm, applies dropout, and returns the embeddings. 
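
A minimal sketch of the embedding-summation pattern described above (word + position + token-type embeddings, followed by LayerNorm and dropout), using toy sizes rather than the real configuration:

```python
import paddle
import paddle.nn as nn

vocab_size, max_pos, type_vocab, hidden = 100, 32, 2, 16  # toy sizes

word_emb = nn.Embedding(vocab_size, hidden, padding_idx=0)
pos_emb = nn.Embedding(max_pos, hidden)
type_emb = nn.Embedding(type_vocab, hidden)
norm = nn.LayerNorm(hidden, epsilon=1e-12)
drop = nn.Dropout(0.1)

input_ids = paddle.randint(0, vocab_size, [2, 8])  # [batch, seq_len]
position_ids = paddle.arange(end=8, dtype="int64").unsqueeze(0).expand_as(input_ids)
token_type_ids = paddle.zeros_like(input_ids)

# The three embeddings are summed element-wise, then normalized and regularized.
emb = word_emb(input_ids) + pos_emb(position_ids) + type_emb(token_type_ids)
emb = drop(norm(emb))
print(emb.shape)  # [2, 8, 16]
```
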
The latter performs self-attention over the embeddings produced by `BertActionEmbeddings`, applies a feed-forward network, applies LayerNorm, and finally applies dropout before returning the output.", + "type": "comment" + }, + "6098": { + "file_id": 487, + "content": " def __init__(self, hidden_size, num_attention_heads,\n attention_probs_dropout_prob):\n super(BertSelfAttention, self).__init__()\n if hidden_size % num_attention_heads != 0:\n raise ValueError(\n \"The hidden size (%d) is not a multiple of the number of attention \"\n \"heads (%d)\" % (hidden_size, num_attention_heads))\n self.num_attention_heads = num_attention_heads\n self.attention_head_size = int(hidden_size / num_attention_heads)\n self.all_head_size = self.num_attention_heads * self.attention_head_size\n self.query = nn.Linear(hidden_size, self.all_head_size)\n self.key = nn.Linear(hidden_size, self.all_head_size)\n self.value = nn.Linear(hidden_size, self.all_head_size)\n self.dropout = nn.Dropout(attention_probs_dropout_prob)\n def transpose_for_scores(self, x):\n new_x_shape = x.shape[:-1] + [\n self.num_attention_heads,\n self.attention_head_size,\n ]\n x = x.reshape(new_x_shape)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:101-123" + }, + "6099": { + "file_id": 487, + "content": "This code defines a BertSelfAttention class with parameters: hidden_size, num_attention_heads, and attention_probs_dropout_prob. It checks if the hidden size is divisible by the number of attention heads. If not, it raises a ValueError. Then, it calculates attention_head_size and all_head_size. Finally, it initializes query, key, value linear layers and dropout layer for attention probabilities.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/61.json b/docs/data/61.json new file mode 100644 index 000000000..332fc87b0 --- /dev/null +++ b/docs/data/61.json @@ -0,0 +1,551 @@ +{ + "6100": { + "file_id": 487, + "content": " return x.transpose((0, 2, 1, 3))\n def forward(self, hidden_states, attention_mask):\n mixed_query_layer = self.query(hidden_states)\n mixed_key_layer = self.key(hidden_states)\n mixed_value_layer = self.value(hidden_states)\n query_layer = self.transpose_for_scores(mixed_query_layer)\n key_layer = self.transpose_for_scores(mixed_key_layer)\n value_layer = self.transpose_for_scores(mixed_value_layer)\n # Take the dot product between \"query\" and \"key\" to get the raw attention scores.\n attention_scores = paddle.matmul(query_layer,\n key_layer.transpose((0, 1, 3, 2)))\n attention_scores = attention_scores / math.sqrt(\n self.attention_head_size)\n # Apply the attention mask is (precomputed for all layers in BertModel forward() function)\n attention_scores = attention_scores + attention_mask\n # Normalize the attention scores to probabilities.\n attention_probs = nn.Softmax(axis=-1)(attention_scores)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:124-144" + }, + "6101": { + "file_id": 487, + "content": "This code performs multi-head attention in an attention mechanism. It transposes the query, key, and value layers before calculating raw attention scores via dot product. The results are then normalized into probabilities using softmax. 
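
The core computation summarized above (scores = QK^T / sqrt(d_head), add the additive mask, softmax, then weight the values) can be sketched in a few lines of NumPy; the shapes and mask values below are toy assumptions:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

d_head, seq_len = 4, 3
rng = np.random.default_rng(0)
q = rng.standard_normal((seq_len, d_head))
k = rng.standard_normal((seq_len, d_head))
v = rng.standard_normal((seq_len, d_head))

# Additive mask: 0 where attention is allowed, -10000 where it is masked out.
mask = np.array([0.0, 0.0, -10000.0])

scores = q @ k.T / np.sqrt(d_head) + mask   # [seq_len, seq_len]
probs = softmax(scores, axis=-1)            # masked column collapses to ~0
context = probs @ v                         # [seq_len, d_head]
print(probs.round(3))
```
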
The attention mask is applied to the attention scores for masked self-attention.", + "type": "comment" + }, + "6102": { + "file_id": 487, + "content": " # This is actually dropping out entire tokens to attend to, which might\n # seem a bit unusual, but is taken from the original Transformer paper.\n attention_probs = self.dropout(attention_probs)\n context_layer = paddle.matmul(attention_probs, value_layer)\n context_layer = context_layer.transpose((0, 2, 1, 3))\n new_context_layer_shape = context_layer.shape[:-2] + [\n self.all_head_size\n ]\n context_layer = context_layer.reshape(new_context_layer_shape)\n return context_layer, attention_probs\nclass BertSelfOutput(nn.Layer):\n def __init__(self, hidden_size, hidden_dropout_prob):\n super(BertSelfOutput, self).__init__()\n self.dense = nn.Linear(hidden_size, hidden_size)\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(hidden_dropout_prob)\n def forward(self, hidden_states, input_tensor):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.dropout(hidden_states)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:146-169" + }, + "6103": { + "file_id": 487, + "content": "This code defines a BertSelfOutput layer that takes input hidden states, applies linear transformation and dropout for regularization, then passes the output through layer normalization.", + "type": "comment" + }, + "6104": { + "file_id": 487, + "content": " hidden_states = self.LayerNorm(hidden_states + input_tensor)\n return hidden_states\nclass BertAttention(nn.Layer):\n def __init__(self, hidden_size, hidden_dropout_prob, num_attention_heads,\n attention_probs_dropout_prob):\n super(BertAttention, self).__init__()\n self.self = BertSelfAttention(hidden_size, num_attention_heads,\n attention_probs_dropout_prob)\n self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)\n def forward(self, input_tensor, attention_mask):\n self_output, attention_probs = self.self(input_tensor, attention_mask)\n attention_output = self.output(self_output, input_tensor)\n return attention_output, attention_probs\nclass BertIntermediate(nn.Layer):\n def __init__(self, hidden_size, intermediate_size, hidden_act):\n super(BertIntermediate, self).__init__()\n self.dense = nn.Linear(hidden_size, intermediate_size)\n if isinstance(hidden_act, str) or (sys.version_info[0] == 2", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:170-192" + }, + "6105": { + "file_id": 487, + "content": "This code defines three classes: `ActBert`, `BertAttention`, and `BertIntermediate`. \n\n`ActBert` appears to be a model that includes `BertAttention` and `BertIntermediate` as its layers. The `BertAttention` class defines forward function for attention mechanism, which takes in an input tensor and attention mask, and returns output and attention probabilities. 
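
The "output" sublayer pattern described here (dense projection, dropout, then LayerNorm over a residual sum with the sublayer input) is common to both self-output and feed-forward output blocks; a toy sketch, assuming hypothetical sizes:

```python
import paddle
import paddle.nn as nn

class ResidualOutput(nn.Layer):
    """Toy sublayer mirroring the dense -> dropout -> LayerNorm(x + input) pattern."""
    def __init__(self, hidden_size, dropout_prob=0.1):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.norm = nn.LayerNorm(hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dropout(self.dense(hidden_states))
        # Residual connection followed by layer normalization.
        return self.norm(hidden_states + input_tensor)

x = paddle.randn([2, 8, 16])
print(ResidualOutput(16)(x, x).shape)  # [2, 8, 16]
```
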
The `BertIntermediate` class appears to be a dense layer with linear activation function.", + "type": "comment" + }, + "6106": { + "file_id": 487, + "content": " and isinstance(hidden_act, str)):\n self.intermediate_act_fn = ACT2FN[hidden_act]\n else:\n self.intermediate_act_fn = hidden_act\n def forward(self, hidden_states):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.intermediate_act_fn(hidden_states)\n return hidden_states\nclass BertOutput(nn.Layer):\n def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob):\n super(BertOutput, self).__init__()\n self.dense = nn.Linear(intermediate_size, hidden_size)\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(hidden_dropout_prob)\n def forward(self, hidden_states, input_tensor):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.dropout(hidden_states)\n hidden_states = self.LayerNorm(hidden_states + input_tensor)\n return hidden_states\nclass BertEntAttention(nn.Layer):\n \"\"\"Core mudule of tangled transformer.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:193-219" + }, + "6107": { + "file_id": 487, + "content": "Code defines a class for an attention-based transformer model. It includes an intermediate activation function and forward pass layers for processing input, applying dropout regularization, and normalizing outputs. The BertEntAttention layer is the core module for the transformer.", + "type": "comment" + }, + "6108": { + "file_id": 487, + "content": " \"\"\"\n def __init__(\n self,\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n bi_num_attention_heads,\n ):\n super(BertEntAttention, self).__init__()\n if bi_hidden_size % bi_num_attention_heads != 0:\n raise ValueError(\n \"The hidden size (%d) is not a multiple of the number of attention \"\n \"heads (%d)\" % (bi_hidden_size, bi_num_attention_heads))\n self.num_attention_heads = bi_num_attention_heads\n self.attention_head_size = int(bi_hidden_size / bi_num_attention_heads)\n self.all_head_size = self.num_attention_heads * self.attention_head_size\n # self attention layers for vision input\n self.query1 = nn.Linear(v_hidden_size, self.all_head_size)\n self.key1 = nn.Linear(v_hidden_size, self.all_head_size)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:220-246" + }, + "6109": { + "file_id": 487, + "content": "This code defines a BertEntAttention class with parameters for hidden size, vision input hidden size, attention probabilities dropout probabilities, and bi-directional hidden size. It also checks if the hidden size is a multiple of the number of attention heads. 
The class initializes attributes such as the number of attention heads, attention head size, all head size, and linear layers for self-attention in vision input.", + "type": "comment" + }, + "6110": { + "file_id": 487, + "content": " self.value1 = nn.Linear(v_hidden_size, self.all_head_size)\n self.dropout1 = nn.Dropout(v_attention_probs_dropout_prob)\n # self attention layers for text input\n self.query2 = nn.Linear(hidden_size, self.all_head_size)\n self.key2 = nn.Linear(hidden_size, self.all_head_size)\n self.value2 = nn.Linear(hidden_size, self.all_head_size)\n self.dropout2 = nn.Dropout(attention_probs_dropout_prob)\n # self attention layers for action input\n self.query3 = nn.Linear(a_hidden_size, self.all_head_size)\n self.key3 = nn.Linear(a_hidden_size, self.all_head_size)\n self.value3 = nn.Linear(a_hidden_size, self.all_head_size)\n self.dropout3 = nn.Dropout(a_attention_probs_dropout_prob)\n # self attention layers for action_text\n self.key_at = nn.Linear(bi_hidden_size, self.all_head_size)\n self.value_at = nn.Linear(bi_hidden_size, self.all_head_size)\n self.dropout_at = nn.Dropout(av_attention_probs_dropout_prob)\n # self attention layers for action_vision", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:247-267" + }, + "6111": { + "file_id": 487, + "content": "This code defines layers for self-attention in the ACTBERT model, including linear and dropout layers for text, action, action_text, and action_vision inputs.", + "type": "comment" + }, + "6112": { + "file_id": 487, + "content": " self.key_av = nn.Linear(bi_hidden_size, self.all_head_size)\n self.value_av = nn.Linear(bi_hidden_size, self.all_head_size)\n self.dropout_av = nn.Dropout(at_attention_probs_dropout_prob)\n def transpose_for_scores(self, x):\n new_x_shape = x.shape[:-1] + [\n self.num_attention_heads,\n self.attention_head_size,\n ]\n x = x.reshape(new_x_shape)\n return x.transpose((0, 2, 1, 3))\n def forward(\n self,\n input_tensor1,\n attention_mask1,\n input_tensor2,\n attention_mask2,\n input_tensor3,\n attention_mask3,\n ):\n # for vision input.\n mixed_query_layer1 = self.query1(input_tensor1)\n mixed_key_layer1 = self.key1(input_tensor1)\n mixed_value_layer1 = self.value1(input_tensor1)\n query_layer1 = self.transpose_for_scores(mixed_query_layer1)\n key_layer1 = self.transpose_for_scores(mixed_key_layer1)\n value_layer1 = self.transpose_for_scores(mixed_value_layer1)\n # for text input:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:268-299" + }, + "6113": { + "file_id": 487, + "content": "This code defines a model for attention mechanism in a transformer architecture, used for both vision and text inputs. 
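
The `transpose_for_scores`-style reshaping used by each pathway simply splits the hidden dimension into attention heads; a minimal sketch with toy shapes:

```python
import paddle

batch, seq_len, num_heads, head_size = 2, 5, 4, 8
hidden = num_heads * head_size

x = paddle.randn([batch, seq_len, hidden])

# [batch, seq_len, hidden] -> [batch, seq_len, heads, head_size]
#                          -> [batch, heads, seq_len, head_size]
x = x.reshape([batch, seq_len, num_heads, head_size]).transpose((0, 2, 1, 3))
print(x.shape)  # [2, 4, 5, 8]
```
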
The key steps involve creating linear layers for keys and values, applying dropout, transposing the input tensors for scoring, and forwarding the input through these operations for both vision and text inputs separately.", + "type": "comment" + }, + "6114": { + "file_id": 487, + "content": " mixed_query_layer2 = self.query2(input_tensor2)\n mixed_key_layer2 = self.key2(input_tensor2)\n mixed_value_layer2 = self.value2(input_tensor2)\n query_layer2 = self.transpose_for_scores(mixed_query_layer2)\n key_layer2 = self.transpose_for_scores(mixed_key_layer2)\n value_layer2 = self.transpose_for_scores(mixed_value_layer2)\n # for action input:\n mixed_query_layer3 = self.query3(input_tensor3)\n mixed_key_layer3 = self.key3(input_tensor3)\n mixed_value_layer3 = self.value3(input_tensor3)\n query_layer3 = self.transpose_for_scores(mixed_query_layer3)\n key_layer3 = self.transpose_for_scores(mixed_key_layer3)\n value_layer3 = self.transpose_for_scores(mixed_value_layer3)\n def do_attention(query_layer, key_layer, value_layer, attention_mask,\n dropout):\n \"\"\" compute attention \"\"\"\n attention_scores = paddle.matmul(query_layer,\n key_layer.transpose((0, 1, 3, 2)))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:300-321" + }, + "6115": { + "file_id": 487, + "content": "This code is performing multi-head attention operation. It first separates the input tensor into two parts, with each part going through its own set of linear layers to create query, key, and value tensors. Then it transposes the query and key tensors before computing attention scores by taking the dot product of the transposed query and key tensors.", + "type": "comment" + }, + "6116": { + "file_id": 487, + "content": " attention_scores = attention_scores / math.sqrt(\n self.attention_head_size)\n attention_scores = attention_scores + attention_mask\n # Normalize the attention scores to probabilities.\n attention_probs = nn.Softmax(axis=-1)(attention_scores)\n # This is actually dropping out entire tokens to attend to, which might\n # seem a bit unusual, but is taken from the original Transformer paper.\n attention_probs = dropout(attention_probs)\n context_layer = paddle.matmul(attention_probs, value_layer)\n context_layer = context_layer.transpose((0, 2, 1, 3))\n new_context_layer_shape = context_layer.shape[:-2] + [\n self.all_head_size\n ]\n context_layer = context_layer.reshape(new_context_layer_shape)\n return context_layer\n context_av = do_attention(query_layer3, key_layer1, value_layer1,\n attention_mask1, self.dropout_av)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:322-342" + }, + "6117": { + "file_id": 487, + "content": "This code calculates attention scores between queries, keys, and values, normalizes them to probabilities using softmax, applies dropout, performs matrix multiplication with the values, transposes the result, reshapes it, and returns the context layer. 
It follows the Transformer paper's approach of dropping out entire tokens to attend to.", + "type": "comment" + }, + "6118": { + "file_id": 487, + "content": " context_at = do_attention(query_layer3, key_layer2, value_layer2,\n attention_mask2, self.dropout_at)\n context_key_av = self.key_av(context_av).transpose((0, 2, 1))\n # interpolate only support 4-D tensor now.\n context_key_av = F.interpolate(context_key_av.unsqueeze(-1),\n size=(key_layer2.shape[2],\n 1)).squeeze(-1)\n context_key_av = self.transpose_for_scores(\n context_key_av.transpose((0, 2, 1)))\n key_layer2 = key_layer2 + context_key_av\n context_key_at = self.key_at(context_at).transpose((0, 2, 1))\n context_key_at = F.interpolate(context_key_at.unsqueeze(-1),\n size=(key_layer1.shape[2],\n 1)).squeeze(-1)\n context_key_at = self.transpose_for_scores(\n context_key_at.transpose((0, 2, 1)))\n key_layer1 = key_layer1 + context_key_at", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:343-361" + }, + "6119": { + "file_id": 487, + "content": "This code is performing attention mechanism for multi-scale context fusion. It uses interpolation to resize the context features, then adds them to the original key layers (key_layer2 and key_layer1). The purpose of this is to incorporate contextual information from different scales into the model's understanding.", + "type": "comment" + }, + "6120": { + "file_id": 487, + "content": " context_val_av = self.value_at(context_av).transpose((0, 2, 1))\n context_val_av = F.interpolate(context_val_av.unsqueeze(-1),\n size=(value_layer2.shape[2],\n 1)).squeeze(-1)\n context_val_av = self.transpose_for_scores(\n context_val_av.transpose((0, 2, 1)))\n value_layer2 = value_layer2 + context_val_av\n context_val_at = self.value_at(context_at).transpose((0, 2, 1))\n context_val_at = F.interpolate(context_val_at.unsqueeze(-1),\n size=(value_layer1.shape[2],\n 1)).squeeze(-1)\n context_val_at = self.transpose_for_scores(\n context_val_at.transpose((0, 2, 1)))\n value_layer1 = value_layer1 + context_val_at\n context_layer1 = do_attention(query_layer1, key_layer1, value_layer1,\n attention_mask1, self.dropout1)\n context_layer2 = do_attention(query_layer2, key_layer2, value_layer2,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:363-381" + }, + "6121": { + "file_id": 487, + "content": "This code snippet is performing cross-attention in a transformer model. It first interpolates and adds context vectors to value layers, then applies attention mechanisms to compute context layers. 
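
The interpolation trick described above (resizing one modality's context along the sequence axis so it can be added to another modality's keys or values) can be sketched as follows; the shapes are toy values, and the real code applies this to per-head tensors:

```python
import paddle
import paddle.nn.functional as F

# Context from one pathway: [batch, channels, src_len]; target pathway has tgt_len positions.
batch, channels, src_len, tgt_len = 2, 16, 5, 37
context = paddle.randn([batch, channels, src_len])

# interpolate only supports 4-D tensors, so add and later remove a dummy width dimension.
resized = F.interpolate(context.unsqueeze(-1), size=(tgt_len, 1)).squeeze(-1)
print(resized.shape)  # [2, 16, 37]

# The resized context can now be added to the other pathway's keys/values of length tgt_len.
keys = paddle.randn([batch, channels, tgt_len])
keys = keys + resized
```
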
This process helps to capture dependencies between different parts of the input data effectively.", + "type": "comment" + }, + "6122": { + "file_id": 487, + "content": " attention_mask2, self.dropout2)\n context_layer3 = do_attention(query_layer3, key_layer3, value_layer3,\n attention_mask3, self.dropout3)\n return context_layer1, context_layer2, context_layer3 # vision, text, action\nclass BertEntOutput(nn.Layer):\n def __init__(\n self,\n bi_hidden_size,\n hidden_size,\n v_hidden_size,\n v_hidden_dropout_prob,\n hidden_dropout_prob,\n ):\n super(BertEntOutput, self).__init__()\n self.dense1 = nn.Linear(bi_hidden_size, v_hidden_size)\n self.LayerNorm1 = nn.LayerNorm(v_hidden_size, epsilon=1e-12)\n self.dropout1 = nn.Dropout(v_hidden_dropout_prob)\n self.dense2 = nn.Linear(bi_hidden_size, hidden_size)\n self.LayerNorm2 = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout2 = nn.Dropout(hidden_dropout_prob)\n self.dense3 = nn.Linear(bi_hidden_size, hidden_size)\n self.LayerNorm3 = nn.LayerNorm(hidden_size, epsilon=1e-12)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:382-409" + }, + "6123": { + "file_id": 487, + "content": "The code defines a class \"BertEntOutput\" with several layers including dense and dropout. It uses bi-hidden size, hidden size, v_hidden_size, and corresponding dropout probabilities for initializing the layers. This class seems to be part of a model architecture where it performs layer normalization, applies dropout regularization, and linear transformations to process input data.", + "type": "comment" + }, + "6124": { + "file_id": 487, + "content": " self.dropout3 = nn.Dropout(hidden_dropout_prob)\n def forward(\n self,\n hidden_states1,\n input_tensor1,\n hidden_states2,\n input_tensor2,\n hidden_states3,\n input_tensor3,\n ):\n context_state1 = self.dense1(hidden_states1)\n context_state1 = self.dropout1(context_state1)\n context_state2 = self.dense2(hidden_states2)\n context_state2 = self.dropout2(context_state2)\n context_state3 = self.dense3(hidden_states3)\n context_state3 = self.dropout3(context_state3)\n hidden_states1 = self.LayerNorm1(context_state1 + input_tensor1)\n hidden_states2 = self.LayerNorm2(context_state2 + input_tensor2)\n hidden_states3 = self.LayerNorm3(context_state3 + input_tensor3)\n return hidden_states1, hidden_states2, hidden_states3\nclass BertLayer(nn.Layer):\n def __init__(self, hidden_size, intermediate_size, hidden_act,\n hidden_dropout_prob, num_attention_heads,\n attention_probs_dropout_prob):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:410-440" + }, + "6125": { + "file_id": 487, + "content": "This code defines a BertLayer class with Dropout layers and dense layers, performing attention mechanism. 
The forward method applies these layers to hidden_states1,2,3, and returns the updated hidden states after adding them with their corresponding input tensors.", + "type": "comment" + }, + "6126": { + "file_id": 487, + "content": " super(BertLayer, self).__init__()\n self.attention = BertAttention(hidden_size, hidden_dropout_prob,\n num_attention_heads,\n attention_probs_dropout_prob)\n self.intermediate = BertIntermediate(hidden_size, intermediate_size,\n hidden_act)\n self.output = BertOutput(intermediate_size, hidden_size,\n hidden_dropout_prob)\n def forward(self, hidden_states, attention_mask):\n attention_output, attention_probs = self.attention(\n hidden_states, attention_mask)\n intermediate_output = self.intermediate(attention_output)\n layer_output = self.output(intermediate_output, attention_output)\n return layer_output, attention_probs\nclass BertConnectionLayer(nn.Layer):\n def __init__(self, hidden_size, v_hidden_size, a_hidden_size,\n bi_hidden_size, bi_num_attention_heads,\n attention_probs_dropout_prob, v_attention_probs_dropout_prob,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:441-461" + }, + "6127": { + "file_id": 487, + "content": "This code defines a BertLayer class and a BertConnectionLayer class. The BertLayer class includes an attention layer, an intermediate layer, and an output layer. It has a forward function that performs the calculations for these layers. The BertConnectionLayer class is a subclass of nn.Layer with various parameters for hidden sizes and attention dropout probabilities.", + "type": "comment" + }, + "6128": { + "file_id": 487, + "content": " a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob, intermediate_size,\n v_intermediate_size, a_intermediate_size, hidden_act,\n v_hidden_act, a_hidden_act, hidden_dropout_prob,\n v_hidden_dropout_prob, a_hidden_dropout_prob):\n super(BertConnectionLayer, self).__init__()\n self.ent_attention = BertEntAttention(\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n bi_num_attention_heads,\n )\n self.ent_output = BertEntOutput(\n bi_hidden_size,\n hidden_size,\n v_hidden_size,\n v_hidden_dropout_prob,\n hidden_dropout_prob,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:462-487" + }, + "6129": { + "file_id": 487, + "content": "This code initializes a BertConnectionLayer object with various parameters including hidden size, attention probabilities dropout probability, and intermediate size. It also initializes two other objects: BertEntAttention and BertEntOutput. 
The BertEntAttention object is responsible for performing entity-based attention while the BertEntOutput object is responsible for producing the output of the connection layer.", + "type": "comment" + }, + "6130": { + "file_id": 487, + "content": " )\n self.v_intermediate = BertIntermediate(v_hidden_size,\n v_intermediate_size,\n v_hidden_act)\n self.v_output = BertOutput(v_intermediate_size, v_hidden_size,\n v_hidden_dropout_prob)\n self.t_intermediate = BertIntermediate(hidden_size, intermediate_size,\n hidden_act)\n self.t_output = BertOutput(intermediate_size, hidden_size,\n hidden_dropout_prob)\n self.a_intermediate = BertIntermediate(a_hidden_size,\n a_intermediate_size,\n a_hidden_act)\n self.a_output = BertOutput(a_intermediate_size, a_hidden_size,\n a_hidden_dropout_prob)\n def forward(\n self,\n input_tensor1,\n attention_mask1,\n input_tensor2,\n attention_mask2,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:488-512" + }, + "6131": { + "file_id": 487, + "content": "This code defines a model with three input streams (v, t, and a) and initializes intermediate layers and output layers for each stream. The forward function takes in two pairs of input tensors and attention masks for each stream.", + "type": "comment" + }, + "6132": { + "file_id": 487, + "content": " input_tensor3,\n attention_mask3,\n ):\n ent_output1, ent_output2, ent_output3 = self.ent_attention(\n input_tensor1, attention_mask1, input_tensor2, attention_mask2,\n input_tensor3, attention_mask3)\n attention_output1, attention_output2, attention_output3 = self.ent_output(\n ent_output1, input_tensor1, ent_output2, input_tensor2, ent_output3,\n input_tensor3)\n intermediate_output1 = self.v_intermediate(attention_output1)\n layer_output1 = self.v_output(intermediate_output1, attention_output1)\n intermediate_output2 = self.t_intermediate(attention_output2)\n layer_output2 = self.t_output(intermediate_output2, attention_output2)\n intermediate_output3 = self.a_intermediate(attention_output3)\n layer_output3 = self.a_output(intermediate_output3, attention_output3)\n return layer_output1, layer_output2, layer_output3\nclass BertEncoder(nn.Layer):\n \"\"\"\n ActBert Encoder, consists 3 pathway of multi-BertLayers and BertConnectionLayer.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:513-539" + }, + "6133": { + "file_id": 487, + "content": "This function computes the layer outputs for three different pathways using the ActBert Encoder. It utilizes attention masks and input tensors for each pathway, passing them through various intermediate layers before returning the final layer outputs. 
The BertEncoder class represents a combination of three multi-BertLayers and BertConnectionLayer.", + "type": "comment" + }, + "6134": { + "file_id": 487, + "content": " \"\"\"\n def __init__(\n self,\n v_ent_attention_id,\n t_ent_attention_id,\n a_ent_attention_id,\n fixed_t_layer,\n fixed_v_layer,\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n intermediate_size,\n v_intermediate_size,\n a_intermediate_size,\n hidden_act,\n v_hidden_act,\n a_hidden_act,\n hidden_dropout_prob,\n v_hidden_dropout_prob,\n a_hidden_dropout_prob,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n num_attention_heads,\n v_num_attention_heads,\n a_num_attention_heads,\n bi_num_attention_heads,\n num_hidden_layers,\n v_num_hidden_layers,\n a_num_hidden_layers,\n ):\n super(BertEncoder, self).__init__()\n self.v_ent_attention_id = v_ent_attention_id\n self.t_ent_attention_id = t_ent_attention_id", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:540-576" + }, + "6135": { + "file_id": 487, + "content": "This code defines the `BertEncoder` class, which initializes various parameters for the BERT model's encoder. These parameters include attention IDs, fixed layer positions, hidden sizes, and dropout probabilities for different components of the model. The class extends `super(BertEncoder, self).__init__()`, indicating it inherits from another class.", + "type": "comment" + }, + "6136": { + "file_id": 487, + "content": " self.a_ent_attention_id = a_ent_attention_id\n self.fixed_t_layer = fixed_t_layer\n self.fixed_v_layer = fixed_v_layer\n layer = BertLayer(hidden_size, intermediate_size, hidden_act,\n hidden_dropout_prob, num_attention_heads,\n attention_probs_dropout_prob)\n v_layer = BertLayer(v_hidden_size, v_intermediate_size, v_hidden_act,\n v_hidden_dropout_prob, v_num_attention_heads,\n v_attention_probs_dropout_prob)\n a_layer = BertLayer(a_hidden_size, a_intermediate_size, a_hidden_act,\n a_hidden_dropout_prob, a_num_attention_heads,\n a_attention_probs_dropout_prob)\n connect_layer = BertConnectionLayer(\n hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,\n bi_num_attention_heads, attention_probs_dropout_prob,\n v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:577-594" + }, + "6137": { + "file_id": 487, + "content": "This code initializes three BertLayer objects and one BertConnectionLayer object with different hidden sizes, intermediate sizes, activation functions, dropout probabilities, and attention head numbers. 
These layers will be used for encoding input sequences in the Actor-Critic Transformer (ACT) model.", + "type": "comment" + }, + "6138": { + "file_id": 487, + "content": " intermediate_size, v_intermediate_size, a_intermediate_size,\n hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob,\n v_hidden_dropout_prob, a_hidden_dropout_prob)\n self.layer = nn.LayerList(\n [copy.deepcopy(layer) for _ in range(num_hidden_layers)]) #12\n self.v_layer = nn.LayerList(\n [copy.deepcopy(v_layer) for _ in range(v_num_hidden_layers)]) #2\n self.a_layer = nn.LayerList(\n [copy.deepcopy(a_layer) for _ in range(a_num_hidden_layers)]) #3\n self.c_layer = nn.LayerList([\n copy.deepcopy(connect_layer) for _ in range(len(v_ent_attention_id))\n ] #2 [0,1]\n )\n def forward(\n self,\n txt_embedding,\n image_embedding,\n action_embedding,\n txt_attention_mask,\n image_attention_mask,\n action_attention_mask,\n output_all_encoded_layers=True,\n ):\n v_start, a_start, t_start = 0, 0, 0\n count = 0\n all_encoder_layers_t = []", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:595-622" + }, + "6139": { + "file_id": 487, + "content": "This code defines a model with separate layers for text (txt_layer), vision (v_layer), and action (a_layer) embeddings. It also includes a connect_layer to combine the visual and action information for attention masks. The forward method takes input embeddings, attention masks, and an optional parameter to output all encoded layers.", + "type": "comment" + }, + "6140": { + "file_id": 487, + "content": " all_encoder_layers_v = []\n all_encoder_layers_a = []\n for v_layer_id, a_layer_id, t_layer_id in zip(self.v_ent_attention_id,\n self.a_ent_attention_id,\n self.t_ent_attention_id):\n v_end = v_layer_id\n a_end = a_layer_id\n t_end = t_layer_id\n assert self.fixed_t_layer <= t_end\n assert self.fixed_v_layer <= v_end\n ### region embedding\n for idx in range(v_start,\n self.fixed_v_layer): #两次训练,这个循环都没有进去 #前面的层固定住\n with paddle.no_grad():\n image_embedding, image_attention_probs = self.v_layer[idx](\n image_embedding, image_attention_mask)\n v_start = self.fixed_v_layer\n for idx in range(v_start, v_end):\n image_embedding, image_attention_probs = self.v_layer[idx](\n image_embedding, image_attention_mask)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:623-645" + }, + "6141": { + "file_id": 487, + "content": "This code initializes empty lists for all encoder layers in vision and audio. It then iterates through the given layer IDs, splitting them into vision (v), audio (a), and time (t) layers. The code asserts that the fixed time layer is less than or equal to the last time layer, and the fixed vision layer is less than or equal to the last vision layer. Next, it iterates through all vision layers from the start index up to but not including the fixed vision layer. Inside this loop, it applies the corresponding vision layer to the image embedding and attention mask using Paddle's no_grad context manager. 
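
A minimal sketch of the "fixed layer" pattern described above: the first few layers run under `paddle.no_grad()` so they contribute no gradients, while the remaining layers run normally (toy linear layers and a hypothetical depth stand in for the real BertLayer stack):

```python
import paddle
import paddle.nn as nn

layers = nn.LayerList([nn.Linear(16, 16) for _ in range(4)])
fixed = 2                      # hypothetical number of frozen leading layers
x = paddle.randn([2, 16])

for idx in range(0, fixed):
    with paddle.no_grad():     # frozen: no gradients flow through these layers
        x = layers[idx](x)
for idx in range(fixed, len(layers)):
    x = layers[idx](x)         # remaining layers stay trainable
```
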
Finally, it loops over all vision layers from the start index to the end index (fixed vision layer excluded), applying the corresponding vision layer to the image embedding and attention mask.", + "type": "comment" + }, + "6142": { + "file_id": 487, + "content": " ### action embedding\n for idx in range(a_start, a_end):\n action_embedding, action_attention_probs = self.a_layer[idx](\n action_embedding, action_attention_mask)\n ### text embedding\n for idx in range(t_start, self.fixed_t_layer):\n with paddle.no_grad():\n txt_embedding, txt_attention_probs = self.layer[idx](\n txt_embedding, txt_attention_mask)\n t_start = self.fixed_t_layer\n for idx in range(t_start, t_end):\n txt_embedding, txt_attention_probs = self.layer[idx](\n txt_embedding, txt_attention_mask)\n image_embedding, txt_embedding, action_embedding = self.c_layer[\n count](image_embedding, image_attention_mask, txt_embedding,\n txt_attention_mask, action_embedding,\n action_attention_mask)\n v_start = v_end\n t_start = t_end\n a_start = a_end", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:647-669" + }, + "6143": { + "file_id": 487, + "content": "This code is performing multi-modal embedding by separately handling action, text, and image embeddings. It iterates through layers for each modality to compute the embeddings and attention probs. Finally, it combines the embeddings in a specific order before potentially updating start/end indices.", + "type": "comment" + }, + "6144": { + "file_id": 487, + "content": " count += 1\n if output_all_encoded_layers:\n all_encoder_layers_t.append(txt_embedding)\n all_encoder_layers_v.append(image_embedding)\n all_encoder_layers_a.append(action_embedding)\n for idx in range(v_start, len(self.v_layer)): # 1\n image_embedding, image_attention_probs = self.v_layer[idx](\n image_embedding, image_attention_mask)\n for idx in range(a_start, len(self.a_layer)):\n action_embedding, action_attention_probs = self.a_layer[idx](\n action_embedding, action_attention_mask)\n for idx in range(t_start, len(self.layer)):\n txt_embedding, txt_attention_probs = self.layer[idx](\n txt_embedding, txt_attention_mask)\n # add the end part to finish.\n if not output_all_encoded_layers:\n all_encoder_layers_t.append(txt_embedding) #8, 36, 768\n all_encoder_layers_v.append(image_embedding) #8, 37, 1024\n all_encoder_layers_a.append(action_embedding) #8, 5, 768", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:670-693" + }, + "6145": { + "file_id": 487, + "content": "This code is responsible for encoding text, image, and action inputs in a neural network model. It iterates over the layers of each input type to produce their respective encoded representations. If output_all_encoded_layers is set to True, it appends all intermediate encoded layers to separate lists; otherwise, only the final encoded layer is stored. 
This allows for flexibility in selecting which encoded layers to use in further processing.", + "type": "comment" + }, + "6146": { + "file_id": 487, + "content": " return all_encoder_layers_t, all_encoder_layers_v, all_encoder_layers_a\nclass BertPooler(nn.Layer):\n \"\"\" \"Pool\" the model by simply taking the hidden state corresponding\n to the first token.\n \"\"\"\n def __init__(self, hidden_size, bi_hidden_size):\n super(BertPooler, self).__init__()\n self.dense = nn.Linear(hidden_size, bi_hidden_size)\n self.activation = nn.ReLU()\n def forward(self, hidden_states):\n first_token_tensor = hidden_states[:, 0] #8, 768\n pooled_output = self.dense(first_token_tensor)\n pooled_output = self.activation(pooled_output)\n return pooled_output\nclass BertModel(nn.Layer):\n def __init__(\n self,\n vocab_size,\n max_position_embeddings,\n type_vocab_size,\n v_feature_size,\n a_feature_size,\n num_hidden_layers,\n v_num_hidden_layers,\n a_num_hidden_layers,\n v_ent_attention_id,\n t_ent_attention_id,\n a_ent_attention_id,\n fixed_t_layer,\n fixed_v_layer,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:695-729" + }, + "6147": { + "file_id": 487, + "content": "The code defines a BertPooler class that pools the model by taking the hidden state corresponding to the first token. It also includes a BertModel class with various parameters for initializing the BERT model, including vocab size, max position embeddings, type vocab size, and feature sizes for different entities (v, a). The code also defines attention IDs and fixed layers for tokens and aspects.", + "type": "comment" + }, + "6148": { + "file_id": 487, + "content": " hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n intermediate_size,\n v_intermediate_size,\n a_intermediate_size,\n hidden_act,\n v_hidden_act,\n a_hidden_act,\n hidden_dropout_prob,\n v_hidden_dropout_prob,\n a_hidden_dropout_prob,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n num_attention_heads,\n v_num_attention_heads,\n a_num_attention_heads,\n bi_num_attention_heads,\n ):\n super(BertModel, self).__init__()\n # initilize word embedding\n self.embeddings = BertEmbeddings(vocab_size, max_position_embeddings,\n type_vocab_size, hidden_size,\n hidden_dropout_prob)\n # initlize the region embedding\n self.v_embeddings = BertImageEmbeddings(v_feature_size, v_hidden_size,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:730-759" + }, + "6149": { + "file_id": 487, + "content": "This code is initializing a BertModel class with various parameters including hidden size, attention-related parameters, and embedding types. 
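
The pooling step described above simply takes the hidden state of the first token and passes it through a linear layer plus ReLU; a toy sketch with assumed sizes:

```python
import paddle
import paddle.nn as nn

hidden_size, bi_hidden_size = 16, 8   # toy sizes
dense = nn.Linear(hidden_size, bi_hidden_size)
act = nn.ReLU()

hidden_states = paddle.randn([2, 10, hidden_size])  # [batch, seq_len, hidden]
first_token = hidden_states[:, 0]                   # [batch, hidden]
pooled = act(dense(first_token))                    # [batch, bi_hidden]
print(pooled.shape)  # [2, 8]
```
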
It uses the superclass constructor to initialize the base model and then further customizes it by adding word and image embeddings.", + "type": "comment" + }, + "6150": { + "file_id": 487, + "content": " v_hidden_dropout_prob)\n # initlize the action embedding\n self.a_embeddings = BertActionEmbeddings(a_feature_size, a_hidden_size,\n a_hidden_dropout_prob)\n self.encoder = BertEncoder(\n v_ent_attention_id, t_ent_attention_id, a_ent_attention_id,\n fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size,\n a_hidden_size, bi_hidden_size, intermediate_size,\n v_intermediate_size, a_intermediate_size, hidden_act, v_hidden_act,\n a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob,\n a_hidden_dropout_prob, attention_probs_dropout_prob,\n v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,\n num_attention_heads, v_num_attention_heads, a_num_attention_heads,\n bi_num_attention_heads, num_hidden_layers, v_num_hidden_layers,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:760-775" + }, + "6151": { + "file_id": 487, + "content": "This code is initializing a model for the ACTBERT backbone, which includes an encoder and action embedding. The model has parameters for various hidden sizes, dropout probabilities, attention head numbers, and hidden layer counts for both textual (v), visual (t), and action (a) components. The model also uses different activation functions for each component.", + "type": "comment" + }, + "6152": { + "file_id": 487, + "content": " a_num_hidden_layers)\n self.t_pooler = BertPooler(hidden_size, bi_hidden_size)\n self.v_pooler = BertPooler(v_hidden_size, bi_hidden_size)\n self.a_pooler = BertPooler(a_hidden_size, bi_hidden_size)\n def forward(\n self,\n text_ids,\n action_feat,\n image_feat,\n image_loc,\n token_type_ids=None,\n text_mask=None,\n image_mask=None,\n action_mask=None,\n output_all_encoded_layers=False,\n ):\n \"\"\"\n text_ids: input text ids. Shape: [batch_size, seqence_length]\n action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]\n image_feat: input image feature. Shape: [batch_size, region_length, image_feature_dim]]\n image_loc: input region location. Shape: [batch_size, region_length, region_location_dim]\n token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length]\n text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:776-800" + }, + "6153": { + "file_id": 487, + "content": "This code defines a class for a model that takes in text, action, and image features as inputs. It initializes three poolers for text, action, and visual features. The forward function processes the input data and returns encoded layers based on the inputs received. The output_all_encoded_layers parameter allows getting all encoded layers if set to True.", + "type": "comment" + }, + "6154": { + "file_id": 487, + "content": " image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]\n action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]\n output_all_encoded_layers: is output encoded layers feature or not. 
Type: Bool.\n \"\"\"\n if text_mask is None:\n text_mask = paddle.ones_like(text_ids)\n if token_type_ids is None:\n token_type_ids = paddle.zeros_like(text_ids)\n if image_mask is None:\n image_mask = paddle.ones(image_feat.shape[0],\n image_feat.shape[1]).astype(text_ids.dtype)\n if action_mask is None:\n action_mask = paddle.ones(action_feat.shape[0],\n action_feat.shape[1]).astype(\n text_ids.dtype)\n # We create a 3D attention mask from a 2D tensor mask.\n # Sizes are [batch_size, 1, 1, to_seq_length]\n # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length].", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:801-819" + }, + "6155": { + "file_id": 487, + "content": "This code checks if the input masks for text, token_type, image, and action are None. If any of them are None, it generates a mask with the same shape as the corresponding feature tensor and fills it with ones (real tokens) or zeros (padding tokens). The attention mask is created from the 2D tensor mask to be used in the multi-head attention mechanism, which broadcasts to [batch_size, num_heads, from_seq_length, to_seq_length].", + "type": "comment" + }, + "6156": { + "file_id": 487, + "content": " extended_text_mask = text_mask.unsqueeze(1).unsqueeze(2)\n extended_image_mask = image_mask.unsqueeze(1).unsqueeze(2)\n extended_action_mask = action_mask.unsqueeze(1).unsqueeze(2)\n # Since attention_mask is 1.0 for positions we want to attend and 0.0 for\n # masked positions, this operation will create a tensor which is 0.0 for\n # positions we want to attend and -10000.0 for masked positions.\n # Since we are adding it to the raw scores before the softmax, this is\n # effectively the same as removing these entirely.\n def set_mask(extended_attention_mask):\n extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0\n return extended_attention_mask\n extended_text_mask = set_mask(extended_text_mask)\n extended_image_mask = set_mask(extended_image_mask)\n extended_action_mask = set_mask(extended_action_mask)\n t_embedding_output = self.embeddings(text_ids, token_type_ids)\n v_embedding_output = self.v_embeddings(image_feat, image_loc)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:820-838" + }, + "6157": { + "file_id": 487, + "content": "This code segment is part of a backbone model for the ACTBERT. It creates extended masks for text, image, and action inputs by unsqueezing the existing masks along dimensions 1 and 2. The function set_mask is then used to multiply each mask with -10000.0 at positions where we want to attend, effectively removing those positions from the attention process. This is done for all three input types: text, image, and action. 
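
The mask transformation described above maps 1 (real token) to 0 and 0 (padding) to -10000, and adds broadcast dimensions so the mask can be applied per head and per query position; a short sketch with a toy mask:

```python
import paddle

text_mask = paddle.to_tensor([[1, 1, 1, 0, 0]], dtype="float32")  # 1 = real token, 0 = padding

# [batch, seq_len] -> [batch, 1, 1, seq_len] so it broadcasts over heads and query positions.
extended = text_mask.unsqueeze(1).unsqueeze(2)
extended = (1.0 - extended) * -10000.0
print(extended)  # 0.0 for real tokens, -10000.0 for padded positions
```
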
Finally, the code applies the embeddings to the text inputs using self.embeddings.", + "type": "comment" + }, + "6158": { + "file_id": 487, + "content": " a_embedding_output = self.a_embeddings(action_feat)\n # var = [t_embedding_output, v_embedding_output, a_embedding_output]\n # import numpy as np\n # for i, item in enumerate(var):\n # np.save('tmp/' + str(i)+'.npy', item.numpy())\n encoded_layers_t, encoded_layers_v, encoded_layers_a = self.encoder(\n t_embedding_output,\n v_embedding_output,\n a_embedding_output,\n extended_text_mask,\n extended_image_mask,\n extended_action_mask,\n output_all_encoded_layers=output_all_encoded_layers,\n )\n sequence_output_t = encoded_layers_t[-1] #get item from list\n sequence_output_v = encoded_layers_v[-1]\n sequence_output_a = encoded_layers_a[-1]\n pooled_output_t = self.t_pooler(sequence_output_t)\n pooled_output_v = self.v_pooler(sequence_output_v)\n pooled_output_a = self.a_pooler(sequence_output_a)\n if not output_all_encoded_layers:\n encoded_layers_t = encoded_layers_t[-1]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:839-865" + }, + "6159": { + "file_id": 487, + "content": "This code is part of a backbone for a multimodal model, specifically ACTBERT. It first computes the embedding outputs for text (t), vision (v), and action (a) features. Then it passes these embeddings to an encoder to obtain encoded layers for each modality. The last hidden state from each encoder is used as a sequence output, and a pooled output is also computed using separate poolers for each modality. If output_all_encoded_layers is False, the code reduces the encoded layers to their last hidden states.", + "type": "comment" + }, + "6160": { + "file_id": 487, + "content": " encoded_layers_v = encoded_layers_v[-1]\n encoded_layers_a = encoded_layers_a[-1]\n return encoded_layers_t, encoded_layers_v, encoded_layers_a, \\\n pooled_output_t, pooled_output_v, pooled_output_a\n# For Head\nclass BertPredictionHeadTransform(nn.Layer):\n def __init__(self, hidden_size, hidden_act):\n super(BertPredictionHeadTransform, self).__init__()\n self.dense = nn.Linear(hidden_size, hidden_size)\n if isinstance(hidden_act, str) or (sys.version_info[0] == 2\n and isinstance(hidden_act, str)):\n self.transform_act_fn = ACT2FN[hidden_act]\n else:\n self.transform_act_fn = hidden_act\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n def forward(self, hidden_states):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.transform_act_fn(hidden_states)\n hidden_states = self.LayerNorm(hidden_states)\n return hidden_states\nclass BertLMPredictionHead(nn.Layer):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:866-892" + }, + "6161": { + "file_id": 487, + "content": "This code defines two classes: `BertPredictionHeadTransform` and `BertLMPredictionHead`. The former is a transform layer used in BERT's prediction heads. It applies a dense layer followed by an activation function and layer normalization. 
The latter is the prediction head itself: it takes the input hidden states, applies the transform defined in `BertPredictionHeadTransform`, and projects the result with the tied embedding weights to produce vocabulary logits.", + "type": "comment" + }, + "6162": { + "file_id": 487, + "content": " def __init__(self, hidden_size, hidden_act, bert_model_embedding_weights):\n super(BertLMPredictionHead, self).__init__()\n self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)\n # The output weights are the same as the input embeddings, but there is\n # an output-only bias for each token.\n assert bert_model_embedding_weights.shape[1] == hidden_size\n vocab_size = bert_model_embedding_weights.shape[0]\n # another implementation which would create another big params:\n # self.decoder = nn.Linear(hidden_size, vocab_size) # NOTE bias default: constant 0.0\n # self.decoder.weight = self.create_parameter(shape=[hidden_size, vocab_size],\n # default_initializer=nn.initializer.Assign(\n # bert_model_embedding_weights.t())) # transpose\n self.decoder_weight = bert_model_embedding_weights\n self.decoder_bias = self.create_parameter(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:893-909" + }, + "6163": { + "file_id": 487, + "content": "This code initializes the BertLMPredictionHead class, which is a part of the BERT model. It takes in hidden_size, hidden_act, and bert_model_embedding_weights as parameters. The class uses these to initialize its transform and decoder components. The decoder component has a weight equal to the input embedding weights with an output-only bias for each token. This implementation avoids creating additional large parameters by directly assigning the input embedding weights to the decoder's weight attribute.", + "type": "comment" + }, + "6164": { + "file_id": 487, + "content": " shape=[vocab_size],\n dtype=bert_model_embedding_weights.dtype,\n is_bias=True) # NOTE bias default: constant 0.0\n def forward(self, hidden_states):\n hidden_states = self.transform(hidden_states)\n hidden_states = paddle.tensor.matmul(\n hidden_states, self.decoder_weight,\n transpose_y=True) + self.decoder_bias\n return hidden_states\nclass BertImageActionPredictionHead(nn.Layer):\n def __init__(self, hidden_size, hidden_act, target_size):\n super(BertImageActionPredictionHead, self).__init__()\n self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)\n self.decoder = nn.Linear(hidden_size, target_size)\n def forward(self, hidden_states):\n hidden_states = self.transform(hidden_states)\n hidden_states = self.decoder(hidden_states)\n return hidden_states\nclass BertPreTrainingHeads(nn.Layer):\n def __init__(self, hidden_size, v_hidden_size, a_hidden_size,\n bi_hidden_size, hidden_act, v_hidden_act, a_hidden_act,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:910-937" + }, + "6165": { + "file_id": 487, + "content": "This code completes the BertLMPredictionHead forward pass and defines two further head classes: BertImageActionPredictionHead and BertPreTrainingHeads. The BertImageActionPredictionHead class applies the shared transform and a linear decoder to produce the image or action classification scores. 
Finally, the BertPreTrainingHeads class bundles the pre-training heads: masked language modeling, the sequence (next-sentence) relationship classifier, and the visual and action prediction heads.", + "type": "comment" + }, + "6166": { + "file_id": 487, + "content": " v_target_size, a_target_size, fusion_method,\n bert_model_embedding_weights):\n super(BertPreTrainingHeads, self).__init__()\n self.predictions = BertLMPredictionHead(hidden_size, hidden_act,\n bert_model_embedding_weights)\n self.seq_relationship = nn.Linear(bi_hidden_size, 2)\n self.imagePredictions = BertImageActionPredictionHead(\n v_hidden_size, v_hidden_act, v_target_size) # visual class number\n self.actionPredictions = BertImageActionPredictionHead(\n a_hidden_size, a_hidden_act, a_target_size) # action class number\n self.fusion_method = fusion_method\n self.dropout = nn.Dropout(0.1)\n def forward(self, sequence_output_t, sequence_output_v, sequence_output_a,\n pooled_output_t, pooled_output_v, pooled_output_a):\n if self.fusion_method == 'sum':\n pooled_output = self.dropout(pooled_output_t + pooled_output_v +\n pooled_output_a)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:938-956" + }, + "6167": { + "file_id": 487, + "content": "The code defines the BertPreTrainingHeads class, which initializes the prediction, sequence relationship, image and action heads. The forward function fuses the pooled outputs of the three streams according to the configured fusion method; the 'sum' branch is shown here.", + "type": "comment" + }, + "6168": { + "file_id": 487, + "content": " elif self.fusion_method == 'mul':\n pooled_output = self.dropout(pooled_output_t * pooled_output_v +\n pooled_output_a)\n else:\n assert False\n prediction_scores_t = self.predictions(\n sequence_output_t) # 8, 36 ,30522\n seq_relationship_score = self.seq_relationship(pooled_output) # 8, 2\n prediction_scores_v = self.imagePredictions(\n sequence_output_v) # 8, 37, 1601\n prediction_scores_a = self.actionPredictions(\n sequence_output_a) # 8, 5, 401\n return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score\n@BACKBONES.register()\nclass BertForMultiModalPreTraining(nn.Layer):\n \"\"\"BERT model with multi modal pre-training heads.\n \"\"\"\n def __init__(\n self,\n vocab_size=30522,\n max_position_embeddings=512,\n type_vocab_size=2,\n v_target_size=1601,\n a_target_size=700,\n v_feature_size=2048,\n a_feature_size=2048,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:957-986" + }, + "6169": { + "file_id": 487, + "content": "In this code snippet, the model returns prediction scores for the text (t), video (v), and action (a) inputs along with a sequence relationship score. The snippet shows the 'mul' fusion branch of the multi-modal pre-training BERT model ('sum' is handled in the previous chunk). 
If the fusion method is not 'mul', it will raise an assertion error.", + "type": "comment" + }, + "6170": { + "file_id": 487, + "content": " num_hidden_layers=12,\n v_num_hidden_layers=2,\n a_num_hidden_layers=3,\n t_ent_attention_id=[10, 11],\n v_ent_attention_id=[0, 1],\n a_ent_attention_id=[0, 1],\n fixed_t_layer=0,\n fixed_v_layer=0,\n hidden_size=768,\n v_hidden_size=1024,\n a_hidden_size=768,\n bi_hidden_size=1024,\n intermediate_size=3072,\n v_intermediate_size=1024,\n a_intermediate_size=3072,\n hidden_act=\"gelu\",\n v_hidden_act=\"gelu\",\n a_hidden_act=\"gelu\",\n hidden_dropout_prob=0.1,\n v_hidden_dropout_prob=0.1,\n a_hidden_dropout_prob=0.1,\n attention_probs_dropout_prob=0.1,\n v_attention_probs_dropout_prob=0.1,\n a_attention_probs_dropout_prob=0.1,\n av_attention_probs_dropout_prob=0.1,\n at_attention_probs_dropout_prob=0.1,\n num_attention_heads=12,\n v_num_attention_heads=8,\n a_num_attention_heads=12,\n bi_num_attention_heads=8,\n fusion_method=\"mul\",\n pretrained=None,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:987-1018" + }, + "6171": { + "file_id": 487, + "content": "This code defines a custom transformer backbone model for ACT-BERT, with specific configurations for the text (t), video (v), and audio (a) modalities. It includes parameters such as hidden layer numbers, sizes, activation functions, dropout rates, attention heads, and fusion method. The pretrained parameter is set to None.", + "type": "comment" + }, + "6172": { + "file_id": 487, + "content": " ):\n \"\"\"\n vocab_size: vocabulary size. Default: 30522.\n max_position_embeddings: max position id. Default: 512.\n type_vocab_size: max segment id. Default: 2.\n v_target_size: class number of visual word. Default: 1601.\n a_target_size: class number of action word. Default: 700.\n v_feature_size: input visual feature dimension. Default: 2048.\n a_feature_size: input action feature dimension. Default: 2048.\n num_hidden_layers: number of BertLayer in text transformer. Default: 12.\n v_num_hidden_layers: number of BertLayer in visual transformer. Default: 2.\n a_num_hidden_layers: number of BertLayer in action transformer. Default:3.\n t_ent_attention_id: index id of BertConnectionLayer in text transformer. Default: [10, 11].\n v_ent_attention_id: index id of BertConnectionLayer in visual transformer. Default:[0, 1].\n a_ent_attention_id: index id of BertConnectionLayer in action transformer. Default:[0, 1].\n fixed_t_layer: index id of fixed BertLayer in text transformer. Default: 0.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1019-1034" + }, + "6173": { + "file_id": 487, + "content": "This function defines the input parameters for an ActBERT model, including vocabulary size, maximum position embedding, type vocab size, visual and action target sizes, feature sizes for vision and actions, number of hidden layers in text, visual, and action transformers, index IDs for BertConnectionLayer, and a fixed layer index for the text transformer.", + "type": "comment" + }, + "6174": { + "file_id": 487, + "content": " fixed_v_layer: index id of fixed BertLayer in visual transformer. Default: 0.\n hidden_size: hidden size in text BertLayer. Default: 768.\n v_hidden_size: hidden size in visual BertLayer. Default: 1024.\n a_hidden_size: hidden size in action BertLayer. Default: 768.\n bi_hidden_size: hidden size in BertConnectionLayer. Default: 1024,\n intermediate_size: intermediate size in text BertLayer. 
Default: 3072.\n v_intermediate_size: intermediate size in visual BertLayer. Default: 1024.\n a_intermediate_size: intermediate size in text BertLayer. Default: 3072.\n hidden_act: hidden activation function in text BertLayer. Default: \"gelu\".\n v_hidden_act: hidden activation function in visual BertLayer. Default: \"gelu\".\n a_hidden_act: hidden activation function in action BertLayer. Default: \"gelu\".\n hidden_dropout_prob: hidden dropout probability in text Embedding Layer. Default: 0.1\n v_hidden_dropout_prob: hidden dropout probability in visual Embedding Layer. Default: 0.1", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1035-1047" + }, + "6175": { + "file_id": 487, + "content": "This code defines parameters for different BertLayers in a model. Fixed_v_layer is the index of a fixed BertLayer in the visual transformer, hidden_size is the hidden size for text and other BERT layers, v_hidden_size is the hidden size for visual BERT layer, a_hidden_size is the hidden size for action BERT layer, bi_hidden_size is the hidden size for BertConnectionLayer, intermediate_size is the intermediate size for text and other BERT layers, v_intermediate_size is the intermediate size for visual BERT layer, a_intermediate_size is the intermediate size for action BERT layer, hidden_act is the activation function for text BERT layer, v_hidden_act is the activation function for visual BERT layer, a_hidden_act is the activation function for action BERT layer, and hidden_dropout_prob is the dropout probability for text embedding layer. All default values are provided in case no other values are specified.", + "type": "comment" + }, + "6176": { + "file_id": 487, + "content": " a_hidden_dropout_prob: hidden dropout probability in action Embedding Layer. Default: 0.1\n attention_probs_dropout_prob: attention dropout probability in text BertLayer. Default: 0.1\n v_attention_probs_dropout_prob: attention dropout probability in visual BertLayer. Default: 0.1\n a_attention_probs_dropout_prob: attention dropout probability in action BertLayer. Default: 0.1\n av_attention_probs_dropout_prob: attention dropout probability in action-visual BertConnectionLayer. Default: 0.1\n at_attention_probs_dropout_prob: attention dropout probability in action-text BertConnectionLayer. Default: 0.1\n num_attention_heads: number of heads in text BertLayer. Default: 12.\n v_num_attention_heads: number of heads in visual BertLayer. Default: 8.\n a_num_attention_heads: number of heads in action BertLayer. Default: 12.\n bi_num_attention_heads: number of heads in BertConnectionLayer. Default: 8.\n fusion_method: methods of fusing pooled output from 3 transformer. Default: \"mul\".", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1048-1058" + }, + "6177": { + "file_id": 487, + "content": "This code snippet defines default values for various parameters in a transformer model. These parameters include hidden dropout probabilities, attention dropout probabilities, number of attention heads, and fusion methods. 
The default values are provided for the text, visual, and action BertLayers as well as the BertConnectionLayer.", + "type": "comment" + }, + "6178": { + "file_id": 487, + "content": " \"\"\"\n super(BertForMultiModalPreTraining, self).__init__()\n self.pretrained = pretrained\n self.vocab_size = vocab_size\n self.a_target_size = a_target_size\n self.bert = BertModel(\n vocab_size,\n max_position_embeddings,\n type_vocab_size,\n v_feature_size,\n a_feature_size,\n num_hidden_layers,\n v_num_hidden_layers,\n a_num_hidden_layers,\n v_ent_attention_id,\n t_ent_attention_id,\n a_ent_attention_id,\n fixed_t_layer,\n fixed_v_layer,\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n intermediate_size,\n v_intermediate_size,\n a_intermediate_size,\n hidden_act,\n v_hidden_act,\n a_hidden_act,\n hidden_dropout_prob,\n v_hidden_dropout_prob,\n a_hidden_dropout_prob,\n attention_probs_dropout_prob,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1059-1092" + }, + "6179": { + "file_id": 487, + "content": "This code initializes an instance of the BertForMultiModalPreTraining class, which is a pre-trained model for multi-modal tasks. It takes in various parameters such as vocab_size, max_position_embeddings, type_vocab_size, v_feature_size, a_feature_size, num_hidden_layers, v_num_hidden_layers, a_num_hidden_layers, v_ent_attention_id, t_ent_attention_id, a_ent_attention_id, fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size, intermediate_size, v_intermediate_size, a_intermediate_size, hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob, and a_hidden_dropout_prob. These parameters define the architecture and behavior of the model. The super() function is used to call a method from the parent class, in this case, BertModel. The pretrained variable indicates whether the model should use pre-trained weights or not.", + "type": "comment" + }, + "6180": { + "file_id": 487, + "content": " v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n num_attention_heads,\n v_num_attention_heads,\n a_num_attention_heads,\n bi_num_attention_heads,\n )\n self.cls = BertPreTrainingHeads(\n hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,\n hidden_act, v_hidden_act, a_hidden_act, v_target_size,\n a_target_size, fusion_method,\n self.bert.embeddings.word_embeddings.weight)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, (nn.Linear, nn.Embedding)):\n weight_init_(layer, 'Normal', std=0.02)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1093-1116" + }, + "6181": { + "file_id": 487, + "content": "This code initializes the parameters of a pre-trained ACTBERT model. 
It checks if the model has been pre-trained and, if not, initializes the weights for the layers (using normal distribution with standard deviation 0.02).", + "type": "comment" + }, + "6182": { + "file_id": 487, + "content": " elif isinstance(layer, nn.LayerNorm):\n weight_init_(layer, 'Constant', value=1)\n def forward(\n self,\n text_ids, #8,36\n action_feat, #8,5,2048\n image_feat, #8,37,2048\n image_loc, #8,37,5\n token_type_ids=None, #8,36\n text_mask=None, #8,36\n image_mask=None, #8,37\n action_mask=None, #8,5\n ):\n \"\"\"\n text_ids: input text ids. Shape: [batch_size, seqence_length]\n action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]\n image_feat: input image feature. Shape: [batch_size, region_length+1, image_feature_dim]], add 1 for image global feature.\n image_loc: input region location. Shape: [batch_size, region_length+1, region_location_dim], add 1 for image global feature location.\n token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length]\n text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1117-1137" + }, + "6183": { + "file_id": 487, + "content": "The code defines a function \"forward\" that takes text_ids, action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, and action_mask as input. The text_ids represent input text ids of shape [batch_size, sequence_length]. Action_feat is the input action feature of shape [batch_size, action_length, action_feature_dim], while image_feat is the input image feature of shape [batch_size, region_length+1, image_feature_dim] (adding 1 for global image feature). Image_loc represents the input region location of shape [batch_size, region_length+1, region_location_dim] (adding 1 for global image feature location). Token_type_ids represent segment ids of each video clip and are of shape [batch_size, sequence_length]. Text_mask is a binary mask representing real tokens as 1 and padding tokens as 0 with shape [batch_size, sequence_length]. Image_mask and action_mask also serve similar functions but for image and action respectively.", + "type": "comment" + }, + "6184": { + "file_id": 487, + "content": " image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]\n action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]\n \"\"\"\n sequence_output_t, sequence_output_v, sequence_output_a, \\\n pooled_output_t, pooled_output_v, pooled_output_a = self.bert(\n text_ids,\n action_feat,\n image_feat,\n image_loc,\n token_type_ids,\n text_mask,\n image_mask,\n action_mask,\n output_all_encoded_layers=False,\n )\n prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.cls(\n sequence_output_t, sequence_output_v, sequence_output_a,\n pooled_output_t, pooled_output_v, pooled_output_a)\n return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score", + "type": "code", + "location": "/paddlevideo/modeling/backbones/actbert.py:1138-1158" + }, + "6185": { + "file_id": 487, + "content": "This code is a function that takes in text IDs, action feature, image feature, image location, token type IDs, text mask, image mask, and action mask as inputs. 
It uses the BERT model to process these inputs and returns prediction scores for each input (text, vision, action) and a sequence relationship score.", + "type": "comment" + }, + "6186": { + "file_id": 488, + "content": "/paddlevideo/modeling/backbones/adds.py", + "type": "filepath" + }, + "6187": { + "file_id": 488, + "content": "The code imports necessary libraries, registers the ADDS depth-estimation backbone with PaddleVideo, performs vector transformations, defines network creation functions, computes depth prediction, initializes the PaddleVideo backbone, includes a Project3D layer, calculates SSIM loss, and creates a deep learning model for image processing with ResNet V1.5, DepthDecoder, and PoseDecoder classes. The pose estimation model supports diverse inputs, handles day/night scenarios, computes parameters, generates warped images, and selects data based on conditions.", + "type": "summary" + }, + "6188": { + "file_id": 488, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nfrom collections import OrderedDict\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import BatchNorm2D, Conv2D\nfrom paddle.nn.initializer import Constant, Normal\nfrom paddle.vision.models import ResNet\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import kaiming_normal_, _calculate_fan_in_and_fan_out\nzeros_ = Constant(value=0.)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:1-30" + }, + "6189": { + "file_id": 488, + "content": "This code imports the necessary libraries, defines initializer constants, and registers the backbone in PaddleVideo's backbone registry. 
It also includes comments for licensing and copyright information as well as function definitions for weight initialization and calculating fan-in and fan-out of layers.", + "type": "comment" + }, + "6190": { + "file_id": 488, + "content": "ones_ = Constant(value=1.)\nnormal_ = Normal(mean=0, std=1e-3)\ndef disp_to_depth(disp, min_depth, max_depth):\n \"\"\"Convert network's sigmoid output into depth prediction\n The formula for this conversion is given in the 'additional considerations'\n section of the paper.\n \"\"\"\n min_disp = 1 / max_depth\n max_disp = 1 / min_depth\n scaled_disp = min_disp + (max_disp - min_disp) * disp\n depth = 1 / scaled_disp\n return scaled_disp, depth\ndef gram_matrix(y):\n (b, ch, h, w) = y.shape\n features = y.reshape([b, ch, w * h])\n features_t = paddle.transpose(features, [0, 2, 1])\n gram = features.bmm(features_t) / (ch * h * w)\n return gram\ndef convt_bn_relu(in_channels,\n out_channels,\n kernel_size,\n stride=1,\n padding=0,\n output_padding=0,\n bn=True,\n relu=True):\n bias = not bn\n layers = []\n layers.append(\n nn.Conv2DTranspose(in_channels,\n out_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:31-67" + }, + "6191": { + "file_id": 488, + "content": "Code snippet defines three functions - \"disp_to_depth\" for converting network's sigmoid output into depth prediction, \"gram_matrix\" for computing the Gram matrix of feature maps and \"convt_bn_relu\" for creating a convolution layer with batch normalization and ReLU activation.", + "type": "comment" + }, + "6192": { + "file_id": 488, + "content": " kernel_size,\n stride,\n padding,\n output_padding,\n bias_attr=bias))\n if bn:\n layers.append(nn.BatchNorm2D(out_channels))\n if relu:\n layers.append(nn.LeakyReLU(0.2))\n layers = nn.Sequential(*layers)\n # initialize the weights\n for m in layers.sublayers(include_self=True):\n if isinstance(m, nn.Conv2DTranspose):\n normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.BatchNorm2D):\n ones_(m.weight)\n zeros_(m.bias)\n return layers\ndef transformation_from_parameters(axisangle, translation, invert=False):\n \"\"\"Convert the network's (axisangle, translation) output into a 4x4 matrix\n \"\"\"\n R = rot_from_axisangle(axisangle)\n t = translation.clone()\n if invert:\n R = R.transpose([0, 2, 1])\n t *= -1\n T = get_translation_matrix(t)\n if invert:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:68-104" + }, + "6193": { + "file_id": 488, + "content": "The code defines a function for creating a convolutional transpose layer, adding batch normalization and Leaky ReLU activation if specified. It also includes weight initialization for the created layers. 
The second function converts network's (axisangle, translation) output into a 4x4 matrix based on parameters and an optional invert flag.", + "type": "comment" + }, + "6194": { + "file_id": 488, + "content": " M = paddle.matmul(R, T)\n else:\n M = paddle.matmul(T, R)\n return M\ndef get_translation_matrix(translation_vector):\n \"\"\"Convert a translation vector into a 4x4 transformation matrix\n \"\"\"\n t = translation_vector.reshape([-1, 3, 1])\n gather_object = paddle.stack([\n paddle.zeros([\n translation_vector.shape[0],\n ], paddle.float32),\n paddle.ones([\n translation_vector.shape[0],\n ], paddle.float32),\n paddle.squeeze(t[:, 0], axis=-1),\n paddle.squeeze(t[:, 1], axis=-1),\n paddle.squeeze(t[:, 2], axis=-1),\n ])\n gather_index = paddle.to_tensor([\n [1],\n [0],\n [0],\n [2],\n [0],\n [1],\n [0],\n [3],\n [0],\n [0],\n [1],\n [4],\n [0],\n [0],\n [0],\n [1],\n ])\n T = paddle.gather_nd(gather_object, gather_index)\n T = T.reshape([4, 4, -1]).transpose((2, 0, 1))\n return T\ndef rot_from_axisangle(vec):\n \"\"\"Convert an axisangle rotation into a 4x4 transformation matrix", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:105-151" + }, + "6195": { + "file_id": 488, + "content": "get_translation_matrix: Converts translation vector to a 4x4 transformation matrix.\nrot_from_axisangle: Converts axis-angle rotation into a 4x4 transformation matrix.", + "type": "comment" + }, + "6196": { + "file_id": 488, + "content": " (adapted from https://github.com/Wallacoloo/printipi)\n Input 'vec' has to be Bx1x3\n \"\"\"\n angle = paddle.norm(vec, 2, 2, True)\n axis = vec / (angle + 1e-7)\n ca = paddle.cos(angle)\n sa = paddle.sin(angle)\n C = 1 - ca\n x = axis[..., 0].unsqueeze(1)\n y = axis[..., 1].unsqueeze(1)\n z = axis[..., 2].unsqueeze(1)\n xs = x * sa\n ys = y * sa\n zs = z * sa\n xC = x * C\n yC = y * C\n zC = z * C\n xyC = x * yC\n yzC = y * zC\n zxC = z * xC\n gather_object = paddle.stack([\n paddle.squeeze(x * xC + ca, axis=(-1, -2)),\n paddle.squeeze(xyC - zs, axis=(-1, -2)),\n paddle.squeeze(zxC + ys, axis=(-1, -2)),\n paddle.squeeze(xyC + zs, axis=(-1, -2)),\n paddle.squeeze(y * yC + ca, axis=(-1, -2)),\n paddle.squeeze(yzC - xs, axis=(-1, -2)),\n paddle.squeeze(zxC - ys, axis=(-1, -2)),\n paddle.squeeze(yzC + xs, axis=(-1, -2)),\n paddle.squeeze(z * zC + ca, axis=(-1, -2)),\n paddle.ones([\n vec.shape[0],\n ], dtype=paddle.float32),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:152-188" + }, + "6197": { + "file_id": 488, + "content": "This code performs rotation operations on a 3D vector 'vec'. It calculates the angle and axis of rotation, then applies trigonometry to compute rotation matrices. 
Finally, it gathers transformed vectors using stacked tensor operations.", + "type": "comment" + }, + "6198": { + "file_id": 488, + "content": " paddle.zeros([\n vec.shape[0],\n ], dtype=paddle.float32)\n ])\n gather_index = paddle.to_tensor([\n [0],\n [1],\n [2],\n [10],\n [3],\n [4],\n [5],\n [10],\n [6],\n [7],\n [8],\n [10],\n [10],\n [10],\n [10],\n [9],\n ])\n rot = paddle.gather_nd(gather_object, gather_index)\n rot = rot.reshape([4, 4, -1]).transpose((2, 0, 1))\n return rot\ndef upsample(x):\n \"\"\"Upsample input tensor by a factor of 2\n \"\"\"\n return F.interpolate(x, scale_factor=2, mode=\"nearest\")\ndef get_smooth_loss(disp, img):\n \"\"\"Computes the smoothness loss for a disparity image\n The color image is used for edge-aware smoothness\n \"\"\"\n grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])\n grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])\n grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]),\n 1,\n keepdim=True)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:189-231" + }, + "6199": { + "file_id": 488, + "content": "Code defines three functions: \"get_rot\", \"upsample\", and \"get_smooth_loss\". get_rot performs a gather operation on a tensor, reshapes the result, then transposes it. upsample interpolates an input tensor by doubling its size. get_smooth_loss computes the smoothness loss for disparity images using gradients of disparities and color image edges.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/62.json b/docs/data/62.json new file mode 100644 index 000000000..2f9639ea9 --- /dev/null +++ b/docs/data/62.json @@ -0,0 +1,549 @@ +{ + "6200": { + "file_id": 488, + "content": " grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]),\n 1,\n keepdim=True)\n grad_disp_x *= paddle.exp(-grad_img_x)\n grad_disp_y *= paddle.exp(-grad_img_y)\n return grad_disp_x.mean() + grad_disp_y.mean()\ndef conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):\n \"\"\"3x3 convolution with padding\"\"\"\n return nn.Conv2D(in_planes,\n out_planes,\n kernel_size=3,\n stride=stride,\n padding=dilation,\n groups=groups,\n bias_attr=False,\n dilation=dilation)\ndef conv1x1(in_planes, out_planes, stride=1):\n \"\"\"1x1 convolution\"\"\"\n return nn.Conv2D(in_planes,\n out_planes,\n kernel_size=1,\n stride=stride,\n bias_attr=False)\ndef resnet_multiimage_input(num_layers, num_input_images=1):\n \"\"\"Constructs a ResNet model.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:232-264" + }, + "6201": { + "file_id": 488, + "content": "This code defines functions for creating convolutional layers and a ResNet model with multiple input images. The functions include 3x3 and 1x1 convolutions, along with a function that constructs the ResNet model itself. The ResNet model can handle multiple input images by combining gradients from each image channel.", + "type": "comment" + }, + "6202": { + "file_id": 488, + "content": " Args:\n num_layers (int): Number of resnet layers. 
Must be 18 or 50\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n num_input_images (int): Number of frames stacked as input\n \"\"\"\n assert num_layers in [18, 50], \"Can only run with 18 or 50 layer resnet\"\n blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers]\n block_type = {18: BasicBlock, 50: Bottleneck}[num_layers]\n model = ResNetMultiImageInput(block_type,\n num_layers,\n blocks,\n num_input_images=num_input_images)\n model.init_weights()\n return model\nclass ConvBlock(nn.Layer):\n \"\"\"Layer to perform a convolution followed by ELU\n \"\"\"\n def __init__(self, in_channels, out_channels):\n super(ConvBlock, self).__init__()\n self.conv = Conv3x3(in_channels, out_channels)\n self.nonlin = nn.ELU()\n def forward(self, x):\n out = self.conv(x)\n out = self.nonlin(out)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:265-294" + }, + "6203": { + "file_id": 488, + "content": "This code defines a function that creates a ResNet model with multiple image inputs. The model takes in the number of resnet layers (18 or 50), whether to use pretrained weights, and the number of input frames to stack. It then creates blocks based on the layer type and number of layers provided, and initializes the model's weights. The ConvBlock class performs a convolution followed by ELU activation.", + "type": "comment" + }, + "6204": { + "file_id": 488, + "content": " return out\nclass Conv3x3(nn.Layer):\n \"\"\"Layer to pad and convolve input\n \"\"\"\n def __init__(self, in_channels, out_channels, use_refl=True):\n super(Conv3x3, self).__init__()\n if use_refl:\n self.pad = nn.Pad2D(1, mode='reflect')\n else:\n self.pad = nn.Pad2D(1)\n self.conv = nn.Conv2D(int(in_channels), int(out_channels), 3)\n def forward(self, x):\n out = self.pad(x)\n out = self.conv(out)\n return out\nclass BackprojectDepth(nn.Layer):\n \"\"\"Layer to transform a depth image into a point cloud\n \"\"\"\n def __init__(self, batch_size, height, width):\n super(BackprojectDepth, self).__init__()\n self.batch_size = batch_size\n self.height = height\n self.width = width\n meshgrid = np.meshgrid(range(self.width),\n range(self.height),\n indexing='xy')\n id_coords = np.stack(meshgrid, axis=0).astype(np.float32)\n self.id_coords = self.create_parameter(shape=list(id_coords.shape),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:295-330" + }, + "6205": { + "file_id": 488, + "content": "Conv3x3 is a layer that pads and convolves the input.\nBackprojectDepth transforms a depth image into a point cloud.", + "type": "comment" + }, + "6206": { + "file_id": 488, + "content": " dtype=paddle.float32)\n self.id_coords.set_value(id_coords)\n self.add_parameter(\"id_coords\", self.id_coords)\n self.id_coords.stop_gradient = True\n self.ones = self.create_parameter(\n shape=[self.batch_size, 1, self.height * self.width],\n default_initializer=ones_)\n self.add_parameter(\"ones\", self.ones)\n self.ones.stop_gradient = True\n pix_coords = paddle.unsqueeze(\n paddle.stack([\n self.id_coords[0].reshape([\n -1,\n ]), self.id_coords[1].reshape([\n -1,\n ])\n ], 0), 0)\n pix_coords = pix_coords.tile([batch_size, 1, 1])\n pix_coords = paddle.concat([pix_coords, self.ones], 1)\n self.pix_coords = self.create_parameter(shape=list(pix_coords.shape), )\n self.pix_coords.set_value(pix_coords)\n self.add_parameter(\"pix_coords\", self.pix_coords)\n self.pix_coords.stop_gradient = True", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:331-355" + }, + "6207": 
{ + "file_id": 488, + "content": "This code creates and initializes parameters for a backbone in PaddleVideo, specifically for the ID and pixel coordinates. It sets the gradients to stop, meaning they won't be updated during backpropagation. The code uses paddling operations like unsqueeze, stack, tile, and concat for parameter creation and manipulation.", + "type": "comment" + }, + "6208": { + "file_id": 488, + "content": " def forward(self, depth, inv_K):\n cam_points = paddle.matmul(inv_K[:, :3, :3], self.pix_coords)\n cam_points = depth.reshape([self.batch_size, 1, -1]) * cam_points\n cam_points = paddle.concat([cam_points, self.ones], 1)\n return cam_points\nclass Project3D(nn.Layer):\n \"\"\"Layer which projects 3D points into a camera with intrinsics K and at position T\n \"\"\"\n def __init__(self, batch_size, height, width, eps=1e-7):\n super(Project3D, self).__init__()\n self.batch_size = batch_size\n self.height = height\n self.width = width\n self.eps = eps\n def forward(self, points, K, T):\n P = paddle.matmul(K, T)[:, :3, :]\n cam_points = paddle.matmul(P, points)\n pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) +\n self.eps)\n pix_coords = pix_coords.reshape(\n [self.batch_size, 2, self.height, self.width])\n pix_coords = pix_coords.transpose([0, 2, 3, 1])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:357-385" + }, + "6209": { + "file_id": 488, + "content": "The code defines a Project3D layer that projects 3D points into a camera with intrinsics K and at position T. It includes the forward pass, initialization, and required parameters such as batch_size, height, and width. The forward function calculates camera projection points by multiplying the intrinsic matrix K with the translation matrix T, then projects the points to pixels coordinates.", + "type": "comment" + }, + "6210": { + "file_id": 488, + "content": " pix_coords[..., 0] /= self.width - 1\n pix_coords[..., 1] /= self.height - 1\n pix_coords = (pix_coords - 0.5) * 2\n return pix_coords\nclass SSIM(nn.Layer):\n \"\"\"Layer to compute the SSIM loss between a pair of images\n \"\"\"\n def __init__(self):\n super(SSIM, self).__init__()\n self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.refl = nn.Pad2D(1, mode='reflect')\n self.C1 = 0.01**2\n self.C2 = 0.03**2\n def forward(self, x, y):\n x = self.refl(x)\n y = self.refl(y)\n mu_x = self.mu_x_pool(x)\n mu_y = self.mu_y_pool(y)\n sigma_x = self.sig_x_pool(x**2) - mu_x**2\n sigma_y = self.sig_y_pool(y**2) - mu_y**2\n sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:386-417" + }, + "6211": { + "file_id": 488, + "content": "The code defines a function `pix_coords` that normalizes pixel coordinates and a class `SSIM` for computing the Structural Similarity Index (SSIM) loss between two images. 
It initializes variables for mean, variance pooling, and applies padding to input images before calculating SSIM loss using provided formulas.", + "type": "comment" + }, + "6212": { + "file_id": 488, + "content": " SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)\n SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2)\n return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1)\nclass ResNetMultiImageInput(ResNet):\n \"\"\"Constructs a resnet model with varying number of input images.\n Adapted from https://github.com/pypaddle/vision/blob/master/paddlevision/models/resnet.py\n \"\"\"\n def __init__(self, block, depth, layers, num_input_images=1):\n super(ResNetMultiImageInput, self).__init__(block, depth)\n self.inplanes = 64\n self.conv1 = nn.Conv2D(num_input_images * 3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = nn.BatchNorm2D(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block, 64, layers[0])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:419-441" + }, + "6213": { + "file_id": 488, + "content": "The code defines a ResNet model with multiple input images. It includes a convolution layer, batch normalization, ReLU activation, and max pooling for initial processing. The class \"ResNetMultiImageInput\" inherits from the base \"ResNet\" class and can handle different numbers of input images.", + "type": "comment" + }, + "6214": { + "file_id": 488, + "content": " self.layer2 = self._make_layer(block, 128, layers[1], stride=2)\n self.layer3 = self._make_layer(block, 256, layers[2], stride=2)\n self.layer4 = self._make_layer(block, 512, layers[3], stride=2)\n def init_weights(self):\n for layer in self.sublayers(include_self=True):\n if isinstance(layer, nn.Conv2D):\n kaiming_normal_(layer.weight,\n mode='fan_out',\n nonlinearity='relu')\n elif isinstance(layer, nn.BatchNorm2D):\n ones_(layer.weight)\n zeros_(layer.bias)\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:442-466" + }, + "6215": { + "file_id": 488, + "content": "The code defines a model architecture with multiple layers, including ConvBNLayer. It initializes the weights of these layers using specific methods and constraints for convolutional and batch normalization layers. 
This is typically done to improve performance and stability in deep learning models.", + "type": "comment" + }, + "6216": { + "file_id": 488, + "content": " name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values\n and name the restored parameters, values initialization\n are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n bias_attr=False)\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels)\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:467-497" + }, + "6217": { + "file_id": 488, + "content": "The `ConvBNLayer` class is a custom layer that consists of a convolution operation and batch normalization. It initializes the Conv2D layer and BatchNorm2D layer with specified parameters, and applies them sequentially in the forward pass.", + "type": "comment" + }, + "6218": { + "file_id": 488, + "content": " if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BasicBlock(nn.Layer):\n expansion = 1\n def __init__(self,\n inplanes,\n planes,\n stride=1,\n downsample=None,\n groups=1,\n base_width=64,\n dilation=1,\n norm_layer=None):\n super(BasicBlock, self).__init__()\n if norm_layer is None:\n norm_layer = nn.BatchNorm2D\n if groups != 1 or base_width != 64:\n raise ValueError(\n 'BasicBlock only supports groups=1 and base_width=64')\n if dilation > 1:\n raise NotImplementedError(\n \"Dilation > 1 not supported in BasicBlock\")\n # Both self.conv1 and self.downsample layers downsample the input when stride != 1\n self.conv1 = conv3x3(inplanes, planes, stride)\n self.bn1 = norm_layer(planes)\n self.relu = nn.ReLU()\n self.conv2 = conv3x3(planes, planes)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:498-528" + }, + "6219": { + "file_id": 488, + "content": "The code defines a class called `BasicBlock` which is an instance of the `nn.Layer` class, and initializes it with parameters such as `inplanes`, `planes`, `stride`, `downsample`, `groups`, `base_width`, `dilation`, and `norm_layer`. It also performs some checks to ensure that certain values match the block's requirements, and then initializes specific layers like `conv1`, `bn1`, and `relu` accordingly. 
The code also handles cases where `stride` is not equal to 1 by downsampling the input through both `self.conv1` and `self.downsample`.", + "type": "comment" + }, + "6220": { + "file_id": 488, + "content": " self.bn2 = norm_layer(planes)\n self.downsample = downsample\n self.stride = stride\n def forward(self, x):\n identity = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n if self.downsample is not None:\n identity = self.downsample(x)\n out += identity\n out = self.relu(out)\n return out\nclass Bottleneck(nn.Layer):\n # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)\n # while original implementation places the stride at the first 1x1 convolution(self.conv1)\n # according to \"Deep residual learning for image recognition\"https://arxiv.org/abs/1512.03385.\n # This variant is also known as ResNet V1.5 and improves accuracy according to\n # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.\n expansion = 4\n def __init__(self,\n inplanes,\n planes,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:529-563" + }, + "6221": { + "file_id": 488, + "content": "The code defines a Bottleneck layer with stride at the 3x3 convolution (self.conv2) for ResNet V1.5, improving accuracy according to sources like \"Deep residual learning for image recognition\" and \"NVIDIA: ResNet_50_v1_5_for_PyTorch\". The Bottleneck layer has an expansion of 4, and its class initializes inplanes, planes, and other parameters.", + "type": "comment" + }, + "6222": { + "file_id": 488, + "content": " stride=1,\n downsample=None,\n groups=1,\n base_width=64,\n dilation=1,\n norm_layer=None):\n super(Bottleneck, self).__init__()\n if norm_layer is None:\n norm_layer = nn.BatchNorm2D\n width = int(planes * (base_width / 64.)) * groups\n self.conv1 = conv1x1(inplanes, width)\n self.bn1 = norm_layer(width)\n self.conv2 = conv3x3(width, width, stride, groups, dilation)\n self.bn2 = norm_layer(width)\n self.conv3 = conv1x1(width, planes * self.expansion)\n self.bn3 = norm_layer(planes * self.expansion)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n def forward(self, x):\n identity = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:564-597" + }, + "6223": { + "file_id": 488, + "content": "The code defines a Bottleneck class for a convolutional neural network. It has multiple layers of 1x1 and 3x3 convolutions, with batch normalization and ReLU activation functions. 
The class also supports downsampling and stride configuration options.", + "type": "comment" + }, + "6224": { + "file_id": 488, + "content": " if self.downsample is not None:\n identity = self.downsample(x)\n out += identity\n out = self.relu(out)\n return out\nclass DepthDecoder(nn.Layer):\n def __init__(self,\n num_ch_enc,\n scales=range(4),\n num_output_channels=1,\n use_skips=True):\n super(DepthDecoder, self).__init__()\n self.num_output_channels = num_output_channels\n self.use_skips = use_skips\n self.upsample_mode = 'nearest'\n self.scales = scales\n self.num_ch_enc = num_ch_enc\n self.num_ch_dec = np.array([16, 32, 64, 128, 256])\n # decoder\n self.convs = OrderedDict()\n for i in range(4, -1, -1):\n # upconv_0\n num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i +\n 1]\n num_ch_out = self.num_ch_dec[i]\n self.convs[(\"upconv\", i, 0)] = ConvBlock(num_ch_in, num_ch_out)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:599-631" + }, + "6225": { + "file_id": 488, + "content": "The code defines a class called DepthDecoder. It takes in parameters such as number of channels, scales, output channel count, and use_skips. The class initializes various attributes like num_output_channels, use_skips, upsample_mode, and scale. It also creates an OrderedDict named 'convs' which stores ConvBlock instances based on the given parameters.", + "type": "comment" + }, + "6226": { + "file_id": 488, + "content": " # upconv_1\n num_ch_in = self.num_ch_dec[i]\n if self.use_skips and i > 0:\n num_ch_in += self.num_ch_enc[i - 1]\n num_ch_out = self.num_ch_dec[i]\n self.convs[(\"upconv\", i, 1)] = ConvBlock(num_ch_in, num_ch_out)\n for s in self.scales:\n self.convs[(\"dispconv\", s)] = Conv3x3(self.num_ch_dec[s],\n self.num_output_channels)\n self.decoder = nn.LayerList(list(self.convs.values()))\n self.sigmoid = nn.Sigmoid()\n def forward(self, input_features):\n outputs = {}\n # decoder\n x = input_features[-1]\n for i in range(4, -1, -1):\n x = self.convs[(\"upconv\", i, 0)](x)\n x = [upsample(x)]\n if self.use_skips and i > 0:\n x += [input_features[i - 1]]\n x = paddle.concat(x, 1)\n x = self.convs[(\"upconv\", i, 1)](x)\n if i in self.scales:\n outputs[(\"disp\", i)] = self.sigmoid(self.convs[(\"dispconv\",", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:633-660" + }, + "6227": { + "file_id": 488, + "content": "Code defines a convolutional network architecture for image decoding. It uses ConvBlock layers and Conv3x3 layers in the decoder section. The input features are upsampled and combined with previous encoder outputs at each stage, and then passed through convolution layers. 
The results are stored in the 'outputs' dictionary.", + "type": "comment" + }, + "6228": { + "file_id": 488, + "content": " i)](x))\n return outputs\nclass PoseDecoder(nn.Layer):\n def __init__(self,\n num_ch_enc,\n num_input_features,\n num_frames_to_predict_for=None,\n stride=1):\n super(PoseDecoder, self).__init__()\n self.num_ch_enc = num_ch_enc\n self.num_input_features = num_input_features\n if num_frames_to_predict_for is None:\n num_frames_to_predict_for = num_input_features - 1\n self.num_frames_to_predict_for = num_frames_to_predict_for\n self.convs = OrderedDict()\n self.convs[(\"squeeze\")] = nn.Conv2D(self.num_ch_enc[-1], 256, 1)\n self.convs[(\"pose\", 0)] = nn.Conv2D(num_input_features * 256, 256, 3,\n stride, 1)\n self.convs[(\"pose\", 1)] = nn.Conv2D(256, 256, 3, stride, 1)\n self.convs[(\"pose\", 2)] = nn.Conv2D(256, 6 * num_frames_to_predict_for,\n 1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:661-686" + }, + "6229": { + "file_id": 488, + "content": "The PoseDecoder class in this code is a neural network layer that uses convolutional layers to predict pose for a given number of frames. It takes in the encoder channel counts, the number of input features, and an optional parameter for the number of frames to predict. The layer contains three convolution layers with different parameters for each.", + "type": "comment" + }, + "6230": { + "file_id": 488, + "content": " self.relu = nn.ReLU()\n self.net = nn.LayerList(list(self.convs.values()))\n def forward(self, input_features):\n last_features = [f[-1] for f in input_features]\n cat_features = [\n self.relu(self.convs[\"squeeze\"](f)) for f in last_features\n ]\n cat_features = paddle.concat(cat_features, 1)\n out = cat_features\n for i in range(3):\n out = self.convs[(\"pose\", i)](out)\n if i != 2:\n out = self.relu(out)\n out = out.mean(3).mean(2)\n out = 0.01 * out.reshape([-1, self.num_frames_to_predict_for, 1, 6])\n axisangle = out[..., :3]\n translation = out[..., 3:]\n return axisangle, translation\nclass ResnetEncoder(nn.Layer):\n \"\"\"Pypaddle module for a resnet encoder\n \"\"\"\n def __init__(self, num_layers, pretrained=False, num_input_images=1):\n super(ResnetEncoder, self).__init__()\n self.num_ch_enc = np.array([64, 64, 128, 256, 512])\n resnets = {\n 18: paddle.vision.models.resnet18,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:688-725" + }, + "6231": { + "file_id": 488, + "content": "This chunk implements the PoseDecoder forward pass, which squeezes and concatenates the last feature map of each input, runs the result through the pose convolutions with ReLU activations, and averages it into axis-angle and translation outputs. 
The code also includes a class \"ResnetEncoder\" which is a Pypaddle implementation of a ResNet encoder.", + "type": "comment" + }, + "6232": { + "file_id": 488, + "content": " 34: paddle.vision.models.resnet34,\n 50: paddle.vision.models.resnet50,\n 101: paddle.vision.models.resnet101,\n 152: paddle.vision.models.resnet152\n }\n if num_layers not in resnets:\n raise ValueError(\n \"{} is not a valid number of resnet layers\".format(num_layers))\n if num_input_images > 1:\n self.encoder = resnet_multiimage_input(num_layers, pretrained,\n num_input_images)\n else:\n self.encoder = resnets[num_layers](pretrained)\n if num_layers > 34:\n self.num_ch_enc[1:] *= 4\n ######################################\n # night public first conv\n ######################################\n self.conv1 = nn.Conv2D(3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:726-753" + }, + "6233": { + "file_id": 488, + "content": "The code defines a function that creates a ResNet backbone model with specified layers and checks if the input has multiple images. It uses pretrained weights, adds a convolutional layer to the output of the ResNet, and scales certain channels based on the number of layers.", + "type": "comment" + }, + "6234": { + "file_id": 488, + "content": " self.bn1 = nn.BatchNorm2D(64)\n self.relu = nn.ReLU() # NOTE\n self.conv_shared = nn.Conv2D(512, 64, kernel_size=1)\n ##########################################\n # private source encoder, day\n ##########################################\n self.encoder_day = resnets[num_layers](pretrained)\n self.conv_diff_day = nn.Conv2D(\n 512, 64, kernel_size=1) # no bn after conv, so bias=true\n ##########################################\n # private target encoder, night\n ##########################################\n self.encoder_night = resnets[num_layers](pretrained)\n self.conv_diff_night = nn.Conv2D(512, 64, kernel_size=1)\n ######################################\n # shared decoder (small decoder), use a simple de-conv to upsample the features with no skip connection\n ######################################\n self.convt5 = convt_bn_relu(in_channels=512,\n out_channels=256,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:754-776" + }, + "6235": { + "file_id": 488, + "content": "This code initializes a network backbone with shared and private encoders for day and night, as well as a shared decoder. It uses BatchNorm2D, ReLU activation, Conv2D layers, and sets up convolutional blocks for the encoders and decoder.", + "type": "comment" + }, + "6236": { + "file_id": 488, + "content": " kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convt4 = convt_bn_relu(in_channels=256,\n out_channels=128,\n kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convt3 = convt_bn_relu(in_channels=128,\n out_channels=64,\n kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convt2 = convt_bn_relu(in_channels=64,\n out_channels=64,\n kernel_size=3,\n stride=2,\n padding=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:777-797" + }, + "6237": { + "file_id": 488, + "content": "This code defines a series of convolutional layers with batch normalization and ReLU activation functions. The layers have different numbers of input and output channels, as well as identical kernel sizes, strides, padding, and output padding values. 
These layers likely form part of a deep learning model for image processing or analysis tasks.", + "type": "comment" + }, + "6238": { + "file_id": 488, + "content": " output_padding=1)\n self.convt1 = convt_bn_relu(in_channels=64,\n out_channels=64,\n kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convtf = nn.Conv2D(64, 3, kernel_size=1, stride=1, padding=0)\n def forward(self, input_image, is_night):\n if self.training:\n result = []\n input_data = (input_image - 0.45) / 0.225\n if is_night == 'day':\n # source private encoder, day\n private_feature = self.encoder_day.conv1(input_data)\n private_feature = self.encoder_day.bn1(private_feature)\n private_feature = self.encoder_day.relu(private_feature)\n private_feature = self.encoder_day.maxpool(private_feature)\n private_feature = self.encoder_day.layer1(private_feature)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:798-817" + }, + "6239": { + "file_id": 488, + "content": "The code defines a class with an initializer for two ConvT blocks and a convolutional layer. The forward function is used for training, where it subtracts 0.45 and divides by 0.225 from the input image to normalize it, and if the 'is_night' parameter is 'day', it passes this normalized image through the day encoder blocks of the model.", + "type": "comment" + }, + "6240": { + "file_id": 488, + "content": " private_feature = self.encoder_day.layer2(private_feature)\n private_feature = self.encoder_day.layer3(private_feature)\n private_feature = self.encoder_day.layer4(private_feature)\n private_code = self.conv_diff_day(private_feature)\n private_gram = gram_matrix(private_feature)\n result.append(private_code)\n result.append(private_gram)\n elif is_night == 'night':\n # target private encoder, night\n private_feature = self.encoder_night.conv1(input_data)\n private_feature = self.encoder_night.bn1(private_feature)\n private_feature = self.encoder_night.relu(private_feature)\n private_feature = self.encoder_night.maxpool(private_feature)\n private_feature = self.encoder_night.layer1(private_feature)\n private_feature = self.encoder_night.layer2(private_feature)\n private_feature = self.encoder_night.layer3(private_feature)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:818-834" + }, + "6241": { + "file_id": 488, + "content": "The code is processing the input data through a day or night specific encoder, applying convolutions, batch normalization, ReLU activation, and max pooling. 
It then appends the resulting private code and gram matrix to the 'result' list.", + "type": "comment" + }, + "6242": { + "file_id": 488, + "content": " private_feature = self.encoder_night.layer4(private_feature)\n private_code = self.conv_diff_night(private_feature)\n private_gram = gram_matrix(private_feature)\n result.append(private_code)\n result.append(private_gram)\n # shared encoder\n self.features = []\n x = (input_image - 0.45) / 0.225\n if is_night == 'day':\n x = self.encoder.conv1(x)\n x = self.encoder.bn1(x)\n self.features.append(self.encoder.relu(x))\n else:\n x = self.conv1(x)\n x = self.bn1(x)\n self.features.append(self.relu(x))\n self.features.append(\n self.encoder.layer1(self.encoder.maxpool(self.features[-1])))\n self.features.append(self.encoder.layer2(self.features[-1]))\n self.features.append(self.encoder.layer3(self.features[-1]))\n self.features.append(self.encoder.layer4(self.features[-1]))\n if self.training:\n shared_code = self.conv_shared(self.features[-1])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:835-861" + }, + "6243": { + "file_id": 488, + "content": "This code defines a model with two branches: one for day and one for night. It extracts features from the input image, applies different layers depending on whether it's day or night, and appends them to a list of features. Finally, it calculates a shared code for training using the last feature extracted.", + "type": "comment" + }, + "6244": { + "file_id": 488, + "content": " shared_gram = gram_matrix(self.features[-1])\n result.append(shared_code) # use this to calculate loss of diff\n result.append(shared_gram)\n result.append(\n self.features[-1]) # use this to calculate loss of similarity\n union_code = private_feature + self.features[-1]\n rec_code = self.convt5(union_code)\n rec_code = self.convt4(rec_code)\n rec_code = self.convt3(rec_code)\n rec_code = self.convt2(rec_code)\n rec_code = self.convt1(rec_code)\n rec_code = self.convtf(rec_code)\n result.append(rec_code)\n return self.features, result\n else:\n return self.features\nclass ResnetEncoder_pose(nn.Layer):\n \"\"\"Pypaddle module for a resnet encoder\n \"\"\"\n def __init__(self, num_layers, pretrained=False, num_input_images=1):\n super(ResnetEncoder_pose, self).__init__()\n self.num_ch_enc = np.array([64, 64, 128, 256, 512])\n resnets = {", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:862-889" + }, + "6245": { + "file_id": 488, + "content": "This chunk finishes the ResnetEncoder forward pass and then begins the ResnetEncoder_pose class, a PaddlePaddle module for a ResNet pose encoder that takes the number of layers and whether pre-trained weights are used. In the training branch, the shared code, its gram matrix, and the last feature are appended to the result list, and the sum of the private and shared features is decoded through the transposed-convolution layers (convt5 down to convt1 and convtf). If the model is not in training mode, the forward method returns only the features. 
Otherwise, it appends the processed feature maps to a result list and returns the features and result.", + "type": "comment" + }, + "6246": { + "file_id": 488, + "content": " 18: paddle.vision.models.resnet18,\n 34: paddle.vision.models.resnet34,\n 50: paddle.vision.models.resnet50,\n 101: paddle.vision.models.resnet101,\n 152: paddle.vision.models.resnet152\n }\n if num_layers not in resnets:\n raise ValueError(\n \"{} is not a valid number of resnet layers\".format(num_layers))\n if num_input_images > 1:\n self.encoder = resnet_multiimage_input(num_layers, num_input_images)\n else:\n self.encoder = resnets[num_layers](pretrained)\n if num_layers > 34:\n self.num_ch_enc[1:] *= 4\n def forward(self, input_image):\n features = []\n x = (input_image - 0.45) / 0.225\n x = self.encoder.conv1(x)\n x = self.encoder.bn1(x)\n features.append(self.encoder.relu(x))\n features.append(self.encoder.layer1(self.encoder.maxpool(features[-1])))\n features.append(self.encoder.layer2(features[-1]))\n features.append(self.encoder.layer3(features[-1]))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:890-917" + }, + "6247": { + "file_id": 488, + "content": "This code defines a ResNet backbone model with different layers (18, 34, 50, 101, 152) and handles multi-image input cases. The encoder is initialized based on the specified number of layers, and adjusts the number of channels for layers larger than 34. The forward function extracts features from an input image through a series of ResNet layers.", + "type": "comment" + }, + "6248": { + "file_id": 488, + "content": " features.append(self.encoder.layer4(features[-1]))\n return features\n@BACKBONES.register()\nclass ADDS_DepthNet(nn.Layer):\n def __init__(self,\n num_layers=18,\n frame_ids=[0, -1, 1],\n height=256,\n width=512,\n batch_size=6,\n pose_model_input=\"pairs\",\n use_stereo=False,\n only_depth_encoder=False,\n pretrained=None,\n scales=[0, 1, 2, 3],\n min_depth=0.1,\n max_depth=100.0,\n pose_model_type='separate_resnet',\n v1_multiscale=False,\n predictive_mask=False,\n disable_automasking=False):\n super(ADDS_DepthNet, self).__init__()\n self.num_layers = num_layers\n self.height = height\n self.width = width\n self.batch_size = batch_size\n self.frame_ids = frame_ids\n self.pose_model_input = pose_model_input\n self.use_stereo = use_stereo", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:918-949" + }, + "6249": { + "file_id": 488, + "content": "This code defines the class `ADDS_DepthNet`, which is a depth estimation network, with parameters such as number of layers, frame IDs, input size, batch size, etc. It inherits from `nn.Layer` and has methods to encode poses and features. 
The class also registers itself at `BACKBONES`.", + "type": "comment" + }, + "6250": { + "file_id": 488, + "content": " self.only_depth_encoder = only_depth_encoder\n self.pretrained = pretrained\n self.scales = scales\n self.pose_model_type = pose_model_type\n self.predictive_mask = predictive_mask\n self.disable_automasking = disable_automasking\n self.v1_multiscale = v1_multiscale\n self.min_depth = min_depth\n self.max_depth = max_depth\n self.num_input_frames = len(self.frame_ids)\n self.num_pose_frames = 2 if self.pose_model_input == \"pairs\" else self.num_input_frames\n assert self.frame_ids[0] == 0, \"frame_ids must start with 0\"\n self.use_pose_net = not (self.use_stereo and self.frame_ids == [0])\n self.encoder = ResnetEncoder(self.num_layers)\n if not self.only_depth_encoder:\n self.depth = DepthDecoder(self.encoder.num_ch_enc, self.scales)\n if self.use_pose_net and not self.only_depth_encoder:\n if self.pose_model_type == \"separate_resnet\":\n self.pose_encoder = ResnetEncoder_pose(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:950-972" + }, + "6251": { + "file_id": 488, + "content": "The code initializes the model parameters and instances, including whether to only use the depth encoder (only_depth_encoder), if pre-trained weights are used (pretrained), and the scales for the depth decoding (scales). It also determines the number of input frames needed for both depth and pose prediction based on the provided inputs. The code creates instances of DepthDecoder, ResnetEncoder, and ResnetEncoder_pose depending on the model configuration.", + "type": "comment" + }, + "6252": { + "file_id": 488, + "content": " self.num_layers, num_input_images=self.num_pose_frames)\n self.pose = PoseDecoder(self.pose_encoder.num_ch_enc,\n num_input_features=1,\n num_frames_to_predict_for=2)\n self.backproject_depth = {}\n self.project_3d = {}\n for scale in self.scales:\n h = self.height // (2**scale)\n w = self.width // (2**scale)\n self.backproject_depth[scale] = BackprojectDepth(\n self.batch_size, h, w)\n self.project_3d[scale] = Project3D(batch_size, h, w)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n for m in self.sublayers(include_self=True):\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight, a=math.sqrt(5))\n if m.bias is not None:\n fan_in, _ = _calculate_fan_in_and_fan_out(m.weight)\n bound = 1 / math.sqrt(fan_in)\n uniform_ = paddle.nn.initializer.Uniform(-bound, bound)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:973-996" + }, + "6253": { + "file_id": 488, + "content": "The code initializes a backbone model by defining its layers and scales, then initializing the weights of convolutional layers using Kaiming normalization and uniform initialization for bias. 
This backbone model is designed for handling pose estimation tasks.", + "type": "comment" + }, + "6254": { + "file_id": 488, + "content": " uniform_(m.bias)\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if self.pretrained: # load pretrained weights\n load_ckpt(self, self.pretrained)\n def forward(self, inputs, day_or_night='day'):\n if self.training:\n features, result = self.encoder(inputs[\"color_aug\", 0, 0], 'day')\n features_night, result_night = self.encoder(\n inputs[(\"color_n_aug\", 0, 0)], 'night')\n outputs = self.depth(features)\n outputs_night = self.depth(features_night)\n if self.use_pose_net and not self.only_depth_encoder:\n outputs.update(self.predict_poses(inputs, 'day'))\n outputs_night.update(self.predict_poses(inputs, 'night'))\n self.generate_images_pred(inputs, outputs, 'day')\n self.generate_images_pred(inputs, outputs_night, 'night')\n outputs['frame_ids'] = self.frame_ids\n outputs['scales'] = self.scales\n outputs['result'] = result", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:997-1019" + }, + "6255": { + "file_id": 488, + "content": "This code defines a forward function for a backbone model. It applies the encoder to inputs and uses the depth module to extract features. If pose prediction is enabled, it adds poses to the output dictionary, generates images, and stores frame IDs and scales in the outputs dictionary. This function handles both day and night scenarios.", + "type": "comment" + }, + "6256": { + "file_id": 488, + "content": " outputs['result_night'] = result_night\n outputs_night['frame_ids'] = self.frame_ids\n outputs_night['scales'] = self.scales\n outputs['outputs_night'] = outputs_night\n else:\n if isinstance(inputs, dict):\n input_color = inputs[(\"color\", 0, 0)]\n features = self.encoder(input_color, day_or_night[0])\n outputs = self.depth(features)\n pred_disp, _ = disp_to_depth(outputs[(\"disp\", 0)],\n self.min_depth, self.max_depth)\n pred_disp = pred_disp[:, 0].numpy()\n outputs['pred_disp'] = np.squeeze(pred_disp)\n outputs['gt'] = np.squeeze(inputs['depth_gt'].numpy())\n else:\n input_color = inputs\n features = self.encoder(input_color, day_or_night)\n outputs = self.depth(features)\n pred_disp, _ = disp_to_depth(outputs[(\"disp\", 0)],\n self.min_depth, self.max_depth)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:1020-1044" + }, + "6257": { + "file_id": 488, + "content": "This code handles both dictionary and non-dictionary inputs for a model. If the input is a dictionary, it selects the 'color' input and processes accordingly. It uses an encoder to extract features from the input, then passes those features through a depth function to get predictions. 
The predictions are converted to depth format, and the final outputs include pred_disp and gt (ground truth) for further processing.", + "type": "comment" + }, + "6258": { + "file_id": 488, + "content": " pred_disp = pred_disp[:, 0]\n outputs = paddle.squeeze(pred_disp)\n return outputs\n def predict_poses(self, inputs, is_night):\n \"\"\"Predict poses between input frames for monocular sequences.\n \"\"\"\n outputs = {}\n if self.num_pose_frames == 2:\n if is_night:\n pose_feats = {\n f_i: inputs[\"color_n_aug\", f_i, 0]\n for f_i in self.frame_ids\n }\n else:\n pose_feats = {\n f_i: inputs[\"color_aug\", f_i, 0]\n for f_i in self.frame_ids\n }\n for f_i in self.frame_ids[1:]:\n if f_i != \"s\":\n if f_i < 0:\n pose_inputs = [pose_feats[f_i], pose_feats[0]]\n else:\n pose_inputs = [pose_feats[0], pose_feats[f_i]]\n if self.pose_model_type == \"separate_resnet\":\n pose_inputs = [", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:1046-1074" + }, + "6259": { + "file_id": 488, + "content": "This code is defining a function to predict poses between input frames for monocular sequences. It takes inputs as parameters and checks if the number of pose frames is 2. If so, it applies different treatments based on whether it's night or day. For night, it uses color_n_aug; for day, it uses color_aug. Then, it iterates through the frame IDs, excluding 's', and prepares inputs accordingly. The pose model type is \"separate_resnet\".", + "type": "comment" + }, + "6260": { + "file_id": 488, + "content": " self.pose_encoder(paddle.concat(pose_inputs,\n axis=1))\n ]\n axisangle, translation = self.pose(pose_inputs)\n outputs[(\"axisangle\", 0, f_i)] = axisangle\n outputs[(\"translation\", 0, f_i)] = translation\n # Invert the matrix if the frame id is negative\n outputs[(\"cam_T_cam\", 0,\n f_i)] = transformation_from_parameters(\n axisangle[:, 0],\n translation[:, 0],\n invert=(f_i < 0))\n return outputs\n def generate_images_pred(self, inputs, outputs, is_night):\n \"\"\"Generate the warped (reprojected) color images for a minibatch.\n Generated images are saved into the `outputs` dictionary.\n \"\"\"\n _, _, height, width = inputs['color', 0, 0].shape\n for scale in self.scales:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:1075-1096" + }, + "6261": { + "file_id": 488, + "content": "This code segment defines a function that calculates pose, axisangle, translation, and camera transformation parameters for an image. It takes input from the \"pose_encoder\" function, combines them, and assigns the results to specific positions in the \"outputs\" dictionary. If the frame ID is negative, it inverts the calculated matrix. 
The code also initializes a nested loop over different scales and generates warped color images for a given batch of inputs.", + "type": "comment" + }, + "6262": { + "file_id": 488, + "content": " disp = outputs[(\"disp\", scale)]\n if self.v1_multiscale:\n source_scale = scale\n else:\n disp = F.interpolate(disp, [height, width],\n mode=\"bilinear\",\n align_corners=False)\n source_scale = 0\n _, depth = disp_to_depth(disp, self.min_depth, self.max_depth)\n outputs[(\"depth\", 0, scale)] = depth\n for i, frame_id in enumerate(self.frame_ids[1:]):\n T = outputs[(\"cam_T_cam\", 0, frame_id)]\n cam_points = self.backproject_depth[source_scale](\n depth, inputs[(\"inv_K\", source_scale)])\n pix_coords = self.project_3d[source_scale](\n cam_points, inputs[(\"K\", source_scale)], T)\n outputs[(\"sample\", frame_id, scale)] = pix_coords\n if is_night:\n inputs[(\"color_n\", frame_id,\n source_scale)].stop_gradient = False", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:1097-1122" + }, + "6263": { + "file_id": 488, + "content": "The code selects the disparity output for the current scale, and if multiscale is not enabled, it performs bilinear interpolation to match the input size. It then converts the disparity into depth using the disp_to_depth function. Depth and its corresponding scale are added to the outputs. For each frame ID in the list, it retrieves the camera transformation matrix T, backprojects depth to 3D coordinates, projects them onto the image plane using project_3d, and adds the resulting pixel coordinates to the outputs. If is_night is True, it sets the color_n input's stop_gradient attribute to False.", + "type": "comment" + }, + "6264": { + "file_id": 488, + "content": " outputs[(\"color\", frame_id,\n scale)] = paddle.nn.functional.grid_sample(\n inputs[(\"color_n\", frame_id, source_scale)],\n outputs[(\"sample\", frame_id, scale)],\n padding_mode=\"border\",\n align_corners=False)\n else:\n inputs[(\"color\", frame_id,\n source_scale)].stop_gradient = False\n outputs[(\"color\", frame_id,\n scale)] = paddle.nn.functional.grid_sample(\n inputs[(\"color\", frame_id, source_scale)],\n outputs[(\"sample\", frame_id, scale)],\n padding_mode=\"border\",\n align_corners=False)\n if not self.disable_automasking:\n if is_night:\n outputs[(\"color_identity\", frame_id, scale)] = \\", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:1123-1142" + }, + "6265": { + "file_id": 488, + "content": "This code performs grid sampling on the source image and assigns the warped result to a specific location in the outputs dictionary based on frame_id and scale. When automasking is enabled (disable_automasking is False), it also stores the unwarped source image as a color_identity output, using the color_n input for night frames and the color input otherwise.", + "type": "comment" + }, + "6266": { + "file_id": 488, + "content": " inputs[(\"color_n\", frame_id, source_scale)]\n else:\n outputs[(\"color_identity\", frame_id, scale)] = \\\n inputs[(\"color\", frame_id, source_scale)]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/adds.py:1143-1146" + }, + "6267": { + "file_id": 488, + "content": "This code selects which unwarped input to keep as the identity image: for night frames it assigns the \"color_n\" input to \"color_identity\". 
Otherwise, it assigns the value of \"color\" input to \"color_identity\".", + "type": "comment" + }, + "6268": { + "file_id": 489, + "content": "/paddlevideo/modeling/backbones/agcn.py", + "type": "filepath" + }, + "6269": { + "file_id": 489, + "content": "The code defines a PaddlePaddle GCN class with convolutional blocks for temporal sequences, utilizing layers such as batch normalization and residual connections. It also presents a custom AGCN backbone model for graph convolution tasks using adaptive graph convolutions.", + "type": "summary" + }, + "6270": { + "file_id": 489, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nclass GCN(nn.Layer):\n def __init__(self, in_channels, out_channels, vertex_nums=25, stride=1):\n super(GCN, self).__init__()\n self.conv1 = nn.Conv2D(in_channels=in_channels,\n out_channels=3 * out_channels,\n kernel_size=1,\n stride=1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn.py:1-27" + }, + "6271": { + "file_id": 489, + "content": "The code is defining a GCN (Graph Convolutional Network) class within the PaddlePaddle framework. It takes in channel dimensions, output channel dimensions, vertex numbers and stride as parameters for its constructor. The class has one convolution layer with kernel size of 1 and stride of 1.", + "type": "comment" + }, + "6272": { + "file_id": 489, + "content": " self.conv2 = nn.Conv2D(in_channels=vertex_nums * 3,\n out_channels=vertex_nums,\n kernel_size=1)\n def forward(self, x):\n # x --- N,C,T,V\n x = self.conv1(x) # N,3C,T,V\n N, C, T, V = x.shape\n x = paddle.reshape(x, [N, C // 3, 3, T, V]) # N,C,3,T,V\n x = paddle.transpose(x, perm=[0, 1, 2, 4, 3]) # N,C,3,V,T\n x = paddle.reshape(x, [N, C // 3, 3 * V, T]) # N,C,3V,T\n x = paddle.transpose(x, perm=[0, 2, 1, 3]) # N,3V,C,T\n x = self.conv2(x) # N,V,C,T\n x = paddle.transpose(x, perm=[0, 2, 3, 1]) # N,C,T,V\n return x\nclass Block(paddle.nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n vertex_nums=25,\n temporal_size=9,\n stride=1,\n residual=True):\n super(Block, self).__init__()\n self.residual = residual\n self.out_channels = out_channels\n self.bn_res = nn.BatchNorm2D(out_channels)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn.py:28-57" + }, + "6273": { + "file_id": 489, + "content": "The code defines a convolutional block for processing temporal sequences with 3D spatial-temporal convolutions. The block applies multiple convolution layers, batch normalization, and transposes the dimensions to perform feature extraction from the input sequence. 
It is parameterized by the number of channels, output channels, vertex numbers, temporal size, and a flag for residual connections.", + "type": "comment" + }, + "6274": { + "file_id": 489, + "content": " self.conv_res = nn.Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n stride=(stride, 1))\n self.gcn = GCN(in_channels=in_channels,\n out_channels=out_channels,\n vertex_nums=vertex_nums)\n self.tcn = nn.Sequential(\n nn.BatchNorm2D(out_channels),\n nn.ReLU(),\n nn.Conv2D(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=(temporal_size, 1),\n padding=((temporal_size - 1) // 2, 0),\n stride=(stride, 1)),\n nn.BatchNorm2D(out_channels),\n )\n def forward(self, x):\n if self.residual:\n y = self.conv_res(x)\n y = self.bn_res(y)\n x = self.gcn(x)\n x = self.tcn(x)\n out = x + y if self.residual else x\n out = F.relu(out)\n return out", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn.py:58-84" + }, + "6275": { + "file_id": 489, + "content": "This code initializes a convolutional residual block with a Graph Convolutional Network (GCN) and Temporal Convolutional Network (TCN). The conv_res is a 1x1 convolution, gcn is a GCN layer, and tcn is a TCN layer. In the forward pass, if residual is True, the input goes through the conv_res layer before being passed to the gcn layer, then the tcn layer. The output is either the sum of the input and the residual (if residual is True) or just the output of the GCN layer, which is then passed through a ReLU activation function.", + "type": "comment" + }, + "6276": { + "file_id": 489, + "content": "@BACKBONES.register()\nclass AGCN(nn.Layer):\n \"\"\"\n AGCN model improves the performance of ST-GCN using\n Adaptive Graph Convolutional Networks.\n Args:\n in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2.\n \"\"\"\n def __init__(self, in_channels=2, **kwargs):\n super(AGCN, self).__init__()\n self.data_bn = nn.BatchNorm1D(25 * 2)\n self.agcn = nn.Sequential(\n Block(in_channels=in_channels,\n out_channels=64,\n residual=False,\n **kwargs), Block(in_channels=64, out_channels=64, **kwargs),\n Block(in_channels=64, out_channels=64, **kwargs),\n Block(in_channels=64, out_channels=64, **kwargs),\n Block(in_channels=64, out_channels=128, stride=2, **kwargs),\n Block(in_channels=128, out_channels=128, **kwargs),\n Block(in_channels=128, out_channels=128, **kwargs),\n Block(in_channels=128, out_channels=256, stride=2, **kwargs),\n Block(in_channels=256, out_channels=256, **kwargs),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn.py:87-110" + }, + "6277": { + "file_id": 489, + "content": "The code defines a class AGCN (Adaptive Graph Convolutional Network) as a subclass of nn.Layer, which is an improved version of ST-GCN for graph convolution tasks using adaptive graph convolutions. 
The model architecture consists of several Block layers with varying in_channels and out_channels, and downsampling is performed with stride=2.", + "type": "comment" + }, + "6278": { + "file_id": 489, + "content": " Block(in_channels=256, out_channels=256, **kwargs))\n self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1))\n def forward(self, x):\n # data normalization\n N, C, T, V, M = x.shape\n x = x.transpose((0, 4, 1, 2, 3)) # N, M, C, T, V\n x = x.reshape((N * M, C, T, V))\n x = self.agcn(x)\n x = self.pool(x) # NM,C,T,V --> NM,C,1,1\n C = x.shape[1]\n x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn.py:111-128" + }, + "6279": { + "file_id": 489, + "content": "This code defines a custom backbone for AGCN model with a block of 256 in_channels and out_channels, followed by an adaptive average pooling layer. The forward function performs data normalization, transposes the shape, reshapes it, applies the AGCN layer, pools it to size (1,1), and finally reshapes and averages along one axis before returning the result.", + "type": "comment" + }, + "6280": { + "file_id": 490, + "content": "/paddlevideo/modeling/backbones/agcn2s.py", + "type": "filepath" + }, + "6281": { + "file_id": 490, + "content": "The code implements temporal convolutional networks and GCN units in PaddlePaddle, creating a Graph class and AGCN2s graph convolution layer for the NTURGB+D dataset. This involves initializing variables, obtaining adjacency matrix, normalization, and executing convolutions.", + "type": "summary" + }, + "6282": { + "file_id": 490, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\nfrom ..registry import BACKBONES\ndef import_class(name):\n components = name.split('.')\n mod = __import__(components[0])\n for comp in components[1:]:\n mod = getattr(mod, comp)\n return mod\nclass UnitTCN(nn.Layer):\n def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):\n super(UnitTCN, self).__init__()\n pad = int((kernel_size - 1) / 2)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:1-32" + }, + "6283": { + "file_id": 490, + "content": "This code defines a class named \"UnitTCN\" which is a type of layer for temporal convolutional network. It's implemented using PaddlePaddle library and includes methods to define the convolutional layers with specified number of input and output channels, kernel size and stride. 
The UnitTCN layer itself is not registered; registration in the BACKBONES registry of the PaddleVideo module is applied later in this file to the AGCN2s backbone that builds on it.", + "type": "comment" + }, + "6284": { + "file_id": 490, + "content": " self.conv = nn.Conv2D(in_channels,\n out_channels,\n kernel_size=(kernel_size, 1),\n padding=(pad, 0),\n stride=(stride, 1))\n self.bn = nn.BatchNorm2D(out_channels)\n self.relu = nn.ReLU()\n def forward(self, x):\n \" input size : (N*M, C, T, V)\"\n x = self.bn(self.conv(x))\n return x\nclass UnitGCN(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n A,\n coff_embedding=4,\n num_subset=3):\n super(UnitGCN, self).__init__()\n inter_channels = out_channels // coff_embedding\n self.inter_c = inter_channels\n PA = self.create_parameter(shape=A.shape, dtype='float32')\n self.PA = PA\n self.A = paddle.to_tensor(A.astype(np.float32))\n self.num_subset = num_subset\n self.conv_a = nn.LayerList()\n self.conv_b = nn.LayerList()\n self.conv_d = nn.LayerList()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:33-65" + }, + "6285": { + "file_id": 490, + "content": "This code defines a GCN unit class with convolutional layers for learning spatio-temporal features. It uses batch normalization and ReLU activation, allowing the model to learn representations from the input data. The GCN unit takes in channels, output channels, adjacency matrix A, coefficient embedding, and number of subsets as parameters.", + "type": "comment" + }, + "6286": { + "file_id": 490, + "content": " for i in range(self.num_subset):\n self.conv_a.append(nn.Conv2D(in_channels, inter_channels, 1))\n self.conv_b.append(nn.Conv2D(in_channels, inter_channels, 1))\n self.conv_d.append(nn.Conv2D(in_channels, out_channels, 1))\n if in_channels != out_channels:\n self.down = nn.Sequential(nn.Conv2D(in_channels, out_channels, 1),\n nn.BatchNorm2D(out_channels))\n else:\n self.down = lambda x: x\n self.bn = nn.BatchNorm2D(out_channels)\n self.soft = nn.Softmax(-2)\n self.relu = nn.ReLU()\n def forward(self, x):\n N, C, T, V = x.shape\n A = self.A + self.PA\n y = None\n for i in range(self.num_subset):\n A1 = paddle.transpose(self.conv_a[i](x),\n perm=[0, 3, 1,\n 2]).reshape([N, V, self.inter_c * T])\n A2 = self.conv_b[i](x).reshape([N, self.inter_c * T, V])\n A1 = self.soft(paddle.matmul(A1, A2) / A1.shape[-1])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:66-91" + }, + "6287": { + "file_id": 490, + "content": "This code defines a neural network backbone for the AGCN2S model. It initializes and appends convolutional layers, checks if input and output channels are different to determine whether to add a downsampling layer, and defines softmax, batch normalization, and ReLU activation functions. 
The forward function performs operations on input data to produce the final output.", + "type": "comment" + }, + "6288": { + "file_id": 490, + "content": " A1 = A1 + A[i]\n A2 = x.reshape([N, C * T, V])\n z = self.conv_d[i](paddle.matmul(A2, A1).reshape([N, C, T, V]))\n y = z + y if y is not None else z\n y = self.bn(y)\n y += self.down(x)\n return self.relu(y)\nclass Block(nn.Layer):\n def __init__(self, in_channels, out_channels, A, stride=1, residual=True):\n super(Block, self).__init__()\n self.gcn1 = UnitGCN(in_channels, out_channels, A)\n self.tcn1 = UnitTCN(out_channels, out_channels, stride=stride)\n self.relu = nn.ReLU()\n if not residual:\n self.residual = lambda x: 0\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = lambda x: x\n else:\n self.residual = UnitTCN(in_channels,\n out_channels,\n kernel_size=1,\n stride=stride)\n def forward(self, x):\n x = self.tcn1(self.gcn1(x)) + self.residual(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:92-121" + }, + "6289": { + "file_id": 490, + "content": "The code defines a block class for a neural network architecture. It consists of GCN and TCN units in series, followed by a ReLU activation function. The residual connection is either set to zero or equal to the input if not specified, allowing for identity shortcuts within the network. The forward method combines the outputs from GCN and TCN with residual connections.", + "type": "comment" + }, + "6290": { + "file_id": 490, + "content": " return self.relu(x)\n# This Graph structure is for the NTURGB+D dataset. If you use a custom dataset, modify num_node and the corresponding graph adjacency structure.\nclass Graph:\n def __init__(self, labeling_mode='spatial'):\n num_node = 25\n self_link = [(i, i) for i in range(num_node)]\n inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),\n (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),\n (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),\n (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),\n (23, 8), (24, 25), (25, 12)]\n inward = [(i - 1, j - 1) for (i, j) in inward_ori_index]\n outward = [(j, i) for (i, j) in inward]\n neighbor = inward + outward\n self.num_node = num_node\n self.self_link = self_link\n self.inward = inward\n self.outward = outward\n self.neighbor = neighbor\n self.A = self.get_adjacency_matrix(labeling_mode)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:122-144" + }, + "6291": { + "file_id": 490, + "content": "This code defines a Graph class with a fixed number of nodes (25) and connectivity patterns for the NTURGB+D dataset. It initializes self_link, inward, outward, and neighbor variables based on the specified labeling mode ('spatial' by default). 
The adjacency matrix is obtained using get_adjacency_matrix method.", + "type": "comment" + }, + "6292": { + "file_id": 490, + "content": " def edge2mat(self, link, num_node):\n A = np.zeros((num_node, num_node))\n for i, j in link:\n A[j, i] = 1\n return A\n def normalize_digraph(self, A):\n Dl = np.sum(A, 0)\n h, w = A.shape\n Dn = np.zeros((w, w))\n for i in range(w):\n if Dl[i] > 0:\n Dn[i, i] = Dl[i]**(-1)\n AD = np.dot(A, Dn)\n return AD\n def get_spatial_graph(self, num_node, self_link, inward, outward):\n I = self.edge2mat(self_link, num_node)\n In = self.normalize_digraph(self.edge2mat(inward, num_node))\n Out = self.normalize_digraph(self.edge2mat(outward, num_node))\n A = np.stack((I, In, Out))\n return A\n def get_adjacency_matrix(self, labeling_mode=None):\n if labeling_mode is None:\n return self.A\n if labeling_mode == 'spatial':\n A = self.get_spatial_graph(self.num_node, self.self_link,\n self.inward, self.outward)\n else:\n raise ValueError()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:146-176" + }, + "6293": { + "file_id": 490, + "content": "The code defines three functions: `edge2mat()`, `normalize_digraph()`, and `get_spatial_graph()`. `edge2mat()` converts a list of edges into an adjacency matrix. `normalize_digraph()` normalizes a directed graph by computing the in-degree for each node. `get_spatial_graph()` combines the adjacency matrices from self-links, incoming edges, and outgoing edges into one matrix. The last function `get_adjacency_matrix()` returns the adjacency matrix depending on the given labeling mode (default or spatial).", + "type": "comment" + }, + "6294": { + "file_id": 490, + "content": " return A\n@BACKBONES.register()\nclass AGCN2s(nn.Layer):\n def __init__(self,\n num_point=25,\n num_person=2,\n graph='ntu_rgb_d',\n graph_args=dict(),\n in_channels=3):\n super(AGCN2s, self).__init__()\n if graph == 'ntu_rgb_d':\n self.graph = Graph(**graph_args)\n else:\n raise ValueError()\n A = self.graph.A\n self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point)\n self.l1 = Block(in_channels, 64, A, residual=False)\n self.l2 = Block(64, 64, A)\n self.l3 = Block(64, 64, A)\n self.l4 = Block(64, 64, A)\n self.l5 = Block(64, 128, A, stride=2)\n self.l6 = Block(128, 128, A)\n self.l7 = Block(128, 128, A)\n self.l8 = Block(128, 256, A, stride=2)\n self.l9 = Block(256, 256, A)\n self.l10 = Block(256, 256, A)\n def forward(self, x):\n N, C, T, V, M = x.shape\n x = x.transpose([0, 4, 3, 1, 2]).reshape_([N, M * V * C, T])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:177-212" + }, + "6295": { + "file_id": 490, + "content": "Class AGCN2s defines a neural network layer for graph convolutions. It takes parameters such as number of points, persons, and the type of graph. The code initializes graph adjacency matrix 'A' from the specified graph and creates several Block layers for convolution operations with different parameters and strides. 
In forward pass, it rearranges the input tensor dimensions and reshapes it before performing graph convolutions.", + "type": "comment" + }, + "6296": { + "file_id": 490, + "content": " x = self.data_bn(x)\n x = x.reshape_([N, M, V, C,\n T]).transpose([0, 1, 3, 4,\n 2]).reshape_([N * M, C, T, V])\n x = self.l1(x)\n x = self.l2(x)\n x = self.l3(x)\n x = self.l4(x)\n x = self.l5(x)\n x = self.l6(x)\n x = self.l7(x)\n x = self.l8(x)\n x = self.l9(x)\n x = self.l10(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/agcn2s.py:213-229" + }, + "6297": { + "file_id": 490, + "content": "The code performs the following operations: \n1. Applies data normalization to x using self.data_bn.\n2. Reshapes x with dimensions N, M, V, C, and T to (N*M,C,T,V).\n3. Passes x through ten linear layers (l1 to l10) for transformation.\n4. Finally, returns the transformed x.", + "type": "comment" + }, + "6298": { + "file_id": 491, + "content": "/paddlevideo/modeling/backbones/asrf.py", + "type": "filepath" + }, + "6299": { + "file_id": 491, + "content": "The code imports libraries, registers a backbone model in PaddleVideo, initializes an ASRF class for computer vision tasks, and sets layer biases using init_bias function. The ASRF forward method performs convolution on input x and iterates through shared layers before returning the output.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/63.json b/docs/data/63.json new file mode 100644 index 000000000..dafc529d0 --- /dev/null +++ b/docs/data/63.json @@ -0,0 +1,547 @@ +{ + "6300": { + "file_id": 491, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# https://github.com/yabufarha/ms-tcn/blob/master/model.py\n# https://github.com/yiskw713/asrf/libs/models/tcn.py\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nimport copy\nimport random\nimport math\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom .ms_tcn import DilatedResidualLayer\nfrom ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch", + "type": "code", + "location": "/paddlevideo/modeling/backbones/asrf.py:1-30" + }, + "6301": { + "file_id": 491, + "content": "This code block is importing necessary libraries and modules, as well as registering a backbone model within the PaddleVideo framework. 
It also includes references to external repositories for inspiration or implementation guidance.", + "type": "comment" + }, + "6302": { + "file_id": 491, + "content": "@BACKBONES.register()\nclass ASRF(nn.Layer):\n def __init__(self, in_channel, num_features, num_classes, num_stages,\n num_layers):\n super().__init__()\n self.in_channel = in_channel\n self.num_features = num_features\n self.num_classes = num_classes\n self.num_stages = num_stages\n self.num_layers = num_layers\n # define layers\n self.conv_in = nn.Conv1D(self.in_channel, self.num_features, 1)\n shared_layers = [\n DilatedResidualLayer(2**i, self.num_features, self.num_features)\n for i in range(self.num_layers)\n ]\n self.shared_layers = nn.LayerList(shared_layers)\n self.init_weights()\n def init_weights(self):\n \"\"\"\n initialize model layers' weight\n \"\"\"\n # init weight\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv1D):\n layer.weight.set_value(\n KaimingUniform_like_torch(layer.weight).astype('float32'))\n if layer.bias is not None:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/asrf.py:33-65" + }, + "6303": { + "file_id": 491, + "content": "The ASRF class is a type of backbone model for computer vision tasks. It initializes convolutional layers and shared dilated residual layers, and sets their weights using KaimingUniform initialization. The number of features, stages, and layers are configurable parameters.", + "type": "comment" + }, + "6304": { + "file_id": 491, + "content": " layer.bias.set_value(\n init_bias(layer.weight, layer.bias).astype('float32'))\n def forward(self, x):\n \"\"\" ASRF forward\n \"\"\"\n out = self.conv_in(x)\n for layer in self.shared_layers:\n out = layer(out)\n return out", + "type": "code", + "location": "/paddlevideo/modeling/backbones/asrf.py:66-75" + }, + "6305": { + "file_id": 491, + "content": "This code sets the initial values of layer biases using init_bias function. The ASRF forward method performs convolution on input x, then iterates through shared layers to modify the output before returning it.", + "type": "comment" + }, + "6306": { + "file_id": 492, + "content": "/paddlevideo/modeling/backbones/bmn.py", + "type": "filepath" + }, + "6307": { + "file_id": 492, + "content": "This function creates mask matrices and BMN class in Paddle.ai, initializes 2D convolutional layers for the BMSN backbone, and defines a video analysis model with layers, activation functions, and returns processed input xp.", + "type": "summary" + }, + "6308": { + "file_id": 492, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport numpy as np\nimport paddle\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\ndef _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,\n num_sample_perbin):\n \"\"\" generate sample mask for a boundary-matching pair \"\"\"\n plen = float(seg_xmax - seg_xmin)\n plen_sample = plen / (num_sample * num_sample_perbin - 1.0)\n total_samples = [\n seg_xmin + plen_sample * ii", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:1-28" + }, + "6309": { + "file_id": 492, + "content": "This function generates a sample mask for a boundary-matching pair. It calculates the number of samples per bin and total samples based on segment bounds, total length, and desired numbers of samples.", + "type": "comment" + }, + "6310": { + "file_id": 492, + "content": " for ii in range(num_sample * num_sample_perbin)\n ]\n p_mask = []\n for idx in range(num_sample):\n bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *\n num_sample_perbin]\n bin_vector = np.zeros([tscale])\n for sample in bin_samples:\n sample_upper = math.ceil(sample)\n sample_decimal, sample_down = math.modf(sample)\n if (tscale - 1) >= int(sample_down) >= 0:\n bin_vector[int(sample_down)] += 1 - sample_decimal\n if (tscale - 1) >= int(sample_upper) >= 0:\n bin_vector[int(sample_upper)] += sample_decimal\n bin_vector = 1.0 / num_sample_perbin * bin_vector\n p_mask.append(bin_vector)\n p_mask = np.stack(p_mask, axis=1)\n return p_mask\ndef get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,\n num_sample_perbin):\n \"\"\" generate sample mask for each point in Boundary-Matching Map \"\"\"\n mask_mat = []\n for start_index in range(tscale):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:29-53" + }, + "6311": { + "file_id": 492, + "content": "This code generates sample masks for each point in a Boundary-Matching Map. It iterates through samples, creates binary vectors for each, and then scales them to obtain the final mask. 
The resulting masks are stored in an array and returned.", + "type": "comment" + }, + "6312": { + "file_id": 492, + "content": " mask_mat_vector = []\n for duration_index in range(dscale):\n if start_index + duration_index < tscale:\n p_xmin = start_index\n p_xmax = start_index + duration_index\n center_len = float(p_xmax - p_xmin) + 1\n sample_xmin = p_xmin - center_len * prop_boundary_ratio\n sample_xmax = p_xmax + center_len * prop_boundary_ratio\n p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,\n tscale, num_sample,\n num_sample_perbin)\n else:\n p_mask = np.zeros([tscale, num_sample])\n mask_mat_vector.append(p_mask)\n mask_mat_vector = np.stack(mask_mat_vector, axis=2)\n mask_mat.append(mask_mat_vector)\n mask_mat = np.stack(mask_mat, axis=3)\n mask_mat = mask_mat.astype(np.float32)\n sample_mask = np.reshape(mask_mat, [tscale, -1])\n return sample_mask\ndef init_params(name, in_channels, kernel_size):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:54-77" + }, + "6313": { + "file_id": 492, + "content": "This code generates mask matrices for video frames. It iterates over different duration scales and starts from a given start index. For each duration scale, it creates binary masks using interpolation. If the duration is smaller than the total time scale, it adjusts the sample range to include boundaries. Zero paddings are used if the duration exceeds the total time scale. The generated mask vectors are stacked together and reshaped for final output.", + "type": "comment" + }, + "6314": { + "file_id": 492, + "content": " fan_in = in_channels * kernel_size * 1\n k = 1. / math.sqrt(fan_in)\n param_attr = ParamAttr(name=name,\n initializer=paddle.nn.initializer.Uniform(low=-k,\n high=k))\n return param_attr\n@BACKBONES.register()\nclass BMN(paddle.nn.Layer):\n \"\"\"BMN model from\n `\"BMN: Boundary-Matching Network for Temporal Action Proposal Generation\" `_\n Args:\n tscale (int): sequence length, default 100.\n dscale (int): max duration length, default 100.\n prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5.\n num_sample (int): number of samples betweent starting boundary and ending boundary of each propoasl, default 32.\n num_sample_perbin (int): number of selected points in each sample, default 3.\n \"\"\"\n def __init__(\n self,\n tscale,\n dscale,\n prop_boundary_ratio,\n num_sample,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:78-103" + }, + "6315": { + "file_id": 492, + "content": "This code defines a BMN class as a Paddle.ai layer implementing the BMN model for temporal action proposal generation from the paper \"BMN: Boundary-Matching Network for Temporal Action Proposal Generation\". It has parameters tscale, dscale, prop_boundary_ratio, num_sample, and num_sample_perbin which determine the sequence length, max duration length, ratio of expanded temporal region in proposal boundary, number of samples between starting and ending boundaries of each proposal, and number of selected points in each sample respectively. 
The code also initializes a ParamAttr with Uniform initializer for weight initialization.", + "type": "comment" + }, + "6316": { + "file_id": 492, + "content": " num_sample_perbin,\n feat_dim=400,\n ):\n super(BMN, self).__init__()\n #init config\n self.feat_dim = feat_dim\n self.tscale = tscale\n self.dscale = dscale\n self.prop_boundary_ratio = prop_boundary_ratio\n self.num_sample = num_sample\n self.num_sample_perbin = num_sample_perbin\n self.hidden_dim_1d = 256\n self.hidden_dim_2d = 128\n self.hidden_dim_3d = 512\n # Base Module\n self.b_conv1 = paddle.nn.Conv1D(\n in_channels=self.feat_dim,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4,\n weight_attr=init_params('Base_1_w', self.feat_dim, 3),\n bias_attr=init_params('Base_1_b', self.feat_dim, 3))\n self.b_conv1_act = paddle.nn.ReLU()\n self.b_conv2 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:104-137" + }, + "6317": { + "file_id": 492, + "content": "This code defines the BMN class, which is a backbone model. It initializes parameters and includes convolutional layers with ReLU activation functions for feature extraction. The code also includes instance variables for controlling the model's behavior and dimensionality of the hidden states.", + "type": "comment" + }, + "6318": { + "file_id": 492, + "content": " weight_attr=init_params('Base_2_w', self.hidden_dim_1d, 3),\n bias_attr=init_params('Base_2_b', self.hidden_dim_1d, 3))\n self.b_conv2_act = paddle.nn.ReLU()\n # Temporal Evaluation Module\n self.ts_conv1 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4,\n weight_attr=init_params('TEM_s1_w', self.hidden_dim_1d, 3),\n bias_attr=init_params('TEM_s1_b', self.hidden_dim_1d, 3))\n self.ts_conv1_act = paddle.nn.ReLU()\n self.ts_conv2 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=1,\n kernel_size=1,\n padding=0,\n groups=1,\n weight_attr=init_params('TEM_s2_w', self.hidden_dim_1d, 1),\n bias_attr=init_params('TEM_s2_b', self.hidden_dim_1d, 1))\n self.ts_conv2_act = paddle.nn.Sigmoid()\n self.te_conv1 = paddle.nn.Conv1D(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:138-163" + }, + "6319": { + "file_id": 492, + "content": "This code defines a Conv1D block for the BMN model, including an input layer, a temporal evaluation module, and two convolutional layers with ReLU activation functions. 
The weights and biases are initialized using the 'init_params' function.", + "type": "comment" + }, + "6320": { + "file_id": 492, + "content": " in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4,\n weight_attr=init_params('TEM_e1_w', self.hidden_dim_1d, 3),\n bias_attr=init_params('TEM_e1_b', self.hidden_dim_1d, 3))\n self.te_conv1_act = paddle.nn.ReLU()\n self.te_conv2 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=1,\n kernel_size=1,\n padding=0,\n groups=1,\n weight_attr=init_params('TEM_e2_w', self.hidden_dim_1d, 1),\n bias_attr=init_params('TEM_e2_b', self.hidden_dim_1d, 1))\n self.te_conv2_act = paddle.nn.Sigmoid()\n #Proposal Evaluation Module\n self.p_conv1 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_2d,\n kernel_size=3,\n padding=1,\n groups=1,\n weight_attr=init_params('PEM_1d_w', self.hidden_dim_1d, 3),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:164-189" + }, + "6321": { + "file_id": 492, + "content": "This code initializes the TEM and PEM modules of a backbone network. It defines several convolutional layers with specific configurations for each module, followed by activation functions. The weight and bias attributes are initialized using the init_params function.", + "type": "comment" + }, + "6322": { + "file_id": 492, + "content": " bias_attr=init_params('PEM_1d_b', self.hidden_dim_1d, 3))\n self.p_conv1_act = paddle.nn.ReLU()\n # init to speed up\n sample_mask = get_interp1d_mask(self.tscale, self.dscale,\n self.prop_boundary_ratio,\n self.num_sample, self.num_sample_perbin)\n self.sample_mask = paddle.to_tensor(sample_mask)\n self.sample_mask.stop_gradient = True\n self.p_conv3d1 = paddle.nn.Conv3D(\n in_channels=128,\n out_channels=self.hidden_dim_3d,\n kernel_size=(self.num_sample, 1, 1),\n stride=(self.num_sample, 1, 1),\n padding=0,\n weight_attr=ParamAttr(name=\"PEM_3d1_w\"),\n bias_attr=ParamAttr(name=\"PEM_3d1_b\"))\n self.p_conv3d1_act = paddle.nn.ReLU()\n self.p_conv2d1 = paddle.nn.Conv2D(\n in_channels=512,\n out_channels=self.hidden_dim_2d,\n kernel_size=1,\n stride=1,\n padding=0,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:190-215" + }, + "6323": { + "file_id": 492, + "content": "This code initializes a backbone model for the BMN architecture. It includes convolutional layers, ReLU activations, and a tensor mask for sampling. 
The model uses 1D, 2D, and 3D convolutions with specific parameters, as well as applies bias attributes to the weights and biases of the convolutions.", + "type": "comment" + }, + "6324": { + "file_id": 492, + "content": " weight_attr=ParamAttr(name=\"PEM_2d1_w\"),\n bias_attr=ParamAttr(name=\"PEM_2d1_b\"))\n self.p_conv2d1_act = paddle.nn.ReLU()\n self.p_conv2d2 = paddle.nn.Conv2D(\n in_channels=128,\n out_channels=self.hidden_dim_2d,\n kernel_size=3,\n stride=1,\n padding=1,\n weight_attr=ParamAttr(name=\"PEM_2d2_w\"),\n bias_attr=ParamAttr(name=\"PEM_2d2_b\"))\n self.p_conv2d2_act = paddle.nn.ReLU()\n self.p_conv2d3 = paddle.nn.Conv2D(\n in_channels=128,\n out_channels=self.hidden_dim_2d,\n kernel_size=3,\n stride=1,\n padding=1,\n weight_attr=ParamAttr(name=\"PEM_2d3_w\"),\n bias_attr=ParamAttr(name=\"PEM_2d3_b\"))\n self.p_conv2d3_act = paddle.nn.ReLU()\n self.p_conv2d4 = paddle.nn.Conv2D(\n in_channels=128,\n out_channels=2,\n kernel_size=1,\n stride=1,\n padding=0,\n weight_attr=ParamAttr(name=\"PEM_2d4_w\"),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:216-246" + }, + "6325": { + "file_id": 492, + "content": "This code initializes a series of 2D convolutional layers with ReLU activation functions for the Batch Multi-Scale Network (BMSN) backbone in PaddleVideo. Each convolutional layer has a specified number of output channels, kernel size, and stride. The weights and biases for each layer are defined using ParamAttr.", + "type": "comment" + }, + "6326": { + "file_id": 492, + "content": " bias_attr=ParamAttr(name=\"PEM_2d4_b\"))\n self.p_conv2d4_act = paddle.nn.Sigmoid()\n def init_weights(self):\n pass\n def forward(self, x):\n #Base Module\n x = self.b_conv1(x)\n x = self.b_conv1_act(x)\n x = self.b_conv2(x)\n x = self.b_conv2_act(x)\n #TEM\n xs = self.ts_conv1(x)\n xs = self.ts_conv1_act(xs)\n xs = self.ts_conv2(xs)\n xs = self.ts_conv2_act(xs)\n xs = paddle.squeeze(xs, axis=[1])\n xe = self.te_conv1(x)\n xe = self.te_conv1_act(xe)\n xe = self.te_conv2(xe)\n xe = self.te_conv2_act(xe)\n xe = paddle.squeeze(xe, axis=[1])\n #PEM\n xp = self.p_conv1(x)\n xp = self.p_conv1_act(xp)\n #BM layer\n xp = paddle.matmul(xp, self.sample_mask)\n xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale])\n xp = self.p_conv3d1(xp)\n xp = self.p_conv3d1_act(xp)\n xp = paddle.squeeze(xp, axis=[2])\n xp = self.p_conv2d1(xp)\n xp = self.p_conv2d1_act(xp)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:247-283" + }, + "6327": { + "file_id": 492, + "content": "The code is defining a backbone model for video analysis. It consists of base, TEM (temporal-inspired module), PEM (position-inspired module), and BM (block-matching module) layers. The layers are sequentially applied to the input data with appropriate activation functions and reshaping operations in between. Finally, it performs matrix multiplication with a sample mask and applies additional convolutions and activations.", + "type": "comment" + }, + "6328": { + "file_id": 492, + "content": " xp = self.p_conv2d2(xp)\n xp = self.p_conv2d2_act(xp)\n xp = self.p_conv2d3(xp)\n xp = self.p_conv2d3_act(xp)\n xp = self.p_conv2d4(xp)\n xp = self.p_conv2d4_act(xp)\n return xp, xs, xe", + "type": "code", + "location": "/paddlevideo/modeling/backbones/bmn.py:284-290" + }, + "6329": { + "file_id": 492, + "content": "This code is part of a neural network backbone model. 
It applies multiple convolution layers with activation functions and returns the processed input xp, along with other variables xs and xe.", + "type": "comment" + }, + "6330": { + "file_id": 493, + "content": "/paddlevideo/modeling/backbones/cfbi.py", + "type": "filepath" + }, + "6331": { + "file_id": 493, + "content": "The code imports libraries and defines an FPN class with three layers, creates a backbone model using convolutional layers and GroupNorm. It also defines a \"CFBI\" class that utilizes DeepLab for feature extraction and FPN to combine multi-scale features, returning extracted features at 4x, 8x, 16x scales along with low-level features using a forward function.", + "type": "summary" + }, + "6332": { + "file_id": 493, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom .deeplab import DeepLab\nclass FPN(nn.Layer):\n \"\"\"FPN Layer\"\"\"\n def __init__(self, in_dim_4x, in_dim_8x, in_dim_16x, out_dim):\n super(FPN, self).__init__()\n self.toplayer = self._make_layer(in_dim_16x, out_dim)\n self.latlayer1 = self._make_layer(in_dim_8x, out_dim)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/cfbi.py:1-28" + }, + "6333": { + "file_id": 493, + "content": "This code imports necessary libraries and defines a class called FPN, which is an FPN layer in a neural network. It has three layers: toplayer, latlayer1, and latlayer2, each with specific input dimensions and output dimensions. The _make_layer function is used to create these layers.", + "type": "comment" + }, + "6334": { + "file_id": 493, + "content": " self.latlayer2 = self._make_layer(in_dim_4x, out_dim)\n self.smooth1 = self._make_layer(out_dim,\n out_dim,\n kernel_size=3,\n padding=1)\n self.smooth2 = self._make_layer(out_dim,\n out_dim,\n kernel_size=3,\n padding=1)\n def _make_layer(self, in_dim, out_dim, kernel_size=1, padding=0):\n return nn.Sequential(\n nn.Conv2D(in_dim,\n out_dim,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n bias_attr=False),\n nn.GroupNorm(num_groups=32, num_channels=out_dim))\n def forward(self, x_4x, x_8x, x_16x):\n \"\"\" forward function\"\"\"\n x_16x = self.toplayer(x_16x)\n x_8x = self.latlayer1(x_8x)\n x_4x = self.latlayer2(x_4x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/cfbi.py:29-54" + }, + "6335": { + "file_id": 493, + "content": "The code defines a backbone model with two convolutional layers followed by GroupNorm layer. 
The forward function applies the defined layers to input images of size 4x, 8x, and 16x.", + "type": "comment" + }, + "6336": { + "file_id": 493, + "content": " x_8x = x_8x + F.interpolate(\n x_16x, size=x_8x.shape[-2:], mode='bilinear', align_corners=True)\n x_4x = x_4x + F.interpolate(\n x_8x, size=x_4x.shape[-2:], mode='bilinear', align_corners=True)\n x_8x = self.smooth1(x_8x)\n x_4x = self.smooth2(x_4x)\n return F.relu(x_4x), F.relu(x_8x), F.relu(x_16x)\n@BACKBONES.register()\nclass CFBI(nn.Layer):\n \"\"\"CFBI plus backbone\"\"\"\n def __init__(self,\n backbone='resnet',\n freeze_bn=True,\n model_aspp_outdim=256,\n in_dim_8x=512,\n model_semantic_embedding_dim=256): #,epsilon=1e-05):\n super(CFBI, self).__init__()\n #self.epsilon = epsilon\n self.feature_extracter = DeepLab(backbone=backbone, freeze_bn=freeze_bn)\n self.fpn = FPN(in_dim_4x=model_aspp_outdim,\n in_dim_8x=in_dim_8x,\n in_dim_16x=model_aspp_outdim,\n out_dim=model_semantic_embedding_dim)\n def forward(self, x):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/cfbi.py:56-84" + }, + "6337": { + "file_id": 493, + "content": "This code defines a class \"CFBI\" which is a backbone model for feature extraction. It utilizes DeepLab as the feature extractor and FPN (Feature Pyramid Network) to combine features from different scales. The input image x is processed through the feature extracter and the output is passed through the fpn to obtain three outputs at 4x, 8x and 16x scales. These outputs are then interpolated and smoothed before being returned after applying ReLU activation.", + "type": "comment" + }, + "6338": { + "file_id": 493, + "content": " \"\"\"forward function\"\"\"\n x, aspp_x, low_level, mid_level = self.feature_extracter(x, True)\n x_4x, x_8x, x_16x = self.fpn(x, mid_level, aspp_x)\n return x_4x, x_8x, x_16x, low_level", + "type": "code", + "location": "/paddlevideo/modeling/backbones/cfbi.py:85-88" + }, + "6339": { + "file_id": 493, + "content": "This code defines a forward function that takes an input image and uses the feature_extracter and fpn modules to extract features at different scales. It returns the extracted features at 4x, 8x, and 16x scales along with the low-level features.", + "type": "comment" + }, + "6340": { + "file_id": 494, + "content": "/paddlevideo/modeling/backbones/ctrgcn.py", + "type": "filepath" + }, + "6341": { + "file_id": 494, + "content": "The code presents a CTRGCN backbone for video models, initializes a CTRGC model with batch normalization layers and NTUGraph class, defines a neural network model with TCN_GCN_unit, and includes a final layer 10 (l10) to process input and return output.", + "type": "summary" + }, + "6342": { + "file_id": 494, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\ndef conv_init(conv):\n if conv.weight is not None:\n weight_init_(conv.weight, 'kaiming_normal_', mode='fan_in')\n if conv.bias is not None:\n nn.initializer.Constant(value=0.0)(conv.bias)\ndef bn_init(bn, scale):\n nn.initializer.Constant(value=float(scale))(bn.weight)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:1-31" + }, + "6343": { + "file_id": 494, + "content": "This code imports necessary libraries, defines a convolution initialization function and a batch normalization initialization function. It also sets up scale values for the batch normalization function and registers backbone models in the registry.", + "type": "comment" + }, + "6344": { + "file_id": 494, + "content": " nn.initializer.Constant(value=0.0)(bn.bias)\ndef einsum(x1, x3):\n \"\"\"paddle.einsum only support in dynamic graph mode.\n x1 : n c u v\n x2 : n c t v\n \"\"\"\n n, c, u, v1 = x1.shape\n n, c, t, v3 = x3.shape\n assert (v1 == v3), \"Args of einsum not match!\"\n x1 = paddle.transpose(x1, perm=[0, 1, 3, 2]) # n c v u\n y = paddle.matmul(x3, x1)\n # out: n c t u\n return y\nclass CTRGC(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n rel_reduction=8,\n mid_reduction=1):\n super(CTRGC, self).__init__()\n self.in_channels = in_channels\n self.out_channels = out_channels\n if in_channels == 3 or in_channels == 9:\n self.rel_channels = 8\n self.mid_channels = 16\n else:\n self.rel_channels = in_channels // rel_reduction\n self.mid_channels = in_channels // mid_reduction\n self.conv1 = nn.Conv2D(self.in_channels,\n self.rel_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:32-66" + }, + "6345": { + "file_id": 494, + "content": "Defines a CTRGC class, a type of convolutional neural network layer. It has two reductions: rel_reduction (defaults to 8) and mid_reduction (defaults to 1). Depending on the input channels, it assigns different channel numbers for rel_channels (always 8 if in_channels is 3 or 9; otherwise based on rel_reduction). 
It also initializes a Conv2D layer with the assigned channel numbers.", + "type": "comment" + }, + "6346": { + "file_id": 494, + "content": " kernel_size=1)\n self.conv2 = nn.Conv2D(self.in_channels,\n self.rel_channels,\n kernel_size=1)\n self.conv3 = nn.Conv2D(self.in_channels,\n self.out_channels,\n kernel_size=1)\n self.conv4 = nn.Conv2D(self.rel_channels,\n self.out_channels,\n kernel_size=1)\n self.tanh = nn.Tanh()\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n conv_init(m)\n elif isinstance(m, nn.BatchNorm2D):\n bn_init(m, 1)\n def forward(self, x, A=None, alpha=1):\n x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean(-2), self.conv3(\n x)\n x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2))\n x1 = self.conv4(x1) * alpha + (\n A.unsqueeze(0).unsqueeze(0) if A is not None else 0) # N,C,V,V", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:67-93" + }, + "6347": { + "file_id": 494, + "content": "This code defines a Convolutional Temporal Relational Graph Convolutional Network (CTRGCN) backbone for a video model. It initializes weights and performs forward pass calculations. It uses convolution layers, tanh activation function, and optionally includes an additional input A.", + "type": "comment" + }, + "6348": { + "file_id": 494, + "content": " # We only support 'paddle.einsum()' in dynamic graph mode, if use in infer model please implement self.\n # x1 = paddle.einsum('ncuv,nctv->nctu', x1, x3)\n x1 = einsum(x1, x3)\n return x1\nclass TemporalConv(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n dilation=1):\n super(TemporalConv, self).__init__()\n pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2\n self.conv = nn.Conv2D(in_channels,\n out_channels,\n kernel_size=(kernel_size, 1),\n padding=(pad, 0),\n stride=(stride, 1),\n dilation=(dilation, 1))\n self.bn = nn.BatchNorm2D(out_channels)\n def forward(self, x):\n x = self.conv(x)\n x = self.bn(x)\n return x\nclass MultiScale_TemporalConv(nn.Layer):\n def __init__(self,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:94-127" + }, + "6349": { + "file_id": 494, + "content": "Code snippet defines a class TemporalConv, which is a 2D convolutional layer for temporal data. It inherits from the paddle.nn.Layer and includes an instance of nn.Conv2D and nn.BatchNorm2D layers. 
The MultiScale_TemporalConv class is also defined but its implementation is missing, suggesting it extends TemporalConv with multiple temporal convolution blocks for multi-scale processing.", + "type": "comment" + }, + "6350": { + "file_id": 494, + "content": " in_channels,\n out_channels,\n kernel_size=3,\n stride=1,\n dilations=[1, 2, 3, 4],\n residual=True,\n residual_kernel_size=1):\n super(MultiScale_TemporalConv, self).__init__()\n assert out_channels % (\n len(dilations) +\n 2) == 0, '# out channels should be multiples of # branches'\n # Multiple branches of temporal convolution\n self.num_branches = len(dilations) + 2\n branch_channels = out_channels // self.num_branches\n if type(kernel_size) == list:\n assert len(kernel_size) == len(dilations)\n else:\n kernel_size = [kernel_size] * len(dilations)\n # Temporal Convolution branches\n self.branches = nn.LayerList([\n nn.Sequential(\n nn.Conv2D(in_channels,\n branch_channels,\n kernel_size=1,\n padding=0),\n nn.BatchNorm2D(branch_channels),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:128-155" + }, + "6351": { + "file_id": 494, + "content": "This code defines a MultiScale_TemporalConv layer with multiple branches of temporal convolution. The number of branches is determined by the dilations, and out channels should be multiples of the number of branches for correct operation. Each branch has its own kernel size, and there are Conv2D layers followed by BatchNorm2D for each branch.", + "type": "comment" + }, + "6352": { + "file_id": 494, + "content": " nn.ReLU(),\n TemporalConv(branch_channels,\n branch_channels,\n kernel_size=ks,\n stride=stride,\n dilation=dilation),\n ) for ks, dilation in zip(kernel_size, dilations)\n ])\n # Additional Max & 1x1 branch\n self.branches.append(\n nn.Sequential(\n nn.Conv2D(in_channels,\n branch_channels,\n kernel_size=1,\n padding=0), nn.BatchNorm2D(branch_channels),\n nn.ReLU(),\n nn.MaxPool2D(kernel_size=(3, 1),\n stride=(stride, 1),\n padding=(1, 0)), nn.BatchNorm2D(branch_channels)))\n self.branches.append(\n nn.Sequential(\n nn.Conv2D(in_channels,\n branch_channels,\n kernel_size=1,\n padding=0,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:156-182" + }, + "6353": { + "file_id": 494, + "content": "This code defines a Conv-Temporal RGN backbone model for video analysis. It consists of multiple branches with various convolutional and pooling layers, including TemporalConv and MaxPool2D operations. 
Each branch is appended to the model and configured with its own settings, such as kernel size and dilation rate.", + "type": "comment" + }, + "6354": { + "file_id": 494, + "content": " stride=(stride, 1)), nn.BatchNorm2D(branch_channels)))\n # Residual connection\n if not residual:\n self.residual = lambda x: 0\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = lambda x: x\n else:\n self.residual = TemporalConv(in_channels,\n out_channels,\n kernel_size=residual_kernel_size,\n stride=stride)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n # initialize\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n conv_init(m)\n elif isinstance(m, nn.BatchNorm2D):\n weight_init_(m.weight, 'Normal', std=0.02, mean=1.0)\n nn.initializer.Constant(value=0.0)(m.bias)\n def forward(self, x):\n # Input dim: (N,C,T,V)\n res = self.residual(x)\n branch_outs = []\n for tempconv in self.branches:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:183-211" + }, + "6355": { + "file_id": 494, + "content": "This code completes the MultiScale_TemporalConv layer of the CTR-GCN (Channel-wise Topology Refinement Graph Convolution) backbone. It configures the residual connection as zero, identity, or a TemporalConv projection depending on the channel counts and stride, provides an init_weights function that initializes the convolution and batch normalization sublayers, and begins the forward pass, which computes the residual and iterates over the temporal convolution branches.", + "type": "comment" + }, + "6356": { + "file_id": 494, + "content": " out = tempconv(x)\n branch_outs.append(out)\n out = paddle.concat(branch_outs, axis=1)\n out += res\n return out\nclass unit_tcn(nn.Layer):\n def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):\n super(unit_tcn, self).__init__()\n pad = int((kernel_size - 1) / 2)\n self.conv = nn.Conv2D(in_channels,\n out_channels,\n kernel_size=(kernel_size, 1),\n padding=(pad, 0),\n stride=(stride, 1))\n self.bn = nn.BatchNorm2D(out_channels)\n self.relu = nn.ReLU()\n conv_init(self.conv)\n bn_init(self.bn, 1)\n def forward(self, x):\n x = self.bn(self.conv(x))\n return x\nclass unit_gcn(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n A,\n coff_embedding=4,\n adaptive=True,\n residual=True):\n super(unit_gcn, self).__init__()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:212-250" + }, + "6357": { + "file_id": 494, + "content": "This code defines two classes: \"unit_tcn\" and \"unit_gcn\". The \"unit_tcn\" class is a Temporal Convolutional Network unit that performs temporal convolution with batch normalization and ReLU activation. 
The \"unit_gcn\" class is a Graph Convolutional Network unit that takes input channels, output channels, adjacency matrix A, coefficient embedding, adaptive flag, and residual flag as parameters.", + "type": "comment" + }, + "6358": { + "file_id": 494, + "content": " inter_channels = out_channels // coff_embedding\n self.inter_c = inter_channels\n self.out_c = out_channels\n self.in_c = in_channels\n self.adaptive = adaptive\n self.num_subset = A.shape[0]\n self.convs = nn.LayerList()\n for i in range(self.num_subset):\n self.convs.append(CTRGC(in_channels, out_channels))\n if residual:\n if in_channels != out_channels:\n self.down = nn.Sequential(\n nn.Conv2D(in_channels, out_channels, 1),\n nn.BatchNorm2D(out_channels))\n else:\n self.down = lambda x: x\n else:\n self.down = lambda x: 0\n if self.adaptive:\n pa_param = paddle.ParamAttr(\n initializer=paddle.nn.initializer.Assign(A.astype(np.float32)))\n self.PA = paddle.create_parameter(shape=A.shape,\n dtype='float32',\n attr=pa_param)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:251-276" + }, + "6359": { + "file_id": 494, + "content": "This code initializes a CTRGC model with specified input and output channels. It also includes optional residual connection, batch normalization, and adaptive parameterization. The number of subsets is determined by the shape of A. If adaptive is set to True, it creates a trainable parameter for the subset of weights.", + "type": "comment" + }, + "6360": { + "file_id": 494, + "content": " else:\n A_tensor = paddle.to_tensor(A, dtype=\"float32\")\n self.A = paddle.create_parameter(\n shape=A_tensor.shape,\n dtype='float32',\n default_initializer=paddle.nn.initializer.Assign(A_tensor))\n self.A.stop_gradient = True\n alpha_tensor = paddle.to_tensor(np.zeros(1), dtype=\"float32\")\n self.alpha = paddle.create_parameter(\n shape=alpha_tensor.shape,\n dtype='float32',\n default_initializer=paddle.nn.initializer.Assign(alpha_tensor))\n self.bn = nn.BatchNorm2D(out_channels)\n self.soft = nn.Softmax(-2)\n self.relu = nn.ReLU()\n def init_weights(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n conv_init(m)\n elif isinstance(m, nn.BatchNorm2D):\n bn_init(m, 1)\n bn_init(self.bn, 1e-6)\n def forward(self, x):\n y = None\n if self.adaptive:\n A = self.PA\n else:\n A = self.A.cuda(x.get_device())", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:277-306" + }, + "6361": { + "file_id": 494, + "content": "This code initializes the parameters A and alpha, sets up batch normalization (bn) layers with Softmax and ReLU activation functions, initializes weights using conv_init and bn_init functions, and defines a forward pass that adapts A based on the adaptive flag.", + "type": "comment" + }, + "6362": { + "file_id": 494, + "content": " for i in range(self.num_subset):\n z = self.convs[i](x, A[i], self.alpha)\n y = z + y if y is not None else z\n y = self.bn(y)\n y += self.down(x)\n y = self.relu(y)\n return y\nclass TCN_GCN_unit(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n A,\n stride=1,\n residual=True,\n adaptive=True,\n kernel_size=5,\n dilations=[1, 2]):\n super(TCN_GCN_unit, self).__init__()\n self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive)\n self.tcn1 = MultiScale_TemporalConv(out_channels,\n out_channels,\n kernel_size=kernel_size,\n stride=stride,\n dilations=dilations,\n residual=False)\n self.relu = nn.ReLU()", + "type": "code", + "location": 
"/paddlevideo/modeling/backbones/ctrgcn.py:307-335" + }, + "6363": { + "file_id": 494, + "content": "This code defines a TCN_GCN_unit class, which is a combination of Graph Convolutional Network (GCN) and Temporal Convolution units. The unit takes input channels, output channels, adjacency matrix A, stride, residual connection, adaptive flag, kernel size, and dilations as parameters. It initializes the GCN and TemporalConv layers, followed by a ReLU activation function.", + "type": "comment" + }, + "6364": { + "file_id": 494, + "content": " if not residual:\n self.residual = lambda x: 0\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = lambda x: x\n else:\n self.residual = unit_tcn(in_channels,\n out_channels,\n kernel_size=1,\n stride=stride)\n def forward(self, x):\n y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x))\n return y\nclass NTUDGraph:\n def __init__(self, labeling_mode='spatial'):\n num_node = 25\n self_link = [(i, i) for i in range(num_node)]\n inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),\n (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),\n (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),\n (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),\n (23, 8), (24, 25), (25, 12)]\n inward = [(i - 1, j - 1) for (i, j) in inward_ori_index]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:336-363" + }, + "6365": { + "file_id": 494, + "content": "The code defines a `CTRGCN` class with a `forward` method and an `NTUDGraph` class. The `forward` method takes input `x`, applies `relu` activation and adds the residual output of a `unit_tcn` layer or simply passes through if specified conditions are met. The `NTUDGraph` initializes with a fixed number of nodes, self-links, and inward connections.", + "type": "comment" + }, + "6366": { + "file_id": 494, + "content": " outward = [(j, i) for (i, j) in inward]\n neighbor = inward + outward\n self.num_node = num_node\n self.self_link = self_link\n self.inward = inward\n self.outward = outward\n self.neighbor = neighbor\n self.A = self.get_adjacency_matrix(labeling_mode)\n def edge2mat(self, link, num_node):\n A = np.zeros((num_node, num_node))\n for i, j in link:\n A[j, i] = 1\n return A\n def normalize_digraph(self, A):\n Dl = np.sum(A, 0)\n h, w = A.shape\n Dn = np.zeros((w, w))\n for i in range(w):\n if Dl[i] > 0:\n Dn[i, i] = Dl[i]**(-1)\n AD = np.dot(A, Dn)\n return AD\n def get_spatial_graph(self, num_node, self_link, inward, outward):\n I = self.edge2mat(self_link, num_node)\n In = self.normalize_digraph(self.edge2mat(inward, num_node))\n Out = self.normalize_digraph(self.edge2mat(outward, num_node))\n A = np.stack((I, In, Out))\n return A\n def get_adjacency_matrix(self, labeling_mode=None):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:364-397" + }, + "6367": { + "file_id": 494, + "content": "Function `get_adjacency_matrix` generates adjacency matrices for the model. The function takes a parameter `labeling_mode`, which is optional. It initializes a set of variables: `inward`, `outward`, and `neighbor`. These variables store the connections between nodes in both directions. Then, it calls other helper functions to generate normalized adjacency matrices for self-links, inward edges, outward edges, and finally returns an array containing all these matrices. 
This is useful for inputting into a model that requires specific formatted input data.", + "type": "comment" + }, + "6368": { + "file_id": 494, + "content": " if labeling_mode is None:\n return self.A\n if labeling_mode == 'spatial':\n A = self.get_spatial_graph(self.num_node, self.self_link,\n self.inward, self.outward)\n else:\n raise ValueError()\n return A\n@BACKBONES.register()\nclass CTRGCN(nn.Layer):\n \"\"\"\n CTR-GCN model from:\n `\"Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition\" `_\n Args:\n num_point: int, numbers of sketeton point.\n num_person: int, numbers of person.\n base_channel: int, model's hidden dim.\n graph: str, sketeton adjacency matrix name.\n graph_args: dict, sketeton adjacency graph class args.\n in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 3.\n adaptive: bool, if adjacency matrix can adaptive.\n \"\"\"\n def __init__(self,\n num_point=25,\n num_person=2,\n base_channel=64,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:398-426" + }, + "6369": { + "file_id": 494, + "content": "This code is part of the CTRGCN class in the PaddleVideo library, which represents a specific type of model for skeleton-based action recognition. The function within this code block is used to return an adjacency matrix (A) based on a given labeling mode. If no labeling mode is specified, it returns the adjacency matrix from the instance variables. If the labeling mode is set to 'spatial', it calls another function to generate a spatial adjacency graph. Otherwise, if an invalid labeling mode is provided, it raises a ValueError exception.", + "type": "comment" + }, + "6370": { + "file_id": 494, + "content": " graph='ntu_rgb_d',\n graph_args=dict(),\n in_channels=3,\n adaptive=True):\n super(CTRGCN, self).__init__()\n if graph == 'ntu_rgb_d':\n self.graph = NTUDGraph(**graph_args)\n else:\n raise ValueError()\n A = self.graph.A # 3,25,25\n self.num_point = num_point\n self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point)\n self.base_channel = base_channel\n self.l1 = TCN_GCN_unit(in_channels,\n self.base_channel,\n A,\n residual=False,\n adaptive=adaptive)\n self.l2 = TCN_GCN_unit(self.base_channel,\n self.base_channel,\n A,\n adaptive=adaptive)\n self.l3 = TCN_GCN_unit(self.base_channel,\n self.base_channel,\n A,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:427-455" + }, + "6371": { + "file_id": 494, + "content": "This code defines the CTRGCN class, which initializes its graph and layers based on input parameters. It includes a batch normalization layer (data_bn) and three TCN_GCN_unit layers (l1, l2, l3). The graph is determined by the 'graph' parameter, with NTUDGraph used if 'ntu_rgb_d'. 
If another graph is provided, it raises a ValueError.", + "type": "comment" + }, + "6372": { + "file_id": 494, + "content": " adaptive=adaptive)\n self.l4 = TCN_GCN_unit(self.base_channel,\n self.base_channel,\n A,\n adaptive=adaptive)\n self.l5 = TCN_GCN_unit(self.base_channel,\n self.base_channel * 2,\n A,\n stride=2,\n adaptive=adaptive)\n self.l6 = TCN_GCN_unit(self.base_channel * 2,\n self.base_channel * 2,\n A,\n adaptive=adaptive)\n self.l7 = TCN_GCN_unit(self.base_channel * 2,\n self.base_channel * 2,\n A,\n adaptive=adaptive)\n self.l8 = TCN_GCN_unit(self.base_channel * 2,\n self.base_channel * 4,\n A,\n stride=2,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:456-477" + }, + "6373": { + "file_id": 494, + "content": "The code initializes six TCN_GCN_unit layers, each with different configurations, for a CTRGCN model. The first layer (l4) has the base channel as input and output. Following layers (l5 to l8) increase the number of channels or apply strides. This represents a deep TCN-GCN architecture with progressively increasing depth and downsampling.", + "type": "comment" + }, + "6374": { + "file_id": 494, + "content": " adaptive=adaptive)\n self.l9 = TCN_GCN_unit(self.base_channel * 4,\n self.base_channel * 4,\n A,\n adaptive=adaptive)\n self.l10 = TCN_GCN_unit(self.base_channel * 4,\n self.base_channel * 4,\n A,\n adaptive=adaptive)\n def init_weights(self):\n bn_init(self.data_bn, 1)\n def forward(self, x):\n N, C, T, V, M = x.shape\n x = paddle.transpose(x, perm=[0, 4, 3, 1, 2])\n x = paddle.reshape(x, (N, M * V * C, T))\n x = self.data_bn(x)\n x = paddle.reshape(x, (N, M, V, C, T))\n x = paddle.transpose(x, perm=(0, 1, 3, 4, 2))\n x = paddle.reshape(x, (N * M, C, T, V))\n x = self.l1(x)\n x = self.l2(x)\n x = self.l3(x)\n x = self.l4(x)\n x = self.l5(x)\n x = self.l6(x)\n x = self.l7(x)\n x = self.l8(x)\n x = self.l9(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:478-511" + }, + "6375": { + "file_id": 494, + "content": "This code defines a neural network model with multiple layers. It uses Paddle's TCN_GCN_unit in the last two layers. The init_weights function initializes batch normalization for the data_bn layer, and the forward function processes input through multiple layers before returning the final output.", + "type": "comment" + }, + "6376": { + "file_id": 494, + "content": " x = self.l10(x)\n return x, N, M", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ctrgcn.py:512-514" + }, + "6377": { + "file_id": 494, + "content": "This code represents the final step of a neural network function. It applies layer 10 (l10) to input x, and returns both the updated x and the original N, M values. This function seems to be part of a larger model, as it references previous layers.", + "type": "comment" + }, + "6378": { + "file_id": 495, + "content": "/paddlevideo/modeling/backbones/darknet.py", + "type": "filepath" + }, + "6379": { + "file_id": 495, + "content": "This code defines a ConvBNLayer class and Darknet backbone, performing convolutions, pooling, and reorganization in a neural network. It concatenates results from two branches, applies more convolutions, and returns final output.", + "type": "summary" + }, + "6380": { + "file_id": 495, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n input_channels,\n output_channels,\n filter_size,\n stride,\n padding,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = nn.Conv2D(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/darknet.py:1-32" + }, + "6381": { + "file_id": 495, + "content": "This code defines a ConvBNLayer class that inherits from nn.Layer and includes a Conv2D layer, Batch Normalization, and other parameters like input_channels, output_channels, filter_size, stride, padding, and name.", + "type": "comment" + }, + "6382": { + "file_id": 495, + "content": " in_channels=input_channels,\n out_channels=output_channels,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n weight_attr=ParamAttr(name=name + \".conv.weights\"),\n bias_attr=False)\n bn_name = name + \".bn\"\n self._bn = nn.BatchNorm(\n num_channels=output_channels,\n act=\"leaky_relu\",\n param_attr=ParamAttr(name=bn_name + \".scale\"),\n bias_attr=ParamAttr(name=bn_name + \".offset\"),\n moving_mean_name=bn_name + \".mean\",\n moving_variance_name=bn_name + \".var\")\n def forward(self, inputs):\n x = self._conv(inputs)\n x = self._bn(x)\n return x\nclass BasicBlock(nn.Layer):\n def __init__(self, input_channels, output_channels, name=None):\n super(BasicBlock, self).__init__()\n self._conv1 = ConvBNLayer(input_channels=input_channels, output_channels=output_channels, filter_size=[\n 3, 3], stride=1, padding=1, name=name+'.0')", + "type": "code", + "location": "/paddlevideo/modeling/backbones/darknet.py:33-61" + }, + "6383": { + "file_id": 495, + "content": "This code defines a convolutional neural network block with batch normalization and leaky ReLU activation. 
The forward function applies the convolution followed by batch normalization, and BasicBlock is a subclass of nn.Layer representing a single block in the model architecture.", + "type": "comment" + }, + "6384": { + "file_id": 495, + "content": " self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)\n self._conv2 = ConvBNLayer(input_channels=output_channels, output_channels=output_channels *\n 2, filter_size=[3, 3], stride=1, padding=1, name=name+'.1')\n self._conv3 = ConvBNLayer(input_channels=output_channels*2, output_channels=output_channels,\n filter_size=[1, 1], stride=1, padding=0, name=name+'.2')\n def forward(self, x):\n x = self._conv1(x)\n x = self._max_pool(x)\n x = self._conv2(x)\n x = self._conv3(x)\n return x\nclass Reorg(nn.Layer):\n def __init__(self, stride=2):\n super(Reorg, self).__init__()\n self.stride = stride\n def forward(self, x):\n stride = self.stride\n assert (x.dim() == 4)\n B = x.shape[0]\n C = x.shape[1]\n H = x.shape[2]\n W = x.shape[3]\n assert (H % stride == 0)\n assert (W % stride == 0)\n ws = stride\n hs = stride\n x = x.reshape([B, C, H // hs, hs, W // ws, ws]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/darknet.py:62-92" + }, + "6385": { + "file_id": 495, + "content": "Code defines a Darknet backbone with ConvBNLayer and MaxPooling layers, followed by Reorg layer for spatial downsampling.", + "type": "comment" + }, + "6386": { + "file_id": 495, + "content": " ).transpose([0, 1, 2, 4, 3, 5])\n x = x.reshape([B, C, H // hs * W // ws, hs * ws]\n ).transpose([0, 1, 3, 2])\n x = x.reshape([B, C, hs * ws, H // hs, W // ws]\n ).transpose([0, 2, 1, 3, 4])\n x = x.reshape([B, hs * ws * C, H // hs, W // ws])\n return x\nclass Darknet(nn.Layer):\n def __init__(self, pretrained=None):\n super(Darknet, self).__init__()\n self.pretrained = pretrained\n self._conv1 = ConvBNLayer(\n input_channels=3, output_channels=32, filter_size=3, stride=1, padding=1, name='input')\n self._max_pool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)\n self._basic_block_11 = BasicBlock(\n input_channels=32, output_channels=64, name='1.1')\n self._basic_block_12 = BasicBlock(\n input_channels=64, output_channels=128, name='1.2')\n self._basic_block_13 = BasicBlock(\n input_channels=128, output_channels=256, name='1.3')\n self._conv2 = ConvBNLayer(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/darknet.py:93-115" + }, + "6387": { + "file_id": 495, + "content": "This code reshapes the input tensor and performs a sequence of transpose operations to rearrange dimensions. 
The code is part of a Darknet class, which inherits from nn.Layer and contains various ConvBNLayer and BasicBlock instances for building a convolutional neural network.", + "type": "comment" + }, + "6388": { + "file_id": 495, + "content": " input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='up1')\n self._conv3 = ConvBNLayer(\n input_channels=512, output_channels=256, filter_size=1, stride=1, padding=0, name='down1')\n self._conv4 = ConvBNLayer(\n input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='2.1')\n self._max_pool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)\n self._conv5 = ConvBNLayer(\n input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='2.2')\n self._conv6 = ConvBNLayer(input_channels=1024, output_channels=512,\n filter_size=1, stride=1, padding=0, name='2.3') # ori\n self._conv7 = ConvBNLayer(\n input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='up2')\n self._conv8 = ConvBNLayer(input_channels=1024, output_channels=512,\n filter_size=1, stride=1, padding=0, name='down2')", + "type": "code", + "location": "/paddlevideo/modeling/backbones/darknet.py:116-129" + }, + "6389": { + "file_id": 495, + "content": "The code defines a series of ConvBNLayer objects for the Darknet backbone. These layers include upsampling, downsampling, and convolution operations with different filter sizes and strides. The ConvBNLayer class is used to perform convolutions followed by batch normalization.", + "type": "comment" + }, + "6390": { + "file_id": 495, + "content": " self._conv9 = ConvBNLayer(\n input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.1')\n self._conv10 = ConvBNLayer(\n input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.2')\n self._conv11 = ConvBNLayer(\n input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.3')\n self._conv12 = ConvBNLayer(\n input_channels=512, output_channels=64, filter_size=1, stride=1, padding=0, name='4.1')\n self._reorg = Reorg()\n self._conv13 = ConvBNLayer(\n input_channels=1280, output_channels=1024, filter_size=3, stride=1, padding=1, name='5.1')\n self._conv14 = nn.Conv2D(1024, 425, kernel_size=1)\n def forward(self, inputs):\n x = self._conv1(inputs)\n x = self._max_pool1(x)\n x = self._basic_block_11(x)\n x = self._basic_block_12(x)\n x = self._basic_block_13(x)\n x = self._conv2(x)\n x = self._conv3(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/darknet.py:130-150" + }, + "6391": { + "file_id": 495, + "content": "This code defines a neural network backbone with multiple convolutional layers, batch normalization, and pooling operations. The forward method implements the network's processing flow for input images.", + "type": "comment" + }, + "6392": { + "file_id": 495, + "content": " ori = self._conv4(x)\n x = self._max_pool2(ori)\n x = self._conv5(x)\n x = self._conv6(x)\n x = self._conv7(x)\n x = self._conv8(x)\n x = self._conv9(x)\n x = self._conv10(x)\n x1 = self._conv11(x)\n x2 = self._conv12(ori)\n x2 = self._reorg(x2)\n x = paddle.concat([x2, x1], 1)\n x = self._conv13(x)\n x = self._conv14(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/darknet.py:151-165" + }, + "6393": { + "file_id": 495, + "content": "The code performs multiple convolutional operations, followed by pooling and reorganization. 
It concatenates the results of two separate branches, then applies further convolutions before returning the final output.", + "type": "comment" + }, + "6394": { + "file_id": 496, + "content": "/paddlevideo/modeling/backbones/deeplab.py", + "type": "filepath" + }, + "6395": { + "file_id": 496, + "content": "This code constructs a PaddlePaddle DeepLab network with convolution layers, batch normalization and activation functions in Bottleneck and ResNet classes. It includes additional layers for better performance, initializes ASPP modules in the DeepLab model for feature extraction, defines a segmentation model with ResNet backbone, adaptive pooling, and Decoder modules, and performs inference using forward function.", + "type": "summary" + }, + "6396": { + "file_id": 496, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport copy\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nclass FrozenBatchNorm2D(nn.Layer):\n \"\"\"\n BatchNorm2D where the batch statistics and the affine parameters\n are fixed\n \"\"\"\n def __init__(self, n, epsilon=1e-5):\n super(FrozenBatchNorm2D, self).__init__()\n x1 = paddle.ones([n])\n x2 = paddle.zeros([n])\n weight = self.create_parameter(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:1-33" + }, + "6397": { + "file_id": 496, + "content": "This code defines a class `FrozenBatchNorm2D` which is a type of batch normalization layer where the batch statistics and affine parameters are fixed. It inherits from `nn.Layer` and initializes `paddle.ones` and `paddle.zeros` tensors as its parameters, representing fixed batch statistics and affine transformation.", + "type": "comment" + }, + "6398": { + "file_id": 496, + "content": " shape=x1.shape, default_initializer=nn.initializer.Assign(x1))\n bias = self.create_parameter(\n shape=x2.shape, default_initializer=nn.initializer.Assign(x2))\n running_mean = self.create_parameter(\n shape=x2.shape, default_initializer=nn.initializer.Assign(x2))\n running_var = self.create_parameter(\n shape=x1.shape, default_initializer=nn.initializer.Assign(x1))\n self.add_parameter('weight', weight)\n self.add_parameter('bias', bias)\n self.add_parameter('running_mean', running_mean)\n self.add_parameter('running_var', running_var)\n self.epsilon = epsilon\n def forward(self, x):\n scale = self.weight * paddle.rsqrt((self.running_var + self.epsilon))\n bias = self.bias - self.running_mean * scale\n scale = paddle.reshape(scale, [1, -1, 1, 1])\n bias = paddle.reshape(bias, [1, -1, 1, 1])\n return x * scale + bias\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self,\n inplanes,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:34-59" + }, + "6399": { + "file_id": 496, + "content": "The code defines a DeepLab class, initializes its parameters, and creates a Bottleneck layer. 
The DeepLab class contains a weight parameter for the convolution operation, a bias parameter to adjust output, and running_mean and running_var parameters used in normalization. The Bottleneck layer has an expansion factor of 4, implying it will increase the number of channels by this factor. This code is part of a neural network backbone implementation using PaddlePaddle framework.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/64.json b/docs/data/64.json new file mode 100644 index 000000000..1ad6633ba --- /dev/null +++ b/docs/data/64.json @@ -0,0 +1,549 @@ +{ + "6400": { + "file_id": 496, + "content": " planes,\n stride=1,\n dilation=1,\n downsample=None,\n BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)\n self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes,\n planes * 4,\n kernel_size=1,\n bias_attr=False)\n self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n def forward(self, x):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:60-86" + }, + "6401": { + "file_id": 496, + "content": "Bottleneck class is a convolution neural network layer with batch normalization, designed for DeepLab model. It consists of 3 consecutive convolutions with varying kernel sizes and stride. BatchNorm layers are used after each convolution to normalize the activations, followed by ReLU activation function. The output channels are scaled by 4 in the final convolution.", + "type": "comment" + }, + "6402": { + "file_id": 496, + "content": " residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNet(nn.Layer):\n def __init__(self,\n block,\n layers,\n output_stride,\n BatchNorm,\n pretrained=False):\n self.inplanes = 64\n super(ResNet, self).__init__()\n blocks = [1, 2, 4]\n if output_stride == 16:\n strides = [1, 2, 2, 1]\n dilations = [1, 1, 1, 2]\n elif output_stride == 8:\n strides = [1, 2, 1, 1]\n dilations = [1, 1, 2, 4]\n else:\n raise NotImplementedError\n # Modules\n self.conv1 = nn.Conv2D(3,\n 64,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:87-130" + }, + "6403": { + "file_id": 496, + "content": "Code snippet performs residual block operations using convolutional layers and batch normalization with ReLU activation. It also includes downsampling if specified, and returns the output after applying final ReLU. 
The ResNet class is initialized with a block type, per-stage layer counts, the output stride, a BatchNorm type, and a pretrained option.", + "type": "comment" + }, + "6404": { + "file_id": 496, + "content": " kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block,\n 64,\n layers[0],\n stride=strides[0],\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block,\n 128,\n layers[1],\n stride=strides[1],\n dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block,\n 256,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:131-152" + }, + "6405": { + "file_id": 496, + "content": "This code builds the stem and residual stages of the ResNet used as the DeepLab backbone, combining convolution, batch normalization, ReLU, and max pooling, and it specifies parameters such as kernel sizes, strides, padding, and dilation rates for each stage. The BatchNorm argument lets the caller choose which normalization layer is applied after each convolution.", + "type": "comment" + }, + "6406": { + "file_id": 496, + "content": " layers[2],\n stride=strides[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.layer4 = self._make_MG_unit(block,\n 512,\n blocks=blocks,\n stride=strides[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self._init_weight()\n def _make_layer(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:153-176" + }, + "6407": { + "file_id": 496, + "content": "This code finishes building layer3 with the _make_layer method and creates layer4 with the _make_MG_unit method, passing parameters such as block, planes, blocks, stride, dilation, and BatchNorm. It then defines _make_layer itself, which adds a downsampling branch when the stride is not 1 or inplanes does not equal planes * block.expansion.", + "type": "comment" + }, + "6408": { + "file_id": 496, + "content": " kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, stride, dilation, downsample,\n BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(self.inplanes,\n planes,\n dilation=dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_MG_unit(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:177-207" + }, + "6409": { + "file_id": 496, + "content": "This code defines a function _make_MG_unit that creates a module for the DeepLab model, which includes multiple layers of a specified block. It takes in parameters such as block, planes, blocks, stride, dilation, and BatchNorm (optional). The function first checks if downsampling is needed based on stride and inplanes. If so, it creates a Conv2D layer for downsampling. 
Then, it appends the initial layer with the specified parameters and expands the number of layers as required. Finally, it returns the created sequence of layers as a nn.Sequential module.", + "type": "comment" + }, + "6410": { + "file_id": 496, + "content": " planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes,\n planes,\n stride,\n dilation=blocks[0] * dilation,\n downsample=downsample,\n BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, len(blocks)):\n layers.append(\n block(self.inplanes,\n planes,\n stride=1,\n dilation=blocks[i] * dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def forward(self, input, return_mid_level=False):\n x = self.conv1(input)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:208-240" + }, + "6411": { + "file_id": 496, + "content": "This code defines a function that creates a convolutional neural network for the DeepLab model. It takes input, creates layers with specified parameters, and returns a Sequential object representing the model.", + "type": "comment" + }, + "6412": { + "file_id": 496, + "content": " low_level_feat = x\n x = self.layer2(x)\n mid_level_feat = x\n x = self.layer3(x)\n x = self.layer4(x)\n if return_mid_level:\n return x, low_level_feat, mid_level_feat\n else:\n return x, low_level_feat\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation,\n BatchNorm):\n super(_ASPPModule, self).__init__()\n self.atrous_conv = nn.Conv2D(inplanes,\n planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:241-269" + }, + "6413": { + "file_id": 496, + "content": "This code defines a DeepLab model that utilizes an ASPP module. It extracts low and mid-level features from the input, has multiple layers of convolutions, and initializes weights using specific initializers. 
The ASPP module applies atrous convolutions with different dilation rates for feature extraction.", + "type": "comment" + }, + "6414": { + "file_id": 496, + "content": " bias_attr=False)\n self.bn = BatchNorm(planes)\n self.relu = nn.ReLU()\n self._init_weight()\n def forward(self, x):\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n m.weight_attr = nn.initializer.KaimingNormal()\n elif isinstance(m, nn.BatchNorm2D):\n m.weight.data.fill_(1)\n m.bias.data.zero_()\nclass ASPP(nn.Layer):\n def __init__(self, backbone, output_stride, BatchNorm):\n super(ASPP, self).__init__()\n if backbone == 'drn':\n inplanes = 512\n elif backbone == 'mobilenet':\n inplanes = 320\n else:\n inplanes = 2048\n if output_stride == 16:\n dilations = [1, 6, 12, 18]\n elif output_stride == 8:\n dilations = [1, 12, 24, 36]\n else:\n raise NotImplementedError\n self.aspp1 = _ASPPModule(inplanes,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:270-307" + }, + "6415": { + "file_id": 496, + "content": "The code defines a DeepLab class with an ASPP module for feature extraction. It initializes the layers and sets their weights using Kaiming normal initialization or fills BatchNorm2D weight with 1 and biases with 0. The ASPP class accepts backbone and output_stride as parameters to determine dilations for the ASPP modules.", + "type": "comment" + }, + "6416": { + "file_id": 496, + "content": " 256,\n 1,\n padding=0,\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.aspp2 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[1],\n dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.aspp3 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.aspp4 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:308-330" + }, + "6417": { + "file_id": 496, + "content": "This code initializes four instances of the _ASPPModule class, each with different dilation rates and padding values for the DeepLab model's ASPP feature extraction module. The inplanes parameter is consistent across all four modules, indicating the number of input feature planes. BatchNorm specifies whether to apply batch normalization or not.", + "type": "comment" + }, + "6418": { + "file_id": 496, + "content": " self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False),\n BatchNorm(256), nn.ReLU())\n self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False)\n self.bn1 = BatchNorm(256)\n self.relu = nn.ReLU()\n self.dropout = nn.Dropout(0.1)\n self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat(x=[x1, x2, x3, x4, x5], axis=1)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return self.dropout(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:332-363" + }, + "6419": { + "file_id": 496, + "content": "This code defines a DeepLab backbone model for image segmentation. 
It has adaptive global average pooling, multiple ASPP modules, and convolutional layers with batch normalization, ReLU activation, and dropout regularization. The constructor initializes the model's sublayers with Kaiming Normal initialization.", + "type": "comment" + }, + "6420": { + "file_id": 496, + "content": " elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass Decoder(nn.Layer):\n def __init__(self, backbone, BatchNorm):\n super(Decoder, self).__init__()\n if backbone == 'resnet':\n low_level_inplanes = 256\n elif backbone == 'mobilenet':\n raise NotImplementedError\n else:\n raise NotImplementedError\n self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False)\n self.bn1 = BatchNorm(48)\n self.relu = nn.ReLU()\n self.last_conv = nn.Sequential(\n nn.Conv2D(304,\n 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(),\n nn.Sequential(),\n nn.Conv2D(256,\n 256,\n kernel_size=3,\n stride=1,\n padding=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:364-395" + }, + "6421": { + "file_id": 496, + "content": "This code is defining a Decoder class that takes in a backbone and BatchNorm as arguments. It initializes a convolution layer, batch normalization layer, and ReLU activation function. The last convolution sequence includes two convolutional layers with BatchNorm between them, followed by an optional second sequence of layers.", + "type": "comment" + }, + "6422": { + "file_id": 496, + "content": " bias_attr=False), BatchNorm(256), nn.ReLU(),\n nn.Sequential())\n self._init_weight()\n def forward(self, x, low_level_feat):\n low_level_feat = self.conv1(low_level_feat)\n low_level_feat = self.bn1(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat(x=[x, low_level_feat], axis=1)\n x = self.last_conv(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass DeepLab(nn.Layer):\n \"\"\"DeepLab model for segmentation\"\"\"\n def __init__(self, backbone='resnet', output_stride=16, freeze_bn=True):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:396-426" + }, + "6423": { + "file_id": 496, + "content": "The provided code defines a DeepLab model for segmentation. It includes a convolution layer, batch normalization, ReLU activation function, and interpolation operation. The forward method processes input features and returns output features. The _init_weight method initializes the weight of each sublayer. 
The DeepLab class takes parameters like backbone type, output stride, and freeze batch normalization flag for model initialization.", + "type": "comment" + }, + "6424": { + "file_id": 496, + "content": " super(DeepLab, self).__init__()\n if freeze_bn == True:\n print(\"Use frozen BN in DeepLab!\")\n BatchNorm = FrozenBatchNorm2D\n else:\n BatchNorm = nn.BatchNorm2D\n self.backbone = ResNet(Bottleneck, [3, 4, 23, 3],\n output_stride,\n BatchNorm,\n pretrained=True)\n self.aspp = ASPP(backbone, output_stride, BatchNorm)\n self.decoder = Decoder(backbone, BatchNorm)\n def forward(self, input, return_aspp=False):\n \"\"\"forward function\"\"\"\n if return_aspp:\n x, low_level_feat, mid_level_feat = self.backbone(input, True)\n else:\n x, low_level_feat = self.backbone(input)\n aspp_x = self.aspp(x)\n x = self.decoder(aspp_x, low_level_feat)\n if return_aspp:\n return x, aspp_x, low_level_feat, mid_level_feat\n else:\n return x, low_level_feat", + "type": "code", + "location": "/paddlevideo/modeling/backbones/deeplab.py:427-454" + }, + "6425": { + "file_id": 496, + "content": "The code defines a DeepLab class with an optional frozen Batch Normalization layer. It initializes the backbone network (ResNet) and adds ASPP and Decoder modules. The forward function performs inference, returning either the final output or additional intermediate features depending on the return_aspp flag.", + "type": "comment" + }, + "6426": { + "file_id": 497, + "content": "/paddlevideo/modeling/backbones/movinet.py", + "type": "filepath" + }, + "6427": { + "file_id": 497, + "content": "The code defines a MoViNet model configuration with MobileNetV2 layers and parameters, constructing CNN layers and a MoViNet backbone class for video analysis. The model is configurable and can be causal or non-causal based on the 'causal' parameter.", + "type": "summary" + }, + "6428": { + "file_id": 497, + "content": "import collections.abc\nfrom itertools import repeat\nfrom typing import Any, Callable, Optional, Tuple, Union\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.layer import Identity\nfrom ..registry import BACKBONES\nfrom collections import OrderedDict\ncontainer_abcs = collections.abc\n\"\"\"Model Config\n\"\"\"\nA0 = {'block_num': [0, 1, 3, 3, 4, 4]}\nA0['conv1'] = [3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1)]\nA0['b2_l0'] = [8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1)]\nA0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)]\nA0['b3_l1'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b3_l2'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b4_l0'] = [32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0)]\nA0['b4_l1'] = [56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b4_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b5_l0'] = [56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1)]\nA0['b5_l1'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:1-27" + }, + "6429": { + "file_id": 497, + "content": "This code contains the configuration for a MOViNet model. It specifies the number of blocks, convolutional layers, and filter sizes for each stage of the network. 
The configuration is stored as nested dictionaries, with the top-level key 'A0' and per-layer keys such as 'b2_l0' describing different parts of the model architecture.", + "type": "comment" + }, + "6430": { + "file_id": 497, + "content": "A0['b5_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b5_l3'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b6_l0'] = [56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1)]\nA0['b6_l1'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]\nA0['b6_l2'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]\nA0['b6_l3'] = [104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]\nA0['conv7'] = [104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0)]\nMODEL_CONFIG = {'A0': A0}\ndef _ntuple(n):\n def parse(x):\n if isinstance(x, container_abcs.Iterable):\n return x\n return tuple(repeat(x, n))\n return parse\ndef _make_divisible(v: float,\n divisor: int,\n min_value: Optional[int] = None) -> int:\n \"\"\"\n This function is taken from the original tf repo.\n It ensures that all layers have a channel number that is divisible by 8.\n It can be seen here:\n https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:28-55" + }, + "6431": { + "file_id": 497, + "content": "This code completes the MoViNet A0 architecture configuration, specifying input/output channels, kernel sizes, strides, and padding for each layer, and stores it in the MODEL_CONFIG dictionary. The _ntuple helper expands scalar arguments into n-tuples, and _make_divisible (adapted from the TensorFlow MobileNet repository) ensures that each layer's channel count is divisible by the given divisor.", + "type": "comment" + }, + "6432": { + "file_id": 497, + "content": " \"\"\"\n if min_value is None:\n min_value = divisor\n new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n # Make sure that round down does not go down by more than 10%.\n if new_v < 0.9 * v:\n new_v += divisor\n return new_v\n_single = _ntuple(1)\n_pair = _ntuple(2)\n_triple = _ntuple(3)\n_quadruple = _ntuple(4)\nclass CausalModule(nn.Layer):\n def __init__(self) -> None:\n super().__init__()\n self.activation = None\n def reset_activation(self) -> None:\n self.activation = None\nclass Conv2dBNActivation(nn.Sequential):\n def __init__(\n self,\n in_planes: int,\n out_planes: int,\n kernel_size: Union[int, Tuple[int, int]],\n padding: Union[int, Tuple[int, int]],\n stride: Union[int, Tuple[int, int]] = 1,\n groups: int = 1,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n **kwargs: Any,\n ) -> None:\n kernel_size = _pair(kernel_size)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:56-94" + }, + "6433": { + "file_id": 497, + "content": "The code defines a CausalModule class that stores a cached activation and provides a method to reset it. 
Conv2dBNActivation is a Sequential module with optional normalization and activation layers, used in the construction of the MoviNet backbone model.", + "type": "comment" + }, + "6434": { + "file_id": 497, + "content": " stride = _pair(stride)\n padding = _pair(padding)\n if norm_layer is None:\n norm_layer = Identity\n if activation_layer is None:\n activation_layer = Identity\n self.kernel_size = kernel_size\n self.stride = stride\n dict_layers = (nn.Conv2D(in_planes,\n out_planes,\n kernel_size=kernel_size,\n stride=stride,\n padding=padding,\n groups=groups,\n **kwargs), norm_layer(out_planes,\n momentum=0.1),\n activation_layer())\n self.out_channels = out_planes\n super(Conv2dBNActivation, self).__init__(dict_layers[0], dict_layers[1],\n dict_layers[2])\nclass Conv3DBNActivation(nn.Sequential):\n def __init__(\n self,\n in_planes: int,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:95-121" + }, + "6435": { + "file_id": 497, + "content": "This code defines two classes, `Conv2dBNActivation` and `Conv3DBNActivation`, which are convolutional neural network layers with batch normalization and activation functions. The layers have adjustable input (in_planes), output (out_planes), kernel size, stride, padding, and groups parameters. The batch normalization layer uses a momentum of 0.1, and the activation function is an Identity function by default but can be overridden with another specified activation function.", + "type": "comment" + }, + "6436": { + "file_id": 497, + "content": " out_planes: int,\n kernel_size: Union[int, Tuple[int, int, int]],\n padding: Union[int, Tuple[int, int, int]],\n stride: Union[int, Tuple[int, int, int]] = 1,\n groups: int = 1,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n **kwargs: Any,\n ) -> None:\n kernel_size = _triple(kernel_size)\n stride = _triple(stride)\n padding = _triple(padding)\n if norm_layer is None:\n norm_layer = Identity\n if activation_layer is None:\n activation_layer = Identity\n self.kernel_size = kernel_size\n self.stride = stride\n dict_layers = (nn.Conv3D(in_planes,\n out_planes,\n kernel_size=kernel_size,\n stride=stride,\n padding=padding,\n groups=groups,\n **kwargs), norm_layer(out_planes,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:122-147" + }, + "6437": { + "file_id": 497, + "content": "This function is creating a Conv3D layer with specified parameters, including the number of input and output planes, kernel size, padding, stride, groups, and optional norm and activation layers. 
The function also ensures that the input values for kernel_size, stride, and padding are correctly formatted as triples.", + "type": "comment" + }, + "6438": { + "file_id": 497, + "content": " momentum=0.1),\n activation_layer())\n self.out_channels = out_planes\n super(Conv3DBNActivation, self).__init__(dict_layers[0], dict_layers[1],\n dict_layers[2])\nclass ConvBlock3D(CausalModule):\n def __init__(\n self,\n in_planes: int,\n out_planes: int,\n kernel_size: Union[int, Tuple[int, int, int]],\n causal: bool,\n conv_type: str,\n padding: Union[int, Tuple[int, int, int]] = 0,\n stride: Union[int, Tuple[int, int, int]] = 1,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n bias_attr: bool = False,\n **kwargs: Any,\n ) -> None:\n super().__init__()\n kernel_size = _triple(kernel_size)\n stride = _triple(stride)\n padding = _triple(padding)\n self.conv_2 = None\n if causal is True:\n padding = (0, padding[1], padding[2])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:148-177" + }, + "6439": { + "file_id": 497, + "content": "The code defines a class named `ConvBlock3D` as a subclass of `CausalModule`. It takes inputs such as the number of input and output planes, kernel size, causality status, convolution type, padding, stride, normalization layer, activation layer, bias attribute, and optional keyword arguments. It initializes the class variables and creates an instance of the `Conv3DBNActivation` class.", + "type": "comment" + }, + "6440": { + "file_id": 497, + "content": " if conv_type != \"2plus1d\" and conv_type != \"3d\":\n raise ValueError(\"only 2plus2d or 3d are \" +\n \"allowed as 3d convolutions\")\n if conv_type == \"2plus1d\":\n self.conv_1 = Conv2dBNActivation(in_planes,\n out_planes,\n kernel_size=(kernel_size[1],\n kernel_size[2]),\n padding=(padding[1], padding[2]),\n stride=(stride[1], stride[2]),\n activation_layer=activation_layer,\n norm_layer=norm_layer,\n bias_attr=bias_attr,\n **kwargs)\n if kernel_size[0] > 1:\n self.conv_2 = Conv2dBNActivation(\n in_planes,\n out_planes,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:178-196" + }, + "6441": { + "file_id": 497, + "content": "This code is checking the convolution type and raising a ValueError if it's neither \"2plus1d\" nor \"3d\". If the type is \"2plus1d\", it initializes two Conv2dBNActivation layers with appropriate parameters.", + "type": "comment" + }, + "6442": { + "file_id": 497, + "content": " kernel_size=(kernel_size[0], 1),\n padding=(padding[0], 0),\n stride=(stride[0], 1),\n activation_layer=activation_layer,\n norm_layer=norm_layer,\n bias_attr=bias_attr,\n **kwargs)\n elif conv_type == \"3d\":\n self.conv_1 = Conv3DBNActivation(in_planes,\n out_planes,\n kernel_size=kernel_size,\n padding=padding,\n activation_layer=activation_layer,\n norm_layer=norm_layer,\n stride=stride,\n bias_attr=bias_attr,\n **kwargs)\n self.padding = padding\n self.kernel_size = kernel_size\n self.dim_pad = self.kernel_size[0] - 1", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:197-216" + }, + "6443": { + "file_id": 497, + "content": "The code defines a layer with different convolution types (\"2d\" or \"3d\") and initializes the corresponding Conv2D or Conv3D layers with specified parameters such as input/output planes, kernel size, padding, activation layer, norm layer, stride, bias attribute and other keyword arguments. 
It also stores the padding and kernel size for future use.", + "type": "comment" + }, + "6444": { + "file_id": 497, + "content": " self.stride = stride\n self.causal = causal\n self.conv_type = conv_type\n def _forward(self, x: paddle.Tensor) -> paddle.Tensor:\n if self.dim_pad > 0 and self.conv_2 is None and self.causal is True:\n x = self._cat_stream_buffer(x)\n b, c, t, h, w = x.shape\n if self.conv_type == \"2plus1d\":\n x = paddle.transpose(x, (0, 2, 1, 3, 4)) # bcthw --> btchw\n x = paddle.reshape_(x, (-1, c, h, w)) # btchw --> bt,c,h,w\n x = self.conv_1(x)\n if self.conv_type == \"2plus1d\":\n b, c, h, w = x.shape\n x = paddle.reshape_(x, (-1, t, c, h, w)) # bt,c,h,w --> b,t,c,h,w\n x = paddle.transpose(x, (0, 2, 1, 3, 4)) # b,t,c,h,w --> b,c,t,h,w\n if self.conv_2 is not None:\n if self.dim_pad > 0 and self.causal is True:\n x = self._cat_stream_buffer(x)\n b, c, t, h, w = x.shape\n x = paddle.reshape_(x, (b, c, t, h * w))\n x = self.conv_2(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:217-238" + }, + "6445": { + "file_id": 497, + "content": "This code defines a class with an attribute `_forward` method. The constructor takes stride, causal, and conv_type as parameters. If causal is True, stream buffer is concatenated to the input tensor. Depending on conv_type, the tensor shape may be reshaped for proper processing. Finally, if conv_2 is not None, it applies a convolution operation to the tensor.", + "type": "comment" + }, + "6446": { + "file_id": 497, + "content": " b, c, t, _ = x.shape\n x = paddle.reshape_(x, (b, c, t, h, w))\n return x\n def forward(self, x: paddle.Tensor) -> paddle.Tensor:\n x = self._forward(x)\n return x\n def _cat_stream_buffer(self, x: paddle.Tensor) -> paddle.Tensor:\n if self.activation is None:\n self._setup_activation(x.shape)\n x = paddle.concat((self.activation, x), 2)\n self._save_in_activation(x)\n return x\n def _save_in_activation(self, x: paddle.Tensor) -> None:\n assert self.dim_pad > 0\n self.activation = paddle.to_tensor(x.numpy()[:, :, -self.dim_pad:,\n ...]).clone().detach()\n def _setup_activation(self, input_shape: Tuple[float, ...]) -> None:\n assert self.dim_pad > 0\n self.activation = paddle.zeros(shape=[\n *input_shape[:2], # type: ignore\n self.dim_pad,\n *input_shape[3:]\n ])\nclass TemporalCGAvgPool3D(CausalModule):\n def __init__(self, ) -> None:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:239-269" + }, + "6447": { + "file_id": 497, + "content": "1. Reshapes input tensor to (b, c, t, h, w).\n2. Defines a forward function that applies the _forward function and returns the result.\n3. Concatenates the activation tensor with the input along dimension 2.\n4. Saves the last self.dim_pad rows of the input in the activation tensor.\n5. Sets up the activation tensor with zeros and self.dim_pad rows for future use.\n6. 
TemporalCGAvgPool3D is a CausalModule class.", + "type": "comment" + }, + "6448": { + "file_id": 497, + "content": " super().__init__()\n self.n_cumulated_values = 0\n self.register_forward_post_hook(self._detach_activation)\n def forward(self, x: paddle.Tensor) -> paddle.Tensor:\n input_shape = x.shape\n cumulative_sum = paddle.cumsum(x, axis=2)\n if self.activation is None:\n self.activation = cumulative_sum[:, :, -1:].clone()\n else:\n cumulative_sum += self.activation\n self.activation = cumulative_sum[:, :, -1:].clone()\n noe = paddle.arange(1, input_shape[2] + 1)\n axis = paddle.to_tensor([0, 1, 3, 4])\n noe = paddle.unsqueeze(noe, axis=axis)\n divisor = noe.expand(x.shape)\n x = cumulative_sum / (self.n_cumulated_values + divisor)\n self.n_cumulated_values += input_shape[2]\n return x\n @staticmethod\n def _detach_activation(module: CausalModule, inputs: paddle.Tensor,\n output: paddle.Tensor) -> None:\n module.activation.detach()\n def reset_activation(self) -> None:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:270-296" + }, + "6449": { + "file_id": 497, + "content": "The code defines a forward function for a CausalModule that performs cumulative sum operation on input tensor. It also includes methods to detach and reset the activation tensor.", + "type": "comment" + }, + "6450": { + "file_id": 497, + "content": " super().reset_activation()\n self.n_cumulated_values = 0\nclass SqueezeExcitation(nn.Layer):\n def __init__(self,\n input_channels: int,\n activation_2: nn.Layer,\n activation_1: nn.Layer,\n conv_type: str,\n causal: bool,\n squeeze_factor: int = 4,\n bias_attr: bool = True) -> None:\n super().__init__()\n self.causal = causal\n se_multiplier = 2 if causal else 1\n squeeze_channels = _make_divisible(\n input_channels // squeeze_factor * se_multiplier, 8)\n self.temporal_cumualtive_GAvg3D = TemporalCGAvgPool3D()\n self.fc1 = ConvBlock3D(input_channels * se_multiplier,\n squeeze_channels,\n kernel_size=(1, 1, 1),\n padding=0,\n causal=causal,\n conv_type=conv_type,\n bias_attr=bias_attr)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:297-322" + }, + "6451": { + "file_id": 497, + "content": "This code defines a SqueezeExcitation layer class with input channels, activation functions, convolution type, causality flag, and squeeze factor as parameters. 
It initializes the layer by setting the causal flag's multiplier, dividing the input channel count by the squeeze factor, rounding up to 8 using make_divisible function, and adding temporal cumulative average pooling and convolution blocks with specified parameters.", + "type": "comment" + }, + "6452": { + "file_id": 497, + "content": " self.activation_1 = activation_1()\n self.activation_2 = activation_2()\n self.fc2 = ConvBlock3D(squeeze_channels,\n input_channels,\n kernel_size=(1, 1, 1),\n padding=0,\n causal=causal,\n conv_type=conv_type,\n bias_attr=bias_attr)\n def _scale(self, inputs: paddle.Tensor) -> paddle.Tensor:\n if self.causal:\n x_space = paddle.mean(inputs, axis=[3, 4], keepdim=True)\n scale = self.temporal_cumualtive_GAvg3D(x_space)\n scale = paddle.concat((scale, x_space), axis=1)\n else:\n scale = F.adaptive_avg_pool3d(inputs, 1)\n scale = self.fc1(scale)\n scale = self.activation_1(scale)\n scale = self.fc2(scale)\n return self.activation_2(scale)\n def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:\n scale = self._scale(inputs)\n return scale * inputs", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:323-347" + }, + "6453": { + "file_id": 497, + "content": "The code defines a class with two activation functions, and a _scale method that scales the input tensor based on temporal average or average pooling. The forward method applies the scale to the input for spatial pyramid pooling.", + "type": "comment" + }, + "6454": { + "file_id": 497, + "content": "class BasicBneck(nn.Layer):\n def __init__(\n self,\n input_channels,\n out_channels,\n expanded_channels,\n kernel_size,\n stride,\n padding,\n padding_avg,\n causal: bool,\n conv_type: str,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n ) -> None:\n super().__init__()\n assert type(stride) is tuple\n if (not stride[0] == 1 or not (1 <= stride[1] <= 2)\n or not (1 <= stride[2] <= 2)):\n raise ValueError('illegal stride value')\n self.res = None\n layers = []\n if expanded_channels != out_channels:\n # expand\n self.expand = ConvBlock3D(in_planes=input_channels,\n out_planes=expanded_channels,\n kernel_size=(1, 1, 1),\n padding=(0, 0, 0),\n causal=causal,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:350-382" + }, + "6455": { + "file_id": 497, + "content": "This code defines the BasicBneck class which is a neural network layer. It has multiple parameters such as input_channels, out_channels, expanded_channels, kernel_size, stride, padding, padding_avg, causal, conv_type, norm_layer, and activation_layer. If expanded_channels is not equal to out_channels, it will first expand the channels using ConvBlock3D. 
The class also checks for illegal stride values to prevent unexpected behavior.", + "type": "comment" + }, + "6456": { + "file_id": 497, + "content": " conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # deepwise\n self.deep = ConvBlock3D(in_planes=expanded_channels,\n out_planes=expanded_channels,\n kernel_size=kernel_size,\n padding=padding,\n stride=stride,\n groups=expanded_channels,\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # SE\n self.se = SqueezeExcitation(\n expanded_channels,\n causal=causal,\n activation_1=activation_layer,\n activation_2=(nn.Sigmoid if conv_type == \"3d\" else nn.Hardsigmoid),\n conv_type=conv_type)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:383-404" + }, + "6457": { + "file_id": 497, + "content": "This code defines a ConvBlock3D for MoviNet backbone, includes deepwise convolution and SE (Squeeze Excitation) layers. These components process 3D feature maps with various configurations depending on the input planes, kernel size, stride, padding, etc., applying different activation functions based on the conv_type.", + "type": "comment" + }, + "6458": { + "file_id": 497, + "content": " # project\n self.project = ConvBlock3D(expanded_channels,\n out_channels,\n kernel_size=(1, 1, 1),\n padding=(0, 0, 0),\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=Identity)\n if not (stride == (1, 1, 1) and input_channels == out_channels):\n if stride != (1, 1, 1):\n layers.append(\n nn.AvgPool3D((1, 3, 3), stride=stride, padding=padding_avg))\n layers.append(\n ConvBlock3D(\n in_planes=input_channels,\n out_planes=out_channels,\n kernel_size=(1, 1, 1),\n padding=(0, 0, 0),\n norm_layer=norm_layer,\n activation_layer=Identity,\n causal=causal,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:405-427" + }, + "6459": { + "file_id": 497, + "content": "This code defines a ConvBlock3D for projecting the input channels to the desired output channels. If the stride is not (1, 1, 1) or input and output channels are different, it adds an average pooling layer and another ConvBlock3D with appropriate parameters. The causal parameter determines if causal convolution should be used, and Identity activation layer is applied without any transformation.", + "type": "comment" + }, + "6460": { + "file_id": 497, + "content": " conv_type=conv_type,\n ))\n self.res = nn.Sequential(*layers)\n self.alpha = self.create_parameter(shape=[1], dtype=\"float32\")\n def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:\n if self.res is not None:\n residual = self.res(inputs)\n else:\n residual = inputs\n if self.expand is not None:\n x = self.expand(inputs)\n else:\n x = inputs\n x = self.deep(x)\n x = self.se(x)\n x = self.project(x)\n result = residual + self.alpha * x\n return result\n@BACKBONES.register()\nclass MoViNet(nn.Layer):\n def __init__(\n self,\n model_type: str = 'A0',\n hidden_dim: int = 2048,\n causal: bool = True,\n num_classes: int = 400,\n conv_type: str = \"3d\",\n ) -> None:\n super().__init__()\n \"\"\"\n causal: causal mode\n num_classes: number of classes for classifcation\n conv_type: type of convolution either 3d or 2plus1d", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:428-464" + }, + "6461": { + "file_id": 497, + "content": "The code defines the MoViNet class, which is a backbone model for video analysis. 
It initializes layers based on input parameters and then performs feature extraction using the defined layers. The forward method applies residual connections and a scale factor to combine the extracted features with the input.", + "type": "comment" + }, + "6462": { + "file_id": 497, + "content": " \"\"\"\n blocks_dic = OrderedDict()\n cfg = MODEL_CONFIG[model_type]\n norm_layer = nn.BatchNorm3D if conv_type == \"3d\" else nn.BatchNorm2D\n activation_layer = nn.Swish if conv_type == \"3d\" else nn.Hardswish\n # conv1\n self.conv1 = ConvBlock3D(in_planes=cfg['conv1'][0],\n out_planes=cfg['conv1'][1],\n kernel_size=cfg['conv1'][2],\n stride=cfg['conv1'][3],\n padding=cfg['conv1'][4],\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # blocks\n for i in range(2, len(cfg['block_num']) + 1):\n for j in range(cfg['block_num'][i - 1]):\n blocks_dic[f'b{i}_l{j}'] = BasicBneck(\n cfg[f'b{i}_l{j}'][0],\n cfg[f'b{i}_l{j}'][1],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:465-487" + }, + "6463": { + "file_id": 497, + "content": "The code defines a MOViNet model, which consists of a ConvBlock3D (conv1) and multiple BasicBneck blocks. It takes in parameters such as the number of input and output planes, kernel size, stride, padding, causal flag, conv type, norm layer, and activation layer. These parameters are extracted from the MODEL_CONFIG dictionary based on the model type. The blocks are organized in an OrderedDict called blocks_dic for future reference.", + "type": "comment" + }, + "6464": { + "file_id": 497, + "content": " cfg[f'b{i}_l{j}'][2],\n cfg[f'b{i}_l{j}'][3],\n cfg[f'b{i}_l{j}'][4],\n cfg[f'b{i}_l{j}'][5],\n cfg[f'b{i}_l{j}'][6],\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n self.blocks = nn.Sequential(*(blocks_dic.values()))\n # conv7\n self.conv7 = ConvBlock3D(in_planes=cfg['conv7'][0],\n out_planes=cfg['conv7'][1],\n kernel_size=cfg['conv7'][2],\n stride=cfg['conv7'][3],\n padding=cfg['conv7'][4],\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # pool\n self.classifier = nn.Sequential(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:488-510" + }, + "6465": { + "file_id": 497, + "content": "This code is defining a MOViNet model with specific configurations for blocks, convolutional layers, and pooling operations. 
It initializes the blocks as sequential layers and adds an additional 3D ConvBlock layer ('conv7') followed by a classifier.", + "type": "comment" + }, + "6466": { + "file_id": 497, + "content": " # dense9\n ConvBlock3D(in_planes=cfg['conv7'][1],\n out_planes=hidden_dim,\n kernel_size=(1, 1, 1),\n causal=causal,\n conv_type=conv_type,\n bias_attr=True),\n nn.Swish(),\n nn.Dropout(p=0.2),\n # dense10d\n ConvBlock3D(in_planes=hidden_dim,\n out_planes=num_classes,\n kernel_size=(1, 1, 1),\n causal=causal,\n conv_type=conv_type,\n bias_attr=True),\n )\n if causal:\n self.cgap = TemporalCGAvgPool3D()\n self.apply(self._weight_init)\n self.causal = causal\n def avg(self, x: paddle.Tensor) -> paddle.Tensor:\n if self.causal:\n avg = F.adaptive_avg_pool3d(x, (x.shape[2], 1, 1))\n avg = self.cgap(avg)[:, :, -1:]\n else:\n avg = F.adaptive_avg_pool3d(x, 1)\n return avg", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:511-539" + }, + "6467": { + "file_id": 497, + "content": "This code defines a 3D Convolutional Neural Network (CNN) backbone for MoviNet. It includes dense layers, convolution blocks, and optional temporal pooling. The model architecture can be causal or non-causal depending on the `causal` parameter.", + "type": "comment" + }, + "6468": { + "file_id": 497, + "content": " @staticmethod\n def _weight_init(m):\n if isinstance(m, nn.Conv3D):\n nn.initializer.KaimingNormal(m.weight)\n if m.bias is not None:\n nn.initializer.Constant(0.0)(m.bias)\n elif isinstance(m, (nn.BatchNorm3D, nn.BatchNorm2D, nn.GroupNorm)):\n nn.initializer.Constant(1.0)(m.weight)\n nn.initializer.Constant(0.0)(m.bias)\n elif isinstance(m, nn.Linear):\n nn.initializer.Normal(m.weight, 0, 0.01)\n nn.initializer.Constant(0.0)(m.bias)\n def forward(self, x: paddle.Tensor) -> paddle.Tensor:\n x = self.conv1(x)\n x = self.blocks(x)\n x = self.conv7(x)\n x = self.avg(x)\n x = self.classifier(x)\n x = x.flatten(1)\n return x\n @staticmethod\n def _clean_activation_buffers(m):\n if issubclass(type(m), CausalModule):\n m.reset_activation()\n def clean_activation_buffers(self) -> None:\n self.apply(self._clean_activation_buffers)\nif __name__ == '__main__':", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:541-572" + }, + "6469": { + "file_id": 497, + "content": "The code defines a class for a MoviNet backbone, which performs convolutions and has block layers. The forward function passes the input through these layers and then flattens the result before returning it. A static method initializes the network weights based on the layer type. Another static method cleans activation buffers in CausalModule subclasses.", + "type": "comment" + }, + "6470": { + "file_id": 497, + "content": " net = MoViNet(causal=False, conv_type='3d')\n paddle.summary(net, input_size=(1, 3, 8, 224, 224))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/movinet.py:573-574" + }, + "6471": { + "file_id": 497, + "content": "Creating a MoViNet network instance with causal set to False and 3D convolution type, then generating summary using Paddle's summary function with input size (1, 3, 8, 224, 224).", + "type": "comment" + }, + "6472": { + "file_id": 498, + "content": "/paddlevideo/modeling/backbones/ms_tcn.py", + "type": "filepath" + }, + "6473": { + "file_id": 498, + "content": "The code imports modules, defines Kaiming uniform initialization and SingleStageModel class. 
It initializes MSTCN backbone with DilatedResidualLayer stages and applies softmax to previous outputs, concatenating them together while initializing weights for convolutional layers with KaimingUniform_like_torch.", + "type": "summary" + }, + "6474": { + "file_id": 498, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nimport copy\nimport random\nimport math\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = len(tensor.shape)\n if dimensions < 2:\n raise ValueError(\"Fan in and fan out can not be computed \\\n for tensor with fewer than 2 dimensions\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ms_tcn.py:1-32" + }, + "6475": { + "file_id": 498, + "content": "This code snippet appears to be part of a larger file and sets up some initial definitions, imports, and checks for necessary conditions. It includes license information, imports various modules, and defines a function to calculate fan-in and fan-out for tensor dimensions.", + "type": "comment" + }, + "6476": { + "file_id": 498, + "content": " if dimensions == 2: # Linear\n fan_in = tensor.shape[1]\n fan_out = tensor.shape[0]\n else:\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:\n receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef calculate_gain(nonlinearity=None, a=None):\n if nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if a != None:\n return math.sqrt(2.0 / (1 + a**2))\n else:\n return math.sqrt(2.0 / (1 + 0.01**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4\n else:\n return 1\ndef KaimingUniform_like_torch(weight_npy,\n mode='fan_in',\n nonlinearity='leaky_relu'):\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ms_tcn.py:34-68" + }, + "6477": { + "file_id": 498, + "content": "This code defines three functions: `_calculate_fan_in_and_fan_out`, `calculate_gain`, and `KaimingUniform_like_torch`. The first function calculates the fan-in and fan-out values based on the input tensor's dimensions. The second function determines the gain value depending on the nonlinearity used. 
The third function applies the Kaiming uniform initialization to the weight_npy parameter, utilizing the previous two functions.", + "type": "comment" + }, + "6478": { + "file_id": 498, + "content": " if mode == 'fan_in':\n fan_mode = fan_in\n else:\n fan_mode = fan_out\n a = math.sqrt(5.0)\n gain = calculate_gain(nonlinearity=nonlinearity, a=a)\n std = gain / math.sqrt(fan_mode)\n bound = math.sqrt(3.0) * std\n return np.random.uniform(-bound, bound, weight_npy.shape)\ndef init_bias(weight_npy, bias_npy):\n # attention this weight is not bias\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)\n bound = 1.0 / math.sqrt(fan_in)\n return np.random.uniform(-bound, bound, bias_npy.shape)\nclass SingleStageModel(nn.Layer):\n def __init__(self, num_layers, num_f_maps, dim, num_classes):\n super(SingleStageModel, self).__init__()\n self.conv_in = nn.Conv1D(dim, num_f_maps, 1)\n self.layers = nn.LayerList([\n copy.deepcopy(DilatedResidualLayer(2**i, num_f_maps, num_f_maps))\n for i in range(num_layers)\n ])\n self.conv_out = nn.Conv1D(num_f_maps, num_classes, 1)\n def forward(self, x):\n out = self.conv_in(x)\n for layer in self.layers:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ms_tcn.py:69-100" + }, + "6479": { + "file_id": 498, + "content": "This code defines a SingleStageModel class that inherits from nn.Layer and consists of a convolutional layer, multiple DilatedResidualLayers, and another convolutional layer. The model is initialized with specified parameters: number of layers, number of feature maps, input dimension, and number of output classes.", + "type": "comment" + }, + "6480": { + "file_id": 498, + "content": " out = layer(out)\n out = self.conv_out(out)\n return out\nclass DilatedResidualLayer(nn.Layer):\n def __init__(self, dilation, in_channels, out_channels):\n super(DilatedResidualLayer, self).__init__()\n self.conv_dilated = nn.Conv1D(in_channels,\n out_channels,\n 3,\n padding=dilation,\n dilation=dilation)\n self.conv_in = nn.Conv1D(out_channels, out_channels, 1)\n self.dropout = nn.Dropout()\n def forward(self, x):\n out = F.relu(self.conv_dilated(x))\n out = self.conv_in(out)\n out = self.dropout(out)\n return (x + out)\n@BACKBONES.register()\nclass MSTCN(nn.Layer):\n def __init__(self, num_stages, num_layers, num_f_maps, dim, num_classes):\n super().__init__()\n self.stage1 = SingleStageModel(num_layers, num_f_maps, dim, num_classes)\n self.stages = nn.LayerList([\n copy.deepcopy(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ms_tcn.py:101-132" + }, + "6481": { + "file_id": 498, + "content": "The code defines a DilatedResidualLayer, which is a type of residual layer used in the MSTCN backbone. The MSTCN class initializes a SingleStageModel and a list of stages using the provided parameters. 
Each stage within the model is an instance of the DilatedResidualLayer.", + "type": "comment" + }, + "6482": { + "file_id": 498, + "content": " SingleStageModel(num_layers, num_f_maps, num_classes,\n num_classes)) for s in range(num_stages - 1)\n ])\n def forward(self, x):\n \"\"\" MSTCN forward\n \"\"\"\n out = self.stage1(x)\n outputs = out.unsqueeze(0)\n for s in self.stages:\n out = s(F.softmax(out, axis=1))\n outputs = paddle.concat((outputs, out.unsqueeze(0)), axis=0)\n return outputs\n def init_weights(self):\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv1D):\n layer.weight.set_value(\n KaimingUniform_like_torch(layer.weight).astype('float32'))\n if layer.bias is not None:\n layer.bias.set_value(\n init_bias(layer.weight, layer.bias).astype('float32'))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/ms_tcn.py:133-154" + }, + "6483": { + "file_id": 498, + "content": "The code defines a forward function for MSTCN model and initializes the weights for convolutional layers. It iterates over stages, applying softmax to previous output and concatenating it to previous outputs. Weights are initialized with KaimingUniform_like_torch for conv1D layers and bias is set according to the layer's weight.", + "type": "comment" + }, + "6484": { + "file_id": 499, + "content": "/paddlevideo/modeling/backbones/pptsm_mv2.py", + "type": "filepath" + }, + "6485": { + "file_id": 499, + "content": "The ConvBNLayer class introduces PaddlePaddle's MobileNetV2 backbone model for image/video processing with pretrained weights and inverted residual units. It initializes and returns three models (PPTSM_MobileNetV2_x0_75, PPTSM_MobileNetV2_x1_5, PPTSM_MobileNetV2_x2_0).", + "type": "summary" + }, + "6486": { + "file_id": 499, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import Conv2D, BatchNorm, Linear, Dropout\nfrom paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\n# Download URL of pretrained model\n# {\n# \"MobileNetV2\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_ssld_pretrained.pdparams\",", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:1-30" + }, + "6487": { + "file_id": 499, + "content": "This code is part of the PaddlePaddle deep learning framework, specifically for the MobileNetV2 backbone model. It imports necessary libraries and defines functions for the architecture, weight initialization, and pre-trained model downloading. 
The commented sections provide licensing information and download URLs for pretrained models.", + "type": "comment" + }, + "6488": { + "file_id": 499, + "content": "# \"MobileNetV2_x0_25\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams\",\n# \"MobileNetV2_x0_5\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams\",\n# \"MobileNetV2_x0_75\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams\",\n# \"MobileNetV2_x1_5\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams\",\n# \"MobileNetV2_x2_0\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams\"\n# }\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n num_channels,\n filter_size,\n num_filters,\n stride,\n padding,\n channels=None,\n num_groups=1,\n name=None,\n use_cudnn=True):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=num_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:32-58" + }, + "6489": { + "file_id": 499, + "content": "This code defines the ConvBNLayer class, which inherits from nn.Layer and contains a convolutional layer followed by a batch normalization layer. The constructor takes several parameters such as number of channels, filter size, etc., to define the specifics of the convolutional layer. The URLs provided indicate that pretrained models are available for MobileNetV2 with various scaling factors.", + "type": "comment" + }, + "6490": { + "file_id": 499, + "content": " out_channels=num_filters,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n groups=num_groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n self._batch_norm = BatchNorm(\n num_filters,\n param_attr=ParamAttr(name=name + \"_bn_scale\"),\n bias_attr=ParamAttr(name=name + \"_bn_offset\"),\n moving_mean_name=name + \"_bn_mean\",\n moving_variance_name=name + \"_bn_variance\")\n def forward(self, inputs, if_act=True):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if if_act:\n y = F.relu6(y)\n return y\nclass InvertedResidualUnit(nn.Layer):\n def __init__(self, num_channels, num_in_filter, num_filters, stride,\n filter_size, padding, expansion_factor, name, num_seg):\n super(InvertedResidualUnit, self).__init__()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:59-85" + }, + "6491": { + "file_id": 499, + "content": "The code defines a class for an inverted residual unit with batch normalization. 
The unit takes input, performs convolution, applies batch normalization, and optionally applies activation if specified.", + "type": "comment" + }, + "6492": { + "file_id": 499, + "content": " self.num_seg = num_seg\n num_expfilter = int(round(num_in_filter * expansion_factor))\n self._expand_conv = ConvBNLayer(num_channels=num_channels,\n num_filters=num_expfilter,\n filter_size=1,\n stride=1,\n padding=0,\n num_groups=1,\n name=name + \"_expand\")\n self._bottleneck_conv = ConvBNLayer(num_channels=num_expfilter,\n num_filters=num_expfilter,\n filter_size=filter_size,\n stride=stride,\n padding=padding,\n num_groups=num_expfilter,\n use_cudnn=False,\n name=name + \"_dwise\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:86-103" + }, + "6493": { + "file_id": 499, + "content": "This code initializes and assigns class attributes for a backbone model. It defines two convolutional layers, one for expansion (num_channels to num_expfilter) and another for bottleneck (num_expfilter to num_expfilter), both followed by BN operations. The layers are named with the prefix \"name\" for future reference or identification.", + "type": "comment" + }, + "6494": { + "file_id": 499, + "content": " self._linear_conv = ConvBNLayer(num_channels=num_expfilter,\n num_filters=num_filters,\n filter_size=1,\n stride=1,\n padding=0,\n num_groups=1,\n name=name + \"_linear\")\n def forward(self, inputs, ifshortcut):\n # add temporal shift module\n y = inputs\n if ifshortcut:\n y = F.temporal_shift(y, self.num_seg, 1.0 / self.num_seg)\n y = self._expand_conv(y, if_act=True)\n y = self._bottleneck_conv(y, if_act=True)\n y = self._linear_conv(y, if_act=False)\n if ifshortcut:\n y = paddle.add(inputs, y)\n return y\nclass InvresiBlocks(nn.Layer):\n def __init__(self, in_c, t, c, n, s, name, num_seg):\n super(InvresiBlocks, self).__init__()\n self._first_block = InvertedResidualUnit(num_channels=in_c,\n num_in_filter=in_c,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:105-132" + }, + "6495": { + "file_id": 499, + "content": "This code defines a neural network layer, likely for image or video processing. It contains a series of convolutional layers and activation functions. The \"forward\" function applies temporal shift to the input based on the number of segments and performs convolutions in different stages. The \"InvresiBlocks\" class defines an Inverted Residual block with initial parameters.", + "type": "comment" + }, + "6496": { + "file_id": 499, + "content": " num_filters=c,\n stride=s,\n filter_size=3,\n padding=1,\n expansion_factor=t,\n name=name + \"_1\",\n num_seg=num_seg)\n self._block_list = []\n for i in range(1, n):\n block = self.add_sublayer(name + \"_\" + str(i + 1),\n sublayer=InvertedResidualUnit(\n num_channels=c,\n num_in_filter=c,\n num_filters=c,\n stride=1,\n filter_size=3,\n padding=1,\n expansion_factor=t,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:133-151" + }, + "6497": { + "file_id": 499, + "content": "The code defines a function for the PPTSM_MV2 model, creating an InvertedResidualUnit with specified parameters and adding it to a list. 
The loop iterates from 1 to n-1, building multiple residual units with increasing indexes.", + "type": "comment" + }, + "6498": { + "file_id": 499, + "content": " name=name + \"_\" + str(i + 1),\n num_seg=num_seg))\n self._block_list.append(block)\n def forward(self, inputs):\n y = self._first_block(inputs, ifshortcut=False)\n for block in self._block_list:\n y = block(y, ifshortcut=True)\n return y\nclass MobileNet(nn.Layer):\n def __init__(self,\n class_num=400,\n scale=1.0,\n pretrained=None,\n prefix_name=\"\",\n num_seg=8):\n super(MobileNet, self).__init__()\n self.scale = scale\n self.class_num = class_num\n self.pretrained = pretrained\n self.num_seg = num_seg\n bottleneck_params_list = [\n (1, 16, 1, 1),\n (6, 24, 2, 2),\n (6, 32, 3, 2),\n (6, 64, 4, 2),\n (6, 96, 3, 1),\n (6, 160, 3, 2),\n (6, 320, 1, 1),\n ]\n self.conv1 = ConvBNLayer(num_channels=3,\n num_filters=int(32 * scale),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:152-187" + }, + "6499": { + "file_id": 499, + "content": "This code defines a PPTSM-MV2 backbone and MobileNet model for image processing. The `__init__` function initializes the model with class number, scaling factor, pretrained weights, prefix name, and number of segments. The `forward` function passes inputs through each block in sequence. The `MobileNet` class defines a convolutional neural network (CNN) architecture with specific parameters for each stage, including the number of filters and stride sizes.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/65.json b/docs/data/65.json new file mode 100644 index 000000000..56458d739 --- /dev/null +++ b/docs/data/65.json @@ -0,0 +1,548 @@ +{ + "6500": { + "file_id": 499, + "content": " filter_size=3,\n stride=2,\n padding=1,\n name=prefix_name + \"conv1_1\")\n self.block_list = []\n i = 1\n in_c = int(32 * scale)\n for layer_setting in bottleneck_params_list:\n t, c, n, s = layer_setting\n i += 1\n block = self.add_sublayer(prefix_name + \"conv\" + str(i),\n sublayer=InvresiBlocks(in_c=in_c,\n t=t,\n c=int(c * scale),\n n=n,\n s=s,\n name=prefix_name +\n \"conv\" + str(i),\n num_seg=num_seg))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:188-207" + }, + "6501": { + "file_id": 499, + "content": "This code initializes a PPTSM_MV2 backbone model. It adds a convolution layer with specific parameters, and creates a list of block layers using InvresiBlocks with varying settings. 
The scale value affects the number of input channels in each layer.", + "type": "comment" + }, + "6502": { + "file_id": 499, + "content": " self.block_list.append(block)\n in_c = int(c * scale)\n self.out_c = int(1280 * scale) if scale > 1.0 else 1280\n self.conv9 = ConvBNLayer(num_channels=in_c,\n num_filters=self.out_c,\n filter_size=1,\n stride=1,\n padding=0,\n name=prefix_name + \"conv9\")\n self.pool2d_avg = AdaptiveAvgPool2D(1)\n self.out = Linear(self.out_c,\n class_num,\n weight_attr=ParamAttr(name=prefix_name +\n \"fc10_weights\"),\n bias_attr=ParamAttr(name=prefix_name + \"fc10_offset\"))\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:208-232" + }, + "6503": { + "file_id": 499, + "content": "This code defines a class, appends blocks to block_list, sets output channels based on scale factor, initializes convolution and pooling layers, and defines an initialization function for the weights. It seems to be part of a deep learning model backbone implementation.", + "type": "comment" + }, + "6504": { + "file_id": 499, + "content": " for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n y = self.conv1(inputs, if_act=True)\n for block in self.block_list:\n y = block(y)\n y = self.conv9(y, if_act=True)\n y = self.pool2d_avg(y)\n y = paddle.reshape(y, [-1, self.num_seg, y.shape[1]])\n y = paddle.mean(y, axis=1)\n y = paddle.reshape(y, shape=[-1, self.out_c])\n y = self.out(y)\n return y\n@BACKBONES.register()\ndef PPTSM_MobileNetV2(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=1.0, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x0_25(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=0.25, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x0_5(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=0.5, **kwargs)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:233-266" + }, + "6505": { + "file_id": 499, + "content": "Initializes a PPTSM MobileNetV2 model with optional pretrained weights and customizable scale. Initializes the underlying MobileNet model and applies custom modifications. Iterates through sublayers, applying Kaiming Normal initialization for Conv2D layers and constant initialization for BatchNorm2D layers. Defines the forward pass of the PPTSM MobileNetV2 model. 
Registers multiple PPTSM MobileNetV2 variants with different scales.", + "type": "comment" + }, + "6506": { + "file_id": 499, + "content": " return model\ndef PPTSM_MobileNetV2_x0_75(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=0.75, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x1_5(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=1.5, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x2_0(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=2.0, **kwargs)\n return model", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv2.py:267-282" + }, + "6507": { + "file_id": 499, + "content": "The code defines three functions, PPTSM_MobileNetV2_x0_75, PPTSM_MobileNetV2_x1_5, and PPTSM_MobileNetV2_x2_0. Each function creates a MobileNet model with different scales (0.75, 1.5, and 2.0) using the MobileNet class. The pretrained option allows for loading pre-trained weights if set to True. The functions return the created models.", + "type": "comment" + }, + "6508": { + "file_id": 500, + "content": "/paddlevideo/modeling/backbones/pptsm_mv3.py", + "type": "filepath" + }, + "6509": { + "file_id": 500, + "content": "The code introduces PPTSM-Mv3 backbone networks and MobileNetV3 models in PaddleVideo using PyTorch, with diverse parameters, weight initialization, pretrained model URLs, network configuration dictionaries. It also constructs CNN layers with Batch Normalization and builds the PPTSM-MV3 backbone model using temporal shifting, convolutions, SE modules, and implements Hardsigmoid function separately.", + "type": "summary" + }, + "6510": { + "file_id": 500, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# reference: https://arxiv.org/abs/1905.02244\nfrom __future__ import absolute_import, division, print_function\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear\nfrom paddle.regularizer import L2Decay\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:1-28" + }, + "6511": { + "file_id": 500, + "content": "Copyright notice, license information, and reference to the associated research paper. The code imports necessary libraries and registers the backbone model within the PaddleVideo module registry. 
It also includes a function for weight initialization.", + "type": "comment" + }, + "6512": { + "file_id": 500, + "content": "# Download URL of pretrained model\n# MODEL_URLS = {\n# \"MobileNetV3_small_x1_0\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_ssld_pretrained.pdparams\",\n# \"MobileNetV3_large_x1_0\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_ssld_pretrained.pdparams\",\n# }\nMODEL_STAGES_PATTERN = {\n \"MobileNetV3_small\": [\"blocks[0]\", \"blocks[2]\", \"blocks[7]\", \"blocks[10]\"],\n \"MobileNetV3_large\":\n [\"blocks[0]\", \"blocks[2]\", \"blocks[5]\", \"blocks[11]\", \"blocks[14]\"]\n}\n# \"large\", \"small\" is just for MobinetV3_large, MobileNetV3_small respectively.\n# The type of \"large\" or \"small\" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s.\n# k: kernel_size\n# exp: middle channel number in depthwise block\n# c: output channel number in depthwise block\n# se: whether to use SE block\n# act: which activation to use\n# s: stride in depthwise block\nNET_CONFIG = {", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:30-52" + }, + "6513": { + "file_id": 500, + "content": "The code defines pretrained model URLs for MobileNetV3_small_x1_0 and MobileNetV3_large_x1_0, as well as lists of stages for each model. The MODEL_STAGES_PATTERN contains different depthwise blocks' parameters such as kernel size, channel numbers, activation function, and stride. NET_CONFIG is a dictionary containing configurations for specific network architectures with different parameters.", + "type": "comment" + }, + "6514": { + "file_id": 500, + "content": " \"large\": [\n # k, exp, c, se, act, s\n [3, 16, 16, False, \"relu\", 1],\n [3, 64, 24, False, \"relu\", 2],\n [3, 72, 24, False, \"relu\", 1],\n [5, 72, 40, True, \"relu\", 2],\n [5, 120, 40, True, \"relu\", 1],\n [5, 120, 40, True, \"relu\", 1],\n [3, 240, 80, False, \"hardswish\", 2],\n [3, 200, 80, False, \"hardswish\", 1],\n [3, 184, 80, False, \"hardswish\", 1],\n [3, 184, 80, False, \"hardswish\", 1],\n [3, 480, 112, True, \"hardswish\", 1],\n [3, 672, 112, True, \"hardswish\", 1],\n [5, 672, 160, True, \"hardswish\", 2],\n [5, 960, 160, True, \"hardswish\", 1],\n [5, 960, 160, True, \"hardswish\", 1],\n ],\n \"small\": [\n # k, exp, c, se, act, s\n [3, 16, 16, True, \"relu\", 2],\n [3, 72, 24, False, \"relu\", 2],\n [3, 88, 24, False, \"relu\", 1],\n [5, 96, 40, True, \"hardswish\", 2],\n [5, 240, 40, True, \"hardswish\", 1],\n [5, 240, 40, True, \"hardswish\", 1],\n [5, 120, 48, True, \"hardswish\", 1],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:53-79" + }, + "6515": { + "file_id": 500, + "content": "This code defines two versions of the PPTSM-Mv3 backbone network architecture for the PaddleVideo library: \"large\" and \"small\". The backbone is a series of convolutional layers, with different configurations specified by parameters k (kernel size), exp (expansion factor), c (number of channels), se (if using squeeze-and-excitation), act (activation function), and s (strides). The large version has more layers and higher capacities for learning, while the small version is optimized for inference speed. 
Each layer's configuration is defined in a list of lists.", + "type": "comment" + }, + "6516": { + "file_id": 500, + "content": " [5, 144, 48, True, \"hardswish\", 1],\n [5, 288, 96, True, \"hardswish\", 2],\n [5, 576, 96, True, \"hardswish\", 1],\n [5, 576, 96, True, \"hardswish\", 1],\n ]\n}\n# first conv output channel number in MobileNetV3\nSTEM_CONV_NUMBER = 16\n# last second conv output channel for \"small\"\nLAST_SECOND_CONV_SMALL = 576\n# last second conv output channel for \"large\"\nLAST_SECOND_CONV_LARGE = 960\n# last conv output channel number for \"large\" and \"small\"\nLAST_CONV = 1280\ndef _make_divisible(v, divisor=8, min_value=None):\n if min_value is None:\n min_value = divisor\n new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n if new_v < 0.9 * v:\n new_v += divisor\n return new_v\ndef _create_act(act):\n if act == \"hardswish\":\n return nn.Hardswish()\n elif act == \"relu\":\n return nn.ReLU()\n elif act is None:\n return None\n else:\n raise RuntimeError(\n \"The activation function is not supported: {}\".format(act))\nclass MobileNetV3(nn.Layer):\n \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:80-118" + }, + "6517": { + "file_id": 500, + "content": "This code defines the MobileNetV3 model with various parameters such as channel numbers, activation functions, and division rules for each layer. The class \"MobileNetV3\" is a custom PyTorch Layer that represents the network architecture, utilizing convolutional layers and activation functions like Hardswish or ReLU. The function \"_make_divisible\" ensures proper alignment of channel numbers with hardware considerations, while \"_create_act\" creates instances of the specified activation functions.", + "type": "comment" + }, + "6518": { + "file_id": 500, + "content": " MobileNetV3\n Args:\n config: list. MobileNetV3 depthwise blocks config.\n scale: float=1.0. The coefficient that controls the size of network parameters.\n class_num: int=1000. The number of classes.\n inplanes: int=16. The output channel number of first convolution layer.\n class_squeeze: int=960. The output channel number of penultimate convolution layer.\n class_expand: int=1280. The output channel number of last convolution layer.\n dropout_prob: float=0.2. Probability of setting units to zero.\n Returns:\n model: nn.Layer. Specific MobileNetV3 model depends on args.\n \"\"\"\n def __init__(self,\n config,\n stages_pattern,\n scale=1.0,\n class_num=400,\n inplanes=STEM_CONV_NUMBER,\n class_squeeze=LAST_SECOND_CONV_LARGE,\n class_expand=LAST_CONV,\n dropout_prob=0.2,\n num_seg=8,\n pretrained=None,\n return_patterns=None,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:119-142" + }, + "6519": { + "file_id": 500, + "content": "The function defines a MobileNetV3 model with configurable parameters like depthwise blocks, scale, class number, inplanes, class_squeeze, class_expand, dropout probability, and number of segments. 
It takes these parameters as inputs and returns the specific MobileNetV3 model based on the arguments provided.", + "type": "comment" + }, + "6520": { + "file_id": 500, + "content": " return_stages=None):\n super().__init__()\n self.cfg = config\n self.scale = scale\n self.inplanes = inplanes\n self.class_squeeze = class_squeeze\n self.class_expand = class_expand\n self.class_num = class_num\n self.num_seg = num_seg\n self.pretrained = pretrained\n self.conv = ConvBNLayer(in_c=3,\n out_c=_make_divisible(self.inplanes *\n self.scale),\n filter_size=3,\n stride=2,\n padding=1,\n num_groups=1,\n if_act=True,\n act=\"hardswish\")\n self.blocks = nn.Sequential(*[\n ResidualUnit(in_c=_make_divisible(self.inplanes * self.scale if i ==\n 0 else self.cfg[i - 1][2] *\n self.scale),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:143-168" + }, + "6521": { + "file_id": 500, + "content": "This code defines a PPTSM-MV3 backbone model with specified configurations, including input planes, scale factor, class parameters, and number of segments. It uses convolutional layers and residual units for feature extraction and processing.", + "type": "comment" + }, + "6522": { + "file_id": 500, + "content": " mid_c=_make_divisible(self.scale * exp),\n out_c=_make_divisible(self.scale * c),\n filter_size=k,\n stride=s,\n use_se=se,\n num_seg=self.num_seg,\n act=act)\n for i, (k, exp, c, se, act, s) in enumerate(self.cfg)\n ])\n self.last_second_conv = ConvBNLayer(\n in_c=_make_divisible(self.cfg[-1][2] * self.scale),\n out_c=_make_divisible(self.scale * self.class_squeeze),\n filter_size=1,\n stride=1,\n padding=0,\n num_groups=1,\n if_act=True,\n act=\"hardswish\")\n self.avg_pool = AdaptiveAvgPool2D(1)\n self.last_conv = Conv2D(in_channels=_make_divisible(self.scale *\n self.class_squeeze),\n out_channels=self.class_expand,\n kernel_size=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:169-194" + }, + "6523": { + "file_id": 500, + "content": "The code initializes a PPTSM-MV3 model, which consists of several convolutional blocks and a final classification layer. The convolutional blocks are defined by the `self.cfg` list, where each element contains the kernel size, expansion factor, output channels, whether to use SE module, and activation function, along with the stride. The last convolutional block is followed by an average pooling layer and a final convolution layer for classification.", + "type": "comment" + }, + "6524": { + "file_id": 500, + "content": " stride=1,\n padding=0,\n bias_attr=False)\n self.hardswish = nn.Hardswish()\n if dropout_prob is not None:\n self.dropout = Dropout(p=dropout_prob, mode=\"downscale_in_infer\")\n else:\n self.dropout = None\n self.fc = Linear(self.class_expand, class_num)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, x):\n x = self.conv(x)\n x = self.blocks(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:195-222" + }, + "6525": { + "file_id": 500, + "content": "This code defines a neural network model for the PPTSM_MV3 backbone. 
It includes convolutional layers, blocks, a Hardswish activation function, optional dropout, and fully connected layers for classification. The `init_weights` method initializes the network's weights, and the `forward` method passes input through the model layers to generate output.", + "type": "comment" + }, + "6526": { + "file_id": 500, + "content": " x = self.last_second_conv(x)\n x = self.avg_pool(x)\n x = self.last_conv(x)\n x = self.hardswish(x)\n if self.dropout is not None:\n x = self.dropout(x)\n # feature aggregation for video\n x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]])\n x = paddle.mean(x, axis=1)\n x = paddle.reshape(x, shape=[-1, self.class_expand])\n x = self.fc(x)\n return x\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_c,\n out_c,\n filter_size,\n stride,\n padding,\n num_groups=1,\n if_act=True,\n act=None):\n super().__init__()\n self.conv = Conv2D(in_channels=in_c,\n out_channels=out_c,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n groups=num_groups,\n bias_attr=False)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:223-258" + }, + "6527": { + "file_id": 500, + "content": "This code defines a ConvBNLayer class that takes input and output channels, filter size, stride, padding, number of groups, activation function flag, and activation type as parameters. It initializes the layers for convolutional neural network and applies Batch Normalization and activation functions if specified. The class also returns the last layer of the model after feature aggregation for video classification.", + "type": "comment" + }, + "6528": { + "file_id": 500, + "content": " self.bn = BatchNorm(num_channels=out_c,\n act=None,\n param_attr=ParamAttr(regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.if_act = if_act\n self.act = _create_act(act)\n def forward(self, x):\n x = self.conv(x)\n x = self.bn(x)\n if self.if_act:\n x = self.act(x)\n return x\nclass ResidualUnit(nn.Layer):\n def __init__(self,\n in_c,\n mid_c,\n out_c,\n filter_size,\n stride,\n use_se,\n num_seg=8,\n act=None):\n super().__init__()\n self.if_shortcut = stride == 1 and in_c == out_c\n self.if_se = use_se\n self.num_seg = num_seg\n self.expand_conv = ConvBNLayer(in_c=in_c,\n out_c=mid_c,\n filter_size=1,\n stride=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:259-292" + }, + "6529": { + "file_id": 500, + "content": "The code defines a ResidualUnit class with an expand_conv layer containing ConvBNLayer, used for building the residual unit in PPTSM-MV3 model. 
It also includes optional BatchNorm (bn) and activation (act) layers based on provided parameters.", + "type": "comment" + }, + "6530": { + "file_id": 500, + "content": " padding=0,\n if_act=True,\n act=act)\n self.bottleneck_conv = ConvBNLayer(in_c=mid_c,\n out_c=mid_c,\n filter_size=filter_size,\n stride=stride,\n padding=int((filter_size - 1) // 2),\n num_groups=mid_c,\n if_act=True,\n act=act)\n if self.if_se:\n self.mid_se = SEModule(mid_c)\n self.linear_conv = ConvBNLayer(in_c=mid_c,\n out_c=out_c,\n filter_size=1,\n stride=1,\n padding=0,\n if_act=False,\n act=None)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:293-312" + }, + "6531": { + "file_id": 500, + "content": "Defines a PPTSM_MV3 block with ConvBNLayer, bottleneck convolution layer, optional SEModule for spatial attention, and a linear convolution layer.", + "type": "comment" + }, + "6532": { + "file_id": 500, + "content": " def forward(self, x):\n identity = x\n if self.if_shortcut:\n x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg)\n x = self.expand_conv(x)\n x = self.bottleneck_conv(x)\n if self.if_se:\n x = self.mid_se(x)\n x = self.linear_conv(x)\n if self.if_shortcut:\n x = paddle.add(identity, x)\n return x\n# nn.Hardsigmoid can't transfer \"slope\" and \"offset\" in nn.functional.hardsigmoid\nclass Hardsigmoid(nn.Layer):\n def __init__(self, slope=0.2, offset=0.5):\n super().__init__()\n self.slope = slope\n self.offset = offset\n def forward(self, x):\n return nn.functional.hardsigmoid(x,\n slope=self.slope,\n offset=self.offset)\nclass SEModule(nn.Layer):\n def __init__(self, channel, reduction=4):\n super().__init__()\n self.avg_pool = AdaptiveAvgPool2D(1)\n self.conv1 = Conv2D(in_channels=channel,\n out_channels=channel // reduction,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:314-348" + }, + "6533": { + "file_id": 500, + "content": "This code defines a PPTSM-MV3 backbone model for video analysis. It uses temporal shifting, convolutions, and SE module (if specified) in its forward pass. The Hardsigmoid function is implemented as a separate class to apply hard sigmoid activation with customizable slope and offset parameters.", + "type": "comment" + }, + "6534": { + "file_id": 500, + "content": " kernel_size=1,\n stride=1,\n padding=0)\n self.relu = nn.ReLU()\n self.conv2 = Conv2D(in_channels=channel // reduction,\n out_channels=channel,\n kernel_size=1,\n stride=1,\n padding=0)\n self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5)\n def forward(self, x):\n identity = x\n x = self.avg_pool(x)\n x = self.conv1(x)\n x = self.relu(x)\n x = self.conv2(x)\n x = self.hardsigmoid(x)\n return paddle.multiply(x=identity, y=x)\ndef PPTSM_MobileNetV3_small_x1_0(pretrained=None, **kwargs):\n \"\"\"\n MobileNetV3_small_x1_0\n Args:\n pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.\n If str, means the path of the pretrained model.\n use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:349-376" + }, + "6535": { + "file_id": 500, + "content": "The code defines a Convolutional Neural Network layer for the PPTSM-MobileNetV3_small_x1_0 model. It consists of an average pooling layer, two 1x1 convolution layers with ReLU and hard sigmoid activations. 
The forward function performs element-wise multiplication between input and output to implement residual learning.", + "type": "comment" + }, + "6536": { + "file_id": 500, + "content": " Returns:\n model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args.\n \"\"\"\n model = MobileNetV3(\n config=NET_CONFIG[\"small\"],\n scale=1.0,\n stages_pattern=MODEL_STAGES_PATTERN[\"MobileNetV3_small\"],\n class_squeeze=LAST_SECOND_CONV_SMALL,\n pretrained=pretrained,\n **kwargs)\n return model\n@BACKBONES.register()\ndef PPTSM_MobileNetV3(pretrained=None, **kwargs):\n \"\"\"\n MobileNetV3_large_x1_0\n Args:\n pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.\n If str, means the path of the pretrained model.\n use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.\n Returns:\n model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args.\n \"\"\"\n model = MobileNetV3(\n config=NET_CONFIG[\"large\"],\n scale=1.0,\n stages_pattern=MODEL_STAGES_PATTERN[\"MobileNetV3_large\"],\n class_squeeze=LAST_SECOND_CONV_LARGE,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:377-405" + }, + "6537": { + "file_id": 500, + "content": "This code defines a function that returns specific MobileNetV3 models based on given arguments. The \"MobileNetV3\" class is used to create the models, and parameters such as config, scale, stages_pattern, class_squeeze, pretrained, and other optional keyword arguments are passed to the constructor of the class. The function is then registered with BACKBONES for future use.", + "type": "comment" + }, + "6538": { + "file_id": 500, + "content": " pretrained=pretrained,\n **kwargs)\n return model", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_mv3.py:406-408" + }, + "6539": { + "file_id": 500, + "content": "This code is creating an instance of the PPTSM-MV3 backbone model with specified pretrained weights and returning it.", + "type": "comment" + }, + "6540": { + "file_id": 501, + "content": "/paddlevideo/modeling/backbones/pptsm_v2.py", + "type": "filepath" + }, + "6541": { + "file_id": 501, + "content": "This Python module provides video processing layers and functions, including Depthwise Separable Convolution layers initialization, PPTSMV2 model with convolutional layers, and batch normalization. It defines a PPTSM_v2 backbone model for video analysis with customizable options like pretrained models, scaling, depths, dropout probability, and additional arguments.", + "type": "summary" + }, + "6542": { + "file_id": 501, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import, division, print_function\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import KaimingNormal\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:1-27" + }, + "6543": { + "file_id": 501, + "content": "This code is a Python module for the PaddlePaddle framework. It contains definitions of various layers and functions used in neural network backbones, including convolutional layers, pooling layers, batch normalization, linear layers, and more. The code also includes comments about copyright and licensing information, as well as imports necessary modules for these operations. Additionally, it references utility functions for weight initialization and model loading from checkpoints.", + "type": "comment" + }, + "6544": { + "file_id": 501, + "content": "# MODEL_URLS = {\n# \"PPLCNetV2\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_ssld_pretrained.pdparams\",\n# }\nMODEL_STAGES_PATTERN = {\n \"PPLCNet\": [\"blocks2\", \"blocks3\", \"blocks4\", \"blocks5\", \"blocks6\"]\n}\nNET_CONFIG = {\n # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut\n \"stage1\": [64, 3, False, False, False, False],\n \"stage2\": [128, 3, False, False, False, False],\n \"stage3\": [256, 5, True, True, True, False],\n \"stage4\": [512, 5, False, True, False, True],\n}\ndef make_divisible(v, divisor=8, min_value=None):\n if min_value is None:\n min_value = divisor\n new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n if new_v < 0.9 * v:\n new_v += divisor\n return new_v\nclass GlobalAttention(nn.Layer):\n \"\"\"\n Lightweight temporal attention module.\n \"\"\"\n def __init__(self, num_seg=8):\n super().__init__()\n self.fc = nn.Linear(in_features=num_seg,\n out_features=num_seg,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:29-64" + }, + "6545": { + "file_id": 501, + "content": "This code defines the PPLCNetV2 backbone model for video processing tasks. It includes the URL to download a pretrained model, stages of the network (PPLCNet), and network configurations. The make_divisible function is used to round up numbers for better performance. 
The GlobalAttention class is a lightweight temporal attention module used in the model.", + "type": "comment" + }, + "6546": { + "file_id": 501, + "content": " weight_attr=ParamAttr(learning_rate=5.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0,\n regularizer=L2Decay(0.0)))\n self.num_seg = num_seg\n def forward(self, x):\n _, C, H, W = x.shape\n x0 = x\n x = x.reshape([-1, self.num_seg, C * H * W])\n x = paddle.mean(x, axis=2) # efficient way of avg_pool\n x = x.squeeze(axis=-1)\n x = self.fc(x)\n attention = F.sigmoid(x)\n attention = attention.reshape(\n (-1, self.num_seg, 1, 1, 1)) #for broadcast\n x0 = x0.reshape([-1, self.num_seg, C, H, W])\n y = paddle.multiply(x0, attention)\n y = y.reshape_([-1, C, H, W])\n return y\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride,\n groups=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:65-96" + }, + "6547": { + "file_id": 501, + "content": "The code defines a ConvBNLayer class and a PPTSMV2 class. The ConvBNLayer class is a convolution layer followed by batch normalization, and the PPTSMV2 class is an encoder model that takes input of shape (-1, 3, H, W) where H and W are height and width respectively, and returns output of the same shape after processing. It first resizes the input, applies convolution with specified parameters, calculates attention maps, and performs element-wise multiplication between original input and attention maps to extract relevant features for each segmented region.", + "type": "comment" + }, + "6548": { + "file_id": 501, + "content": " use_act=True):\n super().__init__()\n self.use_act = use_act\n self.conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(initializer=KaimingNormal()),\n bias_attr=False)\n self.bn = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n if self.use_act:\n self.act = nn.ReLU()\n def forward(self, x):\n x = self.conv(x)\n x = self.bn(x)\n if self.use_act:\n x = self.act(x)\n return x\nclass SEModule(nn.Layer):\n def __init__(self, channel, reduction=4):\n super().__init__()\n self.avg_pool = AdaptiveAvgPool2D(1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:97-127" + }, + "6549": { + "file_id": 501, + "content": "The code defines a Conv2D layer followed by a BatchNorm2D layer and an optional ReLU activation function. 
The SEModule class inherits from nn.Layer and contains an AdaptiveAvgPool2D layer for average pooling operations.", + "type": "comment" + }, + "6550": { + "file_id": 501, + "content": " self.conv1 = Conv2D(in_channels=channel,\n out_channels=channel // reduction,\n kernel_size=1,\n stride=1,\n padding=0)\n self.relu = nn.ReLU()\n self.conv2 = Conv2D(in_channels=channel // reduction,\n out_channels=channel,\n kernel_size=1,\n stride=1,\n padding=0)\n self.hardsigmoid = nn.Sigmoid()\n def forward(self, x):\n identity = x\n x = self.avg_pool(x)\n x = self.conv1(x)\n x = self.relu(x)\n x = self.conv2(x)\n x = self.hardsigmoid(x)\n x = paddle.multiply(x=identity, y=x)\n return x\nclass RepDepthwiseSeparable(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n dw_size=3,\n split_pw=False,\n use_rep=False,\n use_se=False,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:128-161" + }, + "6551": { + "file_id": 501, + "content": "This code initializes a depthwise separable convolution layer with optional parameters like in_channels, out_channels, stride, dw_size, split_pw, use_rep, and use_se. It contains Conv2D layers for convolution operations, ReLU activation, Sigmoid activation, and element-wise multiplication for identity shortcut connection.", + "type": "comment" + }, + "6552": { + "file_id": 501, + "content": " use_shortcut=False):\n super().__init__()\n self.is_repped = False\n self.dw_size = dw_size\n self.split_pw = split_pw\n self.use_rep = use_rep\n self.use_se = use_se\n self.use_shortcut = True if use_shortcut and stride == 1 and in_channels == out_channels else False\n if self.use_rep:\n self.dw_conv_list = nn.LayerList()\n for kernel_size in range(self.dw_size, 0, -2):\n if kernel_size == 1 and stride != 1:\n continue\n dw_conv = ConvBNLayer(in_channels=in_channels,\n out_channels=in_channels,\n kernel_size=kernel_size,\n stride=stride,\n groups=in_channels,\n use_act=False)\n self.dw_conv_list.append(dw_conv)\n self.dw_conv = nn.Conv2D(in_channels=in_channels,\n out_channels=in_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:162-185" + }, + "6553": { + "file_id": 501, + "content": "This code initializes a PPTSM backbone model. It creates a ConvBNLayer for each kernel size in the dw_size range, skipping 1x1 if stride is not 1. The layers are stored in the dw_conv_list. An additional Conv2D layer with the same number of input and output channels is also created. This model can be used for image classification tasks.", + "type": "comment" + }, + "6554": { + "file_id": 501, + "content": " kernel_size=dw_size,\n stride=stride,\n padding=(dw_size - 1) // 2,\n groups=in_channels)\n else:\n self.dw_conv = ConvBNLayer(in_channels=in_channels,\n out_channels=in_channels,\n kernel_size=dw_size,\n stride=stride,\n groups=in_channels)\n self.act = nn.ReLU()\n if use_se:\n self.se = SEModule(in_channels)\n if self.split_pw:\n pw_ratio = 0.5\n self.pw_conv_1 = ConvBNLayer(in_channels=in_channels,\n kernel_size=1,\n out_channels=int(out_channels *\n pw_ratio),\n stride=1)\n self.pw_conv_2 = ConvBNLayer(in_channels=int(out_channels *", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:186-209" + }, + "6555": { + "file_id": 501, + "content": "Code creates a ConvBNLayer object for downsample convolution with optional SE module and split point-wise convolution. 
It handles different configurations based on dw_size, stride, and use_se parameters.", + "type": "comment" + }, + "6556": { + "file_id": 501, + "content": " pw_ratio),\n kernel_size=1,\n out_channels=out_channels,\n stride=1)\n else:\n self.pw_conv = ConvBNLayer(in_channels=in_channels,\n kernel_size=1,\n out_channels=out_channels,\n stride=1)\n def forward(self, x):\n if self.use_rep:\n input_x = x\n if self.is_repped:\n x = self.act(self.dw_conv(x))\n else:\n y = self.dw_conv_list[0](x)\n for dw_conv in self.dw_conv_list[1:]:\n y += dw_conv(x)\n x = self.act(y)\n else:\n x = self.dw_conv(x)\n if self.use_se:\n x = self.se(x)\n if self.split_pw:\n x = self.pw_conv_1(x)\n x = self.pw_conv_2(x)\n else:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:210-238" + }, + "6557": { + "file_id": 501, + "content": "This code defines a backbone for a deep learning model, specifically the PPTSM_v2 architecture. It uses convolutional layers and Batch Normalization to process input data. The use of Point-wise Convolution (pw_conv) or Depth-wise Separable Convolutions (dw_conv) depends on certain conditions. If \"use_rep\" is True, it applies repeated depth-wise convolutions if the current layer has been repped. It also includes optional Squeeze and Excitation blocks for feature enhancement.", + "type": "comment" + }, + "6558": { + "file_id": 501, + "content": " x = self.pw_conv(x)\n if self.use_shortcut:\n x = x + input_x\n return x\n def rep(self):\n if self.use_rep:\n self.is_repped = True\n kernel, bias = self._get_equivalent_kernel_bias()\n self.dw_conv.weight.set_value(kernel)\n self.dw_conv.bias.set_value(bias)\n def _get_equivalent_kernel_bias(self):\n kernel_sum = 0\n bias_sum = 0\n for dw_conv in self.dw_conv_list:\n kernel, bias = self._fuse_bn_tensor(dw_conv)\n kernel = self._pad_tensor(kernel, to_size=self.dw_size)\n kernel_sum += kernel\n bias_sum += bias\n return kernel_sum, bias_sum\n def _fuse_bn_tensor(self, branch):\n kernel = branch.conv.weight\n running_mean = branch.bn._mean\n running_var = branch.bn._variance\n gamma = branch.bn.weight\n beta = branch.bn.bias\n eps = branch.bn._epsilon\n std = (running_var + eps).sqrt()\n t = (gamma / std).reshape((-1, 1, 1, 1))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:239-269" + }, + "6559": { + "file_id": 501, + "content": "This code implements a backbone for PPTSM_V2 model. 
It performs pointwise convolution, adds shortcut connection if enabled, and includes functions for representation fusion and fusing batch normalization tensor.", + "type": "comment" + }, + "6560": { + "file_id": 501, + "content": " return kernel * t, beta - running_mean * gamma / std\n def _pad_tensor(self, tensor, to_size):\n from_size = tensor.shape[-1]\n if from_size == to_size:\n return tensor\n pad = (to_size - from_size) // 2\n return F.pad(tensor, [pad, pad, pad, pad])\nclass PPTSM_v2_LCNet(nn.Layer):\n def __init__(self,\n scale,\n depths,\n class_num=400,\n dropout_prob=0,\n num_seg=8,\n use_temporal_att=False,\n pretrained=None,\n use_last_conv=True,\n class_expand=1280):\n super().__init__()\n self.scale = scale\n self.use_last_conv = use_last_conv\n self.class_expand = class_expand\n self.num_seg = num_seg\n self.use_temporal_att = use_temporal_att\n self.pretrained = pretrained\n self.stem = nn.Sequential(*[\n ConvBNLayer(in_channels=3,\n kernel_size=3,\n out_channels=make_divisible(32 * scale),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:270-303" + }, + "6561": { + "file_id": 501, + "content": "The code defines a PPTSM_v2_LCNet class, which is a type of backbone neural network. It includes initialization parameters such as scale, depths, and class_num. The class also has methods for kernel multiplication, tensor padding, and other operations related to image processing and neural network layers.", + "type": "comment" + }, + "6562": { + "file_id": 501, + "content": " stride=2),\n RepDepthwiseSeparable(in_channels=make_divisible(32 * scale),\n out_channels=make_divisible(64 * scale),\n stride=1,\n dw_size=3)\n ])\n # stages\n self.stages = nn.LayerList()\n for depth_idx, k in enumerate(NET_CONFIG):\n in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut = NET_CONFIG[\n k]\n self.stages.append(\n nn.Sequential(*[\n RepDepthwiseSeparable(in_channels=make_divisible(\n (in_channels if i == 0 else in_channels * 2) * scale),\n out_channels=make_divisible(\n in_channels * 2 * scale),\n stride=2 if i == 0 else 1,\n dw_size=kernel_size,\n split_pw=split_pw,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:304-324" + }, + "6563": { + "file_id": 501, + "content": "This code defines a PPTSM-v2 backbone model, using DepthwiseSeparable blocks with varying configurations for different stages. It utilizes `make_divisible()` function to adjust the number of channels and kernel sizes, and a LayerList to create a sequence of layers for each stage. 
The NET_CONFIG determines the specifics of each stage's parameters like in_channels, kernel_size, split_pw, use_rep, use_se, and use_shortcut.", + "type": "comment" + }, + "6564": { + "file_id": 501, + "content": " use_rep=use_rep,\n use_se=use_se,\n use_shortcut=use_shortcut)\n for i in range(depths[depth_idx])\n ]))\n self.avg_pool = AdaptiveAvgPool2D(1)\n if self.use_last_conv:\n self.last_conv = Conv2D(in_channels=make_divisible(\n NET_CONFIG[\"stage4\"][0] * 2 * scale),\n out_channels=self.class_expand,\n kernel_size=1,\n stride=1,\n padding=0,\n bias_attr=False)\n self.act = nn.ReLU()\n self.dropout = Dropout(p=dropout_prob, mode=\"downscale_in_infer\")\n self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)\n in_features = self.class_expand if self.use_last_conv else NET_CONFIG[\n \"stage4\"][0] * 2 * scale\n self.fc = Linear(in_features, class_num)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:325-347" + }, + "6565": { + "file_id": 501, + "content": "This code defines a PPTSM_V2 backbone for the PaddleVideo model. It includes multiple Conv2D layers, BatchNorm layers, AdaptiveAvgPool2D, and optional final convolutional layer, flatten layer, linear layer. The use of these components depends on certain conditions such as `use_rep`, `use_se`, `use_shortcut`, `use_last_conv`, and other parameters.", + "type": "comment" + }, + "6566": { + "file_id": 501, + "content": " if self.use_temporal_att:\n self.global_attention = GlobalAttention(num_seg=self.num_seg)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, x):\n x = self.stem(x)\n count = 0\n for stage in self.stages:\n # only add temporal attention and tsm in stage3 for efficiency\n if count == 2:\n # add temporal attention\n if self.use_temporal_att:\n x = self.global_attention(x)\n x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:348-372" + }, + "6567": { + "file_id": 501, + "content": "Code initializes weights for a PPTSM_v2 backbone model. It first checks if the pretrained weights are provided and then initializes the layers with specified methods. 
Stage 3 adds temporal attention and Temporal Shift Module (TSM) operations for efficiency.", + "type": "comment" + }, + "6568": { + "file_id": 501, + "content": " count += 1\n x = stage(x)\n x = self.avg_pool(x)\n if self.use_last_conv:\n x = self.last_conv(x)\n x = self.act(x)\n x = self.dropout(x)\n # Feature aggregation\n x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]])\n x = paddle.mean(x, axis=1)\n x = paddle.reshape(x, shape=[-1, self.class_expand])\n x = self.fc(x)\n return x\n@BACKBONES.register()\ndef PPTSM_v2(pretrained=None, use_ssld=False, **kwargs):\n \"\"\"\n PP-TSM_v2 model.\n Args:\n pretrained: str, means the path of the pretrained model.\n Returns:\n model: nn.Layer.\n \"\"\"\n model = PPTSM_v2_LCNet(pretrained=pretrained,\n scale=1.0,\n depths=[2, 2, 6, 2],\n dropout_prob=0.2,\n **kwargs)\n return model", + "type": "code", + "location": "/paddlevideo/modeling/backbones/pptsm_v2.py:373-405" + }, + "6569": { + "file_id": 501, + "content": "Code snippet defines a PPTSM_v2 backbone model for video analysis. It consists of stages, an average pooling layer, and a convolution layer. The function also includes feature aggregation and reshaping operations before feeding the data to a fully connected layer. The pretrained model can be loaded from a given path, and it supports custom scaling, depths, dropout probability, and additional keyword arguments.", + "type": "comment" + }, + "6570": { + "file_id": 502, + "content": "/paddlevideo/modeling/backbones/resnet.py", + "type": "filepath" + }, + "6571": { + "file_id": 502, + "content": "This code defines a ResNet backbone model, utilizing ConvBNLayer and ReLU activation. It can dynamically add bottleneck blocks for models like ResNet-101 and ResNet-152, includes forward function and supports pretrained models.", + "type": "summary" + }, + "6572": { + "file_id": 502, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:1-34" + }, + "6573": { + "file_id": 502, + "content": "This code is importing necessary libraries and defining the ConvBNLayer class, which combines a convolutional layer with batch normalization. This class takes in the number of input channels as an argument. 
The copyright notice at the beginning indicates this code is licensed under the Apache License 2.0.", + "type": "comment" + }, + "6574": { + "file_id": 502, + "content": " out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:35-58" + }, + "6575": { + "file_id": 502, + "content": "The ConvBNLayer class is a custom layer that initializes the Conv2D layer with BatchNorm2D and optional activation function. It takes in_channels, out_channels, kernel_size, stride (default: 1), groups (default: 1), act (optional activation function), and name as parameters. Weight and bias initialization are defined in the init_weights method.", + "type": "comment" + }, + "6576": { + "file_id": 502, + "content": " groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(name=bn_name +\n \"_scale\"),\n bias_attr=ParamAttr(bn_name + \"_offset\"))\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BottleneckBlock, self).__init__()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:59-89" + }, + "6577": { + "file_id": 502, + "content": "ResNet module with batch normalization and optional activation function. BottleneckBlock class for ResNet blocks.", + "type": "comment" + }, + "6578": { + "file_id": 502, + "content": " self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:90-111" + }, + "6579": { + "file_id": 502, + "content": "This code defines the ResNet backbone structure. It creates three ConvBNLayer instances for the first block of the network, with different parameters for each layer. The `self.conv0` layer has 1x1 kernel and performs a relu activation. `self.conv1` has a 3x3 kernel and also applies a relu activation after a stride operation. 
Lastly, `self.conv2` has a 1x1 kernel, no activation function, and increases the number of output channels by 4 times. The shortcut connection is created if `shortcut` is not set to `True`.", + "type": "comment" + }, + "6580": { + "file_id": 502, + "content": " stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:112-143" + }, + "6581": { + "file_id": 502, + "content": "The code defines a ResNet block with optional shortcut connection, containing ConvBNLayer and ReLU activation. The BasicBlock class initializes the parameters for the ResNet block including stride, number of channels, convolution layer, and optional shortcut.", + "type": "comment" + }, + "6582": { + "file_id": 502, + "content": " self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n filter_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNet(nn.Layer):\n \"\"\"ResNet backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:144-178" + }, + "6583": { + "file_id": 502, + "content": "This code defines a ResNet backbone model. It includes a ConvBNLayer for feature extraction and optionally applies shortcut connections based on the input and output channel count. The forward function performs addition, followed by ReLU activation for each input. 
The ResNet class is registered with BACKBONES and takes arguments for depth and pretrained model.", + "type": "comment" + }, + "6584": { + "file_id": 502, + "content": " \"\"\"\n def __init__(self, depth, pretrained=None):\n super(ResNet, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = [64, 256, 512, 1024]\n out_channels = [64, 128, 256, 512]\n self.conv = ConvBNLayer(in_channels=3,\n out_channels=64,\n kernel_size=7,\n stride=2,\n act=\"relu\",\n name=\"conv1\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:179-208" + }, + "6585": { + "file_id": 502, + "content": "This code defines a ResNet class with different layers and their corresponding depths. It also initializes the ConvBNLayer for the first convolution operation and MaxPool2D layer for pooling. The supported layers are 18, 34, 50, 101, and 152.", + "type": "comment" + }, + "6586": { + "file_id": 502, + "content": " self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n conv_name,\n BottleneckBlock(\n # NOTE: Be careful! Here is different from TSM model.\n in_channels=in_channels[block]\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:210-229" + }, + "6587": { + "file_id": 502, + "content": "This code adds bottleneck blocks to a ResNet backbone model, dynamically creating sublayers based on the input parameters. The block type and number of layers are determined by the given depth configuration. It also handles specific cases for ResNet-101 and ResNet-152 models.", + "type": "comment" + }, + "6588": { + "file_id": 502, + "content": " shortcut=shortcut,\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:230-252" + }, + "6589": { + "file_id": 502, + "content": "The code is defining a ResNet model by creating layers and blocks based on the given depth configuration. It alternates between BottleneckBlock and BasicBlock depending on the current block number and depth. 
It also initializes weights for the parameters in the model.", + "type": "comment" + }, + "6590": { + "file_id": 502, + "content": " Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:253-268" + }, + "6591": { + "file_id": 502, + "content": "If a pretrained loading path is specified, the code will load the model with that path. If no pretrained path is provided or it's set to an empty string, it initializes Conv2D layers with KaimingNormal function and BatchNorm2D layers with Constant function (value=1).", + "type": "comment" + }, + "6592": { + "file_id": 502, + "content": " def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n y = self.conv(inputs)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet.py:270-283" + }, + "6593": { + "file_id": 502, + "content": "This code defines the forward function for a ResNet backbone. It reshapes and passes the input through a convolutional layer, max pooling, and a series of blocks. The output is returned after processing all blocks.", + "type": "comment" + }, + "6594": { + "file_id": 503, + "content": "/paddlevideo/modeling/backbones/resnet3d.py", + "type": "filepath" + }, + "6595": { + "file_id": 503, + "content": "The code introduces a simplified 3D ResNet model in PaddleVideo, allowing for configurable parameters and options for non-local blocks and dilation values. The model is initialized with inflated 2D params, constructs layers, and can utilize pretrained weights.", + "type": "summary" + }, + "6596": { + "file_id": 503, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport warnings\nimport collections\nfrom itertools import repeat\nimport paddle\nfrom paddle import nn\ndef _ntuple(n):\n def parse(x):\n if isinstance(x, collections.abc.Iterable):\n return tuple(x)\n return tuple(repeat(x, n))\n return parse\n_triple = _ntuple(3)\nclass ConvBNLayer(nn.Layer):\n \"\"\"A conv block that bundles conv/norm/activation layers.\n This block simplifies the usage of convolution layers, which are commonly", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:1-37" + }, + "6597": { + "file_id": 503, + "content": "The code is defining a function that creates a ConvBNLayer, which is a combination of convolution, normalization, and activation layers. It simplifies the usage of these layers in a convolutional neural network model.", + "type": "comment" + }, + "6598": { + "file_id": 503, + "content": " used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).\n It is based upon three build methods: `build_conv_layer()`,\n `build_norm_layer()` and `build_activation_layer()`.\n Besides, we add some additional features in this module.\n 1. Automatically set `bias` of the conv layer.\n 2. Spectral norm is supported.\n 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only\n supports zero and circular padding, and we add \"reflect\" padding mode.\n Args:\n in_channels (int): Number of channels in the input feature map.\n Same as that in ``nn._ConvNd``.\n out_channels (int): Number of channels produced by the convolution.\n Same as that in ``nn._ConvNd``.\n kernel_size (int | tuple[int]): Size of the convolving kernel.\n Same as that in ``nn._ConvNd``.\n stride (int | tuple[int]): Stride of the convolution.\n Same as that in ``nn._ConvNd``.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:38-56" + }, + "6599": { + "file_id": 503, + "content": "This code defines a Conv2D layer with additional features including automatic bias setting, spectral norm support, and more padding modes. It is used in building convolutional layers, normalization layers, and activation layers for ResNet3D backbones in PaddleVideo.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/66.json b/docs/data/66.json new file mode 100644 index 000000000..3ed41406b --- /dev/null +++ b/docs/data/66.json @@ -0,0 +1,550 @@ +{ + "6600": { + "file_id": 503, + "content": " padding (int | tuple[int]): Zero-padding added to both sides of\n the input. Same as that in ``nn._ConvNd``.\n dilation (int | tuple[int]): Spacing between kernel elements.\n Same as that in ``nn._ConvNd``.\n groups (int): Number of blocked connections from input channels to\n output channels. 
Same as that in ``nn._ConvNd``.\n \"\"\"\n def __init__(\n self,\n in_channels,\n out_channels,\n kernel_size,\n padding=0,\n stride=1,\n dilation=1,\n groups=1,\n act=None,\n bias=None,\n ):\n super(ConvBNLayer, self).__init__()\n self._conv = nn.Conv3D(\n in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=padding,\n dilation=dilation,\n groups=groups,\n bias_attr=bias)\n self._batch_norm = nn.BatchNorm3D(out_channels, momentum=0.1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:57-89" + }, + "6601": { + "file_id": 503, + "content": "This code defines a ConvBNLayer class, which is a 3D convolutional layer followed by batch normalization. It takes parameters such as in_channels, out_channels, kernel_size, padding, stride, dilation, groups, act (activation function), and bias. The constructor initializes the Conv3D layer and BatchNorm3D with the specified parameters.", + "type": "comment" + }, + "6602": { + "file_id": 503, + "content": " self.act = act\n if act is not None:\n self._act_op = nn.ReLU()\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self.act is not None:\n y = self._act_op(y)\n return y\nclass Bottleneck3d(nn.Layer):\n \"\"\"Bottleneck 3d block for ResNet3D.\n Args:\n inplanes (int): Number of channels for the input in first conv3d layer.\n planes (int): Number of channels produced by some norm/conv3d layers.\n spatial_stride (int): Spatial stride in the conv3d layer. Default: 1.\n temporal_stride (int): Temporal stride in the conv3d layer. Default: 1.\n dilation (int): Spacing between kernel elements. Default: 1.\n downsample (nn.Module | None): Downsample layer. Default: None.\n inflate (bool): Whether to inflate kernel. Default: True.\n inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the\n kernel sizes and padding strides for conv1 and conv2 in each block.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:90-115" + }, + "6603": { + "file_id": 503, + "content": "The code defines a Bottleneck3d class, which represents a bottleneck block for ResNet3D. It takes in input channels (inplanes), output channels (planes), spatial and temporal stride, dilation rate, downsample layer, inflate flag, and inflate style as arguments. The class has an act variable to store the activation function and defines a forward method that performs convolutions, batch normalization, and activation if necessary.", + "type": "comment" + }, + "6604": { + "file_id": 503, + "content": " Default: '3x1x1'.\n non_local (bool): Determine whether to apply non-local module in this\n block. Default: False.\n non_local_cfg (dict): Config for non-local module. Default: ``dict()``.\n conv_cfg (dict): Config dict for convolution layer.\n Default: ``dict(type='Conv3d')``.\n norm_cfg (dict): Config for norm layers. required keys are ``type``,\n Default: ``dict(type='BN3d')``.\n act_cfg (dict): Config dict for activation layer.\n Default: ``dict(type='ReLU')``.\n with_cp (bool): Use checkpoint or not. Using checkpoint will save some\n memory while slowing down the training speed. 
Default: False.\n \"\"\"\n expansion = 4\n def __init__(self,\n inplanes,\n planes,\n spatial_stride=1,\n temporal_stride=1,\n dilation=1,\n downsample=None,\n inflate=True,\n inflate_style='3x1x1',\n non_local=False,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:116-140" + }, + "6605": { + "file_id": 503, + "content": "The code defines a ResNet3D block with various configurations including the number of input and output planes, spatial and temporal stride, dilation rate, downsampling method, inflation settings, and whether to apply non-local modules or not. The default configuration includes convolution, norm, and activation layers, as well as an option for using checkpoint to save memory at the cost of training speed.", + "type": "comment" + }, + "6606": { + "file_id": 503, + "content": " non_local_cfg=dict(),\n conv_cfg=dict(type='Conv3d'),\n norm_cfg=dict(type='BN3d'),\n act_cfg=dict(type='ReLU'),\n with_cp=False):\n super().__init__()\n assert inflate_style in ['3x1x1', '3x3x3']\n self.inplanes = inplanes\n self.planes = planes\n self.spatial_stride = spatial_stride\n self.temporal_stride = temporal_stride\n self.dilation = dilation\n self.inflate = inflate\n self.inflate_style = inflate_style\n self.norm_cfg = norm_cfg\n self.conv_cfg = conv_cfg\n self.act_cfg = act_cfg\n self.with_cp = with_cp\n self.non_local = non_local\n self.non_local_cfg = non_local_cfg\n self.conv1_stride_s = 1\n self.conv2_stride_s = spatial_stride\n self.conv1_stride_t = 1\n self.conv2_stride_t = temporal_stride\n if self.inflate:\n if inflate_style == '3x1x1':\n conv1_kernel_size = (3, 1, 1)\n conv1_padding = (1, 0, 0)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:141-171" + }, + "6607": { + "file_id": 503, + "content": "This code initializes an instance of a 3D ResNet backbone model with specified parameters, including planes, spatial and temporal strides, dilation, inflate style, norm and conv configurations, whether to use non-local blocks, and more. It sets various attributes based on the input and instantiates a Conv3d layer for the first block.", + "type": "comment" + }, + "6608": { + "file_id": 503, + "content": " conv2_kernel_size = (1, 3, 3)\n conv2_padding = (0, dilation, dilation)\n else:\n conv1_kernel_size = (1, 1, 1)\n conv1_padding = (0, 0, 0)\n conv2_kernel_size = (3, 3, 3)\n conv2_padding = (1, dilation, dilation)\n else:\n conv1_kernel_size = (1, 1, 1)\n conv1_padding = (0, 0, 0)\n conv2_kernel_size = (1, 3, 3)\n conv2_padding = (0, dilation, dilation)\n self.conv1 = ConvBNLayer(\n in_channels=inplanes,\n out_channels=planes,\n kernel_size=conv1_kernel_size,\n stride=(self.conv1_stride_t, self.conv1_stride_s,\n self.conv1_stride_s),\n padding=conv1_padding,\n bias=False,\n act='relu')\n self.conv2 = ConvBNLayer(\n in_channels=planes,\n out_channels=planes,\n kernel_size=conv2_kernel_size,\n stride=(self.conv2_stride_t, self.conv2_stride_s,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:172-198" + }, + "6609": { + "file_id": 503, + "content": "Code is setting up convolutional layers for a ResNet3D model. It creates ConvBNLayer instances with different kernel sizes and padding based on the dilation value. 
These layers are used for temporal, spatial, and spatial dimensions depending on the dilation value provided.", + "type": "comment" + }, + "6610": { + "file_id": 503, + "content": " self.conv2_stride_s),\n padding=conv2_padding,\n dilation=(1, dilation, dilation),\n bias=False,\n act='relu')\n self.conv3 = ConvBNLayer(\n in_channels=planes,\n out_channels=planes * self.expansion,\n kernel_size=1,\n bias=False,\n act=None,\n )\n self.downsample = downsample\n self.relu = nn.ReLU()\n def forward(self, x):\n \"\"\"Defines the computation performed at every call.\"\"\"\n def _inner_forward(x):\n \"\"\"Forward wrapper for utilizing checkpoint.\"\"\"\n identity = x\n out = self.conv1(x)\n out = self.conv2(out)\n out = self.conv3(out)\n if self.downsample is not None:\n identity = self.downsample(x)\n out = out + identity\n return out\n out = _inner_forward(x)\n out = self.relu(out)\n if self.non_local:\n out = self.non_local_block(out)\n return out", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:199-239" + }, + "6611": { + "file_id": 503, + "content": "This code defines a ResNet3D block with ConvBNLayer, downsample layer, and ReLU activation. The forward method applies the layers sequentially, optionally performs downsampling, and adds the identity connection before returning the output. Non-local blocks can be applied if specified.", + "type": "comment" + }, + "6612": { + "file_id": 503, + "content": "class ResNet3d(nn.Layer):\n \"\"\"ResNet 3d backbone.\n Args:\n depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.\n pretrained (str | None): Name of pretrained model.\n stage_blocks (tuple | None): Set number of stages for each res layer.\n Default: None.\n pretrained2d (bool): Whether to load pretrained 2D model.\n Default: True.\n in_channels (int): Channel num of input features. Default: 3.\n base_channels (int): Channel num of stem output features. Default: 64.\n out_indices (Sequence[int]): Indices of output feature. Default: (3, ).\n num_stages (int): Resnet stages. Default: 4.\n spatial_strides (Sequence[int]):\n Spatial strides of residual blocks of each stage.\n Default: ``(1, 2, 2, 2)``.\n temporal_strides (Sequence[int]):\n Temporal strides of residual blocks of each stage.\n Default: ``(1, 1, 1, 1)``.\n dilations (Sequence[int]): Dilation of each stage.\n Default: ``(1, 1, 1, 1)``.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:242-263" + }, + "6613": { + "file_id": 503, + "content": "The code defines a ResNet 3D backbone, with options for depth (18, 34, 50, 101, or 152), pretrained model name, number of stages for each res layer, loading of pretrained 2D model, input channel features, output feature indices, number of stages, and spatial and temporal strides.", + "type": "comment" + }, + "6614": { + "file_id": 503, + "content": " conv1_kernel (Sequence[int]): Kernel size of the first conv layer.\n Default: ``(3, 7, 7)``.\n conv1_stride_s (int): Spatial stride of the first conv layer.\n Default: 2.\n conv1_stride_t (int): Temporal stride of the first conv layer.\n Default: 1.\n pool1_stride_s (int): Spatial stride of the first pooling layer.\n Default: 2.\n pool1_stride_t (int): Temporal stride of the first pooling layer.\n Default: 1.\n with_pool2 (bool): Whether to use pool2. Default: True.\n inflate (Sequence[int]): Inflate Dims of each block.\n Default: (1, 1, 1, 1).\n inflate_style (str): ``3x1x1`` or ``3x3x3``. 
which determines the\n kernel sizes and padding strides for conv1 and conv2 in each block.\n Default: '3x1x1'.\n conv_cfg (dict): Config for conv layers. required keys are ``type``\n Default: ``dict(type='Conv3d')``.\n norm_cfg (dict): Config for norm layers. required keys are ``type`` and", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:264-282" + }, + "6615": { + "file_id": 503, + "content": "This code defines the parameters for ResNet3D backbone model including kernel sizes, stride values, and inflation dimensions. It also sets the default configuration for convolutional layers and normalization layers. The inflation style determines the kernel sizes for conv1 and conv2 in each block based on the given string input.", + "type": "comment" + }, + "6616": { + "file_id": 503, + "content": " ``requires_grad``.\n Default: ``dict(type='BN3d', requires_grad=True)``.\n act_cfg (dict): Config dict for activation layer.\n Default: ``dict(type='ReLU', inplace=True)``.\n norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze\n running stats (mean and var). Default: False.\n with_cp (bool): Use checkpoint or not. Using checkpoint will save some\n memory while slowing down the training speed. Default: False.\n non_local (Sequence[int]): Determine whether to apply non-local module\n in the corresponding block of each stages. Default: (0, 0, 0, 0).\n non_local_cfg (dict): Config for non-local module. Default: ``dict()``.\n zero_init_residual (bool):\n Whether to use zero initialization for residual block,\n Default: True.\n kwargs (dict, optional): Key arguments for \"make_res_layer\".\n \"\"\"\n arch_settings = {\n 50: (Bottleneck3d, (3, 4, 6, 3)),\n 101: (Bottleneck3d, (3, 4, 23, 3)),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:283-302" + }, + "6617": { + "file_id": 503, + "content": "This code defines the parameters and architecture settings for ResNet3D model in PaddleVideo. It includes options such as backbone type, stages, activation layer, normalization mode, checkpoint usage, non-local module application, and residual block initialization.", + "type": "comment" + }, + "6618": { + "file_id": 503, + "content": " 152: (Bottleneck3d, (3, 8, 36, 3))\n }\n def __init__(self,\n depth,\n stage_blocks=None,\n pretrained2d=True,\n in_channels=3,\n num_stages=4,\n base_channels=64,\n out_indices=(3, ),\n spatial_strides=(1, 2, 2, 2),\n temporal_strides=(1, 1, 1, 1),\n dilations=(1, 1, 1, 1),\n conv1_kernel=(3, 7, 7),\n conv1_stride_s=2,\n conv1_stride_t=1,\n pool1_stride_s=2,\n pool1_stride_t=1,\n with_pool1=True,\n with_pool2=True,\n inflate=(1, 1, 1, 1),\n inflate_style='3x1x1',\n conv_cfg=dict(type='Conv3d'),\n norm_cfg=dict(type='BN3d', requires_grad=True),\n act_cfg=dict(type='ReLU', inplace=True),\n norm_eval=False,\n with_cp=False,\n non_local=(0, 0, 0, 0),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:303-331" + }, + "6619": { + "file_id": 503, + "content": "This code defines a ResNet3D backbone model with customizable parameters such as depth, stage blocks, and more. It uses Bottleneck3d layers and allows for pre-trained 2D weights usage. 
The model is designed for processing 4D data (spatial and temporal dimensions).", + "type": "comment" + }, + "6620": { + "file_id": 503, + "content": " non_local_cfg=dict(),\n zero_init_residual=True,\n **kwargs):\n super().__init__()\n if depth not in self.arch_settings:\n raise KeyError(f'invalid depth {depth} for resnet')\n self.depth = depth\n self.pretrained2d = pretrained2d\n self.in_channels = in_channels\n self.base_channels = base_channels\n self.num_stages = num_stages\n assert 1 <= num_stages <= 4\n self.stage_blocks = stage_blocks\n self.out_indices = out_indices\n assert max(out_indices) < num_stages\n self.spatial_strides = spatial_strides\n self.temporal_strides = temporal_strides\n self.dilations = dilations\n assert len(spatial_strides) == len(temporal_strides) == len(\n dilations) == num_stages\n if self.stage_blocks is not None:\n assert len(self.stage_blocks) == num_stages\n self.conv1_kernel = conv1_kernel\n self.conv1_stride_s = conv1_stride_s\n self.conv1_stride_t = conv1_stride_t", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:332-357" + }, + "6621": { + "file_id": 503, + "content": "This function is initializing a ResNet3D model with specified depth, input channels, base channels, number of stages, stage blocks, output indices, spatial and temporal strides, dilations, and convolution kernel parameters. It raises an error if the provided depth does not match any of the known configurations or if the output indices exceed the number of stages. If the stage_blocks are specified, it also checks that their length matches the number of stages. The class inherits from a superclass.", + "type": "comment" + }, + "6622": { + "file_id": 503, + "content": " self.pool1_stride_s = pool1_stride_s\n self.pool1_stride_t = pool1_stride_t\n self.with_pool1 = with_pool1\n self.with_pool2 = with_pool2\n self.stage_inflations = _ntuple(num_stages)(inflate)\n self.non_local_stages = _ntuple(num_stages)(non_local)\n self.inflate_style = inflate_style\n self.conv_cfg = conv_cfg\n self.norm_cfg = norm_cfg\n self.act_cfg = act_cfg\n self.norm_eval = norm_eval\n self.with_cp = with_cp\n self.zero_init_residual = zero_init_residual\n self.block, stage_blocks = self.arch_settings[depth]\n if self.stage_blocks is None:\n self.stage_blocks = stage_blocks[:num_stages]\n self.inplanes = self.base_channels\n self.non_local_cfg = non_local_cfg\n self._make_stem_layer()\n self.res_layers = []\n for i, num_blocks in enumerate(self.stage_blocks):\n spatial_stride = spatial_strides[i]\n temporal_stride = temporal_strides[i]\n dilation = dilations[i]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:358-387" + }, + "6623": { + "file_id": 503, + "content": "This code sets various attributes for a ResNet3D model. It initializes strides, determines if pooling layers are used in certain stages, inflates stages based on input, and configures convolutional, normalization, and activation settings. It also defines the block architecture and stage blocks according to the provided depth. 
Finally, it creates stem and residual layers based on the configuration.", + "type": "comment" + }, + "6624": { + "file_id": 503, + "content": " planes = self.base_channels * 2**i\n res_layer = self.make_res_layer(\n self.block,\n self.inplanes,\n planes,\n num_blocks,\n spatial_stride=spatial_stride,\n temporal_stride=temporal_stride,\n dilation=dilation,\n norm_cfg=self.norm_cfg,\n conv_cfg=self.conv_cfg,\n act_cfg=self.act_cfg,\n non_local=self.non_local_stages[i],\n non_local_cfg=self.non_local_cfg,\n inflate=self.stage_inflations[i],\n inflate_style=self.inflate_style,\n with_cp=with_cp,\n **kwargs)\n self.inplanes = planes * self.block.expansion\n layer_name = f'layer{i + 1}'\n self.add_sublayer(layer_name, res_layer)\n self.res_layers.append(layer_name)\n self.feat_dim = self.block.expansion * self.base_channels * 2**(\n len(self.stage_blocks) - 1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:388-412" + }, + "6625": { + "file_id": 503, + "content": "This code defines a function that adds ResNet3D layers with specified block, input and output planes, number of blocks, spatial and temporal strides, dilation, norm/conv cfg, non-local stages, inflations, style, and with_cp. It updates inplanes and feat_dim accordingly.", + "type": "comment" + }, + "6626": { + "file_id": 503, + "content": " @staticmethod\n def make_res_layer(block,\n inplanes,\n planes,\n blocks,\n spatial_stride=1,\n temporal_stride=1,\n dilation=1,\n inflate=1,\n inflate_style='3x1x1',\n non_local=0,\n non_local_cfg=dict(),\n norm_cfg=None,\n act_cfg=None,\n conv_cfg=None,\n with_cp=False,\n **kwargs):\n \"\"\"Build residual layer for ResNet3D.\n Args:\n block (nn.Module): Residual module to be built.\n inplanes (int): Number of channels for the input feature\n in each block.\n planes (int): Number of channels for the output feature\n in each block.\n blocks (int): Number of residual blocks.\n spatial_stride (int | Sequence[int]): Spatial strides in", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:414-440" + }, + "6627": { + "file_id": 503, + "content": "The function \"make_res_layer\" builds a residual layer for ResNet3D. It takes parameters such as block, inplanes, planes, blocks, spatial_stride, temporal_stride, and other optional settings like non_local, norm_cfg, act_cfg, conv_cfg, with_cp to create the residual module. The function constructs the layer based on the input arguments and returns it.", + "type": "comment" + }, + "6628": { + "file_id": 503, + "content": " residual and conv layers. Default: 1.\n temporal_stride (int | Sequence[int]): Temporal strides in\n residual and conv layers. Default: 1.\n dilation (int): Spacing between kernel elements. Default: 1.\n inflate (int | Sequence[int]): Determine whether to inflate\n for each block. Default: 1.\n inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines\n the kernel sizes and padding strides for conv1 and conv2\n in each block. Default: '3x1x1'.\n non_local (int | Sequence[int]): Determine whether to apply\n non-local module in the corresponding block of each stages.\n Default: 0.\n non_local_cfg (dict): Config for non-local module.\n Default: ``dict()``.\n conv_cfg (dict | None): Config for norm layers. Default: None.\n norm_cfg (dict | None): Config for norm layers. Default: None.\n act_cfg (dict | None): Config for activate layers. 
Default: None.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:441-457" + }, + "6629": { + "file_id": 503, + "content": "This docstring continues the argument list of make_res_layer, covering the strides for residual and conv layers, dilation, inflate, inflate_style, non_local modules, non_local_cfg, conv_cfg, norm_cfg, and act_cfg. Default values are provided for each parameter.", + "type": "comment" + }, + "6630": { + "file_id": 503, + "content": " with_cp (bool | None): Use checkpoint or not. Using checkpoint\n will save some memory while slowing down the training speed.\n Default: False.\n Returns:\n nn.Module: A residual layer for the given config.\n \"\"\"\n inflate = inflate if not isinstance(inflate,\n int) else (inflate, ) * blocks\n non_local = non_local if not isinstance(non_local,\n int) else (non_local, ) * blocks\n assert len(inflate) == blocks and len(non_local) == blocks\n downsample = None\n if spatial_stride != 1 or inplanes != planes * block.expansion:\n downsample = ConvBNLayer(\n in_channels=inplanes,\n out_channels=planes * block.expansion,\n kernel_size=1,\n stride=(temporal_stride, spatial_stride, spatial_stride),\n bias=False,\n act=None)\n layers = []\n layers.append(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:458-481" + }, + "6631": { + "file_id": 503, + "content": "This function creates a residual layer based on the given configuration. It expands the inflate and non_local settings to per-block tuples, and optionally adds a downsampling projection when the spatial stride is not 1 or the input and output channel counts differ. The output is a neural network module (nn.Module).", + "type": "comment" + }, + "6632": { + "file_id": 503, + "content": " block(\n inplanes,\n planes,\n spatial_stride=spatial_stride,\n temporal_stride=temporal_stride,\n dilation=dilation,\n downsample=downsample,\n inflate=(inflate[0] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[0] == 1),\n non_local_cfg=non_local_cfg,\n norm_cfg=norm_cfg,\n conv_cfg=conv_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp,\n **kwargs))\n inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(\n inplanes,\n planes,\n spatial_stride=1,\n temporal_stride=1,\n dilation=dilation,\n inflate=(inflate[i] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[i] == 1),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:482-509" + }, + "6633": { + "file_id": 503, + "content": "The code defines a ResNet3D architecture with multiple blocks, each with configurable parameters such as spatial and temporal stride, dilation, downsample, inflate style, non-local operation, norm/conv configuration, activation function, and whether to use gradient checkpointing (with_cp). 
The inplanes are updated based on the expansion factor of the block.", + "type": "comment" + }, + "6634": { + "file_id": 503, + "content": " non_local_cfg=non_local_cfg,\n norm_cfg=norm_cfg,\n conv_cfg=conv_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp,\n **kwargs))\n return nn.Sequential(*layers)\n @staticmethod\n def _inflate_conv_params(conv3d, state_dict_2d, module_name_2d,\n inflated_param_names):\n \"\"\"Inflate a conv module from 2d to 3d.\n Args:\n conv3d (nn.Module): The destination conv3d module.\n state_dict_2d (OrderedDict): The state dict of pretrained 2d model.\n module_name_2d (str): The name of corresponding conv module in the\n 2d model.\n inflated_param_names (list[str]): List of parameters that have been\n inflated.\n \"\"\"\n weight_2d_name = module_name_2d + '.weight'\n conv2d_weight = state_dict_2d[weight_2d_name]\n kernel_t = conv3d.weight.data.shape[2]\n new_weight = conv2d_weight.data.unsqueeze(2).expand_as(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:510-537" + }, + "6635": { + "file_id": 503, + "content": "This code defines a function to inflate a 3D convolutional neural network module from a pre-trained 2D model. It takes the destination conv3d module, state dict of the 2D model, name of the corresponding conv module in the 2D model, and list of inflated parameters as inputs. The function extracts the weight from the 2D model's state dict and reshapes it to fit the 3D convolution.", + "type": "comment" + }, + "6636": { + "file_id": 503, + "content": " conv3d.weight) / kernel_t\n conv3d.weight.data.copy_(new_weight)\n inflated_param_names.append(weight_2d_name)\n if getattr(conv3d, 'bias') is not None:\n bias_2d_name = module_name_2d + '.bias'\n conv3d.bias.data.copy_(state_dict_2d[bias_2d_name])\n inflated_param_names.append(bias_2d_name)\n @staticmethod\n def _inflate_bn_params(bn3d, state_dict_2d, module_name_2d,\n inflated_param_names):\n \"\"\"Inflate a norm module from 2d to 3d.\n Args:\n bn3d (nn.Module): The destination bn3d module.\n state_dict_2d (OrderedDict): The state dict of pretrained 2d model.\n module_name_2d (str): The name of corresponding bn module in the\n 2d model.\n inflated_param_names (list[str]): List of parameters that have been\n inflated.\n \"\"\"\n for param_name, param in bn3d.named_parameters():\n param_2d_name = f'{module_name_2d}.{param_name}'", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:538-561" + }, + "6637": { + "file_id": 503, + "content": "This code inflates 2D convolutional and Batch Normalization (BN) parameters to 3D for a ResNet3D backbone. It copies the weights and biases, if present, from the 2D state dictionary to their corresponding 3D modules and updates the list of inflated parameter names.", + "type": "comment" + }, + "6638": { + "file_id": 503, + "content": " param_2d = state_dict_2d[param_2d_name]\n if param.data.shape != param_2d.shape:\n warnings.warn(f'The parameter of {module_name_2d} is not'\n 'loaded due to incompatible shapes. 
')\n return\n param.data.copy_(param_2d)\n inflated_param_names.append(param_2d_name)\n for param_name, param in bn3d.named_buffers():\n param_2d_name = f'{module_name_2d}.{param_name}'\n # some buffers like num_batches_tracked may not exist in old\n # checkpoints\n if param_2d_name in state_dict_2d:\n param_2d = state_dict_2d[param_2d_name]\n param.data.copy_(param_2d)\n inflated_param_names.append(param_2d_name)\n def _make_stem_layer(self):\n \"\"\"Construct the stem layers consists of a conv+norm+act module and a\n pooling layer.\"\"\"\n self.conv1 = ConvBNLayer(\n in_channels=self.in_channels,\n out_channels=self.base_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:562-586" + }, + "6639": { + "file_id": 503, + "content": "This code snippet is from the PaddleVideo library, specifically the ResNet3D backbone. It is loading and inflating parameters from a state dictionary, ensuring compatibility between 2D and 3D parameter shapes. The function _make_stem_layer constructs a stem layer consisting of a convolution, normalization, activation, and pooling module.", + "type": "comment" + }, + "6640": { + "file_id": 503, + "content": " kernel_size=self.conv1_kernel,\n stride=(self.conv1_stride_t, self.conv1_stride_s,\n self.conv1_stride_s),\n padding=tuple([(k - 1) // 2 for k in _triple(self.conv1_kernel)]),\n bias=False,\n act=\"relu\")\n self.maxpool = nn.MaxPool3D(\n kernel_size=(1, 3, 3),\n stride=(self.pool1_stride_t, self.pool1_stride_s,\n self.pool1_stride_s),\n padding=(0, 1, 1))\n self.pool2 = nn.MaxPool3D(kernel_size=(2, 1, 1), stride=(2, 1, 1))\n @staticmethod\n def _init_weights(self, pretrained=None):\n pass\n def init_weights(self, pretrained=None):\n self._init_weights(self, pretrained)\n def forward(self, x):\n \"\"\"Defines the computation performed at every call.\n Args:\n x (torch.Tensor): The input data.\n Returns:\n torch.Tensor: The feature of the input\n samples extracted by the backbone.\n \"\"\"\n x = self.conv1(x)\n if self.with_pool1:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:587-620" + }, + "6641": { + "file_id": 503, + "content": "This code is initializing a ResNet3D model with convolutional and pooling layers. The convolution layer has specified kernel size, stride, padding, and uses ReLU activation function. The max pooling layer has varying sizes for temporal, spatial dimensions. This model also includes optional pool1 and can be initialized with pretrained weights.", + "type": "comment" + }, + "6642": { + "file_id": 503, + "content": " x = self.maxpool(x)\n outs = []\n for i, layer_name in enumerate(self.res_layers):\n res_layer = getattr(self, layer_name)\n x = res_layer(x)\n if i == 0 and self.with_pool2:\n x = self.pool2(x)\n if i in self.out_indices:\n outs.append(x)\n if len(outs) == 1:\n return outs[0]\n return tuple(outs)\n def train(self, mode=True):\n \"\"\"Set the optimization status when training.\"\"\"\n super().train()\n if mode and self.norm_eval:\n for m in self.modules():\n if isinstance(m, paddle.nn._BatchNormBase):\n m.eval()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d.py:621-641" + }, + "6643": { + "file_id": 503, + "content": "This code defines a ResNet-3D backbone model with residual blocks, max pooling layers, and optionally a second pooling layer. 
The train function sets the model to training mode and evaluates batch normalization layers if self.norm_eval is True.", + "type": "comment" + }, + "6644": { + "file_id": 504, + "content": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py", + "type": "filepath" + }, + "6645": { + "file_id": 504, + "content": "The ResNet3dSlowOnly class creates a Slowfast pathway in the ResNet3d architecture, reduces channel number, and is registered under BACKBONES. The make_res_layer function builds residual layers with specified spatial_strides, temporal_strides, and dilations for 3D Resnet layers.", + "type": "summary" + }, + "6646": { + "file_id": 504, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport warnings\nimport paddle\nimport paddle.nn as nn\nfrom .resnet3d import ResNet3d, ConvBNLayer\nfrom ..registry import BACKBONES\n@BACKBONES.register()\nclass ResNet3dSlowOnly(ResNet3d):\n \"\"\"A pathway of Slowfast based on ResNet3d.\n Args:\n *args (arguments): Arguments same as :class:``ResNet3d``.\n channel_ratio (int): Reduce the channel number of fast pathway\n by ``channel_ratio``, corresponding to ``beta`` in the paper.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:1-30" + }, + "6647": { + "file_id": 504, + "content": "This code defines a ResNet3dSlowOnly class that extends ResNet3d for creating a Slowfast pathway. It reduces the channel number of the fast pathway by a specified 'channel_ratio'. This model is registered under BACKBONES and accepts the same arguments as ResNet3d.", + "type": "comment" + }, + "6648": { + "file_id": 504, + "content": " Default: 8.\n **kwargs (keyword arguments): Keywords arguments for ResNet3d.\n \"\"\"\n def __init__(self, *args, **kwargs):\n super().__init__(*args, **kwargs)\n self.inplanes = self.base_channels\n self.lateral_connections = []\n for i in range(len(self.stage_blocks)):\n planes = self.base_channels * 2**i\n self.inplanes = planes * self.block.expansion\n def make_res_layer(self,\n block,\n inplanes,\n planes,\n blocks,\n spatial_stride=1,\n temporal_stride=1,\n dilation=1,\n inflate=1,\n inflate_style='3x1x1',\n non_local=0,\n non_local_cfg=dict(),\n conv_cfg=None,\n norm_cfg=None,\n act_cfg=None,\n with_cp=False):\n \"\"\"Build residual layer for Slowfast.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:31-60" + }, + "6649": { + "file_id": 504, + "content": "Function `__init__` initializes the ResNet3d object by setting initial values for inplanes and lateral_connections based on provided args and kwargs. 
The function make_res_layer builds a residual layer for Slowfast, taking in various parameters including block type, input planes, output planes, number of blocks, and more.", + "type": "comment" + }, + "6650": { + "file_id": 504, + "content": " Args:\n block (nn.Module): Residual module to be built.\n inplanes (int): Number of channels for the input\n feature in each block.\n planes (int): Number of channels for the output\n feature in each block.\n blocks (int): Number of residual blocks.\n spatial_stride (int | Sequence[int]): Spatial strides\n in residual and conv layers. Default: 1.\n temporal_stride (int | Sequence[int]): Temporal strides in\n residual and conv layers. Default: 1.\n dilation (int): Spacing between kernel elements. Default: 1.\n inflate (int | Sequence[int]): Determine whether to inflate\n for each block. Default: 1.\n inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines\n the kernel sizes and padding strides for conv1 and\n conv2 in each block. Default: ``3x1x1``.\n non_local (int | Sequence[int]): Determine whether to apply", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:62-79" + }, + "6651": { + "file_id": 504, + "content": "This function is defining a Residual module with specified parameters including block type, input and output planes, number of residual blocks, spatial and temporal strides, dilation rate, whether to inflate or apply non-local operations for each block.", + "type": "comment" + }, + "6652": { + "file_id": 504, + "content": " non-local module in the corresponding block of each stages.\n Default: 0.\n non_local_cfg (dict): Config for non-local module.\n Default: ``dict()``.\n conv_cfg (dict | None): Config for conv layers. Default: None.\n norm_cfg (dict | None): Config for norm layers. Default: None.\n act_cfg (dict | None): Config for activate layers. Default: None.\n with_cp (bool): Use checkpoint or not. Using checkpoint will save\n some memory while slowing down the training speed.\n Default: False.\n Returns:\n nn.Module: A residual layer for the given config.\n \"\"\"\n inflate = inflate if not isinstance(inflate,\n int) else (inflate, ) * blocks\n non_local = non_local if not isinstance(non_local,\n int) else (non_local, ) * blocks\n assert len(inflate) == blocks and len(non_local) == blocks", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:80-98" + }, + "6653": { + "file_id": 504, + "content": "This function takes in a configuration for building residual layers, including parameters like blocks (number of residual layers to create), inflate (inflation times for the conv layers), non_local (whether to use non-local modules), conv_cfg, norm_cfg, act_cfg, and with_cp (use checkpoint). It asserts that the lengths of inflate and non_local match the number of blocks specified. 
The function returns a residual layer for the given configuration.", + "type": "comment" + }, + "6654": { + "file_id": 504, + "content": " lateral_inplanes = 0\n if (spatial_stride != 1\n or (inplanes + lateral_inplanes) != planes * block.expansion):\n downsample = ConvBNLayer(\n in_channels=inplanes + lateral_inplanes,\n out_channels=planes * block.expansion,\n kernel_size=1,\n stride=(temporal_stride, spatial_stride, spatial_stride),\n bias=False,\n act=None)\n else:\n downsample = None\n layers = []\n layers.append(\n block(\n inplanes + lateral_inplanes,\n planes,\n spatial_stride,\n temporal_stride,\n dilation,\n downsample,\n inflate=(inflate[0] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[0] == 1),\n non_local_cfg=non_local_cfg,\n conv_cfg=conv_cfg,\n norm_cfg=norm_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:100-129" + }, + "6655": { + "file_id": 504, + "content": "This code is creating a downsample layer and appending a block to the layers list. The downsample is created based on whether the current input planes match the expected value or not. If it doesn't match, a convolutional layer with stride is used for downsampling. The block is added to the layers list with specified parameters.", + "type": "comment" + }, + "6656": { + "file_id": 504, + "content": " inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(\n inplanes,\n planes,\n 1,\n 1,\n dilation,\n inflate=(inflate[i] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[i] == 1),\n non_local_cfg=non_local_cfg,\n conv_cfg=conv_cfg,\n norm_cfg=norm_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp))\n return nn.Sequential(*layers)\n def _inflate_conv_params(self, conv3d, state_dict_2d, module_name_2d,\n inflated_param_names):\n \"\"\"Inflate a conv module from 2d to 3d.\n The differences of conv modules betweene 2d and 3d in Pathway\n mainly lie in the inplanes due to lateral connections. To fit the\n shapes of the lateral connection counterpart, it will expand", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:130-157" + }, + "6657": { + "file_id": 504, + "content": "This code defines a function to create layers for a Resnet3D backbone model in PaddleVideo. It takes in parameters such as planes, blocks, dilation, inflate, inflate_style, non_local, non_local_cfg, conv_cfg, norm_cfg, act_cfg, and with_cp. The function creates layers by appending instances of a block class to a list, and returns them as a sequential model for training or inference. 
Additionally, there is another function that inflates a 2D conv module to a 3D one, mainly adjusting the inplanes due to lateral connections for fitting the shapes of lateral connection counterparts.", + "type": "comment" + }, + "6658": { + "file_id": 504, + "content": " parameters by concatting conv2d parameters and extra zero paddings.\n Args:\n conv3d (nn.Module): The destination conv3d module.\n state_dict_2d (OrderedDict): The state dict of pretrained 2d model.\n module_name_2d (str): The name of corresponding conv module in the\n 2d model.\n inflated_param_names (list[str]): List of parameters that have been\n inflated.\n \"\"\"\n weight_2d_name = module_name_2d + '.weight'\n conv2d_weight = state_dict_2d[weight_2d_name]\n old_shape = conv2d_weight.shape\n new_shape = conv3d.weight.data.shape\n kernel_t = new_shape[2]\n if new_shape[1] != old_shape[1]:\n if new_shape[1] < old_shape[1]:\n warnings.warn(f'The parameter of {module_name_2d} is not'\n 'loaded due to incompatible shapes. ')\n return\n # Inplanes may be different due to lateral connections\n new_channels = new_shape[1] - old_shape[1]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:158-180" + }, + "6659": { + "file_id": 504, + "content": "This function loads the 2D model's state dictionary into a 3D Conv module, concatenating conv2d parameters and adding zero paddings to match the new shape. The weight shape of the 2D model is retrieved, and if the number of input channels in the 3D model is different from the 2D model, it will raise a warning or return without loading the parameters due to incompatible shapes.", + "type": "comment" + }, + "6660": { + "file_id": 504, + "content": " pad_shape = old_shape\n pad_shape = pad_shape[:1] + (new_channels, ) + pad_shape[2:]\n # Expand parameters by concat extra channels\n conv2d_weight = paddle.concat(\n (conv2d_weight, paddle.zeros(pad_shape)), axis=1)\n new_weight = conv2d_weight.data.unsqueeze(2).expand_as(\n conv3d.weight) / kernel_t\n conv3d.weight.data.copy_(new_weight)\n inflated_param_names.append(weight_2d_name)\n if getattr(conv3d, 'bias') is not None:\n bias_2d_name = module_name_2d + '.bias'\n conv3d.bias.data.copy_(state_dict_2d[bias_2d_name])\n inflated_param_names.append(bias_2d_name)\nif __name__ == '__main__':\n net = ResNet3dSlowOnly(\n depth=50,\n in_channels=17,\n base_channels=32,\n conv1_kernel=(1, 7, 7),\n num_stages=3,\n out_indices=[2],\n stage_blocks=[3, 4, 6],\n conv1_stride_s=1,\n pool1_stride_s=1,\n inflate=[0, 1, 1],\n with_pool2=False,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:181-210" + }, + "6661": { + "file_id": 504, + "content": "The code inflates a 2D convolutional layer into a 3D convolutional layer by padding the weights and copying the bias. 
It does this for all layers specified in the ResNet3dSlowOnly architecture, with specified parameters.", + "type": "comment" + }, + "6662": { + "file_id": 504, + "content": " spatial_strides=[2, 2, 2],\n temporal_strides=[1, 1, 2],\n dilations=[1, 1, 1])\n pass", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:211-214" + }, + "6663": { + "file_id": 504, + "content": "This code sets the spatial_strides, temporal_strides, and dilations for a 3D Resnet layer, with spatial strides of [2, 2, 2], temporal strides of [1, 1, 2], and dilations of [1, 1, 1].", + "type": "comment" + }, + "6664": { + "file_id": 505, + "content": "/paddlevideo/modeling/backbones/resnet_slowfast.py", + "type": "filepath" + }, + "6665": { + "file_id": 505, + "content": "The code defines ResNetSlowFast and SlowFast models for video recognition and computer vision tasks, respectively, with separate pathways for slow and fast processing using 3D convolutional layers and multi-pathway models.", + "type": "summary" + }, + "6666": { + "file_id": 505, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import KaimingNormal\nfrom ..registry import BACKBONES\nfrom paddlevideo.utils.multigrid import get_norm\nimport sys\nimport numpy as np\nimport paddle.distributed as dist\n# seed random seed\npaddle.framework.seed(0)\n# get init parameters for conv layer\ndef get_conv_init(fan_out):\n return KaimingNormal(fan_in=fan_out)\ndef get_bn_param_attr(bn_weight=1.0, coeff=0.0):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:1-33" + }, + "6667": { + "file_id": 505, + "content": "Copyright notice and license information for the code. Imports necessary modules, defines function to get convolutional layer initialization parameters, and a function to set batch normalization layer parameters. 
No actual model or functionality defined yet.", + "type": "comment" + }, + "6668": { + "file_id": 505, + "content": " param_attr = paddle.ParamAttr(\n initializer=paddle.nn.initializer.Constant(bn_weight),\n regularizer=paddle.regularizer.L2Decay(coeff))\n return param_attr\n\"\"\"Video models.\"\"\"\nclass BottleneckTransform(paddle.nn.Layer):\n \"\"\"\n Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of\n temporal kernel.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:34-66" + }, + "6669": { + "file_id": 505, + "content": "This code defines a BottleneckTransform class in PaddleVideo for video models. It performs Tx1x1, 1x3x3, 1x1x1 transformations with variable temporal kernel sizes. The constructor takes in arguments like dim_in, dim_out, temp_kernel_size, stride, and more to configure the transformation layer.", + "type": "comment" + }, + "6670": { + "file_id": 505, + "content": " dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): if True, calculate the relu on the original\n input without allocating new memory.\n eps (float): epsilon for batch norm.\n dilation (int): size of dilation.\n \"\"\"\n super(BottleneckTransform, self).__init__()\n self.temp_kernel_size = temp_kernel_size\n self._inplace_relu = inplace_relu\n self._eps = eps\n self._stride_1x1 = stride_1x1\n self.norm_module = norm_module\n self._construct(dim_in, dim_out, stride, dim_inner, num_groups,\n dilation)\n def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:67-87" + }, + "6671": { + "file_id": 505, + "content": "This code defines a class called BottleneckTransform with parameters such as dim_in, dim_out, stride, dim_inner, num_groups, and dilation. It also has attributes like _inplace_relu, _eps, and norm_module for various operations and settings. 
The _construct method is used to initialize the class with these parameters.", + "type": "comment" + }, + "6672": { + "file_id": 505, + "content": " dilation):\n str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride)\n fan = (dim_inner) * (self.temp_kernel_size * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.a = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_inner,\n kernel_size=[self.temp_kernel_size, 1, 1],\n stride=[1, str1x1, str1x1],\n padding=[int(self.temp_kernel_size // 2), 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.a_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x3x3, BN, ReLU.\n fan = (dim_inner) * (1 * 3 * 3)\n initializer_tmp = get_conv_init(fan)\n self.b = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_inner,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:88-113" + }, + "6673": { + "file_id": 505, + "content": "Defines a Conv3D layer for the ResNet_SlowFast backbone, with specified dimensions and stride. Initializes Conv3D weights using get_conv_init function and includes batch normalization (BN) and ReLU activation layers.", + "type": "comment" + }, + "6674": { + "file_id": 505, + "content": " kernel_size=[1, 3, 3],\n stride=[1, str3x3, str3x3],\n padding=[0, dilation, dilation],\n groups=num_groups,\n dilation=[1, dilation, dilation],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.b_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x1x1, BN.\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.c = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_out,\n kernel_size=[1, 1, 1],\n stride=[1, 1, 1],\n padding=[0, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.c_bn = self.norm_module(\n num_features=dim_out,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:114-139" + }, + "6675": { + "file_id": 505, + "content": "This code defines a 3D convolutional layer with specific kernel sizes, strides, padding, and grouping. It also includes batch normalization layers for the intermediate and output features. The initializer functions are used to set the weights of each layer, with different initializers for different layers.", + "type": "comment" + }, + "6676": { + "file_id": 505, + "content": " epsilon=self._eps,\n weight_attr=get_bn_param_attr(bn_weight=0.0),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n # Branch2a.\n x = self.a(x)\n x = self.a_bn(x)\n x = F.relu(x)\n # Branch2b.\n x = self.b(x)\n x = self.b_bn(x)\n x = F.relu(x)\n # Branch2c\n x = self.c(x)\n x = self.c_bn(x)\n return x\nclass ResBlock(paddle.nn.Layer):\n \"\"\"\n Residual block.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups=1,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n ResBlock class constructs redisual blocks. 
More details can be found in:\n Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.\n \"Deep residual learning for image recognition.\"", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:140-180" + }, + "6677": { + "file_id": 505, + "content": "The BottleneckTransform forward function applies convolution and batch normalization for branches 2a, 2b, and 2c, with ReLU activation after branches 2a and 2b. ResBlock is a layer implementing residual blocks with specified dimensions, stride, inner dimension, groups, dilation, and normalization method.", + "type": "comment" + }, + "6678": { + "file_id": 505, + "content": " https://arxiv.org/abs/1512.03385\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck.\n trans_func (string): transform function to be used to construct the\n bottleneck.\n dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): calculate the relu on the original input\n without allocating new memory.\n eps (float): epsilon for batch norm.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:181-198" + }, + "6679": { + "file_id": 505, + "content": "This code defines the arguments for constructing a ResNet bottleneck. It includes parameters for input and output channel dimensions, temporal kernel size, stride, transform function, inner dimension, number of groups for convolution, whether to apply stride to 1x1 or 3x3 conv, inplace_relu flag, and epsilon for batch normalization.", + "type": "comment" + }, + "6680": { + "file_id": 505, + "content": " dilation (int): size of dilation.\n \"\"\"\n super(ResBlock, self).__init__()\n self._inplace_relu = inplace_relu\n self._eps = eps\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n # Use skip connection with projection if dim or res change.\n if (dim_in != dim_out) or (stride != 1):\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.branch1 = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=1,\n stride=[1, stride, stride],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:199-237" + }, + "6681": { + "file_id": 505, + "content": "The code defines a ResBlock class, which is a residual block used in deep neural networks. It initializes the block with input and output dimensions, kernel size, stride, inner dimension, number of groups, and skip connection settings. 
The constructor method _construct creates a 3D convolution layer for the skip connection if there is a change in dimensions or stride.", + "type": "comment" + }, + "6682": { + "file_id": 505, + "content": " padding=0,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False,\n dilation=1)\n self.branch1_bn = self.norm_module(\n num_features=dim_out,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n self.branch2 = BottleneckTransform(dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation,\n norm_module=self.norm_module)\n def forward(self, x):\n if hasattr(self, \"branch1\"):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:238-260" + }, + "6683": { + "file_id": 505, + "content": "This code defines a ResNet SlowFast backbone model. It includes convolution layers, batch normalization layers, and BottleneckTransform modules. The forward function checks if the \"branch1\" attribute exists to handle different stages of the network.", + "type": "comment" + }, + "6684": { + "file_id": 505, + "content": " x1 = self.branch1(x)\n x1 = self.branch1_bn(x1)\n x2 = self.branch2(x)\n x = paddle.add(x=x1, y=x2)\n else:\n x2 = self.branch2(x)\n x = paddle.add(x=x, y=x2)\n x = F.relu(x)\n return x\nclass ResStage(paddle.nn.Layer):\n \"\"\"\n Stage of 3D ResNet. It expects to have one or more tensors as input for\n multi-pathway (SlowFast) cases. More details can be found here:\n Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n stride,\n temp_kernel_sizes,\n num_blocks,\n dim_inner,\n num_groups,\n num_block_temp_kernel,\n dilation,\n stride_1x1=False,\n inplace_relu=True,\n norm_module=paddle.nn.BatchNorm3D):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:261-294" + }, + "6685": { + "file_id": 505, + "content": "The code defines a ResNet stage for multi-pathway (SlowFast) cases in video recognition. It takes one or more tensors as input and applies branching to separate paths with different kernel sizes. The output is added together, passed through ReLU activation, and returned. This stage supports 1x1 stride option and uses BatchNorm3D for normalization.", + "type": "comment" + }, + "6686": { + "file_id": 505, + "content": " \"\"\"\n The `__init__` method of any subclass should also contain these arguments.\n ResStage builds p streams, where p can be greater or equal to one.\n Args:\n dim_in (list): list of p the channel dimensions of the input.\n Different channel dimensions control the input dimension of\n different pathways.\n dim_out (list): list of p the channel dimensions of the output.\n Different channel dimensions control the input dimension of\n different pathways.\n temp_kernel_sizes (list): list of the p temporal kernel sizes of the\n convolution in the bottleneck. Different temp_kernel_sizes\n control different pathway.\n stride (list): list of the p strides of the bottleneck. 
Different\n stride control different pathway.\n num_blocks (list): list of p numbers of blocks for each of the\n pathway.\n dim_inner (list): list of the p inner channel dimensions of the", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:295-312" + }, + "6687": { + "file_id": 505, + "content": "The ResStage class constructor takes several lists as arguments to build p streams of pathways, controlling input and output dimensions, temporal kernel sizes, strides, and block numbers for each pathway.", + "type": "comment" + }, + "6688": { + "file_id": 505, + "content": " input. Different channel dimensions control the input dimension\n of different pathways.\n num_groups (list): list of number of p groups for the convolution.\n num_groups=1 is for standard ResNet like networks, and\n num_groups>1 is for ResNeXt like networks.\n num_block_temp_kernel (list): extent the temp_kernel_sizes to\n num_block_temp_kernel blocks, then fill temporal kernel size\n of 1 for the rest of the layers.\n dilation (list): size of dilation for each pathway.\n \"\"\"\n super(ResStage, self).__init__()\n assert all((num_block_temp_kernel[i] <= num_blocks[i]\n for i in range(len(temp_kernel_sizes))))\n self.num_blocks = num_blocks\n self.temp_kernel_sizes = [\n (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] +\n [1] * (num_blocks[i] - num_block_temp_kernel[i])\n for i in range(len(temp_kernel_sizes))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:313-330" + }, + "6689": { + "file_id": 505, + "content": "This code defines a ResStage class for a residual block. It takes input dimensions and channel dimensions as parameters, and initializes the number of blocks and temporal kernel sizes based on these inputs. The code also ensures that the provided number of block temporary kernel sizes does not exceed the specified number of blocks.", + "type": "comment" + }, + "6690": { + "file_id": 505, + "content": " ]\n assert (len({\n len(dim_in),\n len(dim_out),\n len(temp_kernel_sizes),\n len(stride),\n len(num_blocks),\n len(dim_inner),\n len(num_groups),\n len(num_block_temp_kernel),\n }) == 1)\n self.num_pathways = len(self.num_blocks)\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n for pathway in range(self.num_pathways):\n for i in range(self.num_blocks[pathway]):\n res_block = ResBlock(\n dim_in[pathway] if i == 0 else dim_out[pathway],\n dim_out[pathway],\n self.temp_kernel_sizes[pathway][i],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:331-372" + }, + "6691": { + "file_id": 505, + "content": "The code initializes a ResNet SlowFast model by creating instances of blocks based on given parameters. It ensures that the input and output dimensions are correctly set, creates the desired number of pathways, and applies the specified norm module. 
The constructor then iterates over each pathway and block, creating ResBlock instances with appropriate sizes and configurations.", + "type": "comment" + }, + "6692": { + "file_id": 505, + "content": " stride[pathway] if i == 0 else 1,\n dim_inner[pathway],\n num_groups[pathway],\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation[pathway],\n norm_module=self.norm_module)\n self.add_sublayer(\"pathway{}_res{}\".format(pathway, i),\n res_block)\n def forward(self, inputs):\n output = []\n for pathway in range(self.num_pathways):\n x = inputs[pathway]\n for i in range(self.num_blocks[pathway]):\n m = getattr(self, \"pathway{}_res{}\".format(pathway, i))\n x = m(x)\n output.append(x)\n return output\nclass ResNetBasicStem(paddle.nn.Layer):\n \"\"\"\n ResNe(X)t 3D stem module.\n Performs spatiotemporal Convolution, BN, and Relu following by a\n spatiotemporal pooling.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:373-404" + }, + "6693": { + "file_id": 505, + "content": "This code defines a ResNet backbone with slow-fast pathways, which includes residual blocks and basic stem modules. The `forward` method processes inputs from each pathway and returns the outputs as a list. It uses getattr to access the correct residual block module for each iteration in each pathway.", + "type": "comment" + }, + "6694": { + "file_id": 505, + "content": " kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n super(ResNetBasicStem, self).__init__()\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)\n def _construct_stem(self, dim_in, dim_out):\n fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2])\n initializer_tmp = get_conv_init(fan)\n self._conv = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=self.kernel,\n stride=self.stride,\n padding=self.padding,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = self.norm_module(num_features=dim_out,\n epsilon=self.eps,\n weight_attr=get_bn_param_attr(),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:405-432" + }, + "6695": { + "file_id": 505, + "content": "This code defines a class for ResNet basic stem module with options to specify kernel, stride, padding, and batch normalization. It initializes the Conv3D layer and BatchNorm3D module based on the specified parameters. The constructor also calls the _construct_stem method to further initialize the Conv3D layer and BatchNorm3D module.", + "type": "comment" + }, + "6696": { + "file_id": 505, + "content": " bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x = self._conv(x)\n x = self._bn(x)\n x = F.relu(x)\n x = F.max_pool3d(x=x,\n kernel_size=[1, 3, 3],\n stride=[1, 2, 2],\n padding=[0, 1, 1],\n data_format=\"NCDHW\")\n return x\nclass VideoModelStem(paddle.nn.Layer):\n \"\"\"\n Video 3D stem module. 
Provides stem operations of Conv, BN, ReLU, MaxPool\n on input data tensor for slow and fast pathways.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (list): the list of channel dimensions of the inputs.\n dim_out (list): the output dimension of the convolution in the stem\n layer.\n kernel (list): the kernels' size of the convolutions in the stem", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:433-466" + }, + "6697": { + "file_id": 505, + "content": "This code defines a 3D stem module for video input. It consists of convolutional, batch normalization, ReLU, and max pooling operations applied to both slow and fast pathways. The dim_in, dim_out, kernel, stride, padding parameters are used to configure the specifics of these operations. Epsilon (eps) is a small value for numerical stability, and norm_module is the batch normalization module being used.", + "type": "comment" + }, + "6698": { + "file_id": 505, + "content": " layers. Temporal kernel size, height kernel size, width kernel\n size in order.\n stride (list): the stride sizes of the convolutions in the stem\n layer. Temporal kernel stride, height kernel size, width kernel\n size in order.\n padding (list): the paddings' sizes of the convolutions in the stem\n layer. Temporal padding size, height padding size, width padding\n size in order.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(VideoModelStem, self).__init__()\n assert (len({\n len(dim_in),\n len(dim_out),\n len(kernel),\n len(stride),\n len(padding),\n }) == 1), \"Input pathway dimensions are not consistent.\"\n self.num_pathways = len(dim_in)\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:467-492" + }, + "6699": { + "file_id": 505, + "content": "The code defines a VideoModelStem class with parameters for input and output dimensions, temporal kernel size, stride, padding, epsilon for batch norm, and the normalization module. It checks for consistent dimensions and initializes instance variables before calling a constructor method.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/67.json b/docs/data/67.json new file mode 100644 index 000000000..d61d9ac7d --- /dev/null +++ b/docs/data/67.json @@ -0,0 +1,550 @@ +{ + "6700": { + "file_id": 505, + "content": " def _construct_stem(self, dim_in, dim_out):\n for pathway in range(len(dim_in)):\n stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway],\n self.kernel[pathway], self.stride[pathway],\n self.padding[pathway], self.eps,\n self.norm_module)\n self.add_sublayer(\"pathway{}_stem\".format(pathway), stem)\n def forward(self, x):\n assert (len(x) == self.num_pathways\n ), \"Input tensor does not contain {} pathway\".format(\n self.num_pathways)\n for pathway in range(len(x)):\n m = getattr(self, \"pathway{}_stem\".format(pathway))\n x[pathway] = m(x[pathway])\n return x\nclass FuseFastToSlow(paddle.nn.Layer):\n \"\"\"\n Fuses the information from the Fast pathway to the Slow pathway. 
Given the\n tensors from Slow pathway and Fast pathway, fuse information from Fast to\n Slow, then return the fused tensors from Slow and Fast pathway in order.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:494-518" + }, + "6701": { + "file_id": 505, + "content": "This code defines a class that constructs a stem for each pathway in ResNet, then applies it to the input tensors. The FuseFastToSlow class fuses information from the Fast pathway to the Slow pathway and returns the fused tensors in order.", + "type": "comment" + }, + "6702": { + "file_id": 505, + "content": " \"\"\"\n def __init__(self,\n dim_in,\n fusion_conv_channel_ratio,\n fusion_kernel,\n alpha,\n fuse_bn_relu=1,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimension of the input.\n fusion_conv_channel_ratio (int): channel ratio for the convolution\n used to fuse from Fast pathway to Slow pathway.\n fusion_kernel (int): kernel size of the convolution used to fuse\n from Fast pathway to Slow pathway.\n alpha (int): the frame rate ratio between the Fast and Slow pathway.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(FuseFastToSlow, self).__init__()\n self.fuse_bn_relu = fuse_bn_relu\n fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self._conv_f2s = paddle.nn.Conv3D(\n in_channels=dim_in,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:519-544" + }, + "6703": { + "file_id": 505, + "content": "This function initializes the FuseFastToSlow class, which takes in dimensions, fusion parameters, and other options. It sets up a convolutional layer to fuse information from the Fast pathway to the Slow pathway. It uses a specified channel ratio and kernel size for the convolution operation. The epsilon parameter is used for batch normalization.", + "type": "comment" + }, + "6704": { + "file_id": 505, + "content": " out_channels=dim_in * fusion_conv_channel_ratio,\n kernel_size=[fusion_kernel, 1, 1],\n stride=[alpha, 1, 1],\n padding=[fusion_kernel // 2, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio,\n epsilon=eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x_s = x[0]\n x_f = x[1]\n fuse = self._conv_f2s(x_f)\n # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve.\n if self.fuse_bn_relu:\n fuse = self._bn(fuse)\n fuse = F.relu(fuse)\n x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None)\n return [x_s_fuse, x_f]\n@BACKBONES.register()\nclass ResNetSlowFast(paddle.nn.Layer):\n \"\"\"\n SlowFast model builder for SlowFast network.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:545-572" + }, + "6705": { + "file_id": 505, + "content": "This code defines a ResNetSlowFast model in PaddlePaddle, which is a variation of the SlowFast network. It includes a fusion convolution layer followed by batch normalization and ReLU activation if the fuse_bn_relu flag is set to True. The forward function performs concatenation of the input features and returns the result. 
This model is registered under BACKBONES for future use.", + "type": "comment" + }, + "6706": { + "file_id": 505, + "content": " Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(\n self,\n alpha,\n beta,\n bn_norm_type=\"batchnorm\",\n bn_num_splits=1,\n num_pathways=2,\n depth=50,\n num_groups=1,\n input_channel_num=[3, 3],\n width_per_group=64,\n fusion_conv_channel_ratio=2,\n fusion_kernel_sz=7, #5?\n pool_size_ratio=[[1, 1, 1], [1, 1, 1]],\n fuse_bn_relu = 1,\n spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]],\n use_pool_af_s2 = 1,\n ):\n \"\"\"\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n super(ResNetSlowFast, self).__init__()\n self.alpha = alpha #8\n self.beta = beta #8\n self.norm_module = get_norm(bn_norm_type, bn_num_splits)\n self.num_pathways = num_pathways\n self.depth = depth", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:574-607" + }, + "6707": { + "file_id": 505, + "content": "This code defines a class ResNetSlowFast, which is a variant of the ResNet architecture with slow and fast paths for video recognition. It takes various parameters such as alpha, beta, bn_norm_type, etc., to build the network. The class also includes methods for initializing the model with the given parameters.", + "type": "comment" + }, + "6708": { + "file_id": 505, + "content": " self.num_groups = num_groups\n self.input_channel_num = input_channel_num\n self.width_per_group = width_per_group\n self.fusion_conv_channel_ratio = fusion_conv_channel_ratio\n self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement\n self.pool_size_ratio = pool_size_ratio\n self.fuse_bn_relu = fuse_bn_relu\n self.spatial_strides = spatial_strides\n self.use_pool_af_s2 = use_pool_af_s2\n self._construct_network()\n def _construct_network(self):\n \"\"\"\n Builds a SlowFast model.\n The first pathway is the Slow pathway\n and the second pathway is the Fast pathway.\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n temp_kernel = [\n [[1], [5]], # conv1 temporal kernel for slow and fast pathway.\n [[1], [3]], # res2 temporal kernel for slow and fast pathway.\n [[1], [3]], # res3 temporal kernel for slow and fast pathway.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:608-632" + }, + "6709": { + "file_id": 505, + "content": "This code defines a SlowFast model for computer vision tasks. It takes in several parameters including the number of groups, input channel number, and others. 
The construct_network function builds the SlowFast model with separate pathways for Slow and Fast pathways using different temporal kernels.", + "type": "comment" + }, + "6710": { + "file_id": 505, + "content": " [[3], [3]], # res4 temporal kernel for slow and fast pathway.\n [[3], [3]],\n ] # res5 temporal kernel for slow and fast pathway.\n self.s1 = VideoModelStem(\n dim_in=self.input_channel_num,\n dim_out=[self.width_per_group, self.width_per_group // self.beta],\n kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]],\n stride=[[1, 2, 2]] * 2,\n padding=[\n [temp_kernel[0][0][0] // 2, 3, 3],\n [temp_kernel[0][1][0] // 2, 3, 3],\n ],\n norm_module=self.norm_module)\n self.s1_fuse = FuseFastToSlow(\n dim_in=self.width_per_group // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu)\n # ResNet backbone\n MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)}\n (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:633-657" + }, + "6711": { + "file_id": 505, + "content": "The code initializes a SlowFast ResNet backbone model. It defines temporal kernels for res4 and res5 pathways, creates a VideoModelStem layer with specific dimensions and parameters, and a FuseFastToSlow layer for fusion. The code also sets the model stage depth according to the chosen depth (50, 101, or 152).", + "type": "comment" + }, + "6712": { + "file_id": 505, + "content": " num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]]\n spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]]\n spatial_strides = self.spatial_strides\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]]\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment\n out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4\n dim_inner = self.width_per_group * self.num_groups #64\n self.s2 = ResStage(dim_in=[\n self.width_per_group + self.width_per_group // out_dim_ratio,\n self.width_per_group // self.beta,\n ],\n dim_out=[\n self.width_per_group * 4,\n self.width_per_group * 4 // self.beta,\n ],\n dim_inner=[dim_inner, dim_inner // self.beta],\n temp_kernel_sizes=temp_kernel[1],\n stride=spatial_strides[0],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:659-678" + }, + "6713": { + "file_id": 505, + "content": "This code defines a ResStage, which is a stage in the ResNet SlowFast model. It sets the dimensions and parameters for this stage including input and output widths, kernel sizes, and strides. 
The code also specifies temporary kernel sizes and spatial strides for this particular stage of the model.", + "type": "comment" + }, + "6714": { + "file_id": 505, + "content": " num_blocks=[d2] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[0],\n dilation=spatial_dilations[0],\n norm_module=self.norm_module)\n self.s2_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 4 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s3 = ResStage(\n dim_in=[\n self.width_per_group * 4 +\n self.width_per_group * 4 // out_dim_ratio,\n self.width_per_group * 4 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 8,\n self.width_per_group * 8 // self.beta,\n ],\n dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:679-704" + }, + "6715": { + "file_id": 505, + "content": "The code defines a resnet_slowfast model with two main components: s1 and s3. The s1 component consists of three branches, with the first two having 2x repeat_num_body blocks each, while the third has (2*repeat_num_body + 1) blocks. It also includes norm_module and spatial_dilations for the first branch. The s3 component contains a ResStage layer. The model uses parameters such as width_per_group, out_dim_ratio, dim_inner, repeat_num_body, alpha, beta, fusion_conv_channel_ratio, fusion_kernel_sz, norm_module, and fuse_bn_relu.", + "type": "comment" + }, + "6716": { + "file_id": 505, + "content": " temp_kernel_sizes=temp_kernel[2],\n stride=spatial_strides[1],\n num_blocks=[d3] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[1],\n dilation=spatial_dilations[1],\n norm_module=self.norm_module,\n )\n self.s3_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 8 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s4 = ResStage(\n dim_in=[\n self.width_per_group * 8 +\n self.width_per_group * 8 // out_dim_ratio,\n self.width_per_group * 8 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 16,\n self.width_per_group * 16 // self.beta,\n ],\n dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:705-733" + }, + "6717": { + "file_id": 505, + "content": "The code initializes and defines different layers for the ResNet SlowFast model. It includes operations such as creating convolutional layers, fusing fast to slow features, and defining a stage layer with specified input and output dimensions. 
The alpha, fusion_kernel_sz, out_dim_ratio, beta, dim_inner values are used to control the specifics of these operations.", + "type": "comment" + }, + "6718": { + "file_id": 505, + "content": " temp_kernel_sizes=temp_kernel[3],\n stride=spatial_strides[2],\n num_blocks=[d4] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[2],\n dilation=spatial_dilations[2],\n norm_module=self.norm_module,\n )\n self.s4_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 16 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s5 = ResStage(\n dim_in=[\n self.width_per_group * 16 +\n self.width_per_group * 16 // out_dim_ratio,\n self.width_per_group * 16 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 32,\n self.width_per_group * 32 // self.beta,\n ],\n dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:734-762" + }, + "6719": { + "file_id": 505, + "content": "The code defines the ResStage and FuseFastToSlow modules for a ResNet SlowFast model. It initializes these modules with specific dimensions and parameters, such as number of channels, fusion kernel size, alpha value, and dilation rates. These modules are used to extract features from the input and fuse them together for further processing in the network.", + "type": "comment" + }, + "6720": { + "file_id": 505, + "content": " temp_kernel_sizes=temp_kernel[4],\n stride=spatial_strides[3],\n num_blocks=[d5] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[3],\n dilation=spatial_dilations[3],\n norm_module=self.norm_module,\n )\n def init_weights(self):\n pass\n def forward(self, x):\n x = self.s1(x) #VideoModelStem\n x = self.s1_fuse(x) #FuseFastToSlow\n x = self.s2(x) #ResStage\n x = self.s2_fuse(x)\n # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve.\n if self.use_pool_af_s2:\n for pathway in range(self.num_pathways):\n x[pathway] = F.max_pool3d(x=x[pathway],\n kernel_size=self.pool_size_ratio[pathway],\n stride=self.pool_size_ratio[pathway],\n padding=[0, 0, 0],\n data_format=\"NCDHW\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:763-788" + }, + "6721": { + "file_id": 505, + "content": "This code defines a ResNet slowfast model with specified parameters, initializes the weights, and forwards the input data. It also includes an optional max-pooling operation for one of its stages (s2) depending on a flag value.", + "type": "comment" + }, + "6722": { + "file_id": 505, + "content": " x = self.s3(x)\n x = self.s3_fuse(x)\n x = self.s4(x)\n x = self.s4_fuse(x)\n x = self.s5(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast.py:790-795" + }, + "6723": { + "file_id": 505, + "content": "This code snippet represents the final segment of a neural network model. It processes input data (x) through four sequential layers (s3, s4, and s5), then fuses their outputs before returning the result as output. 
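To make the two-pathway data flow concrete, the sketch below shows the input layout the backbone's forward pass expects; the batch size, frame counts, and resolution are illustrative assumptions.

```python
# Minimal sketch (illustrative shapes): ResNetSlowFast.forward consumes a list of two
# clips, one per pathway, with the Fast pathway holding alpha times more frames.
import paddle

alpha, T = 8, 4
slow = paddle.randn([1, 3, T, 224, 224])          # Slow pathway: [N, C, T, H, W]
fast = paddle.randn([1, 3, alpha * T, 224, 224])  # Fast pathway: [N, C, alpha*T, H, W]
inputs = [slow, fast]                             # passed through s1 .. s5 in order
```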
Each layer is likely responsible for feature extraction or transformation at different levels of the network's architecture.", + "type": "comment" + }, + "6724": { + "file_id": 506, + "content": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py", + "type": "filepath" + }, + "6725": { + "file_id": 506, + "content": "The PaddleVideo library includes functions for convolutional layers, ResBlock classes, Slow and Fast branches, and a VideoModelStem class to initialize a ResNet SlowFast model with MRI for video analysis.", + "type": "summary" + }, + "6726": { + "file_id": 506, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import KaimingNormal\nfrom ..registry import BACKBONES\nfrom paddlevideo.utils.multigrid import get_norm\nimport sys\nimport numpy as np\nimport paddle.distributed as dist\n# seed random seed\npaddle.framework.seed(0)\n# get init parameters for conv layer\ndef get_conv_init(fan_out):\n return KaimingNormal(fan_in=fan_out)\ndef get_bn_param_attr(bn_weight=1.0, coeff=0.0):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:1-33" + }, + "6727": { + "file_id": 506, + "content": "This code is part of the PaddleVideo library, licensed under Apache 2.0. It imports necessary modules and defines functions for initializing convolutional layers with KaimingNormal distribution. It also includes a function for batch normalization parameters and registers backbones in PaddlePaddle's registry. The code uses paddle, numpy, and other libraries for various functionalities such as seeding random numbers for reproducibility.", + "type": "comment" + }, + "6728": { + "file_id": 506, + "content": " param_attr = paddle.ParamAttr(\n initializer=paddle.nn.initializer.Constant(bn_weight),\n regularizer=paddle.regularizer.L2Decay(coeff))\n return param_attr\n\"\"\"Video models.\"\"\"\nclass BottleneckTransform(paddle.nn.Layer):\n \"\"\"\n Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of\n temporal kernel.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:34-66" + }, + "6729": { + "file_id": 506, + "content": "This code defines a BottleneckTransform class, which is a layer for video models. It performs temporal convolutions with 1x1, 1x3x3, and 1x1x1 layers, where T is the size of the temporal kernel. 
The class takes various parameters such as dimension, stride, and kernel sizes.", + "type": "comment" + }, + "6730": { + "file_id": 506, + "content": " dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): if True, calculate the relu on the original\n input without allocating new memory.\n eps (float): epsilon for batch norm.\n dilation (int): size of dilation.\n \"\"\"\n super(BottleneckTransform, self).__init__()\n self.temp_kernel_size = temp_kernel_size\n self._inplace_relu = inplace_relu\n self._eps = eps\n self._stride_1x1 = stride_1x1\n self.norm_module = norm_module\n self._construct(dim_in, dim_out, stride, dim_inner, num_groups,\n dilation)\n def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:67-87" + }, + "6731": { + "file_id": 506, + "content": "The code defines a class 'BottleneckTransform' with parameters for dimensions, stride, inner dimension, number of groups, and other attributes. It inherits from another class and initializes its own instance variables before constructing the model structure. The constructor method takes in various arguments to configure the bottleneck transformation block.", + "type": "comment" + }, + "6732": { + "file_id": 506, + "content": " dilation):\n str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride)\n fan = (dim_inner) * (self.temp_kernel_size * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.a = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_inner,\n kernel_size=[self.temp_kernel_size, 1, 1],\n stride=[1, str1x1, str1x1],\n padding=[int(self.temp_kernel_size // 2), 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.a_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x3x3, BN, ReLU.\n fan = (dim_inner) * (1 * 3 * 3)\n initializer_tmp = get_conv_init(fan)\n self.b = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_inner,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:88-113" + }, + "6733": { + "file_id": 506, + "content": "This code initializes two 3D convolutional layers with Batch Normalization and ReLU activation. The first layer has stride 1 for all dimensions, while the second layer has stride 1 for the first dimension and a different value (determined by _stride_1x1) for the remaining two dimensions. 
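The stride routing described here can be summarized in a few lines; this is a standalone sketch mirroring the `stride_1x1` branch in the quoted source, not repository test code.

```python
# Mirrors the stride selection in BottleneckTransform: by default (stride_1x1=False)
# the spatial stride is applied by the 1x3x3 convolution, otherwise by the Tx1x1 conv.
def split_stride(stride, stride_1x1=False):
    str1x1, str3x3 = (stride, 1) if stride_1x1 else (1, stride)
    return str1x1, str3x3

assert split_stride(2) == (1, 2)                   # stride lands on the 3x3 conv
assert split_stride(2, stride_1x1=True) == (2, 1)  # stride moved to the 1x1 conv
```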
Both layers have specified kernel sizes, padding, and use custom initializers.", + "type": "comment" + }, + "6734": { + "file_id": 506, + "content": " kernel_size=[1, 3, 3],\n stride=[1, str3x3, str3x3],\n padding=[0, dilation, dilation],\n groups=num_groups,\n dilation=[1, dilation, dilation],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.b_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x1x1, BN.\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.c = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_out,\n kernel_size=[1, 1, 1],\n stride=[1, 1, 1],\n padding=[0, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.c_bn = self.norm_module(\n num_features=dim_out,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:114-139" + }, + "6735": { + "file_id": 506, + "content": "The code defines a Conv3D layer with 1x3x3 kernel, stride of str3x3, and dilation of dilation. It also includes a batch normalization (BN) module for the output. The BN module is applied to the output of the previous layer and has no bias. Finally, another Conv3D layer with 1x1x1 kernel and no BN is defined followed by another BN without a bias.", + "type": "comment" + }, + "6736": { + "file_id": 506, + "content": " epsilon=self._eps,\n weight_attr=get_bn_param_attr(bn_weight=0.0),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n # Branch2a.\n x = self.a(x)\n x = self.a_bn(x)\n x = F.relu(x)\n # Branch2b.\n x = self.b(x)\n x = self.b_bn(x)\n x = F.relu(x)\n # Branch2c\n x = self.c(x)\n x = self.c_bn(x)\n return x\nclass ResBlock(paddle.nn.Layer):\n \"\"\"\n Residual block.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups=1,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n ResBlock class constructs redisual blocks. More details can be found in:\n Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.\n \"Deep residual learning for image recognition.\"", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:140-180" + }, + "6737": { + "file_id": 506, + "content": "ResNetSlowFastMRI: A residual network model that adds Slow and Fast branches to extract temporal features. ResBlock is a residual block class used in the architecture, which utilizes BatchNorm3D for normalization and applies ReLU activations after each branch.", + "type": "comment" + }, + "6738": { + "file_id": 506, + "content": " https://arxiv.org/abs/1512.03385\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck.\n trans_func (string): transform function to be used to construct the\n bottleneck.\n dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. 
num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): calculate the relu on the original input\n without allocating new memory.\n eps (float): epsilon for batch norm.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:181-198" + }, + "6739": { + "file_id": 506, + "content": "This function defines a bottleneck for ResNet and ResNeXt-like networks with specified parameters. It takes channel dimensions, temporal kernel sizes, stride, transform function, inner dimension, number of groups, if applying stride to 1x1 conv, inplace relu calculation, and epsilon for batch norm as arguments.", + "type": "comment" + }, + "6740": { + "file_id": 506, + "content": " dilation (int): size of dilation.\n \"\"\"\n super(ResBlock, self).__init__()\n self._inplace_relu = inplace_relu\n self._eps = eps\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n # Use skip connection with projection if dim or res change.\n if (dim_in != dim_out) or (stride != 1):\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.branch1 = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=1,\n stride=[1, stride, stride],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:199-237" + }, + "6741": { + "file_id": 506, + "content": "This code defines a ResBlock class with skip connection, which performs convolution operations for image processing tasks. The constructor takes various parameters like dimensions, kernel size, stride, etc., and initializes the necessary components based on whether a skip connection is needed or not.", + "type": "comment" + }, + "6742": { + "file_id": 506, + "content": " padding=0,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False,\n dilation=1)\n self.branch1_bn = self.norm_module(\n num_features=dim_out,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n self.branch2 = BottleneckTransform(dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation,\n norm_module=self.norm_module)\n def forward(self, x):\n if hasattr(self, \"branch1\"):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:238-260" + }, + "6743": { + "file_id": 506, + "content": "This code defines a ResNet SlowFast MRI model with batch normalization (BN) for the branch1 and a BottleneckTransform layer for branch2. The forward function checks if \"branch1\" exists, suggesting it may be conditionally defined elsewhere in the codebase.", + "type": "comment" + }, + "6744": { + "file_id": 506, + "content": " x1 = self.branch1(x)\n x1 = self.branch1_bn(x1)\n x2 = self.branch2(x)\n x = paddle.add(x=x1, y=x2)\n else:\n x2 = self.branch2(x)\n x = paddle.add(x=x, y=x2)\n x = F.relu(x)\n return x\nclass ResStage(paddle.nn.Layer):\n \"\"\"\n Stage of 3D ResNet. It expects to have one or more tensors as input for\n multi-pathway (SlowFast) cases. 
More details can be found here:\n Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n stride,\n temp_kernel_sizes,\n num_blocks,\n dim_inner,\n num_groups,\n num_block_temp_kernel,\n dilation,\n stride_1x1=False,\n inplace_relu=True,\n norm_module=paddle.nn.BatchNorm3D):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:261-294" + }, + "6745": { + "file_id": 506, + "content": "This code defines a ResStage class for 3D ResNet, which can handle multi-pathway cases in SlowFast networks. It consists of one or more tensors as input. The stage includes operations such as adding tensors and applying ReLU activation. It also has a branch1 and branch2 for different paths, with batch normalization applied to the first path.", + "type": "comment" + }, + "6746": { + "file_id": 506, + "content": " \"\"\"\n The `__init__` method of any subclass should also contain these arguments.\n ResStage builds p streams, where p can be greater or equal to one.\n Args:\n dim_in (list): list of p the channel dimensions of the input.\n Different channel dimensions control the input dimension of\n different pathways.\n dim_out (list): list of p the channel dimensions of the output.\n Different channel dimensions control the input dimension of\n different pathways.\n temp_kernel_sizes (list): list of the p temporal kernel sizes of the\n convolution in the bottleneck. Different temp_kernel_sizes\n control different pathway.\n stride (list): list of the p strides of the bottleneck. Different\n stride control different pathway.\n num_blocks (list): list of p numbers of blocks for each of the\n pathway.\n dim_inner (list): list of the p inner channel dimensions of the", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:295-312" + }, + "6747": { + "file_id": 506, + "content": "The given code is the initialization method of a ResStage class in PaddleVideo. It accepts arguments such as dim_in, dim_out, temp_kernel_sizes, stride, and num_blocks to build p pathways with different channel dimensions, temporal kernel sizes, strides, and numbers of blocks for each pathway.", + "type": "comment" + }, + "6748": { + "file_id": 506, + "content": " input. Different channel dimensions control the input dimension\n of different pathways.\n num_groups (list): list of number of p groups for the convolution.\n num_groups=1 is for standard ResNet like networks, and\n num_groups>1 is for ResNeXt like networks.\n num_block_temp_kernel (list): extent the temp_kernel_sizes to\n num_block_temp_kernel blocks, then fill temporal kernel size\n of 1 for the rest of the layers.\n dilation (list): size of dilation for each pathway.\n \"\"\"\n super(ResStage, self).__init__()\n assert all((num_block_temp_kernel[i] <= num_blocks[i]\n for i in range(len(temp_kernel_sizes))))\n self.num_blocks = num_blocks\n self.temp_kernel_sizes = [\n (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] +\n [1] * (num_blocks[i] - num_block_temp_kernel[i])\n for i in range(len(temp_kernel_sizes))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:313-330" + }, + "6749": { + "file_id": 506, + "content": "This function initializes a ResStage object, which is a layer of a network. 
It takes in parameters such as the number of blocks and temporal kernel sizes for each pathway, and asserts that the number of block_temp_kernel does not exceed the number of blocks. The temp_kernel_sizes are extended to num_block_temp_kernel blocks with temporal kernel size 1 for the rest of the layers.", + "type": "comment" + }, + "6750": { + "file_id": 506, + "content": " ]\n assert (len({\n len(dim_in),\n len(dim_out),\n len(temp_kernel_sizes),\n len(stride),\n len(num_blocks),\n len(dim_inner),\n len(num_groups),\n len(num_block_temp_kernel),\n }) == 1)\n self.num_pathways = len(self.num_blocks)\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n for pathway in range(self.num_pathways):\n for i in range(self.num_blocks[pathway]):\n res_block = ResBlock(\n dim_in[pathway] if i == 0 else dim_out[pathway],\n dim_out[pathway],\n self.temp_kernel_sizes[pathway][i],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:331-372" + }, + "6751": { + "file_id": 506, + "content": "The code creates an instance of a backbone network with adjustable pathways and blocks. It checks the input parameters' length, assigns the number of pathways, initializes a norm module, and calls a private method to construct the network structure. The _construct method loops through each pathway and block, creating ResBlock instances.", + "type": "comment" + }, + "6752": { + "file_id": 506, + "content": " stride[pathway] if i == 0 else 1,\n dim_inner[pathway],\n num_groups[pathway],\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation[pathway],\n norm_module=self.norm_module)\n self.add_sublayer(\"pathway{}_res{}\".format(pathway, i),\n res_block)\n def forward(self, inputs):\n output = []\n for pathway in range(self.num_pathways):\n x = inputs[pathway]\n for i in range(self.num_blocks[pathway]):\n m = getattr(self, \"pathway{}_res{}\".format(pathway, i))\n x = m(x)\n output.append(x)\n return output\nclass ResNetBasicStem(paddle.nn.Layer):\n \"\"\"\n ResNe(X)t 3D stem module.\n Performs spatiotemporal Convolution, BN, and Relu following by a\n spatiotemporal pooling.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:373-404" + }, + "6753": { + "file_id": 506, + "content": "This code defines a ResNet backbone with SlowFast pathways, including stem module and residual blocks. It initializes the layers for each pathway and then defines a forward function to process inputs through the specified number of pathways and blocks. 
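A worked example of the temporal-kernel expansion quoted above in `ResStage.__init__` (the numbers below are illustrative, not taken from any config in the repository):

```python
# Reproduce the list arithmetic ResStage uses to pad per-block temporal kernel sizes
# with 1s once num_block_temp_kernel is exhausted.
temp_kernel_sizes = [[3]]          # one pathway, temporal kernel size 3
num_blocks = [6]                   # six residual blocks in this stage
num_block_temp_kernel = [4]        # only the first four keep the 3-kernel

expanded = [
    (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]]
    + [1] * (num_blocks[i] - num_block_temp_kernel[i])
    for i in range(len(temp_kernel_sizes))
]
print(expanded)                    # [[3, 3, 3, 3, 1, 1]]
```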
The ResNetBasicStem performs spatiotemporal convolution, batch normalization, and ReLU before pooling in the 3D stem module.", + "type": "comment" + }, + "6754": { + "file_id": 506, + "content": " kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n super(ResNetBasicStem, self).__init__()\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)\n def _construct_stem(self, dim_in, dim_out):\n fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2])\n initializer_tmp = get_conv_init(fan)\n self._conv = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=self.kernel,\n stride=self.stride,\n padding=self.padding,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = self.norm_module(num_features=dim_out,\n epsilon=self.eps,\n weight_attr=get_bn_param_attr(),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:405-432" + }, + "6755": { + "file_id": 506, + "content": "This code defines a ResNetBasicStem class that initializes the stem of a ResNet network. The constructor takes parameters for kernel size, stride, padding, epsilon value, and norm module. The _construct_stem method creates a 3D convolutional layer with specified dimensions and uses an appropriate initializer for its weights. A batch normalization layer is also created using the provided norm module and epsilon value.", + "type": "comment" + }, + "6756": { + "file_id": 506, + "content": " bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x = self._conv(x)\n x = self._bn(x)\n x = F.relu(x)\n x = F.max_pool3d(x=x,\n kernel_size=[1, 3, 3],\n stride=[1, 2, 2],\n padding=[0, 1, 1],\n data_format=\"NCDHW\")\n return x\nclass VideoModelStem(paddle.nn.Layer):\n \"\"\"\n Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool\n on input data tensor for slow and fast pathways.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (list): the list of channel dimensions of the inputs.\n dim_out (list): the output dimension of the convolution in the stem\n layer.\n kernel (list): the kernels' size of the convolutions in the stem", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:433-466" + }, + "6757": { + "file_id": 506, + "content": "This code defines a video stem module for slow and fast pathways, performing Conv, BN, ReLU, MaxPool operations on input data tensors. The function takes dim_in, dim_out, kernel, stride, padding, eps, and norm_module as arguments.", + "type": "comment" + }, + "6758": { + "file_id": 506, + "content": " layers. Temporal kernel size, height kernel size, width kernel\n size in order.\n stride (list): the stride sizes of the convolutions in the stem\n layer. Temporal kernel stride, height kernel size, width kernel\n size in order.\n padding (list): the paddings' sizes of the convolutions in the stem\n layer. 
Temporal padding size, height padding size, width padding\n size in order.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(VideoModelStem, self).__init__()\n assert (len({\n len(dim_in),\n len(dim_out),\n len(kernel),\n len(stride),\n len(padding),\n }) == 1), \"Input pathway dimensions are not consistent.\"\n self.num_pathways = len(dim_in)\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:467-492" + }, + "6759": { + "file_id": 506, + "content": "The code defines a class VideoModelStem with parameters for dimensions, kernel size, stride, padding, and epsilon for batch normalization. It checks for consistent input pathway dimensions and initializes instance variables before constructing the stem layer.", + "type": "comment" + }, + "6760": { + "file_id": 506, + "content": " def _construct_stem(self, dim_in, dim_out):\n for pathway in range(len(dim_in)):\n stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway],\n self.kernel[pathway], self.stride[pathway],\n self.padding[pathway], self.eps,\n self.norm_module)\n self.add_sublayer(\"pathway{}_stem\".format(pathway), stem)\n def forward(self, x):\n assert (len(x) == self.num_pathways\n ), \"Input tensor does not contain {} pathway\".format(\n self.num_pathways)\n for pathway in range(len(x)):\n m = getattr(self, \"pathway{}_stem\".format(pathway))\n x[pathway] = m(x[pathway])\n return x\nclass FuseFastToSlow(paddle.nn.Layer):\n \"\"\"\n Fuses the information from the Fast pathway to the Slow pathway. Given the\n tensors from Slow pathway and Fast pathway, fuse information from Fast to\n Slow, then return the fused tensors from Slow and Fast pathway in order.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:494-518" + }, + "6761": { + "file_id": 506, + "content": "This code defines a class that constructs and fuses two pathways (slow and fast) in a video processing model. It initializes the stem layers for each pathway and then fuses the information from the fast pathway to the slow pathway. 
The input tensor should contain the specified number of pathways, and the output is returned as tensors from both pathways in order.", + "type": "comment" + }, + "6762": { + "file_id": 506, + "content": " \"\"\"\n def __init__(self,\n dim_in,\n fusion_conv_channel_ratio,\n fusion_kernel,\n alpha,\n fuse_bn_relu=1,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimension of the input.\n fusion_conv_channel_ratio (int): channel ratio for the convolution\n used to fuse from Fast pathway to Slow pathway.\n fusion_kernel (int): kernel size of the convolution used to fuse\n from Fast pathway to Slow pathway.\n alpha (int): the frame rate ratio between the Fast and Slow pathway.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(FuseFastToSlow, self).__init__()\n self.fuse_bn_relu = fuse_bn_relu\n fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self._conv_f2s = paddle.nn.Conv3D(\n in_channels=dim_in,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:519-544" + }, + "6763": { + "file_id": 506, + "content": "The code is initializing a class called FuseFastToSlow with parameters for input channel dimension, fusion convolution channel ratio, fusion kernel size, and frame rate ratio. It sets the number of channels in the 3D Convolution layer for fusing frames from Fast to Slow pathways based on given ratios. The fusion operation is performed using a Conv3D layer initialized with the given fan value.", + "type": "comment" + }, + "6764": { + "file_id": 506, + "content": " out_channels=dim_in * fusion_conv_channel_ratio,\n kernel_size=[fusion_kernel, 1, 1],\n stride=[alpha, 1, 1],\n padding=[fusion_kernel // 2, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio,\n epsilon=eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x_s = x[0]\n x_f = x[1]\n fuse = self._conv_f2s(x_f)\n # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve.\n if self.fuse_bn_relu:\n fuse = self._bn(fuse)\n fuse = F.relu(fuse)\n x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None)\n return [x_s_fuse, x_f]\n@BACKBONES.register()\nclass ResNetSlowFast_MRI(paddle.nn.Layer):\n \"\"\"\n SlowFast model builder for SlowFast network.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:545-572" + }, + "6765": { + "file_id": 506, + "content": "This code defines a ResNetSlowFast_MRI model, which is a type of SlowFast network. It includes a fusion convolution layer and a batch normalization (BN) layer with optional ReLU activation after the fusion convolution. 
The forward method combines input x_s and x_f using concat before returning both inputs.", + "type": "comment" + }, + "6766": { + "file_id": 506, + "content": " Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(\n self,\n alpha,\n beta,\n bn_norm_type=\"batchnorm\",\n bn_num_splits=1,\n num_pathways=2,\n depth=50,\n num_groups=1,\n input_channel_num=[1, 1],\n width_per_group=64,\n fusion_conv_channel_ratio=2,\n fusion_kernel_sz=7, #5?\n pool_size_ratio=[[1, 1, 1], [1, 1, 1]],\n fuse_bn_relu=1,\n spatial_strides=[[1, 1], [2, 2], [2, 2], [2, 2]],\n use_pool_af_s2=1,\n ):\n \"\"\"\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n super(ResNetSlowFast_MRI, self).__init__()\n self.alpha = alpha #8\n self.beta = beta #8\n self.norm_module = get_norm(bn_norm_type, bn_num_splits)\n self.num_pathways = num_pathways\n self.depth = depth", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:574-607" + }, + "6767": { + "file_id": 506, + "content": "This code initializes a ResNetSlowFast_MRI model with specified parameters, including alpha and beta values for the network architecture. The class extends from an existing superclass and includes properties like the norm module type, number of pathways, depth, group numbers, input channel numbers, width per group, fusion convolution channel ratio, pool size ratios, whether to use a pooling average operation at spatial stride 2, and spatial strides. The class also initializes these specified attributes for the model configuration.", + "type": "comment" + }, + "6768": { + "file_id": 506, + "content": " self.num_groups = num_groups\n self.input_channel_num = input_channel_num\n self.width_per_group = width_per_group\n self.fusion_conv_channel_ratio = fusion_conv_channel_ratio\n self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement\n self.pool_size_ratio = pool_size_ratio\n self.fuse_bn_relu = fuse_bn_relu\n self.spatial_strides = spatial_strides\n self.use_pool_af_s2 = use_pool_af_s2\n self._construct_network()\n def _construct_network(self):\n \"\"\"\n Builds a SlowFast model.\n The first pathway is the Slow pathway\n and the second pathway is the Fast pathway.\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n temp_kernel = [\n [[1], [5]], # conv1 temporal kernel for slow and fast pathway.\n [[1], [3]], # res2 temporal kernel for slow and fast pathway.\n [[1], [3]], # res3 temporal kernel for slow and fast pathway.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:608-632" + }, + "6769": { + "file_id": 506, + "content": "The code defines a SlowFast model with separate slow and fast pathways. The constructor sets parameters like input channel number, group number, fusion convolution channel ratio, and more. It also includes functions to build the network structure. 
The temporal kernels for each layer of both pathways are defined within the code.", + "type": "comment" + }, + "6770": { + "file_id": 506, + "content": " [[3], [3]], # res4 temporal kernel for slow and fast pathway.\n [[3], [3]],\n ] # res5 temporal kernel for slow and fast pathway.\n self.s1 = VideoModelStem(\n dim_in=self.input_channel_num,\n dim_out=[self.width_per_group, self.width_per_group // self.beta],\n kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]],\n stride=[[1, 2, 2]] * 2,\n padding=[\n [temp_kernel[0][0][0] // 2, 3, 3],\n [temp_kernel[0][1][0] // 2, 3, 3],\n ],\n norm_module=self.norm_module)\n self.s1_fuse = FuseFastToSlow(\n dim_in=self.width_per_group // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu)\n # ResNet backbone\n MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)}\n (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:633-657" + }, + "6771": { + "file_id": 506, + "content": "This code defines a ResNet backbone for the MRI dataset. It sets the temporal kernels for res4 and res5 pathways, initializes the video model stem (s1) with specified dimensions and stride, adds a fuseFastToSlow module for fusion, and defines model stage depth based on the chosen depth of ResNet.", + "type": "comment" + }, + "6772": { + "file_id": 506, + "content": " num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]]\n spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]]\n spatial_strides = self.spatial_strides\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]]\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment\n out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4\n dim_inner = self.width_per_group * self.num_groups #64\n self.s2 = ResStage(dim_in=[\n self.width_per_group + self.width_per_group // out_dim_ratio,\n self.width_per_group // self.beta,\n ],\n dim_out=[\n self.width_per_group * 4,\n self.width_per_group * 4 // self.beta,\n ],\n dim_inner=[dim_inner, dim_inner // self.beta],\n temp_kernel_sizes=temp_kernel[1],\n stride=spatial_strides[0],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:659-678" + }, + "6773": { + "file_id": 506, + "content": "The code is defining the parameters for a ResStage layer in a residual network model. 
It sets the input and output dimensions, inner dimensions, temporal kernel sizes, and stride values based on previously defined values from the function.", + "type": "comment" + }, + "6774": { + "file_id": 506, + "content": " num_blocks=[d2] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[0],\n dilation=spatial_dilations[0],\n norm_module=self.norm_module)\n self.s2_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 4 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s3 = ResStage(\n dim_in=[\n self.width_per_group * 4 +\n self.width_per_group * 4 // out_dim_ratio,\n self.width_per_group * 4 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 8,\n self.width_per_group * 8 // self.beta,\n ],\n dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:679-704" + }, + "6775": { + "file_id": 506, + "content": "The code initializes a ResNet SlowFast model with multiple layers and parameters. It creates two branches (slow and fast) for the network, each with its own set of layers and parameters. The slow branch has 2x more blocks than the fast branch, and both branches have identical group numbers and dilation rates. The code also initializes a fusing layer that combines features from the fast and slow branches and a stage for the third level of the network.", + "type": "comment" + }, + "6776": { + "file_id": 506, + "content": " temp_kernel_sizes=temp_kernel[2],\n stride=spatial_strides[1],\n num_blocks=[d3] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[1],\n dilation=spatial_dilations[1],\n norm_module=self.norm_module,\n )\n self.s3_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 8 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s4 = ResStage(\n dim_in=[\n self.width_per_group * 8 +\n self.width_per_group * 8 // out_dim_ratio,\n self.width_per_group * 8 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 16,\n self.width_per_group * 16 // self.beta,\n ],\n dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:705-733" + }, + "6777": { + "file_id": 506, + "content": "This code is initializing a ResNet SlowFast model for MRI. It creates an instance of the class, sets parameters like kernel sizes, strides, block numbers, group numbers, dilation rates, and normalization module. 
The model consists of multiple stages, including s1, s2, s3_fuse, and s4, each with different dimensions, inner dimensions, and configurations.", + "type": "comment" + }, + "6778": { + "file_id": 506, + "content": " temp_kernel_sizes=temp_kernel[3],\n stride=spatial_strides[2],\n num_blocks=[d4] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[2],\n dilation=spatial_dilations[2],\n norm_module=self.norm_module,\n )\n self.s4_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 16 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s5 = ResStage(\n dim_in=[\n self.width_per_group * 16 +\n self.width_per_group * 16 // out_dim_ratio,\n self.width_per_group * 16 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 32,\n self.width_per_group * 32 // self.beta,\n ],\n dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:734-762" + }, + "6779": { + "file_id": 506, + "content": "This code defines a ResNet SlowFast model with MRI for video analysis. It includes creating layers for stage 4, fusing fast and slow features, and defining the stage 5 layer with specific dimensions for input, output, and inner dimensions. The model utilizes group width, out_dim_ratio, beta, and dim_inner parameters to control its behavior.", + "type": "comment" + }, + "6780": { + "file_id": 506, + "content": " temp_kernel_sizes=temp_kernel[4],\n stride=spatial_strides[3],\n num_blocks=[d5] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[3],\n dilation=spatial_dilations[3],\n norm_module=self.norm_module,\n )\n def init_weights(self):\n pass\n def forward(self, x):\n x = self.s1(x) #VideoModelStem\n x = self.s1_fuse(x) #FuseFastToSlow\n x = self.s2(x) #ResStage\n x = self.s2_fuse(x)\n # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve.\n if self.use_pool_af_s2:\n for pathway in range(self.num_pathways):\n x[pathway] = F.max_pool3d(\n x=x[pathway],\n kernel_size=self.pool_size_ratio[pathway],\n stride=self.pool_size_ratio[pathway],\n padding=[0, 0, 0],\n data_format=\"NCDHW\")\n x = self.s3(x)\n x = self.s3_fuse(x)\n x = self.s4(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:763-793" + }, + "6781": { + "file_id": 506, + "content": "This code initializes a 3D ResNet SlowFast model, sets the weights, and applies several stages of convolutions and fusions to process video input. The forward function sequentially passes the input through multiple stages, potentially applying max pooling for AVA if use_pool_af_s2 is True.", + "type": "comment" + }, + "6782": { + "file_id": 506, + "content": " x = self.s4_fuse(x)\n x = self.s5(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:794-796" + }, + "6783": { + "file_id": 506, + "content": "This code snippet is part of a ResNet SlowFast model. 
It fuses the outputs from previous stages (s4), passes them through another stage (s5) and returns the result as output.", + "type": "comment" + }, + "6784": { + "file_id": 507, + "content": "/paddlevideo/modeling/backbones/resnet_tsm.py", + "type": "filepath" + }, + "6785": { + "file_id": 507, + "content": "The code initializes a ResNet-TSM backbone with convolutional layers and bottleneck blocks, but may be deprecated and needs data preparation changes for better compatibility.", + "type": "summary" + }, + "6786": { + "file_id": 507, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:1-30" + }, + "6787": { + "file_id": 507, + "content": "This code snippet defines a ConvBNLayer class which combines a Conv2D and BatchNorm2D layer. It is imported from paddle.nn and will be used for creating convolutional neural network layers in the PaddlePaddle framework. The layer can be utilized to process image data or other types of spatial input data by applying convolution operations followed by batch normalization.", + "type": "comment" + }, + "6788": { + "file_id": 507, + "content": " in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None,\n data_format=\"NCHW\"):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:31-53" + }, + "6789": { + "file_id": 507, + "content": "This code defines a class called ConvBNLayer which inherits from an unspecified parent class. It takes in parameters such as number of input and output channels, kernel size, stride, groups, activation function, name, and data format for the Conv2D layer. The class initializes a Conv2D layer using these parameters and adds BatchNorm2D and ReLU layers after it. 
Weight and bias initialization values are named in the restore parameters, and they are explicitly declared in the init_weights method.", + "type": "comment" + }, + "6790": { + "file_id": 507, + "content": " stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False,\n data_format=data_format)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(\n out_channels,\n weight_attr=ParamAttr(name=bn_name + \"_scale\",\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(name=bn_name + \"_offset\",\n regularizer=L2Decay(0.0)),\n data_format=data_format)\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:54-84" + }, + "6791": { + "file_id": 507, + "content": "This code defines a BottleneckBlock class with a convolution layer, batch normalization, and optional activation function. The forward pass applies the convolution, batch normalization, and activation if present.", + "type": "comment" + }, + "6792": { + "file_id": 507, + "content": " in_channels,\n out_channels,\n stride,\n shortcut=True,\n num_seg=8,\n name=None,\n data_format=\"NCHW\"):\n super(BottleneckBlock, self).__init__()\n self.data_format = data_format\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\",\n data_format=data_format)\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\",\n data_format=data_format)\n self.conv2 = ConvBNLayer(in_channels=out_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:85-108" + }, + "6793": { + "file_id": 507, + "content": "This code defines a class called BottleneckBlock, which is a part of a neural network model. It contains several ConvBNLayer objects for processing input data, with different parameters such as in_channels, out_channels, kernel_size, stride, and act (activation function). The class also has an attribute for data_format and initializes the ConvBNLayer objects with specific names.", + "type": "comment" + }, + "6794": { + "file_id": 507, + "content": " out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\",\n data_format=data_format)\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\",\n data_format=data_format)\n self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n if paddle.is_compiled_with_custom_device('npu'):\n x = inputs\n seg_num = self.num_seg\n shift_ratio = 1.0 / self.num_seg\n shape = x.shape #[N*T, C, H, W]\n reshape_x = x.reshape(\n (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:109-134" + }, + "6795": { + "file_id": 507, + "content": "This code is initializing a ConvBNLayer and a shortcut connection for the TSM backbone. The layers have specific out_channels, kernel_size, stride, name, and data_format configurations. 
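The BatchNorm parameter-name convention used by the ConvBNLayer shown above can be sketched in two lines; the example layer names are illustrative.

```python
# Mirrors ConvBNLayer's naming rule: "conv1" maps to "bn_conv1", while any "res..." name
# has its "res" prefix replaced, e.g. "res2a_branch2a" maps to "bn2a_branch2a".
def bn_name_for(conv_name):
    return "bn_" + conv_name if conv_name == "conv1" else "bn" + conv_name[3:]

assert bn_name_for("conv1") == "bn_conv1"
assert bn_name_for("res2a_branch2a") == "bn2a_branch2a"
```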
If a shortcut is not provided, it initializes another ConvBNLayer. The forward function reshapes input to have a shape of [N, T, C, H, W] for further processing.", + "type": "comment" + }, + "6796": { + "file_id": 507, + "content": " pad_x = F.pad(reshape_x, [\n 0,\n 0,\n 1,\n 1,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n ]) #[N, T+2, C, H, W]\n c1 = int(shape[1] * shift_ratio)\n c2 = int(shape[1] * 2 * shift_ratio)\n slice1 = pad_x[:, :seg_num, :c1, :, :]\n slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :]\n slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :]\n concat_x = paddle.concat([slice1, slice2, slice3],\n axis=2) #[N, T, C, H, W]\n shifts = concat_x.reshape(shape)\n else:\n shifts = F.temporal_shift(inputs,\n self.num_seg,\n 1.0 / self.num_seg,\n data_format=self.data_format)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:135-164" + }, + "6797": { + "file_id": 507, + "content": "This code performs temporal shift operation on the input tensor. If a certain condition is met, it pads and concatenates slices of the input tensor before performing the reshape and temporal shift operations. The resulting output is then passed through several convolutional layers.", + "type": "comment" + }, + "6798": { + "file_id": 507, + "content": " short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None,\n data_format=\"NCHW\"):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(\n in_channels=in_channels,\n out_channels=out_channels,\n filter_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\",\n data_format=data_format,\n )\n self.conv1 = ConvBNLayer(\n in_channels=out_channels,\n out_channels=out_channels,\n filter_size=3,\n act=None,\n name=name + \"_branch2b\",\n data_format=data_format,\n )\n if not shortcut:\n self.short = ConvBNLayer(\n in_channels=in_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:165-202" + }, + "6799": { + "file_id": 507, + "content": "The code defines a BasicBlock class which is a residual block. It contains two 3x3 convolutional layers followed by BN and ReLU activations. If the shortcut connection is not used, it also includes an additional convolution layer for the shortcut path. The purpose of this residual block is to alleviate the problem of vanishing gradients in deeper networks.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/68.json b/docs/data/68.json new file mode 100644 index 000000000..3365cc890 --- /dev/null +++ b/docs/data/68.json @@ -0,0 +1,548 @@ +{ + "6800": { + "file_id": 507, + "content": " out_channels=out_channels,\n filter_size=1,\n stride=stride,\n name=name + \"_branch1\",\n data_format=data_format,\n )\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTSM(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. 
Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, data_format=\"NCHW\", pretrained=None):\n super(ResNetTSM, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n self.data_format = data_format\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:203-241" + }, + "6801": { + "file_id": 507, + "content": "The code defines a ResNet TSM backbone model with specified depth and data format. It consists of an initialization, a forward function for processing inputs, and the ability to be registered at BACKBONES. It also supports different layers like 18, 34, 50, 101, and 152.", + "type": "comment" + }, + "6802": { + "file_id": 507, + "content": " \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n self.conv = ConvBNLayer(in_channels=3,\n out_channels=64,\n kernel_size=7,\n stride=2,\n act=\"relu\",\n name=\"conv1\",\n data_format=self.data_format)\n self.pool2D_max = MaxPool2D(\n kernel_size=3,\n stride=2,\n padding=1,\n data_format=self.data_format,\n )\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:242-273" + }, + "6803": { + "file_id": 507, + "content": "This code initializes a ResNet-TSM backbone with different depth configurations based on the input layers. It includes a convolution layer, max pooling 2D layer, and a block list for deeper networks. The code checks if the layers are supported (18, 34, 50, 101, or 152) and assigns corresponding depth and number of channels accordingly.", + "type": "comment" + }, + "6804": { + "file_id": 507, + "content": " shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n conv_name,\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n name=conv_name,\n data_format=self.data_format))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:274-293" + }, + "6805": { + "file_id": 507, + "content": "Code creates bottleneck blocks for ResNet TSM architecture, varying the number of input channels based on layer index and configuration. 
It adds sublayers with specified parameters including number of segments and stride for each block.", + "type": "comment" + }, + "6806": { + "file_id": 507, + "content": " in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(\n in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name,\n data_format=self.data_format,\n ))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:294-316" + }, + "6807": { + "file_id": 507, + "content": "Code initializes a ResNet TSM model backbone, with block-specific in_channels and adds either bottleneck or basic blocks depending on the depth configuration. Init_weights function is also defined to initialize weights for the model.", + "type": "comment" + }, + "6808": { + "file_id": 507, + "content": " \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:317-332" + }, + "6809": { + "file_id": 507, + "content": "This code initializes parameters for a ResNet TSM backbone. If a pretrained loading path is provided, it loads the weights from that path; otherwise, it uses specific initialization functions for Conv2D and BatchNorm2d layers. No bias is used in Conv2D layers.", + "type": "comment" + }, + "6810": { + "file_id": 507, + "content": " weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: (deprecated design) Already merge axis 0(batches) and axis 1(clips) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n #NOTE: As paddlepaddle to_static method need a \"pure\" model to trim. It means from\n # 1. the phase of generating data[images, label] from dataloader\n # to\n # 2. last layer of a model, always is FC layer\n y = self.conv(inputs)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm.py:333-353" + }, + "6811": { + "file_id": 507, + "content": "This code defines a forward function for a backbone model. 
It uses convolution and pooling layers to extract features from input data. The comments indicate that this implementation may be deprecated, and the data preparation should be modified according to recognizer2d.py for better compatibility with paddlepaddle's to_static method.", + "type": "comment" + }, + "6812": { + "file_id": 508, + "content": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py", + "type": "filepath" + }, + "6813": { + "file_id": 508, + "content": "The code defines a ResNet-TSM model in PaddleVideo with Batch Normalization, Leaky ReLU activation, and optional shortcut connections for MRI applications, using ConvBNLayer and ResNetTSM_MRI classes.", + "type": "summary" + }, + "6814": { + "file_id": 508, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport math\nimport sys\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils.save_load import load_ckpt\nfrom paddle.regularizer import L2Decay\nclass ConvBNLayer(nn.Layer):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:1-32" + }, + "6815": { + "file_id": 508, + "content": "This code is importing necessary libraries and defining a class for a Convolutional Batch Normalization Layer. It also provides information about copyright, license, and contact details of the PaddlePaddle Authors.", + "type": "comment" + }, + "6816": { + "file_id": 508, + "content": " \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n is_tweaks_mode (bool): switch for tweaks. Default: False.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:33-58" + }, + "6817": { + "file_id": 508, + "content": "This code defines a ConvBNLayer class with various parameters such as in_channels, out_channels, kernel_size, stride, groups, is_tweaks_mode, act, and name. 
It inherits from the base class and initializes the layer's weights and biases using explicit declarations in the init_weights method.", + "type": "comment" + }, + "6818": { + "file_id": 508, + "content": " #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(\n out_channels,\n weight_attr=ParamAttr(name=bn_name + \"_scale\",", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:59-83" + }, + "6819": { + "file_id": 508, + "content": "This code snippet initializes ResNet-D with a 2x2 average pooling layer followed by a convolution operation. The pooling layer has a stride of 2 and is changed to 1 later in practice. The convolution uses the specified parameters such as in_channels, out_channels, kernel size, stride, padding, groups, and names for weights and batch normalization.", + "type": "comment" + }, + "6820": { + "file_id": 508, + "content": " regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + \"_offset\", regularizer=L2Decay(0.0)))\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n if_first=False,\n num_seg=8,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:84-112" + }, + "6821": { + "file_id": 508, + "content": "The code defines a class `ResNetTSM_MRI` which appears to be a backbone model for ResNet-TSM. It contains a forward function that applies pooling, convolution, batch normalization, and activation (if specified) to the inputs. The BottleneckBlock class is defined with options for stride, shortcut connection, number of segments, and name. 
It initializes a ConvBNLayer instance for the first branch, and another ConvBNLayer instance for the second branch.", + "type": "comment" + }, + "6822": { + "file_id": 508, + "content": " out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(\n in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=\n 1, #ResNet-D 2/2:add a 2×2 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n is_tweaks_mode=False if if_first else True,\n name=name + \"_branch1\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:113-134" + }, + "6823": { + "file_id": 508, + "content": "In this code, the function creates three ConvBNLayer instances: a \"branch2a\", \"branch2b\", and \"branch2c\". The \"branch2a\" instance is created with specified parameters. If shortcut is not set, an additional \"branch1\" instance (ConvBNLayer) is created with a 1x1 convolution layer and a stride of 1. This is explained to be useful in ResNet-D 2/2 configuration where a 2x2 average pooling layer with a stride of 2 is added before the convolution, which is later changed to 1 in practice.", + "type": "comment" + }, + "6824": { + "file_id": 508, + "content": " self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.leaky_relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:136-166" + }, + "6825": { + "file_id": 508, + "content": "The code defines a class for a ResNet-TSM backbone model, with the forward function applying temporal shifts and convolutions. 
The BasicBlock class is used for the basic building block of the network, with optional shortcut connections.", + "type": "comment" + }, + "6826": { + "file_id": 508, + "content": " self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n filter_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.leaky_relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTSM_MRI(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:167-200" + }, + "6827": { + "file_id": 508, + "content": "This code defines a ResNet TSM backbone model with Batch Normalization and Leaky ReLU activation. It includes a ConvBNLayer for the branch2b, and an optional shortcut connection depending on the input. The forward function performs convolution, adds the shortcut, applies leaky ReLU activation, and returns the result. The ResNetTSM_MRI class is registered with BACKBONES.", + "type": "comment" + }, + "6828": { + "file_id": 508, + "content": " pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, pretrained=None, in_channels=1):\n super(ResNetTSM_MRI, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n self.in_channels = in_channels\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n #ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n self.conv1_1 = ConvBNLayer(in_channels=self.in_channels,\n out_channels=32,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:201-229" + }, + "6829": { + "file_id": 508, + "content": "This code defines a ResNetTSM_MRI class with parameters for depth, num_seg, pretrained (default None), and in_channels. It checks if the input layer is supported, sets the depth based on the input layer, sets out channels, and initializes ConvBNLayer instances accordingly.", + "type": "comment" + }, + "6830": { + "file_id": 508, + "content": " kernel_size=3,\n stride=2,\n act='leaky_relu',\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='leaky_relu',\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='leaky_relu',\n name=\"conv1_3\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:230-252" + }, + "6831": { + "file_id": 508, + "content": "This code defines the ResNet-TSM backbone architecture in PaddleVideo. 
It includes multiple ConvBNLayer instances for different stages of feature extraction and a MaxPool2D layer for downsampling. The depth of each block is specified by the provided depth list, with shortcut connections determined based on the number of layers specified.", + "type": "comment" + }, + "6832": { + "file_id": 508, + "content": " if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' %\n (block, i), #same with PaddleClas, for loading pretrain\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:253-271" + }, + "6833": { + "file_id": 508, + "content": "The code dynamically assigns a name to the BottleneckBlock based on its block and layer indices. If the layers are 101 or 152 at block 2, it further distinguishes between convolutions 'a' and 'b'. The 'bb_%d_%d' naming is used for loading pre-trained models. The BottleneckBlock parameters include in_channels based on if i == 0 or not, out_channels of the block, stride depending on if it's the first layer or not, num_seg for segmentation, shortcut type, and a flag for if it's the first layer.", + "type": "comment" + }, + "6834": { + "file_id": 508, + "content": " in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:272-294" + }, + "6835": { + "file_id": 508, + "content": "This code defines a ResNet TSM backbone with multiple blocks and basic blocks. It dynamically creates convolutional layers using the add_sublayer function. The number of blocks and their configuration is defined by the input parameters \"depth\". Shortcuts are used between layers, and the function init_weights initializes the parameters of the backbone.", + "type": "comment" + }, + "6836": { + "file_id": 508, + "content": " 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! 
check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:295-311" + }, + "6837": { + "file_id": 508, + "content": "This code initializes the backbone of a neural network. If a pretrained path is not specified, it uses specific initialization methods for Conv2D and BatchNorm2d layers. The KaimingNormal function initializes the Conv2D layer, while the Constant function with value 1 initializes the BatchNorm2d layer.", + "type": "comment" + }, + "6838": { + "file_id": 508, + "content": " \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:313-327" + }, + "6839": { + "file_id": 508, + "content": "Reshaping and applying convolutional layers, max pooling, and iterating through a list of blocks to perform operations on the input feature map.", + "type": "comment" + }, + "6840": { + "file_id": 509, + "content": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py", + "type": "filepath" + }, + "6841": { + "file_id": 509, + "content": "This code imports libraries, defines a ResNet-TSN model with basic and bottleneck blocks in PaddlePaddle, initializes weights for training, and outputs results.", + "type": "summary" + }, + "6842": { + "file_id": 509, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn import Conv2D, BatchNorm\nfrom paddle.nn import MaxPool2D, AvgPool2D\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:1-29" + }, + "6843": { + "file_id": 509, + "content": "This code is for importing necessary libraries, defining a ResNet-TSN backbone model in PaddlePaddle, and registering it to the BACKBONES registry. It also includes license information and mentions function-level future imports for compatibility and division settings. The code initializes parameters, defines Conv2D, BatchNorm, MaxPool2D, AvgPool2D layers, and sets up weight initialization functions and loading checkpoints utilities.", + "type": "comment" + }, + "6844": { + "file_id": 509, + "content": "__all__ = [\"ResNetTSN_MRI\"]\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n lr_mult=1.0,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\",\n learning_rate=lr_mult),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:31-58" + }, + "6845": { + "file_id": 509, + "content": "Defines a ConvBNLayer class with an average pooling layer and convolutional layer. The class takes input parameters for channels, kernel size, stride, groups, and more. 
It initializes the layers and sets is_tweaks_mode flag.", + "type": "comment" + }, + "6846": { + "file_id": 509, + "content": " bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._batch_norm = BatchNorm(\n out_channels,\n act=act,\n param_attr=ParamAttr(name=bn_name + '_scale',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + '_offset',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n moving_mean_name=bn_name + '_mean',\n moving_variance_name=bn_name + '_variance')\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:59-89" + }, + "6847": { + "file_id": 509, + "content": "This code defines a class for Resnet_TSN, which is a type of backbone model. It includes an initialization function that initializes the BatchNorm layer and a forward function that applies pooling (if in tweaks mode), convolution, and batch normalization to inputs. Additionally, there is a BottleneckBlock class defined for creating bottleneck blocks within the network.", + "type": "comment" + }, + "6848": { + "file_id": 509, + "content": " if_first=False,\n lr_mult=1.0,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:90-111" + }, + "6849": { + "file_id": 509, + "content": "The code defines a BottleneckBlock class, which is a layer in the ResNet model. It consists of three ConvBNLayer layers with different properties such as kernel size, stride, and activation functions. The class initializes these layers and takes input and output channel counts, learning rate multiplier, and name as parameters.", + "type": "comment" + }, + "6850": { + "file_id": 509, + "content": " lr_mult=lr_mult,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n y = F.relu(y)\n return y\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:112-144" + }, + "6851": { + "file_id": 509, + "content": "The code defines a ResNet TSN backbone with two branches, where the first branch contains convolutional layers and the second branch has a shortcut connection. 
The forward function performs addition between the shortcut connection and the output of the convolutional layers. The BasicBlock class is a subclass for implementing basic building blocks.", + "type": "comment" + }, + "6852": { + "file_id": 509, + "content": " shortcut=True,\n if_first=False,\n lr_mult=1.0,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:145-167" + }, + "6853": { + "file_id": 509, + "content": "This code defines a BasicBlock class in PaddleVideo for ResNet TSN MRI model. It has an input, output channels, and stride. The class initializes convolution layers (conv0 and conv1) with specified parameters. If shortcut is not set, it also includes a ConvBNLayer as the 'short' attribute.", + "type": "comment" + }, + "6854": { + "file_id": 509, + "content": " kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTSN_MRI(nn.Layer):\n \"\"\"ResNetTweaksTSN backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self,\n layers=50,\n pretrained=None,\n lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],\n in_channels=1):\n super(ResNetTSN_MRI, self).__init__()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:168-202" + }, + "6855": { + "file_id": 509, + "content": "This code defines a ResNetTSN_MRI backbone with specified depth, pretrained model option, and learning rate multipliers for each layer. The forward function performs convolutions and shortcut connections, applying ReLU activation at the end. 
This backbone is registered in BACKBONES for use in the PaddleVideo library.", + "type": "comment" + }, + "6856": { + "file_id": 509, + "content": " self.pretrained = pretrained\n self.layers = layers\n supported_layers = [18, 34, 50, 101, 152, 200]\n assert layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, layers)\n self.lr_mult_list = lr_mult_list\n self.in_channels = in_channels\n assert isinstance(\n self.lr_mult_list,\n (list, tuple\n )), \"lr_mult_list should be in (list, tuple) but got {}\".format(\n type(self.lr_mult_list))\n assert len(\n self.lr_mult_list\n ) == 5, \"lr_mult_list length should should be 5 but got {}\".format(\n len(self.lr_mult_list))\n if layers == 18:\n depth = [2, 2, 2, 2]\n elif layers == 34 or layers == 50:\n depth = [3, 4, 6, 3]\n elif layers == 101:\n depth = [3, 4, 23, 3]\n elif layers == 152:\n depth = [3, 8, 36, 3]\n elif layers == 200:\n depth = [3, 12, 48, 3]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:204-232" + }, + "6857": { + "file_id": 509, + "content": "This code initializes a ResNet TSN backbone model with specified layers, in_channels, and pretrained weight option. It supports specific layer options (18, 34, 50, 101, 152, 200) and checks if the input layer is within supported range. The code also ensures lr_mult_list is a list or tuple and has a length of 5. Depending on the layers, it assigns depth values for each block in the model.", + "type": "comment" + }, + "6858": { + "file_id": 509, + "content": " num_channels = [64, 256, 512, 1024\n ] if layers >= 50 else [64, 64, 128, 256]\n num_filters = [64, 128, 256, 512]\n self.conv1_1 = ConvBNLayer(in_channels=self.in_channels,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:233-253" + }, + "6859": { + "file_id": 509, + "content": "This code defines a ResNet model for Temporal Segment Networks (TSN) with multiple branch inputs. It initializes the layers of the network, including convolutional and batch normalization operations. The number of channels and filters used in each layer depend on the total number of layers specified. 
Different learning rate multipliers are assigned to each layer for efficient training.", + "type": "comment" + }, + "6860": { + "file_id": 509, + "content": " stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_3\")\n self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if layers in [101, 152, 200] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BottleneckBlock(\n in_channels=num_channels[block]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:254-275" + }, + "6861": { + "file_id": 509, + "content": "Initializing layers of ResNet-TSN with specified depth, creating bottleneck blocks for each layer. If layers are 101, 152 or 200 and block is 2, specific naming convention applied. BottleneckBlock is added as sublayer in a sequential manner.", + "type": "comment" + }, + "6862": { + "file_id": 509, + "content": " if i == 0 else num_filters[block] * 4,\n out_channels=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n if_first=block == i == 0,\n lr_mult=self.lr_mult_list[block + 1],\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BasicBlock(in_channels=num_channels[block]\n if i == 0 else num_filters[block],\n out_channels=num_filters[block],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:276-294" + }, + "6863": { + "file_id": 509, + "content": "The code creates a ResNet TSN model with bottleneck and basic blocks. It initializes the block_list by adding each block, sets shortcut to True for the first block of each stage, and appends each block to block_list. The number of filters, out_channels, stride, and other parameters are determined based on the stage and block indexes. The name of each block is also specified according to its position in the network.", + "type": "comment" + }, + "6864": { + "file_id": 509, + "content": " stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name,\n lr_mult=self.lr_mult_list[block + 1]))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be\n initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n # XXX: check bias!!! check pretrained!!!", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:295-311" + }, + "6865": { + "file_id": 509, + "content": "This code initializes the weights of a ResNet TSN backbone model. It creates blocks with specified parameters and appends them to the block list. 
The `init_weights` function initializes the parameters based on whether pretrained loading path is indicated or not, following specific initialization functions for Conv2D and BatchNorm2d layers.", + "type": "comment" + }, + "6866": { + "file_id": 509, + "content": " if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n # XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2d_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:313-331" + }, + "6867": { + "file_id": 509, + "content": "This code is checking if the pretrained model path is provided and initializing weights for Conv2D and BatchNorm2D layers if not. The forward function performs convolutions, max pooling, and processes through blocks to output a result.", + "type": "comment" + }, + "6868": { + "file_id": 510, + "content": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", + "type": "filepath" + }, + "6869": { + "file_id": 510, + "content": "The code defines a TSM ResNet backbone class for feature extraction in Temporal Segment Networks, with customizable depth, segments, and pretrained options. It applies temporal shift modules and convolutions across various ResNet models (18-152 layers).", + "type": "summary" + }, + "6870": { + "file_id": 510, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport math\nimport sys\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils.save_load import load_ckpt\n# Download URL of pretrained model", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:1-31" + }, + "6871": { + "file_id": 510, + "content": "This code imports necessary libraries, defines a class for a TSM ResNet backbone, and includes functions for loading pre-trained models. 
The code also contains a license notice and copyright information.", + "type": "comment" + }, + "6872": { + "file_id": 510, + "content": "# {\n# \"ResNet50_vd\":\n# \"wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\",\n# \"ResNet101_vd\":\n# \"https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams\",\n# \"ResNet18_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams\",\n# \"ResNet34_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet34_vd_ssld_pretrained.pdparams\",\n# \"ResNet152_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams\",\n# \"ResNet200_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams\",\n# }\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:32-54" + }, + "6873": { + "file_id": 510, + "content": "This code defines a class called \"ConvBNLayer\" that combines Conv2D and BatchNorm2D layers, taking input and output channel counts, kernel size, and stride as arguments. It also includes a dictionary of pre-trained model URLs for ResNet variations.", + "type": "comment" + }, + "6874": { + "file_id": 510, + "content": " groups (int): Groups in the Conv2D, Default: 1.\n is_tweaks_mode (bool): switch for tweaks. Default: False.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:55-78" + }, + "6875": { + "file_id": 510, + "content": "The code defines a class \"ConvBNLayer\" with parameters such as in_channels, out_channels, kernel_size, stride, groups, is_tweaks_mode, act, and name. 
It also adds an average pooling layer before the convolution for ResNet-D 1/2 tweak, which works well in practice.", + "type": "comment" + }, + "6876": { + "file_id": 510, + "content": " out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(\n out_channels,\n weight_attr=ParamAttr(name=bn_name + \"_scale\",\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + \"_offset\", regularizer=L2Decay(0.0)))\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:79-105" + }, + "6877": { + "file_id": 510, + "content": "This code defines a Convolutional Neural Network (CNN) layer with Batch Normalization and optionally activation function. The layer can have tweaks mode for pooling and average pooling operations. It also includes a forward method for passing inputs through the defined layers.", + "type": "comment" + }, + "6878": { + "file_id": 510, + "content": " return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n if_first=False,\n num_seg=8,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:106-132" + }, + "6879": { + "file_id": 510, + "content": "The code defines a BottleneckBlock class with in_channels, out_channels, stride, shortcut (optional), if_first (boolean), and num_seg as parameters. It initializes ConvBNLayer objects for conv0, conv1, and conv2 layers. The BottleneckBlock is a part of the ResNet architecture with tweaks and TSM.", + "type": "comment" + }, + "6880": { + "file_id": 510, + "content": " kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n if paddle.is_compiled_with_custom_device('npu'):\n x = inputs\n seg_num = self.num_seg\n shift_ratio = 1.0 / self.num_seg\n shape = x.shape #[N*T, C, H, W]\n reshape_x = x.reshape(\n (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W]\n pad_x = F.pad(reshape_x, [\n 0,\n 0,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:133-159" + }, + "6881": { + "file_id": 510, + "content": "This code defines a class with an initializer and a forward method. The initializer sets the number of segments (num_seg) and whether to use shortcut connection. 
The forward method reshapes input, pads it based on segment numbers, and likely performs some computations for Temporal Segment Networks.", + "type": "comment" + }, + "6882": { + "file_id": 510, + "content": " 1,\n 1,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n ]) #[N, T+2, C, H, W]\n c1 = int(shape[1] * shift_ratio)\n c2 = int(shape[1] * 2 * shift_ratio)\n slice1 = pad_x[:, :seg_num, :c1, :, :]\n slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :]\n slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :]\n concat_x = paddle.concat([slice1, slice2, slice3],\n axis=2) #[N, T, C, H, W]\n shifts = concat_x.reshape(shape)\n else:\n shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.leaky_relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:160-192" + }, + "6883": { + "file_id": 510, + "content": "This code defines a function and a class, both part of a ResNet backbone model. The function takes in an input tensor and applies temporal shifts, convolutions, and shortcut connections to form the output. The BasicBlock class initializes a basic block layer with convolutional layers and a shortcut connection.", + "type": "comment" + }, + "6884": { + "file_id": 510, + "content": " in_channels,\n out_channels,\n stride,\n shortcut=True,\n num_seg=8,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.num_seg = num_seg\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:193-217" + }, + "6885": { + "file_id": 510, + "content": "This code defines a BasicBlock class for ResNet TSM model, with parameters including input and output channels, stride, shortcut flag, and number of segments. It initializes instance variables and creates convolution layers (conv0, conv1) for feature extraction. If not using shortcut connections, it also initializes a short layer for residual connections.", + "type": "comment" + }, + "6886": { + "file_id": 510, + "content": " stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n # add temporal shift module\n shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.leaky_relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTweaksTSM(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. 
Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, pretrained=None):\n super(ResNetTweaksTSM, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:218-253" + }, + "6887": { + "file_id": 510, + "content": "The code defines a ResNet TSM backbone model. It has an init function that initializes the model with specified depth, number of segments, and pretrained options. The forward function applies temporal shift module, convolution, shortcut connection if applicable, and Leaky ReLU activation for feature extraction.", + "type": "comment" + }, + "6888": { + "file_id": 510, + "content": " \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n #ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n self.conv1_1 = ConvBNLayer(in_channels=3,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='leaky_relu',\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:254-279" + }, + "6889": { + "file_id": 510, + "content": "The code defines a ResNet model with different depths based on the specified number of layers. It initializes the layers, including a 7x7 convolution and multiple 3x3 convolutions, as well as Batch Normalization and Leaky ReLU activation functions. The model structure is determined by the input layer size, with supported layers ranging from 18 to 152.", + "type": "comment" + }, + "6890": { + "file_id": 510, + "content": " act='leaky_relu',\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='leaky_relu',\n name=\"conv1_3\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:280-301" + }, + "6891": { + "file_id": 510, + "content": "This code defines a ResNet backbone with TSM tweaks. It includes convolutional layers, Batch Normalization, Leaky ReLU activation functions, and max pooling. 
The block_list is initialized and the structure of the network adapts depending on the specified layers.", + "type": "comment" + }, + "6892": { + "file_id": 510, + "content": " bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' %\n (block, i), #same with PaddleClas, for loading pretrain\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name))\n in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n in_channels = [64, 64, 128, 256]\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:302-322" + }, + "6893": { + "file_id": 510, + "content": "Iterates through ResNet blocks and Bottleneck blocks, assigning in_channels based on previous block's out_channels. Inserts each block into the block_list. Adjusts shortcut value accordingly. Initializes in_channels as 64 for specified blocks if depth is not defined.", + "type": "comment" + }, + "6894": { + "file_id": 510, + "content": " basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n num_seg=self.num_seg,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:323-339" + }, + "6895": { + "file_id": 510, + "content": "This code defines a function for initializing weights in the ResNet TSMBackbone. 
It loads pre-trained weights if a loading path is specified or uses specific initialization methods otherwise, with Conv2D layers using KaimingNormal and BatchNorm2d layers using Constant initialization.", + "type": "comment" + }, + "6896": { + "file_id": 510, + "content": " Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n # no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:340-362" + }, + "6897": { + "file_id": 510, + "content": "This function initializes the backbone's weights, with Kaiming Normal distribution for Conv2D layers and constant value 1 for BatchNorm2D. The forward function defines how the backbone processes inputs through a series of convolutions and pooling, then passes the result to each block in the block_list.", + "type": "comment" + }, + "6898": { + "file_id": 511, + "content": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py", + "type": "filepath" + }, + "6899": { + "file_id": 511, + "content": "PaddleVideo library contains ResNet TSN model backbones, licensed under Apache 2.0, includes ConvBNLayer, offers modified ResNet with BatchNorm and pooling layers, uses BottleneckBlock for deeper networks, and initializes configurable parameters with BasicBlock. The code constructs a ResNet backbone, performs forward pass through network, applies convolutions and pooling, and returns output after passing through each block in the block list using input lists to determine layers and filters while initializing model weights.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/69.json b/docs/data/69.json new file mode 100644 index 000000000..977ee33a1 --- /dev/null +++ b/docs/data/69.json @@ -0,0 +1,549 @@ +{ + "6900": { + "file_id": 511, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn import Conv2D, BatchNorm\nfrom paddle.nn import MaxPool2D, AvgPool2D\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:1-29" + }, + "6901": { + "file_id": 511, + "content": "This code is a part of the PaddleVideo library, which provides model backbones including ResNet Tweaks TSN. It imports necessary modules and defines functions for creating convolutional layers, batch normalization, pooling layers, initializing weights, and loading checkpoints. The code follows the Apache License 2.0 and is distributed under an \"AS IS\" basis.", + "type": "comment" + }, + "6902": { + "file_id": 511, + "content": "__all__ = [\"ResNetTweaksTSN\"]\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n lr_mult=1.0,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\",\n learning_rate=lr_mult),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:31-58" + }, + "6903": { + "file_id": 511, + "content": "This code defines a ConvBNLayer class with an average pooling operation, a convolution layer, and optional tweaks mode. 
It also initializes a Conv2D layer and sets parameters for weight attributes and learning rates.", + "type": "comment" + }, + "6904": { + "file_id": 511, + "content": " bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._batch_norm = BatchNorm(\n out_channels,\n act=act,\n param_attr=ParamAttr(name=bn_name + '_scale',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + '_offset',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n moving_mean_name=bn_name + '_mean',\n moving_variance_name=bn_name + '_variance')\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:59-89" + }, + "6905": { + "file_id": 511, + "content": "The code defines a ResNet backbone with Temporal Segment Network (TSN) modifications. It includes a BatchNorm layer for normalization and has a forward function that applies pooling if in tweaks mode, followed by the batch norm and convolution layers. The BottleneckBlock class is also defined as a sublayer.", + "type": "comment" + }, + "6906": { + "file_id": 511, + "content": " if_first=False,\n lr_mult=1.0,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:90-111" + }, + "6907": { + "file_id": 511, + "content": "This code defines the BottleneckBlock class, which is a layer in ResNet backbone. It consists of three ConvBNLayer instances: conv0, conv1, and conv2. The first one performs a 1x1 convolution, while the second one does a 3x3 convolution with stride. Lastly, the third one executes a 1x1 convolution without activation function. This block is designed to reduce parameters for deeper networks.", + "type": "comment" + }, + "6908": { + "file_id": 511, + "content": " lr_mult=lr_mult,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n y = F.relu(y)\n return y\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:112-144" + }, + "6909": { + "file_id": 511, + "content": "This code defines a ResNet block with two convolution layers, one optional shortcut connection, and applies ReLU activation after the addition of the branch outputs. 
The BasicBlock class is used for the basic building block of the network.", + "type": "comment" + }, + "6910": { + "file_id": 511, + "content": " shortcut=True,\n if_first=False,\n lr_mult=1.0,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:145-167" + }, + "6911": { + "file_id": 511, + "content": "This code defines a BasicBlock class with convolutional layers and Batch Normalization. It initializes the block's parameters like stride, convolution layers, and batch normalization. The shortcut connection is optional and depends on the 'shortcut' parameter. The 'if_first', 'lr_mult', and 'name' parameters are also provided for customization.", + "type": "comment" + }, + "6912": { + "file_id": 511, + "content": " kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTweaksTSN(nn.Layer):\n \"\"\"ResNetTweaksTSN backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self,\n layers=50,\n pretrained=None,\n lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]):\n super(ResNetTweaksTSN, self).__init__()\n self.pretrained = pretrained", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:168-203" + }, + "6913": { + "file_id": 511, + "content": "This code defines a ResNetTweaksTSN backbone for deep learning models. It includes layers such as convolution, shortcut connections, and ReLU activation function. 
The constructor takes parameters like depth (layers), pretrained model, and learning rate multipliers for different layers.", + "type": "comment" + }, + "6914": { + "file_id": 511, + "content": " self.layers = layers\n supported_layers = [18, 34, 50, 101, 152, 200]\n assert layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, layers)\n self.lr_mult_list = lr_mult_list\n assert isinstance(\n self.lr_mult_list,\n (list, tuple\n )), \"lr_mult_list should be in (list, tuple) but got {}\".format(\n type(self.lr_mult_list))\n assert len(\n self.lr_mult_list\n ) == 5, \"lr_mult_list length should should be 5 but got {}\".format(\n len(self.lr_mult_list))\n if layers == 18:\n depth = [2, 2, 2, 2]\n elif layers == 34 or layers == 50:\n depth = [3, 4, 6, 3]\n elif layers == 101:\n depth = [3, 4, 23, 3]\n elif layers == 152:\n depth = [3, 8, 36, 3]\n elif layers == 200:\n depth = [3, 12, 48, 3]\n num_channels = [64, 256, 512, 1024\n ] if layers >= 50 else [64, 64, 128, 256]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:204-232" + }, + "6915": { + "file_id": 511, + "content": "This code initializes a ResNet backbone with different configurations based on the input layer. It checks if the provided layer is supported, asserts the type and length of the learning rate multiplier list, and assigns depth and number of channels for each layer configuration.", + "type": "comment" + }, + "6916": { + "file_id": 511, + "content": " num_filters = [64, 128, 256, 512]\n self.conv1_1 = ConvBNLayer(in_channels=3,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0],", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:233-254" + }, + "6917": { + "file_id": 511, + "content": "This code defines the first layer of the ResNet backbone, including three ConvBNLayer instances for different operations. The first layer consists of a 2x downsampling convolution, followed by two 1x1 convolutions to reduce dimensionality and apply relu activation. Lr_mult ensures that these layers are trained with different learning rates based on their importance.", + "type": "comment" + }, + "6918": { + "file_id": 511, + "content": " name=\"conv1_3\")\n self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if layers in [101, 152, 200] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BottleneckBlock(\n in_channels=num_channels[block]\n if i == 0 else num_filters[block] * 4,\n out_channels=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:255-276" + }, + "6919": { + "file_id": 511, + "content": "This code defines a ResNet backbone with optional Temporal Segment Network (TSN) modifications. 
It adds BottleneckBlock layers, specifies pooling operations, and handles shortcut connections for blocks 0-56. The number of layers and filters are determined by the provided depth and num_filters lists.", + "type": "comment" + }, + "6920": { + "file_id": 511, + "content": " shortcut=shortcut,\n if_first=block == i == 0,\n lr_mult=self.lr_mult_list[block + 1],\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BasicBlock(in_channels=num_channels[block]\n if i == 0 else num_filters[block],\n out_channels=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:277-296" + }, + "6921": { + "file_id": 511, + "content": "This code adds layers to the ResNet backbone model. It uses conditional statements and loops to determine the number of layers added at each block based on a given depth configuration, and applies different configurations for the first block. Layers are added with specific parameters such as in_channels, out_channels, stride, shortcut, if_first flag, and name.", + "type": "comment" + }, + "6922": { + "file_id": 511, + "content": " lr_mult=self.lr_mult_list[block + 1]))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be\n initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n # XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:297-314" + }, + "6923": { + "file_id": 511, + "content": "This code initializes a backbone model and handles the loading of pre-trained weights. If pre-trained path is specified, it loads the weights; otherwise, it follows specific initialization for Conv2D layers and BatchNorm2d using KaimingNormal and Constant functions respectively.", + "type": "comment" + }, + "6924": { + "file_id": 511, + "content": " if isinstance(layer, nn.Conv2D):\n # XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2d_max(y)\n for block in self.block_list:\n y = block(y)\n return y", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:315-328" + }, + "6925": { + "file_id": 511, + "content": "This code initializes the weights of convolutional layers without bias and batch normalization layers with constant value 1. It then performs forward pass through the network, applying convolutions and pooling operations. 
The output is returned after passing through each block in the block list.", + "type": "comment" + }, + "6926": { + "file_id": 512, + "content": "/paddlevideo/modeling/backbones/resnext101.py", + "type": "filepath" + }, + "6927": { + "file_id": 512, + "content": "The code defines a ResNeXt-101 model in PaddlePaddle, including downsample and residual blocks, BottleneckBlock, performs convolutions, activation, max pooling on input image.", + "type": "summary" + }, + "6928": { + "file_id": 512, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom functools import partial\nimport paddle\nclass ConvBNLayer(paddle.nn.Layer):\n def __init__(self,\n num_channels,\n num_filters,\n filter_size,\n stride=1,\n padding=0,\n dilation=1,\n groups=1,\n padding_mode='zeros',\n weight_attr=None,\n bias_attr=None,\n name=None,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnext101.py:1-31" + }, + "6929": { + "file_id": 512, + "content": "This code defines a ConvBNLayer class in PaddlePaddle, which is a convolution-batch normalization layer. It takes inputs like num_channels, num_filters, filter_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, and name.", + "type": "comment" + }, + "6930": { + "file_id": 512, + "content": " data_format=\"NCDHW\"):\n super(ConvBNLayer, self).__init__()\n self._conv = paddle.nn.Conv3D(\n in_channels=num_channels,\n out_channels=num_filters,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n dilation=dilation,\n groups=groups,\n padding_mode=padding_mode,\n weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.KaimingNormal(\n fan_in=num_filters * filter_size * filter_size), name=name+'_weights'),\n bias_attr=bias_attr,\n data_format=data_format)\n bn_name = \"bn_\" + name\n self._batch_norm = paddle.nn.BatchNorm3D(\n num_filters,\n momentum=0.9,\n epsilon=1e-05,\n weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(\n 1.), name=bn_name + '_scale'),\n bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(\n 0.), name=bn_name + '_offset'),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnext101.py:32-55" + }, + "6931": { + "file_id": 512, + "content": "This code defines a ConvBNLayer class with specified parameters for convolutional and batch normalization layers. 
The convolutional layer uses Kaiming Normal initialization, while the batch normalization layer has fixed scales and offsets initialized to 1 and 0 respectively.", + "type": "comment" + }, + "6932": { + "file_id": 512, + "content": " data_format=data_format)\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n return y\ndef _downsample_basic_block(self, x, planes, stride):\n out = paddle.nn.functional.avg_pool3d(x, kernel_size=1, stride=stride)\n shape = out.shape\n zero_pads = paddle.zeros(shape=[shape[0], planes - shape[1], shape[2], shape[3], shape[4]],\n dtype='float32')\n out = paddle.concat(x=[out, zero_pads], axis=1)\nclass BottleneckBlock(paddle.nn.Layer):\n expansion = 2\n def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None, name=None):\n super(BottleneckBlock, self).__init__()\n mid_planes = cardinality * int(planes / 32)\n self.conv0 = ConvBNLayer(\n inplanes, mid_planes, filter_size=1, bias_attr=False, name=name+'_branch2a')\n self.conv1 = ConvBNLayer(mid_planes, mid_planes, filter_size=3, stride=stride,\n padding=1, groups=cardinality, bias_attr=False, name=name+'_branch2b')", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnext101.py:56-82" + }, + "6933": { + "file_id": 512, + "content": "This code defines a BottleneckBlock class and a downsample function for the ResNeXt101 model in PaddlePaddle. The BottleneckBlock has an expansion factor of 2 and uses ConvBNLayer for convolution and batch normalization. The downsample function performs average pooling and concatenation to perform downsampling.", + "type": "comment" + }, + "6934": { + "file_id": 512, + "content": " self.conv2 = ConvBNLayer(mid_planes, planes * self.expansion,\n filter_size=1, bias_attr=False, name=name+'_branch2c')\n self.downsample = downsample\n self.stride = stride\n self.relu = paddle.nn.ReLU()\n def forward(self, x):\n residual = x\n out = self.conv0(x)\n out = self.relu(out)\n out = self.conv1(out)\n out = self.relu(out)\n out = self.conv2(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNeXt(paddle.nn.Layer):\n def __init__(self,\n block,\n layers,\n shortcut_type='B',\n cardinality=32):\n self.inplanes = 64\n super(ResNeXt, self).__init__()\n self.conv = ConvBNLayer(\n 3,\n 64,\n filter_size=7,\n stride=(1, 2, 2),\n padding=(3, 3, 3),\n bias_attr=False,\n name=\"res_conv1\"", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnext101.py:83-122" + }, + "6935": { + "file_id": 512, + "content": "This code defines a ResNeXt model. The class ResNeXt has an initialization that sets inplanes to 64 and inherits from paddle.nn.Layer. It contains a convolution layer (conv) with 3 input channels, 64 output channels, filter size of 7, and stride of (1,2,2). The class also includes a ResNet-style residual block as a member variable named 'block'. 
It has layers, shortcut_type (defaults to B), and cardinality parameters.", + "type": "comment" + }, + "6936": { + "file_id": 512, + "content": " )\n self.relu = paddle.nn.ReLU()\n self.maxpool = paddle.nn.MaxPool3D(kernel_size=(3, 3, 3), stride=2, padding=1)\n self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type,\n cardinality, stride=1, name='layer1')\n self.layer2 = self._make_layer(\n block, 256, layers[1], shortcut_type, cardinality, stride=2, name='layer2')\n self.layer3 = self._make_layer(\n block, 512, layers[2], shortcut_type, cardinality, stride=2, name='layer3')\n self.layer4 = self._make_layer(\n block, 1024, layers[3], shortcut_type, cardinality, stride=2, name='layer4')\n self.avgpool = paddle.nn.AvgPool3D((2, 1, 1), stride=1, exclusive=False)\n def _make_layer(self,\n block,\n planes,\n blocks,\n shortcut_type,\n cardinality,\n stride=1,\n name=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnext101.py:123-148" + }, + "6937": { + "file_id": 512, + "content": "The code defines a ResNext101 backbone for a deep learning model. It includes a ReLU activation function and max pooling operation, followed by four residual layers (layer1 to layer4) with varying numbers of planes (256, 512, 1024 respectively). The _make_layer method is used to create the layers, with options for downsampling and varying expansion rates.", + "type": "comment" + }, + "6938": { + "file_id": 512, + "content": " if shortcut_type == 'A':\n downsample = partial(self._downsample_basic_block,\n planes=planes * block.expansion,\n stride=stride)\n else:\n downsample = ConvBNLayer(\n self.inplanes,\n planes * block.expansion,\n 1,\n stride=stride,\n bias_attr=False,\n name=name+'downsample'\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, cardinality, stride, downsample, name=name+'_downsample'))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(block(self.inplanes, planes,\n cardinality, name=name+'_res_block'+str(i)))\n return paddle.nn.Sequential(*layers)\n def forward(self, x):\n x = self.conv(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnext101.py:149-176" + }, + "6939": { + "file_id": 512, + "content": "This code defines a ResNeXt-101 model, implementing its downsample and residual blocks. It takes an input image, performs convolutions, applies ReLU activation, and max pooling before passing through the specified number of residual blocks.", + "type": "comment" + }, + "6940": { + "file_id": 512, + "content": " x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x\ndef ResNext101():\n \"\"\"Constructs a ResNext-101 model.\n \"\"\"\n model = ResNeXt(BottleneckBlock, [3, 4, 23, 3])\n return model", + "type": "code", + "location": "/paddlevideo/modeling/backbones/resnext101.py:177-187" + }, + "6941": { + "file_id": 512, + "content": "The ResNext101 function constructs a ResNeXt-101 model using BottleneckBlock and the specified block configurations. It applies the layer2, layer3, and layer4 operations to x before returning the result.", + "type": "comment" + }, + "6942": { + "file_id": 513, + "content": "/paddlevideo/modeling/backbones/stgcn.py", + "type": "filepath" + }, + "6943": { + "file_id": 513, + "content": "The code incorporates image processing and Graph class, supports layouts like 'stgcn' and 'coco_keypoint'. 
It defines a STGCN model for spatio-temporal data processing using ConvTemporalGraphical layer. The code creates a STGCN class for skeleton-based action recognition with edge importance, applies networks, pools results, and averages before returning output.", + "type": "summary" + }, + "6944": { + "file_id": 513, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\ndef zero(x):\n return 0\ndef iden(x):\n return x\ndef einsum(x, A):\n \"\"\"paddle.einsum will be implemented in release/2.2.\n \"\"\"\n x = x.transpose((0, 2, 3, 1, 4))\n n, c, t, k, v = x.shape\n k2, v2, w = A.shape\n assert (k == k2 and v == v2), \"Args of einsum not match!\"", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:1-37" + }, + "6945": { + "file_id": 513, + "content": "This code snippet imports necessary libraries, defines several functions (zero, iden, einsum), and registers the BACKBONES. The purpose of this module seems to be defining backbone architectures or functions used in image processing tasks. However, more context is needed to understand the specific functionality of these functions or their use within the BACKBONES registry.", + "type": "comment" + }, + "6946": { + "file_id": 513, + "content": " x = x.reshape((n, c, t, k * v))\n A = A.reshape((k * v, w))\n y = paddle.matmul(x, A)\n return y\ndef get_hop_distance(num_node, edge, max_hop=1):\n A = np.zeros((num_node, num_node))\n for i, j in edge:\n A[j, i] = 1\n A[i, j] = 1\n # compute hop steps\n hop_dis = np.zeros((num_node, num_node)) + np.inf\n transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]\n arrive_mat = (np.stack(transfer_mat) > 0)\n for d in range(max_hop, -1, -1):\n hop_dis[arrive_mat[d]] = d\n return hop_dis\ndef normalize_digraph(A):\n Dl = np.sum(A, 0)\n num_node = A.shape[0]\n Dn = np.zeros((num_node, num_node))\n for i in range(num_node):\n if Dl[i] > 0:\n Dn[i, i] = Dl[i]**(-1)\n AD = np.dot(A, Dn)\n return AD\nclass Graph():\n def __init__(self,\n layout='openpose',\n strategy='uniform',\n max_hop=1,\n dilation=1):\n self.max_hop = max_hop\n self.dilation = dilation\n self.get_edge(layout)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:38-80" + }, + "6947": { + "file_id": 513, + "content": "This code defines a Graph class and three functions: get_hop_distance, normalize_digraph, and a constructor for the Graph class. The Graph class initializes with layout, strategy, max_hop, and dilation parameters. The get_hop_distance function computes hop distances between nodes in a graph up to max_hop level. The normalize_digraph function calculates and applies row-wise node degrees to normalize the adjacency matrix. 
The constructor initializes an instance of the Graph class with given parameters.", + "type": "comment" + }, + "6948": { + "file_id": 513, + "content": " self.hop_dis = get_hop_distance(self.num_node,\n self.edge,\n max_hop=max_hop)\n self.get_adjacency(strategy)\n def __str__(self):\n return self.A\n def get_edge(self, layout):\n # edge is a list of [child, parent] paris\n if layout == 'fsd10':\n self.num_node = 25\n self_link = [(i, i) for i in range(self.num_node)]\n neighbor_link = [(1, 8), (0, 1), (15, 0), (17, 15), (16, 0),\n (18, 16), (5, 1), (6, 5), (7, 6), (2, 1), (3, 2),\n (4, 3), (9, 8), (10, 9), (11, 10), (24, 11),\n (22, 11), (23, 22), (12, 8), (13, 12), (14, 13),\n (21, 14), (19, 14), (20, 19)]\n self.edge = self_link + neighbor_link\n self.center = 8\n elif layout == 'ntu-rgb+d':\n self.num_node = 25\n self_link = [(i, i) for i in range(self.num_node)]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:81-104" + }, + "6949": { + "file_id": 513, + "content": "The code initializes the hop distance and edge based on the number of nodes, maximum hops, and layout type. It defines self_link as intra-node connections and neighbor_link as inter-node connections. The center node is determined based on the layout.", + "type": "comment" + }, + "6950": { + "file_id": 513, + "content": " neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),\n (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),\n (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),\n (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),\n (23, 8), (24, 25), (25, 12)]\n neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]\n self.edge = self_link + neighbor_link\n self.center = 21 - 1\n elif layout == 'coco_keypoint':\n self.num_node = 17\n self_link = [(i, i) for i in range(self.num_node)]\n neighbor_1base = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6),\n (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12),\n (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)]\n neighbor_link = [(i, j) for (i, j) in neighbor_1base]\n self.edge = self_link + neighbor_link", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:105-120" + }, + "6951": { + "file_id": 513, + "content": "The code initializes 'self.edge' and 'self.center', defining nodes, self-links, and neighboring node links based on the specified layout ('stgcn' or 'coco_keypoint'). 
For 'stgcn', there are 25 nodes with various connections, while for 'coco_keypoint', there are 17 nodes with specific connections.", + "type": "comment" + }, + "6952": { + "file_id": 513, + "content": " self.center = 11\n else:\n raise ValueError(\"Do Not Exist This Layout.\")\n def get_adjacency(self, strategy):\n valid_hop = range(0, self.max_hop + 1, self.dilation)\n adjacency = np.zeros((self.num_node, self.num_node))\n for hop in valid_hop:\n adjacency[self.hop_dis == hop] = 1\n normalize_adjacency = normalize_digraph(adjacency)\n if strategy == 'spatial':\n A = []\n for hop in valid_hop:\n a_root = np.zeros((self.num_node, self.num_node))\n a_close = np.zeros((self.num_node, self.num_node))\n a_further = np.zeros((self.num_node, self.num_node))\n for i in range(self.num_node):\n for j in range(self.num_node):\n if self.hop_dis[j, i] == hop:\n if self.hop_dis[j, self.center] == self.hop_dis[\n i, self.center]:\n a_root[j, i] = normalize_adjacency[j, i]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:121-143" + }, + "6953": { + "file_id": 513, + "content": "This function sets the adjacency matrix for STGCN based on the strategy. It initializes the adjacency matrix as a zero matrix, then fills it with 1s for valid hops. The adjacency matrix is normalized using `normalize_digraph`. If the strategy is 'spatial', it iterates over each pair of nodes and populates the adjacency matrix accordingly based on their hop distance from the center node and their hop distance to each other.", + "type": "comment" + }, + "6954": { + "file_id": 513, + "content": " elif self.hop_dis[j, self.center] > self.hop_dis[\n i, self.center]:\n a_close[j, i] = normalize_adjacency[j, i]\n else:\n a_further[j, i] = normalize_adjacency[j, i]\n if hop == 0:\n A.append(a_root)\n else:\n A.append(a_root + a_close)\n A.append(a_further)\n A = np.stack(A)\n self.A = A\n else:\n raise ValueError(\"Do Not Exist This Strategy\")\nclass ConvTemporalGraphical(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n t_kernel_size=1,\n t_stride=1,\n t_padding=0,\n t_dilation=1):\n super().__init__()\n self.kernel_size = kernel_size\n self.conv = nn.Conv2D(in_channels,\n out_channels * kernel_size,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:144-174" + }, + "6955": { + "file_id": 513, + "content": "This code implements a ConvTemporalGraphical layer, which is a backbone architecture for STGCN. It initializes the ConvTemporalGraphical layer with input and output channels, kernel size, and temporal parameters. The code handles different strategies to build the adjacency matrix (A) for the graph convolution by considering close and further nodes based on hop distance from the central node. If hop == 0, it appends A to a list, otherwise appends both close and further A matrices. 
Finally, it stacks the A matrices into a numpy array and assigns it to self.A.", + "type": "comment" + }, + "6956": { + "file_id": 513, + "content": " kernel_size=(t_kernel_size, 1),\n padding=(t_padding, 0),\n stride=(t_stride, 1),\n dilation=(t_dilation, 1))\n def forward(self, x, A):\n assert A.shape[0] == self.kernel_size\n x = self.conv(x)\n n, kc, t, v = x.shape\n x = x.reshape((n, self.kernel_size, kc // self.kernel_size, t, v))\n x = einsum(x, A)\n return x, A\nclass st_gcn_block(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n dropout=0,\n residual=True):\n super(st_gcn_block, self).__init__()\n assert len(kernel_size) == 2\n assert kernel_size[0] % 2 == 1\n padding = ((kernel_size[0] - 1) // 2, 0)\n self.gcn = ConvTemporalGraphical(in_channels, out_channels,\n kernel_size[1])\n self.tcn = nn.Sequential(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:175-209" + }, + "6957": { + "file_id": 513, + "content": "The code defines a ConvTemporalGraphical layer and an STGCNBlock. The ConvTemporalGraphical layer is a 2D convolutional layer with temporal kernel size, padding, stride, and dilation. The STGCNBlock is a residual block that takes input and output channels, temporal kernel size, stride, and dropout as inputs. It initializes the GCN layer and TCN layers sequentially, with the GCN layer performing temporal graph convolution and the TCN layers performing temporal convolutions.", + "type": "comment" + }, + "6958": { + "file_id": 513, + "content": " nn.BatchNorm2D(out_channels),\n nn.ReLU(),\n nn.Conv2D(\n out_channels,\n out_channels,\n (kernel_size[0], 1),\n (stride, 1),\n padding,\n ),\n nn.BatchNorm2D(out_channels),\n nn.Dropout(dropout),\n )\n if not residual:\n self.residual = zero\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = iden\n else:\n self.residual = nn.Sequential(\n nn.Conv2D(in_channels,\n out_channels,\n kernel_size=1,\n stride=(stride, 1)),\n nn.BatchNorm2D(out_channels),\n )\n self.relu = nn.ReLU()\n def forward(self, x, A):\n res = self.residual(x)\n x, A = self.gcn(x, A)\n x = self.tcn(x) + res\n return self.relu(x), A\n@BACKBONES.register()\nclass STGCN(nn.Layer):\n \"\"\"\n ST-GCN model from:\n ", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:210-251" + }, + "6959": { + "file_id": 513, + "content": "This code defines a STGCN (Spatio-Temporal Graph Convolutional Network) model. It includes layers such as BatchNormalization, ReLU activation, and convolution operations for processing spatial and temporal data. The forward method applies these operations to input features x and adjacency matrix A.", + "type": "comment" + }, + "6960": { + "file_id": 513, + "content": " `\"Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition\" `_\n Args:\n in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2.\n edge_importance_weighting: bool, whether to use edge attention. Default True.\n data_bn: bool, whether to use data BatchNorm. 
Default True.\n \"\"\"\n def __init__(self,\n in_channels=2,\n edge_importance_weighting=True,\n data_bn=True,\n layout='fsd10',\n strategy='spatial',\n **kwargs):\n super(STGCN, self).__init__()\n self.data_bn = data_bn\n # load graph\n self.graph = Graph(\n layout=layout,\n strategy=strategy,\n )\n A = paddle.to_tensor(self.graph.A, dtype='float32')\n self.register_buffer('A', A)\n # build networks\n spatial_kernel_size = A.shape[0]\n temporal_kernel_size = 9\n kernel_size = (temporal_kernel_size, spatial_kernel_size)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:251-278" + }, + "6961": { + "file_id": 513, + "content": "This code defines the STGCN (Spatial Temporal Graph Convolutional Networks) class, which is a model for skeleton-based action recognition. It takes arguments like in_channels, edge_importance_weighting, and data_bn to determine the network configuration. It loads graph data and builds networks with specific kernel sizes for spatial and temporal dimensions.", + "type": "comment" + }, + "6962": { + "file_id": 513, + "content": " self.data_bn = nn.BatchNorm1D(in_channels *\n A.shape[1]) if self.data_bn else iden\n kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'}\n self.st_gcn_networks = nn.LayerList((\n st_gcn_block(in_channels,\n 64,\n kernel_size,\n 1,\n residual=False,\n **kwargs0),\n st_gcn_block(64, 64, kernel_size, 1, **kwargs),\n st_gcn_block(64, 64, kernel_size, 1, **kwargs),\n st_gcn_block(64, 64, kernel_size, 1, **kwargs),\n st_gcn_block(64, 128, kernel_size, 2, **kwargs),\n st_gcn_block(128, 128, kernel_size, 1, **kwargs),\n st_gcn_block(128, 128, kernel_size, 1, **kwargs),\n st_gcn_block(128, 256, kernel_size, 2, **kwargs),\n st_gcn_block(256, 256, kernel_size, 1, **kwargs),\n st_gcn_block(256, 256, kernel_size, 1, **kwargs),\n ))\n # initialize parameters for edge importance weighting", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:279-300" + }, + "6963": { + "file_id": 513, + "content": "This code initializes a series of ST-GCN blocks with different configurations for the ST-GCN backbone, including batch normalization and specific layer dimensions. These blocks are stored in a LayerList for flexibility and efficient computation.", + "type": "comment" + }, + "6964": { + "file_id": 513, + "content": " if edge_importance_weighting:\n self.edge_importance = nn.ParameterList([\n self.create_parameter(\n shape=self.A.shape,\n default_initializer=nn.initializer.Constant(1))\n for i in self.st_gcn_networks\n ])\n else:\n self.edge_importance = [1] * len(self.st_gcn_networks)\n self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1))\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'Normal', mean=0.0, std=0.02)\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Normal', mean=1.0, std=0.02)\n elif isinstance(layer, nn.BatchNorm1D):\n weight_init_(layer, 'Normal', mean=1.0, std=0.02)\n def forward(self, x):\n # data normalization\n N, C, T, V, M = x.shape\n x = x.transpose((0, 4, 3, 1, 2)) # N, M, V, C, T", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:301-327" + }, + "6965": { + "file_id": 513, + "content": "Code creates edge importance parameters if edge_importance_weighting is True, otherwise sets all edge importances to 1. Initializes weights for convolutional layers and batch normalization layers with specified means and standard deviations. 
The forward function transposes the input tensor shape before processing.", + "type": "comment" + }, + "6966": { + "file_id": 513, + "content": " x = x.reshape((N * M, V * C, T))\n if self.data_bn:\n x.stop_gradient = False\n x = self.data_bn(x)\n x = x.reshape((N, M, V, C, T))\n x = x.transpose((0, 1, 3, 4, 2)) # N, M, C, T, V\n x = x.reshape((N * M, C, T, V))\n # forward\n for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):\n x, _ = gcn(x, paddle.multiply(self.A, importance))\n x = self.pool(x) # NM,C,T,V --> NM,C,1,1\n C = x.shape[1]\n x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/stgcn.py:328-343" + }, + "6967": { + "file_id": 513, + "content": "This code reshapes the input tensor and applies batch normalization before reshaping again. It then transposes the dimensions and reshapes once more. The main operation involves iterating through each ST-GCN network and applying it to the input with multiplied edge importance, followed by pooling. Finally, it reshapes the output, performs averaging over the third dimension, and returns the result.", + "type": "comment" + }, + "6968": { + "file_id": 514, + "content": "/paddlevideo/modeling/backbones/swin_transformer.py", + "type": "filepath" + }, + "6969": { + "file_id": 514, + "content": "The code introduces a DropPath layer, Swin Transformer backbone with window-based multi-head attention for image processing, and implements the Swin Transformer Block 3D in PaddleVideo, which also features a 3D PatchEmbed3D and 3D backbone.", + "type": "summary" + }, + "6970": { + "file_id": 514, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom functools import lru_cache, reduce\nfrom operator import mul\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\ndef drop_path(x, drop_prob=0., training=False):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:1-33" + }, + "6971": { + "file_id": 514, + "content": "Copyright notice, import statements, and drop_path function definition for stochastic depth in residual blocks.", + "type": "comment" + }, + "6972": { + "file_id": 514, + "content": " the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. 
or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)\n random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Mlp(nn.Layer):\n \"\"\" Multilayer perceptron.\"\"\"\n def __init__(self,\n in_features,\n hidden_features=None,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:34-64" + }, + "6973": { + "file_id": 514, + "content": "This code snippet defines a \"DropPath\" layer that applies drop paths (Stochastic Depth) to the input, based on the provided drop probability. The drop paths are applied in the main path of residual blocks for each sample. This class also includes a forward method that drops out elements from the input with the specified probability during training but returns the original input unchanged when not training or if the drop probability is 0.", + "type": "comment" + }, + "6974": { + "file_id": 514, + "content": " out_features=None,\n act_layer=nn.GELU,\n drop=0.):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)\n self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\ndef window_partition(x, window_size):\n \"\"\"window_partition\n Args:\n x (Tensor): x.shape = [B, D, H, W, C]\n window_size (tuple[int]): window_size\n Returns:\n Tensor: (B*num_windows, window_size*window_size, C)\n \"\"\"\n B, D, H, W, C = x.shape\n x = x.reshape([\n B, D // window_size[0], window_size[0], H // window_size[1],\n window_size[1], W // window_size[2], window_size[2], C\n ])\n windows = x.transpose([0, 1, 3, 5, 2, 4, 6,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:65-99" + }, + "6975": { + "file_id": 514, + "content": "The code above defines a layer for the Swin Transformer backbone. It contains two linear layers, an activation function (GELU), and a dropout layer. 
The `window_partition` function partitions input tensor based on specified window size.", + "type": "comment" + }, + "6976": { + "file_id": 514, + "content": " 7]).reshape([-1, reduce(mul, window_size), C])\n return windows\nclass Identity(nn.Layer):\n def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\ndef window_reverse(windows, window_size, B, D, H, W):\n \"\"\"\n Args:\n windows: (B*num_windows, window_size, window_size, C)\n window_size (tuple[int]): Window size\n H (int): Height of image\n W (int): Width of image\n Returns:\n x: (B, D, H, W, C)\n \"\"\"\n x = windows.reshape([\n B, D // window_size[0], H // window_size[1], W // window_size[2],\n window_size[0], window_size[1], window_size[2], -1\n ])\n x = x.transpose([0, 1, 4, 2, 5, 3, 6, 7]).reshape([B, D, H, W, -1])\n return x\ndef get_window_size(x_size, window_size, shift_size=None):\n use_window_size = list(window_size)\n if shift_size is not None:\n use_shift_size = list(shift_size)\n for i in range(len(x_size)):\n if x_size[i] <= window_size[i]:\n use_window_size[i] = x_size[i]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:100-137" + }, + "6977": { + "file_id": 514, + "content": "The code defines a function `window_reverse` that takes a set of windows and rearranges them back into the original image shape. The `get_window_size` function determines the appropriate window size based on input dimensions. Both functions are used in the Swin Transformer backbone model.", + "type": "comment" + }, + "6978": { + "file_id": 514, + "content": " if shift_size is not None:\n use_shift_size[i] = 0\n if shift_size is None:\n return tuple(use_window_size)\n else:\n return tuple(use_window_size), tuple(use_shift_size)\nclass WindowAttention3D(nn.Layer):\n \"\"\" Window based multi-head self attention (W-MSA) module with relative position bias.\n It supports both of shifted and non-shifted window.\n Args:\n dim (int): Number of input channels.\n window_size (tuple[int]): The temporal length, height and width of the window.\n num_heads (int): Number of attention heads.\n qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set\n attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0\n proj_drop (float, optional): Dropout ratio of output. Default: 0.0\n \"\"\"\n def __init__(self,\n dim,\n window_size,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:138-161" + }, + "6979": { + "file_id": 514, + "content": "This code defines a class called \"WindowAttention3D\" which implements a window-based multi-head self attention module with relative position bias. It supports both shifted and non-shifted windows, and takes in parameters such as the number of input channels (dim), temporal length, height and width of the window (window_size), number of attention heads (num_heads), whether to add a learnable bias to query, key, value (qkv_bias), override default qk scale of head_dim ** -0.5 if set (qk_scale), dropout ratio of attention weight (attn_drop), and dropout ratio of output (proj_drop). 
The function at the top part of the code determines whether to use window or shift size based on a given value.", + "type": "comment" + }, + "6980": { + "file_id": 514, + "content": " num_heads,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.,\n proj_drop=0.):\n super().__init__()\n self.dim = dim\n self.window_size = window_size # Wd, Wh, Ww\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n # define a parameter table of relative position bias\n self.relative_position_bias_table = self.create_parameter(\n shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1) *\n (2 * window_size[2] - 1), num_heads),\n default_initializer=zeros_,\n ) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH\n self.add_parameter(\"relative_position_bias_table\",\n self.relative_position_bias_table)\n # get pair-wise relative position index for each token inside the window\n coords_d = paddle.arange(self.window_size[0])\n coords_h = paddle.arange(self.window_size[1])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:162-185" + }, + "6981": { + "file_id": 514, + "content": "This code initializes the Swin Transformer's self-attention module. It defines a window size and number of attention heads, calculates head dimensions, sets up position bias table, and adds parameters for position bias table and head dimensions. The code also creates coordinate arrays for dimension and height inside the window.", + "type": "comment" + }, + "6982": { + "file_id": 514, + "content": " coords_w = paddle.arange(self.window_size[2])\n coords = paddle.stack(paddle.meshgrid(coords_d, coords_h,\n coords_w)) # 3, Wd, Wh, Ww\n coords_flatten = paddle.flatten(coords, 1) # 3, Wd*Wh*Ww\n relative_coords = coords_flatten.unsqueeze(\n axis=2) - coords_flatten.unsqueeze(axis=1) # 3, Wd*Wh*Ww, Wd*Wh*Ww\n # relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) # 3, Wd*Wh*Ww, Wd*Wh*Ww\n relative_coords = relative_coords.transpose([1, 2, 0\n ]) # Wd*Wh*Ww, Wd*Wh*Ww, 3\n relative_coords[:, :,\n 0] += self.window_size[0] - 1 # shift to start from 0\n relative_coords[:, :, 1] += self.window_size[1] - 1\n relative_coords[:, :, 2] += self.window_size[2] - 1\n relative_coords[:, :, 0] *= (2 * self.window_size[1] -\n 1) * (2 * self.window_size[2] - 1)\n relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:186-204" + }, + "6983": { + "file_id": 514, + "content": "This code performs relative position encoding for the Swin Transformer by calculating relative coordinates of patches within a sliding window. It first creates 2D and 3D coordinate grids, then subtracts them to obtain relative positions. 
Finally, it shifts and scales the relative coordinates to fit the range of the window size.", + "type": "comment" + }, + "6984": { + "file_id": 514, + "content": " relative_position_index = relative_coords.sum(\n axis=-1) # Wd*Wh*Ww, Wd*Wh*Ww\n self.register_buffer(\"relative_position_index\", relative_position_index)\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.attn_drop = nn.Dropout(attn_drop)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n trunc_normal_(self.relative_position_bias_table, std=0.02)\n self.softmax = nn.Softmax(axis=-1)\n def forward(self, x, mask=None):\n \"\"\" Forward function.\n Args:\n x: input features with shape of (num_windows*B, N, C)\n mask: (0/-inf) mask with shape of (num_windows, N, N) or None\n \"\"\"\n B_, N, C = x.shape\n qkv = self.qkv(x).reshape(\n [B_, N, 3, self.num_heads,\n C // self.num_heads]).transpose([2, 0, 3, 1, 4])\n q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C\n q = q * self.scale\n attn = q @ k.transpose([0, 1, 3, 2])\n relative_position_bias = self.relative_position_bias_table[", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:205-232" + }, + "6985": { + "file_id": 514, + "content": "This code initializes a Swin Transformer backbone by registering a buffer for relative position indices and defining the linear projections, dropouts, softmax function, and forward pass. The forward function takes input features of shape (num_windows*B, N, C) and performs multi-head self-attention with learned query, key, and value matrices, scaled by the square root of the dimension. Attention is calculated using dot product between queries and keys, and then passed through a softmax function for normalization before being multiplied by values and projected back to the original feature space.", + "type": "comment" + }, + "6986": { + "file_id": 514, + "content": " self.relative_position_index[:N, :N].reshape([-1])].reshape(\n [N, N, -1]) # Wd*Wh*Ww,Wd*Wh*Ww,nH\n relative_position_bias = relative_position_bias.transpose(\n [2, 0, 1]) # nH, Wd*Wh*Ww, Wd*Wh*Ww\n attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N\n if mask is not None:\n nW = mask.shape[0]\n attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N\n ]) + mask.unsqueeze(1).unsqueeze(0).astype(attn.dtype)\n attn = attn.reshape([-1, self.num_heads, N, N])\n attn = self.softmax(attn)\n else:\n attn = self.softmax(attn)\n attn = self.attn_drop(attn)\n x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B_, N, C])\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass SwinTransformerBlock3D(nn.Layer):\n \"\"\" Swin Transformer Block.\n Args:\n dim (int): Number of input channels.\n num_heads (int): Number of attention heads.", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:233-261" + }, + "6987": { + "file_id": 514, + "content": "This code defines the Swin Transformer Block 3D, which implements a self-attention mechanism for multi-dimensional data. It adds relative position biases to the attention scores, applies a mask if provided, and applies softmax normalization. Finally, it passes the result through two dropout layers before outputting the transformed feature map.", + "type": "comment" + }, + "6988": { + "file_id": 514, + "content": " window_size (tuple[int]): Window size.\n shift_size (tuple[int]): Shift size for SW-MSA.\n mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. 
Default: True\n qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n drop (float, optional): Dropout rate. Default: 0.0\n attn_drop (float, optional): Attention dropout rate. Default: 0.0\n drop_path (float, optional): Stochastic depth rate. Default: 0.0\n act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU\n norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm\n \"\"\"\n def __init__(self,\n dim,\n num_heads,\n window_size=(2, 7, 7),\n shift_size=(0, 0, 0),\n mlp_ratio=4.,\n qkv_bias=True,\n qk_scale=None,\n drop=0.,\n attn_drop=0.,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:262-282" + }, + "6989": { + "file_id": 514, + "content": "This code initializes a class for the Swin Transformer backbone, specifying the dimensions, number of heads, window size, shift size, mlp ratio, and various optional parameters like dropout rates and activation layers.", + "type": "comment" + }, + "6990": { + "file_id": 514, + "content": " drop_path=0.,\n act_layer=nn.GELU,\n norm_layer=nn.LayerNorm,\n use_checkpoint=False):\n super().__init__()\n self.dim = dim\n self.num_heads = num_heads\n self.window_size = window_size\n self.shift_size = shift_size\n self.mlp_ratio = mlp_ratio\n # self.use_checkpoint=use_checkpoint\n assert 0 <= self.shift_size[0] < self.window_size[\n 0], \"shift_size must in 0-window_size\"\n assert 0 <= self.shift_size[1] < self.window_size[\n 1], \"shift_size must in 0-window_size\"\n assert 0 <= self.shift_size[2] < self.window_size[\n 2], \"shift_size must in 0-window_size\"\n self.norm1 = norm_layer(dim)\n self.attn = WindowAttention3D(dim,\n window_size=self.window_size,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:283-307" + }, + "6991": { + "file_id": 514, + "content": "The code defines a class for the Swin Transformer backbone in PaddleVideo. It takes input parameters such as dimension, number of attention heads, window size, and shift size, and initializes layers including norm_layer and attn layer. It performs assertions on shift sizes to ensure they are within the window size limits and then initializes the normalization layer.", + "type": "comment" + }, + "6992": { + "file_id": 514, + "content": " attn_drop=attn_drop,\n proj_drop=drop)\n self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()\n self.norm2 = norm_layer(dim)\n mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop)\n def forward_part1(self, x, mask_matrix):\n B = paddle.shape(x)[0]\n _, D, H, W, C = x.shape\n window_size, shift_size = get_window_size((D, H, W), self.window_size,\n self.shift_size)\n x = self.norm1(x)\n # pad feature maps to multiples of window size\n pad_l = pad_t = pad_d0 = 0\n pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0]\n pad_b = (window_size[1] - H % window_size[1]) % window_size[1]\n pad_r = (window_size[2] - W % window_size[2]) % window_size[2]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:308-330" + }, + "6993": { + "file_id": 514, + "content": "This code defines a Swin Transformer backbone class with parameters like window size, shift size, and drop path. It initializes the layers including attention and mlp blocks. 
The forward_part1 function pads input features to multiples of window size for processing.", + "type": "comment" + }, + "6994": { + "file_id": 514, + "content": " x = F.pad(x, (pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1),\n data_format='NDHWC')\n _, Dp, Hp, Wp, _ = x.shape\n # cyclic shift\n if any(i > 0 for i in shift_size):\n shifted_x = paddle.roll(x,\n shifts=(-shift_size[0], -shift_size[1],\n -shift_size[2]),\n axis=(1, 2, 3))\n attn_mask = mask_matrix\n else:\n shifted_x = x\n attn_mask = None\n # partition windows\n x_windows = window_partition(shifted_x,\n window_size) # B*nW, Wd*Wh*Ww, C\n # W-MSA/SW-MSA\n attn_windows = self.attn(x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C\n # merge windows\n attn_windows = attn_windows.reshape([-1, *(window_size + (C, ))])\n shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp,\n Wp) # B D' H' W' C\n # reverse cyclic shift", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:331-353" + }, + "6995": { + "file_id": 514, + "content": "This code performs a cyclic shift on the input feature map, depending on the shift size. If any of the shift sizes are greater than 0, it applies the roll operation to the feature map along specific axes (1, 2, and 3). The shifted feature map is then partitioned into windows based on the window size specified. These windows go through a self-attention layer (self.attn) and are reshaped accordingly. Finally, a reverse cyclic shift is applied to the result before returning the output feature map. This process helps in performing window-based self-attention or spatial-wise self-attention in the Swin Transformer architecture.", + "type": "comment" + }, + "6996": { + "file_id": 514, + "content": " if any(i > 0 for i in shift_size):\n x = paddle.roll(shifted_x,\n shifts=(shift_size[0], shift_size[1],\n shift_size[2]),\n axis=(1, 2, 3))\n else:\n x = shifted_x\n if pad_d1 > 0 or pad_r > 0 or pad_b > 0:\n x = x[:, :D, :H, :W, :]\n return x\n def forward_part2(self, x):\n return self.drop_path(self.mlp(self.norm2(x)))\n def forward(self, x, mask_matrix):\n \"\"\" Forward function.\n Args:\n x: Input feature, tensor size (B, D, H, W, C).\n mask_matrix: Attention mask for cyclic shift.\n \"\"\"\n shortcut = x\n x = self.forward_part1(x, mask_matrix)\n x = shortcut + self.drop_path(x).astype(shortcut.dtype)\n x = x + self.forward_part2(x).astype(x.dtype)\n return x\nclass PatchMerging(nn.Layer):\n \"\"\" Patch Merging Layer\n Args:\n dim (int): Number of input channels.\n norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:354-390" + }, + "6997": { + "file_id": 514, + "content": "The code defines a function for the forward pass of a neural network. It consists of two parts: `forward_part1` and `forward_part2`. The function takes an input tensor, performs some operations, and returns the result. The `forward_part1` function applies a shift operation to the input based on a specified shift size, followed by a padding operation if necessary. The `forward_part2` function passes the input through a multi-layer perceptron (MLP) and applies dropout. 
Finally, the `forward` function combines the outputs of these two parts and returns the result after adding it to an initial shortcut connection.", + "type": "comment" + }, + "6998": { + "file_id": 514, + "content": " \"\"\"\n def __init__(self, dim, norm_layer=nn.LayerNorm):\n super().__init__()\n self.dim = dim\n self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)\n self.norm = norm_layer(4 * dim)\n def forward(self, x):\n \"\"\" Forward function.\n Args:\n x: Input feature, tensor size (B, D, H, W, C).\n \"\"\"\n B, D, H, W, C = x.shape\n # padding\n pad_input = (H % 2 == 1) or (W % 2 == 1)\n if pad_input:\n x = F.pad(x, (0, W % 2, 0, H % 2, 0, 0), data_format='NDHWC')\n x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C\n x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C\n x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C\n x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C\n x = paddle.concat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C\n x = self.norm(x)\n x = self.reduction(x)\n return x\n# cache each stage results\n@lru_cache()\ndef compute_mask(D, H, W, window_size, shift_size):\n img_mask = paddle.zeros((1, D, H, W, 1)) # 1 Dp Hp Wp 1", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:391-426" + }, + "6999": { + "file_id": 514, + "content": "The code defines a Swin Transformer backbone for an image model. The `__init__` method initializes the Swin Transformer with specified dimension and normalization layer. The forward function processes input feature by splitting, concatenating, normalizing, and reducing dimensions. The `compute_mask` function generates an image mask using LRU caching.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/7.json b/docs/data/7.json new file mode 100644 index 000000000..f819521cb --- /dev/null +++ b/docs/data/7.json @@ -0,0 +1,549 @@ +{ + "700": { + "file_id": 66, + "content": "import os.path as osp\ndef load(file_name, model, **cfg):\n if not osp.isfile(file_name):\n raise IOError(f'{file_name} not exist')\n try:\n state_dicts_ = paddle.load(file_name)['state_dict']\n except:\n state_dicts_ = paddle.load(file_name)\n state_dicts = {}\n for k in model.keys():\n if 'num_batches_tracked' not in k:\n if ('head.' + k) not in state_dicts_.keys():\n if k not in state_dicts_.keys():\n print(f'model -----{k} -------is not in pretrained')\n else:\n state_dicts[k] = state_dicts_[k]\n else:\n state_dicts[k] = state_dicts_['head.' + k]\n write_dict(state_dicts, 'state_dicts.txt', **cfg)\n write_dict(model, 'model.txt', **cfg)\n return state_dicts\n#####\ndef write_dict(state_dict, file_name, **cfg):\n lines = []\n tot = 0\n for k, v in state_dict.items():\n # 目前只发现了torch和paddle模型参数命名的这三种不一致\n # 不一致1\n if 'num_batches_tracked' in k:\n tot += 1", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:239-272" + }, + "701": { + "file_id": 66, + "content": "This code defines a function `load` that loads a pretrained model from a file. It checks if the file exists, then loads either the 'state_dict' or the entire dictionary depending on compatibility with the model keys. The function also filters out 'num_batches_tracked' and 'head.' before assigning the correct values to state_dicts. 
Finally, it writes both the modified state_dicts and model to separate text files.", + "type": "comment" + }, + "702": { + "file_id": 66, + "content": " continue\n try:\n line = str(k) + '\\t' + str(v.cpu().detach().numpy().shape) + '\\n'\n except:\n line = str(k) + '\\t' + str(v.shape) + '\\n'\n lines.append(line)\n # with open(cfg.get(\"output_dir\", f\"./output/{file_name}\"), 'w') as f:\n # f.writelines(lines)\n # print('%d num_batches_tracked skipped' % tot)\ndef damage_masks(labels, shift=True, scale=True, rotate=True):\n \"\"\"\n Args:\n labels: numpy array (batch_size * 1 * h * w)\n \"\"\"\n bs, _, h, w = labels.shape\n labels = labels.transpose([0, 2, 3, 1])\n labels = labels.numpy()\n final_label = []\n for i in range(bs):\n label = labels[i]\n damaged_label = damage_masks_np(label, shift, scale, rotate)\n final_label.append(damaged_label)\n final_label = np.array(final_label)\n final_label = paddle.to_tensor(final_label)\n final_label = final_label.transpose([0, 3, 1, 2])\n return final_label\ndef damage_masks_np(labels, shift=True, scale=True, rotate=True):\n \"\"\"Performs the actual mask damaging in numpy.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:273-304" + }, + "703": { + "file_id": 66, + "content": "The code defines two functions: `damage_masks` and `damage_masks_np`. Both functions take in a labels array, and apply damage to the masks by applying shift, scale, and rotate transformations. The output is returned as a tensor after being transposed. The functions are designed for PaddlePaddle and NumPy respectively, and the input must be of shape (batch_size * 1 * h * w).", + "type": "comment" + }, + "704": { + "file_id": 66, + "content": " Args:\n labels: Int32 numpy array of shape (height, width, 1).\n shift: Boolean, whether to damage the masks by shifting.\n scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of labels.\n \"\"\"\n unique_labels = np.unique(labels)\n unique_labels = np.setdiff1d(unique_labels, [0])\n # Shuffle to get random depth ordering when combining together.\n np.random.shuffle(unique_labels)\n damaged_labels = np.zeros_like(labels)\n for l in unique_labels:\n obj_mask = (labels == l)\n damaged_obj_mask = _damage_single_object_mask(obj_mask, shift, scale,\n rotate)\n damaged_labels[damaged_obj_mask] = l\n return damaged_labels\ndef _damage_single_object_mask(mask, shift, scale, rotate):\n \"\"\"Performs mask damaging in numpy for a single object.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:305-330" + }, + "705": { + "file_id": 66, + "content": "This function damages the input labels by randomly shifting, scaling, rotating, and dilating the object masks. It first extracts unique labels, then shuffles them before iterating through each unique label to generate a damaged version of the labels. 
The `_damage_single_object_mask` function is used internally for performing mask damage on a single object.", + "type": "comment" + }, + "706": { + "file_id": 66, + "content": " shift: Boolean, whether to damage the masks by shifting.\n scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of mask.\n \"\"\"\n if shift:\n mask = _shift_mask(mask)\n if scale:\n mask = _scale_mask(mask)\n if rotate:\n mask = _rotate_mask(mask)\n return mask\ndef _shift_mask(mask, max_shift_factor=0.05):\n \"\"\"Damages a mask for a single object by randomly shifting it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_shift_factor: Float scalar, the maximum factor for random shifting.\n Returns:\n The shifted version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n h = nzy.max() - nzy.min()\n w = nzx.max() - nzx.min()\n size = np.sqrt(h * w)\n offset = np.random.uniform(-size * max_shift_factor,\n size * max_shift_factor, 2)\n shifted_mask = interpolation.shift(np.squeeze(mask, axis=2),", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:331-361" + }, + "707": { + "file_id": 66, + "content": "This code appears to be part of a function that damages a mask for a single object by randomly shifting it in numpy. The function takes a Boolean numpy array as input and returns the shifted version of the mask. It also includes parameters for scaling, rotation, and dilation, but these operations are not defined in this snippet.", + "type": "comment" + }, + "708": { + "file_id": 66, + "content": " offset,\n order=0).astype('bool')[..., np.newaxis]\n return shifted_mask\ndef _scale_mask(mask, scale_amount=0.025):\n \"\"\"Damages a mask for a single object by randomly scaling it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n scale_amount: Float scalar, the maximum factor for random scaling.\n Returns:\n The scaled version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n cy = 0.5 * (nzy.max() - nzy.min())\n cx = 0.5 * (nzx.max() - nzx.min())\n scale_factor = np.random.uniform(1.0 - scale_amount, 1.0 + scale_amount)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n s = transform.SimilarityTransform(scale=[scale_factor, scale_factor])\n m = (shift + (s + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask\ndef _rotate_mask(mask, max_rot_degrees=3.0):\n \"\"\"Damages a mask for a single object by randomly rotating it in numpy.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:362-388" + }, + "709": { + "file_id": 66, + "content": "The code contains three functions: _shift_mask, _scale_mask, and _rotate_mask. These functions are used to randomly manipulate a binary mask by shifting, scaling, or rotating it for a single object. 
The purpose is to damage the mask to enhance the robustness of the AI system against different poses or scales of the object.", + "type": "comment" + }, + "710": { + "file_id": 66, + "content": " Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_rot_degrees: Float scalar, the maximum number of degrees to rotate.\n Returns:\n The scaled version of mask.\n \"\"\"\n cy = 0.5 * mask.shape[0]\n cx = 0.5 * mask.shape[1]\n rot_degrees = np.random.uniform(-max_rot_degrees, max_rot_degrees)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n r = transform.SimilarityTransform(rotation=np.deg2rad(rot_degrees))\n m = (shift + (r + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask\nclass AverageMeter(object):\n \"\"\"Computes and stores the average and current value\"\"\"\n def __init__(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def reset(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n self.val = val\n self.sum += val * n", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:389-422" + }, + "711": { + "file_id": 66, + "content": "This code defines a function that rotates and scales a binary mask. It first calculates the center coordinates of the mask, then generates a random rotation angle within a specified range, applies the transformation, and inverses it to get the final scaling transformation matrix. The result is a warped version of the mask where pixels above 0.5 are considered as true values. Additionally, there's an AverageMeter class that computes and stores average and current value for continuous metrics calculation.", + "type": "comment" + }, + "712": { + "file_id": 66, + "content": " self.count += n\n self.avg = self.sum / self.count\nimport numpy as np\ndef label2colormap(label):\n m = label.astype(np.uint8)\n r, c = m.shape\n cmap = np.zeros((r, c, 3), dtype=np.uint8)\n cmap[:, :, 0] = (m & 1) << 7 | (m & 8) << 3 | (m & 64) >> 1\n cmap[:, :, 1] = (m & 2) << 6 | (m & 16) << 2 | (m & 128) >> 2\n cmap[:, :, 2] = (m & 4) << 5 | (m & 32) << 1\n return cmap\ndef torch2paddle(data):\n try:\n import torch\n if isinstance(data, dict):\n np_data = {}\n for k, v in data.items():\n np_data[k] = paddle.to_tensor(v.detach().numpy())\n return np_data\n else:\n return paddle.to_tensor(data.detach().numpy())\n except:\n pass\ndef fill_(tensor: Tensor, value):\n return tensor.set_value(paddle.full_like(tensor, value))\ndef zero_(tensor: Tensor):\n return tensor.set_value(paddle.zeros_like(tensor))\ndef float_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='float32')\ndef long_(tensor: Tensor):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:423-466" + }, + "713": { + "file_id": 66, + "content": "Code utilities for PaddleVideo: converts label to colormap, converts PyTorch data types to Paddle, fills tensor with a value, sets tensor value to zero, and casts tensor to float32 dtype.", + "type": "comment" + }, + "714": { + "file_id": 66, + "content": " return paddle.to_tensor(tensor, dtype='int64')\ndef int_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='int32')\ndef byte_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='bool')\nclass ToPILImage(BaseTransform):\n def __init__(self, mode=None, keys=None):\n super(ToPILImage, self).__init__(keys)\n def _apply_image(self, 
pic):\n \"\"\"\n Args:\n pic (Tensor|np.ndarray): Image to be converted to PIL Image.\n Returns:\n PIL: Converted image.\n \"\"\"\n if not (isinstance(pic, paddle.Tensor) or isinstance(pic, np.ndarray)):\n raise TypeError('pic should be Tensor or ndarray. Got {}.'.format(\n type(pic)))\n elif isinstance(pic, paddle.Tensor):\n if pic.ndimension() not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndimension()))\n elif pic.ndimension() == 2:\n # if 2D image, add channel dimension (CHW)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:467-500" + }, + "715": { + "file_id": 66, + "content": "The code provides three functions for converting tensors to different data types: int64, int32, and bool. The class ToPILImage is used to convert images from Tensor or np.ndarray format to PIL Image. It checks the type of input pic and throws a TypeError if it's not a Tensor or ndarray. If pic has 2 or 3 dimensions, it adds a channel dimension for 2D images. If the number of dimensions is not 2 or 3, it raises a ValueError.", + "type": "comment" + }, + "716": { + "file_id": 66, + "content": " pic = pic.unsqueeze(0)\n elif isinstance(pic, np.ndarray):\n if pic.ndim not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndim))\n elif pic.ndim == 2:\n # if 2D image, add channel dimension (HWC)\n pic = np.expand_dims(pic, 2)\n npimg = pic\n if isinstance(pic, paddle.Tensor) and \"float\" in str(\n pic.numpy().dtype) and self.mode != 'F':\n pic = pic.mul(255).byte()\n if isinstance(pic, paddle.Tensor):\n npimg = np.transpose(pic.numpy(), (1, 2, 0))\n if not isinstance(npimg, np.ndarray):\n raise TypeError(\n 'Input pic must be a paddle.Tensor or NumPy ndarray, ' +\n 'not {}'.format(type(npimg)))\n if npimg.shape[2] == 1:\n expected_mode = None\n npimg = npimg[:, :, 0]\n if npimg.dtype == np.uint8:\n expected_mode = 'L'", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:501-529" + }, + "717": { + "file_id": 66, + "content": "This code is checking the input \"pic\" and adjusting its format to be compatible with the function. It first checks if it's a tensor or ndarray, then ensures that the image is 2D or 3D (adding channels if necessary) before converting it into NumPy ndarray format. Finally, it checks the data type and mode to further adjust the \"pic\" as needed. If any issue arises during this process, it raises an error with a descriptive message.", + "type": "comment" + }, + "718": { + "file_id": 66, + "content": " elif npimg.dtype == np.int16:\n expected_mode = 'I;16'\n elif npimg.dtype == np.int32:\n expected_mode = 'I'\n elif npimg.dtype == np.float32:\n expected_mode = 'F'\n if self.mode is not None and self.mode != expected_mode:\n raise ValueError(\n \"Incorrect self.mode ({}) supplied for input type {}. 
Should be {}\"\n .format(self.mode, np.dtype, expected_mode))\n self.mode = expected_mode\n elif npimg.shape[2] == 2:\n permitted_2_channel_modes = ['LA']\n if self.mode is not None and self.mode not in permitted_2_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 2D inputs\".format(\n permitted_2_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'LA'\n elif npimg.shape[2] == 4:\n permitted_4_channel_modes = ['RGBA', 'CMYK', 'RGBX']", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:530-553" + }, + "719": { + "file_id": 66, + "content": "This code is validating the input image's data type and dimensions to determine the appropriate mode for the image. It raises a ValueError if the supplied self.mode does not match the expected mode based on the input type, or if the number of channels in the image does not match permitted modes.", + "type": "comment" + }, + "720": { + "file_id": 66, + "content": " if self.mode is not None and self.mode not in permitted_4_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 4D inputs\".format(\n permitted_4_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGBA'\n else:\n permitted_3_channel_modes = ['RGB', 'YCbCr', 'HSV']\n if self.mode is not None and self.mode not in permitted_3_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 3D inputs\".format(\n permitted_3_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGB'\n if self.mode is None:\n raise TypeError('Input type {} is not supported'.format(\n npimg.dtype))\n return Image.fromarray(npimg, mode=self.mode)\nclass Identity(nn.Layer):\n r\"\"\"A placeholder identity operator that is argument-insensitive.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:554-578" + }, + "721": { + "file_id": 66, + "content": "The code snippet is part of a class function that checks the input image mode and data type to determine if it's compatible with the operation. If not, it raises an error or sets the mode accordingly. It also defines a placeholder identity operator class.", + "type": "comment" + }, + "722": { + "file_id": 66, + "content": " Args:\n args: any argument (unused)\n kwargs: any keyword argument (unused)\n \"\"\"\n def __init__(self, *args, **kwargs):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\ndef convert(data: dict, to, dtype=None):\n assert isinstance(data, dict)\n input = {}\n for k, v in data.items():\n if 'paddle' == to:\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = paddle.to_tensor(v.astype(dtype))\n else:\n input[k] = paddle.to_tensor(v)\n else:\n input[k] = v\n elif 'torch' == to:\n try:\n import torch\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = torch.tensor(v.astype(dtype))\n else:\n input[k] = torch.tensor(v)\n else:\n input[k] = v\n except:\n pass", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:580-615" + }, + "723": { + "file_id": 66, + "content": "This code defines a class \"Identity\" with an empty forward function and a convert function that converts dictionary data between Paddle and Torch formats. If 'paddle' is given as the to parameter, it converts numpy arrays in the input dictionary to Paddle tensors. 
If 'torch' is given, it tries to import torch and converts numpy arrays or leaves unchanged non-numpy elements in the input dictionary to Torch tensors. Dtype can be used to specify a specific data type for tensor conversion.", + "type": "comment" + }, + "724": { + "file_id": 66, + "content": " else:\n if isinstance(v, np.ndarray):\n input[k] = v.astype(to)\n else:\n input[k] = v\n return input\ndef clip_grad_norm_(parameters: _tensor_or_tensors,\n max_norm: float,\n norm_type: float = 2.0,\n error_if_nonfinite: bool = False) -> paddle.Tensor:\n r\"\"\"Clips gradient norm of an iterable of parameters.\n The norm is computed over all gradients together, as if they were\n concatenated into a single vector. Gradients are modified in-place.\n Args:\n parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a\n single Tensor that will have gradients normalized\n max_norm (float or int): max norm of the gradients\n norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for\n infinity norm.\n error_if_nonfinite (bool): if True, an error is thrown if the total\n norm of the gradients from :attr:``parameters`` is ``nan``,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:616-640" + }, + "725": { + "file_id": 66, + "content": "This function clips the gradient norm of an iterable of parameters. The norm is computed over all gradients together, as if they were concatenated into a single vector. Gradients are modified in-place. It takes arguments such as iterable of Tensors or a single Tensor that will have gradients normalized, max_norm (float or int) to set the maximum norm of the gradients, norm_type (float or int) for the type of used p-norm, and error_if_nonfinite (bool) to indicate whether an error should be thrown if total norm is nan.", + "type": "comment" + }, + "726": { + "file_id": 66, + "content": " ``inf``, or ``-inf``. Default: False (will switch to True in the future)\n Returns:\n Total norm of the parameters (viewed as a single vector).\n \"\"\"\n import time\n if isinstance(parameters, paddle.Tensor):\n parameters = [parameters]\n parameters = [p for p in parameters if p.grad is not None]\n detached_grads = [p.grad.detach() for p in parameters]\n max_norm = float(max_norm)\n norm_type = float(norm_type)\n if len(parameters) == 0:\n return paddle.to_tensor(0.)\n # device = paddle.get_device() # parameters[0].grad.device\n if norm_type == inf:\n norms = [p.abs().max() for p in parameters]\n total_norm = norms[0] if len(norms) == 1 else paddle.max(\n paddle.stack(norms))\n else:\n # tik = time.time()\n total_norm = paddle.norm(\n paddle.stack([paddle.norm(g, norm_type) for g in detached_grads]),\n norm_type)\n # total_norm = paddle.norm(paddle.stack([paddle.sqrt(paddle.sum(g*g)) for g in detached_grads]), norm_type) # fixed.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:641-666" + }, + "727": { + "file_id": 66, + "content": "This function calculates the total norm of parameters (viewed as a single vector) and handles cases where parameters are tensors. It first checks if the parameter is a tensor, then selects only those with non-null gradients, detaches their gradients, and applies different norm types based on the input. 
If the norm type is infinity, it calculates maximum absolute values for each parameter; otherwise, it calculates the p-norm using provided parameters and detached gradients.", + "type": "comment" + }, + "728": { + "file_id": 66, + "content": " # print(time.time() - tik)\n if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),\n total_norm.isinf()):\n raise RuntimeError(\n f'The total norm of order {norm_type} for gradients from '\n '`parameters` is non-finite, so it cannot be clipped. To disable '\n 'this error and scale the gradients by the non-finite norm anyway, '\n 'set `error_if_nonfinite=False`')\n clip_coef = max_norm / (total_norm + 1e-6)\n # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so\n # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization\n # when the gradients do not reside in CPU memory.\n clip_coef_clamped = paddle.clip(clip_coef, max=1.0)\n for i, p in enumerate(parameters):\n # p.set_value(paddle.multiply(p, clip_coef_clamped))\n p.grad.set_value(detached_grads[i] * clip_coef_clamped) # fixed", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:667-682" + }, + "729": { + "file_id": 66, + "content": "This code checks if the total norm of gradients from `parameters` is non-finite. If it is, it raises a RuntimeError and suggests setting `error_if_nonfinite=False`. Then it calculates the clipping coefficient, performs a clip operation to ensure it's within the range (0, 1], and finally multiplies the gradients with this coefficient to scale them.", + "type": "comment" + }, + "730": { + "file_id": 66, + "content": " # p.grad.detach().mul_(clip_coef_clamped\n return total_norm\n# def max(a: paddle.Tensor, axis=0, keepdim=True):\n# \"\"\"ndarray=numpy.array([[1, 2, 3, 4],\n# [4, 3, 2, 1],\n# [5, 6, 7, 8],\n# [8, 7, 6, 5]])\n# np.where(ndarray == np.max(ndarray))\n# (array([2, 3]), array([3, 0]))\n# ndarray[np.where(ndarray == np.max(ndarray))]\n# array([8, 8])\n# \"\"\"\n# max_ = a.max(axis).unsqueeze(-1)\n# index = paddle.argmax(a, axis=axis, keepdim=keepdim)\n# max_ = max_.numpy()\n# index = index.numpy()\n# # index = paddle.argmax(a, axis=axis, keepdim=keepdim)[-1].flatten()\n# return max_, index\ndef gather(tmp: paddle.Tensor, ind: paddle.Tensor):\n shape = tmp.shape\n tmp = paddle.to_tensor(tmp)\n ind = paddle.to_tensor(ind)\n if len(shape) == 2:\n b = shape[0]\n return concat([\n reshape(paddle.gather(tmp[i, :], ind[i, :]), [1, -1])\n for i in range(b)\n ],\n axis=0)\n elif len(shape) == 3:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:683-716" + }, + "731": { + "file_id": 66, + "content": "This code defines a function `max()` that finds the maximum values in a tensor and their corresponding indices. It also includes a `gather()` function that performs tensor gathering based on provided indices. 
The functions use the PaddlePaddle library for Tensor operations.", + "type": "comment" + }, + "732": { + "file_id": 66, + "content": "        out = []\n        for i in range(tmp.shape[0]):\n            _ = paddle.index_sample(tmp[i], ind[i])\n            out.append(_)\n        return paddle.to_tensor(out)\n    elif len(shape) == 4:\n        b, c, d = shape[:3]\n        return concat([\n            reshape(\n                concat([\n                    reshape(\n                        concat([\n                            reshape(\n                                paddle.gather(tmp[i, j, k, :], ind[i, j, k, :]),\n                                [1, -1]) for k in range(d)\n                        ],\n                               axis=0), [1, d, -1]) for j in range(c)\n                ],\n                       axis=0), [1, c, d, -1]) for i in range(b)\n        ],\n                      axis=0)\n    else:\n        pass\n# These no_grad_* functions are necessary as wrappers around the parts of these\n# functions that use `with torch.no_grad()`. The JIT doesn't support context\n# managers, so these need to be implemented as builtins. Using these wrappers\n# lets us keep those builtins small and re-usable.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:717-745" + }, + "733": { + "file_id": 66, + "content": "This function performs sampling and reshaping operations on tensors with different shapes. It handles cases where the input tensor has 3 or 4 dimensions. In case of a 4-dimensional tensor, it uses gather and concat functions to rearrange data according to the given indices. The no_grad_* functions are used as wrappers for parts that require the `torch.no_grad()` context manager due to the JIT's inability to handle context managers directly.", + "type": "comment" + }, + "734": { + "file_id": 66, + "content": "def _no_grad_uniform_(tensor, a, b):\n    with paddle.no_grad():\n        tensor.set_value(paddle.uniform(tensor.shape, min=a, max=b))\n    return tensor\ndef _no_grad_normal_(tensor, mean, std):\n    with paddle.no_grad():\n        tensor.set_value(paddle.normal(shape=tensor.shape, mean=mean, std=std))\n    return tensor\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n    from scipy import special\n    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf\n    def norm_cdf(x):\n        # Computes standard normal cumulative distribution function\n        return (1. + math.erf(x / math.sqrt(2.))) / 2.\n    if (mean < a - 2 * std) or (mean > b + 2 * std):\n        warnings.warn(\n            \"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. \"\n            \"The distribution of values may be incorrect.\",\n            stacklevel=2)\n    with paddle.no_grad():\n        # Values are generated by using a truncated uniform distribution and\n        # then using the inverse CDF for the normal distribution.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:746-774" + }, + "735": { + "file_id": 66, + "content": "This code defines three functions: `_no_grad_uniform_`, `_no_grad_normal_`, and `_no_grad_trunc_normal_`. These functions fill tensors with values from specific distributions while gradient tracking is disabled via `paddle.no_grad()`, which is useful for tensor initialization and randomization tasks. The first function generates uniformly distributed values within a defined range. The second function generates normally distributed values with specified mean and standard deviation. 
The third function generates truncated normal values by drawing from a uniform distribution and applying the inverse normal CDF.", + "type": "comment" + }, + "736": { + "file_id": 66, + "content": "        # Get upper and lower cdf values\n        l = norm_cdf((a - mean) / std)\n        u = norm_cdf((b - mean) / std)\n        # Uniformly fill tensor with values from [l, u], then translate to\n        # [2l-1, 2u-1].\n        tensor.set_value(\n            paddle.uniform(tensor.shape, min=2 * l - 1, max=2 * u - 1))\n        # tensor.uniform_(2 * l - 1, 2 * u - 1)\n        # Use inverse cdf transform for normal distribution to get truncated\n        # standard normal\n        # tensor.erfinv_() # paddle 无\n        tensor.set_value(special.erfinv(tensor))\n        # Transform to proper mean, std\n        # tensor.mul_(std * math.sqrt(2.))\n        tensor.set_value(tensor.multiply(paddle.to_tensor(std * math.sqrt(2.))))\n        tensor.add_(mean)\n        # Clamp to ensure it's in the proper range\n        tensor.clip_(min=a, max=b)\n        return tensor\ndef _no_grad_fill_(tensor, val):\n    with paddle.no_grad():\n        tensor.set_value(paddle.full_like(tensor, fill_value=val))\n    return tensor\ndef _no_grad_zero_(tensor):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:775-806" + }, + "737": { + "file_id": 66, + "content": "This code snippet draws uniform values in a transformed range and then applies the inverse error function to obtain a truncated standard normal, which is then scaled and shifted to the requested mean and standard deviation. It uses the PaddlePaddle (`paddle`) API for its operations. The result is then clamped between a minimum and maximum value, which are defined by the variables 'a' and 'b', respectively. This process ensures that the generated tensor falls within the desired range.", + "type": "comment" + }, + "738": { + "file_id": 66, + "content": "    with paddle.no_grad():\n        tensor.set_value(paddle.zeros_like(tensor))\n    return tensor\ndef calculate_gain(nonlinearity, param=None):\n    r\"\"\"Return the recommended gain value for the given nonlinearity function.\n    The values are as follows:\n    ================= ====================================================\n    nonlinearity      gain\n    ================= ====================================================\n    Linear / Identity :math:`1`\n    Conv{1,2,3}D      :math:`1`\n    Sigmoid           :math:`1`\n    Tanh              :math:`\\frac{5}{3}`\n    ReLU              :math:`\\sqrt{2}`\n    Leaky Relu        :math:`\\sqrt{\\frac{2}{1 + \\text{negative\\_slope}^2}}`\n    SELU              :math:`\\frac{3}{4}`\n    ================= ====================================================\n    Args:\n        nonlinearity: the non-linear function (`nn.functional` name)\n        param: optional parameter for the non-linear function\n    Examples:\n        >>> gain = nn.init.calculate_gain('leaky_relu', 0.2)  # leaky_relu with negative_slope=0.2", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:807-833" + }, + "739": { + "file_id": 66, + "content": "This function calculates the recommended gain value for a given nonlinearity function. The gain values depend on the function used, with different values assigned to functions like Linear/Identity (1), Sigmoid (1), Tanh (5/3), ReLU (sqrt(2)), Leaky Relu (sqrt(2/(1 + negative_slope^2))), SELU (3/4). 
The function takes nonlinearity and optional param as arguments, and returns the gain value.", + "type": "comment" + }, + "740": { + "file_id": 66, + "content": " \"\"\"\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',\n 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n # True/False are instances of int, hence check above\n negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4 # Value found empirically (https://github.com/pytorch/pytorch/pull/50664)\n else:\n raise ValueError(\"Unsupported nonlinearity {}\".format(nonlinearity))", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:834-859" + }, + "741": { + "file_id": 66, + "content": "This code defines a function to map different nonlinearities (e.g., linear, sigmoid, tanh) to corresponding numerical values or exceptions when an unsupported nonlinearity is provided. It handles cases like linear, sigmoid, tanh, relu, leaky_relu, and selu, providing the appropriate values for each.", + "type": "comment" + }, + "742": { + "file_id": 66, + "content": "def uniform_(tensor: Tensor, a: float = 0., b: float = 1.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the uniform\n distribution :math:`\\mathcal{U}(a, b)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the lower bound of the uniform distribution\n b: the upper bound of the uniform distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.uniform_(w)\n \"\"\"\n return _no_grad_uniform_(tensor, a, b)\ndef normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the normal\n distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution\n std: the standard deviation of the normal distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.normal_(w)\n \"\"\"\n return _no_grad_normal_(tensor, mean, std)\ndef trunc_normal_(tensor: Tensor,\n mean: float = 0.,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:862-895" + }, + "743": { + "file_id": 66, + "content": "These code snippets define functions to initialize a tensor with values drawn from a uniform or normal distribution. The `uniform_` function fills the input tensor with values from a uniform distribution, while `normal_` initializes it with values from a normal distribution. These functions can be used for various tasks such as initializing weights in a neural network.", + "type": "comment" + }, + "744": { + "file_id": 66, + "content": " std: float = 1.,\n a: float = -2.,\n b: float = 2.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from a truncated\n normal distribution. The values are effectively drawn from the\n normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`\n with values outside :math:`[a, b]` redrawn until they are within\n the bounds. 
The method used for generating the random values works\n best when :math:`a \\leq \\text{mean} \\leq b`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution\n std: the standard deviation of the normal distribution\n a: the minimum cutoff value\n b: the maximum cutoff value\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.trunc_normal_(w)\n \"\"\"\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef constant_(tensor: Tensor, val: float) -> Tensor:\n r\"\"\"Fills the input Tensor with the value :math:`\\text{val}`.\n Args:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:896-923" + }, + "745": { + "file_id": 66, + "content": "This code snippet defines a function `trunc_normal_` that fills the input Tensor with values drawn from a truncated normal distribution. The values are within the range [a, b] and the method works best when a <= mean <= b. Additionally, it includes a separate function `constant_` which fills the input Tensor with a constant value val.", + "type": "comment" + }, + "746": { + "file_id": 66, + "content": " tensor: an n-dimensional `torch.Tensor`\n val: the value to fill the tensor with\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.constant_(w, 0.3)\n \"\"\"\n return _no_grad_fill_(tensor, val)\ndef ones_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `1`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.ones_(w)\n \"\"\"\n return _no_grad_fill_(tensor, 1.)\ndef zeros_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `0`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.zeros_(w)\n \"\"\"\n return _no_grad_zero_(tensor)\ndef eye_(tensor):\n r\"\"\"Fills the 2-dimensional input `Tensor` with the identity\n matrix. Preserves the identity of the inputs in `Linear` layers, where as\n many inputs are preserved as possible.\n Args:\n tensor: a 2-dimensional `torch.Tensor`", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:924-966" + }, + "747": { + "file_id": 66, + "content": "These methods fill a tensor with specific values or an identity matrix for Linear layers. The `constant_()`, `ones_()`, and `zeros_()` functions fill the input tensor with constant, ones, or zeros respectively. The `eye_()` function fills a 2-dimensional tensor with an identity matrix while preserving the identities of inputs in Linear layers.", + "type": "comment" + }, + "748": { + "file_id": 66, + "content": " Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.eye_(w)\n \"\"\"\n if tensor.ndimension() != 2:\n raise ValueError(\"Only tensors with 2 dimensions are supported\")\n with paddle.no_grad():\n tensor.set_value(paddle.eye(*tensor.shape))\n return tensor\ndef dirac_(tensor, groups=1):\n r\"\"\"Fills the {3, 4, 5}-dimensional input `Tensor` with the Dirac\n delta function. Preserves the identity of the inputs in `Convolutional`\n layers, where as many input channels are preserved as possible. 
In case\n of groups>1, each group of channels preserves identity\n Args:\n tensor: a {3, 4, 5}-dimensional `torch.Tensor`\n groups (optional): number of groups in the conv layer (default: 1)\n Examples:\n >>> w = torch.empty(3, 16, 5, 5)\n >>> nn.init.dirac_(w)\n >>> w = torch.empty(3, 24, 5, 5)\n >>> nn.init.dirac_(w, 3)\n \"\"\"\n dimensions = tensor.ndimension()\n if dimensions not in [3, 4, 5]:\n raise ValueError(\n \"Only tensors with 3, 4, or 5 dimensions are supported\")", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:968-998" + }, + "749": { + "file_id": 66, + "content": "The provided code defines two functions, `eye_` and `dirac_`, that initialize a tensor with specific values. The `eye_` function fills the 2D tensor with an identity matrix, while the `dirac_` function fills a 3D, 4D or 5D tensor with Dirac delta functions. It also takes an optional argument for groups in case of Convolutional layers. Both functions require the input tensor to have specific dimensions and raise a ValueError if not satisfied.", + "type": "comment" + }, + "750": { + "file_id": 66, + "content": " sizes = tensor.shape\n if sizes[0] % groups != 0:\n raise ValueError('dim 0 must be divisible by groups')\n out_chans_per_grp = sizes[0] // groups\n min_dim = min(out_chans_per_grp, sizes[1])\n with paddle.no_grad():\n tensor.zero_()\n for g in range(groups):\n for d in range(min_dim):\n if dimensions == 3: # Temporal convolution\n tensor[g * out_chans_per_grp + d, d,\n tensor.shape[2] // 2] = 1\n elif dimensions == 4: # Spatial convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2,\n tensor.shape[3] // 2] = 1\n else: # Volumetric convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2,\n tensor.shape[3] // 2, tensor.shape[4] // 2] = 1\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1000-1028" + }, + "751": { + "file_id": 66, + "content": "This function initializes a tensor with ones in specific positions based on the provided dimensions (3 for temporal convolution, 4 for spatial convolution, and 5 for volumetric convolution). It checks if dim 0 is divisible by groups and raises an error if not. Then it calculates out_chans_per_grp and min_dim, and finally initializes the tensor using no_grad context manager.", + "type": "comment" + }, + "752": { + "file_id": 66, + "content": " \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"\n )\n num_input_fmaps = tensor.shape[1] # .size(1)\n num_output_fmaps = tensor.shape[0] # .size(0)\n receptive_field_size = 1\n if tensor.dim() > 2:\n for s in tensor.shape[2:]:\n receptive_field_size *= s # fixed\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef LongTensor(x):\n return paddle.to_tensor(x, dtype='int64')\ndef IntTensor(x):\n return paddle.to_tensor(x, dtype='int32')\ndef xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor:\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Understanding the difficulty of training deep feedforward\n neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform\n distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{U}(-a, a)` where\n .. 
math::\n        a = \\text{gain} \\times \\sqrt{\\frac{6}{\\text{fan\\_in} + \\text{fan\\_out}}}", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1029-1060" + }, + "753": { + "file_id": 66, + "content": "Function to calculate fan_in and fan_out for tensors with at least 2 dimensions, compute the uniform bound for Xavier initialization, and fill the input Tensor with values from a uniform distribution according to the Glorot & Bengio (2010) method.", + "type": "comment" + }, + "754": { + "file_id": 66, + "content": "    Also known as Glorot initialization.\n    Args:\n        tensor: an n-dimensional `torch.Tensor`\n        gain: an optional scaling factor\n    Examples:\n        >>> w = torch.empty(3, 5)\n        >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))\n    \"\"\"\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n    a = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation\n    return _no_grad_uniform_(tensor, -a, a)\ndef xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor:\n    r\"\"\"Fills the input `Tensor` with values according to the method\n    described in `Understanding the difficulty of training deep feedforward\n    neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal\n    distribution. The resulting tensor will have values sampled from\n    :math:`\\mathcal{N}(0, \\text{std}^2)` where\n    .. math::\n        \\text{std} = \\text{gain} \\times \\sqrt{\\frac{2}{\\text{fan\\_in} + \\text{fan\\_out}}}\n    Also known as Glorot initialization.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1062-1089" + }, + "755": { + "file_id": 66, + "content": "This code snippet is for initializing a Tensor with values from a normal distribution, using the Xavier/Glorot initialization method. It calculates the standard deviation based on the fan-in and fan-out of the tensor and uses it to fill the tensor with values drawn from a zero-mean normal distribution.", + "type": "comment" + }, + "756": { + "file_id": 66, + "content": "    Args:\n        tensor: an n-dimensional `torch.Tensor`\n        gain: an optional scaling factor\n    Examples:\n        >>> w = torch.empty(3, 5)\n        >>> nn.init.xavier_normal_(w)\n    \"\"\"\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n    return _no_grad_normal_(tensor, 0., std)\ndef _calculate_correct_fan(tensor, mode):\n    mode = mode.lower()\n    valid_modes = ['fan_in', 'fan_out']\n    if mode not in valid_modes:\n        raise ValueError(\"Mode {} not supported, please use one of {}\".format(\n            mode, valid_modes))\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n    return fan_in if mode == 'fan_in' else fan_out\ndef kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n    r\"\"\"Fills the input `Tensor` with values according to the method\n    described in `Delving deep into rectifiers: Surpassing human-level\n    performance on ImageNet classification` - He, K. et al. (2015), using a\n    uniform distribution. The resulting tensor will have values sampled from", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1091-1120" + }, + "757": { + "file_id": 66, + "content": "This code snippet mirrors PyTorch's `nn.init` utilities, re-implemented with PaddlePaddle, and provides functions for Xavier normal initialization. It also includes the `kaiming_uniform_` function, which takes a tensor, a negative-slope parameter `a`, and options for fan-in/fan-out mode and nonlinearity. 
The `_calculate_fan_in_and_fan_out` and `_calculate_correct_fan` functions help calculate the appropriate fan values based on input arguments.", + "type": "comment" + }, + "758": { + "file_id": 66, + "content": " :math:`\\mathcal{U}(-\\text{bound}, \\text{bound})` where\n .. math::\n \\text{bound} = \\text{gain} \\times \\sqrt{\\frac{3}{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the\n forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the\n backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1121-1145" + }, + "759": { + "file_id": 66, + "content": "This function initializes a tensor with Kaiming uniform initialization, setting the standard deviation of the normal distribution to be equal to gain multiplied by the square root of 3 divided by fan mode. It's used for He initialization and applies nonlinearity based on the specified parameter.", + "type": "comment" + }, + "760": { + "file_id": 66, + "content": " bound = math.sqrt(\n 3.0) * std # Calculate uniform bounds from standard deviation\n with paddle.no_grad():\n tensor.set_value(paddle.uniform(tensor.shape, min=-bound, max=bound))\n return tensor\ndef kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Delving deep into rectifiers: Surpassing human-level\n performance on ImageNet classification` - He, K. et al. (2015), using a\n normal distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{N}(0, \\text{std}^2)` where\n .. math::\n \\text{std} = \\frac{\\text{gain}}{\\sqrt{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1146-1170" + }, + "761": { + "file_id": 66, + "content": "This code initializes a tensor using the Kaiming normal method. It fills the input tensor with values sampled from a normal distribution, where std is calculated based on gain and fan_mode (fan_in by default). This initialization method is often used in neural networks to improve performance.", + "type": "comment" + }, + "762": { + "file_id": 66, + "content": " forward pass. 
Choosing ``'fan_out'`` preserves the magnitudes in the\n backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> kaiming_normal_(w, mode='fan_out', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():\n tensor.set_value(paddle.normal(shape=tensor.shape, mean=0, std=std))\n return tensor\ndef orthogonal_(tensor, gain=1):\n r\"\"\"Fills the input `Tensor` with a (semi) orthogonal matrix, as\n described in `Exact solutions to the nonlinear dynamics of learning in deep\n linear neural networks` - Saxe, A. et al. (2013). The input tensor must have\n at least 2 dimensions, and for tensors with more than 2 dimensions the\n trailing dimensions are flattened.\n Args:\n tensor: an n-dimensional `torch.Tensor`, where :math:`n \\geq 2`", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1171-1196" + }, + "763": { + "file_id": 66, + "content": "These functions fill a tensor with either a (semi) orthogonal matrix or initialize weights using Kaiming normal distribution. The 'fan_out' and 'nonlinearity' parameters are used for the initialization process. These functions are inspired by research papers, one focusing on orthogonal matrices in deep linear neural networks and another on Kaiming normal distribution for weight initialization.", + "type": "comment" + }, + "764": { + "file_id": 66, + "content": " gain: optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.orthogonal_(w)\n \"\"\"\n if tensor.ndimension() < 2:\n raise ValueError(\"Only tensors with 2 or more dimensions are supported\")\n rows = tensor.shape[0] # .size(0)\n cols = tensor.numel() // rows\n flattened = tensor.new(rows, cols).normal_(0, 1)\n if rows < cols:\n flattened.t_()\n # Compute the qr factorization\n q, r = paddle.to_tensor(np.linalg.qr(flattened.numpy()))\n # q, r = torch.qr(flattened)\n # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf\n d = paddle.diag(r, 0)\n ph = d.sign()\n q *= ph\n if rows < cols:\n q.t_()\n with paddle.no_grad():\n tensor.view_as(q).copy_(q)\n tensor.mul_(gain)\n return tensor\ndef sparse_(tensor, sparsity, std=0.01):\n r\"\"\"Fills the 2D input `Tensor` as a sparse matrix, where the\n non-zero elements will be drawn from the normal distribution\n :math:`\\mathcal{N}(0, 0.01)`, as described in `Deep learning via", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1197-1233" + }, + "765": { + "file_id": 66, + "content": "This function initializes a 2D tensor with values drawn from the standard normal distribution, ensuring that at least a certain sparsity level is maintained. It uses QR factorization and scales the result by a given gain factor if specified.", + "type": "comment" + }, + "766": { + "file_id": 66, + "content": " Hessian-free optimization` - Martens, J. 
(2010).\n Args:\n tensor: an n-dimensional `torch.Tensor`\n sparsity: The fraction of elements in each column to be set to zero\n std: the standard deviation of the normal distribution used to generate\n the non-zero values\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.sparse_(w, sparsity=0.1)\n \"\"\"\n if tensor.ndimension() != 2:\n raise ValueError(\"Only tensors with 2 dimensions are supported\")\n rows, cols = tensor.shape\n num_zeros = int(math.ceil(sparsity * rows))\n with paddle.no_grad():\n tensor.normal_(0, std)\n for col_idx in range(cols):\n row_indices = paddle.randperm(rows)\n zero_indices = row_indices[:num_zeros]\n tensor[zero_indices, col_idx] = 0\n return tensor\n# for backward compatibility\ndef _make_deprecate(meth):\n new_name = meth.__name__\n old_name = new_name[:-1]\n def deprecated_init(*args, **kwargs):\n warnings.warn(\n \"nn.init.{} is now deprecated in favor of nn.init.{}.\".format(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1234-1268" + }, + "767": { + "file_id": 66, + "content": "This code initializes a 2D torch.Tensor with a specified sparsity and standard deviation by setting some elements to zero while keeping others non-zero. It checks for tensor dimensions, normalizes values, assigns zeroes based on the input sparsity, and is compatible with both PyTorch and PaddlePaddle.", + "type": "comment" + }, + "768": { + "file_id": 66, + "content": " old_name, new_name),\n stacklevel=2)\n return meth(*args, **kwargs)\n deprecated_init.__doc__ = r\"\"\"\n {old_name}(...)\n .. warning::\n This method is now deprecated in favor of :func:`torch.nn.init.{new_name}`.\n See :func:`~torch.nn.init.{new_name}` for details.\"\"\".format(\n old_name=old_name, new_name=new_name)\n deprecated_init.__name__ = old_name\n return deprecated_init\n# uniform = _make_deprecate(uniform_)\n# normal = _make_deprecate(normal_)\n# constant = _make_deprecate(constant_)\n# eye = _make_deprecate(eye_)\n# dirac = _make_deprecate(dirac_)\n# xavier_uniform = _make_deprecate(xavier_uniform_)\n# xavier_normal = _make_deprecate(xavier_normal_)\n# kaiming_uniform = _make_deprecate(kaiming_uniform_)\n# kaiming_normal = _make_deprecate(kaiming_normal_)\n# orthogonal = _make_deprecate(orthogonal_)\n# sparse = _make_deprecate(sparse_)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1269-1295" + }, + "769": { + "file_id": 66, + "content": "This code defines several deprecated initialization methods and creates their corresponding non-deprecated alternatives. The _make_deprecate function wraps the old functions with a warning that they are deprecated in favor of new Torch.nn.init functions, redirecting users to the new functions for more information.", + "type": "comment" + }, + "770": { + "file_id": 67, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py", + "type": "filepath" + }, + "771": { + "file_id": 67, + "content": "The precise_bn.py file in PaddlePaddle's EIVideo module contains a function called do_preciseBN, which recomputes batch normalization stats for improved accuracy by running the model multiple times with input data from the data_loader, updating BN layers with running averages for normalization.", + "type": "summary" + }, + "772": { + "file_id": 67, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport itertools\nfrom EIVideo.paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n\"\"\"\nImplement precise bn, which is useful for improving accuracy.\n\"\"\"\n@paddle.no_grad() # speed up and save CUDA memory\ndef do_preciseBN(model, data_loader, parallel, num_iters=200):\n \"\"\"\n Recompute and update the batch norm stats to make them more precise. During\n training both BN stats and the weight are changing after every iteration, so", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:1-30" + }, + "773": { + "file_id": 67, + "content": "The code provided is part of the PaddlePaddle framework for video applications, specifically the EIVideo module. This precise_bn.py file contains a function called do_preciseBN that recomputes and updates batch norm stats to improve accuracy. It does so by running the model with input data from the data_loader multiple times (num_iters) to make BN statistics more precise. The code also includes an import for paddle, itertools, and EIVideo's paddlevideo module.", + "type": "comment" + }, + "774": { + "file_id": 67, + "content": " the running average can not precisely reflect the actual stats of the\n current model.\n In this function, the BN stats are recomputed with fixed weights, to make\n the running average more precise. Specifically, it computes the true average\n of per-batch mean/variance instead of the running average.\n This is useful to improve validation accuracy.\n Args:\n model: the model whose bn stats will be recomputed\n data_loader: an iterator. Produce data as input to the model\n num_iters: number of iterations to compute the stats.\n Return:\n the model with precise mean and variance in bn layers.\n \"\"\"\n bn_layers_list = [\n m for m in model.sublayers()\n if any((isinstance(m, bn_type)\n for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D,\n paddle.nn.BatchNorm3D))) and m.training\n ]\n if len(bn_layers_list) == 0:\n return\n # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum)\n # we set momentum=0. to get the true mean and variance during forward", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:31-54" + }, + "775": { + "file_id": 67, + "content": "This code recomputes the batch normalization (BN) statistics with fixed weights for a given model, improving validation accuracy. It computes true average of per-batch mean/variance instead of running average. 
The function collects the model's BatchNorm1D/2D/3D sublayers that are in training mode and returns early if no such layers are found.", + "type": "comment" + }, + "776": { + "file_id": 67, + "content": " momentum_actual = [bn._momentum for bn in bn_layers_list]\n for bn in bn_layers_list:\n bn._momentum = 0.\n running_mean = [paddle.zeros_like(bn._mean)\n for bn in bn_layers_list] #pre-ignore\n running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list]\n ind = -1\n for ind, data in enumerate(itertools.islice(data_loader, num_iters)):\n logger.info(\"doing precise BN {} / {}...\".format(ind + 1, num_iters))\n if parallel:\n model._layers.train_step(data)\n else:\n model.train_step(data)\n for i, bn in enumerate(bn_layers_list):\n # Accumulates the bn stats.\n running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1)\n running_var[i] += (bn._variance - running_var[i]) / (ind + 1)\n assert ind == num_iters - 1, (\n \"update_bn_stats is meant to run for {} iterations, but the batch_sampler stops at {} iterations.\"\n .format(num_iters, ind))\n # Sets the precise bn stats.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:55-80" + }, + "777": { + "file_id": 67, + "content": "This code initializes the momentum of batch normalization (BN) layers to 0 and creates lists for running mean and variance. It then trains a model for a specified number of iterations, updating the BN statistics by accumulating the difference between current and running mean/variance. Finally, it asserts that the correct number of iterations were performed.", + "type": "comment" + }, + "778": { + "file_id": 67, + "content": " for i, bn in enumerate(bn_layers_list):\n bn._mean.set_value(running_mean[i])\n bn._variance.set_value(running_var[i])\n bn._momentum = momentum_actual[i]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:81-84" + }, + "779": { + "file_id": 67, + "content": "This code is iterating through a list of batch normalization (BN) layers, setting their mean and variance values from a separate list, and updating their momentum value. This could be part of a model's training process where it updates the BN layers with running averages for normalization.", + "type": "comment" + }, + "780": { + "file_id": 68, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py", + "type": "filepath" + }, + "781": { + "file_id": 68, + "content": "This code initializes global profiling variables and defines the ProfilerOptions class for operator-level timing using PaddlePaddle's profiler. It also stops the profiler, checks for exit conditions, and increments _profiler_step_id.", + "type": "summary" + }, + "782": { + "file_id": 68, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport paddle\n# A global variable to record the number of calling times for profiler\n# functions. 
It is used to specify the tracing range of training steps.\n_profiler_step_id = 0\n# A global variable to avoid parsing from string every time.\n_profiler_options = None\nclass ProfilerOptions(object):\n \"\"\"\n Use a string to initialize a ProfilerOptions.\n The string should be in the format: \"key1=value1;key2=value;key3=value3\".", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:1-29" + }, + "783": { + "file_id": 68, + "content": "This code is setting up a global variable to record the number of calling times for profiler functions and another global variable to avoid parsing from string every time. It also defines the ProfilerOptions class, which can be initialized using a string in the format \"key1=value1;key2=value;key3=value3\". This indicates that the code is part of PaddleVideo's EIVideo application and is related to profiling options and step ID management.", + "type": "comment" + }, + "784": { + "file_id": 68, + "content": " For example:\n \"profile_path=model.profile\"\n \"batch_range=[50, 60]; profile_path=model.profile\"\n \"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile\"\n ProfilerOptions supports following key-value pair:\n batch_range - a integer list, e.g. [100, 110].\n state - a string, the optional values are 'CPU', 'GPU' or 'All'.\n sorted_key - a string, the optional values are 'calls', 'total',\n 'max', 'min' or 'ave.\n tracer_option - a string, the optional values are 'Default', 'OpDetail',\n 'AllOpDetail'.\n profile_path - a string, the path to save the serialized profile data,\n which can be used to generate a timeline.\n exit_on_finished - a boolean.\n \"\"\"\n def __init__(self, options_str):\n assert isinstance(options_str, str)\n self._options = {\n 'batch_range': [10, 20],\n 'state': 'All',\n 'sorted_key': 'total',", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:30-52" + }, + "785": { + "file_id": 68, + "content": "The code defines a class \"ProfilerOptions\" which takes in an options string and initializes its attributes. Options can include batch_range, state (CPU/GPU/All), sorted_key (calls/total/max/min/ave), tracer_option (Default/OpDetail/AllOpDetail), profile_path for storing serialized data, and exit_on_finished boolean flag.", + "type": "comment" + }, + "786": { + "file_id": 68, + "content": " 'tracer_option': 'Default',\n 'profile_path': '/tmp/profile',\n 'exit_on_finished': True\n }\n self._parse_from_string(options_str)\n def _parse_from_string(self, options_str):\n for kv in options_str.replace(' ', '').split(';'):\n key, value = kv.split('=')\n if key == 'batch_range':\n value_list = value.replace('[', '').replace(']', '').split(',')\n value_list = list(map(int, value_list))\n if len(value_list) >= 2 and value_list[0] >= 0 and value_list[\n 1] > value_list[0]:\n self._options[key] = value_list\n elif key == 'exit_on_finished':\n self._options[key] = value.lower() in (\"yes\", \"true\", \"t\", \"1\")\n elif key in [\n 'state', 'sorted_key', 'tracer_option', 'profile_path'\n ]:\n self._options[key] = value\n def __getitem__(self, name):\n if self._options.get(name, None) is None:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:53-76" + }, + "787": { + "file_id": 68, + "content": "Class for parsing profile options from a string. It stores the batch range, tracer option, exit on finished status, state, sorted key, and profile path as options. 
The _parse_from_string function sets the values based on specific conditions: if the 'batch_range' is valid, 'exit_on_finished' is set to True if the value matches \"yes\", \"true\", \"t\", or \"1\", and other options are directly assigned from the string. Looking up an option that has no value raises a ValueError in __getitem__ rather than returning None.", + "type": "comment" + }, + "788": { + "file_id": 68, + "content": " raise ValueError(\n \"ProfilerOptions does not have an option named %s.\" % name)\n return self._options[name]\ndef add_profiler_step(options_str=None):\n \"\"\"\n Enable the operator-level timing using PaddlePaddle's profiler.\n The profiler uses a independent variable to count the profiler steps.\n One call of this function is treated as a profiler step.\n Args:\n profiler_options - a string to initialize the ProfilerOptions.\n Default is None, and the profiler is disabled.\n \"\"\"\n if options_str is None:\n return\n global _profiler_step_id\n global _profiler_options\n if _profiler_options is None:\n _profiler_options = ProfilerOptions(options_str)\n if _profiler_step_id == _profiler_options['batch_range'][0]:\n paddle.utils.profiler.start_profiler(_profiler_options['state'],\n _profiler_options['tracer_option'])\n elif _profiler_step_id == _profiler_options['batch_range'][1]:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:77-104" + }, + "789": { + "file_id": 68, + "content": "This function enables the operator-level timing using PaddlePaddle's profiler. It initializes the ProfilerOptions with a provided string and increments the global profiler step id. If the current step matches the start or end of the batch range in the options, it starts or stops the profiler respectively.", + "type": "comment" + }, + "790": { + "file_id": 68, + "content": " paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],\n _profiler_options['profile_path'])\n if _profiler_options['exit_on_finished']:\n sys.exit(0)\n _profiler_step_id += 1", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:105-110" + }, + "791": { + "file_id": 68, + "content": "This code snippet stops the profiler, checks if it should exit on finished, and increments _profiler_step_id.", + "type": "comment" + }, + "792": { + "file_id": 69, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py", + "type": "filepath" + }, + "793": { + "file_id": 69, + "content": "This code initializes a logger, defines functions for logging metrics such as loss and learning rate during training. It also creates a class for tracking various metrics with update method and logs batch metrics at specified batch IDs in log_batch function, while formatting log string with colors for clarity in video processing tasks.", + "type": "summary" + }, + "794": { + "file_id": 69, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections import OrderedDict\nimport paddle\nfrom .logger import coloring, get_logger\nlogger = get_logger(\"paddlevideo\")\n__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch']\ndef build_record(cfg):\n framework_type = cfg.get('framework', '')\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in framework_type: #TODO: required specify str in framework", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:1-32" + }, + "795": { + "file_id": 69, + "content": "This code imports necessary libraries and defines functions for logging metrics such as loss, learning rate during training. It also initializes a logger with the name \"paddlevideo\" and specifies the available classes or functions that can be accessed from this file. The build_record function takes a configuration file and creates an ordered dictionary of metrics to record based on the framework type specified in the configuration file.", + "type": "comment" + }, + "796": { + "file_id": 69, + "content": " record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))\n record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n elif 'Recognizer' in framework_type:\n record_list.append((\"top1\", AverageMeter(\"top1\", '.5f')))\n record_list.append((\"top5\", AverageMeter(\"top5\", '.5f')))\n elif 'FastRCNN' in framework_type:\n record_list.append(\n (\"recall@thr=0.5\", AverageMeter(\"recall@thr=0.5\", '.5f')))\n record_list.append(\n (\"prec@thr=0.5\", AverageMeter(\"prec@thr=0.5\", '.5f')))\n record_list.append((\"recall@top3\", AverageMeter(\"recall@top3\", '.5f')))\n record_list.append((\"prec@top3\", AverageMeter(\"prec@top3\", '.5f')))\n record_list.append((\"recall@top5\", AverageMeter(\"recall@top5\", '.5f')))\n record_list.append((\"prec@top5\", AverageMeter(\"prec@top5\", '.5f')))\n record_list.append((\"mAP@0.5IOU\", AverageMeter(\"mAP@0.5IOU\", '.5f')))\n elif 'DepthEstimator' in cfg.framework:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:33-49" + }, + "797": { + "file_id": 69, + "content": "This code is part of a function that handles recording different metrics for various framework types. It appends specific metric names and instances of the AverageMeter class to the record_list depending on the framework type. If 'Recognizer' is in the framework type, it records 'top1' and 'top5' metrics. If 'FastRCNN' is present, it records a series of recall and precision metrics along with mAP@0.5IOU. 
The function continues with more conditions for different framework types after this snippet.", + "type": "comment" + }, + "798": { + "file_id": 69, + "content": " record_list.append((\"abs_rel\", AverageMeter(\"abs_rel\", '.5f')))\n record_list.append((\"sq_rel\", AverageMeter(\"sq_rel\", '.5f')))\n record_list.append((\"rmse\", AverageMeter(\"rmse\", '.5f')))\n record_list.append((\"rmse_log\", AverageMeter(\"rmse_log\", '.5f')))\n record_list.append((\"a1\", AverageMeter(\"a1\", '.5f')))\n record_list.append((\"a2\", AverageMeter(\"a2\", '.5f')))\n record_list.append((\"a3\", AverageMeter(\"a3\", '.5f')))\n record_list.append((\"losses_day\", AverageMeter(\"losses_day\", '.5f')))\n record_list.append(\n (\"losses_night\", AverageMeter(\"losses_night\", '.5f')))\n record_list.append((\"batch_time\", AverageMeter('batch_cost', '.5f')))\n record_list.append((\"reader_time\", AverageMeter('reader_cost', '.5f')))\n record_list = OrderedDict(record_list)\n return record_list\nclass AverageMeter(object):\n \"\"\"\n Computes and stores the average and current value\n \"\"\"\n def __init__(self, name='', fmt='f', need_avg=True):\n self.name = name", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:50-72" + }, + "799": { + "file_id": 69, + "content": "This code defines a list of metrics to be tracked and an AverageMeter class that computes and stores the average and current value for each metric. The list includes various metrics like \"abs_rel\", \"sq_rel\", \"rmse\", \"rmse_log\", \"a1\", \"a2\", \"a3\", \"losses_day\", \"losses_night\", \"batch_time\", and \"reader_time\". The list is then converted to an OrderedDict.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/70.json b/docs/data/70.json new file mode 100644 index 000000000..eefb779eb --- /dev/null +++ b/docs/data/70.json @@ -0,0 +1,550 @@ +{ + "7000": { + "file_id": 514, + "content": " cnt = 0\n for d in slice(-window_size[0]), slice(-window_size[0],\n -shift_size[0]), slice(\n -shift_size[0], None):\n for h in slice(-window_size[1]), slice(-window_size[1],\n -shift_size[1]), slice(\n -shift_size[1], None):\n for w in slice(-window_size[2]), slice(-window_size[2],\n -shift_size[2]), slice(\n -shift_size[2], None):\n img_mask[:, d, h, w, :] = cnt\n cnt += 1\n mask_windows = window_partition(img_mask,\n window_size) # nW, ws[0]*ws[1]*ws[2], 1\n mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2]\n attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)\n # attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:427-443" + }, + "7001": { + "file_id": 514, + "content": "This code generates an attention mask for a Swin Transformer model. It iterates through various dimensions (d, h, w) within the window size and shift size, assigning incremental values to each position in the img_mask tensor. The resulting img_mask is then partitioned into non-overlapping windows and squeezed along the last dimension to create mask_windows. 
Finally, attn_mask is created by subtracting the expanded version of mask_windows from itself, effectively creating a binary mask where values are either 0 or -100.", + "type": "comment" + }, + "7002": { + "file_id": 514, + "content": " huns = -100.0 * paddle.ones_like(attn_mask)\n attn_mask = huns * (attn_mask != 0).astype(\"float32\")\n return attn_mask\nclass BasicLayer(nn.Layer):\n \"\"\" A basic Swin Transformer layer for one stage.\n Args:\n dim (int): Number of feature channels\n depth (int): Depths of this stage.\n num_heads (int): Number of attention head.\n window_size (tuple[int]): Local window size. Default: (1,7,7).\n mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n drop (float, optional): Dropout rate. Default: 0.0\n attn_drop (float, optional): Attention dropout rate. Default: 0.0\n drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0\n norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm\n ", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:444-464" + }, + "7003": { + "file_id": 514, + "content": "The code defines a Swin Transformer layer for one stage in a neural network. The BasicLayer class takes various arguments such as feature channel dimensions, depth, number of heads, local window size, etc. It also includes an MLP (Multi-Layer Perceptron) with a specified ratio, and provides options to add learnable bias, scale factors, dropout rates, stochastic depth rate, and a normalization layer for each input. This basic layer can be utilized in the Swin Transformer architecture for feature extraction and classification tasks.", + "type": "comment" + }, + "7004": { + "file_id": 514, + "content": " downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None\n \"\"\"\n def __init__(self,\n dim,\n depth,\n num_heads,\n window_size=(1, 7, 7),\n mlp_ratio=4.,\n qkv_bias=False,\n qk_scale=None,\n drop=0.,\n attn_drop=0.,\n drop_path=0.,\n norm_layer=nn.LayerNorm,\n downsample=None,\n use_checkpoint=False):\n super().__init__()\n self.window_size = window_size\n self.shift_size = tuple(i // 2 for i in window_size)\n self.depth = depth\n self.use_checkpoint = use_checkpoint\n # build blocks\n self.blocks = nn.LayerList([\n SwinTransformerBlock3D(\n dim=dim,\n num_heads=num_heads,\n window_size=window_size,\n shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size,\n mlp_ratio=mlp_ratio,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:464-493" + }, + "7005": { + "file_id": 514, + "content": "This code defines a 3D Swin Transformer block with optional downsampling layer at the end. It takes parameters such as dim, depth, num_heads, window size, mlp ratio, etc., and initializes an instance of the class SwinTransformerBlock3D for each block in a LayerList. 
The window size is set to (1, 7, 7) by default and the shift size is determined based on whether the current index is even or odd.", + "type": "comment" + }, + "7006": { + "file_id": 514, + "content": " qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop,\n attn_drop=attn_drop,\n drop_path=drop_path[i]\n if isinstance(drop_path, list) else drop_path,\n norm_layer=norm_layer,\n use_checkpoint=use_checkpoint,\n ) for i in range(depth)\n ])\n self.downsample = downsample\n if self.downsample is not None:\n self.downsample = downsample(dim=dim, norm_layer=norm_layer)\n def forward(self, x):\n \"\"\" Forward function.\n Args:\n x: Input feature, tensor size (B, C, D, H, W).\n \"\"\"\n # calculate attention mask for SW-MSA\n B = paddle.shape(x)[0]\n _, C, D, H, W = x.shape\n window_size, shift_size = get_window_size((D, H, W), self.window_size,\n self.shift_size)\n # x = rearrange(x, 'b c d h w -> b d h w c')\n x = x.transpose([0, 2, 3, 4, 1])\n Dp = int(np.ceil(D / window_size[0])) * window_size[0]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:494-522" + }, + "7007": { + "file_id": 514, + "content": "This code defines a Swin Transformer block for the PaddleVideo library. It takes input dimensions and creates multiple linear layers for self-attention, followed by a downsampling operation if needed. The forward function calculates an attention mask based on window size and shifts before rearranging the input tensor.", + "type": "comment" + }, + "7008": { + "file_id": 514, + "content": " Hp = int(np.ceil(H / window_size[1])) * window_size[1]\n Wp = int(np.ceil(W / window_size[2])) * window_size[2]\n attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size)\n for blk in self.blocks:\n x = blk(x, attn_mask)\n x = x.reshape([B, D, H, W, C])\n if self.downsample is not None:\n x = self.downsample(x)\n x = x.transpose([0, 4, 1, 2, 3])\n return x\nclass PatchEmbed3D(nn.Layer):\n \"\"\" Video to Patch Embedding.\n Args:\n patch_size (int): Patch token size. Default: (2,4,4).\n in_chans (int): Number of input video channels. Default: 3.\n embed_dim (int): Number of linear projection output channels. Default: 96.\n norm_layer (nn.Layer, optional): Normalization layer. Default: None\n \"\"\"\n def __init__(self,\n patch_size=(2, 4, 4),\n in_chans=3,\n embed_dim=96,\n norm_layer=None):\n super().__init__()\n self.patch_size = patch_size", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:523-551" + }, + "7009": { + "file_id": 514, + "content": "This code implements a PatchEmbed3D class, which embeds input video frames into patches for use in the Swin Transformer model. It takes the input video frames, divides them into non-overlapping patches, and performs linear projections on the patches to obtain embeddings. 
The patch size, number of input channels, and embedding dimension are configurable parameters.", + "type": "comment" + }, + "7010": { + "file_id": 514, + "content": " self.in_chans = in_chans\n self.embed_dim = embed_dim\n self.proj = nn.Conv3D(in_chans,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n if norm_layer is not None:\n self.norm = norm_layer(embed_dim)\n else:\n self.norm = None\n def forward(self, x):\n _, _, D, H, W = x.shape\n if W % self.patch_size[2] != 0:\n x = F.pad(\n x, (0, self.patch_size[2] - W % self.patch_size[2], 0, 0, 0, 0),\n data_format='NCDHW')\n if H % self.patch_size[1] != 0:\n x = F.pad(\n x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1], 0, 0),\n data_format='NCDHW')\n if D % self.patch_size[0] != 0:\n x = F.pad(\n x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]),\n data_format='NCDHW')\n x = self.proj(x) # B C D Wh Ww\n if self.norm is not None:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:553-581" + }, + "7011": { + "file_id": 514, + "content": "This code is for the Swin Transformer backbone in PaddleVideo. It initializes the module with input channels (in_chans), embed dim, and patch size. If a norm layer is provided, it also initializes the normalization layer (norm). The forward function pads the input according to the dimensions and applies a convolution operation for feature extraction. If a normalization layer was initialized, it performs normalization on the features before returning them.", + "type": "comment" + }, + "7012": { + "file_id": 514, + "content": " D, Wh, Ww = x.shape[2], x.shape[3], x.shape[4]\n x = x.flatten(2).transpose([0, 2, 1])\n x = self.norm(x)\n x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, D, Wh, Ww])\n return x\n@BACKBONES.register()\nclass SwinTransformer3D(nn.Layer):\n \"\"\" Swin Transformer backbone.\n A Paddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -\n https://arxiv.org/pdf/2103.14030\n Args:\n patch_size (int | tuple(int)): Patch size. Default: (4,4,4).\n in_chans (int): Number of input image channels. Default: 3.\n embed_dim (int): Number of linear projection output channels. Default: 96.\n depths (tuple[int]): Depths of each Swin Transformer stage.\n num_heads (tuple[int]): Number of attention head of each stage.\n window_size (int): Window size. Default: 7.\n mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:582-604" + }, + "7013": { + "file_id": 514, + "content": "This code defines the Swin Transformer 3D backbone for Paddle Video. It takes an input tensor and performs normalization, transposition, and reshaping operations before returning the processed tensor. The class also registers with BACKBONES to be recognized as a valid backbone model.", + "type": "comment" + }, + "7014": { + "file_id": 514, + "content": " qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.\n drop_rate (float): Dropout rate.\n attn_drop_rate (float): Attention dropout rate. Default: 0.\n drop_path_rate (float): Stochastic depth rate. Default: 0.2.\n norm_layer: Normalization layer. Default: nn.LayerNorm.\n patch_norm (bool): If True, add normalization after patch embedding. 
Default: False.\n frozen_stages (int): Stages to be frozen (stop grad and set eval mode).\n -1 means not freezing any parameters.\n \"\"\"\n def __init__(self,\n pretrained=None,\n patch_size=(4, 4, 4),\n in_chans=3,\n embed_dim=96,\n depths=[2, 2, 6, 2],\n num_heads=[3, 6, 12, 24],\n window_size=(2, 7, 7),\n mlp_ratio=4.,\n qkv_bias=True,\n qk_scale=None,\n drop_rate=0.,\n attn_drop_rate=0.,\n drop_path_rate=0.2,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:605-627" + }, + "7015": { + "file_id": 514, + "content": "This code defines the initialization parameters for the SWIN Transformer model in PaddleVideo. Parameters include pretrained weights, patch size, input channels, embedding dimension, depths of each stage, number of heads per stage, window size, MLP ratio, qkv_bias, qk scale, drop rate, attn drop rate, and stochastic depth rate. The normalization layer and whether to freeze any stages can also be specified during initialization.", + "type": "comment" + }, + "7016": { + "file_id": 514, + "content": " norm_layer=nn.LayerNorm,\n patch_norm=False,\n frozen_stages=-1,\n use_checkpoint=False):\n super().__init__()\n self.pretrained = pretrained\n self.num_layers = len(depths)\n self.embed_dim = embed_dim\n self.patch_norm = patch_norm\n self.frozen_stages = frozen_stages\n self.window_size = window_size\n self.patch_size = patch_size\n # split image into non-overlapping patches\n self.patch_embed = PatchEmbed3D(\n patch_size=patch_size,\n in_chans=in_chans,\n embed_dim=embed_dim,\n norm_layer=norm_layer if self.patch_norm else None)\n self.pos_drop = nn.Dropout(p=drop_rate)\n # stochastic depth\n dpr = [\n x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths))\n ] # stochastic depth decay rule\n # build layers\n self.layers = nn.LayerList()\n for i_layer in range(self.num_layers):\n layer = BasicLayer(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:628-659" + }, + "7017": { + "file_id": 514, + "content": "The code initializes a Swin Transformer model with specified parameters, including depths, embed dimension, patch size, window size, and input channels. It creates the patch embedding layer and position dropout layer. Stochastic depth is applied using a decay rule. The layers are built using BasicLayer instances for each layer in the specified number of layers.", + "type": "comment" + }, + "7018": { + "file_id": 514, + "content": " dim=int(embed_dim * 2**i_layer),\n depth=depths[i_layer],\n num_heads=num_heads[i_layer],\n window_size=window_size,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],\n norm_layer=norm_layer,\n downsample=PatchMerging\n if i_layer < self.num_layers - 1 else None,\n use_checkpoint=use_checkpoint)\n self.layers.append(layer)\n self.num_features = int(embed_dim * 2**(self.num_layers - 1))\n # add a norm layer for each output\n self.norm = norm_layer(self.num_features)\n self._freeze_stages()\n def _freeze_stages(self):\n if self.frozen_stages >= 0:\n self.patch_embed.eval()\n for param in self.patch_embed.parameters():\n param.stop_gradient = True", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:660-687" + }, + "7019": { + "file_id": 514, + "content": "This code initializes a Swin Transformer backbone with specified parameters and adds a norm layer for each output. 
It also includes a function to freeze certain stages of the model if desired.", + "type": "comment" + }, + "7020": { + "file_id": 514, + "content": " if self.frozen_stages >= 1:\n self.pos_drop.eval()\n for i in range(0, self.frozen_stages):\n m = self.layers[i]\n m.eval()\n for param in m.parameters():\n param.stop_gradient = True\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight, std=0.02)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):\n zeros_(m.bias)\n ones_(m.weight)\n def init_weights(self):\n \"\"\"Initialize the weights in backbone.\n Args:\n pretrained (str, optional): Path to pre-trained weights.\n Defaults to None.\n \"\"\"\n \"\"\"First init model's weight\"\"\"\n self.apply(self._init_fn)\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights\n load_ckpt(self, self.pretrained)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:689-720" + }, + "7021": { + "file_id": 514, + "content": "This code is part of a backbone model's initialization. It first applies an initializer function to the layers, then checks if pretrained weights are provided and loads them if available. The frozen_stages variable determines how many stages of the model should be frozen (set to eval mode) during inference.", + "type": "comment" + }, + "7022": { + "file_id": 514, + "content": " elif self.pretrained is None or self.pretrained.strip() == \"\":\n pass\n else:\n raise NotImplementedError\n def forward(self, x):\n \"\"\"Forward function.\"\"\"\n x = self.patch_embed(x)\n x = self.pos_drop(x)\n for layer in self.layers:\n x = layer(x)\n x = x.transpose([0, 2, 3, 4, 1])\n x = self.norm(x)\n x = x.transpose([0, 4, 1, 2, 3])\n return x\n def train(self, mode=True):\n \"\"\"Convert the model into training mode while keep layers freezed.\"\"\"\n super(SwinTransformer3D, self).train(mode)\n self._freeze_stages()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/swin_transformer.py:721-742" + }, + "7023": { + "file_id": 514, + "content": "If pretrained is None or empty, do nothing. Else, raise NotImplementedError. Forward function processes input through patch embedding and positional dropout, iterates over layers, transposes dimensions, normalizes, and returns output. Train mode keeps layers unfrozen by calling the superclass method and freezing stages.", + "type": "comment" + }, + "7024": { + "file_id": 515, + "content": "/paddlevideo/modeling/backbones/toshift_vit.py", + "type": "filepath" + }, + "7025": { + "file_id": 515, + "content": "The code defines a VisionTransformer class, ToShiftVIT model and TokenShiftVisionTransformer for image processing with attention blocks, positional embeddings, dropout and normalization layers. It also supports pretrained checkpoints.", + "type": "summary" + }, + "7026": { + "file_id": 515, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Callable\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\n__all__ = ['VisionTransformer']\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\ndef to_2tuple(x):\n return tuple([x] * 2)\ndef drop_path(x, drop_prob=0., training=False):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:1-37" + }, + "7027": { + "file_id": 515, + "content": "The code defines a class for VisionTransformer backbones and imports necessary libraries. It includes functions like `to_2tuple` and `drop_path` for processing input data and implementing drop path operation, respectively. The code also handles initialization of zero and one constants.", + "type": "comment" + }, + "7028": { + "file_id": 515, + "content": " \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)\n random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Identity(nn.Layer):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:38-65" + }, + "7029": { + "file_id": 515, + "content": "Code implements Drop Paths (Stochastic Depth) for residual blocks. The function applies dropout probabilistically, and the class `DropPath` handles it during forward pass. 
`Identity` class serves as an identity mapping.", + "type": "comment" + }, + "7030": { + "file_id": 515, + "content": " def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\nclass Mlp(nn.Layer):\n def __init__(self,\n in_features,\n hidden_features=None,\n out_features=None,\n act_layer=nn.GELU,\n drop=0.0):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)\n self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\nclass Attention(nn.Layer):\n def __init__(self,\n dim,\n num_heads=8,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.0,\n proj_drop=0.0):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:66-104" + }, + "7031": { + "file_id": 515, + "content": "This code defines three classes: Identity, Mlp, and Attention. Identity is a simple class that returns its input unchanged. Mlp stands for Multi-Layer Perceptron and defines a feedforward neural network layer with optional hidden layers. The Attention class implements a self-attention mechanism commonly used in transformer models. It initializes the necessary parameters and applies dropout to the input and output of the attention operation.", + "type": "comment" + }, + "7032": { + "file_id": 515, + "content": " super().__init__()\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n self.attn_drop = nn.Dropout(attn_drop)\n def forward(self, x):\n N, C = x.shape[1:]\n qkv = self.qkv(x).reshape(\n (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(\n (2, 0, 3, 1, 4))\n q, k, v = qkv[0], qkv[1], qkv[2]\n attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n attn = nn.functional.softmax(attn, axis=-1)\n attn = self.attn_drop(attn)\n x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n dim,\n num_heads,\n mlp_ratio=4.0,\n qkv_bias=False,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:105-138" + }, + "7033": { + "file_id": 515, + "content": "The code defines a class representing a self-attention module, with parameters like dimension (dim), number of heads (num_heads), and optional bias for the QKV linear layer (qkv_bias). 
The class initializes these attributes and defines its forward function to compute attention scores and output.", + "type": "comment" + }, + "7034": { + "file_id": 515, + "content": " qk_scale=None,\n drop=0.0,\n attn_drop=0.0,\n drop_path=0.1,\n act_layer=nn.GELU,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_segments = 8,\n fold_div = 4):\n #attention_type='divided_space_time',\n super().__init__()\n self.n_seg = num_segments #ckk\n self.foldP_div = fold_div #ckk\n #self.attention_type = attention_type\n if isinstance(norm_layer, str):\n self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:139-164" + }, + "7035": { + "file_id": 515, + "content": "The code above initializes an object with multiple parameters such as num_segments, fold_div, norm_layer, and attention_type. It also creates a norm1 layer based on the type of norm_layer provided (either a string or a Callable). If norm_layer is a string, it uses eval() to call the specified class, otherwise if it's a Callable, it directly initializes the layer with that function.", + "type": "comment" + }, + "7036": { + "file_id": 515, + "content": " attn_drop=attn_drop,\n proj_drop=drop)\n # Temporal Attention Parameters\n '''\n if self.attention_type == 'divided_space_time':\n if isinstance(norm_layer, str):\n self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.temporal_attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop)\n self.temporal_fc = nn.Linear(dim, dim)\n '''\n # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:165-186" + }, + "7037": { + "file_id": 515, + "content": "This code initializes the Temporal Attention parameters for the model. If the attention_type is 'divided_space_time', it creates a temporal normalization layer and an attention layer, as well as a fully connected layer for the temporal branch of the model. Drop paths are used for stochastic depth to reduce overfitting.", + "type": "comment" + }, + "7038": { + "file_id": 515, + "content": " self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n if isinstance(norm_layer, str):\n self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm2 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop)\n # token_shift\n def shuift_tk(self, x):\n t = self.n_seg\n bt, n, c = x.shape\n b = bt // t\n x = x.reshape([b, t, n, c]) #B T N C\n fold = c // self.foldP_div\n out = paddle.zeros_like(x)\n out.stop_gradient = True\n # print(\"#### fold \", fold)\n # print(out.shape)\n # print(x[:, 1:, 0, :fold].unsqueeze(2).shape)\n # print(out[:, :-1, 0:1, :fold].shape)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:187-213" + }, + "7039": { + "file_id": 515, + "content": "This code initializes a ToShift ViT model. It creates a drop path layer, normalization layer, and MLP based on the given parameters. The `shuift_tk` function performs token shifting by reshaping the input, creating a mask with stop gradient attribute, and element-wise adding it to the original input. This helps in improving the model's performance by reducing the effect of irrelevant tokens during training.", + "type": "comment" + }, + "7040": { + "file_id": 515, + "content": " # exit(0)\n out[:, :-1, 0, :fold] = x[:, 1:, 0, :fold] # shift left\n out[:, 1:, 0, fold:2*fold] = x[:,:-1:, 0, fold:2*fold]\n out[:, :, 1:, :2*fold] = x[:, :, 1:, :2*fold]\n out[:, :, :, 2*fold:] = x[:, :, :, 2*fold:]\n return out.reshape([bt, n, c])\n def forward(self, x):\n x = self.shuift_tk(x)\n x = x + self.drop_path(self.attn(self.norm1(x)))\n x = self.shuift_tk(x)\n x = x + self.drop_path(self.mlp(self.norm2(x)))\n return x\nclass PatchEmbed(nn.Layer):\n \"\"\" Image to Patch Embedding\n \"\"\"\n def __init__(self,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768):\n super().__init__()\n img_size = to_2tuple(img_size)\n patch_size = to_2tuple(patch_size)\n num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //\n patch_size[0])\n self.img_size = img_size\n self.patch_size = patch_size", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:214-245" + }, + "7041": { + "file_id": 515, + "content": "This code defines a \"ToshiftVIT\" class, which appears to be a custom backbone for a Vision Transformer model. It includes a forward function that applies shift and drop path operations on the input, as well as a PatchEmbed class for image-to-patch embedding. 
The ToshiftVIT class also has an unknown \"shuift_tk\" function that seems to be used in the forward pass.", + "type": "comment" + }, + "7042": { + "file_id": 515, + "content": " self.num_patches = num_patches\n self.proj = nn.Conv2D(in_channels,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n def forward(self, x):\n B, C, T, H, W = x.shape\n assert H == self.img_size[0] and W == self.img_size[1], \\\n f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n x = x.transpose((0, 2, 1, 3, 4))\n x = x.reshape([-1, C, H, W])\n x = self.proj(x)\n W = x.shape[-1]\n x = x.flatten(2).transpose((0, 2, 1))\n return x, T, W\n@BACKBONES.register()\nclass TokenShiftVisionTransformer(nn.Layer):\n \"\"\" Vision Transformer with support for patch input\n \"\"\"\n def __init__(self,\n pretrained=None,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768,\n depth=12,\n num_heads=12,\n mlp_ratio=4,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:246-278" + }, + "7043": { + "file_id": 515, + "content": "This code defines a TokenShiftVisionTransformer class, which is a type of Vision Transformer model that supports patch input. The class has an initialization function where it sets the number of patches and creates a projection layer. It also includes a forward function for processing input data through the model. The assert statement ensures the input image size matches the expected model dimensions. The @BACKBONES.register() decorator registers the model with other backbones in the codebase.", + "type": "comment" + }, + "7044": { + "file_id": 515, + "content": " qkv_bias=False,\n qk_scale=None,\n drop_rate=0,\n attn_drop_rate=0.,\n drop_path_rate=0.1,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_seg=8,\n attention_type='divided_space_time',\n **args):\n super().__init__()\n self.pretrained = pretrained\n self.num_seg = num_seg\n self.attention_type = attention_type\n self.num_features = self.embed_dim = embed_dim\n self.patch_embed = PatchEmbed(img_size=img_size,\n patch_size=patch_size,\n in_channels=in_channels,\n embed_dim=embed_dim)\n num_patches = self.patch_embed.num_patches\n # Positional Embeddings\n self.cls_token = self.create_parameter(shape=(1, 1, embed_dim),\n default_initializer=zeros_)\n self.pos_embed = self.create_parameter(shape=(1, num_patches + 1,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:279-305" + }, + "7045": { + "file_id": 515, + "content": "This code is initializing a class for the Toshift ViT backbone. It sets parameters such as pretrained, num_seg, attention_type, embed_dim, and others. It creates PatchEmbed and positional embeddings (cls_token and pos_embed). 
The code also calculates the number of patches.", + "type": "comment" + }, + "7046": { + "file_id": 515, + "content": " embed_dim),\n default_initializer=zeros_)\n self.pos_drop = nn.Dropout(p=drop_rate)\n if self.attention_type != 'space_only':\n self.time_embed = self.create_parameter(shape=(1, num_seg,\n embed_dim),\n default_initializer=zeros_)\n self.time_drop = nn.Dropout(p=drop_rate)\n self.add_parameter(\"pos_embed\", self.pos_embed)\n self.add_parameter(\"cls_token\", self.cls_token)\n dpr = np.linspace(0, drop_path_rate, depth)\n self.blocks = nn.LayerList([\n Block(dim=embed_dim,\n num_heads=num_heads,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[i],\n norm_layer=norm_layer,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:306-330" + }, + "7047": { + "file_id": 515, + "content": "This code initializes and sets up the parameters for a Transformer-based backbone model. It creates positional embeddings, dropout layers, and a list of transformer blocks with specified dimensions, numbers of heads, ratios, biases, scale factors, drop rates, attn drop rates, and drop path rates. These parameters are used to build the network architecture for processing data in downstream tasks.", + "type": "comment" + }, + "7048": { + "file_id": 515, + "content": " epsilon=epsilon,\n num_segments= self.num_seg\n ) for i in range(depth)\n #attention_type=self.attention_type\n ])\n self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n trunc_normal_(self.pos_embed, std=0.02)\n trunc_normal_(self.cls_token, std=0.02)\n self.apply(self._init_fn)\n if self.attention_type == 'divided_space_time':\n i = 0\n for m in self.blocks.sublayers(include_self=True):\n m_str = str(m)\n if 'Block' in m_str:\n if i > 0:\n zeros_(m.temporal_fc.weight)\n zeros_(m.temporal_fc.bias)\n i += 1\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights\n load_ckpt(self,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:331-360" + }, + "7049": { + "file_id": 515, + "content": "The code initializes a Toshift_VIT model with the specified number of segments and depth. It sets the attention type to 'divided_space_time' for certain blocks. The model's weights are then initialized using truncated normal distribution and the provided function, and any temporal FC layers in the respective block are set to zero. 
If a pretrained checkpoint is provided, it will be loaded.", + "type": "comment" + }, + "7050": { + "file_id": 515, + "content": " self.pretrained,\n num_patches=self.patch_embed.num_patches,\n num_seg=self.num_seg,\n attention_type=self.attention_type)\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):\n ones_(m.weight)\n zeros_(m.bias)\n def forward_features(self, x):\n # B = x.shape[0]\n B = paddle.shape(x)[0]\n x, T, W = self.patch_embed(x) # [BT,nH*nW,F]\n cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]\n x = paddle.concat((cls_tokens, x), axis=1)\n pos_interp = (x.shape[1] != self.pos_embed.shape[1])\n if pos_interp:\n pos_embed = self.pos_embed\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(\n (0, 2, 1))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:361-386" + }, + "7051": { + "file_id": 515, + "content": "The code initializes a TOShiftViT model, defines an initialization function for the layers, and defines a forward_features function to process input feature maps. The function takes the number of patches from the patch embedding layer, expands the class token, concatenates it with the features, and applies positional embeddings if needed.", + "type": "comment" + }, + "7052": { + "file_id": 515, + "content": " P = int(other_pos_embed.shape[2]**0.5)\n H = x.shape[1] // W\n other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(H, W),\n mode='nearest')\n new_pos_embed = new_pos_embed.flatten(2)\n new_pos_embed = new_pos_embed.transpose((0, 2, 1))\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),\n axis=1)\n x = x + new_pos_embed\n else:\n x = x + self.pos_embed\n x = self.pos_drop(x)\n # Attention blocks\n for blk in self.blocks:\n x = blk(x)\n x = self.norm(x)\n return x[:, 0] # [B, embed_dim] -> [B*T, embed_dim]\n def forward(self, x):\n x = self.forward_features(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/toshift_vit.py:387-413" + }, + "7053": { + "file_id": 515, + "content": "This function reshapes and interpolates a positional embedding, adding it to the input if specified. It then performs positional dropout before passing through attention blocks and normalization layers, finally returning the forward pass of features.", + "type": "comment" + }, + "7054": { + "file_id": 516, + "content": "/paddlevideo/modeling/backbones/transnetv2.py", + "type": "filepath" + }, + "7055": { + "file_id": 516, + "content": "OctConv3D is a configurable 3D convolutional layer in TransNetV2's backbone, utilizing features such as max pooling and SDDCNNV2 blocks for shot transition detection. ConvNextV2 applies feature extraction and pooling, while the code defines models using Linear and ConvexCombinationRegularization layers for classification tasks.", + "type": "summary" + }, + "7056": { + "file_id": 516, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as functional\nimport random\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\nclass OctConv3D(nn.Layer):\n def __init__(self, in_filters, filters, kernel_size=3, dilation_rate=(1, 1, 1), alpha=0.25,\n use_bias=True, kernel_initializer=nn.initializer.KaimingNormal()):\n super(OctConv3D, self).__init__()", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:1-28" + }, + "7057": { + "file_id": 516, + "content": "This code defines a 3D convolutional neural network layer called OctConv3D. It takes input and output channels, kernel size, dilation rate, alpha (for octave pooling), use_bias flag, and initializer as parameters for creating the layer. This layer can be used in other models by utilizing the BACKBONES registry.", + "type": "comment" + }, + "7058": { + "file_id": 516, + "content": " self.low_channels = int(filters * alpha)\n self.high_channels = filters - self.low_channels\n self.high_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),\n weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.high_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),\n weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=False)\n self.low_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:30-43" + }, + "7059": { + "file_id": 516, + "content": "Defines a 3D Convolutional network with interleaved low and high-resolution paths. 
Low-to-high and high-to-low convolutions are performed to maintain spatial resolution while reducing dimensionality for the TransNetV2 backbone model.", + "type": "comment" + }, + "7060": { + "file_id": 516, + "content": " weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=False)\n self.low_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),\n weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.upsampler = nn.Upsample(size=(1, 2, 2), data_format='NCDHW')\n self.downsampler = nn.AvgPool3D(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0, 1, 1))\n @staticmethod\n def pad_to(tensor, target_shape):\n shape = tensor.shape\n padding = [[0, tar - curr] for curr, tar in zip(shape, target_shape)]\n return functional.pad(tensor, padding, \"CONSTANT\", data_format='NCDHW')", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:44-58" + }, + "7061": { + "file_id": 516, + "content": "This code defines a TransNetV2 backbone model for video analysis. It includes convolutional layers, an upsampler, and downsampler to process input data. The `pad_to` function pads the tensor with zeros to match a target shape, useful for maintaining consistent dimensions throughout the model.", + "type": "comment" + }, + "7062": { + "file_id": 516, + "content": " @staticmethod\n def crop_to(tensor, target_width, target_height):\n return tensor[:, :, :target_height, :target_width]\n def forward(self, inputs):\n low_inputs, high_inputs = inputs\n high_to_high = self.high_to_high(high_inputs)\n high_to_low = self.high_to_low(self.downsampler(high_inputs))\n low_to_high = self.upsampler(self.low_to_high(low_inputs))\n low_to_low = self.low_to_low(low_inputs)\n high_output = high_to_high[:, :, :, :low_to_high.shape[3], :low_to_high.shape[4]] + low_to_high\n low_output = low_to_low + high_to_low[:, :, :, :low_to_low.shape[3], :low_to_low.shape[4]]\n return low_output, high_output\nclass Conv3DConfigurable(nn.Layer):\n def __init__(self,\n in_filters,\n filters,\n dilation_rate,\n separable=True,\n octave=False,\n use_bias=True):\n super(Conv3DConfigurable, self).__init__()\n assert not (separable and octave)\n if separable:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:60-90" + }, + "7063": { + "file_id": 516, + "content": "The code defines a forward function that takes inputs and performs high-to-high, high-to-low, low-to-high, and low-to-low transformations. It also includes a Conv3DConfigurable class with parameters for in_filters, filters, dilation_rate, separable, octave, and use_bias. 
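The alpha-based channel split summarized above is the core of octave-style convolution: a fraction of the output channels is computed at half spatial resolution (the "low-frequency" path) and the rest at full resolution, with cross-resolution paths merged by up- and down-sampling. A minimal sketch of the channel bookkeeping and the four-path merge, with illustrative names only (the merge formula is simplified and omits the cropping the layer performs):

```python
# Illustrative sketch of the octave channel split; `filters` and `alpha`
# mirror the constructor arguments shown above, everything else is hypothetical.

def octave_split(filters: int, alpha: float):
    """Split output channels into a low-frequency (half-resolution) group
    and a high-frequency (full-resolution) group."""
    low_channels = int(filters * alpha)
    high_channels = filters - low_channels
    return low_channels, high_channels

low, high = octave_split(filters=64, alpha=0.25)
print(low, high)  # 16 48 -> 16 half-resolution channels, 48 full-resolution channels

# The forward pass then combines four paths:
#   high_output = high_to_high(x_high)      + upsample(low_to_high(x_low))
#   low_output  = low_to_low(x_low)         + high_to_low(downsample(x_high))
```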
The code asserts that separable and octave cannot both be True.", + "type": "comment" + }, + "7064": { + "file_id": 516, + "content": " conv1 = nn.Conv3D(in_filters, 2 * filters, kernel_size=(1, 3, 3),\n dilation=(1, 1, 1), padding=(0, 1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),\n bias_attr=False)\n conv2 = nn.Conv3D(2 * filters, filters, kernel_size=(3, 1, 1),\n dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 0, 0),\n weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.layers = nn.LayerList([conv1, conv2])\n elif octave:\n conv = OctConv3D(in_filters, filters, kernel_size=3, dilation_rate=(dilation_rate, 1, 1),\n use_bias=use_bias,\n kernel_initializer=nn.initializer.KaimingNormal())", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:91-104" + }, + "7065": { + "file_id": 516, + "content": "The code initializes a Conv3D layer and an optional octave convolution layer for the TransNetV2 backbone. The Conv3D layers apply 3x3 kernel with varying dilation rates, while the optional OctConv3D layer has a 3x1x1 kernel and dilation rate (dilation_rate, 1, 1). The layers are added to a LayerList for further processing in the network.", + "type": "comment" + }, + "7066": { + "file_id": 516, + "content": " self.layers = [conv]\n else:\n conv = nn.Conv3D(in_filters, filters, kernel_size=3,\n dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.layers = nn.LayerList([conv])\n def forward(self, inputs):\n x = inputs\n for layer in self.layers:\n x = layer(x)\n return x\nclass DilatedDCNNV2(nn.Layer):\n def __init__(self,\n in_filters,\n filters,\n batch_norm=True,\n activation=None,\n octave_conv=False):\n super(DilatedDCNNV2, self).__init__()\n assert not (octave_conv and batch_norm)\n self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=not batch_norm, octave=octave_conv)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:105-131" + }, + "7067": { + "file_id": 516, + "content": "This code defines a neural network backbone called TransnetV2, which consists of convolutional layers. The Conv3DConfigurable class is used to configure the layers with specified input and output filters, kernel size, dilation rate, padding, and whether to use bias or batch normalization. The DilatedDCNNV2 class extends this concept by allowing the choice between octave convolution and batch normalization. 
Both classes inherit from nn.Layer and have a forward method for processing inputs.", + "type": "comment" + }, + "7068": { + "file_id": 516, + "content": " self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=not batch_norm, octave=octave_conv)\n self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=not batch_norm, octave=octave_conv)\n self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=not batch_norm, octave=octave_conv)\n self.octave = octave_conv\n self.bn = nn.BatchNorm3D(filters * 4, momentum=0.99, epsilon=1e-03,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n ) if batch_norm else None\n self.activation = activation\n def forward(self, inputs):\n conv1 = self.Conv3D_1(inputs)\n conv2 = self.Conv3D_2(inputs)\n conv3 = self.Conv3D_4(inputs)\n conv4 = self.Conv3D_8(inputs)\n # shape of convi[j]/convi is [B, 3, T, H, W], concat in channel dimension\n if self.octave:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:132-150" + }, + "7069": { + "file_id": 516, + "content": "This code defines a TransNetV2 model, which uses multiple Conv3D layers to process input data. The model includes configurable convolution layers with different filter sizes (2, 4, and 8), batch normalization, and activation functions. The forward method applies these layers to the inputs and concatenates their outputs along the channel dimension.", + "type": "comment" + }, + "7070": { + "file_id": 516, + "content": " x = [paddle.concat([conv1[0], conv2[0], conv3[0], conv4[0]], axis=1),\n paddle.concat([conv1[1], conv2[1], conv3[1], conv4[1]], axis=1)]\n else:\n x = paddle.concat([conv1, conv2, conv3, conv4], axis=1)\n if self.bn is not None:\n x = self.bn(x)\n if self.activation is not None:\n if self.octave:\n x = [self.activation(x[0]), self.activation(x[1])]\n else:\n x = self.activation(x)\n return x\nclass StackedDDCNNV2(nn.Layer):\n def __init__(self,\n in_filters,\n n_blocks,\n filters,\n shortcut=True,\n use_octave_conv=False,\n pool_type=\"avg\",\n stochastic_depth_drop_prob=0.0):\n super(StackedDDCNNV2, self).__init__()\n assert pool_type == \"max\" or pool_type == \"avg\"\n if use_octave_conv and pool_type == \"max\":\n print(\"WARN: Octave convolution was designed with average pooling, not max pooling.\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:151-179" + }, + "7071": { + "file_id": 516, + "content": "The code defines a StackedDDCNNV2 class that is a type of neural network layer. It takes in parameters such as number of input filters, number of blocks, and output filters. The class uses convolutions with optional batch normalization and activation functions. The convolutions can be either octave or non-octave depending on the parameter setting. 
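The dilated block described above runs several temporal branches in parallel that are identical except for their dilation rate (1, 2, 4 and 8 in this file) and concatenates them along the channel axis, so later layers see several temporal receptive fields at once. A rough paddle sketch of that pattern with made-up sizes (not the repository's exact class):

```python
import paddle
import paddle.nn as nn

class ParallelDilatedConv3D(nn.Layer):
    """Toy version of the multi-dilation pattern: same kernel, different
    temporal dilation per branch, outputs concatenated on channels."""

    def __init__(self, in_channels=3, filters=16, dilations=(1, 2, 4, 8)):
        super().__init__()
        self.branches = nn.LayerList([
            nn.Conv3D(in_channels, filters, kernel_size=3,
                      dilation=(d, 1, 1), padding=(d, 1, 1))
            for d in dilations
        ])

    def forward(self, x):
        # Each branch preserves T/H/W because padding matches the dilation.
        return paddle.concat([branch(x) for branch in self.branches], axis=1)

x = paddle.randn([2, 3, 8, 27, 48])   # [B, C, T, H, W]
y = ParallelDilatedConv3D()(x)
print(y.shape)                        # [2, 64, 8, 27, 48]
```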
The pooling type is either max or average pooling, and there is a stochastic depth drop probability parameter.", + "type": "comment" + }, + "7072": { + "file_id": 516, + "content": " self.shortcut = shortcut\n self.DDCNN = nn.LayerList([\n DilatedDCNNV2(in_filters if i == 1 else filters * 4, filters, octave_conv=use_octave_conv,\n activation=functional.relu if i != n_blocks else None) for i in range(1, n_blocks + 1)\n ])\n self.pool = nn.MaxPool3D(kernel_size=(1, 2, 2)) if pool_type == \"max\" else nn.AvgPool3D(kernel_size=(1, 2, 2))\n self.octave = use_octave_conv\n self.stochastic_depth_drop_prob = stochastic_depth_drop_prob\n def forward(self, inputs):\n x = inputs\n shortcut = None\n if self.octave:\n x = [self.pool(x), x]\n for block in self.DDCNN:\n x = block(x)\n if shortcut is None:\n shortcut = x\n # shape of x[i] is [B, 3, T, H, W], concat in channel dimension\n if self.octave:\n x = paddle.concat([x[0], self.pool(x[1])], axis=1)\n x = functional.relu(x)\n if self.shortcut is not None:\n if self.stochastic_depth_drop_prob != 0.:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:181-207" + }, + "7073": { + "file_id": 516, + "content": "Initializes backbone layers and sets parameters. Applies octave convolution if use_octave_conv is True, and performs max or avg pooling depending on pool_type. Stochastic depth is applied with probability stochastic_depth_drop_prob. Forward pass applies blocks of DDCNNV2, concatenates and applies ReLU activation.", + "type": "comment" + }, + "7074": { + "file_id": 516, + "content": " if self.training:\n if random.random() < self.stochastic_depth_drop_prob:\n x = shortcut\n else:\n x = x + shortcut\n else:\n x = (1 - self.stochastic_depth_drop_prob) * x + shortcut\n else:\n x += shortcut\n if not self.octave:\n x = self.pool(x)\n return x\nclass ResNetBlock(nn.Layer):\n def __init__(self, in_filters, filters, strides=(1, 1)):\n super(ResNetBlock, self).__init__()\n self.conv1 = nn.Conv2D(in_filters, filters, kernel_size=(3, 3), stride=strides, padding=(1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.bn1 = nn.BatchNorm2D(filters,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:208-232" + }, + "7075": { + "file_id": 516, + "content": "This code defines a ResNetBlock class that consists of Conv2D layer and BatchNorm2D layer. 
The stochastic depth is applied during training by randomly dropping connections with a specified probability, while in non-octave cases, it applies pooling to the output.", + "type": "comment" + }, + "7076": { + "file_id": 516, + "content": " self.conv2 = nn.Conv2D(filters, filters, kernel_size=(3, 3), padding=(1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.bn2 = nn.BatchNorm2D(filters,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))\n def forward(self, inputs):\n x = self.conv1(inputs)\n x = self.bn1(x)\n x = functional.relu(x)\n x = self.conv2(x)\n x = self.bn2(x)\n shortcut = inputs\n x += shortcut\n return functional.relu(x)\nclass ResNetFeatures(nn.Layer):\n def __init__(self, in_filters=3,\n mean=[0.485, 0.456, 0.406],\n std=[0.229, 0.224, 0.225]):\n super(ResNetFeatures, self).__init__()\n self.conv1 = nn.Conv2D(in_channels=in_filters, out_channels=64, kernel_size=(7, 7),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:234-260" + }, + "7077": { + "file_id": 516, + "content": "The code defines a Conv2D layer and BatchNorm2D layer in the `TransNetV2` class, followed by a forward function that applies these layers in sequence. The ResNetFeatures class initializes a Conv2D layer for extracting features from input images. Both classes are part of an object-oriented model architecture.", + "type": "comment" + }, + "7078": { + "file_id": 516, + "content": " stride=(2, 2), padding=(3, 3),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.bn1 = nn.BatchNorm2D(num_features=64, momentum=0.99, epsilon=1e-03,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n )\n self.max_pool = nn.MaxPool2D(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n self.layer2a = ResNetBlock(64, 64)\n self.layer2b = ResNetBlock(64, 64)\n self.mean = paddle.to_tensor(mean)\n self.std = paddle.to_tensor(std)\n def forward(self, inputs):\n shape = inputs.shape\n x = paddle.reshape(inputs, [shape[0] * shape[2], shape[1], shape[3], shape[4]])\n x = (x - self.mean) / self.std\n x = self.conv1(x)\n x = self.bn1(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:261-282" + }, + "7079": { + "file_id": 516, + "content": "This code is for TransNetV2 backbone model initialization. It includes a convolution layer with padding, batch normalization, max pooling, and ResNetBlocks (layer2a, layer2b). 
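The block-level stochastic depth mentioned just above acts on the residual sum: during training the whole branch output is occasionally replaced by the shortcut alone, and at inference the branch is down-weighted by the survival probability instead. A hedged, self-contained sketch of that rule (not the repository's exact code path):

```python
import random
import paddle

def residual_with_stochastic_depth(branch_out, shortcut, drop_prob, training):
    """Combine a residual branch with its shortcut under stochastic depth.

    Training: with probability `drop_prob` keep only the shortcut.
    Inference: scale the branch by (1 - drop_prob) to match expectations.
    """
    if drop_prob == 0.0:
        return branch_out + shortcut
    if training:
        if random.random() < drop_prob:
            return shortcut
        return branch_out + shortcut
    return (1.0 - drop_prob) * branch_out + shortcut

x = paddle.randn([2, 64, 8, 14, 24])
out = residual_with_stochastic_depth(x, x, drop_prob=0.2, training=False)
```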
The forward function performs normalization, reshaping, convolution, and batch normalization on the input.", + "type": "comment" + }, + "7080": { + "file_id": 516, + "content": " x = functional.relu(x)\n x = self.max_pool(x)\n x = self.layer2a(x)\n x = self.layer2b(x)\n new_shape = x.shape\n x = paddle.reshape(x, [shape[0], new_shape[1], shape[2], new_shape[2], new_shape[3]])\n return x\nclass FrameSimilarity(nn.Layer):\n def __init__(self,\n in_filters,\n similarity_dim=128,\n lookup_window=101,\n output_dim=128,\n stop_gradient=False,\n use_bias=False):\n super(FrameSimilarity, self).__init__()\n self.projection = nn.Linear(in_filters, similarity_dim,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=use_bias)\n self.fc = nn.Linear(lookup_window, output_dim,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:283-307" + }, + "7081": { + "file_id": 516, + "content": "This code defines a class \"FrameSimilarity\" that takes in filters, similarity dimension, lookup window, output dimension, stop_gradient flag, and use_bias as parameters. It initializes the layer with a projection linear layer and an fc linear layer. The projection layer maps input features to a specified similarity dimension using XavierUniform initialization. The fc layer maps the lookup window to the output dimension, using XavierUniform initialization for weights and Constant initialization for biases.", + "type": "comment" + }, + "7082": { + "file_id": 516, + "content": " self.lookup_window = lookup_window\n self.stop_gradient = stop_gradient\n assert lookup_window % 2 == 1, \"`lookup_window` must be odd integer\"\n def forward(self, inputs):\n x = paddle.concat([paddle.mean(x, axis=[3, 4]) for x in inputs], axis=1)\n x = paddle.transpose(x, (0, 2, 1))\n if self.stop_gradient:\n x = x.stop_gradient\n x = self.projection(x)\n x = functional.normalize(x, p=2, axis=2)\n batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0]\n time_window = x.shape[1]\n similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window]\n similarities_padded = functional.pad(similarities,\n [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2],\n data_format='NCL')\n batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:309-330" + }, + "7083": { + "file_id": 516, + "content": "The code initializes a TransNetV2 model with lookup window and stop_gradient options. It then calculates similarities between time windows using batch mean, transpose, projection, and normalization. 
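The similarity step summarized above boils down to: pool each frame's feature map spatially, project it to a small embedding, L2-normalize, and take pairwise dot products so entry (i, j) is the cosine similarity between frames i and j. A compact paddle sketch of just that core, with window padding and gathering omitted and illustrative sizes:

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

B, T, C, D = 2, 16, 256, 128           # batch, frames, channels, embedding dim
features = paddle.randn([B, T, C])     # spatially pooled per-frame features

proj = nn.Linear(C, D)
emb = F.normalize(proj(features), p=2, axis=2)   # unit-length embeddings

# [B, T, T]: cosine similarity between every pair of frames in a clip
similarities = paddle.bmm(emb, emb.transpose([0, 2, 1]))
print(similarities.shape)              # [2, 16, 16]
```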
Finally, it pads the similarities for further calculations.", + "type": "comment" + }, + "7084": { + "file_id": 516, + "content": " batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window])\n time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1])\n time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window])\n lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window])\n lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices\n indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1)\n similarities = paddle.gather_nd(similarities_padded, indices)\n return functional.relu(self.fc(similarities))\nclass ConvexCombinationRegularization(nn.Layer):\n def __init__(self, in_filters, filters=32, delta_scale=10., loss_weight=0.01):\n super(ConvexCombinationRegularization, self).__init__()\n self.projection = nn.Conv3D(in_filters, filters, kernel_size=1, dilation=1, padding=(0, 0, 0),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:331-346" + }, + "7085": { + "file_id": 516, + "content": "This code is calculating the indices for gathering similarities from a padded tensor. It tiles and stacks batch, time, and lookup indices to create an array of valid indices. Then it uses these indices to gather similarities from the padded tensor and applies ReLU activation on top of an FC layer to return the output. The ConvexCombinationRegularization class initializes a projection layer with specified parameters.", + "type": "comment" + }, + "7086": { + "file_id": 516, + "content": " bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))\n self.features = nn.Conv3D((filters * 3), filters * 2,\n kernel_size=(3, 3, 3), dilation=1, padding=(1, 1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))\n self.dense = nn.Linear(64, 1, weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), bias_attr=True)\n self.loss = nn.SmoothL1Loss(reduction='none')\n self.delta_scale = delta_scale\n self.loss_weight = loss_weight\n def forward(self, image_inputs, feature_inputs):\n x = feature_inputs\n x = self.projection(x)\n x = functional.relu(x)\n batch_size = x.shape[0]\n window_size = x.shape[2]\n first_frame = paddle.tile(x[:, :, :1], [1, 1, window_size, 1, 1])\n last_frame = paddle.tile(x[:, :, -1:], [1, 1, window_size, 1, 1])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:347-364" + }, + "7087": { + "file_id": 516, + "content": "This code defines a Conv3D model for the TransNetV2 backbone. It has a projection layer, relu activation, and takes in image_inputs and feature_inputs. 
The forward function processes these inputs, extracting the first and last frame windows.", + "type": "comment" + }, + "7088": { + "file_id": 516, + "content": " x = paddle.concat([x, first_frame, last_frame], 1)\n x = self.features(x)\n x = functional.relu(x)\n x = paddle.mean(x, axis=[3, 4])\n x = paddle.transpose(x, (0, 2, 1))\n alpha = self.dense(x)\n alpha = paddle.transpose(alpha, (0, 2, 1))\n first_img = paddle.tile(image_inputs[:, :, :1], [1, 1, window_size, 1, 1])\n last_img = paddle.tile(image_inputs[:, :, -1:], [1, 1, window_size, 1, 1])\n alpha_ = functional.sigmoid(alpha)\n alpha_ = paddle.reshape(alpha_, [batch_size, 1, window_size, 1, 1])\n predictions_ = (alpha_ * first_img + (1 - alpha_) * last_img)\n loss_ = self.loss(label=image_inputs / self.delta_scale, input=predictions_ / self.delta_scale)\n loss_ = self.loss_weight * paddle.mean(loss_)\n return alpha, loss_\nclass ColorHistograms(nn.Layer):\n def __init__(self,\n lookup_window=101,\n output_dim=None):\n super(ColorHistograms, self).__init__()\n self.fc = nn.Linear(lookup_window, output_dim,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:365-390" + }, + "7089": { + "file_id": 516, + "content": "This code is part of the TransnetV2 model in PaddleVideo. It concatenates frames, processes them through layers, and calculates alpha values for first and last images. It then combines these images based on the calculated alphas and performs loss calculation using a loss function. The ColorHistograms layer is initialized with a linear transformation layer.", + "type": "comment" + }, + "7090": { + "file_id": 516, + "content": " weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.))) if output_dim is not None else None\n self.lookup_window = lookup_window\n assert lookup_window % 2 == 1, \"`lookup_window` must be odd integer\"\n def compute_color_histograms(self, frames):\n frames = frames.astype('int32')\n def get_bin(frames):\n # returns 0 .. 511\n R, G, B = frames[:, :, 0], frames[:, :, 1], frames[:, :, 2]\n R, G, B = R // 32, G // 32, B // 32\n return (R * 64) + (G * 8) + B\n batch_size = paddle.slice(frames.shape, starts=[0], ends=[1], axes=[0]) if frames.shape[0] == -1 else frames.shape[0]\n time_window, height, width, no_channels = frames.shape[1:]\n assert no_channels == 3 or no_channels == 6\n if no_channels == 3:\n frames_flatten = frames.reshape([-1, height * width, 3])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:391-411" + }, + "7091": { + "file_id": 516, + "content": "This code defines a function to compute color histograms of frames. It first converts the frame values to int32, then defines a function get_bin which extracts and scales RGB values. 
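The `get_bin` logic above quantizes each RGB pixel into one of 512 bins: every channel is reduced to 3 bits (value // 32) and the bits are packed as R*64 + G*8 + B. A small NumPy sketch of that binning plus the normalized per-frame histogram (illustrative only, not the layer's exact code):

```python
import numpy as np

def rgb_to_bin(frame: np.ndarray) -> np.ndarray:
    """Map RGB pixels to bin ids in [0, 511] (3 bits per channel)."""
    frame = frame.astype(np.int64)      # avoid uint8 overflow when packing bits
    r, g, b = frame[..., 0] // 32, frame[..., 1] // 32, frame[..., 2] // 32
    return r * 64 + g * 8 + b

def frame_histogram(frame: np.ndarray) -> np.ndarray:
    """512-bin colour histogram, L2-normalized as in the layer above."""
    hist = np.bincount(rgb_to_bin(frame).ravel(), minlength=512).astype(np.float32)
    norm = np.linalg.norm(hist)
    return hist / norm if norm > 0 else hist

frame = np.random.randint(0, 256, size=(27, 48, 3), dtype=np.uint8)
print(frame_histogram(frame).shape)     # (512,)
```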
The batch size is extracted from the frames shape, and the frames are flattened into a 3-dimensional array if the number of channels is 3 or 6.", + "type": "comment" + }, + "7092": { + "file_id": 516, + "content": " else:\n frames_flatten = frames.reshape([-1, height * width * 2, 3])\n binned_values = get_bin(frames_flatten)\n frame_bin_prefix = (paddle.arange(0, batch_size * time_window) * 512).reshape([-1, 1])\n binned_values = (binned_values + frame_bin_prefix).reshape([-1, 1])\n histograms = paddle.zeros_like(frame_bin_prefix, dtype='int32').tile([512]).reshape([-1])\n histograms = histograms.scatter_nd_add(binned_values, paddle.ones_like(binned_values, dtype='int32').reshape([-1]))\n histograms = histograms.reshape([batch_size, time_window, 512]).astype('float32')\n histograms_normalized = functional.normalize(histograms, p=2, axis=2)\n return histograms_normalized\n def forward(self, inputs):\n x = self.compute_color_histograms(inputs)\n batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0]\n time_window = x.shape[1]\n similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:412-429" + }, + "7093": { + "file_id": 516, + "content": "This code computes color histograms for each frame in a video and then calculates similarities between frames using batch matrix multiplication. It first checks the input shape to determine whether it should extract only the batch size and time window or use the total shape. It then reshapes and bins the frame values, normalizes the histograms, and finally computes the similarity matrix for each frame pair. The purpose is likely for video sequence analysis or comparison.", + "type": "comment" + }, + "7094": { + "file_id": 516, + "content": " similarities_padded = functional.pad(similarities,\n [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2],\n data_format='NCL')\n batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1])\n batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window])\n time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1])\n time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window])\n lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window])\n lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices\n indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1)\n similarities = paddle.gather_nd(similarities_padded, indices)\n if self.fc is not None:\n return functional.relu(self.fc(similarities))\n return similarities", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:430-446" + }, + "7095": { + "file_id": 516, + "content": "This code performs lookup on a padded tensor using gathered indices from batch, time window, and lookup window. 
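The padded-gather trick described above is how each frame ends up with a fixed-length vector of similarities to its temporal neighbours: the T x T similarity matrix is padded on the last axis by (lookup_window - 1) / 2 on each side, and a window of that width is sliced out around every time step. A NumPy sketch of the same idea, using plain slicing instead of gather_nd (names are illustrative):

```python
import numpy as np

def windowed_similarities(sim: np.ndarray, lookup_window: int) -> np.ndarray:
    """sim: [T, T] similarity matrix for one clip.

    Returns [T, lookup_window]: for each frame t, its similarity to frames
    t-k .. t+k (k = (lookup_window - 1) // 2), zero-padded at the boundaries."""
    assert lookup_window % 2 == 1, "`lookup_window` must be odd"
    k = (lookup_window - 1) // 2
    T = sim.shape[0]
    padded = np.pad(sim, ((0, 0), (k, k)))   # pad only the lookup axis
    return np.stack([padded[t, t:t + lookup_window] for t in range(T)])

sim = np.random.rand(16, 16).astype(np.float32)
print(windowed_similarities(sim, lookup_window=5).shape)   # (16, 5)
```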
It then applies an optional fully connected layer with ReLU activation function if present.", + "type": "comment" + }, + "7096": { + "file_id": 516, + "content": "@BACKBONES.register()\nclass TransNetV2(nn.Layer):\n \"\"\"TransNetV2 model from\n `\"TransNet V2: An effective deep network architecture for fast shot transition detection\" `_\n \"\"\"\n def __init__(self,\n F=16, L=3, S=2, D=1024,\n use_many_hot_targets=True,\n use_frame_similarity=True,\n use_color_histograms=True,\n use_mean_pooling=False,\n dropout_rate=0.5,\n use_convex_comb_reg=False,\n use_resnet_features=False,\n use_resnet_like_top=False,\n frame_similarity_on_last_layer=False,\n mean=[0.485, 0.456, 0.406],\n std=[0.229, 0.224, 0.225]):\n super(TransNetV2, self).__init__()\n self.mean = np.array(mean, np.float32).reshape([1, 3, 1, 1]) * 255\n self.std = np.array(std, np.float32).reshape([1, 3, 1, 1]) * 255\n self.use_resnet_features = use_resnet_features\n s", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:449-473" + }, + "7097": { + "file_id": 516, + "content": "The code defines the TransNetV2 model, a deep network architecture for shot transition detection. It has multiple input sources and various options to use or not use different features and operations. The mean and std are provided as initialization parameters to standardize the input data.", + "type": "comment" + }, + "7098": { + "file_id": 516, + "content": "elf.resnet_layers = ResNetFeatures(in_filters=3, mean=self.mean, std=self.std) if self.use_resnet_features else None\n self.resnet_like_top = use_resnet_like_top\n if self.resnet_like_top:\n self.resnet_like_top_conv = nn.Conv3D(64 if self.use_resnet_features else 3, 32, kernel_size=(3, 7, 7),\n stride=(1, 2, 2),\n padding=(1, 3, 3),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.resnet_like_top_bn = nn.BatchNorm3D(32, momentum=0.99, epsilon=1e-03,\n weight_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:473-484" + }, + "7099": { + "file_id": 516, + "content": "Code snippet is from PaddleVideo's TransNetV2 model. It checks if use_resnet_features is True and if so, initializes resnet_layers with ResNetFeatures. 
If resnet_like_top is also True, it then initializes resnet_like_top_conv and resnet_like_top_bn for ResNet-like top layers with specified parameters.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/71.json b/docs/data/71.json new file mode 100644 index 000000000..cbe809f0f --- /dev/null +++ b/docs/data/71.json @@ -0,0 +1,548 @@ +{ + "7100": { + "file_id": 516, + "content": " self.resnet_like_top_max_pool = nn.MaxPool3D(kernel_size=(1, 3, 3), stride=(1, 2, 2),\n padding=(0, 1, 1))\n if self.resnet_like_top:\n in_filters = 32\n elif self.use_resnet_features:\n in_filters = 64\n else:\n in_filters = 3\n self.SDDCNN = nn.LayerList(\n [StackedDDCNNV2(in_filters=in_filters, n_blocks=S, filters=F,\n stochastic_depth_drop_prob=0.)] +\n [StackedDDCNNV2(in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2 ** i) for i in range(1, L)]\n )\n self.frame_sim_layer = FrameSimilarity(\n sum([(F * 2 ** i) * 4 for i in range(L)]), lookup_window=101, output_dim=128, similarity_dim=128,\n use_bias=True\n ) if use_frame_similarity else None\n self.color_hist_layer = ColorHistograms(\n lookup_window=101, output_dim=128\n ) if use_color_histograms else None\n self.dropout = nn.Dropout(dropout_rate) if dropout_rate is not None else None", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:485-508" + }, + "7101": { + "file_id": 516, + "content": "This code initializes the model components of a TransNetv2 backbone. It sets up max pooling, creates a LayerList for SDDCNNV2 blocks, initializes frame similarity and color histogram layers based on flags, and includes dropout layer if needed.", + "type": "comment" + }, + "7102": { + "file_id": 516, + "content": " output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6 # 3x6 for spatial dimensions\n if use_frame_similarity: output_dim += 128\n if use_color_histograms: output_dim += 128\n self.use_mean_pooling = use_mean_pooling\n self.has_downsample = False\n if self.use_resnet_features or self.resnet_like_top or self.use_mean_pooling:\n self.has_downsample = True\n self.fc1 = nn.Linear(512 if self.has_downsample else output_dim, D,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n )\n self.frame_similarity_on_last_layer = frame_similarity_on_last_layer\n self.cls_layer1 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:510-526" + }, + "7103": { + "file_id": 516, + "content": "This code initializes a neural network model with a linear layer (`self.fc1`) that takes an input dimension of 512 if certain conditions are met, otherwise it takes the output_dim calculated earlier. The layer has D output dimensions and uses Xavier uniform initialization for weights and constant initialization for biases. Additionally, there's another linear layer (`self.cls_layer1`) with 1 output dimension that is initialized with Xavier uniform initialization for weights and a constant value of 0 for biases. 
It takes an input dimension of either 1152 or D based on whether frame similarity is added to the last layer or not.", + "type": "comment" + }, + "7104": { + "file_id": 516, + "content": " )\n self.cls_layer2 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n ) if use_many_hot_targets else None\n self.convex_comb_reg = ConvexCombinationRegularization(\n in_filters=(F * 2 ** (L - 1) * 4)) if use_convex_comb_reg else None\n def forward(self, inputs):\n assert list(inputs.shape[2:]) == [27, 48, 3] and inputs.dtype == paddle.float32, \\\n \"incorrect input type and/or shape\"\n out_dict = {}\n # shape [B, T, H, W, 3] to shape [B, 3, T, H, W]\n x = inputs.transpose([0, 4, 1, 2, 3])\n if self.use_resnet_features:\n x = self.resnet_layers(x)\n else:\n x = x / 255.\n inputs = inputs.clip(min=0).astype('uint8')\n if self.resnet_like_top:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:527-548" + }, + "7105": { + "file_id": 516, + "content": "The code defines a model with two layers, a Linear layer and ConvexCombinationRegularization, depending on the use_many_hot_targets and use_convex_comb_reg parameters. The Linear layer has 1 output for each frame, unless frame_similarity_on_last_layer is set, in which case it has D outputs. If use_many_hot_targets is False, the layer is None. The forward function receives inputs of shape [B, T, H, W, 3] and performs transpose, resnet_features processing (if use_resnet_features=True), and normalization to apply the model layers. It also clips the input values between 0 and 255 before applying the regularization if use_convex_comb_reg is True.", + "type": "comment" + }, + "7106": { + "file_id": 516, + "content": " x = self.resnet_like_top_conv(x)\n x = self.resnet_like_top_bn(x)\n x = self.resnet_like_top_max_pool(x)\n block_features = []\n for block in self.SDDCNN:\n x = block(x)\n block_features.append(x)\n if self.convex_comb_reg is not None:\n out_dict[\"alphas\"], out_dict[\"comb_reg_loss\"] = self.convex_comb_reg(inputs.transpose([0, 4, 1, 2, 3]), x)\n if self.use_mean_pooling:\n x = paddle.mean(x, axis=[3, 4])\n x = x.transpose([0, 2, 1])\n else:\n x = x.transpose([0, 2, 3, 4, 1])\n x = x.reshape([x.shape[0], x.shape[1], x.shape[2]*x.shape[3]*x.shape[4]])\n if self.frame_sim_layer is not None:\n x = paddle.concat([self.frame_sim_layer(block_features), x], 2)\n if self.color_hist_layer is not None:\n x = paddle.concat([self.color_hist_layer(inputs), x], 2)\n x = self.fc1(x)\n x = functional.relu(x)\n if self.dropout is not None:\n x = self.dropout(x)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:549-571" + }, + "7107": { + "file_id": 516, + "content": "This code performs feature extraction and pooling operations for a ConvNextV2 backbone model. It applies residual blocks, top convolutions, batch normalization, and max pooling to the input. Then it calculates convex combination regression if required. The code either applies mean pooling or 3D reshaping based on the use_mean_pooling flag. 
Finally, it concatenates frame similarity layer outputs and color histogram layer outputs before performing fully connected layer calculations and applying relu activation and dropout if necessary.", + "type": "comment" + }, + "7108": { + "file_id": 516, + "content": " if self.frame_sim_layer is not None and self.frame_similarity_on_last_layer:\n x = paddle.concat([self.frame_sim_layer(block_features), x], 2)\n one_hot = self.cls_layer1(x)\n if self.cls_layer2 is not None:\n out_dict[\"many_hot\"] = self.cls_layer2(x)\n if len(out_dict) > 0:\n return one_hot, out_dict\n return one_hot", + "type": "code", + "location": "/paddlevideo/modeling/backbones/transnetv2.py:572-581" + }, + "7109": { + "file_id": 516, + "content": "This code checks if the frame similarity layer and classifier layers are not None, then performs a concatenation operation on block features and x. It applies the classifier layer to the resulting output and optionally applies another classifier layer. The function returns one_hot and an optional out_dict if they exist.", + "type": "comment" + }, + "7110": { + "file_id": 517, + "content": "/paddlevideo/modeling/backbones/vit.py", + "type": "filepath" + }, + "7111": { + "file_id": 517, + "content": "The PaddleVideo code offers video processing functions, including a VisionTransformer class. It initializes and applies the model using parameters, transformations, and blocks while setting up components for future use.", + "type": "summary" + }, + "7112": { + "file_id": 517, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Callable\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\n__all__ = ['VisionTransformer']\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\ndef to_2tuple(x):\n return tuple([x] * 2)\ndef drop_path(x, drop_prob=0., training=False):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:1-37" + }, + "7113": { + "file_id": 517, + "content": "This code snippet is from the PaddleVideo library and contains a copyright notice, license information, and several helper functions. The VisionTransformer class will be defined later in the file, which serves as a backbone model for video processing tasks. 
The code defines constants for zero and one values, a function to convert a single value into a tuple of length 2 (to_2tuple), and a drop path function that applies dropout to inputs with a specified probability during training.", + "type": "comment" + }, + "7114": { + "file_id": 517, + "content": " \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)\n random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Identity(nn.Layer):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:38-65" + }, + "7115": { + "file_id": 517, + "content": "This code defines three classes: \"DropPath\", \"Identity\". The DropPath class implements dropout paths (Stochastic Depth) for each sample in the main path of residual blocks. It takes a single parameter, 'drop_prob', to control the probability of dropping out features. If 'drop_prob' is 0 or not training, it returns the input unchanged. The Identity class simply returns its input without any transformation.", + "type": "comment" + }, + "7116": { + "file_id": 517, + "content": " def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\nclass Mlp(nn.Layer):\n def __init__(self,\n in_features,\n hidden_features=None,\n out_features=None,\n act_layer=nn.GELU,\n drop=0.0):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)\n self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\nclass Attention(nn.Layer):\n def __init__(self,\n dim,\n num_heads=8,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.0,\n proj_drop=0.0):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:66-104" + }, + "7117": { + "file_id": 517, + "content": "The code defines three classes: Identity, Mlp, and Attention. Identity is a simple class that returns its input unchanged. Mlp stands for Multilayer Perceptron, and it's a feed-forward neural network layer. Attention is a class for implementing attention mechanisms in the model. 
Both Mlp and Attention classes take inputs and return outputs after applying their respective operations.", + "type": "comment" + }, + "7118": { + "file_id": 517, + "content": " super().__init__()\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n self.attn_drop = nn.Dropout(attn_drop)\n def forward(self, x):\n N, C = x.shape[1:]\n qkv = self.qkv(x).reshape(\n (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(\n (2, 0, 3, 1, 4))\n q, k, v = qkv[0], qkv[1], qkv[2]\n attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n attn = nn.functional.softmax(attn, axis=-1)\n attn = self.attn_drop(attn)\n x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n dim,\n num_heads,\n mlp_ratio=4.0,\n qkv_bias=False,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:105-138" + }, + "7119": { + "file_id": 517, + "content": "This code initializes a multi-head attention layer, and defines the forward pass. It reshapes input into query (Q), key (K), and value (V) matrices, calculates attention scores, applies dropout, and reconstructs output using residual connections and layer normalization. The `Block` class is also defined for building a Vision Transformer model.", + "type": "comment" + }, + "7120": { + "file_id": 517, + "content": " qk_scale=None,\n drop=0.0,\n attn_drop=0.0,\n drop_path=0.1,\n act_layer=nn.GELU,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n attention_type='divided_space_time'):\n super().__init__()\n self.attention_type = attention_type\n if isinstance(norm_layer, str):\n self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop)\n # Temporal Attention Parameters\n if self.attention_type == 'divided_space_time':", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:139-166" + }, + "7121": { + "file_id": 517, + "content": "This function is initializing a backbone model with specified parameters. It takes in arguments like attention_type, norm_layer, and others to define the model's layers, including its attention layer. If norm_layer is a string, it uses the given string as the normalization layer; if it's a Callable, it uses that function as the normalization layer. The code also checks if the attention type is 'divided_space_time'.", + "type": "comment" + }, + "7122": { + "file_id": 517, + "content": " if isinstance(norm_layer, str):\n self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.temporal_attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop)\n self.temporal_fc = nn.Linear(dim, dim)\n # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here\n self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n if isinstance(norm_layer, str):\n self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:167-185" + }, + "7123": { + "file_id": 517, + "content": "This code initializes the temporal normalization layer and attention mechanism for a Vision Transformer backbone. It also creates a linear layer and drop path, based on provided configurations. The norm_layer parameter can be a string representing the desired normalization layer or a Callable object. If not a valid type, it raises a TypeError.", + "type": "comment" + }, + "7124": { + "file_id": 517, + "content": " elif isinstance(norm_layer, Callable):\n self.norm2 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop)\n def forward(self, x, B, T, W):\n num_spatial_tokens = (x.shape[1] - 1) // T\n H = num_spatial_tokens // W\n if self.attention_type in ['space_only', 'joint_space_time']:\n x = x + self.drop_path(self.attn(self.norm1(x)))\n x = x + self.drop_path(self.mlp(self.norm2(x)))\n return x\n elif self.attention_type == 'divided_space_time':\n ########## Temporal ##########\n xt = x[:, 1:, :]\n _, _, _, _t, _m = B, H, W, T, xt.shape[-1]\n xt = xt.reshape([-1, _t, _m])\n res_temporal = self.drop_path(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:186-210" + }, + "7125": { + "file_id": 517, + "content": "The code defines a class and its forward method. It sets the normalization layer, calculates the number of spatial tokens, checks the attention type, applies normalization and MLP layers to the input, and performs divided space-time attention.", + "type": "comment" + }, + "7126": { + "file_id": 517, + "content": " self.temporal_attn(self.temporal_norm1(xt)))\n _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]\n res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m])\n res_temporal = self.temporal_fc(res_temporal)\n xt = x[:, 1:, :] + res_temporal\n ########## Spatial ##########\n init_cls_token = x[:, 0, :].unsqueeze(1)\n cls_token = init_cls_token.tile((1, T, 1))\n _b, _t, _m = cls_token.shape\n cls_token = cls_token.reshape([-1, _m]).unsqueeze(1)\n xs = xt\n _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]\n xs = xs.reshape([-1, _h, _w, _t, _m]).transpose(\n (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m])\n xs = paddle.concat((cls_token, xs), axis=1)\n res_spatial = self.drop_path(self.attn(self.norm1(xs)))\n # Taking care of CLS token\n cls_token = res_spatial[:, 0, :]\n _, _t, _m = B, T, cls_token.shape[-1]\n cls_token = cls_token.reshape([-1, _t, _m])", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:211-235" + }, + "7127": { + "file_id": 517, + "content": "This code performs spatial attention in the Vision Transformer model. It creates a cls_token, reshapes the input, concatenates it with the cls_token, and then passes it through a drop path and an attention layer. 
Finally, it extracts the cls_token for further use.", + "type": "comment" + }, + "7128": { + "file_id": 517, + "content": " # averaging for every frame\n cls_token = paddle.mean(cls_token, axis=1, keepdim=True)\n res_spatial = res_spatial[:, 1:, :]\n _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]\n res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose(\n (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m])\n res = res_spatial\n x = xt\n x = paddle.concat((init_cls_token, x), axis=1) + paddle.concat(\n (cls_token, res), axis=1)\n # Mlp\n x = x + self.drop_path(self.mlp(self.norm2(x)))\n return x\n else:\n raise NotImplementedError\nclass PatchEmbed(nn.Layer):\n \"\"\" Image to Patch Embedding\n \"\"\"\n def __init__(self,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768):\n super().__init__()\n img_size = to_2tuple(img_size)\n patch_size = to_2tuple(patch_size)\n num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:236-267" + }, + "7129": { + "file_id": 517, + "content": "This code performs averaging across frames, reshapes the spatial features, concatenates initial class token and input sequence, adds a drop path and MLP layer, and returns the output. It also defines PatchEmbed for image to patch embedding.", + "type": "comment" + }, + "7130": { + "file_id": 517, + "content": " patch_size[0])\n self.img_size = img_size\n self.patch_size = patch_size\n self.num_patches = num_patches\n self.proj = nn.Conv2D(in_channels,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n def forward(self, x):\n B, C, T, H, W = x.shape\n assert H == self.img_size[0] and W == self.img_size[1], \\\n f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n x = x.transpose((0, 2, 1, 3, 4))\n x = x.reshape([-1, C, H, W])\n x = self.proj(x)\n W = x.shape[-1]\n x = x.flatten(2).transpose((0, 2, 1))\n return x, T, W\n@BACKBONES.register()\nclass VisionTransformer(nn.Layer):\n \"\"\" Vision Transformer with support for patch input\n \"\"\"\n def __init__(self,\n pretrained=None,\n img_size=224,\n patch_size=16,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:268-298" + }, + "7131": { + "file_id": 517, + "content": "The code defines a VisionTransformer class that takes input patches of an image. It initializes the model parameters such as img_size, patch_size and num_patches. The forward function performs the transformation by projecting the input into embedding space using a convolutional layer. If the input image size does not match the expected model size, it raises an assertion error. 
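The patch embedding summarized above treats every frame of the clip independently: the video tensor [B, C, T, H, W] is folded into B*T images, a Conv2D whose kernel and stride equal the patch size turns each image into an (H/P) x (W/P) grid of tokens, and the grid is flattened into a sequence. A rough paddle sketch with assumed sizes:

```python
import paddle
import paddle.nn as nn

B, C, T, H, W = 2, 3, 8, 224, 224
patch_size, embed_dim = 16, 768

video = paddle.randn([B, C, T, H, W])
proj = nn.Conv2D(C, embed_dim, kernel_size=patch_size, stride=patch_size)

frames = video.transpose([0, 2, 1, 3, 4]).reshape([B * T, C, H, W])
tokens = proj(frames)                       # [B*T, embed_dim, H/16, W/16]
tokens = tokens.flatten(2).transpose([0, 2, 1])
print(tokens.shape)                         # [16, 196, 768] -> 14*14 patches per frame
```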
This class is registered with BACKBONES for future use.", + "type": "comment" + }, + "7132": { + "file_id": 517, + "content": " in_channels=3,\n embed_dim=768,\n depth=12,\n num_heads=12,\n mlp_ratio=4,\n qkv_bias=False,\n qk_scale=None,\n drop_rate=0.,\n attn_drop_rate=0.,\n drop_path_rate=0.1,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_seg=8,\n attention_type='divided_space_time',\n **args):\n super().__init__()\n self.pretrained = pretrained\n self.num_seg = num_seg\n self.attention_type = attention_type\n self.num_features = self.embed_dim = embed_dim\n self.patch_embed = PatchEmbed(img_size=img_size,\n patch_size=patch_size,\n in_channels=in_channels,\n embed_dim=embed_dim)\n num_patches = self.patch_embed.num_patches\n # Positional Embeddings\n self.cls_token = self.create_parameter(shape=(1, 1, embed_dim),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:299-327" + }, + "7133": { + "file_id": 517, + "content": "This code initializes a Vision Transformer (ViT) backbone model with specified parameters such as input dimensions, embedding dimension, depth, number of heads, mlp ratio, and attention type. The code sets up the patch embedding layer, creates a class token, and defines the number of patches based on the input size provided.", + "type": "comment" + }, + "7134": { + "file_id": 517, + "content": " default_initializer=zeros_)\n self.pos_embed = self.create_parameter(shape=(1, num_patches + 1,\n embed_dim),\n default_initializer=zeros_)\n self.pos_drop = nn.Dropout(p=drop_rate)\n if self.attention_type != 'space_only':\n self.time_embed = self.create_parameter(shape=(1, num_seg,\n embed_dim),\n default_initializer=zeros_)\n self.time_drop = nn.Dropout(p=drop_rate)\n self.add_parameter(\"pos_embed\", self.pos_embed)\n self.add_parameter(\"cls_token\", self.cls_token)\n dpr = np.linspace(0, drop_path_rate, depth)\n self.blocks = nn.LayerList([\n Block(dim=embed_dim,\n num_heads=num_heads,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:328-350" + }, + "7135": { + "file_id": 517, + "content": "This code initializes various components of a vision transformer model, including positional embeddings (pos_embed), classification token (cls_token), and dropout layers (pos_drop, time_drop). It also creates a LayerList of blocks with specified dimensions and parameters.", + "type": "comment" + }, + "7136": { + "file_id": 517, + "content": " drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[i],\n norm_layer=norm_layer,\n epsilon=epsilon,\n attention_type=self.attention_type) for i in range(depth)\n ])\n self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n trunc_normal_(self.pos_embed, std=0.02)\n trunc_normal_(self.cls_token, std=0.02)\n self.apply(self._init_fn)\n if self.attention_type == 'divided_space_time':\n i = 0\n for m in self.blocks.sublayers(include_self=True):\n m_str = str(m)\n if 'Block' in m_str:\n if i > 0:\n zeros_(m.temporal_fc.weight)\n zeros_(m.temporal_fc.bias)\n i += 1\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:351-379" + }, + "7137": { + "file_id": 517, + "content": "This code initializes a Vision Transformer (ViT) model. 
It creates a series of blocks with specified dimensions, applies normalization layers, and initializes the weight values using truncated normal distribution. Additionally, if pre-trained weights are provided, it loads them into the model.", + "type": "comment" + }, + "7138": { + "file_id": 517, + "content": " load_ckpt(self,\n self.pretrained,\n num_patches=self.patch_embed.num_patches,\n num_seg=self.num_seg,\n attention_type=self.attention_type)\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):\n ones_(m.weight)\n zeros_(m.bias)\n def forward_features(self, x):\n # B = x.shape[0]\n B = paddle.shape(x)[0]\n x, T, W = self.patch_embed(x) # [BT,nH*nW,F]\n cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]\n x = paddle.concat((cls_tokens, x), axis=1)\n pos_interp = (x.shape[1] != self.pos_embed.shape[1])\n if pos_interp:\n pos_embed = self.pos_embed\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:380-405" + }, + "7139": { + "file_id": 517, + "content": "This code initializes the forward function of a Vision Transformer (ViT) model. It extracts features from input images, adds positional embeddings, and handles batch size changes. The trunc_normal_ and zeros_ functions are used to initialize weights and biases for layers like Linear and LayerNorm, respectively.", + "type": "comment" + }, + "7140": { + "file_id": 517, + "content": " (0, 2, 1))\n P = int(other_pos_embed.shape[2]**0.5)\n H = x.shape[1] // W\n other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(H, W),\n mode='nearest')\n new_pos_embed = new_pos_embed.flatten(2)\n new_pos_embed = new_pos_embed.transpose((0, 2, 1))\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),\n axis=1)\n x = x + new_pos_embed\n else:\n x = x + self.pos_embed\n x = self.pos_drop(x)\n # Time Embeddings\n if self.attention_type != 'space_only':\n cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(\n T)[0].index_select(paddle.to_tensor([0]), axis=1)\n x = x[:, 1:]\n _, _n, _m = x.shape\n _t = T\n x = x.reshape([-1, _t, _n, _m]).transpose(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:406-430" + }, + "7141": { + "file_id": 517, + "content": "The code is applying relative position embeddings to the input features (x) for a vision transformer model. It first checks if a specific flag is set, then interpolates other position embeddings based on the size of the input and adds them to class position embeddings. If the flag is not set, it simply adds the position embeddings from the model. 
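The interpolation step described above handles running the model at a spatial resolution different from the one it was trained at: the learned grid of positional embeddings (excluding the class token's entry) is reshaped into a P x P map, resized to the new H x W token grid, flattened back into a sequence, and re-attached to the class-token embedding. A hedged paddle sketch of that resizing, with assumed shapes rather than the exact member names:

```python
import paddle
import paddle.nn.functional as F

embed_dim, P = 768, 14                      # trained with a 14x14 patch grid
H, W = 16, 20                               # token grid at the new resolution

pos_embed = paddle.randn([1, 1 + P * P, embed_dim])    # index 0 is the cls token
cls_pos = pos_embed[:, :1, :]                           # keep the cls entry as-is

grid = pos_embed[:, 1:, :].transpose([0, 2, 1]).reshape([1, embed_dim, P, P])
grid = F.interpolate(grid, size=(H, W), mode='nearest')
grid = grid.flatten(2).transpose([0, 2, 1])             # [1, H*W, D]

new_pos_embed = paddle.concat([cls_pos, grid], axis=1)  # [1, 1 + H*W, D]
print(new_pos_embed.shape)
```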
Afterward, the code applies time embeddings if the attention type is not \"space_only\".", + "type": "comment" + }, + "7142": { + "file_id": 517, + "content": " (0, 2, 1, 3)).reshape([-1, _t, _m])\n # Resizing time embeddings in case they don't match\n time_interp = (T != self.time_embed.shape[1])\n if time_interp: # T' != T\n time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(T, x.shape[-1]),\n mode='nearest').squeeze(0)\n new_time_embed = new_time_embed.transpose((0, 2, 1))\n x = x + new_time_embed\n else:\n x = x + self.time_embed\n x = self.time_drop(x)\n _, _t, _m = x.shape\n x = x.reshape([-1, W * W * T, _m])\n x = paddle.concat((cls_tokens, x), axis=1)\n # Attention blocks\n for blk in self.blocks:\n x = blk(x, B, T, W)\n # Predictions for space-only baseline\n if self.attention_type == 'space_only':\n _, _n, _m = x.shape", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:431-455" + }, + "7143": { + "file_id": 517, + "content": "This code performs time embeddings resizing and adds them to the input feature maps. It then flattens the tensor, concatenates class tokens, processes through attention blocks, and finally, for space-only attention type, it makes predictions.", + "type": "comment" + }, + "7144": { + "file_id": 517, + "content": " _t = T\n x = x.reshape([-1, _t, _n, _m])\n x = paddle.mean(x, 1) # averaging predictions for every frame\n x = self.norm(x)\n return x[:, 0] # [B, embed_dim]\n def forward(self, x):\n x = self.forward_features(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit.py:456-465" + }, + "7145": { + "file_id": 517, + "content": "This code snippet is part of a Vision Transformer (ViT) model implementation. The function averages predictions for every frame and applies normalization before returning the embeddings for each image in the input sequence.", + "type": "comment" + }, + "7146": { + "file_id": 518, + "content": "/paddlevideo/modeling/backbones/vit_tweaks.py", + "type": "filepath" + }, + "7147": { + "file_id": 518, + "content": "The PaddleVideo library's backbones code introduces the VisionTransformer_tweaks model with weight initialization, stochastic depth, spatial attention in ViT models, and transformer configurations. It is a time-based feature modification model that computes space-only predictions through attention blocks.", + "type": "summary" + }, + "7148": { + "file_id": 518, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Callable\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant\nfrom paddle.regularizer import L2Decay\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\n__all__ = ['VisionTransformer_tweaks']\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:1-32" + }, + "7149": { + "file_id": 518, + "content": "This code is from the PaddleVideo library's backbones module and defines the VisionTransformer_tweaks model. It imports necessary libraries, sets constant values, and includes function definitions for weight initialization and regularizers. The BACKBONES registry is also defined to categorize the model type.", + "type": "comment" + }, + "7150": { + "file_id": 518, + "content": "def to_2tuple(x):\n return tuple([x] * 2)\ndef rand_bbox(size, lam):\n \"\"\" rand_bbox \"\"\"\n w = size[2]\n h = size[3]\n cut_rat = np.sqrt(1. - lam)\n cut_w = np.int(w * cut_rat)\n cut_h = np.int(h * cut_rat)\n # uniform\n cx = np.random.randint(w)\n cy = np.random.randint(h)\n bbx1 = np.clip(cx - cut_w // 2, 0, w)\n bby1 = np.clip(cy - cut_h // 2, 0, h)\n bbx2 = np.clip(cx + cut_w // 2, 0, w)\n bby2 = np.clip(cy + cut_h // 2, 0, h)\n return bbx1, bby1, bbx2, bby2\ndef drop_path(x, drop_prob=0., training=False):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:35-69" + }, + "7151": { + "file_id": 518, + "content": "The code defines three functions. The \"to_2tuple\" function takes an input and returns a tuple with the same value repeated twice. The \"rand_bbox\" function generates random bounding box coordinates within the size of an image, given a specified probability. 
The \"drop_path\" function applies stochastic depth (dropout) to each sample in the main path of residual blocks with a specified dropout rate.", + "type": "comment" + }, + "7152": { + "file_id": 518, + "content": " random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Identity(nn.Layer):\n def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\nclass Mlp(nn.Layer):\n def __init__(self,\n in_features,\n hidden_features=None,\n out_features=None,\n act_layer=nn.GELU,\n drop=0.,\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:70-107" + }, + "7153": { + "file_id": 518, + "content": "This code defines a class called `Mlp` which is a fully connected layer with a middle layer and an output layer. It also includes an activation function (GELU by default) and a dropout layer (with drop probability specified). The class `DropPath` applies drop paths to stochastically mask layers during training, while the `Identity` class simply returns its input unchanged.", + "type": "comment" + }, + "7154": { + "file_id": 518, + "content": " self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\nclass Attention(nn.Layer):\n def __init__(self,\n dim,\n num_heads=8,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.,\n proj_drop=0.,\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n self.attn_drop = nn.Dropout(attn_drop)\n def forward(self, x):\n N, C = x.shape[1:]\n qkv = self.qkv(x).reshape(\n (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:108-144" + }, + "7155": { + "file_id": 518, + "content": "The code defines a neural network layer called \"Attention\" with several components including a Linear layer for the query-key-value (QKV) transform, and separate Dropout layers for the attention and projection operations. The forward function performs the multi-head self-attention operation on the input tensor x, reshaping it to apply the QKV transform, and then applying dropout for both the attention and projection steps before returning the result. 
This layer is commonly used in transformer models for processing sequential data.", + "type": "comment" + }, + "7156": { + "file_id": 518, + "content": " (2, 0, 3, 1, 4))\n q, k, v = qkv[0], qkv[1], qkv[2]\n attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n attn = nn.functional.softmax(attn, axis=-1)\n attn = self.attn_drop(attn)\n x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n dim,\n num_heads,\n mlp_ratio=4.0,\n qkv_bias=False,\n qk_scale=None,\n drop=0.0,\n attn_drop=0.0,\n drop_path=0.1,\n act_layer=nn.GELU,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n attention_type='divided_space_time',\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n self.attention_type = attention_type\n if isinstance(norm_layer, str):\n self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:145-178" + }, + "7157": { + "file_id": 518, + "content": "This code defines a `Block` class that implements an attention mechanism using query-key value (QKV) decomposition. The block also includes a multi-layer perceptron (MLP) layer and supports different attention types. The input dimensions, number of heads in the attention mechanism, and other parameters are passed to the constructor.", + "type": "comment" + }, + "7158": { + "file_id": 518, + "content": " elif isinstance(norm_layer, Callable):\n self.norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop,\n wd_bias=wd_bias,\n lr_mult=lr_mult)\n # Temporal Attention Parameters\n if self.attention_type == 'divided_space_time':\n if isinstance(norm_layer, str):\n self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:179-202" + }, + "7159": { + "file_id": 518, + "content": "This code checks the type of norm_layer and creates an instance of either a str or a paddle.nn.layer.Layer class for self.norm1. If no temporal attention is required, it raises a TypeError if norm_layer is neither a str nor a Callable. If divided space time attention is selected, it checks the type of norm_layer again and creates an instance of either a str or a paddle.nn.layer.Layer class for self.temporal_norm1.", + "type": "comment" + }, + "7160": { + "file_id": 518, + "content": " self.temporal_attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop,\n wd_bias=wd_bias,\n lr_mult=lr_mult)\n self.temporal_fc = nn.Linear(dim, dim)\n # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here\n self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n if isinstance(norm_layer, str):\n self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm2 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:203-221" + }, + "7161": { + "file_id": 518, + "content": "This code initializes the temporal attention module, a linear layer for temporal features, and a drop path for stochastic depth. It also handles norm_layer initialization according to its type.", + "type": "comment" + }, + "7162": { + "file_id": 518, + "content": " mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop,\n wd_bias=wd_bias,\n lr_mult=lr_mult)\n def forward(self, x, B, T, W):\n num_spatial_tokens = (x.shape[1] - 1) // T\n H = num_spatial_tokens // W\n if self.attention_type in ['space_only', 'joint_space_time']:\n x = paddle.add(x, self.drop_path(self.attn(self.norm1(x))))\n x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x))))\n return x\n elif self.attention_type == 'divided_space_time':\n ########## Temporal ##########\n xt = x[:, 1:, :]\n _, _, _, _t, _m = B, H, W, T, xt.shape[-1]\n xt = xt.reshape([-1, _t, _m])\n res_temporal = self.drop_path(\n self.temporal_attn(self.temporal_norm1(xt)))\n _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:223-247" + }, + "7163": { + "file_id": 518, + "content": "Code defines a backbone for Vision Transformer (ViT) with tweaks and handles the forward pass. The MLP layer is added, and attention type can be space-only, joint space-time or divided space-time. In divided space-time, it also includes temporal attention.", + "type": "comment" + }, + "7164": { + "file_id": 518, + "content": " res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m])\n res_temporal = self.temporal_fc(res_temporal)\n xt = paddle.add(x[:, 1:, :], res_temporal)\n ########## Spatial ##########\n init_cls_token = x[:, 0, :].unsqueeze(1)\n cls_token = init_cls_token.tile((1, T, 1))\n _b, _t, _m = cls_token.shape\n cls_token = cls_token.reshape([-1, _m]).unsqueeze(1)\n xs = xt\n _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]\n xs = xs.reshape([-1, _h, _w, _t, _m]).transpose(\n (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m])\n xs = paddle.concat((cls_token, xs), axis=1)\n res_spatial = self.drop_path(self.attn(self.norm1(xs)))\n # Taking care of CLS token\n cls_token = res_spatial[:, 0, :]\n _, _t, _m = B, T, cls_token.shape[-1]\n cls_token = cls_token.reshape([-1, _t, _m])\n # averaging for every frame\n cls_token = paddle.mean(cls_token, axis=1, keepdim=True)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:248-271" + }, + "7165": { + "file_id": 518, + "content": "This code performs spatial attention in a ViT model. 
It reshapes the input, concatenates the class token with the reshaped input, applies normalization and self-attention, and finally averages the class tokens for each frame to obtain a contextual representation.", + "type": "comment" + }, + "7166": { + "file_id": 518, + "content": " res_spatial = res_spatial[:, 1:, :]\n _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]\n res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose(\n (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m])\n res = res_spatial\n x = xt\n x = paddle.add(paddle.concat((init_cls_token, x), axis=1),\n paddle.concat((cls_token, res), axis=1))\n # Mlp\n x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x))))\n return x\n else:\n raise NotImplementedError\nclass PatchEmbed(nn.Layer):\n \"\"\" Image to Patch Embedding\n \"\"\"\n def __init__(self,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768,\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n img_size = to_2tuple(img_size)\n patch_size = to_2tuple(patch_size)\n num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:273-302" + }, + "7167": { + "file_id": 518, + "content": "This code is from the PaddleVideo library and defines a PatchEmbed class for image to patch embedding. It takes in parameters such as img_size, patch_size, in_channels, embed_dim, wd_bias, and lr_mult. The class performs image to patch embedding by dividing the input image into patches of specified size and flattening them into a 2D feature map. The code also includes a NotImplementedError for certain conditions, suggesting that some parts may not be fully implemented yet.", + "type": "comment" + }, + "7168": { + "file_id": 518, + "content": " patch_size[0])\n self.img_size = img_size\n self.patch_size = patch_size\n self.num_patches = num_patches\n self.proj = nn.Conv2D(in_channels,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n def forward(self, x):\n B, C, T, H, W = x.shape\n assert H == self.img_size[0] and W == self.img_size[1], \\\n f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n x = x.transpose((0, 2, 1, 3, 4)) # [B,T,C,H,W]\n x = x.reshape([-1, C, H, W]) # [BT,C,H,W]\n x = self.proj(x) # [BT,F,nH,nW]\n W = x.shape[-1]\n x = x.flatten(2).transpose((0, 2, 1)) # [BT,F,nHnW]\n return x, T, W\n@BACKBONES.register()\nclass VisionTransformer_tweaks(nn.Layer):\n \"\"\" Vision Transformer with support for patch input\n \"\"\"\n def __init__(self,\n pretrained=None,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:303-331" + }, + "7169": { + "file_id": 518, + "content": "This code defines a VisionTransformer with patch input. The model takes an image of size img_size and divides it into patches of size patch_size, extracting features from each patch using the Conv2D layer. The forward method reshapes the input and passes it through the projection convolution. 
It then flattens the output and returns the result along with the number of patches (T) and the total number of image pixels (W).", + "type": "comment" + }, + "7170": { + "file_id": 518, + "content": " img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768,\n depth=12,\n num_heads=12,\n mlp_ratio=4,\n qkv_bias=False,\n qk_scale=None,\n drop_rate=0.,\n attn_drop_rate=0.,\n drop_path_rate=0.1,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_seg=8,\n attention_type='divided_space_time',\n wd_bias=True,\n lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],\n **args):\n super().__init__()\n self.pretrained = pretrained\n self.num_seg = num_seg\n self.attention_type = attention_type\n self.lr_mult_list = lr_mult_list\n self.num_features = self.embed_dim = embed_dim\n self.patch_embed = PatchEmbed(img_size=img_size,\n patch_size=patch_size,\n in_channels=in_channels,", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:332-360" + }, + "7171": { + "file_id": 518, + "content": "This code initializes a ViT (Vision Transformer) model with specified dimensions and parameters. It uses the PatchEmbed class to embed input images, sets the number of segments for attention, and defines the learning rate multipliers for each stage of the model. It also specifies whether to use pre-trained weights.", + "type": "comment" + }, + "7172": { + "file_id": 518, + "content": " embed_dim=embed_dim,\n wd_bias=wd_bias,\n lr_mult=self.lr_mult_list[0])\n num_patches = self.patch_embed.num_patches\n # Positional Embeddings\n self.cls_token = self.create_parameter(\n shape=(1, 1, embed_dim),\n default_initializer=zeros_,\n attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.pos_embed = self.create_parameter(\n shape=(1, num_patches + 1, embed_dim),\n default_initializer=zeros_,\n attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.pos_drop = nn.Dropout(p=drop_rate)\n if self.attention_type != 'space_only':\n self.time_embed = self.create_parameter(\n shape=(1, num_seg, embed_dim),\n default_initializer=zeros_,\n attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.time_drop = nn.Dropout(p=drop_rate)\n self.add_parameter(\"pos_embed\", self.pos_embed)", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:361-384" + }, + "7173": { + "file_id": 518, + "content": "This code initializes the positional and time embeddings for a transformer model. It creates a cls_token, pos_embed and optionally time_embed with specified dimensions and regularizers. 
It also adds dropout layers for positional and temporal features, if needed.", + "type": "comment" + }, + "7174": { + "file_id": 518, + "content": " self.add_parameter(\"cls_token\", self.cls_token)\n dpr = np.linspace(0, drop_path_rate, depth)\n self.blocks = nn.LayerList([\n Block(dim=embed_dim,\n num_heads=num_heads,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[i],\n norm_layer=norm_layer,\n epsilon=epsilon,\n attention_type=self.attention_type,\n wd_bias=wd_bias,\n lr_mult=self.lr_mult_list[(i // 4) + 1]) for i in range(depth)\n ])\n self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n trunc_normal_(self.pos_embed, std=0.02)\n trunc_normal_(self.cls_token, std=0.02)\n self.apply(self._init_fn)\n if self.attention_type == 'divided_space_time':\n i = 0", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:385-414" + }, + "7175": { + "file_id": 518, + "content": "The code initializes a transformer model with blocks, adds parameters for position and classification tokens, creates a layer list of blocks with varying drop paths and attention types, and applies weight initialization to the positional embeddings, classification token, and layers.", + "type": "comment" + }, + "7176": { + "file_id": 518, + "content": " for m in self.blocks.sublayers(include_self=True):\n m_str = str(m)\n if 'Block' in m_str:\n if i > 0:\n zeros_(m.temporal_fc.weight)\n zeros_(m.temporal_fc.bias)\n i += 1\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights\n load_ckpt(self,\n self.pretrained,\n num_patches=self.patch_embed.num_patches,\n num_seg=self.num_seg,\n attention_type=self.attention_type)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n pass\n else:\n raise NotImplementedError\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:415-441" + }, + "7177": { + "file_id": 518, + "content": "Initializing the backbone network by iterating through each sublayer, setting temporal_fc weight and bias to zeros if it's a Block type. If pretrained weights are provided, load them after checking the input. Else, continue with no change or raise an error for unsupported inputs. 
Initialize the network parameters using truncated normal distribution for Linear layers and setting bias of LayerNorm layers to zero.", + "type": "comment" + }, + "7178": { + "file_id": 518, + "content": " ones_(m.weight)\n zeros_(m.bias)\n def forward_features(self, x):\n # B = x.shape[0]\n B = paddle.shape(x)[0]\n x, T, W = self.patch_embed(x) # [BT,nH*nW,F]\n cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]\n x = paddle.concat((cls_tokens, x), axis=1)\n pos_interp = (x.shape[1] != self.pos_embed.shape[1])\n if pos_interp:\n pos_embed = self.pos_embed\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(\n (0, 2, 1))\n P = int(other_pos_embed.shape[2]**0.5)\n H = x.shape[1] // W\n other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(H, W),\n mode='nearest')\n new_pos_embed = new_pos_embed.flatten(2)\n new_pos_embed = new_pos_embed.transpose((0, 2, 1))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:442-464" + }, + "7179": { + "file_id": 518, + "content": "This code snippet is part of a transformer model's forward pass implementation. It reshapes the positional embeddings to match the patch embedding dimension and performs interpolation if necessary, ensuring the correct size for the subsequent layers.", + "type": "comment" + }, + "7180": { + "file_id": 518, + "content": " new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),\n axis=1)\n x = paddle.add(x, new_pos_embed)\n else:\n x = paddle.add(x, self.pos_embed)\n x = self.pos_drop(x)\n # Time Embeddings\n if self.attention_type != 'space_only':\n cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(\n T)[0].index_select(paddle.to_tensor([0]), axis=1)\n x = x[:, 1:]\n _, _n, _m = x.shape\n _t = T\n x = x.reshape([-1, _t, _n, _m]).transpose(\n (0, 2, 1, 3)).reshape([-1, _t, _m])\n # Resizing time embeddings in case they don't match\n time_interp = (T != self.time_embed.shape[1])\n if time_interp: # T' != T\n time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(T, x.shape[-1]),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:465-487" + }, + "7181": { + "file_id": 518, + "content": "This code is part of a vision transformer model. It concatenates the class position embeddings with new position embeddings, adds them to the input tensor, and applies positional dropout. 
If attention type is not \"space_only,\" it extracts time embeddings from the input tensor, reshapes them, interpolates time embeddings if their size doesn't match, and performs some operations on them.", + "type": "comment" + }, + "7182": { + "file_id": 518, + "content": " mode='nearest').squeeze(0)\n new_time_embed = new_time_embed.transpose((0, 2, 1))\n x = paddle.add(x, new_time_embed)\n else:\n x = paddle.add(x, self.time_embed)\n x = self.time_drop(x)\n _, _t, _m = x.shape\n x = x.reshape([-1, W * W * T, _m])\n x = paddle.concat((cls_tokens, x), axis=1)\n # Attention blocks\n for blk in self.blocks:\n x = blk(x, B, T, W)\n # Predictions for space-only baseline\n if self.attention_type == 'space_only':\n _, _n, _m = x.shape\n _t = T\n x = x.reshape([-1, _t, _n, _m])\n x = paddle.mean(x, 1) # averaging predictions for every frame\n x = self.norm(x)\n return x[:, 0] # [B, embed_dim]\n def forward(self, x):\n x = self.forward_features(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/backbones/vit_tweaks.py:488-515" + }, + "7183": { + "file_id": 518, + "content": "This code performs time-based feature modification and passes the data through attention blocks for a Vision Transformer model. It also provides an option to compute space-only predictions by averaging predictions for every frame. The forward function applies the forward_features transformation before passing data through attention blocks and normalization.", + "type": "comment" + }, + "7184": { + "file_id": 519, + "content": "/paddlevideo/modeling/backbones/yowo.py", + "type": "filepath" + }, + "7185": { + "file_id": 519, + "content": "The CAM_Module and YOWO backbone model are for image processing and video classification respectively, using attention mechanism and convolutional layers. The code loads pretrain weights correctly and returns a Paddle Video YOWO model after processing input clips through backbones, CFAM, and convolutional layers.", + "type": "summary" + }, + "7186": { + "file_id": 519, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..registry import BACKBONES\nfrom .darknet import Darknet\nfrom .resnext101 import ResNext101\nimport paddle.nn as nn\nimport paddle\nclass CAM_Module(nn.Layer):\n def __init__(self, in_dim):\n super(CAM_Module, self).__init__()\n self.chanel_in = in_dim\n temp = paddle.zeros([1], dtype='float32')\n self.gamma = paddle.create_parameter(shape=temp.shape, dtype=str(temp.numpy().dtype),\n default_initializer=paddle.nn.initializer.Assign(temp))", + "type": "code", + "location": "/paddlevideo/modeling/backbones/yowo.py:1-28" + }, + "7187": { + "file_id": 519, + "content": "This code is a part of the PaddleVideo library and defines a custom layer called CAM_Module. It takes an input dimension as a parameter, initializes a gamma parameter, and inherits from nn.Layer. 
The class constructor creates a zero-dimensional tensor as the initial value for gamma using paddle.create_parameter function. This module is used in backbone architectures to enable Channel Attention Mechanism for image processing tasks.", + "type": "comment" + }, + "7188": { + "file_id": 519, + "content": " self.softmax = nn.Softmax(axis=-1)\n def forward(self, x):\n m_batchsize, C, height, width = x.shape\n proj_query = paddle.reshape(x, [m_batchsize, C, -1])\n proj_key = paddle.transpose(paddle.reshape(\n x, [m_batchsize, C, -1]), perm=[0, 2, 1])\n energy = paddle.bmm(proj_query, proj_key)\n energy_new = paddle.expand_as(paddle.max(\n energy, axis=-1, keepdim=True), energy) - energy\n attention = self.softmax(energy_new)\n proj_value = paddle.reshape(x, [m_batchsize, C, -1])\n out = paddle.bmm(attention, proj_value)\n out = out.reshape([m_batchsize, C, height, width])\n out = self.gamma * out + x\n return out\nclass CFAMBlock(nn.Layer):\n def __init__(self, in_channels, out_channels):\n super(CFAMBlock, self).__init__()\n inter_channels = 1024\n self.conv_bn_relu1 = nn.Sequential(nn.Conv2D(in_channels, inter_channels, kernel_size=1, bias_attr=False),\n nn.BatchNorm2D(inter_channels),", + "type": "code", + "location": "/paddlevideo/modeling/backbones/yowo.py:29-53" + }, + "7189": { + "file_id": 519, + "content": "The code defines a CFAMPBlock layer with a Channel-wise Attention Mechanism. It contains a convolution, batch normalization, and ReLU layers for the attention mechanism, followed by a gamma scaling and channel-wise attention calculation. The forward function performs the attention operation and scales the input using the attention map.", + "type": "comment" + }, + "7190": { + "file_id": 519, + "content": " nn.ReLU())\n self.conv_bn_relu2 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False),\n nn.BatchNorm2D(inter_channels),\n nn.ReLU())\n self.sc = CAM_Module(inter_channels)\n self.conv_bn_relu3 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False),\n nn.BatchNorm2D(inter_channels),\n nn.ReLU())\n self.conv_out = nn.Sequential(nn.Dropout2D(0.1), nn.Conv2D(\n inter_channels, out_channels, 1, bias_attr=True))\n def forward(self, x):\n x = self.conv_bn_relu1(x)\n x = self.conv_bn_relu2(x)\n x = self.sc(x)\n x = self.conv_bn_relu3(x)\n output = self.conv_out(x)\n return output\n@BACKBONES.register()\nclass YOWO(nn.Layer):\n def __init__(self, num_class, pretrained_2d=None, pretrained_3d=None):", + "type": "code", + "location": "/paddlevideo/modeling/backbones/yowo.py:54-79" + }, + "7191": { + "file_id": 519, + "content": "This code defines a YOWO backbone model, which is a neural network architecture for video classification tasks. It consists of several convolutional layers followed by batch normalization and ReLU activations. The CAM_Module is also included, which might be a custom attention mechanism. The output channels are adjusted based on the input size. 
Dropout regularization is applied to prevent overfitting.", + "type": "comment" + }, + "7192": { + "file_id": 519, + "content": " super(YOWO, self).__init__()\n self.pretrained_2d = pretrained_2d\n self.pretrained_3d = pretrained_3d\n self.backbone_2d = Darknet()\n self.backbone_3d = ResNext101()\n self.num_ch_2d = 425\n self.num_ch_3d = 2048\n self.num_class = num_class\n self.cfam = CFAMBlock(self.num_ch_2d + self.num_ch_3d, 1024)\n self.conv_final = nn.Conv2D(\n 1024, 5 * (self.num_class + 4 + 1), kernel_size=1, bias_attr=False)\n self.seen = 0\n def init_weights(self):\n if self.pretrained_2d is not None:\n self.backbone_2d = self.load_pretrain_weight(\n self.backbone_2d, self.pretrained_2d)\n if self.pretrained_3d is not None:\n self.backbone_3d = self.load_pretrain_weight(\n self.backbone_3d, self.pretrained_3d)\n def load_pretrain_weight(self, model, weights_path):\n model_dict = model.state_dict()\n param_state_dict = paddle.load(weights_path)\n ignore_weights = set()\n # hack: fit for faster rcnn. Pretrain weights contain prefix of 'backbone'", + "type": "code", + "location": "/paddlevideo/modeling/backbones/yowo.py:80-108" + }, + "7193": { + "file_id": 519, + "content": "The code initializes a YOWO model with pre-trained 2D and 3D backbones, loads pre-trained weights if provided for both backbones, and has a method to initialize weights.", + "type": "comment" + }, + "7194": { + "file_id": 519, + "content": " # while res5 module is located in bbox_head.head. Replace the prefix of\n # res5 with 'bbox_head.head' to load pretrain weights correctly.\n for k in list(param_state_dict.keys()):\n if 'backbone.res5' in k:\n new_k = k.replace('backbone', 'bbox_head.head')\n if new_k in model_dict.keys():\n value = param_state_dict.pop(k)\n param_state_dict[new_k] = value\n for name, weight in param_state_dict.items():\n if name in model_dict.keys():\n if list(weight.shape) != list(model_dict[name].shape):\n print(\n '{} not used, shape {} unmatched with {} in model.'.format(\n name, weight.shape, list(model_dict[name].shape)))\n ignore_weights.add(name)\n else:\n print('Redundant weight {} and ignore it.'.format(name))\n ignore_weights.add(name)\n for weight in ignore_weights:", + "type": "code", + "location": "/paddlevideo/modeling/backbones/yowo.py:109-129" + }, + "7195": { + "file_id": 519, + "content": "This code is replacing the prefix of 'res5' with 'bbox_head.head' in param_state_dict to load pretrain weights correctly. It then checks if the weight shapes match and adds redundant or unmatched weights to ignore_weights.", + "type": "comment" + }, + "7196": { + "file_id": 519, + "content": " param_state_dict.pop(weight, None)\n model.set_dict(param_state_dict)\n print('Finish loading model weights: {}'.format(weights_path))\n return model\n def forward(self, input):\n x_3d = input # Input clip\n x_2d = input[:, :, -1, :, :] # Last frame of the clip that is read\n x_2d = self.backbone_2d(x_2d)\n x_3d = self.backbone_3d(x_3d)\n x_3d = paddle.squeeze(x_3d, axis=2)\n x = paddle.concat([x_3d, x_2d], axis=1)\n x = self.cfam(x)\n out = self.conv_final(x)\n return out", + "type": "code", + "location": "/paddlevideo/modeling/backbones/yowo.py:130-150" + }, + "7197": { + "file_id": 519, + "content": "This function loads model weights from the specified path and returns a Paddle Video YOWO model. 
The model's `forward` method takes an input clip, separates it into 3D and 2D representations, passes them through their respective backbones, concatenates them together, and finally feeds it to CFAM and a convolutional layer for processing before returning the output.", + "type": "comment" + }, + "7198": { + "file_id": 520, + "content": "/paddlevideo/modeling/bbox_utils.py", + "type": "filepath" + }, + "7199": { + "file_id": 520, + "content": "This code calculates delta between bounding boxes, adjusts using weighted averages, provides functions for filtering, computing overlaps, generating anchor points, decoding YOLO boxes, and calculating IoU. It transforms coordinates, computes deltas, stacks results, calculates dimensions and center of rotated boxes, converts rectangles to polygons, and finds the best begin point for a coordinate.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/72.json b/docs/data/72.json new file mode 100644 index 000000000..52d0a0df0 --- /dev/null +++ b/docs/data/72.json @@ -0,0 +1,543 @@ +{ + "7200": { + "file_id": 520, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn.functional as F\nimport math\nimport numpy as np\ndef bbox2delta(src_boxes, tgt_boxes, weights):\n src_w = src_boxes[:, 2] - src_boxes[:, 0]\n src_h = src_boxes[:, 3] - src_boxes[:, 1]\n src_ctr_x = src_boxes[:, 0] + 0.5 * src_w\n src_ctr_y = src_boxes[:, 1] + 0.5 * src_h\n tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]\n tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]\n tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:1-30" + }, + "7201": { + "file_id": 520, + "content": "This code calculates the delta between source and target bounding boxes. It first computes the width and height of both source and target boxes, then their center coordinates. 
The variables are initialized and calculated for further usage in other functions related to bounding box transformation.", + "type": "comment" + }, + "7202": { + "file_id": 520, + "content": " tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h\n wx, wy, ww, wh = weights\n dx = wx * (tgt_ctr_x - src_ctr_x) / src_w\n dy = wy * (tgt_ctr_y - src_ctr_y) / src_h\n dw = ww * paddle.log(tgt_w / src_w)\n dh = wh * paddle.log(tgt_h / src_h)\n deltas = paddle.stack((dx, dy, dw, dh), axis=1)\n return deltas\ndef delta2bbox(deltas, boxes, weights):\n clip_scale = math.log(1000.0 / 16)\n widths = boxes[:, 2] - boxes[:, 0]\n heights = boxes[:, 3] - boxes[:, 1]\n ctr_x = boxes[:, 0] + 0.5 * widths\n ctr_y = boxes[:, 1] + 0.5 * heights\n wx, wy, ww, wh = weights\n dx = deltas[:, 0::4] / wx\n dy = deltas[:, 1::4] / wy\n dw = deltas[:, 2::4] / ww\n dh = deltas[:, 3::4] / wh\n # Prevent sending too large values into paddle.exp()\n dw = paddle.clip(dw, max=clip_scale)\n dh = paddle.clip(dh, max=clip_scale)\n pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)\n pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)\n pred_w = paddle.exp(dw) * widths.unsqueeze(1)\n pred_h = paddle.exp(dh) * heights.unsqueeze(1)", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:31-63" + }, + "7203": { + "file_id": 520, + "content": "This code calculates the differentials (deltas) between target and source bounding boxes, then converts those deltas back into new bounding box coordinates. The conversion is done with weighted averages for x, y, width, and height adjustments, ensuring values are clipped to avoid large inputs for paddle.exp().", + "type": "comment" + }, + "7204": { + "file_id": 520, + "content": " pred_boxes = []\n pred_boxes.append(pred_ctr_x - 0.5 * pred_w)\n pred_boxes.append(pred_ctr_y - 0.5 * pred_h)\n pred_boxes.append(pred_ctr_x + 0.5 * pred_w)\n pred_boxes.append(pred_ctr_y + 0.5 * pred_h)\n pred_boxes = paddle.stack(pred_boxes, axis=-1)\n return pred_boxes\ndef expand_bbox(bboxes, scale):\n w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5\n h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5\n x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5\n y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5\n w_half *= scale\n h_half *= scale\n bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32)\n bboxes_exp[:, 0] = x_c - w_half\n bboxes_exp[:, 2] = x_c + w_half\n bboxes_exp[:, 1] = y_c - h_half\n bboxes_exp[:, 3] = y_c + h_half\n return bboxes_exp\ndef clip_bbox(boxes, im_shape):\n h, w = im_shape[0], im_shape[1]\n x1 = boxes[:, 0].clip(0, w)\n y1 = boxes[:, 1].clip(0, h)\n x2 = boxes[:, 2].clip(0, w)\n y2 = boxes[:, 3].clip(0, h)\n return paddle.stack([x1, y1, x2, y2], axis=1)\ndef nonempty_bbox(boxes, min_size=0, return_mask=False):", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:65-102" + }, + "7205": { + "file_id": 520, + "content": "The code contains three functions: `expand_bbox`, `clip_bbox`, and `nonempty_bbox`. `expand_bbox` takes bbox coordinates, scales them by a factor, and returns the expanded bboxes. `clip_bbox` clips the bbox coordinates to the image shape boundaries. 
`nonempty_bbox` filters out empty bounding boxes based on a minimum size threshold or returns a mask if return_mask is True.", + "type": "comment" + }, + "7206": { + "file_id": 520, + "content": " w = boxes[:, 2] - boxes[:, 0]\n h = boxes[:, 3] - boxes[:, 1]\n mask = paddle.logical_and(w > min_size, w > min_size)\n if return_mask:\n return mask\n keep = paddle.nonzero(mask).flatten()\n return keep\ndef bbox_area(boxes):\n return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])\ndef bbox_overlaps(boxes1, boxes2):\n \"\"\"\n Calculate overlaps between boxes1 and boxes2\n Args:\n boxes1 (Tensor): boxes with shape [M, 4]\n boxes2 (Tensor): boxes with shape [N, 4]\n Return:\n overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]\n \"\"\"\n area1 = bbox_area(boxes1)\n area2 = bbox_area(boxes2)\n xy_max = paddle.minimum(\n paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])\n xy_min = paddle.maximum(\n paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])\n width_height = xy_max - xy_min\n width_height = width_height.clip(min=0)\n inter = width_height.prod(axis=2)\n overlaps = paddle.where(inter > 0, inter /\n (paddle.unsqueeze(area1, 1) + area2 - inter),", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:103-139" + }, + "7207": { + "file_id": 520, + "content": "The function `bbox_utils.py` at line 102-138 contains two functions: 'filter_boxes_by_size' and 'bbox_overlaps'.\n'filter_boxes_by_size' filters the bounding boxes by size, only keeping those whose width or height exceeds a specified minimum size. If a mask is also desired, it returns one of true values for the selected bounding boxes.\n'bbox_overlaps' calculates overlaps between two sets of bounding boxes and returns them as a tensor with shape [M, N]. This function uses the areas of the bounding boxes to compute the intersections and unions, applying clipping for valid computations and handling non-intersecting boxes.\n\nExplanation: The code contains functions that filter bounding boxes by size and calculate overlaps between them. 
The 'filter_boxes_by_size' function filters bounding boxes based on their width or height, while the 'bbox_overlaps' function calculates overlap between two sets of bounding boxes and returns a tensor with shape [M, N].", + "type": "comment" + }, + "7208": { + "file_id": 520, + "content": " paddle.zeros_like(inter))\n return overlaps\ndef xywh2xyxy(box):\n x, y, w, h = box\n x1 = x - w * 0.5\n y1 = y - h * 0.5\n x2 = x + w * 0.5\n y2 = y + h * 0.5\n return [x1, y1, x2, y2]\ndef make_grid(h, w, dtype):\n yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])\n return paddle.stack((xv, yv), 2).cast(dtype=dtype)\ndef decode_yolo(box, anchor, downsample_ratio):\n \"\"\"decode yolo box\n Args:\n box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n anchor (list): anchor with the shape [na, 2]\n downsample_ratio (int): downsample ratio, default 32\n scale (float): scale, default 1.\n Return:\n box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]\n \"\"\"\n x, y, w, h = box\n na, grid_h, grid_w = x.shape[1:4]\n grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))\n x1 = (x + grid[:, :, :, :, 0:1]) / grid_w\n y1 = (y + grid[:, :, :, :, 1:2]) / grid_h\n anchor = paddle.to_tensor(anchor)", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:140-176" + }, + "7209": { + "file_id": 520, + "content": "The code defines functions for converting box coordinates, generating a grid of anchor points, and decoding YOLO bounding boxes. The \"xywh2xyxy\" function transforms (x, y, w, h) to (x1, y1, x2, y2). The \"make_grid\" function generates a grid of coordinates for downsampled images. The \"decode_yolo\" function decodes YOLO bounding boxes using anchor points and downsample ratios.", + "type": "comment" + }, + "7210": { + "file_id": 520, + "content": " anchor = paddle.cast(anchor, x.dtype)\n anchor = anchor.reshape((1, na, 1, 1, 2))\n w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)\n h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)\n return [x1, y1, w1, h1]\ndef iou_similarity(box1, box2, eps=1e-9):\n \"\"\"Calculate iou of box1 and box2\n Args:\n box1 (Tensor): box with the shape [N, M1, 4]\n box2 (Tensor): box with the shape [N, M2, 4]\n Return:\n iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]\n \"\"\"\n box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4]\n box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4]\n px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]\n gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]\n x1y1 = paddle.maximum(px1y1, gx1y1)\n x2y2 = paddle.minimum(px2y2, gx2y2)\n overlap = (x2y2 - x1y1).clip(0).prod(-1)\n area1 = (px2y2 - px1y1).clip(0).prod(-1)\n area2 = (gx2y2 - gx1y1).clip(0).prod(-1)\n union = area1 + area2 - overlap + eps", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:177-204" + }, + "7211": { + "file_id": 520, + "content": "Code defines anchor and calculates width (w1) and height (h1) for bounding boxes based on exponential values of w and h, downsample ratio, and grid dimensions. 
It also includes a function iou_similarity that calculates the intersection over union (IoU) between two sets of bounding boxes.", + "type": "comment" + }, + "7212": { + "file_id": 520, + "content": " return overlap / union\ndef bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):\n \"\"\"calculate the iou of box1 and box2\n Args:\n box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n giou (bool): whether use giou or not, default False\n diou (bool): whether use diou or not, default False\n ciou (bool): whether use ciou or not, default False\n eps (float): epsilon to avoid divide by zero\n Return:\n iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1]\n \"\"\"\n px1, py1, px2, py2 = box1\n gx1, gy1, gx2, gy2 = box2\n x1 = paddle.maximum(px1, gx1)\n y1 = paddle.maximum(py1, gy1)\n x2 = paddle.minimum(px2, gx2)\n y2 = paddle.minimum(py2, gy2)\n overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))\n area1 = (px2 - px1) * (py2 - py1)\n area1 = area1.clip(0)\n area2 = (gx2 - gx1) * (gy2 - gy1)\n area2 = area2.clip(0)\n union = area1 + area2 - overlap + eps", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:205-237" + }, + "7213": { + "file_id": 520, + "content": "This function calculates the intersection over union (IoU) between two bounding boxes, box1 and box2. It supports various IoU metrics such as Giou, Diou, or Ciou. The calculated IoU is returned as a tensor with the same shape as box1 and box2. This function is used in object detection tasks to measure the overlap between predicted and ground truth bounding boxes.", + "type": "comment" + }, + "7214": { + "file_id": 520, + "content": " iou = overlap / union\n if giou or ciou or diou:\n # convex w, h\n cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)\n ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)\n if giou:\n c_area = cw * ch + eps\n return iou - (c_area - union) / c_area\n else:\n # convex diagonal squared\n c2 = cw**2 + ch**2 + eps\n # center distance\n rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4\n if diou:\n return iou - rho2 / c2\n else:\n w1, h1 = px2 - px1, py2 - py1 + eps\n w2, h2 = gx2 - gx1, gy2 - gy1 + eps\n delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)\n v = (4 / math.pi**2) * paddle.pow(delta, 2)\n alpha = v / (1 + eps - iou + v)\n alpha.stop_gradient = True\n return iou - (rho2 / c2 + v * alpha)\n else:\n return iou\ndef rect2rbox(bboxes):\n \"\"\"\n :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax)", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:238-268" + }, + "7215": { + "file_id": 520, + "content": "This code calculates the Intersection over Union (IoU) between two bounding boxes and applies various forms of IoU calculations depending on the input parameters. It first checks if giou, ciou, or diou is True and then proceeds with the corresponding calculation based on the convex area, diagonal distance, or aspect ratio difference between the bounding boxes. 
The rect2rbox function transforms a set of bounding boxes from (xmin, ymin, xmax, ymax) format to (cx, cy, w, h) format where cx and cy are center coordinates and w and h are width and height respectively.", + "type": "comment" + }, + "7216": { + "file_id": 520, + "content": " :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle)\n \"\"\"\n bboxes = bboxes.reshape(-1, 4)\n num_boxes = bboxes.shape[0]\n x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0\n y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0\n edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0])\n edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1])\n angles = np.zeros([num_boxes], dtype=bboxes.dtype)\n inds = edges1 < edges2\n rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1)\n rboxes[inds, 2] = edges2[inds]\n rboxes[inds, 3] = edges1[inds]\n rboxes[inds, 4] = np.pi / 2.0\n return rboxes\ndef delta2rbox(Rrois,\n deltas,\n means=[0, 0, 0, 0, 0],\n stds=[1, 1, 1, 1, 1],\n wh_ratio_clip=1e-6):\n \"\"\"\n :param Rrois: (cx, cy, w, h, theta)\n :param deltas: (dx, dy, dw, dh, dtheta)\n :param means:\n :param stds:\n :param wh_ratio_clip:\n :return:\n \"\"\"\n means = paddle.to_tensor(means)\n stds = paddle.to_tensor(stds)\n deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]])", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:269-304" + }, + "7217": { + "file_id": 520, + "content": "This function converts bounding box coordinates and dimensions to rotation-aligned bounding boxes by calculating the center, edge lengths, and angle. It returns a new tensor with reshaped and rotated bounding boxes.", + "type": "comment" + }, + "7218": { + "file_id": 520, + "content": " denorm_deltas = deltas * stds + means\n dx = denorm_deltas[:, 0]\n dy = denorm_deltas[:, 1]\n dw = denorm_deltas[:, 2]\n dh = denorm_deltas[:, 3]\n dangle = denorm_deltas[:, 4]\n max_ratio = np.abs(np.log(wh_ratio_clip))\n dw = paddle.clip(dw, min=-max_ratio, max=max_ratio)\n dh = paddle.clip(dh, min=-max_ratio, max=max_ratio)\n Rroi_x = Rrois[:, 0]\n Rroi_y = Rrois[:, 1]\n Rroi_w = Rrois[:, 2]\n Rroi_h = Rrois[:, 3]\n Rroi_angle = Rrois[:, 4]\n gx = dx * Rroi_w * paddle.cos(Rroi_angle) - dy * Rroi_h * paddle.sin(\n Rroi_angle) + Rroi_x\n gy = dx * Rroi_w * paddle.sin(Rroi_angle) + dy * Rroi_h * paddle.cos(\n Rroi_angle) + Rroi_y\n gw = Rroi_w * dw.exp()\n gh = Rroi_h * dh.exp()\n ga = np.pi * dangle + Rroi_angle\n ga = (ga + np.pi / 4) % np.pi - np.pi / 4\n ga = paddle.to_tensor(ga)\n gw = paddle.to_tensor(gw, dtype='float32')\n gh = paddle.to_tensor(gh, dtype='float32')\n bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1)\n return bboxes\ndef rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]):", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:305-339" + }, + "7219": { + "file_id": 520, + "content": "This code computes the bounding box regression results for each proposed bounding box. It calculates the deltas and applies clipping to ensure they stay within reasonable bounds, then transforms these deltas into actual bounding box coordinates. 
The resulting bounding boxes are stacked in a tensor and returned as output.", + "type": "comment" + }, + "7220": { + "file_id": 520, + "content": " \"\"\"\n Args:\n proposals:\n gt:\n means: 1x5\n stds: 1x5\n Returns:\n \"\"\"\n proposals = proposals.astype(np.float64)\n PI = np.pi\n gt_widths = gt[..., 2]\n gt_heights = gt[..., 3]\n gt_angle = gt[..., 4]\n proposals_widths = proposals[..., 2]\n proposals_heights = proposals[..., 3]\n proposals_angle = proposals[..., 4]\n coord = gt[..., 0:2] - proposals[..., 0:2]\n dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4])\n * coord[..., 1]) / proposals_widths\n dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4])\n * coord[..., 1]) / proposals_heights\n dw = np.log(gt_widths / proposals_widths)\n dh = np.log(gt_heights / proposals_heights)\n da = (gt_angle - proposals_angle)\n da = (da + PI / 4) % PI - PI / 4\n da /= PI\n deltas = np.stack([dx, dy, dw, dh, da], axis=-1)\n means = np.array(means, dtype=deltas.dtype)\n stds = np.array(stds, dtype=deltas.dtype)\n deltas = (deltas - means) / stds", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:340-378" + }, + "7221": { + "file_id": 520, + "content": "This code calculates the delta values between ground truth and proposals for bounding boxes, taking into account their widths, heights, angles, and applying normalization based on means and stds. It also ensures that angle differences are within 0 to 2π range before scaling by the inverse of PI.", + "type": "comment" + }, + "7222": { + "file_id": 520, + "content": " deltas = deltas.astype(np.float32)\n return deltas\ndef bbox_decode(bbox_preds,\n anchors,\n means=[0, 0, 0, 0, 0],\n stds=[1, 1, 1, 1, 1]):\n \"\"\"decode bbox from deltas\n Args:\n bbox_preds: [N,H,W,5]\n anchors: [H*W,5]\n return:\n bboxes: [N,H,W,5]\n \"\"\"\n means = paddle.to_tensor(means)\n stds = paddle.to_tensor(stds)\n num_imgs, H, W, _ = bbox_preds.shape\n bboxes_list = []\n for img_id in range(num_imgs):\n bbox_pred = bbox_preds[img_id]\n # bbox_pred.shape=[5,H,W]\n bbox_delta = bbox_pred\n anchors = paddle.to_tensor(anchors)\n bboxes = delta2rbox(\n anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6)\n bboxes = paddle.reshape(bboxes, [H, W, 5])\n bboxes_list.append(bboxes)\n return paddle.stack(bboxes_list, axis=0)\ndef poly_to_rbox(polys):\n \"\"\"\n poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n to\n rotated_boxes:[x_ctr,y_ctr,w,h,angle]\n \"\"\"\n rotated_boxes = []\n for poly in polys:", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:379-417" + }, + "7223": { + "file_id": 520, + "content": "Function `bbox_decode` takes bbox predictions, anchors and means/stds as inputs. It returns decoded bounding boxes. It first converts the means and stds to tensors. Then for each image, it computes the bbox delta from the bbox predictions. It then transforms these deltas to actual bounding box coordinates using `delta2rbox` function. Finally, it reshapes the obtained bounding boxes and stores them in a list. 
The function returns a stack of all the bounding boxes for each image.", + "type": "comment" + }, + "7224": { + "file_id": 520, + "content": " poly = np.array(poly[:8], dtype=np.float32)\n pt1 = (poly[0], poly[1])\n pt2 = (poly[2], poly[3])\n pt3 = (poly[4], poly[5])\n pt4 = (poly[6], poly[7])\n edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[\n 1]) * (pt1[1] - pt2[1]))\n edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[\n 1]) * (pt2[1] - pt3[1]))\n width = max(edge1, edge2)\n height = min(edge1, edge2)\n rbox_angle = 0\n if edge1 > edge2:\n rbox_angle = np.arctan2(\n np.float(pt2[1] - pt1[1]), np.float(pt2[0] - pt1[0]))\n elif edge2 >= edge1:\n rbox_angle = np.arctan2(\n np.float(pt4[1] - pt1[1]), np.float(pt4[0] - pt1[0]))\n def norm_angle(angle, range=[-np.pi / 4, np.pi]):\n return (angle - range[0]) % range[1] + range[0]\n rbox_angle = norm_angle(rbox_angle)\n x_ctr = np.float(pt1[0] + pt3[0]) / 2\n y_ctr = np.float(pt1[1] + pt3[1]) / 2", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:418-447" + }, + "7225": { + "file_id": 520, + "content": "This code calculates the width, height, and angle of a rotated bounding box based on its eight points. It first converts the polyline to a numpy array, then calculates the lengths of two edges. The function then determines the maximum edge length as the width and the minimum edge length as the height. Based on these values, it computes the rotation angle using arctan2. Finally, it normalizes the rotation angle within a specified range using the norm_angle function. It also calculates the x and y coordinates of the box's center.", + "type": "comment" + }, + "7226": { + "file_id": 520, + "content": " rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle])\n rotated_boxes.append(rotated_box)\n ret_rotated_boxes = np.array(rotated_boxes)\n assert ret_rotated_boxes.shape[1] == 5\n return ret_rotated_boxes\ndef cal_line_length(point1, point2):\n import math\n return math.sqrt(\n math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2))\ndef get_best_begin_point_single(coordinate):\n x1, y1, x2, y2, x3, y3, x4, y4 = coordinate\n xmin = min(x1, x2, x3, x4)\n ymin = min(y1, y2, y3, y4)\n xmax = max(x1, x2, x3, x4)\n ymax = max(y1, y2, y3, y4)\n combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]],\n [[x4, y4], [x1, y1], [x2, y2], [x3, y3]],\n [[x3, y3], [x4, y4], [x1, y1], [x2, y2]],\n [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]]\n dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]\n force = 100000000.0\n force_flag = 0\n for i in range(4):\n temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \\", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:448-475" + }, + "7227": { + "file_id": 520, + "content": "This function `get_best_begin_point_single` takes a coordinate as input and calculates the minimum x and y coordinates (xmin, ymin) and maximum x and y coordinates (xmax, ymax). It then defines four different combinations of the four points in the coordinate and compares these combinations to the destination coordinate (dst_coordinate) by calculating the distance using `cal_line_length` function. The combination with the smallest distance is returned as the best begin point. 
The code also includes a force variable to handle potential edge cases where no valid begin point can be found.", + "type": "comment" + }, + "7228": { + "file_id": 520, + "content": " + cal_line_length(combinate[i][1], dst_coordinate[1]) \\\n + cal_line_length(combinate[i][2], dst_coordinate[2]) \\\n + cal_line_length(combinate[i][3], dst_coordinate[3])\n if temp_force < force:\n force = temp_force\n force_flag = i\n if force_flag != 0:\n pass\n return np.array(combinate[force_flag]).reshape(8)\ndef rbox2poly_single(rrect):\n \"\"\"\n rrect:[x_ctr,y_ctr,w,h,angle]\n to\n poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n \"\"\"\n x_ctr, y_ctr, width, height, angle = rrect[:5]\n tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2\n # rect 2x4\n rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])\n R = np.array([[np.cos(angle), -np.sin(angle)],\n [np.sin(angle), np.cos(angle)]])\n # poly\n poly = R.dot(rect)\n x0, x1, x2, x3 = poly[0, :4] + x_ctr\n y0, y1, y2, y3 = poly[1, :4] + y_ctr\n poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:476-503" + }, + "7229": { + "file_id": 520, + "content": "This function `rbox2poly_single` takes a rectangle represented by its center coordinates, width, height, and angle, and converts it to a polygon representation. It first calculates the top-left and bottom-right coordinates of the rectangle. Then, it creates a 2x4 matrix representing the four corners of the rectangle. The function applies a rotation matrix to transform the rectangle into a rotated coordinate system. Finally, it shifts the transformed coordinates by the center coordinates and returns the polygon representation as an array.", + "type": "comment" + }, + "7230": { + "file_id": 520, + "content": " poly = get_best_begin_point_single(poly)\n return poly\ndef rbox2poly(rrects):\n \"\"\"\n rrect:[x_ctr,y_ctr,w,h,angle]\n to\n poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n \"\"\"\n polys = []\n for rrect in rrects:\n x_ctr, y_ctr, width, height, angle = rrect[:5]\n tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2\n rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])\n R = np.array([[np.cos(angle), -np.sin(angle)],\n [np.sin(angle), np.cos(angle)]])\n poly = R.dot(rect)\n x0, x1, x2, x3 = poly[0, :4] + x_ctr\n y0, y1, y2, y3 = poly[1, :4] + y_ctr\n poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)\n poly = get_best_begin_point_single(poly)\n polys.append(poly)\n polys = np.array(polys)\n return polys", + "type": "code", + "location": "/paddlevideo/modeling/bbox_utils.py:504-528" + }, + "7231": { + "file_id": 520, + "content": "This function `rbox2poly` converts a list of rotation rectangles (rrects) into polygons (polys). It first calculates the top-left and bottom-right coordinates of each rrect. Then, it rotates the rectangle using the given angle. The function adjusts the poly points by adding the x_ctr and y_ctr values to obtain the final poly. It applies a single best begin point adjustment (`get_best_begin_point_single`) and adds the poly to the list of polys. 
Finally, it returns the array of polygons.", + "type": "comment" + }, + "7232": { + "file_id": 521, + "content": "/paddlevideo/modeling/builder.py", + "type": "filepath" + }, + "7233": { + "file_id": 521, + "content": "This code imports modules and registers functions for building a video object detection model, as well as dynamically constructing components based on a configuration file.", + "type": "summary" + }, + "7234": { + "file_id": 521, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT, SEGMENTERS\nfrom ..utils import build\nfrom .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS,\n DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES,\n MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)", + "type": "code", + "location": "/paddlevideo/modeling/builder.py:1-19" + }, + "7235": { + "file_id": 521, + "content": "Imports necessary modules and registers various components for video object detection model building.", + "type": "comment" + }, + "7236": { + "file_id": 521, + "content": "def build_backbone(cfg):\n \"\"\"Build backbone.\"\"\"\n return build(cfg, BACKBONES)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_head(cfg):\n \"\"\"Build head.\"\"\"\n return build(cfg, HEADS)\ndef build_loss(cfg):\n \"\"\"Build loss.\"\"\"\n return build(cfg, LOSSES)\ndef build_recognizer(cfg):\n \"\"\"Build recognizer.\"\"\"\n return build(cfg, RECOGNIZERS, key='framework')\ndef build_segmenter(cfg):\n \"\"\"Build segmenter.\"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/builder.py:22-73" + }, + "7237": { + "file_id": 521, + "content": "This code defines various building functions for different parts of a model. The \"build_backbone\" function builds the backbone of the model, while \"build_roi_extractor\", \"build_assigner\", and \"build_sampler\" build the region of interest extractor, box assigner, and box sampler respectively. \"build_head\" builds the head of the model, \"build_loss\" builds the loss function, and \"build_recognizer\" and \"build_segmenter\" build recognizers and segmenters with different frameworks or keys. 
The functions delegate the actual construction to the shared 'build' helper imported from '..utils', passing the matching registry and, where needed, the 'framework' key.", + "type": "comment" + }, + "7238": { + "file_id": 521, + "content": " return build(cfg, SEGMENTERS, key='framework')\ndef build_localizer(cfg):\n \"\"\"Build localizer.\"\"\"\n return build(cfg, LOCALIZERS, key='framework')\ndef build_detector(cfg, train_cfg=None, test_cfg=None):\n \"\"\"Build detector.\"\"\"\n return build(cfg, DETECTORS, key='framework')\ndef build_partitioner(cfg):\n \"\"\"Build partitioner.\"\"\"\n return build(cfg, PARTITIONERS, key='framework')\ndef build_estimator(cfg):\n \"\"\"Build estimator.\"\"\"\n return build(cfg, ESTIMATORS, key='framework')\ndef build_multimodal(cfg):\n \"\"\"Build multimodal.\"\"\"\n return build(cfg, MULTIMODAL, key='framework')\ndef build_segment(cfg):\n \"\"\"Build segment.\"\"\"\n return build(cfg, SEGMENT, key='framework')\ndef build_model(cfg):\n cfg_copy = cfg.copy()\n framework_type = cfg_copy.get('framework')\n if framework_type in RECOGNIZERS:\n return build_recognizer(cfg)\n elif framework_type in LOCALIZERS:\n return build_localizer(cfg)\n elif framework_type in PARTITIONERS:\n return build_partitioner(cfg)\n elif framework_type in DETECTORS:", + "type": "code", + "location": "/paddlevideo/modeling/builder.py:74-116" + }, + "7239": { + "file_id": 521, + "content": "The code is a builder function that dynamically builds various components (recognizers, localizers, detectors, partitioners, estimators, and multimodal) based on the specified framework type in the configuration file. It utilizes the 'build' function to return the appropriate component for further processing.", + "type": "comment" + }, + "7240": { + "file_id": 521, + "content": " return build_detector(cfg)\n elif framework_type in ESTIMATORS:\n return build_estimator(cfg)\n elif framework_type in MULTIMODAL:\n return build_multimodal(cfg)\n elif framework_type in SEGMENTERS:\n return build_segmenter(cfg)\n elif framework_type in SEGMENT:\n return build_segment(cfg)\n else:\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/builder.py:117-127" + }, + "7241": { + "file_id": 521, + "content": "This code selects a specific function to build based on the framework type. It checks if the framework type is in defined lists of detectors, estimators, multimodal models, segmenters, or segments. If none match, it raises NotImplementedError.", + "type": "comment" + }, + "7242": { + "file_id": 522, + "content": "/paddlevideo/modeling/framework/__init__.py", + "type": "filepath" + }, + "7243": { + "file_id": 522, + "content": "This code is part of the PaddleVideo framework, which provides base classes for various model classes like 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator', 'DepthEstimator', and more. The list contains names of these available base classes within the module.", + "type": "summary" + }, + "7244": { + "file_id": 522, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .estimators import BaseEstimator, DepthEstimator\nfrom .localizers import BaseLocalizer, BMNLocalizer\nfrom .partitioners import BasePartitioner, TransNetV2Partitioner\nfrom .recognizers import BaseRecognizer, Recognizer2D\nfrom .multimodal import ActBert, BaseMultimodal\nfrom .segment import BaseSegment, CFBI\nfrom .segmenters import MSTCN\n__all__ = [\n 'BaseRecognizer', 'Recognizer2D', 'BaseLocalizer', 'BMNLocalizer',", + "type": "code", + "location": "/paddlevideo/modeling/framework/__init__.py:1-24" + }, + "7245": { + "file_id": 522, + "content": "This code is part of the PaddleVideo framework, which provides several base classes for estimators, localizers, partitioners, recognizers, multimodal models, segments, and segmenters. The commented lines describe the license information and the imported classes from different modules within the framework. The '__all__' list contains the names of the base classes available in this module.", + "type": "comment" + }, + "7246": { + "file_id": 522, + "content": " 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator',\n 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI',\n 'MSTCN'\n]", + "type": "code", + "location": "/paddlevideo/modeling/framework/__init__.py:25-28" + }, + "7247": { + "file_id": 522, + "content": "This code snippet contains a list of various model classes, including 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator', 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI', and 'MSTCN'. These models are likely used for different tasks within the PaddleVideo framework.", + "type": "comment" + }, + "7248": { + "file_id": 523, + "content": "/paddlevideo/modeling/framework/detectors/__init__.py", + "type": "filepath" + }, + "7249": { + "file_id": 523, + "content": "This code is part of the PaddleVideo library, specifically defining detectors. It imports three detector classes (BaseDetector, FastRCNN, and TwoStageDetector) from its local directory and lists them in __all__. The comment at the beginning establishes copyright information and licensing.", + "type": "summary" + }, + "7250": { + "file_id": 523, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseDetector\nfrom .fast_rcnn import FastRCNN\nfrom .two_stage import TwoStageDetector\n__all__ = ['BaseDetector', 'TwoStageDetector', 'FastRCNN']", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/__init__.py:1-17" + }, + "7251": { + "file_id": 523, + "content": "This code is part of the PaddleVideo library, specifically defining detectors. It imports three detector classes (BaseDetector, FastRCNN, and TwoStageDetector) from its local directory and lists them in __all__. The comment at the beginning establishes copyright information and licensing.", + "type": "comment" + }, + "7252": { + "file_id": 524, + "content": "/paddlevideo/modeling/framework/detectors/base.py", + "type": "filepath" + }, + "7253": { + "file_id": 524, + "content": "BaseDetector class serves as a parent for detectors, providing common features and abstract train_step method implementation. Abstract base classes are defined for training, validating, and testing steps in machine learning models.", + "type": "summary" + }, + "7254": { + "file_id": 524, + "content": "from abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nfrom ...registry import DETECTORS\n@DETECTORS.register()\nclass BaseDetector(nn.Layer):\n \"\"\"Base class for detectors. \"\"\"\n def __init__(self, backbone=None, head=None):\n super().__init__()\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n self.backbone.init_weights() \n self.head.init_weights()\n def extract_feature(self, imgs, iter_num):\n \"\"\"Extract features through a backbone. \"\"\"\n feature = self.backbone(imgs)\n return feature\n def forward(self, data_batch, mode='infer'):\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/base.py:1-36" + }, + "7255": { + "file_id": 524, + "content": "BaseDetector class is the parent class for detectors, providing common functionality like extracting features and initializing weights. It defines an abstract train_step method that needs to be implemented by subclasses for training. 
The class also contains methods for feature extraction, model forward pass, and handling different modes (train, valid, test, infer).", + "type": "comment" + }, + "7256": { + "file_id": 524, + "content": " \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/base.py:37-51" + }, + "7257": { + "file_id": 524, + "content": "This code defines abstract base classes for training, validating, and testing steps in a machine learning model. These methods must be implemented by subclasses to perform the specific tasks accordingly.", + "type": "comment" + }, + "7258": { + "file_id": 525, + "content": "/paddlevideo/modeling/framework/detectors/fast_rcnn.py", + "type": "filepath" + }, + "7259": { + "file_id": 525, + "content": "FastRCNN is a two-stage object detection class inheriting from TwoStageDetector, created with specified head, train and test configurations, and optional pretrained weights.", + "type": "summary" + }, + "7260": { + "file_id": 525, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .two_stage import TwoStageDetector\nfrom ...registry import DETECTORS\n@DETECTORS.register()\nclass FastRCNN(TwoStageDetector):\n def __init__(self,\n backbone,\n head=None,\n train_cfg=None,\n test_cfg=None,\n neck=None,\n pretrained=None):\n super(FastRCNN, self).__init__(\n backbone=backbone,\n neck=neck,", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/fast_rcnn.py:1-30" + }, + "7261": { + "file_id": 525, + "content": "Defines the FastRCNN class, a two-stage detector that inherits from TwoStageDetector. It takes backbone, head, train_cfg, test_cfg, neck, and pretrained as parameters for object detection.", + "type": "comment" + }, + "7262": { + "file_id": 525, + "content": " roi_head=head,\n train_cfg=train_cfg,\n test_cfg=test_cfg,\n pretrained=pretrained)", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/fast_rcnn.py:31-34" + }, + "7263": { + "file_id": 525, + "content": "Creates a Fast R-CNN detector with specified head, train and test configurations, and optionally pretrained weights.", + "type": "comment" + }, + "7264": { + "file_id": 526, + "content": "/paddlevideo/modeling/framework/detectors/two_stage.py", + "type": "filepath" + }, + "7265": { + "file_id": 526, + "content": "This code uses the slowfast model for object detection, initializes components and parameters, supports pretrained weights, and provides methods for training, testing, and inference. 
It also retrieves data from PaddleVideo's two-stage detector with various inputs and entity ID selection using index_select.", + "type": "summary" + }, + "7266": { + "file_id": 526, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom ... import builder\nimport paddle.distributed as dist\nfrom ...registry import DETECTORS\nfrom .base import BaseDetector\n@DETECTORS.register()\nclass TwoStageDetector(BaseDetector):\n \"\"\"Base class for two-stage detectors. \"\"\"\n def __init__(self,\n backbone,\n neck=None,\n rpn_head=None,\n roi_head=None,\n train_cfg=None,", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/two_stage.py:1-32" + }, + "7267": { + "file_id": 526, + "content": "This code is importing necessary libraries, registering a two-stage detector class (TwoStageDetector) within the DETECTORS registry, and initializing its components. The class serves as a base for implementing two-stage object detection algorithms.", + "type": "comment" + }, + "7268": { + "file_id": 526, + "content": " test_cfg=None,\n pretrained=None):\n super(TwoStageDetector, self).__init__()\n self.backbone = builder.build_backbone(backbone)\n if neck is not None:\n self.neck = neck # useless\n if rpn_head is not None:\n rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None\n rpn_head_ = rpn_head.copy()\n rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)\n self.rpn_head = builder.build_head(rpn_head_)\n if roi_head is not None:\n self.roi_head = builder.build_head(roi_head)\n self.train_cfg = train_cfg\n self.test_cfg = test_cfg\n if pretrained is not None:\n self.init_weights(pretrained=pretrained)\n @property\n def with_rpn(self):\n \"\"\"whether the detector has RPN\"\"\"\n return hasattr(self, 'rpn_head') and self.rpn_head is not None\n @property\n def with_roi_head(self):\n \"\"\"whether the detector has a RoI head\"\"\"\n return hasattr(self, 'roi_head') and self.roi_head is not None", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/two_stage.py:33-64" + }, + "7269": { + "file_id": 526, + "content": "This code defines a class for a two-stage object detection model. It initializes the backbone, neck (if provided), and heads for RPN and ROI. The constructor also takes optional train_cfg and test_cfg parameters for each head. Additional pretrained weights can be loaded later if provided. The @property methods check whether the detector has RPN or ROI head.", + "type": "comment" + }, + "7270": { + "file_id": 526, + "content": " def init_weights(self, pretrained=None):\n \"\"\"Initialize the weights in detector. 
\"\"\"\n super(TwoStageDetector, self).init_weights(pretrained)\n self.backbone.init_weights(pretrained=pretrained)\n if self.with_rpn:\n self.rpn_head.init_weights()\n if self.with_roi_head:\n self.roi_head.init_weights(pretrained)\n def extract_feat(self, img):\n \"\"\"Directly extract features from the backbone.\"\"\"\n x = self.backbone(img)\n return x\n def train_step(self, data, **kwargs):\n img_slow = data[0]\n img_fast = data[1]\n proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas(\n data)\n img_shape = data[7]\n img_idx = data[8]\n img_metas = scores, entity_ids\n x = self.extract_feat(img=[img_slow, img_fast])\n roi_losses = self.roi_head.train_step(x, img_metas, proposals,\n gt_bboxes, gt_labels, **kwargs)\n losses = dict()", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/two_stage.py:66-91" + }, + "7271": { + "file_id": 526, + "content": "The code initializes the weights of a two-stage detector and extracts features from its backbone. The train_step function takes input data, extracts features using the extract_feat method, and computes roi_losses using the roi_head's train_step method. These losses are then stored in the losses dictionary.", + "type": "comment" + }, + "7272": { + "file_id": 526, + "content": " losses.update(roi_losses)\n return losses\n def val_step(self, data, rescale=False):\n img_slow = data[0]\n img_fast = data[1]\n proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas(\n data)\n img_shape = data[7]\n img_metas = scores, entity_ids\n x = self.extract_feat(img=[img_slow, img_fast])\n return self.roi_head.simple_test(x,\n proposals[0],\n img_shape,\n rescale=rescale)\n def test_step(self, data, rescale=False):\n return self.val_step(data, rescale)\n def infer_step(self, data, rescale=False):\n ''' model inference'''\n img_slow = data[0]\n img_fast = data[1]\n proposals = data[2]\n img_shape = data[3]\n # using slowfast model to extract spatio-temporal features\n x = self.extract_feat(img=[img_slow, img_fast])\n ret = self.roi_head.simple_test(x,", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/two_stage.py:92-124" + }, + "7273": { + "file_id": 526, + "content": "This code defines three methods, val_step, test_step, and infer_step. All these methods extract features using the slowfast model and then pass them to roi_head for further processing. Val_step is used for validation while test_step is used for testing. 
Infer_step performs inference using previously obtained data.", + "type": "comment" + }, + "7274": { + "file_id": 526, + "content": " proposals[0],\n img_shape,\n rescale=rescale)\n return ret\n def get_unpad_datas(self, data):\n ''' get original datas padded in dataset '''\n pad_proposals = data[2]\n pad_gt_bboxes = data[3]\n pad_gt_labels = data[4]\n pad_scores, pad_entity_ids = data[5], data[6]\n len_proposals = data[9]\n len_gt_bboxes = data[10]\n len_gt_labels = data[11]\n len_scores = data[12]\n len_entity_ids = data[13]\n N = pad_proposals.shape[0]\n proposals = []\n gt_bboxes = []\n gt_labels = []\n scores = []\n entity_ids = []\n for bi in range(N):\n pad_proposal = pad_proposals[bi]\n len_proposal = len_proposals[bi]\n index_proposal = paddle.arange(len_proposal)\n proposal = paddle.index_select(x=pad_proposal,\n index=index_proposal,", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/two_stage.py:125-152" + }, + "7275": { + "file_id": 526, + "content": "This code snippet is part of the PaddleVideo library's two-stage detector implementation. It defines a function that retrieves original data from padded dataset, and another function for getting unpadded datas. The first function takes in a set of proposals, ground truth bboxes, labels, scores, and entity ids, and returns them as unpadded data based on the number of proposals at each index. The second function retrieves original datas padded in dataset for two-stage detector implementation.", + "type": "comment" + }, + "7276": { + "file_id": 526, + "content": " axis=0)\n proposals.append(proposal)\n pad_gt_bbox = pad_gt_bboxes[bi]\n len_gt_bbox = len_gt_bboxes[bi]\n index_gt_bbox = paddle.arange(len_gt_bbox)\n gt_bbox = paddle.index_select(x=pad_gt_bbox,\n index=index_gt_bbox,\n axis=0)\n gt_bboxes.append(gt_bbox)\n pad_gt_label = pad_gt_labels[bi]\n len_gt_label = len_gt_labels[bi]\n index_gt_label = paddle.arange(len_gt_label)\n gt_label = paddle.index_select(x=pad_gt_label,\n index=index_gt_label,\n axis=0)\n gt_labels.append(gt_label)\n pad_score = pad_scores[bi]\n len_score = len_scores[bi]\n index_score = paddle.arange(len_score)\n score = paddle.index_select(x=pad_score, index=index_score, axis=0)\n scores.append(score)", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/two_stage.py:153-176" + }, + "7277": { + "file_id": 526, + "content": "This code creates a list of proposals, and corresponding ground truth bounding boxes (gt_bboxes), labels (gt_labels), and scores. It handles batches by iterating over each batch index (bi) and for each batch, it performs index selection on the padded data based on the indices of the current length of the batch to extract the relevant gt_bbox, gt_label, and score information. These are then appended to their respective lists.", + "type": "comment" + }, + "7278": { + "file_id": 526, + "content": " pad_entity_id = pad_entity_ids[bi]\n len_entity_id = len_entity_ids[bi]\n index_entity_id = paddle.arange(len_entity_id)\n entity_id = paddle.index_select(x=pad_entity_id,\n index=index_entity_id,\n axis=0)\n entity_ids.append(entity_id)\n return proposals, gt_bboxes, gt_labels, scores, entity_ids", + "type": "code", + "location": "/paddlevideo/modeling/framework/detectors/two_stage.py:178-186" + }, + "7279": { + "file_id": 526, + "content": "This code segment is selecting specific entity IDs from a list and appending them to the 'entity_ids' list. 
It uses Paddle's index_select function to achieve this.", + "type": "comment" + }, + "7280": { + "file_id": 527, + "content": "/paddlevideo/modeling/framework/estimators/__init__.py", + "type": "filepath" + }, + "7281": { + "file_id": 527, + "content": "This code imports necessary classes and defines the publically accessible '__all__' list, containing the DepthEstimator and BaseEstimator classes.", + "type": "summary" + }, + "7282": { + "file_id": 527, + "content": "from .base import BaseEstimator\nfrom .depth_estimator import DepthEstimator\n__all__ = ['DepthEstimator', 'BaseEstimator']", + "type": "code", + "location": "/paddlevideo/modeling/framework/estimators/__init__.py:1-4" + }, + "7283": { + "file_id": 527, + "content": "This code imports necessary classes and defines the publically accessible '__all__' list, containing the DepthEstimator and BaseEstimator classes.", + "type": "comment" + }, + "7284": { + "file_id": 528, + "content": "/paddlevideo/modeling/framework/estimators/base.py", + "type": "filepath" + }, + "7285": { + "file_id": 528, + "content": "The code creates a PaddleVideo BaseEstimator class, inheriting from nn.Layer and utilizing builder for backbone construction. It initializes weights, registers the class, and sets forward modes for validation, testing, and inference, with abstract methods that must be implemented by subclasses.", + "type": "summary" + }, + "7286": { + "file_id": 528, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nfrom paddlevideo.modeling.registry import ESTIMATORS\nfrom paddlevideo.utils import get_logger\nfrom ... import builder\nlogger = get_logger(\"paddlevideo\")\n@ESTIMATORS.register()\nclass BaseEstimator(nn.Layer):\n \"\"\"BaseEstimator\n \"\"\"\n def __init__(self, backbone=None, head=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):", + "type": "code", + "location": "/paddlevideo/modeling/framework/estimators/base.py:1-34" + }, + "7287": { + "file_id": 528, + "content": "This code is defining a base class for an estimator in PaddleVideo. It inherits from nn.Layer, uses builder to construct the backbone if specified, and initializes the weights of the backbone if it has an init_weights method. The ESTIMATORS registry is used to register this BaseEstimator class.", + "type": "comment" + }, + "7288": { + "file_id": 528, + "content": " self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. 
Console of train, valid, test or infer step\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/framework/estimators/base.py:35-66" + }, + "7289": { + "file_id": 528, + "content": "The code initializes the backbone and head components of a model depending on their availability. It then defines four forward modes (train, valid, test, infer) to execute the model accordingly. The train_step abstract method must be implemented separately.", + "type": "comment" + }, + "7290": { + "file_id": 528, + "content": " raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch):\n \"\"\"Define how the model is going to valid, from input to output.\"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/estimators/base.py:67-82" + }, + "7291": { + "file_id": 528, + "content": "This code defines abstract methods for model validation, testing, and inference steps. It raises a NotImplementedError to ensure subclasses must implement these methods.", + "type": "comment" + }, + "7292": { + "file_id": 529, + "content": "/paddlevideo/modeling/framework/estimators/depth_estimator.py", + "type": "filepath" + }, + "7293": { + "file_id": 529, + "content": "The DepthEstimator class inherits from BaseEstimator and contains a forward_net method for feature extraction. It has training, validation, testing, and inference methods with loss metrics calculated using the forward_net and head.loss.", + "type": "summary" + }, + "7294": { + "file_id": 529, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nfrom paddlevideo.modeling.framework.estimators.base import BaseEstimator\nfrom paddlevideo.modeling.registry import ESTIMATORS\nfrom paddlevideo.utils import get_logger\nfrom ... import builder\nlogger = get_logger(\"paddlevideo\")\n@ESTIMATORS.register()\nclass DepthEstimator(BaseEstimator):\n \"\"\"DepthEstimator\n \"\"\"\n def forward_net(self, inputs, day_or_night='day_and_night'):\n if self.backbone is not None:\n outputs = self.backbone(inputs, day_or_night)\n else:\n outputs = inputs", + "type": "code", + "location": "/paddlevideo/modeling/framework/estimators/depth_estimator.py:1-31" + }, + "7295": { + "file_id": 529, + "content": "The code defines a DepthEstimator class that inherits from BaseEstimator. 
It has a forward_net method that takes inputs and optionally applies a backbone network for feature extraction. The results are stored in outputs.", + "type": "comment" + }, + "7296": { + "file_id": 529, + "content": " return outputs\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n inputs, _ = data_batch\n outputs = self.forward_net(inputs, day_or_night='day_and_night')\n loss_metrics = self.head.loss(inputs, outputs)\n return loss_metrics\n def val_step(self, data_batch):\n inputs, day_or_night = data_batch\n outputs = self.forward_net(inputs, day_or_night=day_or_night)\n loss_metrics = self.head.loss(inputs, outputs)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n inputs, day_or_night = data_batch\n outputs = self.forward_net(inputs, day_or_night=day_or_night)\n loss_metrics = self.head.loss(inputs, outputs)\n return loss_metrics\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n inputs = data_batch[0]\n outputs = self.forward_net(inputs, day_or_night='day')", + "type": "code", + "location": "/paddlevideo/modeling/framework/estimators/depth_estimator.py:32-58" + }, + "7297": { + "file_id": 529, + "content": "The code defines four methods: train_step, val_step, test_step, and infer_step. The main purpose of each step is to calculate the loss metrics from the input data. The 'forward_net' method is used in all steps to process the inputs and generate outputs, which are then passed to the 'head.loss' method to compute the loss metrics for each step.", + "type": "comment" + }, + "7298": { + "file_id": 529, + "content": " return outputs", + "type": "code", + "location": "/paddlevideo/modeling/framework/estimators/depth_estimator.py:59-59" + }, + "7299": { + "file_id": 529, + "content": "This code snippet returns the output results from a depth estimator model.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/73.json b/docs/data/73.json new file mode 100644 index 000000000..63b7220c9 --- /dev/null +++ b/docs/data/73.json @@ -0,0 +1,541 @@ +{ + "7300": { + "file_id": 530, + "content": "/paddlevideo/modeling/framework/localizers/__init__.py", + "type": "filepath" + }, + "7301": { + "file_id": 530, + "content": "This code is a part of the PaddleVideo library and contains localizers, which are responsible for handling different types of video localization tasks. The base class \"BaseLocalizer\" serves as a parent class, while specific classes like \"BMNLocalizer\" and \"YOWOLocalizer\" extend it to handle various localization techniques. These localizers may be used in a wide range of applications depending on the required localization approach.", + "type": "summary" + }, + "7302": { + "file_id": 530, + "content": "# copyright (c) 2020 paddlepaddle authors. 
all rights reserved.\n#\n# licensed under the apache license, version 2.0 (the \"license\"\n# you may not use this file except in compliance with the license.\n# you may obtain a copy of the license at\n#\n# http://www.apache.org/licenses/license-2.0\n#\n# unless required by applicable law or agreed to in writing, software\n# distributed under the license is distributed on an \"as is\" basis,\n# without warranties or conditions of any kind, either express or implied.\n# see the license for the specific language governing permissions and\n# limitations under the license.\nfrom .base import BaseLocalizer\nfrom .bmn_localizer import BMNLocalizer\nfrom .yowo_localizer import YOWOLocalizer\n__all__ = ['BaseLocalizer', 'BMNLocalizer', 'YOWOLocalizer']", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/__init__.py:1-19" + }, + "7303": { + "file_id": 530, + "content": "This code is a part of the PaddleVideo library and contains localizers, which are responsible for handling different types of video localization tasks. The base class \"BaseLocalizer\" serves as a parent class, while specific classes like \"BMNLocalizer\" and \"YOWOLocalizer\" extend it to handle various localization techniques. These localizers may be used in a wide range of applications depending on the required localization approach.", + "type": "comment" + }, + "7304": { + "file_id": 531, + "content": "/paddlevideo/modeling/framework/localizers/base.py", + "type": "filepath" + }, + "7305": { + "file_id": 531, + "content": "This code defines a base class for localization models using PaddlePaddle framework, with train, valid, and test steps implemented in subclasses. It supports different operation modes and allows weight initialization.", + "type": "summary" + }, + "7306": { + "file_id": 531, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom abc import abstractmethod\nimport paddle.nn as nn\nfrom ... import builder\nclass BaseLocalizer(nn.Layer):\n \"\"\"Base class for Localization.\n All localizer should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, define your train step.\n - Methods:``valid_step``, define your valid step, always the same as train_step.\n - Methods:``test_step``, define your test step.\n \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/base.py:1-27" + }, + "7307": { + "file_id": 531, + "content": "This code snippet defines a base class for localization models. All subclasses of this base class should implement train_step, valid_step, and test_step methods to define their respective steps in the model's training process. 
It uses PaddlePaddle's framework and is licensed under the Apache License, Version 2.0.", + "type": "comment" + }, + "7308": { + "file_id": 531, + "content": " def __init__(self, backbone, loss):\n super().__init__()\n self.backbone = builder.build_backbone(backbone)\n self.loss = builder.build_loss(loss)\n self.init_weights()\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n if getattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n pass\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/base.py:28-56" + }, + "7309": { + "file_id": 531, + "content": "This code initializes a localizer model, handling backbone and loss functions, and allows for different operation modes (train, valid, test, infer). It also includes a function to initialize the model's network weights.", + "type": "comment" + }, + "7310": { + "file_id": 531, + "content": " @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step. input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating setp. input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Tets setp. to get acc in test data. input_data_batch -> output\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/base.py:58-74" + }, + "7311": { + "file_id": 531, + "content": "This code defines abstract classes for training, validation, and testing steps in a model. The train_step, val_step, and test_step methods require implementation by subclasses to perform the necessary computations.", + "type": "comment" + }, + "7312": { + "file_id": 532, + "content": "/paddlevideo/modeling/framework/localizers/bmn_localizer.py", + "type": "filepath" + }, + "7313": { + "file_id": 532, + "content": "This code defines a localizer model for PaddleVideo with forward network and methods for training, validating, testing, and inferring. It uses input data to predict bounding boxes, start position, and end position while calculating loss using ground truth values.", + "type": "summary" + }, + "7314": { + "file_id": 532, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import LOCALIZERS\nfrom .base import BaseLocalizer\nimport paddle\n@LOCALIZERS.register()\nclass BMNLocalizer(BaseLocalizer):\n \"\"\"BMN Localization framework\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Call backbone forward.\n \"\"\"\n preds = self.backbone(imgs)\n return preds\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n x_data = data_batch[0]\n gt_iou_map = data_batch[1]\n gt_start = data_batch[2]\n gt_end = data_batch[3]\n gt_iou_map.stop_gradient = True", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/bmn_localizer.py:1-36" + }, + "7315": { + "file_id": 532, + "content": "This code is part of the PaddleVideo library and defines a BMNLocalizer class, which is a localization framework. It includes a forward_net method for calling the backbone's forward function and a train_step method for handling training steps with input data. The gt_iou_map, gt_start, and gt_end are provided as part of the data batch to be used in the training step.", + "type": "comment" + }, + "7316": { + "file_id": 532, + "content": " gt_start.stop_gradient = True\n gt_end.stop_gradient = True\n # call Model forward\n pred_bm, pred_start, pred_end = self.forward_net(x_data)\n # call Loss forward\n loss = self.loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start,\n gt_end)\n avg_loss = paddle.mean(loss)\n loss_metrics = dict()\n loss_metrics['loss'] = avg_loss\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n x_data = data_batch[0]\n pred_bm, pred_start, pred_end = self.forward_net(x_data)\n return pred_bm, pred_start, pred_end\n def infer_step(self, data_batch):\n \"\"\"Infer step\n \"\"\"\n x_data = data_batch[0]\n # call Model forward\n pred_bm, pred_start, pred_end = self.forward_net(x_data)\n return pred_bm, pred_start, pred_end", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/bmn_localizer.py:37-69" + }, + "7317": { + "file_id": 532, + "content": "This code defines a localizer model for PaddleVideo. It includes functions for training, validating, testing, and inferring steps. The localizer has a forward network which takes input data and returns predictions for bounding boxes (pred_bm), start position (pred_start), and end position (pred_end). Loss is calculated using the provided ground truth values (gt_iou_map, gt_start, gt_end) and averaged over the batch.", + "type": "comment" + }, + "7318": { + "file_id": 533, + "content": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py", + "type": "filepath" + }, + "7319": { + "file_id": 533, + "content": "The YOWOLocalizer extends BaseLocalizer, performs NMS on detected boxes, matches ground truth with predicted boxes based on IoU threshold, and calculates precision, recall, and F-score for YOWO localizer using test step function.", + "type": "summary" + }, + "7320": { + "file_id": 533, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import LOCALIZERS\nfrom .base import BaseLocalizer\nfrom .yowo_utils import truths_length, nms, get_region_boxes, bbox_iou\n@LOCALIZERS.register()\nclass YOWOLocalizer(BaseLocalizer):\n \"\"\"YOWO Localization framework\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Call backbone forward.\n \"\"\"\n # imgs.shape=[N,C,T,H,W], for YOWO\n preds = self.backbone(imgs)\n return preds\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n x_data = data_batch[0]", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:1-33" + }, + "7321": { + "file_id": 533, + "content": "Code from PaddleVideo's yowo_localizer.py file defines a YOWOLocalizer class which extends BaseLocalizer and utilizes the backbone function for forwarding image data. It also includes methods forward_net and train_step for processing images in training context.", + "type": "comment" + }, + "7322": { + "file_id": 533, + "content": " target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor\n target.stop_gradient = True\n # call Model forward\n out = self.forward_net(x_data)\n # call Loss forward\n loss, nCorrect = self.loss(out, target)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['nCorrect'] = nCorrect\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n total = 0.0\n proposals = 0.0\n correct = 0.0\n fscore = 0.0\n eps = 1e-5\n nms_thresh = 0.4\n iou_thresh = 0.5\n x_data = data_batch[0]\n target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor\n frame_idx = data_batch[2]\n target.stop_gradient = True\n # call Model forward\n out = self.forward_net(x_data)\n all_boxes = get_region_boxes(out)\n out_boxes = []\n for i in range(out.shape[0]):\n boxes = all_boxes[i]", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:34-67" + }, + "7323": { + "file_id": 533, + "content": "Function \"forward_net\" is called to perform model's forward pass, and the output of this function call is stored in 'out'. The code then calls another function \"loss\" passing the output from the model (out) and the target data (target). The output from the loss function call is stored in 'loss', along with number of correct predictions ('nCorrect'). A dictionary named 'loss_metrics' is created storing 'loss' and 'nCorrect'. This process is part of training set step. \nThe 'val_step' function performs validation steps like calculating total, proposals, correct and fscore variables using specific values (total = 0.0, proposals = 0.0, correct = 0.0, fscore = 0.0). It also uses an epsilon value of 1e-5 and a nms_thresh and iou_thresh of 0.4 for certain calculations. It calls the model's forward pass (forward_net) to get 'out', then gets all region boxes using get_region_boxes function, then iterates over each box in out, storing them in a list named 'out_boxes'. 
This process is part of the validation step.", + "type": "comment" + }, + "7324": { + "file_id": 533, + "content": " boxes = nms(boxes, nms_thresh)\n out_boxes.append(boxes)\n truths = target[i].reshape([-1, 5])\n num_gts = truths_length(truths)\n total = total + num_gts\n pred_list = []\n for i in range(len(boxes)):\n if boxes[i][4] > 0.25:\n proposals = proposals + 1\n pred_list.append(i)\n for i in range(num_gts):\n box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]]\n best_iou = 0\n best_j = -1\n for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES\n iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False)\n if iou > best_iou:\n best_j = j\n best_iou = iou\n if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]:\n correct = correct + 1\n precision = 1.0 * correct / (proposals + eps)", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:68-90" + }, + "7325": { + "file_id": 533, + "content": "This code performs Non-Maximum Suppression (NMS) on detected boxes, selects confident boxes for further processing, and associates ground truth boxes with predicted boxes based on Intersection over Union (IoU) threshold. It counts correct matches, proposals, total ground truth boxes, and calculates precision. The precision is calculated by dividing the number of correct matches by the sum of proposals and a small epsilon value to avoid division by zero.", + "type": "comment" + }, + "7326": { + "file_id": 533, + "content": " recall = 1.0 * correct / (total + eps)\n fscore = 2.0 * precision * recall / (precision + recall + eps)\n outs = dict()\n outs['precision'] = precision\n outs['recall'] = recall\n outs['fscore'] = fscore\n outs['frame_idx'] = frame_idx\n return outs\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n total = 0.0\n proposals = 0.0\n correct = 0.0\n fscore = 0.0\n eps = 1e-5\n nms_thresh = 0.4\n iou_thresh = 0.5\n x_data = data_batch[0]\n target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor\n frame_idx = data_batch[2]\n target.stop_gradient = True\n # call Model forward\n out = self.forward_net(x_data)\n all_boxes = get_region_boxes(out)\n out_boxes = []\n for i in range(out.shape[0]):\n boxes = all_boxes[i]\n boxes = nms(boxes, nms_thresh)\n out_boxes.append(boxes)\n truths = target[i].reshape([-1, 5])", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:91-125" + }, + "7327": { + "file_id": 533, + "content": "This code defines the test step of the YOWO localizer and computes precision, recall, and F-score. 
It initializes variables, processes input data batch, applies non-maximum suppression (NMS) to regions of interest, and returns output metrics including precision, recall, F-score, and frame index.", + "type": "comment" + }, + "7328": { + "file_id": 533, + "content": " num_gts = truths_length(truths)\n total = total + num_gts\n pred_list = []\n for i in range(len(boxes)):\n if boxes[i][4] > 0.25:\n proposals = proposals + 1\n pred_list.append(i)\n for i in range(num_gts):\n box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]]\n best_iou = 0\n best_j = -1\n for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES\n iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False)\n if iou > best_iou:\n best_j = j\n best_iou = iou\n if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]:\n correct = correct + 1\n precision = 1.0 * correct / (proposals + eps)\n recall = 1.0 * correct / (total + eps)\n fscore = 2.0 * precision * recall / (precision + recall + eps)", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:126-147" + }, + "7329": { + "file_id": 533, + "content": "The code computes precision, recall, and F-score for each class of the YOWO localizer. It iterates through ground truth boxes and confident proposal boxes to match them based on Intersection over Union (IoU) threshold. It counts correctly matched boxes and total boxes, then calculates precision, recall, and F-score using these values.", + "type": "comment" + }, + "7330": { + "file_id": 533, + "content": " outs = dict()\n outs['boxes'] = out_boxes\n outs['precision'] = precision\n outs['recall'] = recall\n outs['fscore'] = fscore\n outs['frame_idx'] = frame_idx\n return outs\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n out = self.forward_net(data_batch[0])\n return out", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:149-161" + }, + "7331": { + "file_id": 533, + "content": "This code defines two functions within the yowo_localizer class. The first function, \"infer_step\", takes in a data batch and feeds it into the forward_net to get an output. The second function returns dictionaries for boxes, precision, recall, fscore, and frame_idx as outputs.", + "type": "comment" + }, + "7332": { + "file_id": 534, + "content": "/paddlevideo/modeling/framework/localizers/yowo_utils.py", + "type": "filepath" + }, + "7333": { + "file_id": 534, + "content": "The code contains functions for non-maximum suppression, tensor movement, and applying NMS to anchor boxes in images using PaddlePaddle. It transforms YOLOv2 output, generates ground truth targets, calculates IoU, counts instances, updates predictions, and returns masks and transformation parameters for translation, width, and height.", + "type": "summary" + }, + "7334": { + "file_id": 534, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\nfrom builtins import range as xrange\ndef truths_length(truths):\n for i in range(50):\n if truths[i][1] == 0:\n return i\ndef nms(boxes, nms_thresh):\n if len(boxes) == 0:\n return boxes\n det_confs = paddle.zeros([len(boxes)])\n for i in range(len(boxes)):\n det_confs[i] = 1 - boxes[i][4]\n sortIds = paddle.argsort(det_confs)", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:1-36" + }, + "7335": { + "file_id": 534, + "content": "This code snippet defines a function `truths_length()` that returns the index of the first occurrence where the second element in 'truths' array is 0. It also defines a function `nms()` that applies Non-Maximum Suppression to filter out bounding boxes based on a given NMS threshold. The code checks if there are any bounding boxes, assigns confidence scores, sorts them in descending order of confidence, and removes overlapping bounding boxes with IoU greater than the NMS threshold.", + "type": "comment" + }, + "7336": { + "file_id": 534, + "content": " out_boxes = []\n for i in range(len(boxes)):\n box_i = boxes[sortIds[i]]\n if box_i[4] > 0:\n out_boxes.append(box_i)\n for j in range(i + 1, len(boxes)):\n box_j = boxes[sortIds[j]]\n if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh:\n box_j[4] = 0\n return out_boxes\ndef convert2cpu(gpu_matrix):\n float_32_g = gpu_matrix.astype('float32')\n return float_32_g.cpu()\ndef convert2cpu_long(gpu_matrix):\n int_64_g = gpu_matrix.astype('int64')\n return int_64_g.cpu()\ndef get_region_boxes(output, conf_thresh=0.005, num_classes=24,\n anchors=[0.70458, 1.18803, 1.26654, 2.55121, 1.59382,\n 4.08321, 2.30548, 4.94180, 3.52332, 5.91979],\n num_anchors=5, only_objectness=1, validation=False):\n anchor_step = len(anchors) // num_anchors\n if output.dim() == 3:\n output = output.unsqueeze(0)\n batch = output.shape[0]\n assert (output.shape[1] == (5 + num_classes) * num_anchors)", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:37-67" + }, + "7337": { + "file_id": 534, + "content": "This code is defining three functions: \"out_boxes\" appears to perform non-maximum suppression on bounding boxes, \"convert2cpu\" converts a tensor from GPU memory to CPU memory as a float32 type, and \"convert2cpu_long\" performs the same operation but as an int64 type. The \"get_region_boxes\" function takes in output from a model and applies non-maximum suppression for each anchor box in the image, using provided anchors and thresholds. This function also reshapes the input to have shape (batch, num_anchors, 5 + num_classes). 
The code includes assertions to ensure proper input shapes are being used.", + "type": "comment" + }, + "7338": { + "file_id": 534, + "content": " h = output.shape[2]\n w = output.shape[3]\n all_boxes = []\n output = paddle.reshape(\n output, [batch * num_anchors, 5 + num_classes, h * w])\n output = paddle.transpose(output, (1, 0, 2))\n output = paddle.reshape(\n output, [5 + num_classes, batch * num_anchors * h * w])\n grid_x = paddle.linspace(0, w - 1, w)\n grid_x = paddle.tile(grid_x, [h, 1])\n grid_x = paddle.tile(grid_x, [batch * num_anchors, 1, 1])\n grid_x = paddle.reshape(grid_x, [batch * num_anchors * h * w]).cuda()\n grid_y = paddle.linspace(0, h - 1, h)\n grid_y = paddle.tile(grid_y, [w, 1]).t()\n grid_y = paddle.tile(grid_y, [batch * num_anchors, 1, 1])\n grid_y = paddle.reshape(grid_y, [batch * num_anchors * h * w]).cuda()\n sigmoid = nn.Sigmoid()\n xs = sigmoid(output[0]) + grid_x\n ys = sigmoid(output[1]) + grid_y\n anchor_w = paddle.to_tensor(anchors)\n anchor_w = paddle.reshape(anchor_w, [num_anchors, anchor_step])\n anchor_w = paddle.index_select(anchor_w, index=paddle.to_tensor(\n np.array([0]).astype('int32')), axis=1)", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:68-94" + }, + "7339": { + "file_id": 534, + "content": "This code performs box regression by reshaping the output tensor, creating grids for x and y coordinates, applying sigmoid function to the output, adding grid coordinates to get refined box coordinates. It also converts anchor widths into a tensor for further processing.", + "type": "comment" + }, + "7340": { + "file_id": 534, + "content": " anchor_h = paddle.to_tensor(anchors)\n anchor_h = paddle.reshape(anchor_h, [num_anchors, anchor_step])\n anchor_h = paddle.index_select(anchor_h, index=paddle.to_tensor(\n np.array([1]).astype('int32')), axis=1)\n anchor_w = paddle.tile(anchor_w, [batch, 1])\n anchor_w = paddle.tile(anchor_w, [1, 1, h * w])\n anchor_w = paddle.reshape(anchor_w, [batch * num_anchors * h * w]).cuda()\n anchor_h = paddle.tile(anchor_h, [batch, 1])\n anchor_h = paddle.tile(anchor_h, [1, 1, h * w])\n anchor_h = paddle.reshape(anchor_h, [batch * num_anchors * h * w]).cuda()\n ws = paddle.exp(output[2]) * anchor_w\n hs = paddle.exp(output[3]) * anchor_h\n det_confs = sigmoid(output[4])\n cls_confs = paddle.to_tensor(output[5:5 + num_classes], stop_gradient=True)\n cls_confs = paddle.transpose(cls_confs, [1, 0])\n s = nn.Softmax()\n cls_confs = paddle.to_tensor(s(cls_confs))\n cls_max_confs = paddle.max(cls_confs, axis=1)\n cls_max_ids = paddle.argmax(cls_confs, axis=1)\n cls_max_confs = paddle.reshape(cls_max_confs, [-1])", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:96-122" + }, + "7341": { + "file_id": 534, + "content": "Code prepares output from a YOLOv2 object detection model, performing necessary reshaping and transformations to obtain the final detections and classifications. It computes the widths (ws) and heights (hs) of the bounding boxes based on the input feature maps, applies sigmoid activation to the fifth output channel for detection confidences, and converts the rest of the outputs to stop_gradient=True tensors for class predictions. The code then performs softmax normalization over class predictions and retrieves the maximum confidence and corresponding class IDs for each bounding box. 
Finally, it reshapes cls_max_confs to a 1D tensor.", + "type": "comment" + }, + "7342": { + "file_id": 534, + "content": "    cls_max_ids = paddle.reshape(cls_max_ids, [-1])\n    sz_hw = h * w\n    sz_hwa = sz_hw * num_anchors\n    det_confs = convert2cpu(det_confs)\n    cls_max_confs = convert2cpu(cls_max_confs)\n    cls_max_ids = convert2cpu_long(cls_max_ids)\n    xs = convert2cpu(xs)\n    ys = convert2cpu(ys)\n    ws = convert2cpu(ws)\n    hs = convert2cpu(hs)\n    if validation:\n        cls_confs = convert2cpu(cls_confs.reshape([-1, num_classes]))\n    for b in range(batch):\n        boxes = []\n        for cy in range(h):\n            for cx in range(w):\n                for i in range(num_anchors):\n                    ind = b * sz_hwa + i * sz_hw + cy * w + cx\n                    det_conf = det_confs[ind]\n                    if only_objectness:\n                        conf = det_confs[ind]\n                    else:\n                        conf = det_confs[ind] * cls_max_confs[ind]\n                    if conf > conf_thresh:\n                        bcx = xs[ind]\n                        bcy = ys[ind]\n                        bw = ws[ind]\n                        bh = hs[ind]", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:123-153" + }, + "7343": { + "file_id": 534, + "content": "The code extracts data from a PaddlePaddle tensor and converts it to CPU memory. It then reshapes the data, applies conditions, and stores box coordinates and confidences in lists for each batch. The extracted data is used to create bounding boxes for objects detected within the input image.", + "type": "comment" + }, + "7344": { + "file_id": 534, + "content": "                        cls_max_conf = cls_max_confs[ind]\n                        cls_max_id = cls_max_ids[ind]\n                        box = [bcx / w, bcy / h, bw / w, bh / h,\n                               det_conf, cls_max_conf, cls_max_id]\n                        if (not only_objectness) and validation:\n                            for c in range(num_classes):\n                                tmp_conf = cls_confs[ind][c]\n                                if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh:\n                                    box.append(tmp_conf)\n                                    box.append(c)\n                        boxes.append(box)\n        all_boxes.append(boxes)\n    return all_boxes\ndef bbox_iou(box1, box2, x1y1x2y2=True):\n    if x1y1x2y2:\n        mx = min(box1[0], box2[0])\n        Mx = max(box1[2], box2[2])\n        my = min(box1[1], box2[1])\n        My = max(box1[3], box2[3])\n        w1 = box1[2] - box1[0]\n        h1 = box1[3] - box1[1]\n        w2 = box2[2] - box2[0]\n        h2 = box2[3] - box2[1]", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:154-178" + }, + "7345": { + "file_id": 534, + "content": "The `get_region_boxes` function in `yowo_utils.py` returns a list of boxes with their respective confidences and class ids for each box. It uses only the objectness confidence when only_objectness is True; otherwise it also includes the per-class confidences. 
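The decoding convention described in the notes above (sigmoid-squashed offsets added to the grid cell, exponentiated sizes scaled by the anchor, everything divided by the feature-map size) can be written out for a single anchor at one cell. The name decode_cell and its argument list are assumptions made for this sketch; it mirrors the general YOLO-style formula rather than the exact Paddle tensor code.

    import math

    def decode_cell(tx, ty, tw, th, cx, cy, anchor_w, anchor_h, feat_w, feat_h):
        # one anchor at grid cell (cx, cy); returns [bx, by, bw, bh] normalised to [0, 1]
        sig = lambda v: 1.0 / (1.0 + math.exp(-v))
        bx = (sig(tx) + cx) / feat_w
        by = (sig(ty) + cy) / feat_h
        bw = math.exp(tw) * anchor_w / feat_w
        bh = math.exp(th) * anchor_h / feat_h
        return [bx, by, bw, bh]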
The function `bbox_iou` calculates the intersection over union between two bounding boxes, considering x1y1x2y2 format where (x1, y1) is the top left corner and (x2, y2) is the bottom right corner.", + "type": "comment" + }, + "7346": { + "file_id": 534, + "content": " else:\n mx = min(float(box1[0] - box1[2] / 2.0),\n float(box2[0] - box2[2] / 2.0))\n Mx = max(float(box1[0] + box1[2] / 2.0),\n float(box2[0] + box2[2] / 2.0))\n my = min(float(box1[1] - box1[3] / 2.0),\n float(box2[1] - box2[3] / 2.0))\n My = max(float(box1[1] + box1[3] / 2.0),\n float(box2[1] + box2[3] / 2.0))\n w1 = box1[2]\n h1 = box1[3]\n w2 = box2[2]\n h2 = box2[3]\n uw = Mx - mx\n uh = My - my\n cw = w1 + w2 - uw\n ch = h1 + h2 - uh\n carea = 0\n if cw <= 0 or ch <= 0:\n return paddle.to_tensor(0.0)\n area1 = w1 * h1\n area2 = w2 * h2\n carea = cw * ch\n uarea = area1 + area2 - carea\n return carea / uarea\ndef bbox_ious(boxes1, boxes2, x1y1x2y2=True):\n if x1y1x2y2:\n mx = paddle.min(boxes1[0], boxes2[0])\n Mx = paddle.max(boxes1[2], boxes2[2])\n my = paddle.min(boxes1[1], boxes2[1])\n My = paddle.max(boxes1[3], boxes2[3])\n w1 = boxes1[2] - boxes1[0]", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:179-213" + }, + "7347": { + "file_id": 534, + "content": "The code calculates the intersection-over-union (IOU) between two bounding boxes, which is commonly used in object detection tasks. It first finds the overlapping area by computing the minimum and maximum coordinates of the bounding boxes, then calculates the union of these boxes, and finally returns the intersection over union ratio. This helps in determining if the two bounding boxes represent the same object or not.", + "type": "comment" + }, + "7348": { + "file_id": 534, + "content": " h1 = boxes1[3] - boxes1[1]\n w2 = boxes2[2] - boxes2[0]\n h2 = boxes2[3] - boxes2[1]\n else:\n mx = paddle.min(paddle.stack(\n [boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0], axis=0), axis=0)\n Mx = paddle.max(paddle.stack(\n [boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0], axis=0), axis=0)\n my = paddle.min(paddle.stack(\n [boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0], axis=0), axis=0)\n My = paddle.max(paddle.stack(\n [boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0], axis=0), axis=0)\n w1 = boxes1[2]\n h1 = boxes1[3]\n w2 = boxes2[2]\n h2 = boxes2[3]\n uw = Mx - mx\n uh = My - my\n cw = w1 + w2 - uw\n ch = h1 + h2 - uh\n mask = paddle.cast(cw <= 0, dtype=\"int32\") + \\\n paddle.cast(ch <= 0, dtype=\"int32\") > 0\n area1 = w1 * h1\n area2 = w2 * h2\n carea = cw * ch\n carea[mask] = 0\n uarea = area1 + area2 - carea\n return carea / uarea", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:214-241" + }, + "7349": { + "file_id": 534, + "content": "This code calculates the intersection over union (IoU) between two bounding boxes. It first checks if both boxes have valid dimensions, then computes the coordinates of each box and their widths and heights. If the boxes overlap, it calculates the intersection area and union area of the bounding boxes, taking into account non-overlapping areas by setting them to 0 in the case of non-intersection. 
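The same element-wise IoU in corner form is compact when expressed with NumPy broadcasting. The sketch below is an illustration of the formula under the assumption of two (N, 4) arrays in [x1, y1, x2, y2] order compared row by row; it is not a drop-in replacement for the Paddle version.

    import numpy as np

    def iou_xyxy(boxes1, boxes2):
        # boxes1, boxes2: (N, 4) arrays; returns (N,) element-wise IoU
        x1 = np.maximum(boxes1[:, 0], boxes2[:, 0])
        y1 = np.maximum(boxes1[:, 1], boxes2[:, 1])
        x2 = np.minimum(boxes1[:, 2], boxes2[:, 2])
        y2 = np.minimum(boxes1[:, 3], boxes2[:, 3])
        inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
        area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
        area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
        return inter / (area1 + area2 - inter + 1e-12)   # small eps guards degenerate boxes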
Finally, it returns the IoU as the intersection area divided by the union area.", + "type": "comment" + }, + "7350": { + "file_id": 534, + "content": "# this function works for building the groud truth\ndef build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale,\n sil_thresh):\n # nH, nW here are number of grids in y and x directions (7, 7 here)\n nB = target.shape[0] # batch size\n nA = num_anchors # 5 for our case\n nC = num_classes\n anchor_step = len(anchors) // num_anchors\n conf_mask = paddle.ones([nB, nA, nH, nW]) * noobject_scale\n coord_mask = paddle.zeros([nB, nA, nH, nW])\n cls_mask = paddle.zeros([nB, nA, nH, nW])\n tx = paddle.zeros([nB, nA, nH, nW])\n ty = paddle.zeros([nB, nA, nH, nW])\n tw = paddle.zeros([nB, nA, nH, nW])\n th = paddle.zeros([nB, nA, nH, nW])\n tconf = paddle.zeros([nB, nA, nH, nW])\n tcls = paddle.zeros([nB, nA, nH, nW])\n # for each grid there are nA anchors\n # nAnchors is the number of anchor for one image\n nAnchors = nA * nH * nW\n nPixels = nH * nW\n # for each image\n for b in xrange(nB):\n # get all anchor boxes in one image", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:244-268" + }, + "7351": { + "file_id": 534, + "content": "This function builds ground truth targets for each grid in the image. It iterates over each batch, anchor, height, and width to create confidence, coordinate, and class masks, as well as target coordinates and classes for each anchor box. The targets are then concatenated into a single tensor.", + "type": "comment" + }, + "7352": { + "file_id": 534, + "content": " # (4 * nAnchors)\n cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t()\n # initialize iou score for each anchor\n cur_ious = paddle.zeros([nAnchors])\n for t in xrange(50):\n # for each anchor 4 coordinate parameters, already in the coordinate system for the whole image\n # this loop is for anchors in each image\n # for each anchor 5 parameters are available (class, x, y, w, h)\n if target[b][t * 5 + 1] == 0:\n break\n gx = target[b][t * 5 + 1] * nW\n gy = target[b][t * 5 + 2] * nH\n gw = target[b][t * 5 + 3] * nW\n gh = target[b][t * 5 + 4] * nH\n # groud truth boxes\n cur_gt_boxes = paddle.tile(paddle.to_tensor(\n [gx, gy, gw, gh], dtype='float32').t(), [nAnchors, 1]).t()\n # bbox_ious is the iou value between orediction and groud truth\n cur_ious = paddle.max(\n paddle.stack([cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)], axis=0), axis=0)", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:269-288" + }, + "7353": { + "file_id": 534, + "content": "This code calculates the IoU (Intersection over Union) between predicted and ground truth boxes for each anchor. It uses a loop to iterate through 50 time steps, breaks if no target is available at the current time step, and calculates the bbox_ious function using cur_pred_boxes and cur_gt_boxes for IoU calculation. 
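The scaling step the loop above depends on, multiplying the normalised labels by the grid size so that centres, sizes and the responsible cell are all expressed in feature-map units, can be isolated in a small helper. The name gt_to_grid and the (cls, x, y, w, h) tuple layout are assumptions for illustration only.

    def gt_to_grid(label, nW, nH):
        # label: (cls, x, y, w, h) with coordinates normalised to [0, 1]
        cls_id, x, y, w, h = label
        gx, gy = x * nW, y * nH        # box centre in grid units
        gi, gj = int(gx), int(gy)      # indices of the responsible grid cell
        gw, gh = w * nW, h * nH        # box size in grid units
        return cls_id, gi, gj, (gx, gy, gw, gh)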
The highest IoU value is stored in cur_ious for each anchor.", + "type": "comment" + }, + "7354": { + "file_id": 534, + "content": " # if iou > a given threshold, it is seen as it includes an object\n # conf_mask[b][cur_ious>sil_thresh] = 0\n conf_mask_t = paddle.reshape(conf_mask, [nB, -1])\n conf_mask_t[b, cur_ious > sil_thresh] = 0\n conf_mask_tt = paddle.reshape(conf_mask_t[b], [nA, nH, nW])\n conf_mask[b] = conf_mask_tt\n # number of ground truth\n nGT = 0\n nCorrect = 0\n for b in xrange(nB):\n # anchors for one batch (at least batch size, and for some specific classes, there might exist more than one anchor)\n for t in xrange(50):\n if target[b][t * 5 + 1] == 0:\n break\n nGT = nGT + 1\n best_iou = 0.0\n best_n = -1\n min_dist = 10000\n # the values saved in target is ratios\n # times by the width and height of the output feature maps nW and nH\n gx = target[b][t * 5 + 1] * nW\n gy = target[b][t * 5 + 2] * nH\n gi = int(gx)\n gj = int(gy)\n gw = target[b][t * 5 + 3] * nW", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:289-315" + }, + "7355": { + "file_id": 534, + "content": "This code calculates the IoU (Intersection over Union) between predicted bounding boxes and ground truth bounding boxes, and applies a mask to the confidences based on this IoU. It also counts the number of ground truth instances (nGT) and correct detections (nCorrect). The target values are ratios multiplied by the width and height of the output feature maps.", + "type": "comment" + }, + "7356": { + "file_id": 534, + "content": " gh = target[b][t * 5 + 4] * nH\n gt_box = [0, 0, gw, gh]\n for n in xrange(nA):\n # get anchor parameters (2 values)\n aw = anchors[anchor_step * n]\n ah = anchors[anchor_step * n + 1]\n anchor_box = [0, 0, aw, ah]\n # only consider the size (width and height) of the anchor box\n iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False)\n # get the best anchor form with the highest iou\n if iou > best_iou:\n best_iou = iou\n best_n = n\n # then we determine the parameters for an anchor (4 values together)\n gt_box = [gx, gy, gw, gh]\n # find corresponding prediction box\n pred_box = pred_boxes[b * nAnchors +\n best_n * nPixels + gj * nW + gi]\n # only consider the best anchor box, for each image\n coord_mask[b, best_n, gj, gi] = 1\n cls_mask[b, best_n, gj, gi] = 1", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:316-338" + }, + "7357": { + "file_id": 534, + "content": "This code iterates over anchor boxes, calculates IoU with ground truth boxes and selects the best matching one. 
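Because only the anchor sizes are compared (both boxes are effectively centred at the origin), the best-anchor search reduces to an IoU over width/height pairs. A minimal stand-alone sketch, with the hypothetical name best_anchor and a flat [w0, h0, w1, h1, ...] anchor list as in the snippets above:

    def best_anchor(gw, gh, anchors):
        # compare sizes only: IoU of two boxes centred at the origin
        best_i, best_iou = -1, 0.0
        for i in range(len(anchors) // 2):
            aw, ah = anchors[2 * i], anchors[2 * i + 1]
            inter = min(gw, aw) * min(gh, ah)
            union = gw * gh + aw * ah - inter
            iou = inter / union
            if iou > best_iou:
                best_i, best_iou = i, iou
        return best_i, best_iou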
It then updates the corresponding prediction box for that image and marks it as valid in coord_mask and cls_mask matrices.", + "type": "comment" + }, + "7358": { + "file_id": 534, + "content": " # in this cell of the output feature map, there exists an object\n conf_mask[b, best_n, gj, gi] = object_scale\n tx[b, best_n, gj, gi] = paddle.cast(\n target[b][t * 5 + 1] * nW - gi, dtype='float32')\n ty[b, best_n, gj, gi] = paddle.cast(\n target[b][t * 5 + 2] * nH - gj, dtype='float32')\n tw[b, best_n, gj, gi] = math.log(\n gw / anchors[anchor_step * best_n])\n th[b, best_n, gj, gi] = math.log(\n gh / anchors[anchor_step * best_n + 1])\n iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou\n # confidence equals to iou of the corresponding anchor\n tconf[b, best_n, gj, gi] = paddle.cast(iou, dtype='float32')\n tcls[b, best_n, gj, gi] = paddle.cast(\n target[b][t * 5], dtype='float32')\n # if ious larger than 0.5, we justify it as a correct prediction\n if iou > 0.5:\n nCorrect = nCorrect + 1", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:340-357" + }, + "7359": { + "file_id": 534, + "content": "The code calculates object position, size, and confidence for each detected object. It then counts the number of correct detections by checking if the IOU is greater than 0.5.", + "type": "comment" + }, + "7360": { + "file_id": 534, + "content": " # true values are returned\n return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls", + "type": "code", + "location": "/paddlevideo/modeling/framework/localizers/yowo_utils.py:358-359" + }, + "7361": { + "file_id": 534, + "content": "The function returns the ground truth values (nGT), correct predictions (nCorrect), and corresponding masks for coordinates, confidence, and class labels, as well as the transformation parameters for translation, width, and height.", + "type": "comment" + }, + "7362": { + "file_id": 535, + "content": "/paddlevideo/modeling/framework/multimodal/__init__.py", + "type": "filepath" + }, + "7363": { + "file_id": 535, + "content": "This code file is part of the PaddleVideo library and contains the initialization, base class (BaseMultimodal), and a specific multimodal model (ActBert). It also mentions licensing information and a link to access it. The __all__ variable lists the available modules for importing from this file.", + "type": "summary" + }, + "7364": { + "file_id": 535, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseMultimodal\nfrom .actbert import ActBert\n__all__ = ['BaseMultimodal', 'ActBert']", + "type": "code", + "location": "/paddlevideo/modeling/framework/multimodal/__init__.py:1-16" + }, + "7365": { + "file_id": 535, + "content": "This code file is part of the PaddleVideo library and contains the initialization, base class (BaseMultimodal), and a specific multimodal model (ActBert). It also mentions licensing information and a link to access it. 
The __all__ variable lists the available modules for importing from this file.", + "type": "comment" + }, + "7366": { + "file_id": 536, + "content": "/paddlevideo/modeling/framework/multimodal/actbert.py", + "type": "filepath" + }, + "7367": { + "file_id": 536, + "content": "The code introduces the ActBert model for multimodal tasks, including training and validation steps. It utilizes a backbone function for predictions with text, video, and action scores along with sequence relationship scores. The infer_step is yet to be implemented.", + "type": "summary" + }, + "7368": { + "file_id": 536, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import MULTIMODAL\nfrom .base import BaseMultimodal\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@MULTIMODAL.register()\nclass ActBert(BaseMultimodal):\n \"\"\"ActBert model framework.\"\"\"\n def forward_net(self, text_ids, action_feat, image_feat, image_loc,\n token_type_ids, text_mask, image_mask, action_mask):\n pred = self.backbone(text_ids, action_feat, image_feat, image_loc,\n token_type_ids, text_mask, image_mask, action_mask)", + "type": "code", + "location": "/paddlevideo/modeling/framework/multimodal/actbert.py:1-27" + }, + "7369": { + "file_id": 536, + "content": "This code snippet defines the ActBert class, which is a multimodal model framework. It registers the model under MULTIMODAL in the registry and includes a forward_net method for processing text, action, and image data. The self.backbone function is used to make predictions based on this data. The code also includes import statements, variable definitions, and a get_logger function call for logging purposes.", + "type": "comment" + }, + "7370": { + "file_id": 536, + "content": " return pred\n def train_step(self, data_batch):\n \"\"\"For ActBert Dataset. Define how the model is going to train, from input to output.\n \"\"\"\n text_ids, action_feat, image_feat, image_loc, \\\n token_type_ids, text_mask, image_mask, action_mask, \\\n text_labels, action_label, next_sentence_label, image_label, image_target = data_batch\n loss_metrics = dict()\n pred = self.backbone(text_ids, action_feat, image_feat, image_loc,\n token_type_ids, text_mask, image_mask, action_mask)\n prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = pred\n total_loss = self.loss(prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \\\n text_labels, image_label, image_target, action_label, next_sentence_label)\n loss_metrics['loss'] = paddle.mean(total_loss)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"For ActBert Dataset. Define how the model is going to val, from input to output.", + "type": "code", + "location": "/paddlevideo/modeling/framework/multimodal/actbert.py:28-46" + }, + "7371": { + "file_id": 536, + "content": "This code defines a train_step and val_step for ActBert Dataset. 
In the train_step, it takes input data (text_ids, action_feat, image_feat, etc.), passes them through the backbone model to get prediction scores, calculates loss, and returns a loss metric dictionary. The val_step does not appear to have any additional functionality.", + "type": "comment" + }, + "7372": { + "file_id": 536, + "content": " \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"For MSR-VTT Dataset. Define how the model is going to test, from input to output.\"\"\"\n text_ids, action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask = data_batch[:\n -1]\n action_feat = action_feat.squeeze(0)\n image_feat = image_feat.squeeze(0)\n image_loc = image_loc.squeeze(0)\n image_mask = image_mask.squeeze(0)\n action_mask = action_mask.squeeze(0)\n prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.forward_net(text_ids, \\\n action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask)\n return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score\n def infer_step(self, data_batch):\n pass", + "type": "code", + "location": "/paddlevideo/modeling/framework/multimodal/actbert.py:47-64" + }, + "7373": { + "file_id": 536, + "content": "The code defines a model that takes in multiple inputs like text_ids, action_feat, image_feat, and more. It performs testing with the test_step() function and returns prediction scores for text, video, and action, along with the sequence relationship score. The infer_step() function is not implemented yet.", + "type": "comment" + }, + "7374": { + "file_id": 537, + "content": "/paddlevideo/modeling/framework/multimodal/base.py", + "type": "filepath" + }, + "7375": { + "file_id": 537, + "content": "This code defines a base class for multimodal models in PaddleVideo, requiring subclasses to override train_step, valid_step, test_step, and define abstract methods for validating, testing, and inference steps.", + "type": "summary" + }, + "7376": { + "file_id": 537, + "content": "from abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseMultimodal(nn.Layer):\n \"\"\"Base class for Multimodal.\n All Multimodal model should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Head to process feature.\n loss(dict): Loss function.\n \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)", + "type": "code", + "location": "/paddlevideo/modeling/framework/multimodal/base.py:1-32" + }, + "7377": { + "file_id": 537, + "content": "This code defines a base class for multimodal models in PaddleVideo. It requires subclasses to override train_step, valid_step, and test_step methods. The constructor accepts optional backbone, head, and loss parameters which are built using the builder module. 
If provided, the backbone is initialized with its init_weights method.", + "type": "comment" + }, + "7378": { + "file_id": 537, + "content": " if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n if loss is not None:\n self.loss = builder.build_loss(loss)\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/multimodal/base.py:33-63" + }, + "7379": { + "file_id": 537, + "content": "The code defines a base class for multimodal models, with an initializer to set up the head and loss functions. The `forward` method selects the appropriate step function based on the given mode (train, valid, test, or infer). The abstract `train_step` method must be implemented in subclasses for training.", + "type": "comment" + }, + "7380": { + "file_id": 537, + "content": " @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/multimodal/base.py:65-81" + }, + "7381": { + "file_id": 537, + "content": "This code defines three abstract methods: val_step, test_step, and infer_step. These methods represent validating, testing, and inference steps respectively. The methods are not yet implemented and will need to be filled by subclasses according to the specific requirements of the model being developed.", + "type": "comment" + }, + "7382": { + "file_id": 538, + "content": "/paddlevideo/modeling/framework/partitioners/__init__.py", + "type": "filepath" + }, + "7383": { + "file_id": 538, + "content": "This code is the initialization file of PaddleVideo's partitioner module. It imports base and TransNetV2Partitioner classes, and declares them as part of the public interface (__all__). The license and copyright information are also included.", + "type": "summary" + }, + "7384": { + "file_id": 538, + "content": "# copyright (c) 2020 paddlepaddle authors. 
all rights reserved.\n#\n# licensed under the apache license, version 2.0 (the \"license\"\n# you may not use this file except in compliance with the license.\n# you may obtain a copy of the license at\n#\n# http://www.apache.org/licenses/license-2.0\n#\n# unless required by applicable law or agreed to in writing, software\n# distributed under the license is distributed on an \"as is\" basis,\n# without warranties or conditions of any kind, either express or implied.\n# see the license for the specific language governing permissions and\n# limitations under the license.\nfrom .base import BasePartitioner\nfrom .transnetv2_partitioner import TransNetV2Partitioner\n__all__ = ['BasePartitioner', 'TransNetV2Partitioner']", + "type": "code", + "location": "/paddlevideo/modeling/framework/partitioners/__init__.py:1-18" + }, + "7385": { + "file_id": 538, + "content": "This code is the initialization file of PaddleVideo's partitioner module. It imports base and TransNetV2Partitioner classes, and declares them as part of the public interface (__all__). The license and copyright information are also included.", + "type": "comment" + }, + "7386": { + "file_id": 539, + "content": "/paddlevideo/modeling/framework/partitioners/base.py", + "type": "filepath" + }, + "7387": { + "file_id": 539, + "content": "This Python class is part of PaddleVideo's modeling framework, serving as a base for partitioners and initializing partitioned models. It includes backbone and head components initialization, optional weight initialization, and defines a forward function. A base class for model partitioners is also defined with methods for train, validate, test, and infer steps, leaving the actual implementation to subclasses.", + "type": "summary" + }, + "7388": { + "file_id": 539, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom abc import abstractmethod\nimport paddle.nn as nn\nfrom ... import builder\nclass BasePartitioner(nn.Layer):\n \"\"\"Base class for Partition.\n All partitioner should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, define your train step.\n - Methods:``valid_step``, define your valid step, always the same as train_step.\n - Methods:``test_step``, define your test step.\n \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/framework/partitioners/base.py:1-27" + }, + "7389": { + "file_id": 539, + "content": "This code is a Python class for base partitioner in PaddleVideo's modeling framework. 
It is an abstract class that serves as the foundation for all partitioners and requires its subclasses to define specific methods like train_step, valid_step, and test_step.", + "type": "comment" + }, + "7390": { + "file_id": 539, + "content": " def __init__(self, backbone=None, head=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n if getattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n pass\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py", + "type": "code", + "location": "/paddlevideo/modeling/framework/partitioners/base.py:28-55" + }, + "7391": { + "file_id": 539, + "content": "This code initializes a partitioned model by building backbone and head components. It also includes an option to initialize weights for these components, and provides a forward function defining the model's execution path depending on the provided mode.", + "type": "comment" + }, + "7392": { + "file_id": 539, + "content": " \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step. input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating setp. input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Tets setp. to get acc in test data. input_data_batch -> output\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/partitioners/base.py:56-84" + }, + "7393": { + "file_id": 539, + "content": "The code defines a base class for model partitioners, which includes methods for train, validate, test, and infer steps. Each step takes a data batch as input and returns either a loss metric or the output. If an unsupported mode is provided, it raises a NotImplementedError. The actual implementation of these steps is left to subclasses.", + "type": "comment" + }, + "7394": { + "file_id": 540, + "content": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py", + "type": "filepath" + }, + "7395": { + "file_id": 540, + "content": "TransNetV2 Partitioner in PaddleVideo framework defines a model partitioner, includes forwarding methods for image processing and computing loss metrics. It has three methods: \"loss_metrics\", \"test_step\", and \"infer_step\" for training, testing, and inference phases respectively.", + "type": "summary" + }, + "7396": { + "file_id": 540, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import PARTITIONERS\nfrom .base import BasePartitioner\nimport paddle\n@PARTITIONERS.register()\nclass TransNetV2Partitioner(BasePartitioner):\n \"\"\"TransNetV2 Partitioner framework\n \"\"\"\n def forward_net(self, imgs):\n one_hot_pred = self.backbone(imgs)\n return one_hot_pred\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n frame_sequence = data_batch[0]\n one_hot_gt, many_hot_gt = data_batch[1:]\n one_hot_pred = self.forward_net(frame_sequence)", + "type": "code", + "location": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:1-32" + }, + "7397": { + "file_id": 540, + "content": "TransNetV2 Partitioner class for PaddleVideo framework, with forward_net and train_step methods for image processing and model training.", + "type": "comment" + }, + "7398": { + "file_id": 540, + "content": " dict_ = {}\n if isinstance(one_hot_pred, tuple):\n one_hot_pred, dict_ = one_hot_pred\n many_hot_pred = dict_.get(\"many_hot\", None)\n comb_reg_loss = dict_.get(\"comb_reg_loss\", None)\n loss_metrics = self.head.loss(one_hot_pred, one_hot_gt,\n many_hot_pred, many_hot_gt,\n reg_losses={\"comb_reg\": comb_reg_loss})\n return loss_metrics\n def val_step(self, data_batch):\n frame_sequence = data_batch[0]\n one_hot_gt, many_hot_gt = data_batch[1:]\n one_hot_pred = self.forward_net(frame_sequence)\n dict_ = {}\n if isinstance(one_hot_pred, tuple):\n one_hot_pred, dict_ = one_hot_pred\n many_hot_pred = dict_.get(\"many_hot\", None)\n comb_reg_loss = dict_.get(\"comb_reg_loss\", None)\n loss_metrics = self.head.loss(one_hot_pred, one_hot_gt,\n many_hot_pred, many_hot_gt,\n reg_losses={\"comb_reg\": comb_reg_loss})", + "type": "code", + "location": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:33-54" + }, + "7399": { + "file_id": 540, + "content": "Code defines a model partitioner for TransNetV2. 
It returns loss metrics from the validation step by forwarding frame sequences through the model, extracting one-hot and many-hot predictions and ground truths, and applying losses based on provided dictionaries.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/74.json b/docs/data/74.json new file mode 100644 index 000000000..ba8ec7f64 --- /dev/null +++ b/docs/data/74.json @@ -0,0 +1,541 @@ +{ + "7400": { + "file_id": 540, + "content": " return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n frame_sequence = data_batch[0]\n one_hot_pred = self.forward_net(frame_sequence)\n return one_hot_pred\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n frame_sequence = data_batch[0]\n one_hot_pred = self.forward_net(frame_sequence)\n return one_hot_pred", + "type": "code", + "location": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:55-68" + }, + "7401": { + "file_id": 540, + "content": "The code defines three methods: \"loss_metrics\" returns loss and metrics for training, \"test_step\" performs testing by forwarding frames through the net without calculating loss, and \"infer_step\" also performs testing with forwarding frames but without specifying if it's for a test or inference phase.", + "type": "comment" + }, + "7402": { + "file_id": 541, + "content": "/paddlevideo/modeling/framework/recognizers/__init__.py", + "type": "filepath" + }, + "7403": { + "file_id": 541, + "content": "This code file in the PaddleVideo library imports various recognizer classes for video recognition tasks, including 1D, 2D, 3D, transformer-based, GCN, MRI, and MoViNet frame-based recognizers. These models are used for action recognition and motion estimation tasks.", + "type": "summary" + }, + "7404": { + "file_id": 541, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseRecognizer\nfrom .recognizer1d import Recognizer1D, RecognizerAction\nfrom .recognizer2d import Recognizer2D\nfrom .recognizer3d import Recognizer3D\nfrom .recognizer_transformer import RecognizerTransformer\nfrom .recognizer_gcn import RecognizerGCN\nfrom .recognizerMRI import RecognizerMRI\nfrom .recognizer3dMRI import Recognizer3DMRI\nfrom .recognizer_transformer_MRI import RecognizerTransformer_MRI\nfrom .recognizer_movinet_frame import MoViNetRecognizerFrame\nfrom .recognizerDistillation import RecognizerDistillation", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/__init__.py:1-23" + }, + "7405": { + "file_id": 541, + "content": "This code file imports various recognizer classes from different modules within the PaddleVideo framework for video recognition tasks. 
These recognizers include 1D, 2D, 3D, transformer-based, GCN, MRI, 3D MRI, and MoViNet frame-based recognizers, as well as a distillation-based recognizer. Each recognizer is designed for specific types of recognition tasks in video analysis.", + "type": "comment" + }, + "7406": { + "file_id": 541, + "content": "__all__ = [\n 'BaseRecognizer', 'Recognizer1D', 'Recognizer2D', 'Recognizer3D',\n 'RecognizerTransformer', 'RecognizerGCN', 'RecognizerMRI',\n 'Recognizer3DMRI', 'RecognizerTransformer_MRI', 'MoViNetRecognizerFrame',\n 'RecognizerAction', 'RecognizerDistillation'\n]", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/__init__.py:25-30" + }, + "7407": { + "file_id": 541, + "content": "This code snippet in the PaddleVideo library defines various recognizer models including BaseRecognizer, Recognizer1D, Recognizer2D, and more. These classes are used for video recognition tasks like action recognition and motion estimation.", + "type": "comment" + }, + "7408": { + "file_id": 542, + "content": "/paddlevideo/modeling/framework/recognizers/base.py", + "type": "filepath" + }, + "7409": { + "file_id": 542, + "content": "This code initializes a model's head, defines modes of operation, and provides abstract methods for training, validation, and inference steps. It serves as a base class for recognizer models in PaddleVideo and raises NotImplementedError if subclasses don't implement these steps.", + "type": "summary" + }, + "7410": { + "file_id": 542, + "content": "from abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseRecognizer(nn.Layer):\n \"\"\"Base class for recognizers.\n All recognizers should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature.\n \"\"\"\n def __init__(self, backbone=None, head=None, runtime_cfg=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/base.py:1-33" + }, + "7411": { + "file_id": 542, + "content": "Base class for recognizers: Subclasses should override train_step, valid_step, and test_step methods. Builds backbone and head using builder if provided.", + "type": "comment" + }, + "7412": { + "file_id": 542, + "content": " self.head.init_weights()\n else:\n self.head = None\n # Settings when the model is running,\n # such as 'avg_type'\n self.runtime_cfg = runtime_cfg\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/base.py:34-66" + }, + "7413": { + "file_id": 542, + "content": "This code initializes a model's head, defines the mode of operation (train, valid, test, infer), and provides abstract methods for training and validation steps. If the mode is 'infer', it saves the inference model.", + "type": "comment" + }, + "7414": { + "file_id": 542, + "content": " \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/base.py:67-81" + }, + "7415": { + "file_id": 542, + "content": "This code snippet in PaddleVideo defines abstract methods for validating, testing, and inferring steps. It serves as a base class for recognizer models and expects subclasses to implement these methods. The NotImplementedError is raised to ensure that subclasses provide their own implementation for these steps.", + "type": "comment" + }, + "7416": { + "file_id": 543, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py", + "type": "filepath" + }, + "7417": { + "file_id": 543, + "content": "The code defines a 1D recognizer model in PaddleVideo, processing both image and audio data for training, validation, testing, and inference. It includes forward pass, loss computation, metrics calculations and handles RGB and audio data batches.", + "type": "summary" + }, + "7418": { + "file_id": 543, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\n@RECOGNIZERS.register()\nclass Recognizer1D(BaseRecognizer):\n \"\"\"1D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n lstm_logit, lstm_output = self.head(imgs)\n return lstm_logit, lstm_output\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels = data_batch", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:1-29" + }, + "7419": { + "file_id": 543, + "content": "This code defines a 1D recognizer model framework in PaddleVideo. 
It includes the forward_net function to define how the model trains from input to output and the train_step function for training steps. The data batch contains rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, and labels.", + "type": "comment" + }, + "7420": { + "file_id": 543, + "content": " imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward\n lstm_logit, lstm_output = self.forward_net(imgs)\n loss = self.head.loss(lstm_logit, labels)\n hit_at_one, perr, gap = self.head.metric(lstm_output, labels)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['hit_at_one'] = hit_at_one\n loss_metrics['perr'] = perr\n loss_metrics['gap'] = gap\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n return self.train_step(data_batch)\n def infer_step(self, data_batch):\n \"\"\"Infering setp.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch\n imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:30-61" + }, + "7421": { + "file_id": 543, + "content": "The code defines a recognizer1d model that processes both image and audio data. It includes methods for forward pass, validation, testing, and inference steps. In the forward pass, it takes input images and calculates logits and output from the LSTM network. The loss is then computed based on these logits and labels, and metrics such as hit_at_one, perr, and gap are calculated using the output and labels. The validation and testing steps perform similar calculations to those in the training step. In the inference step, only image and audio data are processed to produce output for each input.", + "type": "comment" + }, + "7422": { + "file_id": 543, + "content": " lstm_logit, _ = self.forward_net(imgs)\n return lstm_logit\n@RECOGNIZERS.register()\nclass RecognizerAction(BaseRecognizer):\n \"\"\"1D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n lstm_logit, lstm_output = self.head(imgs)\n return lstm_logit, lstm_output\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels, labels_iou = data_batch\n imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward\n output_logit, output_iou = self.forward_net(imgs)\n loss = self.head.loss(output_logit, output_iou, labels, labels_iou)\n top1, top5 = self.head.metric(output_logit, labels)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['top1'] = top1\n loss_metrics['top5'] = top5\n return loss_metrics", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:62-91" + }, + "7423": { + "file_id": 543, + "content": "This code defines a 1D recognizer model framework, which includes a forward_net function to define how the model trains from input to output and a train_step function for the training process. 
It takes in data batches, including both RGB and audio data, and outputs loss metrics including loss, top1, and top5.", + "type": "comment" + }, + "7424": { + "file_id": 543, + "content": " def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n return self.train_step(data_batch)\n def infer_step(self, data_batch):\n \"\"\"Infering setp.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch\n imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward\n output_logit, output_iou = self.forward_net(imgs)\n return output_logit, output_iou", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:93-111" + }, + "7425": { + "file_id": 543, + "content": "The code contains three methods: `val_step`, `test_step`, and `infer_step`. These steps perform validating, testing, and inference, respectively. In all three cases, the data batch is passed to the `train_step` method, suggesting a shared implementation between these steps. The `infer_step` specifically expects certain types of data: RGB data with length and mask, as well as audio data with its respective length and mask, in a tuple format. It then processes this data using `forward_net`, returning output logits and IOU values.", + "type": "comment" + }, + "7426": { + "file_id": 544, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py", + "type": "filepath" + }, + "7427": { + "file_id": 544, + "content": "Recognizer2D is a 2D model in PaddleVideo for video analysis. It requires num_segs and includes functions for processing, training/validating, and testing the model. The Recognizer2D class defines forward_net and infer_step methods for classification scores.", + "type": "summary" + }, + "7428": { + "file_id": 544, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer2D(BaseRecognizer):\n \"\"\"2D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method.\n num_segs = imgs.shape[\n 1] # imgs.shape=[N,T,C,H,W], for most commonly case", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:1-27" + }, + "7429": { + "file_id": 544, + "content": "Recognizer2D is a 2D recognizer model framework in PaddleVideo, inheriting from BaseRecognizer. It requires the number of segments (num_segs) which can be obtained from the shape of input images. 
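The shape handling mentioned above, folding the segment axis of an [N, T, C, H, W] batch into the batch axis so a 2D backbone sees single frames and then regrouping the per-frame scores per clip, looks roughly like the NumPy sketch below. Averaging over segments is an assumption made here for illustration; in the real model the head decides how num_segs is aggregated.

    import numpy as np

    N, T, C, H, W = 2, 8, 3, 224, 224                       # hypothetical 8-segment clips
    clips = np.zeros((N, T, C, H, W), dtype=np.float32)
    frames = clips.reshape((-1,) + clips.shape[2:])         # (N*T, C, H, W) for the 2D backbone
    per_frame_scores = np.random.rand(N * T, 400)           # stand-in for backbone + head output
    clip_scores = per_frame_scores.reshape(N, T, -1).mean(axis=1)   # aggregate over segments
    print(frames.shape, clip_scores.shape)                  # (16, 3, 224, 224) (2, 400)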
The forward_net function performs image recognition using this model framework.", + "type": "comment" + }, + "7430": { + "file_id": 544, + "content": " imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))\n if self.backbone is not None:\n feature = self.backbone(imgs)\n else:\n feature = imgs\n if self.head is not None:\n cls_score = self.head(feature, num_segs)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n # NOTE: (s", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:28-60" + }, + "7431": { + "file_id": 544, + "content": "The code defines a recognizer2D model for video analysis. It consists of three main parts: the forward_net function that processes images, the train_step function for training the model using input data, and the val_step and test_step functions for validating and testing the trained model respectively. The forward_net function reshapes the images and passes them through a backbone network if one is defined, then to a head network if one is defined as well. It returns the classification scores. The train_step calculates the loss metrics using the provided labels, while the val_step does the same but in validation mode. The test_step computes the loss metrics without providing any labels.", + "type": "comment" + }, + "7432": { + "file_id": 544, + "content": "hipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:60-69" + }, + "7433": { + "file_id": 544, + "content": "The code defines a Recognizer2D class with two methods: forward_net and infer_step. The forward_net method takes in images (imgs) and returns the classification scores (cls_score). The infer_step method is used for testing and follows the same process as forward_net to return cls_score.", + "type": "comment" + }, + "7434": { + "file_id": 545, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py", + "type": "filepath" + }, + "7435": { + "file_id": 545, + "content": "Recognizer3D defines a model framework with forward_net, train/val_step for training and validation in recognition models. It handles image processing based on backbone and calculates loss metrics. The code defines test_step and infer_step methods for testing and inference.", + "type": "summary" + }, + "7436": { + "file_id": 545, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer3D(BaseRecognizer):\n \"\"\"3D Recognizer model framework.\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n feature = self.backbone(imgs)\n cls_score = self.head(feature)\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step.", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:1-33" + }, + "7437": { + "file_id": 545, + "content": "Recognizer3D is a 3D Recognizer model framework, which defines how the model runs from input to output. It includes forward_net method for model execution and train_step method for training.", + "type": "comment" + }, + "7438": { + "file_id": 545, + "content": " \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n labels = data_batch[1:]\n if imgs.dim() == 6:\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n else:\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n labels = data_batch[1:]\n if imgs.dim() == 6:\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n else:\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:34-64" + }, + "7439": { + "file_id": 545, + "content": "The code is defining two methods, `train_step` and `val_step`, which are used for training and validation steps respectively in a recognition model. If the backbone of the model is 'ResNet3dSlowOnly', it reshapes the images to have a specific dimension before processing. For other backbones, it separates the images and labels from the data batch accordingly. Both methods then forward the images through the `forward_net` and calculate loss metrics with or without validation mode depending on the step type. 
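The six-dimensional case handled above can be pictured as folding an extra clip dimension into the batch before the 3D backbone runs. The axis meanings in this sketch are assumptions for illustration only.

    import numpy as np

    x = np.zeros((4, 2, 17, 32, 56, 56), dtype=np.float32)   # e.g. (N, num_clips, C, T, H, W)
    if x.ndim == 6:
        x = x.reshape((-1,) + x.shape[2:])                    # fold clips into the batch axis
    print(x.shape)                                            # (8, 17, 32, 56, 56)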
The final output is the loss metrics.", + "type": "comment" + }, + "7440": { + "file_id": 545, + "content": " def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n if imgs.dim() == 6:\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n else:\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n # call forward\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n cls_score = self.forward_net(imgs)\n else:\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:66-93" + }, + "7441": { + "file_id": 545, + "content": "The code defines two methods: `test_step` and `infer_step`. In the `test_step`, if the backbone is a 'ResNet3dSlowOnly', it reshapes the input images, then calls the forward pass to get class scores. Otherwise, it takes the first two elements of the data batch for inference. The `infer_step` follows similar logic but without the condition on backbone type. Both methods return the class scores.", + "type": "comment" + }, + "7442": { + "file_id": 546, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py", + "type": "filepath" + }, + "7443": { + "file_id": 546, + "content": "The code defines a 3D Recognizer model and framework in PaddleVideo, with classes and methods for training, validation, and testing. It includes two methods, \"test_step\" and \"infer_step\", used for testing or inferring on limited data batches.", + "type": "summary" + }, + "7444": { + "file_id": 546, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nfrom paddlevideo.utils import get_logger\nimport paddle\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer3DMRI(BaseRecognizer):\n \"\"\"3D Recognizer model framework.\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n imgs[0] = paddle.cast(imgs[0], \"float32\")\n imgs[1] = paddle.cast(imgs[1], \"float32\")\n imgs[0] = imgs[0].unsqueeze(1)", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:1-31" + }, + "7445": { + "file_id": 546, + "content": "This code defines a 3D Recognizer model framework that takes in input images, casts them to float32 type and unsqueeze the first image for dimension alignment. 
The Recognizer3DMRI class inherits from BaseRecognizer and has a forward_net method for defining how the model should run from input to output.", + "type": "comment" + }, + "7446": { + "file_id": 546, + "content": " imgs[1] = imgs[1].unsqueeze(1)\n feature = self.backbone(imgs)\n cls_score = self.head(feature)\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score, labels, if_top5=False)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score,\n labels,\n valid_mode=True,\n if_top5=False)\n return loss_metrics\n def test_step(self, data_batch):", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:32-65" + }, + "7447": { + "file_id": 546, + "content": "This code defines a recognizer3dMRI model in the PaddleVideo framework. It has three methods: train_step, val_step, and test_step for training, validating, and testing the model, respectively. In each step, it processes image data batches, calls the forward function to generate class scores using a forward_net, applies sigmoid activation, and calculates losses using the head's loss function.", + "type": "comment" + }, + "7448": { + "file_id": 546, + "content": " \"\"\"Test step.\n \"\"\"\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:66-81" + }, + "7449": { + "file_id": 546, + "content": "This code defines two methods, \"test_step\" and \"infer_step\", which both take a data batch as input and return the class score after calling the forward function in the forward_net object. These steps seem to be used for testing or inferring on a limited subset of the data batch (the first two images).", + "type": "comment" + }, + "7450": { + "file_id": 547, + "content": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py", + "type": "filepath" + }, + "7451": { + "file_id": 547, + "content": "The code introduces a RecognizerDistillation class for recognizer distillation in PaddleVideo's framework, and includes model selection, modes like training and validation, loss functions, accuracy functions, and forward pass capabilities.", + "type": "summary" + }, + "7452": { + "file_id": 547, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nfrom ...registry import RECOGNIZERS\nfrom ... 
import builder\nfrom paddlevideo.utils import get_logger, get_dist_info\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerDistillation(nn.Layer):\n \"\"\"recognizer Distillation framework.\"\"\"\n def __init__(self,\n freeze_params_list=None,\n models=None,\n loss=None,\n **kargs):\n \"\"\"\n Args:\n freeze_params_list: list, set each model is trainable or not", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:1-34" + }, + "7453": { + "file_id": 547, + "content": "This code defines a RecognizerDistillation class that inherits from nn.Layer in PaddleVideo's framework. It implements recognizer distillation, which is a machine learning framework for object recognition tasks. The class takes optional arguments such as freeze_params_list (a list to set models trainable/not), models, and loss. It is registered under RECOGNIZERS and uses logger from paddlevideo's utils.", + "type": "comment" + }, + "7454": { + "file_id": 547, + "content": " models: config of distillaciton model.\n loss: config of loss list\n \"\"\"\n super().__init__()\n self.model_list = []\n self.model_name_list = []\n self.loss_cfgs = loss\n if freeze_params_list is None:\n freeze_params_list = [False] * len(models)\n assert len(freeze_params_list) == len(models)\n # build Teacher and Student model\n for idx, model_config in enumerate(models):\n assert len(model_config) == 1\n key = list(model_config.keys())[0] #Teacher or Student\n model_config = model_config[key]\n model_name = model_config['backbone']['name']\n backbone, head = None, None\n if model_config.get('backbone'):\n backbone = builder.build_backbone(model_config['backbone'])\n if hasattr(backbone, 'init_weights'):\n backbone.init_weights()\n if model_config.get('head'):\n head = builder.build_head(model_config['head'])", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:35-60" + }, + "7455": { + "file_id": 547, + "content": "This code initializes an instance of a distillation model. It takes in a list of models and loss configurations, as well as a freeze_params_list (optional). It checks the lengths of the input lists, builds teacher and student models, and initializes backbone and head if they exist in the configurations.", + "type": "comment" + }, + "7456": { + "file_id": 547, + "content": " if hasattr(head, 'init_weights'):\n head.init_weights()\n model = nn.Sequential(backbone, head)\n logger.info('build distillation {} model done'.format(key))\n # for add all parameters in nn.Layer class\n self.model_list.append(self.add_sublayer(key, model))\n self.model_name_list.append({model_name: key})\n # set model trainable or not\n if freeze_params_list[idx]:\n for param in model.parameters():\n param.trainable = False\n # build loss: support for loss list\n self.loss_func_list = []\n mode_keys = list(loss.keys())\n for mode in mode_keys:\n loss_cfgs = loss[mode]\n for loss_cfg in loss_cfgs:\n loss_func_dict = {}\n model_name_pairs = loss_cfg.pop('model_name_pairs')\n loss_func = builder.build_loss(loss_cfg)\n loss_func_dict['mode'] = mode\n loss_func_dict['loss_func'] = loss_func", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:61-85" + }, + "7457": { + "file_id": 547, + "content": "Builds a distillation model by appending a head to the backbone, initializes weights for the head if possible, and sets trainable parameters based on freeze_params_list. 
Constructs loss functions using builder.build_loss().", + "type": "comment" + }, + "7458": { + "file_id": 547, + "content": " loss_func_dict['model_name_pairs'] = model_name_pairs\n self.loss_func_list.append(loss_func_dict)\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n def get_loss(self, output, labels, mode):\n \"\"\"\n Args:\n output: dict, output name and its value\n labels: label of data\n mode: str, 'Train' or 'Val'\n \"\"\"\n output['GroundTruth'] = labels\n loss_list = []", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:86-114" + }, + "7459": { + "file_id": 547, + "content": "This code defines a class for handling different modes of operation (train, valid, test, infer) and includes methods to handle each mode. It also contains a method to calculate the loss based on output and labels in 'Train' or 'Val' mode. The code is likely used in a model framework and the class might be used to control the flow and operations of the model depending on the mode it runs in.", + "type": "comment" + }, + "7460": { + "file_id": 547, + "content": " for loss_func_dict in self.loss_func_list:\n if mode == loss_func_dict['mode']:\n model_name_pairs = loss_func_dict['model_name_pairs']\n loss_func = loss_func_dict['loss_func']\n loss_val = loss_func(output[model_name_pairs[0]],\n output[model_name_pairs[1]])\n loss_list.append(loss_val)\n total_loss = paddle.add_n(loss_list)\n return total_loss\n def get_acc(self, scores, labels, mode='Train'):\n def _get_acc(score, label, mode='Train'):\n top1 = paddle.metric.accuracy(input=score, label=label, k=1)\n top5 = paddle.metric.accuracy(input=score, label=label, k=5)\n _, world_size = get_dist_info()\n # Deal with multi cards validate\n if world_size > 1 and mode == 'Val': #reduce sum when valid\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / world_size\n top5 = paddle.distributed.all_reduce(", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:116-136" + }, + "7461": { + "file_id": 547, + "content": "This code is iterating over a list of loss function dictionaries to find the appropriate loss function based on the input mode. It then calculates the loss value and appends it to a list. Finally, it adds up all the loss values to get the total loss. In the `get_acc` method, it defines an inner function that calculates top-1 and top-5 accuracy scores using PaddlePaddle's `metric.accuracy` function. 
It also handles multi-card validation by reducing the sum of top-1 and top-5 accuracy scores across multiple cards.", + "type": "comment" + }, + "7462": { + "file_id": 547, + "content": " top5, op=paddle.distributed.ReduceOp.SUM) / world_size\n return top1, top5\n if len(labels) == 1:\n label = labels[0]\n return _get_acc(scores, label)\n # Deal with VideoMix\n elif len(labels) == 3:\n label_a, label_b, lam = labels\n top1a, top5a = _get_acc(scores, label_a, mode)\n top1b, top5b = _get_acc(scores, label_b, mode)\n top1 = lam * top1a + (1 - lam) * top1b\n top5 = lam * top5a + (1 - lam) * top5b\n return top1, top5\n def forward_model(self, imgs, model_name, model):\n if model_name in ['PPTSM_v2', 'ResNetTweaksTSM']:\n # [N,T,C,H,W] -> [N*T,C,H,W]\n imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:]))\n return model(imgs)\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n out = {}\n loss_metrics = {}\n imgs = data_batch[0]\n labels = data_batch[1:]", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:137-165" + }, + "7463": { + "file_id": 547, + "content": "The code snippet contains a recognizerDistillation function, which calculates accuracy based on given scores and labels. It also includes a forward_model function for reshaping images and applying model operations. The train_step function defines the training process from input to output, including loss metrics calculation.", + "type": "comment" + }, + "7464": { + "file_id": 547, + "content": " for idx, item in enumerate(self.model_name_list):\n model = self.model_list[idx]\n model_name = list(item.keys())[0]\n model_type = item[model_name] # Teacher or Student\n out[model_type] = self.forward_model(imgs, model_name, model)\n # out_student, out_teacher\n loss = self.get_loss(out, labels, 'Train')\n loss_metrics['loss'] = loss\n # calculate acc with student output\n top1, top5 = self.get_acc(out['Student'], labels)\n loss_metrics['top1'] = top1\n loss_metrics['top5'] = top5\n return loss_metrics\n def val_step(self, data_batch):\n out = {}\n loss_metrics = {}\n imgs = data_batch[0]\n labels = data_batch[1:]\n for idx, item in enumerate(self.model_name_list):\n model = self.model_list[idx]\n model_name = list(item.keys())[0]\n model_type = item[model_name] # Teacher or Student\n out[model_type] = self.forward_model(imgs, model_name, model)", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:167-193" + }, + "7465": { + "file_id": 547, + "content": "This code defines a class that implements a recognizer for distillation. The model takes in an image and a model name, and returns the output from both the student and teacher models. It calculates loss using the student and teacher outputs, as well as top-1 and top-5 accuracy metrics from the student's output only. This is used for both training (train_step) and validation (val_step). 
The class utilizes a list of model names and corresponding models for both types, student and teacher, and iterates over them to apply the forward pass and calculate loss and metrics.", + "type": "comment" + }, + "7466": { + "file_id": 547, + "content": " # Loss of student with gt: out_student, label\n loss = self.get_loss(out, labels, 'Val')\n loss_metrics['loss'] = loss\n top1, top5 = self.get_acc(out['Student'], labels, 'Val')\n loss_metrics['top1'] = top1\n loss_metrics['top5'] = top5\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n # Use Student to test\n for idx, item in enumerate(self.model_name_list):\n model = self.model_list[idx]\n model_name = list(item.keys())[0]\n model_type = item[model_name] # Teacher or Student\n if model_type == \"Student\":\n out = self.forward_model(imgs, model_name, model)\n return out\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n # Use Student to infer\n for idx, item in enumerate(self.model_name_list):", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:195-224" + }, + "7467": { + "file_id": 547, + "content": "In this code snippet, the get_loss and get_acc functions are used to calculate loss and accuracy metrics for a \"Student\" model. The test_step function tests the Student model using forward_model function, and the infer_step function is not implemented here. This code seems related to evaluating the performance of a student model in image recognition tasks.", + "type": "comment" + }, + "7468": { + "file_id": 547, + "content": " model = self.model_list[idx]\n model_name = list(item.keys())[0]\n model_type = item[model_name] # Teacher or Student\n if model_type == \"Student\":\n out = self.forward_model(imgs, model_name, model)\n return out", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:225-231" + }, + "7469": { + "file_id": 547, + "content": "The code selects a model from the model_list based on the idx, and assigns its name to model_name. If the model type is \"Student\", it calls forward_model function passing imgs, model_name, and model as parameters, and returns the output.", + "type": "comment" + }, + "7470": { + "file_id": 548, + "content": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py", + "type": "filepath" + }, + "7471": { + "file_id": 548, + "content": "The code creates a 2D image classifier model using PaddleVideo's RecognizerMRI, with train_step and val_step calculating loss metrics, and test_step for testing without calling head.loss during inference.", + "type": "summary" + }, + "7472": { + "file_id": 548, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerMRI(BaseRecognizer):\n \"\"\"2D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method.\n num_segs = imgs.shape[\n 1] # imgs.shape=[N,T,C,H,W], for most commonly case", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:1-27" + }, + "7473": { + "file_id": 548, + "content": "Code is from PaddleVideo's RecognizerMRI class, a 2D recognizer model framework. It has a forward_net method that takes imgs as input and returns the output of the network. The number of segments is obtained from the image shape and used to call the self.head method.", + "type": "comment" + }, + "7474": { + "file_id": 548, + "content": " imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))\n imgs = paddle.cast(imgs, \"float32\") #############\n imgs = imgs.unsqueeze(1)\n if self.backbone != None:\n feature = self.backbone(imgs)\n else:\n feature = imgs\n if self.head != None:\n cls_score = self.head(feature, num_segs)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score, labels, if_top5=False)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score,", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:28-59" + }, + "7475": { + "file_id": 548, + "content": "This code defines a model for image classification. It first reshapes and casts the input images to float32 type, then passes them through a backbone network if one is defined. After that, it sends the resulting feature map through a head network (if defined) to produce class scores. 
The train_step function uses these class scores to calculate loss metrics during training, while the val_step function performs similar operations but does not compute losses.", + "type": "comment" + }, + "7476": { + "file_id": 548, + "content": " labels,\n valid_mode=True,\n if_top5=False)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:60-76" + }, + "7477": { + "file_id": 548, + "content": "The code defines a test_step and infer_step function for a model, which takes in data_batch as input and returns the classification scores from the forward_net function. The test_step specifically mentions that during testing, the net won't call head.loss.", + "type": "comment" + }, + "7478": { + "file_id": 549, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py", + "type": "filepath" + }, + "7479": { + "file_id": 549, + "content": "The code introduces a GCN Recognizer model framework for PaddleVideo, classifying images through forward pass definition, training step loss calculation, and validation. A RecognizerGCN model is defined with test_step and infer_step functions.", + "type": "summary" + }, + "7480": { + "file_id": 549, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerGCN(BaseRecognizer):\n \"\"\"GCN Recognizer model framework.\n \"\"\"\n def __init__(self,\n backbone=None,\n head=None,\n runtime_cfg=None,\n if_top5=True):\n \"\"\"\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature.", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:1-33" + }, + "7481": { + "file_id": 549, + "content": "This code defines a GCN Recognizer model framework for PaddleVideo. It has an initialization method that takes arguments for backbone, head, runtime_cfg, and if_top5. 
The GCN Recognizer is registered with the RECOGNIZERS registry and extends BaseRecognizer class.", + "type": "comment" + }, + "7482": { + "file_id": 549, + "content": " is_top5 (bool): Whether to display top-5 accuracy during training/validation steps.\n \"\"\"\n super(RecognizerGCN, self).__init__(backbone, head, runtime_cfg)\n self.if_top5 = if_top5\n def forward_net(self, data):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n feature = self.backbone(data)\n cls_score = self.head(feature)\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n data = data_batch[0]\n label = data_batch[1:]\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss(cls_score, label, if_top5=self.if_top5)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n data = data_batch[0]\n label = data_batch[1:]\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss(cls_score,\n label,", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:34-66" + }, + "7483": { + "file_id": 549, + "content": "RecognizerGCN is a model that performs image classification. It has a backbone for feature extraction and a head for classification. Forward_net defines the forward pass. Train_step calculates loss and metrics during training, taking into account if_top5 flag. Val_step performs validation by forward pass and loss calculation.", + "type": "comment" + }, + "7484": { + "file_id": 549, + "content": " valid_mode=True,\n if_top5=self.if_top5)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n data = data_batch[0]\n # call forward\n cls_score = self.forward_net(data)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n data = data_batch[0]\n # call forward\n cls_score = self.forward_net(data)\n return cls_score", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:67-87" + }, + "7485": { + "file_id": 549, + "content": "The code defines a RecognizerGCN model and provides test_step and infer_step functions to classify data by forwarding it through the network and returning class scores.", + "type": "comment" + }, + "7486": { + "file_id": 550, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py", + "type": "filepath" + }, + "7487": { + "file_id": 550, + "content": "The MoViNetRecognizerFrame class, extending BaseRecognizer, has forward_net and train_step methods for training steps. Three functions - forward_net, test_step, and infer_step are defined for model's testing or inference process.", + "type": "summary" + }, + "7488": { + "file_id": 550, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom .base import BaseRecognizer\nfrom ...registry import RECOGNIZERS\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass MoViNetRecognizerFrame(BaseRecognizer):\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n self.backbone.clean_activation_buffers()\n outputs = self.backbone(imgs)\n cls_score = self.head(outputs)\n return cls_score\n def train_step(self, data_batch):", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:1-33" + }, + "7489": { + "file_id": 550, + "content": "The code is defining a class named \"MoViNetRecognizerFrame\" which extends the BaseRecognizer class. It has two methods, forward_net and train_step. The forward_net method defines how the model will run from input to output by first cleaning activation buffers in the backbone and then passing the inputs through it to get outputs. Finally, the head is applied on these outputs to get class scores. The train_step method defines a training step for this model.", + "type": "comment" + }, + "7490": { + "file_id": 550, + "content": " \"\"\"Training step.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1] #.astype(\"int64\")\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss_func(cls_score, labels)\n top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)\n output = {'loss': loss_metrics, 'top1': top1, 'top5': top5}\n return output\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1] #.astype(\"int64\")\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss_func(cls_score, labels)\n top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:34-57" + }, + "7491": { + "file_id": 550, + "content": "Training step: Implements a training step for the model, taking data_batch as input. Extracts images and labels, transposes data, applies forward pass in the network, calculates loss metrics, and computes top-1 and top-5 accuracy scores. Returns output with 'loss', 'top1', and 'top5' keys.\nValidating step: Implements a validating step for the model, similar to training step but used to validate the model on unseen data. 
Computes top-1 and top-5 accuracy scores along with loss metrics.", + "type": "comment" + }, + "7492": { + "file_id": 550, + "content": " output = {'loss': loss_metrics, 'top1': top1, 'top5': top5}\n return output\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n imgs = data_batch[0]\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n # call forward\n cls_score = self.forward_net(data)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n imgs = data_batch[0]\n # call forward\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n cls_score = self.forward_net(data)\n return cls_score", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:58-78" + }, + "7493": { + "file_id": 550, + "content": "This code defines three functions: `forward_net`, `test_step`, and `infer_step`. The `forward_net` function is the core of the model, responsible for forward propagation. The `test_step` and `infer_step` functions both take in a data batch, transpose the images, call the `forward_net` function to get class scores, and return these scores. These steps are likely part of a deep learning model's testing or inference process.", + "type": "comment" + }, + "7494": { + "file_id": 551, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py", + "type": "filepath" + }, + "7495": { + "file_id": 551, + "content": "The code defines a RecognizerTransformer class for implementing a transformer-based recognizer model, which includes feature extraction, training, validation, and testing steps. It also defines a model for inferring image results from multiple views using forward_net function and averaging based on 'avg_type'.", + "type": "summary" + }, + "7496": { + "file_id": 551, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddlevideo.utils import get_logger\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerTransformer(BaseRecognizer):\n \"\"\"Transformer's recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # imgs.shape=[N,C,T,H,W], for transformer case\n if self.backbone is not None:\n feature = self.backbone(imgs)\n else:\n feature = imgs", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:1-31" + }, + "7497": { + "file_id": 551, + "content": "This code defines a RecognizerTransformer class that inherits from BaseRecognizer and implements a transformer-based recognizer model framework. It takes in an input tensor imgs of shape [N,C,T,H,W] where N is the batch size, C is the number of channels, T is the temporal length, H is the height, and W is the width. If a backbone is specified, it applies the backbone to the images for feature extraction; otherwise, it uses the input images directly. 
The resulting feature tensor is returned.", + "type": "comment" + }, + "7498": { + "file_id": 551, + "content": " if self.head is not None:\n cls_score = self.head(feature)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) *", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:33-62" + }, + "7499": { + "file_id": 551, + "content": "The code defines a model's training, validation, and testing steps. The train_step calculates the loss between predicted class scores and actual labels. The val_step is similar but marks some samples as valid in validation mode. The test_step infers by processing views of images and stores class scores in a list.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/75.json b/docs/data/75.json new file mode 100644 index 000000000..c75692bfe --- /dev/null +++ b/docs/data/75.json @@ -0,0 +1,547 @@ +{ + "7500": { + "file_id": 551, + "content": " self.runtime_cfg.test.num_seg]\n cls_score.append(self.forward_net(view))\n cls_score = self._average_view(cls_score,\n self.runtime_cfg.test.avg_type)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) *\n self.runtime_cfg.test.num_seg]\n cls_score.append(self.forward_net(view))\n cls_score = self._average_view(cls_score,\n self.runtime_cfg.test.avg_type)\n return cls_score\n def _average_view(self, cls_score, avg_type='score'):\n \"\"\"Combine the predicted results of different views\n Args:\n cls_score (list): results of multiple views", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:63-86" + }, + "7501": { + "file_id": 551, + "content": "This code defines a model for inferring the results from multiple views of images. The `forward_net` function is used to process each view, and then the results are averaged using the `_average_view` function based on the specified average type. This allows the model to make predictions from different perspectives of an image and combine them for a more accurate result.", + "type": "comment" + }, + "7502": { + "file_id": 551, + "content": " avg_type (str, optional): Average calculation method. 
Defaults to 'score'.\n \"\"\"\n assert avg_type in ['score', 'prob'], \\\n f\"Currently only the average of 'score' or 'prob' is supported, but got {avg_type}\"\n if avg_type == 'score':\n return paddle.add_n(cls_score) / len(cls_score)\n elif avg_type == 'prob':\n return paddle.add_n(\n [F.softmax(score, axis=-1)\n for score in cls_score]) / len(cls_score)\n else:\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:87-98" + }, + "7503": { + "file_id": 551, + "content": "This code defines a class method with an optional 'avg_type' parameter for average calculation. It checks if the input is either 'score' or 'prob'. If 'score', it returns the sum of 'cls_score' divided by its length. If 'prob', it applies softmax to each element in 'cls_score', then averages their sum and length. Otherwise, it raises a NotImplementedError.", + "type": "comment" + }, + "7504": { + "file_id": 552, + "content": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py", + "type": "filepath" + }, + "7505": { + "file_id": 552, + "content": "The code imports libraries, defines the RecognizerTransformer_MRI model class with forward method and training/validation steps, using loss metrics. It includes two inference methods: 'test_step' and 'infer_step', which split input into multiple views for classification score generation. The average_view function combines these scores across views, using either 'score' or 'prob' averaging types.", + "type": "summary" + }, + "7506": { + "file_id": 552, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddlevideo.utils import get_logger\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerTransformer_MRI(BaseRecognizer):\n \"\"\"Transformer's recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # imgs.shape=[N,C,T,H,W], for transformer case\n imgs = paddle.cast(imgs, \"float32\") #############\n imgs = imgs.unsqueeze(1)\n if self.backbone != None:", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:1-32" + }, + "7507": { + "file_id": 552, + "content": "This code imports necessary libraries, defines a class for the RecognizerTransformer_MRI model, and sets the input image shape. 
The forward_net method preprocesses input images by casting them to float32 type and adding an extra dimension for compatibility with the transformer architecture.", + "type": "comment" + }, + "7508": { + "file_id": 552, + "content": " feature = self.backbone(imgs)\n else:\n feature = imgs\n if self.head != None:\n cls_score = self.head(feature)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score, labels, if_top5=False)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score,\n labels,\n valid_mode=True,\n if_top5=False)\n return loss_metrics", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:33-63" + }, + "7509": { + "file_id": 552, + "content": "This code defines a recognizer transformer model for image classification. The `forward_net` method processes images and returns class scores, while the `train_step` and `val_step` methods perform training and validation steps by passing data batches to the model and computing loss metrics using sigmoid activation and the head's loss function.", + "type": "comment" + }, + "7510": { + "file_id": 552, + "content": " def test_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.backbone.seg_num\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.backbone.seg_num:(i + 1) *\n self.backbone.seg_num]\n cls_score.append(self.forward_net(view))\n cls_score = self.average_view(cls_score)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.backbone.seg_num\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.backbone.seg_num:(i + 1) *\n self.backbone.seg_num]\n cls_score.append(self.forward_net(view))\n cls_score = self.average_view(cls_score)\n return cls_score\n def average_view(self, cls_score, average_type='score'):", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:65-89" + }, + "7511": { + "file_id": 552, + "content": "The code defines two methods, 'test_step' and 'infer_step', for the model to infer from input to output. It splits the input into multiple views based on the number of segments in each view. For each view, it applies the forward network to generate a set of classification scores. Finally, it averages the scores across all views using the average_view method.", + "type": "comment" + }, + "7512": { + "file_id": 552, + "content": " \"\"\"Combine the scores of different views\n Args:\n cls_score (list): Scores of multiple views\n average_type (str, optional): Average calculation method. 
Defaults to 'score'.\n \"\"\"\n assert average_type in ['score', 'prob'], \\\n f\"Currently only the average of 'score' or 'prob' is supported, but got {average_type}\"\n if average_type == 'score':\n return paddle.add_n(cls_score) / len(cls_score)\n elif average_type == 'avg':\n return paddle.add_n([F.softmax(score)\n for score in cls_score]) / len(cls_score)\n else:\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:90-104" + }, + "7513": { + "file_id": 552, + "content": "This function combines the scores of multiple views, taking two arguments: a list of cls_scores and an optional average_type. It asserts that average_type is either 'score' or 'prob'. If 'score', it adds all scores in the list and divides by the count. If 'avg', it first applies softmax to each score, then adds them and divides by the count. Otherwise, it raises a NotImplementedError.", + "type": "comment" + }, + "7514": { + "file_id": 553, + "content": "/paddlevideo/modeling/framework/segment/__init__.py", + "type": "filepath" + }, + "7515": { + "file_id": 553, + "content": "This code file contains the Python implementation of segment models in PaddleVideo, including BaseSegment and CFBI classes. It is licensed under the Apache License, Version 2.0. The __all__ variable lists the available segments: BaseSegment and CFBI.", + "type": "summary" + }, + "7516": { + "file_id": 553, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseSegment\nfrom .cfbi import CFBI\n__all__ = ['BaseSegment', 'CFBI']", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/__init__.py:1-16" + }, + "7517": { + "file_id": 553, + "content": "This code file contains the Python implementation of segment models in PaddleVideo, including BaseSegment and CFBI classes. It is licensed under the Apache License, Version 2.0. The __all__ variable lists the available segments: BaseSegment and CFBI.", + "type": "comment" + }, + "7518": { + "file_id": 554, + "content": "/paddlevideo/modeling/framework/segment/base.py", + "type": "filepath" + }, + "7519": { + "file_id": 554, + "content": "This code defines a semi-Video Object Segmentation abstract base class with train_step, valid_step, and test_step methods for different modes (train, valid, test, or infer). Subclasses must implement 4 methods for model training and evaluation.", + "type": "summary" + }, + "7520": { + "file_id": 554, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom ... 
import builder\nimport paddle.nn as nn\nclass BaseSegment(nn.Layer):\n \"\"\"Base class for semi-Video Object Segmentation.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Head to process feature.\n loss(dict): Loss function.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/base.py:1-29" + }, + "7521": { + "file_id": 554, + "content": "This code defines an abstract base class for semi-Video Object Segmentation. It has three required methods: train_step, valid_step, and test_step. The class also contains backbone, head, and loss modules to extract feature, process feature, and define the loss function respectively.", + "type": "comment" + }, + "7522": { + "file_id": 554, + "content": " \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n if loss is not None:\n self.loss = builder.build_loss(loss)\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/base.py:30-57" + }, + "7523": { + "file_id": 554, + "content": "This code initializes a segment model by building its backbone, head, and loss based on the provided parameters. The forward method defines how the model runs in different modes (train, valid, test, or infer). If running in train mode, the model performs training operations.", + "type": "comment" + }, + "7524": { + "file_id": 554, + "content": " return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/base.py:58-90" + }, + "7525": { + "file_id": 554, + "content": "The code defines an abstract class with four methods (train_step, val_step, test_step, and infer_step) that must be implemented by subclasses. The method name is chosen based on the mode input parameter for different phases of model training or evaluation. 
If an unsupported mode is passed, a NotImplementedError is raised.", + "type": "comment" + }, + "7526": { + "file_id": 555, + "content": "/paddlevideo/modeling/framework/segment/cfbi.py", + "type": "filepath" + }, + "7527": { + "file_id": 555, + "content": "This Python class initializes the CFBI model in PaddleVideo library for image segmentation and video processing using AI techniques, with instance-level attention via previous frame embeddings and labels.", + "type": "summary" + }, + "7528": { + "file_id": 555, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom .utils import foreground2background, global_matching_for_eval, local_matching, calculate_attention_head_for_eval\nfrom ...registry import SEGMENT\nfrom .base import BaseSegment\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@SEGMENT.register()\nclass CFBI(BaseSegment):\n \"\"\"CFBI model framework.\"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__(backbone, head, loss)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:1-30" + }, + "7529": { + "file_id": 555, + "content": "This code is a Python class for the CFBI model in the PaddleVideo library. It initializes the model and inherits from the BaseSegment class, allowing it to use other classes like backbone, head, and loss.", + "type": "comment" + }, + "7530": { + "file_id": 555, + "content": " x1 = paddle.zeros([3, 1, 1, 1])\n self.bg_bias = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.fg_bias = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.epsilon = 1e-05\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\n \"\"\"\n self.test_mode = True\n ref_embeddings, ref_masks, prev_embedding, prev_mask, current_frame, pred_size, gt_ids = data_batch\n current_frame_embedding_4x, current_frame_embedding_8x, current_frame_embedding_16x, \\\n current_low_level = self.backbone(current_frame)\n current_frame_embedding = [\n current_frame_embedding_4x, current_frame_embedding_8x,\n current_frame_embedding_16x\n ]\n if prev_embedding is None:\n return None, current_frame_embedding", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:31-56" + }, + "7531": { + "file_id": 555, + "content": "This code defines a class with a `test_step` method that performs testing on input data. It initializes some parameters and returns None if there is no previous embedding. 
The backbone function extracts multiple frame embeddings, which are stored in the `current_frame_embedding` list.", + "type": "comment" + }, + "7532": { + "file_id": 555, + "content": " else:\n bs, c, h, w = current_frame_embedding_4x.shape\n tmp_dic, _ = self.before_seghead_process(\n ref_embeddings,\n prev_embedding,\n current_frame_embedding,\n ref_masks,\n prev_mask,\n gt_ids,\n current_low_level=current_low_level,\n )\n all_pred = []\n for i in range(bs):\n pred = tmp_dic[i]\n pred = F.interpolate(pred,\n size=[pred_size[0], pred_size[1]],\n mode='bilinear',\n align_corners=True)\n all_pred.append(pred)\n all_pred = paddle.concat(all_pred, axis=0)\n all_pred = F.softmax(all_pred, axis=1)\n return all_pred, current_frame_embedding\n def before_seghead_process(self,\n ref_frame_embeddings=None,\n previous_frame_embeddings=None,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:57-84" + }, + "7533": { + "file_id": 555, + "content": "The code is in PaddleVideo framework, and it contains an else block that executes when a condition is not met. The function first defines the shape of current_frame_embedding_4x. It then processes reference embeddings, previous embeddings, and current frame embedding with other parameters such as masks and IDs. It interpolates predictions and concatenates them along the specified axis. Finally, it applies softmax to all_pred on the specified axis before returning both all_pred and current_frame_embedding.", + "type": "comment" + }, + "7534": { + "file_id": 555, + "content": " current_frame_embeddings=None,\n ref_frame_labels=None,\n previous_frame_mask=None,\n gt_ids=None,\n current_low_level=None):\n \"\"\" process befor segmentation head\"\"\"\n TEST_GLOBAL_MATCHING_CHUNK = [4, 1, 1]\n TEST_GLOBAL_ATROUS_RATE = [2, 1, 1]\n TRAIN_LOCAL_ATROUS_RATE = [2, 1, 1]\n TEST_LOCAL_ATROUS_RATE = [2, 1, 1]\n MODEL_FLOAT16_MATCHING = False\n TEST_GLOBAL_MATCHING_MIN_PIXEL = 100\n MODEL_MULTI_LOCAL_DISTANCE = [[4, 8, 12, 16, 20, 24],\n [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]]\n TRAIN_LOCAL_PARALLEL = True\n TEST_LOCAL_PARALLEL = True\n MODEL_MATCHING_BACKGROUND = True\n MODEL_SEMANTIC_MATCHING_DIM = [32, 64, 128]\n dic_tmp = []\n boards = {}\n scale_ref_frame_labels = []\n scale_previous_frame_labels = []\n for current_frame_embedding in current_frame_embeddings:", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:85-108" + }, + "7535": { + "file_id": 555, + "content": "The code initializes various constants and variables for the segmentation head process. 
It includes settings for matching, atroous rates, and parallel processing, as well as defining arrays for scale reference frame labels and previous frame labels.", + "type": "comment" + }, + "7536": { + "file_id": 555, + "content": " bs, c, h, w = current_frame_embedding.shape\n if not self.test_mode:\n raise NotImplementedError\n else:\n ref_frame_embeddings = list(zip(*ref_frame_embeddings))\n all_scale_ref_frame_label = []\n for ref_frame_label in ref_frame_labels:\n scale_ref_frame_label = paddle.cast(F.interpolate(\n paddle.cast(ref_frame_label, dtype=\"float32\"),\n size=(h, w),\n mode='nearest'),\n dtype=\"int32\")\n all_scale_ref_frame_label.append(scale_ref_frame_label)\n scale_ref_frame_labels.append(all_scale_ref_frame_label)\n scale_previous_frame_label = paddle.cast(F.interpolate(\n paddle.cast(previous_frame_mask, dtype=\"float32\"),\n size=(h, w),\n mode='nearest'),\n dtype=\"int32\")", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:109-127" + }, + "7537": { + "file_id": 555, + "content": "Resizing ref_frame_label and previous_frame_mask to match current frame size for nearest mode interpolation in PaddleVideo model.", + "type": "comment" + }, + "7538": { + "file_id": 555, + "content": " scale_previous_frame_labels.append(scale_previous_frame_label)\n for n in range(bs):\n ref_obj_ids = paddle.reshape(\n paddle.cast(paddle.arange(0,\n np.array(gt_ids)[n] + 1),\n dtype=\"int32\"), [-1, 1, 1, 1])\n obj_num = ref_obj_ids.shape[0]\n low_level_feat = paddle.unsqueeze(current_low_level[n], axis=0)\n all_CE_input = []\n all_attention_head = []\n for scale_idx, current_frame_embedding, ref_frame_embedding, previous_frame_embedding, \\\n scale_ref_frame_label, scale_previous_frame_label in zip(range(3), \\\n current_frame_embeddings, ref_frame_embeddings, previous_frame_embeddings, \\\n scale_ref_frame_labels, scale_previous_frame_labels):\n #Prepare\n seq_current_frame_embedding = current_frame_embedding[n]\n seq_prev_frame_embedding = previous_frame_embedding[n]", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:128-144" + }, + "7539": { + "file_id": 555, + "content": "The code is iterating over the input data and for each frame, it prepares the current_frame_embedding and previous_frame_embedding by reshaping, unsqueezing, and extracting the specific frames from their respective arrays. It then adds these embeddings to separate lists for later use in calculating attention scores and computing cross-entropy loss.", + "type": "comment" + }, + "7540": { + "file_id": 555, + "content": " seq_previous_frame_label = paddle.cast(\n (paddle.cast(scale_previous_frame_label[n], dtype=\"int32\")\n == ref_obj_ids),\n dtype=\"float32\")\n if np.array(gt_ids)[n] > 0:\n dis_bias = paddle.concat([\n paddle.unsqueeze(self.bg_bias[scale_idx], axis=0),\n paddle.expand(\n paddle.unsqueeze(self.fg_bias[scale_idx], axis=0),\n [np.array(gt_ids)[n], -1, -1, -1])\n ],\n axis=0)\n else:\n dis_bias = paddle.unsqueeze(self.bg_bias[scale_idx], axis=0)\n #Global FG map\n matching_dim = MODEL_SEMANTIC_MATCHING_DIM[scale_idx]\n seq_current_frame_embedding_for_matching = paddle.transpose(\n seq_current_frame_embedding[:matching_dim], [1, 2, 0])\n if not self.test_mode:", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:145-164" + }, + "7541": { + "file_id": 555, + "content": "This code calculates the distance bias for each frame in a sequence and prepares it for matching. 
It checks if the object ID is greater than 0, then assigns the corresponding background or foreground distance bias. It also transposes the current frame embedding for matching in case it's not in test mode.", + "type": "comment" + }, + "7542": { + "file_id": 555, + "content": " raise NotImplementedError\n else:\n all_scale_ref_frame_label = scale_ref_frame_label\n all_ref_frame_embedding = ref_frame_embedding\n all_reference_embeddings = []\n all_reference_labels = []\n seq_ref_frame_labels = []\n count = 0\n for idx in range(len(all_scale_ref_frame_label)):\n ref_frame_embedding = all_ref_frame_embedding[idx]\n scale_ref_frame_label = all_scale_ref_frame_label[idx]\n seq_ref_frame_embedding = ref_frame_embedding[n]\n seq_ref_frame_embedding = paddle.transpose(\n seq_ref_frame_embedding, [1, 2, 0])\n seq_ref_frame_label = paddle.cast(\n (paddle.cast(scale_ref_frame_label[n],\n dtype=\"int32\") == ref_obj_ids),\n dtype=\"float32\")", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:165-184" + }, + "7543": { + "file_id": 555, + "content": "The code raises a NotImplementedError if the condition is met and otherwise creates variables for storing reference frame embeddings, labels, and sequence-specific values. It then iterates through the provided labels and embeddings to prepare them for use in the model's segmentation process.", + "type": "comment" + }, + "7544": { + "file_id": 555, + "content": " seq_ref_frame_labels.append(seq_ref_frame_label)\n seq_ref_frame_label = paddle.transpose(\n paddle.squeeze(seq_ref_frame_label, axis=1),\n [1, 2, 0])\n all_reference_embeddings.append(\n seq_ref_frame_embedding[:, :, :matching_dim])\n all_reference_labels.append(seq_ref_frame_label)\n global_matching_fg = global_matching_for_eval(\n all_reference_embeddings=all_reference_embeddings,\n query_embeddings=\n seq_current_frame_embedding_for_matching,\n all_reference_labels=all_reference_labels,\n n_chunks=TEST_GLOBAL_MATCHING_CHUNK[scale_idx],\n dis_bias=dis_bias,\n atrous_rate=TEST_GLOBAL_ATROUS_RATE[scale_idx],\n use_float16=MODEL_FLOAT16_MATCHING,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:185-200" + }, + "7545": { + "file_id": 555, + "content": "This code appears to be part of a computer vision model. It is appending reference frame labels, transposing them, and adding the reference embeddings to a list. Then it calls a function called \"global_matching_fg\" with the reference embeddings, query embeddings, reference labels, number of chunks, distance bias, and atrous rate as arguments. 
The function is likely used for global matching evaluation in the context of this model.", + "type": "comment" + }, + "7546": { + "file_id": 555, + "content": " atrous_obj_pixel_num=TEST_GLOBAL_MATCHING_MIN_PIXEL)\n # Local FG map\n seq_prev_frame_embedding_for_matching = paddle.transpose(\n seq_prev_frame_embedding[:matching_dim], [1, 2, 0])\n seq_previous_frame_label_for_matching = paddle.transpose(\n paddle.squeeze(seq_previous_frame_label, axis=1), [1, 2, 0])\n local_matching_fg = local_matching(\n prev_frame_embedding=seq_prev_frame_embedding_for_matching,\n query_embedding=seq_current_frame_embedding_for_matching,\n prev_frame_labels=seq_previous_frame_label_for_matching,\n multi_local_distance=MODEL_MULTI_LOCAL_DISTANCE[scale_idx],\n dis_bias=dis_bias,\n atrous_rate=TRAIN_LOCAL_ATROUS_RATE[scale_idx] if\n not self.test_mode else TEST_LOCAL_ATROUS_RATE[scale_idx],\n use_float16=MODEL_FLOAT16_MATCHING,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:201-216" + }, + "7547": { + "file_id": 555, + "content": "This code block prepares input for a local matching function to compare previous and current frames. It transposes the embeddings and labels, sets atrous rate based on test mode, and uses float16 if needed.", + "type": "comment" + }, + "7548": { + "file_id": 555, + "content": " allow_downsample=False,\n allow_parallel=TRAIN_LOCAL_PARALLEL\n if not self.test_mode else TEST_LOCAL_PARALLEL)\n #Aggregate Pixel-level Matching\n to_cat_global_matching_fg = paddle.transpose(\n paddle.squeeze(global_matching_fg, axis=0), [2, 3, 0, 1])\n to_cat_local_matching_fg = paddle.transpose(\n paddle.squeeze(local_matching_fg, axis=0), [2, 3, 0, 1])\n all_to_cat = [\n to_cat_global_matching_fg, to_cat_local_matching_fg,\n seq_previous_frame_label\n ]\n #Global and Local BG map\n if MODEL_MATCHING_BACKGROUND:\n to_cat_global_matching_bg = foreground2background(\n to_cat_global_matching_fg,\n np.array(gt_ids)[n] + 1)\n reshaped_prev_nn_feature_n = paddle.unsqueeze(\n paddle.transpose(to_cat_local_matching_fg,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:217-237" + }, + "7549": { + "file_id": 555, + "content": "This code performs pixel-level matching and global/local background subtraction for image segmentation. It transposes and squeezes the global and local matching results, concatenates them with previous frame labels, and if using background modeling, computes global and local background maps.", + "type": "comment" + }, + "7550": { + "file_id": 555, + "content": " [0, 2, 3, 1]),\n axis=1)\n to_cat_local_matching_bg = foreground2background(\n reshaped_prev_nn_feature_n,\n np.array(gt_ids)[n] + 1)\n to_cat_local_matching_bg = paddle.squeeze(paddle.transpose(\n to_cat_local_matching_bg, [0, 4, 2, 3, 1]),\n axis=-1)\n all_to_cat += [\n to_cat_local_matching_bg, to_cat_global_matching_bg\n ]\n to_cat_current_frame_embedding = paddle.expand(\n paddle.unsqueeze(current_frame_embedding[n], axis=0),\n [obj_num, -1, -1, -1])\n to_cat_prev_frame_embedding = paddle.expand(\n paddle.unsqueeze(previous_frame_embedding[n], axis=0),\n [obj_num, -1, -1, -1])\n to_cat_prev_frame_embedding_fg = to_cat_prev_frame_embedding * seq_previous_frame_label", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:238-256" + }, + "7551": { + "file_id": 555, + "content": "This code segment appears to be part of a computer vision model that handles segmentation for video frames. 
It derives the local and global background matching maps from the corresponding foreground maps when background matching is enabled, expands the current and previous frame embeddings to one copy per object instance, and multiplies the previous frame embedding by the previous frame label to obtain its foreground component.", + "type": "comment" + }, + "7552": { + "file_id": 555, + "content": "                to_cat_prev_frame_embedding_bg = to_cat_prev_frame_embedding * (\n                    1 - seq_previous_frame_label)\n                all_to_cat += [\n                    to_cat_current_frame_embedding,\n                    to_cat_prev_frame_embedding_fg,\n                    to_cat_prev_frame_embedding_bg\n                ]\n                CE_input = paddle.concat(all_to_cat, axis=1)\n                #Instance-level Attention\n                if not self.test_mode:\n                    raise NotImplementedError\n                else:\n                    attention_head = calculate_attention_head_for_eval(\n                        all_ref_frame_embedding,\n                        seq_ref_frame_labels,\n                        paddle.expand(\n                            paddle.unsqueeze(previous_frame_embedding[n],\n                                             axis=0), [obj_num, -1, -1, -1]),\n                        seq_previous_frame_label,\n                        epsilon=self.epsilon)\n                all_CE_input.append(CE_input)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:257-279" + }, + "7553": { + "file_id": 555, + "content": "This code builds the pixel-level input for the segmentation head and the instance-level attention. It concatenates the current frame embedding with the foreground and background parts of the previous frame embedding, and in test mode computes the attention head with calculate_attention_head_for_eval; the training branch is not implemented and raises a NotImplementedError.", + "type": "comment" + }, + "7554": { + "file_id": 555, + "content": "                all_attention_head.append(attention_head)\n            #Collaborative Ensembler\n            pred = self.head(all_CE_input, all_attention_head, low_level_feat)\n            dic_tmp.append(pred)\n        return dic_tmp, boards", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/cfbi.py:280-286" + }, + "7555": { + "file_id": 555, + "content": "This code snippet is part of a machine learning model. It appends the \"attention_head\" to the list \"all_attention_head\", then passes the combined inputs along with \"low_level_feat\" to a \"head\" function, and appends its output to \"dic_tmp\". Finally, it returns both \"dic_tmp\" and \"boards\".", + "type": "comment" + }, + "7556": { + "file_id": 556, + "content": "/paddlevideo/modeling/framework/segment/utils.py", + "type": "filepath" + }, + "7557": { + "file_id": 556, + "content": "This file implements the matching utilities used by the CFBI segment framework in PaddleVideo: foreground-to-background conversion, global and local nearest-neighbor matching with atrous subsampling, padding and pairwise distance computation, feature selection and masking, and optional parallel local matching in PaddlePaddle.", + "type": "summary" + }, + "7558": { + "file_id": 556, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\ndef foreground2background(dis, obj_num):\n if obj_num == 1:\n return dis\n bg_dis = []\n for i in range(obj_num):\n obj_back = []\n for j in range(obj_num):\n if i == j:\n continue\n obj_back.append(paddle.unsqueeze(dis[j], axis=0))\n obj_back = paddle.concat(x=obj_back, axis=1)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:1-31" + }, + "7559": { + "file_id": 556, + "content": "This code defines a function \"foreground2background\" that takes distance (dis) and object number (obj_num) as inputs. It returns the background distances for each foreground object when obj_num is greater than 1 by concatenating the unsqueezed distance of other objects along axis 1.", + "type": "comment" + }, + "7560": { + "file_id": 556, + "content": " obj_back = paddle.min(x=obj_back, axis=1, keepdim=True)\n bg_dis.append(obj_back)\n bg_dis = paddle.concat(x=bg_dis, axis=0)\n return bg_dis\nWRONG_LABEL_PADDING_DISTANCE = 5e4\n#GLOBAL_DIST_MAP\ndef _pairwise_distances(x, x2, y, y2):\n \"\"\"\n Computes pairwise squared l2 distances between tensors x and y.\n Args:\n x: [n, feature_dim].\n y: [m, feature_dim].\n Returns:\n d: [n, m].\n \"\"\"\n xs = x2\n ys = y2\n xs = paddle.unsqueeze(xs, axis=1)\n ys = paddle.unsqueeze(ys, axis=0)\n d = xs + ys - 2. * paddle.matmul(x, y, transpose_y=True)\n return d\ndef _flattened_pairwise_distances(reference_embeddings, ref_square,\n query_embeddings, query_square):\n \"\"\"\n Calculates flattened tensor of pairwise distances between ref and query.\n Args:\n reference_embeddings: [..., embedding_dim],\n the embedding vectors for the reference frame\n query_embeddings: [..., embedding_dim],\n the embedding vectors for the query frames.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:32-68" + }, + "7561": { + "file_id": 556, + "content": "This function calculates the pairwise squared L2 distances between tensors x and y, returns them in a matrix d. The function takes x and y as input, which are [n, feature_dim] and [m, feature_dim] respectively. 
It then performs matrix calculations to compute the pairwise distances and returns d, which is of size [n, m].", + "type": "comment" + }, + "7562": { + "file_id": 556, + "content": " Returns:\n dists: [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim]\n \"\"\"\n dists = _pairwise_distances(query_embeddings, query_square,\n reference_embeddings, ref_square)\n return dists\ndef _nn_features_per_object_for_chunk(reference_embeddings, ref_square,\n query_embeddings, query_square,\n wrong_label_mask):\n \"\"\"Extracts features for each object using nearest neighbor attention.\n Args:\n reference_embeddings: [n_chunk, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: [m_chunk, embedding_dim],\n the embedding vectors for the query frames.\n wrong_label_mask: [n_objects, n_chunk],\n the mask for pixels not used for matching.\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m_chunk, n_objects, n_chunk].\n \"\"\"\n if reference_embeddings.dtype == \"float16\":", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:69-92" + }, + "7563": { + "file_id": 556, + "content": "This code computes pairwise distances between query and reference embeddings, then extracts features for each object using nearest neighbor attention. It takes embedding vectors for the reference frame and query frames as input, along with a mask for pixels not used for matching. The output is a tensor of nearest neighbor features shape [m_chunk, n_objects, n_chunk]. The code also checks the dtype of reference_embeddings to handle float16 data.", + "type": "comment" + }, + "7564": { + "file_id": 556, + "content": " wrong_label_mask = paddle.cast(wrong_label_mask, dtype=\"float16\")\n else:\n wrong_label_mask = paddle.cast(wrong_label_mask, dtype=\"float32\")\n reference_embeddings_key = reference_embeddings\n query_embeddings_key = query_embeddings\n dists = _flattened_pairwise_distances(reference_embeddings_key, ref_square,\n query_embeddings_key, query_square)\n dists = (paddle.unsqueeze(dists, axis=1) +\n paddle.unsqueeze(wrong_label_mask, axis=0) *\n WRONG_LABEL_PADDING_DISTANCE)\n features = paddle.min(dists, axis=2, keepdim=True)\n return features\ndef _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat,\n query_embeddings_flat,\n reference_labels_flat,\n n_chunks):\n \"\"\"Calculates the nearest neighbor features per object in chunks to save mem.\n Uses chunking to bound the memory use.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:93-113" + }, + "7565": { + "file_id": 556, + "content": "This function calculates the nearest neighbor features per object in chunks to save memory. It takes reference embeddings, query embeddings, and reference labels as inputs. The function first casts the wrong_label_mask based on its type (float16 or float32). Then it calculates pairwise distances between reference and query embeddings. Distances for incorrect matches are set to a specific padding distance using wrong_label_mask. 
Finally, it returns the features by taking the minimum value across chunks in each dimension.", + "type": "comment" + }, + "7566": { + "file_id": 556, + "content": " Args:\n reference_embeddings_flat: [n, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings_flat: [m, embedding_dim],\n the embedding vectors for the query frames.\n reference_labels_flat: [n, n_objects],\n the class labels of the reference frame.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: [m, n_objects, n].\n \"\"\"\n feature_dim, embedding_dim = query_embeddings_flat.shape\n chunk_size = int(np.ceil(float(feature_dim) / n_chunks))\n wrong_label_mask = reference_labels_flat < 0.1\n wrong_label_mask = paddle.transpose(x=wrong_label_mask, perm=[1, 0])\n ref_square = paddle.sum(paddle.pow(reference_embeddings_flat, 2), axis=1)\n query_square = paddle.sum(paddle.pow(query_embeddings_flat, 2), axis=1)\n all_features = []\n for n in range(n_chunks):\n if n_chunks == 1:\n query_embeddings_flat_chunk = query_embeddings_flat", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:114-138" + }, + "7567": { + "file_id": 556, + "content": "This function computes the features for a set of query frames against a reference frame. It takes in embedding vectors for reference and query frames, as well as their respective class labels. The function uses chunking to handle large feature dimensions, with the number of chunks adjustable by the user. It returns a tensor of shape [m, n_objects, n] which represents the features for each query frame.", + "type": "comment" + }, + "7568": { + "file_id": 556, + "content": " query_square_chunk = query_square\n chunk_start = 0\n else:\n chunk_start = n * chunk_size\n chunk_end = (n + 1) * chunk_size\n query_square_chunk = query_square[chunk_start:chunk_end]\n if query_square_chunk.shape[0] == 0:\n continue\n query_embeddings_flat_chunk = query_embeddings_flat[\n chunk_start:chunk_end]\n features = _nn_features_per_object_for_chunk(\n reference_embeddings_flat, ref_square, query_embeddings_flat_chunk,\n query_square_chunk, wrong_label_mask)\n all_features.append(features)\n if n_chunks == 1:\n nn_features = all_features[0]\n else:\n nn_features = paddle.concat(all_features, axis=0)\n return nn_features\ndef global_matching(reference_embeddings,\n query_embeddings,\n reference_labels,\n n_chunks=100,\n dis_bias=0.,\n ori_size=None,\n atrous_rate=1,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:139-167" + }, + "7569": { + "file_id": 556, + "content": "This function is performing global matching on query embeddings and reference embeddings. It breaks down the embeddings into chunks, calculates features for each chunk, and concatenates these features to get the final nn_features. The number of chunks is determined by n_chunks, which default to 100. 
If n_chunks = 1, it returns the features from the only chunk.", + "type": "comment" + }, + "7570": { + "file_id": 556, + "content": " use_float16=True,\n atrous_obj_pixel_num=0):\n \"\"\"\n Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n reference_embeddings: [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: [height, width,\n embedding_dim], the embedding vectors for the query frames.\n reference_labels: [height, width, obj_nums],\n the class labels of the reference frame.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n dis_bias: [n_objects], foreground and background bias\n ori_size: (ori_height, ori_width),\n the original spatial size. If \"None\", (ori_height, ori_width) = (height, width).\n atrous_rate: Integer, the atrous rate of reference_embeddings.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:168-186" + }, + "7571": { + "file_id": 556, + "content": "This code calculates the distance to the nearest neighbor per object for query_embeddings and reference_embeddings, given class labels and other parameters. It uses chunks to save memory and takes into account the atrous rate of reference_embeddings.", + "type": "comment" + }, + "7572": { + "file_id": 556, + "content": " use_float16: Bool, if \"True\", use float16 type for matching.\n Returns:\n nn_features: [1, ori_height, ori_width, n_objects, feature_dim].\n \"\"\"\n assert (reference_embeddings.shape[:2] == reference_labels.shape[:2])\n if use_float16:\n query_embeddings = paddle.cast(query_embeddings, dtype=\"float16\")\n reference_embeddings = paddle.cast(reference_embeddings,\n dtype=\"float16\")\n h, w, embedding_dim = query_embeddings.shape\n obj_nums = reference_labels.shape[2]\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n selected_points = paddle.zeros([h + h_pad, w + w_pad])\n selected_points = selected_points.view(\n (h + h_pad) // atrous_rate, atrous_rate, (w + w_pad) // atrous_rate,\n atrous_rate)\n selected_points[:, 0, :, 0] = 1.\n selected_points = paddle.reshape(selected_points,\n [h + h_pad, w + w_pad, 1])[:h, :w]", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:187-209" + }, + "7573": { + "file_id": 556, + "content": "This code snippet calculates and pads the selected points for spatial pyramid pooling in a segmentation model. It checks if float16 is used, then prepares padding based on the atrous rate. 
The resulting tensor of selected points is reshaped to match the input shape before returning it.", + "type": "comment" + }, + "7574": { + "file_id": 556, + "content": " is_big_obj = (paddle.sum(\n reference_labels,\n axis=(0, 1))) > (atrous_obj_pixel_num * atrous_rate**2)\n reference_labels[:, :,\n is_big_obj] = reference_labels[:, :,\n is_big_obj] * selected_points\n reference_embeddings_flat = paddle.reshape(reference_embeddings,\n [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels, [-1, obj_nums])\n query_embeddings_flat = paddle.reshape(query_embeddings,\n [-1, embedding_dim])\n all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9\n reference_labels_flat = paddle.reshape(\n paddle.masked_select(reference_labels_flat,\n paddle.expand(all_ref_fg, [-1, obj_nums])),\n [-1, obj_nums])\n if reference_labels_flat.shape[0] == 0:\n return paddle.ones([1, h, w, obj_nums, 1])\n reference_embeddings_flat = paddle.reshape(", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:210-230" + }, + "7575": { + "file_id": 556, + "content": "The code is implementing a segmentation method in the PaddleVideo library. It first determines if an object is big or small based on the sum of reference labels. Then, it reshapes the reference embeddings and labels for further processing. It checks if any reference labels are present and returns default values if none are found.", + "type": "comment" + }, + "7576": { + "file_id": 556, + "content": " paddle.masked_select(reference_embeddings_flat,\n paddle.expand(all_ref_fg, [-1, embedding_dim])),\n [-1, embedding_dim])\n nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n n_chunks)\n nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1])\n nn_features_reshape = (\n F.sigmoid(nn_features_reshape +\n paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2\n #TODO: ori_size is not None\n if use_float16:\n nn_features_reshape = paddle.cast(nn_features_reshape, dtype=\"float32\")\n return nn_features_reshape\ndef global_matching_for_eval(all_reference_embeddings,\n query_embeddings,\n all_reference_labels,\n n_chunks=20,\n dis_bias=0.,\n ori_size=None,\n atrous_rate=1,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:231-257" + }, + "7577": { + "file_id": 556, + "content": "This function performs nearest neighbor feature extraction for video segment matching using reference and query embeddings, reference labels, and other parameters such as number of chunks, displacement bias, original size, and atrous rate. 
It returns the normalized nearest neighbor features in a reshaped format.", + "type": "comment" + }, + "7578": { + "file_id": 556, + "content": " use_float16=True,\n atrous_obj_pixel_num=0):\n \"\"\"\n Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n all_reference_embeddings: A list of reference_embeddings,\n each with size [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n all_reference_labels: A list of reference_labels,\n each with size [height, width, obj_nums],\n the class labels of the reference frame.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n dis_bias: [n_objects], foreground and background bias\n ori_size: (ori_height, ori_width),\n the original spatial size. If \"None\", (ori_height, ori_width) = (height, width).", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:258-277" + }, + "7579": { + "file_id": 556, + "content": "This code calculates the distance to the nearest neighbor per object for query embeddings in a list of reference embeddings, considering potentially subsampled frames. It takes query_embeddings of size [n_query_images, height, width, embedding_dim], all_reference_embeddings and all_reference_labels lists with size [height, width, obj_nums] each, n_chunks, dis_bias, and ori_size as input arguments.", + "type": "comment" + }, + "7580": { + "file_id": 556, + "content": " atrous_rate: Integer, the atrous rate of reference_embeddings.\n use_float16: Bool, if \"True\", use float16 type for matching.\n Returns:\n nn_features: [n_query_images, ori_height, ori_width, n_objects, feature_dim].\n \"\"\"\n h, w, embedding_dim = query_embeddings.shape\n obj_nums = all_reference_labels[0].shape[2]\n all_reference_embeddings_flat = []\n all_reference_labels_flat = []\n ref_num = len(all_reference_labels)\n n_chunks *= ref_num\n if atrous_obj_pixel_num > 0:\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n selected_points = paddle.zeros([h + h_pad, w + w_pad])\n selected_points = paddle.reshape(\n selected_points, [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate])\n selected_points[:, 0, :, 0] = 1.\n selected_points = paddle.reshape(selected_points,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:278-299" + }, + "7581": { + "file_id": 556, + "content": "This function is responsible for creating a tensor of reference embeddings and labels for a given set of query embeddings, based on the provided atrous rate. The function first calculates the shape of the input tensors, then initializes empty lists for flat versions of reference embeddings and labels. It then determines the padding needed to match the atrous rate, creates a selection matrix with ones at the selected points, and reshapes it according to the atrous rate. 
Finally, it prepares the tensor for matching by flattening the reference embeddings and labels lists.", + "type": "comment" + }, + "7582": { + "file_id": 556, + "content": " [h + h_pad, w + w_pad, 1])[:h, :w]\n for reference_embeddings, reference_labels, idx in zip(\n all_reference_embeddings, all_reference_labels, range(ref_num)):\n if atrous_rate > 1:\n is_big_obj = paddle.sum(\n reference_labels,\n axis=(0, 1)) > (atrous_obj_pixel_num * atrous_rate**2)\n is_big_obj = list(np.array(is_big_obj))\n for j in range(len(is_big_obj)):\n if is_big_obj[j] == True:\n reference_labels[:, :, j:j +\n 1] = reference_labels[:, :, j:j +\n 1] * selected_points\n reference_embeddings_flat = paddle.reshape(reference_embeddings,\n [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels,\n [-1, obj_nums])", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:300-318" + }, + "7583": { + "file_id": 556, + "content": "This code segment appears to be a part of image segmentation or object detection algorithm. It processes reference embeddings and labels, potentially for each detected object in the image. The atrous rate determines if an object is big or small, with larger objects being processed separately by multiplying selected points to corresponding regions in reference_labels. The embeddings are flattened into 1D arrays, as well as reference_labels.", + "type": "comment" + }, + "7584": { + "file_id": 556, + "content": " all_reference_embeddings_flat.append(reference_embeddings_flat)\n all_reference_labels_flat.append(reference_labels_flat)\n reference_embeddings_flat = paddle.concat(\n x=all_reference_embeddings_flat, axis=0)\n reference_labels_flat = paddle.concat(x=all_reference_labels_flat,\n axis=0)\n else:\n if ref_num == 1:\n reference_embeddings, reference_labels = all_reference_embeddings[\n 0], all_reference_labels[0]\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n if h_pad > 0 or w_pad > 0:\n reference_embeddings = F.pad(reference_embeddings,\n [0, h_pad, 0, w_pad, 0, 0])\n reference_labels = F.pad(reference_labels,\n [0, h_pad, 0, w_pad, 0, 0])", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:320-338" + }, + "7585": { + "file_id": 556, + "content": "The code concatenates reference embeddings and labels, then pads them if necessary based on the atrous rate. 
If there is only one reference, it directly selects the first item from all_reference_embeddings and all_reference_labels lists.", + "type": "comment" + }, + "7586": { + "file_id": 556, + "content": " reference_embeddings = paddle.reshape(\n reference_embeddings,\n [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, 32])\n reference_labels = paddle.reshape(\n reference_labels,\n [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, -1])\n reference_embeddings = paddle.reshape(\n reference_embeddings[:, 0, :, 0, :],\n reference_embeddings[:, 0, :, 0, :].shape)\n reference_labels = paddle.reshape(\n reference_labels[:, 0, :, 0, :],\n reference_labels[:, 0, :, 0, :].shape)\n reference_embeddings_flat = paddle.reshape(reference_embeddings,\n [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels,\n [-1, obj_nums])", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:339-356" + }, + "7587": { + "file_id": 556, + "content": "This code reshapes the reference embeddings and labels to match a specific pattern, then flattens the reference embeddings while preserving their data type and shape.", + "type": "comment" + }, + "7588": { + "file_id": 556, + "content": " else:\n for reference_embeddings, reference_labels, idx in zip(\n all_reference_embeddings, all_reference_labels,\n range(ref_num)):\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n if h_pad > 0 or w_pad > 0:\n reference_embeddings = F.pad(reference_embeddings,\n [0, h_pad, 0, w_pad, 0, 0])\n reference_labels = F.pad(reference_labels,\n [0, h_pad, 0, w_pad, 0, 0])\n reference_embeddings = paddle.reshape(\n reference_embeddings,\n [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, -1])\n reference_labels = paddle.reshape(\n reference_labels,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:357-375" + }, + "7589": { + "file_id": 556, + "content": "This code segment handles the case where atrous_rate is greater than 1. It pads reference embeddings and labels if needed, then reshapes them to have a shape compatible with Atrous Spatial Pyramid Pooling (ASPP) in deep learning models for image classification or detection tasks.", + "type": "comment" + }, + "7590": { + "file_id": 556, + "content": " [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, -1])\n reference_embeddings = paddle.reshape(\n reference_embeddings[:, 0, :, 0, :],\n reference_embeddings[:, 0, :, 0, :].shape)\n reference_labels = paddle.reshape(\n reference_labels[:, 0, :, 0, :],\n reference_labels[:, 0, :, 0, :].shape)\n reference_embeddings_flat = paddle.reshape(\n reference_embeddings, [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels,\n [-1, obj_nums])\n all_reference_embeddings_flat.append(reference_embeddings_flat)\n all_reference_labels_flat.append(reference_labels_flat)\n reference_embeddings_flat = paddle.concat(\n all_reference_embeddings_flat, axis=0)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:376-394" + }, + "7591": { + "file_id": 556, + "content": "This code reshapes the reference embeddings and labels into a flattened format, appends them to lists, and then concatenates all the flattened reference embeddings along axis 0. 
This is likely for use in a deep learning model that requires the data in this specific format for training or prediction.", + "type": "comment" + }, + "7592": { + "file_id": 556, + "content": " reference_labels_flat = paddle.concat(all_reference_labels_flat,\n axis=0)\n query_embeddings_flat = paddle.reshape(query_embeddings,\n [-1, embedding_dim])\n all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9\n reference_labels_flat = paddle.reshape(\n paddle.masked_select(reference_labels_flat,\n paddle.expand(all_ref_fg, [-1, obj_nums])),\n [-1, obj_nums])\n if reference_labels_flat.shape[0] == 0:\n return paddle.ones([1, h, w, obj_nums, 1])\n reference_embeddings_flat = paddle.reshape(\n paddle.masked_select(reference_embeddings_flat,\n paddle.expand(all_ref_fg, [-1, embedding_dim])),\n [-1, embedding_dim])\n if use_float16:\n query_embeddings_flat = paddle.cast(query_embeddings_flat,\n dtype=\"float16\")\n reference_embeddings_flat = paddle.cast(reference_embeddings_flat,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:395-415" + }, + "7593": { + "file_id": 556, + "content": "This code segment performs feature selection and reshaping of query and reference embeddings for the segment matching process. It concatenates all reference labels, flattens the query embeddings, masks the selected reference labels and embeddings based on a threshold, and finally reshapes them before returning a tensor of ones if no references are found or casting the embeddings to float16 datatype if specified.", + "type": "comment" + }, + "7594": { + "file_id": 556, + "content": " dtype=\"float16\")\n nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n n_chunks)\n nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1])\n nn_features_reshape = (\n F.sigmoid(nn_features_reshape +\n paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2\n # TODO: ori_size is not None\n if use_float16:\n nn_features_reshape = paddle.cast(nn_features_reshape, dtype=\"float32\")\n return nn_features_reshape\n#LOCAL_DIST_MAP\ndef local_pairwise_distances(x,\n y,\n max_distance=9,\n atrous_rate=1,\n allow_downsample=False):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Use for-loop for saving memory.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:416-442" + }, + "7595": { + "file_id": 556, + "content": "This code calculates pairwise squared L2 distances using a local search window, and then computes the nearest neighbor features for each object in image chunks. 
The result is reshaped into an appropriate format and can be used for further processing or analysis.", + "type": "comment" + }, + "7596": { + "file_id": 556, + "content": " y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n atrous_rate: Integer, the atrous rate of local matching.\n allow_downsample: Bool, if \"True\", downsample x and y\n with a stride of 2.\n Returns:\n Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n if allow_downsample:\n ori_height = x.shape[0]\n ori_width = x.shape[1]\n x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0)\n y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0)\n down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1)\n x = F.interpolate(x,\n size=down_size,\n mode='bilinear',\n align_corners=True)\n y = F.interpolate(y,\n size=down_size,\n mode='bilinear',", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:443-464" + }, + "7597": { + "file_id": 556, + "content": "This function takes in a tensor 'x' and 'y', along with parameters such as max_distance, atrous_rate, and allow_downsample. It returns a distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. If downsampling is allowed, the original height and width are saved and the tensors 'x' and 'y' are reshaped. Then, using bilinear interpolation, 'x' and 'y' are downsampled to half their size while preserving values at borders.", + "type": "comment" + }, + "7598": { + "file_id": 556, + "content": " align_corners=True)\n x = paddle.unsqueeze(paddle.transpose(x, [1, 2, 0]), axis=0)\n y = paddle.unsqueeze(paddle.transpose(y, [1, 2, 0]), axis=0)\n pad_max_distance = max_distance - max_distance % atrous_rate\n # no change pad\n padded_y = F.pad(y, (0, 0, pad_max_distance, pad_max_distance,\n pad_max_distance, pad_max_distance),\n value=WRONG_LABEL_PADDING_DISTANCE)\n height, width, _ = x.shape\n dists = []\n for y in range(2 * pad_max_distance // atrous_rate + 1):\n y_start = y * atrous_rate\n y_end = y_start + height\n y_slice = padded_y[y_start:y_end]\n for x in range(2 * max_distance + 1):\n x_start = x * atrous_rate\n x_end = x_start + width\n offset_y = y_slice[:, x_start:x_end]\n dist = paddle.sum(paddle.pow((x - offset_y), 2), axis=2)\n dists.append(dist)\n dists = paddle.stack(dists, axis=2)\n return dists\ndef local_pairwise_distances_parallel(x,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:465-492" + }, + "7599": { + "file_id": 556, + "content": "This code computes local pairwise distances between the input tensors x and y, accounting for atrous dilation. It first pads y with wrong label padding distance to match the size of x. Then it loops through the range of possible offsets for each pixel and calculates the sum of squared differences between the current pixel and all potential offsets in y. 
These distances are then stacked along the channel axis before being returned.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/76.json b/docs/data/76.json new file mode 100644 index 000000000..333fe9ce9 --- /dev/null +++ b/docs/data/76.json @@ -0,0 +1,545 @@ +{ + "7600": { + "file_id": 556, + "content": " y,\n max_distance=9,\n atrous_rate=1,\n allow_downsample=True):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n atrous_rate: Integer, the atrous rate of local matching.\n allow_downsample: Bool, if \"True\", downsample x and y\n with a stride of 2.\n Returns:\n Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n ori_height, ori_width, _ = x.shape\n x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0)\n y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0)\n if allow_downsample:", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:493-513" + }, + "7601": { + "file_id": 556, + "content": "This function computes pairwise squared L2 distances using a local search window. It takes two tensors x and y of shape [height, width, feature_dim] as input. The maximum distance (max\\_distance) in pixel coordinates per dimension is considered in the search window. Atrous rate determines the local matching rate. If downsampling is allowed, the function downsamples the tensors with a stride of 2. It returns a float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].", + "type": "comment" + }, + "7602": { + "file_id": 556, + "content": " down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1)\n x = F.interpolate(x,\n size=down_size,\n mode='bilinear',\n align_corners=True)\n y = F.interpolate(y,\n size=down_size,\n mode='bilinear',\n align_corners=True)\n _, channels, height, width = x.shape\n x2 = paddle.reshape(paddle.sum(paddle.pow(x, 2), axis=1),\n [height, width, 1])\n y2 = paddle.reshape(paddle.sum(paddle.pow(y, 2), axis=1),\n [1, 1, height, width])\n pad_max_distance = max_distance - max_distance % atrous_rate\n # no change pad\n padded_y = F.pad(y, (pad_max_distance, pad_max_distance, pad_max_distance,\n pad_max_distance))\n padded_y2 = F.pad(y2, (pad_max_distance, pad_max_distance, pad_max_distance,\n pad_max_distance),\n value=WRONG_LABEL_PADDING_DISTANCE)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:514-537" + }, + "7603": { + "file_id": 556, + "content": "The code resizes the input tensors x and y to a downsized version of half the original size using bilinear interpolation. 
It then computes the squared norms of x and y, reshapes them, and pads y and its squared values (the latter filled with WRONG_LABEL_PADDING_DISTANCE) so that the local search window matches the atrous rate, storing the results in padded_y and padded_y2.", + "type": "comment" + }, + "7604": { + "file_id": 556, + "content": "    offset_y = paddle.transpose(\n        paddle.reshape(\n            F.unfold(x=padded_y,\n                     kernel_sizes=[height, width],\n                     strides=[atrous_rate, atrous_rate]),\n            [channels, height * width, -1]), [1, 0, 2])\n    offset_y2 = paddle.reshape(\n        F.unfold(padded_y2,\n                 kernel_sizes=[height, width],\n                 strides=[atrous_rate, atrous_rate]), [height, width, -1])\n    x = paddle.transpose(paddle.reshape(x, [channels, height * width, -1]),\n                         [1, 2, 0])\n    dists = x2 + offset_y2 - 2. * paddle.reshape(paddle.matmul(x, offset_y),\n                                                 [height, width, -1])\n    return dists\ndef local_matching(prev_frame_embedding,\n                   query_embedding,\n                   prev_frame_labels,\n                   dis_bias=0.,\n                   multi_local_distance=[15],\n                   ori_size=None,\n                   atrous_rate=1,\n                   use_float16=True,\n                   allow_downsample=True,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:539-566" + }, + "7605": { + "file_id": 556, + "content": "This code snippet finishes the parallel local distance computation and starts the definition of local_matching. It uses Paddle's unfold operation to extract the local search windows from the padded previous-frame tensors, computes the squared L2 distances through a matmul-based expansion, and returns them. The local_matching signature takes the previous and query frame embeddings, the previous frame labels, and optional parameters such as a distance bias, a list of maximum local distances, the original size, the atrous rate, and flags for float16, downsampling, and parallel matching.", + "type": "comment" + }, + "7606": { + "file_id": 556, + "content": "                   allow_parallel=True):\n    \"\"\"Computes nearest neighbor features while only allowing local matches.\n    Args:\n        prev_frame_embedding: [height, width, embedding_dim],\n            the embedding vectors for the last frame.\n        query_embedding: [height, width, embedding_dim],\n            the embedding vectors for the query frames.\n        prev_frame_labels: [height, width, n_objects],\n            the class labels of the previous frame.\n        multi_local_distance: A list of Integer,\n            a list of maximum distance allowed for local matching.\n        ori_size: (ori_height, ori_width),\n            the original spatial size. If \"None\", (ori_height, ori_width) = (height, width).\n        atrous_rate: Integer, the atrous rate of local matching.\n        use_float16: Bool, if \"True\", use float16 type for matching.\n        allow_downsample: Bool, if \"True\", downsample prev_frame_embedding and query_embedding\n            with a stride of 2.\n        allow_parallel: Bool, if \"True\", do matching in a parallel way. If \"False\", do matching in", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:567-584" + }, + "7607": { + "file_id": 556, + "content": "This code computes nearest neighbor features for local matching in video segmentation. It takes embedding vectors, class labels, and a list of maximum distances as input. 
The function allows downsampling and parallel processing.", + "type": "comment" + }, + "7608": { + "file_id": 556, + "content": " a for-loop way, which will save GPU memory.\n Returns:\n nn_features: A float32 np.array of nearest neighbor features of shape\n [1, height, width, n_objects, 1].\n \"\"\"\n max_distance = multi_local_distance[-1]\n if ori_size is None:\n height, width = prev_frame_embedding.shape[:2]\n ori_size = (height, width)\n obj_num = prev_frame_labels.shape[2]\n pad = paddle.ones([1]) * WRONG_LABEL_PADDING_DISTANCE\n if use_float16:\n query_embedding = paddle.cast(query_embedding, dtype=\"float16\")\n prev_frame_embedding = paddle.cast(prev_frame_embedding,\n dtype=\"float16\")\n pad = paddle.cast(pad, dtype=\"float16\")\n if allow_parallel:\n d = local_pairwise_distances_parallel(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance,\n atrous_rate=atrous_rate,\n allow_downsample=allow_downsample)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:585-609" + }, + "7609": { + "file_id": 556, + "content": "This function calculates nearest neighbor features by using local pairwise distances in a parallel manner, with options to cast data types and allow downsampling. It takes query and previous frame embeddings as input, and returns nearest neighbor features of shape [1, height, width, n_objects, 1].", + "type": "comment" + }, + "7610": { + "file_id": 556, + "content": " else:\n d = local_pairwise_distances(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance,\n atrous_rate=atrous_rate,\n allow_downsample=allow_downsample)\n height, width = d.shape[:2]\n labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), 1)\n labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]),\n axis=1)\n if (height, width) != ori_size:\n labels = F.interpolate(labels, size=(height, width), mode='nearest')\n pad_max_distance = max_distance - max_distance % atrous_rate\n atrous_max_distance = pad_max_distance // atrous_rate\n #no change pad\n padded_labels = F.pad(labels, (\n pad_max_distance,\n pad_max_distance,\n pad_max_distance,\n pad_max_distance,\n ),\n mode='constant',\n value=0)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:610-635" + }, + "7611": { + "file_id": 556, + "content": "This code calculates pairwise distances between query and previous frame embeddings. If the shape of the distances doesn't match the original size, it interpolates labels using nearest neighbor mode. 
The code then pads the labels with zeros to match the maximum distance considering the atrous rate.", + "type": "comment" + }, + "7612": { + "file_id": 556, + "content": " offset_masks = paddle.transpose(\n paddle.reshape(\n F.unfold(padded_labels,\n kernel_sizes=[height, width],\n strides=[atrous_rate, atrous_rate]),\n [obj_num, height, width, -1]), [1, 2, 3, 0]) > 0.9\n d_tiled = paddle.expand(paddle.unsqueeze(\n d, axis=-1), [-1, -1, -1, obj_num]) # h, w, num_local_pos, obj_num\n d_masked = paddle.where(offset_masks, d_tiled, pad)\n dists = paddle.min(d_masked, axis=2)\n multi_dists = [\n paddle.unsqueeze(paddle.transpose(dists, [2, 0, 1]), axis=1)\n ] # n_objects, num_multi_local, h, w\n reshaped_d_masked = paddle.reshape(d_masked, [\n height, width, 2 * atrous_max_distance + 1, 2 * atrous_max_distance + 1,\n obj_num\n ])\n for local_dis in multi_local_distance[:-1]:\n local_dis = local_dis // atrous_rate\n start_idx = atrous_max_distance - local_dis\n end_idx = atrous_max_distance + local_dis + 1\n new_d_masked = paddle.reshape(\n reshaped_d_masked[:, :, start_idx:end_idx, start_idx:end_idx, :],", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:637-662" + }, + "7613": { + "file_id": 556, + "content": "This code segment applies atrous spatial pyramid pooling in a PaddlePaddle implementation. It creates offset masks, performs element-wise masking, computes minimum distances, and reshapes the data for each local distance level to perform feature extraction at different scales.", + "type": "comment" + }, + "7614": { + "file_id": 556, + "content": " reshaped_d_masked[:, :, start_idx:end_idx,\n start_idx:end_idx, :].shape)\n new_d_masked = paddle.reshape(new_d_masked,\n [height, width, -1, obj_num])\n new_dists = paddle.min(new_d_masked, axis=2)\n new_dists = paddle.unsqueeze(paddle.transpose(new_dists, [2, 0, 1]),\n axis=1)\n multi_dists.append(new_dists)\n multi_dists = paddle.concat(multi_dists, axis=1)\n multi_dists = (F.sigmoid(multi_dists +\n paddle.reshape(dis_bias, [-1, 1, 1, 1])) - 0.5) * 2\n if use_float16:\n multi_dists = paddle.cast(multi_dists, dtype=\"float32\")\n if (height, width) != ori_size:\n multi_dists = F.interpolate(multi_dists,\n size=ori_size,\n mode='bilinear',\n align_corners=True)\n multi_dists = paddle.transpose(multi_dists, perm=[2, 3, 0, 1])", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:663-684" + }, + "7615": { + "file_id": 556, + "content": "This code performs image segmentation by reshaping and resizing the distance matrix, calculating minimum distances, and applying sigmoid activation. It also handles cases where input size is not the original size.", + "type": "comment" + }, + "7616": { + "file_id": 556, + "content": " multi_dists = paddle.reshape(multi_dists,\n [1, ori_size[0], ori_size[1], obj_num, -1])\n return multi_dists\ndef calculate_attention_head(ref_embedding,\n ref_label,\n prev_embedding,\n prev_label,\n epsilon=1e-5):\n ref_head = ref_embedding * ref_label\n ref_head_pos = paddle.sum(ref_head, axis=(2, 3))\n ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos\n ref_pos_num = paddle.sum(ref_label, axis=(2, 3))\n ref_neg_num = paddle.sum(1. 
- ref_label, axis=(2, 3))\n ref_head_pos = ref_head_pos / (ref_pos_num + epsilon)\n ref_head_neg = ref_head_neg / (ref_neg_num + epsilon)\n prev_head = prev_embedding * prev_label\n prev_head_pos = paddle.sum(prev_head, axis=(2, 3))\n prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos\n prev_pos_num = paddle.sum(prev_label, axis=(2, 3))\n prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3))", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:685-709" + }, + "7617": { + "file_id": 556, + "content": "This function calculates the attention heads for each object in a given scene. It takes reference and previous embeddings and labels as input, along with an optional epsilon value. It then computes the positional and negative head values, divides them by their respective counts (positive and negative labels), and returns the resulting attention heads for each object. The epsilon is added to avoid division by zero in the calculations.", + "type": "comment" + }, + "7618": { + "file_id": 556, + "content": " prev_head_pos = prev_head_pos / (prev_pos_num + epsilon)\n prev_head_neg = prev_head_neg / (prev_neg_num + epsilon)\n total_head = paddle.concat(\n x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1)\n return total_head\ndef calculate_attention_head_for_eval(ref_embeddings,\n ref_labels,\n prev_embedding,\n prev_label,\n epsilon=1e-5):\n total_ref_head_pos = 0.\n total_ref_head_neg = 0.\n total_ref_pos_num = 0.\n total_ref_neg_num = 0.\n for idx in range(len(ref_embeddings)):\n ref_embedding = ref_embeddings[idx]\n ref_label = ref_labels[idx]\n ref_head = ref_embedding * ref_label\n ref_head_pos = paddle.sum(ref_head, axis=(2, 3))\n ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos\n ref_pos_num = paddle.sum(ref_label, axis=(2, 3))\n ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3))", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:710-736" + }, + "7619": { + "file_id": 556, + "content": "This code calculates the attention head values for evaluation, where it sums up reference embeddings multiplied by their corresponding labels. It also accounts for positive and negative instances of reference embeddings by subtracting them from total sums. The final total_head is returned as a concatenated matrix.", + "type": "comment" + }, + "7620": { + "file_id": 556, + "content": " total_ref_head_pos = total_ref_head_pos + ref_head_pos\n total_ref_head_neg = total_ref_head_neg + ref_head_neg\n total_ref_pos_num = total_ref_pos_num + ref_pos_num\n total_ref_neg_num = total_ref_neg_num + ref_neg_num\n ref_head_pos = total_ref_head_pos / (total_ref_pos_num + epsilon)\n ref_head_neg = total_ref_head_neg / (total_ref_neg_num + epsilon)\n prev_head = prev_embedding * prev_label\n prev_head_pos = paddle.sum(prev_head, axis=(2, 3))\n prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos\n prev_pos_num = paddle.sum(prev_label, axis=(2, 3))\n prev_neg_num = paddle.sum(1. 
- prev_label, axis=(2, 3))\n prev_head_pos = prev_head_pos / (prev_pos_num + epsilon)\n prev_head_neg = prev_head_neg / (prev_neg_num + epsilon)\n total_head = paddle.concat(\n x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1)\n return total_head", + "type": "code", + "location": "/paddlevideo/modeling/framework/segment/utils.py:737-754" + }, + "7621": { + "file_id": 556, + "content": "This code calculates and returns a total head value by accumulating reference (ref) head values and previous (prev) head values, then normalizing them. It handles potential zero-division cases with a small epsilon for stability. The resulting total head consists of reference positive (pos), reference negative (neg), previous positive (pos), and previous negative (neg) head components concatenated along axis 1.", + "type": "comment" + }, + "7622": { + "file_id": 557, + "content": "/paddlevideo/modeling/framework/segmenters/__init__.py", + "type": "filepath" + }, + "7623": { + "file_id": 557, + "content": "This code is part of the PaddleVideo library and defines a segmenter module. It includes three classes: BaseSegmenter, MSTCN, and ASRF. These classes are used for video frame-level feature extraction, semantic segmentation, and audio source separation respectively. The __all__ variable lists all exported names in this package.", + "type": "summary" + }, + "7624": { + "file_id": 557, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseSegmenter\nfrom .ms_tcn import MSTCN\nfrom .asrf import ASRF\n__all__ = ['BaseSegmenter', 'MSTCN', 'ASRF']", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/__init__.py:1-17" + }, + "7625": { + "file_id": 557, + "content": "This code is part of the PaddleVideo library and defines a segmenter module. It includes three classes: BaseSegmenter, MSTCN, and ASRF. These classes are used for video frame-level feature extraction, semantic segmentation, and audio source separation respectively. The __all__ variable lists all exported names in this package.", + "type": "comment" + }, + "7626": { + "file_id": 558, + "content": "/paddlevideo/modeling/framework/segmenters/asrf.py", + "type": "filepath" + }, + "7627": { + "file_id": 558, + "content": "The PaddleVideo framework's ASRF segmentation model uses a backbone for feature extraction and head network for classification. It performs forward passes, post-processing, inference, validates using loss and F1@0.50 score, and extracts class outputs for results.", + "type": "summary" + }, + "7628": { + "file_id": 558, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import SEGMENTERS\nfrom .base import BaseSegmenter\nimport paddle\nimport paddle.nn.functional as F\nfrom .utils import ASRFPostProcessing\n@SEGMENTERS.register()\nclass ASRF(BaseSegmenter):\n \"\"\"ASRF model framework.\"\"\"\n def __init__(self,\n postprocessing_method,\n boundary_threshold,\n backbone=None,\n head=None,\n loss=None):\n super().__init__(backbone=backbone, head=head, loss=loss)\n self.postprocessing_method = postprocessing_method", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/asrf.py:1-33" + }, + "7629": { + "file_id": 558, + "content": "Class ASRF is a segmenter model in PaddleVideo framework. It takes arguments like postprocessing_method, boundary_threshold, backbone, head, and loss for initialization.", + "type": "comment" + }, + "7630": { + "file_id": 558, + "content": " self.boundary_threshold = boundary_threshold\n def forward_net(self, video_feature):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n if self.backbone is not None:\n feature = self.backbone(video_feature)\n else:\n feature = video_feature\n if self.head is not None:\n network_outputs = self.head(feature)\n else:\n network_outputs = None\n return network_outputs\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n feature, label, boundary = data_batch\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1].numpy()\n outputs_boundary_np = outputs_boundary[-1].numpy()\n # caculate loss\n if self.loss is not None:\n output_loss = self.loss(feature, outputs_cls, label,\n outputs_boundary, boundary)\n else:\n output_loss = None", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/asrf.py:34-67" + }, + "7631": { + "file_id": 558, + "content": "The code defines a model for segmentation, which has a forward function and train step. It uses a backbone for feature extraction and a head network for classification. 
The train_step calculates loss using the defined loss function if it's not None.", + "type": "comment" + }, + "7632": { + "file_id": 558, + "content": " # predict post process\n predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,\n self.postprocessing_method)\n predicted = paddle.squeeze(predicted)\n loss_metrics = dict()\n loss_metrics['loss'] = output_loss\n loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, label)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n feature, label, boundary = data_batch\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1].numpy()\n outputs_boundary_np = outputs_boundary[-1].numpy()\n ## caculate loss\n if self.loss is not None:\n output_loss = self.loss(feature, outputs_cls, label,\n outputs_boundary, boundary)\n else:\n output_loss = None\n # predict post process\n predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/asrf.py:69-100" + }, + "7633": { + "file_id": 558, + "content": "The code snippet represents the ASRF model's validation step. It predicts the outputs for the given inputs, calculates loss if applicable, and performs post-processing using ASRFPostProcessing function. The function then returns a dictionary of metrics including the 'loss' value and the 'F1@0.50' score.", + "type": "comment" + }, + "7634": { + "file_id": 558, + "content": " self.postprocessing_method)\n predicted = paddle.squeeze(predicted)\n outputs_dict = dict()\n outputs_dict['loss'] = output_loss\n outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, label)\n return outputs_dict\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n feature, _, _ = data_batch\n outputs_dict = dict()\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1].numpy()\n outputs_boundary_np = outputs_boundary[-1].numpy()\n # predict post process\n predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,\n self.postprocessing_method)\n outputs_dict['predict'] = paddle.to_tensor(predicted[0, :])\n outputs_dict['output_np'] = F.sigmoid(outputs_cls[-1])\n return outputs_dict\n def infer_step(self, data_batch):\n \"\"\"Infering setp.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/asrf.py:101-129" + }, + "7635": { + "file_id": 558, + "content": "This code is for a model that performs temporal action segmentation using the ASRF (Action Segment Refinement Framework) method. It consists of functions for forward pass, post processing, and inference steps. The forward_net function takes input features and returns predicted classes and boundaries. The test_step performs testing by calling the forward_net function and applying post-processing to the results. The infer_step performs inference on data_batch using ASRFPostProcessing. 
It outputs the predicted segmentation, sigmoid-transformed output, and returns them in a dictionary for further processing or evaluation.", + "type": "comment" + }, + "7636": { + "file_id": 558, + "content": " \"\"\"\n feature = data_batch[0]\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1]\n outputs_boundary_np = outputs_boundary[-1]\n outputs = [\n outputs_cls_np, outputs_boundary_np,\n F.sigmoid(outputs_cls[-1])\n ]\n return outputs", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/asrf.py:130-143" + }, + "7637": { + "file_id": 558, + "content": "This code segment performs the forward pass on a feature, then extracts last outputs for class and boundary, applies sigmoid to the last output of class, and returns all in a list as results.", + "type": "comment" + }, + "7638": { + "file_id": 559, + "content": "/paddlevideo/modeling/framework/segmenters/base.py", + "type": "filepath" + }, + "7639": { + "file_id": 559, + "content": "The BaseSegmenter class serves as a foundation for PaddleVideo segmenters, handling training, validation, testing, and inference with a mode parameter. Subclasses must implement train_step, valid_step, test_step, and feature extraction modules.", + "type": "summary" + }, + "7640": { + "file_id": 559, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseSegmenter(nn.Layer):\n \"\"\"Base class for segementers.\n All segementers should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/base.py:1-30" + }, + "7641": { + "file_id": 559, + "content": "The code is defining a BaseSegmenter class, which serves as the base class for all segmenters. It requires subclasses to override train_step, valid_step, and test_step methods. 
The class also accepts backbone and head modules to extract features and process them respectively.", + "type": "comment" + }, + "7642": { + "file_id": 559, + "content": " \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n # build backbone\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n # build head\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n # build loss\n if loss is not None:\n self.loss_name = loss.name\n self.loss = builder.build_loss(loss)\n if hasattr(self.loss, 'init_weights'):\n self.loss.init_weights()\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/base.py:32-63" + }, + "7643": { + "file_id": 559, + "content": "This code defines a segmenter base class for PaddleVideo. It initializes the backbone, head, and loss layers based on user input. The `forward` method specifies how the model processes data in either infer or train mode. Initializing weights is optional but can be called if the layer supports it.", + "type": "comment" + }, + "7644": { + "file_id": 559, + "content": " 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/base.py:64-99" + }, + "7645": { + "file_id": 559, + "content": "This code defines a base class for segmenters that supports training, validation, testing, and inference steps. The `mode` parameter determines which step to execute, and abstract methods must be implemented by subclasses for each step.", + "type": "comment" + }, + "7646": { + "file_id": 559, + "content": " raise NotImplementedError", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/base.py:100-100" + }, + "7647": { + "file_id": 559, + "content": "This code block raises a NotImplementedError, indicating that the current implementation of the function or method is not complete and requires further development.", + "type": "comment" + }, + "7648": { + "file_id": 560, + "content": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py", + "type": "filepath" + }, + "7649": { + "file_id": 560, + "content": "The MSTCN model is a video segmentation tool that extends BaseSegmenter class, includes an optional backbone and head, and defines training/validation steps with loss calculation. 
The code includes three functions: forward_net for training, test_step for testing, and infer_step for inference.", + "type": "summary" + }, + "7650": { + "file_id": 560, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import SEGMENTERS\nfrom .base import BaseSegmenter\nimport paddle\nimport paddle.nn.functional as F\n@SEGMENTERS.register()\nclass MSTCN(BaseSegmenter):\n \"\"\"MS-TCN model framework.\"\"\"\n def forward_net(self, video_feature):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n if self.backbone is not None:\n feature = self.backbone(video_feature)\n else:\n feature = video_feature\n if self.head is not None:\n cls_score = self.head(feature)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:1-33" + }, + "7651": { + "file_id": 560, + "content": "Class MSTCN defines a model for video segmentation, extending BaseSegmenter class. It contains an optional backbone and head for feature extraction and classification. The forward_net function maps input to output through these components if present.", + "type": "comment" + }, + "7652": { + "file_id": 560, + "content": " else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n video_feat, video_gt = data_batch\n # call forward\n output = self.forward_net(video_feat)\n loss = 0.\n for i in range(len(output)):\n loss += self.head.loss(output[i], video_gt)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, video_gt)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n video_feat, video_gt = data_batch\n # call forward\n output = self.forward_net(video_feat)\n loss = 0.\n for i in range(len(output)):\n loss += self.head.loss(output[i], video_gt)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:34-70" + }, + "7653": { + "file_id": 560, + "content": "This code defines a training step, validation step, and a method to predict the class score for video segmentation. The training step calculates the loss based on the forward network output and ground truth labels, while the validation step does the same but doesn't return a loss. 
Both methods return predicted results and loss metrics.", + "type": "comment" + }, + "7654": { + "file_id": 560, + "content": " outputs_dict = dict()\n outputs_dict['loss'] = loss\n outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, video_gt)\n return outputs_dict\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n video_feat, _ = data_batch\n outputs_dict = dict()\n # call forward\n output = self.forward_net(video_feat)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)\n outputs_dict['predict'] = predicted\n outputs_dict['output_np'] = F.sigmoid(output[-1])\n return outputs_dict\n def infer_step(self, data_batch):\n \"\"\"Infering setp.\n \"\"\"\n video_feat = data_batch[0]\n # call forward\n output = self.forward_net(video_feat)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)\n output_np = F.sigmoid(output[-1])\n return predicted, output_np", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:72-101" + }, + "7655": { + "file_id": 560, + "content": "This code defines three functions: \"forward_net\" for training, \"test_step\" for testing, and \"infer_step\" for inference. The forward pass of the model is called within each function. In the training step, the loss is calculated and an F1 score is computed using the head module. The predicted labels are also stored. For testing and inference, the predicted labels and output after sigmoid activation are returned separately.", + "type": "comment" + }, + "7656": { + "file_id": 561, + "content": "/paddlevideo/modeling/framework/segmenters/utils.py", + "type": "filepath" + }, + "7657": { + "file_id": 561, + "content": "The GaussianSmoothing class in PaddlePaddle applies 1D gaussian smoothing for image processing tasks, and the code initializes weights and biases for a neural network layer with Kaiming Uniform method.", + "type": "summary" + }, + "7658": { + "file_id": 561, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# https://github.com/yiskw713/asrf/libs/postprocess.py\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nimport math\nclass GaussianSmoothing(nn.Layer):\n \"\"\"\n Apply gaussian smoothing on a 1d tensor.\n Filtering is performed seperately for each channel\n in the input using a depthwise convolution.\n Arguments:\n channels (int, sequence): Number of channels of the input tensors. Output will\n have this number of channels as well.\n kernel_size (int, sequence): Size of the gaussian kernel.", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:1-30" + }, + "7659": { + "file_id": 561, + "content": "This code defines a GaussianSmoothing class in PaddlePaddle for applying gaussian smoothing on 1D tensors. It uses depthwise convolution to filter each channel separately, with input and output channels remaining the same. 
The kernel size can be specified as an integer or sequence.", + "type": "comment" + }, + "7660": { + "file_id": 561, + "content": " sigma (float, sequence): Standard deviation of the gaussian kernel.\n \"\"\"\n def __init__(self, kernel_size=15, sigma=1.0):\n super().__init__()\n self.kernel_size = kernel_size\n # The gaussian kernel is the product of the\n # gaussian function of each dimension.\n kernel = 1\n meshgrid = paddle.arange(kernel_size)\n meshgrid = paddle.cast(meshgrid, dtype='float32')\n mean = (kernel_size - 1) / 2\n kernel = kernel / (sigma * math.sqrt(2 * math.pi))\n kernel = kernel * paddle.exp(-(((meshgrid - mean) / sigma)**2) / 2)\n # Make sure sum of values in gaussian kernel equals 1.\n # kernel = kernel / paddle.max(kernel)\n self.kernel = paddle.reshape(kernel, [1, 1, -1])\n def forward(self, inputs):\n \"\"\"\n Apply gaussian filter to input.\n Arguments:\n input (paddle.Tensor): Input to apply gaussian filter on.\n Returns:\n filtered (paddle.Tensor): Filtered output.\n \"\"\"\n _, c, _ = inputs.shape", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:31-62" + }, + "7661": { + "file_id": 561, + "content": "This code initializes a Gaussian kernel with specified kernel size and sigma. The kernel is then applied to the input during forward pass to filter the data.", + "type": "comment" + }, + "7662": { + "file_id": 561, + "content": " inputs = F.pad(inputs,\n pad=((self.kernel_size - 1) // 2,\n (self.kernel_size - 1) // 2),\n mode=\"reflect\",\n data_format='NCL')\n kernel = paddle.expand(self.kernel, shape=[c, 1, self.kernel_size])\n return F.conv1d(inputs, weight=kernel, groups=c)\ndef argrelmax(prob, threshold=0.7):\n \"\"\"\n Calculate arguments of relative maxima.\n prob: np.array. boundary probability maps distributerd in [0, 1]\n prob shape is (T)\n ignore the peak whose value is under threshold\n Return:\n Index of peaks for each batch\n \"\"\"\n # ignore the values under threshold\n prob[prob < threshold] = 0.0\n # calculate the relative maxima of boundary maps\n # treat the first frame as boundary\n peak = np.concatenate(\n [\n np.ones((1), dtype=np.bool),\n (prob[:-2] < prob[1:-1]) & (prob[2:] < prob[1:-1]),\n np.zeros((1), dtype=np.bool),\n ],\n axis=0,\n )", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:63-95" + }, + "7663": { + "file_id": 561, + "content": "This code defines a convolution operation and an argrelmax function for image processing. The conv1d function performs 1D convolutions on the input tensor, with padding, kernel expansion, and return as output. The argrelmax function calculates the arguments of relative maxima in boundary probability maps, ignoring values below a certain threshold. 
This code seems to be related to image segmentation or edge detection tasks.", + "type": "comment" + }, + "7664": { + "file_id": 561, + "content": " peak_idx = np.where(peak)[0].tolist()\n return peak_idx\ndef is_probability(x):\n assert x.ndim == 3\n if x.shape[1] == 1:\n # sigmoid\n if x.min() >= 0 and x.max() <= 1:\n return True\n else:\n return False\n else:\n # softmax\n _sum = np.sum(x, axis=1).astype(np.float32)\n _ones = np.ones_like(_sum, dtype=np.float32)\n return np.allclose(_sum, _ones)\ndef convert2probability(x):\n \"\"\"\n Args: x (N, C, T)\n \"\"\"\n assert x.ndim == 3\n if is_probability(x):\n return x\n else:\n if x.shape[1] == 1:\n # sigmoid\n prob = 1 / (1 + np.exp(-x))\n else:\n # softmax\n prob = np.exp(x) / np.sum(np.exp(x), axis=1)\n return prob.astype(np.float32)\ndef convert2label(x):\n assert x.ndim == 2 or x.ndim == 3\n if x.ndim == 2:\n return x.astype(np.int64)\n else:\n if not is_probability(x):\n x = convert2probability(x)\n label = np.argmax(x, axis=1)\n return label.astype(np.int64)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:97-146" + }, + "7665": { + "file_id": 561, + "content": "The code provides functions to convert tensors into probabilities or labels. The 'is_probability' function checks if a tensor is in the form of sigmoid or softmax outputs and returns True/False accordingly. The 'convert2probability' function converts tensors into probabilities based on whether they are sigmoid or softmax outputs. Lastly, 'convert2label' function converts tensors (2D or 3D) into labels by either casting to int64 directly for 2D or first converting the tensor to probability and then finding the index of maximum value along the appropriate axis.", + "type": "comment" + }, + "7666": { + "file_id": 561, + "content": "def refinement_with_boundary(outputs, boundaries, boundary_threshold):\n \"\"\"\n Get segments which is defined as the span b/w two boundaries,\n and decide their classes by majority vote.\n Args:\n outputs: numpy array. shape (N, C, T)\n the model output for frame-level class prediction.\n boundaries: numpy array. shape (N, 1, T)\n boundary prediction.\n boundary_threshold: the threshold of the size of action segments. float(default=0.7)\n Return:\n preds: np.array. shape (N, T)\n final class prediction considering boundaries.\n \"\"\"\n preds = convert2label(outputs)\n boundaries = convert2probability(boundaries)\n for i, (output, pred, boundary) in enumerate(zip(outputs, preds,\n boundaries)):\n idx = argrelmax(boundary[0, :], threshold=boundary_threshold)\n # add the index of the last action ending\n T = pred.shape[0]\n idx.append(T)\n # majority vote\n for j in range(len(idx) - 1):", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:149-176" + }, + "7667": { + "file_id": 561, + "content": "This function refines the segmented action outputs based on boundary predictions, and performs majority vote to decide class labels. The inputs include model output (outputs) for frame-level class prediction, boundary prediction (boundaries), and an optional threshold (boundary_threshold). It converts outputs and boundaries into label and probability format respectively. For each sequence, it finds the indices of maximum boundary values above the threshold, appends the last action end index, then performs majority vote on each interval between adjacent max boundaries. 
The function returns the final class prediction considering boundaries in a numpy array format (preds).", + "type": "comment" + }, + "7668": { + "file_id": 561, + "content": " count = np.bincount(pred[idx[j]:idx[j + 1]])\n modes = np.where(count == count.max())[0]\n if len(modes) == 1:\n mode = modes\n else:\n if outputs.ndim == 3:\n # if more than one majority class exist\n prob_sum_max = 0\n for m in modes:\n prob_sum = output[m, idx[j]:idx[j + 1]].sum()\n if prob_sum_max < prob_sum:\n mode = m\n prob_sum_max = prob_sum\n else:\n # decide first mode when more than one majority class\n # have the same number during oracle experiment\n mode = modes[0]\n preds[i, idx[j]:idx[j + 1]] = mode\n return preds\ndef relabeling(outputs, theta_t):\n \"\"\"\n Relabeling small action segments with their previous action segment\n Args:\n output: the results of action segmentation. (N, T) or (N, C, T)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:177-203" + }, + "7669": { + "file_id": 561, + "content": "This code segment performs action segmentation by detecting the majority class in each chunk and relabeling smaller action segments with their previous action segment. It uses numpy's bincount and where functions to find majority classes, and has separate logic for cases with multiple majority classes depending on the dimension of outputs. The results are stored in preds array.", + "type": "comment" + }, + "7670": { + "file_id": 561, + "content": " theta_t: the threshold of the size of action segments.\n Return:\n relabeled output. (N, T)\n \"\"\"\n preds = convert2label(outputs)\n for i in range(preds.shape[0]):\n # shape (T,)\n last = preds[i][0]\n cnt = 1\n for j in range(1, preds.shape[1]):\n if last == preds[i][j]:\n cnt += 1\n else:\n if cnt > theta_t:\n cnt = 1\n last = preds[i][j]\n else:\n preds[i][j - cnt:j] = preds[i][j - cnt - 1]\n cnt = 1\n last = preds[i][j]\n if cnt <= theta_t:\n preds[i][j - cnt:j] = preds[i][j - cnt - 1]\n return preds\ndef smoothing(outputs, filter_func):\n \"\"\"\n Smoothing action probabilities with gaussian filter.\n Args:\n outputs: frame-wise action probabilities. (N, C, T)\n Return:\n predictions: final prediction. (N, T)\n \"\"\"\n outputs = convert2probability(outputs)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:204-242" + }, + "7671": { + "file_id": 561, + "content": "The code defines two functions: \"relabel\" and \"smoothing\". The relabel function takes predicted action segment labels, applies a threshold to merge adjacent segments with overlapping actions, and returns the relabeled output. The smoothing function applies a Gaussian filter to frame-wise action probabilities, resulting in final predictions.", + "type": "comment" + }, + "7672": { + "file_id": 561, + "content": " outputs = filter_func(paddle.to_tensor(outputs)).numpy()\n preds = convert2label(outputs)\n return preds\ndef ASRFPostProcessing(outputs_cls,\n outputs_boundary,\n refinement_method,\n boundary_threshold=0.7,\n theta_t=15,\n kernel_size=15):\n \"\"\"\n ASRF post processing is to refine action boundary\n Args:\n outputs_cls: the results of action segmentation. (N, T) or (N, C, T)\n outputs_boundary: action boundary probability. (N, 1, T)\n refinement_method: the way of refine predict boundary and classification. str\n boundary_threshold: the threshold of the size of action segments. float(default=0.7)\n theta_t: the threshold of the size of action segments. 
int(default=15)\n kernel_size: Size of the gaussian kernel. int(default=15)\n Return:\n preds output. (N, T)\n \"\"\"\n func = [\n \"refinement_with_boundary\",\n \"relabeling\",\n \"smoothing\",", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:243-270" + }, + "7673": { + "file_id": 561, + "content": "This code is implementing ASRF post-processing for refining action boundary and classification. It takes in outputs_cls (action segmentation results), outputs_boundary (action boundary probability), refinement_method, boundary_threshold, theta_t (threshold of the size of action segments), and kernel_size as arguments. The code applies three processing steps: \"refinement_with_boundary\", \"relabeling\", and \"smoothing\" to refine the predict boundary and classification. It returns the preds output which is a (N, T) shape.", + "type": "comment" + }, + "7674": { + "file_id": 561, + "content": " ]\n if refinement_method == \"smoothing\":\n filter_func = GaussianSmoothing(kernel_size)\n preds = smoothing(outputs_cls, filter_func)\n elif refinement_method == \"relabeling\":\n preds = relabeling(outputs_cls, theta_t)\n elif refinement_method == \"refinement_with_boundary\":\n preds = refinement_with_boundary(outputs_cls, outputs_boundary,\n boundary_threshold)\n else:\n preds = np.zeros((1, 1))\n assert refinement_method in func\n return paddle.to_tensor(preds)\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = len(tensor.shape)\n if dimensions < 2:\n raise ValueError(\"Fan in and fan out can not be computed \\\n for tensor with fewer than 2 dimensions\")\n if dimensions == 2: # Linear\n fan_in = tensor.shape[1]\n fan_out = tensor.shape[0]\n else:\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:271-301" + }, + "7675": { + "file_id": 561, + "content": "This code segment defines a function that takes an input tensor and calculates the fan-in and fan-out. It also applies different refinement methods to outputs_cls based on the user-specified refinement method. If an invalid method is chosen, it returns a zero tensor. The code includes functions for smoothing, relabeling, and refinement with boundary.", + "type": "comment" + }, + "7676": { + "file_id": 561, + "content": " receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef calculate_gain(nonlinearity=None, a=None):\n if nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if a is not None:\n return math.sqrt(2.0 / (1 + a**2))\n else:\n return math.sqrt(2.0 / (1 + 0.01**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4\n else:\n return 1\ndef KaimingUniform_like_torch(weight_npy,\n mode='fan_in',\n nonlinearity='leaky_relu'):\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)\n if mode == 'fan_in':\n fan_mode = fan_in\n else:\n fan_mode = fan_out\n a = math.sqrt(5.0)\n gain = calculate_gain(nonlinearity=nonlinearity, a=a)\n std = gain / math.sqrt(fan_mode)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:302-335" + }, + "7677": { + "file_id": 561, + "content": "This code calculates the gain and fan-in/fan-out values for weight initialization in a neural network. 
It supports different nonlinearities such as 'tanh', 'relu', 'leaky_relu', and 'selu'. The function KaimingUniform_like_torch initializes weights using the Kaiming Uniform method with the specified nonlinearity, fan mode (fan_in or fan_out), and standard deviation of the initialization.", + "type": "comment" + }, + "7678": { + "file_id": 561, + "content": " bound = math.sqrt(3.0) * std\n return np.random.uniform(-bound, bound, weight_npy.shape)\ndef init_bias(weight_npy, bias_npy):\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)\n bound = 1.0 / math.sqrt(fan_in)\n return np.random.uniform(-bound, bound, bias_npy.shape)", + "type": "code", + "location": "/paddlevideo/modeling/framework/segmenters/utils.py:336-343" + }, + "7679": { + "file_id": 561, + "content": "This code initializes weights and biases for a neural network layer. It calculates the fan-in and fan-out, determines bounds based on standard deviation or square root of three times the standard deviation for weights, and uses a uniform distribution within those bounds to initialize the weights and biases.", + "type": "comment" + }, + "7680": { + "file_id": 562, + "content": "/paddlevideo/modeling/heads/__init__.py", + "type": "filepath" + }, + "7681": { + "file_id": 562, + "content": "This code imports various head classes from different modules in the PaddleVideo library for video object detection, segmentation, or action recognition tasks, and adds them to the `__all__` list for easy access.", + "type": "summary" + }, + "7682": { + "file_id": 562, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .adds_head import AddsHead\nfrom .asrf_head import ASRFHead\nfrom .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead\nfrom .base import BaseHead\nfrom .bbox_head import BBoxHeadAVA\nfrom .cfbi_head import CollaborativeEnsemblerMS\nfrom .i3d_head import I3DHead\nfrom .movinet_head import MoViNetHead\nfrom .ms_tcn_head import MSTCNHead\nfrom .pptimesformer_head import ppTimeSformerHead\nfrom .pptsm_head import ppTSMHead", + "type": "code", + "location": "/paddlevideo/modeling/heads/__init__.py:1-25" + }, + "7683": { + "file_id": 562, + "content": "This code is importing various classes from different modules in the PaddleVideo library. These classes represent different types of heads used in video modeling, such as AttentionLstmHead and BBoxHeadAVA. 
The code also includes licenses and copyright information for the PaddlePaddle Authors.", + "type": "comment" + }, + "7684": { + "file_id": 562, + "content": "from .pptsn_head import ppTSNHead\nfrom .roi_head import AVARoIHead\nfrom .single_straight3d import SingleRoIExtractor3D\nfrom .slowfast_head import SlowFastHead\nfrom .stgcn_head import STGCNHead\nfrom .timesformer_head import TimeSformerHead\nfrom .transnetv2_head import TransNetV2Head\nfrom .tsm_head import TSMHead\nfrom .tsn_head import TSNHead\nfrom .ms_tcn_head import MSTCNHead\nfrom .asrf_head import ASRFHead\nfrom .ctrgcn_head import CTRGCNHead\nfrom .movinet_head import MoViNetHead\nfrom .agcn2s_head import AGCN2sHead\nfrom .token_shift_head import TokenShiftHead\n__all__ = [\n 'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead',\n 'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head',\n 'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead',\n 'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead',\n 'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead',\n 'AGCN2sHead'\n]", + "type": "code", + "location": "/paddlevideo/modeling/heads/__init__.py:26-49" + }, + "7685": { + "file_id": 562, + "content": "This code imports various head classes from different modules and adds them to the `__all__` list, making them accessible for import when using this module. These head classes are used in video object detection, segmentation or action recognition tasks. They include ppTSNHead, TSNHead, TSMHead, ppTSMHead, SlowFastHead, TimeSformerHead and more. Each head class has its own specific functionality for different tasks.", + "type": "comment" + }, + "7686": { + "file_id": 563, + "content": "/paddlevideo/modeling/heads/adds_head.py", + "type": "filepath" + }, + "7687": { + "file_id": 563, + "content": "The \"AddsHead\" class in PaddleVideo handles monocular depth estimation, loss calculation during training and depth metrics like abs_rel, rmse in inference, while supporting multi-GPU scenarios with all-reduce operations.", + "type": "summary" + }, + "7688": { + "file_id": 563, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport cv2\nimport numpy as np\nimport paddle.nn as nn\nfrom paddlevideo.utils import get_dist_info\nimport paddle\nfrom ..builder import build_loss\nfrom ..registry import HEADS\nMIN_DEPTH = 1e-3\nMAX_DEPTH = 80\n@HEADS.register()\nclass AddsHead(nn.Layer):\n \"\"\"TimeSformerHead Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.", + "type": "code", + "location": "/paddlevideo/modeling/heads/adds_head.py:1-33" + }, + "7689": { + "file_id": 563, + "content": "This code is part of the PaddleVideo library, defining a class called AddsHead for self-supervised monocular depth estimation. Its docstring (apparently copied from TimeSformerHead) lists num_classes and in_channels arguments. 
The class is registered in the registry under HEADS. MIN_DEPTH and MAX_DEPTH constants define the minimum and maximum depth values respectively.", + "type": "comment" + }, + "7690": { + "file_id": 563, + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n avg_reprojection,\n disparity_smoothness,\n no_ssim,\n loss_cfg=dict(name='ADDSLoss'),\n max_gt_depth=60,\n pred_depth_scale_factor=1):\n super(AddsHead, self).__init__()\n loss_cfg['avg_reprojection'] = avg_reprojection\n loss_cfg['disparity_smoothness'] = disparity_smoothness\n loss_cfg['no_ssim'] = no_ssim\n self.max_gt_depth = max_gt_depth\n self.pred_depth_scale_factor = pred_depth_scale_factor\n self.loss_func = build_loss(loss_cfg)\n def forward(self):\n raise NotImplemented\n def loss(self, inputs, outputs):\n if self.training:\n return self.loss_func(inputs, outputs)\n else:\n abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.get_metrics(", + "type": "code", + "location": "/paddlevideo/modeling/heads/adds_head.py:34-62" + }, + "7691": { + "file_id": 563, + "content": "The code represents the initialization and forward pass of a class named \"AddsHead\". The class takes in parameters like avg_reprojection, disparity_smoothness, no_ssim, etc. It builds a loss function using build_loss method with the provided configuration (loss_cfg). During training, it returns the result of the loss function on inputs and outputs. In inference mode, it uses get_metrics method to calculate metrics such as abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3.", + "type": "comment" + }, + "7692": { + "file_id": 563, + "content": " outputs['pred_disp'], outputs['gt'])\n outputs['abs_rel'] = abs_rel\n outputs['sq_rel'] = sq_rel\n outputs['rmse'] = rmse\n outputs['rmse_log'] = rmse_log\n outputs['a1'] = a1\n outputs['a2'] = a2\n outputs['a3'] = a3\n return outputs\n def get_metrics(self, pred_disp, gt_depth):\n gt_height, gt_width = gt_depth.shape[:2]\n pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))\n pred_depth = 1 / pred_disp\n mask = gt_depth > 0\n pred_depth = pred_depth[mask]\n gt_depth = gt_depth[mask]\n pred_depth *= self.pred_depth_scale_factor\n ratio = np.median(gt_depth) / np.median(pred_depth)\n pred_depth *= ratio\n pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH\n pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH\n mask2 = gt_depth <= self.max_gt_depth\n pred_depth = pred_depth[mask2]\n gt_depth = gt_depth[mask2]\n abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.compute_errors(", + "type": "code", + "location": "/paddlevideo/modeling/heads/adds_head.py:63-95" + }, + "7693": { + "file_id": 563, + "content": "This code snippet defines an \"AddsHead\" class that returns a dictionary of metrics including absolute relative error, squared relative error, root mean square error, and additional error measures. 
The get_metrics function resizes the predicted disparity to match the ground truth depth, converts it to depth, rescales it by the ratio of median ground-truth to median predicted depth, clips it to the valid depth range, and then computes the specified errors with compute_errors.", + "type": "comment" + }, + "7694": { + "file_id": 563, + "content": " gt_depth, pred_depth)\n _, world_size = get_dist_info()\n if world_size > 1:\n # educe sum when valid\n # TODO: there are some problems with multi gpu gather code.\n abs_rel = paddle.to_tensor(abs_rel)\n sq_rel = paddle.to_tensor(sq_rel)\n rmse = paddle.to_tensor(rmse)\n rmse_log = paddle.to_tensor(rmse_log)\n a1 = paddle.to_tensor(a1)\n a2 = paddle.to_tensor(a2)\n a3 = paddle.to_tensor(a3)\n abs_rel = paddle.distributed.all_reduce(\n abs_rel, op=paddle.distributed.ReduceOp.SUM) / world_size\n sq_rel = paddle.distributed.all_reduce(\n sq_rel, op=paddle.distributed.ReduceOp.SUM) / world_size\n rmse = paddle.distributed.all_reduce(\n rmse, op=paddle.distributed.ReduceOp.SUM) / world_size\n rmse_log = paddle.distributed.all_reduce(\n rmse_log, op=paddle.distributed.ReduceOp.SUM) / world_size\n a1 = paddle.distributed.all_reduce(", + "type": "code", + "location": "/paddlevideo/modeling/heads/adds_head.py:96-117" + }, + "7695": { + "file_id": 563, + "content": "This code is performing all-reduce operations on tensors for multi-GPU scenarios, ensuring that the sum of tensor values across GPUs is reduced and then divided by the total number of participating GPUs. This allows for accurate averaging of results when working with multiple GPUs in a distributed environment.", + "type": "comment" + }, + "7696": { + "file_id": 563, + "content": " a1, op=paddle.distributed.ReduceOp.SUM) / world_size\n a2 = paddle.distributed.all_reduce(\n a2, op=paddle.distributed.ReduceOp.SUM) / world_size\n a3 = paddle.distributed.all_reduce(\n a3, op=paddle.distributed.ReduceOp.SUM) / world_size\n return abs_rel.item(), sq_rel.item(), rmse.item(), rmse_log.item(\n ), a1.item(), a2.item(), a3.item()\n return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3\n def compute_errors(self, gt, pred):\n \"\"\"Computation of error metrics between predicted and ground truth depths\n \"\"\"\n thresh = np.maximum((gt / pred), (pred / gt))\n a1 = (thresh < 1.25).mean()\n a2 = (thresh < 1.25**2).mean()\n a3 = (thresh < 1.25**3).mean()\n rmse = (gt - pred)**2\n rmse = np.sqrt(rmse.mean())\n rmse_log = (np.log(gt) - np.log(pred))**2\n rmse_log = np.sqrt(rmse_log.mean())\n abs_rel = np.mean(np.abs(gt - pred) / gt)\n sq_rel = np.mean(((gt - pred)**2) / gt)", + "type": "code", + "location": "/paddlevideo/modeling/heads/adds_head.py:118-144" + }, + "7697": { + "file_id": 563, + "content": "The code defines a function that computes error metrics between predicted and ground truth depths. The preceding all-reduce operations average the metrics across GPUs, and the function returns multiple metrics including absolute relative error, squared relative error, RMSE, log RMSE, and three accuracies (a1, a2, a3) giving the fraction of pixels whose depth ratio falls below fixed thresholds.", + "type": "comment" + }, + "7698": { + "file_id": 563, + "content": " return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3", + "type": "code", + "location": "/paddlevideo/modeling/heads/adds_head.py:146-146" + }, + "7699": { + "file_id": 563, + "content": "This code returns seven metrics: abs_rel, sq_rel, rmse, rmse_log, a1, a2, and a3. These are standard evaluation metrics for monocular depth estimation. 
Here 'abs_rel' is the mean absolute relative error, 'sq_rel' the squared relative error, 'rmse' the root mean squared error, 'rmse_log' the RMSE computed on log depths, and 'a1', 'a2', and 'a3' the threshold accuracies, i.e. the fraction of pixels whose max(gt/pred, pred/gt) ratio falls below 1.25, 1.25^2, and 1.25^3 respectively.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/77.json b/docs/data/77.json new file mode 100644 index 000000000..70763a33b --- /dev/null +++ b/docs/data/77.json @@ -0,0 +1,546 @@ +{ + "7700": { + "file_id": 564, + "content": "/paddlevideo/modeling/heads/agcn2s_head.py", + "type": "filepath" + }, + "7701": { + "file_id": 564, + "content": "The AGCN2sHead class is a head for the AGCN2s model in PaddleVideo, with input arguments defining channels, classes, people, and dropout ratio. It registers under HEADS registry, inherits from BaseHead class, initializes base class, sets instance variables, creates a linear layer, and reshapes input for forward pass. The code takes the input tensor x, averages along axes, passes through a fully connected layer (self.fc) to produce output.", + "type": "summary" + }, + "7702": { + "file_id": 564, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass AGCN2sHead(BaseHead):\n \"\"\"\n Head for AGCN2s model.\n Args:\n in_channels: int, input feature channels. Default: 64.\n num_classes: int, output the number of classes.\n M: int, number of people.\n drop_out: float, dropout ratio of layer. Default: 0.", + "type": "code", + "location": "/paddlevideo/modeling/heads/agcn2s_head.py:1-32" + }, + "7703": { + "file_id": 564, + "content": "The code defines the AGCN2sHead class, a head for the AGCN2s model in PaddleVideo. It has input feature channels, number of classes, number of people, and dropout ratio as arguments. This head is registered under HEADS registry and inherits from BaseHead class.", + "type": "comment" + }, + "7704": { + "file_id": 564, + "content": " \"\"\"\n def __init__(self, in_channels=64, num_classes=10, M=2, **kwargs):\n super().__init__(num_classes, in_channels, **kwargs)\n self.in_channels = in_channels\n self.M = M\n weight_attr = paddle.ParamAttr(\n name=\"linear_weight\",\n initializer=paddle.nn.initializer.Normal(mean=0.0,\n std=math.sqrt(\n 2. 
/ num_classes)))\n self.fc = nn.Linear(self.in_channels * 4,\n self.num_classes,\n weight_attr=weight_attr)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n assert x.shape[\n 0] % self.M == 0, f'The first dimension of the output must be an integer multiple of the number of people M, but recieved shape[0]={x.shape[0]}, M={self.M}'\n # N*M,C,T,V\n N = x.shape[0] // self.M\n c_new = x.shape[1]\n x = x.reshape([N, self.M, c_new, -1])", + "type": "code", + "location": "/paddlevideo/modeling/heads/agcn2s_head.py:33-56" + }, + "7705": { + "file_id": 564, + "content": "Class constructor takes in_channels, num_classes, and M as parameters, initializes base class, sets instance variables, creates a linear layer with specified weights using paddle's Normal initializer, and reshapes input for forward pass.", + "type": "comment" + }, + "7706": { + "file_id": 564, + "content": " x = x.mean(3).mean(1)\n return self.fc(x)", + "type": "code", + "location": "/paddlevideo/modeling/heads/agcn2s_head.py:57-59" + }, + "7707": { + "file_id": 564, + "content": "This code takes the input tensor x, averages it along the third and first axes respectively, then passes it through a fully connected layer (self.fc) to produce an output.", + "type": "comment" + }, + "7708": { + "file_id": 565, + "content": "/paddlevideo/modeling/heads/asrf_head.py", + "type": "filepath" + }, + "7709": { + "file_id": 565, + "content": "The ASRFHead class is a model for action recognition using convolutional layers, and computes precision, recall, F1 score. It creates an ASRF head class for video processing with label retrieval, Levenshtein distance methods, edit scores, true positives, false positives, IoU measures, and selects the best scoring segment.", + "type": "summary" + }, + "7710": { + "file_id": 565, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# https://github.com/yiskw713/asrf/libs/models/tcn.py\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom paddle import ParamAttr\nfrom ..backbones.ms_tcn import SingleStageModel\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nfrom ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch\n@HEADS.register()\nclass ASRFHead(BaseHead):", + "type": "code", + "location": "/paddlevideo/modeling/heads/asrf_head.py:1-32" + }, + "7711": { + "file_id": 565, + "content": "The code defines a class for the ASRFHead, which is an instance of BaseHead and registered in HEADS registry. It imports necessary libraries, defines several models including SingleStageModel, and includes various utility functions from other modules. 
It also initializes weights with KaimingUniform_like_torch method.", + "type": "comment" + }, + "7712": { + "file_id": 565, + "content": " def __init__(self,\n num_classes,\n num_features,\n num_stages,\n num_layers,\n num_stages_asb=None,\n num_stages_brb=None):\n super().__init__(num_classes=num_classes, in_channels=num_features)\n if not isinstance(num_stages_asb, int):\n num_stages_asb = num_stages\n if not isinstance(num_stages_brb, int):\n num_stages_brb = num_stages\n self.num_layers = num_layers\n self.num_stages_asb = num_stages_asb\n self.num_stages_brb = num_stages_brb\n self.num_features = num_features\n # cls score\n self.overlap = 0.5\n self.conv_cls = nn.Conv1D(self.num_features, self.num_classes, 1)\n self.conv_boundary = nn.Conv1D(self.num_features, 1, 1)\n # action segmentation branch\n asb = [\n SingleStageModel(self.num_layers, self.num_features,\n self.num_classes, self.num_classes)\n for _ in range(self.num_stages_asb - 1)", + "type": "code", + "location": "/paddlevideo/modeling/heads/asrf_head.py:34-63" + }, + "7713": { + "file_id": 565, + "content": "The code above initializes an object of a class representing a feature extraction and classification model for action recognition. It takes several parameters such as the number of classes, features, stages in the action segmentation branch (ASB), stages in the boundary refinement branch (BRB), and layers per stage. The object is initialized by first calling the superclass constructor and then setting up the necessary components like the convolutional layers for class scores and boundary prediction, as well as multiple SingleStageModel instances for the action segmentation branch if needed.", + "type": "comment" + }, + "7714": { + "file_id": 565, + "content": " ]\n # boundary regression branch\n brb = [\n SingleStageModel(self.num_layers, self.num_features, 1, 1)\n for _ in range(self.num_stages_brb - 1)\n ]\n self.brb = nn.LayerList(brb)\n self.asb = nn.LayerList(asb)\n self.activation_asb = nn.Softmax(axis=1)\n self.activation_brb = nn.Sigmoid()\n def init_weights(self):\n \"\"\"\n initialize model layers' weight\n \"\"\"\n # init weight\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv1D):\n layer.weight.set_value(\n KaimingUniform_like_torch(layer.weight).astype('float32'))\n if layer.bias is not None:\n layer.bias.set_value(\n init_bias(layer.weight, layer.bias).astype('float32'))\n def forward(self, x):\n \"\"\"\n ASRF head\n \"\"\"\n out_cls = self.conv_cls(x)\n out_boundary = self.conv_boundary(x)\n outputs_cls = [out_cls]\n outputs_boundary = [out_boundary]", + "type": "code", + "location": "/paddlevideo/modeling/heads/asrf_head.py:64-98" + }, + "7715": { + "file_id": 565, + "content": "This code defines a ASRF head model, initializes its weights and performs forward pass for classification and boundary regression tasks. It uses Conv1D layers and LayerList for flexibility. The weight initialization follows Kaiming uniform distribution and applies bias if present. 
The outputs of both tasks are stored separately in lists.", + "type": "comment" + }, + "7716": { + "file_id": 565, + "content": " for as_stage in self.asb:\n out_cls = as_stage(self.activation_asb(out_cls))\n outputs_cls.append(out_cls)\n for br_stage in self.brb:\n out_boundary = br_stage(self.activation_brb(out_boundary))\n outputs_boundary.append(out_boundary)\n return outputs_cls, outputs_boundary\n def get_F1_score(self, predicted, groundTruth):\n recog_content = list(predicted.numpy())\n gt_content = list(groundTruth[0].numpy())\n # cls score\n correct = 0\n total = 0\n edit = 0\n for i in range(len(gt_content)):\n total += 1\n if gt_content[i] == recog_content[i]:\n correct += 1\n edit_num = self.edit_score(recog_content, gt_content)\n edit += edit_num\n tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap)\n # cls metric\n precision = tp / float(tp + fp)\n recall = tp / float(fp + fn)\n if precision + recall > 0.0:\n f1 = 2.0 * (precision * recall) / (precision + recall)", + "type": "code", + "location": "/paddlevideo/modeling/heads/asrf_head.py:100-136" + }, + "7717": { + "file_id": 565, + "content": "This code implements an ASRF head for a model, which takes in input and outputs classified classes and boundary scores. It also includes a get_F1_score function to calculate precision, recall, and F1 score for classification tasks. The F1 score is calculated based on the correctness of predicted class labels compared to ground truth labels.", + "type": "comment" + }, + "7718": { + "file_id": 565, + "content": " else:\n f1 = 0.0\n f1 = np.nan_to_num(f1)\n return f1\n def get_labels_start_end_time(self, frame_wise_labels):\n labels = []\n starts = []\n ends = []\n last_label = frame_wise_labels[0]\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n labels.append(frame_wise_labels[i])\n starts.append(i)\n ends.append(i)\n last_label = frame_wise_labels[i]\n ends.append(i + 1)\n return labels, starts, ends\n def levenstein(self, p, y, norm=False):\n m_row = len(p)\n n_col = len(y)\n D = np.zeros([m_row + 1, n_col + 1], np.float)\n for i in range(m_row + 1):\n D[i, 0] = i\n for i in range(n_col + 1):\n D[0, i] = i\n for j in range(1, n_col + 1):\n for i in range(1, m_row + 1):\n if y[j - 1] == p[i - 1]:\n D[i, j] = D[i - 1, j - 1]", + "type": "code", + "location": "/paddlevideo/modeling/heads/asrf_head.py:137-170" + }, + "7719": { + "file_id": 565, + "content": "The code defines an ASRF head class that seems to be related to video processing and includes methods for retrieving label information and calculating the Levenshtein distance between two sequences. The get_labels_start_end_time method converts frame-wise labels into a list of labels, their respective start times, and end times. 
The levenstein method calculates the Levenshtein distance, which is used to compare two sequences of characters.", + "type": "comment" + }, + "7720": { + "file_id": 565, + "content": " else:\n D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,\n D[i - 1, j - 1] + 1)\n if norm:\n score = (1 - D[-1, -1] / max(m_row, n_col)) * 100\n else:\n score = D[-1, -1]\n return score\n def edit_score(self, recognized, ground_truth, norm=True):\n P, _, _ = self.get_labels_start_end_time(recognized)\n Y, _, _ = self.get_labels_start_end_time(ground_truth)\n return self.levenstein(P, Y, norm)\n def f_score(self, recognized, ground_truth, overlap):\n p_label, p_start, p_end = self.get_labels_start_end_time(recognized)\n y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth)\n tp = 0\n fp = 0\n hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(\n p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(\n p_start[j], y_start)", + "type": "code", + "location": "/paddlevideo/modeling/heads/asrf_head.py:171-200" + }, + "7721": { + "file_id": 565, + "content": "The code contains a function to calculate the edit score between two sequences. It uses the Levenshtein distance algorithm to compare recognized and ground truth labels, considering insertions, deletions, and substitutions. The f_score function calculates true positive (tp) and false positive (fp) values based on label overlaps, and normalizes the edit score if required.", + "type": "comment" + }, + "7722": { + "file_id": 565, + "content": " IoU = (1.0 * intersection / union) * (\n [p_label[j] == y_label[x] for x in range(len(y_label))])\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n return float(tp), float(fp), float(fn)", + "type": "code", + "location": "/paddlevideo/modeling/heads/asrf_head.py:201-212" + }, + "7723": { + "file_id": 565, + "content": "This code calculates true positives (tp), false positives (fp) and false negatives (fn). It measures IoU between predicted and actual labels, selects best scoring segment and tracks hits and misses. The method returns tp, fp, fn as float values.", + "type": "comment" + }, + "7724": { + "file_id": 566, + "content": "/paddlevideo/modeling/heads/attention_lstm_head.py", + "type": "filepath" + }, + "7725": { + "file_id": 566, + "content": "The code defines the AttentionLstmHead class for LSTM-based attention mechanism in PaddleVideo, performing feature extraction and softmax normalization for video and audio classification tasks.", + "type": "summary" + }, + "7726": { + "file_id": 566, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal\nfrom paddle.regularizer import L2Decay\nimport paddle.nn.functional as F\nfrom ...metrics.youtube8m import eval_util as youtube8m_metrics\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass AttentionLstmHead(BaseHead):\n \"\"\"AttentionLstmHead.\n Args: TODO\n \"\"\"\n def __init__(self,", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:1-32" + }, + "7727": { + "file_id": 566, + "content": "This code defines a class called AttentionLstmHead, which is a type of head used in a neural network. It is part of the PaddleVideo library and inherits from the BaseHead class. The class uses LSTM for attention, has its own parameters (specified by ParamAttr), and utilizes weight initialization. This code also includes license information, documentation on arguments, and registration in the HEADS registry.", + "type": "comment" + }, + "7728": { + "file_id": 566, + "content": " num_classes=3862,\n feature_num=2,\n feature_dims=[1024, 128],\n embedding_size=512,\n lstm_size=1024,\n in_channels=2048,\n loss_cfg=dict(name='CrossEntropyLoss')):\n super(AttentionLstmHead, self).__init__(num_classes, in_channels,\n loss_cfg)\n self.num_classes = num_classes\n self.feature_dims = feature_dims\n self.embedding_size = embedding_size\n self.lstm_size = lstm_size\n self.feature_num = len(self.feature_dims)\n for i in range(self.feature_num): # 0:rgb, 1:audio\n fc_feature = paddle.nn.Linear(in_features=self.feature_dims[i],\n out_features=self.embedding_size)\n self.add_sublayer(\"fc_feature{}\".format(i), fc_feature)\n bi_lstm = paddle.nn.LSTM(input_size=self.embedding_size,\n hidden_size=self.lstm_size,", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:33-53" + }, + "7729": { + "file_id": 566, + "content": "This code initializes an AttentionLstmHead object with specified parameters. It creates a Linear layer for each feature dimension (rgb, audio) and adds a bi-directional LSTM layer with specified sizes. 
The AttentionLstmHead will be used to process video frames and audio data in parallel for classification tasks.", + "type": "comment" + }, + "7730": { + "file_id": 566, + "content": " direction=\"bidirectional\")\n self.add_sublayer(\"bi_lstm{}\".format(i), bi_lstm)\n drop_rate = 0.5\n self.dropout = paddle.nn.Dropout(drop_rate)\n att_fc = paddle.nn.Linear(in_features=self.lstm_size * 2,\n out_features=1)\n self.add_sublayer(\"att_fc{}\".format(i), att_fc)\n self.softmax = paddle.nn.Softmax()\n self.fc_out1 = paddle.nn.Linear(in_features=self.lstm_size * 4,\n out_features=8192,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.relu = paddle.nn.ReLU()\n self.fc_out2 = paddle.nn.Linear(in_features=8192,\n out_features=4096,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:54-74" + }, + "7731": { + "file_id": 566, + "content": "The code initializes an LSTM layer with bidirectional capability and adds dropout for regularization. It defines a linear layer (att_fc) to map the output of the LSTM layer to 1 feature, applies softmax activation, and then defines two fully connected layers (fc_out1 and fc_out2) for further processing with specific activations and parameters.", + "type": "comment" + }, + "7732": { + "file_id": 566, + "content": " initializer=Normal()))\n self.fc_logit = paddle.nn.Linear(in_features=4096,\n out_features=self.num_classes,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.sigmoid = paddle.nn.Sigmoid()\n def init_weights(self):\n pass\n def forward(self, inputs):\n # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)]\n # deal with features with different length\n # 1. padding to same lenght, make a tensor\n # 2. make a mask tensor with the same shpae with 1\n # 3. compute output using mask tensor, s.t. output is nothing todo with padding\n assert (len(inputs) == self.feature_num\n ), \"Input tensor does not contain {} features\".format(\n self.feature_num)\n att_outs = []", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:75-95" + }, + "7733": { + "file_id": 566, + "content": "The code defines a class for an attention LSTM head in PaddleVideo. It initializes two linear layers and a sigmoid activation function. The `init_weights` method is currently empty, and the `forward` method takes inputs of different lengths and processes them before storing the results in the `att_outs` list.", + "type": "comment" + }, + "7734": { + "file_id": 566, + "content": " for i in range(len(inputs)):\n # 1. fc\n m = getattr(self, \"fc_feature{}\".format(i))\n output_fc = m(inputs[i][0])\n output_fc = paddle.tanh(output_fc)\n # 2. bi_lstm\n m = getattr(self, \"bi_lstm{}\".format(i))\n lstm_out, _ = m(inputs=output_fc, sequence_length=inputs[i][1])\n lstm_dropout = self.dropout(lstm_out)\n # 3. att_fc\n m = getattr(self, \"att_fc{}\".format(i))\n lstm_weight = m(lstm_dropout)\n # 4. 
softmax replace start, for it's relevant to sum in time step\n lstm_exp = paddle.exp(lstm_weight)\n lstm_mask = paddle.mean(inputs[i][2], axis=2)\n lstm_mask = paddle.unsqueeze(lstm_mask, axis=2)\n lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask)\n lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1)\n exponent = -1\n lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent)\n lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2)", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:96-120" + }, + "7735": { + "file_id": 566, + "content": "The code performs feature extraction, bi-directional LSTM processing, attention weight calculation, and finally softmax normalization on each input in a list. It uses dropout to prevent overfitting, applies masking for attention calculations, and calculates the denominator using power function.", + "type": "comment" + }, + "7736": { + "file_id": 566, + "content": " lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator)\n lstm_weight = lstm_softmax\n # softmax replace end\n lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight)\n # 5. sequence_pool's replace start, for it's relevant to sum in time step\n lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask)\n fea_lens = inputs[i][1]\n fea_len = int(fea_lens[0])\n lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1)\n # sequence_pool's replace end\n att_outs.append(lstm_pool)\n att_out = paddle.concat(att_outs, axis=1)\n fc_out1 = self.fc_out1(att_out)\n fc_out1_act = self.relu(fc_out1)\n fc_out2 = self.fc_out2(fc_out1_act)\n fc_out2_act = paddle.tanh(fc_out2)\n fc_logit = self.fc_logit(fc_out2_act)\n output = self.sigmoid(fc_logit)\n return fc_logit, output\n def loss(self, lstm_logit, labels, **kwargs):\n labels.stop_gradient = True", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:121-144" + }, + "7737": { + "file_id": 566, + "content": "This code performs LSTM-based attention mechanism for a sequence modeling task. It applies softmax, dropout, and mask operations on the LSTM outputs to compute the attention weights. The attention weights are then used to generate an attentive pooling of the sequence, which is passed through fully connected layers and sigmoid activation for the final output. 
The loss function uses labels with stop_gradient=True for training the model.", + "type": "comment" + }, + "7738": { + "file_id": 566, + "content": " losses = dict()\n bce_logit_loss = paddle.nn.BCEWithLogitsLoss(reduction='sum')\n sum_cost = bce_logit_loss(lstm_logit, labels)\n return sum_cost\n def metric(self, lstm_output, labels):\n pred = lstm_output.numpy()\n label = labels.numpy()\n hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label)\n perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(\n pred, label)\n gap = youtube8m_metrics.calculate_gap(pred, label)\n return hit_at_one, perr, gap\n@HEADS.register()\nclass ActionAttentionLstmHead(BaseHead):\n \"\"\"AttentionLstmHead for FootballAction\n Args: TODO\n \"\"\"\n def __init__(self,\n num_classes=8,\n feature_num=2,\n feature_dims=[2048, 1024],\n embedding_size=512,\n lstm_size=1024,\n in_channels=2048,\n loss_cfg=dict(name='CrossEntropyLoss')):\n super(ActionAttentionLstmHead, self).__init__(num_classes, in_channels,", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:145-173" + }, + "7739": { + "file_id": 566, + "content": "This code defines an ActionAttentionLstmHead class which is a type of BaseHead. It uses LSTM for attention and takes in various arguments like num_classes, feature_num, feature_dims, embedding_size, lstm_size, in_channels, and loss_cfg. The metric function calculates hit_at_one, perr (precision at equal recall rate), and gap values from the LSTM output and labels. The sum_cost function calculates the loss using BCEWithLogitsLoss.", + "type": "comment" + }, + "7740": { + "file_id": 566, + "content": " loss_cfg)\n self.num_classes = num_classes\n self.feature_dims = feature_dims\n self.embedding_size = embedding_size\n self.lstm_size = lstm_size\n self.feature_num = len(self.feature_dims)\n for i in range(self.feature_num): # 0:rgb, 1:audio\n bi_lstm = paddle.nn.LSTM(input_size=self.feature_dims[i],\n hidden_size=self.feature_dims[i],\n direction=\"bidirectional\")\n self.add_sublayer(\"bi_lstm{}\".format(i), bi_lstm)\n drop_rate = 0.5\n self.dropout = paddle.nn.Dropout(drop_rate)\n att_fc = paddle.nn.Linear(in_features=self.feature_dims[i] * 2,\n out_features=1)\n self.add_sublayer(\"att_fc{}\".format(i), att_fc)\n self.softmax = paddle.nn.Softmax()\n self.fc1 = paddle.nn.Linear(in_features=2 * sum(self.feature_dims),\n out_features=8192,", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:174-195" + }, + "7741": { + "file_id": 566, + "content": "This code initializes a LSTM network for feature processing and attention mechanism. It defines bidirectional LSTM layers for each feature dimension (RGB, audio), followed by dropout and fully connected layers. 
The model has 8192 output features and is used for multimodal fusion in a video understanding task.", + "type": "comment" + }, + "7742": { + "file_id": 566, + "content": " bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.bn1 = paddle.nn.BatchNorm(num_channels=8192)\n self.dropout1 = paddle.nn.Dropout(0.5)\n self.fc2 = paddle.nn.Linear(in_features=8192,\n out_features=4096,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.bn2 = paddle.nn.BatchNorm(num_channels=4096)\n self.dropout2 = paddle.nn.Dropout(0.5)\n self.fc3 = paddle.nn.Linear(\n in_features=4096,\n out_features=self.num_classes,\n )\n self.fc4 = paddle.nn.Linear(\n in_features=4096,\n out_features=1,\n )\n def init_weights(self):\n pass\n def forward(self, inputs):\n # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)]", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:196-221" + }, + "7743": { + "file_id": 566, + "content": "This code defines a class for an attention-based LSTM head in PaddleVideo. It includes several fully connected layers, batch normalization, dropout, and two linear layers. The `init_weights` function is not implemented, and the `forward` method takes input data as a tuple of (rgb_data, rgb_len, rgb_mask) and (audio_data, audio_len, audio_mask).", + "type": "comment" + }, + "7744": { + "file_id": 566, + "content": " # deal with features with different length\n # 1. padding to same lenght, make a tensor\n # 2. make a mask tensor with the same shpae with 1\n # 3. compute output using mask tensor, s.t. output is nothing todo with padding\n assert (len(inputs) == self.feature_num\n ), \"Input tensor does not contain {} features\".format(\n self.feature_num)\n att_outs = []\n for i in range(len(inputs)):\n m = getattr(self, \"bi_lstm{}\".format(i))\n lstm_out, _ = m(inputs=inputs[i][0], sequence_length=inputs[i][1])\n lstm_dropout = self.dropout(lstm_out)\n # 3. att_fc\n m = getattr(self, \"att_fc{}\".format(i))\n lstm_weight = m(lstm_dropout)\n # 4. softmax replace start, for it's relevant to sum in time step\n lstm_exp = paddle.exp(lstm_weight)\n lstm_mask = paddle.mean(inputs[i][2], axis=2)\n lstm_mask = paddle.unsqueeze(lstm_mask, axis=2)\n lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask)", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:222-244" + }, + "7745": { + "file_id": 566, + "content": "This code handles features with varying lengths. It pads features to the same length, creates a mask tensor, and computes the output using the mask tensor, effectively ignoring padding values. It asserts that the input tensor contains the expected number of features. It iterates over each feature, performs bi-directional LSTM, applies dropout, calculates weighted sum using attention mechanism, applies softmax to the weights, multiplies by a mask, and stores the results in att_outs.", + "type": "comment" + }, + "7746": { + "file_id": 566, + "content": " lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1)\n exponent = -1\n lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent)\n lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2)\n lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator)\n lstm_weight = lstm_softmax\n # softmax replace end\n lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight)\n # 5. 
sequence_pool's replace start, for it's relevant to sum in time step\n lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask)\n # fea_lens = inputs[i][1]\n # fea_len = int(fea_lens[0])\n lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1)\n # sequence_pool's replace end\n att_outs.append(lstm_pool)\n att_out = paddle.concat(att_outs, axis=1)\n y = self.fc1(att_out)\n y = self.bn1(y)\n y = F.relu(y)\n y = self.dropout1(y)\n y = self.fc2(y)", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:245-267" + }, + "7747": { + "file_id": 566, + "content": "This code segment calculates the attention scores using LSTM and applies them to sequence pooling. It then passes the output through multiple layers of neural networks, including fully connected layers, batch normalization, ReLU activation, and dropout. The final result is stored in `att_out` for further processing.", + "type": "comment" + }, + "7748": { + "file_id": 566, + "content": " y = self.bn2(y)\n y = F.relu(y)\n y = self.dropout2(y)\n out1 = self.fc3(y)\n out1 = F.softmax(out1)\n out2 = self.fc4(y)\n out2 = F.sigmoid(out2)\n return out1, out2\n def loss(self, logits, iou, labels, labels_iou, **kwargs):\n alpha = 10\n softmax_loss = F.cross_entropy(logits, labels)\n labels_iou = labels_iou.astype('float32')\n mse_loss = paddle.sum(F.square_error_cost(iou, labels_iou), axis=-1)\n sum_loss = softmax_loss + alpha * mse_loss\n return sum_loss\n def metric(self, scores, labels):\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=scores, label=labels, k=5)\n return top1, top5", + "type": "code", + "location": "/paddlevideo/modeling/heads/attention_lstm_head.py:268-288" + }, + "7749": { + "file_id": 566, + "content": "The code contains two main components: a LSTM attention head and loss/metric functions. The LSTM attention head computes attention weights for the input sequence, followed by softmax and sigmoid activation functions. The loss function calculates cross-entropy and mean squared error losses, with alpha as a weight parameter. The metric function computes top1 and top5 accuracy.", + "type": "comment" + }, + "7750": { + "file_id": 567, + "content": "/paddlevideo/modeling/heads/base.py", + "type": "filepath" + }, + "7751": { + "file_id": 567, + "content": "The code defines a PaddleVideo classification head base class and function for loss/accuracy calculation, supporting binary, multi-class, and specific MRI scenarios with label smoothing. It also calculates top5 accuracy, hard/soft labels, and performs all-reduce operation in distributed training.", + "type": "summary" + }, + "7752": { + "file_id": 567, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..builder import build_loss\nfrom paddlevideo.utils import get_logger, get_dist_info\nlogger = get_logger(\"paddlevideo\")\nclass BaseHead(nn.Layer):\n \"\"\"Base class for head part.\n All head should subclass it.\n All subclass should overwrite:\n - Methods: ```init_weights```, initializing weights.", + "type": "code", + "location": "/paddlevideo/modeling/heads/base.py:1-34" + }, + "7753": { + "file_id": 567, + "content": "Base class for head part, all subclass should overwrite init_weights method for initializing weights.", + "type": "comment" + }, + "7754": { + "file_id": 567, + "content": " - Methods: ```forward```, forward function.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channels in input feature.\n loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss').\n ls_eps (float): label smoothing epsilon. Default: 0. .\n \"\"\"\n def __init__(\n self,\n num_classes=None,\n in_channels=None,\n loss_cfg=dict(\n name=\"CrossEntropyLoss\"\n ), #TODO(shipping): only pass a name or standard build cfg format.\n #multi_class=False, NOTE(shipping): not supported now.\n ls_eps=0.):\n super().__init__()\n self.num_classes = num_classes\n self.in_channels = in_channels\n self.loss_func = build_loss(loss_cfg)\n #self.multi_class = multi_class NOTE(shipping): not supported now\n self.ls_eps = ls_eps\n @abstractmethod\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n raise NotImplemented", + "type": "code", + "location": "/paddlevideo/modeling/heads/base.py:35-65" + }, + "7755": { + "file_id": 567, + "content": "This code is defining a base class for a classification head in PaddleVideo. It has an `__init__` method that sets the number of classes, input channels, and loss configuration. It also builds the loss function using the provided configuration. The `forward` method must be implemented by any subclasses, as it defines how the head will run during model inference.", + "type": "comment" + }, + "7756": { + "file_id": 567, + "content": " def loss(self, scores, labels, valid_mode=False, if_top5=True, **kwargs):\n \"\"\"Calculate the loss accroding to the model output ```scores```,\n and the target ```labels```.\n Args:\n scores (paddle.Tensor): The output of the model.\n labels (paddle.Tensor): The target output of the model.\n Returns:\n losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional).\n \"\"\"\n if len(labels) == 1: #commonly case\n labels = labels[0]\n losses = dict()\n if self.ls_eps != 0. 
and not valid_mode: # label_smooth\n loss = self.label_smooth_loss(scores, labels, **kwargs)\n else:\n loss = self.loss_func(scores, labels, **kwargs)\n if if_top5:\n top1, top5 = self.get_acc(scores, labels, valid_mode)\n losses['top1'] = top1\n losses['top5'] = top5\n losses['loss'] = loss\n else:", + "type": "code", + "location": "/paddlevideo/modeling/heads/base.py:67-91" + }, + "7757": { + "file_id": 567, + "content": "This function calculates the loss based on model output (scores) and target output (labels). It returns a dictionary containing 'loss', 'top1_acc', and 'top5_acc'. If labels are single, they are expanded. Label smoothing is applied if ls_eps is non-zero and not in valid mode. The loss function is used if label smoothing is not applicable. Top-1 and top-5 accuracy are also calculated if top-5 is set to True.", + "type": "comment" + }, + "7758": { + "file_id": 567, + "content": " top1 = self.get_acc(scores, labels, valid_mode, if_top5)\n losses['top1'] = top1\n losses['loss'] = loss\n return losses\n # MRI目前二分类无top5\n elif len(labels) == 3: # mix_up\n labels_a, labels_b, lam = labels\n lam = lam[0] # get lam value\n losses = dict()\n if self.ls_eps != 0:\n loss_a = self.label_smooth_loss(scores, labels_a, **kwargs)\n loss_b = self.label_smooth_loss(scores, labels_b, **kwargs)\n else:\n loss_a = self.loss_func(scores, labels_a, **kwargs)\n loss_b = self.loss_func(scores, labels_b, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n if if_top5:\n top1a, top5a = self.get_acc(scores, labels_a, valid_mode)\n top1b, top5b = self.get_acc(scores, labels_b, valid_mode)\n top1 = lam * top1a + (1 - lam) * top1b\n top5 = lam * top5a + (1 - lam) * top5b", + "type": "code", + "location": "/paddlevideo/modeling/heads/base.py:92-113" + }, + "7759": { + "file_id": 567, + "content": "This code handles different cases for classification tasks. For binary and multi-class tasks, it calculates top1 accuracy and loss, while for the specific case of MRI with three labels (mix_up), it applies label smoothing or regular loss function and averages results for each sample to get the final loss and top1 accuracy.", + "type": "comment" + }, + "7760": { + "file_id": 567, + "content": " losses['top1'] = top1\n losses['top5'] = top5\n losses['loss'] = loss\n else:\n top1a = self.get_acc(scores, labels_a, valid_mode, if_top5)\n top1b = self.get_acc(scores, labels_b, valid_mode, if_top5)\n top1 = lam * top1a + (1 - lam) * top1b\n losses['top1'] = top1\n losses['loss'] = loss\n return losses\n else:\n raise NotImplemented\n def label_smooth_loss(self, scores, labels, **kwargs):\n \"\"\"\n Args:\n scores (paddle.Tensor): [N, num_classes]\n labels (paddle.Tensor): [N, ]\n Returns:\n paddle.Tensor: [1,]\n \"\"\"\n if paddle.is_compiled_with_custom_device('npu'):\n \"\"\"\n Designed for the lack of temporary operators of NPU,\n main idea is to split smooth loss into uniform distribution loss\n and hard label calculation\n \"\"\"\n hard_loss = (1.0 - self.ls_eps) * F.cross_entropy(scores, labels)", + "type": "code", + "location": "/paddlevideo/modeling/heads/base.py:114-143" + }, + "7761": { + "file_id": 567, + "content": "Function defines a loss function for classification tasks, returning a dictionary of losses including top1 and overall loss. If valid_mode is True, calculates accuracy on validation set, otherwise on training set. 
Top1 accuracies for two sets are combined with specified lambda value to calculate final top1.", + "type": "comment" + }, + "7762": { + "file_id": 567, + "content": " uniform_loss = (self.ls_eps / self.num_classes) * (\n -F.log_softmax(scores, -1).sum(-1).mean(0))\n loss = hard_loss + uniform_loss\n else:\n labels = F.one_hot(labels, self.num_classes)\n labels = F.label_smooth(labels, epsilon=self.ls_eps)\n labels = paddle.squeeze(labels, axis=1)\n loss = self.loss_func(scores, labels, soft_label=True, **kwargs)\n return loss\n def get_acc(self, scores, labels, valid_mode, if_top5=True):\n if if_top5:\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=scores, label=labels, k=5)\n _, world_size = get_dist_info()\n #NOTE(shipping): deal with multi cards validate\n if world_size > 1 and valid_mode: #reduce sum when valid\n paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM)\n top1 = top1 / world_size\n paddle.distributed.all_reduce(", + "type": "code", + "location": "/paddlevideo/modeling/heads/base.py:144-164" + }, + "7763": { + "file_id": 567, + "content": "Code is for computing loss and accuracy in a classification model. If the hard label is given, it calculates uniform_loss based on scores and adds it to hard_loss for total loss. Otherwise, it computes soft labels using one-hot encoding and label smoothing, then calculates loss using the provided loss function with soft_label set to True. The get_acc function computes top1 and top5 accuracy, averages them across all cards if valid mode is on, and returns the accuracy.", + "type": "comment" + }, + "7764": { + "file_id": 567, + "content": " top5, op=paddle.distributed.ReduceOp.SUM)\n top5 = top5 / world_size\n return top1, top5\n else:\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n _, world_size = get_dist_info()\n #NOTE(shipping): deal with multi cards validate\n if world_size > 1 and valid_mode: #reduce sum when valid\n paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM)\n top1 = top1 / world_size\n return top1", + "type": "code", + "location": "/paddlevideo/modeling/heads/base.py:165-178" + }, + "7765": { + "file_id": 567, + "content": "This code calculates the top1 and optionally top5 accuracy for a classification task. If distributed training is enabled, it performs all-reduce operation on the calculated metrics to ensure consistency across multiple cards/devices. The reduction operation used is sum, and the results are divided by the total number of devices (world_size) to obtain an average value.", + "type": "comment" + }, + "7766": { + "file_id": 568, + "content": "/paddlevideo/modeling/heads/bbox_head.py", + "type": "filepath" + }, + "7767": { + "file_id": 568, + "content": "The BBoxHeadAVA class generates classification targets, handles dropout, constructs labels, calculates recall/precision, computes losses, and uses a bbox_head for object detection. The code defines \"get_det_bboxes\" and \"multilabel_accuracy\" functions for detecting boxes and computing recall/precision, respectively. Loss is computed using binary cross-entropy with sigmoid activation.", + "type": "summary" + }, + "7768": { + "file_id": 568, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle \nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom .. import builder\nfrom ..registry import HEADS\n@HEADS.register()\nclass BBoxHeadAVA(nn.Layer):\n \"\"\"Simplest RoI head, with only two fc layers for classification and\n regression respectively. \"\"\"\n def __init__(\n self,\n temporal_pool_type='avg',\n spatial_pool_type='max',\n in_channels=2048,", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:1-32" + }, + "7769": { + "file_id": 568, + "content": "This code defines a BBoxHeadAVA class, which is the simplest RoI (region of interest) head with two fully connected layers for classification and regression. The temporal_pool_type and spatial_pool_type parameters allow users to choose different pooling methods, while in_channels specifies the number of input channels for the network.", + "type": "comment" + }, + "7770": { + "file_id": 568, + "content": " num_classes=81,# The first class is reserved, to classify bbox as pos / neg\n dropout_ratio=0,\n dropout_before_pool=True,\n topk=(3, 5),\n multilabel=True):\n super(BBoxHeadAVA, self).__init__()\n assert temporal_pool_type in ['max', 'avg']\n assert spatial_pool_type in ['max', 'avg']\n self.temporal_pool_type = temporal_pool_type\n self.spatial_pool_type = spatial_pool_type\n self.in_channels = in_channels\n self.num_classes = num_classes\n self.dropout_ratio = dropout_ratio\n self.dropout_before_pool = dropout_before_pool\n self.multilabel = multilabel\n if topk is None:\n self.topk = ()\n elif isinstance(topk, int):\n self.topk = (topk, )\n elif isinstance(topk, tuple):\n assert all([isinstance(k, int) for k in topk])\n self.topk = topk\n else:\n raise TypeError('topk should be int or tuple[int], '\n f'but get {type(topk)}')", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:33-61" + }, + "7771": { + "file_id": 568, + "content": "Class BBoxHeadAVA is being initialized with specified parameters including in_channels, num_classes, dropout_ratio, temporal and spatial pool types, topk values for pooling results, and multilabel flag. 
The code performs checks on the provided parameters to ensure their validity before assigning them to instance variables.", + "type": "comment" + }, + "7772": { + "file_id": 568, + "content": " # Class 0 is ignored when calculaing multilabel accuracy,\n # so topk cannot be equal to num_classes\n assert all([k < num_classes for k in self.topk])\n assert self.multilabel\n in_channels = self.in_channels\n if self.temporal_pool_type == 'avg':\n self.temporal_pool = nn.AdaptiveAvgPool3D((1, None, None))\n else:\n self.temporal_pool = nn.AdaptiveMaxPool3D((1, None, None))\n if self.spatial_pool_type == 'avg':\n self.spatial_pool = nn.AdaptiveAvgPool3D((None, 1, 1))\n else:\n self.spatial_pool = nn.AdaptiveMaxPool3D((None, 1, 1))\n if dropout_ratio > 0:\n self.dropout = nn.Dropout(dropout_ratio)\n weight_attr = paddle.framework.ParamAttr(name=\"weight\",\n initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.01))\n bias_attr = paddle.ParamAttr(name=\"bias\",\n initializer=paddle.nn.initializer.Constant(value=0.0))", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:62-83" + }, + "7773": { + "file_id": 568, + "content": "This code initializes the BBoxHead model, which is a part of PaddleVideo. It sets up different layers such as temporal and spatial pooling layers, and dropout layer if needed. The code also specifies the parameters for weights and biases in these layers.", + "type": "comment" + }, + "7774": { + "file_id": 568, + "content": " self.fc_cls = nn.Linear(in_channels, num_classes, weight_attr=weight_attr, bias_attr=bias_attr)\n self.debug_imgs = None\n def forward(self, x,rois, rois_num):\n roi = paddle.concat(rois)\n roi_x1 = paddle.index_select(roi, index=paddle.to_tensor(0), axis=1)\n roi_x2 = paddle.index_select(roi, index=paddle.to_tensor(2), axis=1)\n roi_w = roi_x2 - roi_x1\n roi_y1 = paddle.index_select(roi, index=paddle.to_tensor(1), axis=1)\n roi_y2 = paddle.index_select(roi, index=paddle.to_tensor(3), axis=1)\n roi_h = roi_y2 - roi_y1\n roi_area = paddle.multiply(roi_w, roi_h)\n A = roi_area\n A1 = paddle.full(A.shape, 1, dtype='int32')\n A2 = paddle.where(A == 0, paddle.zeros_like(A1), A1)\n AE = paddle.expand(A2, [A.shape[0], x.shape[1]])\n rois_num = paddle.to_tensor(rois_num, dtype='int32')\n if self.dropout_before_pool and self.dropout_ratio > 0 :\n x = self.dropout(x)\n x = self.temporal_pool(x)\n x = self.spatial_pool(x)", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:85-106" + }, + "7775": { + "file_id": 568, + "content": "This code defines a bbox_head with a linear layer (fc_cls) for classification and initializes debug images. 
It also performs forward pass by computing ROI features, applying dropout if enabled, and pooling the features.", + "type": "comment" + }, + "7776": { + "file_id": 568, + "content": " if not self.dropout_before_pool and self.dropout_ratio > 0 :\n x = self.dropout(x)\n x = paddle.reshape(x, [x.shape[0], -1])\n x = paddle.multiply(x, paddle.cast(AE,\"float32\"))\n cls_score = self.fc_cls(x)\n # We do not predict bbox, so return None\n return cls_score, None\n def get_targets(self, sampling_results, gt_bboxes, gt_labels, pos_weight):\n pos_proposals = [res.pos_bboxes for res in sampling_results]\n neg_proposals = [res.neg_bboxes for res in sampling_results]\n pos_gt_labels = [res.pos_gt_labels for res in sampling_results]\n cls_reg_targets = self.bbox_target(pos_proposals, neg_proposals,\n pos_gt_labels, pos_weight)\n return cls_reg_targets\n def bbox_target(self, pos_bboxes_list, neg_bboxes_list, gt_labels, pos_weight):\n \"\"\"Generate classification targets for bboxes. \"\"\"\n labels, label_weights = [], []\n pos_weight = 1.0 if pos_weight <= 0 else pos_weight", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:107-126" + }, + "7777": { + "file_id": 568, + "content": "Code snippet is part of a Bounding Box (BBox) head in PaddleVideo, responsible for generating classification targets and handling dropout before pooling. The code also includes functions to generate bbox targets based on positive and negative proposals, ground truth labels, and a positional weight.", + "type": "comment" + }, + "7778": { + "file_id": 568, + "content": " assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels)\n length = len(pos_bboxes_list)\n for i in range(length):\n pos_bboxes = pos_bboxes_list[i]\n neg_bboxes = neg_bboxes_list[i]\n gt_label = gt_labels[i]\n num_pos = pos_bboxes.shape[0]\n if neg_bboxes is not None:\n num_neg = neg_bboxes.shape[0]\n else:\n num_neg = 0\n num_samples = num_pos + num_neg\n neg_label = paddle.zeros([num_neg, gt_label.shape[1]])\n label = paddle.concat([gt_label,neg_label])\n labels.append(label)\n labels = paddle.concat(labels, 0)\n return labels\n def recall_prec(self, pred_vec, target_vec):\n correct = paddle.to_tensor(np.logical_and(pred_vec.numpy(), target_vec.numpy()))\n correct = paddle.where(correct, \n paddle.full(correct.shape,1,dtype='int32'),\n paddle.full(correct.shape,0,dtype='int32'))", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:128-152" + }, + "7779": { + "file_id": 568, + "content": "This code snippet is part of the PaddleVideo library's bbox_head module. It asserts that three lists have equal lengths and then iterates over each list, counting positive (pos) and negative (neg) bounding boxes. It constructs a label by concatenating ground truth labels with zero-filled negatives. The function returns the generated labels for training. 
The recall_prec function compares prediction vectors to target vectors, creating a correct vector before filling it with 1s or 0s based on their logical AND operation.", + "type": "comment" + }, + "7780": { + "file_id": 568, + "content": " recall_correct = paddle.cast(paddle.sum(correct, axis=1), 'float32')\n target_vec = paddle.where(target_vec, \n paddle.full(target_vec.shape,1,dtype='int32'),\n paddle.full(target_vec.shape,0,dtype='int32'))\n recall_target = paddle.cast(paddle.sum(target_vec, axis=1),'float32')\n recall = recall_correct / recall_target\n pred_vec = paddle.where(pred_vec, \n paddle.full(pred_vec.shape,1,dtype='int32'),\n paddle.full(pred_vec.shape,0,dtype='int32'))\n prec_target = paddle.cast(paddle.sum(pred_vec, axis=1) + 1e-6, 'float32')\n prec = recall_correct / prec_target\n recall_mean = paddle.mean(recall)\n prec_mean = paddle.mean(prec)\n return recall_mean, prec_mean\n def multilabel_accuracy(self, pred, target, thr=0.5):\n pred = paddle.nn.functional.sigmoid(pred)\n pred_vec = pred > thr\n target_vec = target > 0.5", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:153-171" + }, + "7781": { + "file_id": 568, + "content": "This code calculates recall and precision for multi-label classification tasks. It first computes recall and precision for each sample, then calculates the mean recall and precision across all samples. The function uses threshold values of 0.5 and 1e-6 for target and prediction vectors to ensure numerical stability.", + "type": "comment" + }, + "7782": { + "file_id": 568, + "content": " recall_thr, prec_thr = self.recall_prec(pred_vec, target_vec)\n recalls, precs = [], []\n for k in self.topk:\n _, pred_label = paddle.topk(pred, k, 1, True, True)\n pred_vec = paddle.full(pred.shape,0,dtype='bool')\n num_sample = pred.shape[0]\n for i in range(num_sample):\n pred_vec[i, pred_label[i].numpy()] = 1 \n recall_k, prec_k = self.recall_prec(pred_vec, target_vec)\n recalls.append(recall_k)\n precs.append(prec_k)\n return recall_thr, prec_thr, recalls, precs\n def loss(self,\n cls_score,\n labels):\n losses = dict()\n if cls_score is not None:\n # Only use the cls_score\n labels = labels[:, 1:]\n pos_inds_bool = paddle.sum(labels, axis=-1) > 0\n pos_inds = paddle.where(paddle.sum(labels, axis=-1) > 0,\n paddle.full([labels.shape[0]],1,dtype='int32'),\n paddle.full([labels.shape[0]],0,dtype='int32'))", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:172-195" + }, + "7783": { + "file_id": 568, + "content": "Code creates a bbox_head for object detection. It computes recall and precision given predicted and target vectors, and returns the results. 
In loss function, it only considers cls_score if available and computes losses based on pos_inds (positive indices) and labels.", + "type": "comment" + }, + "7784": { + "file_id": 568, + "content": " pos_inds = paddle.nonzero(pos_inds, as_tuple=False)\n cls_score = paddle.index_select(cls_score, pos_inds, axis=0)\n cls_score = cls_score[:, 1:] \n labels = paddle.index_select(labels, pos_inds, axis=0)\n bce_loss = F.binary_cross_entropy_with_logits\n loss = bce_loss(cls_score, labels, reduction='none')\n losses['loss'] = paddle.mean(loss)\n recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy(\n cls_score, labels, thr=0.5)\n losses['recall@thr=0.5'] = recall_thr\n losses['prec@thr=0.5'] = prec_thr\n for i, k in enumerate(self.topk):\n losses[f'recall@top{k}'] = recall_k[i]\n losses[f'prec@top{k}'] = prec_k[i]\n return losses\n def get_det_bboxes(self,\n rois,\n cls_score,\n img_shape,\n flip=False,\n crop_quadruple=None,\n cfg=None):", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:196-218" + }, + "7785": { + "file_id": 568, + "content": "This code defines two functions: \"get_det_bboxes\" and \"multilabel_accuracy\". The \"get_det_bboxes\" function takes ROIs, cls_score, img_shape, flip, and crop_quadruple as inputs to calculate detection boxes for each bounding box. The \"multilabel_accuracy\" function calculates recall and precision for different thresholds and top-k values from the given cls_score and labels arrays. The code also computes loss using binary cross-entropy with logits and adds it to the losses dictionary.", + "type": "comment" + }, + "7786": { + "file_id": 568, + "content": " if isinstance(cls_score, list):\n cls_score = sum(cls_score) / float(len(cls_score))\n assert self.multilabel\n m = paddle.nn.Sigmoid()\n scores = m(cls_score)\n bboxes = rois\n return bboxes, scores", + "type": "code", + "location": "/paddlevideo/modeling/heads/bbox_head.py:219-225" + }, + "7787": { + "file_id": 568, + "content": "The code checks if cls_score is a list, calculates the mean of its elements if it's a list, asserts that self.multilabel is True, applies sigmoid activation to cls_score, and assigns resulting scores to variable 'scores'. It also assigns rois to bboxes and returns both bboxes and scores.", + "type": "comment" + }, + "7788": { + "file_id": 569, + "content": "/paddlevideo/modeling/heads/cfbi_head.py", + "type": "filepath" + }, + "7789": { + "file_id": 569, + "content": "The code introduces new layers, initializes Bottleneck with GCT, defines Convolutional Feature Fusion Block and Atrous Spatial Pyramid Pooling modules. The CollaborativeEnsemblerMS class is a neural network architecture with multiple input dimensions, transformer stages, convolutional layers, ReLU activation, and outputs foreground/background logits using ASPP modules.", + "type": "summary" + }, + "7790": { + "file_id": 569, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nclass IA_gate(nn.Layer):\n def __init__(self, in_dim, out_dim):\n super(IA_gate, self).__init__()\n self.IA = nn.Linear(in_dim, out_dim)\n def forward(self, x, IA_head):\n a = self.IA(IA_head)\n a = 1. + paddle.tanh(a)\n a = paddle.unsqueeze(paddle.unsqueeze(a, axis=-1), axis=-1)", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:1-32" + }, + "7791": { + "file_id": 569, + "content": "This code defines a class for the IA_gate layer, which is a part of a computer vision model. It has an input and output dimension and includes a linear layer and a forward function. The forward function calculates the activation (a) by applying a tanh function to the linear layer's output and then unsqueezing it along the axis for multiplication. The result is used in the model's computation.", + "type": "comment" + }, + "7792": { + "file_id": 569, + "content": " x = a * x\n return x\nclass GCT(nn.Layer):\n def __init__(self, num_channels, epsilon=1e-5, mode='l2', after_relu=False):\n super(GCT, self).__init__()\n x1 = paddle.zeros([1, num_channels, 1, 1])\n x2 = paddle.ones([1, num_channels, 1, 1])\n self.alpha = paddle.create_parameter(\n shape=x2.shape,\n dtype=x2.dtype,\n default_initializer=nn.initializer.Assign(x2))\n self.alpha.stop_gradient = False\n self.gamma = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.gamma.stop_gradient = False\n self.beta = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.beta.stop_gradient = False\n self.epsilon = epsilon\n self.mode = mode\n self.after_relu = after_relu\n def forward(self, x):\n if self.mode == 'l2':\n embedding = paddle.pow(", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:33-65" + }, + "7793": { + "file_id": 569, + "content": "This code defines a GCT layer, which is a type of normalization layer for neural networks. It initializes parameters alpha, gamma and beta with specific shapes and default values. The layer also takes in an input x, applies the mode 'l2' operation (pow) on it, and returns the result.", + "type": "comment" + }, + "7794": { + "file_id": 569, + "content": " paddle.sum(paddle.pow(x, 2), axis=[2, 3], keepdim=True) +\n self.epsilon, 0.5) * self.alpha\n norm = self.gamma / paddle.pow(\n (paddle.mean(paddle.pow(embedding, 2), axis=1, keepdim=True) +\n self.epsilon), 0.5)\n elif self.mode == 'l1':\n if not self.after_relu:\n _x = paddle.abs(x)\n else:\n _x = x\n embedding = paddle.sum(_x, axis=(2, 3), keepdim=True) * self.alpha\n norm = self.gamma / (paddle.mean(\n paddle.abs(embedding), axis=1, keepdim=True) + self.epsilon)\n else:\n print('Unknown mode!')\n exit()\n gate = 1. 
+ paddle.tanh(embedding * norm + self.beta)\n return x * gate\nclass Bottleneck(nn.Layer):\n def __init__(self, inplanes, outplanes, stride=1, dilation=1):\n super(Bottleneck, self).__init__()\n expansion = 4\n planes = int(outplanes / expansion)\n self.GCT1 = GCT(inplanes)\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:66-95" + }, + "7795": { + "file_id": 569, + "content": "The code initializes a Bottleneck layer in the PaddleVideo model, with GCT and convolutional layers for feature extraction. It also includes adjustable normalization and activation based on the mode parameter.", + "type": "comment" + }, + "7796": { + "file_id": 569, + "content": " self.bn1 = nn.GroupNorm(num_groups=32, num_channels=planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = nn.GroupNorm(num_groups=32, num_channels=planes)\n self.conv3 = nn.Conv2D(planes,\n planes * expansion,\n kernel_size=1,\n bias_attr=False)\n self.bn3 = nn.GroupNorm(num_groups=32, num_channels=planes * expansion)\n self.relu = nn.ReLU()\n if stride != 1 or inplanes != planes * expansion:\n downsample = nn.Sequential(\n nn.Conv2D(inplanes,\n planes * expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:96-119" + }, + "7797": { + "file_id": 569, + "content": "This code defines a neural network layer that includes batch normalization and convolutional layers, as well as ReLU activation. It has the option for downsampling if necessary.", + "type": "comment" + }, + "7798": { + "file_id": 569, + "content": " nn.GroupNorm(num_groups=32, num_channels=planes * expansion),\n )\n else:\n downsample = None\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n def forward(self, x):\n residual = x\n out = self.GCT1(x)\n out = self.conv1(out)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation):\n super(_ASPPModule, self).__init__()\n self.GCT = GCT(inplanes)\n self.atrous_conv = nn.Conv2D(inplanes,", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:120-160" + }, + "7799": { + "file_id": 569, + "content": "Code initializes a module with 3 Conv2D layers and BatchNorm2D layers. It also includes a GroupNorm layer if num_groups and num_channels are specified, otherwise sets downsample to None. Initializes sublayers and applies Kaiming Normal initialization. Forward function performs convolutions, adds residual connection if applicable, and applies ReLU activation. 
_ASPPModule has GCT and AtrousConv2D layers.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/78.json b/docs/data/78.json new file mode 100644 index 000000000..9909f05b7 --- /dev/null +++ b/docs/data/78.json @@ -0,0 +1,543 @@ +{ + "7800": { + "file_id": 569, + "content": " planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation,\n bias_attr=False)\n self.bn = nn.GroupNorm(num_groups=int(planes / 4), num_channels=planes)\n self.relu = nn.ReLU()\n self._init_weight()\n def forward(self, x):\n x = self.GCT(x)\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass ASPP(nn.Layer):\n def __init__(self):\n super(ASPP, self).__init__()\n inplanes = 512\n dilations = [1, 6, 12, 18]", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:161-193" + }, + "7801": { + "file_id": 569, + "content": "The code defines a Convolutional Feature Fusion Block (CFFB) and an Atrous Spatial Pyramid Pooling (ASPP) module. The CFFB consists of group convolution, batch normalization, and ReLU activation layers. The ASPP module has four pathways with different dilation rates, each followed by a group convolution, batch normalization, and ReLU activation.", + "type": "comment" + }, + "7802": { + "file_id": 569, + "content": " self.aspp1 = _ASPPModule(inplanes,\n 128,\n 1,\n padding=0,\n dilation=dilations[0])\n self.aspp2 = _ASPPModule(inplanes,\n 128,\n 3,\n padding=dilations[1],\n dilation=dilations[1])\n self.aspp3 = _ASPPModule(inplanes,\n 128,\n 3,\n padding=dilations[2],\n dilation=dilations[2])\n self.aspp4 = _ASPPModule(inplanes,\n 128,\n 3,\n padding=dilations[3],\n dilation=dilations[3])\n self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 128, 1, stride=1, bias_attr=False), nn.ReLU())", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:195-218" + }, + "7803": { + "file_id": 569, + "content": "This code initializes four ASPPModules and a global average pooling layer in the CFBI head model for feature extraction and pooling. 
The ASPPModules have different dilation rates based on the specified dilations list, while the global_avg_pool performs adaptive averaging and convolution to extract global features.", + "type": "comment" + }, + "7804": { + "file_id": 569, + "content": " self.GCT = GCT(640)\n self.conv1 = nn.Conv2D(640, 256, 1, bias_attr=False)\n self.bn1 = nn.GroupNorm(num_groups=32, num_channels=256)\n self.relu = nn.ReLU()\n self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat([x1, x2, x3, x4, x5], axis=1)\n x = self.GCT(x)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:220-251" + }, + "7805": { + "file_id": 569, + "content": "The code initializes a class with multiple layers for feature extraction and processing, using Conv2D, GroupNorm, ReLU activation functions, and Global Average Pooling. The forward function combines features from different ASPP modules and passes them through GCT, convolution, batch normalization, and ReLU for final output. Initializes the weight of each layer with specific initializers.", + "type": "comment" + }, + "7806": { + "file_id": 569, + "content": "@HEADS.register()\nclass CollaborativeEnsemblerMS(nn.Layer):\n def __init__(\n self,\n model_semantic_embedding_dim=256,\n model_multi_local_distance=[[4, 8, 12, 16, 20, 24],\n [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]],\n model_head_embedding_dim=256,\n model_refine_channels=64,\n model_low_level_inplanes=256,\n ):\n super(CollaborativeEnsemblerMS, self).__init__()\n in_dim_4x = model_semantic_embedding_dim * 3 + 3 + 2 * len(\n model_multi_local_distance[0])\n in_dim_8x = model_semantic_embedding_dim * 3 + 3 + 2 * len(\n model_multi_local_distance[1])\n in_dim_16x = model_semantic_embedding_dim * 3 + 3 + 2 * len(\n model_multi_local_distance[2])\n attention_dim = model_semantic_embedding_dim * 4\n embed_dim = model_head_embedding_dim\n refine_dim = model_refine_channels\n low_level_dim = model_low_level_inplanes\n IA_in_dim = attention_dim\n self.relu = nn.ReLU()", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:254-279" + }, + "7807": { + "file_id": 569, + "content": "The code defines a CollaborativeEnsemblerMS class within the PaddleVideo framework. It has multiple input dimensions (4x, 8x, and 16x) for semantic embedding, local distance, and attention dimension. 
The class also includes an instance of ReLU activation function.", + "type": "comment" + }, + "7808": { + "file_id": 569, + "content": " # stage 1\n self.S1_IA1 = IA_gate(IA_in_dim, in_dim_4x)\n self.S1_layer1 = Bottleneck(in_dim_4x, embed_dim)\n self.S1_IA2 = IA_gate(IA_in_dim, embed_dim)\n self.S1_layer2 = Bottleneck(embed_dim, embed_dim, 1, 2)\n # stage2\n self.S2_IA1 = IA_gate(IA_in_dim, embed_dim)\n self.S2_layer1 = Bottleneck(embed_dim, embed_dim * 2, 2)\n self.S2_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_8x)\n self.S2_layer2 = Bottleneck(embed_dim * 2 + in_dim_8x, embed_dim * 2, 1,\n 2)\n self.S2_IA3 = IA_gate(IA_in_dim, embed_dim * 2)\n self.S2_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4)\n # stage3\n self.S3_IA1 = IA_gate(IA_in_dim, embed_dim * 2)\n self.S3_layer1 = Bottleneck(embed_dim * 2, embed_dim * 2, 2)\n self.S3_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_16x)\n self.S3_layer2 = Bottleneck(embed_dim * 2 + in_dim_16x, embed_dim * 2,\n 1, 2)", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:281-306" + }, + "7809": { + "file_id": 569, + "content": "This code initializes multiple layers for different stages of a transformer model. Each stage consists of several IA_gate and Bottleneck layers, with varying input and output dimensions. The stages progressively increase the embedding dimension, incorporating additional inputs along the way.", + "type": "comment" + }, + "7810": { + "file_id": 569, + "content": " self.S3_IA3 = IA_gate(IA_in_dim, embed_dim * 2)\n self.S3_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4)\n self.ASPP_IA = IA_gate(IA_in_dim, embed_dim * 2)\n self.ASPP = ASPP()\n # Decoder\n self.GCT_sc = GCT(low_level_dim + embed_dim)\n self.conv_sc = nn.Conv2D(low_level_dim + embed_dim,\n refine_dim,\n 1,\n bias_attr=False)\n self.bn_sc = nn.GroupNorm(num_groups=int(refine_dim / 4),\n num_channels=refine_dim)\n self.relu = nn.ReLU()\n self.IA10 = IA_gate(IA_in_dim, embed_dim + refine_dim)\n self.conv1 = nn.Conv2D(embed_dim + refine_dim,\n int(embed_dim / 2),\n kernel_size=3,\n padding=1,\n bias_attr=False)\n self.bn1 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2))\n self.IA11 = IA_gate(IA_in_dim, int(embed_dim / 2))", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:308-332" + }, + "7811": { + "file_id": 569, + "content": "This code is defining various components of a model for feature extraction and fusion. It includes IA_gate, Bottleneck, GCT, ASPP, nn.Conv2D, GroupNorm, ReLU layers and their configurations. 
The model has separate modules for encoding and decoding stages to process low-level and high-level features respectively.", + "type": "comment" + }, + "7812": { + "file_id": 569, + "content": " self.conv2 = nn.Conv2D(int(embed_dim / 2),\n int(embed_dim / 2),\n kernel_size=3,\n padding=1,\n bias_attr=False)\n self.bn2 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2))\n # Output\n self.IA_final_fg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1)\n self.IA_final_bg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1)\n self.conv_sc.weight.data = nn.initializer.KaimingNormal()\n self.conv1.weight.data = nn.initializer.KaimingNormal()\n self.conv2.weight.data = nn.initializer.KaimingNormal()\n def forward(self, all_x, all_IA_head=None, low_level_feat=None):\n x_4x, x_8x, x_16x = all_x\n IA_head = all_IA_head[0]\n # stage 1\n x = self.S1_IA1(x_4x, IA_head)\n x = self.S1_layer1(x)\n x = self.S1_IA2(x, IA_head)\n x = self.S1_layer2(x)\n low_level_feat = paddle.concat(\n [paddle.expand(low_level_feat, [x.shape[0], -1, -1, -1]), x],", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:333-360" + }, + "7813": { + "file_id": 569, + "content": "This code defines a neural network architecture for a computer vision task. It includes convolutional layers, batch normalization, and linear layers. The forward function applies these operations to input features at different scales (4x, 8x, 16x) and concatenates the results. The KaimingNormal initialization is used to set the weights of the convolution layers.", + "type": "comment" + }, + "7814": { + "file_id": 569, + "content": " axis=1)\n # stage 2\n x = self.S2_IA1(x, IA_head)\n x = self.S2_layer1(x)\n x = paddle.concat([x, x_8x], axis=1)\n x = self.S2_IA2(x, IA_head)\n x = self.S2_layer2(x)\n x = self.S2_IA3(x, IA_head)\n x = self.S2_layer3(x)\n # stage 3\n x = self.S3_IA1(x, IA_head)\n x = self.S3_layer1(x)\n x = paddle.concat([x, x_16x], axis=1)\n x = self.S3_IA2(x, IA_head)\n x = self.S3_layer2(x)\n x = self.S3_IA3(x, IA_head)\n x = self.S3_layer3(x)\n # ASPP + Decoder\n x = self.ASPP_IA(x, IA_head)\n x = self.ASPP(x)\n x = self.decoder(x, low_level_feat, IA_head)\n fg_logit = self.IA_logit(x, IA_head, self.IA_final_fg)\n bg_logit = self.IA_logit(x, IA_head, self.IA_final_bg)\n pred = self.augment_background_logit(fg_logit, bg_logit)\n return pred\n def IA_logit(self, x, IA_head, IA_final):\n n, c, h, w = x.shape\n x = paddle.reshape(x, [1, n * c, h, w])\n IA_output = IA_final(IA_head)", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:361-401" + }, + "7815": { + "file_id": 569, + "content": "This code defines a neural network architecture for instance segmentation. It consists of multiple stages and an ASPP (Atrous Spatial Pyramid Pooling) module. The IA_logit function is used to output foreground and background logits. 
The final output, 'pred', is the instance segmentation prediction after applying background augmentation.", + "type": "comment" + }, + "7816": { + "file_id": 569, + "content": " IA_weight = IA_output[:, :c]\n IA_bias = IA_output[:, -1]\n IA_weight = paddle.reshape(IA_weight, [n, c, 1, 1])\n IA_bias = paddle.reshape(IA_bias, [-1])\n logit = paddle.reshape(\n F.conv2d(x, weight=IA_weight, bias=IA_bias, groups=n), [n, 1, h, w])\n return logit\n def decoder(self, x, low_level_feat, IA_head):\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bicubic',\n align_corners=True)\n low_level_feat = self.GCT_sc(low_level_feat)\n low_level_feat = self.conv_sc(low_level_feat)\n low_level_feat = self.bn_sc(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = paddle.concat([x, low_level_feat], axis=1)\n x = self.IA10(x, IA_head)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.IA11(x, IA_head)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu(x)\n return x", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:402-433" + }, + "7817": { + "file_id": 569, + "content": "The code defines two functions: `IA_head` and `decoder`. The `IA_head` function takes an input, applies a convolution with a weight and a bias, and reshapes the output. The `decoder` function combines an input image and a low-level feature, passes it through several convolutional layers with batch normalization and ReLU activation, then applies two IA heads.", + "type": "comment" + }, + "7818": { + "file_id": 569, + "content": " def augment_background_logit(self, fg_logit, bg_logit):\n # We augment the logit of absolute background by using the relative background logit of all the\n # foreground objects.\n obj_num = fg_logit.shape[0]\n pred = fg_logit\n if obj_num > 1:\n bg_logit = bg_logit[1:obj_num, :, :, :]\n aug_bg_logit = paddle.min(bg_logit, axis=0, keepdim=True)\n pad = paddle.expand(paddle.zeros(aug_bg_logit.shape),\n [obj_num - 1, -1, -1, -1])\n aug_bg_logit = paddle.concat([aug_bg_logit, pad], axis=0)\n pred = pred + aug_bg_logit\n pred = paddle.transpose(pred, [1, 0, 2, 3])\n return pred", + "type": "code", + "location": "/paddlevideo/modeling/heads/cfbi_head.py:435-448" + }, + "7819": { + "file_id": 569, + "content": "This function takes two logits, fg_logit and bg_logit, and augments the absolute background logit by using relative background logits from all foreground objects. If there are more than one foreground object, it calculates the minimum of their relative background logits, pads with zeros to match the number of original background logits, concatenates, and adds this augmented background logit to the original fg_logit. The output is then transposed before being returned.", + "type": "comment" + }, + "7820": { + "file_id": 570, + "content": "/paddlevideo/modeling/heads/ctrgcn_head.py", + "type": "filepath" + }, + "7821": { + "file_id": 570, + "content": "The CTRGCNHead class is a neural network head for the CTR-GCN model in PaddleVideo library, containing layers initialization, weight initialization, and forward pass function definition. The ctrgcn_head class returns the result of passing input x through a fully connected layer (fc) for feature processing and prediction.", + "type": "summary" + }, + "7822": { + "file_id": 570, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass CTRGCNHead(BaseHead):\n \"\"\"\n Head for CTR-GCN model.\n Args:\n in_channels: int, input feature channels. Default: 64.\n num_classes: int, output the number of classes.\n drop_out: float, dropout ratio of layer. Default: 0.\n \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/heads/ctrgcn_head.py:1-32" + }, + "7823": { + "file_id": 570, + "content": "This code snippet is a part of the PaddleVideo library, specifically the CTRGCNHead class. It is a neural network head for the CTR-GCN model that takes in input feature channels and outputs the number of classes, with an optional dropout ratio. The code imports necessary libraries, registers the class under the HEADS registry, and defines the class itself as part of the BaseHead class.", + "type": "comment" + }, + "7824": { + "file_id": 570, + "content": " def __init__(self, in_channels=64, num_classes=10, drop_out=0, **kwargs):\n super().__init__(num_classes, in_channels, **kwargs)\n self.in_channels = in_channels\n self.drop_out = drop_out\n self.fc = nn.Linear(self.in_channels * 4, self.num_classes)\n if drop_out:\n self.drop_out = nn.Dropout(self.drop_out)\n else:\n self.drop_out = lambda x: x\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer.weight,\n 'Normal',\n mean=0.0,\n std=math.sqrt(2. / self.num_classes))\n def forward(self, output_patch):\n \"\"\"Define how the head is going to run.\n \"\"\"\n x, N, M = output_patch\n # N*M,C,T,V\n _, c_new, T, V = x.shape\n x = paddle.reshape(x, shape=[N, M, c_new, T * V])\n x = x.mean(3).mean(1)\n x = self.drop_out(x)", + "type": "code", + "location": "/paddlevideo/modeling/heads/ctrgcn_head.py:34-63" + }, + "7825": { + "file_id": 570, + "content": "Class constructor for a neural network head with optional dropout. Initializes layers, applies weight initialization, and defines the forward pass function.", + "type": "comment" + }, + "7826": { + "file_id": 570, + "content": " return self.fc(x)", + "type": "code", + "location": "/paddlevideo/modeling/heads/ctrgcn_head.py:65-65" + }, + "7827": { + "file_id": 570, + "content": "This code snippet is from the ctrgcn_head class, and it returns the result of passing the input x through a fully connected layer (fc). The purpose might be to process the features extracted by the previous layers in the model for making predictions or generating output.", + "type": "comment" + }, + "7828": { + "file_id": 571, + "content": "/paddlevideo/modeling/heads/i3d_head.py", + "type": "filepath" + }, + "7829": { + "file_id": 571, + "content": "This code defines an I3D classification head in PaddleVideo with options for loss, pooling type, dropout ratio and initialization standard deviation. 
It performs adaptive average pooling, dropout, linear layer, and has a learning rate of 10.0.", + "type": "summary" + }, + "7830": { + "file_id": 571, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass I3DHead(BaseHead):\n \"\"\"Classification head for I3D.\n Args:\n num_classes (int): Number of classes to be classified.\n in_channels (int): Number of channels in input feature.\n loss_cls (dict): Config for building loss.", + "type": "code", + "location": "/paddlevideo/modeling/heads/i3d_head.py:1-31" + }, + "7831": { + "file_id": 571, + "content": "This code snippet imports necessary libraries and defines a class called \"I3DHead\" which is a classification head for I3D models. It takes in arguments like the number of classes to be classified, the input channel size, and configuration for building loss. The code is part of PaddleVideo library and registered with HEADS registry.", + "type": "comment" + }, + "7832": { + "file_id": 571, + "content": " Default: dict(name='CrossEntropyLoss')\n spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.\n drop_ratio (float): Probability of dropout layer. Default: 0.5.\n std (float): Std value for Initiation. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to be used to initialize\n the head.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n spatial_type='avg',\n drop_ratio=0.5,\n std=0.01,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.spatial_type = spatial_type\n self.drop_ratio = drop_ratio\n self.stdv = std\n if self.drop_ratio != 0:\n self.dropout = nn.Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = nn.Linear(\n self.in_channels,\n self.num_classes,", + "type": "code", + "location": "/paddlevideo/modeling/heads/i3d_head.py:32-59" + }, + "7833": { + "file_id": 571, + "content": "Class constructor for a head, with optional parameters for loss configuration, pooling type in spatial dimension, dropout ratio, and standard deviation for initialization. 
Initializes the base class, sets attributes, and optionally adds a Dropout layer if drop_ratio is non-zero.", + "type": "comment" + }, + "7834": { + "file_id": 571, + "content": " weight_attr=ParamAttr(learning_rate=10.0),\n bias_attr=ParamAttr(learning_rate=10.0),\n )\n if self.spatial_type == 'avg':\n # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels.\n self.avg_pool = nn.AdaptiveAvgPool3D((1, 1, 1))\n else:\n self.avg_pool = None\n def init_weights(self):\n \"\"\"Initiate the parameters from scratch.\"\"\"\n weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)\n def forward(self, x):\n \"\"\"Defines the computation performed at every call.\n Args:\n x (torch.Tensor): The input data.\n Returns:\n torch.Tensor: The classification scores for input samples.\n \"\"\"\n # [N, in_channels, 4, 7, 7]\n if self.avg_pool is not None:\n x = self.avg_pool(x)\n # [N, in_channels, 1, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N, in_channels, 1, 1, 1]\n N = paddle.shape(x)[0]\n x = x.reshape([N, -1])", + "type": "code", + "location": "/paddlevideo/modeling/heads/i3d_head.py:60-91" + }, + "7835": { + "file_id": 571, + "content": "Function: I3D Head\nPurpose: To process and classify the extracted features from an I3D network\nKey Operations: Adaptive average pooling, dropout, and a linear layer for classification\nLearning Rate: Set to 10.0 for weights and bias parameters", + "type": "comment" + }, + "7836": { + "file_id": 571, + "content": " # [N, in_channels]\n cls_score = self.fc(x)\n # [N, num_classes]\n return cls_score", + "type": "code", + "location": "/paddlevideo/modeling/heads/i3d_head.py:92-95" + }, + "7837": { + "file_id": 571, + "content": "This code snippet represents the output layer of a classification head in PaddleVideo. It takes input 'x' and passes it through 'self.fc', which is presumably a fully connected (FC) layer, producing 'cls_score'. The result is then returned as the final classification score for each sample. The shape of the output is [N, num_classes], where N is the batch size.", + "type": "comment" + }, + "7838": { + "file_id": 572, + "content": "/paddlevideo/modeling/heads/movinet_head.py", + "type": "filepath" + }, + "7839": { + "file_id": 572, + "content": "Class MoViNetHead extends BaseHead and registers itself with the HEADS registry. It initializes without any specific parameters and its forward function simply returns input 'x' without any modifications.", + "type": "summary" + }, + "7840": { + "file_id": 572, + "content": "import collections.abc\ncontainer_abcs = collections.abc\nfrom ..registry import HEADS\nfrom .base import BaseHead\nfrom ..builder import build_loss\n@HEADS.register()\nclass MoViNetHead(BaseHead):\n def __init__(self):\n super().__init__()\n def forward(self, x, *args):\n return x", + "type": "code", + "location": "/paddlevideo/modeling/heads/movinet_head.py:1-15" + }, + "7841": { + "file_id": 572, + "content": "Class MoViNetHead extends BaseHead and registers itself with the HEADS registry. It initializes without any specific parameters and its forward function simply returns input 'x' without any modifications.", + "type": "comment" + }, + "7842": { + "file_id": 573, + "content": "/paddlevideo/modeling/heads/ms_tcn_head.py", + "type": "filepath" + }, + "7843": { + "file_id": 573, + "content": "The code defines a model and calculates loss, F1 score, and edit scores for recognition tasks. 
It retrieves label start/end times from recognized and ground truth sequences, then iterates through labels to calculate F-score for overlapping segments, updating tp, fp, fn counts and returning the F-score as a float value.", + "type": "summary" + }, + "7844": { + "file_id": 573, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom paddle import ParamAttr\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass MSTCNHead(BaseHead):\n def __init__(self, num_classes, in_channels):\n super().__init__(num_classes, in_channels)\n self.ce = nn.CrossEntropyLoss(ignore_index=-100)\n self.mse = nn.MSELoss(reduction='none')", + "type": "code", + "location": "/paddlevideo/modeling/heads/ms_tcn_head.py:1-33" + }, + "7845": { + "file_id": 573, + "content": "This code defines the MSTCNHead class, a head for PaddleVideo's MS-TCN model. It inherits from BaseHead and initializes a CrossEntropyLoss and Mean Squared Error loss function.", + "type": "comment" + }, + "7846": { + "file_id": 573, + "content": " self.num_classes = num_classes\n # cls score\n self.overlap = 0.5\n def forward(self, x):\n \"\"\"MS-TCN no head\n \"\"\"\n return x\n def loss(self, output, video_gt):\n \"\"\"calculate loss\n \"\"\"\n output_transpose = paddle.transpose(output, [2, 0, 1])\n ce_x = paddle.reshape(output_transpose,\n (output_transpose.shape[0] *\n output_transpose.shape[1], self.num_classes))\n ce_y = video_gt[0, :]\n ce_loss = self.ce(ce_x, ce_y)\n loss = ce_loss\n mse = self.mse(F.log_softmax(output[:, :, 1:], axis=1),\n F.log_softmax(output.detach()[:, :, :-1], axis=1))\n mse = paddle.clip(mse, min=0, max=16)\n mse_loss = 0.15 * paddle.mean(mse)\n loss += mse_loss\n return loss\n def get_F1_score(self, predicted, groundTruth):\n recog_content = list(predicted.numpy())\n gt_content = list(groundTruth[0].numpy())\n # cls score\n correct = 0", + "type": "code", + "location": "/paddlevideo/modeling/heads/ms_tcn_head.py:34-68" + }, + "7847": { + "file_id": 573, + "content": "The code defines a class for the MS-TCN head, which calculates loss and F1 score. The forward function returns the input as is. The loss function transposes output tensor, computes cross-entropy (CE) loss, and adds mean squared error (MSE) loss with weight 0.15. 
The get_F1_score function converts predicted and ground truth to lists, counts correct classifications, and returns F1 score.", + "type": "comment" + }, + "7848": { + "file_id": 573, + "content": " total = 0\n edit = 0\n for i in range(len(gt_content)):\n total += 1\n if gt_content[i] == recog_content[i]:\n correct += 1\n edit_num = self.edit_score(recog_content, gt_content)\n edit += edit_num\n tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap)\n # cls metric\n precision = tp / float(tp + fp)\n recall = tp / float(fp + fn)\n if precision + recall > 0.0:\n f1 = 2.0 * (precision * recall) / (precision + recall)\n else:\n f1 = 0.0\n f1 = np.nan_to_num(f1)\n return f1\n def get_labels_start_end_time(self, frame_wise_labels):\n labels = []\n starts = []\n ends = []\n last_label = frame_wise_labels[0]\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n labels.append(frame_wise_labels[i])\n starts.append(i)", + "type": "code", + "location": "/paddlevideo/modeling/heads/ms_tcn_head.py:69-105" + }, + "7849": { + "file_id": 573, + "content": "This code calculates the F1 score based on a given sequence of content and then extracts labels, start times, and end times from frame-wise labels. It iterates through the sequence to determine correct and incorrect elements, as well as false positives and negatives for the F1 score calculation. The extracted labels, starts, and ends are stored in separate lists.", + "type": "comment" + }, + "7850": { + "file_id": 573, + "content": " ends.append(i)\n last_label = frame_wise_labels[i]\n ends.append(i + 1)\n return labels, starts, ends\n def levenstein(self, p, y, norm=False):\n m_row = len(p)\n n_col = len(y)\n D = np.zeros([m_row + 1, n_col + 1], np.float)\n for i in range(m_row + 1):\n D[i, 0] = i\n for i in range(n_col + 1):\n D[0, i] = i\n for j in range(1, n_col + 1):\n for i in range(1, m_row + 1):\n if y[j - 1] == p[i - 1]:\n D[i, j] = D[i - 1, j - 1]\n else:\n D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,\n D[i - 1, j - 1] + 1)\n if norm:\n score = (1 - D[-1, -1] / max(m_row, n_col)) * 100\n else:\n score = D[-1, -1]\n return score\n def edit_score(self, recognized, ground_truth, norm=True):\n P, _, _ = self.get_labels_start_end_time(recognized)\n Y, _, _ = self.get_labels_start_end_time(ground_truth)", + "type": "code", + "location": "/paddlevideo/modeling/heads/ms_tcn_head.py:106-137" + }, + "7851": { + "file_id": 573, + "content": "This code defines two functions: \"labels_start_end\" and \"edit_score\". The first function takes in frame-wise labels, starts, and ends and returns the labels, starts, and ends. The second function calculates the edit score between recognized text and ground truth using a dynamic programming approach, specifically Levenshtein distance algorithm. 
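To make the segment-level edit score concrete, a minimal pure-Python sketch is shown below: frame-wise labels are collapsed into a segment sequence, and a Levenshtein distance (optionally normalized) is computed between the predicted and ground-truth segment sequences. Helper names are invented for this sketch; it is not the repository implementation.

# Pure-Python sketch of the segmental edit score described above.
def to_segments(frame_labels):
    return [lab for i, lab in enumerate(frame_labels)
            if i == 0 or lab != frame_labels[i - 1]]

def edit_score(pred_frames, gt_frames, norm=True):
    p, y = to_segments(pred_frames), to_segments(gt_frames)
    D = [[0] * (len(y) + 1) for _ in range(len(p) + 1)]
    for i in range(len(p) + 1):
        D[i][0] = i
    for j in range(len(y) + 1):
        D[0][j] = j
    for i in range(1, len(p) + 1):
        for j in range(1, len(y) + 1):
            cost = 0 if p[i - 1] == y[j - 1] else 1
            D[i][j] = min(D[i - 1][j] + 1, D[i][j - 1] + 1, D[i - 1][j - 1] + cost)
    dist = D[len(p)][len(y)]
    return (1 - dist / max(len(p), len(y))) * 100 if norm else dist

print(edit_score([0, 0, 1, 1, 2], [0, 0, 1, 2, 2]))   # 100.0, segment order is identical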
It normalizes the scores if norm is True, and returns the unnormalized score otherwise.", + "type": "comment" + }, + "7852": { + "file_id": 573, + "content": " return self.levenstein(P, Y, norm)\n def f_score(self, recognized, ground_truth, overlap):\n p_label, p_start, p_end = self.get_labels_start_end_time(recognized)\n y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth)\n tp = 0\n fp = 0\n hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(\n p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(\n p_start[j], y_start)\n IoU = (1.0 * intersection / union) * (\n [p_label[j] == y_label[x] for x in range(len(y_label))])\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n return float(tp), float(fp), float(fn)", + "type": "code", + "location": "/paddlevideo/modeling/heads/ms_tcn_head.py:138-165" + }, + "7853": { + "file_id": 573, + "content": "This code calculates the F-score for overlapping segments of labels in two sequences. It first retrieves the start and end times for each label in the recognized and ground truth sequences, then iterates through each label in the recognized sequence to calculate the intersection and union between the current recognized segment and each segment in the ground truth sequence. The code then determines if there is an overlap between the segments, updates the true positive (tp), false positive (fp), and false negative (fn) counts accordingly, and finally returns the F-score as a float value.", + "type": "comment" + }, + "7854": { + "file_id": 574, + "content": "/paddlevideo/modeling/heads/pptimesformer_head.py", + "type": "filepath" + }, + "7855": { + "file_id": 574, + "content": "The code defines a PaddlePaddle class \"ppTimeSformerHead\" as a head for the TimeSformer model, extending BaseHead and initializing fully connected layers with truncated normal distribution.", + "type": "summary" + }, + "7856": { + "file_id": 574, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom paddle.nn import Linear\nfrom ..registry import HEADS\nfrom ..weight_init import trunc_normal_, weight_init_\nfrom .base import BaseHead\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n@HEADS.register()\nclass ppTimeSformerHead(BaseHead):\n \"\"\"TimeSformerHead Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptimesformer_head.py:1-30" + }, + "7857": { + "file_id": 574, + "content": "This code defines a class called \"ppTimeSformerHead\" which is a head for the TimeSformer model in PaddlePaddle framework. It extends the BaseHead class, and has attributes such as num_classes, in_channels. 
The class also registers itself in the HEADS registry of the PaddleVideo module. The code uses Linear and ParamAttr from paddle.nn and weight_init from .base, and imports trunc_normal_ and L2Decay from other modules.", + "type": "comment" + }, + "7858": { + "file_id": 574, + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n std=0.02,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.std = std\n self.fc = Linear(self.in_channels,\n self.num_classes,\n bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'TruncatedNormal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.0,\n std=self.std)\n # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptimesformer_head.py:31-58" + }, + "7859": { + "file_id": 574, + "content": "The code defines a class named \"PPTimesformerHead\" with an __init__ method that takes parameters such as num_classes, in_channels, loss_cfg (with default value), std (with default 0.02), and optional kwargs. It initializes superclass attributes, sets self.std, and initializes the FC layer parameters using weight_init_. The TruncatedNormal initialization method is used with specific attribute names.", + "type": "comment" + }, + "7860": { + "file_id": 574, + "content": " trunc_normal_(self.fc.weight, std=self.std)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # XXX: check dropout location!\n # x.shape = [N, embed_dim]\n score = self.fc(x)\n # [N, num_class]\n # x = F.softmax(x) # NOTE remove\n return score", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptimesformer_head.py:59-74" + }, + "7861": { + "file_id": 574, + "content": "The code defines a head for the PPTimesformer model. It initializes the fully connected layer (fc) with truncated normal distribution and defines the forward pass, which involves passing input through fc to generate scores for classification tasks.", + "type": "comment" + }, + "7862": { + "file_id": 575, + "content": "/paddlevideo/modeling/heads/pptsm_head.py", + "type": "filepath" + }, + "7863": { + "file_id": 575, + "content": "The code defines a ppTSMHead class, a subclass of TSNHead with L2Decay regularizer. It initializes the PPTSM model head with average pooling and dropout, defining an 'init_weights' function for FC layer parameters. This is part of the PaddlePaddle Video library.", + "type": "summary" + }, + "7864": { + "file_id": 575, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn import Linear\nfrom paddle.regularizer import L2Decay\nfrom .tsn_head import TSNHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass ppTSMHead(TSNHead):\n \"\"\" ppTSM Head\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsm_head.py:1-31" + }, + "7865": { + "file_id": 575, + "content": "This code defines a ppTSMHead class, which is a subclass of TSNHead. It has arguments such as num_classes, in_channels, and loss_cfg. The class is registered under the HEADS registry for future use. The L2Decay regularizer is used, and weight initialization is performed using the weight_init function.", + "type": "comment" + }, + "7866": { + "file_id": 575, + "content": " drop_ratio(float): drop ratio. Default: 0.8.\n std(float): Std(Scale) value in normal initilizar. Default: 0.001.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(\n self,\n num_classes,\n in_channels, # NOTE: 2048 for >= R50, 512 for <= R34\n drop_ratio=0.8,\n std=0.01,\n data_format=\"NCHW\",\n num_seg=8,\n **kwargs):\n super().__init__(num_classes,\n in_channels,\n drop_ratio=drop_ratio,\n std=std,\n data_format=data_format,\n **kwargs)\n self.fc = Linear(self.in_channels,\n self.num_classes,\n weight_attr=ParamAttr(learning_rate=5.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0,\n regularizer=L2Decay(0.0)))", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsm_head.py:32-58" + }, + "7867": { + "file_id": 575, + "content": "This code defines a class with an __init__ method that takes arguments for number of classes, input channels, dropout ratio, std value, data format, and optional keyword arguments. 
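For context, the distinctive step in this head's forward pass is the segment consensus: spatial average pooling, a reshape to [N, num_seg, C], a mean over the segment axis, and then the fc layer. The NumPy sketch below illustrates that flow under assumed shapes; it is not the library code.

# NumPy sketch of the ppTSM-style segment-consensus step (illustrative only).
import numpy as np

def segment_consensus_scores(feat, num_seg, fc_w, fc_b):
    """feat: [N * num_seg, C, H, W] -> class scores [N, num_classes]."""
    x = feat.mean(axis=(2, 3))                              # spatial average pooling -> [N*num_seg, C]
    x = x.reshape(-1, num_seg, x.shape[1]).mean(axis=1)     # average over segments -> [N, C]
    return x @ fc_w + fc_b                                  # fc layer -> [N, num_classes]

feat = np.random.rand(2 * 8, 2048, 7, 7).astype("float32")  # N=2, num_seg=8
w, b = np.zeros((2048, 400), "float32"), np.zeros(400, "float32")
print(segment_consensus_scores(feat, 8, w, b).shape)        # (2, 400)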
It initializes the base class and sets up a linear layer (self.fc) with specified learning rates and regularizers.", + "type": "comment" + }, + "7868": { + "file_id": 575, + "content": " self.stdv = std\n self.num_seg = num_seg\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)\n def forward(self, x, num_seg=None):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n #XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N * num_seg, in_channels, 1, 1]\n num_seg = num_seg if num_seg is not None else self.num_seg\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsm_head.py:59-87" + }, + "7869": { + "file_id": 575, + "content": "The code initializes a head for the PPTSM model, which includes an average pooling layer, dropout if specified, and reshaping operations. It then returns the classification scores for input samples. The 'init_weights' function initializes the FC layer parameters with normal distribution using the given standard deviation (stdv).", + "type": "comment" + }, + "7870": { + "file_id": 575, + "content": " # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n #x = F.softmax(x) #NOTE remove\n return score", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsm_head.py:88-92" + }, + "7871": { + "file_id": 575, + "content": "This code snippet is part of the PaddlePaddle Video (PaddleVideo) library. It defines a function within a class called \"pptsm_head\". The function takes input 'x' and performs a fully connected operation using \"self.fc\", returning the scores in the form of \"score\" with dimensions [N, in_channels]. The line \"#x = F.softmax(x) #NOTE remove\" was likely removed from the code, but its original purpose would have been to apply softmax function on 'x' and return the normalized probabilities.", + "type": "comment" + }, + "7872": { + "file_id": 576, + "content": "/paddlevideo/modeling/heads/pptsn_head.py", + "type": "filepath" + }, + "7873": { + "file_id": 576, + "content": "This Python code implements a PaddlePaddle neural network head for classification tasks using ppTSN Head, initializing the base class and applying dropout regularization with an FC layer. The init_weights function sets the FC layer's initial weights.", + "type": "summary" + }, + "7874": { + "file_id": 576, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom paddle.regularizer import L2Decay\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass ppTSNHead(BaseHead):\n \"\"\"ppTSN Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsn_head.py:1-30" + }, + "7875": { + "file_id": 576, + "content": "This code is the Python implementation of ppTSN Head, a classification model head used in PaddleVideo. The class has the number of classes and input channels as arguments. It inherits from BaseHead and is registered to the HEADS registry using @HEADS.register(). The code also imports necessary libraries and functions for its operations such as Linear, AdaptiveAvgPool2D, Dropout, ParamAttr, L2Decay, paddle.nn, and PaddleVideo's base and weight_init modules.", + "type": "comment" + }, + "7876": { + "file_id": 576, + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n drop_ratio(float): drop ratio. Default: 0.4.\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n data_format(str): data format of input tensor in ['NCHW', 'NHWC']. Default: 'NCHW'.\n fclr5(bool): Whether to increase the learning rate of the fully connected layer. Default: True\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n drop_ratio=0.4,\n std=0.01,\n data_format=\"NCHW\",\n fclr5=True,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.drop_ratio = drop_ratio\n self.std = std\n # NOTE: global pool performance\n self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsn_head.py:31-54" + }, + "7877": { + "file_id": 576, + "content": "This code defines a class with an __init__ method, taking parameters such as num_classes, in_channels, loss_cfg, drop_ratio, std, data_format, and fclr5. 
It initializes the base class and sets the drop_ratio, std, and creates an AdaptiveAvgPool2D object for global pooling performance.", + "type": "comment" + }, + "7878": { + "file_id": 576, + "content": " if self.drop_ratio != 0:\n self.dropout = Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = Linear(\n self.in_channels,\n self.num_classes,\n weight_attr=ParamAttr(learning_rate=5.0 if fclr5 else 1.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0 if fclr5 else 1.0,\n regularizer=L2Decay(0.0)))\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Normal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.,\n std=self.std)\n def forward(self, x, num_seg=8):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsn_head.py:56-84" + }, + "7879": { + "file_id": 576, + "content": "This code initializes and defines a PaddlePaddle neural network head for classification tasks. It includes optional dropout regularization, an FC layer with learnable parameters, and a forward function to process input data. The init_weights function is used to set the initial weights of the FC layer.", + "type": "comment" + }, + "7880": { + "file_id": 576, + "content": " \"\"\"\n # XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, in_channels]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])\n # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n # x = F.softmax(x) # NOTE remove\n return score", + "type": "code", + "location": "/paddlevideo/modeling/heads/pptsn_head.py:85-103" + }, + "7881": { + "file_id": 576, + "content": "This code snippet is responsible for processing the input and output of a PPTSN head model. It performs average pooling, reshapes the tensor, calculates the mean along an axis, applies dropout if applicable, reshapes again, and finally passes the result through a fully connected layer to produce scores.", + "type": "comment" + }, + "7882": { + "file_id": 577, + "content": "/paddlevideo/modeling/heads/roi_extractor.py", + "type": "filepath" + }, + "7883": { + "file_id": 577, + "content": "RoIAlign is a class for region of interest alignment. It takes features, regions of interest (roi), and number of roi as inputs, and uses PaddlePaddle's roi_align operation to extract aligned features.", + "type": "summary" + }, + "7884": { + "file_id": 577, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\n#@register\nclass RoIAlign(object):\n def __init__(self,\n resolution=14,\n spatial_scale=0.0625,\n sampling_ratio=0,\n aligned=False):\n super(RoIAlign, self).__init__()\n self.resolution = resolution\n self.spatial_scale = spatial_scale\n self.sampling_ratio = sampling_ratio\n self.aligned = aligned\n def __call__(self, feats, roi, rois_num):", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_extractor.py:1-31" + }, + "7885": { + "file_id": 577, + "content": "RoIAlign is a class used for region of interest alignment. It takes features, regions of interest (roi), and the number of roi as inputs. The aligned parameter specifies whether to return aligned features or not.", + "type": "comment" + }, + "7886": { + "file_id": 577, + "content": " roi = paddle.concat(roi) if len(roi) > 1 else roi[0]\n rois_num = paddle.to_tensor(rois_num, dtype='int32')\n rois_num = paddle.cast(rois_num, dtype='int32')\n if len(feats) == 1:\n roi_feat = paddle.vision.ops.roi_align(feats,\n roi,\n rois_num,\n self.resolution,\n self.spatial_scale,\n self.sampling_ratio,\n self.aligned)\n else:\n rois_feat_list = []\n roi_feat = paddle.vision.ops.roi_align(feats,\n roi,\n rois_num,\n self.resolution,\n self.spatial_scale,\n self.sampling_ratio,\n self.aligned)\n return roi_feat", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_extractor.py:32-53" + }, + "7887": { + "file_id": 577, + "content": "This code concatenates ROIs and ensures correct data type, then uses the PaddlePaddle library's roi_align operation to extract features from input features (feats) based on ROIs. If there is only one feature, it performs alignment for all ROIs. Otherwise, it creates a list of aligned feature tensors.", + "type": "comment" + }, + "7888": { + "file_id": 578, + "content": "/paddlevideo/modeling/heads/roi_head.py", + "type": "filepath" + }, + "7889": { + "file_id": 578, + "content": "The code introduces a Non-Maximum Suppression function for bounding boxes and defines the AVARoIHead class, an object detection layer performing ROI alignment with bbox loss calculation, image assignment & sampling, and result returning. The simple_test function tests detection without augmentation.", + "type": "summary" + }, + "7890": { + "file_id": 578, + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nfrom .. import builder\nfrom ..registry import HEADS\ndef bbox2result(bboxes, labels, num_classes, img_shape, thr=0.01):\n \"\"\"Convert detection results to a list of numpy arrays. \"\"\"\n if len(bboxes) == 0:\n return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32))\n else:\n bboxes = bboxes[0]\n labels = labels\n img_shape_np = img_shape", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_head.py:1-29" + }, + "7891": { + "file_id": 578, + "content": "This code defines the function bbox2result, which takes in bounding box coordinates (bboxes), labels, number of classes (num_classes), image shape (img_shape) and a threshold value (thr). The function returns a list of numpy arrays representing the detection results. If there are no detections (i.e., bboxes is empty), it returns an empty list of zeros for each class.", + "type": "comment" + }, + "7892": { + "file_id": 578, + "content": " img_h, img_w = img_shape_np[0][0], img_shape_np[0][1]\n img_w = paddle.cast(img_w, dtype='int32')\n img_h = paddle.cast(img_h, dtype='int32')\n bboxes[:, 0::2] /= img_w\n bboxes[:, 1::2] /= img_h\n # We only handle multilabel now\n assert labels.shape[-1] > 1\n scores = labels # rename\n thr = (thr, ) * num_classes if isinstance(thr, float) else thr\n assert scores.shape[1] == num_classes\n assert len(thr) == num_classes\n result = []\n for i in range(num_classes - 1):\n #step1. 对该类, 每个bbox的得分是否大于阈值\n where = scores[:, i + 1] > thr[i + 1]\n where = paddle.nonzero(where) # index\n bboxes_select = paddle.index_select(x=bboxes, index=where)\n bboxes_select = bboxes_select[:, :4]\n scores_select = paddle.index_select(x=scores, index=where)\n scores_select = scores_select[:, i + 1:i + 2]\n result.append(\n #对于step1中得分大于阈值的bbox(可能为空), 将bbox及在该类的score放入result列表.", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_head.py:30-59" + }, + "7893": { + "file_id": 578, + "content": "This code performs Non-Maximum Suppression (NMS) on bounding boxes and scores to filter out overlapping regions. 
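Strictly from the loop visible above, this step is per-class score thresholding plus coordinate normalization rather than IoU-based suppression. A minimal NumPy sketch of that selection logic follows; shapes, names, and example values are assumptions for illustration only.

# NumPy sketch of the per-class selection in bbox2result: a box is kept for
# class i when its score for that class exceeds thr (class 0 is background).
import numpy as np

def bbox2result_np(bboxes, scores, num_classes, img_w, img_h, thr=0.01):
    """bboxes: [M, 4] absolute xyxy, scores: [M, num_classes] -> list of [K_i, 5] arrays."""
    if len(bboxes) == 0:
        return [np.zeros((0, 5), np.float32) for _ in range(num_classes - 1)]
    boxes = bboxes.astype(np.float32).copy()
    boxes[:, 0::2] /= img_w                      # normalize x coordinates
    boxes[:, 1::2] /= img_h                      # normalize y coordinates
    result = []
    for i in range(1, num_classes):
        keep = scores[:, i] > thr
        result.append(np.concatenate(
            [boxes[keep], scores[keep, i:i + 1]], axis=1).astype(np.float32))
    return result

boxes = np.array([[10, 10, 50, 80], [5, 5, 20, 20]], np.float32)
scores = np.array([[0.1, 0.9, 0.0], [0.2, 0.0, 0.7]], np.float32)
print([r.shape for r in bbox2result_np(boxes, scores, 3, 100, 100)])   # [(1, 5), (1, 5)]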
It iterates through each class, selects bounding boxes and their corresponding scores that are above a certain threshold for each class, and appends them to the result list.", + "type": "comment" + }, + "7894": { + "file_id": 578, + "content": " paddle.concat((bboxes_select, scores_select), axis=1).numpy())\n return result\n@HEADS.register()\nclass AVARoIHead(nn.Layer):\n def __init__(self,\n assigner,\n sampler,\n pos_weight=1.0,\n action_thr=0.0,\n bbox_roi_extractor=None,\n bbox_head=None,\n train_cfg=None,\n test_cfg=None):\n super().__init__()\n self.assigner = assigner\n self.sampler = sampler\n self.pos_weight = pos_weight\n self.action_thr = action_thr\n self.init_assigner_sampler()\n if bbox_head is not None:\n self.init_bbox_head(bbox_roi_extractor, bbox_head)\n def init_assigner_sampler(self):\n \"\"\"Initialize assigner and sampler.\"\"\"\n self.bbox_assigner = None\n self.bbox_sampler = None\n self.bbox_assigner = builder.build_assigner(self.assigner)\n self.bbox_sampler = builder.build_sampler(self.sampler, context=self)\n def init_bbox_head(self, bbox_roi_extractor, bbox_head):", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_head.py:60-93" + }, + "7895": { + "file_id": 578, + "content": "The code defines a class named AVARoIHead, which is a PaddlePaddle layer for object detection. It initializes the assigner and sampler, and optionally initializes the bbox_head (bounding box regression head). The method init_assigner_sampler initializes the bbox_assigner and bbox_sampler from the passed arguments. The method init_bbox_head initializes the bounding box regression head if the bbox_head is provided. This class registers with HEADS, which may be a registry or a list of defined classes.", + "type": "comment" + }, + "7896": { + "file_id": 578, + "content": " \"\"\"Initialize ``bbox_head``\"\"\"\n self.bbox_roi_extractor = builder.build_roi_extractor(\n bbox_roi_extractor)\n self.bbox_head = builder.build_head(bbox_head)\n def _bbox_forward(self, x, rois, rois_num):\n bbox_feat = self.bbox_roi_extractor(x, rois, rois_num)\n cls_score, bbox_pred = self.bbox_head(\n bbox_feat, rois, rois_num\n ) #deal with: when roi's width or height = 0 , roi_align is wrong\n bbox_results = dict(cls_score=cls_score,\n bbox_pred=bbox_pred,\n bbox_feats=bbox_feat)\n return bbox_results\n def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels):\n \"\"\"Run forward function and calculate loss for box head in training.\"\"\"\n rois = [res.bboxes for res in sampling_results]\n rois_num = [res.bboxes.shape[0] for res in sampling_results]\n bbox_results = self._bbox_forward(x, rois, rois_num)\n bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes,", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_head.py:94-114" + }, + "7897": { + "file_id": 578, + "content": "This code initializes the bbox_head and defines the _bbox_forward function for feature extraction and prediction, as well as the _bbox_forward_train function for training purposes. It also handles situations where ROI's width or height equals 0 by correcting the roi_align operation.", + "type": "comment" + }, + "7898": { + "file_id": 578, + "content": " gt_labels, self.pos_weight)\n loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_targets)\n bbox_results.update(loss_bbox=loss_bbox)\n return bbox_results\n def train_step(self, x, img_metas, proposal_list, gt_bboxes, gt_labels):\n #1. 
assign gts and sample proposals\n num_imgs = len(img_metas[0])\n sampling_results = []\n for i in range(num_imgs):\n assign_result = self.bbox_assigner.assign(proposal_list[i],\n gt_bboxes[i],\n gt_labels[i])\n sampling_result = self.bbox_sampler.sample(assign_result,\n proposal_list[i],\n gt_bboxes[i],\n gt_labels[i])\n sampling_results.append(sampling_result)\n #2. forward and loss", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_head.py:115-134" + }, + "7899": { + "file_id": 578, + "content": "The code defines a ROI head that calculates the bbox loss and performs assignment and sampling for each image in a batch. It takes input images, proposal list, ground truth bounding boxes, and labels as parameters and returns results containing loss_bbox.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/79.json b/docs/data/79.json new file mode 100644 index 000000000..21071019c --- /dev/null +++ b/docs/data/79.json @@ -0,0 +1,541 @@ +{ + "7900": { + "file_id": 578, + "content": " bbox_results = self._bbox_forward_train(x, sampling_results, gt_bboxes,\n gt_labels)\n losses = dict()\n losses.update(bbox_results['loss_bbox'])\n return losses\n def simple_test(self, x, proposal_list, img_shape, rescale=False):\n x_shape = x[0].shape\n #assert x_shape[0] == 1, 'only accept 1 sample at test mode'\n det_bboxes, det_labels = self.simple_test_bboxes(x,\n img_shape,\n proposal_list,\n self.action_thr,\n rescale=rescale)\n bbox_results = bbox2result(det_bboxes, det_labels,\n self.bbox_head.num_classes, img_shape,\n self.action_thr)\n return [bbox_results]\n def simple_test_bboxes(self,\n x,", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_head.py:135-158" + }, + "7901": { + "file_id": 578, + "content": "The code contains two main functions, \"simple_test\" and \"_bbox_forward_train\". The former is for testing the model in simple test mode by taking input x, proposal list, img_shape, and rescale flag. It calculates det_bboxes and det_labels using the function \"simple_test_bboxes\". Then it uses bbox2result to convert det_bboxes and det_labels into bbox_results. The latter function takes input x, sampling results, gt_bboxes, and gt_labels to calculate bbox results and losses. It updates the losses dictionary with \"loss_bbox\" and returns the losses.", + "type": "comment" + }, + "7902": { + "file_id": 578, + "content": " img_shape,\n proposals,\n action_thr,\n rescale=False):\n \"\"\"Test only det bboxes without augmentation.\"\"\"\n rois = [proposals]\n rois_num = [rois[0].shape[0]]\n bbox_results = self._bbox_forward(x, rois, rois_num)\n cls_score = bbox_results['cls_score']\n crop_quadruple = np.array([0, 0, 1, 1])\n flip = False\n det_bboxes, det_labels = self.bbox_head.get_det_bboxes(\n rois,\n cls_score,\n img_shape,\n flip=flip,\n crop_quadruple=crop_quadruple)\n return det_bboxes, det_labels", + "type": "code", + "location": "/paddlevideo/modeling/heads/roi_head.py:159-177" + }, + "7903": { + "file_id": 578, + "content": "This function tests only detection bboxes without augmentation. It takes input x, proposals, action_thr, and rescale as parameters. It creates rois and rois_num from the proposals. It then calls _bbox_forward to get cls_score. It sets crop_quadruple and flip to False. 
Finally, it calls bbox_head's get_det_bboxes to return det_bboxes and det_labels.", + "type": "comment" + }, + "7904": { + "file_id": 579, + "content": "/paddlevideo/modeling/heads/single_straight3d.py", + "type": "filepath" + }, + "7905": { + "file_id": 579, + "content": "This code defines a 3D ROI extractor class and head, performing feature extraction with optional temporal pooling. The forward method executes feature extraction based on input features, RoIs, and number of RoIs, and returns the final output after applying ROI layer and stacking features along axis 2.", + "type": "summary" + }, + "7906": { + "file_id": 579, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\nfrom ..registry import ROI_EXTRACTORS\nfrom .roi_extractor import RoIAlign\n@ROI_EXTRACTORS.register()\nclass SingleRoIExtractor3D(nn.Layer):\n \"\"\"Extract RoI features from a single level feature map. \"\"\"\n def __init__(self,\n roi_layer_type='RoIAlign',\n featmap_stride=16,\n output_size=16,\n sampling_ratio=0,", + "type": "code", + "location": "/paddlevideo/modeling/heads/single_straight3d.py:1-28" + }, + "7907": { + "file_id": 579, + "content": "This code imports necessary libraries and registers a new ROI (Region of Interest) extractor named \"SingleRoIExtractor3D\". This class inherits from nn.Layer and is designed to extract RoI features from a single level feature map with specific options such as roi_layer_type, featmap_stride, output_size, and sampling_ratio.", + "type": "comment" + }, + "7908": { + "file_id": 579, + "content": " pool_mode='avg',\n aligned=True,\n with_temporal_pool=True,\n with_global=False):\n super().__init__()\n self.roi_layer_type = roi_layer_type\n assert self.roi_layer_type in ['RoIPool', 'RoIAlign']\n self.featmap_stride = featmap_stride\n self.spatial_scale = 1. / self.featmap_stride\n self.output_size = output_size\n self.sampling_ratio = sampling_ratio\n self.pool_mode = pool_mode\n self.aligned = aligned\n self.with_temporal_pool = with_temporal_pool\n self.with_global = with_global\n self.roi_layer = RoIAlign(resolution=self.output_size,\n spatial_scale=self.spatial_scale,\n sampling_ratio=self.sampling_ratio,\n aligned=self.aligned)\n def init_weights(self):\n pass\n # The shape of feat is N, C, T, H, W\n def forward(self, feat, rois, rois_num):\n if len(feat) >= 2:", + "type": "code", + "location": "/paddlevideo/modeling/heads/single_straight3d.py:29-55" + }, + "7909": { + "file_id": 579, + "content": "This code defines a class for a 3D head that takes input features and regions of interest (ROIs) to extract features using the RoIAlign layer. It also includes an optional temporal pooling operation and an initialization function. 
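The overall data flow just described can be sketched in NumPy as: mean-pool each pathway over time, concatenate along channels, apply a 2D RoI operation frame by frame, and stack the per-frame results along the time axis. In the sketch below, roi_align_2d is a stand-in placeholder, not a real API, and all shapes are assumptions.

# NumPy sketch of the SingleRoIExtractor3D forward flow (illustrative only).
import numpy as np

def roi_align_2d(frame_feat, rois, output_size=7):
    # placeholder for RoIAlign: returns [num_rois, C, output_size, output_size]
    n_rois, c = len(rois), frame_feat.shape[1]
    return np.zeros((n_rois, c, output_size, output_size), frame_feat.dtype)

def extract_roi_feats(pathways, rois):
    pooled = [p.mean(axis=2, keepdims=True) for p in pathways]          # temporal pooling
    feat = np.concatenate(pooled, axis=1)                               # merge slow and fast
    per_frame = [roi_align_2d(feat[:, :, t], rois) for t in range(feat.shape[2])]
    return np.stack(per_frame, axis=2)                                  # [num_rois, C, T', out, out]

slow = np.random.rand(1, 16, 8, 14, 14).astype("float32")
fast = np.random.rand(1, 4, 32, 14, 14).astype("float32")
print(extract_roi_feats([slow, fast], rois=np.zeros((3, 4))).shape)     # (3, 20, 1, 7, 7)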
The forward method performs feature extraction given the input features, ROIs, and number of ROIs.", + "type": "comment" + }, + "7910": { + "file_id": 579, + "content": " assert self.with_temporal_pool\n if self.with_temporal_pool:\n xi = 0\n for x in feat:\n xi = xi + 1\n y = paddle.mean(x, 2, keepdim=True)\n feat = [paddle.mean(x, 2, keepdim=True) for x in feat]\n feat = paddle.concat(feat, axis=1) # merge slow and fast\n roi_feats = []\n for t in range(feat.shape[2]):\n if type(t) == paddle.static.Variable:\n index = paddle.to_tensor(t)\n else:\n data_index = np.array([t]).astype('int32')\n index = paddle.to_tensor(data_index)\n frame_feat = paddle.index_select(feat, index, axis=2)\n frame_feat = paddle.squeeze(frame_feat,\n axis=2) #axis=2,避免N=1时, 第一维度被删除.\n roi_feat = self.roi_layer(frame_feat, rois, rois_num)\n roi_feats.append(roi_feat)\n ret = paddle.stack(roi_feats, axis=2)\n return ret", + "type": "code", + "location": "/paddlevideo/modeling/heads/single_straight3d.py:56-79" + }, + "7911": { + "file_id": 579, + "content": "This code performs temporal pooling, concatenates slow and fast features, extracts frame-wise features using index selection, squeezes the dimensions to prevent deletion when N=1, applies a ROI layer on each frame, stacks the resulting features along axis 2, and returns the final output.", + "type": "comment" + }, + "7912": { + "file_id": 580, + "content": "/paddlevideo/modeling/heads/slowfast_head.py", + "type": "filepath" + }, + "7913": { + "file_id": 580, + "content": "This code defines 3D head projection classes for PaddleVideo library, initializes a SlowFast head model with dropout regularization and adaptive average pooling, performs convolutional inference, applies softmax activation, averages when not training, and reshapes before returning.", + "type": "summary" + }, + "7914": { + "file_id": 580, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..registry import HEADS\nfrom .base import BaseHead\nimport paddle\nimport paddle.nn.functional as F\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass SlowFastHead(BaseHead):\n \"\"\"\n ResNe(X)t 3D head.\n This layer performs a fully-connected projection during training, when the\n input size is 1x1x1. It performs a convolutional projection during testing\n when the input size is larger than 1x1x1. If the inputs are from multiple", + "type": "code", + "location": "/paddlevideo/modeling/heads/slowfast_head.py:1-30" + }, + "7915": { + "file_id": 580, + "content": "This code is from the PaddleVideo library and defines a SlowFastHead class for ResNe(X)t 3D head. It performs a fully-connected projection during training and convolutional projection during testing, with different input sizes handled accordingly. 
The code includes import statements, registration using HEADS registry, and base class inheritance from BaseHead.", + "type": "comment" + }, + "7916": { + "file_id": 580, + "content": " different pathways, the inputs will be concatenated after pooling.\n \"\"\"\n def __init__(self,\n width_per_group,\n alpha,\n beta,\n num_classes,\n num_frames,\n crop_size,\n dropout_rate,\n pool_size_ratio=[[1, 1, 1], [1, 1, 1]],\n loss_cfg=dict(name='CrossEntropyLoss'),\n multigrid_short=False,\n **kwargs):\n \"\"\"\n ResNetBasicHead takes p pathways as input where p in [1, infty].\n Args:\n dim_in (list): the list of channel dimensions of the p inputs to the\n ResNetHead.\n num_classes (int): the channel dimensions of the p outputs to the\n ResNetHead.\n pool_size (list): the list of kernel sizes of p spatial temporal\n poolings, temporal pool kernel size, spatial pool kernel size,\n spatial pool kernel size in order.\n dropout_rate (float): dropout rate. If equal to 0.0, perform no", + "type": "code", + "location": "/paddlevideo/modeling/heads/slowfast_head.py:31-56" + }, + "7917": { + "file_id": 580, + "content": "The code defines a class for SlowFast_Head, which takes different pathways as input and concatenates the inputs after pooling. It has various parameters such as width_per_group, alpha, beta, etc. The ResNetBasicHead takes p pathways as input where p can be in the range of 1 to infinity. It has arguments for dim_in (list), num_classes (int), pool_size (list), and dropout_rate (float).", + "type": "comment" + }, + "7918": { + "file_id": 580, + "content": " dropout.\n \"\"\"\n super().__init__(num_classes, loss_cfg, **kwargs)\n self.multigrid_short = multigrid_short\n self.width_per_group = width_per_group\n self.alpha = alpha\n self.beta = beta\n self.num_classes = num_classes\n self.num_frames = num_frames\n self.crop_size = crop_size\n self.dropout_rate = dropout_rate\n self.pool_size_ratio = pool_size_ratio\n self.dim_in = [\n self.width_per_group * 32,\n self.width_per_group * 32 // self.beta,\n ]\n self.pool_size = [None, None] if self.multigrid_short else [\n [\n self.num_frames // self.alpha // self.pool_size_ratio[0][0],\n self.crop_size // 32 // self.pool_size_ratio[0][1],\n self.crop_size // 32 // self.pool_size_ratio[0][2],\n ],\n [\n self.num_frames // self.pool_size_ratio[1][0],\n self.crop_size // 32 // self.pool_size_ratio[1][1],\n self.crop_size // 32 // self.pool_size_ratio[1][2],", + "type": "code", + "location": "/paddlevideo/modeling/heads/slowfast_head.py:57-83" + }, + "7919": { + "file_id": 580, + "content": "This code is initializing a SlowFast head model with specified parameters such as multigrid_short, width_per_group, alpha, beta, num_classes, num_frames, crop_size, and dropout_rate. 
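A small worked example may help here, showing how dim_in and pool_size follow from the constructor arguments in the code above; the concrete values (width_per_group=64, alpha=8, beta=8, 32 frames, 224 crop) are illustrative assumptions, not values taken from a specific config.

# Worked example of the dim_in / pool_size arithmetic in SlowFastHead.__init__.
width_per_group, alpha, beta = 64, 8, 8
num_frames, crop_size = 32, 224
pool_size_ratio = [[1, 1, 1], [1, 1, 1]]

dim_in = [width_per_group * 32, width_per_group * 32 // beta]        # [2048, 256]
pool_size = [
    [num_frames // alpha // pool_size_ratio[0][0],                    # 4  (slow pathway, temporal)
     crop_size // 32 // pool_size_ratio[0][1],                        # 7  (spatial)
     crop_size // 32 // pool_size_ratio[0][2]],                       # 7
    [num_frames // pool_size_ratio[1][0],                             # 32 (fast pathway, temporal)
     crop_size // 32 // pool_size_ratio[1][1],                        # 7
     crop_size // 32 // pool_size_ratio[1][2]],                       # 7
]
print(dim_in, pool_size)   # [2048, 256] [[4, 7, 7], [32, 7, 7]]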
It also sets the dimension input (dim_in) based on these parameters, and determines the pool size accordingly based on whether multigrid_short is True or False.", + "type": "comment" + }, + "7920": { + "file_id": 580, + "content": " ],\n ]\n assert (len({len(self.pool_size), len(self.dim_in)\n }) == 1), \"pathway dimensions are not consistent.\"\n self.num_pathways = len(self.pool_size)\n self.dropout = paddle.nn.Dropout(p=self.dropout_rate)\n self.projection = paddle.nn.Linear(\n in_features=sum(self.dim_in),\n out_features=self.num_classes,\n )\n def init_weights(self):\n weight_init_(self.projection,\n \"Normal\",\n bias_value=0.0,\n mean=0.0,\n std=0.01)\n def forward(self, inputs):\n assert (len(inputs) == self.num_pathways\n ), \"Input tensor does not contain {} pathway\".format(\n self.num_pathways)\n pool_out = []\n for pathway in range(self.num_pathways):\n if self.pool_size[pathway] is None:\n tmp_out = F.adaptive_avg_pool3d(x=inputs[pathway],\n output_size=(1, 1, 1),", + "type": "code", + "location": "/paddlevideo/modeling/heads/slowfast_head.py:84-113" + }, + "7921": { + "file_id": 580, + "content": "This code initializes a SlowFast head model. It defines the number of pathways, applies dropout regularization, and initializes weights for linear projection. The forward method expects inputs with the same number of pathways as defined in the model. It then performs adaptive average pooling on each input pathway separately.", + "type": "comment" + }, + "7922": { + "file_id": 580, + "content": " data_format=\"NCDHW\")\n else:\n tmp_out = F.avg_pool3d(x=inputs[pathway],\n kernel_size=self.pool_size[pathway],\n stride=1,\n data_format=\"NCDHW\")\n pool_out.append(tmp_out)\n x = paddle.concat(x=pool_out, axis=1)\n x = paddle.transpose(x=x, perm=(0, 2, 3, 4, 1))\n # Perform dropout.\n if self.dropout_rate > 0.0:\n x = self.dropout(x)\n x = self.projection(x)\n # Performs fully convlutional inference.\n if not self.training: # attr of base class\n x = F.softmax(x, axis=4)\n x = paddle.mean(x, axis=[1, 2, 3])\n x = paddle.reshape(x, shape=(x.shape[0], -1))\n return x", + "type": "code", + "location": "/paddlevideo/modeling/heads/slowfast_head.py:114-137" + }, + "7923": { + "file_id": 580, + "content": "This code performs pooling and dropout operations on input tensors, followed by projection and fully convolutional inference. It also applies softmax activation and averaging when not in training mode. The resulting tensor is reshaped before returning.", + "type": "comment" + }, + "7924": { + "file_id": 581, + "content": "/paddlevideo/modeling/heads/stgcn_head.py", + "type": "filepath" + }, + "7925": { + "file_id": 581, + "content": "This code creates a STGCNHead class in PaddlePaddle's video modeling library, initializing a convolutional layer and applying forward pass for input x to produce N, C shaped output.", + "type": "summary" + }, + "7926": { + "file_id": 581, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass STGCNHead(BaseHead):\n \"\"\"\n Head for ST-GCN model.\n Args:\n in_channels: int, input feature channels. Default: 256.\n num_classes: int, number classes. Default: 10.\n \"\"\"\n def __init__(self, in_channels=256, num_classes=10, **kwargs):\n super().__init__(num_classes, in_channels, **kwargs)", + "type": "code", + "location": "/paddlevideo/modeling/heads/stgcn_head.py:1-32" + }, + "7927": { + "file_id": 581, + "content": "This code snippet is for the STGCNHead class in PaddlePaddle's video modeling library. It's a subclass of BaseHead with 256 input feature channels and 10 number classes as default values, initialized using super(). This model can be customized further by passing additional keyword arguments.", + "type": "comment" + }, + "7928": { + "file_id": 581, + "content": " self.fcn = nn.Conv2D(in_channels=in_channels,\n out_channels=num_classes,\n kernel_size=1)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'Normal', std=0.02)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n x = self.fcn(x)\n x = paddle.reshape_(x, (x.shape[0], -1)) # N,C,1,1 --> N,C\n return x", + "type": "code", + "location": "/paddlevideo/modeling/heads/stgcn_head.py:33-50" + }, + "7929": { + "file_id": 581, + "content": "The code defines a head class with a convolutional layer, initializes its weights using a normal distribution with standard deviation 0.02, and applies the forward pass to input x by passing it through a convolutional layer and reshaping the output to shape N, C (N: number of samples, C: number of classes).", + "type": "comment" + }, + "7930": { + "file_id": 582, + "content": "/paddlevideo/modeling/heads/timesformer_head.py", + "type": "filepath" + }, + "7931": { + "file_id": 582, + "content": "The PaddleVideo's TimeSformerHead class is a model head in the TimeSformer architecture that initializes parameters and defines forward methods for computing output. It uses Linear layers from PaddlePaddle and allows customizing parameters with keyword arguments. The function applies an fc layer to input tensor x and returns classification scores without softmax, with unclear dropout location.", + "type": "summary" + }, + "7932": { + "file_id": 582, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
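The ST-GCN head described above is essentially a 1x1 convolution acting as the classifier over a pooled graph feature. A self-contained sketch of that pattern follows; the channel count, class count, and batch size are assumptions.

```python
# 1x1-convolution classification head, as in STGCNHead above (sizes assumed).
import paddle
import paddle.nn as nn

fcn = nn.Conv2D(in_channels=256, out_channels=10, kernel_size=1)

x = paddle.randn([4, 256, 1, 1])            # pooled feature: [N, C, 1, 1]
y = fcn(x)                                  # [N, num_classes, 1, 1]
y = paddle.reshape(y, (y.shape[0], -1))     # [N, num_classes]
print(y.shape)
```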
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom paddle.nn import Linear\nfrom ..registry import HEADS\nfrom ..weight_init import trunc_normal_, weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass TimeSformerHead(BaseHead):\n \"\"\"TimeSformerHead Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').", + "type": "code", + "location": "/paddlevideo/modeling/heads/timesformer_head.py:1-29" + }, + "7933": { + "file_id": 582, + "content": "This code is from PaddleVideo's TimeSformerHead class, which is a head in the modeling module. It is a subclass of BaseHead and has attributes such as num_classes, in_channels, and loss_cfg. The class is registered using HEADS registry, and it uses functions from paddle.nn, Linear, and BaseHead modules. Weight initialization is performed using trunc_normal_ and weight_init functions.", + "type": "comment" + }, + "7934": { + "file_id": 582, + "content": " std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n std=0.02,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.std = std\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'TruncatedNormal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.0,\n std=self.std)\n # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal\n trunc_normal_(self.fc.weight, std=self.std)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.", + "type": "code", + "location": "/paddlevideo/modeling/heads/timesformer_head.py:30-60" + }, + "7935": { + "file_id": 582, + "content": "The code snippet defines a class for the TimeSformer head, initializes its parameters and provides a forward method to compute the output of the head. It uses PaddlePaddle's Linear layer and allows setting a specific std value in normal initialization, as well as customizing other parameters with additional keyword arguments. The forward function defines how the head operates on input data x.", + "type": "comment" + }, + "7936": { + "file_id": 582, + "content": " Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # XXX: check dropout location!\n # x.shape = [N, embed_dim]\n score = self.fc(x)\n # [N, num_class]\n # x = F.softmax(x) # NOTE remove\n return score", + "type": "code", + "location": "/paddlevideo/modeling/heads/timesformer_head.py:61-70" + }, + "7937": { + "file_id": 582, + "content": "This function applies a fully connected layer (fc) to the input tensor x and returns the classification scores for input samples without applying softmax. 
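The TimeSformer head above is a single Linear layer whose weights are drawn from a truncated normal distribution. The sketch below reproduces that pattern with Paddle's built-in initializer rather than the repository's weight_init_/trunc_normal_ helpers; embed_dim=768 and num_classes=400 are assumptions.

```python
# Linear classification head with truncated-normal initialization (sketch;
# uses Paddle's built-in initializer instead of the repo's trunc_normal_ helper).
import paddle
import paddle.nn as nn

fc = nn.Linear(
    768, 400,
    weight_attr=paddle.ParamAttr(
        initializer=nn.initializer.TruncatedNormal(mean=0.0, std=0.02)),
    bias_attr=paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)),
)

feat = paddle.randn([8, 768])   # [N, embed_dim] class-token feature from the backbone
score = fc(feat)                # [N, num_classes], returned without softmax
```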
The dropout location needs further clarification as indicated by XXX.", + "type": "comment" + }, + "7938": { + "file_id": 583, + "content": "/paddlevideo/modeling/heads/token_shift_head.py", + "type": "filepath" + }, + "7939": { + "file_id": 583, + "content": "The code defines a TokenShiftHead class for classification tasks in Paddle. It inherits from BaseHead, uses Linear module, and returns classification scores after passing input tensor x through fully connected layer self.fc.", + "type": "summary" + }, + "7940": { + "file_id": 583, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom paddle.nn import Linear\nimport paddle\nfrom ..registry import HEADS\nfrom ..weight_init import trunc_normal_, weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass TokenShiftHead(BaseHead):\n \"\"\"TokenShift Transformer Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n num_seg(int): The number of segments. Default: 8. ", + "type": "code", + "location": "/paddlevideo/modeling/heads/token_shift_head.py:1-30" + }, + "7941": { + "file_id": 583, + "content": "The code is defining a class called TokenShiftHead, which is a Transformer head for classification tasks. It has attributes such as num_classes, in_channels, and num_seg (defaulted to 8). The class inherits from BaseHead and is registered under the HEADS registry. The code imports necessary modules and functions, and uses Paddle's Linear module for the layer implementation.", + "type": "comment" + }, + "7942": { + "file_id": 583, + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n ls_eps (float): Label smoothing epsilon. Default: 0.01.\n std (float): Std(Scale) Value in normal initilizar. 
Default: 0.02.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n num_seg=8,\n loss_cfg=dict(name='CrossEntropyLoss'),\n ls_eps=0.01,\n std=0.02,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, ls_eps)\n self.num_seg = num_seg\n self.std = std\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'TruncatedNormal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.0,\n std=self.std)\n # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal", + "type": "code", + "location": "/paddlevideo/modeling/heads/token_shift_head.py:31-60" + }, + "7943": { + "file_id": 583, + "content": "__init__ function initializes the class with specified parameters, and init_weights is used to initialize the FC layer's parameters using truncated normal distribution.", + "type": "comment" + }, + "7944": { + "file_id": 583, + "content": " trunc_normal_(self.fc.weight, std=self.std)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # XXX: check dropout location!\n # x.shape = [N, embed_dim]\n score = self.fc(x)\n # [N*T, num_class]\n _, _m = score.shape\n _t = self.num_seg\n score = score.reshape([-1, _t, _m])\n score = paddle.mean(score, 1) # averaging predictions for every frame\n score = paddle.squeeze(score, axis=1)\n return score", + "type": "code", + "location": "/paddlevideo/modeling/heads/token_shift_head.py:61-79" + }, + "7945": { + "file_id": 583, + "content": "This code defines a TokenShiftHead, which performs classification tasks. The forward function takes input tensor x and passes it through fully connected layer self.fc, resulting in classification scores for each sample. It then reshapes the score to average predictions for every frame, finally squeezing the axis to return the final score.", + "type": "comment" + }, + "7946": { + "file_id": 584, + "content": "/paddlevideo/modeling/heads/transnetv2_head.py", + "type": "filepath" + }, + "7947": { + "file_id": 584, + "content": "This code defines the TransNetV2Head class, a type of head used in computer vision models, inheriting from BaseHead with arguments for number of classes, input channels, and loss configuration. It also includes TransNetV2Loss class registered as HEADS registry, and two methods (loss and get_score) for calculating loss and F1 score between predictions and ground truth.", + "type": "summary" + }, + "7948": { + "file_id": 584, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
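The TokenShift head's forward pass classifies every frame and then averages the per-frame scores back to one prediction per clip. A minimal sketch of that regroup-and-average step; the batch size, num_seg, and class count are assumptions.

```python
# Averaging per-frame scores over segments, as in TokenShiftHead.forward above
# (sizes are assumptions).
import paddle

num_seg, num_classes = 8, 400
score = paddle.randn([2 * num_seg, num_classes])    # [N * T, num_classes]

score = score.reshape([-1, num_seg, num_classes])   # [N, T, num_classes]
score = paddle.mean(score, axis=1)                  # [N, num_classes]
print(score.shape)                                  # [2, 400]
```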
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..losses import TransNetV2Loss\nfrom ...metrics.transnetv2_metric import create_scene_based_summaries\n@HEADS.register()\nclass TransNetV2Head(BaseHead):\n \"\"\"TransNetV2 Head.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name=\"TransNetV2Loss\")\n ):\n super().__init__(num_classes,", + "type": "code", + "location": "/paddlevideo/modeling/heads/transnetv2_head.py:1-29" + }, + "7949": { + "file_id": 584, + "content": "This code defines the TransNetV2Head class, which is a type of head used in computer vision models. It inherits from BaseHead and takes arguments for number of classes, input channels, and loss configuration. The TransNetV2Loss class is registered with the HEADS registry to be used by this head. Additionally, the code provides comments about licensing and copyright information.", + "type": "comment" + }, + "7950": { + "file_id": 584, + "content": " in_channels,\n loss_cfg)\n def loss(self, one_hot_pred, one_hot_gt,\n many_hot_pred=None, many_hot_gt=None, reg_losses=None):\n losses = dict()\n loss = self.loss_func(scores, labels, **kwargs)\n f1 = self.get_score(one_hot_pred, one_hot_gt)\n losses['f1'] = f1\n losses['loss'] = loss\n return losses\n def get_score(self, one_hot_pred, one_hot_gt):\n f1 = create_scene_based_summaries(one_hot_pred, one_hot_gt)\n return f1", + "type": "code", + "location": "/paddlevideo/modeling/heads/transnetv2_head.py:30-45" + }, + "7951": { + "file_id": 584, + "content": "This code defines a class with two methods, `loss` and `get_score`. The `loss` method calculates the loss between predictions and ground truth, while `get_score` method calculates an F1 score based on one-hot predictions and ground truth. The calculated losses are stored in a dictionary for further use.", + "type": "comment" + }, + "7952": { + "file_id": 585, + "content": "/paddlevideo/modeling/heads/tsm_head.py", + "type": "filepath" + }, + "7953": { + "file_id": 585, + "content": "TSMHead, a classification task-oriented class extending TSNHead, initializes weights and registers in the HEADS registry. It is part of PaddleVideo's temporal segment network head, with parameters for weights and data format, forward function with average pooling and optional dropout, and possible tensor reshaping.", + "type": "summary" + }, + "7954": { + "file_id": 585, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn import Linear\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom .tsn_head import TSNHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass TSMHead(TSNHead):\n \"\"\" TSM Head\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.", + "type": "code", + "location": "/paddlevideo/modeling/heads/tsm_head.py:1-33" + }, + "7955": { + "file_id": 585, + "content": "This code defines the TSMHead class, which extends the TSNHead class. It is used for classification tasks with a specific number of classes and input feature channels. The class is registered in the HEADS registry and follows a certain weight initialization method.", + "type": "comment" + }, + "7956": { + "file_id": 585, + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n drop_ratio(float): drop ratio. Default: 0.5.\n std(float): Std(Scale) value in normal initilizar. Default: 0.001.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n drop_ratio=0.5,\n std=0.001,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes,\n in_channels,\n drop_ratio=drop_ratio,\n std=std,\n data_format=data_format,\n **kwargs)\n self.fc = Linear(self.in_channels,\n self.num_classes,\n weight_attr=ParamAttr(learning_rate=5.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0,", + "type": "code", + "location": "/paddlevideo/modeling/heads/tsm_head.py:34-57" + }, + "7957": { + "file_id": 585, + "content": "The code defines a class with an __init__ method that initializes the TsmHead object. It takes arguments like num_classes, in_channels, drop_ratio, std, and data_format to set up the internal structure of the class. 
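The TSM classifier being described here attaches per-parameter learning-rate multipliers and L2 decay through ParamAttr. Below is a standalone sketch of that configuration; the 2048-channel input and 400 classes are assumptions, while the multipliers and decay values mirror the snippet.

```python
# Linear layer with per-parameter learning-rate multipliers and L2 decay,
# mirroring the TSMHead configuration (in/out sizes are assumptions).
import paddle
from paddle import ParamAttr
from paddle.nn import Linear
from paddle.regularizer import L2Decay

fc = Linear(
    2048, 400,
    weight_attr=ParamAttr(learning_rate=5.0, regularizer=L2Decay(1e-4)),
    bias_attr=ParamAttr(learning_rate=10.0, regularizer=L2Decay(0.0)),
)
# An optimizer built over fc.parameters() scales its base learning rate by the
# per-parameter factors above (5x for the weight, 10x for the bias).
```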
The Linear layer is also initialized with specific learning rates for weights and biases.", + "type": "comment" + }, + "7958": { + "file_id": 585, + "content": " regularizer=L2Decay(0.0)))\n assert (data_format in [\n 'NCHW', 'NHWC'\n ]), f\"data_format must be 'NCHW' or 'NHWC', but got {data_format}\"\n self.data_format = data_format\n self.stdv = std\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)\n def forward(self, x, num_seg):\n \"\"\"Define how the tsm-head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # x.shape = [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x) # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x) # [N * num_seg, in_channels, 1, 1]\n if self.data_format == 'NCHW':\n x = paddle.reshape(x, x.shape[:2])", + "type": "code", + "location": "/paddlevideo/modeling/heads/tsm_head.py:58-89" + }, + "7959": { + "file_id": 585, + "content": "The code initializes a TSM head, sets the data format and standard deviation for weights, initializes FC layer parameters, defines the forward function to perform average pooling, dropout if applicable, and reshapes the tensor based on the data format.", + "type": "comment" + }, + "7960": { + "file_id": 585, + "content": " else:\n x = paddle.reshape(x, x.shape[::3])\n score = self.fc(x) # [N * num_seg, num_class]\n score = paddle.reshape(\n score, [-1, num_seg, score.shape[1]]) # [N, num_seg, num_class]\n score = paddle.mean(score, axis=1) # [N, num_class]\n score = paddle.reshape(score,\n shape=[-1, self.num_classes]) # [N, num_class]\n # score = F.softmax(score) #NOTE remove\n return score", + "type": "code", + "location": "/paddlevideo/modeling/heads/tsm_head.py:90-99" + }, + "7961": { + "file_id": 585, + "content": "This code is part of a temporal segment network head in PaddleVideo. If the input is not 3-channel, it reshapes the feature map to have only the first third of channels. Then, it passes the reshaped feature through a fully connected layer and averages across segments to get a score for each class. The shape of the scores is then modified accordingly, and softmax could be applied (note: comment indicates that softmax might be removed).", + "type": "comment" + }, + "7962": { + "file_id": 586, + "content": "/paddlevideo/modeling/heads/tsn_head.py", + "type": "filepath" + }, + "7963": { + "file_id": 586, + "content": "TSNHead class is defined for image classification, inheriting from BaseHead with num_classes, in_channels, and loss_cfg parameters. It uses AdaptiveAvgPool2D, Linear, Dropout layers and weight_init function for initialization. The function defines a head for TSN model that performs average pooling, reshapes, takes mean, applies dropout if enabled, and passes through fully connected layer for classification scores.", + "type": "summary" + }, + "7964": { + "file_id": 586, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass TSNHead(BaseHead):\n \"\"\"TSN Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').", + "type": "code", + "location": "/paddlevideo/modeling/heads/tsn_head.py:1-30" + }, + "7965": { + "file_id": 586, + "content": "The code is defining a TSNHead class for image classification, which inherits from the BaseHead class. It has parameters for num_classes, in_channels, and loss_cfg, and uses AdaptiveAvgPool2D, Linear, Dropout layers. The weight_init function is also imported for weight initialization.", + "type": "comment" + }, + "7966": { + "file_id": 586, + "content": " drop_ratio(float): drop ratio. Default: 0.4.\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n drop_ratio=0.4,\n std=0.01,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.drop_ratio = drop_ratio\n self.std = std\n #NOTE: global pool performance\n self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)\n if self.drop_ratio != 0:\n self.dropout = Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Normal',", + "type": "code", + "location": "/paddlevideo/modeling/heads/tsn_head.py:31-63" + }, + "7967": { + "file_id": 586, + "content": "Initializes a TSN head with specified parameters, including num_classes, in_channels, loss_cfg, drop_ratio, std, and data_format. It creates an adaptive average pooling layer, a dropout layer if drop_ratio is non-zero, and a fully connected linear layer (fc). 
The fc layer weights are then initialized with normal distribution.", + "type": "comment" + }, + "7968": { + "file_id": 586, + "content": " 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.,\n std=self.std)\n def forward(self, x, num_seg):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n #XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, in_channels]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n #x = F.softmax(x) #NOTE remove\n return score", + "type": "code", + "location": "/paddlevideo/modeling/heads/tsn_head.py:64-93" + }, + "7969": { + "file_id": 586, + "content": "The function defines a head for the TSN model. It performs average pooling, reshapes the input, takes the mean along an axis, applies dropout if enabled, and passes the result through a fully connected layer to output scores for classification.", + "type": "comment" + }, + "7970": { + "file_id": 587, + "content": "/paddlevideo/modeling/losses/__init__.py", + "type": "filepath" + }, + "7971": { + "file_id": 587, + "content": "This code imports diverse loss functions from different modules, such as CrossEntropyLoss, BMNLoss, and TransNetV2Loss for PaddleVideo's video recognition, segmentation tasks, providing a comprehensive list of usable losses. The PaddleVideo model uses BaseWeightedLoss, ASRFLoss, DistillationCELoss, and DistillationDMLLoss for audio-visual speech recognition, distillation-based learning, and region-specific loss computation.", + "type": "summary" + }, + "7972": { + "file_id": 587, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .base import BaseWeightedLoss\nfrom .bmn_loss import BMNLoss\nfrom .cross_entropy_loss import CrossEntropyLoss\nfrom .depth_loss import ADDSLoss\nfrom .transnetv2_loss import TransNetV2Loss\nfrom .actbert_loss import ActBertLoss\nfrom .asrf_loss import ASRFLoss\nfrom .distillation_loss import DistillationCELoss, DistillationDMLLoss\nfrom .yowo_loss import RegionLoss\n__all__ = [\n 'CrossEntropyLoss', 'BMNLoss', 'TransNetV2Loss', 'ActBertLoss', 'ADDSLoss',", + "type": "code", + "location": "/paddlevideo/modeling/losses/__init__.py:1-26" + }, + "7973": { + "file_id": 587, + "content": "This code imports different types of loss functions from various modules, such as CrossEntropyLoss, BMNLoss, and TransNetV2Loss. These losses are used in PaddleVideo for various applications like video recognition, segmentation, and more. 
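The TSN forward pass described above pools each segment's feature map, averages features across segments, and classifies once per clip. A runnable sketch of that consensus step; all sizes and the dropout ratio are illustrative.

```python
# TSN-style segment consensus: pool -> regroup -> average -> classify (sketch).
import paddle
from paddle.nn import AdaptiveAvgPool2D, Dropout, Linear

N, num_seg, C = 2, 8, 2048                    # assumed sizes
x = paddle.randn([N * num_seg, C, 7, 7])

x = AdaptiveAvgPool2D((1, 1))(x)              # [N*num_seg, C, 1, 1]
x = paddle.reshape(x, [-1, num_seg, C])       # [N, num_seg, C]
x = paddle.mean(x, axis=1)                    # [N, C] -- consensus over segments
x = Dropout(p=0.4)(x)
score = Linear(C, 400)(x)                     # [N, num_classes]
print(score.shape)
```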
The code provides a comprehensive list of loss functions that can be used depending on the specific task.", + "type": "comment" + }, + "7974": { + "file_id": 587, + "content": " 'BaseWeightedLoss', 'ASRFLoss', 'DistillationCELoss', 'DistillationDMLLoss',\n 'RegionLoss'\n]", + "type": "code", + "location": "/paddlevideo/modeling/losses/__init__.py:27-29" + }, + "7975": { + "file_id": 587, + "content": "The code defines a list of loss functions used in the PaddleVideo model. These losses include BaseWeightedLoss, ASRFLoss, DistillationCELoss, and DistillationDMLLoss for various tasks like audio-visual speech recognition, distillation-based learning, and region-specific loss computation.", + "type": "comment" + }, + "7976": { + "file_id": 588, + "content": "/paddlevideo/modeling/losses/actbert_loss.py", + "type": "filepath" + }, + "7977": { + "file_id": 588, + "content": "This code defines ActBertLoss and actBertLoss classes as loss functions for ActBert model, using CrossEntropyLoss and nn.KLDivLoss. The total loss is calculated by summing masked text, masked image, masked action, and next sentence losses, based on predictions and labels from various sources.", + "type": "summary" + }, + "7978": { + "file_id": 588, + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass ActBertLoss(BaseWeightedLoss):\n \"\"\"Loss for ActBert model\n \"\"\"\n def __init__(self, vocab_size=30522, a_target_size=700):\n super().__init__()\n self.vocab_size = vocab_size\n self.a_target_size = a_target_size\n self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)", + "type": "code", + "location": "/paddlevideo/modeling/losses/actbert_loss.py:1-32" + }, + "7979": { + "file_id": 588, + "content": "This code defines the ActBertLoss class, which is a loss function for the ActBert model. It uses the CrossEntropyLoss from PaddlePaddle's nn library and takes two arguments: vocab_size and a_target_size. The class inherits from BaseWeightedLoss.", + "type": "comment" + }, + "7980": { + "file_id": 588, + "content": " self.vis_criterion = nn.KLDivLoss(reduction=\"none\")\n def forward(self, prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \\\n text_labels, image_label, image_target, action_label, next_sentence_label):\n \"\"\"\n Args:\n text_label: text label(with mask). Shape: [batch_size, seqence_length]\n image_label: image label(with mask). Shape: [batch_size, region_length]\n image_target: label of image feature distribution,\n Shape: [batch_size, region_length-1, num_image_class](minus 1 for xxx).\n action label: action label(with mask), Shape: [batch_size, action_length]\n next_sentence_label: is next sentence or not. 
Shape: [batch_size]\n \"\"\"\n prediction_scores_v = prediction_scores_v[:,\n 1:] #8,37,1601 --> 8,36,1601\n img_loss = self.vis_criterion(\n F.log_softmax(prediction_scores_v, axis=2),", + "type": "code", + "location": "/paddlevideo/modeling/losses/actbert_loss.py:33-50" + }, + "7981": { + "file_id": 588, + "content": "This code defines a class for an actBert loss function. It uses the nn.KLDivLoss function as a criterion and takes in prediction scores, sequence labels, image labels, image targets, action labels, and next sentence labels as input to compute the visual loss (img_loss). The prediction_scores_v variable is modified by removing the first element from each sequence, likely for consistency purposes.", + "type": "comment" + }, + "7982": { + "file_id": 588, + "content": " image_target #8,36,1601\n )\n masked_img_loss = paddle.sum(\n img_loss * (image_label == 1).unsqueeze(2).astype('float32')) / max(\n paddle.sum((image_label == 1).astype('float32')), 1e-6)\n masked_text_loss = self.loss_fct(\n prediction_scores_t.reshape([-1, self.vocab_size]), #8,36,30522\n text_labels.reshape([-1]), #8,36 # label -1 will be ignored\n )\n masked_action_loss = self.loss_fct(\n prediction_scores_a.reshape([-1, self.a_target_size]), #8,5,700\n action_label.reshape([-1]), #8,5\n )\n next_sentence_loss = self.loss_fct(\n seq_relationship_score.reshape([-1, 2]),\n next_sentence_label.reshape([-1]) #8,2\n )\n total_loss = masked_text_loss.unsqueeze(0) + masked_img_loss.unsqueeze(\n 0) + masked_action_loss.unsqueeze(0) + next_sentence_loss.unsqueeze(\n 0)\n return total_loss", + "type": "code", + "location": "/paddlevideo/modeling/losses/actbert_loss.py:51-75" + }, + "7983": { + "file_id": 588, + "content": "This code calculates a total loss by summing the masked text loss, masked image loss, masked action loss, and next sentence loss. The losses are calculated based on predictions and labels from various sources. The `loss_fct` function is used to compute these losses, and they are reshaped before being added together for the final total loss.", + "type": "comment" + }, + "7984": { + "file_id": 589, + "content": "/paddlevideo/modeling/losses/asrf_loss.py", + "type": "filepath" + }, + "7985": { + "file_id": 589, + "content": "This code defines custom loss functions for video modeling, including TMSE and GSTMSE, with the ActionSegmentationLoss class applying various criteria like regression, classification, and temporal segmentation losses.", + "type": "summary" + }, + "7986": { + "file_id": 589, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# https://github.com/yiskw713/asrf/libs/loss_fn/__init__.py\nimport numpy as np\nimport pandas as pd\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport sys\nimport os\nfrom ..registry import LOSSES\nclass TMSE(nn.Layer):\n \"\"\"\n Temporal MSE Loss Function\n Proposed in Y. A. Farha et al. 
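The masked image term above keeps only the KL-divergence of regions whose label is 1 and normalizes by the number of masked regions. A sketch of that computation with random stand-in tensors; all shapes are assumptions, and paddle.clip stands in for the snippet's max(..., 1e-6) guard.

```python
# Masked KL-divergence normalized by the number of masked regions (sketch).
import paddle
import paddle.nn.functional as F

vis_criterion = paddle.nn.KLDivLoss(reduction="none")

pred = paddle.randn([8, 36, 1601])                        # region class predictions
target = F.softmax(paddle.randn([8, 36, 1601]), axis=2)   # soft target distribution
image_label = paddle.randint(0, 2, [8, 36])               # 1 marks a masked region

img_loss = vis_criterion(F.log_softmax(pred, axis=2), target)
mask = (image_label == 1).unsqueeze(2).astype("float32")
masked_img_loss = paddle.sum(img_loss * mask) / paddle.clip(paddle.sum(mask), min=1e-6)
```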
MS-TCN: Multi-Stage Temporal Convolutional Network for ActionSegmentation in CVPR2019\n arXiv: https://arxiv.org/pdf/1903.01945.pdf", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:1-32" + }, + "7987": { + "file_id": 589, + "content": "The code defines a class TMSE, which is a temporal MSE loss function. It's inspired by the MS-TCN method proposed in CVPR2019 for action segmentation tasks.", + "type": "comment" + }, + "7988": { + "file_id": 589, + "content": " \"\"\"\n def __init__(self, threshold=4, ignore_index=255):\n super().__init__()\n self.threshold = threshold\n self.ignore_index = ignore_index\n self.mse = nn.MSELoss(reduction=\"none\")\n def forward(self, preds, gts):\n total_loss = 0.0\n batch_size = preds.shape[0]\n for pred, gt in zip(preds, gts):\n pred = paddle.gather(pred,\n paddle.nonzero(gt != self.ignore_index)[:, 0])\n loss = self.mse(F.log_softmax(pred[:, 1:], axis=1),\n F.log_softmax(pred[:, :-1], axis=1))\n loss = paddle.clip(loss, min=0, max=self.threshold**2)\n total_loss += paddle.mean(loss)\n return total_loss / batch_size\nclass GaussianSimilarityTMSE(nn.Layer):\n \"\"\"\n Temporal MSE Loss Function with Gaussian Similarity Weighting\n \"\"\"\n def __init__(self, threshold=4, sigma=1.0, ignore_index=255):\n super().__init__()\n self.threshold = threshold\n self.ignore_index = ignore_index", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:33-66" + }, + "7989": { + "file_id": 589, + "content": "The code defines two classes: ASRF_Loss and GaussianSimilarityTMSE. The first class represents the Average Symmetric Ranking Forest Loss, while the second class is a Temporal MSE Loss Function with Gaussian Similarity Weighting. Both classes inherit from nn.Layer and have an __init__ method for initialization, as well as a forward method for calculating losses. The ASRF_Loss class uses an MSELoss function to calculate loss between predicted and ground truth frames, while the GaussianSimilarityTMSE class calculates temporal MSE with Gaussian similarity weighting.", + "type": "comment" + }, + "7990": { + "file_id": 589, + "content": " self.mse = nn.MSELoss(reduction=\"none\")\n self.sigma = sigma\n def forward(self, preds, gts, sim_index):\n \"\"\"\n Args:\n preds: the output of model before softmax. (N, C, T)\n gts: Ground Truth. (N, T)\n sim_index: similarity index. (N, C, T)\n Return:\n the value of Temporal MSE weighted by Gaussian Similarity.\n \"\"\"\n total_loss = 0.0\n batch_size = preds.shape[0]\n for pred, gt, sim in zip(preds, gts, sim_index):\n pred = paddle.gather(pred,\n paddle.nonzero(gt != self.ignore_index)[:, 0],\n axis=1)\n sim = paddle.gather(sim,\n paddle.nonzero(gt != self.ignore_index)[:, 0],\n axis=1)\n # calculate gaussian similarity\n diff = sim[:, 1:] - sim[:, :-1]\n similarity = paddle.exp(\n (-1 * paddle.norm(diff, axis=0)) / (2 * self.sigma**2))", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:67-92" + }, + "7991": { + "file_id": 589, + "content": "This code calculates a temporal MSE loss weighted by Gaussian similarity. It uses Paddle's nn.MSELoss function, with reduction set to \"none\". The forward method takes in the model's predictions (before softmax), ground truth and similarity index as inputs. It loops through each input, performs non-zero checks for gt != ignore_index, then gathers the relevant rows from the predicted values. It calculates gaussian similarity using the gathered data and the given sigma value. 
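The TMSE term above is an MSE between the log-probabilities of adjacent frames, truncated at threshold squared so that genuine action boundaries are not over-penalized. A conceptual sketch, written frame-major for clarity; T=100 frames and 19 classes are assumptions.

```python
# Clipped temporal-MSE smoothness term (conceptual sketch, frame-major layout).
import paddle
import paddle.nn.functional as F

threshold = 4.0
mse = paddle.nn.MSELoss(reduction="none")

pred = paddle.randn([100, 19])                  # [T, num_classes] for one video
logp = F.log_softmax(pred, axis=1)              # per-frame class log-probabilities
loss = mse(logp[1:], logp[:-1])                 # change between adjacent frames
loss = paddle.clip(loss, min=0.0, max=threshold ** 2)
tmse = paddle.mean(loss)
```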
The calculated loss is returned.", + "type": "comment" + }, + "7992": { + "file_id": 589, + "content": " # calculate temporal mse\n loss = self.mse(F.log_softmax(pred[:, 1:], axis=1),\n F.log_softmax(pred[:, :-1], axis=1))\n loss = paddle.clip(loss, min=0, max=self.threshold**2)\n # gaussian similarity weighting\n loss = similarity * loss\n total_loss += paddle.mean(loss)\n return total_loss / batch_size\nclass FocalLoss(nn.Layer):\n def __init__(self,\n weight=None,\n size_average=True,\n batch_average=True,\n ignore_index=255,\n gamma=2.0,\n alpha=0.25):\n super().__init__()\n self.gamma = gamma\n self.alpha = alpha\n self.batch_average = batch_average\n self.criterion = nn.CrossEntropyLoss(weight=weight,\n ignore_index=ignore_index,\n size_average=size_average)\n def forward(self, logit, target):\n n, _, _ = logit.size()", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:94-126" + }, + "7993": { + "file_id": 589, + "content": "This code defines a class \"ASRF_Loss\" for calculating ASRF loss using temporal MSE and Gaussian similarity weighting. It also defines a class \"FocalLoss\" for focal loss calculation using CrossEntropyLoss with custom gamma and alpha parameters.", + "type": "comment" + }, + "7994": { + "file_id": 589, + "content": " logpt = -self.criterion(logit, target.long())\n pt = paddle.exp(logpt)\n if self.alpha is not None:\n logpt *= self.alpha\n loss = -((1 - pt)**self.gamma) * logpt\n if self.batch_average:\n loss /= n\n return loss\nclass ActionSegmentationLoss(nn.Layer):\n \"\"\"\n Loss Function for Action Segmentation\n You can choose the below loss functions and combine them.\n - Cross Entropy Loss (CE)\n - Focal Loss\n - Temporal MSE (TMSE)\n - Gaussian Similarity TMSE (GSTMSE)\n \"\"\"\n def __init__(self,\n num_classes,\n file_path,\n label_path,\n ce=True,\n focal=True,\n tmse=False,\n gstmse=False,\n weight=None,\n threshold=4.,\n ignore_index=255,\n ce_weight=1.0,\n focal_weight=1.0,\n tmse_weight=0.15,\n gstmse_weight=0.15):\n super().__init__()", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:128-167" + }, + "7995": { + "file_id": 589, + "content": "This code defines an ActionSegmentationLoss class, which is a loss function for action segmentation tasks. It allows the user to choose from various loss functions including Cross Entropy Loss (CE), Focal Loss, Temporal MSE (TMSE), and Gaussian Similarity TMSE (GSTMSE). The user can specify parameters such as num_classes, file_path, label_path, ce, focal, tmse, gstmse, weight, threshold, ignore_index, ce_weight, focal_weight, and tmse_weight. 
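The focal loss above rescales cross-entropy by (1 - p_t)^gamma so that easy samples contribute less. A compact sketch of that recipe; the sizes are assumptions, and alpha is applied here as a plain scale factor, which simplifies the snippet's handling slightly.

```python
# Focal-loss recipe: down-weight well-classified samples (sketch).
import paddle
import paddle.nn.functional as F

gamma, alpha = 2.0, 0.25
logits = paddle.randn([8, 19])              # [N, num_classes] (assumed sizes)
target = paddle.randint(0, 19, [8])

ce = F.cross_entropy(logits, target, reduction="none")   # -log p_t per sample
pt = paddle.exp(-ce)                                      # p_t
loss = paddle.mean(alpha * ((1.0 - pt) ** gamma) * ce)
```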
The class initializes the chosen loss functions and calculates the overall loss based on user inputs.", + "type": "comment" + }, + "7996": { + "file_id": 589, + "content": " self.criterions = []\n self.weights = []\n self.num_classes = num_classes\n self.file_path = file_path\n self.label_path = label_path\n if weight:\n class_weight = self.get_class_weight()\n else:\n class_weight = None\n if ce:\n self.criterions.append(\n nn.CrossEntropyLoss(weight=class_weight,\n ignore_index=ignore_index))\n self.weights.append(ce_weight)\n if focal:\n self.criterions.append(FocalLoss(ignore_index=ignore_index))\n self.weights.append(focal_weight)\n if tmse:\n self.criterions.append(\n TMSE(threshold=threshold, ignore_index=ignore_index))\n self.weights.append(tmse_weight)\n if gstmse:\n self.criterions.append(\n GaussianSimilarityTMSE(threshold=threshold,\n ignore_index=ignore_index))\n self.weights.append(gstmse_weight)", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:168-198" + }, + "7997": { + "file_id": 589, + "content": "The code initializes criterions and weights for different loss functions based on the provided parameters. It adds CrossEntropyLoss, FocalLoss, TMSE, and GaussianSimilarityTMSE to self.criterions list, and their corresponding weights to self.weights list. The weight parameter determines whether class weights are used in CrossEntropyLoss. Ignore_index is added for all loss functions.", + "type": "comment" + }, + "7998": { + "file_id": 589, + "content": " if len(self.criterions) == 0:\n print(\"You have to choose at least one loss function.\")\n sys.exit(1)\n def get_class_weight(self):\n \"\"\"\n Class weight for CrossEntropy\n Class weight is calculated in the way described in:\n D. Eigen and R. Fergus, “Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture,” in ICCV,\n openaccess: https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Eigen_Predicting_Depth_Surface_ICCV_2015_paper.pdf\n \"\"\"\n # load file list\n file_ptr = open(self.file_path, 'r')\n info = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n nums = [0 for i in range(self.num_classes)]\n for i in range(len(info)):\n video_name = info[i]\n file_name = video_name.split('.')[0] + \".npy\"\n label_file_path = os.path.join(self.label_path, file_name)\n label = np.load(label_file_path).astype(np.int64)", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:200-221" + }, + "7999": { + "file_id": 589, + "content": "The code snippet loads file information from a given path and calculates class weights for CrossEntropy loss function, based on the method described in the Eigen and Fergus paper. 
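Once the criterion and weight lists above are populated, the combined loss is effectively a weighted sum over the selected terms. An illustrative helper is sketched below; the names are hypothetical, and the Gaussian-similarity term would additionally need its similarity-index argument.

```python
# Weighted combination of the selected criteria (illustrative helper).
import paddle

def combined_loss(criterions, weights, preds, gts):
    total = paddle.zeros([1])
    for criterion, w in zip(criterions, weights):
        total = total + w * criterion(preds, gts)
    return total
```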
It reads file names and their corresponding labels, and stores them as lists for later use.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/8.json b/docs/data/8.json new file mode 100644 index 000000000..8a0ad0920 --- /dev/null +++ b/docs/data/8.json @@ -0,0 +1,539 @@ +{ + "800": { + "file_id": 69, + "content": " self.fmt = fmt\n self.need_avg = need_avg\n self.reset()\n def reset(self):\n \"\"\" reset \"\"\"\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n \"\"\" update \"\"\"\n if isinstance(val, paddle.Tensor):\n val = float(val)\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count\n @property\n def total(self):\n return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self)\n @property\n def total_minute(self):\n return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60,\n self=self)\n @property\n def mean(self):\n return '{self.name}_avg: {self.avg:{self.fmt}}'.format(\n self=self) if self.need_avg else ''\n @property\n def value(self):\n return '{self.name}: {self.val:{self.fmt}}'.format(self=self)\ndef log_batch(metric_list,\n batch_id,", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:73-113" + }, + "801": { + "file_id": 69, + "content": "This code defines a class for tracking metrics such as sum, count, average, and total values. The `update` method allows updating the metric with a new value, while the `total`, `total_minute`, `mean`, and `value` properties retrieve the current metric value in different formats. The `log_batch` function logs batch metrics for a list of metrics at a specified batch ID.", + "type": "comment" + }, + "802": { + "file_id": 69, + "content": " epoch_id,\n total_epoch,\n mode,\n ips,\n tot_step=None,\n max_iters=None):\n batch_cost = str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = str(metric_list['reader_time'].value) + ' sec,'\n metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].value)\n metric_str = ' '.join([str(v) for v in metric_values])\n if max_iters:\n epoch_str = \"iter:[{:>3d}/{:<3d}]\".format(tot_step, max_iters)\n else:\n epoch_str = \"epoch:[{:>3d}/{:<3d}]\".format(epoch_id, total_epoch)\n step_str = \"{:s} step:{:<4d}\".format(mode, batch_id)\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {}\".format(\n coloring(epoch_str, \"HEADER\") if batch_id == 0 else epoch_str,\n coloring(step_str, \"PURPLE\"), coloring(metric_str, 'OKGREEN'),\n coloring(batch_cost, \"OKGREEN\"), coloring(reader_cost, 'OKGREEN'),\n ips))\ndef log_epoch(metric_list, epoch, mode, ips):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:114-141" + }, + "803": { + "file_id": 69, + "content": "This function logs epoch metrics and step information for a video processing task. It formats the log string with different colors for each section: epoch or iteration, step number, metric values, batch time, reader time, and ips (images per second). 
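The running-average meter above is typically updated once per step and queried for its latest and mean values when logging. A usage sketch follows; the class name AverageMeter, the import path, and the constructor signature are inferred from the snippet rather than shown in it.

```python
# Usage sketch of the running-average meter (class name, import path and
# constructor signature are assumptions inferred from the snippet above).
from EIVideo.paddlevideo.utils.record import AverageMeter

batch_time = AverageMeter('batch_time', fmt='.5f')
for cost in [0.21, 0.19, 0.20]:     # made-up per-step costs in seconds
    batch_time.update(cost, n=1)

print(batch_time.value)   # latest reading, e.g. "batch_time: 0.20000"
print(batch_time.mean)    # running average over all updates
```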
The logger outputs this formatted string to provide an informative summary of the task's progress.", + "type": "comment" + }, + "804": { + "file_id": 69, + "content": " batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,'\n batch_sum = str(metric_list['batch_time'].total) + ' sec,'\n metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].mean)\n metric_str = ' '.join([str(v) for v in metric_values])\n end_epoch_str = \"END epoch:{:<3d}\".format(epoch)\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {:s} {}\".format(\n coloring(end_epoch_str, \"RED\"), coloring(mode, \"PURPLE\"),\n coloring(metric_str, \"OKGREEN\"), coloring(batch_cost, \"OKGREEN\"),\n coloring(reader_cost, \"OKGREEN\"), coloring(batch_sum, \"OKGREEN\"), ips))", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:142-157" + }, + "805": { + "file_id": 69, + "content": "This code is formatting and logging information at the end of an epoch. It calculates various metric values, constructs a formatted string with different colors, and then logs this information using logger.info(). The metrics include batch time, reader time, total cost, mode, and inference per second. The strings are color-coded for visual clarity.", + "type": "comment" + }, + "806": { + "file_id": 70, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py", + "type": "filepath" + }, + "807": { + "file_id": 70, + "content": "The code defines a Registry class for mapping names to objects and provides methods for registering, getting, and unregistering objects. It utilizes the @BACKBONES.register() decorator or BACKBONES.register(ResNet) function for registration, and also verifies if an object with a given name exists in the registry using the `get` method.", + "type": "summary" + }, + "808": { + "file_id": 70, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nclass Registry(object):\n \"\"\"\n The registry that provides name -> object mapping, to support third-party users' custom modules.\n To register an object:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n @BACKBONES.register()\n class ResNet:\n pass\n Or:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n class ResNet:\n pass\n BACKBONES.register(ResNet)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:1-34" + }, + "809": { + "file_id": 70, + "content": "This code defines a Registry class that provides name to object mapping, allowing third-party users to register their custom modules. Users can register their objects by using the @BACKBONES.register() decorator or by calling BACKBONES.register(ResNet).", + "type": "comment" + }, + "810": { + "file_id": 70, + "content": " Usage: To build a module.\n .. 
code-block:: python\n backbone_name = \"ResNet\"\n b = BACKBONES.get(backbone_name)()\n \"\"\"\n def __init__(self, name):\n \"\"\"\n Args:\n name (str): the name of this registry\n \"\"\"\n self._name = name\n self._obj_map = {}\n def __contains__(self, key):\n return self._obj_map.get(key) is not None\n def _do_register(self, name, obj):\n assert (\n name not in self._obj_map\n ), \"An object named '{}' was already registered in '{}' registry!\".format(\n name, self._name)\n self._obj_map[name] = obj\n def register(self, obj=None, name=None):\n \"\"\"\n Register the given object under the the name `obj.__name__`.\n Can be used as either a decorator or not. See docstring of this class for usage.\n \"\"\"\n if obj is None:\n # used as a decorator\n def deco(func_or_class, name=name):\n if name is None:\n name = func_or_class.__name__", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:36-70" + }, + "811": { + "file_id": 70, + "content": "This code is a registry class for storing and managing objects. It allows registering objects under their names or using decorators, and provides functions to check if an object with a given name exists in the registry.", + "type": "comment" + }, + "812": { + "file_id": 70, + "content": " self._do_register(name, func_or_class)\n return func_or_class\n return deco\n # used as a function call\n if name is None:\n name = obj.__name__\n self._do_register(name, obj)\n def get(self, name):\n \"\"\"Get the registry record.\n Args:\n name (str): The class name.\n Returns:\n ret: The class.\n \"\"\"\n ret = self._obj_map.get(name)\n if ret is None:\n raise KeyError(\n \"No object named '{}' found in '{}' registry!\".format(\n name, self._name))\n return ret", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:71-96" + }, + "813": { + "file_id": 70, + "content": "The code defines a class with methods for registering, getting and unregistering objects in a registry. The `_do_register` method is used to store the object's name and function or class into a dictionary. If no name is provided when calling the function, it defaults to the object's name. The `get` method retrieves an object from the registry using its name. If the object is not found, it raises a KeyError with an error message.", + "type": "comment" + }, + "814": { + "file_id": 71, + "content": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py", + "type": "filepath" + }, + "815": { + "file_id": 71, + "content": "This code adapts ViT model parameters, modifies pos_embed and time_embed for compatibility, and includes functions for loading/saving PaddlePaddle models with parallel/non-parallel handling and progress bar.", + "type": "summary" + }, + "816": { + "file_id": 71, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
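The Registry above is used by first registering classes under a name and then looking them up by that name to instantiate. A small usage sketch; the import path and the DummyHead class are assumptions for illustration.

```python
# Usage sketch of the name -> class registry (import path is an assumption).
from EIVideo.paddlevideo.utils.registry import Registry

HEADS = Registry('head')

@HEADS.register()            # registers under the class name "DummyHead"
class DummyHead:
    def __init__(self, num_classes=10):
        self.num_classes = num_classes

head_cls = HEADS.get('DummyHead')   # name -> class lookup
head = head_cls(num_classes=5)
```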
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport os.path as osp\nimport time\nfrom tqdm import tqdm\nimport paddle\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils import get_logger\nfrom EIVideo.paddlevideo.utils import main_only\ndef pretrain_vit_param_trans(model, state_dicts, num_patches, seg_num,\n attention_type):\n \"\"\"\n Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:1-28" + }, + "817": { + "file_id": 71, + "content": "The code is a function that converts pre-trained ViT model parameters to match the existing model. It takes in the model, state_dicts, num_patches, seg_num, and attention_type as arguments. The function adapts the ViT's pre-trained model parameters for better compatibility with the existing model structure.", + "type": "comment" + }, + "818": { + "file_id": 71, + "content": " \"\"\"\n if 'head' + '.weight' in state_dicts:\n del state_dicts['head' + '.weight']\n if 'head' + '.bias' in state_dicts:\n del state_dicts['head' + '.bias']\n total_len = len(model.state_dict())\n if num_patches + 1 != state_dicts['pos_embed'].shape[1]:\n pos_embed = state_dicts['pos_embed']\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0,\n 1:, :].unsqueeze(0).unsqueeze(1).transpose(\n (0, 1, 3, 2))\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(other_pos_embed.shape[-2],\n num_patches),\n mode='nearest')\n new_pos_embed = new_pos_embed.squeeze(0).transpose((0, 2, 1))\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1)\n state_dicts['pos_embed'] = new_pos_embed\n time.sleep(0.01)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:29-49" + }, + "819": { + "file_id": 71, + "content": "This code modifies the 'pos_embed' tensor in state_dicts if its shape doesn't match the expected shape. It interpolates the other_pos_embed to fit the desired size, then concatenates the cls_pos_embed and new_pos_embed and updates the state_dicts['pos_embed']. 
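The pos_embed adaptation above keeps the class token's embedding and resizes the patch embeddings to the new patch count with nearest-neighbour interpolation. A standalone sketch of the same reshaping; the patch counts and embedding width are assumptions.

```python
# Resizing ViT positional embeddings to a new number of patches (sketch;
# 196 -> 784 patches and embed_dim=768 are assumed values).
import paddle
import paddle.nn.functional as F

old_patches, new_patches, dim = 196, 784, 768
pos_embed = paddle.randn([1, old_patches + 1, dim])

cls_pos = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)      # [1, 1, dim]
other = pos_embed[0, 1:, :].unsqueeze(0).unsqueeze(1)       # [1, 1, old_patches, dim]
other = other.transpose((0, 1, 3, 2))                       # [1, 1, dim, old_patches]
other = F.interpolate(other, size=(dim, new_patches), mode='nearest')
other = other.squeeze(0).transpose((0, 2, 1))               # [1, new_patches, dim]
new_pos_embed = paddle.concat([cls_pos, other], axis=1)     # [1, new_patches + 1, dim]
print(new_pos_embed.shape)
```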
This allows the code to maintain consistency in the 'pos_embed' tensor.", + "type": "comment" + }, + "820": { + "file_id": 71, + "content": " if 'time_embed' in state_dicts and seg_num != state_dicts[\n 'time_embed'].shape[1]:\n time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(time_embed.shape[-2], seg_num),\n mode='nearest')\n state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose(\n (0, 2, 1))\n time.sleep(0.01)\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n if attention_type == 'divided_space_time':\n new_state_dicts = state_dicts.copy()\n for key in tqdm(state_dicts):\n if 'blocks' in key and 'attn' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('attn', 'temporal_attn')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:51-71" + }, + "821": { + "file_id": 71, + "content": "The code checks if a specific key 'time_embed' exists in the state_dicts and adjusts its shape accordingly. It then interpolates the time_embed using nearest mode and transposes it to fit into the new shape. After that, it creates a progress bar \"Loading weights\" using tqdm for the total length of data and sets the description as the current key being processed. If 'attn' is present in the key and 'blocks', it replaces 'attn' with 'temporal_attn' if not already present in state_dicts and adds it to new_state_dicts.", + "type": "comment" + }, + "822": { + "file_id": 71, + "content": " else:\n new_state_dicts[new_key] = state_dicts[new_key]\n if 'blocks' in key and 'norm1' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('norm1', 'temporal_norm1')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]\n else:\n new_state_dicts[new_key] = state_dicts[new_key]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return new_state_dicts\ndef pretrain_resnet18_param_trans(model, loaded_dict):\n encoder_dict = model.encoder.state_dict()\n pose_encoder_dict = model.pose_encoder.state_dict()\n names = ['encoder.', 'encoder_day.', 'encoder_night.']\n for name in names:\n for key, value in loaded_dict.items():\n key = str(name + key)\n if key in encoder_dict:", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:72-96" + }, + "823": { + "file_id": 71, + "content": "This code is checking if a certain key exists in the state_dict and creating a new key with 'temporal' added to it. It is also updating the description for loading weights and returning the updated state_dicts. The function pretrain_resnet18_param_trans compares loaded dict with encoder and pose_encoder dictionaries in the model, possibly for parameter transfer learning.", + "type": "comment" + }, + "824": { + "file_id": 71, + "content": " encoder_dict[key] = value\n num_input_images = 2\n loaded_dict['conv1.weight'] = paddle.concat(\n [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images\n for name, value in loaded_dict.items():\n name = str('encoder.' 
+ name)\n if name in pose_encoder_dict:\n pose_encoder_dict[name] = value\n return encoder_dict, pose_encoder_dict\n#XXX(shipping): maybe need load N times because of different cards have different params.\n@main_only\ndef load_ckpt(model, weight_path, **kargs):\n \"\"\"\n 1. Load pre-trained model parameters\n 2. Extract and convert from the pre-trained model to the parameters\n required by the existing model\n 3. Load the converted parameters of the existing model\n \"\"\"\n #model.set_state_dict(state_dict)\n if not osp.isfile(weight_path):\n raise IOError(f'{weight_path} is not a checkpoint file')\n #state_dicts = load(weight_path)\n logger = get_logger(\"paddlevideo\")\n state_dicts = paddle.load(weight_path)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:97-127" + }, + "825": { + "file_id": 71, + "content": "The code is defining a function to load pre-trained model parameters, which requires converting the parameters of the pre-trained model into the parameters needed for the current model. The function first checks if the weight path exists and raises an IOError if it does not. Then, it loads the state_dicts from the given weight_path using paddle.load().", + "type": "comment" + }, + "826": { + "file_id": 71, + "content": " if 'ResnetEncoder' in str(model):\n encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans(\n model, state_dicts)\n tmp = model.state_dict()\n tmp.update(\n {'backbone.encoder.' + k: v\n for (k, v) in encoder_dict.items()})\n tmp.update({\n 'backbone.pose_encoder.' + k: v\n for (k, v) in pose_encoder_dict.items()\n })\n elif \"VisionTransformer\" in str(model): # For TimeSformer case\n tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'],\n kargs['seg_num'],\n kargs['attention_type'])\n else:\n tmp = {}\n total_len = len(model.state_dict())\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for item in tqdm(model.state_dict(), total=total_len, position=0):\n name = item\n desc.set_description('Loading %s' % name)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:128-152" + }, + "827": { + "file_id": 71, + "content": "This code is used to load weights for a model, specifically handling Resnet Encoder and Vision Transformer cases. For Resnet Encoder, it updates the state dictionary with separate dictionaries for encoder and pose_encoder. For Vision Transformer (TimeSformer), it uses pretrain_vit_param_trans function. If the model is neither of these types, it initializes an empty dictionary. The code also includes a tqdm progress bar to show the loading progress.", + "type": "comment" + }, + "828": { + "file_id": 71, + "content": " if name not in state_dicts: # Convert from non-parallel model\n if str('backbone.' + name) in state_dicts:\n tmp[name] = state_dicts['backbone.' 
+ name]\n else: # Convert from parallel model\n tmp[name] = state_dicts[name]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n model.set_state_dict(tmp)\ndef mkdir(dir):\n if not os.path.exists(dir):\n # avoid error when train with multiple gpus\n try:\n os.makedirs(dir)\n except:\n pass\n@main_only\ndef save(obj, path):\n paddle.save(obj, path)\ndef load(file_name):\n if not osp.isfile(file_name):\n raise IOError(f'{file_name} not exist')\n return paddle.load(file_name)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:153-182" + }, + "829": { + "file_id": 71, + "content": "This code snippet defines functions for loading and saving PaddlePaddle models. It checks if the model is parallel or non-parallel, converts the state dictionaries accordingly, and updates the model's state dictionary. The `mkdir` function creates a directory if it doesn't exist already, and there are separate save and load functions defined for ease of use.", + "type": "comment" + }, + "830": { + "file_id": 72, + "content": "/applications/EIVideo/EIVideo/paddlevideo/version.py", + "type": "filepath" + }, + "831": { + "file_id": 72, + "content": "This code contains the version information for PaddleVideo, licensed under the Apache License 2.0, and defines the current version as \"0.0.1\".", + "type": "summary" + }, + "832": { + "file_id": 72, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n__all__ = [\"paddlevideo_version\"]\npaddlevideo_version = \"0.0.1\"", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/version.py:1-16" + }, + "833": { + "file_id": 72, + "content": "This code contains the version information for PaddleVideo, licensed under the Apache License 2.0, and defines the current version as \"0.0.1\".", + "type": "comment" + }, + "834": { + "file_id": 73, + "content": "/applications/EIVideo/EIVideo/setup.py", + "type": "filepath" + }, + "835": { + "file_id": 73, + "content": "These comments provide author, date, and copyright information, indicating the source should be cited if the code is reprinted.", + "type": "summary" + }, + "836": { + "file_id": 73, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/11\n# Copyright belongs to the author.\n# Please indicate the source for reprinting.", + "type": "code", + "location": "/applications/EIVideo/EIVideo/setup.py:1-4" + }, + "837": { + "file_id": 73, + "content": "These comments provide author, date, and copyright information, indicating the source should be cited if the code is reprinted.", + "type": "comment" + }, + "838": { + "file_id": 74, + "content": "/applications/EIVideo/EIVideo/version.py", + "type": "filepath" + }, + "839": { + "file_id": 74, + "content": "This code snippet is the version information for the EIVideo application, created by Acer Zhang on January 11th, 2022. 
It has a version number \"0.1a\" and the author requests proper attribution if reusing the code.", + "type": "summary" + }, + "840": { + "file_id": 74, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/11 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\n__version__ = \"0.1a\"", + "type": "code", + "location": "/applications/EIVideo/EIVideo/version.py:1-6" + }, + "841": { + "file_id": 74, + "content": "This code snippet is the version information for the EIVideo application, created by Acer Zhang on January 11th, 2022. It has a version number \"0.1a\" and the author requests proper attribution if reusing the code.", + "type": "comment" + }, + "842": { + "file_id": 75, + "content": "/applications/EIVideo/QEIVideo/__init__.py", + "type": "filepath" + }, + "843": { + "file_id": 75, + "content": "Code sets the QEI_VIDEO_ROOT variable to the absolute path of the directory containing the current file. It also imports the version module from QEIVideo and assigns its __version__ attribute to a variable. This may be used for identifying the version of the QEIVideo application.", + "type": "summary" + }, + "844": { + "file_id": 75, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport os\nQEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__))\nimport os\nfrom QEIVideo.version import __version__\nQEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__))", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/__init__.py:1-13" + }, + "845": { + "file_id": 75, + "content": "Code sets the QEI_VIDEO_ROOT variable to the absolute path of the directory containing the current file. It also imports the version module from QEIVideo and assigns its __version__ attribute to a variable. This may be used for identifying the version of the QEIVideo application.", + "type": "comment" + }, + "846": { + "file_id": 76, + "content": "/applications/EIVideo/QEIVideo/build_gui.py", + "type": "filepath" + }, + "847": { + "file_id": 76, + "content": "The script uses PyQt5 to create a video processing GUI with functions for initializing variables, opening file dialogs, handling combo box indexing, and pen color changes. 
It also includes a `open_frame` function that updates the progress slider and stops the video at the last frame.", + "type": "summary" + }, + "848": { + "file_id": 76, + "content": "# Author: Acer Zhang\n# Datetime:2022/1/11 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport json\nimport os\nimport numpy as np\nfrom PIL import Image\nfrom PyQt5 import QtCore, QtWidgets\nfrom PyQt5.QtGui import *\nfrom PyQt5.QtWidgets import *\nfrom PyQt5.QtCore import *\nimport cv2\nfrom EIVideo.api import json2frame, png2json, load_video\nfrom EIVideo.main import main\n# ToDo To AP-kai: 这是定义前端临时保存用于推理的json的地点之类的,因为是固定的,所以声明为全局常量是最好的\nfrom EIVideo import TEMP_JSON_SAVE_PATH, TEMP_IMG_SAVE_PATH, TEMP_JSON_FINAL_PATH\nfrom QEIVideo.gui.ui_main_window import Ui_MainWindow\nclass BuildGUI(QMainWindow, Ui_MainWindow):\n def __init__(self):\n super(BuildGUI, self).__init__()\n # ToDo To AP-kai: 这里定义当前选择的视频路径的占位符,相当于全局变量\n self.select_video_path = None\n # ToDo To AP-kai: 未来为用户提供个保存路径的入口哈,这里先随意定义了个路径\n self.save_path = \"./result\"\n os.makedirs(self.save_path, exist_ok=True)\n self.setupUi(self)\n def infer(self):\n self.label.setText(\"Start infer\")", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/build_gui.py:1-36" + }, + "849": { + "file_id": 76, + "content": "This code is the initial part of a Python script for building a GUI (Graphical User Interface) application using PyQt5 library. It defines a class called BuildGUI that inherits from QMainWindow and Ui_MainWindow, which likely contains the layout and design elements of the GUI. The __init__ method sets up some initial variables such as the selected video path and save path for results. The infer method is meant to start an inference process.", + "type": "comment" + }, + "850": { + "file_id": 76, + "content": " self.progressBar.setProperty(\"value\", 0)\n image = self.paintBoard.get_content_as_q_image()\n image.save(TEMP_IMG_SAVE_PATH)\n print(self.slider_frame_num)\n self.progressBar.setProperty(\"value\", 25)\n # ToDo To AP-kai:相同的文件路径,直接定义一个常量就好\n png2json(TEMP_IMG_SAVE_PATH, self.slider_frame_num, TEMP_JSON_SAVE_PATH)\n self.progressBar.setProperty(\"value\", 50)\n # ToDo To AP-kai:打印的信息,需要注意首字母大写\n # ToDo To AP-kai: 此处传入保存路径以及当前选择的视频路径,最后会在manet_stage1.py里通过cfg来传入\n out = main(video_path=self.select_video_path, save_path=self.save_path)\n print('Infer ok')\n self.progressBar.setProperty(\"value\", 75)\n self.all_frames = json2frame(TEMP_JSON_FINAL_PATH)\n print(\"Success get submit_masks\")\n self.open_frame()\n self.progressBar.setProperty(\"value\", 100)\n self.label.setText(\"Infer succeed\")\n def btn_func(self, btn):\n if btn == self.playbtn:\n self.label.setText(\"Play video\")\n if self.progress_slider.value() == self.cap.get(7) - 1:", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/build_gui.py:37-59" + }, + "851": { + "file_id": 76, + "content": "The code snippet sets the progress bar value, saves an image, prints frame numbers, calls a main function to perform inference, loads JSON frames, opens a frame, and updates a label when play button is clicked.", + "type": "comment" + }, + "852": { + "file_id": 76, + "content": " self.slider_frame_num = 0\n self.progress_slider.setValue(self.slider_frame_num)\n self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7)))\n self.timer_camera = QTimer() # 定义定时器\n self.timer_camera.start(1000 / self.cap.get(cv2.CAP_PROP_FPS))\n self.slider_frame_num = self.progress_slider.value()\n 
self.timer_camera.timeout.connect(self.open_frame)\n elif btn == self.pushButton_2:\n self.label.setText(\"Stop video\")\n self.slot_stop()\n elif btn == self.pushButton_4:\n self.label.setText(\"Choose video\")\n self.select_video_path, _ = QFileDialog.getOpenFileName(self.frame, \"Open\", \"\", \"*.mp4;;All Files(*)\")\n print(\"-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\")\n print(\"Select video file path:\\t\" + self.select_video_path)\n # ToDo To AP-kai:下断点来看一下,如果不选择的时候返回值是什么样的,然后再做判断,目前这个if没有生效\n if self.select_video_path != \"\":", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/build_gui.py:60-78" + }, + "853": { + "file_id": 76, + "content": "The code above contains three elif conditions for button press events. If the self.pushButton_1 is pressed, it initializes variables and starts a timer to update the video frame. If self.pushButton_2 is pressed, it stops the video and sets the label text to \"Stop video\". If self.pushButton_4 is pressed, it opens a file dialog for choosing a video file, and if a file is chosen, it prints the selected video file path. The current code is checking if there is a non-empty selected video file path after the file dialog is closed.", + "type": "comment" + }, + "854": { + "file_id": 76, + "content": " self.cap = cv2.VideoCapture(self.select_video_path)\n # 存所有frame\n self.save_temp_frame()\n print(\"save temp frame done\")\n self.progress_slider.setRange(0, self.cap.get(cv2.CAP_PROP_FRAME_COUNT))\n self.slider_frame_num = 0\n self.open_frame()\n # ToDo To AP-kai: 未来这个地方增加提示框,告诉他没有选择文件\n def on_cbtn_eraser_clicked(self):\n self.label.setText(\"Eraser On\")\n if self.cbtn_Eraser.isChecked():\n self.paintBoard.EraserMode = True # 进入橡皮擦模式\n else:\n self.paintBoard.EraserMode = False # 退出橡皮擦模式\n def fill_color_list(self, combo_box):\n index_black = 0\n index = 0\n for color in self.colorList:\n if color == \"black\":\n index_black = index\n index += 1\n pix = QPixmap(70, 20)\n pix.fill(QColor(color))\n combo_box.addItem(QIcon(pix), None)\n combo_box.setIconSize(QSize(70, 20))\n combo_box.setSizeAdjustPolicy(QComboBox.AdjustToContents)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/build_gui.py:79-107" + }, + "855": { + "file_id": 76, + "content": "The code snippet creates a GUI for video processing. It sets up a VideoCapture object, stores all frames, and initializes a progress slider with the total number of frames. The Eraser button toggles between EraserMode on/off in the paintBoard. 
The fill_color_list function populates a color combo box with predefined colors, including black at a specific index.", + "type": "comment" + }, + "856": { + "file_id": 76, + "content": " combo_box.setCurrentIndex(index_black)\n def on_pen_color_change(self):\n self.label.setText(\"Change pen color\")\n color_index = self.comboBox_penColor.currentIndex()\n color_str = self.colorList[color_index]\n self.paintBoard.change_pen_color(color_str)\n # 拖拽进度条\n def update_video_position_func(self):\n self.label.setText(\"Change slider position\")\n self.slider_frame_num = self.progress_slider.value()\n self.slot_stop()\n self.open_frame()\n self.progress_slider.setValue(self.slider_frame_num)\n self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7)))\n def save_temp_frame(self):\n _, self.all_frames = load_video(self.select_video_path, 480)\n def slot_stop(self):\n if self.cap != []:\n self.timer_camera.stop() # 停止计时器\n else:\n # ToDo To AP-kai: QMessageBox.warning没有返回值,这里我把Warming = QMessageBox.warning的Warming删去了\n QMessageBox.warning(self, \"Warming\", \"Push the left upper corner button to Quit.\",", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/build_gui.py:109-135" + }, + "857": { + "file_id": 76, + "content": "This code defines several functions for a GUI application. It sets the current index of a combo box, handles pen color changes by updating the paintBoard's color, updates the video position based on progress slider input, saves a temporary frame from a video file, and stops the timer if it is running.", + "type": "comment" + }, + "858": { + "file_id": 76, + "content": " QMessageBox.Yes)\n def open_frame(self):\n self.progress_slider.setValue(self.slider_frame_num)\n self.slider_frame_num = self.progress_slider.value()\n self.frame = self.all_frames[self.slider_frame_num]\n frame = self.frame\n height, width, bytes_per_component = frame.shape\n bytes_per_line = bytes_per_component * width\n q_image = QImage(frame.data, width, height, bytes_per_line,\n QImage.Format_RGB888).scaled(self.picturelabel.width(), self.picturelabel.height())\n self.picturelabel.setPixmap(QPixmap.fromImage(q_image))\n self.slider_frame_num = self.slider_frame_num + 1\n self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7)))\n if self.progress_slider.value() == self.cap.get(7) - 1:\n self.slot_stop()", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/build_gui.py:136-151" + }, + "859": { + "file_id": 76, + "content": "This code defines a function `open_frame` which updates the progress slider, displays the current frame using QImage and QPixmap, increments the slider value, sets the time label, and stops the video if at the last frame.", + "type": "comment" + }, + "860": { + "file_id": 77, + "content": "/applications/EIVideo/QEIVideo/gui/__init__.py", + "type": "filepath" + }, + "861": { + "file_id": 77, + "content": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. 
The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "type": "summary" + }, + "862": { + "file_id": 77, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/__init__.py:1-4" + }, + "863": { + "file_id": 77, + "content": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "type": "comment" + }, + "864": { + "file_id": 78, + "content": "/applications/EIVideo/QEIVideo/gui/demo.py", + "type": "filepath" + }, + "865": { + "file_id": 78, + "content": "The code defines a `DrawFrame` class, which is a QWidget for drawing paths and responding to mouse events. It is used in conjunction with other classes such as `DemoUI`. The 'export' function converts painter's polygon to fill polygon and is triggered by the 'start_btn'. The code initializes the application and starts the event loop.", + "type": "summary" + }, + "866": { + "file_id": 78, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport sys\nfrom PyQt5.QtWidgets import QApplication, QMainWindow, QFrame, QWidget\nfrom PyQt5.QtGui import QPainter, QPixmap, QPen, QColor, QPainterPath\nfrom PyQt5.QtCore import Qt, QPoint\nfrom PyQt5 import QtCore, QtGui, QtWidgets\nfrom QEIVideo.ui.demo import Ui_MainWindow as DemoUIRoot\nclass DrawFrame(QWidget):\n def __init__(self, painter, *args, **kwargs):\n super(DrawFrame, self).__init__(*args, **kwargs)\n self.painter = painter\n def paintEvent(self, event):\n painter = QPainter(self)\n pen = QPen(QColor(\"orange\"))\n pen.setWidth(5)\n pen.setCapStyle(Qt.RoundCap)\n pen.setJoinStyle(Qt.RoundJoin)\n painter.setPen(pen)\n painter.drawPath(self.painter)\n def mousePressEvent(self, event):\n self.painter.moveTo(event.pos())\n self.update()\n def mouseMoveEvent(self, event):\n self.painter.lineTo(event.pos())\n self.update()", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/demo.py:1-36" + }, + "867": { + "file_id": 78, + "content": "This code defines a class called DrawFrame, which is a QWidget that can be drawn on. It overrides the paintEvent method to draw paths using a QPainter, and responds to mouse events for line drawing. 
The class takes a painter object in its constructor, suggesting it could be used in conjunction with other classes or methods.", + "type": "comment" + }, + "868": { + "file_id": 78, + "content": "class DemoUI(QMainWindow, DemoUIRoot):\n def __init__(self):\n super(DemoUI, self).__init__()\n self.setupUi(self)\n self.painter = QPainterPath()\n self.draw_frame = DrawFrame(self.painter, self.video_frame)\n self.draw_frame.setGeometry(QtCore.QRect(0, 10, 751, 301))\n self.draw_frame.setObjectName(\"draw_frame\")\n self.draw_frame.raise_()\n self.draw_frame.setAttribute(QtCore.Qt.WA_TranslucentBackground)\n self.start_btn.clicked.connect(self.export)\n def export(self):\n a = self.painter.toFillPolygon()\n pass\nif __name__ == '__main__':\n app = QApplication(sys.argv)\n gui_class = DemoUI()\n gui_class.show()\n sys.exit(app.exec_())", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/demo.py:39-62" + }, + "869": { + "file_id": 78, + "content": "This code initializes a `DemoUI` class with a `DrawFrame` object that draws video frames. The `export` function converts the painter's polygon to a fill polygon and is triggered by the 'start_btn'. The code also sets up the application, creates an instance of the `DemoUI` class, and starts the event loop.", + "type": "comment" + }, + "870": { + "file_id": 79, + "content": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py", + "type": "filepath" + }, + "871": { + "file_id": 79, + "content": "This code initializes a video/painting app with UI elements, connects events to functions for smooth operation, and displays \"Hi, This is EIVideo\" on stop.", + "type": "summary" + }, + "872": { + "file_id": 79, + "content": "# -*- coding: utf-8 -*-\n# Form implementation generated from reading ui file 'GUI.ui'\n#\n# Created by: PyQt5 UI code generator 5.15.2\n#\n# WARNING: Any manual changes made to this file will be lost when pyuic5 is\n# run again. 
Do not edit this file unless you know what you are doing.\nfrom PyQt5 import QtCore, QtWidgets\nfrom PyQt5.QtGui import *\nfrom PyQt5.QtWidgets import *\nfrom PyQt5.QtCore import *\nfrom QEIVideo.widget.PaintBoard import PaintBoard\nclass Ui_MainWindow(object):\n def setupUi(self, MainWindow):\n MainWindow.setObjectName(\"EIVideo\")\n MainWindow.resize(1101, 751)\n self.centralwidget = QtWidgets.QWidget(MainWindow)\n self.centralwidget.setObjectName(\"centralwidget\")\n self.frame = QtWidgets.QFrame(self.centralwidget)\n self.frame.setGeometry(QtCore.QRect(20, 20, 1271, 771))\n self.frame.setFrameShadow(QtWidgets.QFrame.Raised)\n self.frame.setObjectName(\"frame\")\n self.cap = []\n self.all_frames = []\n self.fps = None\n self.timer = QTimer(self.frame)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:1-32" + }, + "873": { + "file_id": 79, + "content": "This code initializes the main window of a video application, sets its size and geometry, creates an empty list for capturing frames, and starts a timer.", + "type": "comment" + }, + "874": { + "file_id": 79, + "content": " self.time_label = QLabel('--/--', self.frame)\n self.progress_slider = QSlider(self.frame)\n self.progress_slider.setEnabled(True)\n self.progress_slider.setOrientation(Qt.Horizontal)\n self.progress_slider.setFixedWidth(710)\n self.progress_slider.setFixedHeight(20)\n self.progress_slider.setSingleStep(1) # 设置变化步长\n self.progress_slider.setValue(0)\n self.progress_slider.sliderReleased.connect(self.update_video_position_func) # 拖拽进度条\n self.picturelabel = QtWidgets.QLabel(self.frame)\n self.picturelabel.setGeometry(30, 30, 810, 458)\n self.picturelabel.setText(\"\")\n self.picturelabel.setObjectName(\"picturelabel\")\n self.paintBoard = PaintBoard(self.frame)\n self.paintBoard.setGeometry(30, 30, 810, 458)\n self.cbtn_Eraser = QCheckBox(\"橡皮擦\")\n self.cbtn_Eraser.setParent(self.frame)\n self.cbtn_Eraser.move(950, 40)\n self.cbtn_Eraser.clicked.connect(self.on_cbtn_eraser_clicked)\n self.btn_Clear = QPushButton(\"清空画板\")", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:33-56" + }, + "875": { + "file_id": 79, + "content": "This code is initializing UI elements in a window, setting up progress slider for video position tracking, adding a picture label and paint board for drawing, and enabling interaction with eraser checkbox and clear button.", + "type": "comment" + }, + "876": { + "file_id": 79, + "content": " self.btn_Clear.setParent(self.frame) # 设置父对象为本界面\n self.btn_Clear.move(950, 60)\n self.btn_Clear.clicked.connect(self.paintBoard.clear)\n self.label_penColor = QLabel(self.frame)\n self.label_penColor.setText(\"画笔颜色\")\n self.label_penColor.move(990, 100)\n # 获取颜色列表(字符串类型)\n self.colorList = QColor.colorNames()\n self.comboBox_penColor = QComboBox(self.frame)\n self.fill_color_list(self.comboBox_penColor) # 用各种颜色填充下拉列表\n self.comboBox_penColor.move(1080, 80)\n self.comboBox_penColor.currentIndexChanged.connect(\n self.on_pen_color_change) # 关联下拉列表的当前索引变更信号与函数on_PenColorChange\n self.helplabel = QLabel()\n self.helplabel.setText(\"Hi,Welcome to use EIVideo\\n\"\n \"This is a guide for EIVideo,\\n\"\n \"please check\\n\"\n \"1. Choose 'Add' for a video\\n\"\n \"2. Click 'Play' to start playing\\n\"\n \"3. 
At this point, all functions \\n\"", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:57-77" + }, + "877": { + "file_id": 79, + "content": "The code sets the parent object of btn_Clear to self.frame, moves the btn_Clear and label_penColor widgets to specific positions, connects a button click event to clear the paintBoard, fills a comboBox_penColor with color options, places it at a particular location, links its currentIndexChanged signal to on_pen_color_change function, and provides guidance for using EIVideo.", + "type": "comment" + }, + "878": { + "file_id": 79, + "content": " \"are unlocked\\n\"\n \"4. Paint and enjoy it!\\n\")\n self.widget2 = QtWidgets.QWidget(self.frame)\n self.widget2.setGeometry(860, 60, 200, 300)\n self.widget2.setObjectName(\"widget2\")\n self.rightLayout = QtWidgets.QVBoxLayout(self.widget2)\n self.rightLayout.setContentsMargins(0, 0, 0, 0)\n self.rightLayout.setObjectName(\"rightLayout\")\n self.rightLayout.addWidget(self.helplabel)\n self.rightLayout.addSpacing(50)\n self.rightLayout.addWidget(self.cbtn_Eraser)\n self.rightLayout.addWidget(self.btn_Clear)\n self.colorLayout = QtWidgets.QHBoxLayout(self.widget2)\n self.colorLayout.setContentsMargins(0, 0, 0, 0)\n self.colorLayout.setObjectName('colorLayout')\n self.colorLayout.addWidget(self.label_penColor)\n self.colorLayout.addWidget(self.comboBox_penColor)\n self.rightLayout.addLayout(self.colorLayout)\n # pushButton_6 -> GO\n self.pushButton_6 = QtWidgets.QPushButton(self.frame)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:78-101" + }, + "879": { + "file_id": 79, + "content": "This code sets up a user interface layout for a painting application. It includes a help label, buttons to clear and switch the eraser, a color picker, and a 'GO' push button. The layout is organized in a vertical box and horizontal box arrangement.", + "type": "comment" + }, + "880": { + "file_id": 79, + "content": " self.pushButton_6.setGeometry(870, 600, 150, 90)\n self.pushButton_6.setObjectName(\"pushButton_6\")\n self.pushButton_6.clicked.connect(self.infer)\n self.widget1 = QtWidgets.QWidget(self.frame)\n self.widget1.move(60, 520)\n self.widget1.setObjectName(\"widget1\")\n self.barLayout = QtWidgets.QVBoxLayout(self.widget1)\n self.barLayout.setContentsMargins(0, 0, 0, 0)\n self.barLayout.setObjectName(\"barLayout\")\n self.horizontalLayout = QtWidgets.QHBoxLayout(self.widget1)\n self.horizontalLayout.setContentsMargins(0, 0, 0, 0)\n self.horizontalLayout.setObjectName(\"horizontalLayout\")\n self.timeLayout = QtWidgets.QHBoxLayout(self.widget1)\n self.timeLayout.setContentsMargins(0, 0, 0, 0)\n self.timeLayout.setObjectName(\"horizontalLayout\")\n self.playbtn = QtWidgets.QPushButton(self.widget1)\n self.playbtn.setObjectName(\"playbtn\")\n self.playbtn.clicked.connect(lambda: self.btn_func(self.playbtn))\n self.horizontalLayout.addWidget(self.playbtn)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:102-122" + }, + "881": { + "file_id": 79, + "content": "This code is creating a UI for a video player. It sets the position and function of a play button, as well as defining layouts for other UI elements such as time display. 
The play button's click event is connected to a method called \"btn_func\" which takes the play button as an argument.", + "type": "comment" + }, + "882": { + "file_id": 79, + "content": " self.pushButton_2 = QtWidgets.QPushButton(self.widget1)\n self.pushButton_2.setObjectName(\"pushButton_2\")\n self.pushButton_2.clicked.connect(lambda: self.btn_func(self.pushButton_2))\n self.horizontalLayout.addWidget(self.pushButton_2)\n self.pushButton_4 = QtWidgets.QPushButton(self.widget1)\n self.pushButton_4.setObjectName(\"pushButton_4\")\n self.pushButton_4.clicked.connect(lambda: self.btn_func(self.pushButton_4))\n self.horizontalLayout.addWidget(self.pushButton_4)\n self.timeLayout.addWidget(self.progress_slider)\n self.timeLayout.addWidget(self.time_label)\n self.barLayout.addSpacing(20)\n self.barLayout.addLayout(self.timeLayout)\n self.barLayout.addSpacing(30)\n self.barLayout.addLayout(self.horizontalLayout)\n self.splitter = QtWidgets.QSplitter(self.frame)\n self.splitter.setGeometry(QtCore.QRect(71, 670, 750, 20))\n self.splitter.setOrientation(QtCore.Qt.Horizontal)\n self.splitter.setObjectName(\"splitter\")", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:123-142" + }, + "883": { + "file_id": 79, + "content": "This code initializes two push buttons, connects their click events to a function, adds them to a horizontal layout, and adds the layout to a splitter. It also adds time-related widgets to another layout, and adds both layouts to the splitter.", + "type": "comment" + }, + "884": { + "file_id": 79, + "content": " self.label = QtWidgets.QLabel(self.splitter)\n self.label.setObjectName(\"label\")\n self.progressBar = QtWidgets.QProgressBar(self.splitter)\n self.progressBar.setProperty(\"value\", 0)\n self.progressBar.setObjectName(\"progressBar\")\n MainWindow.setCentralWidget(self.centralwidget)\n self.menubar = QtWidgets.QMenuBar(MainWindow)\n self.menubar.setGeometry(QtCore.QRect(0, 0, 1327, 23))\n self.menubar.setObjectName(\"menubar\")\n MainWindow.setMenuBar(self.menubar)\n self.statusbar = QtWidgets.QStatusBar(MainWindow)\n self.statusbar.setObjectName(\"statusbar\")\n MainWindow.setStatusBar(self.statusbar)\n self.retranslateUi(MainWindow)\n QtCore.QMetaObject.connectSlotsByName(MainWindow)\n def retranslateUi(self, MainWindow):\n _translate = QtCore.QCoreApplication.translate\n MainWindow.setWindowTitle(_translate(\"MainWindow\", \"MainWindow\"))\n self.pushButton_6.setText(_translate(\"MainWindow\", \"GO\"))\n self.playbtn.setText(_translate(\"MainWindow\", \"Play\"))", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:143-164" + }, + "885": { + "file_id": 79, + "content": "This code is creating a user interface for a main window of an application. It includes a label, progress bar, menu bar, and status bar. The window has a title and two buttons: \"GO\" and \"Play\". The `retranslateUi` function is used to set the window title and button labels.", + "type": "comment" + }, + "886": { + "file_id": 79, + "content": " self.pushButton_2.setText(_translate(\"MainWindow\", \"Stop\"))\n self.pushButton_4.setText(_translate(\"MainWindow\", \"Add\"))\n self.label.setText(_translate(\"MainWindow\", \"Hi, This is EIVideo\"))", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:165-167" + }, + "887": { + "file_id": 79, + "content": "This code updates the text on three GUI elements in the \"MainWindow\" class. 
The first button is labeled \"Stop,\" the second button is labeled \"Add,\" and a label displays \"Hi, This is EIVideo.\"", + "type": "comment" + }, + "888": { + "file_id": 80, + "content": "/applications/EIVideo/QEIVideo/start.py", + "type": "filepath" + }, + "889": { + "file_id": 80, + "content": "The code is a Python script that initializes a QApplication object, creates an instance of the BuildGUI class from the QEIVideo module, displays it, and executes the application's event loop. It is likely used to launch a graphical user interface (GUI) for a video processing or analysis application. The author is credited, and the code includes a copyright notice requesting proper attribution if reused.", + "type": "summary" + }, + "890": { + "file_id": 80, + "content": "# Author: AP-Kai\n# Datetime: 2022/1/7\n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport sys\nfrom QEIVideo.build_gui import BuildGUI\nfrom PyQt5.QtWidgets import QApplication\ndef run():\n app = QApplication(sys.argv)\n demo = BuildGUI()\n demo.show()\n sys.exit(app.exec())\nif __name__ == '__main__':\n run()", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/start.py:1-20" + }, + "891": { + "file_id": 80, + "content": "The code is a Python script that initializes a QApplication object, creates an instance of the BuildGUI class from the QEIVideo module, displays it, and executes the application's event loop. It is likely used to launch a graphical user interface (GUI) for a video processing or analysis application. The author is credited, and the code includes a copyright notice requesting proper attribution if reused.", + "type": "comment" + }, + "892": { + "file_id": 81, + "content": "/applications/EIVideo/QEIVideo/tools/__init__.py", + "type": "filepath" + }, + "893": { + "file_id": 81, + "content": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "type": "summary" + }, + "894": { + "file_id": 81, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/__init__.py:1-4" + }, + "895": { + "file_id": 81, + "content": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "type": "comment" + }, + "896": { + "file_id": 82, + "content": "/applications/EIVideo/QEIVideo/ui/__init__.py", + "type": "filepath" + }, + "897": { + "file_id": 82, + "content": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. 
The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "type": "summary" + }, + "898": { + "file_id": 82, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/gui/__init__.py:1-4" + }, + "899": { + "file_id": 82, + "content": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/80.json b/docs/data/80.json new file mode 100644 index 000000000..250492bba --- /dev/null +++ b/docs/data/80.json @@ -0,0 +1,545 @@ +{ + "8000": { + "file_id": 589, + "content": " num, cnt = np.unique(label, return_counts=True)\n for n, c in zip(num, cnt):\n nums[n] += c\n class_num = paddle.to_tensor(nums, dtype=\"float32\")\n total = class_num.sum().item()\n frequency = class_num / total\n median = paddle.median(frequency)\n class_weight = median / frequency\n return class_weight\n def forward(self, preds, gts, sim_index):\n \"\"\"\n Args:\n preds: paddle.float (N, C, T).\n gts: paddle.int64 (N, T).\n sim_index: paddle.float (N, C', T).\n \"\"\"\n loss = 0.0\n for criterion, weight in zip(self.criterions, self.weights):\n if isinstance(criterion, GaussianSimilarityTMSE):\n loss += weight * criterion(preds, gts, sim_index)\n elif isinstance(criterion, nn.CrossEntropyLoss):\n preds_t = paddle.transpose(preds, perm=[0, 2, 1])\n loss += weight * criterion(preds_t, gts)\n else:\n loss += weight * criterion(preds, gts)", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:222-248" + }, + "8001": { + "file_id": 589, + "content": "This code defines a class that calculates class weights based on the frequency of occurrence in labels. It also includes a forward function for applying different loss functions to predictions and ground truths, with associated weights. The criterion types include GaussianSimilarityTMSE and nn.CrossEntropyLoss.", + "type": "comment" + }, + "8002": { + "file_id": 589, + "content": " return loss\nclass BoundaryRegressionLoss(nn.Layer):\n \"\"\"\n Boundary Regression Loss\n bce: Binary Cross Entropy Loss for Boundary Prediction\n mse: Mean Squared Error\n \"\"\"\n def __init__(self,\n file_path,\n label_path,\n bce=True,\n focal=False,\n mse=False,\n weight=None,\n pos_weight=None):\n super().__init__()\n self.criterions = []\n self.file_path = file_path\n self.label_path = label_path\n pos_weight = self.get_pos_weight()\n if bce:\n self.criterions.append(\n nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight))\n if focal:\n self.criterions.append(FocalLoss())\n if mse:\n self.criterions.append(nn.MSELoss())\n if len(self.criterions) == 0:\n print(\"You have to choose at least one loss function.\")\n sys.exit(1)\n def get_pos_weight(self, norm=None):\n \"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:250-291" + }, + "8003": { + "file_id": 589, + "content": "This class defines a boundary regression loss function, which combines different loss types such as Binary Cross Entropy Loss (bce), Mean Squared Error (mse) and Focal Loss (focal). 
It initializes with file_path, label_path, bce, focal, mse, weight and pos_weight parameters. The get_pos_weight method retrieves a position weight depending on the norm parameter. If at least one loss function is chosen, the criterions list is created. If not, it prints an error message and exits the program.", + "type": "comment" + }, + "8004": { + "file_id": 589, + "content": "        pos_weight for binary cross entropy with logits loss\n        pos_weight is defined as reciprocal of ratio of positive samples in the dataset\n        \"\"\"\n        # load file list\n        file_ptr = open(self.file_path, 'r')\n        info = file_ptr.read().split('\\n')[:-1]\n        file_ptr.close()\n        n_classes = 2  # boundary or not\n        nums = [0 for i in range(n_classes)]\n        for i in range(len(info)):\n            video_name = info[i]\n            file_name = video_name.split('.')[0] + \".npy\"\n            label_file_path = os.path.join(self.label_path, file_name)\n            label = np.load(label_file_path).astype(np.int64)\n            num, cnt = np.unique(label, return_counts=True)\n            for n, c in zip(num, cnt):\n                nums[n] += c\n        pos_ratio = nums[1] / sum(nums)\n        pos_weight = 1 / pos_ratio\n        if norm is not None:\n            pos_weight /= norm\n        return paddle.to_tensor(pos_weight, dtype=\"float32\")\n    def forward(self, preds, gts):\n        \"\"\"\n        Args:", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:292-321" + }, + "8005": { + "file_id": 589, + "content": "This code calculates the positive weight for binary cross-entropy with logits loss. It loads file information from a given path, counts the number of positive and negative samples, then calculates the ratio of positive samples to total samples. The positive weight is set as the reciprocal of this ratio. If a normalization factor is provided, it divides the positive weight by this factor before returning it in float32 tensor format.", + "type": "comment" + }, + "8006": { + "file_id": 589, + "content": "            preds: paddle.float (N, 1, T).\n            gts: paddle.float (N, 1, T).\n        \"\"\"\n        loss = 0.0\n        batch_size = float(preds.shape[0])\n        for criterion in self.criterions:\n            for pred, gt in zip(preds, gts):\n                loss += criterion(pred, gt)\n        return loss / batch_size\n@LOSSES.register()\nclass ASRFLoss(nn.Layer):\n    def __init__(self,\n                 lambda_bound_loss,\n                 num_classes,\n                 file_path,\n                 label_path,\n                 boundary_path,\n                 ce=True,\n                 asl_focal=True,\n                 tmse=False,\n                 gstmse=False,\n                 asl_weight=None,\n                 threshold=4.,\n                 ignore_index=255,\n                 ce_weight=1.0,\n                 focal_weight=1.0,\n                 tmse_weight=0.15,\n                 gstmse_weight=0.15,\n                 bce=True,\n                 brl_focal=False,\n                 mse=False,\n                 brl_weight=None):\n        super().__init__()", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:322-359" + }, + "8007": { + "file_id": 589, + "content": "The ASRFLoss class is the loss of the ASRF action segmentation model: it combines an action segmentation loss and a boundary regression loss built from various criteria applied to predictions and ground truth. It uses configurable weights for the CE, focal, TMSE and GS-TMSE classification losses and for the BCE, focal and MSE boundary losses, depending on the input parameters. 
The function returns the average loss across all criterions and samples.", + "type": "comment" + }, + "8008": { + "file_id": 589, + "content": " self.criterion_cls = ActionSegmentationLoss(ce=ce,\n focal=asl_focal,\n tmse=tmse,\n gstmse=gstmse,\n weight=asl_weight,\n threshold=threshold,\n ignore_index=ignore_index,\n ce_weight=ce_weight,\n focal_weight=focal_weight,\n tmse_weight=tmse_weight,\n gstmse_weight=gstmse_weight,\n file_path=file_path,\n label_path=label_path,\n num_classes=num_classes)", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:360-373" + }, + "8009": { + "file_id": 589, + "content": "This code initializes an ActionSegmentationLoss object with specified parameters for classification loss, focal loss, and temporal segmentation losses. It also takes weights and file paths as inputs to optimize the model's performance.", + "type": "comment" + }, + "8010": { + "file_id": 589, + "content": " self.criterion_boundary = BoundaryRegressionLoss(\n bce=bce,\n focal=brl_focal,\n mse=mse,\n weight=brl_weight,\n file_path=file_path,\n label_path=boundary_path)\n self.lambda_bound_loss = lambda_bound_loss\n def forward(self, x, output_cls, label, outputs_boundary, boundary):\n loss = 0.0\n if isinstance(output_cls, list):\n n = len(output_cls)\n for out in output_cls:\n loss += self.criterion_cls(out, label, x) / n\n else:\n loss += self.criterion_cls(output_cls, label, x)\n if isinstance(outputs_boundary, list):\n n = len(outputs_boundary)\n for out in outputs_boundary:\n loss += self.lambda_bound_loss * self.criterion_boundary(\n out, boundary) / n\n else:\n loss += self.lambda_bound_loss * self.criterion_boundary(\n outputs_boundary, boundary)\n return loss", + "type": "code", + "location": "/paddlevideo/modeling/losses/asrf_loss.py:374-401" + }, + "8011": { + "file_id": 589, + "content": "This code defines a custom loss function for a video modeling framework. It initializes a boundary regression loss criterion and takes a weighted average of classification and boundary losses. The forward method calculates the total loss by summing weighted classification and boundary losses, and returns the final loss value.", + "type": "comment" + }, + "8012": { + "file_id": 590, + "content": "/paddlevideo/modeling/losses/base.py", + "type": "filepath" + }, + "8013": { + "file_id": 590, + "content": "This code is a PaddlePaddle base class for loss functions, requiring subclasses to override `_forward()` and supports optional weight scaling. It initializes the loss class and defines forward pass computation.", + "type": "summary" + }, + "8014": { + "file_id": 590, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\n#XXX use _forward?? 
or forward??\nclass BaseWeightedLoss(nn.Layer):\n \"\"\"Base class for loss.\n All subclass should overwrite the ``_forward()`` method which returns the\n normal loss without loss weights.\n Args:\n loss_weight (float): Factor scalar multiplied on the loss.\n Default: 1.0.\n \"\"\"\n def __init__(self, loss_weight=1.0):", + "type": "code", + "location": "/paddlevideo/modeling/losses/base.py:1-31" + }, + "8015": { + "file_id": 590, + "content": "This code is the base class for a loss function in PaddlePaddle. It requires subclasses to override the `_forward()` method, which returns the normal loss without weights. The `loss_weight` parameter is optional and defaults to 1.0, which can be used to scale the final loss value.", + "type": "comment" + }, + "8016": { + "file_id": 590, + "content": " super().__init__()\n self.loss_weight = loss_weight\n @abstractmethod\n def _forward(self, *args, **kwargs):\n pass\n def forward(self, *args, **kwargs):\n \"\"\"Defines the computation performed at every call.\n Args:\n *args: The positional arguments for the corresponding\n loss.\n **kwargs: The keyword arguments for the corresponding\n loss.\n Returns:\n paddle.Tensor: The calculated loss.\n \"\"\"\n return self._forward(*args, **kwargs) * self.loss_weight", + "type": "code", + "location": "/paddlevideo/modeling/losses/base.py:32-49" + }, + "8017": { + "file_id": 590, + "content": "Initializes the loss class with a weight and defines forward pass for computation.", + "type": "comment" + }, + "8018": { + "file_id": 591, + "content": "/paddlevideo/modeling/losses/bmn_loss.py", + "type": "filepath" + }, + "8019": { + "file_id": 591, + "content": "This code defines a BMN loss function for PaddleVideo, considering time-scale attributes and ratio of positive entries. It also includes a loss function for object detection models with weighted samples, position losses, and ground truth IoU masks. The code further defines a loss function for PEM and TEAM tasks by combining predicted and ground truth values using three loss functions.", + "type": "summary" + }, + "8020": { + "file_id": 591, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass BMNLoss(BaseWeightedLoss):\n \"\"\"Loss for BMN model\n Args:\n tscale (int): sequence length, default 100.\n dscale (int): max duration length, default 100.\n \"\"\"\n def __init__(self, dscale, tscale):\n super().__init__()\n self.dscale = dscale", + "type": "code", + "location": "/paddlevideo/modeling/losses/bmn_loss.py:1-32" + }, + "8021": { + "file_id": 591, + "content": "This code defines a BMN loss function for the PaddleVideo library. It is registered in the LOSSES registry and takes two arguments: dscale and tscale, which represent max duration length and sequence length respectively. 
The class extends BaseWeightedLoss, suggesting it combines multiple weighted losses.", + "type": "comment" + }, + "8022": { + "file_id": 591, + "content": " self.tscale = tscale\n def _get_mask(self, dscale, tscale):\n bm_mask = []\n for idx in range(dscale):\n mask_vector = [1 for i in range(tscale - idx)\n ] + [0 for i in range(idx)]\n bm_mask.append(mask_vector)\n bm_mask = np.array(bm_mask, dtype='float32')\n bm_mask = paddle.to_tensor(bm_mask)\n bm_mask.stop_gradient = True\n return bm_mask\n def tem_loss_func(self, pred_start, pred_end, gt_start, gt_end):\n def bi_loss(pred_score, gt_label, datatype):\n pred_score = paddle.reshape(x=pred_score, shape=[-1])\n gt_label = paddle.reshape(x=gt_label, shape=[-1])\n gt_label.stop_gradient = True\n pmask = paddle.cast(x=(gt_label > 0.5), dtype=datatype)\n num_entries = paddle.cast(paddle.shape(pmask), dtype=datatype)\n num_positive = paddle.cast(paddle.sum(pmask), dtype=datatype)\n ratio = num_entries / num_positive\n coef_0 = 0.5 * ratio / (ratio - 1)", + "type": "code", + "location": "/paddlevideo/modeling/losses/bmn_loss.py:33-55" + }, + "8023": { + "file_id": 591, + "content": "This code defines a class with a time-scale attribute, a method to create binary mask arrays, and a loss function for a specific task. The loss function takes in predicted start and end positions along with ground truth values and calculates a ratio between the number of entries and positive values. This ratio is then used to calculate a coefficient for the loss function.", + "type": "comment" + }, + "8024": { + "file_id": 591, + "content": " coef_1 = 0.5 * ratio\n epsilon = 0.000001\n loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask)\n loss_pos = coef_1 * paddle.mean(loss_pos)\n loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon),\n (1.0 - pmask))\n loss_neg = coef_0 * paddle.mean(loss_neg)\n loss = -1 * (loss_pos + loss_neg)\n return loss\n loss_start = bi_loss(pred_start, gt_start, pred_start.dtype)\n loss_end = bi_loss(pred_end, gt_end, pred_start.dtype)\n loss = loss_start + loss_end\n return loss\n def pem_reg_loss_func(self, pred_score, gt_iou_map, mask):\n gt_iou_map = paddle.multiply(gt_iou_map, mask)\n u_hmask = paddle.cast(x=gt_iou_map > 0.7, dtype=pred_score.dtype)\n u_mmask = paddle.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3)\n u_mmask = paddle.cast(x=u_mmask, dtype=pred_score.dtype)\n u_lmask = paddle.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.)", + "type": "code", + "location": "/paddlevideo/modeling/losses/bmn_loss.py:56-77" + }, + "8025": { + "file_id": 591, + "content": "The code defines a loss function for object detection models. It calculates the loss by considering positive and negative samples, applying weights to each sample based on their ratio, and then combines them. The bi_loss function is used to calculate losses for start and end positions. In another function, Pem_reg_loss_func, it separates ground truth IoU map into three masks: high (>0.7), medium (<=0.7 & >0.3), and low (<=0.3 & >=0). 
It then applies these masks to calculate the loss.", + "type": "comment" + }, + "8026": { + "file_id": 591, + "content": " u_lmask = paddle.cast(x=u_lmask, dtype=pred_score.dtype)\n u_lmask = paddle.multiply(u_lmask, mask)\n num_h = paddle.cast(paddle.sum(u_hmask), dtype=pred_score.dtype)\n num_m = paddle.cast(paddle.sum(u_mmask), dtype=pred_score.dtype)\n num_l = paddle.cast(paddle.sum(u_lmask), dtype=pred_score.dtype)\n r_m = num_h / num_m\n u_smmask = paddle.uniform(shape=[\n gt_iou_map.shape[1], gt_iou_map.shape[2]\n ],\n min=0.0,\n max=1.0).astype(pred_score.dtype)\n u_smmask = paddle.multiply(u_mmask, u_smmask)\n u_smmask = paddle.cast(x=(u_smmask > (1. - r_m)),\n dtype=pred_score.dtype)\n r_l = num_h / num_l\n u_slmask = paddle.uniform(shape=[\n gt_iou_map.shape[1], gt_iou_map.shape[2]\n ],\n min=0.0,\n max=1.0).astype(pred_score.dtype)\n u_slmask = paddle.multiply(u_lmask, u_slmask)", + "type": "code", + "location": "/paddlevideo/modeling/losses/bmn_loss.py:78-101" + }, + "8027": { + "file_id": 591, + "content": "Calculating the number of elements in different masks and using them to calculate ratios for later mask operations. Creating uniform masks and multiplying them with corresponding existing masks, then casting the results.", + "type": "comment" + }, + "8028": { + "file_id": 591, + "content": " u_slmask = paddle.cast(x=(u_slmask > (1. - r_l)),\n dtype=pred_score.dtype)\n weights = u_hmask + u_smmask + u_slmask\n weights.stop_gradient = True\n loss = F.square_error_cost(pred_score, gt_iou_map)\n loss = paddle.multiply(loss, weights)\n loss = 0.5 * paddle.sum(loss) / paddle.sum(weights)\n return loss\n def pem_cls_loss_func(self, pred_score, gt_iou_map, mask):\n gt_iou_map = paddle.multiply(gt_iou_map, mask)\n gt_iou_map.stop_gradient = True\n pmask = paddle.cast(x=(gt_iou_map > 0.9), dtype=pred_score.dtype)\n nmask = paddle.cast(x=(gt_iou_map <= 0.9), dtype=pred_score.dtype)\n nmask = paddle.multiply(nmask, mask)\n num_positive = paddle.sum(pmask)\n num_entries = num_positive + paddle.sum(nmask)\n ratio = num_entries / num_positive\n coef_0 = 0.5 * ratio / (ratio - 1)\n coef_1 = 0.5 * ratio\n epsilon = 0.000001\n loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask)", + "type": "code", + "location": "/paddlevideo/modeling/losses/bmn_loss.py:102-126" + }, + "8029": { + "file_id": 591, + "content": "In this code, u_slmask is created by comparing r_l with 1 and casting the result to the dtype of pred_score. Then, weights are calculated by adding u_hmask, u_smmask, and u_slmask. The stop_gradient attribute of weights is set to True. Loss is calculated using square error cost between pred_score and gt_iou_map, multiplied by weights, averaged, and returned.\nIn the pem_cls_loss_func, gt_iou_map is multiplied by mask and marked as non-trainable (stop_gradient = True). Pmask and nmask are created based on conditions with gt_iou_map and mask. Num_positive and num_entries are calculated. Ratios are used to determine coef_0 and coef_1. 
Loss_pos is log(pred_score + epsilon) multiplied by pmask.", + "type": "comment" + }, + "8030": { + "file_id": 591, + "content": "        loss_pos = coef_1 * paddle.sum(loss_pos)\n        loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon),\n                                   nmask)\n        loss_neg = coef_0 * paddle.sum(loss_neg)\n        loss = -1 * (loss_pos + loss_neg) / num_entries\n        return loss\n    def forward(self, pred_bm, pred_start, pred_end, gt_iou_map, gt_start,\n                gt_end):\n        pred_bm_reg = paddle.squeeze(paddle.slice(pred_bm,\n                                                  axes=[1],\n                                                  starts=[0],\n                                                  ends=[1]),\n                                     axis=[1])\n        pred_bm_cls = paddle.squeeze(paddle.slice(pred_bm,\n                                                  axes=[1],\n                                                  starts=[1],\n                                                  ends=[2]),\n                                     axis=[1])\n        bm_mask = self._get_mask(self.dscale, self.tscale)", + "type": "code", + "location": "/paddlevideo/modeling/losses/bmn_loss.py:127-147" + }, + "8031": { + "file_id": 591, + "content": "Function `forward` takes in `pred_bm`, `pred_start`, `pred_end`, `gt_iou_map`, `gt_start`, and `gt_end`. It first extracts `pred_bm_reg` and `pred_bm_cls` by slicing `pred_bm` along the specified axes. Then, it calculates the `bm_mask` using `_get_mask` with given scales. The function returns the calculated loss from the input parameters.", + "type": "comment" + }, + "8032": { + "file_id": 591, + "content": "        pem_reg_loss = self.pem_reg_loss_func(pred_bm_reg, gt_iou_map, bm_mask)\n        pem_cls_loss = self.pem_cls_loss_func(pred_bm_cls, gt_iou_map, bm_mask)\n        tem_loss = self.tem_loss_func(pred_start, pred_end, gt_start, gt_end)\n        loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss\n        return loss", + "type": "code", + "location": "/paddlevideo/modeling/losses/bmn_loss.py:149-155" + }, + "8033": { + "file_id": 591, + "content": "This code calculates the total loss for the PEM and TEM branches by combining the predicted and ground truth values. It uses three loss functions: `pem_reg_loss_func`, `pem_cls_loss_func`, and `tem_loss_func`. The final loss is the sum of the temporal (TEM) loss, 10 times the PEM regression loss, and the PEM classification loss.", + "type": "comment" + }, + "8034": { + "file_id": 592, + "content": "/paddlevideo/modeling/losses/cross_entropy_loss.py", + "type": "filepath" + }, + "8035": { + "file_id": 592, + "content": "CrossEntropyLoss is a custom loss function in PaddlePaddle, inheriting from BaseWeightedLoss, for classification tasks. It calculates CrossEntropy loss between scores and labels using F.cross_entropy method and returns the result as a tensor.", + "type": "summary" + }, + "8036": { + "file_id": 592, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass CrossEntropyLoss(BaseWeightedLoss):\n \"\"\"Cross Entropy Loss.\"\"\"\n def _forward(self, score, labels, **kwargs):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels.\n kwargs: Any keyword argument to be used to calculate", + "type": "code", + "location": "/paddlevideo/modeling/losses/cross_entropy_loss.py:1-30" + }, + "8037": { + "file_id": 592, + "content": "CrossEntropyLoss is a custom loss function in PaddlePaddle for classification tasks. It inherits from BaseWeightedLoss and has a forward method that takes class scores and labels as input.", + "type": "comment" + }, + "8038": { + "file_id": 592, + "content": " CrossEntropy loss.\n Returns:\n loss (paddle.Tensor): The returned CrossEntropy loss.\n \"\"\"\n loss = F.cross_entropy(score, labels, **kwargs)\n return loss", + "type": "code", + "location": "/paddlevideo/modeling/losses/cross_entropy_loss.py:31-36" + }, + "8039": { + "file_id": 592, + "content": "This function calculates the CrossEntropy loss between score and labels, using Paddle's F.cross_entropy method, and returns the result as a tensor.", + "type": "comment" + }, + "8040": { + "file_id": 593, + "content": "/paddlevideo/modeling/losses/depth_loss.py", + "type": "filepath" + }, + "8041": { + "file_id": 593, + "content": "This code calculates smoothness and reprojection losses for depth estimation tasks, combining identity and reprojection losses to compute disparity loss. It handles day and night scenarios while saving images if necessary. The total loss is stored in the losses dictionary.", + "type": "summary" + }, + "8042": { + "file_id": 593, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\ndef get_smooth_loss(disp, img):\n \"\"\"Computes the smoothness loss for a disparity image\n The color image is used for edge-aware smoothness\n \"\"\"\n grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])\n grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])\n grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]),", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:1-29" + }, + "8043": { + "file_id": 593, + "content": "This code defines a function \"get_smooth_loss\" that calculates the smoothness loss for disparity images using color image gradients and disparity image gradients. It uses PaddlePaddle library functions like paddle.abs() and paddle.mean(). The function is part of the BaseWeightedLoss class in the LOSSES registry.", + "type": "comment" + }, + "8044": { + "file_id": 593, + "content": " 1,\n keepdim=True)\n grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]),\n 1,\n keepdim=True)\n grad_disp_x *= paddle.exp(-grad_img_x)\n grad_disp_y *= paddle.exp(-grad_img_y)\n return grad_disp_x.mean() + grad_disp_y.mean()\nclass DiffLoss(nn.Layer):\n def __init__(self):\n super(DiffLoss, self).__init__()\n def forward(self, input1, input2):\n batch_size = input1.shape[0]\n input1 = input1.reshape([batch_size, -1])\n input2 = input2.reshape([batch_size, -1])\n input1_l2 = input1\n input2_l2 = input2\n diff_loss = 0\n dim = input1.shape[1]\n for i in range(input1.shape[0]):\n diff_loss = diff_loss + paddle.mean(\n ((input1_l2[i:i + 1, :].mm(input2_l2[i:i + 1, :].T)).pow(2)) /\n dim)\n diff_loss = diff_loss / input1.shape[0]\n return diff_loss\nclass MSE(nn.Layer):\n def __init__(self):", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:30-67" + }, + "8045": { + "file_id": 593, + "content": "The code defines two classes: DiffLoss and MSE. DiffLoss calculates the loss between two inputs using L2 norm, while MSE calculates mean squared error loss for a single input. The function on lines 29-66 calculates gradients of disparity maps using image differences, applies exponential decay based on gradient values, and returns their average. 
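The edge-aware smoothness term summarized above penalizes disparity gradients except where the image itself has strong gradients. A small NumPy sketch of the same idea, assuming NCHW arrays (names are illustrative, not taken from depth_loss.py):

```python
import numpy as np

def smooth_loss(disp, img):
    """Edge-aware smoothness: disparity gradients gated by image gradients."""
    grad_disp_x = np.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])
    grad_disp_y = np.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])
    grad_img_x = np.abs(img[:, :, :, :-1] - img[:, :, :, 1:]).mean(1, keepdims=True)
    grad_img_y = np.abs(img[:, :, :-1, :] - img[:, :, 1:, :]).mean(1, keepdims=True)
    grad_disp_x *= np.exp(-grad_img_x)   # strong image edge -> small penalty
    grad_disp_y *= np.exp(-grad_img_y)
    return grad_disp_x.mean() + grad_disp_y.mean()
```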
This seems to be related to depth estimation or disparity prediction in computer vision tasks.", + "type": "comment" + }, + "8046": { + "file_id": 593, + "content": " super(MSE, self).__init__()\n def forward(self, pred, real):\n diffs = paddle.add(real, -pred)\n n = paddle.numel(diffs)\n mse = paddle.sum(diffs.pow(2)) / n\n return mse\nclass SIMSE(nn.Layer):\n def __init__(self):\n super(SIMSE, self).__init__()\n def forward(self, pred, real):\n diffs = paddle.add(real, -pred)\n n = paddle.numel(diffs)\n simse = paddle.sum(diffs).pow(2) / (n**2)\n return simse\nclass SSIM(nn.Layer):\n \"\"\"Layer to compute the SSIM loss between a pair of images\n \"\"\"\n def __init__(self):\n super(SSIM, self).__init__()\n self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.refl = nn.Pad2D(1, mode='reflect')\n self.C1 = 0.01**2\n self.C2 = 0.03**2", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:68-104" + }, + "8047": { + "file_id": 593, + "content": "MSE class is a mean squared error loss function for PaddlePaddle, SIMSE class calculates the structured iterative mean squared error loss, and SSIM class computes the structural similarity index (SSIM) loss between a pair of images using various pooling operations and constants.", + "type": "comment" + }, + "8048": { + "file_id": 593, + "content": " def forward(self, x, y):\n x = self.refl(x)\n y = self.refl(y)\n mu_x = self.mu_x_pool(x)\n mu_y = self.mu_y_pool(y)\n sigma_x = self.sig_x_pool(x**2) - mu_x**2\n sigma_y = self.sig_y_pool(y**2) - mu_y**2\n sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y\n SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)\n SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2)\n return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1)\n@LOSSES.register()\nclass ADDSLoss(BaseWeightedLoss):\n def __init__(self, avg_reprojection, disparity_smoothness, no_ssim):\n super(ADDSLoss, self).__init__()\n self.avg_reprojection = avg_reprojection\n self.disparity_smoothness = disparity_smoothness\n self.no_ssim = no_ssim\n self.loss_diff = DiffLoss()\n self.loss_recon1 = MSE()\n self.loss_recon2 = SIMSE()\n self.loss_similarity = MSE()\n def compute_reprojection_loss(self, pred, target):\n \"\"\"Computes reprojection loss between a batch of predicted and target images", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:106-137" + }, + "8049": { + "file_id": 593, + "content": "This code defines a forward function for calculating the SSIM loss, which is used in the ADDSLoss class. The SSIM loss measures the structural similarity between two images and takes into account luminance (mu_x and mu_y) and contrast (sigma_x and sigma_y) for each image. It also considers the covariance of the two images (sigma_xy). 
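The SSIM statistics mentioned here can be sketched on a single-channel image pair. The PaddlePaddle layer uses 3x3 average pooling with reflection padding; the SciPy `uniform_filter` below is only an approximation of that, and the function name is an assumption:

```python
import numpy as np
from scipy.ndimage import uniform_filter

def ssim_loss_map(x, y, C1=0.01 ** 2, C2=0.03 ** 2):
    """Sketch of the SSIM-based photometric loss on HxW arrays (0 = identical)."""
    mu_x, mu_y = uniform_filter(x, 3), uniform_filter(y, 3)     # local means
    sigma_x = uniform_filter(x * x, 3) - mu_x ** 2               # local variances
    sigma_y = uniform_filter(y * y, 3) - mu_y ** 2
    sigma_xy = uniform_filter(x * y, 3) - mu_x * mu_y            # local covariance
    ssim_n = (2 * mu_x * mu_y + C1) * (2 * sigma_xy + C2)
    ssim_d = (mu_x ** 2 + mu_y ** 2 + C1) * (sigma_x + sigma_y + C2)
    return np.clip((1 - ssim_n / ssim_d) / 2, 0, 1)
```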
The SSIM loss is then used in the ADDSLoss class to compute the reprojection loss between predicted and target images.", + "type": "comment" + }, + "8050": { + "file_id": 593, + "content": " \"\"\"\n abs_diff = paddle.abs(target - pred)\n l1_loss = abs_diff.mean(1, True)\n if not self.no_ssim:\n self.ssim = SSIM()\n if self.no_ssim:\n reprojection_loss = l1_loss\n else:\n ssim_loss = self.ssim(pred, target).mean(1, True)\n reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss\n return reprojection_loss\n def compute_losses(self, inputs, outputs, is_night):\n \"\"\"Compute the reprojection and smoothness losses for a minibatch\n \"\"\"\n losses = {}\n total_loss = 0\n for scale in outputs['scales']:\n loss = 0\n reprojection_losses = []\n source_scale = 0\n disp = outputs[(\"disp\", scale)]\n if is_night:\n color = inputs[(\"color_n\", 0, scale)]\n target = inputs[(\"color_n\", 0, source_scale)]\n else:\n color = inputs[(\"color\", 0, scale)]\n target = inputs[(\"color\", 0, source_scale)]\n for frame_id in outputs['frame_ids'][1:]:", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:138-173" + }, + "8051": { + "file_id": 593, + "content": "This code computes the reprojection and smoothness losses for a minibatch by iterating over different scales. It calculates the L1 loss between the predicted depth and the target depth, and optionally computes the SSIM (Structural Similarity Index) loss as well. The reprojection loss is determined based on these two values, with 85% weighted towards the SSIM loss and 15% towards the L1 loss. The total loss for the minibatch is accumulated in the \"total_loss\" variable.", + "type": "comment" + }, + "8052": { + "file_id": 593, + "content": " pred = outputs[(\"color\", frame_id, scale)]\n reprojection_losses.append(\n self.compute_reprojection_loss(pred, target))\n reprojection_losses = paddle.concat(reprojection_losses, 1)\n identity_reprojection_losses = []\n for frame_id in outputs['frame_ids'][1:]:\n if is_night:\n pred = inputs[(\"color_n\", frame_id, source_scale)]\n else:\n pred = inputs[(\"color\", frame_id, source_scale)]\n identity_reprojection_losses.append(\n self.compute_reprojection_loss(pred, target))\n identity_reprojection_losses = paddle.concat(\n identity_reprojection_losses, 1)\n if self.avg_reprojection:\n identity_reprojection_loss = identity_reprojection_losses.mean(\n 1, keepdim=True)\n else:\n # save both images, and do min all at once below\n identity_reprojection_loss = identity_reprojection_losses", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:174-197" + }, + "8053": { + "file_id": 593, + "content": "This code computes reprojection losses for day and night scenarios, concatenates them into a single tensor, and then checks if average reprojection loss should be computed. 
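The automasking step described in these summaries (compare identity and predicted reprojection errors, optimize the per-pixel minimum) can be illustrated with a short NumPy sketch. Array names and shapes are assumptions for illustration, not PaddleVideo identifiers:

```python
import numpy as np

def per_pixel_min_loss(reproj, identity_reproj, avg_reprojection=False):
    """reproj / identity_reproj: [N, num_frames, H, W] reprojection errors."""
    if avg_reprojection:
        reproj = reproj.mean(1, keepdims=True)
        identity_reproj = identity_reproj.mean(1, keepdims=True)
    # tiny noise breaks ties so static pixels fall back to the identity branch
    identity_reproj = identity_reproj + np.random.randn(*identity_reproj.shape) * 1e-5
    combined = np.concatenate([identity_reproj, reproj], axis=1)
    return np.min(combined, axis=1).mean()
```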
If not, it saves both images and performs minimum operation all at once.", + "type": "comment" + }, + "8054": { + "file_id": 593, + "content": " if self.avg_reprojection:\n reprojection_loss = reprojection_losses.mean(1, keepdim=True)\n else:\n reprojection_loss = reprojection_losses\n # add random numbers to break ties\n identity_reprojection_loss = identity_reprojection_loss + paddle.randn(\n identity_reprojection_loss.shape) * 0.00001\n combined = paddle.concat(\n (identity_reprojection_loss, reprojection_loss), axis=1)\n if combined.shape[1] == 1:\n to_optimise = combined\n else:\n to_optimise = paddle.min(combined, axis=1)\n loss = loss + to_optimise.mean()\n mean_disp = disp.mean(2, True).mean(3, True)\n norm_disp = disp / (mean_disp + 1e-7)\n smooth_loss = get_smooth_loss(norm_disp, color)\n loss = loss + self.disparity_smoothness * smooth_loss / (2**scale)\n total_loss = total_loss + loss\n losses[\"loss/{}\".format(scale)] = loss", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:199-223" + }, + "8055": { + "file_id": 593, + "content": "This code calculates the depth loss by combining identity and reprojection losses, adds random numbers to break ties, concatenates them, selects minimum values for optimization, and calculates disparity smoothness loss. It then updates the total loss and stores it in the losses dictionary.", + "type": "comment" + }, + "8056": { + "file_id": 593, + "content": " total_loss /= len(outputs['scales'])\n losses[\"loss\"] = total_loss\n return losses\n def forward(self, inputs, outputs):\n losses_day = self.compute_losses(inputs, outputs, 'day')\n losses_night = self.compute_losses(inputs, outputs['outputs_night'],\n 'night')\n loss = 0\n losses = []\n # diff\n target_diff1 = 0.5 * self.loss_diff(\n outputs['result'][0], outputs['result'][2]) # 10 when batchsize=1\n target_diff2 = 0.5 * self.loss_diff(outputs['result_night'][0],\n outputs['result_night'][2])\n losses.append(target_diff1)\n losses.append(target_diff2)\n loss = loss + target_diff1\n loss = loss + target_diff2\n target_diff3 = 1 * self.loss_diff(\n outputs['result'][1], outputs['result'][3]) # 10 when batchsize=1\n target_diff4 = 1 * self.loss_diff(outputs['result_night'][1],\n outputs['result_night'][3])", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:225-250" + }, + "8057": { + "file_id": 593, + "content": "This code computes losses for both day and night scenes in a video, using the compute_losses function. It appends two target differences to the 'loss' list and adds them to the total loss. The target_diff1 and target_diff2 are calculated by the loss_diff function, comparing specific elements from the outputs. Target_diff3 and target_diff4 are also computed in a similar manner. 
The final total loss is divided by the number of scales and stored in the losses dictionary before returning.", + "type": "comment" + }, + "8058": { + "file_id": 593, + "content": " losses.append(target_diff3)\n losses.append(target_diff4)\n loss = loss + target_diff3\n loss = loss + target_diff4\n # recon\n target_mse = 1 * self.loss_recon1(outputs['result'][5],\n inputs[\"color_aug\", 0, 0])\n loss = loss + target_mse\n target_simse = 1 * self.loss_recon2(outputs['result'][5],\n inputs[\"color_aug\", 0, 0])\n loss = loss + target_simse\n losses.append(target_mse)\n losses.append(target_simse)\n target_mse_night = 1 * self.loss_recon1(outputs['result_night'][5],\n inputs[\"color_n_aug\", 0, 0])\n loss = loss + target_mse_night\n target_simse_night = 1 * self.loss_recon2(outputs['result_night'][5],\n inputs[\"color_n_aug\", 0, 0])\n loss = loss + target_simse_night\n losses.append(target_mse_night)\n losses.append(target_simse_night)", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:251-276" + }, + "8059": { + "file_id": 593, + "content": "The code calculates multiple losses, including depth and reconstruction, for both daytime and night-time scenes. It then adds these losses to the total loss and appends them to the 'losses' list.", + "type": "comment" + }, + "8060": { + "file_id": 593, + "content": " # depth loss\n pseudo_label = outputs[(\"disp\", 0)].detach()\n depth_loss = 1 * self.loss_similarity(\n outputs['outputs_night'][(\"disp\", 0)], pseudo_label)\n loss = loss + depth_loss\n losses.append(depth_loss)\n outputs['loss'] = loss + losses_day['loss'] + losses_night['loss']\n outputs['losses_day'] = losses_day['loss']\n outputs['losses_night'] = losses_night['loss']\n return outputs", + "type": "code", + "location": "/paddlevideo/modeling/losses/depth_loss.py:278-290" + }, + "8061": { + "file_id": 593, + "content": "This code calculates a depth loss by comparing predicted depths with detached pseudo-labels, then adds it to the overall loss and appends it to the losses list. Finally, it updates the output dictionary with the total loss and separate day/night losses before returning the updated outputs.", + "type": "comment" + }, + "8062": { + "file_id": 594, + "content": "/paddlevideo/modeling/losses/distillation_loss.py", + "type": "filepath" + }, + "8063": { + "file_id": 594, + "content": "This code defines Distillation Entropy Loss and KL divergence loss classes, implementing CrossEntropy loss for single/triple labels and KL divergence respectively, with optional weighted average and activation functions.", + "type": "summary" + }, + "8064": { + "file_id": 594, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass DistillationCELoss(BaseWeightedLoss):\n \"\"\"Distillation Entropy Loss.\"\"\"\n def _forward(self, score, labels, **kwargs):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels.", + "type": "code", + "location": "/paddlevideo/modeling/losses/distillation_loss.py:1-30" + }, + "8065": { + "file_id": 594, + "content": "Defines a Distillation Entropy Loss class, which inherits from BaseWeightedLoss and takes score and labels as input for its forward function.", + "type": "comment" + }, + "8066": { + "file_id": 594, + "content": " kwargs: Any keyword argument to be used to calculate\n CrossEntropy loss.\n Returns:\n loss (paddle.Tensor): The returned CrossEntropy loss.\n \"\"\"\n if len(labels) == 1:\n label = labels[0]\n loss = F.cross_entropy(score, label, **kwargs)\n # Deal with VideoMix\n elif len(labels) == 3:\n label_a, label_b, lam = labels\n loss_a = F.cross_entropy(score, label_a, **kwargs)\n loss_b = F.cross_entropy(score, label_b, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n loss = paddle.mean(loss) #lam shape is bs\n return loss\n@LOSSES.register()\nclass DistillationDMLLoss(BaseWeightedLoss):\n \"\"\"\n DistillationDMLLoss\n \"\"\"\n def __init__(self, act=\"softmax\", eps=1e-12, **kargs):\n super().__init__(**kargs)\n if act is not None:\n assert act in [\"softmax\", \"sigmoid\"]\n if act == \"softmax\":\n self.act = nn.Softmax(axis=-1)\n elif act == \"sigmoid\":", + "type": "code", + "location": "/paddlevideo/modeling/losses/distillation_loss.py:31-60" + }, + "8067": { + "file_id": 594, + "content": "The code defines a loss function that calculates CrossEntropy loss and supports both single and triple labels. For single label, it directly calculates the CrossEntropy loss. For triple labels, it first calculates two separate CrossEntropy losses, then combines them with a weighted average based on a given lambda value (lam). 
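A short PaddlePaddle sketch of the mixed-label branch described above, blending two cross-entropy terms by lam. The tensors below are made-up example inputs, not values from the library:

```python
import paddle
import paddle.nn.functional as F

score = paddle.randn([4, 10])                 # logits for 4 clips, 10 classes
label_a = paddle.to_tensor([1, 2, 3, 4])
label_b = paddle.to_tensor([5, 6, 7, 8])
lam = paddle.full([4], 0.7)                   # per-sample mixing weight

loss_a = F.cross_entropy(score, label_a, reduction='none')
loss_b = F.cross_entropy(score, label_b, reduction='none')
loss = paddle.mean(lam * loss_a + (1 - lam) * loss_b)
```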
The DistillationDMLLoss class implements this behavior and also handles the act parameter for specifying different activation functions.", + "type": "comment" + }, + "8068": { + "file_id": 594, + "content": " self.act = nn.Sigmoid()\n else:\n self.act = None\n self.eps = eps\n def _kldiv(self, x, target):\n class_num = x.shape[-1]\n cost = target * paddle.log(\n (target + self.eps) / (x + self.eps)) * class_num\n return cost\n def _forward(self, x, target):\n if self.act is not None:\n x = self.act(x)\n target = self.act(target)\n loss = self._kldiv(x, target) + self._kldiv(target, x)\n loss = loss / 2\n loss = paddle.mean(loss)\n return loss", + "type": "code", + "location": "/paddlevideo/modeling/losses/distillation_loss.py:61-79" + }, + "8069": { + "file_id": 594, + "content": "This code defines a class for implementing the Kullback-Leibler (KL) divergence loss. The constructor takes an optional activation function and epsilon for numerical stability. The _kldiv method calculates the KL divergence between two vectors, while the _forward method applies the activation function if provided and computes the final loss by averaging the KL divergences in both directions.", + "type": "comment" + }, + "8070": { + "file_id": 595, + "content": "/paddlevideo/modeling/losses/transnetv2_loss.py", + "type": "filepath" + }, + "8071": { + "file_id": 595, + "content": "This code defines a class \"TransNetV2Loss\" for calculating TransNetV2 model loss with transition_weight and many-hot_loss_weight parameters, using weighted binary cross-entropy loss for one-hot and many-hot predictions. The snippet returns the total loss from TransNetV2 components.", + "type": "summary" + }, + "8072": { + "file_id": 595, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass TransNetV2Loss(BaseWeightedLoss):\n \"\"\"Loss for TransNetV2 model\n \"\"\"\n def __init__(self, transition_weight=5.0, many_hot_loss_weight=0.1):\n self.transition_weight = transition_weight\n self.many_hot_loss_weight = many_hot_loss_weight\n super().__init__()", + "type": "code", + "location": "/paddlevideo/modeling/losses/transnetv2_loss.py:1-28" + }, + "8073": { + "file_id": 595, + "content": "This code defines a class called \"TransNetV2Loss\" for calculating the loss in TransNetV2 model. 
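The symmetric KL computation summarized above can be condensed into a few lines. A hedged PaddlePaddle sketch following the same _kldiv formula (function and argument names are mine, not the library's):

```python
import paddle
import paddle.nn.functional as F

def dml_loss(logits_s, logits_t, eps=1e-12):
    """Symmetric, class-scaled KL between two softmax distributions (sketch)."""
    p = F.softmax(logits_s, axis=-1)
    q = F.softmax(logits_t, axis=-1)
    num_classes = p.shape[-1]
    kl_pq = q * paddle.log((q + eps) / (p + eps)) * num_classes
    kl_qp = p * paddle.log((p + eps) / (q + eps)) * num_classes
    return paddle.mean((kl_pq + kl_qp) / 2)
```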
It inherits from BaseWeightedLoss and takes transition_weight and many_hot_loss_weight as parameters for customizing the loss calculation.", + "type": "comment" + }, + "8074": { + "file_id": 595, + "content": " def _forward(self, one_hot_pred, one_hot_gt,\n many_hot_pred=None, many_hot_gt=None, reg_losses=None):\n assert transition_weight != 1\n one_hot_pred = one_hot_pred[:, :, 0]\n one_hot_gt = one_hot_gt.astype('float32')\n one_hot_loss = F.binary_cross_entropy_with_logits(logit=one_hot_pred, label=one_hot_gt, reduction='none')\n one_hot_loss *= 1 + one_hot_gt * (transition_weight - 1)\n one_hot_loss = paddle.mean(one_hot_loss)\n many_hot_loss = 0.\n if many_hot_loss_weight != 0. and many_hot_pred is not None:\n many_hot_loss = many_hot_loss_weight * paddle.mean(\n F.binary_cross_entropy_with_logits(logit=many_hot_pred[:, :, 0],\n label=many_hot_gt.astype('float32'), reduction='none'))\n total_loss = one_hot_loss + many_hot_loss\n if reg_losses is not None:\n for name, value in reg_losses.items():\n if value is not None:\n total_loss += value", + "type": "code", + "location": "/paddlevideo/modeling/losses/transnetv2_loss.py:30-54" + }, + "8075": { + "file_id": 595, + "content": "This code defines a loss function for the TransNetV2 model, taking in one-hot and many-hot predictions and ground truth labels. It calculates the binary cross-entropy loss for both types of predictions, applies a weighted factor based on transition weight, and averages the losses before summing them together.", + "type": "comment" + }, + "8076": { + "file_id": 595, + "content": " return total_loss", + "type": "code", + "location": "/paddlevideo/modeling/losses/transnetv2_loss.py:56-56" + }, + "8077": { + "file_id": 595, + "content": "This code snippet is returning the total loss computed from various components of the TransNetV2 model.", + "type": "comment" + }, + "8078": { + "file_id": 596, + "content": "/paddlevideo/modeling/losses/yowo_loss.py", + "type": "filepath" + }, + "8079": { + "file_id": 596, + "content": "FocalLoss optimizes hard examples in object detection, while YowoLoss and RegionLoss use softmax encoding. Code prepares input with reshaping, sigmoid activation, and anchor parameters. The code calculates YOLOv3-style losses for bounding box location, confidence, and classification on GPU.", + "type": "summary" + }, + "8080": { + "file_id": 596, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
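The transition weighting mentioned here boils down to an element-wise re-scaling of a standard BCE-with-logits loss. A minimal sketch of that idea (the helper name is an assumption; the quoted source applies the same factor inside its _forward):

```python
import paddle
import paddle.nn.functional as F

def transition_weighted_bce(one_hot_pred, one_hot_gt, transition_weight=5.0):
    """Up-weight frames labelled as shot transitions (sketch)."""
    gt = one_hot_gt.astype('float32')
    loss = F.binary_cross_entropy_with_logits(logit=one_hot_pred, label=gt,
                                              reduction='none')
    loss = loss * (1 + gt * (transition_weight - 1))   # transition frames count 5x
    return paddle.mean(loss)
```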
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nfrom paddle.static import Variable\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\nfrom ..framework.localizers.yowo_utils import build_targets\nclass FocalLoss(nn.Layer):\n \"\"\"\n This criterion is a implemenation of Focal Loss, which is proposed in\n Focal Loss for Dense Object Detection.\n Loss(x, class) = - \\alpha (1-softmax(x)[class])^gamma \\log(softmax(x)[class])", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:1-31" + }, + "8081": { + "file_id": 596, + "content": "This code snippet defines a FocalLoss class that implements the Focal Loss criterion. It is used for dense object detection and aims to reduce the classification loss for well-classified samples, focusing more on hard examples. The formula for the loss is given as -α(1-softmax(x)[class])^γ * log(softmax(x)[class]).", + "type": "comment" + }, + "8082": { + "file_id": 596, + "content": " The losses are averaged across observations for each minibatch.\n Args:\n alpha(1D Tensor, Variable) : the scalar factor for this criterion\n gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),\n putting more focus on hard, misclassified examples\n size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch.\n However, if the field size_average is set to False, the losses are\n instead summed for each minibatch.\n \"\"\"\n def __init__(self, class_num, alpha=None, gamma=2, size_average=True):\n super(FocalLoss, self).__init__()\n if alpha is None:\n self.alpha = paddle.ones(\n [class_num, 1])\n self.alpha.stop_gradient = False\n else:\n if isinstance(alpha, Variable):\n self.alpha = alpha\n else:", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:33-55" + }, + "8083": { + "file_id": 596, + "content": "FocalLoss is a criterion that takes in alpha, gamma, and size_average as arguments. It averages losses across observations for each minibatch by default but can sum the losses if size_average is set to False. 
Alpha is either a tensor or variable, and gamma should be greater than 0, reducing relative loss for well-classified examples.", + "type": "comment" + }, + "8084": { + "file_id": 596, + "content": " self.alpha = (alpha)\n self.alpha.stop_gradient = False\n self.gamma = gamma\n self.class_num = class_num\n self.size_average = size_average\n def forward(self, inputs, targets):\n N = inputs.shape[0]\n C = inputs.shape[1]\n P = F.softmax(inputs, axis=1)\n tmp = numpy.zeros((N, C))\n class_mask = paddle.to_tensor(tmp, place=inputs.place)\n class_mask.stop_gradient = False\n ids = paddle.reshape(targets, [-1, 1])\n class_mask = F.one_hot(ids.squeeze(-1), class_mask.shape[1])\n if \"Place\" not in str(inputs.place) and \"Place\" not in str(self.alpha.place):\n self.alpha = self.alpha.cuda()\n alpha = self.alpha[paddle.reshape(ids.detach(), [-1])]\n probs = paddle.reshape((P * class_mask).sum(1), [-1, 1])\n log_p = probs.log()\n batch_loss = -alpha * (paddle.pow((1 - probs), self.gamma)) * log_p\n if self.size_average:\n loss = batch_loss.mean()\n else:\n loss = batch_loss.sum()", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:56-87" + }, + "8085": { + "file_id": 596, + "content": "This code defines a class for the Yowo loss function. The constructor sets various attributes like alpha, gamma, class_num, size_average, and stop_gradient. The forward method calculates the loss using softmax, one-hot encoding, and other operations. If inputs or self.alpha are not in GPU, it transfers them to the GPU. It then computes the batch_loss and finally returns either average or sum depending on size_average attribute.", + "type": "comment" + }, + "8086": { + "file_id": 596, + "content": " return loss\n@LOSSES.register()\nclass RegionLoss(BaseWeightedLoss):\n # for our model anchors has 10 values and number of anchors is 5\n # parameters: 24, 10 float values, 24, 5\n def __init__(self, num_classes, anchors, num_anchors, object_scale, noobject_scale, class_scale, coord_scale):\n super().__init__()\n self.num_classes = num_classes\n self.anchors = [float(x) for x in anchors]\n self.num_anchors = num_anchors\n self.anchor_step = len(self.anchors) // self.num_anchors # each anchor has 2 parameters\n self.object_scale = object_scale\n self.noobject_scale = noobject_scale\n self.class_scale = class_scale\n self.coord_scale = coord_scale\n self.focalloss = FocalLoss(class_num=self.num_classes, gamma=2, size_average=False)\n self.thresh = 0.6\n def convert2cpu(self, gpu_matrix):\n # return paddle.to_tensor((gpu_matrix.shape), dtype=\"float32\").copy_(gpu_matrix)\n return gpu_matrix.cpu()\n def forward(self, output, target):", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:88-112" + }, + "8087": { + "file_id": 596, + "content": "This code defines a RegionLoss class that inherits from BaseWeightedLoss. It takes parameters such as num_classes, anchors, num_anchors, object_scale, noobject_scale, class_scale, and coord_scale. The class initializes an instance of FocalLoss and sets a threshold. 
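To make the focal-loss formula above concrete, here is a small NumPy sketch of -alpha * (1 - p_t)^gamma * log(p_t) for integer class targets (illustrative names, summed as in the size_average=False case):

```python
import numpy as np

def focal_loss(logits, targets, alpha=None, gamma=2.0):
    """Focal loss sketch: down-weights easy, well-classified examples."""
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)              # softmax
    p_t = probs[np.arange(len(targets)), targets]           # prob of the true class
    alpha_t = 1.0 if alpha is None else alpha[targets]
    return np.sum(-alpha_t * (1 - p_t) ** gamma * np.log(p_t))
```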
The forward method computes the loss between output and target tensors.", + "type": "comment" + }, + "8088": { + "file_id": 596, + "content": " # output : B*A*(4+1+num_classes)*H*W 8*5*29*24*24\n # B: number of batches\n # A: number of anchors\n # 4: 4 parameters for each bounding box\n # 1: confidence score\n # num_classes\n # H: height of the image (in grids)\n # W: width of the image (in grids)\n # for each grid cell, there are A*(4+1+num_classes) parameters\n nB = output.detach().shape[0] # batch\n nA = self.num_anchors # anchor_num\n nC = self.num_classes\n nH = output.detach().shape[2]\n nW = output.detach().shape[3]\n # resize the output (all parameters for each anchor can be reached)\n output = paddle.reshape(output, [nB, nA, (5 + nC), nH, nW])\n # anchor's parameter tx\n x = F.sigmoid(\n paddle.reshape(paddle.index_select(output, paddle.to_tensor([0], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW]))\n x.stop_gradient = False\n # anchor's parameter ty\n y = F.sigmoid(", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:113-137" + }, + "8089": { + "file_id": 596, + "content": "This code reshapes the output tensor for each anchor's parameters and applies sigmoid activation to the transformed tensor. The output tensor is of shape B*A*(4+1+num_classes)*H*W, which represents the coordinates (tx, ty), width, height, confidence score, and class probabilities for each anchor box in the image grid. By applying sigmoid activation functions to tx and ty, the code scales the anchor's parameter values between 0 and 1, preparing them for the subsequent operations in the YOLOv4 model.", + "type": "comment" + }, + "8090": { + "file_id": 596, + "content": " paddle.reshape(paddle.index_select(output, paddle.to_tensor([1], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW]))\n y.stop_gradient = False\n # anchor's parameter tw\n w = paddle.reshape(paddle.index_select(output, paddle.to_tensor([2], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW])\n w.stop_gradient = False\n # anchor's parameter th\n h = paddle.reshape(paddle.index_select(output, paddle.to_tensor([3], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW])\n h.stop_gradient = False\n # confidence score for each anchor\n conf = F.sigmoid(\n paddle.reshape(paddle.index_select(output, paddle.to_tensor([4], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW]))\n conf.stop_gradient = False\n # anchor's parameter class label\n cls = paddle.index_select(output, paddle.linspace(5, 5 + nC - 1, nC, 'int64').cuda(), axis=2)", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:138-155" + }, + "8091": { + "file_id": 596, + "content": "The code reshapes and assigns stop_gradient to output slices of the tensor \"output\" corresponding to anchor parameters (paddle, w, h) and a confidence score (conf), as well as class labels (cls). 
All are assigned stop_gradient=False.", + "type": "comment" + }, + "8092": { + "file_id": 596, + "content": " cls.stop_gradient = False\n # resize the data structure so that for every anchor there is a class label in the last dimension\n cls = paddle.reshape(paddle.transpose(paddle.reshape(cls, [nB * nA, nC, nH * nW]), [0, 2, 1]),\n [nB * nA * nH * nW, nC])\n # for the prediction of localization of each bounding box, there exist 4 parameters (tx, ty, tw, th)\n # pred_boxes = torch.cuda.FloatTensor(4, nB*nA*nH*nW)\n pred_boxes = paddle.zeros([4, nB * nA * nH * nW], dtype='float32').cuda()\n # tx and ty\n grid_x = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nW - 1, nW), [nH, 1]), [nB * nA, 1, 1]),\n [nB * nA * nH * nW]).cuda()\n grid_y = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nH - 1, nH), [nW, 1]).t(), [nB * nA, 1, 1]),\n [nB * nA * nH * nW]).cuda()\n # for each anchor there are anchor_step variables (with the structure num_anchor*anchor_step)", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:156-169" + }, + "8093": { + "file_id": 596, + "content": "This code resizes the data structure to have a class label for each anchor, initializes prediction boxes, and creates grid coordinates for localization. It uses PaddlePaddle's linear algebra functions like paddle.reshape, paddle.transpose, and paddle.linspace. The code aims to prepare the input data for object detection model training.", + "type": "comment" + }, + "8094": { + "file_id": 596, + "content": " # for each row(anchor), the first variable is anchor's width, second is anchor's height\n # pw and ph\n anchor_w = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]),\n paddle.to_tensor([0], dtype='int64'), axis=1).cuda()\n anchor_h = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]),\n paddle.to_tensor([1], dtype='int64'), axis=1).cuda()\n # for each pixel (grid) repeat the above process (obtain width and height of each grid)\n anchor_w = paddle.reshape(paddle.tile(paddle.tile(anchor_w, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW])\n anchor_h = paddle.reshape(paddle.tile(paddle.tile(anchor_h, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW])\n # prediction of bounding box localization\n # x.data and y.data: top left corner of the anchor\n # grid_x, grid_y: tx and ty predictions made by yowo", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:170-181" + }, + "8095": { + "file_id": 596, + "content": "This code is preparing anchor width and height values for the YOWO loss function. 
It reshapes the anchors, index selects the width and height values, tiles them to match grid dimensions, and assigns the prediction of bounding box localization for each grid cell.", + "type": "comment" + }, + "8096": { + "file_id": 596, + "content": " x_data = paddle.reshape(x.detach(), [-1])\n y_data = paddle.reshape(y.detach(), [-1])\n w_data = paddle.reshape(w.detach(), [-1])\n h_data = paddle.reshape(h.detach(), [-1])\n pred_boxes[0] = paddle.cast(x_data, dtype='float32') + paddle.cast(grid_x, dtype='float32') # bx\n pred_boxes[1] = paddle.cast(y_data, dtype='float32') + paddle.cast(grid_y, dtype='float32') # by\n pred_boxes[2] = paddle.exp(paddle.cast(w_data, dtype='float32')) * paddle.cast(anchor_w, dtype='float32') # bw\n pred_boxes[3] = paddle.exp(paddle.cast(h_data, dtype='float32')) * paddle.cast(anchor_h, dtype='float32') # bh\n # the size -1 is inferred from other dimensions\n # pred_boxes (nB*nA*nH*nW, 4)\n pred_boxes = self.convert2cpu(\n paddle.cast(paddle.reshape(paddle.transpose(pred_boxes, (1, 0)), [-1, 4]), dtype='float32'))\n nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes,\n ", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:183-199" + }, + "8097": { + "file_id": 596, + "content": "This code reshapes and casts input tensors, calculates predicted bounding box coordinates based on input features, and calls a function to build targets for the model. It then reshapes and transposes the predicted boxes tensor before passing it to the build_targets function. The function is part of the YOLOv3-style loss calculation in PaddleVideo.", + "type": "comment" + }, + "8098": { + "file_id": 596, + "content": " target.detach(),\n self.anchors, nA,\n nC, \\\n nH, nW,\n self.noobject_scale,\n self.object_scale,\n self.thresh)\n cls_mask = (cls_mask == 1)\n # keep those with high box confidence scores (greater than 0.25) as our final predictions\n nProposals = int((conf > 0.25).sum().detach().item())\n tx = (tx).cuda()", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:199-210" + }, + "8099": { + "file_id": 596, + "content": "This code is setting up a loss function for object detection. It takes in target values, anchors, number of anchors (nA), number of classes (nC), and the image dimensions (nH, nW). The noobject_scale and object_scale variables control how the loss is applied depending on whether an object is present or not. The cls_mask variable filters out proposals with low box confidence scores. The final predictions are kept if their confidence score is greater than 0.25. 
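The box decoding walked through in these summaries follows the standard YOLO-style parameterization. A compact NumPy sketch of that decoding step (function and argument names are illustrative, not yowo_loss.py identifiers):

```python
import numpy as np

def decode_boxes(tx, ty, tw, th, grid_x, grid_y, anchor_w, anchor_h):
    """Decode raw offsets into grid-relative (bx, by, bw, bh) boxes."""
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    bx = sigmoid(tx) + grid_x        # centre x, in grid-cell units
    by = sigmoid(ty) + grid_y        # centre y
    bw = np.exp(tw) * anchor_w       # width, scaled by the anchor prior
    bh = np.exp(th) * anchor_h       # height
    return np.stack([bx, by, bw, bh], axis=-1)
```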
The tensor tx is moved to the GPU (cuda).", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/81.json b/docs/data/81.json new file mode 100644 index 000000000..9ccda8778 --- /dev/null +++ b/docs/data/81.json @@ -0,0 +1,542 @@ +{ + "8100": { + "file_id": 596, + "content": " tx.stop_gradient = False\n ty = ty.cuda()\n ty.stop_gradient = False\n tw = tw.cuda()\n tw.stop_gradient = False\n th = th.cuda()\n th.stop_gradient = False\n tconf = tconf.cuda()\n tconf.stop_gradient = False\n tcls = paddle.reshape(tcls, [-1]).astype('int64')[paddle.reshape(cls_mask, [-1])].cuda()\n tcls.stop_gradient = False\n coord_mask = coord_mask.cuda()\n coord_mask.stop_gradient = False\n conf_mask = conf_mask.cuda().sqrt()\n coord_mask.stop_gradient = False\n cls_mask = paddle.tile(paddle.reshape(cls_mask, [-1, 1]), [1, nC]).cuda()\n cls_mask.stop_gradient = False\n cls = paddle.reshape(cls[cls_mask], [-1, nC])\n # losses between predictions and targets (ground truth)\n # In total 6 aspects are considered as losses:\n # 4 for bounding box location, 2 for prediction confidence and classification seperately\n L1_loss = nn.SmoothL1Loss(reduction='sum')\n loss_x = self.coord_scale * L1_loss(paddle.cast(x, dtype=\"float32\") * coord_mask, tx * coord_mask) / 2.0", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:211-237" + }, + "8101": { + "file_id": 596, + "content": "This code is moving variables to the GPU and setting their gradient flags to False. Then, it calculates losses for bounding box location, prediction confidence, and classification separately using SmoothL1Loss.", + "type": "comment" + }, + "8102": { + "file_id": 596, + "content": " loss_y = self.coord_scale * L1_loss(paddle.cast(y, dtype=\"float32\") * coord_mask, ty * coord_mask) / 2.0\n loss_w = self.coord_scale * L1_loss(paddle.cast(w * coord_mask, dtype=\"float32\"), tw * coord_mask) / 2.0\n loss_h = self.coord_scale * L1_loss(paddle.cast(h * coord_mask, dtype=\"float32\"), th * coord_mask) / 2.0\n loss_conf = nn.MSELoss(reduction='sum')(paddle.cast(conf, dtype=\"float32\") * conf_mask, tconf * conf_mask) / 2.0\n # try focal loss with gamma = 2\n loss_cls = self.class_scale * self.focalloss(cls, tcls)\n # sum of loss\n loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls\n return loss, nCorrect", + "type": "code", + "location": "/paddlevideo/modeling/losses/yowo_loss.py:238-249" + }, + "8103": { + "file_id": 596, + "content": "This code calculates the loss for an object detection model, consisting of L1_loss for coordinates (x, y, w, h) and MSELoss for confidence. It applies focal loss for classification with a gamma value of 2, sums all losses together, and returns the total loss and count of correct predictions.", + "type": "comment" + }, + "8104": { + "file_id": 597, + "content": "/paddlevideo/modeling/registry.py", + "type": "filepath" + }, + "8105": { + "file_id": 597, + "content": "This code registers various model types (backbones, heads, recognizers) using a Registry class for efficient organization and management in a larger model architecture or framework implementation. Registries are created for 'bbox_coder', 'estimator', 'multimodal', and 'segment'.", + "type": "summary" + }, + "8106": { + "file_id": 597, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nBACKBONES = Registry('backbone')\nHEADS = Registry('head')\nRECOGNIZERS = Registry('recognizer')\nSEGMENTERS = Registry('Segmenters')\nLOCALIZERS = Registry('localizer')\nPARTITIONERS = Registry('partitioner')\nLOSSES = Registry('loss')\nROI_EXTRACTORS = Registry('roi_extractor')\nDETECTORS = Registry('detectors')\nBBOX_ASSIGNERS = Registry('bbox_assigner')\nBBOX_SAMPLERS = Registry('bbox_sampler')", + "type": "code", + "location": "/paddlevideo/modeling/registry.py:1-27" + }, + "8107": { + "file_id": 597, + "content": "This code is registering different types of models (backbones, heads, recognizers, etc.) using a Registry class from the utils module. The Registry will help in organizing and managing these different model types efficiently. This code snippet seems to be part of a larger model architecture or framework implementation.", + "type": "comment" + }, + "8108": { + "file_id": 597, + "content": "BBOX_CODERS = Registry('bbox_coder')\nESTIMATORS = Registry('estimator')\nMULTIMODAL = Registry('multimodal')\nSEGMENT = Registry('segment')", + "type": "code", + "location": "/paddlevideo/modeling/registry.py:28-31" + }, + "8109": { + "file_id": 597, + "content": "Registry is created for 'bbox_coder', 'estimator', 'multimodal', and 'segment'. These Registries organize and manage the different types of models or coding methods, allowing for easy access and maintenance.", + "type": "comment" + }, + "8110": { + "file_id": 598, + "content": "/paddlevideo/modeling/samplers/__init__.py", + "type": "filepath" + }, + "8111": { + "file_id": 598, + "content": "This code is licensing information and imports the RandomSampler class from a submodule, then defines the __all__ variable to include only the RandomSampler class.", + "type": "summary" + }, + "8112": { + "file_id": 598, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
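As an illustration of the pattern these registries implement, here is a toy name-to-class registry with a decorator-style register(). This is a sketch of the general idea only, not the PaddleVideo Registry implementation:

```python
class Registry:
    """Minimal name -> class mapping with a decorator-style register()."""
    def __init__(self, name):
        self.name = name
        self._obj_map = {}

    def register(self):
        def deco(cls):
            self._obj_map[cls.__name__] = cls
            return cls
        return deco

    def get(self, name):
        return self._obj_map[name]

LOSSES = Registry('loss')

@LOSSES.register()
class MyLoss:
    pass

loss_cls = LOSSES.get('MyLoss')   # look the class up by name later, e.g. from a config
```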
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .random_sampler import RandomSampler\n__all__ = ['RandomSampler']", + "type": "code", + "location": "/paddlevideo/modeling/samplers/__init__.py:1-17" + }, + "8113": { + "file_id": 598, + "content": "This code is licensing information and imports the RandomSampler class from a submodule, then defines the __all__ variable to include only the RandomSampler class.", + "type": "comment" + }, + "8114": { + "file_id": 599, + "content": "/paddlevideo/modeling/samplers/random_sampler.py", + "type": "filepath" + }, + "8115": { + "file_id": 599, + "content": "The code initializes a SamplingResult class for bbox sampling and defines a RandomSampler class to sample positive and negative bboxes from assigned results, ensuring enough samples are available in each case.", + "type": "summary" + }, + "8116": { + "file_id": 599, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport numpy as np\nfrom ..registry import BBOX_SAMPLERS\nclass SamplingResult():\n \"\"\"Bbox sampling result. \"\"\"\n def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,\n gt_flags):\n self.pos_inds = pos_inds\n self.neg_inds = neg_inds\n self.pos_bboxes = paddle.index_select(bboxes,pos_inds)\n # neg_inds may be empty\n if neg_inds.shape[0]!=0:", + "type": "code", + "location": "/paddlevideo/modeling/samplers/random_sampler.py:1-28" + }, + "8117": { + "file_id": 599, + "content": "This code is importing necessary libraries and defining a class called \"SamplingResult\" that holds the result of bbox sampling. The class has attributes for positive indices, negative indices, bboxes, gt_bboxes, assign_result, and gt_flags. It uses paddle library to index select the bboxes based on pos_inds. 
Negative_inds may be empty.", + "type": "comment" + }, + "8118": { + "file_id": 599, + "content": " self.neg_bboxes = paddle.index_select(bboxes,neg_inds)\n else:\n self.neg_bboxes=None\n self.pos_is_gt = paddle.index_select(gt_flags,pos_inds)\n self.num_gts = gt_bboxes.shape[0]\n self.pos_assigned_gt_inds = paddle.index_select(assign_result.gt_inds,pos_inds) - 1\n if float(gt_bboxes.numel()) == 0:\n assert self.pos_assigned_gt_inds.numel() == 0\n self.pos_gt_bboxes = paddle.empty_like(gt_bboxes).view(-1, 4)\n else:\n if len(gt_bboxes.shape) < 2:\n gt_bboxes = gt_bboxes.view(-1, 4)\n self.pos_gt_bboxes = paddle.index_select(gt_bboxes, self.pos_assigned_gt_inds)\n if assign_result.labels is not None:\n self.pos_gt_labels = paddle.index_select(assign_result.labels, pos_inds)\n else:\n self.pos_gt_labels = None\n @property\n def bboxes(self):\n if self.neg_bboxes is not None:\n ret = paddle.concat([self.pos_bboxes, self.neg_bboxes])\n else:", + "type": "code", + "location": "/paddlevideo/modeling/samplers/random_sampler.py:29-55" + }, + "8119": { + "file_id": 599, + "content": "This code initializes the negative bounding boxes, positive ground truth (gt) bounding boxes and labels for a sampler. It checks if there are any gt bboxes available, if not, it sets up a placeholder for them. The 'pos_bboxes' are then concatenated with the neg_bboxes if they exist. If assign_result.labels is not None, it also extracts and stores positive gt labels.", + "type": "comment" + }, + "8120": { + "file_id": 599, + "content": " # neg bbox may be empty\n ret = self.pos_bboxes\n return ret\n@BBOX_SAMPLERS.register()\nclass RandomSampler():\n def __init__(self,\n num,\n pos_fraction,\n neg_pos_ub=-1,\n add_gt_as_proposals=True,\n **kwargs):\n self.num = num\n self.pos_fraction = pos_fraction\n self.neg_pos_ub = neg_pos_ub\n self.add_gt_as_proposals = add_gt_as_proposals\n def sample(self,\n assign_result,\n bboxes,\n gt_bboxes,\n gt_labels=None,\n **kwargs):\n \"\"\"Sample positive and negative bboxes. \"\"\"\n if len(bboxes.shape) < 2:\n bboxes = bboxes[None, :]\n bboxes = bboxes[:, :4]\n gt_flags = paddle.full([bboxes.shape[0], ], 0, dtype='int32')\n if self.add_gt_as_proposals and len(gt_bboxes) > 0:\n if gt_labels is None:\n raise ValueError(\n 'gt_labels must be given when add_gt_as_proposals is True')", + "type": "code", + "location": "/paddlevideo/modeling/samplers/random_sampler.py:56-92" + }, + "8121": { + "file_id": 599, + "content": "This code defines a RandomSampler class which samples positive and negative bboxes from assigned results. It takes arguments like num, pos_fraction, neg_pos_ub, add_gt_as_proposals, etc. If add_gt_as_proposals is True and gt_bboxes are present, it raises a ValueError if gt_labels are not given. The sample method takes assign_result, bboxes, gt_bboxes, and gt_labels as arguments. It checks the shape of bboxes, converts them to 4-column format, and creates gt_flags.", + "type": "comment" + }, + "8122": { + "file_id": 599, + "content": " bboxes = paddle.concat([gt_bboxes, bboxes])\n assign_result.add_gt_(gt_labels)\n gt_ones = paddle.full([gt_bboxes.shape[0], ], 1, dtype='int32')\n gt_flags = paddle.concat([gt_ones, gt_flags])\n #1. 得到正样本的数量, inds\n num_expected_pos = int(self.num * self.pos_fraction)\n pos_inds = self._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs)\n pos_inds = paddle.to_tensor(np.unique(pos_inds.numpy()))\n #2. 
得到负样本的数量, inds\n num_sampled_pos = pos_inds.numel()\n num_expected_neg = self.num - num_sampled_pos\n neg_inds = self._sample_neg(\n assign_result, num_expected_neg, bboxes=bboxes, **kwargs)\n neg_inds = paddle.to_tensor(np.unique(neg_inds.numpy()))\n #3. 得到sampling result\n sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,\n assign_result, gt_flags)\n return sampling_result\n def random_choice(self, gallery, num):", + "type": "code", + "location": "/paddlevideo/modeling/samplers/random_sampler.py:93-114" + }, + "8123": { + "file_id": 599, + "content": "This code samples positive and negative indices for assigning ground truth labels to objects, ensuring a desired ratio of positive and negative samples. It then creates a SamplingResult object containing these indices along with bounding boxes and other information. The random_choice function is used to randomly select a specific number of samples from a given set of objects.", + "type": "comment" + }, + "8124": { + "file_id": 599, + "content": " \"\"\"Random select some elements from the gallery. \"\"\"\n assert len(gallery) >= num\n perm = paddle.arange(gallery.numel())[:num]\n perm = paddle.randperm(gallery.numel())[:num] \n rand_inds = paddle.index_select(gallery, perm)\n return rand_inds\n def _sample_pos(self, assign_result, num_expected, **kwargs):\n \"\"\"Randomly sample some positive samples.\"\"\"\n #1.首先看一下给的bboxes里面有哪些label是大于0的 得到了他们的index\n pos_inds = paddle.nonzero(assign_result.gt_inds, as_tuple=False)\n #2. 只要这个pos_inds的数目不是0个 这些就都可以是positive sample\n # 当pos_inds的数目小于num_expected(想要的sample的最大数目), 就直接用这个pos_inds\n # 反之就从这么多index里随机采样num_expected个出来\n if float(pos_inds.numel()) != 0:\n pos_inds = pos_inds.squeeze() \n if float(pos_inds.numel()) <= num_expected:\n return pos_inds\n else:\n return self.random_choice(pos_inds, num_expected)\n def _sample_neg(self, assign_result, num_expected, **kwargs):\n \"\"\"Randomly sample some negative samples.\"\"\"", + "type": "code", + "location": "/paddlevideo/modeling/samplers/random_sampler.py:115-139" + }, + "8125": { + "file_id": 599, + "content": "The code defines a random sampler that randomly selects elements from the gallery. It has two functions: _sample_pos, which randomly samples positive samples, and _sample_neg, which randomly samples negative samples. The _sample_pos function first finds indexes of assign_result with label greater than 0 (i.e., positive samples), then checks if the number of positive samples is less than or equal to num_expected. If it's less, returns the indices; otherwise, selects num_expected random samples from the available indices using the random_choice method. The _sample_neg function does a similar process for negative samples but doesn't return the indices if their number is 0.", + "type": "comment" + }, + "8126": { + "file_id": 599, + "content": " neg_inds = paddle.nonzero(assign_result.gt_inds == 0, as_tuple=False)\n if float(neg_inds.numel()) != 0:\n neg_inds = neg_inds.squeeze() \n if (float(neg_inds.numel())) <= float(num_expected):\n return neg_inds\n else:\n return self.random_choice(neg_inds, num_expected)", + "type": "code", + "location": "/paddlevideo/modeling/samplers/random_sampler.py:140-146" + }, + "8127": { + "file_id": 599, + "content": "This code checks the assign_result's gt_inds for zero values, extracts their indices in neg_inds, and if there are non-zero values, squeezes them. If the number of non-zero values is less than or equal to expected, it returns neg_inds. 
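The positive/negative sampling logic summarized here can be sketched with plain NumPy. Names and the exact tie-handling are assumptions for illustration; the quoted sampler works on assign_result.gt_inds in the same spirit:

```python
import numpy as np

def sample_pos_neg(gt_inds, num=256, pos_fraction=0.5, rng=np.random):
    """gt_inds[i] > 0: proposal i matched a GT box; gt_inds[i] == 0: background."""
    pos_inds = np.flatnonzero(gt_inds > 0)
    neg_inds = np.flatnonzero(gt_inds == 0)

    num_pos = min(len(pos_inds), int(num * pos_fraction))
    pos_inds = rng.permutation(pos_inds)[:num_pos]

    num_neg = min(len(neg_inds), num - num_pos)   # fill the remainder with negatives
    neg_inds = rng.permutation(neg_inds)[:num_neg]
    return pos_inds, neg_inds
```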
Otherwise, it uses random_choice() to select required indices from neg_inds.", + "type": "comment" + }, + "8128": { + "file_id": 600, + "content": "/paddlevideo/modeling/weight_init.py", + "type": "filepath" + }, + "8129": { + "file_id": 600, + "content": "This code initializes layer weights in PaddlePaddle, applying truncated normal or other initializations like Gaussian and Kaiming uniform. It adjusts for different modes and supports numpy arrays and Paddle tensors.", + "type": "summary" + }, + "8130": { + "file_id": 600, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn.initializer as init\nimport numpy as np\nfrom scipy import special\ndef weight_init_(layer,\n func,\n weight_name=None,\n bias_name=None,\n bias_value=0.0,\n **kwargs):\n \"\"\"\n In-place params init function.\n Usage:\n .. code-block:: python\n import paddle\n import numpy as np\n data = np.ones([3, 4], dtype='float32')", + "type": "code", + "location": "/paddlevideo/modeling/weight_init.py:1-36" + }, + "8131": { + "file_id": 600, + "content": "This code defines a function that initializes the weights of a PaddlePaddle layer using specified functions. It can also set bias values and is compatible with numpy arrays and Paddle tensors.", + "type": "comment" + }, + "8132": { + "file_id": 600, + "content": " linear = paddle.nn.Linear(4, 4)\n input = paddle.to_tensor(data)\n print(linear.weight)\n linear(input)\n weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1)\n print(linear.weight)\n \"\"\"\n if hasattr(layer, 'weight') and layer.weight is not None:\n getattr(init, func)(**kwargs)(layer.weight)\n if weight_name is not None:\n # override weight name\n layer.weight.name = weight_name\n if hasattr(layer, 'bias') and layer.bias is not None:\n init.Constant(bias_value)(layer.bias)\n if bias_name is not None:\n # override bias name\n layer.bias.name = bias_name\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n def norm_cdf(x):\n # Computes standard normal cumulative distribution function\n return (1. + math.erf(x / math.sqrt(2.))) / 2.\n if (mean < a - 2 * std) or (mean > b + 2 * std):\n print(\"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. \"\n \"The distribution of values may be incorrect.\")", + "type": "code", + "location": "/paddlevideo/modeling/weight_init.py:37-66" + }, + "8133": { + "file_id": 600, + "content": "Code initializes a Linear layer, applies truncated normal initialization to its weights with specified mean and std deviation, and optionally changes the weight name. If the layer has bias, it initializes the bias with a constant value and optionally changes the bias name. 
The _no_grad_trunc_normal_ function is used internally by nn.init.trunc_normal_.", + "type": "comment" + }, + "8134": { + "file_id": 600, + "content": " with paddle.no_grad():\n # Values are generated by using a truncated uniform distribution and\n # then using the inverse CDF for the normal distribution.\n # Get upper and lower cdf values\n l = norm_cdf((a - mean) / std)\n u = norm_cdf((b - mean) / std)\n # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1].\n tmp = np.random.uniform(2 * l - 1, 2 * u - 1,\n size=list(tensor.shape)).astype(np.float32)\n # Use inverse cdf transform for normal distribution to get truncated\n # standard normal\n tmp = special.erfinv(tmp)\n # Transform to proper mean, std\n tmp *= (std * math.sqrt(2.0))\n tmp += mean\n # Clamp to ensure it's in the proper range\n tmp = np.clip(tmp, a, b)\n tensor.set_value(paddle.to_tensor(tmp))\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(\n \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"", + "type": "code", + "location": "/paddlevideo/modeling/weight_init.py:68-98" + }, + "8135": { + "file_id": 600, + "content": "This code generates weights for a tensor following a truncated Gaussian distribution. It computes the lower and upper bounds, uniformly fills the tensor with values between these bounds, transforms them to a standard Gaussian distribution, adjusts the mean and standard deviation, clamps the values within the original bounds, and sets the tensor's value.", + "type": "comment" + }, + "8136": { + "file_id": 600, + "content": " )\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:\n receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'):\n def _calculate_correct_fan(tensor, mode):\n mode = mode.lower()\n valid_modes = ['fan_in', 'fan_out']\n if mode not in valid_modes:\n raise ValueError(\n \"Mode {} not supported, please use one of {}\".format(\n mode, valid_modes))\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n return fan_in if mode == 'fan_in' else fan_out\n def calculate_gain(nonlinearity, param=None):\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',", + "type": "code", + "location": "/paddlevideo/modeling/weight_init.py:99-130" + }, + "8137": { + "file_id": 600, + "content": "This code initializes weights in a convolutional layer using either truncated normal or Kaiming uniform initialization. It calculates the fan-in and fan-out based on input and output feature maps, receptive field size, and optionally adjusts for different modes. 
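For readers of this cache entry, the inverse-CDF sampling trick summarized above can be reproduced outside PaddlePaddle. The following is an illustrative sketch only (it is not part of weight_init.py; it assumes nothing beyond NumPy, SciPy's special.erfinv and math.erf):

    import math
    import numpy as np
    from scipy import special

    def trunc_normal_numpy(shape, mean=0.0, std=1.0, a=-2.0, b=2.0):
        # Standard normal CDF, mirroring the norm_cdf helper quoted above.
        norm_cdf = lambda x: (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
        l, u = norm_cdf((a - mean) / std), norm_cdf((b - mean) / std)
        # Sample uniformly in [2l-1, 2u-1], then invert the standard normal CDF.
        tmp = np.random.uniform(2 * l - 1, 2 * u - 1, size=shape).astype(np.float32)
        tmp = special.erfinv(tmp)
        # Rescale to the requested mean/std and clamp to [a, b].
        tmp = tmp * std * math.sqrt(2.0) + mean
        return np.clip(tmp, a, b)

    print(trunc_normal_numpy((3, 4)))  # values fall inside [-2, 2]
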
The `trunc_normal_` function generates random values within specific bounds using truncated normal distribution, while `kaiming_normal_` sets weights using Kaiming uniform initialization with an optional nonlinearity parameter.", + "type": "comment" + }, + "8138": { + "file_id": 600, + "content": " 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n else:\n raise ValueError(\"Unsupported nonlinearity {}\".format(nonlinearity))\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():\n paddle.nn.initializer.Normal(0, std)(tensor)", + "type": "code", + "location": "/paddlevideo/modeling/weight_init.py:131-156" + }, + "8139": { + "file_id": 600, + "content": "This function initializes the weights of a neural network layer with respect to the nonlinearity used. It returns different values depending on the nonlinearity type, calculates the fan for each layer and then applies normal initialization using Paddle's Normal initializer.", + "type": "comment" + }, + "8140": { + "file_id": 600, + "content": " return tensor", + "type": "code", + "location": "/paddlevideo/modeling/weight_init.py:157-157" + }, + "8141": { + "file_id": 600, + "content": "Initializes a tensor with specified values and returns it.", + "type": "comment" + }, + "8142": { + "file_id": 601, + "content": "/paddlevideo/solver/__init__.py", + "type": "filepath" + }, + "8143": { + "file_id": 601, + "content": "This code snippet appears to import the \"build_optimizer\" and \"build_lr\" functions from their respective modules within the \"paddlevideo.solver\" package. The comments at the top of the file indicate that this code is protected by copyright and licensed under the Apache License, Version 2.0.", + "type": "summary" + }, + "8144": { + "file_id": 601, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .optimizer import build_optimizer\nfrom .lr import build_lr", + "type": "code", + "location": "/paddlevideo/solver/__init__.py:1-16" + }, + "8145": { + "file_id": 601, + "content": "This code snippet appears to import the \"build_optimizer\" and \"build_lr\" functions from their respective modules within the \"paddlevideo.solver\" package. 
The comments at the top of the file indicate that this code is protected by copyright and licensed under the Apache License, Version 2.0.", + "type": "comment" + }, + "8146": { + "file_id": 602, + "content": "/paddlevideo/solver/custom_lr.py", + "type": "filepath" + }, + "8147": { + "file_id": 602, + "content": "The code introduces CustomWarmupCosineDecay and CustomWarmupPiecewiseDecay schedulers for PaddleVideo, combining warm-up, cosine decay, and piecewise decay. The CustomWarmupAdjustDecay scheduler combines warmup with stepwise decay at boundary epochs, adjusting the rate based on the epoch number.", + "type": "summary" + }, + "8148": { + "file_id": 602, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nfrom paddle.optimizer.lr import *\nimport numpy as np\n\"\"\"\nPaddleVideo Learning Rate Schedule:\nYou can use paddle.optimizer.lr\nor define your custom_lr in this file.\n\"\"\"\nclass CustomWarmupCosineDecay(LRScheduler):\n r\"\"\"\n We combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup.", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:1-31" + }, + "8149": { + "file_id": 602, + "content": "This code defines a custom learning rate scheduler, CustomWarmupCosineDecay, which combines warmup and stepwise-cosine decay for use in PaddleVideo. It is part of the PaddlePaddle framework and can be used to adjust learning rates during training.", + "type": "comment" + }, + "8150": { + "file_id": 602, + "content": " cosine_base_lr (float|int, optional): base learning rate in cosine schedule.\n max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .\n Returns:\n ``CosineAnnealingDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n cosine_base_lr,\n max_epoch,\n num_iters,\n last_epoch=-1,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.cosine_base_lr = cosine_base_lr\n self.max_epoch = max_epoch\n self.num_iters = num_iters\n #call step() in base class, last_lr/last_epoch/base_lr will be update", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:32-54" + }, + "8151": { + "file_id": 602, + "content": "This code defines the constructor of the `CustomWarmupCosineDecay` class, which schedules the learning rate for training. It takes parameters like warmup start lr, warmup epochs, cosine base lr, max epoch, num_iters, last_epoch (optional), and verbose (optional). The class initializes these parameters and provides a `step()` method to update the learning rate based on a warmup-then-cosine-annealing schedule, as sketched below. 
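To make the schedule just described concrete, here is a standalone sketch that restates the warmup-plus-cosine rule used by CustomWarmupCosineDecay (illustration only; the numeric arguments below are made up and the function is not part of custom_lr.py):

    import math

    def warmup_cosine_lr(cur_epoch, warmup_start_lr, warmup_epochs, cosine_base_lr, max_epoch):
        # Cosine rule used for the main schedule and for the warmup target value.
        cosine = lambda e: cosine_base_lr * (math.cos(math.pi * e / max_epoch) + 1.0) * 0.5
        if cur_epoch < warmup_epochs:
            # Linear warmup from warmup_start_lr up to the cosine value at warmup_epochs.
            alpha = (cosine(warmup_epochs) - warmup_start_lr) / warmup_epochs
            return cur_epoch * alpha + warmup_start_lr
        return cosine(cur_epoch)

    for e in (0, 5, 34, 100, 196):  # illustrative epochs for a 196-epoch run
        print(e, warmup_cosine_lr(e, 0.01, 34, 0.1, 196))
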
If verbose is set to True, it will print messages for each update.", + "type": "comment" + }, + "8152": { + "file_id": 602, + "content": " super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch,\n verbose=verbose)\n def step(self, epoch=None):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if self.last_epoch == -1:\n self.last_epoch += 1\n else:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch):", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:55-80" + }, + "8153": { + "file_id": 602, + "content": "This code defines a custom learning rate scheduler for PaddleVideo, implementing the CustomWarmupCosineDecay class. The step() method updates the learning rate based on current epoch and calls get_lr() to set the new learning rate. The _lr_func_cosine() function calculates the learning rate using a cosine annealing schedule.", + "type": "comment" + }, + "8154": { + "file_id": 602, + "content": " return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) +\n 1.0) * 0.5\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr,\n self.max_epoch)\n lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr,\n self.max_epoch)\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n return lr\nclass CustomWarmupPiecewiseDecay(LRScheduler):\n r\"\"\"\n This op combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup.\n step_base_lr (float|int, optional): base learning rate in step schedule.", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:81-106" + }, + "8155": { + "file_id": 602, + "content": "The code defines a custom learning rate (LR) scheduler that includes warmup and stepwise-cosine decay. It first performs a warmup stage with a linear increase in LR from the warmup_start_lr to lr_end over warmup_epochs, then applies cosine annealing for the rest of the epochs, resulting in a learning rate that decreases from the base value according to the cosine function.", + "type": "comment" + }, + "8156": { + "file_id": 602, + "content": " max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. 
Default: ``False`` .\n Returns:\n ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n step_base_lr,\n lrs,\n gamma,\n steps,\n max_epoch,\n num_iters,\n last_epoch=0,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.step_base_lr = step_base_lr\n self.lrs = lrs\n self.gamma = gamma\n self.steps = steps\n self.max_epoch = max_epoch\n self.num_iters = num_iters", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:107-133" + }, + "8157": { + "file_id": 602, + "content": "This code defines a class `CustomWarmupPiecewiseDecay` which schedules learning rate for training. The class takes parameters like warmup_start_lr, warmup_epochs, step_base_lr, lrs, gamma, steps, max_epoch, num_iters, last_epoch, and verbose. It initializes these parameters in the constructor (__init__). The learning rate is scheduled to decay over time following a piecewise function with warm-up and custom decays.", + "type": "comment" + }, + "8158": { + "file_id": 602, + "content": " self.last_epoch = last_epoch\n self.last_lr = self.warmup_start_lr # used in first iter\n self.verbose = verbose\n self._var_name = None\n def step(self, epoch=None, rebuild=False):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if not rebuild:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print(\n 'step Epoch {}: {} set learning rate to {}.self.num_iters={}, 1/self.num_iters={}'\n .format(self.last_epoch, self.__class__.__name__, self.last_lr,", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:134-158" + }, + "8159": { + "file_id": 602, + "content": "This code defines a custom learning rate scheduler for the PaddleVideo library. The `step` method updates the learning rate based on the current epoch and returns None. It should be called after `optimizer.step`. If no epoch is specified, it increments the last epoch by the number of iterations divided by the total number of iterations. 
The last learning rate is stored in `self.last_lr`, and if verbose is set to True, it prints the current epoch, scheduler name, and updated learning rate.", + "type": "comment" + }, + "8160": { + "file_id": 602, + "content": " self.num_iters, 1 / self.num_iters))\n def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps,\n max_epoch):\n # get step index\n steps = steps + [max_epoch]\n for ind, step in enumerate(steps):\n if cur_epoch < step:\n break\n if self.verbose:\n print(\n '_lr_func_steps_with_relative_lrs, cur_epoch {}: {}, steps {}, ind {}, step{}, max_epoch{}'\n .format(cur_epoch, self.__class__.__name__, steps, ind, step,\n max_epoch))\n return lrs[ind - 1] * base_lr\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_steps_with_relative_lrs(\n self.last_epoch,\n self.lrs,\n self.step_base_lr,\n self.steps,\n self.max_epoch,\n )\n lr_end = self._lr_func_steps_with_relative_lrs(\n self.warmup_epochs,\n self.lrs,\n self.step_base_lr,", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:159-188" + }, + "8161": { + "file_id": 602, + "content": "This function defines a learning rate (lr) policy that varies based on the current epoch, predefined learning rates, base lr, steps, and maximum epoch. It calculates the learning rate for each step using a relative learning rate function and returns it. The function also includes a warmup phase where the learning rate increases linearly from warmup_start_lr to its target value over the specified number of warmup epochs.", + "type": "comment" + }, + "8162": { + "file_id": 602, + "content": " self.steps,\n self.max_epoch,\n )\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n if self.verbose:\n print(\n 'get_lr, Epoch {}: {}, lr {}, lr_end {}, self.lrs{}, self.step_base_lr{}, self.steps{}, self.max_epoch{}'\n .format(self.last_epoch, self.__class__.__name__, lr, lr_end,\n self.lrs, self.step_base_lr, self.steps,\n self.max_epoch))\n return lr\nclass CustomPiecewiseDecay(PiecewiseDecay):\n def __init__(self, **kargs):\n kargs.pop('num_iters')\n super().__init__(**kargs)\nclass CustomWarmupCosineStepDecay(LRScheduler):\n def __init__(self,\n warmup_iters,\n warmup_ratio=0.1,\n min_lr=0,\n base_lr=3e-5,\n max_epoch=30,", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:189-222" + }, + "8163": { + "file_id": 602, + "content": "This code completes the get_lr policy of CustomWarmupPiecewiseDecay, which performs warm up and then applies the piecewise (stepwise) decay, and begins defining the CustomWarmupCosineStepDecay scheduler. The learning rate is determined based on the current epoch, warmup epochs, warmup start and end rates, and the number of steps. 
A CustomPiecewiseDecay class is also defined, which inherits from PiecewiseDecay and overrides the num_iters parameter.", + "type": "comment" + }, + "8164": { + "file_id": 602, + "content": " last_epoch=-1,\n num_iters=None,\n verbose=False):\n self.warmup_ratio = warmup_ratio\n self.min_lr = min_lr\n self.warmup_epochs = warmup_iters\n self.warmup_iters = warmup_iters * num_iters\n self.cnt_iters = 0\n self.cnt_epoch = 0\n self.num_iters = num_iters\n self.tot_iters = max_epoch * num_iters\n self.max_epoch = max_epoch\n self.cosine_base_lr = base_lr # initial lr for all param groups\n self.regular_lr = self.get_regular_lr()\n super().__init__(last_epoch=last_epoch, verbose=verbose)\n def annealing_cos(self, start, end, factor, weight=1):\n cos_out = math.cos(math.pi * factor) + 1\n return end + 0.5 * weight * (start - end) * cos_out\n def get_regular_lr(self):\n progress = self.cnt_epoch\n max_progress = self.max_epoch\n target_lr = self.min_lr\n return self.annealing_cos(self.cosine_base_lr, target_lr, progress /\n max_progress) # self.cosine_base_lr", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:223-249" + }, + "8165": { + "file_id": 602, + "content": "This function initializes the custom learning rate scheduler. It sets warmup ratio, minimum learning rate, and warmup iterations. The total number of iterations, maximum epochs, base learning rate for cosine annealing, and a regular learning rate are calculated. The function also defines a helper method 'annealing_cos' for cosine annealing.", + "type": "comment" + }, + "8166": { + "file_id": 602, + "content": " def get_warmup_lr(self, cur_iters):\n k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio)\n warmup_lr = self.regular_lr * (1 - k) # 3e-5 * (1-k)\n return warmup_lr\n def step(self, epoch=None):\n self.regular_lr = self.get_regular_lr()\n self.last_lr = self.get_lr()\n self.cnt_epoch = (self.cnt_iters +\n 1) // self.num_iters # update step with iters\n self.cnt_iters += 1\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n cur_iter = self.cnt_iters\n if cur_iter >= self.warmup_iters:\n return self.regular_lr\n else:\n warmup_lr = self.get_warmup_lr(cur_iter)\n return warmup_lr\nclass CustomWarmupAdjustDecay(LRScheduler):\n r\"\"\"\n We combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n step_base_lr (float): start learning rate used in warmup stage.", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:251-282" + }, + "8167": { + "file_id": 602, + "content": "This code defines a custom learning rate scheduler that combines warmup and stepwise-cosine decay. The get_warmup_lr function calculates the warmup learning rate, while the get_lr function determines whether the current iteration is in the warmup stage or not, returning either the regular learning rate or the warmed-up learning rate. The step function updates the learning rate and counts the number of iterations.", + "type": "comment" + }, + "8168": { + "file_id": 602, + "content": " warmup_epochs (int): the number epochs of warmup.\n lr_decay_rate (float|int, optional): base learning rate decay rate.\n step (int): step in change learning rate.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. 
Default: ``False`` .\n Returns:\n ``CosineAnnealingDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n step_base_lr,\n warmup_epochs,\n lr_decay_rate,\n boundaries,\n num_iters=None,\n last_epoch=-1,\n verbose=False):\n self.step_base_lr = step_base_lr\n self.warmup_epochs = warmup_epochs\n self.lr_decay_rate = lr_decay_rate\n self.boundaries = boundaries\n self.num_iters = num_iters\n #call step() in base class, last_lr/last_epoch/base_lr will be update", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:283-305" + }, + "8169": { + "file_id": 602, + "content": "Custom learning rate scheduler with warmup, decay, and boundary steps. Initializes the LR scheduler with step base LR, warmup epochs, decay rate, boundaries, number of iterations (optional), last epoch (optional), and verbosity level (optional).", + "type": "comment" + }, + "8170": { + "file_id": 602, + "content": " super(CustomWarmupAdjustDecay, self).__init__(last_epoch=last_epoch,\n verbose=verbose)\n def step(self, epoch=None):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if self.last_epoch == -1:\n self.last_epoch += 1\n else:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def get_lr(self):", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:306-332" + }, + "8171": { + "file_id": 602, + "content": "The code defines a custom learning rate scheduler, CustomWarmupAdjustDecay, which adjusts the learning rate based on epoch number. It initializes the scheduler and provides a step method for updating the learning rate after optimizer.step is called. The get_lr method returns the current learning rate. The last_epoch variable keeps track of the current epoch. If no epoch is specified, it auto-increments from the last_epoch value. If an epoch is provided, the last_epoch is set to that value. Finally, if verbose is True, it prints the current epoch and the learning rate set.", + "type": "comment" + }, + "8172": { + "file_id": 602, + "content": " if self.last_epoch < self.warmup_epochs:\n lr = self.step_base_lr * (self.last_epoch + 1) / self.warmup_epochs\n else:\n lr = self.step_base_lr * (self.lr_decay_rate**np.sum(\n self.last_epoch >= np.array(self.boundaries)))\n return lr", + "type": "code", + "location": "/paddlevideo/solver/custom_lr.py:333-338" + }, + "8173": { + "file_id": 602, + "content": "This code calculates the learning rate based on whether the current epoch is within the warmup phase or not. If in warmup, it linearly increases the base learning rate. 
Otherwise, it applies a decay rate to determine the learning rate.", + "type": "comment" + }, + "8174": { + "file_id": 603, + "content": "/paddlevideo/solver/lr.py", + "type": "filepath" + }, + "8175": { + "file_id": 603, + "content": "This code constructs a learning rate scheduler based on the 'OPTIMIZER' configuration provided, returns it with specified iterations, and handles custom cases such as converting 'learning_rate' to a custom object.", + "type": "summary" + }, + "8176": { + "file_id": 603, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom typing import Dict\nfrom paddle.optimizer.lr import LRScheduler\nfrom . import custom_lr\ndef build_lr(cfg: Dict, num_iters: int) -> LRScheduler:\n \"\"\"Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer.\n In configuration:\n learning_rate:\n name: 'PiecewiseDecay'\n boundaries: [20, 60]\n values: [0.00025, 0.000025, 0.0000025]", + "type": "code", + "location": "/paddlevideo/solver/lr.py:1-28" + }, + "8177": { + "file_id": 603, + "content": "This code is building a learning rate scheduler according to the \"OPTIMIZER\" configuration provided in the cfg dictionary. The scheduler is based on the 'PiecewiseDecay' name, and has boundaries and values for adjusting the learning rate at specified iterations.", + "type": "comment" + }, + "8178": { + "file_id": 603, + "content": " Args:\n cfg (Dict): learning rate configuration.\n num_iters (int): The number of iterations that may be used when calculating the learning rate\n Returns:\n LRScheduler: learning rate scheduler.\n \"\"\"\n cfg_copy = cfg.copy()\n #when learning_rate is LRScheduler\n if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'],\n dict):\n cfg_copy['learning_rate'] = build_lr(\n cfg_copy['learning_rate'],\n num_iters) #not support only inner iter_step\n lr_name = cfg_copy.pop('name')\n if cfg_copy.get('iter_step'):\n cfg_copy['num_iters'] = num_iters\n cfg_copy.pop('iter_step')\n return getattr(custom_lr, lr_name)(**cfg_copy)", + "type": "code", + "location": "/paddlevideo/solver/lr.py:30-52" + }, + "8179": { + "file_id": 603, + "content": "This function takes a learning rate configuration and the number of iterations, and returns a learning rate scheduler. If the configuration includes a 'learning_rate' key with a dictionary value, it converts it to a custom learning rate object using the build_lr() function. It also handles cases where 'iter_step' is present in the configuration, replacing it with 'num_iters'. The returned scheduler is obtained from the 'custom_lr' module with the specified 'name'.", + "type": "comment" + }, + "8180": { + "file_id": 604, + "content": "/paddlevideo/solver/optimizer.py", + "type": "filepath" + }, + "8181": { + "file_id": 604, + "content": "This code initializes optimizer configurations, handles weight decay, grad clip, and excludes parameters for L2 decay. 
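As a usage illustration of the two builders summarized here (the config values are hypothetical; build_lr and build_optimizer are the functions quoted in these entries, and a trivial paddle.nn.Linear stands in for a real video model):

    import paddle
    from paddlevideo.solver import build_lr, build_optimizer

    model = paddle.nn.Linear(8, 2)  # stand-in model for illustration only

    # Hypothetical OPTIMIZER-style config, mirroring the docstring examples above.
    lr_cfg = dict(name='PiecewiseDecay', boundaries=[20, 60],
                  values=[0.00025, 0.000025, 0.0000025])
    opt_cfg = dict(name='Momentum', momentum=0.9,
                   weight_decay=dict(name='L2', value=0.001),
                   learning_rate=lr_cfg)

    lr = build_lr(lr_cfg, num_iters=100)  # num_iters would normally be len(train_loader)
    optimizer = build_optimizer(opt_cfg, lr, model=model)
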
It sets learning rate with LRScheduler, supports multi-precision, and creates an optimizer based on inputs.", + "type": "summary" + }, + "8182": { + "file_id": 604, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport inspect\n# for python3.11\nif not hasattr(inspect, 'getargspec'):\n inspect.getargspec = inspect.getfullargspec\nfrom typing import Dict\nimport paddle\nfrom paddle.optimizer.lr import LRScheduler\nfrom paddle.regularizer import L1Decay, L2Decay\nfrom paddlevideo.utils import get_logger\ndef build_optimizer(cfg: Dict,\n lr_scheduler: LRScheduler,\n model: paddle.nn.Layer,\n use_amp: bool = False,", + "type": "code", + "location": "/paddlevideo/solver/optimizer.py:1-31" + }, + "8183": { + "file_id": 604, + "content": "This code is from the \"optimizer.py\" file in the PaddleVideo library, and it's responsible for building an optimizer. It imports necessary modules, checks compatibility with Python versions, defines a function build_optimizer that takes parameters such as configuration (cfg), learning rate scheduler (lr_scheduler), model, and optional AMP usage (use_amp). This file also includes some license information and comments.", + "type": "comment" + }, + "8184": { + "file_id": 604, + "content": " amp_level: str = None) -> paddle.optimizer.Optimizer:\n \"\"\"Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration.\n In configuration:\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay: 0.001\n or\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay:\n name: \"L1\"\n value: 0.001\n Momentum optimizer will be applied to optimize network and L1Decay regularizer will be applied to avoid overfit.\n OPTIMIZER:\n name: Adam\n weight_decay:\n name: \"L2\"\n value: 0.001\n Adam optimizer will be applied to optimize network and L2Decay regularizer will applied to avoid overfit.\n Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details.\n Args:\n cfg (Dict): optimizer configuration.\n lr_scheduler (LRScheduler): learning rate scheduler.", + "type": "code", + "location": "/paddlevideo/solver/optimizer.py:32-63" + }, + "8185": { + "file_id": 604, + "content": "Builds an optimizer and learning rate scheduler according to the OPTIMIZER field in the configuration. The Momentum or Adam optimizers are applied to optimize the network, and L1Decay or L2Decay regularizers are used to avoid overfitting. The function takes optimizer configuration (cfg) and learning rate scheduler (lr_scheduler) as arguments.", + "type": "comment" + }, + "8186": { + "file_id": 604, + "content": " model (paddle.nn.Layer, optional): model which contains parameters to be optimized. Defaults to None.\n use_amp (bool, optional): Whether use amp. Defaults to False.\n amp_level (str, optional): amp level when amp is enabled. 
Defaults to None.\n Returns:\n paddle.optimizer.Optimizer: an optimizer for the input model.\n \"\"\"\n logger = get_logger(\"paddlevideo\")\n cfg_copy = cfg.copy()\n # NOTE: check none and illegal cfg!!!\n opt_name = cfg_copy.pop('name')\n # deal with weight decay\n if cfg_copy.get('weight_decay'):\n if isinstance(cfg_copy.get('weight_decay'),\n float): # just an float factor\n cfg_copy['weight_decay'] = cfg_copy.get('weight_decay')\n elif 'L1' in cfg_copy.get('weight_decay').get(\n 'name').upper(): # specify L2 wd and it's float factor\n cfg_copy['weight_decay'] = L1Decay(\n cfg_copy.get('weight_decay').get('value'))\n elif 'L2' in cfg_copy.get('weight_decay').get(", + "type": "code", + "location": "/paddlevideo/solver/optimizer.py:64-85" + }, + "8187": { + "file_id": 604, + "content": "This code defines a function that creates an optimizer for a given model. It accepts parameters such as the model, whether to use AMP or not, and the AMP level. The function also handles weight decay by checking if a 'weight_decay' configuration is present and applying the appropriate settings (L1 or L2 decay).", + "type": "comment" + }, + "8188": { + "file_id": 604, + "content": " 'name').upper(): # specify L1 wd and it's float factor\n cfg_copy['weight_decay'] = L2Decay(\n cfg_copy.get('weight_decay').get('value'))\n else:\n raise ValueError\n # deal with grad clip\n if cfg_copy.get('grad_clip'):\n if isinstance(cfg_copy.get('grad_clip'), float):\n cfg_copy['grad_clip'] = cfg_copy.get('grad_clip').get('value')\n elif 'global' in cfg_copy.get('grad_clip').get('name').lower():\n cfg_copy['grad_clip'] = paddle.nn.ClipGradByGlobalNorm(\n cfg_copy.get('grad_clip').get('value'))\n else:\n raise ValueError\n # Set for optimizers that cannot be applied to l2decay, i.e. AdamW\n if cfg_copy.get('no_weight_decay_name'):\n no_weight_decay_name = cfg_copy.pop('no_weight_decay_name')\n no_weight_decay_name_list = no_weight_decay_name.split(' ')\n # NOTE: use param.name not name\n no_weight_decay_param_list = [\n param.name for name, param in model.named_parameters()", + "type": "code", + "location": "/paddlevideo/solver/optimizer.py:86-109" + }, + "8189": { + "file_id": 604, + "content": "This code initializes the optimizer configuration, handling L1 and L2 weight decay, grad clip, and no_weight_decay parameters. If 'weight_decay' is given as a bare float it is used directly; if its 'name' contains L1 or L2, an L1Decay or L2Decay regularizer is built from the given 'value'. For grad clip, a plain float is taken as the clip value, while a 'name' containing 'global' creates a ClipGradByGlobalNorm object from the given 'value' (a rough sketch of these config shapes follows below). 
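For reference, the configuration shapes that the branch logic above expects look roughly like the following (hypothetical values; only keys handled by build_optimizer are shown, and a full OPTIMIZER section would also carry the learning_rate sub-config):

    # weight_decay given as a bare float factor:
    opt_cfg_a = dict(name='Momentum', momentum=0.9, weight_decay=1e-4)

    # weight_decay as a named L1/L2 regularizer, a global-norm grad clip, and a
    # hypothetical space-separated list of parameter names excluded from decay:
    opt_cfg_b = dict(
        name='AdamW',
        weight_decay=dict(name='L2', value=1e-4),
        grad_clip=dict(name='ClipGradByGlobalNorm', value=1.0),
        no_weight_decay_name='pos_embed cls_token',
    )
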
If 'no_weight_decay_name' is specified, it extracts the list of parameters to exclude from L2 decay.", + "type": "comment" + }, + "8190": { + "file_id": 604, + "content": " if any(key_word in name for key_word in no_weight_decay_name_list)\n ] # get the full param name of no weight decay\n _apply_decay_param_fun = lambda name: name not in no_weight_decay_param_list\n cfg_copy['apply_decay_param_fun'] = _apply_decay_param_fun\n logger.info(\n f\"No weight Decay list :({len(no_weight_decay_param_list)})\",\n no_weight_decay_param_list)\n cfg_copy.pop('learning_rate')\n # set multi_precision\n optimizer_setting = {\n 'learning_rate': lr_scheduler,\n 'parameters': model.parameters(),\n **cfg_copy\n }\n optimizer_init_args = inspect.getargspec(\n getattr(paddle.optimizer, opt_name).__init__).args\n if use_amp and amp_level == \"O2\" and \"multi_precision\" in optimizer_init_args:\n # support \"multi_precision\" arg in optimizer's __init__ function.\n optimizer_setting.update({\"multi_precision\": True})\n logger.info(\n \"Set multi_precision=True for optimizer when use_amp=True and amp_level='O2'\"", + "type": "code", + "location": "/paddlevideo/solver/optimizer.py:110-133" + }, + "8191": { + "file_id": 604, + "content": "This code checks if there are any parameters without weight decay, and sets the learning rate using a LRScheduler. It also handles multi-precision for optimizer when use_amp is True and amp_level is 'O2'. The code updates the optimizer_setting with no_weight_decay_param_list and \"multi_precision\" if required, logging relevant information throughout.", + "type": "comment" + }, + "8192": { + "file_id": 604, + "content": " )\n return getattr(paddle.optimizer, opt_name)(**optimizer_setting)", + "type": "code", + "location": "/paddlevideo/solver/optimizer.py:134-136" + }, + "8193": { + "file_id": 604, + "content": "This code is creating and returning an optimizer based on the given \"opt_name\" and \"optimizer_setting\". The optimizer type is determined by using \"paddle.optimizer[opt_name]\" and the parameters are passed through **optimizer_settings** to initialize the optimizer object.", + "type": "comment" + }, + "8194": { + "file_id": 605, + "content": "/paddlevideo/tasks/__init__.py", + "type": "filepath" + }, + "8195": { + "file_id": 605, + "content": "This code is the initialization file for PaddleVideo's tasks module. It imports functions from various task-specific modules and adds them to `__all__` for export. The code also includes a license notice, copyright information, and a disclaimer.", + "type": "summary" + }, + "8196": { + "file_id": 605, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .train import train_model\nfrom .test import test_model\nfrom .train_dali import train_dali\nfrom .train_multigrid import train_model_multigrid\n__all__ = ['train_model', 'test_model', 'train_dali', 'train_model_multigrid']", + "type": "code", + "location": "/paddlevideo/tasks/__init__.py:1-20" + }, + "8197": { + "file_id": 605, + "content": "This code is the initialization file for PaddleVideo's tasks module. It imports functions from various task-specific modules and adds them to `__all__` for export. The code also includes a license notice, copyright information, and a disclaimer.", + "type": "comment" + }, + "8198": { + "file_id": 606, + "content": "/paddlevideo/tasks/test.py", + "type": "filepath" + }, + "8199": { + "file_id": 606, + "content": "This Python function uses PaddlePaddle to test models, enabling parallel processing and logging. It initializes the device, constructs model, dataset, and dataloader in test mode with adjustable parameters. The code builds a dataloader, loads state_dicts, sets up metrics, and iterates over batches for output or metric updates before accumulating the final result.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/82.json b/docs/data/82.json new file mode 100644 index 000000000..b7431f937 --- /dev/null +++ b/docs/data/82.json @@ -0,0 +1,546 @@ +{ + "8200": { + "file_id": 606, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddlevideo.utils import get_logger, load\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics import build_metric\nfrom ..modeling.builder import build_model\nlogger = get_logger(\"paddlevideo\")\n@paddle.no_grad()\ndef test_model(cfg, weights, parallel=True):\n \"\"\"Test model entry\n Args:\n cfg (dict): configuration.\n weights (str): weights path to load.\n parallel (bool): Whether to do multi-cards testing. Default: True.", + "type": "code", + "location": "/paddlevideo/tasks/test.py:1-32" + }, + "8201": { + "file_id": 606, + "content": "The code is a Python function for testing a model using PaddlePaddle framework. It takes configuration (cfg) and weights path (weights) as inputs, and allows for parallel processing. 
The logger captures any log messages from the function execution.", + "type": "comment" + }, + "8202": { + "file_id": 606, + "content": " \"\"\"\n if cfg.get('use_npu', False):\n places = paddle.set_device('npu')\n elif cfg.get('use_xpu', False):\n places = paddle.set_device('xpu')\n else:\n places = paddle.set_device('gpu')\n # 1. Construct model.\n if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'):\n cfg.MODEL.backbone.pretrained = '' # disable pretrain model init\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. Construct dataset and dataloader.\n cfg.DATASET.test.test_mode = True\n dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test))\n batch_size = cfg.DATASET.get(\"test_batch_size\", 8)\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n num_workers = cfg.DATASET.get('test_num_workers', num_workers)\n dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n places=places,", + "type": "code", + "location": "/paddlevideo/tasks/test.py:34-61" + }, + "8203": { + "file_id": 606, + "content": "This code block initializes the model's device, constructs and configures the model, dataset, and dataloader. It also sets test mode and adjusts batch size and number of workers.", + "type": "comment" + }, + "8204": { + "file_id": 606, + "content": " drop_last=False,\n shuffle=False)\n data_loader = build_dataloader(\n dataset, **dataloader_setting) if cfg.model_name not in ['CFBI'\n ] else dataset\n model.eval()\n state_dicts = load(weights)\n model.set_state_dict(state_dicts)\n # add params to metrics\n cfg.METRIC.data_size = len(dataset)\n cfg.METRIC.batch_size = batch_size\n Metric = build_metric(cfg.METRIC)\n if cfg.MODEL.framework == \"FastRCNN\":\n Metric.set_dataset_info(dataset.info, len(dataset))\n for batch_id, data in enumerate(data_loader):\n if cfg.model_name in [\n 'CFBI'\n ]: # for VOS task, dataset for video and dataloader for frames in each video\n Metric.update(batch_id, data, model)\n else:\n outputs = model(data, mode='test')\n Metric.update(batch_id, data, outputs)\n Metric.accumulate()", + "type": "code", + "location": "/paddlevideo/tasks/test.py:62-90" + }, + "8205": { + "file_id": 606, + "content": "The code builds a dataloader for the dataset, loads state_dicts into the model, and sets up metrics. It then iterates over batches of data from the dataloader to either update the metric directly or get outputs from the model before updating the metric. After processing all batches, it accumulates the final result in the metric.", + "type": "comment" + }, + "8206": { + "file_id": 607, + "content": "/paddlevideo/tasks/train.py", + "type": "filepath" + }, + "8207": { + "file_id": 607, + "content": "The code utilizes PaddlePaddle's Fleet API for distributed training, defines models/metrics, and uses AMP to speed up gradient descent via DataParallel. It logs performance data, evaluates using PaddleVideo, saves the best model/optimizer, and periodically saves state during training.", + "type": "summary" + }, + "8208": { + "file_id": 607, + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport time\nimport paddle\nimport paddle.amp as amp\nimport paddle.distributed as dist\nimport paddle.distributed.fleet as fleet\nfrom paddlevideo.utils import (add_profiler_step, build_record, get_logger,\n load, log_batch, log_epoch, mkdir, save)\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics.ava_utils import collect_results_cpu\nfrom ..modeling.builder import build_model", + "type": "code", + "location": "/paddlevideo/tasks/train.py:1-27" + }, + "8209": { + "file_id": 607, + "content": "The code imports necessary libraries, defines functions to build data loaders, datasets, models, and metrics using a builder pattern. It also includes functions for logging progress and saving results. The code is licensed under the Apache License 2.0, and it might be part of a larger framework or application dealing with video analysis tasks.", + "type": "comment" + }, + "8210": { + "file_id": 607, + "content": "from ..solver import build_lr, build_optimizer\nfrom ..utils import do_preciseBN\ndef train_model(cfg,\n weights=None,\n parallel=True,\n validate=True,\n use_amp=False,\n amp_level=None,\n max_iters=None,\n use_fleet=False,\n profiler_options=None):\n \"\"\"Train model entry\n Args:\n cfg (dict): configuration.\n weights (str, optional): weights path for finetuning. Defaults to None.\n parallel (bool, optional): whether multi-cards training. Defaults to True.\n validate (bool, optional): whether to do evaluation. Defaults to True.\n use_amp (bool, optional): whether to use automatic mixed precision during training. Defaults to False.\n amp_level (str, optional): amp optmization level, must be 'O1' or 'O2' when use_amp is True. Defaults to None.\n max_iters (int, optional): max running iters in an epoch. Defaults to None.\n use_fleet (bool, optional): whether to use fleet. Defaults to False.", + "type": "code", + "location": "/paddlevideo/tasks/train.py:28-51" + }, + "8211": { + "file_id": 607, + "content": "The code defines a train_model function for training the model using given configuration (cfg). It takes optional arguments like weights path, parallel training flag, validation enablement, automatic mixed precision usage, and more.", + "type": "comment" + }, + "8212": { + "file_id": 607, + "content": " profiler_options (str, optional): configuration for the profiler function. 
Defaults to None.\n \"\"\"\n if use_fleet:\n fleet.init(is_collective=True)\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DATASET.get('batch_size', 8)\n valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size)\n # gradient accumulation settings\n use_gradient_accumulation = cfg.get('GRADIENT_ACCUMULATION', None)\n if use_gradient_accumulation and dist.get_world_size() >= 1:\n global_batch_size = cfg.GRADIENT_ACCUMULATION.get(\n 'global_batch_size', None)\n num_gpus = dist.get_world_size()\n assert isinstance(\n global_batch_size, int\n ), f\"global_batch_size must be int, but got {type(global_batch_size)}\"\n assert batch_size <= global_batch_size, \\\n f\"global_batch_size({global_batch_size}) must not be less than batch_size({batch_size})\"\n cur_global_batch_size = batch_size * num_gpus # The number of batches calculated by all GPUs at one time", + "type": "code", + "location": "/paddlevideo/tasks/train.py:52-75" + }, + "8213": { + "file_id": 607, + "content": "This code sets up gradient accumulation and global batch size for distributed training using PaddlePaddle's Fleet API. It retrieves batch and validation batch sizes from the configuration, then checks if gradient accumulation is enabled and the world size of the distributed setup. If so, it calculates the global batch size based on these settings and asserts that global_batch_size is greater than the current batch size.", + "type": "comment" + }, + "8214": { + "file_id": 607, + "content": " assert global_batch_size % cur_global_batch_size == 0, \\\n f\"The global batchsize({global_batch_size}) must be divisible by cur_global_batch_size({cur_global_batch_size})\"\n cfg.GRADIENT_ACCUMULATION[\n \"num_iters\"] = global_batch_size // cur_global_batch_size\n # The number of iterations required to reach the global batchsize\n logger.info(\n f\"Using gradient accumulation training strategy, \"\n f\"global_batch_size={global_batch_size}, \"\n f\"num_gpus={num_gpus}, \"\n f\"num_accumulative_iters={cfg.GRADIENT_ACCUMULATION.num_iters}\")\n if cfg.get('use_npu', False):\n places = paddle.set_device('npu')\n elif cfg.get('use_xpu', False):\n places = paddle.set_device('xpu')\n else:\n places = paddle.set_device('gpu')\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n valid_num_workers = cfg.DATASET.get('valid_num_workers', num_workers)", + "type": "code", + "location": "/paddlevideo/tasks/train.py:76-96" + }, + "8215": { + "file_id": 607, + "content": "The code ensures the global batch size is divisible by cur_global_batch_size, sets the number of iterations needed to reach the global batch size, and sets the device type (NPU, XPU, or GPU) based on config values. It also allows for setting the number of workers for training and validation data loading.", + "type": "comment" + }, + "8216": { + "file_id": 607, + "content": " model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n mkdir(output_dir)\n # 1. Construct model\n model = build_model(cfg.MODEL)\n if cfg.get('to_static', False):\n specs = None\n model = paddle.jit.to_static(model, input_spec=specs)\n logger.info(\n \"Successfully to apply @to_static with specs: {}\".format(specs))\n # 2. 
Construct dataset and dataloader for training and evaluation\n train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))\n train_dataloader_setting = dict(\n batch_size=batch_size,\n num_workers=num_workers,\n collate_fn_cfg=cfg.get('MIX', None),\n places=places)\n train_loader = build_dataloader(train_dataset, **train_dataloader_setting)\n if validate:\n valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))\n validate_dataloader_setting = dict(\n batch_size=valid_batch_size,\n num_workers=valid_num_workers,\n places=places,", + "type": "code", + "location": "/paddlevideo/tasks/train.py:97-124" + }, + "8217": { + "file_id": 607, + "content": "Code snippet builds a model, creates dataset and dataloader for training and validation, and optionally converts the model to static using Paddle.jit.to_static(). It saves the output in the specified directory and logs if @to_static is applied successfully.", + "type": "comment" + }, + "8218": { + "file_id": 607, + "content": " drop_last=False,\n shuffle=cfg.DATASET.get(\n 'shuffle_valid',\n False) # NOTE: attention_LSTM needs to shuffle valid data.\n )\n valid_loader = build_dataloader(valid_dataset,\n **validate_dataloader_setting)\n # 3. Construct learning rate scheduler(lr) and optimizer\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(\n cfg.OPTIMIZER, lr, model=model, use_amp=use_amp, amp_level=amp_level)\n # 4. Construct scalar and convert parameters for amp(optional)\n if use_amp:\n scaler = amp.GradScaler(\n init_loss_scaling=2.0**16,\n incr_every_n_steps=2000,\n decr_every_n_nan_or_inf=1)\n # convert model parameters to fp16 when amp_level is O2(pure fp16)\n model, optimizer = amp.decorate(\n models=model,\n optimizers=optimizer,\n level=amp_level,\n master_weight=True,\n save_dtype=None)", + "type": "code", + "location": "/paddlevideo/tasks/train.py:125-150" + }, + "8219": { + "file_id": 607, + "content": "This code is setting up a training process for the PaddleVideo framework. It first creates train and validation dataloaders with specified settings, then constructs a learning rate scheduler and optimizer based on provided configurations. Optionally, it converts model parameters to fp16 using AMP if needed.", + "type": "comment" + }, + "8220": { + "file_id": 607, + "content": " # NOTE: save_dtype is set to float32 now.\n logger.info(f\"Training in amp mode, amp_level={amp_level}.\")\n else:\n assert amp_level is None, f\"amp_level must be None when training in fp32 mode, but got {amp_level}.\"\n logger.info(\"Training in fp32 mode.\")\n # 5. Resume(optional)\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(output_dir,\n model_name + f\"_epoch_{resume_epoch:05d}\")\n resume_model_dict = load(filename + '.pdparams')\n resume_opt_dict = load(filename + '.pdopt')\n model.set_state_dict(resume_model_dict)\n optimizer.set_state_dict(resume_opt_dict)\n logger.info(\"Resume from checkpoint: {}\".format(filename))\n # 6. Finetune(optional)\n if weights:\n assert resume_epoch == 0, f\"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it.\"\n model_dict = load(weights)\n model.set_state_dict(model_dict)", + "type": "code", + "location": "/paddlevideo/tasks/train.py:151-172" + }, + "8221": { + "file_id": 607, + "content": "The code checks if training in amp mode or fp32 mode. If in amp mode, it asserts that the amp_level is not None and logs the current level. 
If in fp32 mode, it asserts that amp_level is None and logs the mode. It then handles optional resume and finetuning steps if specified by loading model weights from a file, setting the model state dictionary to the loaded dictionary, and logging the checkpoint used.", + "type": "comment" + }, + "8222": { + "file_id": 607, + "content": " logger.info(\"Finetune from checkpoint: {}\".format(weights))\n # 7. Parallelize(optional)\n if parallel:\n model = paddle.DataParallel(model)\n if use_fleet:\n model = fleet.distributed_model(model)\n optimizer = fleet.distributed_optimizer(optimizer)\n # 8. Train Model\n best = 0.0\n for epoch in range(0, cfg.epochs):\n if epoch < resume_epoch:\n logger.info(\n f\"| epoch: [{epoch + 1}] <= resume_epoch: [{resume_epoch}], continue...\"\n )\n continue\n model.train()\n record_list = build_record(cfg.MODEL)\n tic = time.time()\n for i, data in enumerate(train_loader):\n \"\"\"Next two line of code only used in test_tipc,\n ignore it most of the time\"\"\"\n if max_iters is not None and i >= max_iters:\n break\n record_list['reader_time'].update(time.time() - tic)\n # Collect performance information when profiler_options is activate\n add_profiler_step(profiler_options)", + "type": "code", + "location": "/paddlevideo/tasks/train.py:173-204" + }, + "8223": { + "file_id": 607, + "content": "The code finetunes a model from a specified checkpoint. It optionally parallelizes the training process using Paddle's DataParallel API and Fleet distributed computing for further optimization. The code trains the model for a specified number of epochs, continuing from a previous resume_epoch if needed. Performance information is collected when profiler options are activated.", + "type": "comment" + }, + "8224": { + "file_id": 607, + "content": " # 8.1 forward\n # AMP #\n if use_amp:\n with amp.auto_cast(\n custom_black_list={\"reduce_mean\", \"conv3d\"},\n level=amp_level):\n outputs = model(data, mode='train')\n avg_loss = outputs['loss']\n if use_gradient_accumulation:\n # clear grad at when epoch begins\n if i == 0:\n optimizer.clear_grad()\n # Loss normalization\n avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters\n # Loss scaling\n scaled = scaler.scale(avg_loss)\n # 8.2 backward\n scaled.backward()\n # 8.3 minimize\n if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0:\n scaler.minimize(optimizer, scaled)\n optimizer.clear_grad()\n else: # general case\n # Loss scaling", + "type": "code", + "location": "/paddlevideo/tasks/train.py:206-229" + }, + "8225": { + "file_id": 607, + "content": "Applies Automatic Mixed Precision (AMP) for faster training, calculates average loss, performs gradient accumulation, and scales backpropagation to reduce memory usage.", + "type": "comment" + }, + "8226": { + "file_id": 607, + "content": " scaled = scaler.scale(avg_loss)\n # 8.2 backward\n scaled.backward()\n # 8.3 minimize\n scaler.minimize(optimizer, scaled)\n optimizer.clear_grad()\n else:\n outputs = model(data, mode='train')\n avg_loss = outputs['loss']\n if use_gradient_accumulation:\n # clear grad at when epoch begins\n if i == 0:\n optimizer.clear_grad()\n # Loss normalization\n avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters\n # 8.2 backward\n avg_loss.backward()\n # 8.3 minimize\n if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0:\n optimizer.step()\n optimizer.clear_grad()\n else: # general case\n # 8.2 backward\n avg_loss.backward()", + "type": "code", + "location": "/paddlevideo/tasks/train.py:230-253" + }, + "8227": { + "file_id": 607, + "content": "This code 
calculates the average loss, scales it if necessary, performs backward pass, and applies gradient descent to minimize the loss. If gradient accumulation is used, the gradients are cleared at the start of each epoch and after every accumulated number of iterations.", + "type": "comment" + }, + "8228": { + "file_id": 607, + "content": " # 8.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(optimizer.get_lr(), batch_size)\n for name, value in outputs.items():\n if name in record_list:\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec,\".format(\n batch_size / record_list[\"batch_time\"].val)\n cur_progress = ((i + 1) + epoch * len(train_loader)) / (\n len(train_loader) * cfg.epochs)\n eta = int(record_list[\"batch_time\"].sum *\n (1 - cur_progress) / cur_progress + 0.5)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"train\", ips,\n eta)\n # learning rate iter step", + "type": "code", + "location": "/paddlevideo/tasks/train.py:254-277" + }, + "8229": { + "file_id": 607, + "content": "Optimizer step and gradient clearance followed by logging records, updating logs, calculating instantaneous performance (ips), determining progress and estimated time of arrival (eta), and calling log_batch function.", + "type": "comment" + }, + "8230": { + "file_id": 607, + "content": " if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step\n if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n ips = \"avg_ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n def evaluate(best):\n model.eval()\n results = []\n record_list = build_record(cfg.MODEL)\n record_list.pop('lr')\n tic = time.time()\n if parallel:\n rank = dist.get_rank()\n # single_gpu_test and multi_gpu_test\n for i, data in enumerate(valid_loader):\n \"\"\"Next two line of code only used in test_tipc,\n ignore it most of the time\"\"\"\n if max_iters is not None and i >= max_iters:\n break\n if use_amp:\n with amp.auto_cast(", + "type": "code", + "location": "/paddlevideo/tasks/train.py:278-306" + }, + "8231": { + "file_id": 607, + "content": "This code snippet is from the PaddleVideo library and it contains code for training a model. It uses an optimizer with a learning rate that can be stepped based on whether it's an iterative step or not. After performing an epoch, it logs the average instances per second processed. The code then evaluates the model by setting it to evaluation mode and collecting test results using a record list. 
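A minimal sketch of the AMP loss-scaling plus gradient-accumulation pattern summarized above. The helper name train_steps and the num_accum_iters argument are illustrative; model(data, mode='train') returning a dict with a 'loss' key follows the convention of the snippets above, and this is not the exact PaddleVideo implementation.

import paddle

def train_steps(model, optimizer, scaler, loader, num_accum_iters=4, amp_level='O1'):
    # scaler is a paddle.amp.GradScaler, built as in the snippet above
    optimizer.clear_grad()
    for i, data in enumerate(loader):
        with paddle.amp.auto_cast(custom_black_list={"reduce_mean", "conv3d"},
                                  level=amp_level):
            avg_loss = model(data, mode='train')['loss']
        # normalize so N accumulated micro-batches match one large-batch update
        avg_loss = avg_loss / num_accum_iters
        scaled = scaler.scale(avg_loss)   # scale the loss to avoid fp16 underflow
        scaled.backward()                 # gradients accumulate across iterations
        if (i + 1) % num_accum_iters == 0:
            scaler.minimize(optimizer, scaled)  # unscale gradients and step
            optimizer.clear_grad()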
It also records the time taken for testing in 'tic'.", + "type": "comment" + }, + "8232": { + "file_id": 607, + "content": " custom_black_list={\"reduce_mean\", \"conv3d\"},\n level=amp_level):\n outputs = model(data, mode='valid')\n else:\n outputs = model(data, mode='valid')\n if cfg.MODEL.framework == \"FastRCNN\":\n results.extend(outputs)\n # log_record\n if cfg.MODEL.framework != \"FastRCNN\":\n for name, value in outputs.items():\n if name in record_list:\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n valid_batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"val\", ips)\n if cfg.MODEL.framework == \"FastRCNN\":", + "type": "code", + "location": "/paddlevideo/tasks/train.py:307-330" + }, + "8233": { + "file_id": 607, + "content": "This code snippet is from the PaddleVideo library and appears to be handling model training for a specific framework. It calculates outputs, updates records for non-FastRCNN models, logs batch information, and handles FastRCNN-specific operations. The code also includes functionality for updating batch time and logging progress at regular intervals.", + "type": "comment" + }, + "8234": { + "file_id": 607, + "content": " if parallel:\n results = collect_results_cpu(results, len(valid_dataset))\n if not parallel or (parallel and rank == 0):\n eval_res = valid_dataset.evaluate(results)\n for name, value in eval_res.items():\n record_list[name].update(value, valid_batch_size)\n ips = \"avg_ips: {:.5f} instance/sec.\".format(\n valid_batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"val\", ips)\n best_flag = False\n if cfg.MODEL.framework == \"FastRCNN\" and (not parallel or\n (parallel and rank == 0)):\n if record_list[\"mAP@0.5IOU\"].val > best:\n best = record_list[\"mAP@0.5IOU\"].val\n best_flag = True\n return best, best_flag\n if cfg.MODEL.framework == \"YOWOLocalizer\" and (not parallel or", + "type": "code", + "location": "/paddlevideo/tasks/train.py:331-351" + }, + "8235": { + "file_id": 607, + "content": "Code section checks if parallel processing is enabled, collects results for CPU, and evaluates the dataset. It calculates average instance processing speed and logs it. 
If using specific models like FastRCNN or YOWOLocalizer, compares current performance metrics with the best values achieved so far and returns them along with a flag indicating if a new best value was found.", + "type": "comment" + }, + "8236": { + "file_id": 607, + "content": " (parallel and rank == 0)):\n if record_list[\"fscore\"].avg > best:\n best = record_list[\"fscore\"].avg\n best_flag = True\n return best, best_flag\n # forbest2, cfg.MODEL.framework != \"FastRCNN\":\n for top_flag in ['hit_at_one', 'top1', 'rmse', \"F1@0.50\"]:\n if record_list.get(top_flag):\n if top_flag != 'rmse' and record_list[top_flag].avg > best:\n best = record_list[top_flag].avg\n best_flag = True\n elif top_flag == 'rmse' and (\n best == 0.0 or record_list[top_flag].avg < best):\n best = record_list[top_flag].avg\n best_flag = True\n return best, best_flag\n # use precise bn to improve acc\n if cfg.get(\"PRECISEBN\") and (\n epoch % cfg.PRECISEBN.preciseBN_interval == 0", + "type": "code", + "location": "/paddlevideo/tasks/train.py:352-373" + }, + "8237": { + "file_id": 607, + "content": "This code is updating the best value and flag based on various metrics (fscore, hit_at_one, top1, rmse, F1@0.50) in a parallel setting with rank 0. It also checks if using precise batch normalization improves accuracy every 'preciseBN_interval' epochs.", + "type": "comment" + }, + "8238": { + "file_id": 607, + "content": " or epoch == cfg.epochs - 1):\n do_preciseBN(model, train_loader, parallel,\n min(cfg.PRECISEBN.num_iters_preciseBN,\n len(train_loader)), use_amp, amp_level)\n # 9. Validation\n if validate and (epoch % cfg.get(\"val_interval\", 1) == 0\n or epoch == cfg.epochs - 1):\n with paddle.no_grad():\n best, save_best_flag = evaluate(best)\n # save best\n if save_best_flag:\n save(optimizer.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdopt\"))\n save_student_model_flag = True if \"Distillation\" in cfg.MODEL.framework else False\n save(\n model.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdparams\"),\n save_student_model=save_student_model_flag)\n if model_name == \"AttentionLstm\":\n logger.info(\n f\"Already save the best model (hit_at_one){best}\")", + "type": "code", + "location": "/paddlevideo/tasks/train.py:374-395" + }, + "8239": { + "file_id": 607, + "content": "This code block is responsible for the precise Batch Normalization and validation steps in a deep learning training process. It applies PreciseBN for specific number of iterations, performs validation every \"val_interval\" epochs or at the last epoch, saves best model state if validation accuracy improves, and handles model saving differently depending on the framework used (Distillation vs others).", + "type": "comment" + }, + "8240": { + "file_id": 607, + "content": " elif cfg.MODEL.framework == \"FastRCNN\":\n logger.info(\n f\"Already save the best model (mAP@0.5IOU){int(best * 10000) / 10000}\"\n )\n elif cfg.MODEL.framework == \"DepthEstimator\":\n logger.info(\n f\"Already save the best model (rmse){int(best * 10000) / 10000}\"\n )\n elif cfg.MODEL.framework in ['MSTCN', 'ASRF']:\n logger.info(\n f\"Already save the best model (F1@0.50){int(best * 10000) / 10000}\"\n )\n elif cfg.MODEL.framework in ['YOWOLocalizer']:\n logger.info(\n f\"Already save the best model (fsocre){int(best * 10000) / 10000}\"\n )\n else:\n logger.info(\n f\"Already save the best model (top1 acc){int(best * 10000) / 10000}\"\n )\n # 10. 
Save model and optimizer", + "type": "code", + "location": "/paddlevideo/tasks/train.py:396-417" + }, + "8241": { + "file_id": 607, + "content": "This code block checks the current model framework and logs the metric used to identify the best model saved, followed by saving the best model and optimizer.", + "type": "comment" + }, + "8242": { + "file_id": 607, + "content": " if epoch % cfg.get(\"save_interval\", 1) == 0 or epoch == cfg.epochs - 1:\n save(optimizer.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch + 1:05d}.pdopt\"))\n save(model.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch + 1:05d}.pdparams\"))\n logger.info(f'training {model_name} finished')", + "type": "code", + "location": "/paddlevideo/tasks/train.py:418-426" + }, + "8243": { + "file_id": 607, + "content": "This code saves the optimizer and model state dictionaries at specific intervals during training. The optimizer state is saved with a .pdopt extension and the model state is saved with a .pdparams extension. This occurs if the current epoch is either divisible by the save_interval or is the final epoch, to preserve progress during training. Finally, it logs that training for the specified model has finished.", + "type": "comment" + }, + "8244": { + "file_id": 608, + "content": "/paddlevideo/tasks/train_dali.py", + "type": "filepath" + }, + "8245": { + "file_id": 608, + "content": "The code sets up libraries, initializes DALI and TSN model, creates a dataloader, builds solver, trains model with optimization steps, logs performance metrics, updates learning rates, supports resuming training/finetuning, and saves states at intervals.", + "type": "summary" + }, + "8246": { + "file_id": 608, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport time\nimport os.path as osp\nimport paddle\nfrom ..modeling.builder import build_model\nfrom ..solver import build_lr, build_optimizer\nfrom ..utils import do_preciseBN\nfrom paddlevideo.utils import get_logger, coloring\nfrom paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch,\n save, load, mkdir)\nfrom paddlevideo.loader import TSN_Dali_loader, get_input_data", + "type": "code", + "location": "/paddlevideo/tasks/train_dali.py:1-25" + }, + "8247": { + "file_id": 608, + "content": "This code imports necessary libraries and modules, sets up licenses, and imports functions from other files for model building, solver configuration, and additional utility functions. It also defines a loader for TSN-Dali dataset and functions for input data preparation.", + "type": "comment" + }, + "8248": { + "file_id": 608, + "content": "\"\"\"\nWe only supported DALI training for TSN model now.\n\"\"\"\ndef train_dali(cfg, weights=None, parallel=True):\n \"\"\"Train model entry\n Args:\n \tcfg (dict): configuration.\n weights (str): weights path for finetuning.\n \tparallel (bool): Whether multi-cards training. 
Default: True.\n \"\"\"\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DALI_LOADER.get('batch_size', 8)\n places = paddle.set_device('gpu')\n model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n mkdir(output_dir)\n # 1. Construct model\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. Construct dali dataloader\n train_loader = TSN_Dali_loader(cfg.DALI_LOADER).build_dali_reader()\n # 3. Construct solver.\n lr = build_lr(cfg.OPTIMIZER.learning_rate, None)\n optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model)\n # Resume\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(output_dir,", + "type": "code", + "location": "/paddlevideo/tasks/train_dali.py:26-63" + }, + "8249": { + "file_id": 608, + "content": "This code snippet initializes and trains a DALI (Data Augmentation and Input Pipeline Library) for the TSN model. It first constructs the model, creates a Dali dataloader, builds a solver with specified optimizer and learning rate, and then resumes training from the last checkpoint if provided.", + "type": "comment" + }, + "8250": { + "file_id": 608, + "content": " model_name + f\"_epoch_{resume_epoch:05d}\")\n resume_model_dict = load(filename + '.pdparams')\n resume_opt_dict = load(filename + '.pdopt')\n model.set_state_dict(resume_model_dict)\n optimizer.set_state_dict(resume_opt_dict)\n # Finetune:\n if weights:\n assert resume_epoch == 0, f\"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it.\"\n model_dict = load(weights)\n model.set_state_dict(model_dict)\n # 4. Train Model\n for epoch in range(0, cfg.epochs):\n if epoch < resume_epoch:\n logger.info(\n f\"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... \"\n )\n continue\n model.train()\n record_list = build_record(cfg.MODEL)\n tic = time.time()\n for i, data in enumerate(train_loader):\n data = get_input_data(data)\n record_list['reader_time'].update(time.time() - tic)", + "type": "code", + "location": "/paddlevideo/tasks/train_dali.py:64-88" + }, + "8251": { + "file_id": 608, + "content": "This code snippet is part of a model training pipeline. It first checks if the resume_epoch is 0 or if weights are provided for finetuning, then loads and sets the corresponding state dictionaries for the model and optimizer. The model is trained for specified epochs, with the option to continue from a previous epoch or start from scratch depending on the resume_epoch and weights inputs. 
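A small sketch of the resume-versus-finetune logic summarized above. The helper name restore_weights is hypothetical, and paddle.load is used in place of the repository's own load utility; this mirrors the behavior of the snippets, not their exact code.

import os.path as osp
import paddle

def restore_weights(model, optimizer, output_dir, model_name,
                    resume_epoch=0, weights=None):
    if resume_epoch:
        # resume: load both parameters and optimizer state saved at that epoch
        prefix = osp.join(output_dir, model_name + f"_epoch_{resume_epoch:05d}")
        model.set_state_dict(paddle.load(prefix + '.pdparams'))
        optimizer.set_state_dict(paddle.load(prefix + '.pdopt'))
    if weights:
        # finetune: only model weights are loaded, so resuming must be switched off
        assert resume_epoch == 0, "set resume_epoch to 0 when finetuning"
        model.set_state_dict(paddle.load(weights))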
It also records reader time during training loop iterations.", + "type": "comment" + }, + "8252": { + "file_id": 608, + "content": " # 4.1 forward\n outputs = model(data, mode='train')\n # 4.2 backward\n avg_loss = outputs['loss']\n avg_loss.backward()\n # 4.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(optimizer._global_learning_rate(),\n batch_size)\n for name, value in outputs.items():\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"train\", ips)\n # learning rate iter step\n if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step", + "type": "code", + "location": "/paddlevideo/tasks/train_dali.py:89-116" + }, + "8253": { + "file_id": 608, + "content": "This code is training a model. It performs forward, backward pass, and optimization steps before logging performance metrics and updating learning rates. The model takes input data and calculates outputs in 'train' mode. Then, it calculates the average loss from the outputs. Next, it updates gradients using backward propagation, optimizes the model with step and clears gradients. It records log information such as learning rate and batch time for later analysis. The code also checks if there is an interval in the training to log current metrics and provides an instance per second rate (ips) as performance indicator. Lastly, it updates learning rates using both iteration steps and epoch steps, based on configuration settings.", + "type": "comment" + }, + "8254": { + "file_id": 608, + "content": " if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n # use precise bn to improve acc\n if cfg.get(\"PRECISEBN\") and (epoch % cfg.PRECISEBN.preciseBN_interval\n == 0 or epoch == cfg.epochs - 1):\n do_preciseBN(\n model, train_loader, parallel,\n min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader)))\n # 5. Save model and optimizer\n if epoch % cfg.get(\"save_interval\", 1) == 0 or epoch == cfg.epochs - 1:\n save(\n optimizer.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch+1:05d}.pdopt\"))\n save(\n model.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch+1:05d}.pdparams\"))", + "type": "code", + "location": "/paddlevideo/tasks/train_dali.py:117-141" + }, + "8255": { + "file_id": 608, + "content": "This code chunk performs the following actions:\n1. Checks if learning rate should be updated based on iteration count.\n2. Calculates and logs the training instance speed (ips).\n3. Optionally applies precise Batch Normalization (bn) to improve accuracy.\n4. 
Saves the model's and optimizer's state every 'save_interval' epochs.", + "type": "comment" + }, + "8256": { + "file_id": 608, + "content": " logger.info(f'training {model_name} finished')", + "type": "code", + "location": "/paddlevideo/tasks/train_dali.py:143-143" + }, + "8257": { + "file_id": 608, + "content": "This line logs the completion of training a specific model using the \"logger.info\" function, indicating that the training process for the specified \"model_name\" has ended.", + "type": "comment" + }, + "8258": { + "file_id": 609, + "content": "/paddlevideo/tasks/train_multigrid.py", + "type": "filepath" + }, + "8259": { + "file_id": 609, + "content": "The code prepares the environment for training PaddleVideo models, builds a multigrid configuration, handles device and parallelism, trains the model, optimizes it using specified optimizer, logs progress/learning rate updates, evaluates performance, saves state, and saves model & optimizer.", + "type": "summary" + }, + "8260": { + "file_id": 609, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport time\nimport os.path as osp\nimport paddle\nimport paddle.distributed as dist\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..modeling.builder import build_model\nfrom ..solver import build_lr, build_optimizer\nfrom ..utils import do_preciseBN\nfrom paddlevideo.utils import get_logger, coloring\nfrom paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch,\n save, load, mkdir)", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:1-27" + }, + "8261": { + "file_id": 609, + "content": "The code snippet is the opening section of the file \"train_multigrid.py\" within the PaddleVideo library. It starts by declaring copyright, licensing information, and importing necessary modules. It also includes functions to build datasets, models, loaders, solvers, and utilities for logging, saving, and loading model parameters and progress. 
This section sets up the environment for training video models in the PaddleVideo framework.", + "type": "comment" + }, + "8262": { + "file_id": 609, + "content": "from paddlevideo.utils.multigrid import MultigridSchedule, aggregate_sub_bn_stats, subn_load, subn_save, is_eval_epoch\ndef construct_loader(cfg, places, validate, precise_bn, num_iters_precise_bn,\n world_size):\n batch_size = cfg.DATASET.get('batch_size', 2)\n train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))\n precise_bn_dataloader_setting = dict(\n batch_size=batch_size,\n num_workers=cfg.DATASET.get('num_workers', 0),\n places=places,\n )\n if precise_bn:\n cfg.DATASET.train.num_samples_precise_bn = num_iters_precise_bn * batch_size * world_size\n precise_bn_dataset = build_dataset((cfg.DATASET.train,\n cfg.PIPELINE.train))\n precise_bn_loader = build_dataloader(precise_bn_dataset,\n **precise_bn_dataloader_setting)\n cfg.DATASET.train.num_samples_precise_bn = None\n else:\n precise_bn_loader = None\n if cfg.MULTIGRID.SHORT_CYCLE:", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:28-50" + }, + "8263": { + "file_id": 609, + "content": "This function constructs data loaders for training a model with the multigrid approach. It takes several arguments including configuration (cfg), places to distribute the data, whether to use precise batch normalization (precise_bn), number of iterations for precise BN (num_iters_precise_bn), and world size. If precise BN is enabled, it adjusts the number of samples in the training dataset, creates a separate loader for precise BN, and sets the adjusted number of samples back to None. If not, it sets the precise BN loader to None. The code also checks if a short cycle multigrid approach is being used.", + "type": "comment" + }, + "8264": { + "file_id": 609, + "content": " # get batch size list in short cycle schedule\n bs_factor = [\n int(\n round((float(cfg.PIPELINE.train.transform[1]['MultiCrop'][\n 'target_size']) / (s * cfg.MULTIGRID.default_crop_size))\n **2)) for s in cfg.MULTIGRID.short_cycle_factors\n ]\n batch_sizes = [\n batch_size * bs_factor[0],\n batch_size * bs_factor[1],\n batch_size,\n ]\n train_dataloader_setting = dict(\n batch_size=batch_sizes,\n multigrid=True,\n num_workers=cfg.DATASET.get('num_workers', 0),\n places=places,\n )\n else:\n train_dataloader_setting = precise_bn_dataloader_setting\n train_loader = build_dataloader(train_dataset, **train_dataloader_setting)\n if validate:\n valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))\n validate_dataloader_setting = dict(\n batch_size=batch_size,\n num_workers=cfg.DATASET.get('num_workers', 0),", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:51-77" + }, + "8265": { + "file_id": 609, + "content": "The code adjusts the batch size in a short cycle schedule based on target image size, multi-grid factors and default crop size. It then sets up a train_dataloader with these batch sizes and other parameters. 
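A worked numeric example of the short-cycle batch-size formula quoted above. The values (default_crop_size=224, short_cycle_factors=[0.5, 0.707], target_size=224, batch_size=8) are assumed for illustration and do not come from any specific config file.

# Worked example of the short-cycle batch-size computation (values assumed).
default_crop_size = 224
short_cycle_factors = [0.5, 0.707]
target_size = 224
batch_size = 8

bs_factor = [int(round((target_size / (s * default_crop_size)) ** 2))
             for s in short_cycle_factors]           # -> [4, 2]
batch_sizes = [batch_size * bs_factor[0],            # 32 (smallest crops)
               batch_size * bs_factor[1],            # 16
               batch_size]                           # 8  (full-size crops)
print(batch_sizes)                                   # [32, 16, 8]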
If validate is True, it also builds a valid_dataset and valid_dataloader with the given configurations.", + "type": "comment" + }, + "8266": { + "file_id": 609, + "content": " places=places,\n drop_last=False,\n shuffle=False)\n valid_loader = build_dataloader(valid_dataset,\n **validate_dataloader_setting)\n else:\n valid_loader = None\n return train_loader, valid_loader, precise_bn_loader\ndef build_trainer(cfg, places, parallel, validate, precise_bn,\n num_iters_precise_bn, world_size):\n \"\"\"\n Build training model and its associated tools, including optimizer,\n dataloaders and meters.\n Args:\n cfg (CfgNode): configs.\n Returns:\n model: training model.\n optimizer: optimizer.\n train_loader: training data loader.\n val_loader: validatoin data loader.\n precise_bn_loader: training data loader for computing\n precise BN.\n \"\"\"\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n train_loader, valid_loader, precise_bn_loader = \\\n construct_loader(cfg,\n places,", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:78-110" + }, + "8267": { + "file_id": 609, + "content": "This code is creating training and validation data loaders for a PaddleVideo model. It also builds the model, and if parallelization is enabled, it wraps the model with Paddle's DataParallel API to distribute computation across multiple GPUs. The function returns the trained model, its optimizer, and the various data loaders required for training and validation.", + "type": "comment" + }, + "8268": { + "file_id": 609, + "content": " validate,\n precise_bn,\n num_iters_precise_bn,\n world_size,\n )\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model)\n return (\n model,\n lr,\n optimizer,\n train_loader,\n valid_loader,\n precise_bn_loader,\n )\ndef train_model_multigrid(cfg, world_size=1, validate=True):\n \"\"\"Train model entry\n Args:\n \tcfg (dict): configuration.\n \tparallel (bool): Whether multi-card training. Default: True\n validate (bool): Whether to do evaluation. Default: False.\n \"\"\"\n # Init multigrid.\n multigrid = None\n if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:\n multigrid = MultigridSchedule()\n cfg = multigrid.init_multigrid(cfg)\n if cfg.MULTIGRID.LONG_CYCLE:\n cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)\n multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule]", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:111-146" + }, + "8269": { + "file_id": 609, + "content": "This code initializes a multigrid training configuration and builds the model, learning rate, optimizer, and loaders for training, validation, and precise Batch Normalization. It also includes an optional multigrid schedule for long or short cycles if specified in the configuration.", + "type": "comment" + }, + "8270": { + "file_id": 609, + "content": " parallel = world_size != 1\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DATASET.get('batch_size', 2)\n if cfg.get('use_npu', False):\n places = paddle.set_device('npu')\n elif cfg.get('use_xpu', False):\n places = paddle.set_device('xpu')\n else:\n places = paddle.set_device('gpu')\n model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n mkdir(output_dir)\n local_rank = dist.ParallelEnv().local_rank\n precise_bn = cfg.get(\"PRECISEBN\")\n num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN\n # 1. 
Construct model\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. Construct dataloader\n train_loader, valid_loader, precise_bn_loader = \\\n construct_loader(cfg,\n places,\n validate,\n precise_bn,\n num_iters_precise_bn,\n world_size,\n )", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:148-179" + }, + "8271": { + "file_id": 609, + "content": "This code sets the device (npu, xpu or gpu) based on configuration and creates the model, dataloaders for training, validation, and precise BN if needed. It also initializes a logger and handles distributed training using parallel models and dataloaders.", + "type": "comment" + }, + "8272": { + "file_id": 609, + "content": " # 3. Construct optimizer\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(\n cfg.OPTIMIZER, lr, parameter_list=model.parameters())\n # Resume\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(\n output_dir,\n model_name + str(local_rank) + '_' + f\"{resume_epoch:05d}\")\n subn_load(model, filename, optimizer)\n # 4. Train Model\n best = 0.\n total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor)\n for epoch in range(total_epochs):\n if epoch < resume_epoch:\n logger.info(\n f\"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... \"\n )\n continue\n if cfg.MULTIGRID.LONG_CYCLE:\n cfg, changed = multigrid.update_long_cycle(cfg, epoch)\n if changed:\n logger.info(\"====== Rebuild model/optimizer/loader =====\")\n (\n model,\n lr,", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:181-210" + }, + "8273": { + "file_id": 609, + "content": "Constructing the optimizer, resuming training from a previous checkpoint if specified in the config file, and updating the long cycle configuration for multi-grid training.", + "type": "comment" + }, + "8274": { + "file_id": 609, + "content": " optimizer,\n train_loader,\n valid_loader,\n precise_bn_loader,\n ) = build_trainer(cfg, places, parallel, validate, precise_bn,\n num_iters_precise_bn, world_size)\n #load checkpoint after re-build model\n if epoch != 0:\n #epoch no need to -1, haved add 1 when save\n filename = osp.join(\n output_dir,\n model_name + str(local_rank) + '_' + f\"{(epoch):05d}\")\n subn_load(model, filename, optimizer)\n #update lr last epoch, not to use saved params\n lr.last_epoch = epoch\n lr.step(rebuild=True)\n model.train()\n record_list = build_record(cfg.MODEL)\n tic = time.time()\n for i, data in enumerate(train_loader):\n record_list['reader_time'].update(time.time() - tic)\n # 4.1 forward\n outputs = model(data, mode='train')", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:211-235" + }, + "8275": { + "file_id": 609, + "content": "The code builds a trainer with specified configurations, optimizer, train and validation loaders. 
It loads checkpoints if the epoch is not zero and updates the learning rate for the next epoch before training the model on the given data.", + "type": "comment" + }, + "8276": { + "file_id": 609, + "content": " # 4.2 backward\n avg_loss = outputs['loss']\n avg_loss.backward()\n # 4.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(\n float(optimizer._global_learning_rate()), batch_size)\n for name, value in outputs.items():\n record_list[name].update(float(value), batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, total_epochs, \"train\", ips)\n # learning rate iter step\n if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step\n if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:236-262" + }, + "8277": { + "file_id": 609, + "content": "Performing backward pass, optimizing using given optimizer, logging progress, and updating learning rate in both iteration step and epoch step.", + "type": "comment" + }, + "8278": { + "file_id": 609, + "content": " ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n def evaluate(best):\n model.eval()\n record_list = build_record(cfg.MODEL)\n record_list.pop('lr')\n tic = time.time()\n for i, data in enumerate(valid_loader):\n outputs = model(data, mode='valid')\n # log_record\n for name, value in outputs.items():\n record_list[name].update(float(value), batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, total_epochs, \"val\",\n ips)", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:264-288" + }, + "8279": { + "file_id": 609, + "content": "This code snippet evaluates the model's performance during training and updates the record list with new values. It also logs the progress at certain intervals, displaying the number of instances processed per second (ips). The function 'evaluate' is called to perform this evaluation for each data batch in the valid_loader, updating the record list accordingly.", + "type": "comment" + }, + "8280": { + "file_id": 609, + "content": " ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"val\", ips)\n best_flag = False\n if record_list.get('top1') and record_list['top1'].avg > best:\n best = record_list['top1'].avg\n best_flag = True\n return best, best_flag\n # use precise bn to improve acc\n if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):\n logger.info(f\"do precise BN in {epoch+1} ...\")\n do_preciseBN(model, precise_bn_loader, parallel,\n min(num_iters_precise_bn, len(precise_bn_loader)))\n # aggregate sub_BN stats\n logger.info(\"Aggregate sub_BatchNorm stats...\")\n aggregate_sub_bn_stats(model)\n # 5. 
Validation\n if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):\n logger.info(f\"eval in {epoch+1} ...\")", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:290-313" + }, + "8281": { + "file_id": 609, + "content": "The code calculates the instantaneous processing speed (ips) and checks if a new best performance has been achieved. It then logs this information. If it's an evaluation epoch, it performs precise batch normalization, aggregates sub-batch normalization stats, and validates the model.", + "type": "comment" + }, + "8282": { + "file_id": 609, + "content": " with paddle.no_grad():\n best, save_best_flag = evaluate(best)\n # save best\n if save_best_flag:\n save(optimizer.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdopt\"))\n save(model.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdparams\"))\n logger.info(\n f\"Already save the best model (top1 acc){int(best * 10000) / 10000}\"\n )\n # 6. Save model and optimizer\n if is_eval_epoch(\n cfg, epoch,\n total_epochs, multigrid.schedule) or epoch % cfg.get(\n \"save_interval\", 10) == 0 or epoch in multi_save_epoch:\n logger.info(\"[Save parameters] ======\")\n subn_save(output_dir, model_name + str(local_rank) + '_', epoch + 1,\n model, optimizer)\n logger.info(f'training {model_name} finished')", + "type": "code", + "location": "/paddlevideo/tasks/train_multigrid.py:314-335" + }, + "8283": { + "file_id": 609, + "content": "The code saves the best model if it outperforms previous results, and periodically saves the current model parameters during training. It uses the evaluate function to measure performance, the save function to store state dictionaries, and the subn_save function for saving models and optimizers at certain epochs. The logger is used for informative messages about saving and training completion.", + "type": "comment" + }, + "8284": { + "file_id": 610, + "content": "/paddlevideo/utils/__init__.py", + "type": "filepath" + }, + "8285": { + "file_id": 610, + "content": "This code imports various functions and classes from different modules within the PaddleVideo library. It also sets up logger and profiler functionality, provides a build function for creating objects, and handles saving and loading data.", + "type": "summary" + }, + "8286": { + "file_id": 610, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import Registry\nfrom .build_utils import build\nfrom .config import *\nfrom .logger import setup_logger, coloring, get_logger\nfrom .record import AverageMeter, build_record, log_batch, log_epoch\nfrom .dist_utils import get_dist_info, main_only\nfrom .save_load import save, load, load_ckpt, mkdir\nfrom .precise_bn import do_preciseBN\nfrom .profiler import add_profiler_step\n__all__ = ['Registry', 'build']", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py:1-24" + }, + "8287": { + "file_id": 610, + "content": "This code imports various functions and classes from different modules within the PaddleVideo library. It also sets up logger and profiler functionality, provides a build function for creating objects, and handles saving and loading data.", + "type": "comment" + }, + "8288": { + "file_id": 611, + "content": "/paddlevideo/utils/build_utils.py", + "type": "filepath" + }, + "8289": { + "file_id": 611, + "content": "The \"build\" function takes a config dictionary and registry, constructs an object from the configuration, checks for required keys, retrieves class from the registry, and returns the instance.", + "type": "summary" + }, + "8290": { + "file_id": 611, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\ndef build(cfg, registry, key='name'):\n \"\"\"Build a module from config dict.\n Args:\n cfg (dict): Config dict. It should at least contain the key.\n registry (XXX): The registry to search the type from.\n key (str): the key.\n Returns:\n obj: The constructed object.\n \"\"\"\n assert isinstance(cfg, dict) and key in cfg\n cfg_copy = cfg.copy()\n obj_type = cfg_copy.pop(key)\n obj_cls = registry.get(obj_type)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:1-31" + }, + "8291": { + "file_id": 611, + "content": "This code defines a function named \"build\" that takes a config dictionary and a registry, builds an object from the given configuration dictionary, and returns it. The function asserts that the input is a valid dictionary and checks if the required key exists. 
It then retrieves the object type from the dictionary and gets the corresponding class from the registry before returning the constructed object.", + "type": "comment" + }, + "8292": { + "file_id": 611, + "content": " if obj_cls is None:\n raise KeyError('{} is not in the {} registry'.format(\n obj_type, registry.name))\n return obj_cls(**cfg_copy)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:32-35" + }, + "8293": { + "file_id": 611, + "content": "Checks if an object class is provided, raises a KeyError if not found in the registry, and returns an instance of the found class with provided configuration.", + "type": "comment" + }, + "8294": { + "file_id": 612, + "content": "/paddlevideo/utils/config.py", + "type": "filepath" + }, + "8295": { + "file_id": 612, + "content": "The code imports necessary modules, sets up a logger, creates an AttrDict class for config handling, and defines functions to load, visualize, and override dictionary from YAML file. It also includes 'options' and 'get_config' functions to apply overrides and print or return the updated configuration.", + "type": "summary" + }, + "8296": { + "file_id": 612, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport yaml\nfrom paddlevideo.utils.logger import coloring, get_logger, setup_logger\n__all__ = ['get_config']\nlogger = setup_logger(\"./\", name=\"paddlevideo\", level=\"INFO\")\nclass AttrDict(dict):\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef create_attr_dict(yaml_config):", + "type": "code", + "location": "/paddlevideo/utils/config.py:1-34" + }, + "8297": { + "file_id": 612, + "content": "The code is importing necessary modules, defining an AttrDict class for config handling and setting up a logger. 
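To make the registry/build pattern described for build_utils.py a little earlier concrete, here is a toy, self-contained version; the Registry class, BACKBONES registry, and ResNet class below are stand-ins for illustration, not the actual paddlevideo.utils implementations.

# Toy illustration of the registry/build pattern (stand-in classes).
class Registry:
    def __init__(self, name):
        self.name, self._map = name, {}
    def register(self, cls):
        self._map[cls.__name__] = cls
        return cls
    def get(self, key):
        return self._map.get(key)

BACKBONES = Registry('backbone')

@BACKBONES.register
class ResNet:
    def __init__(self, depth=50):
        self.depth = depth

def build(cfg, registry, key='name'):
    cfg = dict(cfg)                   # copy so the caller's dict is untouched
    obj_cls = registry.get(cfg.pop(key))
    if obj_cls is None:
        raise KeyError(f'not in the {registry.name} registry')
    return obj_cls(**cfg)             # remaining keys become constructor kwargs

model = build({'name': 'ResNet', 'depth': 101}, BACKBONES)   # ResNet with depth=101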
It also creates a function 'create_attr_dict' that takes in a yaml configuration file as input.", + "type": "comment" + }, + "8298": { + "file_id": 612, + "content": " from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef print_dict(d, delimiter=0):\n \"\"\"\n Recursively visualize a dict and\n indenting acrrording by the relationship of keys.\n \"\"\"\n placeholder = \"-\" * 60\n for k, v in sorted(d.items()):\n if isinstance(v, dict):\n logger.info(\"{}{} : \".format(delimiter * \" \", coloring(k,\n \"HEADER\")))", + "type": "code", + "location": "/paddlevideo/utils/config.py:35-67" + }, + "8299": { + "file_id": 612, + "content": "The code defines two functions: \"parse_config\" and \"print_dict\". The \"parse_config\" function loads a config file into an AttrDict, while the \"print_dict\" function recursively visualizes a dictionary by indenting according to the relationship of keys.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/83.json b/docs/data/83.json new file mode 100644 index 000000000..adf322db1 --- /dev/null +++ b/docs/data/83.json @@ -0,0 +1,544 @@ +{ + "8300": { + "file_id": 612, + "content": " print_dict(v, delimiter + 4)\n elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n logger.info(\"{}{} : \".format(delimiter * \" \",\n coloring(str(k), \"HEADER\")))\n for value in v:\n print_dict(value, delimiter + 4)\n else:\n logger.info(\"{}{} : {}\".format(delimiter * \" \",\n coloring(k, \"HEADER\"),\n coloring(v, \"OKGREEN\")))\n if k.isupper():\n logger.info(placeholder)\ndef print_config(config):\n \"\"\"\n visualize configs\n Arguments:\n config: configs\n \"\"\"\n print_dict(config)\ndef check_config(config):\n \"\"\"\n Check config\n \"\"\"\n pass\ndef override(dl, ks, v):\n \"\"\"\n Recursively replace dict of list\n Args:\n dl(dict or list): dict or list to be replaced\n ks(list): list of keys\n v(str): value to be replaced\n \"\"\"\n def str2num(v):\n try:\n return eval(v)", + "type": "code", + "location": "/paddlevideo/utils/config.py:68-109" + }, + "8301": { + "file_id": 612, + "content": "The code defines several functions related to handling and manipulating configurations. The \"print_config\" function visualizes the configs, while \"check_config\" checks the validity of the configs. 
The \"override\" function recursively replaces values in a dictionary or list using a provided key and value.", + "type": "comment" + }, + "8302": { + "file_id": 612, + "content": " except Exception:\n return v\n assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n assert len(ks) > 0, ('lenght of keys should larger than 0')\n if isinstance(dl, list):\n k = str2num(ks[0])\n if len(ks) == 1:\n assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n dl[k] = str2num(v)\n else:\n override(dl[k], ks[1:], v)\n else:\n if len(ks) == 1:\n #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n if not ks[0] in dl:\n logger.warning('A new filed ({}) detected!'.format(ks[0], dl))\n dl[ks[0]] = str2num(v)\n else:\n assert ks[0] in dl, (\n '({}) doesn\\'t exist in {}, a new dict field is invalid'.format(\n ks[0], dl))\n override(dl[ks[0]], ks[1:], v)\ndef override_config(config, options=None):\n \"\"\"\n Recursively override the config\n Args:\n config(dict): dict to be replaced", + "type": "code", + "location": "/paddlevideo/utils/config.py:110-139" + }, + "8303": { + "file_id": 612, + "content": "This code is part of a function `override_config` that recursively overrides the given config with new options. The function first checks if the data `dl` is a list or dictionary, and then proceeds accordingly. If `dl` is a list, it extracts the first key from `ks`, converts it to an integer, and ensures that the index is within range of `dl`. It then updates the corresponding element in `dl` with the value `v`. If `ks` has more than one key, it calls the `override` function to update a specific field in `dl`. \n\nIf `dl` is a dictionary, it again handles two scenarios: when there's only one key and when there are multiple keys. In the single-key scenario, it checks if the key exists in `dl` (and warns if not) and updates its value. If there are multiple keys, it first asserts that the first key exists in `dl`, then calls the `override` function to update a specific field in `dl`.\n\nThe code also includes an exception handling block which simply returns the original value `v` in case of any error or exception.", + "type": "comment" + }, + "8304": { + "file_id": 612, + "content": " options(list): list of pairs(key0.key1.idx.key2=value)\n such as: [\n epochs=20',\n 'PIPELINE.train.transform.1.ResizeImage.resize_short=300'\n ]\n Returns:\n config(dict): replaced config\n \"\"\"\n if options is not None:\n for opt in options:\n assert isinstance(opt,\n str), (\"option({}) should be a str\".format(opt))\n assert \"=\" in opt, (\n \"option({}) should contain a =\"\n \"to distinguish between key and value\".format(opt))\n pair = opt.split('=')\n assert len(pair) == 2, (\"there can be only a = in the option\")\n key, value = pair\n keys = key.split('.')\n override(config, keys, value)\n return config\ndef get_config(fname, overrides=None, show=True):\n \"\"\"\n Read config from file\n \"\"\"\n assert os.path.exists(fname), ('config file({}) is not exist'.format(fname))\n config = parse_config(fname)\n override_config(config, overrides)", + "type": "code", + "location": "/paddlevideo/utils/config.py:140-170" + }, + "8305": { + "file_id": 612, + "content": "This code defines two functions, `options(list)` and `get_config(fname, overrides=None, show=True)`. The `options(list)` function takes a list of pairs (key-value) as input and replaces the config with new values. 
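A small self-contained sketch of how such a dotted override string can be applied to a nested config, mirroring the override behavior described above; the apply_override helper and the sample config values are made up for illustration.

# Sketch of applying a dotted "key.sub.idx=value" override (sample config made up).
def apply_override(cfg, option):
    key, value = option.split('=', 1)
    keys = key.split('.')
    node = cfg
    for k in keys[:-1]:
        node = node[int(k)] if isinstance(node, list) else node[k]
    try:
        value = eval(value)           # '300' -> 300, mirrors str2num above
    except Exception:
        pass
    last = keys[-1]
    if isinstance(node, list):
        node[int(last)] = value
    else:
        node[last] = value

cfg = {'epochs': 10,
       'PIPELINE': {'train': {'transform': [{'Decode': {}},
                                            {'ResizeImage': {'resize_short': 256}}]}}}
apply_override(cfg, 'epochs=20')
apply_override(cfg, 'PIPELINE.train.transform.1.ResizeImage.resize_short=300')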
The `get_config(fname, overrides=None, show=True)` function reads the config from a file, applies any overrides, and if 'show' is True, prints the updated configuration.", + "type": "comment" + }, + "8306": { + "file_id": 612, + "content": " if show:\n print_config(config)\n check_config(config)\n return config", + "type": "code", + "location": "/paddlevideo/utils/config.py:171-174" + }, + "8307": { + "file_id": 612, + "content": "This code block checks if the 'show' variable is set to True, and if so, it calls a function named 'print_config' with the 'config' parameter. It then always executes another function called 'check_config' with the same 'config' argument before returning the 'config' variable.", + "type": "comment" + }, + "8308": { + "file_id": 613, + "content": "/paddlevideo/utils/dist_utils.py", + "type": "filepath" + }, + "8309": { + "file_id": 613, + "content": "This code is from PaddleVideo's EIVideo module and includes util functions for distributed computing. It defines a function get_dist_info() to retrieve the current rank and world size, and main_only() is a decorator that only runs the wrapped function if the rank is 0 (used in distributed environments).", + "type": "summary" + }, + "8310": { + "file_id": 613, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport functools\nimport paddle\nimport paddle.distributed as dist\ndef get_dist_info():\n world_size = dist.get_world_size()\n rank = dist.get_rank()\n return rank, world_size\ndef main_only(func):\n @functools.wraps(func)\n def wrapper(*args, **kwargs):\n rank, _ = get_dist_info()\n if rank == 0:\n return func(*args, **kwargs)\n return wrapper", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py:1-30" + }, + "8311": { + "file_id": 613, + "content": "This code is from PaddleVideo's EIVideo module and includes util functions for distributed computing. It defines a function get_dist_info() to retrieve the current rank and world size, and main_only() is a decorator that only runs the wrapped function if the rank is 0 (used in distributed environments).", + "type": "comment" + }, + "8312": { + "file_id": 614, + "content": "/paddlevideo/utils/logger.py", + "type": "filepath" + }, + "8313": { + "file_id": 614, + "content": "This code sets up a colorful logging function for PaddleVideo, initializes logger with verbosity levels, and ensures non-propagation of logs. It configures logger for Python's logging module using different formats and handlers based on local rank.", + "type": "summary" + }, + "8314": { + "file_id": 614, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport logging\nimport os\nimport sys\nimport datetime\nfrom paddle.distributed import ParallelEnv\nColor = {\n 'RED': '\\033[31m',\n 'HEADER': '\\033[35m', # deep purple\n 'PURPLE': '\\033[95m', # purple\n 'OKBLUE': '\\033[94m',\n 'OKGREEN': '\\033[92m',\n 'WARNING': '\\033[93m',\n 'FAIL': '\\033[91m',\n 'ENDC': '\\033[0m'\n}\ndef coloring(message, color=\"OKGREEN\"):\n assert color in Color.keys()\n if os.environ.get('COLORING', True):", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:1-38" + }, + "8315": { + "file_id": 614, + "content": "This code is from the \"logger.py\" file in the PaddleVideo project, and it sets up a coloring function for logging messages with optional colors using ANSI escape sequences. The function takes a message and an optional color parameter, which should be one of the defined colors in the Color dictionary. It asserts that the provided color is indeed a key in the dictionary, and then returns the message with the specified color applied. The function also checks the environment variable \"COLORING\" to determine whether coloring should be enabled or not (default is True).", + "type": "comment" + }, + "8316": { + "file_id": 614, + "content": " return Color[color] + str(message) + Color[\"ENDC\"]\n else:\n return message\nlogger_initialized = []\ndef setup_logger(output=None, name=\"paddlevideo\", level=\"INFO\"):\n \"\"\"\n Initialize the paddlevideo logger and set its verbosity level to \"INFO\".\n Args:\n output (str): a file name or a directory to save log. If None, will not save log file.\n If ends with \".txt\" or \".log\", assumed to be a file name.\n Otherwise, logs will be saved to `output/log.txt`.\n name (str): the root module name of this logger\n Returns:\n logging.Logger: a logger\n \"\"\"\n def time_zone(sec, fmt):\n real_time = datetime.datetime.now()\n return real_time.timetuple()\n logging.Formatter.converter = time_zone\n logger = logging.getLogger(name)\n if level == \"INFO\":\n logger.setLevel(logging.INFO)\n elif level==\"DEBUG\":\n logger.setLevel(logging.DEBUG)\n logger.propagate = False\n if level == \"DEBUG\":\n plain_formatter = logging.Formatter(", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:39-71" + }, + "8317": { + "file_id": 614, + "content": "This code initializes the PaddleVideo logger and sets its verbosity level to \"INFO\" or \"DEBUG\", depending on the input argument. 
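A possible usage sketch of the logging helpers described above. The import path follows the config.py snippet, and the output path is only an example, so treat this as illustrative rather than the library's documented API.

# Example usage of the logging helpers (output path is illustrative).
from paddlevideo.utils.logger import setup_logger, get_logger, coloring

logger = setup_logger(output="./output/log.txt", name="paddlevideo", level="INFO")
logger.info(coloring("training started", "OKGREEN"))   # colored on rank 0's stdout

# elsewhere, reuse the already-initialized logger by name
logger = get_logger("paddlevideo")
logger.info("resuming from epoch 5")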
It also defines a custom time zone converter for logging, and ensures that the logger does not propagate logs to its parent loggers.", + "type": "comment" + }, + "8318": { + "file_id": 614, + "content": " \"[%(asctime)s] %(name)s %(levelname)s: %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n else:\n plain_formatter = logging.Formatter(\n \"[%(asctime)s] %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n # stdout logging: master only\n local_rank = ParallelEnv().local_rank\n if local_rank == 0:\n ch = logging.StreamHandler(stream=sys.stdout)\n ch.setLevel(logging.DEBUG)\n formatter = plain_formatter\n ch.setFormatter(formatter)\n logger.addHandler(ch)\n # file logging: all workers\n if output is not None:\n if output.endswith(\".txt\") or output.endswith(\".log\"):\n filename = output\n else:\n filename = os.path.join(output, \".log.txt\")\n if local_rank > 0:\n filename = filename + \".rank{}\".format(local_rank)\n # PathManager.mkdirs(os.path.dirname(filename))\n os.makedirs(os.path.dirname(filename), exist_ok=True)\n # fh = logging.StreamHandler(_cached_log_stream(filename)\n fh = logging.FileHandler(filename, mode='a')", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:72-100" + }, + "8319": { + "file_id": 614, + "content": "This code configures a logger for Python's logging module. It uses different formats and handlers (stdout, file) based on the local rank of the process, creating separate log files for each worker ranked greater than 0. If the output is a .txt or .log file, it will be used as-is; otherwise, a .log.txt file with optional rank appended will be created. The code also ensures that missing directories for the log file are created beforehand.", + "type": "comment" + }, + "8320": { + "file_id": 614, + "content": " fh.setLevel(logging.DEBUG)\n fh.setFormatter(plain_formatter)\n logger.addHandler(fh)\n logger_initialized.append(name)\n return logger\ndef get_logger(name, output=None):\n logger = logging.getLogger(name)\n if name in logger_initialized:\n return logger\n return setup_logger(name=name, output=name)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:101-113" + }, + "8321": { + "file_id": 614, + "content": "This code initializes a logger object and sets its level to DEBUG, adds a file handler with a plain formatter, and appends the logger's name to an initialized list. The function returns the logger if it has been previously initialized for the given name; otherwise, it sets up the logger using the provided name and optional output.", + "type": "comment" + }, + "8322": { + "file_id": 615, + "content": "/paddlevideo/utils/multigrid/__init__.py", + "type": "filepath" + }, + "8323": { + "file_id": 615, + "content": "This code imports various functions and classes from different modules in the PaddleVideo library. 
The __all__ list specifies the exported public symbols including MultigridSchedule, get_norm, aggregate_sub_bn_stats, DistributedShortSampler, subn_save, subn_load, and is_eval_epoch.", + "type": "summary" + }, + "8324": { + "file_id": 615, + "content": "from .multigrid import MultigridSchedule\nfrom .batchnorm_helper import get_norm, aggregate_sub_bn_stats\nfrom .short_sampler import DistributedShortSampler\nfrom .save_load_helper import subn_save, subn_load\nfrom .interval_helper import is_eval_epoch\n__all__ = [\n 'MultigridSchedule', 'get_norm', 'aggregate_sub_bn_stats',\n 'DistributedShortSampler', 'subn_save', 'subn_load', 'is_eval_epoch'\n]", + "type": "code", + "location": "/paddlevideo/utils/multigrid/__init__.py:1-10" + }, + "8325": { + "file_id": 615, + "content": "This code imports various functions and classes from different modules in the PaddleVideo library. The __all__ list specifies the exported public symbols including MultigridSchedule, get_norm, aggregate_sub_bn_stats, DistributedShortSampler, subn_save, subn_load, and is_eval_epoch.", + "type": "comment" + }, + "8326": { + "file_id": 616, + "content": "/paddlevideo/utils/multigrid/batchnorm_helper.py", + "type": "filepath" + }, + "8327": { + "file_id": 616, + "content": "This code defines a PyTorch class for batch normalization, initializing a BatchNorm3D layer and including methods to compute mean and standard deviation. It also supports aggregating statistics from multiple splits and performs forward pass for training or evaluation.", + "type": "summary" + }, + "8328": { + "file_id": 616, + "content": "from functools import partial\nimport paddle\ndef get_norm(bn_norm_type, bn_num_splits):\n \"\"\"\n Args:\n cfg (CfgNode): model building configs, details are in the comments of\n the config file.\n Returns:\n nn.Layer: the normalization layer.\n \"\"\"\n if bn_norm_type == \"batchnorm\":\n return paddle.nn.BatchNorm3D\n elif bn_norm_type == \"sub_batchnorm\":\n return partial(SubBatchNorm3D, num_splits=bn_num_splits)\n else:\n raise NotImplementedError(\n \"Norm type {} is not supported\".format(bn_norm_type))\ndef aggregate_sub_bn_stats(model):\n \"\"\"\n Recursively find all SubBN modules and aggregate sub-BN stats.\n Args:\n model (nn.Layer): model to be aggregate sub-BN stats\n Returns:\n count (int): number of SubBN module found.\n \"\"\"\n count = 0\n for child in model.children():\n if isinstance(child, SubBatchNorm3D):\n child.aggregate_stats()\n count += 1\n else:\n count += aggregate_sub_bn_stats(child)", + "type": "code", + "location": "/paddlevideo/utils/multigrid/batchnorm_helper.py:1-36" + }, + "8329": { + "file_id": 616, + "content": "This code defines a function `get_norm` that returns the normalization layer based on the provided bn_norm_type and bn_num_splits. If bn_norm_type is 'batchnorm', it returns paddle.nn.BatchNorm3D, otherwise if it's 'sub_batchnorm', it returns a partially applied SubBatchNorm3D function with num_splits parameter set to bn_num_splits. If the norm type isn't supported, it raises a NotImplementedError. It also defines `aggregate_sub_bn_stats` function that recursively finds all SubBN modules in the given model and aggregates sub-BN stats by calling aggregate_stats() on each found SubBatchNorm3D module. It returns the count of SubBN modules found.", + "type": "comment" + }, + "8330": { + "file_id": 616, + "content": " return count\nclass SubBatchNorm3D(paddle.nn.Layer):\n \"\"\"\n Implement based on paddle2.0.\n The standard BN layer computes stats across all examples in a GPU. 
In some\n cases it is desirable to compute stats across only a subset of examples\n SubBatchNorm3D splits the batch dimension into N splits, and run BN on\n each of them separately (so that the stats are computed on each subset of\n examples (1/N of batch) independently. During evaluation, it aggregates\n the stats from all splits into one BN.\n \"\"\"\n def __init__(self, num_splits, **args):\n \"\"\"\n Args:\n num_splits (int): number of splits.\n args (list): list of args\n \"\"\"\n super(SubBatchNorm3D, self).__init__()\n self.num_splits = num_splits\n self.num_features = args[\"num_features\"]\n self.weight_attr = args[\"weight_attr\"]\n self.bias_attr = args[\"bias_attr\"]\n # Keep only one set of weight and bias (outside).\n if self.weight_attr == False:\n self.weight = self.create_parameter(", + "type": "code", + "location": "/paddlevideo/utils/multigrid/batchnorm_helper.py:37-64" + }, + "8331": { + "file_id": 616, + "content": "The code defines a SubBatchNorm3D class that implements Batch Normalization with the option to split the batch dimension into N splits. It computes stats for each subset of examples independently during training and aggregates them during evaluation. The class takes num_splits as an argument and other parameters such as num_features, weight_attr, and bias_attr are set in its constructor.", + "type": "comment" + }, + "8332": { + "file_id": 616, + "content": " attr=None,\n shape=[self.num_features],\n default_initializer=paddle.nn.initializer.Constant(1.0))\n self.weight.stop_gradient = True\n else:\n self.weight = self.create_parameter(\n attr=self.weight_attr,\n shape=[self.num_features],\n default_initializer=paddle.nn.initializer.Constant(1.0))\n self.weight.stop_gradient = self.weight_attr is not None \\\n and self.weight_attr.learning_rate == 0.\n if self.bias_attr == False:\n self.bias = self.create_parameter(attr=None,\n shape=[self.num_features],\n is_bias=True)\n self.bias.stop_gradient = True\n else:\n self.bias = self.create_parameter(attr=self.bias_attr,\n shape=[self.num_features],\n is_bias=True)", + "type": "code", + "location": "/paddlevideo/utils/multigrid/batchnorm_helper.py:65-85" + }, + "8333": { + "file_id": 616, + "content": "This code initializes the weight and bias parameters of a BatchNorm layer in PaddlePaddle. 
If the weight's learning rate is 0, the weight is excluded from gradient updates; if `bias_attr` is False, the bias is created as a fixed parameter (is_bias=True) and its gradient updates are stopped.", + "type": "comment" + }, + "8334": { + "file_id": 616, + "content": " self.bias.stop_gradient = self.bias_attr is not None \\\n and self.bias_attr.learning_rate == 0.\n # set weights and bias fixed (inner).\n args[\"weight_attr\"] = False\n args[\"bias_attr\"] = False\n self.bn = paddle.nn.BatchNorm3D(**args)\n # update number of features used in split_bn\n args[\"num_features\"] = self.num_features * self.num_splits\n self.split_bn = paddle.nn.BatchNorm3D(**args)\n def _get_aggregated_mean_std(self, means, stds, n):\n \"\"\"\n Calculate the aggregated mean and stds.\n Use the method of update mean and std when merge multi-part data.\n Args:\n means (tensor): mean values.\n stds (tensor): standard deviations.\n n (int): number of sets of means and stds.\n \"\"\"\n mean = paddle.sum(paddle.reshape(means, (n, -1)), axis=0) / n\n std = (paddle.sum(paddle.reshape(stds, (n, -1)), axis=0) / n +\n paddle.sum(paddle.reshape(", + "type": "code", + "location": "/paddlevideo/utils/multigrid/batchnorm_helper.py:86-108" + }, + "8335": { + "file_id": 616, + "content": "Class is initializing a BatchNorm3D layer and storing two instances of it (self.bn and self.split_bn). The first instance has its weights and bias set as fixed (inner), while the second instance handles splitting the features for a specified number of splits. The function _get_aggregated_mean_std calculates the aggregated mean and standard deviation by summing each set's means and stds, then dividing them by the total count to get the average values.", + "type": "comment" + }, + "8336": { + "file_id": 616, + "content": " paddle.pow((paddle.reshape(means, (n, -1)) - mean), 2),\n (n, -1)),\n axis=0) / n)\n return mean, std\n def aggregate_stats(self):\n \"\"\"\n Synchronize running_mean, and running_var to self.bn.\n Call this before eval, then call model.eval();\n When eval, forward function will call self.bn instead of self.split_bn,\n During this time the running_mean, and running_var of self.bn has been obtained from\n self.split_bn.\n \"\"\"\n if self.split_bn.training:\n bn_mean_tensor, bn_variance_tensor = self._get_aggregated_mean_std(\n self.split_bn._mean,\n self.split_bn._variance,\n self.num_splits,\n )\n self.bn._mean.set_value(bn_mean_tensor)\n self.bn._variance.set_value(bn_variance_tensor)\n def forward(self, x):\n if self.training:\n n, c, t, h, w = x.shape\n x = paddle.reshape(\n x, (n // self.num_splits, c * self.num_splits, t, h, w))", + "type": "code", + "location": "/paddlevideo/utils/multigrid/batchnorm_helper.py:109-135" + }, + "8337": { + "file_id": 616, + "content": "This code defines a class that implements batch normalization in PaddlePaddle. The class has methods to compute the mean and standard deviation, aggregate statistics from multiple splits of batch normalization, and perform the forward pass for training or evaluation.", + "type": "comment" + }, + "8338": { + "file_id": 616, + "content": " x = self.split_bn(x)\n x = paddle.reshape(x, (n, c, t, h, w))\n else:\n x = self.bn(x)\n x = paddle.multiply(x, paddle.reshape(self.weight, (-1, 1, 1, 1)))\n x = paddle.add(x, paddle.reshape(self.bias, (-1, 1, 1, 1)))\n return x", + "type": "code", + "location": "/paddlevideo/utils/multigrid/batchnorm_helper.py:136-142" + }, + "8339": { + "file_id": 616, + "content": "The code applies batch normalization to the input tensor and multiplies it by a weight matrix. 
Then, it adds a bias vector and returns the normalized tensor.", + "type": "comment" + }, + "8340": { + "file_id": 617, + "content": "/paddlevideo/utils/multigrid/interval_helper.py", + "type": "filepath" + }, + "8341": { + "file_id": 617, + "content": "The function `is_eval_epoch` determines whether the model should be evaluated at a given epoch based on the provided configs, current epoch, and multigrid training schedule. If the current epoch is equal to the total number of epochs or if there's a non-null multigrid schedule, it checks if the current epoch is a time for evaluation based on the schedule intervals. The function returns True when an evaluation should occur and False otherwise.", + "type": "summary" + }, + "8342": { + "file_id": 617, + "content": "def is_eval_epoch(cfg, cur_epoch, total_epochs, multigrid_schedule):\n \"\"\"\n Determine if the model should be evaluated at the current epoch.\n Args:\n cfg (CfgNode): configs. Details can be found in\n slowfast/config/defaults.py\n cur_epoch (int): current epoch.\n multigrid_schedule (List): schedule for multigrid training.\n \"\"\"\n if cur_epoch + 1 == total_epochs:\n return True\n if multigrid_schedule is not None:\n prev_epoch = 0\n for s in multigrid_schedule:\n if cur_epoch < s[-1]:\n period = max(\n (s[-1] - prev_epoch) // cfg.MULTIGRID.EVAL_FREQ + 1, 1)\n return (s[-1] - 1 - cur_epoch) % period == 0\n prev_epoch = s[-1]", + "type": "code", + "location": "/paddlevideo/utils/multigrid/interval_helper.py:1-19" + }, + "8343": { + "file_id": 617, + "content": "The function `is_eval_epoch` determines whether the model should be evaluated at a given epoch based on the provided configs, current epoch, and multigrid training schedule. If the current epoch is equal to the total number of epochs or if there's a non-null multigrid schedule, it checks if the current epoch is a time for evaluation based on the schedule intervals. The function returns True when an evaluation should occur and False otherwise.", + "type": "comment" + }, + "8344": { + "file_id": 618, + "content": "/paddlevideo/utils/multigrid/multigrid.py", + "type": "filepath" + }, + "8345": { + "file_id": 618, + "content": "The MultigridSchedule class manages multigrid training schedules, batch sizes, sampling rates, and long cycle updates. The update_long_cycle() function adjusts these parameters based on the epoch in PaddleVideo. 
It also calculates final learning rate schedules and provides a function for determining long cycle base shape.", + "type": "summary" + }, + "8346": { + "file_id": 618, + "content": "\"\"\"Functions for multigrid training.\"\"\"\nimport numpy as np\nclass MultigridSchedule(object):\n \"\"\"\n This class defines multigrid training schedule and update cfg accordingly.\n \"\"\"\n def init_multigrid(self, cfg):\n \"\"\"\n Update cfg based on multigrid settings.\n Args:\n cfg (configs): configs that contains training and multigrid specific\n hyperparameters.\n Returns:\n cfg (configs): the updated cfg.\n \"\"\"\n self.schedule = None\n # We may modify cfg.DATASET.batch_size, cfg.PIPELINE.train.decode_sampler.num_frames, and\n # cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] during training, so we store their original\n # value in cfg and use them as global variables.\n cfg.MULTIGRID.default_batch_size = cfg.DATASET.batch_size # total bs,64\n cfg.MULTIGRID.default_temporal_size = cfg.PIPELINE.train.decode_sampler.num_frames # 32\n cfg.MULTIGRID.default_crop_size = cfg.PIPELINE.train.transform[1][", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:1-25" + }, + "8347": { + "file_id": 618, + "content": "This code defines a MultigridSchedule class for multigrid training schedule and updates cfg according to multigrid settings. The init_multigrid function takes in configs (cfg) as input, updates it based on multigrid settings, and returns the updated cfg. It stores original values of batch size, temporal size, and crop size in cfg's MULTIGRID subsection as global variables for later use.", + "type": "comment" + }, + "8348": { + "file_id": 618, + "content": " 'MultiCrop']['target_size'] # 224\n if cfg.MULTIGRID.LONG_CYCLE:\n self.schedule = self.get_long_cycle_schedule(cfg)\n cfg.OPTIMIZER.learning_rate.steps = [0] + [\n s[-1] for s in self.schedule\n ]\n # Fine-tuning phase.\n cfg.OPTIMIZER.learning_rate.steps[-1] = (\n cfg.OPTIMIZER.learning_rate.steps[-2] +\n cfg.OPTIMIZER.learning_rate.steps[-1]) // 2\n cfg.OPTIMIZER.learning_rate.lrs = [\n cfg.OPTIMIZER.learning_rate.gamma**s[0] * s[1][0]\n for s in self.schedule\n ]\n # Fine-tuning phase.\n cfg.OPTIMIZER.learning_rate.lrs = cfg.OPTIMIZER.learning_rate.lrs[:-1] + [\n cfg.OPTIMIZER.learning_rate.lrs[-2],\n cfg.OPTIMIZER.learning_rate.lrs[-1],\n ]\n cfg.OPTIMIZER.learning_rate.max_epoch = self.schedule[-1][-1]\n elif cfg.MULTIGRID.SHORT_CYCLE:\n cfg.OPTIMIZER.learning_rate.steps = [", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:26-50" + }, + "8349": { + "file_id": 618, + "content": "The code initializes the multi-grid training schedule for the given configuration (cfg). If a long cycle is enabled, it sets learning rate steps and adjusts them for fine-tuning. It also updates the maximum epoch count based on the schedule.", + "type": "comment" + }, + "8350": { + "file_id": 618, + "content": " int(s * cfg.MULTIGRID.epoch_factor)\n for s in cfg.OPTIMIZER.learning_rate.steps\n ]\n cfg.OPTIMIZER.learning_rate.max_epoch = int(\n cfg.OPTIMIZER.learning_rate.max_epoch *\n cfg.OPTIMIZER.learning_rate.max_epoch)\n return cfg\n def update_long_cycle(self, cfg, cur_epoch):\n \"\"\"\n Before every epoch, check if long cycle shape should change. 
If it\n should, update cfg accordingly.\n Args:\n cfg (configs): configs that contains training and multigrid specific\n hyperparameters.\n cur_epoch (int): current epoch index.\n Returns:\n cfg (configs): the updated cfg.\n changed (bool): whether to change long cycle shape at this epoch\n \"\"\"\n base_b, base_t, base_s = get_current_long_cycle_shape(\n self.schedule, cur_epoch)\n if base_s != cfg.PIPELINE.train.transform[1]['MultiCrop'][\n 'target_size'] or base_t != cfg.PIPELINE.train.decode_sampler.num_frames:", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:51-74" + }, + "8351": { + "file_id": 618, + "content": "This function, update_long_cycle(), checks if the long cycle shape should change before every epoch. If it should, it updates cfg accordingly. It takes in configs (cfg) and current epoch index (cur_epoch), and returns the updated cfg and a boolean indicating whether the long cycle shape changed. The function also retrieves the base_b, base_t, and base_s using get_current_long_cycle_shape(). If these values differ from the target size or number of frames in the cfg, it implies that the long cycle shape should change.", + "type": "comment" + }, + "8352": { + "file_id": 618, + "content": " #NOTE Modify\n # no need to modify, used by pool_size in head, None when multigrid\n # cfg.MODEL.head.num_frames = base_t\n # cfg.MODEL.head.crop_size = base_s\n cfg.PIPELINE.train.decode_sampler.num_frames = base_t\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] = base_s\n cfg.DATASET.batch_size = base_b * cfg.MULTIGRID.default_batch_size #change bs\n bs_factor = (float(cfg.DATASET.batch_size) /\n cfg.MULTIGRID.bn_base_size)\n if bs_factor == 1: #single bs == bn_base_size (== 8)\n cfg.MODEL.backbone.bn_norm_type = \"batchnorm\"\n else:\n cfg.MODEL.backbone.bn_norm_type = \"sub_batchnorm\"\n cfg.MODEL.backbone.bn_num_splits = int(bs_factor)\n cfg.MULTIGRID.long_cycle_sampling_rate = cfg.PIPELINE.train.decode_sampler.sampling_rate * (\n cfg.MULTIGRID.default_temporal_size // base_t)\n print(\"Long cycle updates:\")", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:75-94" + }, + "8353": { + "file_id": 618, + "content": "This code sets the number of frames and crop size for the head and transform, adjusts batch size based on multigrid configuration, determines whether to use \"batchnorm\" or \"sub_batchnorm\", and sets the long cycle sampling rate. 
The output is a message stating if long cycle updates are enabled.", + "type": "comment" + }, + "8354": { + "file_id": 618, + "content": " print(\"\\tbn_norm_type: {}\".format(cfg.MODEL.backbone.bn_norm_type))\n if cfg.MODEL.backbone.bn_norm_type == \"sub_batchnorm\":\n print(\"\\tbn_num_splits: {}\".format(\n cfg.MODEL.backbone.bn_num_splits))\n print(\"\\tTRAIN.batch_size[single card]: {}\".format(\n cfg.DATASET.batch_size))\n print(\"\\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}\".format(\n cfg.PIPELINE.train.decode_sampler.num_frames,\n cfg.MULTIGRID.long_cycle_sampling_rate))\n print(\"\\tDATA.train_crop_size: {}\".format(\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']))\n return cfg, True\n else:\n return cfg, False\n def get_long_cycle_schedule(self, cfg):\n \"\"\"\n Based on multigrid hyperparameters, define the schedule of a long cycle.\n Args:\n cfg (configs): configs that contains training and multigrid specific\n hyperparameters.", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:95-115" + }, + "8355": { + "file_id": 618, + "content": "The code is a function that checks the configuration for certain parameters related to multigrid training. It prints specific values and returns two values: a boolean indicating if the long cycle schedule should be used, and the original config unchanged.", + "type": "comment" + }, + "8356": { + "file_id": 618, + "content": " Returns:\n schedule (list): Specifies a list long cycle base shapes and their\n corresponding training epochs.\n \"\"\"\n steps = cfg.OPTIMIZER.learning_rate.steps\n default_size = float(\n cfg.PIPELINE.train.decode_sampler.num_frames *\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']**\n 2) # 32 * 224 * 224 C*H*W\n default_iters = steps[-1] # 196\n # Get shapes and average batch size for each long cycle shape.\n avg_bs = []\n all_shapes = []\n # for t_factor, s_factor in cfg.MULTIGRID.long_cycle_factors:\n for item in cfg.MULTIGRID.long_cycle_factors:\n t_factor, s_factor = item[\"value\"]\n base_t = int(\n round(cfg.PIPELINE.train.decode_sampler.num_frames * t_factor))\n base_s = int(\n round(\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']\n * s_factor))\n if cfg.MULTIGRID.SHORT_CYCLE:", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:116-141" + }, + "8357": { + "file_id": 618, + "content": "This code calculates the schedule for multi-grid training, iterating over long cycle factor pairs in `cfg.MULTIGRID.long_cycle_factors`. It determines base shapes for each cycle, calculating `base_t` based on `cfg.PIPELINE.train.decode_sampler.num_frames` and `t_factor`, and `base_s` based on target size from `cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']` and `s_factor`. 
It also considers short cycle training flag, `cfg.MULTIGRID.SHORT_CYCLE`.", + "type": "comment" + }, + "8358": { + "file_id": 618, + "content": " shapes = [\n [\n base_t,\n cfg.MULTIGRID.default_crop_size *\n cfg.MULTIGRID.short_cycle_factors[0],\n ],\n [\n base_t,\n cfg.MULTIGRID.default_crop_size *\n cfg.MULTIGRID.short_cycle_factors[1],\n ],\n [base_t, base_s],\n ] #first two is short_cycle, last is the base long_cycle\n else:\n shapes = [[base_t, base_s]]\n # (T, S) -> (B, T, S)\n shapes = [[\n int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]\n ] for s in shapes]\n avg_bs.append(np.mean([s[0] for s in shapes]))\n all_shapes.append(shapes)\n # Get schedule regardless of cfg.MULTIGRID.epoch_factor.\n total_iters = 0\n schedule = []\n for step_index in range(len(steps) - 1):\n step_epochs = steps[step_index + 1] - steps[step_index]", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:142-169" + }, + "8359": { + "file_id": 618, + "content": "This code defines the multigrid training schedule for PaddleVideo. It sets the shapes for different grid levels, converts them to batch sizes, and calculates the average batch size. The code then computes the total number of iterations and generates the multigrid training schedule based on the steps provided.", + "type": "comment" + }, + "8360": { + "file_id": 618, + "content": " for long_cycle_index, shapes in enumerate(all_shapes):\n #ensure each of 4 sequences run the same num of iters\n cur_epochs = (step_epochs * avg_bs[long_cycle_index] /\n sum(avg_bs))\n # get cur_iters from cur_epochs\n cur_iters = cur_epochs / avg_bs[long_cycle_index]\n total_iters += cur_iters\n schedule.append((step_index, shapes[-1], cur_epochs))\n iter_saving = default_iters / total_iters # ratio between default iters and real iters\n final_step_epochs = cfg.OPTIMIZER.learning_rate.max_epoch - steps[-1]\n # We define the fine-tuning phase to have the same amount of iteration\n # saving as the rest of the training.\n #final_step_epochs / iter_saving make fine-tune having the same iters as training\n ft_epochs = final_step_epochs / iter_saving * avg_bs[-1]\n # schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs))\n schedule.append((step_index + 1, all_shapes[-1][-1], ft_epochs))", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:171-191" + }, + "8361": { + "file_id": 618, + "content": "This code calculates the number of iterations for each sequence based on average batch sizes, and then appends the schedule with corresponding step index, shape, and epochs. 
It also ensures that the fine-tuning phase has the same number of iterations as the rest of the training.", + "type": "comment" + }, + "8362": { + "file_id": 618, + "content": " # Obtrain final schedule given desired cfg.MULTIGRID.epoch_factor.\n x = (cfg.OPTIMIZER.learning_rate.max_epoch *\n cfg.MULTIGRID.epoch_factor / sum(s[-1] for s in schedule))\n final_schedule = []\n total_epochs = 0\n for s in schedule:\n epochs = s[2] * x\n total_epochs += epochs\n final_schedule.append((s[0], s[1], int(round(total_epochs))))\n print_schedule(final_schedule)\n return final_schedule\ndef print_schedule(schedule):\n \"\"\"\n Log schedule.\n \"\"\"\n print(\n \"Long_cycle_index\\tBase_shape(bs_factor,temporal_size,crop_size)\\tEpochs\"\n )\n for s in schedule:\n print(\"{}\\t\\t\\t{}\\t\\t\\t\\t\\t{}\".format(s[0], s[1], s[2]))\ndef get_current_long_cycle_shape(schedule, epoch):\n \"\"\"\n Given a schedule and epoch index, return the long cycle base shape.\n Args:\n schedule (configs): configs that contains training and multigrid specific\n hyperparameters.\n cur_epoch (int): current epoch index.", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:193-224" + }, + "8363": { + "file_id": 618, + "content": "This code calculates the final learning rate schedule for multigrid training based on a provided schedule, max_epoch, and epoch_factor. It then prints this new schedule. The function get_current_long_cycle_shape takes in this same schedule and current epoch index to return the long cycle base shape for the given epoch.", + "type": "comment" + }, + "8364": { + "file_id": 618, + "content": " Returns:\n shapes (list): A list describing the base shape in a long cycle:\n [batch size relative to default,\n number of frames, spatial dimension].\n \"\"\"\n for s in schedule:\n if epoch < s[-1]:\n return s[1]\n return schedule[-1][1]", + "type": "code", + "location": "/paddlevideo/utils/multigrid/multigrid.py:225-233" + }, + "8365": { + "file_id": 618, + "content": "This function returns a list describing the base shape in a long cycle based on the current epoch and a given schedule. It iterates through the schedule, returning the appropriate shape if the current epoch is less than the scheduled value, otherwise it returns the last shape in the schedule.", + "type": "comment" + }, + "8366": { + "file_id": 619, + "content": "/paddlevideo/utils/multigrid/save_load_helper.py", + "type": "filepath" + }, + "8367": { + "file_id": 619, + "content": "This function ensures state dict consistency by comparing optimizer and model parameters, saving/loading checkpoints, and converting sub-bn to normal bn. It checks if certain layers are set to load and prints a message for unloaded weights before loading pre-trained weights and setting the optimizer's state dictionary.", + "type": "summary" + }, + "8368": { + "file_id": 619, + "content": "import os\nimport numpy as np\nimport paddle\nimport copy\ndef sub_to_normal_bn(sd):\n \"\"\"\n When save, Convert the Sub-BN paprameters to normal BN parameters in a state dict.\n There are two copies of BN layers in a Sub-BN implementation: `bn.bn` and\n `bn.split_bn`. `bn.split_bn` is used during training and\n \"compute_precise_bn\". Before saving or evaluation, its stats are copied to\n `bn.bn`. 
We rename `bn.bn` to `bn` and store it to be consistent with normal\n BN layers.\n Args:\n sd (OrderedDict): a dict of parameters which might contain Sub-BN\n parameters.\n Returns:\n new_sd (OrderedDict): a dict with Sub-BN parameters reshaped to\n normal parameters.\n \"\"\"\n modifications = [\n (\"bn.bn._mean\", \"bn._mean\"),\n (\"bn.bn._variance\", \"bn._variance\"),\n ]\n to_remove = [\"bn.bn.\", \".split_bn.\"]\n key_list = list(sd.keys()) #odict_keys to list\n for key in key_list:\n for before, after in modifications:\n if key.endswith(before):\n new_key = key.split(before)[0] + after", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:1-31" + }, + "8369": { + "file_id": 619, + "content": "This function converts Sub-BN parameters to normal BN parameters in a state dict. It renames `bn.bn` to `bn`, and modifies `_mean` and `_variance` accordingly. This is done before saving or evaluation to maintain consistency with normal BN layers. The modifications are made by iterating through the dictionary and checking if the key ends with the appropriate string, then updating it accordingly.", + "type": "comment" + }, + "8370": { + "file_id": 619, + "content": " sd[new_key] = sd.pop(key)\n for rm in to_remove:\n if rm in key and key in sd:\n del sd[key]\ndef normal_to_sub_bn(checkpoint_sd, model_sd):\n \"\"\"\n When load, Convert BN parameters to Sub-BN parameters if model contains Sub-BNs.\n Args:\n checkpoint_sd (OrderedDict): source dict of parameters.\n model_sd (OrderedDict): target dict of parameters.\n Returns:\n new_sd (OrderedDict): converted dict of parameters.\n \"\"\"\n for key in model_sd:\n if key not in checkpoint_sd:\n # not to replace bn.weight and bn.bias\n if \"bn.split_bn.\" in key and \"bn.weight\" not in key and \"bn.bias\" not in key:\n load_key = key.replace(\"bn.split_bn.\", \"bn.\")\n bn_key = key.replace(\"bn.split_bn.\", \"bn.bn.\")\n checkpoint_sd[key] = checkpoint_sd.pop(load_key)\n checkpoint_sd[bn_key] = checkpoint_sd[key]\n # match the shape of bn.split_bn._xx\n # model_sd: split_bn.rm.shape = num_feature*num_split", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:32-58" + }, + "8371": { + "file_id": 619, + "content": "This function converts BN parameters to Sub-BN parameters when loading a checkpoint into a model containing Sub-BNs. It loops through the model's parameters, if a parameter has the \"bn.split_bn.\" prefix and is not the weight or bias of BN, it renames and moves the corresponding value from the checkpoint dict to the bn.bn key in the same subdict. 
Finally, it adjusts the shape of the Sub-BN parameters to match the original BN parameters' shape.", + "type": "comment" + }, + "8372": { + "file_id": 619, + "content": " # checkpoint_sd: split_bn.rm.shape = bn.rm.shape = num_feature\n for key in model_sd:\n if key in checkpoint_sd:\n model_blob_shape = model_sd[key].shape #bn.split_bn\n c2_blob_shape = checkpoint_sd[key].shape #bn.bn\n if (len(model_blob_shape) == 1 and len(c2_blob_shape) == 1\n and model_blob_shape[0] > c2_blob_shape[0]\n and model_blob_shape[0] % c2_blob_shape[0] == 0):\n before_shape = checkpoint_sd[key].shape\n checkpoint_sd[key] = np.concatenate(\n [checkpoint_sd[key]] *\n (model_blob_shape[0] // c2_blob_shape[0]))\n if 'split_bn' not in key: #split_bn is excepted\n print(\"{} {} -> {}\".format(key, before_shape,\n checkpoint_sd[key].shape))\n return checkpoint_sd\ndef mapping_opt_dict(opt_dict, model_key_list):\n \"\"\"\n Paddle Name schedule: conv_1.w -> conv_2.w\n Sometimes: sub_bn -> bn", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:59-81" + }, + "8373": { + "file_id": 619, + "content": "This code is comparing the shape of certain keys in the model and checkpoint dictionaries. If they match certain criteria, it will concatenate the checkpoint key to expand its size based on the model's shape. This is done for specific keys in the dictionary, except 'split_bn'. The function prints out the before and after shapes of the affected keys.", + "type": "comment" + }, + "8374": { + "file_id": 619, + "content": " when re-build model, we desire the parameter name to be coincident,\n but the parameters name index will be added, as conv_1 to conv_2, not conv_1.\n It will raise error if we set old saved parameters to new created optimizer.\n as conv_2 cannot find in state_dict(only conv_1).\n Args:\n opt_dict: optimizer state dict, including the name and value of parameters gradient.\n model_key_list: the parameters name list of re-build model.\n Return: optimizer state dict with modified keys\n \"\"\"\n def get_name_info(PNAME, PN_key_list, key_list):\n min_index = float('inf')\n max_index = 0\n for name in PN_key_list[1:]:\n for key in key_list:\n if name in key:\n index = int(key.split('.')[0].split(name)[-1])\n if index < min_index:\n min_index = index\n if index > max_index:\n max_index = index\n num_name = max_index - min_index + 1\n PNAME[name].append((min_index, max_index, num_name))", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:82-103" + }, + "8375": { + "file_id": 619, + "content": "This function takes an optimizer state dict and a list of parameter names from a rebuilt model. It aims to modify the keys in the optimizer state dict to match the new parameters' names, while also considering any added index for better compatibility. 
The function then returns the modified optimizer state dict.", + "type": "comment" + }, + "8376": { + "file_id": 619, + "content": " min_index = float('inf')\n max_index = 0\n PNAME = {\n \"LR_Scheduler\": [],\n \"conv3d_\": [],\n \"linear_\": [],\n \"sub_batch_norm3d_\": [],\n \"batch_norm3d_\": [],\n }\n pd_key_list = list(opt_dict.keys())\n print(\"The number of parameters in saved optimizer state dict = {}\".format(\n len(pd_key_list)))\n print(\"The number of parameters in re-build model list = {}\".format(\n len(model_key_list)))\n # 1 may be LR_Scheduler\n PN_key_list = list(PNAME.keys())\n # get the number of each PNAME\n get_name_info(PNAME, PN_key_list, pd_key_list)\n get_name_info(PNAME, PN_key_list, model_key_list)\n print(\"[Parameters info] prefix: min_index, max_index, number_params: \\n\",\n PNAME)\n # whether to change name of bn layer\n change_name = False\n if PNAME[\"sub_batch_norm3d_\"][0][-1] == -float('inf'):\n PN_key_list.remove(\"sub_batch_norm3d_\")\n if PNAME[\"sub_batch_norm3d_\"][1][-1] != -float('inf'):\n print(\n \"Optimizer state dict saved bn, but Re-build model use sub_bn, changed name!\"", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:104-135" + }, + "8377": { + "file_id": 619, + "content": "This code appears to be a part of a larger program that compares the parameters in an optimizer state dict with those in a re-built model. It calculates and prints information about the number of parameters associated with each prefix, checks if batch normalization layers need their names changed, and potentially removes the \"sub_batch_norm3d_\" prefix from consideration. The code assumes that the \"opt_dict\" and \"model\" variables have already been defined elsewhere.", + "type": "comment" + }, + "8378": { + "file_id": 619, + "content": " )\n change_name = True\n else:\n print(\"Optimizer state dict saved bn, and Re-build model use bn\")\n else:\n PN_key_list.remove(\"batch_norm3d_\")\n if PNAME[\"sub_batch_norm3d_\"][1][-1] == -float('inf'):\n print(\n \"Optimizer state dict saved sub_bn, but Re-build model use bn, changed name!\"\n )\n change_name = True\n else:\n print(\n \"Optimizer state dict saved sub_bn, Re-build model use sub_bn\")\n #update key name\n # sub_bn -> bn name mapping, pre-define dict\n change_dict = {\n \"sub_batch_norm3d_\": \"batch_norm3d_\",\n \"batch_norm3d_\": \"sub_batch_norm3d_\"\n }\n for key in pd_key_list:\n for name in PN_key_list[1:]:\n if key.startswith(name):\n start = change_dict[name] if (\n change_name and \"batch_norm\" in name) else name\n str_index = key.split('.')[0].split(name)[-1]\n index = int(str_index)", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:136-163" + }, + "8379": { + "file_id": 619, + "content": "The code checks if the optimizer state dict saved batch normalization (bn) or sub_batch_normalization and updates the key names accordingly. If the state dict saved bn but the model uses sub_bn, it prints a message and changes the name. If the state dict saved sub_bn and the model also uses sub_bn, it prints a separate message. The code then defines a change_dict mapping and iterates over the key list to update the names if required.", + "type": "comment" + }, + "8380": { + "file_id": 619, + "content": " new_index = str(index +\n (PNAME[start][1][0] - PNAME[name][0][0]))\n end = key.split('.')[-1]\n update_key = start + new_index + '.' 
+ end\n opt_dict[update_key] = opt_dict.pop(key)\n return opt_dict\ndef subn_save(save_dir, name_prefix, epoch, video_model, optimizer):\n if not os.path.isdir(save_dir):\n os.makedirs(save_dir)\n model_path = os.path.join(save_dir, name_prefix + \"{:05d}\".format(epoch))\n model_dict = video_model.state_dict()\n sub_to_normal_bn(model_dict)\n opti_dict = optimizer.state_dict()\n paddle.save(model_dict, model_path + '.pdparams')\n paddle.save(opti_dict, model_path + '.pdopt')\n print('[Saved Epoch {} parameters and optimizer state ]'.format(epoch))\ndef subn_load(model, ck_path, optimizer=None):\n \"\"\"\n Load the checkpoint from the given file.\n Args:\n model (model): model to load the weights from the checkpoint.\n optimizer (optim, optional): optimizer to load the historical state.", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:164-190" + }, + "8381": { + "file_id": 619, + "content": "This code defines two functions: \"subn_save\" and \"subn_load\". \"subn_save\" saves a model's state dictionary along with the optimizer's state dictionary to specified directories in a specific format. It also converts sub-bn to normal bn before saving, and prints a message confirming the save operation. \"subn_load\" loads checkpoints from given files into the specified model and optionally an optimizer.", + "type": "comment" + }, + "8382": { + "file_id": 619, + "content": " ck_path (str): checkpoint path\n Returns:\n (int): the number of training epoch of the checkpoint.\n \"\"\"\n assert os.path.exists(ck_path + \".pdparams\"), \\\n \"Given dir {}.pdparams not exist.\".format(ck_path)\n print(\"load checkpint from {}.pdparams\".format(ck_path))\n model_dict = model.state_dict()\n checkpoint_dict = paddle.load(ck_path + \".pdparams\")\n # checkpoint_dict = copy.deepcopy(checkpoint_dict_orig) #not modify when multi card\n pre_train_dict = normal_to_sub_bn(checkpoint_dict, model_dict)\n # Match pre-trained weights that have same shape as current model.\n pre_train_dict_match = {\n k: v\n for k, v in pre_train_dict.items()\n if k in model_dict and tuple(v.shape) == tuple(model_dict[k].shape)\n }\n # Weights that do not have match from the pre-trained model.\n not_load_layers = [\n k for k in model_dict.keys() if k not in pre_train_dict_match.keys()\n ]\n # Log weights that are not loaded with the pre-trained weights.", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:191-216" + }, + "8383": { + "file_id": 619, + "content": "This function loads checkpoints from a specific path and returns the number of training epochs. It ensures that the given directory has .pdparams file, prints the checkpoint loading information, copies model state dictionary, and compares the shapes of pre-trained weights to current model weights for matching. 
It also identifies layers that are not loaded with pre-trained weights.", + "type": "comment" + }, + "8384": { + "file_id": 619, + "content": " if not_load_layers:\n for k in not_load_layers:\n if 'bn.weight' not in k and 'bn.bias' not in k:\n print(\"Network weights {} not loaded.\".format(k))\n # Load pre-trained weights.\n model.set_state_dict(pre_train_dict_match)\n if optimizer:\n assert os.path.exists(ck_path + \".pdopt\"), \\\n \"Given dir {}.pdopt not exist.\".format(ck_path)\n print(\"load checkpint from {}.pdopt\".format(ck_path))\n opt_dict = paddle.load(ck_path + \".pdopt\")\n # get parameters that required gradient from re-build model\n model_key_list = []\n for param in model.parameters():\n if param.stop_gradient == False:\n model_key_list.append(param.name)\n new_opt_dict = mapping_opt_dict(opt_dict, model_key_list)\n optimizer.set_state_dict(new_opt_dict)", + "type": "code", + "location": "/paddlevideo/utils/multigrid/save_load_helper.py:217-237" + }, + "8385": { + "file_id": 619, + "content": "This code block checks if certain layers in the model are not set to load, and prints a message if those weights are not loaded. It then loads the pre-trained weights for the model and checks if a specific file exists before loading the optimizer's state dictionary from that file. The function mapping_opt_dict is called to create a new dictionary containing only parameters that require gradient, which is then set as the state dictionary of the optimizer.", + "type": "comment" + }, + "8386": { + "file_id": 620, + "content": "/paddlevideo/utils/multigrid/short_sampler.py", + "type": "filepath" + }, + "8387": { + "file_id": 620, + "content": "DistributedShortSampler streamlines distributed data loading, dynamic batch sizes, and GPU support for PaddleVideo's multigrid. It efficiently calculates average batch size and offers sample dropping options.", + "type": "summary" + }, + "8388": { + "file_id": 620, + "content": "from __future__ import print_function\nfrom __future__ import division\nimport numpy as np\nimport math\nimport paddle\n__all__ = [\"DistributedShortSampler\"]\nclass DistributedShortSampler(paddle.io.BatchSampler):\n \"\"\"Sampler that restricts data loading to a subset of the dataset.\n In such case, each process can pass a DistributedBatchSampler instance\n as a DataLoader sampler, and load a subset of the original dataset that\n is exclusive to it.\n .. note::\n Batch size is dynamic changed following short cycle schedule.\n Args:\n dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement\n or other python object which implemented\n `__len__` for BatchSampler to get sample\n number of data source.\n batch_sizes(list): batch size list of one cycle.\n num_replicas(int, optional): porcess number in distributed training.\n If :attr:`num_replicas` is None, :attr:`num_replicas` will be\n retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`.", + "type": "code", + "location": "/paddlevideo/utils/multigrid/short_sampler.py:1-28" + }, + "8389": { + "file_id": 620, + "content": "The code defines a DistributedShortSampler class which is a sampler for restricting data loading to a subset of the dataset in distributed training. It allows each process to load exclusive subsets by passing the DistributedBatchSampler as a DataLoader sampler and supports dynamic batch size changes following short cycle schedules. 
The class takes in a dataset, batch_sizes list, and optionally num_replicas (process number in distributed training).", + "type": "comment" + }, + "8390": { + "file_id": 620, + "content": " Default None.\n rank(int, optional): the rank of the current process among :attr:`num_replicas`\n processes. If :attr:`rank` is None, :attr:`rank` is retrieved from\n :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None.\n shuffle(bool): whther to shuffle indices order before genrating\n batch indices. Default False.\n drop_last(bool): whether drop the last incomplete batch dataset size\n is not divisible by the batch size. Default False\n \"\"\"\n def __init__(self,\n dataset,\n batch_sizes,\n num_replicas=None,\n rank=None,\n shuffle=False,\n drop_last=False):\n self.dataset = dataset\n assert any(isinstance(batch_size, int) and batch_size > 0 for batch_size in batch_sizes), \\\n \"batch_size should be a positive integer\"\n self.batch_sizes = batch_sizes\n self.len_batch_sizes = len(self.batch_sizes)\n assert isinstance(shuffle, bool), \\", + "type": "code", + "location": "/paddlevideo/utils/multigrid/short_sampler.py:29-51" + }, + "8391": { + "file_id": 620, + "content": "The `__init__` method initializes an instance of the class with a dataset, batch sizes, number of replicas (optional), rank (optional), whether to shuffle indices (optional), and whether to drop last incomplete batch (optional). The batch_sizes should be positive integers. The method performs assertions on the inputs to ensure validity.", + "type": "comment" + }, + "8392": { + "file_id": 620, + "content": " \"shuffle should be a boolean value\"\n self.shuffle = shuffle\n assert isinstance(drop_last, bool), \\\n \"drop_last should be a boolean number\"\n if num_replicas is not None:\n assert isinstance(num_replicas, int) and num_replicas > 0, \\\n \"num_replicas should be a positive integer\"\n self.nranks = num_replicas\n else:\n self.nranks = paddle.distributed.ParallelEnv().nranks\n if rank is not None:\n assert isinstance(rank, int) and rank >= 0, \\\n \"rank should be a non-negative integer\"\n self.local_rank = rank\n else:\n self.local_rank = paddle.distributed.ParallelEnv().local_rank\n self.drop_last = drop_last\n self.epoch = 0\n self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))\n self.total_size = self.num_samples * self.nranks\n def __iter__(self):\n num_samples = len(self.dataset)\n indices = np.arange(num_samples).tolist()", + "type": "code", + "location": "/paddlevideo/utils/multigrid/short_sampler.py:52-79" + }, + "8393": { + "file_id": 620, + "content": "The code initializes a MultigridSampler object, which manages the sampling of data across multiple ranks in distributed training. It checks for valid input values (boolean for shuffle and drop_last) and ensures positive integer for num_replicas. It determines the number of ranks and local rank based on provided values or environment. 
The total number of samples is calculated based on the dataset size and number of ranks, and an array of indices is created.", + "type": "comment" + }, + "8394": { + "file_id": 620, + "content": " indices += indices[:(self.total_size -\n len(indices))] #completion last iter\n assert len(indices) == self.total_size\n if self.shuffle:\n np.random.RandomState(self.epoch).shuffle(indices)\n self.epoch += 1\n # subsample\n def _get_indices_by_batch_size(indices):\n total_batch_size = sum(self.batch_sizes)\n subsampled_indices = []\n last_batch_size = self.total_size % (\n total_batch_size * self.nranks) #number samples of last batch\n assert last_batch_size % self.nranks == 0\n last_local_batch_size = last_batch_size // self.nranks\n for i in range(self.local_rank * total_batch_size,\n len(indices) - last_batch_size,\n total_batch_size * self.nranks):\n subsampled_indices.extend(indices[i:i + total_batch_size])\n indices = indices[len(indices) - last_batch_size:]\n subsampled_indices.extend(", + "type": "code", + "location": "/paddlevideo/utils/multigrid/short_sampler.py:80-102" + }, + "8395": { + "file_id": 620, + "content": "This code ensures that the number of samples selected is equal to the total size, and then subsamples them by batch sizes. It handles the last batch with potentially fewer samples due to modulo operations and shuffles the indices if desired.", + "type": "comment" + }, + "8396": { + "file_id": 620, + "content": " indices[self.local_rank *\n last_local_batch_size:(self.local_rank + 1) *\n last_local_batch_size])\n return subsampled_indices\n if self.nranks > 1:\n indices = _get_indices_by_batch_size(indices)\n assert len(indices) == self.num_samples #index length in each card\n _sample_iter = iter(indices)\n batch_indices = []\n counter = 0\n batch_size = self.batch_sizes[0]\n for idx in _sample_iter:\n batch_indices.append(\n (idx, counter %\n self.len_batch_sizes)) #to be used in dataloader get_item\n if len(batch_indices) == batch_size:\n yield batch_indices\n counter += 1\n batch_size = self.batch_sizes[counter % self.len_batch_sizes]\n batch_indices = []\n if not self.drop_last and len(batch_indices) > 0:\n yield batch_indices\n def __len__(self):\n avg_batch_size = sum(self.batch_sizes) / float(self.len_batch_sizes)", + "type": "code", + "location": "/paddlevideo/utils/multigrid/short_sampler.py:103-130" + }, + "8397": { + "file_id": 620, + "content": "This code is responsible for creating a sampler that supports dynamic batch sizes. It first sub-samples the input indices based on the local rank and local batch size. Then, it handles cases with multiple GPUs (ranks > 1), dividing the indices into batches of uniform size. Finally, it yields these batches until all samples have been used, or if the drop_last flag is set to False, it yields remaining samples even if they don't form a full batch. The average batch size is also calculated and stored in the class variable avg_batch_size.", + "type": "comment" + }, + "8398": { + "file_id": 620, + "content": " if self.drop_last:\n return int(np.floor(self.num_samples / avg_batch_size))\n else:\n return int(np.ceil(self.num_samples / avg_batch_size))\n def set_epoch(self, epoch):\n \"\"\"\n Sets the epoch number. When :attr:`shuffle=True`, this number is used\n as seeds of random numbers. 
By default, users may not set this, all\n replicas (workers) use a different random ordering for each epoch.\n If set same number at each epoch, this sampler will yield the same\n ordering at all epoches.\n Arguments:\n epoch (int): Epoch number.\n \"\"\"\n self.epoch = epoch", + "type": "code", + "location": "/paddlevideo/utils/multigrid/short_sampler.py:131-146" + }, + "8399": { + "file_id": 620, + "content": "This code defines a class for a sampler that can be used with PaddleVideo's multigrid. It calculates the number of samples to return based on batch size and either rounds down or up depending on whether drop_last is set. The set_epoch method sets the epoch number and, when shuffle is True, uses it as seeds for random numbers. This can result in the same ordering being yielded at all epochs if the same number is set each time.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/84.json b/docs/data/84.json new file mode 100644 index 000000000..1a7d8c919 --- /dev/null +++ b/docs/data/84.json @@ -0,0 +1,543 @@ +{ + "8400": { + "file_id": 621, + "content": "/paddlevideo/utils/precise_bn.py", + "type": "filepath" + }, + "8401": { + "file_id": 621, + "content": "This code defines a function for precise batch normalization that recomputes and updates BN statistics, improving accuracy while speeding up training and saving memory.", + "type": "summary" + }, + "8402": { + "file_id": 621, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport itertools\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n\"\"\"\nImplement precise bn, which is useful for improving accuracy.\n\"\"\"\n@paddle.no_grad() # speed up and save CUDA memory\ndef do_preciseBN(model,\n data_loader,\n parallel,\n num_iters=200,\n use_amp=False,\n amp_level=None):\n \"\"\"\n Recompute and update the batch norm stats to make them more precise. During", + "type": "code", + "location": "/paddlevideo/utils/precise_bn.py:1-34" + }, + "8403": { + "file_id": 621, + "content": "The code is importing necessary libraries and defining a function for the precise batch normalization (BN) technique. This BN improves accuracy by recomputing and updating batch norm statistics to make them more precise, which can speed up training and save memory. The function takes in a model, data loader, parallel flag, number of iterations, whether to use automatic mixed precision, and the AMP level.", + "type": "comment" + }, + "8404": { + "file_id": 621, + "content": " training both BN stats and the weight are changing after every iteration, so\n the running average can not precisely reflect the actual stats of the\n current model.\n In this function, the BN stats are recomputed with fixed weights, to make\n the running average more precise. 
Specifically, it computes the true average\n of per-batch mean/variance instead of the running average.\n This is useful to improve validation accuracy.\n Args:\n model: the model whose bn stats will be recomputed\n data_loader: an iterator. Produce data as input to the model\n num_iters: number of iterations to compute the stats.\n Return:\n the model with precise mean and variance in bn layers.\n \"\"\"\n bn_layers_list = [\n m for m in model.sublayers()\n if any((isinstance(m, bn_type)\n for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D,\n paddle.nn.BatchNorm3D))) and m.training\n ]\n if len(bn_layers_list) == 0:\n return", + "type": "code", + "location": "/paddlevideo/utils/precise_bn.py:35-56" + }, + "8405": { + "file_id": 621, + "content": "This function recomputes the BN stats for a given model using fixed weights to improve validation accuracy. It targets specific BN layers and runs iterations to compute precise mean and variance values. This is useful when training both BN stats and weights are changing with every iteration, affecting running averages.", + "type": "comment" + }, + "8406": { + "file_id": 621, + "content": " # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum)\n # we set momentum=0. to get the true mean and variance during forward\n momentum_actual = [bn._momentum for bn in bn_layers_list]\n for bn in bn_layers_list:\n bn._momentum = 0.\n running_mean = [paddle.zeros_like(bn._mean)\n for bn in bn_layers_list] # pre-ignore\n running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list]\n ind = -1\n for ind, data in enumerate(itertools.islice(data_loader, num_iters)):\n logger.info(\"Computing precise BN {} / {}...\".format(\n ind + 1, num_iters))\n if use_amp:\n with paddle.amp.auto_cast(\n custom_black_list={\"reduce_mean\",\n \"conv3d\"}, level=amp_level):\n model(data, mode='train')\n else:\n model(data, mode='train')\n for i, bn in enumerate(bn_layers_list):\n # Accumulates the bn stats.\n running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1)", + "type": "code", + "location": "/paddlevideo/utils/precise_bn.py:58-83" + }, + "8407": { + "file_id": 621, + "content": "This code resets the momentum in Batch Normalization layers to 0, calculates precise Batch Normalization by accumulating batch means and variances across iterations, and then updates the running mean and variance.", + "type": "comment" + }, + "8408": { + "file_id": 621, + "content": " running_var[i] += (bn._variance - running_var[i]) / (ind + 1)\n assert ind == num_iters - 1, (\n \"update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations.\"\n .format(num_iters, ind))\n # Sets the precise bn stats.\n for i, bn in enumerate(bn_layers_list):\n bn._mean.set_value(running_mean[i])\n bn._variance.set_value(running_var[i])\n bn._momentum = momentum_actual[i]", + "type": "code", + "location": "/paddlevideo/utils/precise_bn.py:84-94" + }, + "8409": { + "file_id": 621, + "content": "This code updates batch normalization (BN) statistics based on the running mean, variance, and momentum. It asserts that the dataloader has run for the expected number of iterations before setting these values to the BN layers.", + "type": "comment" + }, + "8410": { + "file_id": 622, + "content": "/paddlevideo/utils/profiler.py", + "type": "filepath" + }, + "8411": { + "file_id": 622, + "content": "This code is part of PaddleVideo's profiler module, which allows performance analysis and optimization. 
It initializes a profiler object and starts/stops profiling based on step ID and specified batch range, generating summary reports in ms units.", + "type": "summary" + }, + "8412": { + "file_id": 622, + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport paddle.profiler as profiler\n# A global variable to record the number of calling times for profiler\n# functions. It is used to specify the tracing range of training steps.\n_profiler_step_id = 0\n# A global variable to avoid parsing from string every time.\n_profiler_options = None\n_prof = None\nclass ProfilerOptions(object):\n '''\n Use a string to initialize a ProfilerOptions.\n The string should be in the format: \"key1=value1;key2=value;key3=value3\".", + "type": "code", + "location": "/paddlevideo/utils/profiler.py:1-29" + }, + "8413": { + "file_id": 622, + "content": "This code is a part of PaddleVideo's profiler module, which allows for performance analysis and optimization. It imports the necessary libraries, initializes global variables, and defines the ProfilerOptions class to configure profiling options using a string in key-value format.", + "type": "comment" + }, + "8414": { + "file_id": 622, + "content": " For example:\n \"profile_path=model.profile\"\n \"batch_range=[50, 60]; profile_path=model.profile\"\n \"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile\"\n ProfilerOptions supports following key-value pair:\n batch_range - a integer list, e.g. [100, 110].\n state - a string, the optional values are 'CPU', 'GPU' or 'All'. \n sorted_key - a string, the optional values are 'calls', 'total',\n 'max', 'min' or 'ave.\n tracer_option - a string, the optional values are 'Default', 'OpDetail',\n 'AllOpDetail'.\n profile_path - a string, the path to save the serialized profile data,\n which can be used to generate a timeline.\n exit_on_finished - a boolean.\n '''\n def __init__(self, options_str):\n assert isinstance(options_str, str)\n self._options = {\n 'batch_range': [10, 20],\n 'state': 'All',\n 'sorted_key': 'total',", + "type": "code", + "location": "/paddlevideo/utils/profiler.py:30-53" + }, + "8415": { + "file_id": 622, + "content": "The code defines a class \"ProfilerOptions\" with options for profiling. 
It takes an options string as input and has attributes for batch range (default [10, 20]), state (default 'All'), sorted key (default 'total'), tracer option (default 'Default'), profile path (default '/tmp/profile'), exit on finished flag (default True), and timer_only flag (default True).", + "type": "comment" + }, + "8416": { + "file_id": 622, + "content": " 'tracer_option': 'Default',\n 'profile_path': '/tmp/profile',\n 'exit_on_finished': True,\n 'timer_only': True\n }\n self._parse_from_string(options_str)\n def _parse_from_string(self, options_str):\n for kv in options_str.replace(' ', '').split(';'):\n key, value = kv.split('=')\n if key == 'batch_range':\n value_list = value.replace('[', '').replace(']', '').split(',')\n value_list = list(map(int, value_list))\n if len(value_list) >= 2 and value_list[0] >= 0 and value_list[\n 1] > value_list[0]:\n self._options[key] = value_list\n elif key == 'exit_on_finished':\n self._options[key] = value.lower() in (\"yes\", \"true\", \"t\", \"1\")\n elif key in [\n 'state', 'sorted_key', 'tracer_option', 'profile_path'\n ]:\n self._options[key] = value\n elif key == 'timer_only':\n self._options[key] = value", + "type": "code", + "location": "/paddlevideo/utils/profiler.py:54-77" + }, + "8417": { + "file_id": 622, + "content": "The code defines a class with an option parser. It parses options from a string, sets batch range if present, handles exit_on_finished flag, and updates other specified options (state, sorted_key, tracer_option, profile_path, timer_only).", + "type": "comment" + }, + "8418": { + "file_id": 622, + "content": " def __getitem__(self, name):\n if self._options.get(name, None) is None:\n raise ValueError(\n \"ProfilerOptions does not have an option named %s.\" % name)\n return self._options[name]\ndef add_profiler_step(options_str=None):\n '''\n Enable the operator-level timing using PaddlePaddle's profiler.\n The profiler uses a independent variable to count the profiler steps.\n One call of this function is treated as a profiler step.\n Args:\n profiler_options - a string to initialize the ProfilerOptions.\n Default is None, and the profiler is disabled.\n '''\n if options_str is None:\n return\n global _prof \n global _profiler_step_id\n global _profiler_options\n if _profiler_options is None:\n _profiler_options = ProfilerOptions(options_str)\n # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan\n # timer_only = True only the model's throughput and time overhead are displayed", + "type": "code", + "location": "/paddlevideo/utils/profiler.py:79-105" + }, + "8419": { + "file_id": 622, + "content": "This code provides a function to enable the operator-level timing using PaddlePaddle's profiler. The profiler step is initialized with options provided as a string. If no options are given, the profiler remains disabled. 
This can be used for performance analysis of models by measuring their throughput and time overhead.", + "type": "comment" + }, + "8420": { + "file_id": 622, + "content": " # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives.\n # timer_only = False the output Timeline information can be found in the profiler_log directory\n if _prof is None:\n _timer_only = str(_profiler_options['timer_only']) == str(True)\n _prof = profiler.Profiler(\n scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]),\n on_trace_ready = profiler.export_chrome_tracing('./profiler_log'),\n timer_only = _timer_only)\n _prof.start()\n else:\n _prof.step()\n if _profiler_step_id == _profiler_options['batch_range'][1]:\n _prof.stop()\n _prof.summary(\n op_detail=True,\n thread_sep=False,\n time_unit='ms')\n _prof = None\n if _profiler_options['exit_on_finished']:\n sys.exit(0)\n _profiler_step_id += 1", + "type": "code", + "location": "/paddlevideo/utils/profiler.py:106-128" + }, + "8421": { + "file_id": 622, + "content": "This code initializes a profiler object with specified scheduler range and timer_only option, then starts the profiling process. If the step ID matches the specified batch range, it stops the profiling, generates a summary report in ms units, clears the profiler, and exits the program if instructed to do so.", + "type": "comment" + }, + "8422": { + "file_id": 623, + "content": "/paddlevideo/utils/record.py", + "type": "filepath" + }, + "8423": { + "file_id": 623, + "content": "This code records metrics, calculates means, logs batch info and epoch progress in training processes with colored formatting for visibility. It uses PaddleVideo framework, AverageMeter and OrderedDict for efficient logging.", + "type": "summary" + }, + "8424": { + "file_id": 623, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport datetime\nfrom collections import OrderedDict\nimport paddle\nfrom .logger import coloring, get_logger\nlogger = get_logger(\"paddlevideo\")\n__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch']\ndef build_record(cfg):\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework", + "type": "code", + "location": "/paddlevideo/utils/record.py:1-32" + }, + "8425": { + "file_id": 623, + "content": "Code snippet imports necessary libraries and defines functions for building a record, logging batches and epochs. 
It also sets up logger for the PaddleVideo framework.", + "type": "comment" + }, + "8426": { + "file_id": 623, + "content": " record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))\n record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n elif 'Recognizer' in cfg.framework:\n record_list.append((\"top1\", AverageMeter(\"top1\", '.5f')))\n record_list.append((\"top5\", AverageMeter(\"top5\", '.5f')))\n elif 'FastRCNN' in cfg.framework:\n record_list.append(\n (\"recall@thr=0.5\", AverageMeter(\"recall@thr=0.5\", '.5f')))\n record_list.append((\"prec@thr=0.5\", AverageMeter(\"prec@thr=0.5\",\n '.5f')))\n record_list.append((\"recall@top3\", AverageMeter(\"recall@top3\", '.5f')))\n record_list.append((\"prec@top3\", AverageMeter(\"prec@top3\", '.5f')))\n record_list.append((\"recall@top5\", AverageMeter(\"recall@top5\", '.5f')))\n record_list.append((\"prec@top5\", AverageMeter(\"prec@top5\", '.5f')))\n record_list.append((\"mAP@0.5IOU\", AverageMeter(\"mAP@0.5IOU\", '.5f')))", + "type": "code", + "location": "/paddlevideo/utils/record.py:33-48" + }, + "8427": { + "file_id": 623, + "content": "Code appends specific metrics to the record list based on the framework specified in cfg. Frameworks include 'PaddleVideo', 'Recognizer', and 'FastRCNN'. Metrics are averaged using AverageMeter and include 'hit_at_one', 'perr', 'gap', 'top1', 'top5', recall@thr=0.5, prec@thr=0.5, recall@top3, prec@top3, recall@top5, prec@top5, and mAP@0.5IOU.", + "type": "comment" + }, + "8428": { + "file_id": 623, + "content": " elif 'DepthEstimator' in cfg.framework:\n record_list.append((\"abs_rel\", AverageMeter(\"abs_rel\", '.5f')))\n record_list.append((\"sq_rel\", AverageMeter(\"sq_rel\", '.5f')))\n record_list.append((\"rmse\", AverageMeter(\"rmse\", '.5f')))\n record_list.append((\"rmse_log\", AverageMeter(\"rmse_log\", '.5f')))\n record_list.append((\"a1\", AverageMeter(\"a1\", '.5f')))\n record_list.append((\"a2\", AverageMeter(\"a2\", '.5f')))\n record_list.append((\"a3\", AverageMeter(\"a3\", '.5f')))\n record_list.append((\"losses_day\", AverageMeter(\"losses_day\", '.5f')))\n record_list.append((\"losses_night\", AverageMeter(\"losses_night\",\n '.5f')))\n elif 'MSTCN' in cfg.framework or 'ASRF' in cfg.framework:\n record_list.append((\"F1@0.50\", AverageMeter(\"F1@0.50\", '.5f')))\n elif 'YOWOLocalizer' in cfg.framework:\n record_list.append((\"nCorrect\", AverageMeter('nCorrect', '.1f')))\n record_list.append((\"fscore\", AverageMeter(\"fscore\", '.5f')))", + "type": "code", + "location": "/paddlevideo/utils/record.py:49-65" + }, + "8429": { + "file_id": 623, + "content": "The code is conditionally adding metrics to the record list based on the value of 'cfg.framework'. It handles three different cases: 'DepthEstimator', 'MSTCN' or 'ASRF', and 'YOWOLocalizer'. For 'DepthEstimator', it adds 9 metrics, for 'MSTCN' or 'ASRF', it adds one metric, and for 'YOWOLocalizer', it adds two metrics. 
Each metric is associated with an AverageMeter object that keeps track of its mean value over time.", + "type": "comment" + }, + "8430": { + "file_id": 623, + "content": " record_list.append((\"batch_time\", AverageMeter('batch_cost', '.5f')))\n record_list.append((\"reader_time\", AverageMeter('reader_cost', '.5f')))\n record_list = OrderedDict(record_list)\n return record_list\nclass AverageMeter(object):\n \"\"\"\n Computes and stores the average and current value\n \"\"\"\n def __init__(self, name='', fmt='f', need_avg=True):\n self.name = name\n self.fmt = fmt\n self.need_avg = need_avg\n self.reset()\n def reset(self):\n \"\"\" reset \"\"\"\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n \"\"\" update \"\"\"\n if isinstance(val, paddle.Tensor):\n val = float(val)\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count\n @property\n def total(self):\n return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self)\n @property\n def total_minute(self):\n return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60,", + "type": "code", + "location": "/paddlevideo/utils/record.py:67-105" + }, + "8431": { + "file_id": 623, + "content": "This function creates a record dictionary containing two AverageMeter objects, one for batch time and another for reader time. It then converts the list to an OrderedDict and returns it. The AverageMeter class calculates and stores the average and current values of a given metric, allowing easy tracking of performance metrics during program execution.", + "type": "comment" + }, + "8432": { + "file_id": 623, + "content": " self=self)\n @property\n def mean(self):\n return '{self.name}_avg: {self.avg:{self.fmt}}'.format(\n self=self) if self.need_avg else ''\n @property\n def value(self):\n return '{self.name}: {self.val:{self.fmt}}'.format(self=self)\ndef log_batch(metric_list,\n batch_id,\n epoch_id,\n total_epoch,\n mode,\n ips,\n eta_sec: int = None):\n batch_cost = str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = str(metric_list['reader_time'].value) + ' sec,'\n metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].value)\n metric_str = ' '.join([str(v) for v in metric_values])\n epoch_str = \"epoch:[{:>3d}/{:<3d}]\".format(epoch_id, total_epoch)\n step_str = \"{:s} step:{:<4d}\".format(mode, batch_id)\n if eta_sec is not None:\n eta_str = \"eta: {:s}\".format(", + "type": "code", + "location": "/paddlevideo/utils/record.py:106-136" + }, + "8433": { + "file_id": 623, + "content": "This code defines a class and functions for recording metrics, calculating means, and logging batch information. The `log_batch` function records the time taken for each batch, adds other metric values, and logs the total epoch, current epoch, mode, and step. 
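As a quick, illustrative way to exercise the utilities above (not part of the repository), the sketch below builds the record dict for a recognizer-style config and updates one of its meters. It assumes PaddleVideo is installed; the `SimpleNamespace` config and the numbers are made up.

```python
# Illustrative sketch of build_record()/AverageMeter usage; assumes PaddleVideo is installed.
from types import SimpleNamespace
from paddlevideo.utils.record import AverageMeter, build_record

cfg = SimpleNamespace(framework="Recognizer2D")   # hits the 'Recognizer' branch above
record = build_record(cfg)                        # OrderedDict: loss, lr, top1, top5, batch_time, reader_time

record["top1"].update(0.82, n=16)                 # value averaged over a batch of 16 samples
record["top1"].update(0.86, n=16)
print(record["top1"].mean)                        # -> "top1_avg: 0.84000"

timer = AverageMeter("batch_cost", ".5f")
timer.update(0.137)
print(timer.value)                                # -> "batch_cost: 0.13700"
```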
It also calculates the remaining time for the current operation if provided.", + "type": "comment" + }, + "8434": { + "file_id": 623, + "content": " str(datetime.timedelta(seconds=int(eta_sec))))\n else:\n eta_str = ''\n max_mem_reserved_str = \"\"\n max_mem_allocated_str = \"\"\n if paddle.device.is_compiled_with_cuda():\n max_mem_reserved_str = f\"max_mem_reserved: {format(paddle.device.cuda.max_memory_reserved() / (1024 ** 2), '.2f')} MB\"\n max_mem_allocated_str = f\"max_mem_allocated: {format(paddle.device.cuda.max_memory_allocated() / (1024 ** 2), '.2f')} MB\"\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {} {:s}, {} {}\".format(\n coloring(epoch_str, \"HEADER\") if batch_id == 0 else epoch_str,\n coloring(step_str, \"PURPLE\"), coloring(metric_str, 'OKGREEN'),\n coloring(batch_cost, \"OKGREEN\"), coloring(reader_cost, 'OKGREEN'), ips,\n eta_str, max_mem_reserved_str, max_mem_allocated_str))\ndef log_epoch(metric_list, epoch, mode, ips):\n batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,'\n batch_sum = str(metric_list['batch_time'].total) + ' sec,'", + "type": "code", + "location": "/paddlevideo/utils/record.py:137-155" + }, + "8435": { + "file_id": 623, + "content": "This code logs the progress of an epoch in a training process. It formats the logged information with colors for better visibility. The logger displays the current step, metrics, time taken, batch cost and reader cost, estimated time remaining (ETA), and maximum memory reserved and allocated on CUDA devices if available.", + "type": "comment" + }, + "8436": { + "file_id": 623, + "content": " metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].mean)\n metric_str = ' '.join([str(v) for v in metric_values])\n end_epoch_str = \"END epoch:{:<3d}\".format(epoch)\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {:s} {}\".format(\n coloring(end_epoch_str, \"RED\"), coloring(mode, \"PURPLE\"),\n coloring(metric_str, \"OKGREEN\"), coloring(batch_cost, \"OKGREEN\"),\n coloring(reader_cost, \"OKGREEN\"), coloring(batch_sum, \"OKGREEN\"), ips))", + "type": "code", + "location": "/paddlevideo/utils/record.py:157-168" + }, + "8437": { + "file_id": 623, + "content": "This code calculates the mean of metrics except 'batch_time' and 'reader_time', then joins them into a string. It formats an info message with RED for \"END epoch\", PURPLE for mode, GREEN for metric values, batch cost, reader cost, and batch sum, as well as ips (inferences per second). The logger outputs this formatted message.", + "type": "comment" + }, + "8438": { + "file_id": 624, + "content": "/paddlevideo/utils/registry.py", + "type": "filepath" + }, + "8439": { + "file_id": 624, + "content": "The code defines a Registry class for mapping names to objects and provides methods for registering, getting, and unregistering objects. It utilizes the @BACKBONES.register() decorator or BACKBONES.register(ResNet) function for registration, and also verifies if an object with a given name exists in the registry using the `get` method.", + "type": "summary" + }, + "8440": { + "file_id": 624, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nclass Registry(object):\n \"\"\"\n The registry that provides name -> object mapping, to support third-party users' custom modules.\n To register an object:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n @BACKBONES.register()\n class ResNet:\n pass\n Or:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n class ResNet:\n pass\n BACKBONES.register(ResNet)", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:1-34" + }, + "8441": { + "file_id": 624, + "content": "This code defines a Registry class that provides name to object mapping, allowing third-party users to register their custom modules. Users can register their objects by using the @BACKBONES.register() decorator or by calling BACKBONES.register(ResNet).", + "type": "comment" + }, + "8442": { + "file_id": 624, + "content": " Usage: To build a module.\n .. code-block:: python\n backbone_name = \"ResNet\"\n b = BACKBONES.get(backbone_name)()\n \"\"\"\n def __init__(self, name):\n \"\"\"\n Args:\n name (str): the name of this registry\n \"\"\"\n self._name = name\n self._obj_map = {}\n def __contains__(self, key):\n return self._obj_map.get(key) is not None\n def _do_register(self, name, obj):\n assert (\n name not in self._obj_map\n ), \"An object named '{}' was already registered in '{}' registry!\".format(\n name, self._name)\n self._obj_map[name] = obj\n def register(self, obj=None, name=None):\n \"\"\"\n Register the given object under the the name `obj.__name__`.\n Can be used as either a decorator or not. See docstring of this class for usage.\n \"\"\"\n if obj is None:\n # used as a decorator\n def deco(func_or_class, name=name):\n if name is None:\n name = func_or_class.__name__", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:36-70" + }, + "8443": { + "file_id": 624, + "content": "This code is a registry class for storing and managing objects. It allows registering objects under their names or using decorators, and provides functions to check if an object with a given name exists in the registry.", + "type": "comment" + }, + "8444": { + "file_id": 624, + "content": " self._do_register(name, func_or_class)\n return func_or_class\n return deco\n # used as a function call\n if name is None:\n name = obj.__name__\n self._do_register(name, obj)\n def get(self, name):\n \"\"\"Get the registry record.\n Args:\n name (str): The class name.\n Returns:\n ret: The class.\n \"\"\"\n ret = self._obj_map.get(name)\n if ret is None:\n raise KeyError(\n \"No object named '{}' found in '{}' registry!\".format(\n name, self._name))\n return ret", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:71-96" + }, + "8445": { + "file_id": 624, + "content": "The code defines a class with methods for registering, getting and unregistering objects in a registry. 
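A short, illustrative exercise of the registration and lookup flow described above, mirroring the usage in the class docstring (the `ResNet` class below is a dummy stand-in, and PaddleVideo is assumed to be installed):

```python
# Illustrative sketch of Registry usage; the ResNet class here is a dummy stand-in.
from paddlevideo.utils.registry import Registry

BACKBONES = Registry("backbone")

@BACKBONES.register()                  # registered under the class name "ResNet"
class ResNet:
    def __init__(self, depth=50):
        self.depth = depth

print("ResNet" in BACKBONES)           # True, via __contains__
backbone = BACKBONES.get("ResNet")(depth=101)   # look up by name, then instantiate
print(backbone.depth)                  # 101
```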
The `_do_register` method is used to store the object's name and function or class into a dictionary. If no name is provided when calling the function, it defaults to the object's name. The `get` method retrieves an object from the registry using its name. If the object is not found, it raises a KeyError with an error message.", + "type": "comment" + }, + "8446": { + "file_id": 625, + "content": "/paddlevideo/utils/save_load.py", + "type": "filepath" + }, + "8447": { + "file_id": 625, + "content": "The code imports modules, transfers model parameters, adjusts positional embeddings, and provides save/load functions for Resnet18, VisionTransformer (TimeSformer), SwinTransformer3D models using PaddlePaddle library.", + "type": "summary" + }, + "8448": { + "file_id": 625, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport os.path as osp\nimport time\nimport paddle\nimport paddle.nn.functional as F\nfrom paddlevideo.utils import get_logger, main_only\nfrom tqdm import tqdm\nimport numpy as np\nfrom scipy import ndimage\ndef pretrain_swin_param_trans(model, state_dicts):\n # delete classifier's params\n if 'head.fc' + '.weight' in state_dicts:\n del state_dicts['head.fc' + '.weight']\n if 'head.fc' + '.bias' in state_dicts:", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:1-30" + }, + "8449": { + "file_id": 625, + "content": "This code is from the PaddleVideo library and it imports necessary modules, defines a function for transferring pre-trained Swin model parameters, and deletes the classifier's weights from state_dicts.", + "type": "comment" + }, + "8450": { + "file_id": 625, + "content": " del state_dicts['head.fc' + '.bias']\n state_dicts = {\n k.replace('backbone.', ''): v\n for k, v in state_dicts.items()\n }\n if len(state_dicts) == len(model.state_dict()):\n print(\"Load 3D weights\")\n return state_dicts\n print(\"Load 2D weights\")\n relative_position_index_keys = [\n k for k in state_dicts.keys() if \"relative_position_index\" in k\n ]\n for k in relative_position_index_keys:\n del state_dicts[k]\n # delete attn_mask since we always re-init it\n attn_mask_keys = [k for k in state_dicts.keys() if \"attn_mask\" in k]\n for k in attn_mask_keys:\n del state_dicts[k]\n state_dicts['patch_embed.proj.weight'] = state_dicts[\n 'patch_embed.proj.weight'].unsqueeze(2).tile(\n [1, 1, model.patch_size[0], 1, 1]) / model.patch_size[0]\n # bicubic interpolate relative_position_bias_table if not match\n relative_position_bias_table_keys = [\n k for k in state_dicts.keys() if \"relative_position_bias_table\" in k\n ]", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:31-61" + }, + "8451": { + "file_id": 625, + "content": "This code checks if the loaded state dictionaries match the model's state dictionaries and handles any inconsistencies. 
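One of the weight adjustments above inflates the 2D `patch_embed.proj.weight` kernel to 3D by repeating it along a new temporal axis and rescaling. The sketch below reproduces just that step on a dummy tensor; it is illustrative only, and the shapes and the `temporal_patch` value are made up.

```python
# Illustrative sketch of the 2D -> 3D kernel inflation used for patch_embed.proj.weight.
# Assumes PaddlePaddle is installed; shapes are dummy values.
import paddle

temporal_patch = 2                                 # stands in for model.patch_size[0]
w2d = paddle.randn([96, 3, 4, 4])                  # [out_ch, in_ch, kh, kw] from a 2D checkpoint

# Add a temporal axis, repeat the kernel along it, then divide so the summed
# response over the temporal window matches the original 2D filter.
w3d = w2d.unsqueeze(2).tile([1, 1, temporal_patch, 1, 1]) / temporal_patch
print(w3d.shape)                                   # [96, 3, 2, 4, 4]
```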
It removes unnecessary keys, adjusts certain weights, and bicubically interpolates relative position bias tables if they don't match to ensure proper loading of 2D or 3D weights.", + "type": "comment" + }, + "8452": { + "file_id": 625, + "content": " total_len = len(relative_position_bias_table_keys)\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for key in tqdm(relative_position_bias_table_keys,\n total=total_len,\n position=0):\n relative_position_bias_table_pretrained = state_dicts[key]\n relative_position_bias_table_current = model.state_dict()[key]\n L1, nH1 = relative_position_bias_table_pretrained.shape\n L2, nH2 = relative_position_bias_table_current.shape\n L2 = (2 * model.window_size[1] - 1) * (2 * model.window_size[2] - 1)\n wd = model.window_size[0]\n if nH1 != nH2:\n desc.set_description(f\"Error in loading {key}, skip\")\n else:\n if L1 != L2:\n S1 = int(L1**0.5)\n relative_position_bias_table_pretrained_resized = paddle.nn.functional.interpolate(\n relative_position_bias_table_pretrained.transpose(", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:62-82" + }, + "8453": { + "file_id": 625, + "content": "Loading weights for relative position bias tables from pretrained and current model state dictionaries.", + "type": "comment" + }, + "8454": { + "file_id": 625, + "content": " [1, 0]).reshape([1, nH1, S1, S1]),\n size=(2 * model.window_size[1] - 1,\n 2 * model.window_size[2] - 1),\n mode='bicubic')\n relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.reshape(\n [nH2, L2]).transpose([1, 0])\n desc.set_description(f\"Loading {key}\")\n state_dicts[key] = relative_position_bias_table_pretrained.tile(\n [2 * wd - 1, 1])\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return state_dicts\ndef pretrain_vit_param_trans(model, state_dicts, num_patches, num_seg,\n attention_type):\n \"\"\"\n Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model\n \"\"\"\n if 'head' + '.weight' in state_dicts:\n del state_dicts['head' + '.weight']", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:83-105" + }, + "8455": { + "file_id": 625, + "content": "Function is loading pre-trained model parameters, resizing a table, and setting the description.\nThe code is performing model parameter transformation for ViT models, deleting unnecessary weights.", + "type": "comment" + }, + "8456": { + "file_id": 625, + "content": " if 'head' + '.bias' in state_dicts:\n del state_dicts['head' + '.bias']\n total_len = len(model.state_dict())\n if num_patches + 1 != state_dicts['pos_embed'].shape[1]: # when\n pos_embed = state_dicts['pos_embed']\n cls_pos_embed = paddle.to_tensor(\n pos_embed[0, 0, :]).unsqueeze(0).unsqueeze(1)\n other_pos_embed = paddle.to_tensor(pos_embed[0, 1:, :])\n gs_new = int(np.sqrt(num_patches))\n gs_old = int(np.sqrt(other_pos_embed.shape[0]))\n zoom = (gs_new / gs_old, gs_new / gs_old, 1)\n other_pos_embed = paddle.reshape(other_pos_embed, [gs_old, gs_old, -1])\n other_pos_embed = ndimage.zoom(other_pos_embed, zoom, order=1)\n other_pos_embed = paddle.to_tensor(other_pos_embed)\n new_pos_embed = paddle.reshape(other_pos_embed, [1, num_patches, -1])\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1)\n state_dicts['pos_embed'] = new_pos_embed\n time.sleep(0.01)\n if 'time_embed' in state_dicts and num_seg != 
state_dicts[", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:106-126" + }, + "8457": { + "file_id": 625, + "content": "This code block checks the shape of the 'pos_embed' tensor and adjusts it based on the number of patches provided. It resizes the tensor using Paddle's ndimage.zoom function and then reconstructs the updated positional embedding for the model. This is necessary when the number of patches changes, ensuring the positional embeddings are consistent with the new patch count.", + "type": "comment" + }, + "8458": { + "file_id": 625, + "content": " 'time_embed'].shape[1]:\n time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(time_embed.shape[-2], num_seg),\n mode='nearest')\n state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose(\n (0, 2, 1))\n time.sleep(0.01)\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n if attention_type == 'divided_space_time':\n new_state_dicts = state_dicts.copy()\n for key in tqdm(state_dicts):\n if 'blocks' in key and 'attn' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('attn', 'temporal_attn')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]\n else:", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:127-147" + }, + "8459": { + "file_id": 625, + "content": "This code block is part of a larger program that loads pre-trained model weights. It first checks if the shape of 'time_embed' matches a specific condition, and if not, it performs some transformations on it. Afterwards, it starts a progress bar with the description \"Loading weights\" to show the progress of loading these weights. If the attention type is 'divided_space_time', it makes a copy of state_dicts and iterates over its keys, replacing 'attn' keys with 'temporal_attn' if not already present.", + "type": "comment" + }, + "8460": { + "file_id": 625, + "content": " new_state_dicts[new_key] = state_dicts[new_key]\n if 'blocks' in key and 'norm1' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('norm1', 'temporal_norm1')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]\n else:\n new_state_dicts[new_key] = state_dicts[new_key]\n time.sleep(0.01)\n elif attention_type == 'space_only': # tokenshift raw vit\n new_state_dicts = state_dicts.copy()\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return new_state_dicts\ndef pretrain_resnet18_param_trans(model, loaded_dict):\n encoder_dict = model.encoder.state_dict()\n pose_encoder_dict = model.pose_encoder.state_dict()\n names = ['encoder.', 'encoder_day.', 'encoder_night.']\n for name in names:\n total_len = len(loaded_dict.items())", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:148-172" + }, + "8461": { + "file_id": 625, + "content": "This code appears to be related to model weight loading and adaptation for a pre-trained ResNet18 in a specific context. It modifies the state_dicts of certain keys, like replacing 'norm1' with 'temporal_norm1', possibly to adapt the weights to fit the new model structure. The code also checks if a certain key exists and copies it if not, ensuring the new model has all necessary parameters. 
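The grid resize mentioned above is done with `scipy.ndimage.zoom` (imported at the top of save_load.py), not a Paddle operator. The sketch below reproduces that step on a dummy positional-embedding grid; the sizes are illustrative only.

```python
# Illustrative sketch: resizing a ViT positional-embedding grid when the patch
# count changes, mirroring the ndimage.zoom step above. Dummy sizes only.
import numpy as np
from scipy import ndimage

gs_old, gs_new, dim = 14, 16, 768                # e.g. 14x14=196 -> 16x16=256 patches
pos_grid = np.random.rand(gs_old, gs_old, dim)   # per-patch embeddings laid out as a grid

zoom = (gs_new / gs_old, gs_new / gs_old, 1)     # resize spatially, keep the channel dim
pos_grid_resized = ndimage.zoom(pos_grid, zoom, order=1)
print(pos_grid_resized.shape)                    # (16, 16, 768), flattened back to 256 tokens
```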
Finally, it updates the description for the loading process.", + "type": "comment" + }, + "8462": { + "file_id": 625, + "content": " with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for key, value in tqdm(loaded_dict.items(),\n total=total_len,\n position=0):\n key = str(name + key)\n if key in encoder_dict:\n encoder_dict[key] = value\n desc.set_description('Loading %s' % key)\n time.sleep(0.01)\n num_input_images = 2\n loaded_dict['conv1.weight'] = paddle.concat(\n [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images\n total_len = len(loaded_dict.items())\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for name, value in tqdm(loaded_dict.items(),\n total=total_len,\n position=0):\n name = str('encoder.' + name)", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:173-197" + }, + "8463": { + "file_id": 625, + "content": "This code is loading weights from a dictionary, updating the encoder_dict if the key already exists. It also updates loaded_dict for a specific convolution layer based on the number of input images and uses tqdm to provide progress updates.", + "type": "comment" + }, + "8464": { + "file_id": 625, + "content": " if name in pose_encoder_dict:\n pose_encoder_dict[name] = value\n desc.set_description('Loading %s' % key)\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return encoder_dict, pose_encoder_dict\n#XXX(shipping): maybe need load N times because of different cards have different params.\n@main_only\ndef load_ckpt(model, weight_path, **kargs):\n \"\"\"\n 1. Load pre-trained model parameters\n 2. Extract and convert from the pre-trained model to the parameters\n required by the existing model\n 3. Load the converted parameters of the existing model\n \"\"\"\n #model.set_state_dict(state_dict)\n if not osp.isfile(weight_path):\n raise IOError(f'{weight_path} is not a checkpoint file')\n #state_dicts = load(weight_path)\n logger = get_logger(\"paddlevideo\")\n state_dicts = paddle.load(weight_path)\n if 'ResnetEncoder' in str(model):\n encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans(", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:198-226" + }, + "8465": { + "file_id": 625, + "content": "This code loads pre-trained model parameters from a specified file path and converts them for use in the existing model. If the weight_path is not a valid checkpoint file, it raises an IOError. The code also utilizes Paddle's `paddle.load()` function to load state_dicts from the specified file path. 
It handles loading of Resnet18 parameters specifically with the `pretrain_resnet18_param_trans()` function.", + "type": "comment" + }, + "8466": { + "file_id": 625, + "content": " model, state_dicts)\n model.encoder.load_dict(encoder_dict)\n model.pose_encoder.load_dict(pose_encoder_dict)\n tmp = model.state_dict()\n elif \"VisionTransformer\" in str(model): # For TimeSformer case\n tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'],\n kargs['num_seg'],\n kargs['attention_type'])\n elif 'SwinTransformer3D' in str(model):\n tmp = pretrain_swin_param_trans(model, state_dicts)\n else:\n tmp = {}\n total_len = len(model.state_dict())\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for item in tqdm(model.state_dict(), total=total_len, position=0):\n name = item\n desc.set_description('Loading %s' % name)\n if name not in state_dicts: # Convert from non-parallel model\n if str('backbone.' + name) in state_dicts:", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:227-248" + }, + "8467": { + "file_id": 625, + "content": "This code is loading the model's weights and dictionary entries. It checks the type of the model and then either loads or transposes the parameters accordingly, handling cases such as VisionTransformer (TimeSformer) and SwinTransformer3D. For other models, it simply initializes an empty dictionary and starts loading each item from the state_dict in a tqdm progress bar.", + "type": "comment" + }, + "8468": { + "file_id": 625, + "content": " tmp[name] = state_dicts['backbone.' + name]\n else: # Convert from parallel model\n tmp[name] = state_dicts[name]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n model.set_state_dict(tmp)\ndef mkdir(dir):\n if not os.path.exists(dir):\n # avoid error when train with multiple gpus\n try:\n os.makedirs(dir)\n except:\n pass\ndef _extract_student_weights(all_params, student_prefix=\"Student.\"):\n s_params = {\n key[len(student_prefix):]: all_params[key]\n for key in all_params if student_prefix in key\n }\n return s_params\n@main_only\ndef save(obj, path, save_student_model=False):\n if save_student_model:\n s_params = _extract_student_weights(obj)\n student_path = path.replace(\".pdparams\", \"_student.pdparams\")\n if len(s_params) > 0:\n paddle.save(s_params, student_path)", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:249-282" + }, + "8469": { + "file_id": 625, + "content": "This code saves a PaddlePaddle model's state dictionary and optionally the student model's state dictionary to separate files. It also has functionality for handling parallel models, converting them into separate state dictionaries. The `mkdir` function is used to create directories if they don't exist already. If the `save_student_model` flag is set to True, it will save both the main and student model weights in separate files.", + "type": "comment" + }, + "8470": { + "file_id": 625, + "content": " paddle.save(obj, path)\ndef load(file_name):\n if not osp.isfile(file_name):\n raise IOError(f'{file_name} not exist')\n return paddle.load(file_name)", + "type": "code", + "location": "/paddlevideo/utils/save_load.py:283-289" + }, + "8471": { + "file_id": 625, + "content": "This code defines two functions: \"save\" and \"load\". The \"save\" function uses the Paddle library to save an object (obj) at a specified path. 
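For orientation, a minimal usage sketch of these helpers (illustrative only; the `Linear` model and the `./output_demo` path are stand-ins, and PaddleVideo/PaddlePaddle are assumed to be installed):

```python
# Illustrative sketch of the save()/load()/mkdir() helpers above; paths and model are stand-ins.
import paddle
from paddlevideo.utils.save_load import load, mkdir, save

model = paddle.nn.Linear(8, 2)                   # dummy stand-in for a PaddleVideo model

mkdir("./output_demo")                           # no-op if the directory already exists
save(model.state_dict(), "./output_demo/demo.pdparams")

state = load("./output_demo/demo.pdparams")      # raises IOError if the file is missing
model.set_state_dict(state)

# load_ckpt(model, path, ...) additionally converts pretrained weights for specific
# backbones; the extra num_patches/num_seg/attention_type keywords are only needed
# for the VisionTransformer (TimeSformer) case shown above.
```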
The \"load\" function checks if a file exists, raises an IOError if it does not, and then loads the object from the file using the Paddle library's load function.", + "type": "comment" + }, + "8472": { + "file_id": 626, + "content": "/paddlevideo/version.py", + "type": "filepath" + }, + "8473": { + "file_id": 626, + "content": "This code contains the version information for PaddleVideo, licensed under the Apache License 2.0, and defines the current version as \"0.0.1\".", + "type": "summary" + }, + "8474": { + "file_id": 626, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n__all__ = [\"paddlevideo_version\"]\npaddlevideo_version = \"0.0.1\"", + "type": "code", + "location": "/applications/EIVideo/EIVideo/paddlevideo/version.py:1-16" + }, + "8475": { + "file_id": 626, + "content": "This code contains the version information for PaddleVideo, licensed under the Apache License 2.0, and defines the current version as \"0.0.1\".", + "type": "comment" + }, + "8476": { + "file_id": 627, + "content": "/run.sh", + "type": "filepath" + }, + "8477": { + "file_id": 627, + "content": "This script trains multiple deep learning models for various computer vision tasks with PaddlePaddle framework and demonstrates running BMN test, exporting models, inference using PaddleVideo toolkit, and provides training time calculation.", + "type": "summary" + }, + "8478": { + "file_id": 627, + "content": "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n#export FLAGS_conv_workspace_size_limit=800 #MB\n#export FLAGS_cudnn_exhaustive_search=1\n#export FLAGS_cudnn_batchnorm_spatial_persistent=1\nstart_time=$(date +%s)\n# run pp-tsm training\n#python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml\n# run pp-tsm_v2 distillation training\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm_v2 main.py --validate -c configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml\n# run ava training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=logdir.ava_part main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_part.yaml\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=logdir.ava_all.1203 main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_all.yaml", + "type": "code", + "location": "/run.sh:1-18" + }, + "8479": { + "file_id": 627, + "content": "This script sets the CUDA visible devices, launches distributed training for multiple models (pp-tsm, pp-tsm_v2, ava), and specifies log directories and configurations. 
It runs all at once using 8 GPUs and Python3.7.", + "type": "comment" + }, + "8480": { + "file_id": 627, + "content": "# run adds training\n# python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20\n# run tsm training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n# run tsm amp training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n# run tsm amp training, nhwc\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml\n# run tsn training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsn main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml\n# run video-swin-transformer training\n# python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin main.py --amp --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml", + "type": "code", + "location": "/run.sh:20-36" + }, + "8481": { + "file_id": 627, + "content": "This code contains various command lines to run different video recognition training processes using PaddlePaddle framework on multiple GPUs. Each line specifies the model architecture, the configuration file, and the options like validation, amp (automatic mixed precision), and GPU allocation for each specific task.", + "type": "comment" + }, + "8482": { + "file_id": 627, + "content": "# run slowfast training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml\n# run slowfast multi-grid training\n# python3.7 -B -m paddle.distributed.launch --selected_gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml\n# run bmn training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml\n# run attention_lstm training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube-8m.yaml\n# run pp-tsn training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml\n# run timesformer training\n# python3.7 -B -m paddle.", + "type": "code", + "location": "/run.sh:38-54" + }, + "8483": { + "file_id": 627, + "content": "This code executes multiple deep learning model training scripts for various computer vision tasks such as recognition, localization, and more using PaddlePaddle framework. 
The training runs in distributed mode with GPU utilization to speed up the process.", + "type": "comment" + }, + "8484": { + "file_id": 627, + "content": "distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml\n# run pp-timesformer training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml\n# run st-gcn training\n# python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml\n# run agcn training\n# python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml\n# run actbert training\n# python3.7 main.py --validate -c configs/multimodal/actbert/actbert.yaml\n# run tsn dali training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml\n# test.sh\n# just use `example` as example, please replace to real name.\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_test main.py --test -c configs/example.yaml -w \"output/example/example_best.pdparams\"", + "type": "code", + "location": "/run.sh:54-74" + }, + "8485": { + "file_id": 627, + "content": "This code executes multiple deep learning model training and testing scripts for various tasks. It launches distributed training with specific GPU configurations, sets log directories, and uses different configuration files depending on the task. The tasks include pp-timesformer, st-gcn, agcn, actbert, tsn dali training, and example test.", + "type": "comment" + }, + "8486": { + "file_id": 627, + "content": "# NOTE: run bmn test, only support single card, bs=1\n# python3.7 main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00010.pdparams -o DATASET.batch_size=1\n# export_models script\n# just use `example` as example, please replace to real name.\n# python3.7 tools/export_model.py -c configs/example.yaml -p output/example/example_best.pdparams -o ./inference\n# predict script\n# just use `example` as example, please replace to real name.\n# python3.7 tools/predict.py -v example.avi --model_file \"./inference/example.pdmodel\" --params_file \"./inference/example.pdiparams\" --enable_benchmark=False --model=\"example\" --num_seg=8\nend_time=$(date +%s)\ncost_time=$[ $end_time-$start_time ]\necho \"Time to train is $(($cost_time/60))min $(($cost_time%60))s\"", + "type": "code", + "location": "/run.sh:76-89" + }, + "8487": { + "file_id": 627, + "content": "This script demonstrates the process of running BMN test, exporting models, and performing inference using the PaddleVideo toolkit. It highlights the commands required for each step, including the necessary configuration files and output directories. The script also calculates and outputs the training time in minutes and seconds.", + "type": "comment" + }, + "8488": { + "file_id": 628, + "content": "/setup.py", + "type": "filepath" + }, + "8489": { + "file_id": 628, + "content": "The setup file installs necessary packages for PaddleVideo, supports Python 3.2-3.6, and includes a console script \"ppvideo\" with keywords: \"A treasure chest for video understanding powered by PaddlePaddle.\" The code specifies Python version 3.7 and categorizes the project as a utility for metadata description in setup.py file.", + "type": "summary" + }, + "8490": { + "file_id": 628, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom setuptools import setup\nfrom io import open\nwith open('requirements.txt', encoding=\"utf-8-sig\") as f:\n requirements = f.readlines()\ndef readme():\n with open('docs/en/quick_start.md', encoding=\"utf-8-sig\") as f:\n README = f.read()\n return README\nsetup(\n name='ppvideo', #name of .whl file\n packages=['ppvideo'], #install package name\n package_dir={'ppvideo': ''},\n include_package_data=", + "type": "code", + "location": "/setup.py:1-31" + }, + "8491": { + "file_id": 628, + "content": "This code is a setup file for the PaddleVideo package using setuptools. It specifies the package name, installs required packages from requirements.txt, reads README content, and sets up directory structure.", + "type": "comment" + }, + "8492": { + "file_id": 628, + "content": " True, #Accept all data files and directories matched by MANIFEST.in\n install_requires=requirements,\n entry_points={\"console_scripts\": [\"ppvideo= ppvideo.tools.wheel:main\"]},\n version='2.3.0',\n license='Apache License 2.0',\n description='Awesome Video toolkits based on PaddlePaddle ',\n long_description=readme(),\n long_description_content_type='text/markdown',\n url='https://github.com/PaddlePaddle/PaddleVideo',\n download_url='https://github.com/PaddlePaddle/PaddleVideo.git',\n keywords=[\n 'A treasure chest for video understanding powered by PaddlePaddle.'\n ],\n classifiers=[\n 'Intended Audience :: Developers', 'Operating System :: OS Independent',\n 'Natural Language :: Chinese (Simplified)',\n 'Programming Language :: Python :: 3',\n 'Programming Language :: Python :: 3.2',\n 'Programming Language :: Python :: 3.3',\n 'Programming Language :: Python :: 3.4',\n 'Programming Language :: Python :: 3.5',\n 'Programming Language :: Python :: 3.6',", + "type": "code", + "location": "/setup.py:32-53" + }, + "8493": { + "file_id": 628, + "content": "This code is for setting up a Python package named \"PaddleVideo\" using setup.py. It specifies package details such as its name, version, requirements, description, license, and URL. The package is built with console script \"ppvideo\", and it supports Python 3.2-3.6. 
Keywords: \"A treasure chest for video understanding powered by PaddlePaddle.\"", + "type": "comment" + }, + "8494": { + "file_id": 628, + "content": " 'Programming Language :: Python :: 3.7', 'Topic :: Utilities'\n ],\n)", + "type": "code", + "location": "/setup.py:54-56" + }, + "8495": { + "file_id": 628, + "content": "The code is specifying Python version 3.7 and categorizing the project as a utility for metadata description in setup.py file.", + "type": "comment" + }, + "8496": { + "file_id": 629, + "content": "/test_tipc/README.md", + "type": "filepath" + }, + "8497": { + "file_id": 629, + "content": "The code provides TIPC support for PaddleVideo, offering tutorials on acceleration features, defining naming conventions for testing, ONNX conversion, deployment with Paddle Serving, offline quantized training/inference, and multi-machine multi-GPU training/inference.", + "type": "summary" + }, + "8498": { + "file_id": 629, + "content": "# 飞桨训推一体认证(TIPC)\n## 1. 简介\n飞桨除了基本的模型训练和预测,还提供了支持多端多平台的高性能推理部署工具。本文档提供了PaddleVideo中所有模型的飞桨训推一体认证 (Training and Inference Pipeline Certification(TIPC)) 信息和测试工具,方便用户查阅每种模型的训练推理部署打通情况,并可以进行一键测试。\n

\n## 2. 汇总信息\n打通情况汇总如下,已填写的部分表示可以使用本工具进行一键测试,未填写的表示正在支持中。\n**字段说明:**\n- 基础训练预测:包括模型训练、Paddle Inference Python预测。\n- 更多训练方式:包括多机多卡(TODO)、混合精度。\n- 模型压缩:包括裁剪、离线/在线量化(TODO)、蒸馏(TODO)。\n- 其他预测部署:包括Paddle Inference C++预测、Paddle Serving部署、Paddle-Lite部署(TODO)等。\n更详细的mkldnn、Tensorrt等预测加速相关功能的支持情况可以查看各测试工具的[更多教程](#more)。\n| 算法名称 | 模型名称 | 模型类型 | 基础
训练预测 | 更多
训练方式 | 模型压缩 | 其他预测部署 |\n| :--- | :--- | :----: | :--------: | :---- | :---- | :---- |\n| PP-TSM |pptsm_k400_frames_uniform | 动作识别 | 支持 | 混合精度 | 离线量化 | Paddle Inference: C++ |\n| PP-TSN |pptsn_k400_videos | 动作识别 | 支持 | 混合精度 | - | Paddle Inference: C++ |\n| AGCN |agcn_fsd\t | 动作识别 | 支持 | 混合精度 | - | - |\n| STGCN |stgcn_fsd | 动作识别 | 支持 | 混合精度 | - | - |\n| TimeSformer |timesformer_k400_videos | 动作识别 | 支持 | 混合精度 | - | - |", + "type": "code", + "location": "/test_tipc/README.md:2-30" + }, + "8499": { + "file_id": 629, + "content": "This code provides an introduction to the PaddleVideo training and inference pipeline certification (TIPC), including a summary of support status for various models and deployment methods. It also mentions that more details on specific acceleration features can be found in tutorials associated with each test tool.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/85.json b/docs/data/85.json new file mode 100644 index 000000000..86a437e4b --- /dev/null +++ b/docs/data/85.json @@ -0,0 +1,547 @@ +{ + "8500": { + "file_id": 629, + "content": "| SlowFast |slowfast | 动作识别 | 支持 | 混合精度 | - | - |\n| TSM |tsm_k400_frames | 动作识别 | 支持 | 混合精度 | - | - |\n| TSN |tsn_k400_frames | 动作识别 |支持|混合精度|-|-|\n| AttentionLSTM |attention_lstm_youtube8m | 动作识别 | 支持 | 混合精度 | - | - |\n| BMN |bmn | 动作时间定位 | 支持 | 混合精度 | - | - |\n## 3. 测试工具简介\n### 目录介绍\n```shell\ntest_tipc/\n├── configs/ # 配置文件目录\n│ ├── PP-TSM/\n│ │ ├── train_infer_python.txt # PP-TSM在Linux上进行python训练预测(基础训练预测)的配置文件\n│ │ ├── serving_infer_cpp.txt # PP-TSM在Linux上进行cpp serving测试的配置文件\n│ │ ├── train_amp_infer_python.txt # PP-TSM在Linux上进行python训练预测(混合精度训练预测)的配置文件\n│ │ ├── serving_infer_python.txt # PP-TSM在Linux上进行python serving预测的配置文件\n│ │ └── train_ptq_infer_python.txt # PP-TSM在Linux上进行离线量化推理测试的配置文件\n│ ├── PP-TSN/\n│ │ ├── train_infer_python.txt # PP-TSN在Linux上进行python训练预测(基础训练预测)的配置文件\n│ │ ├── paddle2onnx_infer_python.txt # PP-TSN在Linux上进行Paddle2ONNX预测(基础训练预测)的配置文件\n│ │ ├── serving_infer_cpp.txt # PP-TSN在Linux上进行cpp serving测试的配置文件\n│ │ └── train_amp_infer_python.txt # PP-TSN在Linux上进行python训练预测(混合精度训练预测)的配置文件", + "type": "code", + "location": "/test_tipc/README.md:31-55" + }, + "8501": { + "file_id": 629, + "content": "This code snippet introduces the test tool for PaddleVideo, providing an overview of supported models and their respective configurations, as well as the directory structure containing these configuration files.", + "type": "comment" + }, + "8502": { + "file_id": 629, + "content": "│ ├── ...\n│ └── ...\n├── results/ # 预先保存的预测结果,用于和实际预测结果进行精度比对\n│ ├── PP-TSM/\n│ │\t├── python_ppvideo_PP-TSM_results_fp16.txt # 预存的PP-TSM识别识别模型python预测fp16精度的结果\n│ │\t└── python_ppvideo_PP-TSM_results_fp32.txt # 预存的PP-TSM识别识别模型python预测fp32精度的结果\n│ ├── PP-TSN/\n│ │\t├── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型python预测fp16精度的结果\n│ │\t└── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型python预测fp32精度的结果\n│ ├── PP-TSN_CPP/\n│ │\t├── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型C++预测fp16精度的结果\n│ │\t└── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型C++预测fp32精度的结果\n│ ├── ...\n│ └── ...\n├── prepare.sh # 完成test_*.sh运行所需要的数据和模型下载\n├── docs/ # 详细的TIPC各种功能文档\n├── test_train_inference_python.sh # 测试python训练预测的主程序\n├── test_inference_cpp.sh # 测试C++预测的主程序\n├── test_paddle2onnx.sh # 测试paddle2onnx转换与推理的主程序\n├── compare_results.py # 用于对比log中的预测结果与results中的预存结果精度误差是否在限定范围内\n└── README.md # 介绍文档", + "type": "code", + "location": "/test_tipc/README.md:56-76" + }, + "8503": { + "file_id": 
629, + "content": "This code represents the directory structure of a PaddleVideo test_tipc project. It includes pre-stored prediction results for various models in the 'results' folder, which are used to compare and verify the precision of the actual predictions. The scripts 'prepare.sh', 'test_train_inference_python.sh', 'test_inference_cpp.sh', and 'compare_results.py' are provided for testing, training, inference using Python or C++, as well as comparing the results with pre-stored data to calculate precision errors. The 'docs' folder contains detailed documentation on TIPC features, while 'test_paddle2onnx.sh' is used to test Paddle to ONNX conversion and inference.", + "type": "comment" + }, + "8504": { + "file_id": 629, + "content": "```\n### 测试流程概述\n使用本工具,可以测试不同功能的支持情况,以及预测结果是否对齐,测试流程概括如下:\n
\n1. 运行prepare.sh准备测试所需数据和模型;\n2. 运行要测试的功能对应的测试脚本`test_*.sh`,产出log,由log可以看到不同配置是否运行成功;\n3. 用`compare_results.py`对比log中的预测结果和预存在results目录下的结果,判断预测精度是否符合预期(在误差范围内)。\n测试单项功能仅需两行命令,**如需测试不同模型/功能,替换配置文件即可**,命令格式如下:\n```shell\n# 功能:准备数据\n# 格式:bash + 运行脚本 + 参数1: 配置文件选择 + 参数2: 模式选择\nbash test_tipc/prepare.sh configs/[model_name]/[params_file_name] [Mode]\n# 功能:运行测试\n# 格式:bash + 运行脚本 + 参数1: 配置文件选择 + 参数2: 模式选择\nbash test_tipc/test_train_inference_python.sh configs/[model_name]/[params_file_name] [Mode]\n```\n例如,测试基本训练预测功能的`lite_train_lite_infer`模式,运行:\n```shell\n# 准备数据\nbash test_tipc/prepare.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'\n# 运行测试\nbash test_tipc/test_train_inference_python.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'\n```\n关于本示例命令的更多信息可查看[基础训练预测使用文档](./docs/test_train_inference_python.md)。\n### 配置文件命名规范\n在`configs`目录下存放所有模型测试需要用到的配置文件,配置文件的命名遵循如下规范:", + "type": "code", + "location": "/test_tipc/README.md:77-112" + }, + "8505": { + "file_id": 629, + "content": "The code provides an overview of the test process for PaddleVideo's TIPC. It requires running a prepare script and test_*.sh scripts, comparing log files, and specifying model names and parameters using configuration files. Testing a single feature takes only two commands, and changing configurations is as simple as replacing the configuration file.", + "type": "comment" + }, + "8506": { + "file_id": 629, + "content": "1. 基础训练预测配置简单命名为:`train_infer_python.txt`,表示**Linux环境下单机、不使用混合精度训练+python预测**,其完整命名对应`train_linux_gpu_normal_normal_infer_python_linux_gpu_cpu.txt`,由于本配置文件使用频率较高,这里进行了名称简化。\n2. 其他带训练配置命名格式为:`train_训练硬件环境(linux_gpu/linux_dcu/…)_是否多机(fleet/normal)_是否混合精度(amp/normal)_预测模式(infer/lite/serving/js)_语言(cpp/python/java)_预测硬件环境(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`。如,linux gpu下多机多卡+混合精度链条测试对应配置 `train_linux_gpu_fleet_amp_infer_python_linux_gpu_cpu.txt`,linux dcu下基础训练预测对应配置 `train_linux_dcu_normal_normal_infer_python_linux_dcu.txt`。\n3. 仅预测的配置(如serving、lite等)命名格式:`model_训练硬件环境(linux_gpu/linux_dcu/…)_是否多机(fleet/normal)_是否混合精度(amp/normal)_(infer/lite/serving/js)_语言(cpp/python/java)_预测硬件环境(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`,即,与2相比,仅第一个字段从train换为model,测试时模型直接下载获取,这里的“训练硬件环境”表示所测试的模型是在哪种环境下训练得到的。\n**根据上述命名规范,可以直接从子目录名称和配置文件名找到需要测试的场景和功能对应的配置文件。**\n\n## 4. 
开始测试\n各功能测试中涉及混合精度、裁剪、量化等训练相关,及mkldnn、Tensorrt等多种预测相关参数配置,请点击下方相应链接了解更多细节和使用教程:\n- [test_train_inference_python 使用](docs/test_train_inference_python.md) :测试基于Python的模型训练、评估、推理等基本功能。", + "type": "code", + "location": "/test_tipc/README.md:114-126" + }, + "8507": { + "file_id": 629, + "content": "Code defines naming conventions for various training and inference configurations used by PaddleVideo, allowing users to easily identify the desired test scenario based on subdirectories and configuration file names.", + "type": "comment" + }, + "8508": { + "file_id": 629, + "content": "- [test_amp_train_inference_python 使用](docs/test_train_amp_inference_python.md) :测试基于Python的**混合精度**模型训练、评估、推理等基本功能。\n- [test_inference_cpp 使用](docs/test_inference_cpp.md) :测试基于C++的模型推理功能。\n- [test_paddle2onnx 使用](docs/test_paddle2onnx.md) :测试基于python2onnx模型的推理功能。\n- [test_serving_infer_python 使用](docs/test_serving_infer_python.md) :测试基于Paddle Serving的服务化部署功能。\n- [test_serving_infer_cpp 使用](docs/test_serving_infer_cpp.md) :测试基于C++的模型推理功能。\n- [test_ptq_inference_python 使用](docs/test_train_ptq_inference_python.md) :测试离线量化训练推理功能。\n- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md):测试基于Python的多机多卡训练与推理等基本功能", + "type": "code", + "location": "/test_tipc/README.md:127-133" + }, + "8509": { + "file_id": 629, + "content": "This code provides a brief overview of various test cases available for different functionalities within the PaddleVideo framework. The functionalities include testing Python-based mixed precision training, evaluation, and inference; C++-based model inference; converting models to ONNX format for inference; deploying models using Paddle Serving; offline quantized training and inference; and multi-machine multi-GPU training and inference using Python.", + "type": "comment" + }, + "8510": { + "file_id": 630, + "content": "/test_tipc/benchmark_train.sh", + "type": "filepath" + }, + "8511": { + "file_id": 630, + "content": "This script prepares environment for benchmarking PaddleVideo model, trains with varying batch sizes and precisions, measures execution time, and processes log files to extract performance metrics.", + "type": "summary" + }, + "8512": { + "file_id": 630, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\n# set env\npython=python\nexport model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d\"/\" -f 3`\nexport model_commit=$(git log|head -n1|awk '{print $2}')\nexport str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)\nexport frame_version=${str_tmp%%.post*}\nexport frame_commit=$(echo `${python} -c \"import paddle;print(paddle.version.commit)\"`)\n# BENCHMARK_ROOT='.' # only for self-test\n# run benchmark sh\n# Usage:\n# bash run_benchmark_train.sh config.txt params\n# or\n# bash run_benchmark_train.sh config.txt\nfunction func_parser_params(){\n strs=$1\n IFS=\"=\"\n array=(${strs})\n tmp=${array[1]}\n echo ${tmp}\n}\nfunction func_sed_params(){\n filename=$1\n line=$2\n param_value=$3\n params=`sed -n \"${line}p\" $filename`\n IFS=\":\"\n array=(${params})\n key=${array[0]}\n value=${array[1]}\n if [[ $value =~ 'benchmark_train' ]];then\n IFS='='\n _val=(${value})\n param_value=\"${param_value}\"\n fi\n new_params=\"${key}:${param_value}\"", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:1-42" + }, + "8513": { + "file_id": 630, + "content": "This script is a Bash function for running benchmark training on PaddlePaddle GPU. 
It sets environment variables, parses command line arguments, and executes the benchmark training using the provided configuration file.", + "type": "comment" + }, + "8514": { + "file_id": 630, + "content": " IFS=\";\"\n cmd=\"sed -i '${line}s/.*/${new_params}/' '${filename}'\"\n eval $cmd\n}\nfunction set_gpu_id(){\n string=$1\n _str=${string:1:6}\n IFS=\"C\"\n arr=(${_str})\n M=${arr[0]}\n P=${arr[1]}\n gn=`expr $P - 1`\n gpu_num=`expr $gn / $M`\n seq=`seq -s \",\" 0 $gpu_num`\n echo $seq\n}\nfunction get_repo_name(){\n IFS=\";\"\n cur_dir=$(pwd)\n IFS=\"/\"\n arr=(${cur_dir})\n echo ${arr[-1]}\n}\nFILENAME=$1\n# copy FILENAME as new\nnew_filename=\"./test_tipc/benchmark_train.txt\"\ncmd=`yes|cp $FILENAME $new_filename`\nFILENAME=$new_filename\n# MODE must be one of ['benchmark_train']\nMODE=$2\nPARAMS=$3\nREST_ARGS=$4\n# bash test_tipc/benchmark_train.sh /workspace/PaddleVideo/test_tipc/configs/BMN/train_infer_python.txt benchmark_train dynamicTostatic_bs8_fp32_DP_N1C8\nto_static=\"\"\n# parse \"to_static\" options and modify trainer into \"to_static_trainer\"\nif [[ $PARAMS =~ \"dynamicTostatic\" ]] ;then\n to_static=\"d2sT_\"\n sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME\n # clear PARAM contents\n if [ $PARAMS = \"to_static\" ] ;then", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:43-86" + }, + "8515": { + "file_id": 630, + "content": "This code defines functions to modify parameters in a file and set GPU IDs. It then copies the input filename, sets the mode as \"benchmark_train\", and processes additional parameters. The script performs operations such as modifying lines in the file and replacing \"trainer:norm_train\" with \"trainer:to_static_train\". The purpose of this code seems to be related to manipulating configuration files for a program using PaddleVideo's test_tipc directory.", + "type": "comment" + }, + "8516": { + "file_id": 630, + "content": " PARAMS=\"\"\n fi\nfi\nIFS=$'\\n'\n# parser params from train_benchmark.txt\ndataline=`cat $FILENAME`\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\nmodel_name=$(func_parser_value \"${lines[1]}\")\n# 获取'train_benchmark_params'所在的行数\nline_num=`grep -n -w \"train_benchmark_params\" $FILENAME | cut -d \":\" -f 1`\n# for train log parser\nbatch_size=$(func_parser_value \"${lines[line_num]}\")\nline_num=`expr $line_num + 1`\nfp_items=$(func_parser_value \"${lines[line_num]}\")\nline_num=`expr $line_num + 1`\nepoch=$(func_parser_value \"${lines[line_num]}\")\nline_num=`expr $line_num + 1`\nprofile_option_key=$(func_parser_key \"${lines[line_num]}\")\nprofile_option_params=$(func_parser_value \"${lines[line_num]}\")\nprofile_option=\"${profile_option_key}:${profile_option_params}\"\nline_num=`expr $line_num + 1`\nflags_value=$(func_parser_value \"${lines[line_num]}\")\n# 设置每个模型max-iters,以获取稳定的ips\nline_num=`expr $line_num + 1`\nmax_iters_value=$(func_parser_value \"${lines[line_num]}\")\n# set flags\nIFS=\";\"\nflags_list=(${flags_value})\nfor _flag in ${flags_list[*]}; do", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:87-123" + }, + "8517": { + "file_id": 630, + "content": "The code is parsing parameters from the \"train_benchmark.txt\" file and setting variables such as model name, batch size, fp_items, epoch, profile option key, profile option parameters, flags value, and max_iters value for training purposes. 
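The `set_gpu_id` helper above turns a device tag such as `N1C8` (1 node, 8 cards) into the comma-separated GPU id list used later in the script. A Python restatement of that arithmetic, purely for illustration (the real helper is Bash):

```python
# Illustrative Python restatement of the Bash set_gpu_id() logic above.
def set_gpu_id(device_tag: str) -> str:
    """Map a tag like 'N1C8' (nodes x cards) to a comma-separated GPU id list."""
    body = device_tag[1:7]           # mirrors ${string:1:6} -> e.g. '1C8'
    nodes, cards = body.split("C")   # 'N1C8' -> nodes='1', cards='8'
    per_node = (int(cards) - 1) // int(nodes)
    return ",".join(str(i) for i in range(per_node + 1))

print(set_gpu_id("N1C8"))    # 0,1,2,3,4,5,6,7
print(set_gpu_id("N1C4"))    # 0,1,2,3
print(set_gpu_id("N4C32"))   # 0,1,2,3,4,5,6,7  (8 cards per node)
```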
These values will be used to train a specific model with given parameters.", + "type": "comment" + }, + "8518": { + "file_id": 630, + "content": " cmd=\"export ${_flag}\"\n eval $cmd\ndone\n# set log_name\nrepo_name=$(get_repo_name )\nSAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log\nmkdir -p \"${SAVE_LOG}/benchmark_log/\"\nstatus_log=\"${SAVE_LOG}/benchmark_log/results.log\"\n# get benchmark profiling params : PROFILING_TIMER_ONLY=no|True|False\nPROFILING_TIMER_ONLY=${PROFILING_TIMER_ONLY:-\"True\"}\n# The number of lines in which train params can be replaced.\nline_python=3\nline_gpuid=4\nline_precision=6\nline_epoch=7\nline_batchsize=9\nline_profile=12\nline_eval_py=24\nline_eval_py_2=25\nline_export_py=38\nline_export_py_2=28\nline_export_py_3=30\nline_norm_train=16\nfunc_sed_params \"$FILENAME\" \"${line_eval_py}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_eval_py_2}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_export_py}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_export_py_2}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_export_py_3}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_python}\" \"$python\"\n# 末尾加上--max_iters=30和--log_interval=1,以便运行并输出足量数据\nset_log_interval_cmd=\"sed -i '${line_norm_train}s/.*/& --max_iters=${max_iters_value} -o log_interval=1/' '${filename}'\"", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:124-158" + }, + "8519": { + "file_id": 630, + "content": "This code is setting environment variables, defining log file locations and names, and using sed commands to modify a configuration file. It then executes the modified configuration file with additional command line parameters. This is likely part of a benchmarking or training process for machine learning or video processing tasks.", + "type": "comment" + }, + "8520": { + "file_id": 630, + "content": "eval $set_log_interval_cmd\n# 去掉--validate,benchmark不需要validate\nremove_validate_cmd=\"sed -i '${line_norm_train}s/--validate//' '${filename}'\"\neval $remove_validate_cmd\n# if params\nif [ ! -n \"$PARAMS\" ] ;then\n # PARAMS input is not a word.\n IFS=\"|\"\n batch_size_list=(${batch_size})\n fp_items_list=(${fp_items})\n device_num_list=(N1C4)\n run_mode=\"DP\"\nelif [[ ${PARAMS} = \"dynamicTostatic\" ]] ;then\n IFS=\"|\"\n model_type=$PARAMS\n batch_size_list=(${batch_size})\n fp_items_list=(${fp_items})\n device_num_list=(N1C4)\n run_mode=\"DP\"\nelse\n # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}\n IFS=\"_\"\n params_list=(${PARAMS})\n model_type=${params_list[0]}\n batch_size=${params_list[1]}\n batch_size=`echo ${batch_size} | tr -cd \"[0-9]\" `\n precision=${params_list[2]}\n run_mode=${params_list[3]}\n device_num=${params_list[4]}\n IFS=\";\"\n if [ ${precision} = \"null\" ];then\n precision=\"fp32\"\n fi\n fp_items_list=($precision)\n batch_size_list=($batch_size)", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:159-197" + }, + "8521": { + "file_id": 630, + "content": "This code is parsing parameters and configuring the environment for benchmarking. It removes \"validate\" from the command, checks if the input is a dynamic or static parameter, and then assigns variables based on the type of model, batch size, precision, run mode, and device number. 
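The PARAMS string consumed above packs model type, batch size, precision, run mode and device tag into one underscore-separated token (the `..._bs8_fp32_DP_N1C8` pattern in the script's usage comment). A small Python illustration of that decomposition (the real parsing is Bash; the example tag is made up):

```python
# Illustrative Python view of the underscore-separated PARAMS parsing above.
import re

def parse_benchmark_params(params: str) -> dict:
    model_type, bs, precision, run_mode, device_num = params.split("_")
    return {
        "model_type": model_type,
        "batch_size": int(re.sub(r"[^0-9]", "", bs)),  # mirrors `tr -cd "[0-9]"`
        "precision": "fp32" if precision == "null" else precision,
        "run_mode": run_mode,        # e.g. DP (data parallel)
        "device_num": device_num,    # e.g. N1C8 -> 1 node, 8 cards
    }

print(parse_benchmark_params("demo_bs8_fp32_DP_N1C8"))
# {'model_type': 'demo', 'batch_size': 8, 'precision': 'fp32', 'run_mode': 'DP', 'device_num': 'N1C8'}
```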
If the precision is null, it defaults to fp32.", + "type": "comment" + }, + "8522": { + "file_id": 630, + "content": " device_num_list=($device_num)\nfi\nlog_interval='--log_interval 1'\nIFS=\"|\"\nfor batch_size in ${batch_size_list[*]}; do\n for precision in ${fp_items_list[*]}; do\n for device_num in ${device_num_list[*]}; do\n # sed batchsize and precision\n func_sed_params \"$FILENAME\" \"${line_precision}\" \"$precision\"\n func_sed_params \"$FILENAME\" \"${line_batchsize}\" \"$batch_size\"\n func_sed_params \"$FILENAME\" \"${line_epoch}\" \"$epoch\"\n gpu_id=$(set_gpu_id $device_num)\n if [ ${#gpu_id} -le 1 ];then\n func_sed_params \"$FILENAME\" \"${line_gpuid}\" \"0\" # sed used gpu_id \n if [[ ${PROFILING_TIMER_ONLY} != \"no\" ]];then\n echo \"run profile\"\n # The default value of profile_option's timer_only parameter is True\n if [[ ${PROFILING_TIMER_ONLY} = \"False\" ]];then\n profile_option=\"${profile_option};timer_only=False\"\n fi\n log_path=\"$SAVE_LOG/profiling_log\"", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:198-220" + }, + "8523": { + "file_id": 630, + "content": "The code is iterating over different combinations of batch sizes and precisions to train the PaddleVideo model. It sets up various environment variables and uses sed to modify a file before running the training script on specific GPUs. The profile option determines if only timer information should be logged or if full profiling data should be collected.", + "type": "comment" + }, + "8524": { + "file_id": 630, + "content": " mkdir -p $log_path\n log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling\"\n # set profile_option params\n tmp=`sed -i \"${line_profile}s/.*/\\\"${profile_option}\\\"/\" \"${FILENAME}\"`\n # for models which need to accumulate gradient.\n if [[ ${model_name} =~ \"TimeSformer\" ]]; then\n global_bs=`expr ${batch_size} \\* ${device_num:3:4} \\* 8`\n modify_global_bs_cmd=\"sed -i '${line_norm_train}s/.*/& -o GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${filename}'\"\n eval $modify_global_bs_cmd\n fi\n # run test_train_inference_python.sh\n cmd=\"timeout 5m bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 \"\n echo $cmd\n eval ${cmd}", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:221-234" + }, + "8525": { + "file_id": 630, + "content": "Creates a directory for log storage, sets the name of the log file based on various parameters, modifies profile option settings if necessary (for TimeSformer models), and then runs test_train_inference_python.sh script with provided arguments, redirecting output to the specified log path.", + "type": "comment" + }, + "8526": { + "file_id": 630, + "content": " eval \"cat ${log_path}/${log_name}\"\n fi\n echo \"run without profile\" \n # without profile\n log_path=\"$SAVE_LOG/train_log\"\n speed_log_path=\"$SAVE_LOG/index\"\n mkdir -p $log_path\n mkdir -p $speed_log_path\n log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log\"\n speed_log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed\"\n func_sed_params \"$FILENAME\" \"${line_profile}\" \"null\" # sed profile_id as null\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 \"\n echo $cmd\n job_bt=`date '+%Y%m%d%H%M%S'`\n eval $cmd\n job_et=`date '+%Y%m%d%H%M%S'`\n export 
model_run_time=$((${job_et}-${job_bt}))\n eval \"cat ${log_path}/${log_name}\"", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:235-253" + }, + "8527": { + "file_id": 630, + "content": "This code snippet executes a script without profiling. It sets the log and speed log paths, creates directories if necessary, and then runs a command to execute the test_train_inference_python.sh script. The run time is measured and stored in model_run_time variable. Finally, it displays the execution log.", + "type": "comment" + }, + "8528": { + "file_id": 630, + "content": " # parser log\n _model_name=\"${model_name}_bs${batch_size}_${precision}_${run_mode}\"\n cmd=\"${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \\\n --speed_log_file '${speed_log_path}/${speed_log_name}' \\\n --model_name ${_model_name} \\\n --base_batch_size ${batch_size} \\\n --run_mode ${run_mode} \\\n --fp_item ${precision} \\\n --keyword ips: \\\n --skip_steps 5 \\\n --device_num ${device_num} \\\n --speed_unit instance/sec \\\n --convergence_key loss: \"\n echo $cmd\n eval $cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${cmd}\" \"${status_log}\" \"${model_name}\"\n else\n IFS=\";\"\n unset_env=`unset CUDA_VISIBLE_DEVICES`", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:255-274" + }, + "8529": { + "file_id": 630, + "content": "This code section is using Python to execute an analysis script. The analysis script processes log files, extracting performance metrics like inference per second (ips) and loss convergence data. It also handles skipping steps during processing and considers the device used for computation. The resulting status is logged into a specified file.", + "type": "comment" + }, + "8530": { + "file_id": 630, + "content": " log_path=\"$SAVE_LOG/train_log\"\n speed_log_path=\"$SAVE_LOG/index\"\n mkdir -p $log_path\n mkdir -p $speed_log_path\n log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log\"\n speed_log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed\"\n func_sed_params \"$FILENAME\" \"${line_gpuid}\" \"$gpu_id\" # sed used gpu_id\n func_sed_params \"$FILENAME\" \"${line_profile}\" \"null\" # sed --profile_option as null\n # for models which need to accumulate gradient.\n if [[ ${model_name} =~ \"TimeSformer\" ]]; then\n global_bs=`expr ${batch_size} \\* ${device_num:3:4} \\* 8`\n modify_global_bs_cmd=\"sed -i '${line_norm_train}s/.*/& -o GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${filename}'\"\n eval $modify_global_bs_cmd", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:275-288" + }, + "8531": { + "file_id": 630, + "content": "Creates log and speed directories, sets variable names for logging files. 
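The log file name is assembled by plain string concatenation from the run settings. The values below are assumed for illustration only; repo_name normally comes from get_repo_name, i.e. the basename of the working directory.

```bash
repo_name="PaddleVideo"   # assumed; normally returned by get_repo_name
model_name="PP-TSM"; batch_size=8; precision="fp32"
run_mode="DP"; device_num="N1C8"; to_static=""
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
echo "${log_name}"   # PaddleVideo_PP-TSM_bs8_fp32_DP_N1C8_log
```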
Uses sed to modify the config file with gpu_id, profile option as null, and adjusts global batch size for TimeSformer model that needs gradient accumulation.", + "type": "comment" + }, + "8532": { + "file_id": 630, + "content": " fi\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 \"\n echo $cmd\n job_bt=`date '+%Y%m%d%H%M%S'`\n eval $cmd\n job_et=`date '+%Y%m%d%H%M%S'`\n export model_run_time=$((${job_et}-${job_bt}))\n eval \"cat ${log_path}/${log_name}\"\n # parser log\n _model_name=\"${model_name}_bs${batch_size}_${precision}_${run_mode}\"\n cmd=\"${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \\\n --speed_log_file '${speed_log_path}/${speed_log_name}' \\\n --model_name ${_model_name} \\\n --base_batch_size ${batch_size} \\\n --run_mode ${run_mode} \\\n --fp_item ${precision} \\\n --keyword ips: \\\n --skip_steps 5 \\\n --device_num ${device_num} \\", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:289-308" + }, + "8533": { + "file_id": 630, + "content": "This code segment is running a benchmark training script and logging the results. It measures the model run time, parses the log to extract information about the speed, and then passes this information to another script for further analysis. The script is designed to handle different batch sizes, precision types, and run modes.", + "type": "comment" + }, + "8534": { + "file_id": 630, + "content": " --speed_unit instance/sec \\\n --convergence_key loss: \"\n echo $cmd\n eval $cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${cmd}\" \"${status_log}\" \"${model_name}\"\n fi\n done\n done\ndone", + "type": "code", + "location": "/test_tipc/benchmark_train.sh:309-318" + }, + "8535": { + "file_id": 630, + "content": "This code iterates through different models and configurations, running them with specified parameters. It logs the commands and checks their status to ensure successful execution.", + "type": "comment" + }, + "8536": { + "file_id": 631, + "content": "/test_tipc/common_func.sh", + "type": "filepath" + }, + "8537": { + "file_id": 631, + "content": "This code includes functions for parsing, setting parameters, and performing status checks. 
The last function logs success or failure with command details to the specified run_log file using tee -a.", + "type": "summary" + }, + "8538": { + "file_id": 631, + "content": "#!/bin/bash\nfunction func_parser_key(){\n strs=$1\n IFS=\":\"\n array=(${strs})\n tmp=${array[0]}\n echo ${tmp}\n}\nfunction func_parser_value(){\n strs=$1\n IFS=\":\"\n array=(${strs})\n tmp=${array[1]}\n echo ${tmp}\n}\nfunction func_set_params(){\n key=$1\n value=$2\n if [ ${key}x = \"null\"x ];then\n echo \" \"\n elif [[ ${value} = \"null\" ]] || [[ ${value} = \" \" ]] || [ ${#value} -le 0 ];then\n echo \" \"\n else \n echo \"${key}=${value}\"\n fi\n}\nfunction func_parser_params(){\n strs=$1\n IFS=\":\"\n array=(${strs})\n key=${array[0]}\n tmp=${array[1]}\n IFS=\"|\"\n res=\"\"\n for _params in ${tmp[*]}; do\n IFS=\"=\"\n array=(${_params})\n mode=${array[0]}\n value=${array[1]}\n if [[ ${mode} = ${MODE} ]]; then\n IFS=\"|\"\n #echo $(func_set_params \"${mode}\" \"${value}\")\n echo $value\n break\n fi\n IFS=\"|\"\n done\n echo ${res}\n}\nfunction status_check(){\n last_status=$1 # the exit code\n run_command=$2\n run_log=$3", + "type": "code", + "location": "/test_tipc/common_func.sh:1-58" + }, + "8539": { + "file_id": 631, + "content": "This code defines several functions for parsing and setting parameters, as well as performing status checks. The functions extract keys and values from strings using specific delimiters, set parameters based on their presence and content, and check the exit code of previous commands.", + "type": "comment" + }, + "8540": { + "file_id": 631, + "content": " model_name=$4\n log_path=$5\n if [ $last_status -eq 0 ]; then\n echo -e \"\\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \\033[0m\" | tee -a ${run_log}\n else\n echo -e \"\\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \\033[0m\" | tee -a ${run_log}\n fi\n}", + "type": "code", + "location": "/test_tipc/common_func.sh:59-66" + }, + "8541": { + "file_id": 631, + "content": "This function checks the last status and logs whether the run was successful or failed with specific command details. 
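The call pattern used elsewhere in test_tipc, for example in benchmark_train.sh above, looks like the following: PIPESTATUS[0] captures the exit code of the first command in the evaluated pipeline before it is handed to status_check.

```bash
eval "${cmd}"
last_status=${PIPESTATUS[0]}
status_check "${last_status}" "${cmd}" "${status_log}" "${model_name}"
```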
It appends the log to the specified run_log file using tee -a command.", + "type": "comment" + }, + "8542": { + "file_id": 632, + "content": "/test_tipc/compare_results.py", + "type": "filepath" + }, + "8543": { + "file_id": 632, + "content": "The code imports libraries, defines functions for parsing arguments and log files, checks for names, reads a log file, stores results in \"parser_results\", loads ground truth from multiple files, and compares log results with ground truth for testing.", + "type": "summary" + }, + "8544": { + "file_id": 632, + "content": "import numpy as np\nimport os\nimport subprocess\nimport json\nimport argparse\nimport glob\ndef init_args():\n parser = argparse.ArgumentParser()\n # params for testing assert allclose\n parser.add_argument(\"--atol\", type=float, default=1e-3)\n parser.add_argument(\"--rtol\", type=float, default=1e-3)\n parser.add_argument(\"--gt_file\", type=str, default=\"\")\n parser.add_argument(\"--log_file\", type=str, default=\"\")\n parser.add_argument(\"--precision\", type=str, default=\"fp32\")\n return parser\ndef parse_args():\n parser = init_args()\n return parser.parse_args()\ndef run_shell_command(cmd):\n p = subprocess.Popen(cmd,\n stdout=subprocess.PIPE,\n stderr=subprocess.PIPE,\n shell=True)\n out, err = p.communicate()\n if p.returncode == 0:\n return out.decode('utf-8')\n else:\n return None\ndef parser_results_from_log_by_name(log_path, names_list):\n if not os.path.exists(log_path):\n raise ValueError(\"The log file {} does not exists!\".format(log_path))", + "type": "code", + "location": "/test_tipc/compare_results.py:1-40" + }, + "8545": { + "file_id": 632, + "content": "This code imports necessary libraries and defines functions for parsing command-line arguments, running shell commands, and retrieving results from log files. It uses ArgumentParser to handle command line arguments, subprocess to execute shell commands, and os to check file existence.", + "type": "comment" + }, + "8546": { + "file_id": 632, + "content": " if names_list is None or len(names_list) < 1:\n return []\n parser_results = {}\n lines = open(log_path, 'r').read().splitlines()\n if 'python_infer' in log_path: # parse python inference\n for line in lines:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n for name in names_list:\n if name in line:\n if '.' in split_items[-1]:\n parser_results[name] = float(split_items[-1])\n else:\n parser_results[name] = int(split_items[-1])\n else: # parse cpp inference\n for line in lines:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n if all([(name + ':') in split_items for name in names_list]):\n # print(split_items)", + "type": "code", + "location": "/test_tipc/compare_results.py:42-64" + }, + "8547": { + "file_id": 632, + "content": "This code checks if there are any names in the \"names_list\" and reads a log file at the specified \"log_path\". If the file contains \"python_infer\", it parses the python inference results, while for other log files, it parses C++ inference results. 
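A typical invocation, following the usage note embedded later in the script, passes glob patterns for the ground-truth and log files together with the tolerances defined in init_args; the paths below are illustrative.

```bash
python3.7 test_tipc/compare_results.py \
    --gt_file=./test_tipc/results/PP-TSM/*.txt \
    --log_file=./test_tipc/output/PP-TSM/python_infer_*.log \
    --atol=1e-3 --rtol=1e-3
```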
It stores the results in the \"parser_results\" dictionary with names as keys and corresponding values as either integers or floats.", + "type": "comment" + }, + "8548": { + "file_id": 632, + "content": " parser_results['class'] = int(split_items[2])\n parser_results['score'] = float(split_items[-1])\n return parser_results\ndef load_gt_from_file(gt_file):\n if not os.path.exists(gt_file):\n raise ValueError(\"The log file {} does not exists!\".format(gt_file))\n with open(gt_file, 'r') as f:\n data = f.readlines()\n f.close()\n parser_gt = {}\n for line in data:\n if 'top-1 class' in line:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n parser_gt['top-1 class'] = int(split_items[-1])\n elif 'top-1 score' in line:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n parser_gt['top-1 score'] = float(split_items[-1])\n elif \"score\" in line and 'segment' in line:\n location_dict = eval(line)", + "type": "code", + "location": "/test_tipc/compare_results.py:65-89" + }, + "8549": { + "file_id": 632, + "content": "This code defines a function `load_gt_from_file` that reads and parses the contents of a log file. It first checks if the file exists, then opens it in read mode. For each line containing 'top-1 class' or 'top-1 score', it extracts the class and score values, storing them as key-value pairs in `parser_gt`. If the file is not found, it raises a ValueError with an error message. The code also handles dictionaries with string keys, allowing for easy integration into larger programs.", + "type": "comment" + }, + "8550": { + "file_id": 632, + "content": " parser_gt[f\"score_{len(parser_gt)}\"] = location_dict['score']\n parser_gt[f\"segment_{len(parser_gt)}\"] = location_dict['segment']\n elif \"class:\" in line and \"score:\" in line:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n parser_gt['class'] = int(split_items[2])\n parser_gt['score'] = float(split_items[-1])\n return parser_gt\ndef load_gt_from_txts(gt_file):\n gt_list = glob.glob(gt_file)\n gt_collection = {}\n for gt_f in gt_list:\n gt_dict = load_gt_from_file(gt_f)\n basename = os.path.basename(gt_f)\n if \"fp32\" in basename:\n gt_collection[\"fp32\"] = [gt_dict, gt_f]\n elif \"fp16\" in basename:\n gt_collection[\"fp16\"] = [gt_dict, gt_f]\n elif \"int8\" in basename:\n gt_collection[\"int8\"] = [gt_dict, gt_f]\n else:\n continue\n return gt_collection\ndef collect_predict_from_logs(log_path, key_list):", + "type": "code", + "location": "/test_tipc/compare_results.py:90-118" + }, + "8551": { + "file_id": 632, + "content": "The code defines three functions:\n1. `load_gt_from_file` loads ground truth data from a file, handling both the cases when each line contains location details or class and score information.\n2. `collect_predict_from_logs` collects predict results from logs based on given key list.\n3. 
`load_gt_from_txts` loads ground truth collections from multiple files (fp32, fp16, int8), organizing them under corresponding keys in a dictionary.", + "type": "comment" + }, + "8552": { + "file_id": 632, + "content": " log_list = glob.glob(log_path)\n pred_collection = {}\n for log_f in log_list:\n pred_dict = parser_results_from_log_by_name(log_f, key_list)\n key = os.path.basename(log_f)\n pred_collection[key] = pred_dict\n return pred_collection\ndef testing_assert_allclose(dict_x, dict_y, atol=1e-7, rtol=1e-7):\n for k in dict_x:\n np.testing.assert_allclose(np.array(dict_x[k]),\n np.array(dict_y[k]),\n atol=atol,\n rtol=rtol)\nif __name__ == \"__main__\":\n # Usage example:\n # test python infer:\n ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM/*.txt --log_file=./test_tipc/output/PP-TSM/python_infer_*.log\n # test cpp infer:\n ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM_CPP/*.txt --log_file=./test_tipc/output/PP-TSM_CPP/cpp_infer_*.log\n args = parse_args()\n gt_collection = load_gt_from_txts(args.gt_file)", + "type": "code", + "location": "/test_tipc/compare_results.py:119-146" + }, + "8553": { + "file_id": 632, + "content": "The code reads logs from specified file paths and compares the results with ground truth data for testing purposes. It uses numpy's assert_allclose function to validate the accuracy of the predicted results against the ground truth. The usage example provides command line options to compare Python and C++ inferencing results.", + "type": "comment" + }, + "8554": { + "file_id": 632, + "content": " key_list = gt_collection[\"fp32\"][0].keys()\n pred_collection = collect_predict_from_logs(args.log_file, key_list)\n for filename in pred_collection.keys():\n if \"fp32\" in filename:\n gt_dict, gt_filename = gt_collection[\"fp32\"]\n elif \"fp16\" in filename:\n gt_dict, gt_filename = gt_collection[\"fp16\"]\n elif \"int8\" in filename:\n gt_dict, gt_filename = gt_collection[\"int8\"]\n else:\n continue\n pred_dict = pred_collection[filename]\n try:\n testing_assert_allclose(gt_dict,\n pred_dict,\n atol=args.atol,\n rtol=args.rtol)\n print(\n \"Assert allclose passed! The results of {} and {} are consistent!\"\n .format(filename, gt_filename))\n except Exception as E:\n print(E)\n raise ValueError(\n \"The results of {} and the results of {} are inconsistent!\".", + "type": "code", + "location": "/test_tipc/compare_results.py:147-170" + }, + "8555": { + "file_id": 632, + "content": "Iterates through the log files, compares \"fp32\", \"fp16\" and \"int8\" results with ground truth, uses testing_assert_allclose to check for consistency and prints success/failure messages.", + "type": "comment" + }, + "8556": { + "file_id": 632, + "content": " format(filename, gt_filename))", + "type": "code", + "location": "/test_tipc/compare_results.py:171-171" + }, + "8557": { + "file_id": 632, + "content": "This line of code formats the filename and ground truth filename for comparison purposes in the context of image or video analysis.", + "type": "comment" + }, + "8558": { + "file_id": 633, + "content": "/test_tipc/extract_loss.py", + "type": "filepath" + }, + "8559": { + "file_id": 633, + "content": "This code parses and extracts expressions, specifies reduction type (print/sum/mean), discards line parts, and enables debug mode. 
It defines functions to parse arguments, log messages, validate/extract data, and performs calculations on a list of numerical tuples based on user-defined parameters in the main function.", + "type": "summary" + }, + "8560": { + "file_id": 633, + "content": "import sys\nimport argparse\nimport re\ndef parameter_parser():\n parser = argparse.ArgumentParser(description=\"Support Args:\")\n parser.add_argument(\"-v\",\n \"--valid-expr\",\n type=str,\n default=\"*\",\n help=\"when not match, the line will discard.\")\n parser.add_argument(\"-e\",\n \"--extract-expr\",\n type=str,\n default=\"^{%s}$,\",\n help=\"the extract expr for the loss: loss {%f}\")\n parser.add_argument(\"-r\",\n \"--reduction-expr\",\n type=str,\n default=\"print\",\n help=\"print | sum | mean\")\n parser.add_argument(\"-n\",\n \"--discard\",\n type=int,\n default=0,\n help=\"while reduction, discard [0:n] and [-n:]\")\n parser.add_argument(\"-d\", \"--debug\", type=bool, default=False, help=\"debug\")", + "type": "code", + "location": "/test_tipc/extract_loss.py:1-28" + }, + "8561": { + "file_id": 633, + "content": "This code parses arguments for validating and extracting expressions, specifying reduction type (print/sum/mean), discarding line parts, and enabling debug mode.", + "type": "comment" + }, + "8562": { + "file_id": 633, + "content": " return parser.parse_args()\nargs = parameter_parser()\ndef log(*inp, **kargs):\n if args.debug:\n print(*inp, **kargs)\ndef is_valid(line, valid_expr):\n if valid_expr == \"*\": return True\n if valid_expr in line: return True\n return False\ndef extract(line, extract_expr):\n \"\"\"\n return tuple, the output will be\n \"\"\"\n log(\"Extract_expression is : \", extract_expr)\n x = re.findall(\"\\{%(.)\\}\", extract_expr)\n assert len(x) == 1, \"Must exist a {%d} | {%f} | {%s} \"\n t = x[0]\n type_converter = {\n 'f': float,\n 'i': int,\n 's': str,\n }\n type_extracter = {\n \"f\": r'(-?\\\\d+\\\\.\\\\d+)',\n \"i\": r'(-?\\\\d+)',\n \"s\": r'(.*?)',\n }\n log(type_extracter[t])\n pattern = re.sub(\"\\{%(.)\\}\", type_extracter[t], extract_expr, 1)\n log(\"Created Pattern is: \", pattern)\n x = re.findall(pattern, line)\n if len(x) == 0: return None\n assert len(x) == 1, f\"Multi Match for `{extract_expr}` in line: \\n{line}\"\n log(\"Find in line: \", x[0].strip())\n return type_converter[t](x[0].strip())", + "type": "code", + "location": "/test_tipc/extract_loss.py:29-71" + }, + "8563": { + "file_id": 633, + "content": "The code defines functions to parse arguments, log messages, and validate or extract data from a given line. 
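A hedged usage sketch: assuming the training log prints lines such as "... loss: 2.3456 ...", the script can filter those lines, extract the float, and average it while discarding a warm-up window at both ends. The log file name here is hypothetical; the flags match the argparse options defined above.

```bash
cat train.log | python test_tipc/extract_loss.py \
    -v "loss:" \
    -e "loss: {%f}" \
    -r mean \
    -n 2
```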
The \"is_valid\" function checks if the input line matches a specific expression or wildcard, while the \"extract\" function uses regular expressions to parse a specified type of data (float, int, or string) from a given line.", + "type": "comment" + }, + "8564": { + "file_id": 633, + "content": "def action(tuple_list, action):\n # discard the warm up\n if args.discard > 0:\n tuple_list = tuple_list[args.discard:]\n tuple_list = tuple_list[:-args.discard]\n # do action for each item\n if action == \"sum\":\n print(sum(tuple_list))\n if action == \"mean\":\n if len(tuple_list) == 0: print(\"null\")\n else: print(sum(tuple_list) / len(tuple_list))\n if action == \"print\":\n for item in tuple_list:\n print(item)\ndef main():\n current_step = 0\n tuple_list = []\n for line in sys.stdin:\n line = line.strip()\n if is_valid(line, args.valid_expr):\n ret = extract(line, args.extract_expr)\n if ret: tuple_list.append(ret)\n action(tuple_list, args.reduction_expr)\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/test_tipc/extract_loss.py:74-102" + }, + "8565": { + "file_id": 633, + "content": "This code defines a function 'action' which performs calculations on a list of numerical tuples and prints the result based on the given action. The main function reads input lines, validates them, extracts values, and passes the resulting tuple list to the 'action' function based on user-defined parameters.", + "type": "comment" + }, + "8566": { + "file_id": 634, + "content": "/test_tipc/prepare.sh", + "type": "filepath" + }, + "8567": { + "file_id": 634, + "content": "This script prepares PaddlePaddle's video object detection models by handling data, installing packages, and downloading/preprocessing for TIPC models. It also prepares data for AttentionLSTM and SlowFast models. The code downloads pre-trained model data and weights for various models like ResNet50, TSN, TimeSformer, PP-TSM, and VideoSwin.", + "type": "summary" + }, + "8568": { + "file_id": 634, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\n# set -xe\n:< train_small.list # 将train*.pkl的路径写入train_small.list\n ls pkl_frame/validate*.pkl > val_small.list # 将validate*.pkl的路径写入val_small.list\n ${python} split_yt8m.py train_small.list # 拆分每个train*.pkl变成多个train*_split*.pkl", + "type": "code", + "location": "/test_tipc/prepare.sh:88-105" + }, + "8575": { + "file_id": 634, + "content": "The code snippet is preparing data and downloading pretrained weights for different models. 
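The script is driven by a TIPC config file and a MODE string. A hedged invocation, reusing the BMN config path quoted earlier for benchmark_train.sh, could look like the line below; other models' train_infer_python.txt configs follow the same layout.

```bash
bash test_tipc/prepare.sh test_tipc/configs/BMN/train_infer_python.txt lite_train_whole_infer
```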
It first prepares the lite train data by downloading and decompressing a dataset, then installs TensorFlow GPU version 1.14.0, converts data format using tf2pkl.py script, splits the train data into multiple files, and finally, it downloads the pretrained weights for the specified model.", + "type": "comment" + }, + "8576": { + "file_id": 634, + "content": " ${python} split_yt8m.py val_small.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl\n ls pkl_frame/train*_split*.pkl > train_small.list # 将train*_split*.pkl的路径重新写入train_small.list\n ls pkl_frame/validate*_split*.pkl > val_small.list # 将validate*_split*.pkl的路径重新写入val_small.list\n popd\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://videotag.bj.bcebos.com/Data/BMN_lite/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://videotag.bj.bcebos.com/Data/BMN_lite/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd\n elif [ ${model_name} == \"TokenShiftVisionTransformer\" ]; then", + "type": "code", + "location": "/test_tipc/prepare.sh:106-127" + }, + "8577": { + "file_id": 634, + "content": "This code is checking the value of the variable `model_name` and performing different operations based on its value. For example, if `model_name` equals \"SlowFast\", it changes directory to `./data/k400`, downloads a tar file containing data for pre-training a SlowFast model, and then extracts the tar file. Similarly, if `model_name` is \"BMN\" or \"TokenShiftVisionTransformer\", different operations are carried out, such as downloading and extracting necessary data files for pre-training these models. The code uses various commands like `pushd`, `popd`, `wget`, and `tar` to manipulate directories and files.", + "type": "comment" + }, + "8578": { + "file_id": 634, + "content": " # download pretrained weights\n wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"PoseC3D\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir posec3d_data\n cd posec3d_data\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PoseC3D_data_small.tar\n tar -xf PoseC3D_data_small.tar\n popd\n elif [ ${model_name} == \"YOWO\" ]; then\n # pretrain lite train data\n pushd ./data\n wget -nc https://videotag.bj.bcebos.com/Data/ucf-24-lite.zip\n unzip -qo ucf-24-lite.zip\n pushd ./ucf24\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams\n popd\n else\n echo \"Not added into TIPC yet.\"", + "type": "code", + "location": "/test_tipc/prepare.sh:128-149" + }, + "8579": { + "file_id": 634, + "content": "This script downloads pre-trained model weights and preprocesses training data for specific models. For ViT_base, it downloads the weight file. For PoseC3D, it downloads and unzips a small dataset. For YOWO, it downloads the necessary datasets and YOWO's pre-trained model at a specific epoch. 
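All of these branches share one download pattern: pushd/popd scope the directory change, wget -nc skips files that are already present so the step can be re-run safely, and tar unpacks the archive. The PoseC3D branch reduces to the sketch below; mkdir -p replaces the plain mkdir only to keep the sketch re-runnable.

```bash
pushd ./data
mkdir -p posec3d_data && cd posec3d_data
wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PoseC3D_data_small.tar
tar -xf PoseC3D_data_small.tar
popd
```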
Models not in TIPC are not processed.", + "type": "comment" + }, + "8580": { + "file_id": 634, + "content": " fi\nelif [ ${MODE} = \"whole_train_whole_infer\" ];then\n if [ ${model_name} == \"PP-TSM\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"PP-TSN\" ]; then\n # pretrain whole train data", + "type": "code", + "location": "/test_tipc/prepare.sh:150-168" + }, + "8581": { + "file_id": 634, + "content": "Checking if MODE is \"whole_train_whole_infer\". If true, it determines the model (PP-TSM or PP-TSN) and performs specific actions for pretraining with whole training data. For PP-TSM, downloads Kinetics400 data, extracts raw frames, downloads annotations, and gets pretrained weights. For PP-TSN, similar steps are followed but with different data and models.", + "type": "comment" + }, + "8582": { + "file_id": 634, + "content": " pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n # pretrain whole train data\n pushd data/fsd10\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"STGCN\" ]; then\n # pretrain whole train data\n pushd data/fsd10", + "type": "code", + "location": "/test_tipc/prepare.sh:169-188" + }, + "8583": { + "file_id": 634, + "content": "This code downloads pre-trained model weights and data for different models, such as ResNet50_vd_ssld_v2, AGCN, and STGCN. It also downloads annotations and train/validation lists from specific URLs. 
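For the frame-based models, the video download is followed by a frame-extraction step and the frame annotation lists. The commands below are the ones quoted above, run from inside ./data/k400, with ${python} shortened to a plain interpreter name for readability.

```bash
python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4   # extract frames from the downloaded videos
wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list
wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list
```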
The code uses pushd and popd commands to change directories temporarily and wget command to perform non-checking downloads (nc) of files.", + "type": "comment" + }, + "8584": { + "file_id": 634, + "content": " wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"TSM\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate", + "type": "code", + "location": "/test_tipc/prepare.sh:189-205" + }, + "8585": { + "file_id": 634, + "content": "The code checks the value of 'model_name', and if it's \"TSM\", it performs specific actions. It changes to the directory ./data/k400, downloads train and val lists from the URLs provided, then uses bash scripts to download data based on the list. Afterwards, it extracts frames from video files using the 'extract_rawframes.py' script at a certain level and with specific extensions. It also downloads annotations for training and validation sets. Finally, it changes back to the previous directory and downloads pretrained ResNet50 weights.", + "type": "comment" + }, + "8586": { + "file_id": 634, + "content": " elif [ ${model_name} == \"TSN\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list", + "type": "code", + "location": "/test_tipc/prepare.sh:206-223" + }, + "8587": { + "file_id": 634, + "content": "The code checks if the model name is \"TSN\" or \"TimeSformer\" and then downloads corresponding data and pretrained weights for each model. It pushes to a directory, downloads training and validation lists, extracts frames from videos, downloads annotations, and finally pops out of the directory. 
This script appears to be part of a larger program that prepares data for specific models in a machine learning or deep learning context.", + "type": "comment" + }, + "8588": { + "file_id": 634, + "content": " wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n # pretrain whole train data\n pushd data/yt8m\n mkdir frame\n cd frame\n ## download & decompression training data\n curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python\n curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python\n ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple", + "type": "code", + "location": "/test_tipc/prepare.sh:224-241" + }, + "8589": { + "file_id": 634, + "content": "This code is preparing the data and environment for a specific model named \"AttentionLSTM\". It downloads links for training and validation datasets, fetches annotations, and gets pre-trained weights. The model requires TensorFlow GPU version 1.14.0 and uses YT8M dataset partitioned into 2 parts - train and validate.", + "type": "comment" + }, + "8590": { + "file_id": 634, + "content": " cd ..\n ${python} tf2pkl.py ./frame ./pkl_frame/\n ls pkl_frame/train*.pkl > train.list # 将train*.pkl的路径写入train.list\n ls pkl_frame/validate*.pkl > val.list # 将validate*.pkl的路径写入val.list\n ${python} split_yt8m.py train.list # 拆分每个train*.pkl变成多个train*_split*.pkl\n ${python} split_yt8m.py val.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl\n ls pkl_frame/train*_split*.pkl > train.list # 将train*_split*.pkl的路径重新写入train.list\n ls pkl_frame/validate*_split*.pkl > val.list # 将validate*_split*.pkl的路径重新写入val.list\n popd\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list", + "type": "code", + "location": "/test_tipc/prepare.sh:242-261" + }, + "8591": { + "file_id": 634, + "content": "The code is preparing the Kinetics400 dataset for PaddleVideo's SlowFast model by downloading and splitting the train and validation data. 
It changes directory, uses tf2pkl.py to convert frame files into pkl format, splits the pkl files using split_yt8m.py, and finally writes the file paths into train.list and val.list.", + "type": "comment" + }, + "8592": { + "file_id": 634, + "content": " wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain whole train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd\n else\n echo \"Not added into TIPC yet.\"\n fi\nelif [ ${MODE} = \"lite_train_whole_infer\" ];then\n if [ ${model_name} == \"PP-TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate", + "type": "code", + "location": "/test_tipc/prepare.sh:262-285" + }, + "8593": { + "file_id": 634, + "content": "Code handles different scenarios based on the model_name and MODE variables. For BMN, it pretrains using whole train data by downloading necessary files from specified URLs. For PP-TSM in lite_train_whole_infer scenario, it pretrains using lite train data by downloading a tar file and pretrained weights. If none of the conditions match, it displays \"Not added into TIPC yet.\" message.", + "type": "comment" + }, + "8594": { + "file_id": 634, + "content": " elif [ ${model_name} == \"PP-TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n # pretrain lite train data\n pushd data/fsd10\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"STGCN\" ]; then\n # pretrain lite train data\n pushd data/fsd10\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400", + "type": "code", + "location": "/test_tipc/prepare.sh:286-308" + }, + "8595": { + "file_id": 634, + "content": "This code checks the value of the `model_name` variable and performs different actions accordingly. If it's \"PP-TSN\", it downloads pretrained weights and lite train data for PP-TSN model. If it's \"AGCN\" or \"STGCN\", it downloads lite train data. And if it's \"TSM\", it downloads lite train data for TSM model. 
It uses pushd/popd to change directories and wget to download files from specified URLs.", + "type": "comment" + }, + "8596": { + "file_id": 634, + "content": " wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https:/", + "type": "code", + "location": "/test_tipc/prepare.sh:309-329" + }, + "8597": { + "file_id": 634, + "content": "This code downloads pre-trained model files for different models. For \"PaddleVideo/test_tipc/prepare.sh\", it checks the value of $model_name and proceeds accordingly. It pushes to a specific data folder, then downloads rawframes or videos depending on the model type. Finally, it retrieves the pre-trained weights for each model from an HTTPS URL, handling network errors with -nc option.", + "type": "comment" + }, + "8598": { + "file_id": 634, + "content": "/paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n # pretrain lite train data\n pushd data/yt8m\n ## download & decompression training data\n wget -nc https://videotag.bj.bcebos.com/Data/yt8m_rawframe_small.tar\n tar -xf yt8m_rawframe_small.tar\n ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple\n ${python} tf2pkl.py ./frame ./pkl_frame/\n ls pkl_frame/train*.pkl > train_small.list # 将train*.pkl的路径写入train_small.list\n ls pkl_frame/validate*.pkl > val_small.list # 将validate*.pkl的路径写入val_small.list\n ${python} split_yt8m.py train_small.list # 拆分每个train*.pkl变成多个train*_split*.pkl\n ${python} split_yt8m.py val_small.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl\n ls pkl_frame/train*_split*.pkl > train_small.list # 将train*_split*.pkl的路径重新写入train_small.list\n ls pkl_frame/validate*_split*.pkl > val_small.list # 将validate*_split*.pkl的路径重新写入val_small.list", + "type": "code", + "location": "/test_tipc/prepare.sh:329-345" + }, + "8599": { + "file_id": 634, + "content": "This code snippet downloads and prepares the dataset for training an AttentionLSTM model. 
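For reference, here is the same YT8M preprocessing chain with the inline Chinese comments rendered in English; the commands are the ones quoted above, with ${python} shortened to python.

```bash
python tf2pkl.py ./frame ./pkl_frame/              # convert the downloaded frame records to .pkl
ls pkl_frame/train*.pkl > train_small.list         # write the train*.pkl paths into train_small.list
ls pkl_frame/validate*.pkl > val_small.list        # write the validate*.pkl paths into val_small.list
python split_yt8m.py train_small.list              # split each train*.pkl into several train*_split*.pkl
python split_yt8m.py val_small.list                # split each validate*.pkl into several validate*_split*.pkl
ls pkl_frame/train*_split*.pkl > train_small.list  # rewrite train_small.list with the split files
ls pkl_frame/validate*_split*.pkl > val_small.list # rewrite val_small.list with the split files
```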
It first checks out the data from a specific URL, installs TensorFlow version 1.14.0, converts the raw video frames to pickle format, splits the data into training and validation sets, and finally lists the resulting files in train_small.list and val_small.list.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/86.json b/docs/data/86.json new file mode 100644 index 000000000..846355078 --- /dev/null +++ b/docs/data/86.json @@ -0,0 +1,545 @@ +{ + "8600": { + "file_id": 634, + "content": " popd\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd\n else\n echo \"Not added into TIPC yet.\"\n fi\nelif [ ${MODE} = \"whole_infer\" ];then\n if [ ${model_name} = \"PP-TSM\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate\n elif [ ${model_name} = \"PP-TSN\" ]; then", + "type": "code", + "location": "/test_tipc/prepare.sh:346-370" + }, + "8601": { + "file_id": 634, + "content": "This code is checking the model_name and performing specific actions based on its value. If model_name is \"SlowFast\", it downloads pretrain lite train data for that model. If model_name is \"BMN\", it downloads required datasets for that model. For other model names, it prints a message indicating they are not added to TIPC yet. In the case of MODE being \"whole_infer\", it performs specific actions based on model_name such as downloading pretrained weights for PP-TSM and PP-TSN models.", + "type": "comment" + }, + "8602": { + "file_id": 634, + "content": " # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams --no-check-certificate\n elif [ ${model_name} == \"STGCN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSM\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then", + "type": "code", + "location": "/test_tipc/prepare.sh:371-385" + }, + "8603": { + "file_id": 634, + "content": "This code is checking the value of 'model_name' variable and downloading the corresponding pretrained weights for different models using 'wget' command. 
If model name matches, it retrieves the respective model's file from a specific URL and saves it in the './data' directory without certificate checks.", + "type": "comment" + }, + "8604": { + "file_id": 634, + "content": " # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams --no-check-certificate\n elif [ ${model_name} == \"SlowFast\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams --no-check-certificate\n elif [ ${model_name} == \"BMN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams --no-check-certificate\n else\n echo \"Not added into TIPC yet.\"\n fi\nfi\nif [ ${MODE} = \"benchmark_train\" ];then\n ${python} -m pip install -r requirements.txt\n if [ ${model_name} == \"PP-TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400", + "type": "code", + "location": "/test_tipc/prepare.sh:386-406" + }, + "8605": { + "file_id": 634, + "content": "This code downloads pre-trained model weights depending on the specified model name. It uses wget to retrieve the files from specific URLs and saves them in the \"./data\" directory. The code also checks if the MODE is \"benchmark_train\" and installs necessary packages using pip if so. Additionally, it changes the current directory to \"./data/k400\" to prepare for pre-training the Lite train data of PP-TSM model.", + "type": "comment" + }, + "8606": { + "file_id": 634, + "content": " wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"PP-TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n echo \"Not added into TIPC yet.\"\n elif [ ${model_name} == \"STGCN\" ]; then\n echo \"Not added into TIPC yet.\"\n elif [ ${model_name} == \"TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar", + "type": "code", + "location": "/test_tipc/prepare.sh:407-427" + }, + "8607": { + "file_id": 634, + "content": "Code snippet checks the value of `model_name` and performs specific actions based on its value. For example, if it is \"PaddleVideo/ResNet50\", it downloads pretrained weights for that model. 
If `model_name` is not recognized, it prints a message saying it's not added to TIPC yet.", + "type": "comment" + }, + "8608": { + "file_id": 634, + "content": " tar -xf k400_rawframes_small.tar\n # download datalist for fleet benchmark\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/train_fleet_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/val_fleet_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar", + "type": "code", + "location": "/test_tipc/prepare.sh:428-446" + }, + "8609": { + "file_id": 634, + "content": "This code is downloading pre-trained weights and data for PaddleVideo models, such as ResNet50, TSN, and TimeSformer. It checks the model_name and performs specific tasks accordingly: unzipping tar files, downloading lists of frames, and retrieving pretrained weights from specified URLs.", + "type": "comment" + }, + "8610": { + "file_id": 634, + "content": " tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n echo \"Not added into TIPC yet.\"\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd", + "type": "code", + "location": "/test_tipc/prepare.sh:447-468" + }, + "8611": { + "file_id": 634, + "content": "Code snippet checks the model name and performs specific actions for each. If model is \"k400_videos_small\", it downloads pre-trained weights. If model is \"SlowFast\", it downloads lite train data. For \"BMN\", it downloads BMN training data, including annotations and JSON files. 
No action is taken for \"AttentionLSTM\" as it's not added to TIPC yet.", + "type": "comment" + }, + "8612": { + "file_id": 634, + "content": " elif [ ${model_name} == \"VideoSwin\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams --no-check-certificate\n else\n echo \"Not added into TIPC yet.\"\n fi\nfi\nif [ ${MODE} = \"klquant_whole_infer\" ]; then\n if [ ${model_name} = \"PP-TSM\" ]; then\n # download lite data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download inference model\n mkdir ./inference\n pushd ./inference\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n unzip ppTSM.zip\n popd\n else\n echo \"Not added into TIPC yet.\"\n fi", + "type": "code", + "location": "/test_tipc/prepare.sh:469-497" + }, + "8613": { + "file_id": 634, + "content": "This code checks if the model is VideoSwin or PP-TSM. If VideoSwin, it downloads pretrain lite train data and pretrained weights. If PP-TSM, it downloads lite data and inference model. Other models are not added to TIPC yet.", + "type": "comment" + }, + "8614": { + "file_id": 634, + "content": "fi\nif [ ${MODE} = \"cpp_infer\" ];then\n # install required packages\n apt-get update\n apt install libavformat-dev\n apt install libavcodec-dev\n apt install libswresample-dev\n apt install libswscale-dev\n apt install libavutil-dev\n apt install libsdl1.2-dev\n apt-get install ffmpeg\n if [ ${model_name} = \"PP-TSM\" ]; then\n # download pretrained weights\n wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate\n # export inference model\n ${python} tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml -p data/ppTSM_k400_uniform.pdparams -o ./inference/ppTSM\n elif [ ${model_name} = \"PP-TSN\" ]; then\n # download pretrained weights\n wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate\n # export inference model\n ${python} tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml -p data/ppTSN_k400.pdparams -o ./inference/ppTSN", + "type": "code", + "location": "/test_tipc/prepare.sh:498-520" + }, + "8615": { + "file_id": 634, + "content": "This code installs necessary packages, downloads pre-trained model weights for either PP-TSM or PP-TSN, and exports the inference models for these two models. 
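The export step for PP-TSM, as quoted in the cpp_infer branch, boils down to the call below; the exported inference files land under ./inference/ppTSM (${python} shortened to python).

```bash
python tools/export_model.py \
    -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
    -p data/ppTSM_k400_uniform.pdparams \
    -o ./inference/ppTSM
```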
This is typically done before running inference on new data.", + "type": "comment" + }, + "8616": { + "file_id": 634, + "content": " else\n echo \"Not added into TIPC now.\"\n fi\nfi\nif [ ${MODE} = \"serving_infer_python\" ];then\n if [[ ${model_name} == \"PP-TSM\" ]];then\n # prepare lite infer data for serving\n pushd ./data\n mkdir python_serving_infer_video_dir\n cp ./example.avi python_serving_infer_video_dir/\n popd\n # prepare inference model\n mkdir ./inference\n pushd ./inference\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n unzip ppTSM.zip\n popd\n elif [[ ${model_name} == \"PP-TSN\" ]];then\n # prepare lite infer data for serving\n pushd ./data\n mkdir python_serving_infer_video_dir\n cp ./example.avi python_serving_infer_video_dir/\n popd\n # prepare inference model\n mkdir ./inference\n pushd ./inference\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip --no-check-certificate\n unzip ppTSN.zip\n popd\n else\n echo \"Not added into TIPC now.\"", + "type": "code", + "location": "/test_tipc/prepare.sh:521-552" + }, + "8617": { + "file_id": 634, + "content": "This code checks if the model_name is either \"PP-TSM\" or \"PP-TSN\". If it matches, it prepares lite infer data and downloads the corresponding inference model for serving. If not, it displays a message indicating that the model is not added into TIPC now.", + "type": "comment" + }, + "8618": { + "file_id": 634, + "content": " fi\nfi\nif [ ${MODE} = \"paddle2onnx_infer\" ];then\n # install paddle2onnx\n python_name_list=$(func_parser_value \"${lines[2]}\")\n IFS='|'\n array=(${python_name_list})\n python_name=${array[0]}\n ${python_name} -m pip install paddle2onnx\n ${python_name} -m pip install onnxruntime==1.9.0\n if [ ${model_name} = \"PP-TSM\" ]; then\n echo \"Not added into TIPC now.\"\n elif [ ${model_name} = \"PP-TSN\" ]; then\n mkdir -p ./inference\n wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip\n # unzip inference model\n pushd ./inference\n unzip ppTSN.zip\n popd\n else\n echo \"Not added into TIPC now.\"\n fi\nfi", + "type": "code", + "location": "/test_tipc/prepare.sh:553-577" + }, + "8619": { + "file_id": 634, + "content": "This code snippet checks the current mode and performs specific actions accordingly. If the mode is \"paddle2onnx_infer\", it installs paddle2onnx and onnxruntime with a specified Python interpreter. For the \"PP-TSM\" model, it displays a message indicating that it's not added to TIPC. For the \"PP-TSN\" model, it downloads and unzips the inference model from a specific URL. 
For any other model, it reports that the model has not been added to TIPC yet.", + "type": "comment" + }, + "8620": { + "file_id": 635, + "content": "/test_tipc/test_inference_cpp.sh", + "type": "filepath" + }, + "8621": { + "file_id": 635, + "content": "This code sets up PaddleVideo model inference, performs tests with MKLDNN or floating-point precision, iterates through thread settings and precisions, logs results, configures and builds PaddleVideo, sets OpenCV, CUDA, CUDNN directories, checks GPUID, and runs inference tests on a list of model directories.", + "type": "summary" + }, + "8622": { + "file_id": 635, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\nMODE=$2\ndataline=$(awk 'NR==1, NR==18{print}' $FILENAME)\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# parser cpp inference model\nmodel_name=$(func_parser_value \"${lines[1]}\")\nuse_opencv=$(func_parser_value \"${lines[2]}\")\ncpp_infer_model_dir_list=$(func_parser_value \"${lines[3]}\")\ncpp_infer_is_quant=$(func_parser_value \"${lines[4]}\")\n# parser cpp inference\ninference_cmd=$(func_parser_value \"${lines[5]}\")\ncpp_use_gpu_key=$(func_parser_key \"${lines[6]}\")\ncpp_use_gpu_list=$(func_parser_value \"${lines[6]}\")\ncpp_use_mkldnn_key=$(func_parser_key \"${lines[7]}\")\ncpp_use_mkldnn_list=$(func_parser_value \"${lines[7]}\")\ncpp_cpu_threads_key=$(func_parser_key \"${lines[8]}\")\ncpp_cpu_threads_list=$(func_parser_value \"${lines[8]}\")\ncpp_batch_size_key=$(func_parser_key \"${lines[9]}\")\ncpp_batch_size_list=$(func_parser_value \"${lines[9]}\")\ncpp_use_trt_key=$(func_parser_key \"${lines[10]}\")\ncpp_use_trt_list=$(func_parser_value \"${lines[10]}\")\ncpp_precision_key=$(func_parser_key \"${lines[11]}\")", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:1-29" + }, + "8623": { + "file_id": 635, + "content": "This script uses Bash to parse input file lines, extracting model information and inference parameters for C++ models. It sources a common function script and then proceeds to parse each line of the input file into various variables like model name, OpenCV usage, C++ inference model directory list, inference command, GPU/MKLDNN/CPU thread settings, etc. 
These parsed values are stored in different variables for further use.", + "type": "comment" + }, + "8624": { + "file_id": 635, + "content": "cpp_precision_list=$(func_parser_value \"${lines[11]}\")\ncpp_infer_model_key=$(func_parser_key \"${lines[12]}\")\ncpp_image_dir_key=$(func_parser_key \"${lines[13]}\")\ncpp_infer_img_dir=$(func_parser_value \"${lines[13]}\")\ncpp_infer_key1=$(func_parser_key \"${lines[14]}\")\ncpp_infer_value1=$(func_parser_value \"${lines[14]}\")\ncpp_benchmark_key=$(func_parser_key \"${lines[15]}\")\ncpp_benchmark_value=$(func_parser_value \"${lines[15]}\")\ncpp_infer_key2=$(func_parser_key \"${lines[16]}\")\ncpp_infer_value2=$(func_parser_value \"${lines[16]}\")\ncpp_infer_key3=$(func_parser_key \"${lines[17]}\")\ncpp_infer_value3=$(func_parser_value \"${lines[17]}\")\nLOG_PATH=\"./test_tipc/output/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_cpp.log\"\nfunction func_cpp_inference(){\n IFS='|'\n _script=$1\n _model_dir=$2\n _log_path=$3\n _img_dir=$4\n _flag_quant=$5\n # inference\n for use_gpu in ${cpp_use_gpu_list[*]}; do\n if [ ${use_gpu} = \"False\" ] || [ ${use_gpu} = \"cpu\" ]; then\n for use_mkldnn in ${cpp_use_mkldnn_list[*]}; do", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:30-58" + }, + "8625": { + "file_id": 635, + "content": "This code is setting up variables for running a PaddleVideo model inference using C++. It sets the precision list, infer model key, image directory key and value, and other keys and values required for the benchmarking process. The code also creates a log path for storing results of the C++ inference and prepares to loop through possible GPU usage and MKLDNN configurations.", + "type": "comment" + }, + "8626": { + "file_id": 635, + "content": " if [ ${use_mkldnn} = \"False\" ] && [ ${_flag_quant} = \"True\" ]; then\n continue\n fi\n for threads in ${cpp_cpu_threads_list[*]}; do\n for batch_size in ${cpp_batch_size_list[*]}; do\n precision=\"fp32\"\n if [ ${use_mkldnn} = \"False\" ] && [ ${_flag_quant} = \"True\" ]; then\n precison=\"int8\"\n fi\n _save_log_path=\"${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${cpp_image_dir_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${cpp_benchmark_key}\" \"${cpp_benchmark_value}\")\n set_batchsize=$(func_set_params \"${cpp_batch_size_key}\" \"${batch_size}\")\n set_cpu_threads=$(func_set_params \"${cpp_cpu_threads_key}\" \"${threads}\")", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:59-72" + }, + "8627": { + "file_id": 635, + "content": "This code checks whether MKLDNN is disabled while the quantization flag is set, and skips that combination. Otherwise, it iterates through different thread settings, batch sizes, and precisions to execute inference tests on CPU using MKLDNN (if enabled) or floating-point precision (default). 
Logs are saved with details of parameters used for each run.", + "type": "comment" + }, + "8628": { + "file_id": 635, + "content": " set_model_dir=$(func_set_params \"${cpp_infer_model_key}\" \"${_model_dir}\")\n set_infer_params1=$(func_set_params \"${cpp_infer_key1}\" \"${cpp_infer_value1}\")\n set_infer_params2=$(func_set_params \"${cpp_infer_key2}\" \"${cpp_infer_value2}\")\n set_infer_params3=$(func_set_params \"${cpp_infer_key3}\" \"${cpp_infer_value3}\")\n command=\"${_script} ${cpp_use_gpu_key}=${use_gpu} ${cpp_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"\n done\n done\n done\n elif [ ${use_gpu} = \"True\" ] || [ ${use_gpu} = \"gpu\" ]; then", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:73-85" + }, + "8629": { + "file_id": 635, + "content": "This code is iterating over different model names and configurations, setting various parameters such as GPU usage and thread count. It then executes a command to run inference on the model and saves the log file. The script checks the status of the execution and logs any errors or warnings for debugging purposes.", + "type": "comment" + }, + "8630": { + "file_id": 635, + "content": " for use_trt in ${cpp_use_trt_list[*]}; do\n for precision in ${cpp_precision_list[*]}; do\n if [[ ${_flag_quant} = \"False\" ]] && [[ ${precision} =~ \"int8\" ]]; then\n continue\n fi\n if [[ ${precision} =~ \"fp16\" || ${precision} =~ \"int8\" ]] && [ ${use_trt} = \"False\" ]; then\n continue\n fi\n if [[ ${use_trt} = \"False\" || ${precision} =~ \"int8\" ]] && [ ${_flag_quant} = \"True\" ]; then\n continue\n fi\n for batch_size in ${cpp_batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/cpp_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${cpp_image_dir_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${cpp_benchmark_key}\" \"${cpp_benchmark_value}\")\n set_batchsize=$(func_set_params \"${cpp_batch_size_key}\" \"${batch_size}\")", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:86-101" + }, + "8631": { + "file_id": 635, + "content": "The code snippet is performing nested loops to iterate over different combinations of TensorRT (TRT) usage and precision options. It checks specific conditions using if statements, such as avoiding quantized precision with non-quantized flag set or excluding certain combinations based on TRT and precision values. 
Finally, it sets variables for the log path, input data parameters, benchmark value, and batch size before potentially executing further code within these loops.", + "type": "comment" + }, + "8632": { + "file_id": 635, + "content": " set_tensorrt=$(func_set_params \"${cpp_use_trt_key}\" \"${use_trt}\")\n set_precision=$(func_set_params \"${cpp_precision_key}\" \"${precision}\")\n set_model_dir=$(func_set_params \"${cpp_infer_model_key}\" \"${_model_dir}\")\n set_infer_params1=$(func_set_params \"${cpp_infer_key1}\" \"${cpp_infer_value1}\")\n set_infer_params2=$(func_set_params \"${cpp_infer_key2}\" \"${cpp_infer_value2}\")\n set_infer_params3=$(func_set_params \"${cpp_infer_key3}\" \"${cpp_infer_value3}\")\n command=\"${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:102-112" + }, + "8633": { + "file_id": 635, + "content": "The code is setting parameters for a TensorRT inference script. It assigns values to various keys and directories before executing the script and saving the output log file. The last status of the command execution is checked, and the log file is displayed if no issues occurred.", + "type": "comment" + }, + "8634": { + "file_id": 635, + "content": " done\n done\n done\n else\n echo \"Does not support hardware other than CPU and GPU Currently!\"\n fi\n done\n}\ncd deploy/cpp_infer\nif [ ${use_opencv} = \"True\" ]; then\n if [ -d \"opencv-3.4.7/opencv3/\" ] && [ $(md5sum opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = \"faa2b5950f8bee3f03118e600c74746a\" ];then\n echo \"################### build opencv skipped ###################\"\n else\n echo \"################### building opencv ###################\"\n rm -rf opencv-3.4.7.tar.gz opencv-3.4.7/\n wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/opencv-3.4.7.tar.gz\n tar -xf opencv-3.4.7.tar.gz\n cd opencv-3.4.7/\n install_path=$(pwd)/opencv3\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DCMAKE_INSTALL_PREFIX=${install_path} \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DBUILD_SHARED_LIBS=OFF \\\n -DWITH_IPP=OFF \\\n -DBUILD_IPP_IW=OFF \\", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:114-146" + }, + "8635": { + "file_id": 635, + "content": "The code checks if the current hardware supports CPU and GPU, and if not, it prints a message. If the OpenCV library is missing or outdated, it downloads the latest version and builds it. 
It then sets up the installation path for the built OpenCV library.", + "type": "comment" + }, + "8636": { + "file_id": 635, + "content": " -DWITH_LAPACK=OFF \\\n -DWITH_EIGEN=OFF \\\n -DCMAKE_INSTALL_LIBDIR=lib64 \\\n -DWITH_ZLIB=ON \\\n -DBUILD_ZLIB=ON \\\n -DWITH_JPEG=ON \\\n -DBUILD_JPEG=ON \\\n -DWITH_PNG=ON \\\n -DBUILD_PNG=ON \\\n -DWITH_TIFF=ON \\\n -DBUILD_TIFF=ON \\\n -DWITH_FFMPEG=ON\n make -j\n make install\n cd ../\n echo \"################### building opencv finished ###################\"\n fi\nfi\nif [ !-d \"paddle_inference\" ]; then\n echo \"################### download inference lib skipped ###################\"\nelse\n echo \"################### downloading inference lib ###################\"\n wget -nc https://paddle-inference-lib.bj.bcebos.com/2.1.1-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddle_inference.tgz\n tar -xf paddle_inference.tgz\n echo \"################### downloading inference lib finished ###################\"\nfi\necho \"################### building PaddleVideo demo ####################\"\nif [ ${use_opencv} = \"True\" ]; then", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:147-178" + }, + "8637": { + "file_id": 635, + "content": "This code sets various CMake flags to configure the build process, then proceeds with making and installing the required libraries. It checks if a directory exists, downloads necessary files if needed, and finally starts building the PaddleVideo demo.", + "type": "comment" + }, + "8638": { + "file_id": 635, + "content": " OPENCV_DIR=$(pwd)/opencv-3.4.7/opencv3\nelse\n OPENCV_DIR=''\nfi\nLIB_DIR=$(pwd)/paddle_inference\nCUDA_LIB_DIR=$(dirname `find /usr -name libcudart.so`)\nCUDNN_LIB_DIR=$(dirname `find /usr -name libcudnn.so`)\nBUILD_DIR=build\nrm -rf ${BUILD_DIR}\nmkdir ${BUILD_DIR}\ncd ${BUILD_DIR}\ncmake .. \\\n -DPADDLE_LIB=${LIB_DIR} \\\n -DWITH_MKL=ON \\\n -DWITH_GPU=OFF \\\n -DWITH_STATIC_LIB=OFF \\\n -DWITH_TENSORRT=OFF \\\n -DOPENCV_DIR=${OPENCV_DIR} \\\n -DCUDNN_LIB=${CUDNN_LIB_DIR} \\\n -DCUDA_LIB=${CUDA_LIB_DIR} \\\n -DTENSORRT_DIR=${TENSORRT_DIR} \\\nmake -j\ncd ../../../\necho \"################### building PaddleVideo demo finished ###################\"\n# set cuda device\nGPUID=$2\nif [ ${#GPUID} -le 0 ];then\n env=\" \"\nelse\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\nfi\nset CUDA_VISIBLE_DEVICES\neval $env\necho \"################### running test ###################\"\nexport Count=0\nIFS=\"|\"\ninfer_quant_flag=(${cpp_infer_is_quant})\nfor infer_model in ${cpp_infer_model_dir_list[*]}; do\n #run inference\n is_quant=${infer_quant_flag[Count]}", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:179-225" + }, + "8639": { + "file_id": 635, + "content": "This code is configuring and building PaddleVideo, setting up OpenCV, CUDA, and CUDNN directories, and preparing for running test inference. It also checks if GPUID is set and sets the CUDA_VISIBLE_DEVICES environment variable accordingly. Finally, it loops through a list of model directories to run inference tests.", + "type": "comment" + }, + "8640": { + "file_id": 635, + "content": " func_cpp_inference \"${inference_cmd}\" \"${infer_model}\" \"${LOG_PATH}\" \"${cpp_infer_img_dir}\" ${is_quant}\n Count=$(($Count + 1))\ndone", + "type": "code", + "location": "/test_tipc/test_inference_cpp.sh:226-228" + }, + "8641": { + "file_id": 635, + "content": "This code snippet is calling a function \"func_cpp_inference\" to execute inference commands, incrementing the Count variable on each iteration of a loop. 
The function is called with input parameters for the command, model path, log path, image directory, and a quantization flag.", + "type": "comment" + }, + "8642": { + "file_id": 636, + "content": "/test_tipc/test_paddle2onnx.sh", + "type": "filepath" + }, + "8643": { + "file_id": 636, + "content": "The code reads a file for model details, extracts lines with common functions, sets up inference directories and parameters, enables ONNX checker, converts using paddle2onnx, saves logs, runs inference, and checks status for the \"func_paddle2onnx\" function.", + "type": "summary" + }, + "8644": { + "file_id": 636, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\nMODE=$2\ndataline=$(cat ${FILENAME})\nlines=(${dataline})\n# common params\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\n# parser params\ndataline=$(awk 'NR==1, NR==14{print}' $FILENAME)\nIFS=$'\\n'\nlines=(${dataline})\n# parser paddle2onnx\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\npadlle2onnx_cmd=$(func_parser_value \"${lines[3]}\")\ninfer_model_dir_key=$(func_parser_key \"${lines[4]}\")\ninfer_model_dir_value=$(func_parser_value \"${lines[4]}\")\nmodel_filename_key=$(func_parser_key \"${lines[5]}\")\nmodel_filename_value=$(func_parser_value \"${lines[5]}\")\nparams_filename_key=$(func_parser_key \"${lines[6]}\")\nparams_filename_value=$(func_parser_value \"${lines[6]}\")\nsave_file_key=$(func_parser_key \"${lines[7]}\")\nsave_file_value=$(func_parser_value \"${lines[7]}\")\nopset_version_key=$(func_parser_key \"${lines[8]}\")\nopset_version_value=$(func_parser_value \"${lines[8]}\")\nenable_onnx_checker_key=$(func_parser_key \"${lines[9]}\")", + "type": "code", + "location": "/test_tipc/test_paddle2onnx.sh:1-32" + }, + "8645": { + "file_id": 636, + "content": "Code is reading a file, parsing specific lines to extract model name, python path, and other parameters for paddle2onnx conversion. It's using common functions from \"common_func.sh\" and \"awk\" command for line extraction.", + "type": "comment" + }, + "8646": { + "file_id": 636, + "content": "enable_onnx_checker_value=$(func_parser_value \"${lines[9]}\")\n# parser onnx inference\ninference_py=$(func_parser_value \"${lines[10]}\")\nconfig_key=$(func_parser_key \"${lines[11]}\")\nconfig_value=$(func_parser_value \"${lines[11]}\")\nmodel_key=$(func_parser_key \"${lines[12]}\")\ninput_file_key=$(func_parser_key \"${lines[13]}\")\ninput_file_value=$(func_parser_value \"${lines[13]}\")\nLOG_PATH=\"./log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_paddle2onnx.log\"\nfunction func_paddle2onnx(){\n IFS='|'\n _script=$1\n # paddle2onnx\n _save_log_path=\"${LOG_PATH}/paddle2onnx_infer_cpu.log\"\n set_dirname=$(func_set_params \"${infer_model_dir_key}\" \"${infer_model_dir_value}\")\n set_model_filename=$(func_set_params \"${model_filename_key}\" \"${model_filename_value}\")\n set_params_filename=$(func_set_params \"${params_filename_key}\" \"${params_filename_value}\")\n set_save_model=$(func_set_params \"${save_file_key}\" \"${save_file_value}\")\n set_opset_version=$(func_set_params \"${opset_version_key}\" \"${opset_version_value}\")", + "type": "code", + "location": "/test_tipc/test_paddle2onnx.sh:33-58" + }, + "8647": { + "file_id": 636, + "content": "Creating function \"func_paddle2onnx\" with arguments _script, setting up log path and directories for paddle2onnx inference. 
It then sets parameters such as infer_model_dir_key, model_filename_key, params_filename_key, save_file_key, and opset_version_key.", + "type": "comment" + }, + "8648": { + "file_id": 636, + "content": " set_enable_onnx_checker=$(func_set_params \"${enable_onnx_checker_key}\" \"${enable_onnx_checker_value}\")\n trans_log=\"${LOG_PATH}/trans_model.log\"\n trans_model_cmd=\"${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} > ${trans_log} 2>&1 \"\n eval $trans_model_cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${trans_model_cmd}\" \"${status_log}\" \"${model_name}\"\n # python inference\n set_gpu=$(func_set_params \"${use_gpu_key}\" \"${use_gpu_value}\")\n set_model_dir=$(func_set_params \"${model_key}\" \"${save_file_value}\")\n set_input_file=$(func_set_params \"${input_file_key}\" \"${input_file_value}\")\n set_config=$(func_set_params \"${config_key}\" \"${config_value}\")\n infer_model_cmd=\"${python} ${inference_py} ${set_config} ${set_input_file} ${set_model_dir} > ${_save_log_path} 2>&1 \"\n eval $infer_model_cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${infer_model_cmd}\" \"${status_log}\" \"${model_name}\"", + "type": "code", + "location": "/test_tipc/test_paddle2onnx.sh:59-73" + }, + "8649": { + "file_id": 636, + "content": "The code sets enable_onnx_checker and uses it to execute paddle2onnx conversion, saves the log. Then, it runs inference using Python and saves the status check log.", + "type": "comment" + }, + "8650": { + "file_id": 636, + "content": "}\necho \"################### run test ###################\"\nexport Count=0\nIFS=\"|\"\nfunc_paddle2onnx", + "type": "code", + "location": "/test_tipc/test_paddle2onnx.sh:74-81" + }, + "8651": { + "file_id": 636, + "content": "This code segment is running a test for the function \"func_paddle2onnx\" by exporting Count variable, setting IFS to \"|\", and echoing a message.", + "type": "comment" + }, + "8652": { + "file_id": 637, + "content": "/test_tipc/test_ptq_inference_python.sh", + "type": "filepath" + }, + "8653": { + "file_id": 637, + "content": "The code reads parameters, separates configurations, and executes inference tests on different GPUs/CPUs for batch sizes. 
It sets up a loop for PaddleVideo model inference, handles hardware configurations, prepares settings for exporting models, logs results, and calls the \"func_inference\" function.", + "type": "summary" + }, + "8654": { + "file_id": 637, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\n# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer']\nMODE=$2\ndataline=$(awk 'NR==1, NR==32{print}' $FILENAME)\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# The training params\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\nuse_gpu_key=$(func_parser_key \"${lines[3]}\")\nuse_gpu_value=$(func_parser_value \"${lines[3]}\")\nquant_config_file_key=$(func_parser_key \"${lines[4]}\")\nquant_config_file_value=$(func_parser_value \"${lines[4]}\")\nmodel_path_key=$(func_parser_key \"${lines[5]}\")\nmodel_path_value=$(func_parser_value \"${lines[5]}\")\noutput_dir_key=$(func_parser_key \"${lines[6]}\")\noutput_dir_value=$(func_parser_value \"${lines[6]}\")\ndata_dir_key=$(func_parser_key \"${lines[7]}\")\ndata_dir_value=$(func_parser_value \"${lines[7]}\")\ndata_anno_key=$(func_parser_key \"${lines[8]}\")\ndata_anno_value=$(func_parser_value \"${lines[8]}\")\nbatch_num_key=$(func_parser_key \"${lines[9]}\")", + "type": "code", + "location": "/test_tipc/test_ptq_inference_python.sh:1-29" + }, + "8655": { + "file_id": 637, + "content": "The code reads a file, parses parameters for model name, Python version, GPU usage, quantization configuration file, model path, output directory, data directory, data annotation file, and batch numbers. It uses awk to extract specific lines from the file and functions defined in common_func.sh for parameter extraction. The MODE variable selects which of the supported test tasks (such as lite_train_lite_infer or whole_infer) is performed.", + "type": "comment" + }, + "8656": { + "file_id": 637, + "content": "batch_num_value=$(func_parser_value \"${lines[9]}\")\nquant_batch_size_key=$(func_parser_key \"${lines[10]}\")\nquant_batch_size_value=$(func_parser_value \"${lines[10]}\")\n# parser trainer\ntrain_py=$(func_parser_value \"${lines[13]}\")\n# parser inference\ninference_py=$(func_parser_value \"${lines[16]}\")\nuse_gpu_key=$(func_parser_key \"${lines[17]}\")\nuse_gpu_list=$(func_parser_value \"${lines[17]}\")\ninfer_config_file_key=$(func_parser_key \"${lines[18]}\")\ninfer_config_file_value=$(func_parser_value \"${lines[18]}\")\ninfer_batch_size_key=$(func_parser_key \"${lines[19]}\")\ninfer_batch_size_list=$(func_parser_value \"${lines[19]}\")\ninfer_model_key=$(func_parser_key \"${lines[20]}\")\ninfer_model_value=$(func_parser_value \"${lines[20]}\")\ninfer_params_key=$(func_parser_key \"${lines[21]}\")\ninfer_params_value=$(func_parser_value \"${lines[21]}\")\ninfer_video_key=$(func_parser_key \"${lines[22]}\")\ninfer_video_dir=$(func_parser_value \"${lines[22]}\")\nbenchmark_key=$(func_parser_key \"${lines[23]}\")\nbenchmark_value=$(func_parser_value \"${lines[23]}\")", + "type": "code", + "location": "/test_tipc/test_ptq_inference_python.sh:30-52" + }, + "8657": { + "file_id": 637, + "content": "The code retrieves values and keys from a configuration file, storing them in variables for later use. 
It separates trainer and inference configurations, including GPU usage, inferential model parameters, video directory path, and benchmark options.", + "type": "comment" + }, + "8658": { + "file_id": 637, + "content": "function func_inference(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n _log_path=$4\n _img_dir=$5\n # inference\n for use_gpu in ${use_gpu_list[*]}; do\n # cpu\n if [ ${use_gpu} = \"False\" ] || [ ${use_gpu} = \"cpu\" ]; then\n for batch_size in ${infer_batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/python_infer_cpu_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${infer_video_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")\n set_batchsize=$(func_set_params \"${infer_batch_size_key}\" \"${batch_size}\")\n set_model_file_path=$(func_set_params \"${infer_model_key}\" \"${infer_model_value}\")\n set_params_file_path=$(func_set_params \"${infer_params_key}\" \"${infer_params_value}\")\n set_config_file_path=$(func_set_params \"${infer_config_file_key}\" \"${infer_config_file_value}\")\n command=\"${_", + "type": "code", + "location": "/test_tipc/test_ptq_inference_python.sh:55-74" + }, + "8659": { + "file_id": 637, + "content": "This function executes inference on different GPUs and CPUs for various batch sizes. It sets log paths, parameters, model file path, params file path, and config file path using helper functions. The script performs inference using Python and logs the results.", + "type": "comment" + }, + "8660": { + "file_id": 637, + "content": "python} ${_script} ${use_gpu_key}=${use_gpu} ${set_config_file_path} ${set_model_file_path} ${set_params_file_path} ${set_batchsize} ${set_infer_data} ${set_benchmark} > ${_save_log_path} 2>&1 \"\n # echo $command\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"\n done\n # gpu\n elif [ ${use_gpu} = \"True\" ] || [ ${use_gpu} = \"gpu\" ]; then\n for batch_size in ${infer_batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/python_infer_gpu_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${infer_video_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")\n set_batchsize=$(func_set_params \"${infer_batch_size_key}\" \"${batch_size}\")\n set_model_file_path=$(func_set_params \"${infer_model_key}\" \"${infer_model_value}\")", + "type": "code", + "location": "/test_tipc/test_ptq_inference_python.sh:74-88" + }, + "8661": { + "file_id": 637, + "content": "This code is running a loop to execute inference tests on different GPU configurations. It sets variables for batch size, input data path, and model file path. 
The output logs are saved into specific files for later analysis.", + "type": "comment" + }, + "8662": { + "file_id": 637, + "content": " set_params_file_path=$(func_set_params \"${infer_params_key}\" \"${infer_params_value}\")\n set_config_file_path=$(func_set_params \"${infer_config_file_key}\" \"${infer_config_file_value}\")\n command=\"${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_config_file_path} ${set_model_file_path} ${set_params_file_path} ${set_batchsize} ${set_infer_data} ${set_benchmark} > ${_save_log_path} 2>&1 \"\n echo $command\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"\n done\n else\n echo \"Does not support hardware other than CPU and GPU Currently!\"\n fi\n done\n}\n# log\nLOG_PATH=\"./log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_python.log\"\nif [ ${MODE} = \"whole_infer\" ]; then\n IFS=\"|\"\n # run export\n set_output_dir=$(func_set_params \"${output_dir_key}\" \"${output_dir_value}\")", + "type": "code", + "location": "/test_tipc/test_ptq_inference_python.sh:89-112" + }, + "8663": { + "file_id": 637, + "content": "This code is setting up a loop to run inference on the PaddleVideo model for different hardware configurations and modes. It sets the necessary parameters, files, and batch size, then executes the command and checks the status of the execution. The output is logged in a specified directory. If the mode is \"whole_infer\", it uses IFS to separate the export settings.", + "type": "comment" + }, + "8664": { + "file_id": 637, + "content": " set_data_dir=$(func_set_params \"${data_dir_key}\" \"${data_dir_value}\")\n set_data_anno=$(func_set_params \"${data_anno_key}\" \"${data_anno_value}\")\n set_batch_size=$(func_set_params \"${quant_batch_size_key}\" \"${quant_batch_size_value}\")\n set_batch_num=$(func_set_params \"${batch_num_key}\" \"${batch_num_value}\")\n set_model_path=$(func_set_params \"${model_path_key}\" \"${model_path_value}\")\n set_config_file=$(func_set_params \"${quant_config_file_key}\" \"${quant_config_file_value}\")\n set_use_gpu=$(func_set_params \"${use_gpu_key}\" \"${use_gpu_value}\")\n export_log_path=\"${LOG_PATH}/${MODE}_export_${Count}.log\"\n export_cmd=\"${python} ${train_py} ${set_use_gpu} ${set_config_file} ${set_model_path} ${set_batch_num} ${set_batch_size} ${set_data_dir} ${set_data_anno} ${set_output_dir} > ${export_log_path} 2>&1 \"\n echo $export_cmd\n eval $export_cmd\n status_export=$?\n status_check $status_export \"${export_cmd}\" \"${status_log}\" \"${model_name}\"\n save_infer_dir=${output_dir_value}\n #run inference", + "type": "code", + "location": "/test_tipc/test_ptq_inference_python.sh:113-129" + }, + "8665": { + "file_id": 637, + "content": "This code is preparing various settings for executing a command to export a model. It sets values from input variables, exports the model with specified parameters, logs the results, and then checks the status of the export. 
Finally, it prepares a directory for running inference.", + "type": "comment" + }, + "8666": { + "file_id": 637, + "content": " func_inference \"${python}\" \"${inference_py}\" \"${save_infer_dir}\" \"${LOG_PATH}\" \"${infer_video_dir}\"\nfi", + "type": "code", + "location": "/test_tipc/test_ptq_inference_python.sh:130-132" + }, + "8667": { + "file_id": 637, + "content": "The code snippet is calling a function named \"func_inference\" with arguments such as python, inference_py (likely the path of the Python script), save_infer_dir, LOG_PATH and infer_video_dir. This could be a part of an if condition block, possibly initializing or running an inference process.", + "type": "comment" + }, + "8668": { + "file_id": 638, + "content": "/test_tipc/test_serving_infer_cpp.sh", + "type": "filepath" + }, + "8669": { + "file_id": 638, + "content": "This Bash script, using a configuration file and mode inputs, initializes a model, serves it via Python/C++, prepares the environment, logs execution, runs a GPU server, and tests a web service function with incrementing \"Count\" variable and IFS separation.", + "type": "summary" + }, + "8670": { + "file_id": 638, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\nMODE=$2\ndataline=$(awk 'NR==1, NR==18{print}' $FILENAME)\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# parser serving\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython_list=$(func_parser_value \"${lines[2]}\")\ntrans_model_py=$(func_parser_value \"${lines[3]}\")\ninfer_model_dir_key=$(func_parser_key \"${lines[4]}\")\ninfer_model_dir_value=$(func_parser_value \"${lines[4]}\")\nmodel_filename_key=$(func_parser_key \"${lines[5]}\")\nmodel_filename_value=$(func_parser_value \"${lines[5]}\")\nparams_filename_key=$(func_parser_key \"${lines[6]}\")\nparams_filename_value=$(func_parser_value \"${lines[6]}\")\nserving_server_key=$(func_parser_key \"${lines[7]}\")\nserving_server_value=$(func_parser_value \"${lines[7]}\")\nserving_client_key=$(func_parser_key \"${lines[8]}\")\nserving_client_value=$(func_parser_value \"${lines[8]}\")\nserving_dir_value=$(func_parser_value \"${lines[9]}\")\nrun_model_path_key=$(func_parser_key \"${lines[10]}\")\nrun_model_path_value=$(func_parser_value \"${lines[10]}\")", + "type": "code", + "location": "/test_tipc/test_serving_infer_cpp.sh:1-28" + }, + "8671": { + "file_id": 638, + "content": "This code is a Bash script that takes in two arguments: the filename of a configuration file and the mode to operate in. 
It uses `awk` to extract a specific section from the configuration file, parses this data into variables using custom functions, and then sets up various parameters for running an image classification model.", + "type": "comment" + }, + "8672": { + "file_id": 638, + "content": "port_key=$(func_parser_key \"${lines[11]}\")\nport_value=$(func_parser_value \"${lines[11]}\")\ncpp_client_value=$(func_parser_value \"${lines[12]}\")\ninput_video_key=$(func_parser_key \"${lines[13]}\")\ninput_video_value=$(func_parser_value \"${lines[13]}\")\nLOG_PATH=\"./test_tipc/output/log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_serving.log\"\nfunction func_serving(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n # phase 1: save model\n set_dirname=$(func_set_params \"${infer_model_dir_key}\" \"${infer_model_dir_value}\")\n set_model_filename=$(func_set_params \"${model_filename_key}\" \"${model_filename_value}\")\n set_params_filename=$(func_set_params \"${params_filename_key}\" \"${params_filename_value}\")\n set_serving_server=$(func_set_params \"${serving_server_key}\" \"${serving_server_value}\")\n set_serving_client=$(func_set_params \"${serving_client_key}\" \"${serving_client_value}\")\n python_list=(${python_list})\n python=${python_list[0]}\n trans_log=\"${LOG_PATH}/cpp_trans_model.log\"", + "type": "code", + "location": "/test_tipc/test_serving_infer_cpp.sh:29-54" + }, + "8673": { + "file_id": 638, + "content": "This code parses keys and values from a configuration file, sets directory names and filenames for saving the model, and initializes variables for later use. It uses Python and potentially C++ for model serving. The code creates log files to store results of the model inference and transfer process, and prepares for the next steps involving Python scripts and possibly C++ client or server execution.", + "type": "comment" + }, + "8674": { + "file_id": 638, + "content": " trans_model_cmd=\"${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client} > ${trans_log} 2>&1 \"\n eval ${trans_model_cmd}\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${trans_model_cmd}\" \"${status_log}\" \"${model_name}\"\n # modify the alias name of fetch_var to \"outputs\"\n server_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_server_value/serving_server_conf.prototxt\"\n eval ${server_fetch_var_line_cmd}\n client_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_client_value/serving_client_conf.prototxt\"\n eval ${client_fetch_var_line_cmd}\n cd ${serving_dir_value}\n echo $PWD\n unset https_proxy\n unset http_proxy\n _save_log_path=\"${LOG_PATH}/cpp_client_infer_gpu_batchsize_1.log\"\n # phase 2: run server\n server_log_path=\"${LOG_PATH}/cpp_server_gpu.log\"\n cpp_ser", + "type": "code", + "location": "/test_tipc/test_serving_infer_cpp.sh:55-73" + }, + "8675": { + "file_id": 638, + "content": "This code snippet modifies a serving configuration file, sets up the environment for running a C++ server and client, logs their execution, and finally runs the C++ server on GPU.", + "type": "comment" + }, + "8676": { + "file_id": 638, + "content": "ver_cmd=\"${python} -m paddle_serving_server.serve ${run_model_path_key} ${run_model_path_value} ${port_key} ${port_value} > ${server_log_path} 2>&1 &\"\n eval ${cpp_server_cmd}\n sleep 20s\n # phase 3: run client\n 
real_model_name=${model_name/PP-/PP}\n serving_client_conf_path=\"${serving_client_value/deploy\\/cpp_serving\\/}\"\n serving_client_conf_path=\"${serving_client_conf_path/\\/\\//}serving_client_conf.prototxt\"\n cpp_client_cmd=\"${python} ${cpp_client_value} -n ${real_model_name} -c ${serving_client_conf_path} ${input_video_key} ${input_video_value} > ${_save_log_path} 2>&1 \"\n eval ${cpp_client_cmd}\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n cd ../../\n status_check $last_status \"${cpp_server_cmd}\" \"${status_log}\" \"${model_name}\"\n ps ux | grep -i 'paddle_serving_server' | awk '{print $2}' | xargs kill -s 9\n}\n# set cuda device\nGPUID=$3\nif [ ${#GPUID} -le 0 ];then\n env=\" \"\nelse\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\nfi\nset CUDA_VISIBLE_DEVICES\neval $env", + "type": "code", + "location": "/test_tipc/test_serving_infer_cpp.sh:73-100" + }, + "8677": { + "file_id": 638, + "content": "The script starts a PaddlePaddle serving server, runs a client against it, and performs status checks. The CUDA device can be set using the GPUID parameter.", + "type": "comment" + }, + "8678": { + "file_id": 638, + "content": "echo \"################### run test ###################\"\nexport Count=0\nIFS=\"|\"\nfunc_serving \"${web_service_cmd}\"", + "type": "code", + "location": "/test_tipc/test_serving_infer_cpp.sh:103-107" + }, + "8679": { + "file_id": 638, + "content": "This code is executing a test function for serving a web service, incrementing the \"Count\" variable and using IFS to separate the function arguments with \"|\".", + "type": "comment" + }, + "8680": { + "file_id": 639, + "content": "/test_tipc/test_serving_infer_python.sh", + "type": "filepath" + }, + "8681": { + "file_id": 639, + "content": "The Bash script configures a model serving environment, sets up an API server, transfers the model using provided Python code, and handles cleanup tasks. It also adjusts alias names, logs paths, and CUDA visible devices while running video processing pipeline tests.", + "type": "summary" + }, + "8682": { + "file_id": 639, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\ndataline=$(awk 'NR==1, NR==18{print}' $FILENAME)\nMODE=$2\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# parser serving\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython_list=$(func_parser_value \"${lines[2]}\")\ntrans_model_py=$(func_parser_value \"${lines[3]}\")\ninfer_model_dir_key=$(func_parser_key \"${lines[4]}\")\ninfer_model_dir_value=$(func_parser_value \"${lines[4]}\")\nmodel_filename_key=$(func_parser_key \"${lines[5]}\")\nmodel_filename_value=$(func_parser_value \"${lines[5]}\")\nparams_filename_key=$(func_parser_key \"${lines[6]}\")\nparams_filename_value=$(func_parser_value \"${lines[6]}\")\nserving_server_key=$(func_parser_key \"${lines[7]}\")\nserving_server_value=$(func_parser_value \"${lines[7]}\")\nserving_client_key=$(func_parser_key \"${lines[8]}\")\nserving_client_value=$(func_parser_value \"${lines[8]}\")\nserving_dir_value=$(func_parser_value \"${lines[9]}\")\nweb_service_py=$(func_parser_value \"${lines[10]}\")\npipeline_py=$(func_parser_value \"${lines[11]}\")\nvideo_dir_key=$(func_parser_key \"${lines[12]}\")", + "type": "code", + "location": "/test_tipc/test_serving_infer_python.sh:1-29" + }, + "8683": { + "file_id": 639, + "content": "This Bash script is parsing a configuration file and extracting various parameters for running model inference. 
It assigns values to variables such as 'model_name', 'python_list', and others that will be used later in the code. The purpose is to set up an environment for serving the model and potentially run inferences on videos.", + "type": "comment" + }, + "8684": { + "file_id": 639, + "content": "video_dir_value=$(func_parser_value \"${lines[12]}\")\nLOG_PATH=\"./test_tipc/output/log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_serving.log\"\nfunction func_serving(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n # python serving code\n set_dirname=$(func_set_params \"${infer_model_dir_key}\" \"${infer_model_dir_value}\")\n set_model_filename=$(func_set_params \"${model_filename_key}\" \"${model_filename_value}\")\n set_params_filename=$(func_set_params \"${params_filename_key}\" \"${params_filename_value}\")\n set_serving_server=$(func_set_params \"${serving_server_key}\" \"${serving_server_value}\")\n set_serving_client=$(func_set_params \"${serving_client_key}\" \"${serving_client_value}\")\n python_list=(${python_list})\n python=${python_list[0]}\n trans_log=\"${LOG_PATH}/python_trans_model.log\"\n trans_model_cmd=\"${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client} > ${trans_log} 2>&1 \"", + "type": "code", + "location": "/test_tipc/test_serving_infer_python.sh:30-54" + }, + "8685": { + "file_id": 639, + "content": "The code defines a function `func_serving` that takes Python executable path, script, model directory, and sets various parameters for serving. It then executes a command to transfer the model to the specified server or client using a provided Python script. The output is logged in the `trans_log` file.", + "type": "comment" + }, + "8686": { + "file_id": 639, + "content": " eval ${trans_model_cmd}\n # modify the alias name of fetch_var to \"outputs\"\n server_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_server_value/serving_server_conf.prototxt\"\n eval ${server_fetch_var_line_cmd}\n client_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_client_value/serving_client_conf.prototxt\"\n eval ${client_fetch_var_line_cmd}\n cd ${serving_dir_value}\n echo 'PWD= '$PWD\n unset https_proxy\n unset http_proxy\n server_log_path=\"${LOG_PATH}/python_server_gpu.log\"\n web_service_cmd=\"${python} ${web_service_py} > ${server_log_path} 2>&1 &\"\n eval $web_service_cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${web_service_cmd}\" \"${status_log}\" \"${model_name}\"\n sleep 30s # not too short is ok\n _save_log_path=\"../../${LOG_PATH}/python_server_infer_gpu_batchsize_1.log\"\n set_video_dir=$(func_set_params \"${video_dir_key}\" \"${video_dir_value}\")", + "type": "code", + "location": "/test_tipc/test_serving_infer_python.sh:56-77" + }, + "8687": { + "file_id": 639, + "content": "This code modifies alias names in configuration files, sets log paths and starts a web service using Python. It also checks the status of the service, sleeps for 30 seconds, and saves logs into a specific path. 
The code is executed within a specific directory and sets environment variables before running the commands.", + "type": "comment" + }, + "8688": { + "file_id": 639, + "content": " pipeline_cmd=\"${python} ${pipeline_py} ${set_video_dir} > ${_save_log_path} 2>&1 \"\n eval $pipeline_cmd\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n cd ../../\n status_check $last_status \"${pipeline_cmd}\" \"${status_log}\" \"${model_name}\"\n ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9\n}\n# set cuda device\nGPUID=$3\nif [ ${#GPUID} -le 0 ];then\n env=\" \"\nelse\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\nfi\nset CUDA_VISIBLE_DEVICES\neval $env\necho \"################### run test ###################\"\nexport Count=0\nIFS=\"|\"\nfunc_serving \"${web_service_cmd}\"", + "type": "code", + "location": "/test_tipc/test_serving_infer_python.sh:78-105" + }, + "8689": { + "file_id": 639, + "content": "This code is setting up the environment and running a test for a video processing pipeline. It sets the CUDA visible devices, runs the test using specified command, and performs clean-up by killing related processes after the test.", + "type": "comment" + }, + "8690": { + "file_id": 640, + "content": "/test_tipc/test_train_dy2static_python.sh", + "type": "filepath" + }, + "8691": { + "file_id": 640, + "content": "The code trains and analyzes two models (dygraph and dy2static), compares their losses, logs the differences, and prints the results.", + "type": "summary" + }, + "8692": { + "file_id": 640, + "content": "source test_tipc/common_func.sh\nIFS=$'\\n'\nBASE_CONFIG_FILE=$1\n# always use the lite_train_lite_infer mode to speed. Modify the config file.\nMODE=lite_train_lite_infer\nBASEDIR=$(dirname \"$0\")\n# get the log path.\ndataline=$(cat ${BASE_CONFIG_FILE})\nlines=(${dataline})\nmodel_name=$(func_parser_value \"${lines[1]}\")\nLOG_PATH=\"./test_tipc/output/${model_name}/${MODE}\"\nrm -rf $LOG_PATH\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_python.log\"\n# make cudnn algorithm deterministic, such as conv.\nexport FLAGS_cudnn_deterministic=True\n# read the base config and parse and run the sub commands\nconfig_line_numbers=`cat ${BASE_CONFIG_FILE} | grep -n \"============\" | cut -d':' -f1`\nfor cln in $config_line_numbers\ndo\n # change IFS to prevent \\n is parsed as delimiter.\n IFS=\"\"\n config_lines=$(cat ${BASE_CONFIG_FILE} | sed -n \"${cln},\\$p\" | head -n 22)\n config_name=`echo ${config_lines} | grep '=====' | cut -d' ' -f2`\n FILENAME=$LOG_PATH/dy2static_$config_name.txt\n echo \"[Start dy2static]\" \"${config_name} : ${FILENAME}\"", + "type": "code", + "location": "/test_tipc/test_train_dy2static_python.sh:1-30" + }, + "8693": { + "file_id": 640, + "content": "Source common functions and set IFS to handle line breaks. 
Read the BASE_CONFIG_FILE, identify MODE, get log path, delete existing directory if it exists, create a new one, set CUDNN deterministic for stable results, read base config, parse sub commands, and output relevant information.", + "type": "comment" + }, + "8694": { + "file_id": 640, + "content": " echo ${config_lines} > $FILENAME\n sed -i 's/gpu_list.*$/gpu_list:0/g' $FILENAME\n # execute the last line command\n custom_cmd=$(echo $config_lines | tail -n 1)\n echo \"CustomCmd is: \" $custom_cmd\n eval $custom_cmd\n IFS=$'\\n'\n # start dygraph train\n dygraph_output=$LOG_PATH/${config_name}_python_train_infer_dygraph_output.txt\n dygraph_loss=$LOG_PATH/${config_name}_dygraph_loss.txt\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $MODE >$dygraph_output 2>&1\"\n echo $cmd\n eval $cmd\n # start dy2static train\n dy2static_output=$LOG_PATH/${config_name}_python_train_infer_dy2static_output.txt\n dy2static_loss=$LOG_PATH/${config_name}_dy2static_loss.txt\n sed -i '16s/$/ -o to_static=True/' ${FILENAME}\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $MODE >$dy2static_output 2>&1\"\n echo $cmd\n eval $cmd\n # analysis and compare the losses.\n dyout=`cat $dy2static_output | python test_tipc/extract_loss.py -v 'train step' -e 'loss: {%f} ' | head -n 3`", + "type": "code", + "location": "/test_tipc/test_train_dy2static_python.sh:31-57" + }, + "8695": { + "file_id": 640, + "content": "This code is configuring, running and analyzing two different training models. It first sets the necessary environment, then runs a dygraph training model and a dy2static one, saving their results in separate logs. Finally, it extracts the losses from the dy2static log for comparison with the dygraph's loss.", + "type": "comment" + }, + "8696": { + "file_id": 640, + "content": " stout=`cat $dygraph_output | python test_tipc/extract_loss.py -v 'train step' -e 'loss: {%f} ' | head -n 3`\n echo $dyout > $dygraph_loss\n echo $stout > $dy2static_loss\n diff_log=$LOG_PATH/${config_name}_diff_log.txt\n diff_cmd=\"diff -w $dygraph_loss $dy2static_loss > $diff_log\"\n eval $diff_cmd\n last_status=$?\n cat $diff_log\n if [ \"$dyout\" = \"\" ]; then\n status_check 1 $diff_cmd $status_log $model_name $diff_log\n elif [ \"$stout\" = \"\" ]; then\n status_check 2 $diff_cmd $status_log $model_name $diff_log\n else\n status_check $last_status $diff_cmd $status_log $model_name $diff_log\n fi\ndone", + "type": "code", + "location": "/test_tipc/test_train_dy2static_python.sh:58-73" + }, + "8697": { + "file_id": 640, + "content": "This code compares the outputs of two models (dygraph_loss and dy2static_loss), checks for differences using a diff command, and logs the result to diff_log. If either dyout or stout is empty, it runs status_check with different codes. 
Finally, it prints the diff_log.", + "type": "comment" + }, + "8698": { + "file_id": 641, + "content": "/test_tipc/test_train_inference_python.sh", + "type": "filepath" + }, + "8699": { + "file_id": 641, + "content": "This code optimizes PaddleVideo model performance by configuring environment variables for efficient training or export tasks, evaluates models, saves trained models, logs, and runs evaluation scripts.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/87.json b/docs/data/87.json new file mode 100644 index 000000000..76646f864 --- /dev/null +++ b/docs/data/87.json @@ -0,0 +1,547 @@ +{ + "8700": { + "file_id": 641, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\n# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer']\nMODE=$2\ndataline=$(cat ${FILENAME})\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# The training params\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\ngpu_list=$(func_parser_value \"${lines[3]}\")\ntrain_use_gpu_key=$(func_parser_key \"${lines[4]}\")\ntrain_use_gpu_value=$(func_parser_value \"${lines[4]}\")\nautocast_list=$(func_parser_value \"${lines[5]}\")\nautocast_key=$(func_parser_key \"${lines[5]}\")\nepoch_key=$(func_parser_key \"${lines[6]}\")\nepoch_num=$(func_parser_value \"${lines[6]}\")\nsave_model_key=$(func_parser_key \"${lines[7]}\")\ntrain_batch_key=$(func_parser_key \"${lines[8]}\")\ntrain_batch_value=$(func_parser_value \"${lines[8]}\")\npretrain_model_key=$(func_parser_key \"${lines[9]}\")\npretrain_model_value=$(func_parser_value \"${lines[9]}\")\ntrain_model_name=$(func_parser_value \"${lines[10]}\")\ntrain_param_key1=$(func_parser_key \"${lines[12]}\")", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:1-30" + }, + "8701": { + "file_id": 641, + "content": "The code reads a file containing training parameters and parses the values using different functions. These parameters include the model name, Python version, GPU list, auto-cast settings, epoch number, batch size, pre-trained model, and training model name. 
The parsed values will be used for further processing in the script.", + "type": "comment" + }, + "8702": { + "file_id": 641, + "content": "train_param_value1=$(func_parser_value \"${lines[12]}\")\ntrain_param_key2=$(func_parser_key \"${lines[11]}\")\ntrain_param_value2=$(func_parser_value \"${lines[11]}\")\ntrainer_list=$(func_parser_value \"${lines[14]}\")\ntrainer_norm=$(func_parser_key \"${lines[15]}\")\nnorm_trainer=$(func_parser_value \"${lines[15]}\")\npact_key=$(func_parser_key \"${lines[16]}\")\npact_trainer=$(func_parser_value \"${lines[16]}\")\nfpgm_key=$(func_parser_key \"${lines[17]}\")\nfpgm_trainer=$(func_parser_value \"${lines[17]}\")\ndistill_key=$(func_parser_key \"${lines[18]}\")\ndistill_trainer=$(func_parser_value \"${lines[18]}\")\namp_key=$(func_parser_key \"${lines[19]}\")\namp_trainer=$(func_parser_value \"${lines[19]}\")\ntrainer_key2=$(func_parser_key \"${lines[20]}\")\ntrainer_value2=$(func_parser_value \"${lines[20]}\")\neval_py=$(func_parser_value \"${lines[23]}\")\neval_key1=$(func_parser_key \"${lines[24]}\")\neval_value1=$(func_parser_value \"${lines[24]}\")\nsave_infer_key=$(func_parser_key \"${lines[27]}\")\nsave_infer_value=$(func_parser_value \"${lines[27]}\")\nexport_weight=$(func_parser_key \"${lines[28]}\")", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:31-56" + }, + "8703": { + "file_id": 641, + "content": "This code is parsing key-value pairs from different lines and assigning them to specific variables. The variables are used for trainer, pact, fpgm, distill, amp, evaluator, save_infer, and export weight configurations. This information will likely be utilized in subsequent parts of the script or program.", + "type": "comment" + }, + "8704": { + "file_id": 641, + "content": "norm_export=$(func_parser_value \"${lines[29]}\")\npact_export=$(func_parser_value \"${lines[30]}\")\nfpgm_export=$(func_parser_value \"${lines[31]}\")\ndistill_export=$(func_parser_value \"${lines[32]}\")\nexport_key1=$(func_parser_key \"${lines[33]}\")\nexport_value1=$(func_parser_value \"${lines[33]}\")\nexport_key2=$(func_parser_key \"${lines[34]}\")\nexport_value2=$(func_parser_value \"${lines[34]}\")\ninference_dir=$(func_parser_value \"${lines[35]}\")\n# parser inference model\ninfer_model_dir_list=$(func_parser_value \"${lines[36]}\")\ninfer_export_list=$(func_parser_value \"${lines[37]}\")\ninfer_is_quant=$(func_parser_value \"${lines[38]}\")\n# parser inference\ninference_py=$(func_parser_value \"${lines[39]}\")\nuse_gpu_key=$(func_parser_key \"${lines[40]}\")\nuse_gpu_list=$(func_parser_value \"${lines[40]}\")\nuse_mkldnn_key=$(func_parser_key \"${lines[41]}\")\nuse_mkldnn_list=$(func_parser_value \"${lines[41]}\")\ncpu_threads_key=$(func_parser_key \"${lines[42]}\")\ncpu_threads_list=$(func_parser_value \"${lines[42]}\")\nbatch_size_key=$(func_parser_key \"${lines[43]}\")", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:57-79" + }, + "8705": { + "file_id": 641, + "content": "The code parses various configuration values and keys from the lines of a script. It extracts normalization, quantization, and distillation settings; inference directory path; model directories for inference; whether to use GPU, MKLDNN, specify CPU threads, and batch size. 
The variables are assigned with these parsed values.", + "type": "comment" + }, + "8706": { + "file_id": 641, + "content": "batch_size_list=$(func_parser_value \"${lines[43]}\")\nuse_trt_key=$(func_parser_key \"${lines[44]}\")\nuse_trt_list=$(func_parser_value \"${lines[44]}\")\nprecision_key=$(func_parser_key \"${lines[45]}\")\nprecision_list=$(func_parser_value \"${lines[45]}\")\ninfer_model_key=$(func_parser_key \"${lines[46]}\")\ninfer_model_value=$(func_parser_value \"${lines[46]}\")\nvideo_dir_key=$(func_parser_key \"${lines[47]}\")\ninfer_video_dir=$(func_parser_value \"${lines[47]}\")\nsave_log_key=$(func_parser_key \"${lines[48]}\")\nbenchmark_key=$(func_parser_key \"${lines[49]}\")\nbenchmark_value=$(func_parser_value \"${lines[49]}\")\ninfer_key1=$(func_parser_key \"${lines[50]}\")\ninfer_value1=$(func_parser_value \"${lines[50]}\")\nline_num=`grep -n -w \"to_static_train_benchmark_params\" $FILENAME | cut -d \":\" -f 1`\nto_static_key=$(func_parser_key \"${lines[line_num]}\")\nto_static_trainer=$(func_parser_value \"${lines[line_num]}\")\n# parser klquant_infer\nif [ ${MODE} = \"klquant_whole_infer\" ]; then\n dataline=$(awk 'NR==1 NR==17{print}' $FILENAME)\n lines=(${dataline})", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:80-104" + }, + "8707": { + "file_id": 641, + "content": "This code is parsing the function parameters from a configuration file. The batch size, use_trt, precision, infer model value, video directory path, log saving flag, and benchmark are being assigned to respective variables. A specific line number is obtained using grep command for a keyword \"to_static_train_benchmark_params\". Then it checks if the mode is set to \"klquant_whole_infer\" and processes the first and 17th lines of the configuration file.", + "type": "comment" + }, + "8708": { + "file_id": 641, + "content": " model_name=$(func_parser_value \"${lines[1]}\")\n python=$(func_parser_value \"${lines[2]}\")\n # parser inference model\n infer_model_dir_list=$(func_parser_value \"${lines[3]}\")\n infer_export_list=$(func_parser_value \"${lines[4]}\")\n infer_is_quant=$(func_parser_value \"${lines[5]}\")\n # parser inference\n inference_py=$(func_parser_value \"${lines[6]}\")\n use_gpu_key=$(func_parser_key \"${lines[7]}\")\n use_gpu_list=$(func_parser_value \"${lines[7]}\")\n use_mkldnn_key=$(func_parser_key \"${lines[8]}\")\n use_mkldnn_list=$(func_parser_value \"${lines[8]}\")\n cpu_threads_key=$(func_parser_key \"${lines[9]}\")\n cpu_threads_list=$(func_parser_value \"${lines[9]}\")\n batch_size_key=$(func_parser_key \"${lines[10]}\")\n batch_size_list=$(func_parser_value \"${lines[10]}\")\n use_trt_key=$(func_parser_key \"${lines[11]}\")\n use_trt_list=$(func_parser_value \"${lines[11]}\")\n precision_key=$(func_parser_key \"${lines[12]}\")\n precision_list=$(func_parser_value \"${lines[12]}\")\n infer_model_key=$(func_parser_key \"${lines[13]}\")", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:105-125" + }, + "8709": { + "file_id": 641, + "content": "The code is parsing the configuration file to extract specific values for different variables like model name, python version, inference model directory list, and more. 
These values are used later in the script to execute specific commands related to the test and train operations.", + "type": "comment" + }, + "8710": { + "file_id": 641, + "content": " video_dir_key=$(func_parser_key \"${lines[14]}\")\n infer_video_dir=$(func_parser_value \"${lines[14]}\")\n save_log_key=$(func_parser_key \"${lines[15]}\")\n benchmark_key=$(func_parser_key \"${lines[16]}\")\n benchmark_value=$(func_parser_value \"${lines[16]}\")\n infer_key1=$(func_parser_key \"${lines[17]}\")\n infer_value1=$(func_parser_value \"${lines[17]}\")\nfi\nLOG_PATH=\"./test_tipc/output/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_python.log\"\nfunction func_inference(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n _log_path=$4\n _video_dir=$5\n _flag_quant=$6\n _gpu=$7\n # inference\n for use_gpu in ${use_gpu_list[*]}; do\n if [ ${use_gpu} = \"False\" ] || [ ${use_gpu} = \"cpu\" ]; then\n for use_mkldnn in ${use_mkldnn_list[*]}; do\n if [[ ${use_mkldnn} = \"False\" ]] && [[ ${_flag_quant} = \"True\" ]]; then\n continue\n fi\n for threads in ${cpu_threads_list[*]}; do\n for batch_size in ${batch_size_list[*]}; do", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:126-157" + }, + "8711": { + "file_id": 641, + "content": "This code sets variables for video directory, log path, inference functions, and other parameters. It then loops through various conditions to perform inferences with different combinations of GPU and MKLDNN usage, while keeping track of the results in the specified log file.", + "type": "comment" + }, + "8712": { + "file_id": 641, + "content": " for precision in ${precision_list[*]}; do\n if [[ ${use_mkldnn} = \"False\" ]] && [[ ${precision} = \"fp16\" ]]; then\n continue\n fi # skip when enable fp16 but disable mkldnn\n if [[ ${_flag_quant} = \"True\" ]] && [[ ${precision} != \"int8\" ]]; then\n continue\n fi # skip when quant model inference but precision is not int8\n set_precision=$(func_set_params \"${precision_key}\" \"${precision}\")\n _save_log_path=\"${_log_path}/python_infer_cpu_gpus_${_gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log\"\n mkdir -p ${_log_path}\n set_infer_data=$(func_set_params \"${video_dir_key}\" \"${infer_video_dir}\")\n set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:158-170" + }, + "8713": { + "file_id": 641, + "content": "This code is iterating over a list of precision values, checking conditions to decide whether to continue or skip the current iteration. 
It sets the appropriate log path, creates directories if needed, and calls functions to set parameters for inference data and benchmarking.", + "type": "comment" + }, + "8714": { + "file_id": 641, + "content": " set_batchsize=$(func_set_params \"${batch_size_key}\" \"${batch_size}\")\n set_cpu_threads=$(func_set_params \"${cpu_threads_key}\" \"${threads}\")\n set_model_dir=$(func_set_params \"${infer_model_key}\" \"${_model_dir}/${infer_model_value}\")\n set_infer_params1=$(func_set_params \"${infer_key1}\" \"${_model_dir}/${infer_value1}\")\n command=\"${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\" \"${_save_log_path}\"\n done\n done", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:171-181" + }, + "8715": { + "file_id": 641, + "content": "The code is setting variables, constructing a command using environment variables, and executing it. It then checks the status of the execution and logs the output for later inspection. This appears to be part of a loop that's running multiple tests or experiments with varying parameters.", + "type": "comment" + }, + "8716": { + "file_id": 641, + "content": " done\n done\n elif [ ${use_gpu} = \"True\" ] || [ ${use_gpu} = \"gpu\" ]; then\n for use_trt in ${use_trt_list[*]}; do\n for precision in ${precision_list[*]}; do\n if [[ ${_flag_quant} = \"False\" ]] && [[ ${precision} =~ \"int8\" ]]; then\n continue\n fi\n if [[ ${precision} =~ \"fp16\" || ${precision} =~ \"int8\" ]] && [[ ${use_trt} = \"False\" ]]; then\n continue\n fi\n if [[ ${use_trt} = \"False\" || ${precision} =~ \"int8\" ]] && [[ ${_flag_quant} = \"True\" ]]; then\n continue\n fi\n for batch_size in ${batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/python_infer_gpu_gpus_${_gpu}_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${video_dir_key}\" \"${infer_video_dir}\")", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:182-198" + }, + "8717": { + "file_id": 641, + "content": "This code snippet is checking various conditions for model inference using different parameters like GPU usage, precision, and batch size. It iterates through a list of options to set up the necessary configurations for logging and execution. 
The purpose seems to be running inference tests with varying settings to optimize performance.", + "type": "comment" + }, + "8718": { + "file_id": 641, + "content": " set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")\n set_batchsize=$(func_set_params \"${batch_size_key}\" \"${batch_size}\")\n set_tensorrt=$(func_set_params \"${use_trt_key}\" \"${use_trt}\")\n set_precision=$(func_set_params \"${precision_key}\" \"${precision}\")\n set_model_dir=$(func_set_params \"${infer_model_key}\" \"${_model_dir}/${infer_model_value}\")\n set_infer_params1=$(func_set_params \"${infer_key1}\" \"${_model_dir}/${infer_value1}\")\n command=\"${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\" \"${_save_log_path}\"", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:200-212" + }, + "8719": { + "file_id": 641, + "content": "This code sets parameters for benchmark, batch size, tensorrt usage, precision, model directory, and infer params1. It then executes a command with these parameters to run inference, saves the log, and checks the status of the execution.", + "type": "comment" + }, + "8720": { + "file_id": 641, + "content": " done\n done\n done\n else\n echo \"Does not support hardware other than CPU and GPU Currently!\"\n fi\n done\n}\nif [ ${MODE} = \"whole_infer\" ] || [ ${MODE} = \"klquant_whole_infer\" ]; then\n GPUID=$3\n if [ ${#GPUID} -le 0 ];then\n env=\" \"\n else\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\n fi\n set CUDA_VISIBLE_DEVICES\n eval $env\n export Count=0\n IFS=\"|\"\n infer_run_exports=(${infer_export_list})\n infer_quant_flag=(${infer_is_quant})\n for infer_model in ${infer_model_dir_list[*]}; do\n # run export\n if [ ${infer_run_exports[Count]} != \"null\" ];then\n save_infer_dir=$(dirname $infer_model)\n set_export_weight=$(func_set_params \"${export_weight}\" \"${infer_model}\")\n set_save_infer_key=$(func_set_params \"${save_infer_key}\" \"${save_infer_dir}\")\n export_log_path=\"${LOG_PATH}_export_${Count}.log\"\n export_cmd=\"${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 \"", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:214-243" + }, + "8721": { + "file_id": 641, + "content": "This code is part of a script that tests and runs inference models using PaddleVideo. It checks the hardware being used (CPU or GPU) and sets appropriate environment variables accordingly. 
It then iterates through each inference model, running them with specific exported weights and saving the output logs for further analysis.", + "type": "comment" + }, + "8722": { + "file_id": 641, + "content": " echo ${infer_run_exports[Count]}\n eval $export_cmd\n echo $export_cmd\n status_export=$?\n status_check $status_export \"${export_cmd}\" \"${status_log}\" \"${model_name}\" \"${export_log_path}\"\n else\n save_infer_dir=${infer_model}\n fi\n #run inference\n is_quant=${infer_quant_flag[Count]}\n if [ ${MODE} = \"klquant_infer\" ]; then\n is_quant=\"True\"\n fi\n func_inference \"${python}\" \"${inference_py}\" \"${save_infer_dir}\" \"${LOG_PATH}\" \"${infer_video_dir}\" ${is_quant} \"${gpu}\"\n Count=$(($Count + 1))\n done\nelse\n IFS=\"|\"\n export Count=0\n USE_GPU_KEY=(${train_use_gpu_value})\n for gpu in ${gpu_list[*]}; do\n train_use_gpu=${USE_GPU_KEY[Count]}\n Count=$(($Count + 1))\n ips=\"\"\n if [ ${gpu} = \"-1\" ];then\n env=\"\"\n elif [ ${#gpu} -le 1 ];then\n env=\"export CUDA_VISIBLE_DEVICES=${gpu}\"\n eval ${env}\n elif [ ${#gpu} -le 15 ];then", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:244-274" + }, + "8723": { + "file_id": 641, + "content": "This code is iterating through a list of GPUs, setting the visible CUDA devices accordingly and running inference for each GPU. It also checks if exporting is needed, saves the infer directory, runs quantized inference if it's a klquant_infer mode, and keeps track of the count to ensure all GPUs are considered. If a GPU list item is -1, it uses no GPU, otherwise, it sets the environment for that specific GPU. The code block also checks whether the GPU count is less than or equal to 15 to avoid potential issues with larger lists.", + "type": "comment" + }, + "8724": { + "file_id": 641, + "content": " IFS=\",\"\n array=(${gpu})\n env=\"export CUDA_VISIBLE_DEVICES=${array[0]}\"\n IFS=\"|\"\n else\n IFS=\";\"\n array=(${gpu})\n ips=${array[0]}\n gpu=${array[1]}\n IFS=\"|\"\n env=\" \"\n fi\n for autocast in ${autocast_list[*]}; do\n if [ ${autocast} = \"fp16\" ]; then\n set_amp_config=\"--amp --amp_level 'O2'\"\n else\n set_amp_config=\" \"\n fi\n for trainer in ${trainer_list[*]}; do\n flag_quant=False\n if [ ${trainer} = ${pact_key} ]; then\n run_train=${pact_trainer}\n run_export=${pact_export}\n flag_quant=True\n elif [ ${trainer} = \"${fpgm_key}\" ]; then\n run_train=${fpgm_trainer}\n run_export=${fpgm_export}\n elif [ ${trainer} = \"${distill_key}\" ]; then\n run_train=${distill_trainer}", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:275-303" + }, + "8725": { + "file_id": 641, + "content": "This code is setting up environment variables for parallel GPU usage, iterating through different autocast and trainer configurations to execute training or export tasks based on the provided key values. 
The flag_quant variable tracks whether quantization is required for a particular configuration.", + "type": "comment" + }, + "8726": { + "file_id": 641, + "content": " run_export=${distill_export}\n elif [ ${trainer} = ${amp_key} ]; then\n run_train=${amp_trainer}\n run_export=${norm_export}\n elif [[ ${trainer} = ${trainer_key2} ]]; then\n run_train=${trainer_value2}\n run_export=${export_value2}\n # In case of @to_static, we re-used norm_traier,\n # but append \"-o to_static=True\" for config\n # to trigger \"to_static\" logic in 'train.py'\n elif [ ${trainer} = \"${to_static_key}\" ]; then\n run_train=\"${norm_trainer} ${to_static_trainer}\"\n run_export=${norm_export}\n else\n run_train=${norm_trainer}\n run_export=${norm_export}\n fi\n if [ ${run_train} = \"null\" ]; then\n continue\n fi\n if [[ ${MODE} != \"benchmark_train\" ]] && [[ ! ${MODE} =~ \"whole_train\" ]]; then", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:304-325" + }, + "8727": { + "file_id": 641, + "content": "This code uses conditional statements to assign values to `run_train` and `run_export` variables based on the value of `trainer`. It handles multiple scenarios, including cases with specific keys, and triggers \"to_static\" logic in 'train.py' when needed. If `run_train` is assigned as null, it continues without executing further.", + "type": "comment" + }, + "8728": { + "file_id": 641, + "content": " # 训练参数末尾加上--max_iters=30和--log_interval=1,以便运行并输出足量数据\n run_train=${run_train}\" --max_iters=30\"\n fi\n set_autocast=$(func_set_params \"${autocast_key}\" \"${autocast}\")\n set_epoch=$(func_set_params \"${epoch_key}\" \"${epoch_num}\")\n if [[ $MODE =~ \"whole_train\" ]]; then\n set_epoch=\"\"\n fi\n set_pretrain=$(func_set_params \"${pretrain_model_key}\" \"${pretrain_model_value}\")\n if [[ $MODE =~ \"whole_train\" ]]; then\n train_batch_key=\"\"\n train_batch_value=\"\"\n fi\n set_batchsize=$(func_set_params \"${train_batch_key}\" \"${train_batch_value}\")\n if [[ $MODE =~ \"whole_train\" ]]; then\n train_param_key1=\"\"\n train_param_value1=\"\"\n fi\n set_train_params1=$(func_set_params \"${train_param_key1}\" \"${train_param_value1}\")\n if [[ $MODE =~ \"whole_train\" ]]; then", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:326-347" + }, + "8729": { + "file_id": 641, + "content": "This code is setting up parameters for model training and inference. It appends --max_iters=30 and --log_interval=1 to the run_train string for better data output, sets autocast, epoch, pretrain values, and checks if MODE includes \"whole_train\" to set certain variables to empty strings or nulls. 
The code also uses func_set_params to set batch size and train parameters.", + "type": "comment" + }, + "8730": { + "file_id": 641, + "content": " train_param_key2=\"\"\n train_param_value2=\"\"\n fi\n set_train_params2=$(func_set_params \"${train_param_key2}\" \"${train_param_value2}\")\n set_use_gpu=$(func_set_params \"${train_use_gpu_key}\" \"${train_use_gpu}\")\n if [ ${#ips} -le 15 ];then\n # len(ips)<=15, single machine\n nodes=1\n save_log=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}\"\n else\n # if length of ips > 15, then it is seen as multi-machine\n # 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0\n IFS=\",\"\n ips_array=(${ips})\n IFS=\"|\"\n nodes=${#ips_array[@]}\n save_log=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}\"\n fi\n # load pretrain from norm training if current trainer is pact or fpgm trainer", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:348-367" + }, + "8731": { + "file_id": 641, + "content": "This code sets up parameters for training and inference of a PaddleVideo model. It determines whether the training is on a single machine or multiple machines based on the number of IPs provided. Depending on this, it sets the number of nodes, logs information accordingly, and loads pre-training from normal training if the current trainer is PACT or FPGM.", + "type": "comment" + }, + "8732": { + "file_id": 641, + "content": " if ([ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]) && [ ${nodes} -le 1 ]; then\n set_pretrain=\"${load_norm_train_model}\"\n fi\n set_save_model=$(func_set_params \"${save_model_key}\" \"${save_log}\")\n if [ ${#gpu} -le 2 ];then # train with cpu or single gpu\n cmd=\"${python} ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1\"\n elif [ ${#ips} -le 15 ];then # train with multi-gpu\n cmd=\"${python} -B -m paddle.distributed.launch --devices=\\\"${gpu}\\\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1\"\n else # train with multi-machine\n cmd=\"${python} -B -m paddle.distr", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:368-378" + }, + "8733": { + "file_id": 641, + "content": "This code checks if the trainer is either 'pact_key' or 'fpgm_key', and if the number of nodes is less than or equal to 1. If true, it sets the 'set_pretrain' variable to 'load_norm_train_model'. The code then determines the appropriate command based on whether the number of GPUs is 2 or less (train with CPU or single GPU), up to 15 (train with multi-GPU), or more than 15 (train with multiple machines). The command uses PaddlePaddle's distributed training capabilities.", + "type": "comment" + }, + "8734": { + "file_id": 641, + "content": "ibuted.launch --ips=${ips} --devices=\\\"${gpu}\\\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1\"\n fi\n # run train\n eval $cmd\n # display log for benchmark train\n eval \"cat ${LOG_PATH}/train.log\"\n eval \"cat ${LOG_PATH}/train.log >> ${save_log}.log\"\n status_check $? 
\"${cmd}\" \"${status_log}\" \"${model_name}\" \"${save_log}.log\"\n # set_eval_pretrain=$(func_set_params \"${pretrain_model_key}\" \"${save_log}/${train_model_name}\")\n # save norm trained models to set pretrain for pact training and fpgm training\n if [ [${trainer} = ${trainer_norm}] ] && [ [${nodes} -le 1] ]; then\n load_norm_train_model=${set_eval_pretrain}\n fi\n # run eval\n if [ ${eval_py} != \"null\" ]; then\n real_model_name=${model_name/PP-/pp}", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:378-395" + }, + "8735": { + "file_id": 641, + "content": "This code snippet is running a training script for a PaddleVideo model. It sets parameters, evaluates pre-trained models, and saves the trained models. The script also displays logs for benchmarking and checks the status of the operation. If there's a single node and trainer, it loads a norm-train model for further usage. Finally, it runs an evaluation script if specified.", + "type": "comment" + }, + "8736": { + "file_id": 641, + "content": " set_eval_params1=$(func_set_params \"${eval_key1}\" \"${save_log}/${real_model_name}_epoch_00001.pdparams\")\n eval_log_path=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log\"\n if [[ $MODE =~ \"lite_infer\" ]] && [[ ${train_param_key1} != \"null\" ]]; then\n eval_cmd=\"${python} ${eval_py} ${set_use_gpu} ${set_eval_params1} ${train_param_key1}=${train_param_value1} > ${eval_log_path} 2>&1 \"\n else\n eval_cmd=\"${python} ${eval_py} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1 \"\n fi\n eval $eval_cmd\n status_check $? \"${eval_cmd}\" \"${status_log}\" \"${model_name}\" \"${eval_log_path}\"\n fi\n # run export model\n if [ ${run_export} != \"null\" ]; then\n save_infer_path=\"${save_log}\"\n real_model_name=${model_name/PP-/pp}\n ", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:396-410" + }, + "8737": { + "file_id": 641, + "content": "The code sets the evaluation parameters and prepares a command to evaluate a model using specified inputs. If the MODE includes \"lite_infer\" and train_param_key1 is not null, it appends additional parameters to the command. Finally, it runs the command and checks the status of the evaluation.", + "type": "comment" + }, + "8738": { + "file_id": 641, + "content": " set_export_weight=$(func_set_params \"${export_weight}\" \"${save_log}/${real_model_name}_epoch_00001.pdparams\")\n set_save_infer_key=$(func_set_params \"${save_infer_key}\" \"${save_log}\")\n export_log_path=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log\"\n export_cmd=\"${python} ${run_export} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 \"\n eval $export_cmd\n status_check $? \"${export_cmd}\" \"${status_log}\" \"${model_name}\" \"${export_log_path}\"\n #run inference\n eval $env\n save_infer_path=\"${save_log}\"\n if [ ${inference_dir} != \"null\" ] && [ ${inference_dir} != '##' ]; then\n infer_model_dir=${save_infer_path}\n else\n infer_model_dir=${save_infer_path}\n fi\n func_inference ", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:410-426" + }, + "8739": { + "file_id": 641, + "content": "This code is setting up variables for exporting weights, saving inference key, and defining the export command. It then executes the export command and checks its status before running inference. If an inference directory is provided, it sets the inference model directory accordingly. 
Finally, it calls a function for inference processing.", + "type": "comment" + }, + "8740": { + "file_id": 641, + "content": "\"${python}\" \"${inference_py}\" \"${infer_model_dir}\" \"${LOG_PATH}\" \"${infer_video_dir}\" \"${flag_quant}\" \"${gpu}\"\n eval \"unset CUDA_VISIBLE_DEVICES\"\n fi\n done # done with: for trainer in ${trainer_list[*]}; do\n done # done with: for autocast in ${autocast_list[*]}; do\n done # done with: for gpu in ${gpu_list[*]}; do\nfi # end if [ ${MODE} = \"infer\" ]; then", + "type": "code", + "location": "/test_tipc/test_train_inference_python.sh:426-433" + }, + "8741": { + "file_id": 641, + "content": "This code snippet is a bash script that iterates through trainers, autocast options, and GPUs. It sets CUDA_VISIBLE_DEVICES to empty if the current mode is inference. The purpose is likely to set up an environment for training or inference based on different configurations.", + "type": "comment" + }, + "8742": { + "file_id": 642, + "content": "/test_tipc/test_train_inference_python_npu.sh", + "type": "filepath" + }, + "8743": { + "file_id": 642, + "content": "The script updates a configuration file for NPU use, disables MKLDNN on non-x86_64, sets Python to 3.9 for NPU support, and changes the execution script from \"gpu\" to \"npu\". It executes a bash script using eval with the command stored in variable 'cmd'.", + "type": "summary" + }, + "8744": { + "file_id": 642, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nfunction readlinkf() {\n perl -MCwd -e \"print Cwd::abs_path shift\" \"$1\";\n}\nfunction func_parser_config() {\n strs=$1\n IFS=\" \"\n array=(${strs})\n tmp=${array[2]}\n echo ${tmp}\n}\nBASEDIR=$(dirname \"$0\")\nREPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../)\nFILENAME=$1\n# disable mkldnn on non x86_64 env\narch=$(uname -i)\nif [ $arch != \"x86_64\" ]; then\n sed -i \"s/--enable_mkldnn:True|False/--enable_mkldnn:False/g\" $FILENAME\n sed -i \"s/--enable_mkldnn:True/--enable_mkldnn:False/g\" $FILENAME\nfi\n# change gpu to npu in tipc txt configs\nsed -i \"s/use_gpu/use_npu/g\" $FILENAME\n# disable benchmark as AutoLog required nvidia-smi command\nsed -i \"s/--enable_benchmark:True/--enable_benchmark:False/g\" $FILENAME\n# python has been updated to version 3.9 for npu backend\nsed -i \"s/python3.7/python3.9/g\" $FILENAME\ndataline=`cat $FILENAME`\n# change gpu to npu in execution script\nsed -i \"s/\\\"gpu\\\"/\\\"npu\\\"/g\" test_tipc/test_train_inference_python.sh\n# pass parameters to test_train_inference_python.sh", + "type": "code", + "location": "/test_tipc/test_train_inference_python_npu.sh:1-39" + }, + "8745": { + "file_id": 642, + "content": "This script modifies a configuration file to use NPU instead of GPU, disables MKLDNN on non-x86_64 environments, and updates the Python version to 3.9 for NPU backend support. It also changes the execution script from using \"gpu\" to \"npu\".", + "type": "comment" + }, + "8746": { + "file_id": 642, + "content": "cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $2\"\necho -e \"\\033[1;32m Started to run command: ${cmd}! 
\\033[0m\"\neval $cmd", + "type": "code", + "location": "/test_tipc/test_train_inference_python_npu.sh:40-42" + }, + "8747": { + "file_id": 642, + "content": "The code is executing a bash script, storing the command in variable 'cmd', printing its execution status, and then running it using eval.", + "type": "comment" + }, + "8748": { + "file_id": 643, + "content": "/test_tipc/test_train_inference_python_xpu.sh", + "type": "filepath" + }, + "8749": { + "file_id": 643, + "content": "The script modifies PaddleVideo configuration to use XPU, disables benchmarking, and updates the execution script for Python 3.9 NPU backend. The code logs the execution start after running a bash command with specified parameters.", + "type": "summary" + }, + "8750": { + "file_id": 643, + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nfunction readlinkf() {\n perl -MCwd -e \"print Cwd::abs_path shift\" \"$1\";\n}\nfunction func_parser_config() {\n strs=$1\n IFS=\" \"\n array=(${strs})\n tmp=${array[2]}\n echo ${tmp}\n}\nBASEDIR=$(dirname \"$0\")\nREPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../)\nFILENAME=$1\n# disable mkldnn on non x86_64 env\narch=$(uname -i)\nif [ $arch != \"x86_64\" ]; then\n sed -i \"s/--enable_mkldnn:True|False/--enable_mkldnn:False/g\" $FILENAME\n sed -i \"s/--enable_mkldnn:True/--enable_mkldnn:False/g\" $FILENAME\nfi\n# change gpu to xpu in tipc txt configs\nsed -i \"s/use_gpu/use_xpu/g\" $FILENAME\n# disable benchmark as AutoLog required nvidia-smi command\nsed -i \"s/--enable_benchmark:True/--enable_benchmark:False/g\" $FILENAME\n# python has been updated to version 3.9 for npu backend\nsed -i \"s/python3.7/python3.9/g\" $FILENAME\ndataline=`cat $FILENAME`\n# change gpu to xpu in execution script\nsed -i \"s/\\\"gpu\\\"/\\\"xpu\\\"/g\" test_tipc/test_train_inference_python.sh\n# pass parameters to test_train_inference_python.sh", + "type": "code", + "location": "/test_tipc/test_train_inference_python_xpu.sh:1-39" + }, + "8751": { + "file_id": 643, + "content": "This script changes the configuration file for PaddleVideo to use XPU instead of GPU, disables benchmarking and uses Python 3.9 for NPU backend, and updates the test_train_inference_python.sh execution script to use \"xpu\" instead of \"gpu\".", + "type": "comment" + }, + "8752": { + "file_id": 643, + "content": "cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $2\"\necho -e \"\\033[1;32m Started to run command: ${cmd}! \\033[0m\"\neval $cmd", + "type": "code", + "location": "/test_tipc/test_train_inference_python_xpu.sh:40-42" + }, + "8753": { + "file_id": 643, + "content": "This code executes a bash command with specified parameters, logging the start of execution.", + "type": "comment" + }, + "8754": { + "file_id": 644, + "content": "/tools/__init__.py", + "type": "filepath" + }, + "8755": { + "file_id": 644, + "content": "This code block is importing modules and defining the contents of the package. It sets __all__ to include 'utils', 'PaddleVideo', and 'ava_predict'. The code block also includes copyright, license information, and imports from different . files within the package.", + "type": "summary" + }, + "8756": { + "file_id": 644, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n__all__ = ['utils', 'PaddleVideo', 'ava_predict']\nfrom . import utils\nfrom .wheel import PaddleVideo\nfrom . import ava_predict", + "type": "code", + "location": "/tools/__init__.py:1-19" + }, + "8757": { + "file_id": 644, + "content": "This code block is importing modules and defining the contents of the package. It sets __all__ to include 'utils', 'PaddleVideo', and 'ava_predict'. The block also carries the copyright/license header and performs the corresponding relative imports from the package's submodules (including PaddleVideo from the wheel module).", + "type": "comment" + }, + "8758": { + "file_id": 645, + "content": "/tools/ava_predict.py", + "type": "filepath" + }, + "8759": { + "file_id": 645, + "content": "The code establishes paths, defines functions for AVA model in PaddleVideo with OpenCV, creates a video analysis model, extracts frames, predicts label scores, detects humans, performs inference, and identifies spatio-temporal actions.", + "type": "summary" + }, + "8760": { + "file_id": 645, + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport paddle\nimport os, sys\nimport copy as cp\nimport cv2\nimport math\ntry:\n import ppdet\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [paddledet] package and it's dependencies is required for AVA.\"\n )\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom paddlevideo.modeling.builder import build_model\nfrom paddlevideo.utils import get_config", + "type": "code", + "location": "/tools/ava_predict.py:1-32" + }, + "8761": { + "file_id": 645, + "content": "This code is a Python script for the AVA (spatio-temporal action detection) model in PaddleVideo. 
It imports necessary libraries, checks for missing dependencies, and sets up paths for model building.", + "type": "comment" + }, + "8762": { + "file_id": 645, + "content": "from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline\nfrom paddlevideo.metrics.ava_utils import read_labelmap\nimport time\nfrom os import path as osp\nimport numpy as np\nfrom paddlevideo.utils import get_config\nimport pickle\nfrom paddlevideo.utils import (get_logger, load, mkdir, save)\nimport shutil\nFONTFACE = cv2.FONT_HERSHEY_DUPLEX\nFONTSCALE = 0.5\nFONTCOLOR = (255, 255, 255) # BGR, white\nMSGCOLOR = (128, 128, 128) # BGR, gray\nTHICKNESS = 1\nLINETYPE = 1\ndef hex2color(h):\n \"\"\"Convert the 6-digit hex string to tuple of 3 int value (RGB)\"\"\"\n return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))\nplate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'\nplate_blue = plate_blue.split('-')\nplate_blue = [hex2color(h) for h in plate_blue]\nplate_green = '004b23-006400-007200-008000-38b000-70e000'\nplate_green = plate_green.split('-')\nplate_green = [hex2color(h) for h in plate_green]\ndef abbrev(name):\n \"\"\"Get the abbreviation of label name:\n 'take (an object) from (a person)' -> 'take ... from ...'", + "type": "code", + "location": "/tools/ava_predict.py:33-68" + }, + "8763": { + "file_id": 645, + "content": "This code snippet is a part of the PaddleVideo library. It defines several color schemes and abbreviation functions related to video analysis tasks. The color schemes are used for annotations, while the abbreviation function is for simplifying label names in the AVA dataset.", + "type": "comment" + }, + "8764": { + "file_id": 645, + "content": " \"\"\"\n while name.find('(') != -1:\n st, ed = name.find('('), name.find(')')\n name = name[:st] + '...' + name[ed + 1:]\n return name\n# annotations is pred results\ndef visualize(frames, annotations, plate=plate_blue, max_num=5):\n \"\"\"Visualize frames with predicted annotations.\n Args:\n frames (list[np.ndarray]): Frames for visualization, note that\n len(frames) % len(annotations) should be 0.\n annotations (list[list[tuple]]): The predicted results.\n plate (str): The plate used for visualization. Default: plate_blue.\n max_num (int): Max number of labels to visualize for a person box.\n Default: 5,目前不能大于5.\n Returns:\n list[np.ndarray]: Visualized frames.\n \"\"\"\n assert max_num + 1 <= len(plate)\n plate = [x[::-1] for x in plate]\n frames_ = cp.deepcopy(frames)\n nf, na = len(frames), len(annotations)\n assert nf % na == 0\n nfpa = len(frames) // len(annotations)\n anno = None\n h, w, _ = frames[0].shape\n # proposals被归一化需要还原真实坐标值", + "type": "code", + "location": "/tools/ava_predict.py:69-98" + }, + "8765": { + "file_id": 645, + "content": "This function visualizes frames with predicted annotations, requiring the number of frames and annotations to be multiples. It asserts that the max_num is less than or equal to the length of the plate used for visualization and ensures that frames are a deep copy before processing. The assertions check if the number of frames is divisible by the number of annotations, and calculates the number of frames per annotation. 
The function also initializes the annotation variable and stores the image height and width for later use.", + "type": "comment" + }, + "8766": { + "file_id": 645, + "content": " scale_ratio = np.array([w, h, w, h])\n for i in range(na):\n anno = annotations[i]\n if anno is None:\n continue\n for j in range(nfpa):\n ind = i * nfpa + j\n frame = frames_[ind]\n for ann in anno:\n box = ann[0]\n label = ann[1]\n if not len(label):\n continue\n score = ann[2]\n box = (box * scale_ratio).astype(np.int64)\n st, ed = tuple(box[:2]), tuple(box[2:])\n cv2.rectangle(frame, st, ed, plate[0], 2)\n for k, lb in enumerate(label):\n if k >= max_num:\n break\n text = abbrev(lb)\n text = ': '.join([text, str(score[k])])\n location = (0 + st[0], 18 + k * 18 + st[1])\n textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,\n THICKNESS)[0]\n textwidth = textsize[0]", + "type": "code", + "location": "/tools/ava_predict.py:99-125" + }, + "8767": { + "file_id": 645, + "content": "This code is iterating through annotations and frames, scaling box coordinates based on image size, drawing rectangles around objects in frames using OpenCV, and displaying labels above the rectangles with their corresponding scores.", + "type": "comment" + }, + "8768": { + "file_id": 645, + "content": " diag0 = (location[0] + textwidth, location[1] - 14)\n diag1 = (location[0], location[1] + 2)\n cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)\n cv2.putText(frame, text, location, FONTFACE, FONTSCALE,\n FONTCOLOR, THICKNESS, LINETYPE)\n return frames_\ndef frame_extraction(video_path, target_dir):\n \"\"\"Extract frames given video_path.\n Args:\n video_path (str): The video_path.\n \"\"\"\n if not os.path.exists(target_dir):\n os.makedirs(target_dir, exist_ok=True)\n # Should be able to handle videos up to several hours\n frame_tmpl = osp.join(target_dir, '{:05d}.jpg')\n vid = cv2.VideoCapture(video_path)\n FPS = int(vid.get(5))\n frames = []\n frame_paths = []\n flag, frame = vid.read()\n index = 1\n while flag:\n frames.append(frame)\n frame_path = frame_tmpl.format(index)\n frame_paths.append(frame_path)\n cv2.imwrite(frame_path, frame)\n index += 1", + "type": "code", + "location": "/tools/ava_predict.py:126-160" + }, + "8769": { + "file_id": 645, + "content": "This code is part of the \"ava_predict.py\" file in the PaddleVideo library. It defines a function called \"frame_extraction\" that takes a video path and target directory as arguments. The function extracts frames from the given video_path and saves them to the specified target directory. It reads each frame of the video, appends it to the \"frames\" list, writes it to disk using cv2.imwrite, and increments the index for frame naming. The target directory is created if it doesn't exist already. 
This function handles videos with a maximum length of several hours, as indicated by the FPS (Frames Per Second) value obtained from the video.", + "type": "comment" + }, + "8770": { + "file_id": 645, + "content": " flag, frame = vid.read()\n return frame_paths, frames, FPS\ndef parse_args():\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument('--video_path', help='video file/url')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('-w',\n '--weights',\n type=str,\n help='weights for finetuning or testing')\n #detection_model_name\n parser.add_argument('--detection_model_name',\n help='the name of detection model ')", + "type": "code", + "location": "/tools/ava_predict.py:161-191" + }, + "8771": { + "file_id": 645, + "content": "This code is for running PaddleVideo inference model. It takes a video file or URL, config file path, and overrides options as input parameters. The model can be finetuned or tested using specified weights. The detection model name is also an optional parameter.", + "type": "comment" + }, + "8772": { + "file_id": 645, + "content": " # detection_model_weights\n parser.add_argument('--detection_model_weights',\n help='the weights path of detection model ')\n # params for predict\n parser.add_argument('--out-filename',\n default='ava_det_demo.mp4',\n help='output filename')\n parser.add_argument('--predict-stepsize',\n default=8,\n type=int,\n help='give out a prediction per n frames')\n parser.add_argument(\n '--output-stepsize',\n default=4,\n type=int,\n help=('show one frame per n frames in the demo, we should have: '\n 'predict_stepsize % output_stepsize == 0'))\n parser.add_argument('--output-fps',\n default=6,\n type=int,\n help='the fps of demo video output')\n return parser.parse_args()\n# 一帧的结果。根据概率大小进行排序\ndef pack_result(human_detection, result):\n \"\"\"Short summary.\n Args:\n human_detection (np.ndarray): Human detection result.", + "type": "code", + "location": "/tools/ava_predict.py:192-222" + }, + "8773": { + "file_id": 645, + "content": "This code is parsing arguments for the ava_predict function, including detection model weights path, output filename, predict step size, output step size, and output FPS. 
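To make the constraint between the two step sizes concrete, here is a small worked example using the defaults shown above (predict_stepsize=8, output_stepsize=4); the `dense_n` name mirrors the variable computed later in `main()`.

```python
# Worked example of the "predict_stepsize % output_stepsize == 0" requirement.
predict_stepsize = 8   # a prediction is produced every 8 frames
output_stepsize = 4    # one of every 4 frames is rendered in the demo video
assert predict_stepsize % output_stepsize == 0
dense_n = predict_stepsize // output_stepsize
print(dense_n)  # -> 2 visualized frames per prediction window
```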
The pack_result function combines human detection results with a given result, sorting them by probability size.", + "type": "comment" + }, + "8774": { + "file_id": 645, + "content": " result (type): The predicted label of each human proposal.\n Returns:\n tuple: Tuple of human proposal, label name and label score.\n \"\"\"\n results = []\n if result is None:\n return None\n for prop, res in zip(human_detection, result):\n res.sort(key=lambda x: -x[1])\n results.append((prop, [x[0] for x in res], [x[1] for x in res]))\n return results\n# 构造数据处理需要的results\ndef get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS):\n result = {}\n result[\"frame_dir\"] = frame_dir\n frame_num = len(os.listdir(frame_dir))\n dir_name = frame_dir.split(\"/\")[-1]\n result[\"video_id\"] = dir_name\n result['timestamp'] = timestamp\n timestamp_str = '{:04d}'.format(timestamp)\n img_key = dir_name + \",\" + timestamp_str\n result['img_key'] = img_key\n result['shot_info'] = (1, frame_num)\n result['fps'] = FPS\n result['suffix'] = '{:05}.jpg'\n result['timestamp_start'] = 1\n result['timestamp_end'] = int(frame_num / result['fps'])\n return result", + "type": "code", + "location": "/tools/ava_predict.py:223-264" + }, + "8775": { + "file_id": 645, + "content": "This function takes the predicted label of each human proposal and returns a tuple containing the human proposal, label name, and label score. It also constructs data processing results for frame directory, timestamp, clip length, frame interval, and frames per second.", + "type": "comment" + }, + "8776": { + "file_id": 645, + "content": "def detection_inference(frame_paths, output_dir, model_name, weights_path):\n \"\"\"Detect human boxes given frame paths.\n Args:\n frame_paths (list[str]): The paths of frames to do detection inference.\n Returns:\n list[np.ndarray]: The human detection results.\n \"\"\"\n detection_cfg = ppdet.model_zoo.get_config_file(model_name)\n detection_cfg = ppdet.core.workspace.load_config(detection_cfg)\n detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test')\n detection_trainer.load_weights(weights_path)\n print('Performing Human Detection for each frame')\n detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True)\n print(\"finish object detection\")\n results = []\n for frame_path in frame_paths:\n (file_dir, file_name) = os.path.split(frame_path)\n (file_path, ext) = os.path.splitext(frame_path)\n txt_file_name = file_name.replace(ext, \".txt\")\n txt_path = os.path.join(output_dir, txt_file_name)\n results.append(txt_path)", + "type": "code", + "location": "/tools/ava_predict.py:267-294" + }, + "8777": { + "file_id": 645, + "content": "This function performs human detection on a list of frame paths using a specified model and weight file. It uses the trainer object to predict human boxes in each frame, saving the results as text files in the specified output directory. 
The function then returns a list of paths for these detection results.", + "type": "comment" + }, + "8778": { + "file_id": 645, + "content": " return results\ndef get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr):\n \"\"\"\n 根据检测结果文件得到图像中人的检测框(proposals)和置信度(scores)\n txt_file_path:检测结果存放路径\n img_h:图像高度\n img_w:图像宽度\n \"\"\"\n proposals = []\n scores = []\n with open(txt_file_path, 'r') as detection_file:\n lines = detection_file.readlines()\n for line in lines: # person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375\n items = line.split(\" \")\n if items[0] != 'person': #只要人\n continue\n score = items[1]\n if (float)(score) < person_det_score_thr:\n continue\n x1 = (float(items[2])) / img_w\n y1 = ((float)(items[3])) / img_h\n box_w = ((float)(items[4]))\n box_h = ((float)(items[5]))\n x2 = (float(items[2]) + box_w) / img_w\n y2 = (float(items[3]) + box_h) / img_h\n scores.append(score)\n proposals.append([x1, y1, x2, y2])\n return np.array(proposals), np.array(scores)", + "type": "code", + "location": "/tools/ava_predict.py:296-334" + }, + "8779": { + "file_id": 645, + "content": "This function reads a detection result file and returns the bounding box proposals (proposals) and corresponding scores for people in the image. It takes the path to the txt file, image height, and image width as input parameters. The function first splits the lines of the file and then checks each line to see if it corresponds to a person detection result. If so, it extracts the score and bounding box coordinates (x1, y1, x2, y2) for that object and adds them to separate lists, scores and proposals. Finally, it returns numpy arrays of the extracted proposals and scores.", + "type": "comment" + }, + "8780": { + "file_id": 645, + "content": "@paddle.no_grad()\ndef main(args):\n config = get_config(args.config, show=False) #parse config file\n # extract frames from video\n video_path = args.video_path\n frame_dir = 'tmp_frames'\n frame_paths, frames, FPS = frame_extraction(video_path, frame_dir)\n num_frame = len(frame_paths) #视频秒数*FPS\n assert num_frame != 0\n print(\"Frame Number:\", num_frame)\n # 帧图像高度和宽度\n h, w, _ = frames[0].shape\n # Get clip_len, frame_interval and calculate center index of each clip\n data_process_pipeline = build_pipeline(config.PIPELINE.test) #测试时输出处理流水配置\n clip_len = config.PIPELINE.test.sample['clip_len']\n assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = config.PIPELINE.test.sample['frame_interval']\n # 此处关键帧每秒取一个\n clip_len = config.PIPELINE.test.sample['clip_len']\n assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = config.PIPELINE.test.sample['frame_interval']\n window_size = clip_len * frame_interval\n timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2),", + "type": "code", + "location": "/tools/ava_predict.py:337-365" + }, + "8781": { + "file_id": 645, + "content": "This code function is extracting frames from a video, parsing config files, and setting up processing pipelines for testing. The frame extraction process involves specifying the input video path and output directory for storing frames. It calculates the number of frames in the video and ensures it's not zero. It asserts that clip_len and frame_interval are even numbers to create equal-sized clips. 
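The parsing just described can be condensed into a few lines; the sketch below mirrors the per-line logic of get_detection_result() under the assumption stated in its source comment that each line is "label score x1 y1 box_w box_h" in pixels. The threshold and image size used here are illustrative; the real threshold comes from the config's person_det_score_thr.

```python
import numpy as np

# Keep only "person" boxes above the threshold and normalize by image size,
# as in get_detection_result(). Example values below are made up for illustration.
def parse_detection_line(line, img_h, img_w, score_thr=0.6):
    label, score, x1, y1, bw, bh = line.split()
    if label != "person" or float(score) < score_thr:
        return None
    x1, y1, bw, bh = map(float, (x1, y1, bw, bh))
    proposal = np.array([x1 / img_w, y1 / img_h, (x1 + bw) / img_w, (y1 + bh) / img_h])
    return proposal, float(score)

print(parse_detection_line("person 0.98 100 50 200 300", img_h=1080, img_w=1920))
```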
Finally, it calculates the window size based on these parameters.", + "type": "comment" + }, + "8782": { + "file_id": 645, + "content": " args.predict_stepsize)\n print(\"timetamps number:\", len(timestamps))\n # get selected frame list according to timestamps\n selected_frame_list = []\n for timestamp in timestamps:\n selected_frame_list.append(frame_paths[timestamp - 1])\n # Load label_map\n label_map_path = config.DATASET.test['label_file']\n categories, class_whitelist = read_labelmap(open(label_map_path))\n label_map = {}\n for item in categories:\n id = item['id']\n name = item['name']\n label_map[id] = name\n # Construct model.\n if config.MODEL.backbone.get('pretrained'):\n config.MODEL.backbone.pretrained = '' # disable pretrain model init\n model = build_model(config.MODEL)\n model.eval()\n state_dicts = load(args.weights)\n model.set_state_dict(state_dicts)\n detection_result_dir = 'tmp_detection'\n detection_model_name = args.detection_model_name\n detection_model_weights = args.detection_model_weights\n detection_txt_list = detection_inference(selected_frame_list,", + "type": "code", + "location": "/tools/ava_predict.py:366-395" + }, + "8783": { + "file_id": 645, + "content": "This code snippet is parsing timestamps from a file, selecting frames based on those timestamps, loading a label map, constructing a model, and setting its state dictionary. The selected frames are passed to the `detection_inference` function which performs inference using the specified detection model with given weights.", + "type": "comment" + }, + "8784": { + "file_id": 645, + "content": " detection_result_dir,\n detection_model_name,\n detection_model_weights)\n assert len(detection_txt_list) == len(timestamps)\n print('Performing SpatioTemporal Action Detection for each clip')\n human_detections = []\n predictions = []\n index = 0\n for timestamp, detection_txt_path in zip(timestamps, detection_txt_list):\n proposals, scores = get_detection_result(\n detection_txt_path, h, w,\n (float)(config.DATASET.test['person_det_score_thr']))\n if proposals.shape[0] == 0:\n predictions.append(None)\n human_detections.append(None)\n continue\n human_detections.append(proposals)\n result = get_timestep_result(frame_dir,\n timestamp,\n clip_len,\n frame_interval,\n FPS=FPS)", + "type": "code", + "location": "/tools/ava_predict.py:396-421" + }, + "8785": { + "file_id": 645, + "content": "This code performs SpatioTemporal Action Detection for each clip. It first retrieves detection results from various txt files, ensuring their lengths match the timestamps. Then, it extracts human detections and predictions for each timestamp using get_detection_result() and get_timestep_result(). 
If there are no detections in a frame, None values are appended to the lists.", + "type": "comment" + }, + "8786": { + "file_id": 645, + "content": " result[\"proposals\"] = proposals\n result[\"scores\"] = scores\n new_result = data_process_pipeline(result)\n proposals = new_result['proposals']\n img_slow = new_result['imgs'][0]\n img_slow = img_slow[np.newaxis, :]\n img_fast = new_result['imgs'][1]\n img_fast = img_fast[np.newaxis, :]\n proposals = proposals[np.newaxis, :]\n scores = scores[np.newaxis, :]\n img_shape = np.asarray(new_result['img_shape'])\n img_shape = img_shape[np.newaxis, :]\n data = [\n paddle.to_tensor(img_slow, dtype='float32'),\n paddle.to_tensor(img_fast, dtype='float32'),\n paddle.to_tensor(proposals, dtype='float32'), scores,\n paddle.to_tensor(img_shape, dtype='int32')\n ]\n with paddle.no_grad():\n result = model(data, mode='infer')\n result = result[0]\n prediction = []\n person_num = proposals.shape[1]\n # N proposals\n for i in range(person_num):", + "type": "code", + "location": "/tools/ava_predict.py:422-455" + }, + "8787": { + "file_id": 645, + "content": "This code prepares input data for a model by converting images, proposals, and shapes to tensors. It then feeds the prepared data into the model in order mode='infer'. The output is stored in 'result' and used to generate predictions based on number of proposals.", + "type": "comment" + }, + "8788": { + "file_id": 645, + "content": " prediction.append([])\n # Perform action score thr\n for i in range(len(result)):\n if i + 1 not in class_whitelist:\n continue\n for j in range(person_num):\n if result[i][j, 4] > config.MODEL.head['action_thr']:\n prediction[j].append((label_map[i + 1], result[i][j,\n 4]))\n predictions.append(prediction)\n index = index + 1\n if index % 10 == 0:\n print(index, \"/\", len(timestamps))\n results = []\n for human_detection, prediction in zip(human_detections, predictions):\n results.append(pack_result(human_detection, prediction))\n def dense_timestamps(timestamps, n):\n \"\"\"Make it nx frames.\"\"\"\n old_frame_interval = (timestamps[1] - timestamps[0])\n start = timestamps[0] - old_frame_interval / n * (n - 1) / 2\n new_frame_inds = np.arange(\n len(timestamps) * n) * old_frame_interval / n + start", + "type": "code", + "location": "/tools/ava_predict.py:456-481" + }, + "8789": { + "file_id": 645, + "content": "This code performs action score thresholding for each detected person in the video. It appends labels and corresponding scores to a prediction list, then appends the predictions to a list of lists for all detected humans. 
The code also prints progress updates every 10 iterations, and finally, it creates denser timestamps using an older frame interval.", + "type": "comment" + }, + "8790": { + "file_id": 645, + "content": " return new_frame_inds.astype(np.int)\n dense_n = int(args.predict_stepsize / args.output_stepsize) #30\n frames = [\n cv2.imread(frame_paths[i - 1])\n for i in dense_timestamps(timestamps, dense_n)\n ]\n vis_frames = visualize(frames, results)\n try:\n import moviepy.editor as mpy\n except ImportError:\n raise ImportError('Please install moviepy to enable output file')\n vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],\n fps=args.output_fps)\n vid.write_videofile(args.out_filename)\n print(\"finish write !\")\n # delete tmp files and dirs\n shutil.rmtree(frame_dir)\n shutil.rmtree(detection_result_dir)\nif __name__ == '__main__':\n args = parse_args() #解析参数\n main(args)", + "type": "code", + "location": "/tools/ava_predict.py:482-509" + }, + "8791": { + "file_id": 645, + "content": "The code reads video frames, performs visualization, and writes the processed frames into a new video file. It requires moviepy to be installed for output functionality and deletes temporary files after use.", + "type": "comment" + }, + "8792": { + "file_id": 646, + "content": "/tools/export_model.py", + "type": "filepath" + }, + "8793": { + "file_id": 646, + "content": "This code defines functions for setting up imports, parsing command line arguments, and exporting PaddleVideo models. It includes model building, loading pretrained parameters, evaluating the model, providing input specifications, converting to static, saving, and printing saved model location.", + "type": "summary" + }, + "8794": { + "file_id": 646, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport os.path as osp\nimport sys\nimport paddle\nfrom paddle.jit import to_static\nfrom paddle.static import InputSpec\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom paddlevideo.modeling.builder import build_model\nfrom paddlevideo.utils import get_config\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo export model script\")", + "type": "code", + "location": "/tools/export_model.py:1-32" + }, + "8795": { + "file_id": 646, + "content": "This code snippet is the first 31 lines of the \"export_model.py\" file in PaddleVideo's tools directory. It sets up imports and defines a function parse_args(). This function uses argparse to create an argument parser for the script. 
The script seems to be part of a model exporting tool designed for PaddleVideo, possibly used for command line arguments.", + "type": "comment" + }, + "8796": { + "file_id": 646, + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument('--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument(\"-p\",\n \"--pretrained_params\",\n default='./best.pdparams',\n type=str,\n help='params path')\n parser.add_argument(\"-o\",\n \"--output_path\",\n type=str,\n default=\"./inference\",\n help='output path')\n parser.add_argument('--save_name',\n type=str,\n default=None,\n help='specify the exported inference \\\n files(pdiparams and pdmodel) name,\\", + "type": "code", + "location": "/tools/export_model.py:33-57" + }, + "8797": { + "file_id": 646, + "content": "This code block is parsing command line arguments to specify the config file path, pre-trained parameters path, override options, and output path for exporting a model. The exported files will include pdiparams and pdmodel.", + "type": "comment" + }, + "8798": { + "file_id": 646, + "content": " only used in TIPC')\n return parser.parse_args()\ndef trim_config(cfg):\n \"\"\"\n Reuse the trainging config will bring useless attributes, such as: backbone.pretrained model.\n and some build phase attributes should be overrided, such as: backbone.num_seg.\n Trim it here.\n \"\"\"\n model_name = cfg.model_name\n if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'):\n cfg.MODEL.backbone.pretrained = \"\" # not ued when inference\n # for distillation\n if cfg.MODEL.get('models'):\n if cfg.MODEL.models[0]['Teacher']['backbone'].get('pretrained'):\n cfg.MODEL.models[0]['Teacher']['backbone']['pretrained'] = \"\"\n if cfg.MODEL.models[1]['Student']['backbone'].get('pretrained'):\n cfg.MODEL.models[1]['Student']['backbone']['pretrained'] = \"\"\n return cfg, model_name\ndef get_input_spec(cfg, model_name):\n if model_name in ['ppTSM', 'TSM', 'MoViNet', 'ppTSMv2']:\n input_spec = [[\n InputSpec(\n shape=[None, cfg.num_seg, 3, cfg.target_size, cfg.target_size],", + "type": "code", + "location": "/tools/export_model.py:58-87" + }, + "8799": { + "file_id": 646, + "content": "This code appears to be involved in model exporting and configuration trimming. It defines three functions: \"export_model\" parses command line arguments, \"trim_config\" removes unused or unnecessary attributes from the configuration, and \"get_input_spec\" sets the input specification based on the given model name. 
The code seems to be a part of PaddleVideo library and involves several specific models such as TSM, MoViNet, ppTSM, and ppTSMv2.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/88.json b/docs/data/88.json new file mode 100644 index 000000000..c54bd30e6 --- /dev/null +++ b/docs/data/88.json @@ -0,0 +1,549 @@ +{ + "8800": { + "file_id": 646, + "content": " dtype='float32'),\n ]]\n elif model_name in ['TokenShiftVisionTransformer']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['TSN', 'ppTSN']:\n input_spec = [[\n InputSpec(shape=[\n None, cfg.num_seg * 10, 3, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['BMN']:\n input_spec = [[\n InputSpec(shape=[None, cfg.feat_dim, cfg.tscale],\n dtype='float32',\n name='feat_input'),\n ]]\n elif model_name in ['TimeSformer', 'ppTimeSformer']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['VideoSwin']:", + "type": "code", + "location": "/tools/export_model.py:88-117" + }, + "8801": { + "file_id": 646, + "content": "The code snippet defines different input specifications based on the model name. It checks the model name and sets the shape and dtype of the input accordingly, handling various models such as 'PaddleVideo', 'TokenShiftVisionTransformer', 'TSN', 'ppTSN', 'BMN', 'TimeSformer', and 'ppTimeSformer'. The input specifications define the dimensions for inputs like number of frames, number of segments, channels, and target size.", + "type": "comment" + }, + "8802": { + "file_id": 646, + "content": " input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * cfg.seg_len * 1, cfg.target_size,\n cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['VideoSwin_TableTennis']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * cfg.seg_len * 3, cfg.target_size,\n cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['AttentionLSTM']:\n input_spec = [[\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],\n dtype='float32'), # for rgb_data\n InputSpec(shape=[\n None,\n ], dtype='int64'), # for rgb_len\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],\n dtype='float32'), # for rgb_mask\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],\n dtype='float32'), # for audio_data", + "type": "code", + "location": "/tools/export_model.py:118-143" + }, + "8803": { + "file_id": 646, + "content": "The code is defining input specifications for different model names in the PaddleVideo tool. It uses InputSpec to specify the shape and data type of inputs for each model, with varying numbers of inputs based on the model's requirements (e.g., RGB data, audio data, etc.). 
This allows the export_model function to handle various models appropriately.", + "type": "comment" + }, + "8804": { + "file_id": 646, + "content": " InputSpec(shape=[\n None,\n ], dtype='int64'), # for audio_len\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],\n dtype='float32'), # for audio_mask\n ]]\n elif model_name in ['SlowFast']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_frames // cfg.alpha, cfg.target_size,\n cfg.target_size\n ],\n dtype='float32',\n name='slow_input'),\n InputSpec(shape=[\n None, 3, cfg.num_frames, cfg.target_size, cfg.target_size\n ],\n dtype='float32',\n name='fast_input'),\n ]]\n elif model_name in ['STGCN', 'AGCN', 'CTRGCN']:\n input_spec = [[\n InputSpec(shape=[\n None, cfg.num_channels, cfg.window_size, cfg.vertex_nums,\n cfg.person_nums\n ],\n dtype='float32'),\n ]]\n # 由于在模型运行过程中涉及到第一维乘human个数(N*M), 所以这里用1作为shape", + "type": "code", + "location": "/tools/export_model.py:144-172" + }, + "8805": { + "file_id": 646, + "content": "This code snippet defines input specifications for different models used in the PaddleVideo framework. It determines the shape and data type of inputs based on the model name provided, such as audio data for models like ResNet50, SlowFast, and temporal graph convolutional networks (TGCN) models like STGCN, AGCN, and CTRGCN. The shapes account for variables like number of frames, window size, and feature dimensions specific to each model.", + "type": "comment" + }, + "8806": { + "file_id": 646, + "content": " elif model_name in ['AGCN2s']:\n input_spec = [[\n InputSpec(shape=[\n 1, cfg.num_channels, cfg.window_size, cfg.vertex_nums,\n cfg.person_nums\n ],\n dtype='float32'),\n ]]\n elif model_name in ['TransNetV2']:\n input_spec = [[\n InputSpec(shape=[\n None,\n cfg.num_frames,\n cfg.height,\n cfg.width,\n cfg.num_channels,\n ],\n dtype='float32'),\n ]]\n elif model_name in ['MSTCN', 'ASRF']:\n input_spec = [[\n InputSpec(shape=[None, cfg.num_channels, None], dtype='float32'),\n ]]\n elif model_name in ['ADDS']:\n input_spec = [[\n InputSpec(shape=[None, cfg.num_channels, cfg.height, cfg.width],\n dtype='float32'),\n ]]\n elif model_name in ['AVA_SlowFast_FastRcnn']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_frames // cfg.alpha, cfg.target_size,", + "type": "code", + "location": "/tools/export_model.py:173-204" + }, + "8807": { + "file_id": 646, + "content": "The code defines different input specifications for various model names. It handles models like AGCN2s, TransNetV2, MSTCN, ASRF, ADDs, and AVA_SlowFast_FastRcnn by specifying the shape of the input data and its data type ('float32'). 
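For readers who want to see how such an input spec is consumed, here is a minimal, self-contained sketch of the export flow (to_static plus paddle.jit.save) that this script performs; ToyNet, num_seg=8 and target_size=224 are stand-ins for build_model(cfg.MODEL) and the real config values, not code from the repository.

```python
import paddle
from paddle.jit import to_static
from paddle.static import InputSpec

class ToyNet(paddle.nn.Layer):
    """Placeholder for the network returned by build_model(cfg.MODEL)."""
    def __init__(self, num_classes=400):
        super().__init__()
        self.fc = paddle.nn.Linear(3, num_classes)

    def forward(self, inputs):                 # exported models receive a list of tensors
        x = inputs[0]                          # [N, num_seg, 3, H, W]
        feat = paddle.mean(x, axis=[1, 3, 4])  # crude pooling over time/space -> [N, 3]
        return self.fc(feat)

num_seg, target_size = 8, 224                  # assumed example values
input_spec = [[
    InputSpec(shape=[None, num_seg, 3, target_size, target_size], dtype='float32'),
]]

model = ToyNet()
model.eval()
static_model = to_static(model, input_spec=input_spec)
paddle.jit.save(static_model, "./inference/ppTSM")  # writes ppTSM.pdmodel / ppTSM.pdiparams
```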
The shapes are defined according to the specific model's input requirements.", + "type": "comment" + }, + "8808": { + "file_id": 646, + "content": " cfg.target_size\n ],\n dtype='float32',\n name='slow_input'),\n InputSpec(shape=[\n None, 3, cfg.num_frames, cfg.target_size, cfg.target_size\n ],\n dtype='float32',\n name='fast_input'),\n InputSpec(shape=[None, None, 4], dtype='float32', name='proposals'),\n InputSpec(shape=[None, 2], dtype='float32', name='img_shape')\n ]]\n elif model_name in ['PoseC3D']:\n input_spec = [[\n InputSpec(shape=[None, 1, 17, 48, 56, 56], dtype='float32'),\n ]]\n elif model_name in ['YOWO']:\n input_spec = [[\n InputSpec(shape=[\n 1, 3, cfg.num_seg, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n return input_spec\ndef main():\n args = parse_args()\n cfg, model_name = trim_config(\n get_config(args.config, overrides=args.override, show=False))\n print(f\"Building model({model_name})...\")", + "type": "code", + "location": "/tools/export_model.py:205-236" + }, + "8809": { + "file_id": 646, + "content": "This code defines a function that takes in a model name and returns the input specification for different models. The input specification determines the shape, dtype, and name of the input tensors for each model. Different models have different input specifications based on their architecture and requirements. The returned input specification is used to build the model correctly.", + "type": "comment" + }, + "8810": { + "file_id": 646, + "content": " model = build_model(cfg.MODEL)\n assert osp.isfile(\n args.pretrained_params\n ), f\"pretrained params ({args.pretrained_params} is not a file path.)\"\n if not os.path.isdir(args.output_path):\n os.makedirs(args.output_path)\n print(f\"Loading params from ({args.pretrained_params})...\")\n params = paddle.load(args.pretrained_params)\n model.set_dict(params)\n model.eval()\n # for rep nets\n for layer in model.sublayers():\n if hasattr(layer, \"rep\") and not getattr(layer, \"is_repped\"):\n layer.rep()\n input_spec = get_input_spec(cfg.INFERENCE, model_name)\n model = to_static(model, input_spec=input_spec)\n paddle.jit.save(\n model,\n osp.join(args.output_path,\n model_name if args.save_name is None else args.save_name))\n print(\n f\"model ({model_name}) has been already saved in ({args.output_path}).\")\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/tools/export_model.py:237-267" + }, + "8811": { + "file_id": 646, + "content": "Building the model, checking pretrained params are a file path, creating output directory if necessary, loading pretrained params, setting parameters to the model, evaluating the model, reppping layers if required, getting input specification, converting model to static, saving model with a specified name, and printing saved model location.", + "type": "comment" + }, + "8812": { + "file_id": 647, + "content": "/tools/predict.py", + "type": "filepath" + }, + "8813": { + "file_id": 647, + "content": "The code sets command-line arguments for Paddle Video tool, configures the predictor, supports GPU/NPU usage, and utilizes TensorRT engine with YOWO model. It creates a directory, preprocesses data, performs inference, post-processes output, benchmarks, and guides users to install \"auto_log\".", + "type": "summary" + }, + "8814": { + "file_id": 647, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nfrom os import path as osp\nimport paddle\nfrom paddle import inference\nfrom paddle.inference import Config, create_predictor\nfrom utils import build_inference_helper\nfrom paddlevideo.utils import get_config\ndef parse_args():\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")\n parser.add_argument(", + "type": "code", + "location": "/tools/predict.py:1-32" + }, + "8815": { + "file_id": 647, + "content": "Code snippet is an import-heavy function definition. It begins with a lengthy comment mentioning the copyright and license details, followed by multiple imports from various modules. The only executable code present is the \"parse_args\" function definition. This function uses argparse to create a parser for general parameters of PaddleVideo Inference model script.", + "type": "comment" + }, + "8816": { + "file_id": 647, + "content": " '-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument(\n '-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument(\"-i\", \"--input_file\", type=str, help=\"input file path\")\n parser.add_argument(\n \"--time_test_file\",\n type=str2bool,\n default=False,\n help=\"whether input time test file\")\n parser.add_argument(\"--model_file\", type=str)\n parser.add_argument(\"--params_file\", type=str)\n # params for paddle predict\n parser.add_argument(\"-b\", \"--batch_size\", type=int, default=1)\n parser.add_argument(\"--use_gpu\", type=str2bool, default=True)\n parser.add_argument(\"--use_xpu\", type=str2bool, default=False)\n parser.add_argument(\"--use_npu\", type=str2bool, default=False)\n parser.add_argument(\"--precision\", type=str, default=\"fp32\")\n parser.add_argument(\"--ir_optim\", type=str2bool, default=True)", + "type": "code", + "location": "/tools/predict.py:33-59" + }, + "8817": { + "file_id": 647, + "content": "The code defines command-line arguments for a Paddle Video tool. It allows the user to specify the config file, input file, model and parameters files, batch size, and GPU/XPU usage. 
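The `str2bool` helper mentioned above works around `argparse` treating any non-empty string as truthy. A tiny standalone illustration (the flag name is arbitrary):

```python
# Minimal illustration of the str2bool pattern used by parse_args().
import argparse


def str2bool(v: str) -> bool:
    return v.lower() in ("true", "t", "1")


parser = argparse.ArgumentParser("demo")
parser.add_argument("--use_gpu", type=str2bool, default=True)
args = parser.parse_args(["--use_gpu", "False"])
print(args.use_gpu)  # False, because "false" is not in the accepted set
```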
The `str2bool` type converts string inputs to boolean values.", + "type": "comment" + }, + "8818": { + "file_id": 647, + "content": " parser.add_argument(\"--use_tensorrt\", type=str2bool, default=False)\n parser.add_argument(\"--gpu_mem\", type=int, default=8000)\n parser.add_argument(\"--enable_benchmark\", type=str2bool, default=False)\n parser.add_argument(\"--enable_mkldnn\", type=str2bool, default=False)\n parser.add_argument(\"--cpu_threads\", type=int, default=None)\n parser.add_argument(\"--disable_glog\", type=str2bool, default=False)\n # parser.add_argument(\"--hubserving\", type=str2bool, default=False) #TODO\n return parser.parse_args()\ndef create_paddle_predictor(args, cfg):\n config = Config(args.model_file, args.params_file)\n if args.use_gpu:\n config.enable_use_gpu(args.gpu_mem, 0)\n elif args.use_npu:\n config.enable_npu()\n elif args.use_xpu:\n config.enable_xpu()\n else:\n config.disable_gpu()\n if args.cpu_threads:\n config.set_cpu_math_library_num_threads(args.cpu_threads)\n if args.enable_mkldnn:\n # cache 10 different shapes for mkldnn to avoid memory leak", + "type": "code", + "location": "/tools/predict.py:60-84" + }, + "8819": { + "file_id": 647, + "content": "This code is parsing arguments to configure a Paddle video predictor. It adds various arguments for use_tensorrt, gpu_mem, enable_benchmark, enable_mkldnn, cpu_threads, and disable_glog. The create_paddle_predictor function creates a config object with the provided arguments, enabling GPU or NPU usage if specified, and disabling GPU if not. It also sets the number of CPU threads if provided, and enables MKLDNN if enabled.", + "type": "comment" + }, + "8820": { + "file_id": 647, + "content": " config.set_mkldnn_cache_capacity(10)\n config.enable_mkldnn()\n if args.precision == \"fp16\":\n config.enable_mkldnn_bfloat16()\n # config.disable_glog_info()\n config.switch_ir_optim(args.ir_optim) # default true\n if args.use_tensorrt:\n # choose precision\n if args.precision == \"fp16\":\n precision = inference.PrecisionType.Half\n elif args.precision == \"int8\":\n precision = inference.PrecisionType.Int8\n else:\n precision = inference.PrecisionType.Float32\n # calculate real max batch size during inference when tenrotRT enabled\n max_batch_size = args.batch_size\n if 'num_seg' in cfg.INFERENCE:\n # num_seg: number of segments when extracting frames.\n # seg_len: number of frames extracted within a segment, default to 1.\n # num_views: the number of video frame groups obtained by cropping and flipping,\n # uniformcrop=3, tencrop=10, centercrop=1.", + "type": "code", + "location": "/tools/predict.py:85-107" + }, + "8821": { + "file_id": 647, + "content": "The code configures the PaddleVideo model for inference by setting the MKLDNN cache capacity, enabling MKLDNN and optionally BFloat16, disabling GLOG info, switching IR optim, and handling precision and batch size when TensorRT is enabled.", + "type": "comment" + }, + "8822": { + "file_id": 647, + "content": " num_seg = cfg.INFERENCE.num_seg\n seg_len = cfg.INFERENCE.get('seg_len', 1)\n num_views = 1\n if 'tsm' in cfg.model_name.lower():\n num_views = 1 # CenterCrop\n elif 'tsn' in cfg.model_name.lower():\n num_views = 10 # TenCrop\n elif 'timesformer' in cfg.model_name.lower():\n num_views = 3 # UniformCrop\n elif 'videoswin' in cfg.model_name.lower():\n num_views = 3 # UniformCrop\n elif 'tokenshift' in cfg.model_name.lower():\n num_views = 3 # UniformCrop\n max_batch_size = args.batch_size * num_views * num_seg * seg_len\n config.enable_tensorrt_engine(\n 
precision_mode=precision, max_batch_size=max_batch_size)\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n # disable glog\n if args.disable_glog:\n config.disable_glog_info()\n # for ST-GCN tensorRT case usage\n # config.delete_pass(\"shuffle_channel_detect_pass\")", + "type": "code", + "location": "/tools/predict.py:108-134" + }, + "8823": { + "file_id": 647, + "content": "The code sets the number of segments and views based on the model name, calculates the maximum batch size, enables TensorRT engine with specified precision mode, enables memory optimization, disables glog if instructed to do so, and potentially deletes a pass for ST-GCN TensorRT case usage.", + "type": "comment" + }, + "8824": { + "file_id": 647, + "content": " predictor = create_predictor(config)\n return config, predictor\ndef parse_file_paths(input_path: str) -> list:\n if osp.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))\n ]\n files = [osp.join(input_path, file) for file in files]\n return files\ndef main():\n \"\"\"predict using paddle inference model\n \"\"\"\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override, show=False)\n model_name = cfg.model_name\n print(f\"Inference model({model_name})...\")\n InferenceHelper = build_inference_helper(cfg.INFERENCE)\n inference_config, predictor = create_paddle_predictor(args, cfg)\n # get input_tensor and output_tensor\n input_names = predictor.get_input_names()\n output_names = predictor.get_output_names()\n input_tensor_list = []\n output_tensor_list = []\n for item in input_names:", + "type": "code", + "location": "/tools/predict.py:136-173" + }, + "8825": { + "file_id": 647, + "content": "The code is implementing a main function for predicting using Paddle Inference model. It first parses arguments from command-line, then retrieves configuration and overrides for the inference task, prints an informative message, builds the inference helper, and creates paddle predictor with the given arguments and configuration. After this, it gets input and output names, initializes empty lists for input and output tensors, and iterates through input names to populate these lists.", + "type": "comment" + }, + "8826": { + "file_id": 647, + "content": " input_tensor_list.append(predictor.get_input_handle(item))\n for item in output_names:\n output_tensor_list.append(predictor.get_output_handle(item))\n # get the absolute file path(s) to be processed\n if model_name in [\"MSTCN\", \"ASRF\"]:\n files = InferenceHelper.get_process_file(args.input_file)\n else:\n files = parse_file_paths(args.input_file)\n if model_name == 'TransNetV2':\n for file in files:\n inputs = InferenceHelper.preprocess(file)\n outputs = []\n for input in inputs:\n # Run inference\n for i in range(len(input_tensor_list)):\n input_tensor_list[i].copy_from_cpu(input)\n predictor.run()\n output = []\n for j in range(len(output_tensor_list)):\n output.append(output_tensor_list[j].copy_to_cpu())\n outputs.append(output)\n # Post process output\n InferenceHelper.postprocess(outputs)\n elif model_name == 'AVA_SlowFast_FastRcnn':", + "type": "code", + "location": "/tools/predict.py:174-201" + }, + "8827": { + "file_id": 647, + "content": "The code is processing input files for a specific model and running inference. 
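The entries around this point describe how `predict.py` builds a `paddle.inference.Config` and drives the predictor through input/output handles. A condensed, hedged sketch of that flow is below; the model/params paths and the input shape are placeholders for whatever `export_model.py` produced.

```python
# Hedged sketch of the Paddle Inference flow described above.
# The .pdmodel/.pdiparams paths and the input shape are placeholders.
import numpy as np
from paddle.inference import Config, create_predictor

config = Config('./inference/ppTSM.pdmodel', './inference/ppTSM.pdiparams')
config.enable_use_gpu(8000, 0)           # or config.disable_gpu() on CPU
config.enable_memory_optim()
config.switch_use_feed_fetch_ops(False)  # zero-copy tensors

predictor = create_predictor(config)

# One handle per declared input/output.
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

fake_batch = np.random.rand(1, 8, 3, 224, 224).astype('float32')
input_handle.copy_from_cpu(fake_batch)
predictor.run()
scores = output_handle.copy_to_cpu()     # e.g. shape [1, num_classes]
print(scores.shape)
```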
For certain models, it preprocesses the input files using InferenceHelper and then runs inference by setting input tensors and calling predictor.run(). Finally, if the model is AVA_SlowFast_FastRcnn, it post-processes the output.", + "type": "comment" + }, + "8828": { + "file_id": 647, + "content": " for file in files: # for videos\n inputs = InferenceHelper.preprocess(file)\n outputs = []\n for input in inputs:\n # Run inference\n input_len = len(input_tensor_list)\n for i in range(input_len):\n if type(input[i]) == paddle.Tensor:\n input_tmp = input[i].numpy()\n else:\n input_tmp = input[i]\n input_tensor_list[i].copy_from_cpu(input_tmp)\n predictor.run()\n output = []\n for j in range(len(output_tensor_list)):\n output.append(output_tensor_list[j].copy_to_cpu())\n outputs.append(output)\n # Post process output\n InferenceHelper.postprocess(outputs)\n elif model_name == 'YOWO':\n for file in files: # for videos\n (_, filename) = os.path.split(file)\n (filename, _) = os.path.splitext(filename)\n save_dir = osp.join('inference', 'YOWO_infer')", + "type": "code", + "location": "/tools/predict.py:202-227" + }, + "8829": { + "file_id": 647, + "content": "Iterates through each video file in the list. \nPreprocesses the input data for a model. \nRuns inference for each input, copying CPU memory. \nStores the output of each run. \nPost processes the outputs using InferenceHelper function. For YOWO model, also does filename operations and saves results to specified directory.", + "type": "comment" + }, + "8830": { + "file_id": 647, + "content": " if not osp.exists('inference'):\n os.mkdir('inference')\n if not osp.exists(save_dir):\n os.mkdir(save_dir)\n save_path = osp.join(save_dir, filename)\n if not osp.exists(save_path):\n os.mkdir(save_path)\n inputs, frames = InferenceHelper.preprocess(file)\n for idx, input in enumerate(inputs):\n # Run inference\n outputs = []\n input_len = len(input_tensor_list)\n for i in range(input_len):\n input_tensor_list[i].copy_from_cpu(input[i])\n predictor.run()\n for j in range(len(output_tensor_list)):\n outputs.append(output_tensor_list[j].copy_to_cpu())\n # Post process output\n InferenceHelper.postprocess(outputs, frames[idx], osp.join(save_path, str(idx).zfill(3)))\n else:\n if args.enable_benchmark:\n num_warmup = 3\n # instantiate auto log", + "type": "code", + "location": "/tools/predict.py:228-251" + }, + "8831": { + "file_id": 647, + "content": "This code creates a directory and checks if the save path exists, then preprocesses input data for inference. 
It runs inference using a predictor, post-processes the output, and if benchmarking is enabled, it instantiates auto log.", + "type": "comment" + }, + "8832": { + "file_id": 647, + "content": " try:\n import auto_log\n except ImportError as e:\n print(f\"{e}, [git+https://github.com/LDOUBLEV/AutoLog] \"\n f\"package and it's dependencies is required for \"\n f\"python-inference when enable_benchmark=True.\")\n pid = os.getpid()\n autolog = auto_log.AutoLogger(\n model_name=cfg.model_name,\n model_precision=args.precision,\n batch_size=args.batch_size,\n data_shape=\"dynamic\",\n save_path=\"./output/auto_log.lpg\",\n inference_config=inference_config,\n pids=pid,\n process_name=None,\n gpu_ids=0 if args.use_gpu else None,\n time_keys=[\n 'preprocess_time', 'inference_time', 'postprocess_time'\n ],\n warmup=num_warmup)\n if not args.time_test_file:\n test_video_num = 15\n files = [args.input_file for _ in range(test_video_num)]", + "type": "code", + "location": "/tools/predict.py:252-275" + }, + "8833": { + "file_id": 647, + "content": "This code snippet attempts to import the \"auto_log\" package and if it fails, provides instructions on how to install it. Then, it creates an instance of AutoLogger, configuring various parameters like model name, batch size, data shape, etc., and specifies which timing metrics to track during inference. If no time test file is provided, the code sets the number of test videos to 15 and assigns all input files to these tests.", + "type": "comment" + }, + "8834": { + "file_id": 647, + "content": " else:\n f_input = open(args.input_file, 'r')\n files = [i.strip() for i in f_input.readlines()]\n test_video_num = len(files)\n f_input.close()\n # Inferencing process\n batch_num = args.batch_size\n for st_idx in range(0, len(files), batch_num):\n ed_idx = min(st_idx + batch_num, len(files))\n # auto log start\n if args.enable_benchmark:\n autolog.times.start()\n # Pre process batched input\n batched_inputs = InferenceHelper.preprocess_batch(\n files[st_idx:ed_idx])\n # get pre process time cost\n if args.enable_benchmark:\n autolog.times.stamp()\n # run inference\n for i in range(len(input_tensor_list)):\n input_tensor_list[i].copy_from_cpu(batched_inputs[i])\n predictor.run()\n batched_outputs = []\n for j in range(len(output_tensor_list)):\n batched_outputs.append(output_tensor_list[j].copy_to_cpu())", + "type": "code", + "location": "/tools/predict.py:276-306" + }, + "8835": { + "file_id": 647, + "content": "This code reads input files, processes them in batches, runs inference on a model, and collects output. 
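The benchmark path summarised above slices the file list into fixed-size batches before each run. A stripped-down version of that slicing loop follows; the three helper functions are hypothetical stand-ins for the real `InferenceHelper`/predictor calls.

```python
# Stripped-down sketch of the batched driver loop; preprocess_batch,
# run_inference and postprocess are hypothetical stand-ins.
from typing import List


def preprocess_batch(paths: List[str]) -> List[str]:
    return paths                                    # placeholder preprocessing


def run_inference(batch: List[str]) -> List[str]:
    return [f"pred({item})" for item in batch]      # placeholder model call


def postprocess(outputs: List[str]) -> None:
    print(outputs)                                  # placeholder reporting


files = [f"video_{i}.mp4" for i in range(7)]
batch_size = 3

for st_idx in range(0, len(files), batch_size):
    ed_idx = min(st_idx + batch_size, len(files))
    batch = preprocess_batch(files[st_idx:ed_idx])
    postprocess(run_inference(batch))
```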
It also supports benchmarking and logs processing times for each step.", + "type": "comment" + }, + "8836": { + "file_id": 647, + "content": " # get inference process time cost\n if args.enable_benchmark:\n autolog.times.stamp()\n InferenceHelper.postprocess(batched_outputs,\n not args.enable_benchmark)\n # get post process time cost\n if args.enable_benchmark:\n autolog.times.end(stamp=True)\n # time.sleep(0.01) # sleep for T4 GPU\n # report benchmark log if enabled\n if args.enable_benchmark:\n autolog.report()\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/tools/predict.py:308-327" + }, + "8837": { + "file_id": 647, + "content": "Enables benchmarking for inference time, processes outputs and records post-processing time, then reports the benchmark log if enabled.", + "type": "comment" + }, + "8838": { + "file_id": 648, + "content": "/tools/summary.py", + "type": "filepath" + }, + "8839": { + "file_id": 648, + "content": "The code imports libraries, defines a function for parsing command line arguments, and sets up paths and licenses before building the model using PaddleVideo. It initializes segments (num_seg) and summarizes the model's parameters while calculating FLOPs if enabled.", + "type": "summary" + }, + "8840": { + "file_id": 648, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport sys\nimport os.path as osp\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.jit import to_static\nimport paddleslim\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom paddlevideo.modeling.builder import build_model\nfrom paddlevideo.utils import get_config\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo Summary\")", + "type": "code", + "location": "/tools/summary.py:1-34" + }, + "8841": { + "file_id": 648, + "content": "This code snippet is importing necessary libraries and defining a function for parsing command line arguments in the PaddleVideo project. The code also sets up some paths and licenses, ensuring compliance with the Apache License, Version 2.0.", + "type": "comment" + }, + "8842": { + "file_id": 648, + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument(\"--img_size\", type=int, default=224)\n parser.add_argument(\"--num_seg\", type=int, default=8)\n parser.add_argument(\"--FLOPs\",\n action=\"store_true\",\n help=\"whether to print FLOPs\")\n return parser.parse_args()\ndef _trim(cfg, args):\n \"\"\"\n Reuse the trainging config will bring useless attribute, such as: backbone.pretrained model. 
Trim it here.\n \"\"\"\n model_name = cfg.model_name\n cfg = cfg.MODEL\n cfg.backbone.pretrained = \"\"\n if 'num_seg' in cfg.backbone:\n cfg.backbone.num_seg = args.num_seg\n return cfg, model_name\ndef main():\n args = parse_args()\n cfg, model_name = _trim(get_config(args.config, show=False), args)\n print(f\"Building model({model_name})...\")\n model = build_model(cfg)\n img_size = args.img_size", + "type": "code", + "location": "/tools/summary.py:35-69" + }, + "8843": { + "file_id": 648, + "content": "This code parses arguments for the config file path, image size, and number of segments. It then trims unnecessary attributes from the training configuration before building the model using the parsed arguments.", + "type": "comment" + }, + "8844": { + "file_id": 648, + "content": " num_seg = args.num_seg\n #NOTE: only support tsm now, will refine soon\n params_info = paddle.summary(model, (1, 1, num_seg, 3, img_size, img_size))\n print(params_info)\n if args.FLOPs:\n flops_info = paddleslim.analysis.flops(\n model, [1, 1, num_seg, 3, img_size, img_size])\n print(flops_info)\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/tools/summary.py:70-82" + }, + "8845": { + "file_id": 648, + "content": "This code snippet initializes the number of segments (num_seg) and currently only supports tsm. It generates a summary of the model's parameters using Paddle's summary function, and if FLOPs is enabled, it also calculates and prints the model's floating-point operations using paddleslim's analysis.flops function.", + "type": "comment" + }, + "8846": { + "file_id": 649, + "content": "/tools/utils.py", + "type": "filepath" + }, + "8847": { + "file_id": 649, + "content": "The code utilizes PaddleVideo for video inference, including preprocessing steps and various action recognition techniques. It also offers classes for human detection and pose estimation which can be used for classification or object detection tasks in videos with NMS and label/probability display.", + "type": "summary" + }, + "8848": { + "file_id": 649, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport json\nimport os\nimport shutil\nimport sys\nfrom typing import List\nimport pickle\nimport cv2\ntry:\n import imageio\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [imageio] package and it's dependencies is required for VideoSwin.\"\n )\ntry:\n import matplotlib as mpl\n import matplotlib.cm as cm\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [matplotlib] package and it's dependencies is required for ADDS.\"", + "type": "code", + "location": "/tools/utils.py:1-34" + }, + "8849": { + "file_id": 649, + "content": "This code block is an import and error handling section for various Python libraries such as imageio, matplotlib, and json. 
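The final step of `summary.py`, described just above, prints a parameter table and optionally a FLOPs estimate. A toy version with a stand-in model is shown below; the FLOPs call mirrors the form used by the tool and assumes `paddleslim` is installed.

```python
# Toy version of the summary/FLOPs report; the Conv2D stack is a stand-in model.
import paddle
import paddleslim

model = paddle.nn.Sequential(
    paddle.nn.Conv2D(3, 16, kernel_size=3, padding=1),
    paddle.nn.ReLU(),
    paddle.nn.AdaptiveAvgPool2D(1),
    paddle.nn.Flatten(),
    paddle.nn.Linear(16, 10),
)

# Parameter / shape table for a single 224x224 RGB input.
print(paddle.summary(model, (1, 3, 224, 224)))

# FLOPs estimate for the same input shape (requires paddleslim).
print(paddleslim.analysis.flops(model, [1, 3, 224, 224]))
```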
It also contains license information and warning messages for required packages.", + "type": "comment" + }, + "8850": { + "file_id": 649, + "content": " )\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nimport pandas\nfrom PIL import Image\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom abc import abstractmethod\nfrom paddlevideo.loader.builder import build_pipeline\nfrom paddlevideo.loader.pipelines import (\n AutoPadding, CenterCrop, DecodeSampler, FeatureDecoder, FrameDecoder,\n GroupResize, Image2Array, ImageDecoder, JitterScale, MultiCrop,\n Normalization, PackOutput, Sampler, SamplerPkl, Scale, SkeletonNorm,\n TenCrop, ToArray, UniformCrop, VideoDecoder, SegmentationSampler,\n SketeonCropSample, MultiCenterCrop, SketeonCropSample, UniformSampleFrames,\n PoseDecode, PoseCompact, Resize, CenterCrop_V2, GeneratePoseTarget,\n FormatShape, Collect)\nfrom paddlevideo.metrics.ava_utils import read_labelmap\nfrom paddlevideo.metrics.bmn_metric import boundary_choose, soft_nms\nfrom paddlevideo.utils import Registry, build, get_config\nfrom paddlevideo.modeling.framework.segmenters.utils import ASRFPostProcessing", + "type": "code", + "location": "/tools/utils.py:35-58" + }, + "8851": { + "file_id": 649, + "content": "This code imports necessary libraries, defines a directory path, appends the directory to the system path, and imports classes and functions from various modules within the PaddleVideo framework. It also includes abstract methods for building pipelines and metrics, as well as utility functions for model segmentation and post-processing.", + "type": "comment" + }, + "8852": { + "file_id": 649, + "content": "from tools.ava_predict import (detection_inference, frame_extraction,\n get_detection_result, get_timestep_result,\n pack_result, visualize)\nfrom paddlevideo.modeling.framework.localizers.yowo_utils import nms, get_region_boxes\nINFERENCE = Registry('inference')\ndef build_inference_helper(cfg):\n return build(cfg, INFERENCE)\nclass Base_Inference_helper():\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=256,\n target_size=224,\n top_k=1):\n \"\"\"Base_Inference_helper\n Args:\n num_seg (int, optional): number of segmentations of an sliced input video. Defaults to 8.\n seg_len (int, optional): length of each segmentation. Defaults to 1.\n short_size (int, optional): short size of input video. Defaults to 256.\n target_size (int, optional): size of cropped video. Defaults to 224.\n top_k (int, optional): select topk result in outputs. Defaults to 1.", + "type": "code", + "location": "/tools/utils.py:60-86" + }, + "8853": { + "file_id": 649, + "content": "This code imports functions from the \"ava_predict\" and \"yowo_utils\" modules. It defines a function called \"build_inference_helper\" which uses the \"Registry\" class to build an inference helper object. The base class for this object is defined as \"Base_Inference_helper\". 
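`build_inference_helper` resolves the helper class for a given config through a registry, as noted above. The snippet below is a self-contained mock of that register/build pattern, not PaddleVideo's actual `Registry` implementation.

```python
# Self-contained mock of the register/build pattern; this is NOT the real
# paddlevideo.utils.Registry, just an illustration of the idea.
class Registry:
    def __init__(self, name):
        self.name = name
        self._map = {}

    def register(self):
        def decorator(cls):
            self._map[cls.__name__] = cls
            return cls
        return decorator

    def build(self, cfg: dict):
        cfg = dict(cfg)
        cls = self._map[cfg.pop("name")]
        return cls(**cfg)


INFERENCE = Registry("inference")


@INFERENCE.register()
class ppTSM_Inference_helper:
    def __init__(self, num_seg=8, top_k=1):
        self.num_seg, self.top_k = num_seg, top_k


helper = INFERENCE.build({"name": "ppTSM_Inference_helper", "num_seg": 8})
print(type(helper).__name__, helper.num_seg)
```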
This class has an initializer that takes arguments for number of segmentations, length of each segmentation, short size, target size, and top k.", + "type": "comment" + }, + "8854": { + "file_id": 649, + "content": " \"\"\"\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n @abstractmethod\n def preprocess(self, input_file: str):\n \"\"\"preprocess abstractmethod\n Args:\n input_file (str): input file path.\n \"\"\"\n pass\n def preprocess_batch(self, file_list: List[str]) -> List[np.ndarray]:\n \"\"\"preprocess for file list\n Args:\n file_list (List[str]): file pathes in an list, [path1, path2, ...].\n Returns:\n List[np.ndarray]: batched inputs data, [data_batch[0], data_batch[1], ...].\n \"\"\"\n batched_inputs = []\n for file in file_list:\n inputs = self.preprocess(file)\n batched_inputs.append(inputs)\n batched_inputs = [\n np.concatenate([item[i] for item in batched_inputs])\n for i in range(len(batched_inputs[0]))\n ]\n self.input_file = file_list\n return batched_inputs", + "type": "code", + "location": "/tools/utils.py:87-121" + }, + "8855": { + "file_id": 649, + "content": "This code defines an abstract class with a preprocess method and a concrete implementation of the preprocess_batch method. The class has attributes for the number of segments, segment length, short size, target size, and top k. The preprocess_batch method processes each input file in a list of file paths and concatenates the processed data into batches. The input files are stored in the self.input\\_file attribute.", + "type": "comment" + }, + "8856": { + "file_id": 649, + "content": " def postprocess(self,\n output: np.ndarray,\n print_output: bool = True,\n return_result: bool = False):\n \"\"\"postprocess\n Args:\n output (np.ndarray): batched output scores, shape of (batch_size, class_num).\n print_output (bool, optional): whether to print result. Defaults to True.\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]\n output = output.mean(axis=1) # [N, C]\n output = F.softmax(paddle.to_tensor(output), axis=-1).numpy()\n results_list = []\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]", + "type": "code", + "location": "/tools/utils.py:123-147" + }, + "8857": { + "file_id": 649, + "content": "This function postprocesses output scores from a model, accepting batched output scores as input. It checks if the input file is a list and reshapes the output array accordingly. The code applies softmax to each individual output tensor along the last axis, then iterates over the number of inputs (N) to generate class predictions. 
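The postprocess logic summarised here averages scores over temporal views, applies softmax, and keeps the top-k classes via argpartition/argsort. A NumPy-only rendering of that reduction, with illustrative shapes:

```python
# NumPy-only rendering of the average -> softmax -> top-k reduction.
# N videos, T views per video, C classes are illustrative sizes.
import numpy as np

N, T, C, top_k = 2, 10, 400, 5
logits = np.random.rand(N * T, C).astype('float32')

scores = logits.reshape(N, T, C).mean(axis=1)                    # [N, C]
scores = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)  # softmax

for i in range(N):
    classes = np.argpartition(scores[i], -top_k)[-top_k:]        # unordered top-k
    classes = classes[np.argsort(-scores[i, classes])]           # sort descending
    print(f"video {i}: classes={classes}, scores={scores[i, classes]}")
```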
Classes are sorted based on their scores, and the results are stored in a list for further use.", + "type": "comment" + }, + "8858": { + "file_id": 649, + "content": " scores = output[i, classes]\n topk_class = classes[:self.top_k]\n topk_scores = scores[:self.top_k]\n result = {\n \"video_id\": self.input_file[i],\n \"topk_class\": topk_class,\n \"topk_scores\": topk_scores\n }\n results_list.append(result)\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n print(\"\\ttop-{0} class: {1}\".format(self.top_k, topk_class))\n print(\"\\ttop-{0} score: {1}\".format(self.top_k, topk_scores))\n if return_result:\n return results_list\n@INFERENCE.register()\nclass ppTSM_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=256,\n target_size=224,\n top_k=1):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size", + "type": "code", + "location": "/tools/utils.py:148-176" + }, + "8859": { + "file_id": 649, + "content": "This code is creating a helper class for inference tasks. It takes input files, performs video classification using the PaddleVideo framework, and returns top-k class results for each video file. The class also has options to print output and return results as a list. The user can customize the number of segments, segment length, short side size, target size, and top-k values for the classification.", + "type": "comment" + }, + "8860": { + "file_id": 649, + "content": " self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n ops = [\n VideoDecoder(backend=\"decord\"),\n Sampler(self.num_seg, self.seg_len, valid_mode=True),\n Scale(self.short_size),\n CenterCrop(self.target_size),\n Image2Array(),\n Normalization(img_mean, img_std)\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass ppTSN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=25,\n seg_len=1,\n short_size=256,\n target_size=224,\n top_k=1):", + "type": "code", + "location": "/tools/utils.py:177-211" + }, + "8861": { + "file_id": 649, + "content": "This code defines a class that takes in an input file path, applies several image preprocessing operations such as decoding, sampling, resizing, cropping, and normalization, then returns the processed image data in a list. The class also initializes some parameters like the number of segments, segment length, short size for resizing, target size for cropping, and top k value. 
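Each helper's `preprocess` builds a list of pipeline ops and threads a results dict through them, as described above. A minimal mock of that composition follows; the two ops are stand-ins for the real `VideoDecoder`, `Sampler`, `Scale`, `Normalization`, etc.

```python
# Minimal mock of the "thread a results dict through a list of ops" pattern.
# FakeDecode / FakeNormalize stand in for the real pipeline transforms.
import numpy as np


class FakeDecode:
    def __call__(self, results):
        results['imgs'] = np.random.rand(8, 224, 224, 3).astype('float32')
        return results


class FakeNormalize:
    def __init__(self, mean, std):
        self.mean, self.std = np.array(mean), np.array(std)

    def __call__(self, results):
        results['imgs'] = (results['imgs'] - self.mean) / self.std
        return results


results = {'filename': 'demo.mp4'}
ops = [FakeDecode(), FakeNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
for op in ops:
    results = op(results)

batch = np.expand_dims(results['imgs'], axis=0)   # add the batch dimension
print(batch.shape)                                # (1, 8, 224, 224, 3)
```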
The code is part of PaddleVideo library.", + "type": "comment" + }, + "8862": { + "file_id": 649, + "content": " self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n ops = [\n VideoDecoder(backend=\"decord\"),\n Sampler(self.num_seg,\n self.seg_len,\n valid_mode=True,\n select_left=True),\n Scale(self.short_size,\n fixed_ratio=True,\n do_round=True,\n backend='cv2'),\n TenCrop(self.target_size),\n Image2Array(),\n Normalization(img_mean, img_std)\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()", + "type": "code", + "location": "/tools/utils.py:212-245" + }, + "8863": { + "file_id": 649, + "content": "This code snippet initializes a class object with several parameters (num_seg, seg_len, short_size, target_size, top_k) and defines a preprocess method. The preprocess method takes an input file path, performs various operations on the image using different ops such as VideoDecoder, Sampler, Scale, TenCrop, Image2Array, Normalization in sequence, and returns an array of processed images.", + "type": "comment" + }, + "8864": { + "file_id": 649, + "content": " return [res]\n@INFERENCE.register()\nclass BMN_Inference_helper(Base_Inference_helper):\n def __init__(self, feat_dim, dscale, tscale, result_path):\n self.feat_dim = feat_dim\n self.dscale = dscale\n self.tscale = tscale\n self.result_path = result_path\n if not os.path.isdir(self.result_path):\n os.makedirs(self.result_path)\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n file_info = json.load(open(input_file))\n self.feat_path = file_info['feat_path']\n self.video_duration = file_info['duration_second']\n feat = np.load(self.feat_path).astype('float32').T\n res = np.expand_dims(feat, axis=0).copy()\n return [res]\n def postprocess(self, outputs, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n pred_bm, pred_start, pred_end = outputs", + "type": "code", + "location": "/tools/utils.py:246-278" + }, + "8865": { + "file_id": 649, + "content": "This function serves as a helper class for BMN inference and handles preprocessing of input files. It loads and preprocesses the features from the specified file path, converts them to float32 type, and returns the result in a list format. 
The postprocess function takes outputs as input, assuming it is a list containing predicted BMN, start time, and end time values.", + "type": "comment" + }, + "8866": { + "file_id": 649, + "content": " self._gen_props(pred_bm, pred_start[0], pred_end[0], print_output)\n def _gen_props(self, pred_bm, pred_start, pred_end, print_output):\n snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)]\n snippet_xmaxs = [\n 1.0 / self.tscale * i for i in range(1, self.tscale + 1)\n ]\n pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :]\n start_mask = boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_vector_list = []\n for idx in range(self.dscale):\n for jdx in range(self.tscale):\n start_index = jdx\n end_index = start_index + idx\n if end_index < self.tscale and start_mask[\n start_index] == 1 and end_mask[end_index] == 1:\n xmin = snippet_xmins[start_index]\n xmax = snippet_xmaxs[end_index]\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]", + "type": "code", + "location": "/tools/utils.py:279-302" + }, + "8867": { + "file_id": 649, + "content": "This code defines a function _gen_props that calculates snippet xmin and xmax values, generates start and end masks from pred_start and pred_end, and initializes score_vector_list. It iterates over the dscale and tscale to determine start and end indices, checks if valid indices are found, and assigns xmin, xmax, xmin_score, and xmax_score accordingly.", + "type": "comment" + }, + "8868": { + "file_id": 649, + "content": " bm_score = pred_bm[idx, jdx]\n conf_score = xmin_score * xmax_score * bm_score\n score_vector_list.append([xmin, xmax, conf_score])\n cols = [\"xmin\", \"xmax\", \"score\"]\n score_vector_list = np.stack(score_vector_list)\n df = pandas.DataFrame(score_vector_list, columns=cols)\n result_dict = {}\n proposal_list = []\n df = soft_nms(df, alpha=0.4, t1=0.55, t2=0.9)\n for idx in range(min(100, len(df))):\n tmp_prop={\"score\":df.score.values[idx], \\\n \"segment\":[max(0,df.xmin.values[idx])*self.video_duration, \\\n min(1,df.xmax.values[idx])*self.video_duration]}\n proposal_list.append(tmp_prop)\n result_dict[self.feat_path] = proposal_list\n # print top-5 predictions\n if print_output:\n print(\"Current video file: {0} :\".format(self.feat_path))\n for pred in proposal_list[:5]:\n print(pred)\n # save result", + "type": "code", + "location": "/tools/utils.py:303-328" + }, + "8869": { + "file_id": 649, + "content": "This code performs non-maximum suppression (NMS) on bounding box predictions, selects top-5 predictions for each video feature path, and stores the results in a dictionary. 
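The BMN helper's postprocess, summarised above, converts normalized (xmin, xmax, score) rows into second-based segments after soft-NMS. A small sketch of that final conversion step (the rows and duration are made up; the soft-NMS call itself is left out):

```python
# Sketch of turning normalized (xmin, xmax, score) rows into second-based
# proposals, as the BMN helper does after soft-NMS. Values are made up.
video_duration = 120.0    # seconds
rows = [                  # (xmin, xmax, score), already NMS-filtered
    (0.10, 0.25, 0.91),
    (0.40, 0.55, 0.76),
]

proposal_list = []
for xmin, xmax, score in rows:
    proposal_list.append({
        "score": score,
        "segment": [max(0.0, xmin) * video_duration,
                    min(1.0, xmax) * video_duration],
    })

print(proposal_list[:5])  # top proposals, ready to be dumped to JSON
```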
It also prints the top-5 predictions if `print_output` is enabled.", + "type": "comment" + }, + "8870": { + "file_id": 649, + "content": " outfile = open(\n os.path.join(self.result_path, \"bmn_results_inference.json\"), \"w\")\n json.dump(result_dict, outfile)\n@INFERENCE.register()\nclass TokenShift_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=256,\n target_size=256,\n top_k=1,\n mean=[0.5, 0.5, 0.5],\n std=[0.5, 0.5, 0.5]):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n self.mean = mean\n self.std = std\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [\n VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg),", + "type": "code", + "location": "/tools/utils.py:329-362" + }, + "8871": { + "file_id": 649, + "content": "This code defines a class called TokenShift_Inference_helper, which extends Base_Inference_helper. It has several parameters for customizing the inference process and includes a preprocess method that reads an input file and returns results as a dictionary. The results are then written to a JSON file named \"bmn_results_inference.json\".", + "type": "comment" + }, + "8872": { + "file_id": 649, + "content": " Sampler(self.num_seg, self.seg_len, valid_mode=True),\n Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]),\n Image2Array(data_format='cthw'),\n JitterScale(self.short_size, self.short_size),\n MultiCenterCrop(self.target_size)\n ]\n for op in ops:\n results = op(results)\n # [N,C,Tx3,H,W]\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass TimeSformer_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=224,\n target_size=224,\n top_k=1,\n mean=[0.45, 0.45, 0.45],\n std=[0.225, 0.225, 0.225]):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n self.mean = mean\n self.std = std\n def preprocess(self, input_file):", + "type": "code", + "location": "/tools/utils.py:363-395" + }, + "8873": { + "file_id": 649, + "content": "The code creates a series of data processing operations to preprocess input images for the TimeSformer model. 
It initializes an instance of TimeSformer_Inference_helper with specified parameters, then applies these operations in order on the input image, resulting in a final tensor ready for model inference.", + "type": "comment" + }, + "8874": { + "file_id": 649, + "content": " \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [\n VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg),\n Sampler(self.num_seg,\n self.seg_len,\n valid_mode=True,\n linspace_sample=True),\n Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]),\n Image2Array(data_format='cthw'),\n JitterScale(self.short_size, self.short_size),\n UniformCrop(self.target_size)\n ]\n for op in ops:\n results = op(results)\n # [N,C,Tx3,H,W]\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass VideoSwin_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=4,\n seg_len=32,\n frame_interval=2,", + "type": "code", + "location": "/tools/utils.py:396-427" + }, + "8875": { + "file_id": 649, + "content": "This code defines a function that reads an input file, applies a series of operations to it, and returns the processed data. The operations include video decoding, sampling, normalization, image conversion, jitter scaling, and uniform cropping. The result is a tensor in the shape [N,C,Tx3,H,W], where N is the number of segments, C is the number of channels, Tx3 is the number of frames, H is the height, and W is the width.", + "type": "comment" + }, + "8876": { + "file_id": 649, + "content": " short_size=224,\n target_size=224,\n top_k=1,\n mean=[123.675, 116.28, 103.53],\n std=[58.395, 57.12, 57.375]):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.frame_interval = frame_interval\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n self.mean = mean\n self.std = std\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n self.input_file = input_file\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [\n VideoDecoder(backend='decord', mode='valid'),\n Sampler(num_seg=self.num_seg,\n frame_interval=self.frame_interval,\n seg_len=self.seg_len,\n valid_mode=True,\n use_pil=False),", + "type": "code", + "location": "/tools/utils.py:428-458" + }, + "8877": { + "file_id": 649, + "content": "This code defines a class for video preprocessing, taking input file path as parameter. It checks if the file exists and stores the filename in results dictionary. 
The class uses Decord backend for video decoding and Sampler to sample frames based on specified parameters.", + "type": "comment" + }, + "8878": { + "file_id": 649, + "content": " Scale(short_size=self.short_size,\n fixed_ratio=False,\n keep_ratio=True,\n backend='cv2',\n do_round=True),\n CenterCrop(target_size=224, backend='cv2'),\n Normalization(mean=self.mean,\n std=self.std,\n tensor_shape=[3, 1, 1, 1],\n inplace=True),\n Image2Array(data_format='cthw')\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n def postprocess(self, output, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]", + "type": "code", + "location": "/tools/utils.py:459-489" + }, + "8879": { + "file_id": 649, + "content": "The code preprocesses images by resizing, cropping, normalizing, and converting to arrays. It also provides a postprocessing function that handles outputs for multiple input files if necessary.", + "type": "comment" + }, + "8880": { + "file_id": 649, + "content": " output = output.mean(axis=1) # [N, C]\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n@INFERENCE.register()\nclass VideoSwin_TableTennis_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=1,\n seg_len=32,\n short_size=256,\n target_size=224,\n top_k=1):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n def preprocess(self, input_file):", + "type": "code", + "location": "/tools/utils.py:490-516" + }, + "8881": { + "file_id": 649, + "content": "This code snippet is part of a function that extracts the top k classes and their corresponding scores from an output tensor. It first performs mean pooling along the axis 1 to reshape the tensor to [N, C] format, where N is the number of images and C is the number of channels. Then, it iterates over each image and finds the indexes of top k classes by performing argument partition and sorting them based on their scores. 
Finally, it prints out these results for each image if the print_output flag is set to True.", + "type": "comment" + }, + "8882": { + "file_id": 649, + "content": " \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'frame_dir': input_file, 'suffix': 'img_{:05}.jpg'}\n img_mean = [123.675, 116.28, 103.53]\n img_std = [58.395, 57.12, 57.375]\n ops = [\n FrameDecoder(),\n SamplerPkl(num_seg=self.num_seg,\n seg_len=self.seg_len,\n backend='cv2',\n valid_mode=True),\n Scale(short_size=self.short_size,\n fixed_ratio=False,\n keep_ratio=True,\n backend='cv2',\n do_round=True),\n UniformCrop(target_size=self.target_size, backend='cv2'),\n Normalization(mean=img_mean,\n std=img_std,\n tensor_shape=[3, 1, 1, 1],\n inplace=True),\n Image2Array(data_format='cthw')", + "type": "code", + "location": "/tools/utils.py:517-542" + }, + "8883": { + "file_id": 649, + "content": "This code defines a function that takes an input file, reads frames from it, applies various transformations including decoding, sampling, scaling, cropping, and normalization, and finally converts the resulting images to a numpy array. It uses the PaddleVideo library and has parameters for short_size, target_size, and num_seg.", + "type": "comment" + }, + "8884": { + "file_id": 649, + "content": " ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n def add_text_to_video(\n self,\n video_path,\n output_dir=\"applications/TableTennis/ActionRecognition/results\",\n text=None):\n os.makedirs(output_dir, exist_ok=True)\n if video_path.endswith('.pkl'):\n try:\n import cPickle as pickle\n from cStringIO import StringIO\n except ImportError:\n import pickle\n from io import BytesIO\n from PIL import Image\n data_loaded = pickle.load(open(video_path, 'rb'), encoding='bytes')\n _, _, frames = data_loaded\n frames_len = len(frames)\n else:\n videoCapture = cv2.VideoCapture()\n videoCapture.open(video_path)\n fps = videoCapture.get(cv2.CAP_PROP_FPS)\n frame_width = int(videoCapture.get(cv2.CAP_PROP_FRAME_WIDTH))", + "type": "code", + "location": "/tools/utils.py:543-573" + }, + "8885": { + "file_id": 649, + "content": "The code snippet is adding text to a video. It creates directories, loads or captures frames from the video, and extracts important information like frame length, FPS, and frame width. 
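`add_text_to_video`, described above, burns a label onto every frame and writes the result as a GIF. A compact, hedged version of that loop is sketched below; the input path is a placeholder, and the channel flip is needed because OpenCV frames are BGR while imageio expects RGB.

```python
# Compact sketch of the overlay-and-save-GIF loop; 'demo.mp4' is a placeholder.
import cv2
import imageio

cap = cv2.VideoCapture('demo.mp4')
text = "forehand 0.97531"              # e.g. "<class> <score>"

frames_rgb = []
while True:
    ok, frame = cap.read()             # frame is BGR, uint8
    if not ok:
        break
    frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX,
                        1.0, (0, 0, 255), 2)
    frames_rgb.append(frame[:, :, ::-1])   # BGR -> RGB for imageio
cap.release()

if frames_rgb:
    imageio.mimsave('demo_labeled.gif', frames_rgb, 'GIF', duration=0.04)
```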
The code then calls other functions to manipulate images and add text to each frame before storing or displaying the final result.", + "type": "comment" + }, + "8886": { + "file_id": 649, + "content": " frame_height = int(videoCapture.get(cv2.CAP_PROP_FRAME_HEIGHT))\n frames_len = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT)\n print(\"fps=\", int(fps), \"frames=\", int(frames_len), \"scale=\",\n f\"{frame_height}x{frame_width}\")\n frames_rgb_list = []\n for i in range(int(frames_len)):\n if video_path.endswith('.pkl'):\n frame = np.array(\n Image.open(BytesIO(frames[i])).convert(\"RGB\").resize(\n (240, 135)))[:, :, ::-1].astype('uint8')\n else:\n _, frame = videoCapture.read()\n frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX,\n 1.0, (0, 0, 255), 2)\n frames_rgb_list.append(frame[:, :, ::-1]) # bgr to rgb\n if not video_path.endswith('.pkl'):\n videoCapture.release()\n cv2.destroyAllWindows()\n output_filename = os.path.basename(video_path)\n output_filename = output_filename.split('.')[0] + '.gif'", + "type": "code", + "location": "/tools/utils.py:574-595" + }, + "8887": { + "file_id": 649, + "content": "The code reads the video frames and resizes them, then converts to RGB format. If the file is a .pkl file, it opens the image from binary data. It also adds text to each frame using cv2.putText. The code appends each frame in RGB format to a list, and finally, releases the videoCapture object, closes all windows, and saves the resulting GIF with a specific filename.", + "type": "comment" + }, + "8888": { + "file_id": 649, + "content": " imageio.mimsave(f'{output_dir}/{output_filename}',\n frames_rgb_list,\n 'GIF',\n duration=0.00085)\n def postprocess(self, output, print_output=True, save_gif=True):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]\n output = output.mean(axis=1) # [N, C]\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))", + "type": "code", + "location": "/tools/utils.py:596-620" + }, + "8889": { + "file_id": 649, + "content": "The function `postprocess` takes an output list and processes it according to the specified parameters. It ensures that the shape of the input matches with the number of files in the input_file list, then calculates class scores for each video file. If print_output is True, it will print the current video file being processed. 
Finally, if save_gif is True, it creates a GIF using the frames_rgb_list and saves it to the specified output directory with the filename mentioned in the function call.", + "type": "comment" + }, + "8890": { + "file_id": 649, + "content": " for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n if save_gif:\n self.add_text_to_video(\n self.input_file[0],\n text=f\"{str(classes[0])} {float(scores[0]):.5f}\")\n@INFERENCE.register()\nclass SlowFast_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_frames=32,\n sampling_rate=2,\n target_size=256,\n alpha=8,\n top_k=1):\n self.num_frames = num_frames\n self.sampling_rate = sampling_rate\n self.target_size = target_size\n self.alpha = alpha\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {", + "type": "code", + "location": "/tools/utils.py:621-651" + }, + "8891": { + "file_id": 649, + "content": "This code is a part of PaddleVideo's utils.py file, specifically the SlowFast_Inference_helper class, which handles video inference using the SlowFast model. The class has attributes for number of frames, sampling rate, target size, alpha value, and top k classes to display. It contains methods like preprocess, add_text_to_video, and infer. In this section, it displays the top-1 class and score for each frame in a video and adds text annotations to the first frame of the video if save_gif is set to True.", + "type": "comment" + }, + "8892": { + "file_id": 649, + "content": " 'filename': input_file,\n 'temporal_sample_index': 0,\n 'spatial_sample_index': 0,\n 'temporal_num_clips': 1,\n 'spatial_num_clips': 1\n }\n img_mean = [0.45, 0.45, 0.45]\n img_std = [0.225, 0.225, 0.225]\n ops = [\n DecodeSampler(self.num_frames, self.sampling_rate, test_mode=True),\n JitterScale(self.target_size, self.target_size),\n MultiCrop(self.target_size),\n Image2Array(transpose=False),\n Normalization(img_mean, img_std, tensor_shape=[1, 1, 1, 3]),\n PackOutput(self.alpha),\n ]\n for op in ops:\n results = op(results)\n res = []\n for item in results['imgs']:\n res.append(np.expand_dims(item, axis=0).copy())\n return res\n def postprocess(self, output, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,", + "type": "code", + "location": "/tools/utils.py:652-682" + }, + "8893": { + "file_id": 649, + "content": "This code defines a function for preprocessing and postprocessing video frames. It initializes parameters like filename, sampling rate, target size, and normalization values. The function applies a series of operations to the input image, such as decoding, jitter scaling, cropping, converting to array format, normalizing pixel values, and packing the output. 
Finally, it expands the result along an axis and returns the processed frames.", + "type": "comment" + }, + "8894": { + "file_id": 649, + "content": " ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]\n output = output.mean(axis=1) # [N, C]\n # output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() # done in it's head\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n@INFERENCE.register()\nclass STGCN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_channels,", + "type": "code", + "location": "/tools/utils.py:683-706" + }, + "8895": { + "file_id": 649, + "content": "This function reshapes the output tensor based on the number of input files, then calculates top classes and scores for each file. If print_output is True, it prints the top classes and scores for each video file. The output is from a STGCN (Spatio-Temporal Graph Convolutional Network) inference process.", + "type": "comment" + }, + "8896": { + "file_id": 649, + "content": " window_size,\n vertex_nums,\n person_nums,\n top_k=1):\n self.num_channels = num_channels\n self.window_size = window_size\n self.vertex_nums = vertex_nums\n self.person_nums = person_nums\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n data = np.load(input_file)\n results = {'data': data}\n ops = [AutoPadding(window_size=self.window_size), SkeletonNorm()]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['data'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass CTRGCN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_channels=3,\n vertex_nums=25,\n person_nums=2,\n window_size=64,", + "type": "code", + "location": "/tools/utils.py:707-740" + }, + "8897": { + "file_id": 649, + "content": "This code defines a class `CTRGCN_Inference_helper` that preprocesses data for CTRGCN inference. It takes input file path as parameter and returns processed data as list. The preprocessing includes applying auto-padding, skeleton normalization operations on the input data. 
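`PackOutput`, mentioned in the SlowFast preprocessing above, produces the two pathway tensors that the exporter later names `fast_input` and `slow_input`: the slow pathway keeps only every alpha-th frame. A NumPy sketch of that temporal split, with illustrative array sizes:

```python
# NumPy sketch of the slow/fast pathway split used by SlowFast-style models:
# the fast pathway keeps all frames, the slow pathway every alpha-th frame.
import numpy as np

num_frames, alpha = 32, 8
frames = np.random.rand(3, num_frames, 256, 256).astype('float32')  # [C, T, H, W]

fast_pathway = frames                      # [3, 32, 256, 256]
slow_pathway = frames[:, ::alpha, :, :]    # [3, 32 // 8 = 4, 256, 256]

print(fast_pathway.shape, slow_pathway.shape)
```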
The window size, vertex numbers, person numbers can be specified during initialization of the class.", + "type": "comment" + }, + "8898": { + "file_id": 649, + "content": " p_interval=[0.95],\n top_k=1):\n self.window_size = window_size\n self.p_interval = p_interval\n self.num_channels = num_channels\n self.vertex_nums = vertex_nums\n self.person_nums = person_nums\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n data = np.load(input_file)\n results = {'data': data}\n ops = [\n SketeonCropSample(window_size=self.window_size,\n p_interval=self.p_interval)\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['data'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass AGCN2s_Inference_helper(Base_Inference_helper):\n def __init__(self,\n window_size=300,\n num_channels=3,\n vertex_nums=25,", + "type": "code", + "location": "/tools/utils.py:741-775" + }, + "8899": { + "file_id": 649, + "content": "This code defines a class for preprocessing data and applying operations. It has an `__init__` method to initialize the window size, number of channels, vertex numbers, person numbers, and top k. The `preprocess` method takes a file path, asserts that it exists, loads the data, applies operations defined in ops, expands dimensions, and returns the processed data. It also registers this class for inference using the @INFERENCE decorator.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/89.json b/docs/data/89.json new file mode 100644 index 000000000..ffb6152c4 --- /dev/null +++ b/docs/data/89.json @@ -0,0 +1,518 @@ +{ + "8900": { + "file_id": 649, + "content": " person_nums=2,\n top_k=1):\n self.window_size = window_size\n self.num_channels = num_channels\n self.vertex_nums = vertex_nums\n self.person_nums = person_nums\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n data = np.load(input_file)\n results = {'data': data}\n res = np.expand_dims(results['data'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass MSTCN_Inference_helper(Base_Inference_helper):\n def __init__(self, num_channels, actions_map_file_path, feature_path=None):\n self.num_channels = num_channels\n file_ptr = open(actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])", + "type": "code", + "location": "/tools/utils.py:776-807" + }, + "8901": { + "file_id": 649, + "content": "This code defines a class for preprocessing input data and an inference helper class for MSTCN. It initializes the class with parameters like window size, number of channels, vertex numbers, and top k. The `preprocess` method loads data from a file path and returns it as a list. 
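Editor's note: the skeleton helpers above pad or crop the temporal axis to a fixed window and then add a batch axis before inference. The sketch below shows that idea in isolation; it is a simplified stand-in, not PaddleVideo's AutoPadding/SkeletonNorm/SketeonCropSample operators.

```python
# Pad (or center-crop) a skeleton sequence [C, T, V, M] to a fixed window, then batch it.
import numpy as np

def pad_to_window(data, window_size):
    C, T, V, M = data.shape
    if T >= window_size:                                   # crop the middle window
        start = (T - window_size) // 2
        return data[:, start:start + window_size]
    padded = np.zeros((C, window_size, V, M), dtype=data.dtype)
    padded[:, :T] = data                                   # zero-pad the tail
    return padded

skeleton = np.random.randn(3, 50, 25, 2).astype(np.float32)   # C=3, T=50, V=25, M=2
clip = pad_to_window(skeleton, window_size=64)
batched = np.expand_dims(clip, axis=0)                         # [1, C, window, V, M]
print(batched.shape)
```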
The `MSTCN_Inference_helper` registers itself to be used by INFERENCE.", + "type": "comment" + }, + "8902": { + "file_id": 649, + "content": " self.feature_path = feature_path\n self.file_name_list = []\n def get_process_file(self, input_file_txt):\n with open(input_file_txt, 'r') as file_ptr:\n info = file_ptr.read().split('\\n')[:-1]\n files = []\n for video_name in info:\n if self.feature_path is not None:\n file_name = video_name.split('.')[0] + \".npy\"\n input_file = os.path.join(self.feature_path, file_name)\n else:\n input_file = video_name\n assert os.path.isfile(\n input_file) is not None, \"{0} not exists\".format(input_file)\n files.append(input_file)\n self.file_name_list.append(input_file.split('/')[-1].split('.')[0])\n return files\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, feature file list txt path\n return: list\n \"\"\"\n output_list = []\n data = np.load(input_file)\n results = {'video_feat': data, 'video_gt': None}\n ops = []", + "type": "code", + "location": "/tools/utils.py:809-840" + }, + "8903": { + "file_id": 649, + "content": "The code defines a class with methods to handle video feature files. It initializes the feature path and creates an empty list for file names. The `get_process_file` method reads the input text file, checks if each file exists, appends file paths to `self.file_name_list`, and returns a list of files. The `preprocess` method loads a feature file into data, creates a dictionary with 'video_feat' key, and returns it as output_list.", + "type": "comment" + }, + "8904": { + "file_id": 649, + "content": " for op in ops:\n results = op(results)\n res = np.expand_dims(results['video_feat'], axis=0).copy()\n output_list.append(res)\n return output_list\n def postprocess(self, output, print_output=True):\n reslut_path = os.path.join(\"./inference/infer_results/\")\n if not os.path.isdir(reslut_path):\n os.makedirs(reslut_path)\n output = [output]\n for outputs in output:\n output_np = outputs[0]\n recognition = []\n for i in range(output_np.shape[0]):\n recognition = np.concatenate((recognition, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(output_np[i])]\n ]))\n recog_content = list(recognition)\n recog_content = [line + \"\\n\" for line in recog_content]\n filename = self.file_name_list.pop(0)\n write_path = os.path.join(reslut_path, filename + \".txt\")\n f = open(write_path, \"w\")", + "type": "code", + "location": "/tools/utils.py:841-867" + }, + "8905": { + "file_id": 649, + "content": "The code processes video features, performs post-processing by creating a directory if it doesn't exist, appends the processed output to the output list and then creates separate text files for each result in the output list. 
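Editor's note: the MSTCN helper above builds its `actions_dict` by reading a mapping file whose lines are "<index> <label>". A minimal standalone sketch of that parsing follows; the file name and labels are illustrative.

```python
# Parse an actions-map file of "<index> <label>" lines into {label: index}.
def load_actions_map(path):
    actions_dict = {}
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            idx, name = line.split(maxsplit=1)
            actions_dict[name] = int(idx)
    return actions_dict

# Example mapping file contents:
#   0 background
#   1 take_cup
#   2 pour_coffee
# load_actions_map("mapping.txt") -> {"background": 0, "take_cup": 1, "pour_coffee": 2}
```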
The text files contain the recognized actions and are saved in the specified directory with corresponding filenames.", + "type": "comment" + }, + "8906": { + "file_id": 649, + "content": " f.writelines(recog_content)\n f.close()\n print(\"result write in : \" + write_path)\n@INFERENCE.register()\nclass ASRF_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_channels,\n actions_map_file_path,\n postprocessing_method,\n boundary_threshold,\n feature_path=None):\n self.num_channels = num_channels\n file_ptr = open(actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])\n self.postprocessing_method = postprocessing_method\n self.boundary_threshold = boundary_threshold\n self.feature_path = feature_path\n self.file_name_list = []\n def get_process_file(self, input_file_txt):\n with open(input_file_txt, 'r') as file_ptr:\n info = file_ptr.read().split('\\n')[:-1]\n files = []", + "type": "code", + "location": "/tools/utils.py:868-898" + }, + "8907": { + "file_id": 649, + "content": "This code initializes an instance of the ASRF_Inference_helper class, which takes in parameters such as num_channels, actions_map_file_path, postprocessing_method, boundary_threshold, and feature_path. It reads the actions map file, splits the lines into separate action names and their corresponding indices, and stores them in a dictionary called self.actions_dict. The code also creates an empty list called self.file_name_list. Additionally, it defines another function called get_process_file that takes input_file_txt as a parameter and reads its content to store information for further processing.", + "type": "comment" + }, + "8908": { + "file_id": 649, + "content": " for video_name in info:\n if self.feature_path is not None:\n file_name = video_name.split('.')[0] + \".npy\"\n input_file = os.path.join(self.feature_path, file_name)\n else:\n input_file = video_name\n assert os.path.isfile(\n input_file) is not None, \"{0} not exists\".format(input_file)\n files.append(input_file)\n self.file_name_list.append(input_file.split('/')[-1].split('.')[0])\n return files\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, feature file list txt path\n return: list\n \"\"\"\n output_list = []\n data = np.load(input_file)\n results = {'video_feat': data, 'video_gt': None}\n ops = []\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['video_feat'], axis=0).copy()\n output_list.append(res)\n return output_list\n def postprocess(self, output, print_output=True):\n reslut_path = os.path.join(\"./inference/infer_results/\")", + "type": "code", + "location": "/tools/utils.py:899-932" + }, + "8909": { + "file_id": 649, + "content": "The code defines a class with methods for loading feature files, preprocessing data, and post-processing results. The `load_features` method reads the feature file list, checks if each input file exists, and stores their names in `file_name_list`. The `preprocess` method loads the features from a specified input file, applies transformations defined by `ops`, and returns a processed output. 
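Editor's note: the write-out step described above maps per-frame class indices back to label names (an inverse lookup into `actions_dict`) and writes one label per line into a results directory. The sketch below reproduces that idea with made-up paths and labels.

```python
# Map predicted indices back to label names and write one label per line.
import os
import numpy as np

def write_recognition(frame_indices, actions_dict, result_dir, video_name):
    index_to_label = {v: k for k, v in actions_dict.items()}   # invert {label: index}
    os.makedirs(result_dir, exist_ok=True)
    out_path = os.path.join(result_dir, video_name + ".txt")
    with open(out_path, "w") as f:
        f.writelines(index_to_label[int(i)] + "\n" for i in frame_indices)
    return out_path

actions = {"background": 0, "take_cup": 1, "pour_coffee": 2}
preds = np.array([0, 0, 1, 1, 2, 2, 2])
print("result written to:", write_recognition(preds, actions, "./infer_results", "demo"))
```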
The `postprocess` method saves the final results to the specified result path.", + "type": "comment" + }, + "8910": { + "file_id": 649, + "content": " if not os.path.isdir(reslut_path):\n os.makedirs(reslut_path)\n output = [output]\n for outputs in output:\n outputs_cls_np = outputs[0]\n outputs_boundary_np = outputs[1]\n output_np = ASRFPostProcessing(\n outputs_cls_np,\n outputs_boundary_np,\n self.postprocessing_method,\n boundary_threshold=self.boundary_threshold).numpy()[0, :]\n recognition = []\n for i in range(output_np.shape[0]):\n recognition = np.concatenate((recognition, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(output_np[i])]\n ]))\n recog_content = list(recognition)\n recog_content = [line + \"\\n\" for line in recog_content]\n filename = self.file_name_list.pop(0)\n write_path = os.path.join(reslut_path, filename + \".txt\")\n f = open(write_path, \"w\")\n f.writelines(recog_content)", + "type": "code", + "location": "/tools/utils.py:933-959" + }, + "8911": { + "file_id": 649, + "content": "The code is creating a directory if it doesn't exist, then processing and storing video outputs into separate text files based on the actions detected. It uses a dictionary to match action values with corresponding labels. The processed output is written into a new file for each video, using the populated file name list as references.", + "type": "comment" + }, + "8912": { + "file_id": 649, + "content": " f.close()\n print(\"result write in : \" + write_path)\n@INFERENCE.register()\nclass AttentionLSTM_Inference_helper(Base_Inference_helper):\n def __init__(\n self,\n num_classes, #Optional, the number of classes to be classified.\n feature_num,\n feature_dims,\n embedding_size,\n lstm_size,\n top_k=1):\n self.num_classes = num_classes\n self.feature_num = feature_num\n self.feature_dims = feature_dims\n self.embedding_size = embedding_size\n self.lstm_size = lstm_size\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [FeatureDecoder(num_classes=self.num_classes, has_label=False)]\n for op in ops:\n results = op(results)\n res = []", + "type": "code", + "location": "/tools/utils.py:960-993" + }, + "8913": { + "file_id": 649, + "content": "This code defines a class `AttentionLSTM_Inference_helper` that initializes attributes for processing data, and has a method `preprocess()` to process input file. 
The method applies feature decoding operations on the input file, stores results in dictionary format, and returns the result as a list.", + "type": "comment" + }, + "8914": { + "file_id": 649, + "content": " for modality in ['rgb', 'audio']:\n res.append(\n np.expand_dims(results[f'{modality}_data'], axis=0).copy())\n res.append(\n np.expand_dims(results[f'{modality}_len'], axis=0).copy())\n res.append(\n np.expand_dims(results[f'{modality}_mask'], axis=0).copy())\n return res\n@INFERENCE.register()\nclass TransNetV2_Inference_helper():\n def __init__(self,\n num_frames,\n height,\n width,\n num_channels,\n threshold=0.5,\n output_path=None,\n visualize=True):\n self._input_size = (height, width, num_channels)\n self.output_path = output_path\n self.len_frames = 0\n self.threshold = threshold\n self.visualize = visualize\n def input_iterator(self, frames):\n # return windows of size 100 where the first/last 25 frames are from the previous/next batch\n # the first and last window must be padded by copies of the first and last frame of the video", + "type": "code", + "location": "/tools/utils.py:994-1022" + }, + "8915": { + "file_id": 649, + "content": "This code snippet defines a function and a class for video inference using the TransNetV2 model. The function takes input frames, processes them by dividing into windows of 100 frames, padding first/last window, and returns the results as a list of arrays representing data, lengths, and masks for 'rgb' and 'audio' modalities. The class initializes an instance with specified parameters for image size, number of channels, threshold value, output path, and visualization flag.", + "type": "comment" + }, + "8916": { + "file_id": 649, + "content": " no_padded_frames_start = 25\n no_padded_frames_end = 25 + 50 - (\n len(frames) % 50 if len(frames) % 50 != 0 else 50) # 25 - 74\n start_frame = np.expand_dims(frames[0], 0)\n end_frame = np.expand_dims(frames[-1], 0)\n padded_inputs = np.concatenate([start_frame] * no_padded_frames_start +\n [frames] +\n [end_frame] * no_padded_frames_end, 0)\n ptr = 0\n while ptr + 100 <= len(padded_inputs):\n out = padded_inputs[ptr:ptr + 100]\n out = out.astype(np.float32)\n ptr += 50\n yield out[np.newaxis]\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: iterator\n \"\"\"\n try:\n import ffmpeg\n except ImportError as e:\n print(\n f\"Warning! {e}, [ffmpeg-python] package and it's dependencies is required for TransNetV2.\"\n )\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(", + "type": "code", + "location": "/tools/utils.py:1023-1051" + }, + "8917": { + "file_id": 649, + "content": "This code is part of a function that takes in an input file and preprocesses it. It imports the 'ffmpeg' library, checks if it exists or not, and then proceeds with the data processing operations. 
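Editor's note: the TransNetV2 `input_iterator` described above pads the frame array with copies of the first and last frame, then yields 100-frame windows with a 50-frame stride so each scored segment has 25 frames of context on either side. Below is a standalone numpy sketch of that windowing, with synthetic frames.

```python
# Yield overlapping 100-frame windows (stride 50) over a padded frame array.
import numpy as np

def sliding_windows(frames, window=100, stride=50, context=25):
    remainder = len(frames) % stride
    pad_end = context + stride - (remainder if remainder != 0 else stride)
    first = np.repeat(frames[:1], context, axis=0)      # copies of the first frame
    last = np.repeat(frames[-1:], pad_end, axis=0)      # copies of the last frame
    padded = np.concatenate([first, frames, last], axis=0)
    ptr = 0
    while ptr + window <= len(padded):
        yield padded[ptr:ptr + window].astype(np.float32)[np.newaxis]  # [1, window, H, W, C]
        ptr += stride

frames = np.random.randint(0, 256, (130, 27, 48, 3), dtype=np.uint8)
for batch in sliding_windows(frames):
    print(batch.shape)
```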
The code calculates the number of padded frames based on the total number of frames, concatenates the start frame, frames, and end frame into a single array, and then iteratively yields batches of 100 elements from this array as an iterator for further processing.", + "type": "comment" + }, + "8918": { + "file_id": 649, + "content": " input_file)\n self.input_file = input_file\n self.filename = os.path.splitext(os.path.split(self.input_file)[1])[0]\n video_stream, err = ffmpeg.input(\n self.input_file).output(\"pipe:\",\n format=\"rawvideo\",\n pix_fmt=\"rgb24\",\n s=\"48x27\").run(capture_stdout=True,\n capture_stderr=True)\n self.frames = np.frombuffer(video_stream,\n np.uint8).reshape([-1, 27, 48, 3])\n self.len_frames = len(self.frames)\n return self.input_iterator(self.frames)\n def predictions_to_scenes(self, predictions):\n predictions = (predictions > self.threshold).astype(np.uint8)\n scenes = []\n t, t_prev, start = -1, 0, 0\n for i, t in enumerate(predictions):\n if t_prev == 1 and t == 0:\n start = i\n if t_prev == 0 and t == 1 and i != 0:", + "type": "code", + "location": "/tools/utils.py:1052-1074" + }, + "8919": { + "file_id": 649, + "content": "The code initializes a video input and extracts frames from it. It then reshapes the frames into a 3D array and stores them for further processing. The `input_iterator` function returns an iterator over these frames. The `predictions_to_scenes` function takes predictions, converts them to binary format (0 or 1), and iterates through them to identify scene changes based on consecutive 0's and 1's.", + "type": "comment" + }, + "8920": { + "file_id": 649, + "content": " scenes.append([start, i])\n t_prev = t\n if t == 0:\n scenes.append([start, i])\n # just fix if all predictions are 1\n if len(scenes) == 0:\n return np.array([[0, len(predictions) - 1]], dtype=np.int32)\n return np.array(scenes, dtype=np.int32)\n def visualize_predictions(self, frames, predictions):\n from PIL import Image, ImageDraw\n if isinstance(predictions, np.ndarray):\n predictions = [predictions]\n ih, iw, ic = frames.shape[1:]\n width = 25\n # pad frames so that length of the video is divisible by width\n # pad frames also by len(predictions) pixels in width in order to show predictions\n pad_with = width - len(frames) % width if len(\n frames) % width != 0 else 0\n frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)),\n (0, 0)])\n predictions = [np.pad(x, (0, pad_with)) for x in predictions]\n height = len(frames) // width", + "type": "code", + "location": "/tools/utils.py:1075-1103" + }, + "8921": { + "file_id": 649, + "content": "The code above is part of a video processing tool. It appends the start and end frames of a scene to a list, skips scenes with no changes in predictions, pads frames to ensure even widths, and then flattens the scene lists into an array. 
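Editor's note: `predictions_to_scenes`, summarised above, thresholds per-frame shot-boundary probabilities to 0/1 and turns runs of zeros between boundary frames into [start, end] scene intervals. The sketch below mirrors that logic in a simplified, standalone form.

```python
# Convert per-frame boundary probabilities into [start, end] scene intervals.
import numpy as np

def predictions_to_scenes(probs, threshold=0.5):
    preds = (np.asarray(probs) > threshold).astype(np.uint8)
    scenes, start, prev = [], 0, 0
    for i, t in enumerate(preds):
        if prev == 1 and t == 0:                 # boundary just ended -> new scene starts
            start = i
        if prev == 0 and t == 1 and i != 0:      # boundary begins -> close current scene
            scenes.append([start, i])
        prev = t
    if prev == 0:                                # close the trailing scene
        scenes.append([start, len(preds) - 1])
    if not scenes:                               # every frame predicted as a boundary
        return np.array([[0, len(preds) - 1]], dtype=np.int32)
    return np.array(scenes, dtype=np.int32)

print(predictions_to_scenes([0.1, 0.1, 0.9, 0.05, 0.05, 0.05, 0.8, 0.1]))
```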
The `visualize_predictions` function takes a sequence of frames and predictions, pads them to match lengths, and splits the frames into a grid based on width.", + "type": "comment" + }, + "8922": { + "file_id": 649, + "content": " img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic])\n img = np.concatenate(np.split(\n np.concatenate(np.split(img, height), axis=2)[0], width),\n axis=2)[0, :-1]\n img = Image.fromarray(img)\n draw = ImageDraw.Draw(img)\n # iterate over all frames\n for i, pred in enumerate(zip(*predictions)):\n x, y = i % width, i // width\n x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1\n # we can visualize multiple predictions per single frame\n for j, p in enumerate(pred):\n color = [0, 0, 0]\n color[(j + 1) % 3] = 255\n value = round(p * (ih - 1))\n if value != 0:\n draw.line((x + j, y, x + j, y - value),\n fill=tuple(color),\n width=1)\n return img\n def postprocess(self, outputs, print_output=True):\n \"\"\"\n output: list\n \"\"\"", + "type": "code", + "location": "/tools/utils.py:1105-1133" + }, + "8923": { + "file_id": 649, + "content": "The code takes in a list of predictions and reshapes them into an image. It then iterates over the frames and predictions, drawing lines to visualize multiple predictions per frame. Finally, it returns the processed image.", + "type": "comment" + }, + "8924": { + "file_id": 649, + "content": " predictions = []\n for output in outputs:\n single_frame_logits, all_frames_logits = output\n single_frame_pred = F.sigmoid(paddle.to_tensor(single_frame_logits))\n all_frames_pred = F.sigmoid(paddle.to_tensor(all_frames_logits))\n predictions.append((single_frame_pred.numpy()[0, 25:75, 0],\n all_frames_pred.numpy()[0, 25:75, 0]))\n single_frame_pred = np.concatenate(\n [single_ for single_, all_ in predictions])\n all_frames_pred = np.concatenate(\n [all_ for single_, all_ in predictions])\n single_frame_predictions, all_frame_predictions = single_frame_pred[:\n self\n .\n len_frames], all_frames_pred[:\n self", + "type": "code", + "location": "/tools/utils.py:1134-1149" + }, + "8925": { + "file_id": 649, + "content": "This code generates predictions for single and all frames. It extracts logits from outputs, applies sigmoid function to convert them into probabilities, and stores the results in a list. Finally, it concatenates the lists of single and all frame predictions for further processing.", + "type": "comment" + }, + "8926": { + "file_id": 649, + "content": " .\n len_frames]\n scenes = self.predictions_to_scenes(single_frame_predictions)\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file))\n print(\"\\tShot Boundarys: {0}\".format(scenes))\n if self.output_path:\n if not os.path.exists(self.output_path):\n os.makedirs(self.output_path)\n predictions = np.stack(\n [single_frame_predictions, all_frame_predictions], 1)\n predictions_file = os.path.join(self.output_path,\n self.filename + \"_predictions.txt\")\n np.savetxt(predictions_file, predictions, fmt=\"%.6f\")\n scenes_file = os.path.join(self.output_path,\n self.filename + \"_scenes.txt\")\n np.savetxt(scenes_file, scenes, fmt=\"%d\")", + "type": "code", + "location": "/tools/utils.py:1150-1169" + }, + "8927": { + "file_id": 649, + "content": "The code takes in single-frame and all-frame predictions, converts them into shot boundary scenes, and then optionally prints the output. If an output path is provided and it doesn't exist, it creates the directory. 
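Editor's note: the postprocess chunk above squashes each window's logits with a sigmoid, keeps only the central 50 frames (indices 25:75) of every 100-frame window, concatenates the pieces, and trims to the real frame count. The sketch below uses plain numpy in place of paddle's `F.sigmoid`; shapes are illustrative.

```python
# Merge overlapping window predictions: sigmoid, keep the 25:75 centre, concatenate, trim.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def merge_window_logits(window_logits, num_frames):
    """window_logits: list of arrays shaped [1, 100, 1], one per window (stride 50)."""
    centers = [sigmoid(w)[0, 25:75, 0] for w in window_logits]
    return np.concatenate(centers)[:num_frames]

windows = [np.random.randn(1, 100, 1) for _ in range(3)]
probs = merge_window_logits(windows, num_frames=130)
print(probs.shape)   # (130,)
```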
It then stacks the two prediction arrays horizontally, saves the frame predictions file with formatted floats, and saves the scene file with formatted integers.", + "type": "comment" + }, + "8928": { + "file_id": 649, + "content": " if self.visualize:\n pil_image = self.visualize_predictions(\n self.frames,\n predictions=(single_frame_predictions,\n all_frame_predictions))\n image_file = os.path.join(self.output_path,\n self.filename + \"_vis.png\")\n pil_image.save(image_file)\n@INFERENCE.register()\nclass ADDS_Inference_helper(Base_Inference_helper):\n def __init__(self,\n frame_idxs=[0],\n num_scales=4,\n side_map={\n \"2\": 2,\n \"3\": 3,\n \"l\": 2,\n \"r\": 3\n },\n height=256,\n width=512,\n full_res_shape=None,\n num_channels=None,\n img_ext=\".png\",\n K=None):\n self.frame_idxs = frame_idxs\n self.num_scales = num_scales\n self.side_map = side_map", + "type": "code", + "location": "/tools/utils.py:1171-1201" + }, + "8929": { + "file_id": 649, + "content": "This code initializes an ADDS_Inference_helper object with various parameters such as frame indices, number of scales, side map, height, width, full resolution shape, number of channels, image extension, and K. The visualize feature is also included to display predictions on saved images.", + "type": "comment" + }, + "8930": { + "file_id": 649, + "content": " self.full_res_shape = full_res_shape\n self.img_ext = img_ext\n self.height = height\n self.width = width\n self.K = K\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {\n 'filename': input_file,\n 'mode': 'infer',\n 'day_or_night': 'day',\n }\n ops = [\n ImageDecoder(\n backend='pil',\n dataset='kitti',\n frame_idxs=self.frame_idxs,\n num_scales=self.num_scales,\n side_map=self.side_map,\n full_res_shape=self.full_res_shape,\n img_ext=self.img_ext,\n ),\n GroupResize(\n height=self.height,\n width=self.width,\n K=self.K,\n scale=1,\n mode='infer',\n ),\n ToArray(),", + "type": "code", + "location": "/tools/utils.py:1202-1237" + }, + "8931": { + "file_id": 649, + "content": "The code defines a class with attributes 'full_res_shape', 'img_ext', 'height', 'width', and 'K'. It also has a method 'preprocess' that takes an input file path, checks if the file exists, and returns a list. The preprocess method uses three operations: ImageDecoder, GroupResize, and ToArray(). 
These operations are applied in sequence to preprocess the image data from the given input file.", + "type": "comment" + }, + "8932": { + "file_id": 649, + "content": " ]\n for op in ops:\n results = op(results)\n res = results['imgs'][('color', 0, 0)]\n res = np.expand_dims(res, axis=0).copy()\n return [res]\n def postprocess(self, output, print_output, save_dir='data/'):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n print(len(output))\n N = len(self.input_file)\n for i in range(N):\n pred_depth = output[i] # [H, W]\n if print_output:\n print(\"Current input image: {0}\".format(self.input_file[i]))\n file_name = os.path.basename(self.input_file[i]).split('.')[0]\n save_path = os.path.join(save_dir,\n file_name + \"_depth\" + \".png\")\n pred_depth_color = self._convertPNG(pred_depth)\n pred_depth_color.save(save_path)\n print(f\"pred depth image saved to: {save_path}\")", + "type": "code", + "location": "/tools/utils.py:1238-1264" + }, + "8933": { + "file_id": 649, + "content": "This function processes a list of outputs and performs post-processing operations on each output. It checks if the input file is a single item or a list, then iterates over the outputs to extract depth maps, optionally prints information about each input image and saves the associated depth map as an image file in a specified directory. The code also converts the depth maps to PNG format before saving them.", + "type": "comment" + }, + "8934": { + "file_id": 649, + "content": " def _convertPNG(self, image_numpy):\n disp_resized = cv2.resize(image_numpy, (1280, 640))\n disp_resized_np = disp_resized\n vmax = np.percentile(disp_resized_np, 95)\n normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)\n mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')\n colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *\n 255).astype(np.uint8)\n im = Image.fromarray(colormapped_im)\n return im\n@INFERENCE.register()\nclass AVA_SlowFast_FastRCNN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n detection_model_name,\n detection_model_weights,\n config_file_path,\n predict_stepsize=8,\n output_stepsize=4,\n output_fps=6,\n out_filename='ava_det_demo.mp4',\n num_frames=32,\n alpha=4,\n target_size=256):\n self.detection_model_name = detection_model_name", + "type": "code", + "location": "/tools/utils.py:1266-1291" + }, + "8935": { + "file_id": 649, + "content": "This code defines a function `_convertPNG` that converts an image to PNG format after resizing, normalizing, and color mapping. 
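Editor's note: `_convertPNG`, shown just above, colours a predicted disparity/depth map by normalising against a 95th-percentile vmax and mapping through matplotlib's 'magma' colormap before saving a PIL image. The sketch below reproduces that colouring step on random data; it assumes matplotlib and Pillow are installed and skips the cv2 resize.

```python
# Colour a disparity map with a percentile-normalised 'magma' colormap.
import numpy as np
from matplotlib import cm, colors
from PIL import Image

def colorize_disparity(disp):
    vmax = np.percentile(disp, 95)
    normalizer = colors.Normalize(vmin=disp.min(), vmax=vmax)
    mapper = cm.ScalarMappable(norm=normalizer, cmap="magma")
    rgb = (mapper.to_rgba(disp)[:, :, :3] * 255).astype(np.uint8)
    return Image.fromarray(rgb)

disp = np.random.rand(640, 1280).astype(np.float32)
colorize_disparity(disp).save("depth_vis.png")
```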
The class `AVA_SlowFast_FastRCNN_Inference_helper` initializes with various parameters for detection model inference and output settings.", + "type": "comment" + }, + "8936": { + "file_id": 649, + "content": " self.detection_model_weights = detection_model_weights\n self.config = get_config(config_file_path,\n show=False) #parse config file\n self.predict_stepsize = predict_stepsize\n self.output_stepsize = output_stepsize\n self.output_fps = output_fps\n self.out_filename = out_filename\n self.num_frames = num_frames\n self.alpha = alpha\n self.target_size = target_size\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n \"\"\"\n frame_dir = 'tmp_frames'\n self.frame_paths, frames, FPS = frame_extraction(input_file, frame_dir)\n num_frame = len(self.frame_paths) #视频秒数*FPS\n assert num_frame != 0\n # 帧图像高度和宽度\n h, w, _ = frames[0].shape\n # Get clip_len, frame_interval and calculate center index of each clip\n data_process_pipeline = build_pipeline(\n self.config.PIPELINE.test) #测试时输出处理流水配置\n clip_len = self.config.PIPELINE.test.sample['clip_len']", + "type": "code", + "location": "/tools/utils.py:1292-1321" + }, + "8937": { + "file_id": 649, + "content": "The code is initializing some parameters and then extracting frames from the input video file for further processing. It builds a pipeline configuration for testing, sets clip length, and calculates center indices of each clip. The extracted frames will be used for object detection or other tasks in subsequent steps.", + "type": "comment" + }, + "8938": { + "file_id": 649, + "content": " assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = self.config.PIPELINE.test.sample['frame_interval']\n # 此处关键帧每秒取一个\n clip_len = self.config.PIPELINE.test.sample['clip_len']\n assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = self.config.PIPELINE.test.sample['frame_interval']\n window_size = clip_len * frame_interval\n timestamps = np.arange(window_size // 2,\n (num_frame + 1 - window_size // 2),\n self.predict_stepsize)\n selected_frame_list = []\n for timestamp in timestamps:\n selected_frame_list.append(self.frame_paths[timestamp - 1])\n # Load label_map\n label_map_path = self.config.DATASET.test['label_file']\n self.categories, self.class_whitelist = read_labelmap(\n open(label_map_path))\n label_map = {}\n for item in self.categories:\n id = item['id']", + "type": "code", + "location": "/tools/utils.py:1322-1344" + }, + "8939": { + "file_id": 649, + "content": "The code asserts for an even clip_len and frame_interval, calculates window size, generates timestamps for selecting frames, creates a list of selected frames, reads label map from file and assigns categories to a dictionary.", + "type": "comment" + }, + "8940": { + "file_id": 649, + "content": " name = item['name']\n label_map[id] = name\n self.label_map = label_map\n detection_result_dir = 'tmp_detection'\n detection_model_name = self.detection_model_name\n detection_model_weights = self.detection_model_weights\n detection_txt_list = detection_inference(selected_frame_list,\n detection_result_dir,\n detection_model_name,\n detection_model_weights)\n assert len(detection_txt_list) == len(timestamps)\n human_detections = []\n data_list = []\n person_num_list = []\n for timestamp, detection_txt_path in zip(timestamps,\n detection_txt_list):\n proposals, scores = get_detection_result(\n detection_txt_path, h, w,\n (float)(self.config.DATASET.test['person_det_score_thr']))\n if proposals.shape[0] == 
0:", + "type": "code", + "location": "/tools/utils.py:1345-1369" + }, + "8941": { + "file_id": 649, + "content": "This code is initializing a label map, running object detection inference on a list of frames, and extracting detection results for each timestamp. It then processes these results by getting proposals and scores for each frame, and checks if there are any detections (if not, it proceeds).", + "type": "comment" + }, + "8942": { + "file_id": 649, + "content": " #person_num_list.append(0)\n human_detections.append(None)\n continue\n human_detections.append(proposals)\n result = get_timestep_result(frame_dir,\n timestamp,\n clip_len,\n frame_interval,\n FPS=FPS)\n result[\"proposals\"] = proposals\n result[\"scores\"] = scores\n new_result = data_process_pipeline(result)\n proposals = new_result['proposals']\n img_slow = new_result['imgs'][0]\n img_slow = img_slow[np.newaxis, :]\n img_fast = new_result['imgs'][1]\n img_fast = img_fast[np.newaxis, :]\n proposals = proposals[np.newaxis, :]\n scores = scores[np.newaxis, :]\n img_shape = np.asarray(new_result['img_shape'])\n img_shape = img_shape[np.newaxis, :]\n data = [\n paddle.to_tensor(img_slow, dtype='float32'),", + "type": "code", + "location": "/tools/utils.py:1370-1400" + }, + "8943": { + "file_id": 649, + "content": "This code is part of a data processing pipeline in PaddleVideo. It appends proposals and scores to the result dictionary, reshapes tensors for image and proposal inputs, and converts images and proposal lists to Paddle Tensors for further processing.", + "type": "comment" + }, + "8944": { + "file_id": 649, + "content": " paddle.to_tensor(img_fast, dtype='float32'),\n paddle.to_tensor(proposals, dtype='float32'),\n paddle.to_tensor(img_shape, dtype='int32')\n ]\n person_num = proposals.shape[1]\n person_num_list.append(person_num)\n data_list.append(data)\n self.human_detections = human_detections\n self.person_num_list = person_num_list\n self.timestamps = timestamps\n self.frame_dir = frame_dir\n self.detection_result_dir = detection_result_dir\n return data_list\n def postprocess(self, outputs, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n predictions = []\n assert len(self.person_num_list) == len(outputs)\n #print(\"*** self.human_detections\",len( self.human_detections))\n #print(\"*** outputs\",len( outputs))\n index = 0\n for t_index in range(len(self.timestamps)):\n if self.human_detections[t_index] is None:\n predictions.append(None)", + "type": "code", + "location": "/tools/utils.py:1401-1433" + }, + "8945": { + "file_id": 649, + "content": "This code defines a class with methods to create and post-process human detections. It takes in various directories as input, and outputs lists of data and predictions. The preprocess method converts image, proposals, and shape into tensors, and appends the number of people and data list for each frame. 
The postprocess method takes output from the model and checks if human_detections is None for each timestamp, then adds predictions to a list.", + "type": "comment" + }, + "8946": { + "file_id": 649, + "content": " continue\n human_detection = self.human_detections[t_index]\n output = outputs[index]\n result = output #长度为类别个数,不包含背景\n person_num = self.person_num_list[index]\n index = index + 1\n prediction = []\n if human_detection is None:\n predictions.append(None)\n continue\n # N proposals\n for i in range(person_num):\n prediction.append([])\n # Perform action score thr\n for i in range(len(result)): # for class\n if i + 1 not in self.class_whitelist:\n continue\n for j in range(person_num):\n if result[i][j, 4] > self.config.MODEL.head['action_thr']:\n prediction[j].append(\n (self.label_map[i + 1], result[i][j, 4]\n )) # label_map is a dict, label index start from 1\n predictions.append(prediction)", + "type": "code", + "location": "/tools/utils.py:1434-1464" + }, + "8947": { + "file_id": 649, + "content": "This code iterates over human detections and their corresponding outputs. If a detection is None, it appends a None value to the predictions list. It then iterates through the result array for each class, checking if the action score exceeds the specified threshold. For each valid action score, it adds the class label and score to the prediction list. Finally, it appends the prediction list to the predictions list.", + "type": "comment" + }, + "8948": { + "file_id": 649, + "content": " results = []\n for human_detection, prediction in zip(self.human_detections,\n predictions):\n results.append(pack_result(human_detection, prediction))\n def dense_timestamps(timestamps, n):\n \"\"\"Make it nx frames.\"\"\"\n old_frame_interval = (timestamps[1] - timestamps[0])\n start = timestamps[0] - old_frame_interval / n * (n - 1) / 2\n new_frame_inds = np.arange(\n len(timestamps) * n) * old_frame_interval / n + start\n return new_frame_inds.astype(np.int)\n dense_n = int(self.predict_stepsize / self.output_stepsize) #30\n frames = [\n cv2.imread(self.frame_paths[i - 1])\n for i in dense_timestamps(self.timestamps, dense_n)\n ]\n vis_frames = visualize(frames, results)\n try:\n import moviepy.editor as mpy\n except ImportError:\n raise ImportError('Please install moviepy to enable output file')", + "type": "code", + "location": "/tools/utils.py:1466-1490" + }, + "8949": { + "file_id": 649, + "content": "Code snippet reads frames from specific paths, performs human detections and predictions, and densely samples timestamps to create a sequence of images. 
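Editor's note: the `dense_timestamps` helper summarised above expands the sparse key-frame timestamps (spaced `predict_stepsize` frames apart) into `n` evenly spaced indices per interval so the output video can be rendered at a finer stride. A standalone sketch with illustrative numbers:

```python
# Expand sparse key-frame timestamps into n evenly spaced frame indices per interval.
import numpy as np

def dense_timestamps(timestamps, n):
    timestamps = np.asarray(timestamps, dtype=np.float64)
    old_interval = timestamps[1] - timestamps[0]
    start = timestamps[0] - old_interval / n * (n - 1) / 2
    new_inds = np.arange(len(timestamps) * n) * old_interval / n + start
    return new_inds.astype(np.int64)

key_frames = [16, 24, 32, 40]             # e.g. predict_stepsize = 8
print(dense_timestamps(key_frames, n=2))  # n = predict_stepsize // output_stepsize
```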
It then visualizes these images and attempts to import moviepy library for output file creation.", + "type": "comment" + }, + "8950": { + "file_id": 649, + "content": " vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],\n fps=self.output_fps)\n vid.write_videofile(self.out_filename)\n print(\"finish write !\")\n # delete tmp files and dirs\n shutil.rmtree(self.frame_dir)\n shutil.rmtree(self.detection_result_dir)\n@INFERENCE.register()\nclass PoseC3D_Inference_helper(Base_Inference_helper):\n def __init__(self, top_k=1):\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n with open(input_file, 'rb') as f:\n data = pickle.load(f)\n self.input_file = input_file\n left_kp = [1, 3, 5, 7, 9, 11, 13, 15]\n right_kp = [2, 4, 6, 8, 10, 12, 14, 16]\n ops = [\n UniformSampleFrames(clip_len=48, num_clips=10, test_mode=True),\n PoseDecode(),\n PoseCompact(hw_ratio=1., allow_imgpad=True),", + "type": "code", + "location": "/tools/utils.py:1492-1523" + }, + "8951": { + "file_id": 649, + "content": "This code snippet defines a class PoseC3D_Inference_helper that handles image processing and inference for pose estimation. It includes methods for preprocessing, such as loading data from file, defining keypoint indices for left and right body parts, and applying various operations like frame sampling, pose decoding, and compacting the pose results. The code also demonstrates error handling by checking if input files exist before processing them, and performs cleanup of temporary directories after writing video files.", + "type": "comment" + }, + "8952": { + "file_id": 649, + "content": " Resize(scale=(-1, 56)),\n CenterCrop_V2(crop_size=56),\n GeneratePoseTarget(sigma=0.6,\n use_score=True,\n with_kp=True,\n with_limb=False,\n double=True,\n left_kp=left_kp,\n right_kp=right_kp),\n FormatShape(input_format='NCTHW'),\n Collect(keys=['imgs', 'label'], meta_keys=[])\n ]\n for op in ops:\n results = op(data)\n results = [results[0][np.newaxis, :, :, :, :, :]]\n self.num_segs = results[0].shape[1]\n return results\n def postprocess(self, outputs, print_output=True):\n batch_size = outputs[0].shape[0]\n cls_score = outputs[0].reshape(\n [batch_size // self.num_segs, self.num_segs, outputs[0].shape[-1]])\n output = F.softmax(paddle.to_tensor(cls_score),\n axis=2).mean(axis=1).numpy()", + "type": "code", + "location": "/tools/utils.py:1524-1548" + }, + "8953": { + "file_id": 649, + "content": "The code appears to be a part of a PaddleVideo tool that performs image preprocessing, resizing, cropping, and pose estimation. It uses PaddlePaddle library functions such as Resize, CenterCrop_V2, GeneratePoseTarget, FormatShape, Collect, and F.softmax for various operations. 
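Editor's note: the chunk above writes the visualised frames out with moviepy's `ImageSequenceClip`, flipping OpenCV-style BGR frames to RGB first. The sketch below shows only that write-out step with synthetic frames; it assumes the moviepy package (1.x-style `moviepy.editor` import, matching the chunked source) and an ffmpeg backend are available.

```python
# Write a list of BGR frames to an MP4 via moviepy, flipping to RGB first.
import numpy as np
import moviepy.editor as mpy

frames_bgr = [np.random.randint(0, 256, (240, 320, 3), dtype=np.uint8) for _ in range(30)]
rgb_frames = [np.ascontiguousarray(f[:, :, ::-1]) for f in frames_bgr]   # BGR -> RGB
clip = mpy.ImageSequenceClip(rgb_frames, fps=6)
clip.write_videofile("ava_det_demo.mp4")
```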
The code also calculates the number of segments and performs post-processing on output results.", + "type": "comment" + }, + "8954": { + "file_id": 649, + "content": " N = len(self.input_file)\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n@INFERENCE.register()\nclass YOWO_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=16,\n target_size=224,\n nms_thresh=0.5,\n conf_thresh_valid=0.5,\n mean=[0.4345, 0.4051, 0.3775],\n std=[0.2768, 0.2713, 0.2737]):\n self.num_seg = num_seg\n self.target_size = target_size\n self.nms_thresh = nms_thresh\n self.conf_thresh_valid = conf_thresh_valid", + "type": "code", + "location": "/tools/utils.py:1549-1574" + }, + "8955": { + "file_id": 649, + "content": "This code snippet is a part of YOWO_Inference_helper class in PaddleVideo. It initializes the class with parameters such as num_seg, target_size, nms_thresh, conf_thresh_valid, mean, and std. The class seems to be used for image classification or object detection tasks, based on the presence of top-k classes and scores.", + "type": "comment" + }, + "8956": { + "file_id": 649, + "content": " self.mean = mean\n self.std = std\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n cap = cv2.VideoCapture(input_file)\n queue = []\n inputs = []\n frames = []\n while (cap.isOpened()):\n ret, frame = cap.read()\n if ret == False:\n break\n if len(queue) <= 0: # At initialization, populate queue with initial frame\n for i in range(self.num_seg):\n queue.append(frame)\n # Add the read frame to last and pop out the oldest one\n queue.append(frame)\n queue.pop(0)\n # Resize images\n imgs = [cv2.resize(img, (self.target_size, self.target_size), interpolation=cv2.INTER_LINEAR) for img in\n queue]\n # Convert image to CHW keeping BGR order.\n imgs = [img.transpose([2, 0, 1]) for img in imgs]", + "type": "code", + "location": "/tools/utils.py:1575-1606" + }, + "8957": { + "file_id": 649, + "content": "This code is initializing a preprocess function for video input. It checks if the input file exists, then uses OpenCV to read frames from the video file. The function populates a queue with initial frames, adds new frames, and resizes them using interpolation. 
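Editor's note: the YOWO preprocess described above keeps a rolling queue of the most recent `num_seg` frames: the first frame fills the queue at start-up, then each new frame is appended while the oldest is popped. The sketch below isolates that pattern; it assumes opencv-python is installed and the video path is a placeholder.

```python
# Maintain a rolling window of the last num_seg frames from a video stream.
import cv2

def iter_clips(video_path, num_seg=16, size=224):
    cap = cv2.VideoCapture(video_path)
    queue = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if not queue:                       # warm-up: replicate the first frame
            queue = [frame] * num_seg
        queue.append(frame)
        queue.pop(0)                        # keep exactly num_seg frames
        yield [cv2.resize(f, (size, size), interpolation=cv2.INTER_LINEAR) for f in queue]
    cap.release()

for clip in iter_clips("demo.avi"):
    print(len(clip), clip[0].shape)
    break
```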
Finally, it converts images to CHW order while keeping BGR values.", + "type": "comment" + }, + "8958": { + "file_id": 649, + "content": " # Image [0, 255] -> [0, 1].\n imgs = [img / 255.0 for img in imgs]\n imgs = [\n np.ascontiguousarray(\n img.reshape((3, imgs[0].shape[1], imgs[0].shape[2]))\n ).astype(np.float32)\n for img in imgs\n ]\n # Concat list of images to single ndarray.\n imgs = np.concatenate(\n [np.expand_dims(img, axis=1) for img in imgs], axis=1\n )\n imgs = np.ascontiguousarray(imgs)\n imgs = np.expand_dims(imgs, axis=0)\n imgs = np.expand_dims(imgs, axis=0)\n inputs.append(imgs)\n frames.append(queue[-1])\n return inputs, frames\n def postprocess(self, outputs, frame, filename, save_img=True):\n \"\"\"\n outputs: list\n frames: list\n \"\"\"\n labels = [\n \"Basketball\", \"BasketballDunk\", \"Biking\", \"CliffDiving\", \"CricketBowling\",\n \"Diving\", \"Fencing\", \"FloorGymnastics\", \"GolfSwing\", \"HorseRiding\",", + "type": "code", + "location": "/tools/utils.py:1608-1638" + }, + "8959": { + "file_id": 649, + "content": "The code normalizes the image values to [0, 1] range and reshapes them into a specific format. It then concatenates the images to form a single array and expands dimensions as necessary before appending it to the inputs list. The postprocess function takes outputs, frames, frame, and filename as input and returns labels for classification tasks.", + "type": "comment" + }, + "8960": { + "file_id": 649, + "content": " \"IceDancing\", \"LongJump\", \"PoleVault\", \"RopeClimbing\", \"SalsaSpin\",\n \"SkateBoarding\", \"Skiing\", \"Skijet\", \"SoccerJuggling\", \"Surfing\",\n \"TennisSwing\", \"TrampolineJumping\", \"VolleyballSpiking\", \"WalkingWithDog\"]\n nms_thresh = 0.5\n font = cv2.FONT_HERSHEY_SIMPLEX\n for out in outputs:\n out = paddle.to_tensor(out)\n preds = []\n all_boxes = get_region_boxes(out)\n for i in range(out.shape[0]):\n boxes = all_boxes[i]\n boxes = nms(boxes, nms_thresh)\n for box in boxes:\n x1 = round(float(box[0] - box[2] / 2.0) * 320.0)\n y1 = round(float(box[1] - box[3] / 2.0) * 240.0)\n x2 = round(float(box[0] + box[2] / 2.0) * 320.0)\n y2 = round(float(box[1] + box[3] / 2.0) * 240.0)\n det_conf = float(box[4])\n for j in range((len(box) - 5) // 2):\n cls_conf = float(box[5 + 2 * j].item())", + "type": "code", + "location": "/tools/utils.py:1639-1660" + }, + "8961": { + "file_id": 649, + "content": "This code appears to be involved in object detection and recognition. It applies Non-Maximum Suppression (NMS) to the predicted bounding boxes to filter out redundant detections, calculates the adjusted coordinates for each box, and extracts the classification confidence scores for each class of the detected objects. The specific activity being detected or the model architecture used is not specified in this code snippet.", + "type": "comment" + }, + "8962": { + "file_id": 649, + "content": " prob = det_conf * cls_conf\n preds.append([[x1, y1, x2, y2], prob, labels[int(box[6])]])\n for _, dets in enumerate(preds):\n if dets[1] < 0.4:\n break\n text = dets[2] + ' ' + '{:.2f}'.format(dets[1])\n cv2.rectangle(frame, (dets[0][0], dets[0][1]), (dets[0][2], dets[0][3]), (0, 255, 0), 2)\n cv2.putText(frame, text, (dets[0][0] + 3, dets[0][1] - 5 - 10 * _), font, 0.5, (0, 255, 0), 2)\n cv2.imwrite('{}.jpg'.format(filename), frame)", + "type": "code", + "location": "/tools/utils.py:1661-1670" + }, + "8963": { + "file_id": 649, + "content": "This code is part of a video object detection system. 
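Editor's note: the detection drawing summarised above converts centre-format normalised boxes (cx, cy, w, h) into pixel corners for a 320x240 canvas and renders them with OpenCV. The sketch below shows that conversion and drawing with made-up boxes and labels; it assumes opencv-python is installed.

```python
# Convert normalised centre-format boxes to pixel corners and draw them with OpenCV.
import cv2
import numpy as np

def draw_detections(frame, detections, conf_thresh=0.4):
    font = cv2.FONT_HERSHEY_SIMPLEX
    for (cx, cy, w, h), conf, label in detections:
        if conf < conf_thresh:
            continue
        x1, y1 = round((cx - w / 2) * 320), round((cy - h / 2) * 240)
        x2, y2 = round((cx + w / 2) * 320), round((cy + h / 2) * 240)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f"{label} {conf:.2f}", (x1 + 3, y1 - 5), font, 0.5, (0, 255, 0), 2)
    return frame

canvas = np.zeros((240, 320, 3), dtype=np.uint8)
dets = [((0.5, 0.5, 0.4, 0.6), 0.87, "Basketball")]
cv2.imwrite("det_vis.jpg", draw_detections(canvas, dets))
```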
It calculates the probability (prob) of detections based on confidence (det_conf) and class confidence (cls_conf). The detections are stored in preds list. If the probability is below 0.4, the loop breaks. Then it draws rectangles around detected objects on the frame using their coordinates from preds[0], colors them green, and displays text with object label and probability using cv2.putText(). Finally, it saves the processed frame as a .jpg image named after filename.", + "type": "comment" + }, + "8964": { + "file_id": 650, + "content": "/tools/wheel.py", + "type": "filepath" + }, + "8965": { + "file_id": 650, + "content": "The code utilizes an ArgumentParser to handle command line arguments, downloads and saves a model, initializes PaddleVideo with GPU/MKLDNN usage for video label prediction, and iterates through results to print top classes/scores/labels.", + "type": "summary" + }, + "8966": { + "file_id": 650, + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,", + "type": "code", + "location": "/tools/wheel.py:1-24" + }, + "8967": { + "file_id": 650, + "content": "This code block is a license notice for the Apache License, Version 2.0, which grants permission to use this file as long as it complies with the terms of the license.", + "type": "comment" + }, + "8968": { + "file_id": 650, + "content": "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\n__dir__ = os.path.dirname(__file__)\nsys.path.append(os.path.join(__dir__, ''))\nimport numpy as np\nimport tarfile\nimport requests\nfrom tqdm import tqdm\nimport shutil\nfrom paddle import inference\nfrom paddle.inference import Config, create_predictor\nfrom tools.utils import ppTSM_Inference_helper\n__all__ = ['PaddleVideo']\n# path of download model and data\nBASE_DIR = os.path.expanduser(\"~/.paddlevideo_inference/\")\nBASE_INFERENCE_MODEL_DIR = os.path.join(BASE_DIR, 'inference_model')\nBASE_VIDEOS_DIR = os.path.join(BASE_DIR, 'videos')\n# support Models\nMODELS = {\n 'ppTSM':\n 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_infer.tar',\n 'ppTSM_v2':\n 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_v2_infer.tar'\n}\nMODEL_NAMES = list(MODELS.keys())\ndef parse_args(mMain=True, add_help=True):", + "type": "code", + "location": "/tools/wheel.py:25-64" + }, + "8969": { + "file_id": 650, + "content": "This code imports necessary modules, defines 
paths and model names for PaddleVideo inference models, and includes a function to parse command line arguments. The code is setting up the environment for using different PaddleVideo models and downloading them if needed.", + "type": "comment" + }, + "8970": { + "file_id": 650, + "content": " \"\"\"\n Args:\n mMain: bool. True for command args, False for python interface\n \"\"\"\n import argparse\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n if mMain == True:\n # general params\n parser = argparse.ArgumentParser(add_help=add_help)\n parser.add_argument(\"--model_name\", type=str, default='')\n parser.add_argument(\"-v\", \"--video_file\", type=str, default='')\n parser.add_argument(\"--use_gpu\", type=str2bool, default=True)\n # params for decode and sample\n parser.add_argument(\"--num_seg\", type=int, default=16)\n # params for preprocess\n parser.add_argument(\"--short_size\", type=int, default=256)\n parser.add_argument(\"--target_size\", type=int, default=224)\n # params for predict\n parser.add_argument(\"--model_file\", type=str, default='')\n parser.add_argument(\"--params_file\", type=str)\n parser.add_argument(\"-b\", \"--batch_size\", type=int, default=1)\n parser.add_argument(\"--use_fp16\", type=str2bool, default=False)", + "type": "code", + "location": "/tools/wheel.py:65-93" + }, + "8971": { + "file_id": 650, + "content": "This code defines a function that creates an ArgumentParser object for command line arguments. It includes various argument types and default values, such as model name, video file, use GPU flag, number of segments, short and target sizes, and batch size. The function is intended to be used in the main section of a Python script when set to True.", + "type": "comment" + }, + "8972": { + "file_id": 650, + "content": " parser.add_argument(\"--ir_optim\", type=str2bool, default=True)\n parser.add_argument(\"--use_tensorrt\", type=str2bool, default=False)\n parser.add_argument(\"--gpu_mem\", type=int, default=8000)\n parser.add_argument(\"--top_k\", type=int, default=1)\n parser.add_argument(\"--enable_mkldnn\", type=bool, default=False)\n parser.add_argument(\"--label_name_path\", type=str, default='')\n return parser.parse_args()\n else:\n return argparse.Namespace(model_name='',\n video_file='',\n use_gpu=True,\n num_seg=16,\n short_size=256,\n target_size=224,\n model_file='',\n params_file='',\n batch_size=1,\n use_fp16=False,\n ir_optim=True,\n use_tensorrt=False,", + "type": "code", + "location": "/tools/wheel.py:94-115" + }, + "8973": { + "file_id": 650, + "content": "This code is initializing argument parser with default values for various options like ir_optim, use_tensorrt, gpu_mem, top_k and enable_mkldnn. 
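Editor's note: the `parse_args` chunk above uses a `str2bool` helper so boolean flags like `--use_gpu` accept "true"/"false" strings. The sketch below shows that pattern with only a subset of the real options; defaults are illustrative.

```python
# Minimal argparse setup with a str2bool helper for boolean command-line flags.
import argparse

def str2bool(v):
    return str(v).lower() in ("true", "t", "1")

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="ppTSM")
    parser.add_argument("-v", "--video_file", type=str, default="")
    parser.add_argument("--use_gpu", type=str2bool, default=True)
    parser.add_argument("--num_seg", type=int, default=16)
    parser.add_argument("-b", "--batch_size", type=int, default=1)
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    print(args.model_name, args.use_gpu, args.num_seg)
```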
It then parses the arguments using argparse and returns the resulting Namespace.", + "type": "comment" + }, + "8974": { + "file_id": 650, + "content": " gpu_mem=8000,\n top_k=1,\n enable_mkldnn=False,\n label_name_path='')\ndef parse_file_paths(input_path: str) -> list:\n if os.path.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))\n ]\n files = [os.path.join(input_path, file) for file in files]\n return files\ndef download_with_progressbar(url, save_path):\n response = requests.get(url, stream=True)\n total_size_in_bytes = int(response.headers.get('content-length', 0))\n block_size = 1024 # 1 Kibibyte\n progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)\n with open(save_path, 'wb') as file:\n for data in response.iter_content(block_size):\n progress_bar.update(len(data))\n file.write(data)", + "type": "code", + "location": "/tools/wheel.py:116-145" + }, + "8975": { + "file_id": 650, + "content": "Function `parse_file_paths` takes an input path as a parameter, checks if it is a file or a directory. If it's a file, it returns the file itself; otherwise, it lists all files in the directory, filters out those that don't end with \".avi\" or \".mp4\", and joins the input path with each filtered file to form an absolute path. These paths are then returned as a list.\n\nFunction `download_with_progressbar` downloads data from the given URL in chunks while providing progress updates using tqdm's progress bar. It sets the total size of the download based on the 'content-length' header from the response, and writes each chunk to the specified save path in a 'wb' mode.", + "type": "comment" + }, + "8976": { + "file_id": 650, + "content": " progress_bar.close()\n if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes:\n raise Exception(\"Something went wrong while downloading models\")\ndef download_inference_model(model_storage_directory, url):\n # using custom model\n tar_file_name_list = [\n 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel'\n ]\n if not os.path.exists(\n os.path.join(model_storage_directory,\n 'inference.pdiparams')) or not os.path.exists(\n os.path.join(model_storage_directory,\n 'inference.pdmodel')):\n tmp_path = os.path.join(model_storage_directory, url.split('/')[-1])\n print('download {} to {}'.format(url, tmp_path))\n os.makedirs(model_storage_directory, exist_ok=True)\n download_with_progressbar(url, tmp_path) #download\n #save to directory\n with tarfile.open(tmp_path, 'r') as tarObj:\n for member in tarObj.getmembers():", + "type": "code", + "location": "/tools/wheel.py:146-168" + }, + "8977": { + "file_id": 650, + "content": "The code downloads an inference model from a given URL and saves it to the specified directory. 
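Editor's note: `download_with_progressbar`, summarised above, streams the file in 1 KiB chunks while a tqdm bar tracks progress against the Content-Length header and a size mismatch raises an error. A standalone sketch follows; the URL is a placeholder.

```python
# Stream a download to disk with a tqdm progress bar and verify the received size.
import requests
from tqdm import tqdm

def download_with_progressbar(url, save_path):
    response = requests.get(url, stream=True)
    total = int(response.headers.get("content-length", 0))
    bar = tqdm(total=total, unit="iB", unit_scale=True)
    with open(save_path, "wb") as f:
        for chunk in response.iter_content(1024):   # 1 KiB blocks
            bar.update(len(chunk))
            f.write(chunk)
    bar.close()
    if total == 0 or bar.n != total:
        raise RuntimeError("Something went wrong while downloading the model")

# download_with_progressbar("https://example.com/ppTSM_infer.tar", "ppTSM_infer.tar")
```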
It first checks if the required files ('inference.pdiparams' and 'inference.pdmodel') exist, then creates temporary directories for downloading, prints the download progress, extracts the tar archive containing the model files, and raises an exception if any issue occurs during the process.", + "type": "comment" + }, + "8978": { + "file_id": 650, + "content": " filename = None\n for tar_file_name in tar_file_name_list:\n if tar_file_name in member.name:\n filename = tar_file_name\n if filename is None:\n continue\n file = tarObj.extractfile(member)\n with open(os.path.join(model_storage_directory, filename),\n 'wb') as f:\n f.write(file.read())\n os.remove(tmp_path)\ndef create_paddle_predictor(args):\n config = Config(args.model_file, args.params_file)\n if args.use_gpu:\n config.enable_use_gpu(args.gpu_mem, 0)\n else:\n config.disable_gpu()\n if args.enable_mkldnn:\n # cache 10 different shapes for mkldnn to avoid memory leak\n config.set_mkldnn_cache_capacity(10)\n config.enable_mkldnn()\n config.disable_glog_info()\n config.switch_ir_optim(args.ir_optim) # default true\n if args.use_tensorrt:\n config.enable_tensorrt_engine(", + "type": "code", + "location": "/tools/wheel.py:169-197" + }, + "8979": { + "file_id": 650, + "content": "This code is initializing a Paddle predictor by reading arguments and configuring the model accordingly. It enables GPU use or MKLDNN based on the provided flags, sets the log level, and switches IR optimization if requested.", + "type": "comment" + }, + "8980": { + "file_id": 650, + "content": " precision_mode=Config.Precision.Half\n if args.use_fp16 else Config.Precision.Float32,\n max_batch_size=args.batch_size)\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n predictor = create_predictor(config)\n return predictor\ndef load_label_name_dict(path):\n result = {}\n if not os.path.exists(path):\n print(\n 'Warning: If want to use your own label_dict, please input legal path!\\nOtherwise label_names will be empty!'\n )\n else:\n for line in open(path, 'r'):\n partition = line.split('\\n')[0].partition(' ')\n try:\n result[int(partition[0])] = str(partition[-1])\n except:\n result = {}\n break\n return result\nclass PaddleVideo(object):\n def __init__(self, **kwargs):\n print(\n '\\nInference models that Paddle provides are listed as follows:\\n{}'\n .format(MODEL_NAMES), '\\n')\n process_params = parse_args(mMain=False, add_help=False)", + "type": "code", + "location": "/tools/wheel.py:198-232" + }, + "8981": { + "file_id": 650, + "content": "The code snippet is initializing a PaddleVideo object and creating a predictor. It sets the precision mode based on the `args.use_fp16` flag, enables memory optimization, and switches off zero copy operations. It also loads a label name dictionary from the specified path. 
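Editor's note: the extraction step described above pulls only the three inference files out of the downloaded tar archive and writes them flat into the model directory. The sketch below shows that selective extraction with tarfile; paths are illustrative.

```python
# Extract only the inference model files from a downloaded tar archive.
import os
import tarfile

WANTED = ("inference.pdiparams", "inference.pdiparams.info", "inference.pdmodel")

def extract_inference_files(tar_path, model_dir):
    os.makedirs(model_dir, exist_ok=True)
    with tarfile.open(tar_path, "r") as tar:
        for member in tar.getmembers():
            name = next((w for w in WANTED if w in member.name), None)
            if name is None:
                continue                      # skip anything that is not a wanted file
            src = tar.extractfile(member)
            if src is None:
                continue                      # e.g. directories
            with open(os.path.join(model_dir, name), "wb") as dst:
                dst.write(src.read())

# extract_inference_files("ppTSM_infer.tar", os.path.expanduser("~/.paddlevideo_inference/inference_model/ppTSM"))
```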
The purpose of this code is to facilitate model inference using PaddleVideo and provide a user-friendly interface.", + "type": "comment" + }, + "8982": { + "file_id": 650, + "content": " process_params.__dict__.update(**kwargs)\n if not os.path.exists(process_params.model_file):\n if process_params.model_name is None:\n raise Exception('Please input model name that you want to use!')\n if process_params.model_name in MODEL_NAMES:\n url = MODELS[process_params.model_name]\n download_path = os.path.join(BASE_INFERENCE_MODEL_DIR,\n process_params.model_name)\n if not os.path.exists(download_path):\n os.makedirs(download_path)\n #create pretrained model download_path\n download_inference_model(model_storage_directory=download_path,\n url=url)\n process_params.model_file = os.path.join(\n download_path, 'inference.pdmodel')\n process_params.params_file = os.path.join(\n download_path, 'inference.pdiparams')\n process_params.label_name_path = os.path.join(", + "type": "code", + "location": "/tools/wheel.py:233-253" + }, + "8983": { + "file_id": 650, + "content": "This code checks if the model file exists, if not it prompts for a model name and downloads a pre-trained model from the provided URL if the model name is in the MODEL_NAMES list. It creates directories for the downloaded files and updates process_params with paths to the inference.pdmodel, inference.pdiparams, label_name_path files.", + "type": "comment" + }, + "8984": { + "file_id": 650, + "content": " __dir__, '../data/k400/Kinetics-400_label_list.txt')\n else:\n raise Exception(\n 'If you want to use your own model, Please input model_file as model path!'\n )\n else:\n print('Using user-specified model and params!')\n print(\"process params are as follows: \\n{}\".format(process_params))\n self.label_name_dict = load_label_name_dict(\n process_params.label_name_path)\n self.args = process_params\n self.predictor = create_paddle_predictor(process_params)\n def predict(self, video):\n \"\"\"\n predict label of video with paddlevideo\n Args:\n video:input video for clas, support single video , internet url, folder path containing series of videos\n Returns:\n list[dict:{videoname: \"\",class_ids: [], scores: [], label_names: []}],if label name path is None,label names will be empty\n \"\"\"\n video_list = []\n assert isinstance(video, (str))", + "type": "code", + "location": "/tools/wheel.py:254-277" + }, + "8985": { + "file_id": 650, + "content": "The code initializes an object that can predict video labels using PaddleVideo. 
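Editor's note: the `predict` method introduced above (and detailed in the following chunks) fetches input/output tensor handles from the Paddle predictor and runs it with zero-copy buffers. Below is a hedged, standalone sketch of that flow; the model/params paths and the input shape are placeholders, and the API calls mirror those shown in the chunked source rather than adding anything new.

```python
# Minimal zero-copy inference loop with paddle.inference, assuming exported model files exist.
import numpy as np
from paddle.inference import Config, create_predictor

config = Config("inference.pdmodel", "inference.pdiparams")
config.disable_gpu()                       # or config.enable_use_gpu(8000, 0)
config.switch_use_feed_fetch_ops(False)    # use zero-copy input/output handles
predictor = create_predictor(config)

fake_input = np.random.rand(1, 16, 3, 224, 224).astype("float32")  # placeholder clip shape
for name in predictor.get_input_names():
    handle = predictor.get_input_handle(name)
    handle.copy_from_cpu(fake_input)

predictor.run()

outputs = [predictor.get_output_handle(n).copy_to_cpu()
           for n in predictor.get_output_names()]
print([o.shape for o in outputs])
```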
It checks for the presence of required parameters and allows user-specified models, then loads label name dictionary, and finally defines a \"predict\" method to classify videos.", + "type": "comment" + }, + "8986": { + "file_id": 650, + "content": " # get input_tensor and output_tensor\n input_names = self.predictor.get_input_names()\n output_names = self.predictor.get_output_names()\n input_tensor_list = []\n output_tensor_list = []\n for item in input_names:\n input_tensor_list.append(self.predictor.get_input_handle(item))\n for item in output_names:\n output_tensor_list.append(self.predictor.get_output_handle(item))\n if isinstance(video, str):\n # download internet video\n if video.startswith('http'):\n if not os.path.exists(BASE_VIDEOS_DIR):\n os.makedirs(BASE_VIDEOS_DIR)\n video_path = os.path.join(BASE_VIDEOS_DIR, 'tmp.mp4')\n download_with_progressbar(video, video_path)\n print(\"Current using video from Internet:{}, renamed as: {}\".\n format(video, video_path))\n video = video_path\n files = parse_file_paths(video)\n else:\n print('Please input legal video!')", + "type": "code", + "location": "/tools/wheel.py:279-301" + }, + "8987": { + "file_id": 650, + "content": "The code fetches input and output tensor names from the predictor, then retrieves their handles. If the video is a URL, it downloads the internet video and saves it to the BASE_VIDEOS_DIR. The downloaded video file path replaces the original URL. It checks if the video is not legal (not a string) and outputs an error message.", + "type": "comment" + }, + "8988": { + "file_id": 650, + "content": " # Inferencing process\n InferenceHelper = ppTSM_Inference_helper(\n num_seg=self.args.num_seg,\n short_size=self.args.short_size,\n target_size=self.args.target_size,\n top_k=self.args.top_k)\n batch_num = self.args.batch_size\n for st_idx in range(0, len(files), batch_num):\n ed_idx = min(st_idx + batch_num, len(files))\n # Pre process batched input\n batched_inputs = InferenceHelper.preprocess_batch(\n files[st_idx:ed_idx])\n # run inference\n for i in range(len(input_tensor_list)):\n input_tensor_list[i].copy_from_cpu(batched_inputs[i])\n self.predictor.run()\n batched_outputs = []\n for j in range(len(output_tensor_list)):\n batched_outputs.append(output_tensor_list[j].copy_to_cpu())\n results_list = InferenceHelper.postprocess(batched_outputs,\n print_output=False,", + "type": "code", + "location": "/tools/wheel.py:303-327" + }, + "8989": { + "file_id": 650, + "content": "Looping over each chunk of files, preprocesses and runs inference on batched inputs, then post-processes the outputs to store in `batched_outputs`.", + "type": "comment" + }, + "8990": { + "file_id": 650, + "content": " return_result=True)\n for res in results_list:\n classes = res[\"topk_class\"]\n label_names = []\n if len(self.label_name_dict) != 0:\n label_names = [self.label_name_dict[c] for c in classes]\n res[\"label_names\"] = label_names\n print(\"Current video file: {0}\".format(res[\"video_id\"]))\n print(\"\\ttop-{0} classes: {1}\".format(len(res[\"topk_class\"]),\n res[\"topk_class\"]))\n print(\"\\ttop-{0} scores: {1}\".format(len(res[\"topk_scores\"]),\n res[\"topk_scores\"]))\n print(\"\\ttop-{0} label names: {1}\".format(\n len(res[\"label_names\"]), res[\"label_names\"]))\ndef main():\n # for cmd\n args = parse_args(mMain=True)\n clas_engine = PaddleVideo(**(args.__dict__))\n clas_engine.predict(args.video_file)\nif __name__ == '__main__':", + "type": "code", + "location": "/tools/wheel.py:328-353" + }, + "8991": { + "file_id": 650, + 
"content": "This code block is iterating through the 'results_list' and adding labels to each result. If the 'label_name_dict' is not empty, it assigns the corresponding label names from the dictionary to the results. It then prints various information about each result such as video file name and top classes/scores/labels. The main function initializes the PaddleVideo class and calls its 'predict' method with a specific video file.", + "type": "comment" + }, + "8992": { + "file_id": 650, + "content": " main()", + "type": "code", + "location": "/tools/wheel.py:354-354" + }, + "8993": { + "file_id": 650, + "content": "This line of code likely represents the entry point for the execution of the script, calling the main function to kick off the program's logic. The specific function or operations within this main() function will depend on the rest of the codebase.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/9.json b/docs/data/9.json new file mode 100644 index 000000000..5b1d2c93d --- /dev/null +++ b/docs/data/9.json @@ -0,0 +1,543 @@ +{ + "900": { + "file_id": 83, + "content": "/applications/EIVideo/QEIVideo/ui/demo.py", + "type": "filepath" + }, + "901": { + "file_id": 83, + "content": "The code creates a PyQt5 video player UI with QGraphicsView, QFrame button, and main window layout. It includes interactive buttons, sliders, QProgressBar, QLabel, and tabs in the tab widget for status and configuration.", + "type": "summary" + }, + "902": { + "file_id": 83, + "content": "# -*- coding: utf-8 -*-\n# Form implementation generated from reading ui file '/Users/zhanghongji/PycharmProjects/EIVideo/resources/QT/demo.ui'\n#\n# Created by: PyQt5 UI code generator 5.15.6\n#\n# WARNING: Any manual changes made to this file will be lost when pyuic5 is\n# run again. Do not edit this file unless you know what you are doing.\nfrom PyQt5 import QtCore, QtGui, QtWidgets\nclass Ui_MainWindow(object):\n def setupUi(self, MainWindow):\n MainWindow.setObjectName(\"MainWindow\")\n MainWindow.resize(800, 486)\n MainWindow.setMinimumSize(QtCore.QSize(800, 486))\n MainWindow.setMaximumSize(QtCore.QSize(800, 486))\n self.centralwidget = QtWidgets.QWidget(MainWindow)\n self.centralwidget.setObjectName(\"centralwidget\")\n self.video_frame = QtWidgets.QFrame(self.centralwidget)\n self.video_frame.setGeometry(QtCore.QRect(20, 20, 761, 361))\n self.video_frame.setFrameShape(QtWidgets.QFrame.StyledPanel)\n self.video_frame.setFrameShadow(QtWidgets.QFrame.Raised)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/ui/demo.py:1-25" + }, + "903": { + "file_id": 83, + "content": "The code is a form implementation generated by PyQt5 UI code generator. It defines the `Ui_MainWindow` class which has a method `setupUi` that sets up the properties and widgets of the `MainWindow`. 
The main window has a central widget, containing a video frame, with dimensions 761x361 pixels.", + "type": "comment" + }, + "904": { + "file_id": 83, + "content": " self.video_frame.setObjectName(\"video_frame\")\n self.graphicsView = QtWidgets.QGraphicsView(self.video_frame)\n self.graphicsView.setGeometry(QtCore.QRect(0, 0, 761, 321))\n self.graphicsView.setObjectName(\"graphicsView\")\n self.frame_2 = QtWidgets.QFrame(self.video_frame)\n self.frame_2.setGeometry(QtCore.QRect(0, 320, 761, 41))\n self.frame_2.setFrameShape(QtWidgets.QFrame.StyledPanel)\n self.frame_2.setFrameShadow(QtWidgets.QFrame.Raised)\n self.frame_2.setObjectName(\"frame_2\")\n self.horizontalLayoutWidget = QtWidgets.QWidget(self.frame_2)\n self.horizontalLayoutWidget.setGeometry(QtCore.QRect(-1, -1, 761, 41))\n self.horizontalLayoutWidget.setObjectName(\"horizontalLayoutWidget\")\n self.horizontalLayout = QtWidgets.QHBoxLayout(self.horizontalLayoutWidget)\n self.horizontalLayout.setContentsMargins(0, 0, 0, 0)\n self.horizontalLayout.setObjectName(\"horizontalLayout\")\n self.open_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/ui/demo.py:26-41" + }, + "905": { + "file_id": 83, + "content": "This code sets up the user interface elements for a video player. It creates a QGraphicsView for displaying video frames, and a QFrame with a horizontal layout widget containing a PushButton for opening videos. The QGraphicsView is set to take up most of the video frame, while the QFrame and its widgets sit at the bottom.", + "type": "comment" + }, + "906": { + "file_id": 83, + "content": " self.open_btn.setObjectName(\"open_btn\")\n self.horizontalLayout.addWidget(self.open_btn)\n self.save_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.save_btn.setObjectName(\"save_btn\")\n self.horizontalLayout.addWidget(self.save_btn)\n self.horizontalSlider = QtWidgets.QSlider(self.horizontalLayoutWidget)\n self.horizontalSlider.setOrientation(QtCore.Qt.Horizontal)\n self.horizontalSlider.setObjectName(\"horizontalSlider\")\n self.horizontalLayout.addWidget(self.horizontalSlider)\n self.select_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.select_btn.setObjectName(\"select_btn\")\n self.horizontalLayout.addWidget(self.select_btn)\n self.clean_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.clean_btn.setObjectName(\"clean_btn\")\n self.horizontalLayout.addWidget(self.clean_btn)\n self.start_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.start_btn.setObjectName(\"start_btn\")", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/ui/demo.py:42-58" + }, + "907": { + "file_id": 83, + "content": "Creates UI buttons and a slider for video player interaction, sets their object names.", + "type": "comment" + }, + "908": { + "file_id": 83, + "content": " self.horizontalLayout.addWidget(self.start_btn)\n self.draw_frame = QtWidgets.QFrame(self.video_frame)\n self.draw_frame.setGeometry(QtCore.QRect(0, 10, 751, 301))\n self.draw_frame.setFrameShape(QtWidgets.QFrame.StyledPanel)\n self.draw_frame.setFrameShadow(QtWidgets.QFrame.Raised)\n self.draw_frame.setObjectName(\"draw_frame\")\n self.menu_tab = QtWidgets.QTabWidget(self.centralwidget)\n self.menu_tab.setGeometry(QtCore.QRect(20, 380, 761, 81))\n self.menu_tab.setObjectName(\"menu_tab\")\n self.tab = QtWidgets.QWidget()\n self.tab.setObjectName(\"tab\")\n self.act_label = QtWidgets.QLabel(self.tab)\n self.act_label.setEnabled(True)\n 
self.act_label.setGeometry(QtCore.QRect(10, 30, 71, 21))\n self.act_label.setObjectName(\"act_label\")\n self.act_info_label = QtWidgets.QLabel(self.tab)\n self.act_info_label.setEnabled(True)\n self.act_info_label.setGeometry(QtCore.QRect(80, 30, 81, 21))\n self.act_info_label.setObjectName(\"act_info_label\")", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/ui/demo.py:59-77" + }, + "909": { + "file_id": 83, + "content": "The code is creating a GUI layout for a video player application. It adds widgets to a main window, sets the geometry and styling of some elements, and creates a tabbed interface with two labels.", + "type": "comment" + }, + "910": { + "file_id": 83, + "content": " self.act_progressbar = QtWidgets.QProgressBar(self.tab)\n self.act_progressbar.setGeometry(QtCore.QRect(170, 32, 521, 21))\n self.act_progressbar.setProperty(\"value\", 24)\n self.act_progressbar.setObjectName(\"act_progressbar\")\n self.label_3 = QtWidgets.QLabel(self.tab)\n self.label_3.setEnabled(True)\n self.label_3.setGeometry(QtCore.QRect(680, 30, 60, 21))\n self.label_3.setLayoutDirection(QtCore.Qt.LeftToRight)\n self.label_3.setAlignment(QtCore.Qt.AlignRight|QtCore.Qt.AlignTrailing|QtCore.Qt.AlignVCenter)\n self.label_3.setObjectName(\"label_3\")\n self.menu_tab.addTab(self.tab, \"\")\n self.tab_2 = QtWidgets.QWidget()\n self.tab_2.setObjectName(\"tab_2\")\n self.menu_tab.addTab(self.tab_2, \"\")\n MainWindow.setCentralWidget(self.centralwidget)\n self.statusbar = QtWidgets.QStatusBar(MainWindow)\n self.statusbar.setObjectName(\"statusbar\")\n MainWindow.setStatusBar(self.statusbar)\n self.retranslateUi(MainWindow)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/ui/demo.py:78-97" + }, + "911": { + "file_id": 83, + "content": "Code snippet creates a QProgressBar and QLabel, sets their properties and positions, adds them to the tab widget, and sets tabs for the main window.", + "type": "comment" + }, + "912": { + "file_id": 83, + "content": " self.menu_tab.setCurrentIndex(0)\n QtCore.QMetaObject.connectSlotsByName(MainWindow)\n def retranslateUi(self, MainWindow):\n _translate = QtCore.QCoreApplication.translate\n MainWindow.setWindowTitle(_translate(\"MainWindow\", \"MainWindow\"))\n self.open_btn.setText(_translate(\"MainWindow\", \"打开视频\"))\n self.save_btn.setText(_translate(\"MainWindow\", \"保存标注\"))\n self.select_btn.setText(_translate(\"MainWindow\", \"选择目标\"))\n self.clean_btn.setText(_translate(\"MainWindow\", \"清空目标\"))\n self.start_btn.setText(_translate(\"MainWindow\", \"开始推理\"))\n self.act_label.setText(_translate(\"MainWindow\", \"当前状态:\"))\n self.act_info_label.setText(_translate(\"MainWindow\", \"-------------\"))\n self.label_3.setText(_translate(\"MainWindow\", \"12%\"))\n self.menu_tab.setTabText(self.menu_tab.indexOf(self.tab), _translate(\"MainWindow\", \"状态\"))\n self.menu_tab.setTabText(self.menu_tab.indexOf(self.tab_2), _translate(\"MainWindow\", \"属性配置\"))", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/ui/demo.py:98-113" + }, + "913": { + "file_id": 83, + "content": "This code is a part of the user interface (UI) definition for a MainWindow in the QEIVideo application. It sets the window title and button texts, translates strings using QtCore.QCoreApplication.translate, and updates tab labels using self.menu_tab.setTabText. The UI consists of several tabs: one for displaying the current status, another for configuring attributes. 
The code also connects slots to signals in this MainWindow class.", + "type": "comment" + }, + "914": { + "file_id": 84, + "content": "/applications/EIVideo/QEIVideo/version.py", + "type": "filepath" + }, + "915": { + "file_id": 84, + "content": "This code snippet is the version information for the EIVideo application, created by Acer Zhang on January 11th, 2022. It has a version number \"0.1a\" and the author requests proper attribution if reusing the code.", + "type": "summary" + }, + "916": { + "file_id": 84, + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/11 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\n__version__ = \"0.1a\"", + "type": "code", + "location": "/applications/EIVideo/EIVideo/version.py:1-6" + }, + "917": { + "file_id": 84, + "content": "This code snippet is the version information for the EIVideo application, created by Acer Zhang on January 11th, 2022. It has a version number \"0.1a\" and the author requests proper attribution if reusing the code.", + "type": "comment" + }, + "918": { + "file_id": 85, + "content": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py", + "type": "filepath" + }, + "919": { + "file_id": 85, + "content": "The PaintBoard class, derived from QWidget, handles data initialization and view size, provides methods for clearing board, changing pen attributes, painting events, and retrieving content. The code implements a mouse event handler for drawing on the board in PaintBoard mode or eraser mode based on user selection.", + "type": "summary" + }, + "920": { + "file_id": 85, + "content": "from PyQt5.QtWidgets import QWidget\nfrom PyQt5.Qt import QPixmap, QPainter, QPoint, QPaintEvent, QMouseEvent, QPen, \\\n QColor, QSize\nfrom PyQt5.QtCore import Qt\nclass PaintBoard(QWidget):\n def __init__(self, parent=None):\n '''\n Constructor\n '''\n super().__init__(parent)\n self.__init_data() # 先初始化数据,再初始化界面\n self.__init_view()\n def __init_data(self):\n self.__size = QSize(810, 458)\n # 新建QPixmap作为画板,尺寸为__size\n self.__board = QPixmap(self.__size)\n self.__board.fill(Qt.transparent) # 用透明填充画板\n self.__IsEmpty = True # 默认为空画板\n self.EraserMode = False # 默认为禁用橡皮擦模式\n self.__lastPos = QPoint(0, 0) # 上一次鼠标位置\n self.__currentPos = QPoint(0, 0) # 当前的鼠标位置\n self.__painter = QPainter() # 新建绘图工具\n self.__thickness = 15 # 默认画笔粗细为10px\n self.__penColor = QColor(\"black\") # 设置默认画笔颜色为黑色\n self.__colorList = QColor.colorNames() # 获取颜色列表\n def __init_view(self):\n # 设置界面的尺寸为__size\n self.setFixedSize(self.__size)", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:1-40" + }, + "921": { + "file_id": 85, + "content": "The code defines a class `PaintBoard` which inherits from `QWidget`. It initializes data such as the size of the board, an empty QPixmap, a boolean for EraserMode, and variables to store the position and pen attributes. 
Then it sets the view's fixed size based on the initialized data.", + "type": "comment" + }, + "922": { + "file_id": 85, + "content": " def clear(self):\n # 清空画板\n # self.__board.fill(Qt.white)\n self.__board = QPixmap(self.__size)\n self.__board.fill(Qt.transparent) # 用透明填充画板\n self.update()\n self.__IsEmpty = True\n def change_pen_color(self, color=\"black\"):\n # 改变画笔颜色\n # rgbaColor = QColor(255, 255, 0, 100)\n self.__penColor = QColor(color)\n def change_pen_thickness(self, thickness=10):\n # 改变画笔粗细\n self.__thickness = thickness\n def is_empty(self):\n # 返回画板是否为空\n return self.__IsEmpty\n def get_content_as_q_image(self):\n # 获取画板内容(返回QImage)\n image = self.__board.toImage()\n return image\n def paintEvent(self, paint_event):\n # 绘图事件\n # 绘图时必须使用QPainter的实例,此处为__painter\n # 绘图在begin()函数与end()函数间进行\n # begin(param)的参数要指定绘图设备,即把图画在哪里\n # drawPixmap用于绘制QPixmap类型的对象\n self.__painter.begin(self)\n # 0,0为绘图的左上角起点的坐标,__board即要绘制的图\n self.__painter.drawPixmap(0, 0, self.__board)\n self.__painter.end()", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:42-78" + }, + "923": { + "file_id": 85, + "content": "This code defines a class with methods to clear the paint board, change pen color and thickness, check if the board is empty, retrieve content as QImage, and handle painting events. The paintEvent method utilizes QPainter to draw the pixmap on the board in the correct location.", + "type": "comment" + }, + "924": { + "file_id": 85, + "content": " def mousePressEvent(self, mouse_event):\n # 鼠标按下时,获取鼠标的当前位置保存为上一次位置\n self.__currentPos = mouse_event.pos()\n self.__lastPos = self.__currentPos\n def mouseMoveEvent(self, mouse_event):\n # 鼠标移动时,更新当前位置,并在上一个位置和当前位置间画线\n self.__currentPos = mouse_event.pos()\n self.__painter.begin(self.__board)\n if self.EraserMode == False:\n # 非橡皮擦模式\n self.__painter.setPen(QPen(self.__penColor, self.__thickness)) # 设置画笔颜色,粗细\n else:\n # 橡皮擦模式下画笔为纯白色,粗细为10\n self.__painter.setPen(QPen(Qt.transparent, 10))\n # 画线\n # print(self.__lastPos + self.__currentPos)\n self.__painter.drawLine(self.__lastPos, self.__currentPos)\n self.__painter.end()\n self.__lastPos = self.__currentPos\n self.update() # 更新显示\n def mouseReleaseEvent(self, mouseEvent):\n self.__IsEmpty = False # 画板不再为空", + "type": "code", + "location": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:80-106" + }, + "925": { + "file_id": 85, + "content": "This code implements a mouse event handler for drawing on a PaintBoard. When the mouse is pressed, the current position is saved as the previous position. As the mouse moves, it draws lines between the last and current positions based on whether eraser mode is enabled or not. Upon mouse release, the board is marked as not empty. 
The drawing is updated to reflect the changes.", + "type": "comment" + }, + "926": { + "file_id": 86, + "content": "/applications/EIVideo/README.md", + "type": "filepath" + }, + "927": { + "file_id": 86, + "content": "EIVideo is a Windows-based video annotation tool using Baidu Paddle MA-Net model, maintained by QPT-Family on GitHub and available in pre-release/stable versions with customization, usage instructions, updates, and licensing details.", + "type": "summary" + }, + "928": { + "file_id": 86, + "content": "# EIVideo - 交互式智能视频标注工具\n[![Downloads](https://static.pepy.tech/personalized-badge/eivideo?period=total&units=international_system&left_color=grey&right_color=orange&left_text=EIVideo%20User)](https://pepy.tech/project/eivideo)\n[![Downloads](https://static.pepy.tech/personalized-badge/qeivideo?period=total&units=international_system&left_color=grey&right_color=orange&left_text=QEIVideo%20User)](https://pepy.tech/project/qeivideo)\n![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/QPT-Family/EIVideo?include_prereleases)\n![GitHub forks](https://img.shields.io/github/forks/QPT-Family/EIVideo)\n![GitHub Repo stars](https://img.shields.io/github/stars/QPT-Family/EIVideo)\n![GitHub](https://img.shields.io/github/license/QPT-Family/EIVideo)\n![](https://img.shields.io/badge/%E6%B7%B1%E5%BA%A6%E9%80%82%E9%85%8D->Win7-9cf)\n---\n
\n\"图片\"", + "type": "code", + "location": "/applications/EIVideo/README.md:1-15" + }, + "929": { + "file_id": 86, + "content": "EIVideo is an interactive intelligent video annotation tool, available for Windows systems starting from Win7. It has downloadable packages for both EIVideo and QEIVideo users, with options to choose pre-releases or the latest stable version. The tool features a user-friendly interface and is actively maintained under the QPT-Family organization on GitHub, with an open license for use.", + "type": "comment" + }, + "930": { + "file_id": 86, + "content": "
\nEIVideo,基于百度飞桨MA-Net交互式视频分割模型打造的交互式**智能视频**标注工具箱,只需简单标注几帧,即可完成全视频标注,若自动标注结果未达要求还可通过多次和视频交互而不断提升视频分割质量,直至对分割质量满意。 \n戳 -> 了解相关[技术文章&模型原理](等待微信公众号)\n
\n\"图片\"\n
\n> 为了更好的解放双手,我们还提供了图形化界面工具QEIVideo,通过它我们可以不使用繁杂的命令方式来完成视频的智能标注工作。\n---\n### README目录\n- [EAP - The Early Access Program 早期访问计划](#eap---the-early-access-program-早期访问计划)\n- [使用方式](#使用方式)\n - [安装&运行](#安装运行)\n - [QPT包 - 适合无Python基础用户](#qpt包---适合无python基础用户)\n - [标准Python包 - 适合普通Python开发者](#标准python包---适合普通python开发者)\n - [开发版本 - 适合高阶开发者进行开发/社区贡献](#开发版本---适合高阶开发者进行开发社区贡献)\n- [(Q)EIVideo产品规划安排](#qeivideo产品规划安排)\n- [开源协议](#开源协议)\n---\n### EAP - The Early Access Program 早期访问计划\n> Warning 当前图形化界面QEIVideo处于**极其初阶**的...建设阶段,并不能保证程序稳定性。\n
\"图片\"
\n当您选择使用QEIVideo作为图形化界面时,即可视为同意使用“可能会存在大量体验不佳”的EAP产品。", + "type": "code", + "location": "/applications/EIVideo/README.md:16-49" + }, + "931": { + "file_id": 86, + "content": "EIVideo: Interactive intelligent video annotation toolbox, based on the Baidu Paddle MA-Net interactive video segmentation model. Can complete full video annotation with simple frame tagging. Improves video segmentation quality through multiple interactions with the video.", + "type": "comment" + }, + "932": { + "file_id": 86, + "content": "同样,您可选择借助基于[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo) 实现的\n交互式视频标注模型[EIVideo](https://github.com/QPT-Family/EIVideo/EIVideo) 进行二次开发,在此之上也可完成您需要的自定义图形化界面,后续也将提供二次开发指南。\n
\"图片\"
\n> 如果您愿意参与到EIVideo或QEIVideo的建设中来,欢迎您与PMC取得联系 -> WX:GT_ZhangAcer \n## 使用方式\n### 安装&运行\n#### QPT包 - 适合无Python基础用户\n自动化配置相关Python环境,但仅支持Windows7/10/11操作系统,且不对盗版Windows7做任何适配。 \n下载地址:暂未上传\n> 自动化部署工具由[QPT - 自动封装工具](https://github.com/QPT-Family/QPT) 支持 \n#### 标准Python包 - 适合普通Python开发者\n* 国际方式:\n ```shell\n python -m pip install eivideo\n python qeivideo\n ```\n* 国内推荐:\n ```shell\n python -m pip install eivideo -i https://mirrors.bfsu.edu.cn/pypi/web/simple\n python qeivideo\n ```\n> 上述命令仅适用于常规情况,若您安装了多个Python或修改了相关开发工具与配置,请自行修改相关命令使其符合您的开发环境。\n#### 开发版本 - 适合高阶开发者进行开发/社区贡献\n* 国际方式:\n ```shell\n git clone https://github.com/QPT-Family/EIVideo.git\n python -m pip install -r requirements.txt\n ```", + "type": "code", + "location": "/applications/EIVideo/README.md:51-85" + }, + "933": { + "file_id": 86, + "content": "Code is introducing the user to EIVideo, a customizable interactive video annotation model based on PaddleVideo, with instructions for installation and usage.", + "type": "comment" + }, + "934": { + "file_id": 86, + "content": "* 国内推荐:\n ```shell\n # 请勿用于Push!!!\n git clone https://hub.fastgit.org/QPT-Family/EIVideo.git\n python -m pip install -r requirements.txt -i https://mirrors.bfsu.edu.cn/pypi/web/simple\n ```\n* 运行程序\n ```shell\n # 进入工作目录\n cd 此处填写EIVideo所在的目录的绝对路径,且该目录下拥有EIVideo与QEIVideo两文件夹。\n # 运行\n python QEIVideo/start.py\n # 如运行时无法找到对应包,可选择下述方式添加环境变量来调整索引次序后执行python\n # Windows\n set PYTHONPATH=$pwd:$PYTHONPATH\n # Linux\n export PYTHONPATH=$pwd:$PYTHONPATH\n ```\n> 上述命令仅适用于常规情况,若您安装了多个Python或修改了相关开发工具与配置,请自行修改相关命令使其符合您的开发环境。\n## (Q)EIVideo产品规划安排 \n> 由于QEIVideo由飞桨开源社区学生爱好者构成,所以在项目的产出过程中将会以学习为主进行开源贡献,如您原因与我们一同建设,我们也将非常欢迎~\n
\"图片\"
\n- [x] EIVideo与Demo版QEIVideo发布0.1.0Alpha版本\n- [ ] 完善QEIVideo,丰富基础标注功能,于Q1升级至1.0Alpha版本\n- [ ] 回归QEIVideo稳定性,于Q2完成1.0正式版本发版\n- [ ] 增加视频目标检测、分类任务的交互式标注功能。\n### 开源协议\n本项目使用GNU LESSER GENERAL PUBLIC LICENSE(LGPL)开源协议。 \n> 因所使用的模型与数据集等原因,本项目中任一代码、参数均不可直接进行商用,如需商用请与我们取得联系。", + "type": "code", + "location": "/applications/EIVideo/README.md:86-119" + }, + "935": { + "file_id": 86, + "content": "This code provides instructions for cloning the EIVideo repository, installing necessary dependencies, and running the QEIVideo application. It also mentions that these commands are suitable for regular cases, and users might need to modify them according to their specific development environment. The code discusses the product roadmap of (Q)EIVideo, including planned features and versions. It also specifies the open-source license used for this project and clarifies that the code and parameters cannot be directly used for commercial purposes without prior consent from the developers.", + "type": "comment" + }, + "936": { + "file_id": 86, + "content": "### 引用来源\n1. EIVideo模型以及相关源码、论文与项目 - [PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo)\n2. 部分表情包来源 - [甘城なつき](https://www.pixiv.net/users/3036679)", + "type": "code", + "location": "/applications/EIVideo/README.md:121-123" + }, + "937": { + "file_id": 86, + "content": "This code block provides the reference sources for the EIVideo model and its related resources, as well as mentioning the origin of some emoji used in the project.", + "type": "comment" + }, + "938": { + "file_id": 87, + "content": "/applications/EIVideo/resources/QT/demo.ui", + "type": "filepath" + }, + "939": { + "file_id": 87, + "content": "The code creates a Qt application UI with video display, QGraphicsView and push button, along with complex design elements including buttons, sliders, panels, tabs, labels, and progress bars.", + "type": "summary" + }, + "940": { + "file_id": 87, + "content": "\n\n MainWindow\n \n \n \n 0\n 0\n 800\n 486\n \n \n \n \n 800\n 486\n \n \n \n \n 800\n 486\n \n \n \n MainWindow\n \n \n \n \n \n 20\n 20\n 761\n 361\n \n \n \n QFrame::StyledPanel\n \n \n QFrame::Raised\n \n ", + "type": "code", + "location": "/applications/EIVideo/resources/QT/demo.ui:1-44" + }, + "941": { + "file_id": 87, + "content": "This code defines a user interface for a main window with a central widget, a frame for displaying video content, and a QGraphicsView to render the video. The window has a fixed size of 800x486 pixels.", + "type": "comment" + }, + "942": { + "file_id": 87, + "content": " \n \n 0\n 0\n 761\n 321\n \n \n \n \n \n \n 0\n 320\n 761\n 41\n \n \n \n QFrame::StyledPanel\n \n \n QFrame::Raised\n \n \n \n \n -1\n -1\n 761\n 41\n \n \n \n \n \n \n 打开视频\n \n \n \n ", + "type": "code", + "location": "/applications/EIVideo/resources/QT/demo.ui:45-86" + }, + "943": { + "file_id": 87, + "content": "The code represents a UI layout design for a user interface with a frame, a horizontal layout widget containing a push button labeled \"打开视频\", and possibly other UI elements. 
The frame is styled as raised panel, has a specified geometry, and the push button serves to open a video file.", + "type": "comment" + }, + "944": { + "file_id": 87, + "content": " \n \n 保存标注\n \n \n \n \n \n \n Qt::Horizontal\n \n \n \n \n \n \n 选择目标\n \n \n \n \n \n \n 清空目标\n \n \n \n \n \n \n 开始推理\n \n \n \n \n \n \n \n ", + "type": "code", + "location": "/applications/EIVideo/resources/QT/demo.ui:87-125" + }, + "945": { + "file_id": 87, + "content": "This code defines a user interface layout with various widgets, including QPushButtons and QSlider. The buttons have text labels in Chinese for \"保存标注\", \"选择目标\", \"清空目标\", and \"开始推理\". The layout is nested within other widgets to create a complex UI design.", + "type": "comment" + }, + "946": { + "file_id": 87, + "content": " \n 0\n 10\n 751\n 301\n \n \n \n QFrame::StyledPanel\n \n \n QFrame::Raised\n \n \n \n \n \n \n 20\n 380\n 761\n 81\n \n \n \n 0\n \n \n \n 状态\n \n \n \n true\n \n \n \n 10\n 30\n 71\n 21\n \n \n ", + "type": "code", + "location": "/applications/EIVideo/resources/QT/demo.ui:126-169" + }, + "947": { + "file_id": 87, + "content": "The code defines a user interface layout with a panel, tab widget, and label. The panel has dimensions, frame shape, and shadow properties set. The tab widget contains a single enabled tab named \"状态\" (Chinese for \"Status\") and has a label inside it with specific geometry and text settings.", + "type": "comment" + }, + "948": { + "file_id": 87, + "content": " 当前状态:\n \n \n \n \n true\n \n \n \n 80\n 30\n 81\n 21\n \n \n \n -------------\n \n \n \n \n \n 170\n 32\n 521\n 21\n \n \n \n 24\n \n \n \n \n true\n \n \n \n 680\n 30\n 60\n 21\n ", + "type": "code", + "location": "/applications/EIVideo/resources/QT/demo.ui:170-212" + }, + "949": { + "file_id": 87, + "content": "This code snippet represents the UI layout for a user interface, using QT framework. It includes labels, progress bars and their respective properties such as position, size, text and value. The labels display current status, act information, and potentially other relevant data. The progress bar shows progress with a specific value and is likely used to represent the completion of certain tasks or actions.", + "type": "comment" + }, + "950": { + "file_id": 87, + "content": " \n \n Qt::LeftToRight\n \n \n 12%\n \n \n Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter\n \n \n \n \n \n 属性配置\n \n \n \n \n \n \n \n \n", + "type": "code", + "location": "/applications/EIVideo/resources/QT/demo.ui:213-236" + }, + "951": { + "file_id": 87, + "content": "This code represents the user interface layout for a Qt application. It includes various widgets such as labels, buttons and tabs arranged in a specific order with their respective properties and alignment set. 
The code also specifies the title of each tab.", + "type": "comment" + }, + "952": { + "file_id": 88, + "content": "/applications/EIVideo/resources/cmd", + "type": "filepath" + }, + "953": { + "file_id": 88, + "content": "Updating PaddleVideo's EIVideo on GitHub: pushing and pulling development branches, splitting and rejoining code.", + "type": "summary" + }, + "954": { + "file_id": 88, + "content": "# 更新PaddleVideo上的EIVideo\ngit subtree push --prefix=applications/EIVideo/ https://github.com/QPT-Family/EIVideo 开发分支\ngit subtree pull --prefix=applications/EIVideo/ https://github.com/QPT-Family/EIVideo 开发分支 --squash\ngit subtree split --rejoin --prefix=applications/EIVideo/ --branch 开发分支", + "type": "code", + "location": "/applications/EIVideo/resources/cmd:1-4" + }, + "955": { + "file_id": 88, + "content": "Updating PaddleVideo's EIVideo on GitHub: pushing and pulling development branches, splitting and rejoining code.", + "type": "comment" + }, + "956": { + "file_id": 89, + "content": "/applications/FightRecognition/README.md", + "type": "filepath" + }, + "957": { + "file_id": 89, + "content": "The README guides using PaddleVideo's Fight Recognition model for detecting fight and non-fight videos across four datasets. It includes data preparation, training, evaluation, exporting, quickstart guidance, and GPU usage control.", + "type": "summary" + }, + "958": { + "file_id": 89, + "content": "# 打架识别模型\n## 内容\n- [1 快速开始](#快速开始)\n- [2 数据准备](#数据准备)\n - [2.1 数据集下载](#数据集下载)\n - [2.2 视频抽帧](#视频抽帧)\n - [2.3 训练集和验证集划分](#训练集和验证集划分)\n - [2.4 视频裁剪](#视频裁剪)\n- [3 模型训练](#模型训练)\n- [4 模型评估](#模型评估)\n- [5 模型导出](#模型导出)\n实时行人分析工具[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)中集成了视频分类的打架识别模块。本文档介绍如何基于[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo/),完成打架识别模型的训练流程。\n目前打架识别模型使用的是[PP-TSM](https://github.com/PaddlePaddle/PaddleVideo/blob/63c88a435e98c6fcaf353429d2df6cc24b8113ba/docs/zh-CN/model_zoo/recognition/pp-tsm.md),并在PP-TSM视频分类模型训练流程的基础上修改适配,完成模型训练。\n请先参考[使用说明](https://github.com/XYZ-916/PaddleVideo/blob/develop/docs/zh-CN/usage.md)了解PaddleVideo模型库的使用。\n\n## 1 快速开始\n打架识别静态图模型获取[https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.zip](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.zip)。\n打架识别[demo](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/fight_demo.mp4)。\n首先需要将下载好的静态图模型解压并放到`inference`目录下,然后执行下面的命令即可直接判断一个给定的视频中是否存在打架行为:", + "type": "code", + "location": "/applications/FightRecognition/README.md:1-29" + }, + "959": { + "file_id": 89, + "content": "This README provides an overview of the Fight Recognition model using PaddleVideo, including sections on quick start, data preparation, model training, evaluation, and model export. The PP-TSM model is used for fight recognition and can be adapted from the existing PP-TSM video classification model training process. 
Quickstart instructions and download links are provided, along with information on where to find additional usage guidance.", + "type": "comment" + }, + "960": { + "file_id": 89, + "content": "```\ncd ${PaddleVideo_root}\npython tools/predict.py --input_file fight.avi \\\n --config pptsm_fight_frames_dense.yaml \\\n --model_file inference/ppTSM/ppTSM.pdmodel \\\n --params_file inference/ppTSM/ppTSM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n\n## 2 数据准备\nPP-TSM是一个基于视频片段进行预测的模型。在PaddleVideo中,训练数据为`.mp4`、`.avi`等格式视频或者是抽帧后的视频帧序列,标签则可以是`.txt`格式存储的文件。\n\n### 2.1 数据集下载\n本项目基于6个公开的打架、暴力行为相关数据集合并后的数据进行模型训练。公开数据集具体信息如下:\n| 数据集 | 下载连接 | 简介 | 标注 | 数量 | 时长 |\n| ---- | ---- | ---------- | ---- | ---- | ---------- |\n| Surveillance Camera Fight Dataset| https://github.com/sayibet/fight-detection-surv-dataset | 裁剪视频,监控视角 | 视频级别 | 打架:150;非打架:150 | 2s |\n| A Dataset for Automatic Violence Detection in Videos | https://github.com/airtlab/A-Dataset-for-Automatic-Violence-Detection-in-Videos | 裁剪视频,室内自行录制 | 视频级别 | 暴力行为:115个场景,2个机位,共230 ;非暴力行为:60个场景,2个机位,共120 | 几秒钟 |", + "type": "code", + "location": "/applications/FightRecognition/README.md:31-55" + }, + "961": { + "file_id": 89, + "content": "This code is executing a Python script named \"predict.py\" in PaddleVideo's root directory, to predict fight events from a video file named 'fight.avi'. It uses the pre-trained pptsm_fight_frames_dense model and sets GPU usage and TensorRT as False.", + "type": "comment" + }, + "962": { + "file_id": 89, + "content": "| Hockey Fight Detection Dataset | https://www.kaggle.com/datasets/yassershrief/hockey-fight-vidoes?resource=download | 裁剪视频,非真实场景 | 视频级别 | 打架:500;非打架:500 | 2s |\n| Video Fight Detection Dataset | https://www.kaggle.com/datasets/naveenk903/movies-fight-detection-dataset | 裁剪视频,非真实场景 | 视频级别 | 打架:100;非打架:101 | 2s |\n| Real Life Violence Situations Dataset | https://www.kaggle.com/datasets/mohamedmustafa/real-life-violence-situations-dataset | 裁剪视频,非真实场景 | 视频级别 | 暴力行为:1000;非暴力行为:1000 | 几秒钟 |\n| UBI Abnormal Event Detection Dataset| http://socia-lab.di.ubi.pt/EventDetection/ | 未裁剪视频,监控视角 | 帧级别 | 打架:216;非打架:784;裁剪后二次标注:打架1976,非打架1630 | 原视频几秒到几分钟不等,裁剪后2s |\n打架(暴力行为)视频3956个,非打架(非暴力行为)视频3501个,共7457个视频,每个视频几秒钟。\n\n### 2.2 视频抽帧\n为了加快训练速度,将视频进行抽帧。\n```bash\ncd ${PaddleVideo_root}\npython data/ucf101/extract_rawframes.py dataset/ rawframes/ --level 2 --ext mp4\n```\n其中,视频存放在`dataset`目录下,打架(暴力)视频存放在`dataset/fight`中;非打架(非暴力)视频存放在`dataset/nofight`中。`rawframes`目录存放抽取的视频帧。\n\n### 2.3 训练集和验证集划分", + "type": "code", + "location": "/applications/FightRecognition/README.md:56-75" + }, + "963": { + "file_id": 89, + "content": "Code comments:\n- Hockey Fight Detection Dataset: URL, clipped videos, non-realistic scenarios, video level, 500 fight and 500 non-fight videos, 2s duration.\n- Video Fight Detection Dataset: URL, clipped videos, non-realistic scenarios, video level, 100 fights and 101 non-fights, 2s duration.\n- Real Life Violence Situations Dataset: URL, clipped videos, non-realistic scenarios, video level, 1000 fights and 1000 non-fights, a few seconds duration.\n- UBI Abnormal Event Detection Dataset: URL, unclipped videos, surveillance angle, frame level, 216 fights, 784 non-fights, 7,840 frames total, original video durations varying from a few seconds to a few minutes.\n- Extracting rawframes for faster training by running a script in PaddleVideo_root.\n- Split dataset into fight and non-fight videos stored in fight and nofight directories respectively.", + "type": "comment" + }, + "964": { + 
"file_id": 89, + "content": "本项目验证集1500条,来自Surveillance Camera Fight Dataset、A Dataset for Automatic Violence Detection in Videos、UBI Abnormal Event Detection Dataset三个数据集。\n也可根据下面的代码将数据按照0.8:0.2的比例划分成训练集和测试集:\n```python\nimport os\nimport glob\nimport random\nimport fnmatch\nimport re\nclass_id = {\n \"nofight\":0,\n \"fight\":1\n}\ndef get_list(path,key_func=lambda x: x[-11:], rgb_prefix='img_', level=1):\n if level == 1:\n frame_folders = glob.glob(os.path.join(path, '*'))\n elif level == 2:\n frame_folders = glob.glob(os.path.join(path, '*', '*'))\n else:\n raise ValueError('level can be only 1 or 2')\n def count_files(directory):\n lst = os.listdir(directory)\n cnt = len(fnmatch.filter(lst, rgb_prefix + '*'))\n return cnt\n # check RGB\n video_dict = {}\n for f in frame_folders:\n cnt = count_files(f)\n k = key_func(f)\n if level==2:\n k = k.split(\"/\")[0]\n video_dict[f]=str(cnt)+\" \"+str(class_id[k])\n return video_dict\ndef fight_splits(video_dict, train_percent=0.8):", + "type": "code", + "location": "/applications/FightRecognition/README.md:77-118" + }, + "965": { + "file_id": 89, + "content": "The code reads data from three datasets: Surveillance Camera Fight Dataset, A Dataset for Automatic Violence Detection in Videos, and UBI Abnormal Event Detection Dataset. It also allows for splitting the data into training and testing sets with an 80:20 ratio. The 'get_list' function retrieves the list of files and counts them, while the 'fight_splits' function takes the video dictionary and train percent as inputs to split the data into training and testing sets.", + "type": "comment" + }, + "966": { + "file_id": 89, + "content": " videos = list(video_dict.keys())\n train_num = int(len(videos)*train_percent)\n train_list = []\n val_list = []\n random.shuffle(videos)\n for i in range(train_num):\n train_list.append(videos[i]+\" \"+str(video_dict[videos[i]]))\n for i in range(train_num,len(videos)):\n val_list.append(videos[i]+\" \"+str(video_dict[videos[i]]))\n print(\"train:\",len(train_list),\",val:\",len(val_list))\n with open(\"fight_train_list.txt\",\"w\") as f:\n for item in train_list:\n f.write(item+\"\\n\")\n with open(\"fight_val_list.txt\",\"w\") as f:\n for item in val_list:\n f.write(item+\"\\n\")\nframe_dir = \"rawframes\"\nlevel = 2\ntrain_percent = 0.8\nif level == 2:\n def key_func(x):\n return '/'.join(x.split('/')[-2:])\nelse:\n def key_func(x):\n return x.split('/')[-1]\nvideo_dict = get_list(frame_dir, key_func=key_func, level=level) \nprint(\"number:\",len(video_dict))\nfight_splits(video_dict, train_percent)\n```\n最终生成fight_train_list.txt和fight_val_list.txt两个文件。打架的标签为1,非打架的标签为0。", + "type": "code", + "location": "/applications/FightRecognition/README.md:119-160" + }, + "967": { + "file_id": 89, + "content": "This code generates two lists, one for training and one for validation, based on a provided video dictionary. It then shuffles the list of videos and splits them into train and val lists. The code also defines a key function depending on the level parameter. Finally, it prints the lengths of both lists, writes them to separate files \"fight_train_list.txt\" and \"fight_val_list.txt\", and calls the fight_splits() function with the video dictionary and train percentage as parameters. 
These two files will contain the labels for training and validation sets, where fight (label 1) and non-fight (label 0) videos are listed separately.", + "type": "comment" + }, + "968": { + "file_id": 89, + "content": "\n### 2.4 视频裁剪\n对于未裁剪的视频,需要先进行裁剪才能用于模型训练,这个给出视频裁剪的函数`cut_video`,输入为视频路径,裁剪的起始帧和结束帧以及裁剪后的视频保存路径。\n```python\nimport cv2\ndef cut_video(video_path, frameToStart, frametoStop, saved_video_path):\n cap = cv2.VideoCapture(video_path)\n FPS = cap.get(cv2.CAP_PROP_FPS)\n #print(\"FPS:\",FPS)\n TOTAL_FRAME = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) # 获取视频总帧数\n #print(\"TOTAL_FRAME:\",TOTAL_FRAME)\n size = (cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT))\n #print(\"size:\",size)\n videoWriter =cv2.VideoWriter(saved_video_path,apiPreference = 0,fourcc = cv2.VideoWriter_fourcc(*'mp4v'),fps=FPS,\n frameSize=(int(size[0]),int(size[1])))\n COUNT = 0\n while True:\n success, frame = cap.read()\n if success:\n COUNT += 1\n if COUNT <= frametoStop and COUNT > frameToStart: # 选取起始帧\n videoWriter.write(frame)\n else:\n print(\"cap.read failed!\")\n break\n if COUNT > frametoStop:", + "type": "code", + "location": "/applications/FightRecognition/README.md:162-192" + }, + "969": { + "file_id": 89, + "content": "The code defines a function `cut_video` which takes a video path, start and stop frame numbers, and a saved video path. It uses OpenCV to read the input video, determine its FPS, total frames, and size. The function then creates a new VideoWriter object with the specified output file name, fourcc codec, and same FPS as the input video. It writes only the frames between the start and stop frame numbers to the new video file.", + "type": "comment" + }, + "970": { + "file_id": 89, + "content": " break\n cap.release()\n videoWriter.release()\n print(saved_video_path)\n```\n\n## 3 模型训练\n下载预训练模型:\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\n```\n模型训练:\n```bash\n# 单卡训练\ncd ${PaddleVideo_root}\npython main.py --validate -c pptsm_fight_frames_dense.yaml\n```\n```bash\ncd ${PaddleVideo_root}\n# 多卡训练\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython -B -m paddle.distributed.launch --gpus=“0,1,2,3” \\\n --log_dir=log_pptsm_dense main.py --validate \\\n -c pptsm_fight_frames_dense.yaml\n```\n\n## 4 模型评估\n训练好的模型下载:[https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.pdparams)\n模型评估:\n```bash\ncd ${PaddleVideo_root}\npython main.py --test -c pptsm_fight_frames_dense.yaml \\\n -w ppTSM_fight_best.pdparams\n```\n其中`ppTSM_fight_best.pdparams`为训练好的模型。\n\n## 5 模型导出\n导出inference模型:\n```bash\ncd ${PaddleVideo_root}\npython tools/export_model.py -c pptsm_fight_frames_dense.yaml \\", + "type": "code", + "location": "/applications/FightRecognition/README.md:193-245" + }, + "971": { + "file_id": 89, + "content": "This code represents the final part of the model training process. The first line is a break statement which implies the end of a loop or condition block. Following that, it releases the `cap` and `videoWriter` objects, suggesting they were used for capturing and writing video data respectively. 
The last line prints out the saved video path.", + "type": "comment" + }, + "972": { + "file_id": 89, + "content": " -p ppTSM_fight_best.pdparams \\\n -o inference/ppTSM\n```", + "type": "code", + "location": "/applications/FightRecognition/README.md:246-248" + }, + "973": { + "file_id": 89, + "content": "This code is loading a pre-trained model, \"ppTSM_fight_best.pdparams\", and saving the inference output to the \"inference/ppTSM\" directory.", + "type": "comment" + }, + "974": { + "file_id": 90, + "content": "/applications/FigureSkating/README.md", + "type": "filepath" + }, + "975": { + "file_id": 90, + "content": "This code provides a guide for using OpenPose to process figure skating action data by converting videos into bone point data, suitable for model training and prediction in the PaddleVideo framework.", + "type": "summary" + }, + "976": { + "file_id": 90, + "content": "# 花样滑冰动作识别\n---\n## 内容\n- [视频数据处理方法](#视频数据处理方法)\n- [模型训练预测方法](#模型训练预测方法)\n
\n
\n### 视频数据处理方法\n - 提供从视频中提取骨骼点数据的方法,方便用户自行提取数据进行测试。\n 花样滑冰数据提取采用了openpose,通过其提供的demo或是相应的api来实现数据的提取,因此需要用户配置openpose环境。\n 如下是通过花样滑冰数据集构建项目[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)提取骨骼点数据方法的具体介绍。\n #### step1 安装openpose\n - 参考:https://github.com/CMU-Perceptual-Computing-Lab/openpose \n #### step2 测试openpose提供demo\n - 这里通过测试openpose的demo程序来验证是否安装成功。\n demo1:检测视频中身体骨骼点(以linux系统为例):\n ```bash\n ./build/examples/openpose/openpose.bin --video examples_video.avi --write_json output/ --display 0 --render_pose 0\n ```\n 执行成功之后会在output/路径下生成视频每一帧骨骼点数据的json文件。\n demo2:检测视频中身体+面部+手部骨骼点(以linux系统为例):\n ```bash\n ./build/examples/openpose/openpose.bin --video examples_video.avi --write_json output/ --display 0 --render_pose 0 --face --hand\n ```\n 执行成功之后会在output/路径下生成视频每一帧身体+面部+手部骨骼点数据的json文件。\n #### step3 视频及相关信息处理\n - 由于[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)为制作花样滑冰数据集所用,因此此处步骤可能存在不同程度误差,实际请用户自行调试代码。", + "type": "code", + "location": "/applications/FigureSkating/README.md:1-46" + }, + "977": { + "file_id": 90, + "content": "This code is a guide for processing figure skating action data using openpose, a tool for detecting body skeletons from videos. It includes instructions on how to install and test openpose, as well as specific steps for processing video data with the Skeleton Scripts project.", + "type": "comment" + }, + "978": { + "file_id": 90, + "content": " 将要转化的花样滑冰视频储存到[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)的指定路径(可自行创建):\n ```bash\n ./skating2.0/skating63/\n ```\n 同时需要用户自行完成对视频信息的提取,保存为label_skating63.csv文件,储存到如下路径中(可自行创建):\n ```bash\n ./skating2.0/skating63/\n ./skating2.0/skating63_openpose_result/\n ```\n label_skating63.csv中格式如下:\n | 动作分类 | 视频文件名 | 视频帧数 | 动作标签 |\n | :----: | :----: | :----: | :---- |\n 此处用户只需要输入视频文件名(无需后缀,默认后缀名为.mp4,其他格式需自行更改代码),其他三项定义为空字符串即可,不同表项之间通过 ',' 分割。\n #### step4 执行skating_convert.py:\n - 注意,这一步需要根据用户对openpose的配置进行代码的更改,主要修改项为openpose路径、openpose-demo路径等,具体详见代码。\n 本脚步原理是调用openpose提供的demo提取视频中的骨骼点,并进行数据格式清洗,最后将每个视频的提取结果结果打包成json文件,json文件储存在如下路径:\n ```bash\n ./skating2.0/skating63_openpose_result/label_skating63_data/\n ```\n #### step5 执行skating_gendata.py:\n 将json文件整理为npy文件并保存,多个视频文件将保存为一个npy文件,保存路径为:\n ```bash\n ./skating2.0/skating63_openpose_result/skeleton_file/\n ```\n - 通过上述步骤就可以将视频数据转化为无标签的骨骼点数据。\n - 最后用户只需将npy数据输入送入网络开始模型测试,亦可通过预测引擎推理。\n ### 模型训练预测方法\n 模型使用方法参考[ST-GCN模型文档](../../docs/zh-CN/model_zoo/recognition/stgcn.md)", + "type": "code", + "location": "/applications/FigureSkating/README.md:48-92" + }, + "979": { + "file_id": 90, + "content": "The code is outlining the steps to convert figure skating videos into bone point data, which can then be used for model training and prediction. This involves specifying the video storage paths, extracting video information, using OpenPose to process the videos, and saving the results as npy files. Finally, users can input these npy files into a model or prediction engine. The code is specifically for FigureSkating application in PaddleVideo codebase.", + "type": "comment" + }, + "980": { + "file_id": 91, + "content": "/applications/FootballAction/README.md", + "type": "filepath" + }, + "981": { + "file_id": 91, + "content": "The FootballAction model in PaddleVideo employs PP-TSM, BMN, and Attention LSTM for feature extraction and classification/regression. The code updates the recognizer2d.py file, exports PP-TSM inference models, creates datasets, and predicts BMN proposal information. 
The Attention LSTM model is trained with improvements, resulting in accuracy and F1-score enhancements.", + "type": "summary" + }, + "982": { + "file_id": 91, + "content": "# 足球动作检测模型\n## 内容\n- [1. 模型简介](#1-模型简介)\n- [2. 环境准备](#2-环境准备)\n- [3. 数据准备](#3-数据准备)\n - [3.1 数据集简介](#31-数据集简介)\n - [3.2 数据集下载](#32-数据集下载)\n - [3.3 数据预处理](#33-数据预处理)\n- [4. 快速体验](#4-快速体验)\n- [5. 进阶使用](#5-进阶使用)\n - [5.1 模型训练](#51-模型训练)\n - [5.2 模型推理](#52-模型推理)\n - [5.3 模型评估](#53-模型评估)\n - [5.4 模型优化](#54-模型优化)\n - [5.5 模型部署](#55-模型部署)\n- [6. 参考论文](#6-参考论文)\n\n## 1. 模型简介\nFootballAction是基于PaddleVideo实现的足球动作检测算法,用于从足球比赛视频中定位出精彩动作片段发生的起止时间和对应的动作类别。可以定位的足球动作类型包括8种,分别为:\n```txt\n背景、进球、角球、任意球、黄牌、红牌、换人、界外球\n```\n我们提出的方案结合PP-TSM、BMN和AttentionLSTM三个模型,图像和音频两种模态进行动作检测,算法整体流程共分为以下三步:\n - 特征抽取\n - 图像特性:PP-TSM\n - 音频特征:VGGish\n - proposal提取:BMN\n - 动作分类 + 回归:AttentionLSTM\nAIStudio项目: [基于PP-TSM+BMN+AttentionLSTM实现足球精彩时刻剪辑](https://aistudio.baidu.com/aistudio/projectdetail/3473391?channelType=0&channel=0)\n\n## 2. 环境准备\n- PaddleVideo模型库依赖安装请参考 [安装说明](../../docs/zh-CN/install.md)\n\n## 3. 数据准备\n\n### 3.1 数据集简介\n数据集来自欧洲杯2016,共49个足球视频,其中训练集44个,验证集5个。\n- 数据集label格式\n```\n{\n \"0\": \"背景\",", + "type": "code", + "location": "/applications/FootballAction/README.md:1-54" + }, + "983": { + "file_id": 91, + "content": "This is a README for the FootballAction model in PaddleVideo, introducing a soccer action detection algorithm. It consists of sections on model introduction, environment preparation, data preparation (including dataset details), quick experience, advanced usage, references, and installation instructions. The model uses PP-TSM, BMN, and AttentionLSTM for feature extraction, proposal extraction, and action classification/regression from image and audio modalities. The dataset is derived from the 2016 European Cup, with 49 videos in total (44 training, 5 validation).", + "type": "comment" + }, + "984": { + "file_id": 91, + "content": " \"1\": \"进球\",\n \"2\": \"角球\",\n \"3\": \"任意球\",\n \"4\": \"黄牌\",\n \"5\": \"红牌\",\n \"6\": \"换人\",\n \"7\": \"界外球\",\n}\n```\n- 数据集标注文件:\n```txt\ndatasets/EuroCup2016/label_cls8_train.json\ndatasets/EuroCup2016/label_cls8_val.json\n```\n- 数据集gts处理, 将原始标注数据处理成如下json格式\n```\n{\n 'fps': 5,\n 'gts': [\n {\n 'url': 'xxx.mp4',\n 'total_frames': 6341,\n 'actions': [\n {\n \"label_ids\": [7],\n \"label_names\": [\"界外球\"],\n \"start_id\": 395,\n \"end_id\": 399\n },\n ...\n ]\n },\n ...\n ]\n}\n```\n\n### 3.2 数据集下载\n数据集下载链接: [dataset_url.list](./datasets/EuroCup2016/dataset_url.list)\n可使用如下脚本下载:\n```\ncd datasets/EuroCup2016 && sh download_dataset.sh\n```\n\n### 3.3 数据预处理\n- 数据集抽帧, 由mp4, 得到frames和pcm, 这里需要添加ffmpeg环境\n```\ncd datasets/script && python get_frames_pcm.py\n```\n经过以上步骤,得到的代码结构如下所示:\n```\n|-- FootballAction\n |-- checkpoints # 模型存放路径\n |-- datasets # 数据集和数据处理脚本", + "type": "code", + "location": "/applications/FootballAction/README.md:55-118" + }, + "985": { + "file_id": 91, + "content": "This code defines a dictionary where each key corresponds to an action in the football game, such as \"进球\" or \"角球\". The dataset file contains these labeled examples of actions for training and validation purposes. The data preprocessing step involves handling these labels, creating a JSON format file containing frames per second (fps) and ground truth (gts) data with respective video URLs, total frames, and action information including label IDs, names, start and end frame indices. 
It also mentions that the dataset can be downloaded using a provided script, and that the code structure is organized in a specific way.", + "type": "comment" + }, + "986": { + "file_id": 91, + "content": " |-- EuroCup2016 # 数据存放路径\n |-- feature_bmn # bmn提取到的proposal\n |-- features # image和audio特征, image fps=5, audio 每秒(1024)\n |-- input_for_bmn # bmn训练的输入数据,widows=40\n |-- input_for_lstm # lstm训练的输入数据\n |-- input_for_pptsm # pptsm训练的数据数据\n |-- mp4 # 原始视频.mp4\n |-- frames # 图像帧, fps=5, '.jpg'格式\n |-- pcm # 音频pcm, 音频采样率16000,采用通道数1\n |-- url.list # 视频列表\n |-- url_val.list # 视频列表\n |-- label_cls8_train.json # 训练集原始gts\n |-- label_cls8_val.json # 验证集原始gts\n |-- label.json # 动作label\n |-- script # 数据集处理脚本\n |-- predict # 模型预测代码\n |-- extractor # 特征提取脚本\n |-- train_lstm # lstm训练代码\n |-- train_proposal # pptsm、bmn训练代码\n |-- configs # pptsm、bmn配置文件\n```\n", + "type": "code", + "location": "/applications/FootballAction/README.md:119-141" + }, + "987": { + "file_id": 91, + "content": "This directory contains data and scripts related to the FootballAction dataset. It includes original MP4 videos, their image frames, audio PCM files, URL lists, and JSON files for ground truth labels and classifications. There are also separate folders for scripting data processing, feature extraction, model training (LSTM), and proposal-based object detection (PPTSM and BDN). The configs folder contains the configuration files needed for these training scripts.", + "type": "comment" + }, + "988": { + "file_id": 91, + "content": "## 4. 快速体验\n首先,通过以下命令,下载训练好的模型文件:\n```bash\ncd checkpoints\nsh download.sh\n```\n运行预测代码:\n```\ncd ${FootballAction_root}/predict && python predict.py\n```\n产出文件:results.json\n\n## 5. 进阶使用\n\n### 5.1 模型训练\n采样方式:\n- image 采样频率fps=5,如果有些动作时间较短,可以适当提高采样频率\n- BMN windows=200,即40s,所以测试自己的数据时,视频时长需大于40s\n请先参考[使用说明](../../docs/zh-CN/usage.md)了解PaddleVideo模型库的使用。\n#### step1 PP-TSM训练\nPP-TSM模型使用文档参考[PP-TSM](../../docs/zh-CN/model_zoo/recognition/pp-tsm.md)\n##### step1.1 PP-TSM 训练数据处理\n使用如下命令结合frames和gts生成训练所需要的正负样本:\n```bash\ncd datasets/script && python get_instance_for_pptsm.py\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 数据集和数据处理脚本\n |-- EuroCup2016 # 数据存放路径\n |-- input_for_pptsm # pptsm训练的数据\n```\n文件按照如下格式命名:\n```\n'{}_{}_{}_{}'.format(video_basename, start_id, end_id, label)\n```\n##### step1.2 PP-TSM模型训练\n训练启动命令如下:\n```bash\ncd ${FootballAction_root}\ncd ../.. #进入PaddleVideo目录下\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=./football/logs_pptsm \\\n main.py \\", + "type": "code", + "location": "/applications/FootballAction/README.md:142-201" + }, + "989": { + "file_id": 91, + "content": "This code explains the steps to download a pre-trained model, run prediction, and perform advanced usage including training PP-TSM for the FootballAction application in PaddleVideo. 
It requires following specific commands and using provided scripts.", + "type": "comment" + }, + "990": { + "file_id": 91, + "content": " --validate \\\n -c applications/FootballAction/train_proposal/configs/pptsm_football_v2.0.yaml \\\n -o output_dir=./football/pptsm\n```\n我们也提供了训练好的PP-TSM模型,下载链接已在快速体验章节中给出。\n##### step1.3 导出PP-TSM推理模型\n在转为预测模式前,需要修改 `PaddleVideo/paddlevideo/modeling/framework/recognizers/recognizer2d.py` 文件,将 init 和 infer_step 函数分别更新为如下代码:\n```python\n def __init__(self, backbone=None, head=None):\n super().__init__(backbone=backbone, head=head)\n self.avgpool2d = paddle.nn.AdaptiveAvgPool2D((1, 1), data_format='NCHW')\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))\n feature = self.backbone(imgs)\n feat = self.avgpool2d(feature)\n return feat\n```\n再执行如下命令:\n```bash\ncd ${PaddleVideo_root}\npython tools/export_model.py -c applications/FootballAction/train_proposal/configs/pptsm_football_v2.0.yaml \\\n -p ./football/pptsm/ppTSM_best.pdparams \\", + "type": "code", + "location": "/applications/FootballAction/README.md:202-230" + }, + "991": { + "file_id": 91, + "content": "This code is updating the `recognizer2d.py` file in PaddleVideo, modifying the `__init__` and `infer_step` functions. Then it executes a command to export the PP-TSM inference model using the provided configuration file and best model parameters from previous training.", + "type": "comment" + }, + "992": { + "file_id": 91, + "content": " -o ./football/inference_model\n```\n##### step1.4 基于PP-TSM的视频特征提取\n将 `PaddleVideo/applications/FootballAction/predict/action_detect/models/pptsm_infer.py` 文件中41行的\n```python\nself.output_tensor = self.predictor.get_output_handle(output_names[1])\n```\n替换为\n```python\nself.output_tensor = self.predictor.get_output_handle(output_names[0])\n```\n使用如下命令进行image和audio特征的提取,默认使用下载的模型进行特征提取,如果使用自己数据训练的模型,请注意修改配置文件中模型的文件路径:\n```bash\ncd ${FootballAcation}\ncd extractor && python extract_feat.py\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- EuroCup2016 # 数据集\n |-- features # 视频的图像+音频特征\n```\n推理特征以pkl文件保存,格式如下:\n```txt\n# 特征维度, image(2048) + audio(1024)\nvideo_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features}\n```\n此特征接下来会用于BMN模型的训练。\n#### step2 BMN训练\nBMN模型使用文档参考[BMN](../../docs/zh-CN/model_zoo/localization/bmn.md)\n##### step2.1 BMN训练数据处理\n使用如下命令得到BMN训练所需要的数据集,默认使用windows=40,根据gts和特征得到训练所需的proposal:\n```bash", + "type": "code", + "location": "/applications/FootballAction/README.md:231-275" + }, + "993": { + "file_id": 91, + "content": "In this code snippet, we are replacing a line of code in `pptsm_infer.py` to change the output tensor from the second to the first output name. This is followed by commands to extract image and audio features using the modified code, which are stored in the \"features\" folder within the respective dataset. 
These features will be used in the training of a BMN model.", + "type": "comment" + }, + "994": { + "file_id": 91, + "content": "cd FootballAction/datasets/script && python get_instance_for_bmn.py\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- EuroCup2016 # 数据集\n |-- input_for_bmn # bmn训练的proposal\n |-- feature\n |-- label.json \n```\n特征文件保存在`label.json`文件中,数据格式如下:\n```txt\n{\n \"719b0a4bcb1f461eabb152298406b861_753_793\": {\n \"duration_second\": 40.0,\n \"duration_frame\": 200,\n \"feature_frame\": 200,\n \"subset\": \"train\",\n \"annotations\": [\n {\n \"segment\": [\n 15.0,\n 22.0\n ],\n \"label\": \"3.0\",\n \"label_name\": \"任意球\"\n }\n ]\n },\n ...\n}\n```\n##### step2.2 BMN模型训练\n训练启动命令如下:\n```bash\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1\" \\\n --log_dir=./football/logs_bmn \\\n main.py \\\n --validate \\\n -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml \\\n -o output_dir=./football/bmn", + "type": "code", + "location": "/applications/FootballAction/README.md:276-320" + }, + "995": { + "file_id": 91, + "content": "This code changes the directory to \"FootballAction/datasets/script\" and runs a Python script named get_instance_for_bmn.py, which creates a dataset for BMN (Bounding Box Regression) model training. The resulting data is stored in the datasets folder with the instance information saved as JSON files within the input_for_bmn directory.", + "type": "comment" + }, + "996": { + "file_id": 91, + "content": "```\n我们也提供了训练好的BMN模型,下载链接已在快速体验章节中给出。\n##### step2.3 导出BMN推理模型\n模型导出命令如下:\n```bash\npython tools/export_model.py -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml \\\n -p ./football/bmn/BMN_epoch_00016.pdparams \\\n -o ./football/inference_model\n```\n##### step2.4 BMN模型预测\n使用如下命令进行预测,得到动作proposal信息: start_id, end_id, score。如果使用自己数据训练的模型,请注意修改配置文件中模型的文件路径:\n```\ncd extractor && python extract_bmn.py\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- EuroCup2016 # 数据集\n |-- feature_bmn\n |-- prop.json # bmn 预测结果\n```\n预测结果数据格式如下:\n```txt\n[\n {\n \"video_name\": \"c9516c903de3416c97dae91a59e968d7\",\n \"num_proposal\": 5534,\n \"bmn_results\": [\n {\n \"start\": 7850.0,\n \"end\": 7873.0,\n \"score\": 0.77194699622342\n },\n {\n \"start\": 4400.0,\n \"end\": 4443.0,\n \"score\": 0.7663803287641536", + "type": "code", + "location": "/applications/FootballAction/README.md:321-362" + }, + "997": { + "file_id": 91, + "content": "Step 2.3: Export BMN inference model with command `python tools/export_model.py -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml -p ./football/bmn/BMN_epoch_00016.pdparams -o ./football/inference_model`.\nStep 2.4: Use command `cd extractor && python extract_bmn.py` to predict BMN proposal information.", + "type": "comment" + }, + "998": { + "file_id": 91, + "content": " },\n ...\n ]\n },\n ...\n]\n```\n#### step3 LSTM训练\nAttentionLSTM模型使用文档参考[AttentionLSTM](../../docs/zh-CN/model_zoo/localization/bmn.md),此处我们对原始对AttentionLSTM模型进行了改进,包括:\n1. 不同模态特征在LSTM中使用不同的hiddne_size\n2. 加入了一个回归分支用于回归iou\n3. 
模型中加入了BN层抑制过拟合\n##### step3.1 LSTM训练数据处理\n将BMN得到的proposal截断并处理成LSTM训练所需数据集。同理,注意数据集文件修改路径。\n```\ncd datasets/script && python get_instance_for_lstm.py\n```\n完成该步骤后,数据存储位置\n```\n |-- datasets # 训练数据集和处理脚本\n |-- EuroCup2016 # 数据集\n |-- input_for_lstm # lstm训练的proposal\n ├── feature # 特征\n ├── label_info.json # 标签信息\n ├── train.txt # 训练文件列表\n └── val.txt # 测试文件列表\n```\n- `label_info.json`数据格式如下:\n```\n{\n \"fps\": 5,\n \"results\": [\n {\n \"url\": \"https://xxx.mp4\",\n \"mode\": \"train\", # train or validation\n \"total_frames\": 6128,\n \"num_gts\": 93,\n \"num_proposals\": 5043,\n \"proposal_actions\": [", + "type": "code", + "location": "/applications/FootballAction/README.md:363-408" + }, + "999": { + "file_id": 91, + "content": "This code is part of the Attention LSTM model training process in the FootballAction application. It mentions a few improvements made to the original AttentionLSTM model, such as using different hidden sizes for different modal features and adding a regression branch for IOU. The code also discusses processing training data for LSTM training and provides an example of the label_info.json format.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/titles/0.json b/docs/data/titles/0.json new file mode 100644 index 000000000..492ca0bb1 --- /dev/null +++ b/docs/data/titles/0.json @@ -0,0 +1,302 @@ +{ + "/MANIFEST.in": "PaddleVideo Manifest Configuration", + "/README.md": "Advanced Video Processing with PaddleVideo", + "/README.md:1-22": "Advanced Video Processing with PaddleVideo", + "/README.md:25-58": "PaddleVideo: Comprehensive Video Tech Course and Code", + "/README.md:59-75": "PaddleVideo Documentation Table of Contents", + "/README_en.md": "PaddleVideo: Deep Learning for Video Processing", + "/README_en.md:1-20": "PaddleVideo: Industrial and Academic Video Toolset", + "/README_en.md:20-43": "PaddleVideo: Comprehensive Video AI Platform", + "/README_en.md:44-65": "PaddleVideo: Video Processing Deep Learning Library", + "/__init__.py": "Licensed Python Module: PaddleVideo", + "/applications/AbnormalActionDetection/README.md": "Abnormal Action Detection with PaddleVideo", + "/applications/AbnormalActionDetection/README.md:1-40": "Abnormal Action Detection with PaddleVideo", + "/applications/AbnormalActionDetection/README.md:115-153": "Video Action Detection with PaddleVideo", + "/applications/AbnormalActionDetection/README.md:42-114": "Abnormal Action Detection Pipeline", + "/applications/Anti-UAV/README.md": "Detect UAVs in Restricted Areas with PaddleDetection", + "/applications/Anti-UAV/README.md:1-21": "Paddle-Anti-UAV: Detecting Flying UAVs", + "/applications/Anti-UAV/README.md:23-36": "UAV Detection with PP-YOLO and PaddleDetection", + "/applications/Anti-UAV/README.md:36-39": "Customize Anti-UAV Demo with PaddleVideo", + "/applications/Anti-UAV/get_image_label.py": "Object Detection and Labeling Tool", + "/applications/Anti-UAV/get_image_label.py:1-53": "Initialize Directories and Info", + "/applications/Anti-UAV/get_image_label.py:102-128": "Labeling Frames by Object Presence", + "/applications/Anti-UAV/get_image_label.py:129-151": "Bounding Box Image Labelling", + "/applications/Anti-UAV/get_image_label.py:152-164": "Writing Annotation and Image Data to JSON Files", + "/applications/Anti-UAV/get_image_label.py:54-77": "Object Detection and Labeling in Images", + "/applications/Anti-UAV/get_image_label.py:78-101": "Write and Annotate Image Data", + "/applications/BasketballAction/README.md": "Basketball Action Detection App", 
+ "/applications/BasketballAction/README.md:1-69": "Basketball Action Detection with PaddlePaddle", + "/applications/BasketballAction/README.md:100-135": "Prepare, Train, and Convert ppTSM Model", + "/applications/BasketballAction/README.md:136-163": "BasketballAction: Feature Extraction & BMN Training", + "/applications/BasketballAction/README.md:165-206": "Preparing and Training BMN Model for Basketball Action Dataset", + "/applications/BasketballAction/README.md:207-243": "BMN-Based Basketball Action Predictions", + "/applications/BasketballAction/README.md:244-284": "BasketballAction LSTM Training Data Structure", + "/applications/BasketballAction/README.md:285-319": "PaddleVideo BasketballAction: LSTM Data Formats", + "/applications/BasketballAction/README.md:320-365": "LSTM Model Inference and Evaluation", + "/applications/BasketballAction/README.md:366-389": "Optimized Action Detection with TSM and BMN", + "/applications/BasketballAction/README.md:389-389": "Author List", + "/applications/BasketballAction/README.md:70-99": "Basketball Action Dataset Structure", + "/applications/BasketballAction/predict/action_detect/action.py": "Basketball Action Detector", + "/applications/BasketballAction/predict/action_detect/action.py:1-44": "Basketball Action Detection with Python", + "/applications/BasketballAction/predict/action_detect/action.py:105-133": "Video Action Detection and Feature Extraction", + "/applications/BasketballAction/predict/action_detect/action.py:134-152": "Feature Extraction and Storage", + "/applications/BasketballAction/predict/action_detect/action.py:153-174": "Video Feature Inference and Storage", + "/applications/BasketballAction/predict/action_detect/action.py:45-71": "ModelPredict Class Initialization and Configuration", + "/applications/BasketballAction/predict/action_detect/action.py:72-104": "InferModel for Action Prediction", + "/applications/BasketballAction/predict/action_detect/logger.py": "Custom Logger for News Stripper", + "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py": "Audio Feature Extraction and Processing", + "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:1-41": "Audio Feature Extraction Functions", + "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:117-139": "Audio Feature Extraction and Preprocessing", + "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:140-158": "Extract Audio Features for Wav File", + "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:44-68": "Mel Spectrogram Matrix Creation", + "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:69-93": "Calculate MFCC for Audio Data", + "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py:94-116": "Mel Spectrogram Feature Extraction", + "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py": "Model-Based Audio Feature Extraction", + "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:1-42": "Audio Feature Extraction Model", + "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py:43-51": "Audio Feature List Generator", + "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py": "VGGish Parameters for Basketball Action Detection", + "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:1-29": "Global VGGish Parameters", + 
"/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py:30-37": "Adam Optimizer with Epsilon Value", + "/applications/BasketballAction/predict/action_detect/models/audio_infer.py": "Audio Inference with InferModel", + "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:1-37": "Audio Inference Model Initialization", + "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:39-69": "Audio Inference Model", + "/applications/BasketballAction/predict/action_detect/models/audio_infer.py:71-80": "Audio Infer: Model Prediction and Time Calculation", + "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py": "Basketball Action BMN Inferencing", + "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:1-37": "BMN Inferencing Class Initialization", + "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:112-131": "Average Model Predictions for Action Detection", + "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:133-155": "Inference Time for Action Detection", + "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:38-63": "Basketball Action Detection Model", + "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:64-86": "Action Detection Model", + "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py:87-111": "Boundary Mask Prediction", + "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py": "LSTM-based Basketball Action Detection", + "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:1-36": "LSTM Inferencing Model in BasketballAction", + "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:113-141": "LSTM Model for Action Detection", + "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:142-145": "JSON Data Logging and Time Tracking", + "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:37-61": "GPU-Optimized LSTM Action Detector", + "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:62-90": "LSTM Basketball Action Detection", + "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py:91-112": "LSTM-Based Action Detection", + "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py": "PPTSM Action Detection Infer Model", + "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:1-38": "PPTSM Action Detection Model Inference", + "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:40-69": "PaddleVideo Action Prediction", + "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py:70-83": "Python Model Prediction with Timing", + "/applications/BasketballAction/predict/action_detect/reader/__init__.py": "Alphabetical Action Readers", + "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py": "Audio Reader for YouTube-8M Dataset", + "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:1-37": "AudioReader Class for YouTube-8M Dataset", + "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:38-70": "Audio Reader for Multiple Models", + "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py:71-78": "Audio Batch Manager", + "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py": "BMNINF Reader for Basketball Action Prediction", + 
"/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:1-49": "BMNINF Reader: Generating Proposals for BMN Models", + "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:106-141": "BMNINF Reader Functionality", + "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:142-151": "Video Data Reader Class", + "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:50-73": "BMNINF Reader Initialization", + "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py:74-105": "BMNInf Reader Class", + "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py": "FeatureReader: YouTube-8M Dataset Reader and Model Support", + "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:1-33": "FeatureReader: Efficient YouTube-8M Data Reader", + "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:35-71": "Feature Reader: Data Batches for Basketball", + "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py:72-86": "Batching Action Features Reader", + "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py": "Video Reader Utils", + "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:1-34": "Customizable Reader Not Found Error Handling", + "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:35-83": "Video Data Reader Zoo", + "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py:84-109": "Singleton Reader Registry", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py": "TSMINF Image Reader", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:1-37": "TSMINF Reader: Efficient JPG Video Dataset Reader", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:122-144": "Fault-Tolerant Image Reader", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:145-172": "Transformative Image Reader: Applied Action Detection", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:173-203": "Image Transformation Function", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:204-239": "Random Crop Size Generator", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:240-262": "Crop Position Calculator", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:263-298": "Random Cropped Image Group", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:300-338": "Image Preprocessing for ML Models", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:339-366": "Image Cropper and Resizer", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:38-66": "Configuring TSN Inference Reader", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:67-97": "BasketballAction Video Reader", + "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py:98-120": "Multithreaded Video Frame to Image Conversion", + "/applications/BasketballAction/predict/action_detect/utils/config_utils.py": "BasketballAction Config Utils", + "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:1-46": "PaddleVideo BasketballAction Config Utils", + 
"/applications/BasketballAction/predict/action_detect/utils/config_utils.py:47-79": "Config Parser and Printer", + "/applications/BasketballAction/predict/action_detect/utils/config_utils.py:80-80": "Logger for Code Separation", + "/applications/BasketballAction/predict/action_detect/utils/preprocess.py": "FFmpeg Functions for Video Processing", + "/applications/BasketballAction/predict/action_detect/utils/process_result.py": "Action Detection with NMS Filtration", + "/applications/BasketballAction/predict/action_detect/utils/process_result.py:1-39": "Non-Maximum Suppression Algorithm for Bounding Boxes", + "/applications/BasketballAction/predict/action_detect/utils/process_result.py:108-129": "Action Detection Processing", + "/applications/BasketballAction/predict/action_detect/utils/process_result.py:130-144": "NMS Action Result Processor", + "/applications/BasketballAction/predict/action_detect/utils/process_result.py:40-76": "Non-Maximal Suppression for Bounding Boxes", + "/applications/BasketballAction/predict/action_detect/utils/process_result.py:77-107": "Video Detection Filtering and Sorting", + "/applications/BasketballAction/predict/eval.py": "Optimal IOU Threshold for Basketball", + "/applications/BasketballAction/predict/eval.py:1-36": "Load Ground Truth Annotations (gts)", + "/applications/BasketballAction/predict/eval.py:121-144": "IoU-based Metric Calculation for Object Detection", + "/applications/BasketballAction/predict/eval.py:146-161": "Precision and Recall Calculator", + "/applications/BasketballAction/predict/eval.py:162-189": "Calculate F1 Score from Predictions", + "/applications/BasketballAction/predict/eval.py:190-218": "Video Action Detection Model Evaluation", + "/applications/BasketballAction/predict/eval.py:219-237": "Optimal IOU Threshold for Basketball", + "/applications/BasketballAction/predict/eval.py:37-67": "Evaluating Basketball Action Predictions", + "/applications/BasketballAction/predict/eval.py:68-93": "Box Sorting and Conversion Function", + "/applications/BasketballAction/predict/eval.py:94-120": "Box Evaluation Metrics Calculator", + "/applications/BasketballAction/predict/predict.py": "Basketball Action Prediction", + "/applications/BasketballAction/predict/predict.py:2-33": "Basketball Action Predictor", + "/applications/BasketballAction/predict/predict.py:34-35": "Write Indented JSON to File", + "/applications/EIVideo/EIVideo/README.MD": "CLI Guide for EIVideo Annotation Tool", + "/applications/EIVideo/EIVideo/__init__.py": "EIVideo __init__.py: Root Paths and Constants", + "/applications/EIVideo/EIVideo/api.py": "JSON Video Annotation Tool", + "/applications/EIVideo/EIVideo/api.py:1-39": "Image Handling Functions", + "/applications/EIVideo/EIVideo/api.py:102-130": "Image Resizing and Processing", + "/applications/EIVideo/EIVideo/api.py:131-134": "JSON Overlay Dictionary Saving", + "/applications/EIVideo/EIVideo/api.py:40-67": "PNG to JSON Image Parsing", + "/applications/EIVideo/EIVideo/api.py:68-101": "Video Processing: Save, Load, and Annotate JSON", + "/applications/EIVideo/EIVideo/main.py": "PaddleVideo Training with Distributed Support", + "/applications/EIVideo/EIVideo/main.py:1-29": "PaddleVideo Training Script", + "/applications/EIVideo/EIVideo/main.py:30-53": "Command Line Arguments for EIVideo", + "/applications/EIVideo/EIVideo/main.py:54-82": "Command-Line Arguments for Training Control", + "/applications/EIVideo/EIVideo/main.py:83-116": "Command-line Arguments Parser for Video Testing", + 
"/applications/EIVideo/EIVideo/paddlevideo/__init__.py": "PaddleVideo Library Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py": "Loading EIVideo Modules", + "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py": "Graceful Termination PaddleVideo Dataset Loader", + "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:1-31": "Building Pipeline with PaddleVideo", + "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:107-134": "Mix Collate Function for Stacked Batches", + "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:135-151": "Signal Handler Setup for Process and Group", + "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:32-80": "Dataset Loader Builder", + "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py:81-106": "Paddle Dataloader Builder", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py": "EIVideo Image Preprocessing Pipeline", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py": "Flexible Pipeline Transformation with Compose", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:1-31": "Compose Class for Pipeline Composition", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:32-59": "Compose Class Sequential Transform Composition", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py:60-76": "Compose Pipeline Class", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py": "Paddle Video Image Preprocessing", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:1-43": "RandomScale\\_manet Pipeline", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:111-134": "Cropped Labels from Image", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:135-163": "Random Region Flipping Transform", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:164-198": "Custom Image Transforms for PaddlePipelines", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:199-220": "Scribble Image to Foreground Mask", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:46-75": "Resize Image Pipeline", + "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py:76-109": "Custom Image Crop Transform", + "/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py": "Organizing PaddleVideo Functionalities with Registries", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py": "PaddleVideo Metrics Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py": "Abstract Base Metric Class for PaddleVideo's EIVideo", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py": "Apache-Licensed EIVideo Metric Builder", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py": "Registry-Based Metric Management", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py": "VOS Metric: Video Object Segmentation", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:1-38": "VOS Metric Class Registration", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:115-131": "Video Object Segmentation Metric", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:133-148": "Data Augmentation and Label Averaging", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:149-168": 
"Frame-wise Flipped Label Generation", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:169-192": "Average Time per Frame Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:193-211": "Frame Rate Metrics and Tensor Manipulation", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:212-224": "Tracking Sequence Numbers", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:225-238": "Range of Indices in Code", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:239-252": "VOS Metric Sequence", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:253-271": "Zip Folder and Image Mask Functions", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:272-279": "Zipping Metrics and Saving Results", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:39-67": "Video Processing Class Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:68-90": "VOS Metric Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py:91-114": "Embedding Preparation for EIVideo", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py": "PaddleVideo Model Registry", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py:1-23": "PaddleVideo Import Script", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py:24-27": "PaddleVideo Library Variables and Functions", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py": "DeepLab Import Statement", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py": "ASPP-MANET Backbone Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:1-32": "ASPP Layer Implementation in ASV Model", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:118-124": "ASPP Model Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:33-62": "ASPP Network Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:63-85": "ASPP Module with Dilation and Pooling", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py:86-117": "ASPP-MANET Backbone Class Definition", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py": "Manet Decoder Layer with Conv, BatchNorm, ReLU", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:1-30": "Manet Decoder Class", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:31-59": "Manet Decoder Block: Conv-BN-ReLU", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py:60-65": "Manet Decoder Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py": "Static BatchNorm2d and DeepLab Backbone", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:1-26": "Frozen Batch Normalization Layer", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:27-61": "DeepLab Network Backbone", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py:62-90": "DeepLab Model Creation and Evaluation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py": "ResNet-MANET Model Coding", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:1-31": "Bottleneck ResNet Definition", + 
"/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:102-127": "ResNet-MANET Model Creation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:128-159": "Create ResNet Residual Block with Downsampling", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:160-191": "ResNet-MANET Backbone Model", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:192-227": "ResNet101 BatchNorm Backbone", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:230-245": "ResNet101 Model JSONizer", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:32-75": "ResNet: Efficient Video Backbone", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py:76-101": "ResNet-MANET Backbone Builder", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py": "PaddleVideo Model Builder", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:1-19": "Model Registration and Building Utilities", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:117-125": "Video Analysis Framework Builder", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:22-73": "Video Processing Model Components Builder", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py:74-116": "Model Builder: Configurable PaddleVideo Models", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py": "PaddleVideo Framework: BaseSegment & Manet Definitions", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py": "Python Segment Framework Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py": "Semi-Video Segmentation Base Class", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:2-30": "Semi-Video Object Segmentation Base Class", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:31-59": "Model Initialization and Processing", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py:60-95": "Abstract Step Methods for Video Modeling", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py": "Manet Stage 1 Video Segmentation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:1-26": "MANET Model Imports for Video Tasks", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:110-134": "Manet Stage 1: Initialization and Embeddings", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:136-157": "Manet Stage1: Batch Image Transformation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:158-176": "Reference Frame Embedding Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:177-195": "Save Interactive Scribble Image", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:196-216": "Scribble-Based Mask Generation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:217-234": "Manet Stage 1 Segmentation Model Iteration", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:235-254": "Temp Dictionary Creation for Labels", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:255-274": 
"Scribble-based Labeling for Video Annotation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:27-61": "Manet Model Definition and Implementation Plan", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:275-292": "Local and Global Map Calculation for Segmentation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:293-310": "Manet Segment Annotation Check", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:311-327": "Mask Creation and Storage in Video Model", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:328-347": "Save and Propagate Frames", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:348-365": "Manet Segmentation Model Function", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:366-383": "Dynamic SegHead Model for Video Prediction", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:384-402": "Image Segmentation Model Predictions", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:403-417": "Auto-Segmentation Framework", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:62-87": "Model Initialization and Evaluation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py:88-109": "Model State Check and Segmentation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py": "IntVOS: Nearest Neighbor Attention for Video Segmentation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:1-37": "Pairwise Squared L2 Distance Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:114-134": "Nearest Neighbor Feature Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:135-158": "Split and Apply Chunks", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:159-181": "Nearest Neighbor Features Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:182-201": "Nearest Neighbor Calculator", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:202-224": "Nearest Neighbor Tensor Reshaping", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:225-252": "Squared L2 Distance Calculator", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:253-278": "Nearest Neighbor Features for Video Matching", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:279-298": "Nearest Neighbor Feature Extraction", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:300-325": "Local Distance and Offset Masks Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:326-358": "Feature Extraction and Masking for IntVOS", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:359-390": "Convolutional Neural Network Architecture for Image Processing", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:38-60": "Pairwise Distance Calculation for Nearest Neighbor Attention", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:391-418": "Custom CNN Layer for Image Feature Extraction", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:421-442": "Split Separable Conv2D Layer", + 
"/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:443-488": "Dynamic Segmentation Architecture", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:489-506": "IntVOS Class Definition", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:507-530": "Dynamic Semantic Segmentation Network Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:531-559": "IntVOS Model Head: Loss Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:560-588": "Split-Apply-Combine: Prop SegHead Function", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:589-622": "IntVOS: Prop Segmentation Head", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:61-83": "Nearest Neighbor Distance Calculator", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:623-646": "IntVOS: Feature Embedding Function", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:647-665": "Nearest Neighbor Features Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:667-687": "Sequence Name Check and Update", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:688-707": "Nearest Neighbor Features for Previous Frame", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:708-724": "Map Dictionaries Check", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:725-741": "InterVOS Frame Processing", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:742-763": "Video Modeling: Local Maps and Interaction Numbers", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:764-787": "Defining Int_seghead Function", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:788-813": "Local Distance Map Calculation in IntVOS", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:814-832": "Update Global Map with Nearest Neighbors", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:833-854": "Updating Global and Local Maps in Video Model", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:84-113": "Nearest Neighbor Features Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:856-878": "Dynamic Object Scene Embeddings", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py:879-893": "Segmentation Prediction in IntVOS", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py": "Copyright and Imports in PaddleVideo Heads", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py": "Registry Management in PaddleVideo's EIVideo", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py:1-27": "Video Component Registry in PaddleVideo's EIVideo", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py:28-31": "Four Model Registries for Efficient Video Processing", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py": "Customizable PaddlePaddle Weight Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:1-36": "Weight Initialization Functions", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:131-157": "Initialize Weights with Normal Distribution", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:158-158": "Random Weight Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:37-66": 
"Truncated Normal Weight Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:68-98": "Truncated Normal Weight Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py:99-130": "Truncated and Kaiming Normal Weight Init", + "/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py": "Importing Test Model Function", + "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py": "Multi-Card Model Testing", + "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py:1-31": "Model Testing without Gradient" +} \ No newline at end of file diff --git a/docs/data/titles/1.json b/docs/data/titles/1.json new file mode 100644 index 000000000..2937b1524 --- /dev/null +++ b/docs/data/titles/1.json @@ -0,0 +1,302 @@ +{ + "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py:32-39": "Multi-card Test Configuration", + "/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py": "PaddleVideo Library Utilities", + "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py": "Building Objects with Config and Registry", + "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:1-31": "Build Object Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py:32-35": "Build and Validate Object Class", + "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py": "Config Parser and Checker", + "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:1-34": "Config Utilities Setup", + "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:110-139": "Config Override Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:140-170": "Dynamic Config Overrides", + "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:171-174": "Check and Print Config", + "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:35-67": "Config Parser and Printer", + "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py:68-109": "Config Utilities and Visualization Functions", + "/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py": "Distributed Computing Utilities", + "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py": "Customizing PaddleVideo Logging", + "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:1-38": "Colorful Logger: Setting Up Colors for Logging Messages in PaddleVideo", + "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:101-113": "Initialize and Set Logger Level", + "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:39-71": "PaddleVideo Logger Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py:72-100": "Custom Logger Configuration for Python", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py": "OpenCV-Powered PyTorch Image Processing", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1-28": "Define Paddle Tensor Type Hints", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1000-1028": "Manet Initializer Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1029-1060": "Xavier Uniform Initialization for Tensors", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:106-146": "Morphology-Based Mask Overlay", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1062-1089": "Xavier Initialization for Tensors", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1091-1120": "Xavier Normal Distribution Initialization in PyTorch", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1121-1145": "Kaiming 
Uniform Initialization Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1146-1170": "Kaiming Normal Tensor Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1171-1196": "Orthogonal Matrix and Kaiming Initialization Functions", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1197-1233": "Sparsity-Ensured Normal Tensor Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1234-1268": "Sparsity-Init Torch Tensor\n(or)\nSparse Torch Tensor Init\n(or)\nTorch Tensor Sparse Initializer", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:1269-1295": "Deprecating Init Methods: Torch.nn to PaddleVideo", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:147-172": "Generate Overlay Images and JSON List", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:173-206": "Video Frame Loading and Labeling Utility", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:207-236": "Efficient Scribble Label Processing Functions", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:239-272": "Load and Save Pretrained Model Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:273-304": "Damage Masks Generator", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:29-41": "Unstructured Numeric Sequence", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:305-330": "Mask Damage Utility", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:331-361": "Randomly Shifting Numpy Mask", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:362-388": "Randomly Manipulating Binary Masks for AI Robustness", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:389-422": "Binary Mask Rotation and Scaling", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:42-54": "Extract Integer List", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:423-466": "PaddleVideo Utilities", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:467-500": "Tensor to PIL Image Conversion", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:501-529": "Adjusting Image Format for Compatibility", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:530-553": "Image Mode Validator", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:55-67": "List of Integers: Purpose Unclear", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:554-578": "Compatibility Checker", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:580-615": "Paddle-Torch Dictionary Converter", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:616-640": "Gradient Norm Clipping Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:641-666": "Total Norm of Parameters Calculation", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:667-682": "Manet Gradient Scaling", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:68-105": "Masked Damager Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:683-716": "Max Index Gathering with PaddlePaddle", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:717-745": "Tensor Sampling and Reshaping", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:746-774": "Tensor Initialization Functions", + 
"/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:775-806": "Normalizing Tensor with PyTorch's Paddle", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:807-833": "Recommended Gain for Nonlinearity Functions", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:834-859": "Nonlinearity Mapping Function", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:862-895": "Uniform and Normal Tensor Initialization Functions", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:896-923": "Truncated Normal and Constant Tensor Filling", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:924-966": "Tensor Filling Methods: Constant, Ones, Zeros, and Identity Matrix", + "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py:968-998": "Dirac and Identity Tensor Initializers", + "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py": "Precise Batch Normalization Improvement", + "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:1-30": "Improved Batch Norm in EIVideo", + "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:31-54": "Precise Batch Normalization Statistics", + "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:55-80": "Precise BN Training Algorithm", + "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py:81-84": "Updating BatchNorm Layers in Model Training", + "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py": "PaddlePaddle Profiler Initialization", + "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:1-29": "Global Variables and Profiler Options", + "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:105-110": "Profiler Step Incrementer", + "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:30-52": "Profiler Options Class", + "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:53-76": "Profile Parser from String", + "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py:77-104": "Operator-Level Profiling with PaddlePaddle's Profiler", + "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py": "PaddleVideo Record: Metrics Tracking and Logging", + "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:1-32": "Metrics Logger for Paddle Video", + "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:114-141": "Epoch Metrics Logger", + "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:142-157": "Epoch Metrics Logger", + "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:33-49": "Framework-Specific Metric Recording", + "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:50-72": "Average Meter for Metrics Tracking", + "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py:73-113": "Tracking Metrics Class", + "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py": "Registry: Name-to-Object Mapping and Registration", + "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:1-34": "Registry Class for Custom Modules", + "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:36-70": "Object Registry Manager", + "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py:71-96": "Registry Class and Methods", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py": "ViT Adaptor with PaddlePaddle Compatibility", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:1-28": "ViT Model Adaptation for Existing Architecture", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:128-152": "Resnet and Vision Transformer 
Weights Loader", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:153-182": "Load and Save PaddlePaddle Models", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:29-49": "Maintaining 'pos_embed' Tensor Consistency", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:51-71": "Adjusting Time Embedding Shape", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:72-96": "Temporal State Dictionary Merge", + "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py:97-127": "Loading Pre-trained Model Parameters", + "/applications/EIVideo/EIVideo/paddlevideo/version.py": "PaddleVideo Version Info", + "/applications/EIVideo/EIVideo/setup.py": "Code Credits and Sources", + "/applications/EIVideo/EIVideo/version.py": "EIVideo Version Information", + "/applications/EIVideo/QEIVideo/__init__.py": "QEIVideo Path and Version", + "/applications/EIVideo/QEIVideo/build_gui.py": "Video GUI with PyQt5: Functionality Overview", + "/applications/EIVideo/QEIVideo/build_gui.py:1-36": "PyQt5 GUI Builder Script", + "/applications/EIVideo/QEIVideo/build_gui.py:109-135": "GUI Application Functions", + "/applications/EIVideo/QEIVideo/build_gui.py:136-151": "Update Frame in QEIVideo GUI", + "/applications/EIVideo/QEIVideo/build_gui.py:37-59": "Progress Bar and Play Button Functionality", + "/applications/EIVideo/QEIVideo/build_gui.py:60-78": "Interactive Video Controls: Stop, Start, Select", + "/applications/EIVideo/QEIVideo/build_gui.py:79-107": "Video Processing GUI with Eraser Mode", + "/applications/EIVideo/QEIVideo/gui/__init__.py": "PaddleVideo's EIVideo Copyright Comment Block", + "/applications/EIVideo/QEIVideo/gui/demo.py": "DrawFrame Class for QT UI", + "/applications/EIVideo/QEIVideo/gui/demo.py:1-36": "Interactive QWidget Drawing Class", + "/applications/EIVideo/QEIVideo/gui/demo.py:39-62": "DemoUI Frame Drawing Initialization", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py": "EIVideo App UI Initialization", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:1-32": "Video Application Main Window Initialization", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:102-122": "Video Player UI Creation and Interaction", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:123-142": "Initializing Push Buttons and Layouts", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:143-164": "Creating App's Main UI", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:165-167": "GUI Element Updates in MainWindow", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:33-56": "UI Initialization in Video App", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:57-77": "GUI Setup for EIVideo", + "/applications/EIVideo/QEIVideo/gui/ui_main_window.py:78-101": "Painting App UI Setup", + "/applications/EIVideo/QEIVideo/start.py": "Launch QEIVideo GUI with Python", + "/applications/EIVideo/QEIVideo/tools/__init__.py": "PaddleVideo QEIVideo Comment Block", + "/applications/EIVideo/QEIVideo/ui/__init__.py": "EIVideo UI Init File Comment", + "/applications/EIVideo/QEIVideo/ui/demo.py": "Interactive PyQt5 Video Player UI", + "/applications/EIVideo/QEIVideo/ui/demo.py:1-25": "PyQt5 UI Generated Main Window Code", + "/applications/EIVideo/QEIVideo/ui/demo.py:26-41": "Video Player Interface Setup", + "/applications/EIVideo/QEIVideo/ui/demo.py:42-58": "Creating Video Player Buttons and Slider", + "/applications/EIVideo/QEIVideo/ui/demo.py:59-77": "GUI Layout for Video Player Application", + "/applications/EIVideo/QEIVideo/ui/demo.py:78-97": 
"Creating Tab Widget with QProgressBar and QLabel", + "/applications/EIVideo/QEIVideo/ui/demo.py:98-113": "QEIVideo UI Configuration", + "/applications/EIVideo/QEIVideo/version.py": "EIVideo Version Info", + "/applications/EIVideo/QEIVideo/widget/PaintBoard.py": "PaintBoard: QWidget for Drawing & Erasing", + "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:1-40": "PaintBoard: Custom QWidget for Graphic Editing", + "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:42-78": "PaintBoard Class Functions", + "/applications/EIVideo/QEIVideo/widget/PaintBoard.py:80-106": "Mouse Event Handler for PaintBoard Drawing", + "/applications/EIVideo/README.md": "EIVideo: Windows Video Annotation Tool", + "/applications/EIVideo/README.md:1-15": "Interactive Intelligent Video Annotation Tool", + "/applications/EIVideo/README.md:121-123": "Emoji and Resource Sources", + "/applications/EIVideo/README.md:16-49": "Interactive Video Annotation Toolbox", + "/applications/EIVideo/README.md:51-85": "Introducing EIVideo: Customizable Interactive Video Annotation", + "/applications/EIVideo/README.md:86-119": "QEIVideo Installation and Roadmap Guide", + "/applications/EIVideo/resources/QT/demo.ui": "Qt Video Demo UI Designer", + "/applications/EIVideo/resources/QT/demo.ui:1-44": "Main Window Interface Design", + "/applications/EIVideo/resources/QT/demo.ui:126-169": "User Interface Layout Design", + "/applications/EIVideo/resources/QT/demo.ui:170-212": "Qt UI Layout Design", + "/applications/EIVideo/resources/QT/demo.ui:213-236": "Qt Application User Interface Layout: Demo.ui", + "/applications/EIVideo/resources/QT/demo.ui:45-86": "UI Design: Video Open Button", + "/applications/EIVideo/resources/QT/demo.ui:87-125": "UI Design with Chinese Buttons", + "/applications/EIVideo/resources/cmd": "Updating EIVideo: PaddleGit Operations", + "/applications/FightRecognition/README.md": "Fight Recognition Model Guide", + "/applications/FightRecognition/README.md:1-29": "Fight Recognition with PaddleVideo PP-TSM", + "/applications/FightRecognition/README.md:119-160": "Train and Validate Video Lists Generation", + "/applications/FightRecognition/README.md:162-192": "Cut Video Function", + "/applications/FightRecognition/README.md:193-245": "End of Model Training Code Snippet", + "/applications/FightRecognition/README.md:246-248": "Loading and Saving Pre-Trained Model", + "/applications/FightRecognition/README.md:31-55": "Python Script Executes Fight Prediction Model", + "/applications/FightRecognition/README.md:56-75": "Fight Detection Datasets and Training Approach", + "/applications/FightRecognition/README.md:77-118": "Multi-Dataset Fight Recognition Tool", + "/applications/FigureSkating/README.md": "OpenPose for Figure Skating Analysis", + "/applications/FigureSkating/README.md:1-46": "Figure Skating Action Data Processing with OpenPose", + "/applications/FigureSkating/README.md:48-92": "Training Figure Skating Models with Video Data", + "/applications/FootballAction/README.md": "FootballAction Model Improvements in PaddleVideo", + "/applications/FootballAction/README.md:1-54": "Soccer Action Detection Algorithm in PaddleVideo", + "/applications/FootballAction/README.md:119-141": "Comprehensive FootballAction Dataset Directory", + "/applications/FootballAction/README.md:142-201": "Download, Run and Train PP-TSM for FootballAction", + "/applications/FootballAction/README.md:202-230": "Updating Recognizer2D and Exporting PP-TSM Model", + "/applications/FootballAction/README.md:231-275": "Replacing Output Tensor and 
Extracting Features", + "/applications/FootballAction/README.md:276-320": "BMN Dataset Creation Script", + "/applications/FootballAction/README.md:321-362": "BMN Model Export and Prediction", + "/applications/FootballAction/README.md:363-408": "Attention LSTM Improvements in FootballAction", + "/applications/FootballAction/README.md:409-441": "LSTM Training Data Snippet", + "/applications/FootballAction/README.md:442-493": "LSTM Training and Prediction Code", + "/applications/FootballAction/README.md:494-513": "Improved PP-TSM Model for Football Action Detection", + "/applications/FootballAction/README.md:55-118": "Football Action Dataset Preprocessing", + "/applications/FootballAction/checkpoints/download.sh": "FootballAction Checkpoints Download Script", + "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list": "EuroCup2016 Video Dataset URLs", + "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:1-11": "EuroCup2016 Dataset URLs", + "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:12-22": "EuroCup2016 Dataset Download URLs", + "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:23-33": "EuroCup2016 Video URLs List", + "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:34-44": "EuroCup2016 Dataset URL Listing", + "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list:45-49": "EuroCup2016 Video URLs", + "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh": "Download EuroCup2016 Videos", + "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:1-13": "Download EuroCup2016 Videos", + "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:14-24": "EuroCup2016 Video Download Script", + "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:25-35": "EuroCup2016 Video Download", + "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:36-46": "EuroCup2016 Video Download Script", + "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh:47-51": "Download EuroCup2016 Mp4 Files", + "/applications/FootballAction/datasets/EuroCup2016/url.list": "EuroCup2016 Video URLs", + "/applications/FootballAction/datasets/EuroCup2016/url.list:1-26": "FootballAction EuroCup2016 URL List", + "/applications/FootballAction/datasets/EuroCup2016/url.list:27-49": "EuroCup2016: FootballAction URLs List", + "/applications/FootballAction/datasets/EuroCup2016/url_val.list": "Video URL List for EuroCup2016", + "/applications/FootballAction/datasets/script/get_frames_pcm.py": "Parallel FFmpeg Frame and Audio Extraction", + "/applications/FootballAction/datasets/script/get_frames_pcm.py:1-37": "Extract Frames and PCM Audio from Videos", + "/applications/FootballAction/datasets/script/get_frames_pcm.py:38-54": "Multithreaded MP4 Parser", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py": "BMN Instance Extraction Script", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:1-42": "BMN GT Data Processor Script", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:103-128": "Segmenting Actions with Before/After IDs", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:129-147": "Random Video Segment Selection and Annotation", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:148-178": "Saving Features with get_instance_for_bmn", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:180-205": "Reshaping and 
Concatenating Feature Arrays", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:206-216": "BMN Data Processing Pipeline", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:43-69": "Filtering Actions by Duration", + "/applications/FootballAction/datasets/script/get_instance_for_bmn.py:70-102": "BMN Window GT Data Combination", + "/applications/FootballAction/datasets/script/get_instance_for_lstm.py": "Python Script for Football Dataset Preparation", + "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:1-44": "IoU/IOA Calculator for LSTM Models", + "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:111-130": "Splitting Datasets for Football Actions", + "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:132-161": "Save Video Features and Labels to Files", + "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:162-172": "Label File Processing Script", + "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:45-80": "IoU and IOA Comparison Tool", + "/applications/FootballAction/datasets/script/get_instance_for_lstm.py:82-110": "Evaluate Proposals with IoU Threshold", + "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py": "Action Detection and Dataset Generation", + "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:1-38": "Action Instance Extractor", + "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:39-65": "Generating Positive and Negative Action Instances", + "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:66-96": "Multiprocessing Dataset Instantiation and Saving", + "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py:97-97": "File Path for Validation List", + "/applications/FootballAction/extractor/extract_bmn.py": "Video Classification and Detection Script", + "/applications/FootballAction/extractor/extract_bmn.py:1-49": "Video Classification Model with Baidu Cloud", + "/applications/FootballAction/extractor/extract_bmn.py:50-83": "Video Feature Extraction and Proposal Prediction", + "/applications/FootballAction/extractor/extract_bmn.py:84-91": "JSON Proposal Saver", + "/applications/FootballAction/extractor/extract_feat.py": "Baidu Cloud Model-based Video Classifier", + "/applications/FootballAction/extractor/extract_feat.py:1-50": "Baidu Cloud Action Video Classifier", + "/applications/FootballAction/extractor/extract_feat.py:51-74": "Video Feature Extraction and Conversion", + "/applications/FootballAction/extractor/extract_feat.py:75-100": "Video Feature Extractor and Classifier", + "/applications/FootballAction/predict/action_detect/action.py": "Baidu Cloud Action Detection System using ML", + "/applications/FootballAction/predict/action_detect/action.py:1-44": "Baidu Action Detection System", + "/applications/FootballAction/predict/action_detect/action.py:104-132": "Action Detection Methods and Tracking in Football", + "/applications/FootballAction/predict/action_detect/action.py:133-151": "Configure PPTSM Model and Predict Features", + "/applications/FootballAction/predict/action_detect/action.py:152-173": "Video Feature Processing for Action Detection", + "/applications/FootballAction/predict/action_detect/action.py:45-71": "Initialize ModelPredict Object", + "/applications/FootballAction/predict/action_detect/action.py:72-103": "Action Detection Model Initialization", + "/applications/FootballAction/predict/action_detect/logger.py": 
"Custom Logger for News Stripper", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py": "MFCC-based Action Detection in Football", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:1-38": "MFCC Feature Extraction Algorithm", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:112-136": "MFCC-based Audio Feature Extraction", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:137-157": "Audio Feature Extraction for Action Detection", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:159-182": "Audio Feature Extraction from WAV Files", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:39-69": "Mel Scale Audio Feature Extraction", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:70-90": "Extract MFCC Features from Speech Audio", + "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py:91-111": "Spectrogram Calculator Function", + "/applications/FootballAction/predict/action_detect/mfcc/model_config.py": "ModelAudio: Extract, Slice, Predict", + "/applications/FootballAction/predict/action_detect/mfcc/vgg_params.py": "VGGish Model Parameters and Configurations", + "/applications/FootballAction/predict/action_detect/models/audio_infer.py": "Audio Inference with InferModel", + "/applications/FootballAction/predict/action_detect/models/bmn_infer.py": "BMN Infer Action Detection", + "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:1-37": "BMN Infer App Class Definition", + "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:112-131": "Average-Window Action Detection", + "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:133-156": "BMN Inference & JSON Saving", + "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:38-63": "BMN Inference Process", + "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:64-86": "Boundary Score Calculator", + "/applications/FootballAction/predict/action_detect/models/bmn_infer.py:87-111": "Boundary-Based Mask Selection", + "/applications/FootballAction/predict/action_detect/models/lstm_infer.py": "Efficient LSTM Football Action Prediction", + "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:1-36": "Football Action Inference with PaddlePaddle", + "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:111-137": "Initialize InferModel and Load Data", + "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:138-152": "Efficient Action Detection", + "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:37-61": "LSTM Model for Video Action Detection", + "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:62-91": "LSTM Data Processing and Prediction", + "/applications/FootballAction/predict/action_detect/models/lstm_infer.py:92-110": "LSTM Inferencing for Action Detection", + "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py": "PPTSM Inference for Football Actions", + "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:1-38": "PPTSM Model Inference Class", + "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:40-67": "PPTSM Inference Script", + "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py:69-78": "Football Action Prediction Model", + 
"/applications/FootballAction/predict/action_detect/reader/__init__.py": "Alphabetical Action Readers", + "/applications/FootballAction/predict/action_detect/reader/audio_reader.py": "AudioReader for YouTube-8M Dataset", + "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py": "BMNINF Reader for Football Action Detection", + "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:1-49": "BMNINF Reader: FootballAction Data Reader", + "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:106-138": "BMNINF Reader Function", + "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:139-155": "Video Batch Reader for Football Action Detection", + "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:50-73": "Bmninf Reader Initialization", + "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py:74-105": "Football Action Detection Reader", + "/applications/FootballAction/predict/action_detect/reader/feature_reader.py": "Attention-Based LSTM Feature Reader", + "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:1-33": "Attention-Based LSTM Feature Reader", + "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:35-71": "Feature Reader Initialization", + "/applications/FootballAction/predict/action_detect/reader/feature_reader.py:72-86": "Multi-Feature Reader", + "/applications/FootballAction/predict/action_detect/reader/reader_utils.py": "Video Reader Utils", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py": "Threaded TSMINF Reader for Football Action Detection", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:1-38": "TSMINF Video Reader Class", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:120-141": "Image Data Inference and Transformation", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:143-180": "Image Transformation Function", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:181-212": "Image Preprocessing for Football Action Detection", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:213-242": "Random Crop with Offset Adjustment", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:243-267": "Random Crop Sizes for Action Detection", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:268-307": "Image Processing Functions", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:308-349": "Image Group Manipulation Techniques", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:350-357": "Adaptive Image Resizer", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:39-64": "TSN Video Reader Initialization", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:65-97": "Video Image Batch Reader for Inference", + "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py:98-119": "Multithreaded Video Image Reader", + "/applications/FootballAction/predict/action_detect/utils/config_utils.py": "Config Utils for Basketball Action", + "/applications/FootballAction/predict/action_detect/utils/preprocess.py": "FFmpeg Tools for Video Processing", + "/applications/FootballAction/predict/action_detect/utils/process_result.py": "Action Detection with NMS Filtering", + 
"/applications/FootballAction/predict/eval.py": "Evaluating Model Performance with F1 Scores", + "/applications/FootballAction/predict/eval.py:1-36": "Initializing Ground Truth Data", + "/applications/FootballAction/predict/eval.py:121-144": "Box IOU Evaluator", + "/applications/FootballAction/predict/eval.py:146-161": "Subtask Precision and Recall Calculator", + "/applications/FootballAction/predict/eval.py:162-189": "FootballAction Prediction Evaluation", + "/applications/FootballAction/predict/eval.py:190-218": "Football Action Prediction Evaluation", + "/applications/FootballAction/predict/eval.py:219-237": "Optimal Threshold Selection", + "/applications/FootballAction/predict/eval.py:37-67": "IoU and Proposal Conversion Functions", + "/applications/FootballAction/predict/eval.py:68-93": "Filtered Boxes and Ground Truth Conversion", + "/applications/FootballAction/predict/eval.py:94-120": "Intersection over Union Evaluation Functions", + "/applications/FootballAction/predict/predict.py": "Football Action Detection Model Prediction", + "/applications/FootballAction/predict/predict.py:1-33": "Video Action Detection Script", + "/applications/FootballAction/predict/predict.py:35-37": "JSON Data Output in FootballAction App", + "/applications/Ma-Net/README.md": "MA-Net Model for PaddleVideo: DAVIS Dataset Training & Testing" +} \ No newline at end of file diff --git a/docs/data/titles/10.json b/docs/data/titles/10.json new file mode 100644 index 000000000..48efe694f --- /dev/null +++ b/docs/data/titles/10.json @@ -0,0 +1,302 @@ +{ + "/paddlevideo/modeling/backbones/agcn.py": "Adaptive Graph Convolutional Networks (AGCN) Backbone", + "/paddlevideo/modeling/backbones/agcn.py:1-27": "PaddlePaddle GCN Class Definition", + "/paddlevideo/modeling/backbones/agcn.py:111-128": "AGCN Backbone: Custom Normalization and Pooling", + "/paddlevideo/modeling/backbones/agcn.py:28-57": "3D Spatio-Temporal Convolutional Block", + "/paddlevideo/modeling/backbones/agcn.py:58-84": "GCN-TCN Residual Block Init.", + "/paddlevideo/modeling/backbones/agcn.py:87-110": "Adaptive Graph Convolutional Network (AGCN) Improvement", + "/paddlevideo/modeling/backbones/agcn2s.py": "AGCN2S Graph Convolutions in PaddlePaddle", + "/paddlevideo/modeling/backbones/agcn2s.py:1-32": "Temporal Convolutional Network Layer in PaddlePaddle", + "/paddlevideo/modeling/backbones/agcn2s.py:122-144": "Graph Class for NTURGB+D Dataset", + "/paddlevideo/modeling/backbones/agcn2s.py:146-176": "Adjacency Matrix Conversion Functions", + "/paddlevideo/modeling/backbones/agcn2s.py:177-212": "Graph Convolutional Neural Network Layer (GCNN)", + "/paddlevideo/modeling/backbones/agcn2s.py:213-229": "AGCN2S Transformation Layers", + "/paddlevideo/modeling/backbones/agcn2s.py:33-65": "AGCN Unit: Learning Spatio-Temporal Features", + "/paddlevideo/modeling/backbones/agcn2s.py:66-91": "AGCN2S Neural Network Backbone Definition", + "/paddlevideo/modeling/backbones/agcn2s.py:92-121": "AGCN-TS: Temporal Series Modeling with GCN and TCN", + "/paddlevideo/modeling/backbones/asrf.py": "ASRF: PaddleVideo Backbone Initiation", + "/paddlevideo/modeling/backbones/asrf.py:1-30": "Asrf Backbone Model Registration", + "/paddlevideo/modeling/backbones/asrf.py:33-65": "ASRF: Customizable Convolutional Backbone for CV", + "/paddlevideo/modeling/backbones/asrf.py:66-75": "ASRF Backbone Initialization and Forward Method", + "/paddlevideo/modeling/backbones/bmn.py": "BMN Backbone for Paddle Video", + "/paddlevideo/modeling/backbones/bmn.py:1-28": "Boundary-Matching Pair Mask 
Generator", + "/paddlevideo/modeling/backbones/bmn.py:104-137": "BMN: Backbone Model with ConvLayers", + "/paddlevideo/modeling/backbones/bmn.py:138-163": "Conv1D Block for BMN Model", + "/paddlevideo/modeling/backbones/bmn.py:164-189": "Initializing TEM and PEM Modules in Backbone Network", + "/paddlevideo/modeling/backbones/bmn.py:190-215": "BMN Backbone Model Initialization", + "/paddlevideo/modeling/backbones/bmn.py:216-246": "2D Conv Layers for BMSN Backbone", + "/paddlevideo/modeling/backbones/bmn.py:247-283": "Video Analysis Backbone Model: BMN", + "/paddlevideo/modeling/backbones/bmn.py:284-290": "Convolutional Neural Network Backbone", + "/paddlevideo/modeling/backbones/bmn.py:29-53": "Generating Sample Masks for Boundary-Matching Maps", + "/paddlevideo/modeling/backbones/bmn.py:54-77": "Video Frame Mask Generation Code", + "/paddlevideo/modeling/backbones/bmn.py:78-103": "BMN Layer for Temporal Action Proposal Generation", + "/paddlevideo/modeling/backbones/cfbi.py": "CFBI Model: FPN-DeepLab Backbone", + "/paddlevideo/modeling/backbones/cfbi.py:1-28": "FPN Layer Definition", + "/paddlevideo/modeling/backbones/cfbi.py:29-54": "CFBI Backbone Model Architecture", + "/paddlevideo/modeling/backbones/cfbi.py:56-84": "CFBI: DeepLab-FPN Backbone Model", + "/paddlevideo/modeling/backbones/cfbi.py:85-88": "CFBI: Multi-scale Feature Extraction", + "/paddlevideo/modeling/backbones/ctrgcn.py": "Introducing CTRGCN Backbone for Video Models", + "/paddlevideo/modeling/backbones/ctrgcn.py:1-31": "CtrGCN Backbone Setup", + "/paddlevideo/modeling/backbones/ctrgcn.py:128-155": "MultiScale Temporal Conv Layer", + "/paddlevideo/modeling/backbones/ctrgcn.py:156-182": "Conv-Temporal RGN Backbone: Video Analysis Model", + "/paddlevideo/modeling/backbones/ctrgcn.py:183-211": "Conv-Temporal Residual Group Convolutional Network Backbone", + "/paddlevideo/modeling/backbones/ctrgcn.py:212-250": "Temporal and Graph Convolutional Network Units", + "/paddlevideo/modeling/backbones/ctrgcn.py:251-276": "CTRGC Model Initialization", + "/paddlevideo/modeling/backbones/ctrgcn.py:277-306": "Adaptive CTR GCN Initialization", + "/paddlevideo/modeling/backbones/ctrgcn.py:307-335": "TCN-GCN Unit Definition", + "/paddlevideo/modeling/backbones/ctrgcn.py:32-66": "Defining CTRGC: Convolutional Layer", + "/paddlevideo/modeling/backbones/ctrgcn.py:336-363": "CTRGCN: Residual Graph Convolutional Network", + "/paddlevideo/modeling/backbones/ctrgcn.py:364-397": "Generating Adjacency Matrices for CTRGCN", + "/paddlevideo/modeling/backbones/ctrgcn.py:398-426": "CTRGCN: Skeleton Action Model", + "/paddlevideo/modeling/backbones/ctrgcn.py:427-455": "CTRGCN: TCN-GCN Model Initialization", + "/paddlevideo/modeling/backbones/ctrgcn.py:456-477": "Deep TCN-GCN Architecture for CTRGCN Model", + "/paddlevideo/modeling/backbones/ctrgcn.py:478-511": "TCN-GCN Neural Network Model", + "/paddlevideo/modeling/backbones/ctrgcn.py:512-514": "Final Neural Network Layer 10 Application", + "/paddlevideo/modeling/backbones/ctrgcn.py:67-93": "Convolutional Temporal RGPN Backbone", + "/paddlevideo/modeling/backbones/ctrgcn.py:94-127": "Temporal Convolution Backbone", + "/paddlevideo/modeling/backbones/darknet.py": "Darknet Backbone with ConvBNLayer", + "/paddlevideo/modeling/backbones/darknet.py:1-32": "Darknet ConvBN Layer Definition", + "/paddlevideo/modeling/backbones/darknet.py:116-129": "Darknet Backbone: ConvBNLayer Sequence", + "/paddlevideo/modeling/backbones/darknet.py:130-150": "Darknet Neural Network Backbone Design", + 
"/paddlevideo/modeling/backbones/darknet.py:151-165": "Darknet Convolutional Branching", + "/paddlevideo/modeling/backbones/darknet.py:33-61": "Darknet Convolutional Block with BN and Leaky ReLU", + "/paddlevideo/modeling/backbones/darknet.py:62-92": "Darknet Backbone with ConvBNLayer and MaxPooling", + "/paddlevideo/modeling/backbones/darknet.py:93-115": "Darknet Layer Transpose Dimensions", + "/paddlevideo/modeling/backbones/deeplab.py": "DeepLab Network Construction", + "/paddlevideo/modeling/backbones/deeplab.py:1-33": "Fixed Batch Normalization Layer", + "/paddlevideo/modeling/backbones/deeplab.py:131-152": "DeepLab Model Creation with Conv Layers and BatchNorm", + "/paddlevideo/modeling/backbones/deeplab.py:153-176": "DeepLab Backbone Classification Layer Design", + "/paddlevideo/modeling/backbones/deeplab.py:177-207": "DeepLab Module Creation Function", + "/paddlevideo/modeling/backbones/deeplab.py:208-240": "DeepLab ConvNet Function", + "/paddlevideo/modeling/backbones/deeplab.py:241-269": "DeepLab ASPP Model Extraction", + "/paddlevideo/modeling/backbones/deeplab.py:270-307": "DeepLab ASPP Module Initialization", + "/paddlevideo/modeling/backbones/deeplab.py:308-330": "Dynamic ASPP Modules in DeepLab Backbone", + "/paddlevideo/modeling/backbones/deeplab.py:332-363": "DeepLab Backbone for Image Segmentation", + "/paddlevideo/modeling/backbones/deeplab.py:34-59": "DeepLab Bottleneck Layer Initialization", + "/paddlevideo/modeling/backbones/deeplab.py:364-395": "DeepLab Decoder Class", + "/paddlevideo/modeling/backbones/deeplab.py:396-426": "DeepLab Model for Segmentation", + "/paddlevideo/modeling/backbones/deeplab.py:427-454": "DeepLab Model Implementation", + "/paddlevideo/modeling/backbones/deeplab.py:60-86": "Bottleneck Conv Neuron Layer for DeepLab", + "/paddlevideo/modeling/backbones/deeplab.py:87-130": "ResNet: Residual Blocks with Conv and BatchNorm", + "/paddlevideo/modeling/backbones/movinet.py": "MoViNet: Mobile Video Analysis Model", + "/paddlevideo/modeling/backbones/movinet.py:1-27": "MOViNet Configuration", + "/paddlevideo/modeling/backbones/movinet.py:122-147": "Conv3D Layer Creation", + "/paddlevideo/modeling/backbones/movinet.py:148-177": "ConvBlock3D: Causal Convolutional Module", + "/paddlevideo/modeling/backbones/movinet.py:178-196": "Conv Type Check and Initialization", + "/paddlevideo/modeling/backbones/movinet.py:197-216": "Defining Conv Layers in Movinet Backbone", + "/paddlevideo/modeling/backbones/movinet.py:217-238": "Convolutional Video Backbone", + "/paddlevideo/modeling/backbones/movinet.py:239-269": "Temporal Causal Average Pooling 3D", + "/paddlevideo/modeling/backbones/movinet.py:270-296": "CausalModule: Cumulative Sum and Activation Control", + "/paddlevideo/modeling/backbones/movinet.py:28-55": "MobileNetV2 Architecture Defined", + "/paddlevideo/modeling/backbones/movinet.py:297-322": "SqueezeExcitation Layer Class", + "/paddlevideo/modeling/backbones/movinet.py:323-347": "Scale-Aware Spatial Pyramid Pooling", + "/paddlevideo/modeling/backbones/movinet.py:350-382": "BasicBneck Neural Network Layer", + "/paddlevideo/modeling/backbones/movinet.py:383-404": "3D ConvBlock for MoviNet Backbone", + "/paddlevideo/modeling/backbones/movinet.py:405-427": "ConvBlock3D Creation: Stride, Channels and Causal Convolution", + "/paddlevideo/modeling/backbones/movinet.py:428-464": "MoViNet: Video Backbone Model", + "/paddlevideo/modeling/backbones/movinet.py:465-487": "MOViNet Model Definition", + "/paddlevideo/modeling/backbones/movinet.py:488-510": "MOViNet 
Customizable Model Creation", + "/paddlevideo/modeling/backbones/movinet.py:511-539": "MoviNet 3D CNN Backbone", + "/paddlevideo/modeling/backbones/movinet.py:541-572": "MoviNet Backbone Class", + "/paddlevideo/modeling/backbones/movinet.py:56-94": "Conv2dBNActivation Layer for MoviNet", + "/paddlevideo/modeling/backbones/movinet.py:573-574": "Movinet 3D Causal Instance Generation", + "/paddlevideo/modeling/backbones/movinet.py:95-121": "Convolutional Neural Network Layers with Batch Normalization", + "/paddlevideo/modeling/backbones/ms_tcn.py": "Kaiming Uniform Initialization for MSTCN Backbone", + "/paddlevideo/modeling/backbones/ms_tcn.py:1-32": "MS TCN Initialization", + "/paddlevideo/modeling/backbones/ms_tcn.py:101-132": "Dilated Residual Layers in MSTCN Backbone", + "/paddlevideo/modeling/backbones/ms_tcn.py:133-154": "MS TCN Model Initialization", + "/paddlevideo/modeling/backbones/ms_tcn.py:34-68": "Kaiming Uniform Initialization in MS-TCN", + "/paddlevideo/modeling/backbones/ms_tcn.py:69-100": "SingleStage MS-TCN Model", + "/paddlevideo/modeling/backbones/pptsm_mv2.py": "MobileNetV2 Backbones for PaddlePaddle", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:1-30": "MobileNetV2 Backbone Code", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:105-132": "Inverted Residual Blocks for PPTSM MV2", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:133-151": "PPTSM_MV2 Residual Units Creation", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:152-187": "PPTSM-MV2 and MobileNet Model", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:188-207": "PPTSM-MV2 Backbone Initialization", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:208-232": "PPTSM-MV2 Backbone Implementation", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:233-266": "PPTSM MobileNetV2 Model Initialization", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:267-282": "Scaled MobileNet Functions in PaddleVideo", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:32-58": "PaddlePaddle MobileNetV2: ConvBNLayer", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:59-85": "Inverted Residual Unit Class", + "/paddlevideo/modeling/backbones/pptsm_mv2.py:86-103": "Initializing and Defining Convolutional Layers in PPTSM-MV2 Backbone", + "/paddlevideo/modeling/backbones/pptsm_mv3.py": "PPTSM-Mv3 Backbone in PaddleVideo", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:1-28": "PaddleVideo: PPTSM-MV3 Backbone", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:119-142": "Configurable MobileNetV3 Model Function", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:143-168": "PPTSM-MV3 Backbone Architecture", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:169-194": "PPTSM-MV3 Model Architecture", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:195-222": "PPTSM_MV3 Neural Network Model", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:223-258": "ConvBNLayer: Video Classification Backbone", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:259-292": "ResidualUnit: Expand Conv Layer", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:293-312": "PPTSM_MV3 Block Design", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:30-52": "MobileNetV3 Backbones: Stages and URLs", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:314-348": "PPTSM-MV3 Backbone: Temporal Shifting and SE Module", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:349-376": "Convolutional Neural Network Layer for PPTSM-MobileNetV3_small_x1_0", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:377-405": "Create MobileNetV3 Models via PaddlePaddle", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:406-408": 
"PPTSM-MV3 Backbone Instance Creation", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:53-79": "PPTSM-Mv3 Backbone Versions", + "/paddlevideo/modeling/backbones/pptsm_mv3.py:80-118": "MobileNetV3: Custom PyTorch Layer Definition", + "/paddlevideo/modeling/backbones/pptsm_v2.py": "PPTSMv2 Video Backbone Python Module", + "/paddlevideo/modeling/backbones/pptsm_v2.py:1-27": "PaddlePaddle Neural Network Backbone Module", + "/paddlevideo/modeling/backbones/pptsm_v2.py:128-161": "Depthwise Separable Conv Layer Initialization", + "/paddlevideo/modeling/backbones/pptsm_v2.py:162-185": "PPTSM Backbone Model Initialization", + "/paddlevideo/modeling/backbones/pptsm_v2.py:186-209": "Downsample Convolution Layer with SE Module", + "/paddlevideo/modeling/backbones/pptsm_v2.py:210-238": "PPTSM_v2 Backbone: Convolutional Deep Learning Model", + "/paddlevideo/modeling/backbones/pptsm_v2.py:239-269": "PPTSM_V2 Backbone Implementation", + "/paddlevideo/modeling/backbones/pptsm_v2.py:270-303": "PPTSM_v2_LCNet: A Backbone Neural Network", + "/paddlevideo/modeling/backbones/pptsm_v2.py:29-64": "PPLCNetV2 Backbone for Video Processing", + "/paddlevideo/modeling/backbones/pptsm_v2.py:304-324": "PPTSM-v2 Backbone Model: DepthwiseSeparable Stages", + "/paddlevideo/modeling/backbones/pptsm_v2.py:325-347": "PPTSM_V2 Backbone: PaddleVideo Model", + "/paddlevideo/modeling/backbones/pptsm_v2.py:348-372": "PPTSM_v2 Backbone: Weights and Efficiency", + "/paddlevideo/modeling/backbones/pptsm_v2.py:373-405": "PPTSM_v2 Backbone: Video Analysis Model", + "/paddlevideo/modeling/backbones/pptsm_v2.py:65-96": "PPTSMV2: ConvBN Encoder with Attention", + "/paddlevideo/modeling/backbones/pptsm_v2.py:97-127": "P3-SE Module: Conv2D, BatchNorm2D, ReLU, SEModule", + "/paddlevideo/modeling/backbones/resnet.py": "Dynamic ResNet Backbone Model", + "/paddlevideo/modeling/backbones/resnet.py:1-34": "ConvBN Layer Class in ResNet", + "/paddlevideo/modeling/backbones/resnet.py:112-143": "ResNet Block Creation in PyTorch", + "/paddlevideo/modeling/backbones/resnet.py:144-178": "ResNet Backbone Model Definition", + "/paddlevideo/modeling/backbones/resnet.py:179-208": "ResNet Class Definition", + "/paddlevideo/modeling/backbones/resnet.py:210-229": "Dynamic ResNet Bottleneck Blocks", + "/paddlevideo/modeling/backbones/resnet.py:230-252": "Defining ResNet Model Layers", + "/paddlevideo/modeling/backbones/resnet.py:253-268": "Pretrained ResNet Loading Path", + "/paddlevideo/modeling/backbones/resnet.py:270-283": "ResNet Forward Function Definition", + "/paddlevideo/modeling/backbones/resnet.py:35-58": "ConvBNLayer Custom Layer", + "/paddlevideo/modeling/backbones/resnet.py:59-89": "ResNet Module with BN and Activation", + "/paddlevideo/modeling/backbones/resnet.py:90-111": "ResNet Backbone: ConvBNLayer Creation", + "/paddlevideo/modeling/backbones/resnet3d.py": "3D ResNet Model in PaddleVideo", + "/paddlevideo/modeling/backbones/resnet3d.py:1-37": "Simplifying ConvNet Layers with ConvBNLayer", + "/paddlevideo/modeling/backbones/resnet3d.py:116-140": "ResNet3D Block Configurations", + "/paddlevideo/modeling/backbones/resnet3d.py:141-171": "Initializing 3D ResNet Backbone Model", + "/paddlevideo/modeling/backbones/resnet3d.py:172-198": "Dilated 3D ResNet Conv Layers", + "/paddlevideo/modeling/backbones/resnet3d.py:199-239": "ResNet3D Block Definition", + "/paddlevideo/modeling/backbones/resnet3d.py:242-263": "Customizable ResNet 3D Backbone", + "/paddlevideo/modeling/backbones/resnet3d.py:264-282": "ResNet3D Backbone Parameters", + 
"/paddlevideo/modeling/backbones/resnet3d.py:283-302": "ResNet3D Parameters and Architecture Settings", + "/paddlevideo/modeling/backbones/resnet3d.py:303-331": "ResNet3D Backbone: 3D Deep Learning Model", + "/paddlevideo/modeling/backbones/resnet3d.py:332-357": "ResNet3D Model Initializer", + "/paddlevideo/modeling/backbones/resnet3d.py:358-387": "Configuring ResNet3D Model Attributes", + "/paddlevideo/modeling/backbones/resnet3d.py:38-56": "Extended Conv2D Layer for ResNet3D", + "/paddlevideo/modeling/backbones/resnet3d.py:388-412": "ResNet3D Layer Function", + "/paddlevideo/modeling/backbones/resnet3d.py:414-440": "Creating ResNet3D Residual Layers", + "/paddlevideo/modeling/backbones/resnet3d.py:441-457": "Customizable ResNet3D Backbone Model", + "/paddlevideo/modeling/backbones/resnet3d.py:458-481": "ResNet3D Residual Layer Creation", + "/paddlevideo/modeling/backbones/resnet3d.py:482-509": "Customizable ResNet3D Architecture", + "/paddlevideo/modeling/backbones/resnet3d.py:510-537": "3D Conv Resnet Inflation", + "/paddlevideo/modeling/backbones/resnet3d.py:538-561": "Inflating 2D ConvNet to 3D ResNet3D", + "/paddlevideo/modeling/backbones/resnet3d.py:562-586": "ResNet3D Param Loading & Stem Layer Creation", + "/paddlevideo/modeling/backbones/resnet3d.py:57-89": "3D ConvBN Layer Definition", + "/paddlevideo/modeling/backbones/resnet3d.py:587-620": "ResNet3D: Convolutional and Pooling Layers", + "/paddlevideo/modeling/backbones/resnet3d.py:621-641": "ResNet-3D Backbone Model Training", + "/paddlevideo/modeling/backbones/resnet3d.py:90-115": "Bottleneck3D Class Definition", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py": "Slowfast ResNet3d Backbone", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:1-30": "Slowfast ResNet3d: Reduced Fast Pathway", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:100-129": "Downsampling Block for ResNet3D", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:130-157": "Resnet3D Backbone Creation in PaddleVideo", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:158-180": "Loading and Resizing 2D Model Parameters", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:181-210": "ResNet3D Slow Only Pad Extension", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:211-214": "Resnet3D Slow-Only Strides and Dilations", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:31-60": "ResNet3D: Slowfast Residual Layer", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:62-79": "Defining Residual Module with Parameters", + "/paddlevideo/modeling/backbones/resnet3d_slowonly.py:80-98": "Build Residual Layers", + "/paddlevideo/modeling/backbones/resnet_slowfast.py": "SlowFast: Video Recognition Backbone", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:1-33": "ResNet SlowFast Backbone Initiation", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:114-139": "3D Convolutional Layer with Batch Normalization", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:140-180": "SlowFast ResNet Blocks", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:181-198": "Defining ResNet Bottleneck Arguments", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:199-237": "ResBlock Class for Deep Neural Networks", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:238-260": "ResNet SlowFast Backbone Model", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:261-294": "SlowFast ResNet Stage", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:295-312": "ResStage Class Constructor", + 
"/paddlevideo/modeling/backbones/resnet_slowfast.py:313-330": "ResStage Class for Residual Block", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:331-372": "ResNet SlowFast Model Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:34-66": "BottleneckTransform: Tx1x1, 1x3x3, Variable Kernel Sizes", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:373-404": "Slow-Fast ResNet Backbone with Pathways", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:405-432": "ResNet Basic Stem Module Definition", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:433-466": "SlowFast 3D Stem Module", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:467-492": "Resnet Slowfast Model Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:494-518": "ResNet SlowFast Stem and Fusion", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:519-544": "FuseFastToSlow Convolutional Layer", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:545-572": "ResNetSlowFast Model in PaddlePaddle", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:574-607": "ResNetSlowFast: Video Recognition Architecture", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:608-632": "SlowFast Model Construction", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:633-657": "SlowFast ResNet Backbone Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:659-678": "ResStage Configuration in ResNet SlowFast", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:67-87": "BottleneckTransform Class Definition", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:679-704": "ResNet SlowFast Model Architecture", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:705-733": "ResNet SlowFast Layer Definitions", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:734-762": "SlowFast ResNet Feature Extraction", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:763-788": "ResNet Slowfast Model: Initialization and Forwarding", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:790-795": "ResNet SlowFast Final Layer", + "/paddlevideo/modeling/backbones/resnet_slowfast.py:88-113": "Conv3D Layer for ResNet_SlowFast Backbone", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py": "ResNet SlowFast MRI Model", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:1-33": "PaddleVideo Backbone Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:114-139": "Conv3D Layer with BN in Resnet-Slowfast MRI", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:140-180": "SlowFast MRI ResNet Model", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:181-198": "Define ResNet Bottleneck", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:199-237": "ResBlock with Skip Connection", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:238-260": "ResNet SlowFast MRI Model with BN & BottleneckTransform", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:261-294": "ResStage for 3D ResNet SlowFast Networks", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:295-312": "ResStage Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:313-330": "ResStage Initialization: Resnet Slowfast MRI Backbone", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:331-372": "ResNet-SlowFast MRI Construction", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:34-66": "BottleneckTransform: Temporal Conv Layers", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:373-404": "SlowFast ResNet 
Backbone Initiation", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:405-432": "ResNetBasicStem: Kernel, Stride, Padding", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:433-466": "SlowFast Video Stem Module", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:467-492": "Resnet Slowfast MRI Backbone", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:494-518": "ResNet SlowFast MRI Fusion", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:519-544": "FuseFastToSlow Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:545-572": "ResNetSlowFast_MRI: Fusion Conv Layer", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:574-607": "ResNetSlowFast_MRI Model Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:608-632": "SlowFast Model Architecture", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:633-657": "ResNet SlowFast MRI Backbone", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:659-678": "Defining ResStage Layer Parameters", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:67-87": "BottleneckTransform Class in ResNet SlowFast MRI Model", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:679-704": "ResNet SlowFast Model Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:705-733": "ResNet SlowFast MRI Model Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:734-762": "ResNet SlowFast MRI Video Analysis Model", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:763-793": "SlowFast 3D ResNet Model Initialization", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:794-796": "SlowFast ResNet Fusion Layer", + "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py:88-113": "Initiating 3D ConvLayers with BatchNorm and Stride", + "/paddlevideo/modeling/backbones/resnet_tsm.py": "ResNet-TSM Backbone Update", + "/paddlevideo/modeling/backbones/resnet_tsm.py:1-30": "ConvBNLayer: Combining Conv2D and BatchNorm2D", + "/paddlevideo/modeling/backbones/resnet_tsm.py:109-134": "TSM Backbone: Layer Initialization", + "/paddlevideo/modeling/backbones/resnet_tsm.py:135-164": "Temporal Shift ResNet Backbone", + "/paddlevideo/modeling/backbones/resnet_tsm.py:165-202": "Residual TSM Block: Alleviating Vanishing Gradients", + "/paddlevideo/modeling/backbones/resnet_tsm.py:203-241": "ResNet TSM Backbone Model: Flexible Depths", + "/paddlevideo/modeling/backbones/resnet_tsm.py:242-273": "ResNet-TSM Backbone Configuration", + "/paddlevideo/modeling/backbones/resnet_tsm.py:274-293": "ResNet TSM Bottleneck Blocks Creation", + "/paddlevideo/modeling/backbones/resnet_tsm.py:294-316": "ResNet TSM Backbone Initialization", + "/paddlevideo/modeling/backbones/resnet_tsm.py:31-53": "Custom ConvBNLayer Class Definition", + "/paddlevideo/modeling/backbones/resnet_tsm.py:317-332": "Initializing ResNet TSM Backbone Parameters", + "/paddlevideo/modeling/backbones/resnet_tsm.py:333-353": "ResNet TSM Forward Function", + "/paddlevideo/modeling/backbones/resnet_tsm.py:54-84": "BottleneckBlock: Conv, BatchNorm, Activation", + "/paddlevideo/modeling/backbones/resnet_tsm.py:85-108": "BottleneckBlock Class Definition", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py": "ResNet-TSM MRI Backbone", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:1-32": "Convolutional Batch Normalization Layer", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:113-134": "ResNet-D Branch Creation", + 
"/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:136-166": "ResNet-TSM Backbone with Temporal Shifts", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:167-200": "ResNet TSM Backbone with BN and Leaky ReLU", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:201-229": "ResNetTSM_MRI Class Initialization", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:230-252": "ResNet-TSM Backbone in PaddleVideo", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:253-271": "Dynamic BottleneckBlock Naming", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:272-294": "Dynamic ResNet TSM Backbone Creation", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:295-311": "Initializing Backbone Neural Network", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:313-327": "Convolutional Layer Iterations", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:33-58": "ConvBNLayer Class Definition", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:59-83": "ResNet-D: Pooling and Convolution Initialization", + "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py:84-112": "ResNetTSM_MRI Backbone Design", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py": "ResNet-TSN Model for PaddlePaddle", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:1-29": "ResNet-TSN Backbone Model Initialization", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:112-144": "ResNet TSN Backbone with Conv and Shortcut Connection", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:145-167": "Defining BasicBlock for ResNet TSN MRI Model", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:168-202": "ResNetTSN_MRI Backbone Definition", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:204-232": "ResNet TSN Backbone Initialization", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:233-253": "ResNet TSN Model with Multiple Branch Inputs", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:254-275": "ResNet-TSN Bottleneck Block Initialization", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:276-294": "ResNet TSN Model Creation", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:295-311": "ResNet TSN Backbone Weight Initialization", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:31-58": "ConvBNPoolingLayer Class Definition", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:313-331": "Initializing and Checking Model Path", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:59-89": "Resnet_TSN Backbone Model Definition", + "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py:90-111": "BottleneckBlock: ResNet's Core Layer" +} \ No newline at end of file diff --git a/docs/data/titles/11.json b/docs/data/titles/11.json new file mode 100644 index 000000000..72800feb6 --- /dev/null +++ b/docs/data/titles/11.json @@ -0,0 +1,302 @@ +{ + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py": "TSM ResNet Backbone for Temporal Segment Networks", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:1-31": "TSM ResNet Backbone Imports", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:106-132": "BottleneckBlock in ResNet-TSM", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:133-159": "Resnet TSM Backbone: Forward Method", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:160-192": "Temporal Shifted ResNet Backbone", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:193-217": "ResNet TSM Block Definition", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:218-253": "ResNet TSM Backbone Model Init & Forward", + 
"/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:254-279": "ResNet Model Depth Customization", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:280-301": "TSM-ResNet Backbone with Tweaks", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:302-322": "ResNet Block Assignment Algorithm", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:32-54": "ConvBNLayer: Combined Conv2D and BatchNorm2D Layers", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:323-339": "ResNet TSM Backbone Weights Init", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:340-362": "Backbone Weights Initialization", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:55-78": "ResNet-D Tweak ConvBN Layer", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:79-105": "Tweakable Convolutional Neural Network with Batch Normalization", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py": "ResNet TSN Model Backbones: PaddleVideo", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:1-29": "ResNet Tweaks TSN Model Backbone", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:112-144": "ResNet Block Implementation", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:145-167": "BasicBlock Convolutional Layers and BatchNorm", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:168-203": "ResNetTweaksTSN Backbone: Convolution, Shortcut, and ReLU", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:204-232": "ResNet Backbone Configurations", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:233-254": "First Layer ResNet Backbone Definition", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:255-276": "ResNet-TSN Backbone Code", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:277-296": "Dynamic ResNet Tweaks for TSN", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:297-314": "Pre-Trained Weights Initialization in ResNet Backbone", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:31-58": "Tailored ResNet Backbone with Pooling", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:315-328": "ResNet Tweaks TSN Initialization", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:59-89": "ResNet-TSN Tweaks Backbone", + "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py:90-111": "Bottleneck Block: ResNet Parameter Reduction", + "/paddlevideo/modeling/backbones/resnext101.py": "ResNeXt-101 in PaddlePaddle", + "/paddlevideo/modeling/backbones/resnext101.py:1-31": "ConvBNLayer Class in PaddlePaddle", + "/paddlevideo/modeling/backbones/resnext101.py:123-148": "ResNext101 Backbone Architecture", + "/paddlevideo/modeling/backbones/resnext101.py:149-176": "ResNeXt-101 Model Implementation", + "/paddlevideo/modeling/backbones/resnext101.py:177-187": "ResNext101: Constructing and Applying Layers", + "/paddlevideo/modeling/backbones/resnext101.py:32-55": "ConvBNLayer Initialization", + "/paddlevideo/modeling/backbones/resnext101.py:56-82": "ResNeXt101 Bottleneck Block and Downsampling", + "/paddlevideo/modeling/backbones/resnext101.py:83-122": "ResNeXt Model Definition", + "/paddlevideo/modeling/backbones/stgcn.py": "STGCN: Skeleton-based Action Recognition Backbone", + "/paddlevideo/modeling/backbones/stgcn.py:1-37": "STGCN Backbone Definition", + "/paddlevideo/modeling/backbones/stgcn.py:105-120": "Node Initialization in STGCN and COCO Keypoint Backbones", + "/paddlevideo/modeling/backbones/stgcn.py:121-143": "Adjacency Matrix Initialization for STGCN", + "/paddlevideo/modeling/backbones/stgcn.py:144-174": "ConvTemporalGraphical Layer 
Initialization in STGCN", + "/paddlevideo/modeling/backbones/stgcn.py:175-209": "Temporal Graph Convolutions in STGCN", + "/paddlevideo/modeling/backbones/stgcn.py:210-251": "STGCN Model: Spatial and Temporal Processing", + "/paddlevideo/modeling/backbones/stgcn.py:251-278": "Skeleton Action Recognition with STGCN", + "/paddlevideo/modeling/backbones/stgcn.py:279-300": "ST-GCN Block Initialization", + "/paddlevideo/modeling/backbones/stgcn.py:301-327": "StGCN Edge Importance Initialization", + "/paddlevideo/modeling/backbones/stgcn.py:328-343": "ST-GCN Pooling and Reshaping", + "/paddlevideo/modeling/backbones/stgcn.py:38-80": "Graph Hopping and Normalization", + "/paddlevideo/modeling/backbones/stgcn.py:81-104": "Hop Distance Initialization in ST-GCN", + "/paddlevideo/modeling/backbones/swin_transformer.py": "Swin Transformer 3D Backbone in PaddleVideo", + "/paddlevideo/modeling/backbones/swin_transformer.py:1-33": "Swin Transformer Stochastic Depth", + "/paddlevideo/modeling/backbones/swin_transformer.py:100-137": "Swin Transformer Window Rearrangement", + "/paddlevideo/modeling/backbones/swin_transformer.py:138-161": "Window-based Multi-Head Self Attention with Relative Position Bias", + "/paddlevideo/modeling/backbones/swin_transformer.py:162-185": "Swin Transformer Self-Attention Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:186-204": "Swin Transformer: Relative Position Encoding", + "/paddlevideo/modeling/backbones/swin_transformer.py:205-232": "Swin Transformer Backbone Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:233-261": "Swin Transformer Block 3D Implementation", + "/paddlevideo/modeling/backbones/swin_transformer.py:262-282": "Swin Transformer Backbone Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:283-307": "Swin Transformer Backbone Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:308-330": "Swin Transformer Backbone Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:331-353": "Swin Transformer: Cyclic Shift and Windowed Self-Attention", + "/paddlevideo/modeling/backbones/swin_transformer.py:34-64": "Stochastic Depth DropPath Layer", + "/paddlevideo/modeling/backbones/swin_transformer.py:354-390": "Swin Transformer Forward Pass", + "/paddlevideo/modeling/backbones/swin_transformer.py:391-426": "Swin Transformer Image Backbone", + "/paddlevideo/modeling/backbones/swin_transformer.py:427-443": "Swin Transformer Attention Mask Generation", + "/paddlevideo/modeling/backbones/swin_transformer.py:444-464": "Customizable Swin Transformer Layer", + "/paddlevideo/modeling/backbones/swin_transformer.py:464-493": "Swin Transformer 3D Block Definition", + "/paddlevideo/modeling/backbones/swin_transformer.py:494-522": "Swin Transformer Block for PaddleVideo", + "/paddlevideo/modeling/backbones/swin_transformer.py:523-551": "Video Patch Embedding for Swin Transformer", + "/paddlevideo/modeling/backbones/swin_transformer.py:553-581": "Swin Transformer Backbone Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:582-604": "Swin Transformer 3D Backbone: Paddle Video", + "/paddlevideo/modeling/backbones/swin_transformer.py:605-627": "Swin Transformer Initialization Parameters", + "/paddlevideo/modeling/backbones/swin_transformer.py:628-659": "Swin Transformer Model Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:65-99": "Defining Swin Transformer Layer", + "/paddlevideo/modeling/backbones/swin_transformer.py:660-687": "Swin 
Transformer Backbone Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:689-720": "Swin Transformer Initialization", + "/paddlevideo/modeling/backbones/swin_transformer.py:721-742": "Swin Transformer Layer Processing", + "/paddlevideo/modeling/backbones/toshift_vit.py": "Shift-ViT: Versatile Image Processing Backbone", + "/paddlevideo/modeling/backbones/toshift_vit.py:1-37": "Shifted Vision Transformer Backbone", + "/paddlevideo/modeling/backbones/toshift_vit.py:105-138": "Self-Attention Module: TOShift-ViT Class", + "/paddlevideo/modeling/backbones/toshift_vit.py:139-164": "Object Initialization in Toshift_VIT Model", + "/paddlevideo/modeling/backbones/toshift_vit.py:165-186": "Temporal Attention Initialization", + "/paddlevideo/modeling/backbones/toshift_vit.py:187-213": "Token-Shifting ViT Model Initialization", + "/paddlevideo/modeling/backbones/toshift_vit.py:214-245": "ToshiftVIT: Custom Backbone for Vision Transformer Model", + "/paddlevideo/modeling/backbones/toshift_vit.py:246-278": "TokenShift Vision Transformer Class", + "/paddlevideo/modeling/backbones/toshift_vit.py:279-305": "Toshift ViT Class Initialization", + "/paddlevideo/modeling/backbones/toshift_vit.py:306-330": "Transformer Backbone Model Setup", + "/paddlevideo/modeling/backbones/toshift_vit.py:331-360": "Toshift_VIT Model Initialization", + "/paddlevideo/modeling/backbones/toshift_vit.py:361-386": "TOShiftViT: Initializing and Processing Features", + "/paddlevideo/modeling/backbones/toshift_vit.py:38-65": "Stochastic Depth Drop Path Implementation", + "/paddlevideo/modeling/backbones/toshift_vit.py:387-413": "Positional Embedding and Attention Blocks", + "/paddlevideo/modeling/backbones/toshift_vit.py:66-104": "Self-Attention Mechanism Implementation", + "/paddlevideo/modeling/backbones/transnetv2.py": "OctConv3D Enhances TransNetV2 Backbone", + "/paddlevideo/modeling/backbones/transnetv2.py:1-28": "OctConv3D Layer Creation", + "/paddlevideo/modeling/backbones/transnetv2.py:105-131": "TransnetV2: Configurable Conv3D Backbone", + "/paddlevideo/modeling/backbones/transnetv2.py:132-150": "TransNetV2 Model Definition", + "/paddlevideo/modeling/backbones/transnetv2.py:151-179": "Stacked DDCNNV2: Neural Network Layer", + "/paddlevideo/modeling/backbones/transnetv2.py:181-207": "TransNetV2 Backbone Initialization", + "/paddlevideo/modeling/backbones/transnetv2.py:208-232": "Stochastic Depth ResNet Block", + "/paddlevideo/modeling/backbones/transnetv2.py:234-260": "TransNetV2 Layer Sequence", + "/paddlevideo/modeling/backbones/transnetv2.py:261-282": "TransNetV2 Backbone Initialization", + "/paddlevideo/modeling/backbones/transnetv2.py:283-307": "FrameSimilarity Layer Initialization", + "/paddlevideo/modeling/backbones/transnetv2.py:30-43": "Interleaved 3D Convolutional Paths for TransNetV2", + "/paddlevideo/modeling/backbones/transnetv2.py:309-330": "TransNetV2 Model Initialization and Similarity Calculation", + "/paddlevideo/modeling/backbones/transnetv2.py:331-346": "Tensor Indices and Regression", + "/paddlevideo/modeling/backbones/transnetv2.py:347-364": "TransNetV2 Conv3D Model", + "/paddlevideo/modeling/backbones/transnetv2.py:365-390": "TransnetV2 Frame Concatenation", + "/paddlevideo/modeling/backbones/transnetv2.py:391-411": "Color Histograms in TransnetV2", + "/paddlevideo/modeling/backbones/transnetv2.py:412-429": "Video Frame Histogram Comparison", + "/paddlevideo/modeling/backbones/transnetv2.py:430-446": "TransNetV2 Lookup Operation", + 
"/paddlevideo/modeling/backbones/transnetv2.py:44-58": "TransNetV2: Conv, Upsample, Downsample Backbone", + "/paddlevideo/modeling/backbones/transnetv2.py:449-473": "TransNetV2: Shot Transition Detection Model", + "/paddlevideo/modeling/backbones/transnetv2.py:473-484": "TransNetV2 ResNet Features Initialization", + "/paddlevideo/modeling/backbones/transnetv2.py:485-508": "TransNetv2 Backbone Initialization", + "/paddlevideo/modeling/backbones/transnetv2.py:510-526": "TransNetV2 Neural Network Model Initialization", + "/paddlevideo/modeling/backbones/transnetv2.py:527-548": "TransNetV2 Model Architecture", + "/paddlevideo/modeling/backbones/transnetv2.py:549-571": "TransNetV2 Feature Extraction and Pooling", + "/paddlevideo/modeling/backbones/transnetv2.py:572-581": "Transnetv2 Classification", + "/paddlevideo/modeling/backbones/transnetv2.py:60-90": "TransNetV2: Versatile Transformation Functions", + "/paddlevideo/modeling/backbones/transnetv2.py:91-104": "TransNetV2 Backbone Conv Layers", + "/paddlevideo/modeling/backbones/vit.py": "Vision Transformer Backbone in PaddleVideo", + "/paddlevideo/modeling/backbones/vit.py:1-37": "Drop Path Functions in PaddleVideo", + "/paddlevideo/modeling/backbones/vit.py:105-138": "Multi-Head Attention Layer Initialization", + "/paddlevideo/modeling/backbones/vit.py:139-166": "Vision Transformer Initializer", + "/paddlevideo/modeling/backbones/vit.py:167-185": "Vision Transformer Backbone Initialization", + "/paddlevideo/modeling/backbones/vit.py:186-210": "VI Forward Method", + "/paddlevideo/modeling/backbones/vit.py:211-235": "Spatial Attention in Vision Transformer Model", + "/paddlevideo/modeling/backbones/vit.py:236-267": "Vision Transformer Backbone: Averaging and Patch Embedding", + "/paddlevideo/modeling/backbones/vit.py:268-298": "VisionTransformer: Paddle Video Backbone", + "/paddlevideo/modeling/backbones/vit.py:299-327": "Initialize ViT Backbone Model Parameters", + "/paddlevideo/modeling/backbones/vit.py:328-350": "Initializing Vision Transformer Components", + "/paddlevideo/modeling/backbones/vit.py:351-379": "Vision Transformer Model Initialization", + "/paddlevideo/modeling/backbones/vit.py:38-65": "Stochastic Depth Dropout Paths for Vision Transformers", + "/paddlevideo/modeling/backbones/vit.py:380-405": "Vision Transformer Forward Initialization", + "/paddlevideo/modeling/backbones/vit.py:406-430": "Relative Position Embeddings in Vision Transformer Model", + "/paddlevideo/modeling/backbones/vit.py:431-455": "VIT Time Embedding and Attention Processing", + "/paddlevideo/modeling/backbones/vit.py:456-465": "Vit Model Frame Averaging Embeddings", + "/paddlevideo/modeling/backbones/vit.py:66-104": "Vision Transformer Backbone: MLP & Attention", + "/paddlevideo/modeling/backbones/vit_tweaks.py": "VisionTransformer_tweaks: Time-Based Feature Modification", + "/paddlevideo/modeling/backbones/vit_tweaks.py:1-32": "VisionTransformer_tweaks Model", + "/paddlevideo/modeling/backbones/vit_tweaks.py:108-144": "Multi-Head Self-Attention Layer for Transformers", + "/paddlevideo/modeling/backbones/vit_tweaks.py:145-178": "Attention Block Class with QKV Decomposition", + "/paddlevideo/modeling/backbones/vit_tweaks.py:179-202": "Dynamic Norm Layer Instantiation", + "/paddlevideo/modeling/backbones/vit_tweaks.py:203-221": "Temporal Attention Module Initialization", + "/paddlevideo/modeling/backbones/vit_tweaks.py:223-247": "Flexible ViT Backbone with MLP and Attention Types", + "/paddlevideo/modeling/backbones/vit_tweaks.py:248-271": "Spatial Attention 
in ViT Models", + "/paddlevideo/modeling/backbones/vit_tweaks.py:273-302": "PatchEmbed Class in PaddleVideo Library", + "/paddlevideo/modeling/backbones/vit_tweaks.py:303-331": "Vision Transformer with Patch Input", + "/paddlevideo/modeling/backbones/vit_tweaks.py:332-360": "Vit Model Initialization", + "/paddlevideo/modeling/backbones/vit_tweaks.py:35-69": "Dropout and Bounding Box Functions", + "/paddlevideo/modeling/backbones/vit_tweaks.py:361-384": "Initializing Transformer Embeddings", + "/paddlevideo/modeling/backbones/vit_tweaks.py:385-414": "Transformer Model Customization", + "/paddlevideo/modeling/backbones/vit_tweaks.py:415-441": "Initializing Backbone Network Parameters", + "/paddlevideo/modeling/backbones/vit_tweaks.py:442-464": "Transformer Model Forward Pass: Positional Embedding Reshaping", + "/paddlevideo/modeling/backbones/vit_tweaks.py:465-487": "Vit Tweaks: Position & Time Embeddings", + "/paddlevideo/modeling/backbones/vit_tweaks.py:488-515": "Dynamic Frame Averaging for Vision Transformers", + "/paddlevideo/modeling/backbones/vit_tweaks.py:70-107": "Vit Tweak: Mlp Class with DropPath", + "/paddlevideo/modeling/backbones/yowo.py": "YOWO Video Backbone for Paddle", + "/paddlevideo/modeling/backbones/yowo.py:1-28": "CAM Module: PaddleVideo Backbone Custom Layer", + "/paddlevideo/modeling/backbones/yowo.py:109-129": "YOLOv5 Backbone Loading Weights", + "/paddlevideo/modeling/backbones/yowo.py:130-150": "YOWO Model Loading and Processing", + "/paddlevideo/modeling/backbones/yowo.py:29-53": "Channel-wise Attention CFAMPBlock", + "/paddlevideo/modeling/backbones/yowo.py:54-79": "YOWO Backbone: ConvLayers, BN, ReLU, CAM_Module", + "/paddlevideo/modeling/backbones/yowo.py:80-108": "YOWO Model Initialization", + "/paddlevideo/modeling/bbox_utils.py": "Bounding Box Utilities", + "/paddlevideo/modeling/bbox_utils.py:1-30": "Bounding Box Delta Calculator", + "/paddlevideo/modeling/bbox_utils.py:103-139": "Bounding Boxes Filters and Overlaps Calculation", + "/paddlevideo/modeling/bbox_utils.py:140-176": "Grid and YOLO Box Utilities", + "/paddlevideo/modeling/bbox_utils.py:177-204": "Bounding Box Utilities", + "/paddlevideo/modeling/bbox_utils.py:205-237": "Bounding Box IoU Calculator", + "/paddlevideo/modeling/bbox_utils.py:238-268": "Intersection Over Union Calculator for Bounding Boxes", + "/paddlevideo/modeling/bbox_utils.py:269-304": "Rotated Bounding Box Conversion", + "/paddlevideo/modeling/bbox_utils.py:305-339": "Bounding Box Regression Computation", + "/paddlevideo/modeling/bbox_utils.py:31-63": "Weighted Bounding Box Differential Conversion", + "/paddlevideo/modeling/bbox_utils.py:340-378": "Delta BBox Calculator", + "/paddlevideo/modeling/bbox_utils.py:379-417": "Decoding Bounding Boxes in Paddle Video", + "/paddlevideo/modeling/bbox_utils.py:418-447": "Bounding Box Dimensions and Angle Calculator", + "/paddlevideo/modeling/bbox_utils.py:448-475": "Find Best Begin Point in Coordinates", + "/paddlevideo/modeling/bbox_utils.py:476-503": "Rotated Rectangle to Polygon Conversion", + "/paddlevideo/modeling/bbox_utils.py:504-528": "Rotating Rectangles to Polygons", + "/paddlevideo/modeling/bbox_utils.py:65-102": "Bounding Box Utilities", + "/paddlevideo/modeling/builder.py": "Video Model Builder with Paddle", + "/paddlevideo/modeling/builder.py:1-19": "Video Object Detection Model Builder", + "/paddlevideo/modeling/builder.py:117-127": "Framework-Based Model Builder", + "/paddlevideo/modeling/builder.py:22-73": "Model Builder Functions", + 
"/paddlevideo/modeling/builder.py:74-116": "Dynamically Building Paddle Video Components", + "/paddlevideo/modeling/framework/__init__.py": "PaddleVideo Framework Base Classes", + "/paddlevideo/modeling/framework/__init__.py:1-24": "PaddleVideo Framework Base Classes", + "/paddlevideo/modeling/framework/__init__.py:25-28": "Model Classes in PaddleVideo Framework", + "/paddlevideo/modeling/framework/detectors/__init__.py": "Detector Imports in PaddleVideo", + "/paddlevideo/modeling/framework/detectors/base.py": "Abstract BaseDetector Class", + "/paddlevideo/modeling/framework/detectors/base.py:1-36": "Base Detector Class: Foundation for Detection Models", + "/paddlevideo/modeling/framework/detectors/base.py:37-51": "Abstract Base Classes for ML Training, Validation, and Testing", + "/paddlevideo/modeling/framework/detectors/fast_rcnn.py": "Fast R-CNN Detector", + "/paddlevideo/modeling/framework/detectors/fast_rcnn.py:1-30": "Fast R-CNN Detector", + "/paddlevideo/modeling/framework/detectors/fast_rcnn.py:31-34": "Fast RCNN Detector Builder", + "/paddlevideo/modeling/framework/detectors/two_stage.py": "Two-Stage Slowfast Detector", + "/paddlevideo/modeling/framework/detectors/two_stage.py:1-32": "Two-Stage Detector Base Class", + "/paddlevideo/modeling/framework/detectors/two_stage.py:125-152": "Two-Stage Detector Data Retrieval", + "/paddlevideo/modeling/framework/detectors/two_stage.py:153-176": "Two-Stage Detector: GT Bounding Boxes Generation", + "/paddlevideo/modeling/framework/detectors/two_stage.py:178-186": "Selecting Entity IDs with Paddle's Index Select", + "/paddlevideo/modeling/framework/detectors/two_stage.py:33-64": "Two-Stage Object Detector Class", + "/paddlevideo/modeling/framework/detectors/two_stage.py:66-91": "Two-Stage Detector Feature Extraction and Loss Computation", + "/paddlevideo/modeling/framework/detectors/two_stage.py:92-124": "SlowFast Model Detectors: Val, Test, Infer", + "/paddlevideo/modeling/framework/estimators/__init__.py": "Estimators Import and Definition", + "/paddlevideo/modeling/framework/estimators/base.py": "PaddleVideo BaseEstimator Class", + "/paddlevideo/modeling/framework/estimators/base.py:1-34": "BaseEstimator: PaddleVideo's Estimator Foundation", + "/paddlevideo/modeling/framework/estimators/base.py:35-66": "Versatile Estimator Framework", + "/paddlevideo/modeling/framework/estimators/base.py:67-82": "Abstract Methods for Model Validation", + "/paddlevideo/modeling/framework/estimators/depth_estimator.py": "DepthEstimator: Feature Extraction and Loss Metrics", + "/paddlevideo/modeling/framework/estimators/depth_estimator.py:1-31": "DepthEstimator: Framework for Feature Extraction", + "/paddlevideo/modeling/framework/estimators/depth_estimator.py:32-58": "Depth Estimator Steps: Train, Validate, Test, Infer", + "/paddlevideo/modeling/framework/estimators/depth_estimator.py:59-59": "Depth Estimator Results", + "/paddlevideo/modeling/framework/localizers/__init__.py": "PaddleVideo Localizers: Handling Video Localization Tasks", + "/paddlevideo/modeling/framework/localizers/base.py": "Localization Model Base Class (Python)", + "/paddlevideo/modeling/framework/localizers/base.py:1-27": "Base Class for Localization Models", + "/paddlevideo/modeling/framework/localizers/base.py:28-56": "Localizer Model Initialization", + "/paddlevideo/modeling/framework/localizers/base.py:58-74": "Abstract Classes for Model Steps", + "/paddlevideo/modeling/framework/localizers/bmn_localizer.py": "BMN Localizer Model for PaddleVideo", + 
"/paddlevideo/modeling/framework/localizers/bmn_localizer.py:1-36": "BMNLocalizer: PaddleVideo's Localization Framework", + "/paddlevideo/modeling/framework/localizers/bmn_localizer.py:37-69": "BMN Localizer: Training, Testing, Inferring", + "/paddlevideo/modeling/framework/localizers/yowo_localizer.py": "YOWO Localizer Evaluation", + "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:1-33": "YOWO Localizer Class: PaddleVideo", + "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:126-147": "YOWO Localizer Evaluation: Precision, Recall, F-Score", + "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:149-161": "YOWO Localizer Functions", + "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:34-67": "Model Training and Validation Process", + "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:68-90": "Non-Maximum Suppression and Precision Calculation", + "/paddlevideo/modeling/framework/localizers/yowo_localizer.py:91-125": "YOLOv3 Localizer Metrics Calculator", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py": "YOWO Utils: Non-Max Suppression and Tensor Movement", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:1-36": "YOWO Localizers: Truths Length & NMS", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:123-153": "YOLO Bounding Box Extractor", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:154-178": "YOLO Object Detection Utilities", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:179-213": "Intersection-Over-Union Calculator", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:214-241": "Intersection Over Union Code", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:244-268": "YOLO Localizer Ground Truth Builder", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:269-288": "IoU Calculation for YoWo Localizers", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:289-315": "IoU-Based Masking for Bounding Box Confidences", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:316-338": "YOLO Anchor Box Selection", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:340-357": "Object Localization Metric", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:358-359": "Localizers Framework Functions", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:37-67": "Functional Definitions for Bounding Boxes", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:68-94": "Box Regression via Reshaping and Sigmoid", + "/paddlevideo/modeling/framework/localizers/yowo_utils.py:96-122": "YOLOv2 Bounding Box Processing", + "/paddlevideo/modeling/framework/multimodal/__init__.py": "Multimodal Model Initialization", + "/paddlevideo/modeling/framework/multimodal/actbert.py": "Introducing ActBert: Multimodal Model Training", + "/paddlevideo/modeling/framework/multimodal/actbert.py:1-27": "ActBert: Multimodal Model Framework", + "/paddlevideo/modeling/framework/multimodal/actbert.py:28-46": "ActBert Dataset Train and Val Steps", + "/paddlevideo/modeling/framework/multimodal/actbert.py:47-64": "Multimodal ACT-BERT Model", + "/paddlevideo/modeling/framework/multimodal/base.py": "Multimodal Base Class for PaddleVideo", + "/paddlevideo/modeling/framework/multimodal/base.py:1-32": "Multimodal Base Class for PaddleVideo", + "/paddlevideo/modeling/framework/multimodal/base.py:33-63": "Multimodal Base Class with Selectable Step Functions", + "/paddlevideo/modeling/framework/multimodal/base.py:65-81": "Abstract Methods for Validation, 
Testing, and Inference", + "/paddlevideo/modeling/framework/partitioners/__init__.py": "PaddleVideo Partitioner Initialization", + "/paddlevideo/modeling/framework/partitioners/base.py": "BaseModelPartitioner: PaddleVideo's Modeling Framework", + "/paddlevideo/modeling/framework/partitioners/base.py:1-27": "Python Base Partitioner Class for PaddleVideo", + "/paddlevideo/modeling/framework/partitioners/base.py:28-55": "Partitioned Model Initialization", + "/paddlevideo/modeling/framework/partitioners/base.py:56-84": "Base Model Partitioner Class", + "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py": "TransNetV2 Partitioner in PaddleVideo", + "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:1-32": "TransNetV2 Partitioner for PaddleVideo", + "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:33-54": "TransNetV2 Partitioner Loss Metrics", + "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py:55-68": "TransnetV2 Partitioner Methods", + "/paddlevideo/modeling/framework/recognizers/__init__.py": "PaddleVideo Recognizers: Action & Motion", + "/paddlevideo/modeling/framework/recognizers/__init__.py:1-23": "PaddleVideo Recognizers: A Versatile Toolkit", + "/paddlevideo/modeling/framework/recognizers/__init__.py:25-30": "PaddleVideo Recognizer Models", + "/paddlevideo/modeling/framework/recognizers/base.py": "Base Recognizer Model in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/base.py:1-33": "Base Recognizer Class: Override Train, Valid, Test Steps", + "/paddlevideo/modeling/framework/recognizers/base.py:34-66": "Initialize and Train Model's Head", + "/paddlevideo/modeling/framework/recognizers/base.py:67-81": "Abstract Base Recognizer Steps in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizer1d.py": "1D Recognizer Model in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:1-29": "1D Recognizer Model Framework in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:30-61": "1D Recognizer Model Processing Image and Audio Data", + "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:62-91": "1D Recognizer Model Framework", + "/paddlevideo/modeling/framework/recognizers/recognizer1d.py:93-111": "Shared Implementation in Validating, Testing, and Inferring Steps", + "/paddlevideo/modeling/framework/recognizers/recognizer2d.py": "2D Video Recognizer in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:1-27": "2D Recognizer Model Framework in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:28-60": "Video Analysis Recognizer2D Model", + "/paddlevideo/modeling/framework/recognizers/recognizer2d.py:60-69": "Recognizer2D Class and Methods: Forward Net and Infer Step", + "/paddlevideo/modeling/framework/recognizers/recognizer3d.py": "Recognizer3D: 3D Object Recognition Framework", + "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:1-33": "3D Recognizer Framework in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:34-64": "Training and Validation Steps in 3D Recognizer Model", + "/paddlevideo/modeling/framework/recognizers/recognizer3d.py:66-93": "Reshape Input for ResNet3dSlowOnly", + "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py": "3D Recognizer Model in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:1-31": "3D MRI Recognizer Framework", + 
"/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:32-65": "3D MRI Recognizer Model Training and Testing", + "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py:66-81": "Dual Test/Infer Steps in Recognizer3D MRI Model", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py": "Recognizer Distillation in PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:1-34": "Recognizer Distillation Layer for PaddleVideo", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:116-136": "Distillation Recognizer Loss Calculation", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:137-165": "Training Step and Recognizer Distillation", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:167-193": "Distillation Recognizer Class", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:195-224": "Evaluating Student Model in Image Recognition", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:225-231": "Selecting and Applying Model", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:35-60": "Distillation Model Initialization", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:61-85": "Distillation Recognizer Model Initialization", + "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py:86-114": "Distillation Recognizer Framework", + "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py": "2D Image Classifier with PaddleVideo's RecognizerMRI", + "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:1-27": "PaddleVideo: RecognizerMRI Framework", + "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:28-59": "Image Classification Model Definition", + "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py:60-76": "Testing Steps without Head Loss" +} \ No newline at end of file diff --git a/docs/data/titles/12.json b/docs/data/titles/12.json new file mode 100644 index 000000000..f3e850529 --- /dev/null +++ b/docs/data/titles/12.json @@ -0,0 +1,302 @@ +{ + "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py": "GCN Recognizer Model Framework", + "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:1-33": "GCN Recognizer Model Framework", + "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:34-66": "RecognizerGCN: Image Classification Model", + "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py:67-87": "GCN Recognizer Model with Test and Infer Steps", + "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py": "MoViNet Recognizer Framework", + "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:1-33": "MoViNet Frame Recognizer Class", + "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:34-57": "Training and Validation Steps in Recognizer", + "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py:58-78": "Model's Forward, Test, and Infer Steps", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py": "Transformer-Based Recognizer Model", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:1-31": "Transformer-Based Recognizer Framework", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:33-62": "Training, Validation, Testing Steps in Recognizer Transformer", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:63-86": "Multi-View Image Inference 
Model", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py:87-98": "Averaging Method for Recognizer Transformer", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py": "Recognizer-Transformer MRI Model Code", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:1-32": "RecognizerTransformer_MRI Model Definition", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:33-63": "Recognizer Transformer Image Classifier", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:65-89": "Average-View Model Inference", + "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py:90-104": "Combining Scores in Recognizer Transformer", + "/paddlevideo/modeling/framework/segment/__init__.py": "PaddleVideo Segment Models", + "/paddlevideo/modeling/framework/segment/base.py": "Semi-Video Object Segmentation Base Class", + "/paddlevideo/modeling/framework/segment/base.py:1-29": "Abstract Base Class for Semi-Video Object Segmentation", + "/paddlevideo/modeling/framework/segment/base.py:30-57": "Segment Model Initialization and Forward Pass", + "/paddlevideo/modeling/framework/segment/base.py:58-90": "Abstract Class for Model Training Phases", + "/paddlevideo/modeling/framework/segment/cfbi.py": "CFBI Model for AI Segmentation", + "/paddlevideo/modeling/framework/segment/cfbi.py:1-30": "CFBI Model Python Class", + "/paddlevideo/modeling/framework/segment/cfbi.py:109-127": "Resizing CFBI in PaddleVideo", + "/paddlevideo/modeling/framework/segment/cfbi.py:128-144": "Preparing Frame Embeddings for Attention and Loss Calculation", + "/paddlevideo/modeling/framework/segment/cfbi.py:145-164": "Distance Bias Assignment for Frame Sequences", + "/paddlevideo/modeling/framework/segment/cfbi.py:165-184": "NotImplementedError Handler", + "/paddlevideo/modeling/framework/segment/cfbi.py:185-200": "Global Matching Evaluation in CFBI Model", + "/paddlevideo/modeling/framework/segment/cfbi.py:201-216": "Preparing Input for Local Matching Function", + "/paddlevideo/modeling/framework/segment/cfbi.py:217-237": "Global/Local Background Subtraction for Image Segmentation", + "/paddlevideo/modeling/framework/segment/cfbi.py:238-256": "Video Segmentation Model Code", + "/paddlevideo/modeling/framework/segment/cfbi.py:257-279": "CFBI Attention Calculation", + "/paddlevideo/modeling/framework/segment/cfbi.py:280-286": "Append and Pass for Attention Model", + "/paddlevideo/modeling/framework/segment/cfbi.py:31-56": "CFBI Framework Testing", + "/paddlevideo/modeling/framework/segment/cfbi.py:57-84": "PaddleVideo CFBI Else Block", + "/paddlevideo/modeling/framework/segment/cfbi.py:85-108": "Segmentation Head Initialization", + "/paddlevideo/modeling/framework/segment/utils.py": "PaddleVideo Framework for Object Segment Matching", + "/paddlevideo/modeling/framework/segment/utils.py:1-31": "Foreground to Background Distance Conversion", + "/paddlevideo/modeling/framework/segment/utils.py:114-138": "Query-Reference Frame Feature Computation", + "/paddlevideo/modeling/framework/segment/utils.py:139-167": "Global Matching with Embedding Chunks", + "/paddlevideo/modeling/framework/segment/utils.py:168-186": "Nearest Neighbor Distance Calculator", + "/paddlevideo/modeling/framework/segment/utils.py:187-209": "Spatial Pyramid Pooling Point Padding", + "/paddlevideo/modeling/framework/segment/utils.py:210-230": "Segmentation Method in PaddleVideo Library", + 
"/paddlevideo/modeling/framework/segment/utils.py:231-257": "Nearest Neighbor Video Segment Matching", + "/paddlevideo/modeling/framework/segment/utils.py:258-277": "Distance to Nearest Neighbor Calculator", + "/paddlevideo/modeling/framework/segment/utils.py:278-299": "Atrous Tensor Matching Function", + "/paddlevideo/modeling/framework/segment/utils.py:300-318": "Image Segmentation Atrous Rate Code", + "/paddlevideo/modeling/framework/segment/utils.py:32-68": "Pairwise L2 Distance Matrix Calculator", + "/paddlevideo/modeling/framework/segment/utils.py:320-338": "Concatenate and Pad Embeddings", + "/paddlevideo/modeling/framework/segment/utils.py:339-356": "Embedding Reshaper", + "/paddlevideo/modeling/framework/segment/utils.py:357-375": "Atrous Spatial Pyramid Pooling Padder", + "/paddlevideo/modeling/framework/segment/utils.py:376-394": "Flattened Embeddings Conversion", + "/paddlevideo/modeling/framework/segment/utils.py:395-415": "Feature Selection and Reshaping for Segment Matching", + "/paddlevideo/modeling/framework/segment/utils.py:416-442": "Pairwise L2 Distance Nearest Neighbor Features", + "/paddlevideo/modeling/framework/segment/utils.py:443-464": "Downsampling with Bilinear Interpolation", + "/paddlevideo/modeling/framework/segment/utils.py:465-492": "Atrous Dilation Pairwise Distance Calculation", + "/paddlevideo/modeling/framework/segment/utils.py:493-513": "Pairwise L2 Distance Compute Function", + "/paddlevideo/modeling/framework/segment/utils.py:514-537": "Downsizing and Padding Tensors", + "/paddlevideo/modeling/framework/segment/utils.py:539-566": "Distance Calculator for Frame Embeddings", + "/paddlevideo/modeling/framework/segment/utils.py:567-584": "Nearest Neighbor Video Segmentation", + "/paddlevideo/modeling/framework/segment/utils.py:585-609": "Parallel Nearest Neighbor Calculation", + "/paddlevideo/modeling/framework/segment/utils.py:610-635": "Pairwise Distance Calculator", + "/paddlevideo/modeling/framework/segment/utils.py:637-662": "Atrous Spatial Pyramid Pooling in PaddlePaddle", + "/paddlevideo/modeling/framework/segment/utils.py:663-684": "Image Segmentation with Distance Matrix", + "/paddlevideo/modeling/framework/segment/utils.py:685-709": "Attention Heads Calculator", + "/paddlevideo/modeling/framework/segment/utils.py:69-92": "Pairwise Distance Computation and Feature Extraction", + "/paddlevideo/modeling/framework/segment/utils.py:710-736": "Attention Head Evaluation", + "/paddlevideo/modeling/framework/segment/utils.py:737-754": "Total Head Calculation with Stability", + "/paddlevideo/modeling/framework/segment/utils.py:93-113": "Nearest Neighbor Features Calculator", + "/paddlevideo/modeling/framework/segmenters/__init__.py": "PaddleVideo Segmenter Modules", + "/paddlevideo/modeling/framework/segmenters/asrf.py": "ASRF Segmentation Model in PaddleVideo", + "/paddlevideo/modeling/framework/segmenters/asrf.py:1-33": "ASRF: PaddleVideo Segmenter Model", + "/paddlevideo/modeling/framework/segmenters/asrf.py:101-129": "ASRF Segmentation Model Inference", + "/paddlevideo/modeling/framework/segmenters/asrf.py:130-143": "Forward Pass and Sigmoid Application", + "/paddlevideo/modeling/framework/segmenters/asrf.py:34-67": "Segmentation Model Training Code", + "/paddlevideo/modeling/framework/segmenters/asrf.py:69-100": "ASRF Model Validation Step", + "/paddlevideo/modeling/framework/segmenters/base.py": "BaseSegmenter: Foundation for PaddleVideo Segmenters", + "/paddlevideo/modeling/framework/segmenters/base.py:1-30": "BaseSegmenter: Foundation for All 
Segmenters", + "/paddlevideo/modeling/framework/segmenters/base.py:100-100": "NotImplementedError in Base Segmenter", + "/paddlevideo/modeling/framework/segmenters/base.py:32-63": "Segmenter Base Class Init", + "/paddlevideo/modeling/framework/segmenters/base.py:64-99": "Trainable Segmenter Base Class", + "/paddlevideo/modeling/framework/segmenters/ms_tcn.py": "MS-TCN Video Segmentation Tool", + "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:1-33": "MS-TCN Video Segmenter", + "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:34-70": "MS-TCN Segmenter Training and Validation", + "/paddlevideo/modeling/framework/segmenters/ms_tcn.py:72-101": "MS-TCN Model: Train, Test, and Infer Functions", + "/paddlevideo/modeling/framework/segmenters/utils.py": "Gaussian Smoothing in PaddlePaddle", + "/paddlevideo/modeling/framework/segmenters/utils.py:1-30": "Gaussian Smoothing in PaddlePaddle", + "/paddlevideo/modeling/framework/segmenters/utils.py:149-176": "Boundary-Based Action Segmentation", + "/paddlevideo/modeling/framework/segmenters/utils.py:177-203": "Majority Class Action Segmentation", + "/paddlevideo/modeling/framework/segmenters/utils.py:204-242": "Smoothing and Relabeling Functions", + "/paddlevideo/modeling/framework/segmenters/utils.py:243-270": "ASRF Post-Processing for Action Segmentation", + "/paddlevideo/modeling/framework/segmenters/utils.py:271-301": "Tensor Fan-In/Out and Refinement Function", + "/paddlevideo/modeling/framework/segmenters/utils.py:302-335": "Neural Network Weight Initialization Code", + "/paddlevideo/modeling/framework/segmenters/utils.py:31-62": "Gaussian Kernel Initialization and Application", + "/paddlevideo/modeling/framework/segmenters/utils.py:336-343": "Initialize Weights and Biases for Neural Network Layer", + "/paddlevideo/modeling/framework/segmenters/utils.py:63-95": "1D Convolution and Argrelmax Functions for Image Processing", + "/paddlevideo/modeling/framework/segmenters/utils.py:97-146": "Tensor Conversion Functions", + "/paddlevideo/modeling/heads/__init__.py": "Importing Video Heads from PaddleVideo", + "/paddlevideo/modeling/heads/__init__.py:1-25": "Importing PaddleVideo Heads", + "/paddlevideo/modeling/heads/__init__.py:26-49": "Versatile Video Heads Import", + "/paddlevideo/modeling/heads/adds_head.py": "AddsHead: Object Detection in PaddleVideo", + "/paddlevideo/modeling/heads/adds_head.py:1-33": "AddsHead Class Definition", + "/paddlevideo/modeling/heads/adds_head.py:118-144": "Error Metrics in Depth Prediction", + "/paddlevideo/modeling/heads/adds_head.py:146-146": "Metrics for Regression Models", + "/paddlevideo/modeling/heads/adds_head.py:34-62": "AddsHead: Initialization and Forward Pass", + "/paddlevideo/modeling/heads/adds_head.py:63-95": "AddsHead: Compute Error Metrics", + "/paddlevideo/modeling/heads/adds_head.py:96-117": "Multi-GPU Tensor Averaging", + "/paddlevideo/modeling/heads/agcn2s_head.py": "AGCN2s Head: PaddleVideo's Versatile Model Component", + "/paddlevideo/modeling/heads/agcn2s_head.py:1-32": "AGCN2s Head Class in PaddleVideo", + "/paddlevideo/modeling/heads/agcn2s_head.py:33-56": "Agcn2sHead: Initialize Linear Layer and Reshape", + "/paddlevideo/modeling/heads/agcn2s_head.py:57-59": "Average-Then-FC Aggregation", + "/paddlevideo/modeling/heads/asrf_head.py": "ASRF Head: Action Recognition and Metrics", + "/paddlevideo/modeling/heads/asrf_head.py:1-32": "ASRF Head: PaddleVideo Modeling", + "/paddlevideo/modeling/heads/asrf_head.py:100-136": "ASRF Head and F1 Score Calculation", + 
"/paddlevideo/modeling/heads/asrf_head.py:137-170": "ASRF Head: Labels and Levenshtein Distance", + "/paddlevideo/modeling/heads/asrf_head.py:171-200": "Edit Score Calculation with Levenshtein Distance", + "/paddlevideo/modeling/heads/asrf_head.py:201-212": "ASRF Head: Calculating Metrics", + "/paddlevideo/modeling/heads/asrf_head.py:34-63": "ASRF Head Model Initialization", + "/paddlevideo/modeling/heads/asrf_head.py:64-98": "ASRF Head Model: Forward Pass and Weights Init", + "/paddlevideo/modeling/heads/attention_lstm_head.py": "LSTM Attention Mechanism for PaddleVideo", + "/paddlevideo/modeling/heads/attention_lstm_head.py:1-32": "Attention LSTM Head: PaddleVideo's Neural Network Component", + "/paddlevideo/modeling/heads/attention_lstm_head.py:121-144": "Attention LSTM Sequence Modeling Head", + "/paddlevideo/modeling/heads/attention_lstm_head.py:145-173": "Attention LSTM Head Metrics", + "/paddlevideo/modeling/heads/attention_lstm_head.py:174-195": "Bidirectional LSTM Attention Mechanism for Multimodal Fusion", + "/paddlevideo/modeling/heads/attention_lstm_head.py:196-221": "Attention-based LSTM Head in PaddleVideo", + "/paddlevideo/modeling/heads/attention_lstm_head.py:222-244": "Bi-directional LSTM Attention Head", + "/paddlevideo/modeling/heads/attention_lstm_head.py:245-267": "LSTM-based Attention Pooling for Neural Networks", + "/paddlevideo/modeling/heads/attention_lstm_head.py:268-288": "LSTM Attention Head with Loss and Metrics", + "/paddlevideo/modeling/heads/attention_lstm_head.py:33-53": "Bi-directional LSTM Attention Head for Video Classification", + "/paddlevideo/modeling/heads/attention_lstm_head.py:54-74": "Bidirectional LSTM Attention Head", + "/paddlevideo/modeling/heads/attention_lstm_head.py:75-95": "Attention LSTM Head in PaddleVideo", + "/paddlevideo/modeling/heads/attention_lstm_head.py:96-120": "Attention LSTM Head", + "/paddlevideo/modeling/heads/base.py": "PaddleVideo Classification Head: Versatile, Distributed", + "/paddlevideo/modeling/heads/base.py:1-34": "Base Head Initializer: Initialize Weights for Subclasses", + "/paddlevideo/modeling/heads/base.py:114-143": "Classification Loss Function", + "/paddlevideo/modeling/heads/base.py:144-164": "Uniform Hard/Soft Loss Calculation", + "/paddlevideo/modeling/heads/base.py:165-178": "Average Metrics Across Devices", + "/paddlevideo/modeling/heads/base.py:35-65": "PaddleVideo Classification Head Base", + "/paddlevideo/modeling/heads/base.py:67-91": "Loss, Accuracy Calculator", + "/paddlevideo/modeling/heads/base.py:92-113": "Mix-up Loss for MRI Classification", + "/paddlevideo/modeling/heads/bbox_head.py": "BBoxHeadAVA: Box Detection and Evaluation", + "/paddlevideo/modeling/heads/bbox_head.py:1-32": "BBoxHeadAVA: Simple RoI Head with Pooling Options", + "/paddlevideo/modeling/heads/bbox_head.py:107-126": "BBox Head Generator in PaddleVideo", + "/paddlevideo/modeling/heads/bbox_head.py:128-152": "PaddleVideo Bbox Head Labeling and Comparison", + "/paddlevideo/modeling/heads/bbox_head.py:153-171": "Multi-Label Recall and Precision Calculation", + "/paddlevideo/modeling/heads/bbox_head.py:172-195": "BBox Head: Recall and Precision Calculation", + "/paddlevideo/modeling/heads/bbox_head.py:196-218": "Bounding Box Heads: Accuracy and Loss Calculation", + "/paddlevideo/modeling/heads/bbox_head.py:219-225": "Calculating BBox Scores in PaddleVideo Model", + "/paddlevideo/modeling/heads/bbox_head.py:33-61": "Class BBoxHeadAVA Initialization", + "/paddlevideo/modeling/heads/bbox_head.py:62-83": "BBoxHead Model 
Initialization", + "/paddlevideo/modeling/heads/bbox_head.py:85-106": "Bbox Head Classification and Debug Image Init", + "/paddlevideo/modeling/heads/cfbi_head.py": "Multi-Input Collaborative Ensembler Network", + "/paddlevideo/modeling/heads/cfbi_head.py:1-32": "IA_Gate Layer Class Definition", + "/paddlevideo/modeling/heads/cfbi_head.py:120-160": "Convolutional Feature Fusion Head (CFBI)", + "/paddlevideo/modeling/heads/cfbi_head.py:161-193": "Convolutional Feature Fusion Block and ASPP Module", + "/paddlevideo/modeling/heads/cfbi_head.py:195-218": "ASPP Modules with Global Pooling in CFBI Head", + "/paddlevideo/modeling/heads/cfbi_head.py:220-251": "CFBI Head: Deep Feature Extraction and Aggregation", + "/paddlevideo/modeling/heads/cfbi_head.py:254-279": "CollaborativeEnsemblerMS Class in PaddleVideo", + "/paddlevideo/modeling/heads/cfbi_head.py:281-306": "Multi-Stage Transformer Layer Initialization", + "/paddlevideo/modeling/heads/cfbi_head.py:308-332": "Feature Extraction and Fusion Model Components", + "/paddlevideo/modeling/heads/cfbi_head.py:33-65": "GCT Layer Definition and Initialization", + "/paddlevideo/modeling/heads/cfbi_head.py:333-360": "Neural Network Architecture for CV Task", + "/paddlevideo/modeling/heads/cfbi_head.py:361-401": "Instance Segmentation Network Architecture with ASPP Module", + "/paddlevideo/modeling/heads/cfbi_head.py:402-433": "Convolutional Feature Binding IA Head", + "/paddlevideo/modeling/heads/cfbi_head.py:435-448": "Augmented Background Logit Fusion", + "/paddlevideo/modeling/heads/cfbi_head.py:66-95": "PaddleVideo's CFBI Head", + "/paddlevideo/modeling/heads/cfbi_head.py:96-119": "CFBI Head: BatchNorm-ConvNet with ReLU", + "/paddlevideo/modeling/heads/ctrgcn_head.py": "CTR-GCN Neural Network Head", + "/paddlevideo/modeling/heads/ctrgcn_head.py:1-32": "CTR-GCN Head in PaddleVideo", + "/paddlevideo/modeling/heads/ctrgcn_head.py:34-63": "Neural Network Head Constructor with Dropout", + "/paddlevideo/modeling/heads/ctrgcn_head.py:65-65": "FC Layer in CTRGCN Head", + "/paddlevideo/modeling/heads/i3d_head.py": "I3D Classification Head in PaddleVideo", + "/paddlevideo/modeling/heads/i3d_head.py:1-31": "I3D Head: Classification for PaddleVideo", + "/paddlevideo/modeling/heads/i3d_head.py:32-59": "I3D Head Class Constructor", + "/paddlevideo/modeling/heads/i3d_head.py:60-91": "I3D Head: Feature Processing and Classification", + "/paddlevideo/modeling/heads/i3d_head.py:92-95": "Output Layer for PaddleVideo Classification", + "/paddlevideo/modeling/heads/movinet_head.py": "MoViNetHead: Custom Head for Video Classification", + "/paddlevideo/modeling/heads/ms_tcn_head.py": "MS-TCN Head: Loss Calculation", + "/paddlevideo/modeling/heads/ms_tcn_head.py:1-33": "MS-TCN Head: CrossEntropy and MSE Losses", + "/paddlevideo/modeling/heads/ms_tcn_head.py:106-137": "Edit Score Calculation Functions", + "/paddlevideo/modeling/heads/ms_tcn_head.py:138-165": "F-score Calculator for Labeled Sequences", + "/paddlevideo/modeling/heads/ms_tcn_head.py:34-68": "MS-TCN Head: Loss and F1 Score Calculation", + "/paddlevideo/modeling/heads/ms_tcn_head.py:69-105": "F1 Score Calculation and Label Extraction Algorithm", + "/paddlevideo/modeling/heads/pptimesformer_head.py": "PaddlePaddle TimeSformer Head", + "/paddlevideo/modeling/heads/pptimesformer_head.py:1-30": "Introducing ppTimeSformerHead Class", + "/paddlevideo/modeling/heads/pptimesformer_head.py:31-58": "PPTimesformerHead: Paddle Video Model Class", + "/paddlevideo/modeling/heads/pptimesformer_head.py:59-74": "PPTimesformer 
Head Definition", + "/paddlevideo/modeling/heads/pptsm_head.py": "PaddlePaddle Video: PPTSMHead Initialization", + "/paddlevideo/modeling/heads/pptsm_head.py:1-31": "ppTSMHead: PaddleVideo Registry Class", + "/paddlevideo/modeling/heads/pptsm_head.py:32-58": "PPTSM Head Initialization", + "/paddlevideo/modeling/heads/pptsm_head.py:59-87": "PPTSM Head Initialization", + "/paddlevideo/modeling/heads/pptsm_head.py:88-92": "PaddleVideo's PptsmHead FC Function", + "/paddlevideo/modeling/heads/pptsn_head.py": "PaddlePaddle PP-TSN Head Classification", + "/paddlevideo/modeling/heads/pptsn_head.py:1-30": "Python PP-TSN Head Implementation", + "/paddlevideo/modeling/heads/pptsn_head.py:31-54": "Adaptive Pooling PPTSN Head", + "/paddlevideo/modeling/heads/pptsn_head.py:56-84": "PaddlePaddle Classification Head Code", + "/paddlevideo/modeling/heads/pptsn_head.py:85-103": "PPTSN Head Processing", + "/paddlevideo/modeling/heads/roi_extractor.py": "RoIAlign: Region Feature Alignment", + "/paddlevideo/modeling/heads/roi_extractor.py:1-31": "RoIAlign: Feature Alignment Tool", + "/paddlevideo/modeling/heads/roi_extractor.py:32-53": "ROI Alignment with PaddlePaddle", + "/paddlevideo/modeling/heads/roi_head.py": "ROI Head for Object Detection", + "/paddlevideo/modeling/heads/roi_head.py:1-29": "Bounding Box to Detection Results Converter", + "/paddlevideo/modeling/heads/roi_head.py:115-134": "ROI Head: Bbox Loss Calculation and Assignment", + "/paddlevideo/modeling/heads/roi_head.py:135-158": "RoI Head BBox Prediction Functions", + "/paddlevideo/modeling/heads/roi_head.py:159-177": "Detect Bboxes Without Augmentation", + "/paddlevideo/modeling/heads/roi_head.py:30-59": "NMS-Based Bounding Box Filtering", + "/paddlevideo/modeling/heads/roi_head.py:60-93": "PaddlePaddle RoI Head Class", + "/paddlevideo/modeling/heads/roi_head.py:94-114": "Bbox Head Initialization and Feature Extraction", + "/paddlevideo/modeling/heads/single_straight3d.py": "Single Straight 3D ROI Extractor", + "/paddlevideo/modeling/heads/single_straight3d.py:1-28": "SingleRoIExtractor3D: RoI Extractor for 3D Features", + "/paddlevideo/modeling/heads/single_straight3d.py:29-55": "3D Head Feature Extraction", + "/paddlevideo/modeling/heads/single_straight3d.py:56-79": "Spatio-Temporal Feature Extraction and ROI Pooling", + "/paddlevideo/modeling/heads/slowfast_head.py": "SlowFast 3D Head Initialization", + "/paddlevideo/modeling/heads/slowfast_head.py:1-30": "SlowFast Head: PaddleVideo ResNeXt 3D Projection", + "/paddlevideo/modeling/heads/slowfast_head.py:114-137": "SlowFast Head: Pooling and Dropout Operations", + "/paddlevideo/modeling/heads/slowfast_head.py:31-56": "SlowFast_Head: Concatenating Multi-Pathway Classifier", + "/paddlevideo/modeling/heads/slowfast_head.py:57-83": "Initializing SlowFast Head Model Parameters", + "/paddlevideo/modeling/heads/slowfast_head.py:84-113": "SlowFast Head Model Initialization", + "/paddlevideo/modeling/heads/stgcn_head.py": "STGCN Head Initialization and Forward Pass", + "/paddlevideo/modeling/heads/stgcn_head.py:1-32": "STGCN Head: PaddlePaddle's Video Modeling Class", + "/paddlevideo/modeling/heads/stgcn_head.py:33-50": "Convolutional STGCN Head", + "/paddlevideo/modeling/heads/timesformer_head.py": "TimeSformer Head: TimeSformer's Model Head", + "/paddlevideo/modeling/heads/timesformer_head.py:1-29": "TimeSformer Head Class", + "/paddlevideo/modeling/heads/timesformer_head.py:30-60": "TimeSformer Head: PaddlePaddle's Dynamic Initialization", + "/paddlevideo/modeling/heads/timesformer_head.py:61-70": 
"Fully Connected Layer with Dropout Clarification", + "/paddlevideo/modeling/heads/token_shift_head.py": "TokenShiftHead: Paddle's Classification Framework", + "/paddlevideo/modeling/heads/token_shift_head.py:1-30": "TokenShiftHead: Transformer Classification Task Head", + "/paddlevideo/modeling/heads/token_shift_head.py:31-60": "Initializing Token Shift Head Parameters", + "/paddlevideo/modeling/heads/token_shift_head.py:61-79": "TokenShiftHead: Classification Scores for Each Frame", + "/paddlevideo/modeling/heads/transnetv2_head.py": "TransNetV2Head: Loss and F1 Score in Computer Vision", + "/paddlevideo/modeling/heads/transnetv2_head.py:1-29": "TransNetV2Head: CV Model Base Class", + "/paddlevideo/modeling/heads/transnetv2_head.py:30-45": "TransnetV2 Head: Loss and F1 Score Calculation", + "/paddlevideo/modeling/heads/tsm_head.py": "TSM Head: PaddleVideo's Temporal Segment Network", + "/paddlevideo/modeling/heads/tsm_head.py:1-33": "TSM Head Class", + "/paddlevideo/modeling/heads/tsm_head.py:34-57": "TSM Head: PyTorch Class Initialization", + "/paddlevideo/modeling/heads/tsm_head.py:58-89": "TSM Head Initialization", + "/paddlevideo/modeling/heads/tsm_head.py:90-99": "Temporal Segment Network Head Score Averaging", + "/paddlevideo/modeling/heads/tsn_head.py": "TSN Head: Image Classification in PaddleVideo", + "/paddlevideo/modeling/heads/tsn_head.py:1-30": "TSN Head: Image Classification Model", + "/paddlevideo/modeling/heads/tsn_head.py:31-63": "TSN Head Initialization", + "/paddlevideo/modeling/heads/tsn_head.py:64-93": "TSN Head: Average Pooling and Classification", + "/paddlevideo/modeling/losses/__init__.py": "Comprehensive Losses for PaddleVideo", + "/paddlevideo/modeling/losses/__init__.py:1-26": "Extensive Loss Functions for PaddleVideo", + "/paddlevideo/modeling/losses/__init__.py:27-29": "Loss Functions for PaddleVideo", + "/paddlevideo/modeling/losses/actbert_loss.py": "ActBert Loss Functions", + "/paddlevideo/modeling/losses/actbert_loss.py:1-32": "ActBertLoss: Custom Loss for ActBert Model", + "/paddlevideo/modeling/losses/actbert_loss.py:33-50": "ActBert Loss: Visual Classification with KLDivLoss", + "/paddlevideo/modeling/losses/actbert_loss.py:51-75": "Multi-Loss Calculation in ActBERT Model", + "/paddlevideo/modeling/losses/asrf_loss.py": "Custom Loss Functions for Video Modeling", + "/paddlevideo/modeling/losses/asrf_loss.py:1-32": "TMSE Loss: Temporal MSE for Action Segmentation", + "/paddlevideo/modeling/losses/asrf_loss.py:128-167": "Action Segmentation Loss: Flexible Implementations", + "/paddlevideo/modeling/losses/asrf_loss.py:168-198": "Initialize Loss Functions and Weights", + "/paddlevideo/modeling/losses/asrf_loss.py:200-221": "ASRF Loss: CrossEntropy with Class Weights", + "/paddlevideo/modeling/losses/asrf_loss.py:222-248": "Adjustable Sensitivity Ranking Fusion Loss", + "/paddlevideo/modeling/losses/asrf_loss.py:250-291": "Boundary Regression Loss Function Combination", + "/paddlevideo/modeling/losses/asrf_loss.py:292-321": "Positive Weight Calculator", + "/paddlevideo/modeling/losses/asrf_loss.py:322-359": "Multicriterion ASR Loss Function", + "/paddlevideo/modeling/losses/asrf_loss.py:33-66": "ASRF and Temporal MSE Loss Functions", + "/paddlevideo/modeling/losses/asrf_loss.py:360-373": "Initialize ActionSegmentationLoss Object", + "/paddlevideo/modeling/losses/asrf_loss.py:374-401": "Custom Loss Function for Video Modeling Framework", + "/paddlevideo/modeling/losses/asrf_loss.py:67-92": "Gaussian-weighted MSE Loss in Paddle", + 
"/paddlevideo/modeling/losses/asrf_loss.py:94-126": "ASRF and Focal Loss Calculations", + "/paddlevideo/modeling/losses/base.py": "PaddlePaddle Loss Base Class", + "/paddlevideo/modeling/losses/base.py:1-31": "Base Loss Function in PaddlePaddle", + "/paddlevideo/modeling/losses/base.py:32-49": "Weighted Loss Initialization and Forward Pass", + "/paddlevideo/modeling/losses/bmn_loss.py": "BMN Loss for PaddleVideo", + "/paddlevideo/modeling/losses/bmn_loss.py:1-32": "BMN Loss Function for PaddleVideo", + "/paddlevideo/modeling/losses/bmn_loss.py:102-126": "BMN Loss Calculation", + "/paddlevideo/modeling/losses/bmn_loss.py:127-147": "Forward Function: BMN Loss Calculation", + "/paddlevideo/modeling/losses/bmn_loss.py:149-155": "BMN Loss Calculation: PEM & TEAM Detection", + "/paddlevideo/modeling/losses/bmn_loss.py:33-55": "Binary Mask Network Loss", + "/paddlevideo/modeling/losses/bmn_loss.py:56-77": "Bi-directional Masked Object Detection Loss", + "/paddlevideo/modeling/losses/bmn_loss.py:78-101": "Uniform Mask Multiplication and Ratio Calculation", + "/paddlevideo/modeling/losses/cross_entropy_loss.py": "CrossEntropy Loss Function in PaddlePaddle", + "/paddlevideo/modeling/losses/cross_entropy_loss.py:1-30": "Custom Cross Entropy Loss in PaddlePaddle", + "/paddlevideo/modeling/losses/cross_entropy_loss.py:31-36": "Calculate CrossEntropy Loss in Paddle", + "/paddlevideo/modeling/losses/depth_loss.py": "Depth Loss Calculation for PaddleVideo", + "/paddlevideo/modeling/losses/depth_loss.py:1-29": "Smoothness Loss Function", + "/paddlevideo/modeling/losses/depth_loss.py:106-137": "SSIM Loss Calculation in ADDSLoss", + "/paddlevideo/modeling/losses/depth_loss.py:138-173": "Scale-Based Depth Loss Calculation", + "/paddlevideo/modeling/losses/depth_loss.py:174-197": "Depth Loss Computation Algorithm", + "/paddlevideo/modeling/losses/depth_loss.py:199-223": "Depth Loss Calculation", + "/paddlevideo/modeling/losses/depth_loss.py:225-250": "Computing Day-Night Losses for Video", + "/paddlevideo/modeling/losses/depth_loss.py:251-276": "Depth and Reconstruction Losses in Day-Night Scenes", + "/paddlevideo/modeling/losses/depth_loss.py:278-290": "Depth Loss Update", + "/paddlevideo/modeling/losses/depth_loss.py:30-67": "Depth Loss: DiffLoss and MSE for Disparity Estimation", + "/paddlevideo/modeling/losses/depth_loss.py:68-104": "Structured Loss Functions for PaddlePaddle", + "/paddlevideo/modeling/losses/distillation_loss.py": "Distillation & KL Divergence Losses", + "/paddlevideo/modeling/losses/distillation_loss.py:1-30": "Distillation Entropy Loss Class", + "/paddlevideo/modeling/losses/distillation_loss.py:31-60": "Distillation-Aware CrossEntropy Loss with Weighted Average", + "/paddlevideo/modeling/losses/distillation_loss.py:61-79": "Kullback-Leibler Divergence Loss Class", + "/paddlevideo/modeling/losses/transnetv2_loss.py": "TransNetV2 Loss Calculation", + "/paddlevideo/modeling/losses/transnetv2_loss.py:1-28": "TransNetV2 Loss Calculator", + "/paddlevideo/modeling/losses/transnetv2_loss.py:30-54": "TransNetV2 Loss Function", + "/paddlevideo/modeling/losses/transnetv2_loss.py:56-56": "TransNetV2 Total Loss Calculation", + "/paddlevideo/modeling/losses/yowo_loss.py": "YOLO Loss Functions in PaddleVideo", + "/paddlevideo/modeling/losses/yowo_loss.py:1-31": "Focal Loss: Focusing on Hard Examples", + "/paddlevideo/modeling/losses/yowo_loss.py:113-137": "Sigmoid Transformation for YOLO Anchors" +} \ No newline at end of file diff --git a/docs/data/titles/13.json b/docs/data/titles/13.json new file 
mode 100644 index 000000000..cbc399552 --- /dev/null +++ b/docs/data/titles/13.json @@ -0,0 +1,302 @@ +{ + "/paddlevideo/modeling/losses/yowo_loss.py:138-155": "YOLOv5 Loss Assignment", + "/paddlevideo/modeling/losses/yowo_loss.py:156-169": "Prepare Data for Object Detection Training", + "/paddlevideo/modeling/losses/yowo_loss.py:170-181": "Anchor Width-Height Assignments for YOWO Loss", + "/paddlevideo/modeling/losses/yowo_loss.py:183-199": "YOLOv3 Loss Calculation in PaddleVideo", + "/paddlevideo/modeling/losses/yowo_loss.py:199-210": "YOLO Loss Setup", + "/paddlevideo/modeling/losses/yowo_loss.py:211-237": "GPU Variables Loss Calculation", + "/paddlevideo/modeling/losses/yowo_loss.py:238-249": "YOWO Loss: Coordinate and Classification", + "/paddlevideo/modeling/losses/yowo_loss.py:33-55": "Focal Loss with Alpha, Gamma, and Size Average", + "/paddlevideo/modeling/losses/yowo_loss.py:56-87": "Yowo Loss Function: GPU Optimized and Customizable", + "/paddlevideo/modeling/losses/yowo_loss.py:88-112": "Region Loss with Focal Loss and Threshold", + "/paddlevideo/modeling/registry.py": "Efficient Model Registry Organization", + "/paddlevideo/modeling/registry.py:1-27": "Model Registry Organization", + "/paddlevideo/modeling/registry.py:28-31": "Model Registries for Paddle Video", + "/paddlevideo/modeling/samplers/__init__.py": "Importing RandomSampler Class and Licensing Information", + "/paddlevideo/modeling/samplers/random_sampler.py": "Random Sampler for Bbox Sampling", + "/paddlevideo/modeling/samplers/random_sampler.py:1-28": "Random Sampling Class Definition", + "/paddlevideo/modeling/samplers/random_sampler.py:115-139": "Random Sampler: Positive and Negative Sample Selection", + "/paddlevideo/modeling/samplers/random_sampler.py:140-146": "Zero-Check Random Sampler", + "/paddlevideo/modeling/samplers/random_sampler.py:29-55": "Initializing Sampler Bounding Boxes", + "/paddlevideo/modeling/samplers/random_sampler.py:56-92": "RandomSampler: Randomly Sampling Bounding Boxes", + "/paddlevideo/modeling/samplers/random_sampler.py:93-114": "Random Sampler for Imbalanced Classes", + "/paddlevideo/modeling/weight_init.py": "Weight Initialization in PaddlePaddle", + "/paddlevideo/modeling/weight_init.py:1-36": "Weight Initialization for PaddlePaddle Layers", + "/paddlevideo/modeling/weight_init.py:131-156": "Neural Network Weight Initialization", + "/paddlevideo/modeling/weight_init.py:157-157": "Initialize Tensor with Values", + "/paddlevideo/modeling/weight_init.py:37-66": "Truncated Normal Weight Initialization", + "/paddlevideo/modeling/weight_init.py:68-98": "Truncated Gaussian Tensor Weight Init", + "/paddlevideo/modeling/weight_init.py:99-130": "Convolutional Layer Weight Initialization", + "/paddlevideo/solver/__init__.py": "Solver Package Imports", + "/paddlevideo/solver/custom_lr.py": "Custom Learning Rate Schedulers for PaddleVideo", + "/paddlevideo/solver/custom_lr.py:1-31": "Custom Warmup-Cosine Decay LR Scheduler", + "/paddlevideo/solver/custom_lr.py:107-133": "Customizable Piecewise Decay Learning Rate Scheduler", + "/paddlevideo/solver/custom_lr.py:134-158": "Custom Learning Rate Scheduler", + "/paddlevideo/solver/custom_lr.py:159-188": "Warmup Custom Learning Rate Policy", + "/paddlevideo/solver/custom_lr.py:189-222": "Customizable Warmup Cosine Decay Learning Rate Scheduler", + "/paddlevideo/solver/custom_lr.py:223-249": "Custom Learning Rate Scheduler", + "/paddlevideo/solver/custom_lr.py:251-282": "Custom Learning Rate Scheduler", + "/paddlevideo/solver/custom_lr.py:283-305": 
"Custom Learning Rate Scheduler", + "/paddlevideo/solver/custom_lr.py:306-332": "Custom Warmup Adjust Decay Scheduler", + "/paddlevideo/solver/custom_lr.py:32-54": "Cosine Annealing Learning Rate Scheduler", + "/paddlevideo/solver/custom_lr.py:333-338": "Custom Warmup Learning Rate", + "/paddlevideo/solver/custom_lr.py:55-80": "Custom Learning Rate Scheduler for PaddleVideo", + "/paddlevideo/solver/custom_lr.py:81-106": "Custom Learning Rate Scheduler with Warmup and Decay", + "/paddlevideo/solver/lr.py": "Learning Rate Scheduler Builder", + "/paddlevideo/solver/lr.py:1-28": "Learning Rate Scheduler Builder", + "/paddlevideo/solver/lr.py:30-52": "Custom Learning Rate Scheduler", + "/paddlevideo/solver/optimizer.py": "Python Optimizer Configurations", + "/paddlevideo/solver/optimizer.py:1-31": "Building PaddleVideo's Optimizer", + "/paddlevideo/solver/optimizer.py:110-133": "Multi-Precision Learning Rate Scheduler", + "/paddlevideo/solver/optimizer.py:134-136": "Optimizer Factory Function", + "/paddlevideo/solver/optimizer.py:32-63": "Optimizer Configuration and Learning Rate Scheduler", + "/paddlevideo/solver/optimizer.py:64-85": "AMP-Aware Optimizer Function", + "/paddlevideo/solver/optimizer.py:86-109": "L1-L2 Weight Decay Optimizer Config", + "/paddlevideo/tasks/__init__.py": "PaddleVideo Tasks Initialization", + "/paddlevideo/tasks/test.py": "Parallel Testing with PaddlePaddle", + "/paddlevideo/tasks/test.py:1-32": "Parallel PaddlePaddle Model Testing", + "/paddlevideo/tasks/test.py:34-61": "Model Initialization and Configuration", + "/paddlevideo/tasks/test.py:62-90": "Model Evaluation Loop", + "/paddlevideo/tasks/train.py": "Distributed Training with PaddlePaddle Fleet API", + "/paddlevideo/tasks/train.py:1-27": "Video Task Training Framework", + "/paddlevideo/tasks/train.py:125-150": "Training PaddleVideo with Datasets and Optimizers", + "/paddlevideo/tasks/train.py:151-172": "Training Mode Checker and Handler", + "/paddlevideo/tasks/train.py:173-204": "Efficient Model Training with Paddle's DataParallel", + "/paddlevideo/tasks/train.py:206-229": "Efficient AMP Training and Gradient Scaling", + "/paddlevideo/tasks/train.py:230-253": "Gradient Descent and Backward Pass in Train.py", + "/paddlevideo/tasks/train.py:254-277": "Gradient Clearing & Optimizer Progress", + "/paddlevideo/tasks/train.py:278-306": "PaddleVideo Model Training and Evaluation", + "/paddlevideo/tasks/train.py:28-51": "Training Model with PaddleVideo", + "/paddlevideo/tasks/train.py:307-330": "Training Model in PaddleVideo", + "/paddlevideo/tasks/train.py:331-351": "Evaluate Dataset and Log Performance Metrics", + "/paddlevideo/tasks/train.py:352-373": "Parallel Update: PreciseBN Accuracy Check", + "/paddlevideo/tasks/train.py:374-395": "Precise Batch Normalization and Validation in Deep Learning", + "/paddlevideo/tasks/train.py:396-417": "Saving Best Model and Metric Logging", + "/paddlevideo/tasks/train.py:418-426": "Periodic Model Saving", + "/paddlevideo/tasks/train.py:52-75": "Gradient Accumulation for Distributed PaddlePaddle Training", + "/paddlevideo/tasks/train.py:76-96": "Global Batch Size Configuration", + "/paddlevideo/tasks/train.py:97-124": "Static Model Conversion for Training and Validation", + "/paddlevideo/tasks/train_dali.py": "Train DALI with PaddleVideo", + "/paddlevideo/tasks/train_dali.py:1-25": "PaddleVideo: TSN-Dali Dataset Loading and Preparation", + "/paddlevideo/tasks/train_dali.py:117-141": "Train DALI: Batch Normalization and Saving Progress", + 
"/paddlevideo/tasks/train_dali.py:143-143": "Model Training Completion Logged", + "/paddlevideo/tasks/train_dali.py:26-63": "DALI Initialization and Training for TSN Model", + "/paddlevideo/tasks/train_dali.py:64-88": "Model Training Pipeline with Resume and Finetuning", + "/paddlevideo/tasks/train_dali.py:89-116": "Training Model with Backpropagation", + "/paddlevideo/tasks/train_multigrid.py": "Training Multigrid Models in PaddleVideo", + "/paddlevideo/tasks/train_multigrid.py:1-27": "Setting Up PaddleVideo Environment", + "/paddlevideo/tasks/train_multigrid.py:111-146": "Multigrid Training Initialization", + "/paddlevideo/tasks/train_multigrid.py:148-179": "Multigrid Model Training Setup", + "/paddlevideo/tasks/train_multigrid.py:181-210": "Multi-grid Training Optimizer Construction", + "/paddlevideo/tasks/train_multigrid.py:211-235": "Training Multigrid Models", + "/paddlevideo/tasks/train_multigrid.py:236-262": "Adaptive Learning Rate Optimization", + "/paddlevideo/tasks/train_multigrid.py:264-288": "Batch-wise Evaluation and Logging", + "/paddlevideo/tasks/train_multigrid.py:28-50": "Multigrid Data Loader Construction", + "/paddlevideo/tasks/train_multigrid.py:290-313": "Batch Normalization & Performance Logging", + "/paddlevideo/tasks/train_multigrid.py:314-335": "Automatic Model Saving and Evaluation in PaddleVideo", + "/paddlevideo/tasks/train_multigrid.py:51-77": "Adjust Batch Size for Multigrid Training", + "/paddlevideo/tasks/train_multigrid.py:78-110": "Training PaddleVideo Model with DataLoaders and Parallelization", + "/paddlevideo/utils/__init__.py": "PaddleVideo Utils: Imports, Build, Save & Load", + "/paddlevideo/utils/build_utils.py": "Build Utility Function", + "/paddlevideo/utils/config.py": "Config Management Utilities", + "/paddlevideo/utils/config.py:1-34": "Config Handling and Setup", + "/paddlevideo/utils/config.py:110-139": "Recursive Config Override Function", + "/paddlevideo/utils/config.py:140-170": "Config Utilities: Load, Update and Display", + "/paddlevideo/utils/config.py:171-174": "Verify and Print Config", + "/paddlevideo/utils/config.py:35-67": "Config Parsing and Dict Visualization Functions", + "/paddlevideo/utils/config.py:68-109": "Config Manipulation Functions", + "/paddlevideo/utils/dist_utils.py": "Distributed Computing Utilities", + "/paddlevideo/utils/logger.py": "Colorful Logging for PaddleVideo", + "/paddlevideo/utils/multigrid/__init__.py": "Multigrid Scheduler Imports", + "/paddlevideo/utils/multigrid/batchnorm_helper.py": "Batch Normalization for PyTorch Multigrid", + "/paddlevideo/utils/multigrid/batchnorm_helper.py:1-36": "Sub-BatchNorm Helper", + "/paddlevideo/utils/multigrid/batchnorm_helper.py:109-135": "Batch Normalization Helper Class", + "/paddlevideo/utils/multigrid/batchnorm_helper.py:136-142": "BatchNorm Multiplication and Normalization", + "/paddlevideo/utils/multigrid/batchnorm_helper.py:37-64": "Multi-Split Batch Normalization", + "/paddlevideo/utils/multigrid/batchnorm_helper.py:65-85": "BatchNorm Layer Initialization", + "/paddlevideo/utils/multigrid/batchnorm_helper.py:86-108": "BatchNorm3D Instantiation and Aggregation", + "/paddlevideo/utils/multigrid/interval_helper.py": "Multigrid Evaluation Function", + "/paddlevideo/utils/multigrid/multigrid.py": "Multigrid Schedule Management", + "/paddlevideo/utils/multigrid/multigrid.py:1-25": "Multigrid Scheduling Class Definition", + "/paddlevideo/utils/multigrid/multigrid.py:116-141": "Multi-Grid Training Schedule Calculator", + 
"/paddlevideo/utils/multigrid/multigrid.py:142-169": "Multigrid Training Schedule in PaddleVideo", + "/paddlevideo/utils/multigrid/multigrid.py:171-191": "Multigrid Iteration Calculator", + "/paddlevideo/utils/multigrid/multigrid.py:193-224": "Multigrid Learning Rate Scheduler", + "/paddlevideo/utils/multigrid/multigrid.py:225-233": "Schedule-Based Shape Iterator", + "/paddlevideo/utils/multigrid/multigrid.py:26-50": "Multi-Grid Training Schedule Initialization", + "/paddlevideo/utils/multigrid/multigrid.py:51-74": "Long Cycle Shape Update Function", + "/paddlevideo/utils/multigrid/multigrid.py:75-94": "Multigrid Configuration and Update Settings", + "/paddlevideo/utils/multigrid/multigrid.py:95-115": "Multigrid Configuration Checker", + "/paddlevideo/utils/multigrid/save_load_helper.py": "Ensuring State Dict Consistency in PaddleVideo", + "/paddlevideo/utils/multigrid/save_load_helper.py:1-31": "Converting Sub-BN to Normal BN Parameters", + "/paddlevideo/utils/multigrid/save_load_helper.py:104-135": "Compare Optimizer and Model Parameters", + "/paddlevideo/utils/multigrid/save_load_helper.py:136-163": "Update BN/Sub-BN Key Names", + "/paddlevideo/utils/multigrid/save_load_helper.py:164-190": "Save and Load Helper Functions", + "/paddlevideo/utils/multigrid/save_load_helper.py:191-216": "Checkpoint Loader and Shape Comparison", + "/paddlevideo/utils/multigrid/save_load_helper.py:217-237": "Loading Weights and Optimizer State: SaveLoadHelper", + "/paddlevideo/utils/multigrid/save_load_helper.py:32-58": "Sub-BN Conversion for Checkpoint Loading", + "/paddlevideo/utils/multigrid/save_load_helper.py:59-81": "Shape Comparison and Concatenation", + "/paddlevideo/utils/multigrid/save_load_helper.py:82-103": "Modify Optimizer State Dict Keys", + "/paddlevideo/utils/multigrid/short_sampler.py": "Efficient Distributed Video Data Loading", + "/paddlevideo/utils/multigrid/short_sampler.py:1-28": "Distributed ShortSampler for Dynamic Batch Sizing", + "/paddlevideo/utils/multigrid/short_sampler.py:103-130": "Dynamic Batch Sampler", + "/paddlevideo/utils/multigrid/short_sampler.py:131-146": "Efficient Video Sampler for PaddleVideo", + "/paddlevideo/utils/multigrid/short_sampler.py:29-51": "MultiGrid Initializer", + "/paddlevideo/utils/multigrid/short_sampler.py:52-79": "Multigrid Sampler Initialization", + "/paddlevideo/utils/multigrid/short_sampler.py:80-102": "Balanced Subsampling with Modulo Handling", + "/paddlevideo/utils/precise_bn.py": "Precise Batch Normalization Acceleration", + "/paddlevideo/utils/precise_bn.py:1-34": "Precise Batch Normalization: Accuracy and Efficiency Boost", + "/paddlevideo/utils/precise_bn.py:35-56": "Precise BN Stats Recomputation", + "/paddlevideo/utils/precise_bn.py:58-83": "Accurate Batch Normalization Update", + "/paddlevideo/utils/precise_bn.py:84-94": "Accurate Batch Normalization Update", + "/paddlevideo/utils/profiler.py": "PaddleVideo Profiler: Performance Analysis and Optimization", + "/paddlevideo/utils/profiler.py:1-29": "PaddleVideo Profiler Module Init", + "/paddlevideo/utils/profiler.py:106-128": "Profiler Object Initialization", + "/paddlevideo/utils/profiler.py:30-53": "Profiler Options Class", + "/paddlevideo/utils/profiler.py:54-77": "Python Profiler: Option Parser and Batch Range", + "/paddlevideo/utils/profiler.py:79-105": "Operator-Level Timing Profiler with PaddlePaddle", + "/paddlevideo/utils/record.py": "Efficient Training Metrics Recording", + "/paddlevideo/utils/record.py:1-32": "Record Builder and Logger Setup", + 
"/paddlevideo/utils/record.py:106-136": "Batch Logging and Metrics Calculation", + "/paddlevideo/utils/record.py:137-155": "Training Progress Logger", + "/paddlevideo/utils/record.py:157-168": "Mean Metric String Calculation and Formatting", + "/paddlevideo/utils/record.py:33-48": "Averaging Metrics in PaddleVideo Record", + "/paddlevideo/utils/record.py:49-65": "Conditional Metric Addition", + "/paddlevideo/utils/record.py:67-105": "Record Dictionary with AverageMeter Objects", + "/paddlevideo/utils/registry.py": "Registry Class for Object Mapping and Registration", + "/paddlevideo/utils/save_load.py": "Model Save and Load in PaddlePaddle", + "/paddlevideo/utils/save_load.py:1-30": "Swin Model Transfer in PaddleVideo", + "/paddlevideo/utils/save_load.py:106-126": "Adjusting Positional Embeddings for Patch Count", + "/paddlevideo/utils/save_load.py:127-147": "Loading Weights: Model Shape Check and Progress Bar", + "/paddlevideo/utils/save_load.py:148-172": "ResNet18 Weight Adaptation", + "/paddlevideo/utils/save_load.py:173-197": "Dynamic Weights Loading with Progress Updates", + "/paddlevideo/utils/save_load.py:198-226": "Load Pre-trained Model Parameters", + "/paddlevideo/utils/save_load.py:227-248": "Model Weights and Dictionary Loading", + "/paddlevideo/utils/save_load.py:249-282": "Save and Load Utilities", + "/paddlevideo/utils/save_load.py:283-289": "Save and Load Functions with Paddle", + "/paddlevideo/utils/save_load.py:31-61": "Ensuring Model State Consistency", + "/paddlevideo/utils/save_load.py:62-82": "Loading Weights for Position Bias", + "/paddlevideo/utils/save_load.py:83-105": "Model Parameter Transformation for ViT Models", + "/paddlevideo/version.py": "PaddleVideo Version Info", + "/run.sh": "PaddlePaddle: Train, Test, Export, Infer", + "/run.sh:1-18": "Distributed CUDA Training with 8 GPUs", + "/run.sh:20-36": "PaddlePaddle Video Recognition Training Script", + "/run.sh:38-54": "Distributed Deep Learning Training with PaddlePaddle", + "/run.sh:54-74": "Distributed Deep Learning Training and Testing Script", + "/run.sh:76-89": "PaddleVideo Test and Inference Guide", + "/setup.py": "PaddleVideo: Python Video Understanding Utility", + "/setup.py:1-31": "PaddleVideo Setup with Setuptools", + "/setup.py:32-53": "Setting Up PaddleVideo Package", + "/setup.py:54-56": "Python 3.7 Setup Metadata Classification", + "/test_tipc/README.md": "TIPC-Enabled PaddleVideo Tutorial", + "/test_tipc/README.md:114-126": "Clear and Consistent Naming Conventions for PaddleVideo", + "/test_tipc/README.md:127-133": "PaddleVideo Testing: Comprehensive Cases and Functionalities", + "/test_tipc/README.md:2-30": "PaddleVideo TIPC Overview", + "/test_tipc/README.md:31-55": "Test Tool for PaddleVideo: Supported Models and Configurations", + "/test_tipc/README.md:56-76": "Directory Structure and Testing Scripts of PaddleVideo test_tipc Project", + "/test_tipc/README.md:77-112": "Simplified TIPC Testing Process", + "/test_tipc/benchmark_train.sh": "PaddleVideo Benchmark Training", + "/test_tipc/benchmark_train.sh:1-42": "PaddlePaddle GPU Benchmark Training Script", + "/test_tipc/benchmark_train.sh:124-158": "Benchmark/Train Environment Modification", + "/test_tipc/benchmark_train.sh:159-197": "Benchmark Configuration Code Snippet", + "/test_tipc/benchmark_train.sh:198-220": "Batch Size and Precision Training: PaddleVideo Benchmark", + "/test_tipc/benchmark_train.sh:221-234": "Directory Creation and Logging Setup", + "/test_tipc/benchmark_train.sh:235-253": "Non-Profiled Script Execution", + 
"/test_tipc/benchmark_train.sh:255-274": "Python Log File Analysis Script", + "/test_tipc/benchmark_train.sh:275-288": "Speeding Up TimeSformer Training", + "/test_tipc/benchmark_train.sh:289-308": "Benchmark Training Script", + "/test_tipc/benchmark_train.sh:309-318": "Benchmark Training Iteration", + "/test_tipc/benchmark_train.sh:43-86": "Manipulating Configs for PaddleVideo", + "/test_tipc/benchmark_train.sh:87-123": "Training Model with Parameters", + "/test_tipc/common_func.sh": "Common Functions for Parsing and Status Checks", + "/test_tipc/common_func.sh:1-58": "Parameter Parsing and Status Functions", + "/test_tipc/common_func.sh:59-66": "Status Logging Function", + "/test_tipc/compare_results.py": "Log Parser and Comparer", + "/test_tipc/compare_results.py:1-40": "Command-line Parser and Shell Executor", + "/test_tipc/compare_results.py:119-146": "Validate Code Predictions with Ground Truth", + "/test_tipc/compare_results.py:147-170": "Compare and Validate Results", + "/test_tipc/compare_results.py:171-171": "Filename Formatting for Comparison", + "/test_tipc/compare_results.py:42-64": "Python/C++ Inference Result Parser", + "/test_tipc/compare_results.py:65-89": "Parse Log File Function", + "/test_tipc/compare_results.py:90-118": "Three Functions for Ground Truth Data Processing", + "/test_tipc/extract_loss.py": "Extract and Calculate Loss Expressions", + "/test_tipc/extract_loss.py:1-28": "Loss Expression Parser", + "/test_tipc/extract_loss.py:29-71": "Regular Expression Parsing and Validation Functions", + "/test_tipc/extract_loss.py:74-102": "Function for Tuples Calculation and Printing", + "/test_tipc/prepare.sh": "Preparing Video Detection Models in PaddlePaddle", + "/test_tipc/prepare.sh:1-44": "Prepare Environment for PaddlePaddle Video Object Detection", + "/test_tipc/prepare.sh:106-127": "Conditional Data Download and Extraction", + "/test_tipc/prepare.sh:128-149": "Model Weights and Data Preprocessing", + "/test_tipc/prepare.sh:150-168": "Pretraining with Whole Data", + "/test_tipc/prepare.sh:169-188": "Download Model Weights and Data", + "/test_tipc/prepare.sh:189-205": "TSM Data Preparation Script", + "/test_tipc/prepare.sh:206-223": "Model-Specific Data Preparation Script", + "/test_tipc/prepare.sh:224-241": "Preparing AttentionLSTM Model Environment", + "/test_tipc/prepare.sh:242-261": "Preparing Kinetics400 for PaddleVideo", + "/test_tipc/prepare.sh:262-285": "Handling Model Pretraining Scenarios", + "/test_tipc/prepare.sh:286-308": "Model-Specific Data Download and Preparation", + "/test_tipc/prepare.sh:309-329": "Model-Specific Pretrained File Downloads", + "/test_tipc/prepare.sh:329-345": "Prepare Dataset for AttentionLSTM Model", + "/test_tipc/prepare.sh:346-370": "Model-Based Actions in TIPC Preparation", + "/test_tipc/prepare.sh:371-385": "Model-Based Weights Download", + "/test_tipc/prepare.sh:386-406": "Script Downloads Pre-trained Model Weights", + "/test_tipc/prepare.sh:407-427": "Model Name Check and Download", + "/test_tipc/prepare.sh:428-446": "PaddleVideo Model Weights Download", + "/test_tipc/prepare.sh:447-468": "Model-Specific Data Downloads", + "/test_tipc/prepare.sh:45-67": "Conditional Download Tasks for Models and Datasets", + "/test_tipc/prepare.sh:469-497": "Model-Specific Data Preparation", + "/test_tipc/prepare.sh:498-520": "Prepare Inference Models", + "/test_tipc/prepare.sh:521-552": "Model Check and Download for TIPC", + "/test_tipc/prepare.sh:553-577": "Mode-Based Actions in TIPC Script", + "/test_tipc/prepare.sh:68-87": 
"Model-Specific Data Download Script", + "/test_tipc/prepare.sh:88-105": "Preparing Data and Weights for Models", + "/test_tipc/test_inference_cpp.sh": "PaddleVideo Inference Testing", + "/test_tipc/test_inference_cpp.sh:1-29": "Bash Script for C++ Inference Parser", + "/test_tipc/test_inference_cpp.sh:102-112": "Inference CPP Script Execution", + "/test_tipc/test_inference_cpp.sh:114-146": "Hardware Support and OpenCV Setup", + "/test_tipc/test_inference_cpp.sh:147-178": "Building PaddleVideo Libraries and Demo", + "/test_tipc/test_inference_cpp.sh:179-225": "Configuring PaddleVideo and Running Inference Tests", + "/test_tipc/test_inference_cpp.sh:226-228": "Executing C++ Inference Commands", + "/test_tipc/test_inference_cpp.sh:30-58": "PaddleVideo C++ Inference Setup", + "/test_tipc/test_inference_cpp.sh:59-72": "Skipping MKLDNN Quantized Tests", + "/test_tipc/test_inference_cpp.sh:73-85": "Inference Script Configuration and Execution", + "/test_tipc/test_inference_cpp.sh:86-101": "TRT Precision Combinations Test", + "/test_tipc/test_paddle2onnx.sh": "Automating Paddle2ONNX Conversion in test_tipc/test_paddle2onnx.sh", + "/test_tipc/test_paddle2onnx.sh:1-32": "Paddle2Onnx: Extracting Model Details from Log Files", + "/test_tipc/test_paddle2onnx.sh:33-58": "Setting Up Paddle2Onnx Inference", + "/test_tipc/test_paddle2onnx.sh:59-73": "Paddle2Onnx Conversion and Inference Logging", + "/test_tipc/test_paddle2onnx.sh:74-81": "Test: Export Count, IFS, and Echo Message", + "/test_tipc/test_ptq_inference_python.sh": "PaddleVideo GPU/CPU Inference Test", + "/test_tipc/test_ptq_inference_python.sh:1-29": "Python Shell Script for Model Inference", + "/test_tipc/test_ptq_inference_python.sh:113-129": "Model Export Preparation and Check", + "/test_tipc/test_ptq_inference_python.sh:130-132": "Python Inference Calling", + "/test_tipc/test_ptq_inference_python.sh:30-52": "Retrieving Config Values for Trainer and Inference", + "/test_tipc/test_ptq_inference_python.sh:55-74": "Python-Powered GPU/CPU Inference Logging", + "/test_tipc/test_ptq_inference_python.sh:74-88": "Looped GPU Inference Testing", + "/test_tipc/test_ptq_inference_python.sh:89-112": "Hardware-Optimized PaddleVideo Inference", + "/test_tipc/test_serving_infer_cpp.sh": "Streamline Bash Model Serving with GPU", + "/test_tipc/test_serving_infer_cpp.sh:1-28": "Custom Bash Script for Configuration and Image Classification", + "/test_tipc/test_serving_infer_cpp.sh:103-107": "Incrementing \"Count\" in Web Service Test", + "/test_tipc/test_serving_infer_cpp.sh:29-54": "Initialize Model and Config Files", + "/test_tipc/test_serving_infer_cpp.sh:55-73": "Setup C++ Server and Client on GPU", + "/test_tipc/test_serving_infer_cpp.sh:73-100": "PaddlePaddle Serving Server Test", + "/test_tipc/test_serving_infer_python.sh": "Automating Model Serving with Bash", + "/test_tipc/test_serving_infer_python.sh:1-29": "Bash Script Configures Model Inference Environment", + "/test_tipc/test_serving_infer_python.sh:30-54": "Model Serving Code Execution", + "/test_tipc/test_serving_infer_python.sh:56-77": "Automated Web Service Deployment with Python", + "/test_tipc/test_serving_infer_python.sh:78-105": "CUDA Test Environment Setup and Cleanup", + "/test_tipc/test_train_dy2static_python.sh": "Dygraph vs Dy2Static Model Comparison", + "/test_tipc/test_train_dy2static_python.sh:1-30": "Configure and Initialize Environment", + "/test_tipc/test_train_dy2static_python.sh:31-57": "Configure, Run and Analyze Dygraph and Dy2Static Models", + 
"/test_tipc/test_train_dy2static_python.sh:58-73": "Diff and Log Comparison of Models", + "/test_tipc/test_train_inference_python.sh": "PaddleVideo Model Optimizer", + "/test_tipc/test_train_inference_python.sh:1-30": "Parse Training Parameters", + "/test_tipc/test_train_inference_python.sh:105-125": "Configuration Extraction for Test and Train", + "/test_tipc/test_train_inference_python.sh:126-157": "Inference Code Configuration & Logging", + "/test_tipc/test_train_inference_python.sh:158-170": "Iterating Over Precision Values", + "/test_tipc/test_train_inference_python.sh:171-181": "Automating Test Loop with Python Script", + "/test_tipc/test_train_inference_python.sh:182-198": "Optimizing Inference Parameters", + "/test_tipc/test_train_inference_python.sh:200-212": "Inference Parameter Configuration", + "/test_tipc/test_train_inference_python.sh:214-243": "Inference Model Testing with PaddleVideo", + "/test_tipc/test_train_inference_python.sh:244-274": "Multi-GPU Inference Loop" +} \ No newline at end of file diff --git a/docs/data/titles/14.json b/docs/data/titles/14.json new file mode 100644 index 000000000..659fe4bd7 --- /dev/null +++ b/docs/data/titles/14.json @@ -0,0 +1,139 @@ +{ + "/test_tipc/test_train_inference_python.sh:275-303": "GPU Environment Variable Setup", + "/test_tipc/test_train_inference_python.sh:304-325": "Conditional Assignment of Train and Export Tasks", + "/test_tipc/test_train_inference_python.sh:31-56": "Parsing Key-Value Configurations", + "/test_tipc/test_train_inference_python.sh:326-347": "Setting Model Training Parameters", + "/test_tipc/test_train_inference_python.sh:348-367": "Distributed PaddleVideo Training and Inference", + "/test_tipc/test_train_inference_python.sh:368-378": "Multi-GPU/Machine Training with PaddlePaddle", + "/test_tipc/test_train_inference_python.sh:378-395": "Train PaddleVideo Model with Parameters", + "/test_tipc/test_train_inference_python.sh:396-410": "Evaluate Model Parameters and Commands", + "/test_tipc/test_train_inference_python.sh:410-426": "Setting up Variables for Inference", + "/test_tipc/test_train_inference_python.sh:426-433": "Set CUDA Devices for Inference and Training", + "/test_tipc/test_train_inference_python.sh:57-79": "Configuration Parser and Variable Assigner", + "/test_tipc/test_train_inference_python.sh:80-104": "Config File Parsing for Inference Parameters", + "/test_tipc/test_train_inference_python_npu.sh": "NPU Script Updates and Config Changes", + "/test_tipc/test_train_inference_python_npu.sh:1-39": "Switching to NPU Execution Script", + "/test_tipc/test_train_inference_python_npu.sh:40-42": "Bash Script Execution", + "/test_tipc/test_train_inference_python_xpu.sh": "Update XPU Execution Script", + "/test_tipc/test_train_inference_python_xpu.sh:1-39": "PaddleVideo XPU Configuration Update", + "/test_tipc/test_train_inference_python_xpu.sh:40-42": "Bash Command Execution and Logging", + "/tools/__init__.py": "Tools Package Initialization", + "/tools/ava_predict.py": "AVA Model Inference and Action Detection", + "/tools/ava_predict.py:1-32": "AVA Action Unit Detection Python Script", + "/tools/ava_predict.py:126-160": "Video Frame Extractor", + "/tools/ava_predict.py:161-191": "PaddleVideo Inference with AVA Predict", + "/tools/ava_predict.py:192-222": "AVA Predict Function Arguments and Result Packaging", + "/tools/ava_predict.py:223-264": "Label Prediction Function", + "/tools/ava_predict.py:267-294": "Human Detection via Frame Paths", + "/tools/ava_predict.py:296-334": "Reads Detection Results File for 
Bounding Box Proposals", + "/tools/ava_predict.py:33-68": "AVA Annotation Utilities", + "/tools/ava_predict.py:337-365": "Extract Frames and Set Up Pipelines", + "/tools/ava_predict.py:366-395": "AVA Prediction Code Snippet", + "/tools/ava_predict.py:396-421": "SpatioTemporal Action Detection Code", + "/tools/ava_predict.py:422-455": "Tensorize and Predict", + "/tools/ava_predict.py:456-481": "Action Score Thresholding in AVA Predict", + "/tools/ava_predict.py:482-509": "Video Frame Processing and Visualization Tool", + "/tools/ava_predict.py:69-98": "Visualize Frames with Predicted Annotations", + "/tools/ava_predict.py:99-125": "Image Box Annotation Visualizer", + "/tools/export_model.py": "PaddleVideo Model Exporter", + "/tools/export_model.py:1-32": "PaddleVideo Model Export Tool", + "/tools/export_model.py:118-143": "Model Input Specification in PaddleVideo's Export Function", + "/tools/export_model.py:144-172": "Input Specifications for PaddleVideo Models", + "/tools/export_model.py:173-204": "Input Specifications for Various Model Names", + "/tools/export_model.py:205-236": "Model Input Specification Generator", + "/tools/export_model.py:237-267": "Export Model: Step-by-Step", + "/tools/export_model.py:33-57": "Export Model Script", + "/tools/export_model.py:58-87": "Model Export and Config Trimming in PaddleVideo", + "/tools/export_model.py:88-117": "Model-Specific Input Shape Definition", + "/tools/predict.py": "Paddle Video Tool: Command-Line Inference", + "/tools/predict.py:1-32": "Import-Heavy Function Definition", + "/tools/predict.py:108-134": "TensorRT Engine Setup for ST-GCN", + "/tools/predict.py:136-173": "Building Paddle Predictor in Python", + "/tools/predict.py:174-201": "Model Inference Processing", + "/tools/predict.py:202-227": "Video Prediction Pipeline", + "/tools/predict.py:228-251": "Directory Creation and Inference Processing", + "/tools/predict.py:252-275": "Installing auto_log and Configuring AutoLogger", + "/tools/predict.py:276-306": "Batch Inference Tool", + "/tools/predict.py:308-327": "Benchmarking Inference and Post-Processing Time", + "/tools/predict.py:33-59": "Command-Line Arguments for Paddle Video", + "/tools/predict.py:60-84": "Configuring Paddle Video Predictor Arguments", + "/tools/predict.py:85-107": "Optimizing PaddleVideo for Inference", + "/tools/summary.py": "Model Summary and FLOPs Calculation", + "/tools/summary.py:1-34": "Parsing Command Line Arguments in PaddleVideo", + "/tools/summary.py:35-69": "Argument Parsing for Config File and Model Building", + "/tools/summary.py:70-82": "Model Summary and FLOPs Calculator", + "/tools/utils.py": "PaddleVideo-based Action Recognition & Human Detection", + "/tools/utils.py:1-34": "Import, Error Handling and License Info", + "/tools/utils.py:1023-1051": "FFmpeg Import and Frame Batching", + "/tools/utils.py:1052-1074": "Video Frame Iterator: Converting Predictions to Scenes", + "/tools/utils.py:1075-1103": "Video Scene List Processing Algorithm", + "/tools/utils.py:1105-1133": "Frame Visualization Tool", + "/tools/utils.py:1134-1149": "Single and All Frame Predictions", + "/tools/utils.py:1150-1169": "Shot Boundary Scene Converter", + "/tools/utils.py:1171-1201": "ADDS Inference Helper Initialization", + "/tools/utils.py:1202-1237": "Image Preprocessing Class and Method", + "/tools/utils.py:123-147": "Softmax Postprocessing Function", + "/tools/utils.py:1238-1264": "Post-Process Outputs and Save Depth Maps", + "/tools/utils.py:1266-1291": "Image Conversion Function and Class", + 
"/tools/utils.py:1292-1321": "Init and Extract Frames for Video Analysis", + "/tools/utils.py:1322-1344": "Preprocessing Frames and Labels", + "/tools/utils.py:1345-1369": "Object Detection Frame Processing", + "/tools/utils.py:1370-1400": "Data Pipeline: Append Proposals and Scores", + "/tools/utils.py:1401-1433": "Human Detection Class with Pre/Post-Processing", + "/tools/utils.py:1434-1464": "Iterating and Appending Predictions", + "/tools/utils.py:1466-1490": "Frame Sequence Visualizer", + "/tools/utils.py:148-176": "Video Classifier Helper", + "/tools/utils.py:1492-1523": "Pose Estimation Class for Image Processing", + "/tools/utils.py:1524-1548": "PaddleVideo Image Processing", + "/tools/utils.py:1549-1574": "YOWO Image Classification/Detection Initialization", + "/tools/utils.py:1575-1606": "Video Input Preprocessing: OpenCV Frame Reading and Resizing", + "/tools/utils.py:1608-1638": "Normalize and Reshape Images for Classification", + "/tools/utils.py:1639-1660": "Object Detection and Recognition Algorithm", + "/tools/utils.py:1661-1670": "Video Object Detection System Algorithm", + "/tools/utils.py:177-211": "Image Processing Class for PaddleVideo", + "/tools/utils.py:212-245": "Video Preprocessing Class", + "/tools/utils.py:246-278": "BMN Inference Helper Class and Postprocessing", + "/tools/utils.py:279-302": "Calculates Snippet Xmin and Xmax Values", + "/tools/utils.py:303-328": "Non-Max Suppression for Bounding Boxes", + "/tools/utils.py:329-362": "TokenShift Inference Helper Class", + "/tools/utils.py:35-58": "Importing and Building PaddleVideo Models", + "/tools/utils.py:363-395": "Preprocessing for TimeSformer Inference", + "/tools/utils.py:396-427": "Video Processing Pipeline", + "/tools/utils.py:428-458": "Video Preprocessing Class", + "/tools/utils.py:459-489": "Image Preprocessing and Postprocessing Tool", + "/tools/utils.py:490-516": "Extract Top K Classes from Tensor", + "/tools/utils.py:517-542": "Video Frame Processing Function", + "/tools/utils.py:543-573": "Text Overlay on Video Frames", + "/tools/utils.py:574-595": "Video Frame Processing and GIF Generation", + "/tools/utils.py:596-620": "Process and Save GIF with postprocess Function", + "/tools/utils.py:60-86": "Building Inference Helper with Registry", + "/tools/utils.py:621-651": "SlowFast Video Inference Helper", + "/tools/utils.py:652-682": "Video Frame Preprocessing and Postprocessing Function", + "/tools/utils.py:683-706": "Top Classes and Scores from STGCN Inference", + "/tools/utils.py:707-740": "CTRGCN Inference Helper Class", + "/tools/utils.py:741-775": "Preprocessing Data Class", + "/tools/utils.py:776-807": "Preprocessing and MSTCN Inference Helper Classes", + "/tools/utils.py:809-840": "Video Feature File Handling Class", + "/tools/utils.py:841-867": "Video Feature Processing and Text File Generation", + "/tools/utils.py:868-898": "Initializing ASRF Inference Helper", + "/tools/utils.py:87-121": "Abstract Class for Batch Preprocessing", + "/tools/utils.py:899-932": "Feature Loading and Processing Class", + "/tools/utils.py:933-959": "Action-Labeled Video Processor", + "/tools/utils.py:960-993": "Attention LSTM Inference Preprocessor", + "/tools/utils.py:994-1022": "Video Inference with TransNetV2 Model", + "/tools/wheel.py": "Video Classification Wheel Tool", + "/tools/wheel.py:1-24": "Apache License Notice", + "/tools/wheel.py:116-145": "Parse and Download with Progress", + "/tools/wheel.py:146-168": "Download and Save Inference Model", + "/tools/wheel.py:169-197": "Initializing Paddle Predictor 
with Flags and Configs", + "/tools/wheel.py:198-232": "PaddleVideo Predictor Setup", + "/tools/wheel.py:233-253": "Model Download and Configuration", + "/tools/wheel.py:25-64": "PaddleVideo Model Environment Setup", + "/tools/wheel.py:254-277": "Video Label Prediction Code", + "/tools/wheel.py:279-301": "URL Video Processing", + "/tools/wheel.py:303-327": "Batch Inference Looping", + "/tools/wheel.py:328-353": "Iterating and Labeling Results in PaddleVideo", + "/tools/wheel.py:354-354": "Entry Point for Script Execution", + "/tools/wheel.py:65-93": "Command Line Parser Function", + "/tools/wheel.py:94-115": "Initializing Argument Parser with Default Values" +} \ No newline at end of file diff --git a/docs/data/titles/2.json b/docs/data/titles/2.json new file mode 100644 index 000000000..896d42bac --- /dev/null +++ b/docs/data/titles/2.json @@ -0,0 +1,302 @@ +{ + "/applications/Ma-Net/README.md:1-35": "Ma-Net: PaddleVideo's CVPR2020 Implementation", + "/applications/Ma-Net/README.md:36-47": "Run Local Environment Script", + "/applications/Ma-Net/README_cn.md": "Ma-Net视频分割实现README(中文)", + "/applications/Ma-Net/config.py": "Ma-Net Training Setup", + "/applications/Ma-Net/config.py:1-32": "Configuring Ma-Net Parameters", + "/applications/Ma-Net/config.py:33-53": "Ma-Net App Config: CLI Arguments", + "/applications/Ma-Net/config.py:54-70": "Ma-Net Model Configuration", + "/applications/Ma-Net/config.py:71-88": "Ma-Net Configuration Arguments", + "/applications/Ma-Net/config.py:90-96": "Default Initialization and Epoch Calculation", + "/applications/Ma-Net/dataloaders/DAVIS2017.md": "DAVIS2017 Dataset Download and Setup", + "/applications/Ma-Net/dataloaders/DAVIS2017_cn.md": "DAVIS2017 Dataset for Ma-Net", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py": "Data Augmentation for Video Object Detection", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:1-35": "Uniform Image Rescaling Class", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:125-154": "ScaleNRotate Class for Image Transformations", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:155-189": "Random Scaling, Rotation, and Warping Transform", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:191-229": "Data Augmentation Techniques in Ma-Net", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:230-261": "Normalizing and Initializing Custom Scribble Interaction", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:262-288": "Scribble Segmentation with Bresenham", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:289-310": "Generating GT Masks from Scribbles", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:312-330": "Dilated Mask Annotation Rounds Computation", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:331-366": "Ma-Net Data Loader: Video OD Transform", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:36-69": "Custom Image Resizer Transform", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:369-405": "Edge Mask Generation in Ma-Net Dataloader", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:406-416": "Edge Mask Creation with Parsing Mask", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:70-98": "Random Crop with Sufficient Scribble Elements", + "/applications/Ma-Net/dataloaders/custom_transforms_f.py:99-124": "Adaptive Image Crop and Resize", + "/applications/Ma-Net/dataloaders/davis_2017_f.py": "DAVIS 2017 Dataset Preprocessing for Ma-Net", + 
"/applications/Ma-Net/dataloaders/davis_2017_f.py:1-40": "DAVIS 2017 Test Data Manager", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:110-135": "File Sequence Extension and Preprocessing", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:136-161": "DAVIS 2017 Dataset Loader Code", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:162-186": "Load Images and Labels from Path", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:187-219": "DAVIS 2017 Video Object Detection Data Loader", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:220-244": "DAVIS2017 Mask Reader and Dictionary Creation", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:246-273": "DAVIS 2017 Data Loader Initiation", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:274-299": "DAVIS 2017 Dataset Loader", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:300-320": "Davis Frame Processing: Loader", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:322-344": "Random Scribble Label Assigner", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:345-374": "Image Dataloader and Transform", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:375-400": "Data Loading Function for Sequence Lists", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:401-431": "DAVIS 2017 Dataset Class Definition", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:41-73": "DAVIS2017 Dataset Initialization and Loading", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:432-456": "DAVIS Dataset Custom Dataloader", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:457-485": "Custom Dataloader for Adjacent Frames", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:486-506": "Loading Data for Video Sequences", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:507-531": "Preparing DAVIS Dataset for Model", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:532-562": "Ma-Net Dataset Creator", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:563-585": "Update Frame and Scribble Masks in Dataset", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:586-610": "Random JSON Label Dataset Initialization", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:611-633": "JSON Parsing and Image Loading", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:634-662": "Validate Sequence Existence and Preprocess Data", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:664-672": "JSON Dataset Preprocessing", + "/applications/Ma-Net/dataloaders/davis_2017_f.py:74-109": "DAVIS 2017 Semantic Segmentation Dataset Loader", + "/applications/Ma-Net/dataloaders/helpers.py": "Functions for Tensor to Image Conversion and Model Naming", + "/applications/Ma-Net/dataloaders/helpers.py:1-46": "Image Processing Helpers", + "/applications/Ma-Net/dataloaders/helpers.py:47-78": "Model Name Construction and Image Computation Functions", + "/applications/Ma-Net/dataloaders/helpers.py:79-81": "Enhancing Background with Dilation", + "/applications/Ma-Net/dataloaders/samplers.py": "Random Identity Sampler", + "/applications/Ma-Net/dataloaders/samplers.py:1-31": "RandomIdentitySampler Class", + "/applications/Ma-Net/dataloaders/samplers.py:32-42": "Random Identity Sampler", + "/applications/Ma-Net/networks/IntVOS.py": "Ma-Net: IntVOS Video Segmentation", + "/applications/Ma-Net/networks/IntVOS.py:1-42": "Pairwise Distance Calculation in PaddlePaddle Video OD", + "/applications/Ma-Net/networks/IntVOS.py:119-141": "KNN Search with Chunking", + "/applications/Ma-Net/networks/IntVOS.py:142-169": "Nearest Neighbor Feature Computation", + 
"/applications/Ma-Net/networks/IntVOS.py:170-186": "Nearest Neighbor Distance Calculation", + "/applications/Ma-Net/networks/IntVOS.py:187-211": "Nearest Neighbor Features Calculation", + "/applications/Ma-Net/networks/IntVOS.py:212-235": "Local Neighbor Feature Extraction", + "/applications/Ma-Net/networks/IntVOS.py:236-261": "Boundary-Cross Correlation Sigmoid Transpose", + "/applications/Ma-Net/networks/IntVOS.py:262-287": "Pairwise L2 Distances Calculator", + "/applications/Ma-Net/networks/IntVOS.py:288-312": "Local Downsampling in IntVOS Network", + "/applications/Ma-Net/networks/IntVOS.py:313-336": "Sliding Window Distance Calculator", + "/applications/Ma-Net/networks/IntVOS.py:337-365": "Spatial Cross-Correlation Sampler", + "/applications/Ma-Net/networks/IntVOS.py:366-392": "IntVOS Nearest Neighbor Compute Function", + "/applications/Ma-Net/networks/IntVOS.py:393-421": "Nearest Neighbor Feature Calculation", + "/applications/Ma-Net/networks/IntVOS.py:422-454": "Offset Masks and Distance Tensor Calculation", + "/applications/Ma-Net/networks/IntVOS.py:43-65": "Pairwise Distance Calculator", + "/applications/Ma-Net/networks/IntVOS.py:455-486": "Residual Block and Segmentation Head for Ma-Net", + "/applications/Ma-Net/networks/IntVOS.py:487-513": "IntSegHead: Segmentation Neural Network", + "/applications/Ma-Net/networks/IntVOS.py:516-537": "Separable Conv Layer with BatchNorm", + "/applications/Ma-Net/networks/IntVOS.py:538-571": "Dynamic Segmentation Heads", + "/applications/Ma-Net/networks/IntVOS.py:572-589": "Initializing Network Architecture Components", + "/applications/Ma-Net/networks/IntVOS.py:590-616": "Dynamic Segmentation Network Forward Pass", + "/applications/Ma-Net/networks/IntVOS.py:617-640": "Splitting Input, Calling prop_seghead", + "/applications/Ma-Net/networks/IntVOS.py:641-664": "IntVOS Feature Extraction", + "/applications/Ma-Net/networks/IntVOS.py:66-88": "K-Nearest Neighbor Search with Padding Distance", + "/applications/Ma-Net/networks/IntVOS.py:665-685": "Interpolated Feature Embedding Extraction", + "/applications/Ma-Net/networks/IntVOS.py:686-704": "Extracting Nearest Neighbors per Object", + "/applications/Ma-Net/networks/IntVOS.py:705-725": "Check and Update Global Map Embedding", + "/applications/Ma-Net/networks/IntVOS.py:726-745": "Nearest Neighbor Feature Extraction for Video Sequences", + "/applications/Ma-Net/networks/IntVOS.py:746-764": "Sequence Map Initialization and Updating", + "/applications/Ma-Net/networks/IntVOS.py:765-781": "Updating Previous Frame Features", + "/applications/Ma-Net/networks/IntVOS.py:782-803": "Frame Feature Handling and Concatenation", + "/applications/Ma-Net/networks/IntVOS.py:804-829": "int_seghead Function Overview", + "/applications/Ma-Net/networks/IntVOS.py:830-853": "Interpolating Ma-Net Scribble Labels", + "/applications/Ma-Net/networks/IntVOS.py:854-877": "Updating Global and Local Maps: IntVOS.py:854-877", + "/applications/Ma-Net/networks/IntVOS.py:878-897": "Updating Distance and Temporary Dictionaries", + "/applications/Ma-Net/networks/IntVOS.py:89-118": "Nearest Neighbor Feature Calculation", + "/applications/Ma-Net/networks/IntVOS.py:898-921": "Tensor Operations for Segmentation Model", + "/applications/Ma-Net/networks/IntVOS.py:922-927": "Transposing Tensor and Storing in Dictionary", + "/applications/Ma-Net/networks/aspp.py": "ASPP Module: ASPP Pyramid Pooling in Ma-Net", + "/applications/Ma-Net/networks/aspp.py:1-34": "ASPP Module: Hierarchical Atrous Spatial Pooling", + 
"/applications/Ma-Net/networks/aspp.py:35-66": "ASPP Class: Building ASPP Network Modules", + "/applications/Ma-Net/networks/aspp.py:67-89": "ASPP Modules and Global Average Pooling Layer", + "/applications/Ma-Net/networks/aspp.py:90-123": "ASPP Module in Ma-Net's CNN", + "/applications/Ma-Net/networks/backbone/__init__.py": "Build Backbone Networks", + "/applications/Ma-Net/networks/backbone/drn.py": "Deep Residual Networks in PaddlePaddle", + "/applications/Ma-Net/networks/backbone/drn.py:1-29": "BasicBlock Class in DRN Network", + "/applications/Ma-Net/networks/backbone/drn.py:104-130": "DRN Network: Convolutional and Pooling Architecture", + "/applications/Ma-Net/networks/backbone/drn.py:131-147": "DRN Network Architecture", + "/applications/Ma-Net/networks/backbone/drn.py:148-170": "Defining MA-Net Backbone Layers", + "/applications/Ma-Net/networks/backbone/drn.py:171-193": "DRN Network Layer Construction", + "/applications/Ma-Net/networks/backbone/drn.py:194-234": "Deep Residual Network Backbone Architecture", + "/applications/Ma-Net/networks/backbone/drn.py:236-257": "DRN_A Class in Ma-Net Backbone", + "/applications/Ma-Net/networks/backbone/drn.py:258-279": "Creating Layers with _make_layer", + "/applications/Ma-Net/networks/backbone/drn.py:281-318": "DRN Model Functions in Ma-Net Backbone", + "/applications/Ma-Net/networks/backbone/drn.py:30-65": "Residual Bottleneck Block", + "/applications/Ma-Net/networks/backbone/drn.py:319-349": "Initializing DRN Models with Pre-Trained Weights", + "/applications/Ma-Net/networks/backbone/drn.py:350-380": "DRN Model Functions with Configurations", + "/applications/Ma-Net/networks/backbone/drn.py:381-400": "DRN Model Definition and Pretrained Weights Loading", + "/applications/Ma-Net/networks/backbone/drn.py:66-103": "Deep Residual Network Model", + "/applications/Ma-Net/networks/backbone/mobilenet.py": "Ma-Net: MobileNetV2 Backbone Initialization", + "/applications/Ma-Net/networks/backbone/mobilenet.py:1-33": "MobileNet Network Layer Definition", + "/applications/Ma-Net/networks/backbone/mobilenet.py:100-127": "MobileNet Backbone for Ma-Net Application", + "/applications/Ma-Net/networks/backbone/mobilenet.py:128-157": "Preparing MobileNet Backbone for Feature Extraction", + "/applications/Ma-Net/networks/backbone/mobilenet.py:158-163": "Kaiming Normal Init and Batch Norm for Mobilenet", + "/applications/Ma-Net/networks/backbone/mobilenet.py:34-63": "MobileNet Layer Creation: Convolutional Neural Network", + "/applications/Ma-Net/networks/backbone/mobilenet.py:64-99": "MobileNetV2 Model Definition", + "/applications/Ma-Net/networks/backbone/resnet.py": "ResNet Architecture with Batch Normalization", + "/applications/Ma-Net/networks/backbone/resnet.py:1-33": "Bottleneck ResNet Backbone Definition", + "/applications/Ma-Net/networks/backbone/resnet.py:104-126": "ResNet Network with Batch Normalization", + "/applications/Ma-Net/networks/backbone/resnet.py:127-157": "ResNet Block Builder Function", + "/applications/Ma-Net/networks/backbone/resnet.py:158-186": "ResNet Residual Block Builder", + "/applications/Ma-Net/networks/backbone/resnet.py:187-220": "ResNet Network Definition", + "/applications/Ma-Net/networks/backbone/resnet.py:221-239": "ResNet-101 Model Function", + "/applications/Ma-Net/networks/backbone/resnet.py:34-77": "ResNet Architecture Design: BatchNorm, ReLU, Downsample", + "/applications/Ma-Net/networks/backbone/resnet.py:78-103": "Initializing ResNet Backbone: Conv, BN, Pool and Residual Blocks", + 
"/applications/Ma-Net/networks/backbone/xception.py": "AlignedXception Backbone for Image Classification", + "/applications/Ma-Net/networks/backbone/xception.py:1-34": "Separable Conv Layer with BatchNorm", + "/applications/Ma-Net/networks/backbone/xception.py:103-144": "AlignedXception Network Code", + "/applications/Ma-Net/networks/backbone/xception.py:145-175": "AlignedXception Initialization Code", + "/applications/Ma-Net/networks/backbone/xception.py:176-201": "Xception Backbone: Block Architecture", + "/applications/Ma-Net/networks/backbone/xception.py:202-225": "Xception Block Creation and Implementation", + "/applications/Ma-Net/networks/backbone/xception.py:226-249": "Repeated Convolutions and Batch Normalization in Xception", + "/applications/Ma-Net/networks/backbone/xception.py:250-273": "Xception Convolutions and Block Initialization", + "/applications/Ma-Net/networks/backbone/xception.py:274-297": "Xception Blocks in Ma-Net's Image Classification", + "/applications/Ma-Net/networks/backbone/xception.py:298-323": "Xception Block Configurations", + "/applications/Ma-Net/networks/backbone/xception.py:324-348": "Xception: Separable Conv Layers", + "/applications/Ma-Net/networks/backbone/xception.py:349-390": "Xception Network Architecture", + "/applications/Ma-Net/networks/backbone/xception.py:35-67": "Xception Block Layer Initialization and Forward", + "/applications/Ma-Net/networks/backbone/xception.py:391-427": "Xception Model: Neural Network for Image Classification", + "/applications/Ma-Net/networks/backbone/xception.py:429-447": "Updating Pre-trained Xception Model Weights", + "/applications/Ma-Net/networks/backbone/xception.py:448-455": "Renaming Conv and BN Parameters", + "/applications/Ma-Net/networks/backbone/xception.py:68-102": "Xception Backbone Network Creation", + "/applications/Ma-Net/networks/decoder.py": "Decoder Network Construction", + "/applications/Ma-Net/networks/decoder.py:1-32": "Decoder Layer for Feature Classification", + "/applications/Ma-Net/networks/decoder.py:33-62": "Decoder Network Architecture", + "/applications/Ma-Net/networks/decoder.py:65-66": "Build Decoder Network Function", + "/applications/Ma-Net/networks/deeplab.py": "Freezing Batch Norm Layers in DeepLab", + "/applications/Ma-Net/networks/deeplab.py:1-31": "Frozen Batch Normalization for DeepLab", + "/applications/Ma-Net/networks/deeplab.py:32-64": "DeepLab Class Definition", + "/applications/Ma-Net/networks/deeplab.py:65-81": "Get ConvBN Layers' Parameters", + "/applications/Ma-Net/networks/loss.py": "Custom Loss Function for Image Classification", + "/applications/Ma-Net/networks/loss.py:1-28": "Custom BCE Loss Function", + "/applications/Ma-Net/networks/loss.py:110-130": "Hard Example Mining Loss Function", + "/applications/Ma-Net/networks/loss.py:131-148": "Weighted Hard Example Mining Loss", + "/applications/Ma-Net/networks/loss.py:149-153": "Top-k Mean Loss Calculation", + "/applications/Ma-Net/networks/loss.py:29-44": "Hard Example Mining Loss", + "/applications/Ma-Net/networks/loss.py:45-67": "Custom Loss Function with Hard Example Mining", + "/applications/Ma-Net/networks/loss.py:68-87": "Top K Percent Pixel Loss", + "/applications/Ma-Net/networks/loss.py:88-109": "Hard Example Mining and Top-k Pixel Selection Loss", + "/applications/Ma-Net/run.sh": "DeeplabV3_coco DAVIS Dataset Training and Testing", + "/applications/Ma-Net/run.sh:1-13": "Train DeeplabV3 on DAVIS Dataset", + "/applications/Ma-Net/run.sh:13-15": "Testing Video Object Segmentation", + 
"/applications/Ma-Net/test.py": "DAVIS2017 Video Object Detection with PaddlePaddle", + "/applications/Ma-Net/test.py:1-39": "Data Preprocessing for DAVIS2017", + "/applications/Ma-Net/test.py:115-139": "Scribble Sequence Retrieval and Memory Initialization", + "/applications/Ma-Net/test.py:140-163": "Interaction Detection Code: File Writing and Embedding", + "/applications/Ma-Net/test.py:164-182": "Extracting and Concatenating Embeddings", + "/applications/Ma-Net/test.py:183-203": "Scribble Labeling in Ma-Net", + "/applications/Ma-Net/test.py:204-224": "Save Scribble Image with Palette", + "/applications/Ma-Net/test.py:226-244": "Segmentation Model Initialization", + "/applications/Ma-Net/test.py:245-262": "Ma-Net Labeling: Predict, Resize, Max", + "/applications/Ma-Net/test.py:263-279": "Save Interactive Video Frame as Labeled Image", + "/applications/Ma-Net/test.py:280-298": "Video Object Segmentation Algorithm with Pre-trained Model", + "/applications/Ma-Net/test.py:299-318": "Function Call with Multiple Args and Interpolation", + "/applications/Ma-Net/test.py:320-338": "Image Saving for Prediction Labels", + "/applications/Ma-Net/test.py:339-356": "Folder and Image Saving Reset", + "/applications/Ma-Net/test.py:357-374": "Video Object Detection with PaddlePaddle's Prop Seghead", + "/applications/Ma-Net/test.py:375-394": "Dynamic Object Detection and Classification", + "/applications/Ma-Net/test.py:395-412": "Save Image in Directory Structure", + "/applications/Ma-Net/test.py:40-62": "Video Analysis Configuration Loading", + "/applications/Ma-Net/test.py:413-436": "Interactive Image Classification System", + "/applications/Ma-Net/test.py:437-468": "Filtering Scribble Labels in Ma-Net", + "/applications/Ma-Net/test.py:469-485": "75 Colors Palette Definition", + "/applications/Ma-Net/test.py:486-498": "List of Sequential Numbers", + "/applications/Ma-Net/test.py:499-511": "Incrementing Loop", + "/applications/Ma-Net/test.py:512-525": "Enigmatic Numerical Sequence", + "/applications/Ma-Net/test.py:63-87": "Preparing Image Dictionary for Model Training", + "/applications/Ma-Net/test.py:88-113": "Interactive Session Initialization", + "/applications/Ma-Net/train_stage1.py": "Ma-Net Video Detection Training", + "/applications/Ma-Net/train_stage1.py:1-34": "Train Stage 1: Ma-Net Setup", + "/applications/Ma-Net/train_stage1.py:116-144": "Model Resumption and Training", + "/applications/Ma-Net/train_stage1.py:145-172": "Preparing Input Data for Model Training", + "/applications/Ma-Net/train_stage1.py:173-194": "Initialize Label and Object Dictionaries", + "/applications/Ma-Net/train_stage1.py:195-217": "Video Object Detection Model Training: Stages and Loss Functions", + "/applications/Ma-Net/train_stage1.py:218-240": "Image Comparison and Normalization", + "/applications/Ma-Net/train_stage1.py:241-266": "Sigmoid Binary Cross-Entropy Masks", + "/applications/Ma-Net/train_stage1.py:267-286": "Loading and Preparing Test Datasets", + "/applications/Ma-Net/train_stage1.py:287-306": "Paddle Data Loader for Test Samples", + "/applications/Ma-Net/train_stage1.py:307-326": "Feature Extraction and Model Prediction", + "/applications/Ma-Net/train_stage1.py:327-348": "Frame-by-frame Prediction Saving Function", + "/applications/Ma-Net/train_stage1.py:349-378": "Training Ma-Net with Adaptive Learning Rate", + "/applications/Ma-Net/train_stage1.py:35-59": "Training Ma-Net in Stage 1", + "/applications/Ma-Net/train_stage1.py:379-391": "RGB Object Values List", + 
"/applications/Ma-Net/train_stage1.py:392-404": "Sequence Numbers in Ma-Net's train_stage1.py", + "/applications/Ma-Net/train_stage1.py:405-417": "Image Sequence Codes", + "/applications/Ma-Net/train_stage1.py:418-429": "Training Manager's Code and Function Call", + "/applications/Ma-Net/train_stage1.py:61-87": "Training Stage: Ma-Net Model Initiation", + "/applications/Ma-Net/train_stage1.py:88-114": "Dataset Preparation and Training Setup", + "/applications/Ma-Net/train_stage2.py": "Training Ma-Net Stage 2 with Learning Rates", + "/applications/Ma-Net/train_stage2.py:1-34": "Initialize Environment for Training", + "/applications/Ma-Net/train_stage2.py:120-145": "Ma-Net: Training Stage 2", + "/applications/Ma-Net/train_stage2.py:146-170": "Model Resuming and Training Loop", + "/applications/Ma-Net/train_stage2.py:171-191": "Dataset Initialization and Training Loop", + "/applications/Ma-Net/train_stage2.py:192-212": "Training Stage 2: Setting Up Model and Feature Extraction", + "/applications/Ma-Net/train_stage2.py:213-229": "Image Classification Code Snippet Initialization", + "/applications/Ma-Net/train_stage2.py:230-247": "Initialize and Process Sequences", + "/applications/Ma-Net/train_stage2.py:248-265": "Label and Object Dictionary Handling", + "/applications/Ma-Net/train_stage2.py:266-287": "Training Stage 2: Updates and Visualizations", + "/applications/Ma-Net/train_stage2.py:288-306": "Label and Prediction Visualization", + "/applications/Ma-Net/train_stage2.py:307-324": "Segmenting Image with Binary Cross-Entropy", + "/applications/Ma-Net/train_stage2.py:325-350": "Save Network at Intervals During Training", + "/applications/Ma-Net/train_stage2.py:35-61": "DataLoader Initialization and Configuration", + "/applications/Ma-Net/train_stage2.py:351-367": "Training Stage 2: Data Loader Setup", + "/applications/Ma-Net/train_stage2.py:368-386": "Scribble Labeling and Image Processing in Stage 2", + "/applications/Ma-Net/train_stage2.py:387-406": "Model Training: Concatenating Labels and GPU Check", + "/applications/Ma-Net/train_stage2.py:407-423": "Interpolated Image Classification with Interactor", + "/applications/Ma-Net/train_stage2.py:424-439": "Resizing and Updating Image Labels", + "/applications/Ma-Net/train_stage2.py:441-462": "Round-Based Video Model Training", + "/applications/Ma-Net/train_stage2.py:464-481": "Training Stage 2: Ma-Net Data Preparation", + "/applications/Ma-Net/train_stage2.py:482-500": "Train Dataset Update and Model Training Progress", + "/applications/Ma-Net/train_stage2.py:501-525": "Efficient ROI Operation for Scribble Labels", + "/applications/Ma-Net/train_stage2.py:526-556": "Training Stage 2: Load, Train, Save Network", + "/applications/Ma-Net/train_stage2.py:557-573": "RGB Palette Generation Code", + "/applications/Ma-Net/train_stage2.py:574-586": "List of Numbers (81-150)", + "/applications/Ma-Net/train_stage2.py:587-599": "Code Purpose Unclear", + "/applications/Ma-Net/train_stage2.py:600-612": "Manager Training with Image Dimensions", + "/applications/Ma-Net/train_stage2.py:62-89": "Initialize Manager Object for VOS Training", + "/applications/Ma-Net/train_stage2.py:91-119": "Train Stage 2: Ma-Net Model Init & Optimization", + "/applications/Ma-Net/utils/api.py": "Universal Tensor Utility API", + "/applications/Ma-Net/utils/api.py:1-49": "Utility Functions for PyTorch-Paddle Conversion", + "/applications/Ma-Net/utils/api.py:113-136": "Mode Validator for Image Data Types", + "/applications/Ma-Net/utils/api.py:137-161": "Verify Image Mode and Data 
Type", + "/applications/Ma-Net/utils/api.py:163-198": "Identity Class and Data Conversion Function", + "/applications/Ma-Net/utils/api.py:199-223": "Gradient Norm Clipping Function", + "/applications/Ma-Net/utils/api.py:224-250": "Max Absolute Value Finder", + "/applications/Ma-Net/utils/api.py:251-274": "Ma-Net: Non-finite Parameter Clipping", + "/applications/Ma-Net/utils/api.py:275-307": "Maximum Value and Index Extractor", + "/applications/Ma-Net/utils/api.py:308-338": "Weight Initialization without Gradient Calculation", + "/applications/Ma-Net/utils/api.py:339-364": "Truncated Normal Initialization", + "/applications/Ma-Net/utils/api.py:366-398": "Tensor Transformations and Nonlinearity Gains", + "/applications/Ma-Net/utils/api.py:399-425": "Gain Calculator for Non-linear Functions", + "/applications/Ma-Net/utils/api.py:426-454": "Initializing Tensor Distributions", + "/applications/Ma-Net/utils/api.py:455-483": "Truncated Normal Tensor Initialization", + "/applications/Ma-Net/utils/api.py:484-526": "Initializing Tensor Functions in PyTorch", + "/applications/Ma-Net/utils/api.py:50-83": "Tensor and Image Conversion Utilities", + "/applications/Ma-Net/utils/api.py:528-562": "Preserving Identity in Linear and Conv Layers: Functions", + "/applications/Ma-Net/utils/api.py:563-592": "Convolutional Layer Weights Init with Dirac Delta", + "/applications/Ma-Net/utils/api.py:593-627": "PaddlePaddle Tensor Utilities", + "/applications/Ma-Net/utils/api.py:628-655": "Glorot Initialization in Ma-Net API", + "/applications/Ma-Net/utils/api.py:656-687": "Xavier/Glorot Tensor Initialization", + "/applications/Ma-Net/utils/api.py:688-709": "Uniform Tensor Filler", + "/applications/Ma-Net/utils/api.py:710-732": "Kaiming Uniform Initialization in PyTorch", + "/applications/Ma-Net/utils/api.py:733-758": "Kaiming Weight Initialization", + "/applications/Ma-Net/utils/api.py:759-789": "QR Factorization of Tensors", + "/applications/Ma-Net/utils/api.py:790-822": "QR Decomposition and Scaling", + "/applications/Ma-Net/utils/api.py:824-857": "Kaiming Normal Initializer Function", + "/applications/Ma-Net/utils/api.py:84-112": "Compatibility Check: Adjust and Convert Image Data Types", + "/applications/Ma-Net/utils/mask_damaging.py": "Mask Damager: Rotation and Translation in PaddleVideo", + "/applications/Ma-Net/utils/mask_damaging.py:1-36": "Mask Damager: Random Transformations for Labels", + "/applications/Ma-Net/utils/mask_damaging.py:130-155": "Random Mask Damage Functions", + "/applications/Ma-Net/utils/mask_damaging.py:156-170": "Rotated and Translated Masks", + "/applications/Ma-Net/utils/mask_damaging.py:37-73": "Mask Damaging Functions", + "/applications/Ma-Net/utils/mask_damaging.py:74-98": "Mask Damaging Function", + "/applications/Ma-Net/utils/mask_damaging.py:99-129": "Mask Damaging in PaddleVideo Library", + "/applications/Ma-Net/utils/meters.py": "AverageMeter Class: Compute and Store Average", + "/applications/Ma-Net/utils/utils.py": "Label to RGB Conversion", + "/applications/MultimodalVideoTag/README.md": "Multimodal Video Tagging with PaddlePaddle 2.0", + "/applications/MultimodalVideoTag/README.md:1-37": "Multimodal Video Classification with PaddlePaddle", + "/applications/MultimodalVideoTag/README.md:38-65": "Training, Evaluation, and Inference with Multimodal Video Tagging", + "/applications/MultimodalVideoTag/README.md:67-77": "Multimodal Video Tagging with Attention Clusters", + "/applications/MultimodalVideoTag/download.sh": "Download ErnIE Model and Dataset", + 
"/applications/MultimodalVideoTag/eval_and_save_model.sh": "Env Var Set, Eval & Save Model Script", + "/applications/MultimodalVideoTag/inference.sh": "GPU-Based Inference Script", + "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py": "Multimodal Video Accuracy Metrics Calculator", + "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:1-35": "Accuracy Metrics Calculator", + "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:126-158": "Top-K Accuracy Calculation", + "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:159-160": "Top-K Hits for Multilabel Prediction", + "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:36-68": "Multimodal Video Tag Accuracy Metrics", + "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:69-95": "Video Tagging Metrics Computation", + "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py:96-125": "Multilabel Top-K Accuracy" +} \ No newline at end of file diff --git a/docs/data/titles/3.json b/docs/data/titles/3.json new file mode 100644 index 000000000..6eafd0116 --- /dev/null +++ b/docs/data/titles/3.json @@ -0,0 +1,302 @@ +{ + "/applications/MultimodalVideoTag/scenario_lib/config.py": "Config Parser and Merger for Multimodal Video Tag", + "/applications/MultimodalVideoTag/scenario_lib/config.py:1-52": "Config Parser and Merger Function", + "/applications/MultimodalVideoTag/scenario_lib/config.py:53-71": "Config Updater and Printer", + "/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py": "Multimodal Video Tag Datareader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py": "ERNIE Reader for Multimodal Video Tagging", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:1-35": "Ernie Reader: MultimodalVideoTag's Python Component", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:103-131": "Record Creation from Text Tokenization", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:132-151": "Ensuring Correct BERT/ERNIE Sequences", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:152-179": "ERNIE Input Preparation", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:180-207": "ERNIE Batch Record Generation", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:208-235": "Padding Ernie Batch Reader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:236-257": "ERNIE Task Data Processing", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:258-289": "ERNIE Text Data Generation", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:290-317": "Pad Instances to Max Sequence Length", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:318-334": "Preparing Return List in ERNIE Task Reader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:37-74": "CSV Reader: BaseReader Class", + "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py:75-102": "Initializing and Configuring ERNIE Task Reader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py": "Multimodal Video Feature Reader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:1-39": "FeatureReader: Multimodal Video Feature Data Reader", + 
"/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:114-140": "Multimodal Video Data Reader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:143-173": "Function for Loading Video Files and Labels", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:174-212": "Label Data Manipulation Functions", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:213-251": "Efficient Data Reader for Multimodal Video Analysis", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:252-274": "Load and Return Dictionary of Words and Indices", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:40-67": "YouTube-8M Data Reader: LSTM, Attention Cluster, NextVlad", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:68-95": "Multimodal Data Reader Algorithm", + "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py:96-113": "Multi-Modal Dataset Feature Reader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py": "Reader Manager: Custom Exceptions and Singleton Design", + "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:1-30": "Custom Exception for Missing Reader", + "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:31-73": "Video Data Reader Utilities", + "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py:74-91": "Reader Manager: Singleton for Registering and Retrieving Readers", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py": "Text Tokenization for Multimodal Video Tagging", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:1-32": "Python Unicode Converter Function", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:134-168": "End-to-End Tokenization with CharTokenizer", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:169-197": "Basic Tokenizer for Text Tokenization", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:198-229": "Chinese Text Tokenization and Processing", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:230-259": "Text Tokenization Functions", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:260-282": "CJK Unicode Checker", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:283-315": "Greedy Wordpiece Tokenizer", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:316-348": "Tokenization and Unknown Word Handling", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:33-61": "Universal Printable Text Encoder", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:349-382": "Tokenizing String Functions", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:383-405": "Detect Punctuation and Chinese Characters", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:406-441": "Chinese Text Tokenizer", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:62-96": "Vocabulary File Handler", + "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py:97-133": "FullTokenizer: Efficient Tokenization Class", + "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py": "Multimodal Video Tagging with PaddlePaddle", + 
"/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:1-37": "Multimodal Video Tag Evaluation Code", + "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:124-145": "Evaluate and Save Multimodal Video Model", + "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:146-159": "Save Inference Model with Parameters", + "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:38-64": "Paddle Video Eval Argument Parser", + "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:65-94": "Evaluate and Save Inference Model", + "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py:95-123": "Save and Evaluate Model", + "/applications/MultimodalVideoTag/scenario_lib/inference.py": "Multimodal Video Tagging Inference", + "/applications/MultimodalVideoTag/scenario_lib/inference.py:1-38": "Paddle Video Inference Script", + "/applications/MultimodalVideoTag/scenario_lib/inference.py:100-122": "Multimodal Inference Function", + "/applications/MultimodalVideoTag/scenario_lib/inference.py:124-161": "Video Label Inference Function", + "/applications/MultimodalVideoTag/scenario_lib/inference.py:162-173": "MultimodalVideoTag Inference Function", + "/applications/MultimodalVideoTag/scenario_lib/inference.py:39-69": "InferModel Class and Load Inference Model Function", + "/applications/MultimodalVideoTag/scenario_lib/inference.py:70-98": "Multimodal Video Tagging Initialization", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py": "Multi-modal Video Tagging with ERNIE", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:1-34": "AttentionLstmErnie: Combining Scenario-Classify and ERNIE", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:109-131": "ERNIE Model Initialization and Freeze", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:132-154": "Attention-based LSTM Model for Video Tagging", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:155-172": "Dynamic LSTM for Image Features with Attention", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:173-194": "Multimodal LSTM with Audio and Visual Inputs", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:195-214": "Attention LSTM for Audio Reversal", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:215-235": "Multimodal Video Tagging with LSTM-Attention and ERNIE", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:236-260": "Attention-based Neural Feature Sequence Calculation", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:261-285": "Dropout and Batch Normalization for LSTM", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:286-312": "Attention LSTM Ernie Model with Dropout", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:313-334": "Loss Calculation with Piecewise Decay Optimizer", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:336-365": "Sigmoid Loss Function in Attention LSTM", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:35-59": "Attention LSTM ERNIE Model Initialization", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:366-400": "Attention LSTM ERNIE Model Functions", + 
"/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:60-85": "AttentionLSTMERNIE Model Config Init", + "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py:86-108": "Ernie Model Data Feeding and Feature Extraction", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py": "ERNIE Multimodal Video Tagging Model", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:1-33": "Ernie Model Configuration", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:107-132": "ERNIE Model Initialization and Building", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:133-158": "Multimodal Video Tagging Embedding Combination", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:159-184": "Embedding Layer Initialization and Encoding", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:185-215": "Encoder Layer Initialization", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:216-243": "TextCNN Model for Sequence Feature Extraction", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:244-250": "1D Convolutional Layer Creation", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:34-73": "Ernie Model Configuration Class", + "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py:76-106": "ERNIE Model Class Definition", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py": "Transformer Encoder for NLP", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:1-32": "Multi-Head Attention Function", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:105-128": "Scaled Dot-Product Attention in Transformer Encoder", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:129-154": "Transformer Encoder Attention Mechanism", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:155-182": "Position-wise Feed-Forward Network in Transformer Encoder", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:183-208": "Transformer Encoder Layer for Multimodal Video Tagging", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:209-236": "Transformer Encoder Layer Implementation", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:237-268": "Transformer Encoder Layer with MH Attention", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:269-308": "Transformer Encoder Model Definition", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:309-338": "Transformer Encoder Function", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:33-57": "Multi-Head Attention Layer Code", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:58-80": "Transformer Encoder Layer Function", + "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py:81-104": "Split and Combine Attention Heads in Transformer Encoder", + "/applications/MultimodalVideoTag/scenario_lib/train.py": "Video Model Training with PaddlePaddle", + "/applications/MultimodalVideoTag/scenario_lib/train.py:1-34": "Train Model Using AttentionLstmErnie", + "/applications/MultimodalVideoTag/scenario_lib/train.py:106-136": "Command-Line Arguments for Model Training", + "/applications/MultimodalVideoTag/scenario_lib/train.py:137-160": "Training Model Setup", + 
"/applications/MultimodalVideoTag/scenario_lib/train.py:161-190": "Model Building and Execution Setup", + "/applications/MultimodalVideoTag/scenario_lib/train.py:191-213": "Data Parallelism with Pre-Trained Weights", + "/applications/MultimodalVideoTag/scenario_lib/train.py:214-231": "Batch Size Setting in Multimodal Video Tagging", + "/applications/MultimodalVideoTag/scenario_lib/train.py:232-263": "Train Model with Custom Arguments", + "/applications/MultimodalVideoTag/scenario_lib/train.py:37-71": "Command-Line Logging for Paddle Video", + "/applications/MultimodalVideoTag/scenario_lib/train.py:72-105": "Training Options in MultimodalVideoTag", + "/applications/MultimodalVideoTag/scenario_lib/utils.py": "Multi-Task Framework for Video Tagging", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:1-39": "Testing with PyReader Function", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:120-141": "Epoch Training Metrics and Testing", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:142-169": "Save and Stop Training Model Function", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:170-201": "Load Pretrained Parameters", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:204-218": "AttrDict: Dictionary as Class Attributes", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:40-67": "PaddleVideo Test Suite", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:68-96": "Train Model with PyReader: Epochs, Testing, and Early Stopping", + "/applications/MultimodalVideoTag/scenario_lib/utils.py:97-119": "ML Training Loop Metrics Tracker", + "/applications/MultimodalVideoTag/train.sh": "Efficient GPU Training of Attention LSTM Ernie", + "/applications/PP-Care/Readme.md": "Pre-Trained PP-Care Model for Video Understanding", + "/applications/PP-Care/Readme.md:1-55": "3DMRI Classification with PaddleVideo", + "/applications/PP-Care/Readme.md:107-110": "Efficient Video Neural Networks: A Comprehensive Guide", + "/applications/PP-Care/Readme.md:55-81": "Initializing PP-Care Model for MRI Data", + "/applications/PP-Care/Readme.md:81-106": "Optimized PP-Care Model Testing with ResNet50", + "/applications/PPHuman/README.md": "PaddleVideo to PP-Human Model Conversion Script", + "/applications/PPHuman/README.md:1-21": "Training Behavior Model with PaddleVideo", + "/applications/PPHuman/README.md:115-143": "Exporting PaddleVideo Model for PP-Human", + "/applications/PPHuman/README.md:22-42": "Data Preparation for PP-Human", + "/applications/PPHuman/README.md:44-60": "Keypoint Detection with Pretrained Models", + "/applications/PPHuman/README.md:62-83": "PPHuman: Human Keypoint Detection in Videos", + "/applications/PPHuman/README.md:84-114": "PPHuman JSON to Training Data Conversion", + "/applications/PPHuman/datasets/prepare_dataset.py": "Preparing Datasets for PaddleVideo and PPHuman", + "/applications/PPHuman/datasets/prepare_dataset.py:1-34": "UR Fall Dataset Conversion for PaddleVideo", + "/applications/PPHuman/datasets/prepare_dataset.py:35-69": "Consistent Dataset Preparation", + "/applications/PPHuman/datasets/prepare_dataset.py:70-98": "Prepare Dataset for PaddleVideo's PPHuman", + "/applications/README.md": "PaddleVideo: Versatile Application Cases", + "/applications/T2VLAD/README.md": "Introducing T2VLAD: Video Retrieval Model in PaddleVideo", + "/applications/T2VLAD/README.md:1-60": "T2VLAD: Text Video Retrieval Model Introduction", + "/applications/T2VLAD/README.md:61-75": "T2VLAD Performance Metrics", + "/applications/T2VLAD/README_en.md": 
"T2VLAD: Text-Video Retrieval with PaddleNLP", + "/applications/T2VLAD/README_en.md:1-31": "Install PaddleNLP Dependency", + "/applications/T2VLAD/README_en.md:32-59": "Train and Test T2VLAD on MSRVTT Dataset", + "/applications/T2VLAD/README_en.md:61-69": "Text-Video Retrieval Model Metrics: R@1, R@5, R@10", + "/applications/T2VLAD/base/__init__.py": "Importing Base Modules", + "/applications/T2VLAD/base/base_dataset.py": "Video Dataset Base Class", + "/applications/T2VLAD/base/base_dataset.py:1-36": "Copyright, Libraries, and Type Guarding in Python", + "/applications/T2VLAD/base/base_dataset.py:102-125": "Dataset Initialization", + "/applications/T2VLAD/base/base_dataset.py:127-152": "Default Video Retrieval Paths", + "/applications/T2VLAD/base/base_dataset.py:154-175": "Experts Configuration Initialization", + "/applications/T2VLAD/base/base_dataset.py:176-197": "Initializing Arrays for Model Evaluation", + "/applications/T2VLAD/base/base_dataset.py:199-217": "Expert Index Initialization", + "/applications/T2VLAD/base/base_dataset.py:218-237": "Video Feature Preparation and Test Captioning", + "/applications/T2VLAD/base/base_dataset.py:238-257": "Token Masking and Encoding in T2VLAD", + "/applications/T2VLAD/base/base_dataset.py:258-280": "Text Feature Creation and Split Configuration", + "/applications/T2VLAD/base/base_dataset.py:281-304": "Loading and Initializing Data for PaddleVideo", + "/applications/T2VLAD/base/base_dataset.py:305-327": "Batch Tensor Initialization", + "/applications/T2VLAD/base/base_dataset.py:329-350": "Data Preparation for Experts", + "/applications/T2VLAD/base/base_dataset.py:351-372": "Minibatch Creation for Video and Text Features", + "/applications/T2VLAD/base/base_dataset.py:37-76": "Base Dataset Class for Video Features", + "/applications/T2VLAD/base/base_dataset.py:373-397": "Video Dataset Class for Text-to-Video Retrieval", + "/applications/T2VLAD/base/base_dataset.py:398-413": "Video Frame Feature Segmentation", + "/applications/T2VLAD/base/base_dataset.py:414-437": "Random Captioning with Tokenization", + "/applications/T2VLAD/base/base_dataset.py:438-463": "Video Dataset Initialization", + "/applications/T2VLAD/base/base_dataset.py:464-492": "Defining Retrieval Data and Meta Dictionary", + "/applications/T2VLAD/base/base_dataset.py:493-516": "Feature Path Generator", + "/applications/T2VLAD/base/base_dataset.py:517-539": "Assertion Function and Summary Stats in T2VLAD Base Dataset", + "/applications/T2VLAD/base/base_dataset.py:540-562": "Partition and Analyze Datasets", + "/applications/T2VLAD/base/base_dataset.py:77-101": "Dataset Class Initialization", + "/applications/T2VLAD/base/base_model.py": "Base Model Abstract Class", + "/applications/T2VLAD/base/base_model.py:1-36": "Abstract Base Model for PaddleVideo", + "/applications/T2VLAD/base/base_model.py:37-37": "Trainable Parameters Counter", + "/applications/T2VLAD/base/base_trainer.py": "T2VLAD Trainer: Multi-Epoch Management and Checkpoints", + "/applications/T2VLAD/base/base_trainer.py:1-33": "Base Trainer Class Setup", + "/applications/T2VLAD/base/base_trainer.py:111-128": "Improved Performance Check", + "/applications/T2VLAD/base/base_trainer.py:129-151": "Early Stopping and Best Model Saving", + "/applications/T2VLAD/base/base_trainer.py:153-170": "Flexible Model Saving Conditions", + "/applications/T2VLAD/base/base_trainer.py:171-186": "Video Prediction Saving and Logging", + "/applications/T2VLAD/base/base_trainer.py:187-210": "Model Checkpoint Management & Purge", + 
"/applications/T2VLAD/base/base_trainer.py:211-238": "Stale Model Pruning", + "/applications/T2VLAD/base/base_trainer.py:239-258": "AutoSave Best Model During Training", + "/applications/T2VLAD/base/base_trainer.py:34-60": "Initializing Base Trainer Object", + "/applications/T2VLAD/base/base_trainer.py:62-89": "Training Trainer Class", + "/applications/T2VLAD/base/base_trainer.py:90-110": "Metrics Logging and Monitoring Enhancements", + "/applications/T2VLAD/data/download_features.sh": "Remote Dataset Download & Extraction", + "/applications/T2VLAD/data_loader/MSRVTT_dataset.py": "MSRVTT Dataset Loader", + "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:1-29": "MSRVTT Dataset Loader", + "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:110-126": "Validating Test Sets and Missing Queries", + "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:30-46": "Data Split Paths for MSRVTT Dataset", + "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:47-71": "MSRVTT Dataset Feature Loading", + "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:72-89": "Feature Aggregation for Expert in MSRVTT Dataset", + "/applications/T2VLAD/data_loader/MSRVTT_dataset.py:90-108": "Checking and Validating Text Features", + "/applications/T2VLAD/data_loader/data_loaders.py": "Efficient Data Loader with LRU Caching", + "/applications/T2VLAD/data_loader/data_loaders.py:1-36": "Paddle Dataset Loader Function", + "/applications/T2VLAD/data_loader/data_loaders.py:102-127": "Flush and Create Dataset Loader", + "/applications/T2VLAD/data_loader/data_loaders.py:129-145": "Training DataLoader Creator", + "/applications/T2VLAD/data_loader/data_loaders.py:37-69": "Create Dataset Function", + "/applications/T2VLAD/data_loader/data_loaders.py:71-101": "Data Loader Constructor", + "/applications/T2VLAD/logger/__init__.py": "Importing T2VLAD Logger and Parser Functions", + "/applications/T2VLAD/logger/log_parser.py": "Log Summary: Epoch Performance Stats", + "/applications/T2VLAD/logger/log_parser.py:1-24": "Log Performance Stats with log_summary", + "/applications/T2VLAD/logger/log_parser.py:101-104": "Fixed Epoch Logging", + "/applications/T2VLAD/logger/log_parser.py:26-56": "Log Parser: Identifying Seeds and Metrics in T2VLAD", + "/applications/T2VLAD/logger/log_parser.py:57-78": "Log Parser: Extracting Scores for Seeds", + "/applications/T2VLAD/logger/log_parser.py:79-99": "Geometric Mean Seed Selection", + "/applications/T2VLAD/logger/logger.py": "Configure Logging from JSON File", + "/applications/T2VLAD/model/loss.py": "Contrastive Loss for T2VLAD", + "/applications/T2VLAD/model/loss.py:1-28": "Max Margin Ranking Loss for T2VLAD", + "/applications/T2VLAD/model/loss.py:29-61": "Contrastive Loss for Image-Sentence Pairs", + "/applications/T2VLAD/model/loss.py:62-85": "Contrastive Learning Cost Calculation", + "/applications/T2VLAD/model/loss.py:86-102": "Video-Level Loss Calculation in T2VLAD", + "/applications/T2VLAD/model/metric.py": "Retrieval Metrics and Visualization Tool", + "/applications/T2VLAD/model/metric.py:1-30": "Retrieval Metrics Computation", + "/applications/T2VLAD/model/metric.py:100-122": "Average Rank Calculator", + "/applications/T2VLAD/model/metric.py:124-148": "Retrieval Metric Computation and Validity Checks", + "/applications/T2VLAD/model/metric.py:150-180": "Closest Caption Retrieval Metrics", + "/applications/T2VLAD/model/metric.py:181-199": "Optimistic or Averaging Caption Ranking", + "/applications/T2VLAD/model/metric.py:200-224": "Matrix Rank Checker: Sanity-checking Code", + 
"/applications/T2VLAD/model/metric.py:225-243": "Ranking Metrics Computation with Matplotlib and Numpy", + "/applications/T2VLAD/model/metric.py:31-58": "Retrieval Metrics Calculation", + "/applications/T2VLAD/model/metric.py:59-75": "Averaging Tie-Breaking in Similarity Matrix", + "/applications/T2VLAD/model/metric.py:76-98": "Efficient Tied Scores Handling", + "/applications/T2VLAD/model/model.py": "Enhanced Video Analysis with CENet", + "/applications/T2VLAD/model/model.py:1-34": "Importing Libraries for T2VLAD Model", + "/applications/T2VLAD/model/model.py:131-148": "Text Pooling in T2VLAD", + "/applications/T2VLAD/model/model.py:149-179": "Transformer Layer Implementation", + "/applications/T2VLAD/model/model.py:180-207": "Attention Functions in T2VLAD Model", + "/applications/T2VLAD/model/model.py:208-237": "Transformer Class with Multi-Head Attention", + "/applications/T2VLAD/model/model.py:238-275": "CEModule Class Definition", + "/applications/T2VLAD/model/model.py:277-297": "MOE Model Initialization", + "/applications/T2VLAD/model/model.py:298-323": "Model Initialization and Preparation", + "/applications/T2VLAD/model/model.py:325-350": "Gated Embedding Units for MOE Computation", + "/applications/T2VLAD/model/model.py:35-66": "Implementing Mish, Kronecker Product, and NaN Removal Functions", + "/applications/T2VLAD/model/model.py:351-374": "Gated Text Embeddings in Model.py", + "/applications/T2VLAD/model/model.py:376-397": "Multi-Modal MOE Weights and Feature Extraction", + "/applications/T2VLAD/model/model.py:398-422": "Cross-View Video Localization via VLAD and MOE", + "/applications/T2VLAD/model/model.py:423-456": "T2VLAD Model Layers Explained", + "/applications/T2VLAD/model/model.py:458-485": "Sharded Embedding Similarity Matrix Function", + "/applications/T2VLAD/model/model.py:486-507": "Video-Text Similarity Calculator", + "/applications/T2VLAD/model/model.py:508-526": "Tensor Weights Combination and Normalization", + "/applications/T2VLAD/model/model.py:527-533": "Video-Text Similarity Calculator", + "/applications/T2VLAD/model/model.py:67-98": "NaN Handling in CENet Model", + "/applications/T2VLAD/model/model.py:99-130": "Model Initialization and Time Estimation", + "/applications/T2VLAD/model/net_vlad.py": "NetVLAD in T2VLAD Model Initialization", + "/applications/T2VLAD/model/net_vlad.py:1-33": "NetVLAD Algorithm: Implementation and Parameters", + "/applications/T2VLAD/model/net_vlad.py:100-100": "VLAD Feature Extraction in NetVLAD Model", + "/applications/T2VLAD/model/net_vlad.py:34-44": "Initializing VLAD Model Parameters", + "/applications/T2VLAD/model/net_vlad.py:46-76": "T2VLAD: Sanity Checks and Forward Pass", + "/applications/T2VLAD/model/net_vlad.py:77-99": "Batch Normalized VLAD Representation Generation", + "/applications/T2VLAD/model/text.py": "Text Embedding for Video Descriptions", + "/applications/T2VLAD/model/text.py:1-37": "Text Embedding Interface", + "/applications/T2VLAD/model/text.py:103-130": "Text Embedding Class", + "/applications/T2VLAD/model/text.py:131-146": "OpenAI GPT Embedding Tokenizer", + "/applications/T2VLAD/model/text.py:38-73": "W2VEmbedding: Text Embedding with Word2Vec", + "/applications/T2VLAD/model/text.py:74-101": "Initializing Text2Vec Model Class", + "/applications/T2VLAD/parse_config.py": "ConfigParser: Config Management & Parsing", + "/applications/T2VLAD/parse_config.py:1-35": "ConfigParser Class Overview", + "/applications/T2VLAD/parse_config.py:113-134": "Config Parser and Custom Arguments", + 
"/applications/T2VLAD/parse_config.py:135-159": "Config File Processing and Class Initialization", + "/applications/T2VLAD/parse_config.py:160-190": "Overwriting Check and Config Updates", + "/applications/T2VLAD/parse_config.py:191-232": "Parse Config Class", + "/applications/T2VLAD/parse_config.py:233-239": "Nested Object Access and Modify Functions", + "/applications/T2VLAD/parse_config.py:36-62": "Initializing Argument Parser and Config Loading", + "/applications/T2VLAD/parse_config.py:63-88": "Config-Based Model Saving and Logging", + "/applications/T2VLAD/parse_config.py:89-112": "Directory Purging and Recreation in parse_config.py", + "/applications/T2VLAD/test.py": "PaddleVideo: Prediction Compression and Evaluation", + "/applications/T2VLAD/test.py:1-33": "Compress Predictions in PaddleVideo Library", + "/applications/T2VLAD/test.py:117-146": "Paddle Model Initialization and Dataset Preparation", + "/applications/T2VLAD/test.py:147-167": "Video Sub-Sample Processing with T2VLAD", + "/applications/T2VLAD/test.py:168-190": "Metrics Calculation and Logging", + "/applications/T2VLAD/test.py:193-206": "Argument Parsing and Configuration Loading", + "/applications/T2VLAD/test.py:34-51": "Input Shape Validation: Ensuring Compatibility", + "/applications/T2VLAD/test.py:52-84": "Function for Initializing Paddle.js Model and Data Loader", + "/applications/T2VLAD/test.py:85-116": "Model Evaluation Initialization", + "/applications/T2VLAD/train.py": "Video Analysis Model Training Script", + "/applications/T2VLAD/train.py:1-35": "PaddleVideo: Training Framework Setup", + "/applications/T2VLAD/train.py:116-133": "Command-line Arguments for Video Analysis Training", + "/applications/T2VLAD/train.py:135-151": "Command-Line Training Setup", + "/applications/T2VLAD/train.py:37-67": "Experiment Initialization Function", + "/applications/T2VLAD/train.py:68-92": "Model Initialization and Training Setup", + "/applications/T2VLAD/train.py:93-115": "Train Model and Save Best", + "/applications/T2VLAD/trainer/__init__.py": "Importing Trainer Functions", + "/applications/T2VLAD/trainer/trainer.py": "Memory-Efficient Video Retrieval Trainer", + "/applications/T2VLAD/trainer/trainer.py:1-31": "PaddlePaddle Video Retrieval Trainer", + "/applications/T2VLAD/trainer/trainer.py:119-150": "ML Model Training Loop and Scheduler", + "/applications/T2VLAD/trainer/trainer.py:151-171": "Model Evaluation Initialization", + "/applications/T2VLAD/trainer/trainer.py:172-190": "Batch Subsampling for Video ML Model", + "/applications/T2VLAD/trainer/trainer.py:191-209": "PaddlePaddle-based Similarity Calculation for T2VLAD Training", + "/applications/T2VLAD/trainer/trainer.py:210-228": "Epoch Metrics Tracking and Visualization", + "/applications/T2VLAD/trainer/trainer.py:229-249": "Batch-wise Validation Metrics Calculation and Logging", + "/applications/T2VLAD/trainer/trainer.py:250-267": "Top-K Metric Implementation", + "/applications/T2VLAD/trainer/trainer.py:268-280": "Nested Predictions and Progress Functions", + "/applications/T2VLAD/trainer/trainer.py:32-66": "Evaluation Samples Duplication and Yielding", + "/applications/T2VLAD/trainer/trainer.py:67-89": "Epoch-based Model Trainer Class", + "/applications/T2VLAD/trainer/trainer.py:91-117": "Batch Training with Model Loss Computation", + "/applications/T2VLAD/utils/__init__.py": "Import All from Util Module", + "/applications/T2VLAD/utils/util.py": "Utility Functions for T2VLAD", + "/applications/T2VLAD/utils/util.py:1-50": "Utility Functions", + 
"/applications/T2VLAD/utils/util.py:106-143": "JSON, Hashable Dictionaries & Configuration Utilities", + "/applications/T2VLAD/utils/util.py:144-165": "Modality Dimensional Organization" +} \ No newline at end of file diff --git a/docs/data/titles/4.json b/docs/data/titles/4.json new file mode 100644 index 000000000..bddd16d5a --- /dev/null +++ b/docs/data/titles/4.json @@ -0,0 +1,302 @@ +{ + "/applications/T2VLAD/utils/util.py:166-181": "Determining Input-Output Dimensions for Expert Types and Temporal Methods", + "/applications/T2VLAD/utils/util.py:182-202": "Dimensional Assignment for Experts: Util.py 182-202", + "/applications/T2VLAD/utils/util.py:203-226": "Configuring Expert Dimensions for T2VLAD", + "/applications/T2VLAD/utils/util.py:227-258": "Dimensionality Adjustment and Tensor Utilities", + "/applications/T2VLAD/utils/util.py:260-284": "Normalize and Convert Image Tensors to Numpy Array", + "/applications/T2VLAD/utils/util.py:285-321": "Utility Functions in util.py", + "/applications/T2VLAD/utils/util.py:323-327": "Create Directory If Non-Existent", + "/applications/T2VLAD/utils/util.py:51-76": "Multifunctional Memory, Dictionary, and Expert Categorization", + "/applications/T2VLAD/utils/util.py:77-103": "Temporal Expert Management and Utilities", + "/applications/TableTennis/ActionRecognition/README.md": "Table Tennis Action Recognition with VideoSwinTransformer", + "/applications/TableTennis/ActionRecognition/README.md:1-43": "Table Tennis Action Recognition with VideoSwinTransformer", + "/applications/TableTennis/ActionRecognition/README.md:44-66": "TableTennis Action Recognition with VideoSwin", + "/applications/TableTennis/ActionRecognition/README.md:68-98": "Visualizing Predictions in Table Tennis Action Recognition", + "/applications/TableTennis/datasets/script/submission_format_transfer.py": "JSON Table Tennis Data Formatter", + "/applications/TableTennis/datasets/script/submission_format_transfer.py:1-49": "JSON Frame Rate Conversion and Formatting", + "/applications/TableTennis/datasets/script/submission_format_transfer.py:50-64": "Table Tennis Submission Format", + "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py": "TableTennis Video Inferencer", + "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:1-50": "BMN Model Loader for Baidu Cloud", + "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:52-84": "Video Feature Extraction and Bmn Prediction", + "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py:85-93": "JSON Inference Results Writer", + "/applications/TableTennis/fix_bad_label.py": "Fix Labels in Table Tennis App", + "/applications/TableTennis/get_instance_for_bmn.py": "BMN Model Ground Truth Data for Table Tennis", + "/applications/TableTennis/get_instance_for_bmn.py:1-48": "BMN Model Ground Truth Generation", + "/applications/TableTennis/get_instance_for_bmn.py:109-134": "Segmenting and Processing Actions List for BMN", + "/applications/TableTennis/get_instance_for_bmn.py:135-154": "Randomized Video Segment Selection", + "/applications/TableTennis/get_instance_for_bmn.py:155-182": "Video Data Segmentation and Annotation", + "/applications/TableTennis/get_instance_for_bmn.py:183-207": "Video Feature Extraction and Parsing", + "/applications/TableTennis/get_instance_for_bmn.py:208-227": "Table Tennis Dataset Processing and Saving", + "/applications/TableTennis/get_instance_for_bmn.py:49-74": "Video Action Extraction Algorithm", + "/applications/TableTennis/get_instance_for_bmn.py:75-108": 
"Combile GTS Segments", + "/applications/TableTennis/gts_format_transfer.py": "JSON Format Converter", + "/applications/TableTennis/predict/action_detect/action.py": "Baidu Cloud Action Detection Script", + "/applications/TableTennis/predict/action_detect/action.py:1-48": "Python Action Detection with Baidu Cloud", + "/applications/TableTennis/predict/action_detect/action.py:109-136": "Video Classification and Feature Extraction Model", + "/applications/TableTennis/predict/action_detect/action.py:137-158": "Extracting Image and Audio Features", + "/applications/TableTennis/predict/action_detect/action.py:159-185": "Action Detection Model Inference", + "/applications/TableTennis/predict/action_detect/action.py:186-186": "File Data Writing", + "/applications/TableTennis/predict/action_detect/action.py:49-76": "ModelPredict Class Initialization and Configuration", + "/applications/TableTennis/predict/action_detect/action.py:77-108": "Action Detection via Multimodal Feature Extraction", + "/applications/TableTennis/predict/action_detect/logger.py": "Custom Logger for Action Detection", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py": "Audio Features for Table Tennis Prediction", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:1-39": "Audio Feature Extraction in TableTennis App", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:113-137": "Mel Spectrogram Conversion and Wav Data Processing", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:138-158": "Table Tennis Audio Feature Extraction", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:160-183": "MFCC Extraction with VGG-16", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:40-70": "Audio Feature Extraction with MFCCs", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:71-91": "MFCC Feature Extraction for Speech Processing", + "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py:92-112": "Generate Spectrogram from Audio Data", + "/applications/TableTennis/predict/action_detect/mfcc/model_config.py": "Audio Feature Extraction Model", + "/applications/TableTennis/predict/action_detect/mfcc/vgg_params.py": "Global VGGish Parameters for Action Detection", + "/applications/TableTennis/predict/action_detect/models/audio_infer.py": "Audio Inference with PaddleVideo", + "/applications/TableTennis/predict/action_detect/models/audio_infer.py:1-37": "Audio Model Inference Initialization", + "/applications/TableTennis/predict/action_detect/models/audio_infer.py:39-67": "Audio Inference Class with PaddleVideo", + "/applications/TableTennis/predict/action_detect/models/audio_infer.py:69-78": "Audio Inferencing and Prediction", + "/applications/TableTennis/predict/action_detect/models/bmn_infer.py": "GPU-Optimized Action Detection", + "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:1-39": "BMN Infer Model Class", + "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:113-135": "Running Average Window Predictions", + "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:137-164": "BMN Infer Model Prediction", + "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:40-65": "Inference Model Setup and Generation", + "/applications/TableTennis/predict/action_detect/models/bmn_infer.py:66-87": "Boundary Proposition Generator", + 
"/applications/TableTennis/predict/action_detect/models/bmn_infer.py:88-112": "Boundary-Based Prediction Model", + "/applications/TableTennis/predict/action_detect/models/lstm_infer.py": "LSTM-Based Table Tennis Action Detection", + "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:1-38": "LSTM Action Detection Model Inferencing", + "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:113-136": "LSTM-Based Table Tennis Action Detection", + "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:138-158": "LSTM Predicts Table Tennis Action", + "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:40-62": "LSTM Inferencing Setup", + "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:63-92": "Table Tennis Action Detection Model", + "/applications/TableTennis/predict/action_detect/models/lstm_infer.py:93-111": "LSTM Action Detection Inference", + "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py": "PPTSM Inference with PaddlePaddle", + "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:1-38": "Initialize PPTSM InferModel with PaddlePaddle", + "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:40-68": "InferModel Video Frame Inference", + "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py:69-77": "Efficient Image Inference and Prediction", + "/applications/TableTennis/predict/action_detect/reader/__init__.py": "Alphabetical Reader Registration", + "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py": "BMNINF Reader for Table Tennis", + "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:1-49": "BMNINF Reader for PaddleVideo", + "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:104-133": "CTCN Model Reader for Table Tennis Action Detection", + "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:134-154": "Inference Reader: Iterating and Yielding Batches", + "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:50-72": "BMNINF Reader for Table Tennis", + "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py:73-103": "Video Analysis Dataset Creation Code", + "/applications/TableTennis/predict/action_detect/reader/feature_reader.py": "Table Tennis Action Detection Reader", + "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:1-34": "YouTube-8M Dataset Feature Reader", + "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:36-71": "Table Tennis Action Detector Feature Reader", + "/applications/TableTennis/predict/action_detect/reader/feature_reader.py:72-91": "Feature Extractor: Table Tennis Action Prediction", + "/applications/TableTennis/predict/action_detect/reader/reader_utils.py": "Handling Errors in TableTennis Reader", + "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:1-33": "Reader Error Handling in PaddleVideo TableTennis", + "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:34-81": "Video Reader Classes and Registry", + "/applications/TableTennis/predict/action_detect/reader/reader_utils.py:82-107": "Singleton Reader Registration Utilities", + "/applications/TableTennis/predict/action_detect/utils/config_utils.py": "Config Utilities for TableTennis", + "/applications/TableTennis/predict/action_detect/utils/config_utils.py:1-47": "Config Utils for TableTennis", + 
"/applications/TableTennis/predict/action_detect/utils/config_utils.py:48-80": "Config Utils for AttrDict Manipulation", + "/applications/TableTennis/predict/action_detect/utils/config_utils.py:81-81": "Context Changer Logging", + "/applications/TableTennis/predict/action_detect/utils/preprocess.py": "Preprocess.py: Video, Audio, Image Toolkit", + "/applications/TableTennis/predict/action_detect/utils/process_result.py": "One-Dimensional NMS for Video Analysis", + "/applications/TableTennis/predict/action_detect/utils/process_result.py:1-39": "Video Action Detection Result Calculator", + "/applications/TableTennis/predict/action_detect/utils/process_result.py:111-136": "Sorting Prop Filter Timestamps for Action Detection", + "/applications/TableTennis/predict/action_detect/utils/process_result.py:137-155": "Non-Max Suppression Result Function", + "/applications/TableTennis/predict/action_detect/utils/process_result.py:42-78": "Non-Maximal Suppression for Bounding Boxes", + "/applications/TableTennis/predict/action_detect/utils/process_result.py:79-110": "Efficient Video Property Processing and Classification", + "/applications/TableTennis/predict/eval.py": "Optimized Table Tennis Predictions", + "/applications/TableTennis/predict/eval.py:1-41": "Loading and Processing Ground Truth Data", + "/applications/TableTennis/predict/eval.py:109-142": "Evaluating Model Performance on Video Frames", + "/applications/TableTennis/predict/eval.py:145-166": "IoU-based Box Evaluation", + "/applications/TableTennis/predict/eval.py:167-186": "Precision-Recall Calculator", + "/applications/TableTennis/predict/eval.py:187-210": "Table Tennis Prediction Model Evaluation", + "/applications/TableTennis/predict/eval.py:212-239": "Table Tennis Video Analysis Model Evaluator", + "/applications/TableTennis/predict/eval.py:240-270": "Table Tennis Prediction Evaluation", + "/applications/TableTennis/predict/eval.py:271-287": "Optimizing IOU and Scores for F1 Evaluation", + "/applications/TableTennis/predict/eval.py:42-73": "Computer Vision Interval Union: Filter Proposals", + "/applications/TableTennis/predict/eval.py:74-108": "Converters for Boxes and Labels", + "/applications/TableTennis/predict/predict.py": "TableTennis Video Prediction", + "/applications/TableTennis/predict/predict.py:1-35": "Video Prediction Setup: PaddleVideo's TableTennis", + "/applications/TableTennis/predict/predict.py:36-36": "Saving Data to File", + "/applications/TableTennis/val_split.py": "JSON Split for Validation and Training Sets", + "/applications/VideoQualityAssessment/README.md": "PaddlePaddle 2.1 Video Quality Assessment Model", + "/applications/VideoQualityAssessment/README.md:1-58": "Video Quality Assessment Model with PaddlePaddle", + "/applications/VideoQualityAssessment/README.md:101-144": "Epoch Analysis and Fine-tuning", + "/applications/VideoQualityAssessment/README.md:145-179": "PaddleVideo: TSM Regression for Video Quality", + "/applications/VideoQualityAssessment/README.md:181-189": "Video Quality Assessment with SROCC and PLCC", + "/applications/VideoQualityAssessment/README.md:59-98": "Multigpu Distributed Training with PaddleVideo", + "/applications/VideoQualityAssessment/main.py": "Video Quality Assessment Training and Testing with PaddleVideo", + "/applications/VideoQualityAssessment/main.py:1-30": "Training Models with PaddleVideo", + "/applications/VideoQualityAssessment/main.py:31-52": "Command-Line Arguments for Video Quality Assessment", + "/applications/VideoQualityAssessment/main.py:53-88": "Command Line Args 
for Model Training and Testing", + "/applications/VideoQualityAssessment/paddlevideo/__init__.py": "PaddleVideo License and Import", + "/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py": "Video Dataset Loader for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py": "PaddleVideo Dataset Builder and Loader", + "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:1-33": "Video Pipeline Builder in Python", + "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:34-74": "Video Quality Assessment Dataset and Dataloader Builder", + "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:75-97": "DistributedBatchSampler Creation", + "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py:99-126": "Create and Manage DataLoaders with Signal Handlers", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py": "Python Video Dataset Module", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py": "Video Dataset Loader Class", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:1-34": "Base Dataset Class for Custom Loaders", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:36-62": "Video Index Loader Class", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py:63-83": "Dataset Class with Train-Test Methods", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py": "Frame-Rec Dataset Loader for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:1-32": "FrameRecDataset: PaddleVideo's Action Recognition Dataset", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:32-62": "Video Index File Loader Class", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:63-88": "Frame Dataset Preparer", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py:89-110": "Retry Loading Frames in Exception Handling", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py": "PaddleVideo Dataset Loader", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:1-32": "Python Video Dataset Loader", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:33-58": "Video Dataset Loader Initialization", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:59-81": "Video Dataset Loader: Robust Read, Pipeline, and Testing", + "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py:82-95": "Retry-Based Video File Reader", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py": "PaddleVideo Pipeline Modules", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py:1-40": "PaddleVideo Loader Pipelines", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py:41-50": "PaddleVideo Pipeline Modules", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py": "Multi-Scale Image Augmentation in PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:1-35": "Scale Class for Image Resizing", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:131-160": "MultiScaleCrop: Image Resizing and Cropping", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:161-192": "Random 
Crop Size Sampling", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:193-215": "Crop Position List Generator", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:216-247": "Random Crop and Flip Augmentation", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:248-281": "Random Flip and Image Pipeline", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:282-310": "PIL Images to Numpy Array Augmentation", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:311-344": "Dynamic Image Scaling Augmentation", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:345-370": "Jitter Resize and Random Scale Augmentation", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:36-61": "Preserve Aspect Ratio Resizing", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:371-403": "MultiCrop Image Resizer", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:404-430": "Random Cropping Augmentation", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:431-452": "Image Cropping with Random Offsets", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:453-484": "Image Crop and Append Function", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:485-498": "Slower Pathway Frame Selection", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:62-95": "Random Crop Pipeline for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py:97-130": "Center Crop Image Augmentation", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py": "Compose Class for Video Pipeline", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:1-33": "Compose Class Pipeline", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:34-61": "Compose Class Sequentially Combines Transforms", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py:62-79": "Video Pipeline Composer", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py": "PaddleVideo: MP4 Decoding and Feature Extraction", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:1-42": "PaddleVideo: MP4 Decoding", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:114-139": "Initialize Feature Paddings and Masks", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:140-165": "Decode, Dequantize, One-Hot Labels", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:43-80": "Multi-Decoder for Data Types", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py:81-113": "Decoding Pipeline for .pkl Files", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py": "Mixup for Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:1-36": "Mixup for Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:37-72": "Cutmix: Mixing Images and Labels in Datasets", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py:74-91": 
"Random Bounding Box Data Augmentation", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py": "Custom Sampler for PIL-based Video Frames", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:1-32": "Sampler: Efficient Video Frame Sampling", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:33-70": "Sample Pipeline Class Definition", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:71-96": "Video Frame Index Calculator", + "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py:97-102": "Frame Indexing in Video Loader", + "/applications/VideoQualityAssessment/paddlevideo/loader/registry.py": "PaddleVideo Registry Management", + "/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py": "Video Quality Assessment Metrics Initiation", + "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py": "BaseMetric: Foundation for Video Quality Metrics", + "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py:1-36": "Base Metric Class for Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py:37-39": "Base Class Accumulate Method", + "/applications/VideoQualityAssessment/paddlevideo/metrics/build.py": "PaddleVideo Metric Builder", + "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py": "Pearson and Spearman Correlation Metric", + "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:1-35": "Video Quality Metric Class Definition", + "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:36-62": "Pearson and Spearman Correlation Calculator Class", + "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py:64-72": "Calculate PLCC and SROCC from Output and Label Pair", + "/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py": "PaddleVideo Metrics Registry", + "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py": "PaddleVideo Modeling", + "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py:1-24": "Universal Model Registration", + "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py:26-45": "Video Model Building Toolkit", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py": "ResNet and ResNetTweaksTSM Imported", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py": "Dynamically Configurable ResNet Backbone", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:1-35": "ConvBNLayer: PaddlePaddle Backbone Customization", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:114-146": "Convolutional Neural Network BasicBlock", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:147-174": "ResNet Forward Function Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:175-210": "ResNet Backbone Creator", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:211-232": "ResNet Model Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:233-252": "Dynamic ResNet Model Architecture", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:253-270": "Backbone Model Weight Initialization", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:271-290": "Forward 
Pass for Backbone: Conv and Pooling", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:36-58": "ConvBNLayer Class Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:59-89": "ResNet Convolutional Layer Design", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py:90-113": "Bottleneck Block Construction in PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py": "TSM-ResNet-C Model", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:1-34": "ConvBNLayer: Convolutional BatchNorm Layer", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:115-140": "Custom ResNet-D Layer with Optional Pooling", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:141-170": "TSM Convolutional Block with Shortcut Connection", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:171-206": "ResNet TSM Backbone with Shortcuts", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:207-235": "ResNetTweaksTSM Instance Initialization", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:236-258": "TSM-ResNet Backbone Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:259-278": "TSM ResNet Model Builder", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:279-297": "ResNet Weights Initialization", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:297-317": "Initializing Backbone for Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:318-328": "ResNet-C Backbone for Video Quality", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:36-60": "ConvBNLayer: Batch Normalization and Activation Layer", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:61-85": "TSM ResNet Backbone Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py:86-114": "Bottleneck Block in ResNet Model", + "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py": "Computer Vision Model Builders", + "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py:1-36": "Building Computer Vision Model Components", + "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py:39-52": "Building Localizer and Model Functions", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py": "Importing Recognizers for Video Modeling", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py": "Importing Recognizers in Video Framework", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py": "Base Recognizer Class for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:1-38": "Base Recognizer: Train, Validate, Test Init", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:39-75": "Base Model Recognizer Initialization", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py:76-97": "Abstract 
Recognizer Model Base Class", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py": "Training Recognizer2D for 2D Models", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py:1-29": "PaddleVideo's 2D Recognizer Model Training", + "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py:31-52": "Recognizer2D: Validation and Test Methods", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py": "PaddleVideo Heads for Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py": "VideoQualityAssessment BaseHead Class", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:1-36": "PaddleVideo BaseHead Class Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:121-143": "Label Smooth Loss and Accuracy Functions", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:37-67": "BaseHead: Initializing PaddleVideo's Head Network", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:69-96": "VideoQualityAssessment Base Head Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py:97-120": "Label Smoothing Loss Calculator", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py": "TSM Recurrent Head: TSN-Based Classifier for Video QA", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:1-33": "TSM RecHead: TSN-Based Classifier for TSMs", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:123-149": "Loss Calculation for TSM-REC Head", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:150-153": "Squeeze and Label Smooth Loss Calculation", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:34-62": "Uniform Weights Initialization for FC Layer", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:63-91": "TSM Recognition Head Definition", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py:92-122": "Loss Function for Score Prediction Model", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py": "TSN Head: Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:1-31": "TSN Head: PaddlePaddle Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:32-64": "Image Classification Head with GAP and Dropout", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:65-96": "Forward Pass Function for Neural Network Head", + "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py:97-97": "Return Calculated Score", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py": "PaddleVideo Loss Functions", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py": "Base Loss Function in PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py:1-33": "Base Loss Function in PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py:34-51": "Abstract Loss Function Base Class", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py": "L1 Loss for Video Quality", + 
"/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py:1-33": "L1 Loss for Video Quality", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py:34-38": "L1 Loss Calculator", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py": "Custom SmoothL1 Loss in Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py:1-33": "Custom Smooth L1 Loss Function", + "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py:34-39": "Smooth L1 Loss Calculation", + "/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py": "Model Registry in PaddleVideo's Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py": "Weight Initialization in PaddlePaddle", + "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py:1-36": "Weight Initialization in PaddlePaddle Layers", + "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py:37-55": "Neural Network Weight Initialization", + "/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py": "Video Quality Assessment Optimizer", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py": "Custom Learning Rate Schedulers for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:1-33": "CustomWarmupCosineDecay Scheduler", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:109-134": "Customizable Learning Rate Scheduler", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:135-160": "Custom Learning Rate Scheduler for Optimizers", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:161-196": "Custom Learning Rate Scheduler for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:197-201": "Custom Learning Rate Scheduler", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:34-55": "Customizable Learning Rate Scheduler", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:56-81": "CustomWarmupCosineDecay: Cosine Decay Learning Rate Optimizer", + "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py:82-108": "Warmup-Cosine LR Scheduler for Video Quality", + "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py": "VideoQualityAssessment LR Scheduler", + "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py:1-33": "Learning Rate Scheduler Builder", + "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py:35-49": "Learning Rate Configurer", + "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py": "Weight Decay Optimizer Scheduler", + "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:1-36": "Configurable Optimizer Builder\n\nTitle within 3 to 7 words: Configurable Optimizer Builder", + "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:37-68": "Adam Optimizer with L2Decay and L1Decay Regularization", + "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py:69-79": "Weight Decay Optimizer Configurator", + "/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py": "Training and Testing Functions", + "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py": "Paddle Video Testing with Multi-Card Datasets", + "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:1-35": "Test Model Using Paddle 
Framework", + "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:36-66": "Multi-Card GPU Model Training", + "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py:67-78": "Batch Size, Metric Building, Iteration & Accumulation", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py": "Efficient Video Quality Assessment Training with PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:1-28": "Video Quality Assessment with PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:114-147": "Training Model Iteration", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:148-171": "Model Training Step in Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:173-198": "Backward Propagation Optimizer", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:199-229": "Training Video Quality Model: Update Learning Rate", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:230-254": "Validation Metrics Updater" +} \ No newline at end of file diff --git a/docs/data/titles/5.json b/docs/data/titles/5.json new file mode 100644 index 000000000..7782a306e --- /dev/null +++ b/docs/data/titles/5.json @@ -0,0 +1,302 @@ +{ + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:255-276": "Optimizer and Model State Saver", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:278-295": "Model Validation and Saving in Training Process", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:29-61": "GPU-Accelerated Model Training", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:62-89": "Efficient Model Training Setup", + "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py:90-113": "Resume Training with Data Loader and Optimizer", + "/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py": "PaddleVideo Utilities Module", + "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py": "Python Config Module Builder", + "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py:1-30": "Build Module from Config", + "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py:32-36": "Retrieve and Instantiate Object Classes from Registry", + "/applications/VideoQualityAssessment/paddlevideo/utils/config.py": "Config Management in PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:1-35": "PaddleVideo Config Class & Logger", + "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:111-142": "Config Override Function", + "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:143-174": "Config Override Function", + "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:175-180": "Config Parser and Validator", + "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:38-71": "Config Parser Functions", + "/applications/VideoQualityAssessment/paddlevideo/utils/config.py:72-110": "Config File Utilities", + "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py": "Distributed Computation Utilities", + "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py:1-35": "Distributed Computation Utilities", + "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py:36-36": "Returning Modified Objects", + "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py": "Distributed Logger for PaddleVideo", + 
"/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:1-40": "PaddleVideo's Logger Class", + "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:104-117": "Setup and Retrieve Logger", + "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:41-74": "Initialize PaddleVideo Logger", + "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py:75-103": "Distributed App Logging Config", + "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py": "Precise Batch Normalization Update", + "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:1-31": "Precise Batch Normalization for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:32-55": "Precise BN Stats for Improved Validation", + "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:56-82": "Precise Batch Normalization Accumulation", + "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py:83-84": "Dynamic Batch Normalization Update", + "/applications/VideoQualityAssessment/paddlevideo/utils/record.py": "Training Metrics Logger", + "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:1-29": "Building Record List for PaddleVideo", + "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:117-122": "Epoch Logger with Color Coding", + "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:30-51": "Building Record List for Metric Tracking", + "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:52-90": "Record List and Average Meter Definition", + "/applications/VideoQualityAssessment/paddlevideo/utils/record.py:91-115": "Batch and Epoch Metric Logger", + "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py": "Registry-Based Module Customization", + "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:1-35": "Registry for Customizable Modules", + "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:37-72": "Registry Class for Building Modules", + "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py:73-98": "Registry Class and Function Registration", + "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py": "Save/Load Weights Utilities", + "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:1-37": "Load Checkpoint for Video Quality Assessment", + "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:38-63": "Loading Checkpoint Weights in Paddle", + "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py:64-87": "Paddle Save, Load, and Create Directory Functions", + "/applications/VideoQualityAssessment/paddlevideo/version.py": "PaddleVideo Version Info", + "/applications/VideoQualityAssessment/run.sh": "TSM Model Training in PaddlePaddle", + "/applications/VideoQualityAssessment/run.sh:1-19": "CUDA-PaddlePaddle Shell Script", + "/applications/VideoQualityAssessment/run.sh:20-20": "Custom Predict Model with Disabled Benchmarking", + "/applications/VideoQualityAssessment/save_model.sh": "Export Best Model for Video Quality Assessment", + "/applications/VideoQualityAssessment/setup.py": "PaddleVideo: Video Analysis Toolkit", + "/applications/VideoQualityAssessment/setup.py:1-34": "Setting Up PaddleVideo Package", + "/applications/VideoQualityAssessment/setup.py:35-56": "ppvideo: PaddlePaddle-Based Video Package Setup", + "/applications/VideoQualityAssessment/setup.py:57-57": "Creating Empty Tuple", + "/applications/VideoTag/FineTune.md": "Fine-Tuning 
VideoTag: AttentionLSTM & TSN", + "/applications/VideoTag/FineTune.md:1-32": "Fine-Tuning VideoTag Models", + "/applications/VideoTag/FineTune.md:115-152": "PaddleVideo Fine-Tuning Guide", + "/applications/VideoTag/FineTune.md:153-188": "TSN Model Training, Evaluation and Prediction", + "/applications/VideoTag/FineTune.md:190-206": "Preparing Data for TSN and AttentionLSTM", + "/applications/VideoTag/FineTune.md:34-81": "TSN Features Extraction and AttentionLSTM Fine-tuning", + "/applications/VideoTag/FineTune.md:83-113": "Fine-tuning AttentionLSTM in VideoTag", + "/applications/VideoTag/README.md": "Large-scale Video Classification with PaddlePaddle", + "/applications/VideoTag/Run.md": "VideoTag App Installation and Usage", + "/applications/VideoTag/Run.md:1-54": "Install and Prepare Data for VideoTag", + "/applications/VideoTag/Run.md:106-109": "Video Classification Dictionary", + "/applications/VideoTag/Run.md:55-105": "Video Tag Testing Guide", + "/applications/VideoTag/Test.md": "VideoTag Testing Guide", + "/applications/VideoTag/eval.py": "PaddlePaddle Evaluation Setup", + "/applications/VideoTag/eval.py:1-33": "Setting Up PaddlePaddle Application Environment", + "/applications/VideoTag/eval.py:123-134": "Automating Paddle Test Metrics and GPU Checks", + "/applications/VideoTag/eval.py:34-64": "Command Line Argument Parser Function", + "/applications/VideoTag/eval.py:65-95": "Test Model Evaluation Function", + "/applications/VideoTag/eval.py:96-122": "Batch-by-batch Model Evaluation and Metrics", + "/applications/VideoTag/metrics/__init__.py": "Import Metrics Function for Video Analysis", + "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py": "AccuracyMetrics Calculator", + "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:1-34": "PaddleVideo MetricsCalculator Class", + "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:35-62": "Accuracy Metrics Computation", + "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:63-90": "Average Loss and Accuracy Metrics", + "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py:92-107": "Top-K Accuracy Metric", + "/applications/VideoTag/metrics/metrics_util.py": "Video Metrics Evaluator Class", + "/applications/VideoTag/metrics/metrics_util.py:1-33": "Video Metrics Evaluation Utilities", + "/applications/VideoTag/metrics/metrics_util.py:114-135": "Video Tagging Metrics Calculator", + "/applications/VideoTag/metrics/metrics_util.py:138-163": "Kinetics400 Metrics Calculator", + "/applications/VideoTag/metrics/metrics_util.py:164-187": "Evaluate Video Predictions and Losses", + "/applications/VideoTag/metrics/metrics_util.py:188-210": "Infer Results Printer Function", + "/applications/VideoTag/metrics/metrics_util.py:212-237": "Metrics Utilities: Save, Calculate, and Log", + "/applications/VideoTag/metrics/metrics_util.py:238-278": "MetricsZoo Class for Metrics Management", + "/applications/VideoTag/metrics/metrics_util.py:279-279": "Registering TSN Metric", + "/applications/VideoTag/metrics/metrics_util.py:34-69": "Youtube8m Metrics Calculation", + "/applications/VideoTag/metrics/metrics_util.py:70-90": "Accumulating Metrics for Video Tagging", + "/applications/VideoTag/metrics/metrics_util.py:92-113": "VideoTag: Logging Final Results for Each Video", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py": "Interpolated Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:1-23": "Interpolated Average Precision Calculator", + 
"/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:109-134": "Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:136-166": "Non-Interpolated Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:168-192": "Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:193-220": "Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:221-256": "Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:25-55": "Interpolated Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:257-274": "Normalized Predictions: Min-Max Scaling", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:57-86": "Average Precision Calculator Class", + "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py:87-108": "Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/eval_util.py": "PaddleVideo Metrics for Model Evaluation", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:1-28": "YouTube8M Evaluation Utilities", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:110-135": "Top-K Video Classification Evaluation", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:136-164": "Evaluation Metrics Class", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:165-190": "Batch Metrics Calculation Function", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:191-219": "Epoch Metrics Calculator", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:220-244": "YouTube8m Metrics Evaluator", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:30-59": "Precision-Recall Average Hit at One", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:60-87": "Global Average Precision Calculation", + "/applications/VideoTag/metrics/youtube8m/eval_util.py:88-109": "Top K Video Predictions Evaluation", + "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py": "YouTube-8m Mean Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:1-27": "Mean Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:112-113": "Mean Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:28-58": "Mean Average Precision Calculation in YouTube8M", + "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:59-79": "Mean Average Precision Calculator", + "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py:80-111": "Mean Average Precision Calculator", + "/applications/VideoTag/models/__init__.py": "Model Registry in VideoTag", + "/applications/VideoTag/models/attention_lstm/__init__.py": "Import Attention LSTM Functions", + "/applications/VideoTag/models/attention_lstm/attention_lstm.py": "Attention LSTM Video Tagging Model", + "/applications/VideoTag/models/attention_lstm/attention_lstm.py:1-31": "Attention LSTM Model Definition", + "/applications/VideoTag/models/attention_lstm/attention_lstm.py:104-125": "Attention LSTM Model for Video Tagging", + "/applications/VideoTag/models/attention_lstm/attention_lstm.py:126-151": "Attention LSTM Model with Learning Rate Decay", + 
"/applications/VideoTag/models/attention_lstm/attention_lstm.py:152-180": "Attention LSTM Model Class Definition", + "/applications/VideoTag/models/attention_lstm/attention_lstm.py:33-54": "Attention LSTM Model Configuration", + "/applications/VideoTag/models/attention_lstm/attention_lstm.py:55-76": "Initializing Feature and Label Inputs", + "/applications/VideoTag/models/attention_lstm/attention_lstm.py:77-103": "LSTM Attention Model with Multi-Input Features", + "/applications/VideoTag/models/attention_lstm/lstm_attention.py": "Bidirectional LSTM Attention Model", + "/applications/VideoTag/models/attention_lstm/lstm_attention.py:1-31": "LSTM Attention Model Code", + "/applications/VideoTag/models/attention_lstm/lstm_attention.py:32-58": "Dynamic LSTM for Video Tagging", + "/applications/VideoTag/models/attention_lstm/lstm_attention.py:60-83": "Dynamic LSTM Model for Sequence Classification", + "/applications/VideoTag/models/model.py": "Python Module: PaddleVideo's VideoTag Model Handler", + "/applications/VideoTag/models/model.py:1-36": "VideoTag Model Initialization", + "/applications/VideoTag/models/model.py:107-139": "Model Utilities: Dataset, Weights, and Pretraining", + "/applications/VideoTag/models/model.py:140-167": "Weight Handling Functions", + "/applications/VideoTag/models/model.py:168-192": "ModelZoo: Managing and Retrieving Models", + "/applications/VideoTag/models/model.py:37-69": "Custom Exceptions and Model Base Class", + "/applications/VideoTag/models/model.py:70-105": "Subclassing Model Class for Implementation", + "/applications/VideoTag/models/tsn/__init__.py": "Effortless TSN Import", + "/applications/VideoTag/models/tsn/tsn.py": "TSN Model Initialization", + "/applications/VideoTag/models/tsn/tsn.py:1-34": "TSN Model Class", + "/applications/VideoTag/models/tsn/tsn.py:102-129": "Piecewise Learning Rate Decay Optimizer", + "/applications/VideoTag/models/tsn/tsn.py:130-159": "Multi-Mode Model with Pre-Trained Weights", + "/applications/VideoTag/models/tsn/tsn.py:160-165": "Prune Pretrained Parameters", + "/applications/VideoTag/models/tsn/tsn.py:35-52": "TSN Model Initialization", + "/applications/VideoTag/models/tsn/tsn.py:53-75": "TSN Model Input Generation", + "/applications/VideoTag/models/tsn/tsn.py:77-101": "TSN Model Configurable Parameters", + "/applications/VideoTag/models/tsn/tsn_res_model.py": "TSN ResNet Model in PaddlePaddle", + "/applications/VideoTag/models/tsn/tsn_res_model.py:1-34": "TSN ResNet Model Definition", + "/applications/VideoTag/models/tsn/tsn_res_model.py:119-142": "ResNet Model Implementation with PaddlePaddle", + "/applications/VideoTag/models/tsn/tsn_res_model.py:143-161": "Adaptive Average Pooling and Softmax Output", + "/applications/VideoTag/models/tsn/tsn_res_model.py:35-63": "Convolutional Layer with Batch Normalization", + "/applications/VideoTag/models/tsn/tsn_res_model.py:65-86": "Bottleneck Block and Shortcut Functions", + "/applications/VideoTag/models/tsn/tsn_res_model.py:87-118": "TSN ResNet Model: Conv-Batch Normalization Layers", + "/applications/VideoTag/models/utils.py": "Comprehensive File Operations Utility", + "/applications/VideoTag/models/utils.py:1-36": "Decompress and Download Utilities", + "/applications/VideoTag/models/utils.py:39-47": "AttrDict Class: Access Attributes Easily", + "/applications/VideoTag/predict.py": "PaddleVideo: Predicting Video Tags with AI", + "/applications/VideoTag/predict.py:1-37": "Import and Initialization Script", + "/applications/VideoTag/predict.py:117-141": "Video Tag Prediction 
Model", + "/applications/VideoTag/predict.py:143-171": "Average Processing Time Logger", + "/applications/VideoTag/predict.py:38-64": "Setting Up Logger and Parsing Arguments", + "/applications/VideoTag/predict.py:65-87": "Video Tag Prediction Python Script", + "/applications/VideoTag/predict.py:88-115": "Building PaddleVideo Inference Model", + "/applications/VideoTag/reader/__init__.py": "Alphabetical Reader Registration", + "/applications/VideoTag/reader/feature_reader.py": "DataReader: LSTM-based YouTube Dataset Processing", + "/applications/VideoTag/reader/feature_reader.py:1-34": "Youtube-8M Dataset LSTM Feature Reader", + "/applications/VideoTag/reader/feature_reader.py:35-64": "Feature Reader Initialization", + "/applications/VideoTag/reader/feature_reader.py:65-80": "One-Hot Video Frame Labeling", + "/applications/VideoTag/reader/kinetics_reader.py": "Efficient Kinetics Dataset Reader", + "/applications/VideoTag/reader/kinetics_reader.py:1-41": "PaddleVideo's Kinetics Reader: Frame Data and License", + "/applications/VideoTag/reader/kinetics_reader.py:122-151": "MP4 Reader Function", + "/applications/VideoTag/reader/kinetics_reader.py:152-176": "Frames and Labels: Kinetics Reader", + "/applications/VideoTag/reader/kinetics_reader.py:177-204": "Video Frame Loader and Error Handler", + "/applications/VideoTag/reader/kinetics_reader.py:205-233": "Video Decoder Selector", + "/applications/VideoTag/reader/kinetics_reader.py:234-266": "Data Augmentation for Image Processing", + "/applications/VideoTag/reader/kinetics_reader.py:269-305": "Versatile Image and Video Processing Functions", + "/applications/VideoTag/reader/kinetics_reader.py:306-340": "Frame Subset Selector", + "/applications/VideoTag/reader/kinetics_reader.py:341-367": "Video Frame Duration Analyzer", + "/applications/VideoTag/reader/kinetics_reader.py:42-79": "Kinetics Reader: MP4/PKL Dataset Access", + "/applications/VideoTag/reader/kinetics_reader.py:80-98": "Kinetics Reader Initialization", + "/applications/VideoTag/reader/kinetics_reader.py:99-121": "Configure Video Reader: Batch Size, File List, Random Seed", + "/applications/VideoTag/reader/reader_utils.py": "Reader Zoo Class and Utilities", + "/applications/VideoTag/reader/reader_utils.py:1-31": "Importing Libraries and Defining Reader Exceptions", + "/applications/VideoTag/reader/reader_utils.py:32-70": "Video Reader Classes and Registry", + "/applications/VideoTag/reader/reader_utils.py:71-80": "Register and Retrieve Readers Class", + "/applications/VideoTag/train.py": "VideoTag: CUDA-Powered Model Training and Saving", + "/applications/VideoTag/train.py:1-32": "VideoTag: Setting Up and Importing", + "/applications/VideoTag/train.py:110-134": "Training Model Initialization", + "/applications/VideoTag/train.py:136-161": "GPU-Aware Program Compilation", + "/applications/VideoTag/train.py:162-181": "Batch Size and Data Loading Setup", + "/applications/VideoTag/train.py:182-205": "Video Tagging Model Training with PaddlePaddle", + "/applications/VideoTag/train.py:206-212": "Directory Check and Training Initiation", + "/applications/VideoTag/train.py:33-60": "Argparse Configuration and Default Values", + "/applications/VideoTag/train.py:61-82": "Command Line Arguments for Training Program", + "/applications/VideoTag/train.py:83-109": "Command Line Argument Parsing", + "/applications/VideoTag/tsn_extractor.py": "Video Inference and Feature Extraction", + "/applications/VideoTag/tsn_extractor.py:1-37": "Python Script for PaddlePaddle Model Training", + 
"/applications/VideoTag/tsn_extractor.py:120-144": "Model Weights Downloader and Inferencer", + "/applications/VideoTag/tsn_extractor.py:145-158": "Extract and Log Features for Inference", + "/applications/VideoTag/tsn_extractor.py:38-66": "Command-Line Arguments for Model Training", + "/applications/VideoTag/tsn_extractor.py:67-93": "Command Line Arguments and Parsing in TsnExtractor", + "/applications/VideoTag/tsn_extractor.py:94-118": "Infer Model Initialization", + "/applications/VideoTag/utils/config_utils.py": "Config Handler for VideoTag", + "/applications/VideoTag/utils/config_utils.py:1-37": "VideoTag Config Utils", + "/applications/VideoTag/utils/config_utils.py:38-73": "YAML Config Processing and Merging Utils", + "/applications/VideoTag/utils/config_utils.py:74-75": "Config Log Separation", + "/applications/VideoTag/utils/train_utils.py": "Train Utils with PaddlePaddle", + "/applications/VideoTag/utils/train_utils.py:1-32": "Logging Learning Rate in PaddlePaddle", + "/applications/VideoTag/utils/train_utils.py:110-135": "Training Progress Tracker", + "/applications/VideoTag/utils/train_utils.py:136-159": "Model Saving and Testing Procedure", + "/applications/VideoTag/utils/train_utils.py:161-161": "Incomplete Code Snippet", + "/applications/VideoTag/utils/train_utils.py:33-57": "Retrieve and Print Learning Rate\n(or)\nLearning Rate Retrieval and Display", + "/applications/VideoTag/utils/train_utils.py:58-80": "Train Model with Dataloader Function", + "/applications/VideoTag/utils/train_utils.py:81-109": "Epoch Loop Initialization", + "/applications/VideoTag/utils/utility.py": "Python Utility: PaddlePaddle Checker", + "/applications/VideoTag/utils/utility.py:1-37": "Python Utility Script", + "/applications/VideoTag/utils/utility.py:39-66": "Compatibility and GPU Handling for PaddlePaddle", + "/applications/VideoTag/utils/utility.py:67-70": "Check Version Installation", + "/applications/VideoTag/videotag_test.py": "Efficient Video Tagging with PaddlePaddle", + "/applications/VideoTag/videotag_test.py:1-34": "VideoTag Test Log Config", + "/applications/VideoTag/videotag_test.py:112-134": "Video Tagging Inference Model", + "/applications/VideoTag/videotag_test.py:135-154": "Extractor Setup and Timing in PaddleVideo", + "/applications/VideoTag/videotag_test.py:156-177": "Configure and Prepare Input Data", + "/applications/VideoTag/videotag_test.py:179-199": "Efficient Model Predictor Setup", + "/applications/VideoTag/videotag_test.py:201-221": "DataFeeder Initialization and Model Execution", + "/applications/VideoTag/videotag_test.py:222-238": "Inference Time Logger", + "/applications/VideoTag/videotag_test.py:35-62": "Command-Line Argument Parser for VideoTag", + "/applications/VideoTag/videotag_test.py:63-86": "Command-Line Arguments for Video Tagging", + "/applications/VideoTag/videotag_test.py:87-111": "Video Classification Model with PaddlePaddle", + "/benchmark/TimeSformer/README.md": "TimeSformer Benchmarking Guide", + "/benchmark/TimeSformer/run_all.sh": "TimeSformer Benchmarking Script", + "/benchmark/TimeSformer/run_all.sh:1-20": "TimeSformer Model Benchmark Setup", + "/benchmark/TimeSformer/run_all.sh:20-47": "TimeSformer Dataset Prep & Benchmark", + "/benchmark/TimeSformer/run_all.sh:47-57": "Multi-GPU Performance Testing for TimeSformer", + "/benchmark/TimeSformer/run_benchmark.sh": "TimeSformer Benchmarking", + "/benchmark/TimeSformer/run_benchmark.sh:1-28": "TimeSformer Benchmark Script", + "/benchmark/TimeSformer/run_benchmark.sh:29-48": "Run TimeSformer Benchmark", 
+ "/benchmark/TimeSformer/run_benchmark.sh:49-77": "Run TimeSformer Benchmark", + "/data/50salads/prepare_asrf_data.py": "Prepare ASRF Data for 50Salads", + "/data/50salads/prepare_asrf_data.py:1-42": "Dataset Class ID Mapping and Argument Parsing", + "/data/50salads/prepare_asrf_data.py:107-113": "Checks Direct Execution", + "/data/50salads/prepare_asrf_data.py:43-74": "Setup Dataset Directory and Index Classes", + "/data/50salads/prepare_asrf_data.py:76-106": "Preparing ASRF Data for Salad Classification", + "/data/50salads/transform_segmentation_label.py": "Video Data Labeling Tool", + "/data/50salads/transform_segmentation_label.py:1-34": "Label Conversion Tool", + "/data/50salads/transform_segmentation_label.py:120-147": "Segmentation Label Writer", + "/data/50salads/transform_segmentation_label.py:148-173": "Label File Processing and Conversion", + "/data/50salads/transform_segmentation_label.py:174-195": "Command Line Arguments Parser for Label Conversion", + "/data/50salads/transform_segmentation_label.py:35-55": "Action Detection and Labeling in Transform Segmentation", + "/data/50salads/transform_segmentation_label.py:56-90": "Video Segmentation Label Conversion", + "/data/50salads/transform_segmentation_label.py:91-119": "Generate Action Labels from Segmentation", + "/data/ntu-rgb-d/download_dataset.sh": "Download and Extract Skeleton Data", + "/data/ntu-rgb-d/get_raw_denoised_data.py": "NTU Dataset Data Cleaning & Denoising", + "/data/ntu-rgb-d/get_raw_denoised_data.py:1-38": "Setting Up Directories and Loggers", + "/data/ntu-rgb-d/get_raw_denoised_data.py:117-147": "Denoising Bodies by Spread Threshold", + "/data/ntu-rgb-d/get_raw_denoised_data.py:148-172": "Noisy Frame Filter", + "/data/ntu-rgb-d/get_raw_denoised_data.py:173-198": "Denoising Body Motion Data", + "/data/ntu-rgb-d/get_raw_denoised_data.py:200-225": "Denoising Bodies Data by Frame Length and Spread", + "/data/ntu-rgb-d/get_raw_denoised_data.py:226-252": "Denoising NTU RGB-D Data with Motion Integration", + "/data/ntu-rgb-d/get_raw_denoised_data.py:253-280": "Extract Joints and Colors from Body Data", + "/data/ntu-rgb-d/get_raw_denoised_data.py:281-303": "Missing Frame Detection and Update", + "/data/ntu-rgb-d/get_raw_denoised_data.py:305-329": "Extracting Bodies Data and Points", + "/data/ntu-rgb-d/get_raw_denoised_data.py:331-358": "Denoising and Extracting Data from NTU RGB-D Dataset", + "/data/ntu-rgb-d/get_raw_denoised_data.py:359-377": "Extracting Actor Data from NTU-RGB-D", + "/data/ntu-rgb-d/get_raw_denoised_data.py:378-403": "Extracting and Denoising Skeleton Data", + "/data/ntu-rgb-d/get_raw_denoised_data.py:39-63": "Organized Logging in NTU RGB-D Data Processing", + "/data/ntu-rgb-d/get_raw_denoised_data.py:405-419": "Raw Skeleton Data Processing", + "/data/ntu-rgb-d/get_raw_denoised_data.py:420-445": "Raw Skeleton Sequence Data Processing", + "/data/ntu-rgb-d/get_raw_denoised_data.py:446-471": "Data Extraction and Analysis Tool", + "/data/ntu-rgb-d/get_raw_denoised_data.py:64-86": "Denoising Skeleton Frames: Multiple Loggers", + "/data/ntu-rgb-d/get_raw_denoised_data.py:87-116": "Threshold-Based Body Filtration", + "/data/ntu-rgb-d/get_raw_skes_data.py": "NTU Skeleton Data Extractor", + "/data/ntu-rgb-d/get_raw_skes_data.py:1-28": "Extracting Skeleton Data", + "/data/ntu-rgb-d/get_raw_skes_data.py:101-130": "Combine Raw Skeleton Data Files", + "/data/ntu-rgb-d/get_raw_skes_data.py:132-157": "NTU Dataset Filter & Save", + "/data/ntu-rgb-d/get_raw_skes_data.py:29-58": "Joint Counting from .skeleton 
Files", + "/data/ntu-rgb-d/get_raw_skes_data.py:59-76": "Extract and Update Body Data", + "/data/ntu-rgb-d/get_raw_skes_data.py:77-100": "NTU Skeleton Data Retriever", + "/data/ntu-rgb-d/seq_transformation.py": "NTU-RGB-D Dataset Transformation", + "/data/ntu-rgb-d/seq_transformation.py:1-34": "Directory Check and Frame Filtering", + "/data/ntu-rgb-d/seq_transformation.py:119-150": "Sequence Transformation and Encoding Functions", + "/data/ntu-rgb-d/seq_transformation.py:151-176": "Train-Validation Split Function", + "/data/ntu-rgb-d/seq_transformation.py:178-204": "Evaluating and Initializing Data Paths", + "/data/ntu-rgb-d/seq_transformation.py:205-235": "Get Indices for Cross-Subject or View Evaluation", + "/data/ntu-rgb-d/seq_transformation.py:236-263": "NTU Load and Preprocessing", + "/data/ntu-rgb-d/seq_transformation.py:264-266": "Split and Process NTU Dataset", + "/data/ntu-rgb-d/seq_transformation.py:35-63": "Seq Transformation: Filtering and Calculating Origin Points", + "/data/ntu-rgb-d/seq_transformation.py:64-89": "Sequence Transformation for NTU RGB+D Dataset", + "/data/ntu-rgb-d/seq_transformation.py:90-118": "Skeleton Alignment and Frame Count Update", + "/deploy/cpp_infer/external-cmake/auto-log.cmake": "Including Git External Project with CMake", + "/deploy/cpp_infer/include/postprocess_op.h": "Softmax Inplace Run for PaddleVideo", + "/deploy/cpp_infer/include/postprocess_op.h:1-39": "Softmax In-Place Transformation", + "/deploy/cpp_infer/include/postprocess_op.h:40-43": "Postprocess Vector Float Iterators", + "/deploy/cpp_infer/include/preprocess_op.h": "Image Preprocessing Operations", + "/deploy/cpp_infer/include/preprocess_op.h:1-39": "Normalize Class in PaddleVideo Library", + "/deploy/cpp_infer/include/preprocess_op.h:40-74": "Versatile Image Preprocessing for PaddleVideo", + "/deploy/cpp_infer/include/utility.h": "Utility Functions for PaddleVideo", + "/deploy/cpp_infer/include/utility.h:1-40": "Utility Class for PaddleVideo", + "/deploy/cpp_infer/include/utility.h:42-54": "Utility Functions for PaddleVideo", + "/deploy/cpp_infer/include/video_rec.h": "VideoRecognizer: OpenCV-PaddlePaddle Integration" +} \ No newline at end of file diff --git a/docs/data/titles/6.json b/docs/data/titles/6.json new file mode 100644 index 000000000..62f4c99b1 --- /dev/null +++ b/docs/data/titles/6.json @@ -0,0 +1,302 @@ +{ + "/deploy/cpp_infer/include/video_rec.h:1-34": "OpenCV & PaddlePaddle Licensing and Video Recording API", + "/deploy/cpp_infer/include/video_rec.h:36-57": "VideoRecognizer Object Creation and Configuration", + "/deploy/cpp_infer/include/video_rec.h:58-86": "Video Recognition Class Initialization", + "/deploy/cpp_infer/include/video_rec.h:87-105": "VideoRecognizer Initialization Code", + "/deploy/cpp_infer/readme.md": "C++ PaddleVideo Deployment Error", + "/deploy/cpp_infer/readme.md:1-45": "Deploying PaddleVideo Models with C++", + "/deploy/cpp_infer/readme.md:126-170": "Compiling Paddle Inference API Library: Steps and Build Parameters", + "/deploy/cpp_infer/readme.md:172-213": "Compiling PaddleVideo C++ Demo for Inference", + "/deploy/cpp_infer/readme.md:214-259": "C++ Prediction Demo Instructions", + "/deploy/cpp_infer/readme.md:260-273": "Video Recognition Model Execution Parameters", + "/deploy/cpp_infer/readme.md:274-289": "Model Configuration and Detection Demo", + "/deploy/cpp_infer/readme.md:290-304": "Optimizing Inference Engine Configuration", + "/deploy/cpp_infer/readme.md:304-324": "C++ Inference Time & Libcudnn Issue", + 
"/deploy/cpp_infer/readme.md:46-91": "Compiling OpenCV for C++ Video Inference", + "/deploy/cpp_infer/readme.md:93-125": "Two Ways to Obtain Paddle Prediction Library", + "/deploy/cpp_infer/readme_en.md": "PaddleVideo Linux Deployment Guide", + "/deploy/cpp_infer/readme_en.md:1-20": "C++ Deployment Guide for PaddleVideo", + "/deploy/cpp_infer/readme_en.md:110-123": "Downloading and Unzipping Paddle Inference Library", + "/deploy/cpp_infer/readme_en.md:123-150": "Install Paddle Prediction Library: C++ Edition", + "/deploy/cpp_infer/readme_en.md:150-173": "Library Generation and Version Information", + "/deploy/cpp_infer/readme_en.md:174-203": "Compiling PaddleVideo C++ Demo Instructions", + "/deploy/cpp_infer/readme_en.md:20-37": "Linux Video Reading Setup", + "/deploy/cpp_infer/readme_en.md:204-231": "TensorRT Deployment with PaddleVideo", + "/deploy/cpp_infer/readme_en.md:232-258": "Customize PaddleVideo Inference Parameters", + "/deploy/cpp_infer/readme_en.md:259-271": "Video Recognition Model Configuration Parameters", + "/deploy/cpp_infer/readme_en.md:273-289": "TensorRT CPP Inference Code Snippet", + "/deploy/cpp_infer/readme_en.md:290-308": "Missing CUDA Library: Inference Details", + "/deploy/cpp_infer/readme_en.md:309-316": "CMake: Missing libcudnn.so Error", + "/deploy/cpp_infer/readme_en.md:39-76": "OpenCV Linux Compilation Guide", + "/deploy/cpp_infer/readme_en.md:77-110": "OpenCV Library Setup and C++ Video Inference", + "/deploy/cpp_infer/src/main.cpp": "OpenCV Video Recognition in C++", + "/deploy/cpp_infer/src/main.cpp:1-35": "OpenCV License and Headers", + "/deploy/cpp_infer/src/main.cpp:110-138": "Video Inference Parameter Check", + "/deploy/cpp_infer/src/main.cpp:139-170": "Validate and Launch Recording Mode", + "/deploy/cpp_infer/src/main.cpp:171-173": "Program Termination and Return Statement", + "/deploy/cpp_infer/src/main.cpp:37-54": "Inference Parameters", + "/deploy/cpp_infer/src/main.cpp:55-85": "Batch Video Processing with Video Recognition", + "/deploy/cpp_infer/src/main.cpp:86-109": "Batch Video Frame Recognition with PaddleVideo", + "/deploy/cpp_infer/src/postprocess_op.cpp": "Softmax In-Place Normalization", + "/deploy/cpp_infer/src/postprocess_op.cpp:1-26": "Softmax In-place Implementation in PaddleVideo", + "/deploy/cpp_infer/src/postprocess_op.cpp:27-50": "Softmax Implementation", + "/deploy/cpp_infer/src/preprocess_op.cpp": "Image Preprocessing for PaddleVideo Inference", + "/deploy/cpp_infer/src/preprocess_op.cpp:1-36": "Permute Class for OpenCV and Paddle API", + "/deploy/cpp_infer/src/preprocess_op.cpp:105-132": "Ten-Crop Image Preprocessing", + "/deploy/cpp_infer/src/preprocess_op.cpp:133-135": "Pre-processing for PaddleVideo", + "/deploy/cpp_infer/src/preprocess_op.cpp:37-66": "Image Channel Preprocessing", + "/deploy/cpp_infer/src/preprocess_op.cpp:67-104": "Resizable and Croppable Image Processing", + "/deploy/cpp_infer/src/utility.cpp": "Utility Functions in PaddleVideo Library", + "/deploy/cpp_infer/src/utility.cpp:1-33": "Utility Function in PaddleVideo Library", + "/deploy/cpp_infer/src/utility.cpp:119-146": "Perspective Image Transformation", + "/deploy/cpp_infer/src/utility.cpp:147-181": "Video Frame Sampler", + "/deploy/cpp_infer/src/utility.cpp:182-192": "Video Frame Sampling", + "/deploy/cpp_infer/src/utility.cpp:34-67": "Reads Label File and Retrieves Directory Contents", + "/deploy/cpp_infer/src/utility.cpp:68-93": "Directory File Path Vectorization and Image Bounding Box", + "/deploy/cpp_infer/src/utility.cpp:94-118": "Crop and 
Standardize Image Points", + "/deploy/cpp_infer/src/video_rec.cpp": "AI-powered Video Processing", + "/deploy/cpp_infer/src/video_rec.cpp:1-26": "Batch Size Operations", + "/deploy/cpp_infer/src/video_rec.cpp:106-129": "Image Preprocessing for Video Frames", + "/deploy/cpp_infer/src/video_rec.cpp:130-152": "Vector Initialization and Inference", + "/deploy/cpp_infer/src/video_rec.cpp:153-175": "Softmax-Based AI Inference", + "/deploy/cpp_infer/src/video_rec.cpp:177-198": "Post-processing Object Detection Results", + "/deploy/cpp_infer/src/video_rec.cpp:199-223": "Paddle Video Recognizer Initialization", + "/deploy/cpp_infer/src/video_rec.cpp:225-247": "Configure TensorRT Engine for Video Models", + "/deploy/cpp_infer/src/video_rec.cpp:248-271": "Configure TensorRT Parameters", + "/deploy/cpp_infer/src/video_rec.cpp:27-55": "Video Frame Preprocessing for Inference", + "/deploy/cpp_infer/src/video_rec.cpp:272-304": "Initialize PaddleVideo Predictor with TRT Options", + "/deploy/cpp_infer/src/video_rec.cpp:56-80": "Video Frame Processing and Conversion", + "/deploy/cpp_infer/src/video_rec.cpp:81-105": "Batch Segment Data Preprocessing", + "/deploy/cpp_infer/tools/build.sh": "Build C++ Inference Script", + "/deploy/cpp_serving/paddle_env_install.sh": "PaddleVideo C++ Serving Environment Setup", + "/deploy/cpp_serving/paddle_env_install.sh:1-22": "Install TensorRT and PaddleVideo Dependencies", + "/deploy/cpp_serving/paddle_env_install.sh:23-35": "PaddleVideo C++ Serving Environment Setup", + "/deploy/cpp_serving/preprocess_ops.py": "Preprocessing Functions in CPP Serving", + "/deploy/cpp_serving/preprocess_ops.py:1-34": "Image Processing Composition", + "/deploy/cpp_serving/preprocess_ops.py:113-126": "Model-Based Preprocess Function", + "/deploy/cpp_serving/preprocess_ops.py:35-76": "Video Preprocessing Function", + "/deploy/cpp_serving/preprocess_ops.py:77-111": "Video Preprocessing Function", + "/deploy/cpp_serving/readme.md": "Deploy Paddle Serving with Docker", + "/deploy/cpp_serving/readme.md:1-32": "Deploy Paddle Serving with Docker", + "/deploy/cpp_serving/readme.md:119-158": "Deploy C++ Serving Server", + "/deploy/cpp_serving/readme.md:160-164": "Disable Proxies Before Starting Service", + "/deploy/cpp_serving/readme.md:34-64": "Speed Up PaddleServing Installation and Deployment", + "/deploy/cpp_serving/readme.md:65-81": "PaddleVideo Deployment Guide", + "/deploy/cpp_serving/readme.md:82-118": "Rename Alias to 'outputs' for Fetch Variable", + "/deploy/cpp_serving/readme_en.md": "Accelerated Docker PaddleServing Deployment", + "/deploy/cpp_serving/readme_en.md:1-17": "PaddleServing Docker Installation Guide", + "/deploy/cpp_serving/readme_en.md:123-152": "C++ PaddleVideo Serving Setup", + "/deploy/cpp_serving/readme_en.md:152-165": "Proxy Settings in Cpp Serving Deployment", + "/deploy/cpp_serving/readme_en.md:18-41": "Install Docker Container for PaddlePaddle Serving", + "/deploy/cpp_serving/readme_en.md:42-65": "Speed Up PaddleServing Deployment with Action Recognition", + "/deploy/cpp_serving/readme_en.md:66-79": "Directory and Model Specification for PaddleVideo Inference", + "/deploy/cpp_serving/readme_en.md:79-94": "Update Model Files and Configs", + "/deploy/cpp_serving/readme_en.md:96-122": "Compatibility Rename Function for Model Deployment", + "/deploy/cpp_serving/run_cpp_serving.sh": "Deploy PaddleVideo Server with PP-TSM/TSN", + "/deploy/cpp_serving/serving_client.py": "PaddleServing and PaddleVideo Integration", + "/deploy/cpp_serving/serving_client.py:1-32": "Postprocess 
Paddle Serving Predictions", + "/deploy/cpp_serving/serving_client.py:33-62": "CPP Serving Client Function", + "/deploy/cpp_serving/serving_client.py:63-95": "Video Prediction Client in Python", + "/deploy/paddle2onnx/predict_onnx.py": "Paddle2ONNX Video Detection", + "/deploy/paddle2onnx/predict_onnx.py:1-31": "PaddleVideo Inference Environment Setup", + "/deploy/paddle2onnx/predict_onnx.py:123-153": "Batch Video Inference with Paddle2Onnx Predictor", + "/deploy/paddle2onnx/predict_onnx.py:154-171": "Benchmarked Predict: Autolog and Postprocess", + "/deploy/paddle2onnx/predict_onnx.py:32-54": "Parse ONNX Prediction Parameters", + "/deploy/paddle2onnx/predict_onnx.py:57-92": "Onnx Predictor Creation and Inference", + "/deploy/paddle2onnx/predict_onnx.py:94-122": "Building ONNX Inference Helper", + "/deploy/paddle2onnx/readme.md": "Paddle to ONNX Conversion for Inference", + "/deploy/paddle2onnx/readme.md:1-48": "Paddle2ONNX Model Conversion", + "/deploy/paddle2onnx/readme.md:49-70": "ONNX-Paddle Inference Parity", + "/deploy/paddle2onnx/readme_en.md": "Deploy Paddle2ONNX for PP-TSN Prediction", + "/deploy/paddle2onnx/readme_en.md:1-28": "Paddle2ONNX Model Conversion", + "/deploy/paddle2onnx/readme_en.md:29-61": "Paddle2ONNX: Model Conversion and Prediction", + "/deploy/paddle2onnx/readme_en.md:62-70": "Generate Output for Video File using PaddleVideo", + "/deploy/python_serving/pipeline_http_client.py": "Video Model Serving Pipeline with HTTP Client", + "/deploy/python_serving/pipeline_http_client.py:1-30": "Python PaddleVideo Serving Client", + "/deploy/python_serving/pipeline_http_client.py:31-62": "Video HTTP Client", + "/deploy/python_serving/pipeline_http_client.py:63-70": "POST Request with JSON Data in Python", + "/deploy/python_serving/pipeline_rpc_client.py": "PaddleVideo Model Web Serving", + "/deploy/python_serving/pipeline_rpc_client.py:1-29": "Handling PaddleVideo Model Execution", + "/deploy/python_serving/pipeline_rpc_client.py:30-60": "Command Line RPC Client for Video Processing", + "/deploy/python_serving/pipeline_rpc_client.py:61-68": "Video Prediction with PaddleVideo Client", + "/deploy/python_serving/readme.md": "Deploy PaddlePaddle Model for Serving", + "/deploy/python_serving/readme.md:1-32": "Deploy PaddleServing Model on Linux", + "/deploy/python_serving/readme.md:105-152": "PaddleVideo Deployment: Input-Output Config", + "/deploy/python_serving/readme.md:154-185": "RPC-based PaddleVideo Prediction", + "/deploy/python_serving/readme.md:33-58": "Install PaddlePaddle for CPU and GPU", + "/deploy/python_serving/readme.md:59-83": "Converting PaddlePaddle Model for Server Deployment", + "/deploy/python_serving/readme.md:84-103": "Configure PP-TSM Model Transformation Parameters", + "/deploy/python_serving/readme_en.md": "Deploying PaddleServing for Deep Learning via HTTP", + "/deploy/python_serving/readme_en.md:1-16": "Deploying Deep Learning Model with PaddleServing", + "/deploy/python_serving/readme_en.md:120-145": "Start PaddleVideo Service with Python", + "/deploy/python_serving/readme_en.md:146-175": "Python Web Service for Model Prediction", + "/deploy/python_serving/readme_en.md:17-41": "Install PaddleServing for CPU and GPU", + "/deploy/python_serving/readme_en.md:175-185": "Closing Proxy, Starting Service", + "/deploy/python_serving/readme_en.md:42-63": "Deploy Behavior Recognition Service with PaddleServing", + "/deploy/python_serving/readme_en.md:64-83": "Model Conversion for Server Deployment", + "/deploy/python_serving/readme_en.md:83-94": "PP-TSM 
Inference Model Conversion and Serving", + "/deploy/python_serving/readme_en.md:95-119": "Config File Alias Name Modification for Model Compatibility", + "/deploy/python_serving/recognition_web_service.py": "PaddleVideo Web Service Setup", + "/deploy/python_serving/recognition_web_service.py:1-28": "Building Image Recognition Web Service Base", + "/deploy/python_serving/recognition_web_service.py:103-125": "Decode and Reshape Frames Data", + "/deploy/python_serving/recognition_web_service.py:126-149": "Image Preprocessing and Post-Processing Methods", + "/deploy/python_serving/recognition_web_service.py:150-182": "Video Web Service Input Parser", + "/deploy/python_serving/recognition_web_service.py:183-208": "Command-Line Parsing for PaddleVideo Service", + "/deploy/python_serving/recognition_web_service.py:29-62": "Preprocessing Function for Recognition Models", + "/deploy/python_serving/recognition_web_service.py:63-102": "Video Processing Class in Recognition Web Service", + "/deploy/python_serving/utils.py": "Video to Base64 Conversion Utils", + "/deploy/python_serving/utils.py:1-37": "Video and Numpy Array Conversion Utilities", + "/deploy/python_serving/utils.py:39-78": "Video Frames Parser", + "/deploy/python_serving/utils.py:79-81": "Joining File Paths from List", + "/deploy/slim/quant_post_static.py": "Quantized Model for GPU Efficiency", + "/deploy/slim/quant_post_static.py:1-32": "Python Licensing and Libraries", + "/deploy/slim/quant_post_static.py:117-120": "Post-Training Quantization Function", + "/deploy/slim/quant_post_static.py:33-63": "Post-Training Quantization Function", + "/deploy/slim/quant_post_static.py:65-84": "Dynamic Dataset Loading for Quantization", + "/deploy/slim/quant_post_static.py:86-114": "Post-Training Quantization with Static Graph", + "/deploy/slim/readme.md": "Model Compression with PaddleSlim", + "/deploy/slim/readme.md:2-44": "PaddleSlim: Model Compression for PaddleVideo", + "/deploy/slim/readme.md:46-91": "Offline Quantization in PaddleVideo", + "/deploy/slim/readme.md:93-133": "PaddleVideo Quantized Model Deployment", + "/deploy/slim/readme_en.md": "Efficient Model Compression for PaddleVideo", + "/deploy/slim/readme_en.md:1-9": "Efficient PaddleVideo Model Compression with PaddleSlim", + "/deploy/slim/readme_en.md:10-30": "PaddleSlim: Model Compression Tools", + "/deploy/slim/readme_en.md:112-132": "Model Pruning and Deployment with PaddleLite", + "/deploy/slim/readme_en.md:31-64": "Installing PaddleSlim, Model Preparation & Offline Quantization", + "/deploy/slim/readme_en.md:64-87": "Offline Quantization in PaddleVideo", + "/deploy/slim/readme_en.md:87-111": "Deploying PP-TSM Model for Prediction", + "/english_documents/benchmark.md": "PaddleVideo: Benchmarking Speed and Action Segmentation", + "/english_documents/benchmark.md:1-27": "PaddleVideo Speed Benchmark", + "/english_documents/benchmark.md:29-45": "PaddleVideo Model Comparison", + "/english_documents/benchmark.md:47-64": "Sequential Action Segmentation Model Comparison", + "/english_documents/benchmark.md:64-68": "PaddleVideo Benchmarking: Test Time & Parameters", + "/english_documents/benchmark.md:68-69": "Reasoning Model Tested on GPU with Batch Size 2", + "/english_documents/dataset/AVA.md": "AVA Dataset Preparation Process", + "/english_documents/dataset/AVA.md:1-23": "AVA Dataset Preparation Process", + "/english_documents/dataset/AVA.md:113-113": "Video Frame Count Calculator", + "/english_documents/dataset/AVA.md:26-78": "Preparing AVA Dataset for Action Recognition", + 
"/english_documents/dataset/AVA.md:79-112": "Folder Structure for AVA Dataset in PaddleVideo", + "/english_documents/dataset/ActivityNet.md": "ActivityNet Dataset Preparation", + "/english_documents/dataset/ActivityNet.md:1-24": "ActivityNet: Large-Scale Video Dataset for Understanding", + "/english_documents/dataset/ActivityNet.md:25-45": "ActivityNet Dataset Video Feature Extraction", + "/english_documents/dataset/ActivityNet.md:46-77": "ActivityNet Annotations Structure", + "/english_documents/dataset/ActivityNet.md:78-80": "Update Configuration Paths", + "/english_documents/dataset/Oxford_RobotCar.md": "Oxford-RobotCar Dataset Preparation", + "/english_documents/dataset/Oxford_RobotCar.md:1-24": "Oxford-RobotCar Dataset Preparation", + "/english_documents/dataset/Oxford_RobotCar.md:101-114": "Dynamic Frame Filtering and Timestamp Renaming", + "/english_documents/dataset/Oxford_RobotCar.md:115-137": "RobotCar Dataset with CycleGAN", + "/english_documents/dataset/Oxford_RobotCar.md:137-150": "Oxford-RobotCar Dataset Structure", + "/english_documents/dataset/Oxford_RobotCar.md:151-162": "Directory Structure: Day/Night Training & Verification Images", + "/english_documents/dataset/Oxford_RobotCar.md:25-46": "BibTeX Citations for Datasets", + "/english_documents/dataset/Oxford_RobotCar.md:47-64": "Oxford RobotCar Dataset Download", + "/english_documents/dataset/Oxford_RobotCar.md:65-80": "RobotCar Dataset Training Links", + "/english_documents/dataset/Oxford_RobotCar.md:81-98": "Oxford RobotCar Dataset File URLs", + "/english_documents/dataset/README.md": "Comprehensive Action Datasets Table", + "/english_documents/dataset/README.md:1-28": "Action Recognition Datasets Table", + "/english_documents/dataset/README.md:29-58": "Dataset Table: Skeleton, Depth, Text", + "/english_documents/dataset/README.md:58-73": "HTML Table of Multimodal Datasets with Publication Years", + "/english_documents/dataset/SegmentationDataset.md": "Video Action Segmentation Dataset", + "/english_documents/dataset/fsd.md": "Figure Skating Dataset Overview", + "/english_documents/dataset/fsd.md:1-26": "Figure Skating OpenPose Dataset", + "/english_documents/dataset/fsd.md:26-47": "Tensor Structure and Joint Points in Dataset", + "/english_documents/dataset/fsd.md:49-55": "Train Dataset Details", + "/english_documents/dataset/k400.md": "Kinetics-400 Dataset Download and Extraction", + "/english_documents/dataset/k400.md:1-27": "Kinetics-400 Dataset Download Options", + "/english_documents/dataset/k400.md:29-65": "Accelerating Network Training with Videos", + "/english_documents/dataset/k400.md:65-78": "Extracting K400 Video Frames", + "/english_documents/dataset/msrvtt.md": "MSR-VTT: Video Transformers Dataset", + "/english_documents/dataset/msrvtt.md:1-29": "MSR-VTT Dataset Overview", + "/english_documents/dataset/msrvtt.md:31-73": "ActBERT MSR-VTT Dataset Download", + "/english_documents/dataset/msrvtt.md:74-79": "Multi-Modal Transformer Database", + "/english_documents/dataset/ntu-rgbd.md": "NTU RGB+D Dataset Preparation for CTR-GCN", + "/english_documents/dataset/ntu-rgbd.md:1-23": "NTU-RGB+D Dataset Overview", + "/english_documents/dataset/ntu-rgbd.md:130-158": "NTU RGB+D Dataset Organization and Preprocessing", + "/english_documents/dataset/ntu-rgbd.md:23-59": "NTU-RGB-D Dataset Download and Unzipping", + "/english_documents/dataset/ntu-rgbd.md:60-93": "Preparing NTU-RGBD Dataset for CTR-GCN", + "/english_documents/dataset/ntu-rgbd.md:94-129": "NTU-RGBD Dataset Overview", + "/english_documents/dataset/ucf101.md": 
"UCF101 Dataset Organization", + "/english_documents/dataset/ucf101.md:1-40": "UCF101 Dataset Download and Extraction", + "/english_documents/dataset/ucf101.md:41-81": "UCF101 Dataset File Organization", + "/english_documents/dataset/ucf101.md:82-86": "UCF101: Video Categories and Clips", + "/english_documents/dataset/ucf24.md": "UCF24 Dataset Preparation Guide", + "/english_documents/dataset/ucf24.md:1-20": "UCF24 Dataset Preparation with PaddleVideo", + "/english_documents/dataset/ucf24.md:22-60": "UCF24 Dataset Preparation with PaddleVideo", + "/english_documents/dataset/ucf24.md:61-73": "UCF101 Dataset File Structure", + "/english_documents/dataset/youtube8m.md": "YouTube-8M: Massive Video Classification Dataset", + "/english_documents/dataset/youtube8m.md:1-20": "Large-scale Video Classification Data Set", + "/english_documents/dataset/youtube8m.md:21-44": "Prepare Dataset for PaddlePaddle", + "/english_documents/dataset/youtube8m.md:45-56": "Pkl File Splitting and List Generation", + "/english_documents/install.md": "PaddlePaddle & PaddleVideo Installation Guide", + "/english_documents/install.md:1-41": "PaddlePaddle GPU Installation Guide", + "/english_documents/install.md:42-72": "Install and Configure PaddleVideo", + "/english_documents/model_zoo/README.md": "Model Zoo: Action Recognition and Segmentation Models", + "/english_documents/model_zoo/README.md:1-26": "Action Recognition Model Zoo", + "/english_documents/model_zoo/README.md:101-106": "Empty HTML Table Cell or Row", + "/english_documents/model_zoo/README.md:27-61": "AI Model Zoo: Action Recognition & Segmentation Models", + "/english_documents/model_zoo/README.md:62-100": "PaddleVideo Model Zoo Table", + "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md": "SlowFast_FasterRCNN Action Detection Tutorial", + "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:1-24": "SlowFast_FasterRCNN: Video Action Detection Model", + "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:127-129": "GPU Acceleration, No TensorRT", + "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:26-64": "AVA Dataset Video Processing Guide", + "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:65-90": "Training and Testing SlowFast Faster RCNN on AVA Dataset", + "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md:93-126": "Action Detection with SlowFast+FasterRCNN", + "/english_documents/model_zoo/estimation/adds.md": "ADDS-DepthNet: Estimating Depth with Day & Night Images", + "/english_documents/model_zoo/estimation/adds.md:1-23": "ADDS-DepthNet: Self-Supervised Monocular Depth Estimation", + "/english_documents/model_zoo/estimation/adds.md:107-124": "Predicting Depth Maps with PaddlePaddle's ADDS Model", + "/english_documents/model_zoo/estimation/adds.md:126-133": "Self-supervised Monocular Depth Estimation", + "/english_documents/model_zoo/estimation/adds.md:26-49": "Adding Pre-Trained Model to Oxford RobotCar Dataset", + "/english_documents/model_zoo/estimation/adds.md:50-72": "Train and Test ADDS-DepthNet with Oxford RobotCar Dataset", + "/english_documents/model_zoo/estimation/adds.md:74-90": "ADDS Model Testing on RobotCar Dataset", + "/english_documents/model_zoo/estimation/adds.md:92-107": "Model Performance Comparison Table", + "/english_documents/model_zoo/localization/bmn.md": "BMN Model for Action Proposal Generation", + "/english_documents/model_zoo/localization/bmn.md:1-35": "BMN Model: Training and Evaluation Modules", + 
"/english_documents/model_zoo/localization/bmn.md:36-68": "BMN Localization Model Deployment with PaddlePaddle", + "/english_documents/model_zoo/localization/bmn.md:70-96": "BMN Model Inference: Export and Predict", + "/english_documents/model_zoo/localization/bmn.md:97-104": "BMN: Temporal Action Proposal Inference Results", + "/english_documents/model_zoo/localization/yowo.md": "YOWO: Efficient Feature Extraction Model", + "/english_documents/model_zoo/localization/yowo.md:1-36": "YOWO: Spatio-Temporal Feature Extraction for Localization", + "/english_documents/model_zoo/localization/yowo.md:102-122": "Generate and Predict YOWO Model", + "/english_documents/model_zoo/localization/yowo.md:124-138": "YOWO Model Predicts HorseRiding with 0.8 Confidence", + "/english_documents/model_zoo/localization/yowo.md:36-60": "YOWO Localizer: Download and Train with PaddleVideo", + "/english_documents/model_zoo/localization/yowo.md:61-80": "Faster AMP Mixed-Precision Training for Yowo", + "/english_documents/model_zoo/localization/yowo.md:80-101": "Evaluating YOWO Model Performance and Exporting Inference Model", + "/english_documents/model_zoo/multimodal/actbert.md": "ActBERT: Multimodal Pretrain Task for Video-Language Tasks", + "/english_documents/model_zoo/multimodal/actbert.md:1-25": "Introducing ActBERT: Multimodal Pretrain Task for Video-Language Tasks", + "/english_documents/model_zoo/multimodal/actbert.md:26-65": "Training ActBERT on HowTo100M Dataset", + "/english_documents/model_zoo/multimodal/actbert.md:66-98": "Training ActBERT with AMP and MSR-VTT", + "/english_documents/model_zoo/partition/transnetv2.md": "TransNetV2: Deep Learning Shot Transition Detection", + "/english_documents/model_zoo/partition/transnetv2.md:1-28": "TransNetV2: Video Segmentation with DDCNN V2", + "/english_documents/model_zoo/partition/transnetv2.md:28-62": "TransNetV2 Inference Model Guide", + "/english_documents/model_zoo/partition/transnetv2.md:64-80": "TransNetV2 Prediction Demo", + "/english_documents/model_zoo/recognition/agcn.md": "AGCN: Enhanced Video Recognition via Multi-Stream Graph Convs", + "/english_documents/model_zoo/recognition/agcn.md:1-46": "Adaptive Graph Convolution Network (AGCN) Implementation", + "/english_documents/model_zoo/recognition/agcn.md:118-129": "Multi-Stream Adaptive Graph Convolutional Network for Action Recognition", + "/english_documents/model_zoo/recognition/agcn.md:49-84": "AGCN Test Scripts and Results", + "/english_documents/model_zoo/recognition/agcn.md:87-117": "AGCN Video Recognition Model Usage", + "/english_documents/model_zoo/recognition/agcn2s.md": "AGCN2s: Enhanced Motion Recognition Model", + "/english_documents/model_zoo/recognition/agcn2s.md:1-20": "Introducing 2s-AGCN for Bone Motion Recognition", + "/english_documents/model_zoo/recognition/agcn2s.md:104-112": "AGCN2S Model Prediction Engine", + "/english_documents/model_zoo/recognition/agcn2s.md:20-40": "AGCN2S: Skeleton-based Gesture Recognition Network", + "/english_documents/model_zoo/recognition/agcn2s.md:41-71": "2s-AGCN Test Scripts & Results", + "/english_documents/model_zoo/recognition/agcn2s.md:73-79": "AGCN-2s Model Checkpoints", + "/english_documents/model_zoo/recognition/agcn2s.md:81-103": "Exporting AGCN2s for Action Recognition", + "/english_documents/model_zoo/recognition/attention_lstm.md": "AttentionLSTM Model for YouTube-8M Classification", + "/english_documents/model_zoo/recognition/attention_lstm.md:1-19": "Attention-LSTM for Video Recognition", + 
"/english_documents/model_zoo/recognition/attention_lstm.md:21-45": "Attention LSTM on Youtube-8M with 8 GPUs", + "/english_documents/model_zoo/recognition/attention_lstm.md:47-68": "Export and Use AttentionLSTM Model", + "/english_documents/model_zoo/recognition/attention_lstm.md:69-84": "AttentionLSTM for Video Classification", + "/english_documents/model_zoo/recognition/ctrgcn.md": "Bone-Based Behavior Recognition with CTR-GCN", + "/english_documents/model_zoo/recognition/ctrgcn.md:1-39": "Bone-Based Behavior Recognition with CTR-GCN", + "/english_documents/model_zoo/recognition/ctrgcn.md:122-128": "Top-1 Action Recognition Scores", + "/english_documents/model_zoo/recognition/ctrgcn.md:41-74": "PaddlePaddle CTR-GCN Model: NTU Dataset Training & Testing", + "/english_documents/model_zoo/recognition/ctrgcn.md:74-90": "CTRGCN Model Performance on NTU-RGB+D Dataset", + "/english_documents/model_zoo/recognition/ctrgcn.md:93-121": "PaddleVideo's CTRGCN Model for Action Recognition", + "/english_documents/model_zoo/recognition/movinet.md": "PaddleVideo's Efficient MoViNet Model", + "/english_documents/model_zoo/recognition/movinet.md:1-40": "MoViNet: Efficient Video Reasoning Model", + "/english_documents/model_zoo/recognition/movinet.md:41-73": "MoViNet Testing and Inference Guide", + "/english_documents/model_zoo/recognition/movinet.md:74-91": "MoViNet Model Configuration", + "/english_documents/model_zoo/recognition/posec3d.md": "PoseC3D: Skeleton-Based Action Recognition on UCF101", + "/english_documents/model_zoo/recognition/posec3d.md:1-24": "PoseC3D: Skeleton-based Action Recognition", + "/english_documents/model_zoo/recognition/posec3d.md:24-39": "Training PoseC3D on UCF101 with Pre-trained Weights", + "/english_documents/model_zoo/recognition/posec3d.md:40-82": "PoseC3D Model Testing and Inference Guide" +} \ No newline at end of file diff --git a/docs/data/titles/7.json b/docs/data/titles/7.json new file mode 100644 index 000000000..82046e18b --- /dev/null +++ b/docs/data/titles/7.json @@ -0,0 +1,302 @@ +{ + "/english_documents/model_zoo/recognition/posec3d.md:83-100": "Inferring PoseC3D without TensorRT and GPU Acceleration", + "/english_documents/model_zoo/recognition/pp-timesformer.md": "Enhanced Video Recognition with PP-TimeSformer", + "/english_documents/model_zoo/recognition/pp-timesformer.md:1-29": "PP-TimeSformer: Video Classification Model", + "/english_documents/model_zoo/recognition/pp-timesformer.md:108-120": "Export PP-TimeSformer Model for Video Recognition", + "/english_documents/model_zoo/recognition/pp-timesformer.md:121-147": "PaddlePaddle's ppTimeSformer for Video Recognition", + "/english_documents/model_zoo/recognition/pp-timesformer.md:147-156": "PP-Timesformer for Video Classification", + "/english_documents/model_zoo/recognition/pp-timesformer.md:31-58": "Download and Prepare Data for Video Recognition", + "/english_documents/model_zoo/recognition/pp-timesformer.md:60-75": "Efficient Video Recognition with PaddlePaddle Timesformer", + "/english_documents/model_zoo/recognition/pp-timesformer.md:78-92": "PP-TimeSformer Test Accuracy", + "/english_documents/model_zoo/recognition/pp-timesformer.md:93-108": "Launching PaddleVideo with Vision Transformer and UniformCrop", + "/english_documents/model_zoo/recognition/pp-tsm.md": "Optimized PP-TSM for Action Recognition", + "/english_documents/model_zoo/recognition/pp-tsm.md:1-31": "Optimized PP-TSM for Action Recognition", + "/english_documents/model_zoo/recognition/pp-tsm.md:122-127": "Pre-trained PP-TSM Models", + 
"/english_documents/model_zoo/recognition/pp-tsm.md:129-159": "Export and Use PPTSM Model for Video Classification", + "/english_documents/model_zoo/recognition/pp-tsm.md:160-167": "Top1 Prediction: Archery", + "/english_documents/model_zoo/recognition/pp-tsm.md:31-64": "Training TSM Models on Kinetics and UCF", + "/english_documents/model_zoo/recognition/pp-tsm.md:64-90": "Training PP-TSM with Pretrained Model", + "/english_documents/model_zoo/recognition/pp-tsm.md:92-122": "PP-TSM: Kinetics-400 Training & Testing", + "/english_documents/model_zoo/recognition/pp-tsn.md": "PP-TSN: Enhanced TSN with Mixed-Precision", + "/english_documents/model_zoo/recognition/pp-tsn.md:1-30": "PP-TSN Model Documentation", + "/english_documents/model_zoo/recognition/pp-tsn.md:105-125": "PP-TSN Model Export and Inference", + "/english_documents/model_zoo/recognition/pp-tsn.md:126-146": "PP-TSN Video Recognition Inference", + "/english_documents/model_zoo/recognition/pp-tsn.md:33-61": "Training PP-TSN on Kinetics-400 with 8 GPUs", + "/english_documents/model_zoo/recognition/pp-tsn.md:63-81": "Accelerating PP-TSN Training with AMP", + "/english_documents/model_zoo/recognition/pp-tsn.md:81-95": "Distinct Testing Method for PP-TSN Model", + "/english_documents/model_zoo/recognition/pp-tsn.md:96-105": "PP-TSN Model Test Results on Kinetics-400", + "/english_documents/model_zoo/recognition/slowfast.md": "SlowFast Model: Multigrid Training for Video Recognition", + "/english_documents/model_zoo/recognition/slowfast.md:1-38": "SlowFast: Video Recognition Model Docs", + "/english_documents/model_zoo/recognition/slowfast.md:113-120": "SlowFast Networks for Video Recognition Code", + "/english_documents/model_zoo/recognition/slowfast.md:39-58": "Multigrid-Accelerated SlowFast Training", + "/english_documents/model_zoo/recognition/slowfast.md:61-79": "Testing SlowFast Model in PaddleVideo", + "/english_documents/model_zoo/recognition/slowfast.md:82-112": "SlowFast Model Export and Inference Guide", + "/english_documents/model_zoo/recognition/stgcn.md": "ST-GCN Action Recognition Model Training and Testing", + "/english_documents/model_zoo/recognition/stgcn.md:1-49": "Skeleton-based Action Recognition with ST-GCN", + "/english_documents/model_zoo/recognition/stgcn.md:116-129": "STGCN Recognition Model", + "/english_documents/model_zoo/recognition/stgcn.md:50-89": "Test ST-GCN Model on FSD and NTU-RGB+D Datasets", + "/english_documents/model_zoo/recognition/stgcn.md:90-115": "Export and Predict with STGCN Model", + "/english_documents/model_zoo/recognition/timesformer.md": "TimeSformer: Top Video Classifier", + "/english_documents/model_zoo/recognition/timesformer.md:1-26": "TimeSformer: Efficient Video Classification", + "/english_documents/model_zoo/recognition/timesformer.md:108-133": "TimeSformer Predicts Video Class", + "/english_documents/model_zoo/recognition/timesformer.md:133-137": "TimeSformer: Space-Time Attention for Video Recognition", + "/english_documents/model_zoo/recognition/timesformer.md:28-57": "Train Timesformer on Kinetics-400 with 8 GPUs", + "/english_documents/model_zoo/recognition/timesformer.md:58-72": "Training Timesformer on Multiple GPUs with AMP", + "/english_documents/model_zoo/recognition/timesformer.md:75-90": "Optimizing TimeSformer Testing", + "/english_documents/model_zoo/recognition/timesformer.md:93-107": "Export TimeSformer Inference Model", + "/english_documents/model_zoo/recognition/tokenshift_transformer.md": "TokenShift Transformer: Versatile Video Classifier", + 
"/english_documents/model_zoo/recognition/tokenshift_transformer.md:1-36": "Token Shift Vision Transformer", + "/english_documents/model_zoo/recognition/tokenshift_transformer.md:117-125": "Top-1 Prediction: Brushing Teeth, Confidence 0.99", + "/english_documents/model_zoo/recognition/tokenshift_transformer.md:36-63": "TokenShift Transformer: UCF-101 Training Guide", + "/english_documents/model_zoo/recognition/tokenshift_transformer.md:64-78": "Token Shift Transformer Training on UCF101 Dataset", + "/english_documents/model_zoo/recognition/tokenshift_transformer.md:78-93": "VisionTransformer Testing on UCF-101", + "/english_documents/model_zoo/recognition/tokenshift_transformer.md:93-116": "TokenShift Vision Transformer Inference Guide", + "/english_documents/model_zoo/recognition/tsm.md": "Training TSM: ResNet-50 PaddlePaddle AMP UCF-101 Kinetics-400", + "/english_documents/model_zoo/recognition/tsm.md:1-33": "TSM Video Understanding with ResNet-50", + "/english_documents/model_zoo/recognition/tsm.md:118-144": "TSM Model Training with PaddleVideo", + "/english_documents/model_zoo/recognition/tsm.md:145-166": "CUDNN Batch Normalization Testing Script", + "/english_documents/model_zoo/recognition/tsm.md:168-181": "TSM Models with ResNet50 and Sampling Methods", + "/english_documents/model_zoo/recognition/tsm.md:182-203": "TSM Model Inference with PaddlePaddle", + "/english_documents/model_zoo/recognition/tsm.md:203-221": "TSM Training Strategy: Momentum, L2 Decay", + "/english_documents/model_zoo/recognition/tsm.md:35-62": "Training TSM Model on Kinetics-400 with PaddleVideo", + "/english_documents/model_zoo/recognition/tsm.md:64-91": "Training TSM Model with PaddlePaddle and AMP", + "/english_documents/model_zoo/recognition/tsm.md:91-118": "TSM Model Training on UCF-101 Dataset", + "/english_documents/model_zoo/recognition/tsn.md": "TSN: 2D-CNN Video Classification with Sparse Sampling", + "/english_documents/model_zoo/recognition/tsn.md:1-20": "Global TSN for Video Classification", + "/english_documents/model_zoo/recognition/tsn.md:103-119": "Multi-Scale Random Cropping for Frame Enhancement", + "/english_documents/model_zoo/recognition/tsn.md:121-123": "TSN Implementation in PaddleVideo", + "/english_documents/model_zoo/recognition/tsn.md:21-48": "Training TSN on Kinetics-400 Dataset", + "/english_documents/model_zoo/recognition/tsn.md:49-65": "Start Training TSN Model with Kinetics-400 and 8 GPUs\"\n\"Test TSN Model in Test Mode: TenCrop vs. 
CenterCrop", + "/english_documents/model_zoo/recognition/tsn.md:66-81": "TSN Model Testing and Inference", + "/english_documents/model_zoo/recognition/tsn.md:82-103": "GPU-Accelerated TSN Model for Video Recognition", + "/english_documents/model_zoo/recognition/tsn_dali.md": "Accelerating TSN with DALI", + "/english_documents/model_zoo/recognition/tsn_dali.md:1-45": "Accelerating TSN Training with DALI", + "/english_documents/model_zoo/recognition/tsn_dali.md:45-82": "TSN-DALI Training with PaddleVideo", + "/english_documents/model_zoo/recognition/tsn_dali.md:84-98": "TSN Action Recognition with DALI", + "/english_documents/model_zoo/recognition/videoswin.md": "Swin-Transformer for Video Accuracy", + "/english_documents/model_zoo/recognition/videoswin.md:1-33": "Video-Swin Transformer Model Card", + "/english_documents/model_zoo/recognition/videoswin.md:119-131": "VideoSwin-Transformer for Prediction", + "/english_documents/model_zoo/recognition/videoswin.md:35-60": "Training VideoSwin on Kinetics400 with 8 GPUs", + "/english_documents/model_zoo/recognition/videoswin.md:60-75": "Faster Video-Swin-Transformer Training with Mixed Precision", + "/english_documents/model_zoo/recognition/videoswin.md:77-89": "Optimized Video-Swin-Transformer Testing: UniformCrop for Accuracy", + "/english_documents/model_zoo/recognition/videoswin.md:89-92": "Pre-Trained Swin-Transformer Checkpoints", + "/english_documents/model_zoo/recognition/videoswin.md:94-117": "Export and Predict in PaddleVideo", + "/english_documents/model_zoo/segmentation/asrf.md": "ASRF: Enhanced Video Segmentation with PaddlePaddle", + "/english_documents/model_zoo/segmentation/asrf.md:1-35": "ASRF: Enhanced Video Segmentation Model with PaddlePaddle", + "/english_documents/model_zoo/segmentation/asrf.md:102-131": "ASRF Model Inference with PaddleVideo Example", + "/english_documents/model_zoo/segmentation/asrf.md:132-139": "Write Inference Results to Separate Files", + "/english_documents/model_zoo/segmentation/asrf.md:37-55": "Training ASRF, Testing MS-TCN with Pre-trained Model", + "/english_documents/model_zoo/segmentation/asrf.md:57-80": "MS-TCN Accuracy and Edit Distance on Three Datasets", + "/english_documents/model_zoo/segmentation/asrf.md:81-100": "ASRF_gtea Model Weights & F1", + "/english_documents/model_zoo/segmentation/cfbi.md": "CFBI Video Object Segmentation Model", + "/english_documents/model_zoo/segmentation/cfbi.md:1-29": "CFBI: Foreground-Background Collaborative Segmentation", + "/english_documents/model_zoo/segmentation/cfbi.md:31-46": "Training and Evaluating CFBIp Segmentation Model on DAVIS Dataset", + "/english_documents/model_zoo/segmentation/mstcn.md": "MS-TCN Model Evaluation and Comparison", + "/english_documents/model_zoo/segmentation/mstcn.md:1-35": "Optimized MS-TCN for Precise Video Segmentation", + "/english_documents/model_zoo/segmentation/mstcn.md:109-130": "Configuring MSTCN Segmentation Model", + "/english_documents/model_zoo/segmentation/mstcn.md:36-52": "MSTCN Segmentation Training and Testing", + "/english_documents/model_zoo/segmentation/mstcn.md:54-78": "MSTCN vs Paper Model: Dataset Comparison", + "/english_documents/model_zoo/segmentation/mstcn.md:79-108": "Export and Use MSTCN Inference Model", + "/english_documents/quick_start.md": "PaddleVideo Quick Start Guide: Installation and Usage", + "/english_documents/quick_start.md:1-36": "Quick Start Guide: PaddleVideo Installation and Usage", + "/english_documents/quick_start.md:107-122": "PaddleVideo Model Parameters", + 
"/english_documents/quick_start.md:123-142": "Consistent Top-5 Classification Performance", + "/english_documents/quick_start.md:143-157": "PaddleVideo's Action Recognition Model", + "/english_documents/quick_start.md:38-76": "Install and Run PP-Video", + "/english_documents/quick_start.md:78-107": "Video Inference with PaddleVideo and PP-TSM_v2", + "/english_documents/tools.md": "PaddleVideo Tools Guide", + "/english_documents/tutorials/Action Recognition Datasets": "Action Recognition Datasets: A Comprehensive List", + "/english_documents/tutorials/Action Recognition Papers": "Top Action Recognition Papers", + "/english_documents/tutorials/Action Recognition Papers:1-16": "Top Action Recognition Papers for AI", + "/english_documents/tutorials/Action Recognition Papers:17-28": "Action Recognition Papers: State-of-the-Art Models", + "/english_documents/tutorials/Action Recognition Papers:29-29": "Trajectory-Pooled Deep Convolutional Descriptors for Action Recognition", + "/english_documents/tutorials/Spatio-Temporal Action Detection Papers": "Spatio-Temporal Action Detection Papers (2015-2017)", + "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:1-13": "Spatio-Temporal Action Detection Papers List", + "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:14-24": "Spatio-Temporal Action Detection Papers 2015-2017", + "/english_documents/tutorials/Spatio-Temporal Action Detection Papers:25-30": "Spatio-Temporal Action Detection Papers Overview", + "/english_documents/tutorials/TSM.md": "TSM: Video Understanding with Spatio-Temporal Balance", + "/english_documents/tutorials/TSM.md:1-5": "Introducing TSM: Efficient Video Understanding Model", + "/english_documents/tutorials/TSM.md:11-21": "TSM: Balancing 2D and 3D CNNs for Video Understanding", + "/english_documents/tutorials/TSM.md:22-27": "TSM: 2D Conv for Spatial-Temporal Info", + "/english_documents/tutorials/TSM.md:28-40": "TSM: Bi-Direction, UNI-Direction, and Residual Variants", + "/english_documents/tutorials/TSM.md:40-58": "Accelerating TSM Implementation in PaddlePaddle", + "/english_documents/tutorials/TSM.md:6-10": "Efficient Video Understanding with Temporal Shift Module", + "/english_documents/tutorials/TSM.md:60-73": "Implementing TSM in PaddlePaddle", + "/english_documents/tutorials/Temporal Action Detection Papers": "Temporal Action Detection Papers", + "/english_documents/tutorials/Temporal Action Detection Papers:1-12": "Temporal Action Detection Papers: A Comprehensive List", + "/english_documents/tutorials/Temporal Action Detection Papers:12-21": "Temporal Action Detection Papers Collection", + "/english_documents/tutorials/Temporal Action Detection Papers:22-24": "Temporal Action Detection Papers: A Comprehensive Guide", + "/english_documents/tutorials/accelerate.md": "Dual-Language Tutorial: Accelerate", + "/english_documents/tutorials/config.md": "Dependency Injection with PaddleVideo: Config-based Modularity", + "/english_documents/tutorials/config.md:1-37": "IOC/DI for Modular PaddleVideo", + "/english_documents/tutorials/config.md:118-131": "Command-Line Arguments for Training Script", + "/english_documents/tutorials/config.md:39-89": "DI with Register and Builder: Module Mapping Tutorial", + "/english_documents/tutorials/config.md:90-117": "Dependency Injection via Config-Driven Class Instantiation", + "/english_documents/tutorials/customized_usage.md": "Customizing PaddleVideo Framework Tutorial", + "/english_documents/tutorials/demos": "Multi-Task Action Recognition Demo", + 
"/english_documents/tutorials/deployment.md": "Converting Dygraph Models to Static for Deployment", + "/english_documents/tutorials/deployment.md:1-24": "Dynamic to Static: Deploying Dygraph Models with PaddleVideo", + "/english_documents/tutorials/deployment.md:24-48": "PaddleInference Video Inference Testing", + "/english_documents/tutorials/modular_design.md": "Bilingual Modular Design Tutorial", + "/english_documents/tutorials/pp-tsm.md": "Introducing PP-TSM: Enhanced Video Recognition Model", + "/english_documents/tutorials/pp-tsm.md:1-22": "High-Performance Video Recognition with PP-TSM", + "/english_documents/tutorials/pp-tsm.md:23-32": "Optimizing Model Performance with Enhanced Techniques", + "/english_documents/tutorials/summarize.md": "Video Action Recognition: Deep Learning Techniques", + "/english_documents/tutorials/summarize.md:1-18": "Action Recognition: Applications and Classification in Multiple Fields", + "/english_documents/tutorials/summarize.md:105-121": "Dataset Comparison: Mexaction2 and ActivityNet", + "/english_documents/tutorials/summarize.md:122-138": "Action Recognition with Manual and Deep Learning Methods", + "/english_documents/tutorials/summarize.md:139-154": "Deep Learning in Video Classification", + "/english_documents/tutorials/summarize.md:156-173": "ActivityNet Competition: Large-Scale Action Recognition from YouTube Videos", + "/english_documents/tutorials/summarize.md:174-193": "Action Recognition and Video Classification References", + "/english_documents/tutorials/summarize.md:18-32": "Multi-modal Video Classification Tasks", + "/english_documents/tutorials/summarize.md:194-206": "Video Recognition Paper References List", + "/english_documents/tutorials/summarize.md:33-49": "Temporal Action Classification and Dense Captioning", + "/english_documents/tutorials/summarize.md:51-72": "Popular Video Action Datasets: Overview and Challenges", + "/english_documents/tutorials/summarize.md:73-87": "HMDB51 vs Kinetics: Action Recognition Datasets", + "/english_documents/tutorials/summarize.md:88-104": "Kinetics: Action Recognition Benchmark", + "/english_documents/usage.md": "Efficient PaddleVideo Training on Linux", + "/english_documents/usage.md:1-28": "Setting Up PaddleVideo Environment", + "/english_documents/usage.md:134-174": "Distributed PaddleVideo Testing and Inference", + "/english_documents/usage.md:175-177": "Enabling/Disabling GPU in PaddleVideo", + "/english_documents/usage.md:29-71": "Train and Test Models with PaddlePaddle", + "/english_documents/usage.md:71-89": "Training and Validation Log Format", + "/english_documents/usage.md:91-132": "PaddleVideo: Resume, Finetune, Test Usage", + "/main.py": "Distributed PaddleVideo Training", + "/main.py:1-29": "Train PaddleVideo Model with Argparse", + "/main.py:120-141": "Command-Line Driven Model Training", + "/main.py:30-52": "Command Line Arguments for PaddleVideo", + "/main.py:53-84": "Command-Line AMP Training Customization", + "/main.py:87-118": "Configure and Parse Arguments for Paddle's Main Function", + "/paddlevideo/__init__.py": "Initializing PaddleVideo", + "/paddlevideo/loader/__init__.py": "PaddleVideo Dataset and Loader", + "/paddlevideo/loader/builder.py": "Distributed PaddleVideo Data Loader", + "/paddlevideo/loader/builder.py:1-29": "PaddleVideo Pipeline Builder", + "/paddlevideo/loader/builder.py:30-74": "Paddle Video Data Loader Builder", + "/paddlevideo/loader/builder.py:75-96": "Data Sampler for ML/DL Models", + "/paddlevideo/loader/builder.py:97-132": "Variable-Length Batch 
Data Loader", + "/paddlevideo/loader/dali_loader.py": "Dali Loader: Video Processing with PaddleOps", + "/paddlevideo/loader/dali_loader.py:1-32": "Dali Loader: Importing PaddlePaddle Iterator", + "/paddlevideo/loader/dali_loader.py:112-142": "DALI Video Iterator for PaddleVideo", + "/paddlevideo/loader/dali_loader.py:143-160": "Dali Video Loader Initialization", + "/paddlevideo/loader/dali_loader.py:161-176": "DALI Loader for Image Processing", + "/paddlevideo/loader/dali_loader.py:177-202": "DALI Loader Operations", + "/paddlevideo/loader/dali_loader.py:203-206": "Loader Methods and Length Determination", + "/paddlevideo/loader/dali_loader.py:35-65": "Dali Loader: Class Initialization and DALI Reader Building", + "/paddlevideo/loader/dali_loader.py:66-88": "Sharding Data Distribution Code Snippet", + "/paddlevideo/loader/dali_loader.py:89-111": "Dali Loader: Parallel Video Preprocessing", + "/paddlevideo/loader/dataset/MRI.py": "MRI Dataset Loader in PaddleVideo", + "/paddlevideo/loader/dataset/MRI.py:1-31": "PaddleVideo: MRI Dataset Loader", + "/paddlevideo/loader/dataset/MRI.py:109-109": "MRI Dataset Loader: Numpy Arrays from Results", + "/paddlevideo/loader/dataset/MRI.py:31-61": "MRI Dataset Initialization", + "/paddlevideo/loader/dataset/MRI.py:62-86": "Loader: Handling Missing MRI Files", + "/paddlevideo/loader/dataset/MRI.py:87-108": "Retry Loading Frames: Exception Handling in MRI Dataset", + "/paddlevideo/loader/dataset/MRI_SlowFast.py": "MRI SlowFast Dataset Loader", + "/paddlevideo/loader/dataset/MRI_SlowFast.py:1-31": "MRI_SlowFast Dataset Loader", + "/paddlevideo/loader/dataset/MRI_SlowFast.py:109-111": "MRI Dataset Loader", + "/paddlevideo/loader/dataset/MRI_SlowFast.py:31-61": "MRI Dataset Loader Class", + "/paddlevideo/loader/dataset/MRI_SlowFast.py:62-86": "Paddle Video: MRI Dataset Loading", + "/paddlevideo/loader/dataset/MRI_SlowFast.py:87-108": "Retry Loader with Error Logging", + "/paddlevideo/loader/dataset/__init__.py": "PaddleVideo Datasets: Load and Understand", + "/paddlevideo/loader/dataset/__init__.py:1-25": "PaddleVideo Dataset Importer", + "/paddlevideo/loader/dataset/__init__.py:26-41": "Importing and Exporting Datasets", + "/paddlevideo/loader/dataset/actbert_dataset.py": "ActBERT Dataset Setup in PaddlePaddle", + "/paddlevideo/loader/dataset/actbert_dataset.py:1-31": "Setting Up ActBERT Dataset in PaddleVideo", + "/paddlevideo/loader/dataset/actbert_dataset.py:32-66": "Class ActBertDataset Loader", + "/paddlevideo/loader/dataset/actbert_dataset.py:67-74": "ActBERT Dataset Preparation", + "/paddlevideo/loader/dataset/asrf_dataset.py": "ASRF Dataset Loader", + "/paddlevideo/loader/dataset/asrf_dataset.py:1-38": "ASRF Dataset: Action Segmentation Videos", + "/paddlevideo/loader/dataset/asrf_dataset.py:39-68": "ASRF Dataset Loader", + "/paddlevideo/loader/dataset/asrf_dataset.py:69-92": "PaddleVideo Dataset Loader", + "/paddlevideo/loader/dataset/asrf_dataset.py:94-104": "Boundary Data Loading and Processing", + "/paddlevideo/loader/dataset/ava_dataset.py": "AVA Dataset Class in PaddleVideo", + "/paddlevideo/loader/dataset/ava_dataset.py:1-32": "AVA Dataset Class in PaddleVideo", + "/paddlevideo/loader/dataset/ava_dataset.py:123-148": "AVA Dataset Loader", + "/paddlevideo/loader/dataset/ava_dataset.py:149-170": "AVA Dataset Video Processing", + "/paddlevideo/loader/dataset/ava_dataset.py:171-197": "Initialize and Append Video Information", + "/paddlevideo/loader/dataset/ava_dataset.py:198-221": "Filtering and Padding AVA Dataset Proposals", + 
"/paddlevideo/loader/dataset/ava_dataset.py:222-240": "Feature Padding in Ava Dataset", + "/paddlevideo/loader/dataset/ava_dataset.py:241-249": "AVA Dataset Preparation and Evaluation", + "/paddlevideo/loader/dataset/ava_dataset.py:33-62": "AVA Dataset Initialization and Validation", + "/paddlevideo/loader/dataset/ava_dataset.py:63-93": "AVA Dataset Initialization and Validation", + "/paddlevideo/loader/dataset/ava_dataset.py:94-122": "Excluding Mismatched Entity Boxes", + "/paddlevideo/loader/dataset/base.py": "BaseDataset: Loading and Preparing PaddlePaddle Data", + "/paddlevideo/loader/dataset/base.py:1-32": "BaseDataset: Python Class for PaddlePaddle Datasets", + "/paddlevideo/loader/dataset/base.py:34-59": "Initializing Base Dataset Class", + "/paddlevideo/loader/dataset/base.py:60-80": "Dataset Class for Paddle.io Video Loading", + "/paddlevideo/loader/dataset/bmn_dataset.py": "BMN Dataset Loader", + "/paddlevideo/loader/dataset/bmn_dataset.py:1-36": "BMNDataset: Action Localization Videos", + "/paddlevideo/loader/dataset/bmn_dataset.py:38-64": "Video Index Loading and Sorting in BMN Dataset", + "/paddlevideo/loader/dataset/bmn_dataset.py:65-72": "Prepare Test Data with BMN Dataset", + "/paddlevideo/loader/dataset/davis_dataset.py": "Davis Dataset for Video Segmentation", + "/paddlevideo/loader/dataset/davis_dataset.py:1-37": "PaddleVideo's VOS Dataset Processing", + "/paddlevideo/loader/dataset/davis_dataset.py:128-158": "Davis 2017 Dataset Initialization", + "/paddlevideo/loader/dataset/davis_dataset.py:159-182": "VOS Test Dataset Preparation", + "/paddlevideo/loader/dataset/davis_dataset.py:183-189": "Dataset Loading in PaddleVideo", + "/paddlevideo/loader/dataset/davis_dataset.py:38-66": "Davis Dataset Initialization", + "/paddlevideo/loader/dataset/davis_dataset.py:67-94": "DAVIS Dataset Image Loader", + "/paddlevideo/loader/dataset/davis_dataset.py:96-127": "Dataset Sample Generation", + "/paddlevideo/loader/dataset/feature.py": "FeatureDataset: PaddleVideo's Action Recognition Tool", + "/paddlevideo/loader/dataset/feature.py:1-36": "FeatureDataset: Action Recognition in PaddleVideo", + "/paddlevideo/loader/dataset/feature.py:38-63": "Video Dataset Loader Methods", + "/paddlevideo/loader/dataset/feature.py:64-80": "Preparing Test Data with Pipeline", + "/paddlevideo/loader/dataset/frame.py": "PaddleVideo: Efficient Video Datasets", + "/paddlevideo/loader/dataset/frame.py:1-31": "Frame Dataset Class in PaddleVideo", + "/paddlevideo/loader/dataset/frame.py:111-136": "FrameDataset for Sports Videos", + "/paddlevideo/loader/dataset/frame.py:137-158": "Frame Directory Data Processing", + "/paddlevideo/loader/dataset/frame.py:159-177": "Retry Corrupted Video Data Preparation", + "/paddlevideo/loader/dataset/frame.py:31-61": "Video Index Loader Class", + "/paddlevideo/loader/dataset/frame.py:62-86": "Frame Data Reader with Exception Handling", + "/paddlevideo/loader/dataset/frame.py:87-108": "Exception Handling for Loading Frames", + "/paddlevideo/loader/dataset/ms_tcn_dataset.py": "MS-TCN Dataset Loader", + "/paddlevideo/loader/dataset/ms_tcn_dataset.py:1-38": "MS-TCN Dataset Registration", + "/paddlevideo/loader/dataset/ms_tcn_dataset.py:39-68": "MS-Tcn Dataset Initialization", + "/paddlevideo/loader/dataset/ms_tcn_dataset.py:69-95": "Video Feature and Label Dataset Loader", + "/paddlevideo/loader/dataset/ms_tcn_dataset.py:97-110": "Video Dataset Label Loading Function", + "/paddlevideo/loader/dataset/msrvtt.py": "MSRVTT Dataset Preparation", + 
"/paddlevideo/loader/dataset/msrvtt.py:1-31": "Python Script: LMDB & PaddleLNPTok", + "/paddlevideo/loader/dataset/msrvtt.py:121-142": "Image Box Resizing and Feature Concatenation", + "/paddlevideo/loader/dataset/msrvtt.py:143-163": "MSRVTT Dataset Loading and Feature Extraction", + "/paddlevideo/loader/dataset/msrvtt.py:165-187": "Padding and Conversion for Dataset", + "/paddlevideo/loader/dataset/msrvtt.py:188-220": "Data Preparation in MSR-VTT Dataset", + "/paddlevideo/loader/dataset/msrvtt.py:32-67": "MSR-VTT Dataset Loader", + "/paddlevideo/loader/dataset/msrvtt.py:68-93": "Video Caption Tokenization with BertTokenizer", + "/paddlevideo/loader/dataset/msrvtt.py:94-120": "Video Data Processing and Loader", + "/paddlevideo/loader/dataset/oxford.py": "MonoDataset: PaddleVideo Oxford Dataset", + "/paddlevideo/loader/dataset/oxford.py:1-37": "Creating MonoDataset for PaddleVideo", + "/paddlevideo/loader/dataset/oxford.py:39-62": "Oxford Dataset Loader", + "/paddlevideo/loader/dataset/skeleton.py": "SkeletonDataset: Action Recognition Loader", + "/paddlevideo/loader/dataset/skeleton.py:1-34": "Skeleton Dataset: Action Recognition Loader", + "/paddlevideo/loader/dataset/skeleton.py:35-55": "Skeleton Data Loader Class", + "/paddlevideo/loader/dataset/skeleton.py:56-78": "Skeleton DataLoader", + "/paddlevideo/loader/dataset/slowfast_video.py": "SF Video Dataset: PaddleVideo's Action Recognition", + "/paddlevideo/loader/dataset/slowfast_video.py:1-31": "SlowFast Video Dataset", + "/paddlevideo/loader/dataset/slowfast_video.py:113-137": "Retry and Logging Video Loader", + "/paddlevideo/loader/dataset/slowfast_video.py:138-143": "Size of Dataset Calculator", + "/paddlevideo/loader/dataset/slowfast_video.py:32-64": "SlowFast Video Dataset", + "/paddlevideo/loader/dataset/slowfast_video.py:65-87": "Random Seed and Index Loading", + "/paddlevideo/loader/dataset/slowfast_video.py:88-112": "Resilient Video Dataset Processing", + "/paddlevideo/loader/dataset/ucf101_skeleton.py": "UCF101 Skeleton Dataset PaddleVideo Loader", + "/paddlevideo/loader/dataset/ucf101_skeleton.py:1-35": "UCF101 Skeleton Dataset Loader", + "/paddlevideo/loader/dataset/ucf101_skeleton.py:36-66": "UCF101 Skeleton Annotation Loader", + "/paddlevideo/loader/dataset/ucf101_skeleton.py:67-89": "UCf101 Skeleton Dataset Preparation", + "/paddlevideo/loader/dataset/ucf24_dataset.py": "Ucf24Dataset Class for PaddleVideo", + "/paddlevideo/loader/dataset/ucf24_dataset.py:1-30": "UCF24 Dataset Python Class", + "/paddlevideo/loader/dataset/ucf24_dataset.py:31-59": "Ucf24Dataset: Video Data Loader", + "/paddlevideo/loader/dataset/ucf24_dataset.py:60-76": "UCF24 Dataset Preparation and Conversion", + "/paddlevideo/loader/dataset/video.py": "Video Dataset Loader", + "/paddlevideo/loader/dataset/video.py:1-31": "VideoDataset: Loading and Transforming Raw Videos", + "/paddlevideo/loader/dataset/video.py:32-57": "Video Loader Class Initialization", + "/paddlevideo/loader/dataset/video.py:58-80": "Video Dataset Preparer", + "/paddlevideo/loader/dataset/video.py:81-95": "Robust Video Loading and Testing", + "/paddlevideo/loader/pipelines/__init__.py": "Video Processing Pipelines", + "/paddlevideo/loader/pipelines/__init__.py:1-20": "Pipeline Initialization in PaddleVideo", + "/paddlevideo/loader/pipelines/__init__.py:21-38": "PaddleVideo Pipelines", + "/paddlevideo/loader/pipelines/__init__.py:39-55": "Customizable PaddleVideo Pipelines", + "/paddlevideo/loader/pipelines/__init__.py:56-56": "Empty Code Alert", + 
"/paddlevideo/loader/pipelines/anet_pipeline.py": "PaddleVideo: IoU-based Feature Extraction", + "/paddlevideo/loader/pipelines/anet_pipeline.py:1-32": "PaddleVideo Feature Data Loader", + "/paddlevideo/loader/pipelines/anet_pipeline.py:116-140": "Intersection Over Union Calculation for Anchor Boxes", + "/paddlevideo/loader/pipelines/anet_pipeline.py:141-150": "Annotating IOU Maps", + "/paddlevideo/loader/pipelines/anet_pipeline.py:33-62": "Temporal Matching Windows Generator", + "/paddlevideo/loader/pipelines/anet_pipeline.py:63-90": "Anchors Intersection Calculator" +} \ No newline at end of file diff --git a/docs/data/titles/8.json b/docs/data/titles/8.json new file mode 100644 index 000000000..05edf4199 --- /dev/null +++ b/docs/data/titles/8.json @@ -0,0 +1,302 @@ +{ + "/paddlevideo/loader/pipelines/anet_pipeline.py:91-115": "Ground Truth Initialization in Video Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py": "Enhanced PaddleVideo Loader with Augmentation", + "/paddlevideo/loader/pipelines/augmentations.py:1-34": "Scaling Images with PaddleVideo Loader", + "/paddlevideo/loader/pipelines/augmentations.py:1008-1037": "10-Crop Image Class", + "/paddlevideo/loader/pipelines/augmentations.py:1038-1069": "UniformCrop Pipeline for Image Sampling", + "/paddlevideo/loader/pipelines/augmentations.py:1070-1099": "Image Offset Determination for Cropping", + "/paddlevideo/loader/pipelines/augmentations.py:109-138": "Image Augmentation Pipeline Defined", + "/paddlevideo/loader/pipelines/augmentations.py:1100-1127": "Image Augmentation Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:1128-1156": "Image Augmentation for PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations.py:1157-1193": "Color Jitter Augmentation", + "/paddlevideo/loader/pipelines/augmentations.py:1194-1227": "ColorFlipAugmenter", + "/paddlevideo/loader/pipelines/augmentations.py:1228-1257": "Image Augmentation Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:1258-1294": "YowoAug: Versatile Image Augmentation", + "/paddlevideo/loader/pipelines/augmentations.py:1295-1322": "Image Augmentation and Detection Functions", + "/paddlevideo/loader/pipelines/augmentations.py:1323-1353": "Resizing and Normalizing Bounding Boxes", + "/paddlevideo/loader/pipelines/augmentations.py:1354-1390": "Bounding Box Jitter Augmentation", + "/paddlevideo/loader/pipelines/augmentations.py:139-164": "Random Crop Augmentation in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations.py:1391-1419": "Image Augmentation and Label Manipulation", + "/paddlevideo/loader/pipelines/augmentations.py:1420-1427": "Image Resizing and Conversion Augmentation", + "/paddlevideo/loader/pipelines/augmentations.py:165-197": "Random Resizing and Cropping Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:198-219": "Random Crop Generator", + "/paddlevideo/loader/pipelines/augmentations.py:220-249": "Image Cropper for PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations.py:250-276": "Center Cropping Image Class", + "/paddlevideo/loader/pipelines/augmentations.py:277-297": "Center Crop Images in Augmentations", + "/paddlevideo/loader/pipelines/augmentations.py:298-325": "MultiScaleCrop: Flexible Image Augmentation", + "/paddlevideo/loader/pipelines/augmentations.py:326-360": "Multi-Scale Image Cropper", + "/paddlevideo/loader/pipelines/augmentations.py:35-61": "Resize Image Class", + "/paddlevideo/loader/pipelines/augmentations.py:361-384": "Random Crop with Grid Offsets", + 
"/paddlevideo/loader/pipelines/augmentations.py:385-405": "Random Cropping for Image Augmentation", + "/paddlevideo/loader/pipelines/augmentations.py:406-440": "Random Flip Image Augmentation Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:441-473": "Random Image Flipping and Brightness Adjustment", + "/paddlevideo/loader/pipelines/augmentations.py:474-508": "Random Image Augmentations in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations.py:509-546": "Random Saturation and Hue Pipeline Transforms", + "/paddlevideo/loader/pipelines/augmentations.py:547-577": "Gamma and Color Jitter Augmentation in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations.py:578-611": "Random Gamma Adjustment Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:612-638": "Image to NumpyArray Transpose Class", + "/paddlevideo/loader/pipelines/augmentations.py:62-88": "Resizing Images in PaddleVideo Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:639-665": "Normalization Class in PaddleVideo's Loader Pipelines", + "/paddlevideo/loader/pipelines/augmentations.py:667-693": "Image Normalization Class", + "/paddlevideo/loader/pipelines/augmentations.py:694-722": "Image Normalization and Scaling Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:724-749": "Jitter Resize Image Sequence Function", + "/paddlevideo/loader/pipelines/augmentations.py:750-774": "Resize Image Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:777-805": "MultiCenterCrop Class: Image Cropping Operations", + "/paddlevideo/loader/pipelines/augmentations.py:807-834": "Image Cropping Augmentations in PyAV", + "/paddlevideo/loader/pipelines/augmentations.py:835-865": "MultiCrop Pipeline for Paddle Tensor", + "/paddlevideo/loader/pipelines/augmentations.py:866-891": "Random Crop Augmentation Class", + "/paddlevideo/loader/pipelines/augmentations.py:89-108": "Resizing Image with Aspect Ratio Preservation", + "/paddlevideo/loader/pipelines/augmentations.py:892-914": "Image Size Check and Crop Generation", + "/paddlevideo/loader/pipelines/augmentations.py:915-944": "Crop Offsets Calculator", + "/paddlevideo/loader/pipelines/augmentations.py:946-975": "GroupFullResSample Pipeline", + "/paddlevideo/loader/pipelines/augmentations.py:977-1007": "Image Augmentation via Crops and Flips", + "/paddlevideo/loader/pipelines/augmentations_ava.py": "AVA Dataset Image Augmentation and Resizing in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations_ava.py:1-34": "AVA Image Augmentations in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations_ava.py:131-160": "Image Resizing Function for CV2 and Pillow", + "/paddlevideo/loader/pipelines/augmentations_ava.py:161-193": "EntityBoxCrop Scale Class", + "/paddlevideo/loader/pipelines/augmentations_ava.py:194-224": "Cropping Object Detection Proposals", + "/paddlevideo/loader/pipelines/augmentations_ava.py:225-249": "Horizontal Flipping of Entity Boxes", + "/paddlevideo/loader/pipelines/augmentations_ava.py:250-284": "Resizing Image Pipeline Augmentation", + "/paddlevideo/loader/pipelines/augmentations_ava.py:286-303": "Image Augmentation Function in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations_ava.py:304-334": "Initialize Resize Augmentation Object", + "/paddlevideo/loader/pipelines/augmentations_ava.py:335-363": "Image Resizing Augmentation", + "/paddlevideo/loader/pipelines/augmentations_ava.py:35-64": "Properly Initializing Lazy Operations", + "/paddlevideo/loader/pipelines/augmentations_ava.py:364-393": "Random 
Rescaling Image Augmentation", + "/paddlevideo/loader/pipelines/augmentations_ava.py:395-423": "Resize Augmentation Transform", + "/paddlevideo/loader/pipelines/augmentations_ava.py:424-455": "Rescale Augmentation for PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations_ava.py:456-487": "Image Resizing and Cropping in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations_ava.py:488-517": "Random Cropping Augmentation", + "/paddlevideo/loader/pipelines/augmentations_ava.py:519-544": "Crop Quadruple Adjustment", + "/paddlevideo/loader/pipelines/augmentations_ava.py:546-571": "Augmentations for Video Frames", + "/paddlevideo/loader/pipelines/augmentations_ava.py:572-609": "In-Place Flipping and Image Negation", + "/paddlevideo/loader/pipelines/augmentations_ava.py:610-631": "Flip Augmentation in PaddleVideo", + "/paddlevideo/loader/pipelines/augmentations_ava.py:632-660": "Flip Augmentation for Images", + "/paddlevideo/loader/pipelines/augmentations_ava.py:65-96": "AVA Image Augmentation Functions", + "/paddlevideo/loader/pipelines/augmentations_ava.py:661-693": "Image Augmentation Class", + "/paddlevideo/loader/pipelines/augmentations_ava.py:696-720": "Normalize Image Augmentation", + "/paddlevideo/loader/pipelines/augmentations_ava.py:722-749": "AVA Image Normalization Augmentation Pipeline", + "/paddlevideo/loader/pipelines/augmentations_ava.py:97-130": "Imresize: Scaling Images with Flexibility", + "/paddlevideo/loader/pipelines/compose.py": "Flexible Transformation Composition", + "/paddlevideo/loader/pipelines/decode.py": "TimeSformer Video Decoder Pipeline", + "/paddlevideo/loader/pipelines/decode.py:1-32": "Video Clip Processing Pipeline", + "/paddlevideo/loader/pipelines/decode.py:127-150": "Seek, Decode, Filter Frames", + "/paddlevideo/loader/pipelines/decode.py:151-177": "Decode and Sort Video Frames", + "/paddlevideo/loader/pipelines/decode.py:178-222": "Pipeline Classes for Data Decoding", + "/paddlevideo/loader/pipelines/decode.py:223-249": "Preparing Data for Model in PaddleVideo Loader", + "/paddlevideo/loader/pipelines/decode.py:250-275": "Video Feature Pad and Dequantize", + "/paddlevideo/loader/pipelines/decode.py:276-310": "ActionFeatureDecoder: Feature Decoding Class", + "/paddlevideo/loader/pipelines/decode.py:311-338": "Data Preprocessing for PaddlePaddle Video Pipeline", + "/paddlevideo/loader/pipelines/decode.py:33-69": "MP4 Decoder Class for Frame Extraction", + "/paddlevideo/loader/pipelines/decode.py:339-347": "Pad and Concatenate Feature Data", + "/paddlevideo/loader/pipelines/decode.py:70-98": "Video Decoder Pipeline", + "/paddlevideo/loader/pipelines/decode.py:99-125": "Video Duration Check and Decoding Indices", + "/paddlevideo/loader/pipelines/decode_image.py": "Decoding Images with PaddleVideo", + "/paddlevideo/loader/pipelines/decode_image.py:1-37": "PaddleVideo Image Decoder Pipeline", + "/paddlevideo/loader/pipelines/decode_image.py:117-149": "Image Decoding and Organization", + "/paddlevideo/loader/pipelines/decode_image.py:150-179": "Decode Image Pipeline: Setup and Side Detection", + "/paddlevideo/loader/pipelines/decode_image.py:180-206": "Pipeline for Decoding Images in PaddleVideo", + "/paddlevideo/loader/pipelines/decode_image.py:38-66": "Image Decoding Pipeline Class", + "/paddlevideo/loader/pipelines/decode_image.py:67-89": "Decode Image Pipeline Methods", + "/paddlevideo/loader/pipelines/decode_image.py:91-116": "Depth Image Resizer", + "/paddlevideo/loader/pipelines/decode_sampler.py": "Video Decoder Pipeline: Load, Decode, Clip", 
+ "/paddlevideo/loader/pipelines/decode_sampler.py:1-30": "Fast Decoding and Sampling with DecodeSampler", + "/paddlevideo/loader/pipelines/decode_sampler.py:31-55": "Video Frame Sampler Class Initialization", + "/paddlevideo/loader/pipelines/decode_sampler.py:57-81": "MP4 Decoder with Short Cycle Adjustment", + "/paddlevideo/loader/pipelines/decode_sampler.py:82-93": "Decode Image Frames Pipeline", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py": "MRI Frame Decoder and Sampler", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:1-36": "SFMRI Decoder and Sampler", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:116-135": "Sampling Indices Calculator", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:136-157": "Randomly Selecting Frames Offsets", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:158-180": "Frame Index Assignment in MRI Decode Pipeline", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:181-203": "Generate Offsets for TSM", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:204-224": "Calculate Segment Offsets for 's' and 'f' Frames", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:37-64": "MRI Frame Segmenter", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:65-94": "MRI Sampler Decode Pipeline", + "/paddlevideo/loader/pipelines/decode_sampler_MRI.py:95-115": "Video Sampling Frame Handler", + "/paddlevideo/loader/pipelines/mix.py": "VideoMix: Augmented Image Classification with Controlled Mixup and Cutmix", + "/paddlevideo/loader/pipelines/mix.py:1-34": "Mixup Class for PaddleVideo", + "/paddlevideo/loader/pipelines/mix.py:104-116": "Random Mixup or Cutmix for Paddle Video", + "/paddlevideo/loader/pipelines/mix.py:35-70": "Cutmix: A Mixup Operator for Images", + "/paddlevideo/loader/pipelines/mix.py:72-103": "CutMix and MixUp Video Data Augmentation", + "/paddlevideo/loader/pipelines/multimodal.py": "Multimodal PaddleVideo Pipeline Expansion", + "/paddlevideo/loader/pipelines/multimodal.py:1-35": "Feature Padding Class in PaddlePaddle Video Analysis", + "/paddlevideo/loader/pipelines/multimodal.py:114-151": "Random Labeling and Masking in Multimodal Pipeline", + "/paddlevideo/loader/pipelines/multimodal.py:152-175": "Multimodal Data Loader for TensorFlow", + "/paddlevideo/loader/pipelines/multimodal.py:176-201": "Masking Tokens for LM Tasks", + "/paddlevideo/loader/pipelines/multimodal.py:202-225": "Token Masking in PaddleVideo", + "/paddlevideo/loader/pipelines/multimodal.py:226-255": "Randomizing Functions in Multimodal Pipeline", + "/paddlevideo/loader/pipelines/multimodal.py:256-285": "Random Masking Function", + "/paddlevideo/loader/pipelines/multimodal.py:286-308": "Multimodal Pipeline: BERT-based Feature Concatenation", + "/paddlevideo/loader/pipelines/multimodal.py:309-333": "Preparing Input for Multimodal Pipeline", + "/paddlevideo/loader/pipelines/multimodal.py:334-359": "Zero-Padding Sequences to Max Length", + "/paddlevideo/loader/pipelines/multimodal.py:36-59": "Multimodal Paddle Video Loader", + "/paddlevideo/loader/pipelines/multimodal.py:360-380": "Multimodal Pipeline: Feature Extraction", + "/paddlevideo/loader/pipelines/multimodal.py:60-81": "Multimodal Data Preprocessing", + "/paddlevideo/loader/pipelines/multimodal.py:82-113": "Random Caption Selector Pipeline", + "/paddlevideo/loader/pipelines/sample.py": "Efficient Frame Sampling with PaddleVideo", + "/paddlevideo/loader/pipelines/sample.py:1-38": "Python PaddleVideo Image Processing Pipeline", + "/paddlevideo/loader/pipelines/sample.py:120-144": 
"Video Frame Sampler: Handles Dense and Non-Dense Scenarios", + "/paddlevideo/loader/pipelines/sample.py:145-171": "Clip Offsets Calculator", + "/paddlevideo/loader/pipelines/sample.py:172-199": "Video Frame Sampling Algorithm", + "/paddlevideo/loader/pipelines/sample.py:200-223": "Video Frame Sampler Algorithm", + "/paddlevideo/loader/pipelines/sample.py:224-245": "Sampling Position Determination Algorithm", + "/paddlevideo/loader/pipelines/sample.py:246-268": "Indexing Frames by Duration", + "/paddlevideo/loader/pipelines/sample.py:270-292": "Random Frame Sampler", + "/paddlevideo/loader/pipelines/sample.py:293-327": "Video Sampler Class for PyTorch", + "/paddlevideo/loader/pipelines/sample.py:328-358": "Video Image Sampler in Paddle Video Pipeline", + "/paddlevideo/loader/pipelines/sample.py:359-382": "Video Sample Indexing Algorithm", + "/paddlevideo/loader/pipelines/sample.py:39-66": "Sampler Frame Selection", + "/paddlevideo/loader/pipelines/sample.py:67-96": "Image Format Converter Class", + "/paddlevideo/loader/pipelines/sample.py:97-119": "Video Decoding Pipeline with Multiple Backends", + "/paddlevideo/loader/pipelines/sample_ava.py": "SampleAVA Pipeline for PaddleVideo", + "/paddlevideo/loader/pipelines/sample_ava.py:1-35": "PaddleVideo: Frame Sampler", + "/paddlevideo/loader/pipelines/sample_ava.py:108-129": "Wrap-Around Frame Index Handling", + "/paddlevideo/loader/pipelines/sample_ava.py:130-166": "Abstract Class for Storage Backends", + "/paddlevideo/loader/pipelines/sample_ava.py:167-190": "Registering Backends: SampleAVA Pipeline Code", + "/paddlevideo/loader/pipelines/sample_ava.py:191-225": "FileClient: Handling File Operations and Pillow Image Conversion", + "/paddlevideo/loader/pipelines/sample_ava.py:226-247": "Pillow Image to Numpy Array Converter", + "/paddlevideo/loader/pipelines/sample_ava.py:248-270": "Image Conversion to Numpy Array", + "/paddlevideo/loader/pipelines/sample_ava.py:271-301": "Pipeline for Decoding Frames in Sample_AVA.py", + "/paddlevideo/loader/pipelines/sample_ava.py:303-326": "Resizing and Scaling Pipeline", + "/paddlevideo/loader/pipelines/sample_ava.py:327-354": "SampleAVAFrames Class Overview", + "/paddlevideo/loader/pipelines/sample_ava.py:355-374": "AVA Sample Frame Indexer", + "/paddlevideo/loader/pipelines/sample_ava.py:36-61": "AVA Frame Sampling Initialization", + "/paddlevideo/loader/pipelines/sample_ava.py:62-82": "Clip Offset Calculator", + "/paddlevideo/loader/pipelines/sample_ava.py:83-107": "AVA Video Sampler", + "/paddlevideo/loader/pipelines/sample_ucf24.py": "UCF24 Frame Sampler", + "/paddlevideo/loader/pipelines/sample_ucf24.py:1-33": "Video Sampler Class: SamplerUCF24", + "/paddlevideo/loader/pipelines/sample_ucf24.py:34-65": "Video Clip Pipeline Creation", + "/paddlevideo/loader/pipelines/sample_ucf24.py:66-69": "Keyframe Indexer", + "/paddlevideo/loader/pipelines/segmentation.py": "PaddleVideo: Enhanced Segmentation and Transformation", + "/paddlevideo/loader/pipelines/segmentation.py:1-32": "PaddleVideo Segmentation Pipeline Code", + "/paddlevideo/loader/pipelines/segmentation.py:125-130": "Image Normalization and Transposition", + "/paddlevideo/loader/pipelines/segmentation.py:33-65": "Multi-Scale Image Segmentation Function", + "/paddlevideo/loader/pipelines/segmentation.py:67-92": "Image Resizing and Flipping Pipeline", + "/paddlevideo/loader/pipelines/segmentation.py:93-124": "MultiNorm Image Preprocessing", + "/paddlevideo/loader/pipelines/segmentation_pipline.py": "Segmentation Sampler Python Class", + 
"/paddlevideo/loader/pipelines/segmentation_pipline.py:1-35": "Segmentation Sampler Python Pipeline", + "/paddlevideo/loader/pipelines/segmentation_pipline.py:36-40": "Segmentation Pipeline Code Snippet", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py": "Skeleton Pipeline for PaddleVideo", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1-34": "Skeleton Pipeline Registration", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1007-1034": "Skeleton Pipeline Image and Keypoints Flipping", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:102-133": "Skeleton Pipeline Data Padding", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1036-1065": "Horizontal Flip Augmentation in PaddleVideo", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1066-1090": "Flip and Flip Labels in Skeleton Pipeline", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1091-1117": "SkeletonPipeline: Flip and Register", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1119-1144": "Image Data Formatting Class", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1145-1169": "SkeletonPipeline: Image Processing and Reshaping", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1170-1197": "Image Data Converter: NHWC/NCHW/NPTCHW with Collapse Option", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1198-1220": "Collect Pipeline Class", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1221-1238": "Default Image Data Loading Parameters", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1239-1275": "Skeleton Pipeline Class", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1276-1294": "GeneratePoseTarget: Heatmap Generator", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1295-1318": "Skeleton Pipeline Initialization", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1319-1349": "SkeletonPipeline Heatmap Generation", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:134-170": "Skeleton Pipeline Classes", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1350-1375": "Gaussian Kernel Patch Heatmap Generation", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1377-1398": "Pseudo Heatmap Generation", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1399-1427": "Keypoint Distance Calculator", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1429-1453": "Gaussian Kernel Heatmap Dominant Point Calculator", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1455-1483": "Generate Heatmaps for Keypoints and Limbs", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1484-1517": "Pseudo Heatmap Generator for Skeleton Sequences", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1518-1547": "Heatmap Generator from Keypoints", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:1548-1554": "Skeleton Pipeline Formatting", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:173-202": "Random Rotation Skeleton Class", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:204-229": "Random Rotation Applier", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:230-265": "Skeleton Pipeline Class for Cropping", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:266-286": "Randomly Cropped and Biased Skeleton Data Processing", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:287-316": "Skeleton Data Transformation for Video Analysis", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:317-344": "Skeleton Pipeline for PaddleVideo", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:345-372": "Training Clip Sampler", + 
"/paddlevideo/loader/pipelines/skeleton_pipeline.py:36-68": "Lazy Operation Initialization", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:373-401": "Skeleton Clip Index Determination", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:402-427": "Random Frame Index Selection", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:428-459": "Skeleton Pipeline Class for PaddleVideo", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:460-493": "PoseDecode Class Loads Pose Keypoints", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:494-528": "PoseCompact: Compact Keypoint Representation", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:529-546": "Expand Bounding Boxes in Skeleton Pipeline", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:548-585": "Skeleton Pipeline Class", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:587-611": "Bounding Box Adjustment for Skeleton Pipeline", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:613-640": "Crop-Based Skeleton Detection", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:641-669": "Cropping Bounding Boxes in Skeleton Pipeline", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:670-693": "Random Resized Crop Pipeline V2", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:69-101": "Skeleton Pipeline: Auto-Padding and Feature Extraction", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:694-719": "Initialize Class for Cropping Bounding Box", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:720-741": "Random Crop Bounding Box Generator", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:742-766": "Random Crop with Aspect Ratios", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:767-791": "Crop Quadruple Adjustment for Skeleton Pipeline", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:792-815": "Image Cropping and Keypoints Extraction", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:816-843": "Skeleton Pipeline for PaddleVideo", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:844-880": "CenterCrop_V2 Pipeline and is\\_seq\\_of Function", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:881-909": "CenterCrop Augmentation Class", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:910-936": "Update Image Shape and Crop Coordinates", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:937-957": "Handling Flip Operation in Skeleton Pipeline", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:958-984": "Flip_V2 Pipeline Registration and Functionality", + "/paddlevideo/loader/pipelines/skeleton_pipeline.py:985-1006": "SkeletonPipeline: Direction-Based Keypoint Flipping", + "/paddlevideo/loader/registry.py": "PaddleVideo Registry Definition", + "/paddlevideo/metrics/ActivityNet/__init__.py": "Public API Added for ANETproposal", + "/paddlevideo/metrics/ActivityNet/anet_prop.py": "ActivityNet Proposal Metrics", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:1-29": "ActivityNet Metrics: AR@N & AUC", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:103-130": "Proposal JSON Parser", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:132-158": "ActivityNet Proposal Evaluation Metrics", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:159-182": "ActivityNet Proposal Metric", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:183-202": "Average Recall Calculator", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:203-231": "Max Average Proposals Per Video", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:232-255": "Exception Handling for Proposals and Ground 
Truth", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:257-278": "Average Recall Calculator", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:279-298": "Threshold-Based True Positives Calculation", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:299-324": "Efficient Video Metrics Calculation", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:31-54": "Initializing Class with Files and Defaults", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:325-349": "TIOU Calculation Function", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:350-359": "IoU Calculation for Segment Intersection", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:55-77": "Blocked Video Metrics Script", + "/paddlevideo/metrics/ActivityNet/anet_prop.py:79-102": "Read Ground Truth JSON and Return DataFrame", + "/paddlevideo/metrics/__init__.py": "PaddleVideo Metrics Library", + "/paddlevideo/metrics/__init__.py:1-25": "Video Metric Suite", + "/paddlevideo/metrics/__init__.py:26-36": "Comprehensive Metric Import for PaddleVideo", + "/paddlevideo/metrics/ava_evaluation/README.md": "AVA Evaluation Metrics in PaddleVideo", + "/paddlevideo/metrics/ava_evaluation/metrics.py": "AVA Metrics Calculation", + "/paddlevideo/metrics/ava_evaluation/metrics.py:1-30": "Precision and Recall Metrics Function", + "/paddlevideo/metrics/ava_evaluation/metrics.py:112-137": "Average Precision and CorLoc Metric", + "/paddlevideo/metrics/ava_evaluation/metrics.py:138-143": "Average Class Precision", + "/paddlevideo/metrics/ava_evaluation/metrics.py:32-58": "AVA Evaluation Array Validation", + "/paddlevideo/metrics/ava_evaluation/metrics.py:59-88": "Average Precision and Recall Calculation", + "/paddlevideo/metrics/ava_evaluation/metrics.py:89-111": "Validate and Concatenate Precision-Recall Arrays", + "/paddlevideo/metrics/ava_evaluation/np_box_list.py": "Validating Bounding Box Coordinates", + "/paddlevideo/metrics/ava_evaluation/np_box_list.py:1-26": "BoxList Class for Bounding Boxes", + "/paddlevideo/metrics/ava_evaluation/np_box_list.py:118-138": "Valid Box Check", + "/paddlevideo/metrics/ava_evaluation/np_box_list.py:28-52": "Numpy Box List Class", + "/paddlevideo/metrics/ava_evaluation/np_box_list.py:54-81": "Box Collection Manager: Count, Retrieve, and Add Data", + "/paddlevideo/metrics/ava_evaluation/np_box_list.py:82-117": "Numpy Box List Class", + "/paddlevideo/metrics/ava_evaluation/np_box_ops.py": "Numpy Box Operations for IoU", + "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:1-29": "Bounding Box Operations for Numpy Arrays", + "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:31-57": "Box Area and Intersection Calculations", + "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:58-90": "Pairwise IOU/IOA Computation for Box Collections", + "/paddlevideo/metrics/ava_evaluation/np_box_ops.py:91-98": "Pairwise IoU Calculation for Bounding Boxes", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py": "AVA Object Detection Evaluation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:1-21": "PaddleVideo Object Detection Evaluation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:121-139": "Object Detection Evaluation Class", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:140-160": "Object Detection Evaluation Module", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:161-179": "Validating Groundtruth Image Addition", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:180-198": "Checking Existing Image in Object 
Detection Evaluation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:199-219": "Ground Truth Difficulty Check", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:22-58": "Object Detection Evaluator Class", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:220-236": "Single Image Detection Evaluation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:237-259": "Detection Class Retrieval and Mask Validation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:260-290": "AVA Object Detection Evaluation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:291-315": "Mean Average Precision Calculator", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:316-338": "Object Detection Metrics Calculation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:339-375": "PaddleVideo Object Detection Evaluation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:376-402": "Object Detection Evaluation Setup", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:404-430": "Object Detection Evaluation Functions", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:432-447": "AVA Object Detection Evaluation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:448-467": "Add Ground Truth Data to Database", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:468-493": "Single Image Evaluation Function", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:494-516": "Create Numpy Array from Detection Masks", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:518-536": "Ground Truth Init for Object Detection", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:537-561": "PaddleVideo: Object Detection Metrics Calculation", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:562-583": "Ground Truth Update for Object Detection", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:584-605": "Object Detection Evaluation Metrics", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:59-86": "AVA Object Detection Evaluation Metric", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:606-629": "AVA-Based Object Detection Evaluation" +} \ No newline at end of file diff --git a/docs/data/titles/9.json b/docs/data/titles/9.json new file mode 100644 index 000000000..ef664be36 --- /dev/null +++ b/docs/data/titles/9.json @@ -0,0 +1,302 @@ +{ + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:630-651": "Object Detection Evaluation Metrics", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:652-658": "Object Detection Metrics Calculator", + "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py:87-120": "Object Detection Evaluator: Ava Metrics", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py": "AVA Performance Metrics in PaddleVideo", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:1-20": "Single Image AVA Evaluation", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:116-143": "Per-Image AVA Evaluation", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:144-161": "AVA Metrics: Per-Image Evaluation", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:162-183": "Checking and Storing Masks for AVA Evaluation", + 
"/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:184-202": "Per-Class Array Extraction", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:203-228": "Per-Image AVA Evaluation", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:21-53": "Single Image Detection Metrics Evaluator", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:229-249": "Intersection Metrics: IoU and IoA", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:250-276": "Per-Image AVA Evaluation Labeling", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:277-295": "Difficult Box Evaluation", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:296-322": "True Positive Detection via IoU and Scores", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:323-344": "AVA Per-Image Evaluation", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:345-371": "Class-Specific Array Retriever", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:372-392": "AVA Metrics Function", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:393-421": "Class-Specific Results Extraction and Invalid Box Removal", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:422-443": "Filter and Slice Input Arrays", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:444-452": "Bounding Boxes and Scores", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:54-73": "AVA Evaluation: True Positives, False Positives, and Ignored Detections", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:74-91": "AVA Metric Calculation", + "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py:92-115": "Invalid Detection Box Removal for Object Evaluation", + "/paddlevideo/metrics/ava_evaluation/standard_fields.py": "Standard Fields for AVA Evaluation", + "/paddlevideo/metrics/ava_evaluation/standard_fields.py:1-26": "Standard Object Detection Fields", + "/paddlevideo/metrics/ava_evaluation/standard_fields.py:114-115": "Standard Fields: Detection Metrics", + "/paddlevideo/metrics/ava_evaluation/standard_fields.py:27-46": "Standard Fields for AVA Evaluation", + "/paddlevideo/metrics/ava_evaluation/standard_fields.py:47-66": "AVA Evaluation Dictionary Defined", + "/paddlevideo/metrics/ava_evaluation/standard_fields.py:67-86": "Standard AWA Field Definitions", + "/paddlevideo/metrics/ava_evaluation/standard_fields.py:89-113": "Standard Video Object Detector Naming Conventions", + "/paddlevideo/metrics/ava_metric.py": "AVAMetric: PaddleVideo's Metric for Video Object Detection", + "/paddlevideo/metrics/ava_metric.py:1-34": "AVAMetric: PaddleVideo Metric", + "/paddlevideo/metrics/ava_metric.py:35-60": "Video Metrics Initialization in PaddlePaddle", + "/paddlevideo/metrics/ava_metric.py:61-90": "AVA Metrics Calculation and Logging Class", + "/paddlevideo/metrics/ava_metric.py:92-92": "Class Method Returns Record List", + "/paddlevideo/metrics/ava_utils.py": "AVA Metrics Utilities for Video Object Detection", + "/paddlevideo/metrics/ava_utils.py:1-31": "AVA Metrics Evaluation Utilities", + "/paddlevideo/metrics/ava_utils.py:122-147": "CSV Object Detection Results Merger", + "/paddlevideo/metrics/ava_utils.py:149-181": "Excluding Images and Labelmap without Protocol Buffers", + "/paddlevideo/metrics/ava_utils.py:182-210": "Mean Average Precision for AVA Evaluation", + "/paddlevideo/metrics/ava_utils.py:211-240": "AVA Proposal Generation", + "/paddlevideo/metrics/ava_utils.py:241-265": "Average Recall and mAP 
Calculation", + "/paddlevideo/metrics/ava_utils.py:266-286": "Single Image Pascal Evaluator Addition", + "/paddlevideo/metrics/ava_utils.py:287-320": "AVA Metrics Code Snippet", + "/paddlevideo/metrics/ava_utils.py:32-64": "CSV Results Conversion Functions", + "/paddlevideo/metrics/ava_utils.py:323-357": "Collecting Results Across GPUs", + "/paddlevideo/metrics/ava_utils.py:358-384": "AVA Evaluation Utils", + "/paddlevideo/metrics/ava_utils.py:385-394": "Mean Average Precision Computation Code", + "/paddlevideo/metrics/ava_utils.py:66-97": "Utility Functions for Video Analysis", + "/paddlevideo/metrics/ava_utils.py:99-120": "CSV to Dictionaries: AVA Metrics", + "/paddlevideo/metrics/base.py": "PaddleVideo Metrics Base Class", + "/paddlevideo/metrics/base.py:1-31": "PaddleVideo Metrics Base Class Initialization", + "/paddlevideo/metrics/base.py:33-52": "All-Gather and Concatenation Function", + "/paddlevideo/metrics/bmn_metric.py": "BMN Metric for Paddle Video", + "/paddlevideo/metrics/bmn_metric.py:1-32": "Intersection over Union Calculation Code", + "/paddlevideo/metrics/bmn_metric.py:128-156": "Class Variables and Metrics Initialization", + "/paddlevideo/metrics/bmn_metric.py:157-182": "Boundary Detection Score Vector List Generation", + "/paddlevideo/metrics/bmn_metric.py:183-206": "Post-Process Video Metrics Calculation", + "/paddlevideo/metrics/bmn_metric.py:207-229": "Parallel Video Processing with bmn_post_Processing", + "/paddlevideo/metrics/bmn_metric.py:230-256": "Parallel Video Processing with Multiprocessing", + "/paddlevideo/metrics/bmn_metric.py:257-282": "Soft NMS Processing", + "/paddlevideo/metrics/bmn_metric.py:283-304": "Calculate Metrics with ANETproposal", + "/paddlevideo/metrics/bmn_metric.py:33-63": "Bounding Box Metrics Calculation", + "/paddlevideo/metrics/bmn_metric.py:65-98": "BMN Metric: Object Detection Algorithm", + "/paddlevideo/metrics/bmn_metric.py:99-127": "Initializing BMN Metric Class in PaddleVideo", + "/paddlevideo/metrics/build.py": "Building Metrics with Apache License", + "/paddlevideo/metrics/center_crop_metric.py": "Center Crop Metric: PaddleVideo's Batch-Aware Class", + "/paddlevideo/metrics/center_crop_metric.py:1-31": "CenterCrop Metric Registration", + "/paddlevideo/metrics/center_crop_metric.py:32-55": "Batch-Initializing Metric for Multi-GPU Data", + "/paddlevideo/metrics/center_crop_metric.py:56-79": "Batch Processing Metric", + "/paddlevideo/metrics/center_crop_metric_MRI.py": "Top-1/5 Accuracy Tracker", + "/paddlevideo/metrics/center_crop_metric_MRI.py:1-33": "CenterCropMetric_MRI: Video Metric Class", + "/paddlevideo/metrics/center_crop_metric_MRI.py:34-60": "Top-1/5 Accuracy Calculator", + "/paddlevideo/metrics/center_crop_metric_MRI.py:61-61": "Mean of Top-1 Accuracy", + "/paddlevideo/metrics/depth_metric.py": "DepthMetric: Distributed Batch Processing", + "/paddlevideo/metrics/depth_metric.py:1-34": "Depth Metric: Distributed Computing", + "/paddlevideo/metrics/depth_metric.py:35-57": "Distributed All-Reduce Metrics Averaging", + "/paddlevideo/metrics/depth_metric.py:58-77": "Batch Processing and Metric Accumulation", + "/paddlevideo/metrics/msrvtt_metric.py": "MSR-VTT Metrics Computation", + "/paddlevideo/metrics/msrvtt_metric.py:1-31": "MSRVTT Metric Initialization", + "/paddlevideo/metrics/msrvtt_metric.py:32-56": "MSR-VTT Rank Metrics Calculator", + "/paddlevideo/metrics/msrvtt_metric.py:57-62": "MSRVTT Metric Accumulator", + "/paddlevideo/metrics/multi_crop_metric.py": "Multi-Crop Metric in PaddleVideo", + 
"/paddlevideo/metrics/multi_crop_metric.py:1-35": "MultiCrop Metric: PaddleVideo Class", + "/paddlevideo/metrics/multi_crop_metric.py:105-108": "Multi-Crop Metric Average Accuracy Logging", + "/paddlevideo/metrics/multi_crop_metric.py:36-61": "Multi-Crop Metric Initialization", + "/paddlevideo/metrics/multi_crop_metric.py:62-83": "Multi-Crop Ensemble Metric", + "/paddlevideo/metrics/multi_crop_metric.py:84-104": "Multi-Crop Metric Calculation", + "/paddlevideo/metrics/recall.py": "Paddle Video Recall Metrics Calculation", + "/paddlevideo/metrics/recall.py:1-27": "PaddleRecall: Object Detection Recall Calculator", + "/paddlevideo/metrics/recall.py:29-62": "Precision-Recall Curve Calculation", + "/paddlevideo/metrics/recall.py:64-84": "Object Detection Recall Calculator", + "/paddlevideo/metrics/registry.py": "Registry-Based Metrics Management", + "/paddlevideo/metrics/segmentation_metric.py": "Label Change Detection Metric", + "/paddlevideo/metrics/segmentation_metric.py:1-35": "Segmentation Metric Function", + "/paddlevideo/metrics/segmentation_metric.py:128-161": "Segmentation Metric: Precision, Recall, F1", + "/paddlevideo/metrics/segmentation_metric.py:162-191": "Refining Object Detection Proposals", + "/paddlevideo/metrics/segmentation_metric.py:192-230": "Average Recall Calculation for Video Segmentation", + "/paddlevideo/metrics/segmentation_metric.py:231-264": "Segmentation Metric Initialization", + "/paddlevideo/metrics/segmentation_metric.py:265-295": "Accuracy Calculation via Segmentation", + "/paddlevideo/metrics/segmentation_metric.py:296-330": "Segmentation Metrics Accumulation", + "/paddlevideo/metrics/segmentation_metric.py:331-356": "Segmentation Metrics Calculator", + "/paddlevideo/metrics/segmentation_metric.py:358-385": "Segmentation Metric Calculator", + "/paddlevideo/metrics/segmentation_metric.py:36-57": "Segmentation Score Calculator", + "/paddlevideo/metrics/segmentation_metric.py:386-389": "Initialize Proposal Metrics List", + "/paddlevideo/metrics/segmentation_metric.py:58-91": "Segmentation Metric: Labeling and Distance Calculation", + "/paddlevideo/metrics/segmentation_metric.py:92-126": "Levenstein Distance for Video Segmentation", + "/paddlevideo/metrics/skeleton_metric.py": "SkeletonMetric: PaddleVideo's Skeleton-Based Metric Tool", + "/paddlevideo/metrics/skeleton_metric.py:1-38": "Skeleton Metric Calculator", + "/paddlevideo/metrics/skeleton_metric.py:39-65": "Metrics Tracking Class", + "/paddlevideo/metrics/skeleton_metric.py:66-88": "Accuracy Calculator", + "/paddlevideo/metrics/skeleton_metric.py:89-96": "Save Values to File: Skeleton Metric Logging", + "/paddlevideo/metrics/transnetv2_metric.py": "TransNetV2 Metric Calculator", + "/paddlevideo/metrics/transnetv2_metric.py:1-34": "Predictions to Scenes: Identifying Scene Changes", + "/paddlevideo/metrics/transnetv2_metric.py:121-152": "TransNetV2 Metric Calculation", + "/paddlevideo/metrics/transnetv2_metric.py:153-174": "Machine Learning Metric Calculator", + "/paddlevideo/metrics/transnetv2_metric.py:35-57": "Transnet V2 Metric Conversion", + "/paddlevideo/metrics/transnetv2_metric.py:58-80": "TransNet V2 Metric Calculation", + "/paddlevideo/metrics/transnetv2_metric.py:81-120": "Transnetv2 Metric Calculator", + "/paddlevideo/metrics/ucf24_utils.py": "UCF24 Metrics: PaddleVideo Utility Functions", + "/paddlevideo/metrics/ucf24_utils.py:1-33": "Average Precision Metrics in UCF101 Dataset", + "/paddlevideo/metrics/ucf24_utils.py:123-148": "Draw Text Box Around Rectangle", + 
"/paddlevideo/metrics/ucf24_utils.py:149-165": "Ucf24Metrics Constructor", + "/paddlevideo/metrics/ucf24_utils.py:166-181": "BoundingBox Class Definition", + "/paddlevideo/metrics/ucf24_utils.py:183-207": "Relative to Absolute Bounding Box Conversion", + "/paddlevideo/metrics/ucf24_utils.py:208-232": "Bounding Box Class for Image Formats", + "/paddlevideo/metrics/ucf24_utils.py:233-266": "Detection Result Class with Compare Method", + "/paddlevideo/metrics/ucf24_utils.py:268-294": "Bounding Box Comparison and Cloning", + "/paddlevideo/metrics/ucf24_utils.py:297-332": "Bounding Box Collection Class", + "/paddlevideo/metrics/ucf24_utils.py:333-359": "Bounding Box Utilities", + "/paddlevideo/metrics/ucf24_utils.py:34-81": "Bounding Box Converter Utilities", + "/paddlevideo/metrics/ucf24_utils.py:360-380": "Pascal VOC Metrics Calculation", + "/paddlevideo/metrics/ucf24_utils.py:381-397": "Class Metrics List", + "/paddlevideo/metrics/ucf24_utils.py:398-422": "Detection Metrics Initialization and Sorting", + "/paddlevideo/metrics/ucf24_utils.py:423-445": "Detection Metrics Calculation", + "/paddlevideo/metrics/ucf24_utils.py:446-465": "Precision, Recall, Average Precision Calculation", + "/paddlevideo/metrics/ucf24_utils.py:466-495": "Calculate Average Precision for Classes", + "/paddlevideo/metrics/ucf24_utils.py:496-523": "Interpolated Average Precision Calculation", + "/paddlevideo/metrics/ucf24_utils.py:524-553": "Calculating AP, AUC, and IoU in Video Metrics", + "/paddlevideo/metrics/ucf24_utils.py:554-583": "Bounding Box Intersection Utility", + "/paddlevideo/metrics/ucf24_utils.py:584-617": "Bounding Box Intersection and Union Calculation", + "/paddlevideo/metrics/ucf24_utils.py:618-648": "Validate Image Size and Coordinate Type Functions", + "/paddlevideo/metrics/ucf24_utils.py:649-680": "Bounding Box Reader Function", + "/paddlevideo/metrics/ucf24_utils.py:681-711": "Bounding Box Analyzer", + "/paddlevideo/metrics/ucf24_utils.py:712-743": "Mean Average Precision Calculator", + "/paddlevideo/metrics/ucf24_utils.py:744-772": "Average Precision Calculation", + "/paddlevideo/metrics/ucf24_utils.py:773-783": "Mean Average Precision Calculator", + "/paddlevideo/metrics/ucf24_utils.py:82-122": "Absolute Bounding Box Conversion and Visualization", + "/paddlevideo/metrics/vos_metric.py": "VOS Metric: Video Object Segmentation", + "/paddlevideo/metrics/vos_metric.py:1-38": "VOS Metric: PaddleVideo Segmentation", + "/paddlevideo/metrics/vos_metric.py:115-129": "Introducing New Labels in VOS Metric", + "/paddlevideo/metrics/vos_metric.py:131-147": "Average Max Prediction", + "/paddlevideo/metrics/vos_metric.py:149-168": "Frame-wise Mask Updating and Timing", + "/paddlevideo/metrics/vos_metric.py:169-191": "Average Time per Frame Calculator", + "/paddlevideo/metrics/vos_metric.py:192-209": "Flip and Save Mask Tensor", + "/paddlevideo/metrics/vos_metric.py:210-222": "Unknown Variable Range Identification", + "/paddlevideo/metrics/vos_metric.py:223-236": "Consecutive Integers Range", + "/paddlevideo/metrics/vos_metric.py:237-250": "Frame Metrics Analysis", + "/paddlevideo/metrics/vos_metric.py:251-272": "Masking and Saving Images in PaddleVideo Metrics", + "/paddlevideo/metrics/vos_metric.py:273-276": "Metrics Calculation Class Zip Savior", + "/paddlevideo/metrics/vos_metric.py:39-68": "VOS Metric Class Initialization", + "/paddlevideo/metrics/vos_metric.py:69-91": "Data Loading and Processing Loop", + "/paddlevideo/metrics/vos_metric.py:93-113": "Prepare Data for Video Object Detection Model", + 
"/paddlevideo/metrics/youtube8m/average_precision_calculator.py": "Average Precision Calculator for VOD", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:1-23": "Interpolated Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:109-134": "Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:136-166": "Non-Interpolated Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:168-192": "Non-Interpolated Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:193-220": "Non-Interpolated Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:221-256": "Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:25-55": "Average Precision Calculator for Long Lists", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:257-274": "Normalized Predictions for Average Precision", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:57-86": "Average Precision Calculator Class", + "/paddlevideo/metrics/youtube8m/average_precision_calculator.py:87-108": "Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/eval_util.py": "Paddlevideo Metrics Evaluation Utility", + "/paddlevideo/metrics/youtube8m/eval_util.py:1-29": "Eval Util for Model Metrics", + "/paddlevideo/metrics/youtube8m/eval_util.py:117-137": "Top-k Triplet Prediction Evaluation", + "/paddlevideo/metrics/youtube8m/eval_util.py:138-167": "Top-K Prediction Evaluation Metrics", + "/paddlevideo/metrics/youtube8m/eval_util.py:169-193": "HitOneMetric: Evaluating Metrics in Video Prediction Task", + "/paddlevideo/metrics/youtube8m/eval_util.py:194-205": "Calculating Gap in YouTube8m Evaluation", + "/paddlevideo/metrics/youtube8m/eval_util.py:32-60": "Video-level Annotation Precision", + "/paddlevideo/metrics/youtube8m/eval_util.py:61-90": "Video Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/eval_util.py:91-116": "Global Average Precision Calculation", + "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py": "Mean Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:1-27": "Mean Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:113-114": "Mean Average Precision Calculator", + "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:28-59": "Binary Classification Dataset Generation", + "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:60-80": "Mean Average Precision Calculator for Video Classification", + "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py:81-112": "Mean Average Precision Calculator", + "/paddlevideo/metrics/yowo_metric.py": "YOWO Metric Integration in PaddleVideo", + "/paddlevideo/metrics/yowo_metric.py:1-30": "YOWOMetric: PaddleVideo Metrics", + "/paddlevideo/metrics/yowo_metric.py:31-62": "BMN Metrics Initialization and Update", + "/paddlevideo/metrics/yowo_metric.py:63-82": "YOLOv5 Box Metrics Accumulator", + "/paddlevideo/modeling/__init__.py": "Video Recognition Modeling in PaddleVideo", + "/paddlevideo/modeling/__init__.py:1-22": "PaddleVideo Modeling Library", + "/paddlevideo/modeling/__init__.py:23-37": "Initializing PaddleVideo Models and Functions", + "/paddlevideo/modeling/assigners/__init__.py": "Importing MaxIoUAssignerAVA in PaddleVideo", + 
"/paddlevideo/modeling/assigners/max_iou_assigner_ava.py": "MaxIOUAssignerAVA: Assigning Results Efficiently", + "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:1-27": "Max IoU Assigner AVA Class", + "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:110-126": "Max IOU Assigner: AVA Dataset Handling", + "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:127-148": "Max IOU Assigner Implementation", + "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:28-49": "MaxIoUAssignerAVA Initialization", + "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:50-75": "Max IOU Assigner: Assigning GT Boxes", + "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:76-93": "Max IoU Assigner", + "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py:94-109": "Max IOU Assigner Algorithm", + "/paddlevideo/modeling/backbones/__init__.py": "Versatile Backbone Models in PaddleVideo", + "/paddlevideo/modeling/backbones/__init__.py:1-27": "Backbone Models Initialization in PaddleVideo", + "/paddlevideo/modeling/backbones/__init__.py:28-55": "PaddleVideo Backbone Models", + "/paddlevideo/modeling/backbones/__init__.py:56-60": "PaddleVideo Backbones List", + "/paddlevideo/modeling/backbones/actbert.py": "Multimodal BERT Embeddings for Video Action Recognition", + "/paddlevideo/modeling/backbones/actbert.py:1-32": "PaddlePaddle BertEmbeddings Class", + "/paddlevideo/modeling/backbones/actbert.py:101-123": "BertSelfAttention Class Definition", + "/paddlevideo/modeling/backbones/actbert.py:1019-1034": "ActBERT Input Parameters", + "/paddlevideo/modeling/backbones/actbert.py:1035-1047": "Fixed BertLayer Parameters", + "/paddlevideo/modeling/backbones/actbert.py:1048-1058": "Transformer Model Default Parameters", + "/paddlevideo/modeling/backbones/actbert.py:1059-1092": "Initialize ActBert Model", + "/paddlevideo/modeling/backbones/actbert.py:1093-1116": "ActBERT Model Initialization", + "/paddlevideo/modeling/backbones/actbert.py:1117-1137": "ActBERT Input Layout", + "/paddlevideo/modeling/backbones/actbert.py:1138-1158": "ActBERT Function: Multimodal Prediction and Sequence Relationship", + "/paddlevideo/modeling/backbones/actbert.py:124-144": "Multi-Head Attention in ACT-BERT", + "/paddlevideo/modeling/backbones/actbert.py:146-169": "BertSelfOutput Layer Implementation", + "/paddlevideo/modeling/backbones/actbert.py:170-192": "ActBert: Transformer Backbone Model", + "/paddlevideo/modeling/backbones/actbert.py:193-219": "Attention-Based Transformer with Dropout", + "/paddlevideo/modeling/backbones/actbert.py:220-246": "BertEntAttention: Vision Attention Class", + "/paddlevideo/modeling/backbones/actbert.py:247-267": "Self-Attention Layers in ACTBERT", + "/paddlevideo/modeling/backbones/actbert.py:268-299": "Attention Mechanism for Vision and Text", + "/paddlevideo/modeling/backbones/actbert.py:300-321": "Multi-Head Attention Operation in ActBERT", + "/paddlevideo/modeling/backbones/actbert.py:322-342": "Dropout-based Attention Scoring in ActBERT", + "/paddlevideo/modeling/backbones/actbert.py:33-52": "ActBERT Embeddings Initialization", + "/paddlevideo/modeling/backbones/actbert.py:343-361": "Multi-scale Context Fusion in ActBERT", + "/paddlevideo/modeling/backbones/actbert.py:363-381": "Cross-Attention in Transformers", + "/paddlevideo/modeling/backbones/actbert.py:382-409": "BertEntOutput: Layer Normalization and Dropout", + "/paddlevideo/modeling/backbones/actbert.py:410-440": "Attention-Based Bert Layer with Dropout", + "/paddlevideo/modeling/backbones/actbert.py:441-461": "Bert 
Layer and Connection Layer Classes", + "/paddlevideo/modeling/backbones/actbert.py:462-487": "BertConnectionLayer Initialization", + "/paddlevideo/modeling/backbones/actbert.py:488-512": "ActBERT Input Streams Model", + "/paddlevideo/modeling/backbones/actbert.py:513-539": "Compute Layer Outputs for ActBert Pathways", + "/paddlevideo/modeling/backbones/actbert.py:53-75": "ActBert: Video Action Recognition Backbone", + "/paddlevideo/modeling/backbones/actbert.py:540-576": "BertEncoder: Initializing BERT Encoder Parameters", + "/paddlevideo/modeling/backbones/actbert.py:577-594": "ACT Bert Layer Initialization", + "/paddlevideo/modeling/backbones/actbert.py:595-622": "ActBERT: Multimodal Model for Text, Vision, and Action Embeddings", + "/paddlevideo/modeling/backbones/actbert.py:623-645": "Initializing Encoder Layers in ActBERT Model", + "/paddlevideo/modeling/backbones/actbert.py:647-669": "Multi-Modal Embedding with Attention Probs", + "/paddlevideo/modeling/backbones/actbert.py:670-693": "ActBERT Encoder Layers", + "/paddlevideo/modeling/backbones/actbert.py:695-729": "ActBert Pooler Class and Model Initialization", + "/paddlevideo/modeling/backbones/actbert.py:730-759": "Customized Bert Model Initialization", + "/paddlevideo/modeling/backbones/actbert.py:76-100": "Bert Self-Attention and Embedding Layers", + "/paddlevideo/modeling/backbones/actbert.py:760-775": "ACTBERT: Multi-modal Action Model Initiation", + "/paddlevideo/modeling/backbones/actbert.py:776-800": "ActBERT Model: Encoding Text, Action, and Visual Features", + "/paddlevideo/modeling/backbones/actbert.py:801-819": "Mask Generation for ActBERT", + "/paddlevideo/modeling/backbones/actbert.py:820-838": "ACTBERT Extended Mask Creation", + "/paddlevideo/modeling/backbones/actbert.py:839-865": "Multimodal ACTBERT Backbone Encoding", + "/paddlevideo/modeling/backbones/actbert.py:866-892": "BERT Prediction Heads: Transform and Classes", + "/paddlevideo/modeling/backbones/actbert.py:893-909": "BertLMPredictionHead Initialization", + "/paddlevideo/modeling/backbones/actbert.py:910-937": "Attention and Feedforward in BERT", + "/paddlevideo/modeling/backbones/actbert.py:938-956": "BertPreTrainingHeads Class Initialization", + "/paddlevideo/modeling/backbones/actbert.py:957-986": "Multi-Modal ACT-BERT Model", + "/paddlevideo/modeling/backbones/actbert.py:987-1018": "Custom ACT-BERT Backbone Model for Multi-Modality", + "/paddlevideo/modeling/backbones/adds.py": "PaddleVideo: Enhanced Modeling Backbones", + "/paddlevideo/modeling/backbones/adds.py:1-30": "PaddlePaddle Backbones Registration", + "/paddlevideo/modeling/backbones/adds.py:1020-1044": "Handling Dict and Non-Dict Model Inputs", + "/paddlevideo/modeling/backbones/adds.py:1046-1074": "Night/Day Pose Prediction Function", + "/paddlevideo/modeling/backbones/adds.py:105-151": "Transformation Matrix Conversion", + "/paddlevideo/modeling/backbones/adds.py:1075-1096": "Calculates Camera Transformation Parameters", + "/paddlevideo/modeling/backbones/adds.py:1097-1122": "Depth Estimation with Displacement Interpolation", + "/paddlevideo/modeling/backbones/adds.py:1123-1142": "Grid Sampling and Masking for Night Scenes", + "/paddlevideo/modeling/backbones/adds.py:1143-1146": "Selecting Input Data from Dictionary", + "/paddlevideo/modeling/backbones/adds.py:152-188": "Rotation Operations on 3D Vectors", + "/paddlevideo/modeling/backbones/adds.py:189-231": "Efficient Disparity Smoothness Loss Calculation", + "/paddlevideo/modeling/backbones/adds.py:232-264": "ResNet Model with Multi-Input 
Images", + "/paddlevideo/modeling/backbones/adds.py:265-294": "ResNet Model Creation Code", + "/paddlevideo/modeling/backbones/adds.py:295-330": "Conv3x3 and Depth Backprojection", + "/paddlevideo/modeling/backbones/adds.py:31-67": "Depth Prediction and Feature Extraction Functions", + "/paddlevideo/modeling/backbones/adds.py:331-355": "PaddleVideo Backbone Parameter Initialization", + "/paddlevideo/modeling/backbones/adds.py:357-385": "Camera Projection in Project3D", + "/paddlevideo/modeling/backbones/adds.py:386-417": "SSIM Loss Calculator from Pixel Coords", + "/paddlevideo/modeling/backbones/adds.py:419-441": "Multi-Input ResNet Model in PaddleVideo", + "/paddlevideo/modeling/backbones/adds.py:442-466": "ConvBN Layer Initialization", + "/paddlevideo/modeling/backbones/adds.py:467-497": "ConvBN Layer Custom Class", + "/paddlevideo/modeling/backbones/adds.py:498-528": "BasicBlock Class Definition", + "/paddlevideo/modeling/backbones/adds.py:529-563": "ResNet V1.5 Bottleneck Layer Definition", + "/paddlevideo/modeling/backbones/adds.py:564-597": "Bottleneck Convolutional Neural Network", + "/paddlevideo/modeling/backbones/adds.py:599-631": "DepthDecoder Class Definition", + "/paddlevideo/modeling/backbones/adds.py:633-660": "Decoder Convolutional Network Architecture", + "/paddlevideo/modeling/backbones/adds.py:661-686": "Convolutional PoseDecoder Layer", + "/paddlevideo/modeling/backbones/adds.py:68-104": "Transpose Conv with BatchNorm and Activation", + "/paddlevideo/modeling/backbones/adds.py:688-725": "ResNet Encoder with Adds Convolution", + "/paddlevideo/modeling/backbones/adds.py:726-753": "ResNet Backbone Creation and Checks", + "/paddlevideo/modeling/backbones/adds.py:754-776": "Shared Encoders and Decoder Backbone", + "/paddlevideo/modeling/backbones/adds.py:777-797": "Convolutional Layers with Batch Normalization", + "/paddlevideo/modeling/backbones/adds.py:798-817": "Normalizing Image Input for Day Encoder", + "/paddlevideo/modeling/backbones/adds.py:818-834": "Day-Night Encoder Convolutions", + "/paddlevideo/modeling/backbones/adds.py:835-861": "Day-Night Model Features Extraction", + "/paddlevideo/modeling/backbones/adds.py:862-889": "Resnet Encoder Pypaddle Module", + "/paddlevideo/modeling/backbones/adds.py:890-917": "ResNet Backbone Model with Multi-Image Inputs", + "/paddlevideo/modeling/backbones/adds.py:918-949": "ADDS Depth Estimation Network", + "/paddlevideo/modeling/backbones/adds.py:950-972": "Model Initialization and Configuration", + "/paddlevideo/modeling/backbones/adds.py:973-996": "Backbone Model for Pose Estimation Init", + "/paddlevideo/modeling/backbones/adds.py:997-1019": "Day-Night Backbone Model with Depth Feature Extraction" +} \ No newline at end of file diff --git a/docs/doc/00423058-f625-4c51-ac8e-e6156ee8430a.json b/docs/doc/00423058-f625-4c51-ac8e-e6156ee8430a.json new file mode 100644 index 000000000..a17984254 --- /dev/null +++ b/docs/doc/00423058-f625-4c51-ac8e-e6156ee8430a.json @@ -0,0 +1,25 @@ +{ + "summary": "This code includes necessary headers for OpenCV and PaddlePaddle integration, defines operations like pre-processing, post-processing, and utility functions. The class creates a VideoRecognizer object with initialization variables, initializing the model and operation objects for inference steps.", + "details": [ + { + "comment": "This code is licensing information and includes necessary headers for OpenCV and PaddlePaddle API integration. 
It defines various operations such as pre-processing, post-processing, utility functions, and possibly some video recording functionality using the PaddlePaddle library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/video_rec.h\":0-33", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \"paddle_api.h\"\n#include \"paddle_inference_api.h\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include " + }, + { + "comment": "This class is for creating a VideoRecognizer object, which initializes variables such as the model directory, inference model name, use of GPU, number of segments, recording batch number, GPU ID, GPU memory, CPU math library threads, use of MKLDNN, label path, and optionally sets mean and scale values for image preprocessing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/video_rec.h\":35-56", + "content": "using namespace paddle_infer;\nnamespace PaddleVideo\n{\n class VideoRecognizer\n {\n public:\n explicit VideoRecognizer(const std::string &model_dir, const std::string &inference_model_name, const bool &use_gpu, const int &num_seg,\n const int &rec_batch_num, const int &gpu_id,\n const int &gpu_mem, const int &cpu_math_library_num_threads,\n const bool &use_mkldnn, const std::string &label_path,\n const bool &use_tensorrt, const std::string &precision, const std::vector &_mean = {0.406, 0.456, 0.485},\n const std::vector &_scale = {0.225, 0.224, 0.229})\n {\n this->inference_model_name = inference_model_name;\n this->use_gpu_ = use_gpu;\n this->num_seg = num_seg;\n this->rec_batch_num = rec_batch_num;\n this->gpu_id_ = gpu_id;\n this->gpu_mem_ = gpu_mem;\n this->cpu_math_library_num_threads_ = cpu_math_library_num_threads;" + }, + { + "comment": "This function initializes the video recognition class, sets member variables for model type (use_mkldnn_, use_tensorrt_), precision, mean values, scale values, and loads the label list from a given path. 
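The mean and scale defaults listed in this constructor are consumed by the Normalize pre-processing op further down. As an illustration only (not the C++ implementation), and assuming the usual is_scale behaviour of dividing by 255 before standardising, the step looks like this in NumPy:

```python
import numpy as np

# Constants as listed in the VideoRecognizer constructor (channel order as given there).
MEAN = np.array([0.406, 0.456, 0.485], dtype=np.float32)
SCALE = np.array([0.225, 0.224, 0.229], dtype=np.float32)

def normalize_frame(frame_hwc, is_scale=True):
    """Normalize one HxWx3 uint8 frame the way the Normalize op is configured above."""
    img = frame_hwc.astype(np.float32)
    if is_scale:              # mirrors the is_scale_ = true default in the header
        img /= 255.0
    return (img - MEAN) / SCALE

frame = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)  # dummy frame
out = normalize_frame(frame)
print(out.shape, out.dtype)   # (224, 224, 3) float32
```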
It also calls the LoadModel() function to load the inference model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/video_rec.h\":57-85", + "content": " this->use_mkldnn_ = use_mkldnn;\n this->use_tensorrt_ = use_tensorrt;\n this->precision_ = precision;\n this->mean_ = _mean;\n this->scale_ = _scale;\n this->label_list_ = Utility::ReadDict(label_path);\n LoadModel(model_dir);\n }\n // Load Paddle inference model\n void LoadModel(const std::string &model_dir);\n void Run(const std::vector &frames_batch_path, const std::vector > &frames_batch, std::vector *times);\n private:\n std::string inference_model_name;\n std::shared_ptr predictor_;\n bool use_gpu_ = false;\n int gpu_id_ = 0;\n int rec_batch_num = 1;\n int gpu_mem_ = 4000;\n int cpu_math_library_num_threads_ = 4;\n bool use_mkldnn_ = false;\n int num_seg = 8;\n std::vector label_list_;\n std::vector mean_ = {0.406, 0.456, 0.485};\n std::vector scale_ = {0.225, 0.224, 0.229};" + }, + { + "comment": "This code initializes various operation objects for pre-processing and post-processing steps in the VideoRecognizer class of PaddleVideo library. It also sets default values for scale, precision, and use_tensorrt.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/video_rec.h\":86-104", + "content": " bool is_scale_ = true;\n bool use_tensorrt_ = false;\n std::string precision_ = \"fp32\";\n // Instantiate pre-process operation object(s)\n Scale scale_op_;\n CenterCrop centercrop_op_;\n TenCrop tencrop_op_;\n Normalize normalize_op_;\n Permute permute_op_;\n // Instantiate post-process operation object(s)\n Softmax softmax_op_;\n }; // class VideoRecognizer\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/004a4436-34f9-4a92-b22d-5cf5767cc8d2.json b/docs/doc/004a4436-34f9-4a92-b22d-5cf5767cc8d2.json new file mode 100644 index 000000000..74c7aab53 --- /dev/null +++ b/docs/doc/004a4436-34f9-4a92-b22d-5cf5767cc8d2.json @@ -0,0 +1,35 @@ +{ + "summary": "This code initializes an LSTM model in PaddlePaddle for football action prediction and predicts sequences using pre-trained models, measuring time efficiency.", + "details": [ + { + "comment": "The code defines a class \"InferModel\" that uses PaddlePaddle's inference API to predict football actions. It initializes with a configuration file specifying the model, parameters, and other settings for inference. 
The class contains variables such as topk, frame_offset, nms_thread, cls_thread, iou_score_thread, and label_map_file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py\":0-35", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import get_action_result\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.topk = cfg[name]['topk']\n self.frame_offset = cfg[name]['nms_offset']\n self.nms_thread = cfg[name]['nms_thread']\n self.cls_thread = cfg[name]['classify_score_thread']\n self.iou_thread = cfg[name]['iou_score_thread']\n self.label_map_file = cfg['COMMON']['label_dic']" + }, + { + "comment": "This code initializes a LSTM model for video action detection. It sets the FPS, NMS ID, and configures GPU memory optimization. It also sets up input/output tensors for inference on the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py\":36-60", + "content": " self.fps = cfg['COMMON']['fps']\n self.nms_id = 5\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input1_tensor = self.predictor.get_input_handle(input_names[0])\n self.input2_tensor = self.predictor.get_input_handle(input_names[1])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None):\n \"\"\"infer\"\"\"\n self.input1_tensor.copy_from_cpu(input1_arr)\n self.input1_tensor.set_lod(input1_lod)" + }, + { + "comment": "This code defines a class with methods for pre-processing, predicting, and potentially post-processing data. The pre_process method takes in an input and converts it into a suitable format for the model. The predict method uses a reader to iterate over data and generates results. 
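The pre_process routine quoted a little further down flattens a batch of variable-length feature sequences into a single array plus a LoD offset list. Stripped of the class, the bookkeeping is roughly:

```python
import numpy as np

def flatten_with_lod(batch):
    """Concatenate variable-length sequences and record cumulative offsets (LoD)."""
    flat, lod = [], [0]
    for seq in batch:
        flat.extend(seq)               # append the rows of this sequence
        lod.append(lod[-1] + len(seq)) # running offset, one entry per sequence
    return np.array(flat, dtype=np.float32), [lod]

# Three "videos" with 2, 3 and 1 feature vectors of length 4 each.
batch = [np.ones((2, 4)), np.zeros((3, 4)), np.full((1, 4), 0.5)]
arr, lod = flatten_with_lod(batch)
print(arr.shape, lod)   # (6, 4) [[0, 2, 5, 6]]
```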
The LSTM model is run after processing the input, returning the outputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py\":61-90", + "content": " if not input2_arr is None:\n self.input2_tensor.copy_from_cpu(input2_arr)\n self.input2_tensor.set_lod(input2_lod)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n # print(output.shape)\n return output1, output2\n def pre_process(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n # print(input_arr.shape)\n # print([input_lod])\n return input_arr, [input_lod]\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n results = []\n for infer_iter, data in enumerate(infer_reader()):" + }, + { + "comment": "The code takes in data and preprocesses it into input1_arr, input1_lod, input2_arr, and input2_lod. It then runs an infer function on these inputs to get output1 and output2. The code then extracts predictions_id and predictions_iou from the outputs. It sorts topk_inds in reverse order and appends video_id, preds_id, topk_inds, and preds_iou to the results list. Finally, it calls get_action_result with the results, label_map_file, fps, cls_thread, and iou_thread as arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py\":91-109", + "content": " video_id = [[items[-2], items[-1]] for items in data]\n input1 = [items[0] for items in data]\n input2 = [items[1] for items in data]\n input1_arr, input1_lod = self.pre_process(input1)\n input2_arr, input2_lod = self.pre_process(input2)\n output1, output2 = self.infer(input1_arr, input1_lod, input2_arr, input2_lod)\n # output1, output2 = self.infer(input1_arr, input1_lod)\n predictions_id = output1 \n predictions_iou = output2\n for i in range(len(predictions_id)):\n topk_inds = predictions_id[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds_id = predictions_id[i][topk_inds]\n preds_iou = predictions_iou[i][0]\n results.append((video_id[i], preds_id.tolist(), topk_inds.tolist(), preds_iou.tolist()))\n predict_result = get_action_result(results, self.label_map_file, self.fps, \n self.cls_thread, self.iou_thread, " + }, + { + "comment": "The code initializes an InferModel object with a given configuration file. It then loads proposal data from 'EuroCup2016' and 'WorldCup2018' datasets, storing them in a dictionary. 
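The top-k selection in the predict loop quoted above keeps the k highest class scores by slicing an ascending argsort and reversing it; in isolation:

```python
import numpy as np

scores = np.array([0.05, 0.60, 0.10, 0.25])  # per-class probabilities for one sample
topk = 2
topk_inds = scores.argsort()[-topk:][::-1]   # indices of the k largest scores, best first
print(topk_inds, scores[topk_inds])          # [1 3] [0.6 0.25]
```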
The code also specifies the path for image frames and video features, which will be used for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py\":110-136", + "content": " self.nms_id, self.nms_thread, self.frame_offset)\n return predict_result\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n # proposal total\n prop_dict = {}\n for dataset in ['EuroCup2016', 'WorldCup2018']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n prop_dict[basename] = item['bmn_results']\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n # proposal\n basename = imgs_path.replace('frames', 'mp4') + '.mp4'" + }, + { + "comment": "The code predicts action sequences from video features using a pre-trained model, and saves the results in a JSON file. It measures and prints the time taken for prediction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py\":137-151", + "content": " bmn_results = prop_dict[basename]\n material = {'feature': video_features, 'proposal': bmn_results}\n t0 = time.time()\n outputs = model.predict(cfg, material)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n # print(outputs.shape)\n t1 = time.time()\n results = {'actions': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/00d2d52c-8283-48ca-bf88-f0f0a36b5b49.json b/docs/doc/00d2d52c-8283-48ca-bf88-f0f0a36b5b49.json new file mode 100644 index 000000000..144ebf961 --- /dev/null +++ b/docs/doc/00d2d52c-8283-48ca-bf88-f0f0a36b5b49.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports various reader classes and registers them using the regist_reader function. It sorts the registrations alphabetically. The BMNINFReader reads data from files with a \"BMN\" extension, while FeatureReader reads action data. No readers are registered for TSM or PPTSM in this version of the code.", + "details": [ + { + "comment": "This code imports various reader classes and registers them using the regist_reader function. It sorts the registrations alphabetically. The BMNINFReader reads data from files with a \"BMN\" extension, while FeatureReader reads action data. 
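regist_reader and get_reader are imported from reader_utils, whose implementation is not shown in this snippet. A minimal sketch of the register-then-look-up pattern this __init__ relies on (all names here are stand-ins for illustration):

```python
# Hypothetical stand-in for reader_utils: a plain dict keyed by model name.
_READER_MAP = {}

def regist_reader(name, reader_cls):
    """Register a reader class under an upper-case model name."""
    _READER_MAP[name.upper()] = reader_cls

def get_reader(name, *args, **kwargs):
    """Instantiate the reader registered for `name`."""
    return _READER_MAP[name.upper()](*args, **kwargs)

class FeatureReader:                     # placeholder for feature_reader.FeatureReader
    def __init__(self, mode="infer"):
        self.mode = mode

regist_reader("ACTION", FeatureReader)
print(type(get_reader("action")))        # <class '__main__.FeatureReader'>
```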
No readers are registered for TSM or PPTSM in this version of the code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/__init__.py\":0-14", + "content": "\"\"\"\nread map for model\n\"\"\"\nfrom reader.reader_utils import regist_reader, get_reader\n# import reader.tsminf_reader as tsminf_reader\n# import reader.audio_reader as audio_reader\nimport reader.bmninf_reader as bmninf_reader\nimport reader.feature_reader as feature_reader\n# regist reader, sort by alphabet\n# regist_reader(\"TSM\", tsminf_reader.TSMINFReader)\n# regist_reader(\"PPTSM\", tsminf_reader.TSMINFReader)\n# regist_reader(\"AUDIO\", audio_reader.AudioReader)\nregist_reader(\"BMN\", bmninf_reader.BMNINFReader)\nregist_reader(\"ACTION\", feature_reader.FeatureReader)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/00e9f015-dcba-409c-ac17-67ef217dbea0.json b/docs/doc/00e9f015-dcba-409c-ac17-67ef217dbea0.json new file mode 100644 index 000000000..368cf0399 --- /dev/null +++ b/docs/doc/00e9f015-dcba-409c-ac17-67ef217dbea0.json @@ -0,0 +1,35 @@ +{ + "summary": "The Python code initializes a ModelPredict class for basketball action detection using image, audio, and property features. It infers actions by extracting features and utilizing prepared models for classification before saving the output in 'results.json'.", + "details": [ + { + "comment": "The code is a Python file containing a class for performing basketball action detection using a combination of image, audio, and property features. It uses various models for feature extraction and classification. The `record_time_info` function is a decorator to log the processing time of different functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/action.py\":0-43", + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport functools\nimport numpy as np\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport mfcc.feature_extractor as mfcc_extractor\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nimport models.bmn_infer as prop_model\nimport models.lstm_infer as classify_model\nimport logger\nlogger = logger.Logger()\ndef record_time_info(func):\n \"\"\"decorator func to log cost time for func\n \"\"\"\n @functools.wraps(func)\n def timer(*args):\n \"\"\"log cost time for func\n \"\"\"\n logger.info(\"function [{}] processing ...\".format(func.__name__))\n start_time = time.time()\n retval = func(*args)\n cost_time = round(time.time() - start_time, 5)\n logger.info(\"function [{}] run time: {:.2f} min\".format(func.__name__, cost_time / 60))\n return retval\n return timer\nclass ActionDetection(object):" + }, + { + "comment": "The code initializes a ModelPredict class by parsing a configuration file, setting instance variables based on the configurations, and loading a model. The configurations include settings for debugging, whether to use only BMN (Basketball Motion Network), LSTM (Long Short-Term Memory) or PCM (Prediction of Coming Movement) models, and a dictionary of properties for specific datasets. 
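The record_time_info decorator quoted above logs when a function starts and how long it ran. A condensed, standalone version of the same pattern:

```python
import functools
import logging
import time

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("action_detect")

def record_time_info(func):
    """Log the wall-clock time a function takes, in minutes (same idea as above)."""
    @functools.wraps(func)
    def timer(*args, **kwargs):
        log.info("function [%s] processing ...", func.__name__)
        start = time.time()
        result = func(*args, **kwargs)
        log.info("function [%s] run time: %.2f min",
                 func.__name__, (time.time() - start) / 60)
        return result
    return timer

@record_time_info
def dummy_step():
    time.sleep(0.1)

dummy_step()
```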
The load_model method is decorated with @record_time_info, which suggests it records the time taken to execute this function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/action.py\":44-70", + "content": " \"\"\"ModelPredict\"\"\"\n def __init__(self, cfg_file=\"configs/configs.yaml\"):\n cfg = parse_config(cfg_file)\n self.configs = cfg\n print_configs(self.configs, \"Infer\")\n name = 'COMMON'\n self.DEBUG = cfg[name]['DEBUG']\n self.BMN_ONLY = cfg[name]['BMN_ONLY']\n self.LSTM_ONLY = cfg[name]['LSTM_ONLY']\n self.PCM_ONLY = cfg[name]['PCM_ONLY']\n if self.LSTM_ONLY:\n self.prop_dict = {}\n for dataset in ['EuroCup2016']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n self.prop_dict[basename] = item['bmn_results']\n @record_time_info\n def load_model(self):\n \"\"\"\n load_model\n \"\"\"" + }, + { + "comment": "Code creates InferModels for image, audio, and classification tasks depending on the configurations. The main function infers action by extracting features from images and audio, then classifying them using the prepared models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/action.py\":71-103", + "content": " if not self.DEBUG:\n self.image_model = image_model.InferModel(self.configs)\n if not self.PCM_ONLY:\n self.audio_model = audio_model.InferModel(self.configs)\n if not self.LSTM_ONLY:\n self.prop_model = prop_model.InferModel(self.configs)\n if not self.BMN_ONLY:\n self.classify_model = classify_model.InferModel(self.configs)\n logger.info(\"==> Action Detection prepared.\")\n @record_time_info\n def infer(self, imgs_path, pcm_path, fps=5):\n \"\"\"\n extract_feature\n \"\"\"\n print(\"imgs_path = \", imgs_path)\n self.imgs_path = imgs_path\n self.pcm_path = pcm_path\n self.configs['COMMON']['fps'] = fps\n logger.info(\"==> input video {}\".format(os.path.basename(self.imgs_path)))\n # step 1: extract feature\n video_features = self.extract_feature()\n # step2: get proposal\n bmn_results = self.extract_proposal(video_features)\n # step3: classify \n material = {'feature': video_features, 'proposal': bmn_results}" + }, + { + "comment": "This code defines classes for video feature extraction, proposal generation, and action detection. 
It utilizes model prediction with configured parameters and logs the shapes of results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/action.py\":104-132", + "content": " action_results = self.video_classify(material)\n return bmn_results, action_results\n @record_time_info\n def video_classify(self, material):\n \"\"\"video classify\"\"\"\n if self.BMN_ONLY:\n return []\n action_results = self.classify_model.predict(self.configs, material=material) \n logger.info('action shape {}'.format(np.array(action_results).shape))\n return action_results\n @record_time_info\n def extract_proposal(self, video_features):\n \"\"\"extract proposal\"\"\"\n if self.LSTM_ONLY:\n basename = self.imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = self.prop_dict[basename]\n return bmn_results\n bmn_results = self.prop_model.predict(self.configs, material=video_features)\n logger.info('proposal shape {}'.format(np.array(bmn_results).shape))\n return bmn_results\n @record_time_info\n def extract_feature(self):\n \"\"\"extract feature\"\"\"\n if not self.DEBUG:\n image_path_list = get_images(self.imgs_path)" + }, + { + "comment": "The code configures the model inputs, predicts image and audio features, and stores them in the video_features dictionary. If PCM_ONLY is true, it extracts pcm_features separately. Otherwise, it predicts audio_features along with image_features. If no features are available, it sets feature_path to the image path's corresponding features file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/action.py\":133-151", + "content": " self.configs['PPTSM']['frame_list'] = image_path_list\n self.configs['AUDIO']['pcm_file'] = self.pcm_path\n image_features = self.image_model.predict(self.configs)\n if self.PCM_ONLY:\n sample_rate = self.configs['AUDIO']['sample_rate']\n pcm_features = mfcc_extractor.extract_pcm(self.pcm_path, sample_rate)\n audio_features = []\n else:\n audio_features, pcm_features = self.audio_model.predict(self.configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n video_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features,\n 'pcm_feature': np_pcm_features}\n else:\n feature_path = self.imgs_path.replace(\"frames\", \"features\") + '.pkl'" + }, + { + "comment": "This code loads video features from file, checks the shape of image_feature, audio_feature, and pcm_feature arrays, and returns the video features. It then calls the ActionDetection model to infer on given image and audio paths, storing the results in bmn_results and action_results variables. 
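extract_feature, shown above, packs the image, audio and PCM features into one dictionary of float32 arrays before handing it to the proposal and classification models. Schematically (the shapes below are made up for the example):

```python
import numpy as np

def pack_video_features(image_features, audio_features, pcm_features):
    """Convert per-modality feature lists to float32 arrays, keyed as the models expect."""
    return {
        "image_feature": np.array(image_features, dtype=np.float32),
        "audio_feature": np.array(audio_features, dtype=np.float32),
        "pcm_feature": np.array(pcm_features, dtype=np.float32),
    }

features = pack_video_features(
    np.random.rand(10, 2048), np.random.rand(10, 128), np.random.rand(10, 640))
print({k: v.shape for k, v in features.items()})
```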
Finally, it saves these results in a 'results.json' file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/action.py\":152-173", + "content": " video_features = pickle.load(open(feature_path, 'rb'))\n logger.info(\"feature shape {} {} {}\".format(video_features['image_feature'].shape,\n video_features['audio_feature'].shape,\n video_features['pcm_feature'].shape))\n return video_features\nif __name__ == '__main__':\n model_predict = ActionDetection(cfg_file=\"../configs/configs.yaml\")\n model_predict.load_model()\n imgs_path = \"/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895\"\n pcm_path = \"/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm\"\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results = {'bmn_results': bmn_results, 'action_results': action_results}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/01afed3e-a390-4c93-bb7f-7e39d3402189.json b/docs/doc/01afed3e-a390-4c93-bb7f-7e39d3402189.json new file mode 100644 index 000000000..e88295ce0 --- /dev/null +++ b/docs/doc/01afed3e-a390-4c93-bb7f-7e39d3402189.json @@ -0,0 +1,10 @@ +{ + "summary": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler to log action detection information into \"action_detect.log\" in the 'logs' directory. The logging level is set to INFO, which will log informational messages and above (DEBUG, WARNING, ERROR, CRITICAL).", + "details": [ + { + "comment": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler to log action detection information into \"action_detect.log\" in the 'logs' directory. The logging level is set to INFO, which will log informational messages and above (DEBUG, WARNING, ERROR, CRITICAL).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/logger.py\":0-23", + "content": "\"\"\"\nlogger\n\"\"\"\nimport os\nimport logging\nclass Logger(logging.Logger):\n \"\"\"Customized logger for news stripper\n \"\"\"\n def __init__(self):\n super(Logger, self).__init__(self)\n if not os.path.exists('logs'):\n os.mkdir('logs')\n handler = logging.FileHandler(\"logs/action_detect.log\")\n # handler.setLevel(logging.DEBUG)\n handler.setLevel(logging.INFO)\n format = \"%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s\"\n datefmt = \"%y-%m-%d %H:%M:%S\"\n formatter = logging.Formatter(format, datefmt)\n handler.setFormatter(formatter)\n self.addHandler(handler)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/020ce968-d897-4a26-8839-6a0db38089c1.json b/docs/doc/020ce968-d897-4a26-8839-6a0db38089c1.json new file mode 100644 index 000000000..00463cd1a --- /dev/null +++ b/docs/doc/020ce968-d897-4a26-8839-6a0db38089c1.json @@ -0,0 +1,10 @@ +{ + "summary": "The code imports the ANETproposal class from the \"anet_prop\" module and adds it to the __all__ list, making it a public API in the package. 
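The customized Logger above attaches a FileHandler that writes INFO-and-above records to logs/action_detect.log. A condensed, self-contained stand-in that shows the same pattern and how callers use it:

```python
import logging
import os

class FileLogger(logging.Logger):
    """Minimal stand-in for the customized Logger described above."""
    def __init__(self, log_dir="logs", filename="action_detect.log"):
        super().__init__("action_detect")
        os.makedirs(log_dir, exist_ok=True)
        handler = logging.FileHandler(os.path.join(log_dir, filename))
        handler.setLevel(logging.INFO)
        fmt = "%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s"
        handler.setFormatter(logging.Formatter(fmt, "%y-%m-%d %H:%M:%S"))
        self.addHandler(handler)

log = FileLogger()
log.info("==> Action Detection prepared.")   # appended to logs/action_detect.log
```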
This allows other modules to import and use this class directly.", + "details": [ + { + "comment": "The code imports the ANETproposal class from the \"anet_prop\" module and adds it to the __all__ list, making it a public API in the package. This allows other modules to import and use this class directly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/__init__.py\":0-2", + "content": "from .anet_prop import ANETproposal\n__all__ = ['ANETproposal']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/03076d14-da10-47c4-8e24-22c47b2332ef.json b/docs/doc/03076d14-da10-47c4-8e24-22c47b2332ef.json new file mode 100644 index 000000000..fd2e8ba77 --- /dev/null +++ b/docs/doc/03076d14-da10-47c4-8e24-22c47b2332ef.json @@ -0,0 +1,40 @@ +{ + "summary": "The code reads parameters, separates configurations, and executes inference tests on different GPUs/CPUs for batch sizes. It sets up a loop for PaddleVideo model inference, handles hardware configurations, prepares settings for exporting models, logs results, and calls the \"func_inference\" function.", + "details": [ + { + "comment": "The code reads a file, parses parameters for model name, Python version, GPU usage, quantization configuration file, model path, output directory, data directory, data annotation file, and batch numbers. It uses awk to extract specific lines from the file and functions defined in common_func.sh for parameter extraction. The MODE variable can have values to determine the type of task being performed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh\":0-28", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\n# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer']\nMODE=$2\ndataline=$(awk 'NR==1, NR==32{print}' $FILENAME)\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# The training params\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\nuse_gpu_key=$(func_parser_key \"${lines[3]}\")\nuse_gpu_value=$(func_parser_value \"${lines[3]}\")\nquant_config_file_key=$(func_parser_key \"${lines[4]}\")\nquant_config_file_value=$(func_parser_value \"${lines[4]}\")\nmodel_path_key=$(func_parser_key \"${lines[5]}\")\nmodel_path_value=$(func_parser_value \"${lines[5]}\")\noutput_dir_key=$(func_parser_key \"${lines[6]}\")\noutput_dir_value=$(func_parser_value \"${lines[6]}\")\ndata_dir_key=$(func_parser_key \"${lines[7]}\")\ndata_dir_value=$(func_parser_value \"${lines[7]}\")\ndata_anno_key=$(func_parser_key \"${lines[8]}\")\ndata_anno_value=$(func_parser_value \"${lines[8]}\")\nbatch_num_key=$(func_parser_key \"${lines[9]}\")" + }, + { + "comment": "The code retrieves values and keys from a configuration file, storing them in variables for later use. 
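func_parser_key and func_parser_value come from test_tipc/common_func.sh, which is not part of this snippet. Assuming the usual key:value layout of TIPC params files, the equivalent split in Python would be:

```python
def parse_kv(line):
    """Split one 'key:value' line the way the func_parser_* helpers are used above."""
    key, _, value = line.partition(":")
    return key.strip(), value.strip()

lines = ["use_gpu:True", "batch_size:1|2", "model_path:./path/to/inference_model"]
params = dict(parse_kv(l) for l in lines)
print(params["use_gpu"], params["batch_size"])   # True 1|2
```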
It separates trainer and inference configurations, including GPU usage, inferential model parameters, video directory path, and benchmark options.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh\":29-51", + "content": "batch_num_value=$(func_parser_value \"${lines[9]}\")\nquant_batch_size_key=$(func_parser_key \"${lines[10]}\")\nquant_batch_size_value=$(func_parser_value \"${lines[10]}\")\n# parser trainer\ntrain_py=$(func_parser_value \"${lines[13]}\")\n# parser inference\ninference_py=$(func_parser_value \"${lines[16]}\")\nuse_gpu_key=$(func_parser_key \"${lines[17]}\")\nuse_gpu_list=$(func_parser_value \"${lines[17]}\")\ninfer_config_file_key=$(func_parser_key \"${lines[18]}\")\ninfer_config_file_value=$(func_parser_value \"${lines[18]}\")\ninfer_batch_size_key=$(func_parser_key \"${lines[19]}\")\ninfer_batch_size_list=$(func_parser_value \"${lines[19]}\")\ninfer_model_key=$(func_parser_key \"${lines[20]}\")\ninfer_model_value=$(func_parser_value \"${lines[20]}\")\ninfer_params_key=$(func_parser_key \"${lines[21]}\")\ninfer_params_value=$(func_parser_value \"${lines[21]}\")\ninfer_video_key=$(func_parser_key \"${lines[22]}\")\ninfer_video_dir=$(func_parser_value \"${lines[22]}\")\nbenchmark_key=$(func_parser_key \"${lines[23]}\")\nbenchmark_value=$(func_parser_value \"${lines[23]}\")" + }, + { + "comment": "This function executes inference on different GPUs and CPUs for various batch sizes. It sets log paths, parameters, model file path, params file path, and config file path using helper functions. The script performs inference using Python and logs the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh\":54-73", + "content": "function func_inference(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n _log_path=$4\n _img_dir=$5\n # inference\n for use_gpu in ${use_gpu_list[*]}; do\n # cpu\n if [ ${use_gpu} = \"False\" ] || [ ${use_gpu} = \"cpu\" ]; then\n for batch_size in ${infer_batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/python_infer_cpu_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${infer_video_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")\n set_batchsize=$(func_set_params \"${infer_batch_size_key}\" \"${batch_size}\")\n set_model_file_path=$(func_set_params \"${infer_model_key}\" \"${infer_model_value}\")\n set_params_file_path=$(func_set_params \"${infer_params_key}\" \"${infer_params_value}\")\n set_config_file_path=$(func_set_params \"${infer_config_file_key}\" \"${infer_config_file_value}\")\n command=\"${_" + }, + { + "comment": "This code is running a loop to execute inference tests on different GPU configurations. It sets variables for batch size, input data path, and model file path. 
The output logs are saved into specific files for later analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh\":73-87", + "content": "python} ${_script} ${use_gpu_key}=${use_gpu} ${set_config_file_path} ${set_model_file_path} ${set_params_file_path} ${set_batchsize} ${set_infer_data} ${set_benchmark} > ${_save_log_path} 2>&1 \"\n # echo $command\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"\n done\n # gpu\n elif [ ${use_gpu} = \"True\" ] || [ ${use_gpu} = \"gpu\" ]; then\n for batch_size in ${infer_batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/python_infer_gpu_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${infer_video_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")\n set_batchsize=$(func_set_params \"${infer_batch_size_key}\" \"${batch_size}\")\n set_model_file_path=$(func_set_params \"${infer_model_key}\" \"${infer_model_value}\")" + }, + { + "comment": "This code is setting up a loop to run inference on the PaddleVideo model for different hardware configurations and modes. It sets the necessary parameters, files, and batch size, then executes the command and checks the status of the execution. The output is logged in a specified directory. If the mode is \"whole_infer\", it uses IFS to separate the export settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh\":88-111", + "content": " set_params_file_path=$(func_set_params \"${infer_params_key}\" \"${infer_params_value}\")\n set_config_file_path=$(func_set_params \"${infer_config_file_key}\" \"${infer_config_file_value}\")\n command=\"${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_config_file_path} ${set_model_file_path} ${set_params_file_path} ${set_batchsize} ${set_infer_data} ${set_benchmark} > ${_save_log_path} 2>&1 \"\n echo $command\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"\n done\n else\n echo \"Does not support hardware other than CPU and GPU Currently!\"\n fi\n done\n}\n# log\nLOG_PATH=\"./log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_python.log\"\nif [ ${MODE} = \"whole_infer\" ]; then\n IFS=\"|\"\n # run export\n set_output_dir=$(func_set_params \"${output_dir_key}\" \"${output_dir_value}\")" + }, + { + "comment": "This code is preparing various settings for executing a command to export a model. It sets values from input variables, exports the model with specified parameters, logs the results, and then checks the status of the export. 
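func_inference above is a bash function; the sketch below is a Python dry run of the same device-by-batch-size sweep, producing one log file name per combination. The script path and flags are placeholders, not the real CLI:

```python
from pathlib import Path

inference_py = "tools/predict.py"        # placeholder; the real path comes from the params file
use_gpu_list = ["False", "True"]         # one cpu pass, one gpu pass
infer_batch_size_list = [1, 2]
log_path = Path("./log/ppTSM/whole_infer")

for use_gpu in use_gpu_list:
    device = "gpu" if use_gpu == "True" else "cpu"
    for batch_size in infer_batch_size_list:
        save_log = log_path / f"python_infer_{device}_batchsize_{batch_size}.log"
        command = (f"python3.7 {inference_py} --use_gpu={use_gpu} "
                   f"--batch_size={batch_size} > {save_log} 2>&1")
        print(command)   # the shell version evals this and then checks its exit status
```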
Finally, it prepares a directory for running inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh\":112-128", + "content": " set_data_dir=$(func_set_params \"${data_dir_key}\" \"${data_dir_value}\")\n set_data_anno=$(func_set_params \"${data_anno_key}\" \"${data_anno_value}\")\n set_batch_size=$(func_set_params \"${quant_batch_size_key}\" \"${quant_batch_size_value}\")\n set_batch_num=$(func_set_params \"${batch_num_key}\" \"${batch_num_value}\")\n set_model_path=$(func_set_params \"${model_path_key}\" \"${model_path_value}\")\n set_config_file=$(func_set_params \"${quant_config_file_key}\" \"${quant_config_file_value}\")\n set_use_gpu=$(func_set_params \"${use_gpu_key}\" \"${use_gpu_value}\")\n export_log_path=\"${LOG_PATH}/${MODE}_export_${Count}.log\"\n export_cmd=\"${python} ${train_py} ${set_use_gpu} ${set_config_file} ${set_model_path} ${set_batch_num} ${set_batch_size} ${set_data_dir} ${set_data_anno} ${set_output_dir} > ${export_log_path} 2>&1 \"\n echo $export_cmd\n eval $export_cmd\n status_export=$?\n status_check $status_export \"${export_cmd}\" \"${status_log}\" \"${model_name}\"\n save_infer_dir=${output_dir_value}\n #run inference" + }, + { + "comment": "The code snippet is calling a function named \"func_inference\" with arguments such as python, inference_py (likely the path of the Python script), save_infer_dir, LOG_PATH and infer_video_dir. This could be a part of an if condition block, possibly initializing or running an inference process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_ptq_inference_python.sh\":129-131", + "content": " func_inference \"${python}\" \"${inference_py}\" \"${save_infer_dir}\" \"${LOG_PATH}\" \"${infer_video_dir}\"\nfi" + } + ] +} \ No newline at end of file diff --git a/docs/doc/036e6775-6c6e-4d70-b4be-f225cf60a4ff.json b/docs/doc/036e6775-6c6e-4d70-b4be-f225cf60a4ff.json new file mode 100644 index 000000000..2a1f10b1f --- /dev/null +++ b/docs/doc/036e6775-6c6e-4d70-b4be-f225cf60a4ff.json @@ -0,0 +1,75 @@ +{ + "summary": "This Python module provides video processing layers and functions, including Depthwise Separable Convolution layers initialization, PPTSMV2 model with convolutional layers, and batch normalization. It defines a PPTSM_v2 backbone model for video analysis with customizable options like pretrained models, scaling, depths, dropout probability, and additional arguments.", + "details": [ + { + "comment": "This code is a Python module for the PaddlePaddle framework. It contains definitions of various layers and functions used in neural network backbones, including convolutional layers, pooling layers, batch normalization, linear layers, and more. The code also includes comments about copyright and licensing information, as well as imports necessary modules for these operations. Additionally, it references utility functions for weight initialization and model loading from checkpoints.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":0-26", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import, division, print_function\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import KaimingNormal\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt" + }, + { + "comment": "This code defines the PPLCNetV2 backbone model for video processing tasks. It includes the URL to download a pretrained model, stages of the network (PPLCNet), and network configurations. The make_divisible function is used to round up numbers for better performance. The GlobalAttention class is a lightweight temporal attention module used in the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":28-63", + "content": "# MODEL_URLS = {\n# \"PPLCNetV2\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_ssld_pretrained.pdparams\",\n# }\nMODEL_STAGES_PATTERN = {\n \"PPLCNet\": [\"blocks2\", \"blocks3\", \"blocks4\", \"blocks5\", \"blocks6\"]\n}\nNET_CONFIG = {\n # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut\n \"stage1\": [64, 3, False, False, False, False],\n \"stage2\": [128, 3, False, False, False, False],\n \"stage3\": [256, 5, True, True, True, False],\n \"stage4\": [512, 5, False, True, False, True],\n}\ndef make_divisible(v, divisor=8, min_value=None):\n if min_value is None:\n min_value = divisor\n new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n if new_v < 0.9 * v:\n new_v += divisor\n return new_v\nclass GlobalAttention(nn.Layer):\n \"\"\"\n Lightweight temporal attention module.\n \"\"\"\n def __init__(self, num_seg=8):\n super().__init__()\n self.fc = nn.Linear(in_features=num_seg,\n out_features=num_seg," + }, + { + "comment": "The code defines a ConvBNLayer class and a PPTSMV2 class. The ConvBNLayer class is a convolution layer followed by batch normalization, and the PPTSMV2 class is an encoder model that takes input of shape (-1, 3, H, W) where H and W are height and width respectively, and returns output of the same shape after processing. 
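GlobalAttention, introduced above, learns one weight per temporal segment and rescales the frames with it; its forward pass is quoted in the next chunk. A NumPy walk-through of the shape bookkeeping, with the Linear layer replaced by a random matrix:

```python
import numpy as np

num_seg, C, H, W = 8, 16, 7, 7
x = np.random.rand(2 * num_seg, C, H, W).astype("float32")   # 2 clips of 8 segments

# Mean over C*H*W per segment == the cheap "avg pool" in the forward pass.
seg_desc = x.reshape(-1, num_seg, C * H * W).mean(axis=2)     # (2, 8)

# Stand-in for the nn.Linear(num_seg, num_seg) layer.
fc_w = np.random.rand(num_seg, num_seg).astype("float32")
attention = 1.0 / (1.0 + np.exp(-seg_desc @ fc_w))            # sigmoid, (2, 8)

# Broadcast one scalar weight over each segment's feature map.
y = x.reshape(-1, num_seg, C, H, W) * attention[:, :, None, None, None]
y = y.reshape(-1, C, H, W)
print(y.shape)   # (16, 16, 7, 7) -- same shape as the input, as stated above
```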
It first resizes the input, applies convolution with specified parameters, calculates attention maps, and performs element-wise multiplication between original input and attention maps to extract relevant features for each segmented region.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":64-95", + "content": " weight_attr=ParamAttr(learning_rate=5.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0,\n regularizer=L2Decay(0.0)))\n self.num_seg = num_seg\n def forward(self, x):\n _, C, H, W = x.shape\n x0 = x\n x = x.reshape([-1, self.num_seg, C * H * W])\n x = paddle.mean(x, axis=2) # efficient way of avg_pool\n x = x.squeeze(axis=-1)\n x = self.fc(x)\n attention = F.sigmoid(x)\n attention = attention.reshape(\n (-1, self.num_seg, 1, 1, 1)) #for broadcast\n x0 = x0.reshape([-1, self.num_seg, C, H, W])\n y = paddle.multiply(x0, attention)\n y = y.reshape_([-1, C, H, W])\n return y\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride,\n groups=1," + }, + { + "comment": "The code defines a Conv2D layer followed by a BatchNorm2D layer and an optional ReLU activation function. The SEModule class inherits from nn.Layer and contains an AdaptiveAvgPool2D layer for average pooling operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":96-126", + "content": " use_act=True):\n super().__init__()\n self.use_act = use_act\n self.conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(initializer=KaimingNormal()),\n bias_attr=False)\n self.bn = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n if self.use_act:\n self.act = nn.ReLU()\n def forward(self, x):\n x = self.conv(x)\n x = self.bn(x)\n if self.use_act:\n x = self.act(x)\n return x\nclass SEModule(nn.Layer):\n def __init__(self, channel, reduction=4):\n super().__init__()\n self.avg_pool = AdaptiveAvgPool2D(1)" + }, + { + "comment": "This code initializes a depthwise separable convolution layer with optional parameters like in_channels, out_channels, stride, dw_size, split_pw, use_rep, and use_se. It contains Conv2D layers for convolution operations, ReLU activation, Sigmoid activation, and element-wise multiplication for identity shortcut connection.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":127-160", + "content": " self.conv1 = Conv2D(in_channels=channel,\n out_channels=channel // reduction,\n kernel_size=1,\n stride=1,\n padding=0)\n self.relu = nn.ReLU()\n self.conv2 = Conv2D(in_channels=channel // reduction,\n out_channels=channel,\n kernel_size=1,\n stride=1,\n padding=0)\n self.hardsigmoid = nn.Sigmoid()\n def forward(self, x):\n identity = x\n x = self.avg_pool(x)\n x = self.conv1(x)\n x = self.relu(x)\n x = self.conv2(x)\n x = self.hardsigmoid(x)\n x = paddle.multiply(x=identity, y=x)\n return x\nclass RepDepthwiseSeparable(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n dw_size=3,\n split_pw=False,\n use_rep=False,\n use_se=False," + }, + { + "comment": "This code initializes a PPTSM backbone model. It creates a ConvBNLayer for each kernel size in the dw_size range, skipping 1x1 if stride is not 1. The layers are stored in the dw_conv_list. 
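As a small illustration of the branch-construction rule described here, the `range(dw_size, 0, -2)` loop produces the following depthwise kernel sizes (the helper name below is made up for the example, it is not in the repository):

```python
def branch_kernel_sizes(dw_size, stride):
    # mirrors: for kernel_size in range(dw_size, 0, -2), skipping 1x1 when stride != 1
    return [k for k in range(dw_size, 0, -2) if not (k == 1 and stride != 1)]

print(branch_kernel_sizes(5, 1))   # [5, 3, 1] -> three parallel depthwise ConvBNLayer branches
print(branch_kernel_sizes(5, 2))   # [5, 3]    -> the 1x1 branch is dropped for strided blocks
print(branch_kernel_sizes(3, 1))   # [3, 1]
```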
An additional Conv2D layer with the same number of input and output channels is also created. This model can be used for image classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":161-184", + "content": " use_shortcut=False):\n super().__init__()\n self.is_repped = False\n self.dw_size = dw_size\n self.split_pw = split_pw\n self.use_rep = use_rep\n self.use_se = use_se\n self.use_shortcut = True if use_shortcut and stride == 1 and in_channels == out_channels else False\n if self.use_rep:\n self.dw_conv_list = nn.LayerList()\n for kernel_size in range(self.dw_size, 0, -2):\n if kernel_size == 1 and stride != 1:\n continue\n dw_conv = ConvBNLayer(in_channels=in_channels,\n out_channels=in_channels,\n kernel_size=kernel_size,\n stride=stride,\n groups=in_channels,\n use_act=False)\n self.dw_conv_list.append(dw_conv)\n self.dw_conv = nn.Conv2D(in_channels=in_channels,\n out_channels=in_channels," + }, + { + "comment": "Code creates a ConvBNLayer object for downsample convolution with optional SE module and split point-wise convolution. It handles different configurations based on dw_size, stride, and use_se parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":185-208", + "content": " kernel_size=dw_size,\n stride=stride,\n padding=(dw_size - 1) // 2,\n groups=in_channels)\n else:\n self.dw_conv = ConvBNLayer(in_channels=in_channels,\n out_channels=in_channels,\n kernel_size=dw_size,\n stride=stride,\n groups=in_channels)\n self.act = nn.ReLU()\n if use_se:\n self.se = SEModule(in_channels)\n if self.split_pw:\n pw_ratio = 0.5\n self.pw_conv_1 = ConvBNLayer(in_channels=in_channels,\n kernel_size=1,\n out_channels=int(out_channels *\n pw_ratio),\n stride=1)\n self.pw_conv_2 = ConvBNLayer(in_channels=int(out_channels *" + }, + { + "comment": "This code defines a backbone for a deep learning model, specifically the PPTSM_v2 architecture. It uses convolutional layers and Batch Normalization to process input data. The use of Point-wise Convolution (pw_conv) or Depth-wise Separable Convolutions (dw_conv) depends on certain conditions. If \"use_rep\" is True, it applies repeated depth-wise convolutions if the current layer has been repped. It also includes optional Squeeze and Excitation blocks for feature enhancement.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":209-237", + "content": " pw_ratio),\n kernel_size=1,\n out_channels=out_channels,\n stride=1)\n else:\n self.pw_conv = ConvBNLayer(in_channels=in_channels,\n kernel_size=1,\n out_channels=out_channels,\n stride=1)\n def forward(self, x):\n if self.use_rep:\n input_x = x\n if self.is_repped:\n x = self.act(self.dw_conv(x))\n else:\n y = self.dw_conv_list[0](x)\n for dw_conv in self.dw_conv_list[1:]:\n y += dw_conv(x)\n x = self.act(y)\n else:\n x = self.dw_conv(x)\n if self.use_se:\n x = self.se(x)\n if self.split_pw:\n x = self.pw_conv_1(x)\n x = self.pw_conv_2(x)\n else:" + }, + { + "comment": "This code implements a backbone for PPTSM_V2 model. 
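The parallel branches can later be merged into a single convolution because convolution is linear in its kernel. A minimal numeric check of that property with illustrative tensors (BatchNorm omitted), using the same zero-padding trick as `_pad_tensor`:

```python
import paddle
import paddle.nn.functional as F

x = paddle.rand([1, 4, 16, 16])                     # depthwise case: groups == channels == 4
k5 = paddle.rand([4, 1, 5, 5])
k3 = paddle.rand([4, 1, 3, 3])
k3_padded = F.pad(k3, [1, 1, 1, 1])                 # zero-pad the 3x3 kernel up to 5x5

y_branches = (F.conv2d(x, k5, padding=2, groups=4) +
              F.conv2d(x, k3, padding=1, groups=4))
y_fused = F.conv2d(x, k5 + k3_padded, padding=2, groups=4)

assert paddle.allclose(y_branches, y_fused, atol=1e-4).item()
```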
It performs pointwise convolution, adds shortcut connection if enabled, and includes functions for representation fusion and fusing batch normalization tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":238-268", + "content": " x = self.pw_conv(x)\n if self.use_shortcut:\n x = x + input_x\n return x\n def rep(self):\n if self.use_rep:\n self.is_repped = True\n kernel, bias = self._get_equivalent_kernel_bias()\n self.dw_conv.weight.set_value(kernel)\n self.dw_conv.bias.set_value(bias)\n def _get_equivalent_kernel_bias(self):\n kernel_sum = 0\n bias_sum = 0\n for dw_conv in self.dw_conv_list:\n kernel, bias = self._fuse_bn_tensor(dw_conv)\n kernel = self._pad_tensor(kernel, to_size=self.dw_size)\n kernel_sum += kernel\n bias_sum += bias\n return kernel_sum, bias_sum\n def _fuse_bn_tensor(self, branch):\n kernel = branch.conv.weight\n running_mean = branch.bn._mean\n running_var = branch.bn._variance\n gamma = branch.bn.weight\n beta = branch.bn.bias\n eps = branch.bn._epsilon\n std = (running_var + eps).sqrt()\n t = (gamma / std).reshape((-1, 1, 1, 1))" + }, + { + "comment": "The code defines a PPTSM_v2_LCNet class, which is a type of backbone neural network. It includes initialization parameters such as scale, depths, and class_num. The class also has methods for kernel multiplication, tensor padding, and other operations related to image processing and neural network layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":269-302", + "content": " return kernel * t, beta - running_mean * gamma / std\n def _pad_tensor(self, tensor, to_size):\n from_size = tensor.shape[-1]\n if from_size == to_size:\n return tensor\n pad = (to_size - from_size) // 2\n return F.pad(tensor, [pad, pad, pad, pad])\nclass PPTSM_v2_LCNet(nn.Layer):\n def __init__(self,\n scale,\n depths,\n class_num=400,\n dropout_prob=0,\n num_seg=8,\n use_temporal_att=False,\n pretrained=None,\n use_last_conv=True,\n class_expand=1280):\n super().__init__()\n self.scale = scale\n self.use_last_conv = use_last_conv\n self.class_expand = class_expand\n self.num_seg = num_seg\n self.use_temporal_att = use_temporal_att\n self.pretrained = pretrained\n self.stem = nn.Sequential(*[\n ConvBNLayer(in_channels=3,\n kernel_size=3,\n out_channels=make_divisible(32 * scale)," + }, + { + "comment": "This code defines a PPTSM-v2 backbone model, using DepthwiseSeparable blocks with varying configurations for different stages. It utilizes `make_divisible()` function to adjust the number of channels and kernel sizes, and a LayerList to create a sequence of layers for each stage. 
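A minimal numeric check (NumPy, made-up statistics) of the folding identity that `_fuse_bn_tensor` above relies on: BatchNorm applied to a bias-free conv output equals the same conv with its kernel scaled by gamma/std and a bias of beta - mean*gamma/std.

```python
import numpy as np

gamma, beta, mean, var, eps = 1.5, 0.2, 0.4, 2.0, 1e-5
z = np.linspace(-1.0, 1.0, 5)                 # pretend per-channel conv outputs

std = np.sqrt(var + eps)
bn_out = gamma * (z - mean) / std + beta      # what BatchNorm computes at inference time
folded = z * (gamma / std) + (beta - mean * gamma / std)

assert np.allclose(bn_out, folded)
```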
The NET_CONFIG determines the specifics of each stage's parameters like in_channels, kernel_size, split_pw, use_rep, use_se, and use_shortcut.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":303-323", + "content": " stride=2),\n RepDepthwiseSeparable(in_channels=make_divisible(32 * scale),\n out_channels=make_divisible(64 * scale),\n stride=1,\n dw_size=3)\n ])\n # stages\n self.stages = nn.LayerList()\n for depth_idx, k in enumerate(NET_CONFIG):\n in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut = NET_CONFIG[\n k]\n self.stages.append(\n nn.Sequential(*[\n RepDepthwiseSeparable(in_channels=make_divisible(\n (in_channels if i == 0 else in_channels * 2) * scale),\n out_channels=make_divisible(\n in_channels * 2 * scale),\n stride=2 if i == 0 else 1,\n dw_size=kernel_size,\n split_pw=split_pw," + }, + { + "comment": "This code defines a PPTSM_V2 backbone for the PaddleVideo model. It includes multiple Conv2D layers, BatchNorm layers, AdaptiveAvgPool2D, and optional final convolutional layer, flatten layer, linear layer. The use of these components depends on certain conditions such as `use_rep`, `use_se`, `use_shortcut`, `use_last_conv`, and other parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":324-346", + "content": " use_rep=use_rep,\n use_se=use_se,\n use_shortcut=use_shortcut)\n for i in range(depths[depth_idx])\n ]))\n self.avg_pool = AdaptiveAvgPool2D(1)\n if self.use_last_conv:\n self.last_conv = Conv2D(in_channels=make_divisible(\n NET_CONFIG[\"stage4\"][0] * 2 * scale),\n out_channels=self.class_expand,\n kernel_size=1,\n stride=1,\n padding=0,\n bias_attr=False)\n self.act = nn.ReLU()\n self.dropout = Dropout(p=dropout_prob, mode=\"downscale_in_infer\")\n self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)\n in_features = self.class_expand if self.use_last_conv else NET_CONFIG[\n \"stage4\"][0] * 2 * scale\n self.fc = Linear(in_features, class_num)" + }, + { + "comment": "Code initializes weights for a PPTSM_v2 backbone model. It first checks if the pretrained weights are provided and then initializes the layers with specified methods. Stage 3 adds temporal attention and Temporal Shift Module (TSM) operations for efficiency.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":347-371", + "content": " if self.use_temporal_att:\n self.global_attention = GlobalAttention(num_seg=self.num_seg)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, x):\n x = self.stem(x)\n count = 0\n for stage in self.stages:\n # only add temporal attention and tsm in stage3 for efficiency\n if count == 2:\n # add temporal attention\n if self.use_temporal_att:\n x = self.global_attention(x)\n x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg)" + }, + { + "comment": "Code snippet defines a PPTSM_v2 backbone model for video analysis. It consists of stages, an average pooling layer, and a convolution layer. 
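A hedged sketch (toy tensors, assumed sizes) of the segment-consensus step performed after pooling in the forward pass: per-frame features stacked along the batch axis are regrouped per clip and averaged over `num_seg` before reaching the fully connected layer.

```python
import paddle

num_seg, feat_dim = 8, 1280
per_frame = paddle.rand([2 * num_seg, feat_dim, 1, 1])   # 2 clips x 8 segments after avg-pool/last_conv

x = paddle.reshape(per_frame, [-1, num_seg, feat_dim])   # [2, 8, 1280]; the 1x1 spatial dims fold in
clip_feat = paddle.mean(x, axis=1)                       # [2, 1280] -- one feature vector per clip
print(clip_feat.shape)
```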
The function also includes feature aggregation and reshaping operations before feeding the data to a fully connected layer. The pretrained model can be loaded from a given path, and it supports custom scaling, depths, dropout probability, and additional keyword arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py\":372-404", + "content": " count += 1\n x = stage(x)\n x = self.avg_pool(x)\n if self.use_last_conv:\n x = self.last_conv(x)\n x = self.act(x)\n x = self.dropout(x)\n # Feature aggregation\n x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]])\n x = paddle.mean(x, axis=1)\n x = paddle.reshape(x, shape=[-1, self.class_expand])\n x = self.fc(x)\n return x\n@BACKBONES.register()\ndef PPTSM_v2(pretrained=None, use_ssld=False, **kwargs):\n \"\"\"\n PP-TSM_v2 model.\n Args:\n pretrained: str, means the path of the pretrained model.\n Returns:\n model: nn.Layer.\n \"\"\"\n model = PPTSM_v2_LCNet(pretrained=pretrained,\n scale=1.0,\n depths=[2, 2, 6, 2],\n dropout_prob=0.2,\n **kwargs)\n return model" + } + ] +} \ No newline at end of file diff --git a/docs/doc/03bf07da-387b-4249-9ec5-8a856389eb2f.json b/docs/doc/03bf07da-387b-4249-9ec5-8a856389eb2f.json new file mode 100644 index 000000000..6abed835e --- /dev/null +++ b/docs/doc/03bf07da-387b-4249-9ec5-8a856389eb2f.json @@ -0,0 +1,15 @@ +{ + "summary": "The code prepares an environment for training and testing a computer vision model using the DeeplabV3_coco pre-trained model on the DAVIS dataset. It sets parameters, saves intermediate results, and runs test.py for actual testing without IntSeg enabled.", + "details": [ + { + "comment": "The code sets environment variables, then runs two training scripts for a computer vision model in stages. It uses pre-trained DeeplabV3_coco model, and saves intermediate and final results to specified directories. The model is trained with specific parameters on the DAVIS dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/run.sh\":0-12", + "content": "PRETRAIN_MODEL='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/DeeplabV3_coco.pdparams'\nVOS_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/MaNet_davis2017_stage1.pdparams'\n#VOS_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/stage1'\nINT_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/MANet_davis2017.pdparams'\n#INT_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/stage2'\nINT_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/result'\nRESCALE=416\nRANDOMCROP=416\nDATA_ROOT='/home/lc/PaddleVideo/data/DAVIS'\necho 'Stage1 training'\nCUDA_VISIBLE_DEVICE=3 python train_stage1.py --SAVE_RESULT_DIR $VOS_SAVE_RESULT_DIR --PRETRAINED_MODEL $PRETRAIN_MODEL --DATA_ROOT $DATA_ROOT --TRAIN_BATCH_SIZE 2 --DATA_RESCALE $RESCALE --DATA_RANDOMCROP $RANDOMCROP --TRAIN_LR 0.0007 --MODEL_MAX_LOCAL_DISTANCE 12\necho 'Stage2 training'\npython train_stage2.py --SAVE_RESULT_DIR $INT_SAVE_RESULT_DIR --SAVE_VOS_RESULT_DIR $" + }, + { + "comment": "This code is setting up a testing environment for a video object segmentation task. It specifies the data root, save result directory, pre-trained model path and then executes a test.py script to perform the actual testing. 
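Tying the pieces of the PPTSM_v2 backbone quoted above together, a hypothetical end-to-end call might look as follows; the import path, input shape and the expected `[1, 400]` output are assumptions inferred from the quoted code, not taken from the repository's tests or docs.

```python
import paddle
from paddlevideo.modeling.backbones.pptsm_v2 import PPTSM_v2   # assumed import path

model = PPTSM_v2(pretrained=None, num_seg=8, class_num=400)
model.eval()
clip = paddle.rand([1 * 8, 3, 224, 224])   # one clip, num_seg frames stacked on the batch axis
logits = model(clip)
print(logits.shape)                        # expected [1, 400] given the segment aggregation above
```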
The model is set to not use IntSeg and the TEST_MODE flag is set to True, indicating that this is indeed a testing run.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/run.sh\":12-14", + "content": "VOS_SAVE_RESULT_DIR --DATA_ROOT $DATA_ROOT --DATA_RESCALE $RESCALE --DATA_RANDOMCROP $RANDOMCROP --PRETRAINED_MODEL $PRETRAIN_MODEL\necho 'Testing'\npython test.py --DATA_ROOT $DATA_ROOT --SAVE_RESULT_DIR $INT_SAVE_RESULT_DIR --RESULT_ROOT $INT_RESULT_DIR --MODEL_USEIntSeg False --TEST_MODE True" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0563d2d7-2a00-442d-814d-edfb8fdf8064.json b/docs/doc/0563d2d7-2a00-442d-814d-edfb8fdf8064.json new file mode 100644 index 000000000..14b203cd0 --- /dev/null +++ b/docs/doc/0563d2d7-2a00-442d-814d-edfb8fdf8064.json @@ -0,0 +1,30 @@ +{ + "summary": "This code prepares the PaddlePaddle app environment, imports necessary libraries, handles config, defines model functions, loads test weights, logs metrics, and checks save directory. It also creates directories, logs arguments, verifies Paddle version, and runs a test function.", + "details": [ + { + "comment": "This code snippet sets up the environment and logging for a PaddlePaddle application. It imports necessary libraries, handles basic configuration, and sets up logging output to the console. This script seems to be part of an AI model's evaluation process, as it also includes references to reader, metrics, and model files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/eval.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport paddle\nimport paddle.static as static\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'\nlogging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)" + }, + { + "comment": "This code defines a function `parse_args()` that creates an ArgumentParser to parse command line arguments. It sets defaults for model name, config file path, batch size, GPU usage, and weight path. The parser also adds help messages for each argument.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/eval.py\":33-63", + "content": "logger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument(\n '--batch_size',\n type=int,\n default=None,\n help='test batch size. 
None to use config file setting.')\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument(\n '--weights',\n type=str,\n default='./data/checkpoints/AttentionLSTM_epoch9.pdparams',\n help='weight path.')\n parser.add_argument(\n '--save_dir',\n type=str,\n default=os.path.join('data', 'evaluate_results')," + }, + { + "comment": "This code defines a function `test` that takes in arguments, parses a config file, merges it with test configuration, prints the configurations, builds a model using the provided model name and configurations, feeds the model, fetches the model outputs, creates an executor based on whether to use GPU or CPU, and checks if the weight directory exists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/eval.py\":64-94", + "content": " help='output dir path, default to use ./data/evaluate_results')\n parser.add_argument('--log_interval',\n type=int,\n default=1,\n help='mini-batch interval to log.')\n args = parser.parse_args()\n return args\ndef test(args):\n # parse config\n config = parse_config(args.config)\n test_config = merge_configs(config, 'test', vars(args))\n print_configs(test_config, \"Test\")\n use_dali = test_config['TEST'].get('use_dali', False)\n # build model\n test_model = models.get_model(args.model_name, test_config, mode='test')\n test_model.build_input(use_dataloader=False)\n test_model.build_model()\n test_feeds = test_model.feeds()\n test_fetch_list = test_model.fetches()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(static.default_startup_program())\n if args.weights:\n assert os.path.exists(\n args.weights), \"Given weight dir {} not exist.\".format(args.weights)" + }, + { + "comment": "This code loads test weights, creates a reader and metrics for testing, runs the model with the data, calculates and logs the evaluation metrics for each batch, and checks if the save directory exists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/eval.py\":95-121", + "content": " weights = args.weights or test_model.get_weights()\n logger.info('load test weights from {}'.format(weights))\n test_model.load_test_weights(exe, weights, static.default_main_program())\n # get reader and metrics\n test_reader = get_reader(args.model_name.upper(), 'test', test_config)\n test_metrics = get_metrics(args.model_name.upper(), 'test', test_config)\n test_feeder = paddle.fluid.DataFeeder(place=place, feed_list=test_feeds)\n epoch_period = []\n for test_iter, data in enumerate(test_reader()):\n cur_time = time.time()\n test_outs = exe.run(fetch_list=test_fetch_list,\n feed=test_feeder.feed(data))\n period = time.time() - cur_time\n epoch_period.append(period)\n test_metrics.accumulate(test_outs)\n # metric here\n if args.log_interval > 0 and test_iter % args.log_interval == 0:\n info_str = '[EVAL] Batch {}'.format(test_iter)\n test_metrics.calculate_and_log_out(test_outs, info_str)\n if not os.path.isdir(args.save_dir):" + }, + { + "comment": "This code creates directories, finalizes and logs test metrics, checks if installed Paddle is compiled with GPU, verifies Paddle version, logs arguments, and runs a test function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/eval.py\":122-133", + "content": " os.makedirs(args.save_dir)\n test_metrics.finalize_and_log_out(\"[EVAL] eval finished. 
\", args.save_dir)\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n test(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/058c279b-9974-4624-a870-601b853baf65.json b/docs/doc/058c279b-9974-4624-a870-601b853baf65.json new file mode 100644 index 000000000..f41e60108 --- /dev/null +++ b/docs/doc/058c279b-9974-4624-a870-601b853baf65.json @@ -0,0 +1,135 @@ +{ + "summary": "This script prepares PaddlePaddle's video object detection models by handling data, installing packages, and downloading/preprocessing for TIPC models. It also prepares data for AttentionLSTM and SlowFast models. The code downloads pre-trained model data and weights for various models like ResNet50, TSN, TimeSformer, PP-TSM, and VideoSwin.", + "details": [ + { + "comment": "This script is preparing the environment for training and inference on PaddlePaddle's video object detection models. It takes a filename as an argument, parses its contents to determine the model name and mode, installs required packages like auto-log, and prepares any necessary data or pretrained weights for the selected model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":0-43", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\n# set -xe\n:< train_small.list # \u5c06train*.pkl\u7684\u8def\u5f84\u5199\u5165train_small.list\n ls pkl_frame/validate*.pkl > val_small.list # \u5c06validate*.pkl\u7684\u8def\u5f84\u5199\u5165val_small.list\n ${python} split_yt8m.py train_small.list # \u62c6\u5206\u6bcf\u4e2atrain*.pkl\u53d8\u6210\u591a\u4e2atrain*_split*.pkl" + }, + { + "comment": "This code is checking the value of the variable `model_name` and performing different operations based on its value. For example, if `model_name` equals \"SlowFast\", it changes directory to `./data/k400`, downloads a tar file containing data for pre-training a SlowFast model, and then extracts the tar file. Similarly, if `model_name` is \"BMN\" or \"TokenShiftVisionTransformer\", different operations are carried out, such as downloading and extracting necessary data files for pre-training these models. 
The code uses various commands like `pushd`, `popd`, `wget`, and `tar` to manipulate directories and files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":105-126", + "content": " ${python} split_yt8m.py val_small.list # \u62c6\u5206\u6bcf\u4e2avalidate*.pkl\u53d8\u6210\u591a\u4e2avalidate*_split*.pkl\n ls pkl_frame/train*_split*.pkl > train_small.list # \u5c06train*_split*.pkl\u7684\u8def\u5f84\u91cd\u65b0\u5199\u5165train_small.list\n ls pkl_frame/validate*_split*.pkl > val_small.list # \u5c06validate*_split*.pkl\u7684\u8def\u5f84\u91cd\u65b0\u5199\u5165val_small.list\n popd\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://videotag.bj.bcebos.com/Data/BMN_lite/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://videotag.bj.bcebos.com/Data/BMN_lite/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd\n elif [ ${model_name} == \"TokenShiftVisionTransformer\" ]; then" + }, + { + "comment": "This script downloads pre-trained model weights and preprocesses training data for specific models. For ViT_base, it downloads the weight file. For PoseC3D, it downloads and unzips a small dataset. For YOWO, it downloads the necessary datasets and YOWO's pre-trained model at a specific epoch. Models not in TIPC are not processed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":127-148", + "content": " # download pretrained weights\n wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"PoseC3D\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir posec3d_data\n cd posec3d_data\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PoseC3D_data_small.tar\n tar -xf PoseC3D_data_small.tar\n popd\n elif [ ${model_name} == \"YOWO\" ]; then\n # pretrain lite train data\n pushd ./data\n wget -nc https://videotag.bj.bcebos.com/Data/ucf-24-lite.zip\n unzip -qo ucf-24-lite.zip\n pushd ./ucf24\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams\n popd\n else\n echo \"Not added into TIPC yet.\"" + }, + { + "comment": "Checking if MODE is \"whole_train_whole_infer\". If true, it determines the model (PP-TSM or PP-TSN) and performs specific actions for pretraining with whole training data. For PP-TSM, downloads Kinetics400 data, extracts raw frames, downloads annotations, and gets pretrained weights. 
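The recurring shell idiom in this script is "download unless the file already exists, then unpack in place" (`wget -nc` plus `tar -xf` inside a `pushd`-ed directory). A rough Python equivalent of one such step, reusing a URL that appears above, purely for illustration:

```python
import os
import tarfile
import urllib.request

url = "https://videotag.bj.bcebos.com/Data/k400_videos_small.tar"   # one of the URLs above
dest_dir = "data/k400"
archive = os.path.join(dest_dir, os.path.basename(url))

os.makedirs(dest_dir, exist_ok=True)
if not os.path.exists(archive):                 # mirrors wget's -nc (no-clobber) behaviour
    urllib.request.urlretrieve(url, archive)
with tarfile.open(archive) as tar:
    tar.extractall(path=dest_dir)               # mirrors `tar -xf` run from inside data/k400
```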
For PP-TSN, similar steps are followed but with different data and models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":149-167", + "content": " fi\nelif [ ${MODE} = \"whole_train_whole_infer\" ];then\n if [ ${model_name} == \"PP-TSM\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"PP-TSN\" ]; then\n # pretrain whole train data" + }, + { + "comment": "This code downloads pre-trained model weights and data for different models, such as ResNet50_vd_ssld_v2, AGCN, and STGCN. It also downloads annotations and train/validation lists from specific URLs. The code uses pushd and popd commands to change directories temporarily and wget command to perform non-checking downloads (nc) of files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":168-187", + "content": " pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n # pretrain whole train data\n pushd data/fsd10\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"STGCN\" ]; then\n # pretrain whole train data\n pushd data/fsd10" + }, + { + "comment": "The code checks the value of 'model_name', and if it's \"TSM\", it performs specific actions. It changes to the directory ./data/k400, downloads train and val lists from the URLs provided, then uses bash scripts to download data based on the list. Afterwards, it extracts frames from video files using the 'extract_rawframes.py' script at a certain level and with specific extensions. It also downloads annotations for training and validation sets. 
Finally, it changes back to the previous directory and downloads pretrained ResNet50 weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":188-204", + "content": " wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"TSM\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate" + }, + { + "comment": "The code checks if the model name is \"TSN\" or \"TimeSformer\" and then downloads corresponding data and pretrained weights for each model. It pushes to a directory, downloads training and validation lists, extracts frames from videos, downloads annotations, and finally pops out of the directory. This script appears to be part of a larger program that prepares data for specific models in a machine learning or deep learning context.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":205-222", + "content": " elif [ ${model_name} == \"TSN\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list" + }, + { + "comment": "This code is preparing the data and environment for a specific model named \"AttentionLSTM\". It downloads links for training and validation datasets, fetches annotations, and gets pre-trained weights. 
The model requires TensorFlow GPU version 1.14.0 and uses YT8M dataset partitioned into 2 parts - train and validate.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":223-240", + "content": " wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n # pretrain whole train data\n pushd data/yt8m\n mkdir frame\n cd frame\n ## download & decompression training data\n curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python\n curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python\n ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple" + }, + { + "comment": "The code is preparing the Kinetics400 dataset for PaddleVideo's SlowFast model by downloading and splitting the train and validation data. It changes directory, uses tf2pkl.py to convert frame files into pkl format, splits the pkl files using split_yt8m.py, and finally writes the file paths into train.list and val.list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":241-260", + "content": " cd ..\n ${python} tf2pkl.py ./frame ./pkl_frame/\n ls pkl_frame/train*.pkl > train.list # \u5c06train*.pkl\u7684\u8def\u5f84\u5199\u5165train.list\n ls pkl_frame/validate*.pkl > val.list # \u5c06validate*.pkl\u7684\u8def\u5f84\u5199\u5165val.list\n ${python} split_yt8m.py train.list # \u62c6\u5206\u6bcf\u4e2atrain*.pkl\u53d8\u6210\u591a\u4e2atrain*_split*.pkl\n ${python} split_yt8m.py val.list # \u62c6\u5206\u6bcf\u4e2avalidate*.pkl\u53d8\u6210\u591a\u4e2avalidate*_split*.pkl\n ls pkl_frame/train*_split*.pkl > train.list # \u5c06train*_split*.pkl\u7684\u8def\u5f84\u91cd\u65b0\u5199\u5165train.list\n ls pkl_frame/validate*_split*.pkl > val.list # \u5c06validate*_split*.pkl\u7684\u8def\u5f84\u91cd\u65b0\u5199\u5165val.list\n popd\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain whole train data\n pushd ./data/k400\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list\n wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list\n bash download_k400_data.sh train_link.list\n bash download_k400_data.sh val_link.list\n # download annotations\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list" + }, + { + "comment": "Code handles different scenarios based on the model_name and MODE variables. For BMN, it pretrains using whole train data by downloading necessary files from specified URLs. For PP-TSM in lite_train_whole_infer scenario, it pretrains using lite train data by downloading a tar file and pretrained weights. 
If none of the conditions match, it displays \"Not added into TIPC yet.\" message.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":261-284", + "content": " wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain whole train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd\n else\n echo \"Not added into TIPC yet.\"\n fi\nelif [ ${MODE} = \"lite_train_whole_infer\" ];then\n if [ ${model_name} == \"PP-TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate" + }, + { + "comment": "This code checks the value of the `model_name` variable and performs different actions accordingly. If it's \"PP-TSN\", it downloads pretrained weights and lite train data for PP-TSN model. If it's \"AGCN\" or \"STGCN\", it downloads lite train data. And if it's \"TSM\", it downloads lite train data for TSM model. It uses pushd/popd to change directories and wget to download files from specified URLs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":285-307", + "content": " elif [ ${model_name} == \"PP-TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n # pretrain lite train data\n pushd data/fsd10\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"STGCN\" ]; then\n # pretrain lite train data\n pushd data/fsd10\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy\n wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy\n popd\n elif [ ${model_name} == \"TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400" + }, + { + "comment": "This code downloads pre-trained model files for different models. For \"PaddleVideo/test_tipc/prepare.sh\", it checks the value of $model_name and proceeds accordingly. It pushes to a specific data folder, then downloads rawframes or videos depending on the model type. 
Finally, it retrieves the pre-trained weights for each model from an HTTPS URL, handling network errors with -nc option.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":308-328", + "content": " wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https:/" + }, + { + "comment": "This code snippet downloads and prepares the dataset for training an AttentionLSTM model. It first checks out the data from a specific URL, installs TensorFlow version 1.14.0, converts the raw video frames to pickle format, splits the data into training and validation sets, and finally lists the resulting files in train_small.list and val_small.list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":328-344", + "content": "/paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n # pretrain lite train data\n pushd data/yt8m\n ## download & decompression training data\n wget -nc https://videotag.bj.bcebos.com/Data/yt8m_rawframe_small.tar\n tar -xf yt8m_rawframe_small.tar\n ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple\n ${python} tf2pkl.py ./frame ./pkl_frame/\n ls pkl_frame/train*.pkl > train_small.list # \u5c06train*.pkl\u7684\u8def\u5f84\u5199\u5165train_small.list\n ls pkl_frame/validate*.pkl > val_small.list # \u5c06validate*.pkl\u7684\u8def\u5f84\u5199\u5165val_small.list\n ${python} split_yt8m.py train_small.list # \u62c6\u5206\u6bcf\u4e2atrain*.pkl\u53d8\u6210\u591a\u4e2atrain*_split*.pkl\n ${python} split_yt8m.py val_small.list # \u62c6\u5206\u6bcf\u4e2avalidate*.pkl\u53d8\u6210\u591a\u4e2avalidate*_split*.pkl\n ls pkl_frame/train*_split*.pkl > train_small.list # \u5c06train*_split*.pkl\u7684\u8def\u5f84\u91cd\u65b0\u5199\u5165train_small.list\n ls pkl_frame/validate*_split*.pkl > val_small.list # \u5c06validate*_split*.pkl\u7684\u8def\u5f84\u91cd\u65b0\u5199\u5165val_small.list" + }, + { + "comment": "This code is checking the model_name and performing specific actions based on its value. If model_name is \"SlowFast\", it downloads pretrain lite train data for that model. If model_name is \"BMN\", it downloads required datasets for that model. For other model names, it prints a message indicating they are not added to TIPC yet. 
In the case of MODE being \"whole_infer\", it performs specific actions based on model_name such as downloading pretrained weights for PP-TSM and PP-TSN models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":345-369", + "content": " popd\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd\n else\n echo \"Not added into TIPC yet.\"\n fi\nelif [ ${MODE} = \"whole_infer\" ];then\n if [ ${model_name} = \"PP-TSM\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate\n elif [ ${model_name} = \"PP-TSN\" ]; then" + }, + { + "comment": "This code is checking the value of 'model_name' variable and downloading the corresponding pretrained weights for different models using 'wget' command. If model name matches, it retrieves the respective model's file from a specific URL and saves it in the './data' directory without certificate checks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":370-384", + "content": " # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams --no-check-certificate\n elif [ ${model_name} == \"STGCN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSM\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then" + }, + { + "comment": "This code downloads pre-trained model weights depending on the specified model name. It uses wget to retrieve the files from specific URLs and saves them in the \"./data\" directory. The code also checks if the MODE is \"benchmark_train\" and installs necessary packages using pip if so. 
Additionally, it changes the current directory to \"./data/k400\" to prepare for pre-training the Lite train data of PP-TSM model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":385-405", + "content": " # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams --no-check-certificate\n elif [ ${model_name} == \"SlowFast\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams --no-check-certificate\n elif [ ${model_name} == \"BMN\" ]; then\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams --no-check-certificate\n else\n echo \"Not added into TIPC yet.\"\n fi\nfi\nif [ ${MODE} = \"benchmark_train\" ];then\n ${python} -m pip install -r requirements.txt\n if [ ${model_name} == \"PP-TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400" + }, + { + "comment": "Code snippet checks the value of `model_name` and performs specific actions based on its value. For example, if it is \"PaddleVideo/ResNet50\", it downloads pretrained weights for that model. If `model_name` is not recognized, it prints a message saying it's not added to TIPC yet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":406-426", + "content": " wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"PP-TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AGCN\" ]; then\n echo \"Not added into TIPC yet.\"\n elif [ ${model_name} == \"STGCN\" ]; then\n echo \"Not added into TIPC yet.\"\n elif [ ${model_name} == \"TSM\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar" + }, + { + "comment": "This code is downloading pre-trained weights and data for PaddleVideo models, such as ResNet50, TSN, and TimeSformer. 
It checks the model_name and performs specific tasks accordingly: unzipping tar files, downloading lists of frames, and retrieving pretrained weights from specified URLs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":427-445", + "content": " tar -xf k400_rawframes_small.tar\n # download datalist for fleet benchmark\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/train_fleet_frames.list\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/val_fleet_frames.list\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TSN\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate\n elif [ ${model_name} == \"TimeSformer\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar" + }, + { + "comment": "Code snippet checks the model name and performs specific actions for each. If model is \"k400_videos_small\", it downloads pre-trained weights. If model is \"SlowFast\", it downloads lite train data. For \"BMN\", it downloads BMN training data, including annotations and JSON files. No action is taken for \"AttentionLSTM\" as it's not added to TIPC yet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":446-467", + "content": " tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate\n elif [ ${model_name} == \"AttentionLSTM\" ]; then\n echo \"Not added into TIPC yet.\"\n elif [ ${model_name} == \"SlowFast\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n elif [ ${model_name} == \"BMN\" ]; then\n # pretrain lite train data\n pushd ./data\n mkdir bmn_data\n cd bmn_data\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz\n tar -xf bmn_feat.tar.gz\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json\n wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json\n popd" + }, + { + "comment": "This code checks if the model is VideoSwin or PP-TSM. If VideoSwin, it downloads pretrain lite train data and pretrained weights. If PP-TSM, it downloads lite data and inference model. 
Other models are not added to TIPC yet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":468-496", + "content": " elif [ ${model_name} == \"VideoSwin\" ]; then\n # pretrain lite train data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n tar -xf k400_videos_small.tar\n popd\n # download pretrained weights\n wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams --no-check-certificate\n else\n echo \"Not added into TIPC yet.\"\n fi\nfi\nif [ ${MODE} = \"klquant_whole_infer\" ]; then\n if [ ${model_name} = \"PP-TSM\" ]; then\n # download lite data\n pushd ./data/k400\n wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\n tar -xf k400_rawframes_small.tar\n popd\n # download inference model\n mkdir ./inference\n pushd ./inference\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n unzip ppTSM.zip\n popd\n else\n echo \"Not added into TIPC yet.\"\n fi" + }, + { + "comment": "This code installs necessary packages, downloads pre-trained model weights for either PP-TSM or PP-TSN, and exports the inference models for these two models. This is typically done before running inference on new data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":497-519", + "content": "fi\nif [ ${MODE} = \"cpp_infer\" ];then\n # install required packages\n apt-get update\n apt install libavformat-dev\n apt install libavcodec-dev\n apt install libswresample-dev\n apt install libswscale-dev\n apt install libavutil-dev\n apt install libsdl1.2-dev\n apt-get install ffmpeg\n if [ ${model_name} = \"PP-TSM\" ]; then\n # download pretrained weights\n wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate\n # export inference model\n ${python} tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml -p data/ppTSM_k400_uniform.pdparams -o ./inference/ppTSM\n elif [ ${model_name} = \"PP-TSN\" ]; then\n # download pretrained weights\n wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate\n # export inference model\n ${python} tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml -p data/ppTSN_k400.pdparams -o ./inference/ppTSN" + }, + { + "comment": "This code checks if the model_name is either \"PP-TSM\" or \"PP-TSN\". If it matches, it prepares lite infer data and downloads the corresponding inference model for serving. 
If not, it displays a message indicating that the model is not added into TIPC now.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":520-551", + "content": " else\n echo \"Not added into TIPC now.\"\n fi\nfi\nif [ ${MODE} = \"serving_infer_python\" ];then\n if [[ ${model_name} == \"PP-TSM\" ]];then\n # prepare lite infer data for serving\n pushd ./data\n mkdir python_serving_infer_video_dir\n cp ./example.avi python_serving_infer_video_dir/\n popd\n # prepare inference model\n mkdir ./inference\n pushd ./inference\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n unzip ppTSM.zip\n popd\n elif [[ ${model_name} == \"PP-TSN\" ]];then\n # prepare lite infer data for serving\n pushd ./data\n mkdir python_serving_infer_video_dir\n cp ./example.avi python_serving_infer_video_dir/\n popd\n # prepare inference model\n mkdir ./inference\n pushd ./inference\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip --no-check-certificate\n unzip ppTSN.zip\n popd\n else\n echo \"Not added into TIPC now.\"" + }, + { + "comment": "This code snippet checks the current mode and performs specific actions accordingly. If the mode is \"paddle2onnx_infer\", it installs paddle2onnx and onnxruntime with a specified Python interpreter. For the \"PP-TSM\" model, it displays a message indicating that it's not added to TIPC. For the \"PP-TSN\" model, it downloads and unzips the inference model from a specific URL. If the mode is not recognized, it indicates that the corresponding action is not available in TIPC.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/prepare.sh\":552-576", + "content": " fi\nfi\nif [ ${MODE} = \"paddle2onnx_infer\" ];then\n # install paddle2onnx\n python_name_list=$(func_parser_value \"${lines[2]}\")\n IFS='|'\n array=(${python_name_list})\n python_name=${array[0]}\n ${python_name} -m pip install paddle2onnx\n ${python_name} -m pip install onnxruntime==1.9.0\n if [ ${model_name} = \"PP-TSM\" ]; then\n echo \"Not added into TIPC now.\"\n elif [ ${model_name} = \"PP-TSN\" ]; then\n mkdir -p ./inference\n wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip\n # unzip inference model\n pushd ./inference\n unzip ppTSN.zip\n popd\n else\n echo \"Not added into TIPC now.\"\n fi\nfi" + } + ] +} \ No newline at end of file diff --git a/docs/doc/05c34c33-37bd-4d40-93e1-b9758902d882.json b/docs/doc/05c34c33-37bd-4d40-93e1-b9758902d882.json new file mode 100644 index 000000000..3f503b3a7 --- /dev/null +++ b/docs/doc/05c34c33-37bd-4d40-93e1-b9758902d882.json @@ -0,0 +1,50 @@ +{ + "summary": "This code defines functions for setting up imports, parsing command line arguments, and exporting PaddleVideo models. It includes model building, loading pretrained parameters, evaluating the model, providing input specifications, converting to static, saving, and printing saved model location.", + "details": [ + { + "comment": "This code snippet is the first 31 lines of the \"export_model.py\" file in PaddleVideo's tools directory. It sets up imports and defines a function parse_args(). This function uses argparse to create an argument parser for the script. The script seems to be part of a model exporting tool designed for PaddleVideo, possibly used for command line arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport os.path as osp\nimport sys\nimport paddle\nfrom paddle.jit import to_static\nfrom paddle.static import InputSpec\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom paddlevideo.modeling.builder import build_model\nfrom paddlevideo.utils import get_config\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo export model script\")" + }, + { + "comment": "This code block is parsing command line arguments to specify the config file path, pre-trained parameters path, override options, and output path for exporting a model. The exported files will include pdiparams and pdmodel.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":32-56", + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument('--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument(\"-p\",\n \"--pretrained_params\",\n default='./best.pdparams',\n type=str,\n help='params path')\n parser.add_argument(\"-o\",\n \"--output_path\",\n type=str,\n default=\"./inference\",\n help='output path')\n parser.add_argument('--save_name',\n type=str,\n default=None,\n help='specify the exported inference \\\n files(pdiparams and pdmodel) name,\\" + }, + { + "comment": "This code appears to be involved in model exporting and configuration trimming. It defines three functions: \"export_model\" parses command line arguments, \"trim_config\" removes unused or unnecessary attributes from the configuration, and \"get_input_spec\" sets the input specification based on the given model name. 
The code seems to be a part of PaddleVideo library and involves several specific models such as TSM, MoViNet, ppTSM, and ppTSMv2.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":57-86", + "content": " only used in TIPC')\n return parser.parse_args()\ndef trim_config(cfg):\n \"\"\"\n Reuse the trainging config will bring useless attributes, such as: backbone.pretrained model.\n and some build phase attributes should be overrided, such as: backbone.num_seg.\n Trim it here.\n \"\"\"\n model_name = cfg.model_name\n if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'):\n cfg.MODEL.backbone.pretrained = \"\" # not ued when inference\n # for distillation\n if cfg.MODEL.get('models'):\n if cfg.MODEL.models[0]['Teacher']['backbone'].get('pretrained'):\n cfg.MODEL.models[0]['Teacher']['backbone']['pretrained'] = \"\"\n if cfg.MODEL.models[1]['Student']['backbone'].get('pretrained'):\n cfg.MODEL.models[1]['Student']['backbone']['pretrained'] = \"\"\n return cfg, model_name\ndef get_input_spec(cfg, model_name):\n if model_name in ['ppTSM', 'TSM', 'MoViNet', 'ppTSMv2']:\n input_spec = [[\n InputSpec(\n shape=[None, cfg.num_seg, 3, cfg.target_size, cfg.target_size]," + }, + { + "comment": "The code snippet defines different input specifications based on the model name. It checks the model name and sets the shape and dtype of the input accordingly, handling various models such as 'PaddleVideo', 'TokenShiftVisionTransformer', 'TSN', 'ppTSN', 'BMN', 'TimeSformer', and 'ppTimeSformer'. The input specifications define the dimensions for inputs like number of frames, number of segments, channels, and target size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":87-116", + "content": " dtype='float32'),\n ]]\n elif model_name in ['TokenShiftVisionTransformer']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['TSN', 'ppTSN']:\n input_spec = [[\n InputSpec(shape=[\n None, cfg.num_seg * 10, 3, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['BMN']:\n input_spec = [[\n InputSpec(shape=[None, cfg.feat_dim, cfg.tscale],\n dtype='float32',\n name='feat_input'),\n ]]\n elif model_name in ['TimeSformer', 'ppTimeSformer']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['VideoSwin']:" + }, + { + "comment": "The code is defining input specifications for different model names in the PaddleVideo tool. It uses InputSpec to specify the shape and data type of inputs for each model, with varying numbers of inputs based on the model's requirements (e.g., RGB data, audio data, etc.). 
This allows the export_model function to handle various models appropriately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":117-142", + "content": " input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * cfg.seg_len * 1, cfg.target_size,\n cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['VideoSwin_TableTennis']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_seg * cfg.seg_len * 3, cfg.target_size,\n cfg.target_size\n ],\n dtype='float32'),\n ]]\n elif model_name in ['AttentionLSTM']:\n input_spec = [[\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],\n dtype='float32'), # for rgb_data\n InputSpec(shape=[\n None,\n ], dtype='int64'), # for rgb_len\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],\n dtype='float32'), # for rgb_mask\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],\n dtype='float32'), # for audio_data" + }, + { + "comment": "This code snippet defines input specifications for different models used in the PaddleVideo framework. It determines the shape and data type of inputs based on the model name provided, such as audio data for models like ResNet50, SlowFast, and temporal graph convolutional networks (TGCN) models like STGCN, AGCN, and CTRGCN. The shapes account for variables like number of frames, window size, and feature dimensions specific to each model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":143-171", + "content": " InputSpec(shape=[\n None,\n ], dtype='int64'), # for audio_len\n InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],\n dtype='float32'), # for audio_mask\n ]]\n elif model_name in ['SlowFast']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_frames // cfg.alpha, cfg.target_size,\n cfg.target_size\n ],\n dtype='float32',\n name='slow_input'),\n InputSpec(shape=[\n None, 3, cfg.num_frames, cfg.target_size, cfg.target_size\n ],\n dtype='float32',\n name='fast_input'),\n ]]\n elif model_name in ['STGCN', 'AGCN', 'CTRGCN']:\n input_spec = [[\n InputSpec(shape=[\n None, cfg.num_channels, cfg.window_size, cfg.vertex_nums,\n cfg.person_nums\n ],\n dtype='float32'),\n ]]\n # \u7531\u4e8e\u5728\u6a21\u578b\u8fd0\u884c\u8fc7\u7a0b\u4e2d\u6d89\u53ca\u5230\u7b2c\u4e00\u7ef4\u4e58human\u4e2a\u6570(N*M), \u6240\u4ee5\u8fd9\u91cc\u75281\u4f5c\u4e3ashape" + }, + { + "comment": "The code defines different input specifications for various model names. It handles models like AGCN2s, TransNetV2, MSTCN, ASRF, ADDs, and AVA_SlowFast_FastRcnn by specifying the shape of the input data and its data type ('float32'). 
The shapes are defined according to the specific model's input requirements.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":172-203", + "content": " elif model_name in ['AGCN2s']:\n input_spec = [[\n InputSpec(shape=[\n 1, cfg.num_channels, cfg.window_size, cfg.vertex_nums,\n cfg.person_nums\n ],\n dtype='float32'),\n ]]\n elif model_name in ['TransNetV2']:\n input_spec = [[\n InputSpec(shape=[\n None,\n cfg.num_frames,\n cfg.height,\n cfg.width,\n cfg.num_channels,\n ],\n dtype='float32'),\n ]]\n elif model_name in ['MSTCN', 'ASRF']:\n input_spec = [[\n InputSpec(shape=[None, cfg.num_channels, None], dtype='float32'),\n ]]\n elif model_name in ['ADDS']:\n input_spec = [[\n InputSpec(shape=[None, cfg.num_channels, cfg.height, cfg.width],\n dtype='float32'),\n ]]\n elif model_name in ['AVA_SlowFast_FastRcnn']:\n input_spec = [[\n InputSpec(shape=[\n None, 3, cfg.num_frames // cfg.alpha, cfg.target_size," + }, + { + "comment": "This code defines a function that takes in a model name and returns the input specification for different models. The input specification determines the shape, dtype, and name of the input tensors for each model. Different models have different input specifications based on their architecture and requirements. The returned input specification is used to build the model correctly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":204-235", + "content": " cfg.target_size\n ],\n dtype='float32',\n name='slow_input'),\n InputSpec(shape=[\n None, 3, cfg.num_frames, cfg.target_size, cfg.target_size\n ],\n dtype='float32',\n name='fast_input'),\n InputSpec(shape=[None, None, 4], dtype='float32', name='proposals'),\n InputSpec(shape=[None, 2], dtype='float32', name='img_shape')\n ]]\n elif model_name in ['PoseC3D']:\n input_spec = [[\n InputSpec(shape=[None, 1, 17, 48, 56, 56], dtype='float32'),\n ]]\n elif model_name in ['YOWO']:\n input_spec = [[\n InputSpec(shape=[\n 1, 3, cfg.num_seg, cfg.target_size, cfg.target_size\n ],\n dtype='float32'),\n ]]\n return input_spec\ndef main():\n args = parse_args()\n cfg, model_name = trim_config(\n get_config(args.config, overrides=args.override, show=False))\n print(f\"Building model({model_name})...\")" + }, + { + "comment": "Building the model, checking pretrained params are a file path, creating output directory if necessary, loading pretrained params, setting parameters to the model, evaluating the model, reppping layers if required, getting input specification, converting model to static, saving model with a specified name, and printing saved model location.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/export_model.py\":236-266", + "content": " model = build_model(cfg.MODEL)\n assert osp.isfile(\n args.pretrained_params\n ), f\"pretrained params ({args.pretrained_params} is not a file path.)\"\n if not os.path.isdir(args.output_path):\n os.makedirs(args.output_path)\n print(f\"Loading params from ({args.pretrained_params})...\")\n params = paddle.load(args.pretrained_params)\n model.set_dict(params)\n model.eval()\n # for rep nets\n for layer in model.sublayers():\n if hasattr(layer, \"rep\") and not getattr(layer, \"is_repped\"):\n layer.rep()\n input_spec = get_input_spec(cfg.INFERENCE, model_name)\n model = to_static(model, input_spec=input_spec)\n paddle.jit.save(\n model,\n osp.join(args.output_path,\n model_name if args.save_name is None else args.save_name))\n print(\n f\"model ({model_name}) has been already saved in 
({args.output_path}).\")\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/06d1fdf4-f110-42ec-a870-5d8c495e2d54.json b/docs/doc/06d1fdf4-f110-42ec-a870-5d8c495e2d54.json new file mode 100644 index 000000000..debb33540 --- /dev/null +++ b/docs/doc/06d1fdf4-f110-42ec-a870-5d8c495e2d54.json @@ -0,0 +1,15 @@ +{ + "summary": "This code utilizes ffmpeg to extract frames and PCM audio from video files, creating folders if necessary. It can process multiple MP4 files in parallel with up to 10 workers using the \"extract_frames\", \"extract_pcm\", and \"process\" functions.", + "details": [ + { + "comment": "This code retrieves frames and Pulse Code Modulation (PCM) audio from video files. It uses the ffmpeg tool for extraction, creating folders if they don't exist already, and removes existing files before processing new ones. The \"extract_frames\" function takes a video name and output folder to extract frames at a specified frame rate. The \"extract_pcm\" function converts audio from a video file to PCM format using ffmpeg. The \"process\" function prints each line, presumably for tracking progress or errors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_frames_pcm.py\":0-36", + "content": "\"\"\"\nget frames and pcm from video\n\"\"\"\nimport os\nfrom concurrent import futures\ndataset = \"../EuroCup2016\"\nurl_list = os.path.join(dataset, 'url.list')\ndst_frames = os.path.join(dataset, 'frames')\ndst_pcm = os.path.join(dataset, 'pcm')\nif not os.path.exists(dst_frames):\n os.mkdir(dst_frames)\nif not os.path.exists(dst_pcm):\n os.mkdir(dst_pcm)\ndef extract_frames(video_name, out_folder, fps=5):\n if os.path.exists(out_folder):\n os.system('rm -rf ' + out_folder + '/*')\n os.system('rm -rf ' + out_folder)\n os.makedirs(out_folder)\n cmd = 'ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (video_name, fps,\n out_folder, '%08d')\n os.system(cmd)\ndef extract_pcm(video_name, file_name_pcm):\n cmd = 'ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' % (\n video_name, file_name_pcm)\n os.system(cmd)\ndef process(line):\n print(line)\n mp4_name = os.path.join(dataset, line)\n basename = os.path.basename(line).split('.')[0]\n folder_frame = os.path.join(dst_frames, basename)" + }, + { + "comment": "Code is reading a list of URLs, extracting frames and audio from each MP4 file, then executing the process in multiple threads with up to 10 workers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_frames_pcm.py\":37-53", + "content": " filename_pcm = os.path.join(dst_pcm, basename + '.pcm')\n # extract\n extract_frames(mp4_name, folder_frame)\n extract_pcm(mp4_name, filename_pcm)\nif __name__ == \"__main__\":\n with open(url_list, 'r') as f:\n lines = f.readlines()\n lines = [k.strip() for k in lines]\n # multi thread\n with futures.ProcessPoolExecutor(max_workers=10) as executer:\n fs = [executer.submit(process, line) for line in lines]\n #for line in lines:\n # process(line)\n print(\"done\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/06f299fd-a77a-4245-8915-86813f4c644b.json b/docs/doc/06f299fd-a77a-4245-8915-86813f4c644b.json new file mode 100644 index 000000000..059876222 --- /dev/null +++ b/docs/doc/06f299fd-a77a-4245-8915-86813f4c644b.json @@ -0,0 +1,10 @@ +{ + "summary": "This code provides information about customizing different aspects of the PaddleVideo framework, including dataset, 
network, solvers, metrics, and debug tools. It discusses finetuning, adding new augmentations and batch augments in the pipeline, modular design, changing frameworks, initializing functions, loss functions, step/epoch decay, creating customized solvers, adding new data processing, records, and metrics, as well as using debug levels and FAQ.", + "details": [ + { + "comment": "This code provides information about customizing different aspects of the PaddleVideo framework, including dataset, network, solvers, metrics, and debug tools. It discusses finetuning, adding new augmentations and batch augments in the pipeline, modular design, changing frameworks, initializing functions, loss functions, step/epoch decay, creating customized solvers, adding new data processing, records, and metrics, as well as using debug levels and FAQ.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/customized_usage.md\":0-43", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/tutorials/customized_usage.md) | English\n# Customized Usage\n## Customized Dataset\n1. finetune\nPlease refer to [finetune](../start.md#model_finetune) if only change a \"regular\" dataset.\n2. customized pipeline\n - add new augments\n - add new batch augments\n **Note**: Be care of checking the difference of different modes.\n## Customized Network\n1. module function\nPlease refer to [modular desigh](modular_design.md) for more information.\n2. customized framework\n - change framework\n - change initialized function\n - customized loss\n## Customized Solvers\n1. step decay and epoch decay\n2. customized solvers\n## Customized metrics\n - add new data processing\n - add new record\n - add new metrics\n## Debug tools\n1. Debug level\n2. FAQ" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0754c534-e1de-4f8c-b0c1-e9d4562e2b1d.json b/docs/doc/0754c534-e1de-4f8c-b0c1-e9d4562e2b1d.json new file mode 100644 index 000000000..874ce33fb --- /dev/null +++ b/docs/doc/0754c534-e1de-4f8c-b0c1-e9d4562e2b1d.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines the L1Loss class for computing the L1 loss, commonly used in image and video quality assessment tasks. The code calculates the L1 loss between 'score' and 'labels', ensuring compatible data types, and returns the resulting loss.", + "details": [ + { + "comment": "This code defines a class called L1Loss that extends BaseWeightedLoss and implements the forward function for computing the L1 loss. The L1 loss is commonly used in image and video quality assessment tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py\":0-32", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass L1Loss(BaseWeightedLoss):\n \"\"\"L1 Loss.\"\"\"\n def _forward(self, score, labels):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels.\n Returns:\n loss (paddle.Tensor): The returned L1 loss." + }, + { + "comment": "This code snippet calculates the L1 loss between 'score' and 'labels', ensuring they have compatible data types, and then returns the resulting loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py\":33-37", + "content": " \"\"\"\n labels = labels.astype(score.dtype)\n loss = F.l1_loss(score, labels)\n return loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/07e62c03-840e-4b69-a731-c0a13de042f6.json b/docs/doc/07e62c03-840e-4b69-a731-c0a13de042f6.json new file mode 100644 index 000000000..c17624a2b --- /dev/null +++ b/docs/doc/07e62c03-840e-4b69-a731-c0a13de042f6.json @@ -0,0 +1,75 @@ +{ + "summary": "The VOSMetric class initializes attributes, performs data processing and augmentation for video object segmentation, measures frame rates, handles flipped labels, and frees memory. It also tracks sequences, compresses files into a zip, creates image masks, aggregates metrics, and logs results.", + "details": [ + { + "comment": "This code defines a class VOSMetric that inherits from BaseMetric and is registered in the METRIC registry. It takes data_size, batch_size, result_root, zip_dir, and log_interval as parameters for metrics preparation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":0-37", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport paddle\nimport zipfile\nimport time\nfrom PIL import Image\nfrom paddle.io import DataLoader\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom EIVideo.paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass VOSMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n result_root,\n zip_dir,\n log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)" + }, + { + "comment": "The code initializes a class with attributes to store video processing information, such as the total number of videos, count, result root, and zip directory. The update method takes batch ID, data, and model as inputs and processes each sequence by incrementing the video number and logging the current processed sequence. It then creates a data loader, calculates the total time and frame count for the current sequence, and stores reference embeddings and masks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":38-66", + "content": " self.video_num = 0\n self.total_time = 0\n self.total_frame = 0\n self.total_sfps = 0\n self.total_video_num = data_size\n self.count = 0\n self.result_root = result_root\n self.zip_dir = zip_dir\n def update(self, batch_id, data, model):\n \"\"\"update metrics during each iter\n \"\"\"\n self.video_num += 1\n seq_dataset = data\n seq_name = seq_dataset.seq_name\n logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name,\n self.video_num,\n self.total_video_num))\n seq_dataloader = DataLoader(seq_dataset,\n return_list=True,\n batch_size=1,\n shuffle=False,\n num_workers=0)\n seq_total_time = 0\n seq_total_frame = 0\n ref_embeddings = []\n ref_masks = []\n prev_embedding = []" + }, + { + "comment": "The code initializes empty lists for reference embeddings, reference masks, previous embeddings, and previous masks. It then uses Paddle's no_grad context to iterate over a data loader with multiple samples. For each sample, it checks if the corresponding reference embedding and mask lists are long enough, appending them if necessary. 
It assigns current image, label, previous embedding, and mask values from the sample and converts the label into a tensor for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":67-89", + "content": " prev_mask = []\n with paddle.no_grad():\n for frame_idx, samples in enumerate(seq_dataloader):\n time_start = time.time()\n all_preds = []\n join_label = None\n for aug_idx in range(len(samples)):\n if len(ref_embeddings) <= aug_idx:\n ref_embeddings.append([])\n ref_masks.append([])\n prev_embedding.append(None)\n prev_mask.append(None)\n sample = samples[aug_idx]\n ref_emb = ref_embeddings[aug_idx]\n ref_m = ref_masks[aug_idx]\n prev_emb = prev_embedding[aug_idx]\n prev_m = prev_mask[aug_idx]\n current_img = sample['current_img']\n if 'current_label' in sample.keys():\n current_label = sample['current_label']\n current_label = paddle.to_tensor(current_label)" + }, + { + "comment": "This code is a part of the PaddleVideo framework, specifically for the EIVideo application. It prepares the data for model input and then runs it through the model to generate predictions and embeddings. If the current label is None (first frame), it logs an information message. The code also keeps track of references embeddings based on augmentation index (aug_idx) in ref_embeddings list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":90-113", + "content": " else:\n current_label = None\n obj_num = sample['meta']['obj_num']\n imgname = sample['meta']['current_name']\n ori_height = sample['meta']['height']\n ori_width = sample['meta']['width']\n current_img = current_img\n obj_num = obj_num\n bs, _, h, w = current_img.shape\n data_batch = [\n ref_emb, ref_m, prev_emb, prev_m, current_img,\n [ori_height, ori_width], obj_num\n ]\n all_pred, current_embedding = model(data_batch,\n mode='test')\n if frame_idx == 0:\n if current_label is None:\n logger.info(\n \"No first frame label in Seq {}.\".format(\n seq_name))\n ref_embeddings[aug_idx].append(current_embedding)" + }, + { + "comment": "This code appears to be part of a video object segmentation model, specifically for the YouTube-VOS task. It checks if there are new objects and updates labels accordingly while maintaining reference masks, previous embeddings, and all predictions. The code also handles flipping based on the 'meta' information of each sample.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":114-130", + "content": " ref_masks[aug_idx].append(current_label)\n prev_embedding[aug_idx] = current_embedding\n prev_mask[aug_idx] = current_label\n else:\n if sample['meta']['flip']: #False\n all_pred = self.flip_tensor(all_pred, 3)\n # In YouTube-VOS, not all the objects appear in the first frame for the first time. Thus, we\n # have to introduce new labels for new objects, if necessary.\n if not sample['meta']['flip'] and not (\n current_label is None) and join_label is None:\n join_label = paddle.cast(current_label,\n dtype='int64')\n all_preds.append(all_pred)\n if current_label is not None:\n ref_embeddings[aug_idx].append(current_embedding)\n prev_embedding[aug_idx] = current_embedding" + }, + { + "comment": "This code segment is performing data augmentation and label averaging. It first concatenates previous predictions, then calculates the mean to average the results from different augmentations. 
If a join_label exists, it performs element-wise multiplication with a keep mask to combine with current_label. Finally, it reshapes pred_label into a 1x1xori_heightxori_width tensor and flips it along the second dimension using self.flip_tensor function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":132-147", + "content": " if frame_idx > 0:\n all_preds = paddle.concat(all_preds, axis=0)\n all_preds = paddle.mean(\n all_preds, axis=0) #average results if augmentation\n pred_label = paddle.argmax(all_preds, axis=0)\n if join_label is not None:\n join_label = paddle.squeeze(paddle.squeeze(join_label,\n axis=0),\n axis=0)\n keep = paddle.cast((join_label == 0), dtype=\"int64\")\n pred_label = pred_label * keep + join_label * (1 -\n keep)\n pred_label = pred_label\n current_label = paddle.reshape(\n pred_label, shape=[1, 1, ori_height, ori_width])\n flip_pred_label = self.flip_tensor(pred_label, 1)" + }, + { + "comment": "This code generates flipped labels for each frame of a video sequence. It checks the 'flip' flag in the sample metadata and appends either the current or flipped label to the corresponding list. The code also calculates the time taken per frame and updates total sequence time and frame count. Finally, it logs the frame number, object count, and time taken.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":148-167", + "content": " flip_current_label = paddle.reshape(\n flip_pred_label, shape=[1, 1, ori_height, ori_width])\n for aug_idx in range(len(samples)):\n if join_label is not None:\n if samples[aug_idx]['meta']['flip']:\n ref_masks[aug_idx].append(flip_current_label)\n else:\n ref_masks[aug_idx].append(current_label)\n if samples[aug_idx]['meta']['flip']:\n prev_mask[aug_idx] = flip_current_label\n else:\n prev_mask[\n aug_idx] = current_label #update prev_mask\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n seq_total_frame += 1\n obj_num = float(obj_num)\n logger.info('Frame: {}, Obj Num: {}, Time: {}'.format(" + }, + { + "comment": "This code calculates the average time per frame for each sequence and updates the total time, total frames, and average speed. It then calculates the average speed across all sequences. The code uses a \"else\" statement to handle cases where no object is detected in an image and calculates the time taken for processing that image. 
It also deletes variables used within the loop to free up memory before moving on to the next sequence or batch of images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":168-191", + "content": " imgname[0], obj_num, one_frametime))\n self.save_mask(\n pred_label,\n os.path.join(self.result_root, seq_name,\n imgname[0].split('.')[0] + '.png'))\n else:\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n logger.info('Ref Frame: {}, Time: {}'.format(\n imgname[0], one_frametime))\n del (ref_embeddings)\n del (ref_masks)\n del (prev_embedding)\n del (prev_mask)\n del (seq_dataset)\n del (seq_dataloader)\n seq_avg_time_per_frame = seq_total_time / seq_total_frame\n self.total_time += seq_total_time\n self.total_frame += seq_total_frame\n total_avg_time_per_frame = self.total_time / self.total_frame\n self.total_sfps += seq_avg_time_per_frame\n avg_sfps = self.total_sfps / (batch_id + 1)" + }, + { + "comment": "Logger is reporting sequence frame rate, total frame rate and frame rate per sequence.\nFunction flips tensor along a specified dimension.\nFunction saves a mask tensor to a given path using specific palette colors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":192-210", + "content": " logger.info(\"Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}\".format(\n seq_name, 1. / seq_avg_time_per_frame,\n 1. / total_avg_time_per_frame, 1. / avg_sfps))\n def flip_tensor(self, tensor, dim=0):\n inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1),\n dtype=\"int64\")\n tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim)\n return tensor\n def save_mask(self, mask_tensor, path):\n _palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128,\n 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191,\n 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0,\n 64, 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64,\n 128, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26,\n 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32,\n 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38," + }, + { + "comment": "This code represents a sequence of numbers, potentially used for various purposes within the codebase such as tracking or counting.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":211-223", + "content": " 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43,\n 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49,\n 49, 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55,\n 55, 55, 56, 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60,\n 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66,\n 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72,\n 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77,\n 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83,\n 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89,\n 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94,\n 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104,\n 104, 105, 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108," + }, + { + "comment": "The code contains 
a sequence of numbers ranging from 109 to 169, which could be used as array indices or other numeric identifiers in the following lines. Without further context, it's difficult to determine the exact purpose of these numbers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":224-237", + "content": " 109, 109, 109, 110, 110, 110, 111, 111, 111, 112, 112, 112, 113,\n 113, 113, 114, 114, 114, 115, 115, 115, 116, 116, 116, 117, 117,\n 117, 118, 118, 118, 119, 119, 119, 120, 120, 120, 121, 121, 121,\n 122, 122, 122, 123, 123, 123, 124, 124, 124, 125, 125, 125, 126,\n 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130, 130,\n 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134,\n 135, 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139,\n 139, 139, 140, 140, 140, 141, 141, 141, 142, 142, 142, 143, 143,\n 143, 144, 144, 144, 145, 145, 145, 146, 146, 146, 147, 147, 147,\n 148, 148, 148, 149, 149, 149, 150, 150, 150, 151, 151, 151, 152,\n 152, 152, 153, 153, 153, 154, 154, 154, 155, 155, 155, 156, 156,\n 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160, 160, 160,\n 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169," + }, + { + "comment": "This code snippet appears to be a list or sequence of numbers, possibly representing frame coordinates, timestamps, or some other numerical data used in video processing or analysis. The specific application and purpose would require further context from the surrounding code and documentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":238-251", + "content": " 169, 170, 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173,\n 174, 174, 174, 175, 175, 175, 176, 176, 176, 177, 177, 177, 178,\n 178, 178, 179, 179, 179, 180, 180, 180, 181, 181, 181, 182, 182,\n 182, 183, 183, 183, 184, 184, 184, 185, 185, 185, 186, 186, 186,\n 187, 187, 187, 188, 188, 188, 189, 189, 189, 190, 190, 190, 191,\n 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195, 195,\n 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199,\n 200, 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204,\n 204, 204, 205, 205, 205, 206, 206, 206, 207, 207, 207, 208, 208,\n 208, 209, 209, 209, 210, 210, 210, 211, 211, 211, 212, 212, 212,\n 213, 213, 213, 214, 214, 214, 215, 215, 215, 216, 216, 216, 217,\n 217, 217, 218, 218, 218, 219, 219, 219, 220, 220, 220, 221, 221,\n 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225, 225, 225,\n 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230," + }, + { + "comment": "This code snippet seems to define a function `zip_folder` that compresses files within a source folder into a zip file. It also includes a nested function that creates and saves an image mask, potentially for visualization purposes. 
However, the context or specific functionality of these functions is not clear without additional information about the larger codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":252-270", + "content": " 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234,\n 234, 235, 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238,\n 239, 239, 239, 240, 240, 240, 241, 241, 241, 242, 242, 242, 243,\n 243, 243, 244, 244, 244, 245, 245, 245, 246, 246, 246, 247, 247,\n 247, 248, 248, 248, 249, 249, 249, 250, 250, 250, 251, 251, 251,\n 252, 252, 252, 253, 253, 253, 254, 254, 254, 255, 255, 255\n ]\n mask = mask_tensor.cpu().numpy().astype('uint8')\n mask = Image.fromarray(mask).convert('P')\n mask.putpalette(_palette)\n mask.save(path)\n def zip_folder(self, source_folder, zip_dir):\n f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED)\n pre_len = len(os.path.dirname(source_folder))\n for dirpath, dirnames, filenames in os.walk(source_folder):\n for filename in filenames:\n pathfile = os.path.join(dirpath, filename)\n arcname = pathfile[pre_len:].strip(os.path.sep)" + }, + { + "comment": "This code writes data to a file and then closes it. It defines a function called accumulate that aggregates metrics when all iterations are complete. This function zips the folder, creates a zip directory, and logs a message indicating that the result is saved in the specified directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py\":271-278", + "content": " f.write(pathfile, arcname)\n f.close()\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n self.zip_folder(self.result_root, self.zip_dir)\n logger.info('Save result to {}.'.format(self.zip_dir))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/07eb3e9e-e741-437f-bd66-a0e0e9f673a5.json b/docs/doc/07eb3e9e-e741-437f-bd66-a0e0e9f673a5.json new file mode 100644 index 000000000..8ee14dd99 --- /dev/null +++ b/docs/doc/07eb3e9e-e741-437f-bd66-a0e0e9f673a5.json @@ -0,0 +1,40 @@ +{ + "summary": "This code introduces TSN, a 2D-CNN-based video classification solution that utilizes sparse sampling and ResNet-50 as its backbone. It trains on Kinetics-400 dataset with pre-trained weights, provides data preparation/model config details, tests different methods/backbones, and exports an \"TSN\" inference model.", + "details": [ + { + "comment": "This code introduces TSN (Temporal Segment Network), a 2D-CNN-based solution for video classification. It uses sparse sampling to capture global information, reduce redundancy, and decrease computational burden. The model is based on single-channel RGB images and utilizes ResNet-50 as the backbone.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md\":0-19", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/tsn.md) | English\n# TSN\n## Content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Details](#Details)\n- [Reference](#Reference)\n## Introduction\nTemporal Segment Network (TSN) is a classic 2D-CNN-based solution in the field of video classification. 
This method mainly solves the problem of long-term behavior recognition of video, and replaces dense sampling by sparsely sampling video frames, which can not only capture the global information of the video, but also remove redundancy and reduce the amount of calculation. The core idea is to average the features of each frame as the overall feature of the video, and then enter the classifier for classification. The model implemented by this code is a TSN network based on a single-channel RGB image, and Backbone uses the ResNet-50 structure.\n
\n
" + }, + { + "comment": "This code provides instructions for training the Temporal Segment Networks model on the Kinetics-400 dataset. It explains how to download and add pre-trained ResNet50 weights as initialization parameters, and specifies where to find more information about data preparation and model configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md\":20-47", + "content": "
\nFor details, please refer to the ECCV 2016 paper [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)\n## Data\nPaddleVide provides training and testing scripts on the Kinetics-400 dataset. Kinetics-400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Load the ResNet50 weights trained on ImageNet1000 as Backbone initialization parameters [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams), or download through the command line\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/tsn/tsn_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNet\"" + }, + { + "comment": "Start training: Use Kinetics-400 dataset and 8 GPUs for training, command to start the training process.\nTest: TSN model test mode uses TenCrop method for better accuracy, different from training's CenterCrop; obtain final index by testing best model after training completes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md\":48-64", + "content": " pretrained: fill in the path here\n ```\n#### Start training\n- Kinetics-400 data set uses 8 cards for training, the training start command for frames format data is as follows\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsn main.py --validate -c configs/recognition/ tsn/tsn_k400_frames.yaml\n ```\n## Test\nSince the sampling method of the TSN model test mode is **TenCrop** with a slower speed but higher accuracy, which is different from the **CenterCrop** used in the verification mode during the training process, the verification index `topk Acc` recorded in the training log It does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. The command is as follows:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsn main.py --test -c configs/recognition/ tsn/tsn_k400_frames.yaml -w \"output/TSN/TSN_best.pdparams\"" + }, + { + "comment": "The code is providing test indicator results for TSN model on the validation dataset of Kinetics-400 using different backbone, sampling methods, and training strategies. It also shows the checkpoints' URLs. 
Additionally, it exports an inference model named \"TSN\" into a folder called \"inference/TSN\" from the specified configuration file, model parameters, and output directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md\":65-80", + "content": "```\nWhen the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :-------------: | :---------------: | :-----: | :---------: | :---: | :----------------------------------------------------------: |\n| ResNet50 | TenCrop | NCHW | 3 | 224 | 69.81 | [TSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) |\n| ResNet50 | TenCrop | NCHW | 8 | 224 | 71.70 | [TSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400_8.pdparams) |\n## Inference\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml \\\n -p data/TSN_k400.pdparams \\\n -o inference/TSN" + }, + { + "comment": "This code is for generating and using the TSN model in PaddlePaddle for video recognition. It generates a model structure file (TSN.pdmodel) and weight file (TSN.pdiparams), and then uses predict.py to predict the labels of frames from a video file (example.avi) using the generated files, with GPU acceleration enabled. The model reads frames sparsely sampled from videos in the Kinetics-400 dataset, divides them into segments, extracts one frame per segment, and applies random data augmentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md\":81-102", + "content": "```\nThe above command will generate the model structure file `TSN.pdmodel` and the model weight file `TSN.pdiparams` required for prediction.\nFor the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-Model Reasoning)\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/tsn/tsn_k400_frames.yaml \\\n --model_file inference/TSN/TSN.pdmodel \\\n --params_file inference/TSN/TSN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n## Details\n**data processing:**\n- The model reads the `mp4` data in the Kinetics-400 data set, first divides each piece of video data into `num_seg` segments, and then evenly extracts 1 frame of image from each segment to obtain sparsely sampled `num_seg` video frames , And then do the same random da" + }, + { + "comment": "Enhances `num_seg` frame image with multi-scale random cropping, flips, normalization, and zooms to `target_size`. Momentum optimization is used for training, L2 decay with 1e-4 attenuation coefficient, global gradient clipping with a factor of 40.0. Total epochs are 100, learning rate decreases at epochs 40 and 80, dropout_ratio=0.4. 
KaimingNormal and Constant initializers used for convolutional layers and FC layer weights, respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md\":102-118", + "content": "ta enhancement to this `num_seg` frame image, including multi-scale random cropping, random left and right flips, data normalization, etc., and finally zoom to `target_size`\n**training strategy:**\n- Use Momentum optimization algorithm for training, momentum=0.9\n- Using L2_Decay, the weight attenuation coefficient is 1e-4\n- Use global gradient clipping, with a clipping factor of 40.0\n- The total number of epochs is 100, and the learning rate will be attenuated by 0.1 times when the epoch reaches 40 and 80\n- Dropout_ratio=0.4\n**parameter initialization**\n- The convolutional layer of the TSN model uses Paddle's default [KaimingNormal](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/KaimingNormal_cn.html#kaimingnormal) and [Constant](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/Constant_cn.html#constant) initialization method, with Normal(mean=0, std= 0.01) normal distribution to initialize the weight of the FC layer, and a constant 0 to initialize the bias of the FC layer" + }, + { + "comment": "The code contains a reference to the paper \"Temporal Segment Networks: Towards Good Practices for Deep Action Recognition\" by Limin Wang et al., which provides information on the implementation of TSN model in PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn.md\":120-122", + "content": "## Reference\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool" + } + ] +} \ No newline at end of file diff --git a/docs/doc/08106cdb-f567-4dfc-9b16-c78a14078db6.json b/docs/doc/08106cdb-f567-4dfc-9b16-c78a14078db6.json new file mode 100644 index 000000000..893481a05 --- /dev/null +++ b/docs/doc/08106cdb-f567-4dfc-9b16-c78a14078db6.json @@ -0,0 +1,25 @@ +{ + "summary": "This code implements AGCN model for improved ST-GCN accuracy on FSD-10 and NTU-RGBD datasets, achieving high Top-1 accuracies. It provides instructions for data preparation, training, testing, inference, evaluation, and exports an AGCN model for video recognition using Multi-stream Adaptive Graph Convolutional Networks.", + "details": [ + { + "comment": "This code describes the Adaptive Graph Convolution Network (AGCN) implementation for improving the accuracy of ST-GCN, trained on FSD-10 and NTU-RGBD datasets. 
It provides instructions for data preparation, training, testing, and inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn.md\":0-45", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/agcn.md) | English\n# AGCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe implemented Adaptive Graph Convolution Network to improve the accuracy of [ST-GCN](./stgcn.md).\n## Data\nPlease refer to FSD-10 data download and preparation doc [FSD](../../dataset/fsd.md)\nPlease refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)\n## Train\n### Train on FSD\n- Train AGCN on FSD scripts:\n```bash\npython3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml\n```\n- Turn off `valid` when training, as validation dataset is not available for the competition.\n### Train on NTU-RGBD\n- Train AGCN on NTU-RGBD scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_agcn main.py --validate -c configs/recognition/agcn/agcn_ntucs.yaml\n```\n- config file `agcn_ntucs.yaml` corresponding to the config of AGCN on NTU-RGB+D dataset with cross-subject splits." + }, + { + "comment": "This code provides test scripts to evaluate the performance of the AGCN model on two datasets: FSD and NTU-RGB+D. The test scripts require specifying a configuration file (-c) and a weight path (-w). Evaluation results are saved in submission.csv, with final scores available on the competition website. Testing on FSD dataset returns a Top-1 accuracy of 62.29, while testing on NTU-RGB+D dataset (cross-subject split) returns a Top-1 accuracy of 83.27. The respective model checkpoints are also provided as links for further exploration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn.md\":48-83", + "content": "## Test\n### Test onf FSD\n- Test scripts\uff1a\n```bash\npython3.7 main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -w output/AGCN/AGCN_epoch_00100.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\n- Evaluation results will be saved in `submission.csv` file, final score can be obtained in [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115).\nAccuracy on FSD dataset:\n| Test_Data | Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| Test_A | 62.29 | [AGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams)|\n### Test on NTU-RGB+D\n- Test scripts\uff1a\n```bash\npython3.7 main.py --test -c configs/recognition/agcn/agcn_ntucs.yaml -w output/AGCN/AGCN_best.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| split | Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| cross-subject | 83.27 | [AGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_ntucs.pdparams)|" + }, + { + "comment": "This code provides instructions on how to export and use an inference model called AGCN for video recognition. It shows the command to obtain the architecture file (AGCN.pdmodel) and parameter file (AGCN.pdiparams), as well as an example of how to run prediction using the provided files, specifying input data, configuration, and whether to use GPU or not. 
The output includes the top-1 class and its corresponding score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn.md\":86-116", + "content": "## Inference\n### export inference model\n To get model architecture file `AGCN.pdmodel` and parameters file `AGCN.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml \\\n -p data/AGCN_fsd.pdparams \\\n -o inference/AGCN\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \\\n --config configs/recognition/agcn/agcn_fsd.yaml \\\n --model_file inference/AGCN/AGCN.pdmodel \\\n --params_file inference/AGCN/AGCN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/fsd10/example_skeleton.npy\n top-1 class: 27\n top-1 score: 0.8965644240379333" + }, + { + "comment": "This code snippet implements a Multi-stream Adaptive Graph Convolutional Network for skeleton-based action recognition. It utilizes two input streams (spatial and temporal) to process the data and applies adaptive graph convolution on each stream separately, followed by concatenation of the two streams before being passed through MLP and softmax layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn.md\":117-128", + "content": "```\n## Reference\n- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin\n- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu\n- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu\n- Many thanks to [li7819559](https://github.com/li7819559) and [ZhaoJingjing713](https://github.com/ZhaoJingjing713) for contributing the code." + } + ] +} \ No newline at end of file diff --git a/docs/doc/08612506-c012-4cf5-bdad-9241be0885d5.json b/docs/doc/08612506-c012-4cf5-bdad-9241be0885d5.json new file mode 100644 index 000000000..8bbcce155 --- /dev/null +++ b/docs/doc/08612506-c012-4cf5-bdad-9241be0885d5.json @@ -0,0 +1,140 @@ +{ + "summary": "The code introduces a DropPath layer, Swin Transformer backbone with window-based multi-head attention for image processing, and implements the Swin Transformer Block 3D in PaddleVideo, which also features a 3D PatchEmbed3D and 3D backbone.", + "details": [ + { + "comment": "Copyright notice, import statements, and drop_path function definition for stochastic depth in residual blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":0-32", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom functools import lru_cache, reduce\nfrom operator import mul\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\ndef drop_path(x, drop_prob=0., training=False):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)." + }, + { + "comment": "This code snippet defines a \"DropPath\" layer that applies drop paths (Stochastic Depth) to the input, based on the provided drop probability. The drop paths are applied in the main path of residual blocks for each sample. This class also includes a forward method that drops out elements from the input with the specified probability during training but returns the original input unchanged when not training or if the drop probability is 0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":33-63", + "content": " the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)\n random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Mlp(nn.Layer):\n \"\"\" Multilayer perceptron.\"\"\"\n def __init__(self,\n in_features,\n hidden_features=None," + }, + { + "comment": "The code above defines a layer for the Swin Transformer backbone. It contains two linear layers, an activation function (GELU), and a dropout layer. 
The `window_partition` function partitions input tensor based on specified window size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":64-98", + "content": " out_features=None,\n act_layer=nn.GELU,\n drop=0.):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)\n self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\ndef window_partition(x, window_size):\n \"\"\"window_partition\n Args:\n x (Tensor): x.shape = [B, D, H, W, C]\n window_size (tuple[int]): window_size\n Returns:\n Tensor: (B*num_windows, window_size*window_size, C)\n \"\"\"\n B, D, H, W, C = x.shape\n x = x.reshape([\n B, D // window_size[0], window_size[0], H // window_size[1],\n window_size[1], W // window_size[2], window_size[2], C\n ])\n windows = x.transpose([0, 1, 3, 5, 2, 4, 6," + }, + { + "comment": "The code defines a function `window_reverse` that takes a set of windows and rearranges them back into the original image shape. The `get_window_size` function determines the appropriate window size based on input dimensions. Both functions are used in the Swin Transformer backbone model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":99-136", + "content": " 7]).reshape([-1, reduce(mul, window_size), C])\n return windows\nclass Identity(nn.Layer):\n def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\ndef window_reverse(windows, window_size, B, D, H, W):\n \"\"\"\n Args:\n windows: (B*num_windows, window_size, window_size, C)\n window_size (tuple[int]): Window size\n H (int): Height of image\n W (int): Width of image\n Returns:\n x: (B, D, H, W, C)\n \"\"\"\n x = windows.reshape([\n B, D // window_size[0], H // window_size[1], W // window_size[2],\n window_size[0], window_size[1], window_size[2], -1\n ])\n x = x.transpose([0, 1, 4, 2, 5, 3, 6, 7]).reshape([B, D, H, W, -1])\n return x\ndef get_window_size(x_size, window_size, shift_size=None):\n use_window_size = list(window_size)\n if shift_size is not None:\n use_shift_size = list(shift_size)\n for i in range(len(x_size)):\n if x_size[i] <= window_size[i]:\n use_window_size[i] = x_size[i]" + }, + { + "comment": "This code defines a class called \"WindowAttention3D\" which implements a window-based multi-head self attention module with relative position bias. It supports both shifted and non-shifted windows, and takes in parameters such as the number of input channels (dim), temporal length, height and width of the window (window_size), number of attention heads (num_heads), whether to add a learnable bias to query, key, value (qkv_bias), override default qk scale of head_dim ** -0.5 if set (qk_scale), dropout ratio of attention weight (attn_drop), and dropout ratio of output (proj_drop). 
The function at the top part of the code determines whether to use window or shift size based on a given value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":137-160", + "content": " if shift_size is not None:\n use_shift_size[i] = 0\n if shift_size is None:\n return tuple(use_window_size)\n else:\n return tuple(use_window_size), tuple(use_shift_size)\nclass WindowAttention3D(nn.Layer):\n \"\"\" Window based multi-head self attention (W-MSA) module with relative position bias.\n It supports both of shifted and non-shifted window.\n Args:\n dim (int): Number of input channels.\n window_size (tuple[int]): The temporal length, height and width of the window.\n num_heads (int): Number of attention heads.\n qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set\n attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0\n proj_drop (float, optional): Dropout ratio of output. Default: 0.0\n \"\"\"\n def __init__(self,\n dim,\n window_size," + }, + { + "comment": "This code initializes the Swin Transformer's self-attention module. It defines a window size and number of attention heads, calculates head dimensions, sets up position bias table, and adds parameters for position bias table and head dimensions. The code also creates coordinate arrays for dimension and height inside the window.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":161-184", + "content": " num_heads,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.,\n proj_drop=0.):\n super().__init__()\n self.dim = dim\n self.window_size = window_size # Wd, Wh, Ww\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n # define a parameter table of relative position bias\n self.relative_position_bias_table = self.create_parameter(\n shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1) *\n (2 * window_size[2] - 1), num_heads),\n default_initializer=zeros_,\n ) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH\n self.add_parameter(\"relative_position_bias_table\",\n self.relative_position_bias_table)\n # get pair-wise relative position index for each token inside the window\n coords_d = paddle.arange(self.window_size[0])\n coords_h = paddle.arange(self.window_size[1])" + }, + { + "comment": "This code performs relative position encoding for the Swin Transformer by calculating relative coordinates of patches within a sliding window. It first creates 2D and 3D coordinate grids, then subtracts them to obtain relative positions. 
Finally, it shifts and scales the relative coordinates to fit the range of the window size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":185-203", + "content": " coords_w = paddle.arange(self.window_size[2])\n coords = paddle.stack(paddle.meshgrid(coords_d, coords_h,\n coords_w)) # 3, Wd, Wh, Ww\n coords_flatten = paddle.flatten(coords, 1) # 3, Wd*Wh*Ww\n relative_coords = coords_flatten.unsqueeze(\n axis=2) - coords_flatten.unsqueeze(axis=1) # 3, Wd*Wh*Ww, Wd*Wh*Ww\n # relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) # 3, Wd*Wh*Ww, Wd*Wh*Ww\n relative_coords = relative_coords.transpose([1, 2, 0\n ]) # Wd*Wh*Ww, Wd*Wh*Ww, 3\n relative_coords[:, :,\n 0] += self.window_size[0] - 1 # shift to start from 0\n relative_coords[:, :, 1] += self.window_size[1] - 1\n relative_coords[:, :, 2] += self.window_size[2] - 1\n relative_coords[:, :, 0] *= (2 * self.window_size[1] -\n 1) * (2 * self.window_size[2] - 1)\n relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1)" + }, + { + "comment": "This code initializes a Swin Transformer backbone by registering a buffer for relative position indices and defining the linear projections, dropouts, softmax function, and forward pass. The forward function takes input features of shape (num_windows*B, N, C) and performs multi-head self-attention with learned query, key, and value matrices, scaled by the square root of the dimension. Attention is calculated using dot product between queries and keys, and then passed through a softmax function for normalization before being multiplied by values and projected back to the original feature space.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":204-231", + "content": " relative_position_index = relative_coords.sum(\n axis=-1) # Wd*Wh*Ww, Wd*Wh*Ww\n self.register_buffer(\"relative_position_index\", relative_position_index)\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.attn_drop = nn.Dropout(attn_drop)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n trunc_normal_(self.relative_position_bias_table, std=0.02)\n self.softmax = nn.Softmax(axis=-1)\n def forward(self, x, mask=None):\n \"\"\" Forward function.\n Args:\n x: input features with shape of (num_windows*B, N, C)\n mask: (0/-inf) mask with shape of (num_windows, N, N) or None\n \"\"\"\n B_, N, C = x.shape\n qkv = self.qkv(x).reshape(\n [B_, N, 3, self.num_heads,\n C // self.num_heads]).transpose([2, 0, 3, 1, 4])\n q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C\n q = q * self.scale\n attn = q @ k.transpose([0, 1, 3, 2])\n relative_position_bias = self.relative_position_bias_table[" + }, + { + "comment": "This code defines the Swin Transformer Block 3D, which implements a self-attention mechanism for multi-dimensional data. It adds relative position biases to the attention scores, applies a mask if provided, and applies softmax normalization. 
Finally, it passes the result through two dropout layers before outputting the transformed feature map.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":232-260", + "content": " self.relative_position_index[:N, :N].reshape([-1])].reshape(\n [N, N, -1]) # Wd*Wh*Ww,Wd*Wh*Ww,nH\n relative_position_bias = relative_position_bias.transpose(\n [2, 0, 1]) # nH, Wd*Wh*Ww, Wd*Wh*Ww\n attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N\n if mask is not None:\n nW = mask.shape[0]\n attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N\n ]) + mask.unsqueeze(1).unsqueeze(0).astype(attn.dtype)\n attn = attn.reshape([-1, self.num_heads, N, N])\n attn = self.softmax(attn)\n else:\n attn = self.softmax(attn)\n attn = self.attn_drop(attn)\n x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B_, N, C])\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass SwinTransformerBlock3D(nn.Layer):\n \"\"\" Swin Transformer Block.\n Args:\n dim (int): Number of input channels.\n num_heads (int): Number of attention heads." + }, + { + "comment": "This code initializes a class for the Swin Transformer backbone, specifying the dimensions, number of heads, window size, shift size, mlp ratio, and various optional parameters like dropout rates and activation layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":261-281", + "content": " window_size (tuple[int]): Window size.\n shift_size (tuple[int]): Shift size for SW-MSA.\n mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n drop (float, optional): Dropout rate. Default: 0.0\n attn_drop (float, optional): Attention dropout rate. Default: 0.0\n drop_path (float, optional): Stochastic depth rate. Default: 0.0\n act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU\n norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm\n \"\"\"\n def __init__(self,\n dim,\n num_heads,\n window_size=(2, 7, 7),\n shift_size=(0, 0, 0),\n mlp_ratio=4.,\n qkv_bias=True,\n qk_scale=None,\n drop=0.,\n attn_drop=0.," + }, + { + "comment": "The code defines a class for the Swin Transformer backbone in PaddleVideo. It takes input parameters such as dimension, number of attention heads, window size, and shift size, and initializes layers including norm_layer and attn layer. 
It performs assertions on shift sizes to ensure they are within the window size limits and then initializes the normalization layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":282-306", + "content": " drop_path=0.,\n act_layer=nn.GELU,\n norm_layer=nn.LayerNorm,\n use_checkpoint=False):\n super().__init__()\n self.dim = dim\n self.num_heads = num_heads\n self.window_size = window_size\n self.shift_size = shift_size\n self.mlp_ratio = mlp_ratio\n # self.use_checkpoint=use_checkpoint\n assert 0 <= self.shift_size[0] < self.window_size[\n 0], \"shift_size must in 0-window_size\"\n assert 0 <= self.shift_size[1] < self.window_size[\n 1], \"shift_size must in 0-window_size\"\n assert 0 <= self.shift_size[2] < self.window_size[\n 2], \"shift_size must in 0-window_size\"\n self.norm1 = norm_layer(dim)\n self.attn = WindowAttention3D(dim,\n window_size=self.window_size,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale," + }, + { + "comment": "This code defines a Swin Transformer backbone class with parameters like window size, shift size, and drop path. It initializes the layers including attention and mlp blocks. The forward_part1 function pads input features to multiples of window size for processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":307-329", + "content": " attn_drop=attn_drop,\n proj_drop=drop)\n self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()\n self.norm2 = norm_layer(dim)\n mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop)\n def forward_part1(self, x, mask_matrix):\n B = paddle.shape(x)[0]\n _, D, H, W, C = x.shape\n window_size, shift_size = get_window_size((D, H, W), self.window_size,\n self.shift_size)\n x = self.norm1(x)\n # pad feature maps to multiples of window size\n pad_l = pad_t = pad_d0 = 0\n pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0]\n pad_b = (window_size[1] - H % window_size[1]) % window_size[1]\n pad_r = (window_size[2] - W % window_size[2]) % window_size[2]" + }, + { + "comment": "This code performs a cyclic shift on the input feature map, depending on the shift size. If any of the shift sizes are greater than 0, it applies the roll operation to the feature map along specific axes (1, 2, and 3). The shifted feature map is then partitioned into windows based on the window size specified. These windows go through a self-attention layer (self.attn) and are reshaped accordingly. Finally, a reverse cyclic shift is applied to the result before returning the output feature map. 
This process helps in performing window-based self-attention or spatial-wise self-attention in the Swin Transformer architecture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":330-352", + "content": " x = F.pad(x, (pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1),\n data_format='NDHWC')\n _, Dp, Hp, Wp, _ = x.shape\n # cyclic shift\n if any(i > 0 for i in shift_size):\n shifted_x = paddle.roll(x,\n shifts=(-shift_size[0], -shift_size[1],\n -shift_size[2]),\n axis=(1, 2, 3))\n attn_mask = mask_matrix\n else:\n shifted_x = x\n attn_mask = None\n # partition windows\n x_windows = window_partition(shifted_x,\n window_size) # B*nW, Wd*Wh*Ww, C\n # W-MSA/SW-MSA\n attn_windows = self.attn(x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C\n # merge windows\n attn_windows = attn_windows.reshape([-1, *(window_size + (C, ))])\n shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp,\n Wp) # B D' H' W' C\n # reverse cyclic shift" + }, + { + "comment": "The code defines a function for the forward pass of a neural network. It consists of two parts: `forward_part1` and `forward_part2`. The function takes an input tensor, performs some operations, and returns the result. The `forward_part1` function applies a shift operation to the input based on a specified shift size, followed by a padding operation if necessary. The `forward_part2` function passes the input through a multi-layer perceptron (MLP) and applies dropout. Finally, the `forward` function combines the outputs of these two parts and returns the result after adding it to an initial shortcut connection.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":353-389", + "content": " if any(i > 0 for i in shift_size):\n x = paddle.roll(shifted_x,\n shifts=(shift_size[0], shift_size[1],\n shift_size[2]),\n axis=(1, 2, 3))\n else:\n x = shifted_x\n if pad_d1 > 0 or pad_r > 0 or pad_b > 0:\n x = x[:, :D, :H, :W, :]\n return x\n def forward_part2(self, x):\n return self.drop_path(self.mlp(self.norm2(x)))\n def forward(self, x, mask_matrix):\n \"\"\" Forward function.\n Args:\n x: Input feature, tensor size (B, D, H, W, C).\n mask_matrix: Attention mask for cyclic shift.\n \"\"\"\n shortcut = x\n x = self.forward_part1(x, mask_matrix)\n x = shortcut + self.drop_path(x).astype(shortcut.dtype)\n x = x + self.forward_part2(x).astype(x.dtype)\n return x\nclass PatchMerging(nn.Layer):\n \"\"\" Patch Merging Layer\n Args:\n dim (int): Number of input channels.\n norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm" + }, + { + "comment": "The code defines a Swin Transformer backbone for an image model. The `__init__` method initializes the Swin Transformer with specified dimension and normalization layer. The forward function processes input feature by splitting, concatenating, normalizing, and reducing dimensions. 
The `compute_mask` function generates an image mask using LRU caching.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":390-425", + "content": " \"\"\"\n def __init__(self, dim, norm_layer=nn.LayerNorm):\n super().__init__()\n self.dim = dim\n self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)\n self.norm = norm_layer(4 * dim)\n def forward(self, x):\n \"\"\" Forward function.\n Args:\n x: Input feature, tensor size (B, D, H, W, C).\n \"\"\"\n B, D, H, W, C = x.shape\n # padding\n pad_input = (H % 2 == 1) or (W % 2 == 1)\n if pad_input:\n x = F.pad(x, (0, W % 2, 0, H % 2, 0, 0), data_format='NDHWC')\n x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C\n x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C\n x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C\n x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C\n x = paddle.concat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C\n x = self.norm(x)\n x = self.reduction(x)\n return x\n# cache each stage results\n@lru_cache()\ndef compute_mask(D, H, W, window_size, shift_size):\n img_mask = paddle.zeros((1, D, H, W, 1)) # 1 Dp Hp Wp 1" + }, + { + "comment": "This code generates an attention mask for a Swin Transformer model. It iterates through various dimensions (d, h, w) within the window size and shift size, assigning incremental values to each position in the img_mask tensor. The resulting img_mask is then partitioned into non-overlapping windows and squeezed along the last dimension to create mask_windows. Finally, attn_mask is created by subtracting the expanded version of mask_windows from itself, effectively creating a binary mask where values are either 0 or -100.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":426-442", + "content": " cnt = 0\n for d in slice(-window_size[0]), slice(-window_size[0],\n -shift_size[0]), slice(\n -shift_size[0], None):\n for h in slice(-window_size[1]), slice(-window_size[1],\n -shift_size[1]), slice(\n -shift_size[1], None):\n for w in slice(-window_size[2]), slice(-window_size[2],\n -shift_size[2]), slice(\n -shift_size[2], None):\n img_mask[:, d, h, w, :] = cnt\n cnt += 1\n mask_windows = window_partition(img_mask,\n window_size) # nW, ws[0]*ws[1]*ws[2], 1\n mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2]\n attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)\n # attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))" + }, + { + "comment": "The code defines a Swin Transformer layer for one stage in a neural network. The BasicLayer class takes various arguments such as feature channel dimensions, depth, number of heads, local window size, etc. It also includes an MLP (Multi-Layer Perceptron) with a specified ratio, and provides options to add learnable bias, scale factors, dropout rates, stochastic depth rate, and a normalization layer for each input. 
This basic layer can be utilized in the Swin Transformer architecture for feature extraction and classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":443-463", + "content": " huns = -100.0 * paddle.ones_like(attn_mask)\n attn_mask = huns * (attn_mask != 0).astype(\"float32\")\n return attn_mask\nclass BasicLayer(nn.Layer):\n \"\"\" A basic Swin Transformer layer for one stage.\n Args:\n dim (int): Number of feature channels\n depth (int): Depths of this stage.\n num_heads (int): Number of attention head.\n window_size (tuple[int]): Local window size. Default: (1,7,7).\n mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n drop (float, optional): Dropout rate. Default: 0.0\n attn_drop (float, optional): Attention dropout rate. Default: 0.0\n drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0\n norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm\n " + }, + { + "comment": "This code defines a 3D Swin Transformer block with optional downsampling layer at the end. It takes parameters such as dim, depth, num_heads, window size, mlp ratio, etc., and initializes an instance of the class SwinTransformerBlock3D for each block in a LayerList. The window size is set to (1, 7, 7) by default and the shift size is determined based on whether the current index is even or odd.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":463-492", + "content": " downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None\n \"\"\"\n def __init__(self,\n dim,\n depth,\n num_heads,\n window_size=(1, 7, 7),\n mlp_ratio=4.,\n qkv_bias=False,\n qk_scale=None,\n drop=0.,\n attn_drop=0.,\n drop_path=0.,\n norm_layer=nn.LayerNorm,\n downsample=None,\n use_checkpoint=False):\n super().__init__()\n self.window_size = window_size\n self.shift_size = tuple(i // 2 for i in window_size)\n self.depth = depth\n self.use_checkpoint = use_checkpoint\n # build blocks\n self.blocks = nn.LayerList([\n SwinTransformerBlock3D(\n dim=dim,\n num_heads=num_heads,\n window_size=window_size,\n shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size,\n mlp_ratio=mlp_ratio," + }, + { + "comment": "This code defines a Swin Transformer block for the PaddleVideo library. It takes input dimensions and creates multiple linear layers for self-attention, followed by a downsampling operation if needed. 
The forward function calculates an attention mask based on window size and shifts before rearranging the input tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":493-521", + "content": " qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop,\n attn_drop=attn_drop,\n drop_path=drop_path[i]\n if isinstance(drop_path, list) else drop_path,\n norm_layer=norm_layer,\n use_checkpoint=use_checkpoint,\n ) for i in range(depth)\n ])\n self.downsample = downsample\n if self.downsample is not None:\n self.downsample = downsample(dim=dim, norm_layer=norm_layer)\n def forward(self, x):\n \"\"\" Forward function.\n Args:\n x: Input feature, tensor size (B, C, D, H, W).\n \"\"\"\n # calculate attention mask for SW-MSA\n B = paddle.shape(x)[0]\n _, C, D, H, W = x.shape\n window_size, shift_size = get_window_size((D, H, W), self.window_size,\n self.shift_size)\n # x = rearrange(x, 'b c d h w -> b d h w c')\n x = x.transpose([0, 2, 3, 4, 1])\n Dp = int(np.ceil(D / window_size[0])) * window_size[0]" + }, + { + "comment": "This code implements a PatchEmbed3D class, which embeds input video frames into patches for use in the Swin Transformer model. It takes the input video frames, divides them into non-overlapping patches, and performs linear projections on the patches to obtain embeddings. The patch size, number of input channels, and embedding dimension are configurable parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":522-550", + "content": " Hp = int(np.ceil(H / window_size[1])) * window_size[1]\n Wp = int(np.ceil(W / window_size[2])) * window_size[2]\n attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size)\n for blk in self.blocks:\n x = blk(x, attn_mask)\n x = x.reshape([B, D, H, W, C])\n if self.downsample is not None:\n x = self.downsample(x)\n x = x.transpose([0, 4, 1, 2, 3])\n return x\nclass PatchEmbed3D(nn.Layer):\n \"\"\" Video to Patch Embedding.\n Args:\n patch_size (int): Patch token size. Default: (2,4,4).\n in_chans (int): Number of input video channels. Default: 3.\n embed_dim (int): Number of linear projection output channels. Default: 96.\n norm_layer (nn.Layer, optional): Normalization layer. Default: None\n \"\"\"\n def __init__(self,\n patch_size=(2, 4, 4),\n in_chans=3,\n embed_dim=96,\n norm_layer=None):\n super().__init__()\n self.patch_size = patch_size" + }, + { + "comment": "This code is for the Swin Transformer backbone in PaddleVideo. It initializes the module with input channels (in_chans), embed dim, and patch size. If a norm layer is provided, it also initializes the normalization layer (norm). The forward function pads the input according to the dimensions and applies a convolution operation for feature extraction. 
If a normalization layer was initialized, it performs normalization on the features before returning them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":552-580", + "content": " self.in_chans = in_chans\n self.embed_dim = embed_dim\n self.proj = nn.Conv3D(in_chans,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n if norm_layer is not None:\n self.norm = norm_layer(embed_dim)\n else:\n self.norm = None\n def forward(self, x):\n _, _, D, H, W = x.shape\n if W % self.patch_size[2] != 0:\n x = F.pad(\n x, (0, self.patch_size[2] - W % self.patch_size[2], 0, 0, 0, 0),\n data_format='NCDHW')\n if H % self.patch_size[1] != 0:\n x = F.pad(\n x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1], 0, 0),\n data_format='NCDHW')\n if D % self.patch_size[0] != 0:\n x = F.pad(\n x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]),\n data_format='NCDHW')\n x = self.proj(x) # B C D Wh Ww\n if self.norm is not None:" + }, + { + "comment": "This code defines the Swin Transformer 3D backbone for Paddle Video. It takes an input tensor and performs normalization, transposition, and reshaping operations before returning the processed tensor. The class also registers with BACKBONES to be recognized as a valid backbone model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":581-603", + "content": " D, Wh, Ww = x.shape[2], x.shape[3], x.shape[4]\n x = x.flatten(2).transpose([0, 2, 1])\n x = self.norm(x)\n x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, D, Wh, Ww])\n return x\n@BACKBONES.register()\nclass SwinTransformer3D(nn.Layer):\n \"\"\" Swin Transformer backbone.\n A Paddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -\n https://arxiv.org/pdf/2103.14030\n Args:\n patch_size (int | tuple(int)): Patch size. Default: (4,4,4).\n in_chans (int): Number of input image channels. Default: 3.\n embed_dim (int): Number of linear projection output channels. Default: 96.\n depths (tuple[int]): Depths of each Swin Transformer stage.\n num_heads (tuple[int]): Number of attention head of each stage.\n window_size (int): Window size. Default: 7.\n mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee" + }, + { + "comment": "This code defines the initialization parameters for the SWIN Transformer model in PaddleVideo. Parameters include pretrained weights, patch size, input channels, embedding dimension, depths of each stage, number of heads per stage, window size, MLP ratio, qkv_bias, qk scale, drop rate, attn drop rate, and stochastic depth rate. The normalization layer and whether to freeze any stages can also be specified during initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":604-626", + "content": " qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.\n drop_rate (float): Dropout rate.\n attn_drop_rate (float): Attention dropout rate. Default: 0.\n drop_path_rate (float): Stochastic depth rate. Default: 0.2.\n norm_layer: Normalization layer. Default: nn.LayerNorm.\n patch_norm (bool): If True, add normalization after patch embedding. 
Default: False.\n frozen_stages (int): Stages to be frozen (stop grad and set eval mode).\n -1 means not freezing any parameters.\n \"\"\"\n def __init__(self,\n pretrained=None,\n patch_size=(4, 4, 4),\n in_chans=3,\n embed_dim=96,\n depths=[2, 2, 6, 2],\n num_heads=[3, 6, 12, 24],\n window_size=(2, 7, 7),\n mlp_ratio=4.,\n qkv_bias=True,\n qk_scale=None,\n drop_rate=0.,\n attn_drop_rate=0.,\n drop_path_rate=0.2," + }, + { + "comment": "The code initializes a Swin Transformer model with specified parameters, including depths, embed dimension, patch size, window size, and input channels. It creates the patch embedding layer and position dropout layer. Stochastic depth is applied using a decay rule. The layers are built using BasicLayer instances for each layer in the specified number of layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":627-658", + "content": " norm_layer=nn.LayerNorm,\n patch_norm=False,\n frozen_stages=-1,\n use_checkpoint=False):\n super().__init__()\n self.pretrained = pretrained\n self.num_layers = len(depths)\n self.embed_dim = embed_dim\n self.patch_norm = patch_norm\n self.frozen_stages = frozen_stages\n self.window_size = window_size\n self.patch_size = patch_size\n # split image into non-overlapping patches\n self.patch_embed = PatchEmbed3D(\n patch_size=patch_size,\n in_chans=in_chans,\n embed_dim=embed_dim,\n norm_layer=norm_layer if self.patch_norm else None)\n self.pos_drop = nn.Dropout(p=drop_rate)\n # stochastic depth\n dpr = [\n x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths))\n ] # stochastic depth decay rule\n # build layers\n self.layers = nn.LayerList()\n for i_layer in range(self.num_layers):\n layer = BasicLayer(" + }, + { + "comment": "This code initializes a Swin Transformer backbone with specified parameters and adds a norm layer for each output. It also includes a function to freeze certain stages of the model if desired.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":659-686", + "content": " dim=int(embed_dim * 2**i_layer),\n depth=depths[i_layer],\n num_heads=num_heads[i_layer],\n window_size=window_size,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],\n norm_layer=norm_layer,\n downsample=PatchMerging\n if i_layer < self.num_layers - 1 else None,\n use_checkpoint=use_checkpoint)\n self.layers.append(layer)\n self.num_features = int(embed_dim * 2**(self.num_layers - 1))\n # add a norm layer for each output\n self.norm = norm_layer(self.num_features)\n self._freeze_stages()\n def _freeze_stages(self):\n if self.frozen_stages >= 0:\n self.patch_embed.eval()\n for param in self.patch_embed.parameters():\n param.stop_gradient = True" + }, + { + "comment": "This code is part of a backbone model's initialization. It first applies an initializer function to the layers, then checks if pretrained weights are provided and loads them if available. 
The frozen_stages variable determines how many stages of the model should be frozen (set to eval mode) during inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":688-719", + "content": " if self.frozen_stages >= 1:\n self.pos_drop.eval()\n for i in range(0, self.frozen_stages):\n m = self.layers[i]\n m.eval()\n for param in m.parameters():\n param.stop_gradient = True\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight, std=0.02)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):\n zeros_(m.bias)\n ones_(m.weight)\n def init_weights(self):\n \"\"\"Initialize the weights in backbone.\n Args:\n pretrained (str, optional): Path to pre-trained weights.\n Defaults to None.\n \"\"\"\n \"\"\"First init model's weight\"\"\"\n self.apply(self._init_fn)\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights\n load_ckpt(self, self.pretrained)" + }, + { + "comment": "If pretrained is None or empty, do nothing. Else, raise NotImplementedError. Forward function processes input through patch embedding and positional dropout, iterates over layers, transposes dimensions, normalizes, and returns output. Train mode keeps layers unfrozen by calling the superclass method and freezing stages.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/swin_transformer.py\":720-741", + "content": " elif self.pretrained is None or self.pretrained.strip() == \"\":\n pass\n else:\n raise NotImplementedError\n def forward(self, x):\n \"\"\"Forward function.\"\"\"\n x = self.patch_embed(x)\n x = self.pos_drop(x)\n for layer in self.layers:\n x = layer(x)\n x = x.transpose([0, 2, 3, 4, 1])\n x = self.norm(x)\n x = x.transpose([0, 4, 1, 2, 3])\n return x\n def train(self, mode=True):\n \"\"\"Convert the model into training mode while keep layers freezed.\"\"\"\n super(SwinTransformer3D, self).train(mode)\n self._freeze_stages()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0a8c871b-e0b6-4ae0-af77-f9cac3ab432a.json b/docs/doc/0a8c871b-e0b6-4ae0-af77-f9cac3ab432a.json new file mode 100644 index 000000000..50a7a37d8 --- /dev/null +++ b/docs/doc/0a8c871b-e0b6-4ae0-af77-f9cac3ab432a.json @@ -0,0 +1,75 @@ +{ + "summary": "The code introduces a new \"FeaturePadding\" class to PaddlePaddle library, handles data preprocessing for multimodal tasks, and provides masking, region selection, and action perturbation functions for PaddleVideo.", + "details": [ + { + "comment": "This code is part of a PaddlePaddle video analysis library. It registers a new class called \"FeaturePadding\" which performs feature padding to target shape. It imports necessary libraries and packages including decord, PIL, numpy, json, paddlenlp for ActBERT, and the PIPELINES registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":0-34", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nfrom PIL import Image\nimport decord as de\nimport copy\nimport json\nfrom ..registry import PIPELINES\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT.\"\n )\n@PIPELINES.register()\nclass FeaturePadding(object):\n \"\"\"\n Padding feature to target shape.\n \"\"\"" + }, + { + "comment": "This code defines a class with an __init__ method and a __call__ method. The __init__ method initializes the maximum number of regions (36) and actions (5). The __call__ method takes in results as input, including feature packs for image and action data. It pads the features to their maximum allowed dimensions with zeroes if there are less than the specified maximum. This is useful for maintaining consistent input sizes in machine learning models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":35-58", + "content": " def __init__(self, max_region_num=36, max_action_num=5):\n self.max_region_num = max_region_num\n self.max_action_num = max_action_num\n def __call__(self, results):\n \"\"\"\n Padding feature.\n \"\"\"\n pack_feature = results['feature']\n tokenizer = results['tokenizer']\n image_feature_wp, image_target_wp, image_location_wp, \\\n num_boxes, image_h, image_w, image_id, caption, \\\n action_feature_wp, action_target_wp, num_actions = pack_feature\n image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32)\n image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)\n image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)\n action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32)\n action_target = np.zeros((self.max_action_num, ), dtype=np.int64)\n num_boxes = int(num_boxes)\n image_feature[:num_boxes] = image_feature_wp\n image_target[:num_boxes] = image_target_wp" + }, + { + "comment": "This code segment is responsible for resizing and normalizing the image and action feature coordinates, as well as deep copying the features. It also initializes the results dictionary with keys for image_feat and image_target. 
This appears to be part of a data preprocessing step in a multimodal pipeline.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":59-80", + "content": " image_location[:num_boxes, :4] = image_location_wp\n image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * (\n image_location[:, 2] - image_location[:, 0]) / (float(image_w) *\n float(image_h))\n image_location[:, 0] = image_location[:, 0] / float(image_w)\n image_location[:, 1] = image_location[:, 1] / float(image_h)\n image_location[:, 2] = image_location[:, 2] / float(image_w)\n image_location[:, 3] = image_location[:, 3] / float(image_h)\n image_feature = copy.deepcopy(image_feature)\n image_target = copy.deepcopy(image_target)\n num_actions = int(num_actions)\n action_feature[:num_actions] = action_feature_wp\n action_target[:num_actions] = action_target_wp\n action_feature = copy.deepcopy(action_feature)\n action_target = copy.deepcopy(action_target)\n results = dict(image_feat=image_feature,\n image_target=image_target," + }, + { + "comment": "The code defines a pipeline that randomly selects captions for the NSP task. It takes caption paths as input and returns random captions. The class has an `__init__` method to initialize the caption path, a `select_caption` method to randomly choose one from multiple captions, a `get_random_caption` method to select a random caption from all provided captions, and finally a `random_cap` method that combines these functionalities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":81-112", + "content": " caption=caption,\n image_loc=image_location,\n num_boxes=int(num_boxes),\n action_feat=action_feature,\n action_target=action_target,\n num_actions=int(num_actions),\n tokenizer=tokenizer)\n return results\n@PIPELINES.register()\nclass RandomCap(object):\n def __init__(self, caption_path):\n \"\"\"\n Random Caption for NSP task\n \"\"\"\n self.caption_path = caption_path\n def select_caption(self, caption):\n captions = caption.split('!')\n rind = random.randint(0, len(captions) - 1)\n caption = captions[rind]\n return caption\n def get_random_caption(self, all_captions):\n num_caps = len(all_captions)\n rand_doc_idx = random.randint(0, num_caps - 1)\n caption = all_captions[rand_doc_idx]\n caption = self.select_caption(caption)\n return caption\n def random_cap(self, caption, all_captions):" + }, + { + "comment": "The code is part of a multi-modal pipeline, where it randomly generates labels (0 or 1) and selects captions from a list. 
It also includes classes for tokenizing captions and applying random masks on the text data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":113-150", + "content": " if random.random() > 0.5:\n label = 0\n else:\n caption = self.get_random_caption(all_captions)\n label = 1\n return caption, label\n def __call__(self, results):\n caption = results['caption']\n all_captions = list(json.load(open(self.caption_path, 'r')))\n caption = self.select_caption(caption)\n caption, label = self.random_cap(caption, all_captions)\n results['caption'] = caption\n results['is_next'] = label\n return results\n@PIPELINES.register()\nclass Tokenize(object):\n def __init__(self, ):\n \"\"\"\n Tokenize caption\n \"\"\"\n pass\n def __call__(self, results):\n caption = results['caption']\n tokenizer = results['tokenizer']\n tokens_caption = tokenizer.tokenize(caption)\n results['caption'] = tokens_caption\n return results\n@PIPELINES.register()\nclass RandomMask(object):\n def __init__(self,\n max_seq_length=36,\n max_action_length=5," + }, + { + "comment": "This code defines a class for loading multimodal data, including images and text, into TensorFlow datasets. The constructor takes the maximum sequence length, action length, and region length as arguments. It also includes functions to generate global image features and truncate a sequence pair if they exceed the maximum length.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":151-174", + "content": " max_region_length=36):\n self.max_seq_length = max_seq_length\n self.max_action_length = max_action_length\n self.max_region_length = max_region_length\n def get_image_global_feature(self, image_feat, image_loc, image_mask):\n g_image_feat = np.sum(image_feat, axis=0) / np.sum(\n image_mask, axis=0, keepdims=True)\n image_feat = np.concatenate(\n [np.expand_dims(g_image_feat, axis=0), image_feat],\n axis=0).astype(\"float32\")\n g_image_loc = np.array([0, 0, 1, 1, 1]).astype(\"float32\")\n image_loc = np.concatenate(\n [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)\n g_image_mask = np.array([1])\n image_mask = np.concatenate([g_image_mask, image_mask], axis=0)\n return image_feat, image_loc, image_mask\n def _truncate_seq_pair(self, tokens_b, max_length):\n \"\"\"Truncates a sequence pair in place to the maximum length.\n This is a simple heuristic which will always truncate the longer sequence" + }, + { + "comment": "The code is implementing a method to mask random tokens in a sentence for Language Model (LM) tasks. It first ensures that all sequences have equal length by truncating one token at a time from the longer sequence, then randomly masks 15% of the tokens in each sequence. The method also includes logic to handle tokenizer and produces masked tokens along with their related labels for LM prediction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":175-200", + "content": " one token at a time. 
This makes more sense than truncating an equal percent\n of tokens from each, since if one sequence is very short then each token\n that's truncated likely contains more information than a longer sequence.\n \"\"\"\n while True:\n total_length = len(tokens_b)\n if total_length <= max_length:\n break\n tokens_b.pop()\n def random_word(self, tokens, tokenizer):\n \"\"\"\n Masking some random tokens for Language Model task with probabilities as in the original BERT paper.\n Args:\n tokens: list of str, tokenized sentence.\n tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)\n Return:\n (list of str, list of int), masked tokens and related labels for LM prediction\n \"\"\"\n output_label = []\n for i, token in enumerate(tokens):\n prob = random.random()\n # mask token with 15% probability\n if prob < 0.15:" + }, + { + "comment": "This code modifies tokens in a given input sequence by randomly replacing them with mask tokens, random tokens from the vocabulary, or keeping them unchanged. The probability of each action is controlled by a variable 'prob', which is normalized to ensure the total probability sums up to 1.0. The resulting modified sequence is appended to 'output_label' for further prediction. Additionally, it handles unknown words by replacing them with '[UNK]'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":201-224", + "content": " prob /= 0.15\n # 80% randomly change token to mask token\n if prob < 0.8:\n tokens[i] = \"[MASK]\"\n # 10% randomly change token to random token\n elif prob < 0.9:\n #tok = random.choice(list(tokenizer.vocab.items()))[0]\n tok = tokenizer.vocab.idx_to_token[random.randint(\n 0,\n tokenizer.vocab_size,\n )]\n tokens[i] = tok\n # rest 10% randomly keep current token\n # append current token to output (we will predict these later)\n try:\n output_label.append(tokenizer.vocab[token])\n except KeyError:\n # For unknown words (should not occur with BPE vocab)\n output_label.append(tokenizer.vocab[\"[UNK]\"])\n print(\n \"Cannot find token '{}' in vocab. Using [UNK] insetad\"." + }, + { + "comment": "The code defines three functions: \"random_region\", \"mask_token\", and \"random_action\". These functions are responsible for randomly masking tokens, selecting a random region from an image feature map, and randomly perturbing action features respectively. 
The random_region function masks 15% of the tokens in the input, while the random_action function perturbs 20% of the action features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":225-254", + "content": " format(token))\n else:\n # no masking token (will be ignored by loss function later)\n output_label.append(-1)\n return tokens, output_label\n def random_region(self, image_feat, image_loc, num_boxes):\n output_label = []\n for i in range(num_boxes):\n prob = random.random()\n # mask token with 15% probability\n if prob < 0.15:\n prob /= 0.15\n # 80% randomly change token to mask token\n if prob < 0.9:\n image_feat[i] = 0\n # rest 20% randomly keep current token\n # append current token to output (we will predict these later)\n output_label.append(1)\n else:\n # no masking token (will be ignored by loss function later)\n output_label.append(-1)\n return image_feat, image_loc, output_label\n def random_action(self, action_feat, action_target, num_actions):" + }, + { + "comment": "This code defines a function that applies random masking to an input sequence of actions. It randomly chooses to either replace 90% of the tokens with mask tokens, keep them unchanged (10%), or ignore them for loss calculation by setting their value to -1. The function takes as input various results from a pipeline and returns the masked action features and labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":255-284", + "content": " output_label = []\n for i in range(num_actions):\n prob = random.random()\n # mask token with 15% probability\n if prob < 0.15:\n prob /= 0.15\n # 90% randomly change token to mask token\n if prob < 0.9:\n action_feat[i] = 0\n # rest 10% randomly keep current token\n # append current token to output (we will predict these later)\n output_label.append(action_target[i])\n else:\n # no masking token (will be ignored by loss function later)\n output_label.append(-1)\n return action_feat, output_label\n def __call__(self, results):\n caption = results['caption']\n tokenizer = results['tokenizer']\n image_feat = results['image_feat']\n image_loc = results['image_loc']\n num_boxes = results['num_boxes']\n action_feat = results['action_feat']\n action_target = results['action_target']\n num_actions = results['num_actions']" + }, + { + "comment": "This code is part of a multimodal pipeline that randomly selects words from the caption, regions from an image, and actions, then concatenates them using BERT's convention for sequence pairs. It also handles truncating the caption and assigning labels to the input features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":285-307", + "content": " is_next = results['is_next']\n image_target = results['image_target']\n self._truncate_seq_pair(caption, self.max_seq_length - 2)\n caption, caption_label = self.random_word(caption, tokenizer)\n image_feat, image_loc, image_label = self.random_region(\n image_feat, image_loc, num_boxes)\n action_feat, action_label = self.random_action(action_feat,\n action_target,\n num_actions)\n # concatenate lm labels and account for CLS, SEP, SEP\n lm_label_ids = [-1] + caption_label + [-1]\n # The convention in BERT is:\n # (a) For sequence pairs:\n # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n # (b) For single sequences:\n # tokens: [CLS] the dog is hairy . 
[SEP]\n # type_ids: 0 0 0 0 0 0 0\n #" + }, + { + "comment": "This code prepares input data for a multimodal pipeline in PaddleVideo. It appends special tokens \"[CLS]\" and \"[SEP]\" to the token list, assigns segment ID 0 to all tokens (indicating first sequence), converts tokens to input IDs using tokenizer, and creates a mask with 1 for real tokens and 0 for padding tokens. This allows the model to learn sequences and use the [CLS] vector as a \"sentence vector\" for classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":308-332", + "content": " # Where \"type_ids\" are used to indicate whether this is the first\n # sequence or the second sequence. The embedding vectors for `type=0` and\n # `type=1` were learned during pre-training and are added to the wordpiece\n # embedding vector (and position vector). This is not *strictly* necessary\n # since the [SEP] token unambigiously separates the sequences, but it makes\n # it easier for the model to learn the concept of sequences.\n #\n # For classification tasks, the first vector (corresponding to [CLS]) is\n # used as as the \"sentence vector\". Note that this only makes sense because\n # the entire model is fine-tuned.\n tokens = []\n segment_ids = []\n tokens.append(\"[CLS]\")\n segment_ids.append(0)\n for token in caption:\n tokens.append(token)\n segment_ids.append(0)\n tokens.append(\"[SEP]\")\n segment_ids.append(0)\n input_ids = tokenizer.convert_tokens_to_ids(tokens)\n # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to." + }, + { + "comment": "Zero-padding visual, action, and input sequences to the maximum lengths. Asserting that all lists are of equal length after padding and match their respective max lengths.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":333-358", + "content": " input_mask = [1] * (len(input_ids))\n image_mask = [1] * (num_boxes)\n action_mask = [1] * (num_actions)\n # Zero-pad up to the visual sequence length.\n while len(image_mask) < self.max_region_length:\n image_mask.append(0)\n image_label.append(-1)\n while len(action_mask) < self.max_action_length:\n action_mask.append(0)\n action_label.append(-1)\n # Zero-pad up to the sequence length.\n while len(input_ids) < self.max_seq_length:\n input_ids.append(0)\n input_mask.append(0)\n segment_ids.append(0)\n lm_label_ids.append(-1)\n assert len(input_ids) == self.max_seq_length\n assert len(input_mask) == self.max_seq_length\n assert len(segment_ids) == self.max_seq_length\n assert len(lm_label_ids) == self.max_seq_length\n assert len(image_mask) == self.max_region_length\n assert len(image_label) == self.max_region_length\n assert len(action_mask) == self.max_action_length" + }, + { + "comment": "This code snippet is part of a pipeline function that asserts the length of 'action_label' matches the maximum allowed action length. It then calls another function to get global image features, and forms a list of feature arrays including input ids, action feature, image feature, location, segment ids, input mask, image mask, action label, lm_label_ids, is_next, image label, and image target. 
The results dictionary is updated with these features before the function returns.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/multimodal.py\":359-379", + "content": " assert len(action_label) == self.max_action_length\n image_feat, image_loc, image_mask = self.get_image_global_feature(\n image_feat, image_loc, np.array(image_mask))\n features = [\n np.array(input_ids),\n action_feat,\n image_feat,\n image_loc,\n np.array(segment_ids),\n np.array(input_mask),\n image_mask,\n np.array(action_mask),\n np.array(lm_label_ids),\n np.array(action_label),\n np.array(is_next),\n np.array(image_label),\n image_target,\n ]\n results['features'] = features\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0ac1a4dd-d2dd-4a35-9f3b-3b316552754c.json b/docs/doc/0ac1a4dd-d2dd-4a35-9f3b-3b316552754c.json new file mode 100644 index 000000000..972325738 --- /dev/null +++ b/docs/doc/0ac1a4dd-d2dd-4a35-9f3b-3b316552754c.json @@ -0,0 +1,25 @@ +{ + "summary": "This code trains the PaddleVideo model using command line arguments, initializes the environment, and performs operations with distributed training and automatic mixed precision support.", + "details": [ + { + "comment": "This code is a Python script for training the PaddleVideo model. It imports necessary modules, defines functions to parse command line arguments and sets default configuration and parameter files. The script uses argparse to create an argument parser with a description \"PaddleVideo train script\". It also provides default paths for config file (\"configs/manet.yaml\") and parameter file (\"model/default_manet.pdparams\").", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/main.py\":0-28", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless requifFred by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport random\nimport numpy as np\nimport paddle\nfrom EIVideo.paddlevideo.tasks import (test_model)\nfrom EIVideo.paddlevideo.utils import get_config, get_dist_info\nfrom EIVideo import EI_VIDEO_ROOT, join_root_path\nDEF_CONFIG_FILE_PATH = join_root_path(\"configs/manet.yaml\")\nDEF_PARAMS_FILE_PATH = join_root_path(\"model/default_manet.pdparams\")\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo train script\")" + }, + { + "comment": "This code defines command line arguments for the EIVideo application. 
It sets default values and provides help messages for config file path, overriding options, testing a model, using Dali for training speedup, multigrid training, and weights for finetuning or testing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/main.py\":29-52", + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default=DEF_CONFIG_FILE_PATH,\n help='config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('--test',\n action='store_true',\n help='whether to test a model')\n parser.add_argument('--train_dali',\n action='store_true',\n help='whether to use dali to speed up training')\n parser.add_argument('--multigrid',\n action='store_true',\n help='whether to use multigrid training')\n parser.add_argument('-w',\n '--weights',\n type=str,\n default=DEF_PARAMS_FILE_PATH,\n help='weights for finetuning or testing')" + }, + { + "comment": "This code snippet adds command-line arguments to a parser object. The \"--fleet\" argument enables distributed training using fleet, \"--amp\" enables automatic mixed precision training, \"--validate\" triggers checkpoint evaluation during training, \"--seed\" sets random seeds for deterministic behavior, \"--max_iters\" sets the maximum number of iterations, and \"--profiler_options\" sets profiler options in key-value pairs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/main.py\":53-81", + "content": " parser.add_argument('--fleet',\n action='store_true',\n help='whether to use fleet run distributed training')\n parser.add_argument('--amp',\n action='store_true',\n help='whether to open amp training.')\n parser.add_argument(\n '--validate',\n action='store_true',\n help='whether to evaluate the checkpoint during training')\n parser.add_argument(\n '--seed',\n type=int,\n default=None,\n help='fixed all random seeds when the program is running')\n parser.add_argument(\n '--max_iters',\n type=int,\n default=None,\n help='max iterations when training(this argonly used in test_tipc)')\n parser.add_argument(\n '-p',\n '--profiler_options',\n type=str,\n default=None,\n help='The option of profiler, which should be in format '\n '\\\"key1=value1;key2=value2;key3=value3\\\".')\n parser.add_argument('--use_npu',\n type=bool," + }, + { + "comment": "This code defines a `main` function that parses command-line arguments, updates the configuration with optional kwargs, sets the random seed if specified, initializes parallel environment if necessary, and then calls `test_model` to perform some operation. Finally, it returns the final result. 
It is called as `main(video_path='example/example1.mp4', save_path='./output')`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/main.py\":82-115", + "content": " default=False,\n help='whether use npu.')\n args = parser.parse_args()\n return args\ndef main(**kwargs):\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n # ToDo To AP-kai: \u4e0b\u9762\u8fd9\u884c\u4ee3\u7801\u76ee\u7684\u662f\u66f4\u65b0\u914d\u7f6e\uff0c\u8fd9\u6837\u7684\u8bdd\u6211\u4eec\u8c03\u7528main(use_npu = Ture)\uff0c\u8fd9\u65f6cfg.use_npu\u5c31\u662fTure\u4e86\n for key, value in kwargs.items():\n cfg.__setattr__(key, value)\n # set seed if specified\n seed = args.seed\n if seed is not None:\n assert isinstance(\n seed,\n int), f\"seed must be a integer when specified, but got {seed}\"\n paddle.seed(seed)\n np.random.seed(seed)\n random.seed(seed)\n _, world_size = get_dist_info()\n parallel = world_size != 1\n if parallel:\n paddle.distributed.init_parallel_env()\n final = test_model(cfg, weights=args.weights, parallel=parallel)\n return final\nif __name__ == '__main__':\n main(video_path='example/example1.mp4', save_path='./output')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0adabe49-fe07-4f8c-9e91-cc3393a33818.json b/docs/doc/0adabe49-fe07-4f8c-9e91-cc3393a33818.json new file mode 100644 index 000000000..a51574348 --- /dev/null +++ b/docs/doc/0adabe49-fe07-4f8c-9e91-cc3393a33818.json @@ -0,0 +1,60 @@ +{ + "summary": "PaddleVideo's BasketballAction app uses PaddlePaddle 2.0 and models for basketball action detection, achieving an F1-score of 80.14%. Developed by authors including hari and Joonseok Lee, it optimizes based on speed, time distribution, and feature fusion methods.", + "details": [ + { + "comment": "This code is for basketball action detection, using PaddlePaddle 2.0 and incorporating various video models from PaddleVideo (ppTSM, BMN, attentionLSTM). The process includes image feature extraction with ppTSM, proposal extraction with BMN, and LSTM-based action classification and regression. 
Dataset preparation involves data handling, label format specification, gts processing to JSON format, and abstracting frames from mp4 files using ffmpeg.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":0-68", + "content": "# \u7bee\u7403\u52a8\u4f5c\u68c0\u6d4b\u6a21\u578b\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u8bc4\u4f30](#\u6a21\u578b\u8bc4\u4f30)\n- [\u6a21\u578b\u63a8\u7406](#\u6a21\u578b\u63a8\u7406)\n- [\u6a21\u578b\u4f18\u5316](#\u6a21\u578b\u4f18\u5316)\n- [\u6a21\u578b\u90e8\u7f72](#\u6a21\u578b\u90e8\u7f72)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n## \u6a21\u578b\u7b80\u4ecb\n\u8be5\u4ee3\u7801\u5e93\u7528\u4e8e\u7bee\u7403\u52a8\u4f5c\u68c0\u6d4b+\u8bc6\u522b, \u57fa\u4e8epaddle2.0\u7248\u672c\u5f00\u53d1\uff0c\u7ed3\u5408PaddleVideo\u4e2d\u7684ppTSM, BMN, attentionLSTM\u7684\u591a\u4e2a\u89c6\u9891\u6a21\u578b\u8fdb\u884c\u89c6\u9891\u65f6\u7a7a\u4e8c\u9636\u6bb5\u68c0\u6d4b\u7b97\u6cd5\u3002\n\u4e3b\u8981\u5206\u4e3a\u5982\u4e0b\u51e0\u6b65\n - \u7279\u5f81\u62bd\u53d6\n - \u56fe\u50cf\u7279\u6027\uff0cppTSM\n - \u97f3\u9891\u7279\u5f81\uff0cVggsound\n - proposal\u63d0\u53d6\uff0cBMN\n - LSTM\uff0c\u52a8\u4f5c\u5206\u7c7b + \u56de\u5f52\n## \u6570\u636e\u51c6\u5907\n\u6570\u636e\u96c6\u5904\u7406\u4ee3\u7801\n```\n\u53c2\u8003https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/datasets\n```\n- \u6570\u636e\u96c6label\u683c\u5f0f\n```\n{\n \"0\": \"\u80cc\u666f\",\n \"1\": \"\u56de\u653e\",\n \"2\": \"\u8fdb\u7403-\u4e09\u5206\u7403\",\n \"3\": \"\u8fdb\u7403-\u4e24\u5206\u7403\",\n \"4\": \"\u8fdb\u7403-\u6263\u7bee\",\n \"5\": \"\u7f5a\u7403\",\n \"6\": \"\u8df3\u7403\"\n}\n```\n- \u6570\u636e\u96c6gts\u5904\u7406, \u5c06\u539f\u59cb\u6807\u6ce8\u6570\u636e\u5904\u7406\u6210\u5982\u4e0bjson\u683c\u5f0f\n```\n{\n 'fps': 5,\n 'gts': [\n {\n 'url': 'xxx.mp4',\n 'total_frames': 6341,\n 'actions': [\n {\n \"label_ids\": [6],\n \"label_names\": [\"\u8df3\u7403\"],\n \"start_id\": 395,\n \"end_id\": 399\n },\n ...\n ]\n },\n ...\n ]\n}\n```\n- \u6570\u636e\u96c6\u62bd\u5e27, \u7531mp4, \u5f97\u5230frames\u548cpcm, \u8fd9\u91cc\u9700\u8981\u6dfb\u52a0ffmpeg\u73af\u5883\n```\ncd datasets/script && python get_frames_pcm.py" + }, + { + "comment": "This code describes the storage location and structure of a basketball action dataset, including video files (mp4), image frames, audio files (pcm), and JSON files containing ground truth data. 
It also references the PaddleVideo footbal", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":69-98", + "content": "```\n- \u6570\u636e\u9884\u5904\u7406\u540e\u4fdd\u5b58\u683c\u5f0f\u5982\u4e0b\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- basketball # xx\u6570\u636e\u96c6\n |-- mp4 # \u539f\u59cb\u89c6\u9891.mp4\n |-- frames # \u56fe\u50cf\u5e27, fps=5, '.jpg'\u683c\u5f0f\n |-- pcm # \u97f3\u9891pcm, \u97f3\u9891\u91c7\u6837\u738716000\uff0c\u91c7\u7528\u901a\u9053\u65701\n |-- url.list # \u89c6\u9891\u5217\u8868\n |-- label_train.json # \u8bad\u7ec3\u96c6\u539f\u59cbgts\n |-- label_val.json # \u9a8c\u8bc1\u96c6\u539f\u59cbgts\n```\n## \u6a21\u578b\u8bad\u7ec3\n\u4ee3\u7801\u53c2\u8003\u8db3\u7403\u52a8\u4f5c\u68c0\u6d4b\uff1ahttps://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction\n\u5c06\u8be5\u4ee3\u7801\u5e93\u7684\u6587\u4ef6\u5939 [datasets](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/datasets)\uff0c[extractor](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/extractor)\uff0c[train_lstm](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/train_lstm)\uff0c \u62f7\u8d1d\u5230\u672c\u4ee3\u7801\u5e93\u590d\u7528\u3002\n - image \u91c7\u6837\u9891\u7387fps=5\uff0c\u5982\u679c\u6709\u4e9b\u52a8\u4f5c\u65f6\u95f4\u8f83\u77ed\uff0c\u53ef\u4ee5\u9002\u5f53\u63d0\u9ad8\u91c7\u6837\u9891\u7387\n - BMN windows=200\uff0c\u537340s\uff0c\u6240\u4ee5\u6d4b\u8bd5\u81ea\u5df1\u7684\u6570\u636e\u65f6\uff0c\u89c6\u9891\u65f6\u957f\u9700\u5927\u4e8e40s\n### \u57fa\u7840\u955c\u50cf\n```\ndocker pull tmtalgo/paddleaction:action-detection-v2\n```\n### step1 ppTSM\u8bad\u7ec3\n\u6211\u4eec\u63d0\u4f9b\u4e86\u7bee\u7403\u6570\u636e\u8bad\u7ec3\u7684\u6a21\u578b\uff0c\u53c2\u8003checkpoints_basketball\u3002\u5982\u679c\u4f7f\u7528\u63d0\u4f9b\u7684pptsm\u6a21\u578b\uff0c\u53ef\u76f4\u63a5\u8df3\u8fc7\u4e0b\u8fb9\u7684pptsm\u8bad\u7ec3\u6570\u636e\u5904\u7406\u548c\u8bad\u7ec3\u6b65\u9aa4\u3002" + }, + { + "comment": "Step 1.1: Prepare ppTSM training data by combining frames and gts to generate positive and negative samples, following the format '{}_{}_{}_{}'.format(video_basename, start_id, end_id, label).\n\nStep 1.2: Train ppTSM model using the prepared dataset by modifying config.yaml parameters and running main.py with distributed launch script.\n\nStep 1.3: Convert trained ppTSM model to prediction mode for inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":99-134", + "content": "\u5982\u679c\u9700\u8981\u5728\u81ea\u5df1\u7684\u6570\u636e\u4e0a\u8bad\u7ec3\uff0cppTSM\u8bad\u7ec3\u4ee3\u7801\u4e3a\uff1ahttps://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\nppTSM\u6587\u6863\u53c2\u8003\uff1ahttps://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/pp-tsm.md\n#### step1.1 ppTSM \u8bad\u7ec3\u6570\u636e\u5904\u7406\n\u7531frames\u7ed3\u5408gts\u751f\u6210\u8bad\u7ec3\u6240\u9700\u8981\u7684\u6b63\u8d1f\u6837\u672c\n```\ncd ${BasketballAction}\ncd datasets/script && python get_instance_for_tsn.py\n# \u6587\u4ef6\u540d\u6309\u7167\u5982\u4e0b\u683c\u5f0f\n'{}_{}_{}_{}'.format(video_basename, start_id, end_id, label)\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- basketball # 
xx\u6570\u636e\u96c6\n |-- input_for_tsn # tsn/tsm\u8bad\u7ec3\u7684\u6570\u636e\n```\n#### step1.2 ppTSM\u6a21\u578b\u8bad\u7ec3\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\ncd ${PaddleVideo}\n# \u4fee\u6539config.yaml\u53c2\u6570\u4fee\u6539\u4e3a ${BasketballAcation}/configs_train/pptsm_basketball.yaml\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=$save_dir/logs \\\n main.py \\\n --validate \\\n -c {BasketballAcation}/configs_train/pptsm_basketball.yaml \\\n -o output_dir=$save_dir\n```\n#### step1.3 ppTSM\u6a21\u578b\u8f6c\u4e3a\u9884\u6d4b\u6a21\u5f0f\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0" + }, + { + "comment": "The provided code is related to a PaddleVideo application called BasketballAction. It performs two steps - extracting features from images and audio, and training a BMN model. The extracted features are stored in the datasets/basketball/features directory. The BMN training code can be found at this GitHub link, and more information about BMN can be found in this documentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":135-162", + "content": "$cd {PaddleVideo}\npython tools/export_model.py -c ${BasketballAcation}/configs_train/pptsm_basketball.yaml \\\n -p ${pptsm_train_dir}/checkpoints/models_pptsm/ppTSM_epoch_00057.pdparams \\\n -o {BasketballAcation}/checkpoints/ppTSM\n```\n#### step1.4 \u57fa\u4e8eppTSM\u89c6\u9891\u7279\u5f81\u63d0\u53d6\nimage and audio\u7279\u5f81\u63d0\u53d6\uff0c\u4fdd\u5b58\u5230datasets features\u6587\u4ef6\u5939\u4e0b\n```\ncd ${BasketballAcation}\ncd extractor && python extract_feat.py\n# \u7279\u5f81\u7ef4\u5ea6, image(2048) + audio(1024) + pcm(640)\n# \u7279\u5f81\u4fdd\u5b58\u683c\u5f0f\u5982\u4e0b\uff0c\u5c06\u5982\u4e0bdict\u4fdd\u5b58\u5728pkl\u683c\u5f0f\uff0c\u7528\u4e8e\u63a5\u4e0b\u6765\u7684BMN\u8bad\u7ec3\nvideo_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features\n 'pcm_feature': np_pcm_features}\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- basketball # xx\u6570\u636e\u96c6\n |-- features # \u89c6\u9891\u7684\u56fe\u50cf+\u97f3\u9891\u7279\u5f81\n```\n### step2 BMN\u8bad\u7ec3\nBMN\u8bad\u7ec3\u4ee3\u7801\u4e3a\uff1ahttps://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\nBMN\u6587\u6863\u53c2\u8003\uff1ahttps://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/localization/bmn.md" + }, + { + "comment": "Step 2.1 involves processing the Basketball Action dataset to generate binary proposals for BMN training, with a window size of 40. This is done using the get_instance_for_bmn.py script in the datasets/script directory. The resulting data format consists of instance identifiers, duration and feature frame numbers, subset information (train or test), and annotations containing segment locations and labels. 
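To make the proposal format described above concrete, here is a small hypothetical helper that packs one 40-second window into a record with the same keys (`duration_second`, `duration_frame`, `feature_frame`, `subset`, `annotations`). The function name and the sample label are invented for illustration; only the key layout comes from the excerpt.

```python
import json


def make_bmn_record(video_id, start_sec, end_sec, actions, fps=5):
    """actions: list of (start, end, label_id, label_name) relative to the window."""
    duration_second = float(end_sec - start_sec)          # 40 s window in the doc
    duration_frame = int(duration_second * fps)           # 200 frames at fps=5
    record = {
        "duration_second": duration_second,
        "duration_frame": duration_frame,
        "feature_frame": duration_frame,
        "subset": "train",
        "annotations": [
            {"segment": [float(s), float(e)], "label": str(float(lid)), "label_name": name}
            for s, e, lid, name in actions
        ],
    }
    return {"{}_{}_{}".format(video_id, start_sec, end_sec): record}


print(json.dumps(make_bmn_record("719b0a4b", 753, 793, [(15.0, 22.0, 6, "jump ball")]),
                 ensure_ascii=False, indent=2))
```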
Step 2.2 involves training the BMN model, requiring modification of the config.yaml file with Basketball Action-specific parameters and launching the main.py script using PaddlePaddle's distributed training functionality.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":164-205", + "content": "#### step2.1 BMN\u8bad\u7ec3\u6570\u636e\u5904\u7406\n\u7528\u4e8e\u63d0\u53d6\u4e8c\u5206\u7c7b\u7684proposal\uff0cwindows=40\uff0c\u6839\u636egts\u548c\u7279\u5f81\u5f97\u5230BMN\u8bad\u7ec3\u6240\u9700\u8981\u7684\u6570\u636e\u96c6\n```\ncd ${BasketballAcation}\ncd datasets/script && python get_instance_for_bmn.py\n# \u6570\u636e\u683c\u5f0f\n{\n \"719b0a4bcb1f461eabb152298406b861_753_793\": {\n \"duration_second\": 40.0,\n \"duration_frame\": 200,\n \"feature_frame\": 200,\n \"subset\": \"train\",\n \"annotations\": [\n {\n \"segment\": [\n 15.0,\n 22.0\n ],\n \"label\": \"6.0\",\n \"label_name\": \"\u8df3\u7403\"\n }\n ]\n },\n ...\n}\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- basketball # xx\u6570\u636e\u96c6\n |-- input_for_bmn # bmn\u8bad\u7ec3\u7684proposal \n```\n#### step2.2 BMN\u6a21\u578b\u8bad\u7ec3\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\ncd ${PaddleVideo}\n# \u4fee\u6539config.yaml\u53c2\u6570\u4fee\u4e3a${BasketballAcation}/configs_train/bmn_basketball.yaml\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1\" \\\n --log_dir=$out_dir/logs \\\n main.py \\" + }, + { + "comment": "The code is used in the BasketballAction application of the PaddleVideo library. It converts the BMN model to prediction mode (step2.3), exports it, and then performs BMN-based predictions (step2.4) for obtaining action proposal information like start, end, and score. This helps identify basketball actions from given videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":206-242", + "content": " --validate \\\n -c ${BasketballAcation}/configs_train/bmn_basketball.yaml \\\n -o output_dir=$out_dir\n```\n#### step2.3 BMN\u6a21\u578b\u8f6c\u4e3a\u9884\u6d4b\u6a21\u5f0f\n```\n# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0\n${PaddleVideo}\npython tools/export_model.py -c $${BasketballAcation}/configs_train/bmn_basketball.yaml \\\n -p ${bmn_train_dir}/checkpoints/models_bmn/bmn_epoch16.pdparams \\\n -o {BasketballAcation}/checkpoints/BMN\n```\n#### step2.4 BMN\u6a21\u578b\u9884\u6d4b\n\u5f97\u5230\u52a8\u4f5cproposal\u4fe1\u606f\uff1a start_id, end_id, score\n```\ncd ${BasketballAcation}\ncd extractor && python extract_bmn.py\n# \u6570\u636e\u683c\u5f0f\n[\n {\n \"video_name\": \"c9516c903de3416c97dae91a59e968d7\",\n \"num_proposal\": 5534,\n \"bmn_results\": [\n {\n \"start\": 7850.0,\n \"end\": 7873.0,\n \"score\": 0.77194699622342\n },\n {\n \"start\": 4400.0,\n \"end\": 4443.0,\n \"score\": 0.7663803287641536\n },\n ...\n ]" + }, + { + "comment": "This code represents a JSON object containing information about a dataset for LSTM training. It includes the frame rate (fps), whether it's for training or validation, the total number of frames, the number of ground truth (gt) instances, and the number of proposals. The proposals contain details like label, normalized IOU, start time, end time, and score. 
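The `norm_iou` / `norm_ioa` fields mentioned here are temporal overlap measures between a BMN proposal and the ground-truth action it hits. The helper below is an illustrative temporal-IoU computation (not the project's code); with the proposal and ground-truth frame ids from the sample record that follows, it reproduces the 0.7575… value shown there.

```python
def temporal_iou(proposal, gt):
    """proposal/gt are (start, end) pairs in frame ids."""
    inter = max(0, min(proposal[1], gt[1]) - max(proposal[0], gt[0]))
    union = (proposal[1] - proposal[0]) + (gt[1] - gt[0]) - inter
    return inter / union if union > 0 else 0.0


# Proposal 5011-5036 against ground truth 5003-5036 -> 25 / 33 = 0.7575...
print(temporal_iou((5011, 5036), (5003, 5036)))
```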
This data is used to train a LSTM model in BasketballAction application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":243-283", + "content": " },\n ...\n]\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- basketball # xx\u6570\u636e\u96c6\n |-- feature_bmn\n |-- prop.json # bmn \u9884\u6d4b\u7ed3\u679c\n```\n### step3 LSTM\u8bad\u7ec3\nLSTM\u8bad\u7ec3\u4ee3\u7801\u4e3a\uff1atrain_lstm\n#### step3.1 LSTM\u8bad\u7ec3\u6570\u636e\u5904\u7406\n\u5c06BMN\u5f97\u5230\u7684proposal\u622a\u65ad\u5e76\u5904\u7406\u6210LSTM\u8bad\u7ec3\u6240\u9700\u6570\u636e\u96c6\n```\ncd ${BasketballAcation}\ncd datasets/script && python get_instance_for_lstm.py\n# \u6570\u636e\u683c\u5f0f1\uff0clabel_info\n{\n \"fps\": 5,\n \"results\": [\n {\n \"url\": \"https://xxx.mp4\",\n \"mode\": \"train\", # train or validation\n \"total_frames\": 6128,\n \"num_gts\": 93,\n \"num_proposals\": 5043,\n \"proposal_actions\": [\n {\n \"label\": 6,\n \"norm_iou\": 0.7575757575757576,\n \"norm_ioa\": 0.7575757575757576,\n \"norm_start\": -0.32,\n \"proposal\": {\n \"start\": 5011,\n \"end\": 5036,\n \"score\": 0.7723643666324231\n }," + }, + { + "comment": "This code is from the PaddleVideo library's BasketballAction application and provides information on data formats for LSTM training. The first format contains label information, start and end IDs in a JSON object. The second format includes features like audio, pcm, fps, and label info in a NumPy array. The third format is the label.txt file for LSTM training. After completing these steps, the trained data will be stored in the \"input_for_lstm\" folder within the BasketballAction dataset folder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":284-318", + "content": " \"hit_gts\": {\n \"label_ids\": [\n 6\n ],\n \"label_names\": [\n \"\u8df3\u7403\"\n ],\n \"start_id\": 5003,\n \"end_id\": 5036\n }\n },\n ...\n },\n ...\n}\n# \u6570\u636e\u683c\u5f0f2\uff0cLSTM\u8bad\u7ec3\u6240\u9700\u8981\u7684feature\n{\n 'features': np.array(feature_hit, dtype=np.float32), # TSM audio and pcm \u7279\u5f81, \u53ef\u6839\u636e\u9700\u6c42\u9009\u62e9\u7ec4\u5408\n 'feature_fps': 5, # fps = 5\n 'label_info': {'norm_iou': 0.5, 'label': 3, ...}, # \u6570\u636e\u683c\u5f0f1\u4e2d\u7684'proposal_actions'\n 'video_name': 'c9516c903de3416c97dae91a59e968d7' # video_name\n}\n# \u6570\u636e\u683c\u5f0f3\uff0cLSTM\u8bad\u7ec3\u6240\u9700label.txt\n'{} {}'.format(filename, label)\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- basketball # xx\u6570\u636e\u96c6\n |-- input_for_lstm # LSTM\u8bad\u7ec3\u6570\u636e\u96c6\n```\n#### step3.2 LSTM\u8bad\u7ec3\n```\n#conf.yaml\u4fee\u6539\u4e3a ${BasketballAcation}/configs_train/lstm_basketball.yaml" + }, + { + "comment": "The code is converting the trained LSTM model to prediction mode, running model inference on testing data, and evaluating the results. 
This process involves using pre-prepared datasets and models from provided URLs for easy execution of prediction and evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":319-364", + "content": "cd ${BasketballAcation}\npython -u scenario_lib/train.py \\\n --model_name=ActionNet \\\n --config=${BasketballAcation}/configs_train/lstm_basketball.yaml \\\n --save_dir=${out_dir}\"/models_lstm/\" \\\n --log_interval=5 \\\n --valid_interval=1\n```\n#### step3.3 LSTM\u6a21\u578b\u8f6c\u4e3a\u9884\u6d4b\u6a21\u5f0f\n```\n${BasketballAcation}\npython tools/export_model.py -c ${BasketballAction}/train_lstm/conf/conf.yaml \\\n -p ${lstm_train_dir}/checkpoints/models_lstm/bmn_epoch29.pdparams \\\n -o {BasketballAcation}/checkpoints/LSTM\n```\n## \u6a21\u578b\u63a8\u7406\n\u6d4b\u8bd5\u6570\u636e\u683c\u5f0f\uff0c\u53ef\u53c2\u8003\u4f7f\u7528\u6837\u4f8b\n```\nwget https://videotag.bj.bcebos.com/Applications/basketball/datasets.tar.gz\n```\n\u6d4b\u8bd5\u6a21\u578b\uff0c\u53ef\u4f7f\u7528\u6211\u4eec\u63d0\u4f9b\u7684\u6a21\u578b\n```\nwget https://videotag.bj.bcebos.com/Applications/basketball/checkpoints_basketball.tar.gz\n```\n\u8fd0\u884c\u9884\u6d4b\u4ee3\u7801\n```\ncd ${BasketballAction}\ncd predict\n# \u5982\u679c\u4f7f\u7528\u81ea\u5df1\u8bad\u7ec3\u7684\u6a21\u578b\uff0c\u8bf7\u5c06\u5404\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u8f6c\u6362\u7684inference\u6a21\u578b\u653e\u5230predict\u5e93\n# cp -rf ../checkpoints checkpoints_basketball\npython predict.py\n```\n\u4ea7\u51fa\u6587\u4ef6\n```\n${BasketballAction}/predict/results.json\n```\n## \u6a21\u578b\u8bc4\u4f30\n```\ncd ${BasketballAction}\ncd predict\npython eval.py results.json" + }, + { + "comment": "This code uses the TSM and BMN models for efficient video understanding, allowing for action detection with a F1-score of 80.14%. 
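The F1 number quoted here is the usual harmonic mean of detection precision and recall; a quick worked example with made-up precision/recall values (not the project's measurements):

```python
def f1_score(precision, recall):
    return 2 * precision * recall / (precision + recall)


# e.g. precision 0.83 and recall 0.775 give an F1 of roughly 0.80
print(round(f1_score(0.83, 0.775), 4))
```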
The code can optimize based on motion speed, time distribution, and feature fusion methods, and is applicable to other action detection scenarios.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":365-388", + "content": "```\n## \u6a21\u578b\u4f18\u5316\n\u5728\u5b9e\u9645\u4f7f\u7528\u573a\u666f\u4e2d\u53ef\u6839\u636e\u89c6\u9891\u5185\u5bb9\u5c1d\u8bd5\u4f18\u5316\u7b56\u7565\n- \u53ef\u6839\u636e\u52a8\u4f5c\u8fd0\u52a8\u901f\u5ea6\uff0c\u8c03\u6574\u62bd\u5e27\u91c7\u6837\u7387\uff0c\u672c\u4ee3\u7801\u9ed8\u8ba4\u4e3afps=5\n- \u7edf\u8ba1\u52a8\u4f5c\u7684\u65f6\u95f4\u5206\u5e03\uff0c\u8c03\u6574bmn\u91c7\u6837\u7a97\u53e3\n- \u6839\u636e\u56fe\u50cf\u548c\u97f3\u9891\u7684\u5173\u8054\u7a0b\u5ea6\uff0c\u8c03\u6574\u56fe\u50cf\u548c\u97f3\u9891\u7279\u5f81\u7684\u878d\u5408\u65b9\u5f0f\uff1a\u672c\u4ee3\u7801\u5c06\u56fe\u50cf\u7279\u5f81\u548c\u97f3\u9891\u5728\u65f6\u95f4\u7ef4\u5ea6\u5bf9\u9f50\uff0c\u878d\u5408\u540e\u518d\u8fdb\u5165\u6a21\u578b\u8bad\u7ec3\u3002\u4e5f\u53ef\u5c1d\u8bd5\u5206\u522b\u6a21\u578b\u8bad\u7ec3\u540e\uff0c\u52a0\u6743\u878d\u5408\u7b49\n- \u672c\u4ee3\u7801\u7684\u89e3\u51b3\u65b9\u6848\u4e5f\u53ef\u7528\u4e8e\u5176\u4ed6\u52a8\u4f5c\u68c0\u6d4b\u3002\u53d8\u6362\u573a\u666f\u540e\uff0c\u56fe\u50cf\u7279\u5f81\u91cd\u65b0\u8bad\u7ec3\u6548\u679c\u66f4\u597d\u3002\u97f3\u9891\u7279\u5f81\u91c7\u7528\u7684VGGSound\u8bad\u7ec3\uff0c\u5982\u679c\u4f7f\u7528\u573a\u666f\u4ecd\u4e3a\u751f\u6d3b\u573a\u666f\uff0c\u53ef\u76f4\u63a5\u590d\u7528\u3002\n## \u6a21\u578b\u90e8\u7f72\n\u672c\u4ee3\u7801\u89e3\u51b3\u65b9\u6848\u5728\u52a8\u4f5c\u7684\u68c0\u6d4b\u548c\u53ec\u56de\u6307\u6807F1-score=80.14%\n
\n
\n
\n## \u53c2\u8003\u8bba\u6587\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kot" + }, + { + "comment": "Code represents authors of a paper or contributors to the project, including hari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/README.md\":388-388", + "content": "hari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0b0d02e7-2362-44bd-bc2e-463de08cb7d5.json b/docs/doc/0b0d02e7-2362-44bd-bc2e-463de08cb7d5.json new file mode 100644 index 000000000..57618b78a --- /dev/null +++ b/docs/doc/0b0d02e7-2362-44bd-bc2e-463de08cb7d5.json @@ -0,0 +1,20 @@ +{ + "summary": "The MoViNetRecognizerFrame class, extending BaseRecognizer, has forward_net and train_step methods for training steps. Three functions - forward_net, test_step, and infer_step are defined for model's testing or inference process.", + "details": [ + { + "comment": "The code is defining a class named \"MoViNetRecognizerFrame\" which extends the BaseRecognizer class. It has two methods, forward_net and train_step. The forward_net method defines how the model will run from input to output by first cleaning activation buffers in the backbone and then passing the inputs through it to get outputs. Finally, the head is applied on these outputs to get class scores. The train_step method defines a training step for this model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom .base import BaseRecognizer\nfrom ...registry import RECOGNIZERS\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass MoViNetRecognizerFrame(BaseRecognizer):\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n self.backbone.clean_activation_buffers()\n outputs = self.backbone(imgs)\n cls_score = self.head(outputs)\n return cls_score\n def train_step(self, data_batch):" + }, + { + "comment": "Training step: Implements a training step for the model, taking data_batch as input. 
Extracts images and labels, transposes data, applies forward pass in the network, calculates loss metrics, and computes top-1 and top-5 accuracy scores. Returns output with 'loss', 'top1', and 'top5' keys.\nValidating step: Implements a validating step for the model, similar to training step but used to validate the model on unseen data. Computes top-1 and top-5 accuracy scores along with loss metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py\":33-56", + "content": " \"\"\"Training step.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1] #.astype(\"int64\")\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss_func(cls_score, labels)\n top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)\n output = {'loss': loss_metrics, 'top1': top1, 'top5': top5}\n return output\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1] #.astype(\"int64\")\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss_func(cls_score, labels)\n top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)" + }, + { + "comment": "This code defines three functions: `forward_net`, `test_step`, and `infer_step`. The `forward_net` function is the core of the model, responsible for forward propagation. The `test_step` and `infer_step` functions both take in a data batch, transpose the images, call the `forward_net` function to get class scores, and return these scores. 
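A standalone sketch of the tensor bookkeeping described above, using random tensors in place of real clips so it runs without the recognizer; the shapes and the class count (400) are illustrative assumptions.

```python
import paddle

imgs = paddle.rand([8, 16, 3, 224, 224])             # [N, T, C, H, W] input clip
data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])  # -> [N, C, T, H, W] for the backbone

cls_score = paddle.rand([8, 400])                    # stand-in for forward_net(data)
labels = paddle.randint(0, 400, shape=[8, 1])        # ground-truth class ids (int64)
top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1)
top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5)
print(float(top1), float(top5))
```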
These steps are likely part of a deep learning model's testing or inference process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py\":57-77", + "content": " output = {'loss': loss_metrics, 'top1': top1, 'top5': top5}\n return output\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n imgs = data_batch[0]\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n # call forward\n cls_score = self.forward_net(data)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n imgs = data_batch[0]\n # call forward\n data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4])\n cls_score = self.forward_net(data)\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0bc264e7-d7fa-48cf-9fb5-ce251a6cd3c3.json b/docs/doc/0bc264e7-d7fa-48cf-9fb5-ce251a6cd3c3.json new file mode 100644 index 000000000..9b96608c7 --- /dev/null +++ b/docs/doc/0bc264e7-d7fa-48cf-9fb5-ce251a6cd3c3.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is used to find and include the Git package, declare an external project named \"extern_Autolog\" using FetchContent, set its base directory, specify the repository URL and tag, and finally make the external project available for use.", + "details": [ + { + "comment": "This code is used to find and include the Git package, declare an external project named \"extern_Autolog\" using FetchContent, set its base directory, specify the repository URL and tag, and finally make the external project available for use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/external-cmake/auto-log.cmake\":0-11", + "content": "find_package(Git REQUIRED)\ninclude(FetchContent)\nset(FETCHCONTENT_BASE_DIR \"${CMAKE_CURRENT_BINARY_DIR}/third-party\")\nFetchContent_Declare(\n extern_Autolog\n PREFIX autolog\n GIT_REPOSITORY https://github.com/LDOUBLEV/AutoLog.git\n GIT_TAG main\n)\nFetchContent_MakeAvailable(extern_Autolog)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0bd12859-cb6d-45f6-9e9d-231b130a9b71.json b/docs/doc/0bd12859-cb6d-45f6-9e9d-231b130a9b71.json new file mode 100644 index 000000000..c59c59c4d --- /dev/null +++ b/docs/doc/0bd12859-cb6d-45f6-9e9d-231b130a9b71.json @@ -0,0 +1,10 @@ +{ + "summary": "Downloading and extracting datasets for MSRVTT dataset from remote server.", + "details": [ + { + "comment": "Downloading and extracting datasets for MSRVTT dataset from remote server.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data/download_features.sh\":0-8", + "content": "mkdir MSRVTT\ncd MSRVTT\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/aggregated_text_feats.tar\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/mmt_feats.tar\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/raw-captions.pkl\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/train_list_jsfusion.txt\nwget https://videotag.bj.bcebos.com/Data/MSRVTT/val_list_jsfusion.txt\ntar -xvf aggregated_text_feats.tar\ntar -xvf mmt_feats.tar" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0d57e211-8fb4-43a9-9c4c-ed6cedd56ac5.json b/docs/doc/0d57e211-8fb4-43a9-9c4c-ed6cedd56ac5.json new file mode 100644 index 000000000..3a13458b7 --- /dev/null +++ b/docs/doc/0d57e211-8fb4-43a9-9c4c-ed6cedd56ac5.json @@ -0,0 +1,30 @@ +{ + "summary": "This Bash script, using a configuration file and mode inputs, initializes a model, serves it via Python/C++, prepares the environment, logs execution, runs a GPU server, 
and tests a web service function with incrementing \"Count\" variable and IFS separation.", + "details": [ + { + "comment": "This code is a Bash script that takes in two arguments: the filename of a configuration file and the mode to operate in. It uses `awk` to extract a specific section from the configuration file, parses this data into variables using custom functions, and then sets up various parameters for running an image classification model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_cpp.sh\":0-27", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\nMODE=$2\ndataline=$(awk 'NR==1, NR==18{print}' $FILENAME)\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# parser serving\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython_list=$(func_parser_value \"${lines[2]}\")\ntrans_model_py=$(func_parser_value \"${lines[3]}\")\ninfer_model_dir_key=$(func_parser_key \"${lines[4]}\")\ninfer_model_dir_value=$(func_parser_value \"${lines[4]}\")\nmodel_filename_key=$(func_parser_key \"${lines[5]}\")\nmodel_filename_value=$(func_parser_value \"${lines[5]}\")\nparams_filename_key=$(func_parser_key \"${lines[6]}\")\nparams_filename_value=$(func_parser_value \"${lines[6]}\")\nserving_server_key=$(func_parser_key \"${lines[7]}\")\nserving_server_value=$(func_parser_value \"${lines[7]}\")\nserving_client_key=$(func_parser_key \"${lines[8]}\")\nserving_client_value=$(func_parser_value \"${lines[8]}\")\nserving_dir_value=$(func_parser_value \"${lines[9]}\")\nrun_model_path_key=$(func_parser_key \"${lines[10]}\")\nrun_model_path_value=$(func_parser_value \"${lines[10]}\")" + }, + { + "comment": "This code parses keys and values from a configuration file, sets directory names and filenames for saving the model, and initializes variables for later use. It uses Python and potentially C++ for model serving. 
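A rough Python analogue of what the `func_parser_key` / `func_parser_value` helpers are described as doing, assuming the colon-separated `key:value` layout commonly used by test_tipc config lines; the sample line below is hypothetical.

```python
def func_parser_key(line):
    # Everything before the first colon is treated as the key.
    return line.split(":", 1)[0].strip()


def func_parser_value(line):
    # Everything after the first colon is treated as the value.
    parts = line.split(":", 1)
    return parts[1].strip() if len(parts) > 1 else ""


line = "--model_filename:inference.pdmodel"   # hypothetical config line
print(func_parser_key(line), "->", func_parser_value(line))
```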
The code creates log files to store results of the model inference and transfer process, and prepares for the next steps involving Python scripts and possibly C++ client or server execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_cpp.sh\":28-53", + "content": "port_key=$(func_parser_key \"${lines[11]}\")\nport_value=$(func_parser_value \"${lines[11]}\")\ncpp_client_value=$(func_parser_value \"${lines[12]}\")\ninput_video_key=$(func_parser_key \"${lines[13]}\")\ninput_video_value=$(func_parser_value \"${lines[13]}\")\nLOG_PATH=\"./test_tipc/output/log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_serving.log\"\nfunction func_serving(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n # phase 1: save model\n set_dirname=$(func_set_params \"${infer_model_dir_key}\" \"${infer_model_dir_value}\")\n set_model_filename=$(func_set_params \"${model_filename_key}\" \"${model_filename_value}\")\n set_params_filename=$(func_set_params \"${params_filename_key}\" \"${params_filename_value}\")\n set_serving_server=$(func_set_params \"${serving_server_key}\" \"${serving_server_value}\")\n set_serving_client=$(func_set_params \"${serving_client_key}\" \"${serving_client_value}\")\n python_list=(${python_list})\n python=${python_list[0]}\n trans_log=\"${LOG_PATH}/cpp_trans_model.log\"" + }, + { + "comment": "This code snippet modifies a serving configuration file, sets up the environment for running a C++ server and client, logs their execution, and finally runs the C++ server on GPU.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_cpp.sh\":54-72", + "content": " trans_model_cmd=\"${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client} > ${trans_log} 2>&1 \"\n eval ${trans_model_cmd}\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${trans_model_cmd}\" \"${status_log}\" \"${model_name}\"\n # modify the alias name of fetch_var to \"outputs\"\n server_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_server_value/serving_server_conf.prototxt\"\n eval ${server_fetch_var_line_cmd}\n client_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_client_value/serving_client_conf.prototxt\"\n eval ${client_fetch_var_line_cmd}\n cd ${serving_dir_value}\n echo $PWD\n unset https_proxy\n unset http_proxy\n _save_log_path=\"${LOG_PATH}/cpp_client_infer_gpu_batchsize_1.log\"\n # phase 2: run server\n server_log_path=\"${LOG_PATH}/cpp_server_gpu.log\"\n cpp_ser" + }, + { + "comment": "The script starts a PaddlePaddle serving server, runs a client against it, and performs status checks. 
The CUDA device can be set using the GPUID parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_cpp.sh\":72-99", + "content": "ver_cmd=\"${python} -m paddle_serving_server.serve ${run_model_path_key} ${run_model_path_value} ${port_key} ${port_value} > ${server_log_path} 2>&1 &\"\n eval ${cpp_server_cmd}\n sleep 20s\n # phase 3: run client\n real_model_name=${model_name/PP-/PP}\n serving_client_conf_path=\"${serving_client_value/deploy\\/cpp_serving\\/}\"\n serving_client_conf_path=\"${serving_client_conf_path/\\/\\//}serving_client_conf.prototxt\"\n cpp_client_cmd=\"${python} ${cpp_client_value} -n ${real_model_name} -c ${serving_client_conf_path} ${input_video_key} ${input_video_value} > ${_save_log_path} 2>&1 \"\n eval ${cpp_client_cmd}\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n cd ../../\n status_check $last_status \"${cpp_server_cmd}\" \"${status_log}\" \"${model_name}\"\n ps ux | grep -i 'paddle_serving_server' | awk '{print $2}' | xargs kill -s 9\n}\n# set cuda device\nGPUID=$3\nif [ ${#GPUID} -le 0 ];then\n env=\" \"\nelse\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\nfi\nset CUDA_VISIBLE_DEVICES\neval $env" + }, + { + "comment": "This code is executing a test function for serving a web service, incrementing the \"Count\" variable and using IFS to separate the function arguments with \"|\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_cpp.sh\":102-106", + "content": "echo \"################### run test ###################\"\nexport Count=0\nIFS=\"|\"\nfunc_serving \"${web_service_cmd}\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0dfe1f64-db69-47ea-a14e-d7398914567f.json b/docs/doc/0dfe1f64-db69-47ea-a14e-d7398914567f.json new file mode 100644 index 000000000..ca4b3af14 --- /dev/null +++ b/docs/doc/0dfe1f64-db69-47ea-a14e-d7398914567f.json @@ -0,0 +1,30 @@ +{ + "summary": "This code develops a PaddlePaddle 2.1 video quality assessment model using ppTSM network on KonVid-150k dataset, supports multigpu distributed training and evaluation, and references two papers for improved user experience and SROCC/PLCC scores.", + "details": [ + { + "comment": "This code is for a video quality assessment model developed using PaddlePaddle 2.1. It uses the ppTSM network and is trained on KonVid-150k dataset, which contains 153842 UGC videos. The model can analyze video content to determine its quality, improve video previews, and enhance user experience. 
Requires specific environment setup and dependencies like Python 3.7, CUDA 10.1, and cuDNN 7.6.4.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/README.md\":0-57", + "content": "# \u89c6\u9891\u8d28\u91cf\u8bc4\u4ef7\u6a21\u578b\n---\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u6d4b\u8bd5](#\u6a21\u578b\u6d4b\u8bd5)\n- [\u6a21\u578b\u4f18\u5316](#\u6a21\u578b\u4f18\u5316)\n- [\u6a21\u578b\u90e8\u7f72](#\u6a21\u578b\u90e8\u7f72)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n## \u6a21\u578b\u7b80\u4ecb\n\u8be5\u4ee3\u7801\u5e93\u4e3b\u8981\u57fa\u4e8epaddle2.1\u7248\u672c\u5f00\u53d1\uff0c\u4e3b\u8981\u662f\u5728ppTSM\u7f51\u7edc\u6a21\u578b\u7684\u57fa\u7840\u4e0a\u4fee\u6539\u7684\u4e00\u79cd\u65e0\u53c2\u8003\u89c6\u9891\u8d28\u91cf\u8bc4\u4f30\u65b9\u6cd5\uff0c\u901a\u8fc7\u8bfb\u5165\u89c6\u9891\u7684\u89c6\u9891\u5e27\u6765\u5224\u65ad\u8be5\u89c6\u9891\u7684\u8d28\u91cf\u3002\n\u9488\u5bf9\u89c6\u9891\u5185\u5bb9\u7684\u7406\u89e3\uff0c\u53ef\u4ee5\u81ea\u52a8\u5206\u6790\u89c6\u9891\u5185\u5bb9\u7684\u8d28\u91cf\uff0c\u5e2e\u52a9\u9009\u51fa\u6700\u4f18\u7684\u5173\u952e\u5e27\u6216\u5173\u952e\u7247\u6bb5\u4f5c\u4e3a\u89c6\u9891\u5c01\u9762\uff0c\u63d0\u5347\u89c6\u9891\u7684\u70b9\u51fb\u8f6c\u6362\u548c\u7528\u6237\u4f53\u9a8c\u3002\n\u672c\u9879\u76ee\u76ee\u524d\u652f\u6301Linux\u4e0b\u7684GPU\u5355\u5361\u548c\u591a\u5361\u8fd0\u884c\u73af\u5883\u3002\n## \u6570\u636e\u51c6\u5907\n```\n\u6570\u636e\u96c6\u6765\u81ea\u516c\u5f00\u6570\u636e\u96c6KonVid-150k\uff0c\u5171153842\u4e2augc\u89c6\u9891\uff0c\u5176\u4e2d\u8bad\u7ec3\u96c6(KonVid-150k-A)152265\u4e2a\uff0c\u9a8c\u8bc1\u96c6(KonVid-150k-B)1577\u4e2a\n\u793a\u4f8b\u6570\u636e\u96c6\u4ee5\u53ca\u6570\u636e\u96c6\u5b98\u7f51\u5730\u5740: datasets/dataset_url.list\n\u6570\u636e\u96c6\u6807\u6ce8\u6587\u4ef6\u4e3adataset\u4e2d\u7684train.txt\u548ceval.txt\n```\n## \u6a21\u578b\u8bad\u7ec3\n\u73af\u5883\u5b89\u88c5\uff1a\n- PaddlePaddle >= 2.1.0\n- Python >= 3.7\n- PaddleX >= 2.0.0\n- CUDA >= 10.1\n- cuDNN >= 7.6.4\n- nccl >= 2.1.2\n\u5b89\u88c5Python\u4f9d\u8d56\u5e93\uff1a\nPython\u4f9d\u8d56\u5e93\u5728[requirements.txt](https://github.com/PaddlePaddle/PaddleVideo/blob/master/requirements.txt)\u4e2d\u7ed9\u51fa\uff0c\u53ef\u901a\u8fc7\u5982\u4e0b\u547d\u4ee4\u5b89\u88c5\uff1a\n```\npython3.7 -m pip install --upgrade pip\npip3.7 install --upgrade -r requirements.txt\n```\n\u4f7f\u7528`paddle.distributed.launch`\u542f\u52a8\u6a21\u578b\u8bad\u7ec3\u548c\u6d4b\u8bd5\u811a\u672c\uff08`main.py`\uff09\uff0c\u53ef\u4ee5\u66f4\u65b9\u4fbf\u5730\u542f\u52a8\u591a\u5361\u8bad\u7ec3\u4e0e\u6d4b\u8bd5\uff0c\u6216\u76f4\u63a5\u8fd0\u884c(./run.sh)\n```shell\nsh run.sh\n```\n\u6211\u4eec\u5c06\u6240\u6709\u6807\u51c6\u7684\u542f\u52a8\u547d\u4ee4\u90fd\u653e\u5728\u4e86```run.sh```\u4e2d\uff0c\u6ce8\u610f\u9009\u62e9\u60f3\u8981\u8fd0\u884c\u7684\u811a\u672c\u3002\n\u53c2\u8003\u5982\u4e0b\u65b9\u5f0f\u542f\u52a8\u6a21\u578b\u8bad\u7ec3\uff0c`paddle.distributed.launch`\u901a\u8fc7\u8bbe\u7f6e`gpus`\u6307\u5b9aGPU\u8fd0\u884c\u5361\u53f7\uff0c" + }, + { + "comment": "This code is running PaddleVideo's multigpu distributed training in launch mode. It specifies the GPU devices to use, sets up the log directory, and uses AMP for mixed precision training. 
The `--validate` flag starts the evaluation during training and allows for updating configurations using the `-o` parameter. It also prints various metrics like loss, learning rate, batch cost, reader cost, and instances per second during train and eval phases.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/README.md\":58-97", + "content": "\u6307\u5b9a`--validate`\u6765\u542f\u52a8\u8bad\u7ec3\u65f6\u8bc4\u4f30\u3002\n```bash\n# PaddleVideo\u901a\u8fc7launch\u65b9\u5f0f\u542f\u52a8\u591a\u5361\u591a\u8fdb\u7a0b\u8bad\u7ec3\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=log_pptsm \\\n main.py \\\n --amp \\\n --validate \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml\n```\n\u5176\u4e2d\uff0c`-c`\u7528\u4e8e\u6307\u5b9a\u914d\u7f6e\u6587\u4ef6\u7684\u8def\u5f84\uff0c\u53ef\u901a\u8fc7\u914d\u7f6e\u6587\u4ef6\u4fee\u6539\u76f8\u5173\u8bad\u7ec3\u914d\u7f6e\u4fe1\u606f\uff0c\u4e5f\u53ef\u4ee5\u901a\u8fc7\u6dfb\u52a0`-o`\u53c2\u6570\u6765\u66f4\u65b0\u914d\u7f6e\uff1a\n```bash\npython -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\\n --validate \\\n -o DATASET.batch_size=16\n```\n`-o`\u7528\u4e8e\u6307\u5b9a\u9700\u8981\u4fee\u6539\u6216\u8005\u6dfb\u52a0\u7684\u53c2\u6570\uff0c\u5176\u4e2d`-o DATASET.batch_size=16`\u8868\u793a\u66f4\u6539batch_size\u5927\u5c0f\u4e3a16\u3002\n\u8fd0\u884c\u4e0a\u8ff0\u547d\u4ee4\uff0c\u5c06\u4f1a\u8f93\u51fa\u8fd0\u884c\u65e5\u5fd7\uff0c\u5e76\u9ed8\u8ba4\u4fdd\u5b58\u5728./log\u76ee\u5f55\u4e0b\uff0c\u5982\uff1a`worker.0` , `worker.1` ... , worker\u65e5\u5fd7\u6587\u4ef6\u5bf9\u5e94\u6bcf\u5f20\u5361\u4e0a\u7684\u8f93\u51fa\n\u3010train\u9636\u6bb5\u3011\u6253\u5370\u5f53\u524d\u65f6\u95f4\uff0c\u5f53\u524depoch/epoch\u603b\u6570\uff0c\u5f53\u524dbatch id\uff0c\u8bc4\u4f30\u6307\u6807\uff0c\u8017\u65f6\uff0cips\u7b49\u4fe1\u606f\uff1a\n [11/16 04:40:37] epoch:[ 1/1 ] train step:100 loss: 5.31382 lr: 0.000250 batch_cost: 0.73082 sec, reader_cost: 0.38075 sec, ips: 5.47330 instance/sec.\n\u3010eval\u9636\u6bb5\u3011\u6253\u5370\u5f53\u524d\u65f6\u95f4\uff0c\u5f53\u524depoch/epoch\u603b\u6570\uff0c\u5f53\u524dbatch id\uff0c\u8bc4\u4f30\u6307\u6807\uff0c\u8017\u65f6\uff0cips\u7b49\u4fe1\u606f\uff1a\n [11/16 04:40:37] epoch:[ 1/1 ] val step:0 loss: 4.42741 batch_cost: 1.37882 sec, reader_cost: 0.00000 sec, ips: 2.90104 instance/sec." 
+ }, + { + "comment": "Epoch completion: Prints current time, learning rate, evaluation metrics, training duration, and instances per second.\nBest epoch detection: Prints best precision achieved during training.\nResuming training: Loads checkpoint weights to continue training from a specified epoch.\nModel fine-tuning: Loads pre-trained model for custom dataset fine-tuning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/README.md\":100-143", + "content": "\u3010epoch\u7ed3\u675f\u3011\u6253\u5370\u5f53\u524d\u65f6\u95f4\uff0c\u5b66\u4e60\u7387\uff0c\u8bc4\u4f30\u6307\u6807\uff0c\u8017\u65f6\uff0cips\u7b49\u4fe1\u606f\uff1a\n [11/16 04:40:37] lr=0.00012487\n [11/16 04:40:37] train_SROCC=0.4456697876616565\n [11/16 04:40:37] train_PLCC=0.48071880604403616\n [11/16 04:40:37] END epoch:1 val loss_avg: 5.21620 avg_batch_cost: 0.04321 sec, avg_reader_cost: 0.00000 sec, batch_cost_sum: 112.69575 sec, avg_ips: 8.41203 instance/sec.\n\u5f53\u524d\u4e3a\u8bc4\u4f30\u7ed3\u679c\u6700\u597d\u7684epoch\u65f6\uff0c\u6253\u5370\u6700\u4f18\u7cbe\u5ea6\uff1a\n [11/16 04:40:57] max_SROCC=0.7116468111328617\n [11/16 04:40:57] max_PLCC=0.733503995526737\n### \u6a21\u578b\u6062\u590d\u8bad\u7ec3\n\u5982\u679c\u8bad\u7ec3\u4efb\u52a1\u7ec8\u6b62\uff0c\u53ef\u4ee5\u52a0\u8f7d\u65ad\u70b9\u6743\u91cd\u6587\u4ef6(\u4f18\u5316\u5668-\u5b66\u4e60\u7387\u53c2\u6570\uff0c\u65ad\u70b9\u6587\u4ef6)\u7ee7\u7eed\u8bad\u7ec3\u3002\n\u9700\u8981\u6307\u5b9a`-o resume_epoch`\u53c2\u6570\uff0c\u8be5\u53c2\u6570\u8868\u793a\u4ece```resume_epoch```\u8f6e\u5f00\u59cb\u7ee7\u7eed\u8bad\u7ec3.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n --amp \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\\n --validate \\\n -o resume_epoch=5\n```\n### \u6a21\u578b\u5fae\u8c03\n\u8fdb\u884c\u6a21\u578b\u5fae\u8c03\uff08Finetune\uff09\uff0c\u5bf9\u81ea\u5b9a\u4e49\u6570\u636e\u96c6\u8fdb\u884c\u6a21\u578b\u5fae\u8c03\uff0c\u9700\u8981\u6307\u5b9a `--weights` \u53c2\u6570\u6765\u52a0\u8f7d\u9884\u8bad\u7ec3\u6a21\u578b\u3002\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n --amp \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\" + }, + { + "comment": "The code is launching PaddleVideo application for video quality assessment. It uses the TSM model with regression and loads the best trained weights from \"./output/model_name/ppTSM_best.pdparams\". The --test flag is used to run the model in test mode. 
The code also suggests optimizing strategies like using original input instead of RandomCrop, changing input size for better performance, and considering aspect ratios of 16:9 and 4:3 for improved results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/README.md\":144-178", + "content": " --validate \\\n --weights=./output/model_name/ppTSM_best.pdparams\n```\nPaddleVideo\u4f1a\u81ea\u52a8**\u4e0d\u52a0\u8f7d**shape\u4e0d\u5339\u914d\u7684\u53c2\u6570\n## \u6a21\u578b\u6d4b\u8bd5\n\u9700\u8981\u6307\u5b9a `--test`\u6765\u542f\u52a8\u6d4b\u8bd5\u6a21\u5f0f\uff0c\u5e76\u6307\u5b9a`--weights`\u6765\u52a0\u8f7d\u9884\u8bad\u7ec3\u6a21\u578b\u3002\n```bash\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/recognition/tsm/pptsm_regression.yaml \\\n --test \\\n --weights=./output/model_name/ppTSM_best.pdparams\n```\n## \u6a21\u578b\u4f18\u5316\n\u5728\u5b9e\u9645\u4f7f\u7528\u573a\u666f\u4e2d\u53ef\u6839\u636e\u89c6\u9891\u8d28\u91cf\u4ee5\u53ca\u5c3a\u5bf8\u5c1d\u8bd5\u4f18\u5316\u7b56\u7565\n- \u53ef\u901a\u8fc7\u539f\u56fe\u8f93\u5165\u6765\u66ff\u6362RandomCrop:224\u64cd\u4f5c\uff0c\u51c6\u786e\u7387\u7531SROCC=0.8176,PLCC=0.8361\u63d0\u5347\u5230SROCC=0.8617,PLCC=0.8910,\u4e0d\u540c\u6a21\u578b\u4ee5\u53ca\u7279\u5f81\u589e\u5f3a\u64cd\u4f5c\u7684\u6548\u679c\u5bf9\u6bd4\u5982\u4e0b\u8868\u6240\u793a\n | \u6a21\u578b | \u7279\u5f81\u589e\u5f3a | val_SROCC | val_PLCC |\n | :----: | :-----------------------------------------: | :-------: | :------: |\n | GSTVQA | \u539f\u56fe\u8f93\u5165 | 0.7932 | 0.8006 |\n | ppTSM | train--RandomCrop=224 val--center_crop=224 | 0.8176 | 0.8361 |\n | ppTSM | train--RandomCrop=512 val--center_crop=512 | 0.8603 | 0.8822 |\n | ppTSM | \u539f\u56fe\u8f93\u5165 | 0.8617 | 0.8910 |\n- \u8003\u8651\u5e94\u7528\u573a\u666f\u89c6\u9891\u7684 aspect ratio \u5927\u90fd\u4e3a 16\uff1a9 \u548c 4\uff1a3 \u7b49\uff0c\u540c\u65f6\u4e3a\u4e86\u907f\u514d\u975e\u5747\u5300\u7f29\u653e\u62c9\u4f38\u5e26\u6765\u7684\u5e72\u6270 \uff0c\u53ef\u4ee5\u91c7\u7528\u4e86\uff08224x3\uff09x(224x2)=672x448 \u7684\u8f93\u5165\u5c3a\u5bf8\u6765\u66f4\u5145\u5206\u5f97\u5229\u7528\u6709\u9650\u7684\u8f93\u5165\u5c3a\u5bf8\u3002 " + }, + { + "comment": "This code provides a solution for video quality assessment with SROCC and PLCC scores on official validation dataset. 
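For reference, SROCC and PLCC are the Spearman and Pearson correlations between predicted and subjective quality scores. The snippet below computes both on made-up numbers; it assumes SciPy is available and is not part of the repository.

```python
from scipy.stats import pearsonr, spearmanr

predicted = [3.1, 4.2, 2.5, 4.8, 3.9]   # model quality predictions (fake)
subjective = [3.0, 4.5, 2.2, 4.9, 3.6]  # human mean-opinion scores (fake)

srocc, _ = spearmanr(predicted, subjective)
plcc, _ = pearsonr(predicted, subjective)
print(f"SROCC={srocc:.4f}, PLCC={plcc:.4f}")
```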
It references two papers: TSM: Temporal Shift Module for Efficient Video Understanding and Quality Assessment of In-the-Wild Videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/README.md\":180-188", + "content": "## \u6a21\u578b\u90e8\u7f72\n\u672c\u4ee3\u7801\u89e3\u51b3\u65b9\u6848\u5728\u5b98\u65b9\u9a8c\u8bc1\u96c6(KonVid-150k-B)\u4e0a\u7684\u6307\u6807\u6548\u679c\u4e3aSROCC=0.8176,PLCC=0.8361\u3002\n## \u53c2\u8003\u8bba\u6587\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [Quality Assessment of In-the-Wild Videos](https://dl.acm.org/citation.cfm?doid=3343031.3351028), Dingquan Li, Tingting Jiang, and Ming Jiang" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0e4af0b2-83f3-4e9a-9047-263fafe76cba.json b/docs/doc/0e4af0b2-83f3-4e9a-9047-263fafe76cba.json new file mode 100644 index 000000000..e7910496c --- /dev/null +++ b/docs/doc/0e4af0b2-83f3-4e9a-9047-263fafe76cba.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet is a part of the PaddleVideo library and it defines various loss functions, including SmoothL1Loss and L1Loss. It also provides aliases for these losses in the __all__ list.", + "details": [ + { + "comment": "This code snippet is a part of the PaddleVideo library and it defines various loss functions, including SmoothL1Loss and L1Loss. It also provides aliases for these losses in the __all__ list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py\":0-20", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .base import BaseWeightedLoss\nfrom .smooth_l1_loss import SmoothL1Loss\nfrom .l1_loss import L1Loss\n__all__ = ['SmoothL1Loss', 'L1Loss']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0e980b5c-ff4e-4180-abe2-cdd08cfcb21b.json b/docs/doc/0e980b5c-ff4e-4180-abe2-cdd08cfcb21b.json new file mode 100644 index 000000000..9a7c6a20f --- /dev/null +++ b/docs/doc/0e980b5c-ff4e-4180-abe2-cdd08cfcb21b.json @@ -0,0 +1,35 @@ +{ + "summary": "The code imports necessary modules, sets up a logger, creates an AttrDict class for config handling, and defines functions to load, visualize, and override dictionary from YAML file. It also includes 'options' and 'get_config' functions to apply overrides and print or return the updated configuration.", + "details": [ + { + "comment": "The code is importing necessary modules, defining an AttrDict class for config handling and setting up a logger. It also creates a function 'create_attr_dict' that takes in a yaml configuration file as input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/config.py\":0-33", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport yaml\nfrom paddlevideo.utils.logger import coloring, get_logger, setup_logger\n__all__ = ['get_config']\nlogger = setup_logger(\"./\", name=\"paddlevideo\", level=\"INFO\")\nclass AttrDict(dict):\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef create_attr_dict(yaml_config):" + }, + { + "comment": "The code defines two functions: \"parse_config\" and \"print_dict\". The \"parse_config\" function loads a config file into an AttrDict, while the \"print_dict\" function recursively visualizes a dictionary by indenting according to the relationship of keys.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/config.py\":34-66", + "content": " from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef print_dict(d, delimiter=0):\n \"\"\"\n Recursively visualize a dict and\n indenting acrrording by the relationship of keys.\n \"\"\"\n placeholder = \"-\" * 60\n for k, v in sorted(d.items()):\n if isinstance(v, dict):\n logger.info(\"{}{} : \".format(delimiter * \" \", coloring(k,\n \"HEADER\")))" + }, + { + "comment": "The code defines several functions related to handling and manipulating configurations. The \"print_config\" function visualizes the configs, while \"check_config\" checks the validity of the configs. 
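A minimal illustration of the attribute-style config dict described above; this is a sketch of the idea only, not the class defined in `config.py`.

```python
class AttrDict(dict):
    """Dict whose keys can also be read and written as attributes."""

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as e:
            raise AttributeError(key) from e

    def __setattr__(self, key, value):
        self[key] = value


cfg = AttrDict({"MODEL": AttrDict({"name": "ppTSM"})})
cfg.epochs = 80
print(cfg.MODEL.name, cfg["epochs"])   # ppTSM 80
```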
The \"override\" function recursively replaces values in a dictionary or list using a provided key and value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/config.py\":67-108", + "content": " print_dict(v, delimiter + 4)\n elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n logger.info(\"{}{} : \".format(delimiter * \" \",\n coloring(str(k), \"HEADER\")))\n for value in v:\n print_dict(value, delimiter + 4)\n else:\n logger.info(\"{}{} : {}\".format(delimiter * \" \",\n coloring(k, \"HEADER\"),\n coloring(v, \"OKGREEN\")))\n if k.isupper():\n logger.info(placeholder)\ndef print_config(config):\n \"\"\"\n visualize configs\n Arguments:\n config: configs\n \"\"\"\n print_dict(config)\ndef check_config(config):\n \"\"\"\n Check config\n \"\"\"\n pass\ndef override(dl, ks, v):\n \"\"\"\n Recursively replace dict of list\n Args:\n dl(dict or list): dict or list to be replaced\n ks(list): list of keys\n v(str): value to be replaced\n \"\"\"\n def str2num(v):\n try:\n return eval(v)" + }, + { + "comment": "This code is part of a function `override_config` that recursively overrides the given config with new options. The function first checks if the data `dl` is a list or dictionary, and then proceeds accordingly. If `dl` is a list, it extracts the first key from `ks`, converts it to an integer, and ensures that the index is within range of `dl`. It then updates the corresponding element in `dl` with the value `v`. If `ks` has more than one key, it calls the `override` function to update a specific field in `dl`. \n\nIf `dl` is a dictionary, it again handles two scenarios: when there's only one key and when there are multiple keys. In the single-key scenario, it checks if the key exists in `dl` (and warns if not) and updates its value. If there are multiple keys, it first asserts that the first key exists in `dl`, then calls the `override` function to update a specific field in `dl`.\n\nThe code also includes an exception handling block which simply returns the original value `v` in case of any error or exception.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/config.py\":109-138", + "content": " except Exception:\n return v\n assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n assert len(ks) > 0, ('lenght of keys should larger than 0')\n if isinstance(dl, list):\n k = str2num(ks[0])\n if len(ks) == 1:\n assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n dl[k] = str2num(v)\n else:\n override(dl[k], ks[1:], v)\n else:\n if len(ks) == 1:\n #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n if not ks[0] in dl:\n logger.warning('A new filed ({}) detected!'.format(ks[0], dl))\n dl[ks[0]] = str2num(v)\n else:\n assert ks[0] in dl, (\n '({}) doesn\\'t exist in {}, a new dict field is invalid'.format(\n ks[0], dl))\n override(dl[ks[0]], ks[1:], v)\ndef override_config(config, options=None):\n \"\"\"\n Recursively override the config\n Args:\n config(dict): dict to be replaced" + }, + { + "comment": "This code defines two functions, `options(list)` and `get_config(fname, overrides=None, show=True)`. The `options(list)` function takes a list of pairs (key-value) as input and replaces the config with new values. 
The `get_config(fname, overrides=None, show=True)` function reads the config from a file, applies any overrides, and if 'show' is True, prints the updated configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/config.py\":139-169", + "content": " options(list): list of pairs(key0.key1.idx.key2=value)\n such as: [\n epochs=20',\n 'PIPELINE.train.transform.1.ResizeImage.resize_short=300'\n ]\n Returns:\n config(dict): replaced config\n \"\"\"\n if options is not None:\n for opt in options:\n assert isinstance(opt,\n str), (\"option({}) should be a str\".format(opt))\n assert \"=\" in opt, (\n \"option({}) should contain a =\"\n \"to distinguish between key and value\".format(opt))\n pair = opt.split('=')\n assert len(pair) == 2, (\"there can be only a = in the option\")\n key, value = pair\n keys = key.split('.')\n override(config, keys, value)\n return config\ndef get_config(fname, overrides=None, show=True):\n \"\"\"\n Read config from file\n \"\"\"\n assert os.path.exists(fname), ('config file({}) is not exist'.format(fname))\n config = parse_config(fname)\n override_config(config, overrides)" + }, + { + "comment": "This code block checks if the 'show' variable is set to True, and if so, it calls a function named 'print_config' with the 'config' parameter. It then always executes another function called 'check_config' with the same 'config' argument before returning the 'config' variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/config.py\":170-173", + "content": " if show:\n print_config(config)\n check_config(config)\n return config" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0eda88ac-5c9d-46e3-b2a1-110fb5b0077f.json b/docs/doc/0eda88ac-5c9d-46e3-b2a1-110fb5b0077f.json new file mode 100644 index 000000000..159840be7 --- /dev/null +++ b/docs/doc/0eda88ac-5c9d-46e3-b2a1-110fb5b0077f.json @@ -0,0 +1,25 @@ +{ + "summary": "This code defines a function for precise batch normalization that recomputes and updates BN statistics, improving accuracy while speeding up training and saving memory.", + "details": [ + { + "comment": "The code is importing necessary libraries and defining a function for the precise batch normalization (BN) technique. This BN improves accuracy by recomputing and updating batch norm statistics to make them more precise, which can speed up training and save memory. The function takes in a model, data loader, parallel flag, number of iterations, whether to use automatic mixed precision, and the AMP level.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/precise_bn.py\":0-33", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport itertools\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n\"\"\"\nImplement precise bn, which is useful for improving accuracy.\n\"\"\"\n@paddle.no_grad() # speed up and save CUDA memory\ndef do_preciseBN(model,\n data_loader,\n parallel,\n num_iters=200,\n use_amp=False,\n amp_level=None):\n \"\"\"\n Recompute and update the batch norm stats to make them more precise. During" + }, + { + "comment": "This function recomputes the BN stats for a given model using fixed weights to improve validation accuracy. It targets specific BN layers and runs iterations to compute precise mean and variance values. This is useful when training both BN stats and weights are changing with every iteration, affecting running averages.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/precise_bn.py\":34-55", + "content": " training both BN stats and the weight are changing after every iteration, so\n the running average can not precisely reflect the actual stats of the\n current model.\n In this function, the BN stats are recomputed with fixed weights, to make\n the running average more precise. Specifically, it computes the true average\n of per-batch mean/variance instead of the running average.\n This is useful to improve validation accuracy.\n Args:\n model: the model whose bn stats will be recomputed\n data_loader: an iterator. Produce data as input to the model\n num_iters: number of iterations to compute the stats.\n Return:\n the model with precise mean and variance in bn layers.\n \"\"\"\n bn_layers_list = [\n m for m in model.sublayers()\n if any((isinstance(m, bn_type)\n for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D,\n paddle.nn.BatchNorm3D))) and m.training\n ]\n if len(bn_layers_list) == 0:\n return" + }, + { + "comment": "This code resets the momentum in Batch Normalization layers to 0, calculates precise Batch Normalization by accumulating batch means and variances across iterations, and then updates the running mean and variance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/precise_bn.py\":57-82", + "content": " # moving_mean=moving_mean*momentum+batch_mean*(1.\u2212momentum)\n # we set momentum=0. 
to get the true mean and variance during forward\n momentum_actual = [bn._momentum for bn in bn_layers_list]\n for bn in bn_layers_list:\n bn._momentum = 0.\n running_mean = [paddle.zeros_like(bn._mean)\n for bn in bn_layers_list] # pre-ignore\n running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list]\n ind = -1\n for ind, data in enumerate(itertools.islice(data_loader, num_iters)):\n logger.info(\"Computing precise BN {} / {}...\".format(\n ind + 1, num_iters))\n if use_amp:\n with paddle.amp.auto_cast(\n custom_black_list={\"reduce_mean\",\n \"conv3d\"}, level=amp_level):\n model(data, mode='train')\n else:\n model(data, mode='train')\n for i, bn in enumerate(bn_layers_list):\n # Accumulates the bn stats.\n running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1)" + }, + { + "comment": "This code updates batch normalization (BN) statistics based on the running mean, variance, and momentum. It asserts that the dataloader has run for the expected number of iterations before setting these values to the BN layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/precise_bn.py\":83-93", + "content": " running_var[i] += (bn._variance - running_var[i]) / (ind + 1)\n assert ind == num_iters - 1, (\n \"update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations.\"\n .format(num_iters, ind))\n # Sets the precise bn stats.\n for i, bn in enumerate(bn_layers_list):\n bn._mean.set_value(running_mean[i])\n bn._variance.set_value(running_var[i])\n bn._momentum = momentum_actual[i]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0fea23e0-ae4e-4c5b-908f-e22fba946bd0.json b/docs/doc/0fea23e0-ae4e-4c5b-908f-e22fba946bd0.json new file mode 100644 index 000000000..1890e527a --- /dev/null +++ b/docs/doc/0fea23e0-ae4e-4c5b-908f-e22fba946bd0.json @@ -0,0 +1,15 @@ +{ + "summary": "This code imports libraries and defines a test_model function for testing a model without gradient calculation. It sets configuration, updates model's test_step function with updated parameters, and performs multi-card testing.", + "details": [ + { + "comment": "This code imports necessary libraries and functions for testing a model. It defines a function called test_model that takes configuration (cfg), weights path (weights), and parallel flag as arguments. The function performs model testing without gradient calculation (paddle.no_grad()) to save computation resources.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom EIVideo.paddlevideo.utils import get_logger, load\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics import build_metric\nfrom ..modeling.builder import build_model\nfrom ..modeling.framework import Manet\nlogger = get_logger(\"paddlevideo\")\n@paddle.no_grad()\ndef test_model(cfg, weights, parallel=True):\n \"\"\"Test model entry\n Args:\n cfg (dict): configuration.\n weights (str): weights path to load." + }, + { + "comment": "This code sets the configuration for multi-card testing and then calls the Manet model's test_step function with updated configuration, weights, and parallel set to False.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py\":31-38", + "content": " parallel (bool): Whether to do multi-cards testing. Default: True.\n \"\"\"\n if cfg.MODEL.framework == \"Manet\":\n cfg_helper = {\"knns\": 1, \"is_save_image\": True}\n cfg.update(cfg_helper)\n final = Manet().test_step(**cfg, weights=weights, parallel=False)\n return final" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1025a9ea-3d28-4c37-9e52-1ca7ee001ee0.json b/docs/doc/1025a9ea-3d28-4c37-9e52-1ca7ee001ee0.json new file mode 100644 index 000000000..4ab62a4f5 --- /dev/null +++ b/docs/doc/1025a9ea-3d28-4c37-9e52-1ca7ee001ee0.json @@ -0,0 +1,15 @@ +{ + "summary": "This code creates a STGCNHead class in PaddlePaddle's video modeling library, initializing a convolutional layer and applying forward pass for input x to produce N, C shaped output.", + "details": [ + { + "comment": "This code snippet is for the STGCNHead class in PaddlePaddle's video modeling library. It's a subclass of BaseHead with 256 input feature channels and 10 number classes as default values, initialized using super(). This model can be customized further by passing additional keyword arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/stgcn_head.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass STGCNHead(BaseHead):\n \"\"\"\n Head for ST-GCN model.\n Args:\n in_channels: int, input feature channels. Default: 256.\n num_classes: int, number classes. 
Default: 10.\n \"\"\"\n def __init__(self, in_channels=256, num_classes=10, **kwargs):\n super().__init__(num_classes, in_channels, **kwargs)" + }, + { + "comment": "The code defines a head class with a convolutional layer, initializes its weights using a normal distribution with standard deviation 0.02, and applies the forward pass to input x by passing it through a convolutional layer and reshaping the output to shape N, C (N: number of samples, C: number of classes).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/stgcn_head.py\":32-49", + "content": " self.fcn = nn.Conv2D(in_channels=in_channels,\n out_channels=num_classes,\n kernel_size=1)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'Normal', std=0.02)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n x = self.fcn(x)\n x = paddle.reshape_(x, (x.shape[0], -1)) # N,C,1,1 --> N,C\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1068a225-d288-45b3-9649-5d785f56f7a9.json b/docs/doc/1068a225-d288-45b3-9649-5d785f56f7a9.json new file mode 100644 index 000000000..233b7f337 --- /dev/null +++ b/docs/doc/1068a225-d288-45b3-9649-5d785f56f7a9.json @@ -0,0 +1,35 @@ +{ + "summary": "The code initializes a SamplingResult class for bbox sampling and defines a RandomSampler class to sample positive and negative bboxes from assigned results, ensuring enough samples are available in each case.", + "details": [ + { + "comment": "This code is importing necessary libraries and defining a class called \"SamplingResult\" that holds the result of bbox sampling. The class has attributes for positive indices, negative indices, bboxes, gt_bboxes, assign_result, and gt_flags. It uses paddle library to index select the bboxes based on pos_inds. Negative_inds may be empty.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/random_sampler.py\":0-27", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport numpy as np\nfrom ..registry import BBOX_SAMPLERS\nclass SamplingResult():\n \"\"\"Bbox sampling result. \"\"\"\n def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,\n gt_flags):\n self.pos_inds = pos_inds\n self.neg_inds = neg_inds\n self.pos_bboxes = paddle.index_select(bboxes,pos_inds)\n # neg_inds may be empty\n if neg_inds.shape[0]!=0:" + }, + { + "comment": "This code initializes the negative bounding boxes, positive ground truth (gt) bounding boxes and labels for a sampler. It checks if there are any gt bboxes available, if not, it sets up a placeholder for them. The 'pos_bboxes' are then concatenated with the neg_bboxes if they exist. 
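For intuition, the positive/negative balancing performed by the sampler described in this file can be sketched with numpy alone: positives are capped at `num * pos_fraction` of the batch and the remainder is filled with negatives. This is a toy illustration, not the `RandomSampler` class itself, and the assignment values are dummy data.

```python
import numpy as np

rng = np.random.default_rng(0)
gt_inds = np.array([0, 2, 0, 1, 0, 0, 3, 0, 0, 0])  # >0 means matched to a GT box

num, pos_fraction = 8, 0.25
pos_pool = np.flatnonzero(gt_inds > 0)
neg_pool = np.flatnonzero(gt_inds == 0)

num_pos = min(len(pos_pool), int(num * pos_fraction))   # cap the positives
neg_quota = min(len(neg_pool), num - num_pos)            # fill the rest with negatives
pos_inds = rng.choice(pos_pool, size=num_pos, replace=False)
neg_inds = rng.choice(neg_pool, size=neg_quota, replace=False)
print(sorted(pos_inds), sorted(neg_inds))
```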
If assign_result.labels is not None, it also extracts and stores positive gt labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/random_sampler.py\":28-54", + "content": " self.neg_bboxes = paddle.index_select(bboxes,neg_inds)\n else:\n self.neg_bboxes=None\n self.pos_is_gt = paddle.index_select(gt_flags,pos_inds)\n self.num_gts = gt_bboxes.shape[0]\n self.pos_assigned_gt_inds = paddle.index_select(assign_result.gt_inds,pos_inds) - 1\n if float(gt_bboxes.numel()) == 0:\n assert self.pos_assigned_gt_inds.numel() == 0\n self.pos_gt_bboxes = paddle.empty_like(gt_bboxes).view(-1, 4)\n else:\n if len(gt_bboxes.shape) < 2:\n gt_bboxes = gt_bboxes.view(-1, 4)\n self.pos_gt_bboxes = paddle.index_select(gt_bboxes, self.pos_assigned_gt_inds)\n if assign_result.labels is not None:\n self.pos_gt_labels = paddle.index_select(assign_result.labels, pos_inds)\n else:\n self.pos_gt_labels = None\n @property\n def bboxes(self):\n if self.neg_bboxes is not None:\n ret = paddle.concat([self.pos_bboxes, self.neg_bboxes])\n else:" + }, + { + "comment": "This code defines a RandomSampler class which samples positive and negative bboxes from assigned results. It takes arguments like num, pos_fraction, neg_pos_ub, add_gt_as_proposals, etc. If add_gt_as_proposals is True and gt_bboxes are present, it raises a ValueError if gt_labels are not given. The sample method takes assign_result, bboxes, gt_bboxes, and gt_labels as arguments. It checks the shape of bboxes, converts them to 4-column format, and creates gt_flags.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/random_sampler.py\":55-91", + "content": " # neg bbox may be empty\n ret = self.pos_bboxes\n return ret\n@BBOX_SAMPLERS.register()\nclass RandomSampler():\n def __init__(self,\n num,\n pos_fraction,\n neg_pos_ub=-1,\n add_gt_as_proposals=True,\n **kwargs):\n self.num = num\n self.pos_fraction = pos_fraction\n self.neg_pos_ub = neg_pos_ub\n self.add_gt_as_proposals = add_gt_as_proposals\n def sample(self,\n assign_result,\n bboxes,\n gt_bboxes,\n gt_labels=None,\n **kwargs):\n \"\"\"Sample positive and negative bboxes. \"\"\"\n if len(bboxes.shape) < 2:\n bboxes = bboxes[None, :]\n bboxes = bboxes[:, :4]\n gt_flags = paddle.full([bboxes.shape[0], ], 0, dtype='int32')\n if self.add_gt_as_proposals and len(gt_bboxes) > 0:\n if gt_labels is None:\n raise ValueError(\n 'gt_labels must be given when add_gt_as_proposals is True')" + }, + { + "comment": "This code samples positive and negative indices for assigning ground truth labels to objects, ensuring a desired ratio of positive and negative samples. It then creates a SamplingResult object containing these indices along with bounding boxes and other information. The random_choice function is used to randomly select a specific number of samples from a given set of objects.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/random_sampler.py\":92-113", + "content": " bboxes = paddle.concat([gt_bboxes, bboxes])\n assign_result.add_gt_(gt_labels)\n gt_ones = paddle.full([gt_bboxes.shape[0], ], 1, dtype='int32')\n gt_flags = paddle.concat([gt_ones, gt_flags])\n #1. \u5f97\u5230\u6b63\u6837\u672c\u7684\u6570\u91cf, inds\n num_expected_pos = int(self.num * self.pos_fraction)\n pos_inds = self._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs)\n pos_inds = paddle.to_tensor(np.unique(pos_inds.numpy()))\n #2. 
\u5f97\u5230\u8d1f\u6837\u672c\u7684\u6570\u91cf, inds\n num_sampled_pos = pos_inds.numel()\n num_expected_neg = self.num - num_sampled_pos\n neg_inds = self._sample_neg(\n assign_result, num_expected_neg, bboxes=bboxes, **kwargs)\n neg_inds = paddle.to_tensor(np.unique(neg_inds.numpy()))\n #3. \u5f97\u5230sampling result\n sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,\n assign_result, gt_flags)\n return sampling_result\n def random_choice(self, gallery, num):" + }, + { + "comment": "The code defines a random sampler that randomly selects elements from the gallery. It has two functions: _sample_pos, which randomly samples positive samples, and _sample_neg, which randomly samples negative samples. The _sample_pos function first finds indexes of assign_result with label greater than 0 (i.e., positive samples), then checks if the number of positive samples is less than or equal to num_expected. If it's less, returns the indices; otherwise, selects num_expected random samples from the available indices using the random_choice method. The _sample_neg function does a similar process for negative samples but doesn't return the indices if their number is 0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/random_sampler.py\":114-138", + "content": " \"\"\"Random select some elements from the gallery. \"\"\"\n assert len(gallery) >= num\n perm = paddle.arange(gallery.numel())[:num]\n perm = paddle.randperm(gallery.numel())[:num] \n rand_inds = paddle.index_select(gallery, perm)\n return rand_inds\n def _sample_pos(self, assign_result, num_expected, **kwargs):\n \"\"\"Randomly sample some positive samples.\"\"\"\n #1.\u9996\u5148\u770b\u4e00\u4e0b\u7ed9\u7684bboxes\u91cc\u9762\u6709\u54ea\u4e9blabel\u662f\u5927\u4e8e0\u7684 \u5f97\u5230\u4e86\u4ed6\u4eec\u7684index\n pos_inds = paddle.nonzero(assign_result.gt_inds, as_tuple=False)\n #2. \u53ea\u8981\u8fd9\u4e2apos_inds\u7684\u6570\u76ee\u4e0d\u662f0\u4e2a \u8fd9\u4e9b\u5c31\u90fd\u53ef\u4ee5\u662fpositive sample\n # \u5f53pos_inds\u7684\u6570\u76ee\u5c0f\u4e8enum_expected(\u60f3\u8981\u7684sample\u7684\u6700\u5927\u6570\u76ee), \u5c31\u76f4\u63a5\u7528\u8fd9\u4e2apos_inds\n # \u53cd\u4e4b\u5c31\u4ece\u8fd9\u4e48\u591aindex\u91cc\u968f\u673a\u91c7\u6837num_expected\u4e2a\u51fa\u6765\n if float(pos_inds.numel()) != 0:\n pos_inds = pos_inds.squeeze() \n if float(pos_inds.numel()) <= num_expected:\n return pos_inds\n else:\n return self.random_choice(pos_inds, num_expected)\n def _sample_neg(self, assign_result, num_expected, **kwargs):\n \"\"\"Randomly sample some negative samples.\"\"\"" + }, + { + "comment": "This code checks the assign_result's gt_inds for zero values, extracts their indices in neg_inds, and if there are non-zero values, squeezes them. If the number of non-zero values is less than or equal to expected, it returns neg_inds. 
Otherwise, it uses random_choice() to select required indices from neg_inds.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/random_sampler.py\":139-145", + "content": " neg_inds = paddle.nonzero(assign_result.gt_inds == 0, as_tuple=False)\n if float(neg_inds.numel()) != 0:\n neg_inds = neg_inds.squeeze() \n if (float(neg_inds.numel())) <= float(num_expected):\n return neg_inds\n else:\n return self.random_choice(neg_inds, num_expected)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/109cd714-53b4-43e0-877e-96d62b02a891.json b/docs/doc/109cd714-53b4-43e0-877e-96d62b02a891.json new file mode 100644 index 000000000..874d89049 --- /dev/null +++ b/docs/doc/109cd714-53b4-43e0-877e-96d62b02a891.json @@ -0,0 +1,20 @@ +{ + "summary": "This code tests a model using Paddle framework, constructs multi-card datasets, enables parallel processing with DataParallel, updates state dictionary for evaluation. Batch size is set, metric object built based on configuration, and data iterated over from loader, using either parallel or sequential testing, updating metric per batch before accumulating results.", + "details": [ + { + "comment": "This code is a function named \"test_model\" which tests a given model using specified configuration and weights path. It uses Paddle framework and utilizes functions from paddlevideo.utils, loader.builder, metrics, and modeling.builder to perform the testing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/test.py\":0-34", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics import build_metric\nfrom ..modeling.builder import build_model\nfrom paddlevideo.utils import load\nimport time\nlogger = get_logger(\"paddlevideo\")\n@paddle.no_grad()\ndef test_model(cfg, weights, parallel=True):\n \"\"\"Test model entry\n Args:\n cfg (dict): configuration.\n weights (str): weights path to load." + }, + { + "comment": "This code constructs a model and dataset for multi-card testing. It uses DataParallel to enable parallel processing on multiple GPUs, builds the dataloader with specified settings, sets the model to evaluation mode, loads state dictionaries from weights file, and updates the model's state dictionary. The metric data size is set to the length of the dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/test.py\":35-65", + "content": " parallel (bool): Whether to do multi-cards testing. Default: True.\n \"\"\"\n # 1. Construct model.\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. 
Construct dataset and dataloader.\n cfg.DATASET.test.test_mode = True\n dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test))\n batch_size = cfg.DATASET.get(\"test_batch_size\", 1)\n places = paddle.set_device('gpu')\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n places=places,\n drop_last=False,\n shuffle=False)\n data_loader = build_dataloader(dataset, **dataloader_setting)\n model.eval()\n state_dicts = load(weights)\n model.set_state_dict(state_dicts)\n # add params to metrics\n cfg.METRIC.data_size = len(dataset)" + }, + { + "comment": "This code sets the batch size, builds a metric object based on configuration, and iterates over data from a loader. Inside the loop, it either uses parallel or sequential testing to get outputs, then updates the metric for each batch before accumulating results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/test.py\":66-77", + "content": " cfg.METRIC.batch_size = batch_size\n Metric = build_metric(cfg.METRIC)\n for batch_id, data in enumerate(data_loader):\n if parallel:\n outputs = model._layers.test_step(data)\n else:\n outputs = model.test_step(data)\n Metric.update(batch_id, data, outputs)\n Metric.accumulate()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/10e6c062-a9f7-4bdc-83c7-fdf664335942.json b/docs/doc/10e6c062-a9f7-4bdc-83c7-fdf664335942.json new file mode 100644 index 000000000..97ccd2d1e --- /dev/null +++ b/docs/doc/10e6c062-a9f7-4bdc-83c7-fdf664335942.json @@ -0,0 +1,45 @@ +{ + "summary": "The `test_with_pyreader` and `train_with_pyreader` functions are used in a framework to execute tests with `pyreader`, evaluate metrics, log intervals, train models, handle options like testing, saving, early stopping, measure processing time, and update metrics. The code snippet defines model saving functions, deletes directories, implements early stopping, initializes pre-trained parameters, and uses AttrDict for getter/setter functionality.", + "details": [ + { + "comment": "The code defines a function `test_with_pyreader` which takes several parameters like `exe`, `compiled_test_prog`, etc., and appears to be part of a larger framework. It seems to execute a test with the help of `pyreader` for input data, fetch list for outputs, and metrics for evaluation. The function runs on an interval specified by `log_interval`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":0-38", + "content": "\"\"\"\nutils\n\"\"\"\n# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport traceback\nimport logging\nimport shutil\nimport numpy as np\nimport paddle\nimport paddle.static as static\nimport static as static\nlogger = logging.getLogger(__name__)\ndef test_with_pyreader(exe,\n compiled_test_prog,\n test_pyreader,\n test_fetch_list,\n test_metrics,\n log_interval=0):" + }, + { + "comment": "The code tests a PaddleVideo application, using the \"test_pyreader\" to read data and runs it through a neural network. It accumulates and logs test metrics, handles exceptions and provides a final result with computed metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":39-66", + "content": " \"\"\"test_with_pyreader\n \"\"\"\n if not test_pyreader:\n logger.error(\"[TEST] get pyreader failed.\")\n test_metrics.reset()\n test_iter = 0\n label_all = []\n pred_all = []\n try:\n for data in test_pyreader():\n test_outs = exe.run(compiled_test_prog,\n fetch_list=test_fetch_list,\n feed=data)\n loss = np.array(test_outs[0])\n pred = np.array(test_outs[1])\n label = np.array(test_outs[-1])\n pred_all.extend(pred)\n label_all.extend(label)\n test_metrics.accumulate(loss, pred, label)\n test_iter += 1\n test_metrics.finalize_and_log_out(\"[TEST] Finish\")\n except Exception as e:\n logger.warn(\n \"[TEST] fail to execute test or calculate metrics: {}\".format(e))\n traceback.print_exc()\n metrics_dict, test_loss = test_metrics.get_computed_metrics()\n metrics_dict['label_all'] = label_all\n metrics_dict['pred_all'] = pred_all" + }, + { + "comment": "The function `train_with_pyreader` trains a model for the specified number of epochs and returns the test loss and metrics dictionary. It also includes options for testing and saving the model, as well as early stopping based on a defined threshold. 
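The early stopping mentioned here follows a simple patience counter: it resets whenever validation accuracy improves, decrements otherwise, and training stops once it drops below zero. A minimal sketch of that pattern follows (dummy accuracy values; the real loop also saves the best model):

```python
EARLY_STOP_NUM = 20            # same patience budget as in the training code

best_acc = 0.0
patience = EARLY_STOP_NUM
for epoch, val_acc in enumerate([0.61, 0.64, 0.63, 0.66, 0.65]):  # dummy scores
    if patience < 0:
        print('Early stop at epoch', epoch)
        break
    if val_acc > best_acc:
        best_acc = val_acc
        patience = EARLY_STOP_NUM  # reset the counter when the model improves
        # the real loop calls save_model(...) here
    else:
        patience -= 1              # otherwise spend one unit of patience
```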
The code initializes variables and enters a loop over the number of epochs, resetting training metrics and checking if early stopping should occur before each iteration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":67-95", + "content": " return test_loss, metrics_dict\ndef train_with_pyreader(exe, train_prog, compiled_train_prog, train_pyreader,\n train_fetch_list, train_metrics, epochs=10,\n log_interval=0, valid_interval=0,\n save_dir='./', save_model_name='model',\n test_exe=None, test_pyreader=None,\n test_fetch_list=None, test_metrics=None):\n \"\"\"train_with_pyreader\n \"\"\"\n if not train_pyreader:\n logger.error(\"[TRAIN] get pyreader failed.\")\n EARLY_STOP_NUM = 20\n early_stop = EARLY_STOP_NUM\n global_iter = 0\n train_iter = 0\n iter_all = 0\n best_test_acc1 = 0\n for epoch in range(epochs):\n lr = static.global_scope().find_var(\"learning_rate\").get_tensor()\n logger.info(\n \"------- learning rate {}, learning rate counter -----\".format(\n np.array(lr)))\n if early_stop < 0:\n logger.info('Earyly Stop !!!')\n break\n train_metrics.reset()" + }, + { + "comment": "This code is part of a training loop for a machine learning model. It keeps track of the current iteration, measures the time taken for processing each data batch, and updates loss, prediction, and label metrics. If logging interval is met, it finalizes and logs the metrics for the current epoch's iteration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":96-118", + "content": " global_iter += train_iter\n epoch_periods = []\n for data in train_pyreader():\n try:\n cur_time = time.time()\n train_outs = exe.run(compiled_train_prog,\n fetch_list=train_fetch_list,\n feed=data)\n iter_all += 1\n period = time.time() - cur_time\n epoch_periods.append(period)\n loss = np.array(train_outs[0])\n pred = np.array(train_outs[1])\n label = np.array(train_outs[-1])\n train_metrics.accumulate(loss, pred, label)\n if log_interval > 0 and (train_iter % log_interval == 0):\n # eval here\n train_metrics.finalize_and_log_out(\n info='[TRAIN] Epoch {} iter {} everage: '.format(epoch, train_iter))\n train_iter += 1\n except Exception as e:\n logger.info(\n \"[TRAIN] Epoch {}, iter {} data training failed: {}\"." + }, + { + "comment": "The code finishes an epoch of training, logs the average time taken, and finalizes training metrics. If testing is enabled and a valid interval is set, it performs testing after each valid interval iteration and saves models with the best test accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":119-140", + "content": " format(epoch, train_iter, str(e)))\n if len(epoch_periods) < 1:\n logger.info(\n 'No iteration was executed, please check the data reader')\n sys.exit(1)\n logger.info(\n '[TRAIN] Epoch {} training finished, average time: {}'.format(\n epoch, np.mean(epoch_periods)))\n train_metrics.finalize_and_log_out( \\\n info='[TRAIN] Finished ... 
Epoch {} all iters average: '.format(epoch))\n # save models of min loss in best acc epochs\n if test_exe and valid_interval > 0 and (epoch +\n 1) % valid_interval == 0:\n # metrics_dict,loss = train_metrics.calculator.get_computed_metrics()\n loss, metrics_dict_test = test_with_pyreader(\n exe, test_exe, test_pyreader, test_fetch_list, test_metrics,\n log_interval)\n test_acc1 = metrics_dict_test['avg_acc1']\n if test_acc1 > best_test_acc1:\n best_test_acc1 = test_acc1" + }, + { + "comment": "This code snippet defines functions for saving the model at specific epochs and after training has stopped. It checks if a directory with the model name exists, deletes it if necessary, and then saves the model using either fluid.io or static methods. The save_model function takes in execution context, program, save directory, model name, and optional postfix for the file name. The save_model_persist function is similar but uses the save_model method to save the model. The code also includes a check to stop training if the early stopping condition is met.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":141-168", + "content": " save_model(exe, train_prog, save_dir, save_model_name,\n \"_epoch{}_acc{}\".format(epoch, best_test_acc1))\n early_stop = EARLY_STOP_NUM\n else:\n early_stop -= 1\ndef save_model(exe, program, save_dir, model_name, postfix=None):\n \"\"\"save_model\n \"\"\"\n model_path = os.path.join(save_dir, model_name + postfix)\n if os.path.isdir(model_path):\n shutil.rmtree(model_path)\n # fluid.io.save_persistables(exe, model_path, main_program=program)\n save_vars = [x for x in program.list_vars() \\\n if isinstance(x, paddle.framework.Parameter)]\n static.save_vars(exe,\n dirname=model_path,\n main_program=program,\n vars=save_vars,\n filename=\"param\")\ndef save_model_persist(exe, program, save_dir, model_name, postfix=None):\n \"\"\"save_model\"\"\"\n model_path = os.path.join(save_dir, model_name + postfix)\n if os.path.isdir(model_path):" + }, + { + "comment": "This function initializes the pre-trained parameters for a model. It first checks if the pretraining_params_path exists, and then loads any existing variables in the main program using static.load_vars(). If var is not a Parameter instance, it will return False. Finally, it logs that the pretraining parameters were loaded from the given path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":169-200", + "content": " shutil.rmtree(model_path)\n paddle.fluid.io.save_persistables(exe,\n save_dir,\n main_program=program,\n filename=model_path)\ndef init_pretraining_params(exe,\n pretraining_params_path,\n main_program,\n use_fp16=False):\n \"\"\"\n init pretrain_params\n \"\"\"\n assert os.path.exists(pretraining_params_path\n ), \"[%s] cann't be found.\" % pretraining_params_path\n def existed_params(var):\n \"\"\"\n Load existed params\n \"\"\"\n if not isinstance(var, paddle.framework.Parameter):\n return False\n flag = os.path.exists(os.path.join(pretraining_params_path, var.name))\n return flag\n static.load_vars(exe,\n pretraining_params_path,\n main_program=main_program,\n predicate=existed_params)\n logger.info(\n \"Load pretraining parameters from {}.\".format(pretraining_params_path))" + }, + { + "comment": "The code defines a subclass of Python's dictionary, named AttrDict. 
It overrides the `__getattr__` and `__setattr__` methods to provide getter and setter functionality for dictionary keys as attributes, similar to regular class attributes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py\":203-217", + "content": "class AttrDict(dict):\n \"\"\"AttrDict\n \"\"\"\n def __getattr__(self, key):\n \"\"\"getter\n \"\"\"\n return self[key]\n def __setattr__(self, key, value):\n \"\"\"setter\n \"\"\"\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value" + } + ] +} \ No newline at end of file diff --git a/docs/doc/10f9f848-c5bd-4ec0-9f2f-6c8f72cc311c.json b/docs/doc/10f9f848-c5bd-4ec0-9f2f-6c8f72cc311c.json new file mode 100644 index 000000000..423593d4b --- /dev/null +++ b/docs/doc/10f9f848-c5bd-4ec0-9f2f-6c8f72cc311c.json @@ -0,0 +1,65 @@ +{ + "summary": "The code incorporates image processing and Graph class, supports layouts like 'stgcn' and 'coco_keypoint'. It defines a STGCN model for spatio-temporal data processing using ConvTemporalGraphical layer. The code creates a STGCN class for skeleton-based action recognition with edge importance, applies networks, pools results, and averages before returning output.", + "details": [ + { + "comment": "This code snippet imports necessary libraries, defines several functions (zero, iden, einsum), and registers the BACKBONES. The purpose of this module seems to be defining backbone architectures or functions used in image processing tasks. However, more context is needed to understand the specific functionality of these functions or their use within the BACKBONES registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":0-36", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\ndef zero(x):\n return 0\ndef iden(x):\n return x\ndef einsum(x, A):\n \"\"\"paddle.einsum will be implemented in release/2.2.\n \"\"\"\n x = x.transpose((0, 2, 3, 1, 4))\n n, c, t, k, v = x.shape\n k2, v2, w = A.shape\n assert (k == k2 and v == v2), \"Args of einsum not match!\"" + }, + { + "comment": "This code defines a Graph class and three functions: get_hop_distance, normalize_digraph, and a constructor for the Graph class. The Graph class initializes with layout, strategy, max_hop, and dilation parameters. The get_hop_distance function computes hop distances between nodes in a graph up to max_hop level. The normalize_digraph function calculates and applies row-wise node degrees to normalize the adjacency matrix. 
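For a quick feel of what this graph preprocessing produces, the toy example below builds the adjacency of a three-joint chain 0-1-2 and column-normalizes it by node degree, which mirrors what `normalize_digraph` does in the code quoted below (numpy only, toy skeleton):

```python
import numpy as np

num_node = 3
edges = [(0, 1), (1, 2)]                 # toy skeleton: a simple chain of joints

A = np.zeros((num_node, num_node))
for i, j in edges:
    A[i, j] = A[j, i] = 1                # undirected links between joints

degree = A.sum(axis=0)
Dn = np.diag([1.0 / d if d > 0 else 0.0 for d in degree])
A_norm = A @ Dn                          # each column scaled by 1/degree
print(A_norm)
```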
The constructor initializes an instance of the Graph class with given parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":37-79", + "content": " x = x.reshape((n, c, t, k * v))\n A = A.reshape((k * v, w))\n y = paddle.matmul(x, A)\n return y\ndef get_hop_distance(num_node, edge, max_hop=1):\n A = np.zeros((num_node, num_node))\n for i, j in edge:\n A[j, i] = 1\n A[i, j] = 1\n # compute hop steps\n hop_dis = np.zeros((num_node, num_node)) + np.inf\n transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]\n arrive_mat = (np.stack(transfer_mat) > 0)\n for d in range(max_hop, -1, -1):\n hop_dis[arrive_mat[d]] = d\n return hop_dis\ndef normalize_digraph(A):\n Dl = np.sum(A, 0)\n num_node = A.shape[0]\n Dn = np.zeros((num_node, num_node))\n for i in range(num_node):\n if Dl[i] > 0:\n Dn[i, i] = Dl[i]**(-1)\n AD = np.dot(A, Dn)\n return AD\nclass Graph():\n def __init__(self,\n layout='openpose',\n strategy='uniform',\n max_hop=1,\n dilation=1):\n self.max_hop = max_hop\n self.dilation = dilation\n self.get_edge(layout)" + }, + { + "comment": "The code initializes the hop distance and edge based on the number of nodes, maximum hops, and layout type. It defines self_link as intra-node connections and neighbor_link as inter-node connections. The center node is determined based on the layout.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":80-103", + "content": " self.hop_dis = get_hop_distance(self.num_node,\n self.edge,\n max_hop=max_hop)\n self.get_adjacency(strategy)\n def __str__(self):\n return self.A\n def get_edge(self, layout):\n # edge is a list of [child, parent] paris\n if layout == 'fsd10':\n self.num_node = 25\n self_link = [(i, i) for i in range(self.num_node)]\n neighbor_link = [(1, 8), (0, 1), (15, 0), (17, 15), (16, 0),\n (18, 16), (5, 1), (6, 5), (7, 6), (2, 1), (3, 2),\n (4, 3), (9, 8), (10, 9), (11, 10), (24, 11),\n (22, 11), (23, 22), (12, 8), (13, 12), (14, 13),\n (21, 14), (19, 14), (20, 19)]\n self.edge = self_link + neighbor_link\n self.center = 8\n elif layout == 'ntu-rgb+d':\n self.num_node = 25\n self_link = [(i, i) for i in range(self.num_node)]" + }, + { + "comment": "The code initializes 'self.edge' and 'self.center', defining nodes, self-links, and neighboring node links based on the specified layout ('stgcn' or 'coco_keypoint'). For 'stgcn', there are 25 nodes with various connections, while for 'coco_keypoint', there are 17 nodes with specific connections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":104-119", + "content": " neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),\n (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),\n (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),\n (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),\n (23, 8), (24, 25), (25, 12)]\n neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]\n self.edge = self_link + neighbor_link\n self.center = 21 - 1\n elif layout == 'coco_keypoint':\n self.num_node = 17\n self_link = [(i, i) for i in range(self.num_node)]\n neighbor_1base = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6),\n (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12),\n (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)]\n neighbor_link = [(i, j) for (i, j) in neighbor_1base]\n self.edge = self_link + neighbor_link" + }, + { + "comment": "This function sets the adjacency matrix for STGCN based on the strategy. 
It initializes the adjacency matrix as a zero matrix, then fills it with 1s for valid hops. The adjacency matrix is normalized using `normalize_digraph`. If the strategy is 'spatial', it iterates over each pair of nodes and populates the adjacency matrix accordingly based on their hop distance from the center node and their hop distance to each other.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":120-142", + "content": " self.center = 11\n else:\n raise ValueError(\"Do Not Exist This Layout.\")\n def get_adjacency(self, strategy):\n valid_hop = range(0, self.max_hop + 1, self.dilation)\n adjacency = np.zeros((self.num_node, self.num_node))\n for hop in valid_hop:\n adjacency[self.hop_dis == hop] = 1\n normalize_adjacency = normalize_digraph(adjacency)\n if strategy == 'spatial':\n A = []\n for hop in valid_hop:\n a_root = np.zeros((self.num_node, self.num_node))\n a_close = np.zeros((self.num_node, self.num_node))\n a_further = np.zeros((self.num_node, self.num_node))\n for i in range(self.num_node):\n for j in range(self.num_node):\n if self.hop_dis[j, i] == hop:\n if self.hop_dis[j, self.center] == self.hop_dis[\n i, self.center]:\n a_root[j, i] = normalize_adjacency[j, i]" + }, + { + "comment": "This code implements a ConvTemporalGraphical layer, which is a backbone architecture for STGCN. It initializes the ConvTemporalGraphical layer with input and output channels, kernel size, and temporal parameters. The code handles different strategies to build the adjacency matrix (A) for the graph convolution by considering close and further nodes based on hop distance from the central node. If hop == 0, it appends A to a list, otherwise appends both close and further A matrices. Finally, it stacks the A matrices into a numpy array and assigns it to self.A.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":143-173", + "content": " elif self.hop_dis[j, self.center] > self.hop_dis[\n i, self.center]:\n a_close[j, i] = normalize_adjacency[j, i]\n else:\n a_further[j, i] = normalize_adjacency[j, i]\n if hop == 0:\n A.append(a_root)\n else:\n A.append(a_root + a_close)\n A.append(a_further)\n A = np.stack(A)\n self.A = A\n else:\n raise ValueError(\"Do Not Exist This Strategy\")\nclass ConvTemporalGraphical(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n t_kernel_size=1,\n t_stride=1,\n t_padding=0,\n t_dilation=1):\n super().__init__()\n self.kernel_size = kernel_size\n self.conv = nn.Conv2D(in_channels,\n out_channels * kernel_size," + }, + { + "comment": "The code defines a ConvTemporalGraphical layer and an STGCNBlock. The ConvTemporalGraphical layer is a 2D convolutional layer with temporal kernel size, padding, stride, and dilation. The STGCNBlock is a residual block that takes input and output channels, temporal kernel size, stride, and dropout as inputs. 
It initializes the GCN layer and TCN layers sequentially, with the GCN layer performing temporal graph convolution and the TCN layers performing temporal convolutions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":174-208", + "content": " kernel_size=(t_kernel_size, 1),\n padding=(t_padding, 0),\n stride=(t_stride, 1),\n dilation=(t_dilation, 1))\n def forward(self, x, A):\n assert A.shape[0] == self.kernel_size\n x = self.conv(x)\n n, kc, t, v = x.shape\n x = x.reshape((n, self.kernel_size, kc // self.kernel_size, t, v))\n x = einsum(x, A)\n return x, A\nclass st_gcn_block(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n dropout=0,\n residual=True):\n super(st_gcn_block, self).__init__()\n assert len(kernel_size) == 2\n assert kernel_size[0] % 2 == 1\n padding = ((kernel_size[0] - 1) // 2, 0)\n self.gcn = ConvTemporalGraphical(in_channels, out_channels,\n kernel_size[1])\n self.tcn = nn.Sequential(" + }, + { + "comment": "This code defines a STGCN (Spatio-Temporal Graph Convolutional Network) model. It includes layers such as BatchNormalization, ReLU activation, and convolution operations for processing spatial and temporal data. The forward method applies these operations to input features x and adjacency matrix A.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":209-250", + "content": " nn.BatchNorm2D(out_channels),\n nn.ReLU(),\n nn.Conv2D(\n out_channels,\n out_channels,\n (kernel_size[0], 1),\n (stride, 1),\n padding,\n ),\n nn.BatchNorm2D(out_channels),\n nn.Dropout(dropout),\n )\n if not residual:\n self.residual = zero\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = iden\n else:\n self.residual = nn.Sequential(\n nn.Conv2D(in_channels,\n out_channels,\n kernel_size=1,\n stride=(stride, 1)),\n nn.BatchNorm2D(out_channels),\n )\n self.relu = nn.ReLU()\n def forward(self, x, A):\n res = self.residual(x)\n x, A = self.gcn(x, A)\n x = self.tcn(x) + res\n return self.relu(x), A\n@BACKBONES.register()\nclass STGCN(nn.Layer):\n \"\"\"\n ST-GCN model from:\n " + }, + { + "comment": "This code defines the STGCN (Spatial Temporal Graph Convolutional Networks) class, which is a model for skeleton-based action recognition. It takes arguments like in_channels, edge_importance_weighting, and data_bn to determine the network configuration. It loads graph data and builds networks with specific kernel sizes for spatial and temporal dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":250-277", + "content": " `\"Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition\" `_\n Args:\n in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2.\n edge_importance_weighting: bool, whether to use edge attention. Default True.\n data_bn: bool, whether to use data BatchNorm. 
Default True.\n \"\"\"\n def __init__(self,\n in_channels=2,\n edge_importance_weighting=True,\n data_bn=True,\n layout='fsd10',\n strategy='spatial',\n **kwargs):\n super(STGCN, self).__init__()\n self.data_bn = data_bn\n # load graph\n self.graph = Graph(\n layout=layout,\n strategy=strategy,\n )\n A = paddle.to_tensor(self.graph.A, dtype='float32')\n self.register_buffer('A', A)\n # build networks\n spatial_kernel_size = A.shape[0]\n temporal_kernel_size = 9\n kernel_size = (temporal_kernel_size, spatial_kernel_size)" + }, + { + "comment": "This code initializes a series of ST-GCN blocks with different configurations for the ST-GCN backbone, including batch normalization and specific layer dimensions. These blocks are stored in a LayerList for flexibility and efficient computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":278-299", + "content": " self.data_bn = nn.BatchNorm1D(in_channels *\n A.shape[1]) if self.data_bn else iden\n kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'}\n self.st_gcn_networks = nn.LayerList((\n st_gcn_block(in_channels,\n 64,\n kernel_size,\n 1,\n residual=False,\n **kwargs0),\n st_gcn_block(64, 64, kernel_size, 1, **kwargs),\n st_gcn_block(64, 64, kernel_size, 1, **kwargs),\n st_gcn_block(64, 64, kernel_size, 1, **kwargs),\n st_gcn_block(64, 128, kernel_size, 2, **kwargs),\n st_gcn_block(128, 128, kernel_size, 1, **kwargs),\n st_gcn_block(128, 128, kernel_size, 1, **kwargs),\n st_gcn_block(128, 256, kernel_size, 2, **kwargs),\n st_gcn_block(256, 256, kernel_size, 1, **kwargs),\n st_gcn_block(256, 256, kernel_size, 1, **kwargs),\n ))\n # initialize parameters for edge importance weighting" + }, + { + "comment": "Code creates edge importance parameters if edge_importance_weighting is True, otherwise sets all edge importances to 1. Initializes weights for convolutional layers and batch normalization layers with specified means and standard deviations. The forward function transposes the input tensor shape before processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":300-326", + "content": " if edge_importance_weighting:\n self.edge_importance = nn.ParameterList([\n self.create_parameter(\n shape=self.A.shape,\n default_initializer=nn.initializer.Constant(1))\n for i in self.st_gcn_networks\n ])\n else:\n self.edge_importance = [1] * len(self.st_gcn_networks)\n self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1))\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'Normal', mean=0.0, std=0.02)\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Normal', mean=1.0, std=0.02)\n elif isinstance(layer, nn.BatchNorm1D):\n weight_init_(layer, 'Normal', mean=1.0, std=0.02)\n def forward(self, x):\n # data normalization\n N, C, T, V, M = x.shape\n x = x.transpose((0, 4, 3, 1, 2)) # N, M, V, C, T" + }, + { + "comment": "This code reshapes the input tensor and applies batch normalization before reshaping again. It then transposes the dimensions and reshapes once more. The main operation involves iterating through each ST-GCN network and applying it to the input with multiplied edge importance, followed by pooling. 
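The reshaping described in this forward pass is easiest to follow as pure shape bookkeeping. The numpy sketch below tracks only the tensor shapes (toy sizes, no actual graph convolution), matching the N, C, T, V, M layout used by the model:

```python
import numpy as np

N, C, T, V, M = 2, 2, 4, 25, 1          # batch, channels, frames, joints, persons
x = np.zeros((N, C, T, V, M))

x = x.transpose(0, 4, 3, 1, 2)          # N, M, V, C, T
x = x.reshape(N * M, V * C, T)          # flattened for the 1D BatchNorm
x = x.reshape(N, M, V, C, T).transpose(0, 1, 3, 4, 2).reshape(N * M, C, T, V)
print(x.shape)                          # (2, 2, 4, 25): per-person clips
```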
Finally, it reshapes the output, performs averaging over the third dimension, and returns the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/stgcn.py\":327-342", + "content": " x = x.reshape((N * M, V * C, T))\n if self.data_bn:\n x.stop_gradient = False\n x = self.data_bn(x)\n x = x.reshape((N, M, V, C, T))\n x = x.transpose((0, 1, 3, 4, 2)) # N, M, C, T, V\n x = x.reshape((N * M, C, T, V))\n # forward\n for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):\n x, _ = gcn(x, paddle.multiply(self.A, importance))\n x = self.pool(x) # NM,C,T,V --> NM,C,1,1\n C = x.shape[1]\n x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1172e92b-4c2a-47c0-a6fa-962a124fb2c5.json b/docs/doc/1172e92b-4c2a-47c0-a6fa-962a124fb2c5.json new file mode 100644 index 000000000..eb46f3cfc --- /dev/null +++ b/docs/doc/1172e92b-4c2a-47c0-a6fa-962a124fb2c5.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is importing the \"Registry\" class from the \"utils\" module and initializing a new instance called \"METRIC\" that will store different types of metrics. The comment indicates it is part of the PaddleVideo library for Video Quality Assessment, licensed under the Apache License 2.0.", + "details": [ + { + "comment": "This code is importing the \"Registry\" class from the \"utils\" module and initializing a new instance called \"METRIC\" that will store different types of metrics. The comment indicates it is part of the PaddleVideo library for Video Quality Assessment, licensed under the Apache License 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py\":0-18", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom ..utils import Registry\nMETRIC = Registry('metric')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1189c1da-bdb9-4251-8e31-01e1996c3964.json b/docs/doc/1189c1da-bdb9-4251-8e31-01e1996c3964.json new file mode 100644 index 000000000..3a40f371b --- /dev/null +++ b/docs/doc/1189c1da-bdb9-4251-8e31-01e1996c3964.json @@ -0,0 +1,20 @@ +{ + "summary": "This PaddleVideo code performs video action detection, using SlowFast+FasterRCNN model for abnormal behavior detection, with data preparation, training, evaluation, inference, and deployment. 
The code exports, deploys static models, and provides deployment instructions.", + "details": [ + { + "comment": "This code is for abnormal behavior detection using the PaddleVideo framework with SlowFast+FasterRCNN model, consisting of 6 steps: data preparation (sparse frame extraction, target detection, generating pkl files), model training, evaluation, inference, and deployment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/AbnormalActionDetection/README.md\":0-39", + "content": "# \u5f02\u5e38\u884c\u4e3a\u8bc6\u522b\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u8bc4\u4f30](#\u6a21\u578b\u8bc4\u4f30)\n- [\u6a21\u578b\u63a8\u7406](#\u6a21\u578b\u63a8\u7406)\n- [\u6a21\u578b\u90e8\u7f72](#\u6a21\u578b\u90e8\u7f72)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n## \u6a21\u578b\u7b80\u4ecb\n\u8be5\u4ee3\u7801\u5e93\u7528\u4e8e\u5f02\u5e38\u884c\u4e3a\u68c0\u6d4b, \u57fa\u4e8epaddle2.2\u7248\u672c\u5f00\u53d1\uff0c\u7ed3\u5408PaddleVideo\u4e2d\u7684SlowFast+FasterRCNN\u6a21\u578b\u5b9e\u73b07\u4e2a\u5f02\u5e38\u884c\u4e3a\u7684\u68c0\u6d4b\u3002\n\u4e3b\u8981\u6846\u67b6\u5982\u4e0b\uff1a\n
\nAIStudio\u9879\u76ee: [\u57fa\u4e8e\u65f6\u7a7a\u4fe1\u606f\u7684\u5f02\u5e38\u884c\u4e3a\u68c0\u6d4b](https://aistudio.baidu.com/aistudio/projectdetail/3431613)\n## \u6570\u636e\u51c6\u5907\n### Step1 \u7a00\u758f\u62bd\u53d6\u89c6\u9891\u5e27\n\u9996\u5148\u7a00\u758f\u62bd\u53d6\u89c6\u9891\u5e27\u7528\u4e8e\u68c0\u6d4b\u6bcf\u5e27\u4e2d\u4eba\u7684\u4f4d\u7f6e\uff1a\n```\ncd data/ava/script && bash extract_video_frames.sh abnormal_action_videos abnormal_action_frames 2\n```\n* \u7b2c\u4e00\u4e2a\u53c2\u6570abnormal_action_videos\uff1a\u88ab\u62bd\u5e27\u7684\u89c6\u9891\u6839\u76ee\u5f55\uff1b\n* \u7b2c\u4e8c\u4e2a\u53c2\u6570abnormal_action_frames\uff1a\u62bd\u53d6\u7684\u89c6\u9891\u5e27\u5b58\u653e\u76ee\u5f55\uff1b\n* \u7b2c\u4e09\u4e2a\u53c2\u65702\uff1a\u62bd\u5e27\u5e27\u7387\u3002\n### Step2 \u76ee\u6807\u68c0\u6d4b\n\u7528\u6210\u719f\u7684\u53ef\u68c0\u6d4b\u4eba\u7684\u76ee\u6807\u68c0\u6d4b\u6a21\u578b\u68c0\u6d4b\u4e0a\u8ff0\u6b65\u9aa4\u62bd\u5f97\u7684\u89c6\u9891\u5e27\u4e2d\u7684\u4eba\u3002\u5982PaddleDetection\u5957\u4ef6\u4e2d\u7684\u57fa\u4e8ecoco\u6570\u636e\u96c6\u8bad\u7ec3\u5f97\u5230\u7684[PP-YOLOv2](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/ppyolo)\u6a21\u578b\u3002\n### Step3 \u751f\u6210pkl\u6587\u4ef6\n\u5c06\u4e0a\u8ff0\u6b65\u9aa4\u5f97\u5230\u7684\u6bcf\u4e2a\u89c6\u9891\u5e27\u7684\u68c0\u6d4b\u7ed3\u679c\u8fdb\u884c\u8f6c\u5316\uff0c\u5f97\u5230SlowFast_FasterRCNN\u6a21\u578b\u9700\u8981\u7684\u8f93\u5165\u683c\u5f0f\u3002\u6ce8\u610f\u6211\u4eec\u53ea\u9700\u8981\u4eba\u7684\u68c0\u6d4b\u7ed3\u679c\uff0c\u5176\u4ed6\u76ee\u6807\u4e0d\u9700\u8981\u3002\nSlowFast_FasterRCNN\u6a21\u578b\u9700\u8981\u7684proposals\u662fpkl\u683c\u5f0f\u6587\u4ef6\uff0c\u8be5\u6587\u4ef6\u4ee5\u5b57\u5178\u5f62\u5f0f\u5b58\u50a8\u68c0\u6d4b\u7ed3\u679c\uff0c\u5b57\u5178\u7684key\u662f\u89c6\u9891\u5e27\u7684\u7d22\u5f15\uff08video_id+frame_id\u62fc\u63a5\u5f97\u5230\uff09\uff0cvalue\u662f\u4e00\u4e2alist\uff0c\u6bcf\u4e2a\u5143\u7d20\u662f\u68c0\u6d4b\u5f97\u5230\u7684\u4eba\u7684\u4f4d\u7f6e\u4fe1\u606f\u548c\u7f6e\u4fe1\u5ea6\u3002" + }, + { + "comment": "Step 4: Extracts video frames at a rate of 30fps for SlowFast_FasterRCNN input.\nStep 5: Stores label data as pbtxt files with action IDs starting from 1.\nModel training using pre-trained AVA model and config file.\nModel evaluation on abnormal action detection.\nModel inference using dynamic graph execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/AbnormalActionDetection/README.md\":41-113", + "content": "```\n{\n \u6253\u67b6,0001:\n [[0.036 0.098 0.55 0.979 0.995518] # x1,y1,x2,y2,score\n [0.443 0.04 0.99 0.989 0.977824]]\n}\n```\n### Step4 \u5bc6\u96c6\u62bd\u53d6\u89c6\u9891\u5e27\n\u5bf9\u89c6\u9891\u6570\u636e\u8fdb\u884c\u5bc6\u96c6\u62bd\u5e27\u3002\nSlowFast_FasterRCNN\u8f93\u5165\u7684\u89c6\u9891\u5e27\u662f\u5bc6\u96c6\u5e27\uff0c\u56e0\u6b64\u9700\u8981\u518d\u6b21\u5bf9\u89c6\u9891\u8fdb\u884c\u62bd\u5e27\u3002\u5177\u4f53\u547d\u4ee4\u5982\u4e0b\uff1a\n```\ncd data/ava/script && bash extract_video_frames.sh abnormal_action_videos abnormal_action_frames_30fps 30\n```\n\u5177\u4f53\u53c2\u6570\u540c\u6b65\u9aa41\uff0c\u53ea\u4e0d\u8fc7\u6b21\u6570\u62bd\u5e27\u7387\u4e3a30fps\u3002\n### Step5 \u51c6\u5907\u6807\u7b7e\u6570\u636e\n\u6807\u7b7e\u6570\u636e\u4ee5pbtxt\u6587\u4ef6\u4e2a\u6570\u5b58\u50a8\uff0c\u672c\u6848\u4f8b\u5177\u4f53\u5982\u4e0b\uff08\u6ce8\u610f\u884c\u4e3a\u6807\u7b7eid\u4ece1\u5f00\u59cb\uff09\uff1a\n```\nitem {\n 
name: \"\u6325\u68cd\"\n id: 1\n}\nitem {\n name: \"\u6253\u67b6\"\n id: 2\n}\nitem {\n name: \"\u8e22\u4e1c\u897f\"\n id: 3\n}\nitem {\n name: \"\u8ffd\u9010\"\n id: 4\n}\nitem {\n name: \"\u4e89\u5435\"\n id: 5\n}\nitem {\n name: \"\u5feb\u901f\u5954\u8dd1\"\n id: 6\n}\nitem {\n name: \"\u6454\u5012\"\n id: 7\n}\n```\n## \u6a21\u578b\u8bad\u7ec3\n\u5f02\u5e38\u884c\u4e3a\u68c0\u6d4b\u6a21\u578b\u57fa\u4e8e\u5728AVA\u6570\u636e\u96c6\u4e0a\u8bad\u7ec3\u5f97\u5230\u6a21\u578b\u8fdb\u884c\u8fc1\u79fb\u5b66\u4e60\u3002\u5177\u4f53\u8bad\u7ec3\u547d\u4ee4\u5982\u4e0b\uff1a\n```\npython main.py --validate -w AVA_SlowFast_FastRcnn_best.pdparams \\\n -c configs/abnoraml_action.yaml\n```\n - w \u9884\u8bad\u7ec3\u6a21\u578b\u8def\u5f84\n - c \u914d\u7f6e\u6587\u4ef6\u8def\u5f84\n## \u6a21\u578b\u8bc4\u4f30\n```\npython main.py --test \\\n -w abnormal_action_SlowFast_FastRcnn.pdparams \\\n -c configs/abnoraml_action.yaml\n```\n## \u6a21\u578b\u63a8\u7406\n\u57fa\u4e8e\u52a8\u6001\u56fe\u7684\u63a8\u7406\uff1a\n```\npython tools/ava_predict.py \\\n -c configs/abnoraml_action.yaml \\\n -w abnormal_action_SlowFast_FastRcnn.pdparams \\" + }, + { + "comment": "This code is for video action detection using PaddleVideo. It exports a static model, converts dynamic to static model, performs inference, and deploys the model. The parameters include video path, detection model name, and weights path. Deployment instructions are provided with a reference to a relevant paper.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/AbnormalActionDetection/README.md\":114-152", + "content": " --video_path data/wave_9.mp4 \\\n --detection_model_name 'faster_rcnn/faster_rcnn_r50_fpn_1x_coco' \\\n --detection_model_weights 'faster_rcnn_r50_fpn_1x_coco.pdparams'\n```\n- video_path \u89c6\u9891\u8def\u5f84\n- detection_model_name \u68c0\u6d4b\u6a21\u578b\u540d\u79f0\n- detection_model_weights \u68c0\u6d4b\u6a21\u578b\u6743\u91cd\u8def\u5f84\n\u57fa\u4e8e\u9759\u6001\u56fe\u6a21\u578b\u8fdb\u884c\u63a8\u7406\uff1a\n\u5bfc\u51fa\u6a21\u578b\uff0c\u52a8\u6001\u56fe\u6a21\u578b\u8f6c\u6362\u4e3a\u9759\u6001\u56fe\u6a21\u578b\uff1a\n```\npython tools/export_model.py \\\n -c configs/abnoraml_action.yaml \\\n -o inference_output \\\n -p abnormal_action_SlowFast_FastRcnn.pdparams\n```\n- o \u5bfc\u51fa\u6a21\u578b\u5b58\u653e\u6587\u4ef6\u5939\n- p \u88ab\u5bfc\u51fa\u6a21\u578b\u8def\u5f84\n\u57fa\u4e8e\u5bfc\u51fa\u7684\u6a21\u578b\u505a\u63a8\u7406\uff1a\n```\npython tools/predict.py \\\n -c configs/abnoraml_action.yaml \\\n --input_file \"data/wave_9.mp4\" \\\n --model_file \"inference_output/abnormal_action_SlowFast_FastRcnn.pdmodel\" \\\n --params_file \"inference_output/abnormal_action_SlowFast_FastRcnn.pdiparams\" \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n## \u6a21\u578b\u90e8\u7f72\n\u8bf7\u53c2\u8003[Paddle Inference\u793a\u4f8b](https://paddle-inference.readthedocs.io/en/latest/quick_start/python_demo.html)\n## \u53c2\u8003\u8bba\u6587\n- [SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf), Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, Kaiming He" + } + ] +} \ No newline at end of file diff --git a/docs/doc/11972a29-3bec-4821-8499-f55a5df7b0d4.json b/docs/doc/11972a29-3bec-4821-8499-f55a5df7b0d4.json new file mode 100644 index 000000000..df431ec30 --- /dev/null +++ b/docs/doc/11972a29-3bec-4821-8499-f55a5df7b0d4.json @@ -0,0 +1,30 @@ +{ + "summary": "The SlowFast model, designed for video recognition, utilizes a Multigrid training strategy to speed up 
training and provides English documentation. It offers testing instructions using PaddleVideo with GPU usage details, retrieves class name from ID, predicts top1 result for \"example.avi\", and is explained in detail in the reference paper.", + "details": [ + { + "comment": "This code is the English version of SlowFast model documentation from PaddleVideo's model_zoo. It introduces SlowFast, a video recognition model that combines low and high frame rates for spatial semantic and motion information capture. The training script and data preparation are provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/slowfast.md\":0-37", + "content": "[\u7b80\u4f53\u4e2d\u6587 ](../../../zh-CN/model_zoo/recognition/slowfast.md) | English\n# SlowFast\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nSlowFast involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast path-way, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition.\n

\n[figure] SlowFast Overview\n
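As a concrete illustration of the two-pathway idea in the introduction quoted above, the toy sketch below samples one decoded clip at two temporal rates. The stride ratio `alpha=8` and the frame counts are typical SlowFast settings used only for illustration; they are not read from `slowfast.yaml`.

```python
# Toy illustration of dual-rate sampling: the Fast pathway sees alpha times
# more frames than the Slow pathway. Values are assumptions for illustration.
import numpy as np

def slowfast_sample(clip, alpha=8, slow_frames=4):
    # clip: (T, H, W, C) decoded video frames
    num_frames = clip.shape[0]
    fast_idx = np.linspace(0, num_frames - 1, slow_frames * alpha).astype(int)  # high frame rate
    slow_idx = fast_idx[::alpha]  # low-frame-rate subset of the same clip
    return clip[slow_idx], clip[fast_idx]

clip = np.zeros((64, 224, 224, 3), dtype=np.float32)
slow, fast = slowfast_sample(clip)
print(slow.shape, fast.shape)  # (4, 224, 224, 3) (32, 224, 224, 3)
```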

\n## Data\nWe use Kinetics-400 to train this model\uff0cdata preparation please refer to [Kinetics-400 dataset](../../dataset/k400.md).\n## Train\nYou can start training by\uff1a\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml" + }, + { + "comment": "This code implements Multigrid training strategy to speed up SlowFast model training, which is time-consuming. The provided training script and performance evaluation show that using the multigrid method reduces the training time by 2.89x compared to normal training. For more details, refer to the accelerate documentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/slowfast.md\":38-57", + "content": "```\n- Training would be efficent using our code. The training speed is 2x faster than the original implementation. Details can refer to [benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/en/benchmark.md).\n### Speed up training\nIt's time consuming to train SlowFast model. So we implement [Multigrid training stragety](https://arxiv.org/abs/1912.00998) to speed up training. Training script:\n```bash\npython -B -m paddle.distributed.launch --selected_gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml\n```\nPerformance evaluation:\n| training stragety | time cost of one epoch/min | total training time/min | speed-up |\n| :------ | :-----: | :------: |:------: |\n| Multigrid | 27.25 | 9758 (6.7 days) | 2.89x |\n| Normal | 78.76 | 15438 (10.7days) | base |\nFor more details, please refer to [accelerate doc](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/tutorials/accelerate.md#%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5%E5%8A%A0%E9%80%9F)." + }, + { + "comment": "This code provides instructions for testing the SlowFast model in PaddleVideo. It uses the distributed launch command to run on multiple GPUs, specifying the log directory and the model configuration file slowfast.yaml. The test accuracy for two configurations is also shown, with a note that Acc1 may be lower due to missing data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/slowfast.md\":60-78", + "content": "## Test\nYou can start testing by\uff1a\n```bash\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_slowfast_test main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams\n```\n- Args `-w` is used to specifiy the model path\uff0cyou can download our model in [SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams).\nTest accuracy in Kinetics-400:\n| Configs | Acc1 | Acc5 | Weights |\n| :---: | :---: | :---: | :---: |\n| [slowfast.yaml](../../../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 91.33 | [slowfast_4x16.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams) |\n| [slowfast_multigrid.yaml](../../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | 75.84 | 92.33 | [slowfast_8x8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) |\n- Acc1 may be lower than that released in papaer, as ~5% data of kinetics-400 is missing. 
Experiments have verified that if training with the same data, we can get the same accuracy." + }, + { + "comment": "This code provides instructions for exporting and using the SlowFast model in PaddleVideo. The first command generates the architecture file (SlowFast.pdmodel) and parameter file (SlowFast.pdiparams). The second command demonstrates how to run inference with these files on an input video, specifying the model configuration and enabling GPU usage if available. It outputs the top-1 class and score for the predicted results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/slowfast.md\":81-111", + "content": "## Inference\n### export inference model\n To get model architecture file `SlowFast.pdmodel` and parameters file `SlowFast.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml \\\n -p data/SlowFast.pdparams \\\n -o inference/SlowFast\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/slowfast/slowfast.yaml \\\n --model_file inference/SlowFast/SlowFast.pdmodel \\\n --params_file inference/SlowFast/SlowFast.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 1.0" + }, + { + "comment": "This code retrieves the class name from a given class ID using a map file and predicts the top1 result for a video named \"example.avi\". The reference provided is related to the SlowFast Networks for Video Recognition paper, which likely explains how this functionality works in detail.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/slowfast.md\":112-119", + "content": "```\nwe can get the class name using class id and map file `data/k400/Kinetics-400_label_list.txt`. The top1 prediction of `data/example.avi` is `archery`.\n## Reference\n- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al." + } + ] +} \ No newline at end of file diff --git a/docs/doc/1242c244-2702-4893-9d20-f2b36989dc42.json b/docs/doc/1242c244-2702-4893-9d20-f2b36989dc42.json new file mode 100644 index 000000000..79a0805eb --- /dev/null +++ b/docs/doc/1242c244-2702-4893-9d20-f2b36989dc42.json @@ -0,0 +1,20 @@ +{ + "summary": "The script prepares PaddleVideo's UR FALL dataset keypoints, normalizing them and handling inconsistencies for training. It also prepares a dataset for PPHuman, reading annotations, extracting data, and saving for training.", + "details": [ + { + "comment": "This script converts keypoint results of UR FALL dataset into a format suitable for training by PaddleVideo. It normalizes keypoints using bounding boxes and adjusts the shape to be compatible with the PaddleVideo framework. 
The function also handles cases where the number of frames is more or less than 100.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/datasets/prepare_dataset.py\":0-33", + "content": "import os\nimport json\nimport numpy as np\nimport pickle\n\"\"\"\n This python script is used to convert keypoint results of UR FALL dataset\n for training by PaddleVideo\n\"\"\"\ndef self_norm(kpt, bbox):\n # kpt: (2, T, 17, 1), bbox: (T, 4)\n tl = bbox[:, 0:2]\n wh = bbox[:, 2:]\n tl = np.expand_dims(np.transpose(tl, (1, 0)), (2, 3))\n wh = np.expand_dims(np.transpose(wh, (1, 0)), (2, 3))\n res = (kpt - tl) / wh\n res *= np.expand_dims(np.array([[384.], [512.]]), (2, 3))\n return res\ndef convert_to_ppvideo(all_kpts, all_scores, all_bbox):\n # shape of all_kpts is (T, 17, 2)\n keypoint = np.expand_dims(np.transpose(all_kpts, [2, 0, 1]),\n -1) #(2, T, 17, 1)\n keypoint = self_norm(keypoint, all_bbox)\n scores = all_scores\n if keypoint.shape[1] > 100:\n frame_start = (keypoint.shape[1] - 100) // 2\n keypoint = keypoint[:, frame_start:frame_start + 100:2, :, :]\n scores = all_scores[frame_start:frame_start + 100:2, :, :]\n elif keypoint.shape[1] < 100:" + }, + { + "comment": "The function `prepare_dataset` receives keypoint and scores as inputs. If the length of either is not divisible by 2, it pads them with zeros to maintain consistency. The else block simply takes every other value in both arrays. The `decode_json_path` function loads a JSON file, sorts its contents, extracts bounding boxes, keypoints, and scores from each entry, ignoring cases where there is more than one bounding box, and appends the processed data to separate lists for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/datasets/prepare_dataset.py\":34-68", + "content": " keypoint = np.concatenate([\n keypoint,\n np.zeros((2, 100 - keypoint.shape[1], 17, 1), dtype=keypoint.dtype)\n ], 1)[:, ::2, :, :]\n scores = np.concatenate([\n all_scores,\n np.zeros((100 - all_scores.shape[0], 17, 1), dtype=keypoint.dtype)\n ], 0)[::2, :, :]\n else:\n keypoint = keypoint[:, ::2, :, :]\n scores = scores[::2, :, :]\n return keypoint, scores\ndef decode_json_path(json_path):\n content = json.load(open(json_path))\n content = sorted(content, key=lambda x: x[0])\n all_kpts = []\n all_score = []\n all_bbox = []\n for annos in content:\n bboxes = annos[1]\n kpts = annos[2][0]\n frame_id = annos[0]\n if len(bboxes) != 1:\n continue\n kpt_res = []\n kpt_score = []\n for kpt in kpts[0]:\n x, y, score = kpt\n kpt_res.append([x, y])\n kpt_score.append([score])\n all_kpts.append(np.array(kpt_res))\n all_score.append(np.array(kpt_score))" + }, + { + "comment": "This code prepares a dataset for PaddleVideo's PPHuman application. 
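The temporal handling described above (centre-crop clips longer than 100 frames, zero-pad shorter ones, then keep every second frame) can be sketched in isolation as follows; the helper name is hypothetical, and shapes follow the `(2, T, 17, 1)` keypoint layout produced by `convert_to_ppvideo`.

```python
# Standalone sketch of the 100-frame resampling logic described above; the
# result always has 50 frames. Illustrative only, not the PaddleVideo helper.
import numpy as np

def resample_keypoints(keypoint, target=100):
    # keypoint: (2, T, 17, 1)
    num_frames = keypoint.shape[1]
    if num_frames > target:
        start = (num_frames - target) // 2            # centre crop, then stride 2
        out = keypoint[:, start:start + target:2]
    elif num_frames < target:
        pad = np.zeros((2, target - num_frames, 17, 1), dtype=keypoint.dtype)
        out = np.concatenate([keypoint, pad], axis=1)[:, ::2]  # pad, then stride 2
    else:
        out = keypoint[:, ::2]
    return out

print(resample_keypoints(np.zeros((2, 137, 17, 1))).shape)  # (2, 50, 17, 1)
print(resample_keypoints(np.zeros((2, 60, 17, 1))).shape)   # (2, 50, 17, 1)
```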
It reads annotations from \"annotations\" folder, extracts keypoints, labels, and scores, then saves them into numpy arrays and pickle file for training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/datasets/prepare_dataset.py\":69-97", + "content": " all_bbox.append([\n bboxes[0][0], bboxes[0][1], bboxes[0][2] - bboxes[0][0],\n bboxes[0][3] - bboxes[0][1]\n ])\n all_kpts_np = np.array(all_kpts)\n all_score_np = np.array(all_score)\n all_bbox_np = np.array(all_bbox)\n video_anno, scores = convert_to_ppvideo(all_kpts_np, all_score_np,\n all_bbox_np)\n return video_anno, scores\nif __name__ == '__main__':\n all_keypoints = []\n all_labels = [[], []]\n all_scores = []\n for i, path in enumerate(os.listdir(\"annotations\")):\n video_anno, score = decode_json_path(os.path.join(\"annotations\", path))\n all_keypoints.append(video_anno)\n all_labels[0].append(str(i))\n all_labels[1].append(0) #label 0 means falling\n all_scores.append(score)\n all_data = np.stack(all_keypoints, 0)\n all_score_data = np.stack(all_scores, 0)\n np.save(f\"train_data.npy\", all_data)\n pickle.dump(all_labels, open(f\"train_label.pkl\", \"wb\"))\n np.save(\"kptscore_data.npy\", all_score_data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/124984cd-1046-4edd-b77a-75171501d444.json b/docs/doc/124984cd-1046-4edd-b77a-75171501d444.json new file mode 100644 index 000000000..6315f3e67 --- /dev/null +++ b/docs/doc/124984cd-1046-4edd-b77a-75171501d444.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file is an __init__.py for EIVideo application, authored by Acer Zhang on Jan 6th. It sets root path and defines constants for temporary image and JSON file paths. The join_root_path function helps construct full paths from given partial paths.", + "details": [ + { + "comment": "This code file is an __init__.py for EIVideo application, authored by Acer Zhang on Jan 6th. It sets root path and defines constants for temporary image and JSON file paths. The join_root_path function helps construct full paths from given partial paths.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/__init__.py\":0-15", + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport os\nfrom EIVideo.version import __version__\nEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__))\nTEMP_IMG_SAVE_PATH = \"./temp.png\"\nTEMP_JSON_SAVE_PATH = \"./save.json\"\nTEMP_JSON_FINAL_PATH = \"./final.json\"\ndef join_root_path(path: str):\n return os.path.join(EI_VIDEO_ROOT, path)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/127e4c33-f150-44d4-b70f-14f9e61c18cc.json b/docs/doc/127e4c33-f150-44d4-b70f-14f9e61c18cc.json new file mode 100644 index 000000000..9b22fb7a2 --- /dev/null +++ b/docs/doc/127e4c33-f150-44d4-b70f-14f9e61c18cc.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a Paddle's nn.Layer Decoder class with convolutional layers, BatchNorm, and ReLU activation functions for Manet architecture decoding. It imports the 'zero_' function to initialize all model biases to 0.", + "details": [ + { + "comment": "This code defines a Decoder class using Paddle's nn.Layer, which takes in the number of classes and backbone type as parameters. It initializes the convolutional layers for feature extraction, BatchNorm layers for normalization, and ReLU activation functions. 
The last_conv sequence contains multiple Conv2D, BatchNorm, and ReLU layers for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py\":0-29", + "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_\nclass Decoder(nn.Layer):\n def __init__(self, num_classes, backbone, BatchNorm):\n super(Decoder, self).__init__()\n if backbone == 'resnet' or backbone == 'drn' or backbone == 'resnet_edge':\n low_level_inplanes = 256\n elif backbone == 'xception':\n low_level_inplanes = 128\n elif backbone == 'mobilenet':\n low_level_inplanes = 24\n else:\n raise NotImplementedError\n self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False)\n self.bn1 = BatchNorm(48)\n self.relu = nn.ReLU(True)\n self.last_conv = nn.Sequential(\n nn.Conv2D(304,\n 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential(),\n nn.Conv2D(256," + }, + { + "comment": "This code defines a decoder block for the Manet architecture. It includes a 2D convolution layer, batch normalization, and ReLU activation. The forward function performs interpolation on input feature maps and concatenates them with low-level features before passing through a final convolution. The _init_weight function initializes the weights of the block using Kaiming initialization for convolutions and fills batch norm with a constant value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py\":30-58", + "content": " 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential())\n self._init_weight()\n def forward(self, x, low_level_feat):\n low_level_feat = self.conv1(low_level_feat)\n low_level_feat = self.bn1(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x, low_level_feat), axis=1)\n x = self.last_conv(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from EIVideo.paddlevideo.utils.manet_utils import fill_\n fill_(m.weight, 1)" + }, + { + "comment": "This code imports the function 'zero_' from EIVideo.paddlevideo.utils.manet_utils and then defines a build_decoder function that returns an instance of Decoder class with provided parameters (num_classes, backbone, BatchNorm). 
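The fusion step of the Decoder's forward pass described above (a 1x1 reduction of the low-level features, bilinear upsampling of the high-level features, then channel concatenation) can be sketched minimally as below; the channel counts are illustrative, not the exact Ma-Net configuration.

```python
# Minimal sketch of the decoder fusion pattern: reduce low-level features with
# a 1x1 conv, upsample the high-level map to match, and concatenate channels.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class TinyDecoderFusion(nn.Layer):
    def __init__(self, low_ch=256, reduced=48):
        super().__init__()
        self.reduce = nn.Conv2D(low_ch, reduced, 1, bias_attr=False)
        self.bn = nn.BatchNorm2D(reduced)
        self.relu = nn.ReLU()

    def forward(self, high, low):
        low = self.relu(self.bn(self.reduce(low)))
        high = F.interpolate(high, size=low.shape[2:], mode='bilinear',
                             align_corners=True)
        return paddle.concat([high, low], axis=1)

fusion = TinyDecoderFusion()
out = fusion(paddle.randn([1, 256, 16, 16]), paddle.randn([1, 256, 64, 64]))
print(out.shape)  # [1, 304, 64, 64], then fed into the refinement convs
```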
The zero_(m.bias) line initializes all the bias in the model (m) to 0 using the imported 'zero_' function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py\":59-64", + "content": " from EIVideo.paddlevideo.utils.manet_utils import zero_\n zero_(m.bias)\ndef build_decoder(num_classes, backbone, BatchNorm):\n return Decoder(num_classes, backbone, BatchNorm)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/12eb8ee3-6d8e-4bf8-aa1b-1a6465f46372.json b/docs/doc/12eb8ee3-6d8e-4bf8-aa1b-1a6465f46372.json new file mode 100644 index 000000000..6787541df --- /dev/null +++ b/docs/doc/12eb8ee3-6d8e-4bf8-aa1b-1a6465f46372.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is from PaddleVideo's EIVideo module and includes util functions for distributed computing. It defines a function get_dist_info() to retrieve the current rank and world size, and main_only() is a decorator that only runs the wrapped function if the rank is 0 (used in distributed environments).", + "details": [ + { + "comment": "This code is from PaddleVideo's EIVideo module and includes util functions for distributed computing. It defines a function get_dist_info() to retrieve the current rank and world size, and main_only() is a decorator that only runs the wrapped function if the rank is 0 (used in distributed environments).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport functools\nimport paddle\nimport paddle.distributed as dist\ndef get_dist_info():\n world_size = dist.get_world_size()\n rank = dist.get_rank()\n return rank, world_size\ndef main_only(func):\n @functools.wraps(func)\n def wrapper(*args, **kwargs):\n rank, _ = get_dist_info()\n if rank == 0:\n return func(*args, **kwargs)\n return wrapper" + } + ] +} \ No newline at end of file diff --git a/docs/doc/133c60ab-9ffd-445e-9459-52ac1d4dd941.json b/docs/doc/133c60ab-9ffd-445e-9459-52ac1d4dd941.json new file mode 100644 index 000000000..0aef3f9f0 --- /dev/null +++ b/docs/doc/133c60ab-9ffd-445e-9459-52ac1d4dd941.json @@ -0,0 +1,15 @@ +{ + "summary": "The given code contains a list of unique URLs for MP4 video files from the \"EuroCup2016\" dataset in the \"FootballAction\" application, which can be used for training or testing purposes.", + "details": [ + { + "comment": "This code contains a list of URLs to mp4 video files from the \"EuroCup2016\" dataset in the \"FootballAction\" application. 
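Looking back at the `dist_utils.py` helpers documented above, a typical use of `main_only` is to guard work that only rank 0 should perform, such as saving a checkpoint; `save_checkpoint` below is a hypothetical example, not a function from the codebase.

```python
# Hypothetical usage of the `main_only` pattern: in a multi-GPU run only rank 0
# executes the wrapped function, every other rank silently gets None.
import functools
import paddle.distributed as dist

def main_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if dist.get_rank() == 0:
            return func(*args, **kwargs)
    return wrapper

@main_only
def save_checkpoint(state_dict, path):  # hypothetical helper, illustration only
    print(f"rank 0 saving {len(state_dict)} entries to {path}")

save_checkpoint({}, "./epoch_1.pdparams")  # runs only on rank 0
```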
These videos may be used for training or testing purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/url.list\":0-25", + "content": "mp4/63e51df254d2402fac703b6c4fdb4ea9.mp4\nmp4/76b5f7ee28d942988c6b224bfac136bd.mp4\nmp4/250b88724acf40dbb6d7e8ccb400ef38.mp4\nmp4/c9516c903de3416c97dae91a59e968d7.mp4\nmp4/e1982c90cdd74abaacc4d0692070b400.mp4\nmp4/1be705a8f67648da8ec4b4296fa80895.mp4\nmp4/de23c0b2be3a4eb1990c5c657061fb29.mp4\nmp4/2754615de6e64c4fb95ce1a8095dc1c1.mp4\nmp4/299fe30d8f3b4a45b89313fe31f9f3c0.mp4\nmp4/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4\nmp4/22e89747689e4f7e83e3620620c93269.mp4\nmp4/2ceb6c549fc64305a06a75acb355642b.mp4\nmp4/719b0a4bcb1f461eabb152298406b861.mp4\nmp4/259856b769044b4d8dc94076deb356bf.mp4\nmp4/d0bd3eab1e794f0f9501c353a6d37827.mp4\nmp4/19eb47cc736240d6b2dd930ab69da839.mp4\nmp4/4435b708af6d48519a6b726144147d51.mp4\nmp4/ea16ad2a020643529e257bd6cb11b3c3.mp4\nmp4/eeebffbd4ec74222a9c2d0775d79b689.mp4\nmp4/8cfb4e605af44055b1576c37eb0e3209.mp4\nmp4/6bca62b57cc449c6935f0b17f28d06be.mp4\nmp4/70cfc31e520840b2afca458f93a01ce4.mp4\nmp4/6496960935e845578e391a5916739752.mp4\nmp4/d6d25403a4bb4784aecff5f21fd00dc5.mp4\nmp4/3e23d452a082403391f8abfb87bf2fb4.mp4\nmp4/4c5d9d9af4f044c4a68d134061dc264f.mp4" + }, + { + "comment": "This code contains a list of URLs pointing to MP4 video files from the \"EuroCup2016\" dataset in the \"FootballAction\" application of the PaddleVideo library. The URLs are unique identifiers for each video file, allowing for easy access and retrieval.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/url.list\":26-48", + "content": "mp4/6994844c64b44c26b935cee9604bef0a.mp4\nmp4/d6322cb95f6a4402ac80432b561abd5d.mp4\nmp4/2c8b5587083a4784a51622e4fec87ccd.mp4\nmp4/5faa60d70ed141de8560110e840f2048.mp4\nmp4/45d08bc5cb0f424f9ed9d7874eb561cd.mp4\nmp4/6630aaf0e32146088d0b624e9288f071.mp4\nmp4/f2edbee29c1b4966b3a410260f78fbe3.mp4\nmp4/f24116fdd6a54214991db32f7dddef67.mp4\nmp4/0265731a0c6f4a9398c88db8e3d4a3bc.mp4\nmp4/02d2de09997f4215b06e3b00ff0502a0.mp4\nmp4/9c231896c56a43f291a5e190949f4333.mp4\nmp4/4afbbf9afcd44dfea45b044117cccb48.mp4\nmp4/745db97a080d4f44b450dc17a2bcf069.mp4\nmp4/5933d0ce17854483b81a318d7d45a34e.mp4\nmp4/d2cfef2da9f84237a6950c7f6659655c.mp4\nmp4/5572686cb90f440988ded956a60e555d.mp4\nmp4/8962ac5a332346e180c79d701ae0a175.mp4\nmp4/f6e64ee9b13a4088b24c45c257894c1e.mp4\nmp4/f6ed2b612b3d43baa0726be8b14ebe7c.mp4\nmp4/8ab7b0cba5744eb3b6fb10003dfda383.mp4\nmp4/1f0a0698e38d493988fe42a50f7e8723.mp4\nmp4/737fdb054ca141f2a45013c1740dd0a0.mp4\nmp4/bab63a9bcf204e4b99c4a887a01bfd60.mp4" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1425e044-d978-4014-9936-3d6c863fb682.json b/docs/doc/1425e044-d978-4014-9936-3d6c863fb682.json new file mode 100644 index 000000000..999a36270 --- /dev/null +++ b/docs/doc/1425e044-d978-4014-9936-3d6c863fb682.json @@ -0,0 +1,195 @@ +{ + "summary": "The model employs PaddlePaddle for object matching, k-nearest neighbor search, SpatialCorrelationSampler for pairwise distances, and a neural network with separable convolutional layers for semantic segmentation. The Ma-Net's int_seghead updates global and local maps for sequence processing and performs tensor operations for video object segmentation.", + "details": [ + { + "comment": "This code snippet is from a PaddlePaddle-based video object detection model. 
It defines functions for calculating pairwise distances between embeddings and initializes some global variables. The model is designed to take reference and query embeddings as input, compute pairwise squared L2 distances, and returns the flattened tensor of distances. This distance calculation is likely used in the matching process of objects in the video frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":0-41", + "content": "import os\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport sys\nsys.path.append(\"..\")\nfrom config import cfg\nimport time\nimport paddle.nn.functional as F\nfrom utils.api import int_, float_, long_\nfrom utils.api import kaiming_normal_\n#############################################################GLOBAL_DIST_MAP\nMODEL_UNFOLD = True\nWRONG_LABEL_PADDING_DISTANCE = 1e20\ndef _pairwise_distances(x, y, ys=None):\n \"\"\"Computes pairwise squared l2 distances between tensors x and y.\n Args:\n x: Tensor of shape [n, feature_dim].\n y: Tensor of shape [m, feature_dim].\n Returns:\n Float32 distances tensor of shape [n, m].\n \"\"\"\n xs = paddle.sum(x * x, 1)\n xs = xs.unsqueeze(1)\n if ys is None:\n ys = paddle.sum(y * y, 1)\n ys = ys.unsqueeze(0)\n else:\n ys = ys\n d = xs + ys - 2. * paddle.matmul(x, paddle.t(y))\n return d, ys\n##################\ndef _flattened_pairwise_distances(reference_embeddings, query_embeddings, ys):\n \"\"\"Calculates flattened tensor of pairwise distances between ref and query." + }, + { + "comment": "The code calculates the distance between reference and query embeddings, performing pairwise distances calculations using the _pairwise_distances function. The result is a distance tensor with shape [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim]. This function also includes _nn_features_per_object_for_chunk which extracts features for each object using nearest neighbor attention.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":42-64", + "content": " Args:\n reference_embeddings: Tensor of shape [..., embedding_dim],\n the embedding vectors for the reference frame\n query_embeddings: Tensor of shape [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n Returns:\n A distance tensor of shape [reference_embeddings.size / embedding_dim,\n query_embeddings.size / embedding_dim]\n \"\"\"\n embedding_dim = query_embeddings.shape[-1]\n reference_embeddings = reference_embeddings.reshape([-1, embedding_dim])\n first_dim = -1\n query_embeddings = query_embeddings.reshape([first_dim, embedding_dim])\n dists, ys = _pairwise_distances(query_embeddings, reference_embeddings, ys)\n return dists, ys\ndef _nn_features_per_object_for_chunk(reference_embeddings, query_embeddings,\n wrong_label_mask, k_nearest_neighbors,\n ys):\n \"\"\"Extracts features for each object using nearest neighbor attention.\n Args:\n reference_embeddings: Tensor of shape [n_chunk, embedding_dim]," + }, + { + "comment": "This code calculates pairwise distances between reference and query embedding vectors, selects the k-nearest neighbors, and returns the nearest neighbor features. 
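The expansion used by `_pairwise_distances` above, ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 <x_i, y_j>, can be checked directly with a small sketch:

```python
# Quick check of the expansion behind `_pairwise_distances`:
# d[i, j] = ||x_i||^2 + ||y_j||^2 - 2 * <x_i, y_j> equals ||x_i - y_j||^2.
import paddle

x = paddle.randn([5, 8])   # n = 5 query vectors, feature_dim = 8
y = paddle.randn([3, 8])   # m = 3 reference vectors

xs = paddle.sum(x * x, 1).unsqueeze(1)              # (5, 1)
ys = paddle.sum(y * y, 1).unsqueeze(0)              # (1, 3)
d = xs + ys - 2.0 * paddle.matmul(x, paddle.t(y))   # (5, 3)

brute = paddle.sum((x.unsqueeze(1) - y.unsqueeze(0)) ** 2, axis=-1)
print(paddle.allclose(d, brute, atol=1e-5))  # tensor holding True
```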
It takes into account a wrong_label_mask and padding distance, which helps handle incorrect labels and avoid noisy data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":65-87", + "content": " the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [m_chunk, embedding_dim], the embedding\n vectors for the query frames.\n wrong_label_mask:\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m_chunk, n_objects, feature_dim].\n \"\"\"\n # reference_embeddings_key = reference_embeddings\n # query_embeddings_key = query_embeddings\n dists, ys = _flattened_pairwise_distances(reference_embeddings,\n query_embeddings, ys)\n dists = (paddle.unsqueeze(dists, 1) +\n paddle.unsqueeze(float_(wrong_label_mask), 0) *\n WRONG_LABEL_PADDING_DISTANCE)\n if k_nearest_neighbors == 1:\n features = paddle.min(dists, 2, keepdim=True)\n else:\n dists, _ = paddle.topk(-dists, k=k_nearest_neighbors, axis=2)\n dists = -dists\n valid_mask = (dists < WRONG_LABEL_PADDING_DISTANCE)" + }, + { + "comment": "The code calculates the nearest neighbor features for each object in chunks to save memory. It starts by masking and averaging distances between reference and query embeddings, then selects relevant indices from flattened arrays to calculate nearest neighbors for objects. The function takes `reference_embeddings_flat`, `query_embeddings_flat`, `reference_labels_flat`, `ref_obj_ids`, `k_nearest_neighbors`, and `n_chunks` as input and returns the features and labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":88-117", + "content": " masked_dists = dists * valid_mask.float()\n pad_dist = paddle.max(masked_dists, axis=2, keepdim=True)[0].tile(\n (1, 1, masked_dists.shape[-1]))\n dists = paddle.where(valid_mask, dists, pad_dist)\n # take mean of distances\n features = paddle.mean(dists, axis=2, keepdim=True)\n return features, ys\n###\ndef _selected_pixel(ref_labels_flat, ref_emb_flat):\n index_list = paddle.arange(len(ref_labels_flat))\n index_list = index_list\n index_ = paddle.masked_select(index_list, ref_labels_flat != -1)\n index_ = long_(index_)\n ref_labels_flat = paddle.index_select(ref_labels_flat, index_, 0)\n ref_emb_flat = paddle.index_select(ref_emb_flat, index_, 0)\n return ref_labels_flat, ref_emb_flat\n###\ndef _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n ref_obj_ids, k_nearest_neighbors, n_chunks):\n \"\"\"Calculates the nearest neighbor features per object in chunks to save mem." + }, + { + "comment": "This code performs k-nearest neighbor search using chunking to save memory. It takes embedding vectors for reference and query frames, their class labels, object ids, the number of nearest neighbors, and the number of chunks as input. It calculates the chunk size based on the number of query frames and the specified number of chunks. If TEST_MODE is enabled, it selects some pixels from the input. Then, it checks if the reference labels are equal to unsqueezed object ids for each query frame and creates a mask for wrong labels. 
It returns nearest neighbor features of shape [m, n_objects, feature_dim].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":118-140", + "content": " Uses chunking to bound the memory use.\n Args:\n reference_embeddings_flat: Tensor of shape [n, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings_flat: Tensor of shape [m, embedding_dim], the embedding\n vectors for the query frames.\n reference_labels_flat: Tensor of shape [n], the class labels of the\n reference frame.\n ref_obj_ids: int tensor of unique object ids in the reference labels.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m, n_objects, feature_dim].\n \"\"\"\n chunk_size = int_(\n np.ceil((float_(query_embeddings_flat.shape[0]) / n_chunks).numpy()))\n if cfg.TEST_MODE:\n reference_labels_flat, reference_embeddings_flat = _selected_pixel(\n reference_labels_flat, reference_embeddings_flat)\n wrong_label_mask = (reference_labels_flat != paddle.unsqueeze(" + }, + { + "comment": "This code calculates nearest neighbor features for each object across multiple chunks. It splits the query embeddings into different chunks, then computes the features for each chunk individually. If there is only one chunk, it returns the features directly. Otherwise, it concatenates all the computed features along axis 0 and returns them as nearest neighbor features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":141-168", + "content": " ref_obj_ids, 1))\n all_features = []\n for n in range(n_chunks):\n if n == 0:\n ys = None\n if n_chunks == 1:\n query_embeddings_flat_chunk = query_embeddings_flat\n else:\n chunk_start = n * chunk_size\n chunk_end = (n + 1) * chunk_size\n query_embeddings_flat_chunk = query_embeddings_flat[\n chunk_start:chunk_end]\n features, ys = _nn_features_per_object_for_chunk(\n reference_embeddings_flat, query_embeddings_flat_chunk,\n wrong_label_mask, k_nearest_neighbors, ys)\n all_features.append(features)\n if n_chunks == 1:\n nn_features = all_features[0]\n else:\n nn_features = paddle.concat(all_features, axis=0)\n return nn_features\ndef nearest_neighbor_features_per_object(reference_embeddings,\n query_embeddings,\n reference_labels,\n k_nearest_neighbors,\n gt_ids=None," + }, + { + "comment": "This function calculates the distance between nearest neighbors in reference_embeddings and query_embeddings for each object. It uses the provided reference_labels to determine objects, subsamples if max_neighbors_per_object is specified, and considers k_nearest_neighbors. 
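The chunking scheme these comments describe (split the flattened query pixels into `n_chunks` slices, compute features per slice, concatenate along axis 0) is generic enough to sketch on its own; `per_chunk_features` below is a stand-in for `_nn_features_per_object_for_chunk`.

```python
# Generic sketch of chunked processing used to bound memory: slice the query
# embeddings, run the per-chunk computation, and concatenate the results.
import numpy as np
import paddle

def run_in_chunks(query_flat, n_chunks, per_chunk_features):
    chunk_size = int(np.ceil(query_flat.shape[0] / n_chunks))
    outs = []
    for i in range(n_chunks):
        chunk = query_flat[i * chunk_size:(i + 1) * chunk_size]
        if chunk.shape[0] == 0:
            break
        outs.append(per_chunk_features(chunk))
    return outs[0] if len(outs) == 1 else paddle.concat(outs, axis=0)

q = paddle.randn([1000, 100])
feats = run_in_chunks(q, n_chunks=4,
                      per_chunk_features=lambda c: c.mean(axis=1, keepdim=True))
print(feats.shape)  # [1000, 1]
```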
The gt_ids are used for determining unique ground truth ids in the first frame.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":169-185", + "content": " n_chunks=100):\n \"\"\"Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n reference_embeddings: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n reference_labels: Tensor of shape [height, width, 1], the class labels of\n the reference frame.\n max_neighbors_per_object: Integer, the maximum number of candidates\n for the nearest neighbor query per object after subsampling,\n or 0 for no subsampling.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n gt_ids: Int tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame. If None, it will be derived from" + }, + { + "comment": "This function calculates the nearest neighbor features for query images using reference embeddings and labels. It first asserts that the shape of reference embeddings matches the shape of reference labels. Then, it flattens the reference labels and checks if gt_ids (ground truth ids) are provided. If not, it finds unique object ids in the reference labels, creates a tensor with those ids, and converts them to integer type. Else, it converts the given gt_ids to integers. The function reshapes the query and reference embeddings, calculates embedding dimensions, and returns the nearest neighbor features and gt_ids.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":186-210", + "content": " reference_labels.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [n_query_images, height, width, n_objects, feature_dim].\n gt_ids: An int32 tensor of the unique sorted object ids present\n in the reference labels.\n \"\"\"\n assert (reference_embeddings.shape[:2] == reference_labels.shape[:2])\n h, w, _ = query_embeddings.shape\n reference_labels_flat = reference_labels.reshape([-1])\n if gt_ids is None:\n ref_obj_ids = paddle.unique(reference_labels_flat)[-1]\n ref_obj_ids = np.arange(0, ref_obj_ids + 1)\n gt_ids = paddle.to_tensor(ref_obj_ids)\n gt_ids = int_(gt_ids)\n else:\n gt_ids = int_(paddle.arange(0, gt_ids + 1))\n embedding_dim = query_embeddings.shape[-1]\n query_embeddings_flat = query_embeddings.reshape([-1, embedding_dim])\n reference_embeddings_flat = reference_embeddings.reshape(\n [-1, embedding_dim])" + }, + { + "comment": "This code chunk performs nearest neighbor feature extraction for each object in the image, reshapes it, and then returns it along with gt_ids. The local_pairwise_distances function computes pairwise squared l2 distances using a local search window. 
It is used to compare features between different points in an optimized manner, considering a maximum distance per dimension.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":211-234", + "content": " nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n gt_ids, k_nearest_neighbors, n_chunks)\n nn_features_dim = nn_features.shape[-1]\n nn_features = nn_features.reshape(\n [1, h, w, gt_ids.shape[0], nn_features_dim])\n return nn_features.cuda(), gt_ids\n########################################################################LOCAL_DIST_MAP\ndef local_pairwise_distances(x, y, max_distance=9):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Optimized implementation using correlation_cost.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 distances tensor of shape\n [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n if cfg.MODEL_LOCAL_DOWNSAMPLE:" + }, + { + "comment": "This code is performing cross-correlation between two input tensors and applying a boundary condition to the resulting tensor. It then applies sigmoid activation, resizes the tensor to original dimensions, and transposes it back to the original shape before unsqueezing the last dimension.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":235-260", + "content": " #####\n ori_h, ori_w, _ = x.shape\n x = x.transpose([2, 0, 1]).unsqueeze(0)\n x = F.avg_pool2d(x, (2, 2), (2, 2))\n y = y.transpose([2, 0, 1]).unsqueeze(0)\n y = F.avg_pool2d(y, (2, 2), (2, 2))\n x = x.squeeze(0).transpose([1, 2, 0])\n y = y.squeeze(0).transpose([1, 2, 0])\n corr = cross_correlate(x, y, max_distance=max_distance)\n xs = paddle.sum(x * x, 2, keepdim=True)\n ys = paddle.sum(y * y, 2, keepdim=True)\n ones_ys = paddle.ones_like(ys)\n ys = cross_correlate(ones_ys, ys, max_distance=max_distance)\n d = xs + ys - 2 * corr\n # Boundary should be set to Inf.\n tmp = paddle.zeros_like(d)\n boundary = paddle.equal(\n cross_correlate(ones_ys, ones_ys, max_distance=max_distance), 0)\n d = paddle.where(boundary, tmp.fill_(float_('inf')), d)\n d = (paddle.nn.functional.sigmoid(d) - 0.5) * 2\n d = d.transpose([2, 0, 1]).unsqueeze(0)\n d = F.interpolate(d,\n size=(ori_h, ori_w)," + }, + { + "comment": "This code calculates the pairwise squared l2 distances between two tensors using either correlation or cross-correlation method, depending on the mode. In correlation mode, it uses bilinear interpolation and aligns corners. Otherwise, it uses cross-correlate function with a max_distance parameter. 
It also handles boundary cases by setting values to infinity where necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":261-286", + "content": " mode='bilinear',\n align_corners=True)\n d = d.squeeze(0).transpose([1, 2, 0])\n else:\n corr = cross_correlate(x, y, max_distance=max_distance)\n xs = paddle.sum(x * x, 2, keepdim=True)\n ys = paddle.sum(y * y, 2, keepdim=True)\n ones_ys = paddle.ones_like(ys)\n ys = cross_correlate(ones_ys, ys, max_distance=max_distance)\n d = xs + ys - 2 * corr\n # Boundary should be set to Inf.\n tmp = paddle.zeros_like(d)\n boundary = paddle.equal(\n cross_correlate(ones_ys, ones_ys, max_distance=max_distance), 0)\n d = paddle.where(boundary, tmp.fill_(float_('inf')), d)\n return d\ndef local_pairwise_distances2(x, y, max_distance=9):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Naive implementation using map_fn.\n Used as a slow fallback for when correlation_cost is not available.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim]." + }, + { + "comment": "This code section performs local downsampling on the input tensors x and y. It first transposes the tensors and applies average pooling with a 2x2 kernel to reduce their size. Then, it pads the result of y with a large value, calculates offsets using unfolding, subtracts them from x, and sums squared differences across channels. The result is a distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":287-311", + "content": " max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 distances tensor of shape\n [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n if cfg.MODEL_LOCAL_DOWNSAMPLE:\n ori_h, ori_w, _ = x.shape\n x = paddle.transpose(x, [2, 0, 1]).unsqueeze(0)\n x = F.avg_pool2d(x, (2, 2), (2, 2))\n y = paddle.transpose(y, [2, 0, 1]).unsqueeze(0)\n y = F.avg_pool2d(y, (2, 2), (2, 2))\n _, channels, height, width = x.shape\n padding_val = 1e20\n padded_y = F.pad(\n y, (max_distance, max_distance, max_distance, max_distance),\n mode='constant',\n value=padding_val)\n offset_y = F.unfold(padded_y, kernel_sizes=[height, width]).reshape(\n [1, channels, height, width, -1])\n x = x.reshape([1, channels, height, width, 1])\n minus = x - offset_y\n dists = paddle.sum(paddle.multiply(minus, minus),\n axis=1).reshape([1, height, width," + }, + { + "comment": "This code calculates the distance between a set of 2D points and another point, in a sliding window manner. It handles two cases: when the first point set has been divided into smaller blocks for faster computation, and when it hasn't. 
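The local search window that `local_pairwise_distances2` implements restricts each query pixel to candidates at most `max_distance` pixels away; the naive NumPy sketch below makes the `(2*d+1)**2` offset layout and the large padding value explicit (small sizes, illustration only).

```python
# Naive sketch of the sliding-window squared distances: pad `y` with a huge
# value so out-of-image offsets are effectively infinite, then compare `x`
# against every (dy, dx) shift inside the window.
import numpy as np

def local_sqdist(x, y, d=2, pad_val=1e20):
    h, w, c = x.shape
    padded = np.full((h + 2 * d, w + 2 * d, c), pad_val, dtype=np.float64)
    padded[d:d + h, d:d + w] = y
    dists = []
    for dy in range(2 * d + 1):
        for dx in range(2 * d + 1):
            offset = padded[dy:dy + h, dx:dx + w]
            dists.append(np.sum((x - offset) ** 2, axis=2))
    return np.stack(dists, axis=2)

x = np.random.rand(6, 6, 4)
y = np.random.rand(6, 6, 4)
print(local_sqdist(x, y).shape)  # (6, 6, 25) == (h, w, (2*d+1)**2)
```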
The result is stored in dists as a list of distance matrices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":312-335", + "content": " -1]).transpose([0, 3, 1, 2])\n dists = (paddle.nn.functional.sigmoid(dists) - 0.5) * 2\n dists = F.interpolate(dists,\n size=[ori_h, ori_w],\n mode='bilinear',\n align_corners=True)\n dists = dists.squeeze(0).transpose([1, 2, 0])\n else:\n padding_val = 1e20\n padded_y = nn.functional.pad(\n y, (0, 0, max_distance, max_distance, max_distance, max_distance),\n mode='constant',\n value=padding_val)\n height, width, _ = x.shape\n dists = []\n for y_start in range(2 * max_distance + 1):\n y_end = y_start + height\n y_slice = padded_y[y_start:y_end]\n for x_start in range(2 * max_distance + 1):\n x_end = x_start + width\n offset_y = y_slice[:, x_start:x_end]\n dist = paddle.sum(paddle.pow((x - offset_y), 2), dim=2)\n dists.append(dist)" + }, + { + "comment": "This code defines the SpatialCorrelationSampler class and a function called cross_correlate. The cross_correlate function takes two tensors, x and y, of shape [height, width, feature_dim] as inputs. It computes the cross correlation of these tensors using an optimized implementation from the SpatialCorrelationSampler class. The output tensor has a shape of [height, width, (2 * max_distance + 1) ** 2].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":336-364", + "content": " dists = paddle.stack(dists, dim=2)\n return dists\nclass SpatialCorrelationSampler:\n pass\ndef cross_correlate(x, y, max_distance=9):\n \"\"\"Efficiently computes the cross correlation of x and y.\n Optimized implementation using correlation_cost.\n Note that we do not normalize by the feature dimension.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 tensor of shape [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n corr_op = SpatialCorrelationSampler(kernel_size=1,\n patch_size=2 * max_distance + 1,\n stride=1,\n dilation_patch=1,\n padding=0)\n xs = x.transpose(2, 0, 1)\n xs = paddle.unsqueeze(xs, 0)" + }, + { + "comment": "This code is part of the IntVOS model and defines a function that computes nearest neighbor features, allowing only local matches. It takes previous frame embedding, query embedding, previous frame labels, ground truth IDs and maximum distance as input. 
It transposes and reshapes the tensors before returning the computed correlations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":365-391", + "content": " ys = y.transpose(2, 0, 1)\n ys = paddle.unsqueeze(ys, 0)\n corr = corr_op(xs, ys)\n bs, _, _, hh, ww = corr.shape\n corr = corr.reshape([bs, -1, hh, ww])\n corr = paddle.squeeze(corr, 0)\n corr = corr.transpose(1, 2, 0)\n return corr\ndef local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding,\n query_embedding,\n prev_frame_labels,\n gt_ids,\n max_distance=12):\n \"\"\"Computes nearest neighbor features while only allowing local matches.\n Args:\n prev_frame_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the last frame.\n query_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the query frames.\n prev_frame_labels: Tensor of shape [height, width, 1], the class labels of\n the previous frame.\n gt_ids: Int Tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame.\n max_distance: Integer, the maximum distance allowed for local matching." + }, + { + "comment": "This function calculates the nearest neighbor features using local pairwise distances. If MODEL_UNFOLD is set, it pads and unfolds the labels for offset masks generation. Else, it directly creates masks by comparing prev_frame_labels to gt_ids.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":392-420", + "content": " Returns:\n nn_features: A float32 np.array of nearest neighbor features of shape\n [1, height, width, n_objects, 1].\n \"\"\"\n d = local_pairwise_distances2(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance)\n height, width = prev_frame_embedding.shape[:2]\n if MODEL_UNFOLD:\n labels = float_(prev_frame_labels).transpose([2, 0, 1]).unsqueeze(0)\n padded_labels = F.pad(labels, (\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n ))\n offset_labels = F.unfold(padded_labels,\n kernel_sizes=[height, width],\n strides=[2, 2]).reshape([height, width, -1, 1])\n offset_masks = paddle.equal(\n offset_labels,\n float_(gt_ids).unsqueeze(0).unsqueeze(0).unsqueeze(0))\n else:\n masks = paddle.equal(prev_frame_labels,\n gt_ids.unsqueeze(0).unsqueeze(0))" + }, + { + "comment": "This code applies padding to masks and creates offset masks by slicing the padded masks. It then constructs a 3D tensor of offset masks using Paddle's stack function. It also tiles the input data 'd' along the gt_ids dimension and creates a padding tensor. The code then computes the minimum distance between the tiled input and the masked data, resulting in distances tensor. 
Finally, it reshapes the distances tensor to have a specific shape and returns it.\nThe _res_block class is a layer that takes an input dimension (in_dim) and an output dimension (out_dim).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":421-453", + "content": " padded_masks = nn.functional.pad(masks, (\n 0,\n 0,\n max_distance,\n max_distance,\n max_distance,\n max_distance,\n ))\n offset_masks = []\n for y_start in range(2 * max_distance + 1):\n y_end = y_start + height\n masks_slice = padded_masks[y_start:y_end]\n for x_start in range(2 * max_distance + 1):\n x_end = x_start + width\n offset_mask = masks_slice[:, x_start:x_end]\n offset_masks.append(offset_mask)\n offset_masks = paddle.stack(offset_masks, axis=2)\n d_tiled = d.unsqueeze(-1).tile((1, 1, 1, gt_ids.shape[0]))\n pad = paddle.ones_like(d_tiled)\n d_masked = paddle.where(offset_masks, d_tiled, pad)\n dists = paddle.min(d_masked, axis=2)\n dists = dists.reshape([1, height, width, gt_ids.shape[0], 1])\n return dists\n##############################################################\n#################\nclass _res_block(nn.Layer):\n def __init__(self, in_dim, out_dim):" + }, + { + "comment": "This code defines a Residual Block and an Instance Segmentation Head for the Ma-Net model. The Residual Block consists of two 3x3 convolutions, batch normalization, and ReLU activations, while the IntSegHead layer takes in a specific input dimension for instance segmentation tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":454-485", + "content": " super(_res_block, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu1 = nn.ReLU()\n self.bn1 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM)\n self.conv2 = nn.Conv2D(out_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu2 = nn.ReLU()\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM)\n def forward(self, x):\n res = x\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x += res\n return x\n####################\nclass IntSegHead(nn.Layer):\n def __init__(self,\n in_dim=(cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3)," + }, + { + "comment": "This code defines a neural network class called \"IntSegHead\" for segmentation tasks. It consists of multiple convolutional and batch normalization layers, followed by ReLU activations. 
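The `_res_block` unit that `IntSegHead` stacks is a standard two-stage residual block; a compact restatement of the quoted pattern (the channel count is illustrative):

```python
# Compact restatement of the `_res_block` pattern: two 3x3 conv + BN + ReLU
# stages whose output is added back to the input (identity skip connection).
import paddle
import paddle.nn as nn

class TinyResBlock(nn.Layer):
    def __init__(self, ch):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2D(ch, ch, 3, padding=1), nn.BatchNorm2D(ch), nn.ReLU(),
            nn.Conv2D(ch, ch, 3, padding=1), nn.BatchNorm2D(ch), nn.ReLU(),
        )

    def forward(self, x):
        return x + self.body(x)

block = TinyResBlock(64)
print(block(paddle.randn([2, 64, 32, 32])).shape)  # [2, 64, 32, 32]
```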
The output is passed through another convolutional layer before being fed into the final convolutional layer to produce the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":486-512", + "content": " emb_dim=cfg.MODEL_HEAD_EMBEDDING_DIM):\n super(IntSegHead, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n emb_dim,\n kernel_size=7,\n stride=1,\n padding=3)\n self.bn1 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg.TRAIN_BN_MOM)\n self.relu1 = nn.ReLU(True)\n self.res1 = _res_block(emb_dim, emb_dim)\n self.res2 = _res_block(emb_dim, emb_dim)\n self.conv2 = nn.Conv2D(256, emb_dim, kernel_size=3, stride=1, padding=1)\n self.bn2 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg.TRAIN_BN_MOM)\n self.relu2 = nn.ReLU(True)\n self.conv3 = nn.Conv2D(emb_dim, 1, 1, 1)\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.res1(x)\n x = self.res2(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x = self.conv3(x)\n return x" + }, + { + "comment": "This code defines a custom layer \"_split_separable_conv2d\" that performs separable convolution using two consecutive 2D convolutions. It consists of two 2D convolutions separated by Batch Normalization and ReLU activation functions. The first convolution is followed by Batch Normalization and ReLU, while the second convolution is also followed by another Batch Normalization and ReLU. Weights are initialized using Kaiming Normal initialization for both convolutions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":515-536", + "content": "class _split_separable_conv2d(nn.Layer):\n def __init__(self, in_dim, out_dim, kernel_size=7):\n super(_split_separable_conv2d, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n in_dim,\n kernel_size=kernel_size,\n stride=1,\n padding=int((kernel_size - 1) / 2),\n groups=in_dim)\n self.relu1 = nn.ReLU(True)\n self.bn1 = paddle.nn.BatchNorm2D(in_dim, momentum=cfg.TRAIN_BN_MOM)\n self.conv2 = nn.Conv2D(in_dim, out_dim, kernel_size=1, stride=1)\n self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM)\n kaiming_normal_(self.conv1.weight, mode='fan_out', nonlinearity='relu')\n kaiming_normal_(self.conv2.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)" + }, + { + "comment": "The code defines a DynamicSegHead class with four split separable convolutional layers, followed by a 1x1 convolution. It also initializes an IntVOS class that takes in the configuration and feature extractor as parameters. 
The classes are used for semantic segmentation tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":537-570", + "content": " x = self.bn2(x)\n x = self.relu2(x)\n return x\nclass DynamicSegHead(nn.Layer):\n def __init__(self,\n in_dim=(cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3),\n embed_dim=cfg.MODEL_HEAD_EMBEDDING_DIM,\n kernel_size=1):\n super(DynamicSegHead, self).__init__()\n self.layer1 = _split_separable_conv2d(in_dim, embed_dim)\n self.layer2 = _split_separable_conv2d(embed_dim, embed_dim)\n self.layer3 = _split_separable_conv2d(embed_dim, embed_dim)\n self.layer4 = _split_separable_conv2d(embed_dim, embed_dim)\n self.conv = nn.Conv2D(embed_dim, 1, 1, 1)\n kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n x = self.conv(x)\n return x\n##################\n###############\nclass IntVOS(nn.Layer):\n def __init__(self, cfg, feature_extracter):\n super(IntVOS, self).__init__()" + }, + { + "comment": "The code initializes components for a network architecture. It creates feature extractors, convolutional layers, batch normalization layers, and ReLU activation functions to process and extract semantic features from input data. These features will be used for tasks such as object detection or image classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":571-588", + "content": " self.feature_extracter = feature_extracter ##embedding extractor\n self.feature_extracter.cls_conv = nn.Sequential()\n self.feature_extracter.upsample4 = nn.Sequential()\n self.semantic_embedding = None\n self.seperate_conv = nn.Conv2D(cfg.MODEL_ASPP_OUTDIM,\n cfg.MODEL_ASPP_OUTDIM,\n kernel_size=3,\n stride=1,\n padding=1,\n groups=cfg.MODEL_ASPP_OUTDIM)\n self.bn1 = paddle.nn.BatchNorm2D(cfg.MODEL_ASPP_OUTDIM,\n momentum=cfg.TRAIN_BN_MOM)\n self.relu1 = nn.ReLU(True)\n self.embedding_conv = nn.Conv2D(cfg.MODEL_ASPP_OUTDIM,\n cfg.MODEL_SEMANTIC_EMBEDDING_DIM, 1, 1)\n self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(cfg.MODEL_SEMANTIC_EMBEDDING_DIM,\n momentum=cfg.TRAIN_BN_MOM)" + }, + { + "comment": "The code initializes the network's semantic embedding layer, consisting of a sequence of convolutional layers, and applies Kaiming initialization to the weights. It also creates a dynamic segmentation head (seghead) for propagation and an interaction segmentation head based on the configuration flag MODEL_USEIntSeg. 
The function defines the forward pass for the network, taking in various inputs such as image data, reference labels, and masks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":589-615", + "content": " self.semantic_embedding = nn.Sequential(*[\n self.seperate_conv, self.bn1, self.relu1, self.embedding_conv,\n self.bn2, self.relu2\n ])\n for m in self.semantic_embedding:\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')\n self.dynamic_seghead = DynamicSegHead() # propagation segm head\n if cfg.MODEL_USEIntSeg:\n self.inter_seghead = IntSegHead(\n in_dim=cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3)\n else:\n self.inter_seghead = DynamicSegHead(\n in_dim=cfg.MODEL_SEMANTIC_EMBEDDING_DIM +\n 2) # interaction segm head\n def forward(self,\n x=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1," + }, + { + "comment": "This code splits the input feature into three parts, then if `global_map_tmp_dic` is None, it passes these parts and other parameters to `prop_seghead()`, which returns a dictionary. If `global_map_tmp_dic` is not None, it also passes `global_map_tmp_dic` as an additional parameter before calling `prop_seghead()`. The function then returns the returned dictionary and updates `global_map_tmp_dic`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":616-639", + "content": " global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None):\n x = self.extract_feature(x)\n # print('extract_feature:', x.mean().item())\n ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split(\n x, num_or_sections=3, axis=0)\n if global_map_tmp_dic is None:\n dic = self.prop_seghead(\n ref_frame_embedding, previous_frame_embedding,\n current_frame_embedding, ref_scribble_label,\n previous_frame_mask, normalize_nearest_neighbor_distances,\n use_local_map, seq_names, gt_ids, k_nearest_neighbors,\n global_map_tmp_dic, local_map_dics, interaction_num,\n start_annotated_frame, frame_num, self.dynamic_seghead)\n return dic\n else:\n dic, global_map_tmp_dic = self.prop_seghead(\n ref_frame_embedding, previous_frame_embedding," + }, + { + "comment": "The code defines a function that takes various inputs including frame embeddings, scribble labels, and masked frames. It performs feature extraction using a predefined feature extractor and semantic embedding. 
The function then returns the extracted features and temporary global map dictionaries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":640-663", + "content": " current_frame_embedding, ref_scribble_label,\n previous_frame_mask, normalize_nearest_neighbor_distances,\n use_local_map, seq_names, gt_ids, k_nearest_neighbors,\n global_map_tmp_dic, local_map_dics, interaction_num,\n start_annotated_frame, frame_num, self.dynamic_seghead)\n return dic, global_map_tmp_dic\n def extract_feature(self, x):\n x = self.feature_extracter(x)\n x = self.semantic_embedding(x)\n return x\n def prop_seghead(self,\n ref_frame_embedding=None,\n previous_frame_embedding=None,\n current_frame_embedding=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n global_map_tmp_dic=None," + }, + { + "comment": "This code defines a function that takes various inputs and returns feature_embedding, global_match_map, local_match_map, and previous_frame_mask. It performs interpolation on the ref_scribble_label and previous_frame_mask using nearest mode to resize them to the same size as current_frame_embedding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":664-684", + "content": " local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None,\n dynamic_seghead=None):\n \"\"\"return: feature_embedding,global_match_map,local_match_map,previous_frame_mask\"\"\"\n ###############\n global_map_tmp_dic = global_map_tmp_dic\n dic_tmp = {}\n bs, c, h, w = current_frame_embedding.shape\n if cfg.TEST_MODE:\n scale_ref_scribble_label = float_(ref_scribble_label)\n else:\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n scale_previous_frame_label = paddle.nn.functional.interpolate(\n float_(previous_frame_mask), size=(h, w), mode='nearest')\n # print(scale_previous_frame_label.sum()) # xx\n # print(previous_frame_mask.sum().item()) # xx" + }, + { + "comment": "In this code snippet, we see the process of extracting nearest neighbor features per object for each batch of frames. The frames are transposed and labeled before finding the k_nearest_neighbors. 
These operations are performed within a loop for every frame in the batch (bs).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":685-703", + "content": " scale_previous_frame_label = int_(scale_previous_frame_label)\n # print(scale_previous_frame_label.sum().item()) # xx\n for n in range(bs):\n seq_current_frame_embedding = current_frame_embedding[n]\n seq_ref_frame_embedding = ref_frame_embedding[n]\n seq_prev_frame_embedding = previous_frame_embedding[n]\n seq_ref_frame_embedding = seq_ref_frame_embedding.transpose(\n [1, 2, 0])\n seq_current_frame_embedding = seq_current_frame_embedding.transpose(\n [1, 2, 0])\n seq_ref_scribble_label = scale_ref_scribble_label[n].transpose(\n [1, 2, 0])\n #########Global Map\n nn_features_n, ref_obj_ids = nearest_neighbor_features_per_object(\n reference_embeddings=seq_ref_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_ref_scribble_label,\n k_nearest_neighbors=k_nearest_neighbors," + }, + { + "comment": "This code segment checks if the current sequence name exists in the global map temporary dictionary. If it does not exist, a paddle.ones_like(nn_features_n) is created and assigned to the dictionary with shape [104, 1, 1, 1, 1]. Then, the code performs a where operation using nn_features_n, comparing it to the global map value for the current sequence name and frame number. If nn_features_n is less than or equal to the global map value, it remains unchanged; otherwise, the global map value overwrites nn_features_n. The last line transposes seq_prev_frame_embedding before continuing with the next chunk of code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":704-724", + "content": " gt_ids=gt_ids[n],\n n_chunks=10)\n if normalize_nearest_neighbor_distances:\n nn_features_n = (paddle.nn.functional.sigmoid(nn_features_n) -\n 0.5) * 2\n if global_map_tmp_dic is not None: ###when testing, use global map memory\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([104, 1, 1, 1, 1])\n nn_features_n = paddle.where(\n nn_features_n <=\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0),\n nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0))\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n.detach()[0]\n #########################Local dist map\n seq_prev_frame_embedding = seq_prev_frame_embedding.transpose(" + }, + { + "comment": "The code is performing nearest neighbor feature extraction for previous frames in a video sequence. It checks if the use_local_map flag is set, and depending on its value, either uses local_previous_frame_nearest_neighbor_features_per_object function or nearest_neighbor_features_per_object function to extract features. If use_local_map is true, it takes previous frame embedding, current frame embedding, previous frame labels, reference object IDs and max distance as inputs. Otherwise, it takes previous frame embeddings, current frame embeddings, previous frame labels, k-nearest neighbors, gt_ids (for current iteration), and number of chunks as inputs. 
The code then assigns the extracted features to prev_frame_nn_features_n variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":725-744", + "content": " [1, 2, 0])\n seq_previous_frame_label = scale_previous_frame_label[n].transpose(\n [1, 2, 0])\n if use_local_map:\n prev_frame_nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_prev_frame_embedding,\n query_embedding=seq_current_frame_embedding,\n prev_frame_labels=seq_previous_frame_label,\n gt_ids=ref_obj_ids,\n max_distance=cfg.MODEL_MAX_LOCAL_DISTANCE)\n else:\n prev_frame_nn_features_n, _ = nearest_neighbor_features_per_object(\n reference_embeddings=seq_prev_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_previous_frame_label,\n k_nearest_neighbors=k_nearest_neighbors,\n gt_ids=gt_ids[n],\n n_chunks=20)\n prev_frame_nn_features_n = (" + }, + { + "comment": "This code segment is checking if the current sequence name is present in the local map dictionaries for distance and temporary maps. If it's not, it creates new entries with zeros initialized. The local map distance value is then updated based on the frame number and interaction number, using the absolute difference from a start annotated frame to determine the distance. This could be used in a video sequence processing context where local map dictionaries store temporary and distance maps for different sequences.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":745-763", + "content": " paddle.nn.functional.sigmoid(prev_frame_nn_features_n) -\n 0.5) * 2\n# print(prev_frame_nn_features_n.mean().item(), prev_frame_nn_features_n.shape, interaction_num) # o\n#############\n if local_map_dics is not None: ##When testing, use local map memory\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if seq_names[n] not in local_map_dist_dic:\n print(seq_names[n], 'not in local_map_dist_dic')\n local_map_dist_dic[seq_names[n]] = paddle.zeros(104, 9)\n if seq_names[n] not in local_map_tmp_dic:\n print(seq_names[n], 'not in local_map_tmp_dic')\n local_map_tmp_dic[seq_names[n]] = paddle.zeros_like(\n prev_frame_nn_features_n).unsqueeze(0).tile(\n [104, 9, 1, 1, 1, 1])\n local_map_dist_dic[seq_names[n]][\n frame_num[n], interaction_num -\n 1] = 1.0 / (abs(frame_num[n] - start_annotated_frame)" + }, + { + "comment": "This code block appears to be part of a larger function. It seems to store and update the features of previous frames for a given sequence, based on the interaction number and frame number. If the current interaction's distance is greater than the previous one, it updates the previous frame features. The code uses dictionaries to store these features, with the frame number and interaction number as keys. 
The detach() function seems to remove the feature tensor from the computation graph for memory efficiency, while unsqueeze(0) reshapes the tensor to have a batch dimension.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":764-780", + "content": " ) # bugs fixed.\n local_map_tmp_dic[seq_names[n]][\n frame_num[n],\n interaction_num - 1] = prev_frame_nn_features_n.squeeze(\n 0).detach() # bugs fixed.\n if interaction_num == 1:\n prev_frame_nn_features_n = local_map_tmp_dic[seq_names[n]][\n frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n else:\n if local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 1] > \\\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 2]:\n prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)" + }, + { + "comment": "This code snippet is part of a neural network model for video object detection. It deals with handling previous frame features, categorizing frames based on the reference object IDs, and concatenating different tensor inputs together. The code checks if the current frame's label matches the reference object ID, unsqueezes and tiles the tensors accordingly, transposes them, and finally concatenates these transformed tensors using `paddle.concat`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":781-802", + "content": " else:\n prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 2]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)\n to_cat_previous_frame = (\n float_(seq_previous_frame_label) == float_(ref_obj_ids)\n ) # float comparision?\n to_cat_current_frame_embedding = current_frame_embedding[\n n].unsqueeze(0).tile((ref_obj_ids.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_previous_frame = float_(\n to_cat_previous_frame.unsqueeze(-1).transpose([2, 3, 0, 1]))\n to_cat_prev_frame_nn_feature_n = prev_frame_nn_features_n.squeeze(\n 0).transpose([2, 3, 0, 1])\n to_cat = paddle.concat(" + }, + { + "comment": "This function, int_seghead, takes various inputs such as reference frame embedding, scribble label, previous round label etc. It normalizes nearest neighbor distances if specified and returns the dictionary temporary (dic_tmp) containing predicted results for each sequence, along with optional global map temporary dictionary (global_map_tmp_dic) and local map dictionaries (local_map_dics). 
The interaction number (interaction_num), frame number (frame_num) and list of sequence names (seq_names) are also used.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":803-828", + "content": " (to_cat_current_frame_embedding, to_cat_nn_feature_n,\n to_cat_prev_frame_nn_feature_n, to_cat_previous_frame), 1)\n pred_ = dynamic_seghead(to_cat)\n pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if global_map_tmp_dic is None:\n return dic_tmp\n else:\n if local_map_dics is None:\n return dic_tmp, global_map_tmp_dic\n else:\n return dic_tmp, global_map_tmp_dic, local_map_dics\n def int_seghead(self,\n ref_frame_embedding=None,\n ref_scribble_label=None,\n prev_round_label=None,\n normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n frame_num=None," + }, + { + "comment": "This code segment is part of the Ma-Net network in PaddleVideo. It interpolates the reference scribble label and previous round label images, assigns ground truth IDs, and performs local distance map calculations on a sequence of frames for a batch of videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":829-852", + "content": " first_inter=True):\n dic_tmp = {}\n bs, c, h, w = ref_frame_embedding.shape\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n if not first_inter:\n scale_prev_round_label = paddle.nn.functional.interpolate(\n float_(prev_round_label), size=(h, w), mode='nearest')\n scale_prev_round_label = int_(scale_prev_round_label)\n n_chunks = 500\n for n in range(bs):\n gt_id = paddle.arange(0, gt_ids[n] + 1)\n gt_id = int_(gt_id)\n seq_ref_frame_embedding = ref_frame_embedding[n]\n ########################Local dist map\n seq_ref_frame_embedding = paddle.transpose(seq_ref_frame_embedding,\n [1, 2, 0])\n seq_ref_scribble_label = paddle.transpose(\n scale_ref_scribble_label[n], [1, 2, 0])" + }, + { + "comment": "This code segment is updating the global and local maps for a given sequence of frames. It first calculates the nearest neighbor features (nn_features_n) using the previous frame embedding, query embedding, previous frame labels, and ground truth IDs. Then it checks if this current sequence name exists in the global map temporary dictionary (global_map_tmp_dic). If not, it initializes a one-tensor for that sequence and tiles it to match the shape of nn_features_n. It then applies a where statement to compare nn_features_n with the global map tensor, selecting either nn_features_n or the global map tensor depending on which is smaller. 
Finally, it updates the global map temporary dictionary entry for this sequence at the current frame number with the selected tensor from the where statement.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":853-876", + "content": " nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_ref_frame_embedding,\n query_embedding=seq_ref_frame_embedding,\n prev_frame_labels=seq_ref_scribble_label,\n gt_ids=gt_id,\n max_distance=cfg.MODEL_MAX_LOCAL_DISTANCE)\n #######\n ######################Global map update\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([104, 1, 1, 1, 1])\n nn_features_n_ = paddle.where(\n nn_features_n <=\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0),\n nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0))\n ###\n ###\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n_.detach()[0]\n ##################Local map update" + }, + { + "comment": "The code checks if a dictionary of local maps is provided. If so, it retrieves the temporary and distance dictionaries from it. It then updates these dictionaries for the current sequence name (seq_names[n]), adding a 0 to a specific element if the sequence name is not already in the distance dictionary. Finally, it creates embedding tensors for frame and feature comparison and prepares them for concatenation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":877-896", + "content": " if local_map_dics is not None:\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if seq_names[n] not in local_map_dist_dic:\n local_map_dist_dic[seq_names[n]] = paddle.zeros([104, 9])\n if seq_names[n] not in local_map_tmp_dic:\n local_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).unsqueeze(0).tile([104, 9, 1, 1, 1, 1])\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num -\n 1] = 0\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)\n ##################\n to_cat_current_frame_embedding = ref_frame_embedding[n].unsqueeze(\n 0).tile((gt_id.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_scribble_mask_to_cat = (\n float_(seq_ref_scribble_label) == float_(gt_id)" + }, + { + "comment": "This code is performing a series of operations on tensors to create the 'to_cat' tensor for use in the model. It checks if it's the first iteration and adjusts the previous round label accordingly. Then, it concatenates three different tensor inputs along the 1st axis (channel dimension) and passes the result through a segmentation head network to get the final prediction 'pred_'. 
This code seems to be part of a larger neural network for video object segmentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":897-920", + "content": " ) # float comparision?\n to_cat_scribble_mask_to_cat = float_(\n to_cat_scribble_mask_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n if not first_inter:\n seq_prev_round_label = scale_prev_round_label[n].transpose(\n [1, 2, 0])\n to_cat_prev_round_to_cat = (\n float_(seq_prev_round_label) == float_(gt_id)\n ) # float comparision?\n to_cat_prev_round_to_cat = float_(\n to_cat_prev_round_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n else:\n to_cat_prev_round_to_cat = paddle.zeros_like(\n to_cat_scribble_mask_to_cat)\n to_cat_prev_round_to_cat[0] = 1.\n to_cat = paddle.concat(\n (to_cat_current_frame_embedding, to_cat_scribble_mask_to_cat,\n to_cat_prev_round_to_cat), 1)\n pred_ = self.inter_seghead(to_cat)" + }, + { + "comment": "This code is transposing the tensor 'pred_' and storing it in 'dic_tmp' with corresponding sequence name as key. It then checks if 'local\\_map\\_dics' is None, and returns 'dic\\_tmp' or returns both 'dic\\_tmp' and 'local\\_map\\_dics'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/IntVOS.py\":921-926", + "content": " pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if local_map_dics is None:\n return dic_tmp\n else:\n return dic_tmp, local_map_dics" + } + ] +} \ No newline at end of file diff --git a/docs/doc/14314a35-bbab-42ff-9a94-5625a4383107.json b/docs/doc/14314a35-bbab-42ff-9a94-5625a4383107.json new file mode 100644 index 000000000..557a1f20e --- /dev/null +++ b/docs/doc/14314a35-bbab-42ff-9a94-5625a4383107.json @@ -0,0 +1,65 @@ +{ + "summary": "Video classification tasks involve recognizing actions through RGB images and skeleton data. Concepts include temporal action localization, dense-captioning events, popular datasets, feature extraction, motion representation, and classification using deep learning methods since 2014.", + "details": [ + { + "comment": "Introduction to video classification (action recognition) with various applications in different fields, including online platforms and offline sectors like security, transportation, and quality inspection. Tasks include classification/recognition and detection, further subdivided by combining different scenes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":0-17", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/tutorials/summarize.md) | English\n# Introduction for video classification(action recognition)\n## Wide range of application scenarios\nVideo classification has a wide range of applications in many fields, such as online video platforms such as short videos, offline such as security, transportation, quality inspection and other fields\u3002\n## Multiple subtasks\nSimilar to image tasks, video tasks can also be divided into two categories: **classification (recognition) and detection**, and these two types of tasks can be specifically subdivided by combining different scenes\uff1a\n+ Task1\uff1aTrimmed Action Recognition. Users input a trimmed video,which contains only single action,then a video tag will be output by model as depicted in fig below:\n

\n[Figure: Action Classification]\n
\n In terms of the data modality used, classification tasks can be further subdivided into classification based on si" + }, + { + "comment": "This code is describing different types of classification tasks in video analysis. It covers multi-modality data, RGB images, human skeleton data, and various perspectives such as first-person, third-person, and multiple perspectives. Additionally, it mentions untrimmed videos, temporal action proposals, and ROI extraction in image detection tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":17-31", + "content": "ngle modality data, classification based on multi-modality data, classification based on RGB images and classification based on human skeleton, etc, as shown in the figure below:\n

\n[Figure: multi-modality]\n
\nIn terms of the perspective of video, it can also be divided into first-person action recognition, \nthird-person action recognition, single perspective action recognition and multi-perspective fusion action recognition. \nUsers who are interested in these fields can refer to relevant literatures.\n+ Task2\uff1aUntrimmed Video Classification. \nUnlike trimmed videos, untrimmed videos often contain multiple actions and have a long time span. \nThere are a lot of movements that we may need not paying attention to. Through the global analysis of the input long video, and then make a soft classify to mutiple categories.\n+ Task3\uff1aTemporal Action Proposal. It is similar to the ROI extraction in the image detection task. " + }, + { + "comment": "Task 4: Temporal Action Localization - find video segments with possible actions, classify them.\nTask 5: Dense-Captioning Events - describe untrimmed videos' actions in temporal dimension.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":32-48", + "content": "The task is to find the video clips that may contain action in a long video with a lot of actions.\n+ Task4\uff1aTemporal Action Localization. Compared with the temporal action proposal task as mentioned above, \ntemporal action localization task is more consistent with detection task in the field of imgae, \nit requires not only to find the video segments with possible actions from the video but also to classify them,\nas shown in the figure below\n

\n[Figure: Action Detection]\n
\n+ Task5\uff1aDense-Captioning Events. The reason why it is called dense captioning events is mainly \nbecause that this task requires video action description on the basis of temporal action localization \n(detection). That is to say, the task needs to locate the actions in a **untrimmed** video,in **temporal \ndimension** and describe the behavior of the **whole video** after obtaining many video segments which contain actions.\n## Introduction of datasets" + }, + { + "comment": "The code provides a brief overview of popular video action recognition datasets, such as KTH and UCF101. It mentions that the datasets are essential for training and validating models, but overfitting may occur with larger 3D networks on smaller datasets like KTH.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":50-71", + "content": "### Classification datasets\nThe training and validation of the model cannot be done without comprehensive, \nlarge and well annotated datasets. With the deepening of research on video action recognition, \nmore and more datasets are applied to the research in this field. \nTypical datasets are as follows:\n+ KTH[1](#1)\nKTH dataset is an early small action recognition dataset, \nincluding 599 videos of 6 types of actions (walking, jumping, running, punching, waving and clapping). \nThe background is relatively still, except for the zoom in and out of the camera, \nthe camera movement is relatively slight. Since this data set is relatively small, \nit is easy to overfit when training heavy 3D networks, \nso most current researches are not based on this it.\n+ UCF10[2](#2)\nUCF101 is a medium-size dataset in which most videos are from YouTube. \nIt contains 13,320 videos with 101 types of actions. \nEach type of action is performed by 25 people, each of whom performs 4-7 sets of actions. \nThe UCF101 and HMDB51 datasets used to be the benchmarks to evaluate the effectiveness of action " + }, + { + "comment": "HMDB51 is a dataset proposed by Brown University in 2011, consisting of movie and online video sources. It contains 6849 samples across 51 classes with at least 101 samples each. Kinetics is the largest action recognition dataset, created by Google's DeepMind team in 2017. It uses YouTube videos, now expanded to 600k videos in 700 categories. The categories are divided into human, human and animal, and human and human interaction. Kinetics can train deep networks like 3D-RESNET up to 152 layers without overfitting, solving the issue of small training datasets.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":72-86", + "content": "recognition model for a long time before the Kinetics dataset was released.\n+ HMDB51[3](#3)\nBrown University's proposed dataset named HMDB51 was released in 2011. \nMost of the videos come from movies, \nbut some come from public databases and online video libraries such as YouTube. \nThe datasets contains 6849 samples divided into 51 classes, \neach of which contains at least 101 samples.\n+ Kinetics[4](#4)\nKinetics is the most important large-scale action recognition dataset, which was proposed by Google's DeepMind team in 2017. The video data also comes from YouTube, with 400 categories (now expanded to 700 categories) and more than 300,000 videos (now expanded to 600,000 videos), each lasting about 10 seconds. 
\nThe action categories are mainly divided into three categories: \"human\", \"human and animal\", \"human and human interaction\". Kinetics can train 3D-RESNET up to 152 layers without over-fitting, \nwhich solves the problem that the previous training dataset is too small to train deep 3D network. " + }, + { + "comment": "Kinetics is the benchmark for action recognition, replacing UCF101 and HMDB51. Most studies use this dataset for evaluation and pre-training. SomethingV1 has 108,499 annotated videos with 174 kinds of actions, requiring strong temporal modeling ability. Other datasets include Charades (complex action recognition), Breakfast Action, Sports 1M, THUMOS 2014 (action detection).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":87-103", + "content": "Kinetics has replaced UCF101 and HMDB51 as the benchmark in the field of action recognition. \nAt present, most studies use this dataset for evaluation and pre-training.\n+ Something-Something[5](#5)\nSomethingV1 contains 108,499 annotated videos (V2 has expanded to 220,847), each of which last two to six seconds. These videos contain 174 kinds of actions. Different from the previous dataset, \nthe identification of this data set requires stronger time information, \nso this dataset has a very important reference value in testing the temporal modeling ability of the model.\nIn addition to the above datasets, there are Charades[6](#6) dataset for complex Action recognition, Breakfast Action[7](#7), and Sports 1M[8](#8).\n### Detection datasets\n+ THUMOS 2014\nThis dataset is from THUMOS Challenge 2014, Its training set is UCF101, validation set and test set include 1010 and 1574 undivided video clips respectively. In the action detection task, only 20 kinds of unsegmented videos of actions were labeled with sequential action fragments, " + }, + { + "comment": "The code describes two datasets, Mexaction2 and ActivityNet. Mexaction2 has horse riding and bullfighting actions, split into training, validation, and test sets. It includes YouTube clips, UCF101 horseback riding videos, and an unsegmented 77-hour INA video with low marked action proportions. ActivityNet is the largest database, including classification and detection tasks, but only provides YouTube links without direct downloads.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":104-120", + "content": "including 200 validation sets (3007 action fragments) and 213 test sets (3358 action fragments).\n+ MEXaction2\nThe Mexaction2 dataset contains two types of action: horse riding and bullfighting. \nThe dataset consists of three parts: YouTube videos, horseback riding videos in UCF101, and INA videos. \nYouTube clips and horseback riding videos in UCF101 are short segmented video clips that are used as training sets. \nThe INA video is a long unsegmented video with a total length of 77 hours, \nand it is divided into three parts: training, validation and test. \nThere are 1336 action segments in the training set, 310 in the validation set and 329 in the test set. \nMoreover, the Mexaction2 dataset is characterized by very long unsegmented video lengths, \nand marked action segments only account for a very low proportion of the total video length.\n+ ActivityNet\nAt present the largest database, also contains two tasks of classification and detection. 
\nThis dataset only provides a YouTube link to the video, not a direct download of the video, " + }, + { + "comment": "The code discusses the process of feature extraction, motion representation and classification in action recognition. It highlights two stages - manual feature-based method and deep learning-based method. It mentions DTP and IDT as typical motion descriptors used before deep-learning was applied. The code also shows a framework diagram for action recognition.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":121-137", + "content": "so you also need to use the YouTube download tool in Python to automatically download the videos. \nThe dataset contains 200 action categories, 20,000 (training + verification + test set) videos, \nand a total of about 700 hours of video.\n## Introduction of classic models\nAs shown in the figure, \nthe action recognition framework mainly includes three steps: \nfeature extraction, motion representation and classification. \nHow to extract spatiotemporal features of video is the core problem of action recognition and video classification.\n

\n[Figure: Framework of action recognition]\n
\nAccording to different methods, action recognition (video classification) methods can be generally summarized into two stages: \nmanual feature-based method and deep learning-based method. \nTypical motion descriptors in the manual feature-based method stage include DTP and IDT, \nwhich are also the most excellent motion descriptors accepted by most researchers before deep-learning is applied in this field. " + }, + { + "comment": "The code discusses the application of deep learning methods in video classification since 2014, highlighting their effectiveness beyond manual motion design. It mentions various classic network structures proposed by researchers for representing motion characteristics and includes images to illustrate these models. The code also introduces PaddleVideo's inclusion of such models like TSN, TSM, slowfast, etc., and anticipates future analysis of these classical models and papers in the field. Additionally, it references an ActivityNet competition for further context.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":138-153", + "content": "Interinterested readers may refer to the relevant references at the end of this paper. \nSince 2014, deep learning methods have been gradually applied to the field of video classification. \nAt present, deep learning-based methods have become a hotspot of research in both academic and the practice, and the effect is far beyond the motion features of manual design. \nSince 2014, many classic network structures have been put forward by the researchers regarding the problem of how to represent motion characteristics, \nas shown in the figure below:\n

\n[Figure: Classic Models]\n
\nAt present,Paddlevideo has contained several classic models such as:TSN[9](#9),TSM[10](#10),slowfast[11](#11),et al.In the future,\nwe will analyze the classic models and papers in these fields. Please look forward to it\n## Introduction of competetion\n+ [ActivityNet](http://activity-net.org/challenges/2020/challenge.html)" + }, + { + "comment": "This code snippet provides information about the ActivityNet competition, which is a large-scale action recognition event held annually since 2016. It focuses on identifying everyday activities from user-generated YouTube videos and has become the most influential in the field of action recognition. The code also includes references to relevant research papers for further reading.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":155-172", + "content": "ActivityNet is a large-scale action recognition competition. Since 2016, \nit has been held simultaneously with CVPR every year. Up to this year, \nit has been held for 4 consecutive sessions. It focuses on identifying everyday, high-level, goal-oriented activities from \nuser-generated videos taken from the Internet video portal YouTube. \nAt present, ActivityNet competition has become the most influential competition in the field of action recognition.\n## Reference\n
\n[1] Schuldt C, Laptev I, Caputo B. Recognizing Human Actions: A Local SVM Approach. Proceedings of International Conference on Pattern Recognition. Piscataway, NJ: IEEE, 2004: 23-26.\n
\n[2] Soomro K, Zamir A R, Shah M. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. arXiv:1212.0402, 2012.\n
\n[3] Kuehne H, Jhuang H, Garrote E, et al. HMDB: a large video database for human motion recognition Proceedings of IEEE International Conference on Computer Vision. Piscataway, NJ: IEEE, 2011:2556-2563." + }, + { + "comment": "This code provides references to various research papers related to action recognition and video classification, such as the \"Quo Vadis, Action Recognition?\" paper by Carreira and Zisserman, and the \"Hollywood in Homes\" paper by Sigurdsson et al. These references are from well-known conferences like IEEE Conference on Computer Vision and Pattern Recognition (CVPR) and arXiv preprints.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":173-192", + "content": "
\n[4] Carreira J, Zisserman A. Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2017: 6299-6308.\n
\n[5] Goyal R, Kahou S E, Michalski V. The \u201csomething something\u201d video database for learning and evaluating visual common sense. arXiv:1706.04261, 2017.\n
\n[6] Sigurdsson G A, Varol G\u00fcl, Wang Xiaolong, et al. Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. arXiv:1604.01753, 2016.\n
\n[7] Kuehne H, Arslan A, Serre T. The Language of Actions: Recovering the Syntax and Semantics of Goal-Directed Human Activities. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014.\n
\n[8] Karpathy A , Toderici G , Shetty S , et al. Large-Scale Video Classification with Convolutional Neural Networks Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014:1725-1732." + }, + { + "comment": "The code represents a list of references for papers related to video recognition. Each reference has an identifier (id) and the corresponding paper details like authors, title, and publication information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/summarize.md\":193-205", + "content": "
\n[9] Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. Temporal segment networks for action recognition in videos. In Proceedings of the European Conference on Computer Vision, pages 20\u201336. Springer, 2016.\n
\n[10] Lin Ji, Gan Chuang, Han Song. TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383, 2018.\n
\n[11] Feichtenhofer C, Fan Haoqi, Malik J, et al. SlowFast Networks for Video Recognition. arXiv:1812.03982, 2018.\n
" + } + ] +} \ No newline at end of file diff --git a/docs/doc/15ebb946-02e0-453d-a13e-f3597abefd86.json b/docs/doc/15ebb946-02e0-453d-a13e-f3597abefd86.json new file mode 100644 index 000000000..fcba42590 --- /dev/null +++ b/docs/doc/15ebb946-02e0-453d-a13e-f3597abefd86.json @@ -0,0 +1,55 @@ +{ + "summary": "The AveragePrecisionCalculator calculates average precision in video object detection tasks, using a priority queue and providing methods for non-interpolated average precision. It also includes sorting, recall & precision computation, data shuffling, and prediction normalization.", + "details": [ + { + "comment": "This code calculates the interpolated average precision for an entire list or top-n ranked items, following the definition provided in the given reference. It can be used as a static function call to calculate average precision for short ranked lists in memory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":0-22", + "content": "# Copyright 2020 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate or keep track of the interpolated average precision.\nIt provides an interface for calculating interpolated average precision for an\nentire list or the top-n ranked items. For the definition of the\n(non-)interpolated average precision:\nhttp://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf\nExample usages:\n1) Use it as a static function call to directly calculate average precision for\na short ranked list in the memory." + }, + { + "comment": "The code defines an AveragePrecisionCalculator class that calculates average precision based on a ranked list. The calculator can handle long lists that cannot fit in memory or partial predictions observed over time (such as from Tensorflow). It uses the accumulate method to process parts of the ranked list and peek_interpolated_ap_at_n to calculate the interpolated average precision at a specific recall level.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":24-54", + "content": "```\nimport random\np = np.array([random.random() for _ in xrange(10)])\na = np.array([random.choice([0, 1]) for _ in xrange(10)])\nap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)\n```\n2) Use it as an object for long ranked list that cannot be stored in memory or\nthe case where partial predictions can be observed at a time (Tensorflow\npredictions). In this case, we first call the function accumulate many times\nto process parts of the ranked list. 
After processing all the parts, we call\npeek_interpolated_ap_at_n.\n```\np1 = np.array([random.random() for _ in xrange(5)])\na1 = np.array([random.choice([0, 1]) for _ in xrange(5)])\np2 = np.array([random.random() for _ in xrange(5)])\na2 = np.array([random.choice([0, 1]) for _ in xrange(5)])\n# interpolated average precision at 10 using 1000 break points\ncalculator = average_precision_calculator.AveragePrecisionCalculator(10)\ncalculator.accumulate(p1, a1)\ncalculator.accumulate(p2, a2)\nap3 = calculator.peek_ap_at_n()\n```\n\"\"\"\nimport heapq\nimport random\nimport numbers" + }, + { + "comment": "AveragePrecisionCalculator is a class used for calculating the average precision and average precision at n. It constructs an object to calculate average precision for single label, with optional top_n parameter for average precision at n. If top_n is not positive integer or None, a ValueError is raised. The class maintains heap of (prediction, actual) pairs and total positives seen. Heap size can be queried using the heap_size property.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":56-85", + "content": "import numpy\nclass AveragePrecisionCalculator(object):\n \"\"\"Calculate the average precision and average precision at n.\"\"\"\n def __init__(self, top_n=None):\n \"\"\"Construct an AveragePrecisionCalculator to calculate average precision.\n This class is used to calculate the average precision for a single label.\n Args:\n top_n: A positive Integer specifying the average precision at n, or\n None to use all provided data points.\n Raises:\n ValueError: An error occurred when the top_n is not a positive integer.\n \"\"\"\n if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None):\n raise ValueError(\"top_n must be a positive integer or None.\")\n self._top_n = top_n # average precision at n\n self._total_positives = 0 # total number of positives have seen\n self._heap = [] # max heap of (prediction, actual)\n @property\n def heap_size(self):\n \"\"\"Gets the heap size maintained in the class.\"\"\"\n return len(self._heap)\n @property" + }, + { + "comment": "This code defines a class that calculates the average precision in video object detection tasks. It provides methods to accumulate positive samples and return the total number of positives. The accumulate method takes prediction scores, ground truth labels, and optional num_positives parameter for accurate tracking when inputs are incomplete.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":86-107", + "content": " def num_accumulated_positives(self):\n \"\"\"Gets the number of positive samples that have been accumulated.\"\"\"\n return self._total_positives\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n After the function call, we may call peek_ap_at_n to actually calculate\n the average precision.\n Note predictions and actuals must have the same shape.\n Args:\n predictions: a list storing the prediction scores.\n actuals: a list storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives = If the 'predictions' and 'actuals' inputs aren't complete,\n then it's possible some true positives were missed in them. 
In that case,\n you can provide 'num_positives' in order to accurately track recall.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match." + }, + { + "comment": "This code checks if the length of predictions and actuals match. It also ensures that num_positives is a nonzero number, then adds positives to total_positives. The code uses heapq to push and pop elements in a priority queue based on top_n, ensuring correctness and efficiency.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":108-133", + "content": " \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if not num_positives is None:\n if not isinstance(num_positives,\n numbers.Number) or num_positives < 0:\n raise ValueError(\n \"'num_positives' was provided but it wan't a nonzero number.\"\n )\n if not num_positives is None:\n self._total_positives += num_positives\n else:\n self._total_positives += numpy.size(numpy.where(actuals > 0))\n topk = self._top_n\n heap = self._heap\n for i in range(numpy.size(predictions)):\n if topk is None or len(heap) < topk:\n heapq.heappush(heap, (predictions[i], actuals[i]))\n else:\n if predictions[i] > heap[0][0]: # heap[0] is the smallest\n heapq.heappop(heap)\n heapq.heappush(heap, (predictions[i], actuals[i]))" + }, + { + "comment": "This code defines a class that calculates non-interpolated average precision. It has methods to clear accumulated predictions, peek the non-interpolated average precision at n, and calculate non-interpolated average precision from prediction and actual scores. The class uses numpy arrays and requires positive labels to be greater than 0 and negative labels as 0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":135-165", + "content": " def clear(self):\n \"\"\"Clear the accumulated predictions.\"\"\"\n self._heap = []\n self._total_positives = 0\n def peek_ap_at_n(self):\n \"\"\"Peek the non-interpolated average precision at n.\n Returns:\n The non-interpolated average precision at n (default 0).\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n \"\"\"\n if self.heap_size <= 0:\n return 0\n predlists = numpy.array(list(zip(*self._heap)))\n ap = self.ap_at_n(predlists[0],\n predlists[1],\n n=self._top_n,\n total_num_positives=self._total_positives)\n return ap\n @staticmethod\n def ap(predictions, actuals):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives." + }, + { + "comment": "This code calculates the non-interpolated average precision at a specified number 'n' from given predictions and actuals. 
It raises a ValueError if the input format is not a numpy 1D array, or the shape of predictions and actuals does not match.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":167-191", + "content": " Returns:\n The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match.\n \"\"\"\n return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None)\n @staticmethod\n def ap_at_n(predictions, actuals, n=20, total_num_positives=None):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n n: the top n items to be considered in ap@n.\n total_num_positives : (optionally) you can specify the number of total\n positive\n in the list. If specified, it will be used in calculation.\n Returns:" + }, + { + "comment": "This function calculates the non-interpolated average precision at a given rank 'n'. It checks if the lengths of predictions and actuals match, ensures 'n' is an integer greater than zero, and shuffles the lists to avoid overestimation. If any errors occur, it raises a ValueError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":192-219", + "content": " The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when\n 1) the format of the input is not the numpy 1-D array;\n 2) the shape of predictions and actuals does not match;\n 3) the input n is not a positive integer.\n \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if n is not None:\n if not isinstance(n, int) or n <= 0:\n raise ValueError(\"n must be 'None' or a positive integer.\"\n \" It was '%s'.\" % n)\n ap = 0.0\n predictions = numpy.array(predictions)\n actuals = numpy.array(actuals)\n # add a shuffler to avoid overestimating the ap\n predictions, actuals = AveragePrecisionCalculator._shuffle(\n predictions, actuals)\n sortidx = sorted(range(len(predictions))," + }, + { + "comment": "This function calculates the average precision (AP) of a set of predictions and actuals. It first sorts the predictions by value, then calculates recall and precision to compute AP. If a total number of positives is provided, it uses that instead of counting non-zero actuals. 
The function also includes helper methods for shuffling the data and normalizing predictions with an epsilon value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":220-255", + "content": " key=lambda k: predictions[k],\n reverse=True)\n if total_num_positives is None:\n numpos = numpy.size(numpy.where(actuals > 0))\n else:\n numpos = total_num_positives\n if numpos == 0:\n return 0\n if n is not None:\n numpos = min(numpos, n)\n delta_recall = 1.0 / numpos\n poscount = 0.0\n # calculate the ap\n r = len(sortidx)\n if n is not None:\n r = min(r, n)\n for i in range(r):\n if actuals[sortidx[i]] > 0:\n poscount += 1\n ap += poscount / (i + 1) * delta_recall\n return ap\n @staticmethod\n def _shuffle(predictions, actuals):\n random.seed(0)\n suffidx = random.sample(range(len(predictions)), len(predictions))\n predictions = predictions[suffidx]\n actuals = actuals[suffidx]\n return predictions, actuals\n @staticmethod\n def _zero_one_normalize(predictions, epsilon=1e-7):" + }, + { + "comment": "This function normalizes the predictions to a range of 0.0-1.0, ensuring that the rank in the original list remains unchanged and does not affect the average precision calculation. It prevents division by zero using a small epsilon value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py\":256-273", + "content": " \"\"\"Normalize the predictions to the range between 0.0 and 1.0.\n For some predictions like SVM predictions, we need to normalize them before\n calculate the interpolated average precision. The normalization will not\n change the rank in the original list and thus won't change the average\n precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n epsilon: a small constant to avoid denominator being zero.\n Returns:\n The normalized prediction.\n \"\"\"\n denominator = numpy.max(predictions) - numpy.min(predictions)\n ret = (predictions - numpy.min(predictions)) / numpy.max(\n denominator, epsilon)\n return ret" + } + ] +} \ No newline at end of file diff --git a/docs/doc/164872e3-44dd-4f30-91e5-cb0d20319c74.json b/docs/doc/164872e3-44dd-4f30-91e5-cb0d20319c74.json new file mode 100644 index 000000000..7c7d98157 --- /dev/null +++ b/docs/doc/164872e3-44dd-4f30-91e5-cb0d20319c74.json @@ -0,0 +1,95 @@ +{ + "summary": "The code presents a CTRGCN backbone for video models, initializes a CTRGC model with batch normalization layers and NTUGraph class, defines a neural network model with TCN_GCN_unit, and includes a final layer 10 (l10) to process input and return output.", + "details": [ + { + "comment": "This code imports necessary libraries, defines a convolution initialization function and a batch normalization initialization function. It also sets up scale values for the batch normalization function and registers backbone models in the registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":0-30", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\ndef conv_init(conv):\n if conv.weight is not None:\n weight_init_(conv.weight, 'kaiming_normal_', mode='fan_in')\n if conv.bias is not None:\n nn.initializer.Constant(value=0.0)(conv.bias)\ndef bn_init(bn, scale):\n nn.initializer.Constant(value=float(scale))(bn.weight)" + }, + { + "comment": "Defines a CTRGC class, a type of convolutional neural network layer. It has two reductions: rel_reduction (defaults to 8) and mid_reduction (defaults to 1). Depending on the input channels, it assigns different channel numbers for rel_channels (always 8 if in_channels is 3 or 9; otherwise based on rel_reduction). It also initializes a Conv2D layer with the assigned channel numbers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":31-65", + "content": " nn.initializer.Constant(value=0.0)(bn.bias)\ndef einsum(x1, x3):\n \"\"\"paddle.einsum only support in dynamic graph mode.\n x1 : n c u v\n x2 : n c t v\n \"\"\"\n n, c, u, v1 = x1.shape\n n, c, t, v3 = x3.shape\n assert (v1 == v3), \"Args of einsum not match!\"\n x1 = paddle.transpose(x1, perm=[0, 1, 3, 2]) # n c v u\n y = paddle.matmul(x3, x1)\n # out: n c t u\n return y\nclass CTRGC(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n rel_reduction=8,\n mid_reduction=1):\n super(CTRGC, self).__init__()\n self.in_channels = in_channels\n self.out_channels = out_channels\n if in_channels == 3 or in_channels == 9:\n self.rel_channels = 8\n self.mid_channels = 16\n else:\n self.rel_channels = in_channels // rel_reduction\n self.mid_channels = in_channels // mid_reduction\n self.conv1 = nn.Conv2D(self.in_channels,\n self.rel_channels," + }, + { + "comment": "This code defines a Convolutional Temporal Relational Graph Convolutional Network (CTRGCN) backbone for a video model. It initializes weights and performs forward pass calculations. 
It uses convolution layers, tanh activation function, and optionally includes an additional input A.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":66-92", + "content": " kernel_size=1)\n self.conv2 = nn.Conv2D(self.in_channels,\n self.rel_channels,\n kernel_size=1)\n self.conv3 = nn.Conv2D(self.in_channels,\n self.out_channels,\n kernel_size=1)\n self.conv4 = nn.Conv2D(self.rel_channels,\n self.out_channels,\n kernel_size=1)\n self.tanh = nn.Tanh()\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n conv_init(m)\n elif isinstance(m, nn.BatchNorm2D):\n bn_init(m, 1)\n def forward(self, x, A=None, alpha=1):\n x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean(-2), self.conv3(\n x)\n x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2))\n x1 = self.conv4(x1) * alpha + (\n A.unsqueeze(0).unsqueeze(0) if A is not None else 0) # N,C,V,V" + }, + { + "comment": "Code snippet defines a class TemporalConv, which is a 2D convolutional layer for temporal data. It inherits from the paddle.nn.Layer and includes an instance of nn.Conv2D and nn.BatchNorm2D layers. The MultiScale_TemporalConv class is also defined but its implementation is missing, suggesting it extends TemporalConv with multiple temporal convolution blocks for multi-scale processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":93-126", + "content": " # We only support 'paddle.einsum()' in dynamic graph mode, if use in infer model please implement self.\n # x1 = paddle.einsum('ncuv,nctv->nctu', x1, x3)\n x1 = einsum(x1, x3)\n return x1\nclass TemporalConv(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n dilation=1):\n super(TemporalConv, self).__init__()\n pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2\n self.conv = nn.Conv2D(in_channels,\n out_channels,\n kernel_size=(kernel_size, 1),\n padding=(pad, 0),\n stride=(stride, 1),\n dilation=(dilation, 1))\n self.bn = nn.BatchNorm2D(out_channels)\n def forward(self, x):\n x = self.conv(x)\n x = self.bn(x)\n return x\nclass MultiScale_TemporalConv(nn.Layer):\n def __init__(self," + }, + { + "comment": "This code defines a MultiScale_TemporalConv layer with multiple branches of temporal convolution. The number of branches is determined by the dilations, and out channels should be multiples of the number of branches for correct operation. 
Each branch has its own kernel size, and there are Conv2D layers followed by BatchNorm2D for each branch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":127-154", + "content": " in_channels,\n out_channels,\n kernel_size=3,\n stride=1,\n dilations=[1, 2, 3, 4],\n residual=True,\n residual_kernel_size=1):\n super(MultiScale_TemporalConv, self).__init__()\n assert out_channels % (\n len(dilations) +\n 2) == 0, '# out channels should be multiples of # branches'\n # Multiple branches of temporal convolution\n self.num_branches = len(dilations) + 2\n branch_channels = out_channels // self.num_branches\n if type(kernel_size) == list:\n assert len(kernel_size) == len(dilations)\n else:\n kernel_size = [kernel_size] * len(dilations)\n # Temporal Convolution branches\n self.branches = nn.LayerList([\n nn.Sequential(\n nn.Conv2D(in_channels,\n branch_channels,\n kernel_size=1,\n padding=0),\n nn.BatchNorm2D(branch_channels)," + }, + { + "comment": "This code defines a Conv-Temporal RGN backbone model for video analysis. It consists of multiple branches with various convolutional and pooling layers, including TemporalConv and MaxPool2D operations. The branches are appended to the model and initialized with respective settings such as kernel size, dilation rate, etc.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":155-181", + "content": " nn.ReLU(),\n TemporalConv(branch_channels,\n branch_channels,\n kernel_size=ks,\n stride=stride,\n dilation=dilation),\n ) for ks, dilation in zip(kernel_size, dilations)\n ])\n # Additional Max & 1x1 branch\n self.branches.append(\n nn.Sequential(\n nn.Conv2D(in_channels,\n branch_channels,\n kernel_size=1,\n padding=0), nn.BatchNorm2D(branch_channels),\n nn.ReLU(),\n nn.MaxPool2D(kernel_size=(3, 1),\n stride=(stride, 1),\n padding=(1, 0)), nn.BatchNorm2D(branch_channels)))\n self.branches.append(\n nn.Sequential(\n nn.Conv2D(in_channels,\n branch_channels,\n kernel_size=1,\n padding=0," + }, + { + "comment": "This code defines a class for a Conv-Temporal Residual Group Convolutional Network (CTRGCN) backbone. The class contains a constructor that sets up the architecture, an initialization function to set the weights, and a forward pass function for feeding data into the model. It performs residual connections using temporal convolutions and has batch normalization layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":182-210", + "content": " stride=(stride, 1)), nn.BatchNorm2D(branch_channels)))\n # Residual connection\n if not residual:\n self.residual = lambda x: 0\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = lambda x: x\n else:\n self.residual = TemporalConv(in_channels,\n out_channels,\n kernel_size=residual_kernel_size,\n stride=stride)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n # initialize\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n conv_init(m)\n elif isinstance(m, nn.BatchNorm2D):\n weight_init_(m.weight, 'Normal', std=0.02, mean=1.0)\n nn.initializer.Constant(value=0.0)(m.bias)\n def forward(self, x):\n # Input dim: (N,C,T,V)\n res = self.residual(x)\n branch_outs = []\n for tempconv in self.branches:" + }, + { + "comment": "This code defines two classes: \"unit_tcn\" and \"unit_gcn\". 
The \"unit_tcn\" class is a Temporal Convolutional Network unit that performs temporal convolution with batch normalization and ReLU activation. The \"unit_gcn\" class is a Graph Convolutional Network unit that takes input channels, output channels, adjacency matrix A, coefficient embedding, adaptive flag, and residual flag as parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":211-249", + "content": " out = tempconv(x)\n branch_outs.append(out)\n out = paddle.concat(branch_outs, axis=1)\n out += res\n return out\nclass unit_tcn(nn.Layer):\n def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):\n super(unit_tcn, self).__init__()\n pad = int((kernel_size - 1) / 2)\n self.conv = nn.Conv2D(in_channels,\n out_channels,\n kernel_size=(kernel_size, 1),\n padding=(pad, 0),\n stride=(stride, 1))\n self.bn = nn.BatchNorm2D(out_channels)\n self.relu = nn.ReLU()\n conv_init(self.conv)\n bn_init(self.bn, 1)\n def forward(self, x):\n x = self.bn(self.conv(x))\n return x\nclass unit_gcn(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n A,\n coff_embedding=4,\n adaptive=True,\n residual=True):\n super(unit_gcn, self).__init__()" + }, + { + "comment": "This code initializes a CTRGC model with specified input and output channels. It also includes optional residual connection, batch normalization, and adaptive parameterization. The number of subsets is determined by the shape of A. If adaptive is set to True, it creates a trainable parameter for the subset of weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":250-275", + "content": " inter_channels = out_channels // coff_embedding\n self.inter_c = inter_channels\n self.out_c = out_channels\n self.in_c = in_channels\n self.adaptive = adaptive\n self.num_subset = A.shape[0]\n self.convs = nn.LayerList()\n for i in range(self.num_subset):\n self.convs.append(CTRGC(in_channels, out_channels))\n if residual:\n if in_channels != out_channels:\n self.down = nn.Sequential(\n nn.Conv2D(in_channels, out_channels, 1),\n nn.BatchNorm2D(out_channels))\n else:\n self.down = lambda x: x\n else:\n self.down = lambda x: 0\n if self.adaptive:\n pa_param = paddle.ParamAttr(\n initializer=paddle.nn.initializer.Assign(A.astype(np.float32)))\n self.PA = paddle.create_parameter(shape=A.shape,\n dtype='float32',\n attr=pa_param)" + }, + { + "comment": "This code initializes the parameters A and alpha, sets up batch normalization (bn) layers with Softmax and ReLU activation functions, initializes weights using conv_init and bn_init functions, and defines a forward pass that adapts A based on the adaptive flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":276-305", + "content": " else:\n A_tensor = paddle.to_tensor(A, dtype=\"float32\")\n self.A = paddle.create_parameter(\n shape=A_tensor.shape,\n dtype='float32',\n default_initializer=paddle.nn.initializer.Assign(A_tensor))\n self.A.stop_gradient = True\n alpha_tensor = paddle.to_tensor(np.zeros(1), dtype=\"float32\")\n self.alpha = paddle.create_parameter(\n shape=alpha_tensor.shape,\n dtype='float32',\n default_initializer=paddle.nn.initializer.Assign(alpha_tensor))\n self.bn = nn.BatchNorm2D(out_channels)\n self.soft = nn.Softmax(-2)\n self.relu = nn.ReLU()\n def init_weights(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n conv_init(m)\n elif isinstance(m, nn.BatchNorm2D):\n bn_init(m, 
1)\n bn_init(self.bn, 1e-6)\n def forward(self, x):\n y = None\n if self.adaptive:\n A = self.PA\n else:\n A = self.A.cuda(x.get_device())" + }, + { + "comment": "This code defines a TCN_GCN_unit class, which is a combination of Graph Convolutional Network (GCN) and Temporal Convolution units. The unit takes input channels, output channels, adjacency matrix A, stride, residual connection, adaptive flag, kernel size, and dilations as parameters. It initializes the GCN and TemporalConv layers, followed by a ReLU activation function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":306-334", + "content": " for i in range(self.num_subset):\n z = self.convs[i](x, A[i], self.alpha)\n y = z + y if y is not None else z\n y = self.bn(y)\n y += self.down(x)\n y = self.relu(y)\n return y\nclass TCN_GCN_unit(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n A,\n stride=1,\n residual=True,\n adaptive=True,\n kernel_size=5,\n dilations=[1, 2]):\n super(TCN_GCN_unit, self).__init__()\n self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive)\n self.tcn1 = MultiScale_TemporalConv(out_channels,\n out_channels,\n kernel_size=kernel_size,\n stride=stride,\n dilations=dilations,\n residual=False)\n self.relu = nn.ReLU()" + }, + { + "comment": "The code defines a `CTRGCN` class with a `forward` method and an `NTUDGraph` class. The `forward` method takes input `x`, applies `relu` activation and adds the residual output of a `unit_tcn` layer or simply passes through if specified conditions are met. The `NTUDGraph` initializes with a fixed number of nodes, self-links, and inward connections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":335-362", + "content": " if not residual:\n self.residual = lambda x: 0\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = lambda x: x\n else:\n self.residual = unit_tcn(in_channels,\n out_channels,\n kernel_size=1,\n stride=stride)\n def forward(self, x):\n y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x))\n return y\nclass NTUDGraph:\n def __init__(self, labeling_mode='spatial'):\n num_node = 25\n self_link = [(i, i) for i in range(num_node)]\n inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),\n (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),\n (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),\n (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),\n (23, 8), (24, 25), (25, 12)]\n inward = [(i - 1, j - 1) for (i, j) in inward_ori_index]" + }, + { + "comment": "Function `get_adjacency_matrix` generates adjacency matrices for the model. The function takes a parameter `labeling_mode`, which is optional. It initializes a set of variables: `inward`, `outward`, and `neighbor`. These variables store the connections between nodes in both directions. Then, it calls other helper functions to generate normalized adjacency matrices for self-links, inward edges, outward edges, and finally returns an array containing all these matrices. 
This is useful for inputting into a model that requires specific formatted input data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":363-396", + "content": " outward = [(j, i) for (i, j) in inward]\n neighbor = inward + outward\n self.num_node = num_node\n self.self_link = self_link\n self.inward = inward\n self.outward = outward\n self.neighbor = neighbor\n self.A = self.get_adjacency_matrix(labeling_mode)\n def edge2mat(self, link, num_node):\n A = np.zeros((num_node, num_node))\n for i, j in link:\n A[j, i] = 1\n return A\n def normalize_digraph(self, A):\n Dl = np.sum(A, 0)\n h, w = A.shape\n Dn = np.zeros((w, w))\n for i in range(w):\n if Dl[i] > 0:\n Dn[i, i] = Dl[i]**(-1)\n AD = np.dot(A, Dn)\n return AD\n def get_spatial_graph(self, num_node, self_link, inward, outward):\n I = self.edge2mat(self_link, num_node)\n In = self.normalize_digraph(self.edge2mat(inward, num_node))\n Out = self.normalize_digraph(self.edge2mat(outward, num_node))\n A = np.stack((I, In, Out))\n return A\n def get_adjacency_matrix(self, labeling_mode=None):" + }, + { + "comment": "This code is part of the CTRGCN class in the PaddleVideo library, which represents a specific type of model for skeleton-based action recognition. The function within this code block is used to return an adjacency matrix (A) based on a given labeling mode. If no labeling mode is specified, it returns the adjacency matrix from the instance variables. If the labeling mode is set to 'spatial', it calls another function to generate a spatial adjacency graph. Otherwise, if an invalid labeling mode is provided, it raises a ValueError exception.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":397-425", + "content": " if labeling_mode is None:\n return self.A\n if labeling_mode == 'spatial':\n A = self.get_spatial_graph(self.num_node, self.self_link,\n self.inward, self.outward)\n else:\n raise ValueError()\n return A\n@BACKBONES.register()\nclass CTRGCN(nn.Layer):\n \"\"\"\n CTR-GCN model from:\n `\"Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition\" `_\n Args:\n num_point: int, numbers of sketeton point.\n num_person: int, numbers of person.\n base_channel: int, model's hidden dim.\n graph: str, sketeton adjacency matrix name.\n graph_args: dict, sketeton adjacency graph class args.\n in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 3.\n adaptive: bool, if adjacency matrix can adaptive.\n \"\"\"\n def __init__(self,\n num_point=25,\n num_person=2,\n base_channel=64," + }, + { + "comment": "This code defines the CTRGCN class, which initializes its graph and layers based on input parameters. It includes a batch normalization layer (data_bn) and three TCN_GCN_unit layers (l1, l2, l3). The graph is determined by the 'graph' parameter, with NTUDGraph used if 'ntu_rgb_d'. 
If another graph is provided, it raises a ValueError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":426-454", + "content": " graph='ntu_rgb_d',\n graph_args=dict(),\n in_channels=3,\n adaptive=True):\n super(CTRGCN, self).__init__()\n if graph == 'ntu_rgb_d':\n self.graph = NTUDGraph(**graph_args)\n else:\n raise ValueError()\n A = self.graph.A # 3,25,25\n self.num_point = num_point\n self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point)\n self.base_channel = base_channel\n self.l1 = TCN_GCN_unit(in_channels,\n self.base_channel,\n A,\n residual=False,\n adaptive=adaptive)\n self.l2 = TCN_GCN_unit(self.base_channel,\n self.base_channel,\n A,\n adaptive=adaptive)\n self.l3 = TCN_GCN_unit(self.base_channel,\n self.base_channel,\n A," + }, + { + "comment": "The code initializes six TCN_GCN_unit layers, each with different configurations, for a CTRGCN model. The first layer (l4) has the base channel as input and output. Following layers (l5 to l8) increase the number of channels or apply strides. This represents a deep TCN-GCN architecture with progressively increasing depth and downsampling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":455-476", + "content": " adaptive=adaptive)\n self.l4 = TCN_GCN_unit(self.base_channel,\n self.base_channel,\n A,\n adaptive=adaptive)\n self.l5 = TCN_GCN_unit(self.base_channel,\n self.base_channel * 2,\n A,\n stride=2,\n adaptive=adaptive)\n self.l6 = TCN_GCN_unit(self.base_channel * 2,\n self.base_channel * 2,\n A,\n adaptive=adaptive)\n self.l7 = TCN_GCN_unit(self.base_channel * 2,\n self.base_channel * 2,\n A,\n adaptive=adaptive)\n self.l8 = TCN_GCN_unit(self.base_channel * 2,\n self.base_channel * 4,\n A,\n stride=2," + }, + { + "comment": "This code defines a neural network model with multiple layers. It uses Paddle's TCN_GCN_unit in the last two layers. The init_weights function initializes batch normalization for the data_bn layer, and the forward function processes input through multiple layers before returning the final output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":477-510", + "content": " adaptive=adaptive)\n self.l9 = TCN_GCN_unit(self.base_channel * 4,\n self.base_channel * 4,\n A,\n adaptive=adaptive)\n self.l10 = TCN_GCN_unit(self.base_channel * 4,\n self.base_channel * 4,\n A,\n adaptive=adaptive)\n def init_weights(self):\n bn_init(self.data_bn, 1)\n def forward(self, x):\n N, C, T, V, M = x.shape\n x = paddle.transpose(x, perm=[0, 4, 3, 1, 2])\n x = paddle.reshape(x, (N, M * V * C, T))\n x = self.data_bn(x)\n x = paddle.reshape(x, (N, M, V, C, T))\n x = paddle.transpose(x, perm=(0, 1, 3, 4, 2))\n x = paddle.reshape(x, (N * M, C, T, V))\n x = self.l1(x)\n x = self.l2(x)\n x = self.l3(x)\n x = self.l4(x)\n x = self.l5(x)\n x = self.l6(x)\n x = self.l7(x)\n x = self.l8(x)\n x = self.l9(x)" + }, + { + "comment": "This code represents the final step of a neural network function. It applies layer 10 (l10) to input x, and returns both the updated x and the original N, M values. 
This function seems to be part of a larger model, as it references previous layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ctrgcn.py\":511-513", + "content": " x = self.l10(x)\n return x, N, M" + } + ] +} \ No newline at end of file diff --git a/docs/doc/18afc32c-df62-4b9b-a373-19c278db53f3.json b/docs/doc/18afc32c-df62-4b9b-a373-19c278db53f3.json new file mode 100644 index 000000000..b4322997e --- /dev/null +++ b/docs/doc/18afc32c-df62-4b9b-a373-19c278db53f3.json @@ -0,0 +1,35 @@ +{ + "summary": "This script sets up a model environment, downloads weights if needed, initializes the Infer model, and runs inference on input videos while saving results and features.", + "details": [ + { + "comment": "This code is a Python script with licensing information and import statements. It imports necessary libraries like numpy, paddle, and others for data processing, model training, and evaluation. The code also sets up the logging format, and checks for CUDA availability and PaddlePaddle version.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/tsn_extractor.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport numpy as np\nimport paddle\nimport paddle.static as static\ntry:\n import cPickle as pickle\nexcept:\n import pickle\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'" + }, + { + "comment": "This code defines a function `parse_args()` to parse command-line arguments for training a model. The arguments include model name, config file path, whether to use GPU, weight path, and batch size. It uses argparse module for easy argument handling. 
By default, it sets the model name to 'AttentionCluster', config file path to 'configs/attention_cluster.txt', uses GPU if not specified otherwise, automatically downloads weights from Paddle if no specific path is provided, and sets batch size to 1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/tsn_extractor.py\":37-65", + "content": "logging.basicConfig(level=logging.DEBUG, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument(\n '--weights',\n type=str,\n default=None,\n help=\n 'weight path, None to automatically download weights provided by Paddle.'\n )\n parser.add_argument('--batch_size',\n type=int,\n default=1,\n help='sample number in a batch for inference.')" + }, + { + "comment": "The code defines command line arguments for the TsnExtractor. It sets default values and provides help messages for each argument. The function then parses these arguments to create an 'args' object, which can be used throughout the program. Additionally, the 'infer' function is defined but not implemented.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/tsn_extractor.py\":66-92", + "content": " parser.add_argument('--filelist',\n type=str,\n default='./data/TsnExtractor.list',\n help='path to inferenece data file lists file.')\n parser.add_argument('--log_interval',\n type=int,\n default=1,\n help='mini-batch interval to log.')\n parser.add_argument('--infer_topk',\n type=int,\n default=20,\n help='topk predictions to restore.')\n parser.add_argument('--save_dir',\n type=str,\n default=os.path.join('data', 'tsn_features'),\n help='directory to store tsn feature results')\n parser.add_argument('--video_path',\n type=str,\n default=None,\n help='directory to store results')\n args = parser.parse_args()\n return args\ndef infer(args):\n # parse config\n config = parse_config(args.config)" + }, + { + "comment": "The code initializes the Infer model with provided configurations and merges them to create the infer_config. It then builds the input, model, and gets feeds and outputs for inference. The place and executor are set based on whether or not GPU is used. 
The filelist and video path are checked for existence before initializing the infer reader with the model name, mode (infer), and configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/tsn_extractor.py\":93-117", + "content": " infer_config = merge_configs(config, 'infer', vars(args))\n print_configs(infer_config, \"Infer\")\n infer_model = models.get_model(args.model_name,\n infer_config,\n mode='infer',\n is_videotag=True)\n infer_model.build_input(use_dataloader=False)\n infer_model.build_model()\n infer_feeds = infer_model.feeds()\n infer_outputs = infer_model.outputs()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(static.default_startup_program())\n filelist = args.filelist or infer_config.INFER.filelist\n filepath = args.video_path or infer_config.INFER.get('filepath', '')\n if filepath != '':\n assert os.path.exists(filepath), \"{} not exist.\".format(filepath)\n else:\n assert os.path.exists(filelist), \"{} not exist.\".format(filelist)\n # get infer reader\n infer_reader = get_reader(args.model_name.upper(), 'infer', infer_config)" + }, + { + "comment": "This code snippet checks if the weights (model parameters) are provided as an argument. If not, it downloads them from Paddle's servers. Then, it loads the weights into the model and creates a DataFeeder for feeding data during inference. It also initializes metrics to measure inference performance. The code then iterates over each input video, running inference with the loaded model, and saving the results for each frame.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/tsn_extractor.py\":119-143", + "content": " if args.weights:\n assert os.path.exists(\n args.weights), \"Given weight dir {} not exist.\".format(args.weights)\n # if no weight files specified, download weights from paddle\n weights = args.weights or infer_model.get_weights()\n infer_model.load_test_weights(exe, weights, static.default_main_program())\n infer_feeder = paddle.fluid.DataFeeder(place=place, feed_list=infer_feeds)\n fetch_list = infer_model.fetches()\n infer_metrics = get_metrics(args.model_name.upper(), 'infer', infer_config)\n infer_metrics.reset()\n if not os.path.isdir(args.save_dir):\n os.makedirs(args.save_dir)\n for infer_iter, data in enumerate(infer_reader()):\n data_feed_in = [items[:-1] for items in data]\n video_id = [items[-1] for items in data]\n bs = len(video_id)\n feature_outs = exe.run(fetch_list=fetch_list,\n feed=infer_feeder.feed(data_feed_in))\n for i in range(bs):\n filename = video_id[i].split('/')[-1][:-4]" + }, + { + "comment": "Saves extracted features from the PaddleVideo/applications/VideoTag/tsn_extractor.py module using numpy's save function, then logs the end of feature extraction and calls infer function with argument args.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/tsn_extractor.py\":144-157", + "content": " np.save(os.path.join(args.save_dir, filename + '.npy'),\n feature_outs[0][i]) #shape: seg_num*feature_dim\n logger.info(\"Feature extraction End~\")\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n infer(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/195955fc-23f4-4ec7-8751-022d035c39d1.json b/docs/doc/195955fc-23f4-4ec7-8751-022d035c39d1.json new file mode 100644 index 
000000000..1c2562b8a --- /dev/null +++ b/docs/doc/195955fc-23f4-4ec7-8751-022d035c39d1.json @@ -0,0 +1,90 @@ +{ + "summary": "The code introduces a \"Scale\" class for image scaling and a MultiScaleCrop pipeline in PaddleVideo. It supports random or multi-crop based on test mode, maintaining aspect ratio while resizing/cropping images. A slower pathway is created by selecting specific frames from the fast_pathway array, rearranging dimensions, and then combined with the original for a list of frames before adding to 'results' dictionary.", + "details": [ + { + "comment": "This code registers a new class \"Scale\" for image scaling in PaddleVideo's VideoQualityAssessment module. The Scale class takes a short_size parameter and scales the images accordingly. It is registered as part of the PIPELINES in the application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":0-34", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport random\nimport numpy as np\nimport math\nfrom PIL import Image\nfrom ..registry import PIPELINES\nfrom collections.abc import Sequence\n@PIPELINES.register()\nclass Scale(object):\n \"\"\"\n Scale images.\n Args:\n short_size(float | int): Short size of an image will be scaled to the short_size.\n \"\"\"\n def __init__(self, short_size):\n self.short_size = short_size\n def __call__(self, results):" + }, + { + "comment": "The code defines a function that resizes PIL.Image objects in a list according to their aspect ratios and the short_size provided. If an image's width is less than or equal to its height, it is appended to resized_imgs without any modification. Otherwise, if the width is greater than the height, the image is scaled to fit within a square with the given short_size, maintaining aspect ratio using bilinear interpolation. 
If the height is greater than the width, the image is also scaled to fit within a square with the given short_size, again maintaining aspect ratio using bilinear interpolation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":35-60", + "content": " \"\"\"\n Performs resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n imgs = results['imgs']\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == self.short_size) or (h <= w\n and h == self.short_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = self.short_size\n oh = int(self.short_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = self.short_size\n ow = int(self.short_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))" + }, + { + "comment": "The code registers a custom pipeline for random cropping of images in PaddleVideo. It takes a target size as an argument and initializes the class with that target size. The __call__ method is used to perform random crop operations on a list of images. It first retrieves the original image sizes, ensures they are larger than the target size, then randomly selects x1 and y1 coordinates for the crop region, and appends the cropped image to a new list which is returned at the end.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":61-94", + "content": " results['imgs'] = resized_imgs\n return results\n@PIPELINES.register()\nclass RandomCrop(object):\n \"\"\"\n Random crop images.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n w, h = imgs[0].size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size {}\".format(\n w, h, self.target_size)\n crop_images = []\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)" + }, + { + "comment": "This code performs center cropping of images to a specified target size. It iterates through the list of images, checks if they are already at the target size, and appends them to the crop_images list. If the image is not at the target size, it crops the image to the center square of the original image and adds it to the crop_images list. The final results dictionary contains the list of cropped images. 
The class CenterCrop initializes with a target_size parameter and defines a __call__ method for applying the center crop operation on input images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":96-129", + "content": " for img in imgs:\n if w == tw and h == th:\n crop_images.append(img)\n else:\n crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = crop_images\n return results\n@PIPELINES.register()\nclass CenterCrop(object):\n \"\"\"\n Center crop images.\n Args:\n target_size(int): Center crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs Center crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n ccrop_imgs: List where each item is a PIL.Image after Center crop.\n \"\"\"\n imgs = results['imgs']\n ccrop_imgs = []\n for img in imgs:\n w, h = img.size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\" + }, + { + "comment": "MultiScaleCrop applies image resizing and cropping to an input image. The target size, scales, max_distort, fix_crop, and more_fix_crop parameters are used for image manipulation. Images are cropped into smaller ones with varying sizes based on the defined scales.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":130-159", + "content": " \"image width({}) and height({}) should be larger than crop size {}\".format(\n w, h, self.target_size)\n x1 = int(round((w - tw) / 2.))\n y1 = int(round((h - th) / 2.))\n ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = ccrop_imgs\n return results\n@PIPELINES.register()\nclass MultiScaleCrop(object):\n def __init__(\n self,\n target_size, #NOTE: named target size now, but still pass short size in it!\n scales=None,\n max_distort=1,\n fix_crop=True,\n more_fix_crop=True):\n self.target_size = target_size\n self.scales = scales if scales else [1, .875, .75, .66]\n self.max_distort = max_distort\n self.fix_crop = fix_crop\n self.more_fix_crop = more_fix_crop\n def __call__(self, results):\n \"\"\"\n Performs MultiScaleCrop operations.\n Args:\n imgs: List where wach item is a PIL.Image.\n XXX:" + }, + { + "comment": "This code defines a function to sample random crop sizes for image augmentation. It first calculates the possible crop sizes based on input size and scales, then filters pairs that have a difference within max_distort. 
Finally, it randomly chooses one of the filtered pairs for cropping and optionally adds a random offset if fix_crop is False.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":160-191", + "content": " results:\n \"\"\"\n imgs = results['imgs']\n input_size = [self.target_size, self.target_size]\n im_size = imgs[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in self.scales]\n crop_h = [\n input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x\n for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= self.max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not self.fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])" + }, + { + "comment": "This code generates a list of crop positions for an image. If the image height is greater than the second value in the crop pair, it randomly selects a horizontal offset. Otherwise, it calculates step sizes for width and height, and creates a list of crop positions using these steps. Additional crop positions are added if self.more_fix_crop is True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":192-214", + "content": " h_offset = random.randint(0, image_h - crop_pair[1])\n else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if h_step != 0 and w_step != 0:\n ret.append((4 * w_step, 4 * h_step)) # lower right\n if h_step != 0 or w_step != 0:\n ret.append((2 * w_step, 2 * h_step)) # center\n if self.more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter" + }, + { + "comment": "The code randomly samples crop sizes from a set of predefined ratios, crops the input images accordingly, resizes them to the desired input size, and adds the flipped or cropped images to the results dictionary. 
It also includes an optional RandomFlip pipeline that randomly flips the image with a given probability.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":215-246", + "content": " ret.append((3 * w_step, 1 * h_step)) # upper right quarter\n ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n return crop_pair[0], crop_pair[1], w_offset, h_offset\n crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size)\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))\n for img in imgs\n ]\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n results['imgs'] = ret_img_group\n return results\n@PIPELINES.register()\nclass RandomFlip(object):\n \"\"\"\n Random Flip images.\n Args:\n p(float): Random flip images with the probability p.\n \"\"\"\n def __init__(self, p=0.5):\n self.p = p\n def __call__(self, results):" + }, + { + "comment": "This code defines two classes: \"RandomFlip\" and \"Image2Array\". RandomFlip performs random flips on a list of PIL images, while Image2Array converts a PIL image to a numpy array with optional transpose. Both are registered as pipelines using @PIPELINES.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":247-280", + "content": " \"\"\"\n Performs random flip operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n flip_imgs: List where each item is a PIL.Image after random flip.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n results['imgs'] = [\n img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs\n ]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass Image2Array(object):\n \"\"\"\n transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'.\n Args:\n transpose: whether to transpose or not, default True, False for slowfast.\n \"\"\"\n def __init__(self, transpose=True):\n self.transpose = transpose\n def __call__(self, results):\n \"\"\"\n Performs Image to NumpyArray operations.\n Args:\n imgs: List where each item is a PIL.Image." + }, + { + "comment": "This function converts a list of PIL images to a numpy array, optionally transposes it if needed, and stores the result in the 'imgs' key of the results dictionary. Additionally, the Normalization class initializes with mean and std values for normalization and reshapes them to fit the tensor shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":281-309", + "content": " For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n np_imgs: Numpy array.\n \"\"\"\n imgs = results['imgs']\n np_imgs = (np.stack(imgs)).astype('float32')\n if self.transpose:\n np_imgs = np_imgs.transpose(0, 3, 1, 2) #nchw\n results['imgs'] = np_imgs\n return results\n@PIPELINES.register()\nclass Normalization(object):\n \"\"\"\n Normalization.\n Args:\n mean(Sequence[float]): mean values of different channels.\n std(Sequence[float]): std values of different channels.\n tensor_shape(list): size of mean, default [3,1,1]. 
For slowfast, [1,1,1,3]\n \"\"\"\n def __init__(self, mean, std, tensor_shape=[3, 1, 1]):\n if not isinstance(mean, Sequence):\n raise TypeError(\n 'Mean must be list, tuple or np.ndarray, but got {type(mean)}')\n if not isinstance(std, Sequence):\n raise TypeError(\n 'Std must be list, tuple or np.ndarray, but got {type(std)}')\n self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)" + }, + { + "comment": "The code is a part of the PaddleVideo library's VideoQualityAssessment module. It performs normalization operations on image arrays and registers a JitterScale class for image scaling with random short size selection between min_size and max_size, also including cycling factors for default minimum size functionality.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":310-343", + "content": " self.std = np.array(std).reshape(tensor_shape).astype(np.float32)\n def __call__(self, results):\n \"\"\"\n Performs normalization operations.\n Args:\n imgs: Numpy array.\n return:\n np_imgs: Numpy array after normalization.\n \"\"\"\n imgs = results['imgs']\n norm_imgs = imgs / 255.\n norm_imgs -= self.mean\n norm_imgs /= self.std\n results['imgs'] = norm_imgs\n return results\n@PIPELINES.register()\nclass JitterScale(object):\n \"\"\"\n Scale image, while the target short size is randomly select between min_size and max_size.\n Args:\n min_size: Lower bound for random sampler.\n max_size: Higher bound for random sampler.\n \"\"\"\n def __init__(self,\n min_size,\n max_size,\n short_cycle_factors=[0.5, 0.7071],\n default_min_size=256):\n self.default_min_size = default_min_size\n self.orig_min_size = self.min_size = min_size\n self.max_size = max_size" + }, + { + "comment": "This code performs jitter resize operations and applies random scaling. It takes a sequence of PIL.Image, scales each item based on min_size, max_size, and short_cycle_factors. If the number of images is less than 1, it throws an error. The size is determined by randomly selecting values between min_size and max_size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":344-369", + "content": " self.short_cycle_factors = short_cycle_factors\n def __call__(self, results):\n \"\"\"\n Performs jitter resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.min_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_min_size))\n else:\n self.min_size = self.orig_min_size\n imgs = results['imgs']\n size = int(round(np.random.uniform(self.min_size, self.max_size)))\n assert (len(imgs) >= 1) , \\\n \"len(imgs):{} should be larger than 1\".format(len(imgs))\n width, height = imgs[0].size\n if (width <= height and width == size) or (height <= width\n and height == size):" + }, + { + "comment": "This code resizes the input images to a specified size while maintaining aspect ratio, and then applies random crop for multi-clip testing in the MultiCrop class. 
The target_size parameter determines the output image's dimensions after resizing and cropping.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":370-402", + "content": " return results\n new_width = size\n new_height = size\n if width < height:\n new_height = int(math.floor((float(height) / width) * size))\n else:\n new_width = int(math.floor((float(width) / height) * size))\n frames_resize = []\n for j in range(len(imgs)):\n img = imgs[j]\n scale_img = img.resize((new_width, new_height), Image.BILINEAR)\n frames_resize.append(scale_img)\n results['imgs'] = frames_resize\n return results\n@PIPELINES.register()\nclass MultiCrop(object):\n \"\"\"\n Random crop image.\n This operation can perform multi-crop during multi-clip test, as in slowfast model.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self,\n target_size,\n default_crop_size=224,\n short_cycle_factors=[0.5, 0.7071],\n test_mode=False):\n self.orig_target_size = self.target_size = target_size" + }, + { + "comment": "The function performs random crop operations on images. It takes a list of PIL Images as input and returns the cropped images. The code checks if the current short cycle index is 0 or 1, in which case it adjusts the target size based on the short_cycle_factors variable. If the image size matches the target size, it skips the crop operation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":403-429", + "content": " self.short_cycle_factors = short_cycle_factors\n self.default_crop_size = default_crop_size\n self.test_mode = test_mode\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n spatial_sample_index = results['spatial_sample_index']\n spatial_num_clips = results['spatial_num_clips']\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.target_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_crop_size))\n else:\n self.target_size = self.orig_target_size # use saved value before call\n w, h = imgs[0].size\n if w == self.target_size and h == self.target_size:" + }, + { + "comment": "This function performs image cropping with or without random cropping. If not in test mode, it randomly selects x and y offsets within the image boundaries to crop an area of size self.target_size. 
In test mode, it performs multi-crop by dividing the image into equal parts based on spatial_num_clips, ensuring each part has a minimum size of self.target_size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":430-451", + "content": " return results\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size({},{})\".format(w, h, self.target_size, self.target_size)\n frames_crop = []\n if not self.test_mode:\n x_offset = random.randint(0, w - self.target_size)\n y_offset = random.randint(0, h - self.target_size)\n else: #multi-crop\n x_gap = int(\n math.ceil((w - self.target_size) / (spatial_num_clips - 1)))\n y_gap = int(\n math.ceil((h - self.target_size) / (spatial_num_clips - 1)))\n if h > w:\n x_offset = int(math.ceil((w - self.target_size) / 2))\n if spatial_sample_index == 0:\n y_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:\n y_offset = h - self.target_size\n else:\n y_offset = y_gap * spatial_sample_index\n else:" + }, + { + "comment": "The code takes a list of images and crops them based on specified offset values to create new images with the desired target size. It then appends these cropped images to a list, stores them in 'frames_crop', and returns a dictionary containing 'imgs'. The function PackOutput is used for getting the slow pathway from the fast pathway based on the alpha factor in the SlowFast model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":452-483", + "content": " y_offset = int(math.ceil((h - self.target_size) / 2))\n if spatial_sample_index == 0:\n x_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:\n x_offset = w - self.target_size\n else:\n x_offset = x_gap * spatial_sample_index\n for img in imgs:\n nimg = img.crop((x_offset, y_offset, x_offset + self.target_size,\n y_offset + self.target_size))\n frames_crop.append(nimg)\n results['imgs'] = frames_crop\n return results\n@PIPELINES.register()\nclass PackOutput(object):\n \"\"\"\n In slowfast model, we want to get slow pathway from fast pathway based on\n alpha factor.\n Args:\n alpha(int): temporal length of fast/slow\n \"\"\"\n def __init__(self, alpha):\n self.alpha = alpha\n def __call__(self, results):\n fast_pathway = results['imgs']\n # sample num points between start and end\n slow_idx_start = 0" + }, + { + "comment": "This code is creating a slower pathway by selecting specific frames from the fast_pathway array and rearranging the dimensions. 
The slower pathway is then combined with the original fast_pathway to create a list of frames, which is added to the 'results' dictionary before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py\":484-497", + "content": " slow_idx_end = fast_pathway.shape[0] - 1\n slow_idx_num = fast_pathway.shape[0] // self.alpha\n slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end,\n slow_idx_num).astype(\"int64\")\n slow_pathway = fast_pathway[slow_idxs_select]\n # T H W C -> C T H W.\n slow_pathway = slow_pathway.transpose(3, 0, 1, 2)\n fast_pathway = fast_pathway.transpose(3, 0, 1, 2)\n # slow + fast\n frames_list = [slow_pathway, fast_pathway]\n results['imgs'] = frames_list\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1a1eeeb7-7406-42a4-9b1d-489aa086755c.json b/docs/doc/1a1eeeb7-7406-42a4-9b1d-489aa086755c.json new file mode 100644 index 000000000..e25ae44e6 --- /dev/null +++ b/docs/doc/1a1eeeb7-7406-42a4-9b1d-489aa086755c.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines an LSTM Attention Model class with parameters and a forward method for computation, applying LSTM layers in both directions and performing dynamic LSTM on input tensor. It uses dropout, FC layer, sequence_softmax, scaling, and sum pooling to obtain the final output.", + "details": [ + { + "comment": "This code is for a class called LSTMAttentionModel, which represents an LSTM Attention Model. It has three parameters: bias_attr, embedding_size (default 512), lstm_size (default 1024), and drop_rate (default 0.5). The class has an __init__ method to initialize these parameters and a forward method for performing the model's computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/lstm_attention.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.static as static\nclass LSTMAttentionModel(object):\n \"\"\"LSTM Attention Model\"\"\"\n def __init__(self,\n bias_attr,\n embedding_size=512,\n lstm_size=1024,\n drop_rate=0.5):\n self.lstm_size = lstm_size\n self.embedding_size = embedding_size\n self.drop_rate = drop_rate\n def forward(self, input, is_training):" + }, + { + "comment": "This code initializes an LSTM layer for video tagging. It applies two fully connected layers (fc) to the input, one for forward and one for backward direction. The forward LSTM layer is created using dynamic_lstm function with size 4 times the lstm_size attribute and no reverse operation. 
The backward LSTM layer is also created similarly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/lstm_attention.py\":31-57", + "content": " input_fc = static.nn.fc(\n x=input,\n size=self.embedding_size,\n activation='tanh',\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)),\n name='rgb_fc')\n lstm_forward_fc = static.nn.fc(\n x=input_fc,\n size=self.lstm_size * 4,\n activation=None,\n bias_attr=False, # video_tag\n name='rgb_fc_forward')\n lstm_forward, _ = paddle.fluid.layers.dynamic_lstm(input=lstm_forward_fc,\n size=self.lstm_size * 4,\n is_reverse=False,\n name='rgb_lstm_forward')\n lsmt_backward_fc = static.nn.fc(\n x=input_fc,\n size=self.lstm_size * 4,\n activation=None,\n bias_attr=False, #video_tag\n name='rgb_fc_backward')" + }, + { + "comment": "This code performs dynamic LSTM on input tensor with forward and backward directions, concatenates the results, applies dropout, then feeds the result into an FC layer for weight assignment using sequence_softmax. The final output is obtained by scaling the previous result with the weights and applying a sum pooling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/lstm_attention.py\":59-82", + "content": " lstm_backward, _ = paddle.fluid.layers.dynamic_lstm(input=lsmt_backward_fc,\n size=self.lstm_size * 4,\n is_reverse=True,\n name='rgb_lstm_backward')\n lstm_concat = paddle.concat(x=[lstm_forward, lstm_backward],\n axis=1)\n lstm_dropout = paddle.nn.functional.dropout2d(x=lstm_concat,\n p=self.drop_rate,\n training=is_training)\n lstm_weight = static.nn.fc(\n x=lstm_dropout,\n size=1,\n activation='sequence_softmax',\n bias_attr=False, #video_tag\n name='rgb_weight')\n scaled = paddle.multiply(x=lstm_dropout,\n y=lstm_weight)\n lstm_pool = paddle.static.nn.sequence_pool(input=scaled, pool_type='sum')\n return lstm_pool" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1a454a9c-ff43-4008-9b16-77e5a4df958d.json b/docs/doc/1a454a9c-ff43-4008-9b16-77e5a4df958d.json new file mode 100644 index 000000000..e7a8b3a65 --- /dev/null +++ b/docs/doc/1a454a9c-ff43-4008-9b16-77e5a4df958d.json @@ -0,0 +1,40 @@ +{ + "summary": "The code defines a PaddleVideo classification head base class and function for loss/accuracy calculation, supporting binary, multi-class, and specific MRI scenarios with label smoothing. It also calculates top5 accuracy, hard/soft labels, and performs all-reduce operation in distributed training.", + "details": [ + { + "comment": "Base class for head part, all subclass should overwrite init_weights method for initializing weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..builder import build_loss\nfrom paddlevideo.utils import get_logger, get_dist_info\nlogger = get_logger(\"paddlevideo\")\nclass BaseHead(nn.Layer):\n \"\"\"Base class for head part.\n All head should subclass it.\n All subclass should overwrite:\n - Methods: ```init_weights```, initializing weights." + }, + { + "comment": "This code is defining a base class for a classification head in PaddleVideo. It has an `__init__` method that sets the number of classes, input channels, and loss configuration. It also builds the loss function using the provided configuration. The `forward` method must be implemented by any subclasses, as it defines how the head will run during model inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py\":34-64", + "content": " - Methods: ```forward```, forward function.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channels in input feature.\n loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss').\n ls_eps (float): label smoothing epsilon. Default: 0. .\n \"\"\"\n def __init__(\n self,\n num_classes=None,\n in_channels=None,\n loss_cfg=dict(\n name=\"CrossEntropyLoss\"\n ), #TODO(shipping): only pass a name or standard build cfg format.\n #multi_class=False, NOTE(shipping): not supported now.\n ls_eps=0.):\n super().__init__()\n self.num_classes = num_classes\n self.in_channels = in_channels\n self.loss_func = build_loss(loss_cfg)\n #self.multi_class = multi_class NOTE(shipping): not supported now\n self.ls_eps = ls_eps\n @abstractmethod\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n raise NotImplemented" + }, + { + "comment": "This function calculates the loss based on model output (scores) and target output (labels). It returns a dictionary containing 'loss', 'top1_acc', and 'top5_acc'. If labels are single, they are expanded. Label smoothing is applied if ls_eps is non-zero and not in valid mode. The loss function is used if label smoothing is not applicable. Top-1 and top-5 accuracy are also calculated if top-5 is set to True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py\":66-90", + "content": " def loss(self, scores, labels, valid_mode=False, if_top5=True, **kwargs):\n \"\"\"Calculate the loss accroding to the model output ```scores```,\n and the target ```labels```.\n Args:\n scores (paddle.Tensor): The output of the model.\n labels (paddle.Tensor): The target output of the model.\n Returns:\n losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional).\n \"\"\"\n if len(labels) == 1: #commonly case\n labels = labels[0]\n losses = dict()\n if self.ls_eps != 0. 
and not valid_mode: # label_smooth\n loss = self.label_smooth_loss(scores, labels, **kwargs)\n else:\n loss = self.loss_func(scores, labels, **kwargs)\n if if_top5:\n top1, top5 = self.get_acc(scores, labels, valid_mode)\n losses['top1'] = top1\n losses['top5'] = top5\n losses['loss'] = loss\n else:" + }, + { + "comment": "This code handles different cases for classification tasks. For binary and multi-class tasks, it calculates top1 accuracy and loss, while for the specific case of MRI with three labels (mix_up), it applies label smoothing or regular loss function and averages results for each sample to get the final loss and top1 accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py\":91-112", + "content": " top1 = self.get_acc(scores, labels, valid_mode, if_top5)\n losses['top1'] = top1\n losses['loss'] = loss\n return losses\n # MRI\u76ee\u524d\u4e8c\u5206\u7c7b\u65e0top5\n elif len(labels) == 3: # mix_up\n labels_a, labels_b, lam = labels\n lam = lam[0] # get lam value\n losses = dict()\n if self.ls_eps != 0:\n loss_a = self.label_smooth_loss(scores, labels_a, **kwargs)\n loss_b = self.label_smooth_loss(scores, labels_b, **kwargs)\n else:\n loss_a = self.loss_func(scores, labels_a, **kwargs)\n loss_b = self.loss_func(scores, labels_b, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n if if_top5:\n top1a, top5a = self.get_acc(scores, labels_a, valid_mode)\n top1b, top5b = self.get_acc(scores, labels_b, valid_mode)\n top1 = lam * top1a + (1 - lam) * top1b\n top5 = lam * top5a + (1 - lam) * top5b" + }, + { + "comment": "Function defines a loss function for classification tasks, returning a dictionary of losses including top1 and overall loss. If valid_mode is True, calculates accuracy on validation set, otherwise on training set. Top1 accuracies for two sets are combined with specified lambda value to calculate final top1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py\":113-142", + "content": " losses['top1'] = top1\n losses['top5'] = top5\n losses['loss'] = loss\n else:\n top1a = self.get_acc(scores, labels_a, valid_mode, if_top5)\n top1b = self.get_acc(scores, labels_b, valid_mode, if_top5)\n top1 = lam * top1a + (1 - lam) * top1b\n losses['top1'] = top1\n losses['loss'] = loss\n return losses\n else:\n raise NotImplemented\n def label_smooth_loss(self, scores, labels, **kwargs):\n \"\"\"\n Args:\n scores (paddle.Tensor): [N, num_classes]\n labels (paddle.Tensor): [N, ]\n Returns:\n paddle.Tensor: [1,]\n \"\"\"\n if paddle.is_compiled_with_custom_device('npu'):\n \"\"\"\n Designed for the lack of temporary operators of NPU,\n main idea is to split smooth loss into uniform distribution loss\n and hard label calculation\n \"\"\"\n hard_loss = (1.0 - self.ls_eps) * F.cross_entropy(scores, labels)" + }, + { + "comment": "Code is for computing loss and accuracy in a classification model. If the hard label is given, it calculates uniform_loss based on scores and adds it to hard_loss for total loss. Otherwise, it computes soft labels using one-hot encoding and label smoothing, then calculates loss using the provided loss function with soft_label set to True. 
The get_acc function computes top1 and top5 accuracy, averages them across all cards if valid mode is on, and returns the accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py\":143-163", + "content": " uniform_loss = (self.ls_eps / self.num_classes) * (\n -F.log_softmax(scores, -1).sum(-1).mean(0))\n loss = hard_loss + uniform_loss\n else:\n labels = F.one_hot(labels, self.num_classes)\n labels = F.label_smooth(labels, epsilon=self.ls_eps)\n labels = paddle.squeeze(labels, axis=1)\n loss = self.loss_func(scores, labels, soft_label=True, **kwargs)\n return loss\n def get_acc(self, scores, labels, valid_mode, if_top5=True):\n if if_top5:\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=scores, label=labels, k=5)\n _, world_size = get_dist_info()\n #NOTE(shipping): deal with multi cards validate\n if world_size > 1 and valid_mode: #reduce sum when valid\n paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM)\n top1 = top1 / world_size\n paddle.distributed.all_reduce(" + }, + { + "comment": "This code calculates the top1 and optionally top5 accuracy for a classification task. If distributed training is enabled, it performs all-reduce operation on the calculated metrics to ensure consistency across multiple cards/devices. The reduction operation used is sum, and the results are divided by the total number of devices (world_size) to obtain an average value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/base.py\":164-177", + "content": " top5, op=paddle.distributed.ReduceOp.SUM)\n top5 = top5 / world_size\n return top1, top5\n else:\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n _, world_size = get_dist_info()\n #NOTE(shipping): deal with multi cards validate\n if world_size > 1 and valid_mode: #reduce sum when valid\n paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM)\n top1 = top1 / world_size\n return top1" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1a584934-3d9e-422a-a9a6-81d39d0bccc0.json b/docs/doc/1a584934-3d9e-422a-a9a6-81d39d0bccc0.json new file mode 100644 index 000000000..6dc458268 --- /dev/null +++ b/docs/doc/1a584934-3d9e-422a-a9a6-81d39d0bccc0.json @@ -0,0 +1,245 @@ +{ + "summary": "The code presents a PaddlePaddle BertEmbeddings class for BERT model embeddings in video action recognition, utilizing self-attention and ACTBERT's backbone for multimodal inputs including text, video, and action data.", + "details": [ + { + "comment": "This code is a Python file containing a class named \"BertEmbeddings\" within the PaddlePaddle framework. The class inherits from nn.Layer and appears to contain embeddings for the BERT model. This code also includes comments with copyright information, license details, and an import section with necessary libraries. It introduces a dictionary, ACT2FN, that maps activation functions for use in the BertEmbeddings class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":0-31", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport numpy as np\nimport math\nimport copy\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout)\nfrom paddle.nn.initializer import Constant, Normal\nfrom ...utils.save_load import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nACT2FN = {\"gelu\": F.gelu, \"relu\": F.relu, \"swish\": F.swish}\nclass BertEmbeddings(nn.Layer):" + }, + { + "comment": "BertEmbeddings initializes word, position, and token_type embeddings for a given vocabulary size, maximum position embedding size, type vocab size, hidden size, and hidden dropout probability. Forward function uses input ids and token type ids to generate position ids and then combines the different embeddings with layer normalization and dropout.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":32-51", + "content": " \"\"\"Construct the embeddings from word, position and token_type embeddings.\n \"\"\"\n def __init__(self, vocab_size, max_position_embeddings, type_vocab_size,\n hidden_size, hidden_dropout_prob):\n super(BertEmbeddings, self).__init__()\n self.word_embeddings = nn.Embedding(vocab_size,\n hidden_size,\n padding_idx=0)\n self.position_embeddings = nn.Embedding(max_position_embeddings,\n hidden_size)\n self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(hidden_dropout_prob)\n def forward(self, input_ids, token_type_ids=None):\n seq_length = input_ids.shape[1]\n position_ids = paddle.arange(end=seq_length, dtype=\"int64\")\n position_ids = position_ids.unsqueeze(0).expand_as(input_ids)\n if token_type_ids is None:" + }, + { + "comment": "This code defines the `ActBert` class, which is a backbone model for video action recognition. It initializes word embeddings, position embeddings, and token type embeddings. The class also includes a forward function that combines these embeddings, applies layer normalization and dropout, and returns the result. 
Additionally, there's the `BertImageEmbeddings` class which takes image features and their locations as input and uses linear layers to generate embeddings for both, followed by layer normalization and dropout.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":52-74", + "content": " token_type_ids = paddle.zeros_like(input_ids)\n words_embeddings = self.word_embeddings(input_ids) #8,36 -> 8,36,768\n position_embeddings = self.position_embeddings(\n position_ids) #8,36 -> 8,36,768\n token_type_embeddings = self.token_type_embeddings(\n token_type_ids) #8,36 -> 8,36,768\n embeddings = words_embeddings + position_embeddings + token_type_embeddings\n embeddings = self.LayerNorm(embeddings)\n embeddings = self.dropout(embeddings)\n return embeddings\nclass BertImageEmbeddings(nn.Layer):\n def __init__(self, v_feature_size, v_hidden_size, v_hidden_dropout_prob):\n super(BertImageEmbeddings, self).__init__()\n self.image_embeddings = nn.Linear(v_feature_size, v_hidden_size)\n self.image_location_embeddings = nn.Linear(5, v_hidden_size)\n self.LayerNorm = nn.LayerNorm(v_hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(v_hidden_dropout_prob)\n def forward(self, input_ids, input_loc):" + }, + { + "comment": "This code defines two classes: `BertActionEmbeddings` and `BertSelfAttention`. The former takes action features as input, linearly projects them into hidden states, normalizes these using LayerNorm, applies dropout, and returns the embeddings. The latter implements multi-head self-attention: it projects its input hidden states into queries, keys, and values, computes softmax-normalized attention scores with dropout, and returns the attended context layer together with the attention probabilities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":75-99", + "content": " img_embeddings = self.image_embeddings(\n input_ids) #8,37,2048 -> 8,37,1024\n loc_embeddings = self.image_location_embeddings(\n input_loc) #8,37,5 -> 8,37,1024\n embeddings = self.LayerNorm(img_embeddings + loc_embeddings)\n embeddings = self.dropout(embeddings)\n return embeddings # shape: bs*seq_len*hs\nclass BertActionEmbeddings(nn.Layer):\n def __init__(self, a_feature_size, a_hidden_size, a_hidden_dropout_prob):\n super(BertActionEmbeddings, self).__init__()\n self.action_embeddings = nn.Linear(a_feature_size, a_hidden_size)\n self.LayerNorm = nn.LayerNorm(a_hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(a_hidden_dropout_prob)\n def forward(self, input_ids):\n action_embeddings = self.action_embeddings(\n input_ids) #8,5,2048 -> 8,5,768\n embeddings = self.LayerNorm(action_embeddings)\n embeddings = self.dropout(embeddings)\n return embeddings\nclass BertSelfAttention(nn.Layer):" + }, + { + "comment": "This code defines a BertSelfAttention class with parameters: hidden_size, num_attention_heads, and attention_probs_dropout_prob. It checks if the hidden size is divisible by the number of attention heads. If not, it raises a ValueError. Then, it calculates attention_head_size and all_head_size.
Finally, it initializes query, key, value linear layers and dropout layer for attention probabilities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":100-122", + "content": " def __init__(self, hidden_size, num_attention_heads,\n attention_probs_dropout_prob):\n super(BertSelfAttention, self).__init__()\n if hidden_size % num_attention_heads != 0:\n raise ValueError(\n \"The hidden size (%d) is not a multiple of the number of attention \"\n \"heads (%d)\" % (hidden_size, num_attention_heads))\n self.num_attention_heads = num_attention_heads\n self.attention_head_size = int(hidden_size / num_attention_heads)\n self.all_head_size = self.num_attention_heads * self.attention_head_size\n self.query = nn.Linear(hidden_size, self.all_head_size)\n self.key = nn.Linear(hidden_size, self.all_head_size)\n self.value = nn.Linear(hidden_size, self.all_head_size)\n self.dropout = nn.Dropout(attention_probs_dropout_prob)\n def transpose_for_scores(self, x):\n new_x_shape = x.shape[:-1] + [\n self.num_attention_heads,\n self.attention_head_size,\n ]\n x = x.reshape(new_x_shape)" + }, + { + "comment": "This code performs multi-head attention in an attention mechanism. It transposes the query, key, and value layers before calculating raw attention scores via dot product. The results are then normalized into probabilities using softmax. The attention mask is applied to the attention scores for masked self-attention.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":123-143", + "content": " return x.transpose((0, 2, 1, 3))\n def forward(self, hidden_states, attention_mask):\n mixed_query_layer = self.query(hidden_states)\n mixed_key_layer = self.key(hidden_states)\n mixed_value_layer = self.value(hidden_states)\n query_layer = self.transpose_for_scores(mixed_query_layer)\n key_layer = self.transpose_for_scores(mixed_key_layer)\n value_layer = self.transpose_for_scores(mixed_value_layer)\n # Take the dot product between \"query\" and \"key\" to get the raw attention scores.\n attention_scores = paddle.matmul(query_layer,\n key_layer.transpose((0, 1, 3, 2)))\n attention_scores = attention_scores / math.sqrt(\n self.attention_head_size)\n # Apply the attention mask is (precomputed for all layers in BertModel forward() function)\n attention_scores = attention_scores + attention_mask\n # Normalize the attention scores to probabilities.\n attention_probs = nn.Softmax(axis=-1)(attention_scores)" + }, + { + "comment": "This code defines a BertSelfOutput layer that takes input hidden states, applies linear transformation and dropout for regularization, then passes the output through layer normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":145-168", + "content": " # This is actually dropping out entire tokens to attend to, which might\n # seem a bit unusual, but is taken from the original Transformer paper.\n attention_probs = self.dropout(attention_probs)\n context_layer = paddle.matmul(attention_probs, value_layer)\n context_layer = context_layer.transpose((0, 2, 1, 3))\n new_context_layer_shape = context_layer.shape[:-2] + [\n self.all_head_size\n ]\n context_layer = context_layer.reshape(new_context_layer_shape)\n return context_layer, attention_probs\nclass BertSelfOutput(nn.Layer):\n def __init__(self, hidden_size, hidden_dropout_prob):\n super(BertSelfOutput, self).__init__()\n self.dense = nn.Linear(hidden_size, 
hidden_size)\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(hidden_dropout_prob)\n def forward(self, hidden_states, input_tensor):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.dropout(hidden_states)" + }, + { + "comment": "This code defines three classes: `ActBert`, `BertAttention`, and `BertIntermediate`. \n\n`ActBert` appears to be a model that includes `BertAttention` and `BertIntermediate` as its layers. The `BertAttention` class defines forward function for attention mechanism, which takes in an input tensor and attention mask, and returns output and attention probabilities. The `BertIntermediate` class appears to be a dense layer with linear activation function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":169-191", + "content": " hidden_states = self.LayerNorm(hidden_states + input_tensor)\n return hidden_states\nclass BertAttention(nn.Layer):\n def __init__(self, hidden_size, hidden_dropout_prob, num_attention_heads,\n attention_probs_dropout_prob):\n super(BertAttention, self).__init__()\n self.self = BertSelfAttention(hidden_size, num_attention_heads,\n attention_probs_dropout_prob)\n self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)\n def forward(self, input_tensor, attention_mask):\n self_output, attention_probs = self.self(input_tensor, attention_mask)\n attention_output = self.output(self_output, input_tensor)\n return attention_output, attention_probs\nclass BertIntermediate(nn.Layer):\n def __init__(self, hidden_size, intermediate_size, hidden_act):\n super(BertIntermediate, self).__init__()\n self.dense = nn.Linear(hidden_size, intermediate_size)\n if isinstance(hidden_act, str) or (sys.version_info[0] == 2" + }, + { + "comment": "Code defines a class for an attention-based transformer model. It includes an intermediate activation function and forward pass layers for processing input, applying dropout regularization, and normalizing outputs. The BertEntAttention layer is the core module for the transformer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":192-218", + "content": " and isinstance(hidden_act, str)):\n self.intermediate_act_fn = ACT2FN[hidden_act]\n else:\n self.intermediate_act_fn = hidden_act\n def forward(self, hidden_states):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.intermediate_act_fn(hidden_states)\n return hidden_states\nclass BertOutput(nn.Layer):\n def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob):\n super(BertOutput, self).__init__()\n self.dense = nn.Linear(intermediate_size, hidden_size)\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout = nn.Dropout(hidden_dropout_prob)\n def forward(self, hidden_states, input_tensor):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.dropout(hidden_states)\n hidden_states = self.LayerNorm(hidden_states + input_tensor)\n return hidden_states\nclass BertEntAttention(nn.Layer):\n \"\"\"Core mudule of tangled transformer." + }, + { + "comment": "This code defines a BertEntAttention class with parameters for hidden size, vision input hidden size, attention probabilities dropout probabilities, and bi-directional hidden size. It also checks if the hidden size is a multiple of the number of attention heads. 
The class initializes attributes such as the number of attention heads, attention head size, all head size, and linear layers for self-attention in vision input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":219-245", + "content": " \"\"\"\n def __init__(\n self,\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n bi_num_attention_heads,\n ):\n super(BertEntAttention, self).__init__()\n if bi_hidden_size % bi_num_attention_heads != 0:\n raise ValueError(\n \"The hidden size (%d) is not a multiple of the number of attention \"\n \"heads (%d)\" % (bi_hidden_size, bi_num_attention_heads))\n self.num_attention_heads = bi_num_attention_heads\n self.attention_head_size = int(bi_hidden_size / bi_num_attention_heads)\n self.all_head_size = self.num_attention_heads * self.attention_head_size\n # self attention layers for vision input\n self.query1 = nn.Linear(v_hidden_size, self.all_head_size)\n self.key1 = nn.Linear(v_hidden_size, self.all_head_size)" + }, + { + "comment": "This code defines layers for self-attention in the ACTBERT model, including linear and dropout layers for text, action, action_text, and action_vision inputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":246-266", + "content": " self.value1 = nn.Linear(v_hidden_size, self.all_head_size)\n self.dropout1 = nn.Dropout(v_attention_probs_dropout_prob)\n # self attention layers for text input\n self.query2 = nn.Linear(hidden_size, self.all_head_size)\n self.key2 = nn.Linear(hidden_size, self.all_head_size)\n self.value2 = nn.Linear(hidden_size, self.all_head_size)\n self.dropout2 = nn.Dropout(attention_probs_dropout_prob)\n # self attention layers for action input\n self.query3 = nn.Linear(a_hidden_size, self.all_head_size)\n self.key3 = nn.Linear(a_hidden_size, self.all_head_size)\n self.value3 = nn.Linear(a_hidden_size, self.all_head_size)\n self.dropout3 = nn.Dropout(a_attention_probs_dropout_prob)\n # self attention layers for action_text\n self.key_at = nn.Linear(bi_hidden_size, self.all_head_size)\n self.value_at = nn.Linear(bi_hidden_size, self.all_head_size)\n self.dropout_at = nn.Dropout(av_attention_probs_dropout_prob)\n # self attention layers for action_vision" + }, + { + "comment": "This code defines a model for attention mechanism in a transformer architecture, used for both vision and text inputs. 
The key steps involve creating linear layers for keys and values, applying dropout, transposing the input tensors for scoring, and forwarding the input through these operations for both vision and text inputs separately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":267-298", + "content": " self.key_av = nn.Linear(bi_hidden_size, self.all_head_size)\n self.value_av = nn.Linear(bi_hidden_size, self.all_head_size)\n self.dropout_av = nn.Dropout(at_attention_probs_dropout_prob)\n def transpose_for_scores(self, x):\n new_x_shape = x.shape[:-1] + [\n self.num_attention_heads,\n self.attention_head_size,\n ]\n x = x.reshape(new_x_shape)\n return x.transpose((0, 2, 1, 3))\n def forward(\n self,\n input_tensor1,\n attention_mask1,\n input_tensor2,\n attention_mask2,\n input_tensor3,\n attention_mask3,\n ):\n # for vision input.\n mixed_query_layer1 = self.query1(input_tensor1)\n mixed_key_layer1 = self.key1(input_tensor1)\n mixed_value_layer1 = self.value1(input_tensor1)\n query_layer1 = self.transpose_for_scores(mixed_query_layer1)\n key_layer1 = self.transpose_for_scores(mixed_key_layer1)\n value_layer1 = self.transpose_for_scores(mixed_value_layer1)\n # for text input:" + }, + { + "comment": "This code is performing multi-head attention operation. It first separates the input tensor into two parts, with each part going through its own set of linear layers to create query, key, and value tensors. Then it transposes the query and key tensors before computing attention scores by taking the dot product of the transposed query and key tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":299-320", + "content": " mixed_query_layer2 = self.query2(input_tensor2)\n mixed_key_layer2 = self.key2(input_tensor2)\n mixed_value_layer2 = self.value2(input_tensor2)\n query_layer2 = self.transpose_for_scores(mixed_query_layer2)\n key_layer2 = self.transpose_for_scores(mixed_key_layer2)\n value_layer2 = self.transpose_for_scores(mixed_value_layer2)\n # for action input:\n mixed_query_layer3 = self.query3(input_tensor3)\n mixed_key_layer3 = self.key3(input_tensor3)\n mixed_value_layer3 = self.value3(input_tensor3)\n query_layer3 = self.transpose_for_scores(mixed_query_layer3)\n key_layer3 = self.transpose_for_scores(mixed_key_layer3)\n value_layer3 = self.transpose_for_scores(mixed_value_layer3)\n def do_attention(query_layer, key_layer, value_layer, attention_mask,\n dropout):\n \"\"\" compute attention \"\"\"\n attention_scores = paddle.matmul(query_layer,\n key_layer.transpose((0, 1, 3, 2)))" + }, + { + "comment": "This code calculates attention scores between queries, keys, and values, normalizes them to probabilities using softmax, applies dropout, performs matrix multiplication with the values, transposes the result, reshapes it, and returns the context layer. 
It follows the Transformer paper's approach of dropping out entire tokens to attend to.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":321-341", + "content": " attention_scores = attention_scores / math.sqrt(\n self.attention_head_size)\n attention_scores = attention_scores + attention_mask\n # Normalize the attention scores to probabilities.\n attention_probs = nn.Softmax(axis=-1)(attention_scores)\n # This is actually dropping out entire tokens to attend to, which might\n # seem a bit unusual, but is taken from the original Transformer paper.\n attention_probs = dropout(attention_probs)\n context_layer = paddle.matmul(attention_probs, value_layer)\n context_layer = context_layer.transpose((0, 2, 1, 3))\n new_context_layer_shape = context_layer.shape[:-2] + [\n self.all_head_size\n ]\n context_layer = context_layer.reshape(new_context_layer_shape)\n return context_layer\n context_av = do_attention(query_layer3, key_layer1, value_layer1,\n attention_mask1, self.dropout_av)" + }, + { + "comment": "This code is performing attention mechanism for multi-scale context fusion. It uses interpolation to resize the context features, then adds them to the original key layers (key_layer2 and key_layer1). The purpose of this is to incorporate contextual information from different scales into the model's understanding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":342-360", + "content": " context_at = do_attention(query_layer3, key_layer2, value_layer2,\n attention_mask2, self.dropout_at)\n context_key_av = self.key_av(context_av).transpose((0, 2, 1))\n # interpolate only support 4-D tensor now.\n context_key_av = F.interpolate(context_key_av.unsqueeze(-1),\n size=(key_layer2.shape[2],\n 1)).squeeze(-1)\n context_key_av = self.transpose_for_scores(\n context_key_av.transpose((0, 2, 1)))\n key_layer2 = key_layer2 + context_key_av\n context_key_at = self.key_at(context_at).transpose((0, 2, 1))\n context_key_at = F.interpolate(context_key_at.unsqueeze(-1),\n size=(key_layer1.shape[2],\n 1)).squeeze(-1)\n context_key_at = self.transpose_for_scores(\n context_key_at.transpose((0, 2, 1)))\n key_layer1 = key_layer1 + context_key_at" + }, + { + "comment": "This code snippet is performing cross-attention in a transformer model. It first interpolates and adds context vectors to value layers, then applies attention mechanisms to compute context layers. 
This process helps to capture dependencies between different parts of the input data effectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":362-380", + "content": " context_val_av = self.value_at(context_av).transpose((0, 2, 1))\n context_val_av = F.interpolate(context_val_av.unsqueeze(-1),\n size=(value_layer2.shape[2],\n 1)).squeeze(-1)\n context_val_av = self.transpose_for_scores(\n context_val_av.transpose((0, 2, 1)))\n value_layer2 = value_layer2 + context_val_av\n context_val_at = self.value_at(context_at).transpose((0, 2, 1))\n context_val_at = F.interpolate(context_val_at.unsqueeze(-1),\n size=(value_layer1.shape[2],\n 1)).squeeze(-1)\n context_val_at = self.transpose_for_scores(\n context_val_at.transpose((0, 2, 1)))\n value_layer1 = value_layer1 + context_val_at\n context_layer1 = do_attention(query_layer1, key_layer1, value_layer1,\n attention_mask1, self.dropout1)\n context_layer2 = do_attention(query_layer2, key_layer2, value_layer2," + }, + { + "comment": "The code defines a class \"BertEntOutput\" with several layers including dense and dropout. It uses bi-hidden size, hidden size, v_hidden_size, and corresponding dropout probabilities for initializing the layers. This class seems to be part of a model architecture where it performs layer normalization, applies dropout regularization, and linear transformations to process input data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":381-408", + "content": " attention_mask2, self.dropout2)\n context_layer3 = do_attention(query_layer3, key_layer3, value_layer3,\n attention_mask3, self.dropout3)\n return context_layer1, context_layer2, context_layer3 # vision, text, action\nclass BertEntOutput(nn.Layer):\n def __init__(\n self,\n bi_hidden_size,\n hidden_size,\n v_hidden_size,\n v_hidden_dropout_prob,\n hidden_dropout_prob,\n ):\n super(BertEntOutput, self).__init__()\n self.dense1 = nn.Linear(bi_hidden_size, v_hidden_size)\n self.LayerNorm1 = nn.LayerNorm(v_hidden_size, epsilon=1e-12)\n self.dropout1 = nn.Dropout(v_hidden_dropout_prob)\n self.dense2 = nn.Linear(bi_hidden_size, hidden_size)\n self.LayerNorm2 = nn.LayerNorm(hidden_size, epsilon=1e-12)\n self.dropout2 = nn.Dropout(hidden_dropout_prob)\n self.dense3 = nn.Linear(bi_hidden_size, hidden_size)\n self.LayerNorm3 = nn.LayerNorm(hidden_size, epsilon=1e-12)" + }, + { + "comment": "This code defines a BertLayer class with Dropout layers and dense layers, performing attention mechanism. 
The forward method applies these layers to hidden_states1,2,3, and returns the updated hidden states after adding them with their corresponding input tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":409-439", + "content": " self.dropout3 = nn.Dropout(hidden_dropout_prob)\n def forward(\n self,\n hidden_states1,\n input_tensor1,\n hidden_states2,\n input_tensor2,\n hidden_states3,\n input_tensor3,\n ):\n context_state1 = self.dense1(hidden_states1)\n context_state1 = self.dropout1(context_state1)\n context_state2 = self.dense2(hidden_states2)\n context_state2 = self.dropout2(context_state2)\n context_state3 = self.dense3(hidden_states3)\n context_state3 = self.dropout3(context_state3)\n hidden_states1 = self.LayerNorm1(context_state1 + input_tensor1)\n hidden_states2 = self.LayerNorm2(context_state2 + input_tensor2)\n hidden_states3 = self.LayerNorm3(context_state3 + input_tensor3)\n return hidden_states1, hidden_states2, hidden_states3\nclass BertLayer(nn.Layer):\n def __init__(self, hidden_size, intermediate_size, hidden_act,\n hidden_dropout_prob, num_attention_heads,\n attention_probs_dropout_prob):" + }, + { + "comment": "This code defines a BertLayer class and a BertConnectionLayer class. The BertLayer class includes an attention layer, an intermediate layer, and an output layer. It has a forward function that performs the calculations for these layers. The BertConnectionLayer class is a subclass of nn.Layer with various parameters for hidden sizes and attention dropout probabilities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":440-460", + "content": " super(BertLayer, self).__init__()\n self.attention = BertAttention(hidden_size, hidden_dropout_prob,\n num_attention_heads,\n attention_probs_dropout_prob)\n self.intermediate = BertIntermediate(hidden_size, intermediate_size,\n hidden_act)\n self.output = BertOutput(intermediate_size, hidden_size,\n hidden_dropout_prob)\n def forward(self, hidden_states, attention_mask):\n attention_output, attention_probs = self.attention(\n hidden_states, attention_mask)\n intermediate_output = self.intermediate(attention_output)\n layer_output = self.output(intermediate_output, attention_output)\n return layer_output, attention_probs\nclass BertConnectionLayer(nn.Layer):\n def __init__(self, hidden_size, v_hidden_size, a_hidden_size,\n bi_hidden_size, bi_num_attention_heads,\n attention_probs_dropout_prob, v_attention_probs_dropout_prob," + }, + { + "comment": "This code initializes a BertConnectionLayer object with various parameters including hidden size, attention probabilities dropout probability, and intermediate size. It also initializes two other objects: BertEntAttention and BertEntOutput. 
The BertEntAttention object is responsible for performing entity-based attention while the BertEntOutput object is responsible for producing the output of the connection layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":461-486", + "content": " a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob, intermediate_size,\n v_intermediate_size, a_intermediate_size, hidden_act,\n v_hidden_act, a_hidden_act, hidden_dropout_prob,\n v_hidden_dropout_prob, a_hidden_dropout_prob):\n super(BertConnectionLayer, self).__init__()\n self.ent_attention = BertEntAttention(\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n bi_num_attention_heads,\n )\n self.ent_output = BertEntOutput(\n bi_hidden_size,\n hidden_size,\n v_hidden_size,\n v_hidden_dropout_prob,\n hidden_dropout_prob," + }, + { + "comment": "This code defines a model with three input streams (v, t, and a) and initializes intermediate layers and output layers for each stream. The forward function takes in two pairs of input tensors and attention masks for each stream.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":487-511", + "content": " )\n self.v_intermediate = BertIntermediate(v_hidden_size,\n v_intermediate_size,\n v_hidden_act)\n self.v_output = BertOutput(v_intermediate_size, v_hidden_size,\n v_hidden_dropout_prob)\n self.t_intermediate = BertIntermediate(hidden_size, intermediate_size,\n hidden_act)\n self.t_output = BertOutput(intermediate_size, hidden_size,\n hidden_dropout_prob)\n self.a_intermediate = BertIntermediate(a_hidden_size,\n a_intermediate_size,\n a_hidden_act)\n self.a_output = BertOutput(a_intermediate_size, a_hidden_size,\n a_hidden_dropout_prob)\n def forward(\n self,\n input_tensor1,\n attention_mask1,\n input_tensor2,\n attention_mask2," + }, + { + "comment": "This function computes the layer outputs for three different pathways using the ActBert Encoder. It utilizes attention masks and input tensors for each pathway, passing them through various intermediate layers before returning the final layer outputs. 
The BertEncoder class represents a combination of three multi-BertLayers and BertConnectionLayer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":512-538", + "content": " input_tensor3,\n attention_mask3,\n ):\n ent_output1, ent_output2, ent_output3 = self.ent_attention(\n input_tensor1, attention_mask1, input_tensor2, attention_mask2,\n input_tensor3, attention_mask3)\n attention_output1, attention_output2, attention_output3 = self.ent_output(\n ent_output1, input_tensor1, ent_output2, input_tensor2, ent_output3,\n input_tensor3)\n intermediate_output1 = self.v_intermediate(attention_output1)\n layer_output1 = self.v_output(intermediate_output1, attention_output1)\n intermediate_output2 = self.t_intermediate(attention_output2)\n layer_output2 = self.t_output(intermediate_output2, attention_output2)\n intermediate_output3 = self.a_intermediate(attention_output3)\n layer_output3 = self.a_output(intermediate_output3, attention_output3)\n return layer_output1, layer_output2, layer_output3\nclass BertEncoder(nn.Layer):\n \"\"\"\n ActBert Encoder, consists 3 pathway of multi-BertLayers and BertConnectionLayer." + }, + { + "comment": "This code defines the `BertEncoder` class, which initializes various parameters for the BERT model's encoder. These parameters include attention IDs, fixed layer positions, hidden sizes, and dropout probabilities for different components of the model. The class extends `super(BertEncoder, self).__init__()`, indicating it inherits from another class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":539-575", + "content": " \"\"\"\n def __init__(\n self,\n v_ent_attention_id,\n t_ent_attention_id,\n a_ent_attention_id,\n fixed_t_layer,\n fixed_v_layer,\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n intermediate_size,\n v_intermediate_size,\n a_intermediate_size,\n hidden_act,\n v_hidden_act,\n a_hidden_act,\n hidden_dropout_prob,\n v_hidden_dropout_prob,\n a_hidden_dropout_prob,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n num_attention_heads,\n v_num_attention_heads,\n a_num_attention_heads,\n bi_num_attention_heads,\n num_hidden_layers,\n v_num_hidden_layers,\n a_num_hidden_layers,\n ):\n super(BertEncoder, self).__init__()\n self.v_ent_attention_id = v_ent_attention_id\n self.t_ent_attention_id = t_ent_attention_id" + }, + { + "comment": "This code initializes three BertLayer objects and one BertConnectionLayer object with different hidden sizes, intermediate sizes, activation functions, dropout probabilities, and attention head numbers. 
These layers are used to encode the text, visual, and action input sequences in the ActBERT model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":576-593", + "content": " self.a_ent_attention_id = a_ent_attention_id\n self.fixed_t_layer = fixed_t_layer\n self.fixed_v_layer = fixed_v_layer\n layer = BertLayer(hidden_size, intermediate_size, hidden_act,\n hidden_dropout_prob, num_attention_heads,\n attention_probs_dropout_prob)\n v_layer = BertLayer(v_hidden_size, v_intermediate_size, v_hidden_act,\n v_hidden_dropout_prob, v_num_attention_heads,\n v_attention_probs_dropout_prob)\n a_layer = BertLayer(a_hidden_size, a_intermediate_size, a_hidden_act,\n a_hidden_dropout_prob, a_num_attention_heads,\n a_attention_probs_dropout_prob)\n connect_layer = BertConnectionLayer(\n hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,\n bi_num_attention_heads, attention_probs_dropout_prob,\n v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob, at_attention_probs_dropout_prob," + }, + { + "comment": "This code builds separate stacks of layers for text (layer), vision (v_layer), and action (a_layer) inputs. It also includes a connect_layer (BertConnectionLayer) that exchanges information across the text, vision, and action streams. The forward method takes input embeddings, attention masks, and an optional parameter to output all encoded layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":594-621", + "content": " intermediate_size, v_intermediate_size, a_intermediate_size,\n hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob,\n v_hidden_dropout_prob, a_hidden_dropout_prob)\n self.layer = nn.LayerList(\n [copy.deepcopy(layer) for _ in range(num_hidden_layers)]) #12\n self.v_layer = nn.LayerList(\n [copy.deepcopy(v_layer) for _ in range(v_num_hidden_layers)]) #2\n self.a_layer = nn.LayerList(\n [copy.deepcopy(a_layer) for _ in range(a_num_hidden_layers)]) #3\n self.c_layer = nn.LayerList([\n copy.deepcopy(connect_layer) for _ in range(len(v_ent_attention_id))\n ] #2 [0,1]\n )\n def forward(\n self,\n txt_embedding,\n image_embedding,\n action_embedding,\n txt_attention_mask,\n image_attention_mask,\n action_attention_mask,\n output_all_encoded_layers=True,\n ):\n v_start, a_start, t_start = 0, 0, 0\n count = 0\n all_encoder_layers_t = []" + }, + { + "comment": "This code initializes empty lists for all encoder layers of the vision and action pathways. It then iterates through the given layer IDs, splitting them into vision (v), action (a), and text (t) layers. The code asserts that the fixed text layer is less than or equal to the last text layer, and the fixed vision layer is less than or equal to the last vision layer. Next, it iterates through all vision layers from the start index up to but not including the fixed vision layer. Inside this loop, it applies the corresponding vision layer to the image embedding and attention mask using Paddle's no_grad context manager.
Finally, it loops over all vision layers from the start index to the end index (fixed vision layer excluded), applying the corresponding vision layer to the image embedding and attention mask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":622-644", + "content": " all_encoder_layers_v = []\n all_encoder_layers_a = []\n for v_layer_id, a_layer_id, t_layer_id in zip(self.v_ent_attention_id,\n self.a_ent_attention_id,\n self.t_ent_attention_id):\n v_end = v_layer_id\n a_end = a_layer_id\n t_end = t_layer_id\n assert self.fixed_t_layer <= t_end\n assert self.fixed_v_layer <= v_end\n ### region embedding\n for idx in range(v_start,\n self.fixed_v_layer): #\u4e24\u6b21\u8bad\u7ec3\uff0c\u8fd9\u4e2a\u5faa\u73af\u90fd\u6ca1\u6709\u8fdb\u53bb #\u524d\u9762\u7684\u5c42\u56fa\u5b9a\u4f4f\n with paddle.no_grad():\n image_embedding, image_attention_probs = self.v_layer[idx](\n image_embedding, image_attention_mask)\n v_start = self.fixed_v_layer\n for idx in range(v_start, v_end):\n image_embedding, image_attention_probs = self.v_layer[idx](\n image_embedding, image_attention_mask)" + }, + { + "comment": "This code is performing multi-modal embedding by separately handling action, text, and image embeddings. It iterates through layers for each modality to compute the embeddings and attention probs. Finally, it combines the embeddings in a specific order before potentially updating start/end indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":646-668", + "content": " ### action embedding\n for idx in range(a_start, a_end):\n action_embedding, action_attention_probs = self.a_layer[idx](\n action_embedding, action_attention_mask)\n ### text embedding\n for idx in range(t_start, self.fixed_t_layer):\n with paddle.no_grad():\n txt_embedding, txt_attention_probs = self.layer[idx](\n txt_embedding, txt_attention_mask)\n t_start = self.fixed_t_layer\n for idx in range(t_start, t_end):\n txt_embedding, txt_attention_probs = self.layer[idx](\n txt_embedding, txt_attention_mask)\n image_embedding, txt_embedding, action_embedding = self.c_layer[\n count](image_embedding, image_attention_mask, txt_embedding,\n txt_attention_mask, action_embedding,\n action_attention_mask)\n v_start = v_end\n t_start = t_end\n a_start = a_end" + }, + { + "comment": "This code is responsible for encoding text, image, and action inputs in a neural network model. It iterates over the layers of each input type to produce their respective encoded representations. If output_all_encoded_layers is set to True, it appends all intermediate encoded layers to separate lists; otherwise, only the final encoded layer is stored. 
This allows for flexibility in selecting which encoded layers to use in further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":669-692", + "content": " count += 1\n if output_all_encoded_layers:\n all_encoder_layers_t.append(txt_embedding)\n all_encoder_layers_v.append(image_embedding)\n all_encoder_layers_a.append(action_embedding)\n for idx in range(v_start, len(self.v_layer)): # 1\n image_embedding, image_attention_probs = self.v_layer[idx](\n image_embedding, image_attention_mask)\n for idx in range(a_start, len(self.a_layer)):\n action_embedding, action_attention_probs = self.a_layer[idx](\n action_embedding, action_attention_mask)\n for idx in range(t_start, len(self.layer)):\n txt_embedding, txt_attention_probs = self.layer[idx](\n txt_embedding, txt_attention_mask)\n # add the end part to finish.\n if not output_all_encoded_layers:\n all_encoder_layers_t.append(txt_embedding) #8, 36, 768\n all_encoder_layers_v.append(image_embedding) #8, 37, 1024\n all_encoder_layers_a.append(action_embedding) #8, 5, 768" + }, + { + "comment": "The code defines a BertPooler class that pools the model by taking the hidden state corresponding to the first token. It also includes a BertModel class with various parameters for initializing the BERT model, including vocab size, max position embeddings, type vocab size, and feature sizes for different entities (v, a). The code also defines attention IDs and fixed layers for tokens and aspects.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":694-728", + "content": " return all_encoder_layers_t, all_encoder_layers_v, all_encoder_layers_a\nclass BertPooler(nn.Layer):\n \"\"\" \"Pool\" the model by simply taking the hidden state corresponding\n to the first token.\n \"\"\"\n def __init__(self, hidden_size, bi_hidden_size):\n super(BertPooler, self).__init__()\n self.dense = nn.Linear(hidden_size, bi_hidden_size)\n self.activation = nn.ReLU()\n def forward(self, hidden_states):\n first_token_tensor = hidden_states[:, 0] #8, 768\n pooled_output = self.dense(first_token_tensor)\n pooled_output = self.activation(pooled_output)\n return pooled_output\nclass BertModel(nn.Layer):\n def __init__(\n self,\n vocab_size,\n max_position_embeddings,\n type_vocab_size,\n v_feature_size,\n a_feature_size,\n num_hidden_layers,\n v_num_hidden_layers,\n a_num_hidden_layers,\n v_ent_attention_id,\n t_ent_attention_id,\n a_ent_attention_id,\n fixed_t_layer,\n fixed_v_layer," + }, + { + "comment": "This code is initializing a BertModel class with various parameters including hidden size, attention-related parameters, and embedding types. 
It uses the superclass constructor to initialize the base model and then further customizes it by adding word and image embeddings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":729-758", + "content": " hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n intermediate_size,\n v_intermediate_size,\n a_intermediate_size,\n hidden_act,\n v_hidden_act,\n a_hidden_act,\n hidden_dropout_prob,\n v_hidden_dropout_prob,\n a_hidden_dropout_prob,\n attention_probs_dropout_prob,\n v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n num_attention_heads,\n v_num_attention_heads,\n a_num_attention_heads,\n bi_num_attention_heads,\n ):\n super(BertModel, self).__init__()\n # initilize word embedding\n self.embeddings = BertEmbeddings(vocab_size, max_position_embeddings,\n type_vocab_size, hidden_size,\n hidden_dropout_prob)\n # initlize the region embedding\n self.v_embeddings = BertImageEmbeddings(v_feature_size, v_hidden_size," + }, + { + "comment": "This code is initializing a model for the ACTBERT backbone, which includes an encoder and action embedding. The model has parameters for various hidden sizes, dropout probabilities, attention head numbers, and hidden layer counts for both textual (v), visual (t), and action (a) components. The model also uses different activation functions for each component.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":759-774", + "content": " v_hidden_dropout_prob)\n # initlize the action embedding\n self.a_embeddings = BertActionEmbeddings(a_feature_size, a_hidden_size,\n a_hidden_dropout_prob)\n self.encoder = BertEncoder(\n v_ent_attention_id, t_ent_attention_id, a_ent_attention_id,\n fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size,\n a_hidden_size, bi_hidden_size, intermediate_size,\n v_intermediate_size, a_intermediate_size, hidden_act, v_hidden_act,\n a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob,\n a_hidden_dropout_prob, attention_probs_dropout_prob,\n v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,\n num_attention_heads, v_num_attention_heads, a_num_attention_heads,\n bi_num_attention_heads, num_hidden_layers, v_num_hidden_layers," + }, + { + "comment": "This code defines a class for a model that takes in text, action, and image features as inputs. It initializes three poolers for text, action, and visual features. The forward function processes the input data and returns encoded layers based on the inputs received. The output_all_encoded_layers parameter allows getting all encoded layers if set to True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":775-799", + "content": " a_num_hidden_layers)\n self.t_pooler = BertPooler(hidden_size, bi_hidden_size)\n self.v_pooler = BertPooler(v_hidden_size, bi_hidden_size)\n self.a_pooler = BertPooler(a_hidden_size, bi_hidden_size)\n def forward(\n self,\n text_ids,\n action_feat,\n image_feat,\n image_loc,\n token_type_ids=None,\n text_mask=None,\n image_mask=None,\n action_mask=None,\n output_all_encoded_layers=False,\n ):\n \"\"\"\n text_ids: input text ids. Shape: [batch_size, seqence_length]\n action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]\n image_feat: input image feature. 
Shape: [batch_size, region_length, image_feature_dim]]\n image_loc: input region location. Shape: [batch_size, region_length, region_location_dim]\n token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length]\n text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length]" + }, + { + "comment": "This code checks if the input masks for text, token_type, image, and action are None. If any of them are None, it generates a mask with the same shape as the corresponding feature tensor and fills it with ones (real tokens) or zeros (padding tokens). The attention mask is created from the 2D tensor mask to be used in the multi-head attention mechanism, which broadcasts to [batch_size, num_heads, from_seq_length, to_seq_length].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":800-818", + "content": " image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]\n action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]\n output_all_encoded_layers: is output encoded layers feature or not. Type: Bool.\n \"\"\"\n if text_mask is None:\n text_mask = paddle.ones_like(text_ids)\n if token_type_ids is None:\n token_type_ids = paddle.zeros_like(text_ids)\n if image_mask is None:\n image_mask = paddle.ones(image_feat.shape[0],\n image_feat.shape[1]).astype(text_ids.dtype)\n if action_mask is None:\n action_mask = paddle.ones(action_feat.shape[0],\n action_feat.shape[1]).astype(\n text_ids.dtype)\n # We create a 3D attention mask from a 2D tensor mask.\n # Sizes are [batch_size, 1, 1, to_seq_length]\n # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]." + }, + { + "comment": "This code segment is part of a backbone model for the ACTBERT. It creates extended masks for text, image, and action inputs by unsqueezing the existing masks along dimensions 1 and 2. The function set_mask is then used to multiply each mask with -10000.0 at positions where we want to attend, effectively removing those positions from the attention process. This is done for all three input types: text, image, and action. Finally, the code applies the embeddings to the text inputs using self.embeddings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":819-837", + "content": " extended_text_mask = text_mask.unsqueeze(1).unsqueeze(2)\n extended_image_mask = image_mask.unsqueeze(1).unsqueeze(2)\n extended_action_mask = action_mask.unsqueeze(1).unsqueeze(2)\n # Since attention_mask is 1.0 for positions we want to attend and 0.0 for\n # masked positions, this operation will create a tensor which is 0.0 for\n # positions we want to attend and -10000.0 for masked positions.\n # Since we are adding it to the raw scores before the softmax, this is\n # effectively the same as removing these entirely.\n def set_mask(extended_attention_mask):\n extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0\n return extended_attention_mask\n extended_text_mask = set_mask(extended_text_mask)\n extended_image_mask = set_mask(extended_image_mask)\n extended_action_mask = set_mask(extended_action_mask)\n t_embedding_output = self.embeddings(text_ids, token_type_ids)\n v_embedding_output = self.v_embeddings(image_feat, image_loc)" + }, + { + "comment": "This code is part of a backbone for a multimodal model, specifically ACTBERT. 
It first computes the embedding outputs for text (t), vision (v), and action (a) features. Then it passes these embeddings to an encoder to obtain encoded layers for each modality. The last hidden state from each encoder is used as a sequence output, and a pooled output is also computed using separate poolers for each modality. If output_all_encoded_layers is False, the code reduces the encoded layers to their last hidden states.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":838-864", + "content": " a_embedding_output = self.a_embeddings(action_feat)\n # var = [t_embedding_output, v_embedding_output, a_embedding_output]\n # import numpy as np\n # for i, item in enumerate(var):\n # np.save('tmp/' + str(i)+'.npy', item.numpy())\n encoded_layers_t, encoded_layers_v, encoded_layers_a = self.encoder(\n t_embedding_output,\n v_embedding_output,\n a_embedding_output,\n extended_text_mask,\n extended_image_mask,\n extended_action_mask,\n output_all_encoded_layers=output_all_encoded_layers,\n )\n sequence_output_t = encoded_layers_t[-1] #get item from list\n sequence_output_v = encoded_layers_v[-1]\n sequence_output_a = encoded_layers_a[-1]\n pooled_output_t = self.t_pooler(sequence_output_t)\n pooled_output_v = self.v_pooler(sequence_output_v)\n pooled_output_a = self.a_pooler(sequence_output_a)\n if not output_all_encoded_layers:\n encoded_layers_t = encoded_layers_t[-1]" + }, + { + "comment": "This code defines two classes: `BertPredictionHeadTransform` and `BertLMPredictionHead`. The former is a transform layer used in BERT's prediction heads. It applies a dense layer followed by an activation function and layer normalization. The latter is the prediction head itself, which takes input hidden states, applies the transform defined in `BertPredictionHeadTransform`, and returns output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":865-891", + "content": " encoded_layers_v = encoded_layers_v[-1]\n encoded_layers_a = encoded_layers_a[-1]\n return encoded_layers_t, encoded_layers_v, encoded_layers_a, \\\n pooled_output_t, pooled_output_v, pooled_output_a\n# For Head\nclass BertPredictionHeadTransform(nn.Layer):\n def __init__(self, hidden_size, hidden_act):\n super(BertPredictionHeadTransform, self).__init__()\n self.dense = nn.Linear(hidden_size, hidden_size)\n if isinstance(hidden_act, str) or (sys.version_info[0] == 2\n and isinstance(hidden_act, str)):\n self.transform_act_fn = ACT2FN[hidden_act]\n else:\n self.transform_act_fn = hidden_act\n self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)\n def forward(self, hidden_states):\n hidden_states = self.dense(hidden_states)\n hidden_states = self.transform_act_fn(hidden_states)\n hidden_states = self.LayerNorm(hidden_states)\n return hidden_states\nclass BertLMPredictionHead(nn.Layer):" + }, + { + "comment": "This code initializes the BertLMPredictionHead class, which is a part of the BERT model. It takes in hidden_size, hidden_act, and bert_model_embedding_weights as parameters. The class uses these to initialize its transform and decoder components. The decoder component has a weight equal to the input embedding weights with an output-only bias for each token. 
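A minimal Paddle sketch of that weight tying — reusing the embedding table as the decoder weight and only allocating the per-token bias — is shown below. The sizes are the defaults mentioned elsewhere in this file, and the snippet is an illustration rather than the registered PaddleVideo layer:

```python
import paddle
import paddle.nn as nn

vocab_size, hidden_size = 30522, 768          # default sizes used in this file
embedding = nn.Embedding(vocab_size, hidden_size)

# Only a per-token bias is newly allocated; the decoder weight is the embedding table itself.
decoder_bias = paddle.create_parameter(shape=[vocab_size], dtype='float32', is_bias=True)

hidden_states = paddle.randn([8, 36, hidden_size])
logits = paddle.matmul(hidden_states, embedding.weight, transpose_y=True) + decoder_bias
print(logits.shape)                           # [8, 36, 30522]
```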
This implementation avoids creating additional large parameters by directly assigning the input embedding weights to the decoder's weight attribute.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":892-908", + "content": " def __init__(self, hidden_size, hidden_act, bert_model_embedding_weights):\n super(BertLMPredictionHead, self).__init__()\n self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)\n # The output weights are the same as the input embeddings, but there is\n # an output-only bias for each token.\n assert bert_model_embedding_weights.shape[1] == hidden_size\n vocab_size = bert_model_embedding_weights.shape[0]\n # another implementation which would create another big params:\n # self.decoder = nn.Linear(hidden_size, vocab_size) # NOTE bias default: constant 0.0\n # self.decoder.weight = self.create_parameter(shape=[hidden_size, vocab_size],\n # default_initializer=nn.initializer.Assign(\n # bert_model_embedding_weights.t())) # transpose\n self.decoder_weight = bert_model_embedding_weights\n self.decoder_bias = self.create_parameter(" + }, + { + "comment": "This code defines three classes: ActBert, BertImageActionPredictionHead, and BertPreTrainingHeads. ActBert is a layer that performs attention and feedforward operations in BERT's transformer layers. The BertImageActionPredictionHead class is responsible for the image action prediction task in BERT. Finally, the BertPreTrainingHeads class includes multiple layers for pre-training tasks such as masked language modeling, next sentence prediction, and SQUAD question answering.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":909-936", + "content": " shape=[vocab_size],\n dtype=bert_model_embedding_weights.dtype,\n is_bias=True) # NOTE bias default: constant 0.0\n def forward(self, hidden_states):\n hidden_states = self.transform(hidden_states)\n hidden_states = paddle.tensor.matmul(\n hidden_states, self.decoder_weight,\n transpose_y=True) + self.decoder_bias\n return hidden_states\nclass BertImageActionPredictionHead(nn.Layer):\n def __init__(self, hidden_size, hidden_act, target_size):\n super(BertImageActionPredictionHead, self).__init__()\n self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)\n self.decoder = nn.Linear(hidden_size, target_size)\n def forward(self, hidden_states):\n hidden_states = self.transform(hidden_states)\n hidden_states = self.decoder(hidden_states)\n return hidden_states\nclass BertPreTrainingHeads(nn.Layer):\n def __init__(self, hidden_size, v_hidden_size, a_hidden_size,\n bi_hidden_size, hidden_act, v_hidden_act, a_hidden_act," + }, + { + "comment": "The code defines a class BertPreTrainingHeads that extends an existing class. It initializes the necessary modules for prediction and fusion. 
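The BertPredictionHeadTransform feeding these prediction heads is just dense → activation → LayerNorm, as quoted above. A hedged Paddle sketch under an assumed hidden size of 768:

```python
import paddle
import paddle.nn as nn

class HeadTransform(nn.Layer):
    """dense -> activation -> LayerNorm, mirroring the transform described above."""
    def __init__(self, hidden_size=768):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.act = nn.GELU()
        self.norm = nn.LayerNorm(hidden_size, epsilon=1e-12)

    def forward(self, hidden_states):
        return self.norm(self.act(self.dense(hidden_states)))

x = paddle.randn([8, 36, 768])      # [batch, sequence_length, hidden_size]
print(HeadTransform()(x).shape)     # [8, 36, 768]
```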
The forward function performs pooling and fusion operations based on the specified fusion method ('sum').", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":937-955", + "content": " v_target_size, a_target_size, fusion_method,\n bert_model_embedding_weights):\n super(BertPreTrainingHeads, self).__init__()\n self.predictions = BertLMPredictionHead(hidden_size, hidden_act,\n bert_model_embedding_weights)\n self.seq_relationship = nn.Linear(bi_hidden_size, 2)\n self.imagePredictions = BertImageActionPredictionHead(\n v_hidden_size, v_hidden_act, v_target_size) # visual class number\n self.actionPredictions = BertImageActionPredictionHead(\n a_hidden_size, a_hidden_act, a_target_size) # action class number\n self.fusion_method = fusion_method\n self.dropout = nn.Dropout(0.1)\n def forward(self, sequence_output_t, sequence_output_v, sequence_output_a,\n pooled_output_t, pooled_output_v, pooled_output_a):\n if self.fusion_method == 'sum':\n pooled_output = self.dropout(pooled_output_t + pooled_output_v +\n pooled_output_a)" + }, + { + "comment": "In this code snippet, the model is returning prediction scores for text (t), video (v), and action (a) inputs along with a sequence relationship score. The model is a multi-modal pre-training BERT model with fusion method 'mul'. If the fusion method is not 'mul', it will raise an assertion error.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":956-985", + "content": " elif self.fusion_method == 'mul':\n pooled_output = self.dropout(pooled_output_t * pooled_output_v +\n pooled_output_a)\n else:\n assert False\n prediction_scores_t = self.predictions(\n sequence_output_t) # 8\uff0c 36 \uff0c30522\n seq_relationship_score = self.seq_relationship(pooled_output) # 8, 2\n prediction_scores_v = self.imagePredictions(\n sequence_output_v) # 8, 37, 1601\n prediction_scores_a = self.actionPredictions(\n sequence_output_a) # 8, 5, 401\n return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score\n@BACKBONES.register()\nclass BertForMultiModalPreTraining(nn.Layer):\n \"\"\"BERT model with multi modal pre-training heads.\n \"\"\"\n def __init__(\n self,\n vocab_size=30522,\n max_position_embeddings=512,\n type_vocab_size=2,\n v_target_size=1601,\n a_target_size=700,\n v_feature_size=2048,\n a_feature_size=2048," + }, + { + "comment": "This code defines a custom transformer backbone model for ACT-BERT, with specific configurations for the text (t), video (v), and audio (a) modalities. It includes parameters such as hidden layer numbers, sizes, activation functions, dropout rates, attention heads, and fusion method. 
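The 'sum' and 'mul' fusion branches quoted above combine the three pooled vectors before the sequence-relationship classifier. A NumPy sketch of just that fusion step (dropout omitted), following the quoted code in which 'mul' multiplies the text and visual vectors and then adds the action vector:

```python
import numpy as np

def fuse(pooled_t, pooled_v, pooled_a, fusion_method="sum"):
    """Combine the three pooled outputs, mirroring the quoted 'sum'/'mul' branches."""
    if fusion_method == "sum":
        return pooled_t + pooled_v + pooled_a
    elif fusion_method == "mul":
        # As in the quoted code: element-wise product of text and visual, plus action.
        return pooled_t * pooled_v + pooled_a
    raise ValueError("unsupported fusion_method: %s" % fusion_method)

pooled_t, pooled_v, pooled_a = (np.random.randn(8, 1024) for _ in range(3))  # bi_hidden_size=1024
print(fuse(pooled_t, pooled_v, pooled_a, "mul").shape)                        # (8, 1024)
```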
The pretrained parameter is set to None.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":986-1017", + "content": " num_hidden_layers=12,\n v_num_hidden_layers=2,\n a_num_hidden_layers=3,\n t_ent_attention_id=[10, 11],\n v_ent_attention_id=[0, 1],\n a_ent_attention_id=[0, 1],\n fixed_t_layer=0,\n fixed_v_layer=0,\n hidden_size=768,\n v_hidden_size=1024,\n a_hidden_size=768,\n bi_hidden_size=1024,\n intermediate_size=3072,\n v_intermediate_size=1024,\n a_intermediate_size=3072,\n hidden_act=\"gelu\",\n v_hidden_act=\"gelu\",\n a_hidden_act=\"gelu\",\n hidden_dropout_prob=0.1,\n v_hidden_dropout_prob=0.1,\n a_hidden_dropout_prob=0.1,\n attention_probs_dropout_prob=0.1,\n v_attention_probs_dropout_prob=0.1,\n a_attention_probs_dropout_prob=0.1,\n av_attention_probs_dropout_prob=0.1,\n at_attention_probs_dropout_prob=0.1,\n num_attention_heads=12,\n v_num_attention_heads=8,\n a_num_attention_heads=12,\n bi_num_attention_heads=8,\n fusion_method=\"mul\",\n pretrained=None," + }, + { + "comment": "This function defines the input parameters for an ActBERT model, including vocabulary size, maximum position embedding, type vocab size, visual and action target sizes, feature sizes for vision and actions, number of hidden layers in text, visual, and action transformers, index IDs for BertConnectionLayer, and a fixed layer index for the text transformer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":1018-1033", + "content": " ):\n \"\"\"\n vocab_size: vocabulary size. Default: 30522.\n max_position_embeddings: max position id. Default: 512.\n type_vocab_size: max segment id. Default: 2.\n v_target_size: class number of visual word. Default: 1601.\n a_target_size: class number of action word. Default: 700.\n v_feature_size: input visual feature dimension. Default: 2048.\n a_feature_size: input action feature dimension. Default: 2048.\n num_hidden_layers: number of BertLayer in text transformer. Default: 12.\n v_num_hidden_layers: number of BertLayer in visual transformer. Default: 2.\n a_num_hidden_layers: number of BertLayer in action transformer. Default:3.\n t_ent_attention_id: index id of BertConnectionLayer in text transformer. Default: [10, 11].\n v_ent_attention_id: index id of BertConnectionLayer in visual transformer. Default:[0, 1].\n a_ent_attention_id: index id of BertConnectionLayer in action transformer. Default:[0, 1].\n fixed_t_layer: index id of fixed BertLayer in text transformer. Default: 0." + }, + { + "comment": "This code defines parameters for different BertLayers in a model. Fixed_v_layer is the index of a fixed BertLayer in the visual transformer, hidden_size is the hidden size for text and other BERT layers, v_hidden_size is the hidden size for visual BERT layer, a_hidden_size is the hidden size for action BERT layer, bi_hidden_size is the hidden size for BertConnectionLayer, intermediate_size is the intermediate size for text and other BERT layers, v_intermediate_size is the intermediate size for visual BERT layer, a_intermediate_size is the intermediate size for action BERT layer, hidden_act is the activation function for text BERT layer, v_hidden_act is the activation function for visual BERT layer, a_hidden_act is the activation function for action BERT layer, and hidden_dropout_prob is the dropout probability for text embedding layer. 
All default values are provided in case no other values are specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":1034-1046", + "content": " fixed_v_layer: index id of fixed BertLayer in visual transformer. Default: 0.\n hidden_size: hidden size in text BertLayer. Default: 768.\n v_hidden_size: hidden size in visual BertLayer. Default: 1024.\n a_hidden_size: hidden size in action BertLayer. Default: 768.\n bi_hidden_size: hidden size in BertConnectionLayer. Default: 1024,\n intermediate_size: intermediate size in text BertLayer. Default: 3072.\n v_intermediate_size: intermediate size in visual BertLayer. Default: 1024.\n a_intermediate_size: intermediate size in text BertLayer. Default: 3072.\n hidden_act: hidden activation function in text BertLayer. Default: \"gelu\".\n v_hidden_act: hidden activation function in visual BertLayer. Default: \"gelu\".\n a_hidden_act: hidden activation function in action BertLayer. Default: \"gelu\".\n hidden_dropout_prob: hidden dropout probability in text Embedding Layer. Default: 0.1\n v_hidden_dropout_prob: hidden dropout probability in visual Embedding Layer. Default: 0.1" + }, + { + "comment": "This code snippet defines default values for various parameters in a transformer model. These parameters include hidden dropout probabilities, attention dropout probabilities, number of attention heads, and fusion methods. The default values are provided for the text, visual, and action BertLayers as well as the BertConnectionLayer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":1047-1057", + "content": " a_hidden_dropout_prob: hidden dropout probability in action Embedding Layer. Default: 0.1\n attention_probs_dropout_prob: attention dropout probability in text BertLayer. Default: 0.1\n v_attention_probs_dropout_prob: attention dropout probability in visual BertLayer. Default: 0.1\n a_attention_probs_dropout_prob: attention dropout probability in action BertLayer. Default: 0.1\n av_attention_probs_dropout_prob: attention dropout probability in action-visual BertConnectionLayer. Default: 0.1\n at_attention_probs_dropout_prob: attention dropout probability in action-text BertConnectionLayer. Default: 0.1\n num_attention_heads: number of heads in text BertLayer. Default: 12.\n v_num_attention_heads: number of heads in visual BertLayer. Default: 8.\n a_num_attention_heads: number of heads in action BertLayer. Default: 12.\n bi_num_attention_heads: number of heads in BertConnectionLayer. Default: 8.\n fusion_method: methods of fusing pooled output from 3 transformer. Default: \"mul\"." + }, + { + "comment": "This code initializes an instance of the BertForMultiModalPreTraining class, which is a pre-trained model for multi-modal tasks. It takes in various parameters such as vocab_size, max_position_embeddings, type_vocab_size, v_feature_size, a_feature_size, num_hidden_layers, v_num_hidden_layers, a_num_hidden_layers, v_ent_attention_id, t_ent_attention_id, a_ent_attention_id, fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size, intermediate_size, v_intermediate_size, a_intermediate_size, hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob, and a_hidden_dropout_prob. These parameters define the architecture and behavior of the model. The super() function is used to call a method from the parent class, in this case, BertModel. 
The pretrained variable indicates whether the model should use pre-trained weights or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":1058-1091", + "content": " \"\"\"\n super(BertForMultiModalPreTraining, self).__init__()\n self.pretrained = pretrained\n self.vocab_size = vocab_size\n self.a_target_size = a_target_size\n self.bert = BertModel(\n vocab_size,\n max_position_embeddings,\n type_vocab_size,\n v_feature_size,\n a_feature_size,\n num_hidden_layers,\n v_num_hidden_layers,\n a_num_hidden_layers,\n v_ent_attention_id,\n t_ent_attention_id,\n a_ent_attention_id,\n fixed_t_layer,\n fixed_v_layer,\n hidden_size,\n v_hidden_size,\n a_hidden_size,\n bi_hidden_size,\n intermediate_size,\n v_intermediate_size,\n a_intermediate_size,\n hidden_act,\n v_hidden_act,\n a_hidden_act,\n hidden_dropout_prob,\n v_hidden_dropout_prob,\n a_hidden_dropout_prob,\n attention_probs_dropout_prob," + }, + { + "comment": "This code initializes the parameters of a pre-trained ACTBERT model. It checks if the model has been pre-trained and, if not, initializes the weights for the layers (using normal distribution with standard deviation 0.02).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":1092-1115", + "content": " v_attention_probs_dropout_prob,\n a_attention_probs_dropout_prob,\n av_attention_probs_dropout_prob,\n at_attention_probs_dropout_prob,\n num_attention_heads,\n v_num_attention_heads,\n a_num_attention_heads,\n bi_num_attention_heads,\n )\n self.cls = BertPreTrainingHeads(\n hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,\n hidden_act, v_hidden_act, a_hidden_act, v_target_size,\n a_target_size, fusion_method,\n self.bert.embeddings.word_embeddings.weight)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, (nn.Linear, nn.Embedding)):\n weight_init_(layer, 'Normal', std=0.02)" + }, + { + "comment": "The code defines a function \"forward\" that takes text_ids, action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, and action_mask as input. The text_ids represent input text ids of shape [batch_size, sequence_length]. Action_feat is the input action feature of shape [batch_size, action_length, action_feature_dim], while image_feat is the input image feature of shape [batch_size, region_length+1, image_feature_dim] (adding 1 for global image feature). Image_loc represents the input region location of shape [batch_size, region_length+1, region_location_dim] (adding 1 for global image feature location). Token_type_ids represent segment ids of each video clip and are of shape [batch_size, sequence_length]. Text_mask is a binary mask representing real tokens as 1 and padding tokens as 0 with shape [batch_size, sequence_length]. 
Image_mask and action_mask also serve similar functions but for image and action respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":1116-1136", + "content": " elif isinstance(layer, nn.LayerNorm):\n weight_init_(layer, 'Constant', value=1)\n def forward(\n self,\n text_ids, #8,36\n action_feat, #8,5,2048\n image_feat, #8,37,2048\n image_loc, #8,37,5\n token_type_ids=None, #8,36\n text_mask=None, #8,36\n image_mask=None, #8,37\n action_mask=None, #8,5\n ):\n \"\"\"\n text_ids: input text ids. Shape: [batch_size, seqence_length]\n action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]\n image_feat: input image feature. Shape: [batch_size, region_length+1, image_feature_dim]], add 1 for image global feature.\n image_loc: input region location. Shape: [batch_size, region_length+1, region_location_dim], add 1 for image global feature location.\n token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length]\n text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length]" + }, + { + "comment": "This code is a function that takes in text IDs, action feature, image feature, image location, token type IDs, text mask, image mask, and action mask as inputs. It uses the BERT model to process these inputs and returns prediction scores for each input (text, vision, action) and sequence relationship score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/actbert.py\":1137-1157", + "content": " image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]\n action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]\n \"\"\"\n sequence_output_t, sequence_output_v, sequence_output_a, \\\n pooled_output_t, pooled_output_v, pooled_output_a = self.bert(\n text_ids,\n action_feat,\n image_feat,\n image_loc,\n token_type_ids,\n text_mask,\n image_mask,\n action_mask,\n output_all_encoded_layers=False,\n )\n prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.cls(\n sequence_output_t, sequence_output_v, sequence_output_a,\n pooled_output_t, pooled_output_v, pooled_output_a)\n return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1aa47eef-6ab0-4aee-91fd-da4e1ad068c0.json b/docs/doc/1aa47eef-6ab0-4aee-91fd-da4e1ad068c0.json new file mode 100644 index 000000000..958c2a7b0 --- /dev/null +++ b/docs/doc/1aa47eef-6ab0-4aee-91fd-da4e1ad068c0.json @@ -0,0 +1,10 @@ +{ + "summary": "Copyright notice, Apache License v2.0, software distributed as is without warranties or conditions. Imports registry and utils modules, defines build_metric function that builds metric using provided configuration.", + "details": [ + { + "comment": "Copyright notice, Apache License v2.0, software distributed as is without warranties or conditions. Imports registry and utils modules, defines build_metric function that builds metric using provided configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py\":0-19", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import METRIC\nfrom ..utils import build\ndef build_metric(cfg):\n return build(cfg, METRIC)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1b7cfb16-3135-4285-88ab-77f1b90d4dd9.json b/docs/doc/1b7cfb16-3135-4285-88ab-77f1b90d4dd9.json new file mode 100644 index 000000000..40310f681 --- /dev/null +++ b/docs/doc/1b7cfb16-3135-4285-88ab-77f1b90d4dd9.json @@ -0,0 +1,35 @@ +{ + "summary": "This code sets up PaddleVideo on Linux, supports multi-card training and testing with PaddlePaddle. It provides log format for phases, resumes sessions, fine-tunes with pretrained params and best accuracy achieved. The code launches PaddleVideo in distributed mode with 4 GPUs, tests, exports model, introduces `use_gpu` parameter, and benchmark results are available in benchmark document.", + "details": [ + { + "comment": "This code provides instructions for setting up the environment, preparing data using the PaddleVideo library, and explains its supported functions. It also mentions that it only supports Linux operation systems with GPU environments and gives an example of how to run the library. The code outlines the default destination folders for output, log files, and inference files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/usage.md\":0-27", + "content": "[\u7b80\u4f53\u4e2d\u6587](../zh-CN/usage.md) | English\n# Usage\n---\nPlease refer to [installation documents](./install.md) to prepare the enviroment, and follow the steps mentioned in the [data preparation documents](./dataset/) to construct dataset, we will take you through the basic functions supported by PaddleVideo, all of it takes the ucf101 dataset with frame format as example.\nPaddleVideo only support linux operation system and GPU running time environment now.\nDefault detination folder of PaddleVideo files. running the [example config](../../configs/example.yaml) as example.\n```\nPaddleVideo\n \u251c\u2500\u2500 paddlevideo\n \u251c\u2500\u2500 ... #other source codes\n \u251c\u2500\u2500 output #ouput destination\n | \u251c\u2500\u2500 example\n | | \u251c\u2500\u2500 example_best.pdparams #path_to_weights\n | | \u2514\u2500\u2500 ... \n | \u2514\u2500\u2500 ... \n \u251c\u2500\u2500 log #log file destination.\n | \u251c\u2500\u2500 worker.0\n | \u251c\u2500\u2500 worker.1\n | \u2514\u2500\u2500 ... \n \u2514\u2500\u2500 inference #inference files destination.\n \u251c\u2500\u2500 .pdiparams file\n \u251c\u2500\u2500 .pdimodel file\n \u2514\u2500\u2500 .pdiparmas.info file" + }, + { + "comment": "This code demonstrates how to train and test a model using PaddlePaddle, a popular deep learning framework. The training process involves running multi-card training scripts or tests by executing the `paddle.distributed.launch` command with appropriate arguments such as GPU selection, script path, and optional configuration file. 
The configuration file allows for flexible updates like changing batch sizes on the fly. After starting the training, log files are generated for tracking progress and analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/usage.md\":28-70", + "content": "```\n\n## 1. Train and Test\nStart running multi-cards training scripts or test scripts by `paddle.distributed.launch`, or run the `run.sh` directly.\n```bash\nsh run.sh\n```\nWe put all the start commands in advanced in the ```run.sh```, please uncomment the selected one to run.\n\n### 1.1 Train\nSwitch `--validate` on to validating while training.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n --validate \\\n -c ./configs/example.yaml\n```\nIndicating `-c` to set configuration, and one can flexible add `-o` in the script to update it.\n```bash\npython -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --validate \\\n -o DATASET.batch_size=16\n```\nIndicating `-o DATASET.batch_size=16` can update batch size to 16, please refer to [configuration](tutorials/config.md#config-yaml-details) for more information.\nAfter starting training, log files will generated, " + }, + { + "comment": "The code shows log output format for training and validation phases, including time, epoch, batch ID, metrics, elapse time (execution time), and ips (instances per second). It also displays the best accuracy achieved during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/usage.md\":70-88", + "content": "and its format is shown as below, it will output to both the screen and files. Default destination of log is under the `.log/` folder, and stored in the files named like `worker.0`, `worker.1` ...\n[train phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.:\n [12/28 17:31:26] epoch:[ 1/80 ] train step:0 loss: 0.04656 lr: 0.000100 top1: 1.00000 top5: 1.00000 elapse: 0.326 reader: 0.001s ips: 98.22489 instance/sec.\n[eval phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.:\n [12/28 17:31:32] epoch:[ 80/80 ] val step:0 loss: 0.20538 top1: 0.88281 top5: 0.99219 elapse: 1.589 reader: 0.000s ips: 20.14003 instance/sec.\n[epoch end] current time, metrics, elapse time, ips, etc.\n [12/28 17:31:38] END epoch:80 val loss_avg: 0.52208 top1_avg: 0.84398 top5_avg: 0.97393 elapse_avg: 0.234 reader_avg: 0.000 elapse_sum: 7.021s ips: 136.73686 instance/sec.\n[the best Acc] \n [12/28 17:28:42] Already save the best model (top1 acc)0.8494" + }, + { + "comment": "The code provides instructions on how to use PaddleVideo for three different tasks: resuming a training session, finetuning with pretrained parameters, and testing. In the resume task, the user should indicate \"-o resume_epoch\" to continue from a specific epoch, while in finetuning, \"--weights\" is used to load pretrained parameters. The test mode is activated using \"--test\". 
PaddleVideo will not load unmatched parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/usage.md\":90-131", + "content": "\n### 1.2 Resume\nIndicate `-o resume_epoch` to resume, It will training from ```resume_epoch``` epoch, PaddleVideo will auto load optimizers parameters and checkpoints from `./output` folder, as it is the default output destination.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --validate \\\n -o resume_epoch=5\n```\n\n### 1.3 Finetune\nIndicate `--weights` to load pretrained parameters, PaddleVideo will auto treat it as a finetune mission.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --validate \\\n --weights=./outputs/example/path_to_weights\n```\nNote: PaddleVideo will NOT load shape unmatched parameters.\n\n### 1.4 Test\nSwitch `--test` on to start test mode, and indicate `--weights` to load pretrained model.\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3" + }, + { + "comment": "This code is launching PaddleVideo in distributed mode with four GPUs, running the main.py script with a specified configuration file, and performing testing using weights from a particular path. Then it exports the model for inference by specifying the configuration file, pretrained weights, and output directory. Lastly, it uses the PaddleInference engine to infer a video using the exported model files, input video file, and optional TensorRT acceleration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/usage.md\":133-173", + "content": "python3 -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n main.py \\\n -c ./configs/example.yaml \\\n --test \\\n --weights=./output/example/path_to_weights\n```\n\n## 2. Infer\nFirst, export model.\nIndicate `-c` to set configuration, `-p` to load pretrained model, `-o` to set inference files destination.\n```bash\npython tools/export_model.py \\\n -c ./configs/example.yaml \\\n -p ./output/example/path_to_weights \\\n -o ./inference\n```\nIt will generate `model_name.pdmodel` , `model_name.pdiparams` and `model_name.pdiparames.info`.\nSecond, start PaddleInference engine to infer a video.\n```bash\npython tools/predict.py \\\n --input_file \"data/example.avi\" \\\n --model_file \"./inference/example.pdmodel\" \\\n --params_file \"./inference/example.pdiparams\" \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nAttributes:\n+ `input_file`: input file path or input directory, which contains input files(s).\n+ `model_file`: pdmodel file path.\n+ `params_file`: pdiparams file path.\n+ `use_tensorrt`: use tensorrt to acclerate or not, default: False." + }, + { + "comment": "This code snippet is referring to the `use_gpu` parameter in PaddleVideo, which enables or disables GPU usage for inferencing. The default setting is set to True and benchmark results are available in the [benchmark](./benchmark.md) document.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/usage.md\":174-176", + "content": "+ `use_gpu`: use gpu to infer or not, default: True.\nbenchmark results are shown in th [benchmark](./benchmark.md)." 
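The `-o SECTION.key=value` overrides mentioned in the usage notes above are layered on top of the `-c` YAML file. PaddleVideo's actual override parser is not quoted here, so the following Python sketch is only an illustration of how a dotted override string can be applied to a nested config dict:

```python
import ast

def apply_override(cfg, override):
    """Apply one 'SECTION.key=value' override to a nested dict (illustration only)."""
    path, raw = override.split("=", 1)
    keys = path.split(".")
    node = cfg
    for key in keys[:-1]:
        node = node.setdefault(key, {})
    try:
        value = ast.literal_eval(raw)   # "16" -> 16, "True" -> True, otherwise keep the string
    except (ValueError, SyntaxError):
        value = raw
    node[keys[-1]] = value
    return cfg

cfg = {"DATASET": {"batch_size": 8}}
print(apply_override(cfg, "DATASET.batch_size=16"))   # {'DATASET': {'batch_size': 16}}
```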
+ } + ] +} \ No newline at end of file diff --git a/docs/doc/1ce4b7d2-cf40-4f79-94ce-2aa2e8d987c6.json b/docs/doc/1ce4b7d2-cf40-4f79-94ce-2aa2e8d987c6.json new file mode 100644 index 000000000..19924d59d --- /dev/null +++ b/docs/doc/1ce4b7d2-cf40-4f79-94ce-2aa2e8d987c6.json @@ -0,0 +1,45 @@ +{ + "summary": "The PP-TimeSformer model is an enhanced version of TimeSformer for video recognition tasks, trained on Kinetics-400 dataset and supports multi-GPU. It uses PaddleVideo with Vision Transformer backbone for testing and exports PP-TimeSformer for prediction using a specific config file.", + "details": [ + { + "comment": "This code describes the PP-TimeSformer video classification model, an improved version of the TimeSformer model. It outlines the training, testing, and inference processes, as well as providing data preparation instructions for Kinetics-400 dataset. The table shows the accuracy of different versions of the model on Kinetics-400 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":0-28", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/pp-timesformer.md) | English\n# TimeSformer Video Classification Model\n## Content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe have improved the [TimeSformer model](./timesformer.md) and obtained a more accurate 2D practical video classification model **PP-TimeSformer**. Without increasing the amount of parameters and calculations, the accuracy on the UCF-101, Kinetics-400 and other data sets significantly exceeds the original version. The accuracy on the Kinetics-400 data set is shown in the table below.\n| Version | Top1 |\n| :------ | :----: |\n| Ours ([swa](#refer-anchor-1)+distill+16frame) | 79.44 |\n| Ours ([swa](#refer-anchor-1)+distill) | 78.87 |\n| Ours ([swa](#refer-anchor-1)) | **78.61** |\n| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/timesformer#kinetics-400) | 77.92 |\n## Data\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)" + }, + { + "comment": "This code snippet explains how to download and prepare data for training a video recognition model. It mentions the required data sets, pre-trained models, and the specific commands to download and configure them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":30-57", + "content": "UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [ViT_base_patch16_224_miil_21k.pdparams](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through wget command\n ```bash\n wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\n ```\n2. 
Open `PaddleVideo/configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"VisionTransformer_tweaks\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:" + }, + { + "comment": "This code runs PaddlePaddle's Timesformer model for video recognition using a specific configuration file. It uses multiple GPUs and supports AMP mixed-precision training for faster processing. The script is customizable, allowing you to train or test on different datasets by modifying the configuration file's name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":59-74", + "content": " ```bash\n # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/ pptimesformer/pptimesformer_k400_videos.yaml\n ```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 # MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --amp --validate -c configs /recognition/pptimesformer/pptimesformer_k400_videos.yaml\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage." + }, + { + "comment": "The PP-TimeSformer model is tested during training, and the best test accuracy can be found in the log with keyword \"best\". However, the verification index recorded in the log may not represent the final test score, so a separate testing script should be used to obtain the accurate result. Two such scripts are provided for 8-frames and 16-frames testing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":77-91", + "content": "## Test\n- The PP-TimeSformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:\n ```\n Already save the best model (top1 acc)0.7258\n ```\n- Because the sampling method of the PP-TimeSformer model test mode is a slightly slower but higher accuracy **UniformCrop**, which is different from the **RandomCrop** used in the verification mode during the training process, so the verification index recorded in the training log` topk Acc` does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. 
The command is as follows:\n ```bash\n # 8-frames testing script\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --test -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml -w \"output/ppTimeSformer/ppTimeSformer_best.pdparams\"\n # 16-frames testing script" + }, + { + "comment": "This code is launching the PaddleVideo model for testing using Vision Transformer backbone with UniformCrop sampling method and 8 segments. It's running on multiple GPUs and using a specific configuration file, yaml, to set parameters like backbone, sampling method, number of segments, target size, and checkpoint file. The resulting test indicators are presented in tabular format for Kinetics-400 validation dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":92-107", + "content": " python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --test \\\n -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \\\n -o MODEL.backbone.num_seg=16 \\\n -o MODEL.runtime_cfg.test.num_seg=16 \\\n -o PIPELINE.test.decode.num_seg=16 \\\n -o PIPELINE.test.sample.num_seg=16 \\\n -w \"data/ppTimeSformer_k400_16f_distill.pdparams\"\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: |\n | Vision Transformer | UniformCrop | 8 | 224 | 78.61 | [ppTimeSformer_k400_8f.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f.pdparams) |\n | Vision Transformer | UniformCrop | 8 | 224 | " + }, + { + "comment": "The code snippet is exporting the PP-TimeSformer model for video recognition. The model uses linspace sampling strategy, uniformly generating sparse sampling points in time and space to create one clip from a single video. The command uses Python script `export_model.py`, with config file `configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml` and model parameters file `data/ppTimeSformer_k400_8f.pdparams`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":107-119", + "content": "78.87 | [ppTimeSformer_k400_8f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f_distill.pdparams) |\n | Vision Transformer | UniformCrop | 16 | 224 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) |\n- During the test, the PP-TimeSformer video sampling strategy is to use linspace sampling: in time sequence, from the first frame to the last frame of the video sequence to be sampled, `num_seg` sparse sampling points (including endpoints) are uniformly generated; spatially , Select 3 areas to sample at both ends of the long side and the middle position (left, middle, right or top, middle, and bottom). 
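The temporal half of the linspace sampling strategy described here — `num_seg` uniformly spaced frame indices including both endpoints — can be sketched in a few lines of NumPy; the frame count below is a made-up example:

```python
import numpy as np

def uniform_sample_indices(num_frames, num_seg):
    """num_seg uniformly spaced frame indices from the first to the last frame, endpoints included."""
    return np.linspace(0, num_frames - 1, num_seg).round().astype(int)

print(uniform_sample_indices(250, 8))    # [  0  36  71 107 142 178 213 249]
print(uniform_sample_indices(250, 16))
```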
A total of 1 clip is sampled for 1 video.\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \\\n -p data/ppTimeSformer_k400_8f.pdparams \\" + }, + { + "comment": "This code is for inference using PaddlePaddle's ppTimeSformer model. The command generates the required model structure and weight files for prediction and then executes the predict.py script with the given input file, configuration, model files, and parameters. It displays the top-1 class and score for the video file provided, trained on Kinetics 400 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":120-146", + "content": " -o inference/ppTimeSformer\n```\nThe above command will generate the model structure file `ppTimeSformer.pdmodel` and the model weight file `ppTimeSformer.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../start.md#2-Model Reasoning)\n### Use predictive engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \\\n --model_file inference/ppTimeSformer/ppTimeSformer.pdmodel \\\n --params_file inference/ppTimeSformer/ppTimeSformer.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nThe output example is as follows:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9997474551200867\n```\nIt can be seen that using the ppTimeSformer model trained on Ki" + }, + { + "comment": "This code snippet is discussing the prediction of a category name using the PP-Timesformer model, specifically for predicting the content of `data/example.avi`. The predicted category id is 5 and its corresponding category name is \"archery\". This information is derived from the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`. The code provides references to several related papers which have influenced or been used in this model's development.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md\":146-155", + "content": "netics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By referring to the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.\n## Reference\n- [Is Space-TimeAttention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani\n- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean\n
\n- [Averaging Weights Leads to Wider Optima and Better Generalization](https://arxiv.org/abs/1803.05407v3), Pavel Izmailov, Dmitrii Podoprikhin, Timur Garipov\n- [ImageNet-21K Pretraining for the Masses](https://arxiv.org/pdf/2104.10972v4.pdf), Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1d6e284b-2445-4be9-9b0a-d6056e4139c4.json b/docs/doc/1d6e284b-2445-4be9-9b0a-d6056e4139c4.json new file mode 100644 index 000000000..c71beb6db --- /dev/null +++ b/docs/doc/1d6e284b-2445-4be9-9b0a-d6056e4139c4.json @@ -0,0 +1,20 @@ +{ + "summary": "The code is part of the PaddleVideo framework's VideoTag application, which imports libraries, sets up a logger, and handles YAML configuration. It logs each key-value pair in the config file, separated by dashed lines.", + "details": [ + { + "comment": "This code snippet is part of the PaddleVideo framework's VideoTag application. It imports yaml and AttrDict from utility, sets up a logger for logging messages, defines four configuration section strings, and provides two functions: parse_config() to load config files into an AttrDict object and create_attr_dict() to create an AttrDict object with the specified attributes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/config_utils.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport yaml\nfrom .utility import AttrDict\nimport logging\nlogger = logging.getLogger(__name__)\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n import yaml\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef create_attr_dict(yaml_config):" + }, + { + "comment": "This code includes three functions. The first function, `config_utils.py`, is for processing the yaml configuration by converting certain types into AttrDicts and evaluating string values. The second function, `merge_configs()`, merges argument dictionaries with pre-existing config section dictionaries. It skips None values and attempts to set new attributes. 
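A small usage sketch of the parse/merge pattern described here: the real `AttrDict` is defined in `utils/utility.py` and is not quoted above, so the attribute-access dict below is an assumption, and the merge loop only mirrors the described skip-None/overwrite behaviour:

```python
import yaml

class AttrDict(dict):
    """Attribute-access dict; the real AttrDict lives in utils/utility.py (not quoted here),
    so this minimal version is an assumption used only for the demo below."""
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as exc:
            raise AttributeError(key) from exc
    def __setattr__(self, key, value):
        self[key] = value

yaml_text = """
TRAIN:
    batch_size: 8
    epoch: 45
"""
cfg = AttrDict({k: AttrDict(v) for k, v in yaml.safe_load(yaml_text).items()})

# merge_configs-style update: skip None values, overwrite keys that already exist in the section.
args = {"batch_size": 16, "use_gpu": None}
for k, v in args.items():
    if v is not None and hasattr(cfg.TRAIN, k):
        setattr(cfg.TRAIN, k, v)
print(cfg.TRAIN.batch_size)   # 16
```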
Finally, the third function, `print_configs()`, prints configuration arguments in a formatted manner.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/config_utils.py\":37-72", + "content": " from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\n return\ndef merge_configs(cfg, sec, args_dict):\n assert sec in CONFIG_SECS, \"invalid config section {}\".format(sec)\n sec_dict = getattr(cfg, sec.upper())\n for k, v in args_dict.items():\n if v is None:\n continue\n try:\n if hasattr(sec_dict, k):\n setattr(sec_dict, k, v)\n except:\n pass\n return cfg\ndef print_configs(cfg, mode):\n logger.info(\n \"---------------- {:>5} Arguments ----------------\".format(mode))\n for sec, sec_items in cfg.items():\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():" + }, + { + "comment": "The code is logging information for each key-value pair in the configuration file, and then separating each set of logs with a dashed line.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/config_utils.py\":73-74", + "content": " logger.info(\" {}:{}\".format(k, v))\n logger.info(\"-------------------------------------------------\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1e0a27ad-0bb8-4301-b8c2-7f1936b08d23.json b/docs/doc/1e0a27ad-0bb8-4301-b8c2-7f1936b08d23.json new file mode 100644 index 000000000..8b252e429 --- /dev/null +++ b/docs/doc/1e0a27ad-0bb8-4301-b8c2-7f1936b08d23.json @@ -0,0 +1,45 @@ +{ + "summary": "This code accelerates PaddleServing installation with Docker, supports Linux and GPU, simplifies action recognition service deployment, and provides a C++ serving environment setup guide.", + "details": [ + { + "comment": "This code introduces the installation process for PaddleServing. It uses Docker to pull a GPU-based docker environment and creates a Serving-based Docker named \"test\". The port 9292 is mapped to access the serving environment, and this setup supports Linux platforms, with Windows currently unsupported.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":0-16", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](./readme.md)\n# Model service deployment\n## Introduction\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) aims to help deep learning developers easily deploy online prediction services, support one-click deployment of industrial-grade service capabilities, high concurrency between client and server Efficient communication and support for developing clients in multiple programming languages.\nThis section takes the HTTP prediction service deployment as an example to introduce how to use PaddleServing to deploy the model service in PaddleVideo. Currently, only Linux platform deployment is supported, and Windows platform is not currently supported.\n## Serving installation\nThe Serving official website recommends using docker to install and deploy the Serving environment. 
First, you need to pull the docker environment and create a Serving-based docker.\n```bash\n# start GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash" + }, + { + "comment": "Code installs necessary packages for PaddlePaddle serving client, app, server (CPU/GPU) and PaddlePaddle (CPU/GPU) in a Docker container using pip. The GPU versions are specified with different CUDA and TensorRT versions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":17-40", + "content": "nvidia-docker exec -it test bash\n# start CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\nAfter entering docker, you need to install Serving-related python packages.\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\n#If it is a CPU deployment environment:\npython3.7 -m pip install paddle-serving-server==0.7.0 #CPU\npython3.7 -m pip install paddlepaddle==2.2.0 # CPU\n#If it is a GPU deployment environment\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#Other GPU environments need to confirm the environment and then choose which one to execute\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8" + }, + { + "comment": "This code snippet provides instructions for speeding up the installation process and deploying an action recognition service using PaddleServing. It explains how to convert a saved inference model into a Serving model, using PP-TSM as an example.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":41-64", + "content": "```\n* If the installation speed is too slow, you can change the source through `-i https://pypi.tuna.tsinghua.edu.cn/simple` to speed up the installation process.\n* For more environment and corresponding installation packages, see: https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## Action recognition service deployment\n### Model conversion\nWhen using PaddleServing for service deployment, you need to convert the saved inference model into a Serving model. The following uses the PP-TSM model as an example to introduce how to deploy the action recognition service.\n- Download PP-TSM inference model and convert to Serving model:\n ```bash\n # Enter PaddleVideo directory\n cd PaddleVideo\n # Download the inference model and extract it to ./inference\n mkdir ./inference\n pushd ./inference\n wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip\n unzip ppTSM.zip\n popd\n # Convert to Serving model\n pushd deploy/cpp_serving\n python3.7 -m paddle_serving_client.convert \\" + }, + { + "comment": "The code is specifying the directory, model filename, and parameters filename for a PaddleVideo inference program conversion. It also sets the serving server and client executables to be used after the conversion. The `dirname` parameter holds the storage path of the converted model files. 
If no specific filenames are provided (model_filename or params_filename), the code defaults to \"None\" which will use default filenames (\"__model__\" and None respectively). The serving server and client executables are specified in the code to be used after the conversion process, allowing the model to be served for inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":65-78", + "content": " --dirname ../../inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./ppTSM_serving_server \\\n --serving_client ./ppTSM_serving_client\n popd\n ```\n | parameter | type | default value | description |\n | ----------------- | ---- | ------------------ | ------- -------------------------------------------------- --- |\n | `dirname` | str | - | The storage path of the model file to be converted. The program structure file and parameter file are saved in this directory. |\n | `model_filename` | str | None | The name of the file storing the model Inference Program structure that needs to be converted. If set to None, use `__model__` as the default filename |\n | `params_filename` | str | None | File name where all parameters of the model to be converted are stored. It needs to be specified if and only if all model parameters are stored in a single binary file. If the model parameters are stored in separate files, set it to None |\n | `serving_" + }, + { + "comment": "The code specifies two paths, \"serving_server\" and \"serving_client\", representing the storage locations for model files and configuration files. After model conversion, it generates two folders with associated file formats in the specified folder. Upon obtaining the model files, modify two specific text files to change `alias_name` under `fetch_var`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":78-93", + "content": "server` | str | `\"serving_server\"` | The storage path of the converted model files and configuration files. Default is serving_server |\n | `serving_client` | str | `\"serving_client\"` | The converted client configuration file storage path. Default is serving_client |\n- After the inference model conversion is completed, two folders, `ppTSM_serving_client` and `ppTSM_serving_server` will be generated under the `deploy/cpp_serving` folder, with the following formats:\n ```bash\n PaddleVideo/deploy/cpp_serving\n \u251c\u2500\u2500 ppTSM_serving_client\n \u2502 \u251c\u2500\u2500 serving_client_conf.prototxt\n \u2502 \u2514\u2500\u2500 serving_client_conf.stream.prototxt\n \u2514\u2500\u2500 ppTSM_serving_server\n \u251c\u2500\u2500 ppTSM.pdiparams\n \u251c\u2500\u2500 ppTSM.pdmodel\n \u251c\u2500\u2500 serving_server_conf.prototxt\n \u2514\u2500\u2500 serving_server_conf.stream.prototxt\n ```\n After getting the model file, you need to modify `serving_client_conf.prototxt` under `ppTSM_serving_client` and `serving_server_conf.prototxt` under `ppTSM_serving_server` respectively, and change `alias_name` under `fetch_var` in both files to `outputs`" + }, + { + "comment": "This code demonstrates a rename function for compatibility in model deployment. The modified `serving_server_conf.prototxt` shows how to alias the input and output names in the configuration file. This allows different models to be inferred and deployed without modifying the code, only by altering the `alias_name`. 
The `cpp_serving` directory contains scripts for starting the pipeline service, C++ serving service, and sending prediction requests.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":95-121", + "content": " **Remarks**: In order to be compatible with the deployment of different models, Serving provides the function of input and output renaming. In this way, when different models are inferred and deployed, they only need to modify the `alias_name` of the configuration file, and the inference deployment can be completed without modifying the code.\n The modified `serving_server_conf.prototxt` looks like this:\n ```yaml\n feed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n }\n fetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false\n fetch_type: 1\n shape: 400\n }\n ```\n### Service deployment and requests\nThe `cpp_serving` directory contains the code for starting the pipeline service, the C++ serving service and sending the prediction request, including:\n ```bash\n run_cpp_serving.sh # Start the script on the C++ serving server side\n pipeline_http_client.py # The script on the client side to send data and get the prediction results" + }, + { + "comment": "This code provides instructions for setting up and running a C++ serving environment for PaddleVideo. It explains how to navigate to the deployment directory, start the service, send requests using serving_client.py, and obtain the model prediction results. If an error occurs during execution, it will display the corresponding log information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":122-151", + "content": " paddle_env_install.sh # Install C++ serving environment script\n preprocess_ops.py # file to store preprocessing functions\n ```\n#### C++ Serving\n- Go to the working directory:\n ```bash\n cd deploy/cpp_serving\n ```\n- Start the service:\n ```bash\n # Start in the background, the logs printed during the process will be redirected and saved to nohup.txt\n bash run_cpp_serving.sh\n ```\n- Send the request and get the result:\n```bash\npython3.7 serving_client.py \\\n-n PPTSM \\\n-c ./ppTSM_serving_client/serving_client_conf.prototxt \\\n--input_file=../../data/example.avi\n```\nAfter a successful run, the results of the model prediction will be printed in the cmd window, and the results are as follows:\n ```bash\n I0510 04:33:00.110025 37097 naming_service_thread.cpp:202] brpc::policy::ListNamingService(\"127.0.0.1:9993\"): added 1\n I0510 04:33:01.904764 37097 general_model.cpp:490] [client]logid=0,client_cost=1640.96ms,server_cost=1623.21ms.\n {'class_id': '[5]', 'prob': '[0.9907387495040894]'}\n ```\n**If an error is re" + }, + { + "comment": "This code provides instructions for installing the necessary environment and resolving an issue where no result is returned or a decoding error occurs due to proxy settings. Users are advised not to set proxies when starting the service and sending requests, and should use the provided commands to close proxies beforehand. 
The script `paddle_env_install.sh` can be executed to install relevant environment requirements.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme_en.md\":151-164", + "content": "ported during the process and it shows that libnvinfer.so.6 cannot be found, you can execute the script `paddle_env_install.sh` to install the relevant environment**\n ```bash\n bash paddle_env_install.sh\n ```\n## FAQ\n**Q1**: No result is returned after the request is sent or an output decoding error is prompted\n**A1**: Do not set the proxy when starting the service and sending the request. You can close the proxy before starting the service and sending the request. The command to close the proxy is:\n```\nunset https_proxy\nunset http_proxy\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1e54cef0-a235-46a7-ab45-c9ac47e4b976.json b/docs/doc/1e54cef0-a235-46a7-ab45-c9ac47e4b976.json new file mode 100644 index 000000000..c768b7482 --- /dev/null +++ b/docs/doc/1e54cef0-a235-46a7-ab45-c9ac47e4b976.json @@ -0,0 +1,25 @@ +{ + "summary": "The code introduces a VideoMix operator for data augmentation in image classification tasks, using mixup and cutmix operations with controllable parameters.", + "details": [ + { + "comment": "Mixup class implements a mixup operator for PaddleVideo. It takes an alpha value as input and ensures it is greater than 0. The __call__ method takes a batch of images and labels, combines them with random weights determined by the alpha value, and returns the mixed up image batch and label batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/mix.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass Mixup(object):\n \"\"\"\n Mixup operator.\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)" + }, + { + "comment": "The code defines a Cutmix class for a mixup operator. It takes an alpha value as input, and randomly generates new images by cutting out a part of the original image and pasting it on top of another image, with alpha value determining the ratio of the two. 
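As a small worked example of the area bookkeeping involved (the numbers are illustrative, not taken from the repository):
```python
import numpy as np

lam = 0.7                                   # sampled from Beta(alpha, alpha)
w = h = 224
cut_rat = np.sqrt(1.0 - lam)                # ~0.548
cut_w, cut_h = int(w * cut_rat), int(h * cut_rat)   # a 122 x 122 cutout box
# after pasting, the mixing weight is recomputed from the actual box area
lam_adjusted = 1 - (cut_w * cut_h) / (w * h)        # ~0.70
```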
The function rand_bbox is used to determine the dimensions and location of the cutout box.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/mix.py\":34-69", + "content": " labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n lams = np.array([lam] * bs, dtype=np.float32)\n imgs = lam * imgs + (1 - lam) * imgs[idx]\n return list(zip(imgs, labels, labels[idx], lams))\n@PIPELINES.register()\nclass Cutmix(object):\n \"\"\" Cutmix operator\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def rand_bbox(self, size, lam):\n \"\"\" rand_bbox \"\"\"\n w = size[2]\n h = size[3]\n cut_rat = np.sqrt(1. - lam)\n cut_w = np.int(w * cut_rat)\n cut_h = np.int(h * cut_rat)\n # uniform\n cx = np.random.randint(w)\n cy = np.random.randint(h)\n bbx1 = np.clip(cx - cut_w // 2, 0, w)\n bby1 = np.clip(cy - cut_h // 2, 0, h)\n bbx2 = np.clip(cx + cut_w // 2, 0, w)\n bby2 = np.clip(cy + cut_h // 2, 0, h)" + }, + { + "comment": "This code defines a VideoMix operator that performs data augmentation by either mixing or cutting images from different samples in the batch. The mixup_alpha and cutmix_alpha parameters control the degree of blending between samples, while the cutmix_prob parameter determines the probability of applying the cutmix operation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/mix.py\":71-102", + "content": " return bbx1, bby1, bbx2, bby2\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)\n labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)\n imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]\n lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /\n (imgs.shape[-2] * imgs.shape[-1]))\n lams = np.array([lam] * bs, dtype=np.float32)\n return list(zip(imgs, labels, labels[idx], lams))\n@PIPELINES.register()\nclass VideoMix(object):\n \"\"\"\n VideoMix operator.\n Args:\n cutmix_prob(float): prob choose cutmix\n mixup_alpha(float): alpha for mixup aug\n cutmix_alpha(float): alpha for cutmix aug\n \"\"\"\n def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0):\n assert cutmix_prob > 0., \\\n 'parameter cutmix_prob[%f] should > 0.0' % (cutmix_prob)" + }, + { + "comment": "This code asserts that mixup_alpha and cutmix_alpha are greater than 0.0, sets the cutmix_prob, creates Mixup and Cutmix objects with the provided alphas, and defines a __call__ method to randomly choose between applying either Mixup or Cutmix to the batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/mix.py\":103-115", + "content": " assert mixup_alpha > 0., \\\n 'parameter mixup_alpha[%f] should > 0.0' % (mixup_alpha)\n assert cutmix_alpha > 0., \\\n 'parameter cutmix_alpha[%f] should > 0.0' % (cutmix_alpha)\n self.cutmix_prob = cutmix_prob\n self.mixup = Mixup(mixup_alpha)\n self.cutmix = Cutmix(cutmix_alpha)\n def __call__(self, batch):\n if np.random.random() < self.cutmix_prob:\n return self.cutmix(batch)\n else:\n return self.mixup(batch)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1e9290c5-1d3a-441b-9232-fa75817c48a4.json b/docs/doc/1e9290c5-1d3a-441b-9232-fa75817c48a4.json new file 
mode 100644 index 000000000..1fa0c47d0 --- /dev/null +++ b/docs/doc/1e9290c5-1d3a-441b-9232-fa75817c48a4.json @@ -0,0 +1,15 @@ +{ + "summary": "The `weight_init_` function initializes layer weights in PaddlePaddle with custom functions, supporting various initialization types such as Xavier and Uniform.", + "details": [ + { + "comment": "This function, `weight_init_`, initializes the weights of a PaddlePaddle layer with user-defined functions. The function takes in a layer, an initialization function, optional weight and bias names, and additional keyword arguments. It performs an in-place parameter initialization and supports various types of initialization functions such as Xavier, Uniform, and others.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py\":0-35", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nimport paddle.nn.initializer as init\ndef weight_init_(layer,\n func,\n weight_name=None,\n bias_name=None,\n bias_value=0.0,\n **kwargs):\n \"\"\"\n In-place params init function.\n Usage:\n .. code-block:: python\n import paddle\n import numpy as np\n data = np.ones([3, 4], dtype='float32')\n linear = paddle.nn.Linear(4, 4)" + }, + { + "comment": "This code initializes the weights and biases of a neural network layer using the PaddlePaddle framework. It checks if the layer has weight and bias attributes, then applies weight initialization functions and potentially overrides their names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py\":36-54", + "content": " input = paddle.to_tensor(data)\n print(linear.weight)\n linear(input)\n weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1)\n print(linear.weight)\n \"\"\"\n if hasattr(layer, 'weight') and layer.weight is not None:\n getattr(init, func)(**kwargs)(layer.weight)\n if weight_name is not None:\n # override weight name\n layer.weight.name = weight_name\n if hasattr(layer, 'bias') and layer.bias is not None:\n init.Constant(bias_value)(layer.bias)\n if bias_name is not None:\n # override bias name\n layer.bias.name = bias_name" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1ef7a7d7-f557-4529-95b5-1a41b11cb022.json b/docs/doc/1ef7a7d7-f557-4529-95b5-1a41b11cb022.json new file mode 100644 index 000000000..2a69eeb47 --- /dev/null +++ b/docs/doc/1ef7a7d7-f557-4529-95b5-1a41b11cb022.json @@ -0,0 +1,150 @@ +{ + "summary": "The code defines ResNetSlowFast and SlowFast models for video recognition and computer vision tasks, respectively, with separate pathways for slow and fast processing using 3D convolutional layers and multi-pathway models.", + "details": [ + { + "comment": "Copyright notice and license information for the code. 
Imports necessary modules, defines function to get convolutional layer initialization parameters, and a function to set batch normalization layer parameters. No actual model or functionality defined yet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":0-32", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import KaimingNormal\nfrom ..registry import BACKBONES\nfrom paddlevideo.utils.multigrid import get_norm\nimport sys\nimport numpy as np\nimport paddle.distributed as dist\n# seed random seed\npaddle.framework.seed(0)\n# get init parameters for conv layer\ndef get_conv_init(fan_out):\n return KaimingNormal(fan_in=fan_out)\ndef get_bn_param_attr(bn_weight=1.0, coeff=0.0):" + }, + { + "comment": "This code defines a BottleneckTransform class in PaddleVideo for video models. It performs Tx1x1, 1x3x3, 1x1x1 transformations with variable temporal kernel sizes. The constructor takes in arguments like dim_in, dim_out, temp_kernel_size, stride, and more to configure the transformation layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":33-65", + "content": " param_attr = paddle.ParamAttr(\n initializer=paddle.nn.initializer.Constant(bn_weight),\n regularizer=paddle.regularizer.L2Decay(coeff))\n return param_attr\n\"\"\"Video models.\"\"\"\nclass BottleneckTransform(paddle.nn.Layer):\n \"\"\"\n Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of\n temporal kernel.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck." + }, + { + "comment": "This code defines a class called BottleneckTransform with parameters such as dim_in, dim_out, stride, dim_inner, num_groups, and dilation. It also has attributes like _inplace_relu, _eps, and norm_module for various operations and settings. The _construct method is used to initialize the class with these parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":66-86", + "content": " dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. 
num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): if True, calculate the relu on the original\n input without allocating new memory.\n eps (float): epsilon for batch norm.\n dilation (int): size of dilation.\n \"\"\"\n super(BottleneckTransform, self).__init__()\n self.temp_kernel_size = temp_kernel_size\n self._inplace_relu = inplace_relu\n self._eps = eps\n self._stride_1x1 = stride_1x1\n self.norm_module = norm_module\n self._construct(dim_in, dim_out, stride, dim_inner, num_groups,\n dilation)\n def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups," + }, + { + "comment": "Defines a Conv3D layer for the ResNet_SlowFast backbone, with specified dimensions and stride. Initializes Conv3D weights using get_conv_init function and includes batch normalization (BN) and ReLU activation layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":87-112", + "content": " dilation):\n str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride)\n fan = (dim_inner) * (self.temp_kernel_size * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.a = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_inner,\n kernel_size=[self.temp_kernel_size, 1, 1],\n stride=[1, str1x1, str1x1],\n padding=[int(self.temp_kernel_size // 2), 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.a_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x3x3, BN, ReLU.\n fan = (dim_inner) * (1 * 3 * 3)\n initializer_tmp = get_conv_init(fan)\n self.b = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_inner," + }, + { + "comment": "This code defines a 3D convolutional layer with specific kernel sizes, strides, padding, and grouping. It also includes batch normalization layers for the intermediate and output features. The initializer functions are used to set the weights of each layer, with different initializers for different layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":113-138", + "content": " kernel_size=[1, 3, 3],\n stride=[1, str3x3, str3x3],\n padding=[0, dilation, dilation],\n groups=num_groups,\n dilation=[1, dilation, dilation],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.b_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x1x1, BN.\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.c = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_out,\n kernel_size=[1, 1, 1],\n stride=[1, 1, 1],\n padding=[0, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.c_bn = self.norm_module(\n num_features=dim_out," + }, + { + "comment": "ResNetSlowFast forward function performs convolutions and Batch Normalization for each branch (2a, 2b, 2c), then applies ReLU activation. 
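A hedged shape sketch of one such bottleneck, assuming the class can be imported from `paddlevideo.modeling.backbones.resnet_slowfast` and using illustrative sizes:
```python
import paddle
from paddlevideo.modeling.backbones.resnet_slowfast import BottleneckTransform

block = BottleneckTransform(dim_in=64, dim_out=256, temp_kernel_size=1,
                            stride=1, dim_inner=64, num_groups=1)
x = paddle.randn([1, 64, 4, 56, 56])   # N, C, T, H, W
y = block(x)                           # branches a/b/c -> [1, 256, 4, 56, 56]
```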
ResBlock is a layer implementing residual blocks with specified dimensions, stride, inner dimension, groups, dilation, and normalization method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":139-179", + "content": " epsilon=self._eps,\n weight_attr=get_bn_param_attr(bn_weight=0.0),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n # Branch2a.\n x = self.a(x)\n x = self.a_bn(x)\n x = F.relu(x)\n # Branch2b.\n x = self.b(x)\n x = self.b_bn(x)\n x = F.relu(x)\n # Branch2c\n x = self.c(x)\n x = self.c_bn(x)\n return x\nclass ResBlock(paddle.nn.Layer):\n \"\"\"\n Residual block.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups=1,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n ResBlock class constructs redisual blocks. More details can be found in:\n Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.\n \"Deep residual learning for image recognition.\"" + }, + { + "comment": "This code defines the arguments for constructing a ResNet bottleneck. It includes parameters for input and output channel dimensions, temporal kernel size, stride, transform function, inner dimension, number of groups for convolution, whether to apply stride to 1x1 or 3x3 conv, inplace_relu flag, and epsilon for batch normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":180-197", + "content": " https://arxiv.org/abs/1512.03385\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck.\n trans_func (string): transform function to be used to construct the\n bottleneck.\n dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): calculate the relu on the original input\n without allocating new memory.\n eps (float): epsilon for batch norm." + }, + { + "comment": "The code defines a ResBlock class, which is a residual block used in deep neural networks. It initializes the block with input and output dimensions, kernel size, stride, inner dimension, number of groups, and skip connection settings. 
The constructor method _construct creates a 3D convolution layer for the skip connection if there is a change in dimensions or stride.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":198-236", + "content": " dilation (int): size of dilation.\n \"\"\"\n super(ResBlock, self).__init__()\n self._inplace_relu = inplace_relu\n self._eps = eps\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n # Use skip connection with projection if dim or res change.\n if (dim_in != dim_out) or (stride != 1):\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.branch1 = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=1,\n stride=[1, stride, stride]," + }, + { + "comment": "This code defines a ResNet SlowFast backbone model. It includes convolution layers, batch normalization layers, and BottleneckTransform modules. The forward function checks if the \"branch1\" attribute exists to handle different stages of the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":237-259", + "content": " padding=0,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False,\n dilation=1)\n self.branch1_bn = self.norm_module(\n num_features=dim_out,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n self.branch2 = BottleneckTransform(dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation,\n norm_module=self.norm_module)\n def forward(self, x):\n if hasattr(self, \"branch1\"):" + }, + { + "comment": "The code defines a ResNet stage for multi-pathway (SlowFast) cases in video recognition. It takes one or more tensors as input and applies branching to separate paths with different kernel sizes. The output is added together, passed through ReLU activation, and returned. This stage supports 1x1 stride option and uses BatchNorm3D for normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":260-293", + "content": " x1 = self.branch1(x)\n x1 = self.branch1_bn(x1)\n x2 = self.branch2(x)\n x = paddle.add(x=x1, y=x2)\n else:\n x2 = self.branch2(x)\n x = paddle.add(x=x, y=x2)\n x = F.relu(x)\n return x\nclass ResStage(paddle.nn.Layer):\n \"\"\"\n Stage of 3D ResNet. It expects to have one or more tensors as input for\n multi-pathway (SlowFast) cases. 
More details can be found here:\n Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n stride,\n temp_kernel_sizes,\n num_blocks,\n dim_inner,\n num_groups,\n num_block_temp_kernel,\n dilation,\n stride_1x1=False,\n inplace_relu=True,\n norm_module=paddle.nn.BatchNorm3D):" + }, + { + "comment": "The ResStage class constructor takes several lists as arguments to build p streams of pathways, controlling input and output dimensions, temporal kernel sizes, strides, and block numbers for each pathway.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":294-311", + "content": " \"\"\"\n The `__init__` method of any subclass should also contain these arguments.\n ResStage builds p streams, where p can be greater or equal to one.\n Args:\n dim_in (list): list of p the channel dimensions of the input.\n Different channel dimensions control the input dimension of\n different pathways.\n dim_out (list): list of p the channel dimensions of the output.\n Different channel dimensions control the input dimension of\n different pathways.\n temp_kernel_sizes (list): list of the p temporal kernel sizes of the\n convolution in the bottleneck. Different temp_kernel_sizes\n control different pathway.\n stride (list): list of the p strides of the bottleneck. Different\n stride control different pathway.\n num_blocks (list): list of p numbers of blocks for each of the\n pathway.\n dim_inner (list): list of the p inner channel dimensions of the" + }, + { + "comment": "This code defines a ResStage class for a residual block. It takes input dimensions and channel dimensions as parameters, and initializes the number of blocks and temporal kernel sizes based on these inputs. The code also ensures that the provided number of block temporary kernel sizes does not exceed the specified number of blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":312-329", + "content": " input. Different channel dimensions control the input dimension\n of different pathways.\n num_groups (list): list of number of p groups for the convolution.\n num_groups=1 is for standard ResNet like networks, and\n num_groups>1 is for ResNeXt like networks.\n num_block_temp_kernel (list): extent the temp_kernel_sizes to\n num_block_temp_kernel blocks, then fill temporal kernel size\n of 1 for the rest of the layers.\n dilation (list): size of dilation for each pathway.\n \"\"\"\n super(ResStage, self).__init__()\n assert all((num_block_temp_kernel[i] <= num_blocks[i]\n for i in range(len(temp_kernel_sizes))))\n self.num_blocks = num_blocks\n self.temp_kernel_sizes = [\n (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] +\n [1] * (num_blocks[i] - num_block_temp_kernel[i])\n for i in range(len(temp_kernel_sizes))" + }, + { + "comment": "The code initializes a ResNet SlowFast model by creating instances of blocks based on given parameters. It ensures that the input and output dimensions are correctly set, creates the desired number of pathways, and applies the specified norm module. 
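A small worked example of the temporal-kernel expansion performed here, using values typical of a two-pathway stage (purely illustrative):
```python
temp_kernel_sizes = [[1], [3]]      # slow pathway, fast pathway
num_blocks = [3, 3]
num_block_temp_kernel = [3, 3]

expanded = [
    (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]]
    + [1] * (num_blocks[i] - num_block_temp_kernel[i])
    for i in range(len(temp_kernel_sizes))
]
print(expanded)   # [[1, 1, 1], [3, 3, 3]]
```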
The constructor then iterates over each pathway and block, creating ResBlock instances with appropriate sizes and configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":330-371", + "content": " ]\n assert (len({\n len(dim_in),\n len(dim_out),\n len(temp_kernel_sizes),\n len(stride),\n len(num_blocks),\n len(dim_inner),\n len(num_groups),\n len(num_block_temp_kernel),\n }) == 1)\n self.num_pathways = len(self.num_blocks)\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n for pathway in range(self.num_pathways):\n for i in range(self.num_blocks[pathway]):\n res_block = ResBlock(\n dim_in[pathway] if i == 0 else dim_out[pathway],\n dim_out[pathway],\n self.temp_kernel_sizes[pathway][i]," + }, + { + "comment": "This code defines a ResNet backbone with slow-fast pathways, which includes residual blocks and basic stem modules. The `forward` method processes inputs from each pathway and returns the outputs as a list. It uses getattr to access the correct residual block module for each iteration in each pathway.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":372-403", + "content": " stride[pathway] if i == 0 else 1,\n dim_inner[pathway],\n num_groups[pathway],\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation[pathway],\n norm_module=self.norm_module)\n self.add_sublayer(\"pathway{}_res{}\".format(pathway, i),\n res_block)\n def forward(self, inputs):\n output = []\n for pathway in range(self.num_pathways):\n x = inputs[pathway]\n for i in range(self.num_blocks[pathway]):\n m = getattr(self, \"pathway{}_res{}\".format(pathway, i))\n x = m(x)\n output.append(x)\n return output\nclass ResNetBasicStem(paddle.nn.Layer):\n \"\"\"\n ResNe(X)t 3D stem module.\n Performs spatiotemporal Convolution, BN, and Relu following by a\n spatiotemporal pooling.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out," + }, + { + "comment": "This code defines a class for ResNet basic stem module with options to specify kernel, stride, padding, and batch normalization. It initializes the Conv3D layer and BatchNorm3D module based on the specified parameters. 
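A hedged shape sketch with illustrative sizes (the 1x7x7 kernel and stride 2 mirror the slow-pathway stem; the import path is assumed from the file location):
```python
import paddle
from paddlevideo.modeling.backbones.resnet_slowfast import ResNetBasicStem

stem = ResNetBasicStem(dim_in=3, dim_out=64, kernel=[1, 7, 7],
                       stride=[1, 2, 2], padding=[0, 3, 3])
y = stem(paddle.randn([1, 3, 4, 224, 224]))
# the conv halves H/W to 112, the 1x3x3 max-pool halves again -> [1, 64, 4, 56, 56]
```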
The constructor also calls the _construct_stem method to further initialize the Conv3D layer and BatchNorm3D module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":404-431", + "content": " kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n super(ResNetBasicStem, self).__init__()\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)\n def _construct_stem(self, dim_in, dim_out):\n fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2])\n initializer_tmp = get_conv_init(fan)\n self._conv = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=self.kernel,\n stride=self.stride,\n padding=self.padding,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = self.norm_module(num_features=dim_out,\n epsilon=self.eps,\n weight_attr=get_bn_param_attr()," + }, + { + "comment": "This code defines a 3D stem module for video input. It consists of convolutional, batch normalization, ReLU, and max pooling operations applied to both slow and fast pathways. The dim_in, dim_out, kernel, stride, padding parameters are used to configure the specifics of these operations. Epsilon (eps) is a small value for numerical stability, and norm_module is the batch normalization module being used.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":432-465", + "content": " bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x = self._conv(x)\n x = self._bn(x)\n x = F.relu(x)\n x = F.max_pool3d(x=x,\n kernel_size=[1, 3, 3],\n stride=[1, 2, 2],\n padding=[0, 1, 1],\n data_format=\"NCDHW\")\n return x\nclass VideoModelStem(paddle.nn.Layer):\n \"\"\"\n Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool\n on input data tensor for slow and fast pathways.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (list): the list of channel dimensions of the inputs.\n dim_out (list): the output dimension of the convolution in the stem\n layer.\n kernel (list): the kernels' size of the convolutions in the stem" + }, + { + "comment": "The code defines a VideoModelStem class with parameters for input and output dimensions, temporal kernel size, stride, padding, epsilon for batch norm, and the normalization module. It checks for consistent dimensions and initializes instance variables before calling a constructor method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":466-491", + "content": " layers. Temporal kernel size, height kernel size, width kernel\n size in order.\n stride (list): the stride sizes of the convolutions in the stem\n layer. Temporal kernel stride, height kernel size, width kernel\n size in order.\n padding (list): the paddings' sizes of the convolutions in the stem\n layer. 
Temporal padding size, height padding size, width padding\n size in order.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(VideoModelStem, self).__init__()\n assert (len({\n len(dim_in),\n len(dim_out),\n len(kernel),\n len(stride),\n len(padding),\n }) == 1), \"Input pathway dimensions are not consistent.\"\n self.num_pathways = len(dim_in)\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)" + }, + { + "comment": "This code defines a class that constructs a stem for each pathway in ResNet, then applies it to the input tensors. The FuseFastToSlow class fuses information from the Fast pathway to the Slow pathway and returns the fused tensors in order.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":493-517", + "content": " def _construct_stem(self, dim_in, dim_out):\n for pathway in range(len(dim_in)):\n stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway],\n self.kernel[pathway], self.stride[pathway],\n self.padding[pathway], self.eps,\n self.norm_module)\n self.add_sublayer(\"pathway{}_stem\".format(pathway), stem)\n def forward(self, x):\n assert (len(x) == self.num_pathways\n ), \"Input tensor does not contain {} pathway\".format(\n self.num_pathways)\n for pathway in range(len(x)):\n m = getattr(self, \"pathway{}_stem\".format(pathway))\n x[pathway] = m(x[pathway])\n return x\nclass FuseFastToSlow(paddle.nn.Layer):\n \"\"\"\n Fuses the information from the Fast pathway to the Slow pathway. Given the\n tensors from Slow pathway and Fast pathway, fuse information from Fast to\n Slow, then return the fused tensors from Slow and Fast pathway in order." + }, + { + "comment": "This function initializes the FuseFastToSlow class, which takes in dimensions, fusion parameters, and other options. It sets up a convolutional layer to fuse information from the Fast pathway to the Slow pathway. It uses a specified channel ratio and kernel size for the convolution operation. The epsilon parameter is used for batch normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":518-543", + "content": " \"\"\"\n def __init__(self,\n dim_in,\n fusion_conv_channel_ratio,\n fusion_kernel,\n alpha,\n fuse_bn_relu=1,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimension of the input.\n fusion_conv_channel_ratio (int): channel ratio for the convolution\n used to fuse from Fast pathway to Slow pathway.\n fusion_kernel (int): kernel size of the convolution used to fuse\n from Fast pathway to Slow pathway.\n alpha (int): the frame rate ratio between the Fast and Slow pathway.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(FuseFastToSlow, self).__init__()\n self.fuse_bn_relu = fuse_bn_relu\n fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self._conv_f2s = paddle.nn.Conv3D(\n in_channels=dim_in," + }, + { + "comment": "This code defines a ResNetSlowFast model in PaddlePaddle, which is a variation of the SlowFast network. It includes a fusion convolution layer followed by batch normalization and ReLU activation if the fuse_bn_relu flag is set to True. The forward function performs concatenation of the input features and returns the result. 
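A hedged shape sketch of that fusion, with illustrative sizes for alpha=8 and beta=8 (so the fast pathway carries eight times the frames at one eighth of the channels):
```python
import paddle
from paddlevideo.modeling.backbones.resnet_slowfast import FuseFastToSlow

fuse = FuseFastToSlow(dim_in=8, fusion_conv_channel_ratio=2,
                      fusion_kernel=7, alpha=8)
slow = paddle.randn([1, 64, 4, 56, 56])
fast = paddle.randn([1, 8, 32, 56, 56])
out_slow, out_fast = fuse([slow, fast])
# the fast features are strided in time by alpha and concatenated on channels:
# out_slow -> [1, 64 + 8 * 2, 4, 56, 56]; out_fast is returned unchanged
```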
This model is registered under BACKBONES for future use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":544-571", + "content": " out_channels=dim_in * fusion_conv_channel_ratio,\n kernel_size=[fusion_kernel, 1, 1],\n stride=[alpha, 1, 1],\n padding=[fusion_kernel // 2, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio,\n epsilon=eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x_s = x[0]\n x_f = x[1]\n fuse = self._conv_f2s(x_f)\n # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve.\n if self.fuse_bn_relu:\n fuse = self._bn(fuse)\n fuse = F.relu(fuse)\n x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None)\n return [x_s_fuse, x_f]\n@BACKBONES.register()\nclass ResNetSlowFast(paddle.nn.Layer):\n \"\"\"\n SlowFast model builder for SlowFast network." + }, + { + "comment": "This code defines a class ResNetSlowFast, which is a variant of the ResNet architecture with slow and fast paths for video recognition. It takes various parameters such as alpha, beta, bn_norm_type, etc., to build the network. The class also includes methods for initializing the model with the given parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":573-606", + "content": " Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(\n self,\n alpha,\n beta,\n bn_norm_type=\"batchnorm\",\n bn_num_splits=1,\n num_pathways=2,\n depth=50,\n num_groups=1,\n input_channel_num=[3, 3],\n width_per_group=64,\n fusion_conv_channel_ratio=2,\n fusion_kernel_sz=7, #5?\n pool_size_ratio=[[1, 1, 1], [1, 1, 1]],\n fuse_bn_relu = 1,\n spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]],\n use_pool_af_s2 = 1,\n ):\n \"\"\"\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n super(ResNetSlowFast, self).__init__()\n self.alpha = alpha #8\n self.beta = beta #8\n self.norm_module = get_norm(bn_norm_type, bn_num_splits)\n self.num_pathways = num_pathways\n self.depth = depth" + }, + { + "comment": "This code defines a SlowFast model for computer vision tasks. It takes in several parameters including the number of groups, input channel number, and others. 
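A hedged end-to-end sketch of instantiating the backbone directly rather than through a config file (alpha=8 and beta=8 are illustrative choices, and the import path is assumed from the file location):
```python
import paddle
from paddlevideo.modeling.backbones.resnet_slowfast import ResNetSlowFast

backbone = ResNetSlowFast(alpha=8, beta=8)
slow = paddle.randn([1, 3, 4, 224, 224])    # slow pathway clip
fast = paddle.randn([1, 3, 32, 224, 224])   # fast pathway clip (alpha x more frames)
slow_feat, fast_feat = backbone([slow, fast])
```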
The construct_network function builds the SlowFast model with separate pathways for Slow and Fast pathways using different temporal kernels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":607-631", + "content": " self.num_groups = num_groups\n self.input_channel_num = input_channel_num\n self.width_per_group = width_per_group\n self.fusion_conv_channel_ratio = fusion_conv_channel_ratio\n self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement\n self.pool_size_ratio = pool_size_ratio\n self.fuse_bn_relu = fuse_bn_relu\n self.spatial_strides = spatial_strides\n self.use_pool_af_s2 = use_pool_af_s2\n self._construct_network()\n def _construct_network(self):\n \"\"\"\n Builds a SlowFast model.\n The first pathway is the Slow pathway\n and the second pathway is the Fast pathway.\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n temp_kernel = [\n [[1], [5]], # conv1 temporal kernel for slow and fast pathway.\n [[1], [3]], # res2 temporal kernel for slow and fast pathway.\n [[1], [3]], # res3 temporal kernel for slow and fast pathway." + }, + { + "comment": "The code initializes a SlowFast ResNet backbone model. It defines temporal kernels for res4 and res5 pathways, creates a VideoModelStem layer with specific dimensions and parameters, and a FuseFastToSlow layer for fusion. The code also sets the model stage depth according to the chosen depth (50, 101, or 152).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":632-656", + "content": " [[3], [3]], # res4 temporal kernel for slow and fast pathway.\n [[3], [3]],\n ] # res5 temporal kernel for slow and fast pathway.\n self.s1 = VideoModelStem(\n dim_in=self.input_channel_num,\n dim_out=[self.width_per_group, self.width_per_group // self.beta],\n kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]],\n stride=[[1, 2, 2]] * 2,\n padding=[\n [temp_kernel[0][0][0] // 2, 3, 3],\n [temp_kernel[0][1][0] // 2, 3, 3],\n ],\n norm_module=self.norm_module)\n self.s1_fuse = FuseFastToSlow(\n dim_in=self.width_per_group // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu)\n # ResNet backbone\n MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)}\n (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth]" + }, + { + "comment": "This code defines a ResStage, which is a stage in the ResNet SlowFast model. It sets the dimensions and parameters for this stage including input and output widths, kernel sizes, and strides. 
The code also specifies temporary kernel sizes and spatial strides for this particular stage of the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":658-677", + "content": " num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]]\n spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]]\n spatial_strides = self.spatial_strides\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]]\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment\n out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4\n dim_inner = self.width_per_group * self.num_groups #64\n self.s2 = ResStage(dim_in=[\n self.width_per_group + self.width_per_group // out_dim_ratio,\n self.width_per_group // self.beta,\n ],\n dim_out=[\n self.width_per_group * 4,\n self.width_per_group * 4 // self.beta,\n ],\n dim_inner=[dim_inner, dim_inner // self.beta],\n temp_kernel_sizes=temp_kernel[1],\n stride=spatial_strides[0]," + }, + { + "comment": "The code defines a resnet_slowfast model with two main components: s1 and s3. The s1 component consists of three branches, with the first two having 2x repeat_num_body blocks each, while the third has (2*repeat_num_body + 1) blocks. It also includes norm_module and spatial_dilations for the first branch. The s3 component contains a ResStage layer. The model uses parameters such as width_per_group, out_dim_ratio, dim_inner, repeat_num_body, alpha, beta, fusion_conv_channel_ratio, fusion_kernel_sz, norm_module, and fuse_bn_relu.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":678-703", + "content": " num_blocks=[d2] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[0],\n dilation=spatial_dilations[0],\n norm_module=self.norm_module)\n self.s2_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 4 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s3 = ResStage(\n dim_in=[\n self.width_per_group * 4 +\n self.width_per_group * 4 // out_dim_ratio,\n self.width_per_group * 4 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 8,\n self.width_per_group * 8 // self.beta,\n ],\n dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta]," + }, + { + "comment": "The code initializes and defines different layers for the ResNet SlowFast model. It includes operations such as creating convolutional layers, fusing fast to slow features, and defining a stage layer with specified input and output dimensions. 
The alpha, fusion_kernel_sz, out_dim_ratio, beta, dim_inner values are used to control the specifics of these operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":704-732", + "content": " temp_kernel_sizes=temp_kernel[2],\n stride=spatial_strides[1],\n num_blocks=[d3] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[1],\n dilation=spatial_dilations[1],\n norm_module=self.norm_module,\n )\n self.s3_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 8 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s4 = ResStage(\n dim_in=[\n self.width_per_group * 8 +\n self.width_per_group * 8 // out_dim_ratio,\n self.width_per_group * 8 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 16,\n self.width_per_group * 16 // self.beta,\n ],\n dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta]," + }, + { + "comment": "The code defines the ResStage and FuseFastToSlow modules for a ResNet SlowFast model. It initializes these modules with specific dimensions and parameters, such as number of channels, fusion kernel size, alpha value, and dilation rates. These modules are used to extract features from the input and fuse them together for further processing in the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":733-761", + "content": " temp_kernel_sizes=temp_kernel[3],\n stride=spatial_strides[2],\n num_blocks=[d4] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[2],\n dilation=spatial_dilations[2],\n norm_module=self.norm_module,\n )\n self.s4_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 16 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s5 = ResStage(\n dim_in=[\n self.width_per_group * 16 +\n self.width_per_group * 16 // out_dim_ratio,\n self.width_per_group * 16 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 32,\n self.width_per_group * 32 // self.beta,\n ],\n dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta]," + }, + { + "comment": "This code defines a ResNet slowfast model with specified parameters, initializes the weights, and forwards the input data. 
It also includes an optional max-pooling operation for one of its stages (s2) depending on a flag value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":762-787", + "content": " temp_kernel_sizes=temp_kernel[4],\n stride=spatial_strides[3],\n num_blocks=[d5] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[3],\n dilation=spatial_dilations[3],\n norm_module=self.norm_module,\n )\n def init_weights(self):\n pass\n def forward(self, x):\n x = self.s1(x) #VideoModelStem\n x = self.s1_fuse(x) #FuseFastToSlow\n x = self.s2(x) #ResStage\n x = self.s2_fuse(x)\n # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve.\n if self.use_pool_af_s2:\n for pathway in range(self.num_pathways):\n x[pathway] = F.max_pool3d(x=x[pathway],\n kernel_size=self.pool_size_ratio[pathway],\n stride=self.pool_size_ratio[pathway],\n padding=[0, 0, 0],\n data_format=\"NCDHW\")" + }, + { + "comment": "This code snippet represents the final segment of a neural network model. It processes input data (x) through four sequential layers (s3, s4, and s5), then fuses their outputs before returning the result as output. Each layer is likely responsible for feature extraction or transformation at different levels of the network's architecture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py\":789-794", + "content": " x = self.s3(x)\n x = self.s3_fuse(x)\n x = self.s4(x)\n x = self.s4_fuse(x)\n x = self.s5(x)\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1feadf68-13cb-4302-9caf-d3d0a954aea9.json b/docs/doc/1feadf68-13cb-4302-9caf-d3d0a954aea9.json new file mode 100644 index 000000000..3158f5ced --- /dev/null +++ b/docs/doc/1feadf68-13cb-4302-9caf-d3d0a954aea9.json @@ -0,0 +1,25 @@ +{ + "summary": "ASPPModule is a CNN layer for ASPP modules in Ma-Net, implementing atrous spatial pyramid pooling with Conv2D, BatchNorm, and ReLU activation. The class initializes instance parameters and sets dilations for ASPP modules using _ASPPModule class.", + "details": [ + { + "comment": "ASPPModule is a convolutional neural network layer that performs atrous spatial pyramid pooling. It consists of a Conv2D layer, BatchNorm layer, and ReLU activation function for feature extraction and normalization in a hierarchical manner. The weight initialization follows the Kaiming normal distribution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/aspp.py\":0-33", + "content": "import math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom utils.api import kaiming_normal_\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation,\n BatchNorm):\n super(_ASPPModule, self).__init__()\n self.atrous_conv = nn.Conv2D(inplanes,\n planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation,\n bias_attr=False)\n self.bn = BatchNorm(planes)\n self.relu = nn.ReLU(True)\n self._init_weight()\n def forward(self, x):\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):" + }, + { + "comment": "The code defines a class \"ASPP\" that inherits from \"nn.Layer\". It initializes the instance with parameters such as backbone, output_stride, and BatchNorm. 
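A hedged usage sketch (it assumes the Ma-Net application directory is on the import path so that `networks.aspp` and its `utils.api` helpers resolve; sizes are illustrative):
```python
import paddle
import paddle.nn as nn
from networks.aspp import build_aspp

# any backbone name other than 'drn' or 'mobilenet' selects inplanes=2048
aspp = build_aspp(backbone="resnet", output_stride=16, BatchNorm=nn.BatchNorm2D)
feat = paddle.randn([1, 2048, 32, 32])   # backbone feature map
out = aspp(feat)                          # five parallel branches, concat, 1x1 conv -> [1, 256, 32, 32]
```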
Depending on these inputs, it sets the dilations for the ASPP modules. These ASPP modules are instances of _ASPPModule class with specified input size, output size, kernel size, and dilation rate.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/aspp.py\":34-65", + "content": " from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)\nclass ASPP(nn.Layer):\n def __init__(self, backbone, output_stride, BatchNorm):\n super(ASPP, self).__init__()\n if backbone == 'drn':\n inplanes = 512\n elif backbone == 'mobilenet':\n inplanes = 320\n else:\n inplanes = 2048\n if output_stride == 16:\n dilations = [1, 6, 12, 18]\n elif output_stride == 8:\n dilations = [1, 12, 24, 36]\n else:\n raise NotImplementedError\n self.aspp1 = _ASPPModule(inplanes,\n 256,\n 1,\n padding=0,\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.aspp2 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[1]," + }, + { + "comment": "This code defines three ASPP modules and a global average pooling layer for a neural network, with batch normalization and ReLU activations applied. The convolutional layers have specific dilations and padding values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/aspp.py\":66-88", + "content": " dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.aspp3 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.aspp4 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False),\n BatchNorm(256), nn.ReLU())\n self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False)\n self.bn1 = BatchNorm(256)\n self.relu = nn.ReLU(True)\n self.dropout = nn.Dropout(0.1)" + }, + { + "comment": "The code defines a convolutional neural network (CNN) for the ASPP (Aggregated Spatial Pyramid Pooling) module in PaddleVideo's Ma-Net. It includes an initialization function, forward pass computation, and a builder function to create the ASPP module with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/aspp.py\":89-122", + "content": " self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x1, x2, x3, x4, x5), axis=1)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return x\n return self.dropout(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)\ndef build_aspp(backbone, output_stride, BatchNorm):\n return ASPP(backbone, output_stride, BatchNorm)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2017add4-c3af-4d8a-9e69-400b8b3d6db6.json b/docs/doc/2017add4-c3af-4d8a-9e69-400b8b3d6db6.json new file mode 100644 index 000000000..f6f10218b --- /dev/null +++ b/docs/doc/2017add4-c3af-4d8a-9e69-400b8b3d6db6.json @@ -0,0 +1,40 @@ +{ + "summary": "The code processes video data, creating labels, organizing files, and parsing command line arguments. 
It allows for segmentation or localization labeling with features such as label conversion and ground truth processing.", + "details": [ + { + "comment": "This code reads label files from a specified path, converts the labels to localization format and writes them into another specified output path. It also generates mapping information between the localization format and original format. The function takes prefix_data_path (path to read data), out_path (output path for results), action_dict (dictionary of action mappings) and fps (frames per second) as input parameters. It processes each label file in the prefix_data_path, updating labels_list with converted labels, and writes them to the output path. Finally, it generates mapping information in \"mapping.txt\" format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py\":0-33", + "content": "import json\nimport numpy as np\nimport argparse\nimport os\nfrom tqdm import tqdm\ndef generate_mapping_list_txt(action_dict, out_path):\n out_txt_file_path = os.path.join(out_path, \"mapping.txt\")\n f = open(out_txt_file_path, \"w\", encoding='utf-8')\n for key, action_name in action_dict.items():\n str_str = str(key) + \" \" + action_name + \"\\n\"\n f.write(str_str)\n # add None\n str_str = str(len(action_dict)) + \" None\" + \"\\n\"\n f.write(str_str)\n f.close()\ndef segmentation_convert_localization_label(prefix_data_path, out_path,\n action_dict, fps):\n label_path = os.path.join(prefix_data_path)\n label_txt_name_list = os.listdir(label_path)\n labels_dict = {}\n labels_dict[\"fps\"] = fps\n labels_list = []\n for label_name in tqdm(label_txt_name_list, desc='label convert:'):\n label_dict = {}\n label_dict[\"url\"] = label_name.split(\".\")[0] + \".mp4\"\n label_txt_path = os.path.join(prefix_data_path, label_name)\n with open(label_txt_path, \"r\", encoding='utf-8') as f:" + }, + { + "comment": "This code segment reads a ground truth file line by line and counts the total frames. It then identifies action boundaries, creates an actions list, and for each action, it extracts action name, start and end time (in seconds), and the corresponding label ID from the action dictionary to create a label_action_dict. This information will be useful in transforming segmentation labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py\":34-54", + "content": " gt = f.read().split(\"\\n\")[:-1]\n label_dict[\"total_frames\"] = len(gt)\n boundary_index_list = [0]\n before_action_name = gt[0]\n for index in range(1, len(gt)):\n if before_action_name != gt[index]:\n boundary_index_list.append(index)\n before_action_name = gt[index]\n actions_list = []\n for index in range(len(boundary_index_list) - 1):\n if gt[boundary_index_list[index]] != \"None\":\n action_name = gt[boundary_index_list[index]]\n start_sec = float(boundary_index_list[index]) / float(fps)\n end_sec = float(boundary_index_list[index + 1] - 1) / float(fps)\n action_id = action_dict[action_name]\n label_action_dict = {}\n label_action_dict[\"label_names\"] = action_name\n label_action_dict[\"start_id\"] = start_sec\n label_action_dict[\"end_id\"] = end_sec\n label_action_dict[\"label_ids\"] = [action_id]" + }, + { + "comment": "This code appears to be part of a larger program that performs video segmentation and labeling. 
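As an illustrative sketch of the mapping.txt format these helpers write and read back (the action names are placeholders, not the real 50salads label set):
```python
action_dict = {0: "cut_tomato", 1: "place_tomato_into_bowl"}
lines = [f"{label_id} {name}" for label_id, name in action_dict.items()]
lines.append(f"{len(action_dict)} None")   # a trailing None class is appended
print("\n".join(lines))
# 0 cut_tomato
# 1 place_tomato_into_bowl
# 2 None
```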
It generates a dictionary containing action labels based on provided ground truth segmentation data, converts segmentation labels into localization format, and saves the results in JSON format for further use. The function `generate_action_dict()` generates an action dictionary, `load_action_dict()` loads an action dictionary from a file, and `localization_convert_segmentation_label()` converts segmentation labels into localization format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py\":55-89", + "content": " actions_list.append(label_action_dict)\n label_dict[\"actions\"] = actions_list\n labels_list.append(label_dict)\n labels_dict[\"gts\"] = labels_list\n output_path = os.path.join(out_path, \"output.json\")\n f = open(output_path, \"w\", encoding='utf-8')\n f.write(json.dumps(labels_dict, indent=4))\n f.close()\ndef generate_action_dict(label):\n action_dict = {}\n for gt in label[\"gts\"]:\n for action in gt[\"actions\"]:\n label_id = action[\"label_ids\"][0]\n label_name = action[\"label_names\"][0]\n action_dict[label_id] = label_name\n return action_dict\ndef load_action_dict(data_path):\n mapping_txt_path = os.path.join(data_path, \"mapping.txt\")\n with open(mapping_txt_path, \"r\", encoding='utf-8') as f:\n actions = f.read().split(\"\\n\")[:-1]\n class2id_map = dict()\n for a in actions:\n class2id_map[a.split()[1]] = int(a.split()[0])\n return class2id_map\ndef localization_convert_segmentation_label(label, prefix_data_path, out_path):" + }, + { + "comment": "The code checks if a directory exists and creates it if not. It then loops through each ground truth segmentation in the label, retrieves the corresponding video data, extracts relevant information like feature, action labels, start and end indices, and populates seg_label array with action labels for the specified time range.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py\":90-118", + "content": " path = os.path.join(out_path, \"groundTruth\")\n isExists = os.path.exists(path)\n if not isExists:\n os.makedirs(path)\n print(path + ' \u521b\u5efa\u6210\u529f')\n else:\n print(path + ' \u76ee\u5f55\u5df2\u5b58\u5728')\n fps = float(label[\"fps\"])\n video_list = []\n for gt in tqdm(label[\"gts\"], desc='label convert:'):\n video_name = gt[\"url\"].split(\".\")[0]\n data_path = os.path.join(prefix_data_path, video_name + \".pkl\")\n video_list.append(video_name + \".txt\")\n feature = np.load(data_path, allow_pickle=True)[\"image_feature\"]\n num_feture = feature.shape[0]\n seg_label = [\"None\"] * (num_feture)\n for action in gt[\"actions\"]:\n start_id = action[\"start_id\"]\n end_id = action[\"end_id\"]\n label_name = action[\"label_names\"]\n start_index = int(np.floor(start_id * fps))\n end_index = int(np.floor(end_id * fps)) + 1\n if end_index < num_feture - 1:\n seg_label[start_index:end_index] = label_name * (end_index -" + }, + { + "comment": "This code segment is part of a larger program that appears to be related to video data processing. The function is setting up a segmentation label and writing it to a file, as well as creating another list for training purposes. It determines the starting index based on the number of features, and fills in the label accordingly. The code then writes the label and video list to separate text files in the specified output path. 
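The seconds-to-frame-index arithmetic used when filling `seg_label` can be checked in isolation. Below is an illustrative sketch only; the clip length, fps and action times are assumed purely for the example:

```python
import numpy as np

# Illustrative conversion between action times (seconds) and frame indices,
# matching the floor(start_sec * fps) arithmetic used in the script above.
fps = 25.0
num_frames = 250                      # e.g. a ~10 s clip at 25 fps (assumed)
seg_label = ["None"] * num_frames

start_sec, end_sec, label_name = 1.2, 3.0, "add_dressing"
start_index = int(np.floor(start_sec * fps))        # 30
end_index = int(np.floor(end_sec * fps)) + 1        # 76
seg_label[start_index:end_index] = [label_name] * (end_index - start_index)

print(start_index, end_index, seg_label[30], seg_label[75], seg_label[76])
# 30 76 add_dressing add_dressing None
```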
This process is controlled by the \"args\" variable which contains command line arguments like \"mode\", \"label_path\", and \"out_path\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py\":119-146", + "content": " start_index)\n elif start_index < num_feture - 1:\n seg_label[start_index:] = label_name * (num_feture -\n start_index)\n else:\n pass\n if len(seg_label) != num_feture:\n seg_label = seg_label[:num_feture]\n out_txt_file_path = os.path.join(out_path, \"groundTruth\",\n video_name + \".txt\")\n str = '\\n'\n f = open(out_txt_file_path, \"w\", encoding='utf-8')\n f.write(str.join(seg_label) + str)\n f.close()\n out_txt_file_path = os.path.join(out_path, \"train_list.txt\")\n str = '\\n'\n f = open(out_txt_file_path, \"w\", encoding='utf-8')\n f.write(str.join(video_list) + str)\n f.close()\ndef main():\n args = get_arguments()\n if args.mode in [\"segmentation\", \"localization\"]:\n if args.mode == \"segmentation\":\n with open(args.label_path, 'r', encoding='utf-8') as json_file:" + }, + { + "comment": "The code reads a label file, determines the mode (segmentation or localization), and performs corresponding operations. It uses function calls like generate_action_dict, load_action_dict, segmentation_convert_localization_label. The get_arguments function parses command line arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py\":147-172", + "content": " label = json.load(json_file)\n action_dict = generate_action_dict(label)\n generate_mapping_list_txt(action_dict, args.out_path)\n localization_convert_segmentation_label(label, args.data_path,\n args.out_path)\n elif args.mode == \"localization\":\n action_dict = load_action_dict(args.label_path)\n segmentation_convert_localization_label(args.data_path,\n args.out_path,\n action_dict,\n fps=25.0)\n else:\n raise NotImplementedError\ndef get_arguments():\n \"\"\"\n parse all the arguments from command line inteface\n return a list of parsed arguments\n \"\"\"\n parser = argparse.ArgumentParser(\n description=\"convert segmentation and localization label\")\n parser.add_argument(\"label_path\", type=str, help=\"path of a label file\")" + }, + { + "comment": "This code snippet defines command line arguments for the input data path, output path, and mode. 
It then parses these arguments and returns them, allowing the program to convert segmentation or localization labels as specified by the user.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/transform_segmentation_label.py\":173-194", + "content": " parser.add_argument(\n \"data_path\",\n type=str,\n help=\"path of video feature or segmentation label txt.\",\n )\n parser.add_argument(\n \"out_path\",\n type=str,\n help=\"path of output file.\",\n )\n parser.add_argument(\n \"--mode\",\n type=str,\n default=\"segmentation\",\n help=\"Convert segmentation label or localization label.\",\n )\n return parser.parse_args()\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/207c687e-6ba7-45cd-ab2d-3dea0024a48a.json b/docs/doc/207c687e-6ba7-45cd-ab2d-3dea0024a48a.json new file mode 100644 index 000000000..3e8cffb1d --- /dev/null +++ b/docs/doc/207c687e-6ba7-45cd-ab2d-3dea0024a48a.json @@ -0,0 +1,50 @@ +{ + "summary": "The code initializes models, checks CUDA availability and version, parses command line arguments, trains a video tagging model using data parallelism and saves if needed. The code also checks the version, logs arguments, creates a directory, and proceeds to train using those arguments.", + "details": [ + { + "comment": "This code snippet contains the necessary import statements and license information for the VideoTag application in PaddleVideo. It also sets up the logging format and includes utility functions from other modules such as train_utils, config_utils, reader, metrics, and utility. The code checks if CUDA is available and verifies the PaddlePaddle version before proceeding with the training process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport argparse\nimport ast\nimport logging\nimport paddle\nimport paddle.static as static\nfrom utils.train_utils import train_with_dataloader\nimport models\nfrom utils.config_utils import *\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'" + }, + { + "comment": "This code block sets up logging, defines a function parse_args which uses argparse to create an argument parser for specifying model name, config file path, batch size, learning rate and pretrain weights. 
It provides default values for these arguments in case they are not specified by the user.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":32-59", + "content": "logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser(\"Paddle Video train script\")\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument(\n '--batch_size',\n type=int,\n default=None,\n help='training batch size. None to use config file setting.')\n parser.add_argument(\n '--learning_rate',\n type=float,\n default=None,\n help='learning rate use for training. None to use config file setting.')\n parser.add_argument('--pretrain',\n type=str,\n default=None,\n help='path to pretrain weights.')" + }, + { + "comment": "The code snippet is parsing command line arguments for a training program. The options include whether to use GPU, disable memory optimization, specify the epoch number, set validation interval, and provide a directory to save training snapshots.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":60-81", + "content": " parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument('--no_memory_optimize',\n action='store_true',\n default=False,\n help='whether to use memory optimize in train')\n parser.add_argument('--epoch',\n type=int,\n default=None,\n help='epoch number, 0 for read from config file')\n parser.add_argument('--valid_interval',\n type=int,\n default=1,\n help='validation epoch interval, 0 for no validation.')\n parser.add_argument('--save_dir',\n type=str,\n default=os.path.join('data', 'checkpoints'),\n help='directory name to save train snapshoot')\n parser.add_argument('--log_interval',\n type=int," + }, + { + "comment": "This code is parsing command line arguments, loading and merging configuration files, initializing models in training and validation modes, and setting up a static program guard for building the model. It also allows the option to fix random seeds for reproducibility.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":82-108", + "content": " default=1,\n help='mini-batch interval to log.')\n parser.add_argument('--fix_random_seed',\n type=ast.literal_eval,\n default=False,\n help='If set True, enable continuous evaluation job.')\n args = parser.parse_args()\n return args\ndef train(args):\n # parse config\n config = parse_config(args.config)\n train_config = merge_configs(config, 'train', vars(args))\n valid_config = merge_configs(config, 'valid', vars(args))\n print_configs(train_config, 'Train')\n train_model = models.get_model(args.model_name, train_config, mode='train')\n valid_model = models.get_model(args.model_name, valid_config, mode='valid')\n # build model\n startup = static.Program()\n train_prog = static.Program()\n if args.fix_random_seed:\n startup.random_seed = 1000\n train_prog.random_seed = 1000\n with static.program_guard(train_prog, startup):\n with paddle.utils.unique_name.guard():" + }, + { + "comment": "This code initializes the training and validation models, builds their inputs, models, and feeds. 
It also sets up the dataloaders, optimizer, and executor for the training phase. If pre-trained parameters are specified, they will be loaded before training starts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":109-133", + "content": " train_model.build_input(use_dataloader=True)\n train_model.build_model()\n # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label\n train_feeds = train_model.feeds()\n train_fetch_list = train_model.fetches()\n train_loss = train_fetch_list[0]\n optimizer = train_model.optimizer()\n optimizer.minimize(train_loss)\n train_dataloader = train_model.dataloader()\n valid_prog = static.Program()\n with static.program_guard(valid_prog, startup):\n with paddle.utils.unique_name.guard():\n valid_model.build_input(use_dataloader=True)\n valid_model.build_model()\n valid_feeds = valid_model.feeds()\n valid_fetch_list = valid_model.fetches()\n valid_dataloader = valid_model.dataloader()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(startup)\n if args.pretrain:\n train_model.load_pretrain_params(exe, args.pretrain, train_prog)" + }, + { + "comment": "This code initializes a BuildStrategy and an ExecutionStrategy. It then creates two CompiledPrograms, one for training and one for validation, with data parallelism enabled. The number of GPUs is checked using CUDA_VISIBLE_DEVICES environment variable, and the number of GPUs must match what was set in the train configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":135-160", + "content": " build_strategy = static.BuildStrategy()\n build_strategy.enable_inplace = True\n exec_strategy = static.ExecutionStrategy()\n compiled_train_prog = static.CompiledProgram(\n train_prog).with_data_parallel(loss_name=train_loss.name,\n build_strategy=build_strategy,\n exec_strategy=exec_strategy)\n compiled_valid_prog = static.CompiledProgram(\n valid_prog).with_data_parallel(share_vars_from=compiled_train_prog,\n build_strategy=build_strategy,\n exec_strategy=exec_strategy)\n # get reader\n bs_denominator = 1\n if args.use_gpu:\n # check number of GPUs\n gpus = os.getenv(\"CUDA_VISIBLE_DEVICES\", \"\")\n if gpus == \"\":\n pass\n else:\n gpus = gpus.split(\",\")\n num_gpus = len(gpus)\n assert num_gpus == train_config.TRAIN.num_gpus, \\\n \"num_gpus({}) set by CUDA_VISIBLE_DEVICES \" \\" + }, + { + "comment": "Sets batch size based on number of GPUs, initializes train and valid readers, gets metrics for training and validation, sets the sample list generator for dataloader.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":161-180", + "content": " \"shoud be the same as that \" \\\n \"set in {}({})\".format(\n num_gpus, args.config, train_config.TRAIN.num_gpus)\n bs_denominator = train_config.TRAIN.num_gpus\n train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /\n bs_denominator)\n valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /\n bs_denominator)\n train_reader = get_reader(args.model_name.upper(), 'train', train_config)\n valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)\n # get metrics\n train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)\n valid_metrics = get_metrics(args.model_name.upper(), 'valid', valid_config)\n epochs = args.epoch or train_model.epoch_num()\n exe_places = static.cuda_places() if 
args.use_gpu else static.cpu_places()\n train_dataloader.set_sample_list_generator(train_reader, places=exe_places)" + }, + { + "comment": "The code trains a video tagging model using PaddlePaddle framework. It sets the sample list generator for valid data and then calls the train_with_dataloader function with various parameters such as number of epochs, log and validation intervals, and data loaders for training and testing. The function trains and tests the model, saving it if necessary. The code also checks whether the installed PaddlePaddle is compiled with GPU support based on the argument provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":181-204", + "content": " valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places)\n train_with_dataloader(exe,\n train_prog,\n compiled_train_prog,\n train_dataloader,\n train_fetch_list,\n train_metrics,\n epochs=epochs,\n log_interval=args.log_interval,\n valid_interval=args.valid_interval,\n save_dir=args.save_dir,\n save_model_name=args.model_name,\n fix_random_seed=args.fix_random_seed,\n compiled_test_prog=compiled_valid_prog,\n test_dataloader=valid_dataloader,\n test_fetch_list=valid_fetch_list,\n test_metrics=valid_metrics)\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)" + }, + { + "comment": "This code snippet checks the version, logs the arguments, creates a directory if it doesn't exist, and then proceeds to train using those arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/train.py\":205-211", + "content": " check_version()\n logger.info(args)\n if not os.path.exists(args.save_dir):\n os.makedirs(args.save_dir)\n train(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/20a62983-2c0e-419c-bea1-c9a43141235c.json b/docs/doc/20a62983-2c0e-419c-bea1-c9a43141235c.json new file mode 100644 index 000000000..6ea507208 --- /dev/null +++ b/docs/doc/20a62983-2c0e-419c-bea1-c9a43141235c.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a class \"TransNetV2Loss\" for calculating TransNetV2 model loss with transition_weight and many-hot_loss_weight parameters, using weighted binary cross-entropy loss for one-hot and many-hot predictions. The snippet returns the total loss from TransNetV2 components.", + "details": [ + { + "comment": "This code defines a class called \"TransNetV2Loss\" for calculating the loss in TransNetV2 model. It inherits from BaseWeightedLoss and takes transition_weight and many_hot_loss_weight as parameters for customizing the loss calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/transnetv2_loss.py\":0-27", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass TransNetV2Loss(BaseWeightedLoss):\n \"\"\"Loss for TransNetV2 model\n \"\"\"\n def __init__(self, transition_weight=5.0, many_hot_loss_weight=0.1):\n self.transition_weight = transition_weight\n self.many_hot_loss_weight = many_hot_loss_weight\n super().__init__()" + }, + { + "comment": "This code defines a loss function for the TransNetV2 model, taking in one-hot and many-hot predictions and ground truth labels. It calculates the binary cross-entropy loss for both types of predictions, applies a weighted factor based on transition weight, and averages the losses before summing them together.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/transnetv2_loss.py\":29-53", + "content": " def _forward(self, one_hot_pred, one_hot_gt,\n many_hot_pred=None, many_hot_gt=None, reg_losses=None):\n assert transition_weight != 1\n one_hot_pred = one_hot_pred[:, :, 0]\n one_hot_gt = one_hot_gt.astype('float32')\n one_hot_loss = F.binary_cross_entropy_with_logits(logit=one_hot_pred, label=one_hot_gt, reduction='none')\n one_hot_loss *= 1 + one_hot_gt * (transition_weight - 1)\n one_hot_loss = paddle.mean(one_hot_loss)\n many_hot_loss = 0.\n if many_hot_loss_weight != 0. and many_hot_pred is not None:\n many_hot_loss = many_hot_loss_weight * paddle.mean(\n F.binary_cross_entropy_with_logits(logit=many_hot_pred[:, :, 0],\n label=many_hot_gt.astype('float32'), reduction='none'))\n total_loss = one_hot_loss + many_hot_loss\n if reg_losses is not None:\n for name, value in reg_losses.items():\n if value is not None:\n total_loss += value" + }, + { + "comment": "This code snippet is returning the total loss computed from various components of the TransNetV2 model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/transnetv2_loss.py\":55-55", + "content": " return total_loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/21a03f52-a095-4440-ac9c-3d9d95ece5a1.json b/docs/doc/21a03f52-a095-4440-ac9c-3d9d95ece5a1.json new file mode 100644 index 000000000..b71375835 --- /dev/null +++ b/docs/doc/21a03f52-a095-4440-ac9c-3d9d95ece5a1.json @@ -0,0 +1,20 @@ +{ + "summary": "Mixup class in PaddleVideo enhances video quality assessment by mixing images and labels from batches using adjustable alpha values, while Cutmix operator randomly selects boxes for mixing operations. Data augmentation is applied with random bounding boxes, and lambda is calculated for loss calculation.", + "details": [ + { + "comment": "This code defines a Mixup class for video quality assessment using PaddleVideo. 
It is an operator that randomly mixes images and labels from batches to enhance the model's learning ability, with an adjustable alpha value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py\":0-35", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport random\nimport numpy as np\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass Mixup(object):\n \"\"\"\n Mixup operator.\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)" + }, + { + "comment": "This code defines the Cutmix operator, which is used to mix images and their corresponding labels in a dataset. It takes an alpha parameter that determines the mixing ratio, and randomly selects a box to cut out from each image. It then applies a random mixing operation within this box to create augmented versions of both the image and label. The final output is a list containing the original image, its original label, its new mixed label, and an array of the lambda values used for the mixing process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py\":36-71", + "content": " labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n lams = np.array([lam] * bs, dtype=np.float32)\n imgs = lam * imgs + (1 - lam) * imgs[idx]\n return list(zip(imgs, labels, labels[idx], lams))\n@PIPELINES.register()\nclass Cutmix(object):\n \"\"\" Cutmix operator\n Args:\n alpha(float): alpha value.\n \"\"\"\n def __init__(self, alpha=0.2):\n assert alpha > 0., \\\n 'parameter alpha[%f] should > 0.0' % (alpha)\n self.alpha = alpha\n def rand_bbox(self, size, lam):\n \"\"\" rand_bbox \"\"\"\n w = size[2]\n h = size[3]\n cut_rat = np.sqrt(1. - lam)\n cut_w = np.int(w * cut_rat)\n cut_h = np.int(h * cut_rat)\n # uniform\n cx = np.random.randint(w)\n cy = np.random.randint(h)\n bbx1 = np.clip(cx - cut_w // 2, 0, w)\n bby1 = np.clip(cy - cut_h // 2, 0, h)\n bbx2 = np.clip(cx + cut_w // 2, 0, w)\n bby2 = np.clip(cy + cut_h // 2, 0, h)" + }, + { + "comment": "This function generates random bounding boxes and applies data augmentation by replacing portions of images with random patches from the same image. It also calculates lambda, which is used for weighting the original and augmented samples in the loss calculation. 
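For reference, the mixing arithmetic behind both operators can be reproduced with plain NumPy. This is an illustrative sketch only; the batch shape, box coordinates and random seed are assumptions, and it does not use the pipeline classes themselves:

```python
import numpy as np

rng = np.random.default_rng(0)
alpha = 0.2
imgs = rng.standard_normal((4, 3, 8, 224, 224)).astype("float32")  # assumed batch

# Mixup: blend each sample with a permuted partner using a Beta-sampled lambda.
idx = rng.permutation(len(imgs))
lam = rng.beta(alpha, alpha)
mixed = lam * imgs + (1.0 - lam) * imgs[idx]

# Cutmix: after pasting a random box, lambda is corrected to the kept-area ratio
# so the loss weighting matches how many pixels actually came from each sample.
h, w = imgs.shape[-2:]
bbx1, bby1, bbx2, bby2 = 40, 60, 140, 180           # assumed box, for illustration
lam_corrected = 1.0 - (bbx2 - bbx1) * (bby2 - bby1) / float(h * w)
print(mixed.shape, round(lam_corrected, 4))          # (4, 3, 8, 224, 224) 0.7608
```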
The function returns the modified images, labels, original labels, and lambdas.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py\":73-90", + "content": " return bbx1, bby1, bbx2, bby2\n def __call__(self, batch):\n imgs, labels = list(zip(*batch))\n imgs = np.array(imgs)\n labels = np.array(labels)\n bs = len(batch)\n idx = np.random.permutation(bs)\n lam = np.random.beta(self.alpha, self.alpha)\n bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)\n imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]\n lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /\n (imgs.shape[-2] * imgs.shape[-1]))\n lams = np.array([lam] * bs, dtype=np.float32)\n return list(zip(imgs, labels, labels[idx], lams))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/21fc55b2-e95e-4aec-8338-e3a65b639b4a.json b/docs/doc/21fc55b2-e95e-4aec-8338-e3a65b639b4a.json new file mode 100644 index 000000000..705ac7667 --- /dev/null +++ b/docs/doc/21fc55b2-e95e-4aec-8338-e3a65b639b4a.json @@ -0,0 +1,90 @@ +{ + "summary": "The PaddleVideo code offers video processing functions, including a VisionTransformer class. It initializes and applies the model using parameters, transformations, and blocks while setting up components for future use.", + "details": [ + { + "comment": "This code snippet is from the PaddleVideo library and contains a copyright notice, license information, and several helper functions. The VisionTransformer class will be defined later in the file, which serves as a backbone model for video processing tasks. The code defines constants for zero and one values, a function to convert a single value into a tuple of length 2 (to_2tuple), and a drop path function that applies dropout to inputs with a specified probability during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":0-36", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Callable\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\n__all__ = ['VisionTransformer']\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\ndef to_2tuple(x):\n return tuple([x] * 2)\ndef drop_path(x, drop_prob=0., training=False):" + }, + { + "comment": "This code defines three classes: \"DropPath\", \"Identity\". The DropPath class implements dropout paths (Stochastic Depth) for each sample in the main path of residual blocks. It takes a single parameter, 'drop_prob', to control the probability of dropping out features. If 'drop_prob' is 0 or not training, it returns the input unchanged. 
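The rescaling by `keep_prob` is what keeps the expected activation unchanged when whole residual branches are dropped. A NumPy-only sketch of the same idea, with shapes and drop probability chosen just for illustration:

```python
import numpy as np

# Per-sample stochastic depth: drop the whole residual branch for some samples,
# and rescale the kept ones by 1/keep_prob so the expected output is unchanged.
def drop_path_np(x, drop_prob, rng):
    keep_prob = 1.0 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)       # one mask value per sample
    mask = np.floor(keep_prob + rng.random(shape))    # 0 or 1, P(1) = keep_prob
    return x / keep_prob * mask

rng = np.random.default_rng(0)
x = np.ones((10000, 4), dtype="float32")
out = drop_path_np(x, drop_prob=0.2, rng=rng)
print(out.mean())   # close to 1.0: the scaling preserves the branch's expectation
```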
The Identity class simply returns its input without any transformation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":37-64", + "content": " \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)\n random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Identity(nn.Layer):" + }, + { + "comment": "The code defines three classes: Identity, Mlp, and Attention. Identity is a simple class that returns its input unchanged. Mlp stands for Multilayer Perceptron, and it's a feed-forward neural network layer. Attention is a class for implementing attention mechanisms in the model. Both Mlp and Attention classes take inputs and return outputs after applying their respective operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":65-103", + "content": " def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\nclass Mlp(nn.Layer):\n def __init__(self,\n in_features,\n hidden_features=None,\n out_features=None,\n act_layer=nn.GELU,\n drop=0.0):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)\n self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\nclass Attention(nn.Layer):\n def __init__(self,\n dim,\n num_heads=8,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.0,\n proj_drop=0.0):" + }, + { + "comment": "This code initializes a multi-head attention layer, and defines the forward pass. It reshapes input into query (Q), key (K), and value (V) matrices, calculates attention scores, applies dropout, and reconstructs output using residual connections and layer normalization. 
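Stripped of the reshape/transpose bookkeeping, the computation is standard scaled dot-product attention. A toy NumPy sketch follows (head count, token count and channel sizes are assumed; the linear projections and dropout are omitted):

```python
import numpy as np

# Scaled dot-product attention over N tokens with H heads, D channels per head.
rng = np.random.default_rng(0)
N, H, D = 5, 2, 8                      # tokens, heads, per-head channels (assumed)
q, k, v = (rng.standard_normal((H, N, D)) for _ in range(3))

scale = D ** -0.5
attn = q @ k.transpose(0, 2, 1) * scale                 # (H, N, N) similarity scores
attn = np.exp(attn - attn.max(-1, keepdims=True))
attn = attn / attn.sum(-1, keepdims=True)               # softmax over the key axis
out = attn @ v                                          # (H, N, D) weighted values
print(out.shape, round(float(attn.sum(-1)[0, 0]), 6))   # (2, 5, 8) 1.0
```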
The `Block` class is also defined for building a Vision Transformer model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":104-137", + "content": " super().__init__()\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n self.attn_drop = nn.Dropout(attn_drop)\n def forward(self, x):\n N, C = x.shape[1:]\n qkv = self.qkv(x).reshape(\n (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(\n (2, 0, 3, 1, 4))\n q, k, v = qkv[0], qkv[1], qkv[2]\n attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n attn = nn.functional.softmax(attn, axis=-1)\n attn = self.attn_drop(attn)\n x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n dim,\n num_heads,\n mlp_ratio=4.0,\n qkv_bias=False," + }, + { + "comment": "This function is initializing a backbone model with specified parameters. It takes in arguments like attention_type, norm_layer, and others to define the model's layers, including its attention layer. If norm_layer is a string, it uses the given string as the normalization layer; if it's a Callable, it uses that function as the normalization layer. The code also checks if the attention type is 'divided_space_time'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":138-165", + "content": " qk_scale=None,\n drop=0.0,\n attn_drop=0.0,\n drop_path=0.1,\n act_layer=nn.GELU,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n attention_type='divided_space_time'):\n super().__init__()\n self.attention_type = attention_type\n if isinstance(norm_layer, str):\n self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop)\n # Temporal Attention Parameters\n if self.attention_type == 'divided_space_time':" + }, + { + "comment": "This code initializes the temporal normalization layer and attention mechanism for a Vision Transformer backbone. It also creates a linear layer and drop path, based on provided configurations. The norm_layer parameter can be a string representing the desired normalization layer or a Callable object. If not a valid type, it raises a TypeError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":166-184", + "content": " if isinstance(norm_layer, str):\n self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.temporal_attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop)\n self.temporal_fc = nn.Linear(dim, dim)\n # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here\n self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n if isinstance(norm_layer, str):\n self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)" + }, + { + "comment": "The code defines a class and its forward method. It sets the normalization layer, calculates the number of spatial tokens, checks the attention type, applies normalization and MLP layers to the input, and performs divided space-time attention.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":185-209", + "content": " elif isinstance(norm_layer, Callable):\n self.norm2 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop)\n def forward(self, x, B, T, W):\n num_spatial_tokens = (x.shape[1] - 1) // T\n H = num_spatial_tokens // W\n if self.attention_type in ['space_only', 'joint_space_time']:\n x = x + self.drop_path(self.attn(self.norm1(x)))\n x = x + self.drop_path(self.mlp(self.norm2(x)))\n return x\n elif self.attention_type == 'divided_space_time':\n ########## Temporal ##########\n xt = x[:, 1:, :]\n _, _, _, _t, _m = B, H, W, T, xt.shape[-1]\n xt = xt.reshape([-1, _t, _m])\n res_temporal = self.drop_path(" + }, + { + "comment": "This code performs spatial attention in the Vision Transformer model. It creates a cls_token, reshapes the input, concatenates it with the cls_token, and then passes it through a drop path and an attention layer. Finally, it extracts the cls_token for further use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":210-234", + "content": " self.temporal_attn(self.temporal_norm1(xt)))\n _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]\n res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m])\n res_temporal = self.temporal_fc(res_temporal)\n xt = x[:, 1:, :] + res_temporal\n ########## Spatial ##########\n init_cls_token = x[:, 0, :].unsqueeze(1)\n cls_token = init_cls_token.tile((1, T, 1))\n _b, _t, _m = cls_token.shape\n cls_token = cls_token.reshape([-1, _m]).unsqueeze(1)\n xs = xt\n _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]\n xs = xs.reshape([-1, _h, _w, _t, _m]).transpose(\n (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m])\n xs = paddle.concat((cls_token, xs), axis=1)\n res_spatial = self.drop_path(self.attn(self.norm1(xs)))\n # Taking care of CLS token\n cls_token = res_spatial[:, 0, :]\n _, _t, _m = B, T, cls_token.shape[-1]\n cls_token = cls_token.reshape([-1, _t, _m])" + }, + { + "comment": "This code performs averaging across frames, reshapes the spatial features, concatenates initial class token and input sequence, adds a drop path and MLP layer, and returns the output. 
It also defines PatchEmbed for image to patch embedding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":235-266", + "content": " # averaging for every frame\n cls_token = paddle.mean(cls_token, axis=1, keepdim=True)\n res_spatial = res_spatial[:, 1:, :]\n _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]\n res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose(\n (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m])\n res = res_spatial\n x = xt\n x = paddle.concat((init_cls_token, x), axis=1) + paddle.concat(\n (cls_token, res), axis=1)\n # Mlp\n x = x + self.drop_path(self.mlp(self.norm2(x)))\n return x\n else:\n raise NotImplementedError\nclass PatchEmbed(nn.Layer):\n \"\"\" Image to Patch Embedding\n \"\"\"\n def __init__(self,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768):\n super().__init__()\n img_size = to_2tuple(img_size)\n patch_size = to_2tuple(patch_size)\n num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //" + }, + { + "comment": "The code defines a VisionTransformer class that takes input patches of an image. It initializes the model parameters such as img_size, patch_size and num_patches. The forward function performs the transformation by projecting the input into embedding space using a convolutional layer. If the input image size does not match the expected model size, it raises an assertion error. This class is registered with BACKBONES for future use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":267-297", + "content": " patch_size[0])\n self.img_size = img_size\n self.patch_size = patch_size\n self.num_patches = num_patches\n self.proj = nn.Conv2D(in_channels,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n def forward(self, x):\n B, C, T, H, W = x.shape\n assert H == self.img_size[0] and W == self.img_size[1], \\\n f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n x = x.transpose((0, 2, 1, 3, 4))\n x = x.reshape([-1, C, H, W])\n x = self.proj(x)\n W = x.shape[-1]\n x = x.flatten(2).transpose((0, 2, 1))\n return x, T, W\n@BACKBONES.register()\nclass VisionTransformer(nn.Layer):\n \"\"\" Vision Transformer with support for patch input\n \"\"\"\n def __init__(self,\n pretrained=None,\n img_size=224,\n patch_size=16," + }, + { + "comment": "This code initializes a Vision Transformer (ViT) backbone model with specified parameters such as input dimensions, embedding dimension, depth, number of heads, mlp ratio, and attention type. 
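The number of tokens produced by `PatchEmbed` follows directly from the image and patch sizes. A worked example using the default 224/16 configuration (the concrete numbers below are derived from those defaults, not read out of the file itself):

```python
# Token bookkeeping for the default ViT configuration (img_size=224, patch_size=16).
img_size, patch_size = 224, 16

patches_per_side = img_size // patch_size        # 14
num_patches = patches_per_side ** 2              # 196 spatial tokens per frame
tokens_per_frame = num_patches + 1               # +1 for the CLS token prepended later
print(patches_per_side, num_patches, tokens_per_frame)   # 14 196 197

# pos_embed is created with shape (1, num_patches + 1, embed_dim); the temporal
# embedding covers the num_seg sampled frames, i.e. shape (1, num_seg, embed_dim).
```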
The code sets up the patch embedding layer, creates a class token, and defines the number of patches based on the input size provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":298-326", + "content": " in_channels=3,\n embed_dim=768,\n depth=12,\n num_heads=12,\n mlp_ratio=4,\n qkv_bias=False,\n qk_scale=None,\n drop_rate=0.,\n attn_drop_rate=0.,\n drop_path_rate=0.1,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_seg=8,\n attention_type='divided_space_time',\n **args):\n super().__init__()\n self.pretrained = pretrained\n self.num_seg = num_seg\n self.attention_type = attention_type\n self.num_features = self.embed_dim = embed_dim\n self.patch_embed = PatchEmbed(img_size=img_size,\n patch_size=patch_size,\n in_channels=in_channels,\n embed_dim=embed_dim)\n num_patches = self.patch_embed.num_patches\n # Positional Embeddings\n self.cls_token = self.create_parameter(shape=(1, 1, embed_dim)," + }, + { + "comment": "This code initializes various components of a vision transformer model, including positional embeddings (pos_embed), classification token (cls_token), and dropout layers (pos_drop, time_drop). It also creates a LayerList of blocks with specified dimensions and parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":327-349", + "content": " default_initializer=zeros_)\n self.pos_embed = self.create_parameter(shape=(1, num_patches + 1,\n embed_dim),\n default_initializer=zeros_)\n self.pos_drop = nn.Dropout(p=drop_rate)\n if self.attention_type != 'space_only':\n self.time_embed = self.create_parameter(shape=(1, num_seg,\n embed_dim),\n default_initializer=zeros_)\n self.time_drop = nn.Dropout(p=drop_rate)\n self.add_parameter(\"pos_embed\", self.pos_embed)\n self.add_parameter(\"cls_token\", self.cls_token)\n dpr = np.linspace(0, drop_path_rate, depth)\n self.blocks = nn.LayerList([\n Block(dim=embed_dim,\n num_heads=num_heads,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale," + }, + { + "comment": "This code initializes a Vision Transformer (ViT) model. It creates a series of blocks with specified dimensions, applies normalization layers, and initializes the weight values using truncated normal distribution. Additionally, if pre-trained weights are provided, it loads them into the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":350-378", + "content": " drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[i],\n norm_layer=norm_layer,\n epsilon=epsilon,\n attention_type=self.attention_type) for i in range(depth)\n ])\n self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n trunc_normal_(self.pos_embed, std=0.02)\n trunc_normal_(self.cls_token, std=0.02)\n self.apply(self._init_fn)\n if self.attention_type == 'divided_space_time':\n i = 0\n for m in self.blocks.sublayers(include_self=True):\n m_str = str(m)\n if 'Block' in m_str:\n if i > 0:\n zeros_(m.temporal_fc.weight)\n zeros_(m.temporal_fc.bias)\n i += 1\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights" + }, + { + "comment": "This code initializes the forward function of a Vision Transformer (ViT) model. It extracts features from input images, adds positional embeddings, and handles batch size changes. 
The trunc_normal_ and zeros_ functions are used to initialize weights and biases for layers like Linear and LayerNorm, respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":379-404", + "content": " load_ckpt(self,\n self.pretrained,\n num_patches=self.patch_embed.num_patches,\n num_seg=self.num_seg,\n attention_type=self.attention_type)\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):\n ones_(m.weight)\n zeros_(m.bias)\n def forward_features(self, x):\n # B = x.shape[0]\n B = paddle.shape(x)[0]\n x, T, W = self.patch_embed(x) # [BT,nH*nW,F]\n cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]\n x = paddle.concat((cls_tokens, x), axis=1)\n pos_interp = (x.shape[1] != self.pos_embed.shape[1])\n if pos_interp:\n pos_embed = self.pos_embed\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(" + }, + { + "comment": "The code is applying relative position embeddings to the input features (x) for a vision transformer model. It first checks if a specific flag is set, then interpolates other position embeddings based on the size of the input and adds them to class position embeddings. If the flag is not set, it simply adds the position embeddings from the model. Afterward, the code applies time embeddings if the attention type is not \"space_only\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":405-429", + "content": " (0, 2, 1))\n P = int(other_pos_embed.shape[2]**0.5)\n H = x.shape[1] // W\n other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(H, W),\n mode='nearest')\n new_pos_embed = new_pos_embed.flatten(2)\n new_pos_embed = new_pos_embed.transpose((0, 2, 1))\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),\n axis=1)\n x = x + new_pos_embed\n else:\n x = x + self.pos_embed\n x = self.pos_drop(x)\n # Time Embeddings\n if self.attention_type != 'space_only':\n cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(\n T)[0].index_select(paddle.to_tensor([0]), axis=1)\n x = x[:, 1:]\n _, _n, _m = x.shape\n _t = T\n x = x.reshape([-1, _t, _n, _m]).transpose(" + }, + { + "comment": "This code performs time embeddings resizing and adds them to the input feature maps. 
It then flattens the tensor, concatenates class tokens, processes through attention blocks, and finally, for space-only attention type, it makes predictions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":430-454", + "content": " (0, 2, 1, 3)).reshape([-1, _t, _m])\n # Resizing time embeddings in case they don't match\n time_interp = (T != self.time_embed.shape[1])\n if time_interp: # T' != T\n time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(T, x.shape[-1]),\n mode='nearest').squeeze(0)\n new_time_embed = new_time_embed.transpose((0, 2, 1))\n x = x + new_time_embed\n else:\n x = x + self.time_embed\n x = self.time_drop(x)\n _, _t, _m = x.shape\n x = x.reshape([-1, W * W * T, _m])\n x = paddle.concat((cls_tokens, x), axis=1)\n # Attention blocks\n for blk in self.blocks:\n x = blk(x, B, T, W)\n # Predictions for space-only baseline\n if self.attention_type == 'space_only':\n _, _n, _m = x.shape" + }, + { + "comment": "This code snippet is part of a Vision Transformer (ViT) model implementation. The function averages predictions for every frame and applies normalization before returning the embeddings for each image in the input sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit.py\":455-464", + "content": " _t = T\n x = x.reshape([-1, _t, _n, _m])\n x = paddle.mean(x, 1) # averaging predictions for every frame\n x = self.norm(x)\n return x[:, 0] # [B, embed_dim]\n def forward(self, x):\n x = self.forward_features(x)\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/225fb15b-381f-479b-a3a1-53e2129c8cd8.json b/docs/doc/225fb15b-381f-479b-a3a1-53e2129c8cd8.json new file mode 100644 index 000000000..5be528069 --- /dev/null +++ b/docs/doc/225fb15b-381f-479b-a3a1-53e2129c8cd8.json @@ -0,0 +1,35 @@ +{ + "summary": "TSMRecHead is a TSNHead-based classifier head for Temporal Segment Networks, performing average pooling, optional dropout, reshaping, mean operation, and applying a fully connected layer. It uses defined loss function to compare with labels, and calculates loss based on provided labels using label smoothing and weighted average.", + "details": [ + { + "comment": "TSMRecHead is a TSNHead-based classifier head for Temporal Segment Networks (TSMs) with specified number of classes, input channels and registered under PaddlePaddle's HEADS registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py\":0-32", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport math\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom .base import BaseHead\nfrom .tsn_head import TSNHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass TSMRecHead(TSNHead):\n \"\"\" TSM Rec Head\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature." + }, + { + "comment": "This function initializes the weights of the FC layer using a uniform distribution, and sets the standard deviation for normal initialization. The loss_cfg argument determines the type of loss function to use, and drop_ratio is the probability of dropping connections between layers during training. The stdv value is set based on the number of input channels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py\":33-61", + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n drop_ratio(float): drop ratio. Default: 0.8.\n std(float): Std(Scale) value in normal initilizar. Default: 0.001.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='L1Loss'),\n drop_ratio=0.8,\n std=0.01,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes,\n in_channels,\n loss_cfg,\n drop_ratio=drop_ratio,\n std=std,\n data_format=data_format,\n **kwargs)\n self.stdv = 1.0 / math.sqrt(self.in_channels * 1.0)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Uniform'," + }, + { + "comment": "This code defines a head for TSM (Temporal Shift Module) Recognition task. It includes initialization of weights, setting learning rate, and applying L2 decay regularizer. 
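The uniform initialisation bound mentioned above is simply `1 / sqrt(in_channels)`. A quick illustration, assuming a 2048-channel backbone feature (a common ResNet width, not something fixed by this file):

```python
import math

# TSMRecHead draws its FC weights from Uniform(-stdv, stdv) with
# stdv = 1 / sqrt(in_channels); with an assumed 2048-d backbone feature:
in_channels = 2048
stdv = 1.0 / math.sqrt(in_channels * 1.0)
print(round(stdv, 4))    # 0.0221 -> weights initialised in (-0.0221, 0.0221)
```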
The forward method performs average pooling, optional dropout, reshaping, mean operation, and finally passes the result through fully connected layer to obtain classification scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py\":62-90", + "content": " 'fc_0.w_0',\n 'fc_0.b_0',\n low=-self.stdv,\n high=self.stdv)\n self.fc.bias.learning_rate = 2.0\n self.fc.bias.regularizer = paddle.regularizer.L2Decay(0.)\n def forward(self, x, num_seg):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N * num_seg, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, 1, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])\n # [N, in_channels]\n score = self.fc(x)" + }, + { + "comment": "The code defines a loss function for a model that predicts scores and compares them with the given labels. It calculates the loss between the predicted scores and the target labels, considering cases where there is only one label. The losses are returned in a dictionary format with 'loss' as the mandatory field.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py\":91-121", + "content": " # [N, num_class]\n #m = paddle.nn.Sigmoid()\n #score = m(score)\n return score\n def loss(self, scores, labels, valid_mode=False, **kwargs):\n \"\"\"Calculate the loss accroding to the model output ```scores```,\n and the target ```labels```.\n Args:\n scores (paddle.Tensor): The output of the model.\n labels (paddle.Tensor): The target output of the model.\n Returns:\n losses (dict): A dict containing field 'loss'(mandatory).\n \"\"\"\n if len(labels) == 1: #commonly case\n output = []\n label = []\n labels = labels[0]\n losses = dict()\n loss = self.loss_func(scores, labels, **kwargs)\n score_list = paddle.tolist(scores)\n label_list = paddle.tolist(labels)\n score_list_len = len(score_list)\n for i in range(score_list_len):\n output.append(score_list[i][0])\n label.append(label_list[i][0])\n losses['loss'] = loss" + }, + { + "comment": "This function calculates the loss based on the number of labels provided. If one label is given, it returns the output and label as losses. If three labels are given (a, b, lam), it casts the labels to float32, applies label smoothing or standard loss depending on epsilon, then calculates the weighted average loss for a and b. 
It returns the loss, output, and label in a dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py\":122-148", + "content": " losses['output'] = output\n losses['label'] = label\n return losses\n elif len(labels) == 3:\n labels_a, labels_b, lam = labels\n labels_a = paddle.cast(labels_a, dtype='float32')\n labels_b = paddle.cast(labels_b, dtype='float32')\n lam = lam[0] # get lam value\n losses = dict()\n if self.ls_eps != 0:\n loss_a = self.label_smooth_loss(scores, labels_a, **kwargs)\n loss_b = self.label_smooth_loss(scores, labels_b, **kwargs)\n else:\n loss_a = self.loss_func(scores, labels_a, **kwargs)\n loss_b = self.loss_func(scores, labels_a, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n losses['loss'] = loss\n losses['output'] = output\n losses['label'] = label\n return losses\n else:\n raise NotImplementedError\n def label_smooth_loss(self, scores, labels, **kwargs):\n \"\"\"label smooth loss\"\"\"" + }, + { + "comment": "Applies label smoothing to the input labels, squeezes the labels along a specified axis, and calculates the loss using a provided loss function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py\":149-152", + "content": " labels = F.label_smooth(labels, epsilon=self.ls_eps)\n labels = paddle.squeeze(labels, axis=1)\n loss = self.loss_func(scores, labels, **kwargs)\n return loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/22d919b3-064e-43b2-8d4a-c40b0e1afd3b.json b/docs/doc/22d919b3-064e-43b2-8d4a-c40b0e1afd3b.json new file mode 100644 index 000000000..92504aa4f --- /dev/null +++ b/docs/doc/22d919b3-064e-43b2-8d4a-c40b0e1afd3b.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet appears to import the \"build_optimizer\" and \"build_lr\" functions from their respective modules within the \"paddlevideo.solver\" package. The comments at the top of the file indicate that this code is protected by copyright and licensed under the Apache License, Version 2.0.", + "details": [ + { + "comment": "This code snippet appears to import the \"build_optimizer\" and \"build_lr\" functions from their respective modules within the \"paddlevideo.solver\" package. The comments at the top of the file indicate that this code is protected by copyright and licensed under the Apache License, Version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/__init__.py\":0-15", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .optimizer import build_optimizer\nfrom .lr import build_lr" + } + ] +} \ No newline at end of file diff --git a/docs/doc/23ba667d-92f7-4c25-a37d-1fcd0aa5ab20.json b/docs/doc/23ba667d-92f7-4c25-a37d-1fcd0aa5ab20.json new file mode 100644 index 000000000..e9b7eae05 --- /dev/null +++ b/docs/doc/23ba667d-92f7-4c25-a37d-1fcd0aa5ab20.json @@ -0,0 +1,35 @@ +{ + "summary": "PaddleVideo Quick Start guide covers installation, usage details, and action recognition model for classifying video files. Highlights top-5 classes with high confidence using an example command, also suggesting alternative OpenCV installation method.", + "details": [ + { + "comment": "Code is an English version of the Quick Start guide for PaddleVideo. It provides information on how to install the necessary packages, use PaddleVideo by command line and Python code, describes arguments, and answers frequently asked questions. The code also includes instructions for installing PaddlePaddle with or without a GPU, as well as the option to install the PaddleVideo Whl Package from pypi.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/quick_start.md\":0-35", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](../zh-CN/quick_start.md)\n# PaddleVide Quick Start\n- [1. Installation](#1)\n - [1.1 Install PaddlePaddle](#11)\n - [1.2 Install PaddleVideo Whl Package](#12)\n- [2. Easy-to-Use](#2)\n - [2.1 Use by Command Line](#21)\n - [2.2 Use by Python Code](#22)\n- [3. Arguments description](#3)\n- [4.QA](#4)\n## 1. Installation\n\n### 1.1 Install PaddlePaddle\n- If you have CUDA 9 or CUDA 10 installed on your machine, please run the following command to install\n ```bash\n python3.7 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple\n ```\n- If you have no available GPU on your machine, please run the following command to install the CPU version\n ```bash\n python3.7 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple\n ```\nFor more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation.\n\n### 1.2 Install PaddleVideo Whl Package\n- option1: use pypi\uff08recommand\uff09" + }, + { + "comment": "Install package using pip:\n```bash\npip3.7 install ppvideo==2.3.0\n```\nAlternatively, build and install locally:\n```bash\npython3.7 setup.py bdist_wheel\npython3.7 -m pip install dist/ppvideo-2.3.0-py3-none-any.whl\n```\nCommand to use by command line:\n```bash\nppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi'\n```\nThis command uses PP-TSM_v2 model on CPU for inference on data/example.avi file, divided into 16 segments and frames combined before feeding into network. 
Results show top-1 prediction class_id as 5, scores as 1.0, and class name as 'archery'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/quick_start.md\":37-75", + "content": "```bash\npip3.7 install ppvideo==2.3.0\n```\n- option2: build and install locally\n```bash\npython3.7 setup.py bdist_wheel\npython3.7 -m pip install dist/ppvideo-2.3.0-py3-none-any.whl\n```\n## 2. Easy-to-Use\n\n### 2.1 Use by Command Line\nRun shell command\uff1a\n```bash\nppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi'\n```\n- This command use `PP-TSM_v2` model to infer `data/example.avi` file in `CPU`.\n- The length of the example video is about 10s. When inference, the video is first divided into 16 segments according to the time axis, then extract one frame from each segment. Finally all frames are combined and feeded into the network.\nResults\uff1a\n```\nCurrent video file: data/example.avi\n top-1 classes: [5]\n top-1 scores: [1.]\n top-1 label names: ['archery']\n```\nAs you can see, use `PP-TSM_v2` trained on Kinetics-400 to predict `data/example.avi` video\uff0ctop1 prediction class_id is `5`, scores is `1.0`, class name is `archery`.\n\n### 2.2 Use by Python Code" + }, + { + "comment": "This code uses the PaddleVideo library with the PP-TSM_v2 model for video inference on a CPU. It predicts the top-1 class, score, and label name of the provided 'data/example.avi' video file. The model is trained on Kinetics-400 dataset. Arguments include model name (PP-TSM or PP-TSM_v2), video file path, GPU usage, and other optional parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/quick_start.md\":77-106", + "content": "Run python code\uff1a\n```python\nfrom ppvideo import PaddleVideo\nclas = PaddleVideo(model_name='ppTSM_v2', use_gpu=False)\nvideo_file='data/example.avi'\nclas.predict(video_file)\n```\n- This code use `PP-TSM_v2` model to infer `data/example.avi` file in `CPU`.\nResults:\n```\nCurrent video file: data/example.avi\n top-1 classes: [5]\n top-1 scores: [1.]\n top-1 label names: ['archery']\n```\nAs you can see, use `PP-TSM_v2` trained on Kinetics-400 to predict `data/example.avi` video\uff0ctop1 prediction class_id is `5`, scores is `1.0`, class name is `archery`.\n\n## 3. Arguments description\n| name | type | description |\n| :---: | :---: | :--- |\n| model_name | str | optional, model name, `'ppTSM'` or `'ppTSM_v2'`. If None, please specify the path of your inference model by args `model_file` and `params_file`. |\n| video_file | str | required, Video file path, supported format: single video file path, or folder containing multiple videos. |\n| use_gpu | bool | whether to use GPU\uff0cdefault True\u3002 |\n| nu" + }, + { + "comment": "The code defines several parameters for the PaddleVideo model including the number of segments, short and target frame sizes, model file paths, batch size, use of float16, TensorRT, MKLDNN, top_k, and label name path. It also provides a command example usage of the model with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/quick_start.md\":106-121", + "content": "m_seg | int | The number of segments used in the TSM model, which is also the number of frames extracted from the video. 8 for `ppTSM`, 16 for `ppTSM_v2`, default 16. 
|\n| short_size | int | short size of frame, default 256.|\n| target_size | int | target size of frame, default 224.|\n| model_file | str | optional\uff0cinference model(`.pdmodel`)path. |\n| params_file | str | optional, inference modle(`.pdiparams`) path. |\n| batch_size | int | Batch size, default 1.|\n| use_fp16 | bool | whether to use float16\uff0cdefault False.|\n| use_tensorrt | bool| whether to use Tensorrt, default False.|\n| gpu_mem | int | use GPU memory, default 8000.|\n| enable_mkldnn | bool | whether to use MKLDNN, default False.|\n| top_k | int | top_k, default 1. |\n| label_name_path | str | This file consists the relation of class_id and class_name. Default use `data/k400/Kinetics-400_label_list.txt` of Kinetics-400. You can replace it with your own label file. |\ncommand example1\uff1a\n```bash\nppvideo --model_name='ppTSM_v2' --num_seg=16 --video_file=\"data/mp4\" --batch_size=2 --top_k=5" + }, + { + "comment": "The code displays the top-5 classes, scores, and label names for five different video files. It shows that the classifier consistently identifies the same top-5 classes with high confidence for each video file, indicating a reliable classification performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/quick_start.md\":122-141", + "content": "```\nResults\uff1a\n```txt\nCurrent video file: data/mp4/example3.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]\n top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\nCurrent video file: data/mp4/example2.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]\n top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\nCurrent video file: data/mp4/example.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]\n top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\nCurrent video file: data/mp4/example1.avi\n top-5 classes: [ 5 345 311 159 327]\n top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14]" + }, + { + "comment": "The code provides a list of top-5 label names for PaddleVideo's action recognition model. The command example demonstrates how to run the model with specific parameters, such as the model name, number of video segments, input video file, and batch size. Additionally, it suggests an alternative installation method for OpenCV-python if the regular installation is slow.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/quick_start.md\":142-156", + "content": " top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting']\n```\ncommand example1\uff1a\n```bash\nppvideo --model_name='ppTSM' --num_seg=8 --video_file=\"data/mp4\" --batch_size=2 --top_k=5\n```\n\n## 4. QA\n1. 
opecv-python Installation maybe slow, you can try:\n```\npython3.7 -m pip install opencv-python==4.2.0.32 -i https://pypi.doubanio.com/simple\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/23c777e0-a6ea-4c5b-90a7-b744b8660384.json b/docs/doc/23c777e0-a6ea-4c5b-90a7-b744b8660384.json new file mode 100644 index 000000000..9f38e4b1d --- /dev/null +++ b/docs/doc/23c777e0-a6ea-4c5b-90a7-b744b8660384.json @@ -0,0 +1,130 @@ +{ + "summary": "This code defines image augmentation and resizing functions for AVA dataset in PaddleVideo library using operations like resizing, lazy initialization, and RandomRescale/Resize transforms. It creates classes for ground truth bounding boxes and proposals, cropping and flipping entity boxes, and includes Flip and Normalize classes for image processing and normalization.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library and contains a module for image augmentations in the AVA dataset. It imports necessary libraries, defines conversion dictionaries for interpolation methods between PIL and OpenCV, and sets up registry entries for the PIPELINES module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":0-33", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nimport math\nfrom PIL import Image\nfrom ..registry import PIPELINES\nfrom collections.abc import Sequence\nimport cv2\npillow_interp_codes = {\n 'nearest': Image.NEAREST,\n 'bilinear': Image.BILINEAR,\n 'bicubic': Image.BICUBIC,\n 'box': Image.BOX,\n 'lanczos': Image.LANCZOS,\n 'hamming': Image.HAMMING\n}\ncv2_interp_codes = {\n 'nearest': cv2.INTER_NEAREST,\n 'bilinear': cv2.INTER_LINEAR," + }, + { + "comment": "This function initializes the lazy operation properly, ensuring a non-lazy operation is not accidentally mixed in. If 'img_shape' is not in results, it adds 'img_shape'. 
If 'lazy' is set to True and 'lazy' does not exist in results, it creates a new dictionary for lazy operation containing 'original_shape', 'img_shape', 'crop_bbox', 'flip', 'flip_direction', and 'interpolation'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":34-63", + "content": " 'bicubic': cv2.INTER_CUBIC,\n 'area': cv2.INTER_AREA,\n 'lanczos': cv2.INTER_LANCZOS4\n}\ndef _init_lazy_if_proper(results, lazy):\n \"\"\"Initialize lazy operation properly.\n Make sure that a lazy operation is properly initialized,\n and avoid a non-lazy operation accidentally getting mixed in.\n Required keys in results are \"imgs\" if \"img_shape\" not in results,\n otherwise, Required keys in results are \"img_shape\", add or modified keys\n are \"img_shape\", \"lazy\".\n Add or modified keys in \"lazy\" are \"original_shape\", \"crop_bbox\", \"flip\",\n \"flip_direction\", \"interpolation\".\n Args:\n results (dict): A dict stores data pipeline result.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n if 'img_shape' not in results:\n results['img_shape'] = results['imgs'][0].shape[:2]\n if lazy:\n if 'lazy' not in results:\n img_h, img_w = results['img_shape']\n lazyop = dict()\n lazyop['original_shape'] = results['img_shape']" + }, + { + "comment": "This code defines functions for image augmentations in the AVA dataset pipeline. The \"_scale_size\" function scales a size by a ratio, while \"rescale_size\" calculates the new size to be rescaled based on an input scale factor or maximum size. The code also includes the initialization of crop parameters and flipping options for lazy operations in the results dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":64-95", + "content": " lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],\n dtype=np.float32)\n lazyop['flip'] = False\n lazyop['flip_direction'] = None\n lazyop['interpolation'] = None\n results['lazy'] = lazyop\n else:\n assert 'lazy' not in results, 'Use Fuse after lazy operations'\ndef _scale_size(size, scale):\n \"\"\"Rescale a size by a ratio.\n Args:\n size (tuple[int]): (w, h).\n scale (float): Scaling factor.\n Returns:\n tuple[int]: scaled size.\n \"\"\"\n w, h = size\n return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)\ndef rescale_size(old_size, scale, return_scale=False):\n \"\"\"Calculate the new size to be rescaled to.\n Args:\n old_size (tuple[int]): The old size (w, h) of image.\n scale (float | tuple[int]): The scaling factor or maximum size.\n If it is a float number, then the image will be rescaled by this\n factor, else if it is a tuple of 2 integers, then the image will" + }, + { + "comment": "Function \"imresize\" resizes an image based on the provided scale factor. If the scale is a number, it's used directly as the scaling factor. If it's a tuple of ints, it sets max and min edge sizes for resizing. 
Returns new resized size or both size and scaling factor if requested.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":96-129", + "content": " be rescaled as large as possible within the scale.\n return_scale (bool): Whether to return the scaling factor besides the\n rescaled image size.\n Returns:\n tuple[int]: The new rescaled image size.\n \"\"\"\n w, h = old_size\n if isinstance(scale, (float, int)):\n if scale <= 0:\n raise ValueError(f'Invalid scale {scale}, must be positive.')\n scale_factor = scale\n elif isinstance(scale, tuple):\n max_long_edge = max(scale)\n max_short_edge = min(scale)\n scale_factor = min(max_long_edge / max(h, w),\n max_short_edge / min(h, w))\n else:\n raise TypeError(\n f'Scale must be a number or tuple of int, but got {type(scale)}')\n new_size = _scale_size((w, h), scale_factor)\n if return_scale:\n return new_size, scale_factor\n else:\n return new_size\ndef imresize(img,\n size,\n return_scale=False,\n interpolation='bilinear',\n out=None," + }, + { + "comment": "This code defines a function for resizing an image to a given size. It supports two backends: 'cv2' and 'pillow'. If the backend is not specified, it defaults to 'cv2'. The function first gets the original image's height and width, checks if the backend is valid, handles unsupported backends, asserts the image type for 'pillow' backend, resizes the image using either OpenCV or Pillow library based on the backend, and finally returns the resized image along with scale factors if return_scale is True. The EntityBoxRescale class registers a pipeline to rescale entity boxes and proposals according to the image shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":130-159", + "content": " backend=None):\n \"\"\"Resize image to a given size. \"\"\"\n h, w = img.shape[:2]\n if backend is None:\n backend = 'cv2'\n if backend not in ['cv2', 'pillow']:\n raise ValueError(f'backend: {backend} is not supported for resize.'\n f\"Supported backends are 'cv2', 'pillow'\")\n if backend == 'pillow':\n assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'\n pil_image = Image.fromarray(img)\n pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])\n resized_img = np.array(pil_image)\n else:\n resized_img = cv2.resize(\n img, size, dst=out, interpolation=cv2_interp_codes[interpolation])\n if not return_scale:\n return resized_img\n else:\n w_scale = size[0] / w\n h_scale = size[1] / h\n return resized_img, w_scale, h_scale\n@PIPELINES.register()\nclass EntityBoxRescale:\n \"\"\"Rescale the entity box and proposals according to the image shape.\n Required keys are \"proposals\", \"gt_bboxes\", added or modified keys are" + }, + { + "comment": "The code defines a class called EntityBoxCrop that scales the ground truth bounding boxes (gt_bboxes) and proposals, if present, by a given scale factor. It ensures that the number of columns in the proposals is 4. This class can be registered as a pipeline for video augmentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":160-192", + "content": " \"gt_bboxes\". 
If original \"proposals\" is not None, \"proposals\" and\n will be added or modified.\n Args:\n scale_factor (np.ndarray): The scale factor used entity_box rescaling.\n \"\"\"\n def __init__(self, scale_factor):\n self.scale_factor = scale_factor\n def __call__(self, results):\n scale_factor = np.concatenate([self.scale_factor, self.scale_factor])\n if 'gt_bboxes' in results:\n gt_bboxes = results['gt_bboxes']\n results['gt_bboxes'] = gt_bboxes * scale_factor\n if 'proposals' in results:\n proposals = results['proposals']\n if proposals is not None:\n assert proposals.shape[1] == 4, (\n 'proposals shape should be in '\n f'(n, 4), but got {proposals.shape}')\n results['proposals'] = proposals * scale_factor\n return results\n def __repr__(self):\n return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'\n@PIPELINES.register()\nclass EntityBoxCrop:" + }, + { + "comment": "This code initializes an object that crops the entity boxes and proposals according to the cropped images. The required keys are \"proposals\" and \"gt_bboxes\", while \"gt_bboxes\" is added or modified. If original \"proposals\" is not None, \"proposals\" will be modified. The crop_bbox argument specifies the bbox used to crop the original image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":193-223", + "content": " \"\"\"Crop the entity boxes and proposals according to the cropped images.\n Required keys are \"proposals\", \"gt_bboxes\", added or modified keys are\n \"gt_bboxes\". If original \"proposals\" is not None, \"proposals\" will be\n modified.\n Args:\n crop_bbox(np.ndarray | None): The bbox used to crop the original image.\n \"\"\"\n def __init__(self, crop_bbox):\n self.crop_bbox = crop_bbox\n def __call__(self, results):\n proposals = results['proposals']\n gt_bboxes = results['gt_bboxes']\n if self.crop_bbox is None:\n return results\n x1, y1, x2, y2 = self.crop_bbox\n img_w, img_h = x2 - x1, y2 - y1\n assert gt_bboxes.shape[-1] == 4\n gt_bboxes_ = gt_bboxes.copy()\n gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1)\n gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1)\n results['gt_bboxes'] = gt_bboxes_\n if proposals is not None:\n assert proposals.shape[-1] == 4" + }, + { + "comment": "This code defines two classes, \"EntityBoxFlip\" and a nameless class. The nameless class performs cropping operations on proposals based on given coordinates (x1, y1). It also updates the results['proposals'] with the modified proposals. The EntityBoxFlip class flips the entity boxes and proposals horizontally with a certain probability. It adds or modifies keys \"gt_bboxes\" in the results dictionary. If \"proposals\" is not None, it will also modify them. The img_shape tuple represents the image shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":224-248", + "content": " proposals_ = proposals.copy()\n proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0,\n img_w - 1)\n proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0,\n img_h - 1)\n results['proposals'] = proposals_\n return results\n def __repr__(self):\n return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})'\n@PIPELINES.register()\nclass EntityBoxFlip:\n \"\"\"Flip the entity boxes and proposals with a probability.\n Reverse the order of elements in the given bounding boxes and proposals\n with a specific direction. 
The shape of them are preserved, but the\n elements are reordered. Only the horizontal flip is supported (seems\n vertical flipping makes no sense). Required keys are \"proposals\",\n \"gt_bboxes\", added or modified keys are \"gt_bboxes\". If \"proposals\"\n is not None, it will also be modified.\n Args:\n img_shape (tuple[int]): The img shape." + }, + { + "comment": "This code defines a pipeline for resizing images to a specific size. It first initializes the pipeline with an image shape and then, in the __call__ method, it adjusts the ground truth bounding boxes and proposal bounding boxes by subtracting their width values from the total image width minus 1. If there are no proposals, it sets proposals_ to None. The __repr__ method provides a string representation of the pipeline.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":249-283", + "content": " \"\"\"\n def __init__(self, img_shape):\n self.img_shape = img_shape\n def __call__(self, results):\n proposals = results['proposals']\n gt_bboxes = results['gt_bboxes']\n img_h, img_w = self.img_shape\n assert gt_bboxes.shape[-1] == 4\n gt_bboxes_ = gt_bboxes.copy()\n gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1\n gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1\n if proposals is not None:\n assert proposals.shape[-1] == 4\n proposals_ = proposals.copy()\n proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1\n proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1\n else:\n proposals_ = None\n results['proposals'] = proposals_\n results['gt_bboxes'] = gt_bboxes_\n return results\n def __repr__(self):\n repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})'\n return repr_str\n@PIPELINES.register()\nclass Resize:\n \"\"\"Resize images to a specific size." + }, + { + "comment": "This code defines a function for image augmentation in PaddleVideo. The function takes arguments like scale, keep_ratio, interpolation, and lazy. If keep_ratio is True, it scales the image by the given factor or resizes to the maximum size specified by the tuple. It uses bilinear interpolation by default. Lazy operation can be determined if lazy argument is set to True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":285-302", + "content": " Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified\n keys are \"imgs\", \"img_shape\", \"keep_ratio\", \"scale_factor\", \"lazy\",\n \"resize_size\". Required keys in \"lazy\" is None, added or modified key is\n \"interpolation\".\n Args:\n scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling\n factor or maximum size:\n If it is a float number, the image will be rescaled by this\n factor, else if it is a tuple of 2 integers, the image will\n be rescaled as large as possible within the scale.\n Otherwise, it serves as (w, h) of output size.\n keep_ratio (bool): If set to True, Images will be resized without\n changing the aspect ratio. Otherwise, it will resize images to a\n given size. Default: True.\n interpolation (str): Algorithm used for interpolation:\n \"nearest\" | \"bilinear\". Default: \"bilinear\".\n lazy (bool): Determine whether to apply lazy operation. Default: False." 
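(For illustration, a condensed, standalone restatement of the keep-ratio rescale logic quoted above; it is a sketch of the same arithmetic, not a drop-in replacement for the pipeline classes.)

```python
def rescale_keep_ratio(old_size, scale):
    """A float scale rescales both sides; a (long, short) tuple bounds the
    longer/shorter edges while preserving the aspect ratio, as in the
    rescale helpers quoted earlier."""
    w, h = old_size
    if isinstance(scale, (float, int)):
        factor = float(scale)
    else:
        max_long, max_short = max(scale), min(scale)
        factor = min(max_long / max(h, w), max_short / min(h, w))
    return int(w * factor + 0.5), int(h * factor + 0.5)

# e.g. a 640x360 frame with scale=(float('inf'), 256) keeps the aspect
# ratio and resizes the short edge to 256 -> (455, 256)
print(rescale_keep_ratio((640, 360), (float('inf'), 256)))
```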
+ }, + { + "comment": "Initializes a resize augmentation object with scale, keeps ratio (True or False), interpolation method ('bilinear' default), and lazy flag (False).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":303-333", + "content": " \"\"\"\n def __init__(self,\n scale,\n keep_ratio=True,\n interpolation='bilinear',\n lazy=False):\n if isinstance(scale, str):\n scale = eval(scale)\n if isinstance(scale, float):\n if scale <= 0:\n raise ValueError(f'Invalid scale {scale}, must be positive.')\n elif isinstance(scale, tuple):\n max_long_edge = max(scale)\n max_short_edge = min(scale)\n if max_short_edge == -1:\n # assign np.inf to long edge for rescaling short edge later.\n scale = (np.inf, max_long_edge)\n else:\n raise TypeError(\n f'Scale must be float or tuple of int, but got {type(scale)}')\n self.scale = scale\n self.keep_ratio = keep_ratio\n self.interpolation = interpolation\n self.lazy = lazy\n def __call__(self, results):\n \"\"\"Performs the Resize augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed" + }, + { + "comment": "This code resizes images and keypoints based on the 'scale_factor' or scale provided. If 'scale_factor' is not already in the results, it initializes it with a default value of [1, 1]. The code then calculates new image width (new_w) and height (new_h) based on the scale or keep_ratio setting. It updates 'img_shape', 'keep_ratio', and 'scale_factor' in the results dictionary, and if not lazy, it resizes images and keypoints accordingly using the imresize function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":334-362", + "content": " to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'scale_factor' not in results:\n results['scale_factor'] = np.array([1, 1], dtype=np.float32)\n img_h, img_w = results['img_shape']\n if self.keep_ratio:\n new_w, new_h = rescale_size((img_w, img_h), self.scale)\n else:\n new_w, new_h = self.scale\n self.scale_factor = np.array([new_w / img_w, new_h / img_h],\n dtype=np.float32)\n results['img_shape'] = (new_h, new_w)\n results['keep_ratio'] = self.keep_ratio\n results['scale_factor'] = results['scale_factor'] * self.scale_factor\n if not self.lazy:\n if 'imgs' in results:\n results['imgs'] = [\n imresize(\n img, (new_w, new_h), interpolation=self.interpolation)\n for img in results['imgs']\n ]\n if 'keypoint' in results:\n results['keypoint'] = results['keypoint'] * self.scale_factor" + }, + { + "comment": "This code defines a class RandomRescale that performs random rescaling on images, maintaining the aspect ratio. It takes in a range for the short edge size, and an optional interpolation method. The class also has a lazy attribute to control whether the transformation is applied lazily or not. The class also includes an EntityBoxRescale function to rescale bounding boxes. 
The code ends with registering RandomRescale as a pipeline module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":363-392", + "content": " else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')\n lazyop['interpolation'] = self.interpolation\n #if 'gt_bboxes' in results:\n assert not self.lazy\n entity_box_rescale = EntityBoxRescale(self.scale_factor)\n results = entity_box_rescale(results)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'scale={self.scale}, keep_ratio={self.keep_ratio}, '\n f'interpolation={self.interpolation}, '\n f'lazy={self.lazy})')\n return repr_str\n@PIPELINES.register()\nclass RandomRescale:\n \"\"\"Randomly resize images so that the short_edge is resized to a specific\n size in a given range. The scale ratio is unchanged after resizing.\n \"\"\"\n def __init__(self, scale_range, interpolation='bilinear'):\n scale_range = eval(scale_range)\n self.scale_range = scale_range" + }, + { + "comment": "This code defines a Resize augmentation transform with random scaling range, keeps aspect ratio and applies specified interpolation. It also includes a __repr__ method to provide class name, scale range and short edge value for debugging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":394-422", + "content": " assert len(scale_range) == 2\n assert scale_range[0] < scale_range[1]\n assert np.all([x > 0 for x in scale_range])\n self.keep_ratio = True\n self.interpolation = interpolation\n def __call__(self, results):\n \"\"\"Performs the Resize augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n short_edge = np.random.randint(self.scale_range[0],\n self.scale_range[1] + 1)\n resize = Resize((-1, short_edge),\n keep_ratio=True,\n interpolation=self.interpolation,\n lazy=False)\n results = resize(results)\n results['short_edge'] = short_edge\n return results\n def __repr__(self):\n scale_range = self.scale_range\n repr_str = (f'{self.__class__.__name__}('\n f'scale_range=({scale_range[0]}, {scale_range[1]}), '" + }, + { + "comment": "This code defines a Rescale augmentation class for image processing in the PaddleVideo framework. It resizes images so that the short edge length is within a specified range, while maintaining the aspect ratio. The interpolation method can be set to 'nearest' or 'bilinear'. This augmentation modifies the 'imgs', 'img_shape', 'keep_ratio', 'scale_factor', 'resize_size', and 'short_edge' keys in the results dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":423-454", + "content": " f'interpolation={self.interpolation})')\n return repr_str\n@PIPELINES.register()\nclass Rescale:\n \"\"\"resize images so that the short_edge is resized to a specific\n size in a given range. The scale ratio is unchanged after resizing.\n Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified\n keys are \"imgs\", \"img_shape\", \"keep_ratio\", \"scale_factor\", \"resize_size\",\n \"short_edge\".\n Args:\n scale_range (tuple[int]): The range of short edge length. A closed\n interval.\n interpolation (str): Algorithm used for interpolation:\n \"nearest\" | \"bilinear\". 
Default: \"bilinear\".\n \"\"\"\n def __init__(self, scale_range, interpolation='bilinear'):\n scale_range = eval(scale_range)\n self.scale_range = scale_range\n self.keep_ratio = True\n self.interpolation = interpolation\n def __call__(self, results):\n \"\"\"Performs the Resize augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed" + }, + { + "comment": "This code defines a Resize transform and a RandomCrop_v2 class for image processing pipelines in PaddleVideo. The Resize transform scales images within a specified range and with optional interpolation, while the RandomCrop_v2 performs square random cropping on images to a specific output size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":455-486", + "content": " to the next transform in pipeline.\n \"\"\"\n resize = Resize(\n self.scale_range,\n keep_ratio=True,\n interpolation=self.interpolation,\n lazy=False)\n results = resize(results)\n return results\n def __repr__(self):\n scale_range = self.scale_range\n repr_str = (f'{self.__class__.__name__}('\n f'scale_range=({scale_range[0]}, {scale_range[1]}), '\n f'interpolation={self.interpolation})')\n return repr_str\n@PIPELINES.register()\nclass RandomCrop_v2:\n \"\"\"Vanilla square random crop that specifics the output size.\n Required keys in results are \"imgs\" and \"img_shape\", added or\n modified keys are \"imgs\", \"lazy\"; Required keys in \"lazy\" are \"flip\",\n \"crop_bbox\", added or modified key is \"crop_bbox\".\n Args:\n size (int): The output size of the images.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n def __init__(self, size, lazy=False):" + }, + { + "comment": "This code defines a class with an __init__ method that checks if the input 'size' is an integer, and a __call__ method to perform random cropping. The __call__ method takes a dictionary of results and performs random cropping based on the size attribute of the class instance. It asserts that the size is less than or equal to the image height and width, then randomly selects y and x offsets for cropping. If 'crop_quadruple' is not in the results dictionary, it adds a new entry with initial values. Finally, it calculates ratios for cropping based on the input size and image dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":487-516", + "content": " if not isinstance(size, int):\n raise TypeError(f'Size must be an int, but got {type(size)}')\n self.size = size\n self.lazy = lazy\n def __call__(self, results):\n \"\"\"Performs the RandomCrop augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n img_h, img_w = results['img_shape']\n assert self.size <= img_h and self.size <= img_w\n y_offset = 0\n x_offset = 0\n if img_h > self.size:\n y_offset = int(np.random.randint(0, img_h - self.size))\n if img_w > self.size:\n x_offset = int(np.random.randint(0, img_w - self.size))\n if 'crop_quadruple' not in results:\n results['crop_quadruple'] = np.array(\n [0, 0, 1, 1], # x, y, w, h\n dtype=np.float32)\n x_ratio, y_ratio = x_offset / img_w, y_offset / img_h\n w_ratio, h_ratio = self.size / img_w, self.size / img_h" + }, + { + "comment": "This code segment is adjusting the crop quadruple, calculating a new crop bounding box, and updating the image shape based on provided offsets. 
It also handles lazy loading if enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":518-543", + "content": " old_crop_quadruple = results['crop_quadruple']\n old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]\n old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]\n new_crop_quadruple = [\n old_x_ratio + x_ratio * old_w_ratio,\n old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,\n h_ratio * old_x_ratio\n ]\n results['crop_quadruple'] = np.array(\n new_crop_quadruple, dtype=np.float32)\n new_h, new_w = self.size, self.size\n results['crop_bbox'] = np.array(\n [x_offset, y_offset, x_offset + new_w, y_offset + new_h])\n results['img_shape'] = (new_h, new_w)\n if not self.lazy:\n results['imgs'] = [\n img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]\n for img in results['imgs']\n ]\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')" + }, + { + "comment": "This code section is responsible for applying augmentations to video frames, specifically crop and flip operations. It takes input parameters such as image size, whether it should be applied lazily or not, and the direction of flipping (if applicable). The code adjusts the crop region based on the specified offset values and stores them in the 'crop_bbox' field of the lazy operation dictionary. Additionally, if there are entity boxes present, they will also be processed according to the applied crop and flip operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":545-570", + "content": " # record crop_bbox in lazyop dict to ensure only crop once in Fuse\n lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']\n left = x_offset * (lazy_right - lazy_left) / img_w\n right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w\n top = y_offset * (lazy_bottom - lazy_top) / img_h\n bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h\n lazyop['crop_bbox'] = np.array(\n [(lazy_left + left), (lazy_top + top), (lazy_left + right),\n (lazy_top + bottom)],\n dtype=np.float32)\n # Process entity boxes\n if 'gt_bboxes' in results:\n assert not self.lazy\n entity_box_crop = EntityBoxCrop(results['crop_bbox'])\n results = entity_box_crop(results)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}(size={self.size}, '\n f'lazy={self.lazy})')\n return repr_str\ndef imflip_(img, direction='horizontal'):" + }, + { + "comment": "The provided code contains three functions: `inplace_flip`, `iminvert`, and a pipeline class called `Flip`. \n\n`inplace_flip` takes an image (`ndarray`) and the direction for flipping (horizontal, vertical or diagonal), asserts that the direction is valid, and returns the flipped image in-place. If the direction is horizontal, it uses `cv2.flip()` with parameter 1 to flip horizontally; if the direction is vertical, it uses `cv2.flip()` with parameter 0 to flip vertically; if the direction is diagonal, it uses `cv2.flip()` with parameter -1 for diagonal flipping.\n\n`iminvert` takes an image (`ndarray`) and returns its negative (inverted) version by subtracting the original image from a numpy array of full value 255 (the maximum possible value for an 8-bit image). This effectively reverses all pixel intensities in the image.\n\nThe `Flip` class is a pipeline module that flips the input images with a certain probability. 
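(In the same spirit, a small standalone sketch of probability-based clip flipping; unlike the in-place helper described below, it returns new frames, and the function name is illustrative.)

```python
import numpy as np
import cv2

def flip_clip(imgs, flip_ratio=0.5, direction='horizontal'):
    """Flip every frame of a clip with probability `flip_ratio`."""
    flip = np.random.rand() < flip_ratio
    if flip:
        code = 1 if direction == 'horizontal' else 0  # cv2: 1 = horizontal, 0 = vertical
        imgs = [cv2.flip(img, code) for img in imgs]
    return imgs, flip
```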
It requires keys \"imgs\", \"img_shape\", and \"modality\" (although it does not modify them) and adds no new keys.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":571-608", + "content": " \"\"\"Inplace flip an image horizontally or vertically.\n Args:\n img (ndarray): Image to be flipped.\n direction (str): The flip direction, either \"horizontal\" or\n \"vertical\" or \"diagonal\".\n Returns:\n ndarray: The flipped image (inplace).\n \"\"\"\n assert direction in ['horizontal', 'vertical', 'diagonal']\n if direction == 'horizontal':\n return cv2.flip(img, 1, img)\n elif direction == 'vertical':\n return cv2.flip(img, 0, img)\n else:\n return cv2.flip(img, -1, img)\ndef iminvert(img):\n \"\"\"Invert (negate) an image.\n Args:\n img (ndarray): Image to be inverted.\n Returns:\n ndarray: The inverted image.\n \"\"\"\n return np.full_like(img, 255) - img\n@PIPELINES.register()\nclass Flip:\n \"\"\"Flip the input images with a probability.\n Reverse the order of elements in the given imgs with a specific direction.\n The shape of the imgs is preserved, but the elements are reordered.\n Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified" + }, + { + "comment": "This code defines a Flip augmentation class for image processing in PaddleVideo. It takes flip ratio, direction (horizontal or vertical), and lazy operation as parameters. The flip_ratio determines the probability of applying the flip transformation, while direction specifies whether to flip horizontally or vertically. If the 'lazy' parameter is True, the transformation will be applied lazily. This augmentation should be placed after cropping/reshaping transformations for proper crop_quadruple calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":609-630", + "content": " keys are \"imgs\", \"lazy\" and \"flip_direction\". Required keys in \"lazy\" is\n None, added or modified key are \"flip\" and \"flip_direction\". The Flip\n augmentation should be placed after any cropping / reshaping augmentations,\n to make sure crop_quadruple is calculated properly.\n Args:\n flip_ratio (float): Probability of implementing flip. Default: 0.5.\n direction (str): Flip imgs horizontally or vertically. Options are\n \"horizontal\" | \"vertical\". Default: \"horizontal\".\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n _directions = ['horizontal', 'vertical']\n def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False):\n if direction not in self._directions:\n raise ValueError(f'Direction {direction} is not supported. '\n f'Currently support ones are {self._directions}')\n self.flip_ratio = flip_ratio\n self.direction = direction\n self.lazy = lazy\n def __call__(self, results):" + }, + { + "comment": "The code snippet performs a Flip augmentation on images, randomly flipping them horizontally based on a given flip ratio. It also sets the 'flip' and 'flip_direction' keys in the results dictionary. If the 'lazy' option is not used (self.lazy), it iterates through the images, applying the flip transformation if necessary. 
It also handles 'gt_bboxes' if they exist in the results dictionary, ensuring horizontal flips are applied correctly without any issues.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":631-659", + "content": " \"\"\"Performs the Flip augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n flip = np.random.rand() < self.flip_ratio\n results['flip'] = flip\n results['flip_direction'] = self.direction\n if not self.lazy:\n if flip:\n for i, img in enumerate(results['imgs']):\n imflip_(img, self.direction)\n lt = len(results['imgs'])\n else:\n results['imgs'] = list(results['imgs'])\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Use one Flip please')\n lazyop['flip'] = flip\n lazyop['flip_direction'] = self.direction\n if 'gt_bboxes' in results and flip:\n assert not self.lazy and self.direction == 'horizontal'\n entity_box_flip = EntityBoxFlip(results['img_shape'])" + }, + { + "comment": "This code contains a class for image augmentation, including flip ratio and direction, with a method to normalize an image using mean and std values. It also includes an inplace normalization function that converts BGR to RGB if necessary. The `__repr__` method returns a string representation of the class attributes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":660-692", + "content": " results = entity_box_flip(results)\n return results\n def __repr__(self):\n repr_str = (\n f'{self.__class__.__name__}('\n f'flip_ratio={self.flip_ratio}, direction={self.direction}, '\n f'lazy={self.lazy})')\n return repr_str\ndef imnormalize_(img, mean, std, to_rgb=True):\n \"\"\"Inplace normalize an image with mean and std.\n Args:\n img (ndarray): Image to be normalized.\n mean (ndarray): The mean to be used for normalize.\n std (ndarray): The std to be used for normalize.\n to_rgb (bool): Whether to convert to rgb.\n Returns:\n ndarray: The normalized image.\n \"\"\"\n # cv2 inplace normalization does not accept uint8\n assert img.dtype != np.uint8\n mean = np.float64(mean.reshape(1, -1))\n stdinv = 1 / np.float64(std.reshape(1, -1))\n if to_rgb:\n cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace\n cv2.subtract(img, mean, img) # inplace\n cv2.multiply(img, stdinv, img) # inplace\n return img" + }, + { + "comment": "This code defines a class called \"Normalize\" that normalizes images based on given mean and std values. It can also convert channels from RGB to BGR if necessary. Additionally, it adjusts flow magnitude when modality is 'Flow' with an optional adjust_magnitude parameter. The class requires keys \"imgs\", \"img_shape\", \"modality\" with additional keys \"imgs\" and \"img_norm_cfg\" being added or modified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":695-719", + "content": "@PIPELINES.register()\nclass Normalize:\n \"\"\"Normalize images with the given mean and std value.\n Required keys are \"imgs\", \"img_shape\", \"modality\", added or modified\n keys are \"imgs\" and \"img_norm_cfg\". 
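(A hedged, out-of-place sketch of the per-channel normalization the next snippet performs in place: optional BGR-to-RGB conversion, then (img - mean) / std.)

```python
import numpy as np
import cv2

def normalize_frame(img, mean, std, to_rgb=True):
    """Per-channel (img - mean) / std, converting BGR->RGB first if asked."""
    img = img.astype(np.float32)
    if to_rgb:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    mean = np.asarray(mean, dtype=np.float32).reshape(1, 1, -1)
    std = np.asarray(std, dtype=np.float32).reshape(1, 1, -1)
    return (img - mean) / std
```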
If modality is 'Flow', additional\n keys \"scale_factor\" is required\n Args:\n mean (Sequence[float]): Mean values of different channels.\n std (Sequence[float]): Std values of different channels.\n to_bgr (bool): Whether to convert channels from RGB to BGR.\n Default: False.\n adjust_magnitude (bool): Indicate whether to adjust the flow magnitude\n on 'scale_factor' when modality is 'Flow'. Default: False.\n \"\"\"\n def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False):\n if not isinstance(mean, Sequence):\n raise TypeError(\n f'Mean must be list, tuple or np.ndarray, but got {type(mean)}')\n if not isinstance(std, Sequence):\n raise TypeError(\n f'Std must be list, tuple or np.ndarray, but got {type(std)}')" + }, + { + "comment": "This code defines an augmentation pipeline for image normalization in AVA. It initializes mean, std, and to_bgr values, and then applies the normalization transformation to each input image. The normalized images are stored in 'imgs' and the configuration is saved in 'img_norm_cfg'. The __repr__ method provides a string representation of the object's state.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py\":721-748", + "content": " self.mean = np.array(mean, dtype=np.float32)\n self.std = np.array(std, dtype=np.float32)\n self.to_bgr = to_bgr\n self.adjust_magnitude = adjust_magnitude\n def __call__(self, results):\n n = len(results['imgs'])\n h, w, c = results['imgs'][0].shape\n imgs = np.empty((n, h, w, c), dtype=np.float32)\n for i, img in enumerate(results['imgs']):\n imgs[i] = img\n for img in imgs:\n imnormalize_(img, self.mean, self.std, self.to_bgr)\n results['imgs'] = imgs\n results['img_norm_cfg'] = dict(\n mean=self.mean, std=self.std, to_bgr=self.to_bgr)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'mean={self.mean}, '\n f'std={self.std}, '\n f'to_bgr={self.to_bgr}, '\n f'adjust_magnitude={self.adjust_magnitude})')\n return repr_str" + } + ] +} \ No newline at end of file diff --git a/docs/doc/23cc0f4d-18b7-4a32-a685-092f8cd2c6b8.json b/docs/doc/23cc0f4d-18b7-4a32-a685-092f8cd2c6b8.json new file mode 100644 index 000000000..9d815c764 --- /dev/null +++ b/docs/doc/23cc0f4d-18b7-4a32-a685-092f8cd2c6b8.json @@ -0,0 +1,15 @@ +{ + "summary": "FastRCNN is a two-stage object detection class inheriting from TwoStageDetector, created with specified head, train and test configurations, and optional pretrained weights.", + "details": [ + { + "comment": "Defines the FastRCNN class, a two-stage detector that inherits from TwoStageDetector. It takes backbone, head, train_cfg, test_cfg, neck, and pretrained as parameters for object detection.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/fast_rcnn.py\":0-29", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .two_stage import TwoStageDetector\nfrom ...registry import DETECTORS\n@DETECTORS.register()\nclass FastRCNN(TwoStageDetector):\n def __init__(self,\n backbone,\n head=None,\n train_cfg=None,\n test_cfg=None,\n neck=None,\n pretrained=None):\n super(FastRCNN, self).__init__(\n backbone=backbone,\n neck=neck," + }, + { + "comment": "Creates a Fast R-CNN detector with specified head, train and test configurations, and optionally pretrained weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/fast_rcnn.py\":30-33", + "content": " roi_head=head,\n train_cfg=train_cfg,\n test_cfg=test_cfg,\n pretrained=pretrained)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/24375566-77e4-4025-9c4f-d6ce173bd1c8.json b/docs/doc/24375566-77e4-4025-9c4f-d6ce173bd1c8.json new file mode 100644 index 000000000..ef65b3bdd --- /dev/null +++ b/docs/doc/24375566-77e4-4025-9c4f-d6ce173bd1c8.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines a localizer model for PaddleVideo with forward network and methods for training, validating, testing, and inferring. It uses input data to predict bounding boxes, start position, and end position while calculating loss using ground truth values.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library and defines a BMNLocalizer class, which is a localization framework. It includes a forward_net method for calling the backbone's forward function and a train_step method for handling training steps with input data. The gt_iou_map, gt_start, and gt_end are provided as part of the data batch to be used in the training step.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/bmn_localizer.py\":0-35", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import LOCALIZERS\nfrom .base import BaseLocalizer\nimport paddle\n@LOCALIZERS.register()\nclass BMNLocalizer(BaseLocalizer):\n \"\"\"BMN Localization framework\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Call backbone forward.\n \"\"\"\n preds = self.backbone(imgs)\n return preds\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n x_data = data_batch[0]\n gt_iou_map = data_batch[1]\n gt_start = data_batch[2]\n gt_end = data_batch[3]\n gt_iou_map.stop_gradient = True" + }, + { + "comment": "This code defines a localizer model for PaddleVideo. 
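(A condensed sketch of the training-step flow summarized here, assuming a model object that exposes `forward_net` and `loss` as in the quoted class; the real implementation also marks the ground-truth tensors with `stop_gradient`.)

```python
import paddle

def train_step(model, data_batch):
    """Forward the backbone, compute the loss against the ground-truth
    maps, and return the batch-averaged value in a metrics dict."""
    x_data, gt_iou_map, gt_start, gt_end = data_batch
    pred_bm, pred_start, pred_end = model.forward_net(x_data)
    loss = model.loss(pred_bm, pred_start, pred_end,
                      gt_iou_map, gt_start, gt_end)
    return {'loss': paddle.mean(loss)}
```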
It includes functions for training, validating, testing, and inferring steps. The localizer has a forward network which takes input data and returns predictions for bounding boxes (pred_bm), start position (pred_start), and end position (pred_end). Loss is calculated using the provided ground truth values (gt_iou_map, gt_start, gt_end) and averaged over the batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/bmn_localizer.py\":36-68", + "content": " gt_start.stop_gradient = True\n gt_end.stop_gradient = True\n # call Model forward\n pred_bm, pred_start, pred_end = self.forward_net(x_data)\n # call Loss forward\n loss = self.loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start,\n gt_end)\n avg_loss = paddle.mean(loss)\n loss_metrics = dict()\n loss_metrics['loss'] = avg_loss\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n x_data = data_batch[0]\n pred_bm, pred_start, pred_end = self.forward_net(x_data)\n return pred_bm, pred_start, pred_end\n def infer_step(self, data_batch):\n \"\"\"Infer step\n \"\"\"\n x_data = data_batch[0]\n # call Model forward\n pred_bm, pred_start, pred_end = self.forward_net(x_data)\n return pred_bm, pred_start, pred_end" + } + ] +} \ No newline at end of file diff --git a/docs/doc/249d4907-6cf9-47a6-8199-4eb5c9f53708.json b/docs/doc/249d4907-6cf9-47a6-8199-4eb5c9f53708.json new file mode 100644 index 000000000..9a892a02a --- /dev/null +++ b/docs/doc/249d4907-6cf9-47a6-8199-4eb5c9f53708.json @@ -0,0 +1,35 @@ +{ + "summary": "This code imports libraries, initializes an experiment, defines functions for training a video analysis model, handles command-line arguments and ensures checkpoints are saved before running the training process.", + "details": [ + { + "comment": "The code imports necessary libraries, modules and packages for the PaddleVideo project. It also handles copyright and license information, sets seeds to ensure reproducibility, and includes utility functions for logging, model training, and data loading. It defines a Trainer class and an evaluation function, as well as parsing configuration files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/train.py\":0-34", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport time\nimport copy\nimport socket\nimport paddle\nimport argparse\nimport warnings\nimport numpy as np\nimport model.loss as module_loss\nimport model.model as module_arch\nimport model.metric as module_metric\nimport data_loader.data_loaders as module_data\nfrom pathlib import Path\nfrom utils import set_seeds\nfrom trainer import Trainer\nfrom test import evaluation\nfrom mergedeep import merge, Strategy\nfrom parse_config import ConfigParser\nfrom logger.log_parser import log_summary\nfrom utils import compute_dims, compute_trn_config" + }, + { + "comment": "This code snippet defines a function `run_exp()` that initializes an experiment. It sets the random seed, initializes the model (arch) and data loaders based on the given configuration. The seeds are obtained from command line arguments, and for each seed, it logs information about the setting and proceeds with the experiment initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/train.py\":36-66", + "content": "def run_exp(config):\n warnings.filterwarnings('ignore')\n logger = config.get_logger('train')\n expert_dims, raw_input_dims = compute_dims(config, logger)\n trn_config = compute_trn_config(config)\n if config._args.group_seed:\n seeds = [int(config._args.group_seed)]\n else:\n seeds = [int(x) for x in config._args.seeds.split(\",\")]\n for ii, seed in enumerate(seeds):\n tic = time.time()\n logger.info(f\"{ii + 1}/{len(seeds)} Setting experiment random seed to {seed}\")\n set_seeds(seed)\n config[\"seed\"] = seed\n model = config.init(\n name='arch',\n module=module_arch,\n expert_dims=expert_dims,\n text_dim=config[\"experts\"][\"text_dim\"],\n ce_shared_dim=config[\"experts\"].get(\"ce_shared_dim\", None),\n feat_aggregation=config[\"data_loader\"][\"args\"][\"feat_aggregation\"],\n )\n logger.info(model)\n data_loaders = config.init(\n name='data_loader',\n module=module_data," + }, + { + "comment": "Initializing a model with specific configurations and defining the loss function, metrics to track progress, learning rate scheduler for dynamic adjustments, and an optimizer (AdamW) to update model parameters. 
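(For readers unfamiliar with these Paddle training utilities, a minimal sketch of the scheduler/optimizer pairing; the Linear layer is only a placeholder for the model built from the config, and the hyperparameter values mirror the snippet quoted below.)

```python
import paddle

model = paddle.nn.Linear(16, 1)  # placeholder network

lr_scheduler = paddle.optimizer.lr.StepDecay(learning_rate=1e-4, step_size=5, gamma=0.9)
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    weight_decay=1e-4,
    parameters=model.parameters(),
    grad_clip=paddle.nn.ClipGradByGlobalNorm(2.0))

# per batch: optimizer.clear_grad(); loss.backward(); optimizer.step()
# per epoch: lr_scheduler.step()
```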
Also creating a Trainer instance which combines all these components for training the model on given data loaders.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/train.py\":67-91", + "content": " logger=logger,\n raw_input_dims=raw_input_dims,\n text_feat=config[\"experts\"][\"text_feat\"],\n text_dim=config[\"experts\"][\"text_dim\"],\n text_agg=config[\"experts\"][\"text_agg\"],\n use_zeros_for_missing=config[\"experts\"].get(\"use_zeros_for_missing\", False),\n eval_only=False,\n )\n loss = config.init(name=\"loss\", module=module_loss)\n metrics = [getattr(module_metric, met) for met in config['metrics']]\n lr_scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.0001, step_size=5, gamma=0.9)\n optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler, weight_decay=1e-4, parameters=model.parameters(), grad_clip=paddle.nn.ClipGradByGlobalNorm(2))\n trainer = Trainer(\n model,\n loss,\n metrics,\n optimizer,\n config=config,\n data_loaders=data_loaders,\n lr_scheduler=lr_scheduler,\n mini_train=config._args.mini_train,\n visualizer=None," + }, + { + "comment": "This code sets up a trainer with specified configuration, trains the model, saves the best model at 'best_model_path', logs training duration, reports relevant statistics if multiple runs were conducted, and prints the log file location.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/train.py\":92-114", + "content": " val_freq=config[\"trainer\"].get(\"val_freq\", 1),\n force_cpu_val=config.get(\"force_cpu_val\", False),\n skip_first_n_saves=config[\"trainer\"].get(\"skip_first_n_saves\", 0),\n include_optim_in_save_model=config[\"trainer\"].get(\"include_optim_in_save_model\", 1),\n cache_targets=set(config.get(\"cache_targets\", [])),\n )\n trainer.train()\n best_model_path = config.save_dir / \"trained_model.pdparams\"\n duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))\n logger.info(f\"Training took {duration}\")\n # If multiple runs were conducted, report relevant statistics\n if len(seeds) > 1:\n log_summary(\n logger=logger,\n log_path=config.log_path,\n eval_mode=config[\"eval_mode\"],\n fixed_num_epochs=config[\"trainer\"][\"epochs\"],\n )\n print(f\"Log file stored at {config.log_path}\")\n # Report the location of the \"best\" model of the final seeded run (here\n # \"best\" corresponds to the model with the highest geometric mean over the" + }, + { + "comment": "This code defines the command-line arguments for the training script of a video analysis application. 
The arguments include the config file path, resuming from a previous model, a mini-train option for shortened runs, grouping experiments by ID, disabling workers, refreshing the LRU cache, training a single epoch, purging existing experiments, and debugging options.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/train.py\":115-132", + "content": " # R@1, R@5 and R@10 metrics when a validation set is used, or simply the final\n # epoch of training for fixed-length schedules).\n print(f\"The best performing model can be found at {str(best_model_path)}\")\ndef main():\n args = argparse.ArgumentParser(description='Main entry point for training')\n args.add_argument('--config', help='config file path')\n args.add_argument('--resume', help='path to latest model (default: None)')\n args.add_argument('--mini_train', action=\"store_true\")\n args.add_argument('--group_id', help=\"if supplied, group these experiments\")\n args.add_argument('--disable_workers', action=\"store_true\")\n args.add_argument('--refresh_lru_cache', action=\"store_true\")\n args.add_argument('--train_single_epoch', action=\"store_true\")\n args.add_argument('--purge_exp_dir', action=\"store_true\",\n help=\"remove all previous experiments with the given config\")\n args.add_argument(\"--dbg\", default=\"ipdb.set_trace\")\n args.add_argument(\"--custom_args\", help=\"qualified key,val pairs\")" + }, + { + "comment": "This code parses command-line arguments for seeds, sets environment variables, and asserts that the number of training epochs is at least the save period so that checkpoints are saved. The function run_exp is then called with these configuration settings, and the main function is executed if the script is run directly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/train.py\":134-150", + "content": " # Seeds can either be passed directly as a comma separated list at the command line,\n # or individually for separate experiments as a group (used for slurm experiments)\n seed_args = args.add_mutually_exclusive_group()\n seed_args.add_argument('--seeds', default=\"0\", help=\"comma separated list of seeds\")\n seed_args.add_argument('--group_seed', help=\"seed for group member\")\n args = ConfigParser(args)\n os.environ[\"PYTHONBREAKPOINT\"] = args._args.dbg\n args[\"data_loader\"][\"args\"][\"refresh_lru_cache\"] = args._args.refresh_lru_cache\n msg = (f\"Expected the number of training epochs ({args['trainer']['epochs']})\"\n f\"to exceed the save period ({args['trainer']['save_period']}), otherwise\"\n \" no checkpoints will be saved.\")\n assert args[\"trainer\"][\"epochs\"] >= args[\"trainer\"][\"save_period\"], msg\n run_exp(config=args)\nif __name__ == '__main__':\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/24e4e098-d91f-4056-ab8a-129403184220.json b/docs/doc/24e4e098-d91f-4056-ab8a-129403184220.json new file mode 100644 index 000000000..adbb80f5d --- /dev/null +++ b/docs/doc/24e4e098-d91f-4056-ab8a-129403184220.json @@ -0,0 +1,25 @@ +{ + "summary": "The code provides logging utilities for PaddleVideo's Video Quality Assessment app, enabling logging for distributed apps with rank-based output to file or console. It initializes loggers, disables propagation to the root logger, and supports \"INFO\" and \"DEBUG\" verbosity levels.", + "details": [ + { + "comment": "This code snippet is from PaddleVideo's Video Quality Assessment application, and it contains logging utilities for formatting and emitting log messages. 
The module imports logging, os, sys, datetime, and ParallelEnv, and defines ANSI color codes plus a coloring function for colored output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py\":0-39", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport logging\nimport os\nimport sys\nimport datetime\nfrom paddle.distributed import ParallelEnv\nColor = {\n 'RED': '\\033[31m',\n 'HEADER': '\\033[35m', # deep purple\n 'PURPLE': '\\033[95m', # purple\n 'OKBLUE': '\\033[94m',\n 'OKGREEN': '\\033[92m',\n 'WARNING': '\\033[93m',\n 'FAIL': '\\033[91m',\n 'ENDC': '\\033[0m'\n}\ndef coloring(message, color=\"OKGREEN\"):\n \"\"\"coloring\"\"\"\n assert color in Color.keys()" + }, + { + "comment": "Function `setup_logger` initializes the paddlevideo logger and sets its verbosity level to \"INFO\". It takes optional arguments for output file name or directory, and root module name. If the verbosity level is set to \"DEBUG\", the logger will have a lower threshold for logging messages. The function also disables propagation of log events to the root logger.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py\":40-73", + "content": " if os.environ.get('COLORING', True):\n return Color[color] + str(message) + Color[\"ENDC\"]\n else:\n return message\nlogger_initialized = []\ndef setup_logger(output=None, name=\"paddlevideo\", level=\"INFO\"):\n \"\"\"\n Initialize the paddlevideo logger and set its verbosity level to \"INFO\".\n Args:\n output (str): a file name or a directory to save log. If None, will not save log file.\n If ends with \".txt\" or \".log\", assumed to be a file name.\n Otherwise, logs will be saved to `output/log.txt`.\n name (str): the root module name of this logger\n Returns:\n logging.Logger: a logger\n \"\"\"\n def time_zone(sec, fmt):\n real_time = datetime.datetime.now()\n return real_time.timetuple()\n logging.Formatter.converter = time_zone\n logger = logging.getLogger(name)\n if level == \"INFO\":\n logger.setLevel(logging.INFO)\n elif level==\"DEBUG\":\n logger.setLevel(logging.DEBUG)\n logger.propagate = False\n if level == \"DEBUG\":\n plain_formatter = logging.Formatter(" + }, + { + "comment": "This code sets up the log handlers for a distributed application. Based on the local rank of each process, it attaches a stdout handler (rank 0 only) and, when an output path is given, a file handler for every worker. If the output path is a directory rather than a file name, logs are written to a \".log.txt\" file inside that directory. The log files for different ranks are distinguished by appending the rank number. 
If the directory doesn't exist, it creates one before writing the logs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py\":74-102", + "content": " \"[%(asctime)s] %(name)s %(levelname)s: %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n else:\n plain_formatter = logging.Formatter(\n \"[%(asctime)s] %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n # stdout logging: master only\n local_rank = ParallelEnv().local_rank\n if local_rank == 0:\n ch = logging.StreamHandler(stream=sys.stdout)\n ch.setLevel(logging.DEBUG)\n formatter = plain_formatter\n ch.setFormatter(formatter)\n logger.addHandler(ch)\n # file logging: all workers\n if output is not None:\n if output.endswith(\".txt\") or output.endswith(\".log\"):\n filename = output\n else:\n filename = os.path.join(output, \".log.txt\")\n if local_rank > 0:\n filename = filename + \".rank{}\".format(local_rank)\n # PathManager.mkdirs(os.path.dirname(filename))\n os.makedirs(os.path.dirname(filename), exist_ok=True)\n # fh = logging.StreamHandler(_cached_log_stream(filename)\n fh = logging.FileHandler(filename, mode='a')" + }, + { + "comment": "This function `get_logger` sets up a logger with the given name. If the logger has already been initialized, it simply returns the existing logger. Otherwise, it calls `setup_logger` to initialize the logger with the given name and optional output. The logger is configured to handle debug level messages using plain formatter and this configuration is appended to the list of initialized loggers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py\":103-116", + "content": " fh.setLevel(logging.DEBUG)\n fh.setFormatter(plain_formatter)\n logger.addHandler(fh)\n logger_initialized.append(name)\n return logger\ndef get_logger(name, output=None):\n \"\"\"get logger\"\"\"\n logger = logging.getLogger(name)\n if name in logger_initialized:\n return logger\n return setup_logger(name=name, output=name)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/25008972-aec6-44bf-b83e-21babe2c3e0c.json b/docs/doc/25008972-aec6-44bf-b83e-21babe2c3e0c.json new file mode 100644 index 000000000..6fc3c84ad --- /dev/null +++ b/docs/doc/25008972-aec6-44bf-b83e-21babe2c3e0c.json @@ -0,0 +1,40 @@ +{ + "summary": "The code defines a ResNeXt-101 model in PaddlePaddle, including downsample and residual blocks, BottleneckBlock, performs convolutions, activation, max pooling on input image.", + "details": [ + { + "comment": "This code defines a ConvBNLayer class in PaddlePaddle, which is a convolution-batch normalization layer. It takes inputs like num_channels, num_filters, filter_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, and name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py\":0-30", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom functools import partial\nimport paddle\nclass ConvBNLayer(paddle.nn.Layer):\n def __init__(self,\n num_channels,\n num_filters,\n filter_size,\n stride=1,\n padding=0,\n dilation=1,\n groups=1,\n padding_mode='zeros',\n weight_attr=None,\n bias_attr=None,\n name=None," + }, + { + "comment": "This code defines a ConvBNLayer class with specified parameters for convolutional and batch normalization layers. The convolutional layer uses Kaiming Normal initialization, while the batch normalization layer has fixed scales and offsets initialized to 1 and 0 respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py\":31-54", + "content": " data_format=\"NCDHW\"):\n super(ConvBNLayer, self).__init__()\n self._conv = paddle.nn.Conv3D(\n in_channels=num_channels,\n out_channels=num_filters,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n dilation=dilation,\n groups=groups,\n padding_mode=padding_mode,\n weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.KaimingNormal(\n fan_in=num_filters * filter_size * filter_size), name=name+'_weights'),\n bias_attr=bias_attr,\n data_format=data_format)\n bn_name = \"bn_\" + name\n self._batch_norm = paddle.nn.BatchNorm3D(\n num_filters,\n momentum=0.9,\n epsilon=1e-05,\n weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(\n 1.), name=bn_name + '_scale'),\n bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(\n 0.), name=bn_name + '_offset')," + }, + { + "comment": "This code defines a BottleneckBlock class and a downsample function for the ResNeXt101 model in PaddlePaddle. The BottleneckBlock has an expansion factor of 2 and uses ConvBNLayer for convolution and batch normalization. 
The downsample helper applies average pooling and then concatenates zero padding along the channel axis to reach the target width.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py\":55-81", + "content": " data_format=data_format)\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n return y\ndef _downsample_basic_block(self, x, planes, stride):\n out = paddle.nn.functional.avg_pool3d(x, kernel_size=1, stride=stride)\n shape = out.shape\n zero_pads = paddle.zeros(shape=[shape[0], planes - shape[1], shape[2], shape[3], shape[4]],\n dtype='float32')\n out = paddle.concat(x=[out, zero_pads], axis=1)\nclass BottleneckBlock(paddle.nn.Layer):\n expansion = 2\n def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None, name=None):\n super(BottleneckBlock, self).__init__()\n mid_planes = cardinality * int(planes / 32)\n self.conv0 = ConvBNLayer(\n inplanes, mid_planes, filter_size=1, bias_attr=False, name=name+'_branch2a')\n self.conv1 = ConvBNLayer(mid_planes, mid_planes, filter_size=3, stride=stride,\n padding=1, groups=cardinality, bias_attr=False, name=name+'_branch2b')" + }, + { + "comment": "This code defines a ResNeXt model. The class ResNeXt has an initialization that sets inplanes to 64 and inherits from paddle.nn.Layer. It contains a convolution layer (conv) with 3 input channels, 64 output channels, filter size of 7, and stride of (1,2,2). The residual block type is passed in through the 'block' argument, along with layers, shortcut_type (defaults to 'B'), and cardinality parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py\":82-121", + "content": " self.conv2 = ConvBNLayer(mid_planes, planes * self.expansion,\n filter_size=1, bias_attr=False, name=name+'_branch2c')\n self.downsample = downsample\n self.stride = stride\n self.relu = paddle.nn.ReLU()\n def forward(self, x):\n residual = x\n out = self.conv0(x)\n out = self.relu(out)\n out = self.conv1(out)\n out = self.relu(out)\n out = self.conv2(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNeXt(paddle.nn.Layer):\n def __init__(self,\n block,\n layers,\n shortcut_type='B',\n cardinality=32):\n self.inplanes = 64\n super(ResNeXt, self).__init__()\n self.conv = ConvBNLayer(\n 3,\n 64,\n filter_size=7,\n stride=(1, 2, 2),\n padding=(3, 3, 3),\n bias_attr=False,\n name=\"res_conv1\"" + }, + { + "comment": "The code defines a ResNext101 backbone for a deep learning model. It includes a ReLU activation function and max pooling operation, followed by four residual stages (layer1 to layer4) built with 128, 256, 512 and 1024 planes respectively. 
The _make_layer method is used to create the layers, with options for downsampling and varying expansion rates.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py\":122-147", + "content": " )\n self.relu = paddle.nn.ReLU()\n self.maxpool = paddle.nn.MaxPool3D(kernel_size=(3, 3, 3), stride=2, padding=1)\n self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type,\n cardinality, stride=1, name='layer1')\n self.layer2 = self._make_layer(\n block, 256, layers[1], shortcut_type, cardinality, stride=2, name='layer2')\n self.layer3 = self._make_layer(\n block, 512, layers[2], shortcut_type, cardinality, stride=2, name='layer3')\n self.layer4 = self._make_layer(\n block, 1024, layers[3], shortcut_type, cardinality, stride=2, name='layer4')\n self.avgpool = paddle.nn.AvgPool3D((2, 1, 1), stride=1, exclusive=False)\n def _make_layer(self,\n block,\n planes,\n blocks,\n shortcut_type,\n cardinality,\n stride=1,\n name=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:" + }, + { + "comment": "This code defines a ResNeXt-101 model, implementing its downsample and residual blocks. It takes an input image, performs convolutions, applies ReLU activation, and max pooling before passing through the specified number of residual blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py\":148-175", + "content": " if shortcut_type == 'A':\n downsample = partial(self._downsample_basic_block,\n planes=planes * block.expansion,\n stride=stride)\n else:\n downsample = ConvBNLayer(\n self.inplanes,\n planes * block.expansion,\n 1,\n stride=stride,\n bias_attr=False,\n name=name+'downsample'\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, cardinality, stride, downsample, name=name+'_downsample'))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(block(self.inplanes, planes,\n cardinality, name=name+'_res_block'+str(i)))\n return paddle.nn.Sequential(*layers)\n def forward(self, x):\n x = self.conv(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)" + }, + { + "comment": "The ResNext101 function constructs a ResNeXt-101 model using BottleneckBlock and the specified block configurations. It applies the layer2, layer3, and layer4 operations to x before returning the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnext101.py\":176-186", + "content": " x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x\ndef ResNext101():\n \"\"\"Constructs a ResNext-101 model.\n \"\"\"\n model = ResNeXt(BottleneckBlock, [3, 4, 23, 3])\n return model" + } + ] +} \ No newline at end of file diff --git a/docs/doc/254f5728-b6b4-46fc-9bc5-c8740d72e8a4.json b/docs/doc/254f5728-b6b4-46fc-9bc5-c8740d72e8a4.json new file mode 100644 index 000000000..2629d7934 --- /dev/null +++ b/docs/doc/254f5728-b6b4-46fc-9bc5-c8740d72e8a4.json @@ -0,0 +1,30 @@ +{ + "summary": "This code creates PaddleVideo dataset loaders and sets up signal handlers for graceful termination of a process group upon receiving SIGINT or SIGTERM signals.", + "details": [ + { + "comment": "This code snippet is a part of the PaddleVideo library and contains a function named build_pipeline. It imports various modules, defines a logger for logging purposes, and uses a function called build from utils. 
The build_pipeline function wraps the pipeline configuration (cfg) in a Compose object, producing the chain of data transforms that preprocesses samples for model training in PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport signal\nimport os\nimport paddle\nfrom paddle.io import BatchSampler, DataLoader, DistributedBatchSampler\nfrom .pipelines.compose import Compose\nfrom .registry import DATASETS, PIPELINES, DATALOADERS, BATCH_SAMPLERS, SAMPLERS\nfrom ..utils import get_logger\nfrom ..utils.build_utils import build\nimport numpy as np\nlogger = get_logger(\"paddlevideo\")\ndef build_pipeline(cfg):\n \"\"\"Build pipeline.\n Args:\n cfg (dict): root config dict." + }, + { + "comment": "This code defines several builder functions for the data loading components. `build_dataset` builds the transform pipeline via `build_pipeline` and then constructs the dataset from the DATASETS registry; `build_sampler`, `build_batch_pipeline`, and `build_custom_dataloader` build the sampler, batch pipeline, and custom dataloader from their respective registries, and the signature of `build_dataloader` begins here as well.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py\":31-79", + "content": " \"\"\"\n if cfg == None:\n return\n return Compose(cfg)\ndef build_dataset(cfg):\n \"\"\"Build dataset.\n Args:\n cfg (dict): root config dict.\n Returns:\n dataset: dataset.\n \"\"\"\n # XXX: ugly code here!\n cfg_dataset, cfg_pipeline = cfg\n cfg_dataset.pipeline = build_pipeline(cfg_pipeline)\n dataset = build(cfg_dataset, DATASETS, key=\"format\")\n return dataset\ndef build_sampler(cfg):\n \"\"\"Build batch_sampler.\n Args:\n cfg (dict): root config dict.\n Returns:\n batch_sampler: batch_sampler.\n \"\"\"\n sampler = build(cfg, SAMPLERS)\n return sampler\ndef build_batch_pipeline(cfg):\n batch_pipeline = build(cfg, PIPELINES)\n return batch_pipeline\ndef build_custom_dataloader(cfg):\n custom_dataloader = build(cfg, DATALOADERS, key='dataloader')\n return custom_dataloader\ndef build_dataloader(dataset,\n batch_size,\n num_workers,\n places=None,\n shuffle=True,\n drop_last=True," + }, + { + "comment": "The code builds a Paddle Dataloader with optional custom sampler, shuffles data if necessary, and handles distributed batch sampling. 
It takes dataset, batch size, number of workers, and shuffle settings as input arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py\":80-105", + "content": " multigrid=False,\n collate_fn_cfg=None,\n **kwargs):\n \"\"\"Build Paddle Dataloader.\n XXX explain how the batch_sampler work!\n Args:\n dataset (paddle.dataset): A PaddlePaddle dataset object.\n batch_size (int): batch size on single card.\n num_worker (int): num_worker\n shuffle(bool): whether to shuffle the data at every epoch.\n \"\"\"\n if not kwargs.get('sampler'):\n batch_sampler = DistributedBatchSampler(dataset,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)\n else:\n sampler = build_sampler(kwargs['sampler'])\n batch_sampler = BatchSampler(dataset,\n sampler=sampler,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)" + }, + { + "comment": "This code defines a mix_collate_fn for handling batches of data in a specific way. It first builds a batch pipeline and applies it to the input batch. Then, it collates the batch so that each item is stacked horizontally (axis=0) into a new batch. This function is used as the collate_fn if the collate_fn_cfg is not None.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py\":106-133", + "content": " kwargs.update({'batch_sampler': batch_sampler})\n # NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix.\n # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to:\n # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose.\n def mix_collate_fn(batch):\n pipeline = build_batch_pipeline(collate_fn_cfg)\n batch = pipeline(batch)\n slots = []\n for items in batch:\n for i, item in enumerate(items):\n if len(slots) < len(items):\n slots.append([item])\n else:\n slots[i].append(item)\n return [np.stack(slot, axis=0) for slot in slots]\n # if collate_fn_cfg is not None:\n # ugly code here. collate_fn is mix op config\n # collate_fn = mix_collate_fn(collate_fn_cfg)\n data_loader = DataLoader(\n dataset,\n places=places,\n num_workers=num_workers,\n collate_fn=mix_collate_fn if collate_fn_cfg is not None else None," + }, + { + "comment": "This code is setting up signal handlers for SIGINT and SIGTERM signals. 
It retrieves the process ID (pid) and process group ID (pgid), logs a message, then sends a SIGKILL signal to all processes in the group upon receiving either of those signals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py\":134-150", + "content": " **kwargs)\n return data_loader\ndef term_mp(sig_num, frame):\n \"\"\" kill all child processes\n \"\"\"\n pid = os.getpid()\n pgid = os.getpgid(os.getpid())\n logger.info(\"main proc {} exit, kill process group \" \"{}\".format(pid, pgid))\n os.killpg(pgid, signal.SIGKILL)\n return\nsignal.signal(signal.SIGINT, term_mp)\nsignal.signal(signal.SIGTERM, term_mp)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/25b47995-09f5-4019-8d32-ecd9a15146e1.json b/docs/doc/25b47995-09f5-4019-8d32-ecd9a15146e1.json new file mode 100644 index 000000000..8b513e350 --- /dev/null +++ b/docs/doc/25b47995-09f5-4019-8d32-ecd9a15146e1.json @@ -0,0 +1,15 @@ +{ + "summary": "This code constructs a learning rate scheduler based on the 'OPTIMIZER' configuration provided, returns it with specified iterations, and handles custom cases such as converting 'learning_rate' to a custom object.", + "details": [ + { + "comment": "This code is building a learning rate scheduler according to the \"OPTIMIZER\" configuration provided in the cfg dictionary. The scheduler is based on the 'PiecewiseDecay' name, and has boundaries and values for adjusting the learning rate at specified iterations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/lr.py\":0-27", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom typing import Dict\nfrom paddle.optimizer.lr import LRScheduler\nfrom . import custom_lr\ndef build_lr(cfg: Dict, num_iters: int) -> LRScheduler:\n \"\"\"Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer.\n In configuration:\n learning_rate:\n name: 'PiecewiseDecay'\n boundaries: [20, 60]\n values: [0.00025, 0.000025, 0.0000025]" + }, + { + "comment": "This function takes a learning rate configuration and the number of iterations, and returns a learning rate scheduler. If the configuration includes a 'learning_rate' key with a dictionary value, it converts it to a custom learning rate object using the build_lr() function. It also handles cases where 'iter_step' is present in the configuration, replacing it with 'num_iters'. 
The returned scheduler is obtained from the 'custom_lr' module with the specified 'name'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/lr.py\":29-51", + "content": " Args:\n cfg (Dict): learning rate configuration.\n num_iters (int): The number of iterations that may be used when calculating the learning rate\n Returns:\n LRScheduler: learning rate scheduler.\n \"\"\"\n cfg_copy = cfg.copy()\n #when learning_rate is LRScheduler\n if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'],\n dict):\n cfg_copy['learning_rate'] = build_lr(\n cfg_copy['learning_rate'],\n num_iters) #not support only inner iter_step\n lr_name = cfg_copy.pop('name')\n if cfg_copy.get('iter_step'):\n cfg_copy['num_iters'] = num_iters\n cfg_copy.pop('iter_step')\n return getattr(custom_lr, lr_name)(**cfg_copy)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2685801b-a741-4770-9e5c-857dc1bf3b84.json b/docs/doc/2685801b-a741-4770-9e5c-857dc1bf3b84.json new file mode 100644 index 000000000..581ef7854 --- /dev/null +++ b/docs/doc/2685801b-a741-4770-9e5c-857dc1bf3b84.json @@ -0,0 +1,30 @@ +{ + "summary": "This code defines 3D head projection classes for PaddleVideo library, initializes a SlowFast head model with dropout regularization and adaptive average pooling, performs convolutional inference, applies softmax activation, averages when not training, and reshapes before returning.", + "details": [ + { + "comment": "This code is from the PaddleVideo library and defines a SlowFastHead class for ResNe(X)t 3D head. It performs a fully-connected projection during training and convolutional projection during testing, with different input sizes handled accordingly. The code includes import statements, registration using HEADS registry, and base class inheritance from BaseHead.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/slowfast_head.py\":0-29", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..registry import HEADS\nfrom .base import BaseHead\nimport paddle\nimport paddle.nn.functional as F\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass SlowFastHead(BaseHead):\n \"\"\"\n ResNe(X)t 3D head.\n This layer performs a fully-connected projection during training, when the\n input size is 1x1x1. It performs a convolutional projection during testing\n when the input size is larger than 1x1x1. If the inputs are from multiple" + }, + { + "comment": "The code defines a class for SlowFast_Head, which takes different pathways as input and concatenates the inputs after pooling. It has various parameters such as width_per_group, alpha, beta, etc. The ResNetBasicHead takes p pathways as input where p can be in the range of 1 to infinity. 
It has arguments for dim_in (list), num_classes (int), pool_size (list), and dropout_rate (float).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/slowfast_head.py\":30-55", + "content": " different pathways, the inputs will be concatenated after pooling.\n \"\"\"\n def __init__(self,\n width_per_group,\n alpha,\n beta,\n num_classes,\n num_frames,\n crop_size,\n dropout_rate,\n pool_size_ratio=[[1, 1, 1], [1, 1, 1]],\n loss_cfg=dict(name='CrossEntropyLoss'),\n multigrid_short=False,\n **kwargs):\n \"\"\"\n ResNetBasicHead takes p pathways as input where p in [1, infty].\n Args:\n dim_in (list): the list of channel dimensions of the p inputs to the\n ResNetHead.\n num_classes (int): the channel dimensions of the p outputs to the\n ResNetHead.\n pool_size (list): the list of kernel sizes of p spatial temporal\n poolings, temporal pool kernel size, spatial pool kernel size,\n spatial pool kernel size in order.\n dropout_rate (float): dropout rate. If equal to 0.0, perform no" + }, + { + "comment": "This code is initializing a SlowFast head model with specified parameters such as multigrid_short, width_per_group, alpha, beta, num_classes, num_frames, crop_size, and dropout_rate. It also sets the dimension input (dim_in) based on these parameters, and determines the pool size accordingly based on whether multigrid_short is True or False.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/slowfast_head.py\":56-82", + "content": " dropout.\n \"\"\"\n super().__init__(num_classes, loss_cfg, **kwargs)\n self.multigrid_short = multigrid_short\n self.width_per_group = width_per_group\n self.alpha = alpha\n self.beta = beta\n self.num_classes = num_classes\n self.num_frames = num_frames\n self.crop_size = crop_size\n self.dropout_rate = dropout_rate\n self.pool_size_ratio = pool_size_ratio\n self.dim_in = [\n self.width_per_group * 32,\n self.width_per_group * 32 // self.beta,\n ]\n self.pool_size = [None, None] if self.multigrid_short else [\n [\n self.num_frames // self.alpha // self.pool_size_ratio[0][0],\n self.crop_size // 32 // self.pool_size_ratio[0][1],\n self.crop_size // 32 // self.pool_size_ratio[0][2],\n ],\n [\n self.num_frames // self.pool_size_ratio[1][0],\n self.crop_size // 32 // self.pool_size_ratio[1][1],\n self.crop_size // 32 // self.pool_size_ratio[1][2]," + }, + { + "comment": "This code initializes a SlowFast head model. It defines the number of pathways, applies dropout regularization, and initializes weights for linear projection. The forward method expects inputs with the same number of pathways as defined in the model. 
It then performs adaptive average pooling on each input pathway separately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/slowfast_head.py\":83-112", + "content": " ],\n ]\n assert (len({len(self.pool_size), len(self.dim_in)\n }) == 1), \"pathway dimensions are not consistent.\"\n self.num_pathways = len(self.pool_size)\n self.dropout = paddle.nn.Dropout(p=self.dropout_rate)\n self.projection = paddle.nn.Linear(\n in_features=sum(self.dim_in),\n out_features=self.num_classes,\n )\n def init_weights(self):\n weight_init_(self.projection,\n \"Normal\",\n bias_value=0.0,\n mean=0.0,\n std=0.01)\n def forward(self, inputs):\n assert (len(inputs) == self.num_pathways\n ), \"Input tensor does not contain {} pathway\".format(\n self.num_pathways)\n pool_out = []\n for pathway in range(self.num_pathways):\n if self.pool_size[pathway] is None:\n tmp_out = F.adaptive_avg_pool3d(x=inputs[pathway],\n output_size=(1, 1, 1)," + }, + { + "comment": "This code performs pooling and dropout operations on input tensors, followed by projection and fully convolutional inference. It also applies softmax activation and averaging when not in training mode. The resulting tensor is reshaped before returning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/slowfast_head.py\":113-136", + "content": " data_format=\"NCDHW\")\n else:\n tmp_out = F.avg_pool3d(x=inputs[pathway],\n kernel_size=self.pool_size[pathway],\n stride=1,\n data_format=\"NCDHW\")\n pool_out.append(tmp_out)\n x = paddle.concat(x=pool_out, axis=1)\n x = paddle.transpose(x=x, perm=(0, 2, 3, 4, 1))\n # Perform dropout.\n if self.dropout_rate > 0.0:\n x = self.dropout(x)\n x = self.projection(x)\n # Performs fully convlutional inference.\n if not self.training: # attr of base class\n x = F.softmax(x, axis=4)\n x = paddle.mean(x, axis=[1, 2, 3])\n x = paddle.reshape(x, shape=(x.shape[0], -1))\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2789548c-6052-454a-b1c9-a17b3c27bb6c.json b/docs/doc/2789548c-6052-454a-b1c9-a17b3c27bb6c.json new file mode 100644 index 000000000..c867abb18 --- /dev/null +++ b/docs/doc/2789548c-6052-454a-b1c9-a17b3c27bb6c.json @@ -0,0 +1,20 @@ +{ + "summary": "The \"SamplerUCF24\" class samples frames from videos using parameters like frame count and interval, utilizes PIL library, initializes pipeline, generates frame indices, returns sampled frames.", + "details": [ + { + "comment": "This code defines a class \"SamplerUCF24\" for sampling frames in videos, taking parameters such as num_frames and frame_interval. It uses PIL instead of OpenCV to read images and returns the index of sampled frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ucf24.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport random\nfrom PIL import Image\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass SamplerUCF24(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_frames(int): The amount of frames used in a video\n frame_interval(int): Sampling rate\n valid_mode(bool): True or False.\n Returns:\n frames_idx: the index of sampled #frames." + }, + { + "comment": "This code defines a pipeline for loading and creating clips from video files. The `__init__` method initializes the number of frames, frame interval (randomly determined if valid mode is False), and valid mode flag. The `_get` method retrieves images in order, converts them to RGB, and appends them to a list. The `_make_clip` method generates a set of frame indices that create a looped clip. The pipeline is called with the results as input, extracting the image folder and filename for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ucf24.py\":33-64", + "content": " \"\"\"\n def __init__(self,\n num_frames=16,\n frame_interval=1,\n valid_mode=False):\n self.num_frames = num_frames\n self.frame_interval = frame_interval if valid_mode else random.randint(1, 2)\n self.valid_mode = valid_mode\n def _get(self, frames_idxs, img_folder, results):\n imgs = []\n for idx in frames_idxs:\n img = Image.open(\n os.path.join(img_folder, '{:05d}.jpg'.format(idx))).convert('RGB')\n imgs.append(img)\n results['imgs'] = imgs\n return results\n def _make_clip(self, im_ind, max_num):\n frame_idxs = []\n for i in reversed(range(self.num_frames)):\n # make it as a loop\n i_temp = im_ind - i * self.frame_interval\n if i_temp < 1:\n i_temp = 1\n elif i_temp > max_num:\n i_temp = max_num\n frame_idxs.append(i_temp)\n return frame_idxs\n def __call__(self, results):\n img_folder, key_frame = os.path.split(results['filename'])" + }, + { + "comment": "This code retrieves the number of frames in a folder, assigns a key frame index based on the input, generates frame indices for a video clip, and returns the requested frames from their folder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ucf24.py\":65-68", + "content": " frame_len = len(os.listdir(img_folder))\n key_idx = int(key_frame[0:5])\n frame_idxs = self._make_clip(key_idx, frame_len)\n return self._get(frame_idxs, img_folder, results)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/28804844-7817-4113-9692-e83d3de8eb8e.json b/docs/doc/28804844-7817-4113-9692-e83d3de8eb8e.json new file mode 100644 index 000000000..748e80d61 --- /dev/null +++ b/docs/doc/28804844-7817-4113-9692-e83d3de8eb8e.json @@ -0,0 +1,20 @@ +{ + "summary": "Summary: A comprehensive list of notable Spatio-Temporal Action Detection papers and authors from 2015-2017, covering major conferences like ICCV, BMVC, ECCV, and arXiv.", + "details": [ + { + "comment": "The 
code provides a list of useful Spatio-Temporal Action Detection papers and their corresponding authors, year of publication, and conference or journal.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Spatio-Temporal Action Detection Papers\":0-12", + "content": "Usefull Spatio-Temporal Action Detection Papers.\n A Better Baseline for AVA - R. Girdhar et al., ActivityNet Workshop, CVPR2018.\n Real-Time End-to-End Action Detection with Two-Stream Networks - A. El-Nouby and G. Taylor, arXiv2018.\n Human Action Localization with Sparse Spatial Supervision - P. Weinzaepfel et al., arXiv2017.\n Unsupervised Action Discovery and Localization in Videos - K. Soomro and M. Shah, ICCV2017.\n Spatial-Aware Object Embeddings for Zero-Shot Localization and Classification of Actions - P. Mettes and C. G. M. Snoek, ICCV2017.\n Action Tubelet Detector for Spatio-Temporal Action Localization - V. Kalogeiton et al, ICCV2017. \n Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos - R. Hou et al, ICCV2017. \n Chained Multi-stream Networks Exploiting Pose, Motion, and Appearance for Action Classification and Detection - M. Zolfaghari et al, ICCV2017. \n TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal - H. Zhu et al., ICCV2017." + }, + { + "comment": "This code provides a list of notable research papers related to action detection and localization in videos, published between 2015 and 2017. The papers are from various conferences such as ICCV, BMVC, ECCV, and arXiv, demonstrating the advancements made in this field during that period.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Spatio-Temporal Action Detection Papers\":13-23", + "content": " Online Real time Multiple Spatiotemporal Action Localisation and Prediction - G. Singh et al, ICCV2017. \n AMTnet: Action-Micro-Tube regression by end-to-end trainable deep architecture - S. Saha et al, ICCV2017.\n Am I Done? Predicting Action Progress in Videos - F. Becattini et al, BMVC2017.\n Generic Tubelet Proposals for Action Localization - J. He et al, arXiv2017.\n Incremental Tube Construction for Human Action Detection - H. S. Behl et al, arXiv2017.\n Multi-region two-stream R-CNN for action detection - X. Peng and C. Schmid. ECCV2016. \n Spot On: Action Localization from Pointly-Supervised Proposals - P. Mettes et al, ECCV2016.\n Deep Learning for Detecting Multiple Space-Time Action Tubes in Videos - S. Saha et al, BMVC2016. \n Learning to track for spatio-temporal action localization - P. Weinzaepfel et al. ICCV2015.\n Action detection by implicit intentional motion clustering - W. Chen and J. Corso, ICCV2015.\n Finding Action Tubes - G. Gkioxari and J. Malik CVPR2015. " + }, + { + "comment": "List of papers on action localization and detection in videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Spatio-Temporal Action Detection Papers\":24-29", + "content": " APT: Action localization proposals from dense trajectories - J. Gemert et al, BMVC2015. \n Spatio-Temporal Object Detection Proposals - D. Oneata et al, ECCV2014.\n Action localization with tubelets from motion - M. Jain et al, CVPR2014.\n Spatiotemporal deformable part models for action detection - Y. Tian et al, CVPR2013. \n Action localization in videos through context walk - K. Soomro et al, ICCV2015.\n Fast Action Proposals for Human Action Detection and Search - G. Yu and J. Yuan, CVPR2015. 
" + } + ] +} \ No newline at end of file diff --git a/docs/doc/288152c2-ba5b-46b3-96a6-dc101461e129.json b/docs/doc/288152c2-ba5b-46b3-96a6-dc101461e129.json new file mode 100644 index 000000000..9315c8727 --- /dev/null +++ b/docs/doc/288152c2-ba5b-46b3-96a6-dc101461e129.json @@ -0,0 +1,40 @@ +{ + "summary": "This code records metrics, calculates means, logs batch info and epoch progress in training processes with colored formatting for visibility. It uses PaddleVideo framework, AverageMeter and OrderedDict for efficient logging.", + "details": [ + { + "comment": "Code snippet imports necessary libraries and defines functions for building a record, logging batches and epochs. It also sets up logger for the PaddleVideo framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport datetime\nfrom collections import OrderedDict\nimport paddle\nfrom .logger import coloring, get_logger\nlogger = get_logger(\"paddlevideo\")\n__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch']\ndef build_record(cfg):\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework" + }, + { + "comment": "Code appends specific metrics to the record list based on the framework specified in cfg. Frameworks include 'PaddleVideo', 'Recognizer', and 'FastRCNN'. Metrics are averaged using AverageMeter and include 'hit_at_one', 'perr', 'gap', 'top1', 'top5', recall@thr=0.5, prec@thr=0.5, recall@top3, prec@top3, recall@top5, prec@top5, and mAP@0.5IOU.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py\":32-47", + "content": " record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))\n record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n elif 'Recognizer' in cfg.framework:\n record_list.append((\"top1\", AverageMeter(\"top1\", '.5f')))\n record_list.append((\"top5\", AverageMeter(\"top5\", '.5f')))\n elif 'FastRCNN' in cfg.framework:\n record_list.append(\n (\"recall@thr=0.5\", AverageMeter(\"recall@thr=0.5\", '.5f')))\n record_list.append((\"prec@thr=0.5\", AverageMeter(\"prec@thr=0.5\",\n '.5f')))\n record_list.append((\"recall@top3\", AverageMeter(\"recall@top3\", '.5f')))\n record_list.append((\"prec@top3\", AverageMeter(\"prec@top3\", '.5f')))\n record_list.append((\"recall@top5\", AverageMeter(\"recall@top5\", '.5f')))\n record_list.append((\"prec@top5\", AverageMeter(\"prec@top5\", '.5f')))\n record_list.append((\"mAP@0.5IOU\", AverageMeter(\"mAP@0.5IOU\", '.5f')))" + }, + { + "comment": "The code is conditionally adding metrics to the record list based on the value of 'cfg.framework'. 
It handles three different cases: 'DepthEstimator', 'MSTCN' or 'ASRF', and 'YOWOLocalizer'. For 'DepthEstimator', it adds 9 metrics, for 'MSTCN' or 'ASRF', it adds one metric, and for 'YOWOLocalizer', it adds two metrics. Each metric is associated with an AverageMeter object that keeps track of its mean value over time.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py\":48-64", + "content": " elif 'DepthEstimator' in cfg.framework:\n record_list.append((\"abs_rel\", AverageMeter(\"abs_rel\", '.5f')))\n record_list.append((\"sq_rel\", AverageMeter(\"sq_rel\", '.5f')))\n record_list.append((\"rmse\", AverageMeter(\"rmse\", '.5f')))\n record_list.append((\"rmse_log\", AverageMeter(\"rmse_log\", '.5f')))\n record_list.append((\"a1\", AverageMeter(\"a1\", '.5f')))\n record_list.append((\"a2\", AverageMeter(\"a2\", '.5f')))\n record_list.append((\"a3\", AverageMeter(\"a3\", '.5f')))\n record_list.append((\"losses_day\", AverageMeter(\"losses_day\", '.5f')))\n record_list.append((\"losses_night\", AverageMeter(\"losses_night\",\n '.5f')))\n elif 'MSTCN' in cfg.framework or 'ASRF' in cfg.framework:\n record_list.append((\"F1@0.50\", AverageMeter(\"F1@0.50\", '.5f')))\n elif 'YOWOLocalizer' in cfg.framework:\n record_list.append((\"nCorrect\", AverageMeter('nCorrect', '.1f')))\n record_list.append((\"fscore\", AverageMeter(\"fscore\", '.5f')))" + }, + { + "comment": "This function creates a record dictionary containing two AverageMeter objects, one for batch time and another for reader time. It then converts the list to an OrderedDict and returns it. The AverageMeter class calculates and stores the average and current values of a given metric, allowing easy tracking of performance metrics during program execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py\":66-104", + "content": " record_list.append((\"batch_time\", AverageMeter('batch_cost', '.5f')))\n record_list.append((\"reader_time\", AverageMeter('reader_cost', '.5f')))\n record_list = OrderedDict(record_list)\n return record_list\nclass AverageMeter(object):\n \"\"\"\n Computes and stores the average and current value\n \"\"\"\n def __init__(self, name='', fmt='f', need_avg=True):\n self.name = name\n self.fmt = fmt\n self.need_avg = need_avg\n self.reset()\n def reset(self):\n \"\"\" reset \"\"\"\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n \"\"\" update \"\"\"\n if isinstance(val, paddle.Tensor):\n val = float(val)\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count\n @property\n def total(self):\n return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self)\n @property\n def total_minute(self):\n return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60," + }, + { + "comment": "This code defines a class and functions for recording metrics, calculating means, and logging batch information. The `log_batch` function records the time taken for each batch, adds other metric values, and logs the total epoch, current epoch, mode, and step. 
It also calculates the remaining time for the current operation if provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py\":105-135", + "content": " self=self)\n @property\n def mean(self):\n return '{self.name}_avg: {self.avg:{self.fmt}}'.format(\n self=self) if self.need_avg else ''\n @property\n def value(self):\n return '{self.name}: {self.val:{self.fmt}}'.format(self=self)\ndef log_batch(metric_list,\n batch_id,\n epoch_id,\n total_epoch,\n mode,\n ips,\n eta_sec: int = None):\n batch_cost = str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = str(metric_list['reader_time'].value) + ' sec,'\n metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].value)\n metric_str = ' '.join([str(v) for v in metric_values])\n epoch_str = \"epoch:[{:>3d}/{:<3d}]\".format(epoch_id, total_epoch)\n step_str = \"{:s} step:{:<4d}\".format(mode, batch_id)\n if eta_sec is not None:\n eta_str = \"eta: {:s}\".format(" + }, + { + "comment": "This code logs the progress of an epoch in a training process. It formats the logged information with colors for better visibility. The logger displays the current step, metrics, time taken, batch cost and reader cost, estimated time remaining (ETA), and maximum memory reserved and allocated on CUDA devices if available.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py\":136-154", + "content": " str(datetime.timedelta(seconds=int(eta_sec))))\n else:\n eta_str = ''\n max_mem_reserved_str = \"\"\n max_mem_allocated_str = \"\"\n if paddle.device.is_compiled_with_cuda():\n max_mem_reserved_str = f\"max_mem_reserved: {format(paddle.device.cuda.max_memory_reserved() / (1024 ** 2), '.2f')} MB\"\n max_mem_allocated_str = f\"max_mem_allocated: {format(paddle.device.cuda.max_memory_allocated() / (1024 ** 2), '.2f')} MB\"\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {} {:s}, {} {}\".format(\n coloring(epoch_str, \"HEADER\") if batch_id == 0 else epoch_str,\n coloring(step_str, \"PURPLE\"), coloring(metric_str, 'OKGREEN'),\n coloring(batch_cost, \"OKGREEN\"), coloring(reader_cost, 'OKGREEN'), ips,\n eta_str, max_mem_reserved_str, max_mem_allocated_str))\ndef log_epoch(metric_list, epoch, mode, ips):\n batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,'\n batch_sum = str(metric_list['batch_time'].total) + ' sec,'" + }, + { + "comment": "This code calculates the mean of metrics except 'batch_time' and 'reader_time', then joins them into a string. It formats an info message with RED for \"END epoch\", PURPLE for mode, GREEN for metric values, batch cost, reader cost, and batch sum, as well as ips (inferences per second). 
The logger outputs this formatted message.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/record.py\":156-167", + "content": " metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].mean)\n metric_str = ' '.join([str(v) for v in metric_values])\n end_epoch_str = \"END epoch:{:<3d}\".format(epoch)\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {:s} {}\".format(\n coloring(end_epoch_str, \"RED\"), coloring(mode, \"PURPLE\"),\n coloring(metric_str, \"OKGREEN\"), coloring(batch_cost, \"OKGREEN\"),\n coloring(reader_cost, \"OKGREEN\"), coloring(batch_sum, \"OKGREEN\"), ips))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/28eff272-aa18-4b97-8e97-b84c16992ace.json b/docs/doc/28eff272-aa18-4b97-8e97-b84c16992ace.json new file mode 100644 index 000000000..2586a0279 --- /dev/null +++ b/docs/doc/28eff272-aa18-4b97-8e97-b84c16992ace.json @@ -0,0 +1,25 @@ +{ + "summary": "The code defines a Compose class for image processing steps and functions to preprocess video frames, returning input/output variables. The get_preprocess_func function selects the correct preprocessing function based on the model name. Invalid names raise ValueError.", + "details": [ + { + "comment": "This code imports necessary libraries and defines the VALID_MODELS variable. It then creates a Compose class that takes in a list of transforms, allowing for composition of multiple image processing steps to be applied sequentially.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/preprocess_ops.py\":0-33", + "content": "import os\nimport sys\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, \"../../\")))\nfrom paddlevideo.loader.pipelines import (CenterCrop, Image2Array,\n Normalization, Sampler, Scale,\n VideoDecoder, TenCrop)\nimport numpy as np\nfrom typing import Dict, Tuple, List, Callable\nVALID_MODELS = [\"PPTSM\", \"PPTSN\"]\nimport os\nimport sys\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, \"../../\")))\nfrom paddlevideo.loader.pipelines import (CenterCrop, Image2Array,\n Normalization, Sampler, Scale,\n VideoDecoder, TenCrop)\nimport numpy as np\nfrom typing import Dict, Tuple, List, Callable\nVALID_MODELS = [\"PPTSM\", \"PPTSN\"]\nclass Compose:\n def __init__(self, transforms):\n self.transforms = transforms\n def __call__(self, img):\n for t in self.transforms:" + }, + { + "comment": "The code is defining a function `preprocess_PPTSM` that takes a video path as input, and applies several image preprocessing steps before returning the feed and fetch data. These steps include decoding the video frames, sampling, scaling, cropping, converting to array format, and normalization using specific mean and standard deviation values. 
The resulting processed data is stored in the `results` dictionary, which contains the images and metadata.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/preprocess_ops.py\":34-75", + "content": " img = t(img)\n return img\ndef np_softmax(x: np.ndarray, axis: int = 0) -> np.ndarray:\n \"\"\"softmax function\n Args:\n x (np.ndarray): logits\n axis (int): axis\n Returns:\n np.ndarray: probs\n \"\"\"\n x -= np.max(x, axis=axis, keepdims=True)\n x = np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)\n return x\ndef preprocess_PPTSM(video_path: str) -> Tuple[Dict[str, np.ndarray], List]:\n \"\"\"preprocess\n Args:\n video_path (str): input video path\n Returns:\n Tuple[Dict[str, np.ndarray], List]: feed and fetch\n \"\"\"\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n seq = Compose([\n VideoDecoder(),\n Sampler(8, 1, valid_mode=True),\n Scale(256),\n CenterCrop(224),\n Image2Array(),\n Normalization(img_mean, img_std)\n ])\n results = {\"filename\": video_path}\n results = seq(results)\n tmp_inp = np.expand_dims(results[\"imgs\"], axis=0) # [b,t,c,h,w]\n tmp_inp = np.expand_dims(tmp_inp, axis=0) # [1,b,t,c,h,w]" + }, + { + "comment": "The function preprocess_PPTSN takes in a video path, applies a series of image processing steps to the video frames, and returns feed and fetch variables for input and output respectively. The get_preprocess_func function returns a preprocessing function based on the given model name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/preprocess_ops.py\":76-110", + "content": " feed = {\"data_batch_0\": tmp_inp}\n fetch = [\"outputs\"]\n return feed, fetch\ndef preprocess_PPTSN(video_path: str) -> Tuple[Dict[str, np.ndarray], List]:\n \"\"\"preprocess\n Args:\n video_path (str): input video path\n Returns:\n Tuple[Dict[str, np.ndarray], List]: feed and fetch\n \"\"\"\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n seq = Compose([\n VideoDecoder(),\n Sampler(25, 1, valid_mode=True, select_left=True),\n Scale(256, fixed_ratio=True, do_round=True, backend='cv2'),\n TenCrop(224),\n Image2Array(),\n Normalization(img_mean, img_std)\n ])\n results = {\"filename\": video_path}\n results = seq(results)\n tmp_inp = np.expand_dims(results[\"imgs\"], axis=0) # [b,t,c,h,w]\n tmp_inp = np.expand_dims(tmp_inp, axis=0) # [1,b,t,c,h,w]\n feed = {\"data_batch_0\": tmp_inp}\n fetch = [\"outputs\"]\n return feed, fetch\ndef get_preprocess_func(model_name: str) -> Callable:\n \"\"\"get preprocess function by model_name" + }, + { + "comment": "This function takes a model name as input and returns the corresponding preprocess function based on the conditionals provided. 
If the model name is \"PPTSM\", it will return the preprocess_PPTSM function, if the model name is \"PPTSN\" it will return preprocess_PPTSN, otherwise it raises a ValueError with an error message stating that the model name must be in VALID_MODELS.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/preprocess_ops.py\":112-125", + "content": " Args:\n model_name (str): model's name, must in `VALID_MODELS`\n Returns:\n Callable: preprocess function corresponding to model name\n \"\"\"\n if model_name == \"PPTSM\":\n return preprocess_PPTSM\n elif model_name == \"PPTSN\":\n return preprocess_PPTSN\n else:\n raise ValueError(\n f\"model_name must in {VALID_MODELS}, but got model_name\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/294e5c0a-0354-42fc-a66a-9bd9614a892d.json b/docs/doc/294e5c0a-0354-42fc-a66a-9bd9614a892d.json new file mode 100644 index 000000000..6fbd4df64 --- /dev/null +++ b/docs/doc/294e5c0a-0354-42fc-a66a-9bd9614a892d.json @@ -0,0 +1,130 @@ +{ + "summary": "This code initializes a DAVIS 2017 dataset class for loading and preprocessing, creates a custom dataloader, prepares input for Ma-Net model, and outputs JSON files with sequence data.", + "details": [ + { + "comment": "This code snippet defines a DAVIS2017_Test_Manager class for loading and managing test data from the DAVIS 2017 dataset. It accepts parameters such as split, root directory, transformations to apply, and sequence name. The get_image() method retrieves an image from the specified directory based on the index, reads it into a numpy array, and converts it into float32 data type. This class can be used for loading test images in the DAVIS 2017 dataset for further processing or analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":0-39", + "content": "from __future__ import division\nimport json\nimport os\nimport shutil\nimport numpy as np\nimport paddle, cv2\nfrom random import choice\nfrom paddle.io import Dataset\nimport json\nfrom PIL import Image\nfrom davisinteractive.utils.scribbles import scribbles2mask, annotated_frames\nimport sys\nsys.path.append(\"..\")\nfrom config import cfg\nimport time\nclass DAVIS2017_Test_Manager():\n def __init__(self,\n split='val',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False,\n seq_name=None):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_name = seq_name\n def get_image(self, idx):\n frame_name = str(idx)\n while len(frame_name) != 5:\n frame_name = '0' + frame_name\n imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name), frame_name + '.jpg')\n img = cv2.imread(imgpath)\n img = np.array(img, dtype=np.float32)" + }, + { + "comment": "This code is initializing a dataset for DAVIS2017, which contains images and their features. It loads the image list from a specified directory, and applies optional transformations to the samples before returning them. 
The dataset supports different splits (e.g., training or validation) and allows for specifying an optional sequence name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":40-72", + "content": " sample = {'img': img}\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\nclass DAVIS2017_Feature_Extract(Dataset):\n def __init__(self,\n split='val',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False,\n seq_name=None):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_name = seq_name\n self.img_list = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(seq_name))))\n def __len__(self):\n return len(self.img_list)\n def __getitem__(self, idx):\n img = self.img_list[idx]\n imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name), img)\n current_img = cv2.imread(imgpath)\n current_img = np.array(current_img, dtype=np.float32)" + }, + { + "comment": "The code defines a DAVIS2017_VOS_Test dataset class which loads data from the DAVIS 2017 dataset for semantic segmentation tasks. It takes various parameters such as split, root directory, transformation function, if RGB images are required, result root directory, and sequence name. It reads a list of sequences from a file and returns an image sample along with its metadata (sequence name, height, width, and image path).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":73-108", + "content": " h, w, _ = current_img.shape\n sample = {'img1': current_img}\n sample['meta'] = {\n 'seq_name': self.seq_name,\n 'h_w': (h, w),\n 'img_path': imgpath\n }\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\nclass DAVIS2017_VOS_Test(Dataset):\n \"\"\"\n \"\"\"\n def __init__(self,\n split='val',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False,\n result_root=None,\n seq_name=None):\n self.split = split\n self.db_root_dir = root\n self.result_root = result_root\n self.rgb = rgb\n self.transform = transform\n self.seq_name = seq_name\n self.seq_list_file = os.path.join(\n self.db_root_dir, 'ImageSets', '2017',\n '_'.join(self.split) + '_instances.txt')\n self.seqs = []\n for splt in self.split:\n with open(\n os.path.join(self.db_root_dir, 'ImageSets', '2017'," + }, + { + "comment": "The code reads sequences from a file and extends the existing sequence list. It then checks if preprocessing is required and performs it if necessary. The code asserts that the sequence name exists in the dictionary of sequences. Next, it retrieves image names and label names, creating lists of image paths and label paths respectively. 
Finally, it ensures that a specific file exists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":109-134", + "content": " self.split + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n self.seqs.extend(seqs_tmp)\n if not self._check_preprocess():\n self._preprocess()\n assert self.seq_name in self.seq_dict.keys(\n ), '{} not in {} set.'.format(self.seq_name, '_'.join(self.split))\n names_img = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(seq_name))))\n img_list = list(\n map(lambda x: os.path.join('JPEGImages/480p/', str(seq_name), x),\n names_img))\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/',\n str(seq_name))))\n labels = list(\n map(lambda x: os.path.join('Annotations/480p/', str(seq_name), x),\n name_label))\n if not os.path.isfile(" + }, + { + "comment": "This code creates a data loader for the DAVIS 2017 dataset. It checks if the result directory exists, and if not, it creates it and copies the label file to the new directory. If the directory already exists, it simply copies the label file. The function then sets the first image and its label, as well as the remaining images in the list. Lastly, it defines the length of the dataset and a method for getting items from the dataset at specific indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":135-160", + "content": " os.path.join(self.result_root, seq_name, name_label[0])):\n if not os.path.exists(os.path.join(self.result_root, seq_name)):\n os.makedirs(os.path.join(self.result_root, seq_name))\n shutil.copy(\n os.path.join(self.db_root_dir, labels[0]),\n os.path.join(self.result_root, seq_name, name_label[0]))\n else:\n shutil.copy(\n os.path.join(self.db_root_dir, labels[0]),\n os.path.join(self.result_root, seq_name, name_label[0]))\n self.first_img = names_img[0]\n self.first_label = name_label[0]\n self.img_list = names_img[1:]\n def __len__(self):\n return len(self.img_list)\n def __getitem__(self, idx):\n img = self.img_list[idx]\n imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name), img)\n num_frame = int(img.split('.')[0])\n ref_img = os.path.join(self.db_root_dir, 'JPEGImages/480p/'," + }, + { + "comment": "The code snippet is responsible for loading images and labels from a specific path. It handles image path formatting, ensures all frames have 5 digits, reads images using cv2, converts them to numpy arrays with float32 dtype, and retrieves reference labels by opening and converting the label file to uint8 dtype.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":161-185", + "content": " str(self.seq_name), self.first_img)\n prev_frame = num_frame - 1\n prev_frame = str(prev_frame)\n while len(prev_frame) != 5:\n prev_frame = '0' + prev_frame\n prev_img = os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n str(self.seq_name),\n prev_frame + '.' 
+ img.split('.')[-1])\n current_img = cv2.imread(imgpath)\n current_img = np.array(current_img, dtype=np.float32)\n ref_img = cv2.imread(ref_img)\n ref_img = np.array(ref_img, dtype=np.float32)\n prev_img = cv2.imread(prev_img)\n prev_img = np.array(prev_img, dtype=np.float32)\n ref_label = os.path.join(self.db_root_dir, 'Annotations/480p/',\n str(self.seq_name), self.first_label)\n ref_label = Image.open(ref_label)\n ref_label = np.array(ref_label, dtype=np.uint8)\n prev_label = os.path.join(\n self.result_root, str(self.seq_name)," + }, + { + "comment": "This code appears to be part of a data loader for a video object detection task. It loads frames and labels from a specific dataset (DAVIS 2017) and creates samples for each frame. The _check_preprocess function checks if the sequence list file exists, and if so, it loads the dictionary of sequences. The _preprocess function initializes an empty dictionary for the sequence dictionary and iterates over the specified sequences to process them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":186-218", + "content": " prev_frame + '.' + self.first_label.split('.')[-1])\n prev_label = Image.open(prev_label)\n prev_label = np.array(prev_label, dtype=np.uint8)\n obj_num = self.seq_dict[self.seq_name][-1]\n sample = {\n 'ref_img': ref_img,\n 'prev_img': prev_img,\n 'current_img': current_img,\n 'ref_label': ref_label,\n 'prev_label': prev_label\n }\n sample['meta'] = {\n 'seq_name': self.seq_name,\n 'frame_num': num_frame,\n 'obj_num': obj_num,\n 'current_name': img\n }\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\n def _check_preprocess(self):\n _seq_list_file = self.seq_list_file\n if not os.path.isfile(_seq_list_file):\n return False\n else:\n self.seq_dict = json.load(open(self.seq_list_file, 'r'))\n return True\n def _preprocess(self):\n self.seq_dict = {}\n for seq in self.seqs:" + }, + { + "comment": "This code reads object masks from DAVIS 2017 dataset, obtains the number of objects, and creates a dictionary containing sequence names as keys and their corresponding unique object IDs as values. The dictionary is then saved to a file in JSON format for further use in the DAVIS2017_VOS_Train class, which serves as the training dataset for the DAVIS 2017 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":219-243", + "content": " # Read object masks and get number of objects\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/', seq)))\n label_path = os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq, name_label[0])\n _mask = np.array(Image.open(label_path))\n _mask_ids = np.unique(_mask)\n n_obj = _mask_ids[-1]\n self.seq_dict[seq] = list(range(1, n_obj + 1))\n with open(self.seq_list_file, 'w') as outfile:\n outfile.write('{{\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]])))\n for ii in range(1, len(self.seqs)):\n outfile.write(',\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]])))\n outfile.write('\\n}\\n')\n print('Preprocessing finished')\nclass DAVIS2017_VOS_Train(Dataset):\n \"\"\"DAVIS2017 dataset for training" + }, + { + "comment": "This code initializes a class for loading and preprocessing data from the DAVIS 2017 dataset. It takes parameters such as split, root directory, transformation functions, and RGB mode. 
The code reads sequence lists and checks if pre-processing is necessary before creating a list of samples to be loaded.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":245-272", + "content": " Return: imgs: N*2*3*H*W,label: N*2*1*H*W, seq-name: N, frame_num:N\n \"\"\"\n def __init__(self,\n split='train',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_list_file = os.path.join(\n self.db_root_dir, 'ImageSets', '2017',\n '_'.join(self.split) + '_instances.txt')\n self.seqs = []\n for splt in self.split:\n with open(\n os.path.join(self.db_root_dir, 'ImageSets', '2017',\n self.split + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n self.seqs.extend(seqs_tmp)\n self.imglistdic = {}\n if not self._check_preprocess():\n self._preprocess()\n self.sample_list = []\n for seq_name in self.seqs:\n images = np.sort(" + }, + { + "comment": "The code defines a class for loading data from the DAVIS 2017 dataset, extracting image and annotation files based on the given sequence name. It also provides methods to get the length of the dataset and retrieve specific items by index. The dataset is organized into 'JPEGImages/480p' and 'Annotations/480p' directories with corresponding sequence names. It selects a random previous image from the list, increments its frame number by 1 to get the next image, and returns both the image and annotation files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":273-298", + "content": " os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n seq_name.strip())))\n images_path = list(\n map(\n lambda x: os.path.join('JPEGImages/480p/', seq_name.strip(),\n x), images))\n lab = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq_name.strip())))\n lab_path = list(\n map(\n lambda x: os.path.join('Annotations/480p/', seq_name.strip(\n ), x), lab))\n self.imglistdic[seq_name] = (images, lab)\n def __len__(self):\n return len(self.seqs)\n def __getitem__(self, idx):\n seqname = self.seqs[idx]\n imagelist, lablist = self.imglistdic[seqname]\n prev_img = np.random.choice(imagelist[:-1], 1)\n prev_img = prev_img[0]\n frame_num = int(prev_img.split('.')[0]) + 1" + }, + { + "comment": "Processing two adjacent frames and labels: Reads next image, prepares previous image and their corresponding labels from file paths.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":299-319", + "content": " next_frame = str(frame_num)\n while len(next_frame) != 5:\n next_frame = '0' + next_frame\n ###############################Processing two adjacent frames and labels\n img2path = os.path.join('JPEGImages/480p/', seqname,\n next_frame + '.' + prev_img.split('.')[-1])\n img2 = cv2.imread(os.path.join(self.db_root_dir, img2path))\n img2 = np.array(img2, dtype=np.float32)\n imgpath = os.path.join('JPEGImages/480p/', seqname, prev_img)\n img1 = cv2.imread(os.path.join(self.db_root_dir, imgpath))\n img1 = np.array(img1, dtype=np.float32)\n ###############\n labelpath = os.path.join(\n 'Annotations/480p/', seqname,\n prev_img.split('.')[0] + '.' + lablist[0].split('.')[-1])\n label1 = Image.open(os.path.join(self.db_root_dir, labelpath))\n label2path = os.path.join('Annotations/480p/', seqname,\n next_frame + '.' 
+ lablist[0].split('.')[-1])\n label2 = Image.open(os.path.join(self.db_root_dir, label2path))" + }, + { + "comment": "This code randomly selects a reference image and associated scribble label for each video frame, ensuring the labels are unique and not from the same or consecutive frames. It also ensures that the selected images have corresponding annotations in the 480p folder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":321-343", + "content": " label1 = np.array(label1, dtype=np.uint8)\n label2 = np.array(label2, dtype=np.uint8)\n ###################\n ref_img = np.random.choice(imagelist, 1)\n ref_img = ref_img[0]\n ref_img_name = ref_img\n ref_scribble_label = Image.open(\n os.path.join(\n self.db_root_dir, 'Annotations/480p/', seqname,\n ref_img_name.split('.')[0] + '.' + lablist[0].split('.')[-1]))\n ref_scribble_label = np.array(ref_scribble_label, dtype=np.uint8)\n while len(np.unique(ref_scribble_label)) < self.seq_dict[seqname][\n -1] + 1 or ref_img == prev_img or ref_img == (\n next_frame + '.' + prev_img.split('.')[-1]):\n ref_img = np.random.choice(imagelist, 1)\n ref_img = ref_img[0]\n ref_img_name = ref_img\n ref_scribble_label = Image.open(\n os.path.join(\n self.db_root_dir, 'Annotations/480p/', seqname,\n ref_img_name.split('.')[0] + '.' +" + }, + { + "comment": "This code reads an image, splits it into RGB channels if required, and stores it in a dictionary along with other images and labels. It also assigns metadata to the sample. The transform is applied if not None.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":344-373", + "content": " lablist[0].split('.')[-1]))\n ref_scribble_label = np.array(ref_scribble_label, dtype=np.int64)\n ref_img = os.path.join('JPEGImages/480p/', seqname, ref_img)\n ref_img = cv2.imread(os.path.join(self.db_root_dir, ref_img))\n ref_img = np.array(ref_img, dtype=np.float32)\n ####\n ###################\n if self.rgb:\n img1 = img1[:, :, [2, 1, 0]]\n img2 = img2[:, :, [2, 1, 0]]\n ref_img = ref_img[:, :, [2, 1, 0]]\n obj_num = self.seq_dict[seqname][-1]\n sample = {\n 'ref_img': ref_img,\n 'img1': img1,\n 'img2': img2,\n 'ref_scribble_label': ref_scribble_label,\n 'label1': label1,\n 'label2': label2\n }\n sample['meta'] = {\n 'seq_name': seqname,\n 'frame_num': frame_num,\n 'obj_num': obj_num\n }\n if self.transform is not None:\n sample = self.transform(sample)\n sample['ref_scribble_label'] = paddle.to_tensor(" + }, + { + "comment": "The code defines a function that loads and preprocesses data from a specific source. 
It checks if the sequence list file exists and then proceeds to read object masks, getting the number of objects in each sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":374-399", + "content": " sample['ref_scribble_label'], dtype='int64')\n sample['label1'] = paddle.to_tensor(sample['label1'], dtype='int64')\n sample['label2'] = paddle.to_tensor(sample['label2'], dtype='int64')\n return sample\n ########################\n def _check_preprocess(self):\n _seq_list_file = self.seq_list_file\n if not os.path.isfile(_seq_list_file):\n return False\n else:\n self.seq_dict = json.load(open(self.seq_list_file, 'r'))\n return True\n def _preprocess(self):\n self.seq_dict = {}\n for seq in self.seqs:\n # Read object masks and get number of objects\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/', seq)))\n label_path = os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq, name_label[0])\n _mask = np.array(Image.open(label_path))\n _mask_ids = np.unique(_mask)" + }, + { + "comment": "This code defines a class for the DAVIS2017 dataset used in training. It initializes the dataset object based on specified parameters, writes a sequence list file containing frame numbers for each sequence, and provides the functionality to load images, masks, and other data required for training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":400-430", + "content": " n_obj = _mask_ids[-1]\n self.seq_dict[seq] = list(range(1, n_obj + 1))\n with open(self.seq_list_file, 'w') as outfile:\n outfile.write('{{\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]])))\n for ii in range(1, len(self.seqs)):\n outfile.write(',\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]])))\n outfile.write('\\n}\\n')\n print('Preprocessing finished')\nclass DAVIS2017_Train(Dataset):\n \"\"\"DAVIS2017 dataset for training\n Return: imgs: N*2*3*H*W,label: N*2*1*H*W, seq-name: N, frame_num:N\n \"\"\"\n def __init__(self,\n split='train',\n root=cfg.DATA_ROOT,\n transform=None,\n rgb=False):\n self.split = split\n self.db_root_dir = root\n self.rgb = rgb\n self.transform = transform\n self.seq_list_file = os.path.join(\n self.db_root_dir, 'ImageSets', '2017'," + }, + { + "comment": "This code is creating a custom dataloader for the DAVIS dataset. It loads the image and annotation files, sorts them by name, checks if preprocessing needs to be done, and then forms a sample list containing the image paths and labels. 
The result will be used for training or testing purposes in the Ma-Net application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":431-455", + "content": " '_'.join(self.split) + '_instances.txt')\n self.seqs = []\n for splt in self.split:\n with open(\n os.path.join(self.db_root_dir, 'ImageSets', '2017',\n self.split + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n self.seqs.extend(seqs_tmp)\n if not self._check_preprocess():\n self._preprocess()\n self.sample_list = []\n for seq_name in self.seqs:\n images = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'JPEGImages/480p/',\n seq_name.strip())))\n images_path = list(\n map(\n lambda x: os.path.join('JPEGImages/480p/', seq_name.strip(),\n x), images))\n lab = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/'," + }, + { + "comment": "The code creates a custom dataloader for a dataset with two adjacent frames and their corresponding labels. It takes the images and label paths, appends them to a list of dictionaries, and handles any necessary padding to ensure frame numbers have 5 digits. The length of the dataloader is determined by the number of samples in the sample_list, and the __getitem__ method retrieves specific samples based on their index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":456-484", + "content": " seq_name.strip())))\n lab_path = list(\n map(\n lambda x: os.path.join('Annotations/480p/', seq_name.strip(\n ), x), lab))\n for img_path, label_path in zip(images_path[:-1], lab_path[:-1]):\n tmp_dic = {\n 'img': img_path,\n 'label': label_path,\n 'seq_name': seq_name,\n 'frame_num': img_path.split('/')[-1].split('.')[0]\n }\n self.sample_list.append(tmp_dic)\n def __len__(self):\n return len(self.sample_list)\n def __getitem__(self, idx):\n tmp_sample = self.sample_list[idx]\n imgpath = tmp_sample['img']\n labelpath = tmp_sample['label']\n seqname = tmp_sample['seq_name']\n frame_num = int(tmp_sample['frame_num']) + 1\n next_frame = str(frame_num)\n while len(next_frame) != 5:\n next_frame = '0' + next_frame\n ###############################Processing two adjacent frames and labels" + }, + { + "comment": "The code reads image and label files for a video sequence from their respective directories, converts them to numpy arrays of dtype float32 and int32 respectively for compatibility with the model's stack function. It also retrieves reference frame information from ref_frame_dic for the given sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":485-505", + "content": " img2path = os.path.join('JPEGImages/480p/', seqname,\n next_frame + '.' + imgpath.split('.')[-1])\n img2 = cv2.imread(os.path.join(self.db_root_dir, img2path))\n img2 = np.array(img2, dtype=np.float32)\n img1 = cv2.imread(os.path.join(self.db_root_dir, imgpath))\n img1 = np.array(img1, dtype=np.float32)\n ###############\n label1 = Image.open(os.path.join(self.db_root_dir, labelpath))\n label2path = os.path.join('Annotations/480p/', seqname,\n next_frame + '.' 
+ labelpath.split('.')[-1])\n label2 = Image.open(os.path.join(self.db_root_dir, label2path))\n label1 = np.array(\n label1, dtype=np.int32\n ) # fixed, uint8->int32, because layers.stack does not support uint8\n label2 = np.array(\n label2, dtype=np.int32\n ) # fixed, uint8->int32, because layers.stack does not support uint8\n ###################\n ref_tmp_dic = self.ref_frame_dic[seqname]" + }, + { + "comment": "This code reads the reference image, scribble label, and ground truth frame from a dictionary. It then converts them to appropriate data types for processing. If rgb=True, it changes the color order. It also gets the total number of objects in the sequence. Finally, it creates a sample dictionary with all these elements.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":506-530", + "content": " ref_img = ref_tmp_dic['ref_frame']\n ref_scribble_label = ref_tmp_dic['scribble_label']\n ref_img = cv2.imread(os.path.join(self.db_root_dir, ref_img))\n ref_img = np.array(ref_img, dtype=np.float32)\n ref_frame_gt = ref_tmp_dic['ref_frame_gt']\n ref_frame_gt = Image.open(os.path.join(self.db_root_dir, ref_frame_gt))\n ref_frame_gt = np.array(\n ref_frame_gt, dtype=np.int32\n ) # fixed, uint8->int32, because layers.stack does not support uint8\n ref_frame_num = ref_tmp_dic['ref_frame_num']\n ###################\n if self.rgb:\n img1 = img1[:, :, [2, 1, 0]]\n img2 = img2[:, :, [2, 1, 0]]\n ref_img = ref_img[:, :, [2, 1, 0]]\n obj_num = self.seq_dict[seqname][-1]\n sample = {\n 'ref_img': ref_img,\n 'img1': img1,\n 'img2': img2,\n 'ref_scribble_label': ref_scribble_label,\n 'label1': label1,\n 'label2': label2,\n 'ref_frame_gt': ref_frame_gt" + }, + { + "comment": "This code defines a function that creates a sample containing images, labels, and metadata for the Ma-Net model. It also includes a separate function to update the reference frame and label based on user input. The sample is then transformed using a specified transform if one is provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":531-561", + "content": " }\n if 'prev_round_label' in ref_tmp_dic:\n prev_round_label = ref_tmp_dic['prev_round_label']\n prev_round_label = prev_round_label.squeeze()\n prev_round_label = prev_round_label.numpy()\n sample = {\n 'ref_img': ref_img,\n 'img1': img1,\n 'img2': img2,\n 'ref_scribble_label': ref_scribble_label,\n 'label1': label1,\n 'label2': label2,\n 'ref_frame_gt': ref_frame_gt,\n 'prev_round_label': prev_round_label\n }\n sample['meta'] = {\n 'seq_name': seqname,\n 'frame_num': frame_num,\n 'obj_num': obj_num,\n 'ref_frame_num': ref_frame_num\n }\n if self.transform is not None:\n sample = self.transform(sample)\n return sample\n def update_ref_frame_and_label(self,\n round_scribble=None,\n frame_num=None,\n prev_round_label_dic=None):" + }, + { + "comment": "Updating the reference frame and scribbles for each sequence in the dataset. If no frame number is given, uses the first frame from annotated_frames list. Ensures frame number is 5 digits long. Retrieves the corresponding reference image path and ground truth mask path. Reads the reference image. Resizes the image based on its height and width. Generates scribble masks for each frame using the provided scribble. 
If no frame number given, selects the first frame's scribble mask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":562-584", + "content": " ##########Update reference frame and scribbles\n for seq in self.seqs:\n scribble = round_scribble[seq]\n if frame_num is None:\n scr_frame = annotated_frames(scribble)[0]\n else:\n scr_frame = frame_num[seq]\n scr_frame = int(scr_frame)\n scr_f = str(scr_frame)\n while len(scr_f) != 5:\n scr_f = '0' + scr_f\n ref_frame_path = os.path.join('JPEGImages/480p', seq,\n scr_f + '.jpg')\n #######################\n ref_frame_gt = os.path.join('Annotations/480p/', seq,\n scr_f + '.png')\n #########################\n ref_tmp = cv2.imread(os.path.join(self.db_root_dir, ref_frame_path))\n h_, w_ = ref_tmp.shape[:2]\n scribble_masks = scribbles2mask(scribble, (h_, w_))\n if frame_num is None:\n scribble_label = scribble_masks[scr_frame]" + }, + { + "comment": "The code initializes a dictionary for reference frames, storing information such as reference frame path, scribble label, and ground truth data. If there is a previous round's label dictionary, it also includes the previous round's label in the current dictionary entry. It uses the database root directory to find the Scribbles folder and selects a random JSON file for each sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":585-609", + "content": " else:\n scribble_label = scribble_masks[0]\n self.ref_frame_dic[seq] = {\n 'ref_frame': ref_frame_path,\n 'scribble_label': scribble_label,\n 'ref_frame_gt': ref_frame_gt,\n 'ref_frame_num': scr_frame\n }\n if prev_round_label_dic is not None:\n self.ref_frame_dic[seq] = {\n 'ref_frame': ref_frame_path,\n 'scribble_label': scribble_label,\n 'ref_frame_gt': ref_frame_gt,\n 'ref_frame_num': scr_frame,\n 'prev_round_label': prev_round_label_dic[seq]\n }\n def init_ref_frame_dic(self):\n self.ref_frame_dic = {}\n scribbles_path = os.path.join(self.db_root_dir, 'Scribbles')\n for seq in self.seqs:\n selected_json = np.random.choice(\n ['001.json', '002.json', '003.json'], 1)\n selected_json = selected_json[0]\n scribble = os.path.join(self.db_root_dir, 'Scribbles', seq," + }, + { + "comment": "Reading JSON file for annotated frame, extracting frame path and loading reference image using OpenCV, determining the shape of the reference image, extracting the mask from the scribble, storing reference frame path in ref_frame_dic.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":610-632", + "content": " selected_json)\n with open(scribble) as f:\n scribble = json.load(f)\n # print(scribble)\n scr_frame = annotated_frames(scribble)[0]\n scr_f = str(scr_frame)\n while len(scr_f) != 5:\n scr_f = '0' + scr_f\n ref_frame_path = os.path.join('JPEGImages/480p', seq,\n scr_f + '.jpg')\n ref_tmp = cv2.imread(\n os.path.join(self.db_root_dir, ref_frame_path))\n h_, w_ = ref_tmp.shape[:2]\n scribble_masks = scribbles2mask(scribble, (h_, w_))\n ########################\n ref_frame_gt = os.path.join('Annotations/480p/', seq,\n scr_f + '.png')\n ########################\n scribble_label = scribble_masks[scr_frame]\n self.ref_frame_dic[seq] = {\n 'ref_frame': ref_frame_path," + }, + { + "comment": "The code reads a list of sequences from the sequence_list file and checks if it exists. 
If the file does not exist, it returns False; otherwise, it loads the sequence dictionary using json.load() and then proceeds to preprocess each sequence by reading object masks and finding the number of objects in the masks. The code stores this information in a dictionary format for later use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":633-661", + "content": " 'scribble_label': scribble_label,\n 'ref_frame_gt': ref_frame_gt,\n 'ref_frame_num': scr_frame\n }\n ########################\n def _check_preprocess(self):\n _seq_list_file = self.seq_list_file\n if not os.path.isfile(_seq_list_file):\n return False\n else:\n self.seq_dict = json.load(open(self.seq_list_file, 'r'))\n return True\n def _preprocess(self):\n self.seq_dict = {}\n for seq in self.seqs:\n # Read object masks and get number of objects\n name_label = np.sort(\n os.listdir(\n os.path.join(self.db_root_dir, 'Annotations/480p/', seq)))\n label_path = os.path.join(self.db_root_dir, 'Annotations/480p/',\n seq, name_label[0])\n _mask = np.array(Image.open(label_path))\n _mask_ids = np.unique(_mask)\n n_obj = _mask_ids[-1]\n self.seq_dict[seq] = list(range(1, n_obj + 1))" + }, + { + "comment": "The code writes a JSON file containing video sequences and their corresponding dictionaries, which will be used for the dataset. It iterates over each sequence, formats the output as JSON strings in the file, and finishes by printing \"Preprocessing finished\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py\":663-671", + "content": " with open(self.seq_list_file, 'w') as outfile:\n outfile.write('{{\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]])))\n for ii in range(1, len(self.seqs)):\n outfile.write(',\\n\\t\"{:s}\": {:s}'.format(\n self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]])))\n outfile.write('\\n}\\n')\n print('Preprocessing finished')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/29e71c71-93fa-4b6d-8c00-8886b0e95962.json b/docs/doc/29e71c71-93fa-4b6d-8c00-8886b0e95962.json new file mode 100644 index 000000000..410ce0133 --- /dev/null +++ b/docs/doc/29e71c71-93fa-4b6d-8c00-8886b0e95962.json @@ -0,0 +1,20 @@ +{ + "summary": "This code uses PaddleVideo library to classify videos, perform feature extraction, and predict bounding box results. It logs information, saves outputs if needed, and writes inference results into a JSON file.", + "details": [ + { + "comment": "This code is for Baidu Cloud action and loads the model using the BMN (Behaved Motion Network) model from a given configuration file. It also defines a function to classify videos by predicting their actions and prints the information about the video being processed. 
The code uses logger for logging the information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py\":0-49", + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport shutil\nimport numpy as np\nsys.path.append(\n \"/workspace/bianjiang03/App_TableTennis/PaddleVideo/FootballAction/predict/action_detect\"\n)\nimport models.bmn_infer as prop_model\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport utils.config_utils as config_utils\nimport logger\nlogger = logger.Logger()\ndef load_model(cfg_file=\"configs/configs.yaml\"):\n \"\"\"\n load_model\n \"\"\"\n logger.info(\"load model ... \")\n global infer_configs\n infer_configs = parse_config(cfg_file)\n print_configs(infer_configs, \"Infer\")\n t0 = time.time()\n global prop_model\n prop_model = prop_model.InferModel(infer_configs)\n t1 = time.time()\n logger.info(\"step0: load model time: {} min\\n\".format((t1 - t0) * 1.0 / 60))\ndef video_classify(video_name, dataset_dir):\n \"\"\"\n extract_feature\n \"\"\"\n logger.info('predict ... ')\n logger.info(video_name)" + }, + { + "comment": "This code performs video feature extraction and proposal generation using the PaddleVideo library. It first loads the video features from a pickle file, then predicts bounding box minimum notation (Bmn) results using a pre-trained model. Finally, it returns the Bmn results and saves them in an output directory if it doesn't already exist.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py\":51-83", + "content": " # step 1: extract feature\n feature_path = dataset_dir + video_name\n video_features = pickle.load(open(feature_path, 'rb'))\n print('===video_features===', video_name)\n # step2: get proposal\n t0 = time.time()\n bmn_results = prop_model.predict(infer_configs, material=video_features)\n t1 = time.time()\n logger.info(np.array(bmn_results).shape)\n logger.info(\"step2: proposal time: {} min\".format((t1 - t0) * 1.0 / 60))\n return bmn_results\nif __name__ == '__main__':\n dataset_dir = '/workspace/bianjiang03/DATA/Features_competition_test_A/'\n output_dir = '/workspace/bianjiang03/DATA'\n if not os.path.exists(output_dir + '/Output_for_bmn'):\n os.mkdir(output_dir + '/Output_for_bmn')\n results = []\n load_model()\n directory = os.fsencode(dataset_dir)\n for file in os.listdir(directory):\n filename = os.fsdecode(file)\n bmn_results = video_classify(filename, dataset_dir)\n results.append({\n 'video_name': filename.split('.pkl')[0],\n 'num_proposal': len(bmn_results)," + }, + { + "comment": "This code segment writes the inference results into a JSON file. It first stores the 'bmn_results' dictionary and then writes it to a file named \"Output_for_bmn/prop.json\". The JSON data is formatted for readability with indentation and using UTF-8 encoding. 
Once writing is complete, it prints \"Done with the inference!\" indicating successful execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py\":84-92", + "content": " 'bmn_results': bmn_results\n })\n with open(output_dir + '/Output_for_bmn/prop.json', 'w',\n encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)\n print('Done with the inference!')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2a3ee5e8-0ed0-470e-8093-d3d61b489df6.json b/docs/doc/2a3ee5e8-0ed0-470e-8093-d3d61b489df6.json new file mode 100644 index 000000000..cb12621a6 --- /dev/null +++ b/docs/doc/2a3ee5e8-0ed0-470e-8093-d3d61b489df6.json @@ -0,0 +1,140 @@ +{ + "summary": "OctConv3D is a configurable 3D convolutional layer in TransNetV2's backbone, utilizing features such as max pooling and SDDCNNV2 blocks for shot transition detection. ConvNextV2 applies feature extraction and pooling, while the code defines models using Linear and ConvexCombinationRegularization layers for classification tasks.", + "details": [ + { + "comment": "This code defines a 3D convolutional neural network layer called OctConv3D. It takes input and output channels, kernel size, dilation rate, alpha (for octave pooling), use_bias flag, and initializer as parameters for creating the layer. This layer can be used in other models by utilizing the BACKBONES registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":0-27", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as functional\nimport random\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\nclass OctConv3D(nn.Layer):\n def __init__(self, in_filters, filters, kernel_size=3, dilation_rate=(1, 1, 1), alpha=0.25,\n use_bias=True, kernel_initializer=nn.initializer.KaimingNormal()):\n super(OctConv3D, self).__init__()" + }, + { + "comment": "Defines a 3D Convolutional network with interleaved low and high-resolution paths. 
Low-to-high and high-to-low convolutions are performed to maintain spatial resolution while reducing dimensionality for the TransNetV2 backbone model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":29-42", + "content": " self.low_channels = int(filters * alpha)\n self.high_channels = filters - self.low_channels\n self.high_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),\n weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.high_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),\n weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=False)\n self.low_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1)," + }, + { + "comment": "This code defines a TransNetV2 backbone model for video analysis. It includes convolutional layers, an upsampler, and downsampler to process input data. The `pad_to` function pads the tensor with zeros to match a target shape, useful for maintaining consistent dimensions throughout the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":43-57", + "content": " weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=False)\n self.low_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size,\n dilation=dilation_rate, padding=(dilation_rate[0], 1, 1),\n weight_attr=ParamAttr(initializer=kernel_initializer),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.upsampler = nn.Upsample(size=(1, 2, 2), data_format='NCDHW')\n self.downsampler = nn.AvgPool3D(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0, 1, 1))\n @staticmethod\n def pad_to(tensor, target_shape):\n shape = tensor.shape\n padding = [[0, tar - curr] for curr, tar in zip(shape, target_shape)]\n return functional.pad(tensor, padding, \"CONSTANT\", data_format='NCDHW')" + }, + { + "comment": "The code defines a forward function that takes inputs and performs high-to-high, high-to-low, low-to-high, and low-to-low transformations. It also includes a Conv3DConfigurable class with parameters for in_filters, filters, dilation_rate, separable, octave, and use_bias. 
The code asserts that separable and octave cannot both be True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":59-89", + "content": " @staticmethod\n def crop_to(tensor, target_width, target_height):\n return tensor[:, :, :target_height, :target_width]\n def forward(self, inputs):\n low_inputs, high_inputs = inputs\n high_to_high = self.high_to_high(high_inputs)\n high_to_low = self.high_to_low(self.downsampler(high_inputs))\n low_to_high = self.upsampler(self.low_to_high(low_inputs))\n low_to_low = self.low_to_low(low_inputs)\n high_output = high_to_high[:, :, :, :low_to_high.shape[3], :low_to_high.shape[4]] + low_to_high\n low_output = low_to_low + high_to_low[:, :, :, :low_to_low.shape[3], :low_to_low.shape[4]]\n return low_output, high_output\nclass Conv3DConfigurable(nn.Layer):\n def __init__(self,\n in_filters,\n filters,\n dilation_rate,\n separable=True,\n octave=False,\n use_bias=True):\n super(Conv3DConfigurable, self).__init__()\n assert not (separable and octave)\n if separable:" + }, + { + "comment": "The code initializes a Conv3D layer and an optional octave convolution layer for the TransNetV2 backbone. The Conv3D layers apply 3x3 kernel with varying dilation rates, while the optional OctConv3D layer has a 3x1x1 kernel and dilation rate (dilation_rate, 1, 1). The layers are added to a LayerList for further processing in the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":90-103", + "content": " conv1 = nn.Conv3D(in_filters, 2 * filters, kernel_size=(1, 3, 3),\n dilation=(1, 1, 1), padding=(0, 1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),\n bias_attr=False)\n conv2 = nn.Conv3D(2 * filters, filters, kernel_size=(3, 1, 1),\n dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 0, 0),\n weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.layers = nn.LayerList([conv1, conv2])\n elif octave:\n conv = OctConv3D(in_filters, filters, kernel_size=3, dilation_rate=(dilation_rate, 1, 1),\n use_bias=use_bias,\n kernel_initializer=nn.initializer.KaimingNormal())" + }, + { + "comment": "This code defines a neural network backbone called TransnetV2, which consists of convolutional layers. The Conv3DConfigurable class is used to configure the layers with specified input and output filters, kernel size, dilation rate, padding, and whether to use bias or batch normalization. The DilatedDCNNV2 class extends this concept by allowing the choice between octave convolution and batch normalization. 
Both classes inherit from nn.Layer and have a forward method for processing inputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":104-130", + "content": " self.layers = [conv]\n else:\n conv = nn.Conv3D(in_filters, filters, kernel_size=3,\n dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias)\n self.layers = nn.LayerList([conv])\n def forward(self, inputs):\n x = inputs\n for layer in self.layers:\n x = layer(x)\n return x\nclass DilatedDCNNV2(nn.Layer):\n def __init__(self,\n in_filters,\n filters,\n batch_norm=True,\n activation=None,\n octave_conv=False):\n super(DilatedDCNNV2, self).__init__()\n assert not (octave_conv and batch_norm)\n self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=not batch_norm, octave=octave_conv)" + }, + { + "comment": "This code defines a TransNetV2 model, which uses multiple Conv3D layers to process input data. The model includes configurable convolution layers with different filter sizes (2, 4, and 8), batch normalization, and activation functions. The forward method applies these layers to the inputs and concatenates their outputs along the channel dimension.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":131-149", + "content": " self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=not batch_norm, octave=octave_conv)\n self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=not batch_norm, octave=octave_conv)\n self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=not batch_norm, octave=octave_conv)\n self.octave = octave_conv\n self.bn = nn.BatchNorm3D(filters * 4, momentum=0.99, epsilon=1e-03,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n ) if batch_norm else None\n self.activation = activation\n def forward(self, inputs):\n conv1 = self.Conv3D_1(inputs)\n conv2 = self.Conv3D_2(inputs)\n conv3 = self.Conv3D_4(inputs)\n conv4 = self.Conv3D_8(inputs)\n # shape of convi[j]/convi is [B, 3, T, H, W], concat in channel dimension\n if self.octave:" + }, + { + "comment": "The code defines a StackedDDCNNV2 class that is a type of neural network layer. It takes in parameters such as number of input filters, number of blocks, and output filters. The class uses convolutions with optional batch normalization and activation functions. The convolutions can be either octave or non-octave depending on the parameter setting. 
The pooling type is either max or average pooling, and there is a stochastic depth drop probability parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":150-178", + "content": " x = [paddle.concat([conv1[0], conv2[0], conv3[0], conv4[0]], axis=1),\n paddle.concat([conv1[1], conv2[1], conv3[1], conv4[1]], axis=1)]\n else:\n x = paddle.concat([conv1, conv2, conv3, conv4], axis=1)\n if self.bn is not None:\n x = self.bn(x)\n if self.activation is not None:\n if self.octave:\n x = [self.activation(x[0]), self.activation(x[1])]\n else:\n x = self.activation(x)\n return x\nclass StackedDDCNNV2(nn.Layer):\n def __init__(self,\n in_filters,\n n_blocks,\n filters,\n shortcut=True,\n use_octave_conv=False,\n pool_type=\"avg\",\n stochastic_depth_drop_prob=0.0):\n super(StackedDDCNNV2, self).__init__()\n assert pool_type == \"max\" or pool_type == \"avg\"\n if use_octave_conv and pool_type == \"max\":\n print(\"WARN: Octave convolution was designed with average pooling, not max pooling.\")" + }, + { + "comment": "Initializes backbone layers and sets parameters. Applies octave convolution if use_octave_conv is True, and performs max or avg pooling depending on pool_type. Stochastic depth is applied with probability stochastic_depth_drop_prob. Forward pass applies blocks of DDCNNV2, concatenates and applies ReLU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":180-206", + "content": " self.shortcut = shortcut\n self.DDCNN = nn.LayerList([\n DilatedDCNNV2(in_filters if i == 1 else filters * 4, filters, octave_conv=use_octave_conv,\n activation=functional.relu if i != n_blocks else None) for i in range(1, n_blocks + 1)\n ])\n self.pool = nn.MaxPool3D(kernel_size=(1, 2, 2)) if pool_type == \"max\" else nn.AvgPool3D(kernel_size=(1, 2, 2))\n self.octave = use_octave_conv\n self.stochastic_depth_drop_prob = stochastic_depth_drop_prob\n def forward(self, inputs):\n x = inputs\n shortcut = None\n if self.octave:\n x = [self.pool(x), x]\n for block in self.DDCNN:\n x = block(x)\n if shortcut is None:\n shortcut = x\n # shape of x[i] is [B, 3, T, H, W], concat in channel dimension\n if self.octave:\n x = paddle.concat([x[0], self.pool(x[1])], axis=1)\n x = functional.relu(x)\n if self.shortcut is not None:\n if self.stochastic_depth_drop_prob != 0.:" + }, + { + "comment": "This code defines a ResNetBlock class that consists of Conv2D layer and BatchNorm2D layer. 
The stochastic depth is applied during training by randomly dropping connections with a specified probability, while in non-octave cases, it applies pooling to the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":207-231", + "content": " if self.training:\n if random.random() < self.stochastic_depth_drop_prob:\n x = shortcut\n else:\n x = x + shortcut\n else:\n x = (1 - self.stochastic_depth_drop_prob) * x + shortcut\n else:\n x += shortcut\n if not self.octave:\n x = self.pool(x)\n return x\nclass ResNetBlock(nn.Layer):\n def __init__(self, in_filters, filters, strides=(1, 1)):\n super(ResNetBlock, self).__init__()\n self.conv1 = nn.Conv2D(in_filters, filters, kernel_size=(3, 3), stride=strides, padding=(1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.bn1 = nn.BatchNorm2D(filters,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))" + }, + { + "comment": "The code defines a Conv2D layer and BatchNorm2D layer in the `TransNetV2` class, followed by a forward function that applies these layers in sequence. The ResNetFeatures class initializes a Conv2D layer for extracting features from input images. Both classes are part of an object-oriented model architecture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":233-259", + "content": " self.conv2 = nn.Conv2D(filters, filters, kernel_size=(3, 3), padding=(1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.bn2 = nn.BatchNorm2D(filters,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))\n def forward(self, inputs):\n x = self.conv1(inputs)\n x = self.bn1(x)\n x = functional.relu(x)\n x = self.conv2(x)\n x = self.bn2(x)\n shortcut = inputs\n x += shortcut\n return functional.relu(x)\nclass ResNetFeatures(nn.Layer):\n def __init__(self, in_filters=3,\n mean=[0.485, 0.456, 0.406],\n std=[0.229, 0.224, 0.225]):\n super(ResNetFeatures, self).__init__()\n self.conv1 = nn.Conv2D(in_channels=in_filters, out_channels=64, kernel_size=(7, 7)," + }, + { + "comment": "This code is for TransNetV2 backbone model initialization. It includes a convolution layer with padding, batch normalization, max pooling, and ResNetBlocks (layer2a, layer2b). 
The forward function performs normalization, reshaping, convolution, and batch normalization on the input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":260-281", + "content": " stride=(2, 2), padding=(3, 3),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.bn1 = nn.BatchNorm2D(num_features=64, momentum=0.99, epsilon=1e-03,\n weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n )\n self.max_pool = nn.MaxPool2D(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n self.layer2a = ResNetBlock(64, 64)\n self.layer2b = ResNetBlock(64, 64)\n self.mean = paddle.to_tensor(mean)\n self.std = paddle.to_tensor(std)\n def forward(self, inputs):\n shape = inputs.shape\n x = paddle.reshape(inputs, [shape[0] * shape[2], shape[1], shape[3], shape[4]])\n x = (x - self.mean) / self.std\n x = self.conv1(x)\n x = self.bn1(x)" + }, + { + "comment": "This code defines a class \"FrameSimilarity\" that takes in filters, similarity dimension, lookup window, output dimension, stop_gradient flag, and use_bias as parameters. It initializes the layer with a projection linear layer and an fc linear layer. The projection layer maps input features to a specified similarity dimension using XavierUniform initialization. The fc layer maps the lookup window to the output dimension, using XavierUniform initialization for weights and Constant initialization for biases.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":282-306", + "content": " x = functional.relu(x)\n x = self.max_pool(x)\n x = self.layer2a(x)\n x = self.layer2b(x)\n new_shape = x.shape\n x = paddle.reshape(x, [shape[0], new_shape[1], shape[2], new_shape[2], new_shape[3]])\n return x\nclass FrameSimilarity(nn.Layer):\n def __init__(self,\n in_filters,\n similarity_dim=128,\n lookup_window=101,\n output_dim=128,\n stop_gradient=False,\n use_bias=False):\n super(FrameSimilarity, self).__init__()\n self.projection = nn.Linear(in_filters, similarity_dim,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=use_bias)\n self.fc = nn.Linear(lookup_window, output_dim,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))" + }, + { + "comment": "The code initializes a TransNetV2 model with lookup window and stop_gradient options. It then calculates similarities between time windows using batch mean, transpose, projection, and normalization. 
Finally, it pads the similarities for further calculations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":308-329", + "content": " self.lookup_window = lookup_window\n self.stop_gradient = stop_gradient\n assert lookup_window % 2 == 1, \"`lookup_window` must be odd integer\"\n def forward(self, inputs):\n x = paddle.concat([paddle.mean(x, axis=[3, 4]) for x in inputs], axis=1)\n x = paddle.transpose(x, (0, 2, 1))\n if self.stop_gradient:\n x = x.stop_gradient\n x = self.projection(x)\n x = functional.normalize(x, p=2, axis=2)\n batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0]\n time_window = x.shape[1]\n similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window]\n similarities_padded = functional.pad(similarities,\n [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2],\n data_format='NCL')\n batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1])" + }, + { + "comment": "This code is calculating the indices for gathering similarities from a padded tensor. It tiles and stacks batch, time, and lookup indices to create an array of valid indices. Then it uses these indices to gather similarities from the padded tensor and applies ReLU activation on top of an FC layer to return the output. The ConvexCombinationRegularization class initializes a projection layer with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":330-345", + "content": " batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window])\n time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1])\n time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window])\n lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window])\n lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices\n indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1)\n similarities = paddle.gather_nd(similarities_padded, indices)\n return functional.relu(self.fc(similarities))\nclass ConvexCombinationRegularization(nn.Layer):\n def __init__(self, in_filters, filters=32, delta_scale=10., loss_weight=0.01):\n super(ConvexCombinationRegularization, self).__init__()\n self.projection = nn.Conv3D(in_filters, filters, kernel_size=1, dilation=1, padding=(0, 0, 0),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform())," + }, + { + "comment": "This code defines the Conv3D layers of the ConvexCombinationRegularization module in the TransNetV2 backbone. It has a projection layer, relu activation, and takes in image_inputs and feature_inputs. 
The forward function processes these inputs, extracting the first and last frame windows.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":346-363", + "content": " bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))\n self.features = nn.Conv3D((filters * 3), filters * 2,\n kernel_size=(3, 3, 3), dilation=1, padding=(1, 1, 1),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))\n self.dense = nn.Linear(64, 1, weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), bias_attr=True)\n self.loss = nn.SmoothL1Loss(reduction='none')\n self.delta_scale = delta_scale\n self.loss_weight = loss_weight\n def forward(self, image_inputs, feature_inputs):\n x = feature_inputs\n x = self.projection(x)\n x = functional.relu(x)\n batch_size = x.shape[0]\n window_size = x.shape[2]\n first_frame = paddle.tile(x[:, :, :1], [1, 1, window_size, 1, 1])\n last_frame = paddle.tile(x[:, :, -1:], [1, 1, window_size, 1, 1])" + }, + { + "comment": "This code is part of the TransnetV2 model in PaddleVideo. It concatenates frames, processes them through layers, and calculates alpha values for first and last images. It then combines these images based on the calculated alphas and performs loss calculation using a loss function. The ColorHistograms layer is initialized with a linear transformation layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":364-389", + "content": " x = paddle.concat([x, first_frame, last_frame], 1)\n x = self.features(x)\n x = functional.relu(x)\n x = paddle.mean(x, axis=[3, 4])\n x = paddle.transpose(x, (0, 2, 1))\n alpha = self.dense(x)\n alpha = paddle.transpose(alpha, (0, 2, 1))\n first_img = paddle.tile(image_inputs[:, :, :1], [1, 1, window_size, 1, 1])\n last_img = paddle.tile(image_inputs[:, :, -1:], [1, 1, window_size, 1, 1])\n alpha_ = functional.sigmoid(alpha)\n alpha_ = paddle.reshape(alpha_, [batch_size, 1, window_size, 1, 1])\n predictions_ = (alpha_ * first_img + (1 - alpha_) * last_img)\n loss_ = self.loss(label=image_inputs / self.delta_scale, input=predictions_ / self.delta_scale)\n loss_ = self.loss_weight * paddle.mean(loss_)\n return alpha, loss_\nclass ColorHistograms(nn.Layer):\n def __init__(self,\n lookup_window=101,\n output_dim=None):\n super(ColorHistograms, self).__init__()\n self.fc = nn.Linear(lookup_window, output_dim," + }, + { + "comment": "This code defines a function to compute color histograms of frames. It first converts the frame values to int32, then defines a function get_bin which extracts and scales RGB values. The batch size is extracted from the frames shape, and the frames are flattened into a 3-dimensional array if the number of channels is 3 or 6.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":390-410", + "content": " weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=0.))) if output_dim is not None else None\n self.lookup_window = lookup_window\n assert lookup_window % 2 == 1, \"`lookup_window` must be odd integer\"\n def compute_color_histograms(self, frames):\n frames = frames.astype('int32')\n def get_bin(frames):\n # returns 0 .. 
511\n R, G, B = frames[:, :, 0], frames[:, :, 1], frames[:, :, 2]\n R, G, B = R // 32, G // 32, B // 32\n return (R * 64) + (G * 8) + B\n batch_size = paddle.slice(frames.shape, starts=[0], ends=[1], axes=[0]) if frames.shape[0] == -1 else frames.shape[0]\n time_window, height, width, no_channels = frames.shape[1:]\n assert no_channels == 3 or no_channels == 6\n if no_channels == 3:\n frames_flatten = frames.reshape([-1, height * width, 3])" + }, + { + "comment": "This code computes color histograms for each frame in a video and then calculates similarities between frames using batch matrix multiplication. It first checks the input shape to determine whether it should extract only the batch size and time window or use the total shape. It then reshapes and bins the frame values, normalizes the histograms, and finally computes the similarity matrix for each frame pair. The purpose is likely for video sequence analysis or comparison.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":411-428", + "content": " else:\n frames_flatten = frames.reshape([-1, height * width * 2, 3])\n binned_values = get_bin(frames_flatten)\n frame_bin_prefix = (paddle.arange(0, batch_size * time_window) * 512).reshape([-1, 1])\n binned_values = (binned_values + frame_bin_prefix).reshape([-1, 1])\n histograms = paddle.zeros_like(frame_bin_prefix, dtype='int32').tile([512]).reshape([-1])\n histograms = histograms.scatter_nd_add(binned_values, paddle.ones_like(binned_values, dtype='int32').reshape([-1]))\n histograms = histograms.reshape([batch_size, time_window, 512]).astype('float32')\n histograms_normalized = functional.normalize(histograms, p=2, axis=2)\n return histograms_normalized\n def forward(self, inputs):\n x = self.compute_color_histograms(inputs)\n batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0]\n time_window = x.shape[1]\n similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window]" + }, + { + "comment": "This code performs lookup on a padded tensor using gathered indices from batch, time window, and lookup window. It then applies an optional fully connected layer with ReLU activation function if present.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":429-445", + "content": " similarities_padded = functional.pad(similarities,\n [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2],\n data_format='NCL')\n batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1])\n batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window])\n time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1])\n time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window])\n lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window])\n lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices\n indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1)\n similarities = paddle.gather_nd(similarities_padded, indices)\n if self.fc is not None:\n return functional.relu(self.fc(similarities))\n return similarities" + }, + { + "comment": "The code defines the TransNetV2 model, a deep network architecture for shot transition detection. It has multiple input sources and various options to use or not use different features and operations. 
The mean and std are provided as initialization parameters to standardize the input data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":448-472", + "content": "@BACKBONES.register()\nclass TransNetV2(nn.Layer):\n \"\"\"TransNetV2 model from\n `\"TransNet V2: An effective deep network architecture for fast shot transition detection\" `_\n \"\"\"\n def __init__(self,\n F=16, L=3, S=2, D=1024,\n use_many_hot_targets=True,\n use_frame_similarity=True,\n use_color_histograms=True,\n use_mean_pooling=False,\n dropout_rate=0.5,\n use_convex_comb_reg=False,\n use_resnet_features=False,\n use_resnet_like_top=False,\n frame_similarity_on_last_layer=False,\n mean=[0.485, 0.456, 0.406],\n std=[0.229, 0.224, 0.225]):\n super(TransNetV2, self).__init__()\n self.mean = np.array(mean, np.float32).reshape([1, 3, 1, 1]) * 255\n self.std = np.array(std, np.float32).reshape([1, 3, 1, 1]) * 255\n self.use_resnet_features = use_resnet_features\n s" + }, + { + "comment": "Code snippet is from PaddleVideo's TransNetV2 model. It checks if use_resnet_features is True and if so, initializes resnet_layers with ResNetFeatures. If resnet_like_top is also True, it then initializes resnet_like_top_conv and resnet_like_top_bn for ResNet-like top layers with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":472-483", + "content": "elf.resnet_layers = ResNetFeatures(in_filters=3, mean=self.mean, std=self.std) if self.use_resnet_features else None\n self.resnet_like_top = use_resnet_like_top\n if self.resnet_like_top:\n self.resnet_like_top_conv = nn.Conv3D(64 if self.use_resnet_features else 3, 32, kernel_size=(3, 7, 7),\n stride=(1, 2, 2),\n padding=(1, 3, 3),\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=False)\n self.resnet_like_top_bn = nn.BatchNorm3D(32, momentum=0.99, epsilon=1e-03,\n weight_attr=ParamAttr(\n initializer=nn.initializer.Constant(value=1.)),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)))" + }, + { + "comment": "This code initializes the model components of a TransNetv2 backbone. 
It sets up max pooling, creates a LayerList for SDDCNNV2 blocks, initializes frame similarity and color histogram layers based on flags, and includes dropout layer if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":484-507", + "content": " self.resnet_like_top_max_pool = nn.MaxPool3D(kernel_size=(1, 3, 3), stride=(1, 2, 2),\n padding=(0, 1, 1))\n if self.resnet_like_top:\n in_filters = 32\n elif self.use_resnet_features:\n in_filters = 64\n else:\n in_filters = 3\n self.SDDCNN = nn.LayerList(\n [StackedDDCNNV2(in_filters=in_filters, n_blocks=S, filters=F,\n stochastic_depth_drop_prob=0.)] +\n [StackedDDCNNV2(in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2 ** i) for i in range(1, L)]\n )\n self.frame_sim_layer = FrameSimilarity(\n sum([(F * 2 ** i) * 4 for i in range(L)]), lookup_window=101, output_dim=128, similarity_dim=128,\n use_bias=True\n ) if use_frame_similarity else None\n self.color_hist_layer = ColorHistograms(\n lookup_window=101, output_dim=128\n ) if use_color_histograms else None\n self.dropout = nn.Dropout(dropout_rate) if dropout_rate is not None else None" + }, + { + "comment": "This code initializes a neural network model with a linear layer (`self.fc1`) that takes an input dimension of 512 if certain conditions are met, otherwise it takes the output_dim calculated earlier. The layer has D output dimensions and uses Xavier uniform initialization for weights and constant initialization for biases. Additionally, there's another linear layer (`self.cls_layer1`) with 1 output dimension that is initialized with Xavier uniform initialization for weights and a constant value of 0 for biases. It takes an input dimension of either 1152 or D based on whether frame similarity is added to the last layer or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":509-525", + "content": " output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6 # 3x6 for spatial dimensions\n if use_frame_similarity: output_dim += 128\n if use_color_histograms: output_dim += 128\n self.use_mean_pooling = use_mean_pooling\n self.has_downsample = False\n if self.use_resnet_features or self.resnet_like_top or self.use_mean_pooling:\n self.has_downsample = True\n self.fc1 = nn.Linear(512 if self.has_downsample else output_dim, D,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n )\n self.frame_similarity_on_last_layer = frame_similarity_on_last_layer\n self.cls_layer1 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))" + }, + { + "comment": "The code defines a model with two layers, a Linear layer and ConvexCombinationRegularization, depending on the use_many_hot_targets and use_convex_comb_reg parameters. The Linear layer has 1 output for each frame, unless frame_similarity_on_last_layer is set, in which case it has D outputs. If use_many_hot_targets is False, the layer is None. The forward function receives inputs of shape [B, T, H, W, 3] and performs transpose, resnet_features processing (if use_resnet_features=True), and normalization to apply the model layers. 
It also clips the input values between 0 and 255 before applying the regularization if use_convex_comb_reg is True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":526-547", + "content": " )\n self.cls_layer2 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1,\n weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()),\n bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))\n ) if use_many_hot_targets else None\n self.convex_comb_reg = ConvexCombinationRegularization(\n in_filters=(F * 2 ** (L - 1) * 4)) if use_convex_comb_reg else None\n def forward(self, inputs):\n assert list(inputs.shape[2:]) == [27, 48, 3] and inputs.dtype == paddle.float32, \\\n \"incorrect input type and/or shape\"\n out_dict = {}\n # shape [B, T, H, W, 3] to shape [B, 3, T, H, W]\n x = inputs.transpose([0, 4, 1, 2, 3])\n if self.use_resnet_features:\n x = self.resnet_layers(x)\n else:\n x = x / 255.\n inputs = inputs.clip(min=0).astype('uint8')\n if self.resnet_like_top:" + }, + { + "comment": "This code performs feature extraction and pooling operations for the TransNetV2 backbone model. It applies residual blocks, top convolutions, batch normalization, and max pooling to the input. Then it calculates the convex combination regularization loss if required. The code either applies mean pooling or 3D reshaping based on the use_mean_pooling flag. Finally, it concatenates frame similarity layer outputs and color histogram layer outputs before performing fully connected layer calculations and applying relu activation and dropout if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":548-570", + "content": " x = self.resnet_like_top_conv(x)\n x = self.resnet_like_top_bn(x)\n x = self.resnet_like_top_max_pool(x)\n block_features = []\n for block in self.SDDCNN:\n x = block(x)\n block_features.append(x)\n if self.convex_comb_reg is not None:\n out_dict[\"alphas\"], out_dict[\"comb_reg_loss\"] = self.convex_comb_reg(inputs.transpose([0, 4, 1, 2, 3]), x)\n if self.use_mean_pooling:\n x = paddle.mean(x, axis=[3, 4])\n x = x.transpose([0, 2, 1])\n else:\n x = x.transpose([0, 2, 3, 4, 1])\n x = x.reshape([x.shape[0], x.shape[1], x.shape[2]*x.shape[3]*x.shape[4]])\n if self.frame_sim_layer is not None:\n x = paddle.concat([self.frame_sim_layer(block_features), x], 2)\n if self.color_hist_layer is not None:\n x = paddle.concat([self.color_hist_layer(inputs), x], 2)\n x = self.fc1(x)\n x = functional.relu(x)\n if self.dropout is not None:\n x = self.dropout(x)" + }, + { + "comment": "This code checks if the frame similarity layer and classifier layers are not None, then performs a concatenation operation on block features and x. It applies the classifier layer to the resulting output and optionally applies another classifier layer. 
The function returns one_hot and an optional out_dict if they exist.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/transnetv2.py\":571-580", + "content": " if self.frame_sim_layer is not None and self.frame_similarity_on_last_layer:\n x = paddle.concat([self.frame_sim_layer(block_features), x], 2)\n one_hot = self.cls_layer1(x)\n if self.cls_layer2 is not None:\n out_dict[\"many_hot\"] = self.cls_layer2(x)\n if len(out_dict) > 0:\n return one_hot, out_dict\n return one_hot" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2a982ad5-ef2a-4e9a-baf5-63578a23e618.json b/docs/doc/2a982ad5-ef2a-4e9a-baf5-63578a23e618.json new file mode 100644 index 000000000..afd7152ea --- /dev/null +++ b/docs/doc/2a982ad5-ef2a-4e9a-baf5-63578a23e618.json @@ -0,0 +1,20 @@ +{ + "summary": "This code calculates recall for object detection by iterating through images, selecting proposals based on scores, and calculating IoU between these proposals and ground truth boxes. It defines `recalls()` and `eval_recalls()`, with the latter computing recalls for each image.", + "details": [ + { + "comment": "This code calculates recall metric for object detection. It takes in all IoUs (intersection over union), proposal numbers, and thresholds as input. It iterates through images and gt (ground truth) boxes, and then computes the recall scores for each image and stores them in an array. The recall is computed by finding the maximum IOU between ground truth and proposals for each image and storing it in a temporary array, then concatenating these values into the final result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/recall.py\":0-26", + "content": "import numpy as np\nimport paddle \ndef _recalls(all_ious, proposal_nums, thrs):\n img_num = all_ious.shape[0]\n total_gt_num = sum([ious.shape[0] for ious in all_ious])\n ious_ = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)\n for k, proposal_num in enumerate(proposal_nums):\n tmp_ious = np.zeros(0)\n for i in range(img_num):\n ious = all_ious[i][:, :proposal_num].copy()\n gt_ious = np.zeros(ious.shape[0])\n if ious.size == 0:\n tmp_ious = np.hstack((tmp_ious, gt_ious))\n continue\n for j in range(ious.shape[0]):\n gt_max_overlaps = ious.argmax(axis=1)\n max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]\n gt_idx = max_ious.argmax()\n gt_ious[j] = max_ious[gt_idx]\n box_idx = gt_max_overlaps[gt_idx]\n ious[gt_idx, :] = -1\n ious[:, box_idx] = -1\n tmp_ious = np.hstack((tmp_ious, gt_ious))\n ious_[k, :] = tmp_ious" + }, + { + "comment": "The code defines two functions: `recalls()` and `eval_recalls()`. \n\n`recalls()` calculates the average precision-recall curve by comparing predicted bounding boxes with ground truth ones. It does this by sorting intersection over union (IOU) values, creating a recall matrix based on IOU thresholds, and averaging recall across images.\n\n`eval_recalls()` is a wrapper function that calls `recalls()`. It calculates recalls for each image given ground truths and proposals. 
It also checks input types and sets default parameters if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/recall.py\":28-61", + "content": " ious_ = np.fliplr(np.sort(ious_, axis=1))\n recalls = np.zeros((proposal_nums.size, thrs.size))\n for i, thr in enumerate(thrs):\n recalls[:, i] = (ious_ >= thr).sum(axis=1) / float(total_gt_num)\n return recalls\ndef set_recall_param(proposal_nums, iou_thrs):\n if isinstance(proposal_nums, list):\n proposal_nums_ = np.array(proposal_nums)\n elif isinstance(proposal_nums, int):\n proposal_nums_ = np.array([proposal_nums])\n else:\n proposal_nums_ = proposal_nums\n if iou_thrs is None:\n _iou_thrs = np.array([0.5])\n elif isinstance(iou_thrs, list):\n _iou_thrs = np.array(iou_thrs)\n elif isinstance(iou_thrs, float):\n _iou_thrs = np.array([iou_thrs])\n else:\n _iou_thrs = iou_thrs\n return proposal_nums_, _iou_thrs\ndef eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None):\n \"\"\"Calculate recalls. \"\"\"\n img_num = len(gts)\n assert img_num == len(proposals)\n proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)" + }, + { + "comment": "This code calculates recall for object detection. It iterates through images, sorts and selects proposals based on scores, and then calculates IoU (intersection over union) between these proposals and ground truth boxes. If no ground truth is found or it has zero boxes, all_ious is filled with zeros. The function returns recalls for each image number.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/recall.py\":63-83", + "content": " all_ious = []\n for i in range(img_num):\n if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:\n scores = proposals[i][:, 4]\n sort_idx = np.argsort(scores)[::-1]\n img_proposal = proposals[i][sort_idx, :]\n else:\n img_proposal = proposals[i]\n prop_num = min(img_proposal.shape[0], proposal_nums[-1])\n if gts[i] is None or gts[i].shape[0] == 0:\n ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)\n else:\n ious = bbox_overlaps(\n torch.tensor(gts[i]),\n torch.tensor(img_proposal[:prop_num, :4]))\n ious = ious.data.numpy()\n all_ious.append(ious)\n all_ious = np.array(all_ious)\n recalls = _recalls(all_ious, proposal_nums, iou_thrs)\n return recalls" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2cc8b836-a7ef-4eab-aabe-28f3ee4f2da3.json b/docs/doc/2cc8b836-a7ef-4eab-aabe-28f3ee4f2da3.json new file mode 100644 index 000000000..6cef317ea --- /dev/null +++ b/docs/doc/2cc8b836-a7ef-4eab-aabe-28f3ee4f2da3.json @@ -0,0 +1,75 @@ +{ + "summary": "The code utilizes PaddleVideo's image processing pipeline for efficient frame sampling, defines a sampler class for video decoding and data conversion, and calculates sampling positions, offsets, and generates frame indices for video sequences.", + "details": [ + { + "comment": "This code is a Python module that imports various libraries and defines an image processing pipeline for PaddleVideo. It checks if SimpleITK is installed, handles pickling, and registers the pipeline using PaddleVideo's registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":0-37", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport random\nimport numpy as np\nfrom PIL import Image\ntry:\n import SimpleITK as sitk\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care.\"\n )\nimport cv2\nfrom ..registry import PIPELINES\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\n@PIPELINES.register()" + }, + { + "comment": "The `Sampler` class is used to sample frames based on various parameters such as number of segments, length of each segment, frame interval, valid mode, select left flag and whether to use PIL for reading images. It returns the index of sampled frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":38-65", + "content": "class Sampler(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n valid_mode(bool): True or False.\n select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode.\n Returns:\n frames_idx: the index of sampled #frames.\n \"\"\"\n def __init__(self,\n num_seg,\n seg_len,\n frame_interval=None,\n valid_mode=False,\n select_left=False,\n dense_sample=False,\n linspace_sample=False,\n use_pil=True):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.frame_interval = frame_interval\n self.valid_mode = valid_mode\n self.select_left = select_left\n self.dense_sample = dense_sample\n self.linspace_sample = linspace_sample" + }, + { + "comment": "The code defines a class with an attribute 'use_pil' that determines the image format. The '_get' method retrieves frames based on data format (frame, MRI, or video), applies necessary conversions and resizing, and stores them in 'imgs'. 
It uses different libraries such as Image, sitk, and cv2 for different formats.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":66-95", + "content": " self.use_pil = use_pil\n def _get(self, frames_idx, results):\n data_format = results['format']\n if data_format == \"frame\":\n frame_dir = results['frame_dir']\n imgs = []\n for idx in frames_idx:\n img = Image.open(\n os.path.join(frame_dir,\n results['suffix'].format(idx))).convert('RGB')\n imgs.append(img)\n elif data_format == \"MRI\":\n frame_dir = results['frame_dir']\n imgs = []\n MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))\n for idx in frames_idx:\n item = MRI[idx]\n item = cv2.resize(item, (224, 224))\n imgs.append(item)\n elif data_format == \"video\":\n if results['backend'] == 'cv2':\n frames = np.array(results['frames'])\n imgs = []\n for idx in frames_idx:\n imgbuf = frames[idx]\n img = Image.fromarray(imgbuf, mode='RGB')" + }, + { + "comment": "Code is handling video decoding using different backends such as 'opencv', 'decord', and 'pyav'. It appends the frames to imgs list, converts numpy array to image using Image.fromarray method for 'decord' backend, and handles frame indexing and data structures based on backend used.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":96-118", + "content": " imgs.append(img)\n elif results['backend'] == 'decord':\n container = results['frames']\n if self.use_pil:\n frames_select = container.get_batch(frames_idx)\n # dearray_to_img\n np_frames = frames_select.asnumpy()\n imgs = []\n for i in range(np_frames.shape[0]):\n imgbuf = np_frames[i]\n imgs.append(Image.fromarray(imgbuf, mode='RGB'))\n else:\n if frames_idx.ndim != 1:\n frames_idx = np.squeeze(frames_idx)\n frame_dict = {\n idx: container[idx].asnumpy()\n for idx in np.unique(frames_idx)\n }\n imgs = [frame_dict[idx] for idx in frames_idx]\n elif results['backend'] == 'pyav':\n imgs = []\n frames = np.array(results['frames'])\n for idx in frames_idx:" + }, + { + "comment": "This code snippet is responsible for sampling frames from a video sequence, and it handles different scenarios based on the input parameters. If `dense_sample` is True, it adjusts the index before accessing the frame. The frames are then appended to a list called `imgs`. If neither of the else conditions are met, it raises a `NotImplementedError`. 
The function also includes another method, `_get_train_clips`, which calculates clip offsets for training purposes based on the number of frames and other parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":119-143", + "content": " if self.dense_sample:\n idx = idx - 1\n imgbuf = frames[idx]\n imgs.append(imgbuf)\n imgs = np.stack(imgs) # thwc\n else:\n raise NotImplementedError\n else:\n raise NotImplementedError\n results['imgs'] = imgs\n return results\n def _get_train_clips(self, num_frames):\n ori_seg_len = self.seg_len * self.frame_interval\n avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg\n if avg_interval > 0:\n base_offsets = np.arange(self.num_seg) * avg_interval\n clip_offsets = base_offsets + np.random.randint(avg_interval,\n size=self.num_seg)\n elif num_frames > max(self.num_seg, ori_seg_len):\n clip_offsets = np.sort(\n np.random.randint(num_frames - ori_seg_len + 1,\n size=self.num_seg))\n elif avg_interval == 0:" + }, + { + "comment": "The code defines a class with methods to determine clip offsets based on the number of frames and segment length. If the number of frames exceeds the original segment length, it calculates clip offsets for each segment. Otherwise, it sets all clip offsets to zero. The class also has a __call__ method that takes frames length as input and returns sampling indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":144-170", + "content": " ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg\n clip_offsets = np.around(np.arange(self.num_seg) * ratio)\n else:\n clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)\n return clip_offsets\n def _get_test_clips(self, num_frames):\n ori_seg_len = self.seg_len * self.frame_interval\n avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)\n if num_frames > ori_seg_len - 1:\n base_offsets = np.arange(self.num_seg) * avg_interval\n clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int)\n else:\n clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)\n return clip_offsets\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id.\n \"\"\"\n frames_len = int(results['frames_len'])\n frames_idx = []\n if self.frame_interval is not None:\n assert isinstance(self.frame_interval, int)" + }, + { + "comment": "This code determines the sampling method for frames based on the mode (valid or train) and format ('video' or 'frame'). 
It calculates offsets, handles different formats, and if linspace_sample is True, it generates offsets using linear spacing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":171-198", + "content": " if not self.valid_mode:\n offsets = self._get_train_clips(frames_len)\n else:\n offsets = self._get_test_clips(frames_len)\n offsets = offsets[:, None] + np.arange(\n self.seg_len)[None, :] * self.frame_interval\n offsets = np.concatenate(offsets)\n offsets = offsets.reshape((-1, self.seg_len))\n offsets = np.mod(offsets, frames_len)\n offsets = np.concatenate(offsets)\n if results['format'] == 'video':\n frames_idx = offsets\n elif results['format'] == 'frame':\n frames_idx = list(offsets + 1)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)\n if self.linspace_sample:\n if 'start_idx' in results and 'end_idx' in results:\n offsets = np.linspace(results['start_idx'], results['end_idx'],\n self.num_seg)\n else:\n offsets = np.linspace(0, frames_len - 1, self.num_seg)" + }, + { + "comment": "This code segment calculates the frames to sample from a video, based on its format (video/frame/MRI). It also handles dense sampling for ppTSM. In non-dense mode, it selects random positions for each segment within the range of 1 to frames_len. For dense sampling in train mode, it generates a set of evenly spaced frame indices between start_idx and sample_pos, which is calculated based on frames_len and 64 (to ensure at least one frame within the window). The offsets are then used to fetch corresponding data using the _get method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":199-222", + "content": " offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)\n if results['format'] == 'video':\n frames_idx = list(offsets)\n frames_idx = [x % frames_len for x in frames_idx]\n elif results['format'] == 'frame':\n frames_idx = list(offsets + 1)\n elif results['format'] == 'MRI':\n frames_idx = list(offsets)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)\n average_dur = int(frames_len / self.num_seg)\n if not self.select_left:\n if self.dense_sample: # For ppTSM\n if not self.valid_mode: # train\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride = 64 // self.num_seg\n start_idx = 0 if sample_pos == 1 else np.random.randint(\n 0, sample_pos - 1)\n offsets = [(idx * t_stride + start_idx) % frames_len + 1\n for idx in range(self.num_seg)]" + }, + { + "comment": "The code determines the sampling position based on frames length, number of segments, and valid mode. If no offsets are provided, it calculates the starting positions for each segment using a linear space. Then, it generates the offsets by multiplying the stride with the current segment index and adding the start index. 
Finally, if in valid mode, it randomly selects indices within the average duration per segment and adds them to the offsets list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":223-244", + "content": " frames_idx = offsets\n else:\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride = 64 // self.num_seg\n start_list = np.linspace(0,\n sample_pos - 1,\n num=10,\n dtype=int)\n offsets = []\n for start_idx in start_list.tolist():\n offsets += [\n (idx * t_stride + start_idx) % frames_len + 1\n for idx in range(self.num_seg)\n ]\n frames_idx = offsets\n else:\n for i in range(self.num_seg):\n idx = 0\n if not self.valid_mode:\n if average_dur >= self.seg_len:\n idx = random.randint(0, average_dur - self.seg_len)\n idx += i * average_dur" + }, + { + "comment": "Code calculates index based on average duration, then appends corresponding frame indices to frames_idx list based on the format specified in results. If the format is not recognized, it raises NotImplementedError. Finally, it returns the frames_idx and results to an unknown method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":245-267", + "content": " elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n else:\n if average_dur >= self.seg_len:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + self.seg_len):\n if results['format'] == 'video':\n frames_idx.append(int(jj % frames_len))\n elif results['format'] == 'frame':\n frames_idx.append(jj + 1)\n elif results['format'] == 'MRI':\n frames_idx.append(jj)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)" + }, + { + "comment": "This code generates random offsets for selecting frames from a video. If the valid mode is not enabled, it randomly selects frame offsets within the available duration or number of frames. If the valid mode is enabled, it evenly distributes the frames across the video duration. The 'format' variable determines if the selected frames are in video format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":269-291", + "content": " else: # for TSM\n if not self.valid_mode:\n if average_dur > 0:\n offsets = np.multiply(list(range(self.num_seg)),\n average_dur) + np.random.randint(\n average_dur, size=self.num_seg)\n elif frames_len > self.num_seg:\n offsets = np.sort(\n np.random.randint(frames_len, size=self.num_seg))\n else:\n offsets = np.zeros(shape=(self.num_seg, ))\n else:\n if frames_len > self.num_seg:\n average_dur_float = frames_len / self.num_seg\n offsets = np.array([\n int(average_dur_float / 2.0 + average_dur_float * x)\n for x in range(self.num_seg)\n ])\n else:\n offsets = np.zeros(shape=(self.num_seg, ))\n if results['format'] == 'video':\n frames_idx = list(offsets)" + }, + { + "comment": "This code snippet defines a SamplerPkl class that samples frames' indices for video loading. It takes arguments num_seg, seg_len, and backend and returns the index of sampled frames. 
Depending on the results format ('frame', 'MRI', or others), it sets the frames_idx accordingly before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":292-326", + "content": " frames_idx = [x % frames_len for x in frames_idx]\n elif results['format'] == 'frame':\n frames_idx = list(offsets + 1)\n elif results['format'] == 'MRI':\n frames_idx = list(offsets)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)\n@PIPELINES.register()\nclass SamplerPkl(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n mode(str): 'train', 'valid'\n Returns:\n frames_idx: the index of sampled #frames.\n \"\"\"\n def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.valid_mode = valid_mode\n self.backend = backend\n def _get(self, buf):\n if isinstance(buf, str):\n img = Image.open(StringIO(buf))\n else:" + }, + { + "comment": "This code is part of a pipeline for image sampling in video processing. It loads data from disk, converts images to RGB format, and handles labels. The `__call__` method takes results as input, retrieves the video name, label, and frames from the loaded data. If the label is a dictionary or has multiple elements, it assigns the label to '\u52a8\u4f5c\u7c7b\u578b' or randomly chooses between the first two elements. It sets the 'frames_len' based on the length of frames and calculates the average duration per segment. Then, it initializes an empty list for the images and loops through the segments to create image samples. If valid mode is not enabled, it also resets the index variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":327-357", + "content": " img = Image.open(BytesIO(buf))\n img = img.convert('RGB')\n if self.backend != 'pillow':\n img = np.array(img)\n return img\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id.\n \"\"\"\n filename = results['frame_dir']\n data_loaded = pickle.load(open(filename, 'rb'), encoding='bytes')\n video_name, label, frames = data_loaded\n if isinstance(label, dict):\n label = label['\u52a8\u4f5c\u7c7b\u578b']\n results['labels'] = label\n elif len(label) == 1:\n results['labels'] = int(label[0])\n else:\n results['labels'] = int(label[0]) if random.random() < 0.5 else int(\n label[1])\n results['frames_len'] = len(frames)\n frames_len = results['frames_len']\n average_dur = int(int(frames_len) / self.num_seg)\n imgs = []\n for i in range(self.num_seg):\n idx = 0\n if not self.valid_mode:" + }, + { + "comment": "The code calculates the index for a segment of frames based on average duration and frame length. 
It then retrieves images from the frames list, appends them to imgs, sets backend type, and returns the results including the imgs and backend information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample.py\":358-381", + "content": " if average_dur >= self.seg_len:\n idx = random.randint(0, average_dur - self.seg_len)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n else:\n if average_dur >= self.seg_len:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + self.seg_len):\n imgbuf = frames[int(jj % results['frames_len'])]\n img = self._get(imgbuf)\n imgs.append(img)\n results['backend'] = self.backend\n results['imgs'] = imgs\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2cedbf37-69dd-428a-baa3-e50f088fa25f.json b/docs/doc/2cedbf37-69dd-428a-baa3-e50f088fa25f.json new file mode 100644 index 000000000..0176dbbeb --- /dev/null +++ b/docs/doc/2cedbf37-69dd-428a-baa3-e50f088fa25f.json @@ -0,0 +1,25 @@ +{ + "summary": "This code defines a Sampler class for sampling frame IDs in video data, using PIL to read images instead of OpenCV, and returns the index of sampled frames. It can calculate indices randomly or by formula.", + "details": [ + { + "comment": "This code defines a Sampler class that samples frames IDs for video data. It takes arguments: num_seg (number of segments), seg_len (number of sampled frames in each segment), and mode ('train' or 'valid'). The class uses PIL to read images instead of OpenCV (cv2) for better compatibility. The sampler returns the index of sampled frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py\":0-31", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport random\nfrom PIL import Image\nfrom ..registry import PIPELINES\nimport os\nimport numpy as np\n@PIPELINES.register()\nclass Sampler(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:\n num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n mode(str): 'train', 'valid'\n Returns:\n frames_idx: the index of sampled #frames." + }, + { + "comment": "The code defines a class with an initialization function and two methods, \"_get\" and \"__call__\". The \"_get\" method takes frames_idx and results as arguments, and based on the data format (frame or video), it retrieves and appends images to imgs. If the format is not frame or video, it raises a NotImplementedError. 
The \"__call__\" method takes frames_len as an argument and returns a sampling id.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py\":32-69", + "content": " \"\"\"\n def __init__(self, num_seg, seg_len, valid_mode=False):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.valid_mode = valid_mode\n def _get(self, frames_idx, results):\n data_format =results['format']\n if data_format == \"frame\":\n frame_dir = results['frame_dir']\n imgs = []\n for idx in frames_idx:\n img = Image.open(os.path.join(frame_dir, results['suffix'].format(idx))).convert('RGB')\n imgs.append(img)\n elif data_format == \"video\":\n frames = np.array(results['frames'])\n imgs = []\n for idx in frames_idx:\n imgbuf = frames[idx]\n img = Image.fromarray(imgbuf, mode='RGB')\n imgs.append(img)\n else:\n raise NotImplementedError\n results['imgs'] = imgs\n return results\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id." + }, + { + "comment": "This code calculates frame indices for video or frame data. It takes in 'frames_len' and 'num_seg' as inputs, and if 'valid_mode' is False, it generates random frame indices within the valid frame range. If 'valid_mode' is True, it calculates frame indices based on specific formulas. The output is stored in 'frames_idx'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py\":70-95", + "content": " \"\"\"\n frames_len = int(results['frames_len'])\n average_dur = int(int(frames_len) / self.num_seg)\n frames_idx = []\n for i in range(self.num_seg):\n idx = 0\n if not self.valid_mode:\n if average_dur >= self.seg_len:\n idx = random.randint(0, average_dur - self.seg_len)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else: # average_dur = 0\n idx = i % frames_len\n else:\n if average_dur >= self.seg_len:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i % frames_len\n for jj in range(idx, idx+self.seg_len):\n if results['format'] == 'video':\n frames_idx.append(int(jj%frames_len))\n elif results['format'] == 'frame':" + }, + { + "comment": "This code snippet is part of a class method that retrieves frames from a video file based on their index. If the frame index (jj+1) is not equal to 0, it appends the index to the frames_idx list; otherwise, it raises a NotImplementedError. The method then returns the results using the _get method with the frames_idx and results as arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py\":96-101", + "content": " #frame from 000001\n frames_idx.append(jj+1)\n else:\n raise NotImplementedError\n return self._get(frames_idx, results)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2cfd042c-4704-4187-8777-a90abf09ff85.json b/docs/doc/2cfd042c-4704-4187-8777-a90abf09ff85.json new file mode 100644 index 000000000..f983feeae --- /dev/null +++ b/docs/doc/2cfd042c-4704-4187-8777-a90abf09ff85.json @@ -0,0 +1,25 @@ +{ + "summary": "The code defines two functions, get_class2id_map and get_arguments. It reads ground truth text files, splits them by class labels, saves as .npy files, defines boundary frames for new actions, and saves these as separate .npy files. 
Assumes input files preprocessed and split by lines.", + "details": [ + { + "comment": "This code defines two functions: get_class2id_map and get_arguments. The get_class2id_map function takes a dataset name (50salads, gtea, or breakfast) and the path to the dataset directory, and returns a dictionary mapping class names to their respective IDs by reading the \"mapping.txt\" file in the specified dataset directory. The get_arguments function parses all arguments from the command line interface for converting ground truth txt files to numpy arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/prepare_asrf_data.py\":0-41", + "content": "import argparse\nimport glob\nimport os\nimport sys\nfrom typing import Dict\nimport numpy as np\nsys.path.append(os.path.join(os.path.dirname(__file__), \"..\"))\ndataset_names = [\"50salads\", \"breakfast\", \"gtea\"]\ndef get_class2id_map(dataset: str,\n dataset_dir: str = \"./dataset\") -> Dict[str, int]:\n \"\"\"\n Args:\n dataset: 50salads, gtea, breakfast\n dataset_dir: the path to the datset directory\n \"\"\"\n assert (dataset in dataset_names\n ), \"You have to choose 50salads, gtea or breakfast as dataset.\"\n with open(os.path.join(dataset_dir, \"{}/mapping.txt\".format(dataset)),\n \"r\") as f:\n actions = f.read().split(\"\\n\")[:-1]\n class2id_map = dict()\n for a in actions:\n class2id_map[a.split()[1]] = int(a.split()[0])\n return class2id_map\ndef get_arguments() -> argparse.Namespace:\n \"\"\"\n parse all the arguments from command line inteface\n return a list of parsed arguments\n \"\"\"\n parser = argparse.ArgumentParser(\n description=\"convert ground truth txt files to numpy array\")" + }, + { + "comment": "This code sets up the dataset directory path and creates directories for saving ground truth numpy arrays. It also creates a class to index mapping using get_class2id_map function, and retrieves all groundTruth text files' paths in the specified dataset directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/prepare_asrf_data.py\":42-73", + "content": " parser.add_argument(\n \"--dataset_dir\",\n type=str,\n default=\"./dataset\",\n help=\"path to a dataset directory (default: ./dataset)\",\n )\n return parser.parse_args()\ndef main() -> None:\n args = get_arguments()\n datasets = [\"50salads\", \"gtea\", \"breakfast\", \"baseball\"]\n for dataset in datasets:\n # make directory for saving ground truth numpy arrays\n cls_save_dir = os.path.join(args.dataset_dir, dataset, \"gt_arr\")\n if not os.path.exists(cls_save_dir):\n os.mkdir(cls_save_dir)\n # make directory for saving ground truth numpy arrays\n boundary_save_dir = os.path.join(args.dataset_dir, dataset,\n \"gt_boundary_arr\")\n if not os.path.exists(boundary_save_dir):\n os.mkdir(boundary_save_dir)\n # class to index mapping\n class2id_map = get_class2id_map(dataset, dataset_dir=args.dataset_dir)\n gt_dir = os.path.join(args.dataset_dir, dataset, \"groundTruth\")\n gt_paths = glob.glob(os.path.join(gt_dir, \"*.txt\"))" + }, + { + "comment": "This code is reading ground truth text files, splitting them into arrays based on class labels, and saving these arrays as .npy files. It also defines boundary frames for new actions and saves these as separate .npy files. 
The code assumes that the input files are already processed and split by lines.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/prepare_asrf_data.py\":75-105", + "content": " for gt_path in gt_paths:\n # the name of ground truth text file\n gt_name = os.path.relpath(gt_path, gt_dir)\n with open(gt_path, \"r\") as f:\n gt = f.read().split(\"\\n\")[:-1]\n gt_array = np.zeros(len(gt))\n for i in range(len(gt)):\n gt_array[i] = class2id_map[gt[i]]\n # save array\n np.save(os.path.join(cls_save_dir, gt_name[:-4] + \".npy\"), gt_array)\n # the name of ground truth text file\n gt_name = os.path.relpath(gt_path, gt_dir)\n with open(gt_path, \"r\") as f:\n gt = f.read().split(\"\\n\")[:-1]\n # define the frame where new action starts as boundary frame\n boundary = np.zeros(len(gt))\n last = gt[0]\n boundary[0] = 1\n for i in range(1, len(gt)):\n if last != gt[i]:\n boundary[i] = 1\n last = gt[i]\n # save array\n np.save(os.path.join(boundary_save_dir, gt_name[:-4] + \".npy\")," + }, + { + "comment": "This code snippet defines a function named \"main\" and checks if the script is being run directly. If it is, the \"main\" function is called to execute the desired task. The code prints \"Done\" after completing the specified operation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/50salads/prepare_asrf_data.py\":106-112", + "content": " boundary)\n print(\"Done\")\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2d143039-84f2-4840-a4b5-3828135abccf.json b/docs/doc/2d143039-84f2-4840-a4b5-3828135abccf.json new file mode 100644 index 000000000..d3b0d2afe --- /dev/null +++ b/docs/doc/2d143039-84f2-4840-a4b5-3828135abccf.json @@ -0,0 +1,20 @@ +{ + "summary": "The Registry class supports module customization, allowing users to register objects and retrieve them via unique names in a name-to-object mapping system.", + "details": [ + { + "comment": "The Registry class provides a name-to-object mapping, enabling third-party users to customize modules. To register an object, use @BACKBONES.register() or BACKBONES.register(ResNet).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/registry.py\":0-34", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nclass Registry(object):\n \"\"\"\n The registry that provides name -> object mapping, to support third-party users' custom modules.\n To register an object:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n @BACKBONES.register()\n class ResNet:\n pass\n Or:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n class ResNet:\n pass\n BACKBONES.register(ResNet)" + }, + { + "comment": "The code provides a registry class for building modules based on their names. It allows registration of objects with unique names, and can be used as a decorator or without. 
The usage example demonstrates how to get a backbone module using its name from the registered objects map.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/registry.py\":36-71", + "content": " Usage: To build a module.\n .. code-block:: python\n backbone_name = \"ResNet\"\n b = BACKBONES.get(backbone_name)()\n \"\"\"\n def __init__(self, name):\n \"\"\"\n Args:\n name (str): the name of this registry\n \"\"\"\n self._name = name\n self._obj_map = {}\n def __contains__(self, key):\n return self._obj_map.get(key) is not None\n def _do_register(self, name, obj):\n \"\"\"do register\"\"\"\n assert (\n name not in self._obj_map\n ), \"An object named '{}' was already registered in '{}' registry!\".format(\n name, self._name)\n self._obj_map[name] = obj\n def register(self, obj=None, name=None):\n \"\"\"\n Register the given object under the the name `obj.__name__`.\n Can be used as either a decorator or not. See docstring of this class for usage.\n \"\"\"\n if obj is None:\n # used as a decorator\n def deco(func_or_class, name=name):\n if name is None:\n name = func_or_class.__name__" + }, + { + "comment": "This code registers and retrieves objects in a registry. It allows registering functions or classes with optional names, and can retrieve the registered object by its name. The `get` function returns the class if found in the registry, otherwise raises KeyError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/registry.py\":72-97", + "content": " self._do_register(name, func_or_class)\n return func_or_class\n return deco\n # used as a function call\n if name is None:\n name = obj.__name__\n self._do_register(name, obj)\n def get(self, name):\n \"\"\"Get the registry record.\n Args:\n name (str): The class name.\n Returns:\n ret: The class.\n \"\"\"\n ret = self._obj_map.get(name)\n if ret is None:\n raise KeyError(\n \"No object named '{}' found in '{}' registry!\".format(\n name, self._name))\n return ret" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2dbf4402-58a7-4db6-8e75-1ec98bde3314.json b/docs/doc/2dbf4402-58a7-4db6-8e75-1ec98bde3314.json new file mode 100644 index 000000000..4340c5043 --- /dev/null +++ b/docs/doc/2dbf4402-58a7-4db6-8e75-1ec98bde3314.json @@ -0,0 +1,40 @@ +{ + "summary": "The Python script uses Baidu Cloud for action detection and includes audio, image processing functions. It has classes like ActionDetection and ModelPredict to initialize models, extract features from video input, retrieve proposals using BMN, classify actions based on extracted features and proposals, and log debugging information. Results are stored in a JSON file.", + "details": [ + { + "comment": "This code is a Python script for action detection using Baidu Cloud, which includes functions for processing audio and image data to predict actions. It utilizes various models such as mfcc_extractor, image_model, audio_model, prop_model, and classify_model. The ActionDetection class is defined, which likely contains the main logic of the action detection algorithm. 
The record_time_info function is a decorator used to log the time taken for executing specific functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py\":0-47", + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport functools\nimport numpy as np\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport mfcc.feature_extractor as mfcc_extractor\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nimport models.bmn_infer as prop_model\nimport models.lstm_infer as classify_model\nimport logger\nlogger = logger.Logger()\ndef record_time_info(func):\n \"\"\"decorator func to log cost time for func\n \"\"\"\n @functools.wraps(func)\n def timer(*args):\n \"\"\"log cost time for func\n \"\"\"\n logger.info(\"function [{}] processing ...\".format(func.__name__))\n start_time = time.time()\n retval = func(*args)\n cost_time = round(time.time() - start_time, 5)\n logger.info(\"function [{}] run time: {:.2f} min\".format(\n func.__name__, cost_time / 60))\n return retval\n return timer\nclass ActionDetection(object):" + }, + { + "comment": "This code defines a ModelPredict class with an initializer that reads configs from a specified file and prints them. It also checks certain conditions related to LSTM_ONLY, sets properties based on those conditions, and loads a model if not in DEBUG mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py\":48-75", + "content": " \"\"\"ModelPredict\"\"\"\n def __init__(self, cfg_file=\"configs/configs.yaml\"):\n cfg = parse_config(cfg_file)\n self.configs = cfg\n print_configs(self.configs, \"Infer\")\n name = 'COMMON'\n self.DEBUG = cfg[name]['DEBUG']\n self.BMN_ONLY = cfg[name]['BMN_ONLY']\n self.LSTM_ONLY = cfg[name]['LSTM_ONLY']\n self.PCM_ONLY = cfg[name]['PCM_ONLY']\n if self.LSTM_ONLY:\n self.prop_dict = {}\n for dataset in ['EuroCup2016']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(\n dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n self.prop_dict[basename] = item['bmn_results']\n @record_time_info\n def load_model(self):\n \"\"\"\n load_model\n \"\"\"\n if not self.DEBUG:" + }, + { + "comment": "The code initializes different models for image, audio, and proposal extraction, and a classifier. 
It then extracts features from the input video, retrieves proposals using BMN (Bidirectional Motion Model), and finally classifies the action based on these extracted features and proposals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py\":76-107", + "content": " self.image_model = image_model.InferModel(self.configs)\n if not self.PCM_ONLY:\n self.audio_model = audio_model.InferModel(self.configs)\n if not self.LSTM_ONLY:\n self.prop_model = prop_model.InferModel(self.configs)\n if not self.BMN_ONLY:\n self.classify_model = classify_model.InferModel(self.configs)\n logger.info(\"==> Action Detection prepared.\")\n @record_time_info\n def infer(self, imgs_path, pcm_path, fps=5):\n \"\"\"\n extract_feature\n \"\"\"\n self.imgs_path = imgs_path\n self.pcm_path = pcm_path\n self.configs['COMMON']['fps'] = fps\n logger.info(\"==> input video {}\".format(os.path.basename(\n self.imgs_path)))\n # step 1: extract feature\n video_features = self.extract_feature()\n # step2: get proposal\n bmn_results = self.extract_proposal(video_features)\n # step3: classify\n material = {'feature': video_features, 'proposal': bmn_results}" + }, + { + "comment": "This code defines several methods for video classification and feature extraction. It uses a model called \"classify_model\" to predict actions based on input material, and another model called \"prop_model\" to extract proposals. The BMN_ONLY and LSTM_ONLY flags determine if certain models are used or not. The code also includes logging for debugging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py\":108-135", + "content": " action_results = self.video_classify(material)\n return bmn_results, action_results\n @record_time_info\n def video_classify(self, material):\n \"\"\"video classify\"\"\"\n if self.BMN_ONLY:\n return []\n action_results = self.classify_model.predict(self.configs,\n material=material)\n logger.info('action shape {}'.format(np.array(action_results).shape))\n return action_results\n @record_time_info\n def extract_proposal(self, video_features):\n \"\"\"extract proposal\"\"\"\n if self.LSTM_ONLY:\n basename = self.imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = self.prop_dict[basename]\n return bmn_results\n bmn_results = self.prop_model.predict(self.configs,\n material=video_features)\n logger.info('proposal shape {}'.format(np.array(bmn_results).shape))\n return bmn_results\n @record_time_info\n def extract_feature(self):" + }, + { + "comment": "Extracts features from images and audio in a video file for further processing. 
If PCM_ONLY is True, extracts only MFCC features from audio using mfcc_extractor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py\":136-157", + "content": " \"\"\"extract feature\"\"\"\n if not self.DEBUG:\n image_path_list = get_images(self.imgs_path)\n self.configs['PPTSM']['frame_list'] = image_path_list\n self.configs['AUDIO']['pcm_file'] = self.pcm_path\n image_features = self.image_model.predict(self.configs)\n if self.PCM_ONLY:\n sample_rate = self.configs['AUDIO']['sample_rate']\n pcm_features = mfcc_extractor.extract_pcm(\n self.pcm_path, sample_rate)\n audio_features = []\n else:\n audio_features, pcm_features = self.audio_model.predict(\n self.configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n video_features = {\n 'image_feature': np_image_features,\n 'audio_feature': np_audio_features," + }, + { + "comment": "The code loads video features from frames or pcm file and returns the features in the form of a dictionary. It then proceeds to initialize an instance of the ActionDetection class, load the model, define image and audio paths, and finally calls the infer function to generate bmn_results and action_results which are stored in the results dictionary and saved into a json file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py\":158-184", + "content": " 'pcm_feature': np_pcm_features\n }\n else:\n feature_path = self.imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n logger.info(\"feature shape {} {} {}\".format(\n video_features['image_feature'].shape,\n video_features['audio_feature'].shape,\n video_features['pcm_feature'].shape))\n return video_features\nif __name__ == '__main__':\n model_predict = ActionDetection(cfg_file=\"../configs/configs.yaml\")\n model_predict.load_model()\n imgs_path = \"/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895\"\n pcm_path = \"/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm\"\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results = {'bmn_results': bmn_results, 'action_results': action_results}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)" + }, + { + "comment": "Writes data to file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/action.py\":185-185", + "content": " f.write(data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2debd80a-ec06-4938-a0db-556a83147597.json b/docs/doc/2debd80a-ec06-4938-a0db-556a83147597.json new file mode 100644 index 000000000..fed3e10f6 --- /dev/null +++ b/docs/doc/2debd80a-ec06-4938-a0db-556a83147597.json @@ -0,0 +1,105 @@ +{ + "summary": "The code imports modules, defines a Manet class for video segmentation using PaddleVideo's Manet_Stage1 model, and implements training, inference, mask generation, parallel processing, and frame saving steps. It is for deep learning models, visualizations, and measuring time efficiency.", + "details": [ + { + "comment": "This code is importing necessary modules and functions from different locations, including image processing utilities and machine learning libraries. 
It also defines some specific functions related to the MANET model in PaddlePaddle. The code is part of a larger framework for video modeling and image segmentation tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":0-25", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom EIVideo.paddlevideo.loader.builder import build_pipeline\nfrom EIVideo.paddlevideo.loader.pipelines import ToTensor_manet\nimport os\nimport timeit\nimport paddle\nfrom PIL import Image\nfrom davisinteractive.utils.scribbles import scribbles2mask, annotated_frames\nfrom paddle import nn\nfrom EIVideo.paddlevideo.utils import load\nfrom EIVideo.paddlevideo.utils.manet_utils import float_, _palette, damage_masks, long_, write_dict, rough_ROI" + }, + { + "comment": "The code is defining a class \"Manet\" that inherits from the BaseSegment class for video segmentation. It has train_step, val_step, infer_step, and test_step methods which are defined but not implemented. The class checks if the model configuration is Manet and then builds the model using build_model function before calling the test_step method with additional parameters like weights and parallel set to False.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":26-60", + "content": "from EIVideo.api import load_video, get_scribbles, submit_masks\nfrom ...builder import build_model\nfrom ...registry import SEGMENT\nfrom .base import BaseSegment\n# if cfg.MODEL.framework == \"Manet\":\n# cfg_helper = {\"knns\": 1,\n# \"is_save_image\": True}\n# cfg.update(cfg_helper)\n# build_model(cfg['MODEL']).test_step(**cfg,\n# weights=weights,\n# parallel=False)\n# return\n@SEGMENT.register()\nclass Manet(BaseSegment):\n def __init__(self, backbone=None, head=None, **cfg):\n super().__init__(backbone, head, **cfg)\n def train_step(self, data_batch, step, **cfg):\n pass\n def val_step(self, data_batch, **kwargs):\n pass\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n pass\n def test_step(self, weights, parallel=True, is_save_image=True, **cfg):\n # 1. 
Construct model.\n cfg['MODEL'].head.pretrained = ''" + }, + { + "comment": "This code initializes the model with test mode enabled, builds it using a function, potentially makes it parallel, loads a video for data, prints \"stage1 load_video success\" message, creates a report save directory if it doesn't exist, sets the maximum number of interactions to 8, and evaluates the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":61-86", + "content": " cfg['MODEL'].head.test_mode = True\n model = build_model(cfg['MODEL'])\n if parallel:\n model = paddle.DataParallel(model)\n # 2. Construct data.\n sequence = cfg[\"video_path\"].split('/')[-1].split('.')[0]\n obj_nums = 1\n images, _ = load_video(cfg[\"video_path\"], 480)\n print(\"stage1 load_video success\")\n # [195, 389, 238, 47, 244, 374, 175, 399]\n # .shape: (502, 480, 600, 3)\n report_save_dir = cfg.get(\"output_dir\",\n f\"./output/{cfg['model_name']}\")\n if not os.path.exists(report_save_dir):\n os.makedirs(report_save_dir)\n # Configuration used in the challenges\n max_nb_interactions = 8 # Maximum number of interactions\n # Interactive parameters\n model.eval()\n state_dicts_ = load(weights)['state_dict']\n state_dicts = {}\n for k, v in state_dicts_.items():\n if 'num_batches_tracked' not in k:\n state_dicts['head.' + k] = v" + }, + { + "comment": "This code segment checks if certain keys are present in the model's state dictionary. If not, it prints a message and writes the state dictionaries to a file named 'model_for_infer.txt'. It then sets the model's state dict with the state dictionaries, opens an inter_file.txt for writing, and initializes a variable 'seen_seq' as False. Inside a no_grad context, it retrieves scribbles and iterates over them, calculating total time, image shape, and checks if there are any annotated frames. If not, it assigns the previous label storage as final masks and submits those masks to the specified save path with corresponding images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":87-108", + "content": " if ('head.' + k) not in model.state_dict().keys():\n print(f'pretrained -----{k} -------is not in model')\n write_dict(state_dicts, 'model_for_infer.txt', **cfg)\n model.set_state_dict(state_dicts)\n inter_file = open(\n os.path.join(\n cfg.get(\"output_dir\", f\"./output/{cfg['model_name']}\"),\n 'inter_file.txt'), 'w')\n seen_seq = False\n with paddle.no_grad():\n # Get the current iteration scribbles\n for scribbles, first_scribble in get_scribbles():\n t_total = timeit.default_timer()\n f, h, w = images.shape[:3]\n if 'prev_label_storage' not in locals().keys():\n prev_label_storage = paddle.zeros([f, h, w])\n if len(annotated_frames(scribbles)) == 0:\n final_masks = prev_label_storage\n # ToDo To AP-kai: save_path\u4f20\u8fc7\u6765\u4e86\n submit_masks(cfg[\"save_path\"], final_masks.numpy(), images)" + }, + { + "comment": "The code handles the first round of scribbles and initializes memory for future interactions. 
It writes information to an inter_file, extracts pixel embeddings if it's the first round, and sets up variables for tracking interactions and embedding memories.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":109-133", + "content": " continue\n # if no scribbles return, keep masks in previous round\n start_annotated_frame = annotated_frames(scribbles)[0]\n pred_masks = []\n pred_masks_reverse = []\n if first_scribble: # If in the first round, initialize memories\n n_interaction = 1\n eval_global_map_tmp_dic = {}\n local_map_dics = ({}, {})\n total_frame_num = f\n else:\n n_interaction += 1\n inter_file.write(sequence + ' ' + 'interaction' +\n str(n_interaction) + ' ' + 'frame' +\n str(start_annotated_frame) + '\\n')\n if first_scribble: # if in the first round, extract pixel embbedings.\n if not seen_seq:\n seen_seq = True\n inter_turn = 1\n embedding_memory = []\n places = paddle.set_device('cpu')" + }, + { + "comment": "This code is iterating through each image in a batch and applying a pipeline transformation if testing mode is enabled. It then creates frame embeddings either by looping over model children or directly from the model head. The frame embeddings are appended to a list, concatenated, and stored as embedding_memory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":135-156", + "content": " for imgs in images:\n if cfg['PIPELINE'].get('test'):\n imgs = paddle.to_tensor([\n build_pipeline(cfg['PIPELINE'].test)({\n 'img1':\n imgs\n })['img1']\n ])\n else:\n imgs = paddle.to_tensor([imgs])\n if parallel:\n for c in model.children():\n frame_embedding = c.head.extract_feature(\n imgs)\n else:\n frame_embedding = model.head.extract_feature(\n imgs)\n embedding_memory.append(frame_embedding)\n del frame_embedding\n embedding_memory = paddle.concat(embedding_memory, 0)" + }, + { + "comment": "The code initializes the reference frame embedding and handles cases where the annotation is present or not. It extracts the reference frame embedding from the embedding memory, reshapes it, and then creates a scribble sample with the scribble label for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":157-175", + "content": " _, _, emb_h, emb_w = embedding_memory.shape\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n inter_turn += 1\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n ########\n scribble_masks = scribbles2mask(scribbles, (emb_h, emb_w))\n scribble_label = scribble_masks[start_annotated_frame]\n scribble_sample = {'scribble_label': scribble_label}\n scribble_sample = ToTensor_manet()(scribble_sample)" + }, + { + "comment": "This code snippet is responsible for saving an interactive scribble image. It first retrieves the scribble label, then constructs the file path to save the image based on configuration settings and iteration parameters. If the directory doesn't exist, it creates one. 
Finally, it saves the scribble image using a specific palette.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":176-194", + "content": " # print(ref_frame_embedding, ref_frame_embedding.shape)\n scribble_label = scribble_sample['scribble_label']\n scribble_label = scribble_label.unsqueeze(0)\n model_name = cfg['model_name']\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n inter_file_path = os.path.join(\n output_dir, sequence, 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))\n if is_save_image:\n ref_scribble_to_show = scribble_label.squeeze().numpy()\n im_ = Image.fromarray(\n ref_scribble_to_show.astype('uint8')).convert('P', )\n im_.putpalette(_palette)\n ref_img_name = str(start_annotated_frame)\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im_.save(" + }, + { + "comment": "This code segment is part of a video modeling framework. It deals with handling scribbles and generating masks based on them. If there are no scribbles after the first one, it prints a message and continues execution by submitting the previous label storage as final masks. This code also checks for parallel processing and seems to be part of an interaction segmentation head.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":195-215", + "content": " os.path.join(inter_file_path,\n 'inter_' + ref_img_name + '.png'))\n if first_scribble:\n prev_label = None\n prev_label_storage = paddle.zeros([f, h, w])\n else:\n prev_label = prev_label_storage[start_annotated_frame]\n prev_label = prev_label.unsqueeze(0).unsqueeze(0)\n # check if no scribbles.\n if not first_scribble and paddle.unique(\n scribble_label).shape[0] == 1:\n print(\n 'not first_scribble and paddle.unique(scribble_label).shape[0] == 1'\n )\n print(paddle.unique(scribble_label))\n final_masks = prev_label_storage\n submit_masks(cfg[\"save_path\"], final_masks.numpy(), images)\n continue\n ###inteaction segmentation head\n if parallel:" + }, + { + "comment": "This code is part of the Manet_Stage1 segmentation model in PaddleVideo. It iterates through the children of the model and calls the 'int_seghead' function to generate temporary dictionaries and local map dictionaries for each child. The 'int_seghead' function takes various parameters such as reference frame embedding, previous round label, global map temporary dictionary, etc., and returns a tuple containing the temporary dictionary and local map dictionaries. 
If there are no children in the model, it directly calls the 'int_seghead' function on the model's head for the same set of parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":216-233", + "content": " for c in model.children():\n tmp_dic, local_map_dics = c.head.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=scribble_label,\n prev_round_label=prev_label,\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n frame_num=[start_annotated_frame],\n first_inter=first_scribble)\n else:\n tmp_dic, local_map_dics = model.head.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=scribble_label,\n prev_round_label=prev_label,\n global_map_tmp_dic=eval_global_map_tmp_dic," + }, + { + "comment": "Creates a temporary dictionary with local maps and other parameters. Obtains the predicted label for the sequence, interpolates it to original size, gets the argument of maximum value along axis 1, adds it to prediction masks list, stores the first predicted label for current frame in prev_label_storage if saving images, converts pred_label to numpy array and displays unique elements.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":234-253", + "content": " local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n frame_num=[start_annotated_frame],\n first_inter=first_scribble)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n # np.unique(pred_label)\n # array([0], dtype=int64)\n prev_label_storage[start_annotated_frame] = float_(\n pred_label[0])\n if is_save_image: # save image\n pred_label_to_save = pred_label.squeeze(0).numpy()" + }, + { + "comment": "The code segment is generating annotated images from predicted labels and creating scribble-based reference labels. 
It saves the images in a specified folder path, and initializes variables for iterating through the frames of the video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":254-273", + "content": " im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)\n imgname = str(start_annotated_frame)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im.save(os.path.join(inter_file_path, imgname + '.png'))\n #######################################\n if first_scribble:\n scribble_label = rough_ROI(scribble_label)\n ##############################\n ref_prev_label = pred_label.unsqueeze(0)\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = ref_frame_embedding\n for ii in range(start_annotated_frame + 1, total_frame_num):\n current_embedding = embedding_memory[ii]\n current_embedding = current_embedding.unsqueeze(0)" + }, + { + "comment": "The code iterates over the model's children and calls `prop_seghead` on each child, passing relevant embeddings and labels to calculate local maps and global maps for segmentation. It also takes into account nearest neighbors, interaction numbers, and annotated frame start.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":274-291", + "content": " prev_label = prev_label\n if parallel:\n for c in model.children():\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = c.head.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame," + }, + { + "comment": "Code segment is part of a larger function in PaddleVideo library. It checks if frame number is the start_annotated_frame, if so, it extracts the current embedding, else it calls head.prop_seghead to get temporary dictionary, global map temporary dictionary and local maps based on reference frame embedding, previous embedding, current embedding, scribble label and previous label using Paddle (a deep learning framework). It also considers K nearest neighbors and interaction number while performing its operation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":292-309", + "content": " frame_num=[ii],\n dynamic_seghead=c.head.dynamic_seghead)\n else:\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.head.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame," + }, + { + "comment": "This code segment is responsible for predicting the labels, creating masks and storing them in a list, and possibly saving an image. 
The predicted label is interpolated to match the frame size, converted to mask and added to the list of masks. This process continues for each frame. If saving images, the predicted labels are converted to an image format and saved as a grayscale PALETTE image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":310-326", + "content": " frame_num=[ii],\n dynamic_seghead=model.head.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n prev_label_storage[ii] = float_(pred_label[0])\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)" + }, + { + "comment": "Code snippet saves frames to disk, initializes variables for propagation loop, and begins the propagation process by iterating through frames from start_annotated_frame down to 0. The model's children are then processed in parallel for segmentation head propagation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":327-346", + "content": " imgname = str(ii)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im.save(os.path.join(inter_file_path,\n imgname + '.png'))\n #######################################\n prev_label = ref_prev_label\n prev_embedding = ref_frame_embedding\n #######\n # Propagation <-\n for ii in range(start_annotated_frame):\n current_frame_num = start_annotated_frame - 1 - ii\n current_embedding = embedding_memory[current_frame_num]\n current_embedding = current_embedding.unsqueeze(0)\n prev_label = prev_label\n if parallel:\n for c in model.children():\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = c.head.prop_seghead(" + }, + { + "comment": "This code appears to be part of a deep learning model for video segmentation. It is calling the \"prop_seghead\" function from the \"model.head\" object with specific parameters including reference frame embedding, previous and current embeddings, scribble label, and previous label. If certain conditions are met, additional parameters such as normalize nearest neighbor distances, use local map, sequence names, ground truth IDs, number of nearest neighbors, start annotated frame, and dynamic seghead are passed. 
The function returns a temporary dictionary, evaluation global map temporary dictionary, and local map dictionaries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":347-364", + "content": " ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,\n frame_num=[current_frame_num],\n dynamic_seghead=c.head.dynamic_seghead)\n else:\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.head.prop_seghead(" + }, + { + "comment": "This code is calculating the predictions for a specific sequence by using various embeddings, labels, and configurations. It involves interacting with multiple dictionaries, tensor operations, and a dynamic seghead model. The predicted label is then interpolated to match the resolution of the original frame.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":365-382", + "content": " ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg['knns'],\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,\n frame_num=[current_frame_num],\n dynamic_seghead=model.head.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label," + }, + { + "comment": "This code snippet is part of an image segmentation model. It extracts predictions from the model, converts them to masks, and stores previous label information for each frame. Additionally, it saves visualizations of these predictions as palette-colored images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":383-401", + "content": " size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks_reverse.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n ####\n prev_label_storage[current_frame_num] = float_(\n pred_label[0])\n ###\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)\n imgname = str(current_frame_num)\n while len(imgname) < 5:" + }, + { + "comment": "This code saves images and their corresponding masks, creates final masks, and writes the total time for a single interaction. 
It handles non-existent folders by creating them before saving images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py\":402-416", + "content": " imgname = '0' + imgname\n if not os.path.exists(inter_file_path):\n os.makedirs(inter_file_path)\n im.save(os.path.join(inter_file_path,\n imgname + '.png'))\n pred_masks_reverse.reverse()\n pred_masks_reverse.extend(pred_masks)\n final_masks = paddle.concat(pred_masks_reverse, 0)\n submit_masks(cfg[\"save_path\"], final_masks.numpy(), images)\n t_end = timeit.default_timer()\n print('Total time for single interaction: ' +\n str(t_end - t_total))\n inter_file.close()\n return None" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2e744e01-920a-4c0b-8699-31b8562984ee.json b/docs/doc/2e744e01-920a-4c0b-8699-31b8562984ee.json new file mode 100644 index 000000000..d63e461b2 --- /dev/null +++ b/docs/doc/2e744e01-920a-4c0b-8699-31b8562984ee.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file contains the initialization for a Video Quality Assessment application. It includes registrar for metrics, builder function for metrics and defines the QualityMetric class. The code is licensed under Apache License, Version 2.0 and distributed as-is without warranties or conditions.", + "details": [ + { + "comment": "This code file contains the initialization for a Video Quality Assessment application. It includes registrar for metrics, builder function for metrics and defines the QualityMetric class. The code is licensed under Apache License, Version 2.0 and distributed as-is without warranties or conditions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py\":0-22", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .registry import METRIC\nfrom .build import build_metric\nfrom .quality_metric import QuqlityMetric\n__all__ = [\n 'METRIC', 'build_metric', 'QuqlityMetric'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2ef6c407-6499-4b33-b88c-692844b93c1e.json b/docs/doc/2ef6c407-6499-4b33-b88c-692844b93c1e.json new file mode 100644 index 000000000..656a04c7f --- /dev/null +++ b/docs/doc/2ef6c407-6499-4b33-b88c-692844b93c1e.json @@ -0,0 +1,25 @@ +{ + "summary": "The BaseSegmenter class serves as a foundation for PaddleVideo segmenters, handling training, validation, testing, and inference with a mode parameter. Subclasses must implement train_step, valid_step, test_step, and feature extraction modules.", + "details": [ + { + "comment": "The code is defining a BaseSegmenter class, which serves as the base class for all segmenters. It requires subclasses to override train_step, valid_step, and test_step methods. 
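In outline, the base-class pattern this summary describes (a stripped-down, hypothetical version written for illustration, not the actual PaddleVideo implementation) dispatches on a mode string in forward() and leaves the per-mode steps to concrete subclasses:

from abc import ABC, abstractmethod

# Sketch only: forward() routes a batch to the step matching the requested mode;
# concrete segmenters must implement each step.
class SegmenterBase(ABC):
    def forward(self, data_batch, mode="infer"):
        if mode == "train":
            return self.train_step(data_batch)
        if mode == "valid":
            return self.val_step(data_batch)
        if mode == "test":
            return self.test_step(data_batch)
        if mode == "infer":
            return self.infer_step(data_batch)
        raise NotImplementedError(f"unknown mode: {mode}")

    @abstractmethod
    def train_step(self, data_batch): ...

    @abstractmethod
    def val_step(self, data_batch): ...

    @abstractmethod
    def test_step(self, data_batch): ...

    @abstractmethod
    def infer_step(self, data_batch): ...
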
The class also accepts backbone and head modules to extract features and process them respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/base.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseSegmenter(nn.Layer):\n \"\"\"Base class for segementers.\n All segementers should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature." + }, + { + "comment": "This code defines a segmenter base class for PaddleVideo. It initializes the backbone, head, and loss layers based on user input. The `forward` method specifies how the model processes data in either infer or train mode. Initializing weights is optional but can be called if the layer supports it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/base.py\":31-62", + "content": " \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n # build backbone\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n # build head\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n # build loss\n if loss is not None:\n self.loss_name = loss.name\n self.loss = builder.build_loss(loss)\n if hasattr(self.loss, 'init_weights'):\n self.loss.init_weights()\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output." + }, + { + "comment": "This code defines a base class for segmenters that supports training, validation, testing, and inference steps. The `mode` parameter determines which step to execute, and abstract methods must be implemented by subclasses for each step.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/base.py\":63-98", + "content": " 2. Console of train, valid, test or infer step\n 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"" + }, + { + "comment": "This code block raises a NotImplementedError, indicating that the current implementation of the function or method is not complete and requires further development.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/base.py\":99-99", + "content": " raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2fcf21c5-4cd0-4b12-a56c-5e2b57e23fb2.json b/docs/doc/2fcf21c5-4cd0-4b12-a56c-5e2b57e23fb2.json new file mode 100644 index 000000000..54b5a8217 --- /dev/null +++ b/docs/doc/2fcf21c5-4cd0-4b12-a56c-5e2b57e23fb2.json @@ -0,0 +1,30 @@ +{ + "summary": "The BMNINFReader class in PaddleVideo reads and processes data from the BMN model for football action detection, filtering invalid proposals and handling image/audio data. This code creates a batch reader that pairs video features with names and scales, yielding batches until completion.", + "details": [ + { + "comment": "This code defines a class called BMNINFReader which is a data reader for the BMN model. It reads data that has been extracted by prior networks and uses the \"get_sw_prop\" function to filter out invalid proposals. The get_sw_prop function calculates proposal regions based on a given duration, window size, and step size. Proposals with less than one second in the video are filtered out. 
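To make the sliding-window scheme just described concrete, here is a small standalone sketch (the function name and example numbers are invented for illustration; the real reader operates on feature sequences rather than bare durations):

import numpy as np

def sliding_window_proposals(duration, window=200, step=10):
    # Slide a fixed-size window over [0, duration), clamping the last
    # window to the sequence end, then drop windows covering less than 1 unit.
    proposals = []
    for k in np.arange(0, duration - window + step, step):
        start, end = int(k), int(min(duration, k + window))
        if end - start < window:   # clamp the final, shorter window
            start = end - window
        proposals.append((start, end))
    return [(s, e) for (s, e) in proposals if min(duration, e) - s >= 1]

print(sliding_window_proposals(duration=230))
# e.g. [(0, 200), (10, 210), (20, 220), (30, 230)]
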
This data reader is part of the PaddleVideo package for FootballAction application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py\":0-48", + "content": "\"\"\"\n# @File : bmninf_reader.py \n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport os\nimport random\nimport pickle\nimport json\nimport numpy as np\nimport multiprocessing\nimport numpy as np\nfrom .reader_utils import DataReader\ndef get_sw_prop(duration, window=200, step=10):\n \"\"\"\n get_sw_prop\n \"\"\"\n pr = []\n local_boxes = []\n for k in np.arange(0, duration - window + step, step):\n start_id = k\n end_id = min(duration, k + window)\n if end_id - start_id < window:\n start_id = end_id - window\n local_boxes = (start_id, end_id)\n pr.append(local_boxes)\n def valid_proposal(duration, span):\n \"\"\"\n valid_proposal\n \"\"\"\n # fileter proposals\n # a valid proposal should have at least one second in the video\n real_span = min(duration, span[1]) - span[0]\n return real_span >= 1\n pr = list(filter(lambda x: valid_proposal(duration, x), pr))\n return pr\nclass BMNINFReader(DataReader):\n \"\"\"\n Data reader for BMN model, which was stored as features extracted by prior networks" + }, + { + "comment": "This code initializes a class, likely for data reading and processing. It takes parameters such as name, mode, configuration (cfg), and material. It sets attributes like temporal length (tscale) and duration scale (dscale) from the configuration. The code reshapes pcm_feature to fit the needed shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py\":49-72", + "content": " dataset cfg: feat_path, feature path,\n tscale, temporal length of BM map,\n dscale, duration scale of BM map,\n anchor_xmin, anchor_xmax, the range of each point in the feature sequence,\n batch_size, batch size of input data,\n num_threads, number of threads of data processing\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.tscale = cfg[self.name.upper()]['tscale'] # 200\n self.dscale = cfg[self.name.upper()]['dscale'] # 200\n # self.subset = cfg[self.name.upper()]['subset']\n self.tgap = 1. / self.tscale\n self.step = cfg[self.name.upper()]['window_step']\n self.material = material\n src_feature = self.material\n image_feature = src_feature['image_feature']\n pcm_feature = src_feature['pcm_feature']\n pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n # print(rgb_feature.shape, audio_feature.shape, pcm_feature.shape)" + }, + { + "comment": "This code reads image and audio data for video analysis, concatenates them into a feature vector, sets the duration, window size, and batch size. It then retrieves the list of videos to process and creates a match map for analyzing video frames. 
The code is part of a machine learning model used in football action detection.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py\":73-104", + "content": " min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n #if min_length == 0:\n # continue\n image_feature = image_feature[:min_length, :]\n pcm_feature = pcm_feature[:min_length, :]\n self.features = np.concatenate((image_feature, pcm_feature), axis=1)\n self.duration = len(self.features)\n self.window = self.tscale\n self.get_dataset_dict()\n self.get_match_map()\n self.batch_size = cfg[self.name.upper()]['batch_size']\n if (mode == 'test') or (mode == 'infer'):\n self.num_threads = 1 # set num_threads as 1 for test and infer\n def get_dataset_dict(self):\n \"\"\"\n get_dataset_dict\n \"\"\"\n self.video_list = get_sw_prop(self.duration, self.window, self.step)\n def get_match_map(self):\n \"\"\"\n get_match_map\n \"\"\"\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx\n for jdx in range(1, self.tscale + 1):" + }, + { + "comment": "This code is for creating a reader function to handle BMNINF file loading and defining the match_map attribute. It defines the load_file function, create_reader function, and make_infer_reader function. The load_file function loads features from a given video window range, converts them to float32 type, and transposes the data. The create_reader function creates a reader for the CTCN model. The make_infer_reader function defines a reader for inference purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py\":105-137", + "content": " xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)\n match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n self.match_map = match_map\n self.anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n def load_file(self, video_wind):\n \"\"\"\n load_file\n \"\"\"\n start_feat_id = video_wind[0]\n end_feat_id = video_wind[1]\n video_feat = self.features[video_wind[0]: video_wind[1]]\n video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n return video_feat\n def create_reader(self):\n \"\"\"\n reader creator for ctcn model\n \"\"\"\n return self.make_infer_reader()\n def make_infer_reader(self):\n \"\"\"\n reader for inference\n \"\"\"\n def reader():" + }, + { + "comment": "This code creates a batch reader for video data in a football action detection application. 
It loads features from videos, pairs them with their corresponding names and scales, and yields batches of this data until the batch size is reached or all videos are processed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py\":138-154", + "content": " \"\"\"\n reader\n \"\"\"\n batch_out = []\n # for video_name in self.video_list:\n for video_wind in self.video_list:\n video_idx = self.video_list.index(video_wind)\n video_feat = self.load_file(video_wind)\n batch_out.append((video_feat, video_wind, [self.duration, self.dscale]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader" + } + ] +} \ No newline at end of file diff --git a/docs/doc/311bc662-f84a-48a4-b3b0-9dd6651566f7.json b/docs/doc/311bc662-f84a-48a4-b3b0-9dd6651566f7.json new file mode 100644 index 000000000..1d0d1d338 --- /dev/null +++ b/docs/doc/311bc662-f84a-48a4-b3b0-9dd6651566f7.json @@ -0,0 +1,65 @@ +{ + "summary": "The code defines the AttentionLstmHead class for LSTM-based attention mechanism in PaddleVideo, performing feature extraction and softmax normalization for video and audio classification tasks.", + "details": [ + { + "comment": "This code defines a class called AttentionLstmHead, which is a type of head used in a neural network. It is part of the PaddleVideo library and inherits from the BaseHead class. The class uses LSTM for attention, has its own parameters (specified by ParamAttr), and utilizes weight initialization. This code also includes license information, documentation on arguments, and registration in the HEADS registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal\nfrom paddle.regularizer import L2Decay\nimport paddle.nn.functional as F\nfrom ...metrics.youtube8m import eval_util as youtube8m_metrics\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass AttentionLstmHead(BaseHead):\n \"\"\"AttentionLstmHead.\n Args: TODO\n \"\"\"\n def __init__(self," + }, + { + "comment": "This code initializes an AttentionLstmHead object with specified parameters. It creates a Linear layer for each feature dimension (rgb, audio) and adds a bi-directional LSTM layer with specified sizes. 
The AttentionLstmHead will be used to process video frames and audio data in parallel for classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":32-52", + "content": " num_classes=3862,\n feature_num=2,\n feature_dims=[1024, 128],\n embedding_size=512,\n lstm_size=1024,\n in_channels=2048,\n loss_cfg=dict(name='CrossEntropyLoss')):\n super(AttentionLstmHead, self).__init__(num_classes, in_channels,\n loss_cfg)\n self.num_classes = num_classes\n self.feature_dims = feature_dims\n self.embedding_size = embedding_size\n self.lstm_size = lstm_size\n self.feature_num = len(self.feature_dims)\n for i in range(self.feature_num): # 0:rgb, 1:audio\n fc_feature = paddle.nn.Linear(in_features=self.feature_dims[i],\n out_features=self.embedding_size)\n self.add_sublayer(\"fc_feature{}\".format(i), fc_feature)\n bi_lstm = paddle.nn.LSTM(input_size=self.embedding_size,\n hidden_size=self.lstm_size," + }, + { + "comment": "The code initializes an LSTM layer with bidirectional capability and adds dropout for regularization. It defines a linear layer (att_fc) to map the output of the LSTM layer to 1 feature, applies softmax activation, and then defines two fully connected layers (fc_out1 and fc_out2) for further processing with specific activations and parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":53-73", + "content": " direction=\"bidirectional\")\n self.add_sublayer(\"bi_lstm{}\".format(i), bi_lstm)\n drop_rate = 0.5\n self.dropout = paddle.nn.Dropout(drop_rate)\n att_fc = paddle.nn.Linear(in_features=self.lstm_size * 2,\n out_features=1)\n self.add_sublayer(\"att_fc{}\".format(i), att_fc)\n self.softmax = paddle.nn.Softmax()\n self.fc_out1 = paddle.nn.Linear(in_features=self.lstm_size * 4,\n out_features=8192,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.relu = paddle.nn.ReLU()\n self.fc_out2 = paddle.nn.Linear(in_features=8192,\n out_features=4096,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0)," + }, + { + "comment": "The code defines a class for an attention LSTM head in PaddleVideo. It initializes two linear layers and a sigmoid activation function. The `init_weights` method is currently empty, and the `forward` method takes inputs of different lengths and processes them before storing the results in the `att_outs` list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":74-94", + "content": " initializer=Normal()))\n self.fc_logit = paddle.nn.Linear(in_features=4096,\n out_features=self.num_classes,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.sigmoid = paddle.nn.Sigmoid()\n def init_weights(self):\n pass\n def forward(self, inputs):\n # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)]\n # deal with features with different length\n # 1. padding to same lenght, make a tensor\n # 2. make a mask tensor with the same shpae with 1\n # 3. compute output using mask tensor, s.t. output is nothing todo with padding\n assert (len(inputs) == self.feature_num\n ), \"Input tensor does not contain {} features\".format(\n self.feature_num)\n att_outs = []" + }, + { + "comment": "The code performs feature extraction, bi-directional LSTM processing, attention weight calculation, and finally softmax normalization on each input in a list. 
It uses dropout to prevent overfitting, applies masking for attention calculations, and calculates the denominator using power function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":95-119", + "content": " for i in range(len(inputs)):\n # 1. fc\n m = getattr(self, \"fc_feature{}\".format(i))\n output_fc = m(inputs[i][0])\n output_fc = paddle.tanh(output_fc)\n # 2. bi_lstm\n m = getattr(self, \"bi_lstm{}\".format(i))\n lstm_out, _ = m(inputs=output_fc, sequence_length=inputs[i][1])\n lstm_dropout = self.dropout(lstm_out)\n # 3. att_fc\n m = getattr(self, \"att_fc{}\".format(i))\n lstm_weight = m(lstm_dropout)\n # 4. softmax replace start, for it's relevant to sum in time step\n lstm_exp = paddle.exp(lstm_weight)\n lstm_mask = paddle.mean(inputs[i][2], axis=2)\n lstm_mask = paddle.unsqueeze(lstm_mask, axis=2)\n lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask)\n lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1)\n exponent = -1\n lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent)\n lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2)" + }, + { + "comment": "This code performs LSTM-based attention mechanism for a sequence modeling task. It applies softmax, dropout, and mask operations on the LSTM outputs to compute the attention weights. The attention weights are then used to generate an attentive pooling of the sequence, which is passed through fully connected layers and sigmoid activation for the final output. The loss function uses labels with stop_gradient=True for training the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":120-143", + "content": " lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator)\n lstm_weight = lstm_softmax\n # softmax replace end\n lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight)\n # 5. sequence_pool's replace start, for it's relevant to sum in time step\n lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask)\n fea_lens = inputs[i][1]\n fea_len = int(fea_lens[0])\n lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1)\n # sequence_pool's replace end\n att_outs.append(lstm_pool)\n att_out = paddle.concat(att_outs, axis=1)\n fc_out1 = self.fc_out1(att_out)\n fc_out1_act = self.relu(fc_out1)\n fc_out2 = self.fc_out2(fc_out1_act)\n fc_out2_act = paddle.tanh(fc_out2)\n fc_logit = self.fc_logit(fc_out2_act)\n output = self.sigmoid(fc_logit)\n return fc_logit, output\n def loss(self, lstm_logit, labels, **kwargs):\n labels.stop_gradient = True" + }, + { + "comment": "This code defines an ActionAttentionLstmHead class which is a type of BaseHead. It uses LSTM for attention and takes in various arguments like num_classes, feature_num, feature_dims, embedding_size, lstm_size, in_channels, and loss_cfg. The metric function calculates hit_at_one, perr (precision at equal recall rate), and gap values from the LSTM output and labels. 
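The masked softmax-and-pooling steps summarised in the preceding paragraphs can be sketched in a few lines of NumPy (all shapes and values below are made up purely for illustration and are independent of the real model):

import numpy as np

# Toy illustration of masked attention pooling: exponentiate per-step scores,
# zero out padded steps with a mask, normalise by the masked sum, then take a
# weighted sum over time.
np.random.seed(0)
T, D = 5, 4                      # time steps, feature size (made up)
feats = np.random.randn(T, D)    # stand-in for the LSTM outputs
scores = np.random.randn(T, 1)   # stand-in for the att_fc outputs
mask = np.array([1, 1, 1, 0, 0], dtype=float).reshape(T, 1)  # last two steps are padding

exp_scores = np.exp(scores) * mask             # masked exponentials
weights = exp_scores / exp_scores.sum()        # softmax restricted to valid steps
pooled = (feats * weights * mask).sum(axis=0)  # attention-weighted pooling over time
print(pooled.shape)  # (4,)
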
The sum_cost function calculates the loss using BCEWithLogitsLoss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":144-172", + "content": " losses = dict()\n bce_logit_loss = paddle.nn.BCEWithLogitsLoss(reduction='sum')\n sum_cost = bce_logit_loss(lstm_logit, labels)\n return sum_cost\n def metric(self, lstm_output, labels):\n pred = lstm_output.numpy()\n label = labels.numpy()\n hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label)\n perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(\n pred, label)\n gap = youtube8m_metrics.calculate_gap(pred, label)\n return hit_at_one, perr, gap\n@HEADS.register()\nclass ActionAttentionLstmHead(BaseHead):\n \"\"\"AttentionLstmHead for FootballAction\n Args: TODO\n \"\"\"\n def __init__(self,\n num_classes=8,\n feature_num=2,\n feature_dims=[2048, 1024],\n embedding_size=512,\n lstm_size=1024,\n in_channels=2048,\n loss_cfg=dict(name='CrossEntropyLoss')):\n super(ActionAttentionLstmHead, self).__init__(num_classes, in_channels," + }, + { + "comment": "This code initializes a LSTM network for feature processing and attention mechanism. It defines bidirectional LSTM layers for each feature dimension (RGB, audio), followed by dropout and fully connected layers. The model has 8192 output features and is used for multimodal fusion in a video understanding task.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":173-194", + "content": " loss_cfg)\n self.num_classes = num_classes\n self.feature_dims = feature_dims\n self.embedding_size = embedding_size\n self.lstm_size = lstm_size\n self.feature_num = len(self.feature_dims)\n for i in range(self.feature_num): # 0:rgb, 1:audio\n bi_lstm = paddle.nn.LSTM(input_size=self.feature_dims[i],\n hidden_size=self.feature_dims[i],\n direction=\"bidirectional\")\n self.add_sublayer(\"bi_lstm{}\".format(i), bi_lstm)\n drop_rate = 0.5\n self.dropout = paddle.nn.Dropout(drop_rate)\n att_fc = paddle.nn.Linear(in_features=self.feature_dims[i] * 2,\n out_features=1)\n self.add_sublayer(\"att_fc{}\".format(i), att_fc)\n self.softmax = paddle.nn.Softmax()\n self.fc1 = paddle.nn.Linear(in_features=2 * sum(self.feature_dims),\n out_features=8192," + }, + { + "comment": "This code defines a class for an attention-based LSTM head in PaddleVideo. It includes several fully connected layers, batch normalization, dropout, and two linear layers. 
The `init_weights` function is not implemented, and the `forward` method takes input data as a tuple of (rgb_data, rgb_len, rgb_mask) and (audio_data, audio_len, audio_mask).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":195-220", + "content": " bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.bn1 = paddle.nn.BatchNorm(num_channels=8192)\n self.dropout1 = paddle.nn.Dropout(0.5)\n self.fc2 = paddle.nn.Linear(in_features=8192,\n out_features=4096,\n bias_attr=ParamAttr(\n regularizer=L2Decay(0.0),\n initializer=Normal()))\n self.bn2 = paddle.nn.BatchNorm(num_channels=4096)\n self.dropout2 = paddle.nn.Dropout(0.5)\n self.fc3 = paddle.nn.Linear(\n in_features=4096,\n out_features=self.num_classes,\n )\n self.fc4 = paddle.nn.Linear(\n in_features=4096,\n out_features=1,\n )\n def init_weights(self):\n pass\n def forward(self, inputs):\n # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)]" + }, + { + "comment": "This code handles features with varying lengths. It pads features to the same length, creates a mask tensor, and computes the output using the mask tensor, effectively ignoring padding values. It asserts that the input tensor contains the expected number of features. It iterates over each feature, performs bi-directional LSTM, applies dropout, calculates weighted sum using attention mechanism, applies softmax to the weights, multiplies by a mask, and stores the results in att_outs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":221-243", + "content": " # deal with features with different length\n # 1. padding to same lenght, make a tensor\n # 2. make a mask tensor with the same shpae with 1\n # 3. compute output using mask tensor, s.t. output is nothing todo with padding\n assert (len(inputs) == self.feature_num\n ), \"Input tensor does not contain {} features\".format(\n self.feature_num)\n att_outs = []\n for i in range(len(inputs)):\n m = getattr(self, \"bi_lstm{}\".format(i))\n lstm_out, _ = m(inputs=inputs[i][0], sequence_length=inputs[i][1])\n lstm_dropout = self.dropout(lstm_out)\n # 3. att_fc\n m = getattr(self, \"att_fc{}\".format(i))\n lstm_weight = m(lstm_dropout)\n # 4. softmax replace start, for it's relevant to sum in time step\n lstm_exp = paddle.exp(lstm_weight)\n lstm_mask = paddle.mean(inputs[i][2], axis=2)\n lstm_mask = paddle.unsqueeze(lstm_mask, axis=2)\n lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask)" + }, + { + "comment": "This code segment calculates the attention scores using LSTM and applies them to sequence pooling. It then passes the output through multiple layers of neural networks, including fully connected layers, batch normalization, ReLU activation, and dropout. The final result is stored in `att_out` for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":244-266", + "content": " lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1)\n exponent = -1\n lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent)\n lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2)\n lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator)\n lstm_weight = lstm_softmax\n # softmax replace end\n lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight)\n # 5. 
sequence_pool's replace start, for it's relevant to sum in time step\n lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask)\n # fea_lens = inputs[i][1]\n # fea_len = int(fea_lens[0])\n lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1)\n # sequence_pool's replace end\n att_outs.append(lstm_pool)\n att_out = paddle.concat(att_outs, axis=1)\n y = self.fc1(att_out)\n y = self.bn1(y)\n y = F.relu(y)\n y = self.dropout1(y)\n y = self.fc2(y)" + }, + { + "comment": "The code contains two main components: a LSTM attention head and loss/metric functions. The LSTM attention head computes attention weights for the input sequence, followed by softmax and sigmoid activation functions. The loss function calculates cross-entropy and mean squared error losses, with alpha as a weight parameter. The metric function computes top1 and top5 accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py\":267-287", + "content": " y = self.bn2(y)\n y = F.relu(y)\n y = self.dropout2(y)\n out1 = self.fc3(y)\n out1 = F.softmax(out1)\n out2 = self.fc4(y)\n out2 = F.sigmoid(out2)\n return out1, out2\n def loss(self, logits, iou, labels, labels_iou, **kwargs):\n alpha = 10\n softmax_loss = F.cross_entropy(logits, labels)\n labels_iou = labels_iou.astype('float32')\n mse_loss = paddle.sum(F.square_error_cost(iou, labels_iou), axis=-1)\n sum_loss = softmax_loss + alpha * mse_loss\n return sum_loss\n def metric(self, scores, labels):\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=scores, label=labels, k=5)\n return top1, top5" + } + ] +} \ No newline at end of file diff --git a/docs/doc/31782bb7-e595-45e6-9f2a-de9c0ea82e0e.json b/docs/doc/31782bb7-e595-45e6-9f2a-de9c0ea82e0e.json new file mode 100644 index 000000000..d33630f61 --- /dev/null +++ b/docs/doc/31782bb7-e595-45e6-9f2a-de9c0ea82e0e.json @@ -0,0 +1,10 @@ +{ + "summary": "This code sets environment variables for GPU utilization and then runs the eval_and_save_model.py script in scenario_lib, evaluating a model named AttentionLstmErnie with provided configuration file and saving its parameters and inference models to specified directories. The \"--save_only\" flag is not used, so both evaluation and saving will occur.", + "details": [ + { + "comment": "This code sets environment variables for GPU utilization and then runs the eval_and_save_model.py script in scenario_lib, evaluating a model named AttentionLstmErnie with provided configuration file and saving its parameters and inference models to specified directories. 
The \"--save_only\" flag is not used, so both evaluation and saving will occur.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/eval_and_save_model.sh\":0-12", + "content": "# eval sh \nexport CUDA_VISIBLE_DEVICES=0\nexport FLAGS_eager_delete_tensor_gb=0.0\nexport FLAGS_sync_nccl_allreduce=1\nexport FLAGS_fast_eager_deletion_mode=1\nexport FLAGS_fraction_of_gpu_memory_to_use=0.5\nexport FLAGS_reallocate_gpu_memory_in_mb=0\nexport FLAGS_memory_fraction_of_eager_deletion=1\npython scenario_lib/eval_and_save_model.py --model_name=AttentionLstmErnie \\\n--config=./conf/conf.txt \\\n--save_model_param_dir=checkpoints_save \\\n--save_inference_model=inference_models_save \\\n# --save_only" + } + ] +} \ No newline at end of file diff --git a/docs/doc/31dfc0aa-3043-4679-8ce5-2bf63a00a92f.json b/docs/doc/31dfc0aa-3043-4679-8ce5-2bf63a00a92f.json new file mode 100644 index 000000000..fede83cd8 --- /dev/null +++ b/docs/doc/31dfc0aa-3043-4679-8ce5-2bf63a00a92f.json @@ -0,0 +1,75 @@ +{ + "summary": "This code handles video sequence object detection, converting results to CSV format and evaluating AVA metrics using error handling, utility functions, and GPU-based processing.", + "details": [ + { + "comment": "This code imports necessary libraries and modules for evaluating AVA (Activity-driven Visual Attention) metrics. It also includes a license notice, time management functions, and error handling measures. The code uses defaultdict from collections and eval_recalls function from the same repository to perform evaluation tasks related to object detection in video sequences. Additionally, it incorporates paddlevideo's get_logger() function for logging, dist library for distributed processing, and numpy for numerical operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":0-30", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport csv\nimport heapq\nimport logging\nimport time\nfrom collections import defaultdict\nfrom .ava_evaluation import object_detection_evaluation as det_eval\nfrom .ava_evaluation import standard_fields\nfrom .recall import eval_recalls\nimport shutil\nimport pickle\nimport time\nimport os\nimport os.path as osp\nfrom paddlevideo.utils import get_logger, get_dist_info\nimport paddle.distributed as dist\nimport sys\nimport numpy as np" + }, + { + "comment": "The code defines two functions: \"det2csv\" and \"results2csv\". \"det2csv\" takes in information, dataset length, results, and custom classes (if any), and returns a list of tuples representing the results in CSV format. It loops through each entry, extracts relevant data, converts tensors to numpy arrays if needed, and appends the information to the csv_results list. 
\"results2csv\" checks if the results are organized by class or not, then calls either \"det2csv\" or performs CSV conversion directly using it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":31-63", + "content": "from pathlib import Path\nfrom datetime import datetime\nimport paddle\ndef det2csv(info, dataset_len, results, custom_classes):\n csv_results = []\n for idx in range(dataset_len):\n video_id = info[idx]['video_id']\n timestamp = info[idx]['timestamp']\n result = results[idx]\n for label, _ in enumerate(result):\n for bbox in result[label]:\n if type(bbox) == paddle.Tensor:\n bbox = bbox.numpy()\n bbox_ = tuple(bbox.tolist())\n if custom_classes is not None:\n actual_label = custom_classes[label + 1]\n else:\n actual_label = label + 1\n csv_results.append((\n video_id,\n timestamp,\n ) + bbox_[:4] + (actual_label, ) + bbox_[4:])\n return csv_results\n# results is organized by class\ndef results2csv(info, dataset_len, results, out_file, custom_classes=None):\n if isinstance(results[0], list):\n csv_results = det2csv(info, dataset_len, results, custom_classes)" + }, + { + "comment": "This code snippet contains several utility functions used for video analysis. The \"tostr\" function converts a float to a string representation with 3 decimal places, while the \"print_time\" function calculates and prints the time elapsed since a given start point. The \"make_image_key\" function generates a unique identifier for a video ID and timestamp, and \"read_csv\" function loads boxes and class labels from a CSV file in AVA format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":65-96", + "content": " # save space for float\n def tostr(item):\n if isinstance(item, float):\n return f'{item:.3f}'\n return str(item)\n with open(out_file, 'w') as f:\n for csv_result in csv_results:\n f.write(','.join(map(lambda x: tostr(x), csv_result)))\n f.write('\\n')\ndef print_time(message, start):\n print('==> %g seconds to %s' % (time.time() - start, message))\ndef make_image_key(video_id, timestamp):\n \"\"\"Returns a unique identifier for a video id & timestamp.\"\"\"\n return f'{video_id},{int(timestamp):04d}'\ndef read_csv(csv_file, class_whitelist=None, capacity=0):\n \"\"\"Loads boxes and class labels from a CSV file in the AVA format.\n CSV file format described at https://research.google.com/ava/download.html.\n Args:\n csv_file: A file object.\n class_whitelist: If provided, boxes corresponding to (integer) class\n labels not in this set are skipped.\n capacity: Maximum number of labeled boxes allowed for each example.\n Default is 0 where there is no limit." + }, + { + "comment": "This code reads a CSV file with video frame data, and for each row, it creates dictionaries for boxes, labels, and scores. It checks the class whitelist before adding the data to the respective lists. If scores are not provided in the CSV, they default to 1.0. 
The time taken for this process is measured at the beginning with start = time.time().", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":98-119", + "content": " Returns:\n boxes: A dictionary mapping each unique image key (string) to a list of\n boxes, given as coordinates [y1, x1, y2, x2].\n labels: A dictionary mapping each unique image key (string) to a list\n of integer class lables, matching the corresponding box in `boxes`.\n scores: A dictionary mapping each unique image key (string) to a list\n of score values lables, matching the corresponding label in `labels`.\n If scores are not provided in the csv, then they will default to 1.0.\n \"\"\"\n start = time.time()\n entries = defaultdict(list)\n boxes = defaultdict(list)\n labels = defaultdict(list)\n scores = defaultdict(list)\n reader = csv.reader(csv_file)\n for row in reader:\n assert len(row) in [7, 8], 'Wrong number of columns: ' + row\n image_key = make_image_key(row[0], row[1])\n x1, y1, x2, y2 = [float(n) for n in row[2:6]]\n action_id = int(row[6])\n if class_whitelist and action_id not in class_whitelist:\n continue" + }, + { + "comment": "This code reads a CSV file containing object detection results and stores them in three lists: boxes, labels, and scores. The code also handles exclusions by reading a separate CSV file that contains excluded timestamps. The score is determined based on the length of each row in the CSV file and added to the corresponding image key's entry in the entries dictionary if the capacity allows or if the score is higher than the current highest score for that image key. The code then sorts the entries by descending scores and appends them to the boxes, labels, and scores lists for each image key.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":121-146", + "content": " score = 1.0\n if len(row) == 8:\n score = float(row[7])\n if capacity < 1 or len(entries[image_key]) < capacity:\n heapq.heappush(entries[image_key],\n (score, action_id, y1, x1, y2, x2))\n elif score > entries[image_key][0][0]:\n heapq.heapreplace(entries[image_key],\n (score, action_id, y1, x1, y2, x2))\n for image_key in entries:\n # Evaluation API assumes boxes with descending scores\n entry = sorted(entries[image_key], key=lambda tup: -tup[0])\n for item in entry:\n score, action_id, y1, x1, y2, x2 = item\n boxes[image_key].append([y1, x1, y2, x2])\n labels[image_key].append(action_id)\n scores[image_key].append(score)\n print_time('read file ' + csv_file.name, start)\n return boxes, labels, scores\ndef read_exclusions(exclusions_file):\n \"\"\"Reads a CSV file of excluded timestamps.\n Args:\n exclusions_file: A file object containing a csv of video-id,timestamp." + }, + { + "comment": "Function `read_excluded_images` reads an exclusions file and returns a set of image keys to exclude. The input file is read row by row, and for each row the function checks that there are exactly two columns and adds the image key (combination of column 1 and column 2) to the excluded set.\n\nFunction `read_labelmap` reads a labelmap file without using protocol buffers. It iterates over the file line by line. When it encounters a line starting with 'name:', it extracts the class name, and when it encounters a line starting with 'id:' it extracts the class id. 
The function then appends a dictionary containing the id and name to the labelmap list and adds the id to the set of valid class ids.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":148-180", + "content": " Returns:\n A set of strings containing excluded image keys, e.g.\n \"aaaaaaaaaaa,0904\",\n or an empty set if exclusions file is None.\n \"\"\"\n excluded = set()\n if exclusions_file:\n reader = csv.reader(exclusions_file)\n for row in reader:\n assert len(row) == 2, 'Expected only 2 columns, got: ' + row\n excluded.add(make_image_key(row[0], row[1]))\n return excluded\ndef read_labelmap(labelmap_file):\n \"\"\"Reads a labelmap without the dependency on protocol buffers.\n Args:\n labelmap_file: A file object containing a label map protocol buffer.\n Returns:\n labelmap: The label map in the form used by the\n object_detection_evaluation\n module - a list of {\"id\": integer, \"name\": classname } dicts.\n class_ids: A set containing all of the valid class id integers.\n \"\"\"\n labelmap = []\n class_ids = set()\n name = ''\n class_id = ''\n for line in labelmap_file:\n if line.startswith(' name:'):\n name = line.split('\"')[1]" + }, + { + "comment": "This function ava_eval() takes several file paths as input and evaluates the results using mean average precision (mAP). It uses a label map to convert class labels from detections to their corresponding IDs. The code checks for 'id' or 'label_id' in each line of the label file, appends the ID and name to the label map, and adds the ID to a set of class_ids. The function also handles custom classes by excluding any category whose ID is not in the custom_classes list. The gt_boxes, gt_labels, and _ are loaded from the ann_file for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":181-209", + "content": " elif line.startswith(' id:') or line.startswith(' label_id:'):\n class_id = int(line.strip().split(' ')[-1])\n labelmap.append({'id': class_id, 'name': name})\n class_ids.add(class_id)\n return labelmap, class_ids\n# Seems there is at most 100 detections for each image\ndef ava_eval(result_file,\n result_type,\n label_file,\n ann_file,\n exclude_file,\n max_dets=(100, ),\n verbose=True,\n custom_classes=None):\n assert result_type in ['mAP']\n start = time.time()\n categories, class_whitelist = read_labelmap(open(label_file))\n if custom_classes is not None:\n custom_classes = custom_classes[1:]\n assert set(custom_classes).issubset(set(class_whitelist))\n class_whitelist = custom_classes\n categories = [cat for cat in categories if cat['id'] in custom_classes]\n # loading gt, do not need gt score\n gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist, 0)" + }, + { + "comment": "The code reads detection results from a file, excludes certain keys if specified in an exclude file, and measures the time taken to read the results. It then checks if the result type is 'proposal' and creates proposals based on the gt_boxes for each image key present in boxes or adds a fake one if no corresponding proposal exists. 
Proposals include scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":210-239", + "content": " if verbose:\n print_time('Reading detection results', start)\n if exclude_file is not None:\n excluded_keys = read_exclusions(open(exclude_file))\n else:\n excluded_keys = list()\n start = time.time()\n boxes, labels, scores = read_csv(open(result_file), class_whitelist, 0)\n if verbose:\n print_time('Reading detection results', start)\n if result_type == 'proposal':\n gts = [\n np.array(gt_boxes[image_key], dtype=float) for image_key in gt_boxes\n ]\n proposals = []\n for image_key in gt_boxes:\n if image_key in boxes:\n proposals.append(\n np.concatenate(\n (np.array(boxes[image_key], dtype=float),\n np.array(scores[image_key], dtype=float)[:, None]),\n axis=1))\n else:\n # if no corresponding proposal, add a fake one\n proposals.append(np.array([0, 0, 1, 1, 1]))\n # Proposals used here are with scores" + }, + { + "comment": "This code calculates the Average Recall (AR) and Recall@0.5 (R@0.5) for different detection numbers using the eval_recalls function. It then prints the results and stores them in a dictionary. If the result type is 'mAP', it initializes a PascalDetectionEvaluator, adds ground truth information for each image key, and calculates the mean average precision (mAP).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":240-264", + "content": " recalls = eval_recalls(gts, proposals, np.array(max_dets),\n np.arange(0.5, 0.96, 0.05))\n ar = recalls.mean(axis=1)\n ret = {}\n for i, num in enumerate(max_dets):\n print(f'Recall@0.5@{num}\\t={recalls[i, 0]:.4f}')\n print(f'AR@{num}\\t={ar[i]:.4f}')\n ret[f'Recall@0.5@{num}'] = recalls[i, 0]\n ret[f'AR@{num}'] = ar[i]\n return ret\n if result_type == 'mAP':\n pascal_evaluator = det_eval.PascalDetectionEvaluator(categories)\n start = time.time()\n for image_key in gt_boxes:\n if verbose and image_key in excluded_keys:\n logging.info(\n 'Found excluded timestamp in detections: %s.'\n 'It will be ignored.', image_key)\n continue\n pascal_evaluator.add_single_ground_truth_image_info(\n image_key, {\n standard_fields.InputDataFields.groundtruth_boxes:\n np.array(gt_boxes[image_key], dtype=float)," + }, + { + "comment": "This code adds single detected image information to a Pascal evaluator. It converts groundtruth labels and boxes into appropriate data structures, handles excluded timestamps, and processes detection boxes and classes for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":265-285", + "content": " standard_fields.InputDataFields.groundtruth_classes:\n np.array(gt_labels[image_key], dtype=int),\n standard_fields.InputDataFields.groundtruth_difficult:\n np.zeros(len(gt_boxes[image_key]), dtype=bool)\n })\n if verbose:\n print_time('Convert groundtruth', start)\n start = time.time()\n for image_key in boxes:\n if verbose and image_key in excluded_keys:\n logging.info(\n 'Found excluded timestamp in detections: %s.'\n 'It will be ignored.', image_key)\n continue\n pascal_evaluator.add_single_detected_image_info(\n image_key, {\n standard_fields.DetectionResultFields.detection_boxes:\n np.array(boxes[image_key], dtype=float),\n standard_fields.DetectionResultFields.detection_classes:\n np.array(labels[image_key], dtype=int)," + }, + { + "comment": "Code snippet performs AVA evaluation for detection results, and prints or returns specific metrics. 
It also includes functions for creating directories, dumping objects to files using pickle library, and has a function for time measurement called print_time.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":286-319", + "content": " standard_fields.DetectionResultFields.detection_scores:\n np.array(scores[image_key], dtype=float)\n })\n if verbose:\n print_time('convert detections', start)\n start = time.time()\n metrics = pascal_evaluator.evaluate()\n if verbose:\n print_time('run_evaluator', start)\n for display_name in metrics:\n print(f'{display_name}=\\t{metrics[display_name]}')\n ret = {\n display_name: metrics[display_name]\n for display_name in metrics if 'ByCategory' not in display_name\n }\n return ret\ndef mkdir_or_exist(dir_name, mode=0o777):\n if dir_name == '':\n return\n dir_name = osp.expanduser(dir_name)\n os.makedirs(dir_name, mode=mode, exist_ok=True)\ndef dump_to_fileobj(obj, file, **kwargs):\n kwargs.setdefault('protocol', 2)\n pickle.dump(obj, file, **kwargs)\ndef dump_to_path(obj, filepath, mode='wb'):\n with open(filepath, mode) as f:\n dump_to_fileobj(obj, f)" + }, + { + "comment": "This code defines three functions: `load_from_fileobj`, `load_from_path`, and `collect_results_cpu`. The first two are used to load data from files or file paths, respectively. The third function, `collect_results_cpu`, is a CPU-based method for collecting results across multiple GPUs by saving them in a temporary directory ('tmpdir') and having the rank 0 worker collect them. It checks if all parts exist, waits if not, then loads and returns the collected results once they are all available.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":322-356", + "content": "def load_from_fileobj(file, **kwargs):\n return pickle.load(file, **kwargs)\ndef load_from_path(filepath, mode='rb'):\n with open(filepath, mode) as f:\n return load_from_fileobj(f)\ndef collect_results_cpu(result_part, size):\n \"\"\"Collect results in cpu mode.\n It saves the results on different gpus to 'tmpdir' and collects\n them by the rank 0 worker.\n \"\"\"\n tmpdir = osp.join('./', 'collect_results_cpu')\n #1. load results of all parts from tmp dir\n mkdir_or_exist(tmpdir)\n rank, world_size = get_dist_info()\n dump_to_path(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))\n dist.barrier()\n if rank != 0:\n return None\n #2. collect all parts\n while 1:\n all_exist = True\n for i in range(world_size):\n part_file = osp.join(tmpdir, f'part_{i}.pkl')\n if not Path(part_file).exists():\n all_exist = False\n if all_exist:\n break\n else:\n time.sleep(60)\n time.sleep(120)\n #3. load results of all parts from tmp dir" + }, + { + "comment": "This code is used for evaluating AVA results by splitting the computation across multiple processes, then combining and ordering the partial results before deleting temporary files. It takes in information about the dataset, the evaluation results, custom class labels, and file paths for input and exclusion lists. The code creates a temporary result file, converts the results to a CSV format, performs AVA evaluation on the temporary file, and returns an evaluation result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":357-383", + "content": " part_list = []\n for i in range(world_size):\n part_file = osp.join(tmpdir, f'part_{i}.pkl')\n part_list.append(load_from_path(part_file))\n #4. 
sort the results\n ordered_results = []\n for res in zip(*part_list):\n ordered_results.extend(list(res))\n ordered_results = ordered_results[:\n size] #the dataloader may pad some samples\n #5. remove results of all parts from tmp dir, avoid dump_file fail to tmp dir when dir not exists.\n for i in range(world_size):\n part_file = osp.join(tmpdir, f'part_{i}.pkl')\n os.remove(part_file)\n return ordered_results\ndef ava_evaluate_results(info, dataset_len, results, custom_classes, label_file,\n file_path, exclude_file):\n # need to create a temp result file\n time_now = datetime.now().strftime('%Y%m%d_%H%M%S')\n temp_file = f'AVA_{time_now}_result.csv'\n results2csv(info, dataset_len, results, temp_file)\n ret = {}\n eval_result = ava_eval(\n temp_file," + }, + { + "comment": "This code is computing the mean average precision (mAP) for object detection metrics. It reads from a label file, file path, and excludes certain classes as specified. The results are stored in the 'ret' dictionary before removing a temporary file and returning the final results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_utils.py\":384-393", + "content": " 'mAP',\n label_file,\n file_path, #ann_file,\n exclude_file,\n custom_classes=custom_classes)\n ret.update(eval_result)\n os.remove(temp_file)\n return ret" + } + ] +} \ No newline at end of file diff --git a/docs/doc/326037c1-296b-4b1d-8c56-5d011e6df707.json b/docs/doc/326037c1-296b-4b1d-8c56-5d011e6df707.json new file mode 100644 index 000000000..b5931ed92 --- /dev/null +++ b/docs/doc/326037c1-296b-4b1d-8c56-5d011e6df707.json @@ -0,0 +1,40 @@ +{ + "summary": "The PaddleVideo library's code contains a utility function, ReadDict, which reads a dictionary file and performs image cropping, input point adjustment, calculates image size, and converts points to standard format. Another function captures frames from a video at specific indices and releases the video object post-capture.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library, specifically in the cpp_infer/src/utility.cpp file. It includes necessary headers for utility functions, and defines the ReadDict function within the PaddleVideo namespace. The function reads a dictionary file located at the given path, and stores each line into a vector of strings named m_vec.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp\":0-32", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \n#include \n#include \n#include \n#include \n#include \n#include \nnamespace PaddleVideo\n{\n std::vector Utility::ReadDict(const std::string &path)\n {\n std::ifstream in(path);\n std::string line;\n std::vector m_vec;\n if (in)\n {\n while (getline(in, line))" + }, + { + "comment": "The code reads a label file and returns its contents as a vector. 
It also retrieves all files in a directory, adding them to a vector if the directory is valid.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp\":33-66", + "content": " {\n m_vec.push_back(line);\n }\n }\n else\n {\n std::cout << \"no such label file: \" << path << \", exit the program...\"\n << std::endl;\n exit(1);\n }\n return m_vec; // Use fstream to read the category list and return with vector\n }\n void Utility::GetAllFiles(const char *dir_name, std::vector &all_inputs)\n {\n if (NULL == dir_name)\n {\n std::cout << \" dir_name is null ! \" << std::endl;\n return;\n }\n struct stat s;\n lstat(dir_name, &s);\n if (!S_ISDIR(s.st_mode))\n {\n std::cout << \"dir_name is not a valid directory !\" << std::endl;\n all_inputs.push_back(dir_name);\n return;\n }\n else\n {\n struct dirent *filename; // return value for readdir()\n DIR *dir; // return value for opendir()\n dir = opendir(dir_name);\n if (NULL == dir)" + }, + { + "comment": "The code snippet opens a directory, reads all files except \".\" and \"..\", and adds the file paths to a vector. The GetRotateCropImage function takes an image and a bounding box as input, copies the source image, and stores x and y coordinates of the bounding box in separate arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp\":67-92", + "content": " {\n std::cout << \"Can not open dir \" << dir_name << std::endl;\n return;\n }\n std::cout << \"Successfully opened the dir !\" << std::endl;\n while ((filename = readdir(dir)) != NULL)\n {\n if (strcmp(filename->d_name, \".\") == 0 ||\n strcmp(filename->d_name, \"..\") == 0)\n continue;\n // img_dir + std::string(\"/\") + all_inputs[0];\n all_inputs.push_back(dir_name + std::string(\"/\") +\n std::string(filename->d_name));\n }\n }\n }\n cv::Mat Utility::GetRotateCropImage(const cv::Mat &srcimage, std::vector> box)\n {\n cv::Mat image;\n srcimage.copyTo(image);\n std::vector> points = box;\n int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]};\n int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]};\n int left = int(*std::min_element(x_collect, x_collect + 4));" + }, + { + "comment": "This code crops an image based on the x and y coordinates of its bounding box, then adjusts the input points accordingly. 
It calculates the width and height of the cropped image using the Euclidean distance formula, and converts the original input points to a standard format for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp\":93-117", + "content": " int right = int(*std::max_element(x_collect, x_collect + 4));\n int top = int(*std::min_element(y_collect, y_collect + 4));\n int bottom = int(*std::max_element(y_collect, y_collect + 4));\n cv::Mat img_crop;\n image(cv::Rect(left, top, right - left, bottom - top)).copyTo(img_crop);\n for (int i = 0; i < points.size(); i++)\n {\n points[i][0] -= left;\n points[i][1] -= top;\n }\n int img_crop_width = int(sqrt(pow(points[0][0] - points[1][0], 2) +\n pow(points[0][1] - points[1][1], 2)));\n int img_crop_height = int(sqrt(pow(points[0][0] - points[3][0], 2) +\n pow(points[0][1] - points[3][1], 2)));\n cv::Point2f pts_std[4];\n pts_std[0] = cv::Point2f(0., 0.);\n pts_std[1] = cv::Point2f(img_crop_width, 0.);\n pts_std[2] = cv::Point2f(img_crop_width, img_crop_height);\n pts_std[3] = cv::Point2f(0.f, img_crop_height);\n cv::Point2f pointsf[4];" + }, + { + "comment": "This code initializes four points using cv::Point2f, gets a perspective transform matrix M using getPerspectiveTransform, warps the image using warpPerspective, checks if the resized image's rows exceed 1.5 times its columns, and if so, transposes and flips the image before returning it; otherwise, returns the resized image directly. This is part of a function that samples frames from a video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp\":118-145", + "content": " pointsf[0] = cv::Point2f(points[0][0], points[0][1]);\n pointsf[1] = cv::Point2f(points[1][0], points[1][1]);\n pointsf[2] = cv::Point2f(points[2][0], points[2][1]);\n pointsf[3] = cv::Point2f(points[3][0], points[3][1]);\n cv::Mat M = cv::getPerspectiveTransform(pointsf, pts_std);\n cv::Mat dst_img;\n cv::warpPerspective(img_crop, dst_img, M,\n cv::Size(img_crop_width, img_crop_height),\n cv::BORDER_REPLICATE);\n if (float(dst_img.rows) >= float(dst_img.cols) * 1.5)\n {\n cv::Mat srcCopy = cv::Mat(dst_img.rows, dst_img.cols, dst_img.depth());\n cv::transpose(dst_img, srcCopy);\n cv::flip(srcCopy, srcCopy, 0);\n return srcCopy;\n }\n else\n {\n return dst_img;\n }\n }\n std::vector Utility::SampleFramesFromVideo(const std::string &VideoPath, const int &num_seg, const int &seg_len)\n {\n cv::VideoCapture capture(VideoPath); // Create a video object" + }, + { + "comment": "This code snippet checks if the video can be opened and exits if it cannot. 
It then calculates the number of frames in the video, determines the frame indices to sample for each segment based on the length of the segment and average duration between frames, and stores the sampled frames in a vector.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp\":146-180", + "content": " if (!capture.isOpened())\n {\n printf(\"[Error] video cannot be opened, please check the video [%s]\\n\", VideoPath.c_str());\n capture.release();\n exit(1);\n }\n int frames_len = capture.get(cv::CAP_PROP_FRAME_COUNT); // Get the total number of video frames\n int average_dur = int(frames_len / num_seg);\n std::vector frames_idx;\n for (int i = 0; i < num_seg; ++i)\n {\n int idx = 0;\n if (average_dur >= seg_len)\n {\n idx = (average_dur - 1) / 2;\n idx += i * average_dur;\n }\n else if (average_dur >= 1)\n {\n idx += i * average_dur;\n }\n else\n {\n idx = i;\n }\n for (int j = idx; j < idx + seg_len; ++j)\n {\n frames_idx.emplace_back(j % frames_len);\n }\n }\n std::vector sampled_frames;\n cv::Mat frame; // Create an object for storing sampled frames" + }, + { + "comment": "This function captures frames from a video at specific indices, stores them in sampled_frames vector, and releases the video object after capture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/utility.cpp\":181-191", + "content": " for (int i = 0; i < num_seg; ++i)\n {\n const int &frame_idx = frames_idx[i];\n capture.set(cv::CAP_PROP_POS_FRAMES, frame_idx); // Set to frame_idx frame\n capture >> frame;\n sampled_frames.push_back(frame);\n }\n capture.release(); // Release the video object\n return sampled_frames;\n }\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/326f0f75-c6ba-4259-8a55-bc71c6c6be8b.json b/docs/doc/326f0f75-c6ba-4259-8a55-bc71c6c6be8b.json new file mode 100644 index 000000000..fc6497b64 --- /dev/null +++ b/docs/doc/326f0f75-c6ba-4259-8a55-bc71c6c6be8b.json @@ -0,0 +1,75 @@ +{ + "summary": "This code provides text to Unicode conversion and printable encoding functions, with tokenization classes for Chinese characters, punctuation splitting, and WordpieceTokenizing, preparing the text for further processing.", + "details": [ + { + "comment": "This code block is the first 30 lines of a Python file and includes a comment with license information, a documentation string, and an import section. 
The function \"convert_to_unicode\" converts text to Unicode, assuming utf-8 input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":0-31", + "content": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization classes.\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nfrom __future__ import absolute_import\nfrom io import open\nimport collections\nimport unicodedata\nimport six\ndef convert_to_unicode(text):\n \"\"\"Converts `text` to Unicode (if it's not already), assuming utf-8 input.\"\"\"\n if six.PY3:" + }, + { + "comment": "This code is a function named \"printable_text\" that takes a text parameter and returns it encoded in a way suitable for print or tf.logging. It handles both Python 2 and Python 3 by checking the environment using six.PY2 and six.PY3, and converting strings to str format before returning them. The function checks the type of the input text (str or bytes) and decodes it accordingly (from utf-8 encoding \"ignore\"). If the input type is not supported, it raises a ValueError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":32-60", + "content": " if isinstance(text, str):\n return text\n elif isinstance(text, bytes):\n return text.decode(\"utf-8\", \"ignore\")\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n elif six.PY2:\n if isinstance(text, str):\n return text.decode(\"utf-8\", \"ignore\")\n elif isinstance(text, unicode):\n return text\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n else:\n raise ValueError(\"Not running on Python2 or Python 3?\")\ndef printable_text(text):\n \"\"\"Returns text encoded in a way suitable for print or `tf.logging`.\"\"\"\n # These functions want `str` for both Python2 and Python3, but in one case\n # it's a Unicode string and in the other it's a byte string.\n if six.PY3:\n if isinstance(text, str):\n return text\n elif isinstance(text, bytes):\n return text.decode(\"utf-8\", \"ignore\")\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))" + }, + { + "comment": "This code handles loading and converting vocabulary files. 
It checks the Python version, loads a vocabulary file into an ordered dictionary, and defines functions to convert sequences of tokens or IDs using the vocab.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":61-95", + "content": " elif six.PY2:\n if isinstance(text, str):\n return text\n elif isinstance(text, unicode):\n return text.encode(\"utf-8\")\n else:\n raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n else:\n raise ValueError(\"Not running on Python2 or Python 3?\")\ndef load_vocab(vocab_file):\n \"\"\"Loads a vocabulary file into a dictionary.\"\"\"\n vocab = collections.OrderedDict()\n with open(vocab_file, encoding='utf8') as fin:\n for num, line in enumerate(fin):\n items = convert_to_unicode(line.strip()).split(\"\\t\")\n if len(items) > 2:\n break\n token = items[0]\n index = items[1] if len(items) == 2 else num\n token = token.strip()\n vocab[token] = int(index)\n return vocab\ndef convert_by_vocab(vocab, items):\n \"\"\"Converts a sequence of [tokens|ids] using the vocab.\"\"\"\n output = []\n for item in items:\n output.append(vocab[item])\n return output\ndef convert_tokens_to_ids(vocab, tokens):" + }, + { + "comment": "This code defines a FullTokenizer class for end-to-end tokenization. It utilizes two other classes, BasicTokenizer and WordpieceTokenizer, to perform basic whitespace cleaning and splitting on text data. The FullTokenizer initializes with a vocab file, load_vocab function, and an optional flag for case sensitivity. The tokenize method processes the input text by iterating over each token produced from both BasicTokenizer and WordpieceTokenizer, resulting in split tokens for further processing or analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":96-132", + "content": " \"\"\"convert_tokens_to_ids\n \"\"\"\n return convert_by_vocab(vocab, tokens)\ndef convert_ids_to_tokens(inv_vocab, ids):\n \"\"\"convert_ids_to_tokens\n \"\"\"\n return convert_by_vocab(inv_vocab, ids)\ndef whitespace_tokenize(text):\n \"\"\"Runs basic whitespace cleaning and splitting on a peice of text.\"\"\"\n text = text.strip()\n if not text:\n return []\n tokens = text.split()\n return tokens\nclass FullTokenizer(object):\n \"\"\"Runs end-to-end tokenziation.\"\"\"\n def __init__(self, vocab_file, do_lower_case=True):\n \"\"\"init\n \"\"\"\n self.vocab = load_vocab(vocab_file)\n self.inv_vocab = {v: k for k, v in self.vocab.items()}\n self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)\n self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)\n def tokenize(self, text):\n \"\"\"tokenize\n \"\"\"\n split_tokens = []\n for token in self.basic_tokenizer.tokenize(text):\n for sub_token in self.wordpiece_tokenizer.tokenize(token):" + }, + { + "comment": "The code defines a CharTokenizer class for end-to-end tokenization. It initializes with a vocab_file and do_lower_case parameter. The class has methods to tokenize text, convert tokens to ids, and convert ids to tokens using the vocab file and inverse vocab file. 
The tokenization process involves lowercasing the input text, splitting it into words, and then tokenizing each word using a WordpieceTokenizer with the same vocab file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":133-167", + "content": " split_tokens.append(sub_token)\n return split_tokens\n def convert_tokens_to_ids(self, tokens):\n \"\"\"convert_tokens_to_ids\n \"\"\"\n return convert_by_vocab(self.vocab, tokens)\n def convert_ids_to_tokens(self, ids):\n \"\"\"convert_ids_to_tokens\n \"\"\"\n return convert_by_vocab(self.inv_vocab, ids)\nclass CharTokenizer(object):\n \"\"\"Runs end-to-end tokenziation.\"\"\"\n def __init__(self, vocab_file, do_lower_case=True):\n self.vocab = load_vocab(vocab_file)\n self.inv_vocab = {v: k for k, v in self.vocab.items()}\n self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)\n def tokenize(self, text):\n \"\"\"tokenize\n \"\"\"\n split_tokens = []\n for token in text.lower().split(\" \"):\n for sub_token in self.wordpiece_tokenizer.tokenize(token):\n split_tokens.append(sub_token)\n return split_tokens\n def convert_tokens_to_ids(self, tokens):\n \"\"\"convert_tokens_to_ids\n \"\"\"" + }, + { + "comment": "This code defines a `BasicTokenizer` class that performs basic text tokenization, including punctuation splitting and lower casing. It also includes methods for converting tokens to IDs and vice versa using vocabularies. The class has an optional `do_lower_case` parameter controlling whether the input should be lowercased or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":168-196", + "content": " return convert_by_vocab(self.vocab, tokens)\n def convert_ids_to_tokens(self, ids):\n \"\"\"convert_ids_to_tokens\n \"\"\"\n return convert_by_vocab(self.inv_vocab, ids)\nclass BasicTokenizer(object):\n \"\"\"Runs basic tokenization (punctuation splitting, lower casing, etc.).\"\"\"\n def __init__(self, do_lower_case=True):\n \"\"\"Constructs a BasicTokenizer.\n Args:\n do_lower_case: Whether to lower case the input.\n \"\"\"\n self.do_lower_case = do_lower_case\n def tokenize(self, text):\n \"\"\"Tokenizes a piece of text.\"\"\"\n text = convert_to_unicode(text)\n text = self._clean_text(text)\n # This was added on November 1st, 2018 for the multilingual and Chinese\n # models. This is also applied to the English models now, but it doesn't\n # matter since the English models were not trained on any Chinese data\n # and generally don't have any Chinese data in them (there are Chinese\n # characters in the vocabulary because Wikipedia does have some Chinese" + }, + { + "comment": "The code segment tokenizes Chinese characters, performs lower casing if needed, strips accents from text, and splits the punctuation on a given piece of text. 
This process is to prepare the text for further processing in the application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":197-228", + "content": " # words in the English Wikipedia.).\n text = self._tokenize_chinese_chars(text)\n orig_tokens = whitespace_tokenize(text)\n split_tokens = []\n for token in orig_tokens:\n if self.do_lower_case:\n token = token.lower()\n token = self._run_strip_accents(token)\n split_tokens.extend(self._run_split_on_punc(token))\n output_tokens = whitespace_tokenize(\" \".join(split_tokens))\n return output_tokens\n def _run_strip_accents(self, text):\n \"\"\"Strips accents from a piece of text.\"\"\"\n text = unicodedata.normalize(\"NFD\", text)\n output = []\n for char in text:\n cat = unicodedata.category(char)\n if cat == \"Mn\":\n continue\n output.append(char)\n return \"\".join(output)\n def _run_split_on_punc(self, text):\n \"\"\"Splits punctuation on a piece of text.\"\"\"\n chars = list(text)\n i = 0\n start_new_word = True\n output = []\n while i < len(chars):" + }, + { + "comment": "This code defines functions for tokenizing and processing text data. The `_is_punctuation` function identifies punctuation characters, while the `tokenize_text` function separates words by detecting new word starts. The `_tokenize_chinese_chars` function adds whitespace around Chinese characters to separate them from surrounding text.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":229-258", + "content": " char = chars[i]\n if _is_punctuation(char):\n output.append([char])\n start_new_word = True\n else:\n if start_new_word:\n output.append([])\n start_new_word = False\n output[-1].append(char)\n i += 1\n return [\"\".join(x) for x in output]\n def _tokenize_chinese_chars(self, text):\n \"\"\"Adds whitespace around any CJK character.\"\"\"\n output = []\n for char in text:\n cp = ord(char)\n if self._is_chinese_char(cp):\n output.append(\" \")\n output.append(char)\n output.append(\" \")\n else:\n output.append(char)\n return \"\".join(output)\n def _is_chinese_char(self, cp):\n \"\"\"Checks whether CP is the codepoint of a CJK character.\"\"\"\n # This defines a \"chinese character\" as anything in the CJK Unicode block:\n # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)" + }, + { + "comment": "The code checks if a character falls within the CJK Unicode block, which includes Japanese and Korean characters. It returns True if any of these characters are found, indicating that the text is in one of these languages. The function also performs invalid character removal and whitespace cleanup on the given text.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":259-281", + "content": " #\n # Note that the CJK Unicode block is NOT all Japanese and Korean characters,\n # despite its name. The modern Korean Hangul alphabet is a different block,\n # as is Japanese Hiragana and Katakana. 
Those alphabets are used to write\n # space-separated words, so they are not treated specially and handled\n # like the all of the other languages.\n if ((cp >= 0x4E00 and cp <= 0x9FFF) or #\n (cp >= 0x3400 and cp <= 0x4DBF) or #\n (cp >= 0x20000 and cp <= 0x2A6DF) or #\n (cp >= 0x2A700 and cp <= 0x2B73F) or #\n (cp >= 0x2B740 and cp <= 0x2B81F) or #\n (cp >= 0x2B820 and cp <= 0x2CEAF) or\n (cp >= 0xF900 and cp <= 0xFAFF) or #\n (cp >= 0x2F800 and cp <= 0x2FA1F)): #\n return True\n return False\n def _clean_text(self, text):\n \"\"\"Performs invalid character removal and whitespace cleanup on text.\"\"\"\n output = []\n for char in text:\n cp = ord(char)" + }, + { + "comment": "This code defines a WordpieceTokenizer class that tokenizes text into word pieces using a greedy longest-match-first algorithm and a given vocabulary. The tokenize method takes in a text input, performs tokenization by matching the longest possible substrings from the vocabulary, and returns a list of wordpiece tokens.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":282-314", + "content": " if cp == 0 or cp == 0xfffd or _is_control(char):\n continue\n if _is_whitespace(char):\n output.append(\" \")\n else:\n output.append(char)\n return \"\".join(output)\nclass WordpieceTokenizer(object):\n \"\"\"Runs WordPiece tokenziation.\"\"\"\n def __init__(self, vocab, unk_token=\"[UNK]\", max_input_chars_per_word=100):\n self.vocab = vocab\n self.unk_token = unk_token\n self.max_input_chars_per_word = max_input_chars_per_word\n def tokenize(self, text):\n \"\"\"Tokenizes a piece of text into its word pieces.\n This uses a greedy longest-match-first algorithm to perform tokenization\n using the given vocabulary.\n For example:\n input = \"unaffable\"\n output = [\"un\", \"##aff\", \"##able\"]\n Args:\n text: A single token or whitespace separated tokens. This should have\n already been passed through `BasicTokenizer.\n Returns:\n A list of wordpiece tokens." + }, + { + "comment": "This code tokenizes text by splitting it into words, checks if each word is in the vocabulary. If not, it adds a special unknown token. It handles long words by splitting them into smaller parts and checking each part separately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":315-347", + "content": " \"\"\"\n text = convert_to_unicode(text)\n output_tokens = []\n for token in whitespace_tokenize(text):\n chars = list(token)\n if len(chars) > self.max_input_chars_per_word:\n output_tokens.append(self.unk_token)\n continue\n is_bad = False\n start = 0\n sub_tokens = []\n while start < len(chars):\n end = len(chars)\n cur_substr = None\n while start < end:\n substr = \"\".join(chars[start:end])\n if start > 0:\n substr = \"##\" + substr\n if substr in self.vocab:\n cur_substr = substr\n break\n end -= 1\n if cur_substr is None:\n is_bad = True\n break\n sub_tokens.append(cur_substr)\n start = end\n if is_bad:\n output_tokens.append(self.unk_token)" + }, + { + "comment": "The code defines several functions for tokenizing a string: _is_whitespace checks if the character is whitespace, _is_control identifies control characters, and _is_punctuation classifies punctuation. 
The main function extends output tokens based on these character types.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":348-381", + "content": " else:\n output_tokens.extend(sub_tokens)\n return output_tokens\ndef _is_whitespace(char):\n \"\"\"Checks whether `chars` is a whitespace character.\"\"\"\n # \\t, \\n, and \\r are technically contorl characters but we treat them\n # as whitespace since they are generally considered as such.\n if char == \" \" or char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n return True\n cat = unicodedata.category(char)\n if cat == \"Zs\":\n return True\n return False\ndef _is_control(char):\n \"\"\"Checks whether `chars` is a control character.\"\"\"\n # These are technically control characters but we count them as whitespace\n # characters.\n if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n return False\n cat = unicodedata.category(char)\n if cat.startswith(\"C\"):\n return True\n return False\ndef _is_punctuation(char):\n \"\"\"Checks whether `chars` is a punctuation character.\"\"\"\n cp = ord(char)\n # We treat all non-letter/number ASCII as punctuation.\n # Characters such as \"^\", \"$\", and \"`\" are not in the Unicode" + }, + { + "comment": "This code checks if a given character is a punctuation or Chinese character by checking its Unicode category and code point range. It returns True if the character is a punctuation or Chinese character, and False otherwise. The function is used to tokenize Chinese characters in text by adding whitespace around them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":382-404", + "content": " # Punctuation class but we treat them as punctuation anyways, for\n # consistency.\n if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or\n (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):\n return True\n cat = unicodedata.category(char)\n if cat.startswith(\"P\"):\n return True\n return False\ndef tokenize_chinese_chars(text):\n \"\"\"Adds whitespace around any CJK character.\"\"\"\n def _is_chinese_char(cp):\n \"\"\"Checks whether CP is the codepoint of a CJK character.\"\"\"\n # This defines a \"chinese character\" as anything in the CJK Unicode block:\n # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)\n #\n # Note that the CJK Unicode block is NOT all Japanese and Korean characters,\n # despite its name. The modern Korean Hangul alphabet is a different block,\n # as is Japanese Hiragana and Katakana. Those alphabets are used to write\n # space-separated words, so they are not treated specially and handled" + }, + { + "comment": "This function tokenizes text by detecting Chinese characters and whitespace, appending non-Chinese characters to a buffer and adding the buffer to the output when a space or Chinese character is found. 
Finally, it appends any remaining buffer content.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py\":405-440", + "content": " # like the all of the other languages.\n if ((cp >= 0x4E00 and cp <= 0x9FFF) or #\n (cp >= 0x3400 and cp <= 0x4DBF) or #\n (cp >= 0x20000 and cp <= 0x2A6DF) or #\n (cp >= 0x2A700 and cp <= 0x2B73F) or #\n (cp >= 0x2B740 and cp <= 0x2B81F) or #\n (cp >= 0x2B820 and cp <= 0x2CEAF) or\n (cp >= 0xF900 and cp <= 0xFAFF) or #\n (cp >= 0x2F800 and cp <= 0x2FA1F)): #\n return True\n return False\n def _is_whitespace(c):\n \"\"\"_is_whitespace\n \"\"\"\n if c == \" \" or c == \"\\t\" or c == \"\\r\" or c == \"\\n\" or ord(c) == 0x202F:\n return True\n return False\n output = []\n buff = \"\"\n for char in text:\n cp = ord(char)\n if _is_chinese_char(cp) or _is_whitespace(char):\n if buff != \"\":\n output.append(buff)\n buff = \"\"\n output.append(char)\n else:\n buff += char\n if buff != \"\":\n output.append(buff)\n return output" + } + ] +} \ No newline at end of file diff --git a/docs/doc/328051fb-fd67-458d-8465-2d715eefefcb.json b/docs/doc/328051fb-fd67-458d-8465-2d715eefefcb.json new file mode 100644 index 000000000..39018964a --- /dev/null +++ b/docs/doc/328051fb-fd67-458d-8465-2d715eefefcb.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines the TransNetV2Head class, a type of head used in computer vision models, inheriting from BaseHead with arguments for number of classes, input channels, and loss configuration. It also includes TransNetV2Loss class registered as HEADS registry, and two methods (loss and get_score) for calculating loss and F1 score between predictions and ground truth.", + "details": [ + { + "comment": "This code defines the TransNetV2Head class, which is a type of head used in computer vision models. It inherits from BaseHead and takes arguments for number of classes, input channels, and loss configuration. The TransNetV2Loss class is registered with the HEADS registry to be used by this head. Additionally, the code provides comments about licensing and copyright information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/transnetv2_head.py\":0-28", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..losses import TransNetV2Loss\nfrom ...metrics.transnetv2_metric import create_scene_based_summaries\n@HEADS.register()\nclass TransNetV2Head(BaseHead):\n \"\"\"TransNetV2 Head.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name=\"TransNetV2Loss\")\n ):\n super().__init__(num_classes," + }, + { + "comment": "This code defines a class with two methods, `loss` and `get_score`. The `loss` method calculates the loss between predictions and ground truth, while `get_score` method calculates an F1 score based on one-hot predictions and ground truth. 
The calculated losses are stored in a dictionary for further use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/transnetv2_head.py\":29-44", + "content": " in_channels,\n loss_cfg)\n def loss(self, one_hot_pred, one_hot_gt,\n many_hot_pred=None, many_hot_gt=None, reg_losses=None):\n losses = dict()\n loss = self.loss_func(scores, labels, **kwargs)\n f1 = self.get_score(one_hot_pred, one_hot_gt)\n losses['f1'] = f1\n losses['loss'] = loss\n return losses\n def get_score(self, one_hot_pred, one_hot_gt):\n f1 = create_scene_based_summaries(one_hot_pred, one_hot_gt)\n return f1" + } + ] +} \ No newline at end of file diff --git a/docs/doc/334d3227-e933-4a5b-b9a2-192dee28148a.json b/docs/doc/334d3227-e933-4a5b-b9a2-192dee28148a.json new file mode 100644 index 000000000..526b06161 --- /dev/null +++ b/docs/doc/334d3227-e933-4a5b-b9a2-192dee28148a.json @@ -0,0 +1,10 @@ +{ + "summary": "This function takes a label input, converts it into an array and uses bitwise operations to create a color map representing the label values in RGB format.", + "details": [ + { + "comment": "This function takes a label input, converts it into an array and uses bitwise operations to create a color map representing the label values in RGB format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/utils.py\":0-11", + "content": "import numpy as np\ndef label2colormap(label):\n m = label.astype(np.uint8)\n r, c = m.shape\n cmap = np.zeros((r, c, 3), dtype=np.uint8)\n cmap[:, :, 0] = (m & 1) << 7 | (m & 8) << 3 | (m & 64) >> 1\n cmap[:, :, 1] = (m & 2) << 6 | (m & 16) << 2 | (m & 128) >> 2\n cmap[:, :, 2] = (m & 4) << 5 | (m & 32) << 1\n return cmap" + } + ] +} \ No newline at end of file diff --git a/docs/doc/33b4abff-6ca7-4d03-98eb-625dbfade5e9.json b/docs/doc/33b4abff-6ca7-4d03-98eb-625dbfade5e9.json new file mode 100644 index 000000000..4bdec7550 --- /dev/null +++ b/docs/doc/33b4abff-6ca7-4d03-98eb-625dbfade5e9.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports necessary functions and classes from other modules, defines the exported symbols (build_batch_pipeline and Compose), and sets license information.", + "details": [ + { + "comment": "This code imports necessary functions and classes from other modules, defines the exported symbols (build_batch_pipeline and Compose), and sets license information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py\":0-19", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .builder import build_batch_pipeline\nfrom .pipelines.compose import Compose\n__all__ = [\n 'build_batch_pipeline','Compose'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/33d49b0b-7b36-4022-84a8-496d8342a35b.json b/docs/doc/33d49b0b-7b36-4022-84a8-496d8342a35b.json new file mode 100644 index 000000000..c9378d77e --- /dev/null +++ b/docs/doc/33d49b0b-7b36-4022-84a8-496d8342a35b.json @@ -0,0 +1,55 @@ +{ + "summary": "The code configures logging, imports libraries, initializes a video tagging model using PaddlePaddle and PaddleVideo. It sets up input data with efficient execution on GPU/CPU resources, measures predictor model's execution time for performance analysis or optimization within the main script function.", + "details": [ + { + "comment": "Code sets the logging configuration for INFO level, defines the log format, and redirects the logs to stdout. It also imports necessary libraries and modules, and configures logging handlers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport numpy as np\nimport paddle\nimport paddle.static as static\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'\nlogging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)" + }, + { + "comment": "This code snippet defines a function \"parse_args()\" which uses the argparse module to parse command-line arguments. 
It sets default values for extractor and predictor model configurations, names, and enables GPU usage by default.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":34-61", + "content": "logger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--extractor_config',\n type=str,\n default='configs/tsn.yaml',\n help='path to config file of model')\n parser.add_argument('--extractor_name',\n type=str,\n default='TSN',\n help='extractor model name, default TSN')\n parser.add_argument('--predictor_config',\n '--pconfig',\n type=str,\n default='configs/attention_lstm.yaml',\n help='path to config file of model')\n parser.add_argument(\n '--predictor_name',\n '--pname',\n type=str,\n default='AttentionLSTM',\n help='predictor model name, as AttentionLSTM, AttentionCluster, NEXTVLAD'\n )\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True," + }, + { + "comment": "This code snippet uses the argparse module to define command-line arguments for a video tagging application. These arguments include GPU usage, extractor and predictor weight paths, input file list, output directory, and Chinese label file path. The function `parser.parse_args()` is called at the end to return these arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":62-85", + "content": " help='default use gpu.')\n parser.add_argument('--extractor_weights',\n type=str,\n default='weights/tsn',\n help='extractor weight path')\n parser.add_argument('--predictor_weights',\n '--pweights',\n type=str,\n default='weights/attention_lstm',\n help='predictor weight path')\n parser.add_argument('--filelist',\n type=str,\n default='./data/VideoTag_test.list',\n help='path of video data, multiple video')\n parser.add_argument('--save_dir',\n type=str,\n default='data/VideoTag_results',\n help='output file path')\n parser.add_argument('--label_file',\n type=str,\n default='label_3396.txt',\n help='chinese label file path')\n args = parser.parse_args()" + }, + { + "comment": "This code defines a video classification model with two stages: extracting features from the input video using an extractor and predicting the classification results based on those extracted features. It uses PaddlePaddle's static graph mode for performance improvement and organizes the code within name scopes \"extractor_scope\" and \"predictor_scope\". The code also checks if the save directory exists, creates it if not, and measures time taken by the extractor stage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":86-110", + "content": " return args\ndef main():\n \"\"\"\n Video classification model of 3000 Chinese tags.\n videotag_extractor_prdictor (as videotag_TSN_AttentionLSTM)\n two stages in our model:\n 1. extract feature from input video(mp4 format) using extractor\n 2. predict classification results from extracted feature using predictor\n we implement this using two name scopes, ie. 
extractor_scope and predictor_scope.\n \"\"\"\n if not os.path.isdir(args.save_dir):\n os.makedirs(args.save_dir)\n extractor_config = parse_config(args.extractor_config)\n extractor_infer_config = merge_configs(extractor_config, 'infer',\n vars(args))\n extractor_start_time = time.time()\n extractor_scope = paddle.static.Scope()\n with static.scope_guard(extractor_scope):\n extractor_startup_prog = static.Program()\n extractor_main_prog = static.Program()\n with static.program_guard(extractor_main_prog, extractor_startup_prog):\n paddle.disable_static()" + }, + { + "comment": "This code builds a model, sets up the necessary parameters for execution, and loads pre-trained weights from a specified location. The model is built in inferencing mode for video tagging tasks, and it utilizes GPU or CPU resources based on the provided arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":111-133", + "content": " # build model\n extractor_model = models.get_model(args.extractor_name,\n extractor_infer_config,\n mode='infer',\n is_videotag=True)\n extractor_model.build_input(use_dataloader=False)\n extractor_model.build_model()\n extractor_feeds = extractor_model.feeds()\n extractor_fetch_list = extractor_model.fetches()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(extractor_startup_prog)\n logger.info('load extractor weights from {}'.format(\n args.extractor_weights))\n extractor_model.load_pretrain_params(exe,\n args.extractor_weights,\n extractor_main_prog)\n # get reader and metrics" + }, + { + "comment": "The code is setting up a reader and feeder for an extractor in PaddleVideo, iterating through data from the reader, running the extractor using static mode, and logging progress. It also measures and prints the time taken for extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":134-153", + "content": " extractor_reader = get_reader(args.extractor_name, 'infer',\n extractor_infer_config)\n extractor_feeder = paddle.fluid.DataFeeder(place=place,\n feed_list=extractor_feeds)\n feature_list = []\n file_list = []\n for idx, data in enumerate(extractor_reader()):\n file_id = [item[-1] for item in data]\n feed_data = [item[:-1] for item in data]\n feature_out = exe.run(fetch_list=extractor_fetch_list,\n feed=extractor_feeder.feed(feed_data))\n feature_list.append(feature_out[0]) #get out from list\n file_list.append(file_id)\n logger.info(\n '========[Stage 1 Sample {} ] Extractor finished======'.\n format(idx))\n paddle.enable_static()\n extractor_end_time = time.time()\n print('extractor_time', extractor_end_time - extractor_start_time)" + }, + { + "comment": "This code configures and prepares input data for a predictor model. It first parses the predictor configuration file, then merges it with command line arguments to create an inferencing configuration. Depending on the specified predictor model, it either extracts relevant segments from feature lists or uses the entire feature list. 
The resulting data is added to a list of inputs for the predictor model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":155-176", + "content": " predictor_config = parse_config(args.predictor_config)\n predictor_infer_config = merge_configs(predictor_config, 'infer',\n vars(args))\n # get Predictor input from Extractor output\n predictor_feed_list = []\n for i in range(len(feature_list)):\n feature_out = feature_list[i]\n if args.predictor_name == \"AttentionCluster\":\n extractor_seg_num = extractor_infer_config.INFER.seg_num\n predictor_seg_num = predictor_infer_config.MODEL.seg_num\n idxs = []\n stride = float(extractor_seg_num) / predictor_seg_num\n for j in range(predictor_seg_num):\n pos = (j + np.random.random()) * stride\n idxs.append(min(extractor_seg_num - 1, int(pos)))\n extractor_feature = feature_out[:, idxs, :].astype(\n float) # get from bs dim\n else:\n extractor_feature = feature_out.astype(float)\n predictor_feed_data = [extractor_feature]\n predictor_feed_list.append((predictor_feed_data, file_list[i]))" + }, + { + "comment": "This code sets up a predictor model, builds its inputs, builds the model itself, initializes feeds, runs a startup program, loads test weights from a specified location, and performs these actions within scopes and programs for efficient execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":178-198", + "content": " predictor_start_time = time.time()\n predictor_scope = paddle.static.Scope()\n with static.scope_guard(predictor_scope):\n predictor_startup_prog = static.Program()\n predictor_main_prog = static.Program()\n with static.program_guard(predictor_main_prog, predictor_startup_prog):\n paddle.disable_static()\n # parse config\n predictor_model = models.get_model(args.predictor_name,\n predictor_infer_config,\n mode='infer')\n predictor_model.build_input(use_dataloader=False)\n predictor_model.build_model()\n predictor_feeds = predictor_model.feeds()\n exe.run(predictor_startup_prog)\n logger.info('load predictor weights from {}'.format(\n args.predictor_weights))\n predictor_model.load_test_weights(exe, args.predictor_weights,\n predictor_main_prog)" + }, + { + "comment": "This code snippet is initializing a DataFeeder for predictor model, fetching the list of metrics for predictor model and resetting them. It then iterates over the feed data, runs the model with each data instance, accumulates the final results in the metrics object, and finally logs the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":200-220", + "content": " predictor_feeder = paddle.fluid.DataFeeder(place=place,\n feed_list=predictor_feeds)\n predictor_fetch_list = predictor_model.fetches()\n predictor_metrics = get_metrics(args.predictor_name.upper(),\n 'infer', predictor_infer_config)\n predictor_metrics.reset()\n for idx, data in enumerate(predictor_feed_list):\n file_id = data[1]\n predictor_feed_data = data[0]\n final_outs = exe.run(\n fetch_list=predictor_fetch_list,\n feed=predictor_feeder.feed(predictor_feed_data))\n logger.info(\n '=======[Stage 2 Sample {} ] Predictor finished========'\n .format(idx))\n final_result_list = [item\n for item in final_outs] + [file_id]\n predictor_metrics.accumulate(final_result_list)\n predictor_metrics.finalize_and_log_out(" + }, + { + "comment": "The code measures the time taken for a predictor to run and outputs it. 
It also records the total time taken for inferencing and displays the result, indicating when the inference is finished. This code snippet appears within the main function of the script, suggesting that this timing information is used for performance analysis or optimization purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/videotag_test.py\":221-237", + "content": " savedir=args.save_dir, label_file=args.label_file)\n paddle.enable_static()\n predictor_end_time = time.time()\n print('predictor_time', predictor_end_time - predictor_start_time)\nif __name__ == '__main__':\n start_time = time.time()\n args = parse_args()\n print(args)\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n main()\n end_time = time.time()\n period = end_time - start_time\n print('[INFER] infer finished. cost time: {}'.format(period))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/34596d26-8de0-4e85-9571-2b797bdf81bf.json b/docs/doc/34596d26-8de0-4e85-9571-2b797bdf81bf.json new file mode 100644 index 000000000..ee913c33f --- /dev/null +++ b/docs/doc/34596d26-8de0-4e85-9571-2b797bdf81bf.json @@ -0,0 +1,35 @@ +{ + "summary": "This code calculates precision, recall, and F1-score for a TransNetV2 metric machine learning model by handling scene location transformations and errors. It iterates through different thresholds before logging the results.", + "details": [ + { + "comment": "This function, named \"predictions_to_scenes\", takes in a list of predictions and outputs a list of scene locations. The scenes are determined by identifying changes from 0 to 1 and vice versa. If all predictions are 1, the function adds a final scene ending at the last index. The code also includes error checking for cases where all predictions are 1 or when there is a disruption in prediction data flow.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/transnetv2_metric.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ndef predictions_to_scenes(predictions):\n scenes = []\n t, t_prev, start = -1, 0, 0\n for i, t in enumerate(predictions):\n if t_prev == 1 and t == 0:\n start = i\n if t_prev == 0 and t == 1 and i != 0:\n scenes.append([start, i])\n t_prev = t\n if t == 0:\n scenes.append([start, i])\n # just fix if all predictions are 1" + }, + { + "comment": "This function converts scene lists to transition lists. If there are no scenes, it returns a transition list with one element. The function is based on an external source and adapted for specific use cases. It can handle different tolerance margins, which affects how the pred_scenes and gt_scenes are transformed into prediction transitions (pred_trans) and ground truth transitions (gt_trans), respectively. 
A \"HIT\" or \"MISS\" status is determined based on these converted lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/transnetv2_metric.py\":34-56", + "content": " if len(scenes) == 0:\n return np.array([[0, len(predictions) - 1]], dtype=np.int32)\n return np.array(scenes, dtype=np.int32)\ndef evaluate_scenes(gt_scenes, pred_scenes, n_frames_miss_tolerance=2):\n \"\"\"\n Adapted from: https://github.com/gyglim/shot-detection-evaluation\n The original based on: http://imagelab.ing.unimore.it/imagelab/researchActivity.asp?idActivity=19\n n_frames_miss_tolerance:\n Number of frames it is possible to miss ground truth by, and still being counted as a correct detection.\n Examples of computation with different tolerance margin:\n n_frames_miss_tolerance = 0\n pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.5, 5.5]]\n gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.5, 5.5]] -> HIT\n gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.5, 4.5]] -> MISS\n n_frames_miss_tolerance = 1\n pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.0, 6.0]]\n gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.0, 6.0]] -> HIT\n gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.0, 5.0]] -> HIT" + }, + { + "comment": "This code adjusts and transforms input frame scene and transition data, and then iterates through both to calculate true positives (TP), false positives (FP), and false negatives (FN) for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/transnetv2_metric.py\":57-79", + "content": " gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[3.0, 4.0]] -> MISS\n n_frames_miss_tolerance = 2\n pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[4.5, 6.5]]\n gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[4.5, 6.5]] -> HIT\n gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[3.5, 5.5]] -> HIT\n gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[2.5, 4.5]] -> HIT\n gt_scenes: [[0, 2], [3, 9]] -> gt_trans: [[1.5, 3.5]] -> MISS\n Users should be careful about adopting these functions in any commercial matters.\n \"\"\"\n shift = n_frames_miss_tolerance / 2\n gt_scenes = gt_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]])\n pred_scenes = pred_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]])\n gt_trans = np.stack([gt_scenes[:-1, 1], gt_scenes[1:, 0]], 1)\n pred_trans = np.stack([pred_scenes[:-1, 1], pred_scenes[1:, 0]], 1)\n i, j = 0, 0\n tp, fp, fn = 0, 0, 0\n while i < len(gt_trans) or j < len(pred_trans):\n if j == len(pred_trans) or pred_trans[j, 0] > gt_trans[i, 1]:" + }, + { + "comment": "This function calculates precision, recall, and F1-score for transnetv2 metric given ground truth (gt) and predicted (pred) transcript sequences. It iterates through the sequences to count true positives (tp), false negatives (fn), and false positives (fp). Afterwards, it computes precision, recall, and F1-score based on these counts. The function also asserts that the total number of true positives matches the length of gt_trans and the total number of false positives matches the length of pred_trans. It then returns the calculated metrics and the count of tp, fp, and fn. The create_scene_based_summaries function generates precision, recall, and F1-score for different thresholds using a numpy array. 
It initializes these metrics as well as the counts of true positives, false positives, and false negatives to zero, then iterates over the thresholds to calculate the metric values for each threshold.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/transnetv2_metric.py\":80-119", + "content": " fn += 1\n i += 1\n elif i == len(gt_trans) or pred_trans[j, 1] < gt_trans[i, 0]:\n fp += 1\n j += 1\n else:\n i += 1\n j += 1\n tp += 1\n if tp + fp != 0:\n p = tp / (tp + fp)\n else:\n p = 0\n if tp + fn != 0:\n r = tp / (tp + fn)\n else:\n r = 0\n if p + r != 0:\n f1 = (p * r * 2) / (p + r)\n else:\n f1 = 0\n assert tp + fn == len(gt_trans)\n assert tp + fp == len(pred_trans)\n return p, r, f1, (tp, fp, fn)\ndef create_scene_based_summaries(one_hot_pred, one_hot_gt):\n thresholds = np.array([\n 0.02, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9\n ])\n precision, recall, f1, tp, fp, fn = np.zeros_like(thresholds), np.zeros_like(thresholds),\\\n np.zeros_like(thresholds), np.zeros_like(thresholds),\\\n np.zeros_like(thresholds), np.zeros_like(thresholds)\n gt_scenes = predictions_to_scenes(one_hot_gt)" + }, + { + "comment": "This code is from the TransNetV2Metric class, which calculates metrics for a model's predictions. It iterates through different thresholds to compute precision, recall, F1 score, and true positive, false positive, and false negative counts. The update method appends predictions and computes metrics when a new file is encountered.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/transnetv2_metric.py\":120-151", + "content": " for i in range(len(thresholds)):\n pred_scenes = predictions_to_scenes(\n (one_hot_pred > thresholds[i]).astype(np.uint8)\n )\n precision[i], recall[i], f1[i], (tp[i], fp[i], fn[i]) = evaluate_scenes(gt_scenes, pred_scenes)\n best_idx = np.argmax(f1)\n return f1[best_idx]\n@METRIC.register\nclass TransNetV2Metric(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.predictions = []\n self.total_stats = {\"tp\": 0, \"fp\": 0, \"fn\": 0}\n def update(self, batch_id, data, one_hot):\n \"\"\"update metrics during each iter\n \"\"\"\n if isinstance(one_hot, tuple):\n one_hot = one_hot[0]\n one_hot = paddle.nn.functional.sigmoid(one_hot)[0]\n self.predictions.append(one_hot.numpy()[25:75])\n gt_scenes = data[1]\n is_new_file = data[2]\n if is_new_file:\n self.compute(gt_scenes)\n # preds ensemble" + }, + { + "comment": "The code calculates precision, recall, and F1 score for a machine learning model. It accumulates the metrics after processing all batches and logs the results using logger. 
It also displays the Precision, Recall, and F1 Score at the end of computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/transnetv2_metric.py\":152-173", + "content": " if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def compute(self, gt_scenes):\n predictions = np.concatenate(self.predictions, 0)[:len(frames)]\n _, _, _, (tp, fp, fn), fp_mistakes, fn_mistakes = evaluate_scenes(\n gt_scenes, predictions_to_scenes((predictions >= args.thr).astype(np.uint8)))\n self.total_stats[\"tp\"] += tp\n self.total_stats[\"fp\"] += fp\n self.total_stats[\"fn\"] += fn\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n p = self.total_stats[\"tp\"] / (self.total_stats[\"tp\"] + self.total_stats[\"fp\"])\n r = self.total_stats[\"tp\"] / (self.total_stats[\"tp\"] + self.total_stats[\"fn\"])\n f1 = (p * r * 2) / (p + r)\n logger.info('[TEST] finished, Precision= {:5.2f}, Recall= {:5.2f} , F1 Score= {:5.2f} '.format(\n p * 100, r * 100, f1 * 100))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/346792a9-3f14-4fe1-91d3-fa862c3ec732.json b/docs/doc/346792a9-3f14-4fe1-91d3-fa862c3ec732.json new file mode 100644 index 000000000..0537160c7 --- /dev/null +++ b/docs/doc/346792a9-3f14-4fe1-91d3-fa862c3ec732.json @@ -0,0 +1,10 @@ +{ + "summary": "Code snippet registers two models, AttentionLSTM and TSN, in the application's model registry using the functions regist_model() and get_model(). Models are sorted alphabetically for easy retrieval.", + "details": [ + { + "comment": "Code snippet registers two models, AttentionLSTM and TSN, in the application's model registry using the functions regist_model() and get_model(). Models are sorted alphabetically for easy retrieval.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/__init__.py\":0-6", + "content": "from .model import regist_model, get_model\nfrom .attention_lstm import AttentionLSTM\nfrom .tsn import TSN\n# regist models, sort by alphabet\nregist_model(\"AttentionLSTM\", AttentionLSTM)\nregist_model(\"TSN\", TSN)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/34a9d680-3937-45e1-9d1d-379d0f3be01e.json b/docs/doc/34a9d680-3937-45e1-9d1d-379d0f3be01e.json new file mode 100644 index 000000000..356cbb867 --- /dev/null +++ b/docs/doc/34a9d680-3937-45e1-9d1d-379d0f3be01e.json @@ -0,0 +1,20 @@ +{ + "summary": "This code reads the YouTube-8M dataset, featuring three models (LSTM, attention cluster, nextVlad), and is used for table tennis action detection. It uses cPickle and numpy, and a feature reader initializes for training or inference batches, extracting image, audio, and pcm features.", + "details": [ + { + "comment": "This code is a data reader for the YouTube-8M dataset, which contains features extracted by prior networks for three models: LSTM, attention cluster, and nextVlad. It uses cPickle to load data from storage and numpy for numerical operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/feature_reader.py\":0-33", + "content": "\"\"\"\nattention-lstm feature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad" + }, + { + "comment": "This code initializes a feature reader for table tennis action detection. It takes in parameters such as name, mode, configuration, and material. The reader creates lists of image, audio, and pcm features, reshapes the pcm_feature_list, and shuffles proposal list if in train mode. It then defines a reader function that iterates through proposal list to create batches for training or inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/feature_reader.py\":35-70", + "content": " dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.feature = material['feature']\n self.proposal = material['proposal']\n self.fps = 5\n def create_reader(self):\n \"\"\"\n create_reader\n \"\"\"\n image_feature_list = self.feature['image_feature']\n audio_feature_list = self.feature['audio_feature']\n pcm_feature_list = self.feature['pcm_feature']\n pcm_feature_list = pcm_feature_list.reshape(\n (pcm_feature_list.shape[0] * 5, 640))\n fl = self.proposal\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n for prop_info in fl:\n start_id = int(prop_info['start'])" + }, + { + "comment": "This code segment is part of a feature reader for Table Tennis action prediction. It extracts image, audio, and pcm features from their respective lists based on start and end IDs. 
If batch size is reached, it yields the batch and resets the batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/feature_reader.py\":71-90", + "content": " end_id = int(prop_info['end'])\n bmn_score = float(prop_info['score'])\n try:\n image_feature = image_feature_list[start_id:end_id]\n audio_feature = audio_feature_list[int(start_id / self.fps\n ):int(end_id /\n self.fps)]\n pcm_feature = pcm_feature_list[start_id:end_id]\n # image_feature = np.concatenate((image_feature, pcm_feature), axis=1)\n batch_out.append(\n (image_feature, audio_feature, 0, prop_info))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n continue\n return reader" + } + ] +} \ No newline at end of file diff --git a/docs/doc/34da3e09-faf6-4ce8-83de-df2c137c5653.json b/docs/doc/34da3e09-faf6-4ce8-83de-df2c137c5653.json new file mode 100644 index 000000000..4849562f4 --- /dev/null +++ b/docs/doc/34da3e09-faf6-4ce8-83de-df2c137c5653.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines `ReaderZoo` class for handling errors in PaddleVideo TableTennis, allowing registration and retrieval of different types of readers using named parameters.", + "details": [ + { + "comment": "This Python script defines a ReaderNotFoundError exception and a class for handling reader-related errors in the PaddleVideo TableTennis application. It includes license information and allows for checking if a specified reader is available by comparing it to a list of available readers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/reader_utils.py\":0-32", + "content": "\"\"\"\nreader_util\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nclass ReaderNotFoundError(Exception):\n \"\"\"\n \"Error: reader not found\"\n \"\"\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)" + }, + { + "comment": "This code defines a `DataReader` class and a `ReaderZoo` class. The `DataReader` class is a data reader for video input, with methods such as `create_reader`, which should be implemented but is currently empty, and `get_config_from_sec`, which gets configuration from a section in the given config file. The `ReaderZoo` class registers different types of readers based on their names and ensures they inherit from the `DataReader` class. 
The code also includes functionality to retrieve a reader given its name, mode, configuration, and optionally a material type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/reader_utils.py\":33-80", + "content": " for reader in self.avail_readers:\n msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"\n data reader for video input\n \"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"\n Not implemented\n \"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"\n get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n \"\"\"\n ReaderZoo\n \"\"\"\n def __init__(self):\n \"\"\"\n __init__\n \"\"\"\n self.reader_zoo = {}\n def regist(self, name, reader):\n \"\"\"\n regist\n \"\"\"\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg, material=None):" + }, + { + "comment": "This code defines a singleton reader_zoo, allows for registration of readers using the regist_reader() function, and retrieves the registered reader using get_reader() function. The reader instance is created by calling create_reader() on the retrieved reader model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/reader_utils.py\":81-106", + "content": " \"\"\"\n get\n \"\"\"\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg, material)\n raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo\nreader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n \"\"\"\n regist_reader\n \"\"\"\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg, material=None):\n \"\"\"\n get_reader\n \"\"\"\n reader_model = reader_zoo.get(name, mode, cfg, material)\n return reader_model.create_reader()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/34eb7bf8-5164-4b64-84b1-483de0790aa1.json b/docs/doc/34eb7bf8-5164-4b64-84b1-483de0790aa1.json new file mode 100644 index 000000000..b3439e83d --- /dev/null +++ b/docs/doc/34eb7bf8-5164-4b64-84b1-483de0790aa1.json @@ -0,0 +1,50 @@ +{ + "summary": "The code defines an Ernie model configuration and initializes the ERNIE model using Paddle's embedding layer. It also includes a multimodal video tagging model, embeddings, data pre-processing, attention mask creation, encoder usage, and TextCNN for sequence feature extraction. The code creates 1D convolutional layers with specified parameters and returns the output.", + "details": [ + { + "comment": "This code snippet contains the Ernie model class definition. It imports necessary modules, defines logging, and initializes a class for configuring the Ernie model. The class inherits from `object` and represents the configuration to be used in the Ernie model architecture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":0-32", + "content": "# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Ernie model.\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nfrom __future__ import absolute_import\nimport json\nimport six\nimport logging\nimport paddle\nimport paddle.static as static\nfrom io import open\nfrom .transformer_encoder import encoder, pre_process_layer\nlog = logging.getLogger(__name__)\nclass ErnieConfig(object):" + }, + { + "comment": "This code defines a class for an Ernie model configuration. It initializes the config with a given path, parses the config file using JSON, allows getting and setting items from/to the configuration dictionary, and provides a print_config method to display the configuration in a readable format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":33-72", + "content": " \"\"\"\n Erine model config\n \"\"\"\n def __init__(self, config_path):\n \"\"\"\n init\n \"\"\"\n self._config_dict = self._parse(config_path)\n def _parse(self, config_path):\n \"\"\"\n parse config\n \"\"\"\n try:\n with open(config_path, 'r', encoding='utf8') as json_file:\n config_dict = json.load(json_file)\n except Exception:\n raise IOError(\"Error in parsing Ernie model config file '%s'\" %\n config_path)\n else:\n return config_dict\n def __getitem__(self, key):\n \"\"\"\n get item\n \"\"\"\n return self._config_dict.get(key, None)\n def __setitem__(self, key, value):\n \"\"\"\n set item\n \"\"\"\n self._config_dict[key] = value\n def print_config(self):\n \"\"\"\n print config\n \"\"\"\n for arg, value in sorted(six.iteritems(self._config_dict)):\n log.info('%s: %s' % (arg, value))\n log.info('------------------------------------------------')" + }, + { + "comment": "The code defines the ErnieModel class, which initializes an ERINE model with parameters such as source ids, position ids, sentence ids, task ids, input mask, configuration, weight sharing, and use of fp16. 
The class attributes are initialized based on the provided configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":75-105", + "content": "class ErnieModel(object):\n \"\"\"\n ERINE Model\n \"\"\"\n def __init__(self,\n src_ids,\n position_ids,\n sentence_ids,\n task_ids,\n input_mask,\n config,\n weight_sharing=True,\n use_fp16=False):\n \"\"\"\n init model\n \"\"\"\n self._emb_size = config['hidden_size']\n self._n_layer = config['num_hidden_layers']\n self._n_head = config['num_attention_heads']\n self._voc_size = config['vocab_size']\n self._max_position_seq_len = config['max_position_embeddings']\n if config['sent_type_vocab_size']:\n self._sent_types = config['sent_type_vocab_size']\n else:\n self._sent_types = config['type_vocab_size']\n self._use_task_id = config['use_task_id']\n if self._use_task_id:\n self._task_types = config['task_type_vocab_size']\n self._hidden_act = config['hidden_act']\n self._prepostprocess_dropout = config['hidden_dropout_prob']" + }, + { + "comment": "This code initializes the ERNIE model parameters and builds the model. It sets various attributes such as attention dropout probability, embedding names for word, position, sentence, and task, data types, and initializer range. The _build_model function is then called to create the model using Paddle's embedding layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":106-131", + "content": " self._attention_dropout = config['attention_probs_dropout_prob']\n self._weight_sharing = weight_sharing\n self._word_emb_name = \"word_embedding\"\n self._pos_emb_name = \"pos_embedding\"\n self._sent_emb_name = \"sent_embedding\"\n self._task_emb_name = \"task_embedding\"\n self._dtype = \"float16\" if use_fp16 else \"float32\"\n self._emb_dtype = \"float32\"\n # Initialize all weigths by truncated normal initializer, and all biases\n # will be initialized by constant zero by default.\n self._param_initializer = paddle.nn.initializer.TruncatedNormal(\n std=config['initializer_range'])\n self._build_model(src_ids, position_ids, sentence_ids, task_ids,\n input_mask)\n def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,\n input_mask):\n \"\"\"\n build model\n \"\"\"\n # padding id in vocabulary must be set to 0\n emb_out = static.nn.embedding(\n input=src_ids," + }, + { + "comment": "This code initializes and concatenates three embeddings - word, position, and sentence - in a multimodal video tagging model. The embeddings are defined with specific sizes and data types. 
Two embeddings (position_emb_out and sent_emb_out) are added to the original embedding (emb_out), and then these combined embeddings are returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":132-157", + "content": " size=[self._voc_size, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._word_emb_name, initializer=self._param_initializer),\n is_sparse=False)\n position_emb_out = static.nn.embedding(\n input=position_ids,\n size=[self._max_position_seq_len, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._pos_emb_name, initializer=self._param_initializer))\n sent_emb_out = static.nn.embedding(\n sentence_ids,\n size=[self._sent_types, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._sent_emb_name, initializer=self._param_initializer))\n # emb_out = emb_out + position_emb_out\n # emb_out = emb_out + sent_emb_out\n emb_out = paddle.add(x=emb_out, y=position_emb_out)\n emb_out = paddle.add(x=emb_out, y=sent_emb_out)\n if self._use_task_id:" + }, + { + "comment": "This code initializes an embedding layer for task types, adds it to the embeddings, applies pre-processing with dropout if necessary, and casts the embeddings to the desired dtype. It also creates a self-attention mask, stacks it for each attention head, sets its gradient to stop during backpropagation, and passes the embeddings through an encoder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":158-183", + "content": " task_emb_out = static.nn.embedding(\n task_ids,\n size=[self._task_types, self._emb_size],\n dtype=self._emb_dtype,\n param_attr=paddle.ParamAttr(\n name=self._task_emb_name,\n initializer=self._param_initializer))\n emb_out = emb_out + task_emb_out\n emb_out = pre_process_layer(\n emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')\n if self._dtype == \"float16\":\n emb_out = paddle.cast(x=emb_out, dtype=self._dtype)\n input_mask = paddle.cast(x=input_mask, dtype=self._dtype)\n self_attn_mask = paddle.matmul(\n x=input_mask, y=input_mask, transpose_y=True)\n self_attn_mask = paddle.scale(\n x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)\n n_head_self_attn_mask = paddle.stack(\n x=[self_attn_mask] * self._n_head, axis=1)\n n_head_self_attn_mask.stop_gradient = True\n self._enc_out = encoder(" + }, + { + "comment": "This code is defining and initializing a model for an encoder layer in a deep learning application. The model takes several parameters such as embedding size, number of layers and heads, dropout rates, activation function, etc. It then casts the output to the specified data type if necessary. 
The `get_sequence_output` method returns the sequence output from the encoder layer and `get_sequence_textcnn_output` takes in a feature sequence and an input mask to generate the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":184-214", + "content": " enc_input=emb_out,\n attn_bias=n_head_self_attn_mask,\n n_layer=self._n_layer,\n n_head=self._n_head,\n d_key=self._emb_size // self._n_head,\n d_value=self._emb_size // self._n_head,\n d_model=self._emb_size,\n d_inner_hid=self._emb_size * 4,\n prepostprocess_dropout=self._prepostprocess_dropout,\n attention_dropout=self._attention_dropout,\n relu_dropout=0,\n hidden_act=self._hidden_act,\n preprocess_cmd=\"\",\n postprocess_cmd=\"dan\",\n param_initializer=self._param_initializer,\n name='encoder')\n if self._dtype == \"float16\":\n self._enc_out = paddle.cast(\n x=self._enc_out, dtype=self._emb_dtype)\n def get_sequence_output(self):\n \"\"\"\n get sequence output\n \"\"\"\n return self._enc_out\n def get_sequence_textcnn_output(self, sequence_feature, input_mask):\n \"\"\"\n get sequence output\n \"\"\"" + }, + { + "comment": "This code defines a TextCNN model for sequence feature extraction. It pads the input sequence, applies convolutions with various window sizes, and pools the results. The get_pooled_output function extracts the first feature of each sequence for classification by applying an FC layer with tanh activation. The textcnn function initializes a TextCNN model with specified window sizes and hidden dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":215-242", + "content": " seq_len = paddle.sum(x=input_mask, axis=[1, 2])\n seq_len = paddle.cast(seq_len, 'int64')\n sequence_feature = paddle.static.nn.sequence_unpad(sequence_feature, seq_len)\n return self.textcnn(sequence_feature)\n def get_pooled_output(self):\n \"\"\"Get the first feature of each sequence for classification\"\"\"\n next_sent_feat = paddle.slice(\n input=self._enc_out, axes=[1], starts=[0], ends=[1])\n next_sent_feat = static.nn.fc(\n x=next_sent_feat,\n size=self._emb_size,\n activation=\"tanh\",\n weight_attr=paddle.ParamAttr(\n name=\"pooled_fc.w_0\", initializer=self._param_initializer),\n bias_attr=\"pooled_fc.b_0\")\n return next_sent_feat\n def textcnn(self, feature, name='text_cnn'):\n \"\"\"\n TextCNN sequence feature extraction\n \"\"\"\n win_sizes = [2, 3, 4]\n hid_dim = 256\n convs = []\n for win_size in win_sizes:\n conv_h = paddle.fluid.nets.sequence_conv_pool(input=feature," + }, + { + "comment": "This code is creating a 1D convolutional layer with specified parameters, including the number of filters, filter size, activation function, and pooling type. The resulting convolutional layers are appended to the `convs` list, and then concatenated along axis 1 to form `convs_out`. 
Finally, the function returns `convs_out`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py\":243-249", + "content": " num_filters=hid_dim,\n filter_size=win_size,\n act=\"tanh\",\n pool_type=\"max\")\n convs.append(conv_h)\n convs_out = paddle.concat(x=convs, axis=1)\n return convs_out" + } + ] +} \ No newline at end of file diff --git a/docs/doc/354a97b8-a143-47ed-9284-36d917d0fd1a.json b/docs/doc/354a97b8-a143-47ed-9284-36d917d0fd1a.json new file mode 100644 index 000000000..99a278517 --- /dev/null +++ b/docs/doc/354a97b8-a143-47ed-9284-36d917d0fd1a.json @@ -0,0 +1,10 @@ +{ + "summary": "Code sets the QEI_VIDEO_ROOT variable to the absolute path of the directory containing the current file. It also imports the version module from QEIVideo and assigns its __version__ attribute to a variable. This may be used for identifying the version of the QEIVideo application.", + "details": [ + { + "comment": "Code sets the QEI_VIDEO_ROOT variable to the absolute path of the directory containing the current file. It also imports the version module from QEIVideo and assigns its __version__ attribute to a variable. This may be used for identifying the version of the QEIVideo application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/__init__.py\":0-12", + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport os\nQEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__))\nimport os\nfrom QEIVideo.version import __version__\nQEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/355ad68e-66eb-428c-b635-fbad5b4f117a.json b/docs/doc/355ad68e-66eb-428c-b635-fbad5b4f117a.json new file mode 100644 index 000000000..e92f66b0e --- /dev/null +++ b/docs/doc/355ad68e-66eb-428c-b635-fbad5b4f117a.json @@ -0,0 +1,20 @@ +{ + "summary": "The MSR-VTT dataset contains 10K videos available on its website, organized in a \"data\" directory for ActBERT model use. The lock.mdb file is a database used for storing and managing data related to multi-modal transformers for video retrieval as described in a 2020 ECCV paper.", + "details": [ + { + "comment": "This code provides an overview of the MSR-VTT dataset, its download process for T2VLAD and ActBERT applications, and references for more information. It consists of 10K video clips from 20 categories, each with 20 English sentences, and is available on the MSRVTT website.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/msrvtt.md\":0-28", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/msrvtt.md) | English\n# MSR-VTT Preparation\n- [Introduction](#1.1)\n- [Download for T2VLAD](#1.2)\n- [Download for ActBERT](#1.3)\n- [Reference](#1.4)\n\n## Introduction\nMSR-VTT(Microsoft Research Video to Text) is a large-scale dataset containing videos and subtitles, which is composed of 10000 video clips from 20 categories, and each video clip is annotated with 20 English sentences. We used 9000 video clips for training and 1000 for testing. 
For more details, please refer to the website: [MSRVTT](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/)\n\n## Download for T2VLAD\n[T2VLAD doc](../../../applications/T2VLAD/README_en.md)\nFor ease of use, we provided extracted features of video.\nFirst, make sure to enter the following command in the `applications/T2VLAD/data` directory to download the dataset.\n```bash\nbash download_features.sh\n```\nAfter downloading, the files in the data directory are organized as follows:" + }, + { + "comment": "Code provides the instructions to download and decompress data features required for ActBERT model, specifically for MSR-VTT dataset. The data is organized in the \"data\" directory with a .lmdb file and a CSV file containing JSFusion test data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/msrvtt.md\":30-72", + "content": "```\n\u251c\u2500\u2500 data\n| \u251c\u2500\u2500 MSR-VTT\n| \u2502 \u251c\u2500\u2500 raw-captions.pkl\n| \u2502 \u251c\u2500\u2500 train_list_jsfusion.txt\n| \u2502 \u251c\u2500\u2500 val_list_jsfusion.txt\n| \u2502 \u251c\u2500\u2500 aggregated_text_feats\n| | | \u251c\u2500\u2500 w2v_MSRVTT_openAIGPT.pickle\n| | \u251c\u2500\u2500 mmt_feats\n| \u2502 \u2502 \u251c\u2500\u2500 features.audio.pkl\n| \u2502 \u2502 \u251c\u2500\u2500 features.face_agg.pkl\n| \u2502 \u2502 \u251c\u2500\u2500 features.flos_agg.pkl\n| \u2502 \u2502 \u251c\u2500\u2500 features.ocr.pkl\n| \u2502 \u2502 \u251c\u2500\u2500 features.rgb_agg.pkl\n| \u2502 \u2502 \u251c\u2500\u2500 features.s3d.pkl\n| \u2502 \u2502 \u251c\u2500\u2500 features.scene.pkl\n| \u2502 \u2502 \u251c\u2500\u2500 features.speech.pkl\n```\n\n## Download for ActBERT\n[ActBERT doc](../model_zoo/multimodal/actbert.md)\nDownload data features:\n```\nwget https://videotag.bj.bcebos.com/Data/ActBERT/msrvtt_test.lmdb.tar\nwget https://videotag.bj.bcebos.com/Data/ActBERT/MSRVTT_JSFUSION_test.csv\n```\nDecompress the `msrvtt_test.lmdb.tar`\uff1a\n```\ntar -zxvf msrvtt_test.lmdb.tar\n```\nThe files in the data directory are organized as follows:\n```\n\u251c\u2500\u2500 data\n| \u251c\u2500\u2500 MSR-VTT\n| \u2502 \u251c\u2500\u2500 MSRVTT_JSFUSION_test.csv\n| \u2502 \u251c\u2500\u2500 msrvtt_test.lmdb\n| \u2502 \u251c\u2500\u2500 data.mdb" + }, + { + "comment": "lock.mdb: Database file used for storing and managing data in codebase related to multi-modal transformer for video retrieval as described in the 2020 ECCV paper by Valentin Gabeur et al.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/msrvtt.md\":73-78", + "content": "| \u2502 \u251c\u2500\u2500 lock.mdb\n```\n\n## Reference\n- Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. Multi-modal transformer for video retrieval. In ECCV, 2020." + } + ] +} \ No newline at end of file diff --git a/docs/doc/35885247-4755-4325-8af8-b4d9692791e2.json b/docs/doc/35885247-4755-4325-8af8-b4d9692791e2.json new file mode 100644 index 000000000..f8b7196ad --- /dev/null +++ b/docs/doc/35885247-4755-4325-8af8-b4d9692791e2.json @@ -0,0 +1,20 @@ +{ + "summary": "The code imports libraries, defines a function for parsing command line arguments, and sets up paths and licenses before building the model using PaddleVideo. 
It initializes segments (num_seg) and summarizes the model's parameters while calculating FLOPs if enabled.", + "details": [ + { + "comment": "This code snippet is importing necessary libraries and defining a function for parsing command line arguments in the PaddleVideo project. The code also sets up some paths and licenses, ensuring compliance with the Apache License, Version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/summary.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport sys\nimport os.path as osp\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.jit import to_static\nimport paddleslim\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom paddlevideo.modeling.builder import build_model\nfrom paddlevideo.utils import get_config\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo Summary\")" + }, + { + "comment": "This code parses arguments for the config file path, image size, and number of segments. It then trims unnecessary attributes from the training configuration before building the model using the parsed arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/summary.py\":34-68", + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument(\"--img_size\", type=int, default=224)\n parser.add_argument(\"--num_seg\", type=int, default=8)\n parser.add_argument(\"--FLOPs\",\n action=\"store_true\",\n help=\"whether to print FLOPs\")\n return parser.parse_args()\ndef _trim(cfg, args):\n \"\"\"\n Reuse the trainging config will bring useless attribute, such as: backbone.pretrained model. Trim it here.\n \"\"\"\n model_name = cfg.model_name\n cfg = cfg.MODEL\n cfg.backbone.pretrained = \"\"\n if 'num_seg' in cfg.backbone:\n cfg.backbone.num_seg = args.num_seg\n return cfg, model_name\ndef main():\n args = parse_args()\n cfg, model_name = _trim(get_config(args.config, show=False), args)\n print(f\"Building model({model_name})...\")\n model = build_model(cfg)\n img_size = args.img_size" + }, + { + "comment": "This code snippet initializes the number of segments (num_seg) and currently only supports tsm. 
It generates a summary of the model's parameters using Paddle's summary function, and if FLOPs is enabled, it also calculates and prints the model's floating-point operations using paddleslim's analysis.flops function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/summary.py\":69-81", + "content": " num_seg = args.num_seg\n #NOTE: only support tsm now, will refine soon\n params_info = paddle.summary(model, (1, 1, num_seg, 3, img_size, img_size))\n print(params_info)\n if args.FLOPs:\n flops_info = paddleslim.analysis.flops(\n model, [1, 1, num_seg, 3, img_size, img_size])\n print(flops_info)\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/35c844fa-68b5-4457-abfb-24c68e2bdf80.json b/docs/doc/35c844fa-68b5-4457-abfb-24c68e2bdf80.json new file mode 100644 index 000000000..190caccb9 --- /dev/null +++ b/docs/doc/35c844fa-68b5-4457-abfb-24c68e2bdf80.json @@ -0,0 +1,30 @@ +{ + "summary": "The MultiCropMetric class in PaddleVideo computes top-1/5 accuracy using multi-crop metrics for each video label and logs average values.", + "details": [ + { + "comment": "This code snippet defines a class for MultiCropMetric, which is part of the PaddleVideo library. It initializes instances with various parameters such as data_size, batch_size, num_ensemble_views, num_spatial_crops, and num_classes. The log_interval parameter determines how often to update logs during training. This metric appears to be related to slowfast video processing, as specified in the comment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/multi_crop_metric.py\":0-34", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nfrom paddle.hapi.model import _all_gather\nfrom paddlevideo.utils import get_logger\nfrom .registry import METRIC\nfrom .base import BaseMetric\nlogger = get_logger(\"paddlevideo\")\n\"\"\" An example for metrics class.\n MultiCropMetric for slowfast.\n\"\"\"\n@METRIC.register\nclass MultiCropMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n num_ensemble_views,\n num_spatial_crops,\n num_classes,\n log_interval=1):" + }, + { + "comment": "This code initializes a multi-crop metric class, which takes data size, batch size, log interval, number of ensemble views, and number of spatial crops as parameters. It calculates the number of videos and clips, creates arrays to store video predictions and labels, and initializes a clip_count dictionary. 
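Stepping back to the `tools/summary.py` helper summarized just above: the snippet below is a minimal sketch (not the repo tool) of how `paddle.summary` is typically driven with a dummy input shape. The tiny model and the 224x224 shape are illustrative stand-ins for the recognizer that `build_model` would construct, and the FLOPs counting via `paddleslim.analysis.flops` is omitted here.

```python
import paddle
import paddle.nn as nn

class TinyClassifier(nn.Layer):
    """Illustrative stand-in for the model that build_model() would return."""
    def __init__(self, num_classes=10):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv2D(3, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2D(1),
        )
        self.head = nn.Linear(8, num_classes)

    def forward(self, x):
        feat = self.backbone(x)            # (N, 8, 1, 1)
        return self.head(paddle.flatten(feat, 1))

model = TinyClassifier()
# The repo tool passes (1, 1, num_seg, 3, img_size, img_size) for its
# TSM-style recognizer; a plain image classifier just needs (N, C, H, W).
params_info = paddle.summary(model, (1, 3, 224, 224))
print(params_info)                         # total / trainable parameter counts
```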
The update method is used to update metrics during each iteration by gathering data across multiple cards if needed, converting outputs to numpy arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/multi_crop_metric.py\":35-60", + "content": " \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.num_ensemble_views = num_ensemble_views\n self.num_spatial_crops = num_spatial_crops\n self.num_classes = num_classes\n self.num_clips = self.num_ensemble_views * self.num_spatial_crops\n num_videos = self.data_size // self.num_clips\n self.video_preds = np.zeros((num_videos, self.num_classes))\n self.video_labels = np.zeros((num_videos, 1), dtype=\"int64\")\n self.clip_count = {}\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n labels = data[2]\n clip_ids = data[3]\n # gather mulit card, results of following process in each card is the same.\n if self.world_size > 1:\n outputs = _all_gather(outputs, self.world_size)\n labels = _all_gather(labels.cuda(), self.world_size)\n clip_ids = _all_gather(clip_ids.cuda(), self.world_size)\n # to numpy" + }, + { + "comment": "The code loops through each prediction and label for video clips in a batch. It checks if the clip index has been encountered before for a particular video ID, and updates the count if it's a new clip or performs ensemble by summing predictions if it's not. If there are labels for a video, it asserts they match and updates the label accordingly. The code also logs processing information at log intervals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/multi_crop_metric.py\":61-82", + "content": " preds = outputs.numpy()\n labels = labels.numpy().astype(\"int64\")\n clip_ids = clip_ids.numpy()\n # preds ensemble\n for ind in range(preds.shape[0]):\n vid_id = int(clip_ids[ind]) // self.num_clips\n ts_idx = int(clip_ids[ind]) % self.num_clips\n if vid_id not in self.clip_count:\n self.clip_count[vid_id] = []\n if ts_idx in self.clip_count[vid_id]:\n logger.info(\n \"[TEST] Passed!! read video {} clip index {} / {} repeatedly.\"\n .format(vid_id, ts_idx, clip_ids[ind]))\n else:\n self.clip_count[vid_id].append(ts_idx)\n self.video_preds[vid_id] += preds[ind] # ensemble method: sum\n if self.video_labels[vid_id].sum() > 0:\n assert self.video_labels[vid_id] == labels[ind]\n self.video_labels[vid_id] = labels[ind]\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(" + }, + { + "comment": "This code defines a class that accumulates metrics when all iterations are finished. It checks if the number of clips and their counts match, logging an error if not. Then, it converts video predictions and labels to Paddle tensors and calculates top-1 and top-5 accuracy using Paddle's metric library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/multi_crop_metric.py\":83-103", + "content": " batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n # check clip index of each video\n for key in self.clip_count.keys():\n if len(self.clip_count[key]) != self.num_clips or sum(\n self.clip_count[key]) != self.num_clips * (self.num_clips -\n 1) / 2:\n logger.info(\n \"[TEST] Count Error!! 
video [{}] clip count [{}] not match number clips {}\"\n .format(key, self.clip_count[key], self.num_clips))\n video_preds = paddle.to_tensor(self.video_preds)\n video_labels = paddle.to_tensor(self.video_labels)\n acc_top1 = paddle.metric.accuracy(input=video_preds,\n label=video_labels,\n k=1)\n acc_top5 = paddle.metric.accuracy(input=video_preds," + }, + { + "comment": "Calculates top-1 and top-5 accuracy using multi-crop metric for each video label, then logs the average accuracy values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/multi_crop_metric.py\":104-107", + "content": " label=video_labels,\n k=5)\n logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {} '.format(\n acc_top1.numpy(), acc_top5.numpy()))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3633f597-c414-410b-98c3-14cb08e86f41.json b/docs/doc/3633f597-c414-410b-98c3-14cb08e86f41.json new file mode 100644 index 000000000..164b2a132 --- /dev/null +++ b/docs/doc/3633f597-c414-410b-98c3-14cb08e86f41.json @@ -0,0 +1,25 @@ +{ + "summary": "The code implements max margin ranking loss and calculates cosine similarity between images and sentences, including a ContrastiveLoss class for contrastive learning. It also computes the cost for contrastive learning and video-level loss in T2VLAD models with masks, comparisons, and scalings.", + "details": [ + { + "comment": "This code snippet contains an implementation of the max margin ranking loss, modified from a source code, and includes functions to calculate cosine similarity between images and sentences. The original code is licensed under the Apache License 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/loss.py\":0-27", + "content": "\"\"\"This module contains an implementation of the max margin ranking loss, slightly\nmodified from this code:\nhttps://github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/loss.py\nThe modification is the `fix_norm` conditional, which removes zero terms from the\ndiagonal when performing the averaging calculation.\nOriginal licence below.\n\"\"\"\n# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\ndef cosine_sim(im, s):" + }, + { + "comment": "This code calculates cosine similarity between image and sentence pairs, and defines a ContrastiveLoss class to compute contrastive loss for contrastive learning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/loss.py\":28-60", + "content": " '''cosine similarity between all the image and sentence pairs\n '''\n inner_prod = im.mm(s.t())\n im_norm = paddle.sqrt((im ** 2).sum(axis=1).reshape([-1, 1]) + 1e-18) \n s_norm = paddle.sqrt((s ** 2).sum(axis=1).reshape([-1, 1]) + 1e-18)\n sim = inner_prod / (im_norm * s_norm)\n return sim\nclass ContrastiveLoss(nn.Layer):\n '''compute contrastive loss\n '''\n def __init__(self, margin=0, 
max_violation=True, direction='bi', topk=1):\n '''Args:\n direction: i2t for negative sentence, t2i for negative image, bi for both\n '''\n super().__init__()\n self.margin = margin\n self.max_violation = max_violation\n self.direction = direction\n self.topk = topk\n def forward(self, scores, margin=None, average_batch=True):\n '''\n Args:\n scores: image-sentence score matrix, (batch, batch)\n the same row of im and s are positive pairs, different rows are negative pairs\n '''\n if margin is None:\n margin = self.margin\n batch_size = scores.shape[0] \n diagonal = paddle.diagonal(scores).reshape([batch_size, 1])" + }, + { + "comment": "This code segment calculates the cost for negative pairs in a contrastive learning task. It first creates masks to clear diagonal values, then compares each diagonal score with scores within its column or row (depending on direction), and applies a margin to create positive pairs. The cost is calculated based on the max violation method and averaged according to specific conditions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/loss.py\":61-84", + "content": " # mask to clear diagonals which are positive pairs\n pos_masks = paddle.eye(batch_size).astype('bool') \n batch_topk = min(batch_size, self.topk)\n if self.direction == 'i2t' or self.direction == 'bi':\n d1 = diagonal.expand_as(scores) # same collumn for im2s (negative sentence)\n # compare every diagonal score to scores in its collumn\n # caption retrieval\n cost_s = (margin + scores - d1).clip(min=0)\n cost_s[pos_masks] = 0 \n if self.max_violation:\n cost_s, _ = paddle.topk(cost_s, batch_topk, axis=1)\n cost_s = cost_s / batch_topk\n if average_batch:\n cost_s = cost_s / batch_size\n else:\n if average_batch:\n cost_s = cost_s / (batch_size * (batch_size - 1))\n cost_s = paddle.sum(cost_s)\n if self.direction == 't2i' or self.direction == 'bi':\n d2 = diagonal.t().expand_as(scores) # same row for s2im (negative image)\n # compare every diagonal score to scores in its row\n cost_im = (margin + scores - d2).clip(min=0)" + }, + { + "comment": "This code calculates the video-level loss in a T2VLAD model. It first sets the positions of the correct matches to 0, then applies various scaling operations based on parameters. 
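To make the bidirectional margin computation being described here easier to follow, the following is a simplified NumPy illustration of the same idea; it is not the repo's paddle implementation and it omits the top-k "max violation" selection. Positives sit on the diagonal of the score matrix and every off-diagonal entry is treated as a negative.

```python
import numpy as np

def margin_ranking_loss(scores, margin=0.2):
    """scores: (B, B) similarity matrix; row i / column i form the positive pair."""
    batch_size = scores.shape[0]
    diag = np.diag(scores).reshape(batch_size, 1)

    # i2t direction: each negative in row i is compared against scores[i, i]
    cost_s = np.clip(margin + scores - diag, 0.0, None)
    # t2i direction: each negative in column j is compared against scores[j, j]
    cost_im = np.clip(margin + scores - diag.T, 0.0, None)

    # zero out the diagonal so positive pairs contribute no cost
    mask = np.eye(batch_size, dtype=bool)
    cost_s[mask] = 0.0
    cost_im[mask] = 0.0
    return (cost_s.sum() + cost_im.sum()) / batch_size

# toy check: well-separated scores incur no loss at this margin
print(margin_ranking_loss(np.array([[0.9, 0.1], [0.2, 0.8]])))  # 0.0
```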
Finally, it sums the resulting cost and returns the appropriate value depending on the direction (i2t or t2i).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/loss.py\":85-101", + "content": " cost_im[pos_masks] = 0 \n if self.max_violation:\n cost_im, _ = paddle.topk(cost_im, batch_topk, axis=0)\n cost_im = cost_im / batch_topk\n if average_batch:\n cost_im = cost_im / batch_size\n else:\n if average_batch:\n cost_im = cost_im / (batch_size * (batch_size - 1))\n cost_im = paddle.sum(cost_im)\n if self.direction == 'i2t':\n return cost_s\n elif self.direction == 't2i':\n return cost_im\n else:\n return cost_s + cost_im" + } + ] +} \ No newline at end of file diff --git a/docs/doc/36691932-c5e5-450a-a5f5-f7d207b048e3.json b/docs/doc/36691932-c5e5-450a-a5f5-f7d207b048e3.json new file mode 100644 index 000000000..26f4ed27b --- /dev/null +++ b/docs/doc/36691932-c5e5-450a-a5f5-f7d207b048e3.json @@ -0,0 +1,20 @@ +{ + "summary": "The script prepares PaddleVideo for TimeSformer benchmarking, downloads UCF101 dataset, and performs batch experiments with various configurations on one or eight GPUs.", + "details": [ + { + "comment": "This code snippet is part of a script for running the TimeSformer model benchmark in PaddleVideo. It sets up the environment, installs required dependencies, and copies necessary configuration files and data to ensure stable performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_all.sh\":0-19", + "content": "# \u63d0\u4f9b\u53ef\u7a33\u5b9a\u590d\u73b0\u6027\u80fd\u7684\u811a\u672c\uff0c\u9ed8\u8ba4\u5728\u6807\u51c6docker\u73af\u5883\u5185py37\u6267\u884c\uff1a paddlepaddle/paddle:latest-gpu-cuda10.2-cudnn7 paddle=2.1.2 py=37\n# \u6267\u884c\u76ee\u5f55\uff1a\u9700\u8bf4\u660e\nsed -i '/set\\ -xe/d' run_benchmark.sh\ncd ../../ # cd\u5230PaddleVideo\u9879\u76ee\u6839\u76ee\u5f55\u4e0b\ngit checkout benchmark_dev\nlog_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # benchmark\u7cfb\u7edf\u6307\u5b9a\u8be5\u53c2\u6570,\u4e0d\u9700\u8981\u8dd1profile\u65f6,log_path\u6307\u5411\u5b58speed\u7684\u76ee\u5f55\n# 1 \u5b89\u88c5\u8be5\u6a21\u578b\u9700\u8981\u7684\u4f9d\u8d56 (\u5982\u9700\u5f00\u542f\u4f18\u5316\u7b56\u7565\u8bf7\u6ce8\u660e)\npython -m pip install -r requirements.txt\n# 2 \u62f7\u8d1d\u8be5\u6a21\u578b\u9700\u8981\u6570\u636e\u3001\u9884\u8bad\u7ec3\u6a21\u578b\nunalias cp\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs1.yaml configs/recognition/timesformer/\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs1_mp.yaml configs/recognition/timesformer/\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs14.yaml configs/recognition/timesformer/\ncp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs14_mp.yaml configs/recognition/timesformer/\nif [ ! -f \"data/ucf101/trainlist_benchmark_mp.txt\" ]; then\n wget -P data/ucf101/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/trainlist_benchmark_mp.txt\nfi\nwget -P " + }, + { + "comment": "This script downloads and prepares the UCF101 dataset for the TimeSformer model in PaddleVideo. It checks if the user wants to download data or use local data, then proceeds accordingly. The script also sets up a loop to run batch experiments with different model modes (TimeSformer), floating point items (fp32, fp16), and batch sizes (1). The log name is based on the model mode, run mode (speed), batch size, and floating point item. 
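The log-file naming convention mentioned above is plain string concatenation; a hypothetical Python equivalent (the values are examples, not taken from a real run) is:

```python
model_mode, run_mode, bs_item, fp_item = "TimeSformer", "sp", 1, "fp32"
log_name = f"video_{model_mode}_{run_mode}_bs{bs_item}_{fp_item}"
print(log_name)  # video_TimeSformer_sp_bs1_fp32
```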
It uses CUDA_VISIBLE_DEVICES=0 to specify the GPU for execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_all.sh\":19-46", + "content": "data/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\nalias cp='cp -i'\ncd data/ucf101 # \u8fdb\u5165PaddleVideo/data/ucf101\nif [ $1 = \"down_data\" ];then\n wget --no-check-certificate \"https://www.crcv.ucf.edu/data/UCF101/UCF101.rar\" # \u4e0b\u8f7d\u8bad\u7ec3\u6570\u636e\n unrar x UCF101.rar # \u89e3\u538b\n mv ./UCF-101 ./videos # \u91cd\u547d\u540d\u6587\u4ef6\u5939\u4e3a./videos\n rm -rf ./UCF101.rar\nelse # \u4f7f\u7528\u672c\u5730\u6570\u636e\n rm -rf videos\n ln -s ${data_path}/dygraph_data/TSM/ucf101/videos ./videos\nfi\ncd ../../ # \u8fd4\u56dePaddleVideo\n# 3 \u6279\u91cf\u8fd0\u884c\uff08\u5982\u4e0d\u65b9\u4fbf\u6279\u91cf\uff0c1\uff0c2\u9700\u653e\u5230\u5355\u4e2a\u6a21\u578b\u4e2d\uff09\nmodel_mode_list=(TimeSformer)\nfp_item_list=(fp32 fp16)\nbs_item_list=(1) # 14\nfor model_mode in ${model_mode_list[@]}; do\n for fp_item in ${fp_item_list[@]}; do\n for bs_item in ${bs_item_list[@]}\n do\n run_mode=sp\n log_name=video_${model_mode}_${run_mode}_bs${bs_item}_${fp_item} # \u5982:clas_MobileNetv1_mp_bs32_fp32_8\n echo \"index is speed, 1gpus, begin, ${log_name}\"\n CUDA_VISIBLE_DEVICES=0 bash benchmark/${model_m" + }, + { + "comment": "The script iterates over different model modes, batch sizes, and floating-point types. It first runs a benchmark with one GPU and logs the results, then sleeps for 60 seconds. Next, it repeats the process but uses eight GPUs in parallel. The script aims to test various configurations and collect performance data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_all.sh\":46-56", + "content": "ode}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1\n sleep 60\n run_mode=mp\n log_name=video_${model_mode}_${run_mode}_bs${bs_item}_${fp_item} # \u5982:clas_MobileNetv1_mp_bs32_fp32_8\n echo \"index is speed, 8gpus, run_mode is multi_process, begin, ${log_name}\"\n CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/${model_mode}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1\n sleep 60\n done\n done\ndone" + } + ] +} \ No newline at end of file diff --git a/docs/doc/367d02ef-7ba3-457b-9be2-0fe49a2bf272.json b/docs/doc/367d02ef-7ba3-457b-9be2-0fe49a2bf272.json new file mode 100644 index 000000000..7a578bc30 --- /dev/null +++ b/docs/doc/367d02ef-7ba3-457b-9be2-0fe49a2bf272.json @@ -0,0 +1,20 @@ +{ + "summary": "This code provides instructions to detect UAVs in restricted areas using PaddleDetection, with data preparation and dependency installation steps. Users can customize the configuration file and trained model for specific use cases.", + "details": [ + { + "comment": "This code is for the Paddle-Anti-UAV application that uses PaddleDetection to detect flying UAVs in restricted areas. It provides details on data preparation, where to download and unzip the dataset, and how to install PaddleDetection.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/README.md\":0-20", + "content": "# Paddle-Anti-UAV\nAnti-UAV base on PaddleDetection\n## Background\nUAVs are very popular and we can see them in many public spaces, such as parks and playgrounds. 
Most people use UAVs for taking photos.\nHowever, many areas like airport forbiden UAVs since they are potentially dangerous. In this case, we need to detect the flying UAVs in\nthese areas.\nIn this repository, we show how to train a detection model using [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection).\n## Data preparation\nThe dataset can be found [here](https://anti-uav.github.io/dataset/). We direcly download the ```test-dev``` split composed of 140 videos\ntrain the detection model.\n* Download the ```test-dev``` dataset.\n* Run `unzip Anti_UAV_test_dev.zip -d Anti_UAV`.\n* Run `python get_image_label.py`. In this step, you may change the path to the videos and the value of `interval`.\nAfter the above steps, you will get a MSCOCO-style datasst for object detection.\n## Install PaddleDetection\nPlease refer to this [link](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/docs/tutorials/INSTALL.md)." + }, + { + "comment": "The code outlines the process to train and use PP-YOLO for UAV detection using PaddleDetection in a specific environment. It involves cloning a repository, moving dataset files, adjusting configurations, and running training and inference commands with specific arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/README.md\":22-35", + "content": "We use `python=3.7`, `Paddle=2.2.1`, `CUDA=10.2`.\n## Train PP-YOLO\nWe use [PP-YOLO](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.3/configs/ppyolo) as the detector.\n* Run `git clone https://github.com/PaddlePaddle/PaddleDetection.git`. Note that you should finish this step when you install PaddleDetection.\n* Move the anti-UAV dataset to `dataset`.\n* Move `anti_uav.yml` to `configs/datasets`, move `ppyolo_r50vd_dcn_1x_antiuav.yml` to `configs/ppyolo` and move `ppyolo_r50vd_dcn_antiuav.yml`\nto `configs/ppyolo/_base`.\n* Keep the value of `anchors` in `configs/ppyolo/_base/ppyolo_reader.yml` the same as `ppyolo_r50vd_dcn_antiuav.yml`.\n* Run `python -m paddle.distributed.launch --log_dir=./ppyolo_dygraph/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_antiuav.yml &>ppyolo_dygraph.log 2>&1 &`.\nNote that you may change the arguments, such as `batch_size` and `gups`.\n## Inference\nPlease refer to the infernce section on this [webpage](https://github.com/Paddle" + }, + { + "comment": "The code snippet is referring to the README file of an anti-UAV application based on PaddleVideo. It demonstrates two GIFs showing the demo in action and mentions that users can customize the configuration file and trained model for their own use cases.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/README.md\":35-38", + "content": "Paddle/PaddleDetection/blob/release/2.3/docs/tutorials/GETTING_STARTED.md). You can just switch the configeration file and trained model to your own files.\n![](https://github.com/qingzwang/Paddle-Anti-UAV/blob/main/demo1.gif)\n![](https://github.com/qingzwang/Paddle-Anti-UAV/blob/main/demo.gif)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3699dc05-7c89-49b1-93b8-78e4f66ac812.json b/docs/doc/3699dc05-7c89-49b1-93b8-78e4f66ac812.json new file mode 100644 index 000000000..9bd1745ee --- /dev/null +++ b/docs/doc/3699dc05-7c89-49b1-93b8-78e4f66ac812.json @@ -0,0 +1,20 @@ +{ + "summary": "This code sets up benchmark tests for TimeSformer video classification models in PaddleVideo, allowing users to customize parameters and analyze logs. 
The train() function is used for model training with specified parameters.", + "details": [ + { + "comment": "This script is a bash file for running benchmark tests on TimeSformer video classification models. It sets parameters such as single or multi-GPU mode, batch size, floating point precision, and model item. The function _train() will be used to train the model with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_benchmark.sh\":0-27", + "content": "#!/usr/bin/env bash\nset -xe\n# \u8fd0\u884c\u793a\u4f8b\uff1aCUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}\n# \u53c2\u6570\u8bf4\u660e\nfunction _set_params(){\n run_mode=${1:-\"sp\"} # \u5355\u5361sp|\u591a\u5361mp\n batch_size=${2:-\"1\"}\n fp_item=${3:-\"fp32\"} # fp32|fp16\n model_item=${4:-\"model_item\"}\n run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR \u540e\u7eedQA\u8bbe\u7f6e\u8be5\u53c2\u6570\n# \u6dfb\u52a0benchmark\u65e5\u5fd7\u89e3\u6790\u6240\u9700\u53c2\u6570\n base_batch_size=${batch_size}\n mission_name=\"\u89c6\u9891\u5206\u7c7b\"\n direction_id=\"0\"\n ips_unit=\"instance/sec\"\n skip_steps=10 # \u89e3\u6790\u65e5\u5fd7\uff0c\u6709\u4e9b\u6a21\u578b\u524d\u51e0\u4e2astep\u8017\u65f6\u957f\uff0c\u9700\u8981\u8df3\u8fc7 (\u5fc5\u586b)\n keyword=\"ips:\" # \u89e3\u6790\u65e5\u5fd7\uff0c\u7b5b\u9009\u51fa\u6570\u636e\u6240\u5728\u884c\u7684\u5173\u952e\u5b57 (\u5fc5\u586b)\n index=\"1\"\n model_name=${model_item}_bs${batch_size}_${fp_item}\n# \u4ee5\u4e0b\u4e0d\u7528\u4fee\u6539 \n device=${CUDA_VISIBLE_DEVICES//,/ }\n arr=(${device})\n num_gpu_devices=${#arr[*]}\n log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}\n}\nfunction _train(){\n echo \"Train on ${num_gpu_devices} GPUs\"" + }, + { + "comment": "This code is running a benchmark for the TimeSformer model in PaddleVideo. It checks if fp_item is either 'fp32' or 'fp16', and then calls the main script with the appropriate configuration file, based on the mode (sp or mp) and the chosen precision. The output logs are directed to a specified directory for analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_benchmark.sh\":28-47", + "content": " echo \"current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size\"\n case ${run_mode} in\n sp) \n if [ ${fp_item} == 'fp32' ]; then\n train_cmd=\"python -u main.py -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}.yaml\"\n elif [ ${fp_item} == 'fp16' ]; then\n train_cmd=\"python -u main.py --amp -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}.yaml\"\n else\n echo \"choose fp_item(fp32 or fp16)\"\n exit 1\n fi;;\n mp)\n rm -rf ./mylog\n if [ ${fp_item} == 'fp32' ]; then\n train_cmd=\"python -u -B -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES --log_dir=./mylog main.py \\\n -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}_mp.yaml\"\n log_parse_file=\"mylog/workerlog.0\"\n elif [ ${fp_item} == 'fp16' ]; then\n train_cmd=\"python -u -B -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES --log_dir=./mylog main.py --amp \\" + }, + { + "comment": "This code is part of a shell script for benchmarking the TimeSformer model. It sets up the command to run the training and checks for specific parameters like batch size and precision. 
It then executes the command with timeout, logs the result as success or failure, and removes intermediate log files if running in multi-process mode. The script also sources a separate file for further analysis of the log data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/run_benchmark.sh\":48-76", + "content": " -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}_mp.yaml\"\n log_parse_file=\"mylog/workerlog.0\"\n else\n echo \"choose fp_item(fp32 or fp16)\"\n exit 1\n fi;;\n *) echo \"choose run_mode(sp or mp)\"; exit 1;\n esac\n# \u4ee5\u4e0b\u4e0d\u7528\u4fee\u6539\n timeout 15m ${train_cmd} > ${log_file} 2>&1\n if [ $? -ne 0 ];then\n echo -e \"${model_name}, FAIL\"\n export job_fail_flag=1\n else\n echo -e \"${model_name}, SUCCESS\"\n export job_fail_flag=0\n fi\n kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n if [ $run_mode = \"mp\" -a -d mylog ]; then\n rm ${log_file}\n cp mylog/workerlog.0 ${log_file}\n fi\n}\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh # \u5728\u8be5\u811a\u672c\u4e2d\u4f1a\u5bf9\u7b26\u5408benchmark\u89c4\u8303\u7684log\u4f7f\u7528analysis.py \u811a\u672c\u8fdb\u884c\u6027\u80fd\u6570\u636e\u89e3\u6790;\u8be5\u811a\u672c\u5728\u8fde\u8c03\u65f6\u53ef\u4ecebenchmark repo\u4e2d\u4e0b\u8f7dhttps://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;\u5982\u679c\u4e0d\u8054\u8c03\u53ea\u60f3\u8981\u4ea7\u51fa\u8bad\u7ec3log\u53ef\u4ee5\u6ce8\u6389\u672c\u884c,\u63d0\u4ea4\u65f6\u9700\u6253\u5f00\n_set_params $@\n# _train # \u5982\u679c\u53ea\u60f3\u4ea7\u51fa\u8bad\u7ec3log,\u4e0d\u89e3\u6790,\u53ef\u53d6\u6d88\u6ce8\u91ca\n_run # \u8be5\u51fd\u6570\u5728run_model.sh\u4e2d,\u6267\u884c\u65f6\u4f1a\u8c03\u7528_train; \u5982\u679c\u4e0d\u8054\u8c03\u53ea\u60f3\u8981\u4ea7\u51fa\u8bad\u7ec3log\u53ef\u4ee5\u6ce8\u6389\u672c\u884c,\u63d0\u4ea4\u65f6\u9700\u6253\u5f00" + } + ] +} \ No newline at end of file diff --git a/docs/doc/36a13f58-7b2d-47c3-8431-d877fccb0deb.json b/docs/doc/36a13f58-7b2d-47c3-8431-d877fccb0deb.json new file mode 100644 index 000000000..9aba063e7 --- /dev/null +++ b/docs/doc/36a13f58-7b2d-47c3-8431-d877fccb0deb.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a Python class for the UCF101 Skeleton Dataset in PaddleVideo, loading skeleton features and normalizing data for action recognition tasks. The dataset includes train and test methods for preparing frames with `prepare_train` and `prepare_test` functions.", + "details": [ + { + "comment": "This code snippet is a Python class for UCF101 Skeleton Dataset in PaddleVideo. It loads skeleton features and applies normalization operations, registering the dataset for action recognition tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf101_skeleton.py\":0-34", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport pickle\nimport paddle\nfrom paddle.io import Dataset\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass UCF101SkeletonDataset(BaseDataset):\n \"\"\"\n Skeleton dataset for action recognition.\n The dataset loads skeleton feature, and apply norm operatations." + }, + { + "comment": "This code defines a class that loads annotation data from a file, specifically for the UCF101 dataset's skeleton information. It takes arguments such as the file path, pipeline object, and whether it's building a test dataset. The load_file method checks if the file is a .pkl file and calls load_pkl_annotations to get video information. If the split argument is provided, it only uses the specified part of the data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf101_skeleton.py\":35-65", + "content": " Args:\n file_path (str): Path to the index file.\n pipeline(obj): Define the pipeline of data preprocessing.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n split,\n repeat_times,\n test_mode=False):\n self.split = split\n self.repeat_times = repeat_times\n super().__init__(file_path, pipeline, test_mode=test_mode)\n self._ori_len = len(self.info)\n self.start_index = 0\n self.modality = \"Pose\"\n def load_file(self):\n \"\"\"Load annotation file to get video information.\"\"\"\n assert self.file_path.endswith('.pkl')\n return self.load_pkl_annotations()\n def load_pkl_annotations(self):\n with open(self.file_path, \"rb\") as f:\n data = pickle.load(f)\n if self.split:\n split, data = data['split'], data['annotations']\n identifier = 'filename' if 'filename' in data[0] else 'frame_dir'" + }, + { + "comment": "This code defines a dataset for PaddleVideo, containing train and test methods for preparing frames. The `prepare_train` and `prepare_test` functions create new results by copying the original information, setting modality and start index based on the given index. 
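As a side note on the `load_pkl_annotations` step described above, here is a minimal sketch of the split-filtering logic, assuming the pickle layout `{'split': ..., 'annotations': ...}`; the file name and split name below are hypothetical.

```python
import pickle

def load_split(pkl_path="ucf101_skeleton.pkl", split_name="train1"):
    """Return only the annotations that belong to the requested split."""
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)
    split, annotations = data["split"], data["annotations"]
    # some dumps key samples by 'filename', others by 'frame_dir'
    identifier = "filename" if "filename" in annotations[0] else "frame_dir"
    keep = set(split[split_name])
    return [sample for sample in annotations if sample[identifier] in keep]
```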
The `__len__` function returns the size of the dataset by multiplying the number of info items with repeat times.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf101_skeleton.py\":66-88", + "content": " data = [x for x in data if x[identifier] in split[self.split]]\n return data\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training given the index.\"\"\"\n results = copy.deepcopy(self.info[idx % self._ori_len])\n results['modality'] = self.modality\n results['start_index'] = self.start_index\n return self.pipeline(results)\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for testing given the index.\"\"\"\n results = copy.deepcopy(self.info[idx % self._ori_len])\n results['modality'] = self.modality\n results['start_index'] = self.start_index\n return self.pipeline(results)\n def __len__(self):\n \"\"\"get the size of the dataset.\"\"\"\n return len(self.info) * self.repeat_times" + } + ] +} \ No newline at end of file diff --git a/docs/doc/37739699-d923-4eae-b76e-e27b23c261f6.json b/docs/doc/37739699-d923-4eae-b76e-e27b23c261f6.json new file mode 100644 index 000000000..446c7ea23 --- /dev/null +++ b/docs/doc/37739699-d923-4eae-b76e-e27b23c261f6.json @@ -0,0 +1,25 @@ +{ + "summary": "The Bash script configures a model serving environment, sets up an API server, transfers the model using provided Python code, and handles cleanup tasks. It also adjusts alias names, logs paths, and CUDA visible devices while running video processing pipeline tests.", + "details": [ + { + "comment": "This Bash script is parsing a configuration file and extracting various parameters for running model inference. It assigns values to variables such as 'model_name', 'python_list', and others that will be used later in the code. The purpose is to set up an environment for serving the model and potentially run inferences on videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_python.sh\":0-28", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\ndataline=$(awk 'NR==1, NR==18{print}' $FILENAME)\nMODE=$2\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# parser serving\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython_list=$(func_parser_value \"${lines[2]}\")\ntrans_model_py=$(func_parser_value \"${lines[3]}\")\ninfer_model_dir_key=$(func_parser_key \"${lines[4]}\")\ninfer_model_dir_value=$(func_parser_value \"${lines[4]}\")\nmodel_filename_key=$(func_parser_key \"${lines[5]}\")\nmodel_filename_value=$(func_parser_value \"${lines[5]}\")\nparams_filename_key=$(func_parser_key \"${lines[6]}\")\nparams_filename_value=$(func_parser_value \"${lines[6]}\")\nserving_server_key=$(func_parser_key \"${lines[7]}\")\nserving_server_value=$(func_parser_value \"${lines[7]}\")\nserving_client_key=$(func_parser_key \"${lines[8]}\")\nserving_client_value=$(func_parser_value \"${lines[8]}\")\nserving_dir_value=$(func_parser_value \"${lines[9]}\")\nweb_service_py=$(func_parser_value \"${lines[10]}\")\npipeline_py=$(func_parser_value \"${lines[11]}\")\nvideo_dir_key=$(func_parser_key \"${lines[12]}\")" + }, + { + "comment": "The code defines a function `func_serving` that takes Python executable path, script, model directory, and sets various parameters for serving. It then executes a command to transfer the model to the specified server or client using a provided Python script. 
The output is logged in the `trans_log` file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_python.sh\":29-53", + "content": "video_dir_value=$(func_parser_value \"${lines[12]}\")\nLOG_PATH=\"./test_tipc/output/log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_serving.log\"\nfunction func_serving(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n # python serving code\n set_dirname=$(func_set_params \"${infer_model_dir_key}\" \"${infer_model_dir_value}\")\n set_model_filename=$(func_set_params \"${model_filename_key}\" \"${model_filename_value}\")\n set_params_filename=$(func_set_params \"${params_filename_key}\" \"${params_filename_value}\")\n set_serving_server=$(func_set_params \"${serving_server_key}\" \"${serving_server_value}\")\n set_serving_client=$(func_set_params \"${serving_client_key}\" \"${serving_client_value}\")\n python_list=(${python_list})\n python=${python_list[0]}\n trans_log=\"${LOG_PATH}/python_trans_model.log\"\n trans_model_cmd=\"${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client} > ${trans_log} 2>&1 \"" + }, + { + "comment": "This code modifies alias names in configuration files, sets log paths and starts a web service using Python. It also checks the status of the service, sleeps for 30 seconds, and saves logs into a specific path. The code is executed within a specific directory and sets environment variables before running the commands.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_python.sh\":55-76", + "content": " eval ${trans_model_cmd}\n # modify the alias name of fetch_var to \"outputs\"\n server_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_server_value/serving_server_conf.prototxt\"\n eval ${server_fetch_var_line_cmd}\n client_fetch_var_line_cmd=\"sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \\\"outputs\\\"/' $serving_client_value/serving_client_conf.prototxt\"\n eval ${client_fetch_var_line_cmd}\n cd ${serving_dir_value}\n echo 'PWD= '$PWD\n unset https_proxy\n unset http_proxy\n server_log_path=\"${LOG_PATH}/python_server_gpu.log\"\n web_service_cmd=\"${python} ${web_service_py} > ${server_log_path} 2>&1 &\"\n eval $web_service_cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${web_service_cmd}\" \"${status_log}\" \"${model_name}\"\n sleep 30s # not too short is ok\n _save_log_path=\"../../${LOG_PATH}/python_server_infer_gpu_batchsize_1.log\"\n set_video_dir=$(func_set_params \"${video_dir_key}\" \"${video_dir_value}\")" + }, + { + "comment": "This code is setting up the environment and running a test for a video processing pipeline. 
It sets the CUDA visible devices, runs the test using specified command, and performs clean-up by killing related processes after the test.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_serving_infer_python.sh\":77-104", + "content": " pipeline_cmd=\"${python} ${pipeline_py} ${set_video_dir} > ${_save_log_path} 2>&1 \"\n eval $pipeline_cmd\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n cd ../../\n status_check $last_status \"${pipeline_cmd}\" \"${status_log}\" \"${model_name}\"\n ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9\n}\n# set cuda device\nGPUID=$3\nif [ ${#GPUID} -le 0 ];then\n env=\" \"\nelse\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\nfi\nset CUDA_VISIBLE_DEVICES\neval $env\necho \"################### run test ###################\"\nexport Count=0\nIFS=\"|\"\nfunc_serving \"${web_service_cmd}\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/37ab8cf2-e1d0-4276-9749-038da87d02f7.json b/docs/doc/37ab8cf2-e1d0-4276-9749-038da87d02f7.json new file mode 100644 index 000000000..6bbac7992 --- /dev/null +++ b/docs/doc/37ab8cf2-e1d0-4276-9749-038da87d02f7.json @@ -0,0 +1,15 @@ +{ + "summary": "The code defines global parameters for the VGGish model, including architectural constants, hyperparameters, and optimizer settings. It extracts audio features from spectrogram patches using PCA quantization and embedding processing, with options to adjust STFT window and hop lengths, mel frequency bins, and learning rate.", + "details": [ + { + "comment": "This code sets global parameters for the VGGish model. It defines architectural constants, hyperparameters for feature and example generation, embedding postprocessing, and training. The VGGish model is used to extract audio features from spectrogram patches, with options for PCA-based quantization and embedding processing. Hyperparameters control the STFT window and hop lengths, mel frequency bins, and learning rate for Adam optimizer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py\":0-28", + "content": "\"\"\"Global parameters for the VGGish model.\nSee vggish_slim.py for more information.\n\"\"\"\n# Architectural constants.\nNUM_FRAMES = 50 # Frames in input mel-spectrogram patch.\nNUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch.\nEMBEDDING_SIZE = 128 # Size of embedding layer.\n# Hyperparameters used in feature and example generation.\nSAMPLE_RATE = 16000\nSTFT_WINDOW_LENGTH_SECONDS = 0.040\nSTFT_HOP_LENGTH_SECONDS = 0.020\nNUM_MEL_BINS = NUM_BANDS\nMEL_MIN_HZ = 125\nMEL_MAX_HZ = 7500\nLOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram.\nEXAMPLE_WINDOW_SECONDS = 1.00 # Each example contains 96 10ms frames\nEXAMPLE_HOP_SECONDS = 1.00 # with zero overlap.\n# Parameters used for embedding postprocessing.\nPCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'\nPCA_MEANS_NAME = 'pca_means'\nQUANTIZE_MIN_VAL = -2.0\nQUANTIZE_MAX_VAL = +2.0\n# Hyperparameters used in training.\nINIT_STDDEV = 0.01 # Standard deviation used to initialize weights.\nLEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer." + }, + { + "comment": "This code sets the Adam optimizer's epsilon value to 1e-8, defines names for input and output operations, tensors, and features. 
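A quick arithmetic check ties the VGGish constants described above together: with a 1.0 s example window and a 20 ms STFT hop, each mel-spectrogram patch works out to 50 frames, matching `NUM_FRAMES = 50` (the in-code remark about "96 10ms frames" appears to be carried over from the original VGGish defaults).

```python
SAMPLE_RATE = 16000
STFT_WINDOW_LENGTH_SECONDS = 0.040
STFT_HOP_LENGTH_SECONDS = 0.020
EXAMPLE_WINDOW_SECONDS = 1.00

# round() guards against floating-point noise in the products/divisions
window_samples = round(SAMPLE_RATE * STFT_WINDOW_LENGTH_SECONDS)        # 640
hop_samples = round(SAMPLE_RATE * STFT_HOP_LENGTH_SECONDS)              # 320
frames_per_example = round(EXAMPLE_WINDOW_SECONDS / STFT_HOP_LENGTH_SECONDS)
print(window_samples, hop_samples, frames_per_example)                  # 640 320 50
```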
It also assigns the name \"audio_embedding\" to a feature.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py\":29-36", + "content": "ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer.\n# Names of ops, tensors, and features.\nINPUT_OP_NAME = 'vggish/input_features'\nINPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'\nOUTPUT_OP_NAME = 'vggish/embedding'\nOUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'\nAUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'" + } + ] +} \ No newline at end of file diff --git a/docs/doc/382773ae-ed48-48ec-bf7c-7d0ca872f997.json b/docs/doc/382773ae-ed48-48ec-bf7c-7d0ca872f997.json new file mode 100644 index 000000000..bb6e20e68 --- /dev/null +++ b/docs/doc/382773ae-ed48-48ec-bf7c-7d0ca872f997.json @@ -0,0 +1,20 @@ +{ + "summary": "This Python script loads Baidu Cloud models for video classification, extracts audio and pcm features, logs details, saves features in a pickle file, creates \"features\" directory if necessary, and classifies videos from a specified dataset directory.", + "details": [ + { + "comment": "This Python script is for the Baidu Cloud action, loading and initializing image and audio models according to a given configuration file. It also provides a function to classify videos. The script logs information about model loading time and the progress of video classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_feat.py\":0-49", + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport shutil\nimport numpy as np\nsys.path.append(\"../predict/action_detect\")\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport utils.config_utils as config_utils\nimport logger\nlogger = logger.Logger()\ndef load_model(cfg_file=\"configs/configs.yaml\"):\n \"\"\"\n load_model\n \"\"\"\n logger.info(\"load model ... \")\n global infer_configs\n infer_configs = parse_config(cfg_file)\n print_configs(infer_configs, \"Infer\")\n t0 = time.time()\n global image_model, audio_model\n image_model = image_model.InferModel(infer_configs)\n audio_model = audio_model.InferModel(infer_configs)\n t1 = time.time()\n logger.info(\"step0: load model time: {} min\\n\".format((t1 - t0) * 1.0 / 60))\ndef video_classify(video_name):\n \"\"\"\n extract_feature\n \"\"\"\n logger.info('predict ... ')" + }, + { + "comment": "Extracting video features, specifically images and audio. Converting extracted features to numpy arrays. 
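For reference, a minimal sketch of the save step summarized above: pack the per-modality arrays into a dict and pickle it next to the video. The path and array shapes below are placeholders, not values produced by the extractor.

```python
import os
import pickle
import numpy as np

video_name = "datasets/EuroCup2016/mp4/example.mp4"   # hypothetical path
video_features = {
    "image_feature": np.zeros((300, 2048), dtype=np.float32),  # placeholder shapes
    "audio_feature": np.zeros((300, 1024), dtype=np.float32),
    "pcm_feature": np.zeros((300, 640), dtype=np.float32),
}
# mp4/<name>.mp4 -> features/<name>.pkl, mirroring the path rewrite described above
feature_path = video_name.replace(".mp4", ".pkl").replace("mp4", "features")
os.makedirs(os.path.dirname(feature_path), exist_ok=True)
with open(feature_path, "wb") as fout:
    fout.write(pickle.dumps(video_features, protocol=pickle.HIGHEST_PROTOCOL))
```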
Logging shapes of the arrays and time taken for feature extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_feat.py\":50-73", + "content": " logger.info(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n # step 1: extract feature\n t0 = time.time()\n image_path_list = get_images(imgs_path)\n infer_configs['PPTSM']['frame_list'] = image_path_list\n infer_configs['AUDIO']['pcm_file'] = pcm_path\n image_features = image_model.predict(infer_configs)\n audio_features, pcm_features = audio_model.predict(infer_configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n t1 = time.time()\n logger.info('{} {} {}'.format(np_image_features.shape,\n np_audio_features.shape,\n np_pcm_features.shape))\n logger.info(\"step1: feature extract time: {} min\".format(\n (t1 - t0) * 1.0 / 60))\n video_features = {\n 'image_feature': np_image_features," + }, + { + "comment": "The code extracts audio and pcm features from video files, saves them in a pickle file named after the original video, creates a \"features\" directory if it doesn't exist, then classifies each video based on its location in a specified dataset directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_feat.py\":74-99", + "content": " 'audio_feature': np_audio_features,\n 'pcm_feature': np_pcm_features\n }\n # save feature\n feature_path = video_name.replace(\".mp4\", \".pkl\").replace(\"mp4\", \"features\")\n feat_pkl_str = pickle.dumps(video_features,\n protocol=pickle.HIGHEST_PROTOCOL)\n with open(feature_path, 'wb') as fout:\n fout.write(feat_pkl_str)\nif __name__ == '__main__':\n dataset_dir = \"../datasets/EuroCup2016\"\n if not os.path.exists(dataset_dir + '/features'):\n os.mkdir(dataset_dir + '/features')\n load_model()\n video_url = os.path.join(dataset_dir, 'url.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n for line in lines:\n video_classify(line)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3868dce9-c7e8-4031-a111-90b800d1f64e.json b/docs/doc/3868dce9-c7e8-4031-a111-90b800d1f64e.json new file mode 100644 index 000000000..8dd782b66 --- /dev/null +++ b/docs/doc/3868dce9-c7e8-4031-a111-90b800d1f64e.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a dataset class for loading video information, requires subclassing to define load_file and prepare_train/test methods, prepares data for training/testing, and addresses DataLoader's dict type handling limitation.", + "details": [ + { + "comment": "Base class for datasets, subclass it. Subclasses should overwrite load_file (load info from index file), prepare_train (provide train data), and prepare_test (provide test data).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py\":0-33", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os.path as osp\nimport copy\nimport numpy as np\nfrom abc import ABC, abstractmethod\nimport paddle\nfrom paddle.io import Dataset\nclass BaseDataset(Dataset, ABC):\n \"\"\"Base class for datasets\n All datasets should subclass it.\n All subclass should overwrite:\n - Method: `load_file`, load info from index file.\n - Method: `prepare_train`, providing train data.\n - Method: `prepare_test`, providing test data." + }, + { + "comment": "This code defines a class for loading video information from an index file path. It takes arguments such as the file_path, pipeline, data_prefix, and test_mode. The load_file method abstractly loads the video information from the index file. The prepare_train method prepares data for training/valid given the index. Note: DataLoader cannot support dict type retval, so it converts to list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py\":35-61", + "content": " Args:\n file_path (str): index file path.\n pipeline (Sequence XXX)\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): whether to build test dataset. Default: False.\n \"\"\"\n def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):\n super().__init__()\n self.file_path = file_path\n self.data_prefix = osp.realpath(data_prefix) if \\\n data_prefix is not None and osp.isdir(data_prefix) else data_prefix\n self.test_mode = test_mode\n self.pipeline = pipeline\n self.info = self.load_file()\n @abstractmethod\n def load_file(self):\n \"\"\"load the video information from the index file path.\"\"\"\n pass\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)" + }, + { + "comment": "The code defines a dataset class with methods for preparing data for training and testing, as well as returning the size of the dataset. The test_mode flag is used to determine whether to use the prepare_test or prepare_train method when accessing the dataset. 
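The tuple-instead-of-dict convention noted above can be seen end to end with a toy in-memory dataset; this is a hedged sketch, not the repo's `BaseDataset`, and the random clip shape is arbitrary.

```python
import numpy as np
from paddle.io import Dataset, DataLoader

class ToyClipDataset(Dataset):
    """Returns (imgs, label) tuples so the default DataLoader collation works."""
    def __init__(self, num_samples=8, test_mode=False):
        super().__init__()
        self.test_mode = test_mode
        self.info = [
            {"imgs": np.random.rand(3, 8, 32, 32).astype("float32"),
             "labels": i % 2}
            for i in range(num_samples)
        ]

    def _prepare(self, idx):
        # a real dataset would deep-copy the record and run its pipeline here
        results = self.info[idx]
        return results["imgs"], np.array([results["labels"]])

    def __getitem__(self, idx):
        # train and test preparation coincide in this toy version
        return self._prepare(idx)

    def __len__(self):
        return len(self.info)

imgs, labels = next(iter(DataLoader(ToyClipDataset(), batch_size=4)))
print(imgs.shape, labels.shape)   # [4, 3, 8, 32, 32] and [4, 1]
```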
Paddle.io.DataLoader cannot currently handle dict type return values, so they are converted to lists within these methods.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py\":62-82", + "content": " #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def __len__(self):\n \"\"\"get the size of the dataset.\"\"\"\n return len(self.info)\n def __getitem__(self, idx):\n \"\"\" Get the sample for either training or testing given index\"\"\"\n if self.test_mode:\n return self.prepare_test(idx)\n else:\n return self.prepare_train(idx)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/386fe09e-4bc7-49c0-96ec-e7f0c2a9ba17.json b/docs/doc/386fe09e-4bc7-49c0-96ec-e7f0c2a9ba17.json new file mode 100644 index 000000000..6dfe2d467 --- /dev/null +++ b/docs/doc/386fe09e-4bc7-49c0-96ec-e7f0c2a9ba17.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file is a part of the PaddleVideo library and contains a function named \"build_metric\". It imports necessary modules, defines a metric registry, and provides a build function to construct metrics according to the specified configuration (cfg). The code is licensed under Apache License 2.0.", + "details": [ + { + "comment": "This code file is a part of the PaddleVideo library and contains a function named \"build_metric\". It imports necessary modules, defines a metric registry, and provides a build function to construct metrics according to the specified configuration (cfg). The code is licensed under Apache License 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/build.py\":0-22", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .registry import METRIC\nfrom ..utils import build\ndef build_metric(cfg):\n \"\"\"build metric\"\"\"\n return build(cfg, METRIC)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/387159e6-5d8e-4b52-a71c-0d46a3d29883.json b/docs/doc/387159e6-5d8e-4b52-a71c-0d46a3d29883.json new file mode 100644 index 000000000..f82b17c30 --- /dev/null +++ b/docs/doc/387159e6-5d8e-4b52-a71c-0d46a3d29883.json @@ -0,0 +1,10 @@ +{ + "summary": "This script sets environment variables for GPU usage and then runs the \"inference.py\" Python script from the \"scenario_lib\" directory, specifying a model name (AttentionLstmErnie), configuration file path (./conf/conf.txt), saving inference models path (inference_models_save), and output file for results (output.json).", + "details": [ + { + "comment": "This script sets environment variables for GPU usage and then runs the \"inference.py\" Python script from the \"scenario_lib\" directory, specifying a model name (AttentionLstmErnie), configuration file path (./conf/conf.txt), saving inference models path (inference_models_save), and output file for results (output.json).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/inference.sh\":0-11", + "content": "# inference sh \nexport CUDA_VISIBLE_DEVICES=0\nexport FLAGS_eager_delete_tensor_gb=0.0\nexport FLAGS_sync_nccl_allreduce=1\nexport FLAGS_fast_eager_deletion_mode=1\nexport FLAGS_fraction_of_gpu_memory_to_use=0.5\nexport FLAGS_reallocate_gpu_memory_in_mb=0\nexport FLAGS_memory_fraction_of_eager_deletion=1\npython scenario_lib/inference.py --model_name=AttentionLstmErnie \\\n--config=./conf/conf.txt \\\n--save_inference_model=inference_models_save \\\n--output='output.json'" + } + ] +} \ No newline at end of file diff --git a/docs/doc/38e736e9-f0a8-46f8-9d49-1e3493acdee5.json b/docs/doc/38e736e9-f0a8-46f8-9d49-1e3493acdee5.json new file mode 100644 index 000000000..2c252c417 --- /dev/null +++ b/docs/doc/38e736e9-f0a8-46f8-9d49-1e3493acdee5.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines functions to build components of a computer vision model, including backbone, head, loss, recognizer, and a model builder that selects the appropriate builder based on framework type.", + "details": [ + { + "comment": "The code is defining functions to build different components of a computer vision model. The `build_backbone`, `build_head`, and `build_loss` functions are used to build the backbone, head, and loss for the model respectively using the `build` function from the `utils` module. The `build_recognizer` function is used to build a recognizer component for the model using the specified framework key.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py\":0-35", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS\nfrom ..utils import build\ndef build_backbone(cfg):\n \"\"\"Build backbone.\"\"\"\n return build(cfg, BACKBONES)\ndef build_head(cfg):\n \"\"\"Build head.\"\"\"\n return build(cfg, HEADS)\ndef build_loss(cfg):\n \"\"\"Build loss.\"\"\"\n return build(cfg, LOSSES)\ndef build_recognizer(cfg):\n \"\"\"Build recognizer.\"\"\"\n return build(cfg, RECOGNIZERS, key='framework')" + }, + { + "comment": "This code defines functions for building localizer and model, and a build_model function that selects the appropriate builder based on the framework type specified in the configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py\":38-51", + "content": "def build_localizer(cfg):\n \"\"\"Build localizer.\"\"\"\n return build(cfg, LOCALIZERS, key='framework')\ndef build_model(cfg):\n cfg_copy = cfg.copy()\n framework_type = cfg_copy.get('framework')\n if framework_type in RECOGNIZERS:\n return build_recognizer(cfg)\n elif framework_type in LOCALIZERS:\n return build_localizer(cfg)\n else:\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/398b9feb-19d9-447a-bdeb-ce6acf427f06.json b/docs/doc/398b9feb-19d9-447a-bdeb-ce6acf427f06.json new file mode 100644 index 000000000..5f9e5a8c4 --- /dev/null +++ b/docs/doc/398b9feb-19d9-447a-bdeb-ce6acf427f06.json @@ -0,0 +1,35 @@ +{ + "summary": "This code initializes a logger, defines functions for logging metrics such as loss and learning rate during training. It also creates a class for tracking various metrics with update method and logs batch metrics at specified batch IDs in log_batch function, while formatting log string with colors for clarity in video processing tasks.", + "details": [ + { + "comment": "This code imports necessary libraries and defines functions for logging metrics such as loss, learning rate during training. It also initializes a logger with the name \"paddlevideo\" and specifies the available classes or functions that can be accessed from this file. The build_record function takes a configuration file and creates an ordered dictionary of metrics to record based on the framework type specified in the configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections import OrderedDict\nimport paddle\nfrom .logger import coloring, get_logger\nlogger = get_logger(\"paddlevideo\")\n__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch']\ndef build_record(cfg):\n framework_type = cfg.get('framework', '')\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in framework_type: #TODO: required specify str in framework" + }, + { + "comment": "This code is part of a function that handles recording different metrics for various framework types. It appends specific metric names and instances of the AverageMeter class to the record_list depending on the framework type. If 'Recognizer' is in the framework type, it records 'top1' and 'top5' metrics. If 'FastRCNN' is present, it records a series of recall and precision metrics along with mAP@0.5IOU. The function continues with more conditions for different framework types after this snippet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py\":32-48", + "content": " record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))\n record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n elif 'Recognizer' in framework_type:\n record_list.append((\"top1\", AverageMeter(\"top1\", '.5f')))\n record_list.append((\"top5\", AverageMeter(\"top5\", '.5f')))\n elif 'FastRCNN' in framework_type:\n record_list.append(\n (\"recall@thr=0.5\", AverageMeter(\"recall@thr=0.5\", '.5f')))\n record_list.append(\n (\"prec@thr=0.5\", AverageMeter(\"prec@thr=0.5\", '.5f')))\n record_list.append((\"recall@top3\", AverageMeter(\"recall@top3\", '.5f')))\n record_list.append((\"prec@top3\", AverageMeter(\"prec@top3\", '.5f')))\n record_list.append((\"recall@top5\", AverageMeter(\"recall@top5\", '.5f')))\n record_list.append((\"prec@top5\", AverageMeter(\"prec@top5\", '.5f')))\n record_list.append((\"mAP@0.5IOU\", AverageMeter(\"mAP@0.5IOU\", '.5f')))\n elif 'DepthEstimator' in cfg.framework:" + }, + { + "comment": "This code defines a list of metrics to be tracked and an AverageMeter class that computes and stores the average and current value for each metric. The list includes various metrics like \"abs_rel\", \"sq_rel\", \"rmse\", \"rmse_log\", \"a1\", \"a2\", \"a3\", \"losses_day\", \"losses_night\", \"batch_time\", and \"reader_time\". 
The list is then converted to an OrderedDict.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py\":49-71", + "content": " record_list.append((\"abs_rel\", AverageMeter(\"abs_rel\", '.5f')))\n record_list.append((\"sq_rel\", AverageMeter(\"sq_rel\", '.5f')))\n record_list.append((\"rmse\", AverageMeter(\"rmse\", '.5f')))\n record_list.append((\"rmse_log\", AverageMeter(\"rmse_log\", '.5f')))\n record_list.append((\"a1\", AverageMeter(\"a1\", '.5f')))\n record_list.append((\"a2\", AverageMeter(\"a2\", '.5f')))\n record_list.append((\"a3\", AverageMeter(\"a3\", '.5f')))\n record_list.append((\"losses_day\", AverageMeter(\"losses_day\", '.5f')))\n record_list.append(\n (\"losses_night\", AverageMeter(\"losses_night\", '.5f')))\n record_list.append((\"batch_time\", AverageMeter('batch_cost', '.5f')))\n record_list.append((\"reader_time\", AverageMeter('reader_cost', '.5f')))\n record_list = OrderedDict(record_list)\n return record_list\nclass AverageMeter(object):\n \"\"\"\n Computes and stores the average and current value\n \"\"\"\n def __init__(self, name='', fmt='f', need_avg=True):\n self.name = name" + }, + { + "comment": "This code defines a class for tracking metrics such as sum, count, average, and total values. The `update` method allows updating the metric with a new value, while the `total`, `total_minute`, `mean`, and `value` properties retrieve the current metric value in different formats. The `log_batch` function logs batch metrics for a list of metrics at a specified batch ID.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py\":72-112", + "content": " self.fmt = fmt\n self.need_avg = need_avg\n self.reset()\n def reset(self):\n \"\"\" reset \"\"\"\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n \"\"\" update \"\"\"\n if isinstance(val, paddle.Tensor):\n val = float(val)\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count\n @property\n def total(self):\n return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self)\n @property\n def total_minute(self):\n return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60,\n self=self)\n @property\n def mean(self):\n return '{self.name}_avg: {self.avg:{self.fmt}}'.format(\n self=self) if self.need_avg else ''\n @property\n def value(self):\n return '{self.name}: {self.val:{self.fmt}}'.format(self=self)\ndef log_batch(metric_list,\n batch_id," + }, + { + "comment": "This function logs epoch metrics and step information for a video processing task. It formats the log string with different colors for each section: epoch or iteration, step number, metric values, batch time, reader time, and ips (images per second). 
The logger outputs this formatted string to provide an informative summary of the task's progress.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py\":113-140", + "content": " epoch_id,\n total_epoch,\n mode,\n ips,\n tot_step=None,\n max_iters=None):\n batch_cost = str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = str(metric_list['reader_time'].value) + ' sec,'\n metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].value)\n metric_str = ' '.join([str(v) for v in metric_values])\n if max_iters:\n epoch_str = \"iter:[{:>3d}/{:<3d}]\".format(tot_step, max_iters)\n else:\n epoch_str = \"epoch:[{:>3d}/{:<3d}]\".format(epoch_id, total_epoch)\n step_str = \"{:s} step:{:<4d}\".format(mode, batch_id)\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {}\".format(\n coloring(epoch_str, \"HEADER\") if batch_id == 0 else epoch_str,\n coloring(step_str, \"PURPLE\"), coloring(metric_str, 'OKGREEN'),\n coloring(batch_cost, \"OKGREEN\"), coloring(reader_cost, 'OKGREEN'),\n ips))\ndef log_epoch(metric_list, epoch, mode, ips):" + }, + { + "comment": "This code is formatting and logging information at the end of an epoch. It calculates various metric values, constructs a formatted string with different colors, and then logs this information using logger.info(). The metrics include batch time, reader time, total cost, mode, and inference per second. The strings are color-coded for visual clarity.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py\":141-156", + "content": " batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,'\n reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,'\n batch_sum = str(metric_list['batch_time'].total) + ' sec,'\n metric_values = []\n for m in metric_list:\n if not (m == 'batch_time' or m == 'reader_time'):\n metric_values.append(metric_list[m].mean)\n metric_str = ' '.join([str(v) for v in metric_values])\n end_epoch_str = \"END epoch:{:<3d}\".format(epoch)\n logger.info(\"{:s} {:s} {:s} {:s} {:s} {:s} {}\".format(\n coloring(end_epoch_str, \"RED\"), coloring(mode, \"PURPLE\"),\n coloring(metric_str, \"OKGREEN\"), coloring(batch_cost, \"OKGREEN\"),\n coloring(reader_cost, \"OKGREEN\"), coloring(batch_sum, \"OKGREEN\"), ips))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3a1c90c1-39ee-49d9-b34e-b8b77b32fbce.json b/docs/doc/3a1c90c1-39ee-49d9-b34e-b8b77b32fbce.json new file mode 100644 index 000000000..1dd662775 --- /dev/null +++ b/docs/doc/3a1c90c1-39ee-49d9-b34e-b8b77b32fbce.json @@ -0,0 +1,25 @@ +{ + "summary": "The code reads a file for model details, extracts lines with common functions, sets up inference directories and parameters, enables ONNX checker, converts using paddle2onnx, saves logs, runs inference, and checks status for the \"func_paddle2onnx\" function.", + "details": [ + { + "comment": "Code is reading a file, parsing specific lines to extract model name, python path, and other parameters for paddle2onnx conversion. 
It uses common functions from \"common_func.sh\" and the \"awk\" command for line extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_paddle2onnx.sh\":0-31", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\nMODE=$2\ndataline=$(cat ${FILENAME})\nlines=(${dataline})\n# common params\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\n# parser params\ndataline=$(awk 'NR==1, NR==14{print}' $FILENAME)\nIFS=$'\\n'\nlines=(${dataline})\n# parser paddle2onnx\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\npadlle2onnx_cmd=$(func_parser_value \"${lines[3]}\")\ninfer_model_dir_key=$(func_parser_key \"${lines[4]}\")\ninfer_model_dir_value=$(func_parser_value \"${lines[4]}\")\nmodel_filename_key=$(func_parser_key \"${lines[5]}\")\nmodel_filename_value=$(func_parser_value \"${lines[5]}\")\nparams_filename_key=$(func_parser_key \"${lines[6]}\")\nparams_filename_value=$(func_parser_value \"${lines[6]}\")\nsave_file_key=$(func_parser_key \"${lines[7]}\")\nsave_file_value=$(func_parser_value \"${lines[7]}\")\nopset_version_key=$(func_parser_key \"${lines[8]}\")\nopset_version_value=$(func_parser_value \"${lines[8]}\")\nenable_onnx_checker_key=$(func_parser_key \"${lines[9]}\")" + }, + { + "comment": "Creates the function \"func_paddle2onnx\" with the argument _script, setting up the log path and directories for paddle2onnx inference. It then sets parameters such as infer_model_dir_key, model_filename_key, params_filename_key, save_file_key, and opset_version_key.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_paddle2onnx.sh\":32-57", + "content": "enable_onnx_checker_value=$(func_parser_value \"${lines[9]}\")\n# parser onnx inference\ninference_py=$(func_parser_value \"${lines[10]}\")\nconfig_key=$(func_parser_key \"${lines[11]}\")\nconfig_value=$(func_parser_value \"${lines[11]}\")\nmodel_key=$(func_parser_key \"${lines[12]}\")\ninput_file_key=$(func_parser_key \"${lines[13]}\")\ninput_file_value=$(func_parser_value \"${lines[13]}\")\nLOG_PATH=\"./log/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_paddle2onnx.log\"\nfunction func_paddle2onnx(){\n IFS='|'\n _script=$1\n # paddle2onnx\n _save_log_path=\"${LOG_PATH}/paddle2onnx_infer_cpu.log\"\n set_dirname=$(func_set_params \"${infer_model_dir_key}\" \"${infer_model_dir_value}\")\n set_model_filename=$(func_set_params \"${model_filename_key}\" \"${model_filename_value}\")\n set_params_filename=$(func_set_params \"${params_filename_key}\" \"${params_filename_value}\")\n set_save_model=$(func_set_params \"${save_file_key}\" \"${save_file_value}\")\n set_opset_version=$(func_set_params \"${opset_version_key}\" \"${opset_version_value}\")" + }, + { + "comment": "The code sets enable_onnx_checker, uses it to execute the paddle2onnx conversion, and saves the log. 
Then, it runs inference using Python and saves the status check log.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_paddle2onnx.sh\":58-72", + "content": " set_enable_onnx_checker=$(func_set_params \"${enable_onnx_checker_key}\" \"${enable_onnx_checker_value}\")\n trans_log=\"${LOG_PATH}/trans_model.log\"\n trans_model_cmd=\"${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} > ${trans_log} 2>&1 \"\n eval $trans_model_cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${trans_model_cmd}\" \"${status_log}\" \"${model_name}\"\n # python inference\n set_gpu=$(func_set_params \"${use_gpu_key}\" \"${use_gpu_value}\")\n set_model_dir=$(func_set_params \"${model_key}\" \"${save_file_value}\")\n set_input_file=$(func_set_params \"${input_file_key}\" \"${input_file_value}\")\n set_config=$(func_set_params \"${config_key}\" \"${config_value}\")\n infer_model_cmd=\"${python} ${inference_py} ${set_config} ${set_input_file} ${set_model_dir} > ${_save_log_path} 2>&1 \"\n eval $infer_model_cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${infer_model_cmd}\" \"${status_log}\" \"${model_name}\"" + }, + { + "comment": "This code segment runs a test for the function \"func_paddle2onnx\" by exporting the Count variable, setting IFS to \"|\", and echoing a message.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_paddle2onnx.sh\":73-80", + "content": "}\necho \"################### run test ###################\"\nexport Count=0\nIFS=\"|\"\nfunc_paddle2onnx" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3a21a7b5-da83-452c-b74a-d98a60640456.json b/docs/doc/3a21a7b5-da83-452c-b74a-d98a60640456.json new file mode 100644 index 000000000..58f46cc90 --- /dev/null +++ b/docs/doc/3a21a7b5-da83-452c-b74a-d98a60640456.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is part of PaddleVideo's VideoQualityAssessment module; it imports and exposes functions for training and testing models.", + "details": [ + { + "comment": "This code is part of PaddleVideo's VideoQualityAssessment module; it imports and exposes functions for training and testing models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py\":0-19", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .train import train_model\nfrom .test import test_model\n__all__ = ['train_model', 'test_model']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3abbc625-0fe6-4346-b69e-b63e8e1e2634.json b/docs/doc/3abbc625-0fe6-4346-b69e-b63e8e1e2634.json new file mode 100644 index 000000000..364af237a --- /dev/null +++ b/docs/doc/3abbc625-0fe6-4346-b69e-b63e8e1e2634.json @@ -0,0 +1,30 @@ +{ + "summary": "This code from PaddleVideo's VideoQualityAssessment module builds a record list for tracking metrics during training, appends metric names and AverageMeter instances for various frameworks and models, formats and logs epoch, mode, metric average, and image processing speed information with color-coded visual distinction.", + "details": [ + { + "comment": "This code is from the PaddleVideo library's VideoQualityAssessment module. It imports necessary classes and functions, defines logger variables, and provides a function to build a record list for loss and learning rate metrics. The framework type is specified, and if Recognizer1D is part of the specified framework, additional steps may be required.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py\":0-28", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom collections import OrderedDict\nfrom .logger import get_logger, coloring\nlogger = get_logger(\"paddlevideo\")\n__all__ = ['AverageMeter', 'build_record', 'build_rec_record', 'log_batch', 'log_epoch']\ndef build_record(cfg):\n framework_type = cfg.get('framework')\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework" + }, + { + "comment": "This code is building a record list for tracking metrics during the training process. It appends various metric names to the record list along with their corresponding AverageMeter instances for different frameworks and models. The AverageMeter keeps track of the average value over time, and each meter has its format specifier for displaying the values. 
The code also includes a function build_rec_record to create the record list based on the given configuration (cfg).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py\":29-50", + "content": " record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))\n record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n elif 'Recognizer' in cfg.framework:\n record_list.append((\"top1\", AverageMeter(\"top1\", '.5f')))\n record_list.append((\"top5\", AverageMeter(\"top5\", '.5f')))\n record_list.append((\"batch_time\", AverageMeter('elapse', '.3f')))\n record_list.append((\"reader_time\", AverageMeter('reader', '.3f')))\n record_list = OrderedDict(record_list)\n return record_list\ndef build_rec_record(cfg):\n \"\"\"build rec record\"\"\"\n framework_type = cfg.get('framework')\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"lr\", AverageMeter('lr', 'f', need_avg=False)),\n ]\n if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework\n record_list.append((\"hit_at_one\", AverageMeter(\"hit_at_one\", '.5f')))\n record_list.append((\"perr\", AverageMeter(\"perr\", '.5f')))" + }, + { + "comment": "This code defines a function `record_list` and a class `AverageMeter`. The function creates a list of record names and their corresponding AverageMeter objects, then converts the list to an OrderedDict. The AverageMeter class computes and stores average values, resets upon initialization, updates with new values, and provides properties for displaying total sum and total sum in minutes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py\":51-89", + "content": " record_list.append((\"gap\", AverageMeter(\"gap\", '.5f')))\n record_list.append((\"batch_time\", AverageMeter('elapse', '.3f')))\n record_list.append((\"reader_time\", AverageMeter('reader', '.3f')))\n record_list = OrderedDict(record_list)\n return record_list\nclass AverageMeter(object):\n \"\"\"\n Computes and stores the average and current value\n \"\"\"\n def __init__(self, name='', fmt='f', need_avg=True):\n self.name = name\n self.fmt = fmt\n self.need_avg = need_avg\n self.reset()\n def reset(self):\n \"\"\" reset \"\"\"\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n \"\"\" update \"\"\"\n if isinstance(val, paddle.Tensor):\n val = float(val)\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count\n @property\n def total(self):\n return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self)\n @property\n def total_minute(self):" + }, + { + "comment": "The code provides functions to log batch and epoch information for a video quality assessment task. The `log_batch` function takes in metric list, batch ID, epoch ID, total epochs, mode, and ips as input and logs the metrics, current epoch/total epochs, and step details. 
The `log_epoch` function calculates the mean of the metrics and logs the mean values along with the total batch time for an epoch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py\":90-114", + "content": " return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60,\n self=self)\n @property\n def mean(self):\n return '{self.name}_avg: {self.avg:{self.fmt}}'.format(\n self=self) if self.need_avg else ''\n @property\n def value(self):\n return '{self.name}: {self.val:{self.fmt}}'.format(self=self)\ndef log_batch(metric_list, batch_id, epoch_id, total_epoch, mode, ips):\n metric_str = ' '.join([str(m.value) for m in metric_list.values()])\n epoch_str = \"epoch:[{:>3d}/{:<3d}]\".format(epoch_id, total_epoch)\n step_str = \"{:s} step:{:<4d}\".format(mode, batch_id)\n logger.info(\"{:s} {:s} {:s}s {}\".format(\n coloring(epoch_str, \"HEADER\") if batch_id == 0 else epoch_str,\n coloring(step_str, \"PURPLE\"), coloring(metric_str, 'OKGREEN'), ips))\ndef log_epoch(metric_list, epoch, mode, ips):\n metric_avg = ' '.join([str(m.mean) for m in metric_list.values()] +\n [metric_list['batch_time'].total])" + }, + { + "comment": "This code snippet is formatting and logging information related to an epoch, mode, metric average, and image processing speed. It uses the \"coloring\" function to color certain parts of the log text (RED, PURPLE, OKGREEN) for better visual distinction. The logger then logs this information with time stamp.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py\":116-121", + "content": " end_epoch_str = \"END epoch:{:<3d}\".format(epoch)\n logger.info(\"{:s} {:s} {:s}s {}\".format(coloring(end_epoch_str, \"RED\"),\n coloring(mode, \"PURPLE\"),\n coloring(metric_avg, \"OKGREEN\"),\n ips))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3afd1d71-0f56-4c75-a411-549376b01b34.json b/docs/doc/3afd1d71-0f56-4c75-a411-549376b01b34.json new file mode 100644 index 000000000..f3f58897d --- /dev/null +++ b/docs/doc/3afd1d71-0f56-4c75-a411-549376b01b34.json @@ -0,0 +1,25 @@ +{ + "summary": "This code sets up a colorful logging function for PaddleVideo, initializes logger with verbosity levels, and ensures non-propagation of logs. It configures logger for Python's logging module using different formats and handlers based on local rank.", + "details": [ + { + "comment": "This code is from the \"logger.py\" file in the PaddleVideo project, and it sets up a coloring function for logging messages with optional colors using ANSI escape sequences. The function takes a message and an optional color parameter, which should be one of the defined colors in the Color dictionary. It asserts that the provided color is indeed a key in the dictionary, and then returns the message with the specified color applied. The function also checks the environment variable \"COLORING\" to determine whether coloring should be enabled or not (default is True).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py\":0-37", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport logging\nimport os\nimport sys\nimport datetime\nfrom paddle.distributed import ParallelEnv\nColor = {\n 'RED': '\\033[31m',\n 'HEADER': '\\033[35m', # deep purple\n 'PURPLE': '\\033[95m', # purple\n 'OKBLUE': '\\033[94m',\n 'OKGREEN': '\\033[92m',\n 'WARNING': '\\033[93m',\n 'FAIL': '\\033[91m',\n 'ENDC': '\\033[0m'\n}\ndef coloring(message, color=\"OKGREEN\"):\n assert color in Color.keys()\n if os.environ.get('COLORING', True):" + }, + { + "comment": "This code initializes the PaddleVideo logger and sets its verbosity level to \"INFO\" or \"DEBUG\", depending on the input argument. It also defines a custom time zone converter for logging, and ensures that the logger does not propagate logs to its parent loggers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py\":38-70", + "content": " return Color[color] + str(message) + Color[\"ENDC\"]\n else:\n return message\nlogger_initialized = []\ndef setup_logger(output=None, name=\"paddlevideo\", level=\"INFO\"):\n \"\"\"\n Initialize the paddlevideo logger and set its verbosity level to \"INFO\".\n Args:\n output (str): a file name or a directory to save log. If None, will not save log file.\n If ends with \".txt\" or \".log\", assumed to be a file name.\n Otherwise, logs will be saved to `output/log.txt`.\n name (str): the root module name of this logger\n Returns:\n logging.Logger: a logger\n \"\"\"\n def time_zone(sec, fmt):\n real_time = datetime.datetime.now()\n return real_time.timetuple()\n logging.Formatter.converter = time_zone\n logger = logging.getLogger(name)\n if level == \"INFO\":\n logger.setLevel(logging.INFO)\n elif level==\"DEBUG\":\n logger.setLevel(logging.DEBUG)\n logger.propagate = False\n if level == \"DEBUG\":\n plain_formatter = logging.Formatter(" + }, + { + "comment": "This code configures a logger for Python's logging module. It uses different formats and handlers (stdout, file) based on the local rank of the process, creating separate log files for each worker ranked greater than 0. If the output is a .txt or .log file, it will be used as-is; otherwise, a .log.txt file with optional rank appended will be created. 
The code also ensures that missing directories for the log file are created beforehand.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py\":71-99", + "content": " \"[%(asctime)s] %(name)s %(levelname)s: %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n else:\n plain_formatter = logging.Formatter(\n \"[%(asctime)s] %(message)s\",\n datefmt=\"%m/%d %H:%M:%S\")\n # stdout logging: master only\n local_rank = ParallelEnv().local_rank\n if local_rank == 0:\n ch = logging.StreamHandler(stream=sys.stdout)\n ch.setLevel(logging.DEBUG)\n formatter = plain_formatter\n ch.setFormatter(formatter)\n logger.addHandler(ch)\n # file logging: all workers\n if output is not None:\n if output.endswith(\".txt\") or output.endswith(\".log\"):\n filename = output\n else:\n filename = os.path.join(output, \".log.txt\")\n if local_rank > 0:\n filename = filename + \".rank{}\".format(local_rank)\n # PathManager.mkdirs(os.path.dirname(filename))\n os.makedirs(os.path.dirname(filename), exist_ok=True)\n # fh = logging.StreamHandler(_cached_log_stream(filename)\n fh = logging.FileHandler(filename, mode='a')" + }, + { + "comment": "This code initializes a logger object and sets its level to DEBUG, adds a file handler with a plain formatter, and appends the logger's name to an initialized list. The function returns the logger if it has been previously initialized for the given name; otherwise, it sets up the logger using the provided name and optional output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py\":100-112", + "content": " fh.setLevel(logging.DEBUG)\n fh.setFormatter(plain_formatter)\n logger.addHandler(fh)\n logger_initialized.append(name)\n return logger\ndef get_logger(name, output=None):\n logger = logging.getLogger(name)\n if name in logger_initialized:\n return logger\n return setup_logger(name=name, output=name)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3c27a33b-b94d-4fed-9cd1-3ef6fa629d86.json b/docs/doc/3c27a33b-b94d-4fed-9cd1-3ef6fa629d86.json new file mode 100644 index 000000000..7636366ac --- /dev/null +++ b/docs/doc/3c27a33b-b94d-4fed-9cd1-3ef6fa629d86.json @@ -0,0 +1,35 @@ +{ + "summary": "The code imports modules, sets up environment, creates an ONNX predictor for video object detection, and performs inference on batches of input files while supporting benchmarking if enabled.", + "details": [ + { + "comment": "This code imports necessary modules and defines a function for parsing command-line arguments. It sets up the environment to execute PaddleVideo Inference model scripts. The code also includes license information, ensuring compliance with the Apache License, Version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/predict_onnx.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport sys\nfrom os import path as osp\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../tools')))\nfrom utils import build_inference_helper, get_config\ndef parse_args():\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")" + }, + { + "comment": "This code snippet is parsing command line arguments for config file, input file path, and ONNX model file path. It also includes parameters for ONNX prediction like batch size, use of GPU, precision, IR optimization, enable benchmark, and CPU threads.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/predict_onnx.py\":31-53", + "content": " parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument(\"-i\", \"--input_file\", type=str, help=\"input file path\")\n parser.add_argument(\"--onnx_file\", type=str, help=\"onnx model file path\")\n # params for onnx predict\n parser.add_argument(\"-b\", \"--batch_size\", type=int, default=1)\n parser.add_argument(\"--use_gpu\",\n type=str2bool,\n default=False,\n help=\"set to False when using onnx\")\n parser.add_argument(\"--precision\", type=str, default=\"fp32\")\n parser.add_argument(\"--ir_optim\", type=str2bool, default=True)\n parser.add_argument(\"--enable_benchmark\",\n type=str2bool,\n default=False,\n help=\"set to False when using onnx\")\n parser.add_argument(\"--cpu_threads\", type=int, default=4)\n return parser.parse_args()" + }, + { + "comment": "The code defines a function to create an ONNX predictor by loading an ONNX file and setting configuration options. It also includes functions for parsing file paths and handling command-line arguments. This code is used for onnx model inference, specifically for video object detection tasks. The main function calls other utility functions to parse the input file path and load configuration settings before executing the actual prediction using the created ONNX predictor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/predict_onnx.py\":56-91", + "content": "def create_onnx_predictor(args, cfg=None):\n import onnxruntime as ort\n onnx_file = args.onnx_file\n config = ort.SessionOptions()\n if args.use_gpu:\n raise ValueError(\n \"onnx inference now only supports cpu! 
please set `use_gpu` to False.\"\n )\n else:\n config.intra_op_num_threads = args.cpu_threads\n if args.ir_optim:\n config.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL\n predictor = ort.InferenceSession(onnx_file, sess_options=config)\n return config, predictor\ndef parse_file_paths(input_path: str) -> list:\n if osp.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))\n ]\n files = [osp.join(input_path, file) for file in files]\n return files\ndef main():\n \"\"\"predict using onnx model\n \"\"\"\n args = parse_args()\n cfg = get_config(args.config, show=False)" + }, + { + "comment": "This code builds an inference helper, creates an ONNX predictor, gets input and output names, processes file paths, performs benchmarking, and initializes an auto log for the given model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/predict_onnx.py\":93-121", + "content": " model_name = cfg.model_name\n print(f\"Inference model({model_name})...\")\n InferenceHelper = build_inference_helper(cfg.INFERENCE)\n inference_config, predictor = create_onnx_predictor(args)\n # get input_tensor and output_tensor\n input_names = predictor.get_inputs()[0].name\n output_names = predictor.get_outputs()[0].name\n # get the absolute file path(s) to be processed\n files = parse_file_paths(args.input_file)\n if args.enable_benchmark:\n test_video_num = 12\n num_warmup = 3\n # instantiate auto log\n try:\n import auto_log\n except ImportError as e:\n print(f\"{e}, [git+https://github.com/LDOUBLEV/AutoLog] \"\n f\"package and it's dependencies is required for \"\n f\"python-inference when enable_benchmark=True.\")\n pid = os.getpid()\n autolog = auto_log.AutoLogger(\n model_name=cfg.model_name,\n model_precision=args.precision,\n batch_size=args.batch_size,\n data_shape=\"dynamic\"," + }, + { + "comment": "Code snippet performs video inference on batches of input files using a predictor. It preprocesses the batch inputs, runs inference for each batch, and records pre-processing and inference time costs if benchmarking is enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/predict_onnx.py\":122-152", + "content": " save_path=\"./output/auto_log.lpg\",\n inference_config=inference_config,\n pids=pid,\n process_name=None,\n gpu_ids=None,\n time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],\n warmup=num_warmup)\n files = [args.input_file for _ in range(test_video_num + num_warmup)]\n # Inferencing process\n batch_num = args.batch_size\n for st_idx in range(0, len(files), batch_num):\n ed_idx = min(st_idx + batch_num, len(files))\n # auto log start\n if args.enable_benchmark:\n autolog.times.start()\n # Pre process batched input\n batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx])\n # get pre process time cost\n if args.enable_benchmark:\n autolog.times.stamp()\n # run inference\n batched_outputs = predictor.run(\n output_names=[output_names],\n input_feed={input_names: batched_inputs[0]})\n # get inference process time cost" + }, + { + "comment": "The code segment is controlling the benchmark execution. If `args.enable_benchmark` is True, it stamps the current time using autolog, then calls postprocess function on batched outputs with `not args.enable_benchmark`. 
After that, it ends the timer using autolog and reports the benchmark log if `args.enable_benchmark` is still True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/predict_onnx.py\":153-170", + "content": " if args.enable_benchmark:\n autolog.times.stamp()\n InferenceHelper.postprocess(batched_outputs, not args.enable_benchmark)\n # get post process time cost\n if args.enable_benchmark:\n autolog.times.end(stamp=True)\n # time.sleep(0.01) # sleep for T4 GPU\n # report benchmark log if enabled\n if args.enable_benchmark:\n autolog.report()\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3ca025f8-4e3a-47be-be41-e1cad067fd2a.json b/docs/doc/3ca025f8-4e3a-47be-be41-e1cad067fd2a.json new file mode 100644 index 000000000..dedd441c9 --- /dev/null +++ b/docs/doc/3ca025f8-4e3a-47be-be41-e1cad067fd2a.json @@ -0,0 +1,10 @@ +{ + "summary": "This file contains the definitions for BaseSegment and Manet classes, both part of PaddleVideo framework. These classes are likely used in video modeling or segmentation tasks. The code is licensed under Apache License 2.0 and distributed as-is without warranties or conditions.", + "details": [ + { + "comment": "This file contains the definitions for BaseSegment and Manet classes, both part of PaddleVideo framework. These classes are likely used in video modeling or segmentation tasks. The code is licensed under Apache License 2.0 and distributed as-is without warranties or conditions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py\":0-18", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .segment import BaseSegment, Manet\n__all__ = ['BaseSegment',\n 'Manet'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3dffe712-135b-480f-8096-a59d64202e9d.json b/docs/doc/3dffe712-135b-480f-8096-a59d64202e9d.json new file mode 100644 index 000000000..f7d9555d6 --- /dev/null +++ b/docs/doc/3dffe712-135b-480f-8096-a59d64202e9d.json @@ -0,0 +1,10 @@ +{ + "summary": "This file is a Python module for video dataset loading and processing in PaddleVideo. It contains functions to build datasets, data loaders, and batch pipelines, along with the VideoDataset class.", + "details": [ + { + "comment": "This file is a Python module for video dataset loading and processing in PaddleVideo. It contains functions to build datasets, data loaders, and batch pipelines, along with the VideoDataset class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py\":0-20", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .builder import build_dataset, build_dataloader, build_batch_pipeline\nfrom .dataset import VideoDataset\n__all__ = [\n 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3e162e69-611f-4080-90c9-982004493277.json b/docs/doc/3e162e69-611f-4080-90c9-982004493277.json new file mode 100644 index 000000000..bf6aedc31 --- /dev/null +++ b/docs/doc/3e162e69-611f-4080-90c9-982004493277.json @@ -0,0 +1,30 @@ +{ + "summary": "The NetVLAD class in PaddleVideo's \"T2VLAD\" model initializes neural network parameters, performs checks and calculations for VLAD representations with batch size x dimension K.", + "details": [ + { + "comment": "NetVLAD is a class for implementing the NetVLAD algorithm. It takes parameters such as cluster_size, feature_size, ghost_clusters, and add_batch_norm. The feature_size represents the size of each feature, while the cluster_size represents the number of clusters. Ghost_clusters determines whether to include extra clusters for better performance. Add_batch_norm is a boolean value that decides whether or not to use batch normalization in the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/net_vlad.py\":0-32", + "content": "\"\"\"NetVLAD implementation.\n\"\"\"\n# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport numpy as np\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nclass NetVLAD(nn.Layer):\n def __init__(self, cluster_size, feature_size, ghost_clusters=0,\n add_batch_norm=True):\n super().__init__()\n self.feature_size = feature_size\n self.cluster_size = cluster_size\n self.ghost_clusters = ghost_clusters\n init_sc = (1 / math.sqrt(feature_size))" + }, + { + "comment": "This code initializes the neural network parameters for a VLAD model. It creates two sets of cluster weights, and assigns random values within a certain range to these weights using Paddle's `paddle.randn` function with a specified initialization scale (`init_sc`). Additionally, it creates batch normalization layers (`BatchNorm1D`) for the clusters if `add_batch_norm` is True. 
The code also defines the output dimension as the product of cluster size and feature size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/net_vlad.py\":33-43", + "content": " init_sc = paddle.to_tensor(init_sc)\n clusters = cluster_size + ghost_clusters\n # The `clusters` weights are the `(w,b)` in the paper\n self.clusters = paddle.create_parameter([feature_size, clusters], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([feature_size, clusters]) * init_sc))\n self.batch_norm1 = nn.BatchNorm1D(clusters) if add_batch_norm else None\n self.batch_norm2 = nn.BatchNorm1D(clusters) if add_batch_norm else None\n # The `clusters2` weights are the visual words `c_k` in the paper\n self.clusters1 = paddle.create_parameter([1, feature_size, cluster_size], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([1, feature_size, cluster_size]) * init_sc))\n self.clusters2 = paddle.create_parameter([1, feature_size, cluster_size], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([1, feature_size, cluster_size]) * init_sc)) \n self.out_dim = self.cluster_size * feature_size" + }, + { + "comment": "The code snippet is a part of the \"T2VLAD\" model in PaddleVideo. It performs sanity checks to ensure there are no NaN inputs or clusters, and then proceeds with the forward pass. In the forward function, it reshapes input, applies batch normalization, and calculates the assignment between input features and clusters. This is used for aggregating feature maps into a fixed-size representation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/net_vlad.py\":45-75", + "content": " def sanity_checks(self, x):\n \"\"\"Catch any nans in the inputs/clusters\"\"\"\n if paddle.isnan(paddle.sum(x)):\n raise ValueError(\"nan inputs\")\n if paddle.isnan(self.clusters[0][0]): \n raise ValueError(\"nan clusters\")\n def forward(self, x, freeze=False, mask=None):\n \"\"\"Aggregates feature maps into a fixed size representation. 
In the following\n notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size.\n Args:\n x (th.Tensor): B x N x D\n Returns:\n (th.Tensor): B x DK\n \"\"\"\n self.sanity_checks(x)\n max_sample = x.shape[1] \n x = x.reshape([-1, self.feature_size]) # B x N x D -> BN x D\n if freeze == True:\n clusters = self.clusters.detach()\n clusters2 = self.clusters1\n batch_norm = self.batch_norm1\n else:\n clusters = self.clusters\n clusters2 = self.clusters2\n batch_norm = self.batch_norm2\n assignment = paddle.matmul(x, clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G)" + }, + { + "comment": "In this code snippet, it performs batch normalization on the assignment matrix, applies softmax for normalization, reshapes the assignment matrix multiple times, calculates a sum of clusters and multiplies by cluster centers, performs matrix multiplication to generate a VLAD representation, normalizes the intra-cluster L2 norm, and finally reshapes and applies normalization for the final VLAD representation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/net_vlad.py\":76-98", + "content": " if batch_norm:\n assignment = batch_norm(assignment)\n assignment = F.softmax(assignment, axis=1) # BN x (K+G) -> BN x (K+G)\n save_ass = assignment.reshape([-1, max_sample, self.cluster_size+1])\n assignment = assignment[:, :self.cluster_size]\n assignment = assignment.reshape([-1, max_sample, self.cluster_size]) # -> B x N x K\n a_sum = paddle.sum(assignment, axis=1, keepdim=True) # B x N x K -> B x 1 x K\n a = a_sum * self.clusters2\n assignment = assignment.transpose([0, 2, 1]) # B x N x K -> B x K x N\n x = x.reshape([-1, max_sample, self.feature_size]) # BN x D -> B x N x D\n vlad = paddle.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D\n vlad = vlad.transpose([0, 2, 1]) # -> B x D x K\n vlad = vlad - a\n # L2 intra norm\n vlad_ = F.normalize(vlad)\n # flattening + L2 norm\n vlad = vlad_.reshape([-1, self.cluster_size * self.feature_size]) # -> B x DK\n vlad = F.normalize(vlad)" + }, + { + "comment": "The code is returning the VLAD (Vector of Locally Aggregated Descriptors) feature representations and their respective variables for Batch size x Dimension K.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/net_vlad.py\":99-99", + "content": " return vlad, vlad_, save_ass # B x DK" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3f8ca3d4-39cf-43a3-a4bb-98ff2cd75d93.json b/docs/doc/3f8ca3d4-39cf-43a3-a4bb-98ff2cd75d93.json new file mode 100644 index 000000000..9e08dbaf6 --- /dev/null +++ b/docs/doc/3f8ca3d4-39cf-43a3-a4bb-98ff2cd75d93.json @@ -0,0 +1,75 @@ +{ + "summary": "The code utilizes an ArgumentParser to handle command line arguments, downloads and saves a model, initializes PaddleVideo with GPU/MKLDNN usage for video label prediction, and iterates through results to print top classes/scores/labels.", + "details": [ + { + "comment": "This code block is a license notice for the Apache License, Version 2.0, which grants permission to use this file as long as it complies with the terms of the license.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":0-23", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS," + }, + { + "comment": "This code imports necessary modules, defines paths and model names for PaddleVideo inference models, and includes a function to parse command line arguments. The code is setting up the environment for using different PaddleVideo models and downloading them if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":24-63", + "content": "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\n__dir__ = os.path.dirname(__file__)\nsys.path.append(os.path.join(__dir__, ''))\nimport numpy as np\nimport tarfile\nimport requests\nfrom tqdm import tqdm\nimport shutil\nfrom paddle import inference\nfrom paddle.inference import Config, create_predictor\nfrom tools.utils import ppTSM_Inference_helper\n__all__ = ['PaddleVideo']\n# path of download model and data\nBASE_DIR = os.path.expanduser(\"~/.paddlevideo_inference/\")\nBASE_INFERENCE_MODEL_DIR = os.path.join(BASE_DIR, 'inference_model')\nBASE_VIDEOS_DIR = os.path.join(BASE_DIR, 'videos')\n# support Models\nMODELS = {\n 'ppTSM':\n 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_infer.tar',\n 'ppTSM_v2':\n 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_v2_infer.tar'\n}\nMODEL_NAMES = list(MODELS.keys())\ndef parse_args(mMain=True, add_help=True):" + }, + { + "comment": "This code defines a function that creates an ArgumentParser object for command line arguments. It includes various argument types and default values, such as model name, video file, use GPU flag, number of segments, short and target sizes, and batch size. The function is intended to be used in the main section of a Python script when set to True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":64-92", + "content": " \"\"\"\n Args:\n mMain: bool. 
True for command args, False for python interface\n \"\"\"\n import argparse\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n if mMain == True:\n # general params\n parser = argparse.ArgumentParser(add_help=add_help)\n parser.add_argument(\"--model_name\", type=str, default='')\n parser.add_argument(\"-v\", \"--video_file\", type=str, default='')\n parser.add_argument(\"--use_gpu\", type=str2bool, default=True)\n # params for decode and sample\n parser.add_argument(\"--num_seg\", type=int, default=16)\n # params for preprocess\n parser.add_argument(\"--short_size\", type=int, default=256)\n parser.add_argument(\"--target_size\", type=int, default=224)\n # params for predict\n parser.add_argument(\"--model_file\", type=str, default='')\n parser.add_argument(\"--params_file\", type=str)\n parser.add_argument(\"-b\", \"--batch_size\", type=int, default=1)\n parser.add_argument(\"--use_fp16\", type=str2bool, default=False)" + }, + { + "comment": "This code is initializing argument parser with default values for various options like ir_optim, use_tensorrt, gpu_mem, top_k and enable_mkldnn. It then parses the arguments using argparse and returns the resulting Namespace.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":93-114", + "content": " parser.add_argument(\"--ir_optim\", type=str2bool, default=True)\n parser.add_argument(\"--use_tensorrt\", type=str2bool, default=False)\n parser.add_argument(\"--gpu_mem\", type=int, default=8000)\n parser.add_argument(\"--top_k\", type=int, default=1)\n parser.add_argument(\"--enable_mkldnn\", type=bool, default=False)\n parser.add_argument(\"--label_name_path\", type=str, default='')\n return parser.parse_args()\n else:\n return argparse.Namespace(model_name='',\n video_file='',\n use_gpu=True,\n num_seg=16,\n short_size=256,\n target_size=224,\n model_file='',\n params_file='',\n batch_size=1,\n use_fp16=False,\n ir_optim=True,\n use_tensorrt=False," + }, + { + "comment": "Function `parse_file_paths` takes an input path as a parameter, checks if it is a file or a directory. If it's a file, it returns the file itself; otherwise, it lists all files in the directory, filters out those that don't end with \".avi\" or \".mp4\", and joins the input path with each filtered file to form an absolute path. These paths are then returned as a list.\n\nFunction `download_with_progressbar` downloads data from the given URL in chunks while providing progress updates using tqdm's progress bar. 
It sets the total size of the download based on the 'content-length' header from the response, and writes each chunk to the specified save path in a 'wb' mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":115-144", + "content": " gpu_mem=8000,\n top_k=1,\n enable_mkldnn=False,\n label_name_path='')\ndef parse_file_paths(input_path: str) -> list:\n if os.path.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))\n ]\n files = [os.path.join(input_path, file) for file in files]\n return files\ndef download_with_progressbar(url, save_path):\n response = requests.get(url, stream=True)\n total_size_in_bytes = int(response.headers.get('content-length', 0))\n block_size = 1024 # 1 Kibibyte\n progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)\n with open(save_path, 'wb') as file:\n for data in response.iter_content(block_size):\n progress_bar.update(len(data))\n file.write(data)" + }, + { + "comment": "The code downloads an inference model from a given URL and saves it to the specified directory. It first checks if the required files ('inference.pdiparams' and 'inference.pdmodel') exist, then creates temporary directories for downloading, prints the download progress, extracts the tar archive containing the model files, and raises an exception if any issue occurs during the process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":145-167", + "content": " progress_bar.close()\n if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes:\n raise Exception(\"Something went wrong while downloading models\")\ndef download_inference_model(model_storage_directory, url):\n # using custom model\n tar_file_name_list = [\n 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel'\n ]\n if not os.path.exists(\n os.path.join(model_storage_directory,\n 'inference.pdiparams')) or not os.path.exists(\n os.path.join(model_storage_directory,\n 'inference.pdmodel')):\n tmp_path = os.path.join(model_storage_directory, url.split('/')[-1])\n print('download {} to {}'.format(url, tmp_path))\n os.makedirs(model_storage_directory, exist_ok=True)\n download_with_progressbar(url, tmp_path) #download\n #save to directory\n with tarfile.open(tmp_path, 'r') as tarObj:\n for member in tarObj.getmembers():" + }, + { + "comment": "This code is initializing a Paddle predictor by reading arguments and configuring the model accordingly. 
It enables GPU use or MKLDNN based on the provided flags, sets the log level, and switches IR optimization if requested.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":168-196", + "content": " filename = None\n for tar_file_name in tar_file_name_list:\n if tar_file_name in member.name:\n filename = tar_file_name\n if filename is None:\n continue\n file = tarObj.extractfile(member)\n with open(os.path.join(model_storage_directory, filename),\n 'wb') as f:\n f.write(file.read())\n os.remove(tmp_path)\ndef create_paddle_predictor(args):\n config = Config(args.model_file, args.params_file)\n if args.use_gpu:\n config.enable_use_gpu(args.gpu_mem, 0)\n else:\n config.disable_gpu()\n if args.enable_mkldnn:\n # cache 10 different shapes for mkldnn to avoid memory leak\n config.set_mkldnn_cache_capacity(10)\n config.enable_mkldnn()\n config.disable_glog_info()\n config.switch_ir_optim(args.ir_optim) # default true\n if args.use_tensorrt:\n config.enable_tensorrt_engine(" + }, + { + "comment": "The code snippet is initializing a PaddleVideo object and creating a predictor. It sets the precision mode based on the `args.use_fp16` flag, enables memory optimization, and switches off zero copy operations. It also loads a label name dictionary from the specified path. The purpose of this code is to facilitate model inference using PaddleVideo and provide a user-friendly interface.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":197-231", + "content": " precision_mode=Config.Precision.Half\n if args.use_fp16 else Config.Precision.Float32,\n max_batch_size=args.batch_size)\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n predictor = create_predictor(config)\n return predictor\ndef load_label_name_dict(path):\n result = {}\n if not os.path.exists(path):\n print(\n 'Warning: If want to use your own label_dict, please input legal path!\\nOtherwise label_names will be empty!'\n )\n else:\n for line in open(path, 'r'):\n partition = line.split('\\n')[0].partition(' ')\n try:\n result[int(partition[0])] = str(partition[-1])\n except:\n result = {}\n break\n return result\nclass PaddleVideo(object):\n def __init__(self, **kwargs):\n print(\n '\\nInference models that Paddle provides are listed as follows:\\n{}'\n .format(MODEL_NAMES), '\\n')\n process_params = parse_args(mMain=False, add_help=False)" + }, + { + "comment": "This code checks if the model file exists, if not it prompts for a model name and downloads a pre-trained model from the provided URL if the model name is in the MODEL_NAMES list. 
It creates directories for the downloaded files and updates process_params with paths to the inference.pdmodel, inference.pdiparams, label_name_path files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":232-252", + "content": " process_params.__dict__.update(**kwargs)\n if not os.path.exists(process_params.model_file):\n if process_params.model_name is None:\n raise Exception('Please input model name that you want to use!')\n if process_params.model_name in MODEL_NAMES:\n url = MODELS[process_params.model_name]\n download_path = os.path.join(BASE_INFERENCE_MODEL_DIR,\n process_params.model_name)\n if not os.path.exists(download_path):\n os.makedirs(download_path)\n #create pretrained model download_path\n download_inference_model(model_storage_directory=download_path,\n url=url)\n process_params.model_file = os.path.join(\n download_path, 'inference.pdmodel')\n process_params.params_file = os.path.join(\n download_path, 'inference.pdiparams')\n process_params.label_name_path = os.path.join(" + }, + { + "comment": "The code initializes an object that can predict video labels using PaddleVideo. It checks for the presence of required parameters and allows user-specified models, then loads label name dictionary, and finally defines a \"predict\" method to classify videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":253-276", + "content": " __dir__, '../data/k400/Kinetics-400_label_list.txt')\n else:\n raise Exception(\n 'If you want to use your own model, Please input model_file as model path!'\n )\n else:\n print('Using user-specified model and params!')\n print(\"process params are as follows: \\n{}\".format(process_params))\n self.label_name_dict = load_label_name_dict(\n process_params.label_name_path)\n self.args = process_params\n self.predictor = create_paddle_predictor(process_params)\n def predict(self, video):\n \"\"\"\n predict label of video with paddlevideo\n Args:\n video:input video for clas, support single video , internet url, folder path containing series of videos\n Returns:\n list[dict:{videoname: \"\",class_ids: [], scores: [], label_names: []}],if label name path is None,label names will be empty\n \"\"\"\n video_list = []\n assert isinstance(video, (str))" + }, + { + "comment": "The code fetches input and output tensor names from the predictor, then retrieves their handles. If the video is a URL, it downloads the internet video and saves it to the BASE_VIDEOS_DIR. The downloaded video file path replaces the original URL. 
It checks if the video is not legal (not a string) and outputs an error message.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":278-300", + "content": " # get input_tensor and output_tensor\n input_names = self.predictor.get_input_names()\n output_names = self.predictor.get_output_names()\n input_tensor_list = []\n output_tensor_list = []\n for item in input_names:\n input_tensor_list.append(self.predictor.get_input_handle(item))\n for item in output_names:\n output_tensor_list.append(self.predictor.get_output_handle(item))\n if isinstance(video, str):\n # download internet video\n if video.startswith('http'):\n if not os.path.exists(BASE_VIDEOS_DIR):\n os.makedirs(BASE_VIDEOS_DIR)\n video_path = os.path.join(BASE_VIDEOS_DIR, 'tmp.mp4')\n download_with_progressbar(video, video_path)\n print(\"Current using video from Internet:{}, renamed as: {}\".\n format(video, video_path))\n video = video_path\n files = parse_file_paths(video)\n else:\n print('Please input legal video!')" + }, + { + "comment": "Looping over each chunk of files, preprocesses and runs inference on batched inputs, then post-processes the outputs to store in `batched_outputs`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":302-326", + "content": " # Inferencing process\n InferenceHelper = ppTSM_Inference_helper(\n num_seg=self.args.num_seg,\n short_size=self.args.short_size,\n target_size=self.args.target_size,\n top_k=self.args.top_k)\n batch_num = self.args.batch_size\n for st_idx in range(0, len(files), batch_num):\n ed_idx = min(st_idx + batch_num, len(files))\n # Pre process batched input\n batched_inputs = InferenceHelper.preprocess_batch(\n files[st_idx:ed_idx])\n # run inference\n for i in range(len(input_tensor_list)):\n input_tensor_list[i].copy_from_cpu(batched_inputs[i])\n self.predictor.run()\n batched_outputs = []\n for j in range(len(output_tensor_list)):\n batched_outputs.append(output_tensor_list[j].copy_to_cpu())\n results_list = InferenceHelper.postprocess(batched_outputs,\n print_output=False," + }, + { + "comment": "This code block is iterating through the 'results_list' and adding labels to each result. If the 'label_name_dict' is not empty, it assigns the corresponding label names from the dictionary to the results. It then prints various information about each result such as video file name and top classes/scores/labels. 
The main function initializes the PaddleVideo class and calls its 'predict' method with a specific video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":327-352", + "content": " return_result=True)\n for res in results_list:\n classes = res[\"topk_class\"]\n label_names = []\n if len(self.label_name_dict) != 0:\n label_names = [self.label_name_dict[c] for c in classes]\n res[\"label_names\"] = label_names\n print(\"Current video file: {0}\".format(res[\"video_id\"]))\n print(\"\\ttop-{0} classes: {1}\".format(len(res[\"topk_class\"]),\n res[\"topk_class\"]))\n print(\"\\ttop-{0} scores: {1}\".format(len(res[\"topk_scores\"]),\n res[\"topk_scores\"]))\n print(\"\\ttop-{0} label names: {1}\".format(\n len(res[\"label_names\"]), res[\"label_names\"]))\ndef main():\n # for cmd\n args = parse_args(mMain=True)\n clas_engine = PaddleVideo(**(args.__dict__))\n clas_engine.predict(args.video_file)\nif __name__ == '__main__':" + }, + { + "comment": "This line of code likely represents the entry point for the execution of the script, calling the main function to kick off the program's logic. The specific function or operations within this main() function will depend on the rest of the codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/wheel.py\":353-353", + "content": " main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/401cb792-b6b6-4218-b04f-42235071fca2.json b/docs/doc/401cb792-b6b6-4218-b04f-42235071fca2.json new file mode 100644 index 000000000..fed534b83 --- /dev/null +++ b/docs/doc/401cb792-b6b6-4218-b04f-42235071fca2.json @@ -0,0 +1,35 @@ +{ + "summary": "The `mask_damager` function from PaddleVideo library takes a mask and maximum rotation angle, damages and scales it by applying rotations and translations to create damaged labels.", + "details": [ + { + "comment": "This code defines a function, `mask_damager`, which randomly applies transformations to the input labels. It can make the entire label black with 20% probability or rotate and scale it by random values from predefined ranges. If morphology transformation is applied with 50% probability, it uses an open operation to modify the labels using a randomly generated kernel.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/mask_damaging.py\":0-35", + "content": "import numpy as np\nfrom scipy.ndimage import interpolation\ntry:\n from skimage import morphology, transform\nexcept ImportError as e:\n print(\n f\"{e}, [scikit-image] package and it's dependencies is required for MA-Net.\"\n )\nimport paddle\nimport cv2\nimport random\n####\ndef mask_damager(labels=None, p_black=0.2):\n scales = (0.8, 1.0, 1.2)\n kernel_size = random.randint(10, 15)\n kernel = np.ones((kernel_size, kernel_size), np.uint8)\n if random.random() < p_black:\n final_label = paddle.zeros_like(labels)\n final_label = final_label.squeeze().numpy()\n else:\n prot = random.randint(5, 15)\n nrot = random.randint(-15, -5)\n rots = [prot, nrot, 0]\n rot = rots[random.randint(0, 2)]\n sc = scales[random.randint(0, 2)]\n _, _, h, w = labels.shape\n tmp = labels.squeeze()\n tmp = tmp.unsqueeze(-1)\n tmp = tmp.numpy().astype(np.uint8)\n morph_p = random.random()\n if morph_p < 0.5:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_OPEN, kernel)" + }, + { + "comment": "The code defines two functions, \"damage_masks\" and \"damage_masks_np\". 
The former applies mask damaging to a batch of input labels while the latter performs the actual mask damaging on individual numpy arrays. These functions can be used to alter the input masks by applying shifts, scales, and rotations. The output is then converted into a tensor for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/mask_damaging.py\":36-72", + "content": " else:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_CLOSE, kernel)\n tmp = tmp.astype(np.uint8)\n center = (w / 2, h / 2)\n M = cv2.getRotationMatrix2D(center, rot, sc)\n final_label = cv2.warpAffine(tmp, M, (w, h), cv2.INTER_NEAREST)\n return final_label\n#####\ndef damage_masks(labels, shift=True, scale=True, rotate=True):\n \"\"\"\n Args:\n labels: numpy array (batch_size * 1 * h * w)\n \"\"\"\n bs, _, h, w = labels.shape\n labels = labels.transpose([0, 2, 3, 1])\n labels = labels.numpy()\n final_label = []\n for i in range(bs):\n label = labels[i]\n damaged_label = damage_masks_np(label, shift, scale, rotate)\n final_label.append(damaged_label)\n final_label = np.array(final_label)\n final_label = paddle.to_tensor(final_label)\n final_label = final_label.transpose([0, 3, 1, 2])\n return final_label\ndef damage_masks_np(labels, shift=True, scale=True, rotate=True):\n \"\"\"Performs the actual mask damaging in numpy.\n Args:\n labels: Int32 numpy array of shape (height, width, 1)." + }, + { + "comment": "This function takes a mask and applies various damage operations such as shifting, scaling, rotation, or dilation to it. It first extracts unique labels from the input mask, shuffles them for random depth ordering, and creates an empty damaged labels array. Then, for each unique label, it applies the single object mask damaging function to the corresponding mask region. The damaged masks are then combined with their original labels to create the final damaged labels array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/mask_damaging.py\":73-97", + "content": " shift: Boolean, whether to damage the masks by shifting.\n scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of labels.\n \"\"\"\n unique_labels = np.unique(labels)\n unique_labels = np.setdiff1d(unique_labels, [0])\n # Shuffle to get random depth ordering when combining together.\n np.random.shuffle(unique_labels)\n damaged_labels = np.zeros_like(labels)\n for l in unique_labels:\n obj_mask = (labels == l)\n damaged_obj_mask = _damage_single_object_mask(obj_mask, shift, scale,\n rotate)\n damaged_labels[damaged_obj_mask] = l\n return damaged_labels\ndef _damage_single_object_mask(mask, shift, scale, rotate):\n \"\"\"Performs mask damaging in numpy for a single object.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n shift: Boolean, whether to damage the masks by shifting." + }, + { + "comment": "This code is from the PaddleVideo library and performs mask damage on a given input mask. The mask can be damaged by shifting, scaling, and/or rotation depending on the provided boolean parameters. The function _shift_mask() shifts the mask randomly based on the maximum shift factor. 
The returned mask is the damaged version of the original input mask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/mask_damaging.py\":98-128", + "content": " scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of mask.\n \"\"\"\n if shift:\n mask = _shift_mask(mask)\n if scale:\n mask = _scale_mask(mask)\n if rotate:\n mask = _rotate_mask(mask)\n return mask\ndef _shift_mask(mask, max_shift_factor=0.05):\n \"\"\"Damages a mask for a single object by randomly shifting it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_shift_factor: Float scalar, the maximum factor for random shifting.\n Returns:\n The shifted version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n h = nzy.max() - nzy.min()\n w = nzx.max() - nzx.min()\n size = np.sqrt(h * w)\n offset = np.random.uniform(-size * max_shift_factor,\n size * max_shift_factor, 2)\n shifted_mask = interpolation.shift(np.squeeze(mask, axis=2),\n offset," + }, + { + "comment": "The code contains three functions: _damage_mask, _scale_mask, and _rotate_mask. These functions are used to randomly damage a mask for a single object in the image. The _damage_mask function applies random noise to the mask by using random values from a uniform distribution and subtracting it from the original mask. The _scale_mask function scales the mask in numpy by applying a scale factor randomly generated within a specific range around 1.0. The _rotate_mask function rotates the mask by a random angle between -max_rot_degrees to max_rot_degrees degrees. These functions can be used together or separately depending on the desired damage to the mask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/mask_damaging.py\":129-154", + "content": " order=0).astype('bool')[..., np.newaxis]\n return shifted_mask\ndef _scale_mask(mask, scale_amount=0.025):\n \"\"\"Damages a mask for a single object by randomly scaling it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n scale_amount: Float scalar, the maximum factor for random scaling.\n Returns:\n The scaled version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n cy = 0.5 * (nzy.max() - nzy.min())\n cx = 0.5 * (nzx.max() - nzx.min())\n scale_factor = np.random.uniform(1.0 - scale_amount, 1.0 + scale_amount)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n s = transform.SimilarityTransform(scale=[scale_factor, scale_factor])\n m = (shift + (s + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask\ndef _rotate_mask(mask, max_rot_degrees=3.0):\n \"\"\"Damages a mask for a single object by randomly rotating it in numpy." 
+ }, + { + "comment": "This function takes a boolean numpy array mask and maximum rotation angle, then returns a scaled version of the mask after applying random rotations and translations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/mask_damaging.py\":155-169", + "content": " Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_rot_degrees: Float scalar, the maximum number of degrees to rotate.\n Returns:\n The scaled version of mask.\n \"\"\"\n cy = 0.5 * mask.shape[0]\n cx = 0.5 * mask.shape[1]\n rot_degrees = np.random.uniform(-max_rot_degrees, max_rot_degrees)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n r = transform.SimilarityTransform(rotation=np.deg2rad(rot_degrees))\n m = (shift + (r + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4074085e-82d4-480b-a2cd-258f08196ddc.json b/docs/doc/4074085e-82d4-480b-a2cd-258f08196ddc.json new file mode 100644 index 000000000..6829fa185 --- /dev/null +++ b/docs/doc/4074085e-82d4-480b-a2cd-258f08196ddc.json @@ -0,0 +1,10 @@ +{ + "summary": "This code contains four functions that utilize the FFmpeg tool for handling video and audio files. \"ffmpeg_frames\" extracts frames from a given MP4 file, \"ffmpeg_pcm\" extracts audio in PCM format, \"ffmpeg_mp4\" downloads an MP4 file, and \"get_images\" lists the images inside a specified image directory.", + "details": [ + { + "comment": "This code contains four functions that utilize the FFmpeg tool for handling video and audio files. \"ffmpeg_frames\" extracts frames from a given MP4 file, \"ffmpeg_pcm\" extracts audio in PCM format, \"ffmpeg_mp4\" downloads an MP4 file, and \"get_images\" lists the images inside a specified image directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/preprocess.py\":0-34", + "content": "\"\"\" extract frames and pcm\"\"\"\nimport os\nimport sys\nimport shutil\ndef ffmpeg_frames(mp4_addr, frame_out_folder, fps=5):\n \"\"\"ffmpeg_frames\"\"\"\n if os.path.exists(frame_out_folder):\n shutil.rmtree(frame_out_folder)\n os.makedirs(frame_out_folder)\n cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (mp4_addr, fps, frame_out_folder, '%08d')\n os.system(cmd)\ndef ffmpeg_pcm(mp4_addr, save_file_name):\n \"\"\"ffmpeg_pcm\"\"\"\n cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \\\n % (mp4_addr, save_file_name)\n os.system(cmd)\ndef ffmpeg_mp4(mp4_url, mp4_addr):\n \"\"\"ffmpeg_mp4\"\"\"\n cmd = \"wget %s -O %s -q\" % (mp4_url, mp4_addr)\n print (\"cmd = \", cmd)\n os.system(cmd)\ndef get_images(image_path):\n \"\"\"get_images\"\"\"\n images = sorted(os.listdir(image_path))\n images = images\n images_path_list = [image_path + '/' + im for im in images]\n return images_path_list" + } + ] +} \ No newline at end of file diff --git a/docs/doc/40c75bd6-da2e-49e5-90a6-02102a291481.json b/docs/doc/40c75bd6-da2e-49e5-90a6-02102a291481.json new file mode 100644 index 000000000..02f6341a0 --- /dev/null +++ b/docs/doc/40c75bd6-da2e-49e5-90a6-02102a291481.json @@ -0,0 +1,20 @@ +{ + "summary": "This Python function uses PaddlePaddle to test models, enabling parallel processing and logging. It initializes the device, constructs model, dataset, and dataloader in test mode with adjustable parameters. 
The code builds a dataloader, loads state_dicts, sets up metrics, and iterates over batches for output or metric updates before accumulating the final result.", + "details": [ + { + "comment": "The code is a Python function for testing a model using PaddlePaddle framework. It takes configuration (cfg) and weights path (weights) as inputs, and allows for parallel processing. The logger captures any log messages from the function execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/test.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddlevideo.utils import get_logger, load\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics import build_metric\nfrom ..modeling.builder import build_model\nlogger = get_logger(\"paddlevideo\")\n@paddle.no_grad()\ndef test_model(cfg, weights, parallel=True):\n \"\"\"Test model entry\n Args:\n cfg (dict): configuration.\n weights (str): weights path to load.\n parallel (bool): Whether to do multi-cards testing. Default: True." + }, + { + "comment": "This code block initializes the model's device, constructs and configures the model, dataset, and dataloader. It also sets test mode and adjusts batch size and number of workers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/test.py\":33-60", + "content": " \"\"\"\n if cfg.get('use_npu', False):\n places = paddle.set_device('npu')\n elif cfg.get('use_xpu', False):\n places = paddle.set_device('xpu')\n else:\n places = paddle.set_device('gpu')\n # 1. Construct model.\n if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'):\n cfg.MODEL.backbone.pretrained = '' # disable pretrain model init\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. Construct dataset and dataloader.\n cfg.DATASET.test.test_mode = True\n dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test))\n batch_size = cfg.DATASET.get(\"test_batch_size\", 8)\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n num_workers = cfg.DATASET.get('test_num_workers', num_workers)\n dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n places=places," + }, + { + "comment": "The code builds a dataloader for the dataset, loads state_dicts into the model, and sets up metrics. It then iterates over batches of data from the dataloader to either update the metric directly or get outputs from the model before updating the metric. 
After processing all batches, it accumulates the final result in the metric.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/test.py\":61-89", + "content": " drop_last=False,\n shuffle=False)\n data_loader = build_dataloader(\n dataset, **dataloader_setting) if cfg.model_name not in ['CFBI'\n ] else dataset\n model.eval()\n state_dicts = load(weights)\n model.set_state_dict(state_dicts)\n # add params to metrics\n cfg.METRIC.data_size = len(dataset)\n cfg.METRIC.batch_size = batch_size\n Metric = build_metric(cfg.METRIC)\n if cfg.MODEL.framework == \"FastRCNN\":\n Metric.set_dataset_info(dataset.info, len(dataset))\n for batch_id, data in enumerate(data_loader):\n if cfg.model_name in [\n 'CFBI'\n ]: # for VOS task, dataset for video and dataloader for frames in each video\n Metric.update(batch_id, data, model)\n else:\n outputs = model(data, mode='test')\n Metric.update(batch_id, data, outputs)\n Metric.accumulate()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/40dbbb97-f039-42fd-a668-0f9b89bc7745.json b/docs/doc/40dbbb97-f039-42fd-a668-0f9b89bc7745.json new file mode 100644 index 000000000..a7e256089 --- /dev/null +++ b/docs/doc/40dbbb97-f039-42fd-a668-0f9b89bc7745.json @@ -0,0 +1,20 @@ +{ + "summary": "TSNHead class is defined for image classification, inheriting from BaseHead with num_classes, in_channels, and loss_cfg parameters. It uses AdaptiveAvgPool2D, Linear, Dropout layers and weight_init function for initialization. The function defines a head for TSN model that performs average pooling, reshapes, takes mean, applies dropout if enabled, and passes through fully connected layer for classification scores.", + "details": [ + { + "comment": "The code is defining a TSNHead class for image classification, which inherits from the BaseHead class. It has parameters for num_classes, in_channels, and loss_cfg, and uses AdaptiveAvgPool2D, Linear, Dropout layers. The weight_init function is also imported for weight initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsn_head.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass TSNHead(BaseHead):\n \"\"\"TSN Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss')." + }, + { + "comment": "Initializes a TSN head with specified parameters, including num_classes, in_channels, loss_cfg, drop_ratio, std, and data_format. It creates an adaptive average pooling layer, a dropout layer if drop_ratio is non-zero, and a fully connected linear layer (fc). 
The fc layer weights are then initialized with normal distribution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsn_head.py\":30-62", + "content": " drop_ratio(float): drop ratio. Default: 0.4.\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n drop_ratio=0.4,\n std=0.01,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.drop_ratio = drop_ratio\n self.std = std\n #NOTE: global pool performance\n self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)\n if self.drop_ratio != 0:\n self.dropout = Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Normal'," + }, + { + "comment": "The function defines a head for the TSN model. It performs average pooling, reshapes the input, takes the mean along an axis, applies dropout if enabled, and passes the result through a fully connected layer to output scores for classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsn_head.py\":63-92", + "content": " 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.,\n std=self.std)\n def forward(self, x, num_seg):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n #XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, in_channels]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n #x = F.softmax(x) #NOTE remove\n return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/413c1454-cc8c-4299-a143-292ac01c3855.json b/docs/doc/413c1454-cc8c-4299-a143-292ac01c3855.json new file mode 100644 index 000000000..8c5bd562b --- /dev/null +++ b/docs/doc/413c1454-cc8c-4299-a143-292ac01c3855.json @@ -0,0 +1,65 @@ +{ + "summary": "This code initializes a reader class for PaddleVideo's MultimodalVideoTag application, preprocesses text, formats input sequences for BERT/ERNIE models, creates Record objects, generates batches with padding, and handles data generation for ERNIE models.", + "details": [ + { + "comment": "This code is for the \"ernie\" reader, a part of PaddleVideo's MultimodalVideoTag application. It includes licensing information and various import statements for different functionalities like file handling, JSON parsing, random number generation, logging, numpy operations, and namedtuple creation. The log variable is initialized for error reporting.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":0-34", + "content": "\"\"\"\nernie reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nfrom __future__ import absolute_import\nimport sys\nimport os\nimport json\nimport random\nimport logging\nimport numpy as np\nimport six\nfrom io import open\nfrom collections import namedtuple\nfrom .tokenization import FullTokenizer, convert_to_unicode\nlog = logging.getLogger(__name__)" + }, + { + "comment": "This code snippet defines a `BaseReader` class which initializes an object with various parameters related to text preprocessing, including maximum sequence length, tokenizer, and other properties. It also includes a utility function `csv_reader` that reads data from files in CSV format. The code adjusts the Python output stream encoding if running on Python 3, ensuring consistent text handling across all outputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":36-73", + "content": "if six.PY3:\n import io\n sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')\n sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')\ndef csv_reader(fd, delimiter='\\t'):\n \"\"\"csv_reader\n \"\"\"\n def gen():\n \"\"\"gen\n \"\"\"\n for i in fd:\n yield i.rstrip('\\n').split(delimiter)\n return gen()\nclass BaseReader(object):\n \"\"\"BaseReader\n \"\"\"\n def __init__(self,\n vocab_path,\n label_map_config=None,\n max_seq_len=512,\n do_lower_case=True,\n in_tokens=False,\n is_inference=False,\n random_seed=None,\n tokenizer=\"FullTokenizer\",\n is_classify=True,\n is_regression=False,\n for_cn=True,\n task_id=0):\n self.max_seq_len = max_seq_len\n self.tokenizer = FullTokenizer(vocab_file=vocab_path,\n do_lower_case=do_lower_case)\n self.vocab = self.tokenizer.vocab" + }, + { + "comment": "This code initializes various attributes of the class and sets up some configurations for tokenizing input data. It also loads a label map from a file if provided, or sets it to None otherwise. 
The \"_truncate_seq_pair\" function truncates sequence pairs in place to the maximum length specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":74-101", + "content": " self.pad_id = self.vocab[\"[PAD]\"]\n self.cls_id = self.vocab[\"[CLS]\"]\n self.sep_id = self.vocab[\"[SEP]\"]\n self.in_tokens = in_tokens\n self.is_inference = is_inference\n self.for_cn = for_cn\n self.task_id = task_id\n np.random.seed(random_seed)\n self.is_classify = is_classify\n self.is_regression = is_regression\n self.current_example = 0\n self.current_epoch = 0\n self.num_examples = 0\n if label_map_config:\n with open(label_map_config, encoding='utf8') as f:\n self.label_map = json.load(f)\n else:\n self.label_map = None\n def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):\n \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n # This is a simple heuristic which will always truncate the longer sequence\n # one token at a time. This makes more sense than truncating an equal percent\n # of tokens from each, since if one sequence is very short then each token" + }, + { + "comment": "This function converts an example into a record. It tokenizes text_a and optionally text_b, then truncates the sequences if they exceed max_seq_length by popping tokens from either tokens_a or tokens_b.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":102-130", + "content": " # that's truncated likely contains more information than a longer sequence.\n while True:\n total_length = len(tokens_a) + len(tokens_b)\n if total_length <= max_length:\n break\n if len(tokens_a) > len(tokens_b):\n tokens_a.pop()\n else:\n tokens_b.pop()\n def _convert_example_to_record(self, example, max_seq_length, tokenizer):\n \"\"\"Converts a single `Example` into a single `Record`.\"\"\"\n text_a = convert_to_unicode(example.text_a)\n tokens_a = tokenizer.tokenize(text_a)\n tokens_b = None\n has_text_b = False\n if isinstance(example, dict):\n has_text_b = \"text_b\" in example.keys()\n else:\n has_text_b = \"text_b\" in example._fields\n if has_text_b:\n text_b = convert_to_unicode(example.text_b)\n tokens_b = tokenizer.tokenize(text_b)\n if tokens_b:\n # Modifies `tokens_a` and `tokens_b` in place so that the total" + }, + { + "comment": "The code ensures that the input sequences for BERT/ERNIE models are formatted correctly. If the sequence length is less than the specified maximum length, it accounts for [CLS], [SEP], and [SEP] tokens with adjustments. If the sequence length exceeds the limit, it truncates the longer token sequence accordingly. The code also assigns type_ids to indicate whether it's the first or second sequence, as these are used in the model's embedding vectors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":131-150", + "content": " # length is less than the specified length.\n # Account for [CLS], [SEP], [SEP] with \"- 3\"\n self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n else:\n # Account for [CLS] and [SEP] with \"- 2\"\n if len(tokens_a) > max_seq_length - 2:\n tokens_a = tokens_a[0:(max_seq_length - 2)]\n # The convention in BERT/ERNIE is:\n # (a) For sequence pairs:\n # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . 
[SEP]\n # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n # (b) For single sequences:\n # tokens: [CLS] the dog is hairy . [SEP]\n # type_ids: 0 0 0 0 0 0 0\n #\n # Where \"type_ids\" are used to indicate whether this is the first\n # sequence or the second sequence. The embedding vectors for `type=0` and\n # `type=1` were learned during pre-training and are added to the wordpiece\n # embedding vector (and position vector). This is not *strictly* necessary" + }, + { + "comment": "This code prepares input data for the ERNIE model by combining tokens from two input sequences (tokens_a and tokens_b) into a single sequence. It appends \"[CLS]\" at the start, \"[SEP]\" to separate the sequences, and assigns text_type_id 0 or 1 based on the source sequence. The code also converts the tokens to token ids and generates position ids for the input data. This is specifically designed for classification tasks where the \"[CLS]\" vector represents the overall sentence vector after fine-tuning the entire model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":151-178", + "content": " # since the [SEP] token unambiguously separates the sequences, but it makes\n # it easier for the model to learn the concept of sequences.\n #\n # For classification tasks, the first vector (corresponding to [CLS]) is\n # used as as the \"sentence vector\". Note that this only makes sense because\n # the entire model is fine-tuned.\n tokens = []\n text_type_ids = []\n tokens.append(\"[CLS]\")\n text_type_ids.append(0)\n for token in tokens_a:\n tokens.append(token)\n text_type_ids.append(0)\n tokens.append(\"[SEP]\")\n text_type_ids.append(0)\n if tokens_b:\n for token in tokens_b:\n tokens.append(token)\n text_type_ids.append(1)\n tokens.append(\"[SEP]\")\n text_type_ids.append(1)\n token_ids = tokenizer.convert_tokens_to_ids(tokens)\n position_ids = list(range(len(token_ids)))\n if self.is_inference:\n Record = namedtuple('Record'," + }, + { + "comment": "This code defines a function to create a \"Record\" object, which contains token_ids, text_type_ids, position_ids (possibly label_id and qid depending on the example). It also includes another function _prepare_batch_data that generates batch records from examples. The batch size and phase are also taken as parameters in this function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":179-206", + "content": " ['token_ids', 'text_type_ids', 'position_ids'])\n record = Record(token_ids=token_ids,\n text_type_ids=text_type_ids,\n position_ids=position_ids)\n else:\n if self.label_map:\n label_id = self.label_map[example.label]\n else:\n label_id = example.label\n Record = namedtuple('Record', [\n 'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'\n ])\n qid = None\n if \"qid\" in example._fields:\n qid = example.qid\n record = Record(token_ids=token_ids,\n text_type_ids=text_type_ids,\n position_ids=position_ids,\n label_id=label_id,\n qid=qid)\n return record\n def _prepare_batch_data(self, examples, batch_size, phase=None):\n \"\"\"generate batch records\"\"\"\n batch_records, max_len = [], 0" + }, + { + "comment": "This code iterates through examples and converts them to records. It then appends the records to a batch and pads the batch with zeros if it reaches the maximum size. 
It yields batches of records, ensuring that each batch is padded to the same length before being passed to the next step in the process. This class inherits from BaseReader and is used for getting Ernie embedding. The method _pad_batch_records pads the batch with zeros if it exceeds the maximum size, ensuring all batches are of equal length.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":207-234", + "content": " for index, example in enumerate(examples):\n if phase == \"train\":\n self.current_example = index\n record = self._convert_example_to_record(example, self.max_seq_len,\n self.tokenizer)\n max_len = max(max_len, len(record.token_ids))\n if self.in_tokens:\n to_append = (len(batch_records) + 1) * max_len <= batch_size\n else:\n to_append = len(batch_records) < batch_size\n if to_append:\n batch_records.append(record)\n else:\n yield self._pad_batch_records(batch_records)\n batch_records, max_len = [record], len(record.token_ids)\n if batch_records:\n yield self._pad_batch_records(batch_records)\nclass ExtractEmbeddingReader(BaseReader):\n \"\"\"\n data prepare for getting erine embedding \n \"\"\"\n def _pad_batch_records(self, batch_records):\n \"\"\"\n \u5bf9\u5b57\u6807\u53f7\uff0c\u4f4d\u7f6e\u6807\u53f7\u7279\u5f81\u8fdb\u884c\u56fa\u5b9a\u957f\u5ea6\u8865\u5168\n batch_records \u5305\u542b\u591a\u6761\u6587\u672c\u7684\u6807\u53f7" + }, + { + "comment": "This code is processing a batch of records and padding token ids, text type ids, position ids, and task ids for an ERNIE (Enhanced Refined Network with Incremental Learning and Exploration) model. The processed data will be used as input for the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":235-256", + "content": " return [\u5b57\u6807\u53f7\u5217\u8868\uff0c\u6587\u672c\u7c7b\u578b\u5217\u8868\uff0c\u4f4d\u7f6e\u7279\u5f81\u5217\u8868\uff0c\u4efb\u52a1\u6807\u53f7\u5217\u8868\uff0c\u63a9\u7801\u5217\u8868]\n \"\"\"\n batch_token_ids = [record.token_ids for record in batch_records]\n batch_text_type_ids = [\n record.text_type_ids for record in batch_records\n ]\n batch_position_ids = [record.position_ids for record in batch_records]\n # padding\n padded_token_ids, input_mask, seq_lens = pad_batch_data(\n batch_token_ids,\n pad_idx=self.pad_id,\n return_input_mask=True,\n return_seq_lens=True,\n max_len=self.max_seq_len)\n padded_text_type_ids = pad_batch_data(batch_text_type_ids,\n pad_idx=self.pad_id,\n max_len=self.max_seq_len)\n padded_position_ids = pad_batch_data(batch_position_ids,\n pad_idx=self.pad_id,\n max_len=self.max_seq_len)\n padded_task_ids = np.ones_like(padded_token_ids," + }, + { + "comment": "This code is related to text processing and data generation for a specific task reader. It converts input texts into indexed representations and pads the data to ensure consistent sequence lengths. The function `data_generate_from_text` takes in a single text, converts it into a record, pads the batch of records, and returns the resulting one-hot encoded text representation. 
The `pad_batch_data` function is used for padding other types of data as well.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":257-288", + "content": " dtype=\"int64\") * self.task_id\n return_list = [\n padded_token_ids, padded_text_type_ids, padded_position_ids,\n padded_task_ids, input_mask\n ]\n return return_list\n def data_generate_from_text(self, text):\n \"\"\"\n trans text to idx\n input single text\n return 5*maxlen*1\n \"\"\"\n Example = namedtuple('Example', ['text_a', 'label'])\n example = Example(text, 0)\n records = [\n self._convert_example_to_record(example, self.max_seq_len,\n self.tokenizer)\n ]\n pad_records = self._pad_batch_records(records)\n text_one_hot = np.concatenate(pad_records, axis=0).astype('int64')\n return text_one_hot\ndef pad_batch_data(insts,\n pad_idx=0,\n max_len=None,\n return_pos=False,\n return_input_mask=False,\n return_max_len=False,\n return_num_token=False," + }, + { + "comment": "This function pads instances to the maximum sequence length in a batch. It first calculates the max_len based on instance lengths and then adds padding to shorter instances if necessary. It creates a 3D tensor of input data, position data (if required), and attention masks (if required). These tensors are added to a return list before being returned by the function. The padding is used to make no effect on parameter gradients by being masked out with weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":289-316", + "content": " return_seq_lens=False):\n \"\"\"\n Pad the instances to the max sequence length in batch, and generate the\n corresponding position data and attention bias.\n \"\"\"\n return_list = []\n if max_len is None:\n max_len = max(len(inst) for inst in insts)\n # Any token included in dict can be used to pad, since the paddings' loss\n # will be masked out by weights and make no effect on parameter gradients.\n inst_data = np.array(\n [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])\n return_list += [inst_data.astype(\"int64\").reshape([-1, max_len, 1])]\n # position data\n if return_pos:\n inst_pos = np.array([\n list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))\n for inst in insts\n ])\n return_list += [inst_pos.astype(\"int64\").reshape([-1, max_len, 1])]\n if return_input_mask:\n # This is used to avoid attention on paddings.\n input_mask_data = np.array(\n [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])" + }, + { + "comment": "This code prepares a return list by adding various elements like input_mask_data, max_len (if required), number of tokens (if required), and sequence lengths (if required) before returning the final list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py\":317-333", + "content": " input_mask_data = np.expand_dims(input_mask_data, axis=-1)\n return_list += [input_mask_data.astype(\"float32\")]\n if return_max_len:\n return_list += [max_len]\n if return_num_token:\n num_token = 0\n for inst in insts:\n num_token += len(inst)\n return_list += [num_token]\n if return_seq_lens:\n seq_lens = np.array([len(inst) for inst in insts])\n return_list += [seq_lens.astype(\"int64\").reshape([-1])]\n return return_list if len(return_list) > 1 else return_list[0]" + } + ] +} \ No newline at end of file diff --git 
a/docs/doc/417ea4e8-07a5-41cb-9904-3076ff6620e1.json b/docs/doc/417ea4e8-07a5-41cb-9904-3076ff6620e1.json new file mode 100644 index 000000000..19e02f8ab --- /dev/null +++ b/docs/doc/417ea4e8-07a5-41cb-9904-3076ff6620e1.json @@ -0,0 +1,15 @@ +{ + "summary": "This Python code defines a SegmentationSampler class in PaddleVideo, which samples data at a specified rate and registers it for the Action Segmentation Dataset. It is part of a video processing library's pipeline, likely for segmentation purposes.", + "details": [ + { + "comment": "This Python code is from the PaddleVideo library and defines a SegmentationSampler class. It samples data at a specified rate, only keeps every nth element in a 1D array, and registers this pipeline operation for the Action Segmentation Dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation_pipline.py\":0-34", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os\nimport numpy as np\nimport random\nimport paddle\nfrom ..registry import PIPELINES\n\"\"\"\npipeline ops for Action Segmentation Dataset.\n\"\"\"\n@PIPELINES.register()\nclass SegmentationSampler(object):\n def __init__(self, sample_rate):\n self.sample_rate = sample_rate\n def __call__(self, results):\n for key, data in results.items():\n if len(data.shape) == 1:\n data = data[::self.sample_rate]" + }, + { + "comment": "This code segment appears to be part of a pipeline in a video processing library, possibly for segmentation. It selects specific data based on the sample rate and stores it in a results dictionary with deep copy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation_pipline.py\":35-39", + "content": " results[key] = copy.deepcopy(data)\n else:\n data = data[:, ::self.sample_rate]\n results[key] = copy.deepcopy(data)\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/41f3e7a8-79e7-42b4-a57e-9533db88921c.json b/docs/doc/41f3e7a8-79e7-42b4-a57e-9533db88921c.json new file mode 100644 index 000000000..2f08dafc6 --- /dev/null +++ b/docs/doc/41f3e7a8-79e7-42b4-a57e-9533db88921c.json @@ -0,0 +1,65 @@ +{ + "summary": "This script prepares environment for benchmarking PaddleVideo model, trains with varying batch sizes and precisions, measures execution time, and processes log files to extract performance metrics.", + "details": [ + { + "comment": "This script is a Bash function for running benchmark training on PaddlePaddle GPU. 
It sets environment variables, parses command line arguments, and executes the benchmark training using the provided configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":0-41", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\n# set env\npython=python\nexport model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d\"/\" -f 3`\nexport model_commit=$(git log|head -n1|awk '{print $2}')\nexport str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)\nexport frame_version=${str_tmp%%.post*}\nexport frame_commit=$(echo `${python} -c \"import paddle;print(paddle.version.commit)\"`)\n# BENCHMARK_ROOT='.' # only for self-test\n# run benchmark sh\n# Usage:\n# bash run_benchmark_train.sh config.txt params\n# or\n# bash run_benchmark_train.sh config.txt\nfunction func_parser_params(){\n strs=$1\n IFS=\"=\"\n array=(${strs})\n tmp=${array[1]}\n echo ${tmp}\n}\nfunction func_sed_params(){\n filename=$1\n line=$2\n param_value=$3\n params=`sed -n \"${line}p\" $filename`\n IFS=\":\"\n array=(${params})\n key=${array[0]}\n value=${array[1]}\n if [[ $value =~ 'benchmark_train' ]];then\n IFS='='\n _val=(${value})\n param_value=\"${param_value}\"\n fi\n new_params=\"${key}:${param_value}\"" + }, + { + "comment": "This code defines functions to modify parameters in a file and set GPU IDs. It then copies the input filename, sets the mode as \"benchmark_train\", and processes additional parameters. The script performs operations such as modifying lines in the file and replacing \"trainer:norm_train\" with \"trainer:to_static_train\". The purpose of this code seems to be related to manipulating configuration files for a program using PaddleVideo's test_tipc directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":42-85", + "content": " IFS=\";\"\n cmd=\"sed -i '${line}s/.*/${new_params}/' '${filename}'\"\n eval $cmd\n}\nfunction set_gpu_id(){\n string=$1\n _str=${string:1:6}\n IFS=\"C\"\n arr=(${_str})\n M=${arr[0]}\n P=${arr[1]}\n gn=`expr $P - 1`\n gpu_num=`expr $gn / $M`\n seq=`seq -s \",\" 0 $gpu_num`\n echo $seq\n}\nfunction get_repo_name(){\n IFS=\";\"\n cur_dir=$(pwd)\n IFS=\"/\"\n arr=(${cur_dir})\n echo ${arr[-1]}\n}\nFILENAME=$1\n# copy FILENAME as new\nnew_filename=\"./test_tipc/benchmark_train.txt\"\ncmd=`yes|cp $FILENAME $new_filename`\nFILENAME=$new_filename\n# MODE must be one of ['benchmark_train']\nMODE=$2\nPARAMS=$3\nREST_ARGS=$4\n# bash test_tipc/benchmark_train.sh /workspace/PaddleVideo/test_tipc/configs/BMN/train_infer_python.txt benchmark_train dynamicTostatic_bs8_fp32_DP_N1C8\nto_static=\"\"\n# parse \"to_static\" options and modify trainer into \"to_static_trainer\"\nif [[ $PARAMS =~ \"dynamicTostatic\" ]] ;then\n to_static=\"d2sT_\"\n sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME\n # clear PARAM contents\n if [ $PARAMS = \"to_static\" ] ;then" + }, + { + "comment": "The code is parsing parameters from the \"train_benchmark.txt\" file and setting variables such as model name, batch size, fp_items, epoch, profile option key, profile option parameters, flags value, and max_iters value for training purposes. 
These values will be used to train a specific model with given parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":86-122", + "content": " PARAMS=\"\"\n fi\nfi\nIFS=$'\\n'\n# parser params from train_benchmark.txt\ndataline=`cat $FILENAME`\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\nmodel_name=$(func_parser_value \"${lines[1]}\")\n# \u83b7\u53d6'train_benchmark_params'\u6240\u5728\u7684\u884c\u6570\nline_num=`grep -n -w \"train_benchmark_params\" $FILENAME | cut -d \":\" -f 1`\n# for train log parser\nbatch_size=$(func_parser_value \"${lines[line_num]}\")\nline_num=`expr $line_num + 1`\nfp_items=$(func_parser_value \"${lines[line_num]}\")\nline_num=`expr $line_num + 1`\nepoch=$(func_parser_value \"${lines[line_num]}\")\nline_num=`expr $line_num + 1`\nprofile_option_key=$(func_parser_key \"${lines[line_num]}\")\nprofile_option_params=$(func_parser_value \"${lines[line_num]}\")\nprofile_option=\"${profile_option_key}:${profile_option_params}\"\nline_num=`expr $line_num + 1`\nflags_value=$(func_parser_value \"${lines[line_num]}\")\n# \u8bbe\u7f6e\u6bcf\u4e2a\u6a21\u578bmax-iters\uff0c\u4ee5\u83b7\u53d6\u7a33\u5b9a\u7684ips\nline_num=`expr $line_num + 1`\nmax_iters_value=$(func_parser_value \"${lines[line_num]}\")\n# set flags\nIFS=\";\"\nflags_list=(${flags_value})\nfor _flag in ${flags_list[*]}; do" + }, + { + "comment": "This code is setting environment variables, defining log file locations and names, and using sed commands to modify a configuration file. It then executes the modified configuration file with additional command line parameters. This is likely part of a benchmarking or training process for machine learning or video processing tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":123-157", + "content": " cmd=\"export ${_flag}\"\n eval $cmd\ndone\n# set log_name\nrepo_name=$(get_repo_name )\nSAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log\nmkdir -p \"${SAVE_LOG}/benchmark_log/\"\nstatus_log=\"${SAVE_LOG}/benchmark_log/results.log\"\n# get benchmark profiling params : PROFILING_TIMER_ONLY=no|True|False\nPROFILING_TIMER_ONLY=${PROFILING_TIMER_ONLY:-\"True\"}\n# The number of lines in which train params can be replaced.\nline_python=3\nline_gpuid=4\nline_precision=6\nline_epoch=7\nline_batchsize=9\nline_profile=12\nline_eval_py=24\nline_eval_py_2=25\nline_export_py=38\nline_export_py_2=28\nline_export_py_3=30\nline_norm_train=16\nfunc_sed_params \"$FILENAME\" \"${line_eval_py}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_eval_py_2}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_export_py}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_export_py_2}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_export_py_3}\" \"null\"\nfunc_sed_params \"$FILENAME\" \"${line_python}\" \"$python\"\n# \u672b\u5c3e\u52a0\u4e0a--max_iters=30\u548c--log_interval=1\uff0c\u4ee5\u4fbf\u8fd0\u884c\u5e76\u8f93\u51fa\u8db3\u91cf\u6570\u636e\nset_log_interval_cmd=\"sed -i '${line_norm_train}s/.*/& --max_iters=${max_iters_value} -o log_interval=1/' '${filename}'\"" + }, + { + "comment": "This code is parsing parameters and configuring the environment for benchmarking. It removes \"validate\" from the command, checks if the input is a dynamic or static parameter, and then assigns variables based on the type of model, batch size, precision, run mode, and device number. 
If the precision is null, it defaults to fp32.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":158-196", + "content": "eval $set_log_interval_cmd\n# \u53bb\u6389--validate\uff0cbenchmark\u4e0d\u9700\u8981validate\nremove_validate_cmd=\"sed -i '${line_norm_train}s/--validate//' '${filename}'\"\neval $remove_validate_cmd\n# if params\nif [ ! -n \"$PARAMS\" ] ;then\n # PARAMS input is not a word.\n IFS=\"|\"\n batch_size_list=(${batch_size})\n fp_items_list=(${fp_items})\n device_num_list=(N1C4)\n run_mode=\"DP\"\nelif [[ ${PARAMS} = \"dynamicTostatic\" ]] ;then\n IFS=\"|\"\n model_type=$PARAMS\n batch_size_list=(${batch_size})\n fp_items_list=(${fp_items})\n device_num_list=(N1C4)\n run_mode=\"DP\"\nelse\n # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}\n IFS=\"_\"\n params_list=(${PARAMS})\n model_type=${params_list[0]}\n batch_size=${params_list[1]}\n batch_size=`echo ${batch_size} | tr -cd \"[0-9]\" `\n precision=${params_list[2]}\n run_mode=${params_list[3]}\n device_num=${params_list[4]}\n IFS=\";\"\n if [ ${precision} = \"null\" ];then\n precision=\"fp32\"\n fi\n fp_items_list=($precision)\n batch_size_list=($batch_size)" + }, + { + "comment": "The code is iterating over different combinations of batch sizes and precisions to train the PaddleVideo model. It sets up various environment variables and uses sed to modify a file before running the training script on specific GPUs. The profile option determines if only timer information should be logged or if full profiling data should be collected.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":197-219", + "content": " device_num_list=($device_num)\nfi\nlog_interval='--log_interval 1'\nIFS=\"|\"\nfor batch_size in ${batch_size_list[*]}; do\n for precision in ${fp_items_list[*]}; do\n for device_num in ${device_num_list[*]}; do\n # sed batchsize and precision\n func_sed_params \"$FILENAME\" \"${line_precision}\" \"$precision\"\n func_sed_params \"$FILENAME\" \"${line_batchsize}\" \"$batch_size\"\n func_sed_params \"$FILENAME\" \"${line_epoch}\" \"$epoch\"\n gpu_id=$(set_gpu_id $device_num)\n if [ ${#gpu_id} -le 1 ];then\n func_sed_params \"$FILENAME\" \"${line_gpuid}\" \"0\" # sed used gpu_id \n if [[ ${PROFILING_TIMER_ONLY} != \"no\" ]];then\n echo \"run profile\"\n # The default value of profile_option's timer_only parameter is True\n if [[ ${PROFILING_TIMER_ONLY} = \"False\" ]];then\n profile_option=\"${profile_option};timer_only=False\"\n fi\n log_path=\"$SAVE_LOG/profiling_log\"" + }, + { + "comment": "Creates a directory for log storage, sets the name of the log file based on various parameters, modifies profile option settings if necessary (for TimeSformer models), and then runs test_train_inference_python.sh script with provided arguments, redirecting output to the specified log path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":220-233", + "content": " mkdir -p $log_path\n log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling\"\n # set profile_option params\n tmp=`sed -i \"${line_profile}s/.*/\\\"${profile_option}\\\"/\" \"${FILENAME}\"`\n # for models which need to accumulate gradient.\n if [[ ${model_name} =~ \"TimeSformer\" ]]; then\n global_bs=`expr ${batch_size} \\* ${device_num:3:4} \\* 8`\n modify_global_bs_cmd=\"sed -i '${line_norm_train}s/.*/& -o 
GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${filename}'\"\n eval $modify_global_bs_cmd\n fi\n # run test_train_inference_python.sh\n cmd=\"timeout 5m bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 \"\n echo $cmd\n eval ${cmd}" + }, + { + "comment": "This code snippet executes a script without profiling. It sets the log and speed log paths, creates directories if necessary, and then runs a command to execute the test_train_inference_python.sh script. The run time is measured and stored in model_run_time variable. Finally, it displays the execution log.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":234-252", + "content": " eval \"cat ${log_path}/${log_name}\"\n fi\n echo \"run without profile\" \n # without profile\n log_path=\"$SAVE_LOG/train_log\"\n speed_log_path=\"$SAVE_LOG/index\"\n mkdir -p $log_path\n mkdir -p $speed_log_path\n log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log\"\n speed_log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed\"\n func_sed_params \"$FILENAME\" \"${line_profile}\" \"null\" # sed profile_id as null\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 \"\n echo $cmd\n job_bt=`date '+%Y%m%d%H%M%S'`\n eval $cmd\n job_et=`date '+%Y%m%d%H%M%S'`\n export model_run_time=$((${job_et}-${job_bt}))\n eval \"cat ${log_path}/${log_name}\"" + }, + { + "comment": "This code section is using Python to execute an analysis script. The analysis script processes log files, extracting performance metrics like inference per second (ips) and loss convergence data. It also handles skipping steps during processing and considers the device used for computation. The resulting status is logged into a specified file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":254-273", + "content": " # parser log\n _model_name=\"${model_name}_bs${batch_size}_${precision}_${run_mode}\"\n cmd=\"${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \\\n --speed_log_file '${speed_log_path}/${speed_log_name}' \\\n --model_name ${_model_name} \\\n --base_batch_size ${batch_size} \\\n --run_mode ${run_mode} \\\n --fp_item ${precision} \\\n --keyword ips: \\\n --skip_steps 5 \\\n --device_num ${device_num} \\\n --speed_unit instance/sec \\\n --convergence_key loss: \"\n echo $cmd\n eval $cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${cmd}\" \"${status_log}\" \"${model_name}\"\n else\n IFS=\";\"\n unset_env=`unset CUDA_VISIBLE_DEVICES`" + }, + { + "comment": "Creates log and speed directories, sets variable names for logging files. 
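The `global_bs` arithmetic in the TimeSformer branch above can be hard to read in bash. The fragment below is a rough Python equivalent, assuming the `N<nodes>C<cards>` naming (e.g. `N1C4`) that the script's `${device_num:3:4}` substring relies on; the multiplier 8 is copied verbatim from the script rather than derived here.

```python
# Rough Python equivalent of: global_bs=`expr ${batch_size} \* ${device_num:3:4} \* 8`
# Assumes device_num strings shaped like "N1C4" (1 node, 4 cards), as used in this script.

def global_batch_size(batch_size: int, device_num: str) -> int:
    cards = int(device_num[3:])      # bash ${device_num:3:4} -> the card count after "N1C"
    return batch_size * cards * 8    # the factor 8 comes straight from benchmark_train.sh

print(global_batch_size(8, "N1C4"))  # 256, appended as GRADIENT_ACCUMULATION.global_batch_size
```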
Uses sed to modify the config file with gpu_id, profile option as null, and adjusts global batch size for TimeSformer model that needs gradient accumulation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":274-287", + "content": " log_path=\"$SAVE_LOG/train_log\"\n speed_log_path=\"$SAVE_LOG/index\"\n mkdir -p $log_path\n mkdir -p $speed_log_path\n log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log\"\n speed_log_name=\"${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed\"\n func_sed_params \"$FILENAME\" \"${line_gpuid}\" \"$gpu_id\" # sed used gpu_id\n func_sed_params \"$FILENAME\" \"${line_profile}\" \"null\" # sed --profile_option as null\n # for models which need to accumulate gradient.\n if [[ ${model_name} =~ \"TimeSformer\" ]]; then\n global_bs=`expr ${batch_size} \\* ${device_num:3:4} \\* 8`\n modify_global_bs_cmd=\"sed -i '${line_norm_train}s/.*/& -o GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${filename}'\"\n eval $modify_global_bs_cmd" + }, + { + "comment": "This code segment is running a benchmark training script and logging the results. It measures the model run time, parses the log to extract information about the speed, and then passes this information to another script for further analysis. The script is designed to handle different batch sizes, precision types, and run modes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":288-307", + "content": " fi\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 \"\n echo $cmd\n job_bt=`date '+%Y%m%d%H%M%S'`\n eval $cmd\n job_et=`date '+%Y%m%d%H%M%S'`\n export model_run_time=$((${job_et}-${job_bt}))\n eval \"cat ${log_path}/${log_name}\"\n # parser log\n _model_name=\"${model_name}_bs${batch_size}_${precision}_${run_mode}\"\n cmd=\"${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \\\n --speed_log_file '${speed_log_path}/${speed_log_name}' \\\n --model_name ${_model_name} \\\n --base_batch_size ${batch_size} \\\n --run_mode ${run_mode} \\\n --fp_item ${precision} \\\n --keyword ips: \\\n --skip_steps 5 \\\n --device_num ${device_num} \\" + }, + { + "comment": "This code iterates through different models and configurations, running them with specified parameters. It logs the commands and checks their status to ensure successful execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/benchmark_train.sh\":308-317", + "content": " --speed_unit instance/sec \\\n --convergence_key loss: \"\n echo $cmd\n eval $cmd\n last_status=${PIPESTATUS[0]}\n status_check $last_status \"${cmd}\" \"${status_log}\" \"${model_name}\"\n fi\n done\n done\ndone" + } + ] +} \ No newline at end of file diff --git a/docs/doc/425817af-113d-48b2-881e-e1becef9b76b.json b/docs/doc/425817af-113d-48b2-881e-e1becef9b76b.json new file mode 100644 index 000000000..822efaaad --- /dev/null +++ b/docs/doc/425817af-113d-48b2-881e-e1becef9b76b.json @@ -0,0 +1,60 @@ +{ + "summary": "The ConvBNLayer class introduces PaddlePaddle's MobileNetV2 backbone model for image/video processing with pretrained weights and inverted residual units. 
It initializes and returns three models (PPTSM_MobileNetV2_x0_75, PPTSM_MobileNetV2_x1_5, PPTSM_MobileNetV2_x2_0).", + "details": [ + { + "comment": "This code is part of the PaddlePaddle deep learning framework, specifically for the MobileNetV2 backbone model. It imports necessary libraries and defines functions for the architecture, weight initialization, and pre-trained model downloading. The commented sections provide licensing information and download URLs for pretrained models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":0-29", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import Conv2D, BatchNorm, Linear, Dropout\nfrom paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\n# Download URL of pretrained model\n# {\n# \"MobileNetV2\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_ssld_pretrained.pdparams\"," + }, + { + "comment": "This code defines the ConvBNLayer class, which inherits from nn.Layer and contains a convolutional layer followed by a batch normalization layer. The constructor takes several parameters such as number of channels, filter size, etc., to define the specifics of the convolutional layer. The URLs provided indicate that pretrained models are available for MobileNetV2 with various scaling factors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":31-57", + "content": "# \"MobileNetV2_x0_25\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams\",\n# \"MobileNetV2_x0_5\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams\",\n# \"MobileNetV2_x0_75\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams\",\n# \"MobileNetV2_x1_5\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams\",\n# \"MobileNetV2_x2_0\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams\"\n# }\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n num_channels,\n filter_size,\n num_filters,\n stride,\n padding,\n channels=None,\n num_groups=1,\n name=None,\n use_cudnn=True):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=num_channels," + }, + { + "comment": "The code defines a class for an inverted residual unit with batch normalization. 
The unit takes input, performs convolution, applies batch normalization, and optionally applies activation if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":58-84", + "content": " out_channels=num_filters,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n groups=num_groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n self._batch_norm = BatchNorm(\n num_filters,\n param_attr=ParamAttr(name=name + \"_bn_scale\"),\n bias_attr=ParamAttr(name=name + \"_bn_offset\"),\n moving_mean_name=name + \"_bn_mean\",\n moving_variance_name=name + \"_bn_variance\")\n def forward(self, inputs, if_act=True):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if if_act:\n y = F.relu6(y)\n return y\nclass InvertedResidualUnit(nn.Layer):\n def __init__(self, num_channels, num_in_filter, num_filters, stride,\n filter_size, padding, expansion_factor, name, num_seg):\n super(InvertedResidualUnit, self).__init__()" + }, + { + "comment": "This code initializes and assigns class attributes for a backbone model. It defines two convolutional layers, one for expansion (num_channels to num_expfilter) and another for bottleneck (num_expfilter to num_expfilter), both followed by BN operations. The layers are named with the prefix \"name\" for future reference or identification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":85-102", + "content": " self.num_seg = num_seg\n num_expfilter = int(round(num_in_filter * expansion_factor))\n self._expand_conv = ConvBNLayer(num_channels=num_channels,\n num_filters=num_expfilter,\n filter_size=1,\n stride=1,\n padding=0,\n num_groups=1,\n name=name + \"_expand\")\n self._bottleneck_conv = ConvBNLayer(num_channels=num_expfilter,\n num_filters=num_expfilter,\n filter_size=filter_size,\n stride=stride,\n padding=padding,\n num_groups=num_expfilter,\n use_cudnn=False,\n name=name + \"_dwise\")" + }, + { + "comment": "This code defines a neural network layer, likely for image or video processing. It contains a series of convolutional layers and activation functions. The \"forward\" function applies temporal shift to the input based on the number of segments and performs convolutions in different stages. The \"InvresiBlocks\" class defines an Inverted Residual block with initial parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":104-131", + "content": " self._linear_conv = ConvBNLayer(num_channels=num_expfilter,\n num_filters=num_filters,\n filter_size=1,\n stride=1,\n padding=0,\n num_groups=1,\n name=name + \"_linear\")\n def forward(self, inputs, ifshortcut):\n # add temporal shift module\n y = inputs\n if ifshortcut:\n y = F.temporal_shift(y, self.num_seg, 1.0 / self.num_seg)\n y = self._expand_conv(y, if_act=True)\n y = self._bottleneck_conv(y, if_act=True)\n y = self._linear_conv(y, if_act=False)\n if ifshortcut:\n y = paddle.add(inputs, y)\n return y\nclass InvresiBlocks(nn.Layer):\n def __init__(self, in_c, t, c, n, s, name, num_seg):\n super(InvresiBlocks, self).__init__()\n self._first_block = InvertedResidualUnit(num_channels=in_c,\n num_in_filter=in_c," + }, + { + "comment": "The code defines a function for the PPTSM_MV2 model, creating an InvertedResidualUnit with specified parameters and adding it to a list. 
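Because the forward pass shown here leans on `F.temporal_shift`, a rough NumPy illustration of the temporal-shift idea may help. This is only a conceptual sketch (one channel slice shifted forward and one backward along the segment axis), not Paddle's kernel or its exact argument semantics.

```python
# Conceptual NumPy sketch of a TSM-style temporal shift on an (N*T, C, H, W) tensor.
# Illustrates the idea behind F.temporal_shift; not Paddle's implementation.
import numpy as np

def temporal_shift(x: np.ndarray, num_seg: int, shift_div: int) -> np.ndarray:
    nt, c, h, w = x.shape
    n = nt // num_seg
    x = x.reshape(n, num_seg, c, h, w)
    fold = c // shift_div                                   # channels moved in each direction
    out = np.zeros_like(x)
    out[:, 1:, :fold] = x[:, :-1, :fold]                    # first slice shifted forward in time
    out[:, :-1, fold:2 * fold] = x[:, 1:, fold:2 * fold]    # second slice shifted backward
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]               # remaining channels stay in place
    return out.reshape(nt, c, h, w)

x = np.random.rand(2 * 8, 16, 4, 4).astype("float32")       # 2 clips, 8 segments each
print(temporal_shift(x, num_seg=8, shift_div=8).shape)      # (16, 16, 4, 4)
```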
The loop iterates from 1 to n-1, building multiple residual units with increasing indexes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":132-150", + "content": " num_filters=c,\n stride=s,\n filter_size=3,\n padding=1,\n expansion_factor=t,\n name=name + \"_1\",\n num_seg=num_seg)\n self._block_list = []\n for i in range(1, n):\n block = self.add_sublayer(name + \"_\" + str(i + 1),\n sublayer=InvertedResidualUnit(\n num_channels=c,\n num_in_filter=c,\n num_filters=c,\n stride=1,\n filter_size=3,\n padding=1,\n expansion_factor=t," + }, + { + "comment": "This code defines a PPTSM-MV2 backbone and MobileNet model for image processing. The `__init__` function initializes the model with class number, scaling factor, pretrained weights, prefix name, and number of segments. The `forward` function passes inputs through each block in sequence. The `MobileNet` class defines a convolutional neural network (CNN) architecture with specific parameters for each stage, including the number of filters and stride sizes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":151-186", + "content": " name=name + \"_\" + str(i + 1),\n num_seg=num_seg))\n self._block_list.append(block)\n def forward(self, inputs):\n y = self._first_block(inputs, ifshortcut=False)\n for block in self._block_list:\n y = block(y, ifshortcut=True)\n return y\nclass MobileNet(nn.Layer):\n def __init__(self,\n class_num=400,\n scale=1.0,\n pretrained=None,\n prefix_name=\"\",\n num_seg=8):\n super(MobileNet, self).__init__()\n self.scale = scale\n self.class_num = class_num\n self.pretrained = pretrained\n self.num_seg = num_seg\n bottleneck_params_list = [\n (1, 16, 1, 1),\n (6, 24, 2, 2),\n (6, 32, 3, 2),\n (6, 64, 4, 2),\n (6, 96, 3, 1),\n (6, 160, 3, 2),\n (6, 320, 1, 1),\n ]\n self.conv1 = ConvBNLayer(num_channels=3,\n num_filters=int(32 * scale)," + }, + { + "comment": "This code initializes a PPTSM_MV2 backbone model. It adds a convolution layer with specific parameters, and creates a list of block layers using InvresiBlocks with varying settings. The scale value affects the number of input channels in each layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":187-206", + "content": " filter_size=3,\n stride=2,\n padding=1,\n name=prefix_name + \"conv1_1\")\n self.block_list = []\n i = 1\n in_c = int(32 * scale)\n for layer_setting in bottleneck_params_list:\n t, c, n, s = layer_setting\n i += 1\n block = self.add_sublayer(prefix_name + \"conv\" + str(i),\n sublayer=InvresiBlocks(in_c=in_c,\n t=t,\n c=int(c * scale),\n n=n,\n s=s,\n name=prefix_name +\n \"conv\" + str(i),\n num_seg=num_seg))" + }, + { + "comment": "This code defines a class, appends blocks to block_list, sets output channels based on scale factor, initializes convolution and pooling layers, and defines an initialization function for the weights. 
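To see how the `scale` argument changes the layer widths configured in `__init__`, here is a short illustrative calculation using the `(t, c, n, s)` tuples and the `int(1280 * scale) if scale > 1.0 else 1280` rule taken from the source above.

```python
# Illustrative width calculation for the MobileNetV2-style backbone described above.
bottleneck_params_list = [
    (1, 16, 1, 1), (6, 24, 2, 2), (6, 32, 3, 2), (6, 64, 4, 2),
    (6, 96, 3, 1), (6, 160, 3, 2), (6, 320, 1, 1),
]

def stage_widths(scale: float):
    widths = [int(32 * scale)]                               # conv1_1 output channels
    widths += [int(c * scale) for _, c, _, _ in bottleneck_params_list]
    out_c = int(1280 * scale) if scale > 1.0 else 1280       # conv9 / classifier input width
    return widths, out_c

print(stage_widths(1.0))   # ([32, 16, 24, 32, 64, 96, 160, 320], 1280)
print(stage_widths(0.75))  # ([24, 12, 18, 24, 48, 72, 120, 240], 1280)
```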
It seems to be part of a deep learning model backbone implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":207-231", + "content": " self.block_list.append(block)\n in_c = int(c * scale)\n self.out_c = int(1280 * scale) if scale > 1.0 else 1280\n self.conv9 = ConvBNLayer(num_channels=in_c,\n num_filters=self.out_c,\n filter_size=1,\n stride=1,\n padding=0,\n name=prefix_name + \"conv9\")\n self.pool2d_avg = AdaptiveAvgPool2D(1)\n self.out = Linear(self.out_c,\n class_num,\n weight_attr=ParamAttr(name=prefix_name +\n \"fc10_weights\"),\n bias_attr=ParamAttr(name=prefix_name + \"fc10_offset\"))\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":" + }, + { + "comment": "Initializes a PPTSM MobileNetV2 model with optional pretrained weights and customizable scale. Initializes the underlying MobileNet model and applies custom modifications. Iterates through sublayers, applying Kaiming Normal initialization for Conv2D layers and constant initialization for BatchNorm2D layers. Defines the forward pass of the PPTSM MobileNetV2 model. Registers multiple PPTSM MobileNetV2 variants with different scales.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":232-265", + "content": " for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n y = self.conv1(inputs, if_act=True)\n for block in self.block_list:\n y = block(y)\n y = self.conv9(y, if_act=True)\n y = self.pool2d_avg(y)\n y = paddle.reshape(y, [-1, self.num_seg, y.shape[1]])\n y = paddle.mean(y, axis=1)\n y = paddle.reshape(y, shape=[-1, self.out_c])\n y = self.out(y)\n return y\n@BACKBONES.register()\ndef PPTSM_MobileNetV2(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=1.0, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x0_25(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=0.25, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x0_5(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=0.5, **kwargs)" + }, + { + "comment": "The code defines three functions, PPTSM_MobileNetV2_x0_75, PPTSM_MobileNetV2_x1_5, and PPTSM_MobileNetV2_x2_0. Each function creates a MobileNet model with different scales (0.75, 1.5, and 2.0) using the MobileNet class. The pretrained option allows for loading pre-trained weights if set to True. 
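The reshape-and-mean at the end of `forward` is the usual TSM-style segment consensus. A minimal NumPy sketch of that step, assuming pooled features of shape `(N * num_seg, C)`, is shown below.

```python
# Minimal sketch of the segment-consensus step in MobileNet.forward:
#   reshape (N*num_seg, C) -> (N, num_seg, C), then average over the segment axis.
import numpy as np

def segment_consensus(features: np.ndarray, num_seg: int) -> np.ndarray:
    n_times_t, channels = features.shape
    clips = features.reshape(-1, num_seg, channels)   # group the num_seg frames of each clip
    return clips.mean(axis=1)                         # one feature vector per video clip

pooled = np.random.rand(4 * 8, 1280).astype("float32")   # 4 clips x 8 segments, 1280-d features
print(segment_consensus(pooled, num_seg=8).shape)         # (4, 1280)
```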
The functions return the created models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py\":266-281", + "content": " return model\ndef PPTSM_MobileNetV2_x0_75(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=0.75, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x1_5(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=1.5, **kwargs)\n return model\ndef PPTSM_MobileNetV2_x2_0(pretrained=None, **kwargs):\n model = MobileNet(pretrained=pretrained, scale=2.0, **kwargs)\n return model" + } + ] +} \ No newline at end of file diff --git a/docs/doc/42da3399-dbce-42b4-8259-d88277b5384d.json b/docs/doc/42da3399-dbce-42b4-8259-d88277b5384d.json new file mode 100644 index 000000000..11ead2a34 --- /dev/null +++ b/docs/doc/42da3399-dbce-42b4-8259-d88277b5384d.json @@ -0,0 +1,10 @@ +{ + "summary": "This code provides instructions on how to download and organize the DAVIS2017 dataset for use in the Ma-Net application. It includes links to the necessary datasets, such as the DAVIS-2017-trainval-480p.zip and DAVIS-2017-scribbles-trainval.zip files, and provides a template directory structure for organizing the data within the PaddleVideo project.", + "details": [ + { + "comment": "This code provides instructions on how to download and organize the DAVIS2017 dataset for use in the Ma-Net application. It includes links to the necessary datasets, such as the DAVIS-2017-trainval-480p.zip and DAVIS-2017-scribbles-trainval.zip files, and provides a template directory structure for organizing the data within the PaddleVideo project.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/DAVIS2017_cn.md\":0-26", + "content": "[English](../../en/dataset/DAVIS2017.md) | \u7b80\u4f53\u4e2d\u6587\n# DAVIS2017 \u6570\u636e\u96c6\u51c6\u5907\n## 1.\u6570\u636e\u4e0b\u8f7d\n\u4e0b\u8f7d [DAVIS2017](https://data.vision.ee.ethz.ch/csergi/share/davis/DAVIS-2017-trainval-480p.zip) \u548c [scribbles](https://data.vision.ee.ethz.ch/csergi/share/DAVIS-Interactive/DAVIS-2017-scribbles-trainval.zip)\u5230\u540c\u4e00\u4e2a\u6587\u4ef6\u5939\u4e2d\u3002\u8bf7\u53c2\u9605[DAVIS](https://davischallenge.org/davis2017/code.html).\n\u5982\u679c\u60a8\u9700\u8981\u6587\u4ef6\"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\"\uff0c\u8bf7\u53c2\u9605[google](https://drive.google.com/file/d/1aLPaQ_5lyAi3Lk3d2fOc_xewSrfcrQlc/view?usp=sharing)\u94fe\u63a5\n## 2.\u76ee\u5f55\u7ed3\u6784\n\u6574\u4e2a\u9879\u76ee(Ma-Net)\u7684\u76ee\u5f55\u7ed3\u6784\u5982\u4e0b\u6240\u793a\uff1a\n```shell\nPaddleVideo\n\u251c\u2500\u2500 configs\n\u251c\u2500\u2500 paddlevideo\n\u251c\u2500\u2500 docs\n\u251c\u2500\u2500 tools\n\u251c\u2500\u2500 data\n\u2502 \t\u2514\u2500\u2500 DAVIS2017\n\u2502 \u2502 \t\u251c\u2500\u2500 Annotations\n\u2502 \u2502 \t\u251c\u2500\u2500 ImageSets\n\u2502 \u2502 \t\u251c\u2500\u2500 JPEGImages\n\u2502 \u2502 \t\u2514\u2500\u2500 Scribbles\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/42efd6d7-5026-4a7c-b967-1321ca165543.json b/docs/doc/42efd6d7-5026-4a7c-b967-1321ca165543.json new file mode 100644 index 000000000..c418985cb --- /dev/null +++ b/docs/doc/42efd6d7-5026-4a7c-b967-1321ca165543.json @@ -0,0 +1,35 @@ +{ + "summary": "The code is a Paddle Video inference script with argparse handling and an \"InferModel\" class for model loading and prediction. It supports GPU usage, multimodal video tagging, and customizable parameters. 
The inference function takes videos, labels, predicts, and outputs results to a JSON file.", + "details": [ + { + "comment": "This code is a Paddle Video inference script for a specific model. It parses arguments such as the model name, config file path, output path, use_gpu, and save_inference_model flag. The script utilizes argparse to handle these command-line arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py\":0-37", + "content": "#!/usr/bin/env python\n# coding=utf-8\n\"\"\"\ninfer model\n\"\"\"\nimport sys\nimport os\nimport numpy as np\nimport json\nimport pickle\nimport argparse\nimport time\nimport numpy as np\nimport paddle\nfrom datareader import get_reader\nfrom config import merge_configs, parse_config, print_configs\ndef parse_args():\n \"\"\"parse_args\n \"\"\"\n parser = argparse.ArgumentParser(\"Paddle Video infer script\")\n parser.add_argument('--model_name',\n type=str,\n default='BaiduNet',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/conf.txt',\n help='path to config file of model')\n parser.add_argument('--output', type=str, default=None, help='output path')\n parser.add_argument('--use_gpu',\n type=bool,\n default=True,\n help='default use gpu.')\n parser.add_argument('--save_inference_model'," + }, + { + "comment": "The code is defining a class \"InferModel\" which initializes variables such as name, threshold, and label_map. It also defines a method \"load_inference_model\" that loads the model file, configures GPU usage, and creates a predictor for inference. The code takes input arguments like model_dir, use_gpu, and other configuration parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py\":38-68", + "content": " type=str,\n default=None,\n help='save inference path')\n args = parser.parse_args()\n return args\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'): \n name = name.upper()\n self.name = name\n self.threshold = cfg.INFER.threshold\n self.cfg = cfg\n self.label_map = load_class_file(cfg.MODEL.class_name_file)\n def load_inference_model(self, model_dir, use_gpu=True):\n \"\"\"model_init\n \"\"\"\n model_file = os.path.join(model_dir, \"model\")\n params_file = os.path.join(model_dir, \"params\")\n config = paddle.inference.Config(model_file, params_file)\n if use_gpu:\n config.enable_use_gpu(1024)\n else:\n config.disable_gpu()\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = paddle.inference.create_predictor(config)" + }, + { + "comment": "This code initializes input and output tensors for a multimodal video tagging scenario. It builds the input tensor from RGB frames, audio data, and text data, and retrieves the output tensor. 
The `preprocess_for_lod_data` function converts input data into a list of arrays with length indicators (LOD) for efficient handling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py\":69-97", + "content": " # build input tensor and output tensor\n self.build_input_output()\n def build_input_output(self):\n \"\"\"build_input_output\n \"\"\"\n input_names = self.predictor.get_input_names()\n # input\n self.input_rgb_tensor = self.predictor.get_input_handle(input_names[0])\n self.input_audio_tensor = self.predictor.get_input_handle(input_names[1])\n self.input_text_tensor = self.predictor.get_input_handle(input_names[2])\n # output\n output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def preprocess_for_lod_data(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n return input_arr, [input_lod]" + }, + { + "comment": "In this code, a function named \"predict\" is defined. It uses a reader to process data for inference and iterates through each instance of the data. The instances contain RGB images, audio, text, and video IDs. These instances are preprocessed for LOD (Level Of Detail) data using the preprocess_for_lod_data method. The preprocessed data is then copied to corresponding input tensors (RGB, audio, and text) for inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py\":99-121", + "content": " def predict(self):\n \"\"\"predict\"\"\"\n infer_reader = get_reader(self.name, 'infer', self.cfg)\n probs = []\n video_ids = []\n label_map_inverse = {value: key for key, value in self.label_map.items()}\n for infer_iter, data in enumerate(infer_reader()):\n # video_id = [[items[-2], items[-1]] for items in data]\n rgb = [items[0] for items in data]\n audio = [items[1] for items in data]\n text = np.array([items[2] for items in data])\n videos = np.array([items[3] for items in data])\n rgb_arr, rgb_lod = self.preprocess_for_lod_data(rgb)\n audio_arr, audio_lod = self.preprocess_for_lod_data(audio)\n self.input_rgb_tensor.copy_from_cpu(rgb_arr.astype('float32'))\n self.input_rgb_tensor.set_lod(rgb_lod)\n self.input_audio_tensor.copy_from_cpu(audio_arr.astype('float32'))\n self.input_audio_tensor.set_lod(audio_lod)\n self.input_text_tensor.copy_from_cpu(text.astype('int64'))" + }, + { + "comment": "The code defines an inference function that takes a set of videos and their associated labels, and returns the predicted labels for each video. It also includes functions to load class information from a file and parse command-line arguments. The inference function first runs the predictor on the input data, then extracts the output probabilities and corresponding video IDs. It checks that the number of video IDs matches the number of probabilities. Then, for each video-probability pair, it identifies the indices where the probability is above a certain threshold and uses these to determine the predicted labels. 
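A compact NumPy sketch of the two steps just described (flattening variable-length sequences into one array plus LoD offsets, and turning per-class probabilities into labels with a threshold) may make the data flow clearer; the sample inputs and label names are made up for illustration.

```python
# Sketch of preprocess_for_lod_data and the thresholding step described above.
import numpy as np

def to_lod(batch):
    """Flatten a list of variable-length feature sequences and record LoD offsets."""
    lod = [0]
    rows = []
    for seq in batch:
        rows.extend(seq)
        lod.append(lod[-1] + len(seq))
    return np.array(rows, dtype="float32"), [lod]

rgb_batch = [np.ones((3, 2048)), np.ones((5, 2048))]      # two videos, 3 and 5 frames
arr, lod = to_lod(rgb_batch)
print(arr.shape, lod)                                      # (8, 2048) [[0, 3, 8]]

# Multi-label decoding: keep every class whose probability clears the threshold.
label_map_inverse = {0: "dance", 1: "music", 2: "sports"}  # illustrative names only
prob = np.array([0.9, 0.2, 0.75])
threshold = 0.5
labels = [(label_map_inverse[i], float(prob[i])) for i in np.where(prob >= threshold)[0]]
print(labels)                                              # [('dance', 0.9), ('sports', 0.75)]
```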
The resulting dictionary contains the video ID, as well as a list of tuples containing the label name and corresponding probability for each detected label.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py\":123-160", + "content": " self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n probs.extend(list(output))\n video_ids.extend(videos)\n assert len(video_ids) == len(probs)\n result = []\n for video_id, prob in zip(video_ids, probs):\n label_idx = list(np.where(prob >= self.threshold)[0])\n result.append({\n \"video_id\": video_id,\n \"labels\": [\n (label_map_inverse[str(idx)], float(prob[idx])) for idx in label_idx\n ]\n })\n return result\ndef load_class_file(class_file):\n \"\"\"\n load_class_file\n \"\"\"\n class_lines = open(class_file, 'r', encoding='utf8').readlines()\n class_dict = {}\n for i, line in enumerate(class_lines):\n tmp = line.strip().split('\\t')\n word = tmp[0]\n index = str(i)\n if len(tmp) == 2:\n index = tmp[1]\n class_dict[word] = index\n return class_dict\ndef infer(args):\n \"\"\"\n infer main\n \"\"\"\n config = parse_config(args.config)" + }, + { + "comment": "This code snippet is from the \"inference.py\" file in the PaddleVideo MultimodalVideoTag application. It defines a function `infer` that performs model inference on input data and outputs the results. The code merges configs for the infer stage, prints them out, creates an InferModel object with those configs, loads the inference model from a given file (if provided), runs inference, and finally saves the results to a JSON file if requested.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py\":161-172", + "content": " infer_config = merge_configs(config, 'infer', vars(args))\n print_configs(infer_config, 'infer')\n infer_obj = InferModel(infer_config, name=args.model_name)\n infer_obj.load_inference_model(args.save_inference_model, use_gpu=args.use_gpu)\n rt = infer_obj.predict()\n if args.output:\n with open(args.output, 'w') as f:\n json.dump(rt, f, ensure_ascii=False, indent=4)\nif __name__ == \"__main__\":\n args = parse_args()\n infer(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/437b6fc8-ae66-4984-9f7e-ab5d28eb045c.json b/docs/doc/437b6fc8-ae66-4984-9f7e-ab5d28eb045c.json new file mode 100644 index 000000000..352093176 --- /dev/null +++ b/docs/doc/437b6fc8-ae66-4984-9f7e-ab5d28eb045c.json @@ -0,0 +1,80 @@ +{ + "summary": "This code imports libraries, defines a class for metrics calculation, retrieves data from ActivityNet API, compares results, creates DataFrames, evaluates proposals using AUC-RC, and calculates average recall. It extracts videos, computes proposal scores, IOU scores, handles exceptions, determines recall with thresholds, and efficiently computes IoU for target and candidate segments.", + "details": [ + { + "comment": "This code imports necessary libraries, defines a class for calculating AR@N and AUC, and sets the API URL for accessing ActivityNet data. The class uses ground truth fields and proposal fields to compare results. Code is transferred from the ActivityNet GitHub repository.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":0-28", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport json\nimport numpy as np\nimport pandas as pd\nimport urllib.request as urllib2\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\nclass ANETproposal(object):\n \"\"\"\n This class is used for calculating AR@N and AUC;\n Code transfer from ActivityNet Gitub repository](https://github.com/activitynet/ActivityNet.git)\n \"\"\"\n GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']\n PROPOSAL_FIELDS = ['results', 'version', 'external_data']\n API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py'" + }, + { + "comment": "Initializing the class with ground truth and proposal filenames as required, setting default parameters, and checking if both files exist.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":30-53", + "content": " def __init__(self,\n ground_truth_filename=None,\n proposal_filename=None,\n ground_truth_fields=GROUND_TRUTH_FIELDS,\n proposal_fields=PROPOSAL_FIELDS,\n tiou_thresholds=np.linspace(0.5, 0.95, 10),\n max_avg_nr_proposals=None,\n subset='validation',\n verbose=False,\n check_status=True):\n if not ground_truth_filename:\n raise IOError('Please input a valid ground truth file.')\n if not proposal_filename:\n raise IOError('Please input a valid proposal file.')\n self.subset = subset\n self.tiou_thresholds = tiou_thresholds\n self.max_avg_nr_proposals = max_avg_nr_proposals\n self.verbose = verbose\n self.gt_fields = ground_truth_fields\n self.pred_fields = proposal_fields\n self.recall = None\n self.avg_recall = None\n self.proposals_per_video = None\n self.check_status = check_status" + }, + { + "comment": "This code retrieves blocked videos from a server, imports ground truth and proposals, and checks if the ground truth file is well formatted. It also prints information about the number of ground truth instances and proposals, as well as the fixed threshold for tiou score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":54-76", + "content": " # Retrieve blocked videos from server.\n if self.check_status:\n self.blocked_videos = self.get_blocked_videos()\n else:\n self.blocked_videos = list()\n # Import ground truth and proposals.\n self.ground_truth, self.activity_index = self._import_ground_truth(\n ground_truth_filename)\n self.proposal = self._import_proposal(proposal_filename)\n if self.verbose:\n print('[INIT] Loaded annotations from {} subset.'.format(subset))\n nr_gt = len(self.ground_truth)\n print('\\tNumber of ground truth instances: {}'.format(nr_gt))\n nr_pred = len(self.proposal)\n print('\\tNumber of proposals: {}'.format(nr_pred))\n print('\\tFixed threshold for tiou score: {}'.format(\n self.tiou_thresholds))\n def _import_ground_truth(self, ground_truth_filename):\n \"\"\"\n Reads ground truth file, checks if it is well formatted, and returns\n the ground truth instances and the activity classes." 
+ }, + { + "comment": "This function reads a ground truth JSON file and returns a DataFrame containing the instances. It also returns a dictionary of class indices. The function checks if the input file has the required fields, skips videos not in the specified subset, and ignores blocked videos. If an activity label is not found in the activity_index, it adds it to the index and increments the counter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":78-101", + "content": " Parameters:\n ground_truth_filename (str): full path to the ground truth json file.\n Returns:\n ground_truth (df): Data frame containing the ground truth instances.\n activity_index (dict): Dictionary containing class index.\n \"\"\"\n with open(ground_truth_filename, 'r') as fobj:\n data = json.load(fobj)\n # Checking format\n if not all([field in data.keys() for field in self.gt_fields]):\n raise IOError('Please input a valid ground truth file.')\n # Read ground truth data.\n activity_index, cidx = {}, 0\n video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], []\n for videoid, v in data['database'].items():\n if self.subset != v['subset']:\n continue\n if videoid in self.blocked_videos:\n continue\n for ann in v['annotations']:\n if ann['label'] not in activity_index:\n activity_index[ann['label']] = cidx\n cidx += 1" + }, + { + "comment": "The code reads a proposal file, checks its format and returns proposal instances in the form of a data frame. It also generates ground truth data by appending video IDs, start and end times, and labels to lists before creating a DataFrame. The function takes a string as input for the full path to the proposal JSON file and returns a data frame containing the proposal instances.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":102-129", + "content": " video_lst.append(videoid)\n t_start_lst.append(float(ann['segment'][0]))\n t_end_lst.append(float(ann['segment'][1]))\n label_lst.append(activity_index[ann['label']])\n ground_truth = pd.DataFrame({\n 'video-id': video_lst,\n 't-start': t_start_lst,\n 't-end': t_end_lst,\n 'label': label_lst\n })\n return ground_truth, activity_index\n def _import_proposal(self, proposal_filename):\n \"\"\"\n Reads proposal file, checks if it is well formatted, and returns\n the proposal instances.\n Parameters:\n proposal_filename (str): Full path to the proposal json file.\n Returns:\n proposal (df): Data frame containing the proposal instances.\n \"\"\"\n with open(proposal_filename, 'r') as fobj:\n data = json.load(fobj)\n # Checking format...\n if not all([field in data.keys() for field in self.pred_fields]):\n raise IOError('Please input a valid proposal file.')" + }, + { + "comment": "The code reads predictions from a data source, extracts relevant information (video IDs, start and end timestamps, scores), stores them in a DataFrame, and defines two functions: one for evaluating proposal files by computing area under the average recall vs average number of proposals per video curve. 
The evaluation function calls another function to compute this metric using ground truth data and the stored proposal data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":131-157", + "content": " # Read predictions.\n video_lst, t_start_lst, t_end_lst = [], [], []\n score_lst = []\n for videoid, v in data['results'].items():\n if videoid in self.blocked_videos:\n continue\n for result in v:\n video_lst.append(videoid)\n t_start_lst.append(float(result['segment'][0]))\n t_end_lst.append(float(result['segment'][1]))\n score_lst.append(result['score'])\n proposal = pd.DataFrame({\n 'video-id': video_lst,\n 't-start': t_start_lst,\n 't-end': t_end_lst,\n 'score': score_lst\n })\n return proposal\n def evaluate(self):\n \"\"\"\n Evaluates a proposal file. To measure the performance of a\n method for the proposal task, we computes the area under the\n average recall vs average number of proposals per video curve.\n \"\"\"\n recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals(\n self.ground_truth," + }, + { + "comment": "Calculates the area under the curve of recall vs average number of proposals for ActivityNet proposal task, writes result to file and stores recall, average recall, and proposals per video in class attributes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":158-181", + "content": " self.proposal,\n max_avg_nr_proposals=self.max_avg_nr_proposals,\n tiou_thresholds=self.tiou_thresholds)\n area_under_curve = np.trapz(avg_recall, proposals_per_video)\n if self.verbose:\n print('[RESULTS] Performance on ActivityNet proposal task.')\n with open(\"data/bmn/BMN_Test_results/auc_result.txt\",\n \"a\") as text_file:\n text_file.write(\n '\\tArea Under the AR vs AN curve: {}% \\n'.format(\n 100. * float(area_under_curve) /\n proposals_per_video[-1]))\n print('\\tArea Under the AR vs AN curve: {}%'.format(\n 100. * float(area_under_curve) / proposals_per_video[-1]))\n self.recall = recall\n self.avg_recall = avg_recall\n self.proposals_per_video = proposals_per_video\n def average_recall_vs_avg_nr_proposals(self,\n ground_truth,\n proposals," + }, + { + "comment": "This code defines a function that computes average recall for given average number of proposals per video. It takes ground truth and proposal data frames as input, along with optional tiou_thresholds. It returns recall and average_recall arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":182-201", + "content": " max_avg_nr_proposals=None,\n tiou_thresholds=np.linspace(\n 0.5, 0.95, 10)):\n \"\"\"\n Computes the average recall given an average number of\n proposals per video.\n Parameters:\n ground_truth(df): Data frame containing the ground truth instances.\n Required fields: ['video-id', 't-start', 't-end']\n proposal(df): Data frame containing the proposal instances.\n Required fields: ['video-id, 't-start', 't-end', 'score']\n tiou_thresholds(1d-array | optional): array with tiou thresholds.\n Returns:\n recall(2d-array): recall[i,j] is recall at ith tiou threshold at the jth\n average number of average number of proposals per video.\n average_recall(1d-array): recall averaged over a list of tiou threshold.\n This is equivalent to recall.mean(axis=0).\n proposals_per_video(1d-array): average number of proposals per video." 
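The headline number reported by `evaluate()` is the trapezoidal area under the AR-vs-AN curve, normalised by the largest proposal count. A minimal NumPy sketch with invented values:

```python
# Minimal sketch of the AUC computation in ANETproposal.evaluate():
#   area = np.trapz(avg_recall, proposals_per_video), reported as a percentage of the
#   final x-value. The numbers below are invented purely for illustration.
import numpy as np

proposals_per_video = np.array([1.0, 10.0, 50.0, 100.0])   # average proposals per video (x-axis)
avg_recall = np.array([0.20, 0.45, 0.62, 0.68])             # recall averaged over tIoU thresholds

area_under_curve = np.trapz(avg_recall, proposals_per_video)
auc_percent = 100.0 * float(area_under_curve) / proposals_per_video[-1]
print(f"Area under the AR vs AN curve: {auc_percent:.2f}%")
```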
+ }, + { + "comment": "This code retrieves a list of videos, calculates the maximum average number of proposals per video, groups proposals and ground truth by video ID, and then computes Tiou scores between ground-truth instances and retrieved proposals for each video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":202-230", + "content": " \"\"\"\n # Get list of videos.\n video_lst = ground_truth['video-id'].unique()\n if not max_avg_nr_proposals:\n max_avg_nr_proposals = float(\n proposals.shape[0]) / video_lst.shape[0]\n ratio = max_avg_nr_proposals * float(\n video_lst.shape[0]) / proposals.shape[0]\n # Adaptation to query faster\n ground_truth_gbvn = ground_truth.groupby('video-id')\n proposals_gbvn = proposals.groupby('video-id')\n # For each video, computes tiou scores among the retrieved proposals.\n score_lst = []\n total_nr_proposals = 0\n for videoid in video_lst:\n # Get ground-truth instances associated to this video.\n ground_truth_videoid = ground_truth_gbvn.get_group(videoid)\n this_video_ground_truth = ground_truth_videoid.loc[:, [\n 't-start', 't-end'\n ]].values\n # Get proposals for this video.\n try:\n proposals_videoid = proposals_gbvn.get_group(videoid)" + }, + { + "comment": "This code block is part of a function that handles exceptions when dealing with video proposals and ground truth. It appends a zero matrix to the score list if there are no video proposals or ground truth data for the current video. If there are proposals, it sorts them by score in descending order and expands dimensions as necessary before proceeding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":231-254", + "content": " except:\n n = this_video_ground_truth.shape[0]\n score_lst.append(np.zeros((n, 1)))\n continue\n this_video_proposals = proposals_videoid.loc[:,\n ['t-start', 't-end'\n ]].values\n if this_video_proposals.shape[0] == 0:\n n = this_video_ground_truth.shape[0]\n score_lst.append(np.zeros((n, 1)))\n continue\n # Sort proposals by score.\n sort_idx = proposals_videoid['score'].argsort()[::-1]\n this_video_proposals = this_video_proposals[sort_idx, :]\n if this_video_proposals.ndim != 2:\n this_video_proposals = np.expand_dims(this_video_proposals,\n axis=0)\n if this_video_ground_truth.ndim != 2:\n this_video_ground_truth = np.expand_dims(\n this_video_ground_truth, axis=0)" + }, + { + "comment": "This code calculates average recall for a set of video proposals. It sets the number of proposals based on a ratio, computes IOU scores, and stores the results in lists. The average recall is computed using a predetermined maximum number of proposals and the total number of proposals retrieved, considering the variable length of videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":256-277", + "content": " nr_proposals = np.minimum(\n int(this_video_proposals.shape[0] * ratio),\n this_video_proposals.shape[0])\n total_nr_proposals += nr_proposals\n this_video_proposals = this_video_proposals[:nr_proposals, :]\n # Compute tiou scores.\n tiou = self.wrapper_segment_iou(this_video_proposals,\n this_video_ground_truth)\n score_lst.append(tiou)\n # Given that the length of the videos is really varied, we\n # compute the number of proposals in terms of a ratio of the total\n # proposals retrieved, i.e. 
average recall at a percentage of proposals\n # retrieved per video.\n # Computes average recall.\n pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float(\n video_lst.shape[0]) / total_nr_proposals)\n matches = np.empty((video_lst.shape[0], pcn_lst.shape[0]))\n positives = np.empty(video_lst.shape[0])\n recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0]))" + }, + { + "comment": "Code iterates over different tiou thresholds and positive scores, computing the number of true positives based on threshold and percentage of proposals. It calculates matches per video and computes recall for each set of matches.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":278-297", + "content": " # Iterates over each tiou threshold.\n for ridx, tiou in enumerate(tiou_thresholds):\n # Inspect positives retrieved per video at different\n # number of proposals (percentage of the total retrieved).\n for i, score in enumerate(score_lst):\n # Total positives per video.\n positives[i] = score.shape[0]\n # Find proposals that satisfies minimum tiou threshold.\n true_positives_tiou = score >= tiou\n # Get number of proposals as a percentage of total retrieved.\n pcn_proposals = np.minimum(\n (score.shape[1] * pcn_lst).astype(int), score.shape[1])\n for j, nr_proposals in enumerate(pcn_proposals):\n # Compute the number of matches for each percentage of the proposals\n matches[i, j] = np.count_nonzero(\n (true_positives_tiou[:, :nr_proposals]).sum(axis=1))\n # Computes recall given the set of matches per video." + }, + { + "comment": "The function calculates recall and average recall for detected objects in videos, based on the number of true positives and total proposals. It also returns the average number of proposals per video. The second function retrieves a list of blocked videos from an API. The third function computes intersection over union between target and candidate segments efficiently.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":298-323", + "content": " recall[ridx, :] = matches.sum(axis=0) / positives.sum()\n # Recall is averaged.\n avg_recall = recall.mean(axis=0)\n # Get the average number of proposals per video.\n proposals_per_video = pcn_lst * (float(total_nr_proposals) /\n video_lst.shape[0])\n return recall, avg_recall, proposals_per_video\n def get_blocked_videos(self, api=API):\n api_url = '{}?action=get_blocked'.format(api)\n req = urllib2.Request(api_url)\n response = urllib2.urlopen(req)\n return json.loads(response.read())\n def wrapper_segment_iou(self, target_segments, candidate_segments):\n \"\"\"\n Compute intersection over union btw segments\n Parameters:\n target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]]\n candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]]\n Returns:\n tiou(nd-array): 2-dim array [n x m] with IOU ratio.\n Note: It assumes that candidate-segments are more scarce that target-segments" + }, + { + "comment": "This function calculates the temporal intersection over union (TIOU) between a target segment and multiple candidate segments. If the dimensions of arguments are not 2, it raises a ValueError. 
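The temporal IoU used throughout these metrics reduces to a few vectorised NumPy operations. The sketch below mirrors the `segment_iou` logic (maximum of starts, minimum of ends, clipped intersection over union) with made-up segments.

```python
# Vectorised temporal IoU between one target segment and N candidate segments,
# mirroring ANETproposal.segment_iou; the segments below are invented examples.
import numpy as np

def segment_iou(target, candidates):
    tt1 = np.maximum(target[0], candidates[:, 0])            # latest start
    tt2 = np.minimum(target[1], candidates[:, 1])            # earliest end
    intersection = (tt2 - tt1).clip(0)                        # non-negative overlap
    union = (candidates[:, 1] - candidates[:, 0]) + (target[1] - target[0]) - intersection
    return intersection.astype(float) / union

target = np.array([5.0, 10.0])
candidates = np.array([[4.0, 9.0], [10.0, 15.0], [0.0, 20.0]])
print(segment_iou(target, candidates))   # approx. [0.667, 0.0, 0.25]
```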
It loops through each candidate segment, compares their starting and ending times with the target segment's times using np.maximum, and stores the TIOU results in a 2D array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":324-348", + "content": " \"\"\"\n if candidate_segments.ndim != 2 or target_segments.ndim != 2:\n raise ValueError('Dimension of arguments is incorrect')\n n, m = candidate_segments.shape[0], target_segments.shape[0]\n tiou = np.empty((n, m))\n for i in range(m):\n tiou[:, i] = self.segment_iou(target_segments[i, :],\n candidate_segments)\n return tiou\n def segment_iou(self, target_segment, candidate_segments):\n \"\"\"\n Compute the temporal intersection over union between a\n target segment and all the test segments.\n Parameters:\n target_segment(1d-array): Temporal target segment containing [starting, ending] times.\n candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times.\n Returns:\n tiou(1d-array): Temporal intersection over union score of the N's candidate segments.\n \"\"\"\n tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])" + }, + { + "comment": "Computes intersection over union (IoU) of two segments by finding the minimum endpoints, calculating intersection and union, and dividing intersection by union.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py\":349-358", + "content": " tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])\n # Intersection including Non-negative overlap score.\n segments_intersection = (tt2 - tt1).clip(0)\n # Segment union.\n segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \\\n + (target_segment[1] - target_segment[0]) - segments_intersection\n # Compute overlap as the ratio of the intersection\n # over union of two segments.\n tIoU = segments_intersection.astype(float) / segments_union\n return tIoU" + } + ] +} \ No newline at end of file diff --git a/docs/doc/43a2a3f1-ec47-40cf-aae2-e3c51154519a.json b/docs/doc/43a2a3f1-ec47-40cf-aae2-e3c51154519a.json new file mode 100644 index 000000000..f71aff03a --- /dev/null +++ b/docs/doc/43a2a3f1-ec47-40cf-aae2-e3c51154519a.json @@ -0,0 +1,110 @@ +{ + "summary": "This code optimizes PaddleVideo model performance by configuring environment variables for efficient training or export tasks, evaluates models, saves trained models, logs, and runs evaluation scripts.", + "details": [ + { + "comment": "The code reads a file containing training parameters and parses the values using different functions. These parameters include the model name, Python version, GPU list, auto-cast settings, epoch number, batch size, pre-trained model, and training model name. 
The parsed values will be used for further processing in the script.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":0-29", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\n# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer']\nMODE=$2\ndataline=$(cat ${FILENAME})\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# The training params\nmodel_name=$(func_parser_value \"${lines[1]}\")\npython=$(func_parser_value \"${lines[2]}\")\ngpu_list=$(func_parser_value \"${lines[3]}\")\ntrain_use_gpu_key=$(func_parser_key \"${lines[4]}\")\ntrain_use_gpu_value=$(func_parser_value \"${lines[4]}\")\nautocast_list=$(func_parser_value \"${lines[5]}\")\nautocast_key=$(func_parser_key \"${lines[5]}\")\nepoch_key=$(func_parser_key \"${lines[6]}\")\nepoch_num=$(func_parser_value \"${lines[6]}\")\nsave_model_key=$(func_parser_key \"${lines[7]}\")\ntrain_batch_key=$(func_parser_key \"${lines[8]}\")\ntrain_batch_value=$(func_parser_value \"${lines[8]}\")\npretrain_model_key=$(func_parser_key \"${lines[9]}\")\npretrain_model_value=$(func_parser_value \"${lines[9]}\")\ntrain_model_name=$(func_parser_value \"${lines[10]}\")\ntrain_param_key1=$(func_parser_key \"${lines[12]}\")" + }, + { + "comment": "This code is parsing key-value pairs from different lines and assigning them to specific variables. The variables are used for trainer, pact, fpgm, distill, amp, evaluator, save_infer, and export weight configurations. This information will likely be utilized in subsequent parts of the script or program.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":30-55", + "content": "train_param_value1=$(func_parser_value \"${lines[12]}\")\ntrain_param_key2=$(func_parser_key \"${lines[11]}\")\ntrain_param_value2=$(func_parser_value \"${lines[11]}\")\ntrainer_list=$(func_parser_value \"${lines[14]}\")\ntrainer_norm=$(func_parser_key \"${lines[15]}\")\nnorm_trainer=$(func_parser_value \"${lines[15]}\")\npact_key=$(func_parser_key \"${lines[16]}\")\npact_trainer=$(func_parser_value \"${lines[16]}\")\nfpgm_key=$(func_parser_key \"${lines[17]}\")\nfpgm_trainer=$(func_parser_value \"${lines[17]}\")\ndistill_key=$(func_parser_key \"${lines[18]}\")\ndistill_trainer=$(func_parser_value \"${lines[18]}\")\namp_key=$(func_parser_key \"${lines[19]}\")\namp_trainer=$(func_parser_value \"${lines[19]}\")\ntrainer_key2=$(func_parser_key \"${lines[20]}\")\ntrainer_value2=$(func_parser_value \"${lines[20]}\")\neval_py=$(func_parser_value \"${lines[23]}\")\neval_key1=$(func_parser_key \"${lines[24]}\")\neval_value1=$(func_parser_value \"${lines[24]}\")\nsave_infer_key=$(func_parser_key \"${lines[27]}\")\nsave_infer_value=$(func_parser_value \"${lines[27]}\")\nexport_weight=$(func_parser_key \"${lines[28]}\")" + }, + { + "comment": "The code parses various configuration values and keys from the lines of a script. It extracts normalization, quantization, and distillation settings; inference directory path; model directories for inference; whether to use GPU, MKLDNN, specify CPU threads, and batch size. 
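These `func_parser_key` / `func_parser_value` calls assume one `key:value` pair per config line. The helper below is a hypothetical Python rendering of that convention (the real helpers live in `test_tipc/common_func.sh`), shown only to make the line indexing above easier to follow; the sample lines are illustrative.

```python
# Hypothetical Python version of the key:value line parsing used throughout this script.
# Assumes each config line looks like "model_name:PP-TSM" or "gpu_list:0|0,1".

def parser_key(line: str) -> str:
    return line.split(":", 1)[0].strip()

def parser_value(line: str) -> str:
    return line.split(":", 1)[1].strip() if ":" in line else ""

lines = ["model_name:PP-TSM", "gpu_list:0|0,1", "epoch_num:lite_train_lite_infer=2"]
params = {parser_key(l): parser_value(l) for l in lines}
print(params["gpu_list"])   # 0|0,1
```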
The variables are assigned with these parsed values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":56-78", + "content": "norm_export=$(func_parser_value \"${lines[29]}\")\npact_export=$(func_parser_value \"${lines[30]}\")\nfpgm_export=$(func_parser_value \"${lines[31]}\")\ndistill_export=$(func_parser_value \"${lines[32]}\")\nexport_key1=$(func_parser_key \"${lines[33]}\")\nexport_value1=$(func_parser_value \"${lines[33]}\")\nexport_key2=$(func_parser_key \"${lines[34]}\")\nexport_value2=$(func_parser_value \"${lines[34]}\")\ninference_dir=$(func_parser_value \"${lines[35]}\")\n# parser inference model\ninfer_model_dir_list=$(func_parser_value \"${lines[36]}\")\ninfer_export_list=$(func_parser_value \"${lines[37]}\")\ninfer_is_quant=$(func_parser_value \"${lines[38]}\")\n# parser inference\ninference_py=$(func_parser_value \"${lines[39]}\")\nuse_gpu_key=$(func_parser_key \"${lines[40]}\")\nuse_gpu_list=$(func_parser_value \"${lines[40]}\")\nuse_mkldnn_key=$(func_parser_key \"${lines[41]}\")\nuse_mkldnn_list=$(func_parser_value \"${lines[41]}\")\ncpu_threads_key=$(func_parser_key \"${lines[42]}\")\ncpu_threads_list=$(func_parser_value \"${lines[42]}\")\nbatch_size_key=$(func_parser_key \"${lines[43]}\")" + }, + { + "comment": "This code is parsing the function parameters from a configuration file. The batch size, use_trt, precision, infer model value, video directory path, log saving flag, and benchmark are being assigned to respective variables. A specific line number is obtained using grep command for a keyword \"to_static_train_benchmark_params\". Then it checks if the mode is set to \"klquant_whole_infer\" and processes the first and 17th lines of the configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":79-103", + "content": "batch_size_list=$(func_parser_value \"${lines[43]}\")\nuse_trt_key=$(func_parser_key \"${lines[44]}\")\nuse_trt_list=$(func_parser_value \"${lines[44]}\")\nprecision_key=$(func_parser_key \"${lines[45]}\")\nprecision_list=$(func_parser_value \"${lines[45]}\")\ninfer_model_key=$(func_parser_key \"${lines[46]}\")\ninfer_model_value=$(func_parser_value \"${lines[46]}\")\nvideo_dir_key=$(func_parser_key \"${lines[47]}\")\ninfer_video_dir=$(func_parser_value \"${lines[47]}\")\nsave_log_key=$(func_parser_key \"${lines[48]}\")\nbenchmark_key=$(func_parser_key \"${lines[49]}\")\nbenchmark_value=$(func_parser_value \"${lines[49]}\")\ninfer_key1=$(func_parser_key \"${lines[50]}\")\ninfer_value1=$(func_parser_value \"${lines[50]}\")\nline_num=`grep -n -w \"to_static_train_benchmark_params\" $FILENAME | cut -d \":\" -f 1`\nto_static_key=$(func_parser_key \"${lines[line_num]}\")\nto_static_trainer=$(func_parser_value \"${lines[line_num]}\")\n# parser klquant_infer\nif [ ${MODE} = \"klquant_whole_infer\" ]; then\n dataline=$(awk 'NR==1 NR==17{print}' $FILENAME)\n lines=(${dataline})" + }, + { + "comment": "The code is parsing the configuration file to extract specific values for different variables like model name, python version, inference model directory list, and more. 
These values are used later in the script to execute specific commands related to the test and train operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":104-124", + "content": " model_name=$(func_parser_value \"${lines[1]}\")\n python=$(func_parser_value \"${lines[2]}\")\n # parser inference model\n infer_model_dir_list=$(func_parser_value \"${lines[3]}\")\n infer_export_list=$(func_parser_value \"${lines[4]}\")\n infer_is_quant=$(func_parser_value \"${lines[5]}\")\n # parser inference\n inference_py=$(func_parser_value \"${lines[6]}\")\n use_gpu_key=$(func_parser_key \"${lines[7]}\")\n use_gpu_list=$(func_parser_value \"${lines[7]}\")\n use_mkldnn_key=$(func_parser_key \"${lines[8]}\")\n use_mkldnn_list=$(func_parser_value \"${lines[8]}\")\n cpu_threads_key=$(func_parser_key \"${lines[9]}\")\n cpu_threads_list=$(func_parser_value \"${lines[9]}\")\n batch_size_key=$(func_parser_key \"${lines[10]}\")\n batch_size_list=$(func_parser_value \"${lines[10]}\")\n use_trt_key=$(func_parser_key \"${lines[11]}\")\n use_trt_list=$(func_parser_value \"${lines[11]}\")\n precision_key=$(func_parser_key \"${lines[12]}\")\n precision_list=$(func_parser_value \"${lines[12]}\")\n infer_model_key=$(func_parser_key \"${lines[13]}\")" + }, + { + "comment": "This code sets variables for video directory, log path, inference functions, and other parameters. It then loops through various conditions to perform inferences with different combinations of GPU and MKLDNN usage, while keeping track of the results in the specified log file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":125-156", + "content": " video_dir_key=$(func_parser_key \"${lines[14]}\")\n infer_video_dir=$(func_parser_value \"${lines[14]}\")\n save_log_key=$(func_parser_key \"${lines[15]}\")\n benchmark_key=$(func_parser_key \"${lines[16]}\")\n benchmark_value=$(func_parser_value \"${lines[16]}\")\n infer_key1=$(func_parser_key \"${lines[17]}\")\n infer_value1=$(func_parser_value \"${lines[17]}\")\nfi\nLOG_PATH=\"./test_tipc/output/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_python.log\"\nfunction func_inference(){\n IFS='|'\n _python=$1\n _script=$2\n _model_dir=$3\n _log_path=$4\n _video_dir=$5\n _flag_quant=$6\n _gpu=$7\n # inference\n for use_gpu in ${use_gpu_list[*]}; do\n if [ ${use_gpu} = \"False\" ] || [ ${use_gpu} = \"cpu\" ]; then\n for use_mkldnn in ${use_mkldnn_list[*]}; do\n if [[ ${use_mkldnn} = \"False\" ]] && [[ ${_flag_quant} = \"True\" ]]; then\n continue\n fi\n for threads in ${cpu_threads_list[*]}; do\n for batch_size in ${batch_size_list[*]}; do" + }, + { + "comment": "This code is iterating over a list of precision values, checking conditions to decide whether to continue or skip the current iteration. 
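To make the skip rules concrete, here is a small, hypothetical Python sketch of the same filtering logic (variable names mirror the shell script, but this function is not part of the repository):

```python
# Hypothetical sketch of the CPU-side skip rules described above.
def should_run(use_mkldnn: bool, precision: str, flag_quant: bool) -> bool:
    if not use_mkldnn and precision == "fp16":
        return False  # fp16 is only exercised together with MKLDNN
    if flag_quant and precision != "int8":
        return False  # quantized models are only tested with int8 precision
    return True

combos = [(m, p, q) for m in (True, False)
          for p in ("fp32", "fp16", "int8")
          for q in (True, False)]
print([c for c in combos if should_run(*c)])
```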
It sets the appropriate log path, creates directories if needed, and calls functions to set parameters for inference data and benchmarking.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":157-169", + "content": " for precision in ${precision_list[*]}; do\n if [[ ${use_mkldnn} = \"False\" ]] && [[ ${precision} = \"fp16\" ]]; then\n continue\n fi # skip when enable fp16 but disable mkldnn\n if [[ ${_flag_quant} = \"True\" ]] && [[ ${precision} != \"int8\" ]]; then\n continue\n fi # skip when quant model inference but precision is not int8\n set_precision=$(func_set_params \"${precision_key}\" \"${precision}\")\n _save_log_path=\"${_log_path}/python_infer_cpu_gpus_${_gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log\"\n mkdir -p ${_log_path}\n set_infer_data=$(func_set_params \"${video_dir_key}\" \"${infer_video_dir}\")\n set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")" + }, + { + "comment": "The code is setting variables, constructing a command using environment variables, and executing it. It then checks the status of the execution and logs the output for later inspection. This appears to be part of a loop that's running multiple tests or experiments with varying parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":170-180", + "content": " set_batchsize=$(func_set_params \"${batch_size_key}\" \"${batch_size}\")\n set_cpu_threads=$(func_set_params \"${cpu_threads_key}\" \"${threads}\")\n set_model_dir=$(func_set_params \"${infer_model_key}\" \"${_model_dir}/${infer_model_value}\")\n set_infer_params1=$(func_set_params \"${infer_key1}\" \"${_model_dir}/${infer_value1}\")\n command=\"${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\" \"${_save_log_path}\"\n done\n done" + }, + { + "comment": "This code snippet is checking various conditions for model inference using different parameters like GPU usage, precision, and batch size. It iterates through a list of options to set up the necessary configurations for logging and execution. 
The purpose seems to be running inference tests with varying settings to optimize performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":181-197", + "content": " done\n done\n elif [ ${use_gpu} = \"True\" ] || [ ${use_gpu} = \"gpu\" ]; then\n for use_trt in ${use_trt_list[*]}; do\n for precision in ${precision_list[*]}; do\n if [[ ${_flag_quant} = \"False\" ]] && [[ ${precision} =~ \"int8\" ]]; then\n continue\n fi\n if [[ ${precision} =~ \"fp16\" || ${precision} =~ \"int8\" ]] && [[ ${use_trt} = \"False\" ]]; then\n continue\n fi\n if [[ ${use_trt} = \"False\" || ${precision} =~ \"int8\" ]] && [[ ${_flag_quant} = \"True\" ]]; then\n continue\n fi\n for batch_size in ${batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/python_infer_gpu_gpus_${_gpu}_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${video_dir_key}\" \"${infer_video_dir}\")" + }, + { + "comment": "This code sets parameters for benchmark, batch size, tensorrt usage, precision, model directory, and infer params1. It then executes a command with these parameters to run inference, saves the log, and checks the status of the execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":199-211", + "content": " set_benchmark=$(func_set_params \"${benchmark_key}\" \"${benchmark_value}\")\n set_batchsize=$(func_set_params \"${batch_size_key}\" \"${batch_size}\")\n set_tensorrt=$(func_set_params \"${use_trt_key}\" \"${use_trt}\")\n set_precision=$(func_set_params \"${precision_key}\" \"${precision}\")\n set_model_dir=$(func_set_params \"${infer_model_key}\" \"${_model_dir}/${infer_model_value}\")\n set_infer_params1=$(func_set_params \"${infer_key1}\" \"${_model_dir}/${infer_value1}\")\n command=\"${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\" \"${_save_log_path}\"" + }, + { + "comment": "This code is part of a script that tests and runs inference models using PaddleVideo. It checks the hardware being used (CPU or GPU) and sets appropriate environment variables accordingly. 
It then iterates through each inference model, running them with specific exported weights and saving the output logs for further analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":213-242", + "content": " done\n done\n done\n else\n echo \"Does not support hardware other than CPU and GPU Currently!\"\n fi\n done\n}\nif [ ${MODE} = \"whole_infer\" ] || [ ${MODE} = \"klquant_whole_infer\" ]; then\n GPUID=$3\n if [ ${#GPUID} -le 0 ];then\n env=\" \"\n else\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\n fi\n set CUDA_VISIBLE_DEVICES\n eval $env\n export Count=0\n IFS=\"|\"\n infer_run_exports=(${infer_export_list})\n infer_quant_flag=(${infer_is_quant})\n for infer_model in ${infer_model_dir_list[*]}; do\n # run export\n if [ ${infer_run_exports[Count]} != \"null\" ];then\n save_infer_dir=$(dirname $infer_model)\n set_export_weight=$(func_set_params \"${export_weight}\" \"${infer_model}\")\n set_save_infer_key=$(func_set_params \"${save_infer_key}\" \"${save_infer_dir}\")\n export_log_path=\"${LOG_PATH}_export_${Count}.log\"\n export_cmd=\"${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 \"" + }, + { + "comment": "This code is iterating through a list of GPUs, setting the visible CUDA devices accordingly and running inference for each GPU. It also checks if exporting is needed, saves the infer directory, runs quantized inference if it's a klquant_infer mode, and keeps track of the count to ensure all GPUs are considered. If a GPU list item is -1, it uses no GPU, otherwise, it sets the environment for that specific GPU. The code block also checks whether the GPU count is less than or equal to 15 to avoid potential issues with larger lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":243-273", + "content": " echo ${infer_run_exports[Count]}\n eval $export_cmd\n echo $export_cmd\n status_export=$?\n status_check $status_export \"${export_cmd}\" \"${status_log}\" \"${model_name}\" \"${export_log_path}\"\n else\n save_infer_dir=${infer_model}\n fi\n #run inference\n is_quant=${infer_quant_flag[Count]}\n if [ ${MODE} = \"klquant_infer\" ]; then\n is_quant=\"True\"\n fi\n func_inference \"${python}\" \"${inference_py}\" \"${save_infer_dir}\" \"${LOG_PATH}\" \"${infer_video_dir}\" ${is_quant} \"${gpu}\"\n Count=$(($Count + 1))\n done\nelse\n IFS=\"|\"\n export Count=0\n USE_GPU_KEY=(${train_use_gpu_value})\n for gpu in ${gpu_list[*]}; do\n train_use_gpu=${USE_GPU_KEY[Count]}\n Count=$(($Count + 1))\n ips=\"\"\n if [ ${gpu} = \"-1\" ];then\n env=\"\"\n elif [ ${#gpu} -le 1 ];then\n env=\"export CUDA_VISIBLE_DEVICES=${gpu}\"\n eval ${env}\n elif [ ${#gpu} -le 15 ];then" + }, + { + "comment": "This code is setting up environment variables for parallel GPU usage, iterating through different autocast and trainer configurations to execute training or export tasks based on the provided key values. 
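Reading the snippet above, the branching is driven by the character length of each `gpu_list` entry rather than by a GPU count; a hedged Python sketch of that dispatch (illustrative only, not code from the repository) would be:

```python
# Illustrative sketch of how a gpu_list entry appears to be classified
# by its string length in the shell script above.
def classify(gpu: str) -> str:
    if gpu == "-1":
        return "cpu"                      # no CUDA device exported
    if len(gpu) <= 1:
        return "single gpu"               # e.g. "0"
    if len(gpu) <= 15:
        return "multi-gpu, one machine"   # e.g. "0,1,2,3"
    ips, devices = gpu.split(";", 1)      # e.g. "10.0.0.1,10.0.0.2;0,1"
    return f"multi-machine on {ips} with devices {devices}"

for entry in ["-1", "0", "0,1,2,3", "10.0.0.1,10.0.0.2;0,1"]:
    print(entry, "->", classify(entry))
```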
The flag_quant variable tracks whether quantization is required for a particular configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":274-302", + "content": " IFS=\",\"\n array=(${gpu})\n env=\"export CUDA_VISIBLE_DEVICES=${array[0]}\"\n IFS=\"|\"\n else\n IFS=\";\"\n array=(${gpu})\n ips=${array[0]}\n gpu=${array[1]}\n IFS=\"|\"\n env=\" \"\n fi\n for autocast in ${autocast_list[*]}; do\n if [ ${autocast} = \"fp16\" ]; then\n set_amp_config=\"--amp --amp_level 'O2'\"\n else\n set_amp_config=\" \"\n fi\n for trainer in ${trainer_list[*]}; do\n flag_quant=False\n if [ ${trainer} = ${pact_key} ]; then\n run_train=${pact_trainer}\n run_export=${pact_export}\n flag_quant=True\n elif [ ${trainer} = \"${fpgm_key}\" ]; then\n run_train=${fpgm_trainer}\n run_export=${fpgm_export}\n elif [ ${trainer} = \"${distill_key}\" ]; then\n run_train=${distill_trainer}" + }, + { + "comment": "This code uses conditional statements to assign values to `run_train` and `run_export` variables based on the value of `trainer`. It handles multiple scenarios, including cases with specific keys, and triggers \"to_static\" logic in 'train.py' when needed. If `run_train` is assigned as null, it continues without executing further.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":303-324", + "content": " run_export=${distill_export}\n elif [ ${trainer} = ${amp_key} ]; then\n run_train=${amp_trainer}\n run_export=${norm_export}\n elif [[ ${trainer} = ${trainer_key2} ]]; then\n run_train=${trainer_value2}\n run_export=${export_value2}\n # In case of @to_static, we re-used norm_traier,\n # but append \"-o to_static=True\" for config\n # to trigger \"to_static\" logic in 'train.py'\n elif [ ${trainer} = \"${to_static_key}\" ]; then\n run_train=\"${norm_trainer} ${to_static_trainer}\"\n run_export=${norm_export}\n else\n run_train=${norm_trainer}\n run_export=${norm_export}\n fi\n if [ ${run_train} = \"null\" ]; then\n continue\n fi\n if [[ ${MODE} != \"benchmark_train\" ]] && [[ ! ${MODE} =~ \"whole_train\" ]]; then" + }, + { + "comment": "This code is setting up parameters for model training and inference. It appends --max_iters=30 and --log_interval=1 to the run_train string for better data output, sets autocast, epoch, pretrain values, and checks if MODE includes \"whole_train\" to set certain variables to empty strings or nulls. 
The code also uses func_set_params to set batch size and train parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":325-346", + "content": " # \u8bad\u7ec3\u53c2\u6570\u672b\u5c3e\u52a0\u4e0a--max_iters=30\u548c--log_interval=1\uff0c\u4ee5\u4fbf\u8fd0\u884c\u5e76\u8f93\u51fa\u8db3\u91cf\u6570\u636e\n run_train=${run_train}\" --max_iters=30\"\n fi\n set_autocast=$(func_set_params \"${autocast_key}\" \"${autocast}\")\n set_epoch=$(func_set_params \"${epoch_key}\" \"${epoch_num}\")\n if [[ $MODE =~ \"whole_train\" ]]; then\n set_epoch=\"\"\n fi\n set_pretrain=$(func_set_params \"${pretrain_model_key}\" \"${pretrain_model_value}\")\n if [[ $MODE =~ \"whole_train\" ]]; then\n train_batch_key=\"\"\n train_batch_value=\"\"\n fi\n set_batchsize=$(func_set_params \"${train_batch_key}\" \"${train_batch_value}\")\n if [[ $MODE =~ \"whole_train\" ]]; then\n train_param_key1=\"\"\n train_param_value1=\"\"\n fi\n set_train_params1=$(func_set_params \"${train_param_key1}\" \"${train_param_value1}\")\n if [[ $MODE =~ \"whole_train\" ]]; then" + }, + { + "comment": "This code sets up parameters for training and inference of a PaddleVideo model. It determines whether the training is on a single machine or multiple machines based on the number of IPs provided. Depending on this, it sets the number of nodes, logs information accordingly, and loads pre-training from normal training if the current trainer is PACT or FPGM.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":347-366", + "content": " train_param_key2=\"\"\n train_param_value2=\"\"\n fi\n set_train_params2=$(func_set_params \"${train_param_key2}\" \"${train_param_value2}\")\n set_use_gpu=$(func_set_params \"${train_use_gpu_key}\" \"${train_use_gpu}\")\n if [ ${#ips} -le 15 ];then\n # len(ips)<=15, single machine\n nodes=1\n save_log=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}\"\n else\n # if length of ips > 15, then it is seen as multi-machine\n # 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0\n IFS=\",\"\n ips_array=(${ips})\n IFS=\"|\"\n nodes=${#ips_array[@]}\n save_log=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}\"\n fi\n # load pretrain from norm training if current trainer is pact or fpgm trainer" + }, + { + "comment": "This code checks if the trainer is either 'pact_key' or 'fpgm_key', and if the number of nodes is less than or equal to 1. If true, it sets the 'set_pretrain' variable to 'load_norm_train_model'. The code then determines the appropriate command based on whether the number of GPUs is 2 or less (train with CPU or single GPU), up to 15 (train with multi-GPU), or more than 15 (train with multiple machines). 
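Each `set_*` fragment that goes into the final command is produced by `func_set_params`, which presumably emits `key=value` or nothing when either side is `null`; a hypothetical Python restatement (names and flags invented for illustration) is:

```python
# Hypothetical equivalent of func_set_params: emit "key=value" or nothing.
def set_params(key: str, value: str) -> str:
    if not key or key == "null" or not value or value == "null":
        return ""
    return f"{key}={value}"

parts = [
    set_params("-o epochs", "2"),
    set_params("-o batch_size", "null"),  # dropped, e.g. for whole_train modes
    set_params("-o use_gpu", "True"),
]
cmd = "python3.7 main.py " + " ".join(p for p in parts if p)
print(cmd)
```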
The command uses PaddlePaddle's distributed training capabilities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":367-377", + "content": " if ([ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]) && [ ${nodes} -le 1 ]; then\n set_pretrain=\"${load_norm_train_model}\"\n fi\n set_save_model=$(func_set_params \"${save_model_key}\" \"${save_log}\")\n if [ ${#gpu} -le 2 ];then # train with cpu or single gpu\n cmd=\"${python} ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1\"\n elif [ ${#ips} -le 15 ];then # train with multi-gpu\n cmd=\"${python} -B -m paddle.distributed.launch --devices=\\\"${gpu}\\\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1\"\n else # train with multi-machine\n cmd=\"${python} -B -m paddle.distr" + }, + { + "comment": "This code snippet is running a training script for a PaddleVideo model. It sets parameters, evaluates pre-trained models, and saves the trained models. The script also displays logs for benchmarking and checks the status of the operation. If there's a single node and trainer, it loads a norm-train model for further usage. Finally, it runs an evaluation script if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":377-394", + "content": "ibuted.launch --ips=${ips} --devices=\\\"${gpu}\\\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1\"\n fi\n # run train\n eval $cmd\n # display log for benchmark train\n eval \"cat ${LOG_PATH}/train.log\"\n eval \"cat ${LOG_PATH}/train.log >> ${save_log}.log\"\n status_check $? \"${cmd}\" \"${status_log}\" \"${model_name}\" \"${save_log}.log\"\n # set_eval_pretrain=$(func_set_params \"${pretrain_model_key}\" \"${save_log}/${train_model_name}\")\n # save norm trained models to set pretrain for pact training and fpgm training\n if [ [${trainer} = ${trainer_norm}] ] && [ [${nodes} -le 1] ]; then\n load_norm_train_model=${set_eval_pretrain}\n fi\n # run eval\n if [ ${eval_py} != \"null\" ]; then\n real_model_name=${model_name/PP-/pp}" + }, + { + "comment": "The code sets the evaluation parameters and prepares a command to evaluate a model using specified inputs. If the MODE includes \"lite_infer\" and train_param_key1 is not null, it appends additional parameters to the command. Finally, it runs the command and checks the status of the evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":395-409", + "content": " set_eval_params1=$(func_set_params \"${eval_key1}\" \"${save_log}/${real_model_name}_epoch_00001.pdparams\")\n eval_log_path=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log\"\n if [[ $MODE =~ \"lite_infer\" ]] && [[ ${train_param_key1} != \"null\" ]]; then\n eval_cmd=\"${python} ${eval_py} ${set_use_gpu} ${set_eval_params1} ${train_param_key1}=${train_param_value1} > ${eval_log_path} 2>&1 \"\n else\n eval_cmd=\"${python} ${eval_py} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1 \"\n fi\n eval $eval_cmd\n status_check $? 
\"${eval_cmd}\" \"${status_log}\" \"${model_name}\" \"${eval_log_path}\"\n fi\n # run export model\n if [ ${run_export} != \"null\" ]; then\n save_infer_path=\"${save_log}\"\n real_model_name=${model_name/PP-/pp}\n " + }, + { + "comment": "This code is setting up variables for exporting weights, saving inference key, and defining the export command. It then executes the export command and checks its status before running inference. If an inference directory is provided, it sets the inference model directory accordingly. Finally, it calls a function for inference processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":409-425", + "content": " set_export_weight=$(func_set_params \"${export_weight}\" \"${save_log}/${real_model_name}_epoch_00001.pdparams\")\n set_save_infer_key=$(func_set_params \"${save_infer_key}\" \"${save_log}\")\n export_log_path=\"${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log\"\n export_cmd=\"${python} ${run_export} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 \"\n eval $export_cmd\n status_check $? \"${export_cmd}\" \"${status_log}\" \"${model_name}\" \"${export_log_path}\"\n #run inference\n eval $env\n save_infer_path=\"${save_log}\"\n if [ ${inference_dir} != \"null\" ] && [ ${inference_dir} != '##' ]; then\n infer_model_dir=${save_infer_path}\n else\n infer_model_dir=${save_infer_path}\n fi\n func_inference " + }, + { + "comment": "This code snippet is a bash script that iterates through trainers, autocast options, and GPUs. It sets CUDA_VISIBLE_DEVICES to empty if the current mode is inference. The purpose is likely to set up an environment for training or inference based on different configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python.sh\":425-432", + "content": "\"${python}\" \"${inference_py}\" \"${infer_model_dir}\" \"${LOG_PATH}\" \"${infer_video_dir}\" \"${flag_quant}\" \"${gpu}\"\n eval \"unset CUDA_VISIBLE_DEVICES\"\n fi\n done # done with: for trainer in ${trainer_list[*]}; do\n done # done with: for autocast in ${autocast_list[*]}; do\n done # done with: for gpu in ${gpu_list[*]}; do\nfi # end if [ ${MODE} = \"infer\" ]; then" + } + ] +} \ No newline at end of file diff --git a/docs/doc/43c05170-0416-4c9f-b927-084bc4f984f2.json b/docs/doc/43c05170-0416-4c9f-b927-084bc4f984f2.json new file mode 100644 index 000000000..4cfefe453 --- /dev/null +++ b/docs/doc/43c05170-0416-4c9f-b927-084bc4f984f2.json @@ -0,0 +1,30 @@ +{ + "summary": "The code documents a SlowFast_FasterRCNN model for action detection tasks, providing installation and processing instructions. It trains and tests the model with PaddleDetection and exports it for inference, using GPU acceleration and disabling TensorRT optimization.", + "details": [ + { + "comment": "This code provides documentation for the SlowFast_FasterRCNN model, a high-precision video model used for action detection tasks. It takes human detection results and video frames as input, uses the SlowFast model to extract spatiotemporal features, and employs FasterRCNN's head to obtain the actions and positions of humans in the frame. 
Users need to install additional dependencies before getting started.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md\":0-23", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md) | English\n# SlowFast_FasterRCNN\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install moviepy\npython -m pip install et_xmlfile\npython -m pip install paddledet\n```\n## Introduction\nThe [SlowFast](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md) model is one of the high-precision models in the video field. For action detection task, it is also neccessary to detect the person in current frame. Therefore, the SlowFast_FasterRCNN model takes human detection results and video frames as input, extracts spatiotemporal features through the SlowFast model, and then uses FasterRCNN's head gets the actions and positions of humans in the frame.\nThe corresponding AI Studio Notebook Link\uff1a[\u57fa\u4e8eSlowFast+FasterRCNN\u7684\u52a8\u4f5c\u8bc6\u522b](https://aistudio.baidu.com/aistudio/projectdetail/3267637?contributionType=1)" + }, + { + "comment": "The provided code is a set of instructions for downloading videos, annotations, and proposals as well as cutting videos and extracting frames from the AVA dataset. This dataset contains 430 videos annotated in 1 second intervals for action detection using SlowFast Networks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md\":25-63", + "content": "For details, please refer to the paper [SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf).\n## Data\nWe use [AVA dataset](https://research.google.com/ava/download.html) for action detection. The AVA v2.2 dataset contains 430 videos split into 235 for training, 64 for validation, and 131 for test. Each video has 15 minutes annotated in 1 second intervals.\n### 1 Dowload Videos\n```\nbash download_videos.sh\n```\n### 2 Download Annotations\n```\nbash download_annotations.sh\n```\n### 3 Download Proposals\n```\nbash fetch_ava_proposals.sh\n```\n### 4 Cut Videos\n```\nbash cut_videos.sh\n```\n### 5 Extract Frames\n```\nbash extract_rgb_frames.sh\n```\nFor AVA v2.1, there is a simple introduction to some key files\uff1a\n* 'ava_videos_15min_frames' dir stores video frames extracted with FPS as the frame rate\uff1b\n* 'ava_train_v2.1.csv' file stores the trainning annotations\uff1b\n* 'ava_train_excluded_timestamps_v2.1.csv' file stores excluded timestamps\uff1b\n* 'ava_dense_proposals_train.FAIR.recall_93.9.pkl' file stores humans' bboxes and scores of key frames\uff1b" + }, + { + "comment": "This code describes the training and testing procedures for a SlowFast model using Faster RCNN on AVA dataset. The training process requires a config file, pre-trained model weights, and evaluates the model during training with the --validate flag. 
Testing is done based on the best model provided with specifications like architecture, depth, pretrain model, frame length, sample rate, MAP, AVA version, and a link to the trained model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md\":64-89", + "content": "* 'ava_action_list_v2.1_for_activitynet_2018.pbtxt' file stores\u4e3a action list.\n## Train\n* `-c`: config file path;\n* `-w`: weights of model. The pretrained model can be downloaded from the table below;\n* `--validate`: evaluate model during training.\n```\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=logdir.ava main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava.yaml\n```\n## Test\nTest model based on the best model:\n```\npython main.py --test \\\n -w output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams \\\n -c configs/detection/ava/ava.yaml\n```\n| architecture | depth | Pretrain Model | frame length x sample rate | MAP | AVA version | model |\n| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |------------- |\n| SlowFast | R50 | [Kinetics 400](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | 8 x 8 | 23.2 | 2.1 | [`link`](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SlowFastRCNN_AVA.pdparams) |" + }, + { + "comment": "In this code, the inference process is outlined for an action detection project using SlowFast+FasterRCNN model. It requires installing PaddleDetection and downloading a detection model from provided URL. The \"export_model\" script prepares the model for inference, while the \"predict.py\" script performs inference based on the exported model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md\":92-125", + "content": "## Inference\nThe action detection of this project is divided into two stages. In the first stage, humans' proposals are obtained, and then input into the SlowFast+FasterRCNN model for action recognition.\nFor human detection\uff0cyou can use the trained model in [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection).\nInstall PaddleDetection:\n```\ncd PaddleDetection/\npip install -r requirements.txt\n!python setup.py install\n```\nDownload detection model:\n```\n# faster_rcnn_r50_fpn_1x_coco as an example\nwget https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams\n```\nexport model:\n```\npython tools/export_model.py \\\n -c configs/detection/ava/ava.yaml \\\n -o inference_output \\\n -p output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams\n```\ninference based on the exported model:\n```\npython tools/predict.py \\\n -c configs/detection/ava/ava.yaml \\\n --input_file \"data/-IELREHXDEMO.mp4\" \\\n --model_file \"inference_output/AVA_SlowFast_FastRcnn.pdmodel\" \\\n --params_file \"inference_output/AVA_SlowFast_FastRcnn.pdiparams\" \\" + }, + { + "comment": "The code sets `use_gpu` to True and `use_tensorrt` to False. 
This means the model will use GPU acceleration and not utilize TensorRT for optimizing performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md\":126-128", + "content": " --use_gpu=True \\\n --use_tensorrt=False\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4491a653-9505-4cce-9c8b-ca3be2d18b07.json b/docs/doc/4491a653-9505-4cce-9c8b-ca3be2d18b07.json new file mode 100644 index 000000000..d825e1d95 --- /dev/null +++ b/docs/doc/4491a653-9505-4cce-9c8b-ca3be2d18b07.json @@ -0,0 +1,35 @@ +{ + "summary": "This code employs machine learning and deep learning for Baidu Cloud's action detection system, including preprocessing, feature extraction, model application, and time-tracked execution. It configures PPTSM model, predicts features from data, creates a video feature dictionary, loads pre-existing features, checks shapes, and writes results to JSON file.", + "details": [ + { + "comment": "This code is for the Baidu Cloud action detection system, which uses machine learning and deep learning models to classify actions from both audio and image inputs. It includes utilities for preprocessing data, extracting features, and applying various models including image, audio, and propensity models. The code also has a logger module to log processing time information. A class ActionDetection is defined which likely handles the overall action detection process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/action.py\":0-43", + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport functools\nimport numpy as np\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport mfcc.feature_extractor as mfcc_extractor\nimport models.pptsm_infer as image_model\nimport models.audio_infer as audio_model\nimport models.bmn_infer as prop_model\nimport models.lstm_infer as classify_model\nimport logger\nlogger = logger.Logger()\ndef record_time_info(func):\n \"\"\"decorator func to log cost time for func\n \"\"\"\n @functools.wraps(func)\n def timer(*args):\n \"\"\"log cost time for func\n \"\"\"\n logger.info(\"function [{}] processing ...\".format(func.__name__))\n start_time = time.time()\n retval = func(*args)\n cost_time = round(time.time() - start_time, 5)\n logger.info(\"function [{}] run time: {:.2f} min\".format(func.__name__, cost_time / 60))\n return retval\n return timer\nclass ActionDetection(object):" + }, + { + "comment": "This code initializes a ModelPredict object with various configuration settings and properties. It reads configuration data from a specified file, prints relevant information for debugging, and sets attributes related to model components such as BMN_ONLY, LSTM_ONLY, and PCM_ONLY. 
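Construction alone only parses the configuration; in practice the object is driven in three steps, construct, load, infer, as in the sketch below (the call pattern mirrors the `__main__` block further down in this file; paths are placeholders and the import assumes the predict package is on the Python path):

```python
# Sketch of the intended call pattern; paths below are placeholders.
from action import ActionDetection  # assumes action_detect/ is importable

detector = ActionDetection(cfg_file="configs/configs.yaml")
detector.load_model()  # builds the image/audio/BMN/LSTM sub-models unless DEBUG is set
bmn_results, action_results = detector.infer(
    imgs_path="/path/to/frames/<video_id>",  # extracted frames of one video
    pcm_path="/path/to/pcm/<video_id>.pcm",  # raw audio stream of the same video
    fps=5,
)
```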
If LSTM_ONLY is set to true, it populates a prop_dict with video names and their corresponding BMN results for later use in the load_model function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/action.py\":44-70", + "content": " \"\"\"ModelPredict\"\"\"\n def __init__(self, cfg_file=\"configs/configs.yaml\"):\n cfg = parse_config(cfg_file)\n self.configs = cfg\n print_configs(self.configs, \"Infer\")\n name = 'COMMON'\n self.DEBUG = cfg[name]['DEBUG']\n self.BMN_ONLY = cfg[name]['BMN_ONLY']\n self.LSTM_ONLY = cfg[name]['LSTM_ONLY']\n self.PCM_ONLY = cfg[name]['PCM_ONLY']\n if self.LSTM_ONLY:\n self.prop_dict = {}\n for dataset in ['EuroCup2016']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n self.prop_dict[basename] = item['bmn_results']\n @record_time_info\n def load_model(self):\n \"\"\"\n load_model\n \"\"\"" + }, + { + "comment": "This code initializes models for image, audio, property prediction, and action classification. If DEBUG is not set, it creates InferModel instances for each model type. It then extracts features from input video, gets proposals using BMN (Bidirectional Mixture of Experts Network), and classifies the actions based on these features and proposals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/action.py\":71-102", + "content": " if not self.DEBUG:\n self.image_model = image_model.InferModel(self.configs)\n if not self.PCM_ONLY:\n self.audio_model = audio_model.InferModel(self.configs)\n if not self.LSTM_ONLY:\n self.prop_model = prop_model.InferModel(self.configs)\n if not self.BMN_ONLY:\n self.classify_model = classify_model.InferModel(self.configs)\n logger.info(\"==> Action Detection prepared.\")\n @record_time_info\n def infer(self, imgs_path, pcm_path, fps=5):\n \"\"\"\n extract_feature\n \"\"\"\n self.imgs_path = imgs_path\n self.pcm_path = pcm_path\n self.configs['COMMON']['fps'] = fps\n logger.info(\"==> input video {}\".format(os.path.basename(self.imgs_path)))\n # step 1: extract feature\n video_features = self.extract_feature()\n # step2: get proposal\n bmn_results = self.extract_proposal(video_features)\n # step3: classify \n material = {'feature': video_features, 'proposal': bmn_results}" + }, + { + "comment": "The code contains multiple methods: `video_classify`, `extract_proposal`, and `extract_feature`. The `video_classify` method predicts actions using a classification model, while the `extract_proposal` method extracts proposals (BMN results) for an input video. Both methods are decorated with the `@record_time_info` decorator to track execution time. 
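The decorator itself is a thin timing wrapper; a self-contained sketch in the same spirit (using `print` in place of the project logger) is shown below:

```python
import functools
import time

def record_time_info(func):
    """Log how long the wrapped function takes, mirroring the decorator above."""
    @functools.wraps(func)
    def timer(*args, **kwargs):
        print(f"function [{func.__name__}] processing ...")
        start = time.time()
        result = func(*args, **kwargs)
        print(f"function [{func.__name__}] run time: {(time.time() - start) / 60:.2f} min")
        return result
    return timer

@record_time_info
def extract_feature():
    time.sleep(0.1)  # stand-in for real work
    return "features"

print(extract_feature())
```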
The `extract_feature` method extracts features from images in the given path and is only executed if `DEBUG` flag is not set.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/action.py\":103-131", + "content": " action_results = self.video_classify(material)\n return bmn_results, action_results\n @record_time_info\n def video_classify(self, material):\n \"\"\"video classify\"\"\"\n if self.BMN_ONLY:\n return []\n action_results = self.classify_model.predict(self.configs, material=material) \n logger.info('action shape {}'.format(np.array(action_results).shape))\n return action_results\n @record_time_info\n def extract_proposal(self, video_features):\n \"\"\"extract proposal\"\"\"\n if self.LSTM_ONLY:\n basename = self.imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = self.prop_dict[basename]\n return bmn_results\n bmn_results = self.prop_model.predict(self.configs, material=video_features)\n logger.info('proposal shape {}'.format(np.array(bmn_results).shape))\n return bmn_results\n @record_time_info\n def extract_feature(self):\n \"\"\"extract feature\"\"\"\n if not self.DEBUG:\n image_path_list = get_images(self.imgs_path)" + }, + { + "comment": "This code configures the PPTSM model with image and audio data, then predicts features for both. If PCM_ONLY is True, it extracts MFCC from pcm file. It creates a video feature dictionary containing the predicted image, audio, and (if applicable) pcm features. If no input images are given, it loads the corresponding features from the specified feature path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/action.py\":132-150", + "content": " self.configs['PPTSM']['frame_list'] = image_path_list\n self.configs['AUDIO']['pcm_file'] = self.pcm_path\n image_features = self.image_model.predict(self.configs)\n if self.PCM_ONLY:\n sample_rate = self.configs['AUDIO']['sample_rate']\n pcm_features = mfcc_extractor.extract_pcm(self.pcm_path, sample_rate)\n audio_features = []\n else:\n audio_features, pcm_features = self.audio_model.predict(self.configs)\n np_image_features = np.array(image_features, dtype=np.float32)\n np_audio_features = np.array(audio_features, dtype=np.float32)\n np_pcm_features = np.array(pcm_features, dtype=np.float32)\n video_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features,\n 'pcm_feature': np_pcm_features}\n else:\n feature_path = self.imgs_path.replace(\"frames\", \"features\") + '.pkl'" + }, + { + "comment": "Code loads pre-existing video features from a file, checks their shapes and returns them for further processing. It then creates an instance of the ActionDetection model, loads the model, and calls infer function with image and audio paths. 
Finally, it writes results to a JSON file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/action.py\":151-172", + "content": " video_features = pickle.load(open(feature_path, 'rb'))\n logger.info(\"feature shape {} {} {}\".format(video_features['image_feature'].shape,\n video_features['audio_feature'].shape,\n video_features['pcm_feature'].shape))\n return video_features\nif __name__ == '__main__':\n model_predict = ActionDetection(cfg_file=\"../configs/configs.yaml\")\n model_predict.load_model()\n imgs_path = \"/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895\"\n pcm_path = \"/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm\"\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results = {'bmn_results': bmn_results, 'action_results': action_results}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/44af1e9d-a252-4491-9e4b-218ec8498ceb.json b/docs/doc/44af1e9d-a252-4491-9e4b-218ec8498ceb.json new file mode 100644 index 000000000..38517f7b7 --- /dev/null +++ b/docs/doc/44af1e9d-a252-4491-9e4b-218ec8498ceb.json @@ -0,0 +1,40 @@ +{ + "summary": "This code defines a BMN loss function for PaddleVideo, considering time-scale attributes and ratio of positive entries. It also includes a loss function for object detection models with weighted samples, position losses, and ground truth IoU masks. The code further defines a loss function for PEM and TEAM tasks by combining predicted and ground truth values using three loss functions.", + "details": [ + { + "comment": "This code defines a BMN loss function for the PaddleVideo library. It is registered in the LOSSES registry and takes two arguments: dscale and tscale, which represent max duration length and sequence length respectively. The class extends BaseWeightedLoss, suggesting it combines multiple weighted losses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass BMNLoss(BaseWeightedLoss):\n \"\"\"Loss for BMN model\n Args:\n tscale (int): sequence length, default 100.\n dscale (int): max duration length, default 100.\n \"\"\"\n def __init__(self, dscale, tscale):\n super().__init__()\n self.dscale = dscale" + }, + { + "comment": "This code defines a class with a time-scale attribute, a method to create binary mask arrays, and a loss function for a specific task. The loss function takes in predicted start and end positions along with ground truth values and calculates a ratio between the number of entries and positive values. 
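A small numeric sketch of this class-balancing scheme (NumPy is used purely for illustration; the tensors are toy values) makes the weighting concrete:

```python
import numpy as np

# Toy ground-truth start probabilities: 2 positives out of 8 positions.
gt = np.array([0.0, 0.0, 0.9, 1.0, 0.1, 0.0, 0.0, 0.2])
pred = np.array([0.1, 0.2, 0.8, 0.7, 0.3, 0.1, 0.2, 0.4])

pmask = (gt > 0.5).astype(np.float32)
ratio = pmask.size / pmask.sum()        # 8 / 2 = 4.0
coef_1 = 0.5 * ratio                    # weight on the positive log-term
coef_0 = 0.5 * ratio / (ratio - 1.0)    # weight on the negative log-term
eps = 1e-6
loss = -(coef_1 * np.mean(np.log(pred + eps) * pmask)
         + coef_0 * np.mean(np.log(1.0 - pred + eps) * (1.0 - pmask)))
print(coef_0, coef_1, loss)
```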
This ratio is then used to calculate a coefficient for the loss function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py\":32-54", + "content": " self.tscale = tscale\n def _get_mask(self, dscale, tscale):\n bm_mask = []\n for idx in range(dscale):\n mask_vector = [1 for i in range(tscale - idx)\n ] + [0 for i in range(idx)]\n bm_mask.append(mask_vector)\n bm_mask = np.array(bm_mask, dtype='float32')\n bm_mask = paddle.to_tensor(bm_mask)\n bm_mask.stop_gradient = True\n return bm_mask\n def tem_loss_func(self, pred_start, pred_end, gt_start, gt_end):\n def bi_loss(pred_score, gt_label, datatype):\n pred_score = paddle.reshape(x=pred_score, shape=[-1])\n gt_label = paddle.reshape(x=gt_label, shape=[-1])\n gt_label.stop_gradient = True\n pmask = paddle.cast(x=(gt_label > 0.5), dtype=datatype)\n num_entries = paddle.cast(paddle.shape(pmask), dtype=datatype)\n num_positive = paddle.cast(paddle.sum(pmask), dtype=datatype)\n ratio = num_entries / num_positive\n coef_0 = 0.5 * ratio / (ratio - 1)" + }, + { + "comment": "The code defines a loss function for object detection models. It calculates the loss by considering positive and negative samples, applying weights to each sample based on their ratio, and then combines them. The bi_loss function is used to calculate losses for start and end positions. In another function, Pem_reg_loss_func, it separates ground truth IoU map into three masks: high (>0.7), medium (<=0.7 & >0.3), and low (<=0.3 & >=0). It then applies these masks to calculate the loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py\":55-76", + "content": " coef_1 = 0.5 * ratio\n epsilon = 0.000001\n loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask)\n loss_pos = coef_1 * paddle.mean(loss_pos)\n loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon),\n (1.0 - pmask))\n loss_neg = coef_0 * paddle.mean(loss_neg)\n loss = -1 * (loss_pos + loss_neg)\n return loss\n loss_start = bi_loss(pred_start, gt_start, pred_start.dtype)\n loss_end = bi_loss(pred_end, gt_end, pred_start.dtype)\n loss = loss_start + loss_end\n return loss\n def pem_reg_loss_func(self, pred_score, gt_iou_map, mask):\n gt_iou_map = paddle.multiply(gt_iou_map, mask)\n u_hmask = paddle.cast(x=gt_iou_map > 0.7, dtype=pred_score.dtype)\n u_mmask = paddle.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3)\n u_mmask = paddle.cast(x=u_mmask, dtype=pred_score.dtype)\n u_lmask = paddle.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.)" + }, + { + "comment": "Calculating the number of elements in different masks and using them to calculate ratios for later mask operations. Creating uniform masks and multiplying them with corresponding existing masks, then casting the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py\":77-100", + "content": " u_lmask = paddle.cast(x=u_lmask, dtype=pred_score.dtype)\n u_lmask = paddle.multiply(u_lmask, mask)\n num_h = paddle.cast(paddle.sum(u_hmask), dtype=pred_score.dtype)\n num_m = paddle.cast(paddle.sum(u_mmask), dtype=pred_score.dtype)\n num_l = paddle.cast(paddle.sum(u_lmask), dtype=pred_score.dtype)\n r_m = num_h / num_m\n u_smmask = paddle.uniform(shape=[\n gt_iou_map.shape[1], gt_iou_map.shape[2]\n ],\n min=0.0,\n max=1.0).astype(pred_score.dtype)\n u_smmask = paddle.multiply(u_mmask, u_smmask)\n u_smmask = paddle.cast(x=(u_smmask > (1. 
- r_m)),\n dtype=pred_score.dtype)\n r_l = num_h / num_l\n u_slmask = paddle.uniform(shape=[\n gt_iou_map.shape[1], gt_iou_map.shape[2]\n ],\n min=0.0,\n max=1.0).astype(pred_score.dtype)\n u_slmask = paddle.multiply(u_lmask, u_slmask)" + }, + { + "comment": "In this code, u_slmask is created by comparing r_l with 1 and casting the result to the dtype of pred_score. Then, weights are calculated by adding u_hmask, u_smmask, and u_slmask. The stop_gradient attribute of weights is set to True. Loss is calculated using square error cost between pred_score and gt_iou_map, multiplied by weights, averaged, and returned.\nIn the pem_cls_loss_func, gt_iou_map is multiplied by mask and marked as non-trainable (stop_gradient = True). Pmask and nmask are created based on conditions with gt_iou_map and mask. Num_positive and num_entries are calculated. Ratios are used to determine coef_0 and coef_1. Loss_pos is log(pred_score + epsilon) multiplied by pmask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py\":101-125", + "content": " u_slmask = paddle.cast(x=(u_slmask > (1. - r_l)),\n dtype=pred_score.dtype)\n weights = u_hmask + u_smmask + u_slmask\n weights.stop_gradient = True\n loss = F.square_error_cost(pred_score, gt_iou_map)\n loss = paddle.multiply(loss, weights)\n loss = 0.5 * paddle.sum(loss) / paddle.sum(weights)\n return loss\n def pem_cls_loss_func(self, pred_score, gt_iou_map, mask):\n gt_iou_map = paddle.multiply(gt_iou_map, mask)\n gt_iou_map.stop_gradient = True\n pmask = paddle.cast(x=(gt_iou_map > 0.9), dtype=pred_score.dtype)\n nmask = paddle.cast(x=(gt_iou_map <= 0.9), dtype=pred_score.dtype)\n nmask = paddle.multiply(nmask, mask)\n num_positive = paddle.sum(pmask)\n num_entries = num_positive + paddle.sum(nmask)\n ratio = num_entries / num_positive\n coef_0 = 0.5 * ratio / (ratio - 1)\n coef_1 = 0.5 * ratio\n epsilon = 0.000001\n loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask)" + }, + { + "comment": "Function `forward` takes in `pred_bm`, `pred_start`, `pred_end`, `gt_iou_map`, `gt_start`, and `gt_end`. It first extracts `pred_bm_reg` and `pred_bm_cls` by slicing `pred_bm` along the specified axes. Then, it calculates the `bm_mask` using `_get_mask` with given scales. The function returns the calculated loss from the input parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py\":126-146", + "content": " loss_pos = coef_1 * paddle.sum(loss_pos)\n loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon),\n nmask)\n loss_neg = coef_0 * paddle.sum(loss_neg)\n loss = -1 * (loss_pos + loss_neg) / num_entries\n return loss\n def forward(self, pred_bm, pred_start, pred_end, gt_iou_map, gt_start,\n gt_end):\n pred_bm_reg = paddle.squeeze(paddle.slice(pred_bm,\n axes=[1],\n starts=[0],\n ends=[1]),\n axis=[1])\n pred_bm_cls = paddle.squeeze(paddle.slice(pred_bm,\n axes=[1],\n starts=[1],\n ends=[2]),\n axis=[1])\n bm_mask = self._get_mask(self.dscale, self.tscale)" + }, + { + "comment": "This code calculates the loss for PEM and TEAM detection tasks by combining the predicted and ground truth values. It uses three loss functions: `pem_reg_loss_func`, `pem_cls_loss_func`, and `tem_loss_func`. 
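Schematically, the forward pass reduces to splitting the two channels of `pred_bm` and taking a weighted sum of the three partial losses; a minimal sketch (assuming `paddle` is installed, and using plain indexing in place of the slice-and-squeeze calls in the file) is:

```python
import paddle

# Toy prediction map with 2 channels: [batch, 2, dscale, tscale].
pred_bm = paddle.rand([1, 2, 100, 100])
pred_bm_reg = pred_bm[:, 0]  # confidence-regression channel
pred_bm_cls = pred_bm[:, 1]  # classification channel

# The three partial losses are then combined with the weights from the code above:
# loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss
print(pred_bm_reg.shape, pred_bm_cls.shape)
```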
The final loss is the sum of the temporal (TEM) loss, 10 times the PEM regression loss, and the PEM classification loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/bmn_loss.py\":148-154", + "content": " pem_reg_loss = self.pem_reg_loss_func(pred_bm_reg, gt_iou_map, bm_mask)\n pem_cls_loss = self.pem_cls_loss_func(pred_bm_cls, gt_iou_map, bm_mask)\n tem_loss = self.tem_loss_func(pred_start, pred_end, gt_start, gt_end)\n loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss\n return loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/455ffff8-f746-4f1a-9660-e6765fa42cf5.json b/docs/doc/455ffff8-f746-4f1a-9660-e6765fa42cf5.json new file mode 100644 index 000000000..bc6fd9e7e --- /dev/null +++ b/docs/doc/455ffff8-f746-4f1a-9660-e6765fa42cf5.json @@ -0,0 +1,20 @@ +{ + "summary": "The DepthEstimator class inherits from BaseEstimator and contains a forward_net method for feature extraction. It has training, validation, testing, and inference methods with loss metrics calculated using the forward_net and head.loss.", + "details": [ + { + "comment": "The code defines a DepthEstimator class that inherits from BaseEstimator. It has a forward_net method that takes inputs and optionally applies a backbone network for feature extraction. The results are stored in outputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/depth_estimator.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nfrom paddlevideo.modeling.framework.estimators.base import BaseEstimator\nfrom paddlevideo.modeling.registry import ESTIMATORS\nfrom paddlevideo.utils import get_logger\nfrom ... import builder\nlogger = get_logger(\"paddlevideo\")\n@ESTIMATORS.register()\nclass DepthEstimator(BaseEstimator):\n \"\"\"DepthEstimator\n \"\"\"\n def forward_net(self, inputs, day_or_night='day_and_night'):\n if self.backbone is not None:\n outputs = self.backbone(inputs, day_or_night)\n else:\n outputs = inputs" + }, + { + "comment": "The code defines four methods: train_step, val_step, test_step, and infer_step. The main purpose of each step is to calculate the loss metrics from the input data. 
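The same two-line pattern, run the backbone and then hand both inputs and outputs to the head's loss, repeats in every step; a stripped-down, self-contained sketch of that contract (with stand-in backbone and head) is:

```python
# Minimal sketch of the estimator step contract; ToyHead stands in for the real head.
class ToyHead:
    def loss(self, inputs, outputs):
        # stand-in metric: mean absolute difference between inputs and outputs
        return {"loss": sum(abs(i - o) for i, o in zip(inputs, outputs)) / len(inputs)}

class ToyEstimator:
    def __init__(self, backbone=None, head=None):
        self.backbone = backbone
        self.head = head or ToyHead()

    def forward_net(self, inputs, day_or_night="day_and_night"):
        # fall back to identity when no backbone is configured
        return self.backbone(inputs, day_or_night) if self.backbone else inputs

    def train_step(self, data_batch):
        inputs, _ = data_batch
        outputs = self.forward_net(inputs, day_or_night="day_and_night")
        return self.head.loss(inputs, outputs)

print(ToyEstimator().train_step(([1.0, 2.0, 3.0], None)))
```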
The 'forward_net' method is used in all steps to process the inputs and generate outputs, which are then passed to the 'head.loss' method to compute the loss metrics for each step.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/depth_estimator.py\":31-57", + "content": " return outputs\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n inputs, _ = data_batch\n outputs = self.forward_net(inputs, day_or_night='day_and_night')\n loss_metrics = self.head.loss(inputs, outputs)\n return loss_metrics\n def val_step(self, data_batch):\n inputs, day_or_night = data_batch\n outputs = self.forward_net(inputs, day_or_night=day_or_night)\n loss_metrics = self.head.loss(inputs, outputs)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n inputs, day_or_night = data_batch\n outputs = self.forward_net(inputs, day_or_night=day_or_night)\n loss_metrics = self.head.loss(inputs, outputs)\n return loss_metrics\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n inputs = data_batch[0]\n outputs = self.forward_net(inputs, day_or_night='day')" + }, + { + "comment": "This code snippet returns the output results from a depth estimator model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/depth_estimator.py\":58-58", + "content": " return outputs" + } + ] +} \ No newline at end of file diff --git a/docs/doc/45cb0c72-1c7b-45ae-96be-26a6cf633a8a.json b/docs/doc/45cb0c72-1c7b-45ae-96be-26a6cf633a8a.json new file mode 100644 index 000000000..4b257b5fe --- /dev/null +++ b/docs/doc/45cb0c72-1c7b-45ae-96be-26a6cf633a8a.json @@ -0,0 +1,25 @@ +{ + "summary": "The code trains and validates PoseC3D, a skeleton-based action recognition model, on the UCF101 dataset, using pre-trained weights. It details testing and inference processes without GPU acceleration or TensorRT.", + "details": [ + { + "comment": "PoseC3D is a skeleton-based action recognition approach that utilizes 3D head pose features and aims to overcome the limitations of GCN-based methods in terms of robustness, interoperability, and scalability. It involves training on UCF101, testing on UCF101, exporting an inference model, and inferring using the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/posec3d.md\":0-23", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/posec3d.md) | English\n# PoseC3D\n---\n## Contents\n- [PoseC3D](#PoseC3D)\n - [Contents](#contents)\n - [Introduction](#introduction)\n - [Data](#data)\n - [Train](#train)\n - [Train on UCF101.](#train-on-ucf101)\n - [Test](#test)\n - [Test onf UCF101](#test-onf-ucf101)\n - [Inference](#inference)\n - [export inference model](#export-inference-model)\n - [infer](#infer)\n - [Reference](#reference)\n## Introduction\nHuman skeleton, as a compact representation of hu-man action, has received increasing attention in recentyears. Many skeleton-based action recognition methodsadopt graph convolutional networks (GCN) to extract fea-tures on top of human skeletons. Despite the positive re-sults shown in previous works, GCN-based methods aresubject to limitations in robustness, interoperability, andscalability. 
In this work, we propose PoseC3D, a new ap-proach to skeleton-based action recognition, which relieson a 3D hea" + }, + { + "comment": "This code is for training the PoseC3D model on UCF101 dataset. It requires downloading pre-trained model weights from a specific URL. The command \"python3.7 main.py --validate -c configs/recognition/posec3d/posec3d.yaml --weights res3d_k400.pdparams\" is used to train the PoseC3D model using a provided configuration file and pre-trained weights. The trained model will be validated, likely to assess its performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/posec3d.md\":23-38", + "content": "tmap stack instead of a graph sequence asthe base representation of human skeletons. Compared toGCN-based methods, PoseC3D is more effective in learningspatiotemporal features, more robust against pose estima-tion noises, and generalizes better in cross-dataset settings.Also, PoseC3D can handle multiple-person scenarios with-out additional computation cost, and its features can be eas-ily integrated with other modalities at early fusion stages,which provides a great design space to further boost theperformance. On four challenging datasets, PoseC3D con-sistently obtains superior performance, when used alone onskeletons and in combination with the RGB modality.\n## Data\nPlease download UCF101 skeletons datasets and pretraind model weights.\n[https://aistudio.baidu.com/aistudio/datasetdetail/140593](https://aistudio.baidu.com/aistudio/datasetdetail/140593)\n## Train\n### Train on UCF101.\n- Train PoseC3D model:\n```bash\npython3.7 main.py --validate -c configs/recognition/posec3d/posec3d.yaml --weights res3d_k400.pdparams" + }, + { + "comment": "This code provides instructions for testing and inference of the PoseC3D model on UCF101 dataset. The test script specifies the config file and weight path, while the inference steps explain how to export the model architecture and parameters for further usage. 
The link leads to additional information on model inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/posec3d.md\":39-81", + "content": "```\n## Test\n### Test onf UCF101\n- Test scripts\uff1a\n```bash\npython3.7 main.py --test -c configs/recognition/posec3d/posec3d.yaml -w output/PoseC3D/PoseC3D_epoch_0012.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on UCF101 dataset:\n| Test_Data | Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| UCF101 test1 | 87.05 | [PoseC3D_ucf101.pdparams]() |\n## Inference\n### export inference model\n To get model architecture file `PoseC3D.pdmodel` and parameters file `PoseC3D.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/posec3d/posec3d.yaml \\\n -p data/PoseC3D_ucf101.pdparams \\\n -o inference/PoseC3D\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_UCF101_skeleton.pkl\\\n --config configs/recognition/posec3d/posec3d.yaml \\" + }, + { + "comment": "Running PoseC3D model for inference with GPU acceleration and without TensorRT.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/posec3d.md\":82-99", + "content": " --model_file inference/PoseC3D/PoseC3D.pdmodel \\\n --params_file inference/PoseC3D/PoseC3D.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example_UCF101_skeleton.pkl\n\ttop-1 class: 0\n\ttop-1 score: 0.6731489896774292\n```\n## Reference\n- [Revisiting Skeleton-based Action Recognition](https://arxiv.org/pdf/2104.13586v1.pdf), Haodong Duan, Yue Zhao, Kai Chen, Dian Shao, Dahua Lin, Bo Dai" + } + ] +} \ No newline at end of file diff --git a/docs/doc/45ce3ebf-27d8-4371-a3cd-e3850e588fe8.json b/docs/doc/45ce3ebf-27d8-4371-a3cd-e3850e588fe8.json new file mode 100644 index 000000000..d39e58f2c --- /dev/null +++ b/docs/doc/45ce3ebf-27d8-4371-a3cd-e3850e588fe8.json @@ -0,0 +1,185 @@ +{ + "summary": "This function computes L2 distances, applies nearest neighbor attention and feature extraction, considers padding, uses local search windows and average pooling. It introduces a custom layer, calculates nearest neighbor features with embeddings, updates global map dictionaries, and processes inputs to return output dictionaries after calculations on local distance maps for each frame. The code segment updates global and local map dictionaries, calculates frame embeddings and masks, obtains segmentation predictions, and processes data for improved video processing accuracy.", + "details": [ + { + "comment": "This code defines a function that calculates pairwise squared L2 distances between two tensors. It takes in two tensors, x and y, and optionally a third tensor ys. The function first computes the sum of squares for each row in tensor x and stores them in xs. If ys is None, it then computes the sum of squares for each row in tensor y and stores them in ys. Otherwise, it uses the provided ys. 
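The expansion being exploited is the identity ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y; a quick NumPy check of that identity (illustration only, not code from the file) is:

```python
import numpy as np

x = np.random.rand(4, 3)  # [n, feature_dim]
y = np.random.rand(5, 3)  # [m, feature_dim]

xs = np.sum(x * x, axis=1, keepdims=True)    # [n, 1]
ys = np.sum(y * y, axis=1, keepdims=True).T  # [1, m]
d_fast = xs + ys - 2.0 * x @ y.T             # [n, m] squared distances

d_naive = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
print(np.allclose(d_fast, d_naive))          # True (up to rounding)
```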
Finally, the function calculates the pairwise distances using the formula xs + ys - 2 * paddle.matmul(x, paddle.t(y)).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":0-36", + "content": "import numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils.manet_utils import int_, float_, long_, load\nfrom EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_\n#############################################################GLOBAL_DIST_MAP\nMODEL_UNFOLD = True\nWRONG_LABEL_PADDING_DISTANCE = 1e20\ndef _pairwise_distances(x, y, ys=None):\n \"\"\"Computes pairwise squared l2 distances between tensors x and y.\n Args:\n x: Tensor of shape [n, feature_dim].\n y: Tensor of shape [m, feature_dim].\n Returns:\n Float32 distances tensor of shape [n, m].\n \"\"\"\n xs = paddle.sum(x * x, 1)\n xs = xs.unsqueeze(1)\n if ys is None:\n ys = paddle.sum(y * y, 1)\n ys = ys.unsqueeze(0)\n else:\n ys = ys\n d = xs + ys - 2. * paddle.matmul(x, paddle.t(y))\n return d, ys\n##################\ndef _flattened_pairwise_distances(reference_embeddings, query_embeddings, ys):\n \"\"\"Calculates flattened tensor of pairwise distances between ref and query." + }, + { + "comment": "This function takes reference and query embeddings as input, calculates pairwise distances between them using the _pairwise_distances function, and returns the distances in dists and ys. The _nn_features_per_object_for_chunk function extracts features for each object using nearest neighbor attention, taking reference embeddings, query embeddings, wrong_label_mask, k_nearest_neighbors, and ys as input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":37-59", + "content": " Args:\n reference_embeddings: Tensor of shape [..., embedding_dim],\n the embedding vectors for the reference frame\n query_embeddings: Tensor of shape [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n Returns:\n A distance tensor of shape [reference_embeddings.size / embedding_dim,\n query_embeddings.size / embedding_dim]\n \"\"\"\n embedding_dim = query_embeddings.shape[-1]\n reference_embeddings = reference_embeddings.reshape([-1, embedding_dim])\n first_dim = -1\n query_embeddings = query_embeddings.reshape([first_dim, embedding_dim])\n dists, ys = _pairwise_distances(query_embeddings, reference_embeddings, ys)\n return dists, ys\ndef _nn_features_per_object_for_chunk(reference_embeddings, query_embeddings,\n wrong_label_mask, k_nearest_neighbors,\n ys):\n \"\"\"Extracts features for each object using nearest neighbor attention.\n Args:\n reference_embeddings: Tensor of shape [n_chunk, embedding_dim]," + }, + { + "comment": "This function calculates the pairwise distances between reference and query embeddings, selects the nearest neighbors based on those distances, and returns the nearest neighbor features. 
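The identity used by `_pairwise_distances` above, ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 * x_i . y_j, can be checked with a minimal NumPy sketch (shapes and random inputs below are illustrative, not taken from the repository):

```python
import numpy as np

def pairwise_sq_l2(x, y):
    """x: [n, d], y: [m, d] -> [n, m] squared L2 distances."""
    xs = np.sum(x * x, axis=1, keepdims=True)      # [n, 1]
    ys = np.sum(y * y, axis=1, keepdims=True).T    # [1, m]
    return xs + ys - 2.0 * (x @ y.T)               # broadcasting yields [n, m]

x = np.random.rand(4, 8).astype(np.float32)
y = np.random.rand(5, 8).astype(np.float32)
brute_force = ((x[:, None, :] - y[None, :, :]) ** 2).sum(axis=-1)
assert np.allclose(pairwise_sq_l2(x, y), brute_force, atol=1e-5)
```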
It handles cases with different numbers of reference and query embeddings by padding with a specified distance value for missing embeddings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":60-82", + "content": " the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [m_chunk, embedding_dim], the embedding\n vectors for the query frames.\n wrong_label_mask:\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m_chunk, n_objects, feature_dim].\n \"\"\"\n # reference_embeddings_key = reference_embeddings\n # query_embeddings_key = query_embeddings\n dists, ys = _flattened_pairwise_distances(reference_embeddings,\n query_embeddings, ys)\n dists = (paddle.unsqueeze(dists, 1) +\n paddle.unsqueeze(float_(wrong_label_mask), 0) *\n WRONG_LABEL_PADDING_DISTANCE)\n if k_nearest_neighbors == 1:\n features = paddle.min(dists, 2, keepdim=True)\n else:\n dists, _ = paddle.topk(-dists, k=k_nearest_neighbors, axis=2)\n dists = -dists\n valid_mask = (dists < WRONG_LABEL_PADDING_DISTANCE)" + }, + { + "comment": "The code calculates the mean of distances between valid points and assigns the result to \"features\". The function _selected_pixel() selects pixels from flattened arrays where reference labels are not -1. The function _nearest_neighbor_features_per_object_in_chunks() operates on flattened embeddings, labels, and object ids to compute nearest neighbor features per object in chunks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":83-112", + "content": " masked_dists = dists * valid_mask.float()\n pad_dist = paddle.max(masked_dists, axis=2, keepdim=True)[0].tile(\n (1, 1, masked_dists.shape[-1]))\n dists = paddle.where(valid_mask, dists, pad_dist)\n # take mean of distances\n features = paddle.mean(dists, axis=2, keepdim=True)\n return features, ys\n###\ndef _selected_pixel(ref_labels_flat, ref_emb_flat):\n index_list = paddle.arange(len(ref_labels_flat))\n index_list = index_list\n index_ = paddle.masked_select(index_list, ref_labels_flat != -1)\n index_ = long_(index_)\n ref_labels_flat = paddle.index_select(ref_labels_flat, index_, 0)\n ref_emb_flat = paddle.index_select(ref_emb_flat, index_, 0)\n return ref_labels_flat, ref_emb_flat\n###\ndef _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat,\n query_embeddings_flat,\n reference_labels_flat,\n ref_obj_ids," + }, + { + "comment": "This function calculates the nearest neighbor features per object in chunks to save memory, using chunking for bounding memory usage. It takes embedding vectors for reference and query frames, their class labels, unique object IDs, number of nearest neighbors, and number of chunks as input. 
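A hedged NumPy sketch of the masking-and-top-k step described above (array sizes are made up): distances to reference pixels of a *different* object are pushed out of range with the large padding constant, the k smallest remaining distances are kept per (query pixel, object), padded slots are replaced by the largest valid distance, and the result is averaged.

```python
import numpy as np

WRONG_LABEL_PADDING_DISTANCE = 1e20

def knn_features(dists, wrong_label_mask, k):
    """dists: [m_query, n_ref]; wrong_label_mask: [n_objects, n_ref] bool (True = other object)."""
    d = dists[:, None, :] + wrong_label_mask[None].astype(np.float64) * WRONG_LABEL_PADDING_DISTANCE
    if k == 1:
        return d.min(axis=2, keepdims=True)              # [m_query, n_objects, 1]
    knn = np.sort(d, axis=2)[:, :, :k]                   # k smallest per (query, object)
    valid = knn < WRONG_LABEL_PADDING_DISTANCE
    pad = (knn * valid).max(axis=2, keepdims=True)       # largest valid distance per slot
    knn = np.where(valid, knn, pad)                      # replace padded entries
    return knn.mean(axis=2, keepdims=True)

dists = np.random.rand(6, 10)                            # 6 query pixels, 10 reference pixels
mask = np.random.rand(3, 10) > 0.5                       # 3 objects (illustrative)
print(knn_features(dists, mask, k=2).shape)              # (6, 3, 1)
```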
The function returns a tensor of nearest neighbor features for the query frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":113-133", + "content": " k_nearest_neighbors,\n n_chunks, **cfg):\n \"\"\"Calculates the nearest neighbor features per object in chunks to save mem.\n Uses chunking to bound the memory use.\n Args:\n reference_embeddings_flat: Tensor of shape [n, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings_flat: Tensor of shape [m, embedding_dim], the embedding\n vectors for the query frames.\n reference_labels_flat: Tensor of shape [n], the class labels of the\n reference frame.\n ref_obj_ids: int tensor of unique object ids in the reference labels.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m, n_objects, feature_dim].\n \"\"\"\n # reference_embeddings_flat = reference_embeddings_flat.cpu()" + }, + { + "comment": "This code splits the query embeddings into multiple chunks, depending on the number of chunks specified. It then applies a function to each chunk and appends the results to the all_features list. If in test mode, it selects pixels from the reference and query embeddings. It also creates a wrong label mask for the reference labels and query embeddings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":134-157", + "content": " # query_embeddings_flat = query_embeddings_flat.cpu()\n # reference_labels_flat = reference_labels_flat.cpu()\n # ref_obj_ids = ref_obj_ids.cpu()\n chunk_size = int_(\n np.ceil((float_(query_embeddings_flat.shape[0]) / n_chunks).numpy()))\n if cfg.get('test_mode'):\n reference_labels_flat, reference_embeddings_flat = _selected_pixel(\n reference_labels_flat, reference_embeddings_flat)\n wrong_label_mask = (reference_labels_flat != paddle.unsqueeze(\n ref_obj_ids, 1))\n all_features = []\n for n in range(n_chunks):\n if n == 0:\n ys = None\n if n_chunks == 1:\n query_embeddings_flat_chunk = query_embeddings_flat\n else:\n chunk_start = n * chunk_size\n chunk_end = (n + 1) * chunk_size\n query_embeddings_flat_chunk = query_embeddings_flat[\n chunk_start:chunk_end]\n features, ys = _nn_features_per_object_for_chunk(\n reference_embeddings_flat, query_embeddings_flat_chunk," + }, + { + "comment": "This code calculates the nearest neighbor features per object using reference embeddings, query embeddings, and reference labels. It takes into account k-nearest neighbors and can handle a specified number of chunks for subsampling. 
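The chunking pattern the comments describe reduces to a small generic sketch (NumPy, with a placeholder per-chunk function standing in for the real distance/feature computation): query rows are processed in blocks so the full [m, n_ref] matrix is never materialised at once.

```python
import numpy as np

def process_in_chunks(query, reference, n_chunks, per_chunk_fn):
    """Apply per_chunk_fn to row blocks of `query` and concatenate the results."""
    chunk_size = int(np.ceil(query.shape[0] / n_chunks))
    outputs = []
    for i in range(n_chunks):
        chunk = query[i * chunk_size:(i + 1) * chunk_size]
        if chunk.shape[0] == 0:                 # nothing left for trailing chunks
            break
        outputs.append(per_chunk_fn(chunk, reference))
    return outputs[0] if len(outputs) == 1 else np.concatenate(outputs, axis=0)

# Placeholder per-chunk computation: squared distances of each chunk row to every reference row.
per_chunk = lambda q, r: ((q[:, None, :] - r[None, :, :]) ** 2).sum(-1)
out = process_in_chunks(np.random.rand(103, 16), np.random.rand(40, 16),
                        n_chunks=10, per_chunk_fn=per_chunk)
print(out.shape)   # (103, 40)
```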
The function returns the nearest neighbor features in the form of a tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":158-180", + "content": " wrong_label_mask, k_nearest_neighbors, ys)\n all_features.append(features)\n if n_chunks == 1:\n nn_features = all_features[0]\n else:\n nn_features = paddle.concat(all_features, axis=0)\n return nn_features\ndef nearest_neighbor_features_per_object(reference_embeddings,\n query_embeddings,\n reference_labels,\n k_nearest_neighbors,\n gt_ids=None,\n n_chunks=100,\n **cfg):\n \"\"\"Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n reference_embeddings: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: Tensor of shape [n_query_images, height, width," + }, + { + "comment": "This code calculates nearest neighbors for query frames based on the given embedding vectors. It takes input parameters like reference frame class labels, maximum number of candidates, and number of nearest neighbors to use. The function returns nearest neighbor features, unique sorted object ids present in the reference labels, and potentially gt_ids if provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":181-200", + "content": " embedding_dim], the embedding vectors for the query frames.\n reference_labels: Tensor of shape [height, width, 1], the class labels of\n the reference frame.\n max_neighbors_per_object: Integer, the maximum number of candidates\n for the nearest neighbor query per object after subsampling,\n or 0 for no subsampling.\n k_nearest_neighbors: Integer, the number of nearest neighbors to use.\n gt_ids: Int tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame. If None, it will be derived from\n reference_labels.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [n_query_images, height, width, n_objects, feature_dim].\n gt_ids: An int32 tensor of the unique sorted object ids present\n in the reference labels.\n \"\"\"\n # reference_embeddings = reference_embeddings.detach().cpu()\n # query_embeddings = query_embeddings.detach().cpu()" + }, + { + "comment": "This code is reshaping tensors and calculating nearest neighbor features for each object in chunks. It first reshapes the embeddings, then applies a function to find the closest neighbors and returns a tensor of these features. 
This process is done in chunks for efficiency and memory management.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":201-223", + "content": " # reference_labels = reference_labels.detach().cpu()\n assert (reference_embeddings.shape[:2] == reference_labels.shape[:2])\n h, w, _ = query_embeddings.shape\n reference_labels_flat = reference_labels.reshape([-1])\n if gt_ids is None:\n ref_obj_ids = paddle.unique(reference_labels_flat)[-1]\n ref_obj_ids = np.arange(0, ref_obj_ids + 1)\n gt_ids = paddle.to_tensor(ref_obj_ids)\n gt_ids = int_(gt_ids)\n else:\n gt_ids = int_(paddle.arange(0, gt_ids + 1))\n embedding_dim = query_embeddings.shape[-1]\n query_embeddings_flat = query_embeddings.reshape([-1, embedding_dim])\n reference_embeddings_flat = reference_embeddings.reshape(\n [-1, embedding_dim])\n nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat,\n reference_labels_flat, gt_ids, k_nearest_neighbors, n_chunks, **cfg)\n nn_features_dim = nn_features.shape[-1]\n nn_features = nn_features.reshape(\n [1, h, w, gt_ids.shape[0], nn_features_dim])" + }, + { + "comment": "This function calculates pairwise squared L2 distances using a local search window, with naive implementation using map_fn. It is used as a fallback when correlation_cost is not available. Inputs are tensors x and y of shape [height, width, feature\\_dim]. It returns a tensor of squared distance values shaped [height, width, (2 * max\\_distance + 1) ** 2], where max\\_distance is an integer representing the maximum distance in pixel coordinates per dimension. The function also applies average pooling with a 2x2 filter and pads the tensors x and y before calculating the distances.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":224-251", + "content": " return nn_features.cuda(), gt_ids\n########################################################################LOCAL_DIST_MAP\ndef local_pairwise_distances2(x, y, max_distance=9):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Naive implementation using map_fn.\n Used as a slow fallback for when correlation_cost is not available.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n Returns:\n Float32 distances tensor of shape\n [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n ori_h, ori_w, _ = x.shape\n x = paddle.transpose(x, [2, 0, 1]).unsqueeze(0)\n x = F.avg_pool2d(x, (2, 2), (2, 2))\n y = paddle.transpose(y, [2, 0, 1]).unsqueeze(0)\n y = F.avg_pool2d(y, (2, 2), (2, 2))\n _, channels, height, width = x.shape\n padding_val = 1e20\n padded_y = F.pad(y," + }, + { + "comment": "This code calculates the nearest neighbor features for local matches in a video. It takes in parameters like previous frame embedding, query embedding, previous frame labels, and ground truth IDs. The function computes distances between frames using Sigmoid activation and bilinear interpolation. 
Max distance determines the maximum allowed distance for a match to be considered valid.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":252-277", + "content": " (max_distance, max_distance, max_distance, max_distance),\n mode='constant',\n value=padding_val)\n offset_y = F.unfold(padded_y, kernel_sizes=[height, width]).reshape(\n [1, channels, height, width, -1])\n x = x.reshape([1, channels, height, width, 1])\n minus = x - offset_y\n dists = paddle.sum(paddle.multiply(minus, minus),\n axis=1).reshape([1, height, width,\n -1]).transpose([0, 3, 1, 2])\n dists = (paddle.nn.functional.sigmoid(dists) - 0.5) * 2\n dists = F.interpolate(dists,\n size=[ori_h, ori_w],\n mode='bilinear',\n align_corners=True)\n dists = dists.squeeze(0).transpose([1, 2, 0])\n return dists\ndef local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding,\n query_embedding,\n prev_frame_labels,\n gt_ids,\n max_distance=12):\n \"\"\"Computes nearest neighbor features while only allowing local matches." + }, + { + "comment": "This code calculates the nearest neighbor features by comparing embedding vectors of query frames with the last frame. It takes input tensors for embedding vectors, previous frame labels, and ground truth IDs along with a maximum distance limit. The function returns the nearest neighbor features in a specific shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":278-297", + "content": " Args:\n prev_frame_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the last frame.\n query_embedding: Tensor of shape [height, width, embedding_dim],\n the embedding vectors for the query frames.\n prev_frame_labels: Tensor of shape [height, width, 1], the class labels of\n the previous frame.\n gt_ids: Int Tensor of shape [n_objs] of the sorted unique ground truth\n ids in the first frame.\n max_distance: Integer, the maximum distance allowed for local matching.\n Returns:\n nn_features: A float32 np.array of nearest neighbor features of shape\n [1, height, width, n_objects, 1].\n \"\"\"\n # print(query_embedding.shape, prev_frame_embedding.shape)\n # print(query_embedding.place, prev_frame_embedding.place)\n # query_embedding = query_embedding.cpu()\n # prev_frame_embedding = prev_frame_embedding.cpu()\n # prev_frame_labels = prev_frame_labels.cpu()\n # print(prev_frame_labels.place, prev_frame_embedding.place, query_embedding.place)" + }, + { + "comment": "Code snippet performs local pairwise distance calculation between query and previous frame embeddings. If MODEL_UNFOLD is enabled, it generates offset labels by unfolding padded labels with kernel sizes matching height and width of the previous frame embedding. It then creates offset masks by checking equality between offset labels and gt_ids. If MODEL_UNFOLD is not enabled, it directly creates masks by comparing previous frame labels and gt_ids. 
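The normalization applied to the distance maps above, (sigmoid(d) - 0.5) * 2, maps non-negative squared distances into [0, 1): a distance of 0 becomes 0 and large distances approach 1. A quick NumPy check:

```python
import numpy as np

def normalize_dist(d):
    # (sigmoid(d) - 0.5) * 2, as used on the global and local distance maps
    return (1.0 / (1.0 + np.exp(-d)) - 0.5) * 2.0

print(normalize_dist(np.array([0.0, 1.0, 5.0, 50.0])))
# approximately [0.       0.462117 0.986614 1.      ]
```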
Finally, it pads the masks using nn.functional.pad with specified padding values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":299-324", + "content": " d = local_pairwise_distances2(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance)\n height, width = prev_frame_embedding.shape[:2]\n if MODEL_UNFOLD:\n labels = float_(prev_frame_labels).transpose([2, 0, 1]).unsqueeze(0)\n padded_labels = F.pad(labels, (\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n 2 * max_distance,\n ))\n offset_labels = F.unfold(padded_labels,\n kernel_sizes=[height, width],\n strides=[2,\n 2]).reshape([height, width, -1, 1])\n offset_masks = paddle.equal(\n offset_labels,\n float_(gt_ids).unsqueeze(0).unsqueeze(0).unsqueeze(0))\n else:\n masks = paddle.equal(prev_frame_labels,\n gt_ids.unsqueeze(0).unsqueeze(0))\n padded_masks = nn.functional.pad(masks, (" + }, + { + "comment": "The code is performing feature extraction and masking for a specific model. It first tiles input data, then applies offset masks to selected regions, and finally extracts minimum distances using the tiled and masked data. The result is a new set of distances which are then reshaped for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":325-357", + "content": " 0,\n 0,\n max_distance,\n max_distance,\n max_distance,\n max_distance,\n ))\n offset_masks = []\n for y_start in range(2 * max_distance + 1):\n y_end = y_start + height\n masks_slice = padded_masks[y_start:y_end]\n for x_start in range(2 * max_distance + 1):\n x_end = x_start + width\n offset_mask = masks_slice[:, x_start:x_end]\n offset_masks.append(offset_mask)\n offset_masks = paddle.stack(offset_masks, axis=2)\n d_tiled = d.unsqueeze(-1).tile((1, 1, 1, gt_ids.shape[0]))\n pad = paddle.ones_like(d_tiled)\n d_masked = paddle.where(offset_masks, d_tiled, pad)\n dists = paddle.min(d_masked, axis=2)\n dists = dists.reshape([1, height, width, gt_ids.shape[0], 1])\n return dists\n##############################################################\n#################\nclass _res_block(nn.Layer):\n def __init__(self, in_dim, out_dim, **cfg):\n super(_res_block, self).__init__()" + }, + { + "comment": "This code defines a convolutional neural network (CNN) architecture for image processing tasks. The class `IntVOS` contains two 2D convolutions, batch normalization, and ReLU activations in its forward pass. The `IntSegHead` class initializes another CNN with different parameters, which seems to be a part of the overall model. 
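A hedged NumPy sketch of the explicit-slicing fallback described above: the previous-frame mask is zero-padded by max_distance on each side and then sliced at every offset of a (2*max_distance + 1)^2 window, producing one shifted copy of the mask per offset (a single-object 2D mask and made-up sizes are used for illustration).

```python
import numpy as np

def build_offset_masks(mask, max_distance):
    """mask: [H, W] bool -> [H, W, (2*max_distance+1)**2] of shifted masks."""
    h, w = mask.shape
    d = max_distance
    padded = np.zeros((h + 2 * d, w + 2 * d), dtype=bool)   # pad with False
    padded[d:d + h, d:d + w] = mask
    offsets = []
    for dy in range(2 * d + 1):
        for dx in range(2 * d + 1):
            offsets.append(padded[dy:dy + h, dx:dx + w])     # one slice per offset
    return np.stack(offsets, axis=2)

m = np.zeros((6, 6), dtype=bool)
m[2, 3] = True
print(build_offset_masks(m, max_distance=2).shape)   # (6, 6, 25)
```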
Both classes extend `nn.Layer`, indicating they are PaddlePaddle's version of PyTorch layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":358-389", + "content": " self.conv1 = nn.Conv2D(in_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu1 = nn.ReLU()\n self.bn1 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom'])\n self.conv2 = nn.Conv2D(out_dim,\n out_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.relu2 = nn.ReLU()\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom'])\n def forward(self, x):\n res = x\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x += res\n return x\n####################\nclass IntSegHead(nn.Layer):\n def __init__(self, in_dim, emb_dim, **cfg):\n super(IntSegHead, self).__init__()\n self.conv1 = nn.Conv2D(in_dim," + }, + { + "comment": "This code defines a custom Convolutional Neural Network (CNN) layer for extracting features from input images. It consists of multiple convolutions, batch normalizations, and ReLU activations. The input image is first passed through several convolution layers with different configurations, followed by batch normalization and ReLU activation functions to improve model performance. Finally, the output is returned after passing it through a single convolution layer and another batch normalization and ReLU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":390-417", + "content": " emb_dim,\n kernel_size=7,\n stride=1,\n padding=3)\n self.bn1 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg['train_bn_mom'])\n self.relu1 = nn.ReLU(True)\n self.res1 = _res_block(emb_dim, emb_dim, **cfg)\n self.res2 = _res_block(emb_dim, emb_dim, **cfg)\n self.conv2 = nn.Conv2D(256,\n emb_dim,\n kernel_size=3,\n stride=1,\n padding=1)\n self.bn2 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg['train_bn_mom'])\n self.relu2 = nn.ReLU(True)\n self.conv3 = nn.Conv2D(emb_dim, 1, 1, 1)\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.res1(x)\n x = self.res2(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu2(x)\n x = self.conv3(x)\n return x" + }, + { + "comment": "This code defines a custom layer _split_separable_conv2d, which consists of two convolutional layers followed by ReLU and batch normalization. The first convolution is performed with the same number of input and output channels, while the second has fewer output channels than input dimensions. 
This architecture helps to reduce parameters and computational cost in a deep learning model for image processing tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":420-441", + "content": "class _split_separable_conv2d(nn.Layer):\n def __init__(self, in_dim, out_dim, kernel_size=7, **cfg):\n super(_split_separable_conv2d, self).__init__()\n self.conv1 = nn.Conv2D(in_dim,\n in_dim,\n kernel_size=kernel_size,\n stride=1,\n padding=int((kernel_size - 1) / 2),\n groups=in_dim)\n self.relu1 = nn.ReLU(True)\n self.bn1 = paddle.nn.BatchNorm2D(in_dim, momentum=cfg['train_bn_mom'])\n self.conv2 = nn.Conv2D(in_dim, out_dim, kernel_size=1, stride=1)\n self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom'])\n kaiming_normal_(self.conv1.weight, mode='fan_out', nonlinearity='relu')\n kaiming_normal_(self.conv2.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu1(x)\n x = self.conv2(x)" + }, + { + "comment": "The code defines two classes: IntVOS and DynamicSegHead. IntVOS is a subclass of nn.Layer and utilizes the DynamicSegHead class as its segmentation head. DynamicSegHead is also a subclass of nn.Layer and consists of several layers (layer1, layer2, layer3, layer4) that apply separable convolutions to the input. Finally, there's a nn.Conv2D layer with Kaiming initialization for the output. This architecture can be used for segmentation tasks in computer vision applications.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":442-487", + "content": " x = self.bn2(x)\n x = self.relu2(x)\n return x\nclass DynamicSegHead(nn.Layer):\n def __init__(self, in_dim, embed_dim, **cfg):\n super(DynamicSegHead, self).__init__()\n self.layer1 = _split_separable_conv2d(in_dim, embed_dim, **cfg)\n self.layer2 = _split_separable_conv2d(embed_dim, embed_dim, **cfg)\n self.layer3 = _split_separable_conv2d(embed_dim, embed_dim, **cfg)\n self.layer4 = _split_separable_conv2d(embed_dim, embed_dim, **cfg)\n self.conv = nn.Conv2D(embed_dim, 1, 1, 1)\n kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu')\n def forward(self, x):\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n x = self.conv(x)\n return x\nfrom ..registry import HEADS\n\"\"\"\n\u8986\u76d6\u539f\u7406\nclass c1:\n def __init__(self):\n self.a = 1\nclass c2(c1):\n def __init__(self):\n super(c2, self).__init__()\n self.a = 2\nc = c2()\nprint(c.a)\n\"\"\"\n@HEADS.register()\nclass IntVOS(nn.Layer):" + }, + { + "comment": "This code defines a class called IntVOS. The constructor takes in a feature_extracter and **cfg parameters, initializes the instance variables, and adds layers to the feature_extracter if required. 
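The parameter saving mentioned above can be made concrete with a quick count (bias terms ignored; the 256-channel width is only an illustrative value, not necessarily the configured one): a depthwise k x k convolution followed by a 1 x 1 pointwise convolution needs far fewer weights than a dense k x k convolution.

```python
def standard_conv_params(c_in, c_out, k):
    return c_in * c_out * k * k

def split_separable_params(c_in, c_out, k):
    # depthwise kxk (groups == c_in) + 1x1 pointwise, as in _split_separable_conv2d
    return c_in * k * k + c_in * c_out

c_in = c_out = 256          # illustrative channel width (assumption)
k = 7                       # default kernel_size in _split_separable_conv2d
print(standard_conv_params(c_in, c_out, k))     # 3211264
print(split_separable_params(c_in, c_out, k))   # 78080
```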
It also initializes the embedding convolution layer for semantic embedding extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":488-505", + "content": " def __init__(self, feature_extracter, **cfg):\n super(IntVOS, self).__init__()\n self.feature_extracter = feature_extracter ##embedding extractor\n self.feature_extracter.cls_conv = nn.Sequential()\n self.feature_extracter.upsample4 = nn.Sequential()\n self.semantic_embedding = None\n self.seperate_conv = nn.Conv2D(cfg['model_aspp_outdim'],\n cfg['model_aspp_outdim'],\n kernel_size=3,\n stride=1,\n padding=1,\n groups=cfg['model_aspp_outdim'])\n self.bn1 = paddle.nn.BatchNorm2D(cfg['model_aspp_outdim'],\n momentum=cfg['train_bn_mom'])\n self.relu1 = nn.ReLU(True)\n self.embedding_conv = nn.Conv2D(cfg['model_aspp_outdim'],\n cfg['model_semantic_embedding_dim'], 1,\n 1)" + }, + { + "comment": "The code initializes and configures the layers for semantic segmentation. It creates a ReLU activation function, a batch normalization layer with specified parameters, and a sequential neural network containing the separate convolution, first batch norm, first ReLU, embedding convolution, second batch norm, and second ReLU. The code also initializes the dynamic segmentation head and (optionally) an inter-segmentation head depending on the config's 'model_useintseg' flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":506-529", + "content": " self.relu2 = nn.ReLU(True)\n self.bn2 = paddle.nn.BatchNorm2D(cfg['model_semantic_embedding_dim'],\n momentum=cfg['train_bn_mom'])\n self.semantic_embedding = nn.Sequential(*[\n self.seperate_conv, self.bn1, self.relu1, self.embedding_conv,\n self.bn2, self.relu2\n ])\n for m in self.semantic_embedding:\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')\n self.dynamic_seghead = DynamicSegHead(\n in_dim=cfg['model_semantic_embedding_dim'] + 3,\n embed_dim=cfg['model_head_embedding_dim'],\n **cfg) # propagation segm head\n if cfg['model_useintseg']:\n self.inter_seghead = IntSegHead(\n in_dim=cfg['model_semantic_embedding_dim'] + 3,\n emb_dim=cfg['model_head_embedding_dim'],\n **cfg)\n else:\n self.inter_seghead = DynamicSegHead(\n in_dim=cfg['model_semantic_embedding_dim'] + 2," + }, + { + "comment": "This code defines a class for a model head that takes input, initializes weights (loading pretrained if available), and calculates the loss during forward pass. It uses various input parameters such as x, ref_scribble_label, previous_frame_mask, etc. 
The forward function extracts features from the input, and is responsible for losses related to the model head.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":530-558", + "content": " embed_dim=cfg['model_head_embedding_dim'],\n **cfg) # interaction segm head\n self.pretrained = cfg.get('pretrained', None)\n self.cfg = cfg\n def init_weights(self):\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n self.set_state_dict(load(self.pretrained, self.state_dict()))\n print('loaded pretrained model')\n def loss(self, **kwargs):\n return self.loss_func(**kwargs)\n def forward(self,\n x=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None):\n x = self.extract_feature(x)" + }, + { + "comment": "This code is splitting input feature x into three parts (ref, previous, current frame embeddings), then calling the prop_seghead function to compute a dictionary of results. If global_map_tmp_dic is None, it returns only the dictionary; otherwise, it also updates global_map_tmp_dic and returns both.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":559-587", + "content": " # print('extract_feature:', x.mean().item())\n ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split(\n x, num_or_sections=3, axis=0)\n if global_map_tmp_dic is None:\n dic = self.prop_seghead(\n ref_frame_embedding,\n previous_frame_embedding,\n current_frame_embedding,\n ref_scribble_label,\n previous_frame_mask,\n normalize_nearest_neighbor_distances,\n use_local_map,\n seq_names,\n gt_ids,\n k_nearest_neighbors,\n global_map_tmp_dic,\n local_map_dics,\n interaction_num,\n start_annotated_frame,\n frame_num,\n self.dynamic_seghead,\n )\n return dic\n else:\n dic, global_map_tmp_dic = self.prop_seghead(\n ref_frame_embedding,\n previous_frame_embedding," + }, + { + "comment": "This code defines a class with three methods: \"IntVOS\", \"extract_feature\", and \"prop_seghead\". The \"IntVOS\" function returns two dictionaries after performing some operations. The \"extract_feature\" method extracts features from input image using feature extracter and semantic embedding. 
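The three-way split in `forward()` — the backbone is run once on the reference, previous and current frames stacked along the batch axis, then the result is split back apart — looks like this in a minimal NumPy sketch (shapes are placeholders):

```python
import numpy as np

bs, c, h, w = 2, 100, 30, 30                                # placeholder shapes
features = np.zeros((3 * bs, c, h, w), dtype=np.float32)    # stacked ref / prev / current frames

ref_emb, prev_emb, cur_emb = np.split(features, 3, axis=0)  # mirrors paddle.split(x, 3, axis=0)
print(ref_emb.shape, prev_emb.shape, cur_emb.shape)         # (2, 100, 30, 30) each
```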
The \"prop_seghead\" method takes various inputs, including frame embeddings, scribble label, and mask, and performs propagation segmentation head task with optional normalization and local map usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":588-621", + "content": " current_frame_embedding,\n ref_scribble_label,\n previous_frame_mask,\n normalize_nearest_neighbor_distances,\n use_local_map,\n seq_names,\n gt_ids,\n k_nearest_neighbors,\n global_map_tmp_dic,\n local_map_dics,\n interaction_num,\n start_annotated_frame,\n frame_num,\n self.dynamic_seghead,\n )\n return dic, global_map_tmp_dic\n def extract_feature(self, x):\n x = self.feature_extracter(x)\n x = self.semantic_embedding(x)\n return x\n def prop_seghead(\n self,\n ref_frame_embedding=None,\n previous_frame_embedding=None,\n current_frame_embedding=None,\n ref_scribble_label=None,\n previous_frame_mask=None,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1," + }, + { + "comment": "This function takes in various parameters and returns feature_embedding, global_match_map, local_match_map, and previous_frame_mask. It initializes global_map_tmp_dic, dic_tmp, bs, c, h, w from current_frame_embedding, checks if it is in test mode, scales ref_scribble_label and previous_frame_mask using interpolation for matching dimensions, and then iterates through a range of bs, performing operations on seq_current_frame_embedding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":622-645", + "content": " global_map_tmp_dic=None,\n local_map_dics=None,\n interaction_num=None,\n start_annotated_frame=None,\n frame_num=None,\n dynamic_seghead=None,\n ):\n \"\"\"return: feature_embedding,global_match_map,local_match_map,previous_frame_mask\"\"\"\n ###############\n cfg = self.cfg\n global_map_tmp_dic = global_map_tmp_dic\n dic_tmp = {}\n bs, c, h, w = current_frame_embedding.shape\n if cfg.get('test_mode'):\n scale_ref_scribble_label = float_(ref_scribble_label)\n else:\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n scale_previous_frame_label = paddle.nn.functional.interpolate(\n float_(previous_frame_mask), size=(h, w), mode='nearest')\n scale_previous_frame_label = int_(scale_previous_frame_label)\n for n in range(bs):\n seq_current_frame_embedding = current_frame_embedding[n]" + }, + { + "comment": "This code calculates nearest neighbor features for each object using reference and current frame embeddings, and scribble labels. It transposes the embeddings and label to match the global map format and uses k-nearest neighbors to find the corresponding features. 
If normalization is enabled, it applies a sigmoid function to normalize the distances.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":646-664", + "content": " seq_ref_frame_embedding = ref_frame_embedding[n]\n seq_prev_frame_embedding = previous_frame_embedding[n]\n seq_ref_frame_embedding = seq_ref_frame_embedding.transpose(\n [1, 2, 0])\n seq_current_frame_embedding = seq_current_frame_embedding.transpose(\n [1, 2, 0])\n seq_ref_scribble_label = scale_ref_scribble_label[n].transpose(\n [1, 2, 0])\n #########Global Map\n nn_features_n, ref_obj_ids = nearest_neighbor_features_per_object(\n reference_embeddings=seq_ref_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_ref_scribble_label,\n k_nearest_neighbors=k_nearest_neighbors,\n gt_ids=gt_ids[n],\n n_chunks=10)\n if normalize_nearest_neighbor_distances:\n nn_features_n = (paddle.nn.functional.sigmoid(nn_features_n) -\n 0.5) * 2" + }, + { + "comment": "This code section checks if a sequence name exists in the global map dictionary, and if not, creates an entry for it. It then compares the current frame's features to the corresponding value in the global map for that sequence. If the current frame's features are less than or equal to the stored value, they remain unchanged; otherwise, they get updated with the stored value. Finally, it updates the global map entry with the new frame's features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":666-686", + "content": " # print(nn_features_n)\n ###\n if global_map_tmp_dic is not None: ###when testing, use global map memory\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([1000, 1, 1, 1, 1])\n nn_features_n = paddle.where(\n nn_features_n <= global_map_tmp_dic[seq_names[n]][\n frame_num[n]].unsqueeze(0), nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(\n 0))\n # print('detach 1')\n # print(nn_features_n.shape)\n # nn_features_n = nn_features_n.detach()\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n.detach()[0]\n #########################Local dist map\n seq_prev_frame_embedding = seq_prev_frame_embedding.transpose(" + }, + { + "comment": "This code is finding the nearest neighbor features for the previous frame's embedding, based on whether local mapping is used or not. If local mapping is used, it calls a separate function `local_previous_frame_nearest_neighbor_features_per_object` to get the features and labels. Otherwise, it uses the `nearest_neighbor_features_per_object` function with specified parameters to find the nearest neighbors. 
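The global-map memory update described above is a per-element running minimum: the stored map keeps the smallest (normalized) distance seen so far, and `paddle.where(new <= stored, new, stored)` writes it back. A small NumPy illustration with made-up values:

```python
import numpy as np

stored = np.ones((2, 3), dtype=np.float32)        # initialised like paddle.ones_like(...)
new = np.array([[0.2, 1.5, 0.9],
                [1.1, 0.4, 0.7]], dtype=np.float32)

stored = np.where(new <= stored, new, stored)     # keep the smaller value element-wise
print(stored)
# [[0.2 1.  0.9]
#  [1.  0.4 0.7]]
```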
The resulting features are stored in `prev_frame_nn_features_n`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":687-706", + "content": " [1, 2, 0])\n seq_previous_frame_label = scale_previous_frame_label[n].transpose(\n [1, 2, 0])\n if use_local_map:\n prev_frame_nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_prev_frame_embedding,\n query_embedding=seq_current_frame_embedding,\n prev_frame_labels=seq_previous_frame_label,\n gt_ids=ref_obj_ids,\n max_distance=cfg['model_max_local_distance'])\n else:\n prev_frame_nn_features_n, _ = nearest_neighbor_features_per_object(\n reference_embeddings=seq_prev_frame_embedding,\n query_embeddings=seq_current_frame_embedding,\n reference_labels=seq_previous_frame_label,\n k_nearest_neighbors=k_nearest_neighbors,\n gt_ids=gt_ids[n],\n n_chunks=20)\n prev_frame_nn_features_n = (" + }, + { + "comment": "This code is checking if the local map dictionaries are not None, indicating testing with local map memory. If a specific sequence name isn't in the local map distance dictionary or temporary map dictionary, it prints an error message and creates a new zero tensor to store the data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":707-723", + "content": " paddle.nn.functional.sigmoid(prev_frame_nn_features_n) -\n 0.5) * 2\n # print(prev_frame_nn_features_n.mean().item(), prev_frame_nn_features_n.shape, interaction_num) # o\n #############\n if local_map_dics is not None: ##When testing, use local map memory\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if seq_names[n] not in local_map_dist_dic:\n print(seq_names[n], 'not in local_map_dist_dic')\n local_map_dist_dic[seq_names[n]] = paddle.zeros(1000, 9)\n if seq_names[n] not in local_map_tmp_dic:\n print(seq_names[n], 'not in local_map_tmp_dic')\n local_map_tmp_dic[seq_names[n]] = paddle.zeros_like(\n prev_frame_nn_features_n).unsqueeze(0).tile(\n [1000, 9, 1, 1, 1, 1])\n # print(local_map_dist_dic[seq_names[n]].shape)\n # print('detach 2')" + }, + { + "comment": "This code segment appears to be part of a larger function that processes video frames and interactions. It stores the distance from the current frame to the first annotated frame in the local_map_dist_dic dictionary, as well as the corresponding previous frame features in the local_map_tmp_dic. 
The code also updates the value of prev_frame_nn_features_n based on certain conditions involving interaction numbers and distances between frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":724-740", + "content": " # prev_frame_nn_features_n = prev_frame_nn_features_n.detach()\n local_map_dist_dic[seq_names[n]][\n frame_num[n], interaction_num -\n 1] = 1.0 / (abs(frame_num[n] - start_annotated_frame)\n ) # bugs fixed.\n local_map_tmp_dic[seq_names[n]][\n frame_num[n],\n interaction_num - 1] = prev_frame_nn_features_n.squeeze(\n 0).detach() # bugs fixed.\n if interaction_num == 1:\n prev_frame_nn_features_n = local_map_tmp_dic[seq_names[n]][\n frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n else:\n if local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 1] > \\\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 2]:" + }, + { + "comment": "This code appears to be part of a video modeling process. It seems to involve local map dictionaries and interaction numbers, comparing previous frames with current ones for float comparisons, unsqueezing and reshaping features and labels, and potentially using these operations in some video modeling or analysis task.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":741-762", + "content": " prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 1]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n else:\n prev_frame_nn_features_n = local_map_tmp_dic[\n seq_names[n]][frame_num[n]][interaction_num - 2]\n prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(\n 0)\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)\n to_cat_previous_frame = (\n float_(seq_previous_frame_label) == float_(ref_obj_ids)\n ) # float comparision?\n to_cat_current_frame_embedding = current_frame_embedding[\n n].unsqueeze(0).tile((ref_obj_ids.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_previous_frame = float_(" + }, + { + "comment": "This code is defining a function \"int_seghead\" that takes in various inputs and returns output dictionaries. It concatenates embeddings and features, passes them to the dynamic_seghead function, transposes the result, and stores it in a dictionary. 
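The reliability score stored in `local_map_dist_dic`, as described above, is the inverse temporal distance to the annotated frame, so features produced closer to the user's scribble win the comparison between interactions. For example:

```python
def local_map_weight(frame_num, start_annotated_frame):
    # 1.0 / |frame_num - start_annotated_frame|, as written into local_map_dist_dic
    return 1.0 / abs(frame_num - start_annotated_frame)

print(local_map_weight(12, 10))   # 0.5   -> 2 frames from the annotated frame
print(local_map_weight(30, 10))   # 0.05  -> 20 frames away, weaker evidence
```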
If global_map_tmp_dic is not None, the function also returns other dictionaries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":763-786", + "content": " to_cat_previous_frame.unsqueeze(-1).transpose([2, 3, 0, 1]))\n to_cat_prev_frame_nn_feature_n = prev_frame_nn_features_n.squeeze(\n 0).transpose([2, 3, 0, 1])\n to_cat = paddle.concat(\n (to_cat_current_frame_embedding, to_cat_nn_feature_n,\n to_cat_prev_frame_nn_feature_n, to_cat_previous_frame), 1)\n pred_ = dynamic_seghead(to_cat)\n pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if global_map_tmp_dic is None:\n return dic_tmp\n else:\n if local_map_dics is None:\n return dic_tmp, global_map_tmp_dic\n else:\n return dic_tmp, global_map_tmp_dic, local_map_dics\n def int_seghead(self,\n ref_frame_embedding=None,\n ref_scribble_label=None,\n prev_round_label=None,\n normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic=None," + }, + { + "comment": "This code snippet calculates the local distance map for each frame in the batch and possibly a previous round if it's not the first interaction. The function takes in various parameters such as ref_frame_embedding, prev_round_label, gt_ids, etc., and performs interpolation to resize the reference scribble label and previous round label. It then iterates over each frame in the batch, creating a gt_id array, and calculating the local distance map for the current frame's embedding. This process may involve interpolation and integer conversion of the resized labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":787-812", + "content": " local_map_dics=None,\n interaction_num=None,\n seq_names=None,\n gt_ids=None,\n k_nearest_neighbors=1,\n frame_num=None,\n first_inter=True):\n dic_tmp = {}\n bs, c, h, w = ref_frame_embedding.shape\n scale_ref_scribble_label = paddle.nn.functional.interpolate(\n float_(ref_scribble_label), size=(h, w), mode='nearest')\n scale_ref_scribble_label = int_(scale_ref_scribble_label)\n if not first_inter:\n scale_prev_round_label = paddle.nn.functional.interpolate(\n float_(prev_round_label), size=(h, w), mode='nearest')\n scale_prev_round_label = int_(scale_prev_round_label)\n n_chunks = 500\n for n in range(bs):\n gt_id = paddle.arange(0, gt_ids[n] + 1)\n gt_id = int_(gt_id)\n seq_ref_frame_embedding = ref_frame_embedding[n]\n ########################Local dist map" + }, + { + "comment": "Updating the global map with the nearest neighbor features for each sequence, only if it's not already in the global_map_tmp_dic.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":813-831", + "content": " seq_ref_frame_embedding = paddle.transpose(seq_ref_frame_embedding,\n [1, 2, 0])\n seq_ref_scribble_label = paddle.transpose(\n scale_ref_scribble_label[n], [1, 2, 0])\n nn_features_n = local_previous_frame_nearest_neighbor_features_per_object(\n prev_frame_embedding=seq_ref_frame_embedding,\n query_embedding=seq_ref_frame_embedding,\n prev_frame_labels=seq_ref_scribble_label,\n gt_ids=gt_id,\n max_distance=self.cfg['model_max_local_distance'])\n #######\n ######################Global map update\n if seq_names[n] not in global_map_tmp_dic:\n global_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).tile([1000, 1, 1, 1, 1])\n nn_features_n_ = paddle.where(\n nn_features_n <=\n 
global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0)," + }, + { + "comment": "This code segment appears to be updating the global and local map dictionaries in a video processing model. The global_map_tmp_dic is being updated with nn_features_n_.detach()[0] at the current frame. Additionally, if the sequence name exists in the local_map_dist_dic or local_map_tmp_dic it is being modified accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":832-853", + "content": " nn_features_n,\n global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0))\n ###\n ###\n # print('detach 3')\n # nn_features_n_ = nn_features_n_.detach()\n global_map_tmp_dic[seq_names[n]][\n frame_num[n]] = nn_features_n_.detach()[0]\n ##################Local map update\n if local_map_dics is not None:\n local_map_tmp_dic, local_map_dist_dic = local_map_dics\n if seq_names[n] not in local_map_dist_dic:\n local_map_dist_dic[seq_names[n]] = paddle.zeros([1000, 9])\n if seq_names[n] not in local_map_tmp_dic:\n local_map_tmp_dic[seq_names[n]] = paddle.ones_like(\n nn_features_n).unsqueeze(0).tile([1000, 9, 1, 1, 1, 1])\n local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num\n - 1] = 0\n local_map_dics = (local_map_tmp_dic, local_map_dist_dic)" + }, + { + "comment": "This code calculates the current frame embedding and nn_feature_n for each object instance in the scene. It then creates a scribble mask for each object and, if not the first iteration, also creates a previous round mask. The code uses transpose and unsqueeze functions for tensor manipulation and float comparisons to create binary masks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":855-877", + "content": " ##################\n to_cat_current_frame_embedding = ref_frame_embedding[n].unsqueeze(\n 0).tile((gt_id.shape[0], 1, 1, 1))\n to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose(\n [2, 3, 0, 1])\n to_cat_scribble_mask_to_cat = (\n float_(seq_ref_scribble_label) == float_(gt_id)\n ) # float comparision?\n to_cat_scribble_mask_to_cat = float_(\n to_cat_scribble_mask_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n if not first_inter:\n seq_prev_round_label = scale_prev_round_label[n].transpose(\n [1, 2, 0])\n to_cat_prev_round_to_cat = (\n float_(seq_prev_round_label) == float_(gt_id)\n ) # float comparision?\n to_cat_prev_round_to_cat = float_(\n to_cat_prev_round_to_cat.unsqueeze(-1).transpose(\n [2, 3, 0, 1]))\n else:" + }, + { + "comment": "In this code, a concatenation of current frame embedding, scribble mask, and previous round information is passed to inter_seghead for segmentation prediction. The predictions are then transposed before being added to dic_tmp for further processing. 
If local_map_dics is None, the function returns dic_tmp; otherwise, it returns both dic_tmp and local_map_dics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py\":878-892", + "content": " to_cat_prev_round_to_cat = paddle.zeros_like(\n to_cat_scribble_mask_to_cat)\n to_cat_prev_round_to_cat[0] = 1.\n to_cat = paddle.concat(\n (to_cat_current_frame_embedding, to_cat_scribble_mask_to_cat,\n to_cat_prev_round_to_cat), 1)\n pred_ = self.inter_seghead(to_cat)\n pred_ = pred_.transpose([1, 0, 2, 3])\n dic_tmp[seq_names[n]] = pred_\n if local_map_dics is None:\n return dic_tmp\n else:\n return dic_tmp, local_map_dics" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4613d6ed-c878-4172-a444-515ae063a628.json b/docs/doc/4613d6ed-c878-4172-a444-515ae063a628.json new file mode 100644 index 000000000..4b7f2452a --- /dev/null +++ b/docs/doc/4613d6ed-c878-4172-a444-515ae063a628.json @@ -0,0 +1,10 @@ +{ + "summary": "This code provides a list of useful action recognition datasets along with their respective links for further reference. These datasets are essential for training and evaluating action recognition models, each serving its specific purpose in the field of computer vision.", + "details": [ + { + "comment": "This code provides a list of useful action recognition datasets along with their respective links for further reference. These datasets are essential for training and evaluating action recognition models, each serving its specific purpose in the field of computer vision.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Action Recognition Datasets\":0-11", + "content": "Usefull Action Recognition Datasets.\n AVA, https://arxiv.org/abs/1705.08421\n Kinetics, https://arxiv.org/abs/1705.06950\n YouTube-8M, https://arxiv.org/abs/1609.08675\n ActivityNet, http://www.cv-foundation.org/openaccess/content_cvpr_2015/html/Heilbron_ActivityNet_A_Large-Scale_2015_CVPR_paper.html\n Moments in Time, https://arxiv.org/pdf/1801.03150.pdf\n Charades, https://arxiv.org/abs/1604.01753\n EPIC-Kitchens, https://arxiv.org/abs/1804.02748\n THUMOS, https://arxiv.org/abs/1604.06182\n UCF-101, http://crcv.ucf.edu/papers/UCF101_CRCV-TR-12-01.pdf\n HMDB51, http://serre-lab.clps.brown.edu/wp-content/uploads/2012/08/Kuehne_etal_iccv11.pdf" + } + ] +} \ No newline at end of file diff --git a/docs/doc/462dc8cb-b385-4035-b04c-9d58c98e7e51.json b/docs/doc/462dc8cb-b385-4035-b04c-9d58c98e7e51.json new file mode 100644 index 000000000..63750305a --- /dev/null +++ b/docs/doc/462dc8cb-b385-4035-b04c-9d58c98e7e51.json @@ -0,0 +1,30 @@ +{ + "summary": "The \"BoxList\" class manages bounding boxes, and the _is_valid_boxes function checks if data array of shape [N, 4] representing box coordinates adheres to the correct format. The function returns a boolean indicating whether all ymax are greater than or equal to ymin, all xmax are greater than or equal to xmin, and the data is not empty.", + "details": [ + { + "comment": "The code defines a class called \"BoxList\" that represents a collection of bounding boxes as a numpy array. Each box is represented by 4 numbers: y_min, x_min, y_max, and x_max. It assumes all boxes in the list correspond to a single image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py\":0-25", + "content": "# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"Numpy BoxList classes and functions.\"\"\"\nimport numpy as np\nclass BoxList:\n \"\"\"Box collection.\n BoxList represents a list of bounding boxes as numpy array, where each\n bounding box is represented as a row of 4 numbers,\n [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within\n a given list correspond to a single image." + }, + { + "comment": "This code defines a class for box collections, where users can optionally add additional related fields such as objectness or classification scores. The `__init__` method checks if the input data is a numpy array, has valid dimensions and data type (float), and raises a ValueError if any of these conditions are not met. It then stores the data in a dictionary with key \"boxes\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py\":27-51", + "content": " Optionally, users can add additional related fields (such as\n objectness/classification scores).\n \"\"\"\n def __init__(self, data):\n \"\"\"Constructs box collection.\n Args:\n data: a numpy array of shape [N, 4] representing box coordinates\n Raises:\n ValueError: if bbox data is not a numpy array\n ValueError: if invalid dimensions for bbox data\n \"\"\"\n if not isinstance(data, np.ndarray):\n raise ValueError('data must be a numpy array.')\n if len(data.shape) != 2 or data.shape[1] != 4:\n raise ValueError('Invalid dimensions for box data.')\n if data.dtype != np.float32 and data.dtype != np.float64:\n raise ValueError(\n 'Invalid data type for box data: float is required.')\n if not self._is_valid_boxes(data):\n raise ValueError('Invalid box data. data must be a numpy array of '\n 'N*[y_min, x_min, y_max, x_max]')\n self.data = {'boxes': data}" + }, + { + "comment": "This code defines a class with methods to handle box collections. It provides functionality for counting the number of boxes, retrieving non-box fields, checking if a specific field exists, and adding data to an existing or new field while handling errors related to field existence and data dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py\":53-80", + "content": " def num_boxes(self):\n \"\"\"Return number of boxes held in collections.\"\"\"\n return self.data['boxes'].shape[0]\n def get_extra_fields(self):\n \"\"\"Return all non-box fields.\"\"\"\n return [k for k in self.data if k != 'boxes']\n def has_field(self, field):\n return field in self.data\n def add_field(self, field, field_data):\n \"\"\"Add data to a specified field.\n Args:\n field: a string parameter used to speficy a related field to be\n accessed.\n field_data: a numpy array of [N, ...] 
representing the data\n associated with the field.\n Raises:\n ValueError: if the field is already exist or the dimension of the\n field data does not matches the number of boxes.\n \"\"\"\n if self.has_field(field):\n raise ValueError('Field ' + field + 'already exists')\n if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(\n ):\n raise ValueError('Invalid dimensions for field data')" + }, + { + "comment": "The code defines a class with methods to access box coordinates from a stored dataset. The \"get\" method returns a numpy array of shape [N, 4] representing box corners. The \"get_field\" method is used to access data related to a specific field in the box collection. If an invalid field is provided, it raises a ValueError. The \"get_coordinates\" method returns a list of 4 1-d numpy arrays containing y_min, x_min, y_max, and x_max values for each box.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py\":81-116", + "content": " self.data[field] = field_data\n def get(self):\n \"\"\"Convenience function for accesssing box coordinates.\n Returns:\n a numpy array of shape [N, 4] representing box corners\n \"\"\"\n return self.get_field('boxes')\n def get_field(self, field):\n \"\"\"Accesses data associated with the specified field in the box\n collection.\n Args:\n field: a string parameter used to speficy a related field to be\n accessed.\n Returns:\n a numpy 1-d array representing data of an associated field\n Raises:\n ValueError: if invalid field\n \"\"\"\n if not self.has_field(field):\n raise ValueError(f'field {field} does not exist')\n return self.data[field]\n def get_coordinates(self):\n \"\"\"Get corner coordinates of boxes.\n Returns:\n a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max]\n \"\"\"\n box_coordinates = self.get()\n y_min = box_coordinates[:, 0]" + }, + { + "comment": "This code defines a function `_is_valid_boxes` which checks if the data array of shape [N, 4] representing box coordinates fulfills the format N*[ymin, xmin, ymax, xmax]. It returns a boolean indicating whether all ymax of boxes are equal or greater than ymin and all xmax of boxes are equal or greater than xmin. The function also checks if the data is not empty.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py\":117-137", + "content": " x_min = box_coordinates[:, 1]\n y_max = box_coordinates[:, 2]\n x_max = box_coordinates[:, 3]\n return [y_min, x_min, y_max, x_max]\n def _is_valid_boxes(self, data):\n \"\"\"Check whether data fullfills the format of N*[ymin, xmin, ymax,\n xmin].\n Args:\n data: a numpy array of shape [N, 4] representing box coordinates\n Returns:\n a boolean indicating whether all ymax of boxes are equal or greater\n than ymin, and all xmax of boxes are equal or greater than xmin.\n \"\"\"\n if len(data):\n for v in data:\n if v[0] > v[2] or v[1] > v[3]:\n return False\n return True" + } + ] +} \ No newline at end of file diff --git a/docs/doc/46462441-92b5-4cd2-9738-198b110c5ae8.json b/docs/doc/46462441-92b5-4cd2-9738-198b110c5ae8.json new file mode 100644 index 000000000..57cd25ae5 --- /dev/null +++ b/docs/doc/46462441-92b5-4cd2-9738-198b110c5ae8.json @@ -0,0 +1,50 @@ +{ + "summary": "The document explains the Oxford-RobotCar data preparation process for day-night depth estimation and provides related file download links. 
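An illustrative use of the BoxList API documented above (the import path mirrors the repository file being described and may differ in an installed package; the box and score values are made up):

```python
import numpy as np
from paddlevideo.metrics.ava_evaluation.np_box_list import BoxList  # path as in the repo

boxes = np.array([[0.1, 0.2, 0.5, 0.6],      # rows are [y_min, x_min, y_max, x_max]
                  [0.0, 0.0, 1.0, 1.0]], dtype=np.float32)
box_list = BoxList(boxes)                    # validates dtype, shape [N, 4] and corner order
box_list.add_field('scores', np.array([0.9, 0.3], dtype=np.float32))

print(box_list.num_boxes())                  # 2
print(box_list.get_extra_fields())           # ['scores']
y_min, x_min, y_max, x_max = box_list.get_coordinates()
print(y_max - y_min)                         # per-box heights: [0.4 1. ]
```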
It outlines dataset preprocessing steps for ADDS-DepthNet training, including filtering, renaming, and image processing. The code showcases a directory structure with consistent training/verification sequences for day and night images.", + "details": [ + { + "comment": "This is a brief introduction to the Oxford-RobotCar-for-ADDS data preparation document. It provides information on downloading and preprocessing the dataset for autonomous driving tasks, specifically day-night depth estimation. The original dataset can be found at the link provided in the text, and any use of this modified version should cite the referenced paper.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":0-23", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/Oxford_RobotCar.md) | English\n# Oxford-RobotCar-for-ADDS data preparation\n- [Introduction](#Introduction)\n- [Data Set Download](#Download)\n- [Preprocessing](#Preprocessing)\n- [1. Image De-distortion](#1-Image-de-distortion)\n- [2. Dynamic frame filter](#2-Dynamic-frame-filter)\n- [3. Image Rename](#3-Image-Rename)\n- [4. Preparation for Day-Pseudo Night Image Pair](#4-Day-Pseudo-Night-Image-Pair-Preparation)\n## Introduction\n[Oxford RobotCar Dataset](https://robotcar-dataset.robots.ox.ac.uk/) is a large-scale autonomous driving data set that contains a large amount of data in different autonomous driving scenarios.\nWhat is used here is to filter a part of the data used for day-night depth estimation from the original Oxford RobotCar data set, namely Oxford-RobotCar-for-ADDS.\nIf you want to use Oxford-RobotCar-for-ADDS, please cite the following papers:\n```latex\n@article{maddern20171,\n title={1 year, 1000 km: The oxford robotcar dataset},\n author={Maddern, Will and Pascoe, Geoffrey and Linegar, Chris and Newman, Paul}," + }, + { + "comment": "This code represents the citation for two research papers in the format of BibTeX. The first paper is titled \"Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation\" and was published in 2021 at the IEEE/CVF International Conference on Computer Vision. The second paper is an Oxford RobotCar dataset study published in The International Journal of Robotics Research in 2017. Both papers are cited within a broader document, likely discussing the use or application of these datasets for computer vision tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":24-45", + "content": " journal={The International Journal of Robotics Research},\n volume={36},\n number={1},\n pages={3--15},\n year={2017},\n publisher={SAGE Publications Sage UK: London, England}\n}\n```\n```latex\n@inproceedings{liu2021self,\n title={Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation},\n author={Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun},\n booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},\n pages={12737--12746},\n year={2021}\n}\n```\n## Download\n1. Download the left eye image of Bumblebee XB3 in the sequence [2014-12-09](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-09-13-21-02/) as For the training set of the daytime scene, the downloaded images are decompressed in the same folder.\n2. 
Download the left eye image of Bumblebee XB3 in the sequence [2014-12-16](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-16-18-44-24/) as The training set of the night scene, the downloaded images are unzipped in the same folder." + }, + { + "comment": "The code provides download links for validation and original raw data sets in Oxford RobotCar dataset, used for image and depth truth values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":46-63", + "content": "3. The images and depth truth values \u200b\u200bof the validation set are filtered from the original data set and downloaded from the link we gave. (The data download links are below)\n ```shell\n https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt\n https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.001\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.002\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.001\n https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.002\n https://videotag.bj.bcebos.com/Data/ADDS/day_val_451.7z\n https://videotag.bj.bcebos.com/Data/ADDS/day_val_451_gt.7z\n https://videotag.bj.bcebos.com/Data/ADDS/night_val_411.7z\n https://videotag.bj.bcebos.com/Data/ADDS/night_val_411_gt.7z\n ```\n the original raw data download links:\n ```shell\n # data in day\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.001\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.002" + }, + { + "comment": "Links to parts of a 7z compressed file containing training data for day and night scenes from the Oxford RobotCar dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":64-79", + "content": " https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.003\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.004\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.005\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.006\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.007\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.008\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.009\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.010\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.011\n https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.012\n # data in night\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.001\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.002\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.003\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.004" + }, + { + "comment": "This code provides a list of URLs for various file segments (005 to 015) related to the \"night_train_all.7z\" file. These files are likely part of the Oxford RobotCar dataset and are used in preprocessing steps, such as image de-distortion, which pairs sequences from specific dates. 
The official toolbox mentioned is necessary for this process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":80-97", + "content": " https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.005\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.006\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.007\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.008\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.009\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.010\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.011\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.012\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.013\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.014\n https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.015\n ```\n## Preprocessing\n### 1-Image-de-distortion\nUse the official toolbox [robotcar-dataset-sdk](https://github.com/ori-mrg/robotcar-dataset-sdk/tree/master/python) to pair the sequence 2014-12-09 and 2014-12- The image of 16 is de-distorted." + }, + { + "comment": "This code segment discusses two main components: 1) dynamic frame filtering for self-supervised training, and 2) renaming of original image timestamps to create continuous number sequences. The dataset contains daytime and nighttime training images, as well as daytime verification images with corresponding depth truth values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":100-113", + "content": "### 2-Dynamic-frame-filter\nSince we use the self-supervised method, we need to filter out dynamic frames for training. The filtering principle is that the inter-frame pose change is greater than 0.1m and it is considered a dynamic frame. After filtering, the sequence of the training set is obtained.\n### 3-Image-Rename\nRename the original image timestamp to a continuous number sequence. For daytime scene correspondence, see [1209_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt), for night scene correspondence, see [1216_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt). The renamed data format is as follows:\n```\n\u251c\u2500\u2500 oxford_processing\n \u251c\u2500\u2500 day_train_all #Day training image folder (day_train_all.7z.001 ~ day_train_all.7z.012)\n \u251c\u2500\u2500 night_train_all #Night training image folder (night_train_all.7z.001 ~ day_train_all.7z.015)\n \u251c\u2500\u2500 day_val_451 #Daytime verification image folder (day_val_451.7z)\n \u251c\u2500\u2500 day_val_451_gt #Daytime verification depth truth value folder (day_val_451_gt.7z)" + }, + { + "comment": "This code provides the location of image folders and annotation files for a robot car dataset, as well as the sequence used for training and verification. 
It also mentions the usage of CycleGAN to generate day-pseudo-night image pairs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":114-136", + "content": " \u251c\u2500\u2500 night_val_411 #night verification image folder (night_val_411.7z)\n \u2514\u2500\u2500 night_val_411_gt #Night verification depth truth value folder (night_val_411_gt.7z)\n```\nannotation files download links are below:\n```shell\nhttps://videotag.bj.bcebos.com/Data/ADDS/train_files.txt\nhttps://videotag.bj.bcebos.com/Data/ADDS/val_day_files.txt\nhttps://videotag.bj.bcebos.com/Data/ADDS/val_night_files.txt\n```\nThe sequence used for training and verification is as follows:\n```\nsplits/oxford_day/train_files.txt # training sequence during the day\nsplits/oxford_night/train_files.txt # training sequence at night\nsplits/oxford_day_451/val_files.txt # verification sequence during the day\nsplits/oxford_night_411/val_files.txt # night verification sequence\n```\n### 4-Day-Pseudo-Night-Image-Pair-Preparation\nIn order to use our framework to extract the common information of day and night images, we use [CycleGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix) to generate day-pseudo-night image pairs, where pseudo-n" + }, + { + "comment": "This code describes the file structure and data format of the Oxford-RobotCar dataset for ADDS-DepthNet training and verification. It includes daytime and nighttime images, as well as their ground truth depth values, organized into separate folders for training and validation purposes. The data has been preprocessed and scaled, with corresponding pseudo-night images generated using CycleGAN and histogram equalization applied to night images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":136-149", + "content": "ight The night images corresponding to the daytime generated for CycleGAN, all images are scaled to 192x640, the night images are enhanced with histogram equalization, 75 epochs are trained, and the Oxford-RobotCar-for-ADDS is finally obtained. The generated day-pseudo-night image pair The data format is as follows, which can be directly used for training and verification of ADDS-DepthNet:\n```\n\u251c\u2500\u2500 oxford_processing_forADDS\n \u251c\u2500\u2500 day_train_all #Day training image folder (day_train_all.7z.001 ~ day_train_all.7z.002)\n \u251c\u2500\u2500 night_train_all #Night training image folder (night_train_all.7z.001 ~ day_train_all.7z.002)\n \u251c\u2500\u2500 day_val_451 #Daytime verification image folder (day_val_451.7z)\n \u251c\u2500\u2500 day_val_451_gt #Daytime verification depth truth value folder (day_val_451_gt.7z)\n \u251c\u2500\u2500 night_val_411 #night verification image folder (night_val_411.7z)\n \u2514\u2500\u2500 night_val_411_gt #Night verification depth truth value folder (night_val_411_gt.7z)\ndata\n\u2514\u2500\u2500 oxford\n \u251c\u2500\u2500 splits\n \u251c\u2500\u2500 train_files.txt\n \u251c\u2500\u2500 val_day_files.txt" + }, + { + "comment": "The code represents a directory structure containing day and night training and verification image folders, along with their respective depth truth value folders. 
The sequences used for both training and verification are consistent.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/Oxford_RobotCar.md\":150-161", + "content": " \u2514\u2500\u2500 val_night_files.txt\n \u2514\u2500\u2500 oxford_processing_forADDS\n \u251c\u2500\u2500 day_train_all/ #Day training image folder (from day_train_all.7z.001 ~ day_train_all.7z.002)\n \u251c\u2500\u2500 night_train_all/ #Night training image folder (from night_train_all.7z.001 ~ day_train_all.7z.002)\n \u251c\u2500\u2500 day_val_451/ #Daytime verification image folder (from day_val_451.7z)\n \u251c\u2500\u2500 day_val_451_gt/ #Daytime verification depth truth value folder (from day_val_451_gt.7z)\n \u251c\u2500\u2500 night_val_411/ #night verification image folder (from night_val_411.7z)\n \u2514\u2500\u2500 night_val_411_gt/ #Night verification depth truth value folder (from night_val_411_gt.7z)\n```\nThe sequences used for training and verification are consistent with the foregoing." + } + ] +} \ No newline at end of file diff --git a/docs/doc/468ec0a1-622d-48fe-993d-08d86f88d526.json b/docs/doc/468ec0a1-622d-48fe-993d-08d86f88d526.json new file mode 100644 index 000000000..c0d9268e6 --- /dev/null +++ b/docs/doc/468ec0a1-622d-48fe-993d-08d86f88d526.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines a config parser and provides two functions for parsing, merging, setting, and printing configuration in different sections. The code handles section validity and updates values when merging.", + "details": [ + { + "comment": "This code defines a config parser and provides two functions: `parse_config` and `merge_configs`. The `parse_config` function reads a configuration file and returns an `AttrDict` object containing the parsed configurations for different sections ('train', 'valid', 'test', 'infer'). The `merge_configs` function takes an existing configuration object, a section name, and an optional dictionary of arguments to merge into the configuration. It checks if the section is valid before attempting to merge the new arguments. If a value is None, it is ignored during the merging process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/config.py\":0-51", + "content": "\"\"\"\nconfig parser\n\"\"\"\ntry:\n from configparser import ConfigParser\nexcept BaseException:\n from ConfigParser import ConfigParser\nfrom utils import AttrDict\nimport logging\nlogger = logging.getLogger(__name__)\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\ndef parse_config(cfg_file):\n \"\"\"parse_config\n \"\"\"\n parser = ConfigParser()\n cfg = AttrDict()\n parser.read(cfg_file)\n for sec in parser.sections():\n sec_dict = AttrDict()\n for k, v in parser.items(sec):\n try:\n v = eval(v)\n except BaseException:\n pass\n setattr(sec_dict, k, v)\n setattr(cfg, sec.upper(), sec_dict)\n return cfg\ndef merge_configs(cfg, sec, args_dict):\n \"\"\"merge_configs\n \"\"\"\n assert sec in CONFIG_SECS, \"invalid config section {}\".format(sec)\n sec_dict = getattr(cfg, sec.upper())\n for k, v in args_dict.items():\n if v is None:\n continue\n # try:\n # if hasattr(sec_dict, k):\n # setattr(sec_dict, k, v)" + }, + { + "comment": "This code defines two functions, `set_config` and `print_configs`. The `set_config` function takes a dictionary (cfg) as input, iterates through its keys and values, and sets the value for each key in the configuration dictionary (sec_dict). 
If the key already exists in sec_dict, it updates its value. Finally, the function returns the updated configuration dictionary. The `print_configs` function prints out the configuration in a formatted way using the logger module. It iterates through the sections and their corresponding values in the configuration dictionary and logs them to the console with proper indentation and section names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/config.py\":52-70", + "content": " # except BaseException:\n # pass\n if k in sec_dict:\n setattr(sec_dict, k, v)\n return cfg\ndef print_configs(cfg, mode):\n \"\"\"print_configs\n \"\"\"\n logger.info(\"---------------- {:>5} Arguments ----------------\".format(mode))\n for sec, sec_items in cfg.items():\n if isinstance(sec_items, dict) is True:\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():\n logger.info(\" {}:{}\".format(k, v))\n else:\n logger.info(\"{}:{}\".format(sec, sec_items))\n logger.info(\"-------------------------------------------------\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/46b06b32-b631-43dd-9c96-079811df2c97.json b/docs/doc/46b06b32-b631-43dd-9c96-079811df2c97.json new file mode 100644 index 000000000..464b4b050 --- /dev/null +++ b/docs/doc/46b06b32-b631-43dd-9c96-079811df2c97.json @@ -0,0 +1,20 @@ +{ + "summary": "This code initializes and defines various backbone models for video analysis tasks in PaddleVideo, including ResNet, Vision Transformer, AGCN, and popular models such as ResNetTSN_MRI, ResNetTSM_MRI, and SwinTransformer3D. These models form the foundation for object detection, segmentation, motion estimation, and various computer vision applications in PaddlePaddle framework.", + "details": [ + { + "comment": "This code is an initialization file for backbone models in PaddleVideo. It imports various model classes from submodules, including BertForMultiModalPreTraining, ADDS_DepthNet, AGCN, ASRF, BMN, CFBI, MoViNet, MSTCN, ResNet, ResNetSlowFast, ResNetSlowFast_MRI, and ResNetTSM, ResNetTSM_MRI. These models can be used for video analysis tasks in the PaddlePaddle framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/__init__.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .actbert import BertForMultiModalPreTraining\nfrom .adds import ADDS_DepthNet\nfrom .agcn import AGCN\nfrom .asrf import ASRF\nfrom .bmn import BMN\nfrom .cfbi import CFBI\nfrom .movinet import MoViNet\nfrom .ms_tcn import MSTCN\nfrom .resnet import ResNet\nfrom .resnet_slowfast import ResNetSlowFast\nfrom .resnet_slowfast_MRI import ResNetSlowFast_MRI\nfrom .resnet_tsm import ResNetTSM\nfrom .resnet_tsm_MRI import ResNetTSM_MRI" + }, + { + "comment": "The code imports various backbone models for video analysis from different modules within the PaddleVideo library, including ResNet, Vision Transformer, STGCN, AGCN, and more. The models are used for tasks like object detection, segmentation, and motion estimation in video processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/__init__.py\":27-54", + "content": "from .resnet_tsn_MRI import ResNetTSN_MRI\nfrom .resnet_tweaks_tsm import ResNetTweaksTSM\nfrom .resnet_tweaks_tsn import ResNetTweaksTSN\nfrom .stgcn import STGCN\nfrom .swin_transformer import SwinTransformer3D\nfrom .transnetv2 import TransNetV2\nfrom .vit import VisionTransformer\nfrom .vit_tweaks import VisionTransformer_tweaks\nfrom .ms_tcn import MSTCN\nfrom .asrf import ASRF\nfrom .resnet_tsn_MRI import ResNetTSN_MRI\nfrom .resnet_tsm_MRI import ResNetTSM_MRI\nfrom .resnet_slowfast_MRI import ResNetSlowFast_MRI\nfrom .cfbi import CFBI\nfrom .ctrgcn import CTRGCN\nfrom .agcn2s import AGCN2s\nfrom .movinet import MoViNet\nfrom .resnet3d_slowonly import ResNet3dSlowOnly\nfrom .toshift_vit import TokenShiftVisionTransformer\nfrom .pptsm_mv2 import PPTSM_MobileNetV2\nfrom .pptsm_mv3 import PPTSM_MobileNetV3\nfrom .pptsm_v2 import PPTSM_v2\nfrom .yowo import YOWO\n__all__ = [\n 'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',\n 'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',\n 'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining'," + }, + { + "comment": "This code defines a list of available backbones for video processing tasks, including popular models such as ResNetTSN_MRI, ResNetTSM_MRI, and SwinTransformer3D. 
These backbones serve as the foundation for various computer vision applications in PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/__init__.py\":55-59", + "content": " 'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',\n 'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN',\n 'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2',\n 'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/46c12cb0-b8fb-4249-8642-3ffc2ee2b978.json b/docs/doc/46c12cb0-b8fb-4249-8642-3ffc2ee2b978.json new file mode 100644 index 000000000..b3331ee87 --- /dev/null +++ b/docs/doc/46c12cb0-b8fb-4249-8642-3ffc2ee2b978.json @@ -0,0 +1,35 @@ +{ + "summary": "The code initializes an LSTM-based model for basketball action detection using PaddlePaddle's inference API, with preprocessing and GPU memory optimization functions. It loads a pre-trained model, predicts actions in videos, and saves results in JSON format without ASCII conversion.", + "details": [ + { + "comment": "This code is for an LSTM-based inferencing model in the BasketballAction application. It includes functions for preprocessing, processing results, and using PaddlePaddle's inference API. The class InferModel initializes the model based on a configuration file that contains information such as model and parameter files, GPU memory, device ID, and thread settings for different tasks like NMS and classification scoring. It also includes a label mapping file for classification purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py\":0-35", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import get_action_result\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.topk = cfg[name]['topk']\n self.frame_offset = cfg[name]['nms_offset']\n self.nms_thread = cfg[name]['nms_thread']\n self.cls_thread = cfg[name]['classify_score_thread']\n self.iou_thread = cfg[name]['iou_score_thread']\n self.label_map_file = cfg['COMMON']['label_dic']" + }, + { + "comment": "This code initializes an LSTM-based predictor model for action detection. It sets FPS, NMS ID, and configures the model to enable GPU usage and memory optimization. 
The code then creates a zero copy feed fetch operator and assigns input and output tensors for the infer method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py\":36-60", + "content": " self.fps = cfg['COMMON']['fps']\n self.nms_id = 5\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input1_tensor = self.predictor.get_input_handle(input_names[0])\n #self.input2_tensor = self.predictor.get_input_handle(input_names[1])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None):\n \"\"\"infer\"\"\"\n self.input1_tensor.copy_from_cpu(input1_arr)\n self.input1_tensor.set_lod(input1_lod)" + }, + { + "comment": "This code appears to be part of a Python class that uses LSTM models for action detection in basketball videos. It preprocesses input data, runs the predictor, and returns output1 and output2 as results. The pre_process function takes an input, creates lod (lengths of dimensions) and arranges sub-items in a specific order to prepare it for the model. The predict function uses a reader to iterate through data, performing action detection on each video frame and returning the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py\":61-89", + "content": " if not input2_arr is None:\n self.input2_tensor.copy_from_cpu(input2_arr)\n self.input2_tensor.set_lod(input2_lod)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n # print(output.shape)\n return output1, output2\n def pre_process(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n return input_arr, [input_lod]\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n results = []\n for infer_iter, data in enumerate(infer_reader()):\n video_id = [[items[-2], items[-1]] for items in data]" + }, + { + "comment": "This code is a function that performs action detection on video frames using an LSTM model. It preprocesses the input data, infers predictions from the model, selects the top-k detections for each frame, and then combines these results to generate an action detection result. The results are returned after post-processing with additional functions. 
A main function is also provided that can be used to run inference on a video with specific configuration settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py\":90-111", + "content": " input1 = [items[0] for items in data]\n input1_arr, input1_lod = self.pre_process(input1)\n output1, output2 = self.infer(input1_arr, input1_lod)\n predictions_id = output1 \n predictions_iou = output2\n for i in range(len(predictions_id)):\n topk_inds = predictions_id[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds_id = predictions_id[i][topk_inds]\n preds_iou = predictions_iou[i][0]\n results.append((video_id[i], preds_id.tolist(), topk_inds.tolist(), preds_iou.tolist()))\n predict_result = get_action_result(results, self.label_map_file, self.fps, \n self.cls_thread, self.iou_thread, \n self.nms_id, self.nms_thread, self.frame_offset)\n return predict_result\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)" + }, + { + "comment": "The code loads and initializes a pre-trained LSTM model for action detection. It then retrieves the video features and proposal information from JSON files. Finally, it uses the loaded model to predict actions based on the given material (features and proposals) and saves the results in a json file named 'results.json'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py\":112-140", + "content": " model = InferModel(cfg)\n # proposal total\n prop_dict = {}\n for dataset in ['EuroCup2016', 'WorldCup2018']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n prop_dict[basename] = item['bmn_results']\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n # proposal\n basename = imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = prop_dict[basename]\n material = {'feature': video_features, 'proposal': bmn_results}\n t0 = time.time()\n outputs = model.predict(cfg, material)\n t1 = time.time()\n results = {'actions': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:" + }, + { + "comment": "The code dumps the results in JSON format with indentation and without converting special characters to ASCII. Then, it writes this data to a file and prints the time taken in minutes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py\":141-144", + "content": " data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/46ef5c44-324f-4720-96fd-a36a2db1e04e.json b/docs/doc/46ef5c44-324f-4720-96fd-a36a2db1e04e.json new file mode 100644 index 000000000..14416ed24 --- /dev/null +++ b/docs/doc/46ef5c44-324f-4720-96fd-a36a2db1e04e.json @@ -0,0 +1,20 @@ +{ + "summary": "The \"InferModel\" class is for audio inference, initializing the model and creating a predictor object. 
It takes input, performs inference, returns output, and measures time taken. The code loads an audio file, sets path, performs prediction, prints shape, first output, and time.", + "details": [ + { + "comment": "This code defines a class named \"InferModel\" for audio inference. It initializes the model by reading configuration files, enabling GPU usage, and creating a predictor object. The input name and handle are stored for later use during inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/audio_infer.py\":0-36", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"audio infer\"\"\"\n def __init__(self, cfg, name='AUDIO'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])" + }, + { + "comment": "The code defines a model that takes audio input, performs inference using the predictor, and returns output. The predict method reads data from infer_config and for each iteration, it prepares inputs, runs inference, collects feature lists and pcm lists, then combines them into feature_values and pcm_values before returning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/audio_infer.py\":38-68", + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n pcm_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = np.array(data, dtype = 'float32')\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n pcm_list.append(inputs)\n feature_values = np.vstack(feature_list)\n pcm_values = np.vstack(pcm_list)\n return feature_values, pcm_values\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)" + }, + { + "comment": "This code loads an audio file, sets the path for it in the configuration file, performs prediction on the model, prints the shape and first output of the prediction, and calculates and prints the time taken in minutes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/audio_infer.py\":70-79", + "content": " pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm'\n t0 = time.time()\n 
cfg['AUDIO']['pcm_file'] = pcm_path\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print(outputs[0])\n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/478e000f-a0fc-403b-88e2-d2301a2afd4b.json b/docs/doc/478e000f-a0fc-403b-88e2-d2301a2afd4b.json new file mode 100644 index 000000000..2761be271 --- /dev/null +++ b/docs/doc/478e000f-a0fc-403b-88e2-d2301a2afd4b.json @@ -0,0 +1,145 @@ +{ + "summary": "The code provides data conversion, tensor handling, input compatibility, gradient norm clipping, and gain calculations for PyTorch and PaddlePaddle. It also offers tensor initialization using various methods with backward compatibility.", + "details": [ + { + "comment": "This code file contains utility functions for converting data from PyTorch to Paddle, filling tensors with values, zeroing out tensor values, and changing the tensor's dtype. It also includes a function that attempts to import PyTorch and converts corresponding data types accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":0-48", + "content": "import math\nimport warnings\nimport numpy\nimport numpy as np\nfrom numpy import inf\nfrom paddle import Tensor, concat, reshape, nn\nimport paddle\nfrom typing import Union, Iterable\n_tensor_or_tensors = Union[paddle.Tensor, Iterable[paddle.Tensor]]\nimport paddle\nimport PIL\nimport numbers\nimport numpy as np\nfrom PIL import Image\nfrom paddle.vision.transforms import BaseTransform\nfrom paddle.vision.transforms import functional as F\ndef torch2paddle(data):\n try:\n import torch\n if isinstance(data, dict):\n np_data = {}\n for k, v in data.items():\n np_data[k] = paddle.to_tensor(v.detach().numpy())\n return np_data\n else:\n return paddle.to_tensor(data.detach().numpy())\n except:\n pass\ndef fill_(tensor: Tensor, value):\n return tensor.set_value(paddle.full_like(tensor, value))\ndef zero_(tensor: Tensor):\n return tensor.set_value(paddle.zeros_like(tensor))\ndef float_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='float32')\ndef long_(tensor: Tensor):" + }, + { + "comment": "The code provides three tensor conversion functions: `int64`, `int32`, and `byte` that convert tensors into specific data types. It also includes a class `ToPILImage` for converting images to PIL Image format with the option to specify mode and keys. The function `_apply_image` checks if the input is a tensor or numpy array, raises an error if not, and then proceeds to convert 2D or 3D images into PIL Image format by adding channel dimension if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":49-82", + "content": " return paddle.to_tensor(tensor, dtype='int64')\ndef int_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='int32')\ndef byte_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='bool')\nclass ToPILImage(BaseTransform):\n def __init__(self, mode=None, keys=None):\n super(ToPILImage, self).__init__(keys)\n def _apply_image(self, pic):\n \"\"\"\n Args:\n pic (Tensor|np.ndarray): Image to be converted to PIL Image.\n Returns:\n PIL: Converted image.\n \"\"\"\n if not (isinstance(pic, paddle.Tensor) or isinstance(pic, np.ndarray)):\n raise TypeError('pic should be Tensor or ndarray. 
Got {}.'.format(\n type(pic)))\n elif isinstance(pic, paddle.Tensor):\n if pic.ndimension() not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndimension()))\n elif pic.ndimension() == 2:\n # if 2D image, add channel dimension (CHW)" + }, + { + "comment": "Code checks if the input 'pic' is a Paddle or numpy array, adjusts dimensions if necessary, and converts data types accordingly. If 'pic' is not compatible with the code, it raises an error. This code ensures that the input image is in the correct format for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":83-111", + "content": " pic = pic.unsqueeze(0)\n elif isinstance(pic, np.ndarray):\n if pic.ndim not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndim))\n elif pic.ndim == 2:\n # if 2D image, add channel dimension (HWC)\n pic = np.expand_dims(pic, 2)\n npimg = pic\n if isinstance(pic, paddle.Tensor) and \"float\" in str(\n pic.numpy().dtype) and self.mode != 'F':\n pic = pic.mul(255).byte()\n if isinstance(pic, paddle.Tensor):\n npimg = np.transpose(pic.numpy(), (1, 2, 0))\n if not isinstance(npimg, np.ndarray):\n raise TypeError(\n 'Input pic must be a paddle.Tensor or NumPy ndarray, ' +\n 'not {}'.format(type(npimg)))\n if npimg.shape[2] == 1:\n expected_mode = None\n npimg = npimg[:, :, 0]\n if npimg.dtype == np.uint8:\n expected_mode = 'L'" + }, + { + "comment": "This code checks the data type of npimg and sets the expected mode accordingly. It then compares the input's mode to the expected mode, raising a ValueError if they don't match. For 2D inputs with 2 channels, only 'LA' mode is supported; it sets self.mode to 'LA' if necessary. For 4-channel inputs, the code supports modes like 'RGBA', 'CMYK', and 'RGBX'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":112-135", + "content": " elif npimg.dtype == np.int16:\n expected_mode = 'I;16'\n elif npimg.dtype == np.int32:\n expected_mode = 'I'\n elif npimg.dtype == np.float32:\n expected_mode = 'F'\n if self.mode is not None and self.mode != expected_mode:\n raise ValueError(\n \"Incorrect self.mode ({}) supplied for input type {}. Should be {}\"\n .format(self.mode, np.dtype, expected_mode))\n self.mode = expected_mode\n elif npimg.shape[2] == 2:\n permitted_2_channel_modes = ['LA']\n if self.mode is not None and self.mode not in permitted_2_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 2D inputs\".format(\n permitted_2_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'LA'\n elif npimg.shape[2] == 4:\n permitted_4_channel_modes = ['RGBA', 'CMYK', 'RGBX']" + }, + { + "comment": "This code checks the input image mode and data type, ensuring it matches with the supported modes for 3D or 4D inputs. If the mode is not recognized, a ValueError is raised. If no mode is provided and the data type is np.uint8, it assigns the appropriate default mode (RGB or RGBA). Finally, if there is no mode specified and the input data type is unsupported, a TypeError is raised. 
The code is part of a class called Identity which seems to be an identity operator for neural networks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":136-160", + "content": " if self.mode is not None and self.mode not in permitted_4_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 4D inputs\".format(\n permitted_4_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGBA'\n else:\n permitted_3_channel_modes = ['RGB', 'YCbCr', 'HSV']\n if self.mode is not None and self.mode not in permitted_3_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 3D inputs\".format(\n permitted_3_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGB'\n if self.mode is None:\n raise TypeError('Input type {} is not supported'.format(\n npimg.dtype))\n return Image.fromarray(npimg, mode=self.mode)\nclass Identity(nn.Layer):\n r\"\"\"A placeholder identity operator that is argument-insensitive." + }, + { + "comment": "This code defines a class \"Identity\" that performs identity forwarding and a function \"convert\" to convert data between Paddle and Torch formats. It takes a dictionary of data, converts it into either Paddle or Torch format based on the specified type, and returns a new dictionary with the converted data. If the data is a numpy ndarray, it can also be casted to a specific dtype.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":162-197", + "content": " Args:\n args: any argument (unused)\n kwargs: any keyword argument (unused)\n \"\"\"\n def __init__(self, *args, **kwargs):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\ndef convert(data: dict, to, dtype=None):\n assert isinstance(data, dict)\n input = {}\n for k, v in data.items():\n if 'paddle' == to:\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = paddle.to_tensor(v.astype(dtype))\n else:\n input[k] = paddle.to_tensor(v)\n else:\n input[k] = v\n elif 'torch' == to:\n try:\n import torch\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = torch.tensor(v.astype(dtype))\n else:\n input[k] = torch.tensor(v)\n else:\n input[k] = v\n except:\n pass" + }, + { + "comment": "This code defines a function that clips the gradient norm of an iterable of parameters. It takes in an iterable of Tensors (parameters) and a maximum norm value, computes the norm over all gradients, and modifies them in-place if necessary. The norm type can be specified as well, with options including 'inf' for infinity norm. If nonfinite norms are present, an error will occur unless error_if_nonfinite is set to False.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":198-222", + "content": " else:\n if isinstance(v, np.ndarray):\n input[k] = v.astype(to)\n else:\n input[k] = v\n return input\ndef clip_grad_norm_(parameters: _tensor_or_tensors,\n max_norm: float,\n norm_type: float = 2.0,\n error_if_nonfinite: bool = False) -> paddle.Tensor:\n r\"\"\"Clips gradient norm of an iterable of parameters.\n The norm is computed over all gradients together, as if they were\n concatenated into a single vector. 
Gradients are modified in-place.\n Args:\n parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a\n single Tensor that will have gradients normalized\n max_norm (float or int): max norm of the gradients\n norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for\n infinity norm.\n error_if_nonfinite (bool): if True, an error is thrown if the total\n norm of the gradients from :attr:``parameters`` is ``nan``," + }, + { + "comment": "This function calculates the total norm of parameters viewed as a single vector. It takes parameters and optional arguments max_norm and norm_type for the maximum allowed norm value and type of norm (inf or other), respectively. If no parameters are provided, it returns 0. If max_norm is infinity, it finds the maximum absolute value among parameters. Otherwise, it calculates the norm of gradients using the provided norm_type. If error_if_nonfinite is True and the total norm is NaN or Inf, a RuntimeError is raised.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":223-249", + "content": " ``inf``, or ``-inf``. Default: False (will switch to True in the future)\n Returns:\n Total norm of the parameters (viewed as a single vector).\n \"\"\"\n import time\n if isinstance(parameters, paddle.Tensor):\n parameters = [parameters]\n parameters = [p for p in parameters if p.grad is not None]\n detached_grads = [p.grad.detach() for p in parameters]\n max_norm = float(max_norm)\n norm_type = float(norm_type)\n if len(parameters) == 0:\n return paddle.to_tensor(0.)\n if norm_type == inf:\n norms = [p.abs().max() for p in parameters]\n total_norm = norms[0] if len(norms) == 1 else paddle.max(\n paddle.stack(norms))\n else:\n total_norm = paddle.norm(\n paddle.stack([paddle.norm(g, norm_type) for g in detached_grads]),\n norm_type)\n if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),\n total_norm.isinf()):\n raise RuntimeError(\n f'The total norm of order {norm_type} for gradients from '" + }, + { + "comment": "This code snippet is a part of the PaddleVideo framework's Ma-Net application. It checks if 'parameters' are non-finite and clips them if not. It also defines a function to find the maximum value in a tensor, similar to numpy's max() function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":250-273", + "content": " '`parameters` is non-finite, so it cannot be clipped. To disable '\n 'this error and scale the gradients by the non-finite norm anyway, '\n 'set `error_if_nonfinite=False`')\n clip_coef = max_norm / (total_norm + 1e-6)\n # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so\n # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization\n # when the gradients do not reside in CPU memory.\n clip_coef_clamped = paddle.clip(clip_coef, max=1.0)\n for i, p in enumerate(parameters):\n p.grad.set_value(detached_grads[i] * clip_coef_clamped) # fixed\n return total_norm\ndef max(a: paddle.Tensor, axis=0, keepdim=True):\n \"\"\"ndarray=numpy.array([[1, 2, 3, 4],\n [4, 3, 2, 1],\n [5, 6, 7, 8],\n [8, 7, 6, 5]])\n np.where(ndarray == np.max(ndarray))\n (array([2, 3]), array([3, 0]))\n ndarray[np.where(ndarray == np.max(ndarray))]\n array([8, 8])\n \"\"\"\n max_ = a.max(axis).unsqueeze(-1)" + }, + { + "comment": "This code calculates the maximum value in a tensor and returns the corresponding index for each dimension. 
It also provides functions to gather data along different dimensions using gather or index_sample operations, depending on the shape of the input tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":274-306", + "content": " index = paddle.argmax(a, axis=axis, keepdim=keepdim)\n max_ = max_.numpy()\n index = index.numpy()\n # index = paddle.argmax(a, axis=axis, keepdim=keepdim)[-1].flatten()\n return max_, index\ndef gather(tmp: paddle.Tensor, ind: paddle.Tensor):\n shape = tmp.shape\n tmp = paddle.to_tensor(tmp)\n ind = paddle.to_tensor(ind)\n if len(shape) == 2:\n b = shape[0]\n return concat([\n reshape(paddle.gather(tmp[i, :], ind[i, :]), [1, -1])\n for i in range(b)\n ],\n axis=0)\n elif len(shape) == 3:\n out = []\n for i in range(tmp.shape[0]):\n _ = paddle.index_sample(tmp[i], ind[i])\n out.append(_)\n return paddle.to_tensor(out)\n elif len(shape) == 4:\n b, c, d = shape[:3]\n return concat([\n reshape(\n concat([\n reshape(\n concat([\n reshape(\n paddle.gather(tmp[i, j, k, :], ind[i, j, k, :])," + }, + { + "comment": "This code defines three functions (_no_grad_uniform_, _no_grad_normal_, and _no_grad_trunc_normal_) to initialize the weights of a tensor using different distributions while ensuring the computations are performed without gradient calculation. The main purpose is to prevent unnecessary memory usage and computation time for backpropagation in cases where gradients are not required.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":307-337", + "content": " [1, -1]) for k in range(d)\n ],\n axis=0), [1, d, -1]) for j in range(c)\n ],\n axis=0), [1, c, d, -1]) for i in range(b)\n ],\n axis=0)\n else:\n pass\n# These no_grad_* functions are necessary as wrappers around the parts of these\n# functions that use `with torch.no_grad()`. The JIT doesn't support context\n# managers, so these need to be implemented as builtins. Using these wrappers\n# lets us keep those builtins small and re-usable.\ndef _no_grad_uniform_(tensor, a, b):\n with paddle.no_grad():\n tensor.set_value(paddle.uniform(tensor.shape, min=a, max=b))\n return tensor\ndef _no_grad_normal_(tensor, mean, std):\n with paddle.no_grad():\n tensor.set_value(paddle.normal(shape=tensor.shape, mean=mean, std=std))\n return tensor\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n from scipy import special\n # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf" + }, + { + "comment": "This function initializes the weights of a neural network using truncated normal distribution. It first computes the standard normal cumulative distribution function and checks if the mean is more than 2 std away from [a, b]. If so, it issues a warning. Then it generates uniform values in [l, u] and transforms them to truncated standard normal distribution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":338-363", + "content": " def norm_cdf(x):\n # Computes standard normal cumulative distribution function\n return (1. + math.erf(x / math.sqrt(2.))) / 2.\n if (mean < a - 2 * std) or (mean > b + 2 * std):\n warnings.warn(\n \"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. 
\"\n \"The distribution of values may be incorrect.\",\n stacklevel=2)\n with paddle.no_grad():\n # Values are generated by using a truncated uniform distribution and\n # then using the inverse CDF for the normal distribution.\n # Get upper and lower cdf values\n l = norm_cdf((a - mean) / std)\n u = norm_cdf((b - mean) / std)\n # Uniformly fill tensor with values from [l, u], then translate to\n # [2l-1, 2u-1].\n tensor.set_value(\n paddle.uniform(tensor.shape, min=2 * l - 1, max=2 * u - 1))\n # tensor.uniform_(2 * l - 1, 2 * u - 1)\n # Use inverse cdf transform for normal distribution to get truncated\n # standard normal\n tensor.set_value(special.erfinv(tensor))" + }, + { + "comment": "The provided code contains functions for transforming, filling, and zeroing tensors. It also includes a function that calculates the recommended gain value for different nonlinearity functions. The gain values are 1 for Linear/Identity, Conv{1,2,3}D, and Sigmoid; and 5/3 for Tanh, while for ReLU it is sqrt(2).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":365-397", + "content": " # Transform to proper mean, std\n tensor.set_value(tensor.multiply(paddle.to_tensor(std * math.sqrt(2.))))\n tensor.add_(mean)\n # Clamp to ensure it's in the proper range\n tensor.clip_(min=a, max=b)\n return tensor\ndef _no_grad_fill_(tensor, val):\n with paddle.no_grad():\n tensor.set_value(paddle.full_like(tensor, fill_value=val))\n return tensor\ndef _no_grad_zero_(tensor):\n with paddle.no_grad():\n tensor.set_value(paddle.zeros_like(tensor))\n return tensor\ndef calculate_gain(nonlinearity, param=None):\n r\"\"\"Return the recommended gain value for the given nonlinearity function.\n The values are as follows:\n ================= ====================================================\n nonlinearity gain\n ================= ====================================================\n Linear / Identity :math:`1`\n Conv{1,2,3}D :math:`1`\n Sigmoid :math:`1`\n Tanh :math:`\\frac{5}{3}`\n ReLU :math:`\\sqrt{2}`" + }, + { + "comment": "This function calculates the gain value for different non-linear functions used in neural networks, such as Leaky ReLU, SELU, and others. 
It returns appropriate gain values depending on the specified nonlinearity parameter, considering any optional parameters as well.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":398-424", + "content": " Leaky Relu :math:`\\sqrt{\\frac{2}{1 + \\text{negative\\_slope}^2}}`\n SELU :math:`\\frac{3}{4}`\n ================= ====================================================\n Args:\n nonlinearity: the non-linear function (`nn.functional` name)\n param: optional parameter for the non-linear function\n Examples:\n >>> gain = nn.init.calculate_gain('leaky_relu', 0.2) # leaky_relu with negative_slope=0.2\n \"\"\"\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',\n 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n # True/False are instances of int, hence check above" + }, + { + "comment": "The code defines two functions, `uniform_` and `normal_`, used for initializing tensors with uniform or normal distribution respectively. The `_no_grad_uniform_` function is used internally by `uniform_`. The `else` and `if-elif` structures are used to handle nonlinearity cases in the `init_` function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":425-453", + "content": " negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4 # Value found empirically (https://github.com/pytorch/pytorch/pull/50664)\n else:\n raise ValueError(\"Unsupported nonlinearity {}\".format(nonlinearity))\ndef uniform_(tensor: Tensor, a: float = 0., b: float = 1.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the uniform\n distribution :math:`\\mathcal{U}(a, b)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the lower bound of the uniform distribution\n b: the upper bound of the uniform distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.uniform_(w)\n \"\"\"\n return _no_grad_uniform_(tensor, a, b)\ndef normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the normal" + }, + { + "comment": "This code initializes or fills a tensor with values drawn from a normal distribution, truncating values outside the specified range [a, b]. The function `_no_grad_normal_` initializes a tensor with values from a normal distribution with given mean and standard deviation. 
The `trunc_normal_` function initializes a tensor with values from a truncated normal distribution within the specified bounds [a, b].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":454-482", + "content": " distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution\n std: the standard deviation of the normal distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.normal_(w)\n \"\"\"\n return _no_grad_normal_(tensor, mean, std)\ndef trunc_normal_(tensor: Tensor,\n mean: float = 0.,\n std: float = 1.,\n a: float = -2.,\n b: float = 2.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from a truncated\n normal distribution. The values are effectively drawn from the\n normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`\n with values outside :math:`[a, b]` redrawn until they are within\n the bounds. The method used for generating the random values works\n best when :math:`a \\leq \\text{mean} \\leq b`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution" + }, + { + "comment": "This code defines several functions for initializing Tensor objects in PyTorch. These functions include `trunc_normal_`, `constant_`, `ones_`, and `zeros_`. The `trunc_normal_` function initializes a tensor with values drawn from a truncated normal distribution, while the other three functions fill the tensor with constant values (specified by the user), ones, or zeros respectively. These functions can be used to set the initial values of a tensor before training a neural network model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":483-525", + "content": " std: the standard deviation of the normal distribution\n a: the minimum cutoff value\n b: the maximum cutoff value\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.trunc_normal_(w)\n \"\"\"\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef constant_(tensor: Tensor, val: float) -> Tensor:\n r\"\"\"Fills the input Tensor with the value :math:`\\text{val}`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n val: the value to fill the tensor with\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.constant_(w, 0.3)\n \"\"\"\n return _no_grad_fill_(tensor, val)\ndef ones_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `1`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.ones_(w)\n \"\"\"\n return _no_grad_fill_(tensor, 1.)\ndef zeros_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `0`.\n Args:\n tensor: an n-dimensional `torch.Tensor`" + }, + { + "comment": "The code above contains three functions: _no_grad_zero_, eye_, and dirac_. The _no_grad_zero_ function returns a tensor with all elements set to zero while preserving the identity of inputs in Linear layers. The eye_ function fills a 2-dimensional input tensor with an identity matrix, preserving as many inputs as possible in Linear layers. 
Lastly, the dirac_ function fills a 3, 4, or 5-dimensional input tensor with Dirac delta functions while preserving the identity of inputs in Convolutional layers, considering groups if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":527-561", + "content": " Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.zeros_(w)\n \"\"\"\n return _no_grad_zero_(tensor)\ndef eye_(tensor):\n r\"\"\"Fills the 2-dimensional input `Tensor` with the identity\n matrix. Preserves the identity of the inputs in `Linear` layers, where as\n many inputs are preserved as possible.\n Args:\n tensor: a 2-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.eye_(w)\n \"\"\"\n if tensor.ndimension() != 2:\n raise ValueError(\"Only tensors with 2 dimensions are supported\")\n with paddle.no_grad():\n tensor.set_value(paddle.eye(*tensor.shape))\n return tensor\ndef dirac_(tensor, groups=1):\n r\"\"\"Fills the {3, 4, 5}-dimensional input `Tensor` with the Dirac\n delta function. Preserves the identity of the inputs in `Convolutional`\n layers, where as many input channels are preserved as possible. In case\n of groups>1, each group of channels preserves identity\n Args:\n tensor: a {3, 4, 5}-dimensional `torch.Tensor`" + }, + { + "comment": "The code is a function that initializes the convolutional layer weights using Dirac delta distribution for 3, 4, or 5-dimensional tensors. It first checks if the tensor dimensions are supported and then raises an error if not. Then it calculates the number of output channels per group and minimum dimension. The code then zeroes out the tensor and initializes the weights with Dirac delta distribution for specified groups and dimensions, performing a temporal convolution in 3-dimensions or spatial convolution in 4-dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":562-591", + "content": " groups (optional): number of groups in the conv layer (default: 1)\n Examples:\n >>> w = torch.empty(3, 16, 5, 5)\n >>> nn.init.dirac_(w)\n >>> w = torch.empty(3, 24, 5, 5)\n >>> nn.init.dirac_(w, 3)\n \"\"\"\n dimensions = tensor.ndimension()\n if dimensions not in [3, 4, 5]:\n raise ValueError(\n \"Only tensors with 3, 4, or 5 dimensions are supported\")\n sizes = tensor.shape\n if sizes[0] % groups != 0:\n raise ValueError('dim 0 must be divisible by groups')\n out_chans_per_grp = sizes[0] // groups\n min_dim = min(out_chans_per_grp, sizes[1])\n with paddle.no_grad():\n tensor.zero_()\n for g in range(groups):\n for d in range(min_dim):\n if dimensions == 3: # Temporal convolution\n tensor[g * out_chans_per_grp + d, d,\n tensor.shape[2] // 2] = 1\n elif dimensions == 4: # Spatial convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2," + }, + { + "comment": "This code defines several utility functions related to tensors in PaddlePaddle. The `_calculate_fan_in_and_fan_out` function calculates the fan-in and fan-out of a tensor, while the `LongTensor` function converts an input to a long tensor (dtype: int64). The `IntTensor` function does the same but with an int32 dtype. 
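The fan computation mentioned here is easy to verify by hand; a minimal sketch for a hypothetical conv weight of shape `[64, 3, 3, 3]`:

```python
# Hand computation of fan_in / fan_out for a hypothetical conv weight
# of shape [out_channels, in_channels, kH, kW] = [64, 3, 3, 3]
shape = [64, 3, 3, 3]
receptive_field_size = 1
for s in shape[2:]:
    receptive_field_size *= s               # 3 * 3 = 9
fan_in = shape[1] * receptive_field_size    # 3 * 9 = 27
fan_out = shape[0] * receptive_field_size   # 64 * 9 = 576
print(fan_in, fan_out)                      # 27 576
```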
Lastly, `xavier_uniform_` initializes a tensor's parameters using Xavier Uniform initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":592-626", + "content": " tensor.shape[3] // 2] = 1\n else: # Volumetric convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2,\n tensor.shape[3] // 2, tensor.shape[4] // 2] = 1\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(\n \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"\n )\n num_input_fmaps = tensor.shape[1] # .size(1)\n num_output_fmaps = tensor.shape[0] # .size(0)\n receptive_field_size = 1\n if tensor.dim() > 2:\n for s in tensor.shape[2:]:\n receptive_field_size *= s # fixed\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef LongTensor(x):\n return paddle.to_tensor(x, dtype='int64')\ndef IntTensor(x):\n return paddle.to_tensor(x, dtype='int32')\ndef xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor:" + }, + { + "comment": "This code initializes the input tensor with values following the Glorot initialization method. It uses a uniform distribution and calculates the scaling factor 'a' based on gain, fan_in, and fan_out dimensions of the tensor. The resulting tensor is sampled from a uniform distribution between -a and a. The function xavier_normal_ is a variation that also fills the input tensor but with values from a normal distribution instead of uniform.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":627-654", + "content": " r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Understanding the difficulty of training deep feedforward\n neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform\n distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{U}(-a, a)` where\n .. math::\n a = \\text{gain} \\times \\sqrt{\\frac{6}{\\text{fan\\_in} + \\text{fan\\_out}}}\n Also known as Glorot initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n gain: an optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))\n \"\"\"\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation\n return _no_grad_uniform_(tensor, -a, a)\ndef xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor:\n r\"\"\"Fills the input `Tensor` with values according to the method" + }, + { + "comment": "This function initializes a tensor with Xavier/Glorot normal distribution, using a normal distribution and a scaling factor 'gain'. The resulting tensor values are sampled from the normal distribution N(0, std^2), where std = gain * sqrt(2 / (fan_in + fan_out)). It also includes a function _calculate_correct_fan that checks for valid modes 'fan_in' or 'fan_out'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":655-686", + "content": " described in `Understanding the difficulty of training deep feedforward\n neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal\n distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{N}(0, \\text{std}^2)` where\n .. 
math::\n \\text{std} = \\text{gain} \\times \\sqrt{\\frac{2}{\\text{fan\\_in} + \\text{fan\\_out}}}\n Also known as Glorot initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n gain: an optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.xavier_normal_(w)\n \"\"\"\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n return _no_grad_normal_(tensor, 0., std)\ndef _calculate_correct_fan(tensor, mode):\n mode = mode.lower()\n valid_modes = ['fan_in', 'fan_out']\n if mode not in valid_modes:\n raise ValueError(\"Mode {} not supported, please use one of {}\".format(\n mode, valid_modes))\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)" + }, + { + "comment": "This function fills a tensor with values from a uniform distribution according to the method described in He et al.'s (2015) paper, using either fan_in or fan_out mode. It also takes an optional argument for the negative slope of the rectifier used after this layer when 'leaky_relu' is specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":687-708", + "content": " return fan_in if mode == 'fan_in' else fan_out\ndef kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Delving deep into rectifiers: Surpassing human-level\n performance on ImageNet classification` - He, K. et al. (2015), using a\n uniform distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{U}(-\\text{bound}, \\text{bound})` where\n .. math::\n \\text{bound} = \\text{gain} \\times \\sqrt{\\frac{3}{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the\n forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the" + }, + { + "comment": "This code is a PyTorch implementation of the Kaiming Uniform initialization method, used for initializing weights in neural networks. The function takes in a tensor and sets its values according to a uniform distribution with bounds calculated based on the tensor shape and nonlinearity (default is 'leaky_relu'). It also calculates the fan and gain based on the mode and nonlinearity to determine the standard deviation for the uniform distribution. 
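Plugging in hypothetical numbers makes the bound concrete (a `[64, 3, 3, 3]` weight in `'fan_in'` mode with `'relu'`):

```python
import math

# Bound used by kaiming_uniform_, with illustrative inputs
fan = 27                        # fan_in of a [64, 3, 3, 3] conv weight
gain = math.sqrt(2.0)           # calculate_gain('relu')
std = gain / math.sqrt(fan)
bound = math.sqrt(3.0) * std    # values are drawn from U(-bound, bound)
print(round(bound, 4))          # 0.4714
```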
The function then uses Paddle's `uniform` method to set the values in the tensor with the calculated bounds.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":709-731", + "content": " backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n bound = math.sqrt(\n 3.0) * std # Calculate uniform bounds from standard deviation\n with paddle.no_grad():\n tensor.set_value(paddle.uniform(tensor.shape, min=-bound, max=bound))\n return tensor\ndef kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Delving deep into rectifiers: Surpassing human-level\n performance on ImageNet classification` - He, K. et al. (2015), using a\n normal distribution. The resulting tensor will have values sampled from" + }, + { + "comment": "Function initializes weights using Kaiming normal distribution. It takes in an n-dimensional tensor, a slope value for rectifier (optional), mode as 'fan_in' or 'fan_out', and nonlinearity function. It preserves weight variance in forward pass with 'fan_in' and backward pass with 'fan_out'. Recommended to use with 'relu' or 'leaky_relu'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":732-757", + "content": " :math:`\\mathcal{N}(0, \\text{std}^2)` where\n .. math::\n \\text{std} = \\frac{\\text{gain}}{\\sqrt{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the\n forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the\n backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> kaiming_normal_(w, mode='fan_out', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():" + }, + { + "comment": "The code initializes a tensor with a (semi) orthogonal matrix based on the input tensor. The tensor must have at least 2 dimensions, and the trailing dimensions are flattened. If rows are less than columns, transpose the tensor. It computes the QR factorization of the tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":758-788", + "content": " tensor.set_value(paddle.normal(shape=tensor.shape, mean=0, std=std))\n return tensor\ndef orthogonal_(tensor, gain=1):\n r\"\"\"Fills the input `Tensor` with a (semi) orthogonal matrix, as\n described in `Exact solutions to the nonlinear dynamics of learning in deep\n linear neural networks` - Saxe, A. et al. (2013). 
The input tensor must have\n at least 2 dimensions, and for tensors with more than 2 dimensions the\n trailing dimensions are flattened.\n Args:\n tensor: an n-dimensional `torch.Tensor`, where :math:`n \\geq 2`\n gain: optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.orthogonal_(w)\n \"\"\"\n if tensor.ndimension() < 2:\n raise ValueError(\"Only tensors with 2 or more dimensions are supported\")\n rows = tensor.shape[0] # .size(0)\n cols = tensor.numel() // rows\n flattened = tensor.new(rows, cols).normal_(0, 1)\n if rows < cols:\n flattened.t_()\n # Compute the qr factorization\n q, r = paddle.to_tensor(np.linalg.qr(flattened.numpy()))" + }, + { + "comment": "Line 789: q, r = torch.qr(flattened) - Performs QR decomposition on flattened tensor\nLine 790-792: Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf - Applies diag function on r, multiplies q by ph, and transposes q if rows < cols\nLine 794-798: with paddle.no_grad(): tensor.view_as(q).copy_(q) - Uses no_grad context manager to prevent gradients from being recorded during the operation\nLine 799: tensor.mul_(gain) - Multiplies tensor by a gain factor\nLine 802: return tensor - Returns the modified tensor after applying QR decomposition and scaling", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":789-821", + "content": " # q, r = torch.qr(flattened)\n # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf\n d = paddle.diag(r, 0)\n ph = d.sign()\n q *= ph\n if rows < cols:\n q.t_()\n with paddle.no_grad():\n tensor.view_as(q).copy_(q)\n tensor.mul_(gain)\n return tensor\ndef sparse_(tensor, sparsity, std=0.01):\n r\"\"\"Fills the 2D input `Tensor` as a sparse matrix, where the\n non-zero elements will be drawn from the normal distribution\n :math:`\\mathcal{N}(0, 0.01)`, as described in `Deep learning via\n Hessian-free optimization` - Martens, J. (2010).\n Args:\n tensor: an n-dimensional `torch.Tensor`\n sparsity: The fraction of elements in each column to be set to zero\n std: the standard deviation of the normal distribution used to generate\n the non-zero values\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.sparse_(w, sparsity=0.1)\n \"\"\"\n if tensor.ndimension() != 2:\n raise ValueError(\"Only tensors with 2 dimensions are supported\")" + }, + { + "comment": "This code defines a function that initializes the values of a tensor using the Kaiming normal distribution. It also includes a deprecated method for backward compatibility, warning users to use the new method instead.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/api.py\":823-856", + "content": " rows, cols = tensor.shape\n num_zeros = int(math.ceil(sparsity * rows))\n with paddle.no_grad():\n tensor.normal_(0, std)\n for col_idx in range(cols):\n row_indices = paddle.randperm(rows)\n zero_indices = row_indices[:num_zeros]\n tensor[zero_indices, col_idx] = 0\n return tensor\n# for backward compatibility\ndef _make_deprecate(meth):\n new_name = meth.__name__\n old_name = new_name[:-1]\n def deprecated_init(*args, **kwargs):\n warnings.warn(\n \"nn.init.{} is now deprecated in favor of nn.init.{}.\".format(\n old_name, new_name),\n stacklevel=2)\n return meth(*args, **kwargs)\n deprecated_init.__doc__ = r\"\"\"\n {old_name}(...)\n .. 
warning::\n This method is now deprecated in favor of :func:`torch.nn.init.{new_name}`.\n See :func:`~torch.nn.init.{new_name}` for details.\"\"\".format(\n old_name=old_name, new_name=new_name)\n deprecated_init.__name__ = old_name\n return deprecated_init" + } + ] +} \ No newline at end of file diff --git a/docs/doc/47abd731-e6ef-4b69-9c7a-4f6bb0d86c93.json b/docs/doc/47abd731-e6ef-4b69-9c7a-4f6bb0d86c93.json new file mode 100644 index 000000000..bd112fc11 --- /dev/null +++ b/docs/doc/47abd731-e6ef-4b69-9c7a-4f6bb0d86c93.json @@ -0,0 +1,20 @@ +{ + "summary": "The code creates a class to calculate top-1 and possibly top-5 accuracy metrics in image classification tasks, tracking and averaging them during iteration, with support for multi-GPU scenarios using all-reduce operations.", + "details": [ + { + "comment": "This code defines the class CenterCropMetric_MRI, a metric for a video processing framework. It initializes variables and tracks top1 accuracy during iteration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric_MRI.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass CenterCropMetric_MRI(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1, if_slowfast=0):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.top1 = []\n self.if_slowfast = if_slowfast\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter" + }, + { + "comment": "This code snippet defines a class for calculating top-1 and possibly top-5 accuracy metrics in an image classification task. It collects the metrics for each batch during testing, then averages them at the end of all iterations. 
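A toy illustration of the per-batch top-1 update this metric performs (the scores and labels below are made up):

```python
import paddle

# Two samples, three classes; only the first prediction matches its label
outputs = paddle.to_tensor([[0.1, 0.7, 0.2],
                            [0.8, 0.1, 0.1]])   # [batch, num_classes]
labels = paddle.to_tensor([[1], [2]])           # [batch, 1]
top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
print(float(top1))  # 0.5
```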
The code handles multi-GPU scenarios by performing all-reduce operations on the metric values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric_MRI.py\":33-59", + "content": " \"\"\"\n labels = data[1]\n if self.if_slowfast:\n labels = data[2]\n top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)\n #top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)\n #NOTE(shipping): deal with multi cards validate\n if self.world_size > 1:\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n # top5 = paddle.distributed.all_reduce(\n # top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.top1.append(top1.numpy())\n #self.top5.append(top5.numpy())\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n logger.info('[TEST] finished, avg_acc1= {}'.format(" + }, + { + "comment": "Calculates mean of top-1 accuracy across all samples in the batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric_MRI.py\":60-60", + "content": " np.mean(np.array(self.top1))))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/487818af-e889-45da-85cd-ee8335367a08.json b/docs/doc/487818af-e889-45da-85cd-ee8335367a08.json new file mode 100644 index 000000000..b3495b1c4 --- /dev/null +++ b/docs/doc/487818af-e889-45da-85cd-ee8335367a08.json @@ -0,0 +1,85 @@ +{ + "summary": "The code introduces an AttentionLstmErnie class for ERNIE-based scenario classification, implementing an LSTM-based attention model for video tagging using text and audio data. It employs dropout, batch normalization, and Neural Machine Translation approach.", + "details": [ + { + "comment": "This code defines the AttentionLstmErnie class, which extends the functionality of scenario-classify by incorporating text information. It uses ERNIE to extract text features and operates in either 'train' or 'infer' mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":0-33", + "content": "#!/usr/bin/env python\n# coding=utf-8\n\"\"\"\nattention lstm add ernie model\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport paddle\nimport paddle.static as static\nfrom .ernie import ErnieConfig, ErnieModel\nclass AttentionLstmErnie(object):\n \"\"\"\n Base on scenario-classify (image + audio), add text information\n use ERNIE to extract text feature\n \"\"\"\n def __init__(self, name, cfg, mode='train'):\n self.cfg = cfg\n self.name = name" + }, + { + "comment": "This code initializes a model by setting attributes from a configuration file and calling the `get_config` function. 
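The section-based lookup that backs this configuration step can be sketched standalone (the config values below are hypothetical, not taken from the real YAML):

```python
# Minimal stand-in for the cfg object and the get_config_from_sec lookup
cfg = {"TRAIN": {"learning_rate": 1e-3}, "MODEL": {"num_classes": 45}}

def get_config_from_sec(sec, item, default=None):
    if sec.upper() not in cfg:
        return default
    return cfg[sec.upper()].get(item, default)

print(get_config_from_sec("train", "learning_rate", 1e-4))  # 0.001
print(get_config_from_sec("valid", "batch_size", 16))       # 16 (falls back to the default)
```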
The `get_config` function retrieves the model's configurations, including feature numbers, dimensions, data types, and more. It also gets mode-specific settings like batch size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":34-58", + "content": " self.mode = mode\n self.py_reader = None\n self.get_config()\n def get_config(self):\n \"\"\"get_config\n \"\"\"\n # get model configs\n self.feature_num = self.cfg.MODEL.feature_num\n self.feature_names = self.cfg.MODEL.feature_names\n self.feature_dims = self.cfg.MODEL.feature_dims\n self.feature_dtypes = self.cfg.MODEL.feature_dtypes\n self.feature_lod_level = self.cfg.MODEL.feature_lod_level\n self.num_classes = self.cfg.MODEL.num_classes\n self.embedding_size = self.cfg.MODEL.embedding_size\n self.lstm_size_img = self.cfg.MODEL.lstm_size_img\n self.lstm_size_audio = self.cfg.MODEL.lstm_size_audio\n self.ernie_freeze = self.cfg.MODEL.ernie_freeze\n self.lstm_pool_mode = self.cfg.MODEL.lstm_pool_mode\n self.drop_rate = self.cfg.MODEL.drop_rate\n self.loss_type = self.cfg.TRAIN.loss_type\n self.ernie_pretrain_dict_path = self.cfg.TRAIN.ernie_pretrain_dict_path\n # get mode configs\n self.batch_size = self.get_config_from_sec(self.mode, 'batch_size', 1)" + }, + { + "comment": "This code is a part of the AttentionLSTMERNIE model. It initializes the number of GPUs, learning rate, weight decay, and other parameters for training mode. The function get_config_from_sec retrieves values from a configuration file using section and item names. The build_input function constructs input data by iterating over feature names, dimensions, data types, and lod levels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":59-84", + "content": " self.num_gpus = self.get_config_from_sec(self.mode, 'num_gpus', 1)\n if self.mode == 'train':\n self.learning_rate = self.get_config_from_sec(\n 'train', 'learning_rate', 1e-3)\n self.weight_decay = self.get_config_from_sec(\n 'train', 'weight_decay', 8e-4)\n self.num_samples = self.get_config_from_sec(\n 'train', 'num_samples', 5000000)\n self.decay_epochs = self.get_config_from_sec(\n 'train', 'decay_epochs', [5])\n self.decay_gamma = self.get_config_from_sec(\n 'train', 'decay_gamma', 0.1)\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"get_config_from_sec\"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\n def build_input(self, use_pyreader):\n \"\"\"\n build input\n \"\"\"\n self.feature_input = []\n for name, dim, dtype, lod_level in zip(self.feature_names," + }, + { + "comment": "Code initializes the reader for data feeding into the model, sets the label input, and defines a function \"ernie_encoder\" that extracts text features using the Ernie model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":85-107", + "content": " self.feature_dims,\n self.feature_dtypes,\n self.feature_lod_level):\n self.feature_input.append(\n static.data(shape=dim,\n lod_level=lod_level,\n dtype=dtype,\n name=name))\n self.label_input = static.data(shape=[self.num_classes],\n dtype='float32',\n name='label')\n self.py_reader = paddle.fluid.io.PyReader(feed_list=self.feature_input +\n [self.label_input],\n capacity=1024,\n iterable=True)\n def ernie_encoder(self):\n \"\"\"\n text feature 
extractor\n \"\"\"\n ernie_config = ErnieConfig(\n os.path.join(self.ernie_pretrain_dict_path, 'ernie_config.json'))" + }, + { + "comment": "This code initializes an ErnieModel with features extracted from input data. If self.ernie_freeze is True, it freezes the ERNIE model's parameters to prevent further training. It then retrieves the sequence output and applies a dropout if in train mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":108-130", + "content": " if self.mode != 'train':\n ernie_config['attention_probs_dropout_prob'] = 0.0\n ernie_config['hidden_dropout_prob'] = 0.0\n src_ids = self.feature_input[2][:, 0]\n sent_ids = self.feature_input[2][:, 1]\n position_ids = self.feature_input[2][:, 2]\n task_ids = self.feature_input[2][:, 3]\n input_mask = self.feature_input[2][:, 4].astype('float32')\n ernie = ErnieModel(src_ids=src_ids,\n position_ids=position_ids,\n sentence_ids=sent_ids,\n task_ids=task_ids,\n input_mask=input_mask,\n config=ernie_config)\n enc_out = ernie.get_sequence_output()\n # to Freeze ERNIE param\n if self.ernie_freeze is True:\n enc_out.stop_gradient = True\n # ernie cnn\n enc_out_cnn = ernie.get_sequence_textcnn_output(enc_out, input_mask)\n enc_out_cnn_drop = paddle.nn.functional.dropout(enc_out_cnn, p=self.drop_rate, training=(self.mode=='train'))" + }, + { + "comment": "This code defines a function called \"build_model\" that creates and returns the model for video tagging. The model takes image, audio, and text features as input to generate attention-based LSTM features from the image. It applies fully connected layers (fc) on the image features and then passes them through dynamic LSTMs to obtain the forward and backward LSTM outputs. These outputs are used in further processing for video tagging.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":131-153", + "content": " return enc_out_cnn_drop\n def build_model(self):\n \"\"\"build_model\n \"\"\"\n # ---------------- transfer from old paddle ---------------\n # get image,audio,text feature\n video_input_tensor = self.feature_input[0]\n audio_input_tensor = self.feature_input[1]\n self.ernie_feature = self.ernie_encoder()\n # ------image------\n lstm_forward_fc = static.nn.fc(x=video_input_tensor,\n size=self.lstm_size_img * 4,\n activation=None,\n bias_attr=False)\n lstm_forward, _ = paddle.fluid.layers.dynamic_lstm(input=lstm_forward_fc,\n size=self.lstm_size_img *\n 4,\n is_reverse=False,\n use_peepholes=True)\n lsmt_backward_fc = static.nn.fc(x=video_input_tensor," + }, + { + "comment": "This code defines a dynamic LSTM layer for image features, concatenates it with the backward pass, and applies dropout if in training mode. If 'text_guide' pooling mode is selected, it computes attention weights between text features and LSTM output using seq2seq attention. 
Otherwise, it uses an FC layer to reduce the dimensions of the LSTM output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":154-171", + "content": " size=self.lstm_size_img * 4,\n activation=None,\n bias_attr=None)\n lstm_backward, _ = paddle.fluid.layers.dynamic_lstm(input=lsmt_backward_fc,\n size=self.lstm_size_img *\n 4,\n is_reverse=True,\n use_peepholes=True)\n lstm_forward_img = paddle.concat(\n x=[lstm_forward, lstm_backward], axis=1)\n lstm_dropout = paddle.nn.functional.dropout(lstm_forward_img, p=self.drop_rate, training=(self.mode=='train'))\n if self.lstm_pool_mode == 'text_guide':\n lstm_weight = self.attention_weight_by_feature_seq2seq_attention(\n self.ernie_feature, lstm_dropout, self.lstm_size_img * 2)\n else:\n lstm_weight = static.nn.fc(x=lstm_dropout," + }, + { + "comment": "This code snippet is defining a LSTM model for processing both visual and audio inputs. It initializes a LSTM layer with dropout, applies element-wise multiplication with weights, performs sequence pooling on the output, and defines FC layers followed by LSTM for processing audio input. Regularization is applied to the audio LSTM layer using an L2 decay regularizer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":172-193", + "content": " size=1,\n activation='sequence_softmax',\n bias_attr=None)\n scaled = paddle.multiply(x=lstm_dropout,\n y=lstm_weight)\n self.lstm_pool = paddle.static.nn.sequence_pool(input=scaled,\n pool_type='sum')\n # ------audio------\n lstm_forward_fc_audio = static.nn.fc(\n x=audio_input_tensor,\n size=self.lstm_size_audio * 4,\n activation=None,\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)))\n lstm_forward_audio, _ = paddle.fluid.layers.dynamic_lstm(\n input=lstm_forward_fc_audio,\n size=self.lstm_size_audio * 4,\n is_reverse=False,\n use_peepholes=True)\n lsmt_backward_fc_audio = static.nn.fc(x=audio_input_tensor," + }, + { + "comment": "This code is creating a dynamic LSTM for audio input, reversing it, concatenating the forward and backward outputs, applying dropout if in training mode, and then performing attention weight calculation based on the pooling mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":194-213", + "content": " size=self.lstm_size_audio * 4,\n activation=None,\n bias_attr=False)\n lstm_backward_audio, _ = paddle.fluid.layers.dynamic_lstm(\n input=lsmt_backward_fc_audio,\n size=self.lstm_size_audio * 4,\n is_reverse=True,\n use_peepholes=True)\n lstm_forward_audio = paddle.concat(\n x=[lstm_forward_audio, lstm_backward_audio], axis=1)\n lstm_dropout_audio = paddle.nn.functional.dropout(lstm_forward_audio, p=self.drop_rate, training=(self.mode=='train'))\n if self.lstm_pool_mode == 'text_guide':\n lstm_weight_audio = self.attention_weight_by_feature_seq2seq_attention(\n self.ernie_feature, lstm_dropout_audio,\n self.lstm_size_audio * 2)\n else:\n lstm_weight_audio = static.nn.fc(x=lstm_dropout_audio,\n size=1," + }, + { + "comment": "This code implements a LSTM-based attention model that combines audio and text data for video tagging. It consists of three main parts: LSTM layers, attention mechanism, and fully connected (FC) layer. 
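The attention pooling used in the second part can be pictured with a small dynamic-graph sketch (shapes are illustrative; the real model builds this with the static `fluid`/`static.nn` API shown above):

```python
import paddle

# Per-step weights over a sequence, then a weighted sum over time
seq = paddle.rand([1, 6, 256])                         # [batch, T, features]
energy = paddle.rand([1, 6, 1])                        # stand-in for the score FC
weight = paddle.nn.functional.softmax(energy, axis=1)  # sequence_softmax analogue
pooled = (seq * weight).sum(axis=1)                    # [batch, features]
print(pooled.shape)                                    # [1, 256]
```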
The LSTM layers process the text and audio inputs separately and then concatenate them with the ERNIE feature. The attention mechanism is applied to calculate the weight for each LSTM output sequence. The FC layer has a softmax activation function if loss type is set to 'softmax', otherwise, it uses no activation when loss type is 'sigmoid'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":214-234", + "content": " activation='sequence_softmax',\n bias_attr=None)\n scaled_audio = paddle.multiply(x=lstm_dropout_audio,\n y=lstm_weight_audio)\n self.lstm_pool_audio = paddle.static.nn.sequence_pool(input=scaled_audio,\n pool_type='sum')\n lstm_concat = paddle.concat(\n x=[self.lstm_pool, self.lstm_pool_audio, self.ernie_feature],\n axis=1,\n name='final_concat')\n # lstm_concat = self.add_bn(lstm_concat)\n if self.loss_type == 'softmax':\n self.fc = static.nn.fc(x=lstm_concat,\n size=self.num_classes,\n activation='softmax')\n elif self.loss_type == 'sigmoid':\n self.fc = static.nn.fc(x=lstm_concat,\n size=self.num_classes,\n activation=None)" + }, + { + "comment": "This code calculates attention weights for a feature sequence using a Neural Machine Translation approach. It expands the text feature across the sequence, concatenates it with the original sequence feature, and passes it through an FC layer with 'tanh' activation to calculate energy values. The calculated energy is then used to determine attention weights by feature.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":235-259", + "content": " self.logit = self.fc\n self.fc = paddle.nn.functional.sigmoid(self.fc)\n self.network_outputs = [self.fc]\n def attention_weight_by_feature_seq2seq_attention(\n self,\n text_feature,\n sequence_feature,\n sequence_feature_dim,\n name_prefix=\"seq2seq_attention\"):\n \"\"\"\n caculate weight by feature\n Neural Machine Translation by Jointly Learning to Align and Translate\n \"\"\"\n text_feature_expand = paddle.static.nn.sequence_expand(text_feature,\n sequence_feature,\n ref_level=0)\n sequence_text_concat = paddle.concat(\n x=[sequence_feature, text_feature_expand],\n axis=-1,\n name='video_text_concat')\n energy = static.nn.fc(x=sequence_text_concat,\n size=sequence_feature_dim,\n activation='tanh'," + }, + { + "comment": "This function adds dropout and batch normalization to the LSTM concatenation. It projects the input to 8192 dimensions using an FC layer, applies batch normalization, and then applies a relu activation if in training mode. If not in training mode (is_test), it skips the batch normalization step. 
Finally, it applies dropout to the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":260-284", + "content": " name=name_prefix + \"_tanh_fc\")\n weight_vector = static.nn.fc(x=energy,\n size=1,\n activation='sequence_softmax',\n bias_attr=None,\n name=name_prefix + \"_softmax_fc\")\n return weight_vector\n def add_bn(self, lstm_concat):\n \"\"\"\n v2.5 add drop out and batch norm\n \"\"\"\n input_fc_proj = static.nn.fc(\n x=lstm_concat,\n size=8192,\n activation=None,\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)))\n input_fc_proj_bn = paddle.static.nn.batch_norm(\n input=input_fc_proj,\n act=\"relu\",\n is_test=(not self.mode == 'train'))\n input_fc_proj_dropout = paddle.nn.functional.dropout(\n input_fc_proj_bn," + }, + { + "comment": "This code defines an attention LSTM model using Ernie. It applies dropout, fully connected layers with batch normalization and dropout again. The optimizer function sets a learning rate that decays over specified epochs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":285-311", + "content": " p=self.drop_rate,\n training=(self.mode=='train'))\n input_fc_hidden = static.nn.fc(\n x=input_fc_proj_dropout,\n size=4096,\n activation=None,\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)))\n input_fc_hidden_bn = paddle.static.nn.batch_norm(\n input=input_fc_hidden,\n act=\"relu\",\n is_test=(not self.mode == 'train'))\n input_fc_hidden_dropout = paddle.nn.functional.dropout(\n input_fc_hidden_bn,\n p=self.drop_rate,\n training=(self.mode=='train'))\n return input_fc_hidden_dropout\n def optimizer(self):\n \"\"\"\n optimizer\n \"\"\"\n assert self.mode == 'train', \"optimizer only can be get in train mode\"\n values = [\n self.learning_rate * (self.decay_gamma ** i)\n for i in range(len(self.decay_epochs) + 1)" + }, + { + "comment": "This code defines a class that contains two methods: one for initializing an optimizer with piecewise decay learning rate and another for calculating the softlabel cross-entropy loss. The optimizer uses RMSProp algorithm and decays the learning rate based on defined epochs. 
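The decay schedule reduces to a pair of lists; a standalone sketch with hypothetical training settings (base lr 1e-3, gamma 0.1, decay at epoch 5, 5000 samples, batch size 50):

```python
# Piecewise-decay schedule as computed in optimizer(), with made-up numbers
learning_rate, decay_gamma = 1e-3, 0.1
decay_epochs = [5]
num_samples, batch_size = 5000, 50

values = [learning_rate * (decay_gamma ** i) for i in range(len(decay_epochs) + 1)]
iter_per_epoch = num_samples / batch_size
boundaries = [e * iter_per_epoch for e in decay_epochs]
print(values)      # [0.001, 0.0001]
print(boundaries)  # [500.0]
```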
The loss is calculated using the soft label version of cross entropy, suitable for certain types of neural networks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":312-333", + "content": " ]\n iter_per_epoch = self.num_samples / self.batch_size\n boundaries = [e * iter_per_epoch for e in self.decay_epochs]\n return paddle.optimizer.RMSProp(\n learning_rate=paddle.optimizer.lr.PiecewiseDecay(values=values,\n boundaries=boundaries),\n centered=True,\n weight_decay=paddle.regularizer.L2Decay(\n coeff=self.weight_decay))\n def softlabel_cross_entropy_loss(self):\n \"\"\"\n softlabel_cross_entropy_loss\n \"\"\"\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n '''\n cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \\\n label=self.label_input)\n '''\n cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \\\n label=self.label_input,\n soft_label=True)" + }, + { + "comment": "The code defines a loss function that takes in a loss type ('sigmoid' or others) and returns the calculated loss value. It includes functions for computing the sum of losses (sum_cost) and scaling it based on the number of GPUs (self.num_gpus). For the 'sigmoid' loss type, it uses binary cross-entropy with a reduction to be none, and calculates the mean loss over all batch elements. The scale operation is used to adjust the loss value for distributed training across multiple GPUs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":335-364", + "content": " cost = paddle.sum(x=cost, axis=-1)\n sum_cost = paddle.sum(x=cost)\n self.loss_ = paddle.scale(sum_cost,\n scale=self.num_gpus,\n bias_after_scale=False)\n return self.loss_\n def sigmoid_cross_entropy_loss(self):\n \"\"\"\n sigmoid_cross_entropy_loss\n \"\"\"\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n cost = paddle.nn.functional.binary_cross_entropy(input=self.logit,\\\n label=self.label_input, reduction=None)\n cost = paddle.sum(x=cost, axis=-1)\n sum_cost = paddle.sum(x=cost)\n self.loss_ = paddle.scale(sum_cost,\n scale=self.num_gpus,\n bias_after_scale=False)\n return self.loss_\n def loss(self):\n \"\"\"\n loss\n \"\"\"\n if self.loss_type == 'sigmoid':\n return self.sigmoid_cross_entropy_loss()" + }, + { + "comment": "This code defines several methods for a model class. The `softlabel_cross_entropy_loss()` method returns the soft label cross-entropy loss. The `outputs()` method returns the network outputs. The `feeds()` method returns the feature and label inputs based on the current mode. The `pyreader()` method returns the PyReader object. 
Finally, the `epoch_num()` method returns the number of training epochs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py\":365-399", + "content": " else:\n return self.softlabel_cross_entropy_loss()\n def outputs(self):\n \"\"\"\n get outputs\n \"\"\"\n return self.network_outputs\n def feeds(self):\n \"\"\"\n get feeds\n \"\"\"\n return self.feature_input if self.mode == 'infer' else self.feature_input + [\n self.label_input\n ]\n def pyreader(self):\n \"\"\"pyreader\"\"\"\n return self.py_reader\n def epoch_num(self):\n \"\"\"get train epoch num\"\"\"\n return self.cfg.TRAIN.epoch\n def load_test_weights_file(self, exe, weights, prog, place):\n \"\"\"\n load_test_weights_file\n \"\"\"\n load_vars = [x for x in prog.list_vars() \\\n if isinstance(x, paddle.framework.Parameter)]\n static.load_vars(exe,\n dirname=weights,\n vars=load_vars,\n filename=\"param\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/491e594e-2388-4319-80c0-77b4ae736363.json b/docs/doc/491e594e-2388-4319-80c0-77b4ae736363.json new file mode 100644 index 000000000..98e025fbb --- /dev/null +++ b/docs/doc/491e594e-2388-4319-80c0-77b4ae736363.json @@ -0,0 +1,45 @@ +{ + "summary": "This code initializes a video/painting app with UI elements, connects events to functions for smooth operation, and displays \"Hi, This is EIVideo\" on stop.", + "details": [ + { + "comment": "This code initializes the main window of a video application, sets its size and geometry, creates an empty list for capturing frames, and starts a timer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":0-31", + "content": "# -*- coding: utf-8 -*-\n# Form implementation generated from reading ui file 'GUI.ui'\n#\n# Created by: PyQt5 UI code generator 5.15.2\n#\n# WARNING: Any manual changes made to this file will be lost when pyuic5 is\n# run again. 
Do not edit this file unless you know what you are doing.\nfrom PyQt5 import QtCore, QtWidgets\nfrom PyQt5.QtGui import *\nfrom PyQt5.QtWidgets import *\nfrom PyQt5.QtCore import *\nfrom QEIVideo.widget.PaintBoard import PaintBoard\nclass Ui_MainWindow(object):\n def setupUi(self, MainWindow):\n MainWindow.setObjectName(\"EIVideo\")\n MainWindow.resize(1101, 751)\n self.centralwidget = QtWidgets.QWidget(MainWindow)\n self.centralwidget.setObjectName(\"centralwidget\")\n self.frame = QtWidgets.QFrame(self.centralwidget)\n self.frame.setGeometry(QtCore.QRect(20, 20, 1271, 771))\n self.frame.setFrameShadow(QtWidgets.QFrame.Raised)\n self.frame.setObjectName(\"frame\")\n self.cap = []\n self.all_frames = []\n self.fps = None\n self.timer = QTimer(self.frame)" + }, + { + "comment": "This code is initializing UI elements in a window, setting up progress slider for video position tracking, adding a picture label and paint board for drawing, and enabling interaction with eraser checkbox and clear button.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":32-55", + "content": " self.time_label = QLabel('--/--', self.frame)\n self.progress_slider = QSlider(self.frame)\n self.progress_slider.setEnabled(True)\n self.progress_slider.setOrientation(Qt.Horizontal)\n self.progress_slider.setFixedWidth(710)\n self.progress_slider.setFixedHeight(20)\n self.progress_slider.setSingleStep(1) # \u8bbe\u7f6e\u53d8\u5316\u6b65\u957f\n self.progress_slider.setValue(0)\n self.progress_slider.sliderReleased.connect(self.update_video_position_func) # \u62d6\u62fd\u8fdb\u5ea6\u6761\n self.picturelabel = QtWidgets.QLabel(self.frame)\n self.picturelabel.setGeometry(30, 30, 810, 458)\n self.picturelabel.setText(\"\")\n self.picturelabel.setObjectName(\"picturelabel\")\n self.paintBoard = PaintBoard(self.frame)\n self.paintBoard.setGeometry(30, 30, 810, 458)\n self.cbtn_Eraser = QCheckBox(\"\u6a61\u76ae\u64e6\")\n self.cbtn_Eraser.setParent(self.frame)\n self.cbtn_Eraser.move(950, 40)\n self.cbtn_Eraser.clicked.connect(self.on_cbtn_eraser_clicked)\n self.btn_Clear = QPushButton(\"\u6e05\u7a7a\u753b\u677f\")" + }, + { + "comment": "The code sets the parent object of btn_Clear to self.frame, moves the btn_Clear and label_penColor widgets to specific positions, connects a button click event to clear the paintBoard, fills a comboBox_penColor with color options, places it at a particular location, links its currentIndexChanged signal to on_pen_color_change function, and provides guidance for using EIVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":56-76", + "content": " self.btn_Clear.setParent(self.frame) # \u8bbe\u7f6e\u7236\u5bf9\u8c61\u4e3a\u672c\u754c\u9762\n self.btn_Clear.move(950, 60)\n self.btn_Clear.clicked.connect(self.paintBoard.clear)\n self.label_penColor = QLabel(self.frame)\n self.label_penColor.setText(\"\u753b\u7b14\u989c\u8272\")\n self.label_penColor.move(990, 100)\n # \u83b7\u53d6\u989c\u8272\u5217\u8868(\u5b57\u7b26\u4e32\u7c7b\u578b)\n self.colorList = QColor.colorNames()\n self.comboBox_penColor = QComboBox(self.frame)\n self.fill_color_list(self.comboBox_penColor) # \u7528\u5404\u79cd\u989c\u8272\u586b\u5145\u4e0b\u62c9\u5217\u8868\n self.comboBox_penColor.move(1080, 80)\n self.comboBox_penColor.currentIndexChanged.connect(\n self.on_pen_color_change) # 
\u5173\u8054\u4e0b\u62c9\u5217\u8868\u7684\u5f53\u524d\u7d22\u5f15\u53d8\u66f4\u4fe1\u53f7\u4e0e\u51fd\u6570on_PenColorChange\n self.helplabel = QLabel()\n self.helplabel.setText(\"Hi,Welcome to use EIVideo\\n\"\n \"This is a guide for EIVideo,\\n\"\n \"please check\\n\"\n \"1. Choose 'Add' for a video\\n\"\n \"2. Click 'Play' to start playing\\n\"\n \"3. At this point, all functions \\n\"" + }, + { + "comment": "This code sets up a user interface layout for a painting application. It includes a help label, buttons to clear and switch the eraser, a color picker, and a 'GO' push button. The layout is organized in a vertical box and horizontal box arrangement.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":77-100", + "content": " \"are unlocked\\n\"\n \"4. Paint and enjoy it!\\n\")\n self.widget2 = QtWidgets.QWidget(self.frame)\n self.widget2.setGeometry(860, 60, 200, 300)\n self.widget2.setObjectName(\"widget2\")\n self.rightLayout = QtWidgets.QVBoxLayout(self.widget2)\n self.rightLayout.setContentsMargins(0, 0, 0, 0)\n self.rightLayout.setObjectName(\"rightLayout\")\n self.rightLayout.addWidget(self.helplabel)\n self.rightLayout.addSpacing(50)\n self.rightLayout.addWidget(self.cbtn_Eraser)\n self.rightLayout.addWidget(self.btn_Clear)\n self.colorLayout = QtWidgets.QHBoxLayout(self.widget2)\n self.colorLayout.setContentsMargins(0, 0, 0, 0)\n self.colorLayout.setObjectName('colorLayout')\n self.colorLayout.addWidget(self.label_penColor)\n self.colorLayout.addWidget(self.comboBox_penColor)\n self.rightLayout.addLayout(self.colorLayout)\n # pushButton_6 -> GO\n self.pushButton_6 = QtWidgets.QPushButton(self.frame)" + }, + { + "comment": "This code is creating a UI for a video player. It sets the position and function of a play button, as well as defining layouts for other UI elements such as time display. The play button's click event is connected to a method called \"btn_func\" which takes the play button as an argument.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":101-121", + "content": " self.pushButton_6.setGeometry(870, 600, 150, 90)\n self.pushButton_6.setObjectName(\"pushButton_6\")\n self.pushButton_6.clicked.connect(self.infer)\n self.widget1 = QtWidgets.QWidget(self.frame)\n self.widget1.move(60, 520)\n self.widget1.setObjectName(\"widget1\")\n self.barLayout = QtWidgets.QVBoxLayout(self.widget1)\n self.barLayout.setContentsMargins(0, 0, 0, 0)\n self.barLayout.setObjectName(\"barLayout\")\n self.horizontalLayout = QtWidgets.QHBoxLayout(self.widget1)\n self.horizontalLayout.setContentsMargins(0, 0, 0, 0)\n self.horizontalLayout.setObjectName(\"horizontalLayout\")\n self.timeLayout = QtWidgets.QHBoxLayout(self.widget1)\n self.timeLayout.setContentsMargins(0, 0, 0, 0)\n self.timeLayout.setObjectName(\"horizontalLayout\")\n self.playbtn = QtWidgets.QPushButton(self.widget1)\n self.playbtn.setObjectName(\"playbtn\")\n self.playbtn.clicked.connect(lambda: self.btn_func(self.playbtn))\n self.horizontalLayout.addWidget(self.playbtn)" + }, + { + "comment": "This code initializes two push buttons, connects their click events to a function, adds them to a horizontal layout, and adds the layout to a splitter. 
It also adds time-related widgets to another layout, and adds both layouts to the splitter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":122-141", + "content": " self.pushButton_2 = QtWidgets.QPushButton(self.widget1)\n self.pushButton_2.setObjectName(\"pushButton_2\")\n self.pushButton_2.clicked.connect(lambda: self.btn_func(self.pushButton_2))\n self.horizontalLayout.addWidget(self.pushButton_2)\n self.pushButton_4 = QtWidgets.QPushButton(self.widget1)\n self.pushButton_4.setObjectName(\"pushButton_4\")\n self.pushButton_4.clicked.connect(lambda: self.btn_func(self.pushButton_4))\n self.horizontalLayout.addWidget(self.pushButton_4)\n self.timeLayout.addWidget(self.progress_slider)\n self.timeLayout.addWidget(self.time_label)\n self.barLayout.addSpacing(20)\n self.barLayout.addLayout(self.timeLayout)\n self.barLayout.addSpacing(30)\n self.barLayout.addLayout(self.horizontalLayout)\n self.splitter = QtWidgets.QSplitter(self.frame)\n self.splitter.setGeometry(QtCore.QRect(71, 670, 750, 20))\n self.splitter.setOrientation(QtCore.Qt.Horizontal)\n self.splitter.setObjectName(\"splitter\")" + }, + { + "comment": "This code is creating a user interface for a main window of an application. It includes a label, progress bar, menu bar, and status bar. The window has a title and two buttons: \"GO\" and \"Play\". The `retranslateUi` function is used to set the window title and button labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":142-163", + "content": " self.label = QtWidgets.QLabel(self.splitter)\n self.label.setObjectName(\"label\")\n self.progressBar = QtWidgets.QProgressBar(self.splitter)\n self.progressBar.setProperty(\"value\", 0)\n self.progressBar.setObjectName(\"progressBar\")\n MainWindow.setCentralWidget(self.centralwidget)\n self.menubar = QtWidgets.QMenuBar(MainWindow)\n self.menubar.setGeometry(QtCore.QRect(0, 0, 1327, 23))\n self.menubar.setObjectName(\"menubar\")\n MainWindow.setMenuBar(self.menubar)\n self.statusbar = QtWidgets.QStatusBar(MainWindow)\n self.statusbar.setObjectName(\"statusbar\")\n MainWindow.setStatusBar(self.statusbar)\n self.retranslateUi(MainWindow)\n QtCore.QMetaObject.connectSlotsByName(MainWindow)\n def retranslateUi(self, MainWindow):\n _translate = QtCore.QCoreApplication.translate\n MainWindow.setWindowTitle(_translate(\"MainWindow\", \"MainWindow\"))\n self.pushButton_6.setText(_translate(\"MainWindow\", \"GO\"))\n self.playbtn.setText(_translate(\"MainWindow\", \"Play\"))" + }, + { + "comment": "This code updates the text on three GUI elements in the \"MainWindow\" class. 
The first button is labeled \"Stop,\" the second button is labeled \"Add,\" and a label displays \"Hi, This is EIVideo.\"", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py\":164-166", + "content": " self.pushButton_2.setText(_translate(\"MainWindow\", \"Stop\"))\n self.pushButton_4.setText(_translate(\"MainWindow\", \"Add\"))\n self.label.setText(_translate(\"MainWindow\", \"Hi, This is EIVideo\"))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/496e7372-817b-427a-803a-7bf4a1e61f3e.json b/docs/doc/496e7372-817b-427a-803a-7bf4a1e61f3e.json new file mode 100644 index 000000000..2a29e6907 --- /dev/null +++ b/docs/doc/496e7372-817b-427a-803a-7bf4a1e61f3e.json @@ -0,0 +1,20 @@ +{ + "summary": "This code uses Paddle Serving for postprocessing and PaddleVideo framework for video processing, initializing client, preprocessing input, sending data to server, receiving prediction, and printing output.", + "details": [ + { + "comment": "This code snippet is importing necessary libraries and defining a function for postprocessing prediction outputs from a Paddle Serving client. The function takes raw predictions in the form of a numpy array and returns the postprocessed prediction as a dictionary containing any desired data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/serving_client.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nfrom typing import Any, Dict\nimport numpy as np\nfrom paddle_serving_client import Client\nfrom preprocess_ops import get_preprocess_func, np_softmax\ndef postprocess(fetch_map: Dict[str, np.ndarray]) -> Dict[str, Any]:\n \"\"\"postprocess\n Args:\n fetch_map (Dict[str, np.ndarray]): raw prediction\n Returns:\n Dict[str, Any]: postprocessed prediction\n \"\"\"\n score_list = fetch_map[\"outputs\"] # [b,num_classes]" + }, + { + "comment": "The code defines a function that calculates the class id and probability based on scores, converts them to strings, and returns a dictionary with these values. 
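A toy walk-through of that postprocessing step (the softmax helper below is a stand-in for the `np_softmax` imported from `preprocess_ops`, and the scores are made up):

```python
import numpy as np

def np_softmax(x, axis=0):
    # numerically stable softmax over the given axis
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

score_list = np.array([[1.0, 3.0, 0.5]])          # [batch, num_classes]
for score in score_list:
    score = np_softmax(score, axis=0).tolist()
    max_score = max(score)
    print(score.index(max_score), max_score)      # class_id 1, prob ~0.82
```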
It also includes a function for parsing arguments such as model name, serving client config file path, and URL to access the CPP serving.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/serving_client.py\":32-61", + "content": " fetch_dict = {\"class_id\": [], \"prob\": []}\n for score in score_list:\n score = np_softmax(score, axis=0)\n score = score.tolist()\n max_score = max(score)\n fetch_dict[\"class_id\"].append(score.index(max_score))\n fetch_dict[\"prob\"].append(max_score)\n fetch_dict[\"class_id\"] = str(fetch_dict[\"class_id\"])\n fetch_dict[\"prob\"] = str(fetch_dict[\"prob\"])\n return fetch_dict\ndef parse_args():\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo CPP Serving model script\")\n parser.add_argument(\"-n\",\n \"--name\",\n type=str,\n default=\"PPTSM\",\n help=\"model's name, such as PPTSM, PPTSN...\")\n parser.add_argument(\n \"-c\",\n \"--config\",\n type=str,\n help=\"serving client config file(serving_client_conf.prototxt) path\")\n parser.add_argument(\"--url\",\n type=str,\n default=\"127.0.0.1:9993\",\n help=\"url to access cpp serving\")" + }, + { + "comment": "This code is a Python function that parses command line arguments, initializes a client object for video processing, preprocesses input video file, sends data to server, receives prediction, post-processes results and prints output. It uses the PaddleVideo framework with specific model configuration file and preprocessing function based on input name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/serving_client.py\":62-94", + "content": " parser.add_argument(\"--logid\", type=int, default=\"10000\", help=\"log id\")\n parser.add_argument(\"--input_file\",\n type=str,\n default=\"../../data/example.avi\",\n help=\"input video file\")\n return parser.parse_args()\nif __name__ == \"__main__\":\n # parse args\n args = parse_args()\n url = args.url\n logid = args.logid\n input_file_path = args.input_file\n model_name = args.name\n # get preprocess by model name\n preprocess = get_preprocess_func(model_name)\n # initialize client object & connect\n client = Client()\n client.load_client_config(args.config)\n client.connect([url])\n # preprocess\n feed, fetch = preprocess(input_file_path)\n # send data & get prediction from server\n fetch_map = client.predict(feed=feed, fetch=fetch)\n # postprocess & output\n result = postprocess(fetch_map)\n print(result)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/49a26df6-daa0-45ce-b445-d01a9f4aa985.json b/docs/doc/49a26df6-daa0-45ce-b445-d01a9f4aa985.json new file mode 100644 index 000000000..d7e7dbbab --- /dev/null +++ b/docs/doc/49a26df6-daa0-45ce-b445-d01a9f4aa985.json @@ -0,0 +1,50 @@ +{ + "summary": "This code shows deploying PaddleServing for deep learning model prediction via HTTP using PP-TSM models and Docker on Linux. Issues with proxy, no response; check log file for errors at \"./deploy/python_serving/PipelineServingLogs/pipeline.log\". Refer to Serving's GitHub for more deployment types like RPC prediction service.", + "details": [ + { + "comment": "This code provides an overview of deploying a model service using PaddleServing for deep learning predictions. 
It uses HTTP prediction service deployment as an example and suggests installing Serving through Docker on Linux platforms, while Windows is currently not supported.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":0-15", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](./readme.md)\n# Model service deployment\n## Introduction\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) aims to help deep learning developers easily deploy online prediction services, support one-click deployment of industrial-grade service capabilities, high concurrency between client and server Efficient communication and support for developing clients in multiple programming languages.\nThis section takes the HTTP prediction service deployment as an example to introduce how to use PaddleServing to deploy the model service in PaddleVideo. Currently, only Linux platform deployment is supported, and Windows platform is not currently supported.\n## Serving installation\nThe Serving official website recommends using docker to install and deploy the Serving environment. First, you need to pull the docker environment and create a Serving-based docker.\n```bash\n# start GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash" + }, + { + "comment": "Install PaddleServing server and client packages for CPU and GPU environments, depending on the deployment type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":16-40", + "content": "nvidia-docker exec -it test bash\n# start CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\nAfter entering docker, you need to install Serving-related python packages.\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\npython3.7 -m pip install faiss-cpu==1.7.1post2\n#If it is a CPU deployment environment:\npython3.7 -m pip install paddle-serving-server==0.7.0 #CPU\npython3.7 -m pip install paddlepaddle==2.2.0 # CPU\n#If it is a GPU deployment environment\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#Other GPU environments need to confirm the environment and then choose which one to execute\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8" + }, + { + "comment": "This code snippet provides instructions on how to deploy a behavior recognition service using PaddleServing. It explains that the model must be converted into a Serving model and provides an example of PP-TSM model conversion process. 
The user is guided to enter the PaddleVideo directory, download the trained PP-TSM model, convert it into an inference model, and finally, provide an option to download a pre-converted inference model if desired.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":41-62", + "content": "```\n* If the installation speed is too slow, you can change the source through `-i https://pypi.tuna.tsinghua.edu.cn/simple` to speed up the installation process\n* For more environment and corresponding installation packages, see: https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## Behavior recognition service deployment\n### Model conversion\nWhen using PaddleServing for service deployment, you need to convert the saved inference model into a Serving model. The following uses the PP-TSM model as an example to introduce how to deploy the behavior recognition service.\n- Download the trained PP-TSM model and convert it into an inference model:\n ```bash\n # Enter PaddleVideo directory\n cd PaddleVideo\n wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams\n python3.7 tools/export_model.py \\\n -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n -p data/ppTSM_k400_uniform.pdparams \\\n -o inference/ppTSM\n ```\n- We also provide the converted inference model, download and unzip by the following command" + }, + { + "comment": "This code downloads a pre-trained model and converts it into a format suitable for server deployment using paddle_serving_client. The converted model is saved in the specified directory with the corresponding program and parameter files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":63-82", + "content": " ```bash\n mkdir ./inference\n wget -nc -P ./inference https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n pushd ./inference\n unzip ppTSM.zip\n popd\n ```\n- Use paddle_serving_client to convert the converted inference model into a model format that is easy for server deployment:\n ```bash\n python3.7 -m paddle_serving_client.convert \\\n --dirname inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./deploy/python_serving/ppTSM_serving_server/ \\\n --serving_client ./deploy/python_serving/ppTSM_serving_client/\n ```\n | parameter | type | default value | description |\n | ----------------- | ---- | ------------------ | ------- -------------------------------------------------- --- |\n | `dirname` | str | - | The storage path of the model file to be converted. The program structure file and parameter file are saved in this directory. |\n | `model_filename` | str | None | The name of the file storing the model In" + }, + { + "comment": "This code defines the required parameters for converting a PaddleVideo PP-TSM inference model. Upon successful conversion, it creates `ppTSM_serving_server` and `ppTSM_serving_client` folders with necessary files for the converted model's serving.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":82-93", + "content": "ference Program structure that needs to be converted. If set to None, use `__model__` as the default filename |\n | `params_filename` | str | None | File name where all parameters of the model to be converted are stored. 
It needs to be specified if and only if all model parameters are stored in a single binary file. If the model parameters are stored in separate files, set it to None |\n | `serving_server` | str | `\"serving_server\"` | The storage path of the converted model files and configuration files. Default is serving_server |\n | `serving_client` | str | `\"serving_client\"` | The converted client configuration file storage path. Default is serving_client |\nAfter the PP-TSM inference model is converted, there will be additional folders of `ppTSM_serving_server` and `ppTSM_serving_client` in the current folder, with the following formats:\n ```bash\n PaddleVideo/deploy/python_serving\n \u251c\u2500\u2500 ppTSM_serving_server\n \u251c\u2500\u2500 ppTSM.pdiparams\n \u251c\u2500\u2500 ppTSM.pdmodel\n \u251c\u2500\u2500 serving_server_conf.prototxt" + }, + { + "comment": "This code snippet is modifying the model configuration files `serving_server_conf.prototxt` and `serving_client_conf.stream.prototxt`. It changes the `alias_name` under `fetch_var` to \"outputs\" in both files for compatibility with different models during deployment. This allows the inference and deployment of various models without modifying the code, simply by updating the configuration file's alias names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":94-118", + "content": " \u2514\u2500\u2500 serving_server_conf.stream.prototxt\n \u251c\u2500\u2500 ppTSM_serving_client\n \u251c\u2500\u2500 serving_client_conf.prototxt\n \u2514\u2500\u2500 serving_client_conf.stream.prototxt\n ```\nAfter getting the model files, you need to modify the files `serving_server_conf.prototxt` under `ppTSM_serving_server` and `ppTSM_serving_client` respectively, and change `alias_name` under `fetch_var` in both files to `outputs`\n**Remarks**: In order to be compatible with the deployment of different models, Serving provides the function of input and output renaming. In this way, when different models are inferred and deployed, they only need to modify the `alias_name` of the configuration file, and the inference deployment can be completed without modifying the code.\nThe modified `serving_server_conf.prototxt` looks like this:\n```yaml\nfeed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n}\nfetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false" + }, + { + "comment": "This code snippet is for starting the PaddleVideo pipeline service in Python using the recognition_web_service.py script. 
The `-n` flag specifies the name of the model, and the `-c` flag points to the configuration file for the pipeline service.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":119-144", + "content": " fetch_type: 1\n shape: 400\n}\n```\n### Service deployment and requests\nThe `python_serving` directory contains the code for starting the pipeline service, C++ serving service (TODO) and sending prediction requests, including:\n```bash\n__init__.py\nconfigs/xxx.yaml # start the configuration file of the pipeline service\npipeline_http_client.py # python script for sending pipeline prediction request via http\npipeline_rpc_client.py # python script for sending pipeline prediction request in rpc mode\nrecognition_web_service.py # python script that starts the pipeline server\nutils.py # common functions used in inference, such as parse_file_paths, numpy_to_base64, video_to_numpy\n```\n#### Python Serving\n- Go to the working directory:\n```bash\ncd deploy/python_serving\n```\n- Start the service:\n```bash\n# Start in the current command line window and stay in front\npython3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml\n# Start in the background, the logs printed during the process will be redirected and saved to log.txt" + }, + { + "comment": "This code is running a web service for model prediction and two client scripts to send prediction requests via HTTP and RPC, printing the results in the command line. The result shows an example output with probabilities and labels for a given input video file. If no result is returned or there's an output decoding error, it might be related to the proxy setting when starting the service.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":145-174", + "content": "python3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml &>log.txt &\n```\n- send request:\n```bash\n# Send a prediction request in http and receive the result\npython3.7 pipeline_http_client.py -i ../../data/example.avi\n# Send a prediction request in rpc and receive the result\npython3.7 pipeline_rpc_client.py -i ../../data/example.avi\n```\nAfter a successful run, the results of the model prediction will be printed in the cmd window, and the results are as follows:\n```bash\n# http method print result\n{'err_no': 0, 'err_msg': '', 'key': ['label', 'prob'], 'value': [\"['archery']\", '[0.9907388687133789]'], 'tensors ': []}\n# The result of printing in rpc mode\nPipelineClient::predict pack_data time:1645631086.764019\nPipelineClient::predict before time:1645631086.8485317\nkey: \"label\"\nkey: \"prob\"\nvalue: \"[\\'archery\\']\"\nvalue: \"[0.9907388687133789]\"\n```\n## FAQ\n**Q1**: No result is returned after the request is sent or an output decoding error is prompted\n**A1**: Do not set the proxy when starting the service an" + }, + { + "comment": "Closing the proxy before starting the service and sending request using \"unset https_proxy; unset http_proxy\". No response after server started, check log file for error message at \"./deploy/python_serving/PipelineServingLogs/pipeline.log\". For more deployment types like RPC prediction service, refer to Serving's GitHub official website.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme_en.md\":174-184", + "content": "d sending the request. You can close the proxy before starting the service and sending the request. 
The command to close the proxy is:\n```\nunset https_proxy\nunset http_proxy\n```\n**Q2**: There is no response after the server is started, and it has been stopped at `start proxy service`\n**A2**: It is likely that a problem was encountered during the startup process. You can view the detailed error message in the `./deploy/python_serving/PipelineServingLogs/pipeline.log` log file\nFor more service deployment types, such as `RPC prediction service`, you can refer to Serving's [github official website](https://github.com/PaddlePaddle/Serving/tree/v0.7.0/examples)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4a0caae3-5a32-4b12-8eaf-ea69856a0f76.json b/docs/doc/4a0caae3-5a32-4b12-8eaf-ea69856a0f76.json new file mode 100644 index 000000000..9b137a708 --- /dev/null +++ b/docs/doc/4a0caae3-5a32-4b12-8eaf-ea69856a0f76.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet is providing two links to tutorials, one in English and the other in Simplified Chinese (\u7b80\u4f53\u4e2d\u6587). The English tutorial can be accessed at \"../../zh-CN/tutorials/accelerate.md\" and the Chinese one at the current location \"PaddleVideo/english_documents/tutorials/accelerate.md\".", + "details": [ + { + "comment": "This code snippet is providing two links to tutorials, one in English and the other in Simplified Chinese (\u7b80\u4f53\u4e2d\u6587). The English tutorial can be accessed at \"../../zh-CN/tutorials/accelerate.md\" and the Chinese one at the current location \"PaddleVideo/english_documents/tutorials/accelerate.md\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/accelerate.md\":0-0", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/tutorials/accelerate.md) | English" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4a6e8677-5817-4bc6-b864-c058c6d70d4c.json b/docs/doc/4a6e8677-5817-4bc6-b864-c058c6d70d4c.json new file mode 100644 index 000000000..e00e964fe --- /dev/null +++ b/docs/doc/4a6e8677-5817-4bc6-b864-c058c6d70d4c.json @@ -0,0 +1,50 @@ +{ + "summary": "The code imports necessary libraries, sets up a DALI reader, defines TSN_Dali_loader class, initializes parallel video preprocessing, handles potential import errors, and returns output and label using PaddleOps for normalization.", + "details": [ + { + "comment": "This code imports necessary libraries, sets up logger, and attempts to import DALI pipeline and related functions for creating a generic iterator for PaddlePaddle. If any of these imports fail, it falls back by setting the respective variable as an object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport math\nimport paddle\nfrom paddle.distributed import ParallelEnv\nimport paddle.distributed as dist\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ntry:\n from nvidia.dali.pipeline import Pipeline\n import nvidia.dali.ops as ops\n import nvidia.dali.types as types\n import tempfile\n from nvidia.dali.plugin.paddle import DALIGenericIterator\nexcept:\n Pipeline = object" + }, + { + "comment": "The code defines a class `TSN_Dali_loader` that initializes attributes related to batch size, file path, number of segments, segment length, input and target image sizes. It also sets variables for distributed training, data normalization, and builds a DALI reader for training data using shuffled full lines from the file path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":34-64", + "content": "def get_input_data(data):\n return paddle.to_tensor(data[0]['image']), paddle.to_tensor(\n data[0]['label'])\nclass TSN_Dali_loader(object):\n def __init__(self, cfg):\n self.batch_size = cfg.batch_size\n self.file_path = cfg.file_path\n self.num_seg = cfg.num_seg\n self.seglen = cfg.seglen\n self.short_size = cfg.short_size\n self.target_size = cfg.target_size\n # set num_shards and shard_id when distributed training is implemented\n self.num_shards = dist.get_world_size()\n self.shard_id = ParallelEnv().local_rank\n self.dali_mean = cfg.mean * (self.num_seg * self.seglen)\n self.dali_std = cfg.std * (self.num_seg * self.seglen)\n def build_dali_reader(self):\n \"\"\"\n build dali training reader\n \"\"\"\n def reader_():\n with open(self.file_path) as flist:\n full_lines = [line for line in flist]\n if (not hasattr(reader_, 'seed')):\n reader_.seed = 0\n random.Random(reader_.seed).shuffle(full_lines)" + }, + { + "comment": "This code snippet initializes a reader and distributes the data evenly across multiple shards. It calculates the number of lines to be assigned to each shard based on the total number of lines and the number of shards. It then ensures that the full_lines list is an even multiple of the total_lines by appending additional items if necessary. 
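Stripped of the DALI specifics, the even-sharding trick described here is "pad the file list to a multiple of the shard count, then take a strided slice". A small stand-alone sketch (file names and shard counts are made up):

```python
import math

def split_for_shard(full_lines, num_shards, shard_id):
    per_node = int(math.ceil(len(full_lines) / num_shards))
    total = per_node * num_shards
    # Pad with lines from the head so every shard gets exactly per_node items.
    padded = full_lines + full_lines[:total - len(full_lines)]
    # Each trainer takes every num_shards-th line, starting at its shard_id.
    return padded[shard_id:total:num_shards]

lines = [f"video_{i}.mp4 0" for i in range(10)]
print(split_for_shard(lines, num_shards=4, shard_id=1))  # 3 lines for shard 1
```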
The snippet asserts that the length of full_lines equals total_lines, assigns lines to trainers based on their shard ID, and logs information about the distribution of data among shards.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":65-87", + "content": " logger.info(f\"reader shuffle seed: {reader_.seed}.\")\n if reader_.seed is not None:\n reader_.seed += 1\n per_node_lines = int(\n math.ceil(len(full_lines) * 1.0 / self.num_shards))\n total_lines = per_node_lines * self.num_shards\n # aligned full_lines so that it can evenly divisible\n full_lines += full_lines[:(total_lines - len(full_lines))]\n assert len(full_lines) == total_lines\n # trainer get own sample\n lines = full_lines[self.shard_id:total_lines:self.num_shards]\n assert len(lines) == per_node_lines\n logger.info(\n f\"shard_id: {self.shard_id}, trainer_count: {self.num_shards}\"\n )\n logger.info(\n f\"read videos from {self.shard_id * per_node_lines}, \"\n f\"length: {per_node_lines}, \"\n f\"lines length: {len(lines)}, \"" + }, + { + "comment": "This code initializes a PaddlePaddle VideoPipe instance, loading and preprocessing video files in parallel for training. It sets the batch size, number of threads, device ID, file list, sequence length, number of segments, segment length, resize shorter scale, crop target size, whether it's in training mode, and the number of shards and shard ID.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":88-110", + "content": " f\"total: {len(full_lines)}\")\n video_files = ''.join([item for item in lines])\n tf = tempfile.NamedTemporaryFile()\n tf.write(str.encode(video_files))\n tf.flush()\n video_files = tf.name\n device_id = ParallelEnv().local_rank\n logger.info(f'---------- device_id: {device_id} -----------')\n pipe = VideoPipe(batch_size=self.batch_size,\n num_threads=1,\n device_id=device_id,\n file_list=video_files,\n sequence_length=self.num_seg * self.seglen,\n num_seg=self.num_seg,\n seg_length=self.seglen,\n resize_shorter_scale=self.short_size,\n crop_target_size=self.target_size,\n is_training=True,\n num_shards=self.num_shards,\n shard_id=self.shard_id," + }, + { + "comment": "This code initializes a DALI (Data Augmentation Library for Images) generic iterator to load video data from a file list, and returns it. It uses a VideoPipe class to define the pipeline configuration, including parameters such as batch size, number of threads, device ID, sequence length, and more.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":111-141", + "content": " dali_mean=self.dali_mean,\n dali_std=self.dali_std)\n logger.info(\n 'initializing dataset, it will take several minutes if it is too large .... '\n )\n video_loader = DALIGenericIterator([pipe], ['image', 'label'],\n len(lines),\n dynamic_shape=True,\n auto_reset=True)\n return video_loader\n dali_reader = reader_()\n return dali_reader\nclass VideoPipe(Pipeline):\n def __init__(self,\n batch_size,\n num_threads,\n device_id,\n file_list,\n sequence_length,\n num_seg,\n seg_length,\n resize_shorter_scale,\n crop_target_size,\n is_training=False,\n initial_prefetch_size=20,\n num_shards=1," + }, + { + "comment": "This code initializes a VideoPipe object with the given parameters, including file list, sequence length, and number of segments. It uses ops.VideoReader to read video data from the file list in the specified format. 
Due to the limitations of resize function, it transposes and reshapes the data before performing resizing operation on the 2-D image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":142-159", + "content": " shard_id=0,\n dali_mean=0.,\n dali_std=1.0):\n super(VideoPipe, self).__init__(batch_size, num_threads, device_id)\n self.input = ops.VideoReader(device=\"gpu\",\n file_list=file_list,\n sequence_length=sequence_length,\n num_seg=num_seg,\n seg_length=seg_length,\n is_training=is_training,\n num_shards=num_shards,\n shard_id=shard_id,\n random_shuffle=is_training,\n initial_fill=initial_prefetch_size)\n # the sequece data read by ops.VideoReader is of shape [F, H, W, C]\n # Because the ops.Resize does not support sequence data,\n # it will be transposed into [H, W, F, C],\n # then reshaped to [H, W, FC], and then resized like a 2-D image." + }, + { + "comment": "The code creates a DALI loader for image processing, with transpose, reshape, resize operations, and implements crop and mirror normalization. It also includes uniform distribution generators for position and mirror. The normalization will be implemented using PaddleOps due to the difficulty of dimension broadcasting in DALI.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":160-175", + "content": " self.transpose = ops.Transpose(device=\"gpu\", perm=[1, 2, 0, 3])\n self.reshape = ops.Reshape(device=\"gpu\",\n rel_shape=[1.0, 1.0, -1],\n layout='HWC')\n self.resize = ops.Resize(device=\"gpu\",\n resize_shorter=resize_shorter_scale)\n # crops and mirror are applied by ops.CropMirrorNormalize.\n # Normalization will be implemented in paddle due to the difficulty of dimension broadcast,\n # It is not sure whether dimension broadcast can be implemented correctly by dali, just take the Paddle Op instead.\n self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))\n self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))\n self.mirror_generator = ops.Uniform(range=(0.0, 1.0))\n self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)\n self.crop_mirror_norm = ops.CropMirrorNormalize(\n device=\"gpu\",\n crop=[crop_target_size, crop_target_size]," + }, + { + "comment": "The code defines a DALI loader and its associated operations for image processing. 
It includes mean and std for normalization, reshaping, casting to int64, transpose, resize, normalization by dividing by 255, generating positional information, cropping with mirror flag, and finally reshaping the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":176-201", + "content": " mean=dali_mean,\n std=dali_std)\n self.reshape_back = ops.Reshape(\n device=\"gpu\",\n shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size],\n layout='FCHW')\n self.cast_label = ops.Cast(device=\"gpu\", dtype=types.DALIDataType.INT64)\n def define_graph(self):\n output, label = self.input(name=\"Reader\")\n output = self.transpose(output)\n output = self.reshape(output)\n output = self.resize(output)\n output = output / 255.\n pos_x = self.pos_rng_x()\n pos_y = self.pos_rng_y()\n mirror_flag = self.mirror_generator()\n mirror_flag = (mirror_flag > 0.5)\n mirror_flag = self.cast_mirror(mirror_flag)\n output = self.crop_mirror_norm(output,\n crop_pos_x=pos_x,\n crop_pos_y=pos_y,\n mirror=mirror_flag)\n output = self.reshape_back(output)\n label = self.cast_label(label)" + }, + { + "comment": "The code defines a method that returns an output and label, and another method for determining the length of the loader.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dali_loader.py\":202-205", + "content": " return output, label\n def __len__(self):\n return self.epoch_size()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4a9f6fd5-205b-4d02-99f9-ba84b7a3b91f.json b/docs/doc/4a9f6fd5-205b-4d02-99f9-ba84b7a3b91f.json new file mode 100644 index 000000000..23c6eb5a3 --- /dev/null +++ b/docs/doc/4a9f6fd5-205b-4d02-99f9-ba84b7a3b91f.json @@ -0,0 +1,20 @@ +{ + "summary": "This code calculates Pearson and Spearman correlation coefficients (PLCC & SROCC) for a given output and label pair using numpy arrays and scipy's stats functions.", + "details": [ + { + "comment": "This code defines the QuqlityMetric class for measuring video quality. It imports necessary libraries, registers it with METRIC, and initializes attributes including data_size, batch_size, and log_interval. The output and label lists are used to store data during processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py\":0-34", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nimport numpy as np\nimport paddle\nfrom paddle.hapi.model import _all_gather\nfrom scipy import stats\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass QuqlityMetric(BaseMetric):\n \"\"\"CenterCropQualityMetric\"\"\"\n def __init__(self, data_size, batch_size, log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.output = []\n self.label = []" + }, + { + "comment": "This code defines a class for calculating Pearson and Spearman correlation coefficients. The `update` method updates the metrics for each batch during training, while the `accumulate` method calculates the final Pearson (PLCC) and Spearman (SROCC) correlation coefficients after all iterations are finished.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py\":35-61", + "content": " self.y_pred = np.zeros(data_size)\n self.y_test = np.zeros(data_size)\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n labels = data[1]\n predict_output = paddle.tolist(outputs)\n predict_label = paddle.tolist(labels)\n predict_output_len = len(predict_output)\n for i in range(predict_output_len):\n self.output.append(predict_output[i][0])\n self.label.append(predict_label[i][0])\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n test_output_np = np.array(self.output)\n test_label_np = np.array(self.label)\n PLCC = stats.pearsonr(test_output_np, test_label_np)[0]\n SROCC = stats.spearmanr(test_output_np, test_label_np)[0]" + }, + { + "comment": "This code snippet calculates the Pearson and Spearman correlation coefficients (PLCC and SROCC) for a given output and label pair. It uses numpy arrays to convert the input into numeric data types, then calculates the correlation values using scipy's stats.pearsonr and stats.spearmanr functions respectively. 
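Since the metric ultimately reduces to two scipy calls, a tiny stand-alone example may help; the score arrays below are synthetic, but the calls mirror the snippet (both scipy functions return a (coefficient, p-value) pair, hence the `[0]`):

```python
import numpy as np
from scipy import stats

predicted = np.array([0.10, 0.40, 0.35, 0.80, 0.70])
ground_truth = np.array([0.05, 0.50, 0.30, 0.90, 0.60])

plcc = stats.pearsonr(predicted, ground_truth)[0]    # linear correlation
srocc = stats.spearmanr(predicted, ground_truth)[0]  # rank-order correlation
print(f"PLCC={plcc:.4f}, SROCC={srocc:.4f}")
```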
Finally, it returns the calculated PLCC and SROCC values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py\":63-71", + "content": " logger.info('[TEST] finished, PLCC= {}, SROCC= {} '.format(PLCC, SROCC))\n def accumulate_train(self, output, label):\n \"\"\"accumulate_train\"\"\"\n output_np = np.array(output)\n label_np = np.array(label)\n PLCC = stats.pearsonr(output_np, label_np)[0]\n SROCC = stats.spearmanr(output_np, label_np)[0]\n return PLCC, SROCC" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4b0429f3-5899-40ff-ba98-adb02f3d59f4.json b/docs/doc/4b0429f3-5899-40ff-ba98-adb02f3d59f4.json new file mode 100644 index 000000000..9620ade1e --- /dev/null +++ b/docs/doc/4b0429f3-5899-40ff-ba98-adb02f3d59f4.json @@ -0,0 +1,15 @@ +{ + "summary": "This code includes functions for parsing, setting parameters, and performing status checks. The last function logs success or failure with command details to the specified run_log file using tee -a.", + "details": [ + { + "comment": "This code defines several functions for parsing and setting parameters, as well as performing status checks. The functions extract keys and values from strings using specific delimiters, set parameters based on their presence and content, and check the exit code of previous commands.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/common_func.sh\":0-57", + "content": "#!/bin/bash\nfunction func_parser_key(){\n strs=$1\n IFS=\":\"\n array=(${strs})\n tmp=${array[0]}\n echo ${tmp}\n}\nfunction func_parser_value(){\n strs=$1\n IFS=\":\"\n array=(${strs})\n tmp=${array[1]}\n echo ${tmp}\n}\nfunction func_set_params(){\n key=$1\n value=$2\n if [ ${key}x = \"null\"x ];then\n echo \" \"\n elif [[ ${value} = \"null\" ]] || [[ ${value} = \" \" ]] || [ ${#value} -le 0 ];then\n echo \" \"\n else \n echo \"${key}=${value}\"\n fi\n}\nfunction func_parser_params(){\n strs=$1\n IFS=\":\"\n array=(${strs})\n key=${array[0]}\n tmp=${array[1]}\n IFS=\"|\"\n res=\"\"\n for _params in ${tmp[*]}; do\n IFS=\"=\"\n array=(${_params})\n mode=${array[0]}\n value=${array[1]}\n if [[ ${mode} = ${MODE} ]]; then\n IFS=\"|\"\n #echo $(func_set_params \"${mode}\" \"${value}\")\n echo $value\n break\n fi\n IFS=\"|\"\n done\n echo ${res}\n}\nfunction status_check(){\n last_status=$1 # the exit code\n run_command=$2\n run_log=$3" + }, + { + "comment": "This function checks the last status and logs whether the run was successful or failed with specific command details. 
It appends the log to the specified run_log file using the tee -a command.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/common_func.sh\":58-65", + "content": " model_name=$4\n log_path=$5\n if [ $last_status -eq 0 ]; then\n echo -e \"\\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \\033[0m\" | tee -a ${run_log}\n else\n echo -e \"\\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \\033[0m\" | tee -a ${run_log}\n fi\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4b99f1e3-4868-4798-b9f8-fe5ed1e1cfed.json b/docs/doc/4b99f1e3-4868-4798-b9f8-fe5ed1e1cfed.json new file mode 100644 index 000000000..d41531013 --- /dev/null +++ b/docs/doc/4b99f1e3-4868-4798-b9f8-fe5ed1e1cfed.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is licensing information and imports the RandomSampler class from a submodule, then defines the __all__ variable to include only the RandomSampler class.", + "details": [ + { + "comment": "This code is licensing information and imports the RandomSampler class from a submodule, then defines the __all__ variable to include only the RandomSampler class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/samplers/__init__.py\":0-16", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .random_sampler import RandomSampler\n__all__ = ['RandomSampler']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4c899b69-445e-40cf-8ce1-c99c3ff3da42.json b/docs/doc/4c899b69-445e-40cf-8ce1-c99c3ff3da42.json new file mode 100644 index 000000000..3ccef4c67 --- /dev/null +++ b/docs/doc/4c899b69-445e-40cf-8ce1-c99c3ff3da42.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a PaddleVideo-based `InferModel` class for action detection using PPTSM with inference and prediction methods. It loads model, config file, specifies image paths, predicts on images, prints output shape, and time taken for prediction.", + "details": [ + { + "comment": "This code defines a class `InferModel` that uses the PPTSM (PP-TSM, PaddleVideo's optimized Temporal Shift Module) for action detection. The model is initialized with a configuration file specifying the model and parameter files, as well as GPU memory and device ID settings. The configuration is optimized for efficient inference using feed fetch operations disabled and enabling memory optimization. 
The input tensor handle for the model is also retrieved.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py\":0-37", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"pptsm infer\"\"\"\n def __init__(self, cfg, name='PPTSM'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])" + }, + { + "comment": "This code defines a class with methods for inferring and predicting actions from the PaddleVideo framework. It uses the PaddlePaddle library for inference and gets output names and handles to extract the results. The code also includes a main function that can be run if the file is executed directly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py\":39-68", + "content": " output_names = self.predictor.get_output_names()\n print(\"output_names = \", output_names)\n #self.output_tensor = self.predictor.get_output_handle(output_names[1])\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[:-1] for items in data]\n inputs = np.array(inputs)\n output = self.infer(inputs)\n #print(\"inputs\", inputs.shape)\n #print(\"outputs\", output.shape)\n feature_list.append(np.squeeze(output))\n feature_list = np.vstack(feature_list)\n return feature_list\nif __name__ == \"__main__\":" + }, + { + "comment": "This code loads a model, config file, and specifies image paths. It then predicts using the loaded model on images in the specified path and prints the shape of the output as well as the time taken to perform prediction. 
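The Paddle Inference calls used by `InferModel` (build a `Config`, create a predictor, then exchange data through the zero-copy input/output handles) can be sketched on their own. The model and parameter paths and the input shape below are placeholders, so treat this as an outline rather than a runnable deployment:

```python
import numpy as np
from paddle.inference import Config, create_predictor

config = Config("ppTSM.pdmodel", "ppTSM.pdiparams")  # hypothetical file names
config.enable_use_gpu(2000, 0)            # gpu_mem in MB, device_id
config.switch_ir_optim(True)
config.enable_memory_optim()
config.switch_use_feed_fetch_ops(False)   # required for the zero-copy handles

predictor = create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

batch = np.random.rand(1, 8, 3, 224, 224).astype("float32")
input_handle.copy_from_cpu(batch)
predictor.run()
features = output_handle.copy_to_cpu()
print(features.shape)
```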
The commented-out line shows how the same model can be exercised directly with a random input tensor instead of frames extracted from a video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py\":69-82", + "content": " cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/' \n imgs_list = get_images(imgs_path)\n t0 = time.time()\n cfg['PPTSM']['frame_list'] = imgs_list\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4cb0d2f5-3f59-492f-8273-48b5fec17c86.json b/docs/doc/4cb0d2f5-3f59-492f-8273-48b5fec17c86.json new file mode 100644 index 000000000..779e33bee --- /dev/null +++ b/docs/doc/4cb0d2f5-3f59-492f-8273-48b5fec17c86.json @@ -0,0 +1,25 @@ +{ + "summary": "TSMHead, a classification task-oriented class extending TSNHead, initializes weights and registers in the HEADS registry. It is part of PaddleVideo's temporal segment network head, with parameters for weights and data format, forward function with average pooling and optional dropout, and possible tensor reshaping.", + "details": [ + { + "comment": "This code defines the TSMHead class, which extends the TSNHead class. It is used for classification tasks with a specific number of classes and input feature channels. The class is registered in the HEADS registry and follows a certain weight initialization method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsm_head.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn import Linear\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom .tsn_head import TSNHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass TSMHead(TSNHead):\n \"\"\" TSM Head\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature." + }, + { + "comment": "The code defines a class with an __init__ method that initializes the TSMHead object. It takes arguments like num_classes, in_channels, drop_ratio, std, and data_format to set up the internal structure of the class. The Linear layer is also initialized with specific learning rates for weights and biases.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsm_head.py\":33-56", + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n drop_ratio(float): drop ratio. 
Default: 0.5.\n std(float): Std(Scale) value in normal initilizar. Default: 0.001.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n drop_ratio=0.5,\n std=0.001,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes,\n in_channels,\n drop_ratio=drop_ratio,\n std=std,\n data_format=data_format,\n **kwargs)\n self.fc = Linear(self.in_channels,\n self.num_classes,\n weight_attr=ParamAttr(learning_rate=5.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0," + }, + { + "comment": "The code initializes a TSM head, sets the data format and standard deviation for weights, initializes FC layer parameters, defines the forward function to perform average pooling, dropout if applicable, and reshapes the tensor based on the data format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsm_head.py\":57-88", + "content": " regularizer=L2Decay(0.0)))\n assert (data_format in [\n 'NCHW', 'NHWC'\n ]), f\"data_format must be 'NCHW' or 'NHWC', but got {data_format}\"\n self.data_format = data_format\n self.stdv = std\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)\n def forward(self, x, num_seg):\n \"\"\"Define how the tsm-head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # x.shape = [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x) # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x) # [N * num_seg, in_channels, 1, 1]\n if self.data_format == 'NCHW':\n x = paddle.reshape(x, x.shape[:2])" + }, + { + "comment": "This code is part of a temporal segment network head in PaddleVideo. If the input is not 3-channel, it reshapes the feature map to have only the first third of channels. Then, it passes the reshaped feature through a fully connected layer and averages across segments to get a score for each class. The shape of the scores is then modified accordingly, and softmax could be applied (note: comment indicates that softmax might be removed).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/tsm_head.py\":89-98", + "content": " else:\n x = paddle.reshape(x, x.shape[::3])\n score = self.fc(x) # [N * num_seg, num_class]\n score = paddle.reshape(\n score, [-1, num_seg, score.shape[1]]) # [N, num_seg, num_class]\n score = paddle.mean(score, axis=1) # [N, num_class]\n score = paddle.reshape(score,\n shape=[-1, self.num_classes]) # [N, num_class]\n # score = F.softmax(score) #NOTE remove\n return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4cff8309-3d7b-4715-9d30-609fc3431144.json b/docs/doc/4cff8309-3d7b-4715-9d30-609fc3431144.json new file mode 100644 index 000000000..3389a7457 --- /dev/null +++ b/docs/doc/4cff8309-3d7b-4715-9d30-609fc3431144.json @@ -0,0 +1,40 @@ +{ + "summary": "The ASRFHead class is a model for action recognition using convolutional layers, and computes precision, recall, F1 score. 
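Before moving on to the ASRF head, the segment-consensus step at the end of the TSM head's forward pass described above is worth spelling out: per-segment class scores are regrouped by clip and averaged over the segment axis. A numpy-only sketch with illustrative sizes:

```python
import numpy as np

N, num_seg, num_classes = 2, 8, 400           # clips, segments per clip, classes
per_segment_scores = np.random.rand(N * num_seg, num_classes)

# [N * num_seg, num_classes] -> [N, num_seg, num_classes] -> mean over segments
clip_scores = per_segment_scores.reshape(N, num_seg, num_classes).mean(axis=1)
print(clip_scores.shape)  # (2, 400): one score vector per clip
```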
It creates an ASRF head class for video processing with label retrieval, Levenshtein distance methods, edit scores, true positives, false positives, IoU measures, and selects the best scoring segment.", + "details": [ + { + "comment": "The code defines a class for the ASRFHead, which is an instance of BaseHead and registered in HEADS registry. It imports necessary libraries, defines several models including SingleStageModel, and includes various utility functions from other modules. It also initializes weights with KaimingUniform_like_torch method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# https://github.com/yiskw713/asrf/libs/models/tcn.py\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom paddle import ParamAttr\nfrom ..backbones.ms_tcn import SingleStageModel\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nfrom ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch\n@HEADS.register()\nclass ASRFHead(BaseHead):" + }, + { + "comment": "The code above initializes an object of a class representing a feature extraction and classification model for action recognition. It takes several parameters such as the number of classes, features, stages in the action segmentation branch (ASB), stages in the boundary refinement branch (BRB), and layers per stage. The object is initialized by first calling the superclass constructor and then setting up the necessary components like the convolutional layers for class scores and boundary prediction, as well as multiple SingleStageModel instances for the action segmentation branch if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py\":33-62", + "content": " def __init__(self,\n num_classes,\n num_features,\n num_stages,\n num_layers,\n num_stages_asb=None,\n num_stages_brb=None):\n super().__init__(num_classes=num_classes, in_channels=num_features)\n if not isinstance(num_stages_asb, int):\n num_stages_asb = num_stages\n if not isinstance(num_stages_brb, int):\n num_stages_brb = num_stages\n self.num_layers = num_layers\n self.num_stages_asb = num_stages_asb\n self.num_stages_brb = num_stages_brb\n self.num_features = num_features\n # cls score\n self.overlap = 0.5\n self.conv_cls = nn.Conv1D(self.num_features, self.num_classes, 1)\n self.conv_boundary = nn.Conv1D(self.num_features, 1, 1)\n # action segmentation branch\n asb = [\n SingleStageModel(self.num_layers, self.num_features,\n self.num_classes, self.num_classes)\n for _ in range(self.num_stages_asb - 1)" + }, + { + "comment": "This code defines a ASRF head model, initializes its weights and performs forward pass for classification and boundary regression tasks. It uses Conv1D layers and LayerList for flexibility. 
The weight initialization follows Kaiming uniform distribution and applies bias if present. The outputs of both tasks are stored separately in lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py\":63-97", + "content": " ]\n # boundary regression branch\n brb = [\n SingleStageModel(self.num_layers, self.num_features, 1, 1)\n for _ in range(self.num_stages_brb - 1)\n ]\n self.brb = nn.LayerList(brb)\n self.asb = nn.LayerList(asb)\n self.activation_asb = nn.Softmax(axis=1)\n self.activation_brb = nn.Sigmoid()\n def init_weights(self):\n \"\"\"\n initialize model layers' weight\n \"\"\"\n # init weight\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv1D):\n layer.weight.set_value(\n KaimingUniform_like_torch(layer.weight).astype('float32'))\n if layer.bias is not None:\n layer.bias.set_value(\n init_bias(layer.weight, layer.bias).astype('float32'))\n def forward(self, x):\n \"\"\"\n ASRF head\n \"\"\"\n out_cls = self.conv_cls(x)\n out_boundary = self.conv_boundary(x)\n outputs_cls = [out_cls]\n outputs_boundary = [out_boundary]" + }, + { + "comment": "This code implements an ASRF head for a model, which takes in input and outputs classified classes and boundary scores. It also includes a get_F1_score function to calculate precision, recall, and F1 score for classification tasks. The F1 score is calculated based on the correctness of predicted class labels compared to ground truth labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py\":99-135", + "content": " for as_stage in self.asb:\n out_cls = as_stage(self.activation_asb(out_cls))\n outputs_cls.append(out_cls)\n for br_stage in self.brb:\n out_boundary = br_stage(self.activation_brb(out_boundary))\n outputs_boundary.append(out_boundary)\n return outputs_cls, outputs_boundary\n def get_F1_score(self, predicted, groundTruth):\n recog_content = list(predicted.numpy())\n gt_content = list(groundTruth[0].numpy())\n # cls score\n correct = 0\n total = 0\n edit = 0\n for i in range(len(gt_content)):\n total += 1\n if gt_content[i] == recog_content[i]:\n correct += 1\n edit_num = self.edit_score(recog_content, gt_content)\n edit += edit_num\n tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap)\n # cls metric\n precision = tp / float(tp + fp)\n recall = tp / float(fp + fn)\n if precision + recall > 0.0:\n f1 = 2.0 * (precision * recall) / (precision + recall)" + }, + { + "comment": "The code defines an ASRF head class that seems to be related to video processing and includes methods for retrieving label information and calculating the Levenshtein distance between two sequences. The get_labels_start_end_time method converts frame-wise labels into a list of labels, their respective start times, and end times. 
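The get_labels_start_end_time helper described here is a plain run-length pass over the frame-wise labels. A compact equivalent (the label sequence is made up):

```python
def labels_to_segments(frame_labels):
    """Return (label, start, end) runs for a frame-wise label sequence."""
    labels, starts, ends = [frame_labels[0]], [0], []
    for i in range(1, len(frame_labels)):
        if frame_labels[i] != frame_labels[i - 1]:
            labels.append(frame_labels[i])
            starts.append(i)
            ends.append(i)
    ends.append(len(frame_labels))
    return list(zip(labels, starts, ends))

print(labels_to_segments([0, 0, 0, 2, 2, 1, 1, 1]))
# [(0, 0, 3), (2, 3, 5), (1, 5, 8)]
```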
The levenstein method calculates the Levenshtein distance, which is used to compare two sequences of characters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py\":136-169", + "content": " else:\n f1 = 0.0\n f1 = np.nan_to_num(f1)\n return f1\n def get_labels_start_end_time(self, frame_wise_labels):\n labels = []\n starts = []\n ends = []\n last_label = frame_wise_labels[0]\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n labels.append(frame_wise_labels[i])\n starts.append(i)\n ends.append(i)\n last_label = frame_wise_labels[i]\n ends.append(i + 1)\n return labels, starts, ends\n def levenstein(self, p, y, norm=False):\n m_row = len(p)\n n_col = len(y)\n D = np.zeros([m_row + 1, n_col + 1], np.float)\n for i in range(m_row + 1):\n D[i, 0] = i\n for i in range(n_col + 1):\n D[0, i] = i\n for j in range(1, n_col + 1):\n for i in range(1, m_row + 1):\n if y[j - 1] == p[i - 1]:\n D[i, j] = D[i - 1, j - 1]" + }, + { + "comment": "The code contains a function to calculate the edit score between two sequences. It uses the Levenshtein distance algorithm to compare recognized and ground truth labels, considering insertions, deletions, and substitutions. The f_score function calculates true positive (tp) and false positive (fp) values based on label overlaps, and normalizes the edit score if required.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py\":170-199", + "content": " else:\n D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,\n D[i - 1, j - 1] + 1)\n if norm:\n score = (1 - D[-1, -1] / max(m_row, n_col)) * 100\n else:\n score = D[-1, -1]\n return score\n def edit_score(self, recognized, ground_truth, norm=True):\n P, _, _ = self.get_labels_start_end_time(recognized)\n Y, _, _ = self.get_labels_start_end_time(ground_truth)\n return self.levenstein(P, Y, norm)\n def f_score(self, recognized, ground_truth, overlap):\n p_label, p_start, p_end = self.get_labels_start_end_time(recognized)\n y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth)\n tp = 0\n fp = 0\n hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(\n p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(\n p_start[j], y_start)" + }, + { + "comment": "This code calculates true positives (tp), false positives (fp) and false negatives (fn). It measures IoU between predicted and actual labels, selects best scoring segment and tracks hits and misses. 
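Combining the two helpers above gives the segmental edit score: a Levenshtein distance over the segment label sequences, normalized to a 0-100 score. A condensed sketch (the example sequences are invented):

```python
import numpy as np

def edit_score(pred_segments, gt_segments):
    m, n = len(pred_segments), len(gt_segments)
    D = np.zeros((m + 1, n + 1))
    D[:, 0] = np.arange(m + 1)
    D[0, :] = np.arange(n + 1)
    for j in range(1, n + 1):
        for i in range(1, m + 1):
            cost = 0 if pred_segments[i - 1] == gt_segments[j - 1] else 1
            D[i, j] = min(D[i - 1, j] + 1,         # deletion
                          D[i, j - 1] + 1,         # insertion
                          D[i - 1, j - 1] + cost)  # match / substitution
    return (1 - D[m, n] / max(m, n)) * 100

print(edit_score([0, 2, 1], [0, 1]))  # one deletion away -> ~66.67
```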
The method returns tp, fp, fn as float values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/asrf_head.py\":200-211", + "content": " IoU = (1.0 * intersection / union) * (\n [p_label[j] == y_label[x] for x in range(len(y_label))])\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n return float(tp), float(fp), float(fn)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4d06c7ec-ce63-4739-aff3-681d841badf7.json b/docs/doc/4d06c7ec-ce63-4739-aff3-681d841badf7.json new file mode 100644 index 000000000..c371c374b --- /dev/null +++ b/docs/doc/4d06c7ec-ce63-4739-aff3-681d841badf7.json @@ -0,0 +1,60 @@ +{ + "summary": "This code calculates BMN metric for object detection in computer vision frameworks, supports batch_size and world_size as 1, initializes class variables, processes video data, logs progress, saves results, performs soft NMS, calculates proposal lists, evaluates performance using the \"cal_metrics\" function.", + "details": [ + { + "comment": "This code imports necessary libraries and defines a function to compute the Intersection over Union (IoU) between a box and anchors. It appears to be related to object detection or proposal generation within a computer vision framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport json\nimport numpy as np\nimport pandas as pd\nimport multiprocessing as mp\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom .ActivityNet import ANETproposal\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ndef iou_with_anchors(anchors_min, anchors_max, box_min, box_max):\n \"\"\"Compute jaccard score between a box and the anchors.\n \"\"\"\n len_anchors = anchors_max - anchors_min\n int_xmin = np.maximum(anchors_min, box_min)\n int_xmax = np.minimum(anchors_max, box_max)" + }, + { + "comment": "inter_len calculates the intersection length between two bounding boxes. union_len computes the total length of both boxes and jaccard index is calculated by dividing inter_len with union_len. This function returns the Jaccard index.\nThe boundary_choose() function selects start and end boundaries based on a given score list. It identifies the highest score and creates three arrays - score_list, score_front, score_back for comparison. mask_peak is created by comparing these arrays, followed by generating a binary mask of True values.\nSoft_nms function sorts proposals generated by network based on scores in descending order. 
It takes alpha value (Gaussian decaying function), and two threshold values t1, t2.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":32-62", + "content": " inter_len = np.maximum(int_xmax - int_xmin, 0.)\n union_len = len_anchors - inter_len + box_max - box_min\n jaccard = np.divide(inter_len, union_len)\n return jaccard\ndef boundary_choose(score_list):\n \"\"\"Choose start and end boundary from score.\n \"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\ndef soft_nms(df, alpha, t1, t2):\n '''\n df: proposals generated by network;\n alpha: alpha value of Gaussian decaying function;\n t1, t2: threshold for soft nms.\n '''\n df = df.sort_values(by=\"score\", ascending=False)\n tstart = list(df.xmin.values[:])\n tend = list(df.xmax.values[:])\n tscore = list(df.score.values[:])" + }, + { + "comment": "The code calculates BMN metric for object detection by iterating through a list of scores and appending the maximum score, along with its corresponding start and end positions, to new lists. It then creates a new DataFrame using these lists before returning it as the final output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":64-97", + "content": " rstart = []\n rend = []\n rscore = []\n while len(tscore) > 1 and len(rscore) < 101:\n max_index = tscore.index(max(tscore))\n tmp_iou_list = iou_with_anchors(np.array(tstart), np.array(tend),\n tstart[max_index], tend[max_index])\n for idx in range(0, len(tscore)):\n if idx != max_index:\n tmp_iou = tmp_iou_list[idx]\n tmp_width = tend[max_index] - tstart[max_index]\n if tmp_iou > t1 + (t2 - t1) * tmp_width:\n tscore[idx] = tscore[idx] * np.exp(\n -np.square(tmp_iou) / alpha)\n rstart.append(tstart[max_index])\n rend.append(tend[max_index])\n rscore.append(tscore[max_index])\n tstart.pop(max_index)\n tend.pop(max_index)\n tscore.pop(max_index)\n newDf = pd.DataFrame()\n newDf['score'] = rscore\n newDf['xmin'] = rstart\n newDf['xmax'] = rend\n return newDf\n@METRIC.register\nclass BMNMetric(BaseMetric):\n \"\"\"\n Metrics for BMN. Two Stages in this metric:" + }, + { + "comment": "This code initializes an instance of BMNMetric class with various parameters such as data_size, batch_size, tscale, dscale, file_path, ground_truth_filename, subset, output_path, result_path, get_metrics, and log_interval. It also performs assertions to ensure batch_size is 1 and world_size is 1, as the code currently supports only these conditions. 
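The two building blocks referenced here, the 1-D Jaccard overlap against a set of anchors and the Gaussian score decay of soft-NMS, fit in a few lines of numpy (the proposal values are illustrative):

```python
import numpy as np

def iou_1d(anchors_min, anchors_max, box_min, box_max):
    inter = np.maximum(
        np.minimum(anchors_max, box_max) - np.maximum(anchors_min, box_min), 0.0)
    union = (anchors_max - anchors_min) + (box_max - box_min) - inter
    return inter / union

ious = iou_1d(np.array([0.1, 0.5]), np.array([0.4, 0.9]), 0.2, 0.6)
scores = np.array([0.9, 0.8])
alpha = 0.4
decayed = scores * np.exp(-np.square(ious) / alpha)  # soft-NMS Gaussian decay
print(ious, decayed)
```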
The class is a part of PaddleVideo library for video analysis tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":98-126", + "content": " (1) Get test results using trained model, results will be saved in BMNMetric.result_path;\n (2) Calculate metrics using results file from stage (1).\n \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n tscale,\n dscale,\n file_path,\n ground_truth_filename,\n subset,\n output_path,\n result_path,\n get_metrics=True,\n log_interval=1):\n \"\"\"\n Init for BMN metrics.\n Params:\n get_metrics: whether to calculate AR@N and AUC metrics or not, default True.\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n assert self.batch_size == 1, \" Now we just support batch_size==1 test\"\n assert self.world_size == 1, \" Now we just support single-card test\"\n self.tscale = tscale\n self.dscale = dscale\n self.file_path = file_path\n self.ground_truth_filename = ground_truth_filename" + }, + { + "comment": "The code initializes the class variables and checks if the output and result directories exist, creating them if not. It then calls a method to get the dataset dictionary and list based on the provided file path and subset. The update method takes batch ID, data, and outputs as inputs to update metrics during each iteration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":127-155", + "content": " self.subset = subset\n self.output_path = output_path\n self.result_path = result_path\n self.get_metrics = get_metrics\n if not os.path.isdir(self.output_path):\n os.makedirs(self.output_path)\n if not os.path.isdir(self.result_path):\n os.makedirs(self.result_path)\n self.video_dict, self.video_list = self.get_dataset_dict(\n self.file_path, self.subset)\n def get_dataset_dict(self, file_path, subset):\n annos = json.load(open(file_path))\n video_dict = {}\n for video_name in annos.keys():\n video_subset = annos[video_name][\"subset\"]\n if subset in video_subset:\n video_dict[video_name] = annos[video_name]\n video_list = list(video_dict.keys())\n video_list.sort()\n return video_dict, video_list\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n fid = data[4].numpy()\n pred_bm, pred_start, pred_end = outputs" + }, + { + "comment": "Code snippet performs boundary detection and creates a score vector list for each detection. It uses the provided prediction, start and end values to calculate the xmin and xmax values within the defined time scale. Then it checks if the start and end mask conditions are met and adds the corresponding score value to the score vector list. 
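The proposal-enumeration step described above (start/end boundary masks gated against the BM confidence map) can be sketched with toy numbers; `tscale`, `dscale`, the masks and the score arrays below are all made-up illustrative values:

```python
import numpy as np

tscale, dscale = 4, 4
snippet_xmins = [1.0 / tscale * i for i in range(tscale)]          # [0.0, 0.25, 0.5, 0.75]
snippet_xmaxs = [1.0 / tscale * i for i in range(1, tscale + 1)]   # [0.25, 0.5, 0.75, 1.0]

# Toy network outputs -- purely illustrative values.
pred_start = np.array([0.9, 0.2, 0.1, 0.1])
pred_end   = np.array([0.1, 0.1, 0.3, 0.8])
pred_bm    = np.random.rand(dscale, tscale)     # fused confidence map
start_mask = np.array([1, 0, 0, 0])             # faked boundary_choose() output
end_mask   = np.array([0, 0, 1, 1])

proposals = []
for idx in range(dscale):                # proposal duration (in snippets)
    for jdx in range(tscale):            # start snippet
        end_index = jdx + idx
        if (end_index < tscale and start_mask[jdx] == 1
                and end_mask[end_index] == 1):
            conf = pred_start[jdx] * pred_end[end_index] * pred_bm[idx, jdx]
            proposals.append([snippet_xmins[jdx], snippet_xmaxs[end_index], conf])

print(proposals)   # two candidates: [0.0, 0.75, ...] and [0.0, 1.0, ...]
```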
This information is used for further analysis or evaluation of video frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":156-181", + "content": " pred_bm = pred_bm.numpy()\n pred_start = pred_start[0].numpy()\n pred_end = pred_end[0].numpy()\n snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)]\n snippet_xmaxs = [\n 1.0 / self.tscale * i for i in range(1, self.tscale + 1)\n ]\n cols = [\"xmin\", \"xmax\", \"score\"]\n video_name = self.video_list[fid[0]]\n pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :]\n start_mask = boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_vector_list = []\n for idx in range(self.dscale):\n for jdx in range(self.tscale):\n start_index = jdx\n end_index = start_index + idx\n if end_index < self.tscale and start_mask[\n start_index] == 1 and end_mask[end_index] == 1:\n xmin = snippet_xmins[start_index]\n xmax = snippet_xmaxs[end_index]\n xmin_score = pred_start[start_index]" + }, + { + "comment": "This code snippet performs post-processing on video data and calculates metrics. It first processes the video data, then accumulates the metrics for each batch during processing. The code uses numpy arrays to handle score vectors, Pandas DataFrame to store and manipulate data, and logging to provide progress updates. The results are saved in a CSV file at the specified output path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":182-205", + "content": " xmax_score = pred_end[end_index]\n bm_score = pred_bm[idx, jdx]\n conf_score = xmin_score * xmax_score * bm_score\n score_vector_list.append([xmin, xmax, conf_score])\n score_vector_list = np.stack(score_vector_list)\n video_df = pd.DataFrame(score_vector_list, columns=cols)\n video_df.to_csv(os.path.join(self.output_path, \"%s.csv\" % video_name),\n index=False)\n if batch_id % self.log_interval == 0:\n logger.info(\"Processing................ batch {}\".format(batch_id))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n # check clip index of each video\n #Stage1\n self.bmn_post_processing(self.video_dict, self.subset, self.output_path,\n self.result_path)\n if self.get_metrics:\n logger.info(\"[TEST] calculate metrics...\")\n #Stage2\n uniform_average_nr_proposals_valid, uniform_average_recall_valid, uniform_recall_valid = self.cal_metrics(" + }, + { + "comment": "This code is initializing a bmn_post_processing function that will process multiple videos in parallel using multiple processes. It creates a result dictionary and divides the video list into equal parts to assign each part to a separate process. 
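The chunking scheme used by `bmn_post_processing` — split the video list into `pp_num` roughly equal slices, hand the remainder to the last worker, and collect results through a managed dict — reduces to the following standalone sketch; the worker body is only a placeholder for the real per-video post-processing:

```python
import multiprocessing as mp

def process_videos(video_names, result_dict):
    # Placeholder for the per-video post-processing (CSV read + soft-NMS).
    for name in video_names:
        result_dict[name] = []          # the proposal list would go here

if __name__ == "__main__":
    video_list = ["v_%04d" % i for i in range(100)]
    pp_num = 4
    per_worker = int(len(video_list) / pp_num)
    result_dict = mp.Manager().dict()

    workers = []
    for tid in range(pp_num - 1):
        chunk = video_list[tid * per_worker:(tid + 1) * per_worker]
        workers.append(mp.Process(target=process_videos, args=(chunk, result_dict)))
    # The last worker picks up the remainder so no video is skipped.
    workers.append(mp.Process(
        target=process_videos,
        args=(video_list[(pp_num - 1) * per_worker:], result_dict)))

    for p in workers:
        p.start()
    for p in workers:
        p.join()
    print(len(result_dict))             # -> 100
```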
It also logs the average recall at different thresholds for different numbers of detections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":206-228", + "content": " self.ground_truth_filename,\n os.path.join(self.result_path, \"bmn_results_validation.json\"),\n max_avg_nr_proposals=100,\n tiou_thresholds=np.linspace(0.5, 0.95, 10),\n subset='validation')\n logger.info(\"AR@1; AR@5; AR@10; AR@100\")\n logger.info(\"%.02f %.02f %.02f %.02f\" %\n (100 * np.mean(uniform_recall_valid[:, 0]),\n 100 * np.mean(uniform_recall_valid[:, 4]),\n 100 * np.mean(uniform_recall_valid[:, 9]),\n 100 * np.mean(uniform_recall_valid[:, -1])))\n def bmn_post_processing(self, video_dict, subset, output_path, result_path):\n video_list = list(video_dict.keys())\n global result_dict\n result_dict = mp.Manager().dict()\n pp_num = 12\n num_videos = len(video_list)\n num_videos_per_thread = int(num_videos / pp_num)\n processes = []\n for tid in range(pp_num - 1):\n tmp_video_list = video_list[tid * num_videos_per_thread:(tid + 1) *" + }, + { + "comment": "The code creates multiple processes to handle video processing tasks in parallel, using multiprocessing. It then joins all the results together into a single output dictionary before writing it to a JSON file. This approach allows for efficient and concurrent processing of large numbers of videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":229-255", + "content": " num_videos_per_thread]\n p = mp.Process(target=self.video_process,\n args=(tmp_video_list, video_dict, output_path,\n result_dict))\n p.start()\n processes.append(p)\n tmp_video_list = video_list[(pp_num - 1) * num_videos_per_thread:]\n p = mp.Process(target=self.video_process,\n args=(tmp_video_list, video_dict, output_path,\n result_dict))\n p.start()\n processes.append(p)\n for p in processes:\n p.join()\n result_dict = dict(result_dict)\n output_dict = {\n \"version\": \"VERSION 1.3\",\n \"results\": result_dict,\n \"external_data\": {}\n }\n outfile = open(\n os.path.join(result_path, \"bmn_results_%s.json\" % subset), \"w\")\n # json.dump(output_dict, outfile)\n # in case of file name in chinese\n json.dump(output_dict, outfile, ensure_ascii=False)" + }, + { + "comment": "This function takes a list of video names, corresponding metadata dictionaries, output path, and result dictionary. 
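The conversion from a per-video proposal DataFrame to the ActivityNet-style results file is small enough to sketch in isolation; the video name, duration and proposal values below are hypothetical:

```python
import json
import pandas as pd

video_name = "v_example_clip"
video_duration = 12.0                      # seconds, hypothetical
df = pd.DataFrame({"score": [0.95, 0.80],
                   "xmin":  [0.10, 0.60],
                   "xmax":  [0.40, 0.90]})

proposal_list = []
for idx in range(min(100, len(df))):
    proposal_list.append({
        "score": df.score.values[idx],
        # clamp to [0, 1] before scaling back to seconds
        "segment": [max(0, df.xmin.values[idx]) * video_duration,
                    min(1, df.xmax.values[idx]) * video_duration],
    })

key = video_name[2:] if video_name[:2] == "v_" else video_name
output_dict = {"version": "VERSION 1.3",
               "results": {key: proposal_list},
               "external_data": {}}

with open("bmn_results_validation.json", "w") as f:
    # ensure_ascii=False keeps non-ASCII (e.g. Chinese) video names readable
    json.dump(output_dict, f, ensure_ascii=False)
```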
It processes each video by reading its CSV file, performs soft NMS if the dataframe has more than one row, calculates proposal list for each video, and appends them to the result dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":256-281", + "content": " outfile.close()\n def video_process(self,\n video_list,\n video_dict,\n output_path,\n result_dict,\n snms_alpha=0.4,\n snms_t1=0.55,\n snms_t2=0.9):\n for video_name in video_list:\n logger.info(\"Processing video........\" + video_name)\n df = pd.read_csv(os.path.join(output_path, video_name + \".csv\"))\n if len(df) > 1:\n df = soft_nms(df, snms_alpha, snms_t1, snms_t2)\n video_duration = video_dict[video_name][\"duration_second\"]\n proposal_list = []\n for idx in range(min(100, len(df))):\n tmp_prop={\"score\":df.score.values[idx], \\\n \"segment\":[max(0,df.xmin.values[idx])*video_duration, \\\n min(1,df.xmax.values[idx])*video_duration]}\n proposal_list.append(tmp_prop)\n video_name = video_name[2:] if video_name[:2] == 'v_' else video_name" + }, + { + "comment": "The code defines a function \"cal_metrics\" that takes in ground truth and proposal filenames, calculates the average recall, average proposals per video, and overall recall using ANETproposal class. This function is used to evaluate performance based on given thresholds and subsets of data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/bmn_metric.py\":282-303", + "content": " result_dict[video_name] = proposal_list\n def cal_metrics(self,\n ground_truth_filename,\n proposal_filename,\n max_avg_nr_proposals=100,\n tiou_thresholds=np.linspace(0.5, 0.95, 10),\n subset='validation'):\n anet_proposal = ANETproposal(ground_truth_filename,\n proposal_filename,\n tiou_thresholds=tiou_thresholds,\n max_avg_nr_proposals=max_avg_nr_proposals,\n subset=subset,\n verbose=True,\n check_status=False)\n anet_proposal.evaluate()\n recall = anet_proposal.recall\n average_recall = anet_proposal.avg_recall\n average_nr_proposals = anet_proposal.proposals_per_video\n return (average_nr_proposals, average_recall, recall)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4d09faa8-ce8a-428b-983c-98e28504bd5a.json b/docs/doc/4d09faa8-ce8a-428b-983c-98e28504bd5a.json new file mode 100644 index 000000000..02e1df172 --- /dev/null +++ b/docs/doc/4d09faa8-ce8a-428b-983c-98e28504bd5a.json @@ -0,0 +1,20 @@ +{ + "summary": "The PaintBoard class, derived from QWidget, handles data initialization and view size, provides methods for clearing board, changing pen attributes, painting events, and retrieving content. The code implements a mouse event handler for drawing on the board in PaintBoard mode or eraser mode based on user selection.", + "details": [ + { + "comment": "The code defines a class `PaintBoard` which inherits from `QWidget`. It initializes data such as the size of the board, an empty QPixmap, a boolean for EraserMode, and variables to store the position and pen attributes. 
Then it sets the view's fixed size based on the initialized data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/widget/PaintBoard.py\":0-39", + "content": "from PyQt5.QtWidgets import QWidget\nfrom PyQt5.Qt import QPixmap, QPainter, QPoint, QPaintEvent, QMouseEvent, QPen, \\\n QColor, QSize\nfrom PyQt5.QtCore import Qt\nclass PaintBoard(QWidget):\n def __init__(self, parent=None):\n '''\n Constructor\n '''\n super().__init__(parent)\n self.__init_data() # \u5148\u521d\u59cb\u5316\u6570\u636e\uff0c\u518d\u521d\u59cb\u5316\u754c\u9762\n self.__init_view()\n def __init_data(self):\n self.__size = QSize(810, 458)\n # \u65b0\u5efaQPixmap\u4f5c\u4e3a\u753b\u677f\uff0c\u5c3a\u5bf8\u4e3a__size\n self.__board = QPixmap(self.__size)\n self.__board.fill(Qt.transparent) # \u7528\u900f\u660e\u586b\u5145\u753b\u677f\n self.__IsEmpty = True # \u9ed8\u8ba4\u4e3a\u7a7a\u753b\u677f\n self.EraserMode = False # \u9ed8\u8ba4\u4e3a\u7981\u7528\u6a61\u76ae\u64e6\u6a21\u5f0f\n self.__lastPos = QPoint(0, 0) # \u4e0a\u4e00\u6b21\u9f20\u6807\u4f4d\u7f6e\n self.__currentPos = QPoint(0, 0) # \u5f53\u524d\u7684\u9f20\u6807\u4f4d\u7f6e\n self.__painter = QPainter() # \u65b0\u5efa\u7ed8\u56fe\u5de5\u5177\n self.__thickness = 15 # \u9ed8\u8ba4\u753b\u7b14\u7c97\u7ec6\u4e3a10px\n self.__penColor = QColor(\"black\") # \u8bbe\u7f6e\u9ed8\u8ba4\u753b\u7b14\u989c\u8272\u4e3a\u9ed1\u8272\n self.__colorList = QColor.colorNames() # \u83b7\u53d6\u989c\u8272\u5217\u8868\n def __init_view(self):\n # \u8bbe\u7f6e\u754c\u9762\u7684\u5c3a\u5bf8\u4e3a__size\n self.setFixedSize(self.__size)" + }, + { + "comment": "This code defines a class with methods to clear the paint board, change pen color and thickness, check if the board is empty, retrieve content as QImage, and handle painting events. 
The paintEvent method utilizes QPainter to draw the pixmap on the board in the correct location.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/widget/PaintBoard.py\":41-77", + "content": " def clear(self):\n # \u6e05\u7a7a\u753b\u677f\n # self.__board.fill(Qt.white)\n self.__board = QPixmap(self.__size)\n self.__board.fill(Qt.transparent) # \u7528\u900f\u660e\u586b\u5145\u753b\u677f\n self.update()\n self.__IsEmpty = True\n def change_pen_color(self, color=\"black\"):\n # \u6539\u53d8\u753b\u7b14\u989c\u8272\n # rgbaColor = QColor(255, 255, 0, 100)\n self.__penColor = QColor(color)\n def change_pen_thickness(self, thickness=10):\n # \u6539\u53d8\u753b\u7b14\u7c97\u7ec6\n self.__thickness = thickness\n def is_empty(self):\n # \u8fd4\u56de\u753b\u677f\u662f\u5426\u4e3a\u7a7a\n return self.__IsEmpty\n def get_content_as_q_image(self):\n # \u83b7\u53d6\u753b\u677f\u5185\u5bb9\uff08\u8fd4\u56deQImage\uff09\n image = self.__board.toImage()\n return image\n def paintEvent(self, paint_event):\n # \u7ed8\u56fe\u4e8b\u4ef6\n # \u7ed8\u56fe\u65f6\u5fc5\u987b\u4f7f\u7528QPainter\u7684\u5b9e\u4f8b\uff0c\u6b64\u5904\u4e3a__painter\n # \u7ed8\u56fe\u5728begin()\u51fd\u6570\u4e0eend()\u51fd\u6570\u95f4\u8fdb\u884c\n # begin(param)\u7684\u53c2\u6570\u8981\u6307\u5b9a\u7ed8\u56fe\u8bbe\u5907\uff0c\u5373\u628a\u56fe\u753b\u5728\u54ea\u91cc\n # drawPixmap\u7528\u4e8e\u7ed8\u5236QPixmap\u7c7b\u578b\u7684\u5bf9\u8c61\n self.__painter.begin(self)\n # 0,0\u4e3a\u7ed8\u56fe\u7684\u5de6\u4e0a\u89d2\u8d77\u70b9\u7684\u5750\u6807\uff0c__board\u5373\u8981\u7ed8\u5236\u7684\u56fe\n self.__painter.drawPixmap(0, 0, self.__board)\n self.__painter.end()" + }, + { + "comment": "This code implements a mouse event handler for drawing on a PaintBoard. When the mouse is pressed, the current position is saved as the previous position. As the mouse moves, it draws lines between the last and current positions based on whether eraser mode is enabled or not. Upon mouse release, the board is marked as not empty. 
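The press/move/paint pattern described in these comments can be reduced to a minimal PyQt5 widget. This is a stripped-down sketch, not the application's actual PaintBoard: it draws fixed-width black strokes onto a QPixmap and repaints on every mouse move (PyQt5 and a display are assumed):

```python
import sys
from PyQt5.QtWidgets import QApplication, QWidget
from PyQt5.QtGui import QPixmap, QPainter, QPen, QColor
from PyQt5.QtCore import Qt, QPoint, QSize

class MiniBoard(QWidget):
    """A stripped-down board: draw black lines with the mouse."""
    def __init__(self):
        super().__init__()
        self.setFixedSize(QSize(400, 300))
        self.board = QPixmap(self.size())
        self.board.fill(Qt.white)
        self.last_pos = QPoint()

    def paintEvent(self, event):
        # Blit the off-screen pixmap onto the widget.
        painter = QPainter(self)
        painter.drawPixmap(0, 0, self.board)

    def mousePressEvent(self, event):
        self.last_pos = event.pos()

    def mouseMoveEvent(self, event):
        # Draw a segment from the previous position to the current one.
        painter = QPainter(self.board)
        painter.setPen(QPen(QColor("black"), 5))
        painter.drawLine(self.last_pos, event.pos())
        painter.end()
        self.last_pos = event.pos()
        self.update()

if __name__ == "__main__":
    app = QApplication(sys.argv)
    w = MiniBoard()
    w.show()
    sys.exit(app.exec_())
```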
The drawing is updated to reflect the changes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/widget/PaintBoard.py\":79-105", + "content": " def mousePressEvent(self, mouse_event):\n # \u9f20\u6807\u6309\u4e0b\u65f6\uff0c\u83b7\u53d6\u9f20\u6807\u7684\u5f53\u524d\u4f4d\u7f6e\u4fdd\u5b58\u4e3a\u4e0a\u4e00\u6b21\u4f4d\u7f6e\n self.__currentPos = mouse_event.pos()\n self.__lastPos = self.__currentPos\n def mouseMoveEvent(self, mouse_event):\n # \u9f20\u6807\u79fb\u52a8\u65f6\uff0c\u66f4\u65b0\u5f53\u524d\u4f4d\u7f6e\uff0c\u5e76\u5728\u4e0a\u4e00\u4e2a\u4f4d\u7f6e\u548c\u5f53\u524d\u4f4d\u7f6e\u95f4\u753b\u7ebf\n self.__currentPos = mouse_event.pos()\n self.__painter.begin(self.__board)\n if self.EraserMode == False:\n # \u975e\u6a61\u76ae\u64e6\u6a21\u5f0f\n self.__painter.setPen(QPen(self.__penColor, self.__thickness)) # \u8bbe\u7f6e\u753b\u7b14\u989c\u8272\uff0c\u7c97\u7ec6\n else:\n # \u6a61\u76ae\u64e6\u6a21\u5f0f\u4e0b\u753b\u7b14\u4e3a\u7eaf\u767d\u8272\uff0c\u7c97\u7ec6\u4e3a10\n self.__painter.setPen(QPen(Qt.transparent, 10))\n # \u753b\u7ebf\n # print(self.__lastPos + self.__currentPos)\n self.__painter.drawLine(self.__lastPos, self.__currentPos)\n self.__painter.end()\n self.__lastPos = self.__currentPos\n self.update() # \u66f4\u65b0\u663e\u793a\n def mouseReleaseEvent(self, mouseEvent):\n self.__IsEmpty = False # \u753b\u677f\u4e0d\u518d\u4e3a\u7a7a" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4e27e6ff-cf4b-435a-ac68-53362cde9b5d.json b/docs/doc/4e27e6ff-cf4b-435a-ac68-53362cde9b5d.json new file mode 100644 index 000000000..40f07e82f --- /dev/null +++ b/docs/doc/4e27e6ff-cf4b-435a-ac68-53362cde9b5d.json @@ -0,0 +1,45 @@ +{ + "summary": "The Python script prepares MSRVTTDataset by importing libraries, creating a class, tokenizing captions, retrieving features, and preparing sequences for processing. It processes image data, performs array operations, pads, resizes, calculates features, and converts to float32 for training/testing.", + "details": [ + { + "comment": "The code is a Python script that imports various libraries and packages, checks for the availability of 'lmdb' library, and tries to import 'BertTokenizer' from 'paddlenlp'. It also includes license information and copyright notice.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":0-30", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\ntry:\n import lmdb\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT.\"\n )\nimport pickle\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"Warning! 
{e}, [paddlenlp] package and it's dependencies is required for ActBERT.\"\n )" + }, + { + "comment": "The code defines the `MSRVTTDataset` class for text-video clip retrieval from MSR-VTT dataset, registering it in the registry. It takes parameters such as file path, pipeline, and maximum sequence length for initializing the dataset, and provides attributes like bert model, padding index, and other dimensions for processing the dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":31-66", + "content": "from ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass MSRVTTDataset(BaseDataset):\n \"\"\"MSR-VTT dataset for text-video clip retrieval.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n features_path,\n bert_model=\"bert-base-uncased\",\n padding_index=0,\n max_seq_length=36,\n max_region_num=36,\n max_action_num=5,\n vision_feature_dim=2048,\n action_feature_dim=2048,\n spatials_dim=5,\n data_prefix=None,\n test_mode=False,\n ):\n self.features_path = features_path\n self.bert_model = bert_model\n self.padding_index = padding_index\n self.max_seq_length = max_seq_length\n self.max_region_num = max_region_num\n self._max_action_num = max_action_num\n self.vision_feature_dim = vision_feature_dim\n self.action_feature_dim = action_feature_dim\n self.spatials_dim = spatials_dim" + }, + { + "comment": "The code snippet initializes a BertTokenizer object, loads file containing video information, tokenizes each entry's caption using the initialized tokenizer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":67-92", + "content": " self._tokenizer = BertTokenizer.from_pretrained(bert_model,\n do_lower_case=True)\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n self.tokenize()\n self.gen_feature()\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n with open(self.file_path) as fin:\n self.image_entries = []\n self.caption_entries = []\n for line in fin.readlines():\n line = line.strip()\n vid_id = line.split(',')[0]\n self.image_entries.append(vid_id)\n self.caption_entries.append({\n \"caption\": line.split(',')[1],\n \"vid_id\": vid_id\n })\n self.env = lmdb.open(self.features_path)\n def tokenize(self):\n for entry in self.caption_entries:\n tokens = []\n tokens.append(\"[CLS]\")\n for token in self._tokenizer.tokenize(entry[\"caption\"]):" + }, + { + "comment": "This code is part of a class that processes video data. It appends tokens to an entry, converts tokens to ids, creates segment and input masks, and pads the sequence if necessary. 
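The tokenize-and-pad step can be reproduced in isolation. The sketch below mirrors the calls shown in the source ([CLS]/[SEP] wrapping, id conversion, right-padding to `max_seq_length`); it assumes `paddlenlp` is installed and the `bert-base-uncased` vocabulary can be downloaded, and the caption string is made up:

```python
import numpy as np
from paddlenlp.transformers import BertTokenizer

max_seq_length = 36
padding_index = 0

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

caption = "a man is playing table tennis"
tokens = ["[CLS]"] + tokenizer.tokenize(caption) + ["[SEP]"]
tokens = tokenizer.convert_tokens_to_ids(tokens)

segment_ids = [0] * len(tokens)
input_mask = [1] * len(tokens)

# Right-pad everything to a fixed length so batches can be stacked.
if len(tokens) < max_seq_length:
    padding = [padding_index] * (max_seq_length - len(tokens))
    tokens = tokens + padding
    input_mask += padding
    segment_ids += padding

tokens = np.array(tokens).astype("int64")
print(tokens.shape)          # -> (36,)
```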
The \"get_image_feature\" function retrieves image features from a database for a given video ID.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":93-119", + "content": " tokens.append(token)\n tokens.append(\"[SEP]\")\n tokens = self._tokenizer.convert_tokens_to_ids(tokens)\n segment_ids = [0] * len(tokens)\n input_mask = [1] * len(tokens)\n if len(tokens) < self.max_seq_length:\n padding = [self.padding_index\n ] * (self.max_seq_length - len(tokens))\n tokens = tokens + padding\n input_mask += padding\n segment_ids += padding\n entry[\"token\"] = np.array(tokens).astype('int64')\n entry[\"input_mask\"] = np.array(input_mask)\n entry[\"segment_ids\"] = np.array(segment_ids).astype('int64')\n def get_image_feature(self, video_id):\n video_id = str(video_id).encode()\n with self.env.begin(write=False) as txn:\n item = pickle.loads(txn.get(video_id))\n video_id = item[\"video_id\"]\n image_h = int(item[\"image_h\"])\n image_w = int(item[\"image_w\"])\n features = item[\"features\"].reshape(-1, self.vision_feature_dim)" + }, + { + "comment": "This code is resizing and calculating the image location for each box in a dataset. It also concatenates the average feature to the start of the features array, and handles reshaping action_features. The code uses numpy functions extensively for array operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":120-141", + "content": " boxes = item[\"boxes\"].reshape(-1, 4)\n num_boxes = features.shape[0]\n g_feat = np.sum(features, axis=0) / num_boxes\n num_boxes = num_boxes + 1\n features = np.concatenate(\n [np.expand_dims(g_feat, axis=0), features], axis=0)\n action_features = item[\"action_features\"].reshape(\n -1, self.action_feature_dim)\n image_location = np.zeros((boxes.shape[0], self.spatials_dim),\n dtype=np.float32)\n image_location[:, :4] = boxes\n image_location[:,\n 4] = ((image_location[:, 3] - image_location[:, 1]) *\n (image_location[:, 2] - image_location[:, 0]) /\n (float(image_w) * float(image_h)))\n image_location[:, 0] = image_location[:, 0] / float(image_w)\n image_location[:, 1] = image_location[:, 1] / float(image_h)\n image_location[:, 2] = image_location[:, 2] / float(image_w)" + }, + { + "comment": "The code defines a function that returns features, number of boxes, image location, and action features after processing an input image. It also initializes arrays for all instances of features, action features, spatial locations, and masks. 
The code then iterates over each image ID and calls another function to get the respective features, num_boxes, boxes, and action_features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":142-162", + "content": " image_location[:, 3] = image_location[:, 3] / float(image_h)\n g_location = np.array([0, 0, 1, 1, 1])\n image_location = np.concatenate(\n [np.expand_dims(g_location, axis=0), image_location], axis=0)\n return features, num_boxes, image_location, action_features\n def gen_feature(self):\n num_inst = len(self.image_entries) #1000\n self.features_all = np.zeros(\n (num_inst, self.max_region_num, self.vision_feature_dim))\n self.action_features_all = np.zeros(\n (num_inst, self._max_action_num, self.action_feature_dim))\n self.spatials_all = np.zeros(\n (num_inst, self.max_region_num, self.spatials_dim))\n self.image_mask_all = np.zeros((num_inst, self.max_region_num))\n self.action_mask_all = np.zeros((num_inst, self._max_action_num))\n for i, image_id in enumerate(self.image_entries):\n features, num_boxes, boxes, action_features = self.get_image_feature(\n image_id)" + }, + { + "comment": "The code handles the padding of features, boxes and masks for a dataset. It ensures that all sequences have the same length by padding them with zeros if necessary. The mixed features (maximum region number), boxes, and action features are assigned to respective lists. These lists will be used later in the program. The code also converts the features list to float32 data type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":164-186", + "content": " mix_num_boxes = min(int(num_boxes), self.max_region_num)\n mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim))\n mix_features_pad = np.zeros(\n (self.max_region_num, self.vision_feature_dim))\n image_mask = [1] * (int(mix_num_boxes))\n while len(image_mask) < self.max_region_num:\n image_mask.append(0)\n action_mask = [1] * (self._max_action_num)\n while len(action_mask) < self._max_action_num:\n action_mask.append(0)\n mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]\n mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]\n self.features_all[i] = mix_features_pad\n x = action_features.shape[0]\n self.action_features_all[i][:x] = action_features[:]\n self.image_mask_all[i] = np.array(image_mask)\n self.action_mask_all[i] = np.array(action_mask)\n self.spatials_all[i] = mix_boxes_pad\n self.features_all = self.features_all.astype(\"float32\")" + }, + { + "comment": "This code initializes data types and provides methods for preparing training and testing data. The `prepare_train` method is left empty, while `prepare_test` takes an index, retrieves the corresponding entry, creates a target array, and returns various data arrays to be used in testing. 
The length of the dataset is determined by the number of caption entries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/msrvtt.py\":187-219", + "content": " self.action_features_all = self.action_features_all.astype(\"float32\")\n self.image_mask_all = self.image_mask_all.astype(\"int64\")\n self.action_mask_all = self.action_mask_all.astype(\"int64\")\n self.spatials_all = self.spatials_all.astype(\"float32\")\n def prepare_train(self, idx):\n pass\n def prepare_test(self, idx):\n entry = self.caption_entries[idx]\n caption = entry[\"token\"]\n input_mask = entry[\"input_mask\"]\n segment_ids = entry[\"segment_ids\"]\n target_all = np.zeros(1000)\n for i, image_id in enumerate(self.image_entries):\n if image_id == entry[\"vid_id\"]:\n target_all[i] = 1\n return (\n caption,\n self.action_features_all,\n self.features_all,\n self.spatials_all,\n segment_ids,\n input_mask,\n self.image_mask_all,\n self.action_mask_all,\n target_all,\n )\n def __len__(self):\n return len(self.caption_entries)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4e2977e4-6b4e-4937-b357-76a12da3b124.json b/docs/doc/4e2977e4-6b4e-4937-b357-76a12da3b124.json new file mode 100644 index 000000000..16cb93e0b --- /dev/null +++ b/docs/doc/4e2977e4-6b4e-4937-b357-76a12da3b124.json @@ -0,0 +1,35 @@ +{ + "summary": "This function processes skeleton data, extracts body information, updates a dictionary with the data and returns the skeleton name, body data, and frame count. It handles missing frames and calculates motion using NTU RGB-D dataset data. Additionally, it combines and processes raw skeleton data from multiple files, updating progress, filters out missing frames, logs events, and saves the filtered data into pickle files.", + "details": [ + { + "comment": "This function gets raw bodies data from a skeleton sequence by loading the file and checking its existence. It returns a dictionary with three key-value pairs: name (skeleton filename), data (raw data of each body), and num_frames (number of valid frames).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_skes_data.py\":0-27", + "content": "# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/get_raw_skes_data.py\nimport os.path as osp\nimport os\nimport numpy as np\nimport pickle\nimport logging\ndef get_raw_bodies_data(skes_path, ske_name, frames_drop_skes,\n frames_drop_logger):\n \"\"\"\n Get raw bodies data from a skeleton sequence.\n Each body's data is a dict that contains the following keys:\n - joints: raw 3D joints positions. Shape: (num_frames x 25, 3)\n - colors: raw 2D color locations. 
Shape: (num_frames, 25, 2)\n - interval: a list which stores the frame indices of this body.\n - motion: motion amount (only for the sequence with 2 or more bodyIDs).\n Return:\n a dict for a skeleton sequence with 3 key-value pairs:\n - name: the skeleton filename.\n - data: a dict which stores raw data of each body.\n - num_frames: the number of valid frames.\n \"\"\"\n ske_file = osp.join(skes_path, ske_name + '.skeleton')\n assert osp.exists(ske_file), 'Error: Skeleton file %s not found' % ske_file" + }, + { + "comment": "Reading and processing .skeleton file data into a list, storing number of frames, ignoring frames with no bodies, extracting body IDs, and counting the number of joints for each body.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_skes_data.py\":28-57", + "content": " # Read all data from .skeleton file into a list (in string format)\n print('Reading data from %s' % ske_file[-29:])\n with open(ske_file, 'r') as fr:\n str_data = fr.readlines()\n num_frames = int(str_data[0].strip('\\r\\n'))\n frames_drop = []\n bodies_data = dict()\n valid_frames = -1 # 0-based index\n current_line = 1\n for f in range(num_frames):\n num_bodies = int(str_data[current_line].strip('\\r\\n'))\n current_line += 1\n if num_bodies == 0: # no data in this frame, drop it\n frames_drop.append(f) # 0-based index\n continue\n valid_frames += 1\n joints = np.zeros((num_bodies, 25, 3), dtype=np.float32)\n colors = np.zeros((num_bodies, 25, 2), dtype=np.float32)\n for b in range(num_bodies):\n bodyID = str_data[current_line].strip('\\r\\n').split()[0]\n current_line += 1\n num_joints = int(str_data[current_line].strip('\\r\\n')) # 25 joints\n current_line += 1\n for j in range(num_joints):" + }, + { + "comment": "This code reads data from a file, extracts joint and color information for each body, and updates or adds body data to a dictionary based on the body ID. The joint and color arrays are created using numpy functions, and the joints array is stacked along the frame order if the body's data already exists in the dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_skes_data.py\":58-75", + "content": " temp_str = str_data[current_line].strip('\\r\\n').split()\n joints[b, j, :] = np.array(temp_str[:3], dtype=np.float32)\n colors[b, j, :] = np.array(temp_str[5:7], dtype=np.float32)\n current_line += 1\n if bodyID not in bodies_data: # Add a new body's data\n body_data = dict()\n body_data['joints'] = joints[b] # ndarray: (25, 3)\n body_data['colors'] = colors[b,\n np.newaxis] # ndarray: (1, 25, 2)\n body_data['interval'] = [valid_frames\n ] # the index of the first frame\n else: # Update an already existed body's data\n body_data = bodies_data[bodyID]\n # Stack each body's data of each frame along the frame order\n body_data['joints'] = np.vstack(\n (body_data['joints'], joints[b]))\n body_data['colors'] = np.vstack(" + }, + { + "comment": "This code retrieves raw data for a specific subject's skeleton (ske_name) from the NTU RGB-D dataset. 
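The motion score mentioned here — used later to separate the real subject from spurious tracked bodies — is simply the summed variance of the stacked joint coordinates. A tiny sketch with random joints, whose shape follows the docstring above:

```python
import numpy as np

# Hypothetical stacked joints for one body: (num_frames * 25, 3), as produced
# by np.vstack-ing one (25, 3) array per frame.
num_frames = 8
joints = np.random.rand(num_frames * 25, 3).astype(np.float32)

# Motion amount: variance of every coordinate over time, summed.
motion = np.sum(np.var(joints, axis=0))
print(float(motion))
```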
It handles missing frames, calculates motion based on body data with multiple bodyIDs and returns the skeleton name, body data and updated frame count.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_skes_data.py\":76-99", + "content": " (body_data['colors'], colors[b, np.newaxis]))\n pre_frame_idx = body_data['interval'][-1]\n body_data['interval'].append(pre_frame_idx +\n 1) # add a new frame index\n bodies_data[bodyID] = body_data # Update bodies_data\n num_frames_drop = len(frames_drop)\n assert num_frames_drop < num_frames, \\\n 'Error: All frames data (%d) of %s is missing or lost' % (num_frames, ske_name)\n if num_frames_drop > 0:\n frames_drop_skes[ske_name] = np.array(frames_drop, dtype=np.int)\n frames_drop_logger.info('{}: {} frames missed: {}\\n'.format(\n ske_name, num_frames_drop, frames_drop))\n # Calculate motion (only for the sequence with 2 or more bodyIDs)\n if len(bodies_data) > 1:\n for body_data in bodies_data.values():\n body_data['motion'] = np.sum(np.var(body_data['joints'], axis=0))\n return {\n 'name': ske_name,\n 'data': bodies_data,\n 'num_frames': num_frames - num_frames_drop" + }, + { + "comment": "This function retrieves raw skeleton data from multiple files, processes it, and saves the combined data in a file. It keeps track of the number of frames for each file and prints progress updates every 1000 files processed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_skes_data.py\":100-129", + "content": " }\ndef get_raw_skes_data():\n skes_name = np.loadtxt(skes_name_file, dtype=str)\n num_files = skes_name.size\n print('Found %d available skeleton files.' % num_files)\n raw_skes_data = []\n frames_cnt = np.zeros(num_files, dtype=np.int)\n for (idx, ske_name) in enumerate(skes_name):\n bodies_data = get_raw_bodies_data(skes_path, ske_name, frames_drop_skes,\n frames_drop_logger)\n raw_skes_data.append(bodies_data)\n frames_cnt[idx] = bodies_data['num_frames']\n if (idx + 1) % 1000 == 0:\n print('Processed: %.2f%% (%d / %d)' % \\\n (100.0 * (idx + 1) / num_files, idx + 1, num_files))\n with open(save_data_pkl, 'wb') as fw:\n pickle.dump(raw_skes_data, fw, pickle.HIGHEST_PROTOCOL)\n np.savetxt(osp.join(save_path, 'raw_data', 'frames_cnt.txt'),\n frames_cnt,\n fmt='%d')\n print('Saved raw bodies data into %s' % save_data_pkl)\n print('Total frames: %d' % np.sum(frames_cnt))" + }, + { + "comment": "This code reads data from the NTU-RGB+D dataset, filters out frames with missing skeleton data, and saves it into two pickle files. The data is read from a specific path, and if the raw_data directory does not exist, it creates one. A logger for frames drop events is also set up and logs to a file. 
Finally, the code dumps the filtered frames data into another pickle file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_skes_data.py\":131-156", + "content": " with open(frames_drop_pkl, 'wb') as fw:\n pickle.dump(frames_drop_skes, fw, pickle.HIGHEST_PROTOCOL)\nif __name__ == '__main__':\n save_path = './'\n skes_path = '../ntu-rgb-d/nturgb+d_skeletons/'\n stat_path = osp.join(save_path, 'statistics')\n if not osp.exists('./raw_data'):\n os.makedirs('./raw_data')\n skes_name_file = osp.join(stat_path, 'skes_available_name.txt')\n save_data_pkl = osp.join(save_path, 'raw_data', 'raw_skes_data.pkl')\n frames_drop_pkl = osp.join(save_path, 'raw_data', 'frames_drop_skes.pkl')\n frames_drop_logger = logging.getLogger('frames_drop')\n frames_drop_logger.setLevel(logging.INFO)\n frames_drop_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'raw_data', 'frames_drop.log')))\n frames_drop_skes = dict()\n get_raw_skes_data()\n with open(frames_drop_pkl, 'wb') as fw:\n pickle.dump(frames_drop_skes, fw, pickle.HIGHEST_PROTOCOL)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4f50e070-a940-4db2-83a2-23168083178f.json b/docs/doc/4f50e070-a940-4db2-83a2-23168083178f.json new file mode 100644 index 000000000..d648769d3 --- /dev/null +++ b/docs/doc/4f50e070-a940-4db2-83a2-23168083178f.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a 3D ROI extractor class and head, performing feature extraction with optional temporal pooling. The forward method executes feature extraction based on input features, RoIs, and number of RoIs, and returns the final output after applying ROI layer and stacking features along axis 2.", + "details": [ + { + "comment": "This code imports necessary libraries and registers a new ROI (Region of Interest) extractor named \"SingleRoIExtractor3D\". This class inherits from nn.Layer and is designed to extract RoI features from a single level feature map with specific options such as roi_layer_type, featmap_stride, output_size, and sampling_ratio.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/single_straight3d.py\":0-27", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\nfrom ..registry import ROI_EXTRACTORS\nfrom .roi_extractor import RoIAlign\n@ROI_EXTRACTORS.register()\nclass SingleRoIExtractor3D(nn.Layer):\n \"\"\"Extract RoI features from a single level feature map. \"\"\"\n def __init__(self,\n roi_layer_type='RoIAlign',\n featmap_stride=16,\n output_size=16,\n sampling_ratio=0," + }, + { + "comment": "This code defines a class for a 3D head that takes input features and regions of interest (ROIs) to extract features using the RoIAlign layer. It also includes an optional temporal pooling operation and an initialization function. 
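The data flow of the extractor's forward pass (temporal mean-pooling each pathway, channel-wise concatenation, then per-frame RoI pooling stacked back along the time axis) can be sketched with numpy; all shapes are made up and the real RoIAlign layer is replaced by a stub that only reproduces the output shape:

```python
import numpy as np

# Hypothetical SlowFast-style features: (N, C, T, H, W) for slow and fast pathways.
slow = np.random.rand(1, 64, 4, 16, 16).astype("float32")
fast = np.random.rand(1, 8, 16, 16, 16).astype("float32")

# with_temporal_pool=True: average over time so both pathways share T=1,
# then merge them along the channel axis.
feat = [x.mean(axis=2, keepdims=True) for x in (slow, fast)]
feat = np.concatenate(feat, axis=1)            # (1, 72, 1, 16, 16)

def roi_align_stub(frame_feat, rois, output_size=7):
    # Stand-in for the real RoIAlign layer: one pooled map per RoI.
    return np.zeros((len(rois), frame_feat.shape[1], output_size, output_size))

rois = [[0.0, 0.0, 8.0, 8.0], [4.0, 4.0, 15.0, 15.0]]
roi_feats = []
for t in range(feat.shape[2]):                 # iterate over the (pooled) time axis
    frame_feat = feat[:, :, t]                 # (1, 72, 16, 16)
    roi_feats.append(roi_align_stub(frame_feat, rois))
ret = np.stack(roi_feats, axis=2)              # stack per-frame RoI features on axis 2
print(ret.shape)                               # -> (2, 72, 1, 7, 7)
```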
The forward method performs feature extraction given the input features, ROIs, and number of ROIs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/single_straight3d.py\":28-54", + "content": " pool_mode='avg',\n aligned=True,\n with_temporal_pool=True,\n with_global=False):\n super().__init__()\n self.roi_layer_type = roi_layer_type\n assert self.roi_layer_type in ['RoIPool', 'RoIAlign']\n self.featmap_stride = featmap_stride\n self.spatial_scale = 1. / self.featmap_stride\n self.output_size = output_size\n self.sampling_ratio = sampling_ratio\n self.pool_mode = pool_mode\n self.aligned = aligned\n self.with_temporal_pool = with_temporal_pool\n self.with_global = with_global\n self.roi_layer = RoIAlign(resolution=self.output_size,\n spatial_scale=self.spatial_scale,\n sampling_ratio=self.sampling_ratio,\n aligned=self.aligned)\n def init_weights(self):\n pass\n # The shape of feat is N, C, T, H, W\n def forward(self, feat, rois, rois_num):\n if len(feat) >= 2:" + }, + { + "comment": "This code performs temporal pooling, concatenates slow and fast features, extracts frame-wise features using index selection, squeezes the dimensions to prevent deletion when N=1, applies a ROI layer on each frame, stacks the resulting features along axis 2, and returns the final output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/single_straight3d.py\":55-78", + "content": " assert self.with_temporal_pool\n if self.with_temporal_pool:\n xi = 0\n for x in feat:\n xi = xi + 1\n y = paddle.mean(x, 2, keepdim=True)\n feat = [paddle.mean(x, 2, keepdim=True) for x in feat]\n feat = paddle.concat(feat, axis=1) # merge slow and fast\n roi_feats = []\n for t in range(feat.shape[2]):\n if type(t) == paddle.static.Variable:\n index = paddle.to_tensor(t)\n else:\n data_index = np.array([t]).astype('int32')\n index = paddle.to_tensor(data_index)\n frame_feat = paddle.index_select(feat, index, axis=2)\n frame_feat = paddle.squeeze(frame_feat,\n axis=2) #axis=2,\u907f\u514dN=1\u65f6, \u7b2c\u4e00\u7ef4\u5ea6\u88ab\u5220\u9664.\n roi_feat = self.roi_layer(frame_feat, rois, rois_num)\n roi_feats.append(roi_feat)\n ret = paddle.stack(roi_feats, axis=2)\n return ret" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5014415b-96eb-4188-85fe-6c50f490f696.json b/docs/doc/5014415b-96eb-4188-85fe-6c50f490f696.json new file mode 100644 index 000000000..ac8ba843f --- /dev/null +++ b/docs/doc/5014415b-96eb-4188-85fe-6c50f490f696.json @@ -0,0 +1,30 @@ +{ + "summary": "This code uses MeanAveragePrecisionCalculator to calculate mAP for ranked lists, initializes AveragePrecisionCalculator objects, supports interpolated precisions, and ensures shape compatibility. It averages average precisions of each class to provide the final result as mAP.", + "details": [ + { + "comment": "This code calculates the mean average precision for a ranked list of items. It provides an interface to calculate this metric for the entire list or top-n ranked items. The example usage demonstrates accumulating data in parts and then using peek_map_at_n function to calculate the final result. The provided numpy array is used for demonstration purposes, representing a ranked list of values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py\":0-26", + "content": "# Copyright 2016 Google Inc. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate the mean average precision.\nIt provides an interface for calculating mean average precision\nfor an entire list or the top-n ranked items.\nExample usages:\nWe first call the function accumulate many times to process parts of the ranked\nlist. After processing all the parts, we call peek_map_at_n\nto calculate the mean average precision.\n```\nimport random\np = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])" + }, + { + "comment": "Creates a numpy array with 1000 samples, each containing 50 binary random choices. Initializes MeanAveragePrecisionCalculator object with specified number of classes (in this case, 50). Accumulates predictions and ground truth for calculating average precision. Retrieves the average precision map at a given point in time.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py\":27-58", + "content": "a = np.array([[random.choice([0, 1]) for _ in xrange(50)]\n for _ in xrange(1000)])\n# mean average precision for 50 classes.\ncalculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(\n num_class=50)\ncalculator.accumulate(p, a)\naps = calculator.peek_map_at_n()\n```\n\"\"\"\nimport numpy\nfrom . import average_precision_calculator\nclass MeanAveragePrecisionCalculator(object):\n \"\"\"This class is to calculate mean average precision.\n \"\"\"\n def __init__(self, num_class):\n \"\"\"Construct a calculator to calculate the (macro) average precision.\n Args:\n num_class: A positive Integer specifying the number of classes.\n top_n_array: A list of positive integers specifying the top n for each\n class. The top n in each class will be used to calculate its average\n precision at n.\n The size of the array must be num_class.\n Raises:\n ValueError: An error occurred when num_class is not a positive integer;\n or the top_n_array is not a list of positive integers." + }, + { + "comment": "This code defines a class for calculating Mean Average Precision (mAP) in the context of video classification. The constructor checks if num_class is a positive integer and initializes a list to store AveragePrecisionCalculator objects. The accumulate method takes predictions and actuals as input, accumulating prediction scores with their corresponding ground truth labels. 
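A hedged usage sketch of the calculator described above, loosely following its own docstring example. The import path is assumed from the file's location, the inputs are random, and `num_positives` is passed explicitly per class rather than relying on the default branch; note that `accumulate` expects the outer dimension of `predictions`/`actuals` to index classes, hence the transpose:

```python
import numpy as np
# Assumed import path, based on the file's location in the repository.
from paddlevideo.metrics.youtube8m import mean_average_precision_calculator

num_class, num_samples = 50, 1000
predictions = np.random.random((num_samples, num_class))
actuals = np.random.randint(0, 2, size=(num_samples, num_class))

calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(
    num_class=num_class)
# Outer dimension must index classes, so transpose; num_positives is the
# per-class count of positive labels.
calculator.accumulate(predictions.T, actuals.T,
                      num_positives=actuals.sum(axis=0).tolist())
aps = calculator.peek_map_at_n()   # per-class average precision
print(float(np.mean(aps)))         # macro-averaged mAP
```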
If num_positives is provided, it represents the number of true positives for each class; otherwise, it defaults to no value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py\":59-79", + "content": " \"\"\"\n if not isinstance(num_class, int) or num_class <= 1:\n raise ValueError(\"num_class must be a positive integer.\")\n self._ap_calculators = [] # member of AveragePrecisionCalculator\n self._num_class = num_class # total number of classes\n for i in range(num_class):\n self._ap_calculators.append(\n average_precision_calculator.AveragePrecisionCalculator())\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n Args:\n predictions: A list of lists storing the prediction scores. The outer\n dimension corresponds to classes.\n actuals: A list of lists storing the ground truth labels. The dimensions\n should correspond to the predictions input. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives: If provided, it is a list of numbers representing the\n number of true positives for each class. If not provided, the number of" + }, + { + "comment": "This code calculates the mean average precision for each class in a dataset, and provides methods to clear and check if the calculators are empty. The peek_map_at_n function returns an array of non-interpolated average precisions at n for each class. It also checks for shape compatibility between predictions and actuals arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py\":80-111", + "content": " true positives will be inferred from the 'actuals' array.\n Raises:\n ValueError: An error occurred when the shape of predictions and actuals\n does not match.\n \"\"\"\n if not num_positives:\n num_positives = [None for i in predictions.shape[1]]\n calculators = self._ap_calculators\n for i in range(len(predictions)):\n calculators[i].accumulate(predictions[i], actuals[i],\n num_positives[i])\n def clear(self):\n for calculator in self._ap_calculators:\n calculator.clear()\n def is_empty(self):\n return ([calculator.heap_size for calculator in self._ap_calculators] ==\n [0 for _ in range(self._num_class)])\n def peek_map_at_n(self):\n \"\"\"Peek the non-interpolated mean average precision at n.\n Returns:\n An array of non-interpolated average precision at n (default 0) for each\n class.\n \"\"\"\n aps = [\n self._ap_calculators[i].peek_ap_at_n()\n for i in range(self._num_class)" + }, + { + "comment": "This code calculates the mean average precision (mAP) by averaging the average precisions of each class. It returns the mAP value as a result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py\":112-113", + "content": " ]\n return aps" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5040f134-25fe-4f2c-acdc-a26d37111ea0.json b/docs/doc/5040f134-25fe-4f2c-acdc-a26d37111ea0.json new file mode 100644 index 000000000..53cb0959a --- /dev/null +++ b/docs/doc/5040f134-25fe-4f2c-acdc-a26d37111ea0.json @@ -0,0 +1,20 @@ +{ + "summary": "The code repository features a PaddlePaddle implementation of Table Tennis action recognition with the VideoSwinTransformer model, supporting feature extraction, classification, single/multi-GPU training, and pre-trained models. 
Running prediction generates gif files overlaid with predictions, while optimization can be done by adjusting sampling parameters or hyperparameters.", + "details": [ + { + "comment": "This code repository contains a PaddlePaddle implementation of Table Tennis action recognition using the VideoSwinTransformer model. The code is based on PaddlePaddle 2.2, and the training data should be placed in the \"data\" directory. It includes steps for image feature extraction and action classification using SwinTransformer3D and I3DHead respectively. Training can be done with single or multi-GPU configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/ActionRecognition/README.md\":0-42", + "content": "# \u4e52\u4e53\u7403\u52a8\u4f5c\u8bc6\u522b\u6a21\u578b\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u63a8\u7406](#\u6a21\u578b\u63a8\u7406)\n- [\u6a21\u578b\u4f18\u5316](#\u6a21\u578b\u4f18\u5316)\n- [\u6a21\u578b\u90e8\u7f72](#\u6a21\u578b\u90e8\u7f72)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n\u5728\u5f00\u59cb\u4f7f\u7528\u4e4b\u524d\uff0c\u60a8\u9700\u8981\u6309\u7167\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5\u989d\u5916\u7684\u4f9d\u8d56\u5305\uff1a\n```bash\npython -m pip install imageio\n```\n## \u6a21\u578b\u7b80\u4ecb\n\u8be5\u4ee3\u7801\u5e93\u7528\u4e8e\u4e52\u4e53\u7403\u52a8\u4f5c\u8bc6\u522b, \u57fa\u4e8epaddle2.2\u7248\u672c\u5f00\u53d1\uff0c\u7ed3\u5408PaddleVideo\u4e2d\u7684VideoSwinTransformer\u6a21\u578b\uff0c\u5bf9\u7ed9\u5b9a\u7684\u4e52\u4e53\u7403\u89c6\u9891\u8fdb\u884c\u52a8\u4f5c\u5206\u7c7b\u3002\n\u4e3b\u8981\u5206\u4e3a\u5982\u4e0b\u51e0\u6b65\n - \u56fe\u50cf\u7279\u5f81\u62bd\u53d6\uff0cSwinTransformer3D\n - \u52a8\u4f5c\u5206\u7c7b\uff0cI3DHead\n## \u6570\u636e\u51c6\u5907\nTODO\n## \u6a21\u578b\u8bad\u7ec3\n\u4e3b\u8981\u4ee3\u7801\u6765\u81eaVideoSwin\u6a21\u578b\uff1a[VideoSwin](../../../docs/zh-CN/model_zoo/recognition/videoswin.md)\n1. \u4f7f\u7528VideoSwin\u5728K400\u4e0a\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u57fa\u7840\u4e0a\u8fdb\u884cfinetune\uff0c\u56e0\u6b64\u9996\u5148\u4e0b\u8f7dK400\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u5e76\u653e\u7f6e\u5230`data`\u76ee\u5f55\u4e0b\n ```bash\n wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams\n ```\n2. \u4f7f\u7528`TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml`\u914d\u7f6e\u6587\u4ef6\u8fdb\u884c\u8bad\u7ec3\n \u8bad\u7ec3\u542f\u52a8\u547d\u4ee4\u5982\u4e0b\uff1a\n ```bash\n # \u5355\u5361\n python3.7 -u main.py --amp --validate -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml\n # \u591a\u5361\n python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_tabletennis main.py --amp --validate -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml" + }, + { + "comment": "This code snippet is for training and testing the VideoSwin transformer model on the TableTennis dataset using PaddlePaddle. 
It provides instructions to download pre-trained models and example input video files, and demonstrates how to export an inference model using the provided config file and model parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/ActionRecognition/README.md\":43-65", + "content": " ```\n## \u6a21\u578b\u8bc4\u4f30\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_tabletennis main.py --test -c configs/recognition/video_swin_transformer/videoswin_tabletennis.yaml -w \"output/VideoSwin_TableTennis/VideoSwin_TableTennis_best.pdparams\"\n```\n## \u6a21\u578b\u63a8\u7406\n\u6211\u4eec\u63d0\u4f9b\u4e86\u4e00\u4e2a\u5728\u4e52\u4e53\u7403\u6570\u636e\u96c6\u4e0a\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u4ee5\u53ca\u4e00\u4e2a\u4e52\u4e53\u7403\u6837\u4f8b\u7684\u89c6\u9891pkl\u6587\u4ef6\uff0c\u4ee5\u4f9b\u6d4b\u8bd5\n```\nwget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_tennis.pdparams # \u4e0b\u8f7d\u4e52\u4e53\u7403\u6570\u636e\u96c6\u4e0a\u8bad\u7ec3\u597d\u7684\u6a21\u578b\nwget -P data/ https://videotag.bj.bcebos.com/Data/example_tennis.pkl # \u4e0b\u8f7d\u4e52\u4e53\u7403\u6837\u4f8b\u8f93\u5165\u89c6\u9891pkl\u6587\u4ef6\n```\n### \u5bfc\u51fa\u63a8\u7406\u6a21\u578b\n```\npython3.7 tools/export_model.py -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml \\\n -p output/VideoSwin_TableTennis/VideoSwin_TableTennis_best.pdparams \\\n -o inference/VideoSwin_TableTennis\n```\n\u4e0a\u8ff0\u547d\u4ee4\u4f1a\u6839\u636e\u4f20\u5165\u7684`.pdparams`\u6a21\u578b\uff0c\u5728`inference/VideoSwin_TableTennis`\u6587\u4ef6\u5939\u4e0b\u751f\u6210\u63a8\u7406\u6a21\u578b\uff0c\u4e3b\u8981\u5305\u62ec3\u4e2a\u6587\u4ef6\uff1a`VideoSwin_TableTennis.pdiparams`\u3001`VideoSwin_TableTennis.pdmodel`\u3001`VideoSwin_TableTennis.info`" + }, + { + "comment": "Running prediction code with provided arguments will generate a gif file showing the video overlaid with predicted results (top 1 class and probability) in the results folder.\nThe model can be optimized based on video content, by adjusting sampling parameters like num_seg and seg_len or hyperparameters for training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/ActionRecognition/README.md\":67-97", + "content": "### \u4f7f\u7528\u63a8\u7406\u6a21\u578b\n\u6d4b\u8bd5\u6587\u4ef6\u4f7f\u7528`.pkl`\u6587\u4ef6\uff0c\u5176\u5305\u542b\u4e86\u5df2\u62bd\u53d6\u7684\u7528\u4e8e\u9884\u6d4b\u7684\u4e52\u4e53\u7403\u89c6\u9891\u5e27\u3002\n\u8fd0\u884c\u9884\u6d4b\u4ee3\u7801\n```bash\npython3.7 tools/predict.py --input_file data/example_tennis_7.pkl \\\n --config applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml \\\n --model_file inference/VideoSwin_TableTennis/VideoSwin_TableTennis.pdmodel \\\n --params_file inference/VideoSwin_TableTennis/VideoSwin_TableTennis.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n\u6267\u884c\u4ee5\u4e0a\u547d\u4ee4\u4f1a\u4ea7\u51fa\u4e00\u4e2a\u539f\u89c6\u9891\u53e0\u52a0\u9884\u6d4b\u7ed3\u679c\u6587\u672c(Top1\u7c7b\u522b+\u6982\u7387)\u7684gif\u56fe\u7247\uff0c\u4fdd\u5b58\u5728\u672c\u76ee\u5f55\u7684results\u6587\u4ef6\u5939\u4e0b\uff0cgif\u6587\u4ef6\u540d\u4e0e\u8f93\u5165\u7684pkl\u6587\u4ef6\u540d\u76f8\u540c\u3002\n\u6548\u679c\u5982\u4e0b\u56fe\uff1a\n![example_7.gif](results/example_tennis_7.gif)\n## 
\u6a21\u578b\u4f18\u5316\n\u5728\u5b9e\u9645\u4f7f\u7528\u573a\u666f\u4e2d\u53ef\u6839\u636e\u89c6\u9891\u5185\u5bb9\u5c1d\u8bd5\u4f18\u5316\u7b56\u7565\n- \u53ef\u6839\u636e\u52a8\u4f5c\u6301\u7eed\u65f6\u95f4\u7684\u957f\u77ed\uff0c\u8c03\u6574\u91c7\u6837\u7684\u6bb5\u6570num_seg\u548c\u6bb5\u5185\u91c7\u6837\u7684\u5e27\u6570seg_len\n- \u53ef\u4ee5\u6839\u636e\u6570\u636e\u96c6\u5927\u5c0f\u8c03\u6574\u6a21\u578b\u8bad\u7ec3\u7684\u8d85\u53c2\u6570\uff0c\u5305\u62ec\u6743\u91cd\u8870\u51cf\u3001DropOut\u6982\u7387\u3001\u5b66\u4e60\u7387\u3001\u66f4\u6362\u4f18\u5316\u5668\u7b49\uff0c\u4ee5\u83b7\u5f97\u66f4\u4f18\u7684\u7ed3\u679c\u3002\n- \u672c\u4ee3\u7801\u7684backbone\u90e8\u5206\u53ef\u4ee5\u4f5c\u4e3a\u89c6\u9891\u7279\u5f81\u63d0\u53d6\u6a21\u5757\uff0c\u4ee3\u66ff\u5176\u5b83\u7684\u52a8\u4f5c\u8bc6\u522bbackbone\uff0c\u4ee5\u83b7\u5f97\u8868\u5f81\u80fd\u529b\u66f4\u5f3a\u7684\u89c6\u9891\u7279\u5f81\uff0c\u4ee5\u63d0\u5347\u6574\u4f53\u4efb\u52a1\u7684\u7cbe\u5ea6\u3002\n## \u6a21\u578b\u90e8\u7f72\nTODO\n## \u53c2\u8003\u8bba\u6587\n- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei" + } + ] +} \ No newline at end of file diff --git a/docs/doc/50866672-274d-4c7f-94c1-0ff51774db70.json b/docs/doc/50866672-274d-4c7f-94c1-0ff51774db70.json new file mode 100644 index 000000000..162700dab --- /dev/null +++ b/docs/doc/50866672-274d-4c7f-94c1-0ff51774db70.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports functions from submodules \"optimizer\" and \"lr\" to build optimizer and learning rate for PaddleVideo's Video Quality Assessment application.", + "details": [ + { + "comment": "This code imports functions from submodules \"optimizer\" and \"lr\" to build optimizer and learning rate for PaddleVideo's Video Quality Assessment application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py\":0-16", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .optimizer import build_optimizer\nfrom .lr import build_lr" + } + ] +} \ No newline at end of file diff --git a/docs/doc/50c4b9b3-d253-4b49-96f6-0c5f6f76e603.json b/docs/doc/50c4b9b3-d253-4b49-96f6-0c5f6f76e603.json new file mode 100644 index 000000000..20a4307bc --- /dev/null +++ b/docs/doc/50c4b9b3-d253-4b49-96f6-0c5f6f76e603.json @@ -0,0 +1,15 @@ +{ + "summary": "The BaseMetric class serves as a foundation for various video quality assessment metrics, requiring subclasses to implement the update and overridden methods. It utilizes numpy, paddle, and PaddleVideo's utils for data manipulation and distribution information.", + "details": [ + { + "comment": "This Python class, named BaseMetric, is a base class for different video quality assessment metrics. It initializes with data size, batch size, log interval, and optional keyword arguments. 
The update method must be implemented by subclasses to update the metric values. The class also has abstract methods that must be overridden in subclasses for actual functionality. It utilizes numpy, paddle, and PaddleVideo's utils for data manipulation and distribution information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/base.py\":0-35", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nfrom abc import abstractmethod\nimport numpy as np\nimport paddle\nfrom paddlevideo.utils import get_dist_info\nfrom .registry import METRIC\nclass BaseMetric(object):\n \"\"\"Base Metric\"\"\"\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n self.data_size = data_size\n self.batch_size = batch_size\n _, self.world_size = get_dist_info()\n self.log_interval = log_interval\n @abstractmethod\n def update(self):\n \"\"\"update\"\"\"\n raise NotImplementedError\n @abstractmethod" + }, + { + "comment": "This code defines an \"accumulate\" method in a base class, but it raises a NotImplementedError to indicate that subclasses must override this method with their own implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/base.py\":36-38", + "content": " def accumulate(self):\n \"\"\"accumulate\"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/50c87944-4812-4383-b04b-bddfb3fcb327.json b/docs/doc/50c87944-4812-4383-b04b-bddfb3fcb327.json new file mode 100644 index 000000000..57986ef66 --- /dev/null +++ b/docs/doc/50c87944-4812-4383-b04b-bddfb3fcb327.json @@ -0,0 +1,20 @@ +{ + "summary": "This code installs PaddleVideo's VideoTag app, provides instructions for data preparation and model inference, and represents a dictionary containing information about classified video objects.", + "details": [ + { + "comment": "This code provides installation instructions for PaddleVideo's VideoTag application, including dependencies and downloading pre-trained weights. 
It also outlines the data preparation process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/Run.md\":0-53", + "content": "# \u6837\u4f8b\u4ee3\u7801\u8fd0\u884c\u6307\u5357\n---\n## \u5185\u5bb9\n\u53c2\u8003\u672c\u6587\u6863\uff0c\u60a8\u53ef\u4ee5\u5feb\u901f\u719f\u6089VideoTag\u7684\u4f7f\u7528\u65b9\u6cd5\uff0c\u89c2\u5bdfVideoTag\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u5728\u793a\u4f8b\u89c6\u9891\u4e0a\u7684\u9884\u6d4b\u7ed3\u679c\u3002\n\u6587\u6863\u5185\u5bb9\u5305\u62ec:\n- [\u5b89\u88c5\u8bf4\u660e](#\u5b89\u88c5\u8bf4\u660e)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u63a8\u65ad](#\u6a21\u578b\u63a8\u65ad)\n## \u5b89\u88c5\u8bf4\u660e\n### \u73af\u5883\u4f9d\u8d56\uff1a\n```\n CUDA >= 9.0\n cudnn >= 7.5\n```\n### \u4f9d\u8d56\u5b89\u88c5:\n- 1.7.0 <= PaddlePaddle\u7248\u672c <= 2.0.0: pip install paddlepaddle-gpu==1.8.4.post97 -i https://mirror.baidu.com/pypi/simple\n- opencv\u7248\u672c >= 4.1.0: pip install opencv-python==4.2.0.32\n## \u6570\u636e\u51c6\u5907\n### \u9884\u8bad\u7ec3\u6743\u91cd\u4e0b\u8f7d\n\u6211\u4eec\u63d0\u4f9b\u4e86[TSN](https://videotag.bj.bcebos.com/video_tag_tsn.tar)\u548c[AttentionLSTM](https://videotag.bj.bcebos.com/video_tag_lstm.tar)\u9884\u8bad\u7ec3\u6743\u91cd\uff0c\u8bf7\u5728video\\_tag\u76ee\u5f55\u4e0b\u65b0\u5efaweights\u76ee\u5f55\uff0c\u5e76\u5c06\u4e0b\u8f7d\u89e3\u538b\u540e\u7684\u53c2\u6570\u6587\u4ef6\u653e\u5728weights\u76ee\u5f55\u4e0b:\n```\n mkdir weights\n cd weights\n wget https://videotag.bj.bcebos.com/video_tag_tsn.tar\n wget https://videotag.bj.bcebos.com/video_tag_lstm.tar\n tar -zxvf video_tag_tsn.tar\n tar -zxvf video_tag_lstm.tar\n rm video_tag_tsn.tar -rf\n rm video_tag_lstm.tar -rf\n mv video_tag_tsn/* .\n mv attention_lstm/* .\n rm video_tag_tsn/ -rf\n rm attention_lstm -rf\n```\n\u6240\u5f97\u76ee\u5f55\u7ed3\u6784\u5982\u4e0b\uff1a\n```\nvideo_tag\n \u251c\u2500\u2500weights\n \u251c\u2500\u2500 attention_lstm.pdmodel\n \u251c\u2500\u2500 attention_lstm.pdopt " + }, + { + "comment": "This code provides instructions on how to download an example video for testing, how to run model inference, and how to save the results. The example video can be downloaded from a provided link and should be extracted into a specific directory structure. The model inference script is named videotag_test.py and prints prediction probabilities. Users can specify a different output directory using the --save\\_dir parameter. 
The predictions are saved as JSON files in the specified directory, with each file corresponding to a video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/Run.md\":54-104", + "content": " \u251c\u2500\u2500 attention_lstm.pdparams\n \u251c\u2500\u2500 tsn.pdmodel\n \u251c\u2500\u2500 tsn.pdopt\n \u2514\u2500\u2500 tsn.pdparams\n```\n### \u793a\u4f8b\u89c6\u9891\u4e0b\u8f7d\n\u6211\u4eec\u63d0\u4f9b\u4e86[\u6837\u4f8b\u89c6\u9891](https://videotag.bj.bcebos.com/mp4.tar)\u65b9\u4fbf\u7528\u6237\u6d4b\u8bd5\uff0c\u8bf7\u4e0b\u8f7d\u540e\u89e3\u538b\uff0c\u5e76\u5c06\u89c6\u9891\u6587\u4ef6\u653e\u7f6e\u5728video\\_tag/data/mp4\u76ee\u5f55\u4e0b:\n```\n cd data/\n wget https://videotag.bj.bcebos.com/mp4.tar\n tar -zxvf mp4.tar\n rm mp4.tar -rf\n```\n\u6240\u5f97\u76ee\u5f55\u7ed3\u6784\u5982\u4e0b\uff1a\n```\nvideo_tag\n \u251c\u2500\u2500data\n \u251c\u2500\u2500 mp4\n \u251c\u2500\u2500 1.mp4\n \u251c\u2500\u2500 2.mp4\n \u2514\u2500\u2500 ...\n```\n## \u6a21\u578b\u63a8\u65ad\n\u6a21\u578b\u63a8\u65ad\u7684\u542f\u52a8\u65b9\u5f0f\u5982\u4e0b\uff1a\n python videotag_test.py\n- \u9884\u6d4b\u7ed3\u679c\u4f1a\u4ee5\u65e5\u5fd7\u65b9\u5f0f\u6253\u5370\uff0c\u793a\u4f8b\u5982\u4e0b:\n```\n[========video_id [ data/mp4/1.mp4 ] , topk(20) preds: ========]\nclass_id: 3110, class_name: \u8bad\u7ec3 , probability: 0.97730666399\nclass_id: 2159, class_name: \u8e72 , probability: 0.945082366467\n...\n[========video_id [ data/mp4/2.mp4 ] , topk(20) preds: ========]\nclass_id: 2773, class_name: \u821e\u8e48 , probability: 0.850423932076\nclass_id: 1128, class_name: \u8868\u6f14\u827a\u672f , probability: 0.0446354188025\n...\n```\n- \u901a\u8fc7--save\\_dir\u53ef\u6307\u5b9a\u9884\u6d4b\u7ed3\u679c\u5b58\u50a8\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3avideo\\_tag/data/VideoTag\\_results\uff0c\u4e0d\u540c\u8f93\u5165\u89c6\u9891\u7684\u9884\u6d4b\u7ed3\u679c\u5206\u6587\u4ef6\u4fdd\u5b58\u5728\u4e0d\u540c\u7684json\u6587\u4ef6\u4e2d\uff0c\u6587\u4ef6\u7684\u5185\u5bb9\u683c\u5f0f\u4e3a\uff1a\n```\n [file_path,\n {\"class_name\": class_name1, \"probability\": probability1, \"class_id\": class_id1}," + }, + { + "comment": "This code represents a dictionary containing information about a classified video object. The 'class_name' key holds the name of the class, 'probability' stores the confidence level of the classification, and 'class_id' contains the identifier of the recognized class. These dictionaries are stored in an array, potentially for multiple classifications within the same video or different videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/Run.md\":105-108", + "content": " {\"class_name\": class_name2, \"probability\": probability2, \"class_id\": class_id2},\n ...\n ]\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/50ce1e9f-ecb1-49a0-9e7f-bab5a7669020.json b/docs/doc/50ce1e9f-ecb1-49a0-9e7f-bab5a7669020.json new file mode 100644 index 000000000..bc50443c9 --- /dev/null +++ b/docs/doc/50ce1e9f-ecb1-49a0-9e7f-bab5a7669020.json @@ -0,0 +1,25 @@ +{ + "summary": "The code downloads, extracts and provides label information for 19228 videos' feature frames in \"activitynet_1.3_annotations.json\" for PaddleVideo model pre-training, using decompressed data from \"bmn_feat.tar.gz\". Users need to modify `feat_path` and `file_path` in the configuration file.", + "details": [ + { + "comment": "ActivityNet is a large-scale dataset for video understanding tasks like action localization and recognition. 
The code provides instructions on how to download the processed ActivityNet 1.3 dataset, consisting of videos with corresponding labels, durations, and frames. Users can choose between two methods: downloading precompressed packages or clicking provided hyperlinks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ActivityNet.md\":0-23", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/ActivityNet.md) | English\n# ActivityNet data preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n## Introduction\nActivityNet is a dataset for large-scale video understanding tasks, which can be used for tasks such as action localization, action recognition, etc.\n## Download\n1. The BMN model uses the processed ActivityNet 1.3 dataset. There are two ways to use it:\n - Using our processed ActivityNet 1.3 dataset (compressed package is about 5.5G), each video has corresponding action labels, duration intervals, duration frames, duration seconds and other information\n Download with the following command:\n ```bash\n wget https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz # Download the processed video feature data\n wget https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json # Download the processed label data\n ```\n Or click the following hyperlinks to download:\n [Video feature data](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz)" + }, + { + "comment": "The code is explaining how to download and extract video feature data from the \"activitynet_1.3_annotations.json\" file for a model in PaddleVideo. It mentions decompressing the \"bmn_feat.tar.gz\" file, extracting features by yourself using TSN, and providing the necessary files and instructions to download and pre-train the TSN model. The \"activitynet_1.3_annotations.json\" file contains information about video annotations for training purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ActivityNet.md\":24-44", + "content": " [Video feature data](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json)\n then decompression `bmn_feat.tar.gz`\n ```bash\n tar -xf bmn_feat.tar.gz\n ```\n - Extract features by yourself\n First refer to [Download Instructions](https://github.com/activitynet/ActivityNet/tree/master/Crawler) to download the original dataset. When training this model, you need to use TSN to extract features from the source files first. You can [self-extract](https://github.com/yjxiong/temporal-segment-networks) video frame and optical flow information, and the pre-trained TSN model can be downloaded from [here](https://github.com/ yjxiong/anet2016-cuhk) download.\n The information in the `activitynet_1.3_annotations.json` tag file is as follows:\n ```json\n {\n \"v_QOlSCBRmfWY\": {\n \"duration_second\": 82.73,\n \"subset\": \"training\",\n \"duration_frame\": 2067,\n \"annotations\": [{\n \"segment\": [6.195294851794072, 77.73085420904837]," + }, + { + "comment": "The code represents a dictionary containing label information and video feature frame data for 19228 videos. Each key represents a video, and the corresponding value is another dictionary with 'duration_second', 'subset', 'duration_frame', 'feature_frame' keys, and an array of 'annotations' which includes 'segment' (time range) and 'label' information. 
The code also mentions that there will be 19228 video feature npy files obtained from the activitynet_1.3_annotations.json file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ActivityNet.md\":45-76", + "content": " \"label\": \"Ballet\"\n }],\n \"feature_frame\": 2064\n },\n \"v_ehGHCYKzyZ8\": {\n \"duration_second\": 61.7189999999999994,\n \"subset\": \"training\",\n \"duration_frame\": 1822,\n \"annotations\": [{\n \"segment\": [43.95990729267573, 45.401932082395355],\n \"label\": \"Doing crunches\"\n }],\n \"feature_frame\": 1808\n },\n ...,\n ...\n }\n ```\n In the end, `19228` video feature npy files are obtained, corresponding to the `19228` label information in the `activitynet_1.3_annotations.json` file.\n2. Create a new `data/bmn_data` folder, and then unzip the video feature data after downloading and put it in this folder, and finally it should be organized into the following form:\n ```\n PaddleVideo\n \u251c\u2500\u2500 data\n \u2502 \u251c\u2500\u2500 bmn_data\n \u2502 \u2502 \u251c\u2500\u2500 fix_feat_100\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 v___c8enCfzqw.npy\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 v___dXUJsj3yo.npy\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 ...\n \u2502 \u2502 \u2502\n \u2502 \u2502 \u2514\u2500\u2500 activitynet_1.3_annotations.json" + }, + { + "comment": "In the code, it is instructing to modify two fields in the configuration file. The `feat_path` field needs updating with the feature directory path, and the `file_path` should be specified for the label file path. This ensures proper data access during program execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ActivityNet.md\":77-79", + "content": " ```\n3. Finally, modify the `feat_path` field in the configuration file configs/localization/bmn.yaml to specify the feature directory path, and the `file_path` field to specify the label file path." + } + ] +} \ No newline at end of file diff --git a/docs/doc/51e06ebf-f1dc-4c8b-b8ca-8d4734756a79.json b/docs/doc/51e06ebf-f1dc-4c8b-b8ca-8d4734756a79.json new file mode 100644 index 000000000..28c346f0a --- /dev/null +++ b/docs/doc/51e06ebf-f1dc-4c8b-b8ca-8d4734756a79.json @@ -0,0 +1,45 @@ +{ + "summary": "Two learning rate scheduler classes, CustomWarmupCosineDecay and CosineAnnealingDecay, are provided for optimizing models with warm-up and stepwise cosine decay. The `CustomWarmupPiecewiseDecay` class is a custom scheduler for PaddleVideo, implementing piecewise function and warmup phase with linear decay.", + "details": [ + { + "comment": "This code defines a custom learning rate scheduler called CustomWarmupCosineDecay, which combines warm-up and stepwise cosine decay for optimizing models. It extends the LRScheduler class and allows users to define specific start learning rates and the number of epochs for warm-up before applying stepwise cosine decay.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":0-32", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport math\nfrom paddle.optimizer.lr import *\n\"\"\"\nPaddleVideo Learning Rate Schedule:\nYou can use paddle.optimizer.lr\nor define your custom_lr in this file.\n\"\"\"\nclass CustomWarmupCosineDecay(LRScheduler):\n \"\"\"\n We combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup." + }, + { + "comment": "This code defines a class \"CosineAnnealingDecay\" for scheduling the learning rate. It takes parameters such as base learning rate, total epochs, number of iterations per epoch, and initializes instance variables accordingly. The step() method will update the last_lr/last_epoch/base_lr based on the provided parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":33-54", + "content": " cosine_base_lr (float|int, optional): base learning rate in cosine schedule.\n max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .\n Returns:\n ``CosineAnnealingDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n cosine_base_lr,\n max_epoch,\n num_iters,\n last_epoch=-1,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.cosine_base_lr = cosine_base_lr\n self.max_epoch = max_epoch\n self.num_iters = num_iters\n #call step() in base class, last_lr/last_epoch/base_lr will be update" + }, + { + "comment": "The code defines a CustomWarmupCosineDecay class that extends the Optimizer. It has an __init__ method to initialize last_epoch and verbose, and a step method to update learning rate based on current epoch. The step method also handles cases where epoch is None or provided manually. Additionally, there is a _lr_func_cosine method for calculating the learning rate using a cosine function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":55-80", + "content": " super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch,\n verbose=verbose)\n def step(self, epoch=None):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. 
Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if self.last_epoch == -1:\n self.last_epoch += 1\n else:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch):" + }, + { + "comment": "This code defines a custom learning rate (LR) scheduler that combines warmup and stepwise-cosine decay. It starts with a warmup phase, then uses a cosine annealing LR schedule. The `get_lr` function calculates the current learning rate based on the current epoch, maximum epoch, warmup epochs, and other parameters. This scheduler is used in the \"slowfast\" model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":81-107", + "content": " \"\"\"start to cosine\"\"\"\n return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) +\n 1.0) * 0.5\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr,\n self.max_epoch)\n lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr,\n self.max_epoch)\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n return lr\nclass CustomWarmupPiecewiseDecay(LRScheduler):\n \"\"\"\n This op combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup.\n step_base_lr (float|int, optional): base learning rate in step schedule." + }, + { + "comment": "This code defines a class `CustomWarmupPiecewiseDecay` for scheduling learning rates. It takes several parameters like warmup start lr, warmup epochs, step base lr, lrs (list of lr values), gamma, steps, max_epoch, num_iters, last_epoch and verbose. The constructor initializes the class with these parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":108-133", + "content": " max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .\n Returns:\n ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n step_base_lr,\n lrs,\n gamma,\n steps,\n max_epoch,\n num_iters,\n last_epoch=0,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.step_base_lr = step_base_lr\n self.lrs = lrs\n self.gamma = gamma\n self.steps = steps\n self.max_epoch = max_epoch\n self.num_iters = num_iters" + }, + { + "comment": "This code defines a custom learning rate scheduler for optimizers, allowing the learning rate to be updated based on epochs. 
The `step` function is used to update the learning rate, and the `_lr_func_steps_with_relative_lrs` function seems to set the learning rates for each parameter group.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":134-159", + "content": " self.last_epoch = last_epoch\n self.last_lr = self.warmup_start_lr # used in first iter\n self.verbose = verbose\n self._var_name = None\n def step(self, epoch=None, rebuild=False):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if not rebuild:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps," + }, + { + "comment": "This code defines a custom learning rate (LR) scheduler for the PaddleVideo library. It uses a piecewise function to define different LRs at various epochs, and also implements a warmup phase with a linear decay from an initial LR to the first defined LR after the warmup period. The code provides functions to calculate the LR at each epoch based on the given steps, LR values, base LR, and maximum epoch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":160-195", + "content": " max_epoch):\n \"\"\"lr func steps with relative lrs\"\"\"\n # get step index\n steps = steps + [max_epoch]\n for ind, step in enumerate(steps):\n if cur_epoch < step:\n break\n return lrs[ind - 1] * base_lr\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_steps_with_relative_lrs(\n self.last_epoch,\n self.lrs,\n self.step_base_lr,\n self.steps,\n self.max_epoch,\n )\n lr_end = self._lr_func_steps_with_relative_lrs(\n self.warmup_epochs,\n self.lrs,\n self.step_base_lr,\n self.steps,\n self.max_epoch,\n )\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n return lr\nclass CustomPiecewiseDecay(PiecewiseDecay):" + }, + { + "comment": "This code defines a custom learning rate scheduler, which initializes an instance of the class and takes keyword arguments. The 'num_iters' argument is specifically excluded from being passed as a parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py\":196-200", + "content": " \"\"\"CustomPiecewiseDecay\"\"\"\n def __init__(self, **kargs):\n \"\"\"start\"\"\"\n kargs.pop('num_iters')\n super().__init__(**kargs)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/526a992a-fb80-424c-8273-2748fcece73e.json b/docs/doc/526a992a-fb80-424c-8273-2748fcece73e.json new file mode 100644 index 000000000..9945041a6 --- /dev/null +++ b/docs/doc/526a992a-fb80-424c-8273-2748fcece73e.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports the function 'get_metrics' from the 'metrics_util' module in the same application directory. 
This function is likely used to calculate and retrieve various metrics related to video processing or analysis.", + "details": [ + { + "comment": "This code imports the function 'get_metrics' from the 'metrics_util' module in the same application directory. This function is likely used to calculate and retrieve various metrics related to video processing or analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/__init__.py\":0-0", + "content": "from .metrics_util import get_metrics" + } + ] +} \ No newline at end of file diff --git a/docs/doc/52e7b804-02ff-454a-9a3d-fc306cd4077e.json b/docs/doc/52e7b804-02ff-454a-9a3d-fc306cd4077e.json new file mode 100644 index 000000000..e19be9747 --- /dev/null +++ b/docs/doc/52e7b804-02ff-454a-9a3d-fc306cd4077e.json @@ -0,0 +1,15 @@ +{ + "summary": "The code loads an action detection model, reads a list of videos from a URL file, and for each video, it prints its name, creates paths for image frames and audio, calls the infer function to get BMN temporal proposal results (bmn_results) and action results, stores them in a list named 'results', and writes JSON data to \"results.json\" file.", + "details": [ + { + "comment": "This code imports the necessary libraries and sets the path to access an action detection model. The model is loaded and a list of videos is read from a URL file. For each video, its name is printed, the required paths for image frames and audio are created, and the action detection model's infer function is called to get BMN temporal proposal results (bmn_results) and action results. These results are then stored in a list named 'results'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/predict.py\":0-32", + "content": "import os\nimport sys\nimport json\nsys.path.append('action_detect')\nfrom action import ActionDetection\nif __name__ == '__main__':\n    #dataset_dir = \"/workspace/PaddleVideo/applications/FootballAction/datasets/EuroCup2016\"\n    dataset_dir = \"../datasets/EuroCup2016\"\n    model_predict = ActionDetection(cfg_file=\"./configs/configs.yaml\")\n    model_predict.load_model()\n    video_url = os.path.join(dataset_dir, 'url_val.list')\n    with open(video_url, 'r') as f:\n        lines = f.readlines()\n        lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n    results = []\n    for line in lines:\n        video_name = line\n        print(video_name)\n        imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n        pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n        bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n        results.append({\n            'video_name': line,\n            'bmn_results': bmn_results,\n            'action_results': action_results\n        })" + }, + { + "comment": "Writes JSON data to \"results.json\" file, ensuring UTF-8 encoding and readable indentation for improved readability.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/predict.py\":34-36", + "content": "    with open('results.json', 'w', encoding='utf-8') as f:\n        data = json.dumps(results, indent=4, ensure_ascii=False)\n        f.write(data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/52ee5f2d-38b9-4f24-b70a-c634554414e2.json b/docs/doc/52ee5f2d-38b9-4f24-b70a-c634554414e2.json new file mode 100644 index 000000000..9cd6ca3ce --- /dev/null +++ b/docs/doc/52ee5f2d-38b9-4f24-b70a-c634554414e2.json @@ -0,0 +1,10 @@ +{ + "summary": "This code contains the version information for PaddleVideo, licensed under 
the Apache License 2.0, and defines the current version as \"0.0.1\".", + "details": [ + { + "comment": "This code contains the version information for PaddleVideo, licensed under the Apache License 2.0, and defines the current version as \"0.0.1\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/version.py\":0-15", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n__all__ = [\"paddlevideo_version\"]\npaddlevideo_version = \"0.0.1\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5319de35-5759-4c56-a6a3-0b026f570722.json b/docs/doc/5319de35-5759-4c56-a6a3-0b026f570722.json new file mode 100644 index 000000000..ec7b3da97 --- /dev/null +++ b/docs/doc/5319de35-5759-4c56-a6a3-0b026f570722.json @@ -0,0 +1,20 @@ +{ + "summary": "The code initializes and defines a class for MSR-VTT retrieval metrics computation, updates metrics using input data, calculates rank metrics (r1, r5, r10, medr, meanr), logs these metrics, and signals the end of iterations.", + "details": [ + { + "comment": "The code is from the MSRVTTMetric class in paddlevideo's metrics module. It initializes an instance of the class, prepares for metrics computation, and creates score_matrix and target_matrix using numpy with zeroes. These matrices will be used to store results during metric calculations. The class is registered in the METRIC registry and inherits from the BaseMetric base class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/msrvtt_metric.py\":0-30", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass MSRVTTMetric(BaseMetric):\n    def __init__(self, data_size, batch_size, log_interval=1):\n        \"\"\"prepare for metrics\n        \"\"\"\n        super().__init__(data_size, batch_size, log_interval)\n        self.score_matrix = np.zeros((data_size, data_size))\n        self.target_matrix = np.zeros((data_size, data_size))" + }, + { + "comment": "This code initializes a rank matrix, updates score and target matrices based on input data, calculates r1, r5, r10 rank metrics and median rank (medr) and mean rank, then logs these metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/msrvtt_metric.py\":31-55", + "content": "        self.rank_matrix = np.ones((data_size)) * data_size\n    def update(self, batch_id, data, outputs):\n        \"\"\"update metrics during each iter\n        \"\"\"\n        target = data[-1]\n        cm_logit = outputs[-1]\n        self.score_matrix[batch_id, :] = F.softmax(\n            cm_logit, axis=1)[:, 0].reshape([-1]).numpy()\n        self.target_matrix[batch_id, :] = target.reshape([-1]).numpy()\n        rank = np.where((np.argsort(-self.score_matrix[batch_id]) == np.where(\n            self.target_matrix[batch_id] == 1)[0][0]) == 1)[0][0]\n        self.rank_matrix[batch_id] = rank\n        rank_matrix_tmp = self.rank_matrix[:batch_id + 1]\n        r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp)\n        r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp)\n        r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp)\n        medr = np.floor(np.median(rank_matrix_tmp) + 1)\n        meanr = np.mean(rank_matrix_tmp) + 1\n        logger.info(\n            \"[{}] Final r1:{:.3f}, r5:{:.3f}, r10:{:.3f}, mder:{:.3f}, meanr:{:.3f}\"" + }, + { + "comment": "This code defines a class for accumulating metrics for the MSR-VTT retrieval task. It has methods for updating and finalizing the metric calculations. The update method computes and logs the values r1, r5, r10, medr, and meanr, which are retrieval performance scores. 
The accumulate method signals the end of iterations by logging a message saying \"Eval Finished!\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/msrvtt_metric.py\":56-61", + "content": " .format(batch_id, r1, r5, r10, medr, meanr))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n logger.info(\"Eval Finished!\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/537299c8-cfa2-4eb5-a8b7-1f055d82abc1.json b/docs/doc/537299c8-cfa2-4eb5-a8b7-1f055d82abc1.json new file mode 100644 index 000000000..66634b296 --- /dev/null +++ b/docs/doc/537299c8-cfa2-4eb5-a8b7-1f055d82abc1.json @@ -0,0 +1,120 @@ +{ + "summary": "The `BaseDataset` class serves as a base for creating video feature datasets, handling missing values and encoding text while supporting efficient dataset partitioning.", + "details": [ + { + "comment": "Copyright and license information, importing necessary libraries, and type guarding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":0-35", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport time\nimport json\nimport random\nimport paddle\nimport inspect\nimport logging\nimport functools\nimport data_loader\nimport numpy as np\nimport pickle as pkl\nfrom pathlib import Path\nfrom abc import abstractmethod\nfrom typing import Dict, Union\nfrom numpy.random import randint\nfrom typeguard import typechecked\nfrom collections import OrderedDict\nfrom zsvision.zs_utils import memcache\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"{e}, [paddlenlp] package and it's dependencies is required for T2VLAD.\"" + }, + { + "comment": "This code defines a base class `BaseDataset` for creating and loading video features dataset. It contains methods for generating required paths, performing sanity checks on loaded data, and loading features from disk. The class is abstract and requires subclass implementation of these methods. 
It also includes utility functions and settings like `dataset_paths`, `sanity_checks`, and `load_features`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":36-75", + "content": " )\nfrom utils import ensure_tensor, expert_tensor_storage\n# For SLURM usage, buffering makes it difficult to see events as they happen, so we set\n# the global print statement to enforce flushing\nprint = functools.partial(print, flush=True)\nclass BaseDataset(paddle.io.Dataset):\n @staticmethod\n @abstractmethod\n @typechecked\n def dataset_paths() -> Dict[str, Union[Path, str]]:\n \"\"\"Generates a datastructure containing all the paths required to load features\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def sanity_checks(self):\n \"\"\"Run sanity checks on loaded data\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def load_features(self):\n \"\"\"Load features from disk\n \"\"\"\n raise NotImplementedError\n @typechecked\n def __init__(\n self,\n data_dir: Path,\n eval_only: bool,\n use_zeros_for_missing: bool,\n text_agg: str,\n text_feat: str,\n split_name: str,\n cls_partition: str,\n root_feat_folder: str," + }, + { + "comment": "The code above defines a class for a dataset, with various parameters such as text_dim, num_test_captions, and max_tokens. It sets the necessary attributes including logger, text_feat, data_dir, and experts. The class also initializes the tokenizer and sets the restrict_test_captions and text_features attributes before calling load_features() method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":76-100", + "content": " text_dim: int,\n num_test_captions: int,\n restrict_train_captions: int,\n max_tokens: Dict[str, int],\n logger: logging.Logger,\n raw_input_dims: Dict[str, int],\n feat_aggregation: Dict[str, Dict],\n ):\n self.eval_only = eval_only\n self.logger = logger\n self.text_feat = text_feat\n self.data_dir = data_dir\n self.text_dim = text_dim\n self.restrict_train_captions = restrict_train_captions\n self.max_tokens = max_tokens\n self.cls_partition = cls_partition\n self.num_test_captions = num_test_captions\n self.feat_aggregation = feat_aggregation\n self.root_feat = data_dir / root_feat_folder\n self.experts = set(raw_input_dims.keys())\n self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n # This attributes can be overloaded by different datasets, so it must be set\n # before the `load_features() method call`\n self.restrict_test_captions = None\n self.text_features = None" + }, + { + "comment": "This code initializes class variables for a dataset object. It sets the label features, video labels, raw captions, and features to None. It loads the word2int mapping from a JSON file. The code allows for one caption per video in training minibatches. It creates an ordered list of experts based on input dimensions. The training and test lists are set by dataset-specific subclasses. The code is for retrieval tasks and uses a single dataloader, handling retrieval data separately. 
It sets the sample list to the training partition and calculates the total number of samples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":101-124", + "content": " self.label_features = None\n self.video_labels = None\n self.raw_captions = None\n self.features = None\n self.word2int = json.load(open('word2int.json'))\n # Use a single caption per video when forming training minibatches (different\n # captions from the same video may still be used across different minibatches)\n self.captions_per_video = 1\n self.ordered_experts = list(raw_input_dims.keys())\n # Training and test lists are set by dataset-specific subclasses\n self.partition_lists = {}\n self.configure_train_test_splits(split_name=split_name)\n # All retrieval-based tasks use a single dataloader (and handle the retrieval\n # data separately), whereas for classification we use one dataloader for\n # training and one for validation.\n self.logger.info(\"The current task is retrieval\")\n self.sample_list = self.partition_lists[\"train\"]\n self.num_samples = len(self.sample_list)\n num_val = len(self.partition_lists[\"val\"])" + }, + { + "comment": "The code sets default paths for video retrieval, defines missing value strategy based on use_zeros_for_missing argument, loads dataset-specific features into memory and averages text features when text_agg is set to \"avg\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":126-151", + "content": " self.raw_input_dims = raw_input_dims\n # we store default paths to enable visualisations (this can be overloaded by\n # dataset-specific classes)\n self.video_path_retrieval = [\n f\"videos/{x}.mp4\" for x in self.partition_lists[\"val\"]\n ]\n # NOTE: We use nans rather than zeros to indicate missing faces, unless we wish\n # to test single modality strength, which requires passing zeroed features for\n # missing videos\n if use_zeros_for_missing:\n self.MISSING_VAL = 0\n else:\n self.MISSING_VAL = np.nan\n # load the dataset-specific features into memory\n self.load_features()\n if text_agg == \"avg\":\n self.logger.info(\"averaging the text features...\")\n for key, val in self.text_features.items():\n self.text_features[key] = [\n np.mean(x, 0, keepdims=1) for x in val\n ]\n self.logger.info(\"finished averaging the text features\")" + }, + { + "comment": "This code initializes training and raw configuration dictionaries, creates a tensor storage object, iterates through static experts, adds their relevant configurations to the dictionaries, and then builds a retrieval dictionary for both fixed and variable experts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":153-174", + "content": " self.trn_config = {}\n self.raw_config = {}\n self.tensor_storage = expert_tensor_storage(self.experts,\n self.feat_aggregation)\n for static_expert in self.tensor_storage[\"fixed\"]:\n if static_expert in self.feat_aggregation:\n if \"trn_seg\" in self.feat_aggregation[static_expert].keys():\n self.trn_config[static_expert] = \\\n self.feat_aggregation[static_expert][\"trn_seg\"]\n if \"raw\" in self.feat_aggregation[static_expert][\"temporal\"]:\n self.raw_config[static_expert] = 1\n retrieval = {\n expert: np.zeros(\n (num_val, self.max_tokens[expert], raw_input_dims[expert]))\n for expert in self.tensor_storage[\"variable\"]\n }\n retrieval.update({\n expert: np.zeros((num_val, raw_input_dims[expert]))\n for expert in 
self.tensor_storage[\"fixed\"]\n })\n self.retrieval = retrieval" + }, + { + "comment": "The code is initializing various arrays and tensors for evaluating the model on validation data. It sets up masks, retrieval tensors for text, captions, and attention, and prepares an empty list for saving the validation captions. This code is part of a larger function that appears to be setting up a dataset for video captioning or related task.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":175-196", + "content": " self.test_ind = {\n expert: paddle.ones([num_val])\n for expert in self.experts\n }\n self.raw_captions_retrieval = [None] * num_val\n # avoid evaluation on missing queries\n self.query_masks = np.zeros((num_val, num_test_captions))\n self.text_token_mask = np.zeros((num_val, num_test_captions))\n self.text_retrieval = np.zeros((num_val, self.num_test_captions,\n self.max_tokens[\"text\"], self.text_dim))\n self.cap_retrieval = paddle.zeros(\n [num_val, self.num_test_captions, self.max_tokens[\"text\"]],\n dtype='int64'\n ) #self.cap_retrieval = th.zeros((num_val, self.num_test_captions, self.max_tokens[\"text\"]))\n self.att_retrieval = paddle.zeros(\n [num_val, self.num_test_captions, self.max_tokens[\"text\"]],\n dtype='int64'\n ) #self.att_retrieval = th.zeros((num_val, self.num_test_captions, self.max_tokens[\"text\"]))\n save_cap = []\n for ii, video_name in enumerate(self.partition_lists[\"val\"]):" + }, + { + "comment": "This code initializes the retrieval and test indices for each expert in both fixed and variable tensor storage. It handles missing values by replacing them with 'MISSING_VAL' and binarizing non-missing features using marker values if requested.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":198-216", + "content": " self.raw_captions_retrieval[ii] = self.raw_captions[video_name]\n for expert in self.tensor_storage[\"fixed\"].intersection(\n self.experts):\n feats = self.features[expert][video_name]\n drop = self.has_missing_values(feats)\n self.test_ind[expert][ii] = not drop\n self.retrieval[expert][ii] = feats\n if drop:\n self.retrieval[expert][ii][:] = self.MISSING_VAL\n if self.feat_aggregation[expert].get(\"binarise\", False):\n keep = np.logical_not(\n np.isnan(self.retrieval[expert][:, 0, 0]))\n marker = np.ones_like(self.retrieval[expert][keep])\n self.retrieval[expert][keep] = marker\n for expert in self.tensor_storage[\"variable\"].intersection(\n self.experts):\n feats = self.features[expert][video_name]\n drop = self.has_missing_values(feats)" + }, + { + "comment": "The code is handling the process of selecting video features and test captions for a specific expert. It drops certain entries, sets missing values where needed, applies binarization if required, and limits the number of tokens based on maximum token limit. It also restricts test captions if specified by the user. 
Finally, it sets query masks to prepare for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":217-236", + "content": " self.test_ind[expert][ii] = not drop\n if drop:\n self.retrieval[expert][ii][:] = self.MISSING_VAL\n if self.feat_aggregation[expert].get(\"binarise\", False):\n keep = np.logical_not(\n np.isnan(self.retrieval[expert][:, 0, 0]))\n marker = np.ones_like(self.retrieval[expert][keep])\n self.retrieval[expert][keep] = marker\n if self.test_ind[expert][ii]:\n keep = min(self.max_tokens[expert], len(feats))\n self.retrieval[expert][ii, :keep, :] = feats[:keep]\n candidates_sentences = self.text_features[video_name]\n if self.restrict_test_captions is not None:\n keep_sent_idx = self.restrict_test_captions[video_name]\n candidates_sentences = [candidates_sentences[keep_sent_idx]]\n self.query_masks[ii, :len(candidates_sentences)] = 1\n for test_caption_idx in range(self.num_test_captions):" + }, + { + "comment": "This code is iterating over a list of candidate sentences, breaking when the index exceeds the list length. For each sentence, it sets the number of tokens to keep based on the maximum allowed and masks the corresponding tokens. It then encodes the sentence into tokenized input IDs and attention mask for PaddlePaddle's model, appending the original sentence to a save list, storing the tokenized inputs in 'cap_retrieval', and the attention masks in 'att_retrieval'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":237-256", + "content": " if len(candidates_sentences) <= test_caption_idx:\n break\n keep = min(len(candidates_sentences[test_caption_idx]),\n self.max_tokens[\"text\"])\n self.text_token_mask[ii, test_caption_idx] = keep\n sent = self.raw_captions_retrieval[ii][test_caption_idx]\n sent = \" \".join(sent)\n sent = sent.strip()\n encoded_dict = self.tokenizer.__call__(\n sent,\n max_seq_len=self.max_tokens[\"text\"],\n pad_to_max_seq_len=True,\n return_attention_mask=True,\n truncation_strategy='longest_first')\n cap_ids = paddle.to_tensor(encoded_dict['input_ids'])\n attention_mask = paddle.to_tensor(\n encoded_dict['attention_mask'])\n save_cap.append(sent)\n self.cap_retrieval[ii, test_caption_idx, :] = cap_ids\n self.att_retrieval[ii, test_caption_idx, :] = attention_mask" + }, + { + "comment": "The code is checking the progress of a dataset evaluation, creating text features for each sentence in the list, storing them in an array and then dumping the saved captions into a file called 'run_cap.pkl'. 
It also includes a function to configure train/test splits of the dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":257-279", + "content": " if ii % 500 == 0 and test_caption_idx == 0:\n msg = (\n f\"{ii}/{len(self.partition_lists['val'])} will evaluate \"\n f\"sentence {test_caption_idx} out of \"\n f\"{len(candidates_sentences)} (has {keep} words) \"\n f\"{video_name}\")\n self.logger.info(msg)\n text_feats = candidates_sentences[test_caption_idx][:keep]\n if text_feats.shape[0] == 0:\n text_feats = 0\n raise ValueError(\"empty text features!\")\n self.text_retrieval[ii, test_caption_idx, :keep, :] = text_feats\n with open('run_cap.pkl', 'wb') as f:\n pkl.dump(save_cap, f)\n self.sanity_checks()\n def configure_train_test_splits(self, split_name):\n \"\"\"Partition the datset into train/val/test splits.\n Args:\n split_name (str): the name of the split\n \"\"\"\n self.paths = type(self).dataset_paths()" + }, + { + "comment": "The code loads training/validation splits, reads and stores them in partition lists for later use, and initializes tensor storage for the PaddleVideo application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":280-303", + "content": " print(\"loading training/val splits....\")\n tic = time.time()\n for subset, path in self.paths[\"subset_list_paths\"][split_name].items():\n root_feat = Path(self.root_feat)\n subset_list_path = root_feat / path\n if subset == \"train\" and self.eval_only:\n rows = []\n else:\n with open(subset_list_path) as f:\n rows = f.read().splitlines()\n self.partition_lists[subset] = rows\n print(\"done in {:.3f}s\".format(time.time() - tic))\n self.split_name = split_name\n def collate_data(self, data):\n batch_size = len(data)\n tensors = {}\n for expert in self.tensor_storage[\"fixed\"]:\n if expert in self.trn_config.keys():\n tensors[expert] = paddle.to_tensor(\n np.zeros((batch_size, self.trn_config[expert],\n self.raw_input_dims[expert])))\n else:\n tensors[expert] = paddle.to_tensor(" + }, + { + "comment": "This code initializes tensors for a batch of data in a dataset. It creates zero-initialized tensors for each expert (modality), and separate tensors for text data including token masks, cap IDs, and attention mask. These will be filled with actual data as the batch is processed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":304-326", + "content": " np.zeros((batch_size, self.raw_input_dims[expert])))\n # Track which indices of each modality are available in the present batch\n ind = {\n expert: paddle.to_tensor(np.zeros(batch_size))\n for expert in self.experts\n }\n tensors.update({\n expert: paddle.to_tensor(\n np.zeros((batch_size, self.max_tokens[expert],\n self.raw_input_dims[expert])))\n for expert in self.tensor_storage[\"variable\"]\n })\n text_tensor = paddle.to_tensor(\n np.zeros((batch_size, self.captions_per_video,\n self.max_tokens[\"text\"], self.text_dim)))\n text_token_mask = paddle.to_tensor(\n np.zeros((batch_size, self.captions_per_video)))\n text_cap_id = paddle.zeros([batch_size, self.max_tokens[\"text\"]],\n dtype='int64')\n text_att_mask = paddle.zeros([batch_size, self.max_tokens[\"text\"]],\n dtype='int64')" + }, + { + "comment": "This code iterates through a dataset, extracting data for various experts and creating tensors from it. 
It handles missing values and stores text and mask information in separate tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":328-349", + "content": " for ii, _ in enumerate(data):\n datum = data[ii]\n for expert in self.experts:\n ind[expert][ii] = datum[f\"{expert}_ind\"]\n for expert in self.tensor_storage[\"fixed\"]:\n tensors[expert][ii] = datum[expert]\n for expert in self.tensor_storage[\"variable\"]:\n if ind[expert][ii]:\n keep = min(len(datum[expert]), self.max_tokens[expert])\n if keep:\n tensors[expert][ii, :keep, :] = datum[expert][:keep]\n else:\n tensors[expert][ii, :, :] = self.MISSING_VAL\n text = datum[\"text\"]\n cap_id = datum[\"cap_id\"]\n att_mask = datum[\"att_mask\"]\n text_cap_id[ii, :] = paddle.to_tensor(cap_id)\n text_att_mask[ii, :] = paddle.to_tensor(att_mask)\n for jj in range(self.captions_per_video):\n keep = min(len(text[jj]), self.max_tokens[\"text\"])\n text_tensor[ii, jj, :keep, :] = text[jj][:keep]" + }, + { + "comment": "This code creates a minibatch for video features and text data. It applies binarization to some features, converts tensors, and prepares inputs for machine learning models. The process_sent function sets default values for EOS and UNK consistent with the word2int.json file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":350-371", + "content": " text_token_mask[ii, jj] = keep\n ind = {key: ensure_tensor(val) for key, val in ind.items()}\n experts = OrderedDict(\n (expert, paddle.to_tensor(tensors[expert], dtype='float32'))\n for expert in self.ordered_experts)\n for expert in self.experts:\n if self.feat_aggregation[expert].get(\"binarise\", False):\n replace = np.logical_not(paddle.isnan(experts[expert][:, 0, 0]))\n experts[expert][replace] = paddle.ones_like(\n experts[expert][replace])\n minibatch = {\"experts\": experts, \"ind\": ind}\n minibatch[\"text\"] = paddle.to_tensor(text_tensor, dtype='float32')\n minibatch[\"cap_id\"] = paddle.to_tensor(text_cap_id, dtype='int64')\n minibatch[\"att_mask\"] = paddle.to_tensor(text_att_mask, dtype='int64')\n minibatch[\"text_token_mask\"] = paddle.to_tensor(text_token_mask)\n return minibatch\n def process_sent(self, sent, max_words, EOS: int = 1, UNK: int = 2):\n # set EOS=1, UNK=2 by default, consistent with file 'word2int.json'." + }, + { + "comment": "This code defines a dataset class that loads and processes video features for text-to-video retrieval. It takes a list of videos, extracts expert features, and pads them to a fixed length. 
The class also supports indexing and has methods for getting the number of samples in the dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":372-396", + "content": " tokens = [self.word2int.get(w, UNK) for w in sent]\n tokens = tokens[:max_words]\n tokens_len = len(tokens)\n tokens = np.array(tokens + [EOS] * (max_words - tokens_len))\n return tokens, tokens_len\n def __len__(self):\n return self.num_samples\n def __getitem__(self, idx):\n if idx < self.num_samples:\n vid = self.sample_list[idx]\n features = {}\n for expert in self.experts:\n if expert not in self.trn_config.keys():\n if expert in self.raw_config.keys():\n features[expert] = np.mean(self.features[expert][vid],\n axis=0)\n else:\n features[expert] = self.features[expert][vid]\n else:\n raw_frame_feats = self.features[expert][vid]\n new_length = 1\n num_frames = raw_frame_feats.shape[0]\n avg_duration = ((num_frames - new_length + 1) //" + }, + { + "comment": "The code segments video frame features into smaller segments with a specified average duration, accounts for the last segment if the duration is not divisible by the specified interval, and ensures the number of new feature segments matches the expected number.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":397-412", + "content": " self.trn_config[expert])\n assert avg_duration > 0, \"average duration must be positive\"\n if avg_duration > 0:\n # maybe we could change to use average for each tiny segment\n # seems like use everything per iter\n offsets = np.multiply(\n list(range(self.trn_config[expert])), avg_duration)\n offsets += randint(avg_duration,\n size=self.trn_config[expert])\n new_frame_feats = np.zeros(\n (self.trn_config[expert], raw_frame_feats.shape[1]))\n for idx, xx in enumerate(offsets):\n new_frame_feats[idx, :] = raw_frame_feats[xx, :]\n msg = \"returning a wrong feature != segment num\"\n assert new_frame_feats.shape[0] == self.trn_config[\n expert], msg" + }, + { + "comment": "This code is responsible for handling inconsistencies in text features storage. It randomly selects a caption from a list of captions for a given video, applies tokenization, and ensures that the sequence length does not exceed a maximum threshold. The result is stored in the encoded_dict variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":413-436", + "content": " features[expert] = new_frame_feats\n ind = {}\n for expert in self.ordered_experts:\n if expert in self.tensor_storage[\"flaky\"]:\n ind[expert] = not self.has_missing_values(features[expert])\n else:\n ind[expert] = 1\n # Handle some inconsistencies between how the text features are stored\n text = self.text_features[vid]\n if isinstance(text, list):\n pick = np.random.choice(len(text), size=self.captions_per_video)\n sent = self.raw_captions[vid][pick[0]]\n sent = \" \".join(sent)\n sent = sent.strip()\n text = np.array(text)[pick]\n encoded_dict = self.tokenizer.__call__(\n sent,\n max_seq_len=self.max_tokens[\"text\"],\n pad_to_max_seq_len=True,\n return_attention_mask=True,\n truncation_strategy='longest_first')" + }, + { + "comment": "This code is initializing a sample for video dataset, using either given or randomly chosen text. It creates a dictionary with cap_id, attention mask, and other tensors as key-value pairs, and returns the sample. 
The get_retrieval_data function converts retrieval data to tensors and adds them to a dictionary containing text and experts keys before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":437-462", + "content": " cap_id = encoded_dict['input_ids']\n token_type_ids = encoded_dict['token_type_ids']\n attention_mask = encoded_dict['attention_mask']\n else:\n pick = None\n text = np.random.choice(text, size=self.captions_per_video)\n # Return both the missing indices as well as the tensors\n sample = {\"text\": text}\n sample.update({\"cap_id\": cap_id})\n sample.update({\"att_mask\": attention_mask})\n sample.update({f\"{key}_ind\": val for key, val in ind.items()})\n sample.update(features)\n return sample\n def get_retrieval_data(self):\n experts = OrderedDict(\n (expert, paddle.to_tensor(self.retrieval[expert], dtype='float32'))\n for expert in self.ordered_experts)\n retrieval_data = {\n \"text\":\n paddle.to_tensor(ensure_tensor(self.text_retrieval),\n dtype='float32'),\n \"experts\":\n experts,\n \"cap_id\":" + }, + { + "comment": "The function defines a dictionary 'retrieval_data' containing cap_retrieval, att_mask, test_ind, and text_token_mask. It also defines the 'meta' dictionary containing query_masks, raw_captions, and paths. The function returns both 'retrieval_data' and 'meta'. The code provides a path lookup for visual features and skips loading if the feature is not requested.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":463-491", + "content": " paddle.to_tensor(self.cap_retrieval, dtype='int64'),\n \"att_mask\":\n paddle.to_tensor(self.att_retrieval, dtype='int64'),\n \"ind\":\n self.test_ind,\n \"text_token_mask\":\n paddle.to_tensor(self.text_token_mask)\n }\n meta = {\n \"query_masks\": self.query_masks,\n \"raw_captions\": self.raw_captions_retrieval,\n \"paths\": self.video_path_retrieval,\n }\n return retrieval_data, meta\n def has_missing_values(self, x):\n return isinstance(x, float) and np.isnan(x)\n def visual_feat_paths(self, model_spec, tag=None):\n \"\"\"Canonical path lookup for visual features\n \"\"\"\n if model_spec not in self.ordered_experts:\n self.logger.info(\n f\"Skipping load for {model_spec} (feature not requested)\")\n return f\"SKIPPED-{model_spec}\"\n feat_type, model_name, _ = model_spec.split(\".\")\n aggs = self.feat_aggregation[model_spec]\n base = f\"aggregated_{feat_type.replace('-', '_')}\"" + }, + { + "comment": "The code defines a function that generates feature paths based on the provided arguments. It assembles a base string with parameters like fps, pixel_dim, and stride. If the feature type is \"facecrops\" or \"faceboxes\", it includes those parameters in the base string. For other types except for \"ocr\", \"speech\", and \"audio\", it also includes those parameters in the base string. It then adds optional parameters like offset and inner_stride if present. Finally, it generates a feature path list with file names and appends the tag if provided. 
The function also defines a logging assertion function that writes assertions to logs using a recipe from an external link.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":492-515", + "content": " required = (\"fps\", \"pixel_dim\", \"stride\")\n fps, pixel_dim, stride = [aggs.get(x, None) for x in required]\n if feat_type in {\"facecrops\", \"faceboxes\"}:\n base = f\"{base}_{fps}fps_{pixel_dim}px_stride{stride}\"\n elif feat_type not in {\"ocr\", \"speech\", \"audio\"}:\n base = f\"{base}_{fps}fps_{pixel_dim}px_stride{stride}\"\n for option in \"offset\", \"inner_stride\":\n if aggs.get(option, None) is not None:\n base += f\"_{option}{aggs[option]}\"\n feat_paths = []\n for agg in aggs[\"temporal\"].split(\"-\"):\n fname = f\"{model_name}-{agg}\"\n if aggs[\"type\"] == \"logits\":\n fname = f\"{fname}-logits\"\n if tag is not None:\n fname += f\"-{tag}\"\n feat_paths.append(Path(base) / f\"{fname}.pickle\")\n return feat_paths\n def log_assert(self, bool_, msg=\"\", verbose=True):\n \"\"\"Use assertions that will be written to the logs. This is a recipe from:\n http://code.activestate.com/recipes/577074-logging-asserts/" + }, + { + "comment": "The code snippet is a function that checks an assertion. If the assertion fails, it constructs an exception message containing the traceback from the calling frame and raises an AssertionError with this message. Another function called \"summary_stats\" reports basic statistics about feature availability and variable lengths across different data subsets.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":516-538", + "content": " \"\"\"\n try:\n assert bool_, msg\n except AssertionError:\n # construct an exception message from the code of the calling frame\n last_stackframe = inspect.stack()[-2]\n source_file, line_no, func = last_stackframe[1:4]\n source = f\"Traceback (most recent call last):\\n\" + \\\n f\" File {source_file}, line {line_no}, in {func}\\n\"\n if verbose:\n # include more lines than that where the statement was made\n source_code = open(source_file).readlines()\n source += \"\".join(source_code[line_no - 3:line_no + 1])\n else:\n source += last_stackframe[-2][0].strip()\n self.logger.debug(f\"{msg}\\n{source}\")\n raise AssertionError(f\"{msg}\\n{source}\")\n def summary_stats(self):\n \"\"\"Report basic statistics about feature availability and variable lengths\n across the different subsets of the data.\n \"\"\"\n self.logger.info(\"Computing feature stats...\")" + }, + { + "comment": "This code partitions datasets based on predefined subsets and checks the sizes of the features. It prints a summary for each subset, counting missing values and displaying the minimum, maximum, and mean sizes of features. 
This ensures that the dataset is properly partitioned and allows for efficient analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_dataset.py\":539-561", + "content": " queries = self.ordered_experts + [\"text\"]\n for subset, keep in self.partition_lists.items():\n keep = set(keep)\n print(f\"Summary for {subset}\")\n for expert in queries:\n if expert in self.features:\n feats = self.features[expert]\n else:\n feats = self.text_features\n vals = [feats[key] for key in keep]\n missing = 0\n sizes = []\n for val in vals:\n if self.has_missing_values(val):\n missing += 1\n else:\n sizes.append(len(val))\n if sizes:\n stat_str = (f\"min: {np.min(sizes):4}, \"\n f\"max: {np.max(sizes):4}, \"\n f\"mean: {np.mean(sizes):.1f}\")\n print(\n f\"{subset}: missing: {missing:4}, {stat_str} {expert}\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/548fa608-fac1-4cf7-8d94-aa8296587051.json b/docs/doc/548fa608-fac1-4cf7-8d94-aa8296587051.json new file mode 100644 index 000000000..29654b97c --- /dev/null +++ b/docs/doc/548fa608-fac1-4cf7-8d94-aa8296587051.json @@ -0,0 +1,15 @@ +{ + "summary": "This code introduces PP-TSM, a high-performance and efficient video recognition model optimized based on TSM in PaddleVideo. It outlines various strategies such as ImageNet pretraining, data augmentation, and optimizer improvements that enhance performance, achieving strong top-1 accuracy on the UCF101 and Kinetics400 datasets with fast inference speed on a V100 GPU.", + "details": [ + { + "comment": "This code describes PP-TSM, a high-performance and efficient video recognition model optimized based on TSM in PaddleVideo. It mentions its better performance and inference speed compared to the TSM paper and other open source TSM models. PaddlePaddle 2.0 is required for execution. When using only ImageNet pretraining and 8X1 sampling, it achieves high top-1 accuracy on the UCF101 and Kinetics400 datasets with fast inference speed on a V100 GPU.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/pp-tsm.md\":0-21", + "content": "# High performance recognition 2D architecture PP-TSM\nPP-TSM\uff1aAn Effective and Efficient video-recognition model \nPP-TSM is an optimized model based on TSM in PaddleVideo, \nwhose performance (top-1 on UCF101 and Kinetics400) and inference spped \nare better than TSM paper(https://arxiv.org/abs/1811.08383 ) and \nother open source TSM\uff0cPaddlePaddle2.0(available on pip now) or \nDaily Version( https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev ) \nis required to run PP-TSM. \nWhen only use ImageNet for pretrain and only use 8X1 sample\uff0c \nPP-TSM\u2019s top1 reached to 89.5% and 73.5% on UCF101 and Kinetics400, \nand inference speed of FP32 on single V100 is 147 VPS on Kinectics400 dataset. \ninference speed of FP16 with TensorRT on single V100 isTODO. \nAs far as we know, under the same conditions, \ntop1=73.5% on Kinetics400 is the best performance for 2D video model until now. \nPP-TSM improved performance and speed of TSM with following methods: \n1\u3001Model Tweaks: ResNet50vd \uff0c+2.5% " + }, + { + "comment": "This code outlines several strategies implemented to improve the performance of the model, including ImageNet pretraining, better batch size and L2 values, label smoothing, better learning rate decay, data augmentation, and updated epoch numbers. 
The code also mentions using Knowledge Distillation, optimizer improvements, and plans for integrating PaddleInference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/pp-tsm.md\":22-31", + "content": "2\u3001ImageNet pretrain weights based on Knowledge Distillation \uff0c +1.3% \n3\u3001beter batch size \uff0c+0.2% \n4\u3001beter L2 \uff0c+0.3% \n5\u3001label_smoothing \uff0c+0.2% \n6\u3001beter lr decay \uff0c+0.15% \n7\u3001Data augmentation \uff0c+0.3% \n8\u3001beter epoch num \uff0c+0.15% \n9\u3001bn strategy \uff0c+0.4% \n10\u3001integrated PaddleInference \n11\u3001more strategies todo: Knowledge Distillation\u3001optimizer and so on. " + } + ] +} \ No newline at end of file diff --git a/docs/doc/54c8f988-1d1e-473c-8ad0-7b86dabb0428.json b/docs/doc/54c8f988-1d1e-473c-8ad0-7b86dabb0428.json new file mode 100644 index 000000000..98e05ce55 --- /dev/null +++ b/docs/doc/54c8f988-1d1e-473c-8ad0-7b86dabb0428.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a base class for multimodal models in PaddleVideo, requiring subclasses to override train_step, valid_step, test_step, and define abstract methods for validating, testing, and inference steps.", + "details": [ + { + "comment": "This code defines a base class for multimodal models in PaddleVideo. It requires subclasses to override train_step, valid_step, and test_step methods. The constructor accepts optional backbone, head, and loss parameters which are built using the builder module. If provided, the backbone is initialized with its init_weights method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/base.py\":0-31", + "content": "from abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseMultimodal(nn.Layer):\n \"\"\"Base class for Multimodal.\n All Multimodal model should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Head to process feature.\n loss(dict): Loss function.\n \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)" + }, + { + "comment": "The code defines a base class for multimodal models, with an initializer to set up the head and loss functions. The `forward` method selects the appropriate step function based on the given mode (train, valid, test, or infer). The abstract `train_step` method must be implemented in subclasses for training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/base.py\":32-62", + "content": " if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n if loss is not None:\n self.loss = builder.build_loss(loss)\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError" + }, + { + "comment": "This code defines three abstract methods: val_step, test_step, and infer_step. These methods represent the validating, testing, and inference steps respectively. The methods are not yet implemented and will need to be filled in by subclasses according to the specific requirements of the model being developed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/base.py\":64-80", + "content": " @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/54e499bb-46a2-451a-98ec-6b736112b2db.json b/docs/doc/54e499bb-46a2-451a-98ec-6b736112b2db.json new file mode 100644 index 000000000..77ddba084 --- /dev/null +++ b/docs/doc/54e499bb-46a2-451a-98ec-6b736112b2db.json @@ -0,0 +1,130 @@ +{ + "summary": "The code runs interactive video object segmentation on the DAVIS2017 dataset with PaddlePaddle, evaluating a Ma-Net model inside a DavisInteractiveSession with up to 8 interaction rounds per sequence, refining scribble labels and filtering state-dict keys when loading pretrained weights.", + "details": [ + { + "comment": "The code imports necessary libraries and defines functions for processing image data from the DAVIS2017 dataset. It sets up the required transforms, initializes the network models, and reads in the dataset sequences.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":0-38", + "content": "import cv2\nimport os\nimport json\nimport paddle\nfrom PIL import Image\nimport timeit\nimport numpy as np\nfrom paddle.vision import transforms\nfrom dataloaders.davis_2017_f import DAVIS2017_Feature_Extract\nimport dataloaders.custom_transforms_f as tr\nfrom davisinteractive.session import DavisInteractiveSession\nfrom networks.deeplab import DeepLab\nfrom networks.IntVOS import IntVOS\nimport time\nfrom davisinteractive.utils.scribbles import scribbles2mask, annotated_frames\nfrom config import cfg\nfrom paddle import nn\nfrom paddle.io import DataLoader\nfrom utils.api import float_, byte_\n@paddle.no_grad()\ndef main():\n paddle.set_device(\"gpu:0\")\n total_frame_num_dic = {}\n #################\n seqs = []\n with open(os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017',\n 'val' + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n seqs.extend(seqs_tmp)\n h_w_dic = {}\n for seq_name in seqs:\n images = np.sort(\n os.listdir(" + }, + { + "comment": "This code reads a configuration file and initializes variables for video analysis. 
It loads image information from disk, checks if an existing imgnum dictionary is available, and if not, it populates it by iterating through directories and counting images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":39-61", + "content": " os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/',\n seq_name.strip())))\n total_frame_num_dic[seq_name] = len(images)\n im_ = cv2.imread(\n os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/', seq_name,\n '00000.jpg'))\n im_ = np.array(im_, dtype=np.float32)\n hh_, ww_ = im_.shape[:2]\n h_w_dic[seq_name] = (hh_, ww_)\n _seq_list_file = os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017',\n 'v_a_l' + '_instances.txt')\n seq_dict = json.load(open(_seq_list_file, 'r'))\n ##################\n seq_imgnum_dict_ = {}\n seq_imgnum_dict = os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017',\n 'val_imgnum.txt')\n if os.path.isfile(seq_imgnum_dict):\n seq_imgnum_dict_ = json.load(open(seq_imgnum_dict, 'r'))\n else:\n for seq in os.listdir(os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/')):\n seq_imgnum_dict_[seq] = len(\n os.listdir(os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/'," + }, + { + "comment": "Creating a dictionary of image numbers and saving it, setting save flags for predicted masks, checking if results directory exists and creating it if not, defining maximum interactive parameters, importing DeepLab model and Instant VOS model, and loading the saved model from specified location.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":62-86", + "content": " seq)))\n with open(seq_imgnum_dict, 'w') as f:\n json.dump(seq_imgnum_dict_, f)\n ##################\n is_save_image = False # Save the predicted masks\n report_save_dir = cfg.RESULT_ROOT\n save_res_dir = cfg.SAVE_RESULT_DIR # changed to path\n if not os.path.exists(cfg.RESULT_ROOT):\n os.makedirs(cfg.RESULT_ROOT)\n # Configuration used in the challenges\n max_nb_interactions = 8 # Maximum number of interactions\n max_time_per_interaction = 30 # Maximum time per interaction per object\n # Total time available to interact with a sequence and an initial set of scribbles\n max_time = max_nb_interactions * max_time_per_interaction # Maximum time per object\n # Interactive parameters\n subset = 'val'\n host = 'localhost' # 'localhost' for subsets train and val.\n feature_extracter = DeepLab(backbone='resnet', freeze_bn=False)\n model = IntVOS(cfg, feature_extracter)\n print('model loading...')\n saved_model_dict = save_res_dir" + }, + { + "comment": "This code loads a pre-trained model, evaluates it, and initializes variables for processing scribbles. The code also defines a transform to resize images to specific dimensions, opens a file for writing, and sets up a DavisInteractiveSession object for iterating over interaction data. 
The session will continue until there are no more interactions left in the dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":87-112", + "content": " pretrained_dict = paddle.load(saved_model_dict)\n load_network(model, pretrained_dict)\n print(f'model loading from {saved_model_dict} finished!')\n model.eval()\n inter_file = open(os.path.join(cfg.RESULT_ROOT, 'inter_file.txt'), 'w')\n resized_h, resized_w = 480, 854\n ###############################\n composed_transforms = transforms.Compose(\n [tr.Resize((resized_h, resized_w)),\n tr.ToTensor()])\n ###############################\n seen_seq = []\n n = 0\n max_n = 1\n with DavisInteractiveSession(host=host,\n davis_root=cfg.DATA_ROOT,\n subset=subset,\n report_save_dir=report_save_dir,\n max_nb_interactions=max_nb_interactions,\n max_time=max_time,\n metric_to_optimize='J') as sess:\n while sess.next():\n t_total = timeit.default_timer()\n # Get the current iteration scribbles" + }, + { + "comment": "The code is retrieving scribbles and their corresponding sequence, image dimensions are assigned based on the dictionary h_w_dic. If there are no annotated frames from the scribbles, it returns previous masks and submits them. Otherwise, it initializes memories for the first round.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":114-138", + "content": " sequence, scribbles, first_scribble = sess.get_scribbles(\n only_last=True)\n h, w = h_w_dic[sequence]\n if 'prev_label_storage' not in locals().keys():\n prev_label_storage = paddle.zeros(\n [104, h, w]) # because the maximum length of frames is 104.\n print(sequence)\n h, w = h_w_dic[sequence]\n if len(\n annotated_frames(scribbles)\n ) == 0: # if no scribbles return, keep masks in previous round\n final_masks = prev_label_storage[:seq_imgnum_dict_[sequence]]\n sess.submit_masks(final_masks.numpy())\n else:\n start_annotated_frame = annotated_frames(scribbles)[0]\n pred_masks = []\n pred_masks_reverse = []\n if first_scribble: # If in the first round, initialize memories\n n_interaction = 1\n eval_global_map_tmp_dic = {}\n local_map_dics = ({}, {})" + }, + { + "comment": "This code is part of an interaction detection process. It writes the interaction details to a file, including the sequence name, type, and frame number. It also checks if the sequence has been seen before and prepares embedding memory for reference image processing. The code uses DAVIS2017_Feature_Extract to extract pixel embeddings in the first round of annotations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":139-162", + "content": " total_frame_num = total_frame_num_dic[sequence]\n obj_nums = seq_dict[sequence][-1]\n else:\n n_interaction += 1\n ##\n inter_file.write(sequence + ' ' + 'interaction' +\n str(n_interaction) + ' ' + 'frame' +\n str(start_annotated_frame) + '\\n')\n ##\n ##########################Reference image process\n if first_scribble: # if in the first round, extract pixel embbedings.\n if sequence not in seen_seq:\n inter_turn = 1\n seen_seq.append(sequence)\n embedding_memory = []\n test_dataset = DAVIS2017_Feature_Extract(\n root=cfg.DATA_ROOT,\n transform=composed_transforms,\n seq_name=sequence)\n testloader = DataLoader(test_dataset," + }, + { + "comment": "This code is iterating through testloader and extracting frame embeddings for each image. The extracted embeddings are then concatenated to form a single embedding memory. 
If annotated frames are present, the reference frame embedding is extracted from the embedding memory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":163-181", + "content": " batch_size=14,\n shuffle=False,\n num_workers=cfg.NUM_WORKER)\n for ii, sample in enumerate(testloader):\n imgs = sample['img1']\n frame_embedding = model.extract_feature(imgs)\n embedding_memory.append(frame_embedding)\n del frame_embedding\n embedding_memory = paddle.concat(embedding_memory, 0)\n _, _, emb_h, emb_w = embedding_memory.shape\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n inter_turn += 1\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]" + }, + { + "comment": "The code applies scribbles to an image using a mask and generates corresponding labels. It then creates a scribble sample and converts it into a tensor. If is_save_image is True, the scribble label image is saved as a PALETTE image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":182-202", + "content": " ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n else:\n ref_frame_embedding = embedding_memory[\n start_annotated_frame]\n ref_frame_embedding = ref_frame_embedding.unsqueeze(0)\n ########\n scribble_masks = scribbles2mask(scribbles, (emb_h, emb_w))\n scribble_label = scribble_masks[start_annotated_frame]\n scribble_sample = {'scribble_label': scribble_label}\n scribble_sample = tr.ToTensor()(scribble_sample)\n # print(ref_frame_embedding, ref_frame_embedding.shape)\n scribble_label = scribble_sample['scribble_label']\n scribble_label = scribble_label.unsqueeze(0)\n ######\n if is_save_image:\n ref_scribble_to_show = scribble_label.squeeze().numpy()\n im_ = Image.fromarray(\n ref_scribble_to_show.astype('uint8')).convert('P', )" + }, + { + "comment": "This code segment saves a scribble image with the palette applied to a specific directory path based on input parameters. It first checks if the necessary directory exists and creates it if it doesn't, then proceeds to save the image within this directory. The 'first_scribble' variable is used in decision making further down the code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":203-223", + "content": " im_.putpalette(_palette)\n ref_img_name = str(start_annotated_frame)\n if not os.path.exists(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im_.save(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn),\n 'inter_' + ref_img_name + '.png'))\n scribble_label = scribble_label\n #######\n if first_scribble:" + }, + { + "comment": "This code snippet initializes variables and checks for specific conditions in a segmentation model. It handles the previous label and label storage, updates them based on certain conditions, and submits the final masks to the session if necessary. 
The interaction segmentation head is printed as a comment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":225-243", + "content": " prev_label = None\n prev_label_storage = paddle.zeros([104, h, w])\n prev_label_storage = prev_label_storage\n else:\n prev_label = prev_label_storage[start_annotated_frame]\n prev_label = prev_label.unsqueeze(0).unsqueeze(0)\n if not first_scribble and paddle.unique(\n scribble_label).shape[0] == 1:\n final_masks = prev_label_storage[:\n seq_imgnum_dict_[sequence]]\n sess.submit_masks(final_masks.numpy())\n else: ###inteaction segmentation head\n print('inteaction segmentation head')\n tmp_dic, local_map_dics = model.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=scribble_label,\n prev_round_label=prev_label,\n global_map_tmp_dic=eval_global_map_tmp_dic," + }, + { + "comment": "This code snippet is part of a PaddleVideo application called Ma-Net, which seems to be related to object detection and video analysis. The code creates a dictionary for input data, retrieves the predicted label from a temporal dictionary, applies interpolation to resize the label, selects the maximum value along an axis, appends the mask to a list of masks, stores the previous label at a specific frame, and saves the predicted label as a numpy array if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":244-261", + "content": " local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n frame_num=[start_annotated_frame],\n first_inter=first_scribble)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n prev_label_storage[start_annotated_frame] = float_(\n pred_label[0])\n if is_save_image: # save image\n pred_label_to_save = pred_label.squeeze(0).numpy()" + }, + { + "comment": "This code snippet saves an interactive video frame as a labeled image. It first converts the predicted label to an array, then converts it into a format suitable for saving as an image with the 'P' mode and using a specific palette. The filename is created based on the current frame number and if the directory doesn't exist, it creates one. Finally, the labeled image is saved in the specified directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":262-278", + "content": " im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert('P', )\n im.putpalette(_palette)\n imgname = str(start_annotated_frame)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im.save(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction)," + }, + { + "comment": "This code is part of a video object detection algorithm. It's using a pre-trained model to generate segmentation masks for each frame in the video. The 'turn' and 'imgname' are used as file names for saving images. It applies initial scribble_label if it's the first frame, then it updates previous label and embedding for propagating prediction to next frames. 
It uses the model's prop_seghead function to generate predictions for each frame's segmentation mask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":279-297", + "content": " 'turn' + str(inter_turn),\n imgname + '.png'))\n #######################################\n if first_scribble:\n scribble_label = rough_ROI(scribble_label)\n ##############################\n ref_prev_label = pred_label.unsqueeze(0)\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = ref_frame_embedding\n #### Propagation ->\n for ii in range(start_annotated_frame + 1, total_frame_num):\n current_embedding = embedding_memory[ii]\n current_embedding = current_embedding.unsqueeze(0)\n prev_label = prev_label\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding," + }, + { + "comment": "Code snippet is calling a function, passing several arguments such as scribble_label, prev_label, and others. It assigns the returned value to pred_label after applying interpolation on it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":298-317", + "content": " scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg.KNNS,\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame,\n frame_num=[ii],\n dynamic_seghead=model.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(\n pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)" + }, + { + "comment": "Code snippet handles image saving for each prediction label in a loop, storing the prediction label as a numpy array and converting it to an image using Pillow library. 
It then sets the palette and saves the image with a sequence number and interaction number, creating directories if they don't exist.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":319-337", + "content": " pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n prev_label_storage[ii] = float_(pred_label[0])\n ####\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(\n pred_label_to_save.astype('uint8')).convert(\n 'P', )\n im.putpalette(_palette)\n imgname = str(ii)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction)," + }, + { + "comment": "Code creates folders and saves image, then resets variables for frame propagation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":338-355", + "content": " 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im.save(\n os.path.join(cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn),\n imgname + '.png'))\n #######################################\n prev_label = ref_prev_label\n prev_embedding = ref_frame_embedding\n #######\n # Propagation <-\n for ii in range(start_annotated_frame):\n current_frame_num = start_annotated_frame - 1 - ii" + }, + { + "comment": "This code section is using PaddlePaddle, a machine learning framework. It seems to be part of an object detection model for video sequences. The function 'model.prop_seghead' is being called with multiple embeddings and labels, and it returns three outputs (tmp_dic, eval_global_map_tmp_dic, local_map_dics) based on the input parameters. Normalization and nearest neighbor distances are used in this process as well. The 'cfg.KNS' likely refers to a pre-defined constant or configuration related to k-nearest neighbors (kNN). Finally, 'n_interaction', 'start_annotated_frame' variables represent interaction numbers and starting frame for annotated frames, respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":356-373", + "content": " current_embedding = embedding_memory[current_frame_num]\n current_embedding = current_embedding.unsqueeze(0)\n prev_label = prev_label\n tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.prop_seghead(\n ref_frame_embedding,\n prev_embedding,\n current_embedding,\n scribble_label,\n prev_label,\n normalize_nearest_neighbor_distances=True,\n use_local_map=True,\n seq_names=[sequence],\n gt_ids=paddle.to_tensor([obj_nums]),\n k_nearest_neighbors=cfg.KNNS,\n global_map_tmp_dic=eval_global_map_tmp_dic,\n local_map_dics=local_map_dics,\n interaction_num=n_interaction,\n start_annotated_frame=start_annotated_frame," + }, + { + "comment": "This code appears to be part of a larger function or script. It seems to involve image processing and potentially object detection or classification. The code is looping through frames of an input video, extracting features and predictions from a model, interpolating the predictions for size consistency, then appending the predicted labels (or masks) to a list. It also stores the last prediction for each frame and optionally saves one of those predictions as an image. 
The code appears to be part of an object detection or classification task where it is updating the output based on new frames and previous frames' outputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":374-393", + "content": " frame_num=[current_frame_num],\n dynamic_seghead=model.dynamic_seghead)\n pred_label = tmp_dic[sequence]\n pred_label = nn.functional.interpolate(\n pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_masks_reverse.append(float_(pred_label))\n prev_label = pred_label.unsqueeze(0)\n prev_embedding = current_embedding\n ####\n prev_label_storage[current_frame_num] = float_(\n pred_label[0])\n ###\n if is_save_image:\n pred_label_to_save = pred_label.squeeze(0).numpy()\n im = Image.fromarray(" + }, + { + "comment": "This code saves the predicted label as an image in a specific directory structure based on the current frame number, sequence name, and interaction turn. It ensures that the directory for the given combination of parameters exists before saving the image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":394-411", + "content": " pred_label_to_save.astype('uint8')).convert(\n 'P', )\n im.putpalette(_palette)\n imgname = str(current_frame_num)\n while len(imgname) < 5:\n imgname = '0' + imgname\n if not os.path.exists(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn))):\n os.makedirs(\n os.path.join(\n cfg.RESULT_ROOT, sequence,\n 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn)))\n im.save(\n os.path.join(cfg.RESULT_ROOT, sequence," + }, + { + "comment": "This code appears to be part of an interactive image classification system. The code is submitting masks for each turn and interacts up to 8 times, storing the results in memory and then clearing them after completion. At the end, it prints the total time taken for a single interaction and gets the report and summary from the session. The rough_ROI function seems to calculate distances based on input labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":412-435", + "content": " 'interactive' + str(n_interaction),\n 'turn' + str(inter_turn),\n imgname + '.png'))\n pred_masks_reverse.reverse()\n pred_masks_reverse.extend(pred_masks)\n final_masks = paddle.concat(pred_masks_reverse, 0)\n sess.submit_masks(final_masks.numpy())\n if inter_turn == 3 and n_interaction == 8:\n del eval_global_map_tmp_dic\n del local_map_dics\n del embedding_memory\n del prev_label_storage\n t_end = timeit.default_timer()\n print('Total time for single interaction: ' + str(t_end - t_total))\n report = sess.get_report()\n summary = sess.get_global_summary(\n save_file=os.path.join(report_save_dir, 'summary.json'))\n inter_file.close()\ndef rough_ROI(ref_scribble_labels):\n dist = 20\n b, _, h, w = ref_scribble_labels.shape" + }, + { + "comment": "The code is applying a filter to refine the scribble labels, where it creates a filter based on the position of non-background pixels and then applies it to the original scribble labels. 
The function load_network filters out unnecessary keys from pretrained_dict and overwrites entries in the state dict of the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":436-467", + "content": " filter_ = paddle.zeros_like(ref_scribble_labels)\n to_fill = paddle.zeros_like(ref_scribble_labels)\n for i in range(b):\n no_background = (ref_scribble_labels[i] != -1)\n no_background = no_background.squeeze(0)\n no_b = no_background.nonzero()\n (h_min, w_min) = paddle.min(no_b, 0)\n (h_max, w_max) = paddle.max(no_b, 0)\n filter_[i, 0,\n max(h_min - dist, 0):min(h_max + dist, h - 1),\n max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1\n final_scribble_labels = paddle.where(byte_(filter_), ref_scribble_labels,\n to_fill)\n return final_scribble_labels\ndef load_network(net, pretrained_dict):\n model_dict = net.state_dict()\n # 1. filter out unnecessary keys\n f_pretrained_dict = {}\n for k, v in pretrained_dict.items():\n if k in model_dict:\n f_pretrained_dict[k] = v\n else:\n print(k)\n print(len(model_dict.keys()), len(pretrained_dict.keys()))\n # 2. overwrite entries in the existing state dict" + }, + { + "comment": "This code defines a palette with RGB values for 75 different colors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":468-484", + "content": " model_dict.update(pretrained_dict)\n net.set_state_dict(model_dict)\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0,\n 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43,\n 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75," + }, + { + "comment": "This code appears to be a sequence of numbers, which could potentially represent a list or array in the code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":485-497", + "content": " 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81,\n 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94,\n 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120,\n 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 
139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145," + }, + { + "comment": "This code contains 210 consecutive numbers, possibly representing the iteration or indexing in a loop.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":498-510", + "content": " 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150,\n 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160,\n 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185,\n 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210," + }, + { + "comment": "The code consists of a sequence of integers. It's not executable and doesn't have any apparent function or variable assignments. The specific use case or purpose of these numbers is unclear without further context.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/test.py\":511-524", + "content": " 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215,\n 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225,\n 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250,\n 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\nif __name__ == '__main__':\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5538fc80-7b52-4b10-b854-88f8952bc1e4.json b/docs/doc/5538fc80-7b52-4b10-b854-88f8952bc1e4.json new file mode 100644 index 000000000..7cd1668d2 --- /dev/null +++ b/docs/doc/5538fc80-7b52-4b10-b854-88f8952bc1e4.json @@ -0,0 +1,30 @@ +{ + "summary": "The code retrieves data, applies NMS to bounding box proposals, filters detected actions from videos using NMS, and stores relevant information in the \"video_results\" list. It defines a function `get_action_result` that takes inputs and performs NMS on processed results.", + "details": [ + { + "comment": "This code defines two functions: `get_data_res` and `base_nms`. The first function takes in a label map, data (a list of features), and a topk value. It iterates through each video in the data, extracts relevant information from the feature, and appends this information to a new list called `video_result`. Finally, it returns the `video_result` list. The second function is an incomplete definition for a non-maximum suppression algorithm used for bounding boxes. 
It takes in bboxes (bounding box coordinates), thresh (threshold value), delta (optional parameter with default value 0), and nms_id (an identifier for the NMS operation, with a default value of 2).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py\":0-38", + "content": "\"\"\"\n# @File : process_result.py \n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport sys\nimport os\nimport re\nimport numpy as np\nimport pickle\nimport json\nimport logger\nlogger = logger.Logger()\ndef get_data_res(label_map, data, topk):\n \"\"\"get_data_res\"\"\"\n sum_vid = len(data)\n video_result = []\n for i in range(sum_vid):\n vid_name = data[i][0][0]\n # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa\n feature_start_id = float(data[i][0][1]['start'])\n feature_end_id = float(data[i][0][1]['end'])\n feature_stage1_score = data[i][0][1]['score']\n predict_res = []\n for k in range(topk):\n score_top = data[i][1][k]\n labelid_top = data[i][2][k]\n label_iou = data[i][3]\n labelname_top = label_map[str(labelid_top)]\n video_result.append([feature_start_id, feature_end_id, labelid_top, labelname_top, score_top, label_iou])\n return video_result\ndef base_nms(bboxes, thresh, delta=0, nms_id=2):" + }, + { + "comment": "This code performs non-maximal suppression on bounding box proposals. It filters out overlapping boxes by keeping only those with the highest scores and discarding the rest. The function process_proposal takes source bounding box proposals, applies non-maximal suppression with a threshold, and returns the filtered results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py\":39-75", + "content": " \"\"\"\n One-dimensional non-maximal suppression\n :param bboxes: [[vid, label, st, ed, score, ...], ...]\n :param thresh:\n :return:\n \"\"\"\n \"\"\"\n t1 = bboxes[:, 0]\n t2 = bboxes[:, 1]\n scores = bboxes[:, nms_id]\n \"\"\"\n t1 = np.array([max(0, x[0] - delta) for x in bboxes])\n t2 = np.array([x[1] + delta for x in bboxes])\n scores = np.array([x[nms_id] for x in bboxes])\n durations = t2 - t1\n order = scores.argsort()[::-1]\n keep = []\n while order.size > 0:\n i = order[0]\n keep.append(i)\n tt1 = np.maximum(t1[i], t1[order[1:]])\n tt2 = np.minimum(t2[i], t2[order[1:]])\n intersection = tt2 - tt1\n IoU = intersection / (durations[i] + durations[order[1:]] - intersection).astype(float)\n inds = np.where(IoU <= thresh)[0]\n order = order[inds + 1]\n return [bboxes[i] for i in keep]\ndef process_proposal(source_prop_box, min_frame_thread=5, nms_thresh=0.7, score_thresh=0.01):\n \"\"\"process_video_prop\"\"\"\n prop_box = []\n for items in source_prop_box:" + }, + { + "comment": "This code is part of a video classification process. It filters and sorts the detected actions in a video, discarding background or weak detections. The results are stored in 'prop_res' and 'video_results'. 
The code applies non-maximum suppression (NMS) to filter and sort the detections based on frame duration, score threshold, and other parameters like fps, nms_thread, and nms_delta.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py\":76-106", + "content": " start_frame = float(items[0])\n end_frame = float(items[1])\n score = float(items[2])\n if end_frame - start_frame < min_frame_thread or score < score_thresh:\n continue\n prop_box.append([start_frame, end_frame, score])\n prop_box_keep = base_nms(prop_box, nms_thresh)\n prop_res = []\n for res in prop_box_keep:\n prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]})\n return prop_res\ndef process_video_classify(video_prop, fps, score_thread, iou_thread, \\\n nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0):\n \"\"\"process_video_classify\"\"\"\n prop_filter = []\n for item in video_prop:\n if item[2] == backgroundid:\n continue\n prop_filter.append(item)\n # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True)\n prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id)\n prop_filter = sorted(prop_filter, key=lambda x: x[0])\n video_results = []\n for item in prop_filter:" + }, + { + "comment": "This code calculates the start and end time in seconds, frame IDs, and other relevant details of detected actions from a video. It then appends these details as a dictionary to the \"video_results\" list if the classify score and IoU score exceed certain thresholds.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py\":107-128", + "content": " start_sec = item[0] / fps\n end_sec = item[1] / fps\n start_id_frame = item[0]\n end_id_frame = item[1]\n # start_time = \"%02d:%02d:%02d\" % ((start_id_frame / fps) / 3600, \\\n # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60)\n # end_time = \"%02d:%02d:%02d\" % ((end_id_frame / fps) / 3600, \\\n # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60)\n start_time = int(start_id_frame / fps)\n end_time = int(end_id_frame / fps)\n label_id = item[2]\n label_name = item[3]\n label_classify_score = item[4]\n label_iou_score = item[5]\n if label_classify_score > score_thread and label_iou_score > iou_thread:\n video_results.append({\"start_time\": start_time,\n \"end_time\": end_time,\n \"label_id\": label_id,\n \"label_name\": label_name,\n \"classify_score\": label_classify_score," + }, + { + "comment": "This code defines a function `get_action_result` that takes in `result_info`, `label_map_file`, `fps`, `score_thread`, `iou_thread`, `nms_id`, `nms_thread`, and `frame_offset` as inputs. 
It reads the label map from `label_map_file`, processes the result data using `get_data_res` function, performs non-maximum suppression (NMS) on the processed results with specified parameters, and returns the final NMS results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py\":129-143", + "content": " \"iou_score\": label_iou_score})\n return video_results\ndef get_action_result(result_info, label_map_file, fps, score_thread=0, \\\n iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1):\n \"\"\"get_action_result\"\"\"\n label_map = json.load(open(label_map_file, 'r', encoding='utf-8'))\n org_result = get_data_res(label_map, result_info, topk)\n nms_result = process_video_classify(org_result, fps, score_thread, iou_thread, nms_id, nms_thread, frame_offset)\n return nms_result" + } + ] +} \ No newline at end of file diff --git a/docs/doc/56c474d4-4253-4559-ad0f-108c809f42ab.json b/docs/doc/56c474d4-4253-4559-ad0f-108c809f42ab.json new file mode 100644 index 000000000..398484f0b --- /dev/null +++ b/docs/doc/56c474d4-4253-4559-ad0f-108c809f42ab.json @@ -0,0 +1,30 @@ +{ + "summary": "The code defines BMNINFReader class for reading and processing BMN model data, includes get_sw_prop function generating proposals, filters less than one-second proposals, performs calculations, and creates a reader class to load video data for training or prediction.", + "details": [ + { + "comment": "This code is defining a class BMNINFReader, which extends DataReader and provides functionality for reading data from BMN model. It includes a function get_sw_prop, which generates proposals of a specific window size and step over a given duration. The class also filters out any proposals that are less than one second long.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py\":0-48", + "content": "\"\"\"\n# @File : bmninf_reader.py \n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport os\nimport random\nimport pickle\nimport json\nimport numpy as np\nimport multiprocessing\nimport numpy as np\nfrom .reader_utils import DataReader\ndef get_sw_prop(duration, window=200, step=10):\n \"\"\"\n get_sw_prop\n \"\"\"\n pr = []\n local_boxes = []\n for k in np.arange(0, duration - window + step, step):\n start_id = k\n end_id = min(duration, k + window)\n if end_id - start_id < window:\n start_id = end_id - window\n local_boxes = (start_id, end_id)\n pr.append(local_boxes)\n def valid_proposal(duration, span):\n \"\"\"\n valid_proposal\n \"\"\"\n # fileter proposals\n # a valid proposal should have at least one second in the video\n real_span = min(duration, span[1]) - span[0]\n return real_span >= 1\n pr = list(filter(lambda x: valid_proposal(duration, x), pr))\n return pr\nclass BMNINFReader(DataReader):\n \"\"\"\n Data reader for BMN model, which was stored as features extracted by prior networks" + }, + { + "comment": "This code initializes a class that reads BMNINF data. It takes arguments for name, mode, and configuration (cfg). The tscale and dscale are set from the config file. The tgap, step, image_feature, and pcm_feature variables are calculated and reshaped accordingly. 
The minimum length is computed to ensure that both features have the same length.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py\":49-72", + "content": " dataset cfg: feat_path, feature path,\n tscale, temporal length of BM map,\n dscale, duration scale of BM map,\n anchor_xmin, anchor_xmax, the range of each point in the feature sequence,\n batch_size, batch size of input data,\n num_threads, number of threads of data processing\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.tscale = cfg[self.name.upper()]['tscale'] # 200\n self.dscale = cfg[self.name.upper()]['dscale'] # 200\n self.tgap = 1. / self.tscale\n self.step = cfg[self.name.upper()]['window_step']\n self.material = material\n src_feature = self.material\n image_feature = src_feature['image_feature']\n pcm_feature = src_feature['pcm_feature']\n pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n image_feature = image_feature[:min_length, :]" + }, + { + "comment": "This code defines a class with methods for getting the dataset dictionary and match map. It takes the configuration file as input, extracts relevant features from image and pcm data, sets the batch size and number of threads based on mode, and creates the video list and match map using duration, window size, step, and gap values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py\":73-104", + "content": " pcm_feature = pcm_feature[:min_length, :]\n self.features = np.concatenate((image_feature, pcm_feature), axis=1)\n self.duration = len(self.features)\n self.window = self.tscale\n self.get_dataset_dict()\n self.get_match_map()\n self.batch_size = cfg[self.name.upper()]['batch_size']\n if (mode == 'test') or (mode == 'infer'):\n self.num_threads = 1 # set num_threads as 1 for test and infer\n def get_dataset_dict(self):\n \"\"\"\n get_dataset_dict\n \"\"\"\n self.video_list = get_sw_prop(self.duration, self.window, self.step)\n def get_match_map(self):\n \"\"\"\n get_match_map\n \"\"\"\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx\n for jdx in range(1, self.tscale + 1):\n xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)" + }, + { + "comment": "This part of the BMN inference reader transposes, reshapes, and stores the match_map data. The load_file function loads video features based on start and end feature IDs. The create_reader function creates an inference reader. 
Finally, the make_infer_reader function returns a reader for inference tasks that iterates through video windows.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py\":105-140", + "content": " match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n self.match_map = match_map\n self.anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n def load_file(self, video_wind):\n \"\"\"\n load_file\n \"\"\"\n start_feat_id = video_wind[0]\n end_feat_id = video_wind[1]\n video_feat = self.features[video_wind[0]: video_wind[1]]\n video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n return video_feat\n def create_reader(self):\n \"\"\"\n reader creator for ctcn model\n \"\"\"\n return self.make_infer_reader()\n def make_infer_reader(self):\n \"\"\"\n reader for inference\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n # for video_name in self.video_list:\n for video_wind in self.video_list:\n video_idx = self.video_list.index(video_wind)" + }, + { + "comment": "This code defines a reader class that loads and processes video data, creating batches of features for model training or prediction. It uses the `load_file` method to read video files and appends them to the `batch_out` list. When the list reaches the specified `batch_size`, it yields the batch and resets the list. If there are remaining items in the list upon exiting the function, it yields those final batches.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py\":141-150", + "content": " video_feat = self.load_file(video_wind)\n batch_out.append((video_feat, video_wind, [self.duration, self.dscale]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader" + } + ] +} \ No newline at end of file diff --git a/docs/doc/577fb8a8-7e54-4187-8ddc-dd58514de87f.json b/docs/doc/577fb8a8-7e54-4187-8ddc-dd58514de87f.json new file mode 100644 index 000000000..c0303d80b --- /dev/null +++ b/docs/doc/577fb8a8-7e54-4187-8ddc-dd58514de87f.json @@ -0,0 +1,35 @@ +{ + "summary": "This code deploys a PaddlePaddle model for serving using PaddleServing in PaddleVideo, supporting GPU and CPU installations on Linux platforms. Input/output variables are set for Python serving, and the RPC method is used for prediction. Results are displayed in cmd window.", + "details": [ + { + "comment": "This code provides instructions on how to deploy a model service using PaddleServing in the PaddleVideo platform. It starts by explaining that this deployment example uses an HTTP prediction server and is currently only supported on Linux platforms. 
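Before the serving guide continues, here is a compact sketch of the batch-yielding generator pattern that `make_infer_reader` (summarized above) relies on; the loader and window list are toy stand-ins, not repository code.

```python
def make_batches(windows, load_fn, batch_size=4):
    """Yield full batches of loaded windows plus one final partial batch,
    mirroring the generator structure of make_infer_reader above."""
    batch = []
    for wind in windows:
        batch.append(load_fn(wind))
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:                 # flush whatever is left over
        yield batch

# toy usage: an identity "loader" over ten dummy windows
for batch in make_batches(list(range(10)), lambda w: w, batch_size=4):
    print(batch)              # [0..3], [4..7], [8, 9]
```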
The instructions then cover how to install Serving, specifying steps for both GPU-accelerated docker installation and CPU-only docker installation, as well as installing the necessary Python packages.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme.md\":0-31", + "content": "\u7b80\u4f53\u4e2d\u6587 | [English](./readme_en.md)\n# \u6a21\u578b\u670d\u52a1\u5316\u90e8\u7f72\n## \u7b80\u4ecb\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) \u65e8\u5728\u5e2e\u52a9\u6df1\u5ea6\u5b66\u4e60\u5f00\u53d1\u8005\u8f7b\u677e\u90e8\u7f72\u5728\u7ebf\u9884\u6d4b\u670d\u52a1\uff0c\u652f\u6301\u4e00\u952e\u90e8\u7f72\u5de5\u4e1a\u7ea7\u7684\u670d\u52a1\u80fd\u529b\u3001\u5ba2\u6237\u7aef\u548c\u670d\u52a1\u7aef\u4e4b\u95f4\u9ad8\u5e76\u53d1\u548c\u9ad8\u6548\u901a\u4fe1\u3001\u5e76\u652f\u6301\u591a\u79cd\u7f16\u7a0b\u8bed\u8a00\u5f00\u53d1\u5ba2\u6237\u7aef\u3002\n\u8be5\u90e8\u5206\u4ee5 HTTP \u9884\u6d4b\u670d\u52a1\u90e8\u7f72\u4e3a\u4f8b\uff0c\u4ecb\u7ecd\u600e\u6837\u5728 PaddleVideo \u4e2d\u4f7f\u7528 PaddleServing \u90e8\u7f72\u6a21\u578b\u670d\u52a1\u3002\u76ee\u524d\u53ea\u652f\u6301 Linux \u5e73\u53f0\u90e8\u7f72\uff0c\u6682\u4e0d\u652f\u6301 Windows \u5e73\u53f0\u3002\n## Serving \u5b89\u88c5\nServing \u5b98\u7f51\u63a8\u8350\u4f7f\u7528 docker \u5b89\u88c5\u5e76\u90e8\u7f72 Serving \u73af\u5883\u3002\u9996\u5148\u9700\u8981\u62c9\u53d6 docker \u73af\u5883\u5e76\u521b\u5efa\u57fa\u4e8e Serving \u7684 docker\u3002\n```bash\n# \u542f\u52a8GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash\nnvidia-docker exec -it test bash\n# \u542f\u52a8CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\n\u8fdb\u5165 docker \u540e\uff0c\u9700\u8981\u5b89\u88c5 Serving \u76f8\u5173\u7684 python \u5305\u3002\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\npython3.7 -m pip install faiss-cpu==1.7.1post2\n#\u82e5\u4e3aCPU\u90e8\u7f72\u73af\u5883:\npython3.7 -m pip install paddle-serving-server==0.7.0 # CPU" + }, + { + "comment": "Install PaddlePaddle for CPU and GPU environments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme.md\":32-57", + "content": "python3.7 -m pip install paddlepaddle==2.2.0 # CPU\n#\u82e5\u4e3aGPU\u90e8\u7f72\u73af\u5883\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#\u5176\u4ed6GPU\u73af\u5883\u9700\u8981\u786e\u8ba4\u73af\u5883\u518d\u9009\u62e9\u6267\u884c\u54ea\u4e00\u6761\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8\n```\n* \u5982\u679c\u5b89\u88c5\u901f\u5ea6\u592a\u6162\uff0c\u53ef\u4ee5\u901a\u8fc7 `-i https://pypi.tuna.tsinghua.edu.cn/simple` \u66f4\u6362\u6e90\uff0c\u52a0\u901f\u5b89\u88c5\u8fc7\u7a0b\n* \u66f4\u591a\u73af\u5883\u548c\u5bf9\u5e94\u7684\u5b89\u88c5\u5305\u8be6\u89c1\uff1ahttps://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## \u884c\u4e3a\u8bc6\u522b\u670d\u52a1\u90e8\u7f72\n### \u6a21\u578b\u8f6c\u6362\n\u4f7f\u7528 PaddleServing 
\u505a\u670d\u52a1\u5316\u90e8\u7f72\u65f6\uff0c\u9700\u8981\u5c06\u4fdd\u5b58\u7684 inference \u6a21\u578b\u8f6c\u6362\u4e3a Serving \u6a21\u578b\u3002\u4e0b\u9762\u4ee5 PP-TSM \u6a21\u578b\u4e3a\u4f8b\uff0c\u4ecb\u7ecd\u5982\u4f55\u90e8\u7f72\u884c\u4e3a\u8bc6\u522b\u670d\u52a1\u3002\n- \u4e0b\u8f7d\u8bad\u7ec3\u597d\u7684 PP-TSM \u7684\u6a21\u578b\uff0c\u5e76\u8f6c\u5316\u4e3a\u63a8\u7406\u6a21\u578b\uff1a\n ```bash\n # \u8fdb\u5165PaddleVideo\u76ee\u5f55\n cd PaddleVideo\n wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams\n python3.7 tools/export_model.py \\\n -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\" + }, + { + "comment": "This code is converting a pre-trained PaddlePaddle model to a format suitable for serving on the server. It downloads and unzips the pre-trained model, then uses paddle_serving_client to convert it into the correct format for deployment with specified directories for serving server and client. The `dirname` specifies where the pre-trained model files are stored, while `model_filename` names the Inference Program structure file, defaulting to \"__model__\" if not specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme.md\":58-82", + "content": " -p data/ppTSM_k400_uniform.pdparams \\\n -o inference/ppTSM\n ```\n- \u6211\u4eec\u4e5f\u63d0\u4f9b\u4e86\u8f6c\u6362\u597d\u7684\u63a8\u7406\u6a21\u578b\uff0c\u6309\u4ee5\u4e0b\u547d\u4ee4\u4e0b\u8f7d\u5e76\u89e3\u538b\n ```bash\n mkdir ./inference\n wget -nc -P ./inference https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate\n pushd ./inference\n unzip ppTSM.zip\n popd\n ```\n- \u7528 paddle_serving_client \u628a\u8f6c\u6362\u597d\u7684\u63a8\u7406\u6a21\u578b\u518d\u8f6c\u6362\u6210\u6613\u4e8e Server \u90e8\u7f72\u7684\u6a21\u578b\u683c\u5f0f\uff1a\n ```bash\n python3.7 -m paddle_serving_client.convert \\\n --dirname inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./deploy/python_serving/ppTSM_serving_server/ \\\n --serving_client ./deploy/python_serving/ppTSM_serving_client/\n ```\n | \u53c2\u6570 | \u7c7b\u578b | \u9ed8\u8ba4\u503c | \u63cf\u8ff0 |\n | ----------------- | ---- | ------------------ | ------------------------------------------------------------ |\n | `dirname` | str | - | \u9700\u8981\u8f6c\u6362\u7684\u6a21\u578b\u6587\u4ef6\u5b58\u50a8\u8def\u5f84\uff0cProgram\u7ed3\u6784\u6587\u4ef6\u548c\u53c2\u6570\u6587\u4ef6\u5747\u4fdd\u5b58\u5728\u6b64\u76ee\u5f55\u3002 |\n | `model_filename` | str | None | \u5b58\u50a8\u9700\u8981\u8f6c\u6362\u7684\u6a21\u578bInference Program\u7ed3\u6784\u7684\u6587\u4ef6\u540d\u79f0\u3002\u5982\u679c\u8bbe\u7f6e\u4e3aNone\uff0c\u5219\u4f7f\u7528 `__model__` \u4f5c\u4e3a\u9ed8\u8ba4\u7684\u6587\u4ef6\u540d |" + }, + { + "comment": "The code provides parameters for the PP-TSM model transformation, including a parameter file name (params_filename), and paths to store the converted model files (serving_server) and client configuration files (serving_client). The resulting files will be organized in separate folders (ppTSM_serving_server and ppTSM_serving_client), with specific formats. 
The alias names 'outputs' must be set for both fetch_var in serving_server_conf.prototxt to ensure compatibility and easy deployment of different models without modifying the code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme.md\":83-102", + "content": " | `params_filename` | str | None | \u5b58\u50a8\u9700\u8981\u8f6c\u6362\u7684\u6a21\u578b\u6240\u6709\u53c2\u6570\u7684\u6587\u4ef6\u540d\u79f0\u3002\u5f53\u4e14\u4ec5\u5f53\u6240\u6709\u6a21\u578b\u53c2\u6570\u88ab\u4fdd>\u5b58\u5728\u4e00\u4e2a\u5355\u72ec\u7684\u4e8c\u8fdb\u5236\u6587\u4ef6\u4e2d\uff0c\u5b83\u624d\u9700\u8981\u88ab\u6307\u5b9a\u3002\u5982\u679c\u6a21\u578b\u53c2\u6570\u662f\u5b58\u50a8\u5728\u5404\u81ea\u5206\u79bb\u7684\u6587\u4ef6\u4e2d\uff0c\u8bbe\u7f6e\u5b83\u7684\u503c\u4e3aNone |\n | `serving_server` | str | `\"serving_server\"` | \u8f6c\u6362\u540e\u7684\u6a21\u578b\u6587\u4ef6\u548c\u914d\u7f6e\u6587\u4ef6\u7684\u5b58\u50a8\u8def\u5f84\u3002\u9ed8\u8ba4\u503c\u4e3aserving_server |\n | `serving_client` | str | `\"serving_client\"` | \u8f6c\u6362\u540e\u7684\u5ba2\u6237\u7aef\u914d\u7f6e\u6587\u4ef6\u5b58\u50a8\u8def\u5f84\u3002\u9ed8\u8ba4\u503c\u4e3aserving_client |\nPP-TSM \u63a8\u7406\u6a21\u578b\u8f6c\u6362\u5b8c\u6210\u540e\uff0c\u4f1a\u5728\u5f53\u524d\u6587\u4ef6\u5939\u591a\u51fa `ppTSM_serving_server` \u548c `ppTSM_serving_client` \u7684\u6587\u4ef6\u5939\uff0c\u5177\u5907\u5982\u4e0b\u683c\u5f0f\uff1a\n ```bash\n PaddleVideo/deploy/python_serving\n \u251c\u2500\u2500 ppTSM_serving_server\n \u251c\u2500\u2500 ppTSM.pdiparams\n \u251c\u2500\u2500 ppTSM.pdmodel\n \u251c\u2500\u2500 serving_server_conf.prototxt\n \u2514\u2500\u2500 serving_server_conf.stream.prototxt\n \u251c\u2500\u2500 ppTSM_serving_client\n \u251c\u2500\u2500 serving_client_conf.prototxt\n \u2514\u2500\u2500 serving_client_conf.stream.prototxt\n ```\n\u5f97\u5230\u6a21\u578b\u6587\u4ef6\u4e4b\u540e\uff0c\u9700\u8981\u5206\u522b\u4fee\u6539 `ppTSM_serving_server` \u548c `ppTSM_serving_client` \u4e0b\u7684\u6587\u4ef6 `serving_server_conf.prototxt`\uff0c\u5c06 \u4e24\u4efd\u6587\u4ef6\u4e2d`fetch_var` \u4e0b\u7684 `alias_name` \u5747\u6539\u4e3a `outputs`\n**\u5907\u6ce8**: Serving \u4e3a\u4e86\u517c\u5bb9\u4e0d\u540c\u6a21\u578b\u7684\u90e8\u7f72\uff0c\u63d0\u4f9b\u4e86\u8f93\u5165\u8f93\u51fa\u91cd\u547d\u540d\u7684\u529f\u80fd\u3002\u8fd9\u6837\uff0c\u4e0d\u540c\u7684\u6a21\u578b\u5728\u63a8\u7406\u90e8\u7f72\u65f6\uff0c\u53ea\u9700\u8981\u4fee\u6539\u914d\u7f6e\u6587\u4ef6\u7684`alias_name`\u5373\u53ef\uff0c\u65e0\u9700\u4fee\u6539\u4ee3\u7801\u5373\u53ef\u5b8c\u6210\u63a8\u7406\u90e8\u7f72\u3002\n\u4fee\u6539\u540e\u7684`serving_server_conf.prototxt`\u5982\u4e0b\u6240\u793a:" + }, + { + "comment": "The code represents the configuration for input (\"feed_var\") and output (\"fetch_var\") variables in the PaddleVideo deployment's Python serving. The input variable has a shape of 8,3,224,224 and the output variable has a shape of 400. 
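The `alias_name` edit described above is a plain text change to the generated prototxt files. As one possible way to script it, the hypothetical helper below rewrites the alias inside the `fetch_var` block to "outputs"; the file paths follow the directory tree shown above, and the single-`fetch_var` layout is an assumption taken from the example config.

```python
import re
from pathlib import Path

def set_fetch_alias(conf_path, alias="outputs"):
    """Rewrite alias_name inside the (assumed single) fetch_var block."""
    text = Path(conf_path).read_text(encoding="utf-8")
    fixed = re.sub(r'(fetch_var\s*\{[^}]*?alias_name:\s*")[^"]*(")',
                   r'\g<1>' + alias + r'\g<2>', text)
    Path(conf_path).write_text(fixed, encoding="utf-8")

# run from deploy/python_serving after the conversion step
for conf in ["ppTSM_serving_server/serving_server_conf.prototxt",
             "ppTSM_serving_client/serving_client_conf.prototxt"]:
    set_fetch_alias(conf)
```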
This code is part of the setup process for sending prediction requests to the PaddleVideo pipeline service using either HTTP or RPC methods.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme.md\":104-151", + "content": "```yaml\nfeed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n}\nfetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false\n fetch_type: 1\n shape: 400\n}\n```\n### \u670d\u52a1\u90e8\u7f72\u548c\u8bf7\u6c42\n`python_serving` \u76ee\u5f55\u5305\u542b\u4e86\u542f\u52a8 pipeline \u670d\u52a1\u3001C++ serving\u670d\u52a1(TODO)\u548c\u53d1\u9001\u9884\u6d4b\u8bf7\u6c42\u7684\u4ee3\u7801\uff0c\u5177\u4f53\u5305\u62ec\uff1a\n```bash\n__init__.py\nconfigs/xxx.yaml # \u542f\u52a8pipeline\u670d\u52a1\u7684\u914d\u7f6e\u6587\u4ef6\npipeline_http_client.py # http\u65b9\u5f0f\u53d1\u9001pipeline\u9884\u6d4b\u8bf7\u6c42\u7684python\u811a\u672c\npipeline_rpc_client.py # rpc\u65b9\u5f0f\u53d1\u9001pipeline\u9884\u6d4b\u8bf7\u6c42\u7684python\u811a\u672c\nrecognition_web_service.py # \u542f\u52a8pipeline\u670d\u52a1\u7aef\u7684python\u811a\u672c\nutils.py # \u50a8\u5b58\u9884\u6d4b\u8fc7\u7a0b\u4e2d\u5e38\u7528\u7684\u51fd\u6570\uff0c\u5982parse_file_paths, numpy_to_base64, video_to_numpy\n```\n#### Python Serving\n- \u8fdb\u5165\u5de5\u4f5c\u76ee\u5f55\uff1a\n```bash\ncd deploy/python_serving\n```\n- \u542f\u52a8\u670d\u52a1\uff1a\n```bash\n# \u5728\u5f53\u524d\u547d\u4ee4\u884c\u7a97\u53e3\u542f\u52a8\u5e76\u4fdd\u6301\u5728\u524d\u7aef\npython3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml\n# \u5728\u540e\u53f0\u542f\u52a8\uff0c\u8fc7\u7a0b\u4e2d\u6253\u5370\u8f93\u51fa\u7684\u65e5\u5fd7\u4f1a\u91cd\u5b9a\u5411\u4fdd\u5b58\u5230log.txt\u4e2d\npython3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml &>log.txt &\n```\n- \u53d1\u9001\u8bf7\u6c42\uff1a\n```bash\n# \u4ee5http\u65b9\u5f0f\u7684\u53d1\u9001\u9884\u6d4b\u8bf7\u6c42\u5e76\u63a5\u53d7\u7ed3\u679c\npython3.7 pipeline_http_client.py -i ../../data/example.avi" + }, + { + "comment": "This code demonstrates running the PaddleVideo model for prediction using the RPC (Remote Procedure Call) method. 
The command \"python3.7 pipeline_rpc_client.py -i ../../data/example.avi\" is used to execute the prediction, and the results are printed in the cmd window.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/readme.md\":153-184", + "content": "# \u4ee5rpc\u65b9\u5f0f\u7684\u53d1\u9001\u9884\u6d4b\u8bf7\u6c42\u5e76\u63a5\u53d7\u7ed3\u679c\npython3.7 pipeline_rpc_client.py -i ../../data/example.avi\n```\n\u6210\u529f\u8fd0\u884c\u540e\uff0c\u6a21\u578b\u9884\u6d4b\u7684\u7ed3\u679c\u4f1a\u6253\u5370\u5728 cmd \u7a97\u53e3\u4e2d\uff0c\u7ed3\u679c\u5982\u4e0b\uff1a\n```bash\n# http\u65b9\u5f0f\u6253\u5370\u7684\u7ed3\u679c\n{'err_no': 0, 'err_msg': '', 'key': ['label', 'prob'], 'value': [\"['archery']\", '[0.9907388687133789]'], 'tensors': []}\n# rpc\u65b9\u5f0f\u6253\u5370\u7684\u7ed3\u679c\nPipelineClient::predict pack_data time:1645631086.764019\nPipelineClient::predict before time:1645631086.8485317\nkey: \"label\"\nkey: \"prob\"\nvalue: \"[\\'archery\\']\"\nvalue: \"[0.9907388687133789]\"\n```\n## FAQ\n**Q1**\uff1a \u53d1\u9001\u8bf7\u6c42\u540e\u6ca1\u6709\u7ed3\u679c\u8fd4\u56de\u6216\u8005\u63d0\u793a\u8f93\u51fa\u89e3\u7801\u62a5\u9519\n**A1**\uff1a \u542f\u52a8\u670d\u52a1\u548c\u53d1\u9001\u8bf7\u6c42\u65f6\u4e0d\u8981\u8bbe\u7f6e\u4ee3\u7406\uff0c\u53ef\u4ee5\u5728\u542f\u52a8\u670d\u52a1\u524d\u548c\u53d1\u9001\u8bf7\u6c42\u524d\u5173\u95ed\u4ee3\u7406\uff0c\u5173\u95ed\u4ee3\u7406\u7684\u547d\u4ee4\u662f\uff1a\n```\nunset https_proxy\nunset http_proxy\n```\n**Q2**\uff1a \u670d\u52a1\u7aef\u542f\u52a8\u540e\u6ca1\u6709\u53cd\u5e94\uff0c\u4e00\u76f4\u505c\u5728`start proxy service`\u4e0d\u52a8\n**A2**\uff1a \u5f88\u53ef\u80fd\u662f\u542f\u52a8\u8fc7\u7a0b\u4e2d\u9047\u5230\u4e86\u95ee\u9898\uff0c\u53ef\u4ee5\u5728`./deploy/python_serving/PipelineServingLogs/pipeline.log`\u65e5\u5fd7\u6587\u4ef6\u4e2d\u67e5\u770b\u8be6\u7ec6\u62a5\u9519\u4fe1\u606f\n\u66f4\u591a\u7684\u670d\u52a1\u90e8\u7f72\u7c7b\u578b\uff0c\u5982 `RPC \u9884\u6d4b\u670d\u52a1` \u7b49\uff0c\u53ef\u4ee5\u53c2\u8003 Serving \u7684[github \u5b98\u7f51](https://github.com/PaddlePaddle/Serving/tree/v0.7.0/examples)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5797c0a4-1a31-4b45-bb26-5a35b4cf2ce2.json b/docs/doc/5797c0a4-1a31-4b45-bb26-5a35b4cf2ce2.json new file mode 100644 index 000000000..709d7f6d6 --- /dev/null +++ b/docs/doc/5797c0a4-1a31-4b45-bb26-5a35b4cf2ce2.json @@ -0,0 +1,20 @@ +{ + "summary": "The CTRGCNHead class is a neural network head for the CTR-GCN model in PaddleVideo library, containing layers initialization, weight initialization, and forward pass function definition. The ctrgcn_head class returns the result of passing input x through a fully connected layer (fc) for feature processing and prediction.", + "details": [ + { + "comment": "This code snippet is a part of the PaddleVideo library, specifically the CTRGCNHead class. It is a neural network head for the CTR-GCN model that takes in input feature channels and outputs the number of classes, with an optional dropout ratio. The code imports necessary libraries, registers the class under the HEADS registry, and defines the class itself as part of the BaseHead class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ctrgcn_head.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass CTRGCNHead(BaseHead):\n \"\"\"\n Head for CTR-GCN model.\n Args:\n in_channels: int, input feature channels. Default: 64.\n num_classes: int, output the number of classes.\n drop_out: float, dropout ratio of layer. Default: 0.\n \"\"\"" + }, + { + "comment": "Class constructor for a neural network head with optional dropout. Initializes layers, applies weight initialization, and defines the forward pass function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ctrgcn_head.py\":33-62", + "content": " def __init__(self, in_channels=64, num_classes=10, drop_out=0, **kwargs):\n super().__init__(num_classes, in_channels, **kwargs)\n self.in_channels = in_channels\n self.drop_out = drop_out\n self.fc = nn.Linear(self.in_channels * 4, self.num_classes)\n if drop_out:\n self.drop_out = nn.Dropout(self.drop_out)\n else:\n self.drop_out = lambda x: x\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n weight_init_(layer.weight,\n 'Normal',\n mean=0.0,\n std=math.sqrt(2. / self.num_classes))\n def forward(self, output_patch):\n \"\"\"Define how the head is going to run.\n \"\"\"\n x, N, M = output_patch\n # N*M,C,T,V\n _, c_new, T, V = x.shape\n x = paddle.reshape(x, shape=[N, M, c_new, T * V])\n x = x.mean(3).mean(1)\n x = self.drop_out(x)" + }, + { + "comment": "This code snippet is from the ctrgcn_head class, and it returns the result of passing the input x through a fully connected layer (fc). The purpose might be to process the features extracted by the previous layers in the model for making predictions or generating output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ctrgcn_head.py\":64-64", + "content": " return self.fc(x)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/57d1a9e8-3e06-4b6c-be05-05a9659ff66b.json b/docs/doc/57d1a9e8-3e06-4b6c-be05-05a9659ff66b.json new file mode 100644 index 000000000..805f367cd --- /dev/null +++ b/docs/doc/57d1a9e8-3e06-4b6c-be05-05a9659ff66b.json @@ -0,0 +1,15 @@ +{ + "summary": "This code is a PaddlePaddle base class for loss functions, requiring subclasses to override `_forward()` and supports optional weight scaling. It initializes the loss class and defines forward pass computation.", + "details": [ + { + "comment": "This code is the base class for a loss function in PaddlePaddle. It requires subclasses to override the `_forward()` method, which returns the normal loss without weights. 
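To illustrate the contract just described, here is a minimal subclass sketch (assuming PaddleVideo is importable): only `_forward` is overridden, and the inherited `forward` applies the `loss_weight` scaling. `L1ExampleLoss` is a made-up name for illustration, not a loss shipped with the repository.

```python
import paddle
import paddle.nn.functional as F
from paddlevideo.modeling.losses.base import BaseWeightedLoss

class L1ExampleLoss(BaseWeightedLoss):
    """Toy subclass: the unweighted loss lives in _forward, the weighting
    is handled by the inherited forward()."""
    def _forward(self, pred, target):
        return F.l1_loss(pred, target)

loss_fn = L1ExampleLoss(loss_weight=0.5)
pred, target = paddle.rand([4, 10]), paddle.rand([4, 10])
print(loss_fn(pred, target))   # equals 0.5 * l1_loss(pred, target)
```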
The `loss_weight` parameter is optional and defaults to 1.0, which can be used to scale the final loss value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/base.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\n#XXX use _forward?? or forward??\nclass BaseWeightedLoss(nn.Layer):\n \"\"\"Base class for loss.\n All subclass should overwrite the ``_forward()`` method which returns the\n normal loss without loss weights.\n Args:\n loss_weight (float): Factor scalar multiplied on the loss.\n Default: 1.0.\n \"\"\"\n def __init__(self, loss_weight=1.0):" + }, + { + "comment": "Initializes the loss class with a weight and defines forward pass for computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/base.py\":31-48", + "content": " super().__init__()\n self.loss_weight = loss_weight\n @abstractmethod\n def _forward(self, *args, **kwargs):\n pass\n def forward(self, *args, **kwargs):\n \"\"\"Defines the computation performed at every call.\n Args:\n *args: The positional arguments for the corresponding\n loss.\n **kwargs: The keyword arguments for the corresponding\n loss.\n Returns:\n paddle.Tensor: The calculated loss.\n \"\"\"\n return self._forward(*args, **kwargs) * self.loss_weight" + } + ] +} \ No newline at end of file diff --git a/docs/doc/57e1d0db-97fa-4149-8697-27535f8ae236.json b/docs/doc/57e1d0db-97fa-4149-8697-27535f8ae236.json new file mode 100644 index 000000000..e3170ec90 --- /dev/null +++ b/docs/doc/57e1d0db-97fa-4149-8697-27535f8ae236.json @@ -0,0 +1,30 @@ +{ + "summary": "The code imports modules, defines Kaiming uniform initialization and SingleStageModel class. It initializes MSTCN backbone with DilatedResidualLayer stages and applies softmax to previous outputs, concatenating them together while initializing weights for convolutional layers with KaimingUniform_like_torch.", + "details": [ + { + "comment": "This code snippet appears to be part of a larger file and sets up some initial definitions, imports, and checks for necessary conditions. It includes license information, imports various modules, and defines a function to calculate fan-in and fan-out for tensor dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ms_tcn.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nimport copy\nimport random\nimport math\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = len(tensor.shape)\n if dimensions < 2:\n raise ValueError(\"Fan in and fan out can not be computed \\\n for tensor with fewer than 2 dimensions\")" + }, + { + "comment": "This code defines three functions: `_calculate_fan_in_and_fan_out`, `calculate_gain`, and `KaimingUniform_like_torch`. The first function calculates the fan-in and fan-out values based on the input tensor's dimensions. The second function determines the gain value depending on the nonlinearity used. The third function applies the Kaiming uniform initialization to the weight_npy parameter, utilizing the previous two functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ms_tcn.py\":33-67", + "content": " if dimensions == 2: # Linear\n fan_in = tensor.shape[1]\n fan_out = tensor.shape[0]\n else:\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:\n receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef calculate_gain(nonlinearity=None, a=None):\n if nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if a != None:\n return math.sqrt(2.0 / (1 + a**2))\n else:\n return math.sqrt(2.0 / (1 + 0.01**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4\n else:\n return 1\ndef KaimingUniform_like_torch(weight_npy,\n mode='fan_in',\n nonlinearity='leaky_relu'):\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)" + }, + { + "comment": "This code defines a SingleStageModel class that inherits from nn.Layer and consists of a convolutional layer, multiple DilatedResidualLayers, and another convolutional layer. 
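The three helpers summarized above (fan computation, gain, Kaiming-uniform sampling) reduce to a single sampling bound. The standalone sketch below recomputes that bound for a toy Conv1D weight shape; it mirrors the quoted formulas rather than calling the repository code.

```python
import math
import numpy as np

def kaiming_uniform_bound(fan_in, a=math.sqrt(5.0)):
    """bound = sqrt(3) * gain / sqrt(fan_in), with the leaky_relu gain
    sqrt(2 / (1 + a**2)) used by KaimingUniform_like_torch above."""
    gain = math.sqrt(2.0 / (1 + a ** 2))
    return math.sqrt(3.0) * gain / math.sqrt(fan_in)

# Conv1D weight of shape (out_c, in_c, k): fan_in = in_c * k
out_c, in_c, k = 64, 64, 3
bound = kaiming_uniform_bound(in_c * k)
weight = np.random.uniform(-bound, bound, size=(out_c, in_c, k)).astype("float32")
print(round(bound, 5), weight.shape)
```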
The model is initialized with specified parameters: number of layers, number of feature maps, input dimension, and number of output classes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ms_tcn.py\":68-99", + "content": " if mode == 'fan_in':\n fan_mode = fan_in\n else:\n fan_mode = fan_out\n a = math.sqrt(5.0)\n gain = calculate_gain(nonlinearity=nonlinearity, a=a)\n std = gain / math.sqrt(fan_mode)\n bound = math.sqrt(3.0) * std\n return np.random.uniform(-bound, bound, weight_npy.shape)\ndef init_bias(weight_npy, bias_npy):\n # attention this weight is not bias\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)\n bound = 1.0 / math.sqrt(fan_in)\n return np.random.uniform(-bound, bound, bias_npy.shape)\nclass SingleStageModel(nn.Layer):\n def __init__(self, num_layers, num_f_maps, dim, num_classes):\n super(SingleStageModel, self).__init__()\n self.conv_in = nn.Conv1D(dim, num_f_maps, 1)\n self.layers = nn.LayerList([\n copy.deepcopy(DilatedResidualLayer(2**i, num_f_maps, num_f_maps))\n for i in range(num_layers)\n ])\n self.conv_out = nn.Conv1D(num_f_maps, num_classes, 1)\n def forward(self, x):\n out = self.conv_in(x)\n for layer in self.layers:" + }, + { + "comment": "The code defines a DilatedResidualLayer, which is a type of residual layer used in the MSTCN backbone. The MSTCN class initializes a SingleStageModel and a list of stages using the provided parameters. Each stage within the model is an instance of the DilatedResidualLayer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ms_tcn.py\":100-131", + "content": " out = layer(out)\n out = self.conv_out(out)\n return out\nclass DilatedResidualLayer(nn.Layer):\n def __init__(self, dilation, in_channels, out_channels):\n super(DilatedResidualLayer, self).__init__()\n self.conv_dilated = nn.Conv1D(in_channels,\n out_channels,\n 3,\n padding=dilation,\n dilation=dilation)\n self.conv_in = nn.Conv1D(out_channels, out_channels, 1)\n self.dropout = nn.Dropout()\n def forward(self, x):\n out = F.relu(self.conv_dilated(x))\n out = self.conv_in(out)\n out = self.dropout(out)\n return (x + out)\n@BACKBONES.register()\nclass MSTCN(nn.Layer):\n def __init__(self, num_stages, num_layers, num_f_maps, dim, num_classes):\n super().__init__()\n self.stage1 = SingleStageModel(num_layers, num_f_maps, dim, num_classes)\n self.stages = nn.LayerList([\n copy.deepcopy(" + }, + { + "comment": "The code defines a forward function for MSTCN model and initializes the weights for convolutional layers. It iterates over stages, applying softmax to previous output and concatenating it to previous outputs. 
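The stage-stacking data flow of `MSTCN.forward` can be seen with a few random Conv1D layers standing in for the real stages; the sketch below only demonstrates that flow (softmax of the previous stage's logits feeding the next stage, outputs stacked along a new axis) and is not trained or part of the repository.

```python
import paddle
import paddle.nn.functional as F

num_stages, N, C, T = 3, 2, 5, 8               # C stands in for num_classes
stages = [paddle.nn.Conv1D(C, C, 1) for _ in range(num_stages)]

x = paddle.rand([N, C, T])
out = stages[0](x)
outputs = out.unsqueeze(0)                     # (1, N, C, T)
for stage in stages[1:]:
    out = stage(F.softmax(out, axis=1))        # refine previous predictions
    outputs = paddle.concat((outputs, out.unsqueeze(0)), axis=0)
print(outputs.shape)                           # [3, 2, 5, 8]
```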
Weights are initialized with KaimingUniform_like_torch for conv1D layers and bias is set according to the layer's weight.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/ms_tcn.py\":132-153", + "content": " SingleStageModel(num_layers, num_f_maps, num_classes,\n num_classes)) for s in range(num_stages - 1)\n ])\n def forward(self, x):\n \"\"\" MSTCN forward\n \"\"\"\n out = self.stage1(x)\n outputs = out.unsqueeze(0)\n for s in self.stages:\n out = s(F.softmax(out, axis=1))\n outputs = paddle.concat((outputs, out.unsqueeze(0)), axis=0)\n return outputs\n def init_weights(self):\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv1D):\n layer.weight.set_value(\n KaimingUniform_like_torch(layer.weight).astype('float32'))\n if layer.bias is not None:\n layer.bias.set_value(\n init_bias(layer.weight, layer.bias).astype('float32'))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/57ebb0bb-5f73-4cec-b336-49525daca8bd.json b/docs/doc/57ebb0bb-5f73-4cec-b336-49525daca8bd.json new file mode 100644 index 000000000..73dabc71f --- /dev/null +++ b/docs/doc/57ebb0bb-5f73-4cec-b336-49525daca8bd.json @@ -0,0 +1,10 @@ +{ + "summary": "This script downloads and extracts four tar files (audio, pptsm, bmn, lstm) related to the FootballAction application within PaddleVideo. The tar files are then deleted after extraction.", + "details": [ + { + "comment": "This script downloads and extracts four tar files (audio, pptsm, bmn, lstm) related to the FootballAction application within PaddleVideo. The tar files are then deleted after extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/checkpoints/download.sh\":0-17", + "content": "# audio\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/audio.tar\n# pptsm\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/pptsm.tar\n# bmn\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/bmn.tar\n# lstm\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/lstm.tar\ntar -xvf audio.tar\ntar -xvf pptsm.tar\ntar -xvf bmn.tar\ntar -xvf lstm.tar\nrm -f audio.tar\nrm -f pptsm.tar\nrm -f bmn.tar\nrm -f lstm.tar" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5845a5da-55cb-4023-80e7-a48cbe9a0448.json b/docs/doc/5845a5da-55cb-4023-80e7-a48cbe9a0448.json new file mode 100644 index 000000000..71568d88b --- /dev/null +++ b/docs/doc/5845a5da-55cb-4023-80e7-a48cbe9a0448.json @@ -0,0 +1,35 @@ +{ + "summary": "The MetricsCalculator class calculates accuracy, average loss, and mean loss for multimodal video tagging models with various top-k values. It compares predictions to actual labels in a multilabel classification and logs the metrics using a logger.", + "details": [ + { + "comment": "This code imports necessary libraries and defines a class for calculating metrics. The MetricsCalculator class initializes with name, mode ('train', 'val', or 'test'), and metrics_args.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py\":0-34", + "content": "# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import unicode_literals\nfrom __future__ import print_function\nfrom __future__ import division\nimport numpy as np\nimport logging\nlogger = logging.getLogger(__name__)\nclass MetricsCalculator():\n \"\"\"\n MetricsCalculator\n \"\"\"\n def __init__(self, name, mode, metrics_args):\n \"\"\"\n init\n \"\"\"\n self.name = name\n self.mode = mode # 'train', 'val', 'test'" + }, + { + "comment": "This code initializes an AccuracyMetrics class with metrics arguments, resets the metrics values, finalizes and calculates the accuracy and average loss for each top-n value, and returns the computed metrics. The purpose is to measure the performance of a multimodal video tagging model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py\":35-67", + "content": " self.acc_dict = {}\n self.top_n_list = metrics_args.MODEL.top_n\n self.num_classes = metrics_args.MODEL.num_classes\n self.reset()\n def reset(self):\n \"\"\"\n reset\n \"\"\"\n logger.info('Resetting {} metrics...'.format(self.mode))\n for topk in self.top_n_list:\n self.acc_dict['avg_acc%d' % (topk)] = 0.0\n self.aggr_loss = 0.0\n self.aggr_batch_size = 0\n def finalize_metrics(self):\n \"\"\"finalize_metrics\n \"\"\"\n for key, value in self.acc_dict.items():\n self.acc_dict[key] = value / self.aggr_batch_size\n self.aggr_loss = self.aggr_loss / self.aggr_batch_size\n def get_computed_metrics(self):\n \"\"\"get_computed_metrics\n \"\"\"\n acc_dict = {}\n for key, value in self.acc_dict.items():\n acc_dict[key] = value / self.aggr_batch_size\n aggr_loss = self.aggr_loss / self.aggr_batch_size\n return acc_dict, aggr_loss\n def accumulate(self, loss, softmax, labels):" + }, + { + "comment": "This code snippet is part of a class that accumulates metrics for video tagging. It computes the mean loss, average accuracy for different top k values, and then logs these metrics in an informative format using a logger.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py\":68-94", + "content": " \"\"\"accumulate\n \"\"\"\n cur_batch_size = softmax.shape[0]\n # if returned loss is None for e.g. 
test, just set loss to be 0.\n if loss is None:\n cur_loss = 0.\n else:\n cur_loss = np.mean(np.array(loss)) #\n self.aggr_batch_size += cur_batch_size\n self.aggr_loss += cur_loss * cur_batch_size\n for top_k in self.top_n_list:\n self.acc_dict['avg_acc%d' %\n (top_k)] += cur_batch_size * compute_topk_accuracy(\n softmax, labels, top_k=top_k) * 100.\n return\n def finalize_and_log_out(self, info=''):\n \"\"\"finalize_and_log_out\n \"\"\"\n metrics_dict, loss = self.get_computed_metrics()\n acc_str = []\n for name, value in metrics_dict.items():\n acc_str.append('{}:{},'.format('%s' % name, '%.2f' % value))\n acc_str = '\\t'.join(acc_str)\n logger.info(info +\n '\\tLoss: {},\\t{}'.format('%.6f' % loss, '%s' % acc_str))" + }, + { + "comment": "This code computes the number of correct hits for a given top_k in multilabel classification, where it calculates the top_k predictions and checks if any of them match with the actual labels. It returns the total number of correct hits across all samples in the batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py\":95-124", + "content": " return\ndef compute_topk_correct_hits_multilabel(top_k, preds, labels):\n '''Compute the number of corret hits'''\n batch_size = preds.shape[0]\n top_k_preds = np.zeros((batch_size, 10), dtype=np.float32)\n for i in range(batch_size):\n top_k_preds[i, :] = np.argsort(-preds[i, :])[:10]\n correctness = np.zeros(batch_size, dtype=np.float32)\n for i in range(batch_size):\n correc_sum = 0\n for label_id in range(len(labels[i])):\n label_hit = labels[i][label_id]\n if label_hit == 0 or label_hit < 0.1:\n continue\n if label_id in top_k_preds[i, :top_k].astype(np.int32).tolist():\n # correc_sum += 1\n correc_sum = 1\n break\n correctness[i] = correc_sum\n correct_hits = sum(correctness)\n return correct_hits\ndef compute_topk_correct_hits(top_k, preds, labels):\n '''Compute the number of corret hits'''\n batch_size = preds.shape[0]\n top_k_preds = np.zeros((batch_size, top_k), dtype=np.float32)" + }, + { + "comment": "This code calculates the top-k accuracy for a batch of predictions and labels. It first computes the top-k predictions and then checks if the ground truth label is within the top-k predictions. The function returns the number of correct hits divided by the batch size to obtain the accuracy. 
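For reference, the single-label variant of this check (`compute_topk_correct_hits` followed by the normalisation in `compute_topk_accuracy`) boils down to a few lines of numpy; the sketch below uses made-up scores and integer labels.

```python
import numpy as np

def topk_accuracy(preds, labels, top_k=1):
    """A sample is a hit when its ground-truth index appears among the
    top_k highest-scoring classes; the hit count is then normalised
    by the batch size."""
    top_k_idx = np.argsort(-preds, axis=1)[:, :top_k]
    hits = sum(int(labels[i] in top_k_idx[i]) for i in range(preds.shape[0]))
    return hits / preds.shape[0]

preds = np.array([[0.5, 0.2, 0.3],
                  [0.1, 0.7, 0.2]])
labels = np.array([2, 1])
print(topk_accuracy(preds, labels, top_k=1))   # 0.5 -> only sample 1 hits
print(topk_accuracy(preds, labels, top_k=2))   # 1.0
```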
The main section demonstrates usage with example data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py\":125-157", + "content": " for i in range(batch_size):\n top_k_preds[i, :] = np.argsort(-preds[i, :])[:top_k]\n correctness = np.zeros(batch_size, dtype=np.int32)\n for i in range(batch_size):\n if labels[i] in top_k_preds[i, :].astype(np.int32).tolist():\n correctness[i] = 1\n correct_hits = sum(correctness)\n return correct_hits\ndef compute_topk_accuracy(softmax, labels, top_k):\n \"\"\"compute_topk_accuracy\n \"\"\"\n computed_metrics = {}\n assert labels.shape[0] == softmax.shape[0], \"Batch size mismatch.\"\n aggr_batch_size = labels.shape[0]\n # aggr_top_k_correct_hits = compute_topk_correct_hits(top_k, softmax, labels)\n aggr_top_k_correct_hits = compute_topk_correct_hits_multilabel(\n top_k, softmax, labels)\n # normalize results\n computed_metrics = \\\n float(aggr_top_k_correct_hits) / aggr_batch_size\n return computed_metrics\nif __name__ == \"__main__\":\n pred = np.array([[0.5, 0.2, 0.3, 0, 0]])\n label = np.array([[0.5, 0.5, 0, 0, 0]])\n print('pred: ', pred)\n print('label: ', label)" + }, + { + "comment": "Computing top-1 and top-5 hits for multilabel prediction using compute_topk_correct_hits_multilabel function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py\":158-159", + "content": " print('Top 1 hits', compute_topk_correct_hits_multilabel(1, pred, label))\n print('Top 5 hits', compute_topk_correct_hits_multilabel(5, pred, label))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/584f8aff-900a-4892-a9a9-3f8ed867f6b8.json b/docs/doc/584f8aff-900a-4892-a9a9-3f8ed867f6b8.json new file mode 100644 index 000000000..593697cb5 --- /dev/null +++ b/docs/doc/584f8aff-900a-4892-a9a9-3f8ed867f6b8.json @@ -0,0 +1,20 @@ +{ + "summary": "This code imports required modules, handles web serving for PaddleVideo models and includes a client to make predictions by passing encoded frames and shape to the predict method. It outputs labels and probabilities as results.", + "details": [ + { + "comment": "This code is importing necessary modules, defining a function to parse command line arguments, and setting up a parser for those arguments. The main purpose of this file seems to be handling the execution of the PaddleVideo model in a web serving environment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_rpc_client.py\":0-28", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\ntry:\n from paddle_serving_server_gpu.pipeline import PipelineClient\nexcept ImportError:\n from paddle_serving_server.pipeline import PipelineClient\nimport argparse\nfrom utils import numpy_to_base64, parse_file_paths, video_to_numpy\ndef parse_args():\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Web Serving model script\")\n parser.add_argument('-c',\n '--config'," + }, + { + "comment": "This code sets up command line arguments for the serving config file path, RPC port number, and input file/directory path. It then connects to a server at the specified port, processes the input files (decoding videos and converting ndarrays to base64 strings), and transports the data to the server for processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_rpc_client.py\":29-59", + "content": " type=str,\n default='configs/PP-TSM.yaml',\n help='serving config file path')\n parser.add_argument('-ptn',\n '--port_number',\n type=int,\n default=9993,\n help='rpc port number')\n parser.add_argument('-i',\n '--input_file',\n type=str,\n help='input file path or directory path')\n return parser.parse_args()\nif __name__ == \"__main__\":\n args = parse_args()\n client = PipelineClient()\n client.connect([f'127.0.0.1:{args.port_number}'])\n files_list = parse_file_paths(args.input_file)\n for file_path in files_list:\n # decoding video and get stacked frames as ndarray\n decoded_frames = video_to_numpy(file_path=file_path)\n # encode ndarray to base64 string for transportation.\n decoded_frames_base64 = numpy_to_base64(decoded_frames)\n # transport to server & get get results." + }, + { + "comment": "This code snippet uses a PaddleVideo client to make a prediction. It passes base64 encoded frames and their shape to the client's predict method, fetching both labels and probabilities as results. The print statement outputs these results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_rpc_client.py\":60-67", + "content": " ret = client.predict(feed_dict={\n \"frames\": decoded_frames_base64,\n \"frames_shape\": str(decoded_frames.shape)\n },\n fetch=[\"label\", \"prob\"])\n # print result\n print(ret)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5a13773a-b59f-4c5a-98b2-946896833581.json b/docs/doc/5a13773a-b59f-4c5a-98b2-946896833581.json new file mode 100644 index 000000000..fb763050c --- /dev/null +++ b/docs/doc/5a13773a-b59f-4c5a-98b2-946896833581.json @@ -0,0 +1,20 @@ +{ + "summary": "ActBERT is a multimodal pretrain task using global action info and TaNgled Transformer block (TNT) for text-object interactions. 
It outperforms state-of-the-art in video-language tasks and can be trained on HowTo100M dataset with AMP for faster training, evaluated on MSR-VTT, and found at the provided link.", + "details": [ + { + "comment": "This is an introduction to ActBERT, a multimodal pretrain task proposed by Baidu in CVPR2020. It uses global action information to analyze mutual interactions between linguistic texts and local regional objects. The method introduces TaNgled Transformer block (TNT) to encode three sources of information. ActBERT outperforms state-of-the-art in five video-and-language tasks, including text-video clip retrieval, video captioning, and action segmentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/multimodal/actbert.md\":0-24", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/multimodal/actbert.md) | English\n# ActBERT\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install paddlenlp\npython -m pip install lmdb\n```\n## Introduction\nActbert is proposed by Baidu in CVPR2020 for multimodal pretrain task. It leverage global action information to cat- alyze mutual interactions between linguistic texts and local regional objects. This method introduce a TaNgled Transformer block (TNT) to encode three sources of information, i.e., global actions, local regional objects, and linguistic descriptions. ActBERT significantly outperforms the state- of-the-art in five downstream video-and-language tasks, i.e., text-video clip retrieval, video captioning, video question answering, action segmentation, and action step localization.\n
\n
" + }, + { + "comment": "This code describes how to train ActBERT on HowTo100M dataset. It first requires downloading the pretrain-model \"bert-base-uncased\" from a specified URL and adding its path to the config file. Then, it provides the command to start training using the provided script with specific configuration and GPU allocation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/multimodal/actbert.md\":25-64", + "content": "
\n## Data\nPlease refer to Kinetics400 data download and preparation doc [HowTo100M-data](../../dataset/howto100m.md)\nPlease refer to MSR-VTT data download and preparation doc [MSR-VTT-data](../../dataset/umsrvtt.md)\n## Train\n### Train on HowTo100M\n#### download pretrain-model\nPlease download [bert-base-uncased](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams) as pretraind model:\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams\n```\nand add path to `MODEL.framework.backbone.pretrained` in config file as\uff1a\n```yaml\nMODEL:\n framework: \"ActBert\"\n backbone:\n name: \"BertForMultiModalPreTraining\"\n pretrained: your weight path\n```\n- We provide training option on small data, config file is for reference only.\n#### Start training\n- Train ActBERT on HowTo100M scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_actbert main.py --validate -c configs/multimodal/actbert/actbert.yaml" + }, + { + "comment": "This code shows how to train a model using PaddlePaddle with Automatic Mixed Precision (AMP) for faster training, evaluate it on the MSR-VTT dataset, and provides metrics such as R@1, R@5, R@10, and Median R. The ActBERT model can be found at the provided link in the reference section.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/multimodal/actbert.md\":65-97", + "content": "```\n- AMP is useful for speeding up training:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_actbert main.py --amp --validate -c configs/multimodal/actbert/actbert.yaml\n```\n## Test\n- Evaluation performs on downstream task, i.e. text-video clip retrieval on MSR-VTT dataset, test accuracy can be obtained using scripts:\n```bash\npython3.7 main.py --test -c configs/multimodal/actbert/actbert_msrvtt.yaml -w Actbert.pdparams\n```\nMetrics on MSR-VTT:\n| R@1 | R@5 | R@10 | Median R | Mean R | checkpoints |\n| :------: | :----------: | :----: | :----: | :----: | :----: |\n| 8.6 | 31.2 | 45.5 | 13.0 | 28.5 | [ActBERT.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ActBERT.pdparams) |\n## Reference\n- [ActBERT: Learning Global-Local Video-Text Representations\n](https://arxiv.org/abs/2011.07231), Linchao Zhu, Yi Yang" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5a3e94c4-95f1-4cb3-8256-434f1a293916.json b/docs/doc/5a3e94c4-95f1-4cb3-8256-434f1a293916.json new file mode 100644 index 000000000..766a5afb2 --- /dev/null +++ b/docs/doc/5a3e94c4-95f1-4cb3-8256-434f1a293916.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet provides instructions for downloading the DAVIS2017 dataset and organizing its folder structure within the PaddleVideo project directory. It also provides a link to access the file \"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\" if needed.", + "details": [ + { + "comment": "This code snippet provides instructions for downloading the DAVIS2017 dataset and organizing its folder structure within the PaddleVideo project directory. 
It also provides a link to access the file \"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\" if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/DAVIS2017.md\":0-26", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/DAVIS2017.md) | English\n# DAVIS2017 Data Preparation\n## 1.Data Download\nDownload [DAVIS2017](https://data.vision.ee.ethz.ch/csergi/share/davis/DAVIS-2017-trainval-480p.zip) and [scribbles](https://data.vision.ee.ethz.ch/csergi/share/DAVIS-Interactive/DAVIS-2017-scribbles-trainval.zip) into one folder. Please refer to [DAVIS](https://davischallenge.org/davis2017/code.html).\nIf you need the file \"DAVIS2017/ImageSets/2017/v_a_l_instances.txt\", please refer to the link [google]( https://drive.google.com/file/d/1aLPaQ_5lyAi3Lk3d2fOc_xewSrfcrQlc/view?usp=sharing)\n## 2.Folder Structure\nIn the context of the whole project (for Ma-Net only), the folder structure will look like:\n```shell\nPaddleVideo\n\u251c\u2500\u2500 configs\n\u251c\u2500\u2500 paddlevideo\n\u251c\u2500\u2500 docs\n\u251c\u2500\u2500 tools\n\u251c\u2500\u2500 data\n\u2502 \t\u2514\u2500\u2500 DAVIS2017\n\u2502 \u2502 \t\u251c\u2500\u2500 Annotations\n\u2502 \u2502 \t\u251c\u2500\u2500 ImageSets\n\u2502 \u2502 \t\u251c\u2500\u2500 JPEGImages\n\u2502 \u2502 \t\u2514\u2500\u2500 Scribbles\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5b85de12-586e-4b81-9578-d4b6c7687199.json b/docs/doc/5b85de12-586e-4b81-9578-d4b6c7687199.json new file mode 100644 index 000000000..27b976c27 --- /dev/null +++ b/docs/doc/5b85de12-586e-4b81-9578-d4b6c7687199.json @@ -0,0 +1,35 @@ +{ + "summary": "This code initializes optimizer configurations, handles weight decay, grad clip, and excludes parameters for L2 decay. It sets learning rate with LRScheduler, supports multi-precision, and creates an optimizer based on inputs.", + "details": [ + { + "comment": "This code is from the \"optimizer.py\" file in the PaddleVideo library, and it's responsible for building an optimizer. It imports necessary modules, checks compatibility with Python versions, defines a function build_optimizer that takes parameters such as configuration (cfg), learning rate scheduler (lr_scheduler), model, and optional AMP usage (use_amp). This file also includes some license information and comments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/optimizer.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport inspect\n# for python3.11\nif not hasattr(inspect, 'getargspec'):\n inspect.getargspec = inspect.getfullargspec\nfrom typing import Dict\nimport paddle\nfrom paddle.optimizer.lr import LRScheduler\nfrom paddle.regularizer import L1Decay, L2Decay\nfrom paddlevideo.utils import get_logger\ndef build_optimizer(cfg: Dict,\n lr_scheduler: LRScheduler,\n model: paddle.nn.Layer,\n use_amp: bool = False," + }, + { + "comment": "Builds an optimizer and learning rate scheduler according to the OPTIMIZER field in the configuration. The Momentum or Adam optimizers are applied to optimize the network, and L1Decay or L2Decay regularizers are used to avoid overfitting. The function takes optimizer configuration (cfg) and learning rate scheduler (lr_scheduler) as arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/optimizer.py\":31-62", + "content": " amp_level: str = None) -> paddle.optimizer.Optimizer:\n \"\"\"Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration.\n In configuration:\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay: 0.001\n or\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay:\n name: \"L1\"\n value: 0.001\n Momentum optimizer will be applied to optimize network and L1Decay regularizer will be applied to avoid overfit.\n OPTIMIZER:\n name: Adam\n weight_decay:\n name: \"L2\"\n value: 0.001\n Adam optimizer will be applied to optimize network and L2Decay regularizer will applied to avoid overfit.\n Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details.\n Args:\n cfg (Dict): optimizer configuration.\n lr_scheduler (LRScheduler): learning rate scheduler." + }, + { + "comment": "This code defines a function that creates an optimizer for a given model. It accepts parameters such as the model, whether to use AMP or not, and the AMP level. The function also handles weight decay by checking if a 'weight_decay' configuration is present and applying the appropriate settings (L1 or L2 decay).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/optimizer.py\":63-84", + "content": " model (paddle.nn.Layer, optional): model which contains parameters to be optimized. Defaults to None.\n use_amp (bool, optional): Whether use amp. Defaults to False.\n amp_level (str, optional): amp level when amp is enabled. 
Defaults to None.\n Returns:\n paddle.optimizer.Optimizer: an optimizer for the input model.\n \"\"\"\n logger = get_logger(\"paddlevideo\")\n cfg_copy = cfg.copy()\n # NOTE: check none and illegal cfg!!!\n opt_name = cfg_copy.pop('name')\n # deal with weight decay\n if cfg_copy.get('weight_decay'):\n if isinstance(cfg_copy.get('weight_decay'),\n float): # just an float factor\n cfg_copy['weight_decay'] = cfg_copy.get('weight_decay')\n elif 'L1' in cfg_copy.get('weight_decay').get(\n 'name').upper(): # specify L2 wd and it's float factor\n cfg_copy['weight_decay'] = L1Decay(\n cfg_copy.get('weight_decay').get('value'))\n elif 'L2' in cfg_copy.get('weight_decay').get(" + }, + { + "comment": "This code is initializing the configuration for an optimizer, handling L1 and L2 weight decay, grad clip, and no_weight_decay parameters. If 'name' is specified for L1 wd, it sets the 'weight_decay' to the float factor. For grad clip, if a float value is given, it is set as the 'grad_clip', or if 'global' in name, creates a ClipGradByGlobalNorm object. If 'no_weight_decay_name' is specified, it extracts the list of parameters to exclude from L2 decay.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/optimizer.py\":85-108", + "content": " 'name').upper(): # specify L1 wd and it's float factor\n cfg_copy['weight_decay'] = L2Decay(\n cfg_copy.get('weight_decay').get('value'))\n else:\n raise ValueError\n # deal with grad clip\n if cfg_copy.get('grad_clip'):\n if isinstance(cfg_copy.get('grad_clip'), float):\n cfg_copy['grad_clip'] = cfg_copy.get('grad_clip').get('value')\n elif 'global' in cfg_copy.get('grad_clip').get('name').lower():\n cfg_copy['grad_clip'] = paddle.nn.ClipGradByGlobalNorm(\n cfg_copy.get('grad_clip').get('value'))\n else:\n raise ValueError\n # Set for optimizers that cannot be applied to l2decay, i.e. AdamW\n if cfg_copy.get('no_weight_decay_name'):\n no_weight_decay_name = cfg_copy.pop('no_weight_decay_name')\n no_weight_decay_name_list = no_weight_decay_name.split(' ')\n # NOTE: use param.name not name\n no_weight_decay_param_list = [\n param.name for name, param in model.named_parameters()" + }, + { + "comment": "This code checks if there are any parameters without weight decay, and sets the learning rate using a LRScheduler. It also handles multi-precision for optimizer when use_amp is True and amp_level is 'O2'. 
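Putting the pieces above together, here is a minimal usage sketch of `build_optimizer` (assuming PaddleVideo is importable): the `OPTIMIZER` dict mirrors the docstring example, and a `learning_rate` entry is included only because `build_optimizer` pops that key (in the real configs it is consumed when the LR scheduler is built).

```python
import paddle
from paddlevideo.solver.optimizer import build_optimizer

cfg = {
    "name": "Momentum",
    "momentum": 0.9,
    "weight_decay": {"name": "L2", "value": 0.001},
    "learning_rate": 0.01,   # popped inside build_optimizer, see above
}
model = paddle.nn.Linear(8, 4)
lr = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.01, T_max=10)

opt = build_optimizer(cfg, lr, model=model)
print(type(opt).__name__)    # Momentum, with L2Decay regularization attached
```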
The code updates the optimizer_setting with no_weight_decay_param_list and \"multi_precision\" if required, logging relevant information throughout.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/optimizer.py\":109-132", + "content": " if any(key_word in name for key_word in no_weight_decay_name_list)\n ] # get the full param name of no weight decay\n _apply_decay_param_fun = lambda name: name not in no_weight_decay_param_list\n cfg_copy['apply_decay_param_fun'] = _apply_decay_param_fun\n logger.info(\n f\"No weight Decay list :({len(no_weight_decay_param_list)})\",\n no_weight_decay_param_list)\n cfg_copy.pop('learning_rate')\n # set multi_precision\n optimizer_setting = {\n 'learning_rate': lr_scheduler,\n 'parameters': model.parameters(),\n **cfg_copy\n }\n optimizer_init_args = inspect.getargspec(\n getattr(paddle.optimizer, opt_name).__init__).args\n if use_amp and amp_level == \"O2\" and \"multi_precision\" in optimizer_init_args:\n # support \"multi_precision\" arg in optimizer's __init__ function.\n optimizer_setting.update({\"multi_precision\": True})\n logger.info(\n \"Set multi_precision=True for optimizer when use_amp=True and amp_level='O2'\"" + }, + { + "comment": "This code is creating and returning an optimizer based on the given \"opt_name\" and \"optimizer_setting\". The optimizer type is determined by using \"paddle.optimizer[opt_name]\" and the parameters are passed through **optimizer_settings** to initialize the optimizer object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/optimizer.py\":133-135", + "content": " )\n return getattr(paddle.optimizer, opt_name)(**optimizer_setting)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5bd04d81-0bdb-4e0d-9f02-626dd02a7f21.json b/docs/doc/5bd04d81-0bdb-4e0d-9f02-626dd02a7f21.json new file mode 100644 index 000000000..85519cdb9 --- /dev/null +++ b/docs/doc/5bd04d81-0bdb-4e0d-9f02-626dd02a7f21.json @@ -0,0 +1,65 @@ +{ + "summary": "The code sets command-line arguments for Paddle Video tool, configures the predictor, supports GPU/NPU usage, and utilizes TensorRT engine with YOWO model. It creates a directory, preprocesses data, performs inference, post-processes output, benchmarks, and guides users to install \"auto_log\".", + "details": [ + { + "comment": "Code snippet is an import-heavy function definition. It begins with a lengthy comment mentioning the copyright and license details, followed by multiple imports from various modules. The only executable code present is the \"parse_args\" function definition. This function uses argparse to create a parser for general parameters of PaddleVideo Inference model script.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nfrom os import path as osp\nimport paddle\nfrom paddle import inference\nfrom paddle.inference import Config, create_predictor\nfrom utils import build_inference_helper\nfrom paddlevideo.utils import get_config\ndef parse_args():\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")\n parser.add_argument(" + }, + { + "comment": "The code defines command-line arguments for a Paddle Video tool. It allows the user to specify the config file, input file, model and parameters files, batch size, and GPU/XPU usage. The `str2bool` type converts string inputs to boolean values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":32-58", + "content": " '-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument(\n '-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument(\"-i\", \"--input_file\", type=str, help=\"input file path\")\n parser.add_argument(\n \"--time_test_file\",\n type=str2bool,\n default=False,\n help=\"whether input time test file\")\n parser.add_argument(\"--model_file\", type=str)\n parser.add_argument(\"--params_file\", type=str)\n # params for paddle predict\n parser.add_argument(\"-b\", \"--batch_size\", type=int, default=1)\n parser.add_argument(\"--use_gpu\", type=str2bool, default=True)\n parser.add_argument(\"--use_xpu\", type=str2bool, default=False)\n parser.add_argument(\"--use_npu\", type=str2bool, default=False)\n parser.add_argument(\"--precision\", type=str, default=\"fp32\")\n parser.add_argument(\"--ir_optim\", type=str2bool, default=True)" + }, + { + "comment": "This code is parsing arguments to configure a Paddle video predictor. It adds various arguments for use_tensorrt, gpu_mem, enable_benchmark, enable_mkldnn, cpu_threads, and disable_glog. The create_paddle_predictor function creates a config object with the provided arguments, enabling GPU or NPU usage if specified, and disabling GPU if not. 
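The flag handling described above hinges on the small `str2bool` helper quoted in the snippet, since argparse has no built-in boolean type. A self-contained sketch of the same pattern (argument names are examples):

```python
# Sketch of the boolean-flag pattern used by the script: strings such as
# "True"/"true"/"1" are mapped to bool by hand before argparse stores them.
import argparse


def str2bool(v: str) -> bool:
    return v.lower() in ("true", "t", "1")


parser = argparse.ArgumentParser("demo")
parser.add_argument("--use_gpu", type=str2bool, default=True)
parser.add_argument("--enable_mkldnn", type=str2bool, default=False)

args = parser.parse_args(["--use_gpu", "False", "--enable_mkldnn", "1"])
print(args.use_gpu, args.enable_mkldnn)  # False True
```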
It also sets the number of CPU threads if provided, and enables MKLDNN if enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":59-83", + "content": " parser.add_argument(\"--use_tensorrt\", type=str2bool, default=False)\n parser.add_argument(\"--gpu_mem\", type=int, default=8000)\n parser.add_argument(\"--enable_benchmark\", type=str2bool, default=False)\n parser.add_argument(\"--enable_mkldnn\", type=str2bool, default=False)\n parser.add_argument(\"--cpu_threads\", type=int, default=None)\n parser.add_argument(\"--disable_glog\", type=str2bool, default=False)\n # parser.add_argument(\"--hubserving\", type=str2bool, default=False) #TODO\n return parser.parse_args()\ndef create_paddle_predictor(args, cfg):\n config = Config(args.model_file, args.params_file)\n if args.use_gpu:\n config.enable_use_gpu(args.gpu_mem, 0)\n elif args.use_npu:\n config.enable_npu()\n elif args.use_xpu:\n config.enable_xpu()\n else:\n config.disable_gpu()\n if args.cpu_threads:\n config.set_cpu_math_library_num_threads(args.cpu_threads)\n if args.enable_mkldnn:\n # cache 10 different shapes for mkldnn to avoid memory leak" + }, + { + "comment": "The code configures the PaddleVideo model for inference by setting the MKLDNN cache capacity, enabling MKLDNN and optionally BFloat16, disabling GLOG info, switching IR optim, and handling precision and batch size when TensorRT is enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":84-106", + "content": " config.set_mkldnn_cache_capacity(10)\n config.enable_mkldnn()\n if args.precision == \"fp16\":\n config.enable_mkldnn_bfloat16()\n # config.disable_glog_info()\n config.switch_ir_optim(args.ir_optim) # default true\n if args.use_tensorrt:\n # choose precision\n if args.precision == \"fp16\":\n precision = inference.PrecisionType.Half\n elif args.precision == \"int8\":\n precision = inference.PrecisionType.Int8\n else:\n precision = inference.PrecisionType.Float32\n # calculate real max batch size during inference when tenrotRT enabled\n max_batch_size = args.batch_size\n if 'num_seg' in cfg.INFERENCE:\n # num_seg: number of segments when extracting frames.\n # seg_len: number of frames extracted within a segment, default to 1.\n # num_views: the number of video frame groups obtained by cropping and flipping,\n # uniformcrop=3, tencrop=10, centercrop=1." 
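The max-batch-size arithmetic described here can be illustrated with a short sketch; the substring-to-views mapping is copied from the branching quoted below, everything else (helper name, example model names and numbers) is illustrative only.

```python
# Illustrative helper (not part of the repo): the batch that actually reaches
# the TensorRT engine is the user batch size multiplied by the number of
# segments, frames per segment, and crop/flip views.
NUM_VIEWS = {"tsm": 1, "tsn": 10, "timesformer": 3, "videoswin": 3, "tokenshift": 3}


def max_trt_batch(model_name: str, batch_size: int, num_seg: int, seg_len: int = 1) -> int:
    num_views = 1
    for key, views in NUM_VIEWS.items():
        if key in model_name.lower():
            num_views = views
            break
    return batch_size * num_views * num_seg * seg_len


print(max_trt_batch("ppTSN", batch_size=1, num_seg=25))  # 1 * 10 * 25 * 1 = 250
```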
+ }, + { + "comment": "The code sets the number of segments and views based on the model name, calculates the maximum batch size, enables TensorRT engine with specified precision mode, enables memory optimization, disables glog if instructed to do so, and potentially deletes a pass for ST-GCN TensorRT case usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":107-133", + "content": " num_seg = cfg.INFERENCE.num_seg\n seg_len = cfg.INFERENCE.get('seg_len', 1)\n num_views = 1\n if 'tsm' in cfg.model_name.lower():\n num_views = 1 # CenterCrop\n elif 'tsn' in cfg.model_name.lower():\n num_views = 10 # TenCrop\n elif 'timesformer' in cfg.model_name.lower():\n num_views = 3 # UniformCrop\n elif 'videoswin' in cfg.model_name.lower():\n num_views = 3 # UniformCrop\n elif 'tokenshift' in cfg.model_name.lower():\n num_views = 3 # UniformCrop\n max_batch_size = args.batch_size * num_views * num_seg * seg_len\n config.enable_tensorrt_engine(\n precision_mode=precision, max_batch_size=max_batch_size)\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n # disable glog\n if args.disable_glog:\n config.disable_glog_info()\n # for ST-GCN tensorRT case usage\n # config.delete_pass(\"shuffle_channel_detect_pass\")" + }, + { + "comment": "The code is implementing a main function for predicting using Paddle Inference model. It first parses arguments from command-line, then retrieves configuration and overrides for the inference task, prints an informative message, builds the inference helper, and creates paddle predictor with the given arguments and configuration. After this, it gets input and output names, initializes empty lists for input and output tensors, and iterates through input names to populate these lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":135-172", + "content": " predictor = create_predictor(config)\n return config, predictor\ndef parse_file_paths(input_path: str) -> list:\n if osp.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))\n ]\n files = [osp.join(input_path, file) for file in files]\n return files\ndef main():\n \"\"\"predict using paddle inference model\n \"\"\"\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override, show=False)\n model_name = cfg.model_name\n print(f\"Inference model({model_name})...\")\n InferenceHelper = build_inference_helper(cfg.INFERENCE)\n inference_config, predictor = create_paddle_predictor(args, cfg)\n # get input_tensor and output_tensor\n input_names = predictor.get_input_names()\n output_names = predictor.get_output_names()\n input_tensor_list = []\n output_tensor_list = []\n for item in input_names:" + }, + { + "comment": "The code is processing input files for a specific model and running inference. For certain models, it preprocesses the input files using InferenceHelper and then runs inference by setting input tensors and calling predictor.run(). 
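The copy-in/run/copy-out cycle described above follows the standard Paddle Inference pattern; the sketch below shows the same cycle in isolation. The model and parameter file names and the input shape are placeholders, not files shipped with the repo.

```python
# Sketch of the zero-copy inference cycle: copy each batched array into its
# input handle, run the predictor, then fetch every output back to the CPU.
import numpy as np
from paddle.inference import Config, create_predictor

config = Config("inference.pdmodel", "inference.pdiparams")  # placeholder paths
config.disable_gpu()
config.switch_use_feed_fetch_ops(False)
predictor = create_predictor(config)

inputs = [np.random.rand(1, 8, 3, 224, 224).astype("float32")]  # example shape

for name, arr in zip(predictor.get_input_names(), inputs):
    predictor.get_input_handle(name).copy_from_cpu(arr)
predictor.run()
outputs = [predictor.get_output_handle(n).copy_to_cpu()
           for n in predictor.get_output_names()]
print([o.shape for o in outputs])
```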
Finally, if the model is AVA_SlowFast_FastRcnn, it post-processes the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":173-200", + "content": " input_tensor_list.append(predictor.get_input_handle(item))\n for item in output_names:\n output_tensor_list.append(predictor.get_output_handle(item))\n # get the absolute file path(s) to be processed\n if model_name in [\"MSTCN\", \"ASRF\"]:\n files = InferenceHelper.get_process_file(args.input_file)\n else:\n files = parse_file_paths(args.input_file)\n if model_name == 'TransNetV2':\n for file in files:\n inputs = InferenceHelper.preprocess(file)\n outputs = []\n for input in inputs:\n # Run inference\n for i in range(len(input_tensor_list)):\n input_tensor_list[i].copy_from_cpu(input)\n predictor.run()\n output = []\n for j in range(len(output_tensor_list)):\n output.append(output_tensor_list[j].copy_to_cpu())\n outputs.append(output)\n # Post process output\n InferenceHelper.postprocess(outputs)\n elif model_name == 'AVA_SlowFast_FastRcnn':" + }, + { + "comment": "Iterates through each video file in the list. \nPreprocesses the input data for a model. \nRuns inference for each input, copying CPU memory. \nStores the output of each run. \nPost processes the outputs using InferenceHelper function. For YOWO model, also does filename operations and saves results to specified directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":201-226", + "content": " for file in files: # for videos\n inputs = InferenceHelper.preprocess(file)\n outputs = []\n for input in inputs:\n # Run inference\n input_len = len(input_tensor_list)\n for i in range(input_len):\n if type(input[i]) == paddle.Tensor:\n input_tmp = input[i].numpy()\n else:\n input_tmp = input[i]\n input_tensor_list[i].copy_from_cpu(input_tmp)\n predictor.run()\n output = []\n for j in range(len(output_tensor_list)):\n output.append(output_tensor_list[j].copy_to_cpu())\n outputs.append(output)\n # Post process output\n InferenceHelper.postprocess(outputs)\n elif model_name == 'YOWO':\n for file in files: # for videos\n (_, filename) = os.path.split(file)\n (filename, _) = os.path.splitext(filename)\n save_dir = osp.join('inference', 'YOWO_infer')" + }, + { + "comment": "This code creates a directory and checks if the save path exists, then preprocesses input data for inference. It runs inference using a predictor, post-processes the output, and if benchmarking is enabled, it instantiates auto log.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":227-250", + "content": " if not osp.exists('inference'):\n os.mkdir('inference')\n if not osp.exists(save_dir):\n os.mkdir(save_dir)\n save_path = osp.join(save_dir, filename)\n if not osp.exists(save_path):\n os.mkdir(save_path)\n inputs, frames = InferenceHelper.preprocess(file)\n for idx, input in enumerate(inputs):\n # Run inference\n outputs = []\n input_len = len(input_tensor_list)\n for i in range(input_len):\n input_tensor_list[i].copy_from_cpu(input[i])\n predictor.run()\n for j in range(len(output_tensor_list)):\n outputs.append(output_tensor_list[j].copy_to_cpu())\n # Post process output\n InferenceHelper.postprocess(outputs, frames[idx], osp.join(save_path, str(idx).zfill(3)))\n else:\n if args.enable_benchmark:\n num_warmup = 3\n # instantiate auto log" + }, + { + "comment": "This code snippet attempts to import the \"auto_log\" package and if it fails, provides instructions on how to install it. 
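The YOWO branch builds `inference/YOWO_infer/<video>` through repeated exists/mkdir checks; an equivalent, more compact sketch (the video name is an example, this is not a change to the repo code):

```python
# The nested exists()/mkdir() checks can be collapsed into a single call.
import os
from os import path as osp

save_path = osp.join("inference", "YOWO_infer", "video_0001")  # example name
os.makedirs(save_path, exist_ok=True)  # creates all missing parents at once
print(osp.isdir(save_path))
```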
Then, it creates an instance of AutoLogger, configuring various parameters like model name, batch size, data shape, etc., and specifies which timing metrics to track during inference. If no time test file is provided, the code sets the number of test videos to 15 and assigns all input files to these tests.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":251-274", + "content": " try:\n import auto_log\n except ImportError as e:\n print(f\"{e}, [git+https://github.com/LDOUBLEV/AutoLog] \"\n f\"package and it's dependencies is required for \"\n f\"python-inference when enable_benchmark=True.\")\n pid = os.getpid()\n autolog = auto_log.AutoLogger(\n model_name=cfg.model_name,\n model_precision=args.precision,\n batch_size=args.batch_size,\n data_shape=\"dynamic\",\n save_path=\"./output/auto_log.lpg\",\n inference_config=inference_config,\n pids=pid,\n process_name=None,\n gpu_ids=0 if args.use_gpu else None,\n time_keys=[\n 'preprocess_time', 'inference_time', 'postprocess_time'\n ],\n warmup=num_warmup)\n if not args.time_test_file:\n test_video_num = 15\n files = [args.input_file for _ in range(test_video_num)]" + }, + { + "comment": "This code reads input files, processes them in batches, runs inference on a model, and collects output. It also supports benchmarking and logs processing times for each step.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":275-305", + "content": " else:\n f_input = open(args.input_file, 'r')\n files = [i.strip() for i in f_input.readlines()]\n test_video_num = len(files)\n f_input.close()\n # Inferencing process\n batch_num = args.batch_size\n for st_idx in range(0, len(files), batch_num):\n ed_idx = min(st_idx + batch_num, len(files))\n # auto log start\n if args.enable_benchmark:\n autolog.times.start()\n # Pre process batched input\n batched_inputs = InferenceHelper.preprocess_batch(\n files[st_idx:ed_idx])\n # get pre process time cost\n if args.enable_benchmark:\n autolog.times.stamp()\n # run inference\n for i in range(len(input_tensor_list)):\n input_tensor_list[i].copy_from_cpu(batched_inputs[i])\n predictor.run()\n batched_outputs = []\n for j in range(len(output_tensor_list)):\n batched_outputs.append(output_tensor_list[j].copy_to_cpu())" + }, + { + "comment": "Enables benchmarking for inference time, processes outputs and records post-processing time, then reports the benchmark log if enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/predict.py\":307-326", + "content": " # get inference process time cost\n if args.enable_benchmark:\n autolog.times.stamp()\n InferenceHelper.postprocess(batched_outputs,\n not args.enable_benchmark)\n # get post process time cost\n if args.enable_benchmark:\n autolog.times.end(stamp=True)\n # time.sleep(0.01) # sleep for T4 GPU\n # report benchmark log if enabled\n if args.enable_benchmark:\n autolog.report()\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5c0275bb-fc51-4c0c-8d57-15e6c45e913c.json b/docs/doc/5c0275bb-fc51-4c0c-8d57-15e6c45e913c.json new file mode 100644 index 000000000..5733b19b5 --- /dev/null +++ b/docs/doc/5c0275bb-fc51-4c0c-8d57-15e6c45e913c.json @@ -0,0 +1,20 @@ +{ + "summary": "The Python code defines a \"Ucf24Dataset\" class for loading and transforming UCF24 dataset in PaddleVideo, with methods to prepare data for training/validation and testing. 
It extracts relevant information like image paths, labels, and frame indices, and converts image path names from 'jpg' to 'txt'.", + "details": [ + { + "comment": "This code is a Python class for the UCF24 dataset used in PaddleVideo, which loads raw videos and applies specified transformations on them. It is registered within the registry module and utilizes other modules such as BaseDataset and gets logger from utils. The license information and import statements are also included.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf24_dataset.py\":0-29", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass UCF24Dataset(BaseDataset):\n \"\"\"Dataset for YOWO\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates\n a sample video with the filepath and label, which are split with a whitesapce." + }, + { + "comment": "This code defines a dataset class, \"Ucf24Dataset\", which loads video information from an index file and prepares data for training or validation. It takes a file path, pipeline, and additional keyword arguments. The load_file method reads the index file to extract video information, such as filenames, while the prepare_train method prepares data for training/validation given an index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf24_dataset.py\":30-58", + "content": " Example of a inde file:\n .. code-block:: txt\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, **kwargs):\n self.num_retries = num_retries\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n lines = fin.readlines()\n for line in lines:\n line = line.strip() # 'data/ucf24/labels/class_name/video_name/key_frame.txt'\n filename = line.replace('txt', 'jpg').replace(\n 'labels', 'rgb-images') # key frame path\n info.append(dict(filename=filename))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])" + }, + { + "comment": "Code from \"PaddleVideo/paddlevideo/loader/dataset/ucf24_dataset.py\" prepares data for testing by copying the info at index idx, applying a pipeline function to it and extracting relevant information like image paths and labels. The code also converts image path names from 'jpg' to 'txt'. 
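The label-path to key-frame-path mapping and the frame-index construction described here can be traced with a tiny sketch; the sample path is made up, but the string replacements and split indices mirror the quoted dataset code.

```python
# An index line points at a label .txt; the dataset swaps it for the matching
# RGB key frame, then rebuilds a frame index from path components.
line = "data/ucf24/labels/Basketball/v_Basketball_g01_c01/00010.txt"

filename = line.replace("txt", "jpg").replace("labels", "rgb-images")
print(filename)  # data/ucf24/rgb-images/Basketball/v_Basketball_g01_c01/00010.jpg

im_split = filename.replace("jpg", "txt").split("/")
frame_index = im_split[3] + "_" + im_split[4] + "_" + im_split[5]
print(frame_index)  # Basketball_v_Basketball_g01_c01_00010.txt
```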
Finally, it returns images, labels, and frame indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ucf24_dataset.py\":59-75", + "content": " results = self.pipeline(results)\n im_path = results['filename']\n im_path = im_path.replace('jpg', 'txt')\n im_split = im_path.split('/')\n frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]\n return results['imgs'], np.array([results['labels']]), frame_index\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for test given the index.\"\"\"\n # Try to catch Exception caused by reading corrupted video file\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n im_path = results['filename']\n im_path = im_path.replace('jpg', 'txt')\n im_split = im_path.split('/')\n frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]\n return results['imgs'], np.array([results['labels']]), frame_index" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5c03e93c-1a74-4df0-8602-f5ce3df30ca0.json b/docs/doc/5c03e93c-1a74-4df0-8602-f5ce3df30ca0.json new file mode 100644 index 000000000..eb71da8ae --- /dev/null +++ b/docs/doc/5c03e93c-1a74-4df0-8602-f5ce3df30ca0.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports the MaxIoUAssignerAVA class from the max_iou_assigner_ava module and adds it to the __all__ list, making it importable by default. The comment at the top of the file contains license information and copyright notices.", + "details": [ + { + "comment": "This code imports the MaxIoUAssignerAVA class from the max_iou_assigner_ava module and adds it to the __all__ list, making it importable by default. The comment at the top of the file contains license information and copyright notices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/__init__.py\":0-16", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .max_iou_assigner_ava import MaxIoUAssignerAVA\n__all__ = ['MaxIoUAssignerAVA']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5c276339-0250-4b32-8f3e-e5eec13f32c4.json b/docs/doc/5c276339-0250-4b32-8f3e-e5eec13f32c4.json new file mode 100644 index 000000000..ae5dac844 --- /dev/null +++ b/docs/doc/5c276339-0250-4b32-8f3e-e5eec13f32c4.json @@ -0,0 +1,20 @@ +{ + "summary": "The code installs PaddleNLP, trains T2VLAD on MSRVTT dataset, and demonstrates retrieval performance with metrics R@1, R@5, R@10, and median rank at 26.1, 54.7, 68.1, and 4 respectively. 
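The R@K and median-rank numbers quoted for T2VLAD are standard retrieval metrics; the following is a generic illustration of how they are typically computed from a text-to-video similarity matrix, not the T2VLAD evaluation code itself.

```python
# Generic R@K / median rank computation for a square similarity matrix where
# the ground-truth match of query i is item i.
import numpy as np

rng = np.random.default_rng(0)
sims = rng.normal(size=(100, 100))
sims[np.arange(100), np.arange(100)] += 2.0  # make the diagonal plausible

order = np.argsort(-sims, axis=1)                                 # best match first
ranks = np.argmax(order == np.arange(100)[:, None], axis=1) + 1   # 1-based rank of the true match

for k in (1, 5, 10):
    print(f"R@{k}: {np.mean(ranks <= k) * 100:.1f}")
print("Median R:", np.median(ranks))
```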
Based on the paper \"T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\" by Xiaohan Wang et al.", + "details": [ + { + "comment": "Code snippet for installing additional dependencies:\n```bash\npython -m pip install paddlenlp\n```\nThis code is for installing Paddlepaddle Natural Language Processing (NLP) library, which is a required dependency for running T2VLAD application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/README_en.md\":0-30", + "content": "[\u7b80\u4f53\u4e2d\u6587](./README.md) | English\n# T2VLAD\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install paddlenlp\n```\n## Introduction\nT2VLAD is proposed by Baidu in CVPR2021 for text-video retrieval. Text-video retrieval is a challenging task that aims to search relevant video contents based on natural language descriptions. The key to this problem is to measure text- video similarities in a joint embedding space. T2VLAD designs an efficient global-local alignment method. This model achieves consistent improvements on three standard text-video retrieval benchmarks and outperform the state- of-the-art by a clear margin.\n

\n## Data\nPlease refer to MSR-VTT data download and preparation doc [MSR-VTT data](../../docs/en/dataset/msrvtt.md)\n## Train\n### Train on MSR-VTT" + }, + { + "comment": "This code outlines the steps to train and test T2VLAD on the MSRVTT dataset. It requires downloading data, moving it to a specified folder, and executing training and testing scripts with appropriate configuration files. The Ranger optimizer is not currently supported, so AdamW should be used instead. Test accuracy results are provided for text-to-video clip retrieval on the MSRVTT dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/README_en.md\":31-58", + "content": "Download data then move to `data/MSRVTT` folder.\n#### Start training\n- Train T2VLAD on MSRVTT scripts:\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 train.py --config ./configs/msrvtt_transformers.json\n```\nT2VLAD uses the Ranger optimizer during training. We haven't supported the implementation of Ranger optimizer, for now, the AdamW optimizer can be used to complete the training.\n## Test\n- Evaluation performs on downstream task, i.e. text-video clip retrieval on MSR-VTT dataset, test accuracy can be obtained using scripts:\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 test.py --config ./configs/msrvtt_transformers.json --resume ./T2VLAD_msrvtt.pdparams\n```\nAccuracy on MSR-VTT:\nText $\\rightarrow$ Video\n| R@1 | R@5 | R@10 | Median R | checkpoints |\n| :--: | :--: | :--: | :------: | :----------------------------------------------------------: |\n| 29.5 | 59.0 | 70.1 | 4 | [T2VLAD.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/T2VLAD_msrvtt.pdparams) |" + }, + { + "comment": "This code shows the retrieval performance of a Text-Video Retrieval model, with metrics R@1, R@5, R@10, and median rank at 26.1, 54.7, 68.1, and 4 respectively. The reference is the paper \"T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\" by Xiaohan Wang et al.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/README_en.md\":60-68", + "content": "Video $\\rightarrow$ Text\n| R@1 | R@5 | R@10 | Median R |\n| :--: | :--: | :--: | :------: |\n| 26.1 | 54.7 | 68.1 | 4 |\n## Reference\n- [T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\n](https://arxiv.org/pdf/2104.10054.pdf), Xiaohan Wang, Linchao Zhu, Yi Yang" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5c3df183-8b2a-4db7-af6f-5c152335213a.json b/docs/doc/5c3df183-8b2a-4db7-af6f-5c152335213a.json new file mode 100644 index 000000000..d0a4e17f6 --- /dev/null +++ b/docs/doc/5c3df183-8b2a-4db7-af6f-5c152335213a.json @@ -0,0 +1,25 @@ +{ + "summary": "The code imports libraries, defines AVAMetric class for PaddleVideo and prepares metrics for video object detection. It also includes methods for logging during iterations, setting dataset info, and calculating final results.", + "details": [ + { + "comment": "This code imports necessary libraries and registers a class called AVAMetric as a metric for PaddleVideo. It initializes the AVAMetric with specified data size, batch size, and file path. The class inherits from BaseMetric and has an __init__ method which sets instance variables for data_size, batch_size, file_path, result_filename, and other properties.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_metric.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nfrom collections import OrderedDict\nfrom paddlevideo.utils import get_logger, load, log_batch, AverageMeter\nfrom .registry import METRIC\nfrom .base import BaseMetric\nimport time\nfrom datetime import datetime\nfrom .ava_utils import ava_evaluate_results\nlogger = get_logger(\"paddlevideo\")\n\"\"\" An example for metrics class.\n MultiCropMetric for slowfast.\n\"\"\"\n@METRIC.register\nclass AVAMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n file_path," + }, + { + "comment": "The code initializes a class for preparing metrics in video object detection. It takes various parameters like file path, exclude file, label file, custom classes, and log interval for initialization. The class uses AverageMeter to store metrics such as loss, recall@thr=0.5, prec@thr=0.5, recall@top3, prec@top3, recall@top5, prec@top5, mAP@0.5IOU, batch time, and reader time.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_metric.py\":34-59", + "content": " exclude_file,\n label_file,\n custom_classes,\n log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.file_path = file_path\n self.exclude_file = exclude_file\n self.label_file = label_file\n self.custom_classes = custom_classes\n self.results = []\n record_list = [\n (\"loss\", AverageMeter('loss', '7.5f')),\n (\"recall@thr=0.5\", AverageMeter(\"recall@thr=0.5\", '.5f')),\n (\"prec@thr=0.5\", AverageMeter(\"prec@thr=0.5\", '.5f')),\n (\"recall@top3\", AverageMeter(\"recall@top3\", '.5f')),\n (\"prec@top3\", AverageMeter(\"prec@top3\", '.5f')),\n (\"recall@top5\", AverageMeter(\"recall@top5\", '.5f')),\n (\"prec@top5\", AverageMeter(\"prec@top5\", '.5f')),\n (\"mAP@0.5IOU\", AverageMeter(\"mAP@0.5IOU\", '.5f')),\n (\"batch_time\", AverageMeter('batch_cost', '.5f')),\n (\"reader_time\", AverageMeter('reader_cost', '.5f'))," + }, + { + "comment": "This code defines a class for metrics calculation and logging, with methods for updating metrics during iterations, setting dataset information, and accumulating final results. The update method extends the results list, updates batch time, logs batch time, and logs instance per second (ips). 
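The throughput line logged in `update` is just batch size divided by the latest measured batch time. The stand-in below only shows that bookkeeping idea; the real `AverageMeter` imported from `paddlevideo.utils` takes a name and a format string and is not reproduced here.

```python
# Simplified stand-in for the AverageMeter bookkeeping described above.
class AverageMeter:
    def __init__(self):
        self.val = self.sum = self.count = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)


batch_time = AverageMeter()
batch_time.update(0.125)  # seconds measured for one test batch
batch_size = 8
print(f"ips: {batch_size / batch_time.val:.5f} instance/sec.")
```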
The accumulate method calculates final test results using ava_evaluate_results function and updates the record list with the final values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_metric.py\":60-89", + "content": " ]\n self.record_list = OrderedDict(record_list)\n self.tic = time.time()\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n self.results.extend(outputs)\n self.record_list['batch_time'].update(time.time() - self.tic)\n tic = time.time()\n ips = \"ips: {:.5f} instance/sec.\".format(\n self.batch_size / self.record_list[\"batch_time\"].val)\n log_batch(self.record_list, batch_id, 0, 0, \"test\", ips)\n def set_dataset_info(self, info, dataset_len):\n self.info = info\n self.dataset_len = dataset_len\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n test_res = ava_evaluate_results(self.info, self.dataset_len,\n self.results, None, self.label_file,\n self.file_path, self.exclude_file)\n for name, value in test_res.items():\n self.record_list[name].update(value, self.batch_size)" + }, + { + "comment": "The code snippet is returning the record list from a class method. It seems that this method might have been responsible for recording or storing some data in the `record_list` attribute of the class instance, and now it's returning that data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_metric.py\":91-91", + "content": " return self.record_list" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5c655b59-6f3c-4a98-974a-5487f8e4108a.json b/docs/doc/5c655b59-6f3c-4a98-974a-5487f8e4108a.json new file mode 100644 index 000000000..456aa5ea4 --- /dev/null +++ b/docs/doc/5c655b59-6f3c-4a98-974a-5487f8e4108a.json @@ -0,0 +1,70 @@ +{ + "summary": "The code creates a function for label change detection in video segmentation, computes precision, recall, and F1 score, uses Levenstein distance, and evaluates ground truth and predicted actions.", + "details": [ + { + "comment": "This code is a part of PaddleVideo library and defines a function get_labels_scores_start_end_time that takes input, frame-wise labels, actions dictionary, and optional background class. It returns labels, starts, ends, and scores based on the input and labels. The function also keeps track of the boundary score pointer and the last label.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":0-34", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport argparse\nimport pandas as pd\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\ndef get_labels_scores_start_end_time(input_np,\n frame_wise_labels,\n actions_dict,\n bg_class=[\"background\", \"None\"]):\n labels = []\n starts = []\n ends = []\n scores = []\n boundary_score_ptr = 0\n last_label = frame_wise_labels[0]" + }, + { + "comment": "This code segment is a part of a larger video analysis algorithm. It identifies changes in frame-wise labels and calculates scores for those changes based on input_np data associated with the actions_dict[labels]. The scores are then appended to the 'scores' list, while starts and ends lists keep track of the start and end indices for each identified change. Finally, if the last label is not in bg_class, it adds an ending index to the 'ends' list and calculates a score using input_np data associated with starts and ends indices. The code then updates boundary_score_ptr and proceeds to the next iteration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":35-56", + "content": " if frame_wise_labels[0] not in bg_class:\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n if frame_wise_labels[i] not in bg_class:\n labels.append(frame_wise_labels[i])\n starts.append(i)\n if last_label not in bg_class:\n ends.append(i)\n score = np.mean(\n input_np[actions_dict[labels[boundary_score_ptr]], \\\n starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)]\n )\n scores.append(score)\n boundary_score_ptr = boundary_score_ptr + 1\n last_label = frame_wise_labels[i]\n if last_label not in bg_class:\n ends.append(i + 1)\n score = np.mean(\n input_np[actions_dict[labels[boundary_score_ptr]], \\\n starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)]" + }, + { + "comment": "The first code chunk is a function that takes a list of frame-wise labels, iterates over the frames, and returns lists for the label names, starting indices, and ending indices. It appends new label names to the labels list and new starting indices to the starts list when a new label appears, and adds corresponding ending indices to the ends list if the previous label was not \"background\" or \"None\". The last label's ending index is added after the loop.\n\nThe second code chunk defines a function that calculates the Levenshtein distance between two strings (p and y) using dynamic programming, which measures the minimum number of operations required to transform one string into another (insertion, deletion, or substitution). The function creates a 2D array D of size (m_row + 1) x (n_col + 1), where m_row is the length of p and n_col is the length of y. It then fills the array using dynamic programming, considering different operations at each step to calculate the minimum distance. 
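A compact sketch of the normalized edit (Levenshtein) score described above: `D[i][j]` is the minimum number of insert/delete/substitute operations turning the first `i` predicted segment labels into the first `j` ground-truth labels. The example labels are made up.

```python
import numpy as np


def edit_score(pred_labels, gt_labels):
    m, n = len(pred_labels), len(gt_labels)
    D = np.zeros((m + 1, n + 1))
    D[:, 0] = np.arange(m + 1)
    D[0, :] = np.arange(n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if pred_labels[i - 1] == gt_labels[j - 1] else 1
            D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, D[i - 1, j - 1] + cost)
    return (1 - D[m, n] / max(m, n)) * 100  # normalized, higher is better


print(edit_score(["walk", "run", "sit"], ["walk", "sit"]))  # ~66.7: "run" must be deleted
```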
This function likely uses the D array for further calculations or returns it as a result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":57-90", + "content": " )\n scores.append(score)\n boundary_score_ptr = boundary_score_ptr + 1\n return labels, starts, ends, scores\ndef get_labels_start_end_time(frame_wise_labels,\n bg_class=[\"background\", \"None\"]):\n labels = []\n starts = []\n ends = []\n last_label = frame_wise_labels[0]\n if frame_wise_labels[0] not in bg_class:\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n if frame_wise_labels[i] not in bg_class:\n labels.append(frame_wise_labels[i])\n starts.append(i)\n if last_label not in bg_class:\n ends.append(i)\n last_label = frame_wise_labels[i]\n if last_label not in bg_class:\n ends.append(i + 1)\n return labels, starts, ends\ndef levenstein(p, y, norm=False):\n m_row = len(p)\n n_col = len(y)\n D = np.zeros([m_row + 1, n_col + 1], np.float)\n for i in range(m_row + 1):" + }, + { + "comment": "The code contains a function called \"levenstein\" that calculates the Levenstein distance between two sequences. The Levenstein distance is a metric used to measure the difference between two strings of characters, such as words or labels. In this case, it's used to compare the recognized and ground truth labels for video segmentation. The function takes in a pair of lists P and Y representing recognized and ground truth labels respectively, and an optional norm parameter which normalizes the score if True. The output is a single numeric value representing the distance between the two lists of labels. This score can be used to evaluate the accuracy of the recognition algorithm in comparison with the ground truth data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":91-125", + "content": " D[i, 0] = i\n for i in range(n_col + 1):\n D[0, i] = i\n for j in range(1, n_col + 1):\n for i in range(1, m_row + 1):\n if y[j - 1] == p[i - 1]:\n D[i, j] = D[i - 1, j - 1]\n else:\n D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,\n D[i - 1, j - 1] + 1)\n if norm:\n score = (1 - D[-1, -1] / max(m_row, n_col)) * 100\n else:\n score = D[-1, -1]\n return score\ndef edit_score(recognized,\n ground_truth,\n norm=True,\n bg_class=[\"background\", \"None\"]):\n P, _, _ = get_labels_start_end_time(recognized, bg_class)\n Y, _, _ = get_labels_start_end_time(ground_truth, bg_class)\n return levenstein(P, Y, norm)\ndef f_score(recognized, ground_truth, overlap, bg_class=[\"background\", \"None\"]):\n p_label, p_start, p_end = get_labels_start_end_time(recognized, bg_class)\n y_label, y_start, y_end = get_labels_start_end_time(ground_truth, bg_class)\n tp = 0\n fp = 0" + }, + { + "comment": "The code calculates the precision, recall, and F1 score for image segmentation by iterating through predicted and ground truth labels. It assigns hits when there is a match between predictions and ground truth, and counts true positives (TP), false positives (FP), and false negatives (FN). 
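The segmental F1@overlap idea described here can be shown in isolation: a predicted segment counts as a true positive if its IoU with an unmatched ground-truth segment of the same label reaches the overlap threshold. This sketch mirrors the `f_score` logic described above; the sample segments are invented.

```python
import numpy as np


def segmental_f1(pred, gt, overlap=0.5):
    """pred/gt: lists of (label, start, end) segments."""
    tp = fp = 0
    hits = np.zeros(len(gt), dtype=bool)
    for p_label, p_start, p_end in pred:
        ious = []
        for g_label, g_start, g_end in gt:
            inter = max(min(p_end, g_end) - max(p_start, g_start), 0)
            union = max(p_end, g_end) - min(p_start, g_start)
            ious.append(inter / union * (p_label == g_label))
        best = int(np.argmax(ious))
        if ious[best] >= overlap and not hits[best]:
            tp, hits[best] = tp + 1, True
        else:
            fp += 1
    fn = len(gt) - hits.sum()
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    return 2 * precision * recall / max(precision + recall, 1e-8)


pred = [("run", 0, 40), ("sit", 45, 90)]
gt = [("run", 0, 50), ("sit", 50, 90)]
print(round(segmental_f1(pred, gt, overlap=0.5), 3))  # 1.0 for this example
```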
The boundary_AR function takes in predicted and ground truth boundaries, sorts them based on scores, and calculates various metrics for image segmentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":127-160", + "content": " hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(\n p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(p_start[j], y_start)\n IoU = (1.0 * intersection / union) * (\n [p_label[j] == y_label[x] for x in range(len(y_label))])\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n return float(tp), float(fp), float(fn)\ndef boundary_AR(pred_boundary, gt_boundary, overlap_list, max_proposal):\n p_label, p_start, p_end, p_scores = pred_boundary\n y_label, y_start, y_end, _ = gt_boundary\n # sort proposal\n pred_dict = {\n \"label\": p_label,\n \"start\": p_start,\n \"end\": p_end,\n \"scores\": p_scores\n }\n pdf = pd.DataFrame(pred_dict)\n pdf = pdf.sort_values(by=\"scores\", ascending=False)" + }, + { + "comment": "This code segment handles the refinement of proposals in an object detection model. If the number of proposals is less than the maximum allowed, it repeats the last proposal to meet the requirement. If there are more proposals than the maximum allowed, it discards extra proposals. The code then calculates the average recall (AR) by iterating over the overlap list and counting true positives (tp) and false positives (fp). It also initializes hits for each proposal in the ground truth labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":161-190", + "content": " p_label = list(pdf[\"label\"])\n p_start = list(pdf[\"start\"])\n p_end = list(pdf[\"end\"])\n p_scores = list(pdf[\"scores\"])\n # refine AN\n if len(p_label) < max_proposal and len(p_label) > 0:\n p_label = p_label + [p_label[-1]] * (max_proposal - len(p_label))\n p_start = p_start + [p_start[-1]] * (max_proposal - len(p_start))\n p_start = p_start + p_start[len(p_start) -\n (max_proposal - len(p_start)):]\n p_end = p_end + [p_end[-1]] * (max_proposal - len(p_end))\n p_scores = p_scores + [p_scores[-1]] * (max_proposal - len(p_scores))\n elif len(p_label) > max_proposal:\n p_label[max_proposal:] = []\n p_start[max_proposal:] = []\n p_end[max_proposal:] = []\n p_scores[max_proposal:] = []\n t_AR = np.zeros(len(overlap_list))\n for i in range(len(overlap_list)):\n overlap = overlap_list[i]\n tp = 0\n fp = 0\n hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(" + }, + { + "comment": "This code calculates the Average Recall (AR) for video segmentation models. It iterates through ground truth and predicted labels, calculates intersection over union (IoU), counts true positives (tp), false positives (fp), and false negatives (fn). Then it computes recall and averages them to obtain AR. 
The SegmentationMetric class initializes with various parameters for the metric calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":191-229", + "content": " p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(\n p_start[j], y_start)\n IoU = (1.0 * intersection / union)\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n recall = float(tp) / (float(tp) + float(fn))\n t_AR[i] = recall\n AR = np.mean(t_AR)\n return AR\n@METRIC.register\nclass SegmentationMetric(BaseMetric):\n \"\"\"\n Test for Video Segmentation based model.\n \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n overlap,\n actions_map_file_path,\n log_interval=1,\n tolerance=5,\n boundary_threshold=0.7,\n max_proposal=100):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)" + }, + { + "comment": "This code initializes a SegmentationMetric object, reads an actions map file, and prepares to update metrics during each iteration. It calculates true positives (cls_tp), false positives (cls_fp), and false negatives (cls_fn) for each frame's overlap. The AR_at_AN is also initialized with empty lists for each max_proposal value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":230-263", + "content": " # actions dict generate\n file_ptr = open(actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])\n # cls score\n self.overlap = overlap\n self.overlap_len = len(overlap)\n self.cls_tp = np.zeros(self.overlap_len)\n self.cls_fp = np.zeros(self.overlap_len)\n self.cls_fn = np.zeros(self.overlap_len)\n self.total_correct = 0\n self.total_edit = 0\n self.total_frame = 0\n self.total_video = 0\n # boundary score\n self.max_proposal = max_proposal\n self.AR_at_AN = [[] for _ in range(max_proposal)]\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n groundTruth = data[1]\n predicted = outputs['predict']\n output_np = outputs['output_np']\n outputs_np = predicted.numpy()\n outputs_arr = output_np.numpy()[0, :]" + }, + { + "comment": "This code segment compares ground truth and predicted actions for a video. 
It converts the numpy arrays to lists, generates predicted and ground truth boundaries using the `get_labels_scores_start_end_time` function, and then initializes variables for accuracy calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":264-294", + "content": " gt_np = groundTruth.numpy()[0, :]\n recognition = []\n for i in range(outputs_np.shape[0]):\n recognition = np.concatenate((recognition, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(outputs_np[i])]\n ]))\n recog_content = list(recognition)\n gt_content = []\n for i in range(gt_np.shape[0]):\n gt_content = np.concatenate((gt_content, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(gt_np[i])]\n ]))\n gt_content = list(gt_content)\n pred_boundary = get_labels_scores_start_end_time(\n outputs_arr, recog_content, self.actions_dict)\n gt_boundary = get_labels_scores_start_end_time(\n np.ones(outputs_arr.shape), gt_content, self.actions_dict)\n # cls score\n correct = 0\n total = 0\n edit = 0\n for i in range(len(gt_content)):\n total += 1\n #accumulate" + }, + { + "comment": "This code calculates segmentation metrics, including accuracy, false positives, and false negatives for video frames. It also keeps track of total correct predictions, edit distances, proposal scores, and accumulates these metrics per video. The `accumulate` function is used to calculate classification accuracy when all iterations are finished.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":295-329", + "content": " self.total_frame += 1\n if gt_content[i] == recog_content[i]:\n correct += 1\n #accumulate\n self.total_correct += 1\n edit_num = edit_score(recog_content, gt_content)\n edit += edit_num\n self.total_edit += edit_num\n for s in range(self.overlap_len):\n tp1, fp1, fn1 = f_score(recog_content, gt_content, self.overlap[s])\n # accumulate\n self.cls_tp[s] += tp1\n self.cls_fp[s] += fp1\n self.cls_fn[s] += fn1\n # accumulate\n self.total_video += 1\n # proposal score\n for AN in range(self.max_proposal):\n AR = boundary_AR(pred_boundary,\n gt_boundary,\n self.overlap,\n max_proposal=(AN + 1))\n self.AR_at_AN[AN].append(AR)\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n # cls metric\n Acc = 100 * float(self.total_correct) / self.total_frame" + }, + { + "comment": "This code calculates segmentation metrics, including Edit distance, F1 score at different overlap levels, and proposal area under the curve (AUC). It then stores these values in a dictionary and computes average AUCs for different overlap thresholds. The code also calculates an ensemble metric based on accuracy (Acc) and Edit distance. 
Finally, it logs this information as a string.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":330-355", + "content": " Edit = (1.0 * self.total_edit) / self.total_video\n Fscore = dict()\n for s in range(self.overlap_len):\n precision = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fp[s])\n recall = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fn[s])\n f1 = 2.0 * (precision * recall) / (precision + recall)\n f1 = np.nan_to_num(f1) * 100\n Fscore[self.overlap[s]] = f1\n # proposal metric\n proposal_AUC = np.array(self.AR_at_AN) * 100\n AUC = np.mean(proposal_AUC)\n AR_at_AN1 = np.mean(proposal_AUC[0, :])\n AR_at_AN5 = np.mean(proposal_AUC[4, :])\n AR_at_AN15 = np.mean(proposal_AUC[14, :])\n # log metric\n log_mertic_info = \"dataset model performence: \"\n # preds ensemble\n log_mertic_info += \"Acc: {:.4f}, \".format(Acc)\n log_mertic_info += 'Edit: {:.4f}, '.format(Edit)\n for s in range(len(self.overlap)):\n log_mertic_info += 'F1@{:0.2f}: {:.4f}, '.format(\n self.overlap[s], Fscore[self.overlap[s]])" + }, + { + "comment": "This code calculates and logs various segmentation metrics, including AUC, AR@AN1, AR@AN5, and AR@AN15. It also updates the metric dictionary with F1 scores for different overlap thresholds and clears the classifier statistics for the next epoch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":357-384", + "content": " # boundary metric\n log_mertic_info += \"Auc: {:.4f}, \".format(AUC)\n log_mertic_info += \"AR@AN1: {:.4f}, \".format(AR_at_AN1)\n log_mertic_info += \"AR@AN5: {:.4f}, \".format(AR_at_AN5)\n log_mertic_info += \"AR@AN15: {:.4f}, \".format(AR_at_AN15)\n logger.info(log_mertic_info)\n # log metric\n metric_dict = dict()\n metric_dict['Acc'] = Acc\n metric_dict['Edit'] = Edit\n for s in range(len(self.overlap)):\n metric_dict['F1@{:0.2f}'.format(\n self.overlap[s])] = Fscore[self.overlap[s]]\n metric_dict['Auc'] = AUC\n metric_dict['AR@AN1'] = AR_at_AN1\n metric_dict['AR@AN5'] = AR_at_AN5\n metric_dict['AR@AN15'] = AR_at_AN15\n # clear for next epoch\n # cls\n self.cls_tp = np.zeros(self.overlap_len)\n self.cls_fp = np.zeros(self.overlap_len)\n self.cls_fn = np.zeros(self.overlap_len)\n self.total_correct = 0\n self.total_edit = 0\n self.total_frame = 0\n self.total_video = 0" + }, + { + "comment": "This code initializes the attribute \"AR_at_AN\" as a list of empty lists, with the length equal to the maximum number of proposals. This is done within the context of a class method and the list will likely be used for storing evaluation metrics related to these proposals. 
The method then returns a dictionary (metric_dict) containing other potentially calculated metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/segmentation_metric.py\":385-388", + "content": " # proposal\n self.AR_at_AN = [[] for _ in range(self.max_proposal)]\n return metric_dict" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5cbc0b06-bf90-4596-b043-079aaad0a819.json b/docs/doc/5cbc0b06-bf90-4596-b043-079aaad0a819.json new file mode 100644 index 000000000..0e263d2dc --- /dev/null +++ b/docs/doc/5cbc0b06-bf90-4596-b043-079aaad0a819.json @@ -0,0 +1,55 @@ +{ + "summary": "This code sets up PaddleVideo model inference, performs tests with MKLDNN or float point precision, iterates through thread settings and precisions, logs results, configures and builds PaddleVideo, sets OpenCV, CUDA, CUDNN directories, checks GPUID, runs inference tests on a list of model directories.", + "details": [ + { + "comment": "This script uses Bash to parse input file lines, extracting model information and inference parameters for C++ models. It sources a common function script and then proceeds to parse each line of the input file into various variables like model name, OpenCV usage, C++ inference model directory list, inference command, GPU/MKLDNN/CPU thread settings, etc. These parsed values are stored in different variables for further use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":0-28", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nFILENAME=$1\nMODE=$2\ndataline=$(awk 'NR==1, NR==18{print}' $FILENAME)\n# parser params\nIFS=$'\\n'\nlines=(${dataline})\n# parser cpp inference model\nmodel_name=$(func_parser_value \"${lines[1]}\")\nuse_opencv=$(func_parser_value \"${lines[2]}\")\ncpp_infer_model_dir_list=$(func_parser_value \"${lines[3]}\")\ncpp_infer_is_quant=$(func_parser_value \"${lines[4]}\")\n# parser cpp inference\ninference_cmd=$(func_parser_value \"${lines[5]}\")\ncpp_use_gpu_key=$(func_parser_key \"${lines[6]}\")\ncpp_use_gpu_list=$(func_parser_value \"${lines[6]}\")\ncpp_use_mkldnn_key=$(func_parser_key \"${lines[7]}\")\ncpp_use_mkldnn_list=$(func_parser_value \"${lines[7]}\")\ncpp_cpu_threads_key=$(func_parser_key \"${lines[8]}\")\ncpp_cpu_threads_list=$(func_parser_value \"${lines[8]}\")\ncpp_batch_size_key=$(func_parser_key \"${lines[9]}\")\ncpp_batch_size_list=$(func_parser_value \"${lines[9]}\")\ncpp_use_trt_key=$(func_parser_key \"${lines[10]}\")\ncpp_use_trt_list=$(func_parser_value \"${lines[10]}\")\ncpp_precision_key=$(func_parser_key \"${lines[11]}\")" + }, + { + "comment": "This code is setting up variables for running a PaddleVideo model inference using C++. It sets the precision list, infer model key, image directory key and value, and other keys and values required for the benchmarking process. 
The code also creates a log path for storing results of the C++ inference and prepares to loop through possible GPU usage and MKLDNN configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":29-57", + "content": "cpp_precision_list=$(func_parser_value \"${lines[11]}\")\ncpp_infer_model_key=$(func_parser_key \"${lines[12]}\")\ncpp_image_dir_key=$(func_parser_key \"${lines[13]}\")\ncpp_infer_img_dir=$(func_parser_value \"${lines[13]}\")\ncpp_infer_key1=$(func_parser_key \"${lines[14]}\")\ncpp_infer_value1=$(func_parser_value \"${lines[14]}\")\ncpp_benchmark_key=$(func_parser_key \"${lines[15]}\")\ncpp_benchmark_value=$(func_parser_value \"${lines[15]}\")\ncpp_infer_key2=$(func_parser_key \"${lines[16]}\")\ncpp_infer_value2=$(func_parser_value \"${lines[16]}\")\ncpp_infer_key3=$(func_parser_key \"${lines[17]}\")\ncpp_infer_value3=$(func_parser_value \"${lines[17]}\")\nLOG_PATH=\"./test_tipc/output/${model_name}/${MODE}\"\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_cpp.log\"\nfunction func_cpp_inference(){\n IFS='|'\n _script=$1\n _model_dir=$2\n _log_path=$3\n _img_dir=$4\n _flag_quant=$5\n # inference\n for use_gpu in ${cpp_use_gpu_list[*]}; do\n if [ ${use_gpu} = \"False\" ] || [ ${use_gpu} = \"cpu\" ]; then\n for use_mkldnn in ${cpp_use_mkldnn_list[*]}; do" + }, + { + "comment": "This code checks if MKLDNN is not being used and quantized precision is true. If so, it continues without executing the loop. Otherwise, it iterates through different thread settings, batch sizes, and precisions to execute inference tests on CPU using MKLDNN (if enabled) or float point precision (default). Logs are saved with details of parameters used for each run.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":58-71", + "content": " if [ ${use_mkldnn} = \"False\" ] && [ ${_flag_quant} = \"True\" ]; then\n continue\n fi\n for threads in ${cpp_cpu_threads_list[*]}; do\n for batch_size in ${cpp_batch_size_list[*]}; do\n precision=\"fp32\"\n if [ ${use_mkldnn} = \"False\" ] && [ ${_flag_quant} = \"True\" ]; then\n precison=\"int8\"\n fi\n _save_log_path=\"${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${cpp_image_dir_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${cpp_benchmark_key}\" \"${cpp_benchmark_value}\")\n set_batchsize=$(func_set_params \"${cpp_batch_size_key}\" \"${batch_size}\")\n set_cpu_threads=$(func_set_params \"${cpp_cpu_threads_key}\" \"${threads}\")" + }, + { + "comment": "This code is iterating over different model names and configurations, setting various parameters such as GPU usage and thread count. It then executes a command to run inference on the model and saves the log file. 
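The CPU branch of the shell function sweeps every combination of MKLDNN setting, thread count and batch size, skipping the mkldnn=False plus quantized-model case. A Python illustration of that sweep (the actual script is bash; the list values are examples):

```python
from itertools import product

use_mkldnn_list = [True, False]  # example values parsed from the config line
cpu_threads_list = [1, 6]
batch_size_list = [1, 2]
flag_quant = False

for use_mkldnn, threads, batch_size in product(use_mkldnn_list,
                                               cpu_threads_list,
                                               batch_size_list):
    if not use_mkldnn and flag_quant:
        continue  # quantized models are not run without MKLDNN
    precision = "fp32"  # the skipped quantized case would use int8
    log = (f"cpp_infer_cpu_usemkldnn_{use_mkldnn}_threads_{threads}"
           f"_precision_{precision}_batchsize_{batch_size}.log")
    print(log)
```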
The script checks the status of the execution and logs any errors or warnings for debugging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":72-84", + "content": " set_model_dir=$(func_set_params \"${cpp_infer_model_key}\" \"${_model_dir}\")\n set_infer_params1=$(func_set_params \"${cpp_infer_key1}\" \"${cpp_infer_value1}\")\n set_infer_params2=$(func_set_params \"${cpp_infer_key2}\" \"${cpp_infer_value2}\")\n set_infer_params3=$(func_set_params \"${cpp_infer_key3}\" \"${cpp_infer_value3}\")\n command=\"${_script} ${cpp_use_gpu_key}=${use_gpu} ${cpp_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"\n done\n done\n done\n elif [ ${use_gpu} = \"True\" ] || [ ${use_gpu} = \"gpu\" ]; then" + }, + { + "comment": "The code snippet is performing nested loops to iterate over different combinations of TensorRT (TRT) usage and precision options. It checks specific conditions using if statements, such as avoiding quantized precision with non-quantized flag set or excluding certain combinations based on TRT and precision values. Finally, it sets variables for the log path, input data parameters, benchmark value, and batch size before potentially executing further code within these loops.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":85-100", + "content": " for use_trt in ${cpp_use_trt_list[*]}; do\n for precision in ${cpp_precision_list[*]}; do\n if [[ ${_flag_quant} = \"False\" ]] && [[ ${precision} =~ \"int8\" ]]; then\n continue\n fi\n if [[ ${precision} =~ \"fp16\" || ${precision} =~ \"int8\" ]] && [ ${use_trt} = \"False\" ]; then\n continue\n fi\n if [[ ${use_trt} = \"False\" || ${precision} =~ \"int8\" ]] && [ ${_flag_quant} = \"True\" ]; then\n continue\n fi\n for batch_size in ${cpp_batch_size_list[*]}; do\n _save_log_path=\"${_log_path}/cpp_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log\"\n set_infer_data=$(func_set_params \"${cpp_image_dir_key}\" \"${_img_dir}\")\n set_benchmark=$(func_set_params \"${cpp_benchmark_key}\" \"${cpp_benchmark_value}\")\n set_batchsize=$(func_set_params \"${cpp_batch_size_key}\" \"${batch_size}\")" + }, + { + "comment": "The code is setting parameters for a TensorRT inference script. It assigns values to various keys and directories before executing the script and saving the output log file. 
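The GPU branch described here effectively sweeps `use_trt` × `precision` × `batch_size` and skips invalid combinations; a rough Python equivalent of those three skip rules (the option lists and the quantization flag below are made up for illustration) is:

```python
from itertools import product

# Hypothetical option lists; in the script they come from the parsed config file.
use_trt_list = ["False", "True"]
precision_list = ["fp32", "fp16", "int8"]
batch_size_list = ["1", "2"]
flag_quant = "False"

for use_trt, precision, batch_size in product(use_trt_list, precision_list, batch_size_list):
    # Same skip rules as the quoted GPU branch.
    if flag_quant == "False" and "int8" in precision:
        continue
    if ("fp16" in precision or "int8" in precision) and use_trt == "False":
        continue
    if (use_trt == "False" or "int8" in precision) and flag_quant == "True":
        continue
    print(f"cpp_infer_gpu_usetrt_{use_trt}_precision_{precision}_batchsize_{batch_size}.log")
```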
The last status of the command execution is checked, and the log file is displayed if no issues occurred.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":101-111", + "content": " set_tensorrt=$(func_set_params \"${cpp_use_trt_key}\" \"${use_trt}\")\n set_precision=$(func_set_params \"${cpp_precision_key}\" \"${precision}\")\n set_model_dir=$(func_set_params \"${cpp_infer_model_key}\" \"${_model_dir}\")\n set_infer_params1=$(func_set_params \"${cpp_infer_key1}\" \"${cpp_infer_value1}\")\n set_infer_params2=$(func_set_params \"${cpp_infer_key2}\" \"${cpp_infer_value2}\")\n set_infer_params3=$(func_set_params \"${cpp_infer_key3}\" \"${cpp_infer_value3}\")\n command=\"${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 \"\n eval $command\n last_status=${PIPESTATUS[0]}\n eval \"cat ${_save_log_path}\"\n status_check $last_status \"${command}\" \"${status_log}\" \"${model_name}\"" + }, + { + "comment": "The code checks if the current hardware supports CPU and GPU, and if not, it prints a message. If the OpenCV library is missing or outdated, it downloads the latest version and builds it. It then sets up the installation path for the built OpenCV library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":113-145", + "content": " done\n done\n done\n else\n echo \"Does not support hardware other than CPU and GPU Currently!\"\n fi\n done\n}\ncd deploy/cpp_infer\nif [ ${use_opencv} = \"True\" ]; then\n if [ -d \"opencv-3.4.7/opencv3/\" ] && [ $(md5sum opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = \"faa2b5950f8bee3f03118e600c74746a\" ];then\n echo \"################### build opencv skipped ###################\"\n else\n echo \"################### building opencv ###################\"\n rm -rf opencv-3.4.7.tar.gz opencv-3.4.7/\n wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/opencv-3.4.7.tar.gz\n tar -xf opencv-3.4.7.tar.gz\n cd opencv-3.4.7/\n install_path=$(pwd)/opencv3\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DCMAKE_INSTALL_PREFIX=${install_path} \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DBUILD_SHARED_LIBS=OFF \\\n -DWITH_IPP=OFF \\\n -DBUILD_IPP_IW=OFF \\" + }, + { + "comment": "This code sets various CMake flags to configure the build process, then proceeds with making and installing the required libraries. 
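The OpenCV step boils down to "skip the build if the extracted directory exists and the tarball's md5 matches, otherwise re-download and rebuild"; a small Python sketch of that gate (the helper function is hypothetical, the expected hash is the one hard-coded in the script):

```python
import hashlib
import os

EXPECTED_MD5 = "faa2b5950f8bee3f03118e600c74746a"

def needs_rebuild(tarball="opencv-3.4.7.tar.gz", built_dir="opencv-3.4.7/opencv3/"):
    # Rebuild if the built tree or the tarball is missing, or the tarball changed.
    if not (os.path.isdir(built_dir) and os.path.exists(tarball)):
        return True
    with open(tarball, "rb") as f:
        return hashlib.md5(f.read()).hexdigest() != EXPECTED_MD5

if needs_rebuild():
    print("################### building opencv ###################")
else:
    print("################### build opencv skipped ###################")
```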
It checks if a directory exists, downloads necessary files if needed, and finally starts building the PaddleVideo demo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":146-177", + "content": " -DWITH_LAPACK=OFF \\\n -DWITH_EIGEN=OFF \\\n -DCMAKE_INSTALL_LIBDIR=lib64 \\\n -DWITH_ZLIB=ON \\\n -DBUILD_ZLIB=ON \\\n -DWITH_JPEG=ON \\\n -DBUILD_JPEG=ON \\\n -DWITH_PNG=ON \\\n -DBUILD_PNG=ON \\\n -DWITH_TIFF=ON \\\n -DBUILD_TIFF=ON \\\n -DWITH_FFMPEG=ON\n make -j\n make install\n cd ../\n echo \"################### building opencv finished ###################\"\n fi\nfi\nif [ !-d \"paddle_inference\" ]; then\n echo \"################### download inference lib skipped ###################\"\nelse\n echo \"################### downloading inference lib ###################\"\n wget -nc https://paddle-inference-lib.bj.bcebos.com/2.1.1-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddle_inference.tgz\n tar -xf paddle_inference.tgz\n echo \"################### downloading inference lib finished ###################\"\nfi\necho \"################### building PaddleVideo demo ####################\"\nif [ ${use_opencv} = \"True\" ]; then" + }, + { + "comment": "This code is configuring and building PaddleVideo, setting up OpenCV, CUDA, and CUDNN directories, and preparing for running test inference. It also checks if GPUID is set and sets the CUDA_VISIBLE_DEVICES environment variable accordingly. Finally, it loops through a list of model directories to run inference tests.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":178-224", + "content": " OPENCV_DIR=$(pwd)/opencv-3.4.7/opencv3\nelse\n OPENCV_DIR=''\nfi\nLIB_DIR=$(pwd)/paddle_inference\nCUDA_LIB_DIR=$(dirname `find /usr -name libcudart.so`)\nCUDNN_LIB_DIR=$(dirname `find /usr -name libcudnn.so`)\nBUILD_DIR=build\nrm -rf ${BUILD_DIR}\nmkdir ${BUILD_DIR}\ncd ${BUILD_DIR}\ncmake .. \\\n -DPADDLE_LIB=${LIB_DIR} \\\n -DWITH_MKL=ON \\\n -DWITH_GPU=OFF \\\n -DWITH_STATIC_LIB=OFF \\\n -DWITH_TENSORRT=OFF \\\n -DOPENCV_DIR=${OPENCV_DIR} \\\n -DCUDNN_LIB=${CUDNN_LIB_DIR} \\\n -DCUDA_LIB=${CUDA_LIB_DIR} \\\n -DTENSORRT_DIR=${TENSORRT_DIR} \\\nmake -j\ncd ../../../\necho \"################### building PaddleVideo demo finished ###################\"\n# set cuda device\nGPUID=$2\nif [ ${#GPUID} -le 0 ];then\n env=\" \"\nelse\n env=\"export CUDA_VISIBLE_DEVICES=${GPUID}\"\nfi\nset CUDA_VISIBLE_DEVICES\neval $env\necho \"################### running test ###################\"\nexport Count=0\nIFS=\"|\"\ninfer_quant_flag=(${cpp_infer_is_quant})\nfor infer_model in ${cpp_infer_model_dir_list[*]}; do\n #run inference\n is_quant=${infer_quant_flag[Count]}" + }, + { + "comment": "This code snippet is calling a function \"func_cpp_inference\" to execute inference commands, incrementing the Count variable on each iteration of a loop. 
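The final loop simply pairs each entry of the model-directory list with the quantization flag at the same position and hands both to `func_cpp_inference`; in Python terms (lists below are hypothetical):

```python
# Hypothetical lists -- in the script both come from the parsed config file,
# split on "|" (cpp_infer_model_dir_list and cpp_infer_is_quant).
model_dir_list = ["./inference/ppTSM", "./inference/ppTSM_KL_quant"]
infer_quant_flag = ["False", "True"]

for count, infer_model in enumerate(model_dir_list):
    is_quant = infer_quant_flag[count]
    print(f"func_cpp_inference({infer_model!r}, is_quant={is_quant})")
```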
The function is called with input parameters for the command, model path, log path, image directory, and a quantization flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_inference_cpp.sh\":225-227", + "content": " func_cpp_inference \"${inference_cmd}\" \"${infer_model}\" \"${LOG_PATH}\" \"${cpp_infer_img_dir}\" ${is_quant}\n Count=$(($Count + 1))\ndone" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5cf2faea-1634-403f-86a4-7941b7b7f934.json b/docs/doc/5cf2faea-1634-403f-86a4-7941b7b7f934.json new file mode 100644 index 000000000..c4e8ef558 --- /dev/null +++ b/docs/doc/5cf2faea-1634-403f-86a4-7941b7b7f934.json @@ -0,0 +1,20 @@ +{ + "summary": "The Python class FeatureDataset, part of the PaddleVideo library, initializes attributes and provides methods for action recognition tasks. The code also includes a prepare_test function to prepare data for testing by applying a pipeline and checking 'iou_norm' results.", + "details": [ + { + "comment": "This code is a Python class named FeatureDataset, which is a subclass of BaseDataset. It appears to be part of the PaddleVideo library and is used for action recognition tasks. The class has an __init__ method that initializes various attributes such as file_path, pipeline, data_prefix, test_mode, and suffix. The class is registered in the DATASETS registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/feature.py\":0-35", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os.path as osp\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\n@DATASETS.register()\nclass FeatureDataset(BaseDataset):\n \"\"\"Feature dataset for action recognition\n Example:(TODO)\n Args:(TODO)\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n data_prefix=None,\n test_mode=False,\n suffix=None,\n ):\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)" + }, + { + "comment": "The code defines two methods. The `load_file` method reads an index file and retrieves video information by parsing each line, stripping whitespace, splitting the filename, and optionally appending a specified suffix or joining with a data prefix. It returns a list of dictionaries containing the filenames. The `prepare_train` method takes an index and prepares training/validation data using a specified pipeline function. 
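As a worked example of the `load_file` logic just described (the index line, prefix and suffix are hypothetical):

```python
import os.path as osp

line = "part1/video_0001 1\n"               # hypothetical index-file line
data_prefix, suffix = "/data/features", ".pkl"

filename = line.strip().split()[0]           # "part1/video_0001"
filename = osp.join(data_prefix, filename)   # "/data/features/part1/video_0001"
filename = filename + suffix                 # "/data/features/part1/video_0001.pkl"
info = [dict(filename=filename)]
```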
If 'iou_norm' is present in the results, it returns multiple data types (e.g., rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask) along with labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/feature.py\":37-62", + "content": " def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n filename = line.strip().split()[0]\n if self.data_prefix is not None:\n filename = osp.join(self.data_prefix, filename)\n if self.suffix is not None:\n filename = filename + self.suffix\n info.append(dict(filename=filename))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n if 'iou_norm' in results:\n return results['rgb_data'], results['rgb_len'], results[\n 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results[\n 'labels'], results['iou_norm']\n else:\n return results['rgb_data'], results['rgb_len'], results[" + }, + { + "comment": "The code defines a function prepare_test that prepares data for testing. It creates a deep copy of the dataset information at the given index, applies a pipeline to it and then checks if 'iou_norm' is in the results. If it is, it returns 7 elements including 'rgb_data', 'audio_data', 'labels' and 'iou_norm'. Otherwise, it returns 6 elements without 'iou_norm'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/feature.py\":63-79", + "content": " 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results['labels']\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for testing given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n if 'iou_norm' in results:\n return results['rgb_data'], results['rgb_len'], results[\n 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results[\n 'labels'], results['iou_norm']\n else:\n return results['rgb_data'], results['rgb_len'], results[\n 'rgb_mask'], results['audio_data'], results[\n 'audio_len'], results['audio_mask'], results['labels']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5d9caa45-642a-4200-943c-4e886465283e.json b/docs/doc/5d9caa45-642a-4200-943c-4e886465283e.json new file mode 100644 index 000000000..c5640f8b3 --- /dev/null +++ b/docs/doc/5d9caa45-642a-4200-943c-4e886465283e.json @@ -0,0 +1,40 @@ +{ + "summary": "This code uses OpenCV and other libraries, processes video frames in batches with PaddleVideo's Recognition class, enables benchmarking if set, and handles main function execution and program termination.", + "details": [ + { + "comment": "This code file contains copyright information, license details, and includes necessary header files for OpenCV, Google Logging, GFlags, and other utilities. It also includes the header file for video_rec and utility functions. This seems to be part of a larger codebase related to video processing or analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp\":0-34", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \"glog/logging.h\"\n#include \"omp.h\"\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \"auto_log/autolog.h\"" + }, + { + "comment": "This code defines various parameters for an inference process. The use_gpu flag determines if GPU or CPU is used, gpu_id specifies the device id of the GPU, gpu_mem sets the GPU id for inferencing with GPU, cpu_threads indicates the number of threads for CPU usage, enable_mkldnn enables MKL-DNN for CPU operations, use_tensorrt utilizes TensorRT, precision selects the desired precision format (fp32/fp16/int8), benchmark tracks inference timings, and video recognition parameters include the input video directory, model path, model name, number of frames per segment, and batch number.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp\":36-53", + "content": "// general parameters\nDEFINE_bool(use_gpu, false, \"Infering with GPU or CPU.\");\nDEFINE_int32(gpu_id, 0, \"Device id of GPU to execute.\");\nDEFINE_int32(gpu_mem, 4000, \"GPU id when infering with GPU.\");\nDEFINE_int32(cpu_threads, 10, \"Num of threads with CPU.\");\nDEFINE_bool(enable_mkldnn, false, \"Whether use mkldnn with CPU.\");\nDEFINE_bool(use_tensorrt, false, \"Whether use tensorrt.\");\nDEFINE_string(precision, \"fp32\", \"Precision be one of fp32/fp16/int8.\");\nDEFINE_bool(benchmark, true, \"Whether to log and report benchmark information during inference.\");\n// video recognition related\nDEFINE_string(video_dir, \"\", \"Dir of input video(s).\");\nDEFINE_string(rec_model_dir, \"../example_video_dir\", \"Path of video rec inference model.\");\nDEFINE_string(inference_model_name, \"ppTSM\", \"The name of the model used in the prediction.\");\nDEFINE_int32(num_seg, 8, \"number of frames input to model, which are extracted from a video.\");\nDEFINE_int32(seg_len, 1, \"number of frames from a segment.\");\nDEFINE_int32(rec_batch_num, 1, \"rec_batch_num.\");" + }, + { + "comment": "Initializing a video recognition object and processing each video in batches.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp\":54-84", + "content": "DEFINE_string(char_list_file, \"../../data/k400/Kinetics-400_label_list.txt\", \"Path of dictionary.\");\nusing namespace std;\nusing namespace cv;\nusing namespace PaddleVideo;\nstatic bool PathExists(const std::string& path)\n{\n#ifdef _WIN32\n struct _stat buffer;\n return (_stat(path.c_str(), &buffer) == 0);\n#else\n struct stat buffer;\n return (stat(path.c_str(), &buffer) == 0);\n#endif // !_WIN32\n}\nint main_rec(std::vector &cv_all_video_names)\n{\n std::vector time_info = {0, 0, 0}; // Statement time statistics vector\n VideoRecognizer rec(FLAGS_rec_model_dir, FLAGS_inference_model_name, 
FLAGS_use_gpu, FLAGS_num_seg,\n FLAGS_rec_batch_num, FLAGS_gpu_id,\n FLAGS_gpu_mem, FLAGS_cpu_threads,\n FLAGS_enable_mkldnn, FLAGS_char_list_file,\n FLAGS_use_tensorrt, FLAGS_precision); // Instantiate a video recognition object\n int batch_num = FLAGS_rec_batch_num;\n for (int i = 0, n = cv_all_video_names.size(); i < n; i += batch_num) // Process each video" + }, + { + "comment": "This code is processing a batch of video frames using PaddleVideo's Recognition class. It initializes time consumption statistics, then runs the recognition method on each frame within the specified batch and stores the results in `time_info`. Additionally, it enables benchmarking if FLAGS_benchmark flag is set.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp\":85-108", + "content": " {\n int start_idx = i;\n int end_idx = min(i + batch_num, n);\n std::vector > frames_batch;\n for (int j = start_idx; j < end_idx; ++j)\n {\n std::vector frames = Utility::SampleFramesFromVideo(cv_all_video_names[i], FLAGS_num_seg, FLAGS_seg_len);\n frames_batch.emplace_back(frames);\n }\n std::vector rec_times; // Initialization time consumption statistics\n // Take the read several video frames and send them to the run method of the recognition class to predict\n rec.Run(std::vector(cv_all_video_names.begin() + start_idx, cv_all_video_names.begin() + end_idx), frames_batch, &rec_times);\n time_info[0] += rec_times[0];\n time_info[1] += rec_times[1];\n time_info[2] += rec_times[2];\n }\n if (FLAGS_benchmark)\n {\n AutoLogger autolog(\"rec\",\n FLAGS_use_gpu,\n FLAGS_use_tensorrt,\n FLAGS_enable_mkldnn," + }, + { + "comment": "This code segment is checking the parameters for running the video inference. If it's in recording mode, it ensures that both rec_model_dir and video_dir are not empty. It also checks if the precision specified (fp32, fp16, or int8) is valid. If any error is found, it displays an appropriate usage message and exits with an error code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp\":109-137", + "content": " FLAGS_cpu_threads,\n FLAGS_rec_batch_num,\n \"dynamic\",\n FLAGS_precision,\n time_info,\n cv_all_video_names.size()); // Generate detailed information on the run\n autolog.report(); // Print running details\n }\n return 0;\n}\nvoid check_params(char* mode)\n{\n if (strcmp(mode, \"rec\") == 0)\n {\n std::cout << \"[\" << FLAGS_rec_model_dir << \"]\" << std::endl;\n std::cout << \"[\" << FLAGS_video_dir << \"]\" << std::endl;\n if (FLAGS_rec_model_dir.empty() || FLAGS_video_dir.empty())\n {\n std::cout << \"Usage[rec]: ./ppvideo --rec_model_dir=/PATH/TO/REC_INFERENCE_MODEL/ \"\n << \"--video_dir=/PATH/TO/INPUT/VIDEO/\" << std::endl;\n exit(1);\n }\n }\n if (FLAGS_precision != \"fp32\" && FLAGS_precision != \"fp16\" && FLAGS_precision != \"int8\")\n {\n cout << \"precison should be 'fp32'(default), 'fp16' or 'int8'. \" << endl;" + }, + { + "comment": "The code checks the user input and ensures the correct mode (\"rec\") is chosen. If not, it outputs an error message and returns -1. It also validates if the video directory exists and displays the total number of videos found. 
Finally, it calls the main_rec function for recording mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp\":138-169", + "content": " exit(1);\n }\n}\nint main(int argc, char **argv)\n{\n if (argc <= 1 || (strcmp(argv[1], \"rec\") != 0)) //Get user input and check\n {\n std::cout << \"Please choose one mode of [rec] !\" << std::endl;\n return -1;\n }\n std::cout << \"mode: \" << argv[1] << endl; // Type of inference task required for output\n // Parsing command-line\n google::ParseCommandLineFlags(&argc, &argv, true);\n check_params(argv[1]);\n if (!PathExists(FLAGS_video_dir)) // Determine whether the directory where the video exists\n {\n std::cerr << \"[ERROR] video path not exist! video_dir: \" << FLAGS_video_dir << endl;\n exit(1);\n }\n std::vector cv_all_video_names; // Store all video paths\n cv::glob(FLAGS_video_dir, cv_all_video_names); // Search all videos under FLAGS_video_dir, save in cv_all_video_names\n std::cout << \"total videos num: \" << cv_all_video_names.size() << endl; // \u8f93\u51fa\u641c\u7d22\u5230\u7684\u89c6\u9891\u4e2a\u6570\n if (strcmp(argv[1], \"rec\") == 0)\n {\n return main_rec(cv_all_video_names); // Output the number of videos searched" + }, + { + "comment": "The code snippet represents the end of the main function where a closing curly brace is followed by a return statement, indicating successful execution and termination of the program.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/main.cpp\":170-172", + "content": " }\n return 0;\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5f429bc7-5b87-405c-8393-ba3eef352764.json b/docs/doc/5f429bc7-5b87-405c-8393-ba3eef352764.json new file mode 100644 index 000000000..e1d3dfbf3 --- /dev/null +++ b/docs/doc/5f429bc7-5b87-405c-8393-ba3eef352764.json @@ -0,0 +1,20 @@ +{ + "summary": "The `DepthMetric` class inherits from `BaseMetric`, processes batches, accumulates metrics and performs distributed all-reduce operations before averaging metric values.", + "details": [ + { + "comment": "This code defines a class `DepthMetric` that inherits from `BaseMetric`. It initializes lists for various metric values and then updates these metrics during each iteration. 
The code also includes logic to handle distributed computing, using all-reduce operation to average the results across different processes in the same training job.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/depth_metric.py\":0-33", + "content": "import numpy as np\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom .base import BaseMetric\nfrom .registry import METRIC\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass DepthMetric(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.abs_rel = []\n self.sq_rel = []\n self.rmse = []\n self.rmse_log = []\n self.a1 = []\n self.a2 = []\n self.a3 = []\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = outputs['abs_rel'], outputs['sq_rel'], outputs['rmse'], \\\n outputs['rmse_log'], outputs['a1'], outputs['a2'],outputs['a3']\n # preds ensemble\n if self.world_size > 1:\n abs_rel = paddle.distributed.all_reduce(\n outputs['abs_rel'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size" + }, + { + "comment": "This code performs distributed all-reduce operations on several metrics (sq_rel, rmse, rmse\\_log, a1, a2, a3) and calculates their average values by dividing by the world size. These averaged metric values are then appended to corresponding lists (abs_rel, sq_rel, rmse, rmse_log).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/depth_metric.py\":34-56", + "content": " sq_rel = paddle.distributed.all_reduce(\n outputs['sq_rel'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n rmse = paddle.distributed.all_reduce(\n outputs['rmse'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n rmse_log = paddle.distributed.all_reduce(\n outputs['rmse_log'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n a1 = paddle.distributed.all_reduce(\n outputs['a1'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n a2 = paddle.distributed.all_reduce(\n outputs['a2'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n a3 = paddle.distributed.all_reduce(\n outputs['a3'],\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.abs_rel.append(abs_rel)\n self.sq_rel.append(sq_rel)\n self.rmse.append(rmse)\n self.rmse_log.append(rmse_log)" + }, + { + "comment": "This code defines a class with methods for processing batches and accumulating metrics. The `process_batch` method appends data to lists, logs progress if the batch ID is divisible by log_interval, and handles the next batch. 
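Stripped of the distributed details, the update/accumulate pattern is just "append one scalar per metric per batch, then average at the end"; a toy sketch with made-up numbers:

```python
import numpy as np

abs_rel, rmse = [], []
for batch_metrics in [{"abs_rel": 0.12, "rmse": 4.8},
                      {"abs_rel": 0.10, "rmse": 5.1}]:
    abs_rel.append(batch_metrics["abs_rel"])
    rmse.append(batch_metrics["rmse"])

print(np.mean(np.array(abs_rel)), np.mean(np.array(rmse)))   # ~0.11  ~4.95
```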
The `accumulate` method calculates mean values for each metric list and logs them using a logger with the corresponding metric values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/depth_metric.py\":57-76", + "content": " self.a1.append(a1)\n self.a2.append(a2)\n self.a3.append(a3)\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n logger.info(\n '[TEST] finished, abs_rel= {}, sq_rel= {} , rmse= {}, rmse_log= {},'\n 'a1= {}, a2= {}, a3= {}'.format(np.mean(np.array(self.abs_rel)),\n np.mean(np.array(self.sq_rel)),\n np.mean(np.array(self.rmse)),\n np.mean(np.array(self.rmse_log)),\n np.mean(np.array(self.a1)),\n np.mean(np.array(self.a2)),\n np.mean(np.array(self.a3))))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5f54ee4a-1c2d-458b-9a28-bddedb1dd5af.json b/docs/doc/5f54ee4a-1c2d-458b-9a28-bddedb1dd5af.json new file mode 100644 index 000000000..08cfbfd92 --- /dev/null +++ b/docs/doc/5f54ee4a-1c2d-458b-9a28-bddedb1dd5af.json @@ -0,0 +1,35 @@ +{ + "summary": "This code defines a `TSN_ResNet` class for creating Temporal Segment Network ResNet models in PaddlePaddle, using bottleneck_block function and performs adaptive average pooling, reshaping, and activation functions.", + "details": [ + { + "comment": "This code defines a class `TSN_ResNet` for creating a Temporal Segment Network ResNet model. It has parameters such as layers, segment number, training flag and extractor flag. The class contains a method `conv_bn_layer()` to create a convolution-batch normalization layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport time\nimport sys\nimport paddle\nimport paddle.static as static\nimport math\nclass TSN_ResNet():\n def __init__(self,\n layers=50,\n seg_num=7,\n is_training=True,\n is_extractor=False):\n self.layers = layers\n self.seg_num = seg_num\n self.is_training = is_training\n self.is_extractor = is_extractor\n def conv_bn_layer(self," + }, + { + "comment": "This function defines a convolutional layer and returns it after passing through a batch normalization layer. It takes input, number of filters, filter size, stride, groups (number of groups in the layers), activation function if any, and name as arguments. If the name is \"conv1\", the bn_name would be \"bn_conv1\" else, it would be \"bn0\", followed by the original name. 
The batch normalization layer takes input, activation function if any, whether it's in test mode or not, scale and offset attribute names for parameters, and names for moving mean and variance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py\":34-62", + "content": " input,\n num_filters,\n filter_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n conv = paddle.static.nn.conv2d(\n input=input,\n num_filters=num_filters,\n filter_size=filter_size,\n stride=stride,\n padding=(filter_size - 1) // 2,\n groups=groups,\n param_attr=paddle.ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n return paddle.static.nn.batch_norm(\n input=conv,\n act=act,\n is_test=(not self.is_training),\n param_attr=paddle.ParamAttr(name=bn_name + \"_scale\"),\n bias_attr=paddle.ParamAttr(bn_name + '_offset'),\n moving_mean_name=bn_name + \"_mean\",\n moving_variance_name=bn_name + '_variance')" + }, + { + "comment": "This code defines two functions: 'shortcut' and 'bottleneck_block'. The shortcut function determines if input dimensions match the desired output, and returns either a convolution-batch normalization layer or the input itself. The bottleneck_block function applies two consecutive 1x1 and 3x3 convolutions with batch normalization and ReLU activations in between.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py\":64-85", + "content": " def shortcut(self, input, ch_out, stride, name):\n ch_in = input.shape[1]\n if ch_in != ch_out or stride != 1:\n return self.conv_bn_layer(input, ch_out, 1, stride, name=name)\n else:\n return input\n def bottleneck_block(self, input, num_filters, stride, name):\n conv0 = self.conv_bn_layer(input=input,\n num_filters=num_filters,\n filter_size=1,\n act='relu',\n name=name + \"_branch2a\")\n conv1 = self.conv_bn_layer(input=conv0,\n num_filters=num_filters,\n filter_size=3,\n stride=stride,\n act='relu',\n name=name + \"_branch2b\")\n conv2 = self.conv_bn_layer(input=conv1,\n num_filters=num_filters * 4,\n filter_size=1," + }, + { + "comment": "The code defines a function `net` that takes an input, performs operations based on the specified number of layers (50, 101 or 152), and reshapes the input. It then applies different configurations of convolutional and batch normalization layers to the input for each specified layer. 
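The reshape at the start of `net()` folds the segment dimension into the batch so a plain 2-D backbone can process every frame, and the segment dimension is recovered again before the consensus average; a numpy sketch of that bookkeeping (the shapes and the fake 2048-d features are illustrative only):

```python
import numpy as np

N, seg_num, C, H, W = 2, 7, 3, 224, 224
clips = np.random.rand(N, seg_num, C, H, W).astype("float32")

frames = clips.reshape(-1, C, H, W)        # (N*seg_num, C, H, W): segments folded into batch

feat_dim = 2048                            # stand-in for backbone + global average pool output
frame_feats = np.random.rand(frames.shape[0], feat_dim).astype("float32")

video_feats = frame_feats.reshape(N, seg_num, feat_dim).mean(axis=1)   # (N, feat_dim) consensus
```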
The final output is the sum of two previous calculations (conv and short).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py\":86-117", + "content": " act=None,\n name=name + \"_branch2c\")\n short = self.shortcut(input,\n num_filters * 4,\n stride,\n name=name + \"_branch1\")\n return paddle.add(x=short, y=conv2)\n def net(self, input, class_dim=101):\n layers = self.layers\n seg_num = self.seg_num\n supported_layers = [50, 101, 152]\n assert layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(supported_layers, layers)\n # reshape input\n channels = input.shape[2]\n short_size = input.shape[3]\n input = paddle.reshape(\n x=input, shape=[-1, channels, short_size, short_size])\n if layers == 50:\n depth = [3, 4, 6, 3]\n elif layers == 101:\n depth = [3, 4, 23, 3]\n elif layers == 152:\n depth = [3, 8, 36, 3]\n num_filters = [64, 128, 256, 512]\n conv = self.conv_bn_layer(input=input," + }, + { + "comment": "This code defines a ResNet model with multiple convolutional layers and pooling operations. It uses the PaddlePaddle library and includes a bottleneck_block function for the residual blocks. The number of filters, filter size, and stride are defined based on the layer and depth.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py\":118-141", + "content": " num_filters=64,\n filter_size=7,\n stride=2,\n act='relu',\n name='conv1')\n conv = paddle.nn.functional.max_pool2d(x=conv,\n kernel_size=3,\n stride=2,\n padding=1)\n for block in range(len(depth)):\n for i in range(depth[block]):\n if layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n conv = self.bottleneck_block(\n input=conv,\n num_filters=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1," + }, + { + "comment": "This code performs adaptive average pooling, reshapes the feature map, and if not an extractor, calculates the mean along axis 1. Then, it applies a softmax activation function and returns the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py\":142-160", + "content": " name=conv_name)\n pool = paddle.nn.functional.adaptive_avg_pool2d(x=conv, output_size=1)\n feature = paddle.reshape(x=pool,\n shape=[-1, seg_num, pool.shape[1]])\n if self.is_extractor:\n out = feature\n else:\n out = paddle.mean(x=feature, axis=1)\n stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)\n out = static.nn.fc(\n x=out,\n size=class_dim,\n activation='softmax',\n weight_attr=paddle.ParamAttr(\n initializer=paddle.nn.initializer.Uniform(low=-stdv, high=stdv)))\n return out" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5fab68b5-7be8-46d4-bb8b-df4716f7fbde.json b/docs/doc/5fab68b5-7be8-46d4-bb8b-df4716f7fbde.json new file mode 100644 index 000000000..e13e63aba --- /dev/null +++ b/docs/doc/5fab68b5-7be8-46d4-bb8b-df4716f7fbde.json @@ -0,0 +1,10 @@ +{ + "summary": "This line imports all functions, classes, and variables from the \"util\" module located in the same directory as this file. 
It allows easy access to all utility functions defined in the \"util\" module without explicitly specifying each function or variable.", + "details": [ + { + "comment": "This line imports all functions, classes, and variables from the \"util\" module located in the same directory as this file. It allows easy access to all utility functions defined in the \"util\" module without explicitly specifying each function or variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/__init__.py\":0-0", + "content": "from .util import *" + } + ] +} \ No newline at end of file diff --git a/docs/doc/60407808-dbe0-46dc-9baa-bd1bbef9e95c.json b/docs/doc/60407808-dbe0-46dc-9baa-bd1bbef9e95c.json new file mode 100644 index 000000000..c61c85e24 --- /dev/null +++ b/docs/doc/60407808-dbe0-46dc-9baa-bd1bbef9e95c.json @@ -0,0 +1,20 @@ +{ + "summary": "PaddleVideo is a Python library for advanced video processing, featuring industry-specific models and data production to deployment pipeline support. The documentation includes sections on distillation, inference deployment, datasets, application scenarios, and licensing information (Apache 2.0).", + "details": [ + { + "comment": "PaddleVideo is a Python library for advanced video processing, providing extensive and cutting-edge tools to assist researchers and industry professionals in the field of computer vision. The recent updates include an open-source video annotation tool (BILS), a lightweight action recognition model (PP-TSMv2), knowledge distillation functionality, transformer-based models, and single-stage action detection models (YOWO).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/README.md\":0-21", + "content": "[English](README_en.md) | \u4e2d\u6587\n# PaddleVideo\n![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![paddle version](https://img.shields.io/badge/PaddlePaddle-2.3.1-blue)\n## \u7b80\u4ecb\nPaddleVideo\u65e8\u5728\u6253\u9020\u4e00\u5957\u4e30\u5bcc\u3001\u9886\u5148\u4e14\u5b9e\u7528\u7684Video\u5de5\u5177\u5e93\uff0c\u65e8\u5728\u5e2e\u52a9\u5f00\u53d1\u8005\u66f4\u597d\u7684\u8fdb\u884c\u89c6\u9891\u9886\u57df\u7684\u5b66\u672f\u7814\u7a76\u548c\u4ea7\u4e1a\u5b9e\u8df5\u3002\n
\n## \u8fd1\u671f\u66f4\u65b0\n- \u5f00\u6e90\u89c6\u9891\u6807\u6ce8\u5de5\u5177\ud83c\udf1f[BILS](./docs/zh-CN/annotation_tools.md)\uff0c\u6b22\u8fce\u4e0b\u8f7d\u5b89\u88c5\u5305\u4f53\u9a8c\uff5e\n- \u53d1\u5e03\u8f7b\u91cf\u5316\u884c\u4e3a\u8bc6\u522b\u6a21\u578b**\ud83d\udd25[PP-TSMv2](./docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md)**, Kinetics-400\u7cbe\u5ea675.16%\uff0c25fps\u768410s\u89c6\u9891cpu\u63a8\u7406\u65f6\u95f4\u4ec5\u9700456ms.\u5404\u6a21\u578b\u6027\u80fd\u5bf9\u6bd4[benchmark](./docs/zh-CN/benchmark.md).\n- \u65b0\u589e[\u77e5\u8bc6\u84b8\u998f](./docs/zh-CN/distillation.md)\u529f\u80fd.\n- \u65b0\u589e\u57fa\u4e8etransformer\u7684\u884c\u4e3a\u8bc6\u522b\u6a21\u578b[TokenShift](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md).\n- \u65b0\u589e\u57fa\u4e8e\u9aa8\u9abc\u70b9\u7684\u884c\u4e3a\u8bc6\u522b\u6a21\u578b[2s-ACGN](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/agcn2s.md)\u3001[CTR-GCN](./docs/zh-CN/model_zoo/recognition/ctrgcn.md).\n- \u65b0\u589e\u5355\u9636\u6bb5\u65f6\u7a7a\u52a8\u4f5c\u68c0\u6d4b\u6a21\u578b[YOWO](./docs/zh-CN/model_zoo/localization/yowo.md)." + }, + { + "comment": "This code is for PaddleVideo, a series of industry-level video technology and application case courses. It supports various video cutting-edge algorithms, creates industry-specific models PP-TSM and PP-TSMv2, and covers the entire data production, model training, compression, and deployment pipeline. The code provides quick start instructions, scene application examples, and documentation for tutorials on different topics such as recognition, model library, and model compression. It also includes links to join discussion groups and course replay.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/README.md\":24-57", + "content": "\ud83d\udc40 \ud83c\udf1f **\u300a\u4ea7\u4e1a\u7ea7\u89c6\u9891\u6280\u672f\u4e0e\u5e94\u7528\u6848\u4f8b\u300b\u7cfb\u5217\u8bfe\u7a0b\u56de\u653e\u94fe\u63a5**: https://aistudio.baidu.com/aistudio/course/introduce/6742 \ud83c\udf1f\n\u200b\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \ud83d\udc96 **\u6b22\u8fce\u5927\u5bb6\u626b\u7801\u5165\u7fa4\u8ba8\u8bba** \ud83d\udc96\n
\n- \u6dfb\u52a0\u6210\u529f\u540e\u56de\u590d\u3010\u89c6\u9891\u3011\u52a0\u5165\u4ea4\u6d41\u7fa4\n## \u7279\u6027\n\u652f\u6301\u591a\u79cdVideo\u76f8\u5173\u524d\u6cbf\u7b97\u6cd5\uff0c\u5728\u6b64\u57fa\u7840\u4e0a\u6253\u9020\u4ea7\u4e1a\u7ea7\u7279\u8272\u6a21\u578b[PP-TSM](docs/zh-CN/model_zoo/recognition/pp-tsm.md)\u548c[PP-TSMv2](docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md)\uff0c\u5e76\u6253\u901a\u6570\u636e\u751f\u4ea7\u3001\u6a21\u578b\u8bad\u7ec3\u3001\u538b\u7f29\u3001\u9884\u6d4b\u90e8\u7f72\u5168\u6d41\u7a0b\u3002\n
\n## \u5feb\u901f\u5f00\u59cb\n- \u4e00\u884c\u547d\u4ee4\u5feb\u901f\u4f7f\u7528: [\u5feb\u901f\u5f00\u59cb](./docs/zh-CN/quick_start.md)\n## \u573a\u666f\u5e94\u7528\nPaddleVideo\u573a\u666f\u5e94\u7528\u8986\u76d6\u4f53\u80b2\u3001\u4e92\u8054\u7f51\u3001\u5de5\u4e1a\u3001\u533b\u7597\u884c\u4e1a\uff0c\u5728PP-TSM\u7684\u57fa\u7840\u80fd\u529b\u4e4b\u4e0a\uff0c\u4ee5\u6848\u4f8b\u7684\u5f62\u5f0f\u5c55\u793a\u5229\u7528\u573a\u666f\u6570\u636e\u5fae\u8c03\u3001\u6a21\u578b\u4f18\u5316\u65b9\u6cd5\u3001\u6570\u636e\u589e\u5e7f\u7b49\u5185\u5bb9\uff0c\u4e3a\u5f00\u53d1\u8005\u5b9e\u9645\u843d\u5730\u63d0\u4f9b\u793a\u8303\u4e0e\u542f\u53d1\u3002\u8be6\u60c5\u53ef\u67e5\u770b[\u5e94\u7528](./applications/)\u3002\n## \u6587\u6863\u6559\u7a0b\n- [\u5feb\u901f\u5f00\u59cb](./docs/zh-CN/quick_start.md)\n- [\u5b89\u88c5\u8bf4\u660e](./docs/zh-CN/install.md)\n- [\u8bad\u7ec3/\u6d4b\u8bd5/\u63a8\u7406\u5168\u6d41\u7a0b\u4f7f\u7528\u6307\u5357](./docs/zh-CN/usage.md)\n- [PP-TSM\u884c\u4e3a\u8bc6\u522b\ud83d\udd25](./docs/zh-CN/model_zoo/recognition/pp-tsm.md)\n - [\u6a21\u578b\u5e93](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#7)\n - [\u6a21\u578b\u8bad\u7ec3](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#4)\n - [\u6a21\u578b\u538b\u7f29](./deploy/slim/)\n - [\u6a21\u578b\u91cf\u5316](./deploy/slim/readme.md)" + }, + { + "comment": "This code provides a table of contents for the PaddleVideo documentation, including sections on distillation, inference deployment using Python and C++ engines, server-side deployment, converting to ONNX models, state-of-the-art algorithms and models, datasets, application scenarios, data labeling tools, competition support, contributing code, and licensing information (Apache 2.0).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/README.md\":58-74", + "content": " - [\u77e5\u8bc6\u84b8\u998f](./docs/zh-CN/distillation.md)\n - [\u63a8\u7406\u90e8\u7f72](./deploy/)\n - [\u57fa\u4e8ePython\u9884\u6d4b\u5f15\u64ce\u63a8\u7406](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#62)\n - [\u57fa\u4e8eC++\u9884\u6d4b\u5f15\u64ce\u63a8\u7406](./deploy/cpp_infer/readme.md)\n - [\u670d\u52a1\u7aef\u90e8\u7f72](./deploy/python_serving/readme.md)\n - [Paddle2ONNX\u6a21\u578b\u8f6c\u5316\u4e0e\u9884\u6d4b](./deploy/paddle2onnx/readme.md)\n - [Benchmark](./docs/zh-CN/benchmark.md)\n- [\u524d\u6cbf\u7b97\u6cd5\u4e0e\u6a21\u578b](./docs/zh-CN/model_zoo/README.md)\ud83d\ude80\n- [\u6570\u636e\u96c6](./docs/zh-CN/dataset/README.md)\n- [\u573a\u666f\u5e94\u7528](./applications/README.md)\n- [\u6570\u636e\u6807\u6ce8](./docs/zh-CN/annotation_tools.md)\n- [\u8d5b\u4e8b\u652f\u6301](./docs/zh-CN/competition.md)\n- [\u8d21\u732e\u4ee3\u7801](./docs/zh-CN/contribute/README.md)\n## \u8bb8\u53ef\u8bc1\u4e66\n\u672c\u9879\u76ee\u7684\u53d1\u5e03\u53d7[Apache 2.0 license](LICENSE)\u8bb8\u53ef\u8ba4\u8bc1\u3002" + } + ] +} \ No newline at end of file diff --git a/docs/doc/60684849-b451-4aa4-b8c8-fb5805eb66d6.json b/docs/doc/60684849-b451-4aa4-b8c8-fb5805eb66d6.json new file mode 100644 index 000000000..424083619 --- /dev/null +++ b/docs/doc/60684849-b451-4aa4-b8c8-fb5805eb66d6.json @@ -0,0 +1,45 @@ +{ + "summary": "The code defines a ResNet-MANET model with BatchNorm, ReLU activation, and residual blocks using convolution, batch normalization, and max pooling layers. 
The model is initialized and processes input to obtain output and low-level features as JSON files.", + "details": [ + { + "comment": "This code defines the Bottleneck class for ResNet architecture, consisting of convolutional layers and batch normalization layers. It takes parameters such as inplanes, planes, stride, dilation, downsample, and BatchNorm for initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":0-30", + "content": "import paddle.nn as nn\n# from reprod_log.utils import paddle2np\nfrom EIVideo.paddlevideo.utils.manet_utils import fill_, zero_\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self,\n inplanes,\n planes,\n stride=1,\n dilation=1,\n downsample=None,\n BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)\n self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes,\n planes * 4,\n kernel_size=1,\n bias_attr=False)" + }, + { + "comment": "Class \"ResNet\" is a Residual Network backbone with multiple blocks and layers. It utilizes BatchNorm for normalization, ReLU as the activation function, and supports different output strides.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":31-74", + "content": " self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNet(nn.Layer):\n def __init__(self,\n block,\n layers,\n output_stride,\n BatchNorm,\n pretrained=None):\n self.inplanes = 64\n super(ResNet, self).__init__()\n blocks = [1, 2, 4]\n if output_stride == 16:\n strides = [1, 2, 2, 1]\n dilations = [1, 1, 1, 2]\n elif output_stride == 8:\n strides = [1, 2, 1, 1]" + }, + { + "comment": "The code defines a ResNet-MANET backbone model with BatchNorm and ReLU activation functions. It initializes convolution, batch normalization, ReLU, max pooling layers along with the first two residual blocks based on input parameters such as block type, number of channels, number of layers, and strides. 
Dilations are assigned based on the provided conditions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":75-100", + "content": " dilations = [1, 1, 2, 4]\n else:\n raise NotImplementedError\n # Modules\n self.conv1 = nn.Conv2D(3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block,\n 64,\n layers[0],\n stride=strides[0],\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block,\n 128,\n layers[1],\n stride=strides[1],\n dilation=dilations[1]," + }, + { + "comment": "This code defines a ResNet-MANET model, creating layers and functions for the network architecture. It includes the creation of three main layers (layer1, layer2, and layer3), using blocks and specific parameters such as stride and dilation. The _make_MG_unit function is used to create an additional MG unit in the layer4. Finally, the init_weight method initializes the weight for the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":101-126", + "content": " BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block,\n 256,\n layers[2],\n stride=strides[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.layer4 = self._make_MG_unit(block,\n 512,\n blocks=blocks,\n stride=strides[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self.init_weight()\n def _make_layer(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:" + }, + { + "comment": "This code defines a function _make_MG_unit that creates a residual block with downsampling for a ResNet model. The downsample operation is determined based on stride and inplanes values, and BatchNorm layer is optional.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":127-158", + "content": " downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, stride, dilation, downsample,\n BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(self.inplanes,\n planes,\n dilation=dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_MG_unit(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:" + }, + { + "comment": "This code defines a ResNet-MANET backbone model. It uses BatchNorm layers and block functions to create multiple convolutional layers with different dilation rates. 
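With `output_stride=8` the strides are [1, 2, 1, 1] and the last two stages rely on dilation instead of further downsampling, so the spatial bookkeeping for a hypothetical 512×512 input (the size used by the demo at the bottom of this file) works out as:

```python
H = W = 512                                    # assumed input size
low_level_feat = (1, 64 * 4, H // 4, W // 4)   # conv1 + maxpool give 1/4; layer1 outputs 256 ch
output = (1, 512 * 4, H // 8, W // 8)          # layer2 halves once more; dilated layer3/4 keep 1/8
print(low_level_feat, output)                  # (1, 256, 128, 128) (1, 2048, 64, 64)
```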
The forward function applies the first layer, batch normalization, and ReLU activation before returning the sequence of layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":159-190", + "content": " downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes,\n planes,\n stride,\n dilation=blocks[0] * dilation,\n downsample=downsample,\n BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, len(blocks)):\n layers.append(\n block(self.inplanes,\n planes,\n stride=1,\n dilation=blocks[i] * dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def forward(self, input):\n x = self.conv1(input)\n x = self.bn1(x)\n x = self.relu(x)" + }, + { + "comment": "This code defines a ResNet101 model with BatchNorm and outputs the features at different stages. It initializes the weights of convolutional layers, and builds a backbone based on output stride and pretrained parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":191-226", + "content": " x = self.maxpool(x)\n x = self.layer1(x)\n low_level_feat = x\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x, low_level_feat\n def init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n fill_(m.weight, 1)\n elif isinstance(m, nn.BatchNorm2D):\n fill_(m.weight, 1)\n zero_(m.bias)\n return self.sublayers()\ndef ResNet101(output_stride, BatchNorm, pretrained=None):\n \"\"\"Constructs a ResNet-101 model.\n Args:\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n \"\"\"\n model = ResNet(Bottleneck, [3, 4, 23, 3],\n output_stride,\n BatchNorm,\n pretrained=pretrained)\n return model\ndef build_backbone(output_stride, BatchNorm, pretrained):\n return ResNet101(output_stride, BatchNorm, pretrained)" + }, + { + "comment": "The code initializes a ResNet101 model, generates random input, passes it through the model to obtain output and low-level features, and saves them as JSON files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py\":229-244", + "content": "if __name__ == \"__main__\":\n import paddle\n model = ResNet101(BatchNorm=nn.BatchNorm2D,\n pretrained=True,\n output_stride=8)\n input = paddle.rand([1, 3, 512, 512])\n output, low_level_feat = model(input)\n print(output.shape)\n print(low_level_feat.shape)\n import json\n with open('output.txt', 'w') as f:\n json.dump(output.tolist(), f)\n with open('low_level_feat.txt', 'w') as f:\n json.dump(low_level_feat.tolist(), f)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/611e4ac9-9776-4ba4-882d-f3b68370ee66.json b/docs/doc/611e4ac9-9776-4ba4-882d-f3b68370ee66.json new file mode 100644 index 000000000..36b995fa1 --- /dev/null +++ b/docs/doc/611e4ac9-9776-4ba4-882d-f3b68370ee66.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet is the version information for the EIVideo application, created by Acer Zhang on January 11th, 2022. 
It has a version number \"0.1a\" and the author requests proper attribution if reusing the code.", + "details": [ + { + "comment": "This code snippet is the version information for the EIVideo application, created by Acer Zhang on January 11th, 2022. It has a version number \"0.1a\" and the author requests proper attribution if reusing the code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/version.py\":0-5", + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/11 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\n__version__ = \"0.1a\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/618b1404-93d6-41b8-9866-5e268b6b4b59.json b/docs/doc/618b1404-93d6-41b8-9866-5e268b6b4b59.json new file mode 100644 index 000000000..072ebb78c --- /dev/null +++ b/docs/doc/618b1404-93d6-41b8-9866-5e268b6b4b59.json @@ -0,0 +1,15 @@ +{ + "summary": "This Python class defines the \"MonoDataset\" for PaddleVideo, initializes with file path, data prefix, and pipeline support. The code contains `load_file`, `prepare_train`, and `prepare_test` methods for dataset preparation and information retrieval.", + "details": [ + { + "comment": "This code is a Python class defining the \"MonoDataset\" dataset for PaddleVideo. It requires file path, data prefix, and pipeline for initialization, supports retries when accessing files, and utilizes the pil_loader function for loading RGB images from file paths with specified suffixes. The code also registers MonoDataset with DATASETS registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/oxford.py\":0-36", + "content": "# Copyright Niantic 2019. Patent Pending. All rights reserved.\n#\n# This software is licensed under the terms of the Monodepth2 licence\n# which allows for non-commercial use only, the full terms of which are made\n# available in the LICENSE file.\nfrom __future__ import absolute_import, division, print_function\nimport copy\nfrom os import path as osp\nfrom PIL import Image\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\ndef pil_loader(path):\n # open path as file to avoid ResourceWarning\n # (https://github.com/python-pillow/Pillow/issues/835)\n with open(path, 'rb') as f:\n with Image.open(f) as img:\n return img.convert('RGB')\n@DATASETS.register()\nclass MonoDataset(BaseDataset):\n def __init__(self,\n file_path,\n data_prefix,\n pipeline,\n num_retries=0,\n suffix='.png',\n **kwargs):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, **kwargs)" + }, + { + "comment": "The code defines three methods: `load_file`, `prepare_train`, and `prepare_test`. The `load_file` method reads a file containing information about image files, stripping off newline characters, and appending the necessary file suffix. It then appends a dictionary to the `info` list with data path, filename, folder location, and frame index. 
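A worked example of that parsing, using a hypothetical Oxford RobotCar-style "folder/frame" index line:

```python
import os.path as osp

line = "2015-02-10-11-58-05/0000000042"        # hypothetical index line
data_prefix, suffix = "/data/oxford", ".png"

filename = line.strip() + suffix               # "2015-02-10-11-58-05/0000000042.png"
folder = osp.dirname(filename)                 # "2015-02-10-11-58-05"
frame_index = int(line.strip().split('/')[1])  # 42
entry = dict(data_path=data_prefix, filename=filename,
             folder=folder, frame_index=frame_index)
```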
The `prepare_train` and `prepare_test` methods copy an entry from `self.info` and apply a pipeline before returning relevant information (e.g., images, labels).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/oxford.py\":38-61", + "content": " def load_file(self):\n info = []\n with open(self.file_path, 'r') as f:\n for line in f:\n filename = line.strip() + self.suffix\n folder = osp.dirname(filename)\n frame_index = line.strip().split('/')[1]\n info.append(\n dict(data_path=self.data_prefix,\n filename=filename,\n folder=folder,\n frame_index=int(frame_index)))\n return info\n def prepare_train(self, idx):\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n results['imgs']['idx'] = idx\n return results['imgs'], results['day_or_night']\n def prepare_test(self, idx):\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n return results['imgs'], results['day_or_night']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/61a34886-574f-4ef7-b81c-18e6b5d404a3.json b/docs/doc/61a34886-574f-4ef7-b81c-18e6b5d404a3.json new file mode 100644 index 000000000..87ec4ce82 --- /dev/null +++ b/docs/doc/61a34886-574f-4ef7-b81c-18e6b5d404a3.json @@ -0,0 +1,10 @@ +{ + "summary": "This function builds a backbone network for the specified model (resnet, xception, drn, or mobilenet) with the given output stride and BatchNorm implementation.", + "details": [ + { + "comment": "This function builds a backbone network for the specified model (resnet, xception, drn, or mobilenet) with the given output stride and BatchNorm implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/__init__.py\":0-13", + "content": "from networks.backbone import resnet, xception, drn, mobilenet\ndef build_backbone(backbone, output_stride, BatchNorm):\n if backbone == 'resnet':\n return resnet.ResNet101(output_stride, BatchNorm)\n elif backbone == 'xception':\n return xception.AlignedXception(output_stride, BatchNorm)\n elif backbone == 'drn':\n return drn.drn_d_54(BatchNorm)\n elif backbone == 'mobilenet':\n return mobilenet.MobileNetV2(output_stride, BatchNorm)\n else:\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/61b4dc3e-6eac-40db-882c-b20a9c142305.json b/docs/doc/61b4dc3e-6eac-40db-882c-b20a9c142305.json new file mode 100644 index 000000000..31846324f --- /dev/null +++ b/docs/doc/61b4dc3e-6eac-40db-882c-b20a9c142305.json @@ -0,0 +1,35 @@ +{ + "summary": "This code defines a Paddle Inference engine class for the \"bmn infer\" application, performing action detection through averaging predictions, generating proposal results, and processing score outcomes. It initializes models, loads data, performs inference, saves results, outputs masks, and prints execution time.", + "details": [ + { + "comment": "This code defines a class `InferModel` for the \"bmn infer\" application. It imports necessary modules and utilities, sets up model configuration parameters from a JSON file, and initializes the Paddle Inference engine with specified model and parameter files. The GPU memory and device ID are also configured according to the input JSON file. 
Additionally, some threshold values for NMS (non-maximum suppression) and minimum prediction scores are set.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py\":0-36", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import process_proposal\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"bmn infer\"\"\"\n def __init__(self, cfg, name='BMN'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.nms_thread = cfg[name]['nms_thread']\n self.min_pred_score = cfg[name]['score_thread']\n self.min_frame_thread = cfg['COMMON']['fps']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true" + }, + { + "comment": "The code initializes a predictor and sets up input/output tensors for inferencing. It then runs the inference process, copying input data from CPU and output results to CPU, allowing for further processing or analysis. The generate_props function generates properties based on predictions, start, and end timestamps, with adjustable window size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py\":37-62", + "content": " config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n self.output3_tensor = self.predictor.get_output_handle(output_names[2])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n output3 = self.output3_tensor.copy_to_cpu()\n return output1, output2, output3\n def generate_props(self, pred_bmn, pred_start, pred_end, max_window=200, min_window=5):\n \"\"\"generate_props\"\"\"" + }, + { + "comment": "This code is calculating action boundaries from predicted start and end frames, along with a binary mask network (BMN) score. It extracts relevant data from the input, loops through the range of potential window sizes, checks if start and end indices fall within the video length and if boundary masks are activated. If these conditions are met, it calculates the confidence score and appends to the results list. 
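A compact sketch of the proposal-scoring rule just described (gate each candidate window by the start/end boundary masks, then multiply the start, end, and BMN-map scores); the toy shapes and random inputs are purely illustrative:

```python
import numpy as np

def score_proposals(pred_bmn, pred_start, pred_end, start_mask, end_mask,
                    min_window=5, max_window=200):
    # pred_bmn: [max_window, T] duration-by-start grid of boundary-matching scores
    video_len = pred_bmn.shape[-1]
    results = []
    for idx in range(min_window, max_window):       # candidate duration
        for jdx in range(video_len):                # candidate start frame
            start_index, end_index = jdx, jdx + idx
            if (end_index < video_len
                    and start_mask[start_index] == 1
                    and end_mask[end_index] == 1):
                conf = pred_start[start_index] * pred_end[end_index] * pred_bmn[idx, jdx]
                results.append([start_index, end_index, conf])
    return results

T = 12
rng = np.random.default_rng(0)
proposals = score_proposals(rng.random((6, T)), rng.random(T), rng.random(T),
                            np.ones(T), np.ones(T), min_window=2, max_window=6)
print(len(proposals), proposals[0])
```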
Finally, it returns the list of action boundaries with their respective scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py\":63-85", + "content": " video_len = min(pred_bmn.shape[-1], min(pred_start.shape[-1], pred_end.shape[-1]))\n pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :]\n start_mask = self.boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = self.boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_results = []\n for idx in range(min_window, max_window):\n for jdx in range(video_len):\n start_index = jdx\n end_index = start_index + idx\n if end_index < video_len and start_mask[start_index] == 1 and end_mask[end_index] == 1:\n xmin = start_index\n xmax = end_index\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]\n bmn_score = pred_bmn[idx, jdx]\n conf_score = xmin_score * xmax_score * bmn_score\n score_results.append([xmin, xmax, conf_score])\n return score_results\n def boundary_choose(self, score_list):" + }, + { + "comment": "This code defines a function that chooses the boundary based on score. The function then returns a mask representing the chosen boundary. The predict function reads data from an infer_reader and loops through each iteration, extracting inputs and feature information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py\":86-110", + "content": " \"\"\"boundary_choose\"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[0] for items in data]\n winds = [items[1] for items in data]\n feat_info = [items[2] for items in data]\n feature_T = feat_info[0][0]\n feature_N = feat_info[0][1]\n inputs = np.array(inputs)" + }, + { + "comment": "The code performs action detection by averaging predictions from multiple windows and then generates proposal results. It takes input, infers predictions for each window, sums the predictions within their corresponding windows, divides them by the count of frames in the window to get average predictions, and passes these averages to generate_props function to produce score_result. 
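A small numpy sketch of the overlap-averaging described above: per-window predictions are summed into full-length buffers and divided by a per-frame coverage count; the window layout is hypothetical:

```python
import numpy as np

feature_T = 10
windows = [(0, 6), (4, 10)]                      # hypothetical overlapping windows
preds = [np.full(6, 1.0), np.full(6, 3.0)]       # one prediction vector per window

sum_pred = np.zeros(feature_T)
sum_cnt = np.zeros(feature_T)
for (lo, hi), p in zip(windows, preds):
    sum_pred[lo:hi] += p
    sum_cnt[lo:hi] += 1.0

avg_pred = sum_pred / sum_cnt                    # frames covered twice get the mean
print(avg_pred)                                  # [1. 1. 1. 1. 2. 2. 3. 3. 3. 3.]
```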
The process_proposal function is then used to process the score result based on some parameters like minimum frame thread, nms thread, and minimum prediction score to obtain the final results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py\":111-130", + "content": " pred_bmn, pred_sta, pred_end = self.infer(inputs)\n if infer_iter == 0:\n sum_pred_bmn = np.zeros((2, feature_N, feature_T))\n sum_pred_sta = np.zeros((feature_T, ))\n sum_pred_end = np.zeros((feature_T, ))\n sum_pred_cnt = np.zeros((feature_T, ))\n for idx, sub_wind in enumerate(winds):\n sum_pred_bmn[:, :, sub_wind[0]: sub_wind[1]] += pred_bmn[idx]\n sum_pred_sta[sub_wind[0]: sub_wind[1]] += pred_sta[idx]\n sum_pred_end[sub_wind[0]: sub_wind[1]] += pred_end[idx]\n sum_pred_cnt[sub_wind[0]: sub_wind[1]] += np.ones((sub_wind[1] - sub_wind[0], ))\n pred_bmn = sum_pred_bmn / sum_pred_cnt\n pred_sta = sum_pred_sta / sum_pred_cnt\n pred_end = sum_pred_end / sum_pred_cnt\n score_result = self.generate_props(pred_bmn, pred_sta, pred_end)\n results = process_proposal(score_result, self.min_frame_thread, self.nms_thread, self.min_pred_score)" + }, + { + "comment": "This code initializes a model, loads data, and performs inference. It then saves the results to a JSON file and prints the time taken for execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py\":132-155", + "content": " return results\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n t0 = time.time()\n outputs = model.predict(cfg, video_features)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n results = {'proposal': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/628ec3ce-6382-4f0b-87a9-72ac9a842cb3.json b/docs/doc/628ec3ce-6382-4f0b-87a9-72ac9a842cb3.json new file mode 100644 index 000000000..1bf037810 --- /dev/null +++ b/docs/doc/628ec3ce-6382-4f0b-87a9-72ac9a842cb3.json @@ -0,0 +1,25 @@ +{ + "summary": "The code initializes a class for MS-TCN dataset, loads video features and labels for training or testing, and converts label data to integers using a dictionary mapping.", + "details": [ + { + "comment": "Imports required modules and registers a class for the MS-TCN dataset, a video dataset for action segmentation. The class initializes with file paths and other parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py\":0-37", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass MSTCNDataset(BaseDataset):\n \"\"\"Video dataset for action segmentation.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n feature_path,\n gt_path,\n actions_map_file_path,\n **kwargs," + }, + { + "comment": "This code initializes a class, likely for data loading in a video dataset. It takes file paths as parameters, reads an actions map file to create a dictionary of action classes and their corresponding labels. The class also has a method load_file() to read the index file, and a method prepare_train() to prepare training data given an index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py\":38-67", + "content": " ):\n super().__init__(file_path, pipeline, **kwargs)\n self.gt_path = gt_path\n self.actions_map_file_path = actions_map_file_path\n self.feature_path = feature_path\n # actions dict generate\n file_ptr = open(self.actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])\n self.num_classes = len(self.actions_dict.keys())\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n file_ptr = open(self.file_path, 'r')\n info = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID: Prepare data for training/valid given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)" + }, + { + "comment": "This code is loading video features and labels from a dataset, likely for training or testing purposes. It first checks the path of the feature file, then loads both the video feature and label data from specified paths. The code converts the label data into integer format using a dictionary mapping and performs some potential preprocessing steps (not shown here). 
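A minimal sketch of the action-map parsing and per-frame label conversion described above; the mapping-file lines and label sequence are hypothetical, while the "&lt;id&gt; &lt;name&gt;" split and dictionary lookup mirror the documented dataset code:

```python
import numpy as np

# Hypothetical mapping file contents: one "<id> <action_name>" pair per line
mapping_lines = ["0 background", "1 pour_milk", "2 stir"]
actions_dict = {line.split()[1]: int(line.split()[0]) for line in mapping_lines}

# Hypothetical per-frame ground-truth labels for one video
content = ["background", "pour_milk", "pour_milk", "stir"]
feat_len = 4                                   # number of feature frames available
classes = np.zeros(min(feat_len, len(content)), dtype="int64")
for i in range(len(classes)):
    classes[i] = actions_dict[content[i]]
print(classes)                                 # [0 1 1 2]
```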
Finally, it returns the video feature and label data for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py\":68-94", + "content": " #TODO: check path\n video_feat = np.load(feat_file_path)\n # load label\n target_file_path = os.path.join(self.gt_path, video_name)\n file_ptr = open(target_file_path, 'r')\n content = file_ptr.read().split('\\n')[:-1]\n classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64')\n for i in range(len(classes)):\n classes[i] = self.actions_dict[content[i]]\n # classes = classes * (-100)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_gt'] = copy.deepcopy(classes)\n results = self.pipeline(results)\n return results['video_feat'], results['video_gt']\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)\n #TODO: check path\n video_feat = np.load(feat_file_path)" + }, + { + "comment": "This function loads labels for a video dataset. It reads the label file, converts content to class numbers using actions_dict, assigns class values to classes array, scales the classes, and returns the feature and ground truth data for the video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py\":96-109", + "content": " # load label\n target_file_path = os.path.join(self.gt_path, video_name)\n file_ptr = open(target_file_path, 'r')\n content = file_ptr.read().split('\\n')[:-1]\n classes = np.zeros(min(np.shape(video_feat)[1], len(content)))\n for i in range(len(classes)):\n classes[i] = self.actions_dict[content[i]]\n # classes = classes * (-100)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_gt'] = copy.deepcopy(classes)\n results = self.pipeline(results)\n return results['video_feat'], results['video_gt']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/62d092fb-8910-4f90-ad82-f1ef3acdcaa3.json b/docs/doc/62d092fb-8910-4f90-ad82-f1ef3acdcaa3.json new file mode 100644 index 000000000..fa1d9cbc0 --- /dev/null +++ b/docs/doc/62d092fb-8910-4f90-ad82-f1ef3acdcaa3.json @@ -0,0 +1,80 @@ +{ + "summary": "The code introduces PPTSM-Mv3 backbone networks and MobileNetV3 models in PaddleVideo using PyTorch, with diverse parameters, weight initialization, pretrained model URLs, network configuration dictionaries. It also constructs CNN layers with Batch Normalization and builds the PPTSM-MV3 backbone model using temporal shifting, convolutions, SE modules, and implements Hardsigmoid function separately.", + "details": [ + { + "comment": "Copyright notice, license information, and reference to the associated research paper. The code imports necessary libraries and registers the backbone model within the PaddleVideo module registry. It also includes a function for weight initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":0-27", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# reference: https://arxiv.org/abs/1905.02244\nfrom __future__ import absolute_import, division, print_function\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear\nfrom paddle.regularizer import L2Decay\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt" + }, + { + "comment": "The code defines pretrained model URLs for MobileNetV3_small_x1_0 and MobileNetV3_large_x1_0, as well as lists of stages for each model. The MODEL_STAGES_PATTERN contains different depthwise blocks' parameters such as kernel size, channel numbers, activation function, and stride. NET_CONFIG is a dictionary containing configurations for specific network architectures with different parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":29-51", + "content": "# Download URL of pretrained model\n# MODEL_URLS = {\n# \"MobileNetV3_small_x1_0\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_ssld_pretrained.pdparams\",\n# \"MobileNetV3_large_x1_0\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_ssld_pretrained.pdparams\",\n# }\nMODEL_STAGES_PATTERN = {\n \"MobileNetV3_small\": [\"blocks[0]\", \"blocks[2]\", \"blocks[7]\", \"blocks[10]\"],\n \"MobileNetV3_large\":\n [\"blocks[0]\", \"blocks[2]\", \"blocks[5]\", \"blocks[11]\", \"blocks[14]\"]\n}\n# \"large\", \"small\" is just for MobinetV3_large, MobileNetV3_small respectively.\n# The type of \"large\" or \"small\" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s.\n# k: kernel_size\n# exp: middle channel number in depthwise block\n# c: output channel number in depthwise block\n# se: whether to use SE block\n# act: which activation to use\n# s: stride in depthwise block\nNET_CONFIG = {" + }, + { + "comment": "This code defines two versions of the PPTSM-Mv3 backbone network architecture for the PaddleVideo library: \"large\" and \"small\". The backbone is a series of convolutional layers, with different configurations specified by parameters k (kernel size), exp (expansion factor), c (number of channels), se (if using squeeze-and-excitation), act (activation function), and s (strides). The large version has more layers and higher capacities for learning, while the small version is optimized for inference speed. 
Each layer's configuration is defined in a list of lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":52-78", + "content": " \"large\": [\n # k, exp, c, se, act, s\n [3, 16, 16, False, \"relu\", 1],\n [3, 64, 24, False, \"relu\", 2],\n [3, 72, 24, False, \"relu\", 1],\n [5, 72, 40, True, \"relu\", 2],\n [5, 120, 40, True, \"relu\", 1],\n [5, 120, 40, True, \"relu\", 1],\n [3, 240, 80, False, \"hardswish\", 2],\n [3, 200, 80, False, \"hardswish\", 1],\n [3, 184, 80, False, \"hardswish\", 1],\n [3, 184, 80, False, \"hardswish\", 1],\n [3, 480, 112, True, \"hardswish\", 1],\n [3, 672, 112, True, \"hardswish\", 1],\n [5, 672, 160, True, \"hardswish\", 2],\n [5, 960, 160, True, \"hardswish\", 1],\n [5, 960, 160, True, \"hardswish\", 1],\n ],\n \"small\": [\n # k, exp, c, se, act, s\n [3, 16, 16, True, \"relu\", 2],\n [3, 72, 24, False, \"relu\", 2],\n [3, 88, 24, False, \"relu\", 1],\n [5, 96, 40, True, \"hardswish\", 2],\n [5, 240, 40, True, \"hardswish\", 1],\n [5, 240, 40, True, \"hardswish\", 1],\n [5, 120, 48, True, \"hardswish\", 1]," + }, + { + "comment": "This code defines the MobileNetV3 model with various parameters such as channel numbers, activation functions, and division rules for each layer. The class \"MobileNetV3\" is a custom PyTorch Layer that represents the network architecture, utilizing convolutional layers and activation functions like Hardswish or ReLU. The function \"_make_divisible\" ensures proper alignment of channel numbers with hardware considerations, while \"_create_act\" creates instances of the specified activation functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":79-117", + "content": " [5, 144, 48, True, \"hardswish\", 1],\n [5, 288, 96, True, \"hardswish\", 2],\n [5, 576, 96, True, \"hardswish\", 1],\n [5, 576, 96, True, \"hardswish\", 1],\n ]\n}\n# first conv output channel number in MobileNetV3\nSTEM_CONV_NUMBER = 16\n# last second conv output channel for \"small\"\nLAST_SECOND_CONV_SMALL = 576\n# last second conv output channel for \"large\"\nLAST_SECOND_CONV_LARGE = 960\n# last conv output channel number for \"large\" and \"small\"\nLAST_CONV = 1280\ndef _make_divisible(v, divisor=8, min_value=None):\n if min_value is None:\n min_value = divisor\n new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n if new_v < 0.9 * v:\n new_v += divisor\n return new_v\ndef _create_act(act):\n if act == \"hardswish\":\n return nn.Hardswish()\n elif act == \"relu\":\n return nn.ReLU()\n elif act is None:\n return None\n else:\n raise RuntimeError(\n \"The activation function is not supported: {}\".format(act))\nclass MobileNetV3(nn.Layer):\n \"\"\"" + }, + { + "comment": "The function defines a MobileNetV3 model with configurable parameters like depthwise blocks, scale, class number, inplanes, class_squeeze, class_expand, dropout probability, and number of segments. It takes these parameters as inputs and returns the specific MobileNetV3 model based on the arguments provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":118-141", + "content": " MobileNetV3\n Args:\n config: list. MobileNetV3 depthwise blocks config.\n scale: float=1.0. The coefficient that controls the size of network parameters.\n class_num: int=1000. The number of classes.\n inplanes: int=16. The output channel number of first convolution layer.\n class_squeeze: int=960. 
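A worked example of the channel-rounding helper shown above; the function body is the documented `_make_divisible`, the sample inputs are arbitrary:

```python
def _make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:          # avoid shrinking a channel count by more than 10%
        new_v += divisor
    return new_v

print(_make_divisible(16 * 1.0))   # 16  -> already a multiple of 8
print(_make_divisible(40 * 0.75))  # 30  -> rounded up to 32
print(_make_divisible(67.2))       # 67.2 -> rounded down to 64 (still within 90%)
```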
The output channel number of penultimate convolution layer.\n class_expand: int=1280. The output channel number of last convolution layer.\n dropout_prob: float=0.2. Probability of setting units to zero.\n Returns:\n model: nn.Layer. Specific MobileNetV3 model depends on args.\n \"\"\"\n def __init__(self,\n config,\n stages_pattern,\n scale=1.0,\n class_num=400,\n inplanes=STEM_CONV_NUMBER,\n class_squeeze=LAST_SECOND_CONV_LARGE,\n class_expand=LAST_CONV,\n dropout_prob=0.2,\n num_seg=8,\n pretrained=None,\n return_patterns=None," + }, + { + "comment": "This code defines a PPTSM-MV3 backbone model with specified configurations, including input planes, scale factor, class parameters, and number of segments. It uses convolutional layers and residual units for feature extraction and processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":142-167", + "content": " return_stages=None):\n super().__init__()\n self.cfg = config\n self.scale = scale\n self.inplanes = inplanes\n self.class_squeeze = class_squeeze\n self.class_expand = class_expand\n self.class_num = class_num\n self.num_seg = num_seg\n self.pretrained = pretrained\n self.conv = ConvBNLayer(in_c=3,\n out_c=_make_divisible(self.inplanes *\n self.scale),\n filter_size=3,\n stride=2,\n padding=1,\n num_groups=1,\n if_act=True,\n act=\"hardswish\")\n self.blocks = nn.Sequential(*[\n ResidualUnit(in_c=_make_divisible(self.inplanes * self.scale if i ==\n 0 else self.cfg[i - 1][2] *\n self.scale)," + }, + { + "comment": "The code initializes a PPTSM-MV3 model, which consists of several convolutional blocks and a final classification layer. The convolutional blocks are defined by the `self.cfg` list, where each element contains the kernel size, expansion factor, output channels, whether to use SE module, and activation function, along with the stride. The last convolutional block is followed by an average pooling layer and a final convolution layer for classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":168-193", + "content": " mid_c=_make_divisible(self.scale * exp),\n out_c=_make_divisible(self.scale * c),\n filter_size=k,\n stride=s,\n use_se=se,\n num_seg=self.num_seg,\n act=act)\n for i, (k, exp, c, se, act, s) in enumerate(self.cfg)\n ])\n self.last_second_conv = ConvBNLayer(\n in_c=_make_divisible(self.cfg[-1][2] * self.scale),\n out_c=_make_divisible(self.scale * self.class_squeeze),\n filter_size=1,\n stride=1,\n padding=0,\n num_groups=1,\n if_act=True,\n act=\"hardswish\")\n self.avg_pool = AdaptiveAvgPool2D(1)\n self.last_conv = Conv2D(in_channels=_make_divisible(self.scale *\n self.class_squeeze),\n out_channels=self.class_expand,\n kernel_size=1," + }, + { + "comment": "This code defines a neural network model for the PPTSM_MV3 backbone. It includes convolutional layers, blocks, a Hardswish activation function, optional dropout, and fully connected layers for classification. 
The `init_weights` method initializes the network's weights, and the `forward` method passes input through the model layers to generate output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":194-221", + "content": " stride=1,\n padding=0,\n bias_attr=False)\n self.hardswish = nn.Hardswish()\n if dropout_prob is not None:\n self.dropout = Dropout(p=dropout_prob, mode=\"downscale_in_infer\")\n else:\n self.dropout = None\n self.fc = Linear(self.class_expand, class_num)\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, x):\n x = self.conv(x)\n x = self.blocks(x)" + }, + { + "comment": "This code defines a ConvBNLayer class that takes input and output channels, filter size, stride, padding, number of groups, activation function flag, and activation type as parameters. It initializes the layers for convolutional neural network and applies Batch Normalization and activation functions if specified. The class also returns the last layer of the model after feature aggregation for video classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":222-257", + "content": " x = self.last_second_conv(x)\n x = self.avg_pool(x)\n x = self.last_conv(x)\n x = self.hardswish(x)\n if self.dropout is not None:\n x = self.dropout(x)\n # feature aggregation for video\n x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]])\n x = paddle.mean(x, axis=1)\n x = paddle.reshape(x, shape=[-1, self.class_expand])\n x = self.fc(x)\n return x\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_c,\n out_c,\n filter_size,\n stride,\n padding,\n num_groups=1,\n if_act=True,\n act=None):\n super().__init__()\n self.conv = Conv2D(in_channels=in_c,\n out_channels=out_c,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n groups=num_groups,\n bias_attr=False)" + }, + { + "comment": "The code defines a ResidualUnit class with an expand_conv layer containing ConvBNLayer, used for building the residual unit in PPTSM-MV3 model. 
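The "feature aggregation for video" step mentioned above averages per-segment frame features into a single clip-level feature before the classifier; a shape-only numpy sketch with illustrative sizes:

```python
import numpy as np

N, num_seg, C = 2, 8, 1280                         # videos, segments per video, channels
frame_feats = np.random.rand(N * num_seg, C)       # backbone output, one row per frame

video_feats = frame_feats.reshape(N, num_seg, C).mean(axis=1)   # average over segments
print(video_feats.shape)                           # (2, 1280) -> fed to the final Linear layer
```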
It also includes optional BatchNorm (bn) and activation (act) layers based on provided parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":258-291", + "content": " self.bn = BatchNorm(num_channels=out_c,\n act=None,\n param_attr=ParamAttr(regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.if_act = if_act\n self.act = _create_act(act)\n def forward(self, x):\n x = self.conv(x)\n x = self.bn(x)\n if self.if_act:\n x = self.act(x)\n return x\nclass ResidualUnit(nn.Layer):\n def __init__(self,\n in_c,\n mid_c,\n out_c,\n filter_size,\n stride,\n use_se,\n num_seg=8,\n act=None):\n super().__init__()\n self.if_shortcut = stride == 1 and in_c == out_c\n self.if_se = use_se\n self.num_seg = num_seg\n self.expand_conv = ConvBNLayer(in_c=in_c,\n out_c=mid_c,\n filter_size=1,\n stride=1," + }, + { + "comment": "Defines a PPTSM_MV3 block with ConvBNLayer, bottleneck convolution layer, optional SEModule for spatial attention, and a linear convolution layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":292-311", + "content": " padding=0,\n if_act=True,\n act=act)\n self.bottleneck_conv = ConvBNLayer(in_c=mid_c,\n out_c=mid_c,\n filter_size=filter_size,\n stride=stride,\n padding=int((filter_size - 1) // 2),\n num_groups=mid_c,\n if_act=True,\n act=act)\n if self.if_se:\n self.mid_se = SEModule(mid_c)\n self.linear_conv = ConvBNLayer(in_c=mid_c,\n out_c=out_c,\n filter_size=1,\n stride=1,\n padding=0,\n if_act=False,\n act=None)" + }, + { + "comment": "This code defines a PPTSM-MV3 backbone model for video analysis. It uses temporal shifting, convolutions, and SE module (if specified) in its forward pass. The Hardsigmoid function is implemented as a separate class to apply hard sigmoid activation with customizable slope and offset parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":313-347", + "content": " def forward(self, x):\n identity = x\n if self.if_shortcut:\n x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg)\n x = self.expand_conv(x)\n x = self.bottleneck_conv(x)\n if self.if_se:\n x = self.mid_se(x)\n x = self.linear_conv(x)\n if self.if_shortcut:\n x = paddle.add(identity, x)\n return x\n# nn.Hardsigmoid can't transfer \"slope\" and \"offset\" in nn.functional.hardsigmoid\nclass Hardsigmoid(nn.Layer):\n def __init__(self, slope=0.2, offset=0.5):\n super().__init__()\n self.slope = slope\n self.offset = offset\n def forward(self, x):\n return nn.functional.hardsigmoid(x,\n slope=self.slope,\n offset=self.offset)\nclass SEModule(nn.Layer):\n def __init__(self, channel, reduction=4):\n super().__init__()\n self.avg_pool = AdaptiveAvgPool2D(1)\n self.conv1 = Conv2D(in_channels=channel,\n out_channels=channel // reduction," + }, + { + "comment": "The code defines a Convolutional Neural Network layer for the PPTSM-MobileNetV3_small_x1_0 model. It consists of an average pooling layer, two 1x1 convolution layers with ReLU and hard sigmoid activations. 
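The residual-unit forward pass above calls `F.temporal_shift(x, num_seg, 1.0 / num_seg)` before the expand convolution. The numpy sketch below only illustrates the idea of shifting two small channel groups one step backward/forward in time; it is not the Paddle implementation, and sizes are illustrative:

```python
import numpy as np

def temporal_shift(x, seg_num, shift_ratio):
    # x: [N*T, C, H, W], frames of the same clip stored consecutively
    nt, c, h, w = x.shape
    x = x.reshape(nt // seg_num, seg_num, c, h, w)
    fold = int(c * shift_ratio)
    out = np.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                    # shift one channel group backward in time
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]    # shift another group forward in time
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]               # remaining channels untouched
    return out.reshape(nt, c, h, w)

x = np.random.rand(2 * 8, 16, 4, 4)                         # 2 clips x 8 segments, 16 channels
print(temporal_shift(x, seg_num=8, shift_ratio=1.0 / 8).shape)   # (16, 16, 4, 4)
```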
The forward function performs element-wise multiplication between input and output to implement residual learning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":348-375", + "content": " kernel_size=1,\n stride=1,\n padding=0)\n self.relu = nn.ReLU()\n self.conv2 = Conv2D(in_channels=channel // reduction,\n out_channels=channel,\n kernel_size=1,\n stride=1,\n padding=0)\n self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5)\n def forward(self, x):\n identity = x\n x = self.avg_pool(x)\n x = self.conv1(x)\n x = self.relu(x)\n x = self.conv2(x)\n x = self.hardsigmoid(x)\n return paddle.multiply(x=identity, y=x)\ndef PPTSM_MobileNetV3_small_x1_0(pretrained=None, **kwargs):\n \"\"\"\n MobileNetV3_small_x1_0\n Args:\n pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.\n If str, means the path of the pretrained model.\n use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True." + }, + { + "comment": "This code defines a function that returns specific MobileNetV3 models based on given arguments. The \"MobileNetV3\" class is used to create the models, and parameters such as config, scale, stages_pattern, class_squeeze, pretrained, and other optional keyword arguments are passed to the constructor of the class. The function is then registered with BACKBONES for future use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":376-404", + "content": " Returns:\n model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args.\n \"\"\"\n model = MobileNetV3(\n config=NET_CONFIG[\"small\"],\n scale=1.0,\n stages_pattern=MODEL_STAGES_PATTERN[\"MobileNetV3_small\"],\n class_squeeze=LAST_SECOND_CONV_SMALL,\n pretrained=pretrained,\n **kwargs)\n return model\n@BACKBONES.register()\ndef PPTSM_MobileNetV3(pretrained=None, **kwargs):\n \"\"\"\n MobileNetV3_large_x1_0\n Args:\n pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.\n If str, means the path of the pretrained model.\n use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.\n Returns:\n model: nn.Layer. 
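The element-wise multiply at the end of `SEModule.forward` rescales each channel of the identity input by a gate computed from globally pooled features (squeeze-and-excitation style channel attention). A minimal numpy sketch of that gating follows; the two 1x1 convolutions are elided for brevity, so values are illustrative only:

```python
import numpy as np

def hardsigmoid(x, slope=0.2, offset=0.5):
    return np.clip(slope * x + offset, 0.0, 1.0)

N, C, H, W = 1, 4, 3, 3
x = np.random.rand(N, C, H, W)

squeeze = x.mean(axis=(2, 3), keepdims=True)      # global average pool -> [N, C, 1, 1]
# The real module passes `squeeze` through conv1 -> ReLU -> conv2 before the gate.
excite = hardsigmoid(squeeze)                     # per-channel gate in [0, 1]
out = x * excite                                  # rescale each channel of the identity input
print(out.shape)                                  # (1, 4, 3, 3)
```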
Specific `MobileNetV3_large_x1_0` model depends on args.\n \"\"\"\n model = MobileNetV3(\n config=NET_CONFIG[\"large\"],\n scale=1.0,\n stages_pattern=MODEL_STAGES_PATTERN[\"MobileNetV3_large\"],\n class_squeeze=LAST_SECOND_CONV_LARGE," + }, + { + "comment": "This code is creating an instance of the PPTSM-MV3 backbone model with specified pretrained weights and returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py\":405-407", + "content": " pretrained=pretrained,\n **kwargs)\n return model" + } + ] +} \ No newline at end of file diff --git a/docs/doc/62d7c90d-10cf-4d89-b01c-3cc72df41656.json b/docs/doc/62d7c90d-10cf-4d89-b01c-3cc72df41656.json new file mode 100644 index 000000000..b8c947361 --- /dev/null +++ b/docs/doc/62d7c90d-10cf-4d89-b01c-3cc72df41656.json @@ -0,0 +1,30 @@ +{ + "summary": "This script trains multiple deep learning models for various computer vision tasks with PaddlePaddle framework and demonstrates running BMN test, exporting models, inference using PaddleVideo toolkit, and provides training time calculation.", + "details": [ + { + "comment": "This script sets the CUDA visible devices, launches distributed training for multiple models (pp-tsm, pp-tsm_v2, ava), and specifies log directories and configurations. It runs all at once using 8 GPUs and Python3.7.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/run.sh\":0-17", + "content": "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n#export FLAGS_conv_workspace_size_limit=800 #MB\n#export FLAGS_cudnn_exhaustive_search=1\n#export FLAGS_cudnn_batchnorm_spatial_persistent=1\nstart_time=$(date +%s)\n# run pp-tsm training\n#python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml\n# run pp-tsm_v2 distillation training\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm_v2 main.py --validate -c configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml\n# run ava training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=logdir.ava_part main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_part.yaml\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=logdir.ava_all.1203 main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_all.yaml" + }, + { + "comment": "This code contains various command lines to run different video recognition training processes using PaddlePaddle framework on multiple GPUs. 
Each line specifies the model architecture, the configuration file, and the options like validation, amp (automatic mixed precision), and GPU allocation for each specific task.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/run.sh\":19-35", + "content": "# run adds training\n# python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20\n# run tsm training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n# run tsm amp training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n# run tsm amp training, nhwc\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml\n# run tsn training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsn main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml\n# run video-swin-transformer training\n# python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin main.py --amp --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml" + }, + { + "comment": "This code executes multiple deep learning model training scripts for various computer vision tasks such as recognition, localization, and more using PaddlePaddle framework. The training runs in distributed mode with GPU utilization to speed up the process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/run.sh\":37-53", + "content": "# run slowfast training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml\n# run slowfast multi-grid training\n# python3.7 -B -m paddle.distributed.launch --selected_gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml\n# run bmn training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml\n# run attention_lstm training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube-8m.yaml\n# run pp-tsn training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml\n# run timesformer training\n# python3.7 -B -m paddle." + }, + { + "comment": "This code executes multiple deep learning model training and testing scripts for various tasks. It launches distributed training with specific GPU configurations, sets log directories, and uses different configuration files depending on the task. 
The tasks include pp-timesformer, st-gcn, agcn, actbert, tsn dali training, and example test.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/run.sh\":53-73", + "content": "distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml\n# run pp-timesformer training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml\n# run st-gcn training\n# python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml\n# run agcn training\n# python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml\n# run actbert training\n# python3.7 main.py --validate -c configs/multimodal/actbert/actbert.yaml\n# run tsn dali training\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml\n# test.sh\n# just use `example` as example, please replace to real name.\n# python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_test main.py --test -c configs/example.yaml -w \"output/example/example_best.pdparams\"" + }, + { + "comment": "This script demonstrates the process of running BMN test, exporting models, and performing inference using the PaddleVideo toolkit. It highlights the commands required for each step, including the necessary configuration files and output directories. The script also calculates and outputs the training time in minutes and seconds.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/run.sh\":75-88", + "content": "# NOTE: run bmn test, only support single card, bs=1\n# python3.7 main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00010.pdparams -o DATASET.batch_size=1\n# export_models script\n# just use `example` as example, please replace to real name.\n# python3.7 tools/export_model.py -c configs/example.yaml -p output/example/example_best.pdparams -o ./inference\n# predict script\n# just use `example` as example, please replace to real name.\n# python3.7 tools/predict.py -v example.avi --model_file \"./inference/example.pdmodel\" --params_file \"./inference/example.pdiparams\" --enable_benchmark=False --model=\"example\" --num_seg=8\nend_time=$(date +%s)\ncost_time=$[ $end_time-$start_time ]\necho \"Time to train is $(($cost_time/60))min $(($cost_time%60))s\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/630fe9e7-2ea2-4aa7-98fb-3fac2e58863d.json b/docs/doc/630fe9e7-2ea2-4aa7-98fb-3fac2e58863d.json new file mode 100644 index 000000000..7359ecf08 --- /dev/null +++ b/docs/doc/630fe9e7-2ea2-4aa7-98fb-3fac2e58863d.json @@ -0,0 +1,95 @@ +{ + "summary": "The PaddleVideo library's backbones code introduces the VisionTransformer_tweaks model with weight initialization, stochastic depth, spatial attention in ViT models, and transformer configurations. It is a time-based feature modification model that computes space-only predictions through attention blocks.", + "details": [ + { + "comment": "This code is from the PaddleVideo library's backbones module and defines the VisionTransformer_tweaks model. It imports necessary libraries, sets constant values, and includes function definitions for weight initialization and regularizers. 
The BACKBONES registry is also defined to categorize the model type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":0-31", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Callable\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant\nfrom paddle.regularizer import L2Decay\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\n__all__ = ['VisionTransformer_tweaks']\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)" + }, + { + "comment": "The code defines three functions. The \"to_2tuple\" function takes an input and returns a tuple with the same value repeated twice. The \"rand_bbox\" function generates random bounding box coordinates within the size of an image, given a specified probability. The \"drop_path\" function applies stochastic depth (dropout) to each sample in the main path of residual blocks with a specified dropout rate.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":34-68", + "content": "def to_2tuple(x):\n return tuple([x] * 2)\ndef rand_bbox(size, lam):\n \"\"\" rand_bbox \"\"\"\n w = size[2]\n h = size[3]\n cut_rat = np.sqrt(1. - lam)\n cut_w = np.int(w * cut_rat)\n cut_h = np.int(h * cut_rat)\n # uniform\n cx = np.random.randint(w)\n cy = np.random.randint(h)\n bbx1 = np.clip(cx - cut_w // 2, 0, w)\n bby1 = np.clip(cy - cut_h // 2, 0, h)\n bbx2 = np.clip(cx + cut_w // 2, 0, w)\n bby2 = np.clip(cy + cut_h // 2, 0, h)\n return bbx1, bby1, bbx2, bby2\ndef drop_path(x, drop_prob=0., training=False):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)" + }, + { + "comment": "This code defines a class called `Mlp` which is a fully connected layer with a middle layer and an output layer. It also includes an activation function (GELU by default) and a dropout layer (with drop probability specified). 
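The drop_path rule documented above keeps or drops the whole residual branch per sample and rescales survivors by 1/keep_prob so the expected value is unchanged; a small numpy sketch with illustrative values:

```python
import numpy as np

def drop_path(x, drop_prob=0.1, training=True, rng=np.random.default_rng(0)):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)        # one mask value per sample
    mask = np.floor(keep_prob + rng.random(shape))     # binarize to 0 or 1
    return x / keep_prob * mask

x = np.ones((4, 3))
print(drop_path(x, drop_prob=0.5))   # some rows zeroed, surviving rows scaled to 2.0
```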
The class `DropPath` applies drop paths to stochastically mask layers during training, while the `Identity` class simply returns its input unchanged.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":69-106", + "content": " random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Identity(nn.Layer):\n def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\nclass Mlp(nn.Layer):\n def __init__(self,\n in_features,\n hidden_features=None,\n out_features=None,\n act_layer=nn.GELU,\n drop=0.,\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)" + }, + { + "comment": "The code defines a neural network layer called \"Attention\" with several components including a Linear layer for the query-key-value (QKV) transform, and separate Dropout layers for the attention and projection operations. The forward function performs the multi-head self-attention operation on the input tensor x, reshaping it to apply the QKV transform, and then applying dropout for both the attention and projection steps before returning the result. This layer is commonly used in transformer models for processing sequential data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":107-143", + "content": " self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\nclass Attention(nn.Layer):\n def __init__(self,\n dim,\n num_heads=8,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.,\n proj_drop=0.,\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n self.attn_drop = nn.Dropout(attn_drop)\n def forward(self, x):\n N, C = x.shape[1:]\n qkv = self.qkv(x).reshape(\n (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(" + }, + { + "comment": "This code defines a `Block` class that implements an attention mechanism using query-key value (QKV) decomposition. The block also includes a multi-layer perceptron (MLP) layer and supports different attention types. 
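A shape-only numpy walk-through of the attention forward pass documented above; the learned qkv and output projections are replaced by random tensors, so only the reshapes and the scaled dot-product pattern are meaningful:

```python
import numpy as np

B, N, C, num_heads = 2, 197, 768, 12          # tokens = 196 patches + 1 CLS (illustrative)
head_dim = C // num_heads

qkv = np.random.rand(B, N, 3, num_heads, head_dim).transpose(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]              # each [B, heads, N, head_dim]

attn = q @ k.transpose(0, 1, 3, 2) * head_dim ** -0.5    # [B, heads, N, N]
attn = np.exp(attn - attn.max(-1, keepdims=True))
attn = attn / attn.sum(-1, keepdims=True)                 # softmax over keys
out = (attn @ v).transpose(0, 2, 1, 3).reshape(B, N, C)   # back to [B, N, C]
print(out.shape)                                          # (2, 197, 768)
```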
The input dimensions, number of heads in the attention mechanism, and other parameters are passed to the constructor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":144-177", + "content": " (2, 0, 3, 1, 4))\n q, k, v = qkv[0], qkv[1], qkv[2]\n attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n attn = nn.functional.softmax(attn, axis=-1)\n attn = self.attn_drop(attn)\n x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n dim,\n num_heads,\n mlp_ratio=4.0,\n qkv_bias=False,\n qk_scale=None,\n drop=0.0,\n attn_drop=0.0,\n drop_path=0.1,\n act_layer=nn.GELU,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n attention_type='divided_space_time',\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n self.attention_type = attention_type\n if isinstance(norm_layer, str):\n self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)" + }, + { + "comment": "This code checks the type of norm_layer and creates an instance of either a str or a paddle.nn.layer.Layer class for self.norm1. If no temporal attention is required, it raises a TypeError if norm_layer is neither a str nor a Callable. If divided space time attention is selected, it checks the type of norm_layer again and creates an instance of either a str or a paddle.nn.layer.Layer class for self.temporal_norm1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":178-201", + "content": " elif isinstance(norm_layer, Callable):\n self.norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop,\n wd_bias=wd_bias,\n lr_mult=lr_mult)\n # Temporal Attention Parameters\n if self.attention_type == 'divided_space_time':\n if isinstance(norm_layer, str):\n self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")" + }, + { + "comment": "This code initializes the temporal attention module, a linear layer for temporal features, and a drop path for stochastic depth. It also handles norm_layer initialization according to its type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":202-220", + "content": " self.temporal_attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop,\n wd_bias=wd_bias,\n lr_mult=lr_mult)\n self.temporal_fc = nn.Linear(dim, dim)\n # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here\n self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()\n if isinstance(norm_layer, str):\n self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm2 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")" + }, + { + "comment": "Code defines a backbone for Vision Transformer (ViT) with tweaks and handles the forward pass. The MLP layer is added, and attention type can be space-only, joint space-time or divided space-time. 
In divided space-time, it also includes temporal attention.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":222-246", + "content": " mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop,\n wd_bias=wd_bias,\n lr_mult=lr_mult)\n def forward(self, x, B, T, W):\n num_spatial_tokens = (x.shape[1] - 1) // T\n H = num_spatial_tokens // W\n if self.attention_type in ['space_only', 'joint_space_time']:\n x = paddle.add(x, self.drop_path(self.attn(self.norm1(x))))\n x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x))))\n return x\n elif self.attention_type == 'divided_space_time':\n ########## Temporal ##########\n xt = x[:, 1:, :]\n _, _, _, _t, _m = B, H, W, T, xt.shape[-1]\n xt = xt.reshape([-1, _t, _m])\n res_temporal = self.drop_path(\n self.temporal_attn(self.temporal_norm1(xt)))\n _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]" + }, + { + "comment": "This code performs spatial attention in a ViT model. It reshapes the input, concatenates the class token with the reshaped input, applies normalization and self-attention, and finally averages the class tokens for each frame to obtain a contextual representation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":247-270", + "content": " res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m])\n res_temporal = self.temporal_fc(res_temporal)\n xt = paddle.add(x[:, 1:, :], res_temporal)\n ########## Spatial ##########\n init_cls_token = x[:, 0, :].unsqueeze(1)\n cls_token = init_cls_token.tile((1, T, 1))\n _b, _t, _m = cls_token.shape\n cls_token = cls_token.reshape([-1, _m]).unsqueeze(1)\n xs = xt\n _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]\n xs = xs.reshape([-1, _h, _w, _t, _m]).transpose(\n (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m])\n xs = paddle.concat((cls_token, xs), axis=1)\n res_spatial = self.drop_path(self.attn(self.norm1(xs)))\n # Taking care of CLS token\n cls_token = res_spatial[:, 0, :]\n _, _t, _m = B, T, cls_token.shape[-1]\n cls_token = cls_token.reshape([-1, _t, _m])\n # averaging for every frame\n cls_token = paddle.mean(cls_token, axis=1, keepdim=True)" + }, + { + "comment": "This code is from the PaddleVideo library and defines a PatchEmbed class for image to patch embedding. It takes in parameters such as img_size, patch_size, in_channels, embed_dim, wd_bias, and lr_mult. The class performs image to patch embedding by dividing the input image into patches of specified size and flattening them into a 2D feature map. 
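A shape-only numpy sketch of the divided space-time token regrouping described above: temporal attention sees each spatial position as a length-T sequence, spatial attention sees each frame as an H*W sequence. The attention itself is elided and the sizes are illustrative:

```python
import numpy as np

B, T, H, W, C = 2, 8, 14, 14, 768
tokens = np.random.rand(B, H * W * T, C)            # patch tokens without the CLS token

# Temporal attention: group by spatial position -> sequences of length T
xt = tokens.reshape(B * H * W, T, C)
# ... attention over axis 1 (T) would run here ...

# Spatial attention: group by frame -> sequences of length H*W
xs = xt.reshape(B, H, W, T, C).transpose(0, 3, 1, 2, 4).reshape(B * T, H * W, C)
# ... attention over axis 1 (H*W) would run here ...

print(xt.shape, xs.shape)   # (392, 8, 768) (16, 196, 768)
```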
The code also includes a NotImplementedError for certain conditions, suggesting that some parts may not be fully implemented yet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":272-301", + "content": " res_spatial = res_spatial[:, 1:, :]\n _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]\n res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose(\n (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m])\n res = res_spatial\n x = xt\n x = paddle.add(paddle.concat((init_cls_token, x), axis=1),\n paddle.concat((cls_token, res), axis=1))\n # Mlp\n x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x))))\n return x\n else:\n raise NotImplementedError\nclass PatchEmbed(nn.Layer):\n \"\"\" Image to Patch Embedding\n \"\"\"\n def __init__(self,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768,\n wd_bias=True,\n lr_mult=1.0):\n super().__init__()\n img_size = to_2tuple(img_size)\n patch_size = to_2tuple(patch_size)\n num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //" + }, + { + "comment": "This code defines a VisionTransformer with patch input. The model takes an image of size img_size and divides it into patches of size patch_size, extracting features from each patch using the Conv2D layer. The forward method reshapes the input and passes it through the projection convolution. It then flattens the output and returns the result along with the number of patches (T) and the total number of image pixels (W).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":302-330", + "content": " patch_size[0])\n self.img_size = img_size\n self.patch_size = patch_size\n self.num_patches = num_patches\n self.proj = nn.Conv2D(in_channels,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n def forward(self, x):\n B, C, T, H, W = x.shape\n assert H == self.img_size[0] and W == self.img_size[1], \\\n f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n x = x.transpose((0, 2, 1, 3, 4)) # [B,T,C,H,W]\n x = x.reshape([-1, C, H, W]) # [BT,C,H,W]\n x = self.proj(x) # [BT,F,nH,nW]\n W = x.shape[-1]\n x = x.flatten(2).transpose((0, 2, 1)) # [BT,F,nHnW]\n return x, T, W\n@BACKBONES.register()\nclass VisionTransformer_tweaks(nn.Layer):\n \"\"\" Vision Transformer with support for patch input\n \"\"\"\n def __init__(self,\n pretrained=None," + }, + { + "comment": "This code initializes a ViT (Vision Transformer) model with specified dimensions and parameters. It uses the PatchEmbed class to embed input images, sets the number of segments for attention, and defines the learning rate multipliers for each stage of the model. 
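A quick shape check for the patch embedding described above, using the documented defaults (224x224 frames, 16x16 patches): time is folded into the batch before the projection conv, and the conv output is flattened into patch tokens. The conv itself is faked with random data here:

```python
import numpy as np

img_size, patch_size, embed_dim = 224, 16, 768
B, T = 2, 8
n = img_size // patch_size                     # 14 patches per side
num_patches = n * n                            # 196

# After folding time into the batch and applying the stride-16 projection conv:
feat = np.random.rand(B * T, embed_dim, n, n)  # [BT, F, nH, nW] (conv output, faked)
tokens = feat.reshape(B * T, embed_dim, num_patches).transpose(0, 2, 1)  # [BT, nHnW, F]
print(tokens.shape)                            # (16, 196, 768)
```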
It also specifies whether to use pre-trained weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":331-359", + "content": " img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768,\n depth=12,\n num_heads=12,\n mlp_ratio=4,\n qkv_bias=False,\n qk_scale=None,\n drop_rate=0.,\n attn_drop_rate=0.,\n drop_path_rate=0.1,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_seg=8,\n attention_type='divided_space_time',\n wd_bias=True,\n lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],\n **args):\n super().__init__()\n self.pretrained = pretrained\n self.num_seg = num_seg\n self.attention_type = attention_type\n self.lr_mult_list = lr_mult_list\n self.num_features = self.embed_dim = embed_dim\n self.patch_embed = PatchEmbed(img_size=img_size,\n patch_size=patch_size,\n in_channels=in_channels," + }, + { + "comment": "This code initializes the positional and time embeddings for a transformer model. It creates a cls_token, pos_embed and optionally time_embed with specified dimensions and regularizers. It also adds dropout layers for positional and temporal features, if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":360-383", + "content": " embed_dim=embed_dim,\n wd_bias=wd_bias,\n lr_mult=self.lr_mult_list[0])\n num_patches = self.patch_embed.num_patches\n # Positional Embeddings\n self.cls_token = self.create_parameter(\n shape=(1, 1, embed_dim),\n default_initializer=zeros_,\n attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.pos_embed = self.create_parameter(\n shape=(1, num_patches + 1, embed_dim),\n default_initializer=zeros_,\n attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.pos_drop = nn.Dropout(p=drop_rate)\n if self.attention_type != 'space_only':\n self.time_embed = self.create_parameter(\n shape=(1, num_seg, embed_dim),\n default_initializer=zeros_,\n attr=ParamAttr(regularizer=L2Decay(0.0)))\n self.time_drop = nn.Dropout(p=drop_rate)\n self.add_parameter(\"pos_embed\", self.pos_embed)" + }, + { + "comment": "The code initializes a transformer model with blocks, adds parameters for position and classification tokens, creates a layer list of blocks with varying drop paths and attention types, and applies weight initialization to the positional embeddings, classification token, and layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":384-413", + "content": " self.add_parameter(\"cls_token\", self.cls_token)\n dpr = np.linspace(0, drop_path_rate, depth)\n self.blocks = nn.LayerList([\n Block(dim=embed_dim,\n num_heads=num_heads,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[i],\n norm_layer=norm_layer,\n epsilon=epsilon,\n attention_type=self.attention_type,\n wd_bias=wd_bias,\n lr_mult=self.lr_mult_list[(i // 4) + 1]) for i in range(depth)\n ])\n self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n trunc_normal_(self.pos_embed, std=0.02)\n trunc_normal_(self.cls_token, std=0.02)\n self.apply(self._init_fn)\n if self.attention_type == 'divided_space_time':\n i = 0" + }, + { + "comment": "Initializing the backbone network by iterating through each sublayer, setting temporal_fc weight and bias to zeros if it's a Block type. If pretrained weights are provided, load them after checking the input. 
Else, continue with no change or raise an error for unsupported inputs. Initialize the network parameters using truncated normal distribution for Linear layers and setting bias of LayerNorm layers to zero.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":414-440", + "content": " for m in self.blocks.sublayers(include_self=True):\n m_str = str(m)\n if 'Block' in m_str:\n if i > 0:\n zeros_(m.temporal_fc.weight)\n zeros_(m.temporal_fc.bias)\n i += 1\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights\n load_ckpt(self,\n self.pretrained,\n num_patches=self.patch_embed.num_patches,\n num_seg=self.num_seg,\n attention_type=self.attention_type)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n pass\n else:\n raise NotImplementedError\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):" + }, + { + "comment": "This code snippet is part of a transformer model's forward pass implementation. It reshapes the positional embeddings to match the patch embedding dimension and performs interpolation if necessary, ensuring the correct size for the subsequent layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":441-463", + "content": " ones_(m.weight)\n zeros_(m.bias)\n def forward_features(self, x):\n # B = x.shape[0]\n B = paddle.shape(x)[0]\n x, T, W = self.patch_embed(x) # [BT,nH*nW,F]\n cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]\n x = paddle.concat((cls_tokens, x), axis=1)\n pos_interp = (x.shape[1] != self.pos_embed.shape[1])\n if pos_interp:\n pos_embed = self.pos_embed\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(\n (0, 2, 1))\n P = int(other_pos_embed.shape[2]**0.5)\n H = x.shape[1] // W\n other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(H, W),\n mode='nearest')\n new_pos_embed = new_pos_embed.flatten(2)\n new_pos_embed = new_pos_embed.transpose((0, 2, 1))" + }, + { + "comment": "This code is part of a vision transformer model. It concatenates the class position embeddings with new position embeddings, adds them to the input tensor, and applies positional dropout. 
If attention type is not \"space_only,\" it extracts time embeddings from the input tensor, reshapes them, interpolates time embeddings if their size doesn't match, and performs some operations on them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":464-486", + "content": " new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),\n axis=1)\n x = paddle.add(x, new_pos_embed)\n else:\n x = paddle.add(x, self.pos_embed)\n x = self.pos_drop(x)\n # Time Embeddings\n if self.attention_type != 'space_only':\n cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(\n T)[0].index_select(paddle.to_tensor([0]), axis=1)\n x = x[:, 1:]\n _, _n, _m = x.shape\n _t = T\n x = x.reshape([-1, _t, _n, _m]).transpose(\n (0, 2, 1, 3)).reshape([-1, _t, _m])\n # Resizing time embeddings in case they don't match\n time_interp = (T != self.time_embed.shape[1])\n if time_interp: # T' != T\n time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(T, x.shape[-1])," + }, + { + "comment": "This code performs time-based feature modification and passes the data through attention blocks for a Vision Transformer model. It also provides an option to compute space-only predictions by averaging predictions for every frame. The forward function applies the forward_features transformation before passing data through attention blocks and normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py\":487-514", + "content": " mode='nearest').squeeze(0)\n new_time_embed = new_time_embed.transpose((0, 2, 1))\n x = paddle.add(x, new_time_embed)\n else:\n x = paddle.add(x, self.time_embed)\n x = self.time_drop(x)\n _, _t, _m = x.shape\n x = x.reshape([-1, W * W * T, _m])\n x = paddle.concat((cls_tokens, x), axis=1)\n # Attention blocks\n for blk in self.blocks:\n x = blk(x, B, T, W)\n # Predictions for space-only baseline\n if self.attention_type == 'space_only':\n _, _n, _m = x.shape\n _t = T\n x = x.reshape([-1, _t, _n, _m])\n x = paddle.mean(x, 1) # averaging predictions for every frame\n x = self.norm(x)\n return x[:, 0] # [B, embed_dim]\n def forward(self, x):\n x = self.forward_features(x)\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6328dbec-c3c6-4f2a-9834-8760ba5bdc9f.json b/docs/doc/6328dbec-c3c6-4f2a-9834-8760ba5bdc9f.json new file mode 100644 index 000000000..18cce6598 --- /dev/null +++ b/docs/doc/6328dbec-c3c6-4f2a-9834-8760ba5bdc9f.json @@ -0,0 +1,25 @@ +{ + "summary": "The MetricsCalculator class in PaddleVideo's VideoTag application handles metric calculation, providing methods for finalizing, computing, and accumulating metrics. It calculates average loss and accuracy over multiple batches using a top-k accuracy function, accumulating per batch size before returning the final result.", + "details": [ + { + "comment": "This code is part of the PaddleVideo project's VideoTag application, and it defines a class called MetricsCalculator. It handles calculating various metrics for different modes such as train, val, or test. The code imports necessary libraries, initializes logger, and sets up the MetricsCalculator class with an initialization method (__init__) and a reset method to reset the metrics values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import unicode_literals\nfrom __future__ import print_function\nfrom __future__ import division\nimport numpy as np\nimport datetime\nimport logging\nlogger = logging.getLogger(__name__)\nclass MetricsCalculator():\n def __init__(self, name, mode):\n self.name = name\n self.mode = mode # 'train', 'val', 'test'\n self.reset()\n def reset(self):\n logger.info('Resetting {} metrics...'.format(self.mode))" + }, + { + "comment": "The class initializes variables for accumulating aggregated accuracy, loss, and batch size. The `finalize_metrics` method calculates average metrics by dividing the accumulated values by the total batch size. The `get_computed_metrics` returns a JSON object containing the average loss and accuracy for top 1 and top 5 predictions. The `calculate_metrics` computes the accuracy for top 1 and top 5 predictions, and the `accumulate` method accumulates the loss and updates the batch size if the returned loss is not None.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py\":34-61", + "content": " self.aggr_acc1 = 0.0\n self.aggr_acc5 = 0.0\n self.aggr_loss = 0.0\n self.aggr_batch_size = 0\n def finalize_metrics(self):\n self.avg_acc1 = self.aggr_acc1 / self.aggr_batch_size\n self.avg_acc5 = self.aggr_acc5 / self.aggr_batch_size\n self.avg_loss = self.aggr_loss / self.aggr_batch_size\n def get_computed_metrics(self):\n json_stats = {}\n json_stats['avg_loss'] = self.avg_loss\n json_stats['avg_acc1'] = self.avg_acc1\n json_stats['avg_acc5'] = self.avg_acc5\n return json_stats\n def calculate_metrics(self, loss, softmax, labels):\n accuracy1 = compute_topk_accuracy(softmax, labels, top_k=1) * 100.\n accuracy5 = compute_topk_accuracy(softmax, labels, top_k=5) * 100.\n return accuracy1, accuracy5\n def accumulate(self, loss, softmax, labels):\n cur_batch_size = softmax.shape[0]\n # if returned loss is None for e.g. test, just set loss to be 0.\n if loss is None:\n cur_loss = 0.\n else:" + }, + { + "comment": "This code calculates the average loss and accuracy over multiple batches. It uses a function called \"compute_topk_accuracy\" to calculate the accuracy for top 1 and top 5 predictions. 
The computed values are then accumulated per batch size, with the final result being returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py\":62-89", + "content": " cur_loss = np.mean(np.array(loss)) #\n self.aggr_batch_size += cur_batch_size\n self.aggr_loss += cur_loss * cur_batch_size\n accuracy1 = compute_topk_accuracy(softmax, labels, top_k=1) * 100.\n accuracy5 = compute_topk_accuracy(softmax, labels, top_k=5) * 100.\n self.aggr_acc1 += accuracy1 * cur_batch_size\n self.aggr_acc5 += accuracy5 * cur_batch_size\n return\n# ----------------------------------------------\n# other utils\n# ----------------------------------------------\ndef compute_topk_correct_hits(top_k, preds, labels):\n '''Compute the number of corret hits'''\n batch_size = preds.shape[0]\n top_k_preds = np.zeros((batch_size, top_k), dtype=np.float32)\n for i in range(batch_size):\n top_k_preds[i, :] = np.argsort(-preds[i, :])[:top_k]\n correctness = np.zeros(batch_size, dtype=np.int32)\n for i in range(batch_size):\n if labels[i] in top_k_preds[i, :].astype(np.int32).tolist():\n correctness[i] = 1\n correct_hits = sum(correctness)" + }, + { + "comment": "The function `compute_topk_accuracy` computes the top-k accuracy by first asserting that the batch size of labels and softmax are equal, then it computes the correct hits for each batch element using the `compute_topk_correct_hits` function. Finally, it normalizes the results and returns the computed metric as a float value representing accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py\":91-106", + "content": " return correct_hits\ndef compute_topk_accuracy(softmax, labels, top_k):\n computed_metrics = {}\n assert labels.shape[0] == softmax.shape[0], \"Batch size mismatch.\"\n aggr_batch_size = labels.shape[0]\n aggr_top_k_correct_hits = compute_topk_correct_hits(top_k, softmax, labels)\n # normalize results\n computed_metrics = \\\n float(aggr_top_k_correct_hits) / aggr_batch_size\n return computed_metrics" + } + ] +} \ No newline at end of file diff --git a/docs/doc/632b319c-6ccf-4e84-9185-616f39959d82.json b/docs/doc/632b319c-6ccf-4e84-9185-616f39959d82.json new file mode 100644 index 000000000..58f4c8d87 --- /dev/null +++ b/docs/doc/632b319c-6ccf-4e84-9185-616f39959d82.json @@ -0,0 +1,20 @@ +{ + "summary": "This Python class is part of PaddleVideo's modeling framework, serving as a base for partitioners and initializing partitioned models. It includes backbone and head components initialization, optional weight initialization, and defines a forward function. A base class for model partitioners is also defined with methods for train, validate, test, and infer steps, leaving the actual implementation to subclasses.", + "details": [ + { + "comment": "This code is a Python class for base partitioner in PaddleVideo's modeling framework. It is an abstract class that serves as the foundation for all partitioners and requires its subclasses to define specific methods like train_step, valid_step, and test_step.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/base.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom abc import abstractmethod\nimport paddle.nn as nn\nfrom ... import builder\nclass BasePartitioner(nn.Layer):\n \"\"\"Base class for Partition.\n All partitioner should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, define your train step.\n - Methods:``valid_step``, define your valid step, always the same as train_step.\n - Methods:``test_step``, define your test step.\n \"\"\"" + }, + { + "comment": "This code initializes a partitioned model by building backbone and head components. It also includes an option to initialize weights for these components, and provides a forward function defining the model's execution path depending on the provided mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/base.py\":27-54", + "content": " def __init__(self, backbone=None, head=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n if getattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n pass\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py" + }, + { + "comment": "The code defines a base class for model partitioners, which includes methods for train, validate, test, and infer steps. Each step takes a data batch as input and returns either a loss metric or the output. If an unsupported mode is provided, it raises a NotImplementedError. The actual implementation of these steps is left to subclasses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/base.py\":55-83", + "content": " \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step. input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating setp. input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Tets setp. to get acc in test data. 
input_data_batch -> output\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/63391ca2-e149-46d8-82eb-deeed9ffec78.json b/docs/doc/63391ca2-e149-46d8-82eb-deeed9ffec78.json new file mode 100644 index 000000000..a381d1b20 --- /dev/null +++ b/docs/doc/63391ca2-e149-46d8-82eb-deeed9ffec78.json @@ -0,0 +1,40 @@ +{ + "summary": "This code introduces the PP-TSN model, an enhanced version of TSN. It describes implementation, data preparation and training processes, using mixed-precision training for speed. The PP-TSN model can be customized and tested on Kinetics-400, providing models for video file inference.", + "details": [ + { + "comment": "This code is a documentation for the PP-TSN model, which is an improved version of the TSN model. The documentation includes sections on introduction, data, train, test, inference, and reference. It also provides accuracy information and guidance on how to download and prepare K400 and UCF101 data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md\":0-29", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/pp-tsn.md) | English\n# PP-TSN\n## Content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe have improved the [TSN model](./tsn.md) and obtained a more accurate 2D practical video classification model **PP-TSN**. Without increasing the amount of parameters and calculations, the accuracy on the UCF-101, Kinetics-400 and other data sets significantly exceeds the original version. The accuracy on the Kinetics-400 data set is shown in the following table.\n| Version | Top1 |\n| :------ | :----: |\n| Ours (distill) | 75.06 |\n| Ours | **73.68** |\n| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn#kinetics-400) | 71.80 |\n## Data\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)\nUCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)" + }, + { + "comment": "This code describes how to train the \"PPTSN\" model on the Kinetics-400 dataset using 8 GPUs. It first requires downloading a pre-trained ResNet50_vd_ssld_v2 model, then configuring its path in the yaml file, and finally running training with the provided command.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md\":32-60", + "content": "## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image distillation pre-training model [ResNet50_vd_ssld_v2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as the Backbone initialization parameter, or download it through wget\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\n ```\n2. 
Open `PaddleVideo/configs/recognition/pptsn/pptsn_k400_frames.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNetTweaksTSN\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:\n ```bash\n # frames data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --validate -c configs/recognition/ pptsn/pptsn_k400_frames.yaml" + }, + { + "comment": "This code demonstrates how to run PaddleVideo's pp-tsn model with amp mixed-precision training for faster processing. It supports both videos and frames data formats, and allows customization of parameter configurations for different datasets.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md\":62-80", + "content": " # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --validate -c configs/recognition/ pptsn/pptsn_k400_videos.yaml\n ```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 # MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n # frames data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --amp --validate -c configs /recognition/pptsn/pptsn_k400_frames.yaml\n # videos data format\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --amp --validate -c configs /recognition/pptsn/pptsn_k400_videos.yaml\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is " + }, + { + "comment": "The PP-TSN model's testing process is different from training verification due to the sampling method used. The final test score should be obtained after testing the best model in test mode, as opposed to using the top-k accuracy recorded during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md\":80-94", + "content": "recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.\n## Test\n- The PP-TSN model is verified during training. You can find the keyword `best` in the training log to obtain the model test accuracy. 
The log example is as follows:\n\t```\n Already save the best model (top1 acc)0.7004\n\t```\n- Since the sampling method of the PP-TSN model test mode is **TenCrop**, which is slightly slower but more accurate, it is different from the **CenterCrop** used in the verification mode during the training process, so the verification index recorded in the training log is `topk Acc `Does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n\t```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsn main.py --test -c configs/recognition/ pptsn/pptsn_k400_frames.yaml -w \"output/ppTSN/ppTSN_best.pdparams\"" + }, + { + "comment": "This code outlines the test results of PP-TSN model using different configurations on the validation dataset of Kinetics-400. The table presents backbone, sampling method, distillation method, number of segments, target image size, and Top-1 accuracy for each configuration. Checkpoints are also provided for each configuration. The PP-TSN video sampling strategy is TenCrop sampling, which samples frames from different positions in the video sequence and spatial areas.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md\":95-104", + "content": "\t```\n\tWhen the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n\t| backbone | Sampling method | distill | num_seg | target_size | Top-1 | checkpoints |\n\t| :------: | :-------------: | :-----: | :-----: | :---------: | :---- | :---------------------: |\n\t| ResNet50 | TenCrop | False | 3 | 224 | 73.68 | [ppTSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams) |\n\t| ResNet50 | TenCrop | True | 8 | 224 | 75.06 | [ppTSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) |\n- The PP-TSN video sampling strategy is TenCrop sampling: in time sequence, the input video is evenly divided into num_seg segments, and the middle position of each segment is sampled 1 frame; spatially, from the upper left corner, upper right corner, center point, lower left corner, and lower right corner Each" + }, + { + "comment": "The code exports the pre-trained model for inference and uses the prediction engine to perform predictions on input video files. Distillation is used for obtaining the pre-trained model, and the generated model structure file and weight files are stored in the `inference/ppTSN/` directory. The provided bash commands assist in exporting and predicting with the model respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md\":104-124", + "content": " of the 5 sub-regions sampled an area of 224x224, and the horizontal flip was added to obtain a total of 10 sampling results. A total of 1 clip is sampled for 1 video.\n- Distill is `True`, which means that the pre-trained model obtained by distillation is used. 
For the specific distillation scheme, please refer to [ppTSM Distillation Scheme]().\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_frames.yaml -p data/ppTSN_k400.pdparams -o inference/ppTSN\n```\nThe above command will generate the model structure file `ppTSN.pdmodel` and model weight files `ppTSN.pdiparams` and `ppTSN.pdiparams.info` files required for prediction, all of which are stored in the `inference/ppTSN/` directory\nFor the meaning of each parameter in the above bash command, please refer to [Model Reasoning Method](https://github.com/HydrogenSulfate/PaddleVideo/blob/PPTSN-v1/docs/en/start.md#2-infer)\n### Use prediction engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\" + }, + { + "comment": "This code is running an inference on a video file using the PP-TSN model trained on Kinetics-400. The top-1 category and its corresponding confidence are being outputted for the given video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsn.md\":125-145", + "content": " --config configs/recognition/pptsn/pptsn_k400_frames.yaml \\\n --model_file inference/ppTSN/ppTSN.pdmodel \\\n --params_file inference/ppTSN/ppTSN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nThe output example is as follows:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.998979389667511\n```\nIt can be seen that using the PP-TSN model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.\n## Reference\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang\n- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean" + } + ] +} \ No newline at end of file diff --git a/docs/doc/63b14d4a-9e6b-4edc-a580-fc8ac0c6255b.json b/docs/doc/63b14d4a-9e6b-4edc-a580-fc8ac0c6255b.json new file mode 100644 index 000000000..c87e5deea --- /dev/null +++ b/docs/doc/63b14d4a-9e6b-4edc-a580-fc8ac0c6255b.json @@ -0,0 +1,35 @@ +{ + "summary": "PaddleVideo library enables feature extraction and map creation, while GetVideoLabel class calculates IoU for object detection tasks. The code stores max IoU values and prepares data for evaluation or processing.", + "details": [ + { + "comment": "This code is part of PaddleVideo library, specifically for loading feature data from a given path. It defines a class \"LoadFeat\" and uses the numpy library to load .npy files based on the video name provided in the results dictionary. The file path is constructed using the specified feat_path and video name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport numpy as np\nfrom ..registry import PIPELINES\n\"\"\"pipeline ops for Activity Net.\n\"\"\"\n@PIPELINES.register()\nclass LoadFeat(object):\n def __init__(self, feat_path):\n self.feat_path = feat_path\n def __call__(self, results):\n video_name = results['video_name']\n file_name = video_name + \".npy\"\n file_path = os.path.join(self.feat_path, file_name)\n #TODO: check path\n video_feat = np.load(file_path)" + }, + { + "comment": "This code defines a pipeline function that generates matching maps for an input video. It creates temporal matching windows of varying sizes and reshapes the result into a specific format. The anchor positions are also extracted for later use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py\":32-61", + "content": " video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n results['video_feat'] = video_feat\n return results\n@PIPELINES.register()\nclass GetMatchMap(object):\n def __init__(self, tscale):\n self.tscale = tscale\n self.tgap = 1. / self.tscale\n def __call__(self, results):\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx\n for jdx in range(1, self.tscale + 1):\n xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)\n match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n results['match_map'] = match_map\n results['anchor_xmin'] = anchor_xmin" + }, + { + "comment": "This code defines a class called \"GetVideoLabel\" which calculates the Intersection over Union (IOU) and intersection scores between a box and the anchors. It also initializes variables for time and distance scaling, and box type data types. The \"iou_with_anchors\" method calculates the Jaccard score and the \"ioa_with_anchors\" method computes the intersection. These methods can be used to determine the best match between an anchor box and a target box in object detection tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py\":62-89", + "content": " results['anchor_xmax'] = anchor_xmax\n return results\n@PIPELINES.register()\nclass GetVideoLabel(object):\n def __init__(self, tscale, dscale, datatype=\"float32\"):\n self.tscale = tscale\n self.dscale = dscale\n self.tgap = 1. 
/ self.tscale\n self.datatype = datatype\n def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max):\n \"\"\"Compute jaccard score between a box and the anchors.\n \"\"\"\n len_anchors = anchors_max - anchors_min\n int_xmin = np.maximum(anchors_min, box_min)\n int_xmax = np.minimum(anchors_max, box_max)\n inter_len = np.maximum(int_xmax - int_xmin, 0.)\n union_len = len_anchors - inter_len + box_max - box_min\n jaccard = np.divide(inter_len, union_len)\n return jaccard\n def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max):\n \"\"\"Compute intersection between score a box and the anchors.\n \"\"\"\n len_anchors = anchors_max - anchors_min\n int_xmin = np.maximum(anchors_min, box_min)" + }, + { + "comment": "The function initializes gt_bbox and gt_iou_map variables to store ground truth bounding box coordinates and their IoU with anchor boxes. It then iterates through video labels, calculating the start and end timestamps in video seconds for each ground truth box. The IoU between match map and the current ground truth is computed using the iou_with_anchors function and stored in gt_iou_map, reshaped to match the dimensions of dscale and tscale.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py\":90-114", + "content": " int_xmax = np.minimum(anchors_max, box_max)\n inter_len = np.maximum(int_xmax - int_xmin, 0.)\n scores = np.divide(inter_len, len_anchors)\n return scores\n def __call__(self, results):\n video_info = results['video_info']\n match_map = results['match_map']\n anchor_xmin = results['anchor_xmin']\n anchor_xmax = results['anchor_xmax']\n video_second = video_info['duration_second']\n video_labels = video_info['annotations']\n gt_bbox = []\n gt_iou_map = []\n for gt in video_labels:\n tmp_start = max(min(1, gt[\"segment\"][0] / video_second), 0)\n tmp_end = max(min(1, gt[\"segment\"][1] / video_second), 0)\n gt_bbox.append([tmp_start, tmp_end])\n tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0],\n match_map[:, 1], tmp_start,\n tmp_end)\n tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,\n [self.dscale, self.tscale])" + }, + { + "comment": "This code calculates the intersection over union (IoU) between ground truth bounding boxes and anchor boxes. It stores the maximum IoU values for each ground truth box and anchor pair, then calculates the maximum IoU values for start and end positions of anchor boxes. 
This information will be used to determine if a prediction matches with a ground truth box and assign appropriate scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py\":115-139", + "content": " gt_iou_map.append(tmp_gt_iou_map)\n gt_iou_map = np.array(gt_iou_map)\n gt_iou_map = np.max(gt_iou_map, axis=0)\n gt_bbox = np.array(gt_bbox)\n gt_xmins = gt_bbox[:, 0]\n gt_xmaxs = gt_bbox[:, 1]\n gt_len_small = 3 * self.tgap\n gt_start_bboxs = np.stack(\n (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)\n gt_end_bboxs = np.stack(\n (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)\n match_score_start = []\n for jdx in range(len(anchor_xmin)):\n match_score_start.append(\n np.max(\n self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],\n gt_start_bboxs[:, 0],\n gt_start_bboxs[:, 1])))\n match_score_end = []\n for jdx in range(len(anchor_xmin)):\n match_score_end.append(\n np.max(\n self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx]," + }, + { + "comment": "This code is storing ground truth (gt) IOU map, start and end indices for the annotations into the 'results' dictionary. The IOU map is converted to specified datatype before storage. These values will be used later for evaluation or further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py\":140-149", + "content": " gt_end_bboxs[:, 0], gt_end_bboxs[:,\n 1])))\n gt_start = np.array(match_score_start)\n gt_end = np.array(match_score_end)\n results['gt_iou_map'] = gt_iou_map.astype(self.datatype)\n results['gt_start'] = gt_start.astype(self.datatype)\n results['gt_end'] = gt_end.astype(self.datatype)\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6462bc26-102a-44ca-91a8-18cb0728b1b2.json b/docs/doc/6462bc26-102a-44ca-91a8-18cb0728b1b2.json new file mode 100644 index 000000000..136f15405 --- /dev/null +++ b/docs/doc/6462bc26-102a-44ca-91a8-18cb0728b1b2.json @@ -0,0 +1,30 @@ +{ + "summary": "The MRI.py file in PaddleVideo library provides an action recognition dataset loader, utilizing a MRIDataset class for transform operations on raw frames and includes license information, copyright notices, and data structure registration. It reads data, stores components in a list, handles missing files through retry and exception handling, and logs errors. The code snippet returns a numpy array for images and another for labels from the 'results' dictionary, likely used in a function that processes data from MRI datasets where 'imgs' contains image data and 'labels' stores their corresponding labels or annotations.", + "details": [ + { + "comment": "The code snippet is from the MRI.py file within the PaddleVideo library, which appears to be a loader dataset for action recognition tasks. It imports necessary libraries and defines the MRIDataset class that inherits from BaseDataset. This class loads raw frames from frame files and applies specified transform operations on them. The index file is used by the dataset loader. The code also includes license information, copyright notices, and data structure registration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass MRIDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The indecx file is" + }, + { + "comment": "This function initializes the MRI dataset object, taking the file path to the index file as well as other optional arguments. The load_file method is used to load the index file and retrieve video information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI.py\":30-60", + "content": " a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n suffix (str): suffix of file. Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"" + }, + { + "comment": "This code reads data from a file, splits it into different components like frame directory, frames length, and labels, and stores it in a list. It also handles missing files by retrying multiple times using exception handling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI.py\":61-85", + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(\n frame_dir=frame_dir,\n #suffix=self.suffix,\n frames_len=frames_len,\n labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid gisven index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:" + }, + { + "comment": "The code is attempting to load frames for testing by trying multiple times in case of an exception caused by missing frames. 
It uses a logger to inform about the error and tries again with different frames until successful or reaching the maximum retries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI.py\":86-107", + "content": " logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return np.array(results['imgs']), np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue" + }, + { + "comment": "The code snippet returns a numpy array for images and another for labels from the 'results' dictionary. It is likely used in a function that processes data from MRI datasets, where 'imgs' contains image data and 'labels' stores their corresponding labels or annotations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI.py\":108-108", + "content": " return np.array(results['imgs']), np.array([results['labels']])" + } + ] +} \ No newline at end of file diff --git a/docs/doc/64633a42-32ba-4b11-a641-59aadaf57e1e.json b/docs/doc/64633a42-32ba-4b11-a641-59aadaf57e1e.json new file mode 100644 index 000000000..cf6ce7a7d --- /dev/null +++ b/docs/doc/64633a42-32ba-4b11-a641-59aadaf57e1e.json @@ -0,0 +1,25 @@ +{ + "summary": "The code defines a ppTSMHead class, a subclass of TSNHead with L2Decay regularizer. It initializes the PPTSM model head with average pooling and dropout, defining an 'init_weights' function for FC layer parameters. This is part of the PaddlePaddle Video library.", + "details": [ + { + "comment": "This code defines a ppTSMHead class, which is a subclass of TSNHead. It has arguments such as num_classes, in_channels, and loss_cfg. The class is registered under the HEADS registry for future use. The L2Decay regularizer is used, and weight initialization is performed using the weight_init function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsm_head.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn import Linear\nfrom paddle.regularizer import L2Decay\nfrom .tsn_head import TSNHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass ppTSMHead(TSNHead):\n \"\"\" ppTSM Head\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss')." + }, + { + "comment": "This code defines a class with an __init__ method that takes arguments for number of classes, input channels, dropout ratio, std value, data format, and optional keyword arguments. It initializes the base class and sets up a linear layer (self.fc) with specified learning rates and regularizers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsm_head.py\":31-57", + "content": " drop_ratio(float): drop ratio. Default: 0.8.\n std(float): Std(Scale) value in normal initilizar. Default: 0.001.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(\n self,\n num_classes,\n in_channels, # NOTE: 2048 for >= R50, 512 for <= R34\n drop_ratio=0.8,\n std=0.01,\n data_format=\"NCHW\",\n num_seg=8,\n **kwargs):\n super().__init__(num_classes,\n in_channels,\n drop_ratio=drop_ratio,\n std=std,\n data_format=data_format,\n **kwargs)\n self.fc = Linear(self.in_channels,\n self.num_classes,\n weight_attr=ParamAttr(learning_rate=5.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0,\n regularizer=L2Decay(0.0)))" + }, + { + "comment": "The code initializes a head for the PPTSM model, which includes an average pooling layer, dropout if specified, and reshaping operations. It then returns the classification scores for input samples. 
The 'init_weights' function initializes the FC layer parameters with normal distribution using the given standard deviation (stdv).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsm_head.py\":58-86", + "content": " self.stdv = std\n self.num_seg = num_seg\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)\n def forward(self, x, num_seg=None):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n #XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N * num_seg, in_channels, 1, 1]\n num_seg = num_seg if num_seg is not None else self.num_seg\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])" + }, + { + "comment": "This code snippet is part of the PaddlePaddle Video (PaddleVideo) library. It defines a function within a class called \"pptsm_head\". The function takes input 'x' and performs a fully connected operation using \"self.fc\", returning the scores in the form of \"score\" with dimensions [N, in_channels]. The line \"#x = F.softmax(x) #NOTE remove\" was likely removed from the code, but its original purpose would have been to apply softmax function on 'x' and return the normalized probabilities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsm_head.py\":87-91", + "content": " # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n #x = F.softmax(x) #NOTE remove\n return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/64b7f641-e3fa-4a01-ae6d-2ac87e05c4fb.json b/docs/doc/64b7f641-e3fa-4a01-ae6d-2ac87e05c4fb.json new file mode 100644 index 000000000..ca69449e1 --- /dev/null +++ b/docs/doc/64b7f641-e3fa-4a01-ae6d-2ac87e05c4fb.json @@ -0,0 +1,35 @@ +{ + "summary": "This code initializes a Table Tennis action detection model using LSTM, loads configurations, and processes proposals for multiple datasets. It applies inference, predicts actions on video features, sorts predictions, and saves results in JSON format.", + "details": [ + { + "comment": "This code defines a class named InferModel that implements an LSTM model for action detection. The model is initialized with configuration parameters, including the path to the model and parameter files, GPU memory usage, and device ID. Additional configuration settings include topk, frame_offset, nms_thread, classify_score_thread, iou_score_thread, label_dic, fps, and nms_id. These parameters control various aspects of the action detection process. 
The code imports necessary libraries and modules for preprocessing, config utilities, result processing, and model loading from PaddlePaddle.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py\":0-37", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import get_action_result\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"lstm infer\"\"\"\n def __init__(self, cfg, name='ACTION'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.topk = cfg[name]['topk']\n self.frame_offset = cfg[name]['nms_offset']\n self.nms_thread = cfg[name]['nms_thread']\n self.cls_thread = cfg[name]['classify_score_thread']\n self.iou_thread = cfg[name]['iou_score_thread']\n self.label_map_file = cfg['COMMON']['label_dic']\n self.fps = cfg['COMMON']['fps']\n self.nms_id = 5" + }, + { + "comment": "Initializes model and sets up input/output tensors for inferencing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py\":39-61", + "content": " # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input1_tensor = self.predictor.get_input_handle(input_names[0])\n self.input2_tensor = self.predictor.get_input_handle(input_names[1])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None):\n \"\"\"infer\"\"\"\n self.input1_tensor.copy_from_cpu(input1_arr)\n self.input1_tensor.set_lod(input1_lod)\n if not input2_arr is None:\n self.input2_tensor.copy_from_cpu(input2_arr)" + }, + { + "comment": "The code is part of a model for action detection in Table Tennis. It sets the input's layout of dimension (LOD) and performs preprocessing, prediction, and returns output results. 
The LOD defines the shape of data along the spatial dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py\":62-91", + "content": " self.input2_tensor.set_lod(input2_lod)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n # print(output.shape)\n return output1, output2\n def pre_process(self, input):\n \"\"\"pre process\"\"\"\n input_arr = []\n input_lod = [0]\n start_lod = 0\n end_lod = 0\n for sub_item in input:\n end_lod = start_lod + len(sub_item)\n input_lod.append(end_lod)\n input_arr.extend(sub_item)\n start_lod = end_lod\n input_arr = np.array(input_arr)\n # print(input_arr.shape)\n # print([input_lod])\n return input_arr, [input_lod]\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name,\n 'infer',\n infer_config,\n material=material)\n results = []" + }, + { + "comment": "This code iterates through a data source, preprocesses the input, and performs inference on it using a model. The resulting outputs are then sorted to obtain the top k predictions for each input. The video ID, predicted action IDs, sorted indices, and IOU scores are stored in a results list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py\":92-110", + "content": " for infer_iter, data in enumerate(infer_reader()):\n video_id = [[items[-2], items[-1]] for items in data]\n input1 = [items[0] for items in data]\n input2 = [items[1] for items in data]\n input1_arr, input1_lod = self.pre_process(input1)\n input2_arr, input2_lod = self.pre_process(input2)\n output1, output2 = self.infer(input1_arr, input1_lod, input2_arr,\n input2_lod)\n # output1, output2 = self.infer(input1_arr, input1_lod)\n predictions_id = output1\n predictions_iou = output2\n for i in range(len(predictions_id)):\n topk_inds = predictions_id[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds_id = predictions_id[i][topk_inds]\n preds_iou = predictions_iou[i][0]\n results.append((video_id[i], preds_id.tolist(),\n topk_inds.tolist(), preds_iou.tolist()))" + }, + { + "comment": "The code is a part of a Table Tennis action detection model implemented using LSTM (Long Short-Term Memory). It loads configurations from a YAML file, initializes the model, and processes proposals for multiple datasets. 
The model takes results from previous processing steps, applies inference based on labels, frame rate, and other parameters, and returns the final prediction result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py\":112-135", + "content": " predict_result = get_action_result(results, self.label_map_file,\n self.fps, self.cls_thread,\n self.iou_thread, self.nms_id,\n self.nms_thread, self.frame_offset)\n return predict_result\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n # proposal total\n prop_dict = {}\n for dataset in ['EuroCup2016', 'WorldCup2018']:\n prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(\n dataset)\n json_data = json.load(open(prop_json, 'r'))\n for item in json_data:\n basename = prop_json.replace('feature_bmn/prop.json', 'mp4')\n basename = basename + '/' + item['video_name'] + '.mp4'\n prop_dict[basename] = item['bmn_results']\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'" + }, + { + "comment": "This code loads video features and proposals, then predicts action using the LSTM model. The results are saved in a JSON file and the time taken is printed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py\":137-157", + "content": " # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n # proposal\n basename = imgs_path.replace('frames', 'mp4') + '.mp4'\n bmn_results = prop_dict[basename]\n material = {'feature': video_features, 'proposal': bmn_results}\n t0 = time.time()\n outputs = model.predict(cfg, material)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n # print(outputs.shape)\n t1 = time.time()\n results = {'actions': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)\n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6577a076-ca7d-434e-a249-6d0fbbefe72c.json b/docs/doc/6577a076-ca7d-434e-a249-6d0fbbefe72c.json new file mode 100644 index 000000000..f366b255f --- /dev/null +++ b/docs/doc/6577a076-ca7d-434e-a249-6d0fbbefe72c.json @@ -0,0 +1,15 @@ +{ + "summary": "This code includes utility functions for managing distributed computation in PaddleVideo's Video Quality Assessment application, providing current rank and world size info, and a decorator to limit function execution to the main process.", + "details": [ + { + "comment": "This code provides utility functions for handling distributed computation in PaddleVideo's Video Quality Assessment application. The `get_dist_info()` function returns the current rank and world size, while `main_only(func)` is a decorator that ensures a function only runs on the main process (rank 0).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py\":0-34", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport functools\nimport paddle\nimport paddle.distributed as dist\ndef get_dist_info():\n \"\"\"get_dist_info\"\"\"\n world_size = dist.get_world_size()\n rank = dist.get_rank()\n return rank, world_size\ndef main_only(func):\n \"\"\"main_only\"\"\"\n @functools.wraps(func)\n def wrapper(*args, **kwargs):\n \"\"\"wrapper\"\"\"\n rank, _ = get_dist_info()\n if rank == 0:\n return func(*args, **kwargs)" + }, + { + "comment": "This function returns the modified or wrapped object, which can be a tensor, model, or other data structure.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py\":35-35", + "content": " return wrapper" + } + ] +} \ No newline at end of file diff --git a/docs/doc/65bcaa19-d2b8-46f9-83f1-c9ff07f19e09.json b/docs/doc/65bcaa19-d2b8-46f9-83f1-c9ff07f19e09.json new file mode 100644 index 000000000..7d82be53a --- /dev/null +++ b/docs/doc/65bcaa19-d2b8-46f9-83f1-c9ff07f19e09.json @@ -0,0 +1,285 @@ +{ + "summary": "The code develops efficient data processing classes for PaddleVideo, including interpolation, cropping, and pipeline optimization. It performs image flipping operations, augments 'Flow' modality images, transforms data formats, collapses dimensions, includes ML data pipeline support, generates heatmaps for keypoints and limbs in image sequences using input parameters, applies data augmentation, and uses Gaussian filtering with specified sigma value.", + "details": [ + { + "comment": "This code is importing necessary libraries and defining a function to create ActivityNet-style pipeline operations. It also registers these pipelines for future use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":0-33", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport collections\nfrom itertools import repeat\nimport copy as cp\nfrom collections import abc\nimport numpy as np\nimport paddle.nn.functional as F\nimport random\nimport paddle\nfrom ..registry import PIPELINES\nfrom .augmentations_ava import iminvert, imflip_\n\"\"\"pipeline ops for Activity Net.\n\"\"\"\ndef _ntuple(n):\n def parse(x):\n if isinstance(x, collections.abc.Iterable):\n return tuple(x)\n return tuple(repeat(x, n))" + }, + { + "comment": "This function initializes the lazy operation properly by checking if \"img_shape\" is in results, and adds or modifies keys \"lazy\", \"original_shape\", \"crop_bbox\", \"flip\", \"flip_direction\", and \"interpolation\" based on whether \"lazy\" is True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":35-67", + "content": " return parse\n_single = _ntuple(1)\n_pair = _ntuple(2)\n_triple = _ntuple(3)\n_quadruple = _ntuple(4)\ndef _init_lazy_if_proper(results, lazy):\n \"\"\"Initialize lazy operation properly.\n Make sure that a lazy operation is properly initialized,\n and avoid a non-lazy operation accidentally getting mixed in.\n Required keys in results are \"imgs\" if \"img_shape\" not in results,\n otherwise, Required keys in results are \"img_shape\", add or modified keys\n are \"img_shape\", \"lazy\".\n Add or modified keys in \"lazy\" are \"original_shape\", \"crop_bbox\", \"flip\",\n \"flip_direction\", \"interpolation\".\n Args:\n results (dict): A dict stores data pipeline result.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n if 'img_shape' not in results:\n results['img_shape'] = results['imgs'][0].shape[:2]\n if lazy:\n if 'lazy' not in results:\n img_h, img_w = results['img_shape']\n lazyop = dict()\n lazyop['original_shape'] = results['img_shape']" + }, + { + "comment": "The code is defining a Pipeline class, specifically for auto-padding and skeleton feature extraction from image data. It first checks if the 'lazy' operation has been performed, then initializes necessary parameters for padding or sampling frames based on window size and random_pad setting. The get_frame_num function calculates the number of frames containing valid data, and the __call__ method applies the pipeline to the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":68-100", + "content": " lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],\n dtype=np.float32)\n lazyop['flip'] = False\n lazyop['flip_direction'] = None\n lazyop['interpolation'] = None\n results['lazy'] = lazyop\n else:\n assert 'lazy' not in results, 'Use Fuse after lazy operations'\n@PIPELINES.register()\nclass AutoPadding(object):\n \"\"\"\n Sample or Padding frame skeleton feature.\n Args:\n window_size: int, temporal size of skeleton feature.\n random_pad: bool, whether do random padding when frame length < window size. 
Default: False.\n \"\"\"\n def __init__(self, window_size, random_pad=False):\n self.window_size = window_size\n self.random_pad = random_pad\n def get_frame_num(self, data):\n C, T, V, M = data.shape\n for i in range(T - 1, -1, -1):\n tmp = np.sum(data[:, i, :, :])\n if tmp > 0:\n T = i + 1\n break\n return T\n def __call__(self, results):" + }, + { + "comment": "Code snippet performs data padding to ensure consistent frame size for skeleton data in the Skeleton Pipeline. It checks the current frame number (T) and pads it with zeroes if T is smaller than the window size, or selects a subset of frames from the original data if T is larger than the window size. The result is then returned as 'results'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":101-132", + "content": " data = results['data']\n C, T, V, M = data.shape\n T = self.get_frame_num(data)\n if T == self.window_size:\n data_pad = data[:, :self.window_size, :, :]\n elif T < self.window_size:\n begin = random.randint(\n 0, self.window_size - T) if self.random_pad else 0\n data_pad = np.zeros((C, self.window_size, V, M))\n data_pad[:, begin:begin + T, :, :] = data[:, :T, :, :]\n else:\n if self.random_pad:\n index = np.random.choice(\n T, self.window_size, replace=False).astype('int64')\n else:\n index = np.linspace(0, T, self.window_size).astype(\"int64\")\n data_pad = data[:, index, :, :]\n results['data'] = data_pad\n return results\n@PIPELINES.register()\nclass SkeletonNorm(object):\n \"\"\"\n Normalize skeleton feature.\n Args:\n aixs: dimensions of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default: 2.\n \"\"\"\n def __init__(self, axis=2, squeeze=False):" + }, + { + "comment": "This code defines two classes, \"SkeletonPipeline\" and \"Iden\", which are used as PaddleVideo pipeline components. The SkeletonPipeline class is responsible for centralizing the data along a specified axis and reshaping it if squeeze is True. The Iden class simply converts the 'data' to float32 type and expands the shape of 'label' if it exists and label_expand is set to True. Both classes return updated results after processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":133-169", + "content": " self.axis = axis\n self.squeeze = squeeze\n def __call__(self, results):\n data = results['data']\n # Centralization\n data = data - data[:, :, 8:9, :]\n data = data[:self.axis, :, :, :] # get (x,y) from (x,y, acc)\n C, T, V, M = data.shape\n if self.squeeze:\n data = data.reshape((C, T, V)) # M = 1\n results['data'] = data.astype('float32')\n if 'label' in results:\n label = results['label']\n results['label'] = np.expand_dims(label, 0).astype('int64')\n return results\n@PIPELINES.register()\nclass Iden(object):\n \"\"\"\n Wrapper Pipeline\n \"\"\"\n def __init__(self, label_expand=True):\n self.label_expand = label_expand\n def __call__(self, results):\n data = results['data']\n results['data'] = data.astype('float32')\n if 'label' in results and self.label_expand:\n label = results['label']\n results['label'] = np.expand_dims(label, 0).astype('int64')\n return results" + }, + { + "comment": "The code defines a random rotation skeleton class for applying random rotations to input data. It takes arguments for rotation and rotation rate, and has a method for performing the rotation operation on the input data. 
The method calculates rotation matrices for rotation around the x, y, and z axes using the given rotation rate and applies them to the input data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":172-201", + "content": "@PIPELINES.register()\nclass RandomRotation(object):\n \"\"\"\n Random rotation sketeton.\n Args:\n argument: bool, if rotation.\n theta: float, rotation rate.\n \"\"\"\n def __init__(self, argument, theta=0.3):\n self.theta = theta\n self.argument = argument\n def _rot(self, rot):\n \"\"\"\n rot: T,3\n \"\"\"\n cos_r, sin_r = np.cos(rot), np.sin(rot) # T,3\n zeros = np.zeros((rot.shape[0], 1)) # T,1\n ones = np.ones((rot.shape[0], 1)) # T,1\n r1 = np.stack((ones, zeros, zeros), axis=-1) # T,1,3\n rx2 = np.stack((zeros, cos_r[:, 0:1], sin_r[:, 0:1]), axis=-1) # T,1,3\n rx3 = np.stack((zeros, -sin_r[:, 0:1], cos_r[:, 0:1]), axis=-1) # T,1,3\n rx = np.concatenate((r1, rx2, rx3), axis=1) # T,3,3\n ry1 = np.stack((cos_r[:, 1:2], zeros, -sin_r[:, 1:2]), axis=-1)\n r2 = np.stack((zeros, ones, zeros), axis=-1)\n ry3 = np.stack((sin_r[:, 1:2], zeros, cos_r[:, 1:2]), axis=-1)\n ry = np.concatenate((ry1, r2, ry3), axis=1)" + }, + { + "comment": "This code defines a class with two methods: `_rot` and `__call__`. The `_rot` method takes rotation angles and returns the rotation matrix. The `__call__` method applies random rotations to input data, performs the rotations using `_rot`, and adjusts the shape of the data before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":203-228", + "content": " rz1 = np.stack((cos_r[:, 2:3], sin_r[:, 2:3], zeros), axis=-1)\n r3 = np.stack((zeros, zeros, ones), axis=-1)\n rz2 = np.stack((-sin_r[:, 2:3], cos_r[:, 2:3], zeros), axis=-1)\n rz = np.concatenate((rz1, rz2, r3), axis=1)\n rot = np.matmul(np.matmul(rz, ry), rx)\n return rot\n def __call__(self, results):\n # C,T,V,M\n data = results['data']\n if self.argument:\n C, T, V, M = data.shape\n data_numpy = np.transpose(data, (1, 0, 2, 3)).conjugate().reshape(\n T, C, V * M) # T,3,V*M\n rot = np.random.uniform(-self.theta, self.theta, 3)\n rot = np.stack(\n [\n rot,\n ] * T, axis=0)\n rot = self._rot(rot) # T,3,3\n data_numpy = np.matmul(rot, data_numpy)\n data_numpy = data_numpy.reshape(T, C, V, M)\n data_numpy = np.transpose(data_numpy, (1, 0, 2, 3))\n data = data_numpy\n results['data'] = data.astype(np.float32)" + }, + { + "comment": "This code defines a Pipeline class for cropping sample data using the Sketeon crop model. It supports only the 'center' crop model and takes window size, crop model (default 'center'), and p_interval (default 1) as arguments. 
The __call__ method is used to apply the crop operation on the input results by selecting a center crop based on the crop model and p_interval values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":229-264", + "content": " return results\n@PIPELINES.register()\nclass SketeonCropSample(object):\n \"\"\"\n Sketeon Crop Sampler.\n Args:\n crop_model: str, crop model, support: ['center'].\n p_interval: list, crop len\n window_size: int, sample windows size.\n \"\"\"\n def __init__(self, window_size, crop_model='center', p_interval=1):\n assert crop_model in ['center'], \"Don't support :\" + crop_model\n self.crop_model = crop_model\n self.window_size = window_size\n self.p_interval = p_interval\n def __call__(self, results):\n if self.crop_model == 'center':\n # input: C,T,V,M\n data = results['data']\n valid_frame_num = np.sum(data.sum(0).sum(-1).sum(-1) != 0)\n C, T, V, M = data.shape\n begin = 0\n end = valid_frame_num\n valid_size = end - begin\n #crop\n if len(self.p_interval) == 1:\n p = self.p_interval[0]\n bias = int((1 - p) * valid_size / 2)\n data = data[:, begin + bias:end - bias, :, :] # center_crop" + }, + { + "comment": "This code randomly selects a cropped length within a specified interval, then applies random bias to the cropped length. It reshapes and transposes the data before performing interpolation on the tensor for up or down sampling using bilinear mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":265-285", + "content": " cropped_length = data.shape[1]\n else:\n p = np.random.rand(1) * (self.p_interval[1] - self.p_interval[0]\n ) + self.p_interval[0]\n # constraint cropped_length lower bound as 64\n cropped_length = np.minimum(\n np.maximum(int(np.floor(valid_size * p)), 64), valid_size)\n bias = np.random.randint(0, valid_size - cropped_length + 1)\n data = data[:, begin + bias:begin + bias + cropped_length, :, :]\n # resize\n data = np.transpose(data, (0, 2, 3, 1)).conjugate().reshape(\n C * V * M, cropped_length)\n data = data[None, None, :, :]\n # could perform both up sample and down sample\n data_tensor = paddle.to_tensor(data)\n data_tensor = F.interpolate(\n data_tensor,\n size=(C * V * M, self.window_size),\n mode='bilinear',\n align_corners=False).squeeze()" + }, + { + "comment": "This code is part of the PaddleVideo library and appears to be a function or class related to skeleton data transformation for video analysis tasks. The code seems to handle reshaping and transposing data based on certain parameters, such as window size, crop model, and more. 
This could potentially be used for video processing in computer vision applications like action recognition or pose estimation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":286-315", + "content": " data = paddle.transpose(\n paddle.reshape(data_tensor, (C, V, M, self.window_size)),\n (0, 3, 1, 2)).numpy()\n else:\n raise NotImplementedError\n results['data'] = data\n return results\n@PIPELINES.register()\nclass SketeonModalityTransform(object):\n \"\"\"\n Sketeon Crop Sampler.\n Args:\n crop_model: str, crop model, support: ['center'].\n p_interval: list, crop len\n window_size: int, sample windows size.\n \"\"\"\n def __init__(self, bone, motion, joint=True, graph='ntu_rgb_d'):\n self.joint = joint\n self.bone = bone\n self.motion = motion\n self.graph = graph\n if self.graph == \"ntu_rgb_d\":\n self.bone_pairs = ((1, 2), (2, 21), (3, 21), (4, 3), (5, 21),\n (6, 5), (7, 6), (8, 7), (9, 21), (10, 9),\n (11, 10), (12, 11), (13, 1), (14, 13), (15, 14),\n (16, 15), (17, 1), (18, 17), (19, 18), (20, 19)," + }, + { + "comment": "This code defines a class for skeleton processing in PaddleVideo. If joints are enabled, it returns the results as is. If bones are enabled, it calculates bone data by subtracting corresponding bone vertices from each other. If motion is enabled, it sets the last frame's coordinates to 0. The UniformSampleFrames pipeline uniformly samples frames from a video by dividing it into equal segments and randomly selecting one from each segment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":316-343", + "content": " (22, 23), (21, 21), (23, 8), (24, 25), (25, 12))\n else:\n raise NotImplementedError\n def __call__(self, results):\n if self.joint:\n return results\n data_numpy = results['data']\n if self.bone:\n bone_data_numpy = np.zeros_like(data_numpy)\n for v1, v2 in self.bone_pairs:\n bone_data_numpy[:, :, v1 -\n 1] = data_numpy[:, :, v1 -\n 1] - data_numpy[:, :, v2 - 1]\n data_numpy = bone_data_numpy\n if self.motion:\n data_numpy[:, :-1] = data_numpy[:, 1:] - data_numpy[:, :-1]\n data_numpy[:, -1] = 0\n results['data'] = data_numpy\n return results\n@PIPELINES.register()\nclass UniformSampleFrames:\n \"\"\"Uniformly sample frames from the video.\n To sample an n-frame clip from the video. UniformSampleFrames basically\n divide the video into n segments of equal length and randomly sample one" + }, + { + "comment": "This code snippet defines a class with an __init__ method that initializes the clip_len, num_clips, test_mode, and seed. The _get_train_clips method uniformly samples indices for training clips based on the given number of frames and clip length. This is used in PaddleVideo for loading and processing video data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":344-371", + "content": " frame from each segment. To make the testing results reproducible, a\n random seed is set during testing, to make the sampling results\n deterministic.\n Required keys are \"total_frames\", \"start_index\" , added or modified keys\n are \"frame_inds\", \"clip_len\", \"frame_interval\" and \"num_clips\".\n Args:\n clip_len (int): Frames of each sampled output clip.\n num_clips (int): Number of clips to be sampled. Default: 1.\n test_mode (bool): Store True when building test or validation dataset.\n Default: False.\n seed (int): The random seed used during test time. 
Default: 255.\n \"\"\"\n def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255):\n self.clip_len = clip_len\n self.num_clips = num_clips\n self.test_mode = test_mode\n self.seed = seed\n def _get_train_clips(self, num_frames, clip_len):\n \"\"\"Uniformly sample indices for training clips.\n Args:\n num_frames (int): The number of frames.\n clip_len (int): The length of the clip." + }, + { + "comment": "This code determines the indices for a skeleton clip from a given number of frames and clip length. It handles three scenarios: when the number of frames is less than the clip length, between the clip length and twice the clip length, or more than twice the clip length. The function returns the sampled indices accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":372-400", + "content": " \"\"\"\n assert self.num_clips == 1\n if num_frames < clip_len:\n start = np.random.randint(0, num_frames)\n inds = np.arange(start, start + clip_len)\n elif clip_len <= num_frames < 2 * clip_len:\n basic = np.arange(clip_len)\n inds = np.random.choice(\n clip_len + 1, num_frames - clip_len, replace=False)\n offset = np.zeros(clip_len + 1, dtype=np.int64)\n offset[inds] = 1\n offset = np.cumsum(offset)\n inds = basic + offset[:-1]\n else:\n bids = np.array(\n [i * num_frames // clip_len for i in range(clip_len + 1)])\n bsize = np.diff(bids)\n bst = bids[:clip_len]\n offset = np.random.randint(bsize)\n inds = bst + offset\n return inds\n def _get_test_clips(self, num_frames, clip_len):\n \"\"\"Uniformly sample indices for testing clips.\n Args:\n num_frames (int): The number of frames.\n clip_len (int): The length of the clip." + }, + { + "comment": "The code handles the random selection of frame indices for a given clip length and total number of frames. It considers three scenarios: when there are fewer frames than the clip length, exactly equal to the clip length, or between the clip length and twice the clip length. It uses list comprehension and numpy functions to generate the desired indices for each case.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":401-426", + "content": " \"\"\"\n np.random.seed(self.seed)\n if num_frames < clip_len:\n # Then we use a simple strategy\n if num_frames < self.num_clips:\n start_inds = list(range(self.num_clips))\n else:\n start_inds = [\n i * num_frames // self.num_clips\n for i in range(self.num_clips)\n ]\n inds = np.concatenate(\n [np.arange(i, i + clip_len) for i in start_inds])\n elif clip_len <= num_frames < clip_len * 2:\n all_inds = []\n for i in range(self.num_clips):\n basic = np.arange(clip_len)\n inds = np.random.choice(\n clip_len + 1, num_frames - clip_len, replace=False)\n offset = np.zeros(clip_len + 1, dtype=np.int64)\n offset[inds] = 1\n offset = np.cumsum(offset)\n inds = basic + offset[:-1]\n all_inds.append(inds)\n inds = np.concatenate(all_inds)" + }, + { + "comment": "This code defines a class for generating frame indices for skeleton data in PaddleVideo. It has methods to generate clips for training or testing, and returns the generated clips as results. The class takes parameters such as clip length, number of clips, total frames, etc. 
It ensures that the returned frame indices are within the range of total frames and converts them to integers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":427-458", + "content": " else:\n bids = np.array(\n [i * num_frames // clip_len for i in range(clip_len + 1)])\n bsize = np.diff(bids)\n bst = bids[:clip_len]\n all_inds = []\n for i in range(self.num_clips):\n offset = np.random.randint(bsize)\n all_inds.append(bst + offset)\n inds = np.concatenate(all_inds)\n return inds\n def __call__(self, results):\n num_frames = results['total_frames']\n if self.test_mode:\n inds = self._get_test_clips(num_frames, self.clip_len)\n else:\n inds = self._get_train_clips(num_frames, self.clip_len)\n inds = np.mod(inds, num_frames)\n start_index = results['start_index']\n inds = inds + start_index\n results['frame_inds'] = inds.astype(np.int)\n results['clip_len'] = self.clip_len\n results['frame_interval'] = None\n results['num_clips'] = self.num_clips\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('" + }, + { + "comment": "This code defines a PoseDecode class that loads and decodes pose with given indices. It requires \"keypoint\" and \"frame_inds\" keys, and optionally \"keypoint_score\". The _load_kp static method loads keypoint coordinates based on frame indices, while the _load_kpscore method loads keypoint scores with frame indices. Both methods return arrays of float32 values for keypoint coordinates or scores respectively. This class is registered at PIPelines for further usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":459-492", + "content": " f'clip_len={self.clip_len}, '\n f'num_clips={self.num_clips}, '\n f'test_mode={self.test_mode}, '\n f'seed={self.seed})')\n return repr_str\n@PIPELINES.register()\nclass PoseDecode:\n \"\"\"Load and decode pose with given indices.\n Required keys are \"keypoint\", \"frame_inds\" (optional), \"keypoint_score\"\n (optional), added or modified keys are \"keypoint\", \"keypoint_score\" (if\n applicable).\n \"\"\"\n @staticmethod\n def _load_kp(kp, frame_inds):\n \"\"\"Load keypoints given frame indices.\n Args:\n kp (np.ndarray): The keypoint coordinates.\n frame_inds (np.ndarray): The frame indices.\n \"\"\"\n return [x[frame_inds].astype(np.float32) for x in kp]\n @staticmethod\n def _load_kpscore(kpscore, frame_inds):\n \"\"\"Load keypoint scores given frame indices.\n Args:\n kpscore (np.ndarray): The confidence scores of keypoints.\n frame_inds (np.ndarray): The frame indices." + }, + { + "comment": "This code defines a PoseCompact class, which is a pipeline for converting keypoint coordinates into a more compact representation. It takes results from previous steps and processes 'keypoint_score' and 'keypoint' keys based on frame indices. 
If present, it extracts the keypoint scores and keypoint coordinates for the specified frames and converts them to float32 type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":493-527", + "content": " \"\"\"\n return [x[frame_inds].astype(np.float32) for x in kpscore]\n def __call__(self, results):\n if 'frame_inds' not in results:\n results['frame_inds'] = np.arange(results['total_frames'])\n if results['frame_inds'].ndim != 1:\n results['frame_inds'] = np.squeeze(results['frame_inds'])\n offset = results.get('offset', 0)\n frame_inds = results['frame_inds'] + offset\n if 'keypoint_score' in results:\n kpscore = results['keypoint_score']\n results['keypoint_score'] = kpscore[:, frame_inds].astype(\n np.float32)\n if 'keypoint' in results:\n results['keypoint'] = results['keypoint'][:, frame_inds].astype(\n np.float32)\n return results\n def __repr__(self):\n repr_str = f'{self.__class__.__name__}()'\n return repr_str\n@PIPELINES.register()\nclass PoseCompact:\n \"\"\"Convert the coordinates of keypoints to make it more compact.\n Specifically, it first find a tight bounding box that surrounds all joints" + }, + { + "comment": "This function expands tight bounding boxes by a given padding ratio and adds new key \"crop_quadruple\". It requires keys \"img_shape\", \"keypoint\" and may modify them. The threshold determines if the box is too small to expand, hw_ratio sets the box aspect ratio (optional), and allow_imgpad allows expanding outside image for hw_ratio (optional). Default values are padding=0.25, threshold=10, hw_ratio=None, allow_imgpad=True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":528-545", + "content": " in each frame, then we expand the tight box by a given padding ratio. For\n example, if 'padding == 0.25', then the expanded box has unchanged center,\n and 1.25x width and height.\n Required keys in results are \"img_shape\", \"keypoint\", add or modified keys\n are \"img_shape\", \"keypoint\", \"crop_quadruple\".\n Args:\n padding (float): The padding size. Default: 0.25.\n threshold (int): The threshold for the tight bounding box. If the width\n or height of the tight bounding box is smaller than the threshold,\n we do not perform the compact operation. Default: 10.\n hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded\n box. Float indicates the specific ratio and tuple indicates a\n ratio range. If set as None, it means there is no requirement on\n hw_ratio. Default: None.\n allow_imgpad (bool): Whether to allow expanding the box outside the\n image to meet the hw_ratio requirement. Default: True." + }, + { + "comment": "This code is initializing a class for skeleton pipeline. It takes parameters such as padding, threshold, hw_ratio, and allow_imgpad. The class also has methods to combine quadruples, apply transformations, and handle keypoints in the image. 
The code performs various operations like making NaN values zero, finding minimum and maximum keypoint coordinates, and applying padding if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":547-584", + "content": " Returns:\n type: Description of returned object.\n \"\"\"\n def __init__(self,\n padding=0.25,\n threshold=10,\n hw_ratio=None,\n allow_imgpad=True):\n self.padding = padding\n self.threshold = threshold\n if hw_ratio is not None:\n hw_ratio = _pair(hw_ratio)\n self.hw_ratio = hw_ratio\n self.allow_imgpad = allow_imgpad\n assert self.padding >= 0\n def _combine_quadruple(self, a, b):\n return (a[0] + a[2] * b[0], a[1] + a[3] * b[1], a[2] * b[2],\n a[3] * b[3])\n def __call__(self, results):\n img_shape = results['img_shape']\n h, w = img_shape\n kp = results['keypoint']\n # Make NaN zero\n kp[np.isnan(kp)] = 0.\n kp_x = kp[..., 0]\n kp_y = kp[..., 1]\n min_x = np.min(kp_x[kp_x != 0], initial=np.Inf)\n min_y = np.min(kp_y[kp_y != 0], initial=np.Inf)\n max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf)\n max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf)" + }, + { + "comment": "This code checks if the compact area is too small and adjusts the bounding box parameters accordingly. It calculates the center, half-width, and half-height of the bounding box. If the aspect ratio should be maintained (hw_ratio), it ensures that by adjusting half_height based on half_width. The code then updates the minimum and maximum x and y values within the constraints of the image's width and height, unless allow_imgpad is True, in which case it doesn't limit the bounding box size. Finally, it adjusts the x and y coordinates of the keypoints by subtracting the new min_x and min_y to maintain their relative positions within the adjusted bounding box.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":586-610", + "content": " # The compact area is too small\n if max_x - min_x < self.threshold or max_y - min_y < self.threshold:\n return results\n center = ((max_x + min_x) / 2, (max_y + min_y) / 2)\n half_width = (max_x - min_x) / 2 * (1 + self.padding)\n half_height = (max_y - min_y) / 2 * (1 + self.padding)\n if self.hw_ratio is not None:\n half_height = max(self.hw_ratio[0] * half_width, half_height)\n half_width = max(1 / self.hw_ratio[1] * half_height, half_width)\n min_x, max_x = center[0] - half_width, center[0] + half_width\n min_y, max_y = center[1] - half_height, center[1] + half_height\n # hot update\n if not self.allow_imgpad:\n min_x, min_y = int(max(0, min_x)), int(max(0, min_y))\n max_x, max_y = int(min(w, max_x)), int(min(h, max_y))\n else:\n min_x, min_y = int(min_x), int(min_y)\n max_x, max_y = int(max_x), int(max_y)\n kp_x[kp_x != 0] -= min_x\n kp_y[kp_y != 0] -= min_y" + }, + { + "comment": "This code segment is part of a pipeline for skeleton detection in images. It calculates the new image shape based on the cropping region, updates the 'crop_quadruple' in the results dictionary, and defines two static methods for cropping keypoints (_crop_kps) and cropped images (_crop_imgs). 
The CropBase class provides functionality to crop keypoints based on the provided crop region.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":612-639", + "content": " new_shape = (max_y - min_y, max_x - min_x)\n results['img_shape'] = new_shape\n # the order is x, y, w, h (in [0, 1]), a tuple\n crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.))\n new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w,\n (max_y - min_y) / h)\n crop_quadruple = self._combine_quadruple(crop_quadruple,\n new_crop_quadruple)\n results['crop_quadruple'] = crop_quadruple\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}(padding={self.padding}, '\n f'threshold={self.threshold}, '\n f'hw_ratio={self.hw_ratio}, '\n f'allow_imgpad={self.allow_imgpad})')\n return repr_str\nclass CropBase:\n @staticmethod\n def _crop_kps(kps, crop_bbox):\n return kps - crop_bbox[:2]\n @staticmethod\n def _crop_imgs(imgs, crop_bbox):\n x1, y1, x2, y2 = crop_bbox" + }, + { + "comment": "This code defines a function `_all_box_crop` that crops the gt_bboxes and proposals in results according to the crop_bbox. It first applies the `_box_crop` function to 'gt_bboxes', then if 'proposals' are present and not None, it also applies the `_box_crop` function to them. The `_box_crop` function crops bounding boxes by subtracting the x1, y1 coordinates from their x and y values respectively, ensuring they fall within the new image dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":640-668", + "content": " return [img[y1:y2, x1:x2] for img in imgs]\n @staticmethod\n def _box_crop(box, crop_bbox):\n \"\"\"Crop the bounding boxes according to the crop_bbox.\n Args:\n box (np.ndarray): The bounding boxes.\n crop_bbox(np.ndarray): The bbox used to crop the original image.\n \"\"\"\n x1, y1, x2, y2 = crop_bbox\n img_w, img_h = x2 - x1, y2 - y1\n box_ = box.copy()\n box_[..., 0::2] = np.clip(box[..., 0::2] - x1, 0, img_w - 1)\n box_[..., 1::2] = np.clip(box[..., 1::2] - y1, 0, img_h - 1)\n return box_\n def _all_box_crop(self, results, crop_bbox):\n \"\"\"Crop the gt_bboxes and proposals in results according to crop_bbox.\n Args:\n results (dict): All information about the sample, which contain\n 'gt_bboxes' and 'proposals' (optional).\n crop_bbox(np.ndarray): The bbox used to crop the original image.\n \"\"\"\n results['gt_bboxes'] = self._box_crop(results['gt_bboxes'], crop_bbox)\n if 'proposals' in results and results['proposals'] is not None:" + }, + { + "comment": "This code defines a RandomResizedCrop_V2 pipeline that randomly crops an image to a specified area and height-weight ratio range. The required keys in results are \"img_shape\", \"crop_bbox\", and \"imgs\" (optional). The modified keys are \"imgs\", \"keypoint\", \"crop_bbox\", and \"lazy\". The required keys in \"lazy\" are \"flip\", \"crop_bbox\". 
It provides an area range and aspect ratio range for the output cropped images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":669-692", + "content": " assert results['proposals'].shape[1] == 4\n results['proposals'] = self._box_crop(results['proposals'],\n crop_bbox)\n return results\n def __call__(self, results):\n raise NotImplementedError\n@PIPELINES.register()\nclass RandomResizedCrop_V2(CropBase):\n \"\"\"Random crop that specifics the area and height-weight ratio range.\n Required keys in results are \"img_shape\", \"crop_bbox\", \"imgs\" (optional),\n \"keypoint\" (optional), added or modified keys are \"imgs\", \"keypoint\",\n \"crop_bbox\" and \"lazy\"; Required keys in \"lazy\" are \"flip\", \"crop_bbox\",\n added or modified key is \"crop_bbox\".\n Args:\n area_range (Tuple[float]): The candidate area scales range of\n output cropped images. Default: (0.08, 1.0).\n aspect_ratio_range (Tuple[float]): The candidate aspect ratio range of\n output cropped images. Default: (3 / 4, 4 / 3).\n lazy (bool): Determine whether to apply lazy operation. Default: False." + }, + { + "comment": "This code initializes a class with area_range, aspect_ratio_range, and lazy parameters. It checks if the ranges are tuples of floats, and raises TypeError if not. The get_crop_bbox static method takes image shape, area range, aspect ratio range, and max attempts as arguments to return a crop bounding box.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":693-718", + "content": " \"\"\"\n def __init__(self,\n area_range=(0.08, 1.0),\n aspect_ratio_range=(3 / 4, 4 / 3),\n lazy=False):\n self.area_range = eval(area_range)\n self.aspect_ratio_range = aspect_ratio_range\n self.lazy = lazy\n if not is_tuple_of(self.area_range, float):\n raise TypeError(f'Area_range must be a tuple of float, '\n f'but got {type(area_range)}')\n if not is_tuple_of(self.aspect_ratio_range, float):\n raise TypeError(f'Aspect_ratio_range must be a tuple of float, '\n f'but got {type(aspect_ratio_range)}')\n @staticmethod\n def get_crop_bbox(img_shape,\n area_range,\n aspect_ratio_range,\n max_attempts=10):\n \"\"\"Get a crop bbox given the area range and aspect ratio range.\n Args:\n img_shape (Tuple[int]): Image shape\n area_range (Tuple[float]): The candidate area scales range of" + }, + { + "comment": "This function generates a random crop bounding box within a specified area range and aspect ratio range. It takes image shape, area range, and aspect ratio range as input parameters. The function first checks the validity of the ranges, then calculates the image's total area, minimum and maximum aspect ratios from the aspect ratio range. It uses numpy to generate a list of random candidate bounding box aspect ratios and target areas within the specified ranges.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":719-740", + "content": " output cropped images. Default: (0.08, 1.0).\n aspect_ratio_range (Tuple[float]): The candidate aspect\n ratio range of output cropped images. Default: (3 / 4, 4 / 3).\n max_attempts (int): The maximum of attempts. Default: 10.\n max_attempts (int): Max attempts times to generate random candidate\n bounding box. 
If it doesn't qualified one, the center bounding\n box will be used.\n Returns:\n (list[int]) A random crop bbox within the area range and aspect\n ratio range.\n \"\"\"\n assert 0 < area_range[0] <= area_range[1] <= 1\n assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1]\n img_h, img_w = img_shape\n area = img_h * img_w\n min_ar, max_ar = aspect_ratio_range\n aspect_ratios = np.exp(\n np.random.uniform(\n np.log(min_ar), np.log(max_ar), size=max_attempts))\n target_areas = np.random.uniform(*area_range, size=max_attempts) * area" + }, + { + "comment": "This code calculates random crop sizes based on the aspect ratios and target areas, then attempts to find a suitable crop region within the image. If a suitable crop is found, it returns the offsets and dimensions of that crop. If not, it falls back to a centered crop with minimum size. This function is called as part of a pipeline for image augmentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":741-765", + "content": " candidate_crop_w = np.round(np.sqrt(\n target_areas * aspect_ratios)).astype(np.int32)\n candidate_crop_h = np.round(np.sqrt(\n target_areas / aspect_ratios)).astype(np.int32)\n for i in range(max_attempts):\n crop_w = candidate_crop_w[i]\n crop_h = candidate_crop_h[i]\n if crop_h <= img_h and crop_w <= img_w:\n x_offset = random.randint(0, img_w - crop_w)\n y_offset = random.randint(0, img_h - crop_h)\n return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h\n # Fallback\n crop_size = min(img_h, img_w)\n x_offset = (img_w - crop_size) // 2\n y_offset = (img_h - crop_size) // 2\n return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size\n def __call__(self, results):\n \"\"\"Performs the RandomResizeCrop augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline." + }, + { + "comment": "This code initializes and adjusts the crop quadruple of an image based on its aspect ratio, area range, and size. It ensures that the 'keypoint' is not applied if lazy augmentation is enabled. The crop quadruple contains x, y, width, and height values representing the image's cropping region, which are updated according to the original image's dimensions and the desired aspect ratio range.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":766-790", + "content": " \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'keypoint' in results:\n assert not self.lazy, ('Keypoint Augmentations are not compatible '\n 'with lazy == True')\n img_h, img_w = results['img_shape']\n left, top, right, bottom = self.get_crop_bbox(\n (img_h, img_w), self.area_range, self.aspect_ratio_range)\n new_h, new_w = bottom - top, right - left\n if 'crop_quadruple' not in results:\n results['crop_quadruple'] = np.array(\n [0, 0, 1, 1], # x, y, w, h\n dtype=np.float32)\n x_ratio, y_ratio = left / img_w, top / img_h\n w_ratio, h_ratio = new_w / img_w, new_h / img_h\n old_crop_quadruple = results['crop_quadruple']\n old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]\n old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]\n new_crop_quadruple = [\n old_x_ratio + x_ratio * old_w_ratio," + }, + { + "comment": "This code performs cropping on images and bboxes based on given ratios. It updates the results dictionary with cropped quadruple, crop bbox, and new image shape. 
If not in lazy mode, it crops keypoints and images using these values. If in lazy mode, it stores the left, top, right, and bottom crop positions for later fusion.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":791-814", + "content": " old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,\n h_ratio * old_h_ratio\n ]\n results['crop_quadruple'] = np.array(\n new_crop_quadruple, dtype=np.float32)\n crop_bbox = np.array([left, top, right, bottom])\n results['crop_bbox'] = crop_bbox\n results['img_shape'] = (new_h, new_w)\n if not self.lazy:\n if 'keypoint' in results:\n results['keypoint'] = self._crop_kps(results['keypoint'],\n crop_bbox)\n if 'imgs' in results:\n results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox)\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')\n # record crop_bbox in lazyop dict to ensure only crop once in Fuse\n lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']\n left = left * (lazy_right - lazy_left) / img_w" + }, + { + "comment": "This code is a pipeline for skeleton processing in PaddleVideo. It scales the bounding box and applies it to the lazy operation, performs cropping based on the new bounding box, and if 'gt_bboxes' is present in results, it crops all other boxes accordingly. The class also has a __repr__ method for string representation. There is also a helper function, is_seq_of, which checks whether a sequence contains items of a specific type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":815-842", + "content": " right = right * (lazy_right - lazy_left) / img_w\n top = top * (lazy_bottom - lazy_top) / img_h\n bottom = bottom * (lazy_bottom - lazy_top) / img_h\n lazyop['crop_bbox'] = np.array(\n [(lazy_left + left), (lazy_top + top), (lazy_left + right),\n (lazy_top + bottom)],\n dtype=np.float32)\n if 'gt_bboxes' in results:\n assert not self.lazy\n results = self._all_box_crop(results, results['crop_bbox'])\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'area_range={self.area_range}, '\n f'aspect_ratio_range={self.aspect_ratio_range}, '\n f'lazy={self.lazy})')\n return repr_str\ndef is_seq_of(seq, expected_type, seq_type=None):\n \"\"\"Check whether it is a sequence of some type.\n Args:\n seq (Sequence): The sequence to be checked.\n expected_type (type): Expected type of sequence items." + }, + { + "comment": "The code defines a function `is_seq_of` that checks if a given sequence is of the expected type. It also defines two partial methods, `is_tuple_of`, which uses `is_seq_of` to check if a sequence is a tuple of a certain type. 
Lastly, it registers a new pipeline `CenterCrop_V2` for cropping the center area from images with required keys \"img_shape\", \"imgs\" (optional), and modified or added keys as mentioned in the function description.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":843-879", + "content": " seq_type (type, optional): Expected sequence type.\n Returns:\n bool: Whether the sequence is valid.\n \"\"\"\n if seq_type is None:\n exp_seq_type = abc.Sequence\n else:\n assert isinstance(seq_type, type)\n exp_seq_type = seq_type\n if not isinstance(seq, exp_seq_type):\n return False\n for item in seq:\n if not isinstance(item, expected_type):\n return False\n return True\ndef is_tuple_of(seq, expected_type):\n \"\"\"Check whether it is a tuple of some type.\n A partial method of :func:`is_seq_of`.\n \"\"\"\n return is_seq_of(seq, expected_type, seq_type=tuple)\n@PIPELINES.register()\nclass CenterCrop_V2(CropBase):\n \"\"\"Crop the center area from images.\n Required keys are \"img_shape\", \"imgs\" (optional), \"keypoint\" (optional),\n added or modified keys are \"imgs\", \"keypoint\", \"crop_bbox\", \"lazy\" and\n \"img_shape\". Required keys in \"lazy\" is \"crop_bbox\", added or modified key\n is \"crop_bbox\".\n Args:\n crop_size (int | tuple[int]): (w, h) of crop size." + }, + { + "comment": "This code defines a class for CenterCrop augmentation. It initializes with crop size and lazy operation flag, checks the validity of input parameters, performs CenterCrop operation on images, and handles keypoint augmentations if present in results dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":880-908", + "content": " lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n def __init__(self, crop_size, lazy=False):\n self.crop_size = _pair(crop_size)\n self.lazy = lazy\n if not is_tuple_of(self.crop_size, int):\n raise TypeError(f'Crop_size must be int or tuple of int, '\n f'but got {type(crop_size)}')\n def __call__(self, results):\n \"\"\"Performs the CenterCrop augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'keypoint' in results:\n assert not self.lazy, ('Keypoint Augmentations are not compatible '\n 'with lazy == True')\n img_h, img_w = results['img_shape']\n crop_w, crop_h = self.crop_size\n left = (img_w - crop_w) // 2\n top = (img_h - crop_h) // 2\n right = left + crop_w\n bottom = top + crop_h" + }, + { + "comment": "This code calculates the new image shape and crop box coordinates based on the provided top, left, right, and bottom values. It then updates the 'crop_bbox' and 'img_shape' in the results dictionary. If 'crop_quadruple' is not already present in the results, it creates and appends it. The code then calculates new crop quadruple coordinates by adjusting the old ones with the ratios of the original and new image widths and heights. 
Finally, if 'keypoint' is present in the results, the code proceeds further (presumably for lazy mode).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":909-935", + "content": " new_h, new_w = bottom - top, right - left\n crop_bbox = np.array([left, top, right, bottom])\n results['crop_bbox'] = crop_bbox\n results['img_shape'] = (new_h, new_w)\n if 'crop_quadruple' not in results:\n results['crop_quadruple'] = np.array(\n [0, 0, 1, 1], # x, y, w, h\n dtype=np.float32)\n x_ratio, y_ratio = left / img_w, top / img_h\n w_ratio, h_ratio = new_w / img_w, new_h / img_h\n old_crop_quadruple = results['crop_quadruple']\n old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]\n old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]\n new_crop_quadruple = [\n old_x_ratio + x_ratio * old_w_ratio,\n old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,\n h_ratio * old_h_ratio\n ]\n results['crop_quadruple'] = np.array(\n new_crop_quadruple, dtype=np.float32)\n if not self.lazy:\n if 'keypoint' in results:" + }, + { + "comment": "This code is handling the case where 'lazyop' contains a flip operation. It records crop_bbox in lazyop to ensure only one crop operation is performed in Fuse. If 'gt_bboxes' is present in results, it indicates ground truth bbox information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":936-956", + "content": " results['keypoint'] = self._crop_kps(results['keypoint'],\n crop_bbox)\n if 'imgs' in results:\n results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox)\n else:\n lazyop = results['lazy']\n if lazyop['flip']:\n raise NotImplementedError('Put Flip at last for now')\n # record crop_bbox in lazyop dict to ensure only crop once in Fuse\n lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']\n left = left * (lazy_right - lazy_left) / img_w\n right = right * (lazy_right - lazy_left) / img_w\n top = top * (lazy_bottom - lazy_top) / img_h\n bottom = bottom * (lazy_bottom - lazy_top) / img_h\n lazyop['crop_bbox'] = np.array(\n [(lazy_left + left), (lazy_top + top), (lazy_left + right),\n (lazy_top + bottom)],\n dtype=np.float32)\n if 'gt_bboxes' in results:" + }, + { + "comment": "This code snippet is registering a pipeline class called \"Flip_V2\". The Flip_V2 class flips the input images with a probability and reverses the order of elements in the given imgs with a specific direction. It requires keys such as \"img_shape\", \"modality\", and \"imgs\" while adding or modifying keys like \"imgs\", \"keypoint\", \"lazy\", and \"flip_direction\". Flip_V2 should be placed after cropping/reshaping augmentations to ensure crop_quadruple is calculated properly. 
The flip ratio, which determines the probability of implementing flip, is set to 0.5 by default.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":957-983", + "content": " assert not self.lazy\n results = self._all_box_crop(results, results['crop_bbox'])\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}(crop_size={self.crop_size}, '\n f'lazy={self.lazy})')\n return repr_str\n@PIPELINES.register()\nclass Flip_V2:\n \"\"\"Flip the input images with a probability.\n Reverse the order of elements in the given imgs with a specific direction.\n The shape of the imgs is preserved, but the elements are reordered.\n Required keys are \"img_shape\", \"modality\", \"imgs\" (optional), \"keypoint\"\n (optional), added or modified keys are \"imgs\", \"keypoint\", \"lazy\" and\n \"flip_direction\". Required keys in \"lazy\" is None, added or modified key\n are \"flip\" and \"flip_direction\". The Flip augmentation should be placed\n after any cropping / reshaping augmentations, to make sure crop_quadruple\n is calculated properly.\n Args:\n flip_ratio (float): Probability of implementing flip. Default: 0.5." + }, + { + "comment": "This code snippet is for an object called \"SkeletonPipeline\". It has parameters for direction, flip_label_map, left_kp, right_kp, and lazy. The direction parameter can be either 'horizontal' or 'vertical'. Flip_label_map is a dictionary used to transform the label of flipped images. Left_kp and right_kp are indexes used for flipping keypoints. Lazy determines whether to apply lazy operations, default set to False. The function checks if the given direction is within ['horizontal', 'vertical'].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":984-1005", + "content": " direction (str): Flip imgs horizontally or vertically. Options are\n \"horizontal\" | \"vertical\". Default: \"horizontal\".\n flip_label_map (Dict[int, int] | None): Transform the label of the\n flipped image with the specific label. Default: None.\n left_kp (list[int]): Indexes of left keypoints, used to flip keypoints.\n Default: None.\n right_kp (list[ind]): Indexes of right keypoints, used to flip\n keypoints. Default: None.\n lazy (bool): Determine whether to apply lazy operation. Default: False.\n \"\"\"\n _directions = ['horizontal', 'vertical']\n def __init__(self,\n flip_ratio=0.5,\n direction='horizontal',\n flip_label_map=None,\n left_kp=None,\n right_kp=None,\n lazy=False):\n if direction not in self._directions:\n raise ValueError(f'Direction {direction} is not supported. '\n f'Currently support ones are {self._directions}')" + }, + { + "comment": "This code is a part of the skeleton_pipeline module in PaddleVideo. It initializes parameters such as flip ratio, direction, and flips label map, and defines a function _flip_imgs for image flipping and another function _flip_kps for keypoints flipping based on the direction provided. 
The code also includes conditions to handle flow images specifically by inverting the first frame of each two frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1006-1033", + "content": " self.flip_ratio = flip_ratio\n self.direction = direction\n self.flip_label_map = flip_label_map\n self.left_kp = left_kp\n self.right_kp = right_kp\n self.lazy = lazy\n def _flip_imgs(self, imgs, modality):\n _ = [imflip_(img, self.direction) for img in imgs]\n lt = len(imgs)\n if modality == 'Flow':\n # The 1st frame of each 2 frames is flow-x\n for i in range(0, lt, 2):\n imgs[i] = iminvert(imgs[i])\n return imgs\n def _flip_kps(self, kps, kpscores, img_width):\n kp_x = kps[..., 0]\n kp_x[kp_x != 0] = img_width - kp_x[kp_x != 0]\n new_order = list(range(kps.shape[2]))\n if self.left_kp is not None and self.right_kp is not None:\n for left, right in zip(self.left_kp, self.right_kp):\n new_order[left] = right\n new_order[right] = left\n kps = kps[:, :, new_order]\n if kpscores is not None:\n kpscores = kpscores[:, :, new_order]\n return kps, kpscores" + }, + { + "comment": "This code snippet is from the PaddleVideo library's skeleton pipeline. It defines a function for flipping bounding boxes and a method that applies horizontal flip augmentation to images, unless the image contains keypoints where only horizontal flip is supported. The code checks if the modality of the image is 'Flow'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1035-1064", + "content": " @staticmethod\n def _box_flip(box, img_width):\n \"\"\"Flip the bounding boxes given the width of the image.\n Args:\n box (np.ndarray): The bounding boxes.\n img_width (int): The img width.\n \"\"\"\n box_ = box.copy()\n box_[..., 0::4] = img_width - box[..., 2::4]\n box_[..., 2::4] = img_width - box[..., 0::4]\n return box_\n def __call__(self, results):\n \"\"\"Performs the Flip augmentation.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n _init_lazy_if_proper(results, self.lazy)\n if 'keypoint' in results:\n assert not self.lazy, ('Keypoint Augmentations are not compatible '\n 'with lazy == True')\n assert self.direction == 'horizontal', (\n 'Only horizontal flips are'\n 'supported for human keypoints')\n modality = results['modality']\n if modality == 'Flow':" + }, + { + "comment": "The code checks if the direction is horizontal, flips the image randomly based on a flip ratio, and updates results accordingly. If the flip label map is not None and flip occurs, it updates the label in the results. If lazy is not set, it flips images and keypoints if necessary, updating results. 
Otherwise, it stores the operation for later execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1065-1089", + "content": " assert self.direction == 'horizontal'\n flip = np.random.rand() < self.flip_ratio\n results['flip'] = flip\n results['flip_direction'] = self.direction\n img_width = results['img_shape'][1]\n if self.flip_label_map is not None and flip:\n results['label'] = self.flip_label_map.get(results['label'],\n results['label'])\n if not self.lazy:\n if flip:\n if 'imgs' in results:\n results['imgs'] = self._flip_imgs(results['imgs'], modality)\n if 'keypoint' in results:\n kp = results['keypoint']\n kpscore = results.get('keypoint_score', None)\n kp, kpscore = self._flip_kps(kp, kpscore, img_width)\n results['keypoint'] = kp\n if 'keypoint_score' in results:\n results['keypoint_score'] = kpscore\n else:\n lazyop = results['lazy']" + }, + { + "comment": "This code snippet is part of the \"SkeletonPipeline\" class in PaddleVideo. It checks if the 'flip' parameter is set and applies horizontal flipping to the 'gt_bboxes' and 'proposals' (if present) based on the direction specified. It also defines a __repr__ method for the class, providing information about its attributes, and registers a new pipeline called \"FormatShape\" for use in the system.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1090-1116", + "content": " if lazyop['flip']:\n raise NotImplementedError('Use one Flip please')\n lazyop['flip'] = flip\n lazyop['flip_direction'] = self.direction\n if 'gt_bboxes' in results and flip:\n assert not self.lazy and self.direction == 'horizontal'\n width = results['img_shape'][1]\n results['gt_bboxes'] = self._box_flip(results['gt_bboxes'], width)\n if 'proposals' in results and results['proposals'] is not None:\n assert results['proposals'].shape[1] == 4\n results['proposals'] = self._box_flip(results['proposals'],\n width)\n return results\n def __repr__(self):\n repr_str = (\n f'{self.__class__.__name__}('\n f'flip_ratio={self.flip_ratio}, direction={self.direction}, '\n f'flip_label_map={self.flip_label_map}, lazy={self.lazy})')\n return repr_str\n@PIPELINES.register()\nclass FormatShape:\n \"\"\"Format final imgs shape to the given input_format." + }, + { + "comment": "The code defines a class for formatting image data in a specific format based on the input_format parameter. The class takes an input_format and a collapse boolean argument, and checks if the input_format is valid (options are 'NCTHW', 'NCHW', 'NCHW_Flow', or 'NPTCHW'). If results['imgs'] is not of type np.ndarray, it converts it to np.ndarray.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1118-1143", + "content": " Required keys are \"imgs\", \"num_clips\" and \"clip_len\", added or modified\n keys are \"imgs\" and \"input_shape\".\n Args:\n input_format (str): Define the final imgs format.\n collapse (bool): To collpase input_format N... to ... (NCTHW to CTHW,\n etc.) if N is 1. Should be set as True when training and testing\n detectors. 
Default: False.\n \"\"\"\n def __init__(self, input_format, collapse=False):\n self.input_format = input_format\n self.collapse = collapse\n if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']:\n raise ValueError(\n f'The input format {self.input_format} is invalid.')\n def __call__(self, results):\n \"\"\"Performs the FormatShape formating.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n if not isinstance(results['imgs'], np.ndarray):\n results['imgs'] = np.array(results['imgs'])" + }, + { + "comment": "This code is a part of the SkeletonPipeline class in PaddleVideo. It processes images from results and reshapes them based on the input format specified. If input_format is 'NCTHW', it transposes and reshapes the images accordingly, if it's 'NCHW', it only transposes, and if it's 'NCHW_Flow', it also performs reshaping similar to 'NCTHW'. The 'collapse' check ensures that if results have multiple clips, it won't collapse them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1144-1168", + "content": " imgs = results['imgs']\n # [M x H x W x C]\n # M = 1 * N_crops * N_clips * L\n if self.collapse:\n assert results['num_clips'] == 1\n if self.input_format == 'NCTHW':\n num_clips = results['num_clips']\n clip_len = results['clip_len']\n imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])\n # N_crops x N_clips x L x H x W x C\n imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))\n # N_crops x N_clips x C x L x H x W\n imgs = imgs.reshape((-1, ) + imgs.shape[2:])\n # M' x C x L x H x W\n # M' = N_crops x N_clips\n elif self.input_format == 'NCHW':\n imgs = np.transpose(imgs, (0, 3, 1, 2))\n # M x C x H x W\n elif self.input_format == 'NCHW_Flow':\n num_clips = results['num_clips']\n clip_len = results['clip_len']\n imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])\n # N_crops x N_clips x L x H x W x C" + }, + { + "comment": "This code transforms image data into various formats depending on the input format specified. It supports 'NHWC', 'NCHW', and 'NPTCHW' formats. If the collapse parameter is True, it squeezes the first dimension of the images array. The results dictionary is updated with the transformed images and their shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1169-1196", + "content": " imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))\n # N_crops x N_clips x L x C x H x W\n imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +\n imgs.shape[4:])\n # M' x C' x H x W\n # M' = N_crops x N_clips\n # C' = L x C\n elif self.input_format == 'NPTCHW':\n num_proposals = results['num_proposals']\n num_clips = results['num_clips']\n clip_len = results['clip_len']\n imgs = imgs.reshape((num_proposals, num_clips * clip_len) +\n imgs.shape[1:])\n # P x M x H x W x C\n # M = N_clips x L\n imgs = np.transpose(imgs, (0, 1, 4, 2, 3))\n # P x M x C x H x W\n if self.collapse:\n assert imgs.shape[0] == 1\n imgs = imgs.squeeze(0)\n results['imgs'] = imgs\n results['input_shape'] = imgs.shape\n return results\n def __repr__(self):\n repr_str = self.__class__.__name__" + }, + { + "comment": "The code defines a Pipeline class called \"Collect\" that collects specific data from the loader relevant to the task. It keeps keys as is and gathers items in meta_keys into a meta item called meta_name, typically used as the last stage of the data loader pipeline. 
The Collect class takes keys, meta_name, and meta_keys as arguments, with default values provided for meta_name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1197-1219", + "content": " repr_str += f\"(input_format='{self.input_format}')\"\n return repr_str\n@PIPELINES.register()\nclass Collect:\n \"\"\"Collect data from the loader relevant to the specific task.\n This keeps the items in ``keys`` as it is, and collect items in\n ``meta_keys`` into a meta item called ``meta_name``.This is usually\n the last stage of the data loader pipeline.\n For example, when keys='imgs', meta_keys=('filename', 'label',\n 'original_shape'), meta_name='img_metas', the results will be a dict with\n keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of\n another dict with keys 'filename', 'label', 'original_shape'.\n Args:\n keys (Sequence[str]): Required keys to be collected.\n meta_name (str): The name of the key that contains meta infomation.\n This key is always populated. Default: \"img_metas\".\n meta_keys (Sequence[str]): Keys that are collected under meta_name.\n The contents of the ``meta_name`` dictionary depends on\n ``meta_keys``." + }, + { + "comment": "This code defines a dictionary containing default parameters for image data loading. It includes fields such as \"filename\", \"label\", \"original_shape\", \"img_shape\", \"pad_shape\", \"flip_direction\", and \"img_norm_cfg\". The \"nested\" argument determines whether these parameters should be applied recursively to all items within the data dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1220-1237", + "content": " By default this includes:\n - \"filename\": path to the image file\n - \"label\": label of the image file\n - \"original_shape\": original shape of the image as a tuple\n (h, w, c)\n - \"img_shape\": shape of the image input to the network as a tuple\n (h, w, c). Note that images may be zero padded on the\n bottom/right, if the batch tensor is larger than this shape.\n - \"pad_shape\": image shape after padding\n - \"flip_direction\": a str in (\"horiziontal\", \"vertival\") to\n indicate if the image is fliped horizontally or vertically.\n - \"img_norm_cfg\": a dict of normalization information:\n - mean - per channel mean subtraction\n - std - per channel std divisor\n - to_rgb - bool indicating if bgr was converted to rgb\n nested (bool): If set as True, will apply data[x] = [data[x]] to all\n items in data. The arg is added for compatibility. Default: False." + }, + { + "comment": "The code defines a class that initializes with specified keys and optional metadata keys. It executes a call method to perform Collect formating on input results, appending each key's data into a list and adding any specified metadata as well. The __repr__ method provides a string representation of the object including its attributes. 
This pipeline component is registered in @PIPELINES using decorator.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1238-1274", + "content": " \"\"\"\n def __init__(self,\n keys,\n meta_keys=('filename', 'label', 'original_shape', 'img_shape',\n 'pad_shape', 'flip_direction', 'img_norm_cfg'),\n meta_name='img_metas'):\n self.keys = keys\n self.meta_keys = meta_keys\n self.meta_name = meta_name\n def __call__(self, results):\n \"\"\"Performs the Collect formating.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n data = []\n for key in self.keys:\n data.append(results[key])\n if len(self.meta_keys) != 0:\n meta = {}\n for key in self.meta_keys:\n meta[key] = results[key]\n data.append(meta)\n return data\n def __repr__(self):\n return (f'{self.__class__.__name__}('\n f'keys={self.keys}, meta_keys={self.meta_keys}, '\n f'nested={self.nested})')\n@PIPELINES.register()" + }, + { + "comment": "This code defines a class `GeneratePoseTarget` that generates pseudo heatmaps based on joint coordinates and confidence, with optional use of score, limbs, or skeletons. It takes in required keys \"keypoint\", \"img_shape\", and \"keypoint_score\" (optional). It adds or modifies keys as \"imgs\". The class has parameters like sigma, use_score, with_kp, with_limb, skeletons, and double.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1275-1293", + "content": "class GeneratePoseTarget:\n \"\"\"Generate pseudo heatmaps based on joint coordinates and confidence.\n Required keys are \"keypoint\", \"img_shape\", \"keypoint_score\" (optional),\n added or modified keys are \"imgs\".\n Args:\n sigma (float): The sigma of the generated gaussian map. Default: 0.6.\n use_score (bool): Use the confidence score of keypoints as the maximum\n of the gaussian maps. Default: True.\n with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True.\n with_limb (bool): Generate pseudo heatmaps for limbs. At least one of\n 'with_kp' and 'with_limb' should be True. Default: False.\n skeletons (tuple[tuple]): The definition of human skeletons.\n Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9),\n (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15),\n (6, 12), (12, 14), (14, 16), (11, 12)),\n which is the definition of COCO-17p skeletons.\n double (bool): Output both original heatmaps and flipped heatmaps." + }, + { + "comment": "The function initializes skeleton parameters such as sigma, use_score, with_kp, with_limb, skeletons, double, left_kp, and right_kp. It sets default values for these parameters to be used in the skeleton pipeline process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1294-1317", + "content": " Default: False.\n left_kp (tuple[int]): Indexes of left keypoints, which is used when\n flipping heatmaps. Default: (1, 3, 5, 7, 9, 11, 13, 15),\n which is left keypoints in COCO-17p.\n right_kp (tuple[int]): Indexes of right keypoints, which is used when\n flipping heatmaps. 
Default: (2, 4, 6, 8, 10, 12, 14, 16),\n which is right keypoints in COCO-17p.\n \"\"\"\n def __init__(self,\n sigma=0.6,\n use_score=True,\n with_kp=True,\n with_limb=False,\n skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7),\n (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13),\n (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)),\n double=False,\n left_kp=(1, 3, 5, 7, 9, 11, 13, 15),\n right_kp=(2, 4, 6, 8, 10, 12, 14, 16)):\n self.sigma = sigma\n self.use_score = use_score\n self.with_kp = with_kp" + }, + { + "comment": "This code is part of the SkeletonPipeline class, which appears to be related to skeleton detection or tracking. The class takes in parameters such as with_limb, double, eps, left_kp, right_kp, and skeletons. It generates a heatmap for one keypoint in one frame using the generate_a_heatmap method. This method takes in img_h, img_w, centers, sigma, and max_values as parameters to create a pseudo heatmap with a zero initial state, iterates through each center-max_value pair, and fills the heatmap accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1318-1348", + "content": " self.with_limb = with_limb\n self.double = double\n # an auxiliary const\n self.eps = 1e-4\n assert self.with_kp or self.with_limb, (\n 'At least one of \"with_limb\" '\n 'and \"with_kp\" should be set as True.')\n self.left_kp = left_kp\n self.right_kp = right_kp\n self.skeletons = skeletons\n def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values):\n \"\"\"Generate pseudo heatmap for one keypoint in one frame.\n Args:\n img_h (int): The height of the heatmap.\n img_w (int): The width of the heatmap.\n centers (np.ndarray): The coordinates of corresponding keypoints\n (of multiple persons).\n sigma (float): The sigma of generated gaussian.\n max_values (np.ndarray): The max values of each keypoint.\n Returns:\n np.ndarray: The generated pseudo heatmap.\n \"\"\"\n heatmap = np.zeros([img_h, img_w], dtype=np.float32)\n for center, max_value in zip(centers, max_values):" + }, + { + "comment": "This function generates a heatmap for a limb in a frame by calculating the Gaussian kernel patch, and updating the heatmap with it. It checks if the keypoint is within the image boundaries before processing. The keypoint positions are calculated based on center coordinates and sigma values. If the keypoints are not within the image bounds, the function continues to the next iteration without updating the heatmap.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1349-1374", + "content": " mu_x, mu_y = center[0], center[1]\n if max_value < self.eps:\n continue\n st_x = max(int(mu_x - 3 * sigma), 0)\n ed_x = min(int(mu_x + 3 * sigma) + 1, img_w)\n st_y = max(int(mu_y - 3 * sigma), 0)\n ed_y = min(int(mu_y + 3 * sigma) + 1, img_h)\n x = np.arange(st_x, ed_x, 1, np.float32)\n y = np.arange(st_y, ed_y, 1, np.float32)\n # if the keypoint not in the heatmap coordinate system\n if not (len(x) and len(y)):\n continue\n y = y[:, None]\n patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2)\n patch = patch * max_value\n heatmap[st_y:ed_y, st_x:ed_x] = np.maximum(\n heatmap[st_y:ed_y, st_x:ed_x], patch)\n return heatmap\n def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma,\n start_values, end_values):\n \"\"\"Generate pseudo heatmap for one limb in one frame." 
+ }, + { + "comment": "This function takes in parameters such as image height, width, keypoint coordinates, and values for each limb. It then generates a pseudo heatmap by iterating through the inputs, calculates a value coefficient for each limb based on their start and end values, and returns a numpy array representing the generated heatmap.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1376-1397", + "content": " Args:\n img_h (int): The height of the heatmap.\n img_w (int): The width of the heatmap.\n starts (np.ndarray): The coordinates of one keypoint in the\n corresponding limbs (of multiple persons).\n ends (np.ndarray): The coordinates of the other keypoint in the\n corresponding limbs (of multiple persons).\n sigma (float): The sigma of generated gaussian.\n start_values (np.ndarray): The max values of one keypoint in the\n corresponding limbs.\n end_values (np.ndarray): The max values of the other keypoint in\n the corresponding limbs.\n Returns:\n np.ndarray: The generated pseudo heatmap.\n \"\"\"\n heatmap = np.zeros([img_h, img_w], dtype=np.float32)\n for start, end, start_value, end_value in zip(starts, ends,\n start_values, end_values):\n value_coeff = min(start_value, end_value)" + }, + { + "comment": "This code calculates the distance between a pair of keypoints (start and end) for every pixel in the image, based on certain conditions like value_coeff and sigma. It also adjusts the x and y coordinates to avoid out-of-bounds errors. If the resulting arrays of x and y are empty, it skips processing this pair of keypoints. The calculated distances are used for further processing in the codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1398-1426", + "content": " if value_coeff < self.eps:\n continue\n min_x, max_x = min(start[0], end[0]), max(start[0], end[0])\n min_y, max_y = min(start[1], end[1]), max(start[1], end[1])\n min_x = max(int(min_x - 3 * sigma), 0)\n max_x = min(int(max_x + 3 * sigma) + 1, img_w)\n min_y = max(int(min_y - 3 * sigma), 0)\n max_y = min(int(max_y + 3 * sigma) + 1, img_h)\n x = np.arange(min_x, max_x, 1, np.float32)\n y = np.arange(min_y, max_y, 1, np.float32)\n if not (len(x) and len(y)):\n continue\n y = y[:, None]\n x_0 = np.zeros_like(x)\n y_0 = np.zeros_like(y)\n # distance to start keypoints\n d2_start = ((x - start[0])**2 + (y - start[1])**2)\n # distance to end keypoints\n d2_end = ((x - end[0])**2 + (y - end[1])**2)\n # the distance between start and end keypoints.\n d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2)" + }, + { + "comment": "This code calculates the dominant points and updates the heatmap by applying a Gaussian kernel. It checks if a point is within the start or end of a line segment, and if not, it computes the distance between the point and the line segment. It then uses this distance to compute a weight for each dominant point (start, end) and updates the heatmap using these weights. This ensures that the dominant points have more influence on the heatmap than the less dominant ones.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1428-1452", + "content": " if d2_ab < 1:\n full_map = self.generate_a_heatmap(img_h, img_w, [start], sigma,\n [start_value])\n heatmap = np.maximum(heatmap, full_map)\n continue\n coeff = (d2_start - d2_end + d2_ab) / 2. 
/ d2_ab\n a_dominate = coeff <= 0\n b_dominate = coeff >= 1\n seg_dominate = 1 - a_dominate - b_dominate\n position = np.stack([x + y_0, y + x_0], axis=-1)\n projection = start + np.stack([coeff, coeff],\n axis=-1) * (end - start)\n d2_line = position - projection\n d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2\n d2_seg = (a_dominate * d2_start + b_dominate * d2_end +\n seg_dominate * d2_line)\n patch = np.exp(-d2_seg / 2. / sigma**2)\n patch = patch * value_coeff\n heatmap[min_y:max_y, min_x:max_x] = np.maximum(\n heatmap[min_y:max_y, min_x:max_x], patch)" + }, + { + "comment": "This code generates a heatmap for all keypoints and limbs in one frame. It takes the height, width, coordinates of keypoints, sigma value, and confidence scores as input. The function iterates over each keypoint to generate separate heatmaps if 'with_kp' is enabled, then appends these individual heatmaps to a list called 'heatmaps'. If 'with_limb' is also enabled, the code generates additional heatmaps for each limb defined in 'self.skeletons' by iterating over them and finding corresponding start and end indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1454-1482", + "content": " return heatmap\n def generate_heatmap(self, img_h, img_w, kps, sigma, max_values):\n \"\"\"Generate pseudo heatmap for all keypoints and limbs in one frame (if\n needed).\n Args:\n img_h (int): The height of the heatmap.\n img_w (int): The width of the heatmap.\n kps (np.ndarray): The coordinates of keypoints in this frame.\n sigma (float): The sigma of generated gaussian.\n max_values (np.ndarray): The confidence score of each keypoint.\n Returns:\n np.ndarray: The generated pseudo heatmap.\n \"\"\"\n heatmaps = []\n if self.with_kp:\n num_kp = kps.shape[1]\n for i in range(num_kp):\n heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i],\n sigma, max_values[:, i])\n heatmaps.append(heatmap)\n if self.with_limb:\n for limb in self.skeletons:\n start_idx, end_idx = limb\n starts = kps[:, start_idx]" + }, + { + "comment": "The code defines a function that generates pseudo heatmaps for all frames in an image sequence. It extracts keypoint coordinates and scores from the input results, and then creates heatmaps by calling another function to generate limb heatmaps for each frame. 
These heatmaps are appended into a list and finally stacked along a specific axis to form the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1483-1516", + "content": " ends = kps[:, end_idx]\n start_values = max_values[:, start_idx]\n end_values = max_values[:, end_idx]\n heatmap = self.generate_a_limb_heatmap(\n img_h, img_w, starts, ends, sigma, start_values, end_values)\n heatmaps.append(heatmap)\n return np.stack(heatmaps, axis=-1)\n def gen_an_aug(self, results):\n \"\"\"Generate pseudo heatmaps for all frames.\n Args:\n results (dict): The dictionary that contains all info of a sample.\n Returns:\n list[np.ndarray]: The generated pseudo heatmaps.\n \"\"\"\n all_kps = results['keypoint']\n kp_shape = all_kps.shape\n if 'keypoint_score' in results:\n all_kpscores = results['keypoint_score']\n else:\n all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32)\n img_h, img_w = results['img_shape']\n num_frame = kp_shape[1]\n imgs = []\n for i in range(num_frame):\n sigma = self.sigma" + }, + { + "comment": "This code defines a class that generates heatmaps from keypoints and applies data augmentation to images. It takes in results as input, generates image heatmaps based on the keypoints, and optionally doubles the output by applying horizontal flipping with a specified left and right keypoint. The sigma value is used for Gaussian filtering, and the use_score flag determines whether scores are considered for generating heatmaps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1517-1546", + "content": " kps = all_kps[:, i]\n kpscores = all_kpscores[:, i]\n max_values = np.ones(kpscores.shape, dtype=np.float32)\n if self.use_score:\n max_values = kpscores\n hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values)\n imgs.append(hmap)\n return imgs\n def __call__(self, results):\n if not self.double:\n results['imgs'] = np.stack(self.gen_an_aug(results))\n else:\n results_ = cp.deepcopy(results)\n flip = Flip_V2(\n flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp)\n results_ = flip(results_)\n results['imgs'] = np.concatenate(\n [self.gen_an_aug(results),\n self.gen_an_aug(results_)])\n results['label'] = np.array([results['label']])\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'sigma={self.sigma}, '\n f'use_score={self.use_score}, '" + }, + { + "comment": "This code is formatting a string that represents the parameters for a skeleton pipeline. The parameters include whether to output keypoints and limbs, the number of skeletons, and left/right keypoint options.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py\":1547-1553", + "content": " f'with_kp={self.with_kp}, '\n f'with_limb={self.with_limb}, '\n f'skeletons={self.skeletons}, '\n f'double={self.double}, '\n f'left_kp={self.left_kp}, '\n f'right_kp={self.right_kp})')\n return repr_str" + } + ] +} \ No newline at end of file diff --git a/docs/doc/66865755-6b7e-42fe-96dd-ffce2b85fd05.json b/docs/doc/66865755-6b7e-42fe-96dd-ffce2b85fd05.json new file mode 100644 index 000000000..debc48b29 --- /dev/null +++ b/docs/doc/66865755-6b7e-42fe-96dd-ffce2b85fd05.json @@ -0,0 +1,10 @@ +{ + "summary": "This is the Chinese version of README file for Ma-Net video segmentation model implementation in Paddle. 
It introduces a Paddle implementation of CVPR 2020 paper, \"Memory aggregation networks for efficient interactive video object segmentation\". The code currently supports model training and testing on DAVIS dataset, with future support for model inference on any given video. Download stage1 pre-trained model or trained model for direct training, and use the provided model for testing. Achieved J@60 and AUC scores of 0.761 and 0.749 respectively on DAVIS2017 test set.", + "details": [ + { + "comment": "This is the Chinese version of README file for Ma-Net video segmentation model implementation in Paddle. It introduces a Paddle implementation of CVPR 2020 paper, \"Memory aggregation networks for efficient interactive video object segmentation\". The code currently supports model training and testing on DAVIS dataset, with future support for model inference on any given video. Download stage1 pre-trained model or trained model for direct training, and use the provided model for testing. Achieved J@60 and AUC scores of 0.761 and 0.749 respectively on DAVIS2017 test set.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/README_cn.md\":0-45", + "content": "[English](README.md) | \u7b80\u4f53\u4e2d\u6587\n# Ma-Net\u89c6\u9891\u5207\u5206\u6a21\u578b\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u6d4b\u8bd5](#\u6a21\u578b\u6d4b\u8bd5)\n- [\u6a21\u578b\u63a8\u7406](#\u6a21\u578b\u63a8\u7406)\n## \u6a21\u578b\u7b80\u4ecb\n\u8fd9\u662fCVPR2020\u8bba\u6587\"[Memory aggregation networks for efficient interactive video object segmentation](https://arxiv.org/abs/2003.13246)\"\u7684Paddle\u5b9e\u73b0\u3002\n![avatar](images/1836-teaser.gif)\n\u6b64\u4ee3\u7801\u76ee\u524d\u652f\u6301\u5728 DAVIS \u6570\u636e\u96c6\u4e0a\u8fdb\u884c\u6a21\u578b\u6d4b\u8bd5\u548c\u6a21\u578b\u8bad\u7ec3\uff0c\u5e76\u4e14\u5c06\u5728\u4e4b\u540e\u63d0\u4f9b\u5bf9\u4efb\u4f55\u7ed9\u5b9a\u89c6\u9891\u7684\u6a21\u578b\u63a8\u7406\u3002\n## \u6570\u636e\u51c6\u5907\nDAVIS\u6570\u636e\u4e0b\u8f7d\u53ca\u51c6\u5907\u8bf7\u53c2\u8003[DAVIS2017\u6570\u636e\u51c6\u5907](dataloaders/DAVIS2017_cn.md)\n## \u6a21\u578b\u8bad\u7ec3\u4e0e\u6d4b\u8bd5\n- \u60a8\u53ef\u4ee5\u4e0b\u8f7d[paddle\u7248\u672c\u7684stage1\u9884\u8bad\u7ec3\u6a21\u578b](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/DeeplabV3_coco.pdparams) \u89e3\u538b\u7f29\u5b83\u4ee5\u7528\u4e8e\u8bad\u7ec3\u7684\u7b2c\u4e00\u9636\u6bb5\u3002\n- \u60a8\u53ef\u4ee5\u4e0b\u8f7d[stage1\u8bad\u7ec3\u7ed3\u679c\u6a21\u578b](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MaNet_davis2017_stage1.pdparams) \u89e3\u538b\u7f29\u5b83\u4ee5\u76f4\u63a5\u8bad\u7ec3\u7684\u7b2c\u4e8c\u9636\u6bb5\u8df3\u8fc7\u7b2c\u4e00\u9636\u6bb5\u7684\u8bad\u7ec3\u3002\n ```bash\n sh run.sh\n ```\n- \u60a8\u53ef\u4ee5\u4e0b\u8f7d[\u6211\u4eec\u7684\u6a21\u578b](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MANet_davis2017.pdparams) \u89e3\u538b\u7f29\u5b83\u4ee5\u7528\u4e8e\u6d4b\u8bd5\u3002\n\u5728 DAVIS2017\u4e0a\u7684\u6d4b\u8bd5\u7cbe\u5ea6:\n| J@60 | AUC |\n| :---: | :---: |\n| 0.761 | 0.749 |" + } + ] +} \ No newline at end of file diff --git a/docs/doc/66ee10d8-ca1c-4ae7-a954-ccbc36442fa4.json b/docs/doc/66ee10d8-ca1c-4ae7-a954-ccbc36442fa4.json new file mode 100644 index 000000000..2c934b352 --- /dev/null +++ b/docs/doc/66ee10d8-ca1c-4ae7-a954-ccbc36442fa4.json @@ -0,0 +1,20 @@ +{ + 
"summary": "The code is from PaddleVideo's BasketballAction application, importing modules and defining AttrDict class. It loads config file into an AttrDict object, processes nested dictionaries, prints configurations, and logs a separator line using the logger module for organization and readability purposes.", + "details": [ + { + "comment": "This code is from the PaddleVideo library's BasketballAction application. It imports yaml and ast modules, as well as a logger class. The code defines a constant list of section names (train, valid, test, infer). It also defines an AttrDict class to handle dictionaries with attributes like getattr and setattr methods. The parse_config function is defined which takes a configuration file as input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/config_utils.py\":0-45", + "content": "\"\"\"\nconfig_utils\n\"\"\"\n# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport yaml\nimport ast\nimport logger\nlogger = logger.Logger()\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\nclass AttrDict(dict):\n \"\"\"\n AttrDict\n \"\"\"\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef parse_config(cfg_file):" + }, + { + "comment": "This code is responsible for loading a configuration file into an AttrDict object, processing the nested dictionary structure, and printing the configurations. It uses the yaml library to load the file, and the create_attr_dict function to handle nested dictionaries and convert strings to appropriate data types. The print_configs function prints the configuration in a formatted manner for readability.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/config_utils.py\":46-78", + "content": " \"\"\"Load a config file into AttrDict\"\"\"\n import yaml\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef create_attr_dict(yaml_config):\n \"\"\"create_attr_dict\"\"\"\n for key, value in yaml_config.items():\n if isinstance(value, dict):\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = ast.literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\n return\ndef print_configs(cfg, mode):\n \"\"\"print_configs\"\"\"\n logger.info(\"---------------- {:>5} Arguments ----------------\".format(\n mode))\n for sec, sec_items in cfg.items():\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():\n logger.info(\" {}:{}\".format(k, v))" + }, + { + "comment": "This code snippet is logging a separator line using the logger module. 
The purpose of this logger statement might be to visually separate different sections or parts of the code for readability and organization purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/utils/config_utils.py\":79-79", + "content": " logger.info(\"-------------------------------------------------\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6743942c-ab21-45d8-aa16-a29816e42569.json b/docs/doc/6743942c-ab21-45d8-aa16-a29816e42569.json new file mode 100644 index 000000000..cca9cb22c --- /dev/null +++ b/docs/doc/6743942c-ab21-45d8-aa16-a29816e42569.json @@ -0,0 +1,20 @@ +{ + "summary": "This code presents a comprehensive table of datasets for action recognition, localization, and spatio-temporal action detection, covering various categories like Skeleton-based Action Recognition and Text-Video Retrieval. The table includes dataset names, homepages, and publication years from different conferences like CVPR and ICCV.", + "details": [ + { + "comment": "The code provides a table listing various datasets for action recognition, action localization, and spatio-temporal action detection. It includes links to dataset homepages and their respective publication years.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/README.md\":0-27", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](../../zh_CN/dataset/README.md)\n# Dataset\n## 1. Dataset List\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n " + }, + { + "comment": "This code snippet is a table of datasets. It mentions dataset names, their corresponding homepages, and the year they were published in. The categories include Skeleton-based Action Recognition, Depth Estimation, and Text-Video Retrieval.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/README.md\":28-57", + "content": " \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n
Action Recognition
Kinetics-400 (Homepage) (CVPR'2017) | UCF101 (Homepage) (CRCV-IR-12-01) | ActivityNet (Homepage) (CVPR'2015) | YouTube-8M (Homepage) (CVPR'2017)
Action Localization
ActivityNet (Homepage) (CVPR'2015)
Spatio-Temporal Action Detection
AVA (Homepage) (CVPR'2018)
Skeleton-based Action Recognition
NTURGB+D (Homepage) (IEEE CS'2016) | FSD (Homepage)
Depth Estimation
Oxford-RobotCar (Homepage) (IJRR'2017)
Text-Video Retrieval
MSR-VTT (<" + }, + { + "comment": "This code appears to be part of a table within an HTML file. The table lists different datasets related to video and text, with their respective names, descriptions, and links to their homepages or documentation. It also provides information on the year of publication for each dataset, which seems to be from various conferences such as CVPR and ICCV.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/README.md\":57-72", + "content": "a href=\"https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/\" rel=\"nofollow\">Homepage) (CVPR'2016)
Text-Video Pretrained Model
HowTo100M (Homepage) (ICCV'2019)
" + } + ] +} \ No newline at end of file diff --git a/docs/doc/67f9dbca-b1f4-4ef2-93a4-4af3f80cadfa.json b/docs/doc/67f9dbca-b1f4-4ef2-93a4-4af3f80cadfa.json new file mode 100644 index 000000000..cc2830448 --- /dev/null +++ b/docs/doc/67f9dbca-b1f4-4ef2-93a4-4af3f80cadfa.json @@ -0,0 +1,80 @@ +{ + "summary": "This code constructs a PaddlePaddle DeepLab network with convolution layers, batch normalization and activation functions in Bottleneck and ResNet classes. It includes additional layers for better performance, initializes ASPP modules in the DeepLab model for feature extraction, defines a segmentation model with ResNet backbone, adaptive pooling, and Decoder modules, and performs inference using forward function.", + "details": [ + { + "comment": "This code defines a class `FrozenBatchNorm2D` which is a type of batch normalization layer where the batch statistics and affine parameters are fixed. It inherits from `nn.Layer` and initializes `paddle.ones` and `paddle.zeros` tensors as its parameters, representing fixed batch statistics and affine transformation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":0-32", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport copy\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nclass FrozenBatchNorm2D(nn.Layer):\n \"\"\"\n BatchNorm2D where the batch statistics and the affine parameters\n are fixed\n \"\"\"\n def __init__(self, n, epsilon=1e-5):\n super(FrozenBatchNorm2D, self).__init__()\n x1 = paddle.ones([n])\n x2 = paddle.zeros([n])\n weight = self.create_parameter(" + }, + { + "comment": "The code defines a DeepLab class, initializes its parameters, and creates a Bottleneck layer. The DeepLab class contains a weight parameter for the convolution operation, a bias parameter to adjust output, and running_mean and running_var parameters used in normalization. The Bottleneck layer has an expansion factor of 4, implying it will increase the number of channels by this factor. 
This code is part of a neural network backbone implementation using PaddlePaddle framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":33-58", + "content": " shape=x1.shape, default_initializer=nn.initializer.Assign(x1))\n bias = self.create_parameter(\n shape=x2.shape, default_initializer=nn.initializer.Assign(x2))\n running_mean = self.create_parameter(\n shape=x2.shape, default_initializer=nn.initializer.Assign(x2))\n running_var = self.create_parameter(\n shape=x1.shape, default_initializer=nn.initializer.Assign(x1))\n self.add_parameter('weight', weight)\n self.add_parameter('bias', bias)\n self.add_parameter('running_mean', running_mean)\n self.add_parameter('running_var', running_var)\n self.epsilon = epsilon\n def forward(self, x):\n scale = self.weight * paddle.rsqrt((self.running_var + self.epsilon))\n bias = self.bias - self.running_mean * scale\n scale = paddle.reshape(scale, [1, -1, 1, 1])\n bias = paddle.reshape(bias, [1, -1, 1, 1])\n return x * scale + bias\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self,\n inplanes," + }, + { + "comment": "Bottleneck class is a convolution neural network layer with batch normalization, designed for DeepLab model. It consists of 3 consecutive convolutions with varying kernel sizes and stride. BatchNorm layers are used after each convolution to normalize the activations, followed by ReLU activation function. The output channels are scaled by 4 in the final convolution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":59-85", + "content": " planes,\n stride=1,\n dilation=1,\n downsample=None,\n BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)\n self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes,\n planes * 4,\n kernel_size=1,\n bias_attr=False)\n self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n def forward(self, x):" + }, + { + "comment": "Code snippet performs residual block operations using convolutional layers and batch normalization with ReLU activation. It also includes downsampling if specified, and returns the output after applying final ReLU. 
The class ResNet initializes a ResNet network with given number of blocks, output stride, BatchNorm type, and pretrained option.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":86-129", + "content": " residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNet(nn.Layer):\n def __init__(self,\n block,\n layers,\n output_stride,\n BatchNorm,\n pretrained=False):\n self.inplanes = 64\n super(ResNet, self).__init__()\n blocks = [1, 2, 4]\n if output_stride == 16:\n strides = [1, 2, 2, 1]\n dilations = [1, 1, 1, 2]\n elif output_stride == 8:\n strides = [1, 2, 1, 1]\n dilations = [1, 1, 2, 4]\n else:\n raise NotImplementedError\n # Modules\n self.conv1 = nn.Conv2D(3,\n 64," + }, + { + "comment": "This code is defining a deep learning model with convolutional layers, batch normalization, and activation functions. It uses the DeepLab backbone architecture and specifies parameters such as kernel sizes, strides, padding, and dilation rates for each layer. The BatchNorm parameter allows for optional batch normalization between layers, improving model performance by reducing internal covariate shift.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":130-151", + "content": " kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block,\n 64,\n layers[0],\n stride=strides[0],\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block,\n 128,\n layers[1],\n stride=strides[1],\n dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block,\n 256," + }, + { + "comment": "This code defines a class with two layers, layer3 and layer4. Layer3 is created using the _make_MG_unit function with specific parameters like block, planes, blocks, stride, dilation, and BatchNorm. Layer4 is also created by calling _make_layer function. Downsampling is done if stride is not 1 or inplanes are not equal to planes*block.expansion.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":152-175", + "content": " layers[2],\n stride=strides[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.layer4 = self._make_MG_unit(block,\n 512,\n blocks=blocks,\n stride=strides[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self._init_weight()\n def _make_layer(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion," + }, + { + "comment": "This code defines a function _make_MG_unit that creates a module for the DeepLab model, which includes multiple layers of a specified block. It takes in parameters such as block, planes, blocks, stride, dilation, and BatchNorm (optional). The function first checks if downsampling is needed based on stride and inplanes. If so, it creates a Conv2D layer for downsampling. 
Then, it appends the initial layer with the specified parameters and expands the number of layers as required. Finally, it returns the created sequence of layers as a nn.Sequential module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":176-206", + "content": " kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, stride, dilation, downsample,\n BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(self.inplanes,\n planes,\n dilation=dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_MG_unit(self,\n block,\n planes,\n blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes," + }, + { + "comment": "This code defines a function that creates a convolutional neural network for the DeepLab model. It takes input, creates layers with specified parameters, and returns a Sequential object representing the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":207-239", + "content": " planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes,\n planes,\n stride,\n dilation=blocks[0] * dilation,\n downsample=downsample,\n BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, len(blocks)):\n layers.append(\n block(self.inplanes,\n planes,\n stride=1,\n dilation=blocks[i] * dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def forward(self, input, return_mid_level=False):\n x = self.conv1(input)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)" + }, + { + "comment": "This code defines a DeepLab model that utilizes an ASPP module. It extracts low and mid-level features from the input, has multiple layers of convolutions, and initializes weights using specific initializers. The ASPP module applies atrous convolutions with different dilation rates for feature extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":240-268", + "content": " low_level_feat = x\n x = self.layer2(x)\n mid_level_feat = x\n x = self.layer3(x)\n x = self.layer4(x)\n if return_mid_level:\n return x, low_level_feat, mid_level_feat\n else:\n return x, low_level_feat\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation,\n BatchNorm):\n super(_ASPPModule, self).__init__()\n self.atrous_conv = nn.Conv2D(inplanes,\n planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation," + }, + { + "comment": "The code defines a DeepLab class with an ASPP module for feature extraction. It initializes the layers and sets their weights using Kaiming normal initialization or fills BatchNorm2D weight with 1 and biases with 0. 
The ASPP class accepts backbone and output_stride as parameters to determine dilations for the ASPP modules.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":269-306", + "content": " bias_attr=False)\n self.bn = BatchNorm(planes)\n self.relu = nn.ReLU()\n self._init_weight()\n def forward(self, x):\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n m.weight_attr = nn.initializer.KaimingNormal()\n elif isinstance(m, nn.BatchNorm2D):\n m.weight.data.fill_(1)\n m.bias.data.zero_()\nclass ASPP(nn.Layer):\n def __init__(self, backbone, output_stride, BatchNorm):\n super(ASPP, self).__init__()\n if backbone == 'drn':\n inplanes = 512\n elif backbone == 'mobilenet':\n inplanes = 320\n else:\n inplanes = 2048\n if output_stride == 16:\n dilations = [1, 6, 12, 18]\n elif output_stride == 8:\n dilations = [1, 12, 24, 36]\n else:\n raise NotImplementedError\n self.aspp1 = _ASPPModule(inplanes," + }, + { + "comment": "This code initializes four instances of the _ASPPModule class, each with different dilation rates and padding values for the DeepLab model's ASPP feature extraction module. The inplanes parameter is consistent across all four modules, indicating the number of input feature planes. BatchNorm specifies whether to apply batch normalization or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":307-329", + "content": " 256,\n 1,\n padding=0,\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.aspp2 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[1],\n dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.aspp3 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.aspp4 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)" + }, + { + "comment": "This code defines a DeepLab backbone model for image segmentation. It has adaptive global average pooling, multiple ASPP modules, and convolutional layers with batch normalization, ReLU activation, and dropout regularization. The constructor initializes the model's sublayers with Kaiming Normal initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":331-362", + "content": " self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False),\n BatchNorm(256), nn.ReLU())\n self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False)\n self.bn1 = BatchNorm(256)\n self.relu = nn.ReLU()\n self.dropout = nn.Dropout(0.1)\n self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat(x=[x1, x2, x3, x4, x5], axis=1)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return self.dropout(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()" + }, + { + "comment": "This code is defining a Decoder class that takes in a backbone and BatchNorm as arguments. It initializes a convolution layer, batch normalization layer, and ReLU activation function. 
The last convolution sequence includes two convolutional layers with BatchNorm between them, followed by an optional second sequence of layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":363-394", + "content": " elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass Decoder(nn.Layer):\n def __init__(self, backbone, BatchNorm):\n super(Decoder, self).__init__()\n if backbone == 'resnet':\n low_level_inplanes = 256\n elif backbone == 'mobilenet':\n raise NotImplementedError\n else:\n raise NotImplementedError\n self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False)\n self.bn1 = BatchNorm(48)\n self.relu = nn.ReLU()\n self.last_conv = nn.Sequential(\n nn.Conv2D(304,\n 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(),\n nn.Sequential(),\n nn.Conv2D(256,\n 256,\n kernel_size=3,\n stride=1,\n padding=1," + }, + { + "comment": "The provided code defines a DeepLab model for segmentation. It includes a convolution layer, batch normalization, ReLU activation function, and interpolation operation. The forward method processes input features and returns output features. The _init_weight method initializes the weight of each sublayer. The DeepLab class takes parameters like backbone type, output stride, and freeze batch normalization flag for model initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":395-425", + "content": " bias_attr=False), BatchNorm(256), nn.ReLU(),\n nn.Sequential())\n self._init_weight()\n def forward(self, x, low_level_feat):\n low_level_feat = self.conv1(low_level_feat)\n low_level_feat = self.bn1(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat(x=[x, low_level_feat], axis=1)\n x = self.last_conv(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass DeepLab(nn.Layer):\n \"\"\"DeepLab model for segmentation\"\"\"\n def __init__(self, backbone='resnet', output_stride=16, freeze_bn=True):" + }, + { + "comment": "The code defines a DeepLab class with an optional frozen Batch Normalization layer. It initializes the backbone network (ResNet) and adds ASPP and Decoder modules. 
The forward function performs inference, returning either the final output or additional intermediate features depending on the return_aspp flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/deeplab.py\":426-453", + "content": " super(DeepLab, self).__init__()\n if freeze_bn == True:\n print(\"Use frozen BN in DeepLab!\")\n BatchNorm = FrozenBatchNorm2D\n else:\n BatchNorm = nn.BatchNorm2D\n self.backbone = ResNet(Bottleneck, [3, 4, 23, 3],\n output_stride,\n BatchNorm,\n pretrained=True)\n self.aspp = ASPP(backbone, output_stride, BatchNorm)\n self.decoder = Decoder(backbone, BatchNorm)\n def forward(self, input, return_aspp=False):\n \"\"\"forward function\"\"\"\n if return_aspp:\n x, low_level_feat, mid_level_feat = self.backbone(input, True)\n else:\n x, low_level_feat = self.backbone(input)\n aspp_x = self.aspp(x)\n x = self.decoder(aspp_x, low_level_feat)\n if return_aspp:\n return x, aspp_x, low_level_feat, mid_level_feat\n else:\n return x, low_level_feat" + } + ] +} \ No newline at end of file diff --git a/docs/doc/683b79f6-e8d9-41ea-8f71-b0ddc8cdb9cf.json b/docs/doc/683b79f6-e8d9-41ea-8f71-b0ddc8cdb9cf.json new file mode 100644 index 000000000..073c792c9 --- /dev/null +++ b/docs/doc/683b79f6-e8d9-41ea-8f71-b0ddc8cdb9cf.json @@ -0,0 +1,20 @@ +{ + "summary": "This Python package, \"paddlevideo\", utilizes PaddlePaddle toolkits for video understanding and supports multiple Python versions. It is set up using setuptools and includes dependencies and documentation.", + "details": [ + { + "comment": "This code is setting up a Python package using setuptools for the PaddleVideo library, specifying its name as \"paddlevideo\". It includes the necessary dependencies listed in the \"requirements.txt\" file and provides a README file located at 'docs/en/whl_en.md' for documentation purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/setup.py\":0-33", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom setuptools import setup\nfrom io import open\nwith open('requirements.txt', encoding=\"utf-8-sig\") as f:\n requirements = f.readlines()\ndef readme():\n \"\"\"readme\"\"\"\n with open('docs/en/whl_en.md', encoding=\"utf-8-sig\") as f:\n README = f.read()\n return README\nsetup(\n name='paddlevideo', #name of .whl file\n packages=['ppvideo'], #install package name\n package_dir={'ppvideo': ''},\n include_package_data=True, #Accept all data files and directories matched by MANIFEST.in" + }, + { + "comment": "This code is a setup file for a Python package named \"ppvideo\" that utilizes PaddlePaddle toolkits for video understanding. It specifies installation requirements, entry points, version, license, description, URL, download link, keywords, and classifiers. 
The package supports multiple versions of Python and is categorized under the Utilities topic.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/setup.py\":34-55", + "content": " install_requires=requirements,\n entry_points={\"console_scripts\": [\"ppvideo= ppvideo.tools.paddlevideo_clas:main\"]},\n version='0.0.1',\n license='Apache License 2.0',\n description='Awesome Video toolkits based on PaddlePaddle ',\n long_description=readme(),\n long_description_content_type='text/markdown',\n url='https://github.com/PaddlePaddle/PaddleVideo',\n download_url='https://github.com/PaddlePaddle/PaddleVideo.git',\n keywords=[\n 'A treasure chest for video understanding powered by PaddlePaddle.'\n ],\n classifiers=[\n 'Intended Audience :: Developers', 'Operating System :: OS Independent',\n 'Natural Language :: Chinese (Simplified)',\n 'Programming Language :: Python :: 3',\n 'Programming Language :: Python :: 3.2',\n 'Programming Language :: Python :: 3.3',\n 'Programming Language :: Python :: 3.4',\n 'Programming Language :: Python :: 3.5',\n 'Programming Language :: Python :: 3.6',\n 'Programming Language :: Python :: 3.7', 'Topic :: Utilities'" + }, + { + "comment": "This code is creating a tuple with empty elements. The specific purpose or usage of this tuple in the context of the setup.py file might require more information to provide an accurate and relevant comment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/setup.py\":56-56", + "content": " ],)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/68a58ce5-b0c3-46e1-b9a4-31e20350a75a.json b/docs/doc/68a58ce5-b0c3-46e1-b9a4-31e20350a75a.json new file mode 100644 index 000000000..1b2aee9ed --- /dev/null +++ b/docs/doc/68a58ce5-b0c3-46e1-b9a4-31e20350a75a.json @@ -0,0 +1,10 @@ +{ + "summary": "The code reads files from a directory, retrieves their names without extensions, checks if the names exist in another set of labels, and deletes any mismatched labels. It then writes the fixed label file with updated sizes.", + "details": [ + { + "comment": "The code reads files from a directory, retrieves their names without extensions, checks if the names exist in another set of labels, and deletes any mismatched labels. 
It then writes the fixed label file with updated sizes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/fix_bad_label.py\":0-36", + "content": "import copy\nimport json\nimport re\nimport os\nurl = '/home/aistudio/work/BMN/Input_for_bmn/feature/'\ndirectory = os.fsencode(url)\ncount = 0\ntarget_set = []\nfor file in os.listdir(directory):\n filename = os.fsdecode(file)\n target_name = filename.split('.npy')[0]\n target_set.append(target_name)\n count += 1\nprint('Feature size:', len(target_set))\nwith open('/home/aistudio/work/BMN/Input_for_bmn/label.json') as f:\n data = json.load(f)\ndelet_set = []\nfor key in data.keys():\n if not key in target_set:\n delet_set.append(key)\nprint('(Label) Original size:', len(data))\nprint('(Label) Deleted size:', len(delet_set))\nfor item in delet_set:\n data.pop(item, None)\nprint('(Label) Fixed size:', len(data))\njsonString = json.dumps(data, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/work/BMN/Input_for_bmn/label_fixed.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6963a95f-f75e-4530-869d-4afbc5a040d6.json b/docs/doc/6963a95f-f75e-4530-869d-4afbc5a040d6.json new file mode 100644 index 000000000..8f8765a8f --- /dev/null +++ b/docs/doc/6963a95f-f75e-4530-869d-4afbc5a040d6.json @@ -0,0 +1,50 @@ +{ + "summary": "This function ensures state dict consistency by comparing optimizer and model parameters, saving/loading checkpoints, and converting sub-bn to normal bn. It checks if certain layers are set to load and prints a message for unloaded weights before loading pre-trained weights and setting the optimizer's state dictionary.", + "details": [ + { + "comment": "This function converts Sub-BN parameters to normal BN parameters in a state dict. It renames `bn.bn` to `bn`, and modifies `_mean` and `_variance` accordingly. This is done before saving or evaluation to maintain consistency with normal BN layers. The modifications are made by iterating through the dictionary and checking if the key ends with the appropriate string, then updating it accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":0-30", + "content": "import os\nimport numpy as np\nimport paddle\nimport copy\ndef sub_to_normal_bn(sd):\n \"\"\"\n When save, Convert the Sub-BN paprameters to normal BN parameters in a state dict.\n There are two copies of BN layers in a Sub-BN implementation: `bn.bn` and\n `bn.split_bn`. `bn.split_bn` is used during training and\n \"compute_precise_bn\". Before saving or evaluation, its stats are copied to\n `bn.bn`. We rename `bn.bn` to `bn` and store it to be consistent with normal\n BN layers.\n Args:\n sd (OrderedDict): a dict of parameters which might contain Sub-BN\n parameters.\n Returns:\n new_sd (OrderedDict): a dict with Sub-BN parameters reshaped to\n normal parameters.\n \"\"\"\n modifications = [\n (\"bn.bn._mean\", \"bn._mean\"),\n (\"bn.bn._variance\", \"bn._variance\"),\n ]\n to_remove = [\"bn.bn.\", \".split_bn.\"]\n key_list = list(sd.keys()) #odict_keys to list\n for key in key_list:\n for before, after in modifications:\n if key.endswith(before):\n new_key = key.split(before)[0] + after" + }, + { + "comment": "This function converts BN parameters to Sub-BN parameters when loading a checkpoint into a model containing Sub-BNs. 
It loops through the model's parameters, if a parameter has the \"bn.split_bn.\" prefix and is not the weight or bias of BN, it renames and moves the corresponding value from the checkpoint dict to the bn.bn key in the same subdict. Finally, it adjusts the shape of the Sub-BN parameters to match the original BN parameters' shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":31-57", + "content": " sd[new_key] = sd.pop(key)\n for rm in to_remove:\n if rm in key and key in sd:\n del sd[key]\ndef normal_to_sub_bn(checkpoint_sd, model_sd):\n \"\"\"\n When load, Convert BN parameters to Sub-BN parameters if model contains Sub-BNs.\n Args:\n checkpoint_sd (OrderedDict): source dict of parameters.\n model_sd (OrderedDict): target dict of parameters.\n Returns:\n new_sd (OrderedDict): converted dict of parameters.\n \"\"\"\n for key in model_sd:\n if key not in checkpoint_sd:\n # not to replace bn.weight and bn.bias\n if \"bn.split_bn.\" in key and \"bn.weight\" not in key and \"bn.bias\" not in key:\n load_key = key.replace(\"bn.split_bn.\", \"bn.\")\n bn_key = key.replace(\"bn.split_bn.\", \"bn.bn.\")\n checkpoint_sd[key] = checkpoint_sd.pop(load_key)\n checkpoint_sd[bn_key] = checkpoint_sd[key]\n # match the shape of bn.split_bn._xx\n # model_sd: split_bn.rm.shape = num_feature*num_split" + }, + { + "comment": "This code is comparing the shape of certain keys in the model and checkpoint dictionaries. If they match certain criteria, it will concatenate the checkpoint key to expand its size based on the model's shape. This is done for specific keys in the dictionary, except 'split_bn'. The function prints out the before and after shapes of the affected keys.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":58-80", + "content": " # checkpoint_sd: split_bn.rm.shape = bn.rm.shape = num_feature\n for key in model_sd:\n if key in checkpoint_sd:\n model_blob_shape = model_sd[key].shape #bn.split_bn\n c2_blob_shape = checkpoint_sd[key].shape #bn.bn\n if (len(model_blob_shape) == 1 and len(c2_blob_shape) == 1\n and model_blob_shape[0] > c2_blob_shape[0]\n and model_blob_shape[0] % c2_blob_shape[0] == 0):\n before_shape = checkpoint_sd[key].shape\n checkpoint_sd[key] = np.concatenate(\n [checkpoint_sd[key]] *\n (model_blob_shape[0] // c2_blob_shape[0]))\n if 'split_bn' not in key: #split_bn is excepted\n print(\"{} {} -> {}\".format(key, before_shape,\n checkpoint_sd[key].shape))\n return checkpoint_sd\ndef mapping_opt_dict(opt_dict, model_key_list):\n \"\"\"\n Paddle Name schedule: conv_1.w -> conv_2.w\n Sometimes: sub_bn -> bn" + }, + { + "comment": "This function takes an optimizer state dict and a list of parameter names from a rebuilt model. It aims to modify the keys in the optimizer state dict to match the new parameters' names, while also considering any added index for better compatibility. 
The function then returns the modified optimizer state dict.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":81-102", + "content": " when re-build model, we desire the parameter name to be coincident,\n but the parameters name index will be added, as conv_1 to conv_2, not conv_1.\n It will raise error if we set old saved parameters to new created optimizer.\n as conv_2 cannot find in state_dict(only conv_1).\n Args:\n opt_dict: optimizer state dict, including the name and value of parameters gradient.\n model_key_list: the parameters name list of re-build model.\n Return: optimizer state dict with modified keys\n \"\"\"\n def get_name_info(PNAME, PN_key_list, key_list):\n min_index = float('inf')\n max_index = 0\n for name in PN_key_list[1:]:\n for key in key_list:\n if name in key:\n index = int(key.split('.')[0].split(name)[-1])\n if index < min_index:\n min_index = index\n if index > max_index:\n max_index = index\n num_name = max_index - min_index + 1\n PNAME[name].append((min_index, max_index, num_name))" + }, + { + "comment": "This code appears to be a part of a larger program that compares the parameters in an optimizer state dict with those in a re-built model. It calculates and prints information about the number of parameters associated with each prefix, checks if batch normalization layers need their names changed, and potentially removes the \"sub_batch_norm3d_\" prefix from consideration. The code assumes that the \"opt_dict\" and \"model\" variables have already been defined elsewhere.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":103-134", + "content": " min_index = float('inf')\n max_index = 0\n PNAME = {\n \"LR_Scheduler\": [],\n \"conv3d_\": [],\n \"linear_\": [],\n \"sub_batch_norm3d_\": [],\n \"batch_norm3d_\": [],\n }\n pd_key_list = list(opt_dict.keys())\n print(\"The number of parameters in saved optimizer state dict = {}\".format(\n len(pd_key_list)))\n print(\"The number of parameters in re-build model list = {}\".format(\n len(model_key_list)))\n # 1 may be LR_Scheduler\n PN_key_list = list(PNAME.keys())\n # get the number of each PNAME\n get_name_info(PNAME, PN_key_list, pd_key_list)\n get_name_info(PNAME, PN_key_list, model_key_list)\n print(\"[Parameters info] prefix: min_index, max_index, number_params: \\n\",\n PNAME)\n # whether to change name of bn layer\n change_name = False\n if PNAME[\"sub_batch_norm3d_\"][0][-1] == -float('inf'):\n PN_key_list.remove(\"sub_batch_norm3d_\")\n if PNAME[\"sub_batch_norm3d_\"][1][-1] != -float('inf'):\n print(\n \"Optimizer state dict saved bn, but Re-build model use sub_bn, changed name!\"" + }, + { + "comment": "The code checks if the optimizer state dict saved batch normalization (bn) or sub_batch_normalization and updates the key names accordingly. If the state dict saved bn but the model uses sub_bn, it prints a message and changes the name. If the state dict saved sub_bn and the model also uses sub_bn, it prints a separate message. 
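A toy version of the index remapping that this helper performs is sketched below; the prefix, the offset, and the key names are invented for illustration, and the real function additionally swaps the sub_batch_norm3d_/batch_norm3d_ prefixes when the BN flavour changed between runs:

def shift_param_indices(opt_dict, prefix, offset):
    # A key saved as "conv3d_1.w_0" by the old run becomes "conv3d_2.w_0"
    # when all indices of that prefix moved up by `offset` in the rebuilt model.
    remapped = {}
    for key, value in opt_dict.items():
        if key.startswith(prefix):
            head, _, tail = key.partition('.')
            index = int(head[len(prefix):])
            remapped[prefix + str(index + offset) + '.' + tail] = value
        else:
            remapped[key] = value
    return remapped

saved = {"conv3d_1.w_0": 0.1, "conv3d_2.w_0": 0.2, "LR_Scheduler": {}}
print(shift_param_indices(saved, "conv3d_", 1))
# {'conv3d_2.w_0': 0.1, 'conv3d_3.w_0': 0.2, 'LR_Scheduler': {}}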
The code then defines a change_dict mapping and iterates over the key list to update the names if required.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":135-162", + "content": " )\n change_name = True\n else:\n print(\"Optimizer state dict saved bn, and Re-build model use bn\")\n else:\n PN_key_list.remove(\"batch_norm3d_\")\n if PNAME[\"sub_batch_norm3d_\"][1][-1] == -float('inf'):\n print(\n \"Optimizer state dict saved sub_bn, but Re-build model use bn, changed name!\"\n )\n change_name = True\n else:\n print(\n \"Optimizer state dict saved sub_bn, Re-build model use sub_bn\")\n #update key name\n # sub_bn -> bn name mapping, pre-define dict\n change_dict = {\n \"sub_batch_norm3d_\": \"batch_norm3d_\",\n \"batch_norm3d_\": \"sub_batch_norm3d_\"\n }\n for key in pd_key_list:\n for name in PN_key_list[1:]:\n if key.startswith(name):\n start = change_dict[name] if (\n change_name and \"batch_norm\" in name) else name\n str_index = key.split('.')[0].split(name)[-1]\n index = int(str_index)" + }, + { + "comment": "This code defines two functions: \"subn_save\" and \"subn_load\". \"subn_save\" saves a model's state dictionary along with the optimizer's state dictionary to specified directories in a specific format. It also converts sub-bn to normal bn before saving, and prints a message confirming the save operation. \"subn_load\" loads checkpoints from given files into the specified model and optionally an optimizer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":163-189", + "content": " new_index = str(index +\n (PNAME[start][1][0] - PNAME[name][0][0]))\n end = key.split('.')[-1]\n update_key = start + new_index + '.' + end\n opt_dict[update_key] = opt_dict.pop(key)\n return opt_dict\ndef subn_save(save_dir, name_prefix, epoch, video_model, optimizer):\n if not os.path.isdir(save_dir):\n os.makedirs(save_dir)\n model_path = os.path.join(save_dir, name_prefix + \"{:05d}\".format(epoch))\n model_dict = video_model.state_dict()\n sub_to_normal_bn(model_dict)\n opti_dict = optimizer.state_dict()\n paddle.save(model_dict, model_path + '.pdparams')\n paddle.save(opti_dict, model_path + '.pdopt')\n print('[Saved Epoch {} parameters and optimizer state ]'.format(epoch))\ndef subn_load(model, ck_path, optimizer=None):\n \"\"\"\n Load the checkpoint from the given file.\n Args:\n model (model): model to load the weights from the checkpoint.\n optimizer (optim, optional): optimizer to load the historical state." + }, + { + "comment": "This function loads checkpoints from a specific path and returns the number of training epochs. It ensures that the given directory has .pdparams file, prints the checkpoint loading information, copies model state dictionary, and compares the shapes of pre-trained weights to current model weights for matching. 
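The shape-matching filter described here reduces to a dictionary comprehension; the sketch below reproduces the idea with NumPy arrays standing in for Paddle tensors (key names are made up):

import numpy as np

model_sd = {"conv.w": np.zeros((4, 3)), "fc.w": np.zeros((4, 10))}
ckpt_sd = {"conv.w": np.ones((4, 3)), "fc.w": np.ones((2, 10)), "extra.b": np.ones(4)}

# Keep only checkpoint entries whose name and shape both match the model.
matched = {
    k: v
    for k, v in ckpt_sd.items()
    if k in model_sd and tuple(v.shape) == tuple(model_sd[k].shape)
}
not_loaded = [k for k in model_sd if k not in matched]
print(sorted(matched))   # ['conv.w']
print(not_loaded)        # ['fc.w'] -- shape mismatch, left at its initialization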
It also identifies layers that are not loaded with pre-trained weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":190-215", + "content": " ck_path (str): checkpoint path\n Returns:\n (int): the number of training epoch of the checkpoint.\n \"\"\"\n assert os.path.exists(ck_path + \".pdparams\"), \\\n \"Given dir {}.pdparams not exist.\".format(ck_path)\n print(\"load checkpint from {}.pdparams\".format(ck_path))\n model_dict = model.state_dict()\n checkpoint_dict = paddle.load(ck_path + \".pdparams\")\n # checkpoint_dict = copy.deepcopy(checkpoint_dict_orig) #not modify when multi card\n pre_train_dict = normal_to_sub_bn(checkpoint_dict, model_dict)\n # Match pre-trained weights that have same shape as current model.\n pre_train_dict_match = {\n k: v\n for k, v in pre_train_dict.items()\n if k in model_dict and tuple(v.shape) == tuple(model_dict[k].shape)\n }\n # Weights that do not have match from the pre-trained model.\n not_load_layers = [\n k for k in model_dict.keys() if k not in pre_train_dict_match.keys()\n ]\n # Log weights that are not loaded with the pre-trained weights." + }, + { + "comment": "This code block checks if certain layers in the model are not set to load, and prints a message if those weights are not loaded. It then loads the pre-trained weights for the model and checks if a specific file exists before loading the optimizer's state dictionary from that file. The function mapping_opt_dict is called to create a new dictionary containing only parameters that require gradient, which is then set as the state dictionary of the optimizer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/save_load_helper.py\":216-236", + "content": " if not_load_layers:\n for k in not_load_layers:\n if 'bn.weight' not in k and 'bn.bias' not in k:\n print(\"Network weights {} not loaded.\".format(k))\n # Load pre-trained weights.\n model.set_state_dict(pre_train_dict_match)\n if optimizer:\n assert os.path.exists(ck_path + \".pdopt\"), \\\n \"Given dir {}.pdopt not exist.\".format(ck_path)\n print(\"load checkpint from {}.pdopt\".format(ck_path))\n opt_dict = paddle.load(ck_path + \".pdopt\")\n # get parameters that required gradient from re-build model\n model_key_list = []\n for param in model.parameters():\n if param.stop_gradient == False:\n model_key_list.append(param.name)\n new_opt_dict = mapping_opt_dict(opt_dict, model_key_list)\n optimizer.set_state_dict(new_opt_dict)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/69686180-705e-42c2-bf0c-84e8e04bb6bb.json b/docs/doc/69686180-705e-42c2-bf0c-84e8e04bb6bb.json new file mode 100644 index 000000000..e02f77f78 --- /dev/null +++ b/docs/doc/69686180-705e-42c2-bf0c-84e8e04bb6bb.json @@ -0,0 +1,150 @@ +{ + "summary": "The PaddleVideo library includes functions for convolutional layers, ResBlock classes, Slow and Fast branches, and a VideoModelStem class to initialize a ResNet SlowFast model with MRI for video analysis.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library, licensed under Apache 2.0. It imports necessary modules and defines functions for initializing convolutional layers with KaimingNormal distribution. It also includes a function for batch normalization parameters and registers backbones in PaddlePaddle's registry. 
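The `fan` value this file feeds to the initializer is simply the output channel count times the kernel volume; a quick arithmetic check with illustrative numbers:

# fan = dim_out * (kT * kH * kW), as computed before each Conv3D in this file.
dim_inner = 64
temp_kernel_size = 3
fan_a = dim_inner * (temp_kernel_size * 1 * 1)   # Tx1x1 branch -> 192
fan_b = dim_inner * (1 * 3 * 3)                  # 1x3x3 branch -> 576
print(fan_a, fan_b)                              # 192 576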
The code uses paddle, numpy, and other libraries for various functionalities such as seeding random numbers for reproducibility.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":0-32", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import KaimingNormal\nfrom ..registry import BACKBONES\nfrom paddlevideo.utils.multigrid import get_norm\nimport sys\nimport numpy as np\nimport paddle.distributed as dist\n# seed random seed\npaddle.framework.seed(0)\n# get init parameters for conv layer\ndef get_conv_init(fan_out):\n return KaimingNormal(fan_in=fan_out)\ndef get_bn_param_attr(bn_weight=1.0, coeff=0.0):" + }, + { + "comment": "This code defines a BottleneckTransform class, which is a layer for video models. It performs temporal convolutions with 1x1, 1x3x3, and 1x1x1 layers, where T is the size of the temporal kernel. The class takes various parameters such as dimension, stride, and kernel sizes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":33-65", + "content": " param_attr = paddle.ParamAttr(\n initializer=paddle.nn.initializer.Constant(bn_weight),\n regularizer=paddle.regularizer.L2Decay(coeff))\n return param_attr\n\"\"\"Video models.\"\"\"\nclass BottleneckTransform(paddle.nn.Layer):\n \"\"\"\n Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of\n temporal kernel.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck." + }, + { + "comment": "The code defines a class 'BottleneckTransform' with parameters for dimensions, stride, inner dimension, number of groups, and other attributes. It inherits from another class and initializes its own instance variables before constructing the model structure. The constructor method takes in various arguments to configure the bottleneck transformation block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":66-86", + "content": " dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. 
num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): if True, calculate the relu on the original\n input without allocating new memory.\n eps (float): epsilon for batch norm.\n dilation (int): size of dilation.\n \"\"\"\n super(BottleneckTransform, self).__init__()\n self.temp_kernel_size = temp_kernel_size\n self._inplace_relu = inplace_relu\n self._eps = eps\n self._stride_1x1 = stride_1x1\n self.norm_module = norm_module\n self._construct(dim_in, dim_out, stride, dim_inner, num_groups,\n dilation)\n def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups," + }, + { + "comment": "This code initializes two 3D convolutional layers with Batch Normalization and ReLU activation. The first layer has stride 1 for all dimensions, while the second layer has stride 1 for the first dimension and a different value (determined by _stride_1x1) for the remaining two dimensions. Both layers have specified kernel sizes, padding, and use custom initializers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":87-112", + "content": " dilation):\n str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride)\n fan = (dim_inner) * (self.temp_kernel_size * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.a = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_inner,\n kernel_size=[self.temp_kernel_size, 1, 1],\n stride=[1, str1x1, str1x1],\n padding=[int(self.temp_kernel_size // 2), 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.a_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x3x3, BN, ReLU.\n fan = (dim_inner) * (1 * 3 * 3)\n initializer_tmp = get_conv_init(fan)\n self.b = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_inner," + }, + { + "comment": "The code defines a Conv3D layer with 1x3x3 kernel, stride of str3x3, and dilation of dilation. It also includes a batch normalization (BN) module for the output. The BN module is applied to the output of the previous layer and has no bias. Finally, another Conv3D layer with 1x1x1 kernel and no BN is defined followed by another BN without a bias.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":113-138", + "content": " kernel_size=[1, 3, 3],\n stride=[1, str3x3, str3x3],\n padding=[0, dilation, dilation],\n groups=num_groups,\n dilation=[1, dilation, dilation],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.b_bn = self.norm_module(num_features=dim_inner,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n # 1x1x1, BN.\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.c = paddle.nn.Conv3D(\n in_channels=dim_inner,\n out_channels=dim_out,\n kernel_size=[1, 1, 1],\n stride=[1, 1, 1],\n padding=[0, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self.c_bn = self.norm_module(\n num_features=dim_out," + }, + { + "comment": "ResNetSlowFastMRI: A residual network model that adds Slow and Fast branches to extract temporal features. 
ResBlock is a residual block class used in the architecture, which utilizes BatchNorm3D for normalization and applies ReLU activations after each branch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":139-179", + "content": " epsilon=self._eps,\n weight_attr=get_bn_param_attr(bn_weight=0.0),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n # Branch2a.\n x = self.a(x)\n x = self.a_bn(x)\n x = F.relu(x)\n # Branch2b.\n x = self.b(x)\n x = self.b_bn(x)\n x = F.relu(x)\n # Branch2c\n x = self.c(x)\n x = self.c_bn(x)\n return x\nclass ResBlock(paddle.nn.Layer):\n \"\"\"\n Residual block.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups=1,\n stride_1x1=False,\n inplace_relu=True,\n eps=1e-5,\n dilation=1,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n ResBlock class constructs redisual blocks. More details can be found in:\n Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.\n \"Deep residual learning for image recognition.\"" + }, + { + "comment": "This function defines a bottleneck for ResNet and ResNeXt-like networks with specified parameters. It takes channel dimensions, temporal kernel sizes, stride, transform function, inner dimension, number of groups, if applying stride to 1x1 conv, inplace relu calculation, and epsilon for batch norm as arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":180-197", + "content": " https://arxiv.org/abs/1512.03385\n Args:\n dim_in (int): the channel dimensions of the input.\n dim_out (int): the channel dimension of the output.\n temp_kernel_size (int): the temporal kernel sizes of the middle\n convolution in the bottleneck.\n stride (int): the stride of the bottleneck.\n trans_func (string): transform function to be used to construct the\n bottleneck.\n dim_inner (int): the inner dimension of the block.\n num_groups (int): number of groups for the convolution. num_groups=1\n is for standard ResNet like networks, and num_groups>1 is for\n ResNeXt like networks.\n stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise\n apply stride to the 3x3 conv.\n inplace_relu (bool): calculate the relu on the original input\n without allocating new memory.\n eps (float): epsilon for batch norm." + }, + { + "comment": "This code defines a ResBlock class with skip connection, which performs convolution operations for image processing tasks. 
The constructor takes various parameters like dimensions, kernel size, stride, etc., and initializes the necessary components based on whether a skip connection is needed or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":198-236", + "content": " dilation (int): size of dilation.\n \"\"\"\n super(ResBlock, self).__init__()\n self._inplace_relu = inplace_relu\n self._eps = eps\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n # Use skip connection with projection if dim or res change.\n if (dim_in != dim_out) or (stride != 1):\n fan = (dim_out) * (1 * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self.branch1 = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=1,\n stride=[1, stride, stride]," + }, + { + "comment": "This code defines a ResNet SlowFast MRI model with batch normalization (BN) for the branch1 and a BottleneckTransform layer for branch2. The forward function checks if \"branch1\" exists, suggesting it may be conditionally defined elsewhere in the codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":237-259", + "content": " padding=0,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False,\n dilation=1)\n self.branch1_bn = self.norm_module(\n num_features=dim_out,\n epsilon=self._eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n self.branch2 = BottleneckTransform(dim_in,\n dim_out,\n temp_kernel_size,\n stride,\n dim_inner,\n num_groups,\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation,\n norm_module=self.norm_module)\n def forward(self, x):\n if hasattr(self, \"branch1\"):" + }, + { + "comment": "This code defines a ResStage class for 3D ResNet, which can handle multi-pathway cases in SlowFast networks. It consists of one or more tensors as input. The stage includes operations such as adding tensors and applying ReLU activation. It also has a branch1 and branch2 for different paths, with batch normalization applied to the first path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":260-293", + "content": " x1 = self.branch1(x)\n x1 = self.branch1_bn(x1)\n x2 = self.branch2(x)\n x = paddle.add(x=x1, y=x2)\n else:\n x2 = self.branch2(x)\n x = paddle.add(x=x, y=x2)\n x = F.relu(x)\n return x\nclass ResStage(paddle.nn.Layer):\n \"\"\"\n Stage of 3D ResNet. It expects to have one or more tensors as input for\n multi-pathway (SlowFast) cases. More details can be found here:\n Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n stride,\n temp_kernel_sizes,\n num_blocks,\n dim_inner,\n num_groups,\n num_block_temp_kernel,\n dilation,\n stride_1x1=False,\n inplace_relu=True,\n norm_module=paddle.nn.BatchNorm3D):" + }, + { + "comment": "The given code is the initialization method of a ResStage class in PaddleVideo. 
It accepts arguments such as dim_in, dim_out, temp_kernel_sizes, stride, and num_blocks to build p pathways with different channel dimensions, temporal kernel sizes, strides, and numbers of blocks for each pathway.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":294-311", + "content": " \"\"\"\n The `__init__` method of any subclass should also contain these arguments.\n ResStage builds p streams, where p can be greater or equal to one.\n Args:\n dim_in (list): list of p the channel dimensions of the input.\n Different channel dimensions control the input dimension of\n different pathways.\n dim_out (list): list of p the channel dimensions of the output.\n Different channel dimensions control the input dimension of\n different pathways.\n temp_kernel_sizes (list): list of the p temporal kernel sizes of the\n convolution in the bottleneck. Different temp_kernel_sizes\n control different pathway.\n stride (list): list of the p strides of the bottleneck. Different\n stride control different pathway.\n num_blocks (list): list of p numbers of blocks for each of the\n pathway.\n dim_inner (list): list of the p inner channel dimensions of the" + }, + { + "comment": "This function initializes a ResStage object, which is a layer of a network. It takes in parameters such as the number of blocks and temporal kernel sizes for each pathway, and asserts that the number of block_temp_kernel does not exceed the number of blocks. The temp_kernel_sizes are extended to num_block_temp_kernel blocks with temporal kernel size 1 for the rest of the layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":312-329", + "content": " input. Different channel dimensions control the input dimension\n of different pathways.\n num_groups (list): list of number of p groups for the convolution.\n num_groups=1 is for standard ResNet like networks, and\n num_groups>1 is for ResNeXt like networks.\n num_block_temp_kernel (list): extent the temp_kernel_sizes to\n num_block_temp_kernel blocks, then fill temporal kernel size\n of 1 for the rest of the layers.\n dilation (list): size of dilation for each pathway.\n \"\"\"\n super(ResStage, self).__init__()\n assert all((num_block_temp_kernel[i] <= num_blocks[i]\n for i in range(len(temp_kernel_sizes))))\n self.num_blocks = num_blocks\n self.temp_kernel_sizes = [\n (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] +\n [1] * (num_blocks[i] - num_block_temp_kernel[i])\n for i in range(len(temp_kernel_sizes))" + }, + { + "comment": "The code creates an instance of a backbone network with adjustable pathways and blocks. It checks the input parameters' length, assigns the number of pathways, initializes a norm module, and calls a private method to construct the network structure. 
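The kernel-size extension rule shown in this entry is easiest to verify with concrete numbers; a standalone check (the values are examples, not the exact SlowFast configuration):

temp_kernel_sizes = [[1], [3]]          # slow pathway, fast pathway
num_blocks = [6, 6]
num_block_temp_kernel = [6, 4]

extended = [
    (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]]
    + [1] * (num_blocks[i] - num_block_temp_kernel[i])
    for i in range(len(temp_kernel_sizes))
]
print(extended)   # [[1, 1, 1, 1, 1, 1], [3, 3, 3, 3, 1, 1]]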
The _construct method loops through each pathway and block, creating ResBlock instances.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":330-371", + "content": " ]\n assert (len({\n len(dim_in),\n len(dim_out),\n len(temp_kernel_sizes),\n len(stride),\n len(num_blocks),\n len(dim_inner),\n len(num_groups),\n len(num_block_temp_kernel),\n }) == 1)\n self.num_pathways = len(self.num_blocks)\n self.norm_module = norm_module\n self._construct(\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n )\n def _construct(\n self,\n dim_in,\n dim_out,\n stride,\n dim_inner,\n num_groups,\n stride_1x1,\n inplace_relu,\n dilation,\n ):\n for pathway in range(self.num_pathways):\n for i in range(self.num_blocks[pathway]):\n res_block = ResBlock(\n dim_in[pathway] if i == 0 else dim_out[pathway],\n dim_out[pathway],\n self.temp_kernel_sizes[pathway][i]," + }, + { + "comment": "This code defines a ResNet backbone with SlowFast pathways, including stem module and residual blocks. It initializes the layers for each pathway and then defines a forward function to process inputs through the specified number of pathways and blocks. The ResNetBasicStem performs spatiotemporal convolution, batch normalization, and ReLU before pooling in the 3D stem module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":372-403", + "content": " stride[pathway] if i == 0 else 1,\n dim_inner[pathway],\n num_groups[pathway],\n stride_1x1=stride_1x1,\n inplace_relu=inplace_relu,\n dilation=dilation[pathway],\n norm_module=self.norm_module)\n self.add_sublayer(\"pathway{}_res{}\".format(pathway, i),\n res_block)\n def forward(self, inputs):\n output = []\n for pathway in range(self.num_pathways):\n x = inputs[pathway]\n for i in range(self.num_blocks[pathway]):\n m = getattr(self, \"pathway{}_res{}\".format(pathway, i))\n x = m(x)\n output.append(x)\n return output\nclass ResNetBasicStem(paddle.nn.Layer):\n \"\"\"\n ResNe(X)t 3D stem module.\n Performs spatiotemporal Convolution, BN, and Relu following by a\n spatiotemporal pooling.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out," + }, + { + "comment": "This code defines a ResNetBasicStem class that initializes the stem of a ResNet network. The constructor takes parameters for kernel size, stride, padding, epsilon value, and norm module. The _construct_stem method creates a 3D convolutional layer with specified dimensions and uses an appropriate initializer for its weights. 
A batch normalization layer is also created using the provided norm module and epsilon value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":404-431", + "content": " kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n super(ResNetBasicStem, self).__init__()\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)\n def _construct_stem(self, dim_in, dim_out):\n fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2])\n initializer_tmp = get_conv_init(fan)\n self._conv = paddle.nn.Conv3D(\n in_channels=dim_in,\n out_channels=dim_out,\n kernel_size=self.kernel,\n stride=self.stride,\n padding=self.padding,\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = self.norm_module(num_features=dim_out,\n epsilon=self.eps,\n weight_attr=get_bn_param_attr()," + }, + { + "comment": "This code defines a video stem module for slow and fast pathways, performing Conv, BN, ReLU, MaxPool operations on input data tensors. The function takes dim_in, dim_out, kernel, stride, padding, eps, and norm_module as arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":432-465", + "content": " bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x = self._conv(x)\n x = self._bn(x)\n x = F.relu(x)\n x = F.max_pool3d(x=x,\n kernel_size=[1, 3, 3],\n stride=[1, 2, 2],\n padding=[0, 1, 1],\n data_format=\"NCDHW\")\n return x\nclass VideoModelStem(paddle.nn.Layer):\n \"\"\"\n Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool\n on input data tensor for slow and fast pathways.\n \"\"\"\n def __init__(self,\n dim_in,\n dim_out,\n kernel,\n stride,\n padding,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (list): the list of channel dimensions of the inputs.\n dim_out (list): the output dimension of the convolution in the stem\n layer.\n kernel (list): the kernels' size of the convolutions in the stem" + }, + { + "comment": "The code defines a class VideoModelStem with parameters for dimensions, kernel size, stride, padding, and epsilon for batch normalization. It checks for consistent input pathway dimensions and initializes instance variables before constructing the stem layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":466-491", + "content": " layers. Temporal kernel size, height kernel size, width kernel\n size in order.\n stride (list): the stride sizes of the convolutions in the stem\n layer. Temporal kernel stride, height kernel size, width kernel\n size in order.\n padding (list): the paddings' sizes of the convolutions in the stem\n layer. 
Temporal padding size, height padding size, width padding\n size in order.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(VideoModelStem, self).__init__()\n assert (len({\n len(dim_in),\n len(dim_out),\n len(kernel),\n len(stride),\n len(padding),\n }) == 1), \"Input pathway dimensions are not consistent.\"\n self.num_pathways = len(dim_in)\n self.kernel = kernel\n self.stride = stride\n self.padding = padding\n self.eps = eps\n self.norm_module = norm_module\n self._construct_stem(dim_in, dim_out)" + }, + { + "comment": "This code defines a class that constructs and fuses two pathways (slow and fast) in a video processing model. It initializes the stem layers for each pathway and then fuses the information from the fast pathway to the slow pathway. The input tensor should contain the specified number of pathways, and the output is returned as tensors from both pathways in order.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":493-517", + "content": " def _construct_stem(self, dim_in, dim_out):\n for pathway in range(len(dim_in)):\n stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway],\n self.kernel[pathway], self.stride[pathway],\n self.padding[pathway], self.eps,\n self.norm_module)\n self.add_sublayer(\"pathway{}_stem\".format(pathway), stem)\n def forward(self, x):\n assert (len(x) == self.num_pathways\n ), \"Input tensor does not contain {} pathway\".format(\n self.num_pathways)\n for pathway in range(len(x)):\n m = getattr(self, \"pathway{}_stem\".format(pathway))\n x[pathway] = m(x[pathway])\n return x\nclass FuseFastToSlow(paddle.nn.Layer):\n \"\"\"\n Fuses the information from the Fast pathway to the Slow pathway. Given the\n tensors from Slow pathway and Fast pathway, fuse information from Fast to\n Slow, then return the fused tensors from Slow and Fast pathway in order." + }, + { + "comment": "The code is initializing a class called FuseFastToSlow with parameters for input channel dimension, fusion convolution channel ratio, fusion kernel size, and frame rate ratio. It sets the number of channels in the 3D Convolution layer for fusing frames from Fast to Slow pathways based on given ratios. The fusion operation is performed using a Conv3D layer initialized with the given fan value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":518-543", + "content": " \"\"\"\n def __init__(self,\n dim_in,\n fusion_conv_channel_ratio,\n fusion_kernel,\n alpha,\n fuse_bn_relu=1,\n eps=1e-5,\n norm_module=paddle.nn.BatchNorm3D):\n \"\"\"\n Args:\n dim_in (int): the channel dimension of the input.\n fusion_conv_channel_ratio (int): channel ratio for the convolution\n used to fuse from Fast pathway to Slow pathway.\n fusion_kernel (int): kernel size of the convolution used to fuse\n from Fast pathway to Slow pathway.\n alpha (int): the frame rate ratio between the Fast and Slow pathway.\n eps (float): epsilon for batch norm.\n \"\"\"\n super(FuseFastToSlow, self).__init__()\n self.fuse_bn_relu = fuse_bn_relu\n fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1)\n initializer_tmp = get_conv_init(fan)\n self._conv_f2s = paddle.nn.Conv3D(\n in_channels=dim_in," + }, + { + "comment": "This code defines a ResNetSlowFast_MRI model, which is a type of SlowFast network. It includes a fusion convolution layer and a batch normalization (BN) layer with optional ReLU activation after the fusion convolution. 
The forward method combines input x_s and x_f using concat before returning both inputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":544-571", + "content": " out_channels=dim_in * fusion_conv_channel_ratio,\n kernel_size=[fusion_kernel, 1, 1],\n stride=[alpha, 1, 1],\n padding=[fusion_kernel // 2, 0, 0],\n weight_attr=paddle.ParamAttr(initializer=initializer_tmp),\n bias_attr=False)\n self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio,\n epsilon=eps,\n weight_attr=get_bn_param_attr(),\n bias_attr=get_bn_param_attr(bn_weight=0.0))\n def forward(self, x):\n x_s = x[0]\n x_f = x[1]\n fuse = self._conv_f2s(x_f)\n # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve.\n if self.fuse_bn_relu:\n fuse = self._bn(fuse)\n fuse = F.relu(fuse)\n x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None)\n return [x_s_fuse, x_f]\n@BACKBONES.register()\nclass ResNetSlowFast_MRI(paddle.nn.Layer):\n \"\"\"\n SlowFast model builder for SlowFast network." + }, + { + "comment": "This code initializes a ResNetSlowFast_MRI model with specified parameters, including alpha and beta values for the network architecture. The class extends from an existing superclass and includes properties like the norm module type, number of pathways, depth, group numbers, input channel numbers, width per group, fusion convolution channel ratio, pool size ratios, whether to use a pooling average operation at spatial stride 2, and spatial strides. The class also initializes these specified attributes for the model configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":573-606", + "content": " Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.\n \"Slowfast networks for video recognition.\"\n https://arxiv.org/pdf/1812.03982.pdf\n \"\"\"\n def __init__(\n self,\n alpha,\n beta,\n bn_norm_type=\"batchnorm\",\n bn_num_splits=1,\n num_pathways=2,\n depth=50,\n num_groups=1,\n input_channel_num=[1, 1],\n width_per_group=64,\n fusion_conv_channel_ratio=2,\n fusion_kernel_sz=7, #5?\n pool_size_ratio=[[1, 1, 1], [1, 1, 1]],\n fuse_bn_relu=1,\n spatial_strides=[[1, 1], [2, 2], [2, 2], [2, 2]],\n use_pool_af_s2=1,\n ):\n \"\"\"\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n super(ResNetSlowFast_MRI, self).__init__()\n self.alpha = alpha #8\n self.beta = beta #8\n self.norm_module = get_norm(bn_norm_type, bn_num_splits)\n self.num_pathways = num_pathways\n self.depth = depth" + }, + { + "comment": "The code defines a SlowFast model with separate slow and fast pathways. The constructor sets parameters like input channel number, group number, fusion convolution channel ratio, and more. It also includes functions to build the network structure. 
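The effect of alpha (frame-rate ratio) and beta (channel divisor) on the fuse step can be followed with plain shape arithmetic; the sketch below only tracks tuple shapes and uses example values, so no Paddle is required:

# Shapes are (N, C, T, H, W). Example values: alpha = 8 (frame-rate ratio),
# beta = 8 (channel divisor, as in width_per_group // beta),
# fusion_conv_channel_ratio = 2, width_per_group = 64.
alpha, beta, ratio, width = 8, 8, 2, 64

slow = (1, width, 4, 56, 56)                   # T_slow = 4
fast = (1, width // beta, 4 * alpha, 56, 56)   # T_fast = alpha * T_slow

# _conv_f2s: temporal stride alpha, channels C_fast -> C_fast * ratio.
fuse = (1, fast[1] * ratio, fast[2] // alpha, 56, 56)

# concat along channels: the slow pathway now carries the fused features.
slow_fused = (1, slow[1] + fuse[1], slow[2], 56, 56)
print(fuse)        # (1, 16, 4, 56, 56)
print(slow_fused)  # (1, 80, 4, 56, 56)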
The temporal kernels for each layer of both pathways are defined within the code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":607-631", + "content": " self.num_groups = num_groups\n self.input_channel_num = input_channel_num\n self.width_per_group = width_per_group\n self.fusion_conv_channel_ratio = fusion_conv_channel_ratio\n self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement\n self.pool_size_ratio = pool_size_ratio\n self.fuse_bn_relu = fuse_bn_relu\n self.spatial_strides = spatial_strides\n self.use_pool_af_s2 = use_pool_af_s2\n self._construct_network()\n def _construct_network(self):\n \"\"\"\n Builds a SlowFast model.\n The first pathway is the Slow pathway\n and the second pathway is the Fast pathway.\n Args:\n cfg (CfgNode): model building configs, details are in the\n comments of the config file.\n \"\"\"\n temp_kernel = [\n [[1], [5]], # conv1 temporal kernel for slow and fast pathway.\n [[1], [3]], # res2 temporal kernel for slow and fast pathway.\n [[1], [3]], # res3 temporal kernel for slow and fast pathway." + }, + { + "comment": "This code defines a ResNet backbone for the MRI dataset. It sets the temporal kernels for res4 and res5 pathways, initializes the video model stem (s1) with specified dimensions and stride, adds a fuseFastToSlow module for fusion, and defines model stage depth based on the chosen depth of ResNet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":632-656", + "content": " [[3], [3]], # res4 temporal kernel for slow and fast pathway.\n [[3], [3]],\n ] # res5 temporal kernel for slow and fast pathway.\n self.s1 = VideoModelStem(\n dim_in=self.input_channel_num,\n dim_out=[self.width_per_group, self.width_per_group // self.beta],\n kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]],\n stride=[[1, 2, 2]] * 2,\n padding=[\n [temp_kernel[0][0][0] // 2, 3, 3],\n [temp_kernel[0][1][0] // 2, 3, 3],\n ],\n norm_module=self.norm_module)\n self.s1_fuse = FuseFastToSlow(\n dim_in=self.width_per_group // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu)\n # ResNet backbone\n MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)}\n (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth]" + }, + { + "comment": "The code is defining the parameters for a ResStage layer in a residual network model. 
It sets the input and output dimensions, inner dimensions, temporal kernel sizes, and stride values based on previously defined values from the function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":658-677", + "content": " num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]]\n spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]]\n spatial_strides = self.spatial_strides\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]]\n #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment\n out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4\n dim_inner = self.width_per_group * self.num_groups #64\n self.s2 = ResStage(dim_in=[\n self.width_per_group + self.width_per_group // out_dim_ratio,\n self.width_per_group // self.beta,\n ],\n dim_out=[\n self.width_per_group * 4,\n self.width_per_group * 4 // self.beta,\n ],\n dim_inner=[dim_inner, dim_inner // self.beta],\n temp_kernel_sizes=temp_kernel[1],\n stride=spatial_strides[0]," + }, + { + "comment": "The code initializes a ResNet SlowFast model with multiple layers and parameters. It creates two branches (slow and fast) for the network, each with its own set of layers and parameters. The slow branch has 2x more blocks than the fast branch, and both branches have identical group numbers and dilation rates. The code also initializes a fusing layer that combines features from the fast and slow branches and a stage for the third level of the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":678-703", + "content": " num_blocks=[d2] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[0],\n dilation=spatial_dilations[0],\n norm_module=self.norm_module)\n self.s2_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 4 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s3 = ResStage(\n dim_in=[\n self.width_per_group * 4 +\n self.width_per_group * 4 // out_dim_ratio,\n self.width_per_group * 4 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 8,\n self.width_per_group * 8 // self.beta,\n ],\n dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta]," + }, + { + "comment": "This code is initializing a ResNet SlowFast model for MRI. It creates an instance of the class, sets parameters like kernel sizes, strides, block numbers, group numbers, dilation rates, and normalization module. 
The model consists of multiple stages, including s1, s2, s3_fuse, and s4, each with different dimensions, inner dimensions, and configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":704-732", + "content": " temp_kernel_sizes=temp_kernel[2],\n stride=spatial_strides[1],\n num_blocks=[d3] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[1],\n dilation=spatial_dilations[1],\n norm_module=self.norm_module,\n )\n self.s3_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 8 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s4 = ResStage(\n dim_in=[\n self.width_per_group * 8 +\n self.width_per_group * 8 // out_dim_ratio,\n self.width_per_group * 8 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 16,\n self.width_per_group * 16 // self.beta,\n ],\n dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta]," + }, + { + "comment": "This code defines a ResNet SlowFast model with MRI for video analysis. It includes creating layers for stage 4, fusing fast and slow features, and defining the stage 5 layer with specific dimensions for input, output, and inner dimensions. The model utilizes group width, out_dim_ratio, beta, and dim_inner parameters to control its behavior.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":733-761", + "content": " temp_kernel_sizes=temp_kernel[3],\n stride=spatial_strides[2],\n num_blocks=[d4] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[2],\n dilation=spatial_dilations[2],\n norm_module=self.norm_module,\n )\n self.s4_fuse = FuseFastToSlow(\n dim_in=self.width_per_group * 16 // self.beta,\n fusion_conv_channel_ratio=self.fusion_conv_channel_ratio,\n fusion_kernel=self.fusion_kernel_sz,\n alpha=self.alpha,\n norm_module=self.norm_module,\n fuse_bn_relu=self.fuse_bn_relu,\n )\n self.s5 = ResStage(\n dim_in=[\n self.width_per_group * 16 +\n self.width_per_group * 16 // out_dim_ratio,\n self.width_per_group * 16 // self.beta,\n ],\n dim_out=[\n self.width_per_group * 32,\n self.width_per_group * 32 // self.beta,\n ],\n dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta]," + }, + { + "comment": "This code initializes a 3D ResNet SlowFast model, sets the weights, and applies several stages of convolutions and fusions to process video input. 
The forward function sequentially passes the input through multiple stages, potentially applying max pooling for AVA if use_pool_af_s2 is True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":762-792", + "content": " temp_kernel_sizes=temp_kernel[4],\n stride=spatial_strides[3],\n num_blocks=[d5] * 2,\n num_groups=[self.num_groups] * 2,\n num_block_temp_kernel=num_block_temp_kernel[3],\n dilation=spatial_dilations[3],\n norm_module=self.norm_module,\n )\n def init_weights(self):\n pass\n def forward(self, x):\n x = self.s1(x) #VideoModelStem\n x = self.s1_fuse(x) #FuseFastToSlow\n x = self.s2(x) #ResStage\n x = self.s2_fuse(x)\n # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve.\n if self.use_pool_af_s2:\n for pathway in range(self.num_pathways):\n x[pathway] = F.max_pool3d(\n x=x[pathway],\n kernel_size=self.pool_size_ratio[pathway],\n stride=self.pool_size_ratio[pathway],\n padding=[0, 0, 0],\n data_format=\"NCDHW\")\n x = self.s3(x)\n x = self.s3_fuse(x)\n x = self.s4(x)" + }, + { + "comment": "This code snippet is part of a ResNet SlowFast model. It fuses the outputs from previous stages (s4), passes them through another stage (s5) and returns the result as output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py\":793-795", + "content": " x = self.s4_fuse(x)\n x = self.s5(x)\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/697c842e-2194-4f41-9c5e-8c5b11a49a12.json b/docs/doc/697c842e-2194-4f41-9c5e-8c5b11a49a12.json new file mode 100644 index 000000000..7bfe9e679 --- /dev/null +++ b/docs/doc/697c842e-2194-4f41-9c5e-8c5b11a49a12.json @@ -0,0 +1,70 @@ +{ + "summary": "The code initializes a ResNet-TSM backbone with convolutional layers and bottleneck blocks, but may be deprecated and needs data preparation changes for better compatibility.", + "details": [ + { + "comment": "This code snippet defines a ConvBNLayer class which combines a Conv2D and BatchNorm2D layer. It is imported from paddle.nn and will be used for creating convolutional neural network layers in the PaddlePaddle framework. The layer can be utilized to process image data or other types of spatial input data by applying convolution operations followed by batch normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":0-29", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:" + }, + { + "comment": "This code defines a class called ConvBNLayer which inherits from an unspecified parent class. It takes in parameters such as number of input and output channels, kernel size, stride, groups, activation function, name, and data format for the Conv2D layer. The class initializes a Conv2D layer using these parameters and adds BatchNorm2D and ReLU layers after it. Weight and bias initialization values are named in the restore parameters, and they are explicitly declared in the init_weights method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":30-52", + "content": " in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None,\n data_format=\"NCHW\"):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size," + }, + { + "comment": "This code defines a BottleneckBlock class with a convolution layer, batch normalization, and optional activation function. 
The forward pass applies the convolution, batch normalization, and activation if present.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":53-83", + "content": " stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False,\n data_format=data_format)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(\n out_channels,\n weight_attr=ParamAttr(name=bn_name + \"_scale\",\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(name=bn_name + \"_offset\",\n regularizer=L2Decay(0.0)),\n data_format=data_format)\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self," + }, + { + "comment": "This code defines a class called BottleneckBlock, which is a part of a neural network model. It contains several ConvBNLayer objects for processing input data, with different parameters such as in_channels, out_channels, kernel_size, stride, and act (activation function). The class also has an attribute for data_format and initializes the ConvBNLayer objects with specific names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":84-107", + "content": " in_channels,\n out_channels,\n stride,\n shortcut=True,\n num_seg=8,\n name=None,\n data_format=\"NCHW\"):\n super(BottleneckBlock, self).__init__()\n self.data_format = data_format\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\",\n data_format=data_format)\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\",\n data_format=data_format)\n self.conv2 = ConvBNLayer(in_channels=out_channels," + }, + { + "comment": "This code is initializing a ConvBNLayer and a shortcut connection for the TSM backbone. The layers have specific out_channels, kernel_size, stride, name, and data_format configurations. If a shortcut is not provided, it initializes another ConvBNLayer. The forward function reshapes input to have a shape of [N, T, C, H, W] for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":108-133", + "content": " out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\",\n data_format=data_format)\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\",\n data_format=data_format)\n self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n if paddle.is_compiled_with_custom_device('npu'):\n x = inputs\n seg_num = self.num_seg\n shift_ratio = 1.0 / self.num_seg\n shape = x.shape #[N*T, C, H, W]\n reshape_x = x.reshape(\n (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W]" + }, + { + "comment": "This code performs temporal shift operation on the input tensor. If a certain condition is met, it pads and concatenates slices of the input tensor before performing the reshape and temporal shift operations. 
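The shift itself is easy to reproduce outside Paddle. The NumPy sketch below moves one slice of channels to the value of the next frame and another slice to the value of the previous frame, leaving the rest untouched, which is the behaviour the padded-slice branch emulates; the shapes and the 1/num_seg fold size are illustrative:

import numpy as np

def temporal_shift(x, num_seg, shift_ratio):
    # x: (N*T, C, H, W) with T == num_seg frames per video.
    nt, c, h, w = x.shape
    x = x.reshape(nt // num_seg, num_seg, c, h, w)
    fold = int(c * shift_ratio)
    out = np.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                   # first chunk takes the next frame
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]   # second chunk takes the previous frame
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # remaining channels stay put
    return out.reshape(nt, c, h, w)

x = np.random.rand(2 * 8, 16, 4, 4).astype("float32")   # N=2, T=8, C=16
y = temporal_shift(x, num_seg=8, shift_ratio=1.0 / 8)
print(y.shape)   # (16, 16, 4, 4)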
The resulting output is then passed through several convolutional layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":134-163", + "content": " pad_x = F.pad(reshape_x, [\n 0,\n 0,\n 1,\n 1,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n ]) #[N, T+2, C, H, W]\n c1 = int(shape[1] * shift_ratio)\n c2 = int(shape[1] * 2 * shift_ratio)\n slice1 = pad_x[:, :seg_num, :c1, :, :]\n slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :]\n slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :]\n concat_x = paddle.concat([slice1, slice2, slice3],\n axis=2) #[N, T, C, H, W]\n shifts = concat_x.reshape(shape)\n else:\n shifts = F.temporal_shift(inputs,\n self.num_seg,\n 1.0 / self.num_seg,\n data_format=self.data_format)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:" + }, + { + "comment": "The code defines a BasicBlock class which is a residual block. It contains two 3x3 convolutional layers followed by BN and ReLU activations. If the shortcut connection is not used, it also includes an additional convolution layer for the shortcut path. The purpose of this residual block is to alleviate the problem of vanishing gradients in deeper networks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":164-201", + "content": " short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None,\n data_format=\"NCHW\"):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(\n in_channels=in_channels,\n out_channels=out_channels,\n filter_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\",\n data_format=data_format,\n )\n self.conv1 = ConvBNLayer(\n in_channels=out_channels,\n out_channels=out_channels,\n filter_size=3,\n act=None,\n name=name + \"_branch2b\",\n data_format=data_format,\n )\n if not shortcut:\n self.short = ConvBNLayer(\n in_channels=in_channels," + }, + { + "comment": "The code defines a ResNet TSM backbone model with specified depth and data format. It consists of an initialization, a forward function for processing inputs, and the ability to be registered at BACKBONES. It also supports different layers like 18, 34, 50, 101, and 152.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":202-240", + "content": " out_channels=out_channels,\n filter_size=1,\n stride=stride,\n name=name + \"_branch1\",\n data_format=data_format,\n )\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTSM(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, data_format=\"NCHW\", pretrained=None):\n super(ResNetTSM, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n self.data_format = data_format\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\" + }, + { + "comment": "This code initializes a ResNet-TSM backbone with different depth configurations based on the input layers. 
It includes a convolution layer, max pooling 2D layer, and a block list for deeper networks. The code checks if the layers are supported (18, 34, 50, 101, or 152) and assigns corresponding depth and number of channels accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":241-272", + "content": " \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n self.conv = ConvBNLayer(in_channels=3,\n out_channels=64,\n kernel_size=7,\n stride=2,\n act=\"relu\",\n name=\"conv1\",\n data_format=self.data_format)\n self.pool2D_max = MaxPool2D(\n kernel_size=3,\n stride=2,\n padding=1,\n data_format=self.data_format,\n )\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):" + }, + { + "comment": "Code creates bottleneck blocks for ResNet TSM architecture, varying the number of input channels based on layer index and configuration. It adds sublayers with specified parameters including number of segments and stride for each block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":273-292", + "content": " shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n conv_name,\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n name=conv_name,\n data_format=self.data_format))" + }, + { + "comment": "Code initializes a ResNet TSM model backbone, with block-specific in_channels and adds either bottleneck or basic blocks depending on the depth configuration. Init_weights function is also defined to initialize weights for the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":293-315", + "content": " in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(\n in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name,\n data_format=self.data_format,\n ))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):" + }, + { + "comment": "This code initializes parameters for a ResNet TSM backbone. If a pretrained loading path is provided, it loads the weights from that path; otherwise, it uses specific initialization functions for Conv2D and BatchNorm2d layers. No bias is used in Conv2D layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":316-331", + "content": " \"\"\"Initiate the parameters.\n Note:\n 1. 
when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):" + }, + { + "comment": "This code defines a forward function for a backbone model. It uses convolution and pooling layers to extract features from input data. The comments indicate that this implementation may be deprecated, and the data preparation should be modified according to recognizer2d.py for better compatibility with paddlepaddle's to_static method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py\":332-352", + "content": " weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: (deprecated design) Already merge axis 0(batches) and axis 1(clips) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n #NOTE: As paddlepaddle to_static method need a \"pure\" model to trim. It means from\n # 1. the phase of generating data[images, label] from dataloader\n # to\n # 2. last layer of a model, always is FC layer\n y = self.conv(inputs)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/69888b77-5f29-424b-a6fd-2c258d265b45.json b/docs/doc/69888b77-5f29-424b-a6fd-2c258d265b45.json new file mode 100644 index 000000000..49ec046fd --- /dev/null +++ b/docs/doc/69888b77-5f29-424b-a6fd-2c258d265b45.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a SkeletonDataset class for action recognition, loading skeleton features and applying normalization operations. It imports libraries, registers the dataset, includes a logger, and has a class for loading skeleton data with optional label path and test mode parameter. The class loads and returns data for training or testing, preparing features based on training/testing needs and considering labels if available.", + "details": [ + { + "comment": "This code defines a SkeletonDataset class for action recognition. It loads skeleton features and applies normalization operations. It also imports necessary libraries, registers the dataset with DATASETS, and includes a logger for logging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/skeleton.py\":0-33", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport pickle\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass SkeletonDataset(BaseDataset):\n \"\"\"\n Skeleton dataset for action recognition.\n The dataset loads skeleton feature, and apply norm operatations.\n Args:\n file_path (str): Path to the index file." + }, + { + "comment": "This code defines a class for loading skeleton data. It takes file path, pipeline, and label path (optional) as input parameters. It also has an optional test mode parameter. The `__init__` method initializes the class with provided parameters. The `load_file` method loads feature files to get skeleton information and handles different file types for labels. If a label path is given and it ends with 'npy' or 'pkl', it will load the label; otherwise, it just outputs predictions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/skeleton.py\":34-54", + "content": " pipeline(obj): Define the pipeline of data preprocessing.\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n \"\"\"\n def __init__(self, file_path, pipeline, label_path=None, test_mode=False):\n self.label_path = label_path\n super().__init__(file_path, pipeline, test_mode=test_mode)\n def load_file(self):\n \"\"\"Load feature file to get skeleton information.\"\"\"\n logger.info(\"Loading data, it will take some moment...\")\n self.data = np.load(self.file_path)\n if self.label_path:\n if self.label_path.endswith('npy'):\n self.label = np.load(self.label_path)\n elif self.label_path.endswith('pkl'):\n with open(self.label_path, 'rb') as f:\n sample_name, self.label = pickle.load(f)\n else:\n logger.info(\n \"Label path not provided when test_mode={}, here just output predictions.\"" + }, + { + "comment": "The code defines a class for loading, preparing, and returning data for training or testing. The `__getitem__` method loads the data and returns it when accessed by index. The `prepare_train` method prepares the feature for training/validation given an index. The `prepare_test` method prepares the feature for testing given an index, considering label if available.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/skeleton.py\":55-77", + "content": " .format(self.test_mode))\n logger.info(\"Data Loaded!\")\n return self.data # used for __len__\n def prepare_train(self, idx):\n \"\"\"Prepare the feature for training/valid given index. \"\"\"\n results = dict()\n results['data'] = copy.deepcopy(self.data[idx])\n results['label'] = copy.deepcopy(self.label[idx])\n results = self.pipeline(results)\n return results['data'], results['label']\n def prepare_test(self, idx):\n \"\"\"Prepare the feature for test given index. 
\"\"\"\n results = dict()\n results['data'] = copy.deepcopy(self.data[idx])\n if self.label_path:\n results['label'] = copy.deepcopy(self.label[idx])\n results = self.pipeline(results)\n return results['data'], results['label']\n else:\n results = self.pipeline(results)\n return [results['data']]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/69960b55-1729-4511-930a-f4f153b795a5.json b/docs/doc/69960b55-1729-4511-930a-f4f153b795a5.json new file mode 100644 index 000000000..8b369b9a9 --- /dev/null +++ b/docs/doc/69960b55-1729-4511-930a-f4f153b795a5.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is a module for the PaddleVideo package that includes various utility functions and classes, such as Registry, build_utils, config, logger, record, dist_utils, save_load, and precise_bn. It also defines __all__ to include Registry and build.", + "details": [ + { + "comment": "This code is a module for the PaddleVideo package that includes various utility functions and classes, such as Registry, build_utils, config, logger, record, dist_utils, save_load, and precise_bn. It also defines __all__ to include Registry and build.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py\":0-24", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .registry import Registry\nfrom .build_utils import build\nfrom .config import *\nfrom .logger import setup_logger, coloring, get_logger\nfrom .record import AverageMeter, build_record, build_rec_record, log_batch, log_epoch\nfrom .dist_utils import get_dist_info, main_only\nfrom .save_load import save, load, load_ckpt, mkdir\nfrom .precise_bn import do_preciseBN\n__all__ = ['Registry', 'build']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/69a563ed-fcd4-4c89-b365-43dad88d4cc6.json b/docs/doc/69a563ed-fcd4-4c89-b365-43dad88d4cc6.json new file mode 100644 index 000000000..a00a4cd63 --- /dev/null +++ b/docs/doc/69a563ed-fcd4-4c89-b365-43dad88d4cc6.json @@ -0,0 +1,10 @@ +{ + "summary": "Code file \"preprocess.py\" contains four functions: \n1. ffmpeg_frames extracts frames from a video using ffmpeg and saves them as jpg files in a specified folder, at the specified frame rate.\n2. ffmpeg_pcm extracts audio from a video and saves it as a PCM file.\n3. ffmpeg_mp4 downloads a video file from a URL to the local machine.\n4. get_images retrieves all image files in a directory, sorts them, and stores their paths in a list.", + "details": [ + { + "comment": "Code file \"preprocess.py\" contains four functions: \n1. ffmpeg_frames extracts frames from a video using ffmpeg and saves them as jpg files in a specified folder, at the specified frame rate.\n2. ffmpeg_pcm extracts audio from a video and saves it as a PCM file.\n3. ffmpeg_mp4 downloads a video file from a URL to the local machine.\n4. 
get_images retrieves all image files in a directory, sorts them, and stores their paths in a list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/preprocess.py\":0-35", + "content": "\"\"\" extract frames and pcm\"\"\"\nimport os\nimport sys\nimport shutil\ndef ffmpeg_frames(mp4_addr, frame_out_folder, fps=5):\n \"\"\"ffmpeg_frames\"\"\"\n if os.path.exists(frame_out_folder):\n shutil.rmtree(frame_out_folder)\n os.makedirs(frame_out_folder)\n cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (\n mp4_addr, fps, frame_out_folder, '%08d')\n os.system(cmd)\ndef ffmpeg_pcm(mp4_addr, save_file_name):\n \"\"\"ffmpeg_pcm\"\"\"\n cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \\\n % (mp4_addr, save_file_name)\n os.system(cmd)\ndef ffmpeg_mp4(mp4_url, mp4_addr):\n \"\"\"ffmpeg_mp4\"\"\"\n cmd = \"wget %s -O %s -q\" % (mp4_url, mp4_addr)\n print(\"cmd = \", cmd)\n os.system(cmd)\ndef get_images(image_path):\n \"\"\"get_images\"\"\"\n images = sorted(os.listdir(image_path))\n images = images\n images_path_list = [image_path + '/' + im for im in images]\n return images_path_list" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6a1dd251-9bc5-473b-b8bc-e9eca8392b41.json b/docs/doc/6a1dd251-9bc5-473b-b8bc-e9eca8392b41.json new file mode 100644 index 000000000..0c95dc832 --- /dev/null +++ b/docs/doc/6a1dd251-9bc5-473b-b8bc-e9eca8392b41.json @@ -0,0 +1,25 @@ +{ + "summary": "The AttentionLSTM model is presented, using LSTMs and an Attention layer to weigh frame features. The code trains and tests on YouTube-8M with PaddleVideo, exporting the model for classification. It accurately predicts top-1 class 11 with 0.9841 confidence.", + "details": [ + { + "comment": "This code introduces the AttentionLSTM model, which utilizes two-way LSTMs to encode all video frame features and adds an Attention layer for adaptive weighting. This improves upon traditional methods by linearly weighing final feature vectors based on hidden state outputs at each moment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/attention_lstm.md\":0-18", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/attention_lstm.md) | English\n# AttentionLSTM\n## content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nRecurrent Neural Networks (RNN) are often used in the processing of sequence data, which can model the sequence information of multiple consecutive frames of video, and are commonly used methods in the field of video classification.\nThis model uses a two-way long and short-term memory network (LSTM) to encode all the frame features of the video in sequence. Unlike the traditional method that directly uses the output of the last moment of LSTM, this model adds an Attention layer, and the hidden state output at each moment has an adaptive weight, and then linearly weights the final feature vector. The reference paper implements a two-layer LSTM structure, while **this model implements a two-way LSTM with Attention**.\nThe Attention layer can refer to the paper [AttentionCluster](https://arxiv.org/abs/1711.09550)" + }, + { + "comment": "This code provides instructions on how to train and test a model using PaddleVideo's attention LSTM on the Youtube-8M dataset. 
It mentions the required commands for training and testing, and also states that 8 GPUs are used during the process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/attention_lstm.md\":20-44", + "content": "## Data\nPaddleVide provides training and testing scripts on the Youtube-8M dataset. Youtube-8M data download and preparation please refer to [YouTube-8M data preparation](../../dataset/youtube8m.md)\n## Train\n### Youtube-8M data set training\n#### Start training\n- The Youtube-8M data set uses 8 cards for training. In the feature format, video and audio features will be used as input. The training start command of the data is as follows\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml\n ```\n## Test\nThe command is as follows:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_attetion_lstm main.py --test -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml -w \"output/AttentionLSTM/AttentionLSTM_best.pdparams\"\n```\nWhen the test configuration uses the following parameters, the test indicators on the validation data set of Youtube-8M are as follows:" + }, + { + "comment": "This code provides instructions to export an inference model and use the prediction engine for it. The exported model will be stored as AttentionLSTM.pdmodel and AttentionLSTM.pdiparams files, which are necessary for making predictions. Users can use the tools/predict.py script with the input file data/example.pkl and the configuration file configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml to perform inference using the prediction engine.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/attention_lstm.md\":46-67", + "content": "| Hit@1 | PERR | GAP | checkpoints |\n| :-----: | :---------: | :---: | ----- |\n| 89.05 | 80.49 | 86.30 | [AttentionLSTM_yt8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams) |\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \\\n -p data/AttentionLSTM_yt8.pdparams \\\n -o inference/AttentionLSTM\n```\nThe above command will generate the model structure file `AttentionLSTM.pdmodel` and the model weight file `AttentionLSTM.pdiparams` required for prediction.\nFor the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0.0/docs/en/start.md#2-infer)\n### Use prediction engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.pkl \\\n --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \\" + }, + { + "comment": "This code executes the AttentionLSTM model for video classification on a specific file (data/example.pkl). The predicted top-1 class is 11, and the confidence is 0.9841002225875854. 
This result utilizes the model trained on YouTube-8M dataset, indicating its accuracy in video classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/attention_lstm.md\":68-83", + "content": " --model_file inference/AttentionLSTM/AttentionLSTM.pdmodel \\\n --params_file inference/AttentionLSTM/AttentionLSTM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nAn example of the output is as follows:\n```bash\nCurrent video file: data/example.pkl\n top-1 class: 11\n top-1 score: 0.9841002225875854\n```\nIt can be seen that using the AttentionLSTM model trained on Youtube-8M to predict data/example.pkl, the output top1 category id is 11, and the confidence is 0.98.\n## Reference paper\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6b1c905d-49db-4909-8ae6-bf47e973e703.json b/docs/doc/6b1c905d-49db-4909-8ae6-bf47e973e703.json new file mode 100644 index 000000000..aa604bb76 --- /dev/null +++ b/docs/doc/6b1c905d-49db-4909-8ae6-bf47e973e703.json @@ -0,0 +1,40 @@ +{ + "summary": "This code adapts ViT model parameters, modifies pos_embed and time_embed for compatibility, and includes functions for loading/saving PaddlePaddle models with parallel/non-parallel handling and progress bar.", + "details": [ + { + "comment": "The code is a function that converts pre-trained ViT model parameters to match the existing model. It takes in the model, state_dicts, num_patches, seg_num, and attention_type as arguments. The function adapts the ViT's pre-trained model parameters for better compatibility with the existing model structure.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py\":0-27", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport os.path as osp\nimport time\nfrom tqdm import tqdm\nimport paddle\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils import get_logger\nfrom EIVideo.paddlevideo.utils import main_only\ndef pretrain_vit_param_trans(model, state_dicts, num_patches, seg_num,\n attention_type):\n \"\"\"\n Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model" + }, + { + "comment": "This code modifies the 'pos_embed' tensor in state_dicts if its shape doesn't match the expected shape. It interpolates the other_pos_embed to fit the desired size, then concatenates the cls_pos_embed and new_pos_embed and updates the state_dicts['pos_embed']. 
This allows the code to maintain consistency in the 'pos_embed' tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py\":28-48", + "content": " \"\"\"\n if 'head' + '.weight' in state_dicts:\n del state_dicts['head' + '.weight']\n if 'head' + '.bias' in state_dicts:\n del state_dicts['head' + '.bias']\n total_len = len(model.state_dict())\n if num_patches + 1 != state_dicts['pos_embed'].shape[1]:\n pos_embed = state_dicts['pos_embed']\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0,\n 1:, :].unsqueeze(0).unsqueeze(1).transpose(\n (0, 1, 3, 2))\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(other_pos_embed.shape[-2],\n num_patches),\n mode='nearest')\n new_pos_embed = new_pos_embed.squeeze(0).transpose((0, 2, 1))\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1)\n state_dicts['pos_embed'] = new_pos_embed\n time.sleep(0.01)" + }, + { + "comment": "The code checks if a specific key 'time_embed' exists in the state_dicts and adjusts its shape accordingly. It then interpolates the time_embed using nearest mode and transposes it to fit into the new shape. After that, it creates a progress bar \"Loading weights\" using tqdm for the total length of data and sets the description as the current key being processed. If 'attn' is present in the key and 'blocks', it replaces 'attn' with 'temporal_attn' if not already present in state_dicts and adds it to new_state_dicts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py\":50-70", + "content": " if 'time_embed' in state_dicts and seg_num != state_dicts[\n 'time_embed'].shape[1]:\n time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(time_embed.shape[-2], seg_num),\n mode='nearest')\n state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose(\n (0, 2, 1))\n time.sleep(0.01)\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n if attention_type == 'divided_space_time':\n new_state_dicts = state_dicts.copy()\n for key in tqdm(state_dicts):\n if 'blocks' in key and 'attn' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('attn', 'temporal_attn')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]" + }, + { + "comment": "This code is checking if a certain key exists in the state_dict and creating a new key with 'temporal' added to it. It is also updating the description for loading weights and returning the updated state_dicts. 
The function pretrain_resnet18_param_trans compares loaded dict with encoder and pose_encoder dictionaries in the model, possibly for parameter transfer learning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py\":71-95", + "content": " else:\n new_state_dicts[new_key] = state_dicts[new_key]\n if 'blocks' in key and 'norm1' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('norm1', 'temporal_norm1')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]\n else:\n new_state_dicts[new_key] = state_dicts[new_key]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return new_state_dicts\ndef pretrain_resnet18_param_trans(model, loaded_dict):\n encoder_dict = model.encoder.state_dict()\n pose_encoder_dict = model.pose_encoder.state_dict()\n names = ['encoder.', 'encoder_day.', 'encoder_night.']\n for name in names:\n for key, value in loaded_dict.items():\n key = str(name + key)\n if key in encoder_dict:" + }, + { + "comment": "The code is defining a function to load pre-trained model parameters, which requires converting the parameters of the pre-trained model into the parameters needed for the current model. The function first checks if the weight path exists and raises an IOError if it does not. Then, it loads the state_dicts from the given weight_path using paddle.load().", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py\":96-126", + "content": " encoder_dict[key] = value\n num_input_images = 2\n loaded_dict['conv1.weight'] = paddle.concat(\n [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images\n for name, value in loaded_dict.items():\n name = str('encoder.' + name)\n if name in pose_encoder_dict:\n pose_encoder_dict[name] = value\n return encoder_dict, pose_encoder_dict\n#XXX(shipping): maybe need load N times because of different cards have different params.\n@main_only\ndef load_ckpt(model, weight_path, **kargs):\n \"\"\"\n 1. Load pre-trained model parameters\n 2. Extract and convert from the pre-trained model to the parameters\n required by the existing model\n 3. Load the converted parameters of the existing model\n \"\"\"\n #model.set_state_dict(state_dict)\n if not osp.isfile(weight_path):\n raise IOError(f'{weight_path} is not a checkpoint file')\n #state_dicts = load(weight_path)\n logger = get_logger(\"paddlevideo\")\n state_dicts = paddle.load(weight_path)" + }, + { + "comment": "This code is used to load weights for a model, specifically handling Resnet Encoder and Vision Transformer cases. For Resnet Encoder, it updates the state dictionary with separate dictionaries for encoder and pose_encoder. For Vision Transformer (TimeSformer), it uses pretrain_vit_param_trans function. If the model is neither of these types, it initializes an empty dictionary. The code also includes a tqdm progress bar to show the loading progress.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py\":127-151", + "content": " if 'ResnetEncoder' in str(model):\n encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans(\n model, state_dicts)\n tmp = model.state_dict()\n tmp.update(\n {'backbone.encoder.' + k: v\n for (k, v) in encoder_dict.items()})\n tmp.update({\n 'backbone.pose_encoder.' 
+ k: v\n for (k, v) in pose_encoder_dict.items()\n })\n elif \"VisionTransformer\" in str(model): # For TimeSformer case\n tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'],\n kargs['seg_num'],\n kargs['attention_type'])\n else:\n tmp = {}\n total_len = len(model.state_dict())\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for item in tqdm(model.state_dict(), total=total_len, position=0):\n name = item\n desc.set_description('Loading %s' % name)" + }, + { + "comment": "This code snippet defines functions for loading and saving PaddlePaddle models. It checks if the model is parallel or non-parallel, converts the state dictionaries accordingly, and updates the model's state dictionary. The `mkdir` function creates a directory if it doesn't exist already, and there are separate save and load functions defined for ease of use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py\":152-181", + "content": " if name not in state_dicts: # Convert from non-parallel model\n if str('backbone.' + name) in state_dicts:\n tmp[name] = state_dicts['backbone.' + name]\n else: # Convert from parallel model\n tmp[name] = state_dicts[name]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n model.set_state_dict(tmp)\ndef mkdir(dir):\n if not os.path.exists(dir):\n # avoid error when train with multiple gpus\n try:\n os.makedirs(dir)\n except:\n pass\n@main_only\ndef save(obj, path):\n paddle.save(obj, path)\ndef load(file_name):\n if not osp.isfile(file_name):\n raise IOError(f'{file_name} not exist')\n return paddle.load(file_name)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6b644553-9c32-4362-acb9-3a20852badfc.json b/docs/doc/6b644553-9c32-4362-acb9-3a20852badfc.json new file mode 100644 index 000000000..95eb92b2e --- /dev/null +++ b/docs/doc/6b644553-9c32-4362-acb9-3a20852badfc.json @@ -0,0 +1,25 @@ +{ + "summary": "This code imports modules and registers functions for building a video object detection model, as well as dynamically constructing components based on a configuration file.", + "details": [ + { + "comment": "Imports necessary modules and registers various components for video object detection model building.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/builder.py\":0-18", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT, SEGMENTERS\nfrom ..utils import build\nfrom .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS,\n DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES,\n MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)" + }, + { + "comment": "This code defines various building functions for different parts of a model. The \"build_backbone\" function builds the backbone of the model, while \"build_roi_extractor\", \"build_assigner\", and \"build_sampler\" build the region of interest extractor, box assigner, and box sampler respectively. \"build_head\" builds the head of the model, \"build_loss\" builds the loss function, and \"build_recognizer\" and \"build_segmenter\" build recognizers and segmenters with different frameworks or keys. The functions use the \"build\" method from an unspecified source (\"*\") to perform the actual building process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/builder.py\":21-72", + "content": "def build_backbone(cfg):\n \"\"\"Build backbone.\"\"\"\n return build(cfg, BACKBONES)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_head(cfg):\n \"\"\"Build head.\"\"\"\n return build(cfg, HEADS)\ndef build_loss(cfg):\n \"\"\"Build loss.\"\"\"\n return build(cfg, LOSSES)\ndef build_recognizer(cfg):\n \"\"\"Build recognizer.\"\"\"\n return build(cfg, RECOGNIZERS, key='framework')\ndef build_segmenter(cfg):\n \"\"\"Build segmenter.\"\"\"" + }, + { + "comment": "The code is a builder function that dynamically builds various components (recognizers, localizers, detectors, partitioners, estimators, and multimodal) based on the specified framework type in the configuration file. 
It utilizes the 'build' function to return the appropriate component for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/builder.py\":73-115", + "content": " return build(cfg, SEGMENTERS, key='framework')\ndef build_localizer(cfg):\n \"\"\"Build localizer.\"\"\"\n return build(cfg, LOCALIZERS, key='framework')\ndef build_detector(cfg, train_cfg=None, test_cfg=None):\n \"\"\"Build detector.\"\"\"\n return build(cfg, DETECTORS, key='framework')\ndef build_partitioner(cfg):\n \"\"\"Build partitioner.\"\"\"\n return build(cfg, PARTITIONERS, key='framework')\ndef build_estimator(cfg):\n \"\"\"Build estimator.\"\"\"\n return build(cfg, ESTIMATORS, key='framework')\ndef build_multimodal(cfg):\n \"\"\"Build multimodal.\"\"\"\n return build(cfg, MULTIMODAL, key='framework')\ndef build_segment(cfg):\n \"\"\"Build segment.\"\"\"\n return build(cfg, SEGMENT, key='framework')\ndef build_model(cfg):\n cfg_copy = cfg.copy()\n framework_type = cfg_copy.get('framework')\n if framework_type in RECOGNIZERS:\n return build_recognizer(cfg)\n elif framework_type in LOCALIZERS:\n return build_localizer(cfg)\n elif framework_type in PARTITIONERS:\n return build_partitioner(cfg)\n elif framework_type in DETECTORS:" + }, + { + "comment": "This code selects a specific function to build based on the framework type. It checks if the framework type is in defined lists of detectors, estimators, multimodal models, segmenters, or segments. If none match, it raises NotImplementedError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/builder.py\":116-126", + "content": " return build_detector(cfg)\n elif framework_type in ESTIMATORS:\n return build_estimator(cfg)\n elif framework_type in MULTIMODAL:\n return build_multimodal(cfg)\n elif framework_type in SEGMENTERS:\n return build_segmenter(cfg)\n elif framework_type in SEGMENT:\n return build_segment(cfg)\n else:\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6b83d6d1-3f2f-49f7-88b0-8b6ffdd861c6.json b/docs/doc/6b83d6d1-3f2f-49f7-88b0-8b6ffdd861c6.json new file mode 100644 index 000000000..2b4abee05 --- /dev/null +++ b/docs/doc/6b83d6d1-3f2f-49f7-88b0-8b6ffdd861c6.json @@ -0,0 +1,30 @@ +{ + "summary": "This code trains and evaluates the MS-TCN video action segmentation model using provided datasets, compares performance with PaddleVideo's MSTCN, exports inference models, uses metrics like accuracy and F1 score, and runs with GPU usage enabled.", + "details": [ + { + "comment": "Introduction: MS-TCN model for video motion segmentation was published in 2019 and optimized for higher precision results in PaddleVideo.\nData: Choose from 50salads, breakfast, gtea datasets for training. Refer to Video Action Segmentation dataset download and preparation doc.\nTrain: After preparing the dataset, run scripts with provided command example.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/mstcn.md\":0-34", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/segmentation/mstcn.md) | English\n# MS-TCN : Video Action Segmentation Model\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nMs-tcn model is a classic model of video motion segmentation model, which was published on CVPR in 2019. 
We optimized the officially implemented pytorch code and obtained higher precision results in paddlevideo.\n

\n
\nMS-TCN Overview\n

\n## Data\nMS-TCN can choose 50salads, breakfast, gtea as trianing set. Please refer to Video Action Segmentation dataset download and preparation doc [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md)\n## Train\nAfter prepare dataset, we can run sprits.\n```bash\n# gtea dataset\nexport CUDA_VISIBLE_DEVICES=3\npython3.7 main.py --validate -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --seed 1538574472" + }, + { + "comment": "The code snippet provides instructions for training and testing the video action segmentation model, specifically MSTCN. It mentions that single sample training is supported, and demonstrates how to test MSTCN on a dataset using the provided command line or script program. Additionally, it explains the evaluation method used for datasets and refers to the author's provided evaluation script.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/mstcn.md\":35-51", + "content": "```\n- Start the training by using the above command line or script program. There is no need to use the pre training model. The video action segmentation model is usually a full convolution network. Due to the different lengths of videos, the `DATASET.batch_size` of the video action segmentation model is usually set to `1`, that is, batch training is not required. At present, only **single sample** training is supported.\n## Test\nTest MS-TCN on dataset scripts:\n```bash\npython main.py --test -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --weights=./output/MSTCN/MSTCN_split_1.pdparams\n```\n- The specific implementation of the index is to calculate ACC, edit and F1 scores by referring to the test script[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the author of ms-tcn.\n- The evaluation method of data set adopts the folding verification method in ms-tcn paper, and the division method of folding is the same as that in ms-tcn paper.\nAccuracy on Breakfast dataset(4 folding verification):" + }, + { + "comment": "This table compares the performance of a paper model and PaddleVideo's MSTCN model on different datasets. The metrics include accuracy (Acc), edit distance, and F1 score (F1@0.1, F1@0.25, F1@0.5). The models are validated with 5-fold cross-validation on the 50salads dataset and 4-fold on the gtea dataset. 
The provided checkpoints are for gtea dataset splits.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/mstcn.md\":53-77", + "content": "| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 66.3% | 61.7% | 48.1% | 48.1% | 37.9% |\n| paddle | 65.2% | 61.5% | 53.7% | 49.2% | 38.8% |\nAccuracy on 50salads dataset(5 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 80.7% | 67.9% | 76.3% | 74.0% | 64.5% |\n| paddle | 81.1% | 71.5% | 77.9% | 75.5% | 66.5% |\nAccuracy on gtea dataset(4 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 79.2% | 81.4% | 87.5% | 85.4% | 74.6% |\n| paddle | 76.9% | 81.8% | 86.4% | 84.7% | 74.8% |\nModel weight for gtea\nTest_Data| F1@0.5 | checkpoints |\n| :----: | :----: | :---- |\n| gtea_split1 | 70.2509 | [MSTCN_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_1.pdparams) |\n| gtea_split2 | 70.7224 | [MSTCN_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_2.pdparams) |" + }, + { + "comment": "This code provides instructions for exporting and using an inference model. The `export_model.py` script is used to create the architecture file (`MSTCN.pdmodel`) and parameters file (`MSTCN.pdiparams`). These files can be obtained by running the script with the given configuration file, pre-trained parameters file path, and output directory. The inference process involves providing a list of input files in the format `S1__C1.npy`. To execute the inference, run the `predict.py` script with the input file list as an argument.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/mstcn.md\":78-107", + "content": "| gtea_split3 | 80.0 | [MSTCN_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_3.pdparams) |\n| gtea_split4 | 78.1609 | [MSTCN_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_4.pdparams) |\n## Infer\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \\\n -p data/MSTCN_gtea_split_1.pdparams \\\n -o inference/MSTCN\n```\nTo get model architecture file `MSTCN.pdmodel` and parameters file `MSTCN.pdiparams`, use:\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\nInput file are the file list for infering, for example:\n```\nS1_Cheese_C1.npy\nS1_CofHoney_C1.npy\nS1_Coffee_C1.npy\nS1_Hotdog_C1.npy\n...\n```\n```bash\npython3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \\" + }, + { + "comment": "The code is specifying the configuration file, model file, and parameter file for running the MSTCN (Multi-Stage Temporal Convolutional Network) segmentation model. 
It also sets the GPU usage to True and TensorRT to False.\nExample logs show the results being written into respective text files in the inference/infer_results folder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/mstcn.md\":108-129", + "content": " --config configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \\\n --model_file inference/MSTCN/MSTCN.pdmodel \\\n --params_file inference/MSTCN/MSTCN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```bash\nresult write in : ./inference/infer_results/S1_Cheese_C1.txt\nresult write in : ./inference/infer_results/S1_CofHoney_C1.txt\nresult write in : ./inference/infer_results/S1_Coffee_C1.txt\nresult write in : ./inference/infer_results/S1_Hotdog_C1.txt\nresult write in : ./inference/infer_results/S1_Pealate_C1.txt\nresult write in : ./inference/infer_results/S1_Peanut_C1.txt\nresult write in : ./inference/infer_results/S1_Tea_C1.txt\n```\n## Reference\n- [MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation](https://arxiv.org/pdf/1903.01945.pdf), Y. Abu Farha and J. Gall." + } + ] +} \ No newline at end of file diff --git a/docs/doc/6cc7ca89-cbfe-416d-b637-65338839e3f4.json b/docs/doc/6cc7ca89-cbfe-416d-b637-65338839e3f4.json new file mode 100644 index 000000000..7b13017ba --- /dev/null +++ b/docs/doc/6cc7ca89-cbfe-416d-b637-65338839e3f4.json @@ -0,0 +1,20 @@ +{ + "summary": "The code is a part of PaddleVideo's TableTennis application, containing an AttrDict class and parse_config function for parsing YAML configuration files using yaml and ast libraries. It also imports the logger module for logging purposes, and logs a separator string to indicate context changes.", + "details": [ + { + "comment": "The code is part of the PaddleVideo TableTennis application and contains a class called AttrDict that extends the Python dictionary functionality. The file also includes the parse_config function, which likely reads and parses configuration files. The code uses the yaml and ast libraries for processing configuration data in a format-agnostic manner. Additionally, it defines a list of configuration types (CONFIG_SECS) and utilizes the logger module for logging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/config_utils.py\":0-46", + "content": "\"\"\"\nconfig_utils\n\"\"\"\n# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport yaml\nimport ast\nimport logger\nlogger = logger.Logger()\nCONFIG_SECS = [\n 'train',\n 'valid',\n 'test',\n 'infer',\n]\nclass AttrDict(dict):\n \"\"\"\n AttrDict\n \"\"\"\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef parse_config(cfg_file):" + }, + { + "comment": "This code imports the yaml library and loads a configuration file into an AttrDict object, allowing for easier manipulation of nested dictionary data. It also includes functions to create an AttrDict from a string and print the configurations in a formatted manner.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/config_utils.py\":47-79", + "content": " \"\"\"Load a config file into AttrDict\"\"\"\n import yaml\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef create_attr_dict(yaml_config):\n \"\"\"create_attr_dict\"\"\"\n for key, value in yaml_config.items():\n if isinstance(value, dict):\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = ast.literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\n return\ndef print_configs(cfg, mode):\n \"\"\"print_configs\"\"\"\n logger.info(\n \"---------------- {:>5} Arguments ----------------\".format(mode))\n for sec, sec_items in cfg.items():\n logger.info(\"{}:\".format(sec))\n for k, v in sec_items.items():\n logger.info(\" {}:{}\".format(k, v))" + }, + { + "comment": "This code snippet logs a separator string to the logger, indicating a change in context or section within the program.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/config_utils.py\":80-80", + "content": " logger.info(\"-------------------------------------------------\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6d049a5b-64d9-43c3-9132-f86a22f43e35.json b/docs/doc/6d049a5b-64d9-43c3-9132-f86a22f43e35.json new file mode 100644 index 000000000..1cc3685ae --- /dev/null +++ b/docs/doc/6d049a5b-64d9-43c3-9132-f86a22f43e35.json @@ -0,0 +1,40 @@ +{ + "summary": "The PaddleVideo library's FrameDataset and FrameDataset_Sport classes load, transform, and process video data with error handling for missing or corrupted files under Apache License 2.0.", + "details": [ + { + "comment": "This code is a Python class for a FrameDataset, which loads raw frames from frame files and applies specified transform operations. It is part of the PaddleVideo library and follows Apache License 2.0. The dataset index file is used to organize the loaded data. 
This class inherits from BaseDataset, suggesting it has some common functionalities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass FrameDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The indecx file " + }, + { + "comment": "This code initializes a class for loading video information from an index file. The index file contains the directory of frames, total frames, and label for each video. It supports pipeline and data_prefix, and has options for test mode and suffix format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py\":30-60", + "content": "is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n suffix (str): suffix of file. Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"" + }, + { + "comment": "This code reads data from a file and returns information related to frames, such as the frame directory, suffix, number of frames, and labels. It also includes a try-catch block that attempts to prepare the frames for training or validation multiple times if an exception occurs while reading the frames files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py\":61-85", + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(frame_dir=frame_dir,\n suffix=self.suffix,\n frames_len=frames_len,\n labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid given index. 
\"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(" + }, + { + "comment": "The code handles exceptions for loading missing frames in the dataset. It tries to load frames multiple times within a specified range and logs errors when needed. If an exception occurs, it randomly selects another index and continues the process until successful.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py\":86-107", + "content": " \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])" + }, + { + "comment": "The code defines a FrameDataset_Sport class for loading raw videos and applying specified transforms. It uses an index file containing video file paths and labels, and takes arguments for file path, data transforms pipeline, retry attempts, and other BaseDataset kwargs. The load_file function reads the index file to obtain video information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py\":110-135", + "content": "@DATASETS.register()\nclass FrameDataset_Sport(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates\n a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:" + }, + { + "comment": "This code reads lines from a file, each representing a frame directory and associated information. It then splits the line into components and appends the frame directory and suffix to the info list. 
The prepare_train function attempts to process data for training or validation, handling exceptions by retrying up to a specified number of times before selecting another index at random.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py\":136-157", + "content": " for line in fin:\n line_split = line.strip().split()\n frame_dir = line_split[0]\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(dict(frame_dir=frame_dir, suffix=self.suffix))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)" + }, + { + "comment": "The function `prepare_test` is attempting to prepare data for testing by iterating through a certain number of retries in case of exceptions caused by corrupted video files. If an exception occurs, it logs the error and tries again with a different random index from the list of info. Once successful, it returns the images and labels as arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/frame.py\":158-176", + "content": " continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for test given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6d190f3b-28cc-4448-a3fc-ef0b99294dd5.json b/docs/doc/6d190f3b-28cc-4448-a3fc-ef0b99294dd5.json new file mode 100644 index 000000000..e0484fc63 --- /dev/null +++ b/docs/doc/6d190f3b-28cc-4448-a3fc-ef0b99294dd5.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is importing the recognizers module from the paddlevideo.modeling.framework package, and defining the BaseRecognizer and Recognizer2D classes as part of its API. The __all__ variable lists these two classes as the public API elements of this package.", + "details": [ + { + "comment": "This code is importing the recognizers module from the paddlevideo.modeling.framework package, and defining the BaseRecognizer and Recognizer2D classes as part of its API. The __all__ variable lists these two classes as the public API elements of this package.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py\":0-21", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .recognizers import BaseRecognizer, Recognizer2D\n__all__ = [\n    'BaseRecognizer',\n    'Recognizer2D',\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6de2121f-318f-4bea-a0d9-5d3a9b77d550.json b/docs/doc/6de2121f-318f-4bea-a0d9-5d3a9b77d550.json new file mode 100644 index 000000000..45eb797bb --- /dev/null +++ b/docs/doc/6de2121f-318f-4bea-a0d9-5d3a9b77d550.json @@ -0,0 +1,40 @@ +{ + "summary": "The Python class VOS_Test extends paddle.io.Dataset for video object segmentation tasks and supports a transform pipeline, an RGB color option, and resizing. The Davis 2017 dataset is initialized in PaddleVideo and returns a sequence dataset with images, labels, and a fixed resolution of 480 pixels.", + "details": [ + { + "comment": "This code snippet is from PaddleVideo's davis_dataset.py file and defines a Python class named VOS_Test, which extends paddle.io.Dataset. The class is used for processing the frames of each video in a dataset. It takes image_root and label_root as input parameters for accessing the required data. A logger is obtained via get_logger from paddlevideo's utils package to log any relevant information during execution. This dataset is designed for video object segmentation (VOS) tasks, commonly used in computer vision applications.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py\":0-36", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport shutil\nfrom PIL import Image\nimport cv2\nfrom paddle.io import Dataset\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\nclass VOS_Test(Dataset):\n    \"\"\"process frames in each video\n    \"\"\"\n    def __init__(self,\n                 image_root,\n                 label_root," + }, + { + "comment": "This code initializes the dataset with image and label file lists, image root and label root paths. 
It sets object number, total frames, pipeline mode, color type, resolution, creates an object numbers list, and assigns object numbers from labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py\":37-65", + "content": " seq_name,\n images,\n labels,\n pipeline=None,\n rgb=False,\n resolution=None):\n self.image_root = image_root\n self.label_root = label_root\n self.seq_name = seq_name\n self.images = images # image file list\n self.labels = labels\n self.obj_num = 1\n self.num_frame = len(self.images)\n self.pipeline = pipeline\n self.rgb = rgb\n self.resolution = resolution\n self.obj_nums = []\n temp_obj_num = 0\n for img_name in self.images:\n self.obj_nums.append(temp_obj_num)\n current_label_name = img_name.split('.')[0] + '.png'\n if current_label_name in self.labels:\n current_label = self.read_label(current_label_name)\n if temp_obj_num < np.unique(\n current_label)[-1]: #get object number from label_id\n temp_obj_num = np.unique(current_label)[-1]\n def __len__(self):" + }, + { + "comment": "This code defines a class that loads data from the DAVIS dataset. It first returns the number of images in the dataset, then reads an image at a given index, and finally reads a corresponding label for the image. The class also allows resizing the images to a specified resolution if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py\":66-93", + "content": " return len(self.images)\n def read_image(self, idx):\n img_name = self.images[idx]\n img_path = os.path.join(self.image_root, self.seq_name, img_name)\n img = cv2.imread(img_path)\n img = np.array(img, dtype=np.float32)\n if self.rgb:\n img = img[:, :, [2, 1, 0]]\n return img\n def read_label(self, label_name):\n label_path = os.path.join(self.label_root, self.seq_name, label_name)\n label = Image.open(label_path)\n label = np.array(label, dtype=np.uint8)\n return label\n def __getitem__(self, idx):\n img_name = self.images[idx]\n current_img = self.read_image(idx)\n current_img = np.array(current_img)\n height, width, channels = current_img.shape\n if self.resolution is not None:\n width = int(np.ceil(float(width) * self.resolution / float(height)))\n height = int(self.resolution)\n current_label_name = img_name.split('.')[0] + '.png'\n obj_num = self.obj_nums[idx]" + }, + { + "comment": "The function generates a sample for a dataset, including image and label data. 
It checks if the current_label_name is in labels, reads the label if present, creates a sample dictionary, adds metadata to the sample, applies a pipeline if one is specified, and converts 'current_img' to numpy array format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py\":95-126", + "content": " if current_label_name in self.labels:\n current_label = self.read_label(current_label_name)\n current_label = np.array(current_label)\n sample = {\n 'current_img': current_img,\n 'current_label': current_label\n }\n else:\n sample = {\n 'current_img': current_img\n } #only the first frame contains label\n sample['meta'] = {\n 'seq_name': self.seq_name,\n 'frame_num': self.num_frame,\n 'obj_num': obj_num,\n 'current_name': img_name,\n 'height': height,\n 'width': width,\n 'flip': False\n }\n if self.pipeline is not None:\n sample = self.pipeline(sample)\n for s in sample:\n s['current_img'] = np.array(s['current_img'])\n if 'current_label' in s.keys():\n s['current_label'] = s['current_label']\n return sample\n@DATASETS.register()\nclass DavisDataset(BaseDataset):" + }, + { + "comment": "The code represents the initialization and file loading process for the Davis 2017 dataset in PaddleVideo. The constructor takes various parameters like file path, result root, pipeline, data prefix, test mode, year, rgb, and resolution to initialize the class attributes. The load_file() method sets image and label roots based on the provided resolution and reads the sequence names from a specified file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py\":127-157", + "content": " \"\"\"Davis 2017 dataset.\n \"\"\"\n def __init__(\n self,\n file_path,\n result_root,\n pipeline,\n data_prefix=None,\n test_mode=False,\n year=2017,\n rgb=False,\n resolution='480p',\n ):\n self.rgb = rgb\n self.result_root = result_root\n self.resolution = resolution\n self.year = year\n self.spt = 'val' if test_mode else 'train'\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n self.image_root = os.path.join(self.file_path, 'JPEGImages',\n self.resolution)\n self.label_root = os.path.join(self.file_path, 'Annotations',\n self.resolution)\n seq_names = []\n with open(\n os.path.join(self.file_path, 'ImageSets', str(self.year),\n self.spt + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))" + }, + { + "comment": "This function prepares a test dataset for the VOS task. It retrieves the video name from the info list, then lists all image files in the corresponding directory and adds the first frame as the target label. 
If the target label does not exist in the result directory, it creates the necessary directories and copies the target label file to the correct location.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py\":158-181", + "content": " seq_names.extend(seqs_tmp)\n self.info = list(np.unique(seq_names))\n return self.info\n def prepare_test(self, idx):\n seq_name = self.info[idx] #video name\n images = list(\n np.sort(os.listdir(os.path.join(self.image_root, seq_name))))\n labels = [images[0].replace('jpg', 'png')] #we have first frame target\n # copy first frame target\n if not os.path.isfile(\n os.path.join(self.result_root, seq_name, labels[0])):\n if not os.path.exists(os.path.join(self.result_root, seq_name)):\n os.makedirs(os.path.join(self.result_root, seq_name))\n source_label_path = os.path.join(self.label_root, seq_name,\n labels[0])\n result_label_path = os.path.join(self.result_root, seq_name,\n labels[0])\n shutil.copy(source_label_path, result_label_path)\n seq_dataset = VOS_Test(self.image_root,\n self.label_root," + }, + { + "comment": "This code is returning a sequence dataset named seq_name with associated images and labels, processed by the pipeline function specified, potentially using RGB format if self.rgb is True, and a fixed resolution of 480 pixels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/davis_dataset.py\":182-188", + "content": " seq_name,\n images,\n labels,\n self.pipeline,\n rgb=self.rgb,\n resolution=480)\n return seq_dataset" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6e1d2d2e-6ddf-4b67-9f49-231e705fd17f.json b/docs/doc/6e1d2d2e-6ddf-4b67-9f49-231e705fd17f.json new file mode 100644 index 000000000..a15bdf0aa --- /dev/null +++ b/docs/doc/6e1d2d2e-6ddf-4b67-9f49-231e705fd17f.json @@ -0,0 +1,10 @@ +{ + "summary": "This is a Chinese comment for an interactive video annotation tool's Command Line Interface (CLI). It mentions installing the \"scikit-image\" package, running the program in inference mode using a specific configuration and model file, and provides a reference document link.", + "details": [ + { + "comment": "This is a Chinese comment for an interactive video annotation tool's Command Line Interface (CLI). 
It mentions installing the \"scikit-image\" package, running the program in inference mode using a specific configuration and model file, and provides a reference document link.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/README.MD\":0-14", + "content": "# \u4ea4\u4e92\u5f0f\u89c6\u9891\u667a\u80fd\u6807\u6ce8\u5de5\u5177 - CLI(Command Line Interface)\n\u5728\u5f00\u59cb\u4f7f\u7528\u4e4b\u524d\uff0c\u60a8\u9700\u8981\u6309\u7167\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5\u989d\u5916\u7684\u4f9d\u8d56\u5305\uff1a\n```bash\npython -m pip install scikit-image\n```\n## \u63a8\u7406\u8fd0\u884c\u65b9\u5f0f\n```shell\nC:\\Python\\Python37\\python.exe main.py --test -c E:/PaddlePaddle_Project/EIVideo/resources/backend/configs/manet.yaml -w E:/PaddlePaddle_Project/EIVideo/resources/backend/model/save_step_80000.pdparams\nC:\\Python\\Python37\\python.exe resources/backend/main.py --test -c E:/PaddlePaddle_Project/EIVideo/resources/backend/configs/manet.yaml -w E:/PaddlePaddle_Project/EIVideo/resources/backend/model/save_step_80000.pdparams\n```\n## \u53c2\u8003\u6587\u6863\n[manet](docs/zh-CN/manet.md)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6e9bbbd2-a926-421a-bbb0-ba8383f3362a.json b/docs/doc/6e9bbbd2-a926-421a-bbb0-ba8383f3362a.json new file mode 100644 index 000000000..ebd886a2c --- /dev/null +++ b/docs/doc/6e9bbbd2-a926-421a-bbb0-ba8383f3362a.json @@ -0,0 +1,50 @@ +{ + "summary": "The code introduces a RecognizerDistillation class for recognizer distillation in PaddleVideo's framework, and includes model selection, modes like training and validation, loss functions, accuracy functions, and forward pass capabilities.", + "details": [ + { + "comment": "This code defines a RecognizerDistillation class that inherits from nn.Layer in PaddleVideo's framework. It implements recognizer distillation, which is a machine learning framework for object recognition tasks. The class takes optional arguments such as freeze_params_list (a list to set models trainable/not), models, and loss. It is registered under RECOGNIZERS and uses logger from paddlevideo's utils.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nfrom ...registry import RECOGNIZERS\nfrom ... import builder\nfrom paddlevideo.utils import get_logger, get_dist_info\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerDistillation(nn.Layer):\n \"\"\"recognizer Distillation framework.\"\"\"\n def __init__(self,\n freeze_params_list=None,\n models=None,\n loss=None,\n **kargs):\n \"\"\"\n Args:\n freeze_params_list: list, set each model is trainable or not" + }, + { + "comment": "This code initializes an instance of a distillation model. It takes in a list of models and loss configurations, as well as a freeze_params_list (optional). 
It checks the lengths of the input lists, builds teacher and student models, and initializes backbone and head if they exist in the configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":34-59", + "content": " models: config of distillaciton model.\n loss: config of loss list\n \"\"\"\n super().__init__()\n self.model_list = []\n self.model_name_list = []\n self.loss_cfgs = loss\n if freeze_params_list is None:\n freeze_params_list = [False] * len(models)\n assert len(freeze_params_list) == len(models)\n # build Teacher and Student model\n for idx, model_config in enumerate(models):\n assert len(model_config) == 1\n key = list(model_config.keys())[0] #Teacher or Student\n model_config = model_config[key]\n model_name = model_config['backbone']['name']\n backbone, head = None, None\n if model_config.get('backbone'):\n backbone = builder.build_backbone(model_config['backbone'])\n if hasattr(backbone, 'init_weights'):\n backbone.init_weights()\n if model_config.get('head'):\n head = builder.build_head(model_config['head'])" + }, + { + "comment": "Builds a distillation model by appending a head to the backbone, initializes weights for the head if possible, and sets trainable parameters based on freeze_params_list. Constructs loss functions using builder.build_loss().", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":60-84", + "content": " if hasattr(head, 'init_weights'):\n head.init_weights()\n model = nn.Sequential(backbone, head)\n logger.info('build distillation {} model done'.format(key))\n # for add all parameters in nn.Layer class\n self.model_list.append(self.add_sublayer(key, model))\n self.model_name_list.append({model_name: key})\n # set model trainable or not\n if freeze_params_list[idx]:\n for param in model.parameters():\n param.trainable = False\n # build loss: support for loss list\n self.loss_func_list = []\n mode_keys = list(loss.keys())\n for mode in mode_keys:\n loss_cfgs = loss[mode]\n for loss_cfg in loss_cfgs:\n loss_func_dict = {}\n model_name_pairs = loss_cfg.pop('model_name_pairs')\n loss_func = builder.build_loss(loss_cfg)\n loss_func_dict['mode'] = mode\n loss_func_dict['loss_func'] = loss_func" + }, + { + "comment": "This code defines a class for handling different modes of operation (train, valid, test, infer) and includes methods to handle each mode. It also contains a method to calculate the loss based on output and labels in 'Train' or 'Val' mode. The code is likely used in a model framework and the class might be used to control the flow and operations of the model depending on the mode it runs in.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":85-113", + "content": " loss_func_dict['model_name_pairs'] = model_name_pairs\n self.loss_func_list.append(loss_func_dict)\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n def get_loss(self, output, labels, mode):\n \"\"\"\n Args:\n output: dict, output name and its value\n labels: label of data\n mode: str, 'Train' or 'Val'\n \"\"\"\n output['GroundTruth'] = labels\n loss_list = []" + }, + { + "comment": "This code is iterating over a list of loss function dictionaries to find the appropriate loss function based on the input mode. It then calculates the loss value and appends it to a list. Finally, it adds up all the loss values to get the total loss. In the `get_acc` method, it defines an inner function that calculates top-1 and top-5 accuracy scores using PaddlePaddle's `metric.accuracy` function. It also handles multi-card validation by reducing the sum of top-1 and top-5 accuracy scores across multiple cards.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":115-135", + "content": " for loss_func_dict in self.loss_func_list:\n if mode == loss_func_dict['mode']:\n model_name_pairs = loss_func_dict['model_name_pairs']\n loss_func = loss_func_dict['loss_func']\n loss_val = loss_func(output[model_name_pairs[0]],\n output[model_name_pairs[1]])\n loss_list.append(loss_val)\n total_loss = paddle.add_n(loss_list)\n return total_loss\n def get_acc(self, scores, labels, mode='Train'):\n def _get_acc(score, label, mode='Train'):\n top1 = paddle.metric.accuracy(input=score, label=label, k=1)\n top5 = paddle.metric.accuracy(input=score, label=label, k=5)\n _, world_size = get_dist_info()\n # Deal with multi cards validate\n if world_size > 1 and mode == 'Val': #reduce sum when valid\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / world_size\n top5 = paddle.distributed.all_reduce(" + }, + { + "comment": "The code snippet contains a recognizerDistillation function, which calculates accuracy based on given scores and labels. It also includes a forward_model function for reshaping images and applying model operations. 
The train_step function defines the training process from input to output, including loss metrics calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":136-164", + "content": " top5, op=paddle.distributed.ReduceOp.SUM) / world_size\n return top1, top5\n if len(labels) == 1:\n label = labels[0]\n return _get_acc(scores, label)\n # Deal with VideoMix\n elif len(labels) == 3:\n label_a, label_b, lam = labels\n top1a, top5a = _get_acc(scores, label_a, mode)\n top1b, top5b = _get_acc(scores, label_b, mode)\n top1 = lam * top1a + (1 - lam) * top1b\n top5 = lam * top5a + (1 - lam) * top5b\n return top1, top5\n def forward_model(self, imgs, model_name, model):\n if model_name in ['PPTSM_v2', 'ResNetTweaksTSM']:\n # [N,T,C,H,W] -> [N*T,C,H,W]\n imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:]))\n return model(imgs)\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n out = {}\n loss_metrics = {}\n imgs = data_batch[0]\n labels = data_batch[1:]" + }, + { + "comment": "This code defines a class that implements a recognizer for distillation. The model takes in an image and a model name, and returns the output from both the student and teacher models. It calculates loss using the student and teacher outputs, as well as top-1 and top-5 accuracy metrics from the student's output only. This is used for both training (train_step) and validation (val_step). The class utilizes a list of model names and corresponding models for both types, student and teacher, and iterates over them to apply the forward pass and calculate loss and metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":166-192", + "content": " for idx, item in enumerate(self.model_name_list):\n model = self.model_list[idx]\n model_name = list(item.keys())[0]\n model_type = item[model_name] # Teacher or Student\n out[model_type] = self.forward_model(imgs, model_name, model)\n # out_student, out_teacher\n loss = self.get_loss(out, labels, 'Train')\n loss_metrics['loss'] = loss\n # calculate acc with student output\n top1, top5 = self.get_acc(out['Student'], labels)\n loss_metrics['top1'] = top1\n loss_metrics['top5'] = top5\n return loss_metrics\n def val_step(self, data_batch):\n out = {}\n loss_metrics = {}\n imgs = data_batch[0]\n labels = data_batch[1:]\n for idx, item in enumerate(self.model_name_list):\n model = self.model_list[idx]\n model_name = list(item.keys())[0]\n model_type = item[model_name] # Teacher or Student\n out[model_type] = self.forward_model(imgs, model_name, model)" + }, + { + "comment": "In this code snippet, the get_loss and get_acc functions are used to calculate loss and accuracy metrics for a \"Student\" model. The test_step function tests the Student model using forward_model function, and the infer_step function is not implemented here. 
This code seems related to evaluating the performance of a student model in image recognition tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":194-223", + "content": "        # Loss of student with gt: out_student, label\n        loss = self.get_loss(out, labels, 'Val')\n        loss_metrics['loss'] = loss\n        top1, top5 = self.get_acc(out['Student'], labels, 'Val')\n        loss_metrics['top1'] = top1\n        loss_metrics['top5'] = top5\n        return loss_metrics\n    def test_step(self, data_batch):\n        \"\"\"Define how the model is going to test, from input to output.\"\"\"\n        imgs = data_batch[0]\n        # Use Student to test\n        for idx, item in enumerate(self.model_name_list):\n            model = self.model_list[idx]\n            model_name = list(item.keys())[0]\n            model_type = item[model_name]  # Teacher or Student\n            if model_type == \"Student\":\n                out = self.forward_model(imgs, model_name, model)\n        return out\n    def infer_step(self, data_batch):\n        \"\"\"Define how the model is going to test, from input to output.\"\"\"\n        imgs = data_batch[0]\n        # Use Student to infer\n        for idx, item in enumerate(self.model_name_list):" + }, + { + "comment": "The code selects a model from the model_list based on the idx, and assigns its name to model_name. If the model type is \"Student\", it calls the forward_model function, passing imgs, model_name, and model as parameters, and returns the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py\":224-230", + "content": "            model = self.model_list[idx]\n            model_name = list(item.keys())[0]\n            model_type = item[model_name]  # Teacher or Student\n            if model_type == \"Student\":\n                out = self.forward_model(imgs, model_name, model)\n        return out" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6f66441f-8644-4395-8219-f58f193d243f.json b/docs/doc/6f66441f-8644-4395-8219-f58f193d243f.json new file mode 100644 index 000000000..87a254f82 --- /dev/null +++ b/docs/doc/6f66441f-8644-4395-8219-f58f193d243f.json @@ -0,0 +1,35 @@ +{ + "summary": "The \"AddsHead\" class in PaddleVideo implements the head of the ADDS self-supervised depth-estimation model, computing the loss during training and depth metrics such as abs_rel and rmse during evaluation, while supporting multi-GPU scenarios with all-reduce operations.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library, defining a class called AddsHead for self-supervised depth estimation (ADDS). Its docstring describes the number of input channels and the number of classes, and the class is registered in the registry under HEADS. The MIN_DEPTH and MAX_DEPTH constants define the minimum and maximum depth values respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/adds_head.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport cv2\nimport numpy as np\nimport paddle.nn as nn\nfrom paddlevideo.utils import get_dist_info\nimport paddle\nfrom ..builder import build_loss\nfrom ..registry import HEADS\nMIN_DEPTH = 1e-3\nMAX_DEPTH = 80\n@HEADS.register()\nclass AddsHead(nn.Layer):\n \"\"\"TimeSformerHead Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature." + }, + { + "comment": "The code represents the initialization and forward pass of a class named \"AddsHead\". The class takes in parameters like avg_reprojection, disparity_smoothness, no_ssim, etc. It builds a loss function using build_loss method with the provided configuration (loss_cfg). During training, it returns the result of the loss function on inputs and outputs. In inference mode, it uses get_metrics method to calculate metrics such as abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/adds_head.py\":33-61", + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n avg_reprojection,\n disparity_smoothness,\n no_ssim,\n loss_cfg=dict(name='ADDSLoss'),\n max_gt_depth=60,\n pred_depth_scale_factor=1):\n super(AddsHead, self).__init__()\n loss_cfg['avg_reprojection'] = avg_reprojection\n loss_cfg['disparity_smoothness'] = disparity_smoothness\n loss_cfg['no_ssim'] = no_ssim\n self.max_gt_depth = max_gt_depth\n self.pred_depth_scale_factor = pred_depth_scale_factor\n self.loss_func = build_loss(loss_cfg)\n def forward(self):\n raise NotImplemented\n def loss(self, inputs, outputs):\n if self.training:\n return self.loss_func(inputs, outputs)\n else:\n abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.get_metrics(" + }, + { + "comment": "This code snippet defines an \"AddsHead\" class that returns a dictionary of metrics including absolute relative error, squared relative error, root mean square error, and additional error measures. 
The get_metrics function resizes the predicted displacement to match the ground truth depth, scales and adjusts the predicted depth based on certain factors, and then computes the specified errors using another function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/adds_head.py\":62-94", + "content": " outputs['pred_disp'], outputs['gt'])\n outputs['abs_rel'] = abs_rel\n outputs['sq_rel'] = sq_rel\n outputs['rmse'] = rmse\n outputs['rmse_log'] = rmse_log\n outputs['a1'] = a1\n outputs['a2'] = a2\n outputs['a3'] = a3\n return outputs\n def get_metrics(self, pred_disp, gt_depth):\n gt_height, gt_width = gt_depth.shape[:2]\n pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))\n pred_depth = 1 / pred_disp\n mask = gt_depth > 0\n pred_depth = pred_depth[mask]\n gt_depth = gt_depth[mask]\n pred_depth *= self.pred_depth_scale_factor\n ratio = np.median(gt_depth) / np.median(pred_depth)\n pred_depth *= ratio\n pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH\n pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH\n mask2 = gt_depth <= self.max_gt_depth\n pred_depth = pred_depth[mask2]\n gt_depth = gt_depth[mask2]\n abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.compute_errors(" + }, + { + "comment": "This code is performing all-reduce operations on tensors for multi-GPU scenarios, ensuring that the sum of tensor values across GPUs is reduced and then divided by the total number of participating GPUs. This allows for accurate averaging of results when working with multiple GPUs in a distributed environment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/adds_head.py\":95-116", + "content": " gt_depth, pred_depth)\n _, world_size = get_dist_info()\n if world_size > 1:\n # educe sum when valid\n # TODO: there are some problems with multi gpu gather code.\n abs_rel = paddle.to_tensor(abs_rel)\n sq_rel = paddle.to_tensor(sq_rel)\n rmse = paddle.to_tensor(rmse)\n rmse_log = paddle.to_tensor(rmse_log)\n a1 = paddle.to_tensor(a1)\n a2 = paddle.to_tensor(a2)\n a3 = paddle.to_tensor(a3)\n abs_rel = paddle.distributed.all_reduce(\n abs_rel, op=paddle.distributed.ReduceOp.SUM) / world_size\n sq_rel = paddle.distributed.all_reduce(\n sq_rel, op=paddle.distributed.ReduceOp.SUM) / world_size\n rmse = paddle.distributed.all_reduce(\n rmse, op=paddle.distributed.ReduceOp.SUM) / world_size\n rmse_log = paddle.distributed.all_reduce(\n rmse_log, op=paddle.distributed.ReduceOp.SUM) / world_size\n a1 = paddle.distributed.all_reduce(" + }, + { + "comment": "The code defines a function that computes error metrics between predicted and ground truth depths. 
It uses all-reduce operations to calculate average values and returns multiple metrics including absolute relative error, squared relative error, RMSE, log RMSE, and three averages (a1, a2, a3) indicating the percentage of thresholds below certain levels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/adds_head.py\":117-143", + "content": " a1, op=paddle.distributed.ReduceOp.SUM) / world_size\n a2 = paddle.distributed.all_reduce(\n a2, op=paddle.distributed.ReduceOp.SUM) / world_size\n a3 = paddle.distributed.all_reduce(\n a3, op=paddle.distributed.ReduceOp.SUM) / world_size\n return abs_rel.item(), sq_rel.item(), rmse.item(), rmse_log.item(\n ), a1.item(), a2.item(), a3.item()\n return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3\n def compute_errors(self, gt, pred):\n \"\"\"Computation of error metrics between predicted and ground truth depths\n \"\"\"\n thresh = np.maximum((gt / pred), (pred / gt))\n a1 = (thresh < 1.25).mean()\n a2 = (thresh < 1.25**2).mean()\n a3 = (thresh < 1.25**3).mean()\n rmse = (gt - pred)**2\n rmse = np.sqrt(rmse.mean())\n rmse_log = (np.log(gt) - np.log(pred))**2\n rmse_log = np.sqrt(rmse_log.mean())\n abs_rel = np.mean(np.abs(gt - pred) / gt)\n sq_rel = np.mean(((gt - pred)**2) / gt)" + }, + { + "comment": "This code returns six metrics: abs_rel, sq_rel, rmse, rmse_log, a1, and a2. These metrics are likely related to evaluating the performance of some model or algorithm in a regression task. The 'abs_rel' might stand for absolute relative error, 'sq_rel' could be squared relative error, 'rmse' represents root mean squared error, 'rmse_log' could be the logarithm of rmse, and 'a1', 'a2', and 'a3' are possibly other evaluation metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/adds_head.py\":145-145", + "content": " return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6fa4e1bb-e7ce-485a-b81a-1905868b1ed0.json b/docs/doc/6fa4e1bb-e7ce-485a-b81a-1905868b1ed0.json new file mode 100644 index 000000000..85a0b6d58 --- /dev/null +++ b/docs/doc/6fa4e1bb-e7ce-485a-b81a-1905868b1ed0.json @@ -0,0 +1,20 @@ +{ + "summary": "The code creates a 2D image classifier model using PaddleVideo's RecognizerMRI, with train_step and val_step calculating loss metrics, and test_step for testing without calling head.loss during inference.", + "details": [ + { + "comment": "Code is from PaddleVideo's RecognizerMRI class, a 2D recognizer model framework. It has a forward_net method that takes imgs as input and returns the output of the network. The number of segments is obtained from the image shape and used to call the self.head method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerMRI.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerMRI(BaseRecognizer):\n \"\"\"2D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method.\n num_segs = imgs.shape[\n 1] # imgs.shape=[N,T,C,H,W], for most commonly case" + }, + { + "comment": "This code defines a model for image classification. It first reshapes and casts the input images to float32 type, then passes them through a backbone network if one is defined. After that, it sends the resulting feature map through a head network (if defined) to produce class scores. The train_step function uses these class scores to calculate loss metrics during training, while the val_step function performs similar operations but does not compute losses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerMRI.py\":27-58", + "content": " imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))\n imgs = paddle.cast(imgs, \"float32\") #############\n imgs = imgs.unsqueeze(1)\n if self.backbone != None:\n feature = self.backbone(imgs)\n else:\n feature = imgs\n if self.head != None:\n cls_score = self.head(feature, num_segs)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score, labels, if_top5=False)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score," + }, + { + "comment": "The code defines a test_step and infer_step function for a model, which takes in data_batch as input and returns the classification scores from the forward_net function. 
The test_step specifically mentions that during testing, the net won't call head.loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizerMRI.py\":59-75", + "content": " labels,\n valid_mode=True,\n if_top5=False)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6fd87d67-4e55-45c8-978e-44ad5f14fddb.json b/docs/doc/6fd87d67-4e55-45c8-978e-44ad5f14fddb.json new file mode 100644 index 000000000..b45fffc9c --- /dev/null +++ b/docs/doc/6fd87d67-4e55-45c8-978e-44ad5f14fddb.json @@ -0,0 +1,100 @@ +{ + "summary": "The code trains and applies Ma-Net model for video object detection, preprocesses images, visualizes results, and uses neural network segmentation and classification. The \"train_stage1.py\" file in PaddleVideo/applications/Ma-Net project creates a manager object and trains it with numbers as arguments or configuration.", + "details": [ + { + "comment": "The code imports necessary libraries, defines classes for data loaders and networks, sets up device and environment configurations, and initializes the Manager class with optional parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":0-33", + "content": "import cv2\nimport paddle\nimport paddle.nn as nn\nimport os\nimport numpy as np\nfrom paddle.io import DataLoader\nimport paddle.optimizer as optim\nfrom paddle.vision import transforms\nfrom dataloaders.davis_2017_f import DAVIS2017_VOS_Train, DAVIS2017_VOS_Test\nimport dataloaders.custom_transforms_f as tr\nfrom dataloaders.samplers import RandomIdentitySampler\nfrom networks.deeplab import DeepLab\nfrom networks.IntVOS import IntVOS\nfrom networks.loss import Added_BCEWithLogitsLoss, Added_CrossEntropyLoss\nfrom config import cfg\nfrom utils.api import float_, clip_grad_norm_, int_, long_\nfrom utils.meters import AverageMeter\nfrom utils.mask_damaging import damage_masks\nfrom utils.utils import label2colormap\nfrom PIL import Image\nimport scipy.misc as sm\nimport time\n# import logging\npaddle.disable_static()\npaddle.device.set_device('gpu:0')\nclass Manager(object):\n def __init__(self,\n use_gpu=True,\n time_budget=None,\n save_result_dir=cfg.SAVE_RESULT_DIR,\n pretrained=True," + }, + { + "comment": "The code initializes a model for stage 1 training in Ma-Net application. It loads pretrained model if specified and sets the model to train mode. 
The `train` method starts the actual training by setting the model to train mode, initializing loss meters, and looping over batches to compute loss and time metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":34-58", + "content": " interactive_test=False,\n freeze_bn=False):\n self.save_res_dir = save_result_dir\n self.time_budget = time_budget\n self.feature_extracter = DeepLab(backbone='resnet', freeze_bn=freeze_bn)\n if pretrained:\n pretrained_dict = paddle.load(cfg.PRETRAINED_MODEL)\n # pretrained_dict = np.load(cfg.PRETRAINED_MODEL, allow_pickle=True).item()\n pretrained_dict = pretrained_dict['state_dict']\n self.load_network(self.feature_extracter, pretrained_dict)\n print('load pretrained model successfully.')\n self.model = IntVOS(cfg, self.feature_extracter)\n self.use_gpu = use_gpu\n if use_gpu:\n self.model = self.model\n def train(self,\n damage_initial_previous_frame_mask=True,\n lossfunc='cross_entropy',\n model_resume=False):\n ###################\n self.model.train()\n running_loss = AverageMeter()\n running_time = AverageMeter()" + }, + { + "comment": "This code defines a training stage for the Ma-Net model. It initializes parameters for three parts of the model: feature extractor, semantic embedding, and dynamic segment head. An optimizer using momentum is set up with specified learning rate, momentum, weight decay, and gradient clipping. A series of data transformations are applied to input images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":60-86", + "content": " param_list = [{\n 'params': self.model.feature_extracter.parameters()\n }, {\n 'params': self.model.semantic_embedding.parameters()\n }, {\n 'params': self.model.dynamic_seghead.parameters()\n }]\n ########\n clip = paddle.nn.ClipGradByGlobalNorm(\n clip_norm=cfg.TRAIN_CLIP_GRAD_NORM)\n # clip = None\n optimizer = optim.Momentum(parameters=param_list,\n learning_rate=cfg.TRAIN_LR,\n momentum=cfg.TRAIN_MOMENTUM,\n weight_decay=cfg.TRAIN_WEIGHT_DECAY,\n use_nesterov=True,\n grad_clip=clip)\n self.param_list = param_list\n ###################\n composed_transforms = transforms.Compose([\n tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP),\n tr.RandomScale(),\n tr.RandomCrop((cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP), 5)," + }, + { + "comment": "The code initializes the training dataset and creates a data loader for it. It also defines the loss function based on user input, either binary cross-entropy or cross-entropy, and sets the maximum number of iterations to run. The dataset processing includes resizing and converting images to tensors using specified transforms.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":87-113", + "content": " tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()\n ])\n print('dataset processing...')\n train_dataset = DAVIS2017_VOS_Train(root=cfg.DATA_ROOT,\n transform=composed_transforms)\n trainloader = DataLoader(\n train_dataset,\n collate_fn=None,\n batch_size=cfg.TRAIN_BATCH_SIZE,\n shuffle=True,\n num_workers=8,\n )\n print('dataset processing finished.')\n if lossfunc == 'bce':\n criterion = Added_BCEWithLogitsLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n elif lossfunc == 'cross_entropy':\n criterion = Added_CrossEntropyLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n else:\n print(\n 'unsupported loss funciton. 
Please choose from [cross_entropy,bce]'\n )\n max_itr = cfg.TRAIN_TOTAL_STEPS" + }, + { + "comment": "This code initializes the model, loads previously saved data if specified and resumes training from a certain step. It keeps track of training time per epoch and adjusts learning rate as needed. The loop iterates through the dataset, performing operations on each sample until maximum number of iterations is reached.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":115-143", + "content": " step = 0\n if model_resume:\n saved_model_ = os.path.join(self.save_res_dir, cfg.TRAIN_RESUME_DIR)\n saved_model_ = paddle.load(saved_model_)\n self.model = self.load_network(self.model, saved_model_)\n step = int(cfg.RESUME_DIR.split('.')[0].split('_')[-1])\n print('resume from step {}'.format(step))\n while step < cfg.TRAIN_TOTAL_STEPS:\n if step > 100001:\n break\n t1 = time.time()\n if step > 0:\n running_time.update(time.time() - t1)\n print(\n f'{time.asctime()}: new epoch starts. last epoch time: {running_time.avg:.3f} s.',\n )\n for ii, sample in enumerate(trainloader):\n now_lr = self._adjust_lr(optimizer, step, max_itr)\n if step >= max_itr:\n step += 1\n break\n ref_imgs = sample['ref_img'] # batch_size * 3 * h * w\n img1s = sample['img1']" + }, + { + "comment": "This code segment prepares the input data for a model by concatenating images and assigning labels. It also handles any potential errors with damage masks and adjusts the label1s accordingly. The GPU usage is conditionally set based on whether `self.use_gpu` is true or false.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":144-171", + "content": " img2s = sample['img2']\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n label1s = sample['label1']\n label2s = sample['label2']\n seq_names = sample['meta']['seq_name']\n obj_nums = sample['meta']['obj_num']\n bs, _, h, w = img2s.shape\n inputs = paddle.concat((ref_imgs, img1s, img2s), 0)\n if damage_initial_previous_frame_mask:\n try:\n label1s = damage_masks(label1s)\n except:\n label1s = label1s\n print('damage_error')\n ##########\n if self.use_gpu:\n inputs = inputs\n ref_scribble_labels = ref_scribble_labels\n label1s = label1s\n label2s = label2s\n ##########\n tmp_dic = self.model(inputs,\n ref_scribble_labels," + }, + { + "comment": "This code initializes label and object dictionaries using the provided label1s. It then iterates over sequence names, creating key-value pairs for the label_and_obj_dic dictionary. For each sequence, it interpolates the temporary prediction logits with bilinear mode, aligning corners. 
Lastly, it retrieves the label and object number from the label_and_obj_dic and generates a tensor of object ids.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":172-193", + "content": " label1s,\n use_local_map=True,\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS)\n label_and_obj_dic = {}\n label_dic = {}\n for i, seq_ in enumerate(seq_names):\n label_and_obj_dic[seq_] = (label2s[i], obj_nums[i])\n for seq_ in tmp_dic.keys():\n tmp_pred_logits = tmp_dic[seq_]\n tmp_pred_logits = nn.functional.interpolate(\n tmp_pred_logits,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n tmp_dic[seq_] = tmp_pred_logits\n label_tmp, obj_num = label_and_obj_dic[seq_]\n obj_ids = np.arange(1, obj_num + 1)\n obj_ids = paddle.to_tensor(obj_ids)" + }, + { + "comment": "This code snippet is training a video object detection model in stages. It applies binary cross-entropy or cross-entropy loss depending on the specified loss function, updates the model's parameters using an optimizer, and tracks the average loss during training. The progress is logged every 50 steps along with the current learning rate.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":194-216", + "content": " obj_ids = int_(obj_ids)\n if lossfunc == 'bce':\n label_tmp = label_tmp.transpose([1, 2, 0])\n label = (float_(label_tmp) == float_(obj_ids))\n label = label.unsqueeze(-1).transpose([3, 2, 0, 1])\n label_dic[seq_] = float_(label)\n elif lossfunc == 'cross_entropy':\n label_dic[seq_] = long_(label_tmp)\n loss = criterion(tmp_dic, label_dic, step)\n loss = loss / bs\n optimizer.clear_grad()\n loss.backward()\n optimizer.step()\n running_loss.update(loss.item(), bs)\n ##############Visulization during training\n if step % 50 == 0:\n print(time.asctime(), end='\\t')\n log = 'step:{},now_lr:{} ,loss:{:.4f}({:.4f})'.format(\n step, now_lr, running_loss.val, running_loss.avg)\n print(log)" + }, + { + "comment": "This code extracts images, applies normalization, and visualizes the reference image, two input images, and a ground truth label. The images are normalized by subtracting mean values and dividing by standard deviation. The ground truth label is converted to a color map for visualization. 
The predicted output is interpolated to match the size of the other images before visualizing them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":217-239", + "content": " # logging.info(log)\n show_ref_img = ref_imgs.numpy()[0]\n show_img1 = img1s.numpy()[0]\n show_img2 = img2s.numpy()[0]\n mean = np.array([[[0.485]], [[0.456]], [[0.406]]])\n sigma = np.array([[[0.229]], [[0.224]], [[0.225]]])\n show_ref_img = show_ref_img * sigma + mean\n show_img1 = show_img1 * sigma + mean\n show_img2 = show_img2 * sigma + mean\n show_gt = label2s[0]\n show_gt = show_gt.squeeze(0).numpy()\n show_gtf = label2colormap(show_gt).transpose((2, 0, 1))\n show_preds = tmp_dic[seq_names[0]]\n show_preds = nn.functional.interpolate(show_preds,\n size=(h, w),\n mode='bilinear',\n align_corners=True)" + }, + { + "comment": "Applies sigmoid function to binary cross-entropy predictions, converts them to segmentation masks using argmax for cross entropy, calculates pixel accuracy, and saves network at specified intervals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":240-265", + "content": " show_preds = show_preds.squeeze(0)\n if lossfunc == 'bce':\n show_preds = (paddle.nn.functional.sigmoid(show_preds) >\n 0.5)\n show_preds_s = paddle.zeros((h, w))\n for i in range(show_preds.size(0)):\n show_preds_s[show_preds[i]] = i + 1\n elif lossfunc == 'cross_entropy':\n show_preds_s = paddle.argmax(show_preds, axis=0)\n show_preds_s = show_preds_s.numpy()\n show_preds_sf = label2colormap(show_preds_s).transpose(\n (2, 0, 1))\n pix_acc = np.sum(show_preds_s == show_gt) / (h * w)\n ###########TODO\n if step % 20000 == 0 and step != 0:\n self.save_network(self.model, step)\n step += 1\n def test_VOS(self, use_gpu=True):\n seqs = []\n with open(\n os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017'," + }, + { + "comment": "Loading and preparing the test datasets for sequence processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":266-285", + "content": " 'val' + '.txt')) as f:\n seqs_tmp = f.readlines()\n seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))\n seqs.extend(seqs_tmp)\n print('model loading...')\n saved_model_dict = os.path.join(self.save_res_dir, cfg.TEST_CHECKPOINT)\n pretrained_dict = paddle.load(saved_model_dict)\n self.model = self.load_network(self.model, pretrained_dict)\n print('model load finished')\n self.model.eval()\n with paddle.no_grad():\n for seq_name in seqs:\n print('prcessing seq:{}'.format(seq_name))\n test_dataset = DAVIS2017_VOS_Test(root=cfg.DATA_ROOT,\n transform=tr.ToTensor(),\n result_root=cfg.RESULT_ROOT,\n seq_name=seq_name)\n test_dataloader = DataLoader(test_dataset,\n batch_size=1," + }, + { + "comment": "This code creates a Paddle data loader for the test dataset, ensuring it doesn't overwrite existing results. It then iterates through each sample in the test loader, extracting necessary images and labels, and concatenating them into an input array. 
If GPU is used, the inputs and labels are transferred to GPU memory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":286-305", + "content": " shuffle=False,\n num_workers=0)\n if not os.path.exists(os.path.join(cfg.RESULT_ROOT, seq_name)):\n os.makedirs(os.path.join(cfg.RESULT_ROOT, seq_name))\n time_start = time.time()\n for ii, sample in enumerate(test_dataloader):\n ref_img = sample['ref_img']\n prev_img = sample['prev_img']\n current_img = sample['current_img']\n ref_label = sample['ref_label']\n prev_label = sample['prev_label']\n obj_num = sample['meta']['obj_num']\n seqnames = sample['meta']['seq_name']\n imgname = sample['meta']['current_name']\n bs, _, h, w = current_img.shape\n inputs = paddle.concat((ref_img, prev_img, current_img), 0)\n if use_gpu:\n inputs = inputs\n ref_label = ref_label" + }, + { + "comment": "Feature extraction and model prediction for a video frame. Time measurement for feature extractor and model execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":306-325", + "content": " prev_label = prev_label\n ################\n t1 = time.time()\n tmp = self.model.extract_feature(inputs)\n ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split(\n tmp, num_or_sections=3, axis=0)\n t2 = time.time()\n print('feature_extracter time:{}'.format(t2 - t1))\n tmp_dic = self.model.prop_seghead(\n ref_frame_embedding, previous_frame_embedding,\n current_frame_embedding, ref_label, prev_label, True,\n seqnames, obj_num, cfg.KNNS, self.model.dynamic_seghead)\n t3 = time.time()\n print('after time:{}'.format(t3 - t2))\n #######################\n pred_label = tmp_dic[seq_name]\n pred_label = nn.functional.interpolate(pred_label,\n size=(h, w)," + }, + { + "comment": "This code segment is part of a function that saves the predicted labels for each frame as an image with a palette. It extracts the predictions from a pre-trained network, converts them to an image, and saves it in a specified directory. The function also prints the time taken for processing each frame.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":326-347", + "content": " mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_label = pred_label.squeeze(0)\n pred_label = pred_label.numpy()\n im = Image.fromarray(pred_label.astype('uint8')).convert(\n 'P', )\n im.putpalette(_palette)\n im.save(\n os.path.join(cfg.RESULT_ROOT, seq_name,\n imgname[0].split('.')[0] + '.png'))\n one_frametime = time.time()\n print('seq name:{} frame:{} time:{}'.format(\n seq_name, imgname[0], one_frametime - time_start))\n time_start = time.time()\n def load_network(self, net, pretrained_dict):\n # pretrained_dict = pretrained_dict\n model_dict = net.state_dict()\n # 1. filter out unnecessary keys" + }, + { + "comment": "This code snippet is part of a training process for a neural network called Ma-Net. It loads pretrained weights into the model and then saves the network at different steps. The learning rate is adjusted during the training process to improve performance. The _palette variable appears unrelated, as it stores RGB values for colors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":348-377", + "content": " pretrained_dict = {\n k: v\n for k, v in pretrained_dict.items() if k in model_dict\n }\n # 2. 
overwrite entries in the existing state dict\n # for k in model_dict:\n # if k not in pretrained_dict:\n # print(k, 'not in loaded weights.')\n model_dict.update(pretrained_dict)\n net.set_state_dict(model_dict)\n return net\n def save_network(self, net, step):\n save_path = self.save_res_dir\n if not os.path.exists(save_path):\n os.makedirs(save_path)\n save_file = 'save_step_%s.pth' % (step)\n paddle.save(net.state_dict(), os.path.join(save_path, save_file))\n def _adjust_lr(self, optimizer, itr, max_itr):\n now_lr = cfg.TRAIN_LR * (1 - itr / (max_itr + 1))**cfg.TRAIN_POWER\n optimizer._param_groups[0]['lr'] = now_lr\n return now_lr\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0," + }, + { + "comment": "This code is a list of RGB values for different objects in an image, possibly for object detection or classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":378-390", + "content": " 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43,\n 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75,\n 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81,\n 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94," + }, + { + "comment": "This code appears to contain a sequence of numbers, possibly representing some kind of iteration or indexing in the larger context of the script. 
The exact meaning would depend on the specifics of how and where this code is used within the \"train_stage1.py\" file of the PaddleVideo/applications/Ma-Net project.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":391-403", + "content": " 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120,\n 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145,\n 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150,\n 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160," + }, + { + "comment": "The code consists of a sequence of numbers, possibly representing frame indices or image IDs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":404-416", + "content": " 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185,\n 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210,\n 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215,\n 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225," + }, + { + "comment": "The code creates a manager object and calls its train function. 
The list of numbers appears to be arguments or configuration for the training process, but without context it's difficult to determine their exact purpose.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage1.py\":417-428", + "content": " 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250,\n 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\nmanager = Manager()\nmanager.train()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6ff25d46-6c9b-49dc-88ce-c9f0fdfdd53e.json b/docs/doc/6ff25d46-6c9b-49dc-88ce-c9f0fdfdd53e.json new file mode 100644 index 000000000..c6163e057 --- /dev/null +++ b/docs/doc/6ff25d46-6c9b-49dc-88ce-c9f0fdfdd53e.json @@ -0,0 +1,35 @@ +{ + "summary": "This code uses PaddlePaddle and AttentionLstmErnie for multimodal video tagging, including model building, adjusting batch size, calculating metrics, testing, and saving parameters. The main function handles argument parsing and evaluation.", + "details": [ + { + "comment": "This code is an evaluation function for a multimodal video tagging application. It imports necessary libraries, disables dynamic memory allocation, sets up the PaddlePaddle environment, and includes functions for reading data, defining the model architecture, and calculating metrics. The code also defines a \"parse_args\" function to handle command line arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py\":0-36", + "content": "\"\"\"\neval main\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport argparse\nimport logging\nimport pickle\nimport numpy as np\nimport paddle\npaddle.enable_static()\nimport paddle.static as static\nfrom accuracy_metrics import MetricsCalculator\nfrom datareader import get_reader\nfrom config import parse_config, merge_configs, print_configs\nfrom models.attention_lstm_ernie import AttentionLstmErnie\nfrom utils import test_with_pyreader\ndef parse_args():" + }, + { + "comment": "This code defines an argument parser for the Paddle Video evaluate script. It allows users to input a model name, config file path, pretrain weights path, output path, use_gpu flag, and save_model_param_dir. 
The default values are provided for each argument in case they aren't specified by the user.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py\":37-63", + "content": " \"\"\"parse_args\n \"\"\"\n parser = argparse.ArgumentParser(\"Paddle Video evaluate script\")\n parser.add_argument('--model_name',\n type=str,\n default='BaiduNet',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/conf.txt',\n help='path to config file of model')\n parser.add_argument(\n '--pretrain',\n type=str,\n default=None,\n help=\n 'path to pretrain weights. None to use default weights path in ~/.paddle/weights.'\n )\n parser.add_argument('--output', type=str, default=None, help='output path')\n parser.add_argument('--use_gpu',\n type=bool,\n default=True,\n help='default use gpu.')\n parser.add_argument('--save_model_param_dir',\n type=str,\n default=None,\n help='checkpoint path')" + }, + { + "comment": "This code defines command-line arguments for saving and evaluating an inference model, parses the configuration file, builds a model using AttentionLstmErnie, and sets up static programs for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py\":64-93", + "content": " parser.add_argument('--save_inference_model',\n type=str,\n default=None,\n help='save inference path')\n parser.add_argument('--save_only',\n action='store_true',\n default=False,\n help='only save model, do not evaluate model')\n args = parser.parse_args()\n return args\ndef evaluate(args):\n \"\"\"evaluate\n \"\"\"\n # parse config\n config = parse_config(args.config)\n valid_config = merge_configs(config, 'valid', vars(args))\n print_configs(valid_config, 'Valid')\n # build model\n valid_model = AttentionLstmErnie(args.model_name,\n valid_config,\n mode='valid')\n startup = static.Program()\n valid_prog = static.default_main_program().clone(for_test=True)\n with static.program_guard(valid_prog, startup):\n paddle.disable_static()\n valid_model.build_input(True)\n valid_model.build_model()" + }, + { + "comment": "This code is loading the model from a specified directory, compiling the program and running it. It checks if the save weight directory exists and loads the test weights into the model. If necessary, it saves the inference model and if only saving the model is required, it exits. 
The batch size is adjusted by dividing it by a denominator.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py\":94-122", + "content": " valid_feeds = valid_model.feeds()\n valid_outputs = valid_model.outputs()\n valid_loss = valid_model.loss()\n valid_pyreader = valid_model.pyreader()\n paddle.enable_static()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(startup)\n compiled_valid_prog = static.CompiledProgram(valid_prog)\n # load weights\n assert os.path.exists(args.save_model_param_dir), \\\n \"Given save weight dir {} not exist.\".format(args.save_model_param_dir)\n valid_model.load_test_weights_file(exe, args.save_model_param_dir,\n valid_prog, place)\n if args.save_inference_model:\n save_model_params(exe, valid_prog, valid_model,\n args.save_inference_model)\n if args.save_only is True:\n print('save model only, exit')\n return\n # get reader\n bs_denominator = 1\n valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /\n bs_denominator)" + }, + { + "comment": "This code retrieves a valid reader, calculates metrics, and decorates the sample list generator with specified execution places. It then tests the model using the reader, program, and fetch list to obtain test loss and accuracy, which are printed. The function `save_model_params` saves the model parameters in the provided directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py\":123-144", + "content": " valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)\n # get metrics\n valid_metrics = MetricsCalculator(args.model_name.upper(), 'valid',\n valid_config)\n valid_fetch_list = [valid_loss.name] + [x.name for x in valid_outputs\n ] + [valid_feeds[-1].name]\n # get reader\n exe_places = static.cuda_places() if args.use_gpu else static.cpu_places()\n valid_pyreader.decorate_sample_list_generator(valid_reader,\n places=exe_places)\n test_loss, metrics_dict_test = test_with_pyreader(exe, compiled_valid_prog,\n valid_pyreader,\n valid_fetch_list,\n valid_metrics)\n test_acc1 = metrics_dict_test['avg_acc1']\n print(test_loss)\n print(test_acc1)\ndef save_model_params(exe, program, model_object, save_dir):" + }, + { + "comment": "This code defines a function \"save_model_params\" that takes the directory path, saves the inference model, and specifies the feeded variable names, main program, target variables, executor, and filenames for the model and parameters. 
It also includes a main function that parses arguments and evaluates them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py\":145-158", + "content": " \"\"\"save_model_params\n \"\"\"\n feeded_var_names = [var.name for var in model_object.feeds()][:-1]\n static.save_inference_model(dirname=save_dir,\n feeded_var_names=feeded_var_names,\n main_program=program,\n target_vars=model_object.outputs(),\n executor=exe,\n model_filename='model',\n params_filename='params')\nif __name__ == \"__main__\":\n args = parse_args()\n evaluate(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/70cc0e1d-d3a5-4774-8f93-8fd07d6bfeea.json b/docs/doc/70cc0e1d-d3a5-4774-8f93-8fd07d6bfeea.json new file mode 100644 index 000000000..ee5b820e0 --- /dev/null +++ b/docs/doc/70cc0e1d-d3a5-4774-8f93-8fd07d6bfeea.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines ActBertLoss and actBertLoss classes as loss functions for ActBert model, using CrossEntropyLoss and nn.KLDivLoss. The total loss is calculated by summing masked text, masked image, masked action, and next sentence losses, based on predictions and labels from various sources.", + "details": [ + { + "comment": "This code defines the ActBertLoss class, which is a loss function for the ActBert model. It uses the CrossEntropyLoss from PaddlePaddle's nn library and takes two arguments: vocab_size and a_target_size. The class inherits from BaseWeightedLoss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/actbert_loss.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass ActBertLoss(BaseWeightedLoss):\n \"\"\"Loss for ActBert model\n \"\"\"\n def __init__(self, vocab_size=30522, a_target_size=700):\n super().__init__()\n self.vocab_size = vocab_size\n self.a_target_size = a_target_size\n self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)" + }, + { + "comment": "This code defines a class for an actBert loss function. It uses the nn.KLDivLoss function as a criterion and takes in prediction scores, sequence labels, image labels, image targets, action labels, and next sentence labels as input to compute the visual loss (img_loss). 
The prediction_scores_v variable is modified by removing the first element from each sequence, likely for consistency purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/actbert_loss.py\":32-49", + "content": " self.vis_criterion = nn.KLDivLoss(reduction=\"none\")\n def forward(self, prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \\\n text_labels, image_label, image_target, action_label, next_sentence_label):\n \"\"\"\n Args:\n text_label: text label(with mask). Shape: [batch_size, seqence_length]\n image_label: image label(with mask). Shape: [batch_size, region_length]\n image_target: label of image feature distribution,\n Shape: [batch_size, region_length-1, num_image_class](minus 1 for xxx).\n action label: action label(with mask), Shape: [batch_size, action_length]\n next_sentence_label: is next sentence or not. Shape: [batch_size]\n \"\"\"\n prediction_scores_v = prediction_scores_v[:,\n 1:] #8,37,1601 --> 8,36,1601\n img_loss = self.vis_criterion(\n F.log_softmax(prediction_scores_v, axis=2)," + }, + { + "comment": "This code calculates a total loss by summing the masked text loss, masked image loss, masked action loss, and next sentence loss. The losses are calculated based on predictions and labels from various sources. The `loss_fct` function is used to compute these losses, and they are reshaped before being added together for the final total loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/actbert_loss.py\":50-74", + "content": " image_target #8,36,1601\n )\n masked_img_loss = paddle.sum(\n img_loss * (image_label == 1).unsqueeze(2).astype('float32')) / max(\n paddle.sum((image_label == 1).astype('float32')), 1e-6)\n masked_text_loss = self.loss_fct(\n prediction_scores_t.reshape([-1, self.vocab_size]), #8,36,30522\n text_labels.reshape([-1]), #8,36 # label -1 will be ignored\n )\n masked_action_loss = self.loss_fct(\n prediction_scores_a.reshape([-1, self.a_target_size]), #8,5,700\n action_label.reshape([-1]), #8,5\n )\n next_sentence_loss = self.loss_fct(\n seq_relationship_score.reshape([-1, 2]),\n next_sentence_label.reshape([-1]) #8,2\n )\n total_loss = masked_text_loss.unsqueeze(0) + masked_img_loss.unsqueeze(\n 0) + masked_action_loss.unsqueeze(0) + next_sentence_loss.unsqueeze(\n 0)\n return total_loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/71c0a7a0-1364-471c-ad22-19596c791961.json b/docs/doc/71c0a7a0-1364-471c-ad22-19596c791961.json new file mode 100644 index 000000000..d7e02ec30 --- /dev/null +++ b/docs/doc/71c0a7a0-1364-471c-ad22-19596c791961.json @@ -0,0 +1,20 @@ +{ + "summary": "The FeatureReader class in Python reads data for YouTube-8M dataset, supports LSTM, Attention Cluster, and NextVLAD models, initializes feature reader with parameters, shuffles proposals, generates batches, and yields when batch size is reached.", + "details": [ + { + "comment": "This code is a Python class called FeatureReader, which inherits from DataReader. It serves as a data reader for the YouTube-8M dataset, using features extracted by prior networks. It supports three models: LSTM, Attention Cluster, and NextVLAD. 
The class imports necessary libraries and modules to read, parse, and manipulate the dataset efficiently.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/feature_reader.py\":0-32", + "content": "\"\"\"\nattention-lstm feature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad" + }, + { + "comment": "The code initializes a feature reader, takes in parameters such as name, mode, configuration, and material (featuring image, audio, and pcm features). It shuffles the proposals if in training mode. The reader function generates batches of data by iterating through the proposal list, extracting relevant features from specific ID ranges, and storing them in a batch_out list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/feature_reader.py\":34-70", + "content": " dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.feature = material['feature']\n self.proposal = material['proposal']\n self.fps = 5\n def create_reader(self):\n \"\"\"\n create_reader\n \"\"\"\n image_feature_list = self.feature['image_feature']\n audio_feature_list = self.feature['audio_feature']\n pcm_feature_list = self.feature['pcm_feature']\n pcm_feature_list = pcm_feature_list.reshape((pcm_feature_list.shape[0] * 5, 640))\n fl = self.proposal\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n for prop_info in fl:\n start_id = int(prop_info['start'])\n end_id = int(prop_info['end'])" + }, + { + "comment": "This code snippet is part of a feature reader for an action detection system. 
It reads image, audio, and pcm features from feature lists, concatenates them if needed, creates a batch, and yields the batch when it reaches the specified batch size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/feature_reader.py\":71-85", + "content": " bmn_score = float(prop_info['score'])\n try:\n image_feature = image_feature_list[start_id: end_id]\n audio_feature = audio_feature_list[int(start_id / self.fps): int(end_id / self.fps)]\n pcm_feature = pcm_feature_list[start_id: end_id]\n image_feature = np.concatenate((image_feature, pcm_feature), axis=1)\n batch_out.append((image_feature, audio_feature, 0, prop_info))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n continue\n return reader" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7341eb50-8847-4b5f-954c-8958e26d6f93.json b/docs/doc/7341eb50-8847-4b5f-954c-8958e26d6f93.json new file mode 100644 index 000000000..29ca72e47 --- /dev/null +++ b/docs/doc/7341eb50-8847-4b5f-954c-8958e26d6f93.json @@ -0,0 +1,10 @@ +{ + "summary": "This code lists various application cases in PaddleVideo, including football action detection, basketball action detection, table tennis action recognition, figure skating action identification, video tagging, multimodal video classification, video quality assessment, 3DMRI medical image recognition, video interactive segmentation tool, UAV detection, abnormal behavior detection, and human analysis scenario action recognition.", + "details": [ + { + "comment": "This code lists various application cases in PaddleVideo, including football action detection, basketball action detection, table tennis action recognition, figure skating action identification, video tagging, multimodal video classification, video quality assessment, 3DMRI medical image recognition, video interactive segmentation tool, UAV detection, abnormal behavior detection, and human analysis scenario action recognition.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/README.md\":0-17", + "content": "# \u5e94\u7528\u6848\u4f8b\n## 1. 
\u6982\u89c8\n| Applications | Descriptions |\n| :--------------- | :-------- |\n| [FootballAction](./FootballAction) | \u8db3\u7403\u52a8\u4f5c\u68c0\u6d4b\u65b9\u6848|\n| [BasketballAction](./BasketballAction) | \u7bee\u7403\u52a8\u4f5c\u68c0\u6d4b\u65b9\u6848 |\n| [TableTennis](./TableTennis) | \u4e52\u4e53\u7403\u52a8\u4f5c\u8bc6\u522b\u65b9\u6848|\n| [FigureSkating](./FigureSkating) | \u82b1\u6837\u6ed1\u51b0\u52a8\u4f5c\u8bc6\u522b\u65b9\u6848|\n| [VideoTag](./VideoTag) | 3000\u7c7b\u5927\u89c4\u6a21\u89c6\u9891\u5206\u7c7b\u65b9\u6848 |\n| [MultimodalVideoTag](./MultimodalVideoTag) | \u591a\u6a21\u6001\u89c6\u9891\u5206\u7c7b\u65b9\u6848|\n| [VideoQualityAssessment](.s/VideoQualityAssessment) | \u89c6\u9891\u8d28\u91cf\u8bc4\u4f30\u65b9\u6848|\n| [PP-Care](./PP-Care) | 3DMRI\u533b\u7597\u56fe\u50cf\u8bc6\u522b\u65b9\u6848 |\n| [EIVideo](./EIVideo) | \u89c6\u9891\u4ea4\u4e92\u5f0f\u5206\u5272\u5de5\u5177|\n| [Anti-UAV](./Anti-UAV) |\u65e0\u4eba\u673a\u68c0\u6d4b\u65b9\u6848|\n| [AbnormalActionDetection](./AbnormalActionDetection) |\u5f02\u5e38\u884c\u4e3a\u68c0\u6d4b\u65b9\u6848|\n| [PP-Human](./PPHuman) | \u884c\u4eba\u5206\u6790\u573a\u666f\u52a8\u4f5c\u8bc6\u522b\u65b9\u6848 |" + } + ] +} \ No newline at end of file diff --git a/docs/doc/73925f43-4347-466a-8d2e-03273cc295ba.json b/docs/doc/73925f43-4347-466a-8d2e-03273cc295ba.json new file mode 100644 index 000000000..a6f436c59 --- /dev/null +++ b/docs/doc/73925f43-4347-466a-8d2e-03273cc295ba.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a `FrozenBatchNorm2d` class for batch normalization without updating statistics and a `DeepLab` class with backbone, ASPP module, decoder, and methods to freeze batch norm layers. It also provides a function that iterates through certain modules, yielding parameters requiring gradient updates for potentially applying different learning rates.", + "details": [ + { + "comment": "The code defines a `FrozenBatchNorm2d` class that extends the `nn.Layer` and overrides the `forward()` function to perform batch normalization without updating statistics. The `DeepLab` class inherits from `nn.Layer` and serves as a backbone for the deeplab network architecture, incorporating a backbone network, ASPP module, and decoder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/deeplab.py\":0-30", + "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom networks.aspp import build_aspp\nfrom networks.decoder import build_decoder\nfrom networks.backbone import build_backbone\nclass FrozenBatchNorm2d(nn.Layer):\n def __init__(self, n):\n super(FrozenBatchNorm2d, self).__init__()\n self.register_buffer(\"weight\", paddle.ones(n))\n self.register_buffer(\"bias\", paddle.zeros(n))\n self.register_buffer(\"running_mean\", paddle.zeros(n))\n self.register_buffer(\"running_var\", paddle.ones(n))\n def forward(self, x):\n if x.dtype == paddle.float16:\n self.weight = self.weight.half()\n self.bias = self.bias.half()\n self.running_mean = self.running_mean.half()\n self.running_var = self.running_var.half()\n scale = self.weight * self.running_var.rsqrt()\n bias = self.bias - self.running_mean * scale\n scale = scale.reshape(1, -1, 1, 1)\n bias = bias.reshape(1, -1, 1, 1)\n return x * scale + bias\nclass DeepLab(nn.Layer):" + }, + { + "comment": "This code defines the DeepLab class with an initializer that takes arguments for backbone, output stride, number of classes, and whether to freeze batch normalization layers. 
It also includes methods to freeze batch norm layers, retrieve parameters for 1x learning rate, and a forward pass function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/deeplab.py\":31-63", + "content": " def __init__(self,\n backbone='resnet',\n output_stride=16,\n num_classes=21,\n sync_bn=True,\n freeze_bn=False):\n super(DeepLab, self).__init__()\n if backbone == 'drn':\n output_stride = 8\n if freeze_bn == True:\n print(\"Use frozen BN in DeepLab\")\n BatchNorm = FrozenBatchNorm2d\n else:\n BatchNorm = nn.BatchNorm2D\n self.backbone = build_backbone(backbone, output_stride, BatchNorm)\n self.aspp = build_aspp(backbone, output_stride, BatchNorm)\n self.decoder = build_decoder(num_classes, backbone, BatchNorm)\n def forward(self, input):\n x, low_level_feat = self.backbone(input)\n x = self.aspp(x)\n x = self.decoder(x, low_level_feat)\n return x\n def freeze_bn(self):\n for m in self.sublayers():\n if isinstance(m, nn.BatchNorm2D):\n m.eval()\n def get_1x_lr_params(self):\n modules = [self.backbone]" + }, + { + "comment": "This code defines a function that iterates through certain modules of the network, specifically looking for convolution and batch normalization layers. It then yields the parameters of these layers that require gradient updates. This process is used in both the main body and the get_10x_lr_params method to potentially apply different learning rates to specific parts of the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/deeplab.py\":64-80", + "content": " for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p\n def get_10x_lr_params(self):\n modules = [self.aspp, self.decoder]\n for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p" + } + ] +} \ No newline at end of file diff --git a/docs/doc/73bbcc1a-2f27-45c9-8e1a-c6aca413d5a7.json b/docs/doc/73bbcc1a-2f27-45c9-8e1a-c6aca413d5a7.json new file mode 100644 index 000000000..c9f7f6998 --- /dev/null +++ b/docs/doc/73bbcc1a-2f27-45c9-8e1a-c6aca413d5a7.json @@ -0,0 +1,25 @@ +{ + "summary": "The code imports libraries, defines the RecognizerTransformer_MRI model class with forward method and training/validation steps, using loss metrics. It includes two inference methods: 'test_step' and 'infer_step', which split input into multiple views for classification score generation. The average_view function combines these scores across views, using either 'score' or 'prob' averaging types.", + "details": [ + { + "comment": "This code imports necessary libraries, defines a class for the RecognizerTransformer_MRI model, and sets the input image shape. The forward_net method preprocesses input images by casting them to float32 type and adding an extra dimension for compatibility with the transformer architecture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddlevideo.utils import get_logger\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerTransformer_MRI(BaseRecognizer):\n \"\"\"Transformer's recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # imgs.shape=[N,C,T,H,W], for transformer case\n imgs = paddle.cast(imgs, \"float32\") #############\n imgs = imgs.unsqueeze(1)\n if self.backbone != None:" + }, + { + "comment": "This code defines a recognizer transformer model for image classification. The `forward_net` method processes images and returns class scores, while the `train_step` and `val_step` methods perform training and validation steps by passing data batches to the model and computing loss metrics using sigmoid activation and the head's loss function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py\":32-62", + "content": " feature = self.backbone(imgs)\n else:\n feature = imgs\n if self.head != None:\n cls_score = self.head(feature)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score, labels, if_top5=False)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score,\n labels,\n valid_mode=True,\n if_top5=False)\n return loss_metrics" + }, + { + "comment": "The code defines two methods, 'test_step' and 'infer_step', for the model to infer from input to output. It splits the input into multiple views based on the number of segments in each view. For each view, it applies the forward network to generate a set of classification scores. 
Finally, it averages the scores across all views using the average_view method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py\":64-88", + "content": " def test_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.backbone.seg_num\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.backbone.seg_num:(i + 1) *\n self.backbone.seg_num]\n cls_score.append(self.forward_net(view))\n cls_score = self.average_view(cls_score)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.backbone.seg_num\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.backbone.seg_num:(i + 1) *\n self.backbone.seg_num]\n cls_score.append(self.forward_net(view))\n cls_score = self.average_view(cls_score)\n return cls_score\n def average_view(self, cls_score, average_type='score'):" + }, + { + "comment": "This function combines the scores of multiple views, taking two arguments: a list of cls_scores and an optional average_type. It asserts that average_type is either 'score' or 'prob'. If 'score', it adds all scores in the list and divides by the count. If 'avg', it first applies softmax to each score, then adds them and divides by the count. Otherwise, it raises a NotImplementedError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py\":89-103", + "content": " \"\"\"Combine the scores of different views\n Args:\n cls_score (list): Scores of multiple views\n average_type (str, optional): Average calculation method. Defaults to 'score'.\n \"\"\"\n assert average_type in ['score', 'prob'], \\\n f\"Currently only the average of 'score' or 'prob' is supported, but got {average_type}\"\n if average_type == 'score':\n return paddle.add_n(cls_score) / len(cls_score)\n elif average_type == 'avg':\n return paddle.add_n([F.softmax(score)\n for score in cls_score]) / len(cls_score)\n else:\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/73d53f98-0749-46af-9c68-554b9e10f342.json b/docs/doc/73d53f98-0749-46af-9c68-554b9e10f342.json new file mode 100644 index 000000000..63a0f0f4c --- /dev/null +++ b/docs/doc/73d53f98-0749-46af-9c68-554b9e10f342.json @@ -0,0 +1,15 @@ +{ + "summary": "This code imports various head classes from different modules in the PaddleVideo library for video object detection, segmentation, or action recognition tasks, and adds them to the `__all__` list for easy access.", + "details": [ + { + "comment": "This code is importing various classes from different modules in the PaddleVideo library. These classes represent different types of heads used in video modeling, such as AttentionLstmHead and BBoxHeadAVA. The code also includes licenses and copyright information for the PaddlePaddle Authors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/__init__.py\":0-24", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .adds_head import AddsHead\nfrom .asrf_head import ASRFHead\nfrom .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead\nfrom .base import BaseHead\nfrom .bbox_head import BBoxHeadAVA\nfrom .cfbi_head import CollaborativeEnsemblerMS\nfrom .i3d_head import I3DHead\nfrom .movinet_head import MoViNetHead\nfrom .ms_tcn_head import MSTCNHead\nfrom .pptimesformer_head import ppTimeSformerHead\nfrom .pptsm_head import ppTSMHead" + }, + { + "comment": "This code imports various head classes from different modules and adds them to the `__all__` list, making them accessible for import when using this module. These head classes are used in video object detection, segmentation or action recognition tasks. They include ppTSNHead, TSNHead, TSMHead, ppTSMHead, SlowFastHead, TimeSformerHead and more. Each head class has its own specific functionality for different tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/__init__.py\":25-48", + "content": "from .pptsn_head import ppTSNHead\nfrom .roi_head import AVARoIHead\nfrom .single_straight3d import SingleRoIExtractor3D\nfrom .slowfast_head import SlowFastHead\nfrom .stgcn_head import STGCNHead\nfrom .timesformer_head import TimeSformerHead\nfrom .transnetv2_head import TransNetV2Head\nfrom .tsm_head import TSMHead\nfrom .tsn_head import TSNHead\nfrom .ms_tcn_head import MSTCNHead\nfrom .asrf_head import ASRFHead\nfrom .ctrgcn_head import CTRGCNHead\nfrom .movinet_head import MoViNetHead\nfrom .agcn2s_head import AGCN2sHead\nfrom .token_shift_head import TokenShiftHead\n__all__ = [\n 'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead',\n 'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head',\n 'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead',\n 'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead',\n 'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead',\n 'AGCN2sHead'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/743e660a-5a99-4037-aa2b-61f14ad50502.json b/docs/doc/743e660a-5a99-4037-aa2b-61f14ad50502.json new file mode 100644 index 000000000..fb153aab0 --- /dev/null +++ b/docs/doc/743e660a-5a99-4037-aa2b-61f14ad50502.json @@ -0,0 +1,35 @@ +{ + "summary": "The script initializes a larger application, sets up logging and imports modules, predicts video tags using PaddleVideo's models, configures parameters, builds the model, prepares inputs/outputs, runs inference, checks file existence, retrieves infer reader, sets up data feeder, fetches model outputs, collects results with video IDs, logs/saves average processing time, and checks GPU availability and version compatibility before running.", + "details": [ + { + "comment": "This code appears to be an import and initialization script for a larger application. 
It sets up logging, imports various modules and libraries, checks the CUDA availability, and performs version checking. The code also includes licensing information and copyright notices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/predict.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport time\nimport logging\nimport argparse\nimport ast\nimport numpy as np\nimport paddle\nimport paddle.static as static\ntry:\n import cPickle as pickle\nexcept:\n import pickle\nfrom utils.config_utils import *\nimport models\nfrom reader import get_reader\nfrom metrics import get_metrics\nfrom utils.utility import check_cuda\nfrom utils.utility import check_version\nlogging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'" + }, + { + "comment": "The code imports logging, sets up a logger with debug level and configures the format. It then defines a function 'parse_args' that uses argparse to set default values for model name, config file path, whether to use GPU or not, weight path, and batch size for inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/predict.py\":37-63", + "content": "logging.basicConfig(level=logging.DEBUG, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n parser = argparse.ArgumentParser()\n parser.add_argument('--model_name',\n type=str,\n default='AttentionCluster',\n help='name of model to train.')\n parser.add_argument('--config',\n type=str,\n default='configs/attention_cluster.txt',\n help='path to config file of model')\n parser.add_argument('--use_gpu',\n type=ast.literal_eval,\n default=True,\n help='default use gpu.')\n parser.add_argument(\n '--weights',\n type=str,\n default='./data/checkpoints/AttentionLSTM_epoch9.pdparams',\n help='weight path.')\n parser.add_argument('--batch_size',\n type=int,\n default=1,\n help='sample number in a batch for inference.')" + }, + { + "comment": "This code snippet is part of a Python script for video tag prediction. It uses an argument parser to specify input files, log intervals, top k predictions and output directory. 
The default directories and paths are provided if no arguments are specified by the user.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/predict.py\":64-86", + "content": " parser.add_argument('--filelist',\n type=str,\n default=None,\n help='path to inferenece data file lists file.')\n parser.add_argument('--log_interval',\n type=int,\n default=1,\n help='mini-batch interval to log.')\n parser.add_argument('--infer_topk',\n type=int,\n default=20,\n help='topk predictions to restore.')\n parser.add_argument('--save_dir',\n type=str,\n default=os.path.join('data', 'predict_results',\n 'attention_lstm'),\n help='directory to store results')\n parser.add_argument('--video_path',\n type=str,\n default=None,\n help='directory to store results')\n parser.add_argument('--label_file',\n type=str," + }, + { + "comment": "The code defines a function that takes arguments, parses the config file, and builds an inference model using PaddleVideo's models. It then builds the inputs and outputs of the model, sets up the Executor based on GPU availability, and runs the startup program. Finally, it checks if the video or filelist path exists before proceeding with the inference process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/predict.py\":87-114", + "content": " default='label_3396.txt',\n help='chinese label file path')\n args = parser.parse_args()\n return args\ndef infer(args):\n # parse config\n config = parse_config(args.config)\n infer_config = merge_configs(config, 'infer', vars(args))\n print_configs(infer_config, \"Infer\")\n infer_model = models.get_model(args.model_name, infer_config, mode='infer')\n infer_model.build_input(use_dataloader=False)\n infer_model.build_model()\n infer_feeds = infer_model.feeds()\n infer_outputs = infer_model.outputs()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(static.default_startup_program())\n filelist = args.filelist or infer_config.INFER.filelist\n filepath = args.video_path or infer_config.INFER.get('filepath', '')\n if filepath != '':\n assert os.path.exists(filepath), \"{} not exist.\".format(filepath)\n else:\n assert os.path.exists(filelist), \"{} not exist.\".format(filelist)" + }, + { + "comment": "This code retrieves an infer reader, checks and loads weights for the model, sets up a data feeder, fetches outputs from the model, and collects results with video IDs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/predict.py\":116-140", + "content": " # get infer reader\n infer_reader = get_reader(args.model_name.upper(), 'infer', infer_config)\n if args.weights:\n assert os.path.exists(\n args.weights), \"Given weight dir {} not exist.\".format(args.weights)\n # if no weight files specified, download weights from paddle\n weights = args.weights or infer_model.get_weights()\n infer_model.load_test_weights(exe, weights, static.default_main_program())\n infer_feeder = paddle.fluid.DataFeeder(place=place, feed_list=infer_feeds)\n fetch_list = infer_model.fetches()\n infer_metrics = get_metrics(args.model_name.upper(), 'infer', infer_config)\n infer_metrics.reset()\n periods = []\n cur_time = time.time()\n for infer_iter, data in enumerate(infer_reader()):\n data_feed_in = [items[:-1] for items in data]\n video_id = [items[-1] for items in data]\n infer_outs = exe.run(fetch_list=fetch_list,\n feed=infer_feeder.feed(data_feed_in))\n infer_result_list = [item for item in 
infer_outs] + [video_id]" + }, + { + "comment": "The code calculates the average processing time for each sample, logs the information, and saves the final output. It uses a log interval to report progress, and the `infer_metrics` object accumulates data for logging and saving. The code also checks for GPU availability and version compatibility before running.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/predict.py\":142-170", + "content": " prev_time = cur_time\n cur_time = time.time()\n period = cur_time - prev_time\n periods.append(period)\n infer_metrics.accumulate(infer_result_list)\n if args.log_interval > 0 and infer_iter % args.log_interval == 0:\n logger.info('Processed {} samples'.format(\n (infer_iter + 1) * len(video_id)))\n logger.info('[INFER] infer finished. average time: {}'.format(\n np.mean(periods)))\n if not os.path.isdir(args.save_dir):\n os.makedirs(args.save_dir)\n infer_metrics.finalize_and_log_out(savedir=args.save_dir,\n label_file=args.label_file)\nif __name__ == \"__main__\":\n args = parse_args()\n # check whether the installed paddle is compiled with GPU\n check_cuda(args.use_gpu)\n check_version()\n logger.info(args)\n infer(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/74717c3a-94e8-48a6-82cc-b5cc16dc457b.json b/docs/doc/74717c3a-94e8-48a6-82cc-b5cc16dc457b.json new file mode 100644 index 000000000..be55baa12 --- /dev/null +++ b/docs/doc/74717c3a-94e8-48a6-82cc-b5cc16dc457b.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a TokenShiftHead class for classification tasks in Paddle. It inherits from BaseHead, uses Linear module, and returns classification scores after passing input tensor x through fully connected layer self.fc.", + "details": [ + { + "comment": "The code is defining a class called TokenShiftHead, which is a Transformer head for classification tasks. It has attributes such as num_classes, in_channels, and num_seg (defaulted to 8). The class inherits from BaseHead and is registered under the HEADS registry. The code imports necessary modules and functions, and uses Paddle's Linear module for the layer implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/token_shift_head.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom paddle.nn import Linear\nimport paddle\nfrom ..registry import HEADS\nfrom ..weight_init import trunc_normal_, weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass TokenShiftHead(BaseHead):\n \"\"\"TokenShift Transformer Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n num_seg(int): The number of segments. Default: 8. 
" + }, + { + "comment": "__init__ function initializes the class with specified parameters, and init_weights is used to initialize the FC layer's parameters using truncated normal distribution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/token_shift_head.py\":30-59", + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n ls_eps (float): Label smoothing epsilon. Default: 0.01.\n std (float): Std(Scale) Value in normal initilizar. Default: 0.02.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n num_seg=8,\n loss_cfg=dict(name='CrossEntropyLoss'),\n ls_eps=0.01,\n std=0.02,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, ls_eps)\n self.num_seg = num_seg\n self.std = std\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'TruncatedNormal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.0,\n std=self.std)\n # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal" + }, + { + "comment": "This code defines a TokenShiftHead, which performs classification tasks. The forward function takes input tensor x and passes it through fully connected layer self.fc, resulting in classification scores for each sample. It then reshapes the score to average predictions for every frame, finally squeezing the axis to return the final score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/token_shift_head.py\":60-78", + "content": " trunc_normal_(self.fc.weight, std=self.std)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # XXX: check dropout location!\n # x.shape = [N, embed_dim]\n score = self.fc(x)\n # [N*T, num_class]\n _, _m = score.shape\n _t = self.num_seg\n score = score.reshape([-1, _t, _m])\n score = paddle.mean(score, 1) # averaging predictions for every frame\n score = paddle.squeeze(score, axis=1)\n return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/75917cee-6bd6-49b8-9390-93ec47bdaa44.json b/docs/doc/75917cee-6bd6-49b8-9390-93ec47bdaa44.json new file mode 100644 index 000000000..485d92a14 --- /dev/null +++ b/docs/doc/75917cee-6bd6-49b8-9390-93ec47bdaa44.json @@ -0,0 +1,10 @@ +{ + "summary": "The code introduces a video action segmentation dataset that utilizes breakfast, 50salads, and gtea datasets. The pre-training model's extracted features are used for the dataset. The dataset tree and data tree structure are provided, along with details of their folder contents.", + "details": [ + { + "comment": "The code introduces a video action segmentation dataset that utilizes breakfast, 50salads, and gtea datasets. The pre-training model's extracted features are used for the dataset. The dataset tree and data tree structure are provided, along with details of their folder contents.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/SegmentationDataset.md\":0-34", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/SegmentationDataset.md)\n# Video Action Segmentation Dataset\nThe video motion segmentation model uses breakfast, 50salads and gtea data sets. 
The use method is to use the features extracted by the pre training model, which can be obtained from the ms-tcn official code base.[feat](https://zenodo.org/record/3625992#.Xiv9jGhKhPY)\n- Dataset tree\n```txt\n\u2500\u2500\u2500 gtea\n \u251c\u2500\u2500 features\n \u2502 \u251c\u2500\u2500 S1_Cheese_C1.npy\n \u2502 \u251c\u2500\u2500 S1_Coffee_C1.npy\n \u2502 \u251c\u2500\u2500 S1_CofHoney_C1.npy\n \u2502 \u2514\u2500\u2500 ...\n \u251c\u2500\u2500 groundTruth\n \u2502 \u251c\u2500\u2500 S1_Cheese_C1.txt\n \u2502 \u251c\u2500\u2500 S1_Coffee_C1.txt\n \u2502 \u251c\u2500\u2500 S1_CofHoney_C1.txt\n \u2502 \u2514\u2500\u2500 ...\n \u251c\u2500\u2500 splits\n \u2502 \u251c\u2500\u2500 test.split1.bundle\n \u2502 \u251c\u2500\u2500 test.split2.bundle\n \u2502 \u251c\u2500\u2500 test.split3.bundle\n \u2502 \u2514\u2500\u2500 ...\n \u2514\u2500\u2500 mapping.txt\n```\n- data tree\n```txt\n\u2500\u2500\u2500 data\n \u251c\u2500\u2500 50salads\n \u251c\u2500\u2500 breakfast\n \u251c\u2500\u2500 gtea\n \u2514\u2500\u2500 ...\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/75ac2cd6-4dbc-4b74-9b97-d5f8a78af490.json b/docs/doc/75ac2cd6-4dbc-4b74-9b97-d5f8a78af490.json new file mode 100644 index 000000000..b74e0a8d2 --- /dev/null +++ b/docs/doc/75ac2cd6-4dbc-4b74-9b97-d5f8a78af490.json @@ -0,0 +1,10 @@ +{ + "summary": "This code provides usage instructions for various tools in PaddleVideo. It shows how to retrieve model parameters, calculate FLOPs, and test an exported model (coming soon). The code examples use Python 3.7 and require specific configuration files.", + "details": [ + { + "comment": "This code provides usage instructions for various tools in PaddleVideo. It shows how to retrieve model parameters, calculate FLOPs, and test an exported model (coming soon). The code examples use Python 3.7 and require specific configuration files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tools.md\":0-21", + "content": "[\u7b80\u4f53\u4e2d\u6587](../zh-CN/tools.md) | English\n# Tools\nThis page includes the usage of some useful tools in PaddleVideo.\n## Params\nTo get the params of a model.\n```shell\npython3.7 tools/summary.py -c configs/recognization/tsm/tsm.yaml\n```\n## FLOPS\nto print FLOPs.\n```shell\npython3.7 tools/summary.py -c configs/recognization/tsm/tsm.yaml --FLOPs\n```\n## Test the export model coming soon" + } + ] +} \ No newline at end of file diff --git a/docs/doc/75dbef95-cd79-46a1-83f1-5311534490bc.json b/docs/doc/75dbef95-cd79-46a1-83f1-5311534490bc.json new file mode 100644 index 000000000..aee2c63d2 --- /dev/null +++ b/docs/doc/75dbef95-cd79-46a1-83f1-5311534490bc.json @@ -0,0 +1,15 @@ +{ + "summary": "The code implements the CFBI Video Object Segmentation model proposed by Baidu in ECCV 2020, considering background and foreground for segmentation, predicting on current frames given reference frame and previous frame. It follows DAVIS guidelines, uses \"cfbip_davis.yaml\" configuration file, pretrained weights \"CFBIp_davis.pdparams\", saves predictions to \"result_root\", provides evaluation metrics including J&F-Mean, and references checkpoint file \"CFBIp_r101_davis.pdparams\".", + "details": [ + { + "comment": "This code describes the CFBI Video Object Segmentation model, proposed by Baidu in ECCV 2020. It considers background as important as foreground and uses collaborative integration for segmentation. The model predicts segmentation of current frames given reference frame and previous frame. 
Data preparation follows DAVIS guidelines.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/cfbi.md\":0-28", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/cfbi.md) | English\n# CFBI\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Test](#Test)\n- [Reference](#Reference)\n## Introduction\nCFBI is a Video Object Segmentation model proposed by Baidu in ECCV 2020. This method consider background should be equally treated and thus propose Collaborative video object segmentation by Foreground-Background Integration (CFBI) approach. Our CFBI implicitly imposes the feature embedding from the target foreground object and its corresponding background to be contrastive, promoting the segmentation results accordingly. Given the image and target segmentation of the reference frame (the first frame) and the previous frame, the model will predict the segmentation of the current frame.\n
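A registry of this kind is, at its core, a dictionary mapping a string key such as "TSM" or "BMN" to a reader class. The following is a minimal, self-contained sketch of how a `regist_reader`/`get_reader` pair typically works; the real helpers live in `reader/reader_utils.py`, so these definitions are only illustrative stand-ins.

```python
# Illustrative registry sketch (the actual helpers are in reader/reader_utils.py).
READER_REGISTRY = {}

def regist_reader(name, reader_cls):
    """Register a reader class under a string key such as 'TSM' or 'BMN'."""
    READER_REGISTRY[name] = reader_cls

def get_reader(name, *args, **kwargs):
    """Instantiate the reader registered under `name`."""
    if name not in READER_REGISTRY:
        raise KeyError(f"no reader registered for '{name}'")
    return READER_REGISTRY[name](*args, **kwargs)

class DummyReader:
    """Stand-in for a real reader such as TSMINFReader."""
    def __init__(self, cfg=None):
        self.cfg = cfg

regist_reader("TSM", DummyReader)
print(type(get_reader("TSM", cfg={"seg_num": 8})).__name__)  # DummyReader
```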
\n
\n
\n## Data\nPlease refer to DAVIS data download and preparation doc [DAVIS-data](../../dataset/davis.md)\n## Test\n- Test scripts:" + }, + { + "comment": "This code is running a segmentation model trained using the \"cfbip_davis.yaml\" configuration file, and testing it with pretrained weights stored in \"CFBIp_davis.pdparams\". The predicted results will be saved to the \"result_root\" directory. Evaluation metrics for this model on DAVIS dataset are provided, including J&F-Mean, J-Mean, J-Recall, J-Decay, F-Mean, F-Recall and F-Decay. The checkpoint file is referenced as \"CFBIp_r101_davis.pdparams\" which can be found at the provided URL.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/cfbi.md\":30-45", + "content": "```bash\npython3.7 main.py --test -c configs/segmentation/cfbip_davis.yaml -w CFBIp_davis.pdparams\n```\n- Predicted results will be saved in `result_root`. To get evaluation metrics, please use [davis2017-evaluation tools](https://github.com/davisvideochallenge/davis2017-evaluation).\nMetrics on DAVIS:\n| J&F-Mean | J-Mean | J-Recall | J-Decay | F-Mean | F-Recall | F-Decay | checkpoints |\n| :------: | :-----: | :----: | :----: | :----: | :----: | :----: | :----: |\n| 0.823 | 0.793 | 0.885 | 0.083 | 0.852 | 0.932 | 0.100 | [CFBIp_r101_davis.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/CFBIp_r101_davis.pdparams) |\n## Reference\n- [Collaborative Video Object Segmentation by Foreground-Background Integration](https://arxiv.org/abs/2003.08333), Zongxin Yang, Yunchao Wei, Yi Yang" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7603bda9-6e6d-4af5-b5f3-5ff70b5f3eeb.json b/docs/doc/7603bda9-6e6d-4af5-b5f3-5ff70b5f3eeb.json new file mode 100644 index 000000000..5a6739b7a --- /dev/null +++ b/docs/doc/7603bda9-6e6d-4af5-b5f3-5ff70b5f3eeb.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports and registers various readers for different formats (TSM, PPTSM, AUDIO, BMN, ACTION) to read map files for the model. The readers are registered in alphabetical order.", + "details": [ + { + "comment": "This code imports and registers various readers for different formats (TSM, PPTSM, AUDIO, BMN, ACTION) to read map files for the model. 
The readers are registered in alphabetical order.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/__init__.py\":0-14", + "content": "\"\"\"\nread map for model\n\"\"\"\nfrom reader.reader_utils import regist_reader, get_reader\nimport reader.tsminf_reader as tsminf_reader\nimport reader.audio_reader as audio_reader\nimport reader.bmninf_reader as bmninf_reader\nimport reader.feature_reader as feature_reader\n# regist reader, sort by alphabet\nregist_reader(\"TSM\", tsminf_reader.TSMINFReader)\nregist_reader(\"PPTSM\", tsminf_reader.TSMINFReader)\nregist_reader(\"AUDIO\", audio_reader.AudioReader)\nregist_reader(\"BMN\", bmninf_reader.BMNINFReader)\nregist_reader(\"ACTION\", feature_reader.FeatureReader)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7660795f-9adf-4ac1-95f7-99eb92def73f.json b/docs/doc/7660795f-9adf-4ac1-95f7-99eb92def73f.json new file mode 100644 index 000000000..3d81edf3d --- /dev/null +++ b/docs/doc/7660795f-9adf-4ac1-95f7-99eb92def73f.json @@ -0,0 +1,45 @@ +{ + "summary": "This code defines a ResNet architecture with batch normalization and ReLU activation functions, featuring output strides of 16 or 8, multiple blocks, residual connections through convolutional layers, a residual block for ResNet-101, and optional pretrained model loading on ImageNet.", + "details": [ + { + "comment": "This code defines a Bottleneck class for ResNet backbone, which contains three 2D convolutional layers and two batch normalization layers. It has an expansion factor of 4. The convolutional layers have configurable parameters such as inplanes, planes, stride, dilation, and downsample.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":0-32", + "content": "import math\nimport paddle.nn as nn\n# from reprod_log.utils import paddle2np\nimport paddle\nfrom utils.api import normal_, fill_, zero_\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self,\n inplanes,\n planes,\n stride=1,\n dilation=1,\n downsample=None,\n BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)\n self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes,\n planes * 4,\n kernel_size=1,\n bias_attr=False)" + }, + { + "comment": "This code defines a ResNet architecture with BatchNorm, ReLU activation functions, and downsample layers. It allows for different output strides (16 or 8) and has multiple blocks (1, 2, 4). 
The forward function performs residual connections and applies the appropriate number of convolutional layers based on block specifications.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":33-76", + "content": " self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass ResNet(nn.Layer):\n def __init__(self,\n block,\n layers,\n output_stride,\n BatchNorm,\n pretrained=False):\n self.inplanes = 64\n super(ResNet, self).__init__()\n blocks = [1, 2, 4]\n if output_stride == 16:\n strides = [1, 2, 2, 1]\n dilations = [1, 1, 1, 2]\n elif output_stride == 8:\n strides = [1, 2, 1, 1]" + }, + { + "comment": "This code is initializing a ResNet backbone. It defines convolutional layers, batch normalization, and pooling layers followed by multiple residual blocks. Dilation rates are implemented for the blocks. If an unsupported option is chosen, it raises a NotImplementedError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":77-102", + "content": " dilations = [1, 1, 2, 4]\n else:\n raise NotImplementedError\n # Modules\n self.conv1 = nn.Conv2D(3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block,\n 64,\n layers[0],\n stride=strides[0],\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block,\n 128,\n layers[1],\n stride=strides[1],\n dilation=dilations[1]," + }, + { + "comment": "This code defines a ResNet network with multiple layers and blocks, using BatchNormalization for normalization. It also includes an optional pretrained model loading functionality.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":103-125", + "content": " BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block,\n 256,\n layers[2],\n stride=strides[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.layer4 = self._make_MG_unit(block,\n 512,\n blocks=blocks,\n stride=strides[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n # self.layer4 = self._make_layer(block, 512, layers[3], stride=strides[3], dilation=dilations[3], BatchNorm=BatchNorm)\n self._init_weight()\n if pretrained:\n self._load_pretrained_model()\n def _make_layer(self,\n block,\n planes,\n blocks," + }, + { + "comment": "This code defines a function to create a residual block for a ResNet network with specific parameters such as number of blocks, stride, dilation rate, and BatchNorm layer. 
It returns a Sequential model containing the block layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":126-156", + "content": " stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes, planes, stride, dilation, downsample,\n BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(self.inplanes,\n planes,\n dilation=dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_MG_unit(self,\n block,\n planes," + }, + { + "comment": "This code defines a residual block for ResNet using the input planes, number of blocks, and other parameters. It creates a downsampling layer if necessary and then appends multiple instances of the given block to form the final residual block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":157-185", + "content": " blocks,\n stride=1,\n dilation=1,\n BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes,\n planes * block.expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(\n block(self.inplanes,\n planes,\n stride,\n dilation=blocks[0] * dilation,\n downsample=downsample,\n BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, len(blocks)):\n layers.append(\n block(self.inplanes,\n planes,\n stride=1," + }, + { + "comment": "The code defines a ResNet network with multiple layers, including convolution, batch normalization, and pooling. The forward function performs inference by passing the input through each layer sequentially. The _init_weight function initializes the weights of the network using either Xavier or Gaussian distribution, depending on the type of the layer. The _load_pretrained_model function loads a pre-trained model from a specified file path, but it is currently empty and marked as TODO.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":186-219", + "content": " dilation=blocks[i] * dilation,\n BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def forward(self, input):\n # print('input:', input.mean().item())\n x = self.conv1(input)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)\n low_level_feat = x\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x, low_level_feat\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n fill_(m.weight, 1)\n # normal_(m.weight, 0, math.sqrt(2. / n))\n elif isinstance(m, nn.BatchNorm2D):\n fill_(m.weight, 1)\n zero_(m.bias)\n return self.sublayers()\n def _load_pretrained_model(self):\n # TODO\n pretrain_dict = paddle.load(\n '/home/lc/manet/manet_paddle/model_best.pdparams.tar')" + }, + { + "comment": "This code defines a ResNet-101 model function that takes output stride, BatchNorm flag, and pretrained option as arguments. 
It creates a ResNet model with Bottleneck blocks, layers, output stride, and BatchNorm implementation. If pretrained is set to True, the function returns a pre-trained model on ImageNet. The code also updates the model's state dictionary by merging pretrain_dict into the state dict.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/resnet.py\":220-238", + "content": " model_dict = {}\n state_dict = self.state_dict()\n for k, v in pretrain_dict.items():\n if k in state_dict:\n model_dict[k] = v\n state_dict.update(model_dict)\n self.set_state_dict(state_dict)\ndef ResNet101(output_stride, BatchNorm, pretrained=False):\n \"\"\"Constructs a ResNet-101 model.\n Args:\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n \"\"\"\n model = ResNet(Bottleneck, [3, 4, 23, 3],\n output_stride,\n BatchNorm,\n pretrained=pretrained)\n return model" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7669eb12-1b11-4ab0-bde0-dcc25e0cbfd4.json b/docs/doc/7669eb12-1b11-4ab0-bde0-dcc25e0cbfd4.json new file mode 100644 index 000000000..624097abc --- /dev/null +++ b/docs/doc/7669eb12-1b11-4ab0-bde0-dcc25e0cbfd4.json @@ -0,0 +1,115 @@ +{ + "summary": "The code defines a MoViNet model configuration with MobileNetV2 layers and parameters, constructing CNN layers and a MoViNet backbone class for video analysis. The model is configurable and can be causal or non-causal based on the 'causal' parameter.", + "details": [ + { + "comment": "This code contains the configuration for a MOViNet model. It specifies the number of blocks, convolutional layers, and filter sizes for each stage of the network. The configuration is stored in a dictionary format with keys like 'A0', 'b2_l0', and so on, representing different parts of the model architecture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":0-26", + "content": "import collections.abc\nfrom itertools import repeat\nfrom typing import Any, Callable, Optional, Tuple, Union\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.layer import Identity\nfrom ..registry import BACKBONES\nfrom collections import OrderedDict\ncontainer_abcs = collections.abc\n\"\"\"Model Config\n\"\"\"\nA0 = {'block_num': [0, 1, 3, 3, 4, 4]}\nA0['conv1'] = [3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1)]\nA0['b2_l0'] = [8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1)]\nA0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)]\nA0['b3_l1'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b3_l2'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b4_l0'] = [32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0)]\nA0['b4_l1'] = [56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b4_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b5_l0'] = [56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1)]\nA0['b5_l1'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]" + }, + { + "comment": "This code defines a model architecture for the MobileNetV2 network, specifying the number of filters, kernel sizes, and stride values at each layer. The `_ntuple` function parses layer configurations, and `_make_divisible` ensures all layers have a divisible channel number. 
The dictionary `MODEL_CONFIG` stores these configuration parameters for the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":27-54", + "content": "A0['b5_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b5_l3'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]\nA0['b6_l0'] = [56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1)]\nA0['b6_l1'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]\nA0['b6_l2'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]\nA0['b6_l3'] = [104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]\nA0['conv7'] = [104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0)]\nMODEL_CONFIG = {'A0': A0}\ndef _ntuple(n):\n def parse(x):\n if isinstance(x, container_abcs.Iterable):\n return x\n return tuple(repeat(x, n))\n return parse\ndef _make_divisible(v: float,\n divisor: int,\n min_value: Optional[int] = None) -> int:\n \"\"\"\n This function is taken from the original tf repo.\n It ensures that all layers have a channel number that is divisible by 8.\n It can be seen here:\n https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py" + }, + { + "comment": "The code defines a CausalModule class that contains an activation layer and resets it when needed. Conv2dBNActivation is a Sequential module with optional normalization and activation layers, used in the construction of the MoviNet backbone model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":55-93", + "content": " \"\"\"\n if min_value is None:\n min_value = divisor\n new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n # Make sure that round down does not go down by more than 10%.\n if new_v < 0.9 * v:\n new_v += divisor\n return new_v\n_single = _ntuple(1)\n_pair = _ntuple(2)\n_triple = _ntuple(3)\n_quadruple = _ntuple(4)\nclass CausalModule(nn.Layer):\n def __init__(self) -> None:\n super().__init__()\n self.activation = None\n def reset_activation(self) -> None:\n self.activation = None\nclass Conv2dBNActivation(nn.Sequential):\n def __init__(\n self,\n in_planes: int,\n out_planes: int,\n kernel_size: Union[int, Tuple[int, int]],\n padding: Union[int, Tuple[int, int]],\n stride: Union[int, Tuple[int, int]] = 1,\n groups: int = 1,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n **kwargs: Any,\n ) -> None:\n kernel_size = _pair(kernel_size)" + }, + { + "comment": "This code defines two classes, `Conv2dBNActivation` and `Conv3DBNActivation`, which are convolutional neural network layers with batch normalization and activation functions. The layers have adjustable input (in_planes), output (out_planes), kernel size, stride, padding, and groups parameters. 
The batch normalization layer uses a momentum of 0.1, and the activation function is an Identity function by default but can be overridden with another specified activation function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":94-120", + "content": " stride = _pair(stride)\n padding = _pair(padding)\n if norm_layer is None:\n norm_layer = Identity\n if activation_layer is None:\n activation_layer = Identity\n self.kernel_size = kernel_size\n self.stride = stride\n dict_layers = (nn.Conv2D(in_planes,\n out_planes,\n kernel_size=kernel_size,\n stride=stride,\n padding=padding,\n groups=groups,\n **kwargs), norm_layer(out_planes,\n momentum=0.1),\n activation_layer())\n self.out_channels = out_planes\n super(Conv2dBNActivation, self).__init__(dict_layers[0], dict_layers[1],\n dict_layers[2])\nclass Conv3DBNActivation(nn.Sequential):\n def __init__(\n self,\n in_planes: int," + }, + { + "comment": "This function is creating a Conv3D layer with specified parameters, including the number of input and output planes, kernel size, padding, stride, groups, and optional norm and activation layers. The function also ensures that the input values for kernel_size, stride, and padding are correctly formatted as triples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":121-146", + "content": " out_planes: int,\n kernel_size: Union[int, Tuple[int, int, int]],\n padding: Union[int, Tuple[int, int, int]],\n stride: Union[int, Tuple[int, int, int]] = 1,\n groups: int = 1,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n **kwargs: Any,\n ) -> None:\n kernel_size = _triple(kernel_size)\n stride = _triple(stride)\n padding = _triple(padding)\n if norm_layer is None:\n norm_layer = Identity\n if activation_layer is None:\n activation_layer = Identity\n self.kernel_size = kernel_size\n self.stride = stride\n dict_layers = (nn.Conv3D(in_planes,\n out_planes,\n kernel_size=kernel_size,\n stride=stride,\n padding=padding,\n groups=groups,\n **kwargs), norm_layer(out_planes," + }, + { + "comment": "The code defines a class named `ConvBlock3D` as a subclass of `CausalModule`. It takes inputs such as the number of input and output planes, kernel size, causality status, convolution type, padding, stride, normalization layer, activation layer, bias attribute, and optional keyword arguments. 
It initializes the class variables and creates an instance of the `Conv3DBNActivation` class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":147-176", + "content": " momentum=0.1),\n activation_layer())\n self.out_channels = out_planes\n super(Conv3DBNActivation, self).__init__(dict_layers[0], dict_layers[1],\n dict_layers[2])\nclass ConvBlock3D(CausalModule):\n def __init__(\n self,\n in_planes: int,\n out_planes: int,\n kernel_size: Union[int, Tuple[int, int, int]],\n causal: bool,\n conv_type: str,\n padding: Union[int, Tuple[int, int, int]] = 0,\n stride: Union[int, Tuple[int, int, int]] = 1,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n bias_attr: bool = False,\n **kwargs: Any,\n ) -> None:\n super().__init__()\n kernel_size = _triple(kernel_size)\n stride = _triple(stride)\n padding = _triple(padding)\n self.conv_2 = None\n if causal is True:\n padding = (0, padding[1], padding[2])" + }, + { + "comment": "This code is checking the convolution type and raising a ValueError if it's neither \"2plus1d\" nor \"3d\". If the type is \"2plus1d\", it initializes two Conv2dBNActivation layers with appropriate parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":177-195", + "content": " if conv_type != \"2plus1d\" and conv_type != \"3d\":\n raise ValueError(\"only 2plus2d or 3d are \" +\n \"allowed as 3d convolutions\")\n if conv_type == \"2plus1d\":\n self.conv_1 = Conv2dBNActivation(in_planes,\n out_planes,\n kernel_size=(kernel_size[1],\n kernel_size[2]),\n padding=(padding[1], padding[2]),\n stride=(stride[1], stride[2]),\n activation_layer=activation_layer,\n norm_layer=norm_layer,\n bias_attr=bias_attr,\n **kwargs)\n if kernel_size[0] > 1:\n self.conv_2 = Conv2dBNActivation(\n in_planes,\n out_planes," + }, + { + "comment": "The code defines a layer with different convolution types (\"2d\" or \"3d\") and initializes the corresponding Conv2D or Conv3D layers with specified parameters such as input/output planes, kernel size, padding, activation layer, norm layer, stride, bias attribute and other keyword arguments. It also stores the padding and kernel size for future use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":196-215", + "content": " kernel_size=(kernel_size[0], 1),\n padding=(padding[0], 0),\n stride=(stride[0], 1),\n activation_layer=activation_layer,\n norm_layer=norm_layer,\n bias_attr=bias_attr,\n **kwargs)\n elif conv_type == \"3d\":\n self.conv_1 = Conv3DBNActivation(in_planes,\n out_planes,\n kernel_size=kernel_size,\n padding=padding,\n activation_layer=activation_layer,\n norm_layer=norm_layer,\n stride=stride,\n bias_attr=bias_attr,\n **kwargs)\n self.padding = padding\n self.kernel_size = kernel_size\n self.dim_pad = self.kernel_size[0] - 1" + }, + { + "comment": "This code defines a class with an attribute `_forward` method. The constructor takes stride, causal, and conv_type as parameters. If causal is True, stream buffer is concatenated to the input tensor. Depending on conv_type, the tensor shape may be reshaped for proper processing. 
Finally, if conv_2 is not None, it applies a convolution operation to the tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":216-237", + "content": " self.stride = stride\n self.causal = causal\n self.conv_type = conv_type\n def _forward(self, x: paddle.Tensor) -> paddle.Tensor:\n if self.dim_pad > 0 and self.conv_2 is None and self.causal is True:\n x = self._cat_stream_buffer(x)\n b, c, t, h, w = x.shape\n if self.conv_type == \"2plus1d\":\n x = paddle.transpose(x, (0, 2, 1, 3, 4)) # bcthw --> btchw\n x = paddle.reshape_(x, (-1, c, h, w)) # btchw --> bt,c,h,w\n x = self.conv_1(x)\n if self.conv_type == \"2plus1d\":\n b, c, h, w = x.shape\n x = paddle.reshape_(x, (-1, t, c, h, w)) # bt,c,h,w --> b,t,c,h,w\n x = paddle.transpose(x, (0, 2, 1, 3, 4)) # b,t,c,h,w --> b,c,t,h,w\n if self.conv_2 is not None:\n if self.dim_pad > 0 and self.causal is True:\n x = self._cat_stream_buffer(x)\n b, c, t, h, w = x.shape\n x = paddle.reshape_(x, (b, c, t, h * w))\n x = self.conv_2(x)" + }, + { + "comment": "1. Reshapes input tensor to (b, c, t, h, w).\n2. Defines a forward function that applies the _forward function and returns the result.\n3. Concatenates the activation tensor with the input along dimension 2.\n4. Saves the last self.dim_pad rows of the input in the activation tensor.\n5. Sets up the activation tensor with zeros and self.dim_pad rows for future use.\n6. TemporalCGAvgPool3D is a CausalModule class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":238-268", + "content": " b, c, t, _ = x.shape\n x = paddle.reshape_(x, (b, c, t, h, w))\n return x\n def forward(self, x: paddle.Tensor) -> paddle.Tensor:\n x = self._forward(x)\n return x\n def _cat_stream_buffer(self, x: paddle.Tensor) -> paddle.Tensor:\n if self.activation is None:\n self._setup_activation(x.shape)\n x = paddle.concat((self.activation, x), 2)\n self._save_in_activation(x)\n return x\n def _save_in_activation(self, x: paddle.Tensor) -> None:\n assert self.dim_pad > 0\n self.activation = paddle.to_tensor(x.numpy()[:, :, -self.dim_pad:,\n ...]).clone().detach()\n def _setup_activation(self, input_shape: Tuple[float, ...]) -> None:\n assert self.dim_pad > 0\n self.activation = paddle.zeros(shape=[\n *input_shape[:2], # type: ignore\n self.dim_pad,\n *input_shape[3:]\n ])\nclass TemporalCGAvgPool3D(CausalModule):\n def __init__(self, ) -> None:" + }, + { + "comment": "The code defines a forward function for a CausalModule that performs cumulative sum operation on input tensor. 
It also includes methods to detach and reset the activation tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":269-295", + "content": " super().__init__()\n self.n_cumulated_values = 0\n self.register_forward_post_hook(self._detach_activation)\n def forward(self, x: paddle.Tensor) -> paddle.Tensor:\n input_shape = x.shape\n cumulative_sum = paddle.cumsum(x, axis=2)\n if self.activation is None:\n self.activation = cumulative_sum[:, :, -1:].clone()\n else:\n cumulative_sum += self.activation\n self.activation = cumulative_sum[:, :, -1:].clone()\n noe = paddle.arange(1, input_shape[2] + 1)\n axis = paddle.to_tensor([0, 1, 3, 4])\n noe = paddle.unsqueeze(noe, axis=axis)\n divisor = noe.expand(x.shape)\n x = cumulative_sum / (self.n_cumulated_values + divisor)\n self.n_cumulated_values += input_shape[2]\n return x\n @staticmethod\n def _detach_activation(module: CausalModule, inputs: paddle.Tensor,\n output: paddle.Tensor) -> None:\n module.activation.detach()\n def reset_activation(self) -> None:" + }, + { + "comment": "This code defines a SqueezeExcitation layer class with input channels, activation functions, convolution type, causality flag, and squeeze factor as parameters. It initializes the layer by setting the causal flag's multiplier, dividing the input channel count by the squeeze factor, rounding up to 8 using make_divisible function, and adding temporal cumulative average pooling and convolution blocks with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":296-321", + "content": " super().reset_activation()\n self.n_cumulated_values = 0\nclass SqueezeExcitation(nn.Layer):\n def __init__(self,\n input_channels: int,\n activation_2: nn.Layer,\n activation_1: nn.Layer,\n conv_type: str,\n causal: bool,\n squeeze_factor: int = 4,\n bias_attr: bool = True) -> None:\n super().__init__()\n self.causal = causal\n se_multiplier = 2 if causal else 1\n squeeze_channels = _make_divisible(\n input_channels // squeeze_factor * se_multiplier, 8)\n self.temporal_cumualtive_GAvg3D = TemporalCGAvgPool3D()\n self.fc1 = ConvBlock3D(input_channels * se_multiplier,\n squeeze_channels,\n kernel_size=(1, 1, 1),\n padding=0,\n causal=causal,\n conv_type=conv_type,\n bias_attr=bias_attr)" + }, + { + "comment": "The code defines a class with two activation functions, and a _scale method that scales the input tensor based on temporal average or average pooling. 
The forward method applies the scale to the input for spatial pyramid pooling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":322-346", + "content": " self.activation_1 = activation_1()\n self.activation_2 = activation_2()\n self.fc2 = ConvBlock3D(squeeze_channels,\n input_channels,\n kernel_size=(1, 1, 1),\n padding=0,\n causal=causal,\n conv_type=conv_type,\n bias_attr=bias_attr)\n def _scale(self, inputs: paddle.Tensor) -> paddle.Tensor:\n if self.causal:\n x_space = paddle.mean(inputs, axis=[3, 4], keepdim=True)\n scale = self.temporal_cumualtive_GAvg3D(x_space)\n scale = paddle.concat((scale, x_space), axis=1)\n else:\n scale = F.adaptive_avg_pool3d(inputs, 1)\n scale = self.fc1(scale)\n scale = self.activation_1(scale)\n scale = self.fc2(scale)\n return self.activation_2(scale)\n def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:\n scale = self._scale(inputs)\n return scale * inputs" + }, + { + "comment": "This code defines the BasicBneck class which is a neural network layer. It has multiple parameters such as input_channels, out_channels, expanded_channels, kernel_size, stride, padding, padding_avg, causal, conv_type, norm_layer, and activation_layer. If expanded_channels is not equal to out_channels, it will first expand the channels using ConvBlock3D. The class also checks for illegal stride values to prevent unexpected behavior.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":349-381", + "content": "class BasicBneck(nn.Layer):\n def __init__(\n self,\n input_channels,\n out_channels,\n expanded_channels,\n kernel_size,\n stride,\n padding,\n padding_avg,\n causal: bool,\n conv_type: str,\n norm_layer: Optional[Callable[..., nn.Layer]] = None,\n activation_layer: Optional[Callable[..., nn.Layer]] = None,\n ) -> None:\n super().__init__()\n assert type(stride) is tuple\n if (not stride[0] == 1 or not (1 <= stride[1] <= 2)\n or not (1 <= stride[2] <= 2)):\n raise ValueError('illegal stride value')\n self.res = None\n layers = []\n if expanded_channels != out_channels:\n # expand\n self.expand = ConvBlock3D(in_planes=input_channels,\n out_planes=expanded_channels,\n kernel_size=(1, 1, 1),\n padding=(0, 0, 0),\n causal=causal," + }, + { + "comment": "This code defines a ConvBlock3D for MoviNet backbone, includes deepwise convolution and SE (Squeeze Excitation) layers. These components process 3D feature maps with various configurations depending on the input planes, kernel size, stride, padding, etc., applying different activation functions based on the conv_type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":382-403", + "content": " conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # deepwise\n self.deep = ConvBlock3D(in_planes=expanded_channels,\n out_planes=expanded_channels,\n kernel_size=kernel_size,\n padding=padding,\n stride=stride,\n groups=expanded_channels,\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # SE\n self.se = SqueezeExcitation(\n expanded_channels,\n causal=causal,\n activation_1=activation_layer,\n activation_2=(nn.Sigmoid if conv_type == \"3d\" else nn.Hardsigmoid),\n conv_type=conv_type)" + }, + { + "comment": "This code defines a ConvBlock3D for projecting the input channels to the desired output channels. 
If the stride is not (1, 1, 1) or input and output channels are different, it adds an average pooling layer and another ConvBlock3D with appropriate parameters. The causal parameter determines if causal convolution should be used, and Identity activation layer is applied without any transformation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":404-426", + "content": " # project\n self.project = ConvBlock3D(expanded_channels,\n out_channels,\n kernel_size=(1, 1, 1),\n padding=(0, 0, 0),\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=Identity)\n if not (stride == (1, 1, 1) and input_channels == out_channels):\n if stride != (1, 1, 1):\n layers.append(\n nn.AvgPool3D((1, 3, 3), stride=stride, padding=padding_avg))\n layers.append(\n ConvBlock3D(\n in_planes=input_channels,\n out_planes=out_channels,\n kernel_size=(1, 1, 1),\n padding=(0, 0, 0),\n norm_layer=norm_layer,\n activation_layer=Identity,\n causal=causal," + }, + { + "comment": "The code defines the MoViNet class, which is a backbone model for video analysis. It initializes layers based on input parameters and then performs feature extraction using the defined layers. The forward method applies residual connections and a scale factor to combine the extracted features with the input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":427-463", + "content": " conv_type=conv_type,\n ))\n self.res = nn.Sequential(*layers)\n self.alpha = self.create_parameter(shape=[1], dtype=\"float32\")\n def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:\n if self.res is not None:\n residual = self.res(inputs)\n else:\n residual = inputs\n if self.expand is not None:\n x = self.expand(inputs)\n else:\n x = inputs\n x = self.deep(x)\n x = self.se(x)\n x = self.project(x)\n result = residual + self.alpha * x\n return result\n@BACKBONES.register()\nclass MoViNet(nn.Layer):\n def __init__(\n self,\n model_type: str = 'A0',\n hidden_dim: int = 2048,\n causal: bool = True,\n num_classes: int = 400,\n conv_type: str = \"3d\",\n ) -> None:\n super().__init__()\n \"\"\"\n causal: causal mode\n num_classes: number of classes for classifcation\n conv_type: type of convolution either 3d or 2plus1d" + }, + { + "comment": "The code defines a MOViNet model, which consists of a ConvBlock3D (conv1) and multiple BasicBneck blocks. It takes in parameters such as the number of input and output planes, kernel size, stride, padding, causal flag, conv type, norm layer, and activation layer. These parameters are extracted from the MODEL_CONFIG dictionary based on the model type. 
The blocks are organized in an OrderedDict called blocks_dic for future reference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":464-486", + "content": " \"\"\"\n blocks_dic = OrderedDict()\n cfg = MODEL_CONFIG[model_type]\n norm_layer = nn.BatchNorm3D if conv_type == \"3d\" else nn.BatchNorm2D\n activation_layer = nn.Swish if conv_type == \"3d\" else nn.Hardswish\n # conv1\n self.conv1 = ConvBlock3D(in_planes=cfg['conv1'][0],\n out_planes=cfg['conv1'][1],\n kernel_size=cfg['conv1'][2],\n stride=cfg['conv1'][3],\n padding=cfg['conv1'][4],\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # blocks\n for i in range(2, len(cfg['block_num']) + 1):\n for j in range(cfg['block_num'][i - 1]):\n blocks_dic[f'b{i}_l{j}'] = BasicBneck(\n cfg[f'b{i}_l{j}'][0],\n cfg[f'b{i}_l{j}'][1]," + }, + { + "comment": "This code is defining a MOViNet model with specific configurations for blocks, convolutional layers, and pooling operations. It initializes the blocks as sequential layers and adds an additional 3D ConvBlock layer ('conv7') followed by a classifier.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":487-509", + "content": " cfg[f'b{i}_l{j}'][2],\n cfg[f'b{i}_l{j}'][3],\n cfg[f'b{i}_l{j}'][4],\n cfg[f'b{i}_l{j}'][5],\n cfg[f'b{i}_l{j}'][6],\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n self.blocks = nn.Sequential(*(blocks_dic.values()))\n # conv7\n self.conv7 = ConvBlock3D(in_planes=cfg['conv7'][0],\n out_planes=cfg['conv7'][1],\n kernel_size=cfg['conv7'][2],\n stride=cfg['conv7'][3],\n padding=cfg['conv7'][4],\n causal=causal,\n conv_type=conv_type,\n norm_layer=norm_layer,\n activation_layer=activation_layer)\n # pool\n self.classifier = nn.Sequential(" + }, + { + "comment": "This code defines a 3D Convolutional Neural Network (CNN) backbone for MoviNet. It includes dense layers, convolution blocks, and optional temporal pooling. The model architecture can be causal or non-causal depending on the `causal` parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":510-538", + "content": " # dense9\n ConvBlock3D(in_planes=cfg['conv7'][1],\n out_planes=hidden_dim,\n kernel_size=(1, 1, 1),\n causal=causal,\n conv_type=conv_type,\n bias_attr=True),\n nn.Swish(),\n nn.Dropout(p=0.2),\n # dense10d\n ConvBlock3D(in_planes=hidden_dim,\n out_planes=num_classes,\n kernel_size=(1, 1, 1),\n causal=causal,\n conv_type=conv_type,\n bias_attr=True),\n )\n if causal:\n self.cgap = TemporalCGAvgPool3D()\n self.apply(self._weight_init)\n self.causal = causal\n def avg(self, x: paddle.Tensor) -> paddle.Tensor:\n if self.causal:\n avg = F.adaptive_avg_pool3d(x, (x.shape[2], 1, 1))\n avg = self.cgap(avg)[:, :, -1:]\n else:\n avg = F.adaptive_avg_pool3d(x, 1)\n return avg" + }, + { + "comment": "The code defines a class for a MoviNet backbone, which performs convolutions and has block layers. The forward function passes the input through these layers and then flattens the result before returning it. A static method initializes the network weights based on the layer type. 
Another static method cleans activation buffers in CausalModule subclasses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":540-571", + "content": " @staticmethod\n def _weight_init(m):\n if isinstance(m, nn.Conv3D):\n nn.initializer.KaimingNormal(m.weight)\n if m.bias is not None:\n nn.initializer.Constant(0.0)(m.bias)\n elif isinstance(m, (nn.BatchNorm3D, nn.BatchNorm2D, nn.GroupNorm)):\n nn.initializer.Constant(1.0)(m.weight)\n nn.initializer.Constant(0.0)(m.bias)\n elif isinstance(m, nn.Linear):\n nn.initializer.Normal(m.weight, 0, 0.01)\n nn.initializer.Constant(0.0)(m.bias)\n def forward(self, x: paddle.Tensor) -> paddle.Tensor:\n x = self.conv1(x)\n x = self.blocks(x)\n x = self.conv7(x)\n x = self.avg(x)\n x = self.classifier(x)\n x = x.flatten(1)\n return x\n @staticmethod\n def _clean_activation_buffers(m):\n if issubclass(type(m), CausalModule):\n m.reset_activation()\n def clean_activation_buffers(self) -> None:\n self.apply(self._clean_activation_buffers)\nif __name__ == '__main__':" + }, + { + "comment": "Creating a MoViNet network instance with causal set to False and 3D convolution type, then generating summary using Paddle's summary function with input size (1, 3, 8, 224, 224).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/movinet.py\":572-573", + "content": " net = MoViNet(causal=False, conv_type='3d')\n paddle.summary(net, input_size=(1, 3, 8, 224, 224))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/772d794a-a336-4a0e-ae99-bd5b4df98427.json b/docs/doc/772d794a-a336-4a0e-ae99-bd5b4df98427.json new file mode 100644 index 000000000..c5c189ad4 --- /dev/null +++ b/docs/doc/772d794a-a336-4a0e-ae99-bd5b4df98427.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file imports two backbone models (ResNet and ResNetTweaksTSM) from their respective modules, and then defines the available models in this module as 'ResNet' and 'ResNetTweaksTSM'.", + "details": [ + { + "comment": "This code file imports two backbone models (ResNet and ResNetTweaksTSM) from their respective modules, and then defines the available models in this module as 'ResNet' and 'ResNetTweaksTSM'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py\":0-19", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .resnet import ResNet\nfrom .resnet_tweaks_tsm import ResNetTweaksTSM\n__all__ = ['ResNet', 'ResNetTweaksTSM']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7759f95e-9948-4737-aaf5-eba6ca9b081c.json b/docs/doc/7759f95e-9948-4737-aaf5-eba6ca9b081c.json new file mode 100644 index 000000000..0a31dd89e --- /dev/null +++ b/docs/doc/7759f95e-9948-4737-aaf5-eba6ca9b081c.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is part of the PaddleVideo library, containing imports and definitions for functions related to dataset building and data loading. It allows users to build datasets, dataloaders, batch pipelines, and utilize the TSN_Dali_loader and get_input_data functions.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library, containing imports and definitions for functions related to dataset building and data loading. It allows users to build datasets, dataloaders, batch pipelines, and utilize the TSN_Dali_loader and get_input_data functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/__init__.py\":0-21", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .builder import build_dataset, build_dataloader, build_batch_pipeline\nfrom .dataset import VideoDataset\nfrom .dali_loader import TSN_Dali_loader, get_input_data\n__all__ = [\n 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset',\n 'TSN_Dali_loader', 'get_input_data'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/77f0b9a0-1a2b-4ea1-924e-36bac16c4347.json b/docs/doc/77f0b9a0-1a2b-4ea1-924e-36bac16c4347.json new file mode 100644 index 000000000..1c26db7e2 --- /dev/null +++ b/docs/doc/77f0b9a0-1a2b-4ea1-924e-36bac16c4347.json @@ -0,0 +1,35 @@ +{ + "summary": "PaddleVideo's SFVideoDataset is a video dataset for action recognition, which extends BaseDataset with index file information and optional parameters. It prepares data for training by setting random seeds, loading index files, and appending entries before handling corrupted videos through retry mechanisms and calculating dataset size.", + "details": [ + { + "comment": "This code snippet is from the PaddleVideo module and defines a class called SFVideoDataset. It is a video dataset for action recognition, loading raw videos and applying specified transforms on them. 
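The dataset is driven by a plain-text index file in which every line holds a file path and a label separated by whitespace. A minimal sketch of that parsing, in the spirit of `load_file` (the helper name and the in-memory sample string are ours):

```python
import os.path as osp

def load_index(text, data_prefix=None):
    info = []
    for line in text.strip().splitlines():
        filename, label = line.strip().split()
        if data_prefix is not None:
            filename = osp.join(data_prefix, filename)
        info.append({"filename": filename, "labels": int(label)})
    return info

sample = "path/000.mp4 1\npath/001.mp4 1\npath/002.mp4 2"
print(load_index(sample, data_prefix="/data"))
```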
The index file contains multiple lines with information about each video. The class extends BaseDataset and registers it in the DATASETS registry. The code also includes license and copyright information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/slowfast_video.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass SFVideoDataset(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates" + }, + { + "comment": "The code defines a class that represents an index file containing paths to video files and their corresponding labels. It takes arguments such as the path to the index file, data transforms pipeline, and optional parameters for ensemble views and spatial crops. It also includes keyword arguments for the BaseDataset class. The super() function is used to call the parent class's constructor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/slowfast_video.py\":31-63", + "content": " a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. 
code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n num_ensemble_views(int): temporal segment when multi-crop test\n num_spatial_crops(int): spatial crop number when multi-crop test\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n num_ensemble_views=1,\n num_spatial_crops=1,\n num_retries=5,\n num_samples_precise_bn=None,\n **kwargs,\n ):\n self.num_ensemble_views = num_ensemble_views\n self.num_spatial_crops = num_spatial_crops\n self.num_retries = num_retries\n self.num_samples_precise_bn = num_samples_precise_bn\n super().__init__(file_path, pipeline, **kwargs)" + }, + { + "comment": "Sets random seed for reproducibility, loads index file to get video information, and appends dictionary entries containing filename, labels, temporal, and spatial sample indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/slowfast_video.py\":64-86", + "content": " #set random seed\n random.seed(0)\n np.random.seed(0)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n filename, labels = line_split\n if self.data_prefix is not None:\n filename = osp.join(self.data_prefix, filename)\n for tidx in range(self.num_ensemble_views):\n for sidx in range(self.num_spatial_crops):\n info.append(\n dict(\n filename=filename,\n labels=int(labels),\n temporal_sample_index=tidx,\n spatial_sample_index=sidx,\n temporal_num_clips=self.num_ensemble_views,\n spatial_num_clips=self.num_spatial_crops," + }, + { + "comment": "The code is responsible for preparing data for training in the context of a video dataset. It handles potential exceptions caused by reading corrupted video files and allows retries to avoid failures. The function takes an index as input, checks if it's a tuple or not, iterates over a specified number of retries, performs data processing using a pipeline, and handles any exceptions that occur during the process. If there are no exceptions, the results are returned; otherwise, the code logs an error message and tries again.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/slowfast_video.py\":87-111", + "content": " ))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n short_cycle = False\n if isinstance(idx, tuple):\n idx, short_cycle_idx = idx\n short_cycle = True\n for ir in range(self.num_retries):\n try:\n #Multi-grid short cycle\n if short_cycle:\n results = copy.deepcopy(self.info[idx])\n results['short_cycle_idx'] = short_cycle_idx\n else:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))" + }, + { + "comment": "The code is implementing a retry mechanism for loading video files. If a corrupted file is encountered, it will attempt to load another random file up to the specified number of retries. If still unsuccessful, it will return an error. 
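The retry pattern is independent of the dataset details and can be sketched on its own: on failure, re-draw a random index and try again, up to a fixed number of attempts. The toy dataset below fails exactly once to show the fallback in action (all names are illustrative).

```python
import random

def prepare_sample(dataset, idx, num_retries=5):
    for attempt in range(num_retries):
        try:
            return dataset[idx]
        except Exception:
            if attempt < num_retries - 1:
                idx = random.randint(0, len(dataset) - 1)   # fall back to another sample
    raise RuntimeError("all retries exhausted")

class FlakyDataset(list):
    """Raises once on the first access, then behaves like a normal list."""
    def __init__(self, items):
        super().__init__(items)
        self._failed_once = False
    def __getitem__(self, i):
        if not self._failed_once:
            self._failed_once = True
            raise IOError("corrupted video")
        return list.__getitem__(self, i)

print(prepare_sample(FlakyDataset(["000.mp4", "001.mp4", "002.mp4"]), 0))  # a retried entry
```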
The function also includes a logging system to report exceptions and progress in retry attempts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/slowfast_video.py\":112-136", + "content": " idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'][0], results['imgs'][1], np.array(\n [results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for test given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'][0], results['imgs'][1], np.array(\n [results['labels']]), np.array([idx])\n def __len__(self):" + }, + { + "comment": "This code calculates the size of the dataset. If num_samples_precise_bn is None, it returns the length of self.info. Otherwise, shuffles self.info and returns the minimum value between num_samples_precise_bn and the length of self.info.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/slowfast_video.py\":137-142", + "content": " \"\"\"get the size of the dataset.\"\"\"\n if self.num_samples_precise_bn is None:\n return len(self.info)\n else:\n random.shuffle(self.info)\n return min(self.num_samples_precise_bn, len(self.info))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/78685746-196c-4dc7-9e9a-64862315217c.json b/docs/doc/78685746-196c-4dc7-9e9a-64862315217c.json new file mode 100644 index 000000000..65fdf05fc --- /dev/null +++ b/docs/doc/78685746-196c-4dc7-9e9a-64862315217c.json @@ -0,0 +1,15 @@ +{ + "summary": "The code sets up an environment for video prediction using PaddleVideo's TableTennis application, initializing an ActionDetection object and loading a model to predict actions and body movements in each video, storing results in a JSON file.", + "details": [ + { + "comment": "This code is setting up an environment for video prediction using the PaddleVideo's TableTennis application. 
It appends the \"action_detect\" directory to the Python path, initializes an ActionDetection object with a configuration file, loads the model, and then iterates through a list of video URLs to predict actions and body movements in each video, storing the results in a JSON file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/predict.py\":0-34", + "content": "import os\nimport sys\nimport json\nsys.path.append('action_detect')\nfrom action import ActionDetection\nif __name__ == '__main__':\n dataset_dir = \"/home/work/datasets/EuroCup2016\"\n model_predict = ActionDetection(cfg_file=\"./configs/configs.yaml\")\n model_predict.load_model()\n video_url = os.path.join(dataset_dir, 'url_val.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n results = []\n for line in lines:\n video_name = line\n print(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results.append({\n 'video_name': line,\n 'bmn_results': bmn_results,\n 'action_results': action_results\n })\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)" + }, + { + "comment": "Writes the data to file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/predict.py\":35-35", + "content": " f.write(data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7935ded9-60ba-4abb-8e9d-09fca22a25d8.json b/docs/doc/7935ded9-60ba-4abb-8e9d-09fca22a25d8.json new file mode 100644 index 000000000..043689a6f --- /dev/null +++ b/docs/doc/7935ded9-60ba-4abb-8e9d-09fca22a25d8.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file contains the Python implementation of segment models in PaddleVideo, including BaseSegment and CFBI classes. It is licensed under the Apache License, Version 2.0. The __all__ variable lists the available segments: BaseSegment and CFBI.", + "details": [ + { + "comment": "This code file contains the Python implementation of segment models in PaddleVideo, including BaseSegment and CFBI classes. It is licensed under the Apache License, Version 2.0. The __all__ variable lists the available segments: BaseSegment and CFBI.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/__init__.py\":0-15", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseSegment\nfrom .cfbi import CFBI\n__all__ = ['BaseSegment', 'CFBI']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/79783676-1913-47e3-83a8-1b2389777349.json b/docs/doc/79783676-1913-47e3-83a8-1b2389777349.json new file mode 100644 index 000000000..1b0ab39a0 --- /dev/null +++ b/docs/doc/79783676-1913-47e3-83a8-1b2389777349.json @@ -0,0 +1,55 @@ +{ + "summary": "The BBoxHeadAVA class generates classification targets, handles dropout, constructs labels, calculates recall/precision, computes losses, and uses a bbox_head for object detection. The code defines \"get_det_bboxes\" and \"multilabel_accuracy\" functions for detecting boxes and computing recall/precision, respectively. Loss is computed using binary cross-entropy with sigmoid activation.", + "details": [ + { + "comment": "This code defines a BBoxHeadAVA class, which is the simplest RoI (region of interest) head with two fully connected layers for classification and regression. The temporal_pool_type and spatial_pool_type parameters allow users to choose different pooling methods, while in_channels specifies the number of input channels for the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":0-31", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle \nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom .. import builder\nfrom ..registry import HEADS\n@HEADS.register()\nclass BBoxHeadAVA(nn.Layer):\n \"\"\"Simplest RoI head, with only two fc layers for classification and\n regression respectively. \"\"\"\n def __init__(\n self,\n temporal_pool_type='avg',\n spatial_pool_type='max',\n in_channels=2048," + }, + { + "comment": "Class BBoxHeadAVA is being initialized with specified parameters including in_channels, num_classes, dropout_ratio, temporal and spatial pool types, topk values for pooling results, and multilabel flag. 
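The two pooling stages collapse every RoI's spatio-temporal feature map to a single vector before the classification layer: average over time, then max over space. A short Paddle sketch with the default pool types (the feature sizes here are arbitrary):

```python
import paddle
import paddle.nn as nn

feat = paddle.randn([4, 2048, 8, 7, 7])                 # rois, channels, t, h, w
temporal_pool = nn.AdaptiveAvgPool3D((1, None, None))   # 'avg' over the time axis
spatial_pool = nn.AdaptiveMaxPool3D((None, 1, 1))       # 'max' over height and width

x = temporal_pool(feat)                  # [4, 2048, 1, 7, 7]
x = spatial_pool(x)                      # [4, 2048, 1, 1, 1]
x = paddle.reshape(x, [x.shape[0], -1])  # [4, 2048], ready for the fc_cls layer
print(x.shape)
```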
The code performs checks on the provided parameters to ensure their validity before assigning them to instance variables.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":32-60", + "content": " num_classes=81,# The first class is reserved, to classify bbox as pos / neg\n dropout_ratio=0,\n dropout_before_pool=True,\n topk=(3, 5),\n multilabel=True):\n super(BBoxHeadAVA, self).__init__()\n assert temporal_pool_type in ['max', 'avg']\n assert spatial_pool_type in ['max', 'avg']\n self.temporal_pool_type = temporal_pool_type\n self.spatial_pool_type = spatial_pool_type\n self.in_channels = in_channels\n self.num_classes = num_classes\n self.dropout_ratio = dropout_ratio\n self.dropout_before_pool = dropout_before_pool\n self.multilabel = multilabel\n if topk is None:\n self.topk = ()\n elif isinstance(topk, int):\n self.topk = (topk, )\n elif isinstance(topk, tuple):\n assert all([isinstance(k, int) for k in topk])\n self.topk = topk\n else:\n raise TypeError('topk should be int or tuple[int], '\n f'but get {type(topk)}')" + }, + { + "comment": "This code initializes the BBoxHead model, which is a part of PaddleVideo. It sets up different layers such as temporal and spatial pooling layers, and dropout layer if needed. The code also specifies the parameters for weights and biases in these layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":61-82", + "content": " # Class 0 is ignored when calculaing multilabel accuracy,\n # so topk cannot be equal to num_classes\n assert all([k < num_classes for k in self.topk])\n assert self.multilabel\n in_channels = self.in_channels\n if self.temporal_pool_type == 'avg':\n self.temporal_pool = nn.AdaptiveAvgPool3D((1, None, None))\n else:\n self.temporal_pool = nn.AdaptiveMaxPool3D((1, None, None))\n if self.spatial_pool_type == 'avg':\n self.spatial_pool = nn.AdaptiveAvgPool3D((None, 1, 1))\n else:\n self.spatial_pool = nn.AdaptiveMaxPool3D((None, 1, 1))\n if dropout_ratio > 0:\n self.dropout = nn.Dropout(dropout_ratio)\n weight_attr = paddle.framework.ParamAttr(name=\"weight\",\n initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.01))\n bias_attr = paddle.ParamAttr(name=\"bias\",\n initializer=paddle.nn.initializer.Constant(value=0.0))" + }, + { + "comment": "This code defines a bbox_head with a linear layer (fc_cls) for classification and initializes debug images. 
It also performs forward pass by computing ROI features, applying dropout if enabled, and pooling the features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":84-105", + "content": " self.fc_cls = nn.Linear(in_channels, num_classes, weight_attr=weight_attr, bias_attr=bias_attr)\n self.debug_imgs = None\n def forward(self, x,rois, rois_num):\n roi = paddle.concat(rois)\n roi_x1 = paddle.index_select(roi, index=paddle.to_tensor(0), axis=1)\n roi_x2 = paddle.index_select(roi, index=paddle.to_tensor(2), axis=1)\n roi_w = roi_x2 - roi_x1\n roi_y1 = paddle.index_select(roi, index=paddle.to_tensor(1), axis=1)\n roi_y2 = paddle.index_select(roi, index=paddle.to_tensor(3), axis=1)\n roi_h = roi_y2 - roi_y1\n roi_area = paddle.multiply(roi_w, roi_h)\n A = roi_area\n A1 = paddle.full(A.shape, 1, dtype='int32')\n A2 = paddle.where(A == 0, paddle.zeros_like(A1), A1)\n AE = paddle.expand(A2, [A.shape[0], x.shape[1]])\n rois_num = paddle.to_tensor(rois_num, dtype='int32')\n if self.dropout_before_pool and self.dropout_ratio > 0 :\n x = self.dropout(x)\n x = self.temporal_pool(x)\n x = self.spatial_pool(x)" + }, + { + "comment": "Code snippet is part of a Bounding Box (BBox) head in PaddleVideo, responsible for generating classification targets and handling dropout before pooling. The code also includes functions to generate bbox targets based on positive and negative proposals, ground truth labels, and a positional weight.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":106-125", + "content": " if not self.dropout_before_pool and self.dropout_ratio > 0 :\n x = self.dropout(x)\n x = paddle.reshape(x, [x.shape[0], -1])\n x = paddle.multiply(x, paddle.cast(AE,\"float32\"))\n cls_score = self.fc_cls(x)\n # We do not predict bbox, so return None\n return cls_score, None\n def get_targets(self, sampling_results, gt_bboxes, gt_labels, pos_weight):\n pos_proposals = [res.pos_bboxes for res in sampling_results]\n neg_proposals = [res.neg_bboxes for res in sampling_results]\n pos_gt_labels = [res.pos_gt_labels for res in sampling_results]\n cls_reg_targets = self.bbox_target(pos_proposals, neg_proposals,\n pos_gt_labels, pos_weight)\n return cls_reg_targets\n def bbox_target(self, pos_bboxes_list, neg_bboxes_list, gt_labels, pos_weight):\n \"\"\"Generate classification targets for bboxes. \"\"\"\n labels, label_weights = [], []\n pos_weight = 1.0 if pos_weight <= 0 else pos_weight" + }, + { + "comment": "This code snippet is part of the PaddleVideo library's bbox_head module. It asserts that three lists have equal lengths and then iterates over each list, counting positive (pos) and negative (neg) bounding boxes. It constructs a label by concatenating ground truth labels with zero-filled negatives. The function returns the generated labels for training. 
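The target construction described above, ground-truth label rows for positive boxes stacked on top of zero rows for negative boxes, can be sketched with NumPy as follows; this is an illustrative approximation of `bbox_target`, not the Paddle implementation.

```python
# Sketch: build per-box classification targets by stacking the ground-truth
# label rows for positive boxes with all-zero rows for negative boxes.
import numpy as np

def build_labels(pos_gt_labels, num_neg):
    num_classes = pos_gt_labels.shape[1]
    neg_labels = np.zeros((num_neg, num_classes), dtype=pos_gt_labels.dtype)
    return np.concatenate([pos_gt_labels, neg_labels], axis=0)

gt = np.array([[0., 1., 0., 1.],    # multi-label targets of 2 positive boxes
               [1., 0., 0., 0.]])
labels = build_labels(gt, num_neg=3)
print(labels.shape)  # (5, 4): 2 positive rows followed by 3 zero rows
```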
The recall_prec function compares prediction vectors to target vectors, creating a correct vector before filling it with 1s or 0s based on their logical AND operation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":127-151", + "content": " assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels)\n length = len(pos_bboxes_list)\n for i in range(length):\n pos_bboxes = pos_bboxes_list[i]\n neg_bboxes = neg_bboxes_list[i]\n gt_label = gt_labels[i]\n num_pos = pos_bboxes.shape[0]\n if neg_bboxes is not None:\n num_neg = neg_bboxes.shape[0]\n else:\n num_neg = 0\n num_samples = num_pos + num_neg\n neg_label = paddle.zeros([num_neg, gt_label.shape[1]])\n label = paddle.concat([gt_label,neg_label])\n labels.append(label)\n labels = paddle.concat(labels, 0)\n return labels\n def recall_prec(self, pred_vec, target_vec):\n correct = paddle.to_tensor(np.logical_and(pred_vec.numpy(), target_vec.numpy()))\n correct = paddle.where(correct, \n paddle.full(correct.shape,1,dtype='int32'),\n paddle.full(correct.shape,0,dtype='int32'))" + }, + { + "comment": "This code calculates recall and precision for multi-label classification tasks. It first computes recall and precision for each sample, then calculates the mean recall and precision across all samples. The function uses threshold values of 0.5 and 1e-6 for target and prediction vectors to ensure numerical stability.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":152-170", + "content": " recall_correct = paddle.cast(paddle.sum(correct, axis=1), 'float32')\n target_vec = paddle.where(target_vec, \n paddle.full(target_vec.shape,1,dtype='int32'),\n paddle.full(target_vec.shape,0,dtype='int32'))\n recall_target = paddle.cast(paddle.sum(target_vec, axis=1),'float32')\n recall = recall_correct / recall_target\n pred_vec = paddle.where(pred_vec, \n paddle.full(pred_vec.shape,1,dtype='int32'),\n paddle.full(pred_vec.shape,0,dtype='int32'))\n prec_target = paddle.cast(paddle.sum(pred_vec, axis=1) + 1e-6, 'float32')\n prec = recall_correct / prec_target\n recall_mean = paddle.mean(recall)\n prec_mean = paddle.mean(prec)\n return recall_mean, prec_mean\n def multilabel_accuracy(self, pred, target, thr=0.5):\n pred = paddle.nn.functional.sigmoid(pred)\n pred_vec = pred > thr\n target_vec = target > 0.5" + }, + { + "comment": "Code creates a bbox_head for object detection. It computes recall and precision given predicted and target vectors, and returns the results. 
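A minimal NumPy approximation of the `recall_prec` logic just described, assuming boolean prediction and target matrices of shape `[num_samples, num_classes]`; the 1e-6 term guards the precision denominator the same way the quoted code does.

```python
# NumPy approximation of the recall/precision computation sketched above:
# "correct" marks positions where prediction and target agree on a positive.
import numpy as np

def recall_prec(pred_vec, target_vec, eps=1e-6):
    correct = np.logical_and(pred_vec, target_vec).astype(np.float32)
    recall = correct.sum(axis=1) / target_vec.sum(axis=1)       # per sample
    prec = correct.sum(axis=1) / (pred_vec.sum(axis=1) + eps)   # per sample
    return recall.mean(), prec.mean()

pred = np.array([[1, 0, 1], [0, 1, 0]], dtype=bool)
target = np.array([[1, 1, 0], [0, 1, 0]], dtype=bool)
print(recall_prec(pred, target))  # approximately (0.75, 0.75)
```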
In loss function, it only considers cls_score if available and computes losses based on pos_inds (positive indices) and labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":171-194", + "content": " recall_thr, prec_thr = self.recall_prec(pred_vec, target_vec)\n recalls, precs = [], []\n for k in self.topk:\n _, pred_label = paddle.topk(pred, k, 1, True, True)\n pred_vec = paddle.full(pred.shape,0,dtype='bool')\n num_sample = pred.shape[0]\n for i in range(num_sample):\n pred_vec[i, pred_label[i].numpy()] = 1 \n recall_k, prec_k = self.recall_prec(pred_vec, target_vec)\n recalls.append(recall_k)\n precs.append(prec_k)\n return recall_thr, prec_thr, recalls, precs\n def loss(self,\n cls_score,\n labels):\n losses = dict()\n if cls_score is not None:\n # Only use the cls_score\n labels = labels[:, 1:]\n pos_inds_bool = paddle.sum(labels, axis=-1) > 0\n pos_inds = paddle.where(paddle.sum(labels, axis=-1) > 0,\n paddle.full([labels.shape[0]],1,dtype='int32'),\n paddle.full([labels.shape[0]],0,dtype='int32'))" + }, + { + "comment": "This code defines two functions: \"get_det_bboxes\" and \"multilabel_accuracy\". The \"get_det_bboxes\" function takes ROIs, cls_score, img_shape, flip, and crop_quadruple as inputs to calculate detection boxes for each bounding box. The \"multilabel_accuracy\" function calculates recall and precision for different thresholds and top-k values from the given cls_score and labels arrays. The code also computes loss using binary cross-entropy with logits and adds it to the losses dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":195-217", + "content": " pos_inds = paddle.nonzero(pos_inds, as_tuple=False)\n cls_score = paddle.index_select(cls_score, pos_inds, axis=0)\n cls_score = cls_score[:, 1:] \n labels = paddle.index_select(labels, pos_inds, axis=0)\n bce_loss = F.binary_cross_entropy_with_logits\n loss = bce_loss(cls_score, labels, reduction='none')\n losses['loss'] = paddle.mean(loss)\n recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy(\n cls_score, labels, thr=0.5)\n losses['recall@thr=0.5'] = recall_thr\n losses['prec@thr=0.5'] = prec_thr\n for i, k in enumerate(self.topk):\n losses[f'recall@top{k}'] = recall_k[i]\n losses[f'prec@top{k}'] = prec_k[i]\n return losses\n def get_det_bboxes(self,\n rois,\n cls_score,\n img_shape,\n flip=False,\n crop_quadruple=None,\n cfg=None):" + }, + { + "comment": "The code checks if cls_score is a list, calculates the mean of its elements if it's a list, asserts that self.multilabel is True, applies sigmoid activation to cls_score, and assigns resulting scores to variable 'scores'. 
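The loss path summarised above, keep only rows with at least one positive label, drop the reserved class-0 column, then apply per-class binary cross-entropy on sigmoid outputs, can be sketched in NumPy; this is an illustration of the idea with made-up inputs, not the Paddle code.

```python
# Sketch of the multilabel loss: select rows with at least one positive label,
# drop the reserved class-0 column, then average sigmoid BCE over the rest.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def multilabel_bce(cls_score, labels):
    labels = labels[:, 1:]                 # class 0 is reserved
    keep = labels.sum(axis=1) > 0          # rows with any positive label
    logits = cls_score[keep][:, 1:]
    targets = labels[keep]
    probs = sigmoid(logits)
    bce = -(targets * np.log(probs) + (1 - targets) * np.log(1 - probs))
    return bce.mean()

scores = np.array([[2.0, 1.5, -1.0], [0.3, -0.2, 0.8]])
labels = np.array([[0., 1., 0.], [0., 0., 0.]])   # second row has no positives
print(multilabel_bce(scores, labels))
```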
It also assigns rois to bboxes and returns both bboxes and scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/bbox_head.py\":218-224", + "content": " if isinstance(cls_score, list):\n cls_score = sum(cls_score) / float(len(cls_score))\n assert self.multilabel\n m = paddle.nn.Sigmoid()\n scores = m(cls_score)\n bboxes = rois\n return bboxes, scores" + } + ] +} \ No newline at end of file diff --git a/docs/doc/79e414f3-ed2d-4816-8fbd-a796aa4b4532.json b/docs/doc/79e414f3-ed2d-4816-8fbd-a796aa4b4532.json new file mode 100644 index 000000000..73405c7dd --- /dev/null +++ b/docs/doc/79e414f3-ed2d-4816-8fbd-a796aa4b4532.json @@ -0,0 +1,15 @@ +{ + "summary": "The script modifies PaddleVideo configuration to use XPU, disables benchmarking, and updates the execution script for Python 3.9 NPU backend. The code logs the execution start after running a bash command with specified parameters.", + "details": [ + { + "comment": "This script changes the configuration file for PaddleVideo to use XPU instead of GPU, disables benchmarking and uses Python 3.9 for NPU backend, and updates the test_train_inference_python.sh execution script to use \"xpu\" instead of \"gpu\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python_xpu.sh\":0-38", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nfunction readlinkf() {\n perl -MCwd -e \"print Cwd::abs_path shift\" \"$1\";\n}\nfunction func_parser_config() {\n strs=$1\n IFS=\" \"\n array=(${strs})\n tmp=${array[2]}\n echo ${tmp}\n}\nBASEDIR=$(dirname \"$0\")\nREPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../)\nFILENAME=$1\n# disable mkldnn on non x86_64 env\narch=$(uname -i)\nif [ $arch != \"x86_64\" ]; then\n sed -i \"s/--enable_mkldnn:True|False/--enable_mkldnn:False/g\" $FILENAME\n sed -i \"s/--enable_mkldnn:True/--enable_mkldnn:False/g\" $FILENAME\nfi\n# change gpu to xpu in tipc txt configs\nsed -i \"s/use_gpu/use_xpu/g\" $FILENAME\n# disable benchmark as AutoLog required nvidia-smi command\nsed -i \"s/--enable_benchmark:True/--enable_benchmark:False/g\" $FILENAME\n# python has been updated to version 3.9 for npu backend\nsed -i \"s/python3.7/python3.9/g\" $FILENAME\ndataline=`cat $FILENAME`\n# change gpu to xpu in execution script\nsed -i \"s/\\\"gpu\\\"/\\\"xpu\\\"/g\" test_tipc/test_train_inference_python.sh\n# pass parameters to test_train_inference_python.sh" + }, + { + "comment": "This code executes a bash command with specified parameters, logging the start of execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python_xpu.sh\":39-41", + "content": "cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $2\"\necho -e \"\\033[1;32m Started to run command: ${cmd}! \\033[0m\"\neval $cmd" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7a591640-3392-49a3-8249-3d5b8f8f9bd3.json b/docs/doc/7a591640-3392-49a3-8249-3d5b8f8f9bd3.json new file mode 100644 index 000000000..a5c74cbb8 --- /dev/null +++ b/docs/doc/7a591640-3392-49a3-8249-3d5b8f8f9bd3.json @@ -0,0 +1,35 @@ +{ + "summary": "PaddleSlim is a library for model compression in PaddleVideo, offering quantization, pruning, distillation, and search for enhanced inference performance and reduced computational complexity. 
It can be installed via pip install and demonstrates PP-TSM offline quantization with deployment options in Python and C++ using PaddleLite's opt tool.", + "details": [ + { + "comment": "This code introduces PaddleSlim, a model compression library for compressing PaddleVideo models. It includes functions for model quantization (reducing full precision to fixed-point numbers) and model pruning (cutting unimportant convolution kernels). This improves inference performance and reduces computational complexity while preserving accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme_en.md\":0-8", + "content": "## Slim function introduction\nA complex model is beneficial to improve the performance of the model, but it also leads to some redundancy in the model. This part provides the function of reducing the model, including two parts: model quantization (quantization training, offline quantization), model pruning.\nAmong them, model quantization reduces the full precision to fixed-point numbers to reduce this redundancy, so as to reduce the computational complexity of the model and improve the inference performance of the model.\nModel quantization can convert FP32-precision model parameters to Int8-precision without losing the accuracy of the model, reducing the size of model parameters and speeding up the calculation. Using the quantized model has a speed advantage when deploying on mobile terminals.\nModel pruning cuts out the unimportant convolution kernels in the CNN, reduces the amount of model parameters, and thus reduces the computational complexity of the model.\nThis tutorial will introduce how to use PaddleSlim, a paddle model compression library, to compress PaddleVideo models." + }, + { + "comment": "This code snippet provides a brief introduction to PaddleSlim, which offers model pruning, quantization, distillation, and neural network search for model compression. It highlights the quick start process, explaining that after training a model, quantization or pruning can be used to further compress it while speeding up predictions. The code then provides instructions on how to install PaddleSlim via pip install.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme_en.md\":9-29", + "content": "[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) integrates model pruning, quantization (including quantization training and offline quantization), distillation and neural network search and other commonly used and leading model compression functions in the industry. If you are interested, you can follow and understand.\nBefore starting this tutorial, it is recommended to understand [PaddleVideo model training method](../../docs/zh-CN/usage.md) and [PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/ latest/index.html)\n## quick start\nAfter training a model, if you want to further compress the model size and speed up prediction, you can use quantization or pruning to compress the model.\nModel compression mainly includes five steps:\n1. Install PaddleSlim\n2. Prepare the trained model\n3. Model Compression\n4. Export the quantitative inference model\n5. Quantitative Model Prediction Deployment\n### 1. 
Install PaddleSlim\n* It can be installed by pip install.\n```bash\npython3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple" + }, + { + "comment": "In this code, it explains how to install the latest features of PaddleSlim, prepare a trained model for quantization (either using provided models or regular training), and perform model compression including offline quantization. The offline quantization process requires pre-training model loading and defining the quantization strategy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme_en.md\":30-63", + "content": "```\n* If you get the latest features of PaddleSlim, you can install it from source.\n```bash\ngit clone https://github.com/PaddlePaddle/PaddleSlim.git\ncd Paddleslim\npython3.7 setup.py install\n```\n### 2. Prepare the trained model\nPaddleVideo provides a series of trained [models](../../docs/zh-CN/model_zoo/README.md). If the model to be quantized is not in the list, you need to follow the [regular training](../ ../docs/zh-CN/usage.md) method to get the trained model.\n### 3. Model Compression\nGo to PaddleVideo root directory\n```bash\ncd PaddleVideo\n```\nThe offline quantization code is located in `deploy/slim/quant_post_static.py`.\n#### 3.1 Model Quantization\nQuantization training includes offline quantization training and online quantization training (TODO). The effect of online quantization training is better. The pre-training model needs to be loaded, and the model can be quantized after the quantization strategy is defined.\n##### 3.1.1 Online quantitative training\nTODO\n##### 3.1.2 Offline Quantization\n**Note" + }, + { + "comment": "The code explains the process of offline quantization for a trained model using the PaddleVideo framework. The user must first export an inference model from the trained model and download calibration data before executing the quantization script with specific parameters. The configuration file, `pptsm_k400_frames_uniform_quantization.yaml`, contains all quantization environment parameters except for `use_gpu`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme_en.md\":63-86", + "content": "**: For offline quantization, you must use the `inference model` exported from the trained model for quantization. For general model export `inference model`, please refer to [Tutorial](../../docs/zh-CN/usage.md#5-Model Inference).\nGenerally speaking, the offline quantization loss model has more accuracy.\nTaking the PP-TSM model as an example, after generating the `inference model`, the offline quantization operation is as follows\n```bash\n# download a small amount of data for calibration\npushd ./data/k400\nwget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\ntar -xf k400_rawframes_small.tar\npopd\n# then switch to deploy/slim\ncd deploy/slim\n# execute quantization script\npython3.7 quant_post_static.py \\\n-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml \\\n--use_gpu=True\n```\nAll quantization environment parameters except `use_gpu` are configured in `pptsm_k400_frames_uniform_quantization.yaml` file\nWhere `inference_model_dir` represents the directory path of the " + }, + { + "comment": "This code demonstrates how to use the PP-TSM offline quantization model for prediction. After exporting the inference model, the __model__ and __params__ files are generated in the specified output directory (quant_output_dir). 
These files can be used directly for prediction deployment without re-exporting the model. The provided example uses Python's tools/predict.py script to make predictions on a video file (data/example.avi), using the specified configuration (configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml). The results include the top-1 class and score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme_en.md\":86-110", + "content": "`inference model` exported in the previous step, and `quant_output_dir` represents the output directory path of the quantization model\nAfter successful execution, the `__model__` file and the `__params__` file are generated in the `quant_output_dir` directory, which are used to store the generated offline quantization model\nSimilar to the usage of `inference model`, you can directly use these two files for prediction deployment without re-exporting the model.\n```bash\n# Use PP-TSM offline quantization model for prediction\n# Go back to the PaddleVideo directory\ncd ../../\n# Use the quantized model to make predictions\npython3.7 tools/predict.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n--model_file ./inference/ppTSM/quant_model/__model__ \\\n--params_file ./inference/ppTSM/quant_model/__params__ \\\n--use_gpu=True \\\n--use_tensorrt=False\n```\nThe output is as follows:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9997928738594055" + }, + { + "comment": "This code provides an overview of model pruning, exporting the model, and deployment. It mentions using PaddleLite's opt model conversion tool for deployment and refers to two serving deployments: Python and C++. For quantitative training, it suggests loading pre-trained models, adjusting learning rates, and modifying the number of epochs while maintaining other configuration settings unchanged.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme_en.md\":111-131", + "content": "```\n#### 3.2 Model pruning\nTODO\n### 4. Export the model\nTODO\n### 5. Model Deployment\nThe model exported in the above steps can be converted through the opt model conversion tool of PaddleLite.\nReference for model deployment\n[Serving Python Deployment](../python_serving/readme.md)\n[Serving C++ Deployment](../cpp_serving/readme.md)\n## Training hyperparameter suggestions\n* During quantitative training, it is recommended to load the pre-trained model obtained from regular training to accelerate the convergence of quantitative training.\n* During quantitative training, it is recommended to modify the initial learning rate to `1/20~1/10` of conventional training, and modify the number of training epochs to `1/5~1/2` of conventional training. In terms of learning rate strategy, add On Warmup, other configuration information is not recommended to be modified." 
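As a quick worked example of the hyperparameter advice above, assuming a hypothetical regular-training baseline of lr=0.01 and 80 epochs (these baseline numbers are not from the document):

```python
# Illustrative arithmetic for the suggested quantization-aware fine-tuning
# schedule: lr scaled to 1/20~1/10 and epochs to 1/5~1/2 of regular training.
base_lr, base_epochs = 0.01, 80                      # hypothetical baseline
lr_range = (base_lr / 20, base_lr / 10)              # (0.0005, 0.001)
epoch_range = (base_epochs // 5, base_epochs // 2)   # (16, 40)
print(lr_range, epoch_range)
```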
+ } + ] +} \ No newline at end of file diff --git a/docs/doc/7a73be9d-7cbf-408a-a600-5a503347f8d8.json b/docs/doc/7a73be9d-7cbf-408a-a600-5a503347f8d8.json new file mode 100644 index 000000000..f39a271ef --- /dev/null +++ b/docs/doc/7a73be9d-7cbf-408a-a600-5a503347f8d8.json @@ -0,0 +1,15 @@ +{ + "summary": "The \"registry.py\" file in PaddleVideo's EIVideo application defines Registry classes for components of the video processing pipeline, and organizes them into four registries for different functionalities: bbox_coder, estimator, multimodal, and segment.", + "details": [ + { + "comment": "The code snippet is from the \"registry.py\" file in PaddleVideo's EIVideo application. It defines several Registry classes, each representing a different component of the video processing pipeline: BACKBONES, HEADS, RECOGNIZERS, LOCALIZERS, PARTITIONERS, SEGMENT, LOSSES, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, and BBOX_SAMPLERS. These Registry classes will be used to register and manage different instances of these components in the video processing pipeline.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nBACKBONES = Registry('backbone')\nHEADS = Registry('head')\nRECOGNIZERS = Registry('recognizer')\nLOCALIZERS = Registry('localizer')\nPARTITIONERS = Registry('partitioner')\nSEGMENT = Registry('segmentation')\nLOSSES = Registry('loss')\nROI_EXTRACTORS = Registry('roi_extractor')\nDETECTORS = Registry('detectors')\nBBOX_ASSIGNERS = Registry('bbox_assigner')\nBBOX_SAMPLERS = Registry('bbox_sampler')" + }, + { + "comment": "This code defines four registries for different functionalities: bbox_coder, estimator, multimodal, and segment. These registries will be used to organize and manage different types of models or algorithms in the codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py\":27-30", + "content": "BBOX_CODERS = Registry('bbox_coder')\nESTIMATORS = Registry('estimator')\nMULTIMODAL = Registry('multimodal')\nSEGMENT = Registry('segment')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7aa3a469-2617-466a-aa43-c2a519e30fdd.json b/docs/doc/7aa3a469-2617-466a-aa43-c2a519e30fdd.json new file mode 100644 index 000000000..6b20f374c --- /dev/null +++ b/docs/doc/7aa3a469-2617-466a-aa43-c2a519e30fdd.json @@ -0,0 +1,25 @@ +{ + "summary": "The code defines a RecognizerTransformer class for implementing a transformer-based recognizer model, which includes feature extraction, training, validation, and testing steps. 
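A compact, self-contained sketch of how such registries are typically used: classes register themselves under a registry via a decorator and are later looked up by name when building from a config. The `Registry` class below is a simplified stand-in for illustration, not PaddleVideo's implementation.

```python
# Simplified stand-in for the Registry pattern described above: classes
# register themselves under a string name and can be looked up from config.
class Registry:
    def __init__(self, name):
        self.name = name
        self._module_dict = {}

    def register(self):
        def _register(cls):
            self._module_dict[cls.__name__] = cls
            return cls
        return _register

    def get(self, name):
        return self._module_dict[name]


BACKBONES = Registry('backbone')

@BACKBONES.register()
class ToyBackbone:
    def __init__(self, depth=50):
        self.depth = depth


cfg = {'name': 'ToyBackbone', 'depth': 18}
backbone = BACKBONES.get(cfg.pop('name'))(**cfg)
print(type(backbone).__name__, backbone.depth)  # ToyBackbone 18
```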
It also defines a model for inferring image results from multiple views using forward_net function and averaging based on 'avg_type'.", + "details": [ + { + "comment": "This code defines a RecognizerTransformer class that inherits from BaseRecognizer and implements a transformer-based recognizer model framework. It takes in an input tensor imgs of shape [N,C,T,H,W] where N is the batch size, C is the number of channels, T is the temporal length, H is the height, and W is the width. If a backbone is specified, it applies the backbone to the images for feature extraction; otherwise, it uses the input images directly. The resulting feature tensor is returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport paddle.nn.functional as F\nfrom paddlevideo.utils import get_logger\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerTransformer(BaseRecognizer):\n \"\"\"Transformer's recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # imgs.shape=[N,C,T,H,W], for transformer case\n if self.backbone is not None:\n feature = self.backbone(imgs)\n else:\n feature = imgs" + }, + { + "comment": "The code defines a model's training, validation, and testing steps. The train_step calculates the loss between predicted class scores and actual labels. The val_step is similar but marks some samples as valid in validation mode. The test_step infers by processing views of images and stores class scores in a list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py\":32-61", + "content": " if self.head is not None:\n cls_score = self.head(feature)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) *" + }, + { + "comment": "This code defines a model for inferring the results from multiple views of images. The `forward_net` function is used to process each view, and then the results are averaged using the `_average_view` function based on the specified average type. 
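The `avg_type` behaviour mentioned in this summary, averaging raw scores versus averaging softmax probabilities across views, can be approximated in NumPy as follows; a sketch of the idea, not the Paddle implementation.

```python
# Sketch: combine per-view class scores either by averaging raw scores
# ('score') or by averaging softmax probabilities ('prob').
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def average_views(view_scores, avg_type='score'):
    if avg_type == 'score':
        return sum(view_scores) / len(view_scores)
    elif avg_type == 'prob':
        return sum(softmax(s) for s in view_scores) / len(view_scores)
    raise NotImplementedError(avg_type)

views = [np.array([[1.0, 2.0, 0.5]]), np.array([[0.8, 1.5, 0.9]])]
print(average_views(views, 'score'))
print(average_views(views, 'prob'))
```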
This allows the model to make predictions from different perspectives of an image and combine them for a more accurate result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py\":62-85", + "content": " self.runtime_cfg.test.num_seg]\n cls_score.append(self.forward_net(view))\n cls_score = self._average_view(cls_score,\n self.runtime_cfg.test.avg_type)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n imgs = data_batch[0]\n num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg\n cls_score = []\n for i in range(num_views):\n view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) *\n self.runtime_cfg.test.num_seg]\n cls_score.append(self.forward_net(view))\n cls_score = self._average_view(cls_score,\n self.runtime_cfg.test.avg_type)\n return cls_score\n def _average_view(self, cls_score, avg_type='score'):\n \"\"\"Combine the predicted results of different views\n Args:\n cls_score (list): results of multiple views" + }, + { + "comment": "This code defines a class method with an optional 'avg_type' parameter for average calculation. It checks if the input is either 'score' or 'prob'. If 'score', it returns the sum of 'cls_score' divided by its length. If 'prob', it applies softmax to each element in 'cls_score', then averages their sum and length. Otherwise, it raises a NotImplementedError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py\":86-97", + "content": " avg_type (str, optional): Average calculation method. Defaults to 'score'.\n \"\"\"\n assert avg_type in ['score', 'prob'], \\\n f\"Currently only the average of 'score' or 'prob' is supported, but got {avg_type}\"\n if avg_type == 'score':\n return paddle.add_n(cls_score) / len(cls_score)\n elif avg_type == 'prob':\n return paddle.add_n(\n [F.softmax(score, axis=-1)\n for score in cls_score]) / len(cls_score)\n else:\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7b570af5-cab4-4c09-9a63-3dacb62b23c9.json b/docs/doc/7b570af5-cab4-4c09-9a63-3dacb62b23c9.json new file mode 100644 index 000000000..f2bc1247e --- /dev/null +++ b/docs/doc/7b570af5-cab4-4c09-9a63-3dacb62b23c9.json @@ -0,0 +1,15 @@ +{ + "summary": "The code imports PaddleVideo library functions for data augmentation, composition, decoding, and sampling in video analysis tasks, while using a list of pipeline modules to perform operations like mixing, cropping, and scaling.", + "details": [ + { + "comment": "This code imports various functions and classes from different modules in the PaddleVideo library, which are used for data augmentation, composition, decoding, and sampling in video analysis tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py\":0-39", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .augmentations import (\n Scale,\n RandomCrop,\n CenterCrop,\n RandomFlip,\n Image2Array,\n Normalization,\n JitterScale,\n MultiCrop,\n PackOutput,\n)\nfrom .compose import Compose\nfrom .decode import VideoDecoder, FrameDecoder\nfrom .sample import Sampler\nfrom .mix import Mixup, Cutmix\n__all__ = [\n 'Scale',\n 'RandomCrop',\n 'CenterCrop',\n 'RandomFlip',\n 'Image2Array',\n 'Normalization'," + }, + { + "comment": "The code above is a list of pipeline modules used in the PaddleVideo framework for video processing tasks. These modules perform various operations such as data augmentation, mixing, cropping, and scaling before feeding into the model for training or evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py\":40-49", + "content": " 'Compose',\n 'VideoDecoder',\n 'FrameDecoder',\n 'Sampler',\n 'Mixup',\n 'Cutmix',\n 'JitterScale',\n 'MultiCrop',\n 'PackOutput',\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7bb95d6a-909e-4bcf-b406-a0b4122a932d.json b/docs/doc/7bb95d6a-909e-4bcf-b406-a0b4122a932d.json new file mode 100644 index 000000000..d520d9cf4 --- /dev/null +++ b/docs/doc/7bb95d6a-909e-4bcf-b406-a0b4122a932d.json @@ -0,0 +1,30 @@ +{ + "summary": "The code introduces a quantization function in PaddleVideo for GPU utilization and performs post-training quantization in static graph mode, writing the quantized model for execution on specified placement. It checks if executed directly, parses command-line arguments, and calls appropriate functions based on GPU usage flag.", + "details": [ + { + "comment": "This code is likely part of a larger program and it begins by defining the licensing information, then imports necessary libraries for the function. It also includes the path to other related files and defines a function parse_args(). This suggests that the function will be used later to parse command line arguments or configuration file data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/quant_post_static.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport os\nimport os.path as osp\nimport sys\nimport numpy as np\nimport paddle\nfrom paddleslim.quant import quant_post_static\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\nfrom paddlevideo.loader.builder import build_dataloader, build_dataset\nfrom paddlevideo.utils import get_config, get_logger\ndef parse_args():\n def str2bool(v):" + }, + { + "comment": "This code defines a function for post-training quantization in PaddleVideo. It includes an argument parser to specify the configuration file path and optionally override config options. The function also takes a boolean parameter for whether to use GPU during quantization, and logs messages using get_logger(\"paddlevideo\").", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/quant_post_static.py\":32-62", + "content": " return v.lower() in (\"true\", \"t\", \"1\")\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")\n parser.add_argument(\n '-c',\n '--config',\n type=str,\n default=\n '../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml',\n help='quantization config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument(\"--use_gpu\",\n type=str2bool,\n default=True,\n help=\"whether use gpui during quantization\")\n return parser.parse_args()\ndef post_training_quantization(cfg, use_gpu: bool = True):\n \"\"\"Quantization entry\n Args:\n cfg (dict): quntization configuration.\n use_gpu (bool, optional): whether to use gpu during quantization. 
Defaults to True.\n \"\"\"\n logger = get_logger(\"paddlevideo\")" + }, + { + "comment": "This code configures the placement (CPU or GPU) based on use_gpu flag, retrieves defined parameters from cfg, builds a dataloader for quantization with specified dataset and settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/quant_post_static.py\":64-83", + "content": " place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()\n # get defined params\n batch_nums = cfg.DATASET.pop('batch_nums')\n batch_size = cfg.DATASET.get('batch_size', 1)\n num_workers = cfg.DATASET.get('num_workers', 0)\n inference_file_name = cfg.get('model_name', 'inference')\n inference_model_dir = cfg.get('inference_model_dir',\n f'./inference/{inference_file_name}')\n quant_output_dir = cfg.get('quant_output_dir',\n osp.join(inference_model_dir, 'quant_model'))\n # build dataloader for quantization, lite data is enough\n slim_dataset = build_dataset((cfg.DATASET.quant, cfg.PIPELINE.quant))\n slim_dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n places=place,\n drop_last=False,\n shuffle=False)\n slim_loader = build_dataloader(slim_dataset, **slim_dataloader_setting)" + }, + { + "comment": "This code performs post-training quantization for a model, enabling static graph mode in PaddlePaddle and using the specified sample generator for data processing. It also utilizes a specific algorithm (KL) for quantization and writes the quantized model to disk. The execution is done with an executor on the given place.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/quant_post_static.py\":85-113", + "content": " logger.info(\"Build slim_loader finished\")\n def sample_generator(loader):\n def __reader__():\n for indx, data in enumerate(loader):\n # must return np.ndarray, not paddle.Tensor\n videos = np.array(data[0])\n yield videos\n return __reader__\n # execute quantization in static graph mode\n paddle.enable_static()\n exe = paddle.static.Executor(place)\n logger.info(\"Staring Post-Training Quantization...\")\n quant_post_static(executor=exe,\n model_dir=inference_model_dir,\n quantize_model_path=quant_output_dir,\n sample_generator=sample_generator(slim_loader),\n model_filename=f'{inference_file_name}.pdmodel',\n params_filename=f'{inference_file_name}.pdiparams',\n batch_size=batch_size,\n batch_nums=batch_nums,\n algo='KL')\n logger.info(\"Post-Training Quantization finished...\")" + }, + { + "comment": "The code checks if the script is being executed directly, then parses command-line arguments and gets a configuration file. It then calls a function for post-training quantization based on GPU usage flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/quant_post_static.py\":116-119", + "content": "if __name__ == '__main__':\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n post_training_quantization(cfg, args.use_gpu)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7bda9261-294f-4912-b1ae-397c5520e9e2.json b/docs/doc/7bda9261-294f-4912-b1ae-397c5520e9e2.json new file mode 100644 index 000000000..6c99ecc90 --- /dev/null +++ b/docs/doc/7bda9261-294f-4912-b1ae-397c5520e9e2.json @@ -0,0 +1,15 @@ +{ + "summary": "The given code implements the Softmax function in-place, calculating exponential elements and normalizing them for PaddleVideo library tasks. 
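A NumPy rendering of the numerically stable softmax described here (shift by the maximum before exponentiating, then normalise by the sum), mirroring the C++ `Softmax::Run` for illustration only:

```python
# Numerically stable softmax, mirroring the C++ post-processing op described
# above: subtract the max before exponentiating, then divide by the sum.
import numpy as np

def softmax(logits):
    logits = np.asarray(logits, dtype=np.float32)
    shifted = logits - logits.max()
    exp = np.exp(shifted)
    return exp / exp.sum()

probs = softmax([2.0, 1.0, 0.1])
print(probs, probs.sum())  # probabilities that sum to 1.0
```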
The class defines a Run method that performs softmax normalization on vector elements by iteratively computing exponential values and accumulating them for normalization.", + "details": [ + { + "comment": "This code is implementing the Softmax function in-place, calculating the exponential of elements and normalizing them by summing up all the elements. This is part of the PaddleVideo library for video analysis tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/postprocess_op.cpp\":0-25", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \nnamespace PaddleVideo\n{\n void Softmax::Inplace_Run(const std::vector::iterator &_begin, const std::vector::iterator &_end)\n {\n const float max_value = *std::max_element(_begin, _end);\n float denominator = 0.0f;\n for (auto it = _begin; it != _end; ++it)\n {\n *it = std::exp((*it) - max_value);\n denominator += (*it);" + }, + { + "comment": "This code defines a Softmax class with a Run method that performs softmax normalization on a given range of vector elements. It first calculates the maximum value in the range, then iteratively computes the exponential of each element minus the maximum value and accumulates them into a denominator for normalization. Finally, it returns the normalized probability vector.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/postprocess_op.cpp\":26-49", + "content": " }\n for (auto it = _begin; it != _end; ++it)\n {\n *it /= denominator;\n }\n }\n std::vector Softmax::Run(const std::vector::iterator &_begin, const std::vector::iterator &_end)\n {\n std::vector prob(_begin, _end);\n const float max_value = *std::max_element(prob.begin(), prob.end());\n float denominator = 0.0f;\n for (auto it = _begin, it_p = prob.begin(); it != _end; ++it, ++it_p)\n {\n (*it_p) = std::exp((*it) - max_value);\n denominator += (*it_p);\n }\n for (auto it = prob.begin(); it != prob.end(); ++it)\n {\n (*it) /= denominator;\n }\n return prob;\n }\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7c2a72a9-c669-4627-9673-d8f974b475f5.json b/docs/doc/7c2a72a9-c669-4627-9673-d8f974b475f5.json new file mode 100644 index 000000000..84b92e59d --- /dev/null +++ b/docs/doc/7c2a72a9-c669-4627-9673-d8f974b475f5.json @@ -0,0 +1,40 @@ +{ + "summary": "This code uses the slowfast model for object detection, initializes components and parameters, supports pretrained weights, and provides methods for training, testing, and inference. It also retrieves data from PaddleVideo's two-stage detector with various inputs and entity ID selection using index_select.", + "details": [ + { + "comment": "This code is importing necessary libraries, registering a two-stage detector class (TwoStageDetector) within the DETECTORS registry, and initializing its components. 
The class serves as a base for implementing two-stage object detection algorithms.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py\":0-31", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom ... import builder\nimport paddle.distributed as dist\nfrom ...registry import DETECTORS\nfrom .base import BaseDetector\n@DETECTORS.register()\nclass TwoStageDetector(BaseDetector):\n \"\"\"Base class for two-stage detectors. \"\"\"\n def __init__(self,\n backbone,\n neck=None,\n rpn_head=None,\n roi_head=None,\n train_cfg=None," + }, + { + "comment": "This code defines a class for a two-stage object detection model. It initializes the backbone, neck (if provided), and heads for RPN and ROI. The constructor also takes optional train_cfg and test_cfg parameters for each head. Additional pretrained weights can be loaded later if provided. The @property methods check whether the detector has RPN or ROI head.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py\":32-63", + "content": " test_cfg=None,\n pretrained=None):\n super(TwoStageDetector, self).__init__()\n self.backbone = builder.build_backbone(backbone)\n if neck is not None:\n self.neck = neck # useless\n if rpn_head is not None:\n rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None\n rpn_head_ = rpn_head.copy()\n rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)\n self.rpn_head = builder.build_head(rpn_head_)\n if roi_head is not None:\n self.roi_head = builder.build_head(roi_head)\n self.train_cfg = train_cfg\n self.test_cfg = test_cfg\n if pretrained is not None:\n self.init_weights(pretrained=pretrained)\n @property\n def with_rpn(self):\n \"\"\"whether the detector has RPN\"\"\"\n return hasattr(self, 'rpn_head') and self.rpn_head is not None\n @property\n def with_roi_head(self):\n \"\"\"whether the detector has a RoI head\"\"\"\n return hasattr(self, 'roi_head') and self.roi_head is not None" + }, + { + "comment": "The code initializes the weights of a two-stage detector and extracts features from its backbone. The train_step function takes input data, extracts features using the extract_feat method, and computes roi_losses using the roi_head's train_step method. These losses are then stored in the losses dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py\":65-90", + "content": " def init_weights(self, pretrained=None):\n \"\"\"Initialize the weights in detector. 
\"\"\"\n super(TwoStageDetector, self).init_weights(pretrained)\n self.backbone.init_weights(pretrained=pretrained)\n if self.with_rpn:\n self.rpn_head.init_weights()\n if self.with_roi_head:\n self.roi_head.init_weights(pretrained)\n def extract_feat(self, img):\n \"\"\"Directly extract features from the backbone.\"\"\"\n x = self.backbone(img)\n return x\n def train_step(self, data, **kwargs):\n img_slow = data[0]\n img_fast = data[1]\n proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas(\n data)\n img_shape = data[7]\n img_idx = data[8]\n img_metas = scores, entity_ids\n x = self.extract_feat(img=[img_slow, img_fast])\n roi_losses = self.roi_head.train_step(x, img_metas, proposals,\n gt_bboxes, gt_labels, **kwargs)\n losses = dict()" + }, + { + "comment": "This code defines three methods, val_step, test_step, and infer_step. All these methods extract features using the slowfast model and then pass them to roi_head for further processing. Val_step is used for validation while test_step is used for testing. Infer_step performs inference using previously obtained data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py\":91-123", + "content": " losses.update(roi_losses)\n return losses\n def val_step(self, data, rescale=False):\n img_slow = data[0]\n img_fast = data[1]\n proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas(\n data)\n img_shape = data[7]\n img_metas = scores, entity_ids\n x = self.extract_feat(img=[img_slow, img_fast])\n return self.roi_head.simple_test(x,\n proposals[0],\n img_shape,\n rescale=rescale)\n def test_step(self, data, rescale=False):\n return self.val_step(data, rescale)\n def infer_step(self, data, rescale=False):\n ''' model inference'''\n img_slow = data[0]\n img_fast = data[1]\n proposals = data[2]\n img_shape = data[3]\n # using slowfast model to extract spatio-temporal features\n x = self.extract_feat(img=[img_slow, img_fast])\n ret = self.roi_head.simple_test(x," + }, + { + "comment": "This code snippet is part of the PaddleVideo library's two-stage detector implementation. It defines a function that retrieves original data from padded dataset, and another function for getting unpadded datas. The first function takes in a set of proposals, ground truth bboxes, labels, scores, and entity ids, and returns them as unpadded data based on the number of proposals at each index. 
The second function retrieves original datas padded in dataset for two-stage detector implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py\":124-151", + "content": " proposals[0],\n img_shape,\n rescale=rescale)\n return ret\n def get_unpad_datas(self, data):\n ''' get original datas padded in dataset '''\n pad_proposals = data[2]\n pad_gt_bboxes = data[3]\n pad_gt_labels = data[4]\n pad_scores, pad_entity_ids = data[5], data[6]\n len_proposals = data[9]\n len_gt_bboxes = data[10]\n len_gt_labels = data[11]\n len_scores = data[12]\n len_entity_ids = data[13]\n N = pad_proposals.shape[0]\n proposals = []\n gt_bboxes = []\n gt_labels = []\n scores = []\n entity_ids = []\n for bi in range(N):\n pad_proposal = pad_proposals[bi]\n len_proposal = len_proposals[bi]\n index_proposal = paddle.arange(len_proposal)\n proposal = paddle.index_select(x=pad_proposal,\n index=index_proposal," + }, + { + "comment": "This code creates a list of proposals, and corresponding ground truth bounding boxes (gt_bboxes), labels (gt_labels), and scores. It handles batches by iterating over each batch index (bi) and for each batch, it performs index selection on the padded data based on the indices of the current length of the batch to extract the relevant gt_bbox, gt_label, and score information. These are then appended to their respective lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py\":152-175", + "content": " axis=0)\n proposals.append(proposal)\n pad_gt_bbox = pad_gt_bboxes[bi]\n len_gt_bbox = len_gt_bboxes[bi]\n index_gt_bbox = paddle.arange(len_gt_bbox)\n gt_bbox = paddle.index_select(x=pad_gt_bbox,\n index=index_gt_bbox,\n axis=0)\n gt_bboxes.append(gt_bbox)\n pad_gt_label = pad_gt_labels[bi]\n len_gt_label = len_gt_labels[bi]\n index_gt_label = paddle.arange(len_gt_label)\n gt_label = paddle.index_select(x=pad_gt_label,\n index=index_gt_label,\n axis=0)\n gt_labels.append(gt_label)\n pad_score = pad_scores[bi]\n len_score = len_scores[bi]\n index_score = paddle.arange(len_score)\n score = paddle.index_select(x=pad_score, index=index_score, axis=0)\n scores.append(score)" + }, + { + "comment": "This code segment is selecting specific entity IDs from a list and appending them to the 'entity_ids' list. It uses Paddle's index_select function to achieve this.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py\":177-185", + "content": " pad_entity_id = pad_entity_ids[bi]\n len_entity_id = len_entity_ids[bi]\n index_entity_id = paddle.arange(len_entity_id)\n entity_id = paddle.index_select(x=pad_entity_id,\n index=index_entity_id,\n axis=0)\n entity_ids.append(entity_id)\n return proposals, gt_bboxes, gt_labels, scores, entity_ids" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7ca67506-e8c9-4595-873a-57dc7c294081.json b/docs/doc/7ca67506-e8c9-4595-873a-57dc7c294081.json new file mode 100644 index 000000000..7f0157717 --- /dev/null +++ b/docs/doc/7ca67506-e8c9-4595-873a-57dc7c294081.json @@ -0,0 +1,25 @@ +{ + "summary": "The code provides instructions for training and testing ST-GCN, a skeleton-based action recognition model, on FSD and NTU-RGB+D datasets, with accuracy results given. 
It exports the model's architecture and parameters using `export_model.py` and allows inference with optional GPU usage via `predict.py`.", + "details": [ + { + "comment": "This code is a documentation for ST-GCN, a skeleton-based action recognition model. It explains the model's introduction, data requirements (FSD and NTU-RGBD), training instructions on both datasets, and how to perform inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/stgcn.md\":0-48", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/stgcn.md) | English\n# ST-GCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nST-GCN is skeleton-based action recognition model proposed in AAAI 2018.\n
\n## Data\nPlease refer to FSD data download and preparation doc [FSD](../../dataset/fsd.md)\nPlease refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)\n## Train\n### Train on FSD\n- Train ST-GCN on FSD scripts:\n```bash\npython3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml\n```\n- Turn off `valid` when training, as validation dataset is not available for the competition.\n### Train on NTU-RGBD\n- Train ST-GCN on NTU-RGBD scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_stgcn main.py --validate -c configs/recognition/stgcn/stgcn_ntucs.yaml" + }, + { + "comment": "This code provides instructions for testing the ST-GCN model on two datasets: FSD and NTU-RGB+D. The user is directed to run specific test scripts with provided command lines, specifying the configuration file and weight path. Results are saved in a submission.csv file and the final scores can be obtained from the competition website. Accuracy results for both datasets are also included.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/stgcn.md\":49-88", + "content": "```\n- config file `stgcn_ntucs.yaml` corresponding to the config of ST-GCN on NTU-RGB+D dataset with cross-subject splits.\n## Test\n### Test on FSD\n- Test scripts\uff1a\n```bash\npython3.7 main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -w output/STGCN/STGCN_epoch_00090.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\n- Evaluation results will be saved in `submission.csv` file, final score can be obtained in [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115).\nAccuracy on FSD-10 dataset:\nTest_Data| Top-1 | checkpoints |\n| :----: | :----: | :---- |\n| Test_A | 59.07 | [STGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) |\n### Test on NTU-RGB+D\n- Test scripts\uff1a\n```bash\npython3.7 main.py --test -c configs/recognition/stgcn/stgcn_ntucs.yaml -w output/STGCN/STGCN_best.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| split | Top-1 | checkpoints |" + }, + { + "comment": "This code provides the commands to export the model architecture and parameters for a STGCN model, as well as how to use the model to make inferences. The `export_model.py` script is used to generate the `STGCN.pdmodel` and `STGCN.pdiparams` files. 
The `predict.py` script is then used for making predictions using the exported model with optional GPU usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/stgcn.md\":89-114", + "content": "| :----: | :----: | :---- |\n| cross-subject | 82.28 | [STGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_ntucs.pdparams) |\n## Inference\n### export inference model\n To get model architecture file `STGCN.pdmodel` and parameters file `STGCN.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml \\\n -p data/STGCN_fsd.pdparams \\\n -o inference/STGCN\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \\\n --config configs/recognition/stgcn/stgcn_fsd.yaml \\\n --model_file inference/STGCN/STGCN.pdmodel \\\n --params_file inference/STGCN/STGCN.pdiparams \\\n --use_gpu=True \\" + }, + { + "comment": "False", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/stgcn.md\":115-128", + "content": " --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/fsd10/example_skeleton.npy\n top-1 class: 27\n top-1 score: 0.9912770986557007\n```\n## Reference\n- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7ca86e1e-5661-4976-bb34-3680e0ab54f3.json b/docs/doc/7ca86e1e-5661-4976-bb34-3680e0ab54f3.json new file mode 100644 index 000000000..e6881c3ea --- /dev/null +++ b/docs/doc/7ca86e1e-5661-4976-bb34-3680e0ab54f3.json @@ -0,0 +1,20 @@ +{ + "summary": "The code sets up ActBERT dataset in PaddlePaddle's video processing library, initializing the dataset with necessary libraries and packages. It defines two methods: \"prepare_train\" for preparing frames for training and a placeholder \"prepare_test\".", + "details": [ + { + "comment": "This code is importing necessary libraries and packages, checking for missing dependencies, and setting up the ActBERT dataset in PaddlePaddle's video processing library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/actbert_dataset.py\":0-30", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\ntry:\n import lmdb\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT.\"\n )\nimport pickle\nimport json\ntry:\n from paddlenlp.transformers import BertTokenizer\nexcept ImportError as e:\n print(\n f\"Warning! 
{e}, [paddlenlp] package and it's dependencies is required for ActBERT.\"" + }, + { + "comment": "Class ActBertDataset is a dataset for PaddleVideo, initialized with file path, pipeline, bert_model, data_prefix and test mode. It loads the index file to get video information, uses the tokenizer from pre-trained bert model, and stores information in the info list. The load_file method is used to load the feature data and prepare the dataset for training or validation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/actbert_dataset.py\":31-65", + "content": " )\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass ActBertDataset(BaseDataset):\n \"\"\"ActBert dataset.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n bert_model=\"bert-base-uncased\",\n data_prefix=None,\n test_mode=False,\n ):\n self.bert_model = bert_model\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n feature_data = np.load(self.file_path, allow_pickle=True)\n self.tokenizer = BertTokenizer.from_pretrained(self.bert_model,\n do_lower_case=True)\n self.info = []\n for item in feature_data:\n self.info.append(dict(feature=item, tokenizer=self.tokenizer))\n return self.info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid given index. \"\"\"" + }, + { + "comment": "This code defines two methods: \"prepare_train\" and \"prepare_test\". The former prepares the frames for training given an index by creating a deep copy of info at that index, applies the pipeline to it, and returns the features from the result. The latter is a placeholder method with no implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/actbert_dataset.py\":66-73", + "content": " results = copy.deepcopy(self.info[idx])\n #print('==results==', results)\n results = self.pipeline(results)\n return results['features']\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n pass" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7d0bc801-4d49-45e1-a566-6e18b8bca7e0.json b/docs/doc/7d0bc801-4d49-45e1-a566-6e18b8bca7e0.json new file mode 100644 index 000000000..2542a373e --- /dev/null +++ b/docs/doc/7d0bc801-4d49-45e1-a566-6e18b8bca7e0.json @@ -0,0 +1,25 @@ +{ + "summary": "This file imports necessary modules for PaddleVideo data preprocessing and model training/testing, including annotations, video labels, augmentation, decoding, mixing, segmentation, and sampling. It also defines a list of pipeline components for custom video processing pipelines.", + "details": [ + { + "comment": "This file contains the initialization and imports from different pipeline classes in PaddleVideo. It includes functions for loading annotations, getting video labels, and various augmentation techniques. The license and copyright information are also present.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/__init__.py\":0-19", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat\nfrom .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip,\n GroupResize, Image2Array, JitterScale, MultiCrop,\n Normalization, PackOutput, RandomCrop, RandomFlip,\n RandomResizedCrop, Scale, TenCrop, ToArray,\n UniformCrop, RandomGamma, MultiCenterCrop," + }, + { + "comment": "The code imports various classes and functions for different image, video, and skeleton-related pipelines. It includes modules for augmentations, decoding, mixing, segmentation, sampling, and more, used in the PaddleVideo library. These pipelines are used to preprocess, decode, and sample data for training and testing models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/__init__.py\":20-37", + "content": " RandomBrightness, RandomHue, RandomSaturation, YowoAug)\nfrom .augmentations_ava import *\nfrom .compose import Compose\nfrom .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder\nfrom .decode_image import ImageDecoder\nfrom .decode_sampler import DecodeSampler\nfrom .mix import Cutmix, Mixup, VideoMix\nfrom .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize\nfrom .sample import Sampler, SamplerPkl\nfrom .sample_ava import *\nfrom .segmentation import MultiNorm, MultiRestrictSize\nfrom .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm\nfrom .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation\nfrom .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact,\n RandomResizedCrop_V2, Flip_V2, CenterCrop_V2,\n GeneratePoseTarget, FormatShape, Collect)\nfrom .decode_sampler_MRI import SFMRI_DecodeSampler\nfrom .segmentation_pipline import SegmentationSampler" + }, + { + "comment": "This code is importing the \"SamplerUCF24\" class from the \"sample_ucf24\" module and defining a list of available pipeline components for PaddleVideo, including image and feature decoders, transforms, samplers, and more. 
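The pipeline components exported by this `__init__.py` are callables chained by `Compose`. The sketch below shows only the general pattern with stand-in classes; the real PaddleVideo transforms carry out actual image work on richer result dicts than this toy version assumes.

```python
# Toy illustration of the transform-chaining pattern; class bodies are
# placeholders, not the actual PaddleVideo implementations.
class Scale:
    def __init__(self, short_size):
        self.short_size = short_size
    def __call__(self, results):
        results['short_size'] = self.short_size
        return results

class CenterCrop:
    def __init__(self, target_size):
        self.target_size = target_size
    def __call__(self, results):
        results['target_size'] = self.target_size
        return results

class Compose:
    def __init__(self, transforms):
        self.transforms = transforms
    def __call__(self, results):
        for transform in self.transforms:   # apply each stage in order
            results = transform(results)
        return results

pipeline = Compose([Scale(256), CenterCrop(224)])
print(pipeline({'filename': 'example.mp4'}))
```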
These components can be used to build custom video processing pipelines for various tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/__init__.py\":38-54", + "content": "from .sample_ucf24 import SamplerUCF24\n__all__ = [\n 'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize',\n 'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose',\n 'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale',\n 'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput',\n 'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop',\n 'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix',\n 'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap',\n 'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize',\n 'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler',\n 'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation',\n 'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue',\n 'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact',\n 'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'GeneratePoseTarget',\n 'FormatShape', 'Collect', 'RandomSaturation', 'SamplerUCF24', 'YowoAug'" + }, + { + "comment": "The code appears to be incomplete or empty, as there are no visible operations or assignments. It could potentially be a placeholder or an intentionally empty function/class definition.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/__init__.py\":55-55", + "content": "]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7dd5bc6c-f748-4f35-a4ba-f76321564cb0.json b/docs/doc/7dd5bc6c-f748-4f35-a4ba-f76321564cb0.json new file mode 100644 index 000000000..f1ad9e6a1 --- /dev/null +++ b/docs/doc/7dd5bc6c-f748-4f35-a4ba-f76321564cb0.json @@ -0,0 +1,25 @@ +{ + "summary": "This code updates batch normalization in PaddleVideo library, improving accuracy by using true mean and variance for validation during training.", + "details": [ + { + "comment": "This code snippet is part of the PaddleVideo library and aims to implement a precise batch normalization method. The function \"do_preciseBN\" takes in a model, data loader, parallel flag, and number of iterations as parameters. It updates the batch norm stats more precisely by recomputing them after every iteration during training. This improves accuracy by better reflecting the actual stats of the dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py\":0-30", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport itertools\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n\"\"\"\nImplement precise bn, which is useful for improving accuracy.\n\"\"\"\ndef do_preciseBN(model, data_loader, parallel, num_iters=200):\n \"\"\"\n Recompute and update the batch norm stats to make them more precise. During\n training both BN stats and the weight are changing after every iteration, so\n the running average can not precisely reflect the actual stats of the" + }, + { + "comment": "The function precisely computes BN stats with fixed weights for a model using a data loader and a specified number of iterations. It replaces running averages in BN layers with true mean and variance to improve validation accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py\":31-54", + "content": " current model.\n In this function, the BN stats are recomputed with fixed weights, to make\n the running average more precise. Specifically, it computes the true average\n of per-batch mean/variance instead of the running average.\n This is useful to improve validation accuracy.\n Args:\n model: the model whose bn stats will be recomputed\n data_loader: an iterator. Produce data as input to the model\n num_iters: number of iterations to compute the stats.\n Return:\n the model with precise mean and variance in bn layers.\n \"\"\"\n bn_layers_list = [\n m for m in model.sublayers()\n if any((isinstance(m, bn_type)\n for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D,\n paddle.nn.BatchNorm3D))) and m.training\n ]\n if len(bn_layers_list) == 0:\n return\n # moving_mean=moving_mean*momentum+batch_mean*(1.\u2212momentum)\n # we set momentum=0. to get the true mean and variance during forward\n momentum_actual = [bn._momentum for bn in bn_layers_list]" + }, + { + "comment": "This code initializes zeroed variables and then performs precise batch normalization (BN) by accumulating the BN statistics for a specified number of iterations. 
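The accumulation rule described here is an incremental average: with momentum forced to zero, each forward pass exposes the current batch statistics, and averaging them over `num_iters` batches yields the precise mean. A small numpy check of that update (the values are arbitrary):

```python
# running_mean += (batch_mean - running_mean) / (ind + 1) is an incremental
# mean; this toy check confirms it matches np.mean over the batches.
import numpy as np

batch_means = [np.array([0.1, 0.2]), np.array([0.3, 0.4]), np.array([0.2, 0.3])]
running_mean = np.zeros(2)
for ind, batch_mean in enumerate(batch_means):
    running_mean += (batch_mean - running_mean) / (ind + 1)

print(running_mean)                      # [0.2 0.3]
print(np.mean(batch_means, axis=0))      # same values (up to float rounding)
```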
It updates the mean and variance values for each Batch Normalization layer in the model, ensuring accurate and precise normalization during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py\":55-81", + "content": " for bn in bn_layers_list:\n bn._momentum = 0.\n running_mean = [paddle.zeros_like(bn._mean)\n for bn in bn_layers_list] #pre-ignore\n running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list]\n ind = -1\n for ind, data in enumerate(itertools.islice(data_loader, num_iters)):\n logger.info(\"doing precise BN {} / {}...\".format(ind + 1, num_iters))\n if parallel:\n model._layers.train_step(data)\n else:\n model.train_step(data)\n for i, bn in enumerate(bn_layers_list):\n # Accumulates the bn stats.\n running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1)\n running_var[i] += (bn._variance - running_var[i]) / (ind + 1)\n assert ind == num_iters - 1, (\n \"update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations.\"\n .format(num_iters, ind))\n # Sets the precise bn stats.\n for i, bn in enumerate(bn_layers_list):\n bn._mean.set_value(running_mean[i])" + }, + { + "comment": "These lines update the batch normalization layer's variance and momentum values with the corresponding values from the running average array. This helps maintain the normal distribution of activation values in the neural network, improving performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py\":82-83", + "content": " bn._variance.set_value(running_var[i])\n bn._momentum = momentum_actual[i]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7dea8362-0386-43b2-81c1-149bc1ac3971.json b/docs/doc/7dea8362-0386-43b2-81c1-149bc1ac3971.json new file mode 100644 index 000000000..a19dbab13 --- /dev/null +++ b/docs/doc/7dea8362-0386-43b2-81c1-149bc1ac3971.json @@ -0,0 +1,30 @@ +{ + "summary": "The TextEmbedding interface utilizes Word2Vec for embedding video descriptions and queries. It initializes with model, dimensionality, and optional parameters, providing methods for GPT or Word2Vec extraction while ensuring CPU-only execution. The code initializes an OpenAI GPT model, tokenizes input, converts to vocabulary indices, obtains embeddings from hidden states, and returns squeezed dimensions.", + "details": [ + { + "comment": "This module defines the TextEmbedding interface for converting video descriptions and queries into embeddings. The class, TextEmbedding, initializes with a model and dimensionality of embedding. It has an abstract method, text2vec, that converts a string of text into an embedding, returning a (d x n) array, where d is the dimensionality of the embedding and `n` is the number of words successfully parsed from the text string. 
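The `text2vec` behaviour described here (tokenise, drop "a" and out-of-vocabulary words, look up vectors, and return an empty array with the embedding dimensionality for empty input) can be sketched without the real word2vec weights. The toy vocabulary below stands in for the gensim `KeyedVectors` object and is entirely fabricated.

```python
# Hedged sketch of the lookup logic; the two-word vocabulary is made up.
import numpy as np

dim = 300
vocab = {"cat": np.ones(dim), "skates": np.full(dim, 0.5)}   # toy 300-d vectors

def text2vec(text: str) -> np.ndarray:
    tokens = [t for t in text.split(" ") if t != "a" and t in vocab]
    embeddings = np.array([vocab[t] for t in tokens])        # one row per token
    if embeddings.size == 0:
        embeddings = np.zeros((0, dim))   # keep a (0, dim) shape for empty input
    return embeddings

print(text2vec("a cat skates").shape)    # (2, 300)
print(text2vec("unknown words").shape)   # (0, 300)
```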
Some text embedding models may drop certain kinds of stop words.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/text.py\":0-36", + "content": "\"\"\"This module defines the TextEmbedding interface for converting video descriptions and\nqueries into embeddings.\n\"\"\"\nimport zipfile\nimport functools\nfrom abc import abstractmethod\nfrom pathlib import Path\nimport numpy as np\nimport paddle\nimport gensim\nimport requests\nimport transformers\nfrom typeguard import typechecked\nfrom zsvision.zs_utils import BlockTimer\nfrom model.s3dg import S3D\nclass TextEmbedding:\n def __init__(self, model, dim: int):\n self.model = model\n self.dim = dim\n #self.device = None\n @abstractmethod\n def text2vec(self, text: str) -> np.ndarray:\n \"\"\"Convert a string of text into an embedding.\n Args:\n text: the content to be embedded\n Returns:\n (d x n) array, where d is the dimensionality of the embedding and `n` is the\n number of words that were successfully parsed from the text string.\n NOTE: For some text embedding models (such as word2vec), not all words are\n converted to vectors (e.g. certain kinds of stop words) - these are dropped from" + }, + { + "comment": "This code defines a class for the W2VEmbedding model, which embeds text using the Word2Vec algorithm. It has methods for loading pre-trained Word2Vec models from disk or fetching them online. The set_device method allows specifying the device to use (CPU or GPU). The load_w2v_model_from_cache function loads a Word2Vec model from disk, and the fetch_model function downloads it from a given URL.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/text.py\":37-72", + "content": " the output.\n \"\"\"\n raise NotImplementedError\n #@typechecked\n #def set_device(self, device: torch.device):\n # self.model = self.model.to(device)\n # self.device = device\n@functools.lru_cache(maxsize=64, typed=False)\ndef load_w2v_model_from_cache(\n w2v_weights: Path,\n) -> gensim.models.keyedvectors.Word2VecKeyedVectors:\n with BlockTimer(\"Loading w2v from disk\"):\n model = gensim.models.KeyedVectors.load_word2vec_format(\n fname=w2v_weights,\n binary=True,\n )\n return model\n@typechecked\ndef fetch_model(url: str, weights_path: Path):\n weights_path.parent.mkdir(exist_ok=True, parents=True)\n with BlockTimer(f\"Fetching weights {url} -> {weights_path}\"):\n resp = requests.get(url, verify=False)\n with open(weights_path, \"wb\") as f:\n f.write(resp.content)\nclass W2VEmbedding(TextEmbedding):\n \"\"\"This model embeds text using the google-released implementation of the word2vec\n model introduced in:\n Mikolov, T., Sutskever, I., Chen, K., Corrado, G. S., & Dean, J. (2013)." + }, + { + "comment": "The code initializes a class with dimensions, mirror, and weights_path parameters. If the weights path doesn't exist, it fetches them or raises an error. It then loads the word2vec model from the cache and initializes the superclass. The text2vec method converts input text to tokens processed by w2v, excluding 'a' and tokens not in vocab.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/text.py\":73-100", + "content": " Distributed representations of words and phrases and their compositionality.\n In Advances in neural information processing systems (pp. 
3111-3119).\n For words that are present in the w2v vocabulary, a 300-dimensional embedding is\n produced via a lookup table.\n \"\"\"\n @typechecked\n def __init__(\n self,\n dim: int,\n mirror: str,\n weights_path: Path,\n fetch_weights: bool = True,\n ):\n if not weights_path.exists():\n if fetch_weights:\n fetch_model(url=mirror, weights_path=weights_path)\n else:\n raise ValueError(f\"w2v weights missing at {weights_path}\")\n model = load_w2v_model_from_cache(weights_path)\n super().__init__(model=model, dim=dim)\n @typechecked\n def text2vec(self, text: str) -> np.ndarray:\n # convert the text string to tokens that can be processed by w2v. We handle\n # 'a' as a special case.\n tokens = [x for x in text.split(\" \") if x != \"a\" and x in self.model.vocab]" + }, + { + "comment": "The code defines a class \"TextEmbedding\" that provides methods to extract embeddings from text tokens using either the GPT model or Word2Vec. The \"get_vector\" method returns embeddings in the expected format for the CE codebase, and it handles empty sequences by returning zeros with the correct dimensionality. The class also includes a \"set_device\" method that asserts the device type is CPU-only, as GPT model only supports CPU execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/text.py\":102-129", + "content": " embeddings = []\n for token in tokens:\n embeddings.append(self.model.get_vector(token))\n embeddings = np.array(embeddings)\n # For empty sequences, we use zeros with the dimensionality of the features on\n # the second dimension (this is the format expected by the CE codebase)\n if embeddings.size == 0:\n embeddings = np.zeros((0, self.dim))\n return embeddings\n #@typechecked\n #def set_device(self, device: torch.device):\n # msg = f\"w2v only supports CPU-based execution found {device.type}\"\n # assert device.type == \"cpu\", msg\nclass OpenAI_GPT(TextEmbedding):\n \"\"\"This model produces 768-embeddings using a pretrained GPT model, introduced\n in the paper:\n Radford, A., Narasimhan, K., Salimans, T., & Sutskever, I. (2018).\n Improving language understanding by generative pre-training,\n https://cdn.openai.com/research-covers/language-unsupervised/language_understanding\n _paper.pdf\n \"\"\"\n def __init__(self):\n self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained(\"openai-gpt\")" + }, + { + "comment": "This code initializes an OpenAI GPT model, tokenizes text input, converts tokens to vocabulary indices, and obtains embeddings from the model's hidden states. 
The embeddings are then returned after squeezing dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/text.py\":130-145", + "content": " model = transformers.OpenAIGPTModel.from_pretrained(\"openai-gpt\")\n model.eval()\n super().__init__(model=model)\n @typechecked\n def text2vec(self, text: str) -> np.ndarray:\n tokenized_text = self.tokenizer.tokenize(text)\n # Convert token to vocabulary indices\n indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)\n tokens_tensor = paddle.to_tensor(indexed_tokens, dtype='int64') #tokens_tensor = torch.LongTensor([indexed_tokens]).to(self.model.device)\n with paddle.no_grad():\n hidden_states = self.model(tokens_tensor)\n embeddings = hidden_states[0].numpy()\n return embeddings.squeeze(0)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7e9227b9-283e-44fb-b67b-72a781c41bd2.json b/docs/doc/7e9227b9-283e-44fb-b67b-72a781c41bd2.json new file mode 100644 index 000000000..4c3e48a70 --- /dev/null +++ b/docs/doc/7e9227b9-283e-44fb-b67b-72a781c41bd2.json @@ -0,0 +1,25 @@ +{ + "summary": "The AVA dataset preparation includes downloading and cutting videos, extracting frames, and organizing into PaddleVideo's rawframes, videos, and annotation folders. The code provides a function to count video frames for processing and analysis purposes.", + "details": [ + { + "comment": "This document introduces the AVA dataset preparation process, including video data download, annotation preparation, cutting video files, extracting RGB frames, and pulling proposal files. Ensure the directory is at `$PaddleVideo/data/ava/script`. Users can download the dataset from its official site or through a provided script. Video files are also available for download via Baidu cloud disk.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/AVA.md\":0-22", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/k400.md) | English\n# AVA Data Preparation\nThis document mainly introduces the preparation process of AVA dataset.\nIt mainly includes five parts: Video Data Download, Prepare Annotations, Cut video files,\nExtract the RGB frames, Pulling Proposal Files,et al.\nBefore we start, please make sure that the directory is located at `$PaddleVideo/data/ava/script`.\n---\n## 1. Video data Download\nFor basic dataset information, you can refer to the official website [AVA](https://research.google.com/ava/index.html).\nFor the dataset download, you can refer to the [AVA Download](https://github.com/cvdfoundation/ava-dataset) \uff0c\nwhich introduce the way to download the dataset. We also provide the shell script for downloading the video files\n```shell\nbash download_videos.sh\n```\nFurthermore,considering the difficulty in downloading,\nwe upload the video files to Baidu cloud disk in the form of zip packages, and users can download it by themselves according to their needs.\n[Link]() coming soon." + }, + { + "comment": "This code outlines the steps to prepare a dataset for AVA, a video action recognition task. It involves downloading and extracting annotations, cutting videos to specific time ranges, extracting RGB frames with ffmpeg, and fetching pre-computed proposal lists. 
The final step shows the expected folder structure of the prepared dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/AVA.md\":25-77", + "content": "**Note: the video files should be placed in `data/ava/videos`**\n---\n## 2.Prepare Annotations\nNext, you can run the following script to prepare annotations.\n```shell\nbash download_annotations.sh\n```\nThis command will download `ava_v2.1.zip` for AVA `v2.1` annotation. If you need the AVA `v2.2` annotation, you can try the following script.\n```shell\nVERSION=2.2 bash download_annotations.sh\n```\n**Note: In fact,we will also provide the annotation zip files in Baidu cloud disk**\n---\n## 3. cut video files\nCut each video from its 15th to 30th minute and make them at 30 fps.\n```shell\nbash cut_videos.sh\n```\n---\n## 4. Extract RGB Frames\nyou can use the ffmpeg to extract RGB frames by the following script.\n```shell\nbash extract_rgb_frames.sh\n```\n---\n## 5.Pulling Proposal Files\nThe scripts are adapted from FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks).\nRun the follow scripts to fetch pre-computed proposal list.\n```shell\nbash fetch_ava_proposals.sh\n```\n---\n## 6.Folder Structure\nAfter the whole data pipeline for AVA preparation." + }, + { + "comment": "The code represents the folder structure for AVA dataset in PaddleVideo, including rawframes (RGB), videos, and annotation files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/AVA.md\":78-111", + "content": "you can get the rawframes (RGB), videos and annotation files for AVA.\nIn the context of the whole project (for AVA only), the folder structure will look like:\n```\nPaddleVideo\n\u251c\u2500\u2500 configs\n\u251c\u2500\u2500 paddlevideo\n\u251c\u2500\u2500 docs\n\u251c\u2500\u2500 tools\n\u251c\u2500\u2500 data\n\u2502 \u251c\u2500\u2500 ava\n\u2502 \u2502 \u251c\u2500\u2500 annotations\n\u2502 \u2502 | \u251c\u2500\u2500 ava_dense_proposals_train.FAIR.recall_93.9.pkl\n\u2502 \u2502 | \u251c\u2500\u2500 ava_dense_proposals_val.FAIR.recall_93.9.pkl\n\u2502 \u2502 | \u251c\u2500\u2500 ava_dense_proposals_test.FAIR.recall_93.9.pkl\n\u2502 \u2502 | \u251c\u2500\u2500 ava_train_v2.1.csv\n\u2502 \u2502 | \u251c\u2500\u2500 ava_val_v2.1.csv\n\u2502 \u2502 | \u251c\u2500\u2500 ava_train_excluded_timestamps_v2.1.csv\n\u2502 \u2502 | \u251c\u2500\u2500 ava_val_excluded_timestamps_v2.1.csv\n\u2502 \u2502 | \u251c\u2500\u2500 ava_action_list_v2.1_for_activitynet_2018.pbtxt\n\u2502 \u2502 \u251c\u2500\u2500 videos\n\u2502 \u2502 \u2502 \u251c\u2500\u2500 053oq2xB3oU.mkv\n\u2502 \u2502 \u2502 \u251c\u2500\u2500 0f39OWEqJ24.mp4\n\u2502 \u2502 \u2502 \u251c\u2500\u2500 ...\n\u2502 \u2502 \u251c\u2500\u2500 videos_15min\n\u2502 \u2502 \u2502 \u251c\u2500\u2500 053oq2xB3oU.mkv\n\u2502 \u2502 \u2502 \u251c\u2500\u2500 0f39OWEqJ24.mp4\n\u2502 \u2502 \u2502 \u251c\u2500\u2500 ...\n\u2502 \u2502 \u251c\u2500\u2500 rawframes\n\u2502 \u2502 \u2502 \u251c\u2500\u2500 053oq2xB3oU\n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 img_00001.jpg\n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 img_00002.jpg\n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 ..." + }, + { + "comment": "The code snippet defines a function that calculates the total number of frames in a given video. 
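As a quick sanity check after the cutting and frame-extraction steps, you can count frames and read the fps of a clip with OpenCV. The helper below is illustrative only, and the path is just an example taken from the folder tree above.

```python
# Illustrative check that a cut clip really is ~15 minutes at ~30 fps.
import cv2

def video_stats(path: str):
    cap = cv2.VideoCapture(path)
    frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    return frames, fps

frames, fps = video_stats("data/ava/videos_15min/0f39OWEqJ24.mp4")
print(frames, fps, frames / max(fps, 1))   # frame count, fps, duration (seconds)
```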
This can be useful for processing or analyzing videos, as it provides the necessary information about the duration of the video and how many frames to expect.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/AVA.md\":112-112", + "content": "```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7f9ae73b-0500-420c-b1ce-eac33b5e3d73.json b/docs/doc/7f9ae73b-0500-420c-b1ce-eac33b5e3d73.json new file mode 100644 index 000000000..552e026d2 --- /dev/null +++ b/docs/doc/7f9ae73b-0500-420c-b1ce-eac33b5e3d73.json @@ -0,0 +1,65 @@ +{ + "summary": "The TSMINFReader class is a specialized data reader for JPG video datasets, utilizing threading for image preprocessing and data augmentation. It improves action detection models by manipulating football game images via functions like \"crop_and_resize\", \"group_random_crop\", and \"group_random_flip\" to fit target size and apply random crop sizes for augmentation.", + "details": [ + { + "comment": "The code defines a class called TSMINFReader that inherits from DataReader. It is a data reader for video datasets in the JPG format and can be used in specific modes with different configurations. The class takes parameters such as name, mode, cfg, and material (optional) to initialize its instance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":0-37", + "content": "\"\"\"\ntsn frame reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport random\nimport functools\nimport concurrent.futures\nimport multiprocessing\nimport numpy as np\nimport paddle\nfrom PIL import Image, ImageEnhance\nfrom .reader_utils import DataReader\nclass TSMINFReader(DataReader):\n \"\"\"\n Data reader for video dataset of jpg folder.\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n super(TSMINFReader, self).__init__(name, mode, cfg)" + }, + { + "comment": "This code initializes the TSN video reader by setting various attributes based on provided configuration (name) and then calls a function to create the reader object with specified parameters. 
It also sets image mean and std values for normalization, and stores the material type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":38-63", + "content": " name = name.upper()\n self.seg_num = cfg[name]['seg_num']\n self.seglen = cfg[name]['seglen']\n self.short_size = cfg[name]['short_size']\n self.target_size = cfg[name]['target_size']\n self.batch_size = cfg[name]['batch_size']\n self.reader_threads = cfg[name]['reader_threads']\n self.buf_size = cfg[name]['buf_size']\n self.video_path = cfg[name]['frame_list']\n self.img_mean = np.array(cfg[name]['image_mean']).reshape([3, 1, 1]).astype(np.float32)\n self.img_std = np.array(cfg[name]['image_std']).reshape([3, 1, 1]).astype(np.float32)\n self.material = material\n def create_reader(self):\n \"\"\"\n batch loader for TSN\n \"\"\"\n _reader = self._inference_reader_creator_longvideo(\n self.video_path,\n self.mode,\n seg_num=self.seg_num,\n seglen=self.seglen,\n short_size=self.short_size,\n target_size=self.target_size," + }, + { + "comment": "This code defines a class with an image batch reader for inference on video data. The reader function reads images from the specified video path, applying mean and standard deviation normalization. It also sets the number of threads, buffer size, and creates a batch generator using _batch_reader method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":64-96", + "content": " img_mean=self.img_mean,\n img_std=self.img_std,\n num_threads = self.reader_threads,\n buf_size = self.buf_size)\n def _batch_reader():\n batch_out = []\n for imgs, label in _reader():\n if imgs is None:\n continue\n batch_out.append((imgs, label))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 1:\n yield batch_out[:-1]\n return _batch_reader\n def _inference_reader_creator_longvideo(self, video_path, mode, seg_num, seglen,\n short_size, target_size, img_mean, img_std, num_threads, buf_size):\n \"\"\"\n inference reader for video\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n def image_buf(image_id_path_buf):\n \"\"\"\n image_buf reader\n \"\"\" " + }, + { + "comment": "The code segment is responsible for reading images from a video file in chunks using multiple threads. It opens each image, converts it to RGB format and stores them in an array. The code handles exceptions for opening bad or missing images and uses the ThreadPoolExecutor class from concurrent futures module to execute operations asynchronously with maximum worker threads specified. 
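The threaded loading plus fault tolerance described here reduces to the sketch below; `load_segment` is a hypothetical helper, and the fallback strategy (reuse the nearest readable frame) only approximates what the reader above does.

```python
# Sketch: read one segment's frame paths concurrently and patch failed reads.
import concurrent.futures
from PIL import Image

def load_one(path):
    try:
        return Image.open(path).convert("RGB")
    except Exception:
        return None                      # tolerate missing or corrupt frames

def load_segment(paths):
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max(1, len(paths))) as executor:
        imgs = list(executor.map(load_one, paths))
    # fall back to the first successfully decoded frame for any failures
    fallback = next((im for im in imgs if im is not None), None)
    return [im if im is not None else fallback for im in imgs]
```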
Finally, it creates a list of images from the segments and adds a fault-tolerant mechanism to handle bad images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":97-118", + "content": " try:\n img_path = image_id_path_buf[1]\n img = Image.open(img_path).convert(\"RGB\")\n image_id_path_buf[2] = img\n except:\n image_id_path_buf[2] = None\n frame_len = len(video_path)\n read_thread_num = seg_num\n for i in range(0, frame_len, read_thread_num):\n image_list_part = video_path[i: i + read_thread_num]\n image_id_path_buf_list = []\n for k in range(len(image_list_part)):\n image_id_path_buf_list.append([k, image_list_part[k], None])\n with concurrent.futures.ThreadPoolExecutor(max_workers=read_thread_num) as executor:\n executor.map(lambda image_id_path_buf: image_buf(image_id_path_buf), image_id_path_buf_list)\n imgs_seg_list = [x[2] for x in image_id_path_buf_list]\n # add the fault-tolerant for bad image\n for k in range(len(image_id_path_buf_list)):" + }, + { + "comment": "This code aims to read image data and perform inference by transforming the images. It appends missing image buffers to imgs_seg_list, handles reading errors, and yields the complete list of transformed images. The imgs_transform function performs further transformations on the input images based on provided parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":119-140", + "content": " img_buf = image_id_path_buf_list[k][2]\n pad_id = 1\n while pad_id < seg_num and img_buf is None:\n img_buf = imgs_seg_list[(k + pad_id)%seg_num][2]\n if img_buf is None:\n logger.info(\"read img erro from {} to {}\".format(i, i + read_thread_num))\n exit(0)\n else:\n imgs_seg_list[k] = img_buf\n for pad_id in range(len(imgs_seg_list), seg_num):\n imgs_seg_list.append(imgs_seg_list[-1])\n yield imgs_seg_list \n def inference_imgs_transform(imgs_list, mode, seg_num, seglen, short_size,\\\n target_size, img_mean, img_std):\n \"\"\"\n inference_imgs_transform\n \"\"\" \n imgs_ret = imgs_transform(imgs_list, mode, seg_num, seglen, short_size,\n target_size, img_mean, img_std)\n label_ret = 0" + }, + { + "comment": "This code defines a function `imgs_transform` which performs various image transformations on input images based on the given mode. It groups images by scale, crops them randomly if in training mode (using TSM or center crop otherwise), and applies horizontal flips. 
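The normalisation half of `imgs_transform` boils down to: scale to [0, 1], subtract a per-channel mean, divide by a per-channel std, then pack frames into `(seg_num, seglen * 3, target_size, target_size)`. A numpy-only sketch, with ImageNet-style statistics assumed rather than taken from the config:

```python
# Shapes and statistics here are assumptions for illustration only.
import numpy as np

seg_num, seglen, target_size = 8, 1, 224
frames = np.random.randint(
    0, 256, (seg_num * seglen, 3, target_size, target_size)).astype("float32")
img_mean = np.array([0.485, 0.456, 0.406], dtype="float32").reshape([3, 1, 1])
img_std = np.array([0.229, 0.224, 0.225], dtype="float32").reshape([3, 1, 1])

imgs = frames / 255.0
imgs -= img_mean                         # broadcast over (C, H, W)
imgs /= img_std
imgs = imgs.reshape((seg_num, seglen * 3, target_size, target_size))
print(imgs.shape)                        # (8, 3, 224, 224)
```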
The function returns the transformed images as a numpy array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":142-179", + "content": " return imgs_ret, label_ret\n mapper = functools.partial(\n inference_imgs_transform,\n mode=mode,\n seg_num=seg_num,\n seglen=seglen,\n short_size=short_size,\n target_size=target_size,\n img_mean=img_mean,\n img_std=img_std)\n return paddle.reader.xmap_readers(mapper, reader, num_threads, buf_size, order=True)\ndef imgs_transform(imgs,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n name=''):\n \"\"\"\n imgs_transform\n \"\"\"\n imgs = group_scale(imgs, short_size)\n if mode == 'train':\n if name == \"TSM\":\n imgs = group_multi_scale_crop(imgs, short_size)\n imgs = group_random_crop(imgs, target_size)\n imgs = group_random_flip(imgs)\n else:\n imgs = group_center_crop(imgs, target_size)\n np_imgs = (np.array(imgs[0]).astype('float32').transpose(" + }, + { + "comment": "This code is responsible for loading and preprocessing images for an action detection model in a football game. It resizes, normalizes, and concatenates the images, then applies data augmentation techniques to create a more diverse dataset for training the model. The `group_multi_scale_crop` function generates crop offsets and resizes the images with different scales, providing a robust dataset for improving the model's performance in recognizing various actions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":180-211", + "content": " (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n for i in range(len(imgs) - 1):\n img = (np.array(imgs[i + 1]).astype('float32').transpose(\n (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n np_imgs = np.concatenate((np_imgs, img))\n imgs = np_imgs\n imgs -= img_mean\n imgs /= img_std\n imgs = np.reshape(imgs, (seg_num, seglen * 3, target_size, target_size))\n return imgs\ndef group_multi_scale_crop(img_group, target_size, scales=None, \\\n max_distort=1, fix_crop=True, more_fix_crop=True):\n \"\"\"\n group_multi_scale_crop\n \"\"\"\n scales = scales if scales is not None else [1, .875, .75, .66]\n input_size = [target_size, target_size]\n im_size = img_group[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n \"\"\"\n _sample_crop_size\n \"\"\"\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in scales]\n crop_h = [" + }, + { + "comment": "This code calculates crop sizes, generates pairs of crop heights and widths, and randomly selects a pair to use for cropping an image. 
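The crop-size sampling just described can be condensed to a few lines: derive candidate sizes from the short side, keep `(w, h)` pairs whose scale indices differ by at most `max_distort`, and pick one at random. A standalone sketch that omits the fixed-offset logic:

```python
# Simplified version of the multi-scale crop sampling; offsets are omitted.
import random

def sample_crop_pair(image_w, image_h, scales=(1, .875, .75, .66), max_distort=1):
    base_size = min(image_w, image_h)
    crop_sizes = [int(base_size * s) for s in scales]
    pairs = [(w, h)
             for i, h in enumerate(crop_sizes)
             for j, w in enumerate(crop_sizes)
             if abs(i - j) <= max_distort]
    return random.choice(pairs)

print(sample_crop_pair(320, 240))        # e.g. (240, 210)
```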
If the 'fix_crop' parameter is True, it also applies random offsets or steps to adjust the position of the cropped area in the image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":212-241", + "content": " input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x\n for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])\n h_offset = random.randint(0, image_h - crop_pair[1])\n else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if h_step != 0 and w_step != 0:\n ret.append((4 * w_step, 4 * h_step)) # lower right" + }, + { + "comment": "This code sample is from a video action detection application and generates random crop sizes for data augmentation. It considers different crop positions based on the step values provided, then randomly selects one of them to create a dictionary of crop information including width, height, offset for width, and offset for height.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":242-266", + "content": " if h_step != 0 or w_step != 0:\n ret.append((2 * w_step, 2 * h_step)) # center\n if more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter\n ret.append((3 * w_step, 1 * h_step)) # upper right quarter\n ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n crop_info = {\n 'crop_w': crop_pair[0],\n 'crop_h': crop_pair[1],\n 'offset_w': w_offset,\n 'offset_h': h_offset\n }\n return crop_info\n crop_info = _sample_crop_size(im_size)" + }, + { + "comment": "The code contains three functions: \"crop_and_resize\" which crops and resizes images based on provided crop information, \"group_random_crop\" which randomly crops a group of images to the target size, and \"group_random_flip\" which performs random horizontal flipping on a group of images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":267-306", + "content": " crop_w = crop_info['crop_w']\n crop_h = crop_info['crop_h']\n offset_w = crop_info['offset_w']\n offset_h = crop_info['offset_h']\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))\n for img in img_group\n ]\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n return ret_img_group\ndef group_random_crop(img_group, target_size):\n \"\"\"\n group_random_crop\n \"\"\"\n w, h = img_group[0].size\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, 
h)\n out_images = []\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)\n for img in img_group:\n if w == tw and h == th:\n out_images.append(img)\n else:\n out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return out_images\ndef group_random_flip(img_group):" + }, + { + "comment": "The code defines three functions: `group_random_flip`, `group_center_crop`, and `group_scale`. These functions are used to manipulate image groups by flipping, cropping, or resizing them to fit a target size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":307-348", + "content": " \"\"\"\n group_random_flip\n \"\"\"\n v = random.random()\n if v < 0.5:\n ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]\n return ret\n else:\n return img_group\ndef group_center_crop(img_group, target_size):\n \"\"\"\n group_center_crop\n \"\"\"\n img_crop = []\n for img in img_group:\n w, h = img.size\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h)\n x1 = int(round((w - tw) / 2.))\n y1 = int(round((h - th) / 2.))\n img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return img_crop\ndef group_scale(imgs, target_size):\n \"\"\"\n group_scale\n \"\"\"\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == target_size) or (h <= w and h == target_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = target_size" + }, + { + "comment": "This code resizes images according to the aspect ratio. If the image's aspect ratio is 4:3, it resizes to target_size; otherwise, it resizes to target_size and then calculates a new height and width for the image. It appends these resized images to the 'resized_imgs' list and returns this list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py\":349-356", + "content": " oh = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = target_size\n ow = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n return resized_imgs" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7fbf6288-c444-4967-974b-6d98e8a221e7.json b/docs/doc/7fbf6288-c444-4967-974b-6d98e8a221e7.json new file mode 100644 index 000000000..0471ccc9f --- /dev/null +++ b/docs/doc/7fbf6288-c444-4967-974b-6d98e8a221e7.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines the PaddleVideo framework's \"Recognizer2D\" class for 2D model training, with methods train_step(), recognizer2d.py, val_step, and test_step handling loss metrics calculation in different modes.", + "details": [ + { + "comment": "This code snippet is part of the PaddleVideo framework and defines a class called \"Recognizer2D\" which is a 2D recognizer model for training. It inherits from \"BaseRecognizer\" and has a method \"train_step()\" that handles how the model trains, taking input data batch as argument and returning output. The \"recognizers\" registry is used to register this class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py\":0-28", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer2D(BaseRecognizer):\n \"\"\"2D recognizer model framework.\"\"\"\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n #NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method." + }, + { + "comment": "These code snippets define three different methods: recognizer2d.py, val_step, and test_step. The first method appears to be a base for the other two and is used to calculate loss metrics from input images and labels using the head's loss function. The val_step also calculates loss metrics, but in valid mode only. Lastly, the test_step does not call the head's loss function and instead returns the class scores directly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py\":30-51", + "content": " #labels = labels.squeeze()\n #XXX: unsqueeze label to [label] ?\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n #NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n imgs = data_batch[0]\n cls_score = self(imgs)\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/80255346-6676-43d5-834a-00d48abd162e.json b/docs/doc/80255346-6676-43d5-834a-00d48abd162e.json new file mode 100644 index 000000000..74cf571de --- /dev/null +++ b/docs/doc/80255346-6676-43d5-834a-00d48abd162e.json @@ -0,0 +1,20 @@ +{ + "summary": "This code utilizes two functions: \"numpy_to_base64\" converts numpy arrays to base64 strings and \"video_to_numpy\" reads video frames with OpenCV, returning a stack of frames as a numpy array. The parse_file_paths function retrieves file paths or directories containing .avi/.mp4 files and joins them.", + "details": [ + { + "comment": "This code contains two functions: \"numpy_to_base64\" and \"video_to_numpy\". The first function converts a numpy array to a base64 encoded string. The second function decodes a video file using OpenCV (cv2) and returns a stack of frames as a numpy array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/utils.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport base64\nimport os\nimport os.path as osp\nimport cv2\nimport numpy as np\ndef numpy_to_base64(array: np.ndarray) -> str:\n \"\"\"numpy_to_base64\n Args:\n array (np.ndarray): input ndarray.\n Returns:\n bytes object: encoded str.\n \"\"\"\n return base64.b64encode(array).decode('utf8')\ndef video_to_numpy(file_path: str) -> np.ndarray:\n \"\"\"decode video with cv2 and return stacked frames\n as numpy." + }, + { + "comment": "The code reads video frames from a file path and returns them as numpy array. The parse_file_paths function retrieves either the file path or directory containing .avi/.mp4 files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/utils.py\":38-77", + "content": " Args:\n file_path (str): video file path.\n Returns:\n np.ndarray: [T,H,W,C] in uint8.\n \"\"\"\n cap = cv2.VideoCapture(file_path)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n decoded_frames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty\n if ret is False:\n continue\n img = frame[:, :, ::-1]\n decoded_frames.append(img)\n decoded_frames = np.stack(decoded_frames, axis=0)\n return decoded_frames\ndef parse_file_paths(input_path: str) -> list:\n \"\"\"get data pathes from input_path\n Args:\n input_path (str): input file path or directory which contains input file(s).\n Returns:\n list: path(es) of input file(s)\n \"\"\"\n assert osp.exists(input_path), \\\n f\"{input_path} did not exists!\"\n if osp.isfile(input_path):\n files = [\n input_path,\n ]\n else:\n files = os.listdir(input_path)\n files = [\n file for file in files\n if (file.endswith(\".avi\") or file.endswith(\".mp4\"))" + }, + { + "comment": "This code is joining the input_path with each file in the files list and returning the resulting list of file paths.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/utils.py\":78-80", + "content": " ]\n files = [osp.join(input_path, file) for file in files]\n return files" + } + ] +} \ No newline at end of file diff --git a/docs/doc/81b22ac0-b39e-40e0-bab4-f281ec2e5eaf.json b/docs/doc/81b22ac0-b39e-40e0-bab4-f281ec2e5eaf.json new file mode 100644 index 000000000..bd9eaa570 --- /dev/null +++ b/docs/doc/81b22ac0-b39e-40e0-bab4-f281ec2e5eaf.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports various functions and classes from different modules within the PaddleVideo library. It also sets up logger and profiler functionality, provides a build function for creating objects, and handles saving and loading data.", + "details": [ + { + "comment": "This code imports various functions and classes from different modules within the PaddleVideo library. 
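For the serving utilities quoted just above, the base64 encode step pairs with an equally simple decode on the receiving side. The round trip below is a sketch, with the frame shape chosen arbitrarily.

```python
# numpy -> base64 string -> numpy round trip, as a client/server pair would use.
import base64
import numpy as np

def numpy_to_base64(array: np.ndarray) -> str:
    return base64.b64encode(array).decode("utf8")

frames = np.zeros((4, 224, 224, 3), dtype=np.uint8)   # dummy [T, H, W, C] clip
payload = numpy_to_base64(frames)
restored = np.frombuffer(base64.b64decode(payload),
                         dtype=np.uint8).reshape(frames.shape)
assert (restored == frames).all()
```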
It also sets up logger and profiler functionality, provides a build function for creating objects, and handles saving and loading data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py\":0-23", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import Registry\nfrom .build_utils import build\nfrom .config import *\nfrom .logger import setup_logger, coloring, get_logger\nfrom .record import AverageMeter, build_record, log_batch, log_epoch\nfrom .dist_utils import get_dist_info, main_only\nfrom .save_load import save, load, load_ckpt, mkdir\nfrom .precise_bn import do_preciseBN\nfrom .profiler import add_profiler_step\n__all__ = ['Registry', 'build']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/82e6a62a-d0cc-414a-8cc3-0f6befc1bc12.json b/docs/doc/82e6a62a-d0cc-414a-8cc3-0f6befc1bc12.json new file mode 100644 index 000000000..8e52d39dd --- /dev/null +++ b/docs/doc/82e6a62a-d0cc-414a-8cc3-0f6befc1bc12.json @@ -0,0 +1,20 @@ +{ + "summary": "PaddleVideo is a deep learning library for video processing, offering pre-trained models, training, compression, inference, and deployment options, along with installation guides, datasets, and annotation tools under the Apache 2.0 license.", + "details": [ + { + "comment": "This code is for PaddleVideo, a toolset for video tasks in industry and academia. It provides examples and best practice guidelines for deep learning algorithms in the video domain. Recent updates include the release of PP-TSMv2 (lite action recognition model), addition of Knowledge Distillation framework code, and TokenShift and 2s-ACGN models. Python version required is 3.7+, and it uses PaddlePaddle version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/README_en.md\":0-19", + "content": "[\u7b80\u4f53\u4e2d\u6587](README.md) | English\n# PaddleVideo\n![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![paddle version](https://img.shields.io/badge/PaddlePaddle-2.0-blue)\n## Introduction\nPaddleVideo is a toolset for video tasks prepared for the industry and academia. This repository provides examples and best practice guildelines for exploring deep learning algorithm in the scene of video area.\n
\n## Update:\n- release **\ud83d\udd25[PP-TSMv2](./docs/zh-CN/model_zoo/recognition/pp-tsm.md)**, an lite action recognition model, top1_acc on Kinetics-400 is 74.38%\uff0ccpu inference time on 10s video with 25fps is only 433ms. [benchmark](./docs/zh-CN/benchmark.md).\n- add [Knowledge Distilltion](./docs/zh-CN/distillation.md) framework code.\n- add [TokenShift](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md), [2s-ACGN](https://github.com/PaddlePaddle/PaddleVideo/b" + }, + { + "comment": "This code is from the \"PaddleVideo\" project's README file. It introduces PaddleVideo as a platform that supports various cutting-edge video algorithms, developed industrial featured models like PP-TSM and PP-TSMv2, and provides a full process of data production, model training, compression, inference, and deployment. The code also mentions the availability of quick start guides and tutorials to make it easier for users to get started with PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/README_en.md\":19-42", + "content": "lob/develop/docs/zh-CN/model_zoo/recognition/agcn2s.md) and [CTR-GCN](./docs/zh-CN/model_zoo/recognition/ctrgcn.md) model.\n\u200b \ud83d\udc96 **Welcome to scan the code and join the group discussion** \ud83d\udc96\n
\n- Scan the QR code below with your Wechat and reply \"video\", you can access to official technical exchange group. Look forward to your participation.\n## Features\nPaddleVideo support a variety of cutting-edge algorithms related to video, and developed industrial featured models/solution [PP-TSM](docs/zh-CN/model_zoo/recognition/pp-tsm.md) and [PP-TSMv2](docs/zh-CN/model_zoo/recognition/pp-tsm.md) on this basis, and get through the whole process of data production, model training, compression, inference and deployment.\n
\n## Quick Start\n- One line of code quick use: [Quick Start](./docs/zh-CN/quick_start.md)\n## Tutorials\n- [Quick Start](./docs/zh-CN/quick_start.md)" + }, + { + "comment": "This code outlines the main components of PaddleVideo, a deep learning library for video processing. It includes installation instructions, usage guidelines, model zoo (pre-trained models), model training, model compression techniques such as quantization and knowledge distillation, inference and deployment options including Python, C++, and serving, academic algorithms, datasets, data annotation tool (BILS), and licensing information under Apache 2.0 license.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/README_en.md\":43-64", + "content": "- [Installation](./docs/zh-CN/install.md)\n- [Usage](./docs/zh-CN/usage.md)\n- [PP-TSM\ud83d\udd25](./docs/zh-CN/model_zoo/recognition/pp-tsm.md)\n - [Model Zoo](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#7)\n - [Model training](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#4)\n - [Model Compression](./deploy/slim/)\n - [Model Quantization](./deploy/slim/readme.md)\n - [Knowledge Distillation](./docs/zh-CN/distillation.md)\n - [Inference and Deployment](./deploy/)\n - [Python Inference](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#62)\n - [C++ Inference](./deploy/cpp_infer/readme.md)\n - [Serving](./deploy/python_serving/readme.md)\n - [Paddle2ONNX](./deploy/paddle2onnx/readme.md)\n - [Benchmark](./docs/zh-CN/benchmark.md)\n- [Academic algorithms](./docs/en/model_zoo/README.md)\ud83d\ude80\n- [Datasets](./docs/en/dataset/README.md)\n- [Data Annotation](./applications/BILS)\n- [Contribute](./docs/zh-CN/contribute/README.md)\n## License\nPaddleVideo is released under the [Apache 2.0 license](LICENSE)." + } + ] +} \ No newline at end of file diff --git a/docs/doc/82fc4d1b-6775-44e6-ae19-cb50e584e271.json b/docs/doc/82fc4d1b-6775-44e6-ae19-cb50e584e271.json new file mode 100644 index 000000000..20566dfd5 --- /dev/null +++ b/docs/doc/82fc4d1b-6775-44e6-ae19-cb50e584e271.json @@ -0,0 +1,55 @@ +{ + "summary": "This code calculates smoothness and reprojection losses for depth estimation tasks, combining identity and reprojection losses to compute disparity loss. It handles day and night scenarios while saving images if necessary. The total loss is stored in the losses dictionary.", + "details": [ + { + "comment": "This code defines a function \"get_smooth_loss\" that calculates the smoothness loss for disparity images using color image gradients and disparity image gradients. It uses PaddlePaddle library functions like paddle.abs() and paddle.mean(). The function is part of the BaseWeightedLoss class in the LOSSES registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":0-28", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\ndef get_smooth_loss(disp, img):\n \"\"\"Computes the smoothness loss for a disparity image\n The color image is used for edge-aware smoothness\n \"\"\"\n grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])\n grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])\n grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:])," + }, + { + "comment": "The code defines two classes: DiffLoss and MSE. DiffLoss calculates the loss between two inputs using L2 norm, while MSE calculates mean squared error loss for a single input. The function on lines 29-66 calculates gradients of disparity maps using image differences, applies exponential decay based on gradient values, and returns their average. This seems to be related to depth estimation or disparity prediction in computer vision tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":29-66", + "content": " 1,\n keepdim=True)\n grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]),\n 1,\n keepdim=True)\n grad_disp_x *= paddle.exp(-grad_img_x)\n grad_disp_y *= paddle.exp(-grad_img_y)\n return grad_disp_x.mean() + grad_disp_y.mean()\nclass DiffLoss(nn.Layer):\n def __init__(self):\n super(DiffLoss, self).__init__()\n def forward(self, input1, input2):\n batch_size = input1.shape[0]\n input1 = input1.reshape([batch_size, -1])\n input2 = input2.reshape([batch_size, -1])\n input1_l2 = input1\n input2_l2 = input2\n diff_loss = 0\n dim = input1.shape[1]\n for i in range(input1.shape[0]):\n diff_loss = diff_loss + paddle.mean(\n ((input1_l2[i:i + 1, :].mm(input2_l2[i:i + 1, :].T)).pow(2)) /\n dim)\n diff_loss = diff_loss / input1.shape[0]\n return diff_loss\nclass MSE(nn.Layer):\n def __init__(self):" + }, + { + "comment": "MSE class is a mean squared error loss function for PaddlePaddle, SIMSE class calculates the structured iterative mean squared error loss, and SSIM class computes the structural similarity index (SSIM) loss between a pair of images using various pooling operations and constants.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":67-103", + "content": " super(MSE, self).__init__()\n def forward(self, pred, real):\n diffs = paddle.add(real, -pred)\n n = paddle.numel(diffs)\n mse = paddle.sum(diffs.pow(2)) / n\n return mse\nclass SIMSE(nn.Layer):\n def __init__(self):\n super(SIMSE, self).__init__()\n def forward(self, pred, real):\n diffs = paddle.add(real, -pred)\n n = paddle.numel(diffs)\n simse = paddle.sum(diffs).pow(2) / (n**2)\n return simse\nclass SSIM(nn.Layer):\n \"\"\"Layer to compute the SSIM loss between a pair of images\n \"\"\"\n def __init__(self):\n super(SSIM, self).__init__()\n self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.mu_y_pool = nn.AvgPool2D(3, 1, 
exclusive=False)\n self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.refl = nn.Pad2D(1, mode='reflect')\n self.C1 = 0.01**2\n self.C2 = 0.03**2" + }, + { + "comment": "This code defines a forward function for calculating the SSIM loss, which is used in the ADDSLoss class. The SSIM loss measures the structural similarity between two images and takes into account luminance (mu_x and mu_y) and contrast (sigma_x and sigma_y) for each image. It also considers the covariance of the two images (sigma_xy). The SSIM loss is then used in the ADDSLoss class to compute the reprojection loss between predicted and target images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":105-136", + "content": " def forward(self, x, y):\n x = self.refl(x)\n y = self.refl(y)\n mu_x = self.mu_x_pool(x)\n mu_y = self.mu_y_pool(y)\n sigma_x = self.sig_x_pool(x**2) - mu_x**2\n sigma_y = self.sig_y_pool(y**2) - mu_y**2\n sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y\n SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)\n SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2)\n return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1)\n@LOSSES.register()\nclass ADDSLoss(BaseWeightedLoss):\n def __init__(self, avg_reprojection, disparity_smoothness, no_ssim):\n super(ADDSLoss, self).__init__()\n self.avg_reprojection = avg_reprojection\n self.disparity_smoothness = disparity_smoothness\n self.no_ssim = no_ssim\n self.loss_diff = DiffLoss()\n self.loss_recon1 = MSE()\n self.loss_recon2 = SIMSE()\n self.loss_similarity = MSE()\n def compute_reprojection_loss(self, pred, target):\n \"\"\"Computes reprojection loss between a batch of predicted and target images" + }, + { + "comment": "This code computes the reprojection and smoothness losses for a minibatch by iterating over different scales. It calculates the L1 loss between the predicted depth and the target depth, and optionally computes the SSIM (Structural Similarity Index) loss as well. The reprojection loss is determined based on these two values, with 85% weighted towards the SSIM loss and 15% towards the L1 loss. The total loss for the minibatch is accumulated in the \"total_loss\" variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":137-172", + "content": " \"\"\"\n abs_diff = paddle.abs(target - pred)\n l1_loss = abs_diff.mean(1, True)\n if not self.no_ssim:\n self.ssim = SSIM()\n if self.no_ssim:\n reprojection_loss = l1_loss\n else:\n ssim_loss = self.ssim(pred, target).mean(1, True)\n reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss\n return reprojection_loss\n def compute_losses(self, inputs, outputs, is_night):\n \"\"\"Compute the reprojection and smoothness losses for a minibatch\n \"\"\"\n losses = {}\n total_loss = 0\n for scale in outputs['scales']:\n loss = 0\n reprojection_losses = []\n source_scale = 0\n disp = outputs[(\"disp\", scale)]\n if is_night:\n color = inputs[(\"color_n\", 0, scale)]\n target = inputs[(\"color_n\", 0, source_scale)]\n else:\n color = inputs[(\"color\", 0, scale)]\n target = inputs[(\"color\", 0, source_scale)]\n for frame_id in outputs['frame_ids'][1:]:" + }, + { + "comment": "This code computes reprojection losses for day and night scenarios, concatenates them into a single tensor, and then checks if average reprojection loss should be computed. 
If not, it saves both images and performs minimum operation all at once.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":173-196", + "content": " pred = outputs[(\"color\", frame_id, scale)]\n reprojection_losses.append(\n self.compute_reprojection_loss(pred, target))\n reprojection_losses = paddle.concat(reprojection_losses, 1)\n identity_reprojection_losses = []\n for frame_id in outputs['frame_ids'][1:]:\n if is_night:\n pred = inputs[(\"color_n\", frame_id, source_scale)]\n else:\n pred = inputs[(\"color\", frame_id, source_scale)]\n identity_reprojection_losses.append(\n self.compute_reprojection_loss(pred, target))\n identity_reprojection_losses = paddle.concat(\n identity_reprojection_losses, 1)\n if self.avg_reprojection:\n identity_reprojection_loss = identity_reprojection_losses.mean(\n 1, keepdim=True)\n else:\n # save both images, and do min all at once below\n identity_reprojection_loss = identity_reprojection_losses" + }, + { + "comment": "This code calculates the depth loss by combining identity and reprojection losses, adds random numbers to break ties, concatenates them, selects minimum values for optimization, and calculates disparity smoothness loss. It then updates the total loss and stores it in the losses dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":198-222", + "content": " if self.avg_reprojection:\n reprojection_loss = reprojection_losses.mean(1, keepdim=True)\n else:\n reprojection_loss = reprojection_losses\n # add random numbers to break ties\n identity_reprojection_loss = identity_reprojection_loss + paddle.randn(\n identity_reprojection_loss.shape) * 0.00001\n combined = paddle.concat(\n (identity_reprojection_loss, reprojection_loss), axis=1)\n if combined.shape[1] == 1:\n to_optimise = combined\n else:\n to_optimise = paddle.min(combined, axis=1)\n loss = loss + to_optimise.mean()\n mean_disp = disp.mean(2, True).mean(3, True)\n norm_disp = disp / (mean_disp + 1e-7)\n smooth_loss = get_smooth_loss(norm_disp, color)\n loss = loss + self.disparity_smoothness * smooth_loss / (2**scale)\n total_loss = total_loss + loss\n losses[\"loss/{}\".format(scale)] = loss" + }, + { + "comment": "This code computes losses for both day and night scenes in a video, using the compute_losses function. It appends two target differences to the 'loss' list and adds them to the total loss. The target_diff1 and target_diff2 are calculated by the loss_diff function, comparing specific elements from the outputs. Target_diff3 and target_diff4 are also computed in a similar manner. 
The final total loss is divided by the number of scales and stored in the losses dictionary before returning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":224-249", + "content": " total_loss /= len(outputs['scales'])\n losses[\"loss\"] = total_loss\n return losses\n def forward(self, inputs, outputs):\n losses_day = self.compute_losses(inputs, outputs, 'day')\n losses_night = self.compute_losses(inputs, outputs['outputs_night'],\n 'night')\n loss = 0\n losses = []\n # diff\n target_diff1 = 0.5 * self.loss_diff(\n outputs['result'][0], outputs['result'][2]) # 10 when batchsize=1\n target_diff2 = 0.5 * self.loss_diff(outputs['result_night'][0],\n outputs['result_night'][2])\n losses.append(target_diff1)\n losses.append(target_diff2)\n loss = loss + target_diff1\n loss = loss + target_diff2\n target_diff3 = 1 * self.loss_diff(\n outputs['result'][1], outputs['result'][3]) # 10 when batchsize=1\n target_diff4 = 1 * self.loss_diff(outputs['result_night'][1],\n outputs['result_night'][3])" + }, + { + "comment": "The code calculates multiple losses, including depth and reconstruction, for both daytime and night-time scenes. It then adds these losses to the total loss and appends them to the 'losses' list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":250-275", + "content": " losses.append(target_diff3)\n losses.append(target_diff4)\n loss = loss + target_diff3\n loss = loss + target_diff4\n # recon\n target_mse = 1 * self.loss_recon1(outputs['result'][5],\n inputs[\"color_aug\", 0, 0])\n loss = loss + target_mse\n target_simse = 1 * self.loss_recon2(outputs['result'][5],\n inputs[\"color_aug\", 0, 0])\n loss = loss + target_simse\n losses.append(target_mse)\n losses.append(target_simse)\n target_mse_night = 1 * self.loss_recon1(outputs['result_night'][5],\n inputs[\"color_n_aug\", 0, 0])\n loss = loss + target_mse_night\n target_simse_night = 1 * self.loss_recon2(outputs['result_night'][5],\n inputs[\"color_n_aug\", 0, 0])\n loss = loss + target_simse_night\n losses.append(target_mse_night)\n losses.append(target_simse_night)" + }, + { + "comment": "This code calculates a depth loss by comparing predicted depths with detached pseudo-labels, then adds it to the overall loss and appends it to the losses list. Finally, it updates the output dictionary with the total loss and separate day/night losses before returning the updated outputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/depth_loss.py\":277-289", + "content": " # depth loss\n pseudo_label = outputs[(\"disp\", 0)].detach()\n depth_loss = 1 * self.loss_similarity(\n outputs['outputs_night'][(\"disp\", 0)], pseudo_label)\n loss = loss + depth_loss\n losses.append(depth_loss)\n outputs['loss'] = loss + losses_day['loss'] + losses_night['loss']\n outputs['losses_day'] = losses_day['loss']\n outputs['losses_night'] = losses_night['loss']\n return outputs" + } + ] +} \ No newline at end of file diff --git a/docs/doc/84255506-9b77-41c0-9f9a-be657b23b846.json b/docs/doc/84255506-9b77-41c0-9f9a-be657b23b846.json new file mode 100644 index 000000000..39dfe7980 --- /dev/null +++ b/docs/doc/84255506-9b77-41c0-9f9a-be657b23b846.json @@ -0,0 +1,15 @@ +{ + "summary": "BaseDetector class serves as a parent for detectors, providing common features and abstract train_step method implementation. 
Abstract base classes are defined for training, validating, and testing steps in machine learning models.", + "details": [ + { + "comment": "BaseDetector class is the parent class for detectors, providing common functionality like extracting features and initializing weights. It defines an abstract train_step method that needs to be implemented by subclasses for training. The class also contains methods for feature extraction, model forward pass, and handling different modes (train, valid, test, infer).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/base.py\":0-35", + "content": "from abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nfrom ...registry import DETECTORS\n@DETECTORS.register()\nclass BaseDetector(nn.Layer):\n \"\"\"Base class for detectors. \"\"\"\n def __init__(self, backbone=None, head=None):\n super().__init__()\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n self.backbone.init_weights() \n self.head.init_weights()\n def extract_feature(self, imgs, iter_num):\n \"\"\"Extract features through a backbone. \"\"\"\n feature = self.backbone(imgs)\n return feature\n def forward(self, data_batch, mode='infer'):\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):" + }, + { + "comment": "This code defines abstract base classes for training, validating, and testing steps in a machine learning model. These methods must be implemented by subclasses to perform the specific tasks accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/base.py\":36-50", + "content": " \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/855278f2-de7a-4666-97dc-8f76f5bcb976.json b/docs/doc/855278f2-de7a-4666-97dc-8f76f5bcb976.json new file mode 100644 index 000000000..df16ca7b8 --- /dev/null +++ b/docs/doc/855278f2-de7a-4666-97dc-8f76f5bcb976.json @@ -0,0 +1,20 @@ +{ + "summary": "The code creates an AudioReader class for youtube-8M dataset, initializing audio readers and loading pcm data. It manages audio batches by appending audios to batch_out until reaching the specified batch size, then yields the batch. Any remaining audios are yielded upon completion.", + "details": [ + { + "comment": "This code defines an AudioReader class for the youtube-8M dataset, which reads features extracted by prior networks. It imports necessary libraries and modules, such as numpy, random, code, DataReader from reader_utils, feature_extractor from mfcc, pickle for file input/output, and StringIO or BytesIO depending on the availability of cPickle. 
The class inherits from DataReader, indicating it follows a standard data reading structure, and uses a feature extractor to extract audio features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/audio_reader.py\":0-36", + "content": "\"\"\"\naudio reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport os\nimport _pickle as cPickle\n#from .reader_utils import DataReader\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nimport mfcc.feature_extractor as feature_extractor\nclass AudioReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks" + }, + { + "comment": "This code initializes an audio reader for three models (LSTM, Attention Cluster, NextVlad). It takes parameters such as name, mode, and configuration file. The batch size, sample rate, and file list are set according to the given configuration. The pcm data is loaded from a binary file and converted to numpy array. Finally, a reader function is defined that iterates through examples and appends them to batches.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/audio_reader.py\":37-69", + "content": " This is for the three models: lstm, attention cluster, nextvlad\n dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n # set batch size and file list\n self.sample_rate = cfg[self.name.upper()]['sample_rate']\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.pcm_file = cfg[self.name.upper()]['pcm_file']\n self.material = material\n def create_reader(self):\n \"\"\"create_reader\"\"\"\n with open(self.pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = feature_extractor.wav_to_example(audio_data, self.sample_rate)\n # print(examples.shape)\n def reader():\n \"\"\"reader\"\"\"\n batch_out = []\n batch_out_pre = []\n for audio in examples:\n # batch_out.append([audio])" + }, + { + "comment": "This code is creating and managing audio batches in the audio reader class. It appends each audio to batch_out until it reaches the specified batch size, then yields the batch and resets batch_out. 
If there are remaining audios in batch_out after the loop ends, it yields them before returning the reader object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/audio_reader.py\":70-77", + "content": " batch_out.append(audio)\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader" + } + ] +} \ No newline at end of file diff --git a/docs/doc/858c078f-36a0-4e77-911f-9b31ee8cf7ee.json b/docs/doc/858c078f-36a0-4e77-911f-9b31ee8cf7ee.json new file mode 100644 index 000000000..c311452b1 --- /dev/null +++ b/docs/doc/858c078f-36a0-4e77-911f-9b31ee8cf7ee.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines a base class for metrics in the PaddleVideo library, initializing with data size, batch size, world size, and log interval. It includes all-gather and concatenation methods, along with abstract update and accumulate functions to be implemented by subclasses.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library and defines a base class for metrics. It initializes the metric with data size, batch size, world size, and log interval. The gather_from_gpu method gathers Tensors from all GPUs into a list and concatenates them on a specified axis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/base.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nimport paddle\nfrom paddlevideo.utils import get_dist_info\nfrom .registry import METRIC\nclass BaseMetric(object):\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n self.data_size = data_size\n self.batch_size = batch_size\n _, self.world_size = get_dist_info()\n self.log_interval = log_interval\n def gather_from_gpu(self,\n gather_object: paddle.Tensor,\n concat_axis=0) -> paddle.Tensor:\n \"\"\"gather Tensor from all gpus into a list and concatenate them on `concat_axis`." + }, + { + "comment": "Function that performs all-gather and concatenation on the gather object Tensor. Abstract methods for update and accumulate that must be implemented in subclasses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/base.py\":32-51", + "content": " Args:\n gather_object (paddle.Tensor): gather object Tensor\n concat_axis (int, optional): axis for concatenation. 
Defaults to 0.\n Returns:\n paddle.Tensor: gatherd & concatenated Tensor\n \"\"\"\n gather_object_list = []\n paddle.distributed.all_gather(gather_object_list, gather_object.cuda())\n return paddle.concat(gather_object_list, axis=concat_axis)\n @abstractmethod\n def update(self):\n raise NotImplementedError(\n \"'update' method must be implemented in subclass\")\n @abstractmethod\n def accumulate(self):\n raise NotImplementedError(\n \"'accumulate' method must be implemented in subclass\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/85f1dba3-5dba-4fb4-988b-9b2c5535e7bc.json b/docs/doc/85f1dba3-5dba-4fb4-988b-9b2c5535e7bc.json new file mode 100644 index 000000000..b690c6440 --- /dev/null +++ b/docs/doc/85f1dba3-5dba-4fb4-988b-9b2c5535e7bc.json @@ -0,0 +1,40 @@ +{ + "summary": "This code defines Paddle Video's image preprocessing classes for resizing, aspect ratio adjustment, and custom cropping transforms. It performs horizontal flipping, object detection, and returns foreground/nocare masks from a scribble image.", + "details": [ + { + "comment": "This code defines a Paddle Video pipeline class, \"RandomScale\\_manet,\" which resizes the input image and its corresponding ground truth to random scales. The allowed scales are [0.75, 1, 1.25]. For elements like 'img1', 'img2', or 'ref\\_img,' it uses cv2's INTER_CUBIC interpolation. For other elements, it utilizes cv2's INTER_NEAREST interpolation. The pipeline is registered at PIPELINES for further usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py\":0-42", + "content": "import os\nimport random\nimport cv2\nimport numpy as np\nimport paddle\nfrom PIL import Image\nfrom davisinteractive.utils.operations import bresenham\nfrom ..registry import PIPELINES\ncv2.setNumThreads(0)\nNEW_BRANCH = True\n@PIPELINES.register()\nclass RandomScale_manet(object):\n \"\"\"Randomly resize the image and the ground truth to specified scales.\n Args:\n scales (list): the list of scales\n \"\"\"\n def __init__(self, scales=[0.75, 1, 1.25]):\n self.scales = scales\n def __call__(self, sample):\n # Fixed range of scales\n sc = self.scales[random.randint(0, len(self.scales) - 1)]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, None, fx=sc, fy=sc, interpolation=flagval)\n sample[elem] = tmp\n return sample" + }, + { + "comment": "The code defines a Resize_manet class, which is a pipeline for resizing an image to a specified output size. The input could be either an integer or a tuple representing the desired output dimensions. If the input is an integer, it will use that as the smaller edge of the image and maintain aspect ratio. The code checks if the current image size matches the desired output size; if so, it returns the results without modification, otherwise it resizes the image to match the desired output size. This class is used in a computer vision context for preprocessing images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py\":45-74", + "content": "@PIPELINES.register()\nclass Resize_manet(object):\n \"\"\"Rescale the image in a results to a given size.\n Args:\n output_size (tuple or int): Desired output size. If tuple, output is\n matched to output_size. 
If int, smaller of image edges is matched\n to output_size keeping aspect ratio the same.\n \"\"\"\n def __init__(self, output_size):\n assert isinstance(output_size, (int, list))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)\n else:\n self.output_size = output_size\n # self.seg_interpolation = cv2.INTER_CUBIC if is_continuous else cv2.INTER_NEAREST\n # self.fix = fix\n def __call__(self, results):\n img1 = results['img1']\n h, w = img1.shape[:2]\n if self.output_size == (h, w):\n return results\n else:\n new_h, new_w = self.output_size\n new_h, new_w = int(new_h), int(new_w)\n for elem in results.keys():\n if 'meta' in elem:" + }, + { + "comment": "This code defines a custom transform for image processing, specifically random cropping. It takes an input image and crops it randomly to the specified output size. The interpolation method used during resizing is determined by the element type in the 'results' dictionary. If the element is 'img1', 'img2', or 'ref_img', cubic interpolation is used, otherwise nearest neighbor interpolation is used. The cropped image is then stored back into the results dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py\":75-108", + "content": " continue\n tmp = results[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, dsize=(new_w, new_h), interpolation=flagval)\n results[elem] = tmp\n return results\n@PIPELINES.register()\nclass RandomCrop_manet(object):\n \"\"\"Crop randomly the image in a results.\n Args:\n output_size (tuple or int): Desired output size. If int, square crop\n is made.\n \"\"\"\n def __init__(self, output_size, step=None):\n assert isinstance(output_size, (int, list))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)\n else:\n assert len(output_size) == 2\n self.output_size = output_size\n self.step = step\n def __call__(self, results):\n image = results['img1']\n h, w = image.shape[:2]\n new_h, new_w = self.output_size" + }, + { + "comment": "This code randomly crops an image and its associated labels to the specified new height and width. It checks if the cropped reference scribble label contains only one class label before updating other labels accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py\":110-133", + "content": " new_h = h if new_h >= h else new_h\n new_w = w if new_w >= w else new_w\n is_contain_obj = False\n # while (not is_contain_obj) and (step < 5):\n if self.step is None:\n while not is_contain_obj:\n # step += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = results['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(new_ref_scribble_label)) == 1:\n continue\n else:\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]\n results[elem] = tmp" + }, + { + "comment": "This code randomly selects a region within the original image and applies random horizontal flipping to it. 
It checks if the selected region contains an object (by checking if the number of unique labels in ref_scribble_label is greater than 1) and continues flipping until either an object is found or the maximum allowed steps are reached. The function then returns the modified dictionary with the updated data for each key, except 'meta'. This custom transform is registered as a pipeline module for use in image processing tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py\":134-162", + "content": " break\n else:\n st = 0\n while not is_contain_obj and st < self.step:\n st += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = results['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(\n new_ref_scribble_label)) == 1 or st < self.step - 1:\n continue\n else:\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]\n results[elem] = tmp\n break\n return results\n@PIPELINES.register()\nclass RandomHorizontalFlip_manet(object):" + }, + { + "comment": "This code snippet contains two custom transforms for image processing. The first one, HorizontalFlip, randomly flips the given image and ground truth horizontally with a probability of 0.5. The second one, ToTensor_manet, converts ndarrays in results to Tensors by normalizing the images and reshaping them as required. Both transforms are added to the PADDLEPIPELINES registry for later use in image processing pipelines.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py\":163-197", + "content": " \"\"\"Horizontally flip the given image and ground truth randomly with a probability of 0.5.\"\"\"\n def __init__(self, prob):\n self.p = prob\n def __call__(self, results):\n if random.random() < self.p:\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n tmp = cv2.flip(tmp, flipCode=1)\n results[elem] = tmp\n return results\n@PIPELINES.register()\nclass ToTensor_manet(object):\n \"\"\"Convert ndarrays in results to Tensors.\"\"\"\n def __call__(self, results):\n for elem in results.keys():\n if 'meta' in elem:\n continue\n tmp = results[elem]\n if tmp.ndim == 2:\n tmp = tmp[:, :, np.newaxis]\n else:\n tmp = tmp / 255.\n tmp -= (0.485, 0.456, 0.406)\n tmp /= (0.229, 0.224, 0.225)\n tmp = tmp.transpose([2, 0, 1])\n results[elem] = paddle.to_tensor(tmp)" + }, + { + "comment": "This function takes in a scribble image and optionally dilation and nocare area values. It returns the foreground mask and nocare mask. If the maximum value of the scribble is 1, it computes the foreground by dilating the scribble using an ellipse kernel. Else, it assigns the scribble as the foreground. 
Then, if a nocare area is given, it computes the nocare mask by dilating the foreground with another ellipse kernel and subtracting the original foreground.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py\":198-219", + "content": " return results\ndef gt_from_scribble(scr, dilation=11, nocare_area=21):\n # Compute foreground\n if scr.max() == 1:\n kernel_fg = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (dilation, dilation))\n fg = cv2.dilate(scr.astype(np.uint8),\n kernel=kernel_fg).astype(scr.dtype)\n else:\n fg = scr\n # Compute nocare area\n if nocare_area is None:\n nocare = None\n else:\n kernel_nc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (nocare_area, nocare_area))\n nocare = cv2.dilate(fg, kernel=kernel_nc) - fg\n return fg, nocare" + } + ] +} \ No newline at end of file diff --git a/docs/doc/865d27dc-fad6-4333-b206-a70e9019bd81.json b/docs/doc/865d27dc-fad6-4333-b206-a70e9019bd81.json new file mode 100644 index 000000000..34834fafb --- /dev/null +++ b/docs/doc/865d27dc-fad6-4333-b206-a70e9019bd81.json @@ -0,0 +1,15 @@ +{ + "summary": "This code file in the PaddleVideo library imports various recognizer classes for video recognition tasks, including 1D, 2D, 3D, transformer-based, GCN, MRI, and MoViNet frame-based recognizers. These models are used for action recognition and motion estimation tasks.", + "details": [ + { + "comment": "This code file imports various recognizer classes from different modules within the PaddleVideo framework for video recognition tasks. These recognizers include 1D, 2D, 3D, transformer-based, GCN, MRI, 3D MRI, and MoViNet frame-based recognizers, as well as a distillation-based recognizer. Each recognizer is designed for specific types of recognition tasks in video analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/__init__.py\":0-22", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseRecognizer\nfrom .recognizer1d import Recognizer1D, RecognizerAction\nfrom .recognizer2d import Recognizer2D\nfrom .recognizer3d import Recognizer3D\nfrom .recognizer_transformer import RecognizerTransformer\nfrom .recognizer_gcn import RecognizerGCN\nfrom .recognizerMRI import RecognizerMRI\nfrom .recognizer3dMRI import Recognizer3DMRI\nfrom .recognizer_transformer_MRI import RecognizerTransformer_MRI\nfrom .recognizer_movinet_frame import MoViNetRecognizerFrame\nfrom .recognizerDistillation import RecognizerDistillation" + }, + { + "comment": "This code snippet in the PaddleVideo library defines various recognizer models including BaseRecognizer, Recognizer1D, Recognizer2D, and more. 
These classes are used for video recognition tasks like action recognition and motion estimation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/__init__.py\":24-29", + "content": "__all__ = [\n 'BaseRecognizer', 'Recognizer1D', 'Recognizer2D', 'Recognizer3D',\n 'RecognizerTransformer', 'RecognizerGCN', 'RecognizerMRI',\n 'Recognizer3DMRI', 'RecognizerTransformer_MRI', 'MoViNetRecognizerFrame',\n 'RecognizerAction', 'RecognizerDistillation'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/86654146-c598-48ef-8353-87802015719f.json b/docs/doc/86654146-c598-48ef-8353-87802015719f.json new file mode 100644 index 000000000..3d196544f --- /dev/null +++ b/docs/doc/86654146-c598-48ef-8353-87802015719f.json @@ -0,0 +1,30 @@ +{ + "summary": "The code initializes an ASPP network using Paddle, defines its architecture, sets parameters for BatchNorm layers and global average pooling operation, creates the ASPP-MANET backbone model class, initializes layers, and applies weight initialization.", + "details": [ + { + "comment": "Imports Paddle, nn, and functional modules for creating a module that implements the ASPP layer with Conv2D, BatchNorm, and ReLU layers. Initializes weights using Kaiming normal distribution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py\":0-31", + "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation,\n BatchNorm):\n super(_ASPPModule, self).__init__()\n self.atrous_conv = nn.Conv2D(inplanes,\n planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation,\n bias_attr=False)\n self.bn = BatchNorm(planes)\n self.relu = nn.ReLU(True)\n self._init_weight()\n def forward(self, x):\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)" + }, + { + "comment": "This code initializes an ASPP (Atrous Spatial Pyramid Pooling) network. The network architecture is defined based on the selected backbone and output stride. The BatchNorm layer weights are set to 1, and its biases are set to zero using functions from the manet_utils module. 
Dilation rates for each ASPPModule are determined based on the chosen output stride.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py\":32-61", + "content": " elif isinstance(m, nn.BatchNorm2D):\n from EIVideo.paddlevideo.utils.manet_utils import fill_\n fill_(m.weight, 1)\n from EIVideo.paddlevideo.utils.manet_utils import zero_\n zero_(m.bias)\nclass ASPP(nn.Layer):\n def __init__(self, backbone, output_stride, BatchNorm):\n super(ASPP, self).__init__()\n if backbone == 'drn':\n inplanes = 512\n elif backbone == 'mobilenet':\n inplanes = 320\n else:\n inplanes = 2048\n if output_stride == 16:\n dilations = [1, 6, 12, 18]\n elif output_stride == 8:\n dilations = [1, 12, 24, 36]\n else:\n raise NotImplementedError\n self.aspp1 = _ASPPModule(inplanes,\n 256,\n 1,\n padding=0,\n dilation=dilations[0],\n BatchNorm=BatchNorm)\n self.aspp2 = _ASPPModule(inplanes," + }, + { + "comment": "This code defines an ASPP module with three branches, each having different dilation rates. It also includes a global average pooling operation and subsequent convolution layers to extract features from the input planes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py\":62-84", + "content": " 256,\n 3,\n padding=dilations[1],\n dilation=dilations[1],\n BatchNorm=BatchNorm)\n self.aspp3 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[2],\n dilation=dilations[2],\n BatchNorm=BatchNorm)\n self.aspp4 = _ASPPModule(inplanes,\n 256,\n 3,\n padding=dilations[3],\n dilation=dilations[3],\n BatchNorm=BatchNorm)\n self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False),\n BatchNorm(256), nn.ReLU())\n self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False)" + }, + { + "comment": "This code defines a class for ASPP-MANET backbone model in the ASPP-MANET network architecture. It initializes batch normalization, ReLU activation and dropout layers, with weight initialization function defined separately. The forward pass applies aspp1 to x and other similar operations on x, then concatenates them along axis=1. Conv2D layer is applied on the concatenated input, followed by BatchNorm2D and ReLU activation functions. Finally, it returns the output and also drops out some values using dropout. The weight initialization follows Kaiming Normal distribution for convolutional layers and uses fill function for batch normalization layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py\":85-116", + "content": " self.bn1 = BatchNorm(256)\n self.relu = nn.ReLU(True)\n self.dropout = nn.Dropout(0.1)\n self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x1, x2, x3, x4, x5), axis=1)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return x\n return self.dropout(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n # n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n # m.weight.normal_(0, math.sqrt(2. 
/ n))\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from EIVideo.paddlevideo.utils.manet_utils import fill_" + }, + { + "comment": "This code defines a function called build_aspp that returns an instance of the ASPP class. The function initializes the model's weights to 1 and sets the bias to zero using the zero_ function from the manet_utils module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py\":117-123", + "content": " fill_(m.weight, 1)\n from EIVideo.paddlevideo.utils.manet_utils import zero_\n zero_(m.bias)\ndef build_aspp(backbone, output_stride, BatchNorm):\n return ASPP(backbone, output_stride, BatchNorm)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8672a5da-5798-4c08-8e08-f15f4978ddcc.json b/docs/doc/8672a5da-5798-4c08-8e08-f15f4978ddcc.json new file mode 100644 index 000000000..969cfbeb2 --- /dev/null +++ b/docs/doc/8672a5da-5798-4c08-8e08-f15f4978ddcc.json @@ -0,0 +1,25 @@ +{ + "summary": "The SkeletonMetric class in PaddleVideo measures skeleton-based model performance metrics, supports batch size 1 and single card testing, and calculates top1 and top5 accuracy for batches with labels. It logs processing info, updates progress, and accumulates metrics while saving results to 'submission.csv'.", + "details": [ + { + "comment": "This code is for the SkeletonMetric class in PaddleVideo, a machine learning framework. The class measures performance metrics for skeleton-based models. It supports batch size 1 and single card testing. Results can be saved to a file named 'submission.csv'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/skeleton_metric.py\":0-37", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle\nimport csv\nimport paddle.nn.functional as F\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass SkeletonMetric(BaseMetric):\n \"\"\"\n Test for Skeleton based model.\n note: only support batch size = 1, single card test.\n Args:\n out_file: str, file to save test results.\n \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n out_file='submission.csv'," + }, + { + "comment": "This code initializes a metrics class for tracking accuracy metrics during training. The `__init__` function sets up the top1, top5, and values lists to store metric results, as well as an output file path and the desired top k value. The `update` method processes data from each batch iteration, updating the metrics based on whether the input data contains labels or not. 
It also handles distributed training by averaging across multiple workers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/skeleton_metric.py\":38-64", + "content": " log_interval=1,\n top_k=5):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.top1 = []\n self.top5 = []\n self.values = []\n self.out_file = out_file\n self.k = top_k\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n if data[0].shape[0] != outputs.shape[0]:\n num_segs = data[0].shape[1]\n batch_size = outputs.shape[0]\n outputs = outputs.reshape(\n [batch_size // num_segs, num_segs, outputs.shape[-1]])\n outputs = outputs.mean(axis=1)\n if len(data) == 2: # data with label\n labels = data[1]\n top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=outputs, label=labels, k=self.k)\n if self.world_size > 1:\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size" + }, + { + "comment": "This code segment is part of a class that handles metrics for a testing process. It calculates top1 and top5 accuracy for batches with labels and stores them. For batches without labels, it performs softmax on outputs and gets the class with highest probability. It logs processing information and updates progress. Finally, it accumulates metrics when all iterations are done.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/skeleton_metric.py\":65-87", + "content": " top5 = paddle.distributed.all_reduce(\n top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.top1.append(top1.numpy())\n self.top5.append(top5.numpy())\n else: # data without label, only support batch_size=1. 
Used for fsd-10.\n prob = F.softmax(outputs)\n clas = paddle.argmax(prob, axis=1).numpy()[0]\n self.values.append((batch_id, clas))\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n if self.top1: # data with label\n logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {}'.format(\n np.mean(np.array(self.top1)), np.mean(np.array(self.top5))))\n else:\n headers = ['sample_index', 'predict_category']" + }, + { + "comment": "Writes headers and values from self.values to file, saves results in out_file and logs success.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/skeleton_metric.py\":88-95", + "content": " with open(\n self.out_file,\n 'w',\n ) as fp:\n writer = csv.writer(fp)\n writer.writerow(headers)\n writer.writerows(self.values)\n logger.info(\"Results saved in {} !\".format(self.out_file))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/87fe8510-d6e8-44ea-bc41-702869bc8a33.json b/docs/doc/87fe8510-d6e8-44ea-bc41-702869bc8a33.json new file mode 100644 index 000000000..60a8841b6 --- /dev/null +++ b/docs/doc/87fe8510-d6e8-44ea-bc41-702869bc8a33.json @@ -0,0 +1,35 @@ +{ + "summary": "This code extracts audio features, converts data to [-1.0, +1.0] range, applies log mel spectrogram, frames into examples for further processing, reads pcm data as bytes, and prints the shape of resulting examples batch.", + "details": [ + { + "comment": "This code defines functions for audio feature extraction, including framing the audio data, applying a window function, and computing the short-time Fourier transform (STFT) magnitude. The _MEL_BREAK_FREQUENCY_HERTZ and _MEL_HIGH_FREQUENCY_Q variables are used for converting frequency values to Mel scale.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py\":0-40", + "content": "\"\"\"\naudio feature extract\n\"\"\"\n# coding: utf-8\nimport os\nimport numpy as np\nimport pickle\nimport mfcc.vgg_params as vgg_params\ndef frame(data, window_length, hop_length):\n \"\"\"\n frame\n \"\"\"\n num_samples = data.shape[0]\n num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))\n shape = (num_frames, window_length) + data.shape[1:]\n strides = (data.strides[0] * hop_length, ) + data.strides\n return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)\ndef periodic_hann(window_length):\n \"\"\"\n periodic_hann\n \"\"\"\n return 0.5 - (0.5 *\n np.cos(2 * np.pi / window_length * np.arange(window_length)))\ndef stft_magnitude(signal, fft_length, hop_length=None, window_length=None):\n \"\"\"\n stft_magnitude\n \"\"\"\n frames = frame(signal, window_length, hop_length)\n window = periodic_hann(window_length)\n windowed_frames = frames * window\n return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))\n_MEL_BREAK_FREQUENCY_HERTZ = 700.0\n_MEL_HIGH_FREQUENCY_Q = 1127.0" + }, + { + "comment": "This code defines two functions: 'hertz_to_mel' and 'spectrogram_to_mel_matrix'. The 'hertz_to_mel' function converts frequencies in hertz to the Mel scale. The 'spectrogram_to_mel_matrix' function creates a mel spectrogram matrix from a given number of mel bins, spectrogram bins, audio sample rate, and frequency limits. 
It first calculates the spectrogram bins frequencies and then converts them to the Mel scale for each bin edge.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py\":43-67", + "content": "def hertz_to_mel(frequencies_hertz):\n \"\"\"\n hertz_to_mel\n \"\"\"\n return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz /\n _MEL_BREAK_FREQUENCY_HERTZ))\ndef spectrogram_to_mel_matrix(num_mel_bins=20,\n num_spectrogram_bins=129,\n audio_sample_rate=8000,\n lower_edge_hertz=125.0,\n upper_edge_hertz=3800.0):\n \"\"\"\n spectrogram_to_mel_matrix\n \"\"\"\n nyquist_hertz = audio_sample_rate / 2.\n if lower_edge_hertz >= upper_edge_hertz:\n raise ValueError(\"lower_edge_hertz %.1f >= upper_edge_hertz %.1f\" %\n (lower_edge_hertz, upper_edge_hertz))\n spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz,\n num_spectrogram_bins)\n spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)\n band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),\n hertz_to_mel(upper_edge_hertz)," + }, + { + "comment": "The code defines a function to calculate the mel-frequency cepstral coefficients (MFCC) for audio data. It initializes an empty matrix for storing the MFCCs, and then iterates through each frequency band. For each band, it calculates the lower and upper slopes of the triangular filter used in the MFCC calculation. The code ensures that the calculated values do not go below zero or exceed the maximum value. Finally, it sets the first row of the matrix to zeros before returning the resulting mel-frequency cepstral coefficients.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py\":68-92", + "content": " num_mel_bins + 2)\n mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))\n for i in range(num_mel_bins):\n lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]\n lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /\n (center_mel - lower_edge_mel))\n upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /\n (upper_edge_mel - center_mel))\n mel_weights_matrix[:,\n i] = np.maximum(0.0,\n np.minimum(lower_slope, upper_slope))\n mel_weights_matrix[0, :] = 0.0\n return mel_weights_matrix\ndef log_mel_spectrogram(data,\n audio_sample_rate=8000,\n log_offset=0.0,\n window_length_secs=0.025,\n hop_length_secs=0.010,\n **kwargs):\n \"\"\"\n log_mel_spectrogram\n \"\"\"\n window_length_samples = int(round(audio_sample_rate * window_length_secs))" + }, + { + "comment": "Code extracts audio features like STFT spectrogram, converts to Mel scale, and returns log of Mel spectrogram after padding zeroes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py\":93-115", + "content": " hop_length_samples = int(round(audio_sample_rate * hop_length_secs))\n fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0)))\n spectrogram = stft_magnitude(data,\n fft_length=fft_length,\n hop_length=hop_length_samples,\n window_length=window_length_samples)\n mel_spectrogram = np.dot(\n spectrogram,\n spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1],\n audio_sample_rate=audio_sample_rate,\n **kwargs))\n return np.log(mel_spectrogram + log_offset)\ndef wav_to_example(wav_data, sample_rate):\n \"\"\"\n wav_to_example\n \"\"\"\n assert wav_data.dtype == np.int16, 'Bad sample type: %r' % 
wav_data.dtype\n pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS -\n vgg_params.STFT_HOP_LENGTH_SECONDS))\n wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num)))" + }, + { + "comment": "This code extracts and preprocesses audio features from a wav file. It converts the wav data to [-1.0, +1.0] range, applies log mel spectrogram, frames into examples with specific window lengths and hop lengths for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py\":116-138", + "content": " wav_data = wav_data_extend\n wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0]\n if len(wav_data.shape) > 1:\n wav_data = np.mean(wav_data, axis=1)\n log_mel = log_mel_spectrogram(\n wav_data,\n audio_sample_rate=vgg_params.SAMPLE_RATE,\n log_offset=vgg_params.LOG_OFFSET,\n window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS,\n hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS,\n num_mel_bins=vgg_params.NUM_MEL_BINS,\n lower_edge_hertz=vgg_params.MEL_MIN_HZ,\n upper_edge_hertz=vgg_params.MEL_MAX_HZ)\n # Frame features into examples.\n features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS\n example_window_length = int(\n round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))\n example_hop_length = int(\n round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate))\n log_mel_examples = frame(log_mel,\n window_length=example_window_length,\n hop_length=example_hop_length)" + }, + { + "comment": "The code extracts audio features from a wav file using pcm data. It reads the pcm data as bytes, converts it to np.int16 array, applies the wav_to_example function with sample rate 16000 to convert audio data into examples batch, and prints the shape of the resulting examples_batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py\":139-157", + "content": " return log_mel_examples\ndef extract_pcm(pcm_file, sample_rate):\n with open(pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = wav_to_example(audio_data, sample_rate)\n return examples\nif __name__ == \"__main__\":\n wav_file = sys.argv[1]\n print(\"wav_file = \", wav_file)\n with open(wav_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype = np.int16)\n examples_batch = wav_to_example(audio_data, 16000)\n print(\"examples_batch.shape\", examples_batch.shape) " + } + ] +} \ No newline at end of file diff --git a/docs/doc/8818dbc4-c599-4efc-b86f-41c14b428207.json b/docs/doc/8818dbc4-c599-4efc-b86f-41c14b428207.json new file mode 100644 index 000000000..bf5b0ffe8 --- /dev/null +++ b/docs/doc/8818dbc4-c599-4efc-b86f-41c14b428207.json @@ -0,0 +1,50 @@ +{ + "summary": "The code evaluates precision, recall, and F1 scores for a model's predictions using IoU thresholds and label ranges. It iterates through score thresholds, selects the best F1 score, and saves the results.", + "details": [ + { + "comment": "This code imports necessary libraries, defines global variables fps and mode, loads a JSON file containing indexed labels for 8 categories, and initializes a gts_data dictionary with the frame rate (fps) and an empty dictionary to store ground truth data. 
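Before moving on to the eval.py documentation below, here is a condensed, hedged restatement of the mel filterbank construction that the feature_extractor.py summaries above describe; the constants mirror the documented defaults and the random "spectrogram" at the end is purely illustrative:

```python
import numpy as np

_MEL_BREAK_FREQUENCY_HERTZ = 700.0
_MEL_HIGH_FREQUENCY_Q = 1127.0

def hertz_to_mel(frequencies_hertz):
    return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)

def mel_weights(num_mel_bins=20, num_spectrogram_bins=129, sample_rate=8000,
                lower_edge_hertz=125.0, upper_edge_hertz=3800.0):
    """Triangular mel filterbank, condensed from the spectrogram_to_mel_matrix logic above."""
    spec_mel = hertz_to_mel(np.linspace(0.0, sample_rate / 2.0, num_spectrogram_bins))
    edges = np.linspace(hertz_to_mel(lower_edge_hertz),
                        hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
    weights = np.empty((num_spectrogram_bins, num_mel_bins))
    for i in range(num_mel_bins):
        lower, center, upper = edges[i], edges[i + 1], edges[i + 2]
        weights[:, i] = np.maximum(0.0, np.minimum((spec_mel - lower) / (center - lower),
                                                   (upper - spec_mel) / (upper - center)))
    weights[0, :] = 0.0  # zero out the DC bin, as the original does
    return weights

fake_spectrogram = np.abs(np.random.randn(100, 129))       # 100 frames x 129 STFT bins
log_mel = np.log(fake_spectrogram @ mel_weights() + 0.01)  # small log offset avoids log(0)
print(log_mel.shape)                                       # (100, 20)
```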
It also iterates over eval_datasets, label_files, and individual gt data to update fps, populate gts_data, and assign mode for each ground truth item.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":0-35", + "content": "\"\"\"\nget instance for lstm\n\u6839\u636egts\u8ba1\u7b97\u6bcf\u4e2aproposal_bmn\u7684iou\u3001ioa\u3001label\u7b49\u4fe1\u606f\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport io\nsys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding = 'utf-8')\ndataset = \"/home/work/datasets\"\nlabel_index_file = './configs/index_label_football_8.json'\neval_datasets = ['EuroCup2016']\nlabel_files = {'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'}\nglobal fps, mode\nlabel_index = json.load(open(label_index_file, 'rb'))\ndef load_gts():\n global fps\n gts_data = {'fps': 0, 'gts': {}}\n for eval_data in eval_datasets:\n for item, value in label_files.items():\n label_file = '{}/{}/{}'.format(dataset, eval_data, value)\n gts = json.load(open(label_file, 'rb'))\n gts_data['fps'] = gts['fps']\n fps = gts['fps']\n for gt in gts['gts']:\n gt['mode'] = item\n basename = '{}/{}/mp4/{}'.format(dataset, eval_data, os.path.basename(gt['url']))" + }, + { + "comment": "The code defines functions for evaluating ground truth (GT) labels and computing Intersection over Union (IoU). It also includes a function to convert proposals with score threshold filtering. The IoU function calculates the area of intersection, the area of union, and returns the IoU value. The 'computeIoU' function can be used for both regular and proposal modes. The 'convert_proposal' function sorts boxes based on scores and selects those above a given threshold to generate new proposals. It assigns each proposal an ID and calculates their respective start and end times.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":36-66", + "content": " gts_data['gts'][basename] = gt\n return gts_data['gts']\ndef computeIoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']):\n return 0.\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 - inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n if not mode == 'proposal':\n iou = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou\ndef convert_proposal(boxes, basename, score_threshold=0.01):\n boxes = sorted(boxes, key=lambda x:float(x['score']), reverse=True)\n res = []\n for box in boxes:\n if not float(box['score']) >= score_threshold:\n continue\n res.append({'basename': basename,\n 'start': int(float(box['start']) / fps),\n 'end': int(float(box['end']) / fps),\n 'label': 0})" + }, + { + "comment": "This code is defining a function called convert_classify that takes in boxes, basename, iou_threshold and score_threshold as parameters. The function sorts the boxes based on their classify_score and iou_score in descending order. If both iou_score and classify_score meet the threshold values, it appends the box details to a list named res. It returns this list of results. \n\nThe code also defines another function called convert_groundtruth that takes in boxes, basename and phase as parameters. 
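A minimal sketch of the temporal IoU that `computeIoU` above is built around, ignoring the label/basename gating and the IoA variant used outside proposal mode; the dict layout is an assumption for illustration:

```python
def temporal_iou(seg_a, seg_b):
    """IoU of two 1-D time segments, the core of computeIoU above (label checks omitted)."""
    inter = max(0.0, min(seg_a["end"], seg_b["end"]) - max(seg_a["start"], seg_b["start"]))
    union = (seg_a["end"] - seg_a["start"]) + (seg_b["end"] - seg_b["start"]) - inter
    return 0.0 if union == 0 else inter / union

print(temporal_iou({"start": 2.0, "end": 6.0}, {"start": 4.0, "end": 8.0}))  # 0.333...
```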
This function iterates through each box and its corresponding label IDs. If the phase is 'proposal', it assigns a value of 0 to the label variable; otherwise, it assigns the item from box['label_ids']. It appends the result to a list named res.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":67-92", + "content": " return res\ndef convert_classify(boxes, basename, iou_threshold, score_threshold):\n boxes = sorted(boxes, key=lambda x:(float(x['classify_score']), float(x['iou_score'])), reverse=True)\n def convert_time_to_frame(time_type):\n return int(time_type)\n h, m, s = time_type.split(':')\n return int(h) * 3600 + int(m) * 60 + int(s)\n res = []\n for box in boxes:\n if not (box['iou_score'] >= iou_threshold and\n box['classify_score'] >= score_threshold):\n continue\n res.append({'basename': basename,\n 'start': convert_time_to_frame(box['start_time']),\n 'end': convert_time_to_frame(box['end_time']),\n 'label': box['label_id']})\n return res\ndef convert_groundtruth(boxes, basename, phase=None):\n res = []\n for box in boxes:\n for item in box['label_ids']:\n label = 0 if phase == 'proposal' else item\n res.append({'basename': basename,\n 'start': box['start_id']," + }, + { + "comment": "This code defines four functions: \"evaluation\", \"print_result\", \"print_head\", and \"computeIoU\". The \"evaluation\" function computes the intersection over union (IoU) between predicted boxes and ground truth boxes. It then passes these results to \"print_head\" and \"print_result\" for displaying progress and final evaluation results, respectively. The other two functions are used internally by the main \"evaluation\" function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":93-119", + "content": " 'end': box['end_id'],\n 'label': label})\n return res\ndef print_head(iou):\n print(\"\\nioa = {:.1f}\".format(iou))\n res_str = ''\n for item in ['label_name']:\n res_str += '{:<12s}'.format(item)\n for item in ['label_id', 'precision', 'recall', 'hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10s}'.format(item)\n print(res_str)\ndef print_result(res_dict, label='avg'):\n if label == 'avg':\n res_str = '{:<22s}'.format(str(label))\n else:\n res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)], str(label), chr(12288))\n for item in ['prec', 'recall']:\n res_str += '{:<10.4f}'.format(res_dict[item])\n for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10d}'.format(res_dict[item])\n print(res_str)\ndef evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = False):\n iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \\\n for gtsId in gts_boxes]" + }, + { + "comment": "This code calculates the precision, recall, and F1 score for a set of predicted boxes and ground truth boxes. It iterates over a range of intersection over union (IOU) thresholds and label ranges to produce average results. 
The results are stored in dictionaries for each IOU threshold and label range combination.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":120-143", + "content": " iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes)))\n hit_map_prop_total = np.max(iou_map, axis=1)\n hit_map_index_total = np.argmax(iou_map, axis=1)\n res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']\n for iou_threshold in iou_range:\n if show_sub:\n print_head(iou_threshold)\n iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total])\n average_results = {}\n for label_id in label_range:\n sub_results = {}\n label_prop = np.array([k['label'] == label_id for k in res_boxes])\n label_gts = np.array([k['label'] == label_id for k in gts_boxes])\n sub_results['num_prop'] = sum(label_prop)\n sub_results['num_gts'] = sum(label_gts)\n if sub_results['num_prop'] == 0:\n hit_prop_index = []\n else:\n hit_prop_index = label_prop & iou_prop\n sub_results['hit_prop'] = sum(hit_prop_index)\n sub_results['hit_gts'] = len(set(hit_map_index_total[hit_prop_index]))" + }, + { + "comment": "This code calculates precision and recall values for various subtasks, averages them if applicable, and stores the results in a dictionary. It also prints the result for each subtask if show_sub is set to True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":145-160", + "content": " sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \\\n else sub_results['hit_prop'] * 1.0 / sub_results['num_prop']\n sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \\\n else sub_results['hit_gts'] * 1.0 / sub_results['num_gts']\n if show_sub:\n print_result(sub_results, label=label_id)\n for item in res_dict:\n if not item in average_results:\n average_results[item] = 0\n average_results[item] += sub_results[item]\n if len(label_range) == 1: # proposal \u4e0d\u9700\u8981\u8f93\u51faaverage\u503c\n continue\n average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \\\n else average_results['hit_prop'] * 1.0 / average_results['num_prop']\n average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \\\n else average_results['hit_gts'] * 1.0 / average_results['num_gts']" + }, + { + "comment": "This code calculates evaluation results for FootballAction model predictions. It checks if the prediction is for a specific evaluation dataset and then processes proposal phase results, extracting bounding box coordinates from predicted proposals, and appending them to res_boxes list. F1 score is calculated based on precision and recall values. 
The function returns the average evaluation results (F1, precision, recall, IoU) for each video in the predicts dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":161-188", + "content": " if show_sub:\n print_result(average_results)\n average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \\\n else 2 * average_results['prec'] * average_results['recall'] / \\\n (average_results['prec'] + average_results['recall'])\n return average_results\ndef get_eval_results(predicts, gts_data, phase, iou_threshold = 0.3, score_threshold = 0.3, show_sub = False):\n global mode\n mode = phase\n res_boxes = []\n gts_boxes = []\n for ped_data in predicts:\n basename = ped_data['video_name']\n # eval sub data\n such_eval = False\n for eval_name in eval_datasets:\n if eval_name in basename:\n such_eval = True\n break\n if not such_eval:\n continue\n gts = gts_data[basename]['actions']\n if phase == 'proposal':\n res_boxes.extend(convert_proposal(ped_data['bmn_results'], basename, score_threshold))" + }, + { + "comment": "The code handles the evaluation of football action predictions. If ground truth is given, it extends proposal boxes and sets label range and iou range accordingly; otherwise, it extends classify results, ground truth boxes, and sets label range and iou range. It then calculates evaluation results using the specified functions. The code also allows for testing different iou_threshold and score_threshold combinations to find the best ones.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":189-217", + "content": " gts_boxes.extend(convert_groundtruth(gts, basename, phase='proposal'))\n label_range = [0]\n iou_range = np.arange(0.1, 1, 0.1)\n else:\n res_boxes.extend(convert_classify(ped_data['action_results'], basename, iou_threshold, score_threshold))\n gts_boxes.extend(convert_groundtruth(gts, basename))\n label_range = range(1, len(label_index))\n iou_range = np.arange(0.5, 0.6, 0.1)\n eval_results = evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = show_sub)\n return eval_results\nif __name__ == \"__main__\":\n result_file = sys.argv[1]\n predicts = json.load(open(result_file, 'r', encoding='utf-8'))\n gts_data = load_gts()\n get_eval_results(predicts, gts_data, 'proposal', \n score_threshold = 0.03,\n show_sub = True)\n #get_eval_results(predicts, gts_data, 'actions')\n best_F1 = -0.1\n best_res = {}\n best_iou_threshold = 0.\n best_score_threshold = 0.\n for iou_threshold in np.arange(0.1, 0.9, 0.1):" + }, + { + "comment": "The code iterates through score thresholds and calculates the average results for each threshold. It selects the best F1 score, stores corresponding iou_threshold and score_threshold. 
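To make the aggregation concrete, here is a hedged toy version of the hit counting and precision/recall/F1 computation the evaluation summaries above describe; the IoU matrix values are invented for illustration:

```python
import numpy as np

def prf(hit_prop, num_prop, hit_gts, num_gts):
    """Precision / recall / F1 from hit counts, following the aggregation described above."""
    prec = 0.0 if num_prop == 0 else hit_prop / num_prop
    recall = 0.0 if num_gts == 0 else hit_gts / num_gts
    f1 = 0.0 if prec + recall == 0 else 2 * prec * recall / (prec + recall)
    return prec, recall, f1

# Toy IoU matrix: rows = predicted segments, columns = ground-truth segments.
iou_map = np.array([[0.7, 0.1],
                    [0.2, 0.6],
                    [0.0, 0.4]])
iou_threshold = 0.5
best_iou = iou_map.max(axis=1)         # best-matching GT for every prediction
best_idx = iou_map.argmax(axis=1)
hits = best_iou >= iou_threshold
hit_prop = int(hits.sum())             # predictions matched to some GT
hit_gts = len(set(best_idx[hits]))     # distinct GTs that were recovered
print(prf(hit_prop, num_prop=len(iou_map), hit_gts=hit_gts, num_gts=iou_map.shape[1]))
```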
Finally, it prints these values along with a headline and a detailed result, then saves the best results by running the get_eval_results function again.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/eval.py\":218-236", + "content": " for score_threshold in np.arange(0.1, 1, 0.1):\n avg_res = get_eval_results(predicts, gts_data, 'actions', \n iou_threshold = iou_threshold,\n score_threshold = score_threshold,\n show_sub = False)\n if best_F1 < avg_res['F1']:\n best_F1 = avg_res['F1']\n best_res = avg_res\n best_iou_threshold = iou_threshold\n best_score_threshold = score_threshold\n print(\"best iou threshold = {:.1f}\".format(best_iou_threshold))\n print(\"best score threshold = {:.1f}\".format(best_score_threshold))\n print('best F1 score = {:.4f}'.format(best_F1))\n print_head(0.5)\n print_result(best_res)\n get_eval_results(predicts, gts_data, 'actions', iou_threshold = best_iou_threshold,\n score_threshold = best_score_threshold,\n show_sub = True)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/881ad374-03ef-435f-a923-dc342e4b2b00.json b/docs/doc/881ad374-03ef-435f-a923-dc342e4b2b00.json new file mode 100644 index 000000000..dd18be792 --- /dev/null +++ b/docs/doc/881ad374-03ef-435f-a923-dc342e4b2b00.json @@ -0,0 +1,30 @@ +{ + "summary": "The script downloads 12 EuroCup2016 dataset videos using 'wget' command, creating a \"mp4\" directory and accessing bj.bcebos.com server under tmt-pub/datasets/EuroCup2016 directory.", + "details": [ + { + "comment": "This script creates a new directory called \"mp4\" and then changes into it. It then uses the wget command to download 12 MP4 video files from a specified URL, one after another. The purpose of this script is likely to download all the videos in the EuroCup2016 dataset for further use or processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh\":0-12", + "content": "mkdir mp4\ncd mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/63e51df254d2402fac703b6c4fdb4ea9.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/76b5f7ee28d942988c6b224bfac136bd.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/250b88724acf40dbb6d7e8ccb400ef38.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/c9516c903de3416c97dae91a59e968d7.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/e1982c90cdd74abaacc4d0692070b400.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/1be705a8f67648da8ec4b4296fa80895.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/de23c0b2be3a4eb1990c5c657061fb29.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2754615de6e64c4fb95ce1a8095dc1c1.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/299fe30d8f3b4a45b89313fe31f9f3c0.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/22e89747689e4f7e83e3620620c93269.mp4" + }, + { + "comment": "This code is using wget to download multiple video files from a specified URL. 
The videos are part of the EuroCup2016 dataset, and each file has a unique identifier.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh\":13-23", + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2ceb6c549fc64305a06a75acb355642b.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/719b0a4bcb1f461eabb152298406b861.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/259856b769044b4d8dc94076deb356bf.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d0bd3eab1e794f0f9501c353a6d37827.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/19eb47cc736240d6b2dd930ab69da839.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4435b708af6d48519a6b726144147d51.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/ea16ad2a020643529e257bd6cb11b3c3.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/eeebffbd4ec74222a9c2d0775d79b689.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8cfb4e605af44055b1576c37eb0e3209.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6bca62b57cc449c6935f0b17f28d06be.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/70cfc31e520840b2afca458f93a01ce4.mp4" + }, + { + "comment": "This code is using the wget command to download multiple video files from a specific URL. The videos are part of the EuroCup2016 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh\":24-34", + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6496960935e845578e391a5916739752.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d6d25403a4bb4784aecff5f21fd00dc5.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/3e23d452a082403391f8abfb87bf2fb4.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4c5d9d9af4f044c4a68d134061dc264f.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6994844c64b44c26b935cee9604bef0a.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d6322cb95f6a4402ac80432b561abd5d.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2c8b5587083a4784a51622e4fec87ccd.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5faa60d70ed141de8560110e840f2048.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/45d08bc5cb0f424f9ed9d7874eb561cd.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6630aaf0e32146088d0b624e9288f071.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f2edbee29c1b4966b3a410260f78fbe3.mp4" + }, + { + "comment": "The code is using the 'wget' command to download multiple MP4 video files from different URLs, presumably to create or expand a local dataset of EuroCup2016 videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh\":35-45", + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f24116fdd6a54214991db32f7dddef67.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/0265731a0c6f4a9398c88db8e3d4a3bc.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/02d2de09997f4215b06e3b00ff0502a0.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/9c231896c56a43f291a5e190949f4333.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4afbbf9afcd44dfea45b044117cccb48.mp4\nwget 
https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/745db97a080d4f44b450dc17a2bcf069.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5933d0ce17854483b81a318d7d45a34e.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d2cfef2da9f84237a6950c7f6659655c.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5572686cb90f440988ded956a60e555d.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8962ac5a332346e180c79d701ae0a175.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f6e64ee9b13a4088b24c45c257894c1e.mp4" + }, + { + "comment": "This code uses the wget command to download multiple mp4 files from a specific URL related to EuroCup2016 dataset. It downloads each file one by one, indicated by the different file names. The files are being downloaded from the bj.bcebos.com server under tmt-pub/datasets/EuroCup2016 directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh\":46-50", + "content": "wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f6ed2b612b3d43baa0726be8b14ebe7c.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8ab7b0cba5744eb3b6fb10003dfda383.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/1f0a0698e38d493988fe42a50f7e8723.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/737fdb054ca141f2a45013c1740dd0a0.mp4\nwget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/bab63a9bcf204e4b99c4a887a01bfd60.mp4" + } + ] +} \ No newline at end of file diff --git a/docs/doc/881ff712-039a-47aa-a3e1-af424b8c7e02.json b/docs/doc/881ff712-039a-47aa-a3e1-af424b8c7e02.json new file mode 100644 index 000000000..41f28f043 --- /dev/null +++ b/docs/doc/881ff712-039a-47aa-a3e1-af424b8c7e02.json @@ -0,0 +1,45 @@ +{ + "summary": "This code generates ground truth data for the BMN model in table tennis applications, using the `combile_gts` function to extract action segments from root actions. It calculates video segments, appends annotations, and returns a dataset dictionary for TableTennis.", + "details": [ + { + "comment": "This code is responsible for generating ground truth data for the BMN model in a table tennis application. It takes in original gts (ground truth) data and outputs the modified gts_bmn dictionary. The code first sets the fps value from gts_data, then iterates over each sub-item in gts_data['gts']. If a sub-item has actions but its duration is less than bmn_window, it is skipped. 
Otherwise, the sub-item data gets stored in gts_bmn.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":0-47", + "content": "\"\"\"\nget instance for bmn\n\u4f7f\u7528winds=8\u7684\u6ed1\u7a97\uff0c\u5c06\u6240\u6709\u5b50\u7a97\u53e3\u7684\u957f\u5ea6\u4e4b\u548c\u5c0f\u4e8ewinds\u7684\u8fdb\u884c\u5408\u5e76\n\u5408\u5e76\u540e\uff0c\u7236\u7a97\u53e3\u4ee3\u8868bmn\u8bad\u7ec3\u6570\u636e\uff0c\u5b50\u7a97\u53e3\u4ee3\u8868tsn\u8bad\u7ec3\u6570\u636e\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport math\n# for table tennis\nbmn_window = 8\ndataset = \"/home/aistudio/work/BMN/\"\nfeat_dir = dataset + '/Features_example'\nout_dir = dataset + '/Input_for_bmn'\nlabel_files = {\n 'train': 'label_cls14_small_train.json',\n 'validation': 'label_cls14_small_test.json'\n}\nglobal fps\ndef gen_gts_for_bmn(gts_data):\n \"\"\"\n @param, gts_data, original gts for action detection\n @return, gts_bmn, output gts dict for bmn\n \"\"\"\n fps = gts_data['fps']\n gts_bmn = {'fps': fps, 'gts': []}\n for sub_item in gts_data['gts']:\n url = sub_item['url']\n max_length = sub_item['total_frames']\n gts_bmn['gts'].append({\n 'url': url,\n 'total_frames': max_length,\n 'root_actions': []\n })\n sub_actions = sub_item['actions']\n # \u8df3\u8fc7\u6ca1\u6709\u52a8\u4f5c\u7684\u7247\u6bb5\n if len(sub_actions) == 0:\n continue\n # duration > bmn_window\uff0c \u52a8\u4f5c\u6301\u7eed\u65f6\u95f4\u5927\u4e8ebmn_windows\uff0c\u76f4\u63a5\u5220\u9664" + }, + { + "comment": "This code extracts video action segments using a sliding window and stores them in the \"root_actions\" list. If the duration of an action is too long, it splits it into multiple actions and appends them to the \"gts_bmn['gts'][-1]['root_actions']\". The \"before_id\" and \"after_id\" keep track of the first and last frame of each extracted action, while the \"bmn_window\" determines the maximum duration for a single action.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":48-73", + "content": " for idx, sub_action in enumerate(sub_actions):\n if sub_action['end_id'] - sub_action['start_id'] > bmn_window:\n sub_actions.pop(idx)\n # \u3010\u6ed1\u52a8\u7a97\u53e3\uff0c\u628a\u6bcf\u4e00\u4e2a\u89c6\u9891\u91cc\u7684\u52a8\u4f5c\u7247\u6bb5\u63d0\u53d6\u51fa\u6765\u3011\n root_actions = [sub_actions[0]]\n # before_id, \u524d\u4e00\u52a8\u4f5c\u7684\u6700\u540e\u4e00\u5e27\n # after_id, \u540e\u4e00\u52a8\u4f5c\u7684\u7b2c\u4e00\u5e27\n before_id = 0\n for idx in range(1, len(sub_actions)):\n cur_action = sub_actions[idx]\n duration = (cur_action['end_id'] - root_actions[0]['start_id'])\n if duration > bmn_window: # windows\u53ea\u80fd\u5305\u4f4f\u4e00\u4e2a\u52a8\u4f5c\u5c31\u5305\uff0c\u5305\u4e0d\u4f4f\u5c31\u5305\u591a\u4e2a\n after_id = cur_action['start_id']\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n before_id = root_actions[-1]['end_id'] #\u66f4\u65b0\u6ed1\u7a97\n root_actions = [cur_action]\n else:\n root_actions.append(cur_action)" + }, + { + "comment": "The code is defining a function `combile_gts` that takes in `gts_bmn`, `gts_process`, and `mode` as parameters. It sets `fps` based on the `gts_process` data, calculates `duration_frame` and `feature_frame`. 
Then it iterates over the `gts_process['gts']` list to extract action segments from each item's root actions, appending them to the `segments` list. The function returns these segments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":74-107", + "content": " if idx == len(sub_actions) - 1:\n after_id = max_length\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n return gts_bmn\ndef combile_gts(gts_bmn, gts_process, mode):\n \"\"\"\n 1\u3001bmn_window \u8303\u56f4\u5185\u53ea\u6709\u4e00\u4e2a\u52a8\u4f5c\uff0c\u53ea\u53d6\u4e00\u4e2a\u76ee\u6807\u6846\n 2\u3001bmn_window \u8303\u56f4\u5185\u6709\u591a\u4e2a\u52a8\u4f5c\uff0c\u53d6\u4e09\u4e2a\u76ee\u6807\u6846(\u7b2c\u4e00\u4e2a\u52a8\u4f5c\u3001\u6700\u540e\u4e00\u4e2a\u52a8\u4f5c\u3001\u6240\u6709\u52a8\u4f5c)\n \"\"\"\n global fps\n fps = gts_process['fps']\n duration_second = bmn_window * 1.0\n duration_frame = bmn_window * fps\n feature_frame = duration_frame\n for item in gts_process['gts']:\n url = item['url']\n basename = os.path.basename(url).split('.')[0]\n root_actions = item['root_actions']\n # \u628a\u6bcf\u4e00\u4e2a\u89c6\u9891\u91cc\u7684\u52a8\u4f5c\u7247\u6bb5\u63d0\u53d6\u51fa\u6765\n for root_action in root_actions:\n segments = []\n # all actions\n segments.append({\n 'actions': root_action['actions']," + }, + { + "comment": "This code processes a list of actions and splits them into segments based on the number of elements. It adds extra segments for the first and last actions if there are more than one. Then, it processes each segment to fit a window size for compatibility with BMN input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":108-133", + "content": " 'before_id': root_action['before_id'],\n 'after_id': root_action['after_id']\n })\n if len(root_action['actions']) > 1: #\u5982\u679c\u6709\u591a\u4e2a\u52a8\u4f5c\uff0c\u5219\u7b2c\u4e00\u4e2a\u52a8\u4f5c\u548c\u6700\u540e\u4e00\u4e2a\u52a8\u4f5c\uff0c\u989d\u5916\u6dfb\u52a0\u4e00\u6b21\n # first action\n segments.append({\n 'actions': [root_action['actions'][0]],\n 'before_id':\n root_action['before_id'],\n 'after_id':\n root_action['actions'][1]['start_id']\n })\n # last action\n segments.append({\n 'actions': [root_action['actions'][-1]],\n 'before_id':\n root_action['actions'][-2]['end_id'],\n 'after_id':\n root_action['after_id']\n })\n # \u628a\u52a8\u4f5c\u7247\u6bb5\u5904\u7406\u6210window size\u5927\u5c0f\uff0c\u4ee5\u9002\u914dBMN\u8f93\u5165\n for segment in segments:\n before_id = segment['before_id']\n after_id = segment['after_id']\n actions = segment['actions']" + }, + { + "comment": "This code snippet is determining the start and end points for a segment of video data based on action IDs. It ensures that the segment contains the entire sequence of actions, with some randomness in selecting the starting point within the specified range. 
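The window-start selection described above (pick a start so that every action still fits inside one bmn_window) can be sketched as follows; the action dicts and numbers are illustrative, not taken from the dataset:

```python
import math
import random

def pick_window_start(actions, before_id, after_id, window=8):
    """Pick a window start so all actions fit inside [start, start + window], as described above.

    `actions` is a list of dicts with 'start_id'/'end_id' (an assumed layout).
    Returns None when no valid start exists.
    """
    box0 = max(actions[-1]["end_id"] - window, before_id)   # earliest usable start
    box1 = min(actions[0]["start_id"], after_id - window)   # latest usable start
    if box0 > box1:
        return None
    if int(box0) == int(box1):
        return box0
    return random.randint(math.ceil(box0), int(box1))

print(pick_window_start([{"start_id": 10, "end_id": 12}, {"start_id": 13, "end_id": 15}],
                        before_id=5, after_id=30))          # some start in [7, 10]
```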
The selected segment will be used to create an instance of the TableTennis application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":134-153", + "content": " # before_id\u5230after_id\u592a\u957f\u4e86\uff0c\u4ece\u91cc\u9762\u53d6window_size\u5e27\uff0c\u8981\u5148\u786e\u5b9a\u4e00\u4e2a\u8d77\u59cb\u70b9\uff0c\u7136\u540e\u52a8\u4f5c\u90fd\u8981\u5305\u4f4f\n box0 = max(actions[-1]['end_id'] - bmn_window,\n before_id) #\u786e\u5b9a\u8d77\u59cb\u70b9\n box1 = min(actions[0]['start_id'],\n after_id - bmn_window) #\u786e\u5b9e\u8d77\u59cb\u70b9\n if box0 <= box1: # \u4e00\u6b21\u68c0\u67e5\n if int(box0) - int(box1) == 0:\n cur_start = box0\n else:\n box0 = math.ceil(box0)\n box1 = int(box1)\n cur_start = random.randint(box0, box1)\n cur_end = cur_start + bmn_window\n cur_start = round(cur_start, 2)\n cur_end = round(cur_end, 2)\n name = '{}_{}_{}'.format(basename, cur_start, cur_end)\n annotations = []\n for action in actions:\n label = str(1.0 * action['label_ids'][0])\n label_name = action['label_names'][0]" + }, + { + "comment": "The code segment defines a function that calculates segments of video data based on start and end IDs. It then appends the calculated segments, along with their corresponding labels and label names, to an 'annotations' list. The function returns a dictionary containing information about the duration, frame rate, feature frames, subset type, and annotations for a given dataset or model (in this case, named 'bmn'). Additionally, the code defines another function that saves the calculated features to a specified folder if it doesn't exist, and handles any missing files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":154-181", + "content": " seg0 = 1.0 * round((action['start_id'] - cur_start),\n 2) #\u5b58\u50a8\u7684\u662f\u5230\u5f00\u59cb\u4f4d\u7f6e(\u65f6\u95f4: s)\u7684\u8ddd\u79bb\n seg1 = 1.0 * round((action['end_id'] - cur_start), 2)\n annotations.append({\n 'segment': [seg0, seg1],\n 'label': label,\n 'label_name': label_name\n })\n gts_bmn[name] = {\n 'duration_second': duration_second,\n 'duration_frame': duration_frame,\n 'feature_frame': feature_frame,\n 'subset': mode,\n 'annotations': annotations\n }\n return gts_bmn\ndef save_feature_to_numpy(gts_bmn, folder):\n global fps\n print('save feature for bmn ...')\n if not os.path.exists(folder):\n os.mkdir(folder)\n process_gts_bmn = {}\n miss = 0\n for item, value in gts_bmn.items():\n # split to rsplit \u9488\u5bf9\u6587\u4ef6\u547d\u540d\u4fee\u6539" + }, + { + "comment": "The code is parsing video file names and extracting features from them. 
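A small sketch of the feature-cutting step that `save_feature_to_numpy` performs (described in this and the following chunk): slice a per-frame feature array by start/end seconds times fps. Array sizes and the fps value are assumptions:

```python
import numpy as np

def cut_feature(feature_video, start_sec, end_sec, fps):
    """Slice a per-frame feature array to [start_sec, end_sec), as the code above does."""
    start_frame = round(start_sec * fps)
    end_frame = round(end_sec * fps)
    if end_frame > len(feature_video):
        return None   # the original counts this as a broken ("miss") sample
    return np.array(feature_video[start_frame:end_frame], dtype=np.float32)

features = np.random.rand(100, 2048)              # 100 frames of 2048-d features (illustrative)
cut = cut_feature(features, start_sec=2.0, end_sec=10.0, fps=5)
print(cut.shape)                                   # (40, 2048)
```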
It then stores these features in a dictionary with corresponding start and end timestamps, and checks if any segments exceed the video length before saving the feature cut.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":182-206", + "content": " basename, start_id, end_id = item.rsplit('_', 2)\n if not basename in process_gts_bmn:\n process_gts_bmn[basename] = []\n process_gts_bmn[basename].append({\n 'name': item,\n 'start': float(start_id),\n 'end': float(end_id)\n })\n for item, values in process_gts_bmn.items():\n feat_path = os.path.join(feat_dir, item + '.pkl')\n feature_video = pickle.load(open(feat_path, 'rb'))['image_feature']\n for value in values:\n save_cut_name = os.path.join(folder, value['name'])\n a, b, c = save_cut_name.rsplit('_', 2)\n if float(b) > 360:\n print(b)\n start_frame = round(value['start'] * fps)\n end_frame = round(value['end'] * fps)\n if end_frame > len(feature_video):\n miss += 1\n continue\n feature_cut = [\n feature_video[i] for i in range(start_frame, end_frame)\n ]\n np_feature_cut = np.array(feature_cut, dtype=np.float32)" + }, + { + "comment": "The code is saving processed data for a table tennis dataset. It creates a dictionary 'gts_bmn' from json files, processes it using 'gen_gts_for_bmn', combines it with existing data in 'gts_bmn', and then saves it as 'label.json' and 'feature'. It also handles creating the output directory if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/get_instance_for_bmn.py\":207-226", + "content": " np.save(save_cut_name, np_feature_cut)\n print('miss number (broken sample):', miss)\nif __name__ == \"__main__\":\n if not os.path.exists(out_dir):\n os.mkdir(out_dir)\n gts_bmn = {}\n for item, value in label_files.items():\n label_file = os.path.join(dataset, value)\n gts_data = json.load(open(label_file, 'rb'))\n gts_process = gen_gts_for_bmn(gts_data)\n gts_bmn = combile_gts(gts_bmn, gts_process, item)\n with open(out_dir + '/label.json', 'w', encoding='utf-8') as f:\n data = json.dumps(gts_bmn, indent=4, ensure_ascii=False)\n f.write(data)\n save_feature_to_numpy(gts_bmn, out_dir + '/feature')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/884319ff-6b34-4e5a-9f4c-af09557da1e5.json b/docs/doc/884319ff-6b34-4e5a-9f4c-af09557da1e5.json new file mode 100644 index 000000000..85e58c811 --- /dev/null +++ b/docs/doc/884319ff-6b34-4e5a-9f4c-af09557da1e5.json @@ -0,0 +1,10 @@ +{ + "summary": "Copyright notice and license information for the module. Imports IntVOS from the same directory and adds it to __all__.", + "details": [ + { + "comment": "Copyright notice and license information for the module. Imports IntVOS from the same directory and adds it to __all__.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py\":0-16", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .IntVOS import IntVOS\n__all__ = ['IntVOS'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/88c1bf3f-ff23-4b9a-a606-14fabeddedb5.json b/docs/doc/88c1bf3f-ff23-4b9a-a606-14fabeddedb5.json new file mode 100644 index 000000000..3987c1e84 --- /dev/null +++ b/docs/doc/88c1bf3f-ff23-4b9a-a606-14fabeddedb5.json @@ -0,0 +1,30 @@ +{ + "summary": "This code imports libraries, defines functions for PaddleVideo model training, and allows users to specify command-line arguments. It uses the Apache License 2.0 and enables parallel execution with distributed environments. The method to be executed is determined by the command line arguments.", + "details": [ + { + "comment": "This code imports necessary libraries and defines functions for training a PaddleVideo model. It also handles command line arguments using argparse. The script is licensed under the Apache License, Version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/main.py\":0-28", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport random\nimport numpy as np\nimport paddle\nfrom paddlevideo.tasks import (test_model, train_dali, train_model,\n train_model_multigrid)\nfrom paddlevideo.utils import get_config, get_dist_info\ndef parse_args():\n parser = argparse.ArgumentParser(\"PaddleVideo train script\")\n parser.add_argument('-c',\n '--config',\n type=str," + }, + { + "comment": "This code segment is defining command line arguments using the 'argparse' library for a PaddleVideo program. 
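As a schematic illustration of the argparse pattern summarised here (only a few of main.py's flags are reproduced, and the script name is hypothetical):

```python
import argparse

def parse_args():
    parser = argparse.ArgumentParser("toy train script")
    parser.add_argument("-c", "--config", type=str, default="configs/example.yaml",
                        help="config file path")
    parser.add_argument("--validate", action="store_true",
                        help="evaluate checkpoints during training")
    parser.add_argument("--seed", type=int, default=1234,
                        help="fix all random seeds")
    return parser.parse_args()

if __name__ == "__main__":
    print(parse_args())
```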
It allows users to set configuration file paths, override config options, test models, enable DALI for training, use multigrid training, specify weights for finetuning or testing, and utilize fleet run distributed training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/main.py\":29-51", + "content": " default='configs/example.yaml',\n help='config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('--test',\n action='store_true',\n help='whether to test a model')\n parser.add_argument('--train_dali',\n action='store_true',\n help='whether to use dali to speed up training')\n parser.add_argument('--multigrid',\n action='store_true',\n help='whether to use multigrid training')\n parser.add_argument('-w',\n '--weights',\n type=str,\n help='weights for finetuning or testing')\n parser.add_argument('--fleet',\n action='store_true',\n help='whether to use fleet run distributed training')" + }, + { + "comment": "The code adds command-line arguments for AMP (automatic mixed precision) training, validation, random seed, maximum iterations, and profiler options. It then parses these arguments to customize the program's behavior during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/main.py\":52-83", + "content": " parser.add_argument('--amp',\n action='store_true',\n help='whether to open amp training.')\n parser.add_argument(\n '--amp_level',\n type=str,\n default=None,\n help=\"optimize level when open amp training, can only be 'O1' or 'O2'.\")\n parser.add_argument(\n '--validate',\n action='store_true',\n help='whether to evaluate the checkpoint during training')\n parser.add_argument(\n '--seed',\n type=int,\n default=1234,\n help='fixed all random seeds when the program is running')\n parser.add_argument(\n '--max_iters',\n type=int,\n default=None,\n help='max iterations when training(this arg only used in test_tipc)')\n parser.add_argument(\n '-p',\n '--profiler_options',\n type=str,\n default=None,\n help='The option of profiler, which should be in format '\n '\\\"key1=value1;key2=value2;key3=value3\\\".')\n args = parser.parse_args()\n return args" + }, + { + "comment": "This code snippet defines a `main` function that parses arguments, configures settings based on provided overrides and device availability (NPU or XPU), sets seed for random number generation if specified, and enables parallel execution using Paddle's distributed environment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/main.py\":86-117", + "content": "def main():\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n # enable to use npu if paddle is built with npu\n if paddle.is_compiled_with_custom_device('npu') :\n cfg.__setattr__(\"use_npu\", True)\n elif paddle.device.is_compiled_with_xpu():\n cfg.__setattr__(\"use_xpu\", True)\n # set seed if specified\n seed = args.seed\n if seed is not None:\n assert isinstance(\n seed, int), f\"seed must be a integer when specified, but got {seed}\"\n random.seed(seed)\n np.random.seed(seed)\n paddle.seed(seed)\n # set amp_level if amp is enabled\n if args.amp:\n if args.amp_level is None:\n args.amp_level = 'O1' # set defaualt amp_level to 'O1'\n else:\n assert args.amp_level in [\n 'O1', 'O2'\n ], f\"amp_level must be 'O1' or 'O2' when amp enabled, but got {args.amp_level}.\"\n _, world_size = get_dist_info()\n parallel = world_size != 1\n if parallel:\n 
paddle.distributed.init_parallel_env()" + }, + { + "comment": "This code determines the method to be executed based on command line arguments. If '--test' is given, it executes 'test_model'. If '--train_dali' is given, it executes 'train_dali'. If '--multigrid' is given, it executes 'train_model_multigrid'. Otherwise, it executes 'train_model', passing the necessary parameters to perform model training or validation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/main.py\":119-140", + "content": " if args.test:\n test_model(cfg, weights=args.weights, parallel=parallel)\n elif args.train_dali:\n train_dali(cfg, weights=args.weights, parallel=parallel)\n elif args.multigrid:\n train_model_multigrid(cfg,\n world_size=world_size,\n validate=args.validate)\n else:\n train_model(cfg,\n weights=args.weights,\n parallel=parallel,\n validate=args.validate,\n use_fleet=args.fleet,\n use_amp=args.amp,\n amp_level=args.amp_level,\n max_iters=args.max_iters,\n profiler_options=args.profiler_options)\nif __name__ == '__main__':\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/88e997ea-065f-4556-8e56-abf2d8e84e1e.json b/docs/doc/88e997ea-065f-4556-8e56-abf2d8e84e1e.json new file mode 100644 index 000000000..f190a6d5f --- /dev/null +++ b/docs/doc/88e997ea-065f-4556-8e56-abf2d8e84e1e.json @@ -0,0 +1,10 @@ +{ + "summary": "This code block is importing modules and defining the contents of the package. It sets __all__ to include 'utils', 'PaddleVideo', and 'ava_predict'. The code block also includes copyright, license information, and imports from different . files within the package.", + "details": [ + { + "comment": "This code block is importing modules and defining the contents of the package. It sets __all__ to include 'utils', 'PaddleVideo', and 'ava_predict'. The code block also includes copyright, license information, and imports from different . files within the package.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/__init__.py\":0-18", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n__all__ = ['utils', 'PaddleVideo', 'ava_predict']\nfrom . import utils\nfrom .wheel import PaddleVideo\nfrom . import ava_predict" + } + ] +} \ No newline at end of file diff --git a/docs/doc/892f214e-5286-4407-b58d-a45a0a31366a.json b/docs/doc/892f214e-5286-4407-b58d-a45a0a31366a.json new file mode 100644 index 000000000..b41401e75 --- /dev/null +++ b/docs/doc/892f214e-5286-4407-b58d-a45a0a31366a.json @@ -0,0 +1,20 @@ +{ + "summary": "The MSTCN model is a video segmentation tool that extends BaseSegmenter class, includes an optional backbone and head, and defines training/validation steps with loss calculation. The code includes three functions: forward_net for training, test_step for testing, and infer_step for inference.", + "details": [ + { + "comment": "Class MSTCN defines a model for video segmentation, extending BaseSegmenter class. 
It contains an optional backbone and head for feature extraction and classification. The forward_net function maps input to output through these components if present.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/ms_tcn.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import SEGMENTERS\nfrom .base import BaseSegmenter\nimport paddle\nimport paddle.nn.functional as F\n@SEGMENTERS.register()\nclass MSTCN(BaseSegmenter):\n \"\"\"MS-TCN model framework.\"\"\"\n def forward_net(self, video_feature):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n if self.backbone is not None:\n feature = self.backbone(video_feature)\n else:\n feature = video_feature\n if self.head is not None:\n cls_score = self.head(feature)" + }, + { + "comment": "This code defines a training step, validation step, and a method to predict the class score for video segmentation. The training step calculates the loss based on the forward network output and ground truth labels, while the validation step does the same but doesn't return a loss. Both methods return predicted results and loss metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/ms_tcn.py\":33-69", + "content": " else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n video_feat, video_gt = data_batch\n # call forward\n output = self.forward_net(video_feat)\n loss = 0.\n for i in range(len(output)):\n loss += self.head.loss(output[i], video_gt)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, video_gt)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n video_feat, video_gt = data_batch\n # call forward\n output = self.forward_net(video_feat)\n loss = 0.\n for i in range(len(output)):\n loss += self.head.loss(output[i], video_gt)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)" + }, + { + "comment": "This code defines three functions: \"forward_net\" for training, \"test_step\" for testing, and \"infer_step\" for inference. The forward pass of the model is called within each function. In the training step, the loss is calculated and an F1 score is computed using the head module. The predicted labels are also stored. 
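A hedged sketch of the multi-stage loss/prediction pattern the MS-TCN summaries describe: one loss per refinement stage is summed and the prediction comes from the last stage. A plain cross-entropy stands in for the model's `head.loss`, and the tensor shapes are assumptions:

```python
import paddle
import paddle.nn.functional as F

def multi_stage_loss(stage_logits, labels):
    """Sum one loss per stage and predict from the last stage, as the train/val steps above do."""
    loss = 0.
    for logits in stage_logits:                              # each stage: [N, num_classes, T]
        loss += F.cross_entropy(paddle.transpose(logits, perm=[0, 2, 1]), labels)
    predicted = paddle.squeeze(paddle.argmax(stage_logits[-1], axis=1))
    return loss, predicted

stages = [paddle.randn([2, 5, 16]) for _ in range(4)]        # 4 stages, 5 classes, 16 frames
labels = paddle.randint(0, 5, shape=[2, 16])
loss, pred = multi_stage_loss(stages, labels)
print(float(loss), pred.shape)                               # one loss value, prediction [2, 16]
```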
For testing and inference, the predicted labels and output after sigmoid activation are returned separately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/ms_tcn.py\":71-100", + "content": " outputs_dict = dict()\n outputs_dict['loss'] = loss\n outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, video_gt)\n return outputs_dict\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n video_feat, _ = data_batch\n outputs_dict = dict()\n # call forward\n output = self.forward_net(video_feat)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)\n outputs_dict['predict'] = predicted\n outputs_dict['output_np'] = F.sigmoid(output[-1])\n return outputs_dict\n def infer_step(self, data_batch):\n \"\"\"Infering setp.\n \"\"\"\n video_feat = data_batch[0]\n # call forward\n output = self.forward_net(video_feat)\n predicted = paddle.argmax(output[-1], axis=1)\n predicted = paddle.squeeze(predicted)\n output_np = F.sigmoid(output[-1])\n return predicted, output_np" + } + ] +} \ No newline at end of file diff --git a/docs/doc/89984d3f-723d-4fe2-a003-7b328d68234f.json b/docs/doc/89984d3f-723d-4fe2-a003-7b328d68234f.json new file mode 100644 index 000000000..77cce07d9 --- /dev/null +++ b/docs/doc/89984d3f-723d-4fe2-a003-7b328d68234f.json @@ -0,0 +1,20 @@ +{ + "summary": "The code improves TSN model training speed with DALI in PaddleVideo, using Kinetics400/UCF101 datasets and ResNet50 pretrained models. It provides detailed guidelines for action recognition tasks, including model download, config file usage, and separate sections for tests and inferences.", + "details": [ + { + "comment": "This code aims to speed up the TSN (Two-Stream Networks) model training using DALI (Data Augmentation Library for Images and Videos) in PaddleVideo. The author reimplemented segment sampling in VideoReader as NVIDIA DALI does not support TSN sampling way. They tested the performance with a Tesla v100 GPU and reported improvements in batch cost/s, reader cost/s, and instance/sec compared to Dataloader and base implementation. The docker image for this implementation is huangjun12/paddlevideo:tsn_dali_cuda9_0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn_dali.md\":0-44", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/tsn_dali.md) | English\n# TSN DALI\n- [Introduction](#Introduction)\n- [Requirement](#Requirement)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe aims to speed up TSN model training using DALI in this code. 
As [nvidia DALI](https://github.com/NVIDIA/DALI) not support TSN sampling way, we reimplemented segment sampling in VideoReader.\n### Performance\nTest Environment: \n```\nCard: Tesla v100\nMemory: 4 * 16G\nCuda: 9.0\nbatch_size of single card: 32\n```\n| Training way | batch cost/s | reader cost/s | ips:instance/sec | Speed up |\n| :--------------- | :--------: | :------------: | :------------: | :------------: |\n| DALI | 2.083 | 1.804 | 15.36597 | 1.41x |\n| Dataloader: num_workers=4 | 2.943 | 2.649 | 10.87460| base |\n| pytorch\u5b9e\u73b0 | TODO | TODO | TODO | TODO | \n## Requirement\ndocker image:\n```\n huangjun12/paddlevideo:tsn_dali_cuda9_0\n```\nTo build container, you can use:\n```bash\nnvidia-docker run --name t" + }, + { + "comment": "This code snippet is a command for running TSN (Two-Stream Network) with DALI (Data Augmentation and Layout Innovation) on PaddleVideo. It utilizes the Kinetics400 and UCF101 datasets, downloads the ResNet50 pretrained model, and starts the training process using Python and PaddlePaddle framework. The command also specifies the GPU usage and log directory for tracking progress.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn_dali.md\":44-81", + "content": "sn-DALI -v /home:/workspace --network=host -it --shm-size 64g -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video huangjun12/paddlevideo:tsn_dali_cuda9_0 /bin/bash\n```\n## Data\n- Kinetics400 dataset please refer to [K400 data](../../dataset/k400.md)\n- UCF101 dataset please refer to [UCF101 data](../../dataset/ucf101.md)\n## Train\n### download pretrain-model\n- Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as pretraind model:\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams\n```\nand add path to MODEL.framework.backbone.pretrained in config file as\uff1a\n```yaml\nMODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNet\"\n pretrained: your weight path\n```\n### Start training\nYou can start training by: \n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml -o log_level=\"INFO\"\n```\n- Args -c is used to specify config file\uff0cdefault is ```configs/recognition/tsn/tsn_dali.yaml```\u3002" + }, + { + "comment": "This code is providing information on how to use the TSN model for action recognition. It mentions downloading the trained model file, using a config file, and refers users to separate sections for test and inference processes. The reference section includes the original paper link.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsn_dali.md\":83-97", + "content": "- For finetune please download our trained model [TSN.pdparams]()coming soon\uff0cand specify file path with --weights. 
\n- For the config file usage\uff0cplease refer to [config](../../tutorials/config.md).\n## Test\nPlease refer to [TSN Test](./tsn.md)\n## Inference\nPlease refer to [TSN Inference](./tsn.md)\n## Reference\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool" + } + ] +} \ No newline at end of file diff --git a/docs/doc/89a0021d-5489-4dbc-8ebc-6befe7e119dd.json b/docs/doc/89a0021d-5489-4dbc-8ebc-6befe7e119dd.json new file mode 100644 index 000000000..41fe464d3 --- /dev/null +++ b/docs/doc/89a0021d-5489-4dbc-8ebc-6befe7e119dd.json @@ -0,0 +1,15 @@ +{ + "summary": "The code imports modules from PaddleVideo library, initializes a model registry, and provides functions for building video recognition models and defining loss functions.", + "details": [ + { + "comment": "This code is an import statement from the PaddleVideo library, including various modules for backbones, builders, detectors, recognizers, and heads. It also includes license information and copyright details. The code allows users to access and build models using these imported modules.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/__init__.py\":0-21", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .assigners import MaxIoUAssignerAVA\nfrom .backbones import ResNet\nfrom .builder import (build_backbone, build_head, build_localizer, build_loss,\n build_recognizer)\nfrom .framework.detectors import BaseDetector, FastRCNN, TwoStageDetector\nfrom .framework.recognizers import BaseRecognizer, Recognizer2D\nfrom .heads import (AVARoIHead, BaseHead, BBoxHeadAVA, SingleRoIExtractor3D,\n TSNHead)" + }, + { + "comment": "This code imports various modules, initializes a registry of models and functions, and lists all the available ones. It also defines a few key functions like `build_recognizer` and `build_localizer`, as well as some important loss functions such as `CrossEntropyLoss`. 
The code is part of PaddleVideo's modeling package and seems to be involved in building different parts of a video recognition model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/__init__.py\":22-36", + "content": "from .losses import CrossEntropyLoss\nfrom .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES,\n PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)\nfrom .samplers import RandomSampler\nfrom .weight_init import kaiming_normal_, trunc_normal_, weight_init_\n__all__ = [\n 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS', 'LOSSES',\n 'build_recognizer', 'build_localizer', 'build_head', 'build_backbone',\n 'build_loss', 'ResNet', 'TSNHead', 'BaseHead', 'BaseRecognizer',\n 'Recognizer2d', 'CrossEntropyLoss', 'ROI_EXTRACTORS',\n 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'MaxIoUAssignerAVA',\n 'RandomSampler', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_',\n 'weight_init_'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/89b44dba-75d2-4f44-b775-2d9b8dd6dd76.json b/docs/doc/89b44dba-75d2-4f44-b775-2d9b8dd6dd76.json new file mode 100644 index 000000000..57e6e1523 --- /dev/null +++ b/docs/doc/89b44dba-75d2-4f44-b775-2d9b8dd6dd76.json @@ -0,0 +1,10 @@ +{ + "summary": "This script downloads the pre-trained ernie model, its corresponding checkpoints, and a test dataset using wget and tar commands for decompression.", + "details": [ + { + "comment": "This script downloads the pre-trained ernie model, its corresponding checkpoints, and a test dataset using wget and tar commands for decompression.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/download.sh\":0-10", + "content": "# download ernie 1.0 model\nwget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/model_pretrained_ernie.tar.gz\ntar -xzvf model_pretrained_ernie.tar.gz\n# download pretrain model\nwget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/checkpoints_save.tar.gz\ntar -xzvf checkpoints_save.tar.gz\n# download test dataset\nwget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/datasets.tar.gz\ntar -xzvf datasets.tar.gz" + } + ] +} \ No newline at end of file diff --git a/docs/doc/89d41881-a163-49a9-a578-7e401dff943e.json b/docs/doc/89d41881-a163-49a9-a578-7e401dff943e.json new file mode 100644 index 000000000..ceed915ee --- /dev/null +++ b/docs/doc/89d41881-a163-49a9-a578-7e401dff943e.json @@ -0,0 +1,35 @@ +{ + "summary": "This code initializes weights for a PaddlePaddle layer with options for customization and truncated normal distribution, offering proper initialization for deep learning models using normal distribution and PaddlePaddle's Normal initializer.", + "details": [ + { + "comment": "The code is a function for initializing the weights of a given layer using different functions. It supports in-place parameter initialization and can be used with PaddlePaddle framework. The function accepts various arguments to customize the weight initialization process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py\":0-35", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn.initializer as init\nimport numpy as np\nfrom scipy import special\ndef weight_init_(layer,\n func,\n weight_name=None,\n bias_name=None,\n bias_value=0.0,\n **kwargs):\n \"\"\"\n In-place params init function.\n Usage:\n .. code-block:: python\n import paddle\n import numpy as np\n data = np.ones([3, 4], dtype='float32')" + }, + { + "comment": "Code initializes a linear layer, prints its weight, applies weight initialization with normal distribution, and prints the weight again. The _no_grad_trunc_normal_ function sets tensor values to be truncated normal with specified mean, std, a, and b parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py\":36-65", + "content": " linear = paddle.nn.Linear(4, 4)\n input = paddle.to_tensor(data)\n print(linear.weight)\n linear(input)\n weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1)\n print(linear.weight)\n \"\"\"\n if hasattr(layer, 'weight') and layer.weight is not None:\n getattr(init, func)(**kwargs)(layer.weight)\n if weight_name is not None:\n # override weight name\n layer.weight.name = weight_name\n if hasattr(layer, 'bias') and layer.bias is not None:\n init.Constant(bias_value)(layer.bias)\n if bias_name is not None:\n # override bias name\n layer.bias.name = bias_name\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n def norm_cdf(x):\n # Computes standard normal cumulative distribution function\n return (1. + math.erf(x / math.sqrt(2.))) / 2.\n if (mean < a - 2 * std) or (mean > b + 2 * std):\n print(\"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. \"\n \"The distribution of values may be incorrect.\")" + }, + { + "comment": "This code initializes the weights of a tensor by generating values from a truncated normal distribution, with the lower and upper bounds defined by 'a' and 'b'. It then transforms these values to ensure they are within the desired range, mean, and standard deviation. The resulting tensor is set as the new value for the original tensor. 
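As a side note for readers of this cache entry: the inverse-CDF sampling steps summarized above can be reproduced with a few lines of NumPy/SciPy. The sketch below is illustrative only and is not part of the cached `weight_init.py`; the helper name `trunc_normal_sample` is invented for the example.

```python
import math
import numpy as np
from scipy import special

def trunc_normal_sample(shape, mean=0.0, std=1.0, a=-2.0, b=2.0):
    # Standard normal CDF, mirroring the documented norm_cdf helper
    norm_cdf = lambda x: (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
    l, u = norm_cdf((a - mean) / std), norm_cdf((b - mean) / std)
    # Uniform draw in [2l-1, 2u-1], then the inverse CDF of the normal
    tmp = np.random.uniform(2 * l - 1, 2 * u - 1, size=shape).astype(np.float32)
    tmp = special.erfinv(tmp) * std * math.sqrt(2.0) + mean
    # Clamp to keep every value inside [a, b]
    return np.clip(tmp, a, b)

values = trunc_normal_sample((3, 4), mean=0.0, std=0.02)
```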
This process ensures proper initialization for deep learning models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py\":67-97", + "content": " with paddle.no_grad():\n # Values are generated by using a truncated uniform distribution and\n # then using the inverse CDF for the normal distribution.\n # Get upper and lower cdf values\n l = norm_cdf((a - mean) / std)\n u = norm_cdf((b - mean) / std)\n # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1].\n tmp = np.random.uniform(2 * l - 1, 2 * u - 1,\n size=list(tensor.shape)).astype(np.float32)\n # Use inverse cdf transform for normal distribution to get truncated\n # standard normal\n tmp = special.erfinv(tmp)\n # Transform to proper mean, std\n tmp *= (std * math.sqrt(2.0))\n tmp += mean\n # Clamp to ensure it's in the proper range\n tmp = np.clip(tmp, a, b)\n tensor.set_value(paddle.to_tensor(tmp))\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(\n \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"" + }, + { + "comment": "The code defines a function for initializing the weight of a neural network. It first calculates the fan-in and fan-out based on the shape and dimensions of the tensor. Then, it provides options to initialize weights with truncated normal or Kaiming normal distributions. The trunc_normal_ and kaiming_normal_ functions are also defined to handle different initialization methods with optional parameters for mean, std, a, b, mode, and nonlinearity.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py\":98-129", + "content": " )\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:\n receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'):\n def _calculate_correct_fan(tensor, mode):\n mode = mode.lower()\n valid_modes = ['fan_in', 'fan_out']\n if mode not in valid_modes:\n raise ValueError(\n \"Mode {} not supported, please use one of {}\".format(\n mode, valid_modes))\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n return fan_in if mode == 'fan_in' else fan_out\n def calculate_gain(nonlinearity, param=None):\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d'," + }, + { + "comment": "This function initializes the weights of a tensor with a normal distribution. It checks the type of nonlinearity function and returns an appropriate gain factor, then calculates the standard deviation for weight initialization using fan inversion formula. 
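For the fan/gain arithmetic described above, a tiny worked example (the layer size is an arbitrary assumption): with a ReLU nonlinearity the gain is sqrt(2), and the standard deviation used by `kaiming_normal_` is gain / sqrt(fan).

```python
import math

fan_in = 256                     # assumed input feature count of a layer
gain = math.sqrt(2.0)            # gain for 'relu', as in the documented calculate_gain
std = gain / math.sqrt(fan_in)   # kaiming_normal_ then samples from N(0, std)
print(round(std, 4))             # -> 0.0884
```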
The final step is to initialize the tensor with Normal initializer from PaddlePaddle library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py\":130-156", + "content": " 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n else:\n raise ValueError(\n \"Unsupported nonlinearity {}\".format(nonlinearity))\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():\n paddle.nn.initializer.Normal(0, std)(tensor)" + }, + { + "comment": "Initializing weights for a neural network model.\nThis function returns the initialized tensor with random values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py\":157-157", + "content": " return tensor" + } + ] +} \ No newline at end of file diff --git a/docs/doc/89e4841f-b7dd-41b0-b204-7594540da115.json b/docs/doc/89e4841f-b7dd-41b0-b204-7594540da115.json new file mode 100644 index 000000000..733407df1 --- /dev/null +++ b/docs/doc/89e4841f-b7dd-41b0-b204-7594540da115.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a Registry class for mapping names to objects and provides methods for registering, getting, and unregistering objects. It utilizes the @BACKBONES.register() decorator or BACKBONES.register(ResNet) function for registration, and also verifies if an object with a given name exists in the registry using the `get` method.", + "details": [ + { + "comment": "This code defines a Registry class that provides name to object mapping, allowing third-party users to register their custom modules. Users can register their objects by using the @BACKBONES.register() decorator or by calling BACKBONES.register(ResNet).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nclass Registry(object):\n \"\"\"\n The registry that provides name -> object mapping, to support third-party users' custom modules.\n To register an object:\n .. code-block:: python\n BACKBONES = Registry('backbone')\n @BACKBONES.register()\n class ResNet:\n pass\n Or:\n .. 
code-block:: python\n BACKBONES = Registry('backbone')\n class ResNet:\n pass\n BACKBONES.register(ResNet)" + }, + { + "comment": "This code is a registry class for storing and managing objects. It allows registering objects under their names or using decorators, and provides functions to check if an object with a given name exists in the registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py\":35-69", + "content": " Usage: To build a module.\n .. code-block:: python\n backbone_name = \"ResNet\"\n b = BACKBONES.get(backbone_name)()\n \"\"\"\n def __init__(self, name):\n \"\"\"\n Args:\n name (str): the name of this registry\n \"\"\"\n self._name = name\n self._obj_map = {}\n def __contains__(self, key):\n return self._obj_map.get(key) is not None\n def _do_register(self, name, obj):\n assert (\n name not in self._obj_map\n ), \"An object named '{}' was already registered in '{}' registry!\".format(\n name, self._name)\n self._obj_map[name] = obj\n def register(self, obj=None, name=None):\n \"\"\"\n Register the given object under the the name `obj.__name__`.\n Can be used as either a decorator or not. See docstring of this class for usage.\n \"\"\"\n if obj is None:\n # used as a decorator\n def deco(func_or_class, name=name):\n if name is None:\n name = func_or_class.__name__" + }, + { + "comment": "The code defines a class with methods for registering, getting and unregistering objects in a registry. The `_do_register` method is used to store the object's name and function or class into a dictionary. If no name is provided when calling the function, it defaults to the object's name. The `get` method retrieves an object from the registry using its name. If the object is not found, it raises a KeyError with an error message.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py\":70-95", + "content": " self._do_register(name, func_or_class)\n return func_or_class\n return deco\n # used as a function call\n if name is None:\n name = obj.__name__\n self._do_register(name, obj)\n def get(self, name):\n \"\"\"Get the registry record.\n Args:\n name (str): The class name.\n Returns:\n ret: The class.\n \"\"\"\n ret = self._obj_map.get(name)\n if ret is None:\n raise KeyError(\n \"No object named '{}' found in '{}' registry!\".format(\n name, self._name))\n return ret" + } + ] +} \ No newline at end of file diff --git a/docs/doc/89fc3cf0-d88c-4d92-99d8-9d47400bcb20.json b/docs/doc/89fc3cf0-d88c-4d92-99d8-9d47400bcb20.json new file mode 100644 index 000000000..e36f282bf --- /dev/null +++ b/docs/doc/89fc3cf0-d88c-4d92-99d8-9d47400bcb20.json @@ -0,0 +1,35 @@ +{ + "summary": "This function calculates precision and recall metrics from scores, labels, and ground truth instances, raising ValueError for incorrect inputs. It computes average precision using valid precision and recall arrays and calculates CorLoc performance metrics for object detection with given ground truth and detected images per class.", + "details": [ + { + "comment": "This code defines a function to compute precision and recall metrics based on input scores, labels, and the number of ground truth instances. It also raises a ValueError if the input is in the incorrect format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py\":0-29", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Functions for computing metrics like precision, recall, CorLoc and etc.\"\"\"\nimport numpy as np\ndef compute_precision_recall(scores, labels, num_gt):\n \"\"\"Compute precision and recall.\n Args:\n scores: A float numpy array representing detection score\n labels: A boolean numpy array representing true/false positive labels\n num_gt: Number of ground truth instances\n Raises:\n ValueError: if the input is not of the correct format" + }, + { + "comment": "This code checks if input 'labels' and 'scores' are valid arrays. It verifies that 'labels' is a one-dimensional boolean numpy array, 'scores' is a one-dimensional numpy array, the number of true positives is less than num_gt (number of ground truth labels), and the lengths of 'scores' and 'labels' are equal. If any conditions are not met, it raises a ValueError with an appropriate error message. If all checks pass and there are no ground truth labels, the function returns None for both precision and recall.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py\":31-57", + "content": " Returns:\n precision: Fraction of positive instances over detected ones. This\n value is None if no ground truth labels are present.\n recall: Fraction of detected positive instance over all positive\n instances. 
This value is None if no ground truth labels are\n present.\n \"\"\"\n if (not isinstance(labels, np.ndarray) or labels.dtype != np.bool\n or len(labels.shape) != 1):\n raise ValueError('labels must be single dimension bool numpy array')\n if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:\n raise ValueError('scores must be single dimension numpy array')\n if num_gt < np.sum(labels):\n raise ValueError(\n 'Number of true positives must be smaller than num_gt.')\n if len(scores) != len(labels):\n raise ValueError('scores and labels must be of the same size.')\n if num_gt == 0:\n return None, None\n sorted_indices = np.argsort(scores)\n sorted_indices = sorted_indices[::-1]\n labels = labels.astype(int)" + }, + { + "comment": "Computes precision and recall from sorted labels, returns both values.\nDefines a function to compute average precision using precision and recall arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py\":58-87", + "content": " true_positive_labels = labels[sorted_indices]\n false_positive_labels = 1 - true_positive_labels\n cum_true_positives = np.cumsum(true_positive_labels)\n cum_false_positives = np.cumsum(false_positive_labels)\n precision = cum_true_positives.astype(float) / (\n cum_true_positives + cum_false_positives)\n recall = cum_true_positives.astype(float) / num_gt\n return precision, recall\ndef compute_average_precision(precision, recall):\n \"\"\"Compute Average Precision according to the definition in VOCdevkit.\n Precision is modified to ensure that it does not decrease as recall\n decrease.\n Args:\n precision: A float [N, 1] numpy array of precisions\n recall: A float [N, 1] numpy array of recalls\n Raises:\n ValueError: if the input is not of the correct format\n Returns:\n average_precison: The area under the precision recall curve. NaN if\n precision and recall are None.\n \"\"\"\n if precision is None:\n if recall is not None:\n raise ValueError('If precision is None, recall must also be None')" + }, + { + "comment": "This function checks the data types and ranges of precision and recall arrays, ensuring they are numpy float arrays within the range [0,1] and have the same size. 
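For reference, the core of the documented `compute_precision_recall` condenses to the sketch below (an illustrative reimplementation with the input validation omitted):

```python
import numpy as np

def precision_recall(scores, labels, num_gt):
    # Sort detections by descending score, then accumulate TPs and FPs
    order = np.argsort(scores)[::-1]
    tp = labels[order].astype(int)
    fp = 1 - tp
    cum_tp, cum_fp = np.cumsum(tp), np.cumsum(fp)
    precision = cum_tp.astype(float) / (cum_tp + cum_fp)
    recall = cum_tp.astype(float) / num_gt
    return precision, recall

p, r = precision_recall(np.array([0.9, 0.8, 0.7]),
                        np.array([True, False, True]), num_gt=2)
# p ≈ [1.0, 0.5, 0.667], r = [0.5, 0.5, 1.0]
```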
If all conditions pass, it then concatenates recall and precision arrays with 0 and 1 at the end respectively before preprocessing precision to be a non-decreasing array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py\":88-110", + "content": " return np.NAN\n if not isinstance(precision, np.ndarray) or not isinstance(\n recall, np.ndarray):\n raise ValueError('precision and recall must be numpy array')\n if precision.dtype != np.float or recall.dtype != np.float:\n raise ValueError('input must be float numpy array.')\n if len(precision) != len(recall):\n raise ValueError('precision and recall must be of the same size.')\n if not precision.size:\n return 0.0\n if np.amin(precision) < 0 or np.amax(precision) > 1:\n raise ValueError('Precision must be in the range of [0, 1].')\n if np.amin(recall) < 0 or np.amax(recall) > 1:\n raise ValueError('recall must be in the range of [0, 1].')\n if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)):\n raise ValueError('recall must be a non-decreasing array')\n recall = np.concatenate([[0], recall, [1]])\n precision = np.concatenate([[0], precision, [0]])\n # Preprocess precision to be a non-decreasing array\n for i in range(len(precision) - 2, -1, -1):" + }, + { + "comment": "This code computes the average precision and CorLoc, which is a metric used to evaluate object detection performance. It takes in arrays of ground truth images per class and correctly detected images per class. The average precision function calculates the average precision by comparing recall values, while the compute_cor_loc function calculates the CorLoc score for each class based on these inputs. If there are no ground truth images for a class, it returns NaN.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py\":111-136", + "content": " precision[i] = np.maximum(precision[i], precision[i + 1])\n indices = np.where(recall[1:] != recall[:-1])[0] + 1\n average_precision = np.sum(\n (recall[indices] - recall[indices - 1]) * precision[indices])\n return average_precision\ndef compute_cor_loc(num_gt_imgs_per_class,\n num_images_correctly_detected_per_class):\n \"\"\"Compute CorLoc according to the definition in the following paper.\n https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf\n Returns nans if there are no ground truth images for a class.\n Args:\n num_gt_imgs_per_class: 1D array, representing number of images\n containing at least one object instance of a particular class\n num_images_correctly_detected_per_class: 1D array, representing number\n of images that are correctly detected at least one object instance\n of a particular class\n Returns:\n corloc_per_class: A float numpy array represents the corloc score of\n each class" + }, + { + "comment": "Divides the number of images correctly detected by the number of ground truth images per class, ignoring division by zero for classes with no examples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py\":137-142", + "content": " \"\"\"\n # Divide by zero expected for classes with no gt examples.\n with np.errstate(divide='ignore', invalid='ignore'):\n return np.where(\n num_gt_imgs_per_class == 0, np.nan,\n num_images_correctly_detected_per_class / num_gt_imgs_per_class)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8a4593c5-a977-4422-a8c9-2b87abf2054a.json 
b/docs/doc/8a4593c5-a977-4422-a8c9-2b87abf2054a.json new file mode 100644 index 000000000..9bc810530 --- /dev/null +++ b/docs/doc/8a4593c5-a977-4422-a8c9-2b87abf2054a.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines the SmoothL1Loss class as a custom loss function in PaddlePaddle's VideoQualityAssessment library, and implements _forward method to calculate smooth L1 loss between predicted scores and ground truth labels. It extends BaseWeightedLoss for handling outliers in computer vision tasks.", + "details": [ + { + "comment": "This code defines the SmoothL1Loss class, a custom loss function in PaddlePaddle's VideoQualityAssessment library. It extends BaseWeightedLoss and implements a _forward method for calculating the smooth L1 loss between predicted scores and ground truth labels. The smooth L1 loss is used in computer vision tasks to handle outliers and improve robustness.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py\":0-32", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass SmoothL1Loss(BaseWeightedLoss):\n \"\"\"smooth L1 Loss.\"\"\"\n def _forward(self, score, labels):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels.\n Returns:\n loss (paddle.Tensor): The returned smooth L1 Loss." + }, + { + "comment": "This code snippet defines a function that calculates the smooth L1 loss between two arrays, \"score\" and \"labels\". It converts labels to the data type of score, then applies F.smooth_l1_loss() to compute the loss. Finally, it returns the computed loss value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py\":33-38", + "content": " \"\"\"\n labels = labels.astype(score.dtype)\n loss = F.smooth_l1_loss(score, labels)\n return loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8ab7f923-4447-4b3e-b2be-9f313238cbfb.json b/docs/doc/8ab7f923-4447-4b3e-b2be-9f313238cbfb.json new file mode 100644 index 000000000..27baaa0e8 --- /dev/null +++ b/docs/doc/8ab7f923-4447-4b3e-b2be-9f313238cbfb.json @@ -0,0 +1,20 @@ +{ + "summary": "TransNetV2 is a deep learning-based video segmentation model for shot transition detection, using DDCNN V2 structure, RGB color histograms, and frame similarity. The provided code demonstrates usage of predict.py to infer predictions on input files, with output probabilities and lens boundaries.", + "details": [ + { + "comment": "TransNetV2 is a video segmentation model based on deep learning using DDCNN V2 structure for feature learning, RGB color histograms, and video frame similarity for effective feature extraction. 
This code supports inference only, with training and testing to be provided later. Suitable for industrial applications, more details are available in the paper.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/partition/transnetv2.md\":0-27", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/partition/transnetv2.md) | English\n# TransNetV2\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Details](#Details)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install ffmpeg-python==0.2.0\n```\n## Introduction\nTransNetV2 is a video segmentation model based on deep learning. It performs feature learning through the DDCNN V2 structure, and adds RGB color histograms and video frame similarity for more effective feature extraction, and finally obtains whether each frame is a shot boundary frame Probability, thereby completing the video segmentation. The algorithm has good effect and efficient calculation, which is very suitable for industrial landing.\n![](../../../images/transnetv2.png)\nThis code currently only supports model inference, and model training and testing will be provided in the future.\nPlease refer to the pap" + }, + { + "comment": "This code provides instructions to load and export the TransNet V2 inference model for shot transition detection. It mentions the required weights trained on ClipShots and TRECVID IACC.3 dataset, as well as the URL to download them using wget command. The script also outlines how to use `export_model.py` tool to generate the `TransNetV2.pdmodel` and `TransNetV2.pdiparams` files for prediction purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/partition/transnetv2.md\":27-61", + "content": "er for details. [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838)\n## Data\ncoming soon\n## Train\ncoming soon\n## Test\ncoming soon\n## Inference\nLoad the TransNetV2 weights trained on ClipShots and TRECVID IACC.3 dataset [TransNetV2_shots.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams), or download through the command line\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams\n```\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/partitioners/transnetv2/transnetv2.yaml -p data/TransNetV2_shots.pdparams -o inference/TransNetV2\n```\nThe above command will generate the model structure file`TransNetV2.pdmodel`and the model weight file`TransNetV2.pdiparams`required for prediction.\nFor the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-Model Reasoning)" + }, + { + "comment": "This code snippet demonstrates the usage of predict.py to infer TransNetV2 model predictions for a given input file (example.avi). The model configuration is specified in transnetv2.yaml, and the trained model files are provided as inputs. Prediction probability per frame is output to example_predictions.txt and lens boundary is output to example_scenes.txt. 
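To illustrate how the per-frame probabilities mentioned above might be turned into shots, here is a rough, hypothetical post-processing sketch. It assumes `example_predictions.txt` holds one probability per line and uses a plain 0.5 threshold; this is not necessarily the official TransNetV2 post-processing.

```python
import numpy as np

# Assumed format: one shot-boundary probability per line
probs = np.atleast_1d(np.loadtxt("output/example_predictions.txt"))

# Naive cutting rule: a frame whose probability exceeds 0.5 ends a shot
boundaries = np.where(probs > 0.5)[0]
scenes, start = [], 0
for b in boundaries:
    scenes.append((start, int(b)))
    start = int(b) + 1
scenes.append((start, len(probs) - 1))
print(scenes[:3])
```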
Visualization can be enabled for better interpretation of results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/partition/transnetv2.md\":63-79", + "content": "### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/partitioners/transnetv2/transnetv2.yaml \\\n --model_file inference/TransNetV2/TransNetV2.pdmodel \\\n --params_file inference/TransNetV2/TransNetV2.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nBy defining the `output_path` parameters in `transnetv2.yaml`, the prediction probability of each frame can be output to `{output_path}/example_predictions.txt`, and the predicted lens boundary is output to `{output_path}/example_scenes.txt`.\nBy defining the `visualize` parameter in `transnetv2.yaml`, the predicted results can be visualized, and the visual results are saved to `{output_path}/example_vis.png`.\n## Reference\n- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tom\u00e1\u0161 Sou\u010dek, Jakub Loko\u010d" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8b576a44-8ebe-49cb-b188-25d9facca9a9.json b/docs/doc/8b576a44-8ebe-49cb-b188-25d9facca9a9.json new file mode 100644 index 000000000..e735ed342 --- /dev/null +++ b/docs/doc/8b576a44-8ebe-49cb-b188-25d9facca9a9.json @@ -0,0 +1,40 @@ +{ + "summary": "The TimeSformer model is a top-performing video classifier that uses time series modeling and space-time attention, trained on Kinetics-400 using 8 GPUs with mixed-precision training.", + "details": [ + { + "comment": "This code implements the TimeSformer, a video classification model based on vision transformer with global receptive field and strong time series modeling ability. It achieves SOTA accuracy on Kinetics-400 dataset and has shorter training time compared to other models. The code showcases the time-space separated attention cascade network architecture, and requires data preparation from Kinetics-400 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md\":0-25", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/timesformer.md) | English\n# TimeSformer\n## Content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nTimeSformer is a video classification model based on vision transformer, which has the characteristics of no convolution, global receptive field, and strong time series modeling ability. At present, it has achieved SOTA accuracy on the Kinetics-400 data set, surpassing the classic CNN-based video classification models TSN, TSM and Slowfast, and has a shorter training time (the Kinetics-400 data set training time is 39 hourss). **This code implements the time-space separated attention cascade network in the paper**.\n
\n[figure: image-20210628210446041]\n
\n## Data\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)" + }, + { + "comment": "Download and prepare UCF101 data, then download the ViT_base_patch16_224 pre-trained model. Update the config file with the model's path and train the Kinetics-400 dataset using 8 GPUs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md\":27-56", + "content": "UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/timesformer/timesformer_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"VisionTransformer\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:\n```bash" + }, + { + "comment": "This code executes the training of a Paddle Video model called \"timesformer\" on 8 GPUs for video data. The command is to be run in a Linux terminal, and it uses mixed-precision training with AMP (Automatic Mixed Precision) to speed up the process. The command also sets some environment variables to configure CUDA behavior. The configuration file name includes the model and dataset names as well as data format and sampling method. For more details on configuring parameters, refer to the provided link.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md\":57-71", + "content": "# videos data format\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --validate -c configs/recognition/ timesformer/timesformer_k400_videos.yaml\n```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 # MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\n# videos data format\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --amp --validate -c configs/recognition/ timesformer/timesformer_k400_videos.yaml\n```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage." + }, + { + "comment": "The code provides instructions for testing the TimeSformer model, using a different sampling method in test mode for higher accuracy. 
The best model is identified by the log's \"best\" keyword, and final test scores are obtained after training by using the provided command.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md\":74-89", + "content": "## Test\n- The TimeSformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:\n ```\n Already save the best model (top1 acc)0.7258\n ```\n- Since the sampling method of the TimeSformer model test mode is **UniformCrop** with a slower speed but higher accuracy, which is different from the **RandomCrop** used in the verification mode during the training process, so the verification index recorded in the training log is `topk Acc `Does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_timesformer main.py --test -c configs/recognition/ timesformer/timesformer_k400_videos.yaml -w \"output/TimeSformer/TimeSformer_best.pdparams\"\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:" + }, + { + "comment": "This code snippet is for exporting the TimeSformer inference model. It uses the PaddlePaddle framework and requires a configuration file, a pre-trained model file, and an output directory. The TimeSformer is a video recognition model that utilizes the Vision Transformer architecture and Linspace sampling strategy for its inference process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md\":92-106", + "content": " | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n | :----------------: | :-----: | :-----: | :---------: | :----: | :----------------------------------------------------------: |\n | Vision Transformer | UniformCrop | 8 | 224 | 77.29 | [TimeSformer_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) |\n- During the test, the TimeSformer video sampling strategy is to use Linspace sampling: in time sequence, num_seg sparse sampling points are uniformly generated from the video sequence to be sampled; in space, select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions are sampled. A total of 1 clip is sampled for 1 video.\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml \\\n -p data/TimeSformer_k400.pdparams \\\n -o inference/TimeSformer" + }, + { + "comment": "This code snippet demonstrates the process of using the TimeSformer model to predict the video file 'data/example.avi'. The model is trained on Kinetics-400 and the prediction command uses python3.7 to run 'tools/predict.py' with relevant parameters such as input_file, config, model_file, params_file, use_gpu, and use_tensorrt. 
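Related to reading off the top-1 id from the inference output discussed here, a small illustrative lookup; the `id name` per-line layout of `Kinetics-400_label_list.txt` is an assumption, not something this entry confirms.

```python
# Hypothetical lookup: map the predicted top-1 id to a label name
labels = {}
with open("data/k400/Kinetics-400_label_list.txt") as f:
    for line in f:
        parts = line.strip().split(" ", 1)  # assumed "id name" layout
        if len(parts) == 2:
            labels[int(parts[0])] = parts[1]

print(labels.get(5))  # for the example above this should print "archery"
```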
The output shows the top-1 class and its corresponding score for the video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md\":107-132", + "content": "```\nThe above command will generate the model structure file `TimeSformer.pdmodel` and the model weight file `TimeSformer.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../start.md#2-infer)\n### Use prediction engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/timesformer/timesformer_k400_videos.yaml \\\n --model_file inference/TimeSformer/TimeSformer.pdmodel \\\n --params_file inference/TimeSformer/TimeSformer.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nThe output example is as follows:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9999722242355347\n```\nIt can be seen that using the TimeSformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confiden" + }, + { + "comment": "Code comments: The code calculates the category id and name from a table, which is used to determine the predicted category name. It references a paper on space-time attention for video understanding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/timesformer.md\":132-136", + "content": "ce is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be seen that the predicted category name is `archery`.\n## Reference\n- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8bc6f049-6538-4bfd-aa7b-2aaaddb9979c.json b/docs/doc/8bc6f049-6538-4bfd-aa7b-2aaaddb9979c.json new file mode 100644 index 000000000..d91469a9d --- /dev/null +++ b/docs/doc/8bc6f049-6538-4bfd-aa7b-2aaaddb9979c.json @@ -0,0 +1,35 @@ +{ + "summary": "ASRF is an improved video action segmentation model built upon ms-tcn, utilizing PaddlePaddle framework for training and exporting inference models. It provides accuracy results and performance metrics, with examples for running inference on PaddleVideo.", + "details": [ + { + "comment": "ASRF is an improved video action segmentation model built upon ms-tcn, which was published in 2021. It utilizes the PaddlePaddle framework and can be trained on datasets such as 50salads, breakfast, or gtea. The model requires additional data construction using a specific script for preparation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/asrf.md\":0-34", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/segmentation/asrf.md) | English\n# ASRF : Video Action Segmentation Model\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nASRF model is an improvement on the video motion segmentation model ms-tcn, which was published on WACV in 2021. We reproduce the officially implemented pytorch code and obtain approximate results in paddlevideo.\n

\n[figure: MS-TCN Overview]\n
\n## Data\nASRF can choose 50salads, breakfast, gtea as trianing set. Please refer to Video Action Segmentation dataset download and preparation doc [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md)\nUnlike MS-TCN, ASRF model requires additional data construction. The script process is as follows\n```bash\npython data/50salads/prepare_asrf_data.py --dataset_dir data/\n```\n## Train\nAfter prepare dataset, we can run sprits." + }, + { + "comment": "This code is running a training command for an action segmentation model (ASRF) on the GTEA dataset, specifically without using pre-training. It uses CUDA device 3 and a provided configuration file. The test command tests MS-TCN on a dataset using a previously trained model's weights. The index calculation in the test refers to an evaluation script provided by the original author of ms-tcn. The codebase is from the official ASRF repository in PyTorch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/asrf.md\":36-54", + "content": "```bash\n# gtea dataset\nexport CUDA_VISIBLE_DEVICES=3\npython3.7 main.py --validate -c configs/segmentation/asrf/asrf_gtea.yaml\n```\n- Start the training by using the above command line or script program. There is no need to use the pre training model. The video action segmentation model is usually a full convolution network. Due to the different lengths of videos, the `DATASET.batch_size` of the video action segmentation model is usually set to `1`, that is, batch training is not required. At present, only **single sample** training is supported.\n## Test\nTest MS-TCN on dataset scripts:\n```bash\npython main.py --test -c configs/segmentation/asrf/asrf_gtea.yaml --weights=./output/ASRF/ASRF_split_1.pdparams\n```\n- The specific implementation of the index is to calculate ACC, edit and F1 scores by referring to the test script[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the author of ms-tcn.\nThe reproduction of pytorch comes from the official [code base](https://github.com/yiskw713/asrf)" + }, + { + "comment": "The code provides accuracy results for different models on three datasets, Breakfast, 50salads, and GTEA, using a 4 or 5-fold validation method as per the MS-TCN paper. 
The performance metrics include Accuracy (Acc), Edit Distance (Edit), and F1 scores at different thresholds (F1@0.1, F1@0.25, F1@0.5).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/asrf.md\":56-79", + "content": "- The evaluation method of data set adopts the folding verification method in ms-tcn paper, and the division method of folding is the same as that in ms-tcn paper.\nAccuracy on Breakfast dataset(4 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 67.6% | 72.4% | 74.3% | 68.9% | 56.1% |\n| pytorch | 65.8% | 71.0% | 72.3% | 66.5% | 54.9% |\n| paddle | 66.1% | 71.9% | 73.3% | 67.9% | 55.7% |\nAccuracy on 50salads dataset(5 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 84.5% | 79.3% | 82.9% | 83.5% | 77.3% |\n| pytorch | 81.4% | 75.6% | 82.7% | 81.2% | 77.2% |\n| paddle | 81.6% | 75.8% | 83.0% | 81.5% | 74.8% |\nAccuracy on gtea dataset(4 folding verification):\n| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| paper | 77.3% | 83.7% | 89.4% | 87.8% | 79.8% |\n| pytorch | 76.3% | 79.6% | 87.3% | 85.8% | 74.9% |" + }, + { + "comment": "Table showing model weight for gtea with corresponding F1@0.5 and checkpoint links, followed by command to export inference model for ASRF_gtea using given parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/asrf.md\":80-99", + "content": "| paddle | 77.1% | 83.3% | 88.9% | 87.5% | 79.1% |\nModel weight for gtea\nTest_Data| F1@0.5 | checkpoints |\n| :----: | :----: | :---- |\n| gtea_split1 | 72.4409 | [ASRF_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_1.pdparams) |\n| gtea_split2 | 76.6666 | [ASRF_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_2.pdparams) |\n| gtea_split3 | 84.5528 | [ASRF_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_3.pdparams) |\n| gtea_split4 | 82.6771 | [ASRF_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_4.pdparams) |\n## Infer\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/segmentation/asrf/asrf_gtea.yaml \\\n -p data/ASRF_gtea_split_1.pdparams \\\n -o inference/ASRF\n```\nTo get model architecture file `ASRF.pdmodel` and parameters file `ASRF.pdiparams`, use:" + }, + { + "comment": "This code provides an example of how to run model inference using the ASRF segmentation model from PaddleVideo. The input file should contain a list of .npy files, and the code demonstrates how to execute it with specific configuration, model, and parameter files. 
It also shows the location where the results will be written after inference is complete.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/asrf.md\":101-130", + "content": "- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\nInput file are the file list for infering, for example:\n```\nS1_Cheese_C1.npy\nS1_CofHoney_C1.npy\nS1_Coffee_C1.npy\nS1_Hotdog_C1.npy\n...\n```\n```bash\npython3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \\\n --config configs/segmentation/asrf/asrf_gtea.yaml \\\n --model_file inference/ASRF/ASRF.pdmodel \\\n --params_file inference/ASRF/ASRF.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```bash\nresult write in : ./inference/infer_results/S1_Cheese_C1.txt\nresult write in : ./inference/infer_results/S1_CofHoney_C1.txt\nresult write in : ./inference/infer_results/S1_Coffee_C1.txt\nresult write in : ./inference/infer_results/S1_Hotdog_C1.txt\nresult write in : ./inference/infer_results/S1_Pealate_C1.txt" + }, + { + "comment": "Writes the inference results to separate text files for \"Peanut\" and \"Tea\" scenes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/segmentation/asrf.md\":131-138", + "content": "result write in : ./inference/infer_results/S1_Peanut_C1.txt\nresult write in : ./inference/infer_results/S1_Tea_C1.txt\n```\n## Reference\n- [Alleviating Over-segmentation Errors by Detecting Action Boundaries](https://arxiv.org/pdf/2007.06866v1.pdf), Yuchi Ishikawa, Seito Kasai, Yoshimitsu Aoki, Hirokatsu Kataoka" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8bc86c23-1eb5-460c-92d4-d58ace8bfbc3.json b/docs/doc/8bc86c23-1eb5-460c-92d4-d58ace8bfbc3.json new file mode 100644 index 000000000..5ad09918c --- /dev/null +++ b/docs/doc/8bc86c23-1eb5-460c-92d4-d58ace8bfbc3.json @@ -0,0 +1,20 @@ +{ + "summary": "The code imports libraries, registers a backbone model in PaddleVideo, initializes an ASRF class for computer vision tasks, and sets layer biases using init_bias function. The ASRF forward method performs convolution on input x and iterates through shared layers before returning the output.", + "details": [ + { + "comment": "This code block is importing necessary libraries and modules, as well as registering a backbone model within the PaddleVideo framework. It also includes references to external repositories for inspiration or implementation guidance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/asrf.py\":0-29", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# https://github.com/yabufarha/ms-tcn/blob/master/model.py\n# https://github.com/yiskw713/asrf/libs/models/tcn.py\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nimport copy\nimport random\nimport math\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom .ms_tcn import DilatedResidualLayer\nfrom ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch" + }, + { + "comment": "The ASRF class is a type of backbone model for computer vision tasks. It initializes convolutional layers and shared dilated residual layers, and sets their weights using KaimingUniform initialization. The number of features, stages, and layers are configurable parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/asrf.py\":32-64", + "content": "@BACKBONES.register()\nclass ASRF(nn.Layer):\n def __init__(self, in_channel, num_features, num_classes, num_stages,\n num_layers):\n super().__init__()\n self.in_channel = in_channel\n self.num_features = num_features\n self.num_classes = num_classes\n self.num_stages = num_stages\n self.num_layers = num_layers\n # define layers\n self.conv_in = nn.Conv1D(self.in_channel, self.num_features, 1)\n shared_layers = [\n DilatedResidualLayer(2**i, self.num_features, self.num_features)\n for i in range(self.num_layers)\n ]\n self.shared_layers = nn.LayerList(shared_layers)\n self.init_weights()\n def init_weights(self):\n \"\"\"\n initialize model layers' weight\n \"\"\"\n # init weight\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv1D):\n layer.weight.set_value(\n KaimingUniform_like_torch(layer.weight).astype('float32'))\n if layer.bias is not None:" + }, + { + "comment": "This code sets the initial values of layer biases using init_bias function. The ASRF forward method performs convolution on input x, then iterates through shared layers to modify the output before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/asrf.py\":65-74", + "content": " layer.bias.set_value(\n init_bias(layer.weight, layer.bias).astype('float32'))\n def forward(self, x):\n \"\"\" ASRF forward\n \"\"\"\n out = self.conv_in(x)\n for layer in self.shared_layers:\n out = layer(out)\n return out" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8bdc6b2c-d69d-441b-9c9b-0e5a04ddfd3f.json b/docs/doc/8bdc6b2c-d69d-441b-9c9b-0e5a04ddfd3f.json new file mode 100644 index 000000000..8ab35edac --- /dev/null +++ b/docs/doc/8bdc6b2c-d69d-441b-9c9b-0e5a04ddfd3f.json @@ -0,0 +1,65 @@ +{ + "summary": "TSMINFReader is a multiprocessing-enabled video reader in jpg format, applying transformations for machine learning models. 
It computes crop positions, performs random cropping, resizing, flipping, and centering on groups of images with fault-tolerant image reading.", + "details": [ + { + "comment": "TSMINFReader is a data reader for video datasets in jpg format. It inherits from DataReader and takes parameters name, mode, and cfg. It supports multiprocessing for improved performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":0-36", + "content": "\"\"\"\ntsn frame reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport random\nimport functools\nimport concurrent.futures\nimport multiprocessing\nimport numpy as np\nimport paddle\nfrom PIL import Image, ImageEnhance\nfrom .reader_utils import DataReader\nclass TSMINFReader(DataReader):\n \"\"\"\n Data reader for video dataset of jpg folder.\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n super(TSMINFReader, self).__init__(name, mode, cfg)" + }, + { + "comment": "The code sets various configuration parameters such as number of segments, segment length, short and target image sizes, batch size, reader threads, buffer size, video path, and image mean and standard deviation for a TSN inference reader. It then creates the TSN inference reader using these parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":37-65", + "content": " name = name.upper()\n self.num_seg = cfg[name]['num_seg']\n self.seglen = cfg[name]['seglen']\n self.short_size = cfg[name]['short_size']\n self.target_size = cfg[name]['target_size']\n self.batch_size = cfg[name]['batch_size']\n self.reader_threads = cfg[name]['reader_threads']\n self.buf_size = cfg[name]['buf_size']\n self.video_path = cfg[name]['frame_list']\n self.img_mean = np.array(cfg[name]['image_mean']).reshape(\n [3, 1, 1]).astype(np.float32)\n self.img_std = np.array(cfg[name]['image_std']).reshape(\n [3, 1, 1]).astype(np.float32)\n self.material = material\n def create_reader(self):\n \"\"\"\n batch loader for TSN\n \"\"\"\n _reader = self._inference_reader_creator_longvideo(\n self.video_path,\n self.mode,\n num_seg=self.num_seg,\n seglen=self.seglen,\n short_size=self.short_size,\n target_size=self.target_size,\n img_mean=self.img_mean," + }, + { + "comment": "This code defines a video inference reader for the PaddleVideo application's BasketballAction module. It creates a batch reader to process images and labels from the given video path, considering various parameters such as image mean, standard deviation, number of threads, and buffer size. 
The batch reader yields batches of images and labels until reaching the specified batch size or finishing processing all data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":66-96", + "content": " img_std=self.img_std,\n num_threads=self.reader_threads,\n buf_size=self.buf_size)\n def _batch_reader():\n batch_out = []\n for imgs, label in _reader():\n if imgs is None:\n continue\n batch_out.append((imgs, label))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 1:\n yield batch_out[:-1]\n return _batch_reader\n def _inference_reader_creator_longvideo(self, video_path, mode, num_seg,\n seglen, short_size, target_size,\n img_mean, img_std, num_threads,\n buf_size):\n \"\"\"\n inference reader for video\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n def image_buf(image_id_path_buf):\n \"\"\"" + }, + { + "comment": "This code uses multithreading to process video frames into images. It opens image paths, converts them to RGB format, and stores them in a list for later use. The code then maps the image processing task onto multiple threads to increase efficiency. Finally, it collects the processed images from each thread and stores them in the imgs_seg_list variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":97-119", + "content": " image_buf reader\n \"\"\"\n try:\n img_path = image_id_path_buf[1]\n img = Image.open(img_path).convert(\"RGB\")\n image_id_path_buf[2] = img\n except:\n image_id_path_buf[2] = None\n frame_len = len(video_path)\n read_thread_num = num_seg\n for i in range(0, frame_len, read_thread_num):\n image_list_part = video_path[i:i + read_thread_num]\n image_id_path_buf_list = []\n for k in range(len(image_list_part)):\n image_id_path_buf_list.append([k, image_list_part[k], None])\n with concurrent.futures.ThreadPoolExecutor(\n max_workers=read_thread_num) as executor:\n executor.map(\n lambda image_id_path_buf: image_buf(image_id_path_buf),\n image_id_path_buf_list)\n imgs_seg_list = [x[2] for x in image_id_path_buf_list]" + }, + { + "comment": "This code handles fault-tolerant reading of images for a specified range. It checks if the image buffer is None and if so, attempts to retrieve it from other segments. If an image cannot be retrieved, it prints an error message and exits. Additionally, it appends extra segments with the last image in case there are fewer than num_segments required. 
Finally, it yields the updated imgs_seg_list for further processing in the inference_imgs_transform function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":121-143", + "content": " # add the fault-tolerant for bad image\n for k in range(len(image_id_path_buf_list)):\n img_buf = image_id_path_buf_list[k][2]\n pad_id = 1\n while pad_id < num_seg and img_buf is None:\n img_buf = imgs_seg_list[(k + pad_id) % num_seg][2]\n if img_buf is None:\n print(\"read img erro from {} to {}\".format(\n i, i + read_thread_num))\n exit(0)\n else:\n imgs_seg_list[k] = img_buf\n for pad_id in range(len(imgs_seg_list), num_seg):\n imgs_seg_list.append(imgs_seg_list[-1])\n yield imgs_seg_list\n def inference_imgs_transform(imgs_list, mode, num_seg, seglen, short_size,\\\n target_size, img_mean, img_std):\n \"\"\"\n inference_imgs_transform\n \"\"\"\n imgs_ret = imgs_transform(imgs_list, mode, num_seg, seglen," + }, + { + "comment": "This code defines a function `imgs_transform` that takes in images, mode, number of segments, segment length, short size, target size as input. It applies transformations to the images based on the given parameters and returns the transformed images. The `mapper` is defined as a partial function of `inference_imgs_transform`, with parameters such as mode, num_seg, seglen, short_size, target_size, img_mean, and img_std. Finally, the code returns the result of `paddle.reader.xmap_readers` which applies the mapper function to the reader, with given parameters like num_threads and buf_size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":144-171", + "content": " short_size, target_size, img_mean,\n img_std)\n label_ret = 0\n return imgs_ret, label_ret\n mapper = functools.partial(inference_imgs_transform,\n mode=mode,\n num_seg=num_seg,\n seglen=seglen,\n short_size=short_size,\n target_size=target_size,\n img_mean=img_mean,\n img_std=img_std)\n return paddle.reader.xmap_readers(mapper,\n reader,\n num_threads,\n buf_size,\n order=True)\ndef imgs_transform(imgs,\n mode,\n num_seg,\n seglen,\n short_size,\n target_size," + }, + { + "comment": "The code defines a function \"imgs_transform\" that takes in images and applies various transformations depending on the mode ('train' or 'test'). For training, it performs multi-scale cropping, random cropping, and random flipping. For testing, it centers crops the images. It then normalizes the images by subtracting the mean and dividing by standard deviation. 
Finally, it reshapes the images into a specific format and returns them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":172-202", + "content": " img_mean,\n img_std,\n name=''):\n \"\"\"\n imgs_transform\n \"\"\"\n imgs = group_scale(imgs, short_size)\n if mode == 'train':\n if name == \"TSM\":\n imgs = group_multi_scale_crop(imgs, short_size)\n imgs = group_random_crop(imgs, target_size)\n imgs = group_random_flip(imgs)\n else:\n imgs = group_center_crop(imgs, target_size)\n np_imgs = (np.array(imgs[0]).astype('float32').transpose(\n (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n for i in range(len(imgs) - 1):\n img = (np.array(imgs[i + 1]).astype('float32').transpose(\n (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255\n np_imgs = np.concatenate((np_imgs, img))\n imgs = np_imgs\n imgs -= img_mean\n imgs /= img_std\n imgs = np.reshape(imgs, (num_seg, seglen * 3, target_size, target_size))\n return imgs\ndef group_multi_scale_crop(img_group, target_size, scales=None, \\\n max_distort=1, fix_crop=True, more_fix_crop=True):" + }, + { + "comment": "This code generates a random crop size based on predefined scales and applies it to an input image. It ensures the generated crop size is close to the target size and adjusts the offset to maintain aspect ratio if necessary. The function also handles cases where the maximum distance between width and height is specified by the max_distort parameter. If fix_crop is False, it further adds random offsets to the selected crop size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":203-238", + "content": " \"\"\"\n group_multi_scale_crop\n \"\"\"\n scales = scales if scales is not None else [1, .875, .75, .66]\n input_size = [target_size, target_size]\n im_size = img_group[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n \"\"\"\n _sample_crop_size\n \"\"\"\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in scales]\n crop_h = [\n input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x\n for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])\n h_offset = random.randint(0, image_h - crop_pair[1])" + }, + { + "comment": "This code calculates crop positions for an image based on its width and height. It generates a list of potential cropping locations, including upper left/right, lower left/right, center, center left/right, upper left quarter, upper right quarter, and center. 
The calculations are done in case more_fix_crop is set to True, otherwise only the basic crop positions will be included.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":239-261", + "content": " else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if h_step != 0 and w_step != 0:\n ret.append((4 * w_step, 4 * h_step)) # lower right\n if h_step != 0 or w_step != 0:\n ret.append((2 * w_step, 2 * h_step)) # center\n if more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter\n ret.append((3 * w_step, 1 * h_step)) # upper right quarter" + }, + { + "comment": "The code defines a function `group_random_crop` that takes an image group and a target size as input, performs random cropping on each image in the group with different crop sizes, offsets, and resizes them to the specified target size. The cropped images are then returned in a group.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":262-297", + "content": " ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n crop_info = {\n 'crop_w': crop_pair[0],\n 'crop_h': crop_pair[1],\n 'offset_w': w_offset,\n 'offset_h': h_offset\n }\n return crop_info\n crop_info = _sample_crop_size(im_size)\n crop_w = crop_info['crop_w']\n crop_h = crop_info['crop_h']\n offset_w = crop_info['offset_w']\n offset_h = crop_info['offset_h']\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))\n for img in img_group\n ]\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n return ret_img_group\ndef group_random_crop(img_group, target_size):\n \"\"\"\n group_random_crop\n \"\"\"\n w, h = img_group[0].size\n th, tw = target_size, target_size" + }, + { + "comment": "This code is used to preprocess images by cropping, flipping, and centering them for machine learning models. The \"group_center_crop\" function crops images to a specific target size while ensuring the image dimensions are larger than the crop size. The \"group_random_flip\" function randomly flips the images horizontally with a 50% chance. 
The preprocessed images are returned in a list format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":299-337", + "content": " assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h)\n out_images = []\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)\n for img in img_group:\n if w == tw and h == th:\n out_images.append(img)\n else:\n out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return out_images\ndef group_random_flip(img_group):\n \"\"\"\n group_random_flip\n \"\"\"\n v = random.random()\n if v < 0.5:\n ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]\n return ret\n else:\n return img_group\ndef group_center_crop(img_group, target_size):\n \"\"\"\n group_center_crop\n \"\"\"\n img_crop = []\n for img in img_group:\n w, h = img.size\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h)\n x1 = int(round((w - tw) / 2.))" + }, + { + "comment": "The code defines two functions: \"crop_imgs\" and \"group_scale\". The \"crop_imgs\" function takes an image and crops it based on the provided top-left x, y coordinates, width, and height. It then appends the cropped images to a list and returns that list. The \"group_scale\" function resizes a group of images to a target size by checking if each image's dimensions already match the target size, and if not, it adjusts the dimensions using a 4:3 aspect ratio. It then appends the resized images to a list and returns that list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py\":338-365", + "content": " y1 = int(round((h - th) / 2.))\n img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n return img_crop\ndef group_scale(imgs, target_size):\n \"\"\"\n group_scale\n \"\"\"\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == target_size) or (h <= w and h == target_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = target_size\n oh = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = target_size\n ow = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n return resized_imgs" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8d127c8b-b6bb-428d-adf0-597f36b6649f.json b/docs/doc/8d127c8b-b6bb-428d-adf0-597f36b6649f.json new file mode 100644 index 000000000..760bdb453 --- /dev/null +++ b/docs/doc/8d127c8b-b6bb-428d-adf0-597f36b6649f.json @@ -0,0 +1,15 @@ +{ + "summary": "RoIAlign is a class for region of interest alignment. It takes features, regions of interest (roi), and number of roi as inputs, and uses PaddlePaddle's roi_align operation to extract aligned features.", + "details": [ + { + "comment": "RoIAlign is a class used for region of interest alignment. It takes features, regions of interest (roi), and the number of roi as inputs. The aligned parameter specifies whether to return aligned features or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_extractor.py\":0-30", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\n#@register\nclass RoIAlign(object):\n def __init__(self,\n resolution=14,\n spatial_scale=0.0625,\n sampling_ratio=0,\n aligned=False):\n super(RoIAlign, self).__init__()\n self.resolution = resolution\n self.spatial_scale = spatial_scale\n self.sampling_ratio = sampling_ratio\n self.aligned = aligned\n def __call__(self, feats, roi, rois_num):" + }, + { + "comment": "This code concatenates ROIs and ensures correct data type, then uses the PaddlePaddle library's roi_align operation to extract features from input features (feats) based on ROIs. If there is only one feature, it performs alignment for all ROIs. Otherwise, it creates a list of aligned feature tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_extractor.py\":31-52", + "content": " roi = paddle.concat(roi) if len(roi) > 1 else roi[0]\n rois_num = paddle.to_tensor(rois_num, dtype='int32')\n rois_num = paddle.cast(rois_num, dtype='int32')\n if len(feats) == 1:\n roi_feat = paddle.vision.ops.roi_align(feats,\n roi,\n rois_num,\n self.resolution,\n self.spatial_scale,\n self.sampling_ratio,\n self.aligned)\n else:\n rois_feat_list = []\n roi_feat = paddle.vision.ops.roi_align(feats,\n roi,\n rois_num,\n self.resolution,\n self.spatial_scale,\n self.sampling_ratio,\n self.aligned)\n return roi_feat" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8d2bff4b-1fa7-4560-bc92-160a274fd5a9.json b/docs/doc/8d2bff4b-1fa7-4560-bc92-160a274fd5a9.json new file mode 100644 index 000000000..305b37326 --- /dev/null +++ b/docs/doc/8d2bff4b-1fa7-4560-bc92-160a274fd5a9.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is the initialization file of PaddleVideo's partitioner module. It imports base and TransNetV2Partitioner classes, and declares them as part of the public interface (__all__). The license and copyright information are also included.", + "details": [ + { + "comment": "This code is the initialization file of PaddleVideo's partitioner module. It imports base and TransNetV2Partitioner classes, and declares them as part of the public interface (__all__). The license and copyright information are also included.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/__init__.py\":0-17", + "content": "# copyright (c) 2020 paddlepaddle authors. 
all rights reserved.\n#\n# licensed under the apache license, version 2.0 (the \"license\"\n# you may not use this file except in compliance with the license.\n# you may obtain a copy of the license at\n#\n# http://www.apache.org/licenses/license-2.0\n#\n# unless required by applicable law or agreed to in writing, software\n# distributed under the license is distributed on an \"as is\" basis,\n# without warranties or conditions of any kind, either express or implied.\n# see the license for the specific language governing permissions and\n# limitations under the license.\nfrom .base import BasePartitioner\nfrom .transnetv2_partitioner import TransNetV2Partitioner\n__all__ = ['BasePartitioner', 'TransNetV2Partitioner']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8d7f9b6a-41f7-4abc-a197-22246acbf883.json b/docs/doc/8d7f9b6a-41f7-4abc-a197-22246acbf883.json new file mode 100644 index 000000000..dd8845b52 --- /dev/null +++ b/docs/doc/8d7f9b6a-41f7-4abc-a197-22246acbf883.json @@ -0,0 +1,50 @@ +{ + "summary": "The PaddleVideo library evaluates video classification models using GAP, hit@one, precision error, and loss metrics. The `EvaluationMetrics` class accumulates these metrics per mini-batch or epoch using AveragePrecisionCalculator for GAP calculation.", + "details": [ + { + "comment": "Code snippet is a part of the PaddleVideo library, providing functions to help evaluate video classification models. It includes flattening list functionality and calculates hit at one for predictions and actuals using numpy operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":0-27", + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Provides functions to help with evaluating models.\"\"\"\nimport datetime\nimport numpy\nfrom . import mean_average_precision_calculator as map_calculator\nfrom . import average_precision_calculator as ap_calculator\ndef flatten(l):\n \"\"\" Merges a list of lists into a single list. \"\"\"\n return [item for sublist in l for item in sublist]\ndef calculate_hit_at_one(predictions, actuals):\n \"\"\"Performs a local (numpy) calculation of the hit at one." + }, + { + "comment": "This code calculates the average hit at one and precision at equal recall rate for a batch of predictions and corresponding actuals. These are metrics commonly used in evaluation of machine learning models, particularly in video classification tasks. The functions take as input two matrices: 'predictions' containing model outputs and 'actuals' with ground truth labels. 
They return the average hit at one and precision at equal recall rate across the entire batch respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":29-58", + "content": " Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average hit at one across the entire batch.\n \"\"\"\n top_prediction = numpy.argmax(predictions, 1)\n hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]\n return numpy.average(hits)\ndef calculate_precision_at_equal_recall_rate(predictions, actuals):\n \"\"\"Performs a local (numpy) calculation of the PERR.\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average precision at equal recall rate across the entire batch.\n \"\"\"\n aggregated_precision = 0.0\n num_videos = actuals.shape[0]\n for row in numpy.arange(num_videos):\n num_labels = int(numpy.sum(actuals[row]))" + }, + { + "comment": "The code calculates the global average precision (GAP) using the top_k predictions and actuals for each video. It uses a function called AveragePrecisionCalculator to calculate the metric. The function first partitions the predictions based on their values, then iterates through the top indices, adding up the correct ones to calculate item precision. Finally, it averages the item precisions across all videos to get the GAP.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":59-86", + "content": " top_indices = numpy.argpartition(predictions[row],\n -num_labels)[-num_labels:]\n item_precision = 0.0\n for label_index in top_indices:\n if predictions[row][label_index] > 0:\n item_precision += actuals[row][label_index]\n item_precision /= top_indices.size\n aggregated_precision += item_precision\n aggregated_precision /= num_videos\n return aggregated_precision\ndef calculate_gap(predictions, actuals, top_k=20):\n \"\"\"Performs a local (numpy) calculation of the global average precision.\n Only the top_k predictions are taken for each of the videos.\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n top_k: How many predictions to use per video.\n Returns:\n float: The global average precision.\n \"\"\"\n gap_calculator = ap_calculator.AveragePrecisionCalculator()" + }, + { + "comment": "This code extracts the top k predictions for each video, sorted by class. It returns a tuple containing the sparse_predictions, sparse_labels, and num_positives. The gap_calculator accumulates the flattened sparse_predictions, flattened sparse_labels, and sum of num_positives. 
Finally, it returns the average precision at n using peek_ap_at_n() from the gap_calculator.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":87-108", + "content": " sparse_predictions, sparse_labels, num_positives = top_k_by_class(\n predictions, actuals, top_k)\n gap_calculator.accumulate(flatten(sparse_predictions),\n flatten(sparse_labels), sum(num_positives))\n return gap_calculator.peek_ap_at_n()\ndef top_k_by_class(predictions, labels, k=20):\n \"\"\"Extracts the top k predictions for each video, sorted by class.\n Args:\n predictions: A numpy matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n k: the top k non-zero entries to preserve in each prediction.\n Returns:\n A tuple (predictions,labels, true_positives). 'predictions' and 'labels'\n are lists of lists of floats. 'true_positives' is a list of scalars. The\n length of the lists are equal to the number of classes. The entries in the\n predictions variable are probability predictions, and\n the corresponding entries in the labels variable are the ground truth for\n those predictions. The entries in 'true_positives' are the number of true" + }, + { + "comment": "Function evaluates predictions and labels for each video and calculates top-k triplets (prediction, class) for each class. If k is not a positive integer, it raises ValueError. It returns out_predictions, out_labels, and out_true_positives for further analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":109-134", + "content": " positives for each class in the ground truth.\n Raises:\n ValueError: An error occurred when the k is not a positive integer.\n \"\"\"\n if k <= 0:\n raise ValueError(\"k must be a positive integer.\")\n k = min(k, predictions.shape[1])\n num_classes = predictions.shape[1]\n prediction_triplets = []\n for video_index in range(predictions.shape[0]):\n prediction_triplets.extend(\n top_k_triplets(predictions[video_index], labels[video_index], k))\n out_predictions = [[] for v in range(num_classes)]\n out_labels = [[] for v in range(num_classes)]\n for triplet in prediction_triplets:\n out_predictions[triplet[0]].append(triplet[1])\n out_labels[triplet[0]].append(triplet[2])\n out_true_positives = [numpy.sum(labels[:, i]) for i in range(num_classes)]\n return out_predictions, out_labels, out_true_positives\ndef top_k_triplets(predictions, labels, k=20):\n \"\"\"Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in\n (prediction, class) format\"\"\"" + }, + { + "comment": "The code defines a class `EvaluationMetrics` to store various evaluation metrics for video classification. The `__init__` method initializes the metrics such as hit@one, precision error (perr), and loss. It also initializes two calculators: MeanAveragePrecisionCalculator and AveragePrecisionCalculator. 
The `accumulate` method updates these metrics based on predictions, labels, and loss values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":135-163", + "content": " m = len(predictions)\n k = min(k, m)\n indices = numpy.argpartition(predictions, -k)[-k:]\n return [(index, predictions[index], labels[index]) for index in indices]\nclass EvaluationMetrics(object):\n \"\"\"A class to store the evaluation metrics.\"\"\"\n def __init__(self, num_class, top_k):\n \"\"\"Construct an EvaluationMetrics object to store the evaluation metrics.\n Args:\n num_class: A positive integer specifying the number of classes.\n top_k: A positive integer specifying how many predictions are considered per video.\n Raises:\n ValueError: An error occurred when MeanAveragePrecisionCalculator cannot\n not be constructed.\n \"\"\"\n self.sum_hit_at_one = 0.0\n self.sum_perr = 0.0\n self.sum_loss = 0.0\n self.map_calculator = map_calculator.MeanAveragePrecisionCalculator(\n num_class)\n self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator()\n self.top_k = top_k\n self.num_examples = 0\n #def accumulate(self, predictions, labels, loss):" + }, + { + "comment": "The code defines a function \"accumulate\" that takes in predictions, labels and loss from a mini-batch. It calculates three metrics: mean_hit_at_one, mean_perr, and mean_loss. The function then returns a dictionary containing these metrics for the batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":164-189", + "content": " def accumulate(self, loss, predictions, labels):\n \"\"\"Accumulate the metrics calculated locally for this mini-batch.\n Args:\n predictions: A numpy matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n labels: A numpy matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n loss: A numpy array containing the loss for each sample.\n Returns:\n dictionary: A dictionary storing the metrics for the mini-batch.\n Raises:\n ValueError: An error occurred when the shape of predictions and actuals\n does not match.\n \"\"\"\n batch_size = labels.shape[0]\n mean_hit_at_one = calculate_hit_at_one(predictions, labels)\n mean_perr = calculate_precision_at_equal_recall_rate(\n predictions, labels)\n mean_loss = numpy.mean(loss)\n # Take the top 20 predictions.\n sparse_predictions, sparse_labels, num_positives = top_k_by_class(\n predictions, labels, self.top_k)" + }, + { + "comment": "This code calculates and accumulates various evaluation metrics during the epoch, including hit_at_one, perr, and loss. It then returns a dictionary with these metrics after an entire epoch of training. 
If no examples were accumulated during the epoch, it raises a ValueError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":190-218", + "content": " self.map_calculator.accumulate(sparse_predictions, sparse_labels,\n num_positives)\n self.global_ap_calculator.accumulate(flatten(sparse_predictions),\n flatten(sparse_labels),\n sum(num_positives))\n self.num_examples += batch_size\n self.sum_hit_at_one += mean_hit_at_one * batch_size\n self.sum_perr += mean_perr * batch_size\n self.sum_loss += mean_loss * batch_size\n return {\n \"hit_at_one\": mean_hit_at_one,\n \"perr\": mean_perr,\n \"loss\": mean_loss\n }\n def get(self):\n \"\"\"Calculate the evaluation metrics for the whole epoch.\n Raises:\n ValueError: If no examples were accumulated.\n Returns:\n dictionary: a dictionary storing the evaluation metrics for the epoch. The\n dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss, and\n aps (default nan).\n \"\"\"\n if self.num_examples <= 0:" + }, + { + "comment": "This code defines a class for evaluating metrics in video tagging. It calculates average hit at one, perr, and loss, as well as maps and global APs. The clear method resets the metrics to zero.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py\":219-243", + "content": " raise ValueError(\"total_sample must be positive.\")\n avg_hit_at_one = self.sum_hit_at_one / self.num_examples\n avg_perr = self.sum_perr / self.num_examples\n avg_loss = self.sum_loss / self.num_examples\n aps = self.map_calculator.peek_map_at_n()\n gap = self.global_ap_calculator.peek_ap_at_n()\n epoch_info_dict = {}\n return {\n \"avg_hit_at_one\": avg_hit_at_one,\n \"avg_perr\": avg_perr,\n \"avg_loss\": avg_loss,\n \"aps\": aps,\n \"gap\": gap\n }\n def clear(self):\n \"\"\"Clear the evaluation metrics and reset the EvaluationMetrics object.\"\"\"\n self.sum_hit_at_one = 0.0\n self.sum_perr = 0.0\n self.sum_loss = 0.0\n self.map_calculator.clear()\n self.global_ap_calculator.clear()\n self.num_examples = 0" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8d8ad343-569e-45a7-a33f-92251d72be90.json b/docs/doc/8d8ad343-569e-45a7-a33f-92251d72be90.json new file mode 100644 index 000000000..036e17203 --- /dev/null +++ b/docs/doc/8d8ad343-569e-45a7-a33f-92251d72be90.json @@ -0,0 +1,20 @@ +{ + "summary": "The code introduces the ActBert model for multimodal tasks, including training and validation steps. It utilizes a backbone function for predictions with text, video, and action scores along with sequence relationship scores. The infer_step is yet to be implemented.", + "details": [ + { + "comment": "This code snippet defines the ActBert class, which is a multimodal model framework. It registers the model under MULTIMODAL in the registry and includes a forward_net method for processing text, action, and image data. The self.backbone function is used to make predictions based on this data. The code also includes import statements, variable definitions, and a get_logger function call for logging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/actbert.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import MULTIMODAL\nfrom .base import BaseMultimodal\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@MULTIMODAL.register()\nclass ActBert(BaseMultimodal):\n \"\"\"ActBert model framework.\"\"\"\n def forward_net(self, text_ids, action_feat, image_feat, image_loc,\n token_type_ids, text_mask, image_mask, action_mask):\n pred = self.backbone(text_ids, action_feat, image_feat, image_loc,\n token_type_ids, text_mask, image_mask, action_mask)" + }, + { + "comment": "This code defines a train_step and val_step for ActBert Dataset. In the train_step, it takes input data (text_ids, action_feat, image_feat, etc.), passes them through the backbone model to get prediction scores, calculates loss, and returns a loss metric dictionary. The val_step does not appear to have any additional functionality.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/actbert.py\":27-45", + "content": " return pred\n def train_step(self, data_batch):\n \"\"\"For ActBert Dataset. Define how the model is going to train, from input to output.\n \"\"\"\n text_ids, action_feat, image_feat, image_loc, \\\n token_type_ids, text_mask, image_mask, action_mask, \\\n text_labels, action_label, next_sentence_label, image_label, image_target = data_batch\n loss_metrics = dict()\n pred = self.backbone(text_ids, action_feat, image_feat, image_loc,\n token_type_ids, text_mask, image_mask, action_mask)\n prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = pred\n total_loss = self.loss(prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \\\n text_labels, image_label, image_target, action_label, next_sentence_label)\n loss_metrics['loss'] = paddle.mean(total_loss)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"For ActBert Dataset. Define how the model is going to val, from input to output." + }, + { + "comment": "The code defines a model that takes in multiple inputs like text_ids, action_feat, image_feat, and more. It performs testing with the test_step() function and returns prediction scores for text, video, and action, along with the sequence relationship score. The infer_step() function is not implemented yet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/actbert.py\":46-63", + "content": " \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"For MSR-VTT Dataset. 
Define how the model is going to test, from input to output.\"\"\"\n text_ids, action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask = data_batch[:\n -1]\n action_feat = action_feat.squeeze(0)\n image_feat = image_feat.squeeze(0)\n image_loc = image_loc.squeeze(0)\n image_mask = image_mask.squeeze(0)\n action_mask = action_mask.squeeze(0)\n prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.forward_net(text_ids, \\\n action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask)\n return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score\n def infer_step(self, data_batch):\n pass" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8da98a26-70c5-4704-ac2b-a021a899805a.json b/docs/doc/8da98a26-70c5-4704-ac2b-a021a899805a.json new file mode 100644 index 000000000..3d35c9d07 --- /dev/null +++ b/docs/doc/8da98a26-70c5-4704-ac2b-a021a899805a.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is importing modules and creating registries for pipelines and datasets in the PaddleVideo application. The registries allow easy management of different pipeline and dataset types, making it convenient to extend or customize them later on.", + "details": [ + { + "comment": "This code is importing modules and creating registries for pipelines and datasets in the PaddleVideo application. The registries allow easy management of different pipeline and dataset types, making it convenient to extend or customize them later on.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/registry.py\":0-19", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom ..utils import Registry\nPIPELINES = Registry(\"pipeline\")\nDATASETS = Registry(\"datasets\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8dd65d2e-d515-4060-a2e7-336f403298db.json b/docs/doc/8dd65d2e-d515-4060-a2e7-336f403298db.json new file mode 100644 index 000000000..fb10ead62 --- /dev/null +++ b/docs/doc/8dd65d2e-d515-4060-a2e7-336f403298db.json @@ -0,0 +1,35 @@ +{ + "summary": "This guide details fine-tuning the VideoTag model using custom data, covering AttentionLSTM and TSN models, feature extraction, and multi/single GPU support. 
The code trains, evaluates, predicts with TSN, requires specific weight files, allows save directories, and preprocesses videos into images.", + "details": [ + { + "comment": "This is a guide for fine-tuning the VideoTag pre-trained model with custom training data, covering AttentionLSTM and TSN models, principle explanations, and reference papers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/FineTune.md\":0-31", + "content": "# \u6a21\u578b\u5fae\u8c03\u6307\u5357\n---\n## \u5185\u5bb9\n\u53c2\u8003\u672c\u6587\u6863\uff0c\u60a8\u53ef\u4ee5\u4f7f\u7528\u81ea\u5df1\u7684\u8bad\u7ec3\u6570\u636e\u5728VideoTag\u9884\u8bad\u7ec3\u6a21\u578b\u4e0a\u8fdb\u884cfine-tune\uff0c\u8bad\u7ec3\u51fa\u81ea\u5df1\u7684\u6a21\u578b\u3002\n\u6587\u6863\u5185\u5bb9\u5305\u62ec:\n- [\u539f\u7406\u89e3\u6790](#\u539f\u7406\u89e3\u6790)\n- [\u5bf9AttentionLSTM\u6a21\u578b\u8fdb\u884c\u5fae\u8c03](#\u5bf9AttentionLSTM\u6a21\u578b\u8fdb\u884c\u5fae\u8c03)\n- [\u5bf9TSN\u6a21\u578b\u8fdb\u884c\u5fae\u8c03](#\u5bf9TSN\u6a21\u578b\u8fdb\u884c\u5fae\u8c03)\n- [\u6269\u5c55\u5185\u5bb9](#\u6269\u5c55\u5185\u5bb9)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n## \u539f\u7406\u89e3\u6790\nVideoTag\u91c7\u7528\u4e24\u9636\u6bb5\u5efa\u6a21\u65b9\u5f0f\uff0c\u7531\u4e24\u4e2a\u6a21\u578b\u7ec4\u6210: TSN + AttentionLSTM\u3002\nTemporal Segment Network (TSN) \u662f\u7ecf\u5178\u7684\u57fa\u4e8e2D-CNN\u7684\u89c6\u9891\u5206\u7c7b\u6a21\u578b\u3002\u8be5\u6a21\u578b\u901a\u8fc7\u7a00\u758f\u91c7\u6837\u89c6\u9891\u5e27\u7684\u65b9\u5f0f\uff0c\u5728\u6355\u83b7\u89c6\u9891\u65f6\u5e8f\u4fe1\u606f\u7684\u540c\u65f6\u964d\u4f4e\u4e86\u8ba1\u7b97\u91cf\u3002\u8be6\u7ec6\u5185\u5bb9\u8bf7\u53c2\u8003\u8bba\u6587[Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)\nAttentionLSTM\u4ee5\u89c6\u9891\u7684\u7279\u5f81\u5411\u91cf\u4f5c\u4e3a\u8f93\u5165\uff0c\u91c7\u7528\u53cc\u5411\u957f\u77ed\u65f6\u8bb0\u5fc6\u7f51\u7edc\uff08LSTM\uff09\u5bf9\u6240\u6709\u5e27\u7279\u5f81\u8fdb\u884c\u7f16\u7801\uff0c\u5e76\u589e\u52a0Attention\u5c42\uff0c\u5c06\u6bcf\u4e2a\u65f6\u523b\u7684\u9690\u72b6\u6001\u8f93\u51fa\u4e0e\u81ea\u9002\u5e94\u6743\u91cd\u7ebf\u6027\u52a0\u6743\u5f97\u5230\u6700\u7ec8\u5206\u7c7b\u5411\u91cf\u3002\u8be6\u7ec6\u5185\u5bb9\u8bf7\u53c2\u8003\u8bba\u6587[AttentionCluster](https://arxiv.org/abs/1711.09550)\nVideoTag\u8bad\u7ec3\u65f6\u5206\u4e24\u4e2a\u9636\u6bb5: \u7b2c\u4e00\u9636\u6bb5\u4f7f\u7528\u5c11\u91cf\u89c6\u9891\u6837\u672c\uff08\u5341\u4e07\u7ea7\u522b\uff09\u8bad\u7ec3\u5927\u89c4\u6a21\u89c6\u9891\u7279\u5f81\u63d0\u53d6\u6a21\u578b(TSN)\uff1b\u7b2c\u4e8c\u9636\u6bb5\u4f7f\u7528\u5343\u4e07\u7ea7\u6570\u636e\u8bad\u7ec3\u9884\u6d4b\u5668(AttentionLSTM)\u3002\nVideoTag\u9884\u6d4b\u65f6\u4e5f\u5206\u4e24\u4e2a\u9636\u6bb5: \u7b2c\u4e00\u9636\u6bb5\u4ee5\u89c6\u9891\u6587\u4ef6\u4f5c\u4e3a\u8f93\u5165\uff0c\u7ecf\u8fc7\u53bb\u9664\u4e86\u5168\u8fde\u63a5\u5c42\u4ee5\u53ca\u635f\u5931\u51fd\u6570\u5c42\u7684TSN\u7f51\u7edc\u540e\u5f97\u5230\u8f93\u51fa\u7279\u5f81\u5411\u91cf\uff1b\u7b2c\u4e8c\u9636\u6bb5\u4ee5TSN\u7f51\u7edc\u8f93\u51fa\u7684\u7279\u5f81\u5411\u91cf\u4f5c\u4e3a\u8f93\u5165\uff0c\u7ecf\u8fc7AttentionLSTM\u540e\u5f97\u5230\u6700\u7ec8\u7684\u5206\u7c7b\u7ed3\u679c\u3002\n\u57fa\u4e8e\u6211\u4eec\u7684\u9884\u6a21\u578b\uff0c\u60a8\u53ef\u4ee5\u4f7f\u7528\u81ea\u5df1\u7684\u8bad\u7ec3\u6570\u636e\u8fdb\u884cfine-tune:\n- 
[\u5bf9AttentionLSTM\u6a21\u578b\u8fdb\u884c\u5fae\u8c03](#\u5bf9AttentionLSTM\u6a21\u578b\u8fdb\u884c\u5fae\u8c03)\n- [\u5bf9TSN\u6a21\u578b\u8fdb\u884c\u5fae\u8c03](#\u5bf9TSN\u6a21\u578b\u8fdb\u884c\u5fae\u8c03)\n## \u5bf9AttentionLSTM\u6a21\u578b\u8fdb\u884c\u5fae\u8c03\nAttentionLSTM\u4ee5\u89c6\u9891\u7279\u5f81\u4f5c\u4e3a\u8f93\u5165\uff0c\u663e\u5b58\u5360\u7528\u5c11\uff0c\u8bad\u7ec3\u901f\u5ea6\u8f83TSN\u66f4\u5feb\uff0c\u56e0\u6b64\u63a8\u8350\u4f18\u5148\u5bf9AttentionLSTM\u6a21\u578b\u8fdb\u884c\u5fae\u8c03\u3002\u8f93\u5165\u89c6\u9891\u9996\u5148\u7ecf\u8fc7TSN\u9884\u8bad\u7ec3\u6a21\u578b\u63d0\u53d6\u7279\u5f81\u5411\u91cf\uff0c\u7136\u540e\u5c06\u7279\u5f81\u5411\u91cf\u4f5c\u4e3a\u8bad\u7ec3\u8f93\u5165\u6570\u636e\uff0c\u5fae\u8c03AttentionLSTM\u6a21\u578b\u3002" + }, + { + "comment": "Extract features from TSN pre-trained model, save the extracted features in specified directory. AttentionLSTM model fine-tuning requires TSN extracted features with corresponding labels in the train.list file. Label indices are defined in a separate text file, e.g., label_3396.txt.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/FineTune.md\":33-80", + "content": "### TSN\u9884\u6a21\u578b\u63d0\u53d6\u7279\u5f81\u5411\u91cf\n#### \u6570\u636e\u51c6\u5907\n- \u9884\u8bad\u7ec3\u6743\u91cd\u4e0b\u8f7d: \u53c2\u8003[\u6837\u4f8b\u4ee3\u7801\u8fd0\u884c\u6307\u5357-\u6570\u636e\u51c6\u5907-\u9884\u8bad\u7ec3\u6743\u91cd\u4e0b\u8f7d](./Run.md)\n- \u51c6\u5907\u8bad\u7ec3\u6570\u636e: \u51c6\u5907\u597d\u5f85\u8bad\u7ec3\u7684\u89c6\u9891\u6570\u636e\uff0c\u5e76\u5728video\\_tag/data/TsnExtractor.list\u6587\u4ef6\u4e2d\u6307\u5b9a\u5f85\u8bad\u7ec3\u7684\u6587\u4ef6\u8def\u5f84\uff0c\u5185\u5bb9\u683c\u5f0f\u5982\u4e0b:\n```\nmy_video_path/my_video_file1.mp4\nmy_video_path/my_video_file2.mp4\n...\n```\n#### \u7279\u5f81\u63d0\u53d6\n\u7279\u5f81\u63d0\u53d6\u811a\u672c\u5982\u4e0b:\n```\npython tsn_extractor.py --model_name=TSN --config=./configs/tsn.yaml --weights=./weights/tsn.pdparams\n```\n- \u901a\u8fc7--weights\u53ef\u6307\u5b9aTSN\u6743\u91cd\u53c2\u6570\u7684\u5b58\u50a8\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3avideo\\_tag/weights/tsn.pdparams\n- \u901a\u8fc7--save\\_dir\u53ef\u6307\u5b9a\u7279\u5f81\u5411\u91cf\u4fdd\u5b58\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3avideo\\_tag/data/tsn\\_features\uff0c\u4e0d\u540c\u8f93\u5165\u89c6\u9891\u7684\u7279\u5f81\u5411\u91cf\u63d0\u53d6\u7ed3\u679c\u5206\u6587\u4ef6\u4fdd\u5b58\u5728\u4e0d\u540c\u7684npy\u6587\u4ef6\u4e2d\uff0c\u76ee\u5f55\u5f62\u5f0f\u4e3a:\n```\nvideo_tag\n \u251c\u2500\u2500data\n \u251c\u2500\u2500tsn_features\n \u251c\u2500\u2500 my_feature_file1.npy\n \u251c\u2500\u2500 my_feature_file2.npy\n ...\n```\n- tsn\u63d0\u53d6\u7684\u7279\u5f81\u5411\u91cf\u7ef4\u5ea6\u4e3a```\u5e27\u6570*\u7279\u5f81\u7ef4\u5ea6```\uff0c\u9ed8\u8ba4\u4e3a300 * 2048\u3002\n### AttentionLSTM\u6a21\u578bFine-tune\n#### \u6570\u636e\u51c6\u5907\nVideoTag\u4e2d\u7684AttentionLSTM\u4ee5TSN\u6a21\u578b\u63d0\u53d6\u7684\u7279\u5f81\u5411\u91cf\u4f5c\u4e3a\u8f93\u5165\u3002\u5728video\\_tag/data/dataset/attention\\_lstm/train.list\u6587\u4ef6\u4e2d\u6307\u5b9a\u5f85\u8bad\u7ec3\u7684\u6587\u4ef6\u8def\u5f84\u548c\u5bf9\u5e94\u7684\u6807\u7b7e\uff0c\u5185\u5bb9\u683c\u5f0f\u5982\u4e0b:\n```\nmy_feature_path/my_feature_file1.npy label1 label2\nmy_feature_path/my_feature_file2.npy label1\n...\n```\n- 
\u4e00\u4e2a\u8f93\u5165\u89c6\u9891\u53ef\u4ee5\u6709\u591a\u4e2a\u6807\u7b7e\uff0c\u6807\u7b7e\u7d22\u5f15\u4e3a\u6574\u578b\u6570\u636e\uff0c\u6587\u4ef6\u540d\u4e0e\u6807\u7b7e\u4e4b\u95f4\u3001\u591a\u4e2a\u6807\u7b7e\u4e4b\u95f4\u4ee5\u4e00\u4e2a\u7a7a\u683c\u5206\u9694\uff1b\n- \u6807\u7b7e\u7d22\u5f15\u4e0e\u6807\u7b7e\u540d\u79f0\u7684\u4e4b\u95f4\u7684\u5bf9\u5e94\u5173\u7cfb\u4ee5list\u6587\u4ef6\u6307\u5b9a\uff0c\u53ef\u53c2\u8003VideoTag\u7528\u5230\u7684label_3396.txt\u6587\u4ef6\u6784\u9020\uff0c\u884c\u7d22\u5f15\u5bf9\u5e94\u6807\u7b7e\u7d22\u5f15;" + }, + { + "comment": "This code chunk is for fine-tuning the AttentionLSTM model in PaddleVideo's VideoTag application. It provides instructions for training the model with multiple GPUs or a single GPU, and specifies the configuration file and pretrained weights required. The code also demonstrates how to evaluate the trained model using eval.py script. The precision metrics printed include GAP and Hit@1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/FineTune.md\":82-112", + "content": "- \u9a8c\u8bc1\u96c6\u3001\u6d4b\u8bd5\u96c6\u4ee5\u53ca\u9884\u6d4b\u6570\u636e\u96c6\u7684\u6784\u9020\u65b9\u5f0f\u540c\u8bad\u7ec3\u96c6\u7c7b\u4f3c\uff0c\u4ec5\u9700\u8981\u5728video\\_tag/data/attention\\_lstm/\u76ee\u5f55\u4e0b\u5bf9\u5e94\u7684list\u6587\u4ef6\u4e2d\u6307\u5b9a\u76f8\u5173\u6587\u4ef6\u8def\u5f84/\u6807\u7b7e\u5373\u53ef\u3002\n#### \u6a21\u578b\u8bad\u7ec3\n\u4f7f\u7528VideoTag\u4e2d\u7684AttentionLSTM\u9884\u6a21\u578b\u8fdb\u884cfine-tune\u8bad\u7ec3\u811a\u672c\u5982\u4e0b:\n```\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython train.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --pretrain=./weights/attention_lstm\n```\n- AttentionLSTM\u6a21\u578b\u9ed8\u8ba4\u4f7f\u75288\u5361\u8bad\u7ec3\uff0c\u603b\u7684batch size\u6570\u662f1024\u3002\u82e5\u4f7f\u7528\u5355\u5361\u8bad\u7ec3\uff0c\u8bf7\u4fee\u6539\u73af\u5883\u53d8\u91cf\uff0c\u811a\u672c\u5982\u4e0b:\n```\nexport CUDA_VISIBLE_DEVICES=0\npython train.py --model_name=AttentionLSTM --config=./configs/attention_lstm-single.yaml --pretrain=./weights/attention_lstm\n```\n- \u8bf7\u786e\u4fdd\u8bad\u7ec3\u6837\u672c\u6570\u5927\u4e8ebatch_size\u6570\n- \u901a\u8fc7--pretrain\u53c2\u6570\u53ef\u6307\u5b9aAttentionLSTM\u9884\u8bad\u7ec3\u6a21\u578b\u7684\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3a./weights/attention\\_lstm\uff1b\n- \u6a21\u578b\u76f8\u5173\u914d\u7f6e\u5199\u5728video_tag/configs/attention\\_lstm.yaml\u6587\u4ef6\u4e2d\uff0c\u53ef\u4ee5\u65b9\u4fbf\u7684\u8c03\u8282\u5404\u9879\u8d85\u53c2\u6570\uff1b\n- \u901a\u8fc7--save_dir\u53c2\u6570\u53ef\u6307\u5b9a\u8bad\u7ec3\u6a21\u578b\u53c2\u6570\u7684\u4fdd\u5b58\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3a./data/checkpoints\uff1b\n#### \u6a21\u578b\u8bc4\u4f30\n\u53ef\u7528\u5982\u4e0b\u65b9\u5f0f\u8fdb\u884c\u6a21\u578b\u8bc4\u4f30:\n```\npython eval.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --weights=./data/checkpoints/AttentionLSTM_epoch9.pdparams\n```\n- \u901a\u8fc7--weights\u53c2\u6570\u53ef\u6307\u5b9a\u8bc4\u4f30\u9700\u8981\u7684\u6743\u91cd\uff0c\u9ed8\u8ba4\u4e3a./data/checkpoints/AttentionLSTM_epoch9.pdparams\uff1b\n- \u8bc4\u4f30\u7ed3\u679c\u4ee5log\u7684\u5f62\u5f0f\u76f4\u63a5\u6253\u5370\u8f93\u51faGAP\u3001Hit@1\u7b49\u7cbe\u5ea6\u6307\u6807\u3002" + }, + { + "comment": "This code provides instructions for model inference and fine-tuning using the PaddleVideo framework's VideoTag application. 
It explains how to specify the model, configuration file, weights, label files, and save directory for prediction results. Additionally, it outlines the steps for preparing data, training, and executing a pre-trained TSN model in the VideoTag application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/FineTune.md\":114-151", + "content": "#### \u6a21\u578b\u63a8\u65ad\n\u53ef\u7528\u5982\u4e0b\u65b9\u5f0f\u8fdb\u884c\u6a21\u578b\u63a8\u65ad:\n```\npython predict.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --weights=./data/checkpoints/AttentionLSTM_epoch9.pdparams\n```\n- \u901a\u8fc7--weights\u53c2\u6570\u53ef\u6307\u5b9a\u63a8\u65ad\u9700\u8981\u7684\u6743\u91cd\uff0c\u9ed8\u8ba4\u4e3a./data/checkpoints/AttentionLSTM_epoch9.pdparams\uff1b\n- \u901a\u8fc7--label_file\u53c2\u6570\u6307\u5b9a\u6807\u7b7e\u6587\u4ef6\uff0c\u8bf7\u6839\u636e\u81ea\u5df1\u7684\u6570\u636e\u4fee\u6539\uff0c\u9ed8\u8ba4\u4e3a./label_3396.txt;\n- \u9884\u6d4b\u7ed3\u679c\u4f1a\u4ee5\u65e5\u5fd7\u5f62\u5f0f\u6253\u5370\u51fa\u6765\uff0c\u540c\u65f6\u4e5f\u4fdd\u5b58\u5728json\u6587\u4ef6\u4e2d\uff0c\u901a\u8fc7--save_dir\u53c2\u6570\u53ef\u6307\u5b9a\u9884\u6d4b\u7ed3\u679c\u4fdd\u5b58\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3a./data/predict_results/attention_lstm\u3002\n## \u5bf9TSN\u6a21\u578b\u8fdb\u884c\u5fae\u8c03\nVideoTag\u4e2d\u4f7f\u7528\u7684TSN\u6a21\u578b\u4ee5mp4\u6587\u4ef6\u4e3a\u8f93\u5165\uff0cbackbone\u4e3aResNet101\u3002\n### \u6570\u636e\u51c6\u5907\n\u51c6\u5907\u597d\u8bad\u7ec3\u89c6\u9891\u6587\u4ef6\u540e\uff0c\u5728video\\_tag/data/dataset/tsn/train.list\u6587\u4ef6\u4e2d\u6307\u5b9a\u5f85\u8bad\u7ec3\u7684\u6587\u4ef6\u8def\u5f84\u548c\u5bf9\u5e94\u7684\u6807\u7b7e\u5373\u53ef\uff0c\u5185\u5bb9\u683c\u5f0f\u5982\u4e0b:\n```\nmy_video_path/my_video_file1.mp4 label1\nmy_video_path/my_video_file2.mp4 label2\n...\n```\n- \u4e00\u4e2a\u8f93\u5165\u89c6\u9891\u53ea\u80fd\u6709\u4e00\u4e2a\u6807\u7b7e\uff0c\u6807\u7b7e\u7d22\u5f15\u4e3a\u6574\u578b\u6570\u636e\uff0c\u6807\u7b7e\u7d22\u5f15\u4e0e\u6587\u4ef6\u540d\u4e4b\u95f4\u4ee5\u4e00\u4e2a\u7a7a\u683c\u5206\u9694\uff1b\n- \u9a8c\u8bc1\u96c6\u3001\u6d4b\u8bd5\u96c6\u4ee5\u53ca\u9884\u6d4b\u6570\u636e\u96c6\u7684\u6784\u9020\u65b9\u5f0f\u540c\u8bad\u7ec3\u96c6\u7c7b\u4f3c\uff0c\u4ec5\u9700\u8981\u5728video\\_tag/data/dataset/tsn\u76ee\u5f55\u4e0b\u5bf9\u5e94\u7684list\u6587\u4ef6\u4e2d\u6307\u5b9a\u76f8\u5173\u6587\u4ef6\u8def\u5f84/\u6807\u7b7e\u5373\u53ef\u3002\n#### \u6a21\u578b\u8bad\u7ec3\n\u4f7f\u7528VideoTag\u4e2d\u7684TSN\u9884\u6a21\u578b\u8fdb\u884cfine-tune\u8bad\u7ec3\u811a\u672c\u5982\u4e0b:\n```\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython train.py --model_name=TSN --config=./configs/tsn.yaml --pretrain=./weights/tsn\n```\n- TSN\u6a21\u578b\u9ed8\u8ba4\u4f7f\u75288\u5361\u8bad\u7ec3\uff0c\u603b\u7684batch size\u6570\u662f256\u3002\u82e5\u4f7f\u7528\u5355\u5361\u8bad\u7ec3\uff0c\u8bf7\u4fee\u6539\u73af\u5883\u53d8\u91cf\uff0c\u811a\u672c\u5982\u4e0b:\n```" + }, + { + "comment": "This code is for training, evaluating and predicting with the TSN model. It uses different Python scripts (train.py, eval.py, and predict.py) along with a configuration file (tsn.yaml). The TSN model requires specific weight files saved at certain locations. It also has options to specify save directories for checkpoints, evaluation results, and prediction outputs. 
To speed up the training process, videos can be preprocessed into images before training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/FineTune.md\":152-187", + "content": "export CUDA_VISIBLE_DEVICES=0\npython train.py --model_name=TSN --config=./configs/tsn-single.yaml --pretrain=./weights/tsn\n```\n- \u901a\u8fc7--pretrain\u53c2\u6570\u53ef\u6307\u5b9aTSN\u9884\u8bad\u7ec3\u6a21\u578b\u7684\u8def\u5f84\uff0c\u793a\u4f8b\u4e3a./weights/tsn\uff1b\n- \u6a21\u578b\u76f8\u5173\u914d\u7f6e\u5199\u5728video_tag/configs/tsn.yaml\u6587\u4ef6\u4e2d\uff0c\u53ef\u4ee5\u65b9\u4fbf\u7684\u8c03\u8282\u5404\u9879\u8d85\u53c2\u6570\uff1b\n- \u901a\u8fc7--save_dir\u53c2\u6570\u53ef\u6307\u5b9a\u8bad\u7ec3\u6a21\u578b\u53c2\u6570\u7684\u4fdd\u5b58\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3a./data/checkpoints\uff1b\n#### \u6a21\u578b\u8bc4\u4f30\n\u53ef\u7528\u5982\u4e0b\u65b9\u5f0f\u8fdb\u884c\u6a21\u578b\u8bc4\u4f30:\n```\npython eval.py --model_name=TSN --config=./configs/tsn.yaml --weights=./data/checkpoints/TSN_epoch44.pdparams\n```\n- \u901a\u8fc7--weights\u53c2\u6570\u53ef\u6307\u5b9a\u8bc4\u4f30\u9700\u8981\u7684\u6743\u91cd\uff0c\u793a\u4f8b\u4e3a./data/checkpoints/TSN_epoch44.pdparams\uff1b\n- \u8bc4\u4f30\u7ed3\u679c\u4ee5log\u7684\u5f62\u5f0f\u76f4\u63a5\u6253\u5370\u8f93\u51faTOP1_ACC\u3001TOP5_ACC\u7b49\u7cbe\u5ea6\u6307\u6807\u3002\n#### \u6a21\u578b\u63a8\u65ad\n\u53ef\u7528\u5982\u4e0b\u65b9\u5f0f\u8fdb\u884c\u6a21\u578b\u63a8\u65ad:\n```\npython predict.py --model_name=TSN --config=./configs/tsn.yaml --weights=./data/checkpoints/TSN_epoch44.pdparams --save_dir=./data/predict_results/tsn/\n```\n- \u901a\u8fc7--weights\u53c2\u6570\u53ef\u6307\u5b9a\u63a8\u65ad\u9700\u8981\u7684\u6743\u91cd\uff0c\u793a\u4f8b\u4e3a./data/checkpoints/TSN_epoch44.pdparams\uff1b\n- \u901a\u8fc7--label_file\u53c2\u6570\u6307\u5b9a\u6807\u7b7e\u6587\u4ef6\uff0c\u8bf7\u6839\u636e\u81ea\u5df1\u7684\u6570\u636e\u4fee\u6539\uff0c\u9ed8\u8ba4\u4e3a./label_3396.txt;\n- \u9884\u6d4b\u7ed3\u679c\u4f1a\u4ee5\u65e5\u5fd7\u5f62\u5f0f\u6253\u5370\u51fa\u6765\uff0c\u540c\u65f6\u4e5f\u4fdd\u5b58\u5728json\u6587\u4ef6\u4e2d\uff0c\u901a\u8fc7--save_dir\u53c2\u6570\u53ef\u6307\u5b9a\u9884\u6d4b\u7ed3\u679c\u4fdd\u5b58\u8def\u5f84\uff0c\u793a\u4f8b\u4e3a./data/predict_results/tsn\u3002\n### \u8bad\u7ec3\u52a0\u901f\nTSN\u6a21\u578b\u9ed8\u8ba4\u4ee5mp4\u7684\u89c6\u9891\u6587\u4ef6\u4f5c\u4e3a\u8f93\u5165\uff0c\u8bad\u7ec3\u65f6\u9700\u8981\u5148\u5bf9\u89c6\u9891\u6587\u4ef6\u89e3\u7801\uff0c\u518d\u5c06\u89e3\u7801\u540e\u7684\u6570\u636e\u9001\u5165\u7f51\u7edc\u8fdb\u884c\u8bad\u7ec3\uff0c\u5982\u679c\u89c6\u9891\u6587\u4ef6\u5f88\u5927\uff0c\u8fd9\u4e2a\u8fc7\u7a0b\u5c06\u4f1a\u5f88\u8017\u65f6\u3002\n\u4e3a\u52a0\u901f\u8bad\u7ec3\uff0c\u53ef\u4ee5\u5148\u5c06\u89c6\u9891\u89e3\u7801\u6210\u56fe\u7247\uff0c\u7136\u540e\u4fdd\u5b58\u4e0b\u6765\uff0c\u8bad\u7ec3\u65f6\u76f4\u63a5\u6839\u636e\u7d22\u5f15\u8bfb\u53d6\u5e27\u56fe\u7247\u4f5c\u4e3a\u8f93\u5165\uff0c\u52a0\u5feb\u8bad\u7ec3\u8fc7\u7a0b\u3002" + }, + { + "comment": "The code is preparing the data by decoding videos into frames and generating a file path list for these frames. It modifies the configuration file, changing the model format to \"frames\" and updating the filelist accordingly. 
Additional information about TSN and AttentionLSTM models can be found in their respective PaddleCV repositories, with references provided to the original papers as well.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/FineTune.md\":189-205", + "content": "- \u6570\u636e\u51c6\u5907: \u9996\u5148\u5c06\u89c6\u9891\u89e3\u7801\uff0c\u5b58\u6210\u5e27\u56fe\u7247\uff1b\u7136\u540e\u751f\u6210\u5e27\u56fe\u7247\u7684\u6587\u4ef6\u8def\u5f84\u5217\u8868\u3002\u5b9e\u73b0\u8fc7\u7a0b\u53ef\u53c2\u8003[ucf-101\u6570\u636e\u51c6\u5907](../../../../dygraph/tsn/data/dataset/ucf101/README.md)\n- \u4fee\u6539\u914d\u7f6e\u6587\u4ef6: \u4fee\u6539\u914d\u7f6e\u6587\u4ef6./config/tsn.yaml\uff0c\u5176\u4e2dMODEL.format\u503c\u6539\u4e3a\"frames\"\uff0c\u4e0d\u540c\u6a21\u5f0f\u4e0b\u7684filelist\u503c\u6539\u4e3a\u5bf9\u5e94\u7684\u5e27\u56fe\u7247\u6587\u4ef6list\u3002\n## \u6269\u5c55\u5185\u5bb9\n- \u66f4\u591a\u5173\u4e8eTSN\u6a21\u578b\u7684\u5185\u5bb9\u53ef\u53c2\u8003PaddleCV\u89c6\u9891\u5e93[TSN\u89c6\u9891\u5206\u7c7b\u6a21\u578b](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/video/models/tsn/README.md)\u3002\n- \u66f4\u591a\u5173\u4e8eAttentionLSTM\u6a21\u578b\u7684\u5185\u5bb9\u53ef\u53c2\u8003PaddleCV\u89c6\u9891\u5e93[AttentionLSTM\u89c6\u9891\u5206\u7c7b\u6a21\u578b](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/video/models/attention_lstm)\u3002\n## \u53c2\u8003\u8bba\u6587\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool\n- [Beyond Short Snippets: Deep Networks for Video Classification](https://arxiv.org/abs/1503.08909) Joe Yue-Hei Ng, Matthew Hausknecht, Sudheendra Vijayanarasimhan, Oriol Vinyals, Rajat Monga, George Toderici" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8dddd838-dabc-4192-8aba-da185088ea18.json b/docs/doc/8dddd838-dabc-4192-8aba-da185088ea18.json new file mode 100644 index 000000000..ce83f6090 --- /dev/null +++ b/docs/doc/8dddd838-dabc-4192-8aba-da185088ea18.json @@ -0,0 +1,20 @@ +{ + "summary": "The text outlines the process of preparing UCF24 dataset with PaddleVideo's build_split.py command, installing unrar tool and providing a download script for ease of access. The file structure contains video sequences and split files for training and testing.", + "details": [ + { + "comment": "Introduction to UCF24 dataset preparation process, including download of RGB frame and annotation files. PaddleVideo provides a download script for easier access. Requires unrar tool installation. RGB frames stored in rgb-images directory, annotations in labels directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf24.md\":0-19", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/ucf24.md)\n# UCF24 Data Preparation\nThis document mainly introduces the preparation process of UCF24 dataset. It mainly includes the download of the RGB frame files, the annotation files and the pathlist of the generated file.\n---\n## 1. Data Download\nDetailed information on UCF24 data can be found on the website [UCF24](http://www.thumos.info/download.html). 
For ease of use, PaddleVideo provides a download script for the RGB frame, annotation file of the UCF24 data.\nFirst, please ensure access to the [data/ucf24/ directory](../../../data/ucf24) and enter the following command for downloading the RGB frame, annotation file of the UCF24 dataset.\n```shell\nbash download_frames_annotations.sh\n```\n- To run this command you need to install the unrar decompression tool, which can be installed using the pip method.\n- The RGB frame files will be stored in the [data/ucf24/rgb-images/ directory](../../../data/ucf24/rgb-images)\n- The annotation files will be stored in the [data/ucf24/lables/ directory](../../../data/ucf24/labels)" + }, + { + "comment": "This code describes the process of generating file path lists and the resulting folder structure for UCF24 dataset preparation using PaddleVideo's build_split.py command with the raw_path parameter, dividing data into groundtruths_ucf, labels, and rgb-images subfolders containing video clips and corresponding files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf24.md\":21-59", + "content": "---\n## 2. File Pathlist Generation\nTo specify the format for dividing the file, enter the following command\n```python\npython build_split.py --raw_path ./splitfiles\n```\n**Description of parameters**\n`--raw_path`\uff1a indicates the storage path of the original division file\n# Folder Structure\nAfter the whole data pipeline for UCF24 preparation, the folder structure will look like:\n```\n\u251c\u2500\u2500 data\n\u2502 \u251c\u2500\u2500 ucf24\n\u2502 | \u251c\u2500\u2500 groundtruths_ucf\n\u2502 | \u251c\u2500\u2500 labels\n\u2502 | | \u251c\u2500\u2500 Basketball\n\u2502 | | | \u251c\u2500\u2500 v_Basketball_g01_c01\n\u2502 | | | | \u251c\u2500\u2500 00009.txt\n\u2502 | | | | \u251c\u2500\u2500 00010.txt\n\u2502 | | | | \u251c\u2500\u2500 ...\n\u2502 | | | | \u251c\u2500\u2500 00050.txt\n\u2502 | | | | \u251c\u2500\u2500 00051.txt\n\u2502 | | \u251c\u2500\u2500 ...\n\u2502 | | \u251c\u2500\u2500 WalkingWithDog\n\u2502 | | | \u251c\u2500\u2500 v_WalkingWithDog_g01_c01\n\u2502 | | | \u251c\u2500\u2500 ...\n\u2502 | | | \u251c\u2500\u2500 v_WalkingWithDog_g25_c04\n\u2502 | \u251c\u2500\u2500 rgb-images\n\u2502 | | \u251c\u2500\u2500 Basketball\n\u2502 | | | \u251c\u2500\u2500 v_Basketball_g01_c01\n\u2502 | | | | \u251c\u2500\u2500 00001.jpg\n\u2502 | | | | \u251c\u2500\u2500 00002.jpg\n\u2502 | | | | \u251c\u2500\u2500 ..." 
+ }, + { + "comment": "The code represents the file structure of the UCF24 dataset, containing various video sequences and split files for training and testing purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf24.md\":60-72", + "content": "\u2502 | | | | \u251c\u2500\u2500 00140.jpg\n\u2502 | | | | \u251c\u2500\u2500 00141.jpg\n\u2502 | | \u251c\u2500\u2500 ...\n\u2502 | | \u251c\u2500\u2500 WalkingWithDog\n\u2502 | | | \u251c\u2500\u2500 v_WalkingWithDog_g01_c01\n\u2502 | | | \u251c\u2500\u2500 ...\n\u2502 | | | \u251c\u2500\u2500 v_WalkingWithDog_g25_c04\n\u2502 | \u251c\u2500\u2500 splitfiles\n\u2502 | | \u251c\u2500\u2500 trainlist01.txt\n\u2502 | | |\u2500\u2500 testlist01.txt \n\u2502 | \u251c\u2500\u2500 trainlist.txt\n\u2502 | |\u2500\u2500 testlist.txt \n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8e1104d6-0fb2-4891-9ca2-0f69f5577975.json b/docs/doc/8e1104d6-0fb2-4891-9ca2-0f69f5577975.json new file mode 100644 index 000000000..7db32c500 --- /dev/null +++ b/docs/doc/8e1104d6-0fb2-4891-9ca2-0f69f5577975.json @@ -0,0 +1,10 @@ +{ + "summary": "These comments provide author, date, and copyright information, indicating the source should be cited if the code is reprinted.", + "details": [ + { + "comment": "These comments provide author, date, and copyright information, indicating the source should be cited if the code is reprinted.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/setup.py\":0-3", + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/11\n# Copyright belongs to the author.\n# Please indicate the source for reprinting." + } + ] +} \ No newline at end of file diff --git a/docs/doc/8ea88484-a9f1-4daa-8c3d-5b3291deee68.json b/docs/doc/8ea88484-a9f1-4daa-8c3d-5b3291deee68.json new file mode 100644 index 000000000..522c9d49b --- /dev/null +++ b/docs/doc/8ea88484-a9f1-4daa-8c3d-5b3291deee68.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a custom exception \"ReaderNotFoundError\" and manages reader instances using a singleton ReaderZoo object. The `regist_reader` function registers new readers, while the `get_reader` function retrieves and returns an instance of the requested reader based on name, mode, and configuration.", + "details": [ + { + "comment": "The code defines a class \"ReaderNotFoundError\" which is an exception to handle situations when a reader is not found. It takes the name of the missing reader and a list of available readers as arguments, and provides a formatted error message with the missing reader's name and a list of available readers. This can be useful for raising custom errors in cases where the required reader cannot be found or is not compatible with the provided options.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py\":0-29", + "content": "\"\"\"\nreader utils\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nclass ReaderNotFoundError(Exception):\n \"Error: reader not found\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)\n for reader in self.avail_readers:" + }, + { + "comment": "This code defines classes for data readers and a reader registry. The `DataReader` class is a base class for different video input data readers, while the `ReaderZoo` class manages a dictionary of registered readers. The code snippet includes methods to register readers and retrieve them by name, mode, and configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py\":30-72", + "content": " msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"data reader for video input\"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"Not implemented\"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n \"\"\"ReaderZoo\n \"\"\"\n def __init__(self):\n self.reader_zoo = {}\n def regist(self, name, reader):\n \"\"\"regist\n \"\"\"\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg):\n \"\"\"get\n \"\"\"\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg)" + }, + { + "comment": "This code snippet is responsible for managing reader instances, using a singleton ReaderZoo object. 
The `regist_reader` function allows registration of new readers, while the `get_reader` function retrieves and returns an instance of the requested reader based on the provided name, mode, and configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py\":73-90", + "content": " raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo\nreader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n \"\"\"regist_reader\n \"\"\"\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg):\n \"\"\"get_reader\n \"\"\"\n reader_model = reader_zoo.get(name, mode, cfg)\n return reader_model.create_reader()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8ed1f1ea-7e25-4cda-be10-917e93f1c559.json b/docs/doc/8ed1f1ea-7e25-4cda-be10-917e93f1c559.json new file mode 100644 index 000000000..569bea0de --- /dev/null +++ b/docs/doc/8ed1f1ea-7e25-4cda-be10-917e93f1c559.json @@ -0,0 +1,40 @@ +{ + "summary": "The ADDS-DepthNet document describes a self-supervised depth estimation model trained on day and night images: it lists the scikit-image and matplotlib dependencies, uses the Oxford RobotCar dataset, explains how to add the Resnet18_Imagenet pre-trained backbone, and provides training and testing instructions plus a download URL for the trained weights. It also demonstrates inference with the predict.py tool, which saves results as pseudo-colored depth maps, and shows a sample RGB image with its estimated depth map.", + "details": [ + { + "comment": "This code is for the ADDS-DepthNet model, which is based on a self-supervised monocular depth estimation paper by Baidu Robotics and Autonomous Driving Laboratory. The code utilizes day and night images to reproduce the model and achieve advanced depth estimation results on the Oxford RobotCar dataset, mitigating the impact of lighting changes between day and night images. Additional dependencies like scikit-image and matplotlib are required before using the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md\":0-22", + "content": "[Simplified Chinese](../../../zh-CN/model_zoo/estimation/adds.md) | English\n# ADDS-DepthNet model\n## content\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install scikit-image\npython -m pip install matplotlib\n```\n## Introduction\nThis model is based on the ICCV 2021 paper **[Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628)** of Baidu Robotics and Autonomous Driving Laboratory,\nThe self-supervised monocular depth estimation model based on day and night images is reproduced, which utilizes the complementary nature of day and night image data, and slows down the large domain shift of day and night images and the accuracy of depth estimation caused by lighting changes. Impact, the most advanced depth estimation results of all-sky images have been achieved on the challenging Oxford RobotCar data set." + }, + { + "comment": "This code provides instructions for downloading a pre-trained backbone and adding it before training on the Oxford RobotCar dataset. It describes how to download the pre-training model, Resnet18_Imagenet.pdparams, with the wget command and how to specify its path in the adds.yaml file. 
The code also highlights the importance of filling in the correct fields in the configuration file for proper association with the relevant model types and frameworks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md\":25-48", + "content": "## Data\nFor data download and preparation of Oxford RobotCar dataset, please refer to [Oxford RobotCar dataset data preparation](../../dataset/Oxford_RobotCar.md)\n## Train\n### Oxford RobotCar dataset training\n#### Download and add pre-trained models\n1. Download the image pre-training model [resnet18.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams\n ```\n2. Open `PaddleVideo/configs/estimation/adds/adds.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL: #MODEL field\n framework: \"DepthEstimator\" #Mandatory, indicate the type of network, associate to the'paddlevideo/modeling/framework/'.\n backbone: #Mandatory, indicate the type of backbone, associate to the'paddlevideo/modeling/backbones/'.\n name: 'ADDS_DepthNet'" + }, + { + "comment": "This code snippet provides instructions for training and testing the ADDS-DepthNet model using the Oxford RobotCar dataset. The provided commands initiate the training process with specific configuration file (`configs/estimation/adds/adds.yaml`) and seed value (20). Testing involves running separate commands to test day and night data sets, then recording their respective indicators. A download URL for a pre-trained model is also provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md\":49-71", + "content": " pretrained: fill in the path here\n ```\n#### Start training\n- The Oxford RobotCar dataset uses a single card for training, and the starting command for the training method is as follows:\n ```bash\n python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20\n ```\n## Test\n- The ADDS-DepthNet model is verified synchronously during training (only the day or night data is verified). You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:\n ```bash\n Already save the best model (rmse)8.5531\n ```\n- Because the model can only test one day or night data set at a given path in the yaml file at a time, to get the complete test score at the beginning of this document, you need to run 4 test commands and record their indicators ( 40m during the day, 60m during the day, 40m at night, 60m at night)\n- Download URL of the trained model: [ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams)" + }, + { + "comment": "The code provides test commands for running the ADDS model on the Oxford RobotCar dataset with varying maximum ground truth depth limits and different light conditions (night and daytime). 
It uses Python 3.7 to execute the main.py file from the PaddleVideo library, configs/estimation/adds/adds.yaml configuration, and specific dataset files for testing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md\":73-89", + "content": "- The test commands are as follows:\n ```bash\n # Night 40m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_night_files.txt\" -o MODEL.head.max_gt_depth=40\n # Night 60m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_night_files.txt\" -o MODEL.head.max_gt_depth=60\n # Daytime 40m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_day_files.txt\" -o MODEL.head.max_gt_depth=40\n # Daytime 60m\n python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w \"output/ADDS/ADDS_best.pdparams\" -o DATASET.test.file_path=\"data/oxford/splits/oxford_day/val_day_files.txt\" -o MODEL.head.max_gt_depth=60\n ```\n The test indicators on the validation dataset of Oxford RobotCar dataset are as follows:" + }, + { + "comment": "The code presents a table comparing performance metrics of different models under various conditions. It shows the version, maximum depth, and several error measures for each model. The table also includes whether or not the delta value is less than 1.25 raised to different powers. The text describes how to run a command to export an inference model using Python script with specific configuration file, pre-trained parameters, and output directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md\":91-106", + "content": " | version | Max Depth | Abs Rel | Sq Rel | RMSE | RMSE log | | | |\n | ----------- | --------- | ------- | ------ | ----- | ------- | ----------------- |------------------- | ------------------- |\n | ours(night) | 40 | 0.209 | 1.741 | 6.031 | 0.243 | 0.708 | 0.923 | 0.975 |\n | ours(night) | 60 | 0.207 | 2.052 | 7.888 | 0.258 | 0.686 | 0.909 | 0.970 |\n | ours(day) | 40 | 0.114 | 0.574 | 3.411 | 0.157 | 0.860 | 0.977 | 0.993 |\n | ours(day) | 60 | 0.119 | 0.793 | 4.842 | 0.173 | 0.838 | 0.967 | 0.991 |\n## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/estimation/adds/adds.yaml -p data/ADDS_car.pdparams -o inference/ADDS\n```\nThe above command will" + }, + { + "comment": "This code snippet demonstrates the usage of PaddlePaddle's predict.py tool for model inference. It uses a pre-trained model, ADDS, to estimate depth maps from input images. The model files are specified using the --model_file and --params_files parameters, while the input image file is provided with --input_file. The command also includes options for GPU usage (--use_gpu) and TensorRT acceleration (--use_tensorrt). 
The inference results will be saved as pseudo-colored depth maps by default.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md\":106-123", + "content": " generate the model structure file `ADDS.pdmodel` and model weight files `ADDS.pdiparams` and `ADDS.pdiparams.info` files needed for prediction, all of which are stored in the `inference/ADDS/` directory\nFor the meaning of each parameter in the above bash command, please refer to [Model Inference Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/en/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)\n### Use predictive engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.png \\\n --config configs/estimation/adds/adds.yaml \\\n --model_file inference/ADDS/ADDS.pdmodel \\\n --params_file inference/ADDS/ADDS.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nAt the end of the inference, the depth map estimated by the model will be saved in pseudo-color by default.\nThe following is a sample picture and the corresponding predicted depth map\uff1a" + }, + { + "comment": "The code includes two images, one for regular RGB image and the other for depth estimation from the paper \"Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation\" by Liu et al.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/estimation/adds.md\":125-132", + "content": "\"image\"\n\"depth\"\n## Reference\n- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8fb27118-de58-4d64-9c32-3cf7c53614bd.json b/docs/doc/8fb27118-de58-4d64-9c32-3cf7c53614bd.json new file mode 100644 index 000000000..0648e8b95 --- /dev/null +++ b/docs/doc/8fb27118-de58-4d64-9c32-3cf7c53614bd.json @@ -0,0 +1,10 @@ +{ + "summary": "The function `is_eval_epoch` determines whether the model should be evaluated at a given epoch based on the provided configs, current epoch, and multigrid training schedule. If the current epoch is equal to the total number of epochs or if there's a non-null multigrid schedule, it checks if the current epoch is a time for evaluation based on the schedule intervals. The function returns True when an evaluation should occur and False otherwise.", + "details": [ + { + "comment": "The function `is_eval_epoch` determines whether the model should be evaluated at a given epoch based on the provided configs, current epoch, and multigrid training schedule. If the current epoch is equal to the total number of epochs or if there's a non-null multigrid schedule, it checks if the current epoch is a time for evaluation based on the schedule intervals. The function returns True when an evaluation should occur and False otherwise.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/interval_helper.py\":0-18", + "content": "def is_eval_epoch(cfg, cur_epoch, total_epochs, multigrid_schedule):\n \"\"\"\n Determine if the model should be evaluated at the current epoch.\n Args:\n cfg (CfgNode): configs. 
Details can be found in\n slowfast/config/defaults.py\n cur_epoch (int): current epoch.\n multigrid_schedule (List): schedule for multigrid training.\n \"\"\"\n if cur_epoch + 1 == total_epochs:\n return True\n if multigrid_schedule is not None:\n prev_epoch = 0\n for s in multigrid_schedule:\n if cur_epoch < s[-1]:\n period = max(\n (s[-1] - prev_epoch) // cfg.MULTIGRID.EVAL_FREQ + 1, 1)\n return (s[-1] - 1 - cur_epoch) % period == 0\n prev_epoch = s[-1]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9096484c-68d0-4d1c-bb82-be7b63d078f7.json b/docs/doc/9096484c-68d0-4d1c-bb82-be7b63d078f7.json new file mode 100644 index 000000000..c6289f791 --- /dev/null +++ b/docs/doc/9096484c-68d0-4d1c-bb82-be7b63d078f7.json @@ -0,0 +1,75 @@ +{ + "summary": "This code introduces the SampleFrames class for PaddleVideo's loader pipelines, supports train and test sampling modes, includes storage backend and file client classes, converts PIL images to numpy arrays with OpenCV, defines the RawFrameDecode pipeline, and provides the SampleAVAFrames class for sampling video frames.", + "details": [ + { + "comment": "This code is importing necessary libraries and registering a class SampleFrames under PaddleVideo's loader pipelines. The class appears to sample frames from video, supporting different reading modes (color/grayscale/unchanged). It uses OpenCV (cv2) as the image processing backend.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":0-34", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nfrom PIL import Image\nfrom ..registry import PIPELINES\nimport os\nimport numpy as np\nimport io\nimport os.path as osp\nfrom abc import ABCMeta, abstractmethod\nimport cv2\nfrom cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED\nimport inspect\nimread_backend = 'cv2'\nimread_flags = {\n 'color': IMREAD_COLOR,\n 'grayscale': IMREAD_GRAYSCALE,\n 'unchanged': IMREAD_UNCHANGED\n}\n@PIPELINES.register()\nclass SampleFrames:" + }, + { + "comment": "The function `__init__` initializes the parameters for sampling frames from a video, including clip length, frame interval, number of clips, temporal jittering options, and out-of-bound handling. The `_get_train_clips` function calculates the clip offsets in training mode by determining the average interval between clips based on the total number of frames. It then generates random base offsets and adds random offsets to create the final clip offsets.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":35-60", + "content": " \"\"\"Sample frames from the video. 
\"\"\"\n def __init__(self,\n clip_len,\n frame_interval=1,\n num_clips=1,\n temporal_jitter=False,\n twice_sample=False,\n out_of_bound_opt='loop',\n test_mode=False):\n self.clip_len = clip_len\n self.frame_interval = frame_interval\n self.num_clips = num_clips\n self.temporal_jitter = temporal_jitter\n self.twice_sample = twice_sample\n self.out_of_bound_opt = out_of_bound_opt\n self.test_mode = test_mode\n assert self.out_of_bound_opt in ['loop', 'repeat_last']\n def _get_train_clips(self, num_frames):\n \"\"\"Get clip offsets in train mode. \"\"\"\n ori_clip_len = self.clip_len * self.frame_interval\n avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips\n if avg_interval > 0:\n base_offsets = np.arange(self.num_clips) * avg_interval\n clip_offsets = base_offsets + np.random.randint(" + }, + { + "comment": "This code calculates clip offsets for video sampling based on the number of frames and other parameters. It handles different scenarios, such as when the number of frames exceeds or equals the original clip length, when average interval is 0, and in test mode. The clip_offsets are returned at the end.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":61-81", + "content": " avg_interval, size=self.num_clips)\n elif num_frames > max(self.num_clips, ori_clip_len):\n clip_offsets = np.sort(\n np.random.randint(\n num_frames - ori_clip_len + 1, size=self.num_clips))\n elif avg_interval == 0:\n ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips\n clip_offsets = np.around(np.arange(self.num_clips) * ratio)\n else:\n clip_offsets = np.zeros((self.num_clips, ), dtype=np.int)\n return clip_offsets\n def _get_test_clips(self, num_frames):\n \"\"\"Get clip offsets in test mode. \"\"\"\n ori_clip_len = self.clip_len * self.frame_interval\n avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)\n if num_frames > ori_clip_len - 1:\n base_offsets = np.arange(self.num_clips) * avg_interval\n clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int)\n if self.twice_sample:\n clip_offsets = np.concatenate([clip_offsets, base_offsets])" + }, + { + "comment": "This code defines a class that samples video clips and loads frames based on different modes, such as testing or training. It takes the total number of frames in a video and returns the corresponding clip offsets and frame indices for loading. The sampling mode, temporal jitter, and out-of-bound options can be specified to customize the sampling process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":82-106", + "content": " else:\n clip_offsets = np.zeros((self.num_clips, ), dtype=np.int)\n return clip_offsets\n def _sample_clips(self, num_frames):\n \"\"\"Choose clip offsets for the video in a given mode. \"\"\"\n if self.test_mode:\n clip_offsets = self._get_test_clips(num_frames)\n else:\n clip_offsets = self._get_train_clips(num_frames)\n return clip_offsets\n def __call__(self, results):\n \"\"\"Perform the SampleFrames loading. 
\"\"\"\n total_frames = results['total_frames']\n clip_offsets = self._sample_clips(total_frames)\n frame_inds = clip_offsets[:, None] + np.arange(\n self.clip_len)[None, :] * self.frame_interval\n frame_inds = np.concatenate(frame_inds)\n if self.temporal_jitter:\n perframe_offsets = np.random.randint(\n self.frame_interval, size=len(frame_inds))\n frame_inds += perframe_offsets\n frame_inds = frame_inds.reshape((-1, self.clip_len))\n if self.out_of_bound_opt == 'loop':" + }, + { + "comment": "Code handles out-of-bound frame indices by wrapping them around, repeating the last frame, or throwing an error. It then updates results with frame indices, clip length, and number of clips.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":107-128", + "content": " frame_inds = np.mod(frame_inds, total_frames)\n elif self.out_of_bound_opt == 'repeat_last':\n safe_inds = frame_inds < total_frames\n unsafe_inds = 1 - safe_inds\n last_ind = np.max(safe_inds * frame_inds, axis=1)\n new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)\n frame_inds = new_inds\n else:\n raise ValueError('Illegal out_of_bound option.')\n start_index = results['start_index']\n frame_inds = np.concatenate(frame_inds) + start_index\n results['frame_inds'] = frame_inds.astype(np.int)\n results['clip_len'] = self.clip_len\n results['frame_interval'] = self.frame_interval\n results['num_clips'] = self.num_clips\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'clip_len={self.clip_len}, '\n f'frame_interval={self.frame_interval}, '\n f'num_clips={self.num_clips}, '" + }, + { + "comment": "This code defines three classes: `BaseStorageBackend` (abstract class), `HardDiskBackend`, and a generic file client called `FileClient`. The `BaseStorageBackend` is an abstract class that provides two methods, `get()` and `get_text()`, which are expected to be implemented by subclasses. The `HardDiskBackend` implements these methods for handling files stored on the hard disk. Finally, the `FileClient` serves as a generic file client to access files in different backends.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":129-165", + "content": " f'temporal_jitter={self.temporal_jitter}, '\n f'twice_sample={self.twice_sample}, '\n f'out_of_bound_opt={self.out_of_bound_opt}, '\n f'test_mode={self.test_mode})')\n return repr_str\nclass BaseStorageBackend(metaclass=ABCMeta):\n \"\"\"Abstract class of storage backends. \"\"\"\n @abstractmethod\n def get(self, filepath):\n pass\n @abstractmethod\n def get_text(self, filepath):\n pass\nclass HardDiskBackend(BaseStorageBackend):\n \"\"\"Raw hard disks storage backend.\"\"\"\n def get(self, filepath):\n filepath = str(filepath)\n with open(filepath, 'rb') as f:\n value_buf = f.read()\n return value_buf\n def get_text(self, filepath):\n filepath = str(filepath)\n with open(filepath, 'r') as f:\n value_buf = f.read()\n return value_buf\nclass FileClient:\n \"\"\"A general file client to access files in different backend. \"\"\"\n _backends = {\n 'disk': HardDiskBackend," + }, + { + "comment": "This code defines a class with an initializer and a class method for registering backends. The initializer takes a backend argument, checks if it is supported, and initializes the client object with that backend. If the name or backend type is incorrect, it raises TypeError. 
The _register_backend method allows for backend registration by name, checking if it is a string and if the backend is a subclass of BaseStorageBackend. Raises KeyError if already registered.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":166-189", + "content": " }\n def __init__(self, backend='disk', **kwargs):\n if backend not in self._backends:\n raise ValueError(\n f'Backend {backend} is not supported. Currently supported ones'\n f' are {list(self._backends.keys())}')\n self.backend = backend\n self.client = self._backends[backend](**kwargs)\n @classmethod\n def _register_backend(cls, name, backend, force=False):\n if not isinstance(name, str):\n raise TypeError('the backend name should be a string, '\n f'but got {type(name)}')\n if not inspect.isclass(backend):\n raise TypeError(\n f'backend should be a class but got {type(backend)}')\n if not issubclass(backend, BaseStorageBackend):\n raise TypeError(\n f'backend {backend} is not a subclass of BaseStorageBackend')\n if not force and name in cls._backends:\n raise KeyError(\n f'{name} is already registered as a storage backend, '" + }, + { + "comment": "This code defines a class called FileClient, which handles file operations like registration and retrieval. It also registers a pipeline named RawFrameDecode for loading and decoding frames using specified backends for I/O and decoding. The class has an _pillow2array method to convert PIL image to numpy array in specific channel order.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":190-224", + "content": " 'add \"force=True\" if you want to override it')\n cls._backends[name] = backend\n @classmethod\n def register_backend(cls, name, backend=None, force=False):\n \"\"\"Register a backend to FileClient. \"\"\"\n if backend is not None:\n cls._register_backend(name, backend, force=force)\n return\n def _register(backend_cls):\n cls._register_backend(name, backend_cls, force=force)\n return backend_cls\n return _register\n def get(self, filepath):\n return self.client.get(filepath)\n def get_text(self, filepath):\n return self.client.get_text(filepath)\n@PIPELINES.register()\nclass RawFrameDecode:\n \"\"\"Load and decode frames with given indices. \"\"\"\n def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs):\n self.io_backend = io_backend\n self.decoding_backend = decoding_backend\n self.kwargs = kwargs\n self.file_client = None\n def _pillow2array(self,img, flag='color', channel_order='bgr'):" + }, + { + "comment": "This code converts a Pillow image to a numpy array. It checks the channel order and flag, then either keeps the array unchanged or converts it from RGB to BGR if necessary. If the image mode is not RGB, it converts it to RGB first using convert('RGB'). If the mode is LA, a random color is used for the canvas to avoid shadowing black objects in the foreground.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":225-246", + "content": " \"\"\"Convert a pillow image to numpy array. 
\"\"\"\n channel_order = channel_order.lower()\n if channel_order not in ['rgb', 'bgr']:\n raise ValueError('channel order must be either \"rgb\" or \"bgr\"')\n if flag == 'unchanged':\n array = np.array(img)\n if array.ndim >= 3 and array.shape[2] >= 3: # color image\n array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR\n else:\n # If the image mode is not 'RGB', convert it to 'RGB' first.\n if img.mode != 'RGB':\n if img.mode != 'LA':\n # Most formats except 'LA' can be directly converted to RGB\n img = img.convert('RGB')\n else:\n # When the mode is 'LA', the default conversion will fill in\n # the canvas with black, which sometimes shadows black objects\n # in the foreground.\n #\n # Therefore, a random color (124, 117, 104) is used for canvas" + }, + { + "comment": "The code reads an image from bytes and converts it into a numpy array based on the provided flag (color or grayscale) and channel order. It first checks if the flag is valid, then decodes the image using OpenCV's imdecode function. If the flag is color and channel order is rgb, it returns the image as is. For other combinations, it converts the image to RGB or grayscale before returning the numpy array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":247-269", + "content": " img_rgba = img.convert('RGBA')\n img = Image.new('RGB', img_rgba.size, (124, 117, 104))\n img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha\n if flag == 'color':\n array = np.array(img)\n if channel_order != 'rgb':\n array = array[:, :, ::-1] # RGB to BGR\n elif flag == 'grayscale':\n img = img.convert('L')\n array = np.array(img)\n else:\n raise ValueError(\n 'flag must be \"color\", \"grayscale\" or \"unchanged\", '\n f'but got {flag}')\n return array\n def _imfrombytes(self,content, flag='color', channel_order='bgr'):#, backend=None):\n \"\"\"Read an image from bytes. \"\"\"\n img_np = np.frombuffer(content, np.uint8)\n flag = imread_flags[flag] if isinstance(flag, str) else flag\n img = cv2.imdecode(img_np, flag)\n if flag == IMREAD_COLOR and channel_order == 'rgb':" + }, + { + "comment": "This code defines a pipeline for decoding frames using the RawFrameDecode transform. It reads image files from the specified directory and suffix, handles different frame indices, and utilizes a file client to retrieve images in binary format. The cv2.cvtColor function is used to convert the color of images from BGR to RGB. 
The code also checks if the frame indices have the correct dimensions and squeezes them if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":270-300", + "content": " cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)\n return img\n def __call__(self, results):\n \"\"\"Perform the ``RawFrameDecode`` to pick frames given indices.\n Args:\n results (dict): The resulting dict to be modified and passed\n to the next transform in pipeline.\n \"\"\"\n # mmcv.use_backend(self.decoding_backend)\n directory = results['frame_dir']\n suffix = results['suffix']\n #modality = results['modality']\n if self.file_client is None:\n self.file_client = FileClient(self.io_backend, **self.kwargs)\n imgs = list()\n if results['frame_inds'].ndim != 1:\n results['frame_inds'] = np.squeeze(results['frame_inds'])\n offset = results.get('offset', 0)\n for frame_idx in results['frame_inds']:\n frame_idx += offset\n filepath = osp.join(directory, suffix.format(frame_idx))\n img_bytes = self.file_client.get(filepath) #\u4ee5\u4e8c\u8fdb\u5236\u65b9\u5f0f\u8bfb\u53d6\u56fe\u7247\n # Get frame with channel order RGB directly." + }, + { + "comment": "Function applies image processing and resizing to input, appends frames to a list, and scales gt_bboxes and proposals accordingly. It then returns the results. The __repr__ function provides a string representation of the object's class and arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":302-325", + "content": " cur_frame = self._imfrombytes(img_bytes, channel_order='rgb')\n imgs.append(cur_frame)\n results['imgs'] = imgs\n results['original_shape'] = imgs[0].shape[:2]\n results['img_shape'] = imgs[0].shape[:2]\n # we resize the gt_bboxes and proposals to their real scale\n h, w = results['img_shape']\n scale_factor = np.array([w, h, w, h])\n if 'gt_bboxes' in results:\n gt_bboxes = results['gt_bboxes']\n gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32)\n results['gt_bboxes'] = gt_bboxes_new\n if 'proposals' in results and results['proposals'] is not None:\n proposals = results['proposals']\n proposals = (proposals * scale_factor).astype(np.float32)\n results['proposals'] = proposals\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'io_backend={self.io_backend}, '\n f'decoding_backend={self.decoding_backend})')" + }, + { + "comment": "The code defines a class called SampleAVAFrames, which inherits from SampleFrames. It takes clip length, frame interval, and test mode as arguments during initialization. The _get_clips method calculates the start and end indices for a given center index, taking into account skip offsets and shot information. 
The __call__ method retrieves fps, timestamp, timestamp_start, and shot_info from the results dictionary, and then calculates the center index to sample video frames around that index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":326-353", + "content": " return repr_str\n@PIPELINES.register()\nclass SampleAVAFrames(SampleFrames):\n def __init__(self, clip_len, frame_interval=2, test_mode=False):\n super().__init__(clip_len, frame_interval, test_mode=test_mode)\n def _get_clips(self, center_index, skip_offsets, shot_info):\n start = center_index - (self.clip_len // 2) * self.frame_interval\n end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval\n frame_inds = list(range(start, end, self.frame_interval))\n frame_inds = frame_inds + skip_offsets\n frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1)\n return frame_inds\n def __call__(self, results):\n fps = results['fps']\n timestamp = results['timestamp']\n timestamp_start = results['timestamp_start']\n shot_info = results['shot_info']\n #delta=(timestamp - timestamp_start) \u4e3a\u8be5\u5e27\u8ddd\u79bb15min\u89c6\u9891\u5f00\u5934\u6709\u51e0\u79d2\n #center_index=fps*delta\u4e3a\u8be5\u5e27\u8ddd\u79bb15min\u89c6\u9891\u5f00\u5934\u6709\u51e0\u5e27\n #center_index+1\u662f\u4e3a\u4e86\u907f\u514d\u540e\u7eed\u91c7\u6837\u65f6\u51fa\u73b0\u8d1f\u6570? \n #\u540e\u7eed\u9700\u8981\u4ee5center_index\u4e3a\u4e2d\u5fc3\u524d\u540e\u91c7\u6837\u89c6\u9891\u5e27\u7247\u6bb5" + }, + { + "comment": "This function samples a video clip by calculating the center index and generating random skip offsets to select frames. It returns frame indices, clip length, frame interval, number of clips, and crop quadruple in a dictionary format for further processing. The `__repr__` method provides a concise string representation of the object's attributes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/sample_ava.py\":354-373", + "content": " center_index = fps * (timestamp - timestamp_start) + 1\n skip_offsets = np.random.randint(\n -self.frame_interval // 2, (self.frame_interval + 1) // 2,\n size=self.clip_len)\n frame_inds = self._get_clips(center_index, skip_offsets, shot_info)\n results['frame_inds'] = np.array(frame_inds, dtype=np.int)\n results['clip_len'] = self.clip_len\n results['frame_interval'] = self.frame_interval\n results['num_clips'] = 1\n results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32)\n return results\n def __repr__(self):\n repr_str = (f'{self.__class__.__name__}('\n f'clip_len={self.clip_len}, '\n f'frame_interval={self.frame_interval}, '\n f'test_mode={self.test_mode})')\n return repr_str" + } + ] +} \ No newline at end of file diff --git a/docs/doc/909a3d87-fe65-4b1b-ba04-656643af5c4a.json b/docs/doc/909a3d87-fe65-4b1b-ba04-656643af5c4a.json new file mode 100644 index 000000000..bd93c720e --- /dev/null +++ b/docs/doc/909a3d87-fe65-4b1b-ba04-656643af5c4a.json @@ -0,0 +1,25 @@ +{ + "summary": "This Python file utilizes PaddleVideo and PaddlePaddle library to construct video pipelines, defining functions for dataset, pipeline, and dataloader creation. It also includes signal handlers to terminate child processes upon receiving specific signals.", + "details": [ + { + "comment": "This code is a Python file for building video pipeline in PaddleVideo, which uses PaddlePaddle library. 
It imports necessary modules and defines a function to build the pipeline according to the provided configuration. The logger is used for logging purposes, and numpy is imported for numerical operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py\":0-32", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport signal\nimport os\nimport paddle\nfrom paddle.io import DataLoader, DistributedBatchSampler\nfrom .registry import DATASETS, PIPELINES\nfrom ..utils.build_utils import build\nfrom .pipelines.compose import Compose\nfrom paddlevideo.utils import get_logger\nimport numpy as np\nlogger = get_logger(\"paddlevideo\")\ndef build_pipeline(cfg):\n \"\"\"Build pipeline.\n Args:\n cfg (dict): root config dict.\n \"\"\"" + }, + { + "comment": "This code defines functions to build a dataset, batch pipeline, and dataloader for PaddleVideo's Video Quality Assessment application. The build_dataset function constructs the dataset using cfg config dictionary. The build_batch_pipeline function builds the batch pipeline. Lastly, the build_dataloader function creates a Paddle Dataloader using the constructed dataset and other parameters like batch size, num_workers, etc.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py\":33-73", + "content": " return Compose(cfg)\ndef build_dataset(cfg):\n \"\"\"Build dataset.\n Args:\n cfg (dict): root config dict.\n Returns:\n dataset: dataset.\n \"\"\"\n #XXX: ugly code here!\n cfg_dataset, cfg_pipeline = cfg\n cfg_dataset.pipeline = build_pipeline(cfg_pipeline)\n dataset = build(cfg_dataset, DATASETS, key=\"format\")\n return dataset\ndef build_batch_pipeline(cfg):\n \"\"\"build batch pipeline\"\"\"\n batch_pipeline = build(cfg, PIPELINES)\n return batch_pipeline\ndef build_dataloader(dataset,\n batch_size,\n num_workers,\n places,\n shuffle=True,\n drop_last=True,\n multigrid=False,\n collate_fn_cfg=None,\n **kwargs):\n \"\"\"Build Paddle Dataloader.\n XXX explain how the dataloader work!\n Args:\n dataset (paddle.dataset): A PaddlePaddle dataset object.\n batch_size (int): batch size on single card.\n num_worker (int): num_worker" + }, + { + "comment": "The code creates a DistributedBatchSampler for dataset with optional shuffle and drop_last parameters, and defines a mix_collate_fn function that applies a predefined collate_fn_cfg to batch data and returns it in a specific format using build_batch_pipeline.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py\":74-96", + "content": " shuffle(bool): whether to shuffle the data at every epoch.\n \"\"\"\n sampler = DistributedBatchSampler(dataset,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)\n #NOTE(shipping): when switch the mix operator on, such as: mixup, 
cutmix.\n # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to:\n # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose.\n def mix_collate_fn(batch):\n \"\"\"mix collate fn\"\"\"\n pipeline = build_batch_pipeline(collate_fn_cfg)\n batch = pipeline(batch)\n slots = []\n for items in batch:\n for i, item in enumerate(items):\n if len(slots) < len(items):\n slots.append([item])\n else:\n slots[i].append(item)\n return [np.stack(slot, axis=0) for slot in slots]" + }, + { + "comment": "The code defines a function that returns a DataLoader object. If collate_fn_cfg is not None, it creates a mix_collate_fn and assigns it to the collate_fn variable. The returned DataLoader has its collate_fn set according to the value of collate_fn_cfg. The code also sets up signal handlers for SIGINT and SIGTERM, calling the term_mp function on receipt of either signal. The term_mp function kills all child processes in the current process group.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py\":98-125", + "content": " #if collate_fn_cfg is not None:\n #ugly code here. collate_fn is mix op config\n # collate_fn = mix_collate_fn(collate_fn_cfg)\n data_loader = DataLoader(\n dataset,\n batch_sampler=sampler,\n places=places,\n num_workers=num_workers,\n collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,\n return_list=True,\n **kwargs)\n return data_loader\ndef term_mp(sig_num, frame):\n \"\"\" kill all child processes\n \"\"\"\n pid = os.getpid()\n pgid = os.getpgid(os.getpid())\n logger.info(\"main proc {} exit, kill process group \" \"{}\".format(pid, pgid))\n os.killpg(pgid, signal.SIGKILL)\n return\nsignal.signal(signal.SIGINT, term_mp)\nsignal.signal(signal.SIGTERM, term_mp)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/911f800a-47b0-41b0-8261-06017d9187f6.json b/docs/doc/911f800a-47b0-41b0-8261-06017d9187f6.json new file mode 100644 index 000000000..f0d6bf54f --- /dev/null +++ b/docs/doc/911f800a-47b0-41b0-8261-06017d9187f6.json @@ -0,0 +1,30 @@ +{ + "summary": "The code imports libraries and sets up an argument parser for PaddleVideo's Ma-Net application. It defines a function, configures model parameters, and trains video object detection tasks. The `--TEST_CHECKPOINT` and `--TEST_MODE` arguments are initialized with default values, and the training epoch count is calculated based on batch size and total steps.", + "details": [ + { + "comment": "This code is importing necessary libraries and defining a function. 
It also sets up an argument parser, and provides default values for various parameters including the root directory, experiment name, save result directories, number of workers, KNNs, and pre-trained model path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/config.py\":0-31", + "content": "import paddle\nimport argparse\nimport os\nimport sys\nimport cv2\nimport time\ndef str2bool(v):\n if isinstance(v, bool):\n return v\n if v.lower() in ('yes', 'true', 't', 'y', '1'):\n return True\n elif v.lower() in ('no', 'false', 'f', 'n', '0'):\n return False\n else:\n raise argparse.ArgumentTypeError('Boolean value expected.')\nparser = argparse.ArgumentParser(description='intvos config')\nparser.add_argument('--ROOT_DIR',\n type=str,\n default=os.path.abspath(\n os.path.join(os.path.dirname(\"__file__\"))))\nparser.add_argument('--EXP_NAME', type=str, default='deeplabv3+coco')\nparser.add_argument('--SAVE_RESULT_DIR', type=str, default='../afs/result/')\nparser.add_argument('--SAVE_VOS_RESULT_DIR', type=str, default='')\nparser.add_argument('--NUM_WORKER', type=int, default=4)\nparser.add_argument('--KNNS', type=int, default=1)\nparser.add_argument('--PRETRAINED_MODEL',\n type=str,\n default='./model_best.pth.tar')" + }, + { + "comment": "This code snippet is from the 'config.py' file in the PaddleVideo/applications/Ma-Net directory, and it defines command line arguments for the application. It sets default values for parameters related to result storage location, data configuration, and model configuration. These arguments can be overridden when running the application by specifying them on the command line.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/config.py\":32-52", + "content": "parser.add_argument(\n '--RESULT_ROOT',\n type=str,\n default=os.path.join('../afs/vos_result/result_total_80000'))\n######DATA_CONFIG\nparser.add_argument('--DATA_NAME', type=str, default='COCO2017')\nparser.add_argument('--DATA_AUG', type=str2bool, default=True)\nparser.add_argument('--DATA_WORKERS', type=int, default=4)\nparser.add_argument('--DATA_RESCALE', type=int, default=416)\nparser.add_argument('--DATA_RANDOMCROP', type=int, default=416)\nparser.add_argument('--DATA_RANDOMROTATION', type=int, default=0)\nparser.add_argument('--DATA_RANDOM_H', type=int, default=10)\nparser.add_argument('--DATA_RANDOM_S', type=int, default=10)\nparser.add_argument('--DATA_RANDOM_V', type=int, default=10)\nparser.add_argument('--DATA_RANDOMFLIP', type=float, default=0.5)\nparser.add_argument('--DATA_ROOT', type=str, default='../data/DAVIS')\n######MODEL_CONFIG\nparser.add_argument('--MODEL_NAME', type=str, default='deeplabv3plus')\nparser.add_argument('--MODEL_BACKBONE', type=str, default='res101_atrous')\nparser.add_argument('--MODEL_OUTPUT_STRIDE', type=int, default=16)" + }, + { + "comment": "This code snippet is from the \"config.py\" file in PaddleVideo's Ma-Net application, and it sets various model parameters like output dimension, shortcut dimensions, kernel size, number of classes, embedding dimensions, downsampling method, selection percentage, and training parameters such as learning rate, gamma, momentum, weight decay, and power. 
These parameters are used to configure and train the Ma-Net model for video object detection tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/config.py\":53-69", + "content": "parser.add_argument('--MODEL_ASPP_OUTDIM', type=int, default=256)\nparser.add_argument('--MODEL_SHORTCUT_DIM', type=int, default=48)\nparser.add_argument('--MODEL_SHORTCUT_KERNEL', type=int, default=1)\nparser.add_argument('--MODEL_NUM_CLASSES', type=int, default=21)\nparser.add_argument('--MODEL_SEMANTIC_EMBEDDING_DIM', type=int, default=100)\nparser.add_argument('--MODEL_HEAD_EMBEDDING_DIM', type=int, default=256)\nparser.add_argument('--MODEL_LOCAL_DOWNSAMPLE', type=str2bool, default=True)\nparser.add_argument('--MODEL_MAX_LOCAL_DISTANCE', type=int, default=12)\nparser.add_argument('--MODEL_SELECT_PERCENT', type=float, default=0.8)\nparser.add_argument('--MODEL_USEIntSeg', type=str2bool, default=False)\n######TRAIN_CONFIG\nparser.add_argument('--TRAIN_LR', type=float, default=0.0007)\nparser.add_argument('--TRAIN_LR_GAMMA', type=float, default=0.1)\nparser.add_argument('--TRAIN_MOMENTUM', type=float, default=0.9)\nparser.add_argument('--TRAIN_WEIGHT_DECAY', type=float, default=0.00004)\nparser.add_argument('--TRAIN_POWER', type=float, default=0.9)" + }, + { + "comment": "This code snippet is part of the configuration file for the Ma-Net application in PaddleVideo. It includes various arguments and their default values for training the model, such as batch size, shuffling, gradient norm, number of epochs, total steps, loss lambda, logging settings, BN momentum, top K percent pixels, hard mining step, LR step size, and resuming from a specific directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/config.py\":70-87", + "content": "parser.add_argument('--TRAIN_BATCH_SIZE', type=int, default=2)\nparser.add_argument('--TRAIN_SHUFFLE', type=str2bool, default=True)\nparser.add_argument('--TRAIN_CLIP_GRAD_NORM', type=float, default=5.)\nparser.add_argument('--TRAIN_MINEPOCH', type=int, default=9)\nparser.add_argument('--TRAIN_TOTAL_STEPS', type=int, default=101000)\nparser.add_argument('--TRAIN_LOSS_LAMBDA', type=int, default=0)\nparser.add_argument('--TRAIN_TBLOG', type=str2bool, default=False)\nparser.add_argument('--TRAIN_BN_MOM', type=float,\n default=0.9997) # fixed. 
difs between paddle and torch.\nparser.add_argument('--TRAIN_TOP_K_PERCENT_PIXELS', type=float, default=0.15)\nparser.add_argument('--TRAIN_HARD_MINING_STEP', type=int, default=50000)\nparser.add_argument('--TRAIN_LR_STEPSIZE', type=int, default=2000)\nparser.add_argument('--TRAIN_INTER_USE_TRUE_RESULT',\n type=str2bool,\n default=True)\nparser.add_argument('--TRAIN_RESUME_DIR', type=str, default='')\nparser.add_argument('--LOG_DIR', type=str, default=os.path.join('./log'))" + }, + { + "comment": "This code snippet initializes the `--TEST_CHECKPOINT` and `--TEST_MODE` arguments using default values, then calculates the number of training epochs based on the batch size and the total number of steps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/config.py\":89-95", + "content": "parser.add_argument('--TEST_CHECKPOINT',\n type=str,\n default='save_step_100000.pth')\nparser.add_argument('--TEST_MODE', type=str2bool, default=False)\ncfg = parser.parse_args()\ncfg.TRAIN_EPOCHS = int(200000 * cfg.TRAIN_BATCH_SIZE / 60.)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/916a4e33-c5fa-4891-a1bc-0700e02e5986.json b/docs/doc/916a4e33-c5fa-4891-a1bc-0700e02e5986.json new file mode 100644 index 000000000..f1e26bc14 --- /dev/null +++ b/docs/doc/916a4e33-c5fa-4891-a1bc-0700e02e5986.json @@ -0,0 +1,15 @@ +{ + "summary": "This introduction explains how to install PaddlePaddle and PaddleVideo, their requirements like Python 3.7 and CUDA 10.1, enabling distribution feature, setting shared memory in Docker, cloning PaddleVideo repo, upgrading pip and requirements, installing ppvideo package, and usage example specifying model, disabling GPU, and input video file.", + "details": [ + { + "comment": "Introduction: Describes how to install PaddlePaddle, PaddleVideo, and their requirements.\nInstall PaddlePaddle: Requires Python 3.7, CUDA 10.1, CUDNN7.6.4 nccl2.1.2 and supports GPU training only. Follow the instructions on the website if PaddlePaddle on the device is lower than v2.0.\nInstallation commands: Use pip3 to install paddlepaddle-gpu or compile from source code, following instructions on the website.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/install.md\":0-40", + "content": "[\u7b80\u4f53\u4e2d\u6587](../zh-CN/install.md) | English\n# Installation\n---\n- [Introduction](#Introduction)\n- [Install PaddlePaddle](#Install-PaddlePaddle)\n- [Install PaddleVideo](#Install-PaddleVideo)\n## Introduction\nThis document introduces how to install PaddlePaddle\u3001PaddleVideo and its requirements.\n## Install PaddlePaddle\nPython 3.7, CUDA 10.1, CUDNN7.6.4 nccl2.1.2 and later version are required at first, For now, PaddleVideo only support training on the GPU device. Please follow the instructions in the [Installation](http://www.paddlepaddle.org.cn/install/quick) if the PaddlePaddle on the device is lower than v2.0\n**Install PaddlePaddle**\n```bash\npip3 install paddlepaddle-gpu --upgrade\n```\nor compile from source code, please refer to [Installation](http://www.paddlepaddle.org.cn/install/quick).\nVerify Installation\n```python\nimport paddle\npaddle.utils.run_check()\n```\nCheck PaddlePaddle version\uff1a\n```bash\npython3 -c \"import paddle; print(paddle.__version__)\"\n```\nNote:\n- Make sure the compiled version is later than PaddlePaddle2.0." 
+ }, + { + "comment": "WITH_DISTRIBUTE=ON: Enables the distribution feature in PaddleVideo, refer to Instruction for more details.\nDocker shm_size: Set --shm_size=32g when creating a docker container for enough shared memory.\nClone PaddleVideo: Navigate to desired path and clone the repository from GitHub.\nRequirements upgrade: Ensure pip is up-to-date before installing requirements.txt.\nInstall python package: Use pip3.7 to install specific version of ppvideo package.\nUsage example: Specify model, disable GPU usage, and input video file when running ppvideo script.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/install.md\":41-71", + "content": "- Indicate **WITH_DISTRIBUTE=ON** when compiling, Please refer to [Instruction](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3) for more details.\n- When running in the docker, in order to ensure that the container has enough shared memory for data read acceleration of Paddle, please set the parameter `--shm_size=32g` at creating a docker container, if conditions permit, you can set it to a larger value.\n---\n## Install PaddleVideo\n**Clone PaddleVideo:**\n```bash\ncd path_to_clone_PaddleVideo\ngit clone https://github.com/PaddlePaddle/PaddleVideo.git\n```\n**Install requirements**\n```bash\npython3.7 -m pip install --upgrade pip\npip3.7 install --upgrade -r requirements.txt\n```\n**Install python package**\n```bash\npip3.7 install ppvideo==2.3.0\n```\nuse scripts:\n```bash\nppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi'\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/916b43f1-7297-4ac5-9be5-e1b9bc622090.json b/docs/doc/916b43f1-7297-4ac5-9be5-e1b9bc622090.json new file mode 100644 index 000000000..602f5c9a8 --- /dev/null +++ b/docs/doc/916b43f1-7297-4ac5-9be5-e1b9bc622090.json @@ -0,0 +1,40 @@ +{ + "summary": "The code defines an AttentionLSTM class for a video tagging model, extending ModelBase. It initializes properties, retrieves configurations and dimensions, builds the LSTM attention model, applies fully connected layers, and uses an optimizer with piecewise learning rate decay and L2 regularization.", + "details": [ + { + "comment": "The code is importing necessary modules and defining a class called AttentionLSTM. It extends the ModelBase class and has an __init__ method to initialize its properties such as name, configuration, mode, and is_videotag flag. The get_config method is also defined for retrieving configuration from a file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom ..model import ModelBase\nfrom .lstm_attention import LSTMAttentionModel\nimport logging\nimport paddle\nimport paddle.static as static\nlogger = logging.getLogger(__name__)\n__all__ = [\"AttentionLSTM\"]\nclass AttentionLSTM(ModelBase):\n def __init__(self, name, cfg, mode='train', is_videotag=False):\n super(AttentionLSTM, self).__init__(name, cfg, mode)\n self.is_videotag = is_videotag\n self.get_config()" + }, + { + "comment": "The code defines a model's configuration method, retrieving feature names, dimensions, number of classes, embedding size, LSTM size, and drop rate. It also gets mode-specific configurations such as batch size, number of GPUs, learning rate, weight decay, total training samples, and epochs for learning rate decay.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py\":32-53", + "content": " def get_config(self):\n # get model configs\n self.feature_names = self.cfg.MODEL.feature_names\n self.feature_dims = self.cfg.MODEL.feature_dims\n self.num_classes = self.cfg.MODEL.num_classes\n self.embedding_size = self.cfg.MODEL.embedding_size\n self.lstm_size = self.cfg.MODEL.lstm_size\n self.drop_rate = self.cfg.MODEL.drop_rate\n # get mode configs\n self.batch_size = self.get_config_from_sec(self.mode, 'batch_size', 1)\n self.num_gpus = self.get_config_from_sec(self.mode, 'num_gpus', 1)\n if self.mode == 'train':\n self.learning_rate = self.get_config_from_sec(\n 'train', 'learning_rate', 1e-3)\n self.weight_decay = self.get_config_from_sec(\n 'train', 'weight_decay', 8e-4)\n self.num_samples = self.get_config_from_sec('train', 'num_samples',\n 5000000)\n self.decay_epochs = self.get_config_from_sec(\n 'train', 'decay_epochs', [5])" + }, + { + "comment": "The code initializes feature and label inputs for the model, depending on the mode. It also builds a dataloader if use_dataloader is True, but not recommended in infer mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py\":54-75", + "content": " self.decay_gamma = self.get_config_from_sec('train', 'decay_gamma',\n 0.1)\n def build_input(self, use_dataloader):\n self.feature_input = []\n for name, dim in zip(self.feature_names, self.feature_dims):\n self.feature_input.append(\n static.data(shape=[None, dim],\n lod_level=1,\n dtype='float32',\n name=name))\n if self.mode != 'infer':\n self.label_input = static.data(shape=[None, self.num_classes],\n dtype='float32',\n name='label')\n else:\n self.label_input = None\n if use_dataloader:\n assert self.mode != 'infer', \\\n 'dataloader is not recommendated when infer, please set use_dataloader to be false.'\n self.dataloader = paddle.io.DataLoader.from_generator(\n feed_list=self.feature_input + [self.label_input]," + }, + { + "comment": "This code defines a LSTM attention model with multiple input features. 
It concatenates output of each feature, applies fully connected layers, and returns the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py\":76-102", + "content": " capacity=8,\n iterable=True)\n def build_model(self):\n att_outs = []\n for i, (input_dim,\n feature) in enumerate(zip(self.feature_dims,\n self.feature_input)):\n att = LSTMAttentionModel(input_dim, self.embedding_size,\n self.lstm_size, self.drop_rate)\n att_out = att.forward(feature, is_training=(self.mode == 'train'))\n att_outs.append(att_out)\n if len(att_outs) > 1:\n out = paddle.concat(x=att_outs, axis=1)\n else:\n out = att_outs[0] # video only, without audio in videoTag\n fc1 = static.nn.fc(\n x=out,\n size=8192,\n activation='relu',\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)),\n name='fc1')\n fc2 = static.nn.fc(\n x=fc1," + }, + { + "comment": "This code defines an attention LSTM model for video tagging. It uses a tanh activation function, L2 decay regularizer, and normal initializer for the fully connected layers. The logit layer applies sigmoid activation to output probabilities for each class. The optimizer function sets up a learning rate schedule using RMSProp optimizer with decay epochs and boundaries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py\":103-124", + "content": " size=4096,\n activation='tanh',\n bias_attr=paddle.ParamAttr(\n regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)),\n name='fc2')\n self.logit = static.nn.fc(x=fc2, size=self.num_classes, activation=None, \\\n bias_attr=paddle.ParamAttr(regularizer=paddle.regularizer.L2Decay(coeff=0.0),\n initializer=paddle.nn.initializer.Normal(std=0.0)), name='output')\n self.output = paddle.nn.functional.sigmoid(self.logit)\n def optimizer(self):\n assert self.mode == 'train', \"optimizer only can be get in train mode\"\n values = [\n self.learning_rate * (self.decay_gamma**i)\n for i in range(len(self.decay_epochs) + 1)\n ]\n iter_per_epoch = self.num_samples / self.batch_size\n boundaries = [e * iter_per_epoch for e in self.decay_epochs]\n return paddle.optimizer.RMSProp(" + }, + { + "comment": "This code defines a model with an LSTM layer and attention mechanism. 
It uses piecewise learning rate decay, L2 weight decay regularization, calculates binary cross-entropy loss for classification tasks, and supports both training, validation, and inference modes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py\":125-150", + "content": " learning_rate=paddle.optimizer.lr.PiecewiseDecay(values=values,\n boundaries=boundaries),\n centered=True,\n weight_decay=paddle.regularizer.L2Decay(coeff=self.weight_decay))\n def loss(self):\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n cost = paddle.nn.functional.binary_cross_entropy(\n input=self.logit, label=self.label_input, reduction=None)\n cost = paddle.sum(x=cost, axis=-1)\n sum_cost = paddle.sum(x=cost)\n self.loss_ = paddle.scale(sum_cost,\n scale=self.num_gpus,\n bias_after_scale=False)\n return self.loss_\n def outputs(self):\n return [self.output, self.logit]\n def feeds(self):\n return self.feature_input if self.mode == 'infer' else self.feature_input + [\n self.label_input\n ]\n def fetches(self):\n if self.mode == 'train' or self.mode == 'valid':" + }, + { + "comment": "This code defines a class with three methods. The first method, `fetch_list()`, returns the fetch list for different modes ('train', 'test', or 'infer'). In 'train' mode, it calculates losses and includes them in the fetch list. In 'test' mode, it does the same. In 'infer' mode, only the output is included in the fetch list. If an unrecognized mode is provided, a `NotImplementedError` is raised. The `weights_info()` method returns no information as it is not implemented yet. Lastly, the `load_pretrain_params()` method loads pretrained weights from a given file, excluding any \"fc_0\" layer parameters, and logs a message confirming this action.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py\":151-179", + "content": " losses = self.loss()\n fetch_list = [losses, self.output, self.label_input]\n elif self.mode == 'test':\n losses = self.loss()\n fetch_list = [losses, self.output, self.label_input]\n elif self.mode == 'infer':\n fetch_list = [self.output]\n else:\n raise NotImplementedError('mode {} not implemented'.format(\n self.mode))\n return fetch_list\n def weights_info(self):\n return None, None\n def load_pretrain_params(self, exe, pretrain, prog):\n logger.info(\n \"Load pretrain weights from {}, exclude fc layer.\".format(pretrain))\n state_dict = paddle.static.load_program_state(pretrain)\n dict_keys = list(state_dict.keys())\n for name in dict_keys:\n if \"fc_0\" in name:\n del state_dict[name]\n logger.info(\n 'Delete {} from pretrained parameters. Do not load it'.\n format(name))\n paddle.static.set_program_state(prog, state_dict)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9184c97c-2073-483f-84da-9aff0d4e4d06.json b/docs/doc/9184c97c-2073-483f-84da-9aff0d4e4d06.json new file mode 100644 index 000000000..137ecfc90 --- /dev/null +++ b/docs/doc/9184c97c-2073-483f-84da-9aff0d4e4d06.json @@ -0,0 +1,25 @@ +{ + "summary": "This Python code implements a PaddlePaddle neural network head for classification tasks using ppTSN Head, initializing the base class and applying dropout regularization with an FC layer. The init_weights function sets the FC layer's initial weights.", + "details": [ + { + "comment": "This code is the Python implementation of ppTSN Head, a classification model head used in PaddleVideo. 
The class has the number of classes and input channels as arguments. It inherits from BaseHead and is registered to the HEADS registry using @HEADS.register(). The code also imports necessary libraries and functions for its operations such as Linear, AdaptiveAvgPool2D, Dropout, ParamAttr, L2Decay, paddle.nn, and PaddleVideo's base and weight_init modules.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsn_head.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom paddle.regularizer import L2Decay\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass ppTSNHead(BaseHead):\n \"\"\"ppTSN Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature." + }, + { + "comment": "This code defines a class with an __init__ method, taking parameters such as num_classes, in_channels, loss_cfg, drop_ratio, std, data_format, and fclr5. It initializes the base class and sets the drop_ratio, std, and creates an AdaptiveAvgPool2D object for global pooling performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsn_head.py\":30-53", + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n drop_ratio(float): drop ratio. Default: 0.4.\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n data_format(str): data format of input tensor in ['NCHW', 'NHWC']. Default: 'NCHW'.\n fclr5(bool): Whether to increase the learning rate of the fully connected layer. Default: True\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n drop_ratio=0.4,\n std=0.01,\n data_format=\"NCHW\",\n fclr5=True,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.drop_ratio = drop_ratio\n self.std = std\n # NOTE: global pool performance\n self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)" + }, + { + "comment": "This code initializes and defines a PaddlePaddle neural network head for classification tasks. It includes optional dropout regularization, an FC layer with learnable parameters, and a forward function to process input data. 
The init_weights function is used to set the initial weights of the FC layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsn_head.py\":55-83", + "content": " if self.drop_ratio != 0:\n self.dropout = Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = Linear(\n self.in_channels,\n self.num_classes,\n weight_attr=ParamAttr(learning_rate=5.0 if fclr5 else 1.0,\n regularizer=L2Decay(1e-4)),\n bias_attr=ParamAttr(learning_rate=10.0 if fclr5 else 1.0,\n regularizer=L2Decay(0.0)))\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Normal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.,\n std=self.std)\n def forward(self, x, num_seg=8):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples." + }, + { + "comment": "This code snippet is responsible for processing the input and output of a PPTSN head model. It performs average pooling, reshapes the tensor, calculates the mean along an axis, applies dropout if applicable, reshapes again, and finally passes the result through a fully connected layer to produce scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptsn_head.py\":84-102", + "content": " \"\"\"\n # XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, in_channels]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])\n # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n # x = F.softmax(x) # NOTE remove\n return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/918d2d55-8374-4222-b2e7-d252e0bbbd0f.json b/docs/doc/918d2d55-8374-4222-b2e7-d252e0bbbd0f.json new file mode 100644 index 000000000..904e96a97 --- /dev/null +++ b/docs/doc/918d2d55-8374-4222-b2e7-d252e0bbbd0f.json @@ -0,0 +1,150 @@ +{ + "summary": "The code trains Ma-Net stage 2 models with adjustable learning rates, applies binary cross-entropy loss, and evaluates performance. 
It also performs image processing, ROI operations, and video analysis using pretrained network weights.", + "details": [ + { + "comment": "Import necessary libraries and modules, define custom data loader class, set device to GPU0, and disable static mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":0-33", + "content": "import cv2\nimport paddle\nimport paddle.nn as nn\nimport os\nimport numpy as np\n# from paddle.io import DataLoader\nimport paddle.optimizer as optim\nfrom paddle.vision import transforms\nfrom dataloaders.davis_2017_f import DAVIS2017_Train\nimport dataloaders.custom_transforms_f as tr\nfrom dataloaders.samplers import RandomIdentitySampler\nfrom networks.deeplab import DeepLab\nfrom networks.IntVOS import IntVOS\nfrom networks.loss import Added_BCEWithLogitsLoss, Added_CrossEntropyLoss\nfrom config import cfg\nfrom utils.api import float_, long_, byte_\nfrom utils.meters import AverageMeter\nfrom utils.mask_damaging import damage_masks, mask_damager\nfrom utils.utils import label2colormap\nfrom PIL import Image\nimport random\nimport scipy.misc as sm\nimport time\nimport davisinteractive.robot.interactive_robot as interactive_robot\npaddle.disable_static()\npaddle.device.set_device(\"gpu:0\")\nclass DataLoader(paddle.io.DataLoader):\n def __init__(self,\n dataset,\n batch_size=1,\n shuffle=False," + }, + { + "comment": "The code initializes a DataLoader with parameters. It checks if the dataset contains tuples or lists, then sets return_list accordingly. It initializes the DataLoader using the dataset, batch size, shuffle, etc., and returns a DataLoader object for loading data efficiently.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":34-60", + "content": " sampler=None,\n batch_sampler=None,\n num_workers=0,\n collate_fn=None,\n pin_memory=False,\n drop_last=False,\n timeout=0,\n worker_init_fn=None,\n multiprocessing_context=None,\n generator=None):\n if isinstance(dataset[0], (tuple, list)):\n return_list = True\n else:\n return_list = False\n super().__init__(dataset,\n feed_list=None,\n places=None,\n return_list=return_list,\n batch_sampler=batch_sampler,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last,\n collate_fn=collate_fn,\n num_workers=num_workers,\n use_buffer_reader=True,\n use_shared_memory=False," + }, + { + "comment": "This code initializes a Manager object with options for GPU usage, time budget, result directory, pretrained model, and interactive testing. 
It loads the feature extractor, DeepLab, and the VOS model using provided parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":61-88", + "content": " timeout=timeout,\n worker_init_fn=worker_init_fn)\n if sampler is not None:\n self.batch_sampler.sampler = sampler\nclass Manager(object):\n def __init__(self,\n use_gpu=True,\n time_budget=None,\n save_result_dir=cfg.SAVE_RESULT_DIR,\n pretrained=True,\n interactive_test=False):\n self.save_res_dir = save_result_dir\n self.time_budget = time_budget\n self.feature_extracter = DeepLab(backbone='resnet')\n if pretrained:\n pretrained_dict = paddle.load(cfg.PRETRAINED_MODEL)\n pretrained_dict = pretrained_dict['state_dict']\n self.load_network(self.feature_extracter, pretrained_dict)\n print('load pretrained model successfully.')\n self.model = IntVOS(cfg, self.feature_extracter)\n model_filename = cfg.SAVE_VOS_RESULT_DIR\n pd = paddle.load(model_filename)\n self.load_network(self.model, pd)" + }, + { + "comment": "This code initializes a model and optimizer for training stage 2 of the Ma-Net application. It uses a GPU if specified, sets up training parameters, and initializes transforms to apply data augmentation during training. The model's segment head is trained using Momentum optimization with specified learning rate, momentum, and weight decay values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":90-118", + "content": " print('load stage 1 model from', model_filename)\n self.use_gpu = use_gpu\n if use_gpu:\n self.model = self.model\n ##################################\n def train(self,\n damage_initial_previous_frame_mask=True,\n lossfunc='cross_entropy',\n model_resume=False,\n eval_total=False,\n init_prev=False):\n ###################\n interactor = interactive_robot.InteractiveScribblesRobot()\n self.model.train()\n running_loss = AverageMeter()\n optimizer = optim.Momentum(parameters=[{\n 'params':\n self.model.inter_seghead.parameters()\n }],\n learning_rate=cfg.TRAIN_LR,\n momentum=cfg.TRAIN_MOMENTUM,\n weight_decay=cfg.TRAIN_WEIGHT_DECAY)\n ###################\n composed_transforms = transforms.Compose([\n tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP),\n tr.RandomScale()," + }, + { + "comment": "The code initializes a dataset, applies transformations to the images, and selects the loss function. It then sets up the maximum number of iterations, and keeps track of current iteration and round numbers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":119-144", + "content": " tr.RandomCrop((cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP), 10),\n tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()\n ])\n print('dataset processing...')\n train_dataset = DAVIS2017_Train(root=cfg.DATA_ROOT,\n transform=composed_transforms)\n train_list = train_dataset.seqs\n print('dataset processing finished.')\n if lossfunc == 'bce':\n criterion = Added_BCEWithLogitsLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n elif lossfunc == 'cross_entropy':\n criterion = Added_CrossEntropyLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS,\n cfg.TRAIN_HARD_MINING_STEP)\n else:\n print(\n 'unsupported loss funciton. Please choose from [cross_entropy,bce]'\n )\n max_itr = cfg.TRAIN_TOTAL_STEPS\n step = 0\n round_ = 3\n epoch_per_round = 30" + }, + { + "comment": "The code checks if model resuming is enabled, and if so, loads a saved model from a specific step and updates the current model. 
It then enters a loop where it trains the interaction branch for each round, performing various data transformations like random flipping, scaling, cropping, resizing, and converting to tensor. The training stops after 80,001 steps or if r is not equal to 0 (first round).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":145-169", + "content": " if model_resume:\n saved_model_ = os.path.join(self.save_res_dir,\n 'save_step_75000.pth')\n saved_model_ = paddle.load(saved_model_)\n self.model = self.load_network(self.model, saved_model_)\n step = 75000\n print('resume from step {}'.format(step))\n while step < cfg.TRAIN_TOTAL_STEPS:\n if step > 80001:\n break\n for r in range(round_):\n if r == 0: #### r==0: Train the interaction branch in the first round\n print('start new')\n global_map_tmp_dic = {}\n train_dataset.transform = transforms.Compose([\n tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP),\n tr.RandomScale(),\n tr.RandomCrop(\n (cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP)),\n tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()" + }, + { + "comment": "The code initializes a dataset, creates a data loader with a random sampler, and adjusts the learning rate. It then loops through the dataset for a specified number of epochs, accessing relevant sample features and labels. The length of the dataset and data loader are printed before training begins.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":170-190", + "content": " ])\n train_dataset.init_ref_frame_dic()\n trainloader = DataLoader(train_dataset,\n sampler=RandomIdentitySampler(\n train_dataset.sample_list),\n shuffle=False,\n batch_size=cfg.TRAIN_BATCH_SIZE,\n num_workers=0)\n print('round:{} start'.format(r))\n print(len(train_dataset))\n print(len(trainloader))\n for epoch in range(epoch_per_round):\n for ii, sample in enumerate(trainloader):\n now_lr = self._adjust_lr(optimizer, step, max_itr)\n ref_imgs = sample['ref_img'] # batch_size * 3 * h * w\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n seq_names = sample['meta']['seq_name']" + }, + { + "comment": "The code initializes variables and sets up the model for training stage 2. It handles GPU usage, evaluates the model's feature extractor and semantic embedding, and extracts feature embeddings from reference frame images. It then checks if it's processing the first inter-frame instance and calls int_seghead function with reference frame embeddings as input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":191-211", + "content": " obj_nums = sample['meta']['obj_num']\n ref_frame_nums = sample['meta']['ref_frame_num']\n ref_frame_gts = sample['ref_frame_gt']\n bs, _, h, w = ref_imgs.shape\n ##########\n if self.use_gpu:\n inputs = ref_imgs\n ref_scribble_labels = ref_scribble_labels\n ref_frame_gts = ref_frame_gts\n ##########\n with paddle.no_grad():\n self.model.feature_extracter.eval()\n self.model.semantic_embedding.eval()\n ref_frame_embedding = self.model.extract_feature(\n inputs)\n if r == 0:\n first_inter = True\n tmp_dic = self.model.int_seghead(\n ref_frame_embedding=ref_frame_embedding," + }, + { + "comment": "This code snippet seems to be a part of a larger function and appears to involve image classification tasks. 
The code initializes variables such as `ref_scribble_labels`, `prev_round_label`, `normalize_nearest_neighbor_distances`, `global_map_tmp_dic`, `seq_names`, `gt_ids`, `k_nearest_neighbors`, and `frame_num`. The code also checks if a variable named `first_inter` exists, and if not, initializes it as `False` along with `prev_round_label` and performs some operations using the `model.int_seghead()` method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":212-228", + "content": " ref_scribble_label=ref_scribble_labels,\n prev_round_label=None,\n normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic={},\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS,\n frame_num=ref_frame_nums,\n first_inter=first_inter)\n else:\n first_inter = False\n prev_round_label = sample['prev_round_label']\n prev_round_label = prev_round_label\n tmp_dic = self.model.int_seghead(\n ref_frame_embedding=ref_frame_embedding,\n ref_scribble_label=ref_scribble_labels,\n prev_round_label=prev_round_label," + }, + { + "comment": "The code initializes an empty dictionary for label and object dictionaries. It then iterates through the sequence names, assigning the corresponding ground truth frame and object number to each sequence in the label_and_obj_dic dictionary. Next, it iterates through the temporary dictionary keys, interpolating the predicted logits of each sequence to a fixed size (h, w) using bilinear interpolation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":229-246", + "content": " normalize_nearest_neighbor_distances=True,\n global_map_tmp_dic={},\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS,\n frame_num=ref_frame_nums,\n first_inter=first_inter)\n label_and_obj_dic = {}\n label_dic = {}\n for i, seq_ in enumerate(seq_names):\n label_and_obj_dic[seq_] = (ref_frame_gts[i],\n obj_nums[i])\n for seq_ in tmp_dic.keys():\n tmp_pred_logits = tmp_dic[seq_]\n tmp_pred_logits = nn.functional.interpolate(\n tmp_pred_logits,\n size=(h, w),\n mode='bilinear'," + }, + { + "comment": "This code section is responsible for handling label and object dictionaries, preparing the data for different loss functions, and calculating the loss based on the provided data. It also performs necessary tensor conversions and optimizer operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":247-264", + "content": " align_corners=True)\n tmp_dic[seq_] = tmp_pred_logits\n label_tmp, obj_num = label_and_obj_dic[seq_]\n obj_ids = np.arange(0, obj_num + 1)\n obj_ids = paddle.to_tensor(obj_ids)\n obj_ids = paddle.to_tensor(obj_ids, dtype='int64')\n if lossfunc == 'bce':\n label_tmp = label_tmp.permute(1, 2, 0)\n label = (float_(label_tmp) == float_(obj_ids))\n label = label.unsqueeze(-1).permute(3, 2, 0, 1)\n label_dic[seq_] = float_(label)\n elif lossfunc == 'cross_entropy':\n label_dic[seq_] = long_(label_tmp)\n loss = criterion(tmp_dic, label_dic, step)\n loss = loss / bs\n optimizer.clear_grad()" + }, + { + "comment": "Updating the running loss and printing details, including step, current learning rate, and loss values. 
Visualizing reference image and ground truth frame, and converting scribble labels to color maps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":265-286", + "content": " loss.backward()\n optimizer.step()\n running_loss.update(loss.item(), bs)\n if step % 50 == 0:\n print(\n 'step:{},now_lr:{} ,loss:{:.4f}({:.4f})'.format(\n step, now_lr, running_loss.val,\n running_loss.avg))\n show_ref_img = ref_imgs.numpy()[0]\n mean = np.array([[[0.485]], [[0.456]], [[0.406]]])\n sigma = np.array([[[0.229]], [[0.224]], [[0.225]]])\n show_ref_img = show_ref_img * sigma + mean\n show_gt = ref_frame_gts[0].squeeze(0).numpy()\n show_gtf = label2colormap(show_gt).transpose(\n (2, 0, 1))\n show_scrbble = ref_scribble_labels[0].squeeze(\n 0).numpy()" + }, + { + "comment": "This code is handling the visualization of labels and predictions. If r is not zero, it retrieves the previous round label, maps it to a color map, and transposes it for visualization. If r is zero, it creates a zero-filled array for the previous round label. The final step is getting the predictions for the first sequence name, interpolating them to fit the image size, and preparing them for visualization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":287-305", + "content": " show_scrbble = label2colormap(\n show_scrbble).transpose((2, 0, 1))\n if r != 0:\n show_prev_round_label = prev_round_label[\n 0].squeeze(0).numpy()\n show_prev_round_label = label2colormap(\n show_prev_round_label).transpose((2, 0, 1))\n else:\n show_prev_round_label = np.zeros_like(show_gt)\n show_prev_round_label = label2colormap(\n show_prev_round_label).transpose((2, 0, 1))\n ##########\n show_preds = tmp_dic[seq_names[0]]\n show_preds = nn.functional.interpolate(\n show_preds,\n size=(h, w),\n mode='bilinear'," + }, + { + "comment": "This code is segmenting an image by applying a binary cross-entropy or cross-entropy loss function to the output of a PaddlePaddle neural network. The resulting segmentation map is stored in 'show_preds_s' after being converted to a numpy array and then transformed using the 'label2colormap' function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":306-323", + "content": " align_corners=True)\n show_preds = show_preds.squeeze(0)\n if lossfunc == 'bce':\n show_preds = show_preds[1:]\n show_preds = (\n paddle.nn.functional.sigmoid(show_preds) >\n 0.5)\n marker = paddle.argmax(show_preds, axis=0)\n show_preds_s = paddle.zeros((h, w))\n for i in range(show_preds.size(0)):\n tmp_mask = (marker\n == i) & (show_preds[i] > 0.5)\n show_preds_s[tmp_mask] = i + 1\n elif lossfunc == 'cross_entropy':\n show_preds_s = paddle.argmax(show_preds, axis=0)\n show_preds_s = show_preds_s.numpy()\n show_preds_sf = label2colormap(" + }, + { + "comment": "This code block is responsible for saving the network at certain intervals during training. It checks if the current step is a multiple of 20,000 and not the first step, then calls save_network function to store the model's parameters. The model is also evaluated on the trainset and its performance might be influenced by the cfg.TRAIN_INTER_USE_TRUE_RESULT flag which determines whether to use the true result for evaluation. 
This block also resets transforms of the traindataset at specific rounds (r != round_-1).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":324-349", + "content": " show_preds_s).transpose((2, 0, 1))\n pix_acc = np.sum(show_preds_s == show_gt) / (h * w)\n ###########TODO\n if step % 20000 == 0 and step != 0:\n self.save_network(self.model, step)\n step += 1\n print('trainset evaluating...')\n print('*' * 100)\n if cfg.TRAIN_INTER_USE_TRUE_RESULT:\n if r != round_ - 1:\n if r == 0:\n prev_round_label_dic = {}\n self.model.eval()\n with paddle.no_grad():\n round_scribble = {}\n frame_num_dic = {}\n train_dataset.transform = transforms.Compose(\n [tr.Resize(cfg.DATA_RESCALE),\n tr.ToTensor()])\n trainloader = DataLoader(" + }, + { + "comment": "The code is initializing a data loader for training stage 2. It uses the RandomIdentitySampler with the train_dataset, sets shuffle to False, batch size to 1, and num_workers to 0. Then it iterates through the trainloader, extracting ref_imgs, img1s, img2s, ref_scribble_labels, label1s, label2s, and seq_names from each sample.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":350-366", + "content": " train_dataset,\n sampler=RandomIdentitySampler(\n train_dataset.sample_list),\n shuffle=False,\n batch_size=1,\n num_workers=0)\n for ii, sample in enumerate(trainloader):\n ref_imgs = sample[\n 'ref_img'] # batch_size * 3 * h * w\n img1s = sample['img1']\n img2s = sample['img2']\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n label1s = sample['label1']\n label2s = sample['label2']\n seq_names = sample['meta']['seq_name']\n obj_nums = sample['meta']['obj_num']" + }, + { + "comment": "This code segment is a part of an image processing and scribble labeling task. It extracts the frame numbers from sample metadata, concatenates reference images, img1, and img2. It applies rough ROI (region of interest) operation on ref_scribble_labels if r equals 0. Then, it processes the label1s, creating a tensor for scribble labels to be used in the model's input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":367-385", + "content": " frame_nums = sample['meta']['frame_num']\n bs, _, h, w = img2s.shape\n inputs = paddle.concat((ref_imgs, img1s, img2s),\n 0)\n if r == 0:\n ref_scribble_labels = self.rough_ROI(\n ref_scribble_labels)\n print(seq_names[0])\n label1s_tocat = None\n for i in range(bs):\n l = label1s[i]\n l = l.unsqueeze(0)\n l = mask_damager(l, 0.0)\n l = paddle.to_tensor(l)\n l = l.unsqueeze(0).unsqueeze(0)\n if label1s_tocat is None:\n label1s_tocat = float_(l)" + }, + { + "comment": "This code is part of a machine learning model training process. It appears to be concatenating label data (label1s_tocat) and checking if GPU usage is required. The model then processes input data, reference scribble labels, and labels (label1s) to produce outputs (tmp_dic). 
The specific output used is determined by the variable 'pred_label'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":386-405", + "content": " else:\n label1s_tocat = paddle.concat(\n (label1s_tocat, float_(l)), 0)\n label1s = label1s_tocat\n if self.use_gpu:\n inputs = inputs\n ref_scribble_labels = ref_scribble_labels\n label1s = label1s\n tmp_dic, global_map_tmp_dic = self.model(\n inputs,\n ref_scribble_labels,\n label1s,\n seq_names=seq_names,\n gt_ids=obj_nums,\n k_nearest_neighbors=cfg.KNNS,\n global_map_tmp_dic=global_map_tmp_dic,\n frame_num=frame_nums)\n pred_label = tmp_dic[" + }, + { + "comment": "The code is performing inference for an image classification task. It detaches, interpolates and converts the predicted label tensor to obtain the final prediction. The try-except block handles potential errors when applying a function called \"damage_masks\" on the prediction label. Finally, it applies the \"interact\" function from a class called \"interactor\" on the sequence with name \"seq_names[0]\" using numpy arrays for prediction and ground truth labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":406-422", + "content": " seq_names[0]].detach().cpu()\n pred_label = nn.functional.interpolate(\n pred_label,\n size=(h, w),\n mode='bilinear',\n align_corners=True)\n pred_label = paddle.argmax(pred_label, axis=1)\n pred_label = pred_label.unsqueeze(0)\n try:\n pred_label = damage_masks(pred_label)\n except:\n pred_label = pred_label\n pred_label = pred_label.squeeze(0)\n round_scribble[\n seq_names[0]] = interactor.interact(\n seq_names[0], pred_label.numpy(),\n float_(label2s).squeeze(0).numpy()," + }, + { + "comment": "This code opens an image and resizes the prediction label to match the image's height and width. Then, it updates the reference frame and label in the training dataset for a specific sequence name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":423-438", + "content": " obj_nums)\n frame_num_dic[seq_names[0]] = frame_nums[0]\n pred_label = pred_label.unsqueeze(0)\n img_ww = Image.open(\n os.path.join(cfg.DATA_ROOT,\n 'JPEGImages/480p/',\n seq_names[0], '00000.jpg'))\n img_ww = np.array(img_ww)\n or_h, or_w = img_ww.shape[:2]\n pred_label = paddle.nn.functional.interpolate(\n float_(pred_label), (or_h, or_w),\n mode='nearest')\n prev_round_label_dic[\n seq_names[0]] = pred_label.squeeze(0)\n train_dataset.update_ref_frame_and_label(\n round_scribble, frame_num_dic, prev_round_label_dic)" + }, + { + "comment": "This code segment appears to be a part of a training process for a video analysis model. It's updating the reference frame and label based on the current round, possibly in a round-based training loop. 
The `RandomIdentitySampler` seems to be used to load the data for this specific round.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":440-461", + "content": " print(f'round {r}', 'trainset evaluating finished!')\n print('*' * 100)\n self.model.train()\n print('updating ref frame and label')\n train_dataset.transform = composed_transforms\n print('updating ref frame and label finished!')\n else:\n if r != round_ - 1:\n round_scribble = {}\n if r == 0:\n prev_round_label_dic = {}\n frame_num_dic = {}\n train_dataset.transform = tr.ToTensor()\n trainloader = DataLoader(train_dataset,\n sampler=RandomIdentitySampler(\n train_dataset.sample_list),\n shuffle=False,\n batch_size=1,\n num_workers=0)" + }, + { + "comment": "This code is in the \"train_stage2.py\" file of PaddleVideo's Ma-Net application, and it prepares for training by setting the model to evaluation mode, disabling gradient tracking with paddle.no_grad(), iterating over training data using trainloader, and extracting necessary samples. It also applies a mask_damager to label2s with 0.1 intensity to potentially improve model performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":463-480", + "content": " self.model.eval()\n with paddle.no_grad():\n for ii, sample in enumerate(trainloader):\n ref_imgs = sample[\n 'ref_img'] # batch_size * 3 * h * w\n img1s = sample['img1']\n img2s = sample['img2']\n ref_scribble_labels = sample[\n 'ref_scribble_label'] # batch_size * 1 * h * w\n label1s = sample['label1']\n label2s = sample['label2']\n seq_names = sample['meta']['seq_name']\n obj_nums = sample['meta']['obj_num']\n frame_nums = sample['meta']['frame_num']\n bs, _, h, w = img2s.shape\n print(seq_names[0])\n label2s_ = mask_damager(label2s, 0.1)" + }, + { + "comment": "This code updates the reference frame and label for the train_dataset, sets the model to training mode, and prints progress messages. The interactor is used to interact with the first sequence's data and update the round_scribble variable. Label2s and frame_nums are used in this process, and prev_round_label_dic stores the previous round's label for future reference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":481-499", + "content": " round_scribble[\n seq_names[0]] = interactor.interact(\n seq_names[0],\n np.expand_dims(label2s_, axis=0),\n float_(label2s).squeeze(0).numpy(),\n obj_nums)\n label2s__ = paddle.to_tensor(label2s_)\n frame_num_dic[seq_names[0]] = frame_nums[0]\n prev_round_label_dic[seq_names[0]] = label2s__\n print(f'round {r}', 'trainset evaluating finished!')\n print('*' * 100)\n print('updating ref frame and label')\n train_dataset.update_ref_frame_and_label(\n round_scribble, frame_num_dic, prev_round_label_dic)\n self.model.train()\n train_dataset.transform = composed_transforms" + }, + { + "comment": "This function rough_ROI takes ref_scribble_labels as input and performs a region of interest (ROI) operation. It iterates over each batch element, identifies the valid non-background regions, calculates the minimum and maximum coordinates within these regions, and then creates a filter mask. The filter mask is used to selectively copy the ref_scribble_labels into final_scribble_labels for further processing. 
The operation ensures that only relevant regions are considered, improving efficiency.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":500-524", + "content": " print('updating ref frame and label finished!')\n #############################################\n def rough_ROI(self, ref_scribble_labels):\n #### b*1*h*w\n dist = 15\n b, _, h, w = ref_scribble_labels.shape\n filter_ = paddle.zeros_like(ref_scribble_labels)\n to_fill = paddle.zeros_like(ref_scribble_labels)\n for i in range(b):\n no_background = (ref_scribble_labels[i] != -1)\n no_background = no_background.squeeze(0)\n no_b = no_background.nonzero()\n h_min, w_min = paddle.min(no_b, 0) # fixed\n h_max, w_max = paddle.max(no_b, 0) # fixed\n filter_[i, 0,\n max(h_min - dist, 0):min(h_max + dist, h - 1),\n max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1\n final_scribble_labels = paddle.where(byte_(filter_),\n ref_scribble_labels,\n to_fill) # uint8_ fixed." + }, + { + "comment": "The code defines three functions: \n1. `train_stage2.py:525-555`: The `return final_scribble_labels` statement concludes the function, returning the final scribble labels after some operations on them.\n2. `load_network`: Loads pretrained weights into a network by matching the keys in the pretrained dictionary with those in the model dictionary and updating the state dict accordingly.\n3. `save_network`: Saves the current state of the network at a specified step to a given directory, creating the directory if it doesn't exist.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":525-555", + "content": " return final_scribble_labels\n def load_network(self, net, pretrained_dict):\n # pretrained_dict = pretrained_dict\n model_dict = net.state_dict()\n # 1. filter out unnecessary keys\n pretrained_dict = {\n k: v\n for k, v in pretrained_dict.items() if k in model_dict\n }\n # 2. overwrite entries in the existing state dict\n # for k in model_dict:\n # if k not in pretrained_dict:\n # print(k, 'not in loaded weights.')\n model_dict.update(pretrained_dict)\n net.set_state_dict(model_dict)\n return net\n def save_network(self, net, step):\n save_path = self.save_res_dir\n if not os.path.exists(save_path):\n os.makedirs(save_path)\n save_file = 'save_step_%s.pth' % (step)\n paddle.save(net.state_dict(), os.path.join(save_path, save_file))\n def _adjust_lr(self, optimizer, itr, max_itr):\n now_lr = cfg.TRAIN_LR * (1 - itr / (max_itr + 1))**cfg.TRAIN_POWER\n optimizer._param_groups[0]['lr'] = now_lr" + }, + { + "comment": "The code defines a color palette with 81 RGB colors, ranging from black (0, 0, 0) to white (255, 255, 255). 
Each value represents the color's red, green, and blue components.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":556-572", + "content": " return now_lr\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0,\n 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43,\n 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75,\n 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81," + }, + { + "comment": "This code represents a list of numbers ranging from 81 to 150.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":573-585", + "content": " 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94,\n 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120,\n 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145,\n 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150," + }, + { + "comment": "This code appears to be a sequence of numbers, and it is not clear what the purpose or functionality of this specific section of code is without further context.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":586-598", + "content": " 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160,\n 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185,\n 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 
202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210,\n 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215," + }, + { + "comment": "The code snippet initializes a Manager object and calls its train() method. The list of numbers represents the dimensions of an image, potentially used for resizing or preprocessing during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/train_stage2.py\":599-611", + "content": " 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225,\n 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250,\n 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\nmanager = Manager()\nmanager.train()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/91c7470d-ffc8-4732-bca4-350b25078914.json b/docs/doc/91c7470d-ffc8-4732-bca4-350b25078914.json new file mode 100644 index 000000000..c7ead9d4c --- /dev/null +++ b/docs/doc/91c7470d-ffc8-4732-bca4-350b25078914.json @@ -0,0 +1,35 @@ +{ + "summary": "DistributedShortSampler streamlines distributed data loading, dynamic batch sizes, and GPU support for PaddleVideo's multigrid. It efficiently calculates average batch size and offers sample dropping options.", + "details": [ + { + "comment": "The code defines a DistributedShortSampler class which is a sampler for restricting data loading to a subset of the dataset in distributed training. It allows each process to load exclusive subsets by passing the DistributedBatchSampler as a DataLoader sampler and supports dynamic batch size changes following short cycle schedules. The class takes in a dataset, batch_sizes list, and optionally num_replicas (process number in distributed training).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/short_sampler.py\":0-27", + "content": "from __future__ import print_function\nfrom __future__ import division\nimport numpy as np\nimport math\nimport paddle\n__all__ = [\"DistributedShortSampler\"]\nclass DistributedShortSampler(paddle.io.BatchSampler):\n \"\"\"Sampler that restricts data loading to a subset of the dataset.\n In such case, each process can pass a DistributedBatchSampler instance\n as a DataLoader sampler, and load a subset of the original dataset that\n is exclusive to it.\n .. note::\n Batch size is dynamic changed following short cycle schedule.\n Args:\n dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement\n or other python object which implemented\n `__len__` for BatchSampler to get sample\n number of data source.\n batch_sizes(list): batch size list of one cycle.\n num_replicas(int, optional): porcess number in distributed training.\n If :attr:`num_replicas` is None, :attr:`num_replicas` will be\n retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`." 
+ }, + { + "comment": "The `__init__` method initializes an instance of the class with a dataset, batch sizes, number of replicas (optional), rank (optional), whether to shuffle indices (optional), and whether to drop last incomplete batch (optional). The batch_sizes should be positive integers. The method performs assertions on the inputs to ensure validity.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/short_sampler.py\":28-50", + "content": " Default None.\n rank(int, optional): the rank of the current process among :attr:`num_replicas`\n processes. If :attr:`rank` is None, :attr:`rank` is retrieved from\n :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None.\n shuffle(bool): whther to shuffle indices order before genrating\n batch indices. Default False.\n drop_last(bool): whether drop the last incomplete batch dataset size\n is not divisible by the batch size. Default False\n \"\"\"\n def __init__(self,\n dataset,\n batch_sizes,\n num_replicas=None,\n rank=None,\n shuffle=False,\n drop_last=False):\n self.dataset = dataset\n assert any(isinstance(batch_size, int) and batch_size > 0 for batch_size in batch_sizes), \\\n \"batch_size should be a positive integer\"\n self.batch_sizes = batch_sizes\n self.len_batch_sizes = len(self.batch_sizes)\n assert isinstance(shuffle, bool), \\" + }, + { + "comment": "The code initializes a MultigridSampler object, which manages the sampling of data across multiple ranks in distributed training. It checks for valid input values (boolean for shuffle and drop_last) and ensures positive integer for num_replicas. It determines the number of ranks and local rank based on provided values or environment. The total number of samples is calculated based on the dataset size and number of ranks, and an array of indices is created.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/short_sampler.py\":51-78", + "content": " \"shuffle should be a boolean value\"\n self.shuffle = shuffle\n assert isinstance(drop_last, bool), \\\n \"drop_last should be a boolean number\"\n if num_replicas is not None:\n assert isinstance(num_replicas, int) and num_replicas > 0, \\\n \"num_replicas should be a positive integer\"\n self.nranks = num_replicas\n else:\n self.nranks = paddle.distributed.ParallelEnv().nranks\n if rank is not None:\n assert isinstance(rank, int) and rank >= 0, \\\n \"rank should be a non-negative integer\"\n self.local_rank = rank\n else:\n self.local_rank = paddle.distributed.ParallelEnv().local_rank\n self.drop_last = drop_last\n self.epoch = 0\n self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))\n self.total_size = self.num_samples * self.nranks\n def __iter__(self):\n num_samples = len(self.dataset)\n indices = np.arange(num_samples).tolist()" + }, + { + "comment": "This code ensures that the number of samples selected is equal to the total size, and then subsamples them by batch sizes. 
It handles the last batch with potentially fewer samples due to modulo operations and shuffles the indices if desired.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/short_sampler.py\":79-101", + "content": " indices += indices[:(self.total_size -\n len(indices))] #completion last iter\n assert len(indices) == self.total_size\n if self.shuffle:\n np.random.RandomState(self.epoch).shuffle(indices)\n self.epoch += 1\n # subsample\n def _get_indices_by_batch_size(indices):\n total_batch_size = sum(self.batch_sizes)\n subsampled_indices = []\n last_batch_size = self.total_size % (\n total_batch_size * self.nranks) #number samples of last batch\n assert last_batch_size % self.nranks == 0\n last_local_batch_size = last_batch_size // self.nranks\n for i in range(self.local_rank * total_batch_size,\n len(indices) - last_batch_size,\n total_batch_size * self.nranks):\n subsampled_indices.extend(indices[i:i + total_batch_size])\n indices = indices[len(indices) - last_batch_size:]\n subsampled_indices.extend(" + }, + { + "comment": "This code is responsible for creating a sampler that supports dynamic batch sizes. It first sub-samples the input indices based on the local rank and local batch size. Then, it handles cases with multiple GPUs (ranks > 1), dividing the indices into batches of uniform size. Finally, it yields these batches until all samples have been used, or if the drop_last flag is set to False, it yields remaining samples even if they don't form a full batch. The average batch size is also calculated and stored in the class variable avg_batch_size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/short_sampler.py\":102-129", + "content": " indices[self.local_rank *\n last_local_batch_size:(self.local_rank + 1) *\n last_local_batch_size])\n return subsampled_indices\n if self.nranks > 1:\n indices = _get_indices_by_batch_size(indices)\n assert len(indices) == self.num_samples #index length in each card\n _sample_iter = iter(indices)\n batch_indices = []\n counter = 0\n batch_size = self.batch_sizes[0]\n for idx in _sample_iter:\n batch_indices.append(\n (idx, counter %\n self.len_batch_sizes)) #to be used in dataloader get_item\n if len(batch_indices) == batch_size:\n yield batch_indices\n counter += 1\n batch_size = self.batch_sizes[counter % self.len_batch_sizes]\n batch_indices = []\n if not self.drop_last and len(batch_indices) > 0:\n yield batch_indices\n def __len__(self):\n avg_batch_size = sum(self.batch_sizes) / float(self.len_batch_sizes)" + }, + { + "comment": "This code defines a class for a sampler that can be used with PaddleVideo's multigrid. It calculates the number of samples to return based on batch size and either rounds down or up depending on whether drop_last is set. The set_epoch method sets the epoch number and, when shuffle is True, uses it as seeds for random numbers. This can result in the same ordering being yielded at all epochs if the same number is set each time.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/short_sampler.py\":130-145", + "content": " if self.drop_last:\n return int(np.floor(self.num_samples / avg_batch_size))\n else:\n return int(np.ceil(self.num_samples / avg_batch_size))\n def set_epoch(self, epoch):\n \"\"\"\n Sets the epoch number. When :attr:`shuffle=True`, this number is used\n as seeds of random numbers. 
By default, users may not set this, all\n replicas (workers) use a different random ordering for each epoch.\n If set same number at each epoch, this sampler will yield the same\n ordering at all epoches.\n Arguments:\n epoch (int): Epoch number.\n \"\"\"\n self.epoch = epoch" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9207f54d-f34e-430b-b1ba-3c78b87b0be4.json b/docs/doc/9207f54d-f34e-430b-b1ba-3c78b87b0be4.json new file mode 100644 index 000000000..5f57d69ec --- /dev/null +++ b/docs/doc/9207f54d-f34e-430b-b1ba-3c78b87b0be4.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a PaddlePaddle base class for semi-Video Object Segmentation, with methods for training, validating, testing, and inference, determined by the mode argument.", + "details": [ + { + "comment": "This code is a base class for semi-Video Object Segmentation in PaddlePaddle. It requires subclasses to overwrite training, validation, and testing forward methods. The class also includes backbone modules for feature extraction and head modules for processing features, with specified loss functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py\":1-29", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseSegment(nn.Layer):\n \"\"\"Base class for semi-Video Object Segmentation.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Head to process feature.\n loss(dict): Loss function." + }, + { + "comment": "The code initializes a model by building the backbone, head, and loss components. In the forward function, it defines how the model processes input data for training or inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py\":30-58", + "content": " \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n if backbone != None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head != None:\n self.head_name = head.name\n if head.name == 'IntVOS':\n head.update({'feature_extracter': self.backbone})\n self.head = builder.build_head(head)\n else:\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n if loss != None:\n self.loss = builder.build_loss(loss)\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer', **kwargs):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. 
Console of train, valid, test or infer step" + }, + { + "comment": "This code defines a class with different step methods for training, validating, testing, and inference. The mode argument determines which method to execute based on the current task. Each step method is marked as an abstractmethod requiring subclass implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py\":59-94", + "content": " 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch, **kwargs)\n elif mode == 'valid':\n return self.val_step(data_batch, **kwargs)\n elif mode == 'test':\n return self.test_step(data_batch, **kwargs)\n elif mode == 'infer':\n return self.infer_step(data_batch, **kwargs)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/929ed878-ca6a-465e-bfb5-f439a1a19297.json b/docs/doc/929ed878-ca6a-465e-bfb5-f439a1a19297.json new file mode 100644 index 000000000..48fc19509 --- /dev/null +++ b/docs/doc/929ed878-ca6a-465e-bfb5-f439a1a19297.json @@ -0,0 +1,35 @@ +{ + "summary": "The script uses PyQt5 to create a video processing GUI with functions for initializing variables, opening file dialogs, handling combo box indexing, and pen color changes. It also includes a `open_frame` function that updates the progress slider and stops the video at the last frame.", + "details": [ + { + "comment": "This code is the initial part of a Python script for building a GUI (Graphical User Interface) application using PyQt5 library. It defines a class called BuildGUI that inherits from QMainWindow and Ui_MainWindow, which likely contains the layout and design elements of the GUI. The __init__ method sets up some initial variables such as the selected video path and save path for results. 
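Before the GUI module below, a tiny self-contained stand-in (not from the patch; the class and its trivial step methods are placeholders) for the mode-based dispatch that `BaseSegment.forward` implements above:

class TinySegment:
    """Placeholder illustrating BaseSegment-style dispatch on the `mode` argument."""

    def forward(self, data_batch, mode='infer'):
        steps = {'train': self.train_step, 'valid': self.val_step,
                 'test': self.test_step, 'infer': self.infer_step}
        if mode not in steps:
            raise NotImplementedError(mode)
        return steps[mode](data_batch)

    def train_step(self, data_batch):
        return {'loss': 0.0}

    def val_step(self, data_batch):
        return {'metric': 0.0}

    def test_step(self, data_batch):
        return {'pred': data_batch}

    def infer_step(self, data_batch):
        return {'pred': data_batch}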
The infer method is meant to start an inference process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/build_gui.py\":0-35", + "content": "# Author: Acer Zhang\n# Datetime:2022/1/11 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport json\nimport os\nimport numpy as np\nfrom PIL import Image\nfrom PyQt5 import QtCore, QtWidgets\nfrom PyQt5.QtGui import *\nfrom PyQt5.QtWidgets import *\nfrom PyQt5.QtCore import *\nimport cv2\nfrom EIVideo.api import json2frame, png2json, load_video\nfrom EIVideo.main import main\n# ToDo To AP-kai: \u8fd9\u662f\u5b9a\u4e49\u524d\u7aef\u4e34\u65f6\u4fdd\u5b58\u7528\u4e8e\u63a8\u7406\u7684json\u7684\u5730\u70b9\u4e4b\u7c7b\u7684\uff0c\u56e0\u4e3a\u662f\u56fa\u5b9a\u7684\uff0c\u6240\u4ee5\u58f0\u660e\u4e3a\u5168\u5c40\u5e38\u91cf\u662f\u6700\u597d\u7684\nfrom EIVideo import TEMP_JSON_SAVE_PATH, TEMP_IMG_SAVE_PATH, TEMP_JSON_FINAL_PATH\nfrom QEIVideo.gui.ui_main_window import Ui_MainWindow\nclass BuildGUI(QMainWindow, Ui_MainWindow):\n def __init__(self):\n super(BuildGUI, self).__init__()\n # ToDo To AP-kai: \u8fd9\u91cc\u5b9a\u4e49\u5f53\u524d\u9009\u62e9\u7684\u89c6\u9891\u8def\u5f84\u7684\u5360\u4f4d\u7b26\uff0c\u76f8\u5f53\u4e8e\u5168\u5c40\u53d8\u91cf\n self.select_video_path = None\n # ToDo To AP-kai: \u672a\u6765\u4e3a\u7528\u6237\u63d0\u4f9b\u4e2a\u4fdd\u5b58\u8def\u5f84\u7684\u5165\u53e3\u54c8\uff0c\u8fd9\u91cc\u5148\u968f\u610f\u5b9a\u4e49\u4e86\u4e2a\u8def\u5f84\n self.save_path = \"./result\"\n os.makedirs(self.save_path, exist_ok=True)\n self.setupUi(self)\n def infer(self):\n self.label.setText(\"Start infer\")" + }, + { + "comment": "The code snippet sets the progress bar value, saves an image, prints frame numbers, calls a main function to perform inference, loads JSON frames, opens a frame, and updates a label when play button is clicked.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/build_gui.py\":36-58", + "content": " self.progressBar.setProperty(\"value\", 0)\n image = self.paintBoard.get_content_as_q_image()\n image.save(TEMP_IMG_SAVE_PATH)\n print(self.slider_frame_num)\n self.progressBar.setProperty(\"value\", 25)\n # ToDo To AP-kai:\u76f8\u540c\u7684\u6587\u4ef6\u8def\u5f84\uff0c\u76f4\u63a5\u5b9a\u4e49\u4e00\u4e2a\u5e38\u91cf\u5c31\u597d\n png2json(TEMP_IMG_SAVE_PATH, self.slider_frame_num, TEMP_JSON_SAVE_PATH)\n self.progressBar.setProperty(\"value\", 50)\n # ToDo To AP-kai:\u6253\u5370\u7684\u4fe1\u606f\uff0c\u9700\u8981\u6ce8\u610f\u9996\u5b57\u6bcd\u5927\u5199\n # ToDo To AP-kai: \u6b64\u5904\u4f20\u5165\u4fdd\u5b58\u8def\u5f84\u4ee5\u53ca\u5f53\u524d\u9009\u62e9\u7684\u89c6\u9891\u8def\u5f84\uff0c\u6700\u540e\u4f1a\u5728manet_stage1.py\u91cc\u901a\u8fc7cfg\u6765\u4f20\u5165\n out = main(video_path=self.select_video_path, save_path=self.save_path)\n print('Infer ok')\n self.progressBar.setProperty(\"value\", 75)\n self.all_frames = json2frame(TEMP_JSON_FINAL_PATH)\n print(\"Success get submit_masks\")\n self.open_frame()\n self.progressBar.setProperty(\"value\", 100)\n self.label.setText(\"Infer succeed\")\n def btn_func(self, btn):\n if btn == self.playbtn:\n self.label.setText(\"Play video\")\n if self.progress_slider.value() == self.cap.get(7) - 1:" + }, + { + "comment": "The code above contains three elif conditions for button press events. If the self.pushButton_1 is pressed, it initializes variables and starts a timer to update the video frame. 
If self.pushButton_2 is pressed, it stops the video and sets the label text to \"Stop video\". If self.pushButton_4 is pressed, it opens a file dialog for choosing a video file, and if a file is chosen, it prints the selected video file path. The current code is checking if there is a non-empty selected video file path after the file dialog is closed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/build_gui.py\":59-77", + "content": " self.slider_frame_num = 0\n self.progress_slider.setValue(self.slider_frame_num)\n self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7)))\n self.timer_camera = QTimer() # \u5b9a\u4e49\u5b9a\u65f6\u5668\n self.timer_camera.start(1000 / self.cap.get(cv2.CAP_PROP_FPS))\n self.slider_frame_num = self.progress_slider.value()\n self.timer_camera.timeout.connect(self.open_frame)\n elif btn == self.pushButton_2:\n self.label.setText(\"Stop video\")\n self.slot_stop()\n elif btn == self.pushButton_4:\n self.label.setText(\"Choose video\")\n self.select_video_path, _ = QFileDialog.getOpenFileName(self.frame, \"Open\", \"\", \"*.mp4;;All Files(*)\")\n print(\"-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\")\n print(\"Select video file path:\\t\" + self.select_video_path)\n # ToDo To AP-kai:\u4e0b\u65ad\u70b9\u6765\u770b\u4e00\u4e0b\uff0c\u5982\u679c\u4e0d\u9009\u62e9\u7684\u65f6\u5019\u8fd4\u56de\u503c\u662f\u4ec0\u4e48\u6837\u7684\uff0c\u7136\u540e\u518d\u505a\u5224\u65ad\uff0c\u76ee\u524d\u8fd9\u4e2aif\u6ca1\u6709\u751f\u6548\n if self.select_video_path != \"\":" + }, + { + "comment": "The code snippet creates a GUI for video processing. It sets up a VideoCapture object, stores all frames, and initializes a progress slider with the total number of frames. The Eraser button toggles between EraserMode on/off in the paintBoard. The fill_color_list function populates a color combo box with predefined colors, including black at a specific index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/build_gui.py\":78-106", + "content": " self.cap = cv2.VideoCapture(self.select_video_path)\n # \u5b58\u6240\u6709frame\n self.save_temp_frame()\n print(\"save temp frame done\")\n self.progress_slider.setRange(0, self.cap.get(cv2.CAP_PROP_FRAME_COUNT))\n self.slider_frame_num = 0\n self.open_frame()\n # ToDo To AP-kai: \u672a\u6765\u8fd9\u4e2a\u5730\u65b9\u589e\u52a0\u63d0\u793a\u6846\uff0c\u544a\u8bc9\u4ed6\u6ca1\u6709\u9009\u62e9\u6587\u4ef6\n def on_cbtn_eraser_clicked(self):\n self.label.setText(\"Eraser On\")\n if self.cbtn_Eraser.isChecked():\n self.paintBoard.EraserMode = True # \u8fdb\u5165\u6a61\u76ae\u64e6\u6a21\u5f0f\n else:\n self.paintBoard.EraserMode = False # \u9000\u51fa\u6a61\u76ae\u64e6\u6a21\u5f0f\n def fill_color_list(self, combo_box):\n index_black = 0\n index = 0\n for color in self.colorList:\n if color == \"black\":\n index_black = index\n index += 1\n pix = QPixmap(70, 20)\n pix.fill(QColor(color))\n combo_box.addItem(QIcon(pix), None)\n combo_box.setIconSize(QSize(70, 20))\n combo_box.setSizeAdjustPolicy(QComboBox.AdjustToContents)" + }, + { + "comment": "This code defines several functions for a GUI application. 
It sets the current index of a combo box, handles pen color changes by updating the paintBoard's color, updates the video position based on progress slider input, saves a temporary frame from a video file, and stops the timer if it is running.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/build_gui.py\":108-134", + "content": " combo_box.setCurrentIndex(index_black)\n def on_pen_color_change(self):\n self.label.setText(\"Change pen color\")\n color_index = self.comboBox_penColor.currentIndex()\n color_str = self.colorList[color_index]\n self.paintBoard.change_pen_color(color_str)\n # \u62d6\u62fd\u8fdb\u5ea6\u6761\n def update_video_position_func(self):\n self.label.setText(\"Change slider position\")\n self.slider_frame_num = self.progress_slider.value()\n self.slot_stop()\n self.open_frame()\n self.progress_slider.setValue(self.slider_frame_num)\n self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7)))\n def save_temp_frame(self):\n _, self.all_frames = load_video(self.select_video_path, 480)\n def slot_stop(self):\n if self.cap != []:\n self.timer_camera.stop() # \u505c\u6b62\u8ba1\u65f6\u5668\n else:\n # ToDo To AP-kai: QMessageBox.warning\u6ca1\u6709\u8fd4\u56de\u503c\uff0c\u8fd9\u91cc\u6211\u628aWarming = QMessageBox.warning\u7684Warming\u5220\u53bb\u4e86\n QMessageBox.warning(self, \"Warming\", \"Push the left upper corner button to Quit.\"," + }, + { + "comment": "This code defines a function `open_frame` which updates the progress slider, displays the current frame using QImage and QPixmap, increments the slider value, sets the time label, and stops the video if at the last frame.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/build_gui.py\":135-150", + "content": " QMessageBox.Yes)\n def open_frame(self):\n self.progress_slider.setValue(self.slider_frame_num)\n self.slider_frame_num = self.progress_slider.value()\n self.frame = self.all_frames[self.slider_frame_num]\n frame = self.frame\n height, width, bytes_per_component = frame.shape\n bytes_per_line = bytes_per_component * width\n q_image = QImage(frame.data, width, height, bytes_per_line,\n QImage.Format_RGB888).scaled(self.picturelabel.width(), self.picturelabel.height())\n self.picturelabel.setPixmap(QPixmap.fromImage(q_image))\n self.slider_frame_num = self.slider_frame_num + 1\n self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7)))\n if self.progress_slider.value() == self.cap.get(7) - 1:\n self.slot_stop()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/92ab2251-9095-46cf-9122-551bb4a0a6c6.json b/docs/doc/92ab2251-9095-46cf-9122-551bb4a0a6c6.json new file mode 100644 index 000000000..1bbcdb6b2 --- /dev/null +++ b/docs/doc/92ab2251-9095-46cf-9122-551bb4a0a6c6.json @@ -0,0 +1,40 @@ +{ + "summary": "The code extracts audio features for Table Tennis prediction, using spectrogram and Mel scale transformation, and reads WAV files with VGG-16 model for MFCC and STFT feature extraction.", + "details": [ + { + "comment": "This code is for audio feature extraction in TableTennis application. It defines functions `frame`, `periodic_hann` and `stft_magnitude`. The `frame` function resizes the data array into frames with specified window length and hop length. The `periodic_hann` function generates a periodic Hann window for the STFT operation. 
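For illustration only, a simplified 1-D equivalent of the stride-trick `frame` function described here (the real implementation uses `np.lib.stride_tricks.as_strided`; `frame_simple` is a hypothetical name):

import numpy as np

def frame_simple(data, window_length, hop_length):
    # Same frame-count formula as feature_extractor.frame, without stride tricks
    num_frames = 1 + int(np.floor((len(data) - window_length) / hop_length))
    return np.stack([data[i * hop_length: i * hop_length + window_length]
                     for i in range(num_frames)])

frame_simple(np.arange(10), window_length=4, hop_length=2).shape  # (4, 4)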
Finally, `stft_magnitude` calculates the magnitude of the Short-Time Fourier Transform (STFT) of an audio signal.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py\":0-38", + "content": "\"\"\"\naudio feature extract\n\"\"\"\n# coding: utf-8\nimport os\nimport numpy as np\nimport pickle\nimport mfcc.vgg_params as vgg_params\nimport sys\ndef frame(data, window_length, hop_length):\n \"\"\"\n frame\n \"\"\"\n num_samples = data.shape[0]\n #print(\"window_length , hop_length\", window_length, hop_length)\n #print(\"num_sample = \", num_samples)\n num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))\n #print(\" num_frames = \", num_frames)\n shape = (num_frames, window_length) + data.shape[1:]\n #print(\" shape = \", shape)\n strides = (data.strides[0] * hop_length, ) + data.strides\n #print(\"data.strides = \", data.strides)\n #print(\"strides = \", strides)\n return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)\ndef periodic_hann(window_length):\n \"\"\"\n periodic_hann\n \"\"\"\n return 0.5 - (0.5 *\n np.cos(2 * np.pi / window_length * np.arange(window_length)))\ndef stft_magnitude(signal, fft_length, hop_length=None, window_length=None):\n \"\"\"\n stft_magnitude" + }, + { + "comment": "The code defines functions for feature extraction and conversion of audio signals. The \"hertz_to_mel\" function converts frequencies from Hertz to Mel scale, which is used in psychoacoustics. The \"spectrogram_to_mel_matrix\" function creates a mel-frequency cepstral coefficients (MFCC) matrix for audio spectrograms. It checks for lower and upper frequency edge validity and calculates Mel frequencies based on the provided parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py\":39-69", + "content": " \"\"\"\n frames = frame(signal, window_length, hop_length)\n window = periodic_hann(window_length)\n windowed_frames = frames * window\n return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))\n_MEL_BREAK_FREQUENCY_HERTZ = 700.0\n_MEL_HIGH_FREQUENCY_Q = 1127.0\ndef hertz_to_mel(frequencies_hertz):\n \"\"\"\n hertz_to_mel\n \"\"\"\n return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz /\n _MEL_BREAK_FREQUENCY_HERTZ))\ndef spectrogram_to_mel_matrix(num_mel_bins=20,\n num_spectrogram_bins=129,\n audio_sample_rate=8000,\n lower_edge_hertz=125.0,\n upper_edge_hertz=3800.0):\n \"\"\"\n spectrogram_to_mel_matrix\n \"\"\"\n nyquist_hertz = audio_sample_rate / 2.\n if lower_edge_hertz >= upper_edge_hertz:\n raise ValueError(\"lower_edge_hertz %.1f >= upper_edge_hertz %.1f\" %\n (lower_edge_hertz, upper_edge_hertz))" + }, + { + "comment": "This code is performing Mel frequency cepstral coefficients (MFCC) feature extraction on audio data. It creates spectrogram bins in Hz, converts them to the mel scale, defines mel band edges, and computes the corresponding mel weights matrix. The function returns this matrix after setting the first row to zero. 
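The Hertz-to-mel mapping reduces to a single formula; the snippet below restates it with the module's own constants and a worked value:

import numpy as np

_MEL_BREAK_FREQUENCY_HERTZ = 700.0
_MEL_HIGH_FREQUENCY_Q = 1127.0

def hertz_to_mel(frequencies_hertz):
    # mel = 1127 * ln(1 + f / 700)
    return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)

hertz_to_mel(1000.0)  # ~1000.0 mel, i.e. 1 kHz maps to roughly 1000 mel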
This process is commonly used for speech processing and analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py\":70-90", + "content": " spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz,\n num_spectrogram_bins)\n spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)\n band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),\n hertz_to_mel(upper_edge_hertz),\n num_mel_bins + 2)\n mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))\n for i in range(num_mel_bins):\n lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]\n lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /\n (center_mel - lower_edge_mel))\n upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /\n (upper_edge_mel - center_mel))\n mel_weights_matrix[:,\n i] = np.maximum(0.0,\n np.minimum(lower_slope, upper_slope))\n mel_weights_matrix[0, :] = 0.0\n return mel_weights_matrix\ndef log_mel_spectrogram(data," + }, + { + "comment": "This code defines a function called `log_mel_spectrogram` which takes audio data, sample rate, and optional keyword arguments. It calculates window length in samples, hop length in samples, FFT length, and then uses the Short-Time Fourier Transform (STFT) to generate a spectrogram from the input audio data. The resulting spectrogram is stored in the `spectrogram` variable and its shape is printed for debugging or reference purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py\":91-111", + "content": " audio_sample_rate=8000,\n log_offset=0.0,\n window_length_secs=0.025,\n hop_length_secs=0.010,\n **kwargs):\n \"\"\"\n log_mel_spectrogram\n \"\"\"\n window_length_samples = int(round(audio_sample_rate * window_length_secs))\n #print(\"audio_sample_rate = \", audio_sample_rate)\n #print(\"window_length_secs = \", window_length_secs)\n #print(\"window_length_sample \", window_length_samples)\n hop_length_samples = int(round(audio_sample_rate * hop_length_secs))\n #print(\"hop_length_samples \", hop_length_samples)\n fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0)))\n #print(\" fft_lengt = \", fft_length)\n spectrogram = stft_magnitude(data,\n fft_length=fft_length,\n hop_length=hop_length_samples,\n window_length=window_length_samples)\n #print(\" spectrogram.shape = \", spectrogram.shape)" + }, + { + "comment": "Function `spectrogram_to_mel_matrix` converts the spectrogram to Mel scale. Code calculates Mel spectrogram by taking dot product of spectrogram with `spectrogram_to_mel_matrix`. The result is then log transformed to avoid numerical underflow and returned.\nThe function `wav_to_example` takes wav file data, validates sample type, pads zeros to achieve desired window length, scales the wav data to range -1 to 1 by dividing by 32768.0. 
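A worked example of the window/hop/FFT arithmetic performed by `log_mel_spectrogram`, assuming the 16 kHz sample rate used in this module's `__main__`:

import numpy as np

sample_rate = 16000
window_length_samples = int(round(sample_rate * 0.025))  # 400 samples per window
hop_length_samples = int(round(sample_rate * 0.010))     # 160 samples per hop
fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))  # 512, next power of two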
It is used for audio feature extraction in TableTennis application of PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py\":112-136", + "content": " mel_spectrogram = np.dot(\n spectrogram,\n spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1],\n audio_sample_rate=audio_sample_rate,\n **kwargs))\n return np.log(mel_spectrogram + log_offset)\ndef wav_to_example(wav_data, sample_rate):\n \"\"\"\n wav_to_example\n \"\"\"\n #sample_rate, wav_data = wavfile.read(wav_file)\n assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype\n #wav_data = wav_data[:16000*30]\n #print(\" wav_data \", wav_data.shape)\n #print(\" wav_data \", wav_data.shape)\n pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS -\n vgg_params.STFT_HOP_LENGTH_SECONDS))\n wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num)))\n wav_data = wav_data_extend\n #print(\" wav_data \", wav_data.shape)\n wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0]\n #print(\" wav_data after convert to -1 1\", wav_data)" + }, + { + "comment": "This code extracts audio features for Table Tennis prediction. It first reshapes and resamples the input wav_data if necessary, then calculates log mel spectrogram from wav_data using given parameters. Finally, it frames these features into examples with a specific window length.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py\":137-157", + "content": " #if wav_data.shape[0] > max_second * sample_rate:\n # wav_data = wav_data[:max_second * sample_rate, :]\n if len(wav_data.shape) > 1:\n wav_data = np.mean(wav_data, axis=1)\n #print(\" wav_data after mean\", wav_data.shape, len(wav_data.shape), wav_data)\n # Resample to the rate assumed by vgg.\n #if sample_rate != vgg_params.SAMPLE_RATE:\n # wav_data = resampy.resample(wav_data, sample_rate, vgg_params.SAMPLE_RATE)\n log_mel = log_mel_spectrogram(\n wav_data,\n audio_sample_rate=vgg_params.SAMPLE_RATE,\n log_offset=vgg_params.LOG_OFFSET,\n window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS,\n hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS,\n num_mel_bins=vgg_params.NUM_MEL_BINS,\n lower_edge_hertz=vgg_params.MEL_MIN_HZ,\n upper_edge_hertz=vgg_params.MEL_MAX_HZ)\n # Frame features into examples.\n features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS\n example_window_length = int(\n round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))" + }, + { + "comment": "This code extracts audio features from a WAV file using the VGG-16 model, specifically focusing on MFCC (Mel Frequency Cepstral Coefficients) and STFT (Short-Time Fourier Transform). The code also defines a function to convert PCM data into examples and another to extract MFCC features. 
Lastly, it demonstrates how to use the code by reading a WAV file and printing its shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py\":159-182", + "content": " example_hop_length = int(\n round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate))\n log_mel_examples = frame(log_mel,\n window_length=example_window_length,\n hop_length=example_hop_length)\n return log_mel_examples\ndef extract_pcm(pcm_file, sample_rate):\n with open(pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = wav_to_example(audio_data, sample_rate)\n return examples\nif __name__ == \"__main__\":\n wav_file = sys.argv[1]\n print(\"wav_file = \", wav_file)\n with open(wav_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples_batch = wav_to_example(audio_data, 16000)\n print(\"examples_batch.shape\", examples_batch.shape)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/92c95f7f-34ab-4b71-809d-39d9cb2e56df.json b/docs/doc/92c95f7f-34ab-4b71-809d-39d9cb2e56df.json new file mode 100644 index 000000000..f5931ebbc --- /dev/null +++ b/docs/doc/92c95f7f-34ab-4b71-809d-39d9cb2e56df.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet contains copyright and license information, imports necessary modules from the PaddlePaddle framework, and defines two classes 'BaseSegment' and 'Manet'. It also specifies that these are the components included in the current module.", + "details": [ + { + "comment": "This code snippet contains copyright and license information, imports necessary modules from the PaddlePaddle framework, and defines two classes 'BaseSegment' and 'Manet'. It also specifies that these are the components included in the current module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py\":0-18", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseSegment\nfrom .manet_stage1 import Manet\n__all__ = [\n 'BaseSegment',\n 'Manet',\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/92e86a04-a8ed-4bb9-b909-a920495c6ac7.json b/docs/doc/92e86a04-a8ed-4bb9-b909-a920495c6ac7.json new file mode 100644 index 000000000..c733e658b --- /dev/null +++ b/docs/doc/92e86a04-a8ed-4bb9-b909-a920495c6ac7.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines the Compose class for video transformation pipelines, composing multiple pipeline elements and handling temporary list-type parameters while including a workaround for old format config files.", + "details": [ + { + "comment": "This code defines the Compose class, which composes multiple pipelines (decode func, sample func, and transforms) together. It registers the class in the PIPELINES registry. 
The code also handles temporary list-type configuration parameters for flexibility.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py\":0-32", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom collections.abc import Sequence\nfrom ..registry import PIPELINES\nimport traceback\nfrom ...utils import build\nfrom ...utils import get_logger\n@PIPELINES.register()\nclass Compose(object):\n \"\"\"\n Composes several pipelines(include decode func, sample func, and transforms) together.\n Note: To deal with ```list``` type cfg temporaray, like:\n transform:\n - Crop: # A list\n attribute: 10" + }, + { + "comment": "This code is creating a Compose class which takes a list of transforms and composes them sequentially. It checks if the input is in the correct format, builds each transform using the build function from PIPELINES, and stores them in a list. The code also includes a workaround for handling old format config files that may have inconsistent key-value pairs in their lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py\":33-60", + "content": " - Resize: # A list\n attribute: 20\n every key of list will pass as the key name to build a module.\n XXX: will be improved in the future.\n Args:\n pipelines (list): List of transforms to compose.\n Returns:\n A compose object which is callable, __call__ for this Compose\n object will call each given :attr:`transforms` sequencely.\n \"\"\"\n def __init__(self, pipelines):\n #assert isinstance(pipelines, Sequence)\n self.pipelines = []\n for p in pipelines.values():\n if isinstance(p, dict):\n p = build(p, PIPELINES)\n self.pipelines.append(p)\n elif isinstance(p, list):\n for t in p:\n #XXX: to deal with old format cfg, ugly code here!\n temp_dict = dict(name=list(t.keys())[0])\n for all_sub_t in t.values():\n if all_sub_t is not None:\n temp_dict.update(all_sub_t) \n t = build(temp_dict, PIPELINES)" + }, + { + "comment": "This code is defining a class for video transformation pipelines. 
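The old-format handling can be seen with the `Crop` example from the class docstring; this standalone snippet (not from the patch) shows the normalization step:

# A list entry in the old config format ...
t = {'Crop': {'attribute': 10}}
# ... is flattened so the single key becomes the module name
temp_dict = dict(name=list(t.keys())[0])
for all_sub_t in t.values():
    if all_sub_t is not None:
        temp_dict.update(all_sub_t)
# temp_dict == {'name': 'Crop', 'attribute': 10}; build(temp_dict, PIPELINES)
# then instantiates the transform, and __call__ applies each stage in order.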
It appends callable functions or dictionaries to the pipeline list and has a __call__ method that applies each pipeline operation to data in sequence, handling exceptions and logging failures if they occur.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py\":61-78", + "content": " self.pipelines.append(t)\n elif callable(p):\n self.pipelines.append(p)\n else:\n raise TypeError('pipelines must be callable or a dict,'\n 'but got {type(p)}')\n def __call__(self, data):\n \"\"\"call\"\"\"\n for p in self.pipelines:\n try:\n data = p(data)\n except Exception as e:\n stack_info = traceback.format_exc()\n logger = get_logger(\"paddlevideo\")\n logger.info(\"fail to perform transform [{}] with error: \"\n \"{} and stack:\\n{}\".format(p, e, str(stack_info)))\n raise e\n return data" + } + ] +} \ No newline at end of file diff --git a/docs/doc/930c71be-937d-41c7-b08e-fa16358f3a52.json b/docs/doc/930c71be-937d-41c7-b08e-fa16358f3a52.json new file mode 100644 index 000000000..113a1e8d7 --- /dev/null +++ b/docs/doc/930c71be-937d-41c7-b08e-fa16358f3a52.json @@ -0,0 +1,70 @@ +{ + "summary": "This code imports libraries, defines a ResNet-TSN model with basic and bottleneck blocks in PaddlePaddle, initializes weights for training, and outputs results.", + "details": [ + { + "comment": "This code is for importing necessary libraries, defining a ResNet-TSN backbone model in PaddlePaddle, and registering it to the BACKBONES registry. It also includes license information and mentions function-level future imports for compatibility and division settings. The code initializes parameters, defines Conv2D, BatchNorm, MaxPool2D, AvgPool2D layers, and sets up weight initialization functions and loading checkpoints utilities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":0-28", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn import Conv2D, BatchNorm\nfrom paddle.nn import MaxPool2D, AvgPool2D\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt" + }, + { + "comment": "Defines a ConvBNLayer class with an average pooling layer and convolutional layer. The class takes input parameters for channels, kernel size, stride, groups, and more. 
It initializes the layers and sets is_tweaks_mode flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":30-57", + "content": "__all__ = [\"ResNetTSN_MRI\"]\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n lr_mult=1.0,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\",\n learning_rate=lr_mult)," + }, + { + "comment": "This code defines a class for Resnet_TSN, which is a type of backbone model. It includes an initialization function that initializes the BatchNorm layer and a forward function that applies pooling (if in tweaks mode), convolution, and batch normalization to inputs. Additionally, there is a BottleneckBlock class defined for creating bottleneck blocks within the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":58-88", + "content": " bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._batch_norm = BatchNorm(\n out_channels,\n act=act,\n param_attr=ParamAttr(name=bn_name + '_scale',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + '_offset',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n moving_mean_name=bn_name + '_mean',\n moving_variance_name=bn_name + '_variance')\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True," + }, + { + "comment": "The code defines a BottleneckBlock class, which is a layer in the ResNet model. It consists of three ConvBNLayer layers with different properties such as kernel size, stride, and activation functions. The class initializes these layers and takes input and output channel counts, learning rate multiplier, and name as parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":89-110", + "content": " if_first=False,\n lr_mult=1.0,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None," + }, + { + "comment": "The code defines a ResNet TSN backbone with two branches, where the first branch contains convolutional layers and the second branch has a shortcut connection. The forward function performs addition between the shortcut connection and the output of the convolutional layers. 
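Condensed sketch of the residual pattern, given an already constructed `BottleneckBlock` instance (attribute names taken from the code; the helper function itself is hypothetical):

import paddle
import paddle.nn.functional as F

def bottleneck_forward(block, inputs):
    # 1x1 -> 3x3 -> 1x1 convolutions, then residual add and ReLU
    y = block.conv2(block.conv1(block.conv0(inputs)))
    short = inputs if block.shortcut else block.short(inputs)
    return F.relu(paddle.add(x=short, y=y))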
The BasicBlock class is a subclass for implementing basic building blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":111-143", + "content": " lr_mult=lr_mult,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n y = F.relu(y)\n return y\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride," + }, + { + "comment": "This code defines a BasicBlock class in PaddleVideo for ResNet TSN MRI model. It has an input, output channels, and stride. The class initializes convolution layers (conv0 and conv1) with specified parameters. If shortcut is not set, it also includes a ConvBNLayer as the 'short' attribute.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":144-166", + "content": " shortcut=True,\n if_first=False,\n lr_mult=1.0,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels," + }, + { + "comment": "This code defines a ResNetTSN_MRI backbone with specified depth, pretrained model option, and learning rate multipliers for each layer. The forward function performs convolutions and shortcut connections, applying ReLU activation at the end. This backbone is registered in BACKBONES for use in the PaddleVideo library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":167-201", + "content": " kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTSN_MRI(nn.Layer):\n \"\"\"ResNetTweaksTSN backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self,\n layers=50,\n pretrained=None,\n lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],\n in_channels=1):\n super(ResNetTSN_MRI, self).__init__()" + }, + { + "comment": "This code initializes a ResNet TSN backbone model with specified layers, in_channels, and pretrained weight option. It supports specific layer options (18, 34, 50, 101, 152, 200) and checks if the input layer is within supported range. The code also ensures lr_mult_list is a list or tuple and has a length of 5. 
Depending on the layers, it assigns depth values for each block in the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":203-231", + "content": " self.pretrained = pretrained\n self.layers = layers\n supported_layers = [18, 34, 50, 101, 152, 200]\n assert layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, layers)\n self.lr_mult_list = lr_mult_list\n self.in_channels = in_channels\n assert isinstance(\n self.lr_mult_list,\n (list, tuple\n )), \"lr_mult_list should be in (list, tuple) but got {}\".format(\n type(self.lr_mult_list))\n assert len(\n self.lr_mult_list\n ) == 5, \"lr_mult_list length should should be 5 but got {}\".format(\n len(self.lr_mult_list))\n if layers == 18:\n depth = [2, 2, 2, 2]\n elif layers == 34 or layers == 50:\n depth = [3, 4, 6, 3]\n elif layers == 101:\n depth = [3, 4, 23, 3]\n elif layers == 152:\n depth = [3, 8, 36, 3]\n elif layers == 200:\n depth = [3, 12, 48, 3]" + }, + { + "comment": "This code defines a ResNet model for Temporal Segment Networks (TSN) with multiple branch inputs. It initializes the layers of the network, including convolutional and batch normalization operations. The number of channels and filters used in each layer depend on the total number of layers specified. Different learning rate multipliers are assigned to each layer for efficient training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":232-252", + "content": " num_channels = [64, 256, 512, 1024\n ] if layers >= 50 else [64, 64, 128, 256]\n num_filters = [64, 128, 256, 512]\n self.conv1_1 = ConvBNLayer(in_channels=self.in_channels,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3," + }, + { + "comment": "Initializing layers of ResNet-TSN with specified depth, creating bottleneck blocks for each layer. If layers are 101, 152 or 200 and block is 2, specific naming convention applied. BottleneckBlock is added as sublayer in a sequential manner.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":253-274", + "content": " stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_3\")\n self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if layers in [101, 152, 200] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BottleneckBlock(\n in_channels=num_channels[block]" + }, + { + "comment": "The code creates a ResNet TSN model with bottleneck and basic blocks. It initializes the block_list by adding each block, sets shortcut to True for the first block of each stage, and appends each block to block_list. The number of filters, out_channels, stride, and other parameters are determined based on the stage and block indexes. 
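The depth selection is equivalent to a simple lookup table (assumed helper, not in the patch):

DEPTH_BY_LAYERS = {
    18: [2, 2, 2, 2],
    34: [3, 4, 6, 3],
    50: [3, 4, 6, 3],
    101: [3, 4, 23, 3],
    152: [3, 8, 36, 3],
    200: [3, 12, 48, 3],
}
depth = DEPTH_BY_LAYERS[50]  # [3, 4, 6, 3] residual blocks per stage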
The name of each block is also specified according to its position in the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":275-293", + "content": " if i == 0 else num_filters[block] * 4,\n out_channels=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n if_first=block == i == 0,\n lr_mult=self.lr_mult_list[block + 1],\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BasicBlock(in_channels=num_channels[block]\n if i == 0 else num_filters[block],\n out_channels=num_filters[block]," + }, + { + "comment": "This code initializes the weights of a ResNet TSN backbone model. It creates blocks with specified parameters and appends them to the block list. The `init_weights` function initializes the parameters based on whether pretrained loading path is indicated or not, following specific initialization functions for Conv2D and BatchNorm2d layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":294-310", + "content": " stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name,\n lr_mult=self.lr_mult_list[block + 1]))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be\n initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n # XXX: check bias!!! check pretrained!!!" + }, + { + "comment": "This code is checking if the pretrained model path is provided and initializing weights for Conv2D and BatchNorm2D layers if not. The forward function performs convolutions, max pooling, and processes through blocks to output a result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py\":312-330", + "content": " if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n # XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2d_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/93ea5b89-273b-4528-9e44-c40bd93ff284.json b/docs/doc/93ea5b89-273b-4528-9e44-c40bd93ff284.json new file mode 100644 index 000000000..f2fe94e0e --- /dev/null +++ b/docs/doc/93ea5b89-273b-4528-9e44-c40bd93ff284.json @@ -0,0 +1,15 @@ +{ + "summary": "This code is an import script for PaddleVideo, registering various modules like backbones, heads, recognizers, localizers, and losses in relevant registries. 
It defines key function names and includes popular models like 'DeepLab' and 'IntVOS'.", + "details": [ + { + "comment": "This code appears to be an import script for PaddleVideo, importing various modules such as backbones, heads, recognizers, localizers, and losses from different parts of the codebase. It also registers these items in relevant registries (e.g., BACKBONES, HEADS) and defines __all__ to include those registered items.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py\":0-22", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .backbones import DeepLab\nfrom .builder import (build_backbone, build_head, build_localizer, build_loss,\n build_recognizer)\nfrom .heads import IntVOS\nfrom .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES,\n PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)\nfrom .weight_init import kaiming_normal_, trunc_normal_, weight_init_\n__all__ = [\n 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS'," + }, + { + "comment": "This code defines several variables and function names used in the PaddleVideo library, including loss functions ('build_loss'), backbone building ('build_backbone'), detectors ('DETECTORS'), and initialization methods ('weight_init_'). It also includes references to popular models such as 'DeepLab' and 'IntVOS'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py\":23-26", + "content": " 'LOSSES', 'build_recognizer', 'build_localizer', 'build_head',\n 'build_backbone', 'build_loss', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_',\n 'weight_init_', 'DeepLab', 'IntVOS'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/94337949-c250-4aa2-8a25-eba76cf708c9.json b/docs/doc/94337949-c250-4aa2-8a25-eba76cf708c9.json new file mode 100644 index 000000000..6aa0fcc91 --- /dev/null +++ b/docs/doc/94337949-c250-4aa2-8a25-eba76cf708c9.json @@ -0,0 +1,20 @@ +{ + "summary": "The code adds a YOWOMetric class to the PaddleVideo framework for measuring YOWO metrics in two stages: saving test results and calculating metrics from saved results files. The code also handles batch processing, logging progress, and evaluates mAP metrics.", + "details": [ + { + "comment": "This code defines a YOWOMetric class within the PaddleVideo framework. The class measures metrics for YOWO in two stages: first, it saves test results using a trained model, and then calculates metrics from the saved results file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/yowo_metric.py\":0-29", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport os\nfrom paddlevideo.utils import get_logger\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom .ucf24_utils import get_mAP\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass YOWOMetric(BaseMetric):\n \"\"\"\n Metrics for YOWO. Two Stages in this metric:\n (1) Get test results using trained model, results will be saved in YOWOMetric.result_path;\n (2) Calculate metrics using results file from stage (1)." + }, + { + "comment": "The code initializes an instance of a BMN metrics class with specified parameters. It checks if the result path exists and creates it if not, then updates the metric by writing detection results to corresponding files in the result path for each batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/yowo_metric.py\":30-61", + "content": " \"\"\"\n def __init__(self,\n data_size,\n batch_size,\n gt_folder,\n result_path,\n threshold=0.5,\n save_path=None,\n log_interval=1):\n \"\"\"\n Init for BMN metrics.\n Params:\n gtfolder:groundtruth folder path for ucf24\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)\n self.result_path = result_path\n self.gt_folder = gt_folder\n self.threshold = threshold\n self.save_path = save_path\n if not osp.isdir(self.result_path):\n os.makedirs(self.result_path)\n def update(self, batch_id, data, outputs):\n frame_idx = outputs['frame_idx']\n boxes = outputs[\"boxes\"]\n for j in range(len(frame_idx)):\n detection_path = osp.join(self.result_path, frame_idx[j])\n with open(detection_path, 'w+') as f_detect:\n for box in boxes[j]:\n x1 = round(float(box[0] - box[2] / 2.0) * 320.0)" + }, + { + "comment": "This code snippet is part of the PaddleVideo library. It calculates and writes yolo v5 box information into a file, handling batch processing and logging progress with an interval. 
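The per-box conversion written to each detection file follows a center-format-to-corner formula; a standalone restatement with a worked example (the 320x240 frame-size constants are the ones used in the code):

def center_box_to_corners(box, img_w=320.0, img_h=240.0):
    # box holds normalised [cx, cy, w, h, det_conf, ...]
    x1 = round(float(box[0] - box[2] / 2.0) * img_w)
    y1 = round(float(box[1] - box[3] / 2.0) * img_h)
    x2 = round(float(box[0] + box[2] / 2.0) * img_w)
    y2 = round(float(box[1] + box[3] / 2.0) * img_h)
    return x1, y1, x2, y2

center_box_to_corners([0.5, 0.5, 0.25, 0.5, 0.9])  # (120, 60, 200, 180)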
The accumulate function collects mAP metrics for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/yowo_metric.py\":62-81", + "content": " y1 = round(float(box[1] - box[3] / 2.0) * 240.0)\n x2 = round(float(box[0] + box[2] / 2.0) * 320.0)\n y2 = round(float(box[1] + box[3] / 2.0) * 240.0)\n det_conf = float(box[4])\n for j in range((len(box) - 5) // 2):\n cls_conf = float(box[5 + 2 * j].item())\n prob = det_conf * cls_conf\n f_detect.write(\n str(int(box[6]) + 1) + ' ' + str(prob) + ' ' + str(x1) + ' ' + str(y1) + ' ' + str(\n x2) + ' ' + str(y2) + '\\n')\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n metric_list = get_mAP(self.gt_folder, self.result_path, self.threshold, self.save_path)\n for info in metric_list:\n logger.info(info)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/945ca542-cb79-4595-ab34-f63406fa5b18.json b/docs/doc/945ca542-cb79-4595-ab34-f63406fa5b18.json new file mode 100644 index 000000000..d9a1b152a --- /dev/null +++ b/docs/doc/945ca542-cb79-4595-ab34-f63406fa5b18.json @@ -0,0 +1,10 @@ +{ + "summary": "This code sets up the logging configuration based on a provided JSON file. If the file is found, it modifies the filename paths in the configuration according to the save_dir and then uses logging.config.dictConfig() to configure the logging system. If the file is not found, it uses basicConfig() with default level to set up logging. The function returns the filename for the \"info_file_handler\".", + "details": [ + { + "comment": "This code sets up the logging configuration based on a provided JSON file. If the file is found, it modifies the filename paths in the configuration according to the save_dir and then uses logging.config.dictConfig() to configure the logging system. If the file is not found, it uses basicConfig() with default level to set up logging. 
The function returns the filename for the \"info_file_handler\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/logger.py\":0-24", + "content": "import os\nimport logging\nimport logging.config\nfrom pathlib import Path\nfrom utils import read_json\ndef setup_logging(save_dir, log_config='logger/logger_config.json',\n default_level=logging.INFO):\n \"\"\"Setup logging configuration.\"\"\"\n print(os.getcwd())\n log_config = Path(log_config)\n print(f\"log config: {log_config} exists: {log_config.exists()}\")\n if log_config.is_file():\n config = read_json(log_config)\n # modify logging paths based on run config\n for _, handler in config['handlers'].items():\n if 'filename' in handler:\n handler['filename'] = str(save_dir / handler['filename'])\n logging.config.dictConfig(config)\n else:\n print(f\"Warning: logging configuration file is not found in {log_config}.\")\n logging.basicConfig(level=default_level)\n return config[\"handlers\"][\"info_file_handler\"][\"filename\"]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/94a1a1ca-0f94-45cf-ad4f-6f8a8d7d67d2.json b/docs/doc/94a1a1ca-0f94-45cf-ad4f-6f8a8d7d67d2.json new file mode 100644 index 000000000..3e782a306 --- /dev/null +++ b/docs/doc/94a1a1ca-0f94-45cf-ad4f-6f8a8d7d67d2.json @@ -0,0 +1,15 @@ +{ + "summary": "This code imports diverse loss functions from different modules, such as CrossEntropyLoss, BMNLoss, and TransNetV2Loss for PaddleVideo's video recognition, segmentation tasks, providing a comprehensive list of usable losses. The PaddleVideo model uses BaseWeightedLoss, ASRFLoss, DistillationCELoss, and DistillationDMLLoss for audio-visual speech recognition, distillation-based learning, and region-specific loss computation.", + "details": [ + { + "comment": "This code imports different types of loss functions from various modules, such as CrossEntropyLoss, BMNLoss, and TransNetV2Loss. These losses are used in PaddleVideo for various applications like video recognition, segmentation, and more. The code provides a comprehensive list of loss functions that can be used depending on the specific task.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/__init__.py\":0-25", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .base import BaseWeightedLoss\nfrom .bmn_loss import BMNLoss\nfrom .cross_entropy_loss import CrossEntropyLoss\nfrom .depth_loss import ADDSLoss\nfrom .transnetv2_loss import TransNetV2Loss\nfrom .actbert_loss import ActBertLoss\nfrom .asrf_loss import ASRFLoss\nfrom .distillation_loss import DistillationCELoss, DistillationDMLLoss\nfrom .yowo_loss import RegionLoss\n__all__ = [\n 'CrossEntropyLoss', 'BMNLoss', 'TransNetV2Loss', 'ActBertLoss', 'ADDSLoss'," + }, + { + "comment": "The code defines a list of loss functions used in the PaddleVideo model. 
These losses include BaseWeightedLoss, ASRFLoss, DistillationCELoss, and DistillationDMLLoss for various tasks like audio-visual speech recognition, distillation-based learning, and region-specific loss computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/__init__.py\":26-28", + "content": " 'BaseWeightedLoss', 'ASRFLoss', 'DistillationCELoss', 'DistillationDMLLoss',\n 'RegionLoss'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9540c56a-4dbe-4ffb-abf9-efb046cf75de.json b/docs/doc/9540c56a-4dbe-4ffb-abf9-efb046cf75de.json new file mode 100644 index 000000000..9a57ebcc8 --- /dev/null +++ b/docs/doc/9540c56a-4dbe-4ffb-abf9-efb046cf75de.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a PaddlePaddle class \"ppTimeSformerHead\" as a head for the TimeSformer model, extending BaseHead and initializing fully connected layers with truncated normal distribution.", + "details": [ + { + "comment": "This code defines a class called \"ppTimeSformerHead\" which is a head for the TimeSformer model in PaddlePaddle framework. It extends the BaseHead class, and has attributes such as num_classes, in_channels. The class also registers itself in the HEADS registry of the PaddleVideo module. The code uses Linear and ParamAttr from paddle.nn and weight_init from .base, and imports trunc_normal_ and L2Decay from other modules.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptimesformer_head.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom paddle.nn import Linear\nfrom ..registry import HEADS\nfrom ..weight_init import trunc_normal_, weight_init_\nfrom .base import BaseHead\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n@HEADS.register()\nclass ppTimeSformerHead(BaseHead):\n \"\"\"TimeSformerHead Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature." + }, + { + "comment": "The code defines a class named \"PPTimesformerHead\" with an __init__ method that takes parameters such as num_classes, in_channels, loss_cfg (with default value), std (with default 0.02), and optional kwargs. It initializes superclass attributes, sets self.std, and initializes the FC layer parameters using weight_init_. The TruncatedNormal initialization method is used with specific attribute names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptimesformer_head.py\":30-57", + "content": " loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss').\n std(float): Std(Scale) value in normal initilizar. 
Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n std=0.02,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.std = std\n self.fc = Linear(self.in_channels,\n self.num_classes,\n bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'TruncatedNormal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.0,\n std=self.std)\n # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal" + }, + { + "comment": "The code defines a head for the PPTimesformer model. It initializes the fully connected layer (fc) with truncated normal distribution and defines the forward pass, which involves passing input through fc to generate scores for classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/pptimesformer_head.py\":58-73", + "content": " trunc_normal_(self.fc.weight, std=self.std)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # XXX: check dropout location!\n # x.shape = [N, embed_dim]\n score = self.fc(x)\n # [N, num_class]\n # x = F.softmax(x) # NOTE remove\n return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9540f655-8d4a-4d20-b072-692b7b466706.json b/docs/doc/9540f655-8d4a-4d20-b072-692b7b466706.json new file mode 100644 index 000000000..217345615 --- /dev/null +++ b/docs/doc/9540f655-8d4a-4d20-b072-692b7b466706.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines a Softmax class with an Inplace_Run method that applies softmax function in-place to iterator ranges of float vectors. The code also includes a virtual function for postprocessing operations in the PaddleVideo library.", + "details": [ + { + "comment": "This code defines a class Softmax that contains a method Inplace_Run. The method takes an iterator range of a vector of floats and applies softmax function in-place to the values within this range.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/postprocess_op.h\":0-38", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \"include/utility.h\"\nnamespace PaddleVideo\n{\n class Softmax\n {\n public:\n virtual void Inplace_Run(const std::vector::iterator &_begin, const std::vector::iterator &_end);" + }, + { + "comment": "This code defines a virtual function that takes in two iterators to a vector of floats and returns a vector of floats as output. 
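The iterator-range softmax described above converts a slice of raw logits into probabilities without copying the buffer. As a rough illustration only, here is a minimal NumPy sketch of the same in-place computation; `softmax_inplace` and its arguments are hypothetical names, not part of the PaddleVideo C++ API.

```python
import numpy as np

def softmax_inplace(buf: np.ndarray, begin: int, end: int) -> None:
    """Rewrite the logits in buf[begin:end] as probabilities, in place."""
    view = buf[begin:end]
    view -= view.max()        # subtract the max for numerical stability
    np.exp(view, out=view)    # exponentiate in place
    view /= view.sum()        # normalize so the slice sums to 1

# Example: one sample's logits stored in a flat output buffer.
logits = np.array([1.0, 2.0, 0.5, -1.0], dtype=np.float32)
softmax_inplace(logits, 0, len(logits))
print(logits, logits.sum())   # probabilities that sum to ~1.0
```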
It is part of the PaddleVideo library's postprocessing operation namespace.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/postprocess_op.h\":39-42", + "content": " virtual std::vector Run(const std::vector::iterator &_begin, const std::vector::iterator &_end);\n };\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/95a099dd-1734-422a-b059-32270567b137.json b/docs/doc/95a099dd-1734-422a-b059-32270567b137.json new file mode 100644 index 000000000..db14ba1c7 --- /dev/null +++ b/docs/doc/95a099dd-1734-422a-b059-32270567b137.json @@ -0,0 +1,35 @@ +{ + "summary": "The 2s-AGCN model, an enhanced ST-GCN version for motion recognition, utilizes dual-flow adaptive convolutional networks and focuses on second-order bone data. Code offers test scripts, accuracy results, and download links for models trained on different datasets, with PaddleVideo exporting an action recognition model using AGCN2s.", + "details": [ + { + "comment": "This code provides an introduction to the 2s-AGCN model, an improved version of ST-GCN published in CVPR2019. It uses a dual-flow adaptive convolutional network and focuses on the second-order information of bone data for motion recognition.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn2s.md\":0-19", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/2sAGCN.md) | English\n# CTR-GCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\n![\u6a21\u578b\u7ed3\u6784\u56fe](../../../images/agcn2s.png)\n[2s-AGCN](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf) is an improved article on ST-GCN published in CVPR2019. It proposes a dual-flow adaptive convolutional network, which improves the shortcomings of the original ST-GCN. In the existing GCN based approach, the topology of the graph is set manually and fixed to all layers and input samples. In addition, the second-order information of bone data (bone length and orientation) is naturally more beneficial and discriminating for motion recognition, which was rarely studied in the methods at that time. Therefore, this paper puts forward a node and bones of tw" + }, + { + "comment": "This code provides information on the AGCN2S model, a skeleton-based gesture recognition network. It uses data from NTU-RGBD, with details of its preparation found in another file. The code also outlines how to train the CTR-GCN model on various configurations such as cross-subject and cross-view training scenarios, using bone or joint data. This serves as a guide for running the model's training scripts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn2s.md\":19-39", + "content": "o kinds of information fusion based on skeleton shuangliu network, and join in figure convolution adjacency matrix adaptive matrix, a sharp rise in the bones of gesture recognition accuracy, also has laid the foundation for subsequent work (the subsequent basic skeleton gesture recognition are based on the flow of network framework).\n## Data\nData download and processing are consistent with CTR-GCN. 
For details, please refer to [NTU-RGBD Data Preparation](../../dataset/ntu-rgbd.md)\n## Train\n### Train on NTU-RGBD\nTrain CTR-GCN on NTU-RGBD scripts using single gpu\uff1a\n```bash\n# train cross subject with bone data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_bone.yaml --seed 1\n# train cross subject with joint data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_joint.yaml --seed 1\n# train cross view with bone data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_bone.yaml --seed 1\n# train cross view with joint data\npython main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_joint.yaml --seed 1" + }, + { + "comment": "The code provides test scripts for the 2s-AGCN model on the NTU-RGB+D dataset, both with cross-subject and cross-view splits. The accuracy results for joint and bone data are given, along with a download link to the training log.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn2s.md\":40-70", + "content": "```\nconfig file `agcn2s_ntucs_joint.yaml` corresponding to the config of 2s-AGCN on NTU-RGB+D dataset with cross-subject splits.\n## Test\n### Test on NTU-RGB+D\nTest scripts\uff1a\n```bash\n# test cross subject with bone data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_bone.yaml -w data/2SAGCN_ntucs_bone.pdparams\n# test cross subject with joint data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_joint.yaml -w data/2SAGCN_ntucs_joint.pdparams\n# test cross view with bone data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_bone.yaml -w data/2SAGCN_ntucv_bone.pdparams\n# test cross view with joint data\npython main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_joint.yaml -w data/2SAGCN_ntucv_joint.pdparams\n```\n* Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| | CS | CV |\n| :------------: | :---: | :----: |\n| Js-AGCN(joint) | 85.8% | 94.13% |\n| Bs-AGCN(bone) | 86.7% | 93.9% |\nTrain log\uff1a[download](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/work_dir/ntu)" + }, + { + "comment": "Code snippet contains download links for different checkpoints of the AGCN-2s model trained on various datasets:\n1. ntu_cs_agcn_joint\n2. ntu_cs_agcn_bone\n3. ntu_cv_agcn_joint\n4. ntu_cv_agcn_bone", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn2s.md\":72-78", + "content": "VisualDL log\uff1a[download](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/runs)\ncheckpoints\uff1a\n| CS-Js | CS-Bs | CV-Js | CV-Bs |\n| :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| [ntu_cs_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_joint-48-30674.pdparams) | [ntu_cs_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_bone-44-28170.pdparams) | [ntu_cv_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_joint-38-22932.pdparams) | [ntu_cv_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_bone-49-29400.pdparams) |" + }, + { + "comment": "This code is exporting and inferring a model for action recognition using PaddleVideo's AGCN2s. 
It uses the `export_model.py` script to generate an inference model archive, which includes the model architecture file (AGCN2s_ntucs_joint.pdmodel) and parameters file (AGCN2s_ntucs_joint.pdiparams). The `predict.py` script is then used to perform inference on input data with the specified configuration and model files, using GPU if available and disabling TensorRT.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn2s.md\":80-102", + "content": "## Inference\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/agcn2s/2sagcn_ntucs_joint.yaml \\\n -p data/AGCN2s_ntucs_joint.pdparams \\\n -o inference/AGCN2s_ntucs_joint\n```\nTo get model architecture file `AGCN2s_ntucs_joint.pdmodel` and parameters file `AGCN2s_ntucs_joint.pdiparams`.\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \\\n --config configs/recognition/agcn2s/2sagcn_ntucs_joint.yaml \\\n --model_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdmodel \\\n --params_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False" + }, + { + "comment": "This code block shows the prediction engine result for the AGCN2S model. It displays an image of the prediction results and references the original paper on Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/agcn2s.md\":103-111", + "content": "```\n### infer result\n![\u9884\u6d4b\u5f15\u64ce\u63a8\u7406\u7ed3\u679c\u56fe](../../../images/agcn2s_result.png)\n## Reference\n- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf), Lei Shi and Yifan Zhang and Jian Cheng and Hanqing Lu" + } + ] +} \ No newline at end of file diff --git a/docs/doc/95a0b7ca-1040-4328-868c-0d127a7bbaca.json b/docs/doc/95a0b7ca-1040-4328-868c-0d127a7bbaca.json new file mode 100644 index 000000000..5493d4185 --- /dev/null +++ b/docs/doc/95a0b7ca-1040-4328-868c-0d127a7bbaca.json @@ -0,0 +1,35 @@ +{ + "summary": "This code defines a PyTorch class for batch normalization, initializing a BatchNorm3D layer and including methods to compute mean and standard deviation. It also supports aggregating statistics from multiple splits and performs forward pass for training or evaluation.", + "details": [ + { + "comment": "This code defines a function `get_norm` that returns the normalization layer based on the provided bn_norm_type and bn_num_splits. If bn_norm_type is 'batchnorm', it returns paddle.nn.BatchNorm3D, otherwise if it's 'sub_batchnorm', it returns a partially applied SubBatchNorm3D function with num_splits parameter set to bn_num_splits. If the norm type isn't supported, it raises a NotImplementedError. It also defines `aggregate_sub_bn_stats` function that recursively finds all SubBN modules in the given model and aggregates sub-BN stats by calling aggregate_stats() on each found SubBatchNorm3D module. 
It returns the count of SubBN modules found.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py\":0-35", + "content": "from functools import partial\nimport paddle\ndef get_norm(bn_norm_type, bn_num_splits):\n \"\"\"\n Args:\n cfg (CfgNode): model building configs, details are in the comments of\n the config file.\n Returns:\n nn.Layer: the normalization layer.\n \"\"\"\n if bn_norm_type == \"batchnorm\":\n return paddle.nn.BatchNorm3D\n elif bn_norm_type == \"sub_batchnorm\":\n return partial(SubBatchNorm3D, num_splits=bn_num_splits)\n else:\n raise NotImplementedError(\n \"Norm type {} is not supported\".format(bn_norm_type))\ndef aggregate_sub_bn_stats(model):\n \"\"\"\n Recursively find all SubBN modules and aggregate sub-BN stats.\n Args:\n model (nn.Layer): model to be aggregate sub-BN stats\n Returns:\n count (int): number of SubBN module found.\n \"\"\"\n count = 0\n for child in model.children():\n if isinstance(child, SubBatchNorm3D):\n child.aggregate_stats()\n count += 1\n else:\n count += aggregate_sub_bn_stats(child)" + }, + { + "comment": "The code defines a SubBatchNorm3D class that implements Batch Normalization with the option to split the batch dimension into N splits. It computes stats for each subset of examples independently during training and aggregates them during evaluation. The class takes num_splits as an argument and other parameters such as num_features, weight_attr, and bias_attr are set in its constructor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py\":36-63", + "content": " return count\nclass SubBatchNorm3D(paddle.nn.Layer):\n \"\"\"\n Implement based on paddle2.0.\n The standard BN layer computes stats across all examples in a GPU. In some\n cases it is desirable to compute stats across only a subset of examples\n SubBatchNorm3D splits the batch dimension into N splits, and run BN on\n each of them separately (so that the stats are computed on each subset of\n examples (1/N of batch) independently. During evaluation, it aggregates\n the stats from all splits into one BN.\n \"\"\"\n def __init__(self, num_splits, **args):\n \"\"\"\n Args:\n num_splits (int): number of splits.\n args (list): list of args\n \"\"\"\n super(SubBatchNorm3D, self).__init__()\n self.num_splits = num_splits\n self.num_features = args[\"num_features\"]\n self.weight_attr = args[\"weight_attr\"]\n self.bias_attr = args[\"bias_attr\"]\n # Keep only one set of weight and bias (outside).\n if self.weight_attr == False:\n self.weight = self.create_parameter(" + }, + { + "comment": "This code initializes the weight and bias parameters of a BatchNorm layer in PaddlePaddle. 
If learning rate is 0, it sets weight to have no gradient update, and if `bias_attr` is False, it sets the bias to True and stops its gradients from being updated.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py\":64-84", + "content": " attr=None,\n shape=[self.num_features],\n default_initializer=paddle.nn.initializer.Constant(1.0))\n self.weight.stop_gradient = True\n else:\n self.weight = self.create_parameter(\n attr=self.weight_attr,\n shape=[self.num_features],\n default_initializer=paddle.nn.initializer.Constant(1.0))\n self.weight.stop_gradient = self.weight_attr is not None \\\n and self.weight_attr.learning_rate == 0.\n if self.bias_attr == False:\n self.bias = self.create_parameter(attr=None,\n shape=[self.num_features],\n is_bias=True)\n self.bias.stop_gradient = True\n else:\n self.bias = self.create_parameter(attr=self.bias_attr,\n shape=[self.num_features],\n is_bias=True)" + }, + { + "comment": "Class is initializing a BatchNorm3D layer and storing two instances of it (self.bn and self.split_bn). The first instance has its weights and bias set as fixed (inner), while the second instance handles splitting the features for a specified number of splits. The function _get_aggregated_mean_std calculates the aggregated mean and standard deviation by summing each set's means and stds, then dividing them by the total count to get the average values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py\":85-107", + "content": " self.bias.stop_gradient = self.bias_attr is not None \\\n and self.bias_attr.learning_rate == 0.\n # set weights and bias fixed (inner).\n args[\"weight_attr\"] = False\n args[\"bias_attr\"] = False\n self.bn = paddle.nn.BatchNorm3D(**args)\n # update number of features used in split_bn\n args[\"num_features\"] = self.num_features * self.num_splits\n self.split_bn = paddle.nn.BatchNorm3D(**args)\n def _get_aggregated_mean_std(self, means, stds, n):\n \"\"\"\n Calculate the aggregated mean and stds.\n Use the method of update mean and std when merge multi-part data.\n Args:\n means (tensor): mean values.\n stds (tensor): standard deviations.\n n (int): number of sets of means and stds.\n \"\"\"\n mean = paddle.sum(paddle.reshape(means, (n, -1)), axis=0) / n\n std = (paddle.sum(paddle.reshape(stds, (n, -1)), axis=0) / n +\n paddle.sum(paddle.reshape(" + }, + { + "comment": "This code is defining a class that implements batch normalization in PyTorch. 
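The statistic merging quoted above (a PaddlePaddle layer, per the source) averages the per-split means and adds the spread of those means to the averaged variances. The following NumPy-only sketch, assuming equal-sized splits and hypothetical names, shows that this recovers the statistics of the concatenated data:

```python
import numpy as np

def aggregate_mean_var(split_means: np.ndarray, split_vars: np.ndarray):
    """Merge per-split BN statistics of shape (num_splits, num_features)."""
    n = split_means.shape[0]
    mean = split_means.mean(axis=0)
    # average split variance + variance of the split means
    var = split_vars.mean(axis=0) + ((split_means - mean) ** 2).sum(axis=0) / n
    return mean, var

# Tiny check against statistics computed over the concatenated data.
rng = np.random.default_rng(0)
parts = [rng.normal(loc=i, size=(100, 3)) for i in range(4)]
split_means = np.stack([p.mean(axis=0) for p in parts])
split_vars = np.stack([p.var(axis=0) for p in parts])
mean, var = aggregate_mean_var(split_means, split_vars)
full = np.concatenate(parts)
print(np.allclose(mean, full.mean(axis=0)), np.allclose(var, full.var(axis=0)))
```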
The class has methods to compute the mean and standard deviation, aggregate statistics from multiple splits of batch normalization, and perform forward pass for training or evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py\":108-134", + "content": " paddle.pow((paddle.reshape(means, (n, -1)) - mean), 2),\n (n, -1)),\n axis=0) / n)\n return mean, std\n def aggregate_stats(self):\n \"\"\"\n Synchronize running_mean, and running_var to self.bn.\n Call this before eval, then call model.eval();\n When eval, forward function will call self.bn instead of self.split_bn,\n During this time the running_mean, and running_var of self.bn has been obtained from\n self.split_bn.\n \"\"\"\n if self.split_bn.training:\n bn_mean_tensor, bn_variance_tensor = self._get_aggregated_mean_std(\n self.split_bn._mean,\n self.split_bn._variance,\n self.num_splits,\n )\n self.bn._mean.set_value(bn_mean_tensor)\n self.bn._variance.set_value(bn_variance_tensor)\n def forward(self, x):\n if self.training:\n n, c, t, h, w = x.shape\n x = paddle.reshape(\n x, (n // self.num_splits, c * self.num_splits, t, h, w))" + }, + { + "comment": "The code applies batch normalization to the input tensor and multiplies it by a weight matrix. Then, it adds a bias vector and returns the normalized tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py\":135-141", + "content": " x = self.split_bn(x)\n x = paddle.reshape(x, (n, c, t, h, w))\n else:\n x = self.bn(x)\n x = paddle.multiply(x, paddle.reshape(self.weight, (-1, 1, 1, 1)))\n x = paddle.add(x, paddle.reshape(self.bias, (-1, 1, 1, 1)))\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/96d6083f-7eee-4521-879b-900fb8576159.json b/docs/doc/96d6083f-7eee-4521-879b-900fb8576159.json new file mode 100644 index 000000000..01cf49d48 --- /dev/null +++ b/docs/doc/96d6083f-7eee-4521-879b-900fb8576159.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports all functions and classes from the logger and log_parser modules in the T2VLAD application of PaddleVideo.", + "details": [ + { + "comment": "This code imports all functions and classes from the logger and log_parser modules in the T2VLAD application of PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/__init__.py\":0-1", + "content": "from .logger import *\nfrom .log_parser import *" + } + ] +} \ No newline at end of file diff --git a/docs/doc/97ee9329-376e-4b29-bc36-82c6df1c716a.json b/docs/doc/97ee9329-376e-4b29-bc36-82c6df1c716a.json new file mode 100644 index 000000000..a01bc1ff7 --- /dev/null +++ b/docs/doc/97ee9329-376e-4b29-bc36-82c6df1c716a.json @@ -0,0 +1,10 @@ +{ + "summary": "The code outlines six different tasks in action recognition and detection using various algorithms, including TSN (Two-Stream Convolutional Networks), TSM (Temporal Shift Module), SlowFast Networks, LSTM (Long Short-Term Memory), and BNM (Boundary-aware Multi-scale Network). 
The tasks include single-class action recognition, multi-class action recognition, action localization, spatio-temporal action detection, 3000-class tagging application, and highlights detection application.", + "details": [ + { + "comment": "The code outlines six different tasks in action recognition and detection using various algorithms, including TSN (Two-Stream Convolutional Networks), TSM (Temporal Shift Module), SlowFast Networks, LSTM (Long Short-Term Memory), and BNM (Boundary-aware Multi-scale Network). The tasks include single-class action recognition, multi-class action recognition, action localization, spatio-temporal action detection, 3000-class tagging application, and highlights detection application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/demos\":0-7", + "content": "some useful demo todo. \n1\u3001single-class action recognition\uff0c tsn/tsm/slowfast \n2\u3001multi-class action recognition\uff0clstm \n3\u3001action localization\uff0cbmn \n4\u3001spatio temporal action detection\uff0ctodo \n5\u30013000-class tagging application(videotag)\uff1atsn+lstm \n6\u3001Highlights detection application\uff1abmn+tsn+lstm " + } + ] +} \ No newline at end of file diff --git a/docs/doc/9899d8bb-c2af-4b2f-977e-15fab28c8797.json b/docs/doc/9899d8bb-c2af-4b2f-977e-15fab28c8797.json new file mode 100644 index 000000000..7b38d7e40 --- /dev/null +++ b/docs/doc/9899d8bb-c2af-4b2f-977e-15fab28c8797.json @@ -0,0 +1,15 @@ +{ + "summary": "The ModelAudio class extracts audio features using wav_to_example and slices the data into parts, calculating features for each part. The predict method appends these features to a list and returns the audio feature list after dividing by sample rate.", + "details": [ + { + "comment": "The code defines a ModelAudio class which takes in audio-related configurations and performs audio feature extraction using the feature_extractor module's wav_to_example function. The class also predicts audio by converting PCM data to numpy array and handles audio file reading exceptions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/model_config.py\":0-41", + "content": "\"\"\"\naudio model config\n\"\"\"\nimport numpy as np\nimport mfcc.feature_extractor as feature_extractor\nclass ModelAudio(object):\n \"\"\"\n modelAudio\n \"\"\"\n def __init__(self, configs, use_gpu=1):\n self.use_gpu = use_gpu\n self.audio_fps = configs.COMMON.fps\n self.audio_feat_scale = configs.TSN.audio_scale\n self.sample_rate = 16000\n def predict_slice(self, wav_data, sample_rate):\n \"\"\"\n audio predict\n \"\"\"\n examples_batch = feature_extractor.wav_to_example(\n wav_data, sample_rate)[0]\n return examples_batch\n def predict_audio(self, audio_file):\n \"\"\"\n predict_audio\n \"\"\"\n audio_feature_list = []\n # read pcm\n sample_rate = self.sample_rate\n try:\n with open(audio_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n audio_status = \"audio load success\"\n except Exception as e:\n audio_data = []\n audio_status = \"audio load failed\"" + }, + { + "comment": "The code slices the audio data into parts of size 'step' and calculates features for each part using a predict method, then appends the features to a list. The length of the entire audio data is divided by the sample rate to determine how many steps can fit in it. 
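The windowing just described cuts the decoded PCM stream into one-second chunks before feature extraction. A minimal sketch of that slicing, with a placeholder featurizer standing in for the MFCC extractor (which is not shown here), might look like this; all names are illustrative:

```python
import numpy as np

SAMPLE_RATE = 16000   # 16 kHz PCM, as assumed in the description
STEP_SECONDS = 1

def slice_pcm(audio_data: np.ndarray, extract_fn):
    """Cut int16 PCM samples into 1-second windows and featurize each one."""
    features = []
    num_seconds = len(audio_data) // SAMPLE_RATE
    for i in range(0, num_seconds, STEP_SECONDS):
        window = audio_data[i * SAMPLE_RATE:(i + STEP_SECONDS) * SAMPLE_RATE]
        features.append(extract_fn(window))
    return features

fake_audio = np.zeros(SAMPLE_RATE * 5, dtype=np.int16)   # 5 seconds of silence
feats = slice_pcm(fake_audio, extract_fn=lambda w: w.astype(np.float32).mean())
print(len(feats))   # -> 5 windows
```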
This function returns the audio feature list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/mfcc/model_config.py\":42-50", + "content": " step = 1\n len_video = int(len(audio_data) / sample_rate)\n print(len_video)\n for i in range(0, len_video, step):\n audio_data_part = audio_data[i * sample_rate:(i + step) *\n sample_rate]\n feature_audio = self.predict_slice(audio_data_part, sample_rate)\n audio_feature_list.append(feature_audio)\n return audio_feature_list" + } + ] +} \ No newline at end of file diff --git a/docs/doc/98ccb6a1-3f49-4753-a871-0e0f550fa97f.json b/docs/doc/98ccb6a1-3f49-4753-a871-0e0f550fa97f.json new file mode 100644 index 000000000..d6f7ca89c --- /dev/null +++ b/docs/doc/98ccb6a1-3f49-4753-a871-0e0f550fa97f.json @@ -0,0 +1,30 @@ +{ + "summary": "The `log_summary` function gathers performance stats, identifies seeds, searches metrics, and calculates scores for epochs. If evaluation mode is \"fixed_num_epochs,\" it logs the fixed training length, then calculates mean and standard deviation for each metric in aggregated scores using numpy functions.", + "details": [ + { + "comment": "The function `log_summary` extracts performance statistics from log files and takes arguments such as a logger reference, log file path, evaluation mode (test run, fixed number of epochs or geometric mean), and optional fixed number of epochs. The log is read, and the performance statistics are extracted based on the given evaluation mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/log_parser.py\":0-23", + "content": "import re\nimport scipy.stats\nimport logging\nimport numpy as np\nfrom collections import defaultdict\ndef log_summary(logger, log_path, eval_mode=\"test_run\", fixed_num_epochs=None):\n \"\"\"Extract performace statistics from experiment log files.\n Args:\n logger (logger): reference to primary logging instance\n log_path (Path): the path to the log file\n eval_mode (str): the method use to collect the statistics. Can be one of:\n `test_run`, `fixed_num_epochs` or `geometric_mean`\n NOTE: The `eval_mode` argument differs by dataset: for datasets which provide a\n validation set, we use validation set performance to complete a single test run. For\n datasets where no validation set is available, we aim to match prior work by either\n fixing the number of training epochs, or selecting directly from validation set\n performance (Details can be found in the supplementary material of the paper.)\n \"\"\"\n with open(str(log_path), \"r\") as f:\n log = f.read().splitlines()" + }, + { + "comment": "This code is parsing a log file, identifying the random seed used for each part of the log. It searches for specific metrics and extracts information related to \"R1\", \"R5\", \"R10\", \"R50\", \"MedR\", and \"MeanR\" in two modes: \"t2v\" and \"v2t\". 
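The seed-tag search described for the log parser is a plain regex over each log line. A small sketch of that pattern follows; the sample log line is invented for illustration and only the tag text comes from the source:

```python
import re

seed_tag = "Setting experiment random seed to"

# A made-up log line with the shape the parser expects.
row = "2021-05-01 12:00:00 Setting experiment random seed to 42"

match = re.search(seed_tag + r" (\d+)$", row)
if match:
    current_seed = match.groups()[0]
    print(f"current seed: {current_seed}")   # -> current seed: 42
```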
It also differentiates between evaluation modes like \"test_run\" and \"val\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/log_parser.py\":25-55", + "content": " # keep track of the random seed used for the part of the logfile being processed\n current_seed = None\n # Regex tag for finding the seed\n seed_tag = \"Setting experiment random seed to\"\n if eval_mode == \"test_run\":\n subset = \"test\"\n else:\n subset = \"val\"\n for mode in \"t2v\", \"v2t\":\n logger.info(\"\")\n logger.info(\"----------------------------------------------------\")\n logger.info(f\"[{mode}] loaded log file with {len(log)} lines....\")\n logger.info(\"----------------------------------------------------\")\n # Search for the following metrics\n scores = {\n \"R1\": defaultdict(list),\n \"R5\": defaultdict(list),\n \"R10\": defaultdict(list),\n \"R50\": defaultdict(list),\n \"MedR\": defaultdict(list),\n \"MeanR\": defaultdict(list),\n }\n for row in log:\n if seed_tag in row:\n # Search for the log file entry describing the current random seed\n match = re.search(seed_tag + \" (\\d+)$\", row) # NOQA" + }, + { + "comment": "This code is parsing log data, extracting relevant metrics and scores for a specific seed. It asserts that the log matches the expected format and then populates a dictionary of scores for each seed. If the log contains a specific tag, it extracts the corresponding value and adds it to the appropriate score list. Finally, it defines an empty dictionary for aggregation and raises a NotImplementedError if evaluating in geometric mean mode as it needs to be fixed for new log format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/log_parser.py\":56-77", + "content": " assert len(match.groups()) == 1, \"expected a single regex match\"\n current_seed = match.groups()[0]\n if f\"{subset}_{mode}_metrics\" in row:\n tokens = row.split(\" \")\n for key in scores:\n tag = f\"{subset}_{mode}_metrics_{key}:\"\n if tag in tokens:\n pos = tokens.index(tag) + 1\n val = tokens[pos]\n val = float(val)\n assert current_seed is not None, \"failed to determine the seed\"\n scores[key][current_seed].append(val)\n agg_scores = {\"R1\": [], \"R5\": [], \"R10\": [], \"R50\": [], \"MedR\": [], \"MeanR\": []}\n # compute the best performance for a single epoch (i.e. sharing the same model\n # to compute all stats)\n geometric_stats = defaultdict(list)\n best_epochs = {}\n if eval_mode == \"geometric_mean\":\n raise NotImplementedError(\"Need to fix this for new log format\")" + }, + { + "comment": "Code calculates scores for different seeds and metrics, then selects the best epochs based on geometric means. 
It then determines the final score statistic for each metric depending on the eval_mode, and appends it to agg_scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/log_parser.py\":78-98", + "content": " consider = [\"R1\", \"R5\", \"R10\"]\n seeds = list(scores[\"R1\"].keys())\n for seed in seeds:\n for metric, subdict in scores.items():\n if metric in consider:\n geometric_stats[seed].append(subdict[seed])\n gms_raw = np.array(geometric_stats[seed])\n geo_means = scipy.stats.mstats.gmean(gms_raw, axis=0)\n best_epochs[seed] = np.argmax(geo_means)\n for metric, subdict in scores.items():\n for seed, values in subdict.items():\n if eval_mode == \"test_run\":\n stat = values[0]\n elif eval_mode == \"fixed_num_epochs\":\n stat = values[fixed_num_epochs - 1]\n elif \"LSMDC\" in log_path and eval_mode == \"geometric_mean\":\n stat = values[best_epochs[seed]]\n else:\n raise ValueError(f\"unrecognised eval_mode: {eval_mode}\")\n agg_scores[metric].append(stat)" + }, + { + "comment": "This code snippet checks if the evaluation mode is set to \"fixed_num_epochs\". If so, it logs a message indicating the fixed training length. Then, for each metric in the aggregated scores, it calculates the mean and standard deviation using numpy's `np.mean()` and `np.std()`, respectively, and logs the values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/logger/log_parser.py\":100-103", + "content": " if eval_mode == \"fixed_num_epochs\":\n logger.info(f\"Reporting stats with fixed training length: {fixed_num_epochs}\")\n for metric, values in agg_scores.items():\n logger.info(f\"{metric}: {np.mean(values):.1f}, {np.std(values, ddof=1):.1f}\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/990248c3-d36d-4951-ba5c-f6108aeb20bc.json b/docs/doc/990248c3-d36d-4951-ba5c-f6108aeb20bc.json new file mode 100644 index 000000000..88fa479cf --- /dev/null +++ b/docs/doc/990248c3-d36d-4951-ba5c-f6108aeb20bc.json @@ -0,0 +1,15 @@ +{ + "summary": "The code imports modules, registers them in the registry, and exports a list of model-building modules with functions to build these models and specific classes like ResNet and TSNHead.", + "details": [ + { + "comment": "This code is importing various modules from different sub-directories and registers them in the registry. It also includes a license notice and a function for weight initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py\":0-23", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .backbones import ResNet\nfrom .builder import (build_backbone, build_head, build_recognizer,\n build_localizer, build_loss)\nfrom .heads import BaseHead, TSNHead, TSMRecHead\nfrom .losses import SmoothL1Loss, L1Loss\nfrom .framework.recognizers import BaseRecognizer, recognizer2d\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS\nfrom .weight_init import weight_init_" + }, + { + "comment": "This code exports a list of modules for model building, including backbones, heads, recognizers, localizers, and losses. It also includes functions to build these models and specific model classes like ResNet and TSNHead.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py\":25-44", + "content": "__all__ = [\n 'BACKBONES',\n 'HEADS',\n 'RECOGNIZERS',\n 'LOCALIZERS',\n 'LOSSES',\n 'build_recognizer',\n 'build_localizer',\n 'build_head',\n 'build_backbone',\n 'build_loss',\n 'ResNet',\n 'TSNHead',\n 'BaseHead',\n 'TSMRecHead',\n 'BaseRecognizer',\n 'Recognizer2d',\n 'SmoothL1Loss',\n 'L1Loss',\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9a349970-8717-43c2-b78c-b04b5474c921.json b/docs/doc/9a349970-8717-43c2-b78c-b04b5474c921.json new file mode 100644 index 000000000..c7b9bfdc4 --- /dev/null +++ b/docs/doc/9a349970-8717-43c2-b78c-b04b5474c921.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet is importing the \"test_model\" function from the \"test.py\" module in the same directory and adding it to the __all__ list. The text at the beginning of the file contains license information and copyright notice.", + "details": [ + { + "comment": "This code snippet is importing the \"test_model\" function from the \"test.py\" module in the same directory and adding it to the __all__ list. The text at the beginning of the file contains license information and copyright notice.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py\":0-18", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .test import test_model\n__all__ = [\n 'test_model',\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9a7eb176-dea7-494e-87cd-4eff0b41bece.json b/docs/doc/9a7eb176-dea7-494e-87cd-4eff0b41bece.json new file mode 100644 index 000000000..0868bb748 --- /dev/null +++ b/docs/doc/9a7eb176-dea7-494e-87cd-4eff0b41bece.json @@ -0,0 +1,40 @@ +{ + "summary": "The code imports libraries, defines functions for parsing arguments and log files, checks for names, reads a log file, stores results in \"parser_results\", loads ground truth from multiple files, and compares log results with ground truth for testing.", + "details": [ + { + "comment": "This code imports necessary libraries and defines functions for parsing command-line arguments, running shell commands, and retrieving results from log files. It uses ArgumentParser to handle command line arguments, subprocess to execute shell commands, and os to check file existence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py\":0-39", + "content": "import numpy as np\nimport os\nimport subprocess\nimport json\nimport argparse\nimport glob\ndef init_args():\n parser = argparse.ArgumentParser()\n # params for testing assert allclose\n parser.add_argument(\"--atol\", type=float, default=1e-3)\n parser.add_argument(\"--rtol\", type=float, default=1e-3)\n parser.add_argument(\"--gt_file\", type=str, default=\"\")\n parser.add_argument(\"--log_file\", type=str, default=\"\")\n parser.add_argument(\"--precision\", type=str, default=\"fp32\")\n return parser\ndef parse_args():\n parser = init_args()\n return parser.parse_args()\ndef run_shell_command(cmd):\n p = subprocess.Popen(cmd,\n stdout=subprocess.PIPE,\n stderr=subprocess.PIPE,\n shell=True)\n out, err = p.communicate()\n if p.returncode == 0:\n return out.decode('utf-8')\n else:\n return None\ndef parser_results_from_log_by_name(log_path, names_list):\n if not os.path.exists(log_path):\n raise ValueError(\"The log file {} does not exists!\".format(log_path))" + }, + { + "comment": "This code checks if there are any names in the \"names_list\" and reads a log file at the specified \"log_path\". If the file contains \"python_infer\", it parses the python inference results, while for other log files, it parses C++ inference results. It stores the results in the \"parser_results\" dictionary with names as keys and corresponding values as either integers or floats.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py\":41-63", + "content": " if names_list is None or len(names_list) < 1:\n return []\n parser_results = {}\n lines = open(log_path, 'r').read().splitlines()\n if 'python_infer' in log_path: # parse python inference\n for line in lines:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n for name in names_list:\n if name in line:\n if '.' 
in split_items[-1]:\n parser_results[name] = float(split_items[-1])\n else:\n parser_results[name] = int(split_items[-1])\n else: # parse cpp inference\n for line in lines:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n if all([(name + ':') in split_items for name in names_list]):\n # print(split_items)" + }, + { + "comment": "This code defines a function `load_gt_from_file` that reads and parses the contents of a log file. It first checks if the file exists, then opens it in read mode. For each line containing 'top-1 class' or 'top-1 score', it extracts the class and score values, storing them as key-value pairs in `parser_gt`. If the file is not found, it raises a ValueError with an error message. The code also handles dictionaries with string keys, allowing for easy integration into larger programs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py\":64-88", + "content": " parser_results['class'] = int(split_items[2])\n parser_results['score'] = float(split_items[-1])\n return parser_results\ndef load_gt_from_file(gt_file):\n if not os.path.exists(gt_file):\n raise ValueError(\"The log file {} does not exists!\".format(gt_file))\n with open(gt_file, 'r') as f:\n data = f.readlines()\n f.close()\n parser_gt = {}\n for line in data:\n if 'top-1 class' in line:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n parser_gt['top-1 class'] = int(split_items[-1])\n elif 'top-1 score' in line:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n parser_gt['top-1 score'] = float(split_items[-1])\n elif \"score\" in line and 'segment' in line:\n location_dict = eval(line)" + }, + { + "comment": "The code defines three functions:\n1. `load_gt_from_file` loads ground truth data from a file, handling both the cases when each line contains location details or class and score information.\n2. `collect_predict_from_logs` collects predict results from logs based on given key list.\n3. 
`load_gt_from_txts` loads ground truth collections from multiple files (fp32, fp16, int8), organizing them under corresponding keys in a dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py\":89-117", + "content": " parser_gt[f\"score_{len(parser_gt)}\"] = location_dict['score']\n parser_gt[f\"segment_{len(parser_gt)}\"] = location_dict['segment']\n elif \"class:\" in line and \"score:\" in line:\n split_items = line.replace('\\t', ' ')\n split_items = split_items.split(' ')\n split_items = [item for item in split_items if len(item) > 0]\n parser_gt['class'] = int(split_items[2])\n parser_gt['score'] = float(split_items[-1])\n return parser_gt\ndef load_gt_from_txts(gt_file):\n gt_list = glob.glob(gt_file)\n gt_collection = {}\n for gt_f in gt_list:\n gt_dict = load_gt_from_file(gt_f)\n basename = os.path.basename(gt_f)\n if \"fp32\" in basename:\n gt_collection[\"fp32\"] = [gt_dict, gt_f]\n elif \"fp16\" in basename:\n gt_collection[\"fp16\"] = [gt_dict, gt_f]\n elif \"int8\" in basename:\n gt_collection[\"int8\"] = [gt_dict, gt_f]\n else:\n continue\n return gt_collection\ndef collect_predict_from_logs(log_path, key_list):" + }, + { + "comment": "The code reads logs from specified file paths and compares the results with ground truth data for testing purposes. It uses numpy's assert_allclose function to validate the accuracy of the predicted results against the ground truth. The usage example provides command line options to compare Python and C++ inferencing results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py\":118-145", + "content": " log_list = glob.glob(log_path)\n pred_collection = {}\n for log_f in log_list:\n pred_dict = parser_results_from_log_by_name(log_f, key_list)\n key = os.path.basename(log_f)\n pred_collection[key] = pred_dict\n return pred_collection\ndef testing_assert_allclose(dict_x, dict_y, atol=1e-7, rtol=1e-7):\n for k in dict_x:\n np.testing.assert_allclose(np.array(dict_x[k]),\n np.array(dict_y[k]),\n atol=atol,\n rtol=rtol)\nif __name__ == \"__main__\":\n # Usage example:\n # test python infer:\n ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM/*.txt --log_file=./test_tipc/output/PP-TSM/python_infer_*.log\n # test cpp infer:\n ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM_CPP/*.txt --log_file=./test_tipc/output/PP-TSM_CPP/cpp_infer_*.log\n args = parse_args()\n gt_collection = load_gt_from_txts(args.gt_file)" + }, + { + "comment": "Iterates through the log files, compares \"fp32\", \"fp16\" and \"int8\" results with ground truth, uses testing_assert_allclose to check for consistency and prints success/failure messages.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py\":146-169", + "content": " key_list = gt_collection[\"fp32\"][0].keys()\n pred_collection = collect_predict_from_logs(args.log_file, key_list)\n for filename in pred_collection.keys():\n if \"fp32\" in filename:\n gt_dict, gt_filename = gt_collection[\"fp32\"]\n elif \"fp16\" in filename:\n gt_dict, gt_filename = gt_collection[\"fp16\"]\n elif \"int8\" in filename:\n gt_dict, gt_filename = gt_collection[\"int8\"]\n else:\n continue\n pred_dict = pred_collection[filename]\n try:\n testing_assert_allclose(gt_dict,\n pred_dict,\n atol=args.atol,\n rtol=args.rtol)\n print(\n \"Assert allclose passed! 
The results of {} and {} are consistent!\"\n .format(filename, gt_filename))\n except Exception as E:\n print(E)\n raise ValueError(\n \"The results of {} and the results of {} are inconsistent!\"." + }, + { + "comment": "This line of code formats the filename and ground truth filename for comparison purposes in the context of image or video analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/compare_results.py\":170-170", + "content": " format(filename, gt_filename))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9aea14bf-d2ff-4d11-99d2-f8c6e9e4ece3.json b/docs/doc/9aea14bf-d2ff-4d11-99d2-f8c6e9e4ece3.json new file mode 100644 index 000000000..f39237054 --- /dev/null +++ b/docs/doc/9aea14bf-d2ff-4d11-99d2-f8c6e9e4ece3.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is a Python module with the license and copyright information. It imports a class named PaddleVideo from the tools package, and defines its availability as part of the __all__ list.", + "details": [ + { + "comment": "This code is a Python module with the license and copyright information. It imports a class named PaddleVideo from the tools package, and defines its availability as part of the __all__ list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/__init__.py\":0-15", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n__all__ = ['PaddleVideo']\nfrom .tools import PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9aee4f8d-0e16-441a-8b8e-242b67a6e02a.json b/docs/doc/9aee4f8d-0e16-441a-8b8e-242b67a6e02a.json new file mode 100644 index 000000000..c957fdc8b --- /dev/null +++ b/docs/doc/9aee4f8d-0e16-441a-8b8e-242b67a6e02a.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is the initialization file for PaddleVideo's tasks module. It imports functions from various task-specific modules and adds them to `__all__` for export. The code also includes a license notice, copyright information, and a disclaimer.", + "details": [ + { + "comment": "This code is the initialization file for PaddleVideo's tasks module. It imports functions from various task-specific modules and adds them to `__all__` for export. The code also includes a license notice, copyright information, and a disclaimer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/__init__.py\":0-19", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .train import train_model\nfrom .test import test_model\nfrom .train_dali import train_dali\nfrom .train_multigrid import train_model_multigrid\n__all__ = ['train_model', 'test_model', 'train_dali', 'train_model_multigrid']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9b615ae9-82b6-4320-aff8-c1adac9d33d2.json b/docs/doc/9b615ae9-82b6-4320-aff8-c1adac9d33d2.json new file mode 100644 index 000000000..f76b07166 --- /dev/null +++ b/docs/doc/9b615ae9-82b6-4320-aff8-c1adac9d33d2.json @@ -0,0 +1,65 @@ +{ + "summary": "The code processes video frames, performs inference using an AI model and measures processing times. It preprocesses data in batches and utilizes TensorRT with GPU optimizations and MKLDNN support for efficiency.", + "details": [ + { + "comment": "The code initializes variables and performs batch size operations. It copies the batch of frames and resizes it to accommodate for multiple segments per frame batch. The times vector will store execution time values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":0-25", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \nnamespace PaddleVideo\n{\n void VideoRecognizer::Run(const std::vector &frames_batch_path, const std::vector > &frames_batch, std::vector *times)\n {\n // Copy parameters to the function\n int real_batch_num = frames_batch.size();\n std::vector srcframes(real_batch_num * this->num_seg, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)" + }, + { + "comment": "This code preprocesses video frames for model inference. It copies frames from source to destination, resizes them using a scale operation, and performs center cropping. The number of views is set to 1 if the model name is \"ppTSM\". 
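For the single-view ppTSM path described above, each frame is cropped, normalized, and laid out channel-first before batching. The NumPy sketch below mirrors those steps under assumed ImageNet mean/std values; it is illustrative only and not the C++ preprocessing ops themselves:

```python
import numpy as np

MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)   # assumed stats
STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

def center_crop(frame: np.ndarray, size: int = 224) -> np.ndarray:
    h, w, _ = frame.shape
    top, left = (h - size) // 2, (w - size) // 2
    return frame[top:top + size, left:left + size]

def preprocess(frame: np.ndarray) -> np.ndarray:
    """uint8 HWC frame -> float32 CHW array ready to be batched."""
    crop = center_crop(frame).astype(np.float32) / 255.0   # scale to [0, 1]
    crop = (crop - MEAN) / STD                              # per-channel normalize
    return crop.transpose(2, 0, 1)                          # HWC -> CHW

frame = np.random.randint(0, 256, size=(256, 256, 3), dtype=np.uint8)
print(preprocess(frame).shape)   # -> (3, 224, 224)
```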
The preprocessing steps include scaling and centering cropping to ensure the frames are properly formatted for inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":26-54", + "content": " {\n for (int j = 0; j < this->num_seg; ++j)\n {\n frames_batch[i][j].copyTo(srcframes[i * this->num_seg + j]);\n }\n }\n auto preprocess_start = std::chrono::steady_clock::now();\n /* Preprocess */\n std::vector resize_frames;\n std::vector crop_frames;\n std::vector input;\n int num_views = 1;\n if (this->inference_model_name == \"ppTSM\")\n {\n num_views = 1;\n // 1. Scale\n resize_frames = std::vector(real_batch_num * this->num_seg, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->scale_op_.Run(srcframes[i * this->num_seg + j], resize_frames[i * this->num_seg + j], this->use_tensorrt_, 256);\n }\n }\n // 2. CenterCrop\n crop_frames = std::vector(real_batch_num * num_views * this->num_seg, cv::Mat());" + }, + { + "comment": "This code performs image preprocessing and conversion on video frames before feeding them into a neural network. It first resizes, centers, and crops the video frames using `centercrop_op_.Run()`. Then it normalizes the frames in an in-place operation using `normalize_op_.Run()`, with the mean and scale values provided. Finally, it converts the normalized frames into a single array using the dimensions from the first frame, and stores them in the 'input' vector.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":55-79", + "content": " for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->centercrop_op_.Run(resize_frames[i * this->num_seg + j], crop_frames[i * this->num_seg + j], this->use_tensorrt_, 224);\n }\n }\n // 3. Normalization(inplace operation)\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->normalize_op_.Run(&crop_frames[i * num_views * this->num_seg + j * num_views + k], this->mean_, this->scale_, this->is_scale_);\n }\n }\n }\n // 4. Image2Array\n int rh = crop_frames[0].rows;\n int rw = crop_frames[0].cols;\n int rc = crop_frames[0].channels();\n input = std::vector(real_batch_num * num_views * this->num_seg * crop_frames[0].rows * crop_frames[0].cols * rc, 0.0f);" + }, + { + "comment": "The code is iterating over real_batch_num number of batches and num_seg segments within each batch. For each segment, it's applying a set of operations (permute, scale, TenCrop) to a series of frames. These operations are used for data preprocessing before inputting into an inference model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":80-104", + "content": " for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->permute_op_.Run(&crop_frames[i * num_views * this->num_seg + j * num_views + k], input.data() + (i * num_views * this->num_seg + j * num_views + k) * (rh * rw * rc));\n }\n }\n }\n }\n else if(this->inference_model_name == \"ppTSN\")\n {\n num_views = 10;\n // 1. 
Scale\n resize_frames = std::vector(real_batch_num * this->num_seg, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->scale_op_.Run(srcframes[i * this->num_seg + j], resize_frames[i * this->num_seg + j], this->use_tensorrt_, 256);\n }\n }\n // 2. TenCrop" + }, + { + "comment": "This code performs image preprocessing for video frames. It initializes a vector of crop_frames, iterates through real_batch_num and num_seg to run resizing and cropping operations on each frame using tencrop_op_. Next, it applies normalization inplace operation on each frame using normalize_op_. Finally, it converts the processed frames into an array by extracting rows and columns size from the first crop_frame.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":105-128", + "content": " crop_frames = std::vector(real_batch_num * this->num_seg * num_views, cv::Mat());\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n this->tencrop_op_.Run(resize_frames[i * this->num_seg + j], crop_frames, (i * this->num_seg + j) * num_views, this->use_tensorrt_, 224);\n }\n }\n // 3. Normalization(inplace operation)\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->normalize_op_.Run(&crop_frames[i * this->num_seg * num_views + j * num_views + k], this->mean_, this->scale_, this->is_scale_);\n }\n }\n }\n // 4. Image2Array\n int rh = crop_frames[0].rows;\n int rw = crop_frames[0].cols;" + }, + { + "comment": "The code initializes a vector with zeros based on the number of frames, segments, views, and channels. It then iterates over the real batch number, segments, and views to permute and populate the input vector. 
Finally, it performs inference by reshaping the input tensor for prediction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":129-151", + "content": " int rc = crop_frames[0].channels();\n input = std::vector(real_batch_num * this->num_seg * num_views * crop_frames[0].rows * crop_frames[0].cols * rc, 0.0f);\n for (int i = 0; i < real_batch_num; ++i)\n {\n for (int j = 0; j < this->num_seg; ++j)\n {\n for (int k = 0; k < num_views; ++k)\n {\n this->permute_op_.Run(&crop_frames[i * this->num_seg * num_views + j * num_views + k], input.data() + (i * this->num_seg * num_views + j * num_views + k) * (rh * rw * rc));\n }\n }\n }\n }\n else\n {\n throw \"[Error] Not implemented yet\";\n }\n auto preprocess_end = std::chrono::steady_clock::now();\n /* Inference */\n auto input_names = this->predictor_->GetInputNames();\n auto input_t = this->predictor_->GetInputHandle(input_names[0]);\n input_t->Reshape({real_batch_num * num_views * this->num_seg, 3, crop_frames[0].rows, crop_frames[0].cols});" + }, + { + "comment": "This code segment performs inference using an AI model, gathers the output probabilities, and applies softmax operation to convert logits into probabilities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":152-174", + "content": " auto inference_start = std::chrono::steady_clock::now();\n input_t->CopyFromCpu(input.data());\n this->predictor_->Run(); // Use the inference library to predict\n std::vector predict_batch;\n auto output_names = this->predictor_->GetOutputNames();\n auto output_t = this->predictor_->GetOutputHandle(output_names[0]);\n auto predict_shape = output_t->shape();\n // Get the number of class\n int class_num = predict_shape[1];\n int out_numel = std::accumulate(predict_shape.begin(), predict_shape.end(), 1, std::multiplies());\n predict_batch.resize(out_numel); // NxC\n output_t->CopyToCpu(predict_batch.data()); // Copy the model output to predict_batch\n // Convert output (logits) into probabilities\n for (int i = 0; i < real_batch_num; ++i)\n {\n this->softmax_op_.Inplace_Run(predict_batch.begin() + i * class_num, predict_batch.begin() + (i + 1) * class_num);\n }\n auto inference_end = std::chrono::steady_clock::now();" + }, + { + "comment": "This code snippet is responsible for post-processing the results of object detection after model inference. 
It calculates the class and score for each frame, outputs it, and stores the processing times.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":176-197", + "content": " // output decode\n auto postprocess_start = std::chrono::steady_clock::now();\n std::vector str_res;\n std::vectorscores;\n for (int i = 0; i < real_batch_num; ++i)\n {\n int argmax_idx = int(Utility::argmax(predict_batch.begin() + i * class_num, predict_batch.begin() + (i + 1) * class_num));\n float score = predict_batch[argmax_idx];\n scores.push_back(score);\n str_res.push_back(this->label_list_[argmax_idx]);\n }\n auto postprocess_end = std::chrono::steady_clock::now();\n for (int i = 0; i < str_res.size(); i++)\n {\n std::cout << frames_batch_path[i] << \"\\tclass: \" << str_res[i] << \"\\tscore: \" << scores[i] << endl;\n }\n std::chrono::duration preprocess_diff = preprocess_end - preprocess_start;\n times->push_back(double(preprocess_diff.count() * 1000));\n std::chrono::duration inference_diff = inference_end - inference_start;\n times->push_back(double(inference_diff.count() * 1000));" + }, + { + "comment": "This code initializes a Paddle Video recognizer by loading the model from a given directory. It also sets up GPU and TensorRT configurations if needed, and specifies precision based on the provided string value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":198-222", + "content": " std::chrono::duration postprocess_diff = postprocess_end - postprocess_start;\n times->push_back(double(postprocess_diff.count() * 1000));\n }\n void VideoRecognizer::LoadModel(const std::string &model_dir)\n {\n // AnalysisConfig config;\n paddle_infer::Config config;\n config.SetModel(model_dir + \"/\" + this->inference_model_name + \".pdmodel\",\n model_dir + \"/\" + this->inference_model_name + \".pdiparams\");\n if (this->use_gpu_)\n {\n config.EnableUseGpu(this->gpu_mem_, this->gpu_id_);\n if (this->use_tensorrt_)\n {\n auto precision = paddle_infer::Config::Precision::kFloat32;\n if (this->precision_ == \"fp16\")\n {\n precision = paddle_infer::Config::Precision::kHalf;\n }\n else if (this->precision_ == \"int8\")\n {\n precision = paddle_infer::Config::Precision::kInt8;\n }" + }, + { + "comment": "This code checks the inference model name and configures TensorRT engine accordingly for different models like ppTSM, TSM, ppTSN, or TSN. It sets workspace size to a large value, maxBatchSize based on number of segments, minSubgraphSize to 3, precision, and disables useStatic and useCalibMode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":224-246", + "content": " if (this->inference_model_name == \"ppTSM\" || this->inference_model_name == \"TSM\")\n {\n config.EnableTensorRtEngine(\n 1 << 30, // workspaceSize\n this->rec_batch_num * this->num_seg * 1, // maxBatchSize\n 3, // minSubgraphSize\n precision, // precision\n false,// useStatic\n false //useCalibMode\n );\n }\n else if(this->inference_model_name == \"ppTSN\" || this->inference_model_name == \"TSN\")\n {\n config.EnableTensorRtEngine(\n 1 << 30,\n this->rec_batch_num * this->num_seg * 10,\n 3, // minSubgraphSize\n precision,// precision\n false,// useStatic\n false //useCalibMode\n );\n }\n else" + }, + { + "comment": "The code enables the TensorRT engine with specific parameters, such as workspace size, max batch size, minimum subgraph size, and precision. 
It checks if TensorRT is enabled and deactivates it by default for models that do not support dynamic shape. The code also defines input shape ranges (min, opt) for a particular key (\"data_batch_0\").", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":247-270", + "content": " {\n config.EnableTensorRtEngine(\n 1 << 30, // workspaceSize\n this->rec_batch_num, // maxBatchSize\n 3, // minSubgraphSize\n precision,// precision\n false,// useStatic\n false //useCalibMode\n );\n }\n std::cout << \"Enable TensorRT is: \" << config.tensorrt_engine_enabled() << std::endl;\n /* some model dose not suppport dynamic shape with TRT, deactivate it by default */\n // std::map > min_input_shape =\n // {\n // {\"data_batch_0\", {1, this->num_seg, 3, 1, 1}}\n // };\n // std::map > max_input_shape =\n // {\n // {\"data_batch_0\", {1, this->num_seg, 3, 256, 256}}\n // };\n // std::map > opt_input_shape =" + }, + { + "comment": "This code initializes a PaddleVideo predictor with TensorRT configuration options. It sets the GPU usage, enables MKLDNN (if needed), specifies input names and optimizations, disables INFO log messages, and creates the predictor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/video_rec.cpp\":271-303", + "content": " // {\n // {\"data_batch_0\", {this->rec_batch_num, this->num_seg, 3, 224, 224}}\n // };\n // config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,\n // opt_input_shape);\n }\n }\n else\n {\n config.DisableGpu();\n if (this->use_mkldnn_)\n {\n config.EnableMKLDNN();\n // cache 10 different shapes for mkldnn to avoid memory leak\n config.SetMkldnnCacheCapacity(10);\n }\n config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);\n }\n config.SwitchUseFeedFetchOps(false);\n // true for multiple input\n config.SwitchSpecifyInputNames(true);\n config.SwitchIrOptim(true);\n config.EnableMemoryOptim();\n config.DisableGlogInfo();\n this->predictor_ = CreatePredictor(config);\n }\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9b695b9a-1208-48a8-a906-7c88c21cbc10.json b/docs/doc/9b695b9a-1208-48a8-a906-7c88c21cbc10.json new file mode 100644 index 000000000..6d0928642 --- /dev/null +++ b/docs/doc/9b695b9a-1208-48a8-a906-7c88c21cbc10.json @@ -0,0 +1,105 @@ +{ + "summary": "The code measures object detection performance, handles class labels and non-maximum suppression, and calculates true/false positives using an IoU threshold. It is used for AVA evaluation and contains functions to select class-specific data, remove invalid boxes, and filter input arrays.", + "details": [ + { + "comment": "This code file is for evaluating object detection results on a single image. It determines true positives or false positives based on a predefined IOU ratio. Non Maximum Supression and multi-class detection are supported. The evaluation can be performed either on boxes or object masks, depending on the settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":0-19", + "content": "# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"Evaluate Object Detection result on a single image.\nAnnotate each detected result as true positives or false positive according to\na predefined IOU ratio. Non Maximum Supression is used by default. Multi class\ndetection is supported by default. Based on the settings, per image evaluation\nis either performed on boxes or on object masks." + }, + { + "comment": "This code initializes a class for evaluating detection results of a single image. It takes in parameters such as the number of ground truth classes and matching IOU threshold, and computes object detection metrics using detected boxes, scores, class labels, etc.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":20-52", + "content": "\"\"\"\nimport numpy as np\nfrom . import np_box_list, np_box_ops\nclass PerImageEvaluation:\n \"\"\"Evaluate detection result of a single image.\"\"\"\n def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5):\n \"\"\"Initialized PerImageEvaluation by evaluation parameters.\n Args:\n num_groundtruth_classes: Number of ground truth object classes\n matching_iou_threshold: A ratio of area intersection to union,\n which is the threshold to consider whether a detection is true\n positive or not\n \"\"\"\n self.matching_iou_threshold = matching_iou_threshold\n self.num_groundtruth_classes = num_groundtruth_classes\n def compute_object_detection_metrics(\n self,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_class_labels,\n groundtruth_is_difficult_list,\n groundtruth_is_group_of_list,\n detected_masks=None,\n groundtruth_masks=None," + }, + { + "comment": "This function evaluates detections as true positives, false positives or ignored based on the detected and ground truth boxes. It works in two stages: 1) matching all detections to non group-of boxes for true positives, ignoring difficult ones; and 2) ignoring detections matched to group-of boxes. The inputs are numpy arrays of detected boxes, scores, class labels, and ground truth boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":53-72", + "content": " ):\n \"\"\"Evaluates detections as being tp, fp or ignored from a single image.\n The evaluation is done in two stages:\n 1. All detections are matched to non group-of boxes; true positives\n are determined and detections matched to difficult boxes are\n ignored.\n 2. 
Detections that are determined as false positives are matched\n against group-of boxes and ignored if matched.\n Args:\n detected_boxes: A float numpy array of shape [N, 4], representing N\n regions of detected object regions.\n Each row is of the format [y_min, x_min, y_max, x_max]\n detected_scores: A float numpy array of shape [N, 1], representing\n the confidence scores of the detected N object instances.\n detected_class_labels: A integer numpy array of shape [N, 1],\n repreneting the class labels of the detected N object\n instances.\n groundtruth_boxes: A float numpy array of shape [M, 4]," + }, + { + "comment": "This code function accepts multiple parameters including ground truth regions, class labels, difficult instances, group-of tags, optional detected masks and ground truth masks. It returns a list of scores representing K scores detected with object class label c.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":73-90", + "content": " representing M regions of object instances in ground truth\n groundtruth_class_labels: An integer numpy array of shape [M, 1],\n representing M class labels of object instances in ground truth\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or\n not\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag\n detected_masks: (optional) A uint8 numpy array of shape\n [N, height, width]. If not None, the metrics will be computed\n based on masks.\n groundtruth_masks: (optional) A uint8 numpy array of shape\n [M, height, width].\n Returns:\n scores: A list of C float numpy arrays. Each numpy array is of\n shape [K, 1], representing K scores detected with object class\n label c" + }, + { + "comment": "This function is part of the AvaEvaluation class, which evaluates object detection performance in videos. It computes true positive and false positive labels for detected object instances based on ground truth information. The function removes invalid detection boxes before computing the tp_fp_labels. This helps in evaluating the accuracy of detected objects.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":91-114", + "content": " tp_fp_labels: A list of C boolean numpy arrays. Each numpy array\n is of shape [K, 1], representing K True/False positive label of\n object instances detected with class label c\n \"\"\"\n (\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks,\n ) = self._remove_invalid_boxes(\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks,\n )\n scores, tp_fp_labels = self._compute_tp_fp(\n detected_boxes=detected_boxes,\n detected_scores=detected_scores,\n detected_class_labels=detected_class_labels,\n groundtruth_boxes=groundtruth_boxes,\n groundtruth_class_labels=groundtruth_class_labels,\n groundtruth_is_difficult_list=groundtruth_is_difficult_list,\n groundtruth_is_group_of_list=groundtruth_is_group_of_list,\n detected_masks=detected_masks," + }, + { + "comment": "This code calculates true/false positives for object detection in an image across all classes. It takes detected boxes, scores, class labels, ground truth boxes, class labels, and optional masks as input, returning the computed scores and tp_fp_labels. 
The separate function computes tp_fp for a single image given the above inputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":115-142", + "content": " groundtruth_masks=groundtruth_masks,\n )\n return scores, tp_fp_labels\n def _compute_tp_fp(\n self,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_class_labels,\n groundtruth_is_difficult_list,\n groundtruth_is_group_of_list,\n detected_masks=None,\n groundtruth_masks=None,\n ):\n \"\"\"Labels true/false positives of detections of an image across all\n classes.\n Args:\n detected_boxes: A float numpy array of shape [N, 4], representing N\n regions of detected object regions.\n Each row is of the format [y_min, x_min, y_max, x_max]\n detected_scores: A float numpy array of shape [N, 1], representing\n the confidence scores of the detected N object instances.\n detected_class_labels: A integer numpy array of shape [N, 1],\n repreneting the class labels of the detected N object" + }, + { + "comment": "The function takes input parameters like instances, groundtruth_boxes, groundtruth_class_labels, groundtruth_is_difficult_list, groundtruth_is_group_of_list, detected_masks and groundtruth_masks. It returns a list of float numpy arrays representing result scores based on these inputs. The function computes scores considering masks if detected_masks is not None and groundtruth_masks if it's not None.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":143-160", + "content": " instances.\n groundtruth_boxes: A float numpy array of shape [M, 4],\n representing M regions of object instances in ground truth\n groundtruth_class_labels: An integer numpy array of shape [M, 1],\n representing M class labels of object instances in ground truth\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or\n not\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag\n detected_masks: (optional) A np.uint8 numpy array of shape\n [N, height, width]. If not None, the scores will be computed\n based on masks.\n groundtruth_masks: (optional) A np.uint8 numpy array of shape\n [M, height, width].\n Returns:\n result_scores: A list of float numpy arrays. Each numpy array is of" + }, + { + "comment": "This function checks if both detected_masks and groundtruth_masks are not null. It then initializes result_scores and result_tp_fp_labels lists for storing scores and True/False positive labels of object instances respectively. If only one of the masks is available, it raises a ValueError. This function seems to be part of an AVA evaluation process where it deals with class label c, groundtruth_class_labels, detected_masks, and groundtruth_masks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":161-182", + "content": " shape [K, 1], representing K scores detected with object class\n label c\n result_tp_fp_labels: A list of boolean numpy array. 
Each numpy\n array is of shape [K, 1], representing K True/False positive\n label of object instances detected with class label c\n Raises:\n ValueError: If detected masks is not None but groundtruth masks are\n None, or the other way around.\n \"\"\"\n if detected_masks is not None and groundtruth_masks is None:\n raise ValueError(\n 'Detected masks is available but groundtruth masks is not.')\n if detected_masks is None and groundtruth_masks is not None:\n raise ValueError(\n 'Groundtruth masks is available but detected masks is not.')\n result_scores = []\n result_tp_fp_labels = []\n for i in range(self.num_groundtruth_classes):\n groundtruth_is_difficult_list_at_ith_class = (\n groundtruth_is_difficult_list[groundtruth_class_labels == i])" + }, + { + "comment": "The code is extracting per-class arrays for detected and ground truth objects. It separates the data into specific classes, computes true positive and false positive labels using a single class function, and assigns them to their respective variables.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":183-201", + "content": " groundtruth_is_group_of_list_at_ith_class = (\n groundtruth_is_group_of_list[groundtruth_class_labels == i])\n (\n gt_boxes_at_ith_class,\n gt_masks_at_ith_class,\n detected_boxes_at_ith_class,\n detected_scores_at_ith_class,\n detected_masks_at_ith_class,\n ) = self._get_ith_class_arrays(detected_boxes, detected_scores,\n detected_masks,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_masks,\n groundtruth_class_labels, i)\n scores, tp_fp_labels = self._compute_tp_fp_for_single_class(\n detected_boxes=detected_boxes_at_ith_class,\n detected_scores=detected_scores_at_ith_class,\n groundtruth_boxes=gt_boxes_at_ith_class,\n groundtruth_is_difficult_list=(" + }, + { + "comment": "This code is calculating overlapping regions and scores between detected and ground truth boxes. It's taking in arrays of detected box coordinates, classification scores, ground truth box coordinates, and ground truth group indicators. The code then returns the resultant scores and true positive/false positive labels for each image. This seems to be part of an object detection or instance segmentation evaluation metric.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":202-227", + "content": " groundtruth_is_difficult_list_at_ith_class),\n groundtruth_is_group_of_list=(\n groundtruth_is_group_of_list_at_ith_class),\n detected_masks=detected_masks_at_ith_class,\n groundtruth_masks=gt_masks_at_ith_class,\n )\n result_scores.append(scores)\n result_tp_fp_labels.append(tp_fp_labels)\n return result_scores, result_tp_fp_labels\n def _get_overlaps_and_scores_box_mode(\n self,\n detected_boxes,\n detected_scores,\n groundtruth_boxes,\n groundtruth_is_group_of_list,\n ):\n \"\"\"Computes overlaps and scores between detected and groudntruth boxes.\n Args:\n detected_boxes: A numpy array of shape [N, 4] representing detected\n box coordinates\n detected_scores: A 1-d numpy array of length N representing\n classification score\n groundtruth_boxes: A numpy array of shape [M, 4] representing\n ground truth box coordinates" + }, + { + "comment": "Code computes intersection over union (IoU) and intersection over area (IoA) between detected boxlists and ground truth boxlists. It also returns the scores of the detected boxes and the number of non-maximum suppressed detected boxes. 
The ground truth is_group_of_list is used to ignore group-of boxes during calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":228-248", + "content": " groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag. If a\n groundtruth box is group-of box, every detection matching this\n box is ignored.\n Returns:\n iou: A float numpy array of size [num_detected_boxes,\n num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it\n will be None.\n ioa: A float numpy array of size [num_detected_boxes,\n num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will\n be None.\n scores: The score of the detected boxlist.\n num_boxes: Number of non-maximum suppressed detected boxes.\n \"\"\"\n detected_boxlist = np_box_list.BoxList(detected_boxes)\n detected_boxlist.add_field('scores', detected_scores)\n gt_non_group_of_boxlist = np_box_list.BoxList(\n groundtruth_boxes[~groundtruth_is_group_of_list])\n iou = np_box_ops.iou(detected_boxlist.get()," + }, + { + "comment": "This function labels boxes detected with the same class from the same image as true positives or false positives. It takes in the detected boxes, scores, ground truth boxes, and other relevant information to perform this labeling task. The output is determined based on the intersection-over-union (IoU) threshold between detected and ground truth boxes. If a detected box has an IoU greater than 0.5 with any ground truth box in the same class and image, it is considered a true positive (tp). Otherwise, it's considered a false positive (fp). The function also computes the number of detected boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":249-275", + "content": " gt_non_group_of_boxlist.get())\n scores = detected_boxlist.get_field('scores')\n num_boxes = detected_boxlist.num_boxes()\n return iou, None, scores, num_boxes\n def _compute_tp_fp_for_single_class(\n self,\n detected_boxes,\n detected_scores,\n groundtruth_boxes,\n groundtruth_is_difficult_list,\n groundtruth_is_group_of_list,\n detected_masks=None,\n groundtruth_masks=None,\n ):\n \"\"\"Labels boxes detected with the same class from the same image as\n tp/fp.\n Args:\n detected_boxes: A numpy array of shape [N, 4] representing detected\n box coordinates\n detected_scores: A 1-d numpy array of length N representing\n classification score\n groundtruth_boxes: A numpy array of shape [M, 4] representing\n groundtruth box coordinates\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or" + }, + { + "comment": "This function computes true positive (TP) and false positive (FP) labels for detected boxes based on whether they match difficult or group-of ground truth boxes. It returns scores and TP/FP labels, ignoring any detections that match these challenging boxes. Optional mask inputs are also supported to compute scores based on pixel-wise comparisons instead of bounding box overlaps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":276-294", + "content": " not. If a groundtruth box is difficult, every detection\n matching this box is ignored.\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box has group-of tag. 
If a\n groundtruth box is group-of box, every detection matching this\n box is ignored.\n detected_masks: (optional) A uint8 numpy array of shape\n [N, height, width]. If not None, the scores will be computed\n based on masks.\n groundtruth_masks: (optional) A uint8 numpy array of shape\n [M, height, width].\n Returns:\n Two arrays of the same size, containing all boxes that were\n evaluated as being true positives or false positives; if a box\n matched to a difficult box or to a group-of box, it is ignored.\n scores: A numpy array representing the detection scores.\n tp_fp_labels: a boolean numpy array indicating whether a detection" + }, + { + "comment": "This code checks for true positive detections by first obtaining the Intersection over Union (IoU) and scores between detected boxes and ground truth boxes. If there are no detected or ground truth boxes, it returns empty arrays. Then, it initializes variables to keep track of whether a detection is matched to a difficult box or a group-of box. The code proceeds in two stages: first, all detections are matched to non-group-of boxes, determining true positives, and then detections matched to difficult boxes are identified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":295-321", + "content": " is a true positive.\n \"\"\"\n if detected_boxes.size == 0:\n return np.array([], dtype=float), np.array([], dtype=bool)\n (\n iou,\n _,\n scores,\n num_detected_boxes,\n ) = self._get_overlaps_and_scores_box_mode(\n detected_boxes=detected_boxes,\n detected_scores=detected_scores,\n groundtruth_boxes=groundtruth_boxes,\n groundtruth_is_group_of_list=groundtruth_is_group_of_list,\n )\n if groundtruth_boxes.size == 0:\n return scores, np.zeros(num_detected_boxes, dtype=bool)\n tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool)\n is_matched_to_difficult_box = np.zeros(num_detected_boxes, dtype=bool)\n is_matched_to_group_of_box = np.zeros(num_detected_boxes, dtype=bool)\n # The evaluation is done in two stages:\n # 1. All detections are matched to non group-of boxes; true positives\n # are determined and detections matched to difficult boxes are" + }, + { + "comment": "This code performs a TP-FP evaluation for non-group of boxes, ignoring difficult ground truth boxes and false positives matched to group-of boxes. It checks the IOU between detected boxes and ground truth boxes, and assigns labels accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":322-343", + "content": " # ignored.\n # 2. 
Detections that are determined as false positives are matched\n # against group-of boxes and ignored if matched.\n # Tp-fp evaluation for non-group of boxes (if any).\n if iou.shape[1] > 0:\n groundtruth_nongroup_of_is_difficult_list = (\n groundtruth_is_difficult_list[~groundtruth_is_group_of_list])\n max_overlap_gt_ids = np.argmax(iou, axis=1)\n is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool)\n for i in range(num_detected_boxes):\n gt_id = max_overlap_gt_ids[i]\n if iou[i, gt_id] >= self.matching_iou_threshold:\n if not groundtruth_nongroup_of_is_difficult_list[gt_id]:\n if not is_gt_box_detected[gt_id]:\n tp_fp_labels[i] = True\n is_gt_box_detected[gt_id] = True\n else:\n is_matched_to_difficult_box[i] = True\n return (\n scores[~is_matched_to_difficult_box & ~is_matched_to_group_of_box]," + }, + { + "comment": "This function, _get_ith_class_arrays, takes in various numpy arrays of detected and ground truth boxes, masks, and class labels. It then returns the corresponding numpy arrays for a specific class index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":344-370", + "content": " tp_fp_labels[~is_matched_to_difficult_box\n & ~is_matched_to_group_of_box],\n )\n def _get_ith_class_arrays(\n self,\n detected_boxes,\n detected_scores,\n detected_masks,\n detected_class_labels,\n groundtruth_boxes,\n groundtruth_masks,\n groundtruth_class_labels,\n class_index,\n ):\n \"\"\"Returns numpy arrays belonging to class with index `class_index`.\n Args:\n detected_boxes: A numpy array containing detected boxes.\n detected_scores: A numpy array containing detected scores.\n detected_masks: A numpy array containing detected masks.\n detected_class_labels: A numpy array containing detected class\n labels.\n groundtruth_boxes: A numpy array containing groundtruth boxes.\n groundtruth_masks: A numpy array containing groundtruth masks.\n groundtruth_class_labels: A numpy array containing groundtruth\n class labels." + }, + { + "comment": "This function returns ground truth boxes, masks (if provided), detected boxes, scores, and masks for a given class index. It selects the data corresponding to the class index from input arrays and returns them in separate numpy arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":371-391", + "content": " class_index: An integer index.\n Returns:\n gt_boxes_at_ith_class: A numpy array containing groundtruth boxes\n labeled as ith class.\n gt_masks_at_ith_class: A numpy array containing groundtruth masks\n labeled as ith class.\n detected_boxes_at_ith_class: A numpy array containing detected\n boxes corresponding to the ith class.\n detected_scores_at_ith_class: A numpy array containing detected\n scores corresponding to the ith class.\n detected_masks_at_ith_class: A numpy array containing detected\n masks corresponding to the ith class.\n \"\"\"\n selected_groundtruth = groundtruth_class_labels == class_index\n gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth]\n if groundtruth_masks is not None:\n gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth]\n else:\n gt_masks_at_ith_class = None\n selected_detections = detected_class_labels == class_index" + }, + { + "comment": "This code defines two functions: \n1. _get_class_specific_results: Extracts class-specific results from the given data and returns them in a tuple.\n2. 
_remove_invalid_boxes: Removes entries with invalid boxes from the given data. An invalid box is one where xmax < xmin or ymax < ymin.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":392-420", + "content": " detected_boxes_at_ith_class = detected_boxes[selected_detections]\n detected_scores_at_ith_class = detected_scores[selected_detections]\n if detected_masks is not None:\n detected_masks_at_ith_class = detected_masks[selected_detections]\n else:\n detected_masks_at_ith_class = None\n return (\n gt_boxes_at_ith_class,\n gt_masks_at_ith_class,\n detected_boxes_at_ith_class,\n detected_scores_at_ith_class,\n detected_masks_at_ith_class,\n )\n def _remove_invalid_boxes(\n self,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks=None,\n ):\n \"\"\"Removes entries with invalid boxes.\n A box is invalid if either its xmax is smaller than its xmin, or its\n ymax is smaller than its ymin.\n Args:\n detected_boxes: A float numpy array of size [num_boxes, 4]\n containing box coordinates in [ymin, xmin, ymax, xmax] format." + }, + { + "comment": "This function performs a filtering operation on the input arrays (detected_boxes, detected_scores, detected_class_labels, and detected_masks). It keeps only those elements where the first element of the detected_box is less than its fourth element, and the second element of detected_box is less than its third element. The resulting valid indices are used to slice the input arrays into their valid subsets (valid_detected_boxes, valid_detected_scores, valid_detected_class_labels, and valid_detected_masks).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":421-442", + "content": " detected_scores: A float numpy array of size [num_boxes].\n detected_class_labels: A int32 numpy array of size [num_boxes].\n detected_masks: A uint8 numpy array of size\n [num_boxes, height, width].\n Returns:\n valid_detected_boxes: A float numpy array of size\n [num_valid_boxes, 4] containing box coordinates in\n [ymin, xmin, ymax, xmax] format.\n valid_detected_scores: A float numpy array of size\n [num_valid_boxes].\n valid_detected_class_labels: A int32 numpy array of size\n [num_valid_boxes].\n valid_detected_masks: A uint8 numpy array of size\n [num_valid_boxes, height, width].\n \"\"\"\n valid_indices = np.logical_and(\n detected_boxes[:, 0] < detected_boxes[:, 2],\n detected_boxes[:, 1] < detected_boxes[:, 3],\n )\n detected_boxes = detected_boxes[valid_indices]\n detected_scores = detected_scores[valid_indices]" + }, + { + "comment": "Function returns detected bounding boxes, scores, class labels, and masks (if available) for valid indices only.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py\":443-451", + "content": " detected_class_labels = detected_class_labels[valid_indices]\n if detected_masks is not None:\n detected_masks = detected_masks[valid_indices]\n return [\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks,\n ]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9b7cc702-4908-444a-aa8a-f03e223706d9.json b/docs/doc/9b7cc702-4908-444a-aa8a-f03e223706d9.json new file mode 100644 index 000000000..8c0c03a80 --- /dev/null +++ b/docs/doc/9b7cc702-4908-444a-aa8a-f03e223706d9.json @@ -0,0 +1,20 @@ +{ + "summary": "This code constructs an optimizer and learning rate scheduler for 
parameter optimization, adjustable parameters, and applies regularizers to prevent overfitting. It sets weight decay based on name and value from configuration and returns the optimizer with specified parameters.", + "details": [ + { + "comment": "This code builds an optimizer and learning rate scheduler for parameter optimization based on the given configuration file. It allows for different optimizer types (e.g., Momentum) with adjustable parameters like momentum and weight decay.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py\":0-35", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport copy\nimport paddle\ndef build_optimizer(cfg, lr_scheduler, parameter_list=None):\n \"\"\"\n Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration .\n In configuration:\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay: 0.001\n or\n OPTIMIZER:\n name: Momentum\n momentum: 0.9\n weight_decay:" + }, + { + "comment": "This code defines an optimizer function that creates an optimizer based on the provided configuration. It uses an Adam optimizer to optimize a network and applies an L2Decay regularizer to avoid overfitting. The L1Decay regularizer can also be applied. The function takes an optimizer configuration dictionary, learning rate scheduler, and a list of parameters to be optimized as inputs and returns a paddle optimizer object. It checks for none and illegal configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py\":36-67", + "content": " name: \"L1\"\n value: 0.001\n Momentum optimizer will be applied to optimize network and L1Decay regularizer will be applied to avoid overfit.\n OPTIMIZER:\n name: Adam\n weight_decay:\n name: \"L2\"\n value: 0.001\n Adam optimizer will be applied to optimize network and L2Decay regularizer will applied to avoid overfit.\n Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details.\n Args:\n cfg (dict): optimizer configuration.\n lr_schduler: learning rate scheduler.\n parameter_list (list): parameters to be optimized.\n Returns:\n optimizer (paddle.optimizer): paddle optimizer.\n \"\"\"\n cfg_copy = cfg.copy()\n #XXX check none and illegal cfg!!!\n opt_name = cfg_copy.pop('name')\n # deal with weight decay\n if cfg_copy.get('weight_decay'):\n if isinstance(cfg_copy.get('weight_decay'), float) or 'L1' in cfg_copy.get('weight_decay').get('name').upper():" + }, + { + "comment": "This code sets the weight decay based on its name and value from configuration. If 'L2' is in the name, it adds L2 Decay regularizer. Otherwise, it raises a ValueError. 
It then removes learning_rate from config and returns an optimizer with specified parameters and other configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py\":68-78", + "content": " cfg_copy['weight_decay'] = cfg_copy.get('weight_decay').get('value')\n elif 'L2' in cfg_copy.get('weight_decay').get('name').upper():\n cfg_copy['weight_decay'] = paddle.regularizer.L2Decay(cfg_copy.get('weight_decay').get('value'))\n else:\n raise ValueError\n cfg_copy.pop('learning_rate')\n return getattr(paddle.optimizer, opt_name)(lr_scheduler,\n parameters=parameter_list,\n **cfg_copy)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9b86ea07-cecc-4dcb-b37e-5c9ca154d616.json b/docs/doc/9b86ea07-cecc-4dcb-b37e-5c9ca154d616.json new file mode 100644 index 000000000..58ea7ec49 --- /dev/null +++ b/docs/doc/9b86ea07-cecc-4dcb-b37e-5c9ca154d616.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports and registers a reader named \"ATTENTIONLSTMERNIE\" from the FeatureReader class, following alphabetical order. It is part of the PaddleVideo MultimodalVideoTag project, likely for video feature extraction or analysis.", + "details": [ + { + "comment": "This code imports and registers a reader named \"ATTENTIONLSTMERNIE\" from the FeatureReader class, following alphabetical order. It is part of the PaddleVideo MultimodalVideoTag project, likely for video feature extraction or analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py\":0-12", + "content": "#!/usr/bin/env python\n# coding=utf-8\n\"\"\"\nCopyright 2021 Baidu.com, Inc. All Rights Reserved\nDescription: \nAuthors: wanghewei(wanghewei@baidu.com)\nLastEditors: wanghewei(wanghewei@baidu.com)\nDate: 2021-11-26 16:31:59\n\"\"\"\nfrom .reader_utils import regist_reader, get_reader\nfrom .feature_reader import FeatureReader\n# regist reader, sort by alphabet\nregist_reader(\"ATTENTIONLSTMERNIE\", FeatureReader)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9b8b9be5-e57b-45ac-af36-0ea1debb288c.json b/docs/doc/9b8b9be5-e57b-45ac-af36-0ea1debb288c.json new file mode 100644 index 000000000..8b645b054 --- /dev/null +++ b/docs/doc/9b8b9be5-e57b-45ac-af36-0ea1debb288c.json @@ -0,0 +1,65 @@ +{ + "summary": "The code imports modules, transfers model parameters, adjusts positional embeddings, and provides save/load functions for Resnet18, VisionTransformer (TimeSformer), SwinTransformer3D models using PaddlePaddle library.", + "details": [ + { + "comment": "This code is from the PaddleVideo library and it imports necessary modules, defines a function for transferring pre-trained Swin model parameters, and deletes the classifier's weights from state_dicts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport os.path as osp\nimport time\nimport paddle\nimport paddle.nn.functional as F\nfrom paddlevideo.utils import get_logger, main_only\nfrom tqdm import tqdm\nimport numpy as np\nfrom scipy import ndimage\ndef pretrain_swin_param_trans(model, state_dicts):\n # delete classifier's params\n if 'head.fc' + '.weight' in state_dicts:\n del state_dicts['head.fc' + '.weight']\n if 'head.fc' + '.bias' in state_dicts:" + }, + { + "comment": "This code checks if the loaded state dictionaries match the model's state dictionaries and handles any inconsistencies. It removes unnecessary keys, adjusts certain weights, and bicubically interpolates relative position bias tables if they don't match to ensure proper loading of 2D or 3D weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":30-60", + "content": " del state_dicts['head.fc' + '.bias']\n state_dicts = {\n k.replace('backbone.', ''): v\n for k, v in state_dicts.items()\n }\n if len(state_dicts) == len(model.state_dict()):\n print(\"Load 3D weights\")\n return state_dicts\n print(\"Load 2D weights\")\n relative_position_index_keys = [\n k for k in state_dicts.keys() if \"relative_position_index\" in k\n ]\n for k in relative_position_index_keys:\n del state_dicts[k]\n # delete attn_mask since we always re-init it\n attn_mask_keys = [k for k in state_dicts.keys() if \"attn_mask\" in k]\n for k in attn_mask_keys:\n del state_dicts[k]\n state_dicts['patch_embed.proj.weight'] = state_dicts[\n 'patch_embed.proj.weight'].unsqueeze(2).tile(\n [1, 1, model.patch_size[0], 1, 1]) / model.patch_size[0]\n # bicubic interpolate relative_position_bias_table if not match\n relative_position_bias_table_keys = [\n k for k in state_dicts.keys() if \"relative_position_bias_table\" in k\n ]" + }, + { + "comment": "Loading weights for relative position bias tables from pretrained and current model state dictionaries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":61-81", + "content": " total_len = len(relative_position_bias_table_keys)\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for key in tqdm(relative_position_bias_table_keys,\n total=total_len,\n position=0):\n relative_position_bias_table_pretrained = state_dicts[key]\n relative_position_bias_table_current = model.state_dict()[key]\n L1, nH1 = relative_position_bias_table_pretrained.shape\n L2, nH2 = relative_position_bias_table_current.shape\n L2 = (2 * model.window_size[1] - 1) * (2 * model.window_size[2] - 1)\n wd = model.window_size[0]\n if nH1 != nH2:\n desc.set_description(f\"Error in loading {key}, skip\")\n else:\n if L1 != L2:\n S1 = int(L1**0.5)\n relative_position_bias_table_pretrained_resized = paddle.nn.functional.interpolate(\n relative_position_bias_table_pretrained.transpose(" + }, + { + "comment": "Function is loading pre-trained model parameters, resizing a 
table, and setting the description.\nThe code is performing model parameter transformation for ViT models, deleting unnecessary weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":82-104", + "content": " [1, 0]).reshape([1, nH1, S1, S1]),\n size=(2 * model.window_size[1] - 1,\n 2 * model.window_size[2] - 1),\n mode='bicubic')\n relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.reshape(\n [nH2, L2]).transpose([1, 0])\n desc.set_description(f\"Loading {key}\")\n state_dicts[key] = relative_position_bias_table_pretrained.tile(\n [2 * wd - 1, 1])\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return state_dicts\ndef pretrain_vit_param_trans(model, state_dicts, num_patches, num_seg,\n attention_type):\n \"\"\"\n Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model\n \"\"\"\n if 'head' + '.weight' in state_dicts:\n del state_dicts['head' + '.weight']" + }, + { + "comment": "This code block checks the shape of the 'pos_embed' tensor and adjusts it based on the number of patches provided. It resizes the tensor using Paddle's ndimage.zoom function and then reconstructs the updated positional embedding for the model. This is necessary when the number of patches changes, ensuring the positional embeddings are consistent with the new patch count.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":105-125", + "content": " if 'head' + '.bias' in state_dicts:\n del state_dicts['head' + '.bias']\n total_len = len(model.state_dict())\n if num_patches + 1 != state_dicts['pos_embed'].shape[1]: # when\n pos_embed = state_dicts['pos_embed']\n cls_pos_embed = paddle.to_tensor(\n pos_embed[0, 0, :]).unsqueeze(0).unsqueeze(1)\n other_pos_embed = paddle.to_tensor(pos_embed[0, 1:, :])\n gs_new = int(np.sqrt(num_patches))\n gs_old = int(np.sqrt(other_pos_embed.shape[0]))\n zoom = (gs_new / gs_old, gs_new / gs_old, 1)\n other_pos_embed = paddle.reshape(other_pos_embed, [gs_old, gs_old, -1])\n other_pos_embed = ndimage.zoom(other_pos_embed, zoom, order=1)\n other_pos_embed = paddle.to_tensor(other_pos_embed)\n new_pos_embed = paddle.reshape(other_pos_embed, [1, num_patches, -1])\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1)\n state_dicts['pos_embed'] = new_pos_embed\n time.sleep(0.01)\n if 'time_embed' in state_dicts and num_seg != state_dicts[" + }, + { + "comment": "This code block is part of a larger program that loads pre-trained model weights. It first checks if the shape of 'time_embed' matches a specific condition, and if not, it performs some transformations on it. Afterwards, it starts a progress bar with the description \"Loading weights\" to show the progress of loading these weights. 
If the attention type is 'divided_space_time', it makes a copy of state_dicts and iterates over its keys, replacing 'attn' keys with 'temporal_attn' if not already present.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":126-146", + "content": " 'time_embed'].shape[1]:\n time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0)\n new_time_embed = F.interpolate(time_embed,\n size=(time_embed.shape[-2], num_seg),\n mode='nearest')\n state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose(\n (0, 2, 1))\n time.sleep(0.01)\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n if attention_type == 'divided_space_time':\n new_state_dicts = state_dicts.copy()\n for key in tqdm(state_dicts):\n if 'blocks' in key and 'attn' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('attn', 'temporal_attn')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]\n else:" + }, + { + "comment": "This code appears to be related to model weight loading and adaptation for a pre-trained ResNet18 in a specific context. It modifies the state_dicts of certain keys, like replacing 'norm1' with 'temporal_norm1', possibly to adapt the weights to fit the new model structure. The code also checks if a certain key exists and copies it if not, ensuring the new model has all necessary parameters. Finally, it updates the description for the loading process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":147-171", + "content": " new_state_dicts[new_key] = state_dicts[new_key]\n if 'blocks' in key and 'norm1' in key:\n desc.set_description(\"Loading %s\" % key)\n new_key = key.replace('norm1', 'temporal_norm1')\n if not new_key in state_dicts:\n new_state_dicts[new_key] = state_dicts[key]\n else:\n new_state_dicts[new_key] = state_dicts[new_key]\n time.sleep(0.01)\n elif attention_type == 'space_only': # tokenshift raw vit\n new_state_dicts = state_dicts.copy()\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return new_state_dicts\ndef pretrain_resnet18_param_trans(model, loaded_dict):\n encoder_dict = model.encoder.state_dict()\n pose_encoder_dict = model.pose_encoder.state_dict()\n names = ['encoder.', 'encoder_day.', 'encoder_night.']\n for name in names:\n total_len = len(loaded_dict.items())" + }, + { + "comment": "This code is loading weights from a dictionary, updating the encoder_dict if the key already exists. 
It also updates loaded_dict for a specific convolution layer based on the number of input images and uses tqdm to provide progress updates.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":172-196", + "content": " with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for key, value in tqdm(loaded_dict.items(),\n total=total_len,\n position=0):\n key = str(name + key)\n if key in encoder_dict:\n encoder_dict[key] = value\n desc.set_description('Loading %s' % key)\n time.sleep(0.01)\n num_input_images = 2\n loaded_dict['conv1.weight'] = paddle.concat(\n [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images\n total_len = len(loaded_dict.items())\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for name, value in tqdm(loaded_dict.items(),\n total=total_len,\n position=0):\n name = str('encoder.' + name)" + }, + { + "comment": "This code loads pre-trained model parameters from a specified file path and converts them for use in the existing model. If the weight_path is not a valid checkpoint file, it raises an IOError. The code also utilizes Paddle's `paddle.load()` function to load state_dicts from the specified file path. It handles loading of Resnet18 parameters specifically with the `pretrain_resnet18_param_trans()` function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":197-225", + "content": " if name in pose_encoder_dict:\n pose_encoder_dict[name] = value\n desc.set_description('Loading %s' % key)\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n return encoder_dict, pose_encoder_dict\n#XXX(shipping): maybe need load N times because of different cards have different params.\n@main_only\ndef load_ckpt(model, weight_path, **kargs):\n \"\"\"\n 1. Load pre-trained model parameters\n 2. Extract and convert from the pre-trained model to the parameters\n required by the existing model\n 3. Load the converted parameters of the existing model\n \"\"\"\n #model.set_state_dict(state_dict)\n if not osp.isfile(weight_path):\n raise IOError(f'{weight_path} is not a checkpoint file')\n #state_dicts = load(weight_path)\n logger = get_logger(\"paddlevideo\")\n state_dicts = paddle.load(weight_path)\n if 'ResnetEncoder' in str(model):\n encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans(" + }, + { + "comment": "This code is loading the model's weights and dictionary entries. It checks the type of the model and then either loads or transposes the parameters accordingly, handling cases such as VisionTransformer (TimeSformer) and SwinTransformer3D. 
For other models, it simply initializes an empty dictionary and starts loading each item from the state_dict in a tqdm progress bar.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":226-247", + "content": " model, state_dicts)\n model.encoder.load_dict(encoder_dict)\n model.pose_encoder.load_dict(pose_encoder_dict)\n tmp = model.state_dict()\n elif \"VisionTransformer\" in str(model): # For TimeSformer case\n tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'],\n kargs['num_seg'],\n kargs['attention_type'])\n elif 'SwinTransformer3D' in str(model):\n tmp = pretrain_swin_param_trans(model, state_dicts)\n else:\n tmp = {}\n total_len = len(model.state_dict())\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n for item in tqdm(model.state_dict(), total=total_len, position=0):\n name = item\n desc.set_description('Loading %s' % name)\n if name not in state_dicts: # Convert from non-parallel model\n if str('backbone.' + name) in state_dicts:" + }, + { + "comment": "This code saves a PaddlePaddle model's state dictionary and optionally the student model's state dictionary to separate files. It also has functionality for handling parallel models, converting them into separate state dictionaries. The `mkdir` function is used to create directories if they don't exist already. If the `save_student_model` flag is set to True, it will save both the main and student model weights in separate files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":248-281", + "content": " tmp[name] = state_dicts['backbone.' + name]\n else: # Convert from parallel model\n tmp[name] = state_dicts[name]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)\n model.set_state_dict(tmp)\ndef mkdir(dir):\n if not os.path.exists(dir):\n # avoid error when train with multiple gpus\n try:\n os.makedirs(dir)\n except:\n pass\ndef _extract_student_weights(all_params, student_prefix=\"Student.\"):\n s_params = {\n key[len(student_prefix):]: all_params[key]\n for key in all_params if student_prefix in key\n }\n return s_params\n@main_only\ndef save(obj, path, save_student_model=False):\n if save_student_model:\n s_params = _extract_student_weights(obj)\n student_path = path.replace(\".pdparams\", \"_student.pdparams\")\n if len(s_params) > 0:\n paddle.save(s_params, student_path)" + }, + { + "comment": "This code defines two functions: \"save\" and \"load\". The \"save\" function uses the Paddle library to save an object (obj) at a specified path. 
The \"load\" function checks if a file exists, raises an IOError if it does not, and then loads the object from the file using the Paddle library's load function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/save_load.py\":282-288", + "content": " paddle.save(obj, path)\ndef load(file_name):\n if not osp.isfile(file_name):\n raise IOError(f'{file_name} not exist')\n return paddle.load(file_name)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9c22d291-cc3d-4f0e-8dc5-8896129cbc20.json b/docs/doc/9c22d291-cc3d-4f0e-8dc5-8896129cbc20.json new file mode 100644 index 000000000..89c3e5ccd --- /dev/null +++ b/docs/doc/9c22d291-cc3d-4f0e-8dc5-8896129cbc20.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet is a part of the PaddleVideo framework and contains copyright information, license details, and registry definitions for pipelines and datasets. It defines two registries, \"pipeline\" and \"datasets\", using the Registry class from the utils module, allowing the creation and management of custom pipeline and dataset classes.", + "details": [ + { + "comment": "This code snippet is a part of the PaddleVideo framework and contains copyright information, license details, and registry definitions for pipelines and datasets. It defines two registries, \"pipeline\" and \"datasets\", using the Registry class from the utils module, allowing the creation and management of custom pipeline and dataset classes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/registry.py\":0-17", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nPIPELINES = Registry(\"pipeline\")\nDATASETS = Registry(\"datasets\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9c6566fc-5944-4999-8a72-8663167f668b.json b/docs/doc/9c6566fc-5944-4999-8a72-8663167f668b.json new file mode 100644 index 000000000..0b75e5607 --- /dev/null +++ b/docs/doc/9c6566fc-5944-4999-8a72-8663167f668b.json @@ -0,0 +1,20 @@ +{ + "summary": "TransNetV2 Partitioner in PaddleVideo framework defines a model partitioner, includes forwarding methods for image processing and computing loss metrics. It has three methods: \"loss_metrics\", \"test_step\", and \"infer_step\" for training, testing, and inference phases respectively.", + "details": [ + { + "comment": "TransNetV2 Partitioner class for PaddleVideo framework, with forward_net and train_step methods for image processing and model training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import PARTITIONERS\nfrom .base import BasePartitioner\nimport paddle\n@PARTITIONERS.register()\nclass TransNetV2Partitioner(BasePartitioner):\n \"\"\"TransNetV2 Partitioner framework\n \"\"\"\n def forward_net(self, imgs):\n one_hot_pred = self.backbone(imgs)\n return one_hot_pred\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n frame_sequence = data_batch[0]\n one_hot_gt, many_hot_gt = data_batch[1:]\n one_hot_pred = self.forward_net(frame_sequence)" + }, + { + "comment": "Code defines a model partitioner for TransNetV2. It returns loss metrics from the validation step by forwarding frame sequences through the model, extracting one-hot and many-hot predictions and ground truths, and applying losses based on provided dictionaries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py\":32-53", + "content": " dict_ = {}\n if isinstance(one_hot_pred, tuple):\n one_hot_pred, dict_ = one_hot_pred\n many_hot_pred = dict_.get(\"many_hot\", None)\n comb_reg_loss = dict_.get(\"comb_reg_loss\", None)\n loss_metrics = self.head.loss(one_hot_pred, one_hot_gt,\n many_hot_pred, many_hot_gt,\n reg_losses={\"comb_reg\": comb_reg_loss})\n return loss_metrics\n def val_step(self, data_batch):\n frame_sequence = data_batch[0]\n one_hot_gt, many_hot_gt = data_batch[1:]\n one_hot_pred = self.forward_net(frame_sequence)\n dict_ = {}\n if isinstance(one_hot_pred, tuple):\n one_hot_pred, dict_ = one_hot_pred\n many_hot_pred = dict_.get(\"many_hot\", None)\n comb_reg_loss = dict_.get(\"comb_reg_loss\", None)\n loss_metrics = self.head.loss(one_hot_pred, one_hot_gt,\n many_hot_pred, many_hot_gt,\n reg_losses={\"comb_reg\": comb_reg_loss})" + }, + { + "comment": "The code defines three methods: \"loss_metrics\" returns loss and metrics for training, \"test_step\" performs testing by forwarding frames through the net without calculating loss, and \"infer_step\" also performs testing with forwarding frames but without specifying if it's for a test or inference phase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py\":54-67", + "content": " return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n frame_sequence = data_batch[0]\n one_hot_pred = self.forward_net(frame_sequence)\n return one_hot_pred\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n frame_sequence = data_batch[0]\n one_hot_pred = self.forward_net(frame_sequence)\n return one_hot_pred" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9c9b671b-0b87-4c96-a0cc-f02fb04bbdcc.json b/docs/doc/9c9b671b-0b87-4c96-a0cc-f02fb04bbdcc.json new file mode 100644 index 000000000..6e8714b56 --- 
/dev/null +++ b/docs/doc/9c9b671b-0b87-4c96-a0cc-f02fb04bbdcc.json @@ -0,0 +1,145 @@ +{ + "summary": "The PaddleVideo library's UCF101 dataset offers utility functions and the Ucf24Metrics class for metric manipulation, bounding box handling, and precision/recall calculations. It computes mAP for image classification tasks and stores results per class using utility methods to read bounding box text files.", + "details": [ + { + "comment": "This code snippet is from the UCF101 dataset utility functions in the PaddleVideo library. It contains an enum class representing average precision metrics and a copyright notice with license information, original source link, and developer contact details.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":0-32", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Forked from: https://github.com/rafaelpadilla/Object-Detection-Metrics\n# Developed by: Rafael Padilla (rafael.padilla@smt.ufrj.br)\nimport glob\nimport os\nimport shutil\nimport sys\nfrom collections import Counter\nimport numpy as np\nfrom enum import Enum\nimport cv2\nclass MethodAveragePrecision(Enum):\n \"\"\"\n Class representing if the coordinates are relative to the\n image size or are absolute values.\n Developed by: Rafael Padilla" + }, + { + "comment": "This code defines three enumerations (CoordinatesType, BBType, and BBFormat) to represent different types of coordinates and bounding boxes. It also includes a function convertToRelativeValues that takes a size and a box as input and returns the box in relative values. The code was developed by Rafael Padilla with last modifications on April 28th for CoordinatesType, May 24th for BBType and format, and the function convertToRelativeValues is defined as well.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":33-80", + "content": " Last modification: Apr 28 2018\n \"\"\"\n EveryPointInterpolation = 1\n ElevenPointInterpolation = 2\nclass CoordinatesType(Enum):\n \"\"\"\n Class representing if the coordinates are relative to the\n image size or are absolute values.\n Developed by: Rafael Padilla\n Last modification: Apr 28 2018\n \"\"\"\n Relative = 1\n Absolute = 2\nclass BBType(Enum):\n \"\"\"\n Class representing if the bounding box is groundtruth or not.\n Developed by: Rafael Padilla\n Last modification: May 24 2018\n \"\"\"\n GroundTruth = 1\n Detected = 2\nclass BBFormat(Enum):\n \"\"\"\n Class representing the format of a bounding box.\n It can be (X,Y,width,height) => XYWH\n or (X1,Y1,X2,Y2) => XYX2Y2\n Developed by: Rafael Padilla\n Last modification: May 24 2018\n \"\"\"\n XYWH = 1\n XYX2Y2 = 2\ndef convertToRelativeValues(size, box):\n dw = 1. / (size[0])\n dh = 1. 
/ (size[1])\n cx = (box[1] + box[0]) / 2.0\n cy = (box[3] + box[2]) / 2.0\n w = box[1] - box[0]\n h = box[3] - box[2]" + }, + { + "comment": "Function `ucf24_utils.py:81-121` defines a function `convertToAbsoluteValues` which takes in the size and bounding box coordinates (x, y, w, h) as input and returns absolute values for xIn, yIn, xEnd, yEnd considering the image size. If any of these values fall outside the image boundaries, they are adjusted to the last valid pixel within the image.\nThis code also includes a function `add_bb_into_image` which adds a bounding box with given coordinates (x1, y1, x2, y2) and label on the image using OpenCV's rectangle() function and font() function for adding labels to the bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":81-121", + "content": " x = cx * dw\n y = cy * dh\n w = w * dw\n h = h * dh\n return x, y, w, h\ndef convertToAbsoluteValues(size, box):\n xIn = round(((2 * float(box[0]) - float(box[2])) * size[0] / 2))\n yIn = round(((2 * float(box[1]) - float(box[3])) * size[1] / 2))\n xEnd = xIn + round(float(box[2]) * size[0])\n yEnd = yIn + round(float(box[3]) * size[1])\n if xIn < 0:\n xIn = 0\n if yIn < 0:\n yIn = 0\n if xEnd >= size[0]:\n xEnd = size[0] - 1\n if yEnd >= size[1]:\n yEnd = size[1] - 1\n return xIn, yIn, xEnd, yEnd\ndef add_bb_into_image(image, bb, color=(255, 0, 0), thickness=2, label=None):\n r = int(color[0])\n g = int(color[1])\n b = int(color[2])\n font = cv2.FONT_HERSHEY_SIMPLEX\n fontScale = 0.5\n fontThickness = 1\n x1, y1, x2, y2 = bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n x1 = int(x1)\n y1 = int(y1)\n x2 = int(x2)\n y2 = int(y2)\n cv2.rectangle(image, (x1, y1), (x2, y2), (b, g, r), thickness)\n # Add label\n if label is not None:\n # Get size of the text box" + }, + { + "comment": "This function calculates the text box coordinates and draws a rectangle around it, then adds text within the rectangle. The text position is adjusted if it's outside the image area. It also initializes a class for bounding boxes with properties like image name, class ID, and coordinates.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":122-147", + "content": " (tw, th) = cv2.getTextSize(label, font, fontScale, fontThickness)[0]\n # Top-left coord of the textbox\n (xin_bb, yin_bb) = (x1 + thickness, y1 - th + int(12.5 * fontScale))\n # Checking position of the text top-left (outside or inside the bb)\n if yin_bb - th <= 0: # if outside the image\n yin_bb = y1 + th # put it inside the bb\n r_Xin = x1 - int(thickness / 2)\n r_Yin = y1 - th - int(thickness / 2)\n # Draw filled rectangle to put the text in it\n cv2.rectangle(image, (r_Xin, r_Yin - thickness),\n (r_Xin + tw + thickness * 3, r_Yin + th + int(12.5 * fontScale)), (b, g, r),\n -1)\n cv2.putText(image, label, (xin_bb, yin_bb), font, fontScale, (0, 0, 0), fontThickness,\n cv2.LINE_AA)\n return image\nclass BoundingBox:\n def __init__(self,\n imageName,\n classId,\n x,\n y,\n w,\n h,\n typeCoordinates=None," + }, + { + "comment": "This code snippet defines a constructor for the class Ucf24Metrics, which takes parameters like image name, class id, bounding box coordinates (x, y, w, h), and type of bounding box coordinates. It also accepts optional arguments such as imgSize, bbType, classConfidence, and format. If typeCoordinates is 'Relative', then imgSize is required. 
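This is why `imgSize` is mandatory for relative coordinates: without the image dimensions the box cannot be mapped back to pixels. A minimal stand-alone sketch of the relative-to-absolute conversion performed by `convertToAbsoluteValues`; the helper name `relative_to_corners` and the sample numbers are illustrative:

```python
def relative_to_corners(size, box):
    """Convert a relative (x_center, y_center, w, h) box to absolute corner pixels.

    size: (img_width, img_height); all box values are fractions of the image size.
    Returns (x1, y1, x2, y2), clamped to the image borders.
    """
    img_w, img_h = size
    xc, yc, w, h = box
    x1 = round((2 * xc - w) * img_w / 2)
    y1 = round((2 * yc - h) * img_h / 2)
    x2 = x1 + round(w * img_w)
    y2 = y1 + round(h * img_h)
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = min(x2, img_w - 1), min(y2, img_h - 1)
    return x1, y1, x2, y2

print(relative_to_corners((600, 400), (0.5, 0.5, 0.2, 0.3)))  # (240, 140, 360, 260)
```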
The constructor initializes an object representing a metric for UCF101 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":148-164", + "content": " imgSize=None,\n bbType=None,\n classConfidence=None,\n format=None):\n \"\"\"Constructor.\n Args:\n imageName: String representing the image name.\n classId: String value representing class id.\n x: Float value representing the X upper-left coordinate of the bounding box.\n y: Float value representing the Y upper-left coordinate of the bounding box.\n w: Float value representing the width bounding box.\n h: Float value representing the height bounding box.\n typeCoordinates: (optional) Enum (Relative or Absolute) represents if the bounding box\n coordinates (x,y,w,h) are absolute or relative to size of the image. Default:'Absolute'.\n imgSize: (optional) 2D vector (width, height)=>(int, int) represents the size of the\n image of the bounding box. If typeCoordinates is 'Relative', imgSize is required.\n bbType: (optional) Enum (Groundtruth or Detection) identifies if the bounding box" + }, + { + "comment": "This code defines a class with properties: imageName, typeCoordinates (Relative or Absolute), imgSize (image size required if typeCoordinates is Relative), bbType (Ground Truth or Detection), and classConfidence (optional for Detection). It also includes error checks for mandatory parameters (imgSize for Relative typeCoordinates and classConfidence for Detection bbType).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":165-180", + "content": " represents a ground truth or a detection. If it is a detection, the classConfidence has\n to be informed.\n classConfidence: (optional) Float value representing the confidence of the detected\n class. If detectionType is Detection, classConfidence needs to be informed.\n format: (optional) Enum (BBFormat.XYWH or BBFormat.XYX2Y2) indicating the format of the\n coordinates of the bounding boxes. BBFormat.XYWH: \n BBFormat.XYX2Y2: .\n \"\"\"\n self._imageName = imageName\n self._typeCoordinates = typeCoordinates\n if typeCoordinates == CoordinatesType.Relative and imgSize is None:\n raise IOError(\n 'Parameter \\'imgSize\\' is required. It is necessary to inform the image size.')\n if bbType == BBType.Detected and classConfidence is None:\n raise IOError(\n 'For bbType=\\'Detection\\', it is necessary to inform the classConfidence value.')" + }, + { + "comment": "This function converts relative bounding box coordinates to absolute values and assigns them to the object. If the given format is XYWH, it adjusts the width and height accordingly. For absolute coordinates, it directly assigns the provided values. 
If the format does not match XYWH for relative coordinates, an IOError is raised.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":182-206", + "content": " self._classConfidence = classConfidence\n self._bbType = bbType\n self._classId = classId\n self._format = format\n # If relative coordinates, convert to absolute values\n # For relative coords: (x,y,w,h)=(X_center/img_width , Y_center/img_height)\n if typeCoordinates == CoordinatesType.Relative:\n (self._x, self._y, self._w, self._h) = convertToAbsoluteValues(imgSize, (x, y, w, h))\n self._width_img = imgSize[0]\n self._height_img = imgSize[1]\n if format == BBFormat.XYWH:\n self._x2 = self._w\n self._y2 = self._h\n self._w = self._x2 - self._x\n self._h = self._y2 - self._y\n else:\n raise IOError(\n 'For relative coordinates, the format must be XYWH (x,y,width,height)')\n # For absolute coords: (x,y,w,h)=real bb coords\n else:\n self._x = x\n self._y = y\n if format == BBFormat.XYWH:\n self._w = w" + }, + { + "comment": "The code defines a class with methods to handle bounding box formats. It supports two formats: XYWH and XYX2Y2. The constructor initializes the bounding box dimensions and image size if provided. The getAbsoluteBoundingBox method returns the bounding box coordinates based on the format specified. If no image size is available, getRelativeBoundingBox requires the imgSize parameter to determine the absolute position of the bounding box.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":207-231", + "content": " self._h = h\n self._x2 = self._x + self._w\n self._y2 = self._y + self._h\n else: # format == BBFormat.XYX2Y2: .\n self._x2 = w\n self._y2 = h\n self._w = self._x2 - self._x\n self._h = self._y2 - self._y\n if imgSize is None:\n self._width_img = None\n self._height_img = None\n else:\n self._width_img = imgSize[0]\n self._height_img = imgSize[1]\n def getAbsoluteBoundingBox(self, format=None):\n if format == BBFormat.XYWH:\n return self._x, self._y, self._w, self._h\n elif format == BBFormat.XYX2Y2:\n return self._x, self._y, self._x2, self._y2\n def getRelativeBoundingBox(self, imgSize=None):\n if imgSize is None and self._width_img is None and self._height_img is None:\n raise IOError(\n 'Parameter \\'imgSize\\' is required. It is necessary to inform the image size.')" + }, + { + "comment": "This code defines a class with various getter methods to access different attributes of the detection result. 
The class also contains a static method compare() that takes two detections as input and compares them using absolute bounding boxes and image sizes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":232-265", + "content": " if imgSize is None:\n return convertToRelativeValues((imgSize[0], imgSize[1]),\n (self._x, self._y, self._w, self._h))\n else:\n return convertToRelativeValues((self._width_img, self._height_img),\n (self._x, self._y, self._w, self._h))\n def getImageName(self):\n return self._imageName\n def getConfidence(self):\n return self._classConfidence\n def getFormat(self):\n return self._format\n def getClassId(self):\n return self._classId\n def getImageSize(self):\n return self._width_img, self._height_img\n def getCoordinatesType(self):\n return self._typeCoordinates\n def getBBType(self):\n return self._bbType\n @staticmethod\n def compare(det1, det2):\n det1BB = det1.getAbsoluteBoundingBox(format=BBFormat.XYWH)\n det1ImgSize = det1.getImageSize()\n det2BB = det2.getAbsoluteBoundingBox(format=BBFormat.XYWH)\n det2ImgSize = det2.getImageSize()" + }, + { + "comment": "The code snippet compares two bounding boxes to check if they match by comparing their class IDs, coordinates, and image sizes. If the conditions are met, it returns True; otherwise, False. The static method `clone` creates a new bounding box with the same properties as an existing one, allowing for easy cloning of bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":267-293", + "content": " if det1.getClassId() == det2.getClassId() and \\\n det1.classConfidence == det2.classConfidenc() and \\\n det1BB[0] == det2BB[0] and \\\n det1BB[1] == det2BB[1] and \\\n det1BB[2] == det2BB[2] and \\\n det1BB[3] == det2BB[3] and \\\n det1ImgSize[0] == det1ImgSize[0] and \\\n det2ImgSize[1] == det2ImgSize[1]:\n return True\n return False\n @staticmethod\n def clone(boundingBox):\n absBB = boundingBox.getAbsoluteBoundingBox(format=BBFormat.XYWH)\n newBoundingBox = BoundingBox(\n boundingBox.getImageName(),\n boundingBox.getClassId(),\n absBB[0],\n absBB[1],\n absBB[2],\n absBB[3],\n typeCoordinates=boundingBox.getCoordinatesType(),\n imgSize=boundingBox.getImageSize(),\n bbType=boundingBox.getBBType(),\n classConfidence=boundingBox.getConfidence(),\n format=BBFormat.XYWH)\n return newBoundingBox" + }, + { + "comment": "This class represents a collection of bounding boxes with methods to add, remove, and retrieve bounding boxes based on their type or class. 
It also provides functionality to retrieve all classes present in the bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":296-331", + "content": "class BoundingBoxes:\n def __init__(self):\n self._boundingBoxes = []\n def addBoundingBox(self, bb):\n self._boundingBoxes.append(bb)\n def removeBoundingBox(self, _boundingBox):\n for d in self._boundingBoxes:\n if BoundingBox.compare(d, _boundingBox):\n del self._boundingBoxes[d]\n return\n def removeAllBoundingBoxes(self):\n self._boundingBoxes = []\n def getBoundingBoxes(self):\n return self._boundingBoxes\n def getBoundingBoxByClass(self, classId):\n boundingBoxes = []\n for d in self._boundingBoxes:\n if d.getClassId() == classId: # get only specified bounding box type\n boundingBoxes.append(d)\n return boundingBoxes\n def getClasses(self):\n classes = []\n for d in self._boundingBoxes:\n c = d.getClassId()\n if c not in classes:\n classes.append(c)\n return classes\n def getBoundingBoxesByType(self, bbType):\n # get only specified bb type" + }, + { + "comment": "Function `getBoundingBoxesByBBType` returns a list of bounding boxes with the specified BB type.\nFunction `getBoundingBoxesByImageName` returns a list of bounding boxes for the given image name.\nMethod `count` counts and returns the number of bounding boxes with the specified BB type, or all bounding boxes if no type is provided.\nMethod `clone` creates a new instance of BoundingBoxes and adds clones of each bounding box from the original instance.\nFunction `drawAllBoundingBoxes` draws all bounding boxes for the given image name on the specified image, only ground truth bounding boxes are drawn in green color.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":332-358", + "content": " return [d for d in self._boundingBoxes if d.getBBType() == bbType]\n def getBoundingBoxesByImageName(self, imageName):\n # get only specified bb type\n return [d for d in self._boundingBoxes if d.getImageName() == imageName]\n def count(self, bbType=None):\n if bbType is None: # Return all bounding boxes\n return len(self._boundingBoxes)\n count = 0\n for d in self._boundingBoxes:\n if d.getBBType() == bbType: # get only specified bb type\n count += 1\n return count\n def clone(self):\n newBoundingBoxes = BoundingBoxes()\n for d in self._boundingBoxes:\n det = BoundingBox.clone(d)\n newBoundingBoxes.addBoundingBox(det)\n return newBoundingBoxes\n def drawAllBoundingBoxes(self, image, imageName):\n bbxes = self.getBoundingBoxesByImageName(imageName)\n for bb in bbxes:\n if bb.getBBType() == BBType.GroundTruth: # if ground truth\n image = add_bb_into_image(image, bb, color=(0, 255, 0)) # green" + }, + { + "comment": "The code defines a function `GetPascalVOCMetrics` within the `Evaluator` class to calculate metrics for Pascal VOC Challenge. It takes `boundingboxes`, `IOUThreshold`, and `method` as input parameters. The method can be set as `EveryPointInterpolation` or `ElevenPointInterpolation`. 
This function calculates precision, recall, F1 score, and AP metric using the provided parameters for Pascal VOC Challenge evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":359-379", + "content": " else: # if detection\n image = add_bb_into_image(image, bb, color=(255, 0, 0)) # red\n return image\nclass Evaluator:\n def GetPascalVOCMetrics(self,\n boundingboxes,\n IOUThreshold=0.5,\n method=None):\n \"\"\"Get the metrics used by the VOC Pascal 2012 challenge.\n Get\n Args:\n boundingboxes: Object of the class BoundingBoxes representing ground truth and detected\n bounding boxes;\n IOUThreshold: IOU threshold indicating which detections will be considered TP or FP\n (default value = 0.5);\n method (default = EveryPointInterpolation): It can be calculated as the implementation\n in the official PASCAL VOC toolkit (EveryPointInterpolation), or applying the 11-point\n interpolatio as described in the paper \"The PASCAL Visual Object Classes(VOC) Challenge\"\n or EveryPointInterpolation\" (ElevenPointInterpolation);" + }, + { + "comment": "The function returns a list of dictionaries, each containing information and metrics of each class. The keys include class representation, precision values, recall values, average precision, interpolated precision, interpolated recall, total positives, total true positives, and total false positives. It initializes an empty list \"ret\" to store the metrics for each class, as well as groundTruths and detection lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":380-396", + "content": " Returns:\n A list of dictionaries. Each dictionary contains information and metrics of each class.\n The keys of each dictionary are:\n dict['class']: class representing the current dictionary;\n dict['precision']: array with the precision values;\n dict['recall']: array with the recall values;\n dict['AP']: average precision;\n dict['interpolated precision']: interpolated precision values;\n dict['interpolated recall']: interpolated recall values;\n dict['total positives']: total number of ground truth positives;\n dict['total TP']: total number of True Positive detections;\n dict['total FP']: total number of False Negative detections;\n \"\"\"\n ret = [] # list containing metrics (precision, recall, average precision) of each class\n # List with all ground truths (Ex: [imageName,class,confidence=1, (bb coordinates XYX2Y2)])\n groundTruths = []\n # List with all detections (Ex: [imageName,class,confidence,(bb coordinates XYX2Y2)])" + }, + { + "comment": "The code initializes empty lists for detections and classes, then iterates through all bounding boxes. It separates ground truth (GT) bounding boxes from detections, appending them to their respective lists with additional information such as image name, class ID, confidence, and bounding box coordinates. It also keeps track of unique classes and sorts them. 
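The per-class bookkeeping described here amounts to grouping `[image, class, confidence, box]` records and sorting detections by confidence before the TP/FP pass. A minimal sketch with hypothetical helper names:

```python
from collections import defaultdict

def split_by_class(ground_truths, detections):
    """Group [image, class, confidence, box] records by class id.

    Detections are also sorted by decreasing confidence, the order the
    cumulative TP/FP counting later relies on.
    """
    gts_per_class, dets_per_class = defaultdict(list), defaultdict(list)
    for g in ground_truths:
        gts_per_class[g[1]].append(g)
    for d in detections:
        dets_per_class[d[1]].append(d)
    for c in dets_per_class:
        dets_per_class[c].sort(key=lambda rec: rec[2], reverse=True)
    return gts_per_class, dets_per_class
```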
The code will then use these lists and sorted classes for precision-recall calculations by individual classes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":397-421", + "content": " detections = []\n # Get all classes\n classes = []\n # Loop through all bounding boxes and separate them into GTs and detections\n for bb in boundingboxes.getBoundingBoxes():\n # [imageName, class, confidence, (bb coordinates XYX2Y2)]\n if bb.getBBType() == BBType.GroundTruth:\n groundTruths.append([\n bb.getImageName(),\n bb.getClassId(), 1,\n bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n ])\n else:\n detections.append([\n bb.getImageName(),\n bb.getClassId(),\n bb.getConfidence(),\n bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n ])\n # get class\n if bb.getClassId() not in classes:\n classes.append(bb.getClassId())\n classes = sorted(classes)\n # Precision x Recall is obtained individually by each class\n # Loop through by classes" + }, + { + "comment": "Iterating through classes, the code collects detections and ground truths for each class. It then calculates the number of positive ground truths (npos), sorts detections by confidence level, and initializes True Positive (TP) and False Positive (FP) arrays. The code creates a dictionary to store the amount of ground truths per image, and iterates through detections to find corresponding ground truth images, calculating Intersection over Union (IoU) between detection and ground truth bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":422-444", + "content": " for c in classes:\n # Get only detection of class c\n dects = []\n [dects.append(d) for d in detections if d[1] == c]\n # Get only ground truths of class c\n gts = []\n [gts.append(g) for g in groundTruths if g[1] == c]\n npos = len(gts)\n # sort detections by decreasing confidence\n dects = sorted(dects, key=lambda conf: conf[2], reverse=True)\n TP = np.zeros(len(dects))\n FP = np.zeros(len(dects))\n # create dictionary with amount of gts for each image\n det = Counter([cc[0] for cc in gts])\n for key, val in det.items():\n det[key] = np.zeros(val)\n # Loop through detections\n for d in range(len(dects)):\n # Find ground truth image\n gt = [gt for gt in gts if gt[0] == dects[d][0]]\n iouMax = sys.float_info.min\n for j in range(len(gt)):\n iou = Evaluator.iou(dects[d][3], gt[j][3])" + }, + { + "comment": "This code calculates true positives, false positives, and computes precision, recall, and average precision. It checks if a detected object overlaps with ground truth objects using IOU threshold. If the overlap is within the threshold, it counts as a true positive or false positive depending on whether the object has already been marked 'seen'. 
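A minimal numpy sketch of the greedy matching just described: detections arrive sorted by confidence, each ground truth can be "seen" only once, and cumulative sums turn the TP/FP flags into precision/recall curves. `match_detections` and its record layout are assumptions for illustration; an `iou_fn(box_a, box_b)` is taken as given:

```python
import numpy as np

def match_detections(dets, gts, iou_fn, iou_thr=0.5):
    """dets: [(image, confidence, box)] sorted by decreasing confidence.
    gts:  [(image, box)] ground truths of the same class."""
    seen = [False] * len(gts)
    tp, fp = np.zeros(len(dets)), np.zeros(len(dets))
    for d, (img, _conf, dbox) in enumerate(dets):
        best_iou, best_j = 0.0, -1
        for j, (gimg, gbox) in enumerate(gts):
            if gimg == img:
                ov = iou_fn(dbox, gbox)
                if ov > best_iou:
                    best_iou, best_j = ov, j
        if best_j >= 0 and best_iou >= iou_thr and not seen[best_j]:
            tp[d], seen[best_j] = 1, True       # first match above the threshold
        else:
            fp[d] = 1                           # duplicate or low-overlap detection
    recall = np.cumsum(tp) / max(len(gts), 1)
    precision = np.cumsum(tp) / np.maximum(np.cumsum(tp) + np.cumsum(fp), 1e-12)
    return precision, recall
```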
Finally, based on the method chosen (EveryPointInterpolation in this case), it calls the appropriate average precision calculation function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":445-464", + "content": " if iou > iouMax:\n iouMax = iou\n jmax = j\n # Assign detection as true positive/don't care/false positive\n if iouMax >= IOUThreshold:\n if det[dects[d][0]][jmax] == 0:\n TP[d] = 1 # count as true positive\n det[dects[d][0]][jmax] = 1 # flag as already 'seen'\n else:\n FP[d] = 1 # count as false positive\n # - A detected \"cat\" is overlaped with a GT \"cat\" with IOU >= IOUThreshold.\n else:\n FP[d] = 1 # count as false positive\n # compute precision, recall and average precision\n acc_FP = np.cumsum(FP)\n acc_TP = np.cumsum(TP)\n rec = acc_TP / npos\n prec = np.divide(acc_TP, (acc_FP + acc_TP))\n # Depending on the method, call the right implementation\n if method == MethodAveragePrecision.EveryPointInterpolation:" + }, + { + "comment": "Calculates average precision for each class using CalculateAveragePrecision or ElevenPointInterpolatedAP depending on the input. Appends the results to a dictionary, then adds the dictionary to a list and returns it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":465-494", + "content": " [ap, mpre, mrec, ii] = Evaluator.CalculateAveragePrecision(rec, prec)\n else:\n [ap, mpre, mrec, _] = Evaluator.ElevenPointInterpolatedAP(rec, prec)\n # add class result in the dictionary to be returned\n r = {\n 'class': c,\n 'precision': prec,\n 'recall': rec,\n 'AP': ap,\n 'interpolated precision': mpre,\n 'interpolated recall': mrec,\n 'total positives': npos,\n 'total TP': np.sum(TP),\n 'total FP': np.sum(FP)\n }\n ret.append(r)\n return ret\n @staticmethod\n def CalculateAveragePrecision(rec, prec):\n mrec = [0]\n [mrec.append(e) for e in rec]\n mrec.append(1)\n mpre = [0]\n [mpre.append(e) for e in prec]\n mpre.append(0)\n for i in range(len(mpre) - 1, 0, -1):\n mpre[i - 1] = max(mpre[i - 1], mpre[i])\n ii = []\n for i in range(len(mrec) - 1):" + }, + { + "comment": "The code calculates the 11-point interpolated average precision (AP) between recall and precision values. It first appends recall and precision lists in reverse order, then creates a list of recall values from 0 to 1 in reverse order. Next, it iterates over these recall values, finding all recall values greater than or equal to the current value and selecting the maximum precision at that index. Finally, it returns the interpolated AP by summing the maximum precisions for each recall value and dividing by 11. 
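The 11-point rule itself is compact enough to restate directly. The sketch below, with an illustrative toy precision/recall curve, mirrors the definition above: the mean over r in {0, 0.1, ..., 1} of the best precision among points whose recall is at least r.

```python
import numpy as np

def eleven_point_ap(recall, precision):
    """11-point interpolated AP: mean of max precision at recall >= r, r in 0..1 step 0.1."""
    recall = np.asarray(recall, dtype=float)
    precision = np.asarray(precision, dtype=float)
    ap = 0.0
    for r in np.linspace(0, 1, 11):
        mask = recall >= r
        p_max = precision[mask].max() if mask.any() else 0.0
        ap += p_max / 11.0
    return ap

print(eleven_point_ap([0.1, 0.4, 0.8], [1.0, 0.8, 0.6]))  # ~0.62 for this toy curve
```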
The resulting AP values are stored in a list along with the original recall and precision lists, as well as an indicator list of indices where the recall values were greater than or equal to the current recall value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":495-522", + "content": " if mrec[1:][i] != mrec[0:-1][i]:\n ii.append(i + 1)\n ap = 0\n for i in ii:\n ap = ap + np.sum((mrec[i] - mrec[i - 1]) * mpre[i])\n return [ap, mpre[0:len(mpre) - 1], mrec[0:len(mpre) - 1], ii]\n @staticmethod\n # 11-point interpolated average precision\n def ElevenPointInterpolatedAP(rec, prec):\n mrec = []\n [mrec.append(e) for e in rec]\n mpre = []\n [mpre.append(e) for e in prec]\n recallValues = np.linspace(0, 1, 11)\n recallValues = list(recallValues[::-1])\n rhoInterp = []\n recallValid = []\n for r in recallValues:\n # Obtain all recall values higher or equal than r\n argGreaterRecalls = np.argwhere(mrec[:] >= r)\n pmax = 0\n # If there are recalls above r\n if argGreaterRecalls.size != 0:\n pmax = max(mpre[argGreaterRecalls.min():])\n recallValid.append(r)\n rhoInterp.append(pmax)\n # By definition AP = sum(max(precision whose recall is above r))/11" + }, + { + "comment": "The code calculates average precision (AP) and Area Under Curve (AUC), then generates recall and precision values for a plot. It also defines a method to calculate the Intersection over Union (IoU) between reference and detection bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":523-552", + "content": " ap = sum(rhoInterp) / 11\n # Generating values for the plot\n rvals = [recallValid[0]]\n [rvals.append(e) for e in recallValid]\n rvals.append(0)\n pvals = [0]\n [pvals.append(e) for e in rhoInterp]\n pvals.append(0)\n # rhoInterp = rhoInterp[::-1]\n cc = []\n for i in range(len(rvals)):\n p = (rvals[i], pvals[i - 1])\n if p not in cc:\n cc.append(p)\n p = (rvals[i], pvals[i])\n if p not in cc:\n cc.append(p)\n recallValues = [i[0] for i in cc]\n rhoInterp = [i[1] for i in cc]\n return [ap, rhoInterp, recallValues, None]\n # For each detections, calculate IOU with reference\n @staticmethod\n def _getAllIOUs(reference, detections):\n ret = []\n bbReference = reference.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n # img = np.zeros((200,200,3), np.uint8)\n for d in detections:\n bb = d.getAbsoluteBoundingBox(BBFormat.XYX2Y2)\n iou = Evaluator.iou(bbReference, bb)" + }, + { + "comment": "The code calculates the IoU (intersection over union) between two bounding boxes, and returns a list of detection results sorted by IoU in descending order. 
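A self-contained version of that IoU computation, using the same inclusive-pixel (+1) convention as the file; `iou_xyxy` is an illustrative name:

```python
def iou_xyxy(box_a, box_b):
    """IoU of two (x1, y1, x2, y2) boxes, inclusive-pixel convention (+1)."""
    xa, ya = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    xb, yb = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter_w, inter_h = xb - xa + 1, yb - ya + 1
    if inter_w <= 0 or inter_h <= 0:        # boxes do not intersect
        return 0.0
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0] + 1) * (box_a[3] - box_a[1] + 1)
    area_b = (box_b[2] - box_b[0] + 1) * (box_b[3] - box_b[1] + 1)
    return inter / float(area_a + area_b - inter)

print(iou_xyxy((0, 0, 9, 9), (5, 5, 14, 14)))  # 25 / 175 ~= 0.143
```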
It also includes utility methods to check if two boxes intersect and calculate the intersection area.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":553-582", + "content": " ret.append((iou, reference, d)) # iou, reference, detection\n return sorted(ret, key=lambda i: i[0], reverse=True) # sort by iou (from highest to lowest)\n @staticmethod\n def iou(boxA, boxB):\n # if boxes dont intersect\n if Evaluator._boxesIntersect(boxA, boxB) is False:\n return 0\n interArea = Evaluator._getIntersectionArea(boxA, boxB)\n union = Evaluator._getUnionAreas(boxA, boxB, interArea=interArea)\n # intersection over union\n iou = interArea / union\n assert iou >= 0\n return iou\n @staticmethod\n def _boxesIntersect(boxA, boxB):\n if boxA[0] > boxB[2]:\n return False # boxA is right of boxB\n if boxB[0] > boxA[2]:\n return False # boxA is left of boxB\n if boxA[3] < boxB[1]:\n return False # boxA is above boxB\n if boxA[1] > boxB[3]:\n return False # boxA is below boxB\n return True\n @staticmethod\n def _getIntersectionArea(boxA, boxB):\n xA = max(boxA[0], boxB[0])" + }, + { + "comment": "This code contains functions to calculate intersection and union areas of two bounding boxes, and two validation functions for argument formats and mandatory arguments. The ValidateFormats function checks if the format is 'xywh', 'xyrb' or None (default) and returns a corresponding BBFormat type. The ValidateMandatoryArgs function checks if an argument exists and appends an error message to 'errors' if it doesn't meet the requirements.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":583-616", + "content": " yA = max(boxA[1], boxB[1])\n xB = min(boxA[2], boxB[2])\n yB = min(boxA[3], boxB[3])\n # intersection area\n return (xB - xA + 1) * (yB - yA + 1)\n @staticmethod\n def _getUnionAreas(boxA, boxB, interArea=None):\n area_A = Evaluator._getArea(boxA)\n area_B = Evaluator._getArea(boxB)\n if interArea is None:\n interArea = Evaluator._getIntersectionArea(boxA, boxB)\n return float(area_A + area_B - interArea)\n @staticmethod\n def _getArea(box):\n return (box[2] - box[0] + 1) * (box[3] - box[1] + 1)\n# Validate formats\ndef ValidateFormats(argFormat, argName, errors):\n if argFormat == 'xywh':\n return BBFormat.XYWH\n elif argFormat == 'xyrb':\n return BBFormat.XYX2Y2\n elif argFormat is None:\n return BBFormat.XYWH # default when nothing is passed\n else:\n errors.append(\n 'argument %s: invalid value. It must be either \\'xywh\\' or \\'xyrb\\'' % argName)\n# Validate mandatory args\ndef ValidateMandatoryArgs(arg, argName, errors):" + }, + { + "comment": "This code defines a function ValidateImageSize that checks if the image size argument is valid. It appends error messages to the errors list if the argument is missing or not in the correct format 'width,height'. The function also handles the case where the argument is relative and requires both width and height to be integers. Finally, it returns a tuple of (width, height) if valid. 
Additionally, there's a ValidateCoordinatesTypes function that checks if the coordinate type argument is valid and returns the CoordinatesType.Absolute if 'abs'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":617-647", + "content": " if arg is None:\n errors.append('argument %s: required argument' % argName)\n else:\n return True\ndef ValidateImageSize(arg, argName, argInformed, errors):\n errorMsg = 'argument %s: required argument if %s is relative' % (argName, argInformed)\n ret = None\n if arg is None:\n errors.append(errorMsg)\n else:\n arg = arg.replace('(', '').replace(')', '')\n args = arg.split(',')\n if len(args) != 2:\n errors.append(\n '%s. It must be in the format \\'width,height\\' (e.g. \\'600,400\\')' % errorMsg)\n else:\n if not args[0].isdigit() or not args[1].isdigit():\n errors.append(\n '%s. It must be in INdiaTEGER the format \\'width,height\\' (e.g. \\'600,400\\')' %\n errorMsg)\n else:\n ret = (int(args[0]), int(args[1]))\n return ret\n# Validate coordinate types\ndef ValidateCoordinatesTypes(arg, argName, errors):\n if arg == 'abs':\n return CoordinatesType.Absolute" + }, + { + "comment": "This code reads text files containing bounding boxes (ground truth and detections). It handles 'relative' or 'absolute' coordinates, and checks for invalid arguments. The function takes a directory, image format, and coordinate type as inputs, and returns bounding boxes and classes. If allBoundingBoxes or allClasses are None, it initializes them. It changes the working directory to the specified directory and reads all files in alphabetical order.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":648-679", + "content": " elif arg == 'rel':\n return CoordinatesType.Relative\n elif arg is None:\n return CoordinatesType.Absolute # default when nothing is passed\n errors.append('argument %s: invalid value. It must be either \\'rel\\' or \\'abs\\'' % argName)\ndef getBoundingBoxes(directory,\n isGT,\n bbFormat,\n coordType,\n allBoundingBoxes=None,\n allClasses=None,\n imgSize=(0, 0)):\n \"\"\"Read txt files containing bounding boxes (ground truth and detections).\"\"\"\n print(directory)\n if allBoundingBoxes is None:\n allBoundingBoxes = BoundingBoxes()\n if allClasses is None:\n allClasses = []\n # Read ground truths\n os.chdir(directory)\n files = glob.glob(\"*.txt\")\n files.sort()\n for f in files:\n nameOfImage = f.replace(\".txt\", \"\")\n fh1 = open(f, \"r\")\n for line in fh1:\n line = line.replace(\"\\n\", \"\")\n if line.replace(' ', '') == '':\n continue" + }, + { + "comment": "This code reads a line of text and determines whether it represents ground truth or predicted bounding boxes. 
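The text-file layout implied above can be parsed with a few lines of plain Python. A minimal sketch, assuming the whitespace-separated layouts described (ground truth: class plus four coordinates; detection: class, confidence, then four coordinates); the function name and return shape are hypothetical:

```python
def parse_annotation_line(line, is_ground_truth):
    """Parse one whitespace-separated annotation line.

    Ground truth lines:  "<class> <x> <y> <w-or-x2> <h-or-y2>"
    Detection lines:     "<class> <confidence> <x> <y> <w-or-x2> <h-or-y2>"
    Coordinates are kept in whatever box format the files use.
    """
    parts = line.split()
    if is_ground_truth:
        return {"class": parts[0], "confidence": 1.0,
                "box": list(map(float, parts[1:5]))}
    return {"class": parts[0], "confidence": float(parts[1]),
            "box": list(map(float, parts[2:6]))}

print(parse_annotation_line("Basketball 0.91 10 20 110 220", is_ground_truth=False))
```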
It then initializes BoundingBox objects with the appropriate attributes based on the type (ground truth or prediction) and stores them accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":680-710", + "content": " splitLine = line.split(\" \")\n if isGT:\n idClass = (splitLine[0]) # class\n x = float(splitLine[1])\n y = float(splitLine[2])\n w = float(splitLine[3])\n h = float(splitLine[4])\n bb = BoundingBox(\n nameOfImage,\n idClass,\n x,\n y,\n w,\n h,\n coordType,\n imgSize,\n BBType.GroundTruth,\n format=bbFormat)\n else:\n idClass = (splitLine[0]) # class\n confidence = float(splitLine[1])\n x = float(splitLine[2])\n y = float(splitLine[3])\n w = float(splitLine[4])\n h = float(splitLine[5])\n bb = BoundingBox(\n nameOfImage,\n idClass,\n x,\n y,\n w," + }, + { + "comment": "The code defines a function to calculate the mean average precision (mAP) between ground truth and detected objects in image classification tasks. It takes input folders containing ground truth and detection results, adjustable threshold for determining true positives, and an optional save path for the output file. The code performs argument validation to ensure correct formats and coordinate types.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":711-742", + "content": " h,\n coordType,\n imgSize,\n BBType.Detected,\n confidence,\n format=bbFormat)\n allBoundingBoxes.addBoundingBox(bb)\n if idClass not in allClasses:\n allClasses.append(idClass)\n fh1.close()\n return allBoundingBoxes, allClasses\ndef get_mAP(gtFolder, detFolder, threshold=0.5, savePath=None):\n gtFormat = 'xyrb'\n detFormat = 'xyrb'\n gtCoordinates = 'abs'\n detCoordinates = 'abs'\n gtFolder = os.path.join(os.path.abspath('.'), gtFolder)\n detFolder = os.path.join(os.path.abspath('.'), detFolder)\n iouThreshold = threshold\n # Arguments validation\n errors = []\n # Validate formats\n gtFormat = ValidateFormats(gtFormat, 'gtFormat', errors)\n detFormat = ValidateFormats(detFormat, '-detformat', errors)\n # Coordinates types\n gtCoordType = ValidateCoordinatesTypes(gtCoordinates, '-gtCoordinates', errors)\n detCoordType = ValidateCoordinatesTypes(detCoordinates, '-detCoordinates', errors)" + }, + { + "comment": "This code is creating a directory to save results, clearing any previous content in the folder. 
It then retrieves ground truth and detected bounding boxes, sorts classes, initializes an evaluator object, and calculates average precision (AP) for each class, storing the AP and mean AP results in the AP_res list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":743-771", + "content": " imgSize = (0, 0)\n # Create directory to save results\n shutil.rmtree(savePath, ignore_errors=True) # Clear folder\n if savePath is not None:\n os.makedirs(savePath)\n # Get groundtruth boxes\n allBoundingBoxes, allClasses = getBoundingBoxes(\n gtFolder, True, gtFormat, gtCoordType, imgSize=imgSize)\n # Get detected boxes\n allBoundingBoxes, allClasses = getBoundingBoxes(\n detFolder, False, detFormat, detCoordType, allBoundingBoxes, allClasses, imgSize=imgSize)\n allClasses.sort()\n evaluator = Evaluator()\n acc_AP = 0\n validClasses = 0\n # Plot Precision x Recall curve\n detections = evaluator.GetPascalVOCMetrics(allBoundingBoxes, iouThreshold,\n method=MethodAveragePrecision.EveryPointInterpolation)\n # each detection is a class and store AP and mAP results in AP_res list\n AP_res = []\n for metricsPerClass in detections:\n # Get metric values per each class\n cl = metricsPerClass['class']\n ap = metricsPerClass['AP']" + }, + { + "comment": "This code calculates mean Average Precision (mAP) for each class and returns it as a list. It iterates through valid classes, calculates Average Precision (AP) for each class if there are positive samples, updates mAP by averaging APs of all valid classes, and appends AP and class labels to the result list. The final mAP value is also formatted and added to the result list before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ucf24_utils.py\":772-782", + "content": " totalPositives = metricsPerClass['total positives']\n if totalPositives > 0:\n validClasses = validClasses + 1\n acc_AP = acc_AP + ap\n ap_str = \"{0:.2f}%\".format(ap * 100)\n AP_res.append('AP: %s (%s)' % (ap_str, cl))\n mAP = acc_AP / validClasses\n mAP_str = \"{0:.2f}%\".format(mAP * 100)\n AP_res.append('mAP: %s' % mAP_str)\n return AP_res" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9d237c1c-3826-4eda-97e5-868ff713987b.json b/docs/doc/9d237c1c-3826-4eda-97e5-868ff713987b.json new file mode 100644 index 000000000..fad5049d7 --- /dev/null +++ b/docs/doc/9d237c1c-3826-4eda-97e5-868ff713987b.json @@ -0,0 +1,95 @@ +{ + "summary": "The code cleans and processes NTU RGB-D dataset data by removing noisy frames, handling missing values, updating arrays, logging counts, denoising raw skeleton data, and generating log files for sequences with multiple actors. It reads sequence data, extracts joints and color data, handles multiple actors and missing frames, and stores the processed data for further processing while counting missing data.", + "details": [ + { + "comment": "This code is setting up directories and loggers for processing raw data and identifying noisy sequences. 
It checks if the required folders exist, creates them if not, sets up loggers to track noise length and spread thresholds, and initializes variables for the process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":0-37", + "content": "# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/get_raw_denoised_data.py\nimport os\nimport os.path as osp\nimport numpy as np\nimport pickle\nimport logging\nroot_path = './'\nraw_data_file = osp.join(root_path, 'raw_data', 'raw_skes_data.pkl')\nsave_path = osp.join(root_path, 'denoised_data')\nif not osp.exists(save_path):\n os.mkdir(save_path)\nrgb_ske_path = osp.join(save_path, 'rgb+ske')\nif not osp.exists(rgb_ske_path):\n os.mkdir(rgb_ske_path)\nactors_info_dir = osp.join(save_path, 'actors_info')\nif not osp.exists(actors_info_dir):\n os.mkdir(actors_info_dir)\nmissing_count = 0\nnoise_len_thres = 11\nnoise_spr_thres1 = 0.8\nnoise_spr_thres2 = 0.69754\nnoise_mot_thres_lo = 0.089925\nnoise_mot_thres_hi = 2\nnoise_len_logger = logging.getLogger('noise_length')\nnoise_len_logger.setLevel(logging.INFO)\nnoise_len_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'noise_length.log')))\nnoise_len_logger.info('{:^20}\\t{:^17}\\t{:^8}\\t{}'.format(\n 'Skeleton', 'bodyID', 'Motion', 'Length'))\nnoise_spr_logger = logging.getLogger('noise_spread')" + }, + { + "comment": "The code sets up multiple loggers for different types of output: 'noise_spread.log', 'noise_motion.log', 'denoised_failed_1.log', and 'denoised_failed_2.log'. It also creates a logger for missing frames named 'missing_frames'. Each logger is configured with a specific level of logging (INFO) and a file handler to store the logs in designated files within the specified save path. This allows for organized and easily accessible logging during program execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":38-62", + "content": "noise_spr_logger.setLevel(logging.INFO)\nnoise_spr_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'noise_spread.log')))\nnoise_spr_logger.info('{:^20}\\t{:^17}\\t{:^8}\\t{:^8}'.format(\n 'Skeleton', 'bodyID', 'Motion', 'Rate'))\nnoise_mot_logger = logging.getLogger('noise_motion')\nnoise_mot_logger.setLevel(logging.INFO)\nnoise_mot_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'noise_motion.log')))\nnoise_mot_logger.info('{:^20}\\t{:^17}\\t{:^8}'.format('Skeleton', 'bodyID',\n 'Motion'))\nfail_logger_1 = logging.getLogger('noise_outliers_1')\nfail_logger_1.setLevel(logging.INFO)\nfail_logger_1.addHandler(\n logging.FileHandler(osp.join(save_path, 'denoised_failed_1.log')))\nfail_logger_2 = logging.getLogger('noise_outliers_2')\nfail_logger_2.setLevel(logging.INFO)\nfail_logger_2.addHandler(\n logging.FileHandler(osp.join(save_path, 'denoised_failed_2.log')))\nmissing_skes_logger = logging.getLogger('missing_frames')\nmissing_skes_logger.setLevel(logging.INFO)" + }, + { + "comment": "Creates multiple loggers for tracking missing skeleton frames, with different handlers and levels of information. 
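The logger-per-logfile pattern used throughout this script boils down to a named logger plus a `FileHandler` and an optional fixed-width header row. A minimal sketch with a hypothetical helper and placeholder paths:

```python
import logging
import os.path as osp

def make_file_logger(name, save_dir, header=None):
    """Create a named logger that writes INFO records to <save_dir>/<name>.log."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.FileHandler(osp.join(save_dir, name + ".log")))
    if header is not None:
        logger.info(header)
    return logger

# e.g. a logger mirroring the 'noise_length' log (directory must already exist):
# noise_len = make_file_logger("noise_length", "./denoised_data",
#     "{:^20}\t{:^17}\t{:^8}\t{}".format("Skeleton", "bodyID", "Motion", "Length"))
```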
Function denoising_by_length takes a skeleton name and bodies_data as input to perform data denoising based on frame length for each bodyID.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":63-85", + "content": "missing_skes_logger.addHandler(\n logging.FileHandler(osp.join(save_path, 'missing_skes.log')))\nmissing_skes_logger.info('{:^20}\\t{}\\t{}'.format('Skeleton', 'num_frames',\n 'num_missing'))\nmissing_skes_logger1 = logging.getLogger('missing_frames_1')\nmissing_skes_logger1.setLevel(logging.INFO)\nmissing_skes_logger1.addHandler(\n logging.FileHandler(osp.join(save_path, 'missing_skes_1.log')))\nmissing_skes_logger1.info('{:^20}\\t{}\\t{}\\t{}\\t{}\\t{}'.format(\n 'Skeleton', 'num_frames', 'Actor1', 'Actor2', 'Start', 'End'))\nmissing_skes_logger2 = logging.getLogger('missing_frames_2')\nmissing_skes_logger2.setLevel(logging.INFO)\nmissing_skes_logger2.addHandler(\n logging.FileHandler(osp.join(save_path, 'missing_skes_2.log')))\nmissing_skes_logger2.info('{:^20}\\t{}\\t{}\\t{}'.format('Skeleton', 'num_frames',\n 'Actor1', 'Actor2'))\ndef denoising_by_length(ske_name, bodies_data):\n \"\"\"\n Denoising data based on the frame length for each bodyID." + }, + { + "comment": "Code snippet filters out bodies with a length less than or equal to the predefined threshold and finds valid frames based on the spread of X and Y. It also logs the filtered body information and returns the updated bodies data along with filter information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":86-115", + "content": " Filter out the bodyID which length is less or equal than the predefined threshold.\n \"\"\"\n noise_info = str()\n new_bodies_data = bodies_data.copy()\n for (bodyID, body_data) in new_bodies_data.items():\n length = len(body_data['interval'])\n if length <= noise_len_thres:\n noise_info += 'Filter out: %s, %d (length).\\n' % (bodyID, length)\n noise_len_logger.info('{}\\t{}\\t{:.6f}\\t{:^6d}'.format(\n ske_name, bodyID, body_data['motion'], length))\n del bodies_data[bodyID]\n if noise_info != '':\n noise_info += '\\n'\n return bodies_data, noise_info\ndef get_valid_frames_by_spread(points):\n \"\"\"\n Find the valid (or reasonable) frames (index) based on the spread of X and Y.\n :param points: joints or colors\n \"\"\"\n num_frames = points.shape[0]\n valid_frames = []\n for i in range(num_frames):\n x = points[i, :, 0]\n y = points[i, :, 1]\n if (x.max() - x.min()) <= noise_spr_thres1 * (y.max() - y.min()): # 0.8" + }, + { + "comment": "The function \"denoising_by_spread\" takes a sequence of body data and filters out any bodies with a high ratio of noisy frames. It uses the spread of Y and X values to determine if a frame is valid or not. If the ratio of noisy frames exceeds a predefined threshold, the corresponding body is removed from the data. 
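The spread heuristic can be vectorised in a few numpy lines: a frame is kept when the joints' X spread is at most 0.8 of their Y spread (a standing person is taller than wide). A minimal sketch; the function name and random toy data are illustrative:

```python
import numpy as np

def valid_frames_by_spread(joints, ratio_thres=0.8):
    """Indices of frames whose X spread is <= ratio_thres * Y spread.

    joints: array of shape (num_frames, 25, 3) with (x, y, z) per joint.
    Frames violating the ratio are treated as noisy candidate detections.
    """
    x, y = joints[..., 0], joints[..., 1]
    spread_ok = (x.max(axis=1) - x.min(axis=1)) <= ratio_thres * (y.max(axis=1) - y.min(axis=1))
    return np.where(spread_ok)[0]

frames = np.random.rand(100, 25, 3)
print(len(valid_frames_by_spread(frames)), "of 100 frames kept")
```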
This function ensures that only clean data is used for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":116-146", + "content": " valid_frames.append(i)\n return valid_frames\ndef denoising_by_spread(ske_name, bodies_data):\n \"\"\"\n Denoising data based on the spread of Y value and X value.\n Filter out the bodyID which the ratio of noisy frames is higher than the predefined\n threshold.\n bodies_data: contains at least 2 bodyIDs\n \"\"\"\n noise_info = str()\n denoised_by_spr = False # mark if this sequence has been processed by spread.\n new_bodies_data = bodies_data.copy()\n # for (bodyID, body_data) in bodies_data.items():\n for (bodyID, body_data) in new_bodies_data.items():\n if len(bodies_data) == 1:\n break\n valid_frames = get_valid_frames_by_spread(body_data['joints'].reshape(\n -1, 25, 3))\n num_frames = len(body_data['interval'])\n num_noise = num_frames - len(valid_frames)\n if num_noise == 0:\n continue\n ratio = num_noise / float(num_frames)\n motion = body_data['motion']\n if ratio >= noise_spr_thres2: # 0.69754\n del bodies_data[bodyID]" + }, + { + "comment": "This function filters out frames with high noise (spread rate) and updates the motion values for each bodyID. It also returns a list of tuples sorted by motion, potentially removing noisy frames for each bodyID.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":147-171", + "content": " denoised_by_spr = True\n noise_info += 'Filter out: %s (spread rate >= %.2f).\\n' % (\n bodyID, noise_spr_thres2)\n noise_spr_logger.info('%s\\t%s\\t%.6f\\t%.6f' %\n (ske_name, bodyID, motion, ratio))\n else: # Update motion\n joints = body_data['joints'].reshape(-1, 25, 3)[valid_frames]\n body_data['motion'] = min(\n motion, np.sum(np.var(joints.reshape(-1, 3), axis=0)))\n noise_info += '%s: motion %.6f -> %.6f\\n' % (bodyID, motion,\n body_data['motion'])\n # TODO: Consider removing noisy frames for each bodyID\n if noise_info != '':\n noise_info += '\\n'\n return bodies_data, noise_info, denoised_by_spr\ndef denoising_by_motion(ske_name, bodies_data, bodies_motion):\n \"\"\"\n Filter out the bodyID which motion is out of the range of predefined interval\n \"\"\"\n # Sort bodies based on the motion, return a list of tuples" + }, + { + "comment": "This code sorts the motion data for each body, discards data with low or high motion values, and returns denoised body data along with information about filtered out bodies. 
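A minimal sketch of the motion-based filter: motion is measured as the summed variance of all joint coordinates, the most-moving body is always kept, and remaining bodies must fall inside the [lo, hi] motion band (threshold values copied from the constants defined earlier in the script). `filter_bodies_by_motion` is an illustrative name:

```python
import numpy as np

def filter_bodies_by_motion(bodies, lo=0.089925, hi=2.0):
    """Keep the body with the largest motion plus bodies whose motion lies in [lo, hi].

    bodies: dict bodyID -> joints array of shape (num_frames, 25, 3).
    Motion is the summed per-axis variance of all joint coordinates.
    """
    motion = {bid: float(np.sum(np.var(j.reshape(-1, 3), axis=0)))
              for bid, j in bodies.items()}
    ranked = sorted(motion.items(), key=lambda kv: kv[1], reverse=True)
    kept = [ranked[0][0]]                               # always keep the main actor
    kept += [bid for bid, m in ranked[1:] if lo <= m <= hi]
    return {bid: bodies[bid] for bid in kept}
```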
The denoising process is based on heuristic methods that may not be correct for all samples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":172-197", + "content": " # bodies_motion = sorted(bodies_motion.items(), key=lambda x, y: cmp(x[1], y[1]), reverse=True)\n bodies_motion = sorted(bodies_motion.items(),\n key=lambda x: x[1],\n reverse=True)\n # Reserve the body data with the largest motion\n denoised_bodies_data = [(bodies_motion[0][0],\n bodies_data[bodies_motion[0][0]])]\n noise_info = str()\n for (bodyID, motion) in bodies_motion[1:]:\n if (motion < noise_mot_thres_lo) or (motion > noise_mot_thres_hi):\n noise_info += 'Filter out: %s, %.6f (motion).\\n' % (bodyID, motion)\n noise_mot_logger.info('{}\\t{}\\t{:.6f}'.format(\n ske_name, bodyID, motion))\n else:\n denoised_bodies_data.append((bodyID, bodies_data[bodyID]))\n if noise_info != '':\n noise_info += '\\n'\n return denoised_bodies_data, noise_info\ndef denoising_bodies_data(bodies_data):\n \"\"\"\n Denoising data based on some heuristic methods, not necessarily correct for all samples." + }, + { + "comment": "This code performs denoising on bodies data based on frame length and spread. It first denoises the data by frame length and then by spread, if necessary. The function returns a tuple containing the denoised bodies data and the noise information for each step. The code also sorts the bodies based on their motion and returns it in a sorted manner.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":199-224", + "content": " Return:\n denoised_bodies_data (list): tuple: (bodyID, body_data).\n \"\"\"\n ske_name = bodies_data['name']\n bodies_data = bodies_data['data']\n # Step 1: Denoising based on frame length.\n bodies_data, noise_info_len = denoising_by_length(ske_name, bodies_data)\n if len(bodies_data) == 1: # only has one bodyID left after step 1\n return bodies_data.items(), noise_info_len\n # Step 2: Denoising based on spread.\n bodies_data, noise_info_spr, denoised_by_spr = denoising_by_spread(\n ske_name, bodies_data)\n if len(bodies_data) == 1:\n return bodies_data.items(), noise_info_len + noise_info_spr\n bodies_motion = dict() # get body motion\n for (bodyID, body_data) in bodies_data.items():\n bodies_motion[bodyID] = body_data['motion']\n # Sort bodies based on the motion\n # bodies_motion = sorted(bodies_motion.items(), key=lambda x, y: cmp(x[1], y[1]), reverse=True)\n bodies_motion = sorted(bodies_motion.items(),\n key=lambda x: x[1]," + }, + { + "comment": "This code retrieves denoised data from the NTU RGB-D dataset, and considers further denoising by integrating motion. 
It also defines a function to get joints and colors for only one actor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":225-251", + "content": " reverse=True)\n denoised_bodies_data = list()\n for (bodyID, _) in bodies_motion:\n denoised_bodies_data.append((bodyID, bodies_data[bodyID]))\n return denoised_bodies_data, noise_info_len + noise_info_spr\n # TODO: Consider denoising further by integrating motion method\n # if denoised_by_spr: # this sequence has been denoised by spread\n # bodies_motion = sorted(bodies_motion.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)\n # denoised_bodies_data = list()\n # for (bodyID, _) in bodies_motion:\n # denoised_bodies_data.append((bodyID, bodies_data[bodyID]))\n # return denoised_bodies_data, noise_info\n # Step 3: Denoising based on motion\n # bodies_data, noise_info = denoising_by_motion(ske_name, bodies_data, bodies_motion)\n # return bodies_data, noise_info\ndef get_one_actor_points(body_data, num_frames):\n \"\"\"\n Get joints and colors for only one actor.\n For joints, each frame contains 75 X-Y-Z coordinates.\n For colors, each frame contains 25 x 2 (X, Y) coordinates." + }, + { + "comment": "This code segment defines a function to get raw denoised data from body_data and another function to remove missing frames in the sequence. The first function initializes joints and colors arrays, extracts relevant data from body_data, and returns the joints and colors. The second function cuts off missing frames when all joint positions are 0s and records the number of missing frames for each actor if there are two actors' data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":252-279", + "content": " \"\"\"\n joints = np.zeros((num_frames, 75), dtype=np.float32)\n colors = np.ones((num_frames, 1, 25, 2), dtype=np.float32) * np.nan\n start, end = body_data['interval'][0], body_data['interval'][-1]\n joints[start:end + 1] = body_data['joints'].reshape(-1, 75)\n colors[start:end + 1, 0] = body_data['colors']\n return joints, colors\ndef remove_missing_frames(ske_name, joints, colors):\n \"\"\"\n Cut off missing frames which all joints positions are 0s\n For the sequence with 2 actors' data, also record the number of missing frames for\n actor1 and actor2, respectively (for debug).\n \"\"\"\n num_frames = joints.shape[0]\n num_bodies = colors.shape[1] # 1 or 2\n if num_bodies == 2: # DEBUG\n missing_indices_1 = np.where(joints[:, :75].sum(axis=1) == 0)[0]\n missing_indices_2 = np.where(joints[:, 75:].sum(axis=1) == 0)[0]\n cnt1 = len(missing_indices_1)\n cnt2 = len(missing_indices_2)\n start = 1 if 0 in missing_indices_1 else 0\n end = 1 if num_frames - 1 in missing_indices_1 else 0" + }, + { + "comment": "This code checks if any data is missing or lost for two subjects in a video. 
If there are missing frames, it updates the joints and colors arrays, marks missing indices with NaN, and logs the number of missing frames and total missing counts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":280-302", + "content": " if max(cnt1, cnt2) > 0:\n if cnt1 > cnt2:\n info = '{}\\t{:^10d}\\t{:^6d}\\t{:^6d}\\t{:^5d}\\t{:^3d}'.format(\n ske_name, num_frames, cnt1, cnt2, start, end)\n missing_skes_logger1.info(info)\n else:\n info = '{}\\t{:^10d}\\t{:^6d}\\t{:^6d}'.format(\n ske_name, num_frames, cnt1, cnt2)\n missing_skes_logger2.info(info)\n # Find valid frame indices that the data is not missing or lost\n # For two-subjects action, this means both data of actor1 and actor2 is missing.\n valid_indices = np.where(joints.sum(axis=1) != 0)[0] # 0-based index\n missing_indices = np.where(joints.sum(axis=1) == 0)[0]\n num_missing = len(missing_indices)\n if num_missing > 0: # Update joints and colors\n joints = joints[valid_indices]\n colors[missing_indices] = np.nan\n global missing_count\n missing_count += 1\n missing_skes_logger.info('{}\\t{:^10d}\\t{:^11d}'.format(\n ske_name, num_frames, num_missing))" + }, + { + "comment": "Function get_bodies_info formats the bodies' data into a string with bodyID, interval (start and end frame), and motion amount.\nFunction get_two_actors_points retrieves the first and second actor's joints positions and colors locations from given data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":304-328", + "content": " return joints, colors\ndef get_bodies_info(bodies_data):\n bodies_info = '{:^17}\\t{}\\t{:^8}\\n'.format('bodyID', 'Interval', 'Motion')\n for (bodyID, body_data) in bodies_data.items():\n start, end = body_data['interval'][0], body_data['interval'][-1]\n bodies_info += '{}\\t{:^8}\\t{:f}\\n'.format(bodyID, str([start, end]),\n body_data['motion'])\n return bodies_info + '\\n'\ndef get_two_actors_points(bodies_data):\n \"\"\"\n Get the first and second actor's joints positions and colors locations.\n # Arguments:\n bodies_data (dict): 3 key-value pairs: 'name', 'data', 'num_frames'.\n bodies_data['data'] is also a dict, while the key is bodyID, the value is\n the corresponding body_data which is also a dict with 4 keys:\n - joints: raw 3D joints positions. Shape: (num_frames x 25, 3)\n - colors: raw 2D color locations. Shape: (num_frames, 25, 2)\n - interval: a list which records the frame indices.\n - motion: motion amount" + }, + { + "comment": "This function denoises bodies data and extracts joints and colors information for each frame. If only one actor remains after denoising, it checks if the action is for two subjects (label >= 50) and retrieves joints and colors from the remaining actor. 
If there are still multiple actors but the action is for one subject (label < 50), it initializes joints as zeros and colors as nans.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":330-357", + "content": " # Return:\n joints, colors.\n \"\"\"\n ske_name = bodies_data['name']\n label = int(ske_name[-2:])\n num_frames = bodies_data['num_frames']\n bodies_info = get_bodies_info(bodies_data['data'])\n bodies_data, noise_info = denoising_bodies_data(\n bodies_data) # Denoising data\n bodies_info += noise_info\n bodies_data = list(bodies_data)\n if len(bodies_data) == 1: # Only left one actor after denoising\n if label >= 50: # DEBUG: Denoising failed for two-subjects action\n fail_logger_2.info(ske_name)\n bodyID, body_data = bodies_data[0]\n joints, colors = get_one_actor_points(body_data, num_frames)\n bodies_info += 'Main actor: %s' % bodyID\n else:\n if label < 50: # DEBUG: Denoising failed for one-subject action\n fail_logger_1.info(ske_name)\n joints = np.zeros((num_frames, 150), dtype=np.float32)\n colors = np.ones((num_frames, 2, 25, 2), dtype=np.float32) * np.nan\n bodyID, actor1 = bodies_data[0] # the 1st actor with largest motion" + }, + { + "comment": "Code snippet extracts joints, colors and other information from actors' data and assigns them to relevant arrays. It also generates formatted information about each actor including their interval and motion. The while loop iterates through the bodies_data list, considering only those actors whose intervals do not overlap with Actor1, appending their joints and colors to the respective arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":358-376", + "content": " start1, end1 = actor1['interval'][0], actor1['interval'][-1]\n joints[start1:end1 + 1, :75] = actor1['joints'].reshape(-1, 75)\n colors[start1:end1 + 1, 0] = actor1['colors']\n actor1_info = '{:^17}\\t{}\\t{:^8}\\n'.format('Actor1', 'Interval', 'Motion') + \\\n '{}\\t{:^8}\\t{:f}\\n'.format(bodyID, str([start1, end1]), actor1['motion'])\n del bodies_data[0]\n actor2_info = '{:^17}\\t{}\\t{:^8}\\n'.format('Actor2', 'Interval',\n 'Motion')\n start2, end2 = [0, 0] # initial interval for actor2 (virtual)\n while len(bodies_data) > 0:\n bodyID, actor = bodies_data[0]\n start, end = actor['interval'][0], actor['interval'][-1]\n if min(end1, end) - max(start1,\n start) <= 0: # no overlap with actor1\n joints[start:end + 1, :75] = actor['joints'].reshape(-1, 75)\n colors[start:end + 1, 0] = actor['colors']\n actor1_info += '{}\\t{:^8}\\t{:f}\\n'.format(" + }, + { + "comment": "This function extracts and denoises joint positions and color locations from raw skeleton sequences. It takes intervals of actor1 and actor2, updates their intervals if there's no overlap, and then stores the information in separate variables. 
Finally, it writes the information to a text file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":377-402", + "content": " bodyID, str([start, end]), actor['motion'])\n # Update the interval of actor1\n start1 = min(start, start1)\n end1 = max(end, end1)\n elif min(end2, end) - max(start2,\n start) <= 0: # no overlap with actor2\n joints[start:end + 1, 75:] = actor['joints'].reshape(-1, 75)\n colors[start:end + 1, 1] = actor['colors']\n actor2_info += '{}\\t{:^8}\\t{:f}\\n'.format(\n bodyID, str([start, end]), actor['motion'])\n # Update the interval of actor2\n start2 = min(start, start2)\n end2 = max(end, end2)\n del bodies_data[0]\n bodies_info += ('\\n' + actor1_info + '\\n' + actor2_info)\n with open(osp.join(actors_info_dir, ske_name + '.txt'), 'w') as fw:\n fw.write(bodies_info + '\\n')\n return joints, colors\ndef get_raw_denoised_data():\n \"\"\"\n Get denoised data (joints positions and color locations) from raw skeleton sequences." + }, + { + "comment": "This code reads raw skeleton data from a file, then processes and reshapes the 3D positions of each joint into a 75-dimensional vector for each frame. If there's only one actor, it fills zeros to complete the 150-dimensional vector. It selects the main and second actors based on motion amount. The resulting 2D arrays are stored in a list and serialized into a cPickle file. Additionally, log files record the filename and actors' information for skeleton sequences with two or more actors. The code also generates RGB+skeleton videos for better visualization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":404-418", + "content": " For each frame of a skeleton sequence, an actor's 3D positions of 25 joints represented\n by an 2D array (shape: 25 x 3) is reshaped into a 75-dim vector by concatenating each\n 3-dim (x, y, z) coordinates along the row dimension in joint order. Each frame contains\n two actor's joints positions constituting a 150-dim vector. If there is only one actor,\n then the last 75 values are filled with zeros. Otherwise, select the main actor and the\n second actor based on the motion amount. Each 150-dim vector as a row vector is put into\n a 2D numpy array where the number of rows equals the number of valid frames. All such\n 2D arrays are put into a list and finally the list is serialized into a cPickle file.\n For the skeleton sequence which contains two or more actors (mostly corresponds to the\n last 11 classes), the filename and actors' information are recorded into log files.\n For better understanding, also generate RGB+skeleton videos for visualization.\n \"\"\"\n with open(raw_data_file, 'rb') as fr: # load raw skeletons data" + }, + { + "comment": "Code reads raw skeleton sequence data from file, counts the number of sequences, and processes each sequence by extracting joints and color data. It handles single or multiple actors in a sequence, removes missing frames if necessary, and stores processed data into separate lists for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":419-444", + "content": " raw_skes_data = pickle.load(fr)\n num_skes = len(raw_skes_data)\n print('Found %d available skeleton sequences.' 
% num_skes)\n raw_denoised_joints = []\n raw_denoised_colors = []\n frames_cnt = []\n for (idx, bodies_data) in enumerate(raw_skes_data):\n ske_name = bodies_data['name']\n print('Processing %s' % ske_name)\n num_bodies = len(bodies_data['data'])\n if num_bodies == 1: # only 1 actor\n num_frames = bodies_data['num_frames']\n body_data = list(bodies_data['data'].values())[0]\n joints, colors = get_one_actor_points(body_data, num_frames)\n else: # more than 1 actor, select two main actors\n joints, colors = get_two_actors_points(bodies_data)\n # Remove missing frames\n joints, colors = remove_missing_frames(ske_name, joints, colors)\n num_frames = joints.shape[0] # Update\n # Visualize selected actors' skeletons on RGB videos.\n raw_denoised_joints.append(joints)" + }, + { + "comment": "The code iterates over a set of skes, appends raw denoised joints and colors to lists, prints progress, and saves the data into pickle files and text file. It also counts missing data and reports it at the end.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py\":445-470", + "content": " raw_denoised_colors.append(colors)\n frames_cnt.append(num_frames)\n if (idx + 1) % 1000 == 0:\n print('Processed: %.2f%% (%d / %d), ' % \\\n (100.0 * (idx + 1) / num_skes, idx + 1, num_skes) + \\\n 'Missing count: %d' % missing_count)\n raw_skes_joints_pkl = osp.join(save_path, 'raw_denoised_joints.pkl')\n with open(raw_skes_joints_pkl, 'wb') as f:\n pickle.dump(raw_denoised_joints, f, pickle.HIGHEST_PROTOCOL)\n raw_skes_colors_pkl = osp.join(save_path, 'raw_denoised_colors.pkl')\n with open(raw_skes_colors_pkl, 'wb') as f:\n pickle.dump(raw_denoised_colors, f, pickle.HIGHEST_PROTOCOL)\n frames_cnt = np.array(frames_cnt, dtype=np.int)\n np.savetxt(osp.join(save_path, 'frames_cnt.txt'), frames_cnt, fmt='%d')\n print('Saved raw denoised positions of {} frames into {}'.format(\n np.sum(frames_cnt), raw_skes_joints_pkl))\n print('Found %d files that have missing data' % missing_count)\nif __name__ == '__main__':\n get_raw_denoised_data()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9d53e584-f7dc-4af6-9ac4-f2cc858b3ebb.json b/docs/doc/9d53e584-f7dc-4af6-9ac4-f2cc858b3ebb.json new file mode 100644 index 000000000..c01c6af91 --- /dev/null +++ b/docs/doc/9d53e584-f7dc-4af6-9ac4-f2cc858b3ebb.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a \"InferModel\" class for audio inference using PaddleVideo, performs prediction on an audio file, and outputs shape, first value, and time taken. The model's predict function is called with 32-sample data as placeholders.", + "details": [ + { + "comment": "The code above defines a class \"InferModel\" for audio inference. It initializes the model by setting the model file, parameters file, GPU memory, and device ID from a configuration file. The code enables IR optimization, memory optimization, and disables zero copy. 
Finally, it creates a predictor object and retrieves the input handle for the first input name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/audio_infer.py\":0-36", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"audio infer\"\"\"\n def __init__(self, cfg, name='AUDIO'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])" + }, + { + "comment": "This code defines a class for audio inference using the PaddleVideo library. It has an `infer` method that takes input data and returns output data after running the inference, and a `predict` method that loops through inferencer data, performs inference for each data point, and returns feature values and pcm values as arrays. The main part of the code initializes an instance of this class using a configuration file (configs.yaml), which is then used for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/audio_infer.py\":38-66", + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n pcm_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = np.array(data, dtype='float32')\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n pcm_list.append(inputs)\n feature_values = np.vstack(feature_list)\n pcm_values = np.vstack(pcm_list)\n return feature_values, pcm_values\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)" + }, + { + "comment": "This code loads an audio file, configures the model with its path, performs prediction, and prints the output shape, first output value, and time taken for the process. 
The model's predict function is called with a random 32-sample data array as placeholders.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/audio_infer.py\":68-77", + "content": " pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm'\n t0 = time.time()\n cfg['AUDIO']['pcm_file'] = pcm_path\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print(outputs[0])\n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9d891651-5fcc-4ad2-b8a8-193ba882fcb7.json b/docs/doc/9d891651-5fcc-4ad2-b8a8-193ba882fcb7.json new file mode 100644 index 000000000..ed5bd8d0f --- /dev/null +++ b/docs/doc/9d891651-5fcc-4ad2-b8a8-193ba882fcb7.json @@ -0,0 +1,75 @@ +{ + "summary": "The PaddleVideo framework's VOSMetric class is responsible for video object segmentation tasks, data processing, and model preparation. It includes methods 'flip_tensor', 'save_mask', and manages various operations such as handling failures and logging data.", + "details": [ + { + "comment": "This code is part of the PaddleVideo framework, implementing the VOSMetric class. It registers a metric for video object segmentation tasks using the PaddlePaddle library. The class takes parameters such as data size, batch size, result root directory, and zip directory for results storage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":0-37", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport os\nimport paddle\nimport zipfile\nimport time\nfrom PIL import Image\nfrom paddle.io import DataLoader\nfrom .registry import METRIC\nfrom .base import BaseMetric\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass VOSMetric(BaseMetric):\n def __init__(self,\n data_size,\n batch_size,\n result_root,\n zip_dir,\n log_interval=1):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval)" + }, + { + "comment": "This code initializes a VOS metric class with parameters such as total_video_num, result_root and zip_dir. The update method processes each video in the dataset, updating metrics like seq_total_time and seq_total_frame. 
It also prepares variables for reference embeddings and masks for the Video Object Segmentation task using PaddlePaddle framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":38-67", + "content": " self.video_num = 0\n self.total_time = 0\n self.total_frame = 0\n self.total_sfps = 0\n self.total_video_num = data_size\n self.count = 0\n self.result_root = result_root\n self.zip_dir = zip_dir\n def update(self, batch_id, data, model):\n \"\"\"update metrics during each iter\n \"\"\"\n self.video_num += 1\n seq_dataset = data\n seq_name = seq_dataset.seq_name\n logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name, self.video_num,\n self.total_video_num))\n seq_dataloader = DataLoader(seq_dataset,\n return_list=True,\n batch_size=1,\n shuffle=False,\n num_workers=0)\n seq_total_time = 0\n seq_total_frame = 0\n ref_embeddings = []\n ref_masks = []\n prev_embedding = []\n prev_mask = []\n with paddle.no_grad():" + }, + { + "comment": "This code appears to be part of a data loading and processing loop for a video object detection model. It loads samples from a sequential dataloader, processes each augmented image, and appends their embeddings and masks to the corresponding lists. The labels are also loaded if available. This process is repeated for all augmented images in the sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":68-90", + "content": " for frame_idx, samples in enumerate(seq_dataloader):\n time_start = time.time()\n all_preds = []\n join_label = None\n for aug_idx in range(len(samples)):\n if len(ref_embeddings) <= aug_idx:\n ref_embeddings.append([])\n ref_masks.append([])\n prev_embedding.append(None)\n prev_mask.append(None)\n sample = samples[aug_idx]\n ref_emb = ref_embeddings[aug_idx]\n ref_m = ref_masks[aug_idx]\n prev_emb = prev_embedding[aug_idx]\n prev_m = prev_mask[aug_idx]\n current_img = sample['current_img']\n if 'current_label' in sample.keys():\n current_label = sample['current_label']\n current_label = paddle.to_tensor(current_label)\n else:\n current_label = None" + }, + { + "comment": "This code prepares data for a video object detection model. It extracts necessary information from the sample such as obj_num, imgname, ori_height and ori_width. The current image shape is also obtained. A list of data is created including reference embedding, reference mask, previous embedding, previous mask, the current image, image dimensions, and object number. The model is then used to generate all predictions and current embedding. If it's the first frame, if no label exists, an info message is logged. 
Reference embeddings and masks are appended accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":92-112", + "content": " obj_num = sample['meta']['obj_num']\n imgname = sample['meta']['current_name']\n ori_height = sample['meta']['height']\n ori_width = sample['meta']['width']\n current_img = current_img\n obj_num = obj_num\n bs, _, h, w = current_img.shape\n data_batch = [\n ref_emb, ref_m, prev_emb, prev_m, current_img,\n [ori_height, ori_width], obj_num\n ]\n all_pred, current_embedding = model(data_batch, mode='test')\n if frame_idx == 0:\n if current_label is None:\n logger.info(\n \"No first frame label in Seq {}.\".format(\n seq_name))\n ref_embeddings[aug_idx].append(current_embedding)\n ref_masks[aug_idx].append(current_label)" + }, + { + "comment": "In this code, it checks if the sample has a 'meta' field with 'flip' set to True. If not, it checks if there are new labels for new objects. If necessary, it introduces a new label and adds the current prediction and embedding to their respective lists. The prev_embedding is also updated.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":114-128", + "content": " prev_embedding[aug_idx] = current_embedding\n prev_mask[aug_idx] = current_label\n else:\n if sample['meta']['flip']: #False\n all_pred = self.flip_tensor(all_pred, 3)\n # In YouTube-VOS, not all the objects appear in the first frame for the first time. Thus, we\n # have to introduce new labels for new objects, if necessary.\n if not sample['meta']['flip'] and not (\n current_label is None) and join_label is None:\n join_label = paddle.cast(current_label,\n dtype='int64')\n all_preds.append(all_pred)\n if current_label is not None:\n ref_embeddings[aug_idx].append(current_embedding)\n prev_embedding[aug_idx] = current_embedding" + }, + { + "comment": "This code calculates the mean of previous predictions, then finds the maximum value from these averaged results. It handles joining labels if present and reshapes the final prediction to match the original image dimensions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":130-146", + "content": " if frame_idx > 0:\n all_preds = paddle.concat(all_preds, axis=0)\n all_preds = paddle.mean(\n all_preds, axis=0) #average results if augmentation\n pred_label = paddle.argmax(all_preds, axis=0)\n if join_label is not None:\n join_label = paddle.squeeze(paddle.squeeze(join_label,\n axis=0),\n axis=0)\n keep = paddle.cast((join_label == 0), dtype=\"int64\")\n pred_label = pred_label * keep + join_label * (1 - keep)\n pred_label = pred_label\n current_label = paddle.reshape(\n pred_label, shape=[1, 1, ori_height, ori_width])\n flip_pred_label = self.flip_tensor(pred_label, 1)\n flip_current_label = paddle.reshape(\n flip_pred_label, shape=[1, 1, ori_height, ori_width])" + }, + { + "comment": "The code iterates over a list of samples, updating reference and previous masks based on whether the sample is flipped or not. 
It then calculates the time taken for one frame, adds it to total sequence time, increments the total frame count, logs frame information including object number, and saves the predicted label mask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":148-167", + "content": " for aug_idx in range(len(samples)):\n if join_label is not None:\n if samples[aug_idx]['meta']['flip']:\n ref_masks[aug_idx].append(flip_current_label)\n else:\n ref_masks[aug_idx].append(current_label)\n if samples[aug_idx]['meta']['flip']:\n prev_mask[aug_idx] = flip_current_label\n else:\n prev_mask[\n aug_idx] = current_label #update prev_mask\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n seq_total_frame += 1\n obj_num = float(obj_num)\n logger.info('Frame: {}, Obj Num: {}, Time: {}'.format(\n imgname[0], obj_num, one_frametime))\n self.save_mask(\n pred_label," + }, + { + "comment": "This code calculates the average time per frame for a video sequence and reports it. It also keeps track of total time, total frames, average frames per second (FPS) for each sequence, and overall FPS. It logs this information for debugging or analysis purposes. The code handles both cases where all frames are successfully processed and when some frames fail processing. It then deletes unnecessary variables to free up memory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":168-190", + "content": " os.path.join(self.result_root, seq_name,\n imgname[0].split('.')[0] + '.png'))\n else:\n one_frametime = time.time() - time_start\n seq_total_time += one_frametime\n logger.info('Ref Frame: {}, Time: {}'.format(\n imgname[0], one_frametime))\n del (ref_embeddings)\n del (ref_masks)\n del (prev_embedding)\n del (prev_mask)\n del (seq_dataset)\n del (seq_dataloader)\n seq_avg_time_per_frame = seq_total_time / seq_total_frame\n self.total_time += seq_total_time\n self.total_frame += seq_total_frame\n total_avg_time_per_frame = self.total_time / self.total_frame\n self.total_sfps += seq_avg_time_per_frame\n avg_sfps = self.total_sfps / (batch_id + 1)\n logger.info(\"Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}\".format(\n seq_name, 1. / seq_avg_time_per_frame," + }, + { + "comment": "This code defines a class with two methods: 'flip_tensor' and 'save_mask'. The 'flip_tensor' method flips the tensor along a specified dimension by inverting the indices. The 'save_mask' method saves a mask tensor to a specified file path using a provided palette.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":191-208", + "content": " 1. / total_avg_time_per_frame, 1. 
/ avg_sfps))\n def flip_tensor(self, tensor, dim=0):\n inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1),\n dtype=\"int64\")\n tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim)\n return tensor\n def save_mask(self, mask_tensor, path):\n _palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128,\n 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191,\n 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64,\n 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22,\n 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27,\n 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33,\n 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39,\n 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44, 44," + }, + { + "comment": "This code appears to be a list of numbers, each representing a potential value for an unknown variable. It spans from 45 to 114 and includes each number exactly once. Without context or additional information, it's difficult to determine the purpose or meaning behind these values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":209-221", + "content": " 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50,\n 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61,\n 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67,\n 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73,\n 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78,\n 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, 83, 84, 84,\n 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 90,\n 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94, 95, 95, 95,\n 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, 100, 100, 101,\n 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, 105,\n 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109,\n 110, 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114," + }, + { + "comment": "This code snippet contains a series of consecutive integers from 114 to 174.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":222-235", + "content": " 114, 114, 115, 115, 115, 116, 116, 116, 117, 117, 117, 118, 118,\n 118, 119, 119, 119, 120, 120, 120, 121, 121, 121, 122, 122, 122,\n 123, 123, 123, 124, 124, 124, 125, 125, 125, 126, 126, 126, 127,\n 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, 130, 131, 131,\n 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, 135, 135,\n 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144,\n 144, 145, 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148,\n 149, 149, 149, 150, 150, 150, 151, 151, 151, 152, 152, 152, 153,\n 153, 153, 154, 154, 154, 155, 155, 155, 156, 156, 156, 157, 157,\n 157, 158, 158, 158, 159, 159, 159, 160, 160, 160, 161, 161, 161,\n 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, 165, 165, 166,\n 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, 170,\n 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174," + }, + { + "comment": "This code snippet is likely representing a list of numbers, potentially related to frame or timestamp values in the video processing context.", + "location": 
"\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":236-249", + "content": " 175, 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179,\n 179, 179, 180, 180, 180, 181, 181, 181, 182, 182, 182, 183, 183,\n 183, 184, 184, 184, 185, 185, 185, 186, 186, 186, 187, 187, 187,\n 188, 188, 188, 189, 189, 189, 190, 190, 190, 191, 191, 191, 192,\n 192, 192, 193, 193, 193, 194, 194, 194, 195, 195, 195, 196, 196,\n 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, 200, 200,\n 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209,\n 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213,\n 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, 217, 217, 218,\n 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, 221, 222, 222,\n 222, 223, 223, 223, 224, 224, 224, 225, 225, 225, 226, 226, 226,\n 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, 230, 230, 231,\n 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, 235," + }, + { + "comment": "Code snippet creates a mask from tensor data, converts it to an image and saves it with specified palette.\nThe 'zip_folder' function compresses the contents of a source folder into a zip file, preserving directory structure.\nThe 'accumulate' function is not defined in this code chunk.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":250-271", + "content": " 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239,\n 240, 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244,\n 244, 244, 245, 245, 245, 246, 246, 246, 247, 247, 247, 248, 248,\n 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252,\n 253, 253, 253, 254, 254, 254, 255, 255, 255\n ]\n mask = mask_tensor.cpu().numpy().astype('uint8')\n mask = Image.fromarray(mask).convert('P')\n mask.putpalette(_palette)\n mask.save(path)\n def zip_folder(self, source_folder, zip_dir):\n f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED)\n pre_len = len(os.path.dirname(source_folder))\n for dirpath, dirnames, filenames in os.walk(source_folder):\n for filename in filenames:\n pathfile = os.path.join(dirpath, filename)\n arcname = pathfile[pre_len:].strip(os.path.sep)\n f.write(pathfile, arcname)\n f.close()\n def accumulate(self):" + }, + { + "comment": "This code snippet is part of a class that handles metrics calculation. It accumulates metrics once all iterations are complete, then zips the results and saves them to a specified directory (zip_dir) using self.zip_folder method from the parent class. The logger.info statement displays an informational message confirming the save location and name of the zip file in the zip_dir.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/vos_metric.py\":272-275", + "content": " \"\"\"accumulate metrics when finished all iters.\n \"\"\"\n self.zip_folder(self.result_root, self.zip_dir)\n logger.info('Save result to {}.'.format(self.zip_dir))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9eca83d3-4536-4538-aa94-583aaef2a2ec.json b/docs/doc/9eca83d3-4536-4538-aa94-583aaef2a2ec.json new file mode 100644 index 000000000..2068f60f8 --- /dev/null +++ b/docs/doc/9eca83d3-4536-4538-aa94-583aaef2a2ec.json @@ -0,0 +1,15 @@ +{ + "summary": "This code reads data from a JSON file, converts timestamps to 25 fps, and formats it according to the table tennis analysis submission format. 
The formatted data is written back to a new JSON file for further use or analysis.", + "details": [ + { + "comment": "This code reads data from a JSON file, converts the segment timestamps to a specific frame rate (25 fps), and organizes it into the target submission format. It then writes the formatted data back to a new JSON file for further use or analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/datasets/script/submission_format_transfer.py\":0-48", + "content": "import json\nimport math\nwith open('/workspace/bianjiang03/DATA/Output_for_bmn/prop.json') as f:\n data = json.load(f)\nf.close()\ntransferred = dict()\n# 25 fps for all videos\nfps = 25\nfor item in data:\n temp = []\n for seg in item['bmn_results']:\n temp_dict = {\n 'score': seg['score'],\n 'segment':\n [round(seg['start'] / fps, 2),\n round(seg['end'] / fps, 2)]\n }\n temp.append(temp_dict)\n transferred[item['video_name']] = temp\ntarget_format = {\n 'version': 'A-test',\n 'results': transferred,\n 'external_data': {}\n}\njsonString = json.dumps(target_format, indent=4, ensure_ascii=False)\njsonFile = open('/workspace/bianjiang03/DATA/Output_for_bmn/submission.json',\n 'w')\njsonFile.write(jsonString)\njsonFile.close()\n# target format\n# {\n# \"version\": NA,\n# \"results\": {\n# \"name_of_clip_1\": [\n# {\n# \"score\": 0.64,\n# \"segment\": [2.33,3.15]\n# },\n# {\n# \"score\": 0.77,\n# \"segment\": [7.64, 7.84]\n# }" + }, + { + "comment": "This code defines a dictionary structure representing a submission format for table tennis analysis, with \"name_of_clip\" keys holding lists of segments and scores. The \"external_data\" field is empty.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/datasets/script/submission_format_transfer.py\":49-63", + "content": "# ],\n# \t\"name_of_clip_2\": [\n# {\n# \"score\": 0.84,\n# \"segment\": [9.73,10.15]\n# },\n# {\n# \"score\": 0.87,\n# \"segment\": [17.11, 17.84]\n# }\n# ],\n# \t...\n# }\n# \"external_data\": {}\n# }" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9ed623f3-e39a-4c0a-b74f-f97a22c24d1a.json b/docs/doc/9ed623f3-e39a-4c0a-b74f-f97a22c24d1a.json new file mode 100644 index 000000000..07b9a928d --- /dev/null +++ b/docs/doc/9ed623f3-e39a-4c0a-b74f-f97a22c24d1a.json @@ -0,0 +1,15 @@ +{ + "summary": "This code explains how to convert dygraph models to static models for inference and deployment using PaddleInference, and provides examples on video inference testing with predict.py and benchmarking. Support for C++ infer and PaddleHub Serving deploy are coming soon.", + "details": [ + { + "comment": "The code provides instructions on how to convert a dygraph model to a static model for inference and deployment, as well as testing the exported model using PaddleVideo's test script. The conversion is done using the \"export_model.py\" script with appropriate arguments, and some additional parameters are added for TSM. 
Refer to official documents for more information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/deployment.md\":0-23", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/tutorials/deployment.md) | English\n# Inference\n## How to convert dygraph model to static model?\nTo infer and deploy a model, we need export an inference model, or called to_static: `convert dygraph model to static model`, at first.\n```python\npython3.7 tools/export_model.py -c config_file -o output_path -p params_file\n```\nNote: In `export_model.py`, It will build a model again, and then loading the prarams. But some init params in the infer phase is different from the train phase.\nwe add `num_seg` for TSM in advanced, please add more params or modify them if it is necessary.\nplease refer to [official documents](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/04_dygraph_to_static/index_cn.html) for more information.\n## How to test the export model?\nPaddleVideo supports a test script to test the exported model.\n```python\npython3.7 tools/test_export_model.py -p params_file -i inference_folder -c config_file\n```\nWe just print the output shape, please feel free to ex" + }, + { + "comment": "This code explains how to use the PaddleInference tool for testing video inference, providing examples on using predict.py and enabling benchmarking. It also mentions that support for C++ infer is coming soon, as well as instructions on using PaddleHub Serving deploy and PaddleLite deploy, which will be added later.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/deployment.md\":23-47", + "content": "tend it. Avtually, only test a video file by PaddleInference can make sure the exported model is right.\n## How to use PaddleInference?\nPaddleVideo supports ```tools/predict.py``` to infer\n```python\npython3.7 tools/predict.py -v example.avi --model_file \"./inference/example.pdmodel\" --params_file \"./inference/example.pdiparams\" --enable_benchmark=False --model=\"example\" --num_seg=8\n ```\n## How to test inference speed?\nPaddleVideo support a script to test inference speed\n```python\npython3.7 tools/predict.py --enable_benchmark=True --model_file=\u6a21\u578b\u6587\u4ef6 --params_file=\u53c2\u6570\u6587\u4ef6\n```\n## How to use C++ infer?\n coming soon\n# Deployment\n## How to use PaddleHub Serving deploy?\n coming soon\n## How to use PaddleLite deploy?\n coming soon" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9f2d24de-b1d7-492a-bb59-67a77a638ee3.json b/docs/doc/9f2d24de-b1d7-492a-bb59-67a77a638ee3.json new file mode 100644 index 000000000..377c7d174 --- /dev/null +++ b/docs/doc/9f2d24de-b1d7-492a-bb59-67a77a638ee3.json @@ -0,0 +1,35 @@ +{ + "summary": "The code imports libraries, sets up logger, and defines AttrDict class for config files. It includes functions to parse, print, visualize, check, and replace configurations using 'override' function. The code parses a config file, applies overrides, checks if the input option is a string, separates key-value pairs, splits keys by dots, calls `print_config()` and `check_config()`, and returns the updated config object.", + "details": [ + { + "comment": "This code block is importing necessary libraries, setting up logger, and defining an AttrDict class for handling config files. 
It also defines a function create_attr_dict that takes in a yaml configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py\":0-33", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport yaml\nfrom EIVideo.paddlevideo.utils.logger import coloring, setup_logger\n__all__ = ['get_config']\nlogger = setup_logger(\"./\", name=\"paddlevideo\", level=\"INFO\")\nclass AttrDict(dict):\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef create_attr_dict(yaml_config):" + }, + { + "comment": "This code defines functions for parsing and printing config files. The `parse_config` function loads a config file into an AttrDict object, handling nested dictionaries and string values. The `print_dict` function recursively visualizes a dictionary, indented based on the relationship of keys.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py\":34-66", + "content": " from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef print_dict(d, delimiter=0):\n \"\"\"\n Recursively visualize a dict and\n indenting acrrording by the relationship of keys.\n \"\"\"\n placeholder = \"-\" * 60\n for k, v in sorted(d.items()):\n if isinstance(v, dict):\n logger.info(\"{}{} : \".format(delimiter * \" \", coloring(k,\n \"HEADER\")))" + }, + { + "comment": "This code defines functions for visualizing and checking configurations, as well as recursively replacing dictionary or list values. It includes functions to print a configuration, check the configuration (currently empty), and override values in a dictionary or list. 
The print function formats output with coloring and delimiters, and the override function handles both dictionaries and lists for value replacement.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py\":67-108", + "content": " print_dict(v, delimiter + 4)\n elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n logger.info(\"{}{} : \".format(delimiter * \" \",\n coloring(str(k), \"HEADER\")))\n for value in v:\n print_dict(value, delimiter + 4)\n else:\n logger.info(\"{}{} : {}\".format(delimiter * \" \",\n coloring(k, \"HEADER\"),\n coloring(v, \"OKGREEN\")))\n if k.isupper():\n logger.info(placeholder)\ndef print_config(config):\n \"\"\"\n visualize configs\n Arguments:\n config: configs\n \"\"\"\n print_dict(config)\ndef check_config(config):\n \"\"\"\n Check config\n \"\"\"\n pass\ndef override(dl, ks, v):\n \"\"\"\n Recursively replace dict of list\n Args:\n dl(dict or list): dict or list to be replaced\n ks(list): list of keys\n v(str): value to be replaced\n \"\"\"\n def str2num(v):\n try:\n return eval(v)" + }, + { + "comment": "This function overrides the config, recursively replacing values in the dictionary or list. It requires the configuration and optional options, both as dictionaries. If the key exists, it updates the value, otherwise a warning is issued for new fields, and a new field is created if the key is present in the options. If the key does not exist, an error is thrown.\n\nIn other words, this function allows you to update your configuration by replacing values with new ones. It also helps to identify and handle newly-appearing fields.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py\":109-138", + "content": " except Exception:\n return v\n assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n assert len(ks) > 0, ('lenght of keys should larger than 0')\n if isinstance(dl, list):\n k = str2num(ks[0])\n if len(ks) == 1:\n assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n dl[k] = str2num(v)\n else:\n override(dl[k], ks[1:], v)\n else:\n if len(ks) == 1:\n #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n if not ks[0] in dl:\n logger.warning('A new filed ({}) detected!'.format(ks[0], dl))\n dl[ks[0]] = str2num(v)\n else:\n assert ks[0] in dl, (\n '({}) doesn\\'t exist in {}, a new dict field is invalid'.format(\n ks[0], dl))\n override(dl[ks[0]], ks[1:], v)\ndef override_config(config, options=None):\n \"\"\"\n Recursively override the config\n Args:\n config(dict): dict to be replaced" + }, + { + "comment": "The code parses a config file and applies overrides. It checks if the input option is a string and if it contains an equal sign to separate key-value pairs. It then splits the option into key and value, further splitting keys by dots. 
The function overrides the configuration file with these options, returning the updated configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py\":139-169", + "content": " options(list): list of pairs(key0.key1.idx.key2=value)\n such as: [\n epochs=20',\n 'PIPELINE.train.transform.1.ResizeImage.resize_short=300'\n ]\n Returns:\n config(dict): replaced config\n \"\"\"\n if options is not None:\n for opt in options:\n assert isinstance(opt,\n str), (\"option({}) should be a str\".format(opt))\n assert \"=\" in opt, (\n \"option({}) should contain a =\"\n \"to distinguish between key and value\".format(opt))\n pair = opt.split('=')\n assert len(pair) == 2, (\"there can be only a = in the option\")\n key, value = pair\n keys = key.split('.')\n override(config, keys, value)\n return config\ndef get_config(fname, overrides=None, show=True):\n \"\"\"\n Read config from file\n \"\"\"\n assert os.path.exists(fname), ('config file({}) is not exist'.format(fname))\n config = parse_config(fname)\n override_config(config, overrides)" + }, + { + "comment": "This code checks if `show` is True, and if so, it calls the function `print_config(config)`. It then always calls another function `check_config(config)`, before finally returning the `config` object. This implies that `print_config()` prints out configuration details, while `check_config()` checks for correctness or validity of the configuration. The config is returned regardless to ensure it's available to the rest of the codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py\":170-173", + "content": " if show:\n print_config(config)\n check_config(config)\n return config" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9f2d6a4a-011e-45b3-92df-3a3591749907.json b/docs/doc/9f2d6a4a-011e-45b3-92df-3a3591749907.json new file mode 100644 index 000000000..e85da0e2f --- /dev/null +++ b/docs/doc/9f2d6a4a-011e-45b3-92df-3a3591749907.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports and registers two reader classes, FeatureReader and KineticsReader, with the names \"ATTENTIONLSTM\" and \"TSN\", respectively. The registration occurs in alphabetical order.", + "details": [ + { + "comment": "This code imports and registers two reader classes, FeatureReader and KineticsReader, with the names \"ATTENTIONLSTM\" and \"TSN\", respectively. The registration occurs in alphabetical order.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/__init__.py\":0-6", + "content": "from .reader_utils import regist_reader, get_reader\nfrom .feature_reader import FeatureReader\nfrom .kinetics_reader import KineticsReader\n# regist reader, sort by alphabet\nregist_reader(\"ATTENTIONLSTM\", FeatureReader)\nregist_reader(\"TSN\", KineticsReader)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9fc6d59f-4045-4899-b3b7-7a0e28825e37.json b/docs/doc/9fc6d59f-4045-4899-b3b7-7a0e28825e37.json new file mode 100644 index 000000000..9c56fc5ec --- /dev/null +++ b/docs/doc/9fc6d59f-4045-4899-b3b7-7a0e28825e37.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is specifying the files and directories to include in the package distribution for PaddleVideo. It includes important documents like LICENSE and README, utilities scripts like utils.py and ava_predict.py, and key data file Kinetics-400_label_list.txt. 
Additionally, it uses a recursive-include to incorporate all .py and .txt files within the paddlevideo directory.", + "details": [ + { + "comment": "This code is specifying the files and directories to include in the package distribution for PaddleVideo. It includes important documents like LICENSE and README, utilities scripts like utils.py and ava_predict.py, and key data file Kinetics-400_label_list.txt. Additionally, it uses a recursive-include to incorporate all .py and .txt files within the paddlevideo directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/MANIFEST.in\":0-8", + "content": "include LICENSE\ninclude README.md\ninclude tools/__init__.py\ninclude tools/utils.py\ninclude tools/ava_predict.py\ninclude tools/wheel.py\ninclude data/k400/Kinetics-400_label_list.txt\nrecursive-include paddlevideo/ *.py *.txt" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9fee75a3-1e39-420a-a64f-f40e1abb9157.json b/docs/doc/9fee75a3-1e39-420a-a64f-f40e1abb9157.json new file mode 100644 index 000000000..91432ea44 --- /dev/null +++ b/docs/doc/9fee75a3-1e39-420a-a64f-f40e1abb9157.json @@ -0,0 +1,70 @@ +{ + "summary": "The code defines a ResNet-TSM model in PaddleVideo with Batch Normalization, Leaky ReLU activation, and optional shortcut connections for MRI applications, using ConvBNLayer and ResNetTSM_MRI classes.", + "details": [ + { + "comment": "This code is importing necessary libraries and defining a class for a Convolutional Batch Normalization Layer. It also provides information about copyright, license, and contact details of the PaddlePaddle Authors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport math\nimport sys\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils.save_load import load_ckpt\nfrom paddle.regularizer import L2Decay\nclass ConvBNLayer(nn.Layer):" + }, + { + "comment": "This code defines a ConvBNLayer class with various parameters such as in_channels, out_channels, kernel_size, stride, groups, is_tweaks_mode, act, and name. It inherits from the base class and initializes the layer's weights and biases using explicit declarations in the init_weights method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":32-57", + "content": " \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. 
Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n is_tweaks_mode (bool): switch for tweaks. Default: False.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode" + }, + { + "comment": "This code snippet initializes ResNet-D with a 2x2 average pooling layer followed by a convolution operation. The pooling layer has a stride of 2 and is changed to 1 later in practice. The convolution uses the specified parameters such as in_channels, out_channels, kernel size, stride, padding, groups, and names for weights and batch normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":58-82", + "content": " #ResNet-D 1/2:add a 2\u00d72 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(\n out_channels,\n weight_attr=ParamAttr(name=bn_name + \"_scale\"," + }, + { + "comment": "The code defines a class `ResNetTSM_MRI` which appears to be a backbone model for ResNet-TSM. It contains a forward function that applies pooling, convolution, batch normalization, and activation (if specified) to the inputs. The BottleneckBlock class is defined with options for stride, shortcut connection, number of segments, and name. It initializes a ConvBNLayer instance for the first branch, and another ConvBNLayer instance for the second branch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":83-111", + "content": " regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + \"_offset\", regularizer=L2Decay(0.0)))\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n if_first=False,\n num_seg=8,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels," + }, + { + "comment": "In this code, the function creates three ConvBNLayer instances: a \"branch2a\", \"branch2b\", and \"branch2c\". The \"branch2a\" instance is created with specified parameters. If shortcut is not set, an additional \"branch1\" instance (ConvBNLayer) is created with a 1x1 convolution layer and a stride of 1. 
This is explained to be useful in ResNet-D 2/2 configuration where a 2x2 average pooling layer with a stride of 2 is added before the convolution, which is later changed to 1 in practice.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":112-133", + "content": " out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(\n in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=\n 1, #ResNet-D 2/2:add a 2\u00d72 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n is_tweaks_mode=False if if_first else True,\n name=name + \"_branch1\")" + }, + { + "comment": "The code defines a class for a ResNet-TSM backbone model, with the forward function applying temporal shifts and convolutions. The BasicBlock class is used for the basic building block of the network, with optional shortcut connections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":135-165", + "content": " self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.leaky_relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")" + }, + { + "comment": "This code defines a ResNet TSM backbone model with Batch Normalization and Leaky ReLU activation. It includes a ConvBNLayer for the branch2b, and an optional shortcut connection depending on the input. The forward function performs convolution, adds the shortcut, applies leaky ReLU activation, and returns the result. The ResNetTSM_MRI class is registered with BACKBONES.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":166-199", + "content": " self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n filter_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.leaky_relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTSM_MRI(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model." + }, + { + "comment": "This code defines a ResNetTSM_MRI class with parameters for depth, num_seg, pretrained (default None), and in_channels. 
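The temporal shift applied at the top of BottleneckBlock.forward can be exercised on its own: frames of a clip are stacked along the batch axis, and a fraction of channels is exchanged with neighbouring frames while the tensor shape stays the same. A minimal example using Paddle's built-in operator:

```python
import paddle
import paddle.nn.functional as F

num_seg = 8                                    # frames sampled per clip
x = paddle.randn([2 * num_seg, 16, 7, 7])      # 2 clips stacked along the batch axis
shifted = F.temporal_shift(x, seg_num=num_seg, shift_ratio=1.0 / num_seg)
print(shifted.shape)                           # shape is preserved: [16, 16, 7, 7]
```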
It checks if the input layer is supported, sets the depth based on the input layer, sets out channels, and initializes ConvBNLayer instances accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":200-228", + "content": " pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, pretrained=None, in_channels=1):\n super(ResNetTSM_MRI, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n self.in_channels = in_channels\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n #ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n self.conv1_1 = ConvBNLayer(in_channels=self.in_channels,\n out_channels=32," + }, + { + "comment": "This code defines the ResNet-TSM backbone architecture in PaddleVideo. It includes multiple ConvBNLayer instances for different stages of feature extraction and a MaxPool2D layer for downsampling. The depth of each block is specified by the provided depth list, with shortcut connections determined based on the number of layers specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":229-251", + "content": " kernel_size=3,\n stride=2,\n act='leaky_relu',\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='leaky_relu',\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='leaky_relu',\n name=\"conv1_3\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):" + }, + { + "comment": "The code dynamically assigns a name to the BottleneckBlock based on its block and layer indices. If the layers are 101 or 152 at block 2, it further distinguishes between convolutions 'a' and 'b'. The 'bb_%d_%d' naming is used for loading pre-trained models. 
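The depth-to-block mapping and the sub-layer naming rule described here are small enough to restate directly; the helper below is an illustrative restatement of the logic quoted above, not code from the repository.

```python
# Block counts per stage, as configured in ResNetTSM_MRI.__init__ above.
RESNET_DEPTHS = {
    18: [2, 2, 2, 2],
    34: [3, 4, 6, 3],
    50: [3, 4, 6, 3],
    101: [3, 4, 23, 3],
    152: [3, 8, 36, 3],
}

def conv_name(layers, block, i):
    """Illustrative restatement of the naming rule used for pretrain loading."""
    if layers in (101, 152) and block == 2:
        return f"res{block + 2}a" if i == 0 else f"res{block + 2}b{i}"
    return f"res{block + 2}{chr(97 + i)}"

print(conv_name(50, 0, 1))   # res2b
print(conv_name(101, 2, 5))  # res4b5
```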
The BottleneckBlock parameters include in_channels based on if i == 0 or not, out_channels of the block, stride depending on if it's the first layer or not, num_seg for segmentation, shortcut type, and a flag for if it's the first layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":252-270", + "content": " if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' %\n (block, i), #same with PaddleClas, for loading pretrain\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name))" + }, + { + "comment": "This code defines a ResNet TSM backbone with multiple blocks and basic blocks. It dynamically creates convolutional layers using the add_sublayer function. The number of blocks and their configuration is defined by the input parameters \"depth\". Shortcuts are used between layers, and the function init_weights initializes the parameters of the backbone.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":271-293", + "content": " in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone." + }, + { + "comment": "This code initializes the backbone of a neural network. If a pretrained path is not specified, it uses specific initialization methods for Conv2D and BatchNorm2d layers. The KaimingNormal function initializes the Conv2D layer, while the Constant function with value 1 initializes the BatchNorm2d layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":294-310", + "content": " 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run." 
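The fallback initialization described above (KaimingNormal for convolution weights, constant 1 for batch-norm scale when no pretrained checkpoint is given) can be sketched with Paddle's initializer API; the helper name below is illustrative, and the backbone itself goes through the repository's weight_init_ utility.

```python
import paddle.nn as nn

def init_backbone_weights(model: nn.Layer):
    """Illustrative helper: KaimingNormal for Conv2D weights, constant 1 for
    BatchNorm2D scale, applied only when no pretrained path is provided."""
    kaiming = nn.initializer.KaimingNormal()
    ones = nn.initializer.Constant(value=1.0)
    for layer in model.sublayers():
        if isinstance(layer, nn.Conv2D):
            kaiming(layer.weight)
        elif isinstance(layer, nn.BatchNorm2D):
            ones(layer.weight)
```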
+ }, + { + "comment": "Reshaping and applying convolutional layers, max pooling, and iterating through a list of blocks to perform operations on the input feature map.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py\":312-326", + "content": " \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a019b3ed-d20a-4304-8541-226fd237bf3b.json b/docs/doc/a019b3ed-d20a-4304-8541-226fd237bf3b.json new file mode 100644 index 000000000..741b6bbe1 --- /dev/null +++ b/docs/doc/a019b3ed-d20a-4304-8541-226fd237bf3b.json @@ -0,0 +1,20 @@ +{ + "summary": "The code trains and analyzes two models (dygraph and dy2static), compares their losses, logs the differences, and prints the results.", + "details": [ + { + "comment": "Source common functions and set IFS to handle line breaks. Read the BASE_CONFIG_FILE, identify MODE, get log path, delete existing directory if it exists, create a new one, set CUDNN deterministic for stable results, read base config, parse sub commands, and output relevant information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_dy2static_python.sh\":0-29", + "content": "source test_tipc/common_func.sh\nIFS=$'\\n'\nBASE_CONFIG_FILE=$1\n# always use the lite_train_lite_infer mode to speed. Modify the config file.\nMODE=lite_train_lite_infer\nBASEDIR=$(dirname \"$0\")\n# get the log path.\ndataline=$(cat ${BASE_CONFIG_FILE})\nlines=(${dataline})\nmodel_name=$(func_parser_value \"${lines[1]}\")\nLOG_PATH=\"./test_tipc/output/${model_name}/${MODE}\"\nrm -rf $LOG_PATH\nmkdir -p ${LOG_PATH}\nstatus_log=\"${LOG_PATH}/results_python.log\"\n# make cudnn algorithm deterministic, such as conv.\nexport FLAGS_cudnn_deterministic=True\n# read the base config and parse and run the sub commands\nconfig_line_numbers=`cat ${BASE_CONFIG_FILE} | grep -n \"============\" | cut -d':' -f1`\nfor cln in $config_line_numbers\ndo\n # change IFS to prevent \\n is parsed as delimiter.\n IFS=\"\"\n config_lines=$(cat ${BASE_CONFIG_FILE} | sed -n \"${cln},\\$p\" | head -n 22)\n config_name=`echo ${config_lines} | grep '=====' | cut -d' ' -f2`\n FILENAME=$LOG_PATH/dy2static_$config_name.txt\n echo \"[Start dy2static]\" \"${config_name} : ${FILENAME}\"" + }, + { + "comment": "This code is configuring, running and analyzing two different training models. It first sets the necessary environment, then runs a dygraph training model and a dy2static one, saving their results in separate logs. 
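A rough Python equivalent of the loss-comparison idea (the real pipeline pipes the logs through test_tipc/extract_loss.py) is to pull the first few printed loss values from each log and compare them; the regex and file names below are placeholders, not the script's actual format.

```python
import re

def first_losses(log_path, pattern=r"loss: ([0-9.]+)", limit=3):
    """Illustrative stand-in for the extract_loss.py step: collect the first
    few loss values printed during training."""
    losses = []
    with open(log_path) as f:
        for line in f:
            match = re.search(pattern, line)
            if match:
                losses.append(float(match.group(1)))
                if len(losses) == limit:
                    break
    return losses

# dygraph = first_losses("dygraph_output.txt")
# dy2static = first_losses("dy2static_output.txt")
# assert dygraph == dy2static, "to_static=True changed the training numerics"
```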
Finally, it extracts the losses from the dy2static log for comparison with the dygraph's loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_dy2static_python.sh\":30-56", + "content": " echo ${config_lines} > $FILENAME\n sed -i 's/gpu_list.*$/gpu_list:0/g' $FILENAME\n # execute the last line command\n custom_cmd=$(echo $config_lines | tail -n 1)\n echo \"CustomCmd is: \" $custom_cmd\n eval $custom_cmd\n IFS=$'\\n'\n # start dygraph train\n dygraph_output=$LOG_PATH/${config_name}_python_train_infer_dygraph_output.txt\n dygraph_loss=$LOG_PATH/${config_name}_dygraph_loss.txt\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $MODE >$dygraph_output 2>&1\"\n echo $cmd\n eval $cmd\n # start dy2static train\n dy2static_output=$LOG_PATH/${config_name}_python_train_infer_dy2static_output.txt\n dy2static_loss=$LOG_PATH/${config_name}_dy2static_loss.txt\n sed -i '16s/$/ -o to_static=True/' ${FILENAME}\n cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $MODE >$dy2static_output 2>&1\"\n echo $cmd\n eval $cmd\n # analysis and compare the losses.\n dyout=`cat $dy2static_output | python test_tipc/extract_loss.py -v 'train step' -e 'loss: {%f} ' | head -n 3`" + }, + { + "comment": "This code compares the outputs of two models (dygraph_loss and dy2static_loss), checks for differences using a diff command, and logs the result to diff_log. If either dyout or stout is empty, it runs status_check with different codes. Finally, it prints the diff_log.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_dy2static_python.sh\":57-72", + "content": " stout=`cat $dygraph_output | python test_tipc/extract_loss.py -v 'train step' -e 'loss: {%f} ' | head -n 3`\n echo $dyout > $dygraph_loss\n echo $stout > $dy2static_loss\n diff_log=$LOG_PATH/${config_name}_diff_log.txt\n diff_cmd=\"diff -w $dygraph_loss $dy2static_loss > $diff_log\"\n eval $diff_cmd\n last_status=$?\n cat $diff_log\n if [ \"$dyout\" = \"\" ]; then\n status_check 1 $diff_cmd $status_log $model_name $diff_log\n elif [ \"$stout\" = \"\" ]; then\n status_check 2 $diff_cmd $status_log $model_name $diff_log\n else\n status_check $last_status $diff_cmd $status_log $model_name $diff_log\n fi\ndone" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a1262941-422f-4a13-b7cd-83a7d78c5289.json b/docs/doc/a1262941-422f-4a13-b7cd-83a7d78c5289.json new file mode 100644 index 000000000..f671941ec --- /dev/null +++ b/docs/doc/a1262941-422f-4a13-b7cd-83a7d78c5289.json @@ -0,0 +1,20 @@ +{ + "summary": "This Python file imports necessary modules, defines AttrDict class, checks PaddlePaddle version compatibility, and handles GPU usage based on CUDA availability. It uses paddle.utils.require_version to ensure required version '1.6.0' is installed, logging errors and exiting with status code 1 if not.", + "details": [ + { + "comment": "Code is a Python file with license information, importing necessary modules such as os, sys, signal and logging. It defines the AttrDict class which extends the dictionary functionality and includes signals for handling SIGTERM and SIGINT for process termination.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/utility.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport signal\nimport logging\nimport paddle\n__all__ = ['AttrDict']\nlogger = logging.getLogger(__name__)\ndef _term(sig_num, addition):\n print('current pid is %s, group id is %s' % (os.getpid(), os.getpgrp()))\n os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)\nsignal.signal(signal.SIGTERM, _term)\nsignal.signal(signal.SIGINT, _term)\nclass AttrDict(dict):\n def __getattr__(self, key):\n return self[key]" + }, + { + "comment": "This code checks if the installed PaddlePaddle version is 1.6 or higher and ensures that the user's code is compatible with the installed version. It also handles GPU usage by checking if the code should run on GPU or CPU based on the availability of CUDA in the system.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/utility.py\":38-65", + "content": " def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value\ndef check_cuda(use_cuda, err = \\\n \"\\nYou can not set use_gpu = True in the model because you are using paddlepaddle-cpu.\\n \\\n Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_gpu = False to run models on CPU.\\n\"\n ):\n try:\n if use_cuda == True and paddle.is_compiled_with_cuda() == False:\n print(err)\n sys.exit(1)\n except Exception as e:\n pass\ndef check_version():\n \"\"\"\n Log error and exit when the installed version of paddlepaddle is\n not satisfied.\n \"\"\"\n err = \"PaddlePaddle version 1.6 or higher is required, \" \\\n \"or a suitable develop version is satisfied as well. \\n\" \\\n \"Please make sure the version is good with your code.\" \\\n try:" + }, + { + "comment": "This code is using the paddle.utils.require_version function to check if the required version '1.6.0' is installed. If there is an exception, it logs the error and exits the program with a status code of 1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/utility.py\":66-69", + "content": " paddle.utils.require_version('1.6.0')\n except Exception as e:\n logger.error(err)\n sys.exit(1)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a141b6a6-d3e3-4f1e-af36-8b59768eb5c9.json b/docs/doc/a141b6a6-d3e3-4f1e-af36-8b59768eb5c9.json new file mode 100644 index 000000000..cec0d47ef --- /dev/null +++ b/docs/doc/a141b6a6-d3e3-4f1e-af36-8b59768eb5c9.json @@ -0,0 +1,65 @@ +{ + "summary": "The FootballAction model in PaddleVideo employs PP-TSM, BMN, and Attention LSTM for feature extraction and classification/regression. The code updates the recognizer2d.py file, exports PP-TSM inference models, creates datasets, and predicts BMN proposal information. The Attention LSTM model is trained with improvements, resulting in accuracy and F1-score enhancements.", + "details": [ + { + "comment": "This is a README for the FootballAction model in PaddleVideo, introducing a soccer action detection algorithm. 
It consists of sections on model introduction, environment preparation, data preparation (including dataset details), quick experience, advanced usage, references, and installation instructions. The model uses PP-TSM, BMN, and AttentionLSTM for feature extraction, proposal extraction, and action classification/regression from image and audio modalities. The dataset is derived from the 2016 European Cup, with 49 videos in total (44 training, 5 validation).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":0-53", + "content": "# \u8db3\u7403\u52a8\u4f5c\u68c0\u6d4b\u6a21\u578b\n## \u5185\u5bb9\n- [1. \u6a21\u578b\u7b80\u4ecb](#1-\u6a21\u578b\u7b80\u4ecb)\n- [2. \u73af\u5883\u51c6\u5907](#2-\u73af\u5883\u51c6\u5907)\n- [3. \u6570\u636e\u51c6\u5907](#3-\u6570\u636e\u51c6\u5907)\n - [3.1 \u6570\u636e\u96c6\u7b80\u4ecb](#31-\u6570\u636e\u96c6\u7b80\u4ecb)\n - [3.2 \u6570\u636e\u96c6\u4e0b\u8f7d](#32-\u6570\u636e\u96c6\u4e0b\u8f7d)\n - [3.3 \u6570\u636e\u9884\u5904\u7406](#33-\u6570\u636e\u9884\u5904\u7406)\n- [4. \u5feb\u901f\u4f53\u9a8c](#4-\u5feb\u901f\u4f53\u9a8c)\n- [5. \u8fdb\u9636\u4f7f\u7528](#5-\u8fdb\u9636\u4f7f\u7528)\n - [5.1 \u6a21\u578b\u8bad\u7ec3](#51-\u6a21\u578b\u8bad\u7ec3)\n - [5.2 \u6a21\u578b\u63a8\u7406](#52-\u6a21\u578b\u63a8\u7406)\n - [5.3 \u6a21\u578b\u8bc4\u4f30](#53-\u6a21\u578b\u8bc4\u4f30)\n - [5.4 \u6a21\u578b\u4f18\u5316](#54-\u6a21\u578b\u4f18\u5316)\n - [5.5 \u6a21\u578b\u90e8\u7f72](#55-\u6a21\u578b\u90e8\u7f72)\n- [6. \u53c2\u8003\u8bba\u6587](#6-\u53c2\u8003\u8bba\u6587)\n\n## 1. \u6a21\u578b\u7b80\u4ecb\nFootballAction\u662f\u57fa\u4e8ePaddleVideo\u5b9e\u73b0\u7684\u8db3\u7403\u52a8\u4f5c\u68c0\u6d4b\u7b97\u6cd5\uff0c\u7528\u4e8e\u4ece\u8db3\u7403\u6bd4\u8d5b\u89c6\u9891\u4e2d\u5b9a\u4f4d\u51fa\u7cbe\u5f69\u52a8\u4f5c\u7247\u6bb5\u53d1\u751f\u7684\u8d77\u6b62\u65f6\u95f4\u548c\u5bf9\u5e94\u7684\u52a8\u4f5c\u7c7b\u522b\u3002\u53ef\u4ee5\u5b9a\u4f4d\u7684\u8db3\u7403\u52a8\u4f5c\u7c7b\u578b\u5305\u62ec8\u79cd\uff0c\u5206\u522b\u4e3a\uff1a\n```txt\n\u80cc\u666f\u3001\u8fdb\u7403\u3001\u89d2\u7403\u3001\u4efb\u610f\u7403\u3001\u9ec4\u724c\u3001\u7ea2\u724c\u3001\u6362\u4eba\u3001\u754c\u5916\u7403\n```\n\u6211\u4eec\u63d0\u51fa\u7684\u65b9\u6848\u7ed3\u5408PP-TSM\u3001BMN\u548cAttentionLSTM\u4e09\u4e2a\u6a21\u578b\uff0c\u56fe\u50cf\u548c\u97f3\u9891\u4e24\u79cd\u6a21\u6001\u8fdb\u884c\u52a8\u4f5c\u68c0\u6d4b\uff0c\u7b97\u6cd5\u6574\u4f53\u6d41\u7a0b\u5171\u5206\u4e3a\u4ee5\u4e0b\u4e09\u6b65\uff1a\n - \u7279\u5f81\u62bd\u53d6\n - \u56fe\u50cf\u7279\u6027\uff1aPP-TSM\n - \u97f3\u9891\u7279\u5f81\uff1aVGGish\n - proposal\u63d0\u53d6\uff1aBMN\n - \u52a8\u4f5c\u5206\u7c7b + \u56de\u5f52\uff1aAttentionLSTM\nAIStudio\u9879\u76ee\uff1a [\u57fa\u4e8ePP-TSM+BMN+AttentionLSTM\u5b9e\u73b0\u8db3\u7403\u7cbe\u5f69\u65f6\u523b\u526a\u8f91](https://aistudio.baidu.com/aistudio/projectdetail/3473391?channelType=0&channel=0)\n\n## 2. \u73af\u5883\u51c6\u5907\n- PaddleVideo\u6a21\u578b\u5e93\u4f9d\u8d56\u5b89\u88c5\u8bf7\u53c2\u8003 [\u5b89\u88c5\u8bf4\u660e](../../docs/zh-CN/install.md)\n\n## 3. 
\u6570\u636e\u51c6\u5907\n\n### 3.1 \u6570\u636e\u96c6\u7b80\u4ecb\n\u6570\u636e\u96c6\u6765\u81ea\u6b27\u6d32\u676f2016\uff0c\u517149\u4e2a\u8db3\u7403\u89c6\u9891\uff0c\u5176\u4e2d\u8bad\u7ec3\u96c644\u4e2a\uff0c\u9a8c\u8bc1\u96c65\u4e2a\u3002\n- \u6570\u636e\u96c6label\u683c\u5f0f\n```\n{\n \"0\": \"\u80cc\u666f\"," + }, + { + "comment": "This code defines a dictionary where each key corresponds to an action in the football game, such as \"\u8fdb\u7403\" or \"\u89d2\u7403\". The dataset file contains these labeled examples of actions for training and validation purposes. The data preprocessing step involves handling these labels, creating a JSON format file containing frames per second (fps) and ground truth (gts) data with respective video URLs, total frames, and action information including label IDs, names, start and end frame indices. It also mentions that the dataset can be downloaded using a provided script, and that the code structure is organized in a specific way.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":54-117", + "content": " \"1\": \"\u8fdb\u7403\",\n \"2\": \"\u89d2\u7403\",\n \"3\": \"\u4efb\u610f\u7403\",\n \"4\": \"\u9ec4\u724c\",\n \"5\": \"\u7ea2\u724c\",\n \"6\": \"\u6362\u4eba\",\n \"7\": \"\u754c\u5916\u7403\",\n}\n```\n- \u6570\u636e\u96c6\u6807\u6ce8\u6587\u4ef6:\n```txt\ndatasets/EuroCup2016/label_cls8_train.json\ndatasets/EuroCup2016/label_cls8_val.json\n```\n- \u6570\u636e\u96c6gts\u5904\u7406, \u5c06\u539f\u59cb\u6807\u6ce8\u6570\u636e\u5904\u7406\u6210\u5982\u4e0bjson\u683c\u5f0f\n```\n{\n 'fps': 5,\n 'gts': [\n {\n 'url': 'xxx.mp4',\n 'total_frames': 6341,\n 'actions': [\n {\n \"label_ids\": [7],\n \"label_names\": [\"\u754c\u5916\u7403\"],\n \"start_id\": 395,\n \"end_id\": 399\n },\n ...\n ]\n },\n ...\n ]\n}\n```\n\n### 3.2 \u6570\u636e\u96c6\u4e0b\u8f7d\n\u6570\u636e\u96c6\u4e0b\u8f7d\u94fe\u63a5: [dataset_url.list](./datasets/EuroCup2016/dataset_url.list)\n\u53ef\u4f7f\u7528\u5982\u4e0b\u811a\u672c\u4e0b\u8f7d\uff1a\n```\ncd datasets/EuroCup2016 && sh download_dataset.sh\n```\n\n### 3.3 \u6570\u636e\u9884\u5904\u7406\n- \u6570\u636e\u96c6\u62bd\u5e27, \u7531mp4, \u5f97\u5230frames\u548cpcm, \u8fd9\u91cc\u9700\u8981\u6dfb\u52a0ffmpeg\u73af\u5883\n```\ncd datasets/script && python get_frames_pcm.py\n```\n\u7ecf\u8fc7\u4ee5\u4e0a\u6b65\u9aa4\uff0c\u5f97\u5230\u7684\u4ee3\u7801\u7ed3\u6784\u5982\u4e0b\u6240\u793a\uff1a\n```\n|-- FootballAction\n |-- checkpoints # \u6a21\u578b\u5b58\u653e\u8def\u5f84\n |-- datasets # \u6570\u636e\u96c6\u548c\u6570\u636e\u5904\u7406\u811a\u672c" + }, + { + "comment": "This directory contains data and scripts related to the FootballAction dataset. It includes original MP4 videos, their image frames, audio PCM files, URL lists, and JSON files for ground truth labels and classifications. There are also separate folders for scripting data processing, feature extraction, model training (LSTM), and proposal-based object detection (PPTSM and BDN). 
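Given the processed gts format shown above (frame-level start_id/end_id plus a global fps), converting annotations to seconds is a one-liner per action. The reader below is an illustrative sketch of that format, not a script from the repository.

```python
import json

def iter_actions(gts_path):
    """Illustrative reader for the processed gts format: frame-level
    start_id/end_id converted to seconds via the global fps."""
    with open(gts_path) as f:
        data = json.load(f)
    fps = data["fps"]
    for video in data["gts"]:
        for act in video["actions"]:
            yield video["url"], act["label_names"], act["start_id"] / fps, act["end_id"] / fps

# for url, labels, start_s, end_s in iter_actions("label_cls8_train.json"):
#     print(url, labels, f"{start_s:.1f}s - {end_s:.1f}s")
```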
The configs folder contains the configuration files needed for these training scripts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":118-140", + "content": " |-- EuroCup2016 # \u6570\u636e\u5b58\u653e\u8def\u5f84\n |-- feature_bmn # bmn\u63d0\u53d6\u5230\u7684proposal\n |-- features # image\u548caudio\u7279\u5f81, image fps=5, audio \u6bcf\u79d2(1024)\n |-- input_for_bmn # bmn\u8bad\u7ec3\u7684\u8f93\u5165\u6570\u636e\uff0cwidows=40\n |-- input_for_lstm # lstm\u8bad\u7ec3\u7684\u8f93\u5165\u6570\u636e\n |-- input_for_pptsm # pptsm\u8bad\u7ec3\u7684\u6570\u636e\u6570\u636e\n |-- mp4 # \u539f\u59cb\u89c6\u9891.mp4\n |-- frames # \u56fe\u50cf\u5e27, fps=5, '.jpg'\u683c\u5f0f\n |-- pcm # \u97f3\u9891pcm, \u97f3\u9891\u91c7\u6837\u738716000\uff0c\u91c7\u7528\u901a\u9053\u65701\n |-- url.list # \u89c6\u9891\u5217\u8868\n |-- url_val.list # \u89c6\u9891\u5217\u8868\n |-- label_cls8_train.json # \u8bad\u7ec3\u96c6\u539f\u59cbgts\n |-- label_cls8_val.json # \u9a8c\u8bc1\u96c6\u539f\u59cbgts\n |-- label.json # \u52a8\u4f5clabel\n |-- script # \u6570\u636e\u96c6\u5904\u7406\u811a\u672c\n |-- predict # \u6a21\u578b\u9884\u6d4b\u4ee3\u7801\n |-- extractor # \u7279\u5f81\u63d0\u53d6\u811a\u672c\n |-- train_lstm # lstm\u8bad\u7ec3\u4ee3\u7801\n |-- train_proposal # pptsm\u3001bmn\u8bad\u7ec3\u4ee3\u7801\n |-- configs # pptsm\u3001bmn\u914d\u7f6e\u6587\u4ef6\n```\n" + }, + { + "comment": "This code explains the steps to download a pre-trained model, run prediction, and perform advanced usage including training PP-TSM for the FootballAction application in PaddleVideo. It requires following specific commands and using provided scripts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":141-200", + "content": "## 4. \u5feb\u901f\u4f53\u9a8c\n\u9996\u5148\uff0c\u901a\u8fc7\u4ee5\u4e0b\u547d\u4ee4\uff0c\u4e0b\u8f7d\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u6587\u4ef6\uff1a\n```bash\ncd checkpoints\nsh download.sh\n```\n\u8fd0\u884c\u9884\u6d4b\u4ee3\u7801\uff1a\n```\ncd ${FootballAction_root}/predict && python predict.py\n```\n\u4ea7\u51fa\u6587\u4ef6\uff1aresults.json\n\n## 5. 
\u8fdb\u9636\u4f7f\u7528\n\n### 5.1 \u6a21\u578b\u8bad\u7ec3\n\u91c7\u6837\u65b9\u5f0f\uff1a\n- image \u91c7\u6837\u9891\u7387fps=5\uff0c\u5982\u679c\u6709\u4e9b\u52a8\u4f5c\u65f6\u95f4\u8f83\u77ed\uff0c\u53ef\u4ee5\u9002\u5f53\u63d0\u9ad8\u91c7\u6837\u9891\u7387\n- BMN windows=200\uff0c\u537340s\uff0c\u6240\u4ee5\u6d4b\u8bd5\u81ea\u5df1\u7684\u6570\u636e\u65f6\uff0c\u89c6\u9891\u65f6\u957f\u9700\u5927\u4e8e40s\n\u8bf7\u5148\u53c2\u8003[\u4f7f\u7528\u8bf4\u660e](../../docs/zh-CN/usage.md)\u4e86\u89e3PaddleVideo\u6a21\u578b\u5e93\u7684\u4f7f\u7528\u3002\n#### step1 PP-TSM\u8bad\u7ec3\nPP-TSM\u6a21\u578b\u4f7f\u7528\u6587\u6863\u53c2\u8003[PP-TSM](../../docs/zh-CN/model_zoo/recognition/pp-tsm.md)\n##### step1.1 PP-TSM \u8bad\u7ec3\u6570\u636e\u5904\u7406\n\u4f7f\u7528\u5982\u4e0b\u547d\u4ee4\u7ed3\u5408frames\u548cgts\u751f\u6210\u8bad\u7ec3\u6240\u9700\u8981\u7684\u6b63\u8d1f\u6837\u672c:\n```bash\ncd datasets/script && python get_instance_for_pptsm.py\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u6570\u636e\u96c6\u548c\u6570\u636e\u5904\u7406\u811a\u672c\n |-- EuroCup2016 # \u6570\u636e\u5b58\u653e\u8def\u5f84\n |-- input_for_pptsm # pptsm\u8bad\u7ec3\u7684\u6570\u636e\n```\n\u6587\u4ef6\u6309\u7167\u5982\u4e0b\u683c\u5f0f\u547d\u540d\uff1a\n```\n'{}_{}_{}_{}'.format(video_basename, start_id, end_id, label)\n```\n##### step1.2 PP-TSM\u6a21\u578b\u8bad\u7ec3\n\u8bad\u7ec3\u542f\u52a8\u547d\u4ee4\u5982\u4e0b\uff1a\n```bash\ncd ${FootballAction_root}\ncd ../.. #\u8fdb\u5165PaddleVideo\u76ee\u5f55\u4e0b\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=./football/logs_pptsm \\\n main.py \\" + }, + { + "comment": "This code is updating the `recognizer2d.py` file in PaddleVideo, modifying the `__init__` and `infer_step` functions. 
Then it executes a command to export the PP-TSM inference model using the provided configuration file and best model parameters from previous training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":201-229", + "content": " --validate \\\n -c applications/FootballAction/train_proposal/configs/pptsm_football_v2.0.yaml \\\n -o output_dir=./football/pptsm\n```\n\u6211\u4eec\u4e5f\u63d0\u4f9b\u4e86\u8bad\u7ec3\u597d\u7684PP-TSM\u6a21\u578b\uff0c\u4e0b\u8f7d\u94fe\u63a5\u5df2\u5728\u5feb\u901f\u4f53\u9a8c\u7ae0\u8282\u4e2d\u7ed9\u51fa\u3002\n##### step1.3 \u5bfc\u51faPP-TSM\u63a8\u7406\u6a21\u578b\n\u5728\u8f6c\u4e3a\u9884\u6d4b\u6a21\u5f0f\u524d\uff0c\u9700\u8981\u4fee\u6539 `PaddleVideo/paddlevideo/modeling/framework/recognizers/recognizer2d.py` \u6587\u4ef6\uff0c\u5c06 init \u548c infer_step \u51fd\u6570\u5206\u522b\u66f4\u65b0\u4e3a\u5982\u4e0b\u4ee3\u7801\uff1a\n```python\n def __init__(self, backbone=None, head=None):\n super().__init__(backbone=backbone, head=head)\n self.avgpool2d = paddle.nn.AdaptiveAvgPool2D((1, 1), data_format='NCHW')\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))\n feature = self.backbone(imgs)\n feat = self.avgpool2d(feature)\n return feat\n```\n\u518d\u6267\u884c\u5982\u4e0b\u547d\u4ee4\uff1a\n```bash\ncd ${PaddleVideo_root}\npython tools/export_model.py -c applications/FootballAction/train_proposal/configs/pptsm_football_v2.0.yaml \\\n -p ./football/pptsm/ppTSM_best.pdparams \\" + }, + { + "comment": "In this code snippet, we are replacing a line of code in `pptsm_infer.py` to change the output tensor from the second to the first output name. This is followed by commands to extract image and audio features using the modified code, which are stored in the \"features\" folder within the respective dataset. 
These features will be used in the training of a BMN model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":230-274", + "content": " -o ./football/inference_model\n```\n##### step1.4 \u57fa\u4e8ePP-TSM\u7684\u89c6\u9891\u7279\u5f81\u63d0\u53d6\n\u5c06 `PaddleVideo/applications/FootballAction/predict/action_detect/models/pptsm_infer.py` \u6587\u4ef6\u4e2d41\u884c\u7684\n```python\nself.output_tensor = self.predictor.get_output_handle(output_names[1])\n```\n\u66ff\u6362\u4e3a\n```python\nself.output_tensor = self.predictor.get_output_handle(output_names[0])\n```\n\u4f7f\u7528\u5982\u4e0b\u547d\u4ee4\u8fdb\u884cimage\u548caudio\u7279\u5f81\u7684\u63d0\u53d6\uff0c\u9ed8\u8ba4\u4f7f\u7528\u4e0b\u8f7d\u7684\u6a21\u578b\u8fdb\u884c\u7279\u5f81\u63d0\u53d6\uff0c\u5982\u679c\u4f7f\u7528\u81ea\u5df1\u6570\u636e\u8bad\u7ec3\u7684\u6a21\u578b\uff0c\u8bf7\u6ce8\u610f\u4fee\u6539\u914d\u7f6e\u6587\u4ef6\u4e2d\u6a21\u578b\u7684\u6587\u4ef6\u8def\u5f84:\n```bash\ncd ${FootballAcation}\ncd extractor && python extract_feat.py\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- EuroCup2016 # \u6570\u636e\u96c6\n |-- features # \u89c6\u9891\u7684\u56fe\u50cf+\u97f3\u9891\u7279\u5f81\n```\n\u63a8\u7406\u7279\u5f81\u4ee5pkl\u6587\u4ef6\u4fdd\u5b58\uff0c\u683c\u5f0f\u5982\u4e0b\uff1a\n```txt\n# \u7279\u5f81\u7ef4\u5ea6, image(2048) + audio(1024)\nvideo_features = {'image_feature': np_image_features,\n 'audio_feature': np_audio_features}\n```\n\u6b64\u7279\u5f81\u63a5\u4e0b\u6765\u4f1a\u7528\u4e8eBMN\u6a21\u578b\u7684\u8bad\u7ec3\u3002\n#### step2 BMN\u8bad\u7ec3\nBMN\u6a21\u578b\u4f7f\u7528\u6587\u6863\u53c2\u8003[BMN](../../docs/zh-CN/model_zoo/localization/bmn.md)\n##### step2.1 BMN\u8bad\u7ec3\u6570\u636e\u5904\u7406\n\u4f7f\u7528\u5982\u4e0b\u547d\u4ee4\u5f97\u5230BMN\u8bad\u7ec3\u6240\u9700\u8981\u7684\u6570\u636e\u96c6\uff0c\u9ed8\u8ba4\u4f7f\u7528windows=40\uff0c\u6839\u636egts\u548c\u7279\u5f81\u5f97\u5230\u8bad\u7ec3\u6240\u9700\u7684proposal\uff1a\n```bash" + }, + { + "comment": "This code changes the directory to \"FootballAction/datasets/script\" and runs a Python script named get_instance_for_bmn.py, which creates a dataset for BMN (Bounding Box Regression) model training. 
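Each video's extracted features are saved as a pickle holding the 2048-d image sequence (fps=5) and the 1024-d per-second audio sequence, which are later windowed into training samples for BMN (Boundary-Matching Network). A minimal, hedged reader with a placeholder path:

```python
import pickle
import numpy as np

def inspect_features(pkl_path):
    """Illustrative reader for the per-video feature pickle described above:
    an 'image_feature' sequence (2048-d) and an 'audio_feature' sequence (1024-d)."""
    with open(pkl_path, "rb") as f:
        video_features = pickle.load(f)
    image_feat = np.asarray(video_features["image_feature"])
    audio_feat = np.asarray(video_features["audio_feature"])
    print("image:", image_feat.shape, "audio:", audio_feat.shape)

# inspect_features("datasets/EuroCup2016/features/<video_name>.pkl")  # placeholder path
```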
The resulting data is stored in the datasets folder with the instance information saved as JSON files within the input_for_bmn directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":275-319", + "content": "cd FootballAction/datasets/script && python get_instance_for_bmn.py\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- EuroCup2016 # \u6570\u636e\u96c6\n |-- input_for_bmn # bmn\u8bad\u7ec3\u7684proposal\n |-- feature\n |-- label.json \n```\n\u7279\u5f81\u6587\u4ef6\u4fdd\u5b58\u5728`label.json`\u6587\u4ef6\u4e2d\uff0c\u6570\u636e\u683c\u5f0f\u5982\u4e0b\uff1a\n```txt\n{\n \"719b0a4bcb1f461eabb152298406b861_753_793\": {\n \"duration_second\": 40.0,\n \"duration_frame\": 200,\n \"feature_frame\": 200,\n \"subset\": \"train\",\n \"annotations\": [\n {\n \"segment\": [\n 15.0,\n 22.0\n ],\n \"label\": \"3.0\",\n \"label_name\": \"\u4efb\u610f\u7403\"\n }\n ]\n },\n ...\n}\n```\n##### step2.2 BMN\u6a21\u578b\u8bad\u7ec3\n\u8bad\u7ec3\u542f\u52a8\u547d\u4ee4\u5982\u4e0b\uff1a\n```bash\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1\" \\\n --log_dir=./football/logs_bmn \\\n main.py \\\n --validate \\\n -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml \\\n -o output_dir=./football/bmn" + }, + { + "comment": "Step 2.3: Export BMN inference model with command `python tools/export_model.py -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml -p ./football/bmn/BMN_epoch_00016.pdparams -o ./football/inference_model`.\nStep 2.4: Use command `cd extractor && python extract_bmn.py` to predict BMN proposal information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":320-361", + "content": "```\n\u6211\u4eec\u4e5f\u63d0\u4f9b\u4e86\u8bad\u7ec3\u597d\u7684BMN\u6a21\u578b\uff0c\u4e0b\u8f7d\u94fe\u63a5\u5df2\u5728\u5feb\u901f\u4f53\u9a8c\u7ae0\u8282\u4e2d\u7ed9\u51fa\u3002\n##### step2.3 \u5bfc\u51faBMN\u63a8\u7406\u6a21\u578b\n\u6a21\u578b\u5bfc\u51fa\u547d\u4ee4\u5982\u4e0b:\n```bash\npython tools/export_model.py -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml \\\n -p ./football/bmn/BMN_epoch_00016.pdparams \\\n -o ./football/inference_model\n```\n##### step2.4 BMN\u6a21\u578b\u9884\u6d4b\n\u4f7f\u7528\u5982\u4e0b\u547d\u4ee4\u8fdb\u884c\u9884\u6d4b\uff0c\u5f97\u5230\u52a8\u4f5cproposal\u4fe1\u606f\uff1a start_id, end_id, score\u3002\u5982\u679c\u4f7f\u7528\u81ea\u5df1\u6570\u636e\u8bad\u7ec3\u7684\u6a21\u578b\uff0c\u8bf7\u6ce8\u610f\u4fee\u6539\u914d\u7f6e\u6587\u4ef6\u4e2d\u6a21\u578b\u7684\u6587\u4ef6\u8def\u5f84:\n```\ncd extractor && python extract_bmn.py\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- EuroCup2016 # \u6570\u636e\u96c6\n |-- feature_bmn\n |-- prop.json # bmn \u9884\u6d4b\u7ed3\u679c\n```\n\u9884\u6d4b\u7ed3\u679c\u6570\u636e\u683c\u5f0f\u5982\u4e0b\uff1a\n```txt\n[\n {\n \"video_name\": \"c9516c903de3416c97dae91a59e968d7\",\n \"num_proposal\": 5534,\n \"bmn_results\": [\n {\n \"start\": 7850.0,\n \"end\": 7873.0,\n \"score\": 0.77194699622342\n },\n {\n \"start\": 4400.0,\n \"end\": 4443.0,\n \"score\": 0.7663803287641536" + }, + { + "comment": "This code is part of the Attention LSTM model training process in the FootballAction 
application. It mentions a few improvements made to the original AttentionLSTM model, such as using different hidden sizes for different modal features and adding a regression branch for IOU. The code also discusses processing training data for LSTM training and provides an example of the label_info.json format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":362-407", + "content": " },\n ...\n ]\n },\n ...\n]\n```\n#### step3 LSTM\u8bad\u7ec3\nAttentionLSTM\u6a21\u578b\u4f7f\u7528\u6587\u6863\u53c2\u8003[AttentionLSTM](../../docs/zh-CN/model_zoo/localization/bmn.md)\uff0c\u6b64\u5904\u6211\u4eec\u5bf9\u539f\u59cb\u5bf9AttentionLSTM\u6a21\u578b\u8fdb\u884c\u4e86\u6539\u8fdb\uff0c\u5305\u62ec\uff1a\n1. \u4e0d\u540c\u6a21\u6001\u7279\u5f81\u5728LSTM\u4e2d\u4f7f\u7528\u4e0d\u540c\u7684hiddne_size\n2. \u52a0\u5165\u4e86\u4e00\u4e2a\u56de\u5f52\u5206\u652f\u7528\u4e8e\u56de\u5f52iou\n3. \u6a21\u578b\u4e2d\u52a0\u5165\u4e86BN\u5c42\u6291\u5236\u8fc7\u62df\u5408\n##### step3.1 LSTM\u8bad\u7ec3\u6570\u636e\u5904\u7406\n\u5c06BMN\u5f97\u5230\u7684proposal\u622a\u65ad\u5e76\u5904\u7406\u6210LSTM\u8bad\u7ec3\u6240\u9700\u6570\u636e\u96c6\u3002\u540c\u7406\uff0c\u6ce8\u610f\u6570\u636e\u96c6\u6587\u4ef6\u4fee\u6539\u8def\u5f84\u3002\n```\ncd datasets/script && python get_instance_for_lstm.py\n```\n\u5b8c\u6210\u8be5\u6b65\u9aa4\u540e\uff0c\u6570\u636e\u5b58\u50a8\u4f4d\u7f6e\n```\n |-- datasets # \u8bad\u7ec3\u6570\u636e\u96c6\u548c\u5904\u7406\u811a\u672c\n |-- EuroCup2016 # \u6570\u636e\u96c6\n |-- input_for_lstm # lstm\u8bad\u7ec3\u7684proposal\n \u251c\u2500\u2500 feature # \u7279\u5f81\n \u251c\u2500\u2500 label_info.json # \u6807\u7b7e\u4fe1\u606f\n \u251c\u2500\u2500 train.txt # \u8bad\u7ec3\u6587\u4ef6\u5217\u8868\n \u2514\u2500\u2500 val.txt # \u6d4b\u8bd5\u6587\u4ef6\u5217\u8868\n```\n- `label_info.json`\u6570\u636e\u683c\u5f0f\u5982\u4e0b\uff1a\n```\n{\n \"fps\": 5,\n \"results\": [\n {\n \"url\": \"https://xxx.mp4\",\n \"mode\": \"train\", # train or validation\n \"total_frames\": 6128,\n \"num_gts\": 93,\n \"num_proposals\": 5043,\n \"proposal_actions\": [" + }, + { + "comment": "This code snippet represents the structure of a single data sample used in training an LSTM model for action recognition. The data includes image and audio features, frame-level labels, and metrics like IOU (Intersection over Union) and IOA (Intersection over Area). The \"proposal\" field contains start, end, and score values to define the segment of interest. 
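The norm_iou and norm_ioa fields can be reproduced from a proposal's frame range and the hit ground-truth segment. The sketch below assumes IoU over the union and IoA over the annotated segment's length, which matches the sample values shown (about 0.76 for both); the exact definition used by the repository's scripts may differ.

```python
def segment_iou_ioa(prop_start, prop_end, gt_start, gt_end):
    """Sketch of the overlap measures for a proposal and a ground-truth segment
    (frame ids). IoA is assumed to be intersection over the gt segment length."""
    inter = max(0, min(prop_end, gt_end) - max(prop_start, gt_start))
    union = (prop_end - prop_start) + (gt_end - gt_start) - inter
    iou = inter / union if union > 0 else 0.0
    ioa = inter / (gt_end - gt_start) if gt_end > gt_start else 0.0
    return iou, ioa

# Proposal 5011-5036 against the hit gt 5003-5036 from the sample above:
print(segment_iou_ioa(5011, 5036, 5003, 5036))  # (0.7575..., 0.7575...)
```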
The \"hit_gts\" field provides ground truth information about the labeled action segments within the sample.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":408-440", + "content": " {\n \"label\": 6,\n \"norm_iou\": 0.7575757575757576,\n \"norm_ioa\": 0.7575757575757576,\n \"norm_start\": -0.32,\n \"proposal\": {\n \"start\": 5011,\n \"end\": 5036,\n \"score\": 0.7723643666324231\n },\n \"hit_gts\": {\n \"label_ids\": [\n 6\n ],\n \"label_names\": [\n \"\u6362\u4eba\"\n ],\n \"start_id\": 5003,\n \"end_id\": 5036\n }\n },\n ...\n },\n ...\n}\n```\n- LSTM\u8bad\u7ec3\u6240\u9700\u8981\u7684feature\u6570\u636e\u683c\u5f0f\u5982\u4e0b:\n```\n{\n 'features': np.array(feature_hit, dtype=np.float32), # iamge\u548caudio \u7279\u5f81\n 'feature_fps': 5, # fps = 5\n 'label_info': {'norm_iou': 0.5, 'label': 3, ...}, # \u6570\u636e\u683c\u5f0f1\u4e2d\u7684'proposal_actions'" + }, + { + "comment": "This code represents the data format for listing necessary files for LSTM training, and provides commands for launching LSTM training, exporting the trained model for inference, running predictions with default or custom models, and evaluating the model's performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":441-492", + "content": " 'video_name': 'c9516c903de3416c97dae91a59e968d7' # video_name\n}\n```\n- LSTM\u8bad\u7ec3\u6240\u9700\u6587\u4ef6\u5217\u8868\u6570\u636e\u683c\u5f0f\u5982\u4e0b\uff1a\n```\n'{} {}'.format(filename, label)\n```\n##### step3.2 LSTM\u8bad\u7ec3\n\u8bad\u7ec3\u542f\u52a8\u547d\u4ee4\u5982\u4e0b:\n```bash\npython -B -m paddle.distributed.launch \\\n --gpus=\"0,1,2,3\" \\\n --log_dir=./football/logs_lstm \\\n main.py \\\n --validate \\\n -c applications/FootballAction/train_proposal/configs/lstm_football.yaml \\\n -o output_dir=./football/lstm\n```\n##### step3.3 \u5bfc\u51faLSTM\u63a8\u7406\u6a21\u578b\n\u6a21\u578b\u5bfc\u51fa\u547d\u4ee4\u5982\u4e0b:\n```bash\npython tools/export_model.py -c applications/FootballAction/train_proposal/configs/lstm_football.yaml \\\n -p ./football/lstm/AttentionLSTM_best.pdparams \\\n -o ./football/inference_model\n```\n\n### 5.2 \u6a21\u578b\u63a8\u7406\n\u8fd0\u884c\u9884\u6d4b\u4ee3\u7801\n```\ncd predict && python predict.py\n```\n- \u9ed8\u8ba4\u4f7f\u7528\u6211\u4eec\u63d0\u4f9b\u7684\u4e8e\u8bad\u7ec3\u6587\u4ef6\u8fdb\u884c\u9884\u6d4b\uff0c\u5982\u4f7f\u7528\u4e2a\u4eba\u8bad\u7ec3\u7684\u6a21\u578b\u6587\u4ef6\uff0c\u8bf7\u5bf9\u5e94\u4fee\u6539[\u914d\u7f6e\u6587\u4ef6](./predict/configs/configs.yaml)\u4e2d\u7684\u53c2\u6570\u8def\u5f84\n- \u4ea7\u51fa\u6587\u4ef6\uff1aresults.json\n\n### 5.3 \u6a21\u578b\u8bc4\u4f30\n```\n# \u5305\u62ecbmn proposal \u8bc4\u4f30\u548c\u6700\u7ec8action\u8bc4\u4f30\ncd predict && python eval.py results.json\n```\n" + }, + { + "comment": "The code discusses model optimization, where the base feature model (image) is replaced with PP-TSM, resulting in a 94% accuracy improvement. The audio base feature remains unchanged. This leads to an F1-score increase from 0.57 to 0.82. 
It also mentions model deployment and provides references for related papers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/README.md\":493-512", + "content": "### 5.4 \u6a21\u578b\u4f18\u5316\n- \u57fa\u7840\u7279\u5f81\u6a21\u578b\uff08\u56fe\u50cf\uff09\u66ff\u6362\u4e3aPP-TSM\uff0c\u51c6\u786e\u7387\u753184%\u63d0\u5347\u523094%\n- \u57fa\u7840\u7279\u5f81\u6a21\u578b\uff08\u97f3\u9891\uff09\u6ca1\u53d8\u52a8\n- \u51c6\u786e\u7387\u63d0\u5347\uff0cprecision\u548crecall\u5747\u6709\u5927\u5e45\u63d0\u5347\uff0cF1-score\u4ece0.57\u63d0\u5347\u52300.82\n\n### 5.5 \u6a21\u578b\u90e8\u7f72\n\u672c\u4ee3\u7801\u89e3\u51b3\u65b9\u6848\u5728\u52a8\u4f5c\u7684\u68c0\u6d4b\u548c\u53ec\u56de\u6307\u6807F1-score=82%\n\n### 6. \u53c2\u8003\u8bba\u6587\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a1806a3b-995a-43b5-8dff-6baa6c3fb101.json b/docs/doc/a1806a3b-995a-43b5-8dff-6baa6c3fb101.json new file mode 100644 index 000000000..14f8415c4 --- /dev/null +++ b/docs/doc/a1806a3b-995a-43b5-8dff-6baa6c3fb101.json @@ -0,0 +1,85 @@ +{ + "summary": "This code calculates delta between bounding boxes, adjusts using weighted averages, provides functions for filtering, computing overlaps, generating anchor points, decoding YOLO boxes, and calculating IoU. It transforms coordinates, computes deltas, stacks results, calculates dimensions and center of rotated boxes, converts rectangles to polygons, and finds the best begin point for a coordinate.", + "details": [ + { + "comment": "This code calculates the delta between source and target bounding boxes. It first computes the width and height of both source and target boxes, then their center coordinates. The variables are initialized and calculated for further usage in other functions related to bounding box transformation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":0-29", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn.functional as F\nimport math\nimport numpy as np\ndef bbox2delta(src_boxes, tgt_boxes, weights):\n src_w = src_boxes[:, 2] - src_boxes[:, 0]\n src_h = src_boxes[:, 3] - src_boxes[:, 1]\n src_ctr_x = src_boxes[:, 0] + 0.5 * src_w\n src_ctr_y = src_boxes[:, 1] + 0.5 * src_h\n tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]\n tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]\n tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w" + }, + { + "comment": "This code calculates the differentials (deltas) between target and source bounding boxes, then converts those deltas back into new bounding box coordinates. The conversion is done with weighted averages for x, y, width, and height adjustments, ensuring values are clipped to avoid large inputs for paddle.exp().", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":30-62", + "content": " tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h\n wx, wy, ww, wh = weights\n dx = wx * (tgt_ctr_x - src_ctr_x) / src_w\n dy = wy * (tgt_ctr_y - src_ctr_y) / src_h\n dw = ww * paddle.log(tgt_w / src_w)\n dh = wh * paddle.log(tgt_h / src_h)\n deltas = paddle.stack((dx, dy, dw, dh), axis=1)\n return deltas\ndef delta2bbox(deltas, boxes, weights):\n clip_scale = math.log(1000.0 / 16)\n widths = boxes[:, 2] - boxes[:, 0]\n heights = boxes[:, 3] - boxes[:, 1]\n ctr_x = boxes[:, 0] + 0.5 * widths\n ctr_y = boxes[:, 1] + 0.5 * heights\n wx, wy, ww, wh = weights\n dx = deltas[:, 0::4] / wx\n dy = deltas[:, 1::4] / wy\n dw = deltas[:, 2::4] / ww\n dh = deltas[:, 3::4] / wh\n # Prevent sending too large values into paddle.exp()\n dw = paddle.clip(dw, max=clip_scale)\n dh = paddle.clip(dh, max=clip_scale)\n pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)\n pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)\n pred_w = paddle.exp(dw) * widths.unsqueeze(1)\n pred_h = paddle.exp(dh) * heights.unsqueeze(1)" + }, + { + "comment": "The code contains three functions: `expand_bbox`, `clip_bbox`, and `nonempty_bbox`. `expand_bbox` takes bbox coordinates, scales them by a factor, and returns the expanded bboxes. `clip_bbox` clips the bbox coordinates to the image shape boundaries. 
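The delta encoding and decoding summarized above round-trip exactly; here is a NumPy sketch of the same formulas for a single box with unit weights, kept separate from the tensor implementations in the file.

```python
import numpy as np

def encode(src, tgt, weights=(1.0, 1.0, 1.0, 1.0)):
    # src/tgt: [x1, y1, x2, y2]; mirrors the bbox2delta formulas above.
    sw, sh = src[2] - src[0], src[3] - src[1]
    scx, scy = src[0] + 0.5 * sw, src[1] + 0.5 * sh
    tw, th = tgt[2] - tgt[0], tgt[3] - tgt[1]
    tcx, tcy = tgt[0] + 0.5 * tw, tgt[1] + 0.5 * th
    wx, wy, ww, wh = weights
    return np.array([wx * (tcx - scx) / sw, wy * (tcy - scy) / sh,
                     ww * np.log(tw / sw), wh * np.log(th / sh)])

def decode(src, delta, weights=(1.0, 1.0, 1.0, 1.0)):
    # mirrors delta2bbox, without the clipping applied in the tensor version.
    sw, sh = src[2] - src[0], src[3] - src[1]
    scx, scy = src[0] + 0.5 * sw, src[1] + 0.5 * sh
    wx, wy, ww, wh = weights
    cx = delta[0] / wx * sw + scx
    cy = delta[1] / wy * sh + scy
    w = np.exp(delta[2] / ww) * sw
    h = np.exp(delta[3] / wh) * sh
    return np.array([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h])

src = np.array([10.0, 10.0, 50.0, 30.0])
tgt = np.array([12.0, 8.0, 60.0, 40.0])
print(np.allclose(decode(src, encode(src, tgt)), tgt))  # True
```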
`nonempty_bbox` filters out empty bounding boxes based on a minimum size threshold or returns a mask if return_mask is True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":64-101", + "content": " pred_boxes = []\n pred_boxes.append(pred_ctr_x - 0.5 * pred_w)\n pred_boxes.append(pred_ctr_y - 0.5 * pred_h)\n pred_boxes.append(pred_ctr_x + 0.5 * pred_w)\n pred_boxes.append(pred_ctr_y + 0.5 * pred_h)\n pred_boxes = paddle.stack(pred_boxes, axis=-1)\n return pred_boxes\ndef expand_bbox(bboxes, scale):\n w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5\n h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5\n x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5\n y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5\n w_half *= scale\n h_half *= scale\n bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32)\n bboxes_exp[:, 0] = x_c - w_half\n bboxes_exp[:, 2] = x_c + w_half\n bboxes_exp[:, 1] = y_c - h_half\n bboxes_exp[:, 3] = y_c + h_half\n return bboxes_exp\ndef clip_bbox(boxes, im_shape):\n h, w = im_shape[0], im_shape[1]\n x1 = boxes[:, 0].clip(0, w)\n y1 = boxes[:, 1].clip(0, h)\n x2 = boxes[:, 2].clip(0, w)\n y2 = boxes[:, 3].clip(0, h)\n return paddle.stack([x1, y1, x2, y2], axis=1)\ndef nonempty_bbox(boxes, min_size=0, return_mask=False):" + }, + { + "comment": "The function `bbox_utils.py` at line 102-138 contains two functions: 'filter_boxes_by_size' and 'bbox_overlaps'.\n'filter_boxes_by_size' filters the bounding boxes by size, only keeping those whose width or height exceeds a specified minimum size. If a mask is also desired, it returns one of true values for the selected bounding boxes.\n'bbox_overlaps' calculates overlaps between two sets of bounding boxes and returns them as a tensor with shape [M, N]. This function uses the areas of the bounding boxes to compute the intersections and unions, applying clipping for valid computations and handling non-intersecting boxes.\n\nExplanation: The code contains functions that filter bounding boxes by size and calculate overlaps between them. The 'filter_boxes_by_size' function filters bounding boxes based on their width or height, while the 'bbox_overlaps' function calculates overlap between two sets of bounding boxes and returns a tensor with shape [M, N].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":102-138", + "content": " w = boxes[:, 2] - boxes[:, 0]\n h = boxes[:, 3] - boxes[:, 1]\n mask = paddle.logical_and(w > min_size, w > min_size)\n if return_mask:\n return mask\n keep = paddle.nonzero(mask).flatten()\n return keep\ndef bbox_area(boxes):\n return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])\ndef bbox_overlaps(boxes1, boxes2):\n \"\"\"\n Calculate overlaps between boxes1 and boxes2\n Args:\n boxes1 (Tensor): boxes with shape [M, 4]\n boxes2 (Tensor): boxes with shape [N, 4]\n Return:\n overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]\n \"\"\"\n area1 = bbox_area(boxes1)\n area2 = bbox_area(boxes2)\n xy_max = paddle.minimum(\n paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])\n xy_min = paddle.maximum(\n paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])\n width_height = xy_max - xy_min\n width_height = width_height.clip(min=0)\n inter = width_height.prod(axis=2)\n overlaps = paddle.where(inter > 0, inter /\n (paddle.unsqueeze(area1, 1) + area2 - inter)," + }, + { + "comment": "The code defines functions for converting box coordinates, generating a grid of anchor points, and decoding YOLO bounding boxes. 
The \"xywh2xyxy\" function transforms (x, y, w, h) to (x1, y1, x2, y2). The \"make_grid\" function generates a grid of coordinates for downsampled images. The \"decode_yolo\" function decodes YOLO bounding boxes using anchor points and downsample ratios.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":139-175", + "content": " paddle.zeros_like(inter))\n return overlaps\ndef xywh2xyxy(box):\n x, y, w, h = box\n x1 = x - w * 0.5\n y1 = y - h * 0.5\n x2 = x + w * 0.5\n y2 = y + h * 0.5\n return [x1, y1, x2, y2]\ndef make_grid(h, w, dtype):\n yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])\n return paddle.stack((xv, yv), 2).cast(dtype=dtype)\ndef decode_yolo(box, anchor, downsample_ratio):\n \"\"\"decode yolo box\n Args:\n box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n anchor (list): anchor with the shape [na, 2]\n downsample_ratio (int): downsample ratio, default 32\n scale (float): scale, default 1.\n Return:\n box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]\n \"\"\"\n x, y, w, h = box\n na, grid_h, grid_w = x.shape[1:4]\n grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))\n x1 = (x + grid[:, :, :, :, 0:1]) / grid_w\n y1 = (y + grid[:, :, :, :, 1:2]) / grid_h\n anchor = paddle.to_tensor(anchor)" + }, + { + "comment": "Code defines anchor and calculates width (w1) and height (h1) for bounding boxes based on exponential values of w and h, downsample ratio, and grid dimensions. It also includes a function iou_similarity that calculates the intersection over union (IoU) between two sets of bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":176-203", + "content": " anchor = paddle.cast(anchor, x.dtype)\n anchor = anchor.reshape((1, na, 1, 1, 2))\n w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)\n h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)\n return [x1, y1, w1, h1]\ndef iou_similarity(box1, box2, eps=1e-9):\n \"\"\"Calculate iou of box1 and box2\n Args:\n box1 (Tensor): box with the shape [N, M1, 4]\n box2 (Tensor): box with the shape [N, M2, 4]\n Return:\n iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]\n \"\"\"\n box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4]\n box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4]\n px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]\n gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]\n x1y1 = paddle.maximum(px1y1, gx1y1)\n x2y2 = paddle.minimum(px2y2, gx2y2)\n overlap = (x2y2 - x1y1).clip(0).prod(-1)\n area1 = (px2y2 - px1y1).clip(0).prod(-1)\n area2 = (gx2y2 - gx1y1).clip(0).prod(-1)\n union = area1 + area2 - overlap + eps" + }, + { + "comment": "This function calculates the intersection over union (IoU) between two bounding boxes, box1 and box2. It supports various IoU metrics such as Giou, Diou, or Ciou. The calculated IoU is returned as a tensor with the same shape as box1 and box2. 
This function is used in object detection tasks to measure the overlap between predicted and ground truth bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":204-236", + "content": " return overlap / union\ndef bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):\n \"\"\"calculate the iou of box1 and box2\n Args:\n box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n giou (bool): whether use giou or not, default False\n diou (bool): whether use diou or not, default False\n ciou (bool): whether use ciou or not, default False\n eps (float): epsilon to avoid divide by zero\n Return:\n iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1]\n \"\"\"\n px1, py1, px2, py2 = box1\n gx1, gy1, gx2, gy2 = box2\n x1 = paddle.maximum(px1, gx1)\n y1 = paddle.maximum(py1, gy1)\n x2 = paddle.minimum(px2, gx2)\n y2 = paddle.minimum(py2, gy2)\n overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))\n area1 = (px2 - px1) * (py2 - py1)\n area1 = area1.clip(0)\n area2 = (gx2 - gx1) * (gy2 - gy1)\n area2 = area2.clip(0)\n union = area1 + area2 - overlap + eps" + }, + { + "comment": "This code calculates the Intersection over Union (IoU) between two bounding boxes and applies various forms of IoU calculations depending on the input parameters. It first checks if giou, ciou, or diou is True and then proceeds with the corresponding calculation based on the convex area, diagonal distance, or aspect ratio difference between the bounding boxes. The rect2rbox function transforms a set of bounding boxes from (xmin, ymin, xmax, ymax) format to (cx, cy, w, h) format where cx and cy are center coordinates and w and h are width and height respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":237-267", + "content": " iou = overlap / union\n if giou or ciou or diou:\n # convex w, h\n cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)\n ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)\n if giou:\n c_area = cw * ch + eps\n return iou - (c_area - union) / c_area\n else:\n # convex diagonal squared\n c2 = cw**2 + ch**2 + eps\n # center distance\n rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4\n if diou:\n return iou - rho2 / c2\n else:\n w1, h1 = px2 - px1, py2 - py1 + eps\n w2, h2 = gx2 - gx1, gy2 - gy1 + eps\n delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)\n v = (4 / math.pi**2) * paddle.pow(delta, 2)\n alpha = v / (1 + eps - iou + v)\n alpha.stop_gradient = True\n return iou - (rho2 / c2 + v * alpha)\n else:\n return iou\ndef rect2rbox(bboxes):\n \"\"\"\n :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax)" + }, + { + "comment": "This function converts bounding box coordinates and dimensions to rotation-aligned bounding boxes by calculating the center, edge lengths, and angle. 
It returns a new tensor with reshaped and rotated bounding boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":268-303", + "content": " :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle)\n \"\"\"\n bboxes = bboxes.reshape(-1, 4)\n num_boxes = bboxes.shape[0]\n x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0\n y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0\n edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0])\n edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1])\n angles = np.zeros([num_boxes], dtype=bboxes.dtype)\n inds = edges1 < edges2\n rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1)\n rboxes[inds, 2] = edges2[inds]\n rboxes[inds, 3] = edges1[inds]\n rboxes[inds, 4] = np.pi / 2.0\n return rboxes\ndef delta2rbox(Rrois,\n deltas,\n means=[0, 0, 0, 0, 0],\n stds=[1, 1, 1, 1, 1],\n wh_ratio_clip=1e-6):\n \"\"\"\n :param Rrois: (cx, cy, w, h, theta)\n :param deltas: (dx, dy, dw, dh, dtheta)\n :param means:\n :param stds:\n :param wh_ratio_clip:\n :return:\n \"\"\"\n means = paddle.to_tensor(means)\n stds = paddle.to_tensor(stds)\n deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]])" + }, + { + "comment": "This code computes the bounding box regression results for each proposed bounding box. It calculates the deltas and applies clipping to ensure they stay within reasonable bounds, then transforms these deltas into actual bounding box coordinates. The resulting bounding boxes are stacked in a tensor and returned as output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":304-338", + "content": " denorm_deltas = deltas * stds + means\n dx = denorm_deltas[:, 0]\n dy = denorm_deltas[:, 1]\n dw = denorm_deltas[:, 2]\n dh = denorm_deltas[:, 3]\n dangle = denorm_deltas[:, 4]\n max_ratio = np.abs(np.log(wh_ratio_clip))\n dw = paddle.clip(dw, min=-max_ratio, max=max_ratio)\n dh = paddle.clip(dh, min=-max_ratio, max=max_ratio)\n Rroi_x = Rrois[:, 0]\n Rroi_y = Rrois[:, 1]\n Rroi_w = Rrois[:, 2]\n Rroi_h = Rrois[:, 3]\n Rroi_angle = Rrois[:, 4]\n gx = dx * Rroi_w * paddle.cos(Rroi_angle) - dy * Rroi_h * paddle.sin(\n Rroi_angle) + Rroi_x\n gy = dx * Rroi_w * paddle.sin(Rroi_angle) + dy * Rroi_h * paddle.cos(\n Rroi_angle) + Rroi_y\n gw = Rroi_w * dw.exp()\n gh = Rroi_h * dh.exp()\n ga = np.pi * dangle + Rroi_angle\n ga = (ga + np.pi / 4) % np.pi - np.pi / 4\n ga = paddle.to_tensor(ga)\n gw = paddle.to_tensor(gw, dtype='float32')\n gh = paddle.to_tensor(gh, dtype='float32')\n bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1)\n return bboxes\ndef rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]):" + }, + { + "comment": "This code calculates the delta values between ground truth and proposals for bounding boxes, taking into account their widths, heights, angles, and applying normalization based on means and stds. 
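A compact NumPy sketch of that axis-aligned-to-rotated conversion (same convention as `rect2rbox`: the longer edge becomes the width, and a pi/2 angle marks the swap):

```python
import numpy as np

def rect_to_rbox(xmin, ymin, xmax, ymax):
    """Axis-aligned box -> (cx, cy, w, h, angle)."""
    cx, cy = (xmin + xmax) / 2.0, (ymin + ymax) / 2.0
    e1, e2 = abs(xmax - xmin), abs(ymax - ymin)
    if e1 >= e2:
        return [cx, cy, e1, e2, 0.0]
    return [cx, cy, e2, e1, np.pi / 2.0]  # taller than wide: swap edges

print(rect_to_rbox(0, 0, 2, 6))  # [1.0, 3.0, 6, 2, 1.5707963267948966]
```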
It also ensures that angle differences are within 0 to 2\u03c0 range before scaling by the inverse of PI.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":339-377", + "content": " \"\"\"\n Args:\n proposals:\n gt:\n means: 1x5\n stds: 1x5\n Returns:\n \"\"\"\n proposals = proposals.astype(np.float64)\n PI = np.pi\n gt_widths = gt[..., 2]\n gt_heights = gt[..., 3]\n gt_angle = gt[..., 4]\n proposals_widths = proposals[..., 2]\n proposals_heights = proposals[..., 3]\n proposals_angle = proposals[..., 4]\n coord = gt[..., 0:2] - proposals[..., 0:2]\n dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4])\n * coord[..., 1]) / proposals_widths\n dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4])\n * coord[..., 1]) / proposals_heights\n dw = np.log(gt_widths / proposals_widths)\n dh = np.log(gt_heights / proposals_heights)\n da = (gt_angle - proposals_angle)\n da = (da + PI / 4) % PI - PI / 4\n da /= PI\n deltas = np.stack([dx, dy, dw, dh, da], axis=-1)\n means = np.array(means, dtype=deltas.dtype)\n stds = np.array(stds, dtype=deltas.dtype)\n deltas = (deltas - means) / stds" + }, + { + "comment": "Function `bbox_decode` takes bbox predictions, anchors and means/stds as inputs. It returns decoded bounding boxes. It first converts the means and stds to tensors. Then for each image, it computes the bbox delta from the bbox predictions. It then transforms these deltas to actual bounding box coordinates using `delta2rbox` function. Finally, it reshapes the obtained bounding boxes and stores them in a list. The function returns a stack of all the bounding boxes for each image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":378-416", + "content": " deltas = deltas.astype(np.float32)\n return deltas\ndef bbox_decode(bbox_preds,\n anchors,\n means=[0, 0, 0, 0, 0],\n stds=[1, 1, 1, 1, 1]):\n \"\"\"decode bbox from deltas\n Args:\n bbox_preds: [N,H,W,5]\n anchors: [H*W,5]\n return:\n bboxes: [N,H,W,5]\n \"\"\"\n means = paddle.to_tensor(means)\n stds = paddle.to_tensor(stds)\n num_imgs, H, W, _ = bbox_preds.shape\n bboxes_list = []\n for img_id in range(num_imgs):\n bbox_pred = bbox_preds[img_id]\n # bbox_pred.shape=[5,H,W]\n bbox_delta = bbox_pred\n anchors = paddle.to_tensor(anchors)\n bboxes = delta2rbox(\n anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6)\n bboxes = paddle.reshape(bboxes, [H, W, 5])\n bboxes_list.append(bboxes)\n return paddle.stack(bboxes_list, axis=0)\ndef poly_to_rbox(polys):\n \"\"\"\n poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n to\n rotated_boxes:[x_ctr,y_ctr,w,h,angle]\n \"\"\"\n rotated_boxes = []\n for poly in polys:" + }, + { + "comment": "This code calculates the width, height, and angle of a rotated bounding box based on its eight points. It first converts the polyline to a numpy array, then calculates the lengths of two edges. The function then determines the maximum edge length as the width and the minimum edge length as the height. Based on these values, it computes the rotation angle using arctan2. Finally, it normalizes the rotation angle within a specified range using the norm_angle function. 
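A note on the angle handling: the wrapping expression `(da + pi/4) % pi - pi/4` used in `rbox2delta` (and, analogously, in `delta2rbox`) maps any angle difference into the interval [-pi/4, 3*pi/4) before it is divided by pi. A tiny sketch of just that step:

```python
import numpy as np

PI = np.pi

def wrap_angle(da):
    """Wrap an angle difference into [-pi/4, 3*pi/4)."""
    return (da + PI / 4) % PI - PI / 4

for a in (0.0, PI, -PI / 2, 2 * PI):
    print(round(wrap_angle(a), 4))
# 0.0, 0.0, 1.5708, 0.0 -- differences that are multiples of pi collapse to zero
```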
It also calculates the x and y coordinates of the box's center.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":417-446", + "content": " poly = np.array(poly[:8], dtype=np.float32)\n pt1 = (poly[0], poly[1])\n pt2 = (poly[2], poly[3])\n pt3 = (poly[4], poly[5])\n pt4 = (poly[6], poly[7])\n edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[\n 1]) * (pt1[1] - pt2[1]))\n edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[\n 1]) * (pt2[1] - pt3[1]))\n width = max(edge1, edge2)\n height = min(edge1, edge2)\n rbox_angle = 0\n if edge1 > edge2:\n rbox_angle = np.arctan2(\n np.float(pt2[1] - pt1[1]), np.float(pt2[0] - pt1[0]))\n elif edge2 >= edge1:\n rbox_angle = np.arctan2(\n np.float(pt4[1] - pt1[1]), np.float(pt4[0] - pt1[0]))\n def norm_angle(angle, range=[-np.pi / 4, np.pi]):\n return (angle - range[0]) % range[1] + range[0]\n rbox_angle = norm_angle(rbox_angle)\n x_ctr = np.float(pt1[0] + pt3[0]) / 2\n y_ctr = np.float(pt1[1] + pt3[1]) / 2" + }, + { + "comment": "This function `get_best_begin_point_single` takes a coordinate as input and calculates the minimum x and y coordinates (xmin, ymin) and maximum x and y coordinates (xmax, ymax). It then defines four different combinations of the four points in the coordinate and compares these combinations to the destination coordinate (dst_coordinate) by calculating the distance using `cal_line_length` function. The combination with the smallest distance is returned as the best begin point. The code also includes a force variable to handle potential edge cases where no valid begin point can be found.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":447-474", + "content": " rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle])\n rotated_boxes.append(rotated_box)\n ret_rotated_boxes = np.array(rotated_boxes)\n assert ret_rotated_boxes.shape[1] == 5\n return ret_rotated_boxes\ndef cal_line_length(point1, point2):\n import math\n return math.sqrt(\n math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2))\ndef get_best_begin_point_single(coordinate):\n x1, y1, x2, y2, x3, y3, x4, y4 = coordinate\n xmin = min(x1, x2, x3, x4)\n ymin = min(y1, y2, y3, y4)\n xmax = max(x1, x2, x3, x4)\n ymax = max(y1, y2, y3, y4)\n combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]],\n [[x4, y4], [x1, y1], [x2, y2], [x3, y3]],\n [[x3, y3], [x4, y4], [x1, y1], [x2, y2]],\n [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]]\n dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]\n force = 100000000.0\n force_flag = 0\n for i in range(4):\n temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \\" + }, + { + "comment": "This function `rbox2poly_single` takes a rectangle represented by its center coordinates, width, height, and angle, and converts it to a polygon representation. It first calculates the top-left and bottom-right coordinates of the rectangle. Then, it creates a 2x4 matrix representing the four corners of the rectangle. The function applies a rotation matrix to transform the rectangle into a rotated coordinate system. 
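The rotation step itself is a plain 2-D rotation matrix applied to the four corner offsets; a standalone NumPy sketch of the idea:

```python
import numpy as np

def rbox_to_poly(cx, cy, w, h, angle):
    """Rotate the corner offsets of a w x h box and shift by the centre."""
    xs = np.array([-w / 2, w / 2, w / 2, -w / 2])
    ys = np.array([-h / 2, -h / 2, h / 2, h / 2])
    R = np.array([[np.cos(angle), -np.sin(angle)],
                  [np.sin(angle),  np.cos(angle)]])
    pts = R.dot(np.vstack([xs, ys]))                      # 2 x 4 rotated offsets
    return np.stack([pts[0] + cx, pts[1] + cy], axis=1)   # 4 x 2 corners

print(rbox_to_poly(1.0, 1.0, 2.0, 1.0, np.pi / 2).round(2))
```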
Finally, it shifts the transformed coordinates by the center coordinates and returns the polygon representation as an array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":475-502", + "content": " + cal_line_length(combinate[i][1], dst_coordinate[1]) \\\n + cal_line_length(combinate[i][2], dst_coordinate[2]) \\\n + cal_line_length(combinate[i][3], dst_coordinate[3])\n if temp_force < force:\n force = temp_force\n force_flag = i\n if force_flag != 0:\n pass\n return np.array(combinate[force_flag]).reshape(8)\ndef rbox2poly_single(rrect):\n \"\"\"\n rrect:[x_ctr,y_ctr,w,h,angle]\n to\n poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n \"\"\"\n x_ctr, y_ctr, width, height, angle = rrect[:5]\n tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2\n # rect 2x4\n rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])\n R = np.array([[np.cos(angle), -np.sin(angle)],\n [np.sin(angle), np.cos(angle)]])\n # poly\n poly = R.dot(rect)\n x0, x1, x2, x3 = poly[0, :4] + x_ctr\n y0, y1, y2, y3 = poly[1, :4] + y_ctr\n poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)" + }, + { + "comment": "This function `rbox2poly` converts a list of rotation rectangles (rrects) into polygons (polys). It first calculates the top-left and bottom-right coordinates of each rrect. Then, it rotates the rectangle using the given angle. The function adjusts the poly points by adding the x_ctr and y_ctr values to obtain the final poly. It applies a single best begin point adjustment (`get_best_begin_point_single`) and adds the poly to the list of polys. Finally, it returns the array of polygons.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/bbox_utils.py\":503-527", + "content": " poly = get_best_begin_point_single(poly)\n return poly\ndef rbox2poly(rrects):\n \"\"\"\n rrect:[x_ctr,y_ctr,w,h,angle]\n to\n poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n \"\"\"\n polys = []\n for rrect in rrects:\n x_ctr, y_ctr, width, height, angle = rrect[:5]\n tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2\n rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])\n R = np.array([[np.cos(angle), -np.sin(angle)],\n [np.sin(angle), np.cos(angle)]])\n poly = R.dot(rect)\n x0, x1, x2, x3 = poly[0, :4] + x_ctr\n y0, y1, y2, y3 = poly[1, :4] + y_ctr\n poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)\n poly = get_best_begin_point_single(poly)\n polys.append(poly)\n polys = np.array(polys)\n return polys" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a1a73c0b-b4df-44d5-aa4a-4e011115f77f.json b/docs/doc/a1a73c0b-b4df-44d5-aa4a-4e011115f77f.json new file mode 100644 index 000000000..fa5719e06 --- /dev/null +++ b/docs/doc/a1a73c0b-b4df-44d5-aa4a-4e011115f77f.json @@ -0,0 +1,35 @@ +{ + "summary": "The code creates a Qt application UI with video display, QGraphicsView and push button, along with complex design elements including buttons, sliders, panels, tabs, labels, and progress bars.", + "details": [ + { + "comment": "This code defines a user interface for a main window with a central widget, a frame for displaying video content, and a QGraphicsView to render the video. 
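For experimentation, a Designer file like this can be loaded directly with PyQt5's `uic` module; this is only a hedged sketch for previewing the layout, and the EIVideo application itself may wire the UI up differently (for example via `pyuic`-generated code):

```python
import sys
from PyQt5 import QtWidgets, uic

app = QtWidgets.QApplication(sys.argv)
# Adjust the path to match your checkout layout.
window = uic.loadUi("applications/EIVideo/resources/QT/demo.ui")
window.show()
sys.exit(app.exec_())
```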
The window has a fixed size of 800x486 pixels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/QT/demo.ui\":0-43", + "content": "\n\n MainWindow\n \n \n \n 0\n 0\n 800\n 486\n \n \n \n \n 800\n 486\n \n \n \n \n 800\n 486\n \n \n \n MainWindow\n \n \n \n \n \n 20\n 20\n 761\n 361\n \n \n \n QFrame::StyledPanel\n \n \n QFrame::Raised\n \n " + }, + { + "comment": "The code represents a UI layout design for a user interface with a frame, a horizontal layout widget containing a push button labeled \"\u6253\u5f00\u89c6\u9891\", and possibly other UI elements. The frame is styled as raised panel, has a specified geometry, and the push button serves to open a video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/QT/demo.ui\":44-85", + "content": " \n \n 0\n 0\n 761\n 321\n \n \n \n \n \n \n 0\n 320\n 761\n 41\n \n \n \n QFrame::StyledPanel\n \n \n QFrame::Raised\n \n \n \n \n -1\n -1\n 761\n 41\n \n \n \n \n \n \n \u6253\u5f00\u89c6\u9891\n \n \n \n " + }, + { + "comment": "This code defines a user interface layout with various widgets, including QPushButtons and QSlider. The buttons have text labels in Chinese for \"\u4fdd\u5b58\u6807\u6ce8\", \"\u9009\u62e9\u76ee\u6807\", \"\u6e05\u7a7a\u76ee\u6807\", and \"\u5f00\u59cb\u63a8\u7406\". The layout is nested within other widgets to create a complex UI design.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/QT/demo.ui\":86-124", + "content": " \n \n \u4fdd\u5b58\u6807\u6ce8\n \n \n \n \n \n \n Qt::Horizontal\n \n \n \n \n \n \n \u9009\u62e9\u76ee\u6807\n \n \n \n \n \n \n \u6e05\u7a7a\u76ee\u6807\n \n \n \n \n \n \n \u5f00\u59cb\u63a8\u7406\n \n \n \n \n \n \n \n " + }, + { + "comment": "The code defines a user interface layout with a panel, tab widget, and label. The panel has dimensions, frame shape, and shadow properties set. The tab widget contains a single enabled tab named \"\u72b6\u6001\" (Chinese for \"Status\") and has a label inside it with specific geometry and text settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/QT/demo.ui\":125-168", + "content": " \n 0\n 10\n 751\n 301\n \n \n \n QFrame::StyledPanel\n \n \n QFrame::Raised\n \n \n \n \n \n \n 20\n 380\n 761\n 81\n \n \n \n 0\n \n \n \n \u72b6\u6001\n \n \n \n true\n \n \n \n 10\n 30\n 71\n 21\n \n \n " + }, + { + "comment": "This code snippet represents the UI layout for a user interface, using QT framework. It includes labels, progress bars and their respective properties such as position, size, text and value. The labels display current status, act information, and potentially other relevant data. The progress bar shows progress with a specific value and is likely used to represent the completion of certain tasks or actions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/QT/demo.ui\":169-211", + "content": " \u5f53\u524d\u72b6\u6001\uff1a\n \n \n \n \n true\n \n \n \n 80\n 30\n 81\n 21\n \n \n \n -------------\n \n \n \n \n \n 170\n 32\n 521\n 21\n \n \n \n 24\n \n \n \n \n true\n \n \n \n 680\n 30\n 60\n 21\n " + }, + { + "comment": "This code represents the user interface layout for a Qt application. It includes various widgets such as labels, buttons and tabs arranged in a specific order with their respective properties and alignment set. 
The code also specifies the title of each tab.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/QT/demo.ui\":212-235", + "content": " \n \n Qt::LeftToRight\n \n \n 12%\n \n \n Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter\n \n \n \n \n \n \u5c5e\u6027\u914d\u7f6e\n \n \n \n \n \n \n \n \n" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a2822f22-d245-4950-aab3-a46099ed9e18.json b/docs/doc/a2822f22-d245-4950-aab3-a46099ed9e18.json new file mode 100644 index 000000000..db1066e17 --- /dev/null +++ b/docs/doc/a2822f22-d245-4950-aab3-a46099ed9e18.json @@ -0,0 +1,10 @@ +{ + "summary": "This script exports CUDA device settings and FLAGS for efficient GPU usage, then executes a Python file to train an Attention LSTM Ernie model using the specified configuration file. The logs are saved at specific intervals, with pre-trained checkpoints used as well.", + "details": [ + { + "comment": "This script exports CUDA device settings and FLAGS for efficient GPU usage, then executes a Python file to train an Attention LSTM Ernie model using the specified configuration file. The logs are saved at specific intervals, with pre-trained checkpoints used as well.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/train.sh\":0-12", + "content": "export CUDA_VISIBLE_DEVICES=0,1\nexport FLAGS_eager_delete_tensor_gb=0.0\nexport FLAGS_sync_nccl_allreduce=1\nexport FLAGS_fast_eager_deletion_mode=1\nexport FLAGS_fraction_of_gpu_memory_to_use=0.5\nexport FLAGS_reallocate_gpu_memory_in_mb=0\nexport FLAGS_memory_fraction_of_eager_deletion=1\npython scenario_lib/train.py --model_name=AttentionLstmErnie \\\n--config=./conf/conf.txt \\\n--log_interval=20 \\\n--valid_interval=1 \\\n--save_dir=checkpoints_save_new/ \\\n--pretrain=checkpoints_save/" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a2f3589f-d018-481b-8f96-ea16db092304.json b/docs/doc/a2f3589f-d018-481b-8f96-ea16db092304.json new file mode 100644 index 000000000..959caa057 --- /dev/null +++ b/docs/doc/a2f3589f-d018-481b-8f96-ea16db092304.json @@ -0,0 +1,15 @@ +{ + "summary": "This code converts a PaddlePaddle model to ONNX for inference using paddle2onnx and ONNXRuntime. The ONNX format enables similar usage to Paddle, with results matching Paddle predictions.", + "details": [ + { + "comment": "This code demonstrates the process of converting a PaddlePaddle model to an ONNX model for inference using Paddle2ONNX and ONNXRuntime. It first installs the necessary packages, downloads the PP-TSN inference model, and then uses paddle2onnx to convert the model to the ONNX format while specifying the opset version.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/readme.md\":0-47", + "content": "# paddle2onnx \u6a21\u578b\u8f6c\u5316\u4e0e\u9884\u6d4b\n\u672c\u7ae0\u8282\u4ecb\u7ecd PP-TSN \u6a21\u578b\u5982\u4f55\u8f6c\u5316\u4e3a ONNX \u6a21\u578b\uff0c\u5e76\u57fa\u4e8e ONNX \u5f15\u64ce\u9884\u6d4b\u3002\n## 1. 
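Before running the full prediction script, it can help to inspect the exported model with ONNXRuntime; a small sketch (the repository's `deploy/paddle2onnx/predict_onnx.py` handles the real video preprocessing and inference):

```python
import onnxruntime as ort

sess = ort.InferenceSession("./inference/ppTSN/ppTSN.onnx")
for inp in sess.get_inputs():
    print("input :", inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print("output:", out.name, out.shape, out.type)
# Inference is then sess.run(None, {input_name: preprocessed_clip}).
```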
\u73af\u5883\u51c6\u5907\n\u9700\u8981\u51c6\u5907 Paddle2ONNX \u6a21\u578b\u8f6c\u5316\u73af\u5883\uff0c\u548c ONNX \u6a21\u578b\u9884\u6d4b\u73af\u5883\u3002\nPaddle2ONNX \u652f\u6301\u5c06 PaddlePaddle \u6a21\u578b\u683c\u5f0f\u8f6c\u5316\u5230 ONNX \u6a21\u578b\u683c\u5f0f\uff0c\u7b97\u5b50\u76ee\u524d\u7a33\u5b9a\u652f\u6301\u5bfc\u51fa ONNX Opset 9~11\uff0c\u90e8\u5206Paddle\u7b97\u5b50\u652f\u6301\u66f4\u4f4e\u7684ONNX Opset\u8f6c\u6362\u3002\n\u66f4\u591a\u7ec6\u8282\u53ef\u53c2\u8003 [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md)\n- \u5b89\u88c5 Paddle2ONNX\n```bash\npython3.7 -m pip install paddle2onnx\n```\n- \u5b89\u88c5 ONNXRuntime\n```bash\n# \u5efa\u8bae\u5b89\u88c5 1.9.0 \u7248\u672c\uff0c\u53ef\u6839\u636e\u73af\u5883\u66f4\u6362\u7248\u672c\u53f7\npython3.7 -m pip install onnxruntime==1.9.0\n```\n## 2. \u6a21\u578b\u8f6c\u6362\n- PP-TSN inference\u6a21\u578b\u4e0b\u8f7d\n ```bash\n # \u4e0b\u8f7dinference\u6a21\u578b\u5230PaddleVideo/inference/ppTSN/ \u76ee\u5f55\u4e0b\n mkdir -p ./inference\n wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip\n # \u89e3\u538binference\u6a21\u578b\n pushd ./inference\n unzip ppTSN.zip\n popd\n ```\n- \u6a21\u578b\u8f6c\u6362\n \u4f7f\u7528 Paddle2ONNX \u5c06 Paddle inference\u6a21\u578b\u8f6c\u6362\u4e3a ONNX \u683c\u5f0f\u6a21\u578b\uff1a\n ```bash\n paddle2onnx \\\n --model_dir=./inference/ppTSN \\\n --model_filename=ppTSN.pdmodel \\\n --params_filename=ppTSN.pdiparams \\\n --save_file=./inference/ppTSN/ppTSN.onnx \\\n --opset_version=10 \\" + }, + { + "comment": "Enables ONNX checker to generate ONNX format model file for inference. Usage of ONNX model is similar to Paddle, and results match with Paddle inference predictions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/readme.md\":48-69", + "content": " --enable_onnx_checker=True\n ```\n\u6267\u884c\u5b8c\u6bd5\u540e\uff0c\u53ef\u4ee5\u53d1\u73b0 `./inference/ppTSN` \u76ee\u5f55\u4e0b\u751f\u6210\u4e86\u4e00\u4e2a ONNX \u683c\u5f0f\u7684\u6a21\u578b\u6587\u4ef6 `ppTSN.onnx`\n## 3. onnx \u9884\u6d4b\n\u63a5\u4e0b\u6765\u5c31\u53ef\u4ee5\u7528 ONNX \u683c\u5f0f\u6a21\u578b\u8fdb\u884c\u9884\u6d4b\uff0c\u5176\u7528\u6cd5\u4e0epaddle \u9884\u6d4b\u6a21\u578b\u7c7b\u4f3c\n\u6267\u884c\u5982\u4e0b\u547d\u4ee4\uff1a\n```bash\npython3.7 deploy/paddle2onnx/predict_onnx.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsn/pptsn_k400_videos.yaml \\\n--onnx_file=./inference/ppTSN/ppTSN.onnx\n```\n\u7ed3\u679c\u5982\u4e0b\uff1a\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9998553991317749\n```\n\u53ef\u4ee5\u9a8c\u8bc1\u8be5\u7ed3\u679c\u4e0ePaddle inference\u7684\u9884\u6d4b\u7ed3\u679c\u5b8c\u5168\u4e00\u81f4" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a369eff4-1cfe-467f-9b5d-d71b4ab8a4f1.json b/docs/doc/a369eff4-1cfe-467f-9b5d-d71b4ab8a4f1.json new file mode 100644 index 000000000..31ee527eb --- /dev/null +++ b/docs/doc/a369eff4-1cfe-467f-9b5d-d71b4ab8a4f1.json @@ -0,0 +1,60 @@ +{ + "summary": "This code defines a ResNet backbone model, utilizing ConvBNLayer and ReLU activation. It can dynamically add bottleneck blocks for models like ResNet-101 and ResNet-152, includes forward function and supports pretrained models.", + "details": [ + { + "comment": "This code is importing necessary libraries and defining the ConvBNLayer class, which combines a convolutional layer with batch normalization. 
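The Conv + BN pattern can be summarised with a stripped-down stand-in (this sketch omits the named `ParamAttr`s that the real `ConvBNLayer` uses for checkpoint compatibility):

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class TinyConvBN(nn.Layer):
    """Minimal Conv2D + BatchNorm2D + optional ReLU."""
    def __init__(self, in_ch, out_ch, k, stride=1, act=None):
        super().__init__()
        self.conv = nn.Conv2D(in_ch, out_ch, k, stride=stride,
                              padding=(k - 1) // 2, bias_attr=False)
        self.bn = nn.BatchNorm2D(out_ch)
        self.act = act

    def forward(self, x):
        y = self.bn(self.conv(x))
        return F.relu(y) if self.act == "relu" else y

y = TinyConvBN(3, 64, 7, stride=2, act="relu")(paddle.randn([1, 3, 224, 224]))
print(y.shape)  # [1, 64, 112, 112]
```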
This class takes in the number of input channels as an argument. The copyright notice at the beginning indicates this code is licensed under the Apache License 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":0-33", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input." + }, + { + "comment": "The ConvBNLayer class is a custom layer that initializes the Conv2D layer with BatchNorm2D and optional activation function. It takes in_channels, out_channels, kernel_size, stride (default: 1), groups (default: 1), act (optional activation function), and name as parameters. Weight and bias initialization are defined in the init_weights method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":34-57", + "content": " out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2," + }, + { + "comment": "ResNet module with batch normalization and optional activation function. 
BottleneckBlock class for ResNet blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":58-88", + "content": " groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(name=bn_name +\n \"_scale\"),\n bias_attr=ParamAttr(bn_name + \"_offset\"))\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BottleneckBlock, self).__init__()" + }, + { + "comment": "This code defines the ResNet backbone structure. It creates three ConvBNLayer instances for the first block of the network, with different parameters for each layer. The `self.conv0` layer has 1x1 kernel and performs a relu activation. `self.conv1` has a 3x3 kernel and also applies a relu activation after a stride operation. Lastly, `self.conv2` has a 1x1 kernel, no activation function, and increases the number of output channels by 4 times. The shortcut connection is created if `shortcut` is not set to `True`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":89-110", + "content": " self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1," + }, + { + "comment": "The code defines a ResNet block with optional shortcut connection, containing ConvBNLayer and ReLU activation. The BasicBlock class initializes the parameters for the ResNet block including stride, number of channels, convolution layer, and optional shortcut.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":111-142", + "content": " stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\")" + }, + { + "comment": "This code defines a ResNet backbone model. It includes a ConvBNLayer for feature extraction and optionally applies shortcut connections based on the input and output channel count. The forward function performs addition, followed by ReLU activation for each input. 
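Both block types follow the same residual pattern: the output is `relu(shortcut + transformed)`, where the shortcut is either the raw input or a 1x1 strided conv that matches channels. A sketch of just that combination step:

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([1, 256, 56, 56])            # shortcut branch
transformed = paddle.randn([1, 256, 56, 56])  # stands in for conv0 -> conv1 (-> conv2)
out = F.relu(paddle.add(x, transformed))
print(out.shape)  # [1, 256, 56, 56]
```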
The ResNet class is registered with BACKBONES and takes arguments for depth and pretrained model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":143-177", + "content": " self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n filter_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n filter_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNet(nn.Layer):\n \"\"\"ResNet backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None." + }, + { + "comment": "This code defines a ResNet class with different layers and their corresponding depths. It also initializes the ConvBNLayer for the first convolution operation and MaxPool2D layer for pooling. The supported layers are 18, 34, 50, 101, and 152.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":178-207", + "content": " \"\"\"\n def __init__(self, depth, pretrained=None):\n super(ResNet, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = [64, 256, 512, 1024]\n out_channels = [64, 128, 256, 512]\n self.conv = ConvBNLayer(in_channels=3,\n out_channels=64,\n kernel_size=7,\n stride=2,\n act=\"relu\",\n name=\"conv1\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)" + }, + { + "comment": "This code adds bottleneck blocks to a ResNet backbone model, dynamically creating sublayers based on the input parameters. The block type and number of layers are determined by the given depth configuration. It also handles specific cases for ResNet-101 and ResNet-152 models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":209-228", + "content": " self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n conv_name,\n BottleneckBlock(\n # NOTE: Be careful! Here is different from TSM model.\n in_channels=in_channels[block]\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1," + }, + { + "comment": "The code is defining a ResNet model by creating layers and blocks based on the given depth configuration. It alternates between BottleneckBlock and BasicBlock depending on the current block number and depth. 
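The depth-to-stage mapping in the constructor can be summarised as a small lookup table (values taken from the code above):

```python
DEPTH_CFG = {
    18: [2, 2, 2, 2],
    34: [3, 4, 6, 3],
    50: [3, 4, 6, 3],
    101: [3, 4, 23, 3],
    152: [3, 8, 36, 3],
}
# Depths >= 50 use BottleneckBlock (3 convs per block, 4x channel expansion),
# 18/34 use BasicBlock (2 convs per block).
print(sum(DEPTH_CFG[50]) * 3 + 2)  # 50 -- the canonical ResNet-50 layer count
```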
It also initializes weights for the parameters in the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":229-251", + "content": " shortcut=shortcut,\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters." + }, + { + "comment": "If a pretrained loading path is specified, the code will load the model with that path. If no pretrained path is provided or it's set to an empty string, it initializes Conv2D layers with KaimingNormal function and BatchNorm2D layers with Constant function (value=1).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":252-267", + "content": " Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)" + }, + { + "comment": "This code defines the forward function for a ResNet backbone. It reshapes and passes the input through a convolutional layer, max pooling, and a series of blocks. The output is returned after processing all blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet.py\":269-282", + "content": " def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n y = self.conv(inputs)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a44076cf-c7ae-4a77-abec-caa1eafc8fd6.json b/docs/doc/a44076cf-c7ae-4a77-abec-caa1eafc8fd6.json new file mode 100644 index 000000000..385382b49 --- /dev/null +++ b/docs/doc/a44076cf-c7ae-4a77-abec-caa1eafc8fd6.json @@ -0,0 +1,30 @@ +{ + "summary": "This function calculates video results by implementing one-dimensional non-maximal suppression, removes overlapping detections, and processes video properties. 
It takes in various parameters such as label map file, fps, score threshold, iou threshold, and frame offset.", + "details": [ + { + "comment": "This function takes in label_map, data, and topk as input arguments. It calculates the video result based on the given parameters and returns it. The video result is a list of lists where each sub-list contains the feature start ID, feature end ID, label ID, label name, score, and label IOU for each action detected in the video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py\":0-38", + "content": "\"\"\"\n# @File : process_result.py\n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport sys\nimport os\nimport re\nimport numpy as np\nimport pickle\nimport json\nimport logger\nlogger = logger.Logger()\ndef get_data_res(label_map, data, topk):\n \"\"\"get_data_res\"\"\"\n sum_vid = len(data)\n video_result = []\n for i in range(sum_vid):\n vid_name = data[i][0][0]\n # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa\n feature_start_id = float(data[i][0][1]['start'])\n feature_end_id = float(data[i][0][1]['end'])\n feature_stage1_score = data[i][0][1]['score']\n predict_res = []\n for k in range(topk):\n score_top = data[i][1][k]\n labelid_top = data[i][2][k]\n label_iou = data[i][3]\n labelname_top = label_map[str(labelid_top)]\n video_result.append([\n feature_start_id, feature_end_id, labelid_top, labelname_top,\n score_top, label_iou\n ])\n return video_result" + }, + { + "comment": "This code implements one-dimensional non-maximal suppression, which performs non-overlapping detection on bounding boxes. The function takes in a list of bounding boxes and removes any overlapping detections with an Intersection over Union (IoU) threshold greater than the given threshold. The resulting list contains only the non-overlapping detections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py\":41-77", + "content": "def base_nms(bboxes, thresh, delta=0, nms_id=2):\n \"\"\"\n One-dimensional non-maximal suppression\n :param bboxes: [[vid, label, st, ed, score, ...], ...]\n :param thresh:\n :return:\n \"\"\"\n \"\"\"\n t1 = bboxes[:, 0]\n t2 = bboxes[:, 1]\n scores = bboxes[:, nms_id]\n \"\"\"\n t1 = np.array([max(0, x[0] - delta) for x in bboxes])\n t2 = np.array([x[1] + delta for x in bboxes])\n scores = np.array([x[nms_id] for x in bboxes])\n durations = t2 - t1\n order = scores.argsort()[::-1]\n keep = []\n while order.size > 0:\n i = order[0]\n keep.append(i)\n tt1 = np.maximum(t1[i], t1[order[1:]])\n tt2 = np.minimum(t2[i], t2[order[1:]])\n intersection = tt2 - tt1\n IoU = intersection / (durations[i] + durations[order[1:]] -\n intersection).astype(float)\n inds = np.where(IoU <= thresh)[0]\n order = order[inds + 1]\n return [bboxes[i] for i in keep]\ndef process_proposal(source_prop_box,\n min_frame_thread=5," + }, + { + "comment": "The code contains two functions: `process_video_prop` and `process_video_classify`. The first function processes video properties based on start frame, end frame, and score thresholds. It applies non-maximum suppression (NMS) to remove redundant or weak detections. 
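The 1-D NMS can be expressed greedily in a few lines; this sketch is equivalent in spirit to `base_nms` above (sort by score, keep a segment only if its temporal IoU with every kept segment is below the threshold) rather than a line-for-line port:

```python
def nms_1d(segments, thresh):
    """segments: list of [start, end, score]; keep non-overlapping, high-score ones."""
    keep = []
    for s in sorted(segments, key=lambda seg: seg[2], reverse=True):
        ok = True
        for k in keep:
            inter = max(0.0, min(s[1], k[1]) - max(s[0], k[0]))
            union = (s[1] - s[0]) + (k[1] - k[0]) - inter
            if union > 0 and inter / union > thresh:
                ok = False
                break
        if ok:
            keep.append(s)
    return keep

print(nms_1d([[0, 10, 0.9], [1, 11, 0.8], [20, 30, 0.7]], 0.5))
# [[0, 10, 0.9], [20, 30, 0.7]] -- the overlapping lower-score segment is dropped
```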
The second function filters video properties based on background id and performs NMS for specific parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py\":78-109", + "content": " nms_thresh=0.7,\n score_thresh=0.01):\n \"\"\"process_video_prop\"\"\"\n prop_box = []\n for items in source_prop_box:\n start_frame = float(items[0])\n end_frame = float(items[1])\n score = float(items[2])\n if end_frame - start_frame < min_frame_thread or score < score_thresh:\n continue\n prop_box.append([start_frame, end_frame, score])\n prop_box_keep = base_nms(prop_box, nms_thresh)\n prop_res = []\n for res in prop_box_keep:\n prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]})\n return prop_res\ndef process_video_classify(video_prop, fps, score_thread, iou_thread, \\\n nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0):\n \"\"\"process_video_classify\"\"\"\n prop_filter = []\n for item in video_prop:\n if item[2] == backgroundid:\n continue\n prop_filter.append(item)\n # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True)\n prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id)" + }, + { + "comment": "This code sorts prop_filter based on timestamps, then iterates over the sorted list to extract start and end times, label IDs, and scores. It appends these details to video_results if the classify score is greater than a threshold and IOU score is also above the threshold.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py\":110-135", + "content": " prop_filter = sorted(prop_filter, key=lambda x: x[0])\n video_results = []\n for item in prop_filter:\n start_sec = item[0] / fps\n end_sec = item[1] / fps\n start_id_frame = item[0]\n end_id_frame = item[1]\n # start_time = \"%02d:%02d:%02d\" % ((start_id_frame / fps) / 3600, \\\n # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60)\n # end_time = \"%02d:%02d:%02d\" % ((end_id_frame / fps) / 3600, \\\n # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60)\n start_time = int(start_id_frame / fps)\n end_time = int(end_id_frame / fps)\n label_id = item[2]\n label_name = item[3]\n label_classify_score = item[4]\n label_iou_score = item[5]\n if label_classify_score > score_thread and label_iou_score > iou_thread:\n video_results.append({\n \"start_time\": start_time,\n \"end_time\": end_time,\n \"label_id\": label_id,\n \"label_name\": label_name," + }, + { + "comment": "This function, `get_action_result`, takes in `result_info`, `label_map_file`, `fps`, `score_thread`, `iou_thread`, `nms_id`, `nms_thread`, and `frame_offset` as parameters. It uses the `json.load()` method to load a label map from the file specified by `label_map_file`. The function then calls `get_data_res` with the loaded label map, `result_info`, and `topk` to obtain original results (`org_result`). Finally, it processes these original results using `process_video_classify()`, passing in additional parameters such as `fps`, `score_thread`, `iou_thread`, `nms_id`, `nms_thread`, and `frame_offset`. 
The function returns the non-maximum suppression (`nms`) result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py\":136-154", + "content": " \"classify_score\": label_classify_score,\n \"iou_score\": label_iou_score\n })\n return video_results\ndef get_action_result(result_info, label_map_file, fps, score_thread=0, \\\n iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1):\n \"\"\"get_action_result\"\"\"\n label_map = json.load(open(label_map_file, 'r', encoding='utf-8'))\n org_result = get_data_res(label_map, result_info, topk)\n nms_result = process_video_classify(org_result, fps, score_thread,\n iou_thread, nms_id, nms_thread,\n frame_offset)\n return nms_result" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a48c140a-2627-42d9-b89b-c74ece158c02.json b/docs/doc/a48c140a-2627-42d9-b89b-c74ece158c02.json new file mode 100644 index 000000000..c73ad6c91 --- /dev/null +++ b/docs/doc/a48c140a-2627-42d9-b89b-c74ece158c02.json @@ -0,0 +1,40 @@ +{ + "summary": "This code sets up PaddleVideo, imports libraries, and defines preprocessing pipelines for image recognition web services using PaddlePaddle. It includes a `VideoOp` class for video operations and a \"VideoService\" class for preprocessing and post-processing methods.", + "details": [ + { + "comment": "This code is importing necessary libraries and modules, setting up the path for the PaddleVideo project, and defining several image processing pipelines including CenterCrop, Image2Array, Normalization, Sampler, and Scale. The purpose of this code is to provide a base for building an image recognition web service using PaddlePaddle.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py\":0-27", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport base64\nimport os\nimport sys\nfrom typing import Callable, Dict, List\nimport numpy as np\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\nfrom paddle_serving_app.reader import Sequential\nfrom paddlevideo.loader.pipelines import (CenterCrop, Image2Array,\n Normalization, Sampler, Scale," + }, + { + "comment": "This code defines a function called get_preprocess_seq that returns a list of preprocessing operators based on the model name passed as an argument. The model names accepted are \"PPTSM\" and \"PPTSN\". The function checks the model name, and depending on its value, it constructs and returns a sequence of preprocess operators including Sampler, Scale, CenterCrop, Image2Array, and Normalization. 
These operations prepare the input data for a specific model before feeding into the model for prediction or inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py\":28-61", + "content": " TenCrop)\ntry:\n from paddle_serving_server_gpu.web_service import Op, WebService\nexcept ImportError:\n from paddle_serving_server.web_service import Op, WebService\nVALID_MODELS = [\"PPTSM\", \"PPTSN\"]\ndef get_preprocess_seq(model_name: str) -> List[Callable]:\n \"\"\"get preprocess sequence by model name\n Args:\n model_name (str): model name for web serving, such as 'PPTSM', 'PPTSN'\n Returns:\n List[Callable]: preprocess operators in list.\n \"\"\"\n if model_name == 'PPTSM':\n preprocess_seq = [\n Sampler(8, 1, valid_mode=True),\n Scale(256),\n CenterCrop(224),\n Image2Array(),\n Normalization([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n ]\n elif model_name == 'PPTSN':\n preprocess_seq = [\n Sampler(25, 1, valid_mode=True, select_left=True),\n Scale(256, fixed_ratio=True, do_round=True, backend='cv2'),\n TenCrop(224),\n Image2Array(),\n Normalization([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])" + }, + { + "comment": "This code snippet defines a class `VideoOp` that initializes an object with a preprocessing sequence and a dictionary of labels. The `preprocess()` method takes input dictionaries, data ID, and log ID as arguments to perform some operation on video data. The `init_op()` method is responsible for setting up the preprocessing sequence and label dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py\":62-101", + "content": " ]\n else:\n raise ValueError(\n f\"model_name must in {VALID_MODELS}, but got {model_name}\")\n return preprocess_seq\ndef np_softmax(x: np.ndarray, axis=0) -> np.ndarray:\n \"\"\"softmax function\n Args:\n x (np.ndarray): logits.\n Returns:\n np.ndarray: probs.\n \"\"\"\n x -= np.max(x, axis=axis, keepdims=True)\n x = np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)\n return x\nclass VideoOp(Op):\n def init_op(self):\n \"\"\"init_op\n \"\"\"\n self.seq = Sequential(get_preprocess_seq(args.name))\n self.label_dict = {}\n with open(\"../../data/k400/Kinetics-400_label_list.txt\", \"r\") as fin:\n for line in fin:\n label_ind, label_name = line.strip().split(' ')\n label_ind = int(label_ind)\n self.label_dict[label_ind] = label_name.strip()\n def preprocess(self, input_dicts: Dict, data_id: int, log_id: int):\n \"\"\"preprocess\n Args:\n input_dicts (Dict): input_dicts.\n data_id (int): data_id." + }, + { + "comment": "This code function takes input_dicts, decodes and reshapes the 'frames' data into numpy array, splits it based on frame length, then squeezes the dimensions and stores the result in results dictionary. 
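The client/server contract for the frame payload boils down to a base64 round trip plus a reshape; a self-contained sketch (using `np.frombuffer` in place of the deprecated `np.fromstring` that the service code calls):

```python
import base64
import numpy as np

frames = np.random.randint(0, 256, size=(4, 8, 8, 3), dtype=np.uint8)
payload = {
    "frames": base64.b64encode(frames.tobytes()).decode("utf8"),
    "frames_shape": str(frames.shape),
}

decoded = np.frombuffer(base64.b64decode(payload["frames"].encode("utf8")), np.uint8)
decoded = decoded.reshape(eval(payload["frames_shape"]))
print(np.array_equal(decoded, frames))  # True
```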
It also handles unexpected keys by raising ValueError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py\":102-124", + "content": " log_id (int): log_id.\n Returns:\n output_data: data for process stage.\n is_skip_process: skip process stage or not, False default\n prod_errcode: None default, otherwise, product errores occured.\n It is handled in the same way as exception.\n prod_errinfo: \"\" default.\n \"\"\"\n (_, input_dict), = input_dicts.items()\n for key in input_dict.keys():\n if key == \"frames\":\n frame_data = base64.b64decode(input_dict[key].encode('utf8'))\n frame_data = np.fromstring(frame_data, np.uint8)\n elif key == 'frames_shape':\n shape_data = eval(input_dict[key])\n else:\n raise ValueError(f\"unexpected key received: {key}\")\n frame_data = frame_data.reshape(shape_data)\n frame_len = frame_data.shape[0]\n frame_data = np.split(frame_data, frame_len, axis=0)\n frame_data = [frame.squeeze(0) for frame in frame_data]\n results = {" + }, + { + "comment": "This code defines two methods: 'preprocess' and 'postprocess'. The 'preprocess' method takes input data in frames, sets the backend as cv2, expands dimensions for input to the network, and returns tmp_inp with a shape of [1,b,t,c,h,w]. The 'postprocess' method receives input_dicts from preprocess stage, fetch_dict from process stage, data_id, and log_id. It then returns the fetch result as a dictionary type.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py\":125-148", + "content": " 'frames': frame_data,\n 'frames_len': frame_len,\n 'format': 'video',\n 'backend': 'cv2'\n }\n results = self.seq(results)\n tmp_inp = np.expand_dims(results['imgs'], axis=0) # [b,t,c,h,w]\n # The input for the network is input_data[0], so need to add 1 dimension at the beginning\n tmp_inp = np.expand_dims(tmp_inp, axis=0).copy() # [1,b,t,c,h,w]\n return {\"data_batch_0\": tmp_inp}, False, None, \"\"\n def postprocess(self, input_dicts: Dict, fetch_dict: Dict, data_id: int,\n log_id: int):\n \"\"\"postprocess\n Args:\n input_dicts (Dict): data returned in preprocess stage, dict(for single predict) or list(for batch predict).\n fetch_dict (Dict): data returned in process stage, dict(for single predict) or list(for batch predict).\n data_id (int): inner unique id, increase auto.\n log_id (int): logid, 0 default.\n Returns:\n fetch_dict: fetch result must be dict type." + }, + { + "comment": "This code defines a class and a function. The class, \"VideoService\", extends the \"WebService\" class and has a method called \"get_pipeline_response\". The method takes an input operation (read_op) as its argument and returns a VideoOp object with the given read_op as its input. The function \"parse_args\" is used to parse command line arguments. 
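The post-processing stage is essentially a softmax followed by a label lookup; a small sketch with a toy two-entry subset of the Kinetics-400 label map:

```python
import numpy as np

def np_softmax(x, axis=0):
    x = x - np.max(x, axis=axis, keepdims=True)
    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)

label_dict = {0: "abseiling", 5: "archery"}   # toy subset for illustration
logits = np.array([2.0, 0.1, 0.1, 0.1, 0.1, 4.0])
probs = np_softmax(logits)
top1 = int(np.argmax(probs))
print(label_dict.get(top1, str(top1)), round(float(probs[top1]), 3))
```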
It seems that this code is related to video processing and handling inputs/outputs in some kind of pipeline or web service.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py\":149-181", + "content": " prod_errcode: None default, otherwise, product errores occured.\n It is handled in the same way as exception.\n prod_errinfo: \"\" default.\n \"\"\"\n score_list = fetch_dict[\"outputs\"]\n result = {\"label\": [], \"prob\": []}\n for score in score_list:\n score = np_softmax(score)\n score = score.tolist()\n max_score = max(score)\n max_index = score.index(max_score)\n result[\"label\"].append(self.label_dict[max_index])\n result[\"prob\"].append(max_score)\n result[\"label\"] = str(result[\"label\"])\n result[\"prob\"] = str(result[\"prob\"])\n return result, None, \"\"\nclass VideoService(WebService):\n def get_pipeline_response(self, read_op):\n \"\"\"get_pipeline_response\n Args:\n read_op ([type]): [description]\n Returns:\n [type]: [description]\n \"\"\"\n video_op = VideoOp(name=\"video\", input_ops=[read_op])\n return video_op\ndef parse_args():" + }, + { + "comment": "This code parses command-line arguments, initializes a PaddleVideo VideoService object with the provided configuration file and runs the service. The name of the model used in web serving is \"PPTSM\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/recognition_web_service.py\":182-207", + "content": " # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Web Serving model script\")\n parser.add_argument(\n '-n',\n '--name',\n type=str,\n default='PPTSM',\n help='model name used in web serving, such as PPTSM, PPTSN...')\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/PP-TSM.yaml',\n help='serving config file path')\n return parser.parse_args()\nif __name__ == '__main__':\n # get args such as serving config yaml path.\n args = parse_args()\n # start serving\n uci_service = VideoService(name=\"video\")\n uci_service.prepare_pipeline_config(yaml_file=args.config)\n uci_service.run_service()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a58fa289-adcc-4987-9e12-b5292e91058b.json b/docs/doc/a58fa289-adcc-4987-9e12-b5292e91058b.json new file mode 100644 index 000000000..4931888e3 --- /dev/null +++ b/docs/doc/a58fa289-adcc-4987-9e12-b5292e91058b.json @@ -0,0 +1,30 @@ +{ + "summary": "This code is part of PaddleVideo's profiler module, which allows performance analysis and optimization. It initializes a profiler object and starts/stops profiling based on step ID and specified batch range, generating summary reports in ms units.", + "details": [ + { + "comment": "This code is a part of PaddleVideo's profiler module, which allows for performance analysis and optimization. It imports the necessary libraries, initializes global variables, and defines the ProfilerOptions class to configure profiling options using a string in key-value format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/profiler.py\":0-28", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport paddle.profiler as profiler\n# A global variable to record the number of calling times for profiler\n# functions. It is used to specify the tracing range of training steps.\n_profiler_step_id = 0\n# A global variable to avoid parsing from string every time.\n_profiler_options = None\n_prof = None\nclass ProfilerOptions(object):\n '''\n Use a string to initialize a ProfilerOptions.\n The string should be in the format: \"key1=value1;key2=value;key3=value3\"." + }, + { + "comment": "The code defines a class \"ProfilerOptions\" with options for profiling. It takes an options string as input and has attributes for batch range (default [10, 20]), state (default 'All'), sorted key (default 'total'), tracer option (default 'Default'), profile path (empty string), and exit on finished flag (False).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/profiler.py\":29-52", + "content": " For example:\n \"profile_path=model.profile\"\n \"batch_range=[50, 60]; profile_path=model.profile\"\n \"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile\"\n ProfilerOptions supports following key-value pair:\n batch_range - a integer list, e.g. [100, 110].\n state - a string, the optional values are 'CPU', 'GPU' or 'All'. \n sorted_key - a string, the optional values are 'calls', 'total',\n 'max', 'min' or 'ave.\n tracer_option - a string, the optional values are 'Default', 'OpDetail',\n 'AllOpDetail'.\n profile_path - a string, the path to save the serialized profile data,\n which can be used to generate a timeline.\n exit_on_finished - a boolean.\n '''\n def __init__(self, options_str):\n assert isinstance(options_str, str)\n self._options = {\n 'batch_range': [10, 20],\n 'state': 'All',\n 'sorted_key': 'total'," + }, + { + "comment": "The code defines a class with an option parser. 
It parses options from a string, sets batch range if present, handles exit_on_finished flag, and updates other specified options (state, sorted_key, tracer_option, profile_path, timer_only).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/profiler.py\":53-76", + "content": " 'tracer_option': 'Default',\n 'profile_path': '/tmp/profile',\n 'exit_on_finished': True,\n 'timer_only': True\n }\n self._parse_from_string(options_str)\n def _parse_from_string(self, options_str):\n for kv in options_str.replace(' ', '').split(';'):\n key, value = kv.split('=')\n if key == 'batch_range':\n value_list = value.replace('[', '').replace(']', '').split(',')\n value_list = list(map(int, value_list))\n if len(value_list) >= 2 and value_list[0] >= 0 and value_list[\n 1] > value_list[0]:\n self._options[key] = value_list\n elif key == 'exit_on_finished':\n self._options[key] = value.lower() in (\"yes\", \"true\", \"t\", \"1\")\n elif key in [\n 'state', 'sorted_key', 'tracer_option', 'profile_path'\n ]:\n self._options[key] = value\n elif key == 'timer_only':\n self._options[key] = value" + }, + { + "comment": "This code provides a function to enable the operator-level timing using PaddlePaddle's profiler. The profiler step is initialized with options provided as a string. If no options are given, the profiler remains disabled. This can be used for performance analysis of models by measuring their throughput and time overhead.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/profiler.py\":78-104", + "content": " def __getitem__(self, name):\n if self._options.get(name, None) is None:\n raise ValueError(\n \"ProfilerOptions does not have an option named %s.\" % name)\n return self._options[name]\ndef add_profiler_step(options_str=None):\n '''\n Enable the operator-level timing using PaddlePaddle's profiler.\n The profiler uses a independent variable to count the profiler steps.\n One call of this function is treated as a profiler step.\n Args:\n profiler_options - a string to initialize the ProfilerOptions.\n Default is None, and the profiler is disabled.\n '''\n if options_str is None:\n return\n global _prof \n global _profiler_step_id\n global _profiler_options\n if _profiler_options is None:\n _profiler_options = ProfilerOptions(options_str)\n # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan\n # timer_only = True only the model's throughput and time overhead are displayed" + }, + { + "comment": "This code initializes a profiler object with specified scheduler range and timer_only option, then starts the profiling process. 
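A hedged sketch of how `add_profiler_step` is meant to be driven from a training loop (the import path is assumed; in the real trainer the call sits next to the forward/backward step):

```python
from paddlevideo.utils import profiler  # assumed import path for this module

options = "batch_range=[2, 4]; exit_on_finished=false"
for step in range(6):                    # stand-in for iterating a real dataloader
    # ... forward / backward / optimizer step would go here ...
    profiler.add_profiler_step(options)  # one call == one profiler step
```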
If the step ID matches the specified batch range, it stops the profiling, generates a summary report in ms units, clears the profiler, and exits the program if instructed to do so.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/profiler.py\":105-127", + "content": " # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives.\n # timer_only = False the output Timeline information can be found in the profiler_log directory\n if _prof is None:\n _timer_only = str(_profiler_options['timer_only']) == str(True)\n _prof = profiler.Profiler(\n scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]),\n on_trace_ready = profiler.export_chrome_tracing('./profiler_log'),\n timer_only = _timer_only)\n _prof.start()\n else:\n _prof.step()\n if _profiler_step_id == _profiler_options['batch_range'][1]:\n _prof.stop()\n _prof.summary(\n op_detail=True,\n thread_sep=False,\n time_unit='ms')\n _prof = None\n if _profiler_options['exit_on_finished']:\n sys.exit(0)\n _profiler_step_id += 1" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a5dedd80-9664-46bf-9695-2c6fd91d4193.json b/docs/doc/a5dedd80-9664-46bf-9695-2c6fd91d4193.json new file mode 100644 index 000000000..fa6fa3985 --- /dev/null +++ b/docs/doc/a5dedd80-9664-46bf-9695-2c6fd91d4193.json @@ -0,0 +1,35 @@ +{ + "summary": "This code sets naming conventions for object detection, defines fields and variables for efficient communication between decoder and model, and improves dataset evaluation. It also establishes conventions for video object detector output storage and standard metrics for field evaluation.", + "details": [ + { + "comment": "This code is defining classes for standard naming conventions in object detection. It provides InputDataFields for input tensors and DetectionResultFields for results returned by the object detector.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py\":0-25", + "content": "# Copyright 2017 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"Contains classes specifying naming conventions used for object detection.\nSpecifies:\n InputDataFields: standard fields used by reader/preprocessor/batcher.\n DetectionResultFields: standard fields returned by object detector.\n\"\"\"\nclass InputDataFields:\n \"\"\"Names for the input tensors.\n Holds the standard data field names to use for identifying input tensors." + }, + { + "comment": "This code defines standard fields used by the decoder and model for identifying keys in returned tensor_dict. Fields include image, original_image, source_id, filename, groundtruth_image_classes, groundtruth_boxes, groundtruth_classes, groundtruth_label_types, groundtruth_is_crowd, groundtruth_area, and groundtruth_difficult. 
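To make the naming-convention idea concrete, here is a small hypothetical example in which a data pipeline and an evaluator share field-name constants instead of scattering raw strings through the code; the class and keys below are illustrative only.

```python
import numpy as np

class Fields:
    """Toy subset of the standard field names described above."""
    image = 'image'
    groundtruth_boxes = 'groundtruth_boxes'
    groundtruth_classes = 'groundtruth_classes'

# The reader builds the dict with the constants...
tensor_dict = {
    Fields.image: np.zeros((224, 224, 3), dtype=np.float32),
    Fields.groundtruth_boxes: np.array([[0.1, 0.2, 0.5, 0.6]], dtype=np.float32),
    Fields.groundtruth_classes: np.array([3], dtype=np.int64),
}

# ...and the evaluator looks values up with the same constants.
boxes = tensor_dict[Fields.groundtruth_boxes]
print(boxes.shape)  # (1, 4)
```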
It is used by the decoder to identify keys for returned tensor_dict and by model to identify necessary tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py\":26-45", + "content": " This should be used by the decoder to identify keys for the returned\n tensor_dict containing input tensors. And it should be used by the model to\n identify the tensors it needs.\n Attributes:\n image: image.\n original_image: image in the original input size.\n key: unique key corresponding to image.\n source_id: source of the original image.\n filename: original filename of the dataset (without common path).\n groundtruth_image_classes: image-level class labels.\n groundtruth_boxes: coordinates of the ground truth boxes in the image.\n groundtruth_classes: box-level class labels.\n groundtruth_label_types: box-level label types (e.g. explicit\n negative).\n groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead]\n is the groundtruth a single object or a crowd.\n groundtruth_area: area of a groundtruth segment.\n groundtruth_difficult: is a `difficult` object\n groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of" + }, + { + "comment": "This code defines a dictionary of variables used for AVA evaluation, including image and original image keys, source IDs, and other metrics such as proposal boxes, ground truth instance masks, and more. These variables are necessary for accurately evaluating the performance of video object detection models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py\":46-65", + "content": " the same class, forming a connected group, where instances are\n heavily occluding each other.\n proposal_boxes: coordinates of object proposal boxes.\n proposal_objectness: objectness score of each proposal.\n groundtruth_instance_masks: ground truth instance masks.\n groundtruth_instance_boundaries: ground truth instance boundaries.\n groundtruth_instance_classes: instance mask-level class labels.\n groundtruth_keypoints: ground truth keypoints.\n groundtruth_keypoint_visibilities: ground truth keypoint visibilities.\n groundtruth_label_scores: groundtruth label scores.\n groundtruth_weights: groundtruth weight factor for bounding boxes.\n num_groundtruth_boxes: number of groundtruth boxes.\n true_image_shapes: true shapes of images in the resized images, as\n resized images can be padded with zeros.\n \"\"\"\n image = 'image'\n original_image = 'original_image'\n key = 'key'\n source_id = 'source_id'" + }, + { + "comment": "This code defines various field names for the AWA dataset, which includes attributes like ground truth image classes, bounding boxes, class labels, label types, object crowding status, and more. The fields cover aspects such as instance masks, boundaries, keypoints, visibilities, label scores, and weights. 
These field definitions are likely used to organize and manage data in the dataset for further processing or evaluation tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py\":66-85", + "content": " filename = 'filename'\n groundtruth_image_classes = 'groundtruth_image_classes'\n groundtruth_boxes = 'groundtruth_boxes'\n groundtruth_classes = 'groundtruth_classes'\n groundtruth_label_types = 'groundtruth_label_types'\n groundtruth_is_crowd = 'groundtruth_is_crowd'\n groundtruth_area = 'groundtruth_area'\n groundtruth_difficult = 'groundtruth_difficult'\n groundtruth_group_of = 'groundtruth_group_of'\n proposal_boxes = 'proposal_boxes'\n proposal_objectness = 'proposal_objectness'\n groundtruth_instance_masks = 'groundtruth_instance_masks'\n groundtruth_instance_boundaries = 'groundtruth_instance_boundaries'\n groundtruth_instance_classes = 'groundtruth_instance_classes'\n groundtruth_keypoints = 'groundtruth_keypoints'\n groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'\n groundtruth_label_scores = 'groundtruth_label_scores'\n groundtruth_weights = 'groundtruth_weights'\n num_groundtruth_boxes = 'num_groundtruth_boxes'\n true_image_shape = 'true_image_shape'" + }, + { + "comment": "This class defines the standard naming conventions for storing the output of a video object detector. It includes attributes like source_id, key, detection boxes coordinates, scores, classes, masks, boundaries, keypoints, and number of detections in a batch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py\":88-112", + "content": "class DetectionResultFields:\n \"\"\"Naming conventions for storing the output of the detector.\n Attributes:\n source_id: source of the original image.\n key: unique key corresponding to image.\n detection_boxes: coordinates of the detection boxes in the image.\n detection_scores: detection scores for the detection boxes in the\n image.\n detection_classes: detection-level class labels.\n detection_masks: contains a segmentation mask for each detection box.\n detection_boundaries: contains an object boundary for each detection\n box.\n detection_keypoints: contains detection keypoints for each detection\n box.\n num_detections: number of detections in the batch.\n \"\"\"\n source_id = 'source_id'\n key = 'key'\n detection_boxes = 'detection_boxes'\n detection_scores = 'detection_scores'\n detection_classes = 'detection_classes'\n detection_masks = 'detection_masks'\n detection_boundaries = 'detection_boundaries'" + }, + { + "comment": "These two variables, detection_keypoints and num_detections, represent metrics for storing the keypoints of detected objects and the number of detections respectively in the standard fields evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py\":113-114", + "content": " detection_keypoints = 'detection_keypoints'\n num_detections = 'num_detections'" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a664fb08-433c-49c1-9290-2307b6dbef2e.json b/docs/doc/a664fb08-433c-49c1-9290-2307b6dbef2e.json new file mode 100644 index 000000000..aced2e0b3 --- /dev/null +++ b/docs/doc/a664fb08-433c-49c1-9290-2307b6dbef2e.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a base class for localization models using PaddlePaddle framework, with train, valid, and test steps implemented in subclasses. 
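The train/valid/test step contract described here is a common pattern; a framework-agnostic sketch of it (hypothetical class name, not the repo's BaseLocalizer) could look like this:

```python
from abc import ABC, abstractmethod

class BaseStepModel(ABC):
    """Toy version of the pattern: forward() dispatches on mode,
    subclasses fill in the actual step logic."""

    def forward(self, data_batch, mode='infer'):
        if mode == 'train':
            return self.train_step(data_batch)
        if mode == 'valid':
            return self.val_step(data_batch)
        if mode == 'test':
            return self.test_step(data_batch)
        if mode == 'infer':
            return self.infer_step(data_batch)
        raise NotImplementedError(f"unknown mode: {mode}")

    @abstractmethod
    def train_step(self, data_batch):
        """data_batch -> loss metrics"""

    @abstractmethod
    def val_step(self, data_batch):
        """data_batch -> loss metrics"""

    @abstractmethod
    def test_step(self, data_batch):
        """data_batch -> predictions"""

    def infer_step(self, data_batch):
        return self.test_step(data_batch)
```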
It supports different operation modes and allows weight initialization.", + "details": [ + { + "comment": "This code snippet defines a base class for localization models. All subclasses of this base class should implement train_step, valid_step, and test_step methods to define their respective steps in the model's training process. It uses PaddlePaddle's framework and is licensed under the Apache License, Version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/base.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom abc import abstractmethod\nimport paddle.nn as nn\nfrom ... import builder\nclass BaseLocalizer(nn.Layer):\n \"\"\"Base class for Localization.\n All localizer should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, define your train step.\n - Methods:``valid_step``, define your valid step, always the same as train_step.\n - Methods:``test_step``, define your test step.\n \"\"\"" + }, + { + "comment": "This code initializes a localizer model, handling backbone and loss functions, and allows for different operation modes (train, valid, test, infer). It also includes a function to initialize the model's network weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/base.py\":27-55", + "content": " def __init__(self, backbone, loss):\n super().__init__()\n self.backbone = builder.build_backbone(backbone)\n self.loss = builder.build_loss(loss)\n self.init_weights()\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n if getattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n pass\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError" + }, + { + "comment": "This code defines abstract classes for training, validation, and testing steps in a model. The train_step, val_step, and test_step methods require implementation by subclasses to perform the necessary computations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/base.py\":57-73", + "content": " @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step. input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating setp. 
input_data_batch -> loss_metric\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Tets setp. to get acc in test data. input_data_batch -> output\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a666a2c0-92cd-438a-80c1-99bff79e4caf.json b/docs/doc/a666a2c0-92cd-438a-80c1-99bff79e4caf.json new file mode 100644 index 000000000..205f527a1 --- /dev/null +++ b/docs/doc/a666a2c0-92cd-438a-80c1-99bff79e4caf.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines the Normalize class for image normalization, along with several preprocessing operation classes like Permute, Scale, CenterCrop, and TenCrop, which can be used in PaddleVideo library for preparing images before inference.", + "details": [ + { + "comment": "This code defines a class called Normalize with a Run method that takes in an input image and a vector of means for normalization. It is part of the PaddleVideo library, which likely uses OpenCV for image processing tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/preprocess_op.h\":0-38", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \nusing namespace std;\nusing namespace paddle;\nnamespace PaddleVideo\n{\n class Normalize\n {\n public:\n virtual void Run(cv::Mat *im, const std::vector &mean," + }, + { + "comment": "The code defines several classes representing image preprocessing operations, including Permute for changing RGB to CHW format, Scale for resizing images, CenterCrop for cropping images to a specific size, and TenCrop for splitting an image into multiple crops. 
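A numpy rendering of the CenterCrop, Normalize, and Permute steps described for these C++ ops; the short-side Scale resize is left out to keep the sketch dependency-free (OpenCV or PIL would normally handle it), and all sizes and statistics are illustrative.

```python
import numpy as np

def center_crop(img, target_size=224):
    """Crop the spatial center of an HWC image, as CenterCrop does."""
    h, w = img.shape[:2]
    top = max((h - target_size) // 2, 0)
    left = max((w - target_size) // 2, 0)
    return img[top:top + target_size, left:left + target_size]

def normalize(img, mean, std, is_scale=True):
    """Optionally scale to [0, 1], then subtract mean and divide by std."""
    img = img.astype(np.float32)
    if is_scale:
        img /= 255.0
    return (img - np.asarray(mean, np.float32)) / np.asarray(std, np.float32)

def to_chw(img):
    """HWC -> CHW, the job of the Permute op."""
    return np.ascontiguousarray(img.transpose(2, 0, 1))

frame = np.random.randint(0, 256, (256, 340, 3), dtype=np.uint8)
out = to_chw(normalize(center_crop(frame),
                       mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225]))
print(out.shape)  # (3, 224, 224)
```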
These classes can be used in the PaddleVideo library for preparing images before running inference with deep learning models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/preprocess_op.h\":39-73", + "content": " const std::vector &scale, const bool is_scale = true);\n };\n // RGB -> CHW\n class Permute\n {\n public:\n virtual void Run(const cv::Mat *img, float *data);\n };\n class Scale\n {\n public:\n virtual void Run(const cv::Mat &img, cv::Mat &resize_img,\n bool use_tensorrt = false,\n const int &short_size = 256);\n };\n class CenterCrop\n {\n public:\n virtual void Run(const cv::Mat &img, cv::Mat &crop_img,\n bool use_tensorrt = false,\n const int &target_size = 224);\n };\n class TenCrop\n {\n public:\n virtual void Run(const cv::Mat &img, std::vector &crop_frames,\n const int &begin_index,\n bool use_tensorrt = false,\n const int &target_size = 224);\n };\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a6cf2a24-d0f7-45b7-a739-51ff35bf4be7.json b/docs/doc/a6cf2a24-d0f7-45b7-a739-51ff35bf4be7.json new file mode 100644 index 000000000..ca424f5a6 --- /dev/null +++ b/docs/doc/a6cf2a24-d0f7-45b7-a739-51ff35bf4be7.json @@ -0,0 +1,40 @@ +{ + "summary": "This code initializes a TSN model class and sets parameters for segmentation, training, image statistics, layers, epochs, video data, and optimizer. It defines a VideoTag model with train, test, and infer modes, updating parameters and excluding the final layer for pre-trained weights.", + "details": [ + { + "comment": "This code imports necessary modules and defines a class TSN that extends the ModelBase class. The class takes parameters such as name, configuration, mode and is_videotag. It also has a method get_config that fetches the model configuration from the given section.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom ..model import ModelBase\nfrom .tsn_res_model import TSN_ResNet\nimport logging\nimport paddle\nimport paddle.static as static\nlogger = logging.getLogger(__name__)\n__all__ = [\"TSN\"]\nclass TSN(ModelBase):\n def __init__(self, name, cfg, mode='train', is_videotag=False):\n super(TSN, self).__init__(name, cfg, mode=mode)\n self.is_videotag = is_videotag\n self.get_config()\n def get_config(self):\n self.num_classes = self.get_config_from_sec('model', 'num_classes')" + }, + { + "comment": "This code initializes various parameters for the TSN model. 
It sets segment number, segment length, image mean and standard deviation, number of layers, training epochs, total videos, learning rate, learning rate decay, L2 weight decay, and momentum using get_config_from_sec method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py\":34-51", + "content": " self.seg_num = self.get_config_from_sec('model', 'seg_num')\n self.seglen = self.get_config_from_sec('model', 'seglen')\n self.image_mean = self.get_config_from_sec('model', 'image_mean')\n self.image_std = self.get_config_from_sec('model', 'image_std')\n self.num_layers = self.get_config_from_sec('model', 'num_layers')\n self.num_epochs = self.get_config_from_sec('train', 'epoch')\n self.total_videos = self.get_config_from_sec('train', 'total_videos')\n self.base_learning_rate = self.get_config_from_sec(\n 'train', 'learning_rate')\n self.learning_rate_decay = self.get_config_from_sec(\n 'train', 'learning_rate_decay')\n self.l2_weight_decay = self.get_config_from_sec('train',\n 'l2_weight_decay')\n self.momentum = self.get_config_from_sec('train', 'momentum')\n self.seg_num = self.get_config_from_sec(self.mode, 'seg_num',\n self.seg_num)" + }, + { + "comment": "The code initializes the target size and batch size, then defines a `build_input` function to create data tensors for the model's inputs. It generates image and label tensors with specified shapes and data types, and optionally creates a DataLoader for handling data if not in infer mode. The feature and label inputs are stored as separate lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py\":52-74", + "content": " self.target_size = self.get_config_from_sec(self.mode, 'target_size')\n self.batch_size = self.get_config_from_sec(self.mode, 'batch_size')\n def build_input(self, use_dataloader=True):\n image_shape = [3, self.target_size, self.target_size]\n image_shape[0] = image_shape[0] * self.seglen\n image_shape = [None, self.seg_num] + image_shape\n self.use_dataloader = use_dataloader\n image = static.data(name='image', shape=image_shape, dtype='float32')\n if self.mode != 'infer':\n label = static.data(name='label', shape=[None, 1], dtype='int64')\n else:\n label = None\n if use_dataloader:\n assert self.mode != 'infer', \\\n 'dataloader is not recommendated when infer, please set use_dataloader to be false.'\n self.dataloader = paddle.io.DataLoader.from_generator(\n feed_list=[image, label], capacity=4, iterable=True)\n self.feature_input = [image]\n self.label_input = label" + }, + { + "comment": "The code defines a model with configurable parameters and builds the model instance. 
It also includes an optimizer function that adjusts learning rate based on epoch points and total videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py\":76-100", + "content": " def create_model_args(self):\n cfg = {}\n cfg['layers'] = self.num_layers\n cfg['class_dim'] = self.num_classes\n cfg['seg_num'] = self.seg_num\n return cfg\n def build_model(self):\n cfg = self.create_model_args()\n videomodel = TSN_ResNet(layers=cfg['layers'],\n seg_num=cfg['seg_num'],\n is_training=(self.mode == 'train'),\n is_extractor=self.is_videotag)\n out = videomodel.net(input=self.feature_input[0],\n class_dim=cfg['class_dim'])\n self.network_outputs = [out]\n def optimizer(self):\n assert self.mode == 'train', \"optimizer only can be get in train mode\"\n epoch_points = [self.num_epochs / 3, self.num_epochs * 2 / 3]\n total_videos = self.total_videos\n step = int(total_videos / self.batch_size + 1)\n bd = [e * step for e in epoch_points]\n base_lr = self.base_learning_rate\n lr_decay = self.learning_rate_decay" + }, + { + "comment": "This code defines a model for the VideoTag application. It creates an optimizer with a piecewise learning rate decay and L2 weight decay, calculates the loss using cross entropy, updates the loss value, returns the network outputs, and handles feeds and fetches based on the mode (train, valid or infer).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py\":101-128", + "content": " lr = [base_lr, base_lr * lr_decay, base_lr * lr_decay * lr_decay]\n l2_weight_decay = self.l2_weight_decay\n momentum = self.momentum\n optimizer = paddle.optimizer.Momentum(\n learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,\n values=lr),\n momentum=momentum,\n weight_decay=paddle.regularizer.L2Decay(coeff=l2_weight_decay))\n return optimizer\n def loss(self):\n assert self.mode != 'infer', \"invalid loss calculationg in infer mode\"\n cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \\\n label=self.label_input, ignore_index=-1)\n self.loss_ = paddle.mean(x=cost)\n return self.loss_\n def outputs(self):\n return self.network_outputs\n def feeds(self):\n return self.feature_input if self.mode == 'infer' else self.feature_input + [\n self.label_input\n ]\n def fetches(self):\n if self.mode == 'train' or self.mode == 'valid':" + }, + { + "comment": "This code defines a model with three modes: train, test, and infer. It returns the losses, network outputs, and label inputs in train and test modes, while only returning network outputs in infer mode. 
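The schedule described above (learning-rate drops at one third and two thirds of training, SGD with momentum and L2 weight decay) can be sketched with standard Paddle APIs; every concrete number below is made up, and the Linear layer merely stands in for the real video model.

```python
import paddle

total_videos, batch_size, num_epochs = 9000, 32, 30
steps_per_epoch = total_videos // batch_size + 1
boundaries = [num_epochs // 3 * steps_per_epoch, num_epochs * 2 // 3 * steps_per_epoch]
base_lr = 0.01
values = [base_lr, base_lr * 0.1, base_lr * 0.01]  # one more value than boundaries

model = paddle.nn.Linear(2048, 400)  # stand-in for the real network
optimizer = paddle.optimizer.Momentum(
    learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries=boundaries, values=values),
    momentum=0.9,
    weight_decay=paddle.regularizer.L2Decay(coeff=1e-4),
    parameters=model.parameters())
```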
The function pretrain_info() returns no information, weights_info() also returns no info, and load_pretrain_params() loads pre-trained weights from a specific file while excluding the final fully connected (fc) layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py\":129-158", + "content": " losses = self.loss()\n fetch_list = [losses, self.network_outputs[0], self.label_input]\n elif self.mode == 'test':\n losses = self.loss()\n fetch_list = [losses, self.network_outputs[0], self.label_input]\n elif self.mode == 'infer':\n fetch_list = self.network_outputs\n else:\n raise NotImplementedError('mode {} not implemented'.format(\n self.mode))\n return fetch_list\n def pretrain_info(self):\n return None, None\n def weights_info(self):\n return None\n def load_pretrain_params(self, exe, pretrain, prog):\n def is_parameter(var):\n return isinstance(var, paddle.framework.Parameter)\n logger.info(\n \"Load pretrain weights from {}, exclude fc layer.\".format(pretrain))\n print(\"===pretrain===\", pretrain)\n state_dict = paddle.static.load_program_state(pretrain)\n dict_keys = list(state_dict.keys())\n # remove fc layer when pretrain, because the number of classes in final fc may not match" + }, + { + "comment": "The code is deleting specific keys from the pretrained parameters and then setting the program state with the updated dictionary. This could be done to avoid loading unnecessary or conflicting parameters during the model's execution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/tsn.py\":159-164", + "content": " for name in dict_keys:\n if \"fc_0\" in name:\n del state_dict[name]\n print('Delete {} from pretrained parameters. Do not load it'.\n format(name))\n paddle.static.set_program_state(prog, state_dict)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a77fb5b6-3247-4e1f-8b9e-2d094f60b5a8.json b/docs/doc/a77fb5b6-3247-4e1f-8b9e-2d094f60b5a8.json new file mode 100644 index 000000000..13d9d48cd --- /dev/null +++ b/docs/doc/a77fb5b6-3247-4e1f-8b9e-2d094f60b5a8.json @@ -0,0 +1,20 @@ +{ + "summary": "The PaddleVideo's TimeSformerHead class is a model head in the TimeSformer architecture that initializes parameters and defines forward methods for computing output. It uses Linear layers from PaddlePaddle and allows customizing parameters with keyword arguments. The function applies an fc layer to input tensor x and returns classification scores without softmax, with unclear dropout location.", + "details": [ + { + "comment": "This code is from PaddleVideo's TimeSformerHead class, which is a head in the modeling module. It is a subclass of BaseHead and has attributes such as num_classes, in_channels, and loss_cfg. The class is registered using HEADS registry, and it uses functions from paddle.nn, Linear, and BaseHead modules. Weight initialization is performed using trunc_normal_ and weight_init functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/timesformer_head.py\":0-28", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom paddle.nn import Linear\nfrom ..registry import HEADS\nfrom ..weight_init import trunc_normal_, weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass TimeSformerHead(BaseHead):\n \"\"\"TimeSformerHead Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss')." + }, + { + "comment": "The code snippet defines a class for the TimeSformer head, initializes its parameters and provides a forward method to compute the output of the head. It uses PaddlePaddle's Linear layer and allows setting a specific std value in normal initialization, as well as customizing other parameters with additional keyword arguments. The forward function defines how the head operates on input data x.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/timesformer_head.py\":29-59", + "content": " std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n std=0.02,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.std = std\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'TruncatedNormal',\n 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.0,\n std=self.std)\n # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal\n trunc_normal_(self.fc.weight, std=self.std)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data." + }, + { + "comment": "This function applies a fully connected layer (fc) to the input tensor x and returns the classification scores for input samples without applying softmax. The dropout location needs further clarification as indicated by XXX.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/timesformer_head.py\":60-69", + "content": " Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n # XXX: check dropout location!\n # x.shape = [N, embed_dim]\n score = self.fc(x)\n # [N, num_class]\n # x = F.softmax(x) # NOTE remove\n return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a7c6c8d8-6afa-4f54-a92a-f368d815c37e.json b/docs/doc/a7c6c8d8-6afa-4f54-a92a-f368d815c37e.json new file mode 100644 index 000000000..fb82f1dd6 --- /dev/null +++ b/docs/doc/a7c6c8d8-6afa-4f54-a92a-f368d815c37e.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is part of the PaddleVideo library and defines a segmenter module. It includes three classes: BaseSegmenter, MSTCN, and ASRF. 
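Returning to the TimeSformerHead entry above: a stripped-down head in that spirit, a single Linear layer initialized with a truncated normal that returns raw logits, might be sketched as follows (hypothetical class name and sizes, not the repo's implementation).

```python
import paddle
import paddle.nn as nn

class SimpleClsHead(nn.Layer):
    """Minimal classification head: one fc layer returning logits (no softmax)."""

    def __init__(self, num_classes, in_channels, std=0.02):
        super().__init__()
        weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.TruncatedNormal(mean=0.0, std=std))
        self.fc = nn.Linear(in_channels, num_classes, weight_attr=weight_attr)

    def forward(self, x):
        # x: [N, in_channels] -> scores: [N, num_classes]
        return self.fc(x)

head = SimpleClsHead(num_classes=400, in_channels=768)
print(head(paddle.randn([4, 768])).shape)  # [4, 400]
```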
These classes are used for video frame-level feature extraction, semantic segmentation, and audio source separation respectively. The __all__ variable lists all exported names in this package.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library and defines a segmenter module. It includes three classes: BaseSegmenter, MSTCN, and ASRF. These classes are used for video frame-level feature extraction, semantic segmentation, and audio source separation respectively. The __all__ variable lists all exported names in this package.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/__init__.py\":0-16", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseSegmenter\nfrom .ms_tcn import MSTCN\nfrom .asrf import ASRF\n__all__ = ['BaseSegmenter', 'MSTCN', 'ASRF']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a851f9ec-67b0-40ab-8a4c-646dbc068aa9.json b/docs/doc/a851f9ec-67b0-40ab-8a4c-646dbc068aa9.json new file mode 100644 index 000000000..66a02cdd3 --- /dev/null +++ b/docs/doc/a851f9ec-67b0-40ab-8a4c-646dbc068aa9.json @@ -0,0 +1,35 @@ +{ + "summary": "This code calculates IoU and IOA, checks hits, stores relevant info in a dictionary, prints URLs for each video using gts data, and splits video features into training and validation datasets for handling football actions. It also separates video features and labels into training and validation sets, storing the data in .pkl files for later use.", + "details": [ + { + "comment": "This code computes the IoU (intersection over union) and IOA (intersection over area) for proposals and ground truths in a dataset. It takes proposal bounding boxes and ground truth bounding boxes as inputs, calculates their intersections and unions, and outputs the resulting IoUs and IOAs. 
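For 1-D temporal segments, the IoU/IoA computation described above reduces to a few lines; the standalone sketch below keeps the same start/end dictionary convention, with IoA normalizing the overlap by the ground-truth length only.

```python
def temporal_iou_ioa(proposal, gt):
    """IoU and IoA between two segments given as {'start': s, 'end': e}."""
    inter = max(0.0, min(proposal['end'], gt['end']) - max(proposal['start'], gt['start']))
    union = (proposal['end'] - proposal['start']) + (gt['end'] - gt['start']) - inter
    iou = inter / union if union > 0 else 0.0
    gt_len = gt['end'] - gt['start']
    ioa = inter / gt_len if gt_len > 0 else 0.0
    return iou, ioa

print(temporal_iou_ioa({'start': 10, 'end': 20}, {'start': 15, 'end': 25}))
# (0.333..., 0.5): 5s of overlap over a 15s union, and over a 10s ground truth
```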
The calculated IoU and IOA values will be used to determine the labels for the LSTM model's input data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py\":0-43", + "content": "\"\"\"\nget instance for lstm\n\u6839\u636egts\u8ba1\u7b97\u6bcf\u4e2aproposal_bmn\u7684iou\u3001ioa\u3001label\u7b49\u4fe1\u606f\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\ndataset = \"../EuroCup2016\"\nfeat_dir = dataset + '/features'\nprop_file = dataset + '/feature_bmn/prop.json'\nout_dir = dataset + '/input_for_lstm'\nlabel_files = {\n 'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'\n}\ndef IoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 -\n inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n ioa = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou, ioa\ndef clc_iou_of_proposal(proposal, gts):\n hit_gts = {}\n label = 0\n norm_start = 0.\n hit = False\n for gt in gts:\n e1 = {'start': proposal['start'], 'end': proposal['end']}\n e2 = {'start': gt['start_id'], 'end': gt['end_id']}" + }, + { + "comment": "This code calculates IoU and IOA between two sets of data, then checks if there is a hit. It stores the label, normalized start, and other relevant information in a dictionary and returns it. The get_bmn_info function takes gts and proposal data and prints the URL for each video, iterating through the gts data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py\":44-79", + "content": " iou, ioa = IoU(e1, e2)\n if iou > 0:\n hit = True\n hit_gts = gt\n label = hit_gts['label_ids'][0]\n norm_start = (gt['start_id'] - proposal['start']) * 1.0 / (\n proposal['end'] - proposal['start'])\n break\n res = {\n 'label': label,\n 'norm_iou': iou,\n 'norm_ioa': ioa,\n 'norm_start': norm_start,\n 'proposal': proposal,\n 'hit_gts': hit_gts\n }\n return res\ndef get_bmn_info(gts_data, proposal_data, res_bmn, mode, score_threshold=0.01):\n \"\"\"\n @param, gts_data, original gts for action detection\n @param, proposal_data, proposal actions from bmn\n @param, mode, train or validation\n @return, None.\n \"\"\"\n fps = gts_data['fps']\n res_bmn['fps'] = fps\n for gts_item in gts_data['gts']:\n url = gts_item['url']\n print(url)\n max_length = gts_item['total_frames']\n video_name = os.path.basename(url).split('.')[0]\n if not video_name in proposal_data:\n continue" + }, + { + "comment": "The code retrieves ground truth (GT) actions and proposal actions from a dataset, then evaluates the Intersection over Union (IoU) of each proposal with the GT actions. If a proposal's score is below a threshold, it is skipped. The IoU values are appended to the 'results' list within a dictionary, along with other information such as URL, mode, total frames, number of GT and proposal actions. Finally, the function returns the dictionary. 
A separate function saves features in an output directory, creating one if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py\":81-109", + "content": " gts_actions = gts_item['actions']\n prop_actions = proposal_data[video_name]\n res_bmn['results'].append({\n 'url': url,\n 'mode': mode,\n 'total_frames': max_length,\n 'num_gts': len(gts_actions),\n 'num_proposals': len(prop_actions),\n 'proposal_actions': []\n })\n for proposal in prop_actions:\n if proposal['score'] < score_threshold:\n continue\n proposal['start'] = int(proposal['start'] * 1.0 / fps)\n proposal['end'] = int(proposal['end'] * 1.0 / fps)\n gts_info = clc_iou_of_proposal(proposal, gts_actions)\n res_bmn['results'][-1]['proposal_actions'].append(gts_info)\n return res_bmn\ndef save_feature(label_info, out_dir):\n print('save feature ...')\n fps = label_info['fps']\n out_feature_dir = out_dir + '/feature'\n out_feature_dir = os.path.abspath(out_feature_dir)\n if not os.path.exists(out_feature_dir):\n os.mkdir(out_feature_dir)" + }, + { + "comment": "This code is splitting video features into training and validation datasets, handling audio-visual data for football actions. It reads the results from label_info and writes image and audio feature segments to train.txt or val.txt files based on mode (train/val). The code iterates through proposal actions, extracting the corresponding image and audio features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py\":110-129", + "content": " fid_train = open(out_dir + '/train.txt', 'w')\n fid_val = open(out_dir + '/val.txt', 'w')\n for res in label_info['results']:\n basename = os.path.basename(res['url']).split('.')[0]\n print(basename, res['num_proposals'])\n mode = res['mode']\n fid = fid_train if mode == 'train' else fid_val\n feature_path = os.path.join(feat_dir, basename + '.pkl')\n feature_data = pickle.load(open(feature_path, 'rb'))\n image_feature = feature_data['image_feature']\n audio_feature = feature_data['audio_feature']\n max_len_audio = len(audio_feature)\n for proposal in res['proposal_actions']:\n label = proposal['label']\n start_id = proposal['proposal']['start']\n end_id = proposal['proposal']['end']\n # get hit feature\n image_feature_hit = image_feature[start_id * fps:end_id * fps]\n audio_feature_hit = audio_feature[min(start_id, max_len_audio\n ):min(end_id, max_len_audio)]" + }, + { + "comment": "This code saves video features and labels into separate files for training and validation sets. It creates a dictionary of feature information and label, then dumps this data into a .pkl file with the appropriate naming format. 
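The save step described here boils down to pickling one dict per clip and appending a "path label" line to the split file; a minimal sketch with made-up paths, shapes, and labels:

```python
import os
import pickle
import numpy as np

out_dir = './input_for_lstm_demo'          # hypothetical output layout
os.makedirs(os.path.join(out_dir, 'feature'), exist_ok=True)

clip = {
    'image_feature': np.random.rand(40, 2048).astype(np.float32),
    'audio_feature': np.random.rand(10, 640).astype(np.float32),
    'feature_fps': 5,
    'video_name': 'demo_video',
}
label = 3
save_name = os.path.join(out_dir, 'feature', 'demo_video_12_20.pkl')
with open(save_name, 'wb') as f:
    pickle.dump(clip, f, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(out_dir, 'train.txt'), 'a') as fid:
    fid.write('{} {}\n'.format(save_name, label))
```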
Finally, it writes the file name and corresponding label into another file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py\":131-160", + "content": " # save\n anno_info = {\n 'image_feature': np.array(image_feature_hit, dtype=np.float32),\n 'audio_feature': np.array(audio_feature_hit, dtype=np.float32),\n 'feature_fps': fps,\n 'label_info': proposal,\n 'video_name': basename\n }\n save_name = '{}/{}_{}_{}.pkl'.format(out_feature_dir, basename,\n start_id, end_id)\n with open(save_name, 'wb') as f:\n pickle.dump(anno_info, f, protocol=pickle.HIGHEST_PROTOCOL)\n fid.write('{} {}\\n'.format(save_name, label))\n fid_train.close()\n fid_val.close()\n print('done!')\nif __name__ == \"__main__\":\n if not os.path.exists(out_dir):\n os.mkdir(out_dir)\n prop_data = json.load(open(prop_file, 'rb'))\n proposal_data = {}\n for item in prop_data:\n proposal_data[os.path.basename(\n item['video_name'])] = item['bmn_results']\n # get label info\n res_bmn = {'fps': 0, 'results': []}" + }, + { + "comment": "This code reads label files, loads and processes the data, then saves the processed data (label information) and optional features to specific directories.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py\":161-171", + "content": " for item, value in label_files.items():\n label_file = os.path.join(dataset, value)\n gts_data = json.load(open(label_file, 'rb'))\n res_bmn = get_bmn_info(gts_data, proposal_data, res_bmn, item)\n with open(out_dir + '/label_info.json', 'w', encoding='utf-8') as f:\n data = json.dumps(res_bmn, indent=4, ensure_ascii=False)\n f.write(data)\n # save feature\n save_feature(res_bmn, out_dir)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a8948d8c-051c-49b9-a87d-93f20686de6e.json b/docs/doc/a8948d8c-051c-49b9-a87d-93f20686de6e.json new file mode 100644 index 000000000..baaecce37 --- /dev/null +++ b/docs/doc/a8948d8c-051c-49b9-a87d-93f20686de6e.json @@ -0,0 +1,30 @@ +{ + "summary": "This code imports libraries, creates action recognition and data loading classes, and processes video data for training or validation using a pipeline, handling exceptions through retries and logging. It is part of a function that returns arrays of images and labels.", + "details": [ + { + "comment": "This code snippet is importing necessary libraries and registering a new dataset class named SFMRIDataset for action recognition. It uses raw frames from frame files, applies specified transform operations, and loads an index file. The copyright and license information are also included in the beginning of the file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass SFMRIDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The indecx file " + }, + { + "comment": "This code is creating a class for loading an index file containing video information, including the directory of frames, total frames, and label. The constructor takes arguments like the file path, pipeline, data prefix, test mode, and suffix. The load_file function loads the index file to retrieve the video details.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py\":30-60", + "content": "is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n suffix (str): suffix of file. Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"" + }, + { + "comment": "The code reads information from a file and stores it in a list of dictionaries. It then attempts to prepare the frames for training or validation by applying a pipeline, handling potential exceptions within a specified number of retries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py\":61-85", + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(\n frame_dir=frame_dir,\n #suffix=self.suffix,\n frames_len=frames_len,\n labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid gisven index. 
\"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:" + }, + { + "comment": "This code handles error cases when loading data by retrying the operation if an exception occurs. It uses a logger to provide information on the error, the number of retries, and whether or not to try again with a different index. The 'prepare_test' function is responsible for preparing frames for testing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py\":86-107", + "content": " logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return np.array(results['imgs'][0]), np.array(\n results['imgs'][1]), np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)" + }, + { + "comment": "The code is part of a function that returns three arrays: the first image from the 'imgs' list, the second image, and the labels. If there are more images available, the function continues processing them; if not, it returns the stored images and labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py\":108-110", + "content": " continue\n return np.array(results['imgs'][0]), np.array(\n results['imgs'][1]), np.array([results['labels']])" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a899b6a8-5eef-4b1f-b7bf-6b709f297854.json b/docs/doc/a899b6a8-5eef-4b1f-b7bf-6b709f297854.json new file mode 100644 index 000000000..1daf22140 --- /dev/null +++ b/docs/doc/a899b6a8-5eef-4b1f-b7bf-6b709f297854.json @@ -0,0 +1,45 @@ +{ + "summary": "The given code defines a custom loss function, Added_CrossEntropyLoss, which extends nn.Layer class and optionally uses hard example mining for better training by computing the loss for top k percent pixels. This loss function is designed to improve performance in image classification tasks using a weighted sum of binary cross-entropy and pixel loss with top-k pixel selection.", + "details": [ + { + "comment": "This code defines a custom loss function that extends `nn.Layer` and uses BCEWithLogitsLoss from PaddlePaddle. It has an optional argument for top_k_percent_pixels to compute the loss only for the top k percent of pixels. If top_k_percent_pixels is None, it computes the mean loss for all pixels. 
The function also has a hard_example_mining_step parameter that may be used in future implementations but currently unused.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":0-27", + "content": "import paddle\nimport paddle.nn as nn\nimport os\nclass Added_BCEWithLogitsLoss(nn.Layer):\n def __init__(self,\n top_k_percent_pixels=None,\n hard_example_mining_step=100000):\n super(Added_BCEWithLogitsLoss, self).__init__()\n self.top_k_percent_pixels = top_k_percent_pixels\n if top_k_percent_pixels is not None:\n assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1)\n self.hard_example_mining_step = hard_example_mining_step\n if self.top_k_percent_pixels == None:\n self.bceloss = nn.BCEWithLogitsLoss(reduction='mean')\n else:\n self.bceloss = nn.BCEWithLogitsLoss(reduction='none')\n def forward(self, dic_tmp, y, step):\n final_loss = 0\n for seq_name in dic_tmp.keys():\n pred_logits = dic_tmp[seq_name]\n gts = y[seq_name]\n if self.top_k_percent_pixels == None:\n final_loss += self.bceloss(pred_logits, gts)\n else:\n # Only compute the loss for top k percent pixels." + }, + { + "comment": "Computes the loss for all pixels, without adding to loss_collection and keeps the shape. Then, based on hard example mining step, determines the number of top K pixels to consider.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":28-43", + "content": " # First, compute the loss for all pixels. Note we do not put the loss\n # to loss_collection and set reduction = None to keep the shape.\n num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3])\n pred_logits = pred_logits.view(\n -1, pred_logits.shape[1],\n pred_logits.shape[2] * pred_logits.shape[3])\n gts = gts.view(-1, gts.shape[1], gts.shape[2] * gts.shape[3])\n pixel_losses = self.bceloss(pred_logits, gts)\n if self.hard_example_mining_step == 0:\n top_k_pixels = int(self.top_k_percent_pixels * num_pixels)\n else:\n ratio = min(1.0,\n step / float(self.hard_example_mining_step))\n top_k_pixels = int((ratio * self.top_k_percent_pixels +\n (1.0 - ratio)) * num_pixels)\n _, top_k_indices = paddle.topk(pixel_losses," + }, + { + "comment": "This code defines a custom loss function, Added_CrossEntropyLoss, that extends nn.Layer class. It has an optional parameter, top_k_percent_pixels, which determines whether to use hard example mining for better training. If this parameter is None, it falls back to using nn.CrossEntropyLoss with mean reduction. 
The code also initializes other attributes like self.top_k_percent_pixels and self.hard_example_mining_step based on the provided values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":44-66", + "content": " k=top_k_pixels,\n axis=2)\n final_loss += nn.BCEWithLogitsLoss(weight=top_k_indices,\n reduction='mean')(\n pred_logits, gts)\n return final_loss\nclass Added_CrossEntropyLoss(nn.Layer):\n def __init__(self,\n top_k_percent_pixels=None,\n hard_example_mining_step=100000):\n super(Added_CrossEntropyLoss, self).__init__()\n self.top_k_percent_pixels = top_k_percent_pixels\n if top_k_percent_pixels is not None:\n assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1)\n self.hard_example_mining_step = hard_example_mining_step\n if self.top_k_percent_pixels == None:\n self.celoss = nn.CrossEntropyLoss(ignore_index=255,\n reduction='mean')\n else:\n self.celoss = nn.CrossEntropyLoss(ignore_index=255," + }, + { + "comment": "Computes the loss for top k percent pixels by first computing the loss for all pixels, reshaping them, and then selecting only the top k percent.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":67-86", + "content": " reduction='none')\n def forward(self, dic_tmp, y, step):\n final_loss = 0\n for seq_name in dic_tmp.keys():\n pred_logits = dic_tmp[seq_name]\n gts = y[seq_name]\n if self.top_k_percent_pixels == None:\n final_loss += self.celoss(pred_logits, gts)\n else:\n # Only compute the loss for top k percent pixels.\n # First, compute the loss for all pixels. Note we do not put the loss\n # to loss_collection and set reduction = None to keep the shape.\n num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3])\n pred_logits = pred_logits.reshape([\n pred_logits.shape[1],\n pred_logits.shape[2] * pred_logits.shape[3]\n ]).transpose([1, 0])\n gts = gts.reshape([gts.shape[1] * gts.shape[2]])\n pixel_losses = self.celoss(pred_logits, gts).reshape([1, -1])" + }, + { + "comment": "The code defines a class called \"AddedEdge_CrossEntropyLoss\" which extends the base Layer class. It calculates the cross-entropy loss for a classification task while implementing hard example mining and top-k pixel selection strategies to improve performance. The top_k_percent_pixels and hard_example_mining_step parameters control these strategies, with different behavior depending on the current step value. The code block provided calculates the final loss by averaging over the top-k losses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":87-108", + "content": " if self.hard_example_mining_step == 0:\n top_k_pixels = int(self.top_k_percent_pixels * num_pixels)\n else:\n ratio = min(1.0,\n step / float(self.hard_example_mining_step))\n top_k_pixels = int((ratio * self.top_k_percent_pixels +\n (1.0 - ratio)) * num_pixels)\n top_k_loss, top_k_indices = paddle.topk(pixel_losses,\n k=top_k_pixels,\n axis=1)\n final_loss += paddle.mean(top_k_loss)\n return final_loss\nclass AddedEdge_CrossEntropyLoss(nn.Layer):\n def __init__(self,\n top_k_percent_pixels=None,\n hard_example_mining_step=100000):\n super(AddedEdge_CrossEntropyLoss, self).__init__()\n self.top_k_percent_pixels = top_k_percent_pixels\n if top_k_percent_pixels is not None:" + }, + { + "comment": "This code defines a class for a loss function with hard example mining step, top_k_percent_pixels and forward method. 
It calculates weights based on positive and negative numbers, and applies them to the BCEWithLogitsLoss if top_k_percent_pixels is None. The code also calculates the dcloss for cases where gts sum is 0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":109-129", + "content": " assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1)\n self.hard_example_mining_step = hard_example_mining_step\n self.celoss = None\n def forward(self, pred_logits, gts, step):\n pos_num = paddle.sum(gts == 1, dtype='float32')\n neg_num = paddle.sum(gts == 0, dtype='float32')\n weight_pos = neg_num / (pos_num + neg_num)\n weight_neg = pos_num / (pos_num + neg_num)\n weights = paddle.to_tensor([weight_neg, weight_pos])\n if self.top_k_percent_pixels == None:\n sig_pred_logits = paddle.nn.functional.sigmoid(pred_logits)\n self.bceloss = nn.BCEWithLogitsLoss(pos_weight=weight_pos,\n reduction='mean')\n if paddle.sum(gts) == 0:\n dcloss = 0\n else:\n dcloss = (paddle.sum(sig_pred_logits * sig_pred_logits) +\n paddle.sum(gts * gts)) / (\n paddle.sum(2 * sig_pred_logits * gts) + 1e-5)" + }, + { + "comment": "The code calculates the final loss for an image classification task. If the step is not zero, it uses hard example mining to calculate the pixel losses and select top K pixels based on a ratio of the current step. The final_loss is a weighted sum of binary cross-entropy (bceloss) and pixel loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":130-147", + "content": " final_loss = 0.1 * self.bceloss(pred_logits, gts) + dcloss\n else:\n self.celoss = nn.CrossEntropyLoss(weight=weights,\n ignore_index=255,\n reduction='none')\n num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3])\n pred_logits = pred_logits.view(\n -1, pred_logits.shape[1],\n pred_logits.shape[2] * pred_logits.shape[3])\n gts = gts.view(-1, gts.shape[2] * gts.shape[3])\n pixel_losses = self.celoss(pred_logits, gts)\n if self.hard_example_mining_step == 0:\n top_k_pixels = int(self.top_k_percent_pixels * num_pixels)\n else:\n ratio = min(1.0, step / float(self.hard_example_mining_step))\n top_k_pixels = int((ratio * self.top_k_percent_pixels +\n (1.0 - ratio)) * num_pixels)\n top_k_loss, top_k_indices = paddle.topk(pixel_losses," + }, + { + "comment": "This code calculates the mean loss value by taking top-k pixel values from input images, and then averages them. This can be useful in image recognition tasks where some pixels have higher importance or relevance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/loss.py\":148-152", + "content": " k=top_k_pixels,\n axis=1)\n final_loss = paddle.mean(top_k_loss)\n return final_loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a8ed17f3-26c4-40e5-975f-47afb18bc21b.json b/docs/doc/a8ed17f3-26c4-40e5-975f-47afb18bc21b.json new file mode 100644 index 000000000..b0148f26c --- /dev/null +++ b/docs/doc/a8ed17f3-26c4-40e5-975f-47afb18bc21b.json @@ -0,0 +1,50 @@ +{ + "summary": "This code trains TSM model using ResNet-50, PaddlePaddle, and AMP on UCF-101 and Kinetics-400 datasets with Momentum optimization and L2_Decay. It supports three sampling methods, provides training details, and gives inference instructions.", + "details": [ + { + "comment": "This code implements the TSM (Temporal Shift Module) model for video understanding using a single RGB stream and ResNet-50 as the backbone. 
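Stripped of the class machinery, the top-k-percent-pixels idea running through the losses in the previous entry can be sketched in a few lines of Paddle; the helper name and tensor shapes are hypothetical, and the step-dependent ramp controlled by hard_example_mining_step is left out for brevity.

```python
import paddle
import paddle.nn as nn

def top_k_pixel_bce(pred_logits, gts, top_k_percent=0.15):
    """Per-pixel BCE-with-logits loss averaged over only the hardest pixels.

    pred_logits, gts: float tensors of shape [N, H*W]; top_k_percent in (0, 1).
    """
    pixel_losses = nn.BCEWithLogitsLoss(reduction='none')(pred_logits, gts)
    num_pixels = pixel_losses.shape[1]
    k = max(1, int(top_k_percent * num_pixels))
    top_k_losses, _ = paddle.topk(pixel_losses, k=k, axis=1)
    return paddle.mean(top_k_losses)

logits = paddle.randn([2, 64 * 64])
targets = paddle.cast(paddle.rand([2, 64 * 64]) > 0.5, 'float32')
print(top_k_pixel_bce(logits, targets))
```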
It follows the ICCV 2019 paper for details, and requires data from Kinetics-400 which can be downloaded and prepared according to the provided instructions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":0-32", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/tsm.md) | English\n# TSM\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Details](#Details)\n- [Reference](#Reference)\n## Introduction\nTemporal Shift Module (TSM) is a popular model that attracts more attention at present.\nThe method of moving through channels greatly improves the utilization ability of temporal information without increasing any\nadditional number of parameters and calculation amount.\nMoreover, due to its lightweight and efficient characteristics, it is very suitable for industrial landing.\n
\n
\n
\nThis code implemented **single RGB stream** of TSM networks. Backbone is ResNet-50.\nPlease refer to the ICCV 2019 paper for details [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf)\n## Data\nPlease refer to Kinetics-400 data download and preparation [k400 data preparation](../../dataset/k400.md)" + }, + { + "comment": "This code explains how to train a TSM (Temporal Shift Module) model on the Kinetics-400 dataset using the PaddleVideo framework. The user needs to download and replace the pretrained ResNet50_pretrain.pdparams model, then specify the new weight path in the tsm_k400_frames.yaml configuration file. Training can be started by running a specific command based on the desired configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":34-61", + "content": "Please refer to UCF101 data download and preparation [ucf101 data preparation](../../dataset/ucf101.md)\n## Train\n### Train on the Kinetics-400 dataset\n#### download pretrain-model\n1. Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as pretraind model:\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/tsm/tsm_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:`\n ```bash\n MODEL:\n \tframework: \"Recognizer2D\"\n \t\tbackbone:\n \t\tname: \"ResNetTSM\"\n \t\tpretrained: your weight path\n ```\n#### Start training\n- By specifying different configuration files, different data formats/data sets can be used for training. Taking the training configuration of Kinetics-400 data set + 8 cards + frames format as an example, the startup command is as follows (more training commands can be viewed in `PaddleVideo/run.sh`)." + }, + { + "comment": "This code snippet is running a PaddlePaddle (a deep learning framework) script to train the TSM (Temporal Shift Module) model on the Kinetics-400 dataset. The model is trained for videos and frames formats separately, utilizing Automatic Mixed Precision (AMP) for faster training with some environment variable settings. 
AMP works better with the NHWC data format and needs specific environment variable configurations as well.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":63-90", + "content": " ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n ```\n- Training Kinetics-400 dataset of videos format using scripts.\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_videos.yaml\n ```\n- AMP is useful for speeding up training, scripts as follows:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml\n```\n- AMP works better with `NHWC` data format, scripts as follows:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.l" + }, + { + "comment": "This code snippet is for training the TSM (Temporal Shift Module) model on the UCF-101 dataset. It involves loading a pre-trained model, specifying the configuration file, and using 8 GPUs for training. The command launches the model with amp (automatic mixed precision) and validation mode. The provided link shows how to download the pre-trained TSM_k400 model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":90-117", + "content": "aunch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml\n```\n- For the config file usage\uff0cplease refer to [config](../../tutorials/config.md).\n### Train on UCF-101 dataset\n#### download pretrain-model\n- Load the TSM model we trained on Kinetics-400 [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams), or download it through the command line\n ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams\n ```\n- Open `PaddleVideo/configs/recognition/tsm/tsm_ucf101_frames.yaml`, and fill in the downloaded weight path below `pretrained:`\n ```bash\n MODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNetTSM\"\n pretrained: your weight path\n ```\n#### Start training\n- By specifying different configuration files, different data formats/data sets can be used for training. Taking the training configuration of Kinetics-400 data set + 8 cards" + }, + { + "comment": "This code snippet provides commands to train the TSM (Temporal Shift Module) model on the UCF-101 dataset using PaddleVideo. It also demonstrates how to use AMP (Automatic Mixed Precision) for faster training and shows that it works better with `NHWC` data format. 
The provided commands can be executed in a terminal, specifying the required arguments like GPUs, log directory, and configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":117-143", + "content": " + frames format as an example, the startup command is as follows (more training commands can be viewed in `PaddleVideo/run.sh`).\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml\n ```\n- Training UCF-101 dataset of videos format using scripts.\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_videos.yaml\n ```\n- AMP is useful for speeding up training, scripts as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 #MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml\n ```\n- AMP works better with `NHWC` data format, scripts as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 #MB\n export FLAGS_cudnn_exhaustive_search=1" + }, + { + "comment": "This code exports the flag for CUDNN batch normalization spatial persistent and runs a Python script to test the model with specified configuration files. The testing command takes the best model weights from a directory and evaluates the accuracy on validation datasets of Kinetics-400 and UCF-101.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":144-165", + "content": " export FLAGS_cudnn_batchnorm_spatial_persistent=1\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames_nhwc.yaml\n ```\n## Test\nPut the weight of the model to be tested into the `output/TSM/` directory, the test command is as follows\n```bash\npython3 main.py --test -c configs/recognition/tsm/tsm.yaml -w output/TSM/TSM_best.pdparams\n```\n---\nWhen the test configuration uses the following parameters, the evaluation accuracy on the validation data set of Kinetics-400 is as follows:\n| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |\n| :--------: | :---------------: | :-------: | :-----------: | :-----: | :-----------: | :-----------: |\n| ResNet50 | Uniform | NCHW | 8 | 224 | 71.06 | [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) |\nWhen the test configuration uses the following parameters, the evaluation accuracy on the validation data set of UCF-101 is as follows:" + }, + { + "comment": "This code provides information about different TSM (Temporal Shift Module) models trained using ResNet50 backbone with three sampling methods: Uniform, NCHW, NHWC+AMP. It shows the training strategy, number of segments, target size, and Top-1 accuracy for each model. 
It also mentions where to find the corresponding checkpoints and provides instructions on how to export the inference model using Python script.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":167-180", + "content": "| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :-------------: | :-----------------: | :-----: | :---------: | :---: | :---------: |\n| ResNet50 | Uniform | NCHW | 8 | 224 | 94.42 | [TSM_ucf101_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_nchw.pdparams) |\n| ResNet50 | Uniform | NCHW+AMP | 8 | 224 | 94.40 | [TSM_ucf101_amp_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nchw.pdparams) |\n| ResNet50 | Uniform | NHWC+AMP | 8 | 224 | 94.55 | [TSM_ucf101_amp_nhwc.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nhwc.pdparams) |\n## Inference\n### export inference model\nTo get model architecture file `TSM.pdmodel` and parameters file `TSM.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml \\" + }, + { + "comment": "This code is running a model inference for TSM (Temporal Shift Module) on an input video file using PaddlePaddle framework. It specifies the necessary arguments including the input file, configuration file, and model files. The --use_gpu and --use_tensorrt options are set to True and False respectively. The data processing step involves dividing the video into segments, extracting frames randomly, and applying random data enhancement.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":181-202", + "content": " -p data/TSM_k400.pdparams \\\n -o inference/TSM\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/tsm/tsm_k400_frames.yaml \\\n --model_file inference/TSM/TSM.pdmodel \\\n --params_file inference/TSM/TSM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n## Implementation details\n### data processing\n- The model reads the `mp4` data in the Kinetics-400 data set, first divides each piece of video data into `num_seg` segments, and then uniformly extracts 1 frame of image from each segment to obtain sparsely sampled `num_seg` video frames. Then do the same random data enhancement to this `n" + }, + { + "comment": "The code outlines the training strategy for TSM (Temporal Shift Module) model, which includes using Momentum optimization algorithm with L2_Decay, global gradient clipping, and attenuating the learning rate at certain epochs. 
It also specifies the total number of epochs, learning rates for FC layer weights and biases, Dropout ratio, and parameter initialization methods.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tsm.md\":202-220", + "content": "um_seg` frame image, including multi-scale random cropping, random left and right flips, data normalization, etc., and finally zoom to `target_size`.\n### Training strategy\n* Use Momentum optimization algorithm training, momentum=0.9\n* Using L2_Decay, the weight attenuation coefficient is 1e-4\n* Using global gradient clipping, the clipping factor is 20.0\n* The total number of epochs is 50, and the learning rate will be attenuated by 0.1 times when the epoch reaches 20 and 40\n* The learning rate of the weight and bias of the FC layer are respectively 5 times and 10 times the overall learning rate, and the bias does not set L2_Decay\n* Dropout_ratio=0.5\n### Parameter initialization\n- Initialize the weight of the FC layer with the normal distribution of Normal(mean=0, std=0.001), and initialize the bias of the FC layer with a constant of 0\n## Reference\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a99d6a07-98f2-4236-a972-9c1f702833c9.json b/docs/doc/a99d6a07-98f2-4236-a972-9c1f702833c9.json new file mode 100644 index 000000000..7e63c8c15 --- /dev/null +++ b/docs/doc/a99d6a07-98f2-4236-a972-9c1f702833c9.json @@ -0,0 +1,30 @@ +{ + "summary": "EIVideo is a Windows-based video annotation tool using Baidu Paddle MA-Net model, maintained by QPT-Family on GitHub and available in pre-release/stable versions with customization, usage instructions, updates, and licensing details.", + "details": [ + { + "comment": "EIVideo is an interactive intelligent video annotation tool, available for Windows systems starting from Win7. It has downloadable packages for both EIVideo and QEIVideo users, with options to choose pre-releases or the latest stable version. The tool features a user-friendly interface and is actively maintained under the QPT-Family organization on GitHub, with an open license for use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/README.md\":0-14", + "content": "# EIVideo - \u4ea4\u4e92\u5f0f\u667a\u80fd\u89c6\u9891\u6807\u6ce8\u5de5\u5177\n[![Downloads](https://static.pepy.tech/personalized-badge/eivideo?period=total&units=international_system&left_color=grey&right_color=orange&left_text=EIVideo%20User)](https://pepy.tech/project/eivideo)\n[![Downloads](https://static.pepy.tech/personalized-badge/qeivideo?period=total&units=international_system&left_color=grey&right_color=orange&left_text=QEIVideo%20User)](https://pepy.tech/project/qeivideo)\n![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/QPT-Family/EIVideo?include_prereleases)\n![GitHub forks](https://img.shields.io/github/forks/QPT-Family/EIVideo)\n![GitHub Repo stars](https://img.shields.io/github/stars/QPT-Family/EIVideo)\n![GitHub](https://img.shields.io/github/license/QPT-Family/EIVideo)\n![](https://img.shields.io/badge/%E6%B7%B1%E5%BA%A6%E9%80%82%E9%85%8D->Win7-9cf)\n---\n
\n\"\u56fe\u7247\"" + }, + { + "comment": "EIVideo: Interactive intelligent video annotation toolbox, based on the Baidu Paddle MA-Net interactive video segmentation model. Can complete full video annotation with simple frame tagging. Improves video segmentation quality through multiple interactions with the video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/README.md\":15-48", + "content": "
\nEIVideo\uff0c\u57fa\u4e8e\u767e\u5ea6\u98de\u6868MA-Net\u4ea4\u4e92\u5f0f\u89c6\u9891\u5206\u5272\u6a21\u578b\u6253\u9020\u7684\u4ea4\u4e92\u5f0f**\u667a\u80fd\u89c6\u9891**\u6807\u6ce8\u5de5\u5177\u7bb1\uff0c\u53ea\u9700\u7b80\u5355\u6807\u6ce8\u51e0\u5e27\uff0c\u5373\u53ef\u5b8c\u6210\u5168\u89c6\u9891\u6807\u6ce8\uff0c\u82e5\u81ea\u52a8\u6807\u6ce8\u7ed3\u679c\u672a\u8fbe\u8981\u6c42\u8fd8\u53ef\u901a\u8fc7\u591a\u6b21\u548c\u89c6\u9891\u4ea4\u4e92\u800c\u4e0d\u65ad\u63d0\u5347\u89c6\u9891\u5206\u5272\u8d28\u91cf\uff0c\u76f4\u81f3\u5bf9\u5206\u5272\u8d28\u91cf\u6ee1\u610f\u3002 \n\u6233 -> \u4e86\u89e3\u76f8\u5173[\u6280\u672f\u6587\u7ae0&\u6a21\u578b\u539f\u7406](\u7b49\u5f85\u5fae\u4fe1\u516c\u4f17\u53f7)\n
\n\"\u56fe\u7247\"\n
\n> \u4e3a\u4e86\u66f4\u597d\u7684\u89e3\u653e\u53cc\u624b\uff0c\u6211\u4eec\u8fd8\u63d0\u4f9b\u4e86\u56fe\u5f62\u5316\u754c\u9762\u5de5\u5177QEIVideo\uff0c\u901a\u8fc7\u5b83\u6211\u4eec\u53ef\u4ee5\u4e0d\u4f7f\u7528\u7e41\u6742\u7684\u547d\u4ee4\u65b9\u5f0f\u6765\u5b8c\u6210\u89c6\u9891\u7684\u667a\u80fd\u6807\u6ce8\u5de5\u4f5c\u3002\n---\n### README\u76ee\u5f55\n- [EAP - The Early Access Program \u65e9\u671f\u8bbf\u95ee\u8ba1\u5212](#eap---the-early-access-program-\u65e9\u671f\u8bbf\u95ee\u8ba1\u5212)\n- [\u4f7f\u7528\u65b9\u5f0f](#\u4f7f\u7528\u65b9\u5f0f)\n - [\u5b89\u88c5&\u8fd0\u884c](#\u5b89\u88c5\u8fd0\u884c)\n - [QPT\u5305 - \u9002\u5408\u65e0Python\u57fa\u7840\u7528\u6237](#qpt\u5305---\u9002\u5408\u65e0python\u57fa\u7840\u7528\u6237)\n - [\u6807\u51c6Python\u5305 - \u9002\u5408\u666e\u901aPython\u5f00\u53d1\u8005](#\u6807\u51c6python\u5305---\u9002\u5408\u666e\u901apython\u5f00\u53d1\u8005)\n - [\u5f00\u53d1\u7248\u672c - \u9002\u5408\u9ad8\u9636\u5f00\u53d1\u8005\u8fdb\u884c\u5f00\u53d1/\u793e\u533a\u8d21\u732e](#\u5f00\u53d1\u7248\u672c---\u9002\u5408\u9ad8\u9636\u5f00\u53d1\u8005\u8fdb\u884c\u5f00\u53d1\u793e\u533a\u8d21\u732e)\n- [(Q)EIVideo\u4ea7\u54c1\u89c4\u5212\u5b89\u6392](#qeivideo\u4ea7\u54c1\u89c4\u5212\u5b89\u6392)\n- [\u5f00\u6e90\u534f\u8bae](#\u5f00\u6e90\u534f\u8bae)\n---\n### EAP - The Early Access Program \u65e9\u671f\u8bbf\u95ee\u8ba1\u5212\n> Warning \u5f53\u524d\u56fe\u5f62\u5316\u754c\u9762QEIVideo\u5904\u4e8e**\u6781\u5176\u521d\u9636**\u7684...\u5efa\u8bbe\u9636\u6bb5\uff0c\u5e76\u4e0d\u80fd\u4fdd\u8bc1\u7a0b\u5e8f\u7a33\u5b9a\u6027\u3002\n
\"\u56fe\u7247\"
\n\u5f53\u60a8\u9009\u62e9\u4f7f\u7528QEIVideo\u4f5c\u4e3a\u56fe\u5f62\u5316\u754c\u9762\u65f6\uff0c\u5373\u53ef\u89c6\u4e3a\u540c\u610f\u4f7f\u7528\u201c\u53ef\u80fd\u4f1a\u5b58\u5728\u5927\u91cf\u4f53\u9a8c\u4e0d\u4f73\u201d\u7684EAP\u4ea7\u54c1\u3002" + }, + { + "comment": "Code is introducing the user to EIVideo, a customizable interactive video annotation model based on PaddleVideo, with instructions for installation and usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/README.md\":50-84", + "content": "\u540c\u6837\uff0c\u60a8\u53ef\u9009\u62e9\u501f\u52a9\u57fa\u4e8e[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo) \u5b9e\u73b0\u7684\n\u4ea4\u4e92\u5f0f\u89c6\u9891\u6807\u6ce8\u6a21\u578b[EIVideo](https://github.com/QPT-Family/EIVideo/EIVideo) \u8fdb\u884c\u4e8c\u6b21\u5f00\u53d1\uff0c\u5728\u6b64\u4e4b\u4e0a\u4e5f\u53ef\u5b8c\u6210\u60a8\u9700\u8981\u7684\u81ea\u5b9a\u4e49\u56fe\u5f62\u5316\u754c\u9762\uff0c\u540e\u7eed\u4e5f\u5c06\u63d0\u4f9b\u4e8c\u6b21\u5f00\u53d1\u6307\u5357\u3002\n
\"\u56fe\u7247\"
\n> \u5982\u679c\u60a8\u613f\u610f\u53c2\u4e0e\u5230EIVideo\u6216QEIVideo\u7684\u5efa\u8bbe\u4e2d\u6765\uff0c\u6b22\u8fce\u60a8\u4e0ePMC\u53d6\u5f97\u8054\u7cfb -> WX:GT_ZhangAcer \n## \u4f7f\u7528\u65b9\u5f0f\n### \u5b89\u88c5&\u8fd0\u884c\n#### QPT\u5305 - \u9002\u5408\u65e0Python\u57fa\u7840\u7528\u6237\n\u81ea\u52a8\u5316\u914d\u7f6e\u76f8\u5173Python\u73af\u5883\uff0c\u4f46\u4ec5\u652f\u6301Windows7/10/11\u64cd\u4f5c\u7cfb\u7edf\uff0c\u4e14\u4e0d\u5bf9\u76d7\u7248Windows7\u505a\u4efb\u4f55\u9002\u914d\u3002 \n\u4e0b\u8f7d\u5730\u5740\uff1a\u6682\u672a\u4e0a\u4f20\n> \u81ea\u52a8\u5316\u90e8\u7f72\u5de5\u5177\u7531[QPT - \u81ea\u52a8\u5c01\u88c5\u5de5\u5177](https://github.com/QPT-Family/QPT) \u652f\u6301 \n#### \u6807\u51c6Python\u5305 - \u9002\u5408\u666e\u901aPython\u5f00\u53d1\u8005\n* \u56fd\u9645\u65b9\u5f0f\uff1a\n ```shell\n python -m pip install eivideo\n python qeivideo\n ```\n* \u56fd\u5185\u63a8\u8350\uff1a\n ```shell\n python -m pip install eivideo -i https://mirrors.bfsu.edu.cn/pypi/web/simple\n python qeivideo\n ```\n> \u4e0a\u8ff0\u547d\u4ee4\u4ec5\u9002\u7528\u4e8e\u5e38\u89c4\u60c5\u51b5\uff0c\u82e5\u60a8\u5b89\u88c5\u4e86\u591a\u4e2aPython\u6216\u4fee\u6539\u4e86\u76f8\u5173\u5f00\u53d1\u5de5\u5177\u4e0e\u914d\u7f6e\uff0c\u8bf7\u81ea\u884c\u4fee\u6539\u76f8\u5173\u547d\u4ee4\u4f7f\u5176\u7b26\u5408\u60a8\u7684\u5f00\u53d1\u73af\u5883\u3002\n#### \u5f00\u53d1\u7248\u672c - \u9002\u5408\u9ad8\u9636\u5f00\u53d1\u8005\u8fdb\u884c\u5f00\u53d1/\u793e\u533a\u8d21\u732e\n* \u56fd\u9645\u65b9\u5f0f\uff1a\n ```shell\n git clone https://github.com/QPT-Family/EIVideo.git\n python -m pip install -r requirements.txt\n ```" + }, + { + "comment": "This code provides instructions for cloning the EIVideo repository, installing necessary dependencies, and running the QEIVideo application. It also mentions that these commands are suitable for regular cases, and users might need to modify them according to their specific development environment. The code discusses the product roadmap of (Q)EIVideo, including planned features and versions. 
It also specifies the open-source license used for this project and clarifies that the code and parameters cannot be directly used for commercial purposes without prior consent from the developers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/README.md\":85-118", + "content": "* \u56fd\u5185\u63a8\u8350\uff1a\n ```shell\n # \u8bf7\u52ff\u7528\u4e8ePush\uff01\uff01\uff01\n git clone https://hub.fastgit.org/QPT-Family/EIVideo.git\n python -m pip install -r requirements.txt -i https://mirrors.bfsu.edu.cn/pypi/web/simple\n ```\n* \u8fd0\u884c\u7a0b\u5e8f\n ```shell\n # \u8fdb\u5165\u5de5\u4f5c\u76ee\u5f55\n cd \u6b64\u5904\u586b\u5199EIVideo\u6240\u5728\u7684\u76ee\u5f55\u7684\u7edd\u5bf9\u8def\u5f84\uff0c\u4e14\u8be5\u76ee\u5f55\u4e0b\u62e5\u6709EIVideo\u4e0eQEIVideo\u4e24\u6587\u4ef6\u5939\u3002\n # \u8fd0\u884c\n python QEIVideo/start.py\n # \u5982\u8fd0\u884c\u65f6\u65e0\u6cd5\u627e\u5230\u5bf9\u5e94\u5305\uff0c\u53ef\u9009\u62e9\u4e0b\u8ff0\u65b9\u5f0f\u6dfb\u52a0\u73af\u5883\u53d8\u91cf\u6765\u8c03\u6574\u7d22\u5f15\u6b21\u5e8f\u540e\u6267\u884cpython\n # Windows\n set PYTHONPATH=$pwd:$PYTHONPATH\n # Linux\n export PYTHONPATH=$pwd:$PYTHONPATH\n ```\n> \u4e0a\u8ff0\u547d\u4ee4\u4ec5\u9002\u7528\u4e8e\u5e38\u89c4\u60c5\u51b5\uff0c\u82e5\u60a8\u5b89\u88c5\u4e86\u591a\u4e2aPython\u6216\u4fee\u6539\u4e86\u76f8\u5173\u5f00\u53d1\u5de5\u5177\u4e0e\u914d\u7f6e\uff0c\u8bf7\u81ea\u884c\u4fee\u6539\u76f8\u5173\u547d\u4ee4\u4f7f\u5176\u7b26\u5408\u60a8\u7684\u5f00\u53d1\u73af\u5883\u3002\n## (Q)EIVideo\u4ea7\u54c1\u89c4\u5212\u5b89\u6392 \n> \u7531\u4e8eQEIVideo\u7531\u98de\u6868\u5f00\u6e90\u793e\u533a\u5b66\u751f\u7231\u597d\u8005\u6784\u6210\uff0c\u6240\u4ee5\u5728\u9879\u76ee\u7684\u4ea7\u51fa\u8fc7\u7a0b\u4e2d\u5c06\u4f1a\u4ee5\u5b66\u4e60\u4e3a\u4e3b\u8fdb\u884c\u5f00\u6e90\u8d21\u732e\uff0c\u5982\u60a8\u539f\u56e0\u4e0e\u6211\u4eec\u4e00\u540c\u5efa\u8bbe\uff0c\u6211\u4eec\u4e5f\u5c06\u975e\u5e38\u6b22\u8fce~\n
\"\u56fe\u7247\"
\n- [x] EIVideo\u4e0eDemo\u7248QEIVideo\u53d1\u5e030.1.0Alpha\u7248\u672c\n- [ ] \u5b8c\u5584QEIVideo\uff0c\u4e30\u5bcc\u57fa\u7840\u6807\u6ce8\u529f\u80fd\uff0c\u4e8eQ1\u5347\u7ea7\u81f31.0Alpha\u7248\u672c\n- [ ] \u56de\u5f52QEIVideo\u7a33\u5b9a\u6027\uff0c\u4e8eQ2\u5b8c\u62101.0\u6b63\u5f0f\u7248\u672c\u53d1\u7248\n- [ ] \u589e\u52a0\u89c6\u9891\u76ee\u6807\u68c0\u6d4b\u3001\u5206\u7c7b\u4efb\u52a1\u7684\u4ea4\u4e92\u5f0f\u6807\u6ce8\u529f\u80fd\u3002\n### \u5f00\u6e90\u534f\u8bae\n\u672c\u9879\u76ee\u4f7f\u7528GNU LESSER GENERAL PUBLIC LICENSE(LGPL)\u5f00\u6e90\u534f\u8bae\u3002 \n> \u56e0\u6240\u4f7f\u7528\u7684\u6a21\u578b\u4e0e\u6570\u636e\u96c6\u7b49\u539f\u56e0\uff0c\u672c\u9879\u76ee\u4e2d\u4efb\u4e00\u4ee3\u7801\u3001\u53c2\u6570\u5747\u4e0d\u53ef\u76f4\u63a5\u8fdb\u884c\u5546\u7528\uff0c\u5982\u9700\u5546\u7528\u8bf7\u4e0e\u6211\u4eec\u53d6\u5f97\u8054\u7cfb\u3002" + }, + { + "comment": "This code block provides the reference sources for the EIVideo model and its related resources, as well as mentioning the origin of some emoji used in the project.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/README.md\":120-122", + "content": "### \u5f15\u7528\u6765\u6e90\n1. EIVideo\u6a21\u578b\u4ee5\u53ca\u76f8\u5173\u6e90\u7801\u3001\u8bba\u6587\u4e0e\u9879\u76ee - [PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo)\n2. \u90e8\u5206\u8868\u60c5\u5305\u6765\u6e90 - [\u7518\u57ce\u306a\u3064\u304d](https://www.pixiv.net/users/3036679)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/aa25705c-75de-4665-bdcc-bdb5440b0d48.json b/docs/doc/aa25705c-75de-4665-bdcc-bdb5440b0d48.json new file mode 100644 index 000000000..3ccedf02c --- /dev/null +++ b/docs/doc/aa25705c-75de-4665-bdcc-bdb5440b0d48.json @@ -0,0 +1,10 @@ +{ + "summary": "The code is executing a Python script (tools/export_model.py) with specific parameters to export the best model from ./configs/recognition/tsm/pptsm.yaml, save it as ./output/ppTSM/ppTSM\\_best.pdparams and store the inference files in ./inference/. It will use 32 segments for processing.", + "details": [ + { + "comment": "The code is executing a Python script (tools/export_model.py) with specific parameters to export the best model from ./configs/recognition/tsm/pptsm.yaml, save it as ./output/ppTSM/ppTSM\\_best.pdparams and store the inference files in ./inference/. It will use 32 segments for processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/save_model.sh\":0-4", + "content": "python tools/export_model.py \\\n -c ./configs/recognition/tsm/pptsm.yaml \\\n -p ./output/ppTSM/ppTSM_best.pdparams \\\n -o ./inference/ \\\n --num_seg=32 " + } + ] +} \ No newline at end of file diff --git a/docs/doc/aa52229a-f66f-49fa-8f7e-59b2ef30f610.json b/docs/doc/aa52229a-f66f-49fa-8f7e-59b2ef30f610.json new file mode 100644 index 000000000..90c039f30 --- /dev/null +++ b/docs/doc/aa52229a-f66f-49fa-8f7e-59b2ef30f610.json @@ -0,0 +1,45 @@ +{ + "summary": "The README guides using PaddleVideo's Fight Recognition model for detecting fight and non-fight videos across four datasets. It includes data preparation, training, evaluation, exporting, quickstart guidance, and GPU usage control.", + "details": [ + { + "comment": "This README provides an overview of the Fight Recognition model using PaddleVideo, including sections on quick start, data preparation, model training, evaluation, and model export. 
The PP-TSM model is used for fight recognition and can be adapted from the existing PP-TSM video classification model training process. Quickstart instructions and download links are provided, along with information on where to find additional usage guidance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":0-28", + "content": "# \u6253\u67b6\u8bc6\u522b\u6a21\u578b\n## \u5185\u5bb9\n- [1 \u5feb\u901f\u5f00\u59cb](#\u5feb\u901f\u5f00\u59cb)\n- [2 \u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n - [2.1 \u6570\u636e\u96c6\u4e0b\u8f7d](#\u6570\u636e\u96c6\u4e0b\u8f7d)\n - [2.2 \u89c6\u9891\u62bd\u5e27](#\u89c6\u9891\u62bd\u5e27)\n - [2.3 \u8bad\u7ec3\u96c6\u548c\u9a8c\u8bc1\u96c6\u5212\u5206](#\u8bad\u7ec3\u96c6\u548c\u9a8c\u8bc1\u96c6\u5212\u5206)\n - [2.4 \u89c6\u9891\u88c1\u526a](#\u89c6\u9891\u88c1\u526a)\n- [3 \u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [4 \u6a21\u578b\u8bc4\u4f30](#\u6a21\u578b\u8bc4\u4f30)\n- [5 \u6a21\u578b\u5bfc\u51fa](#\u6a21\u578b\u5bfc\u51fa)\n\u5b9e\u65f6\u884c\u4eba\u5206\u6790\u5de5\u5177[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)\u4e2d\u96c6\u6210\u4e86\u89c6\u9891\u5206\u7c7b\u7684\u6253\u67b6\u8bc6\u522b\u6a21\u5757\u3002\u672c\u6587\u6863\u4ecb\u7ecd\u5982\u4f55\u57fa\u4e8e[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo/)\uff0c\u5b8c\u6210\u6253\u67b6\u8bc6\u522b\u6a21\u578b\u7684\u8bad\u7ec3\u6d41\u7a0b\u3002\n\u76ee\u524d\u6253\u67b6\u8bc6\u522b\u6a21\u578b\u4f7f\u7528\u7684\u662f[PP-TSM](https://github.com/PaddlePaddle/PaddleVideo/blob/63c88a435e98c6fcaf353429d2df6cc24b8113ba/docs/zh-CN/model_zoo/recognition/pp-tsm.md)\uff0c\u5e76\u5728PP-TSM\u89c6\u9891\u5206\u7c7b\u6a21\u578b\u8bad\u7ec3\u6d41\u7a0b\u7684\u57fa\u7840\u4e0a\u4fee\u6539\u9002\u914d\uff0c\u5b8c\u6210\u6a21\u578b\u8bad\u7ec3\u3002\n\u8bf7\u5148\u53c2\u8003[\u4f7f\u7528\u8bf4\u660e](https://github.com/XYZ-916/PaddleVideo/blob/develop/docs/zh-CN/usage.md)\u4e86\u89e3PaddleVideo\u6a21\u578b\u5e93\u7684\u4f7f\u7528\u3002\n\n## 1 \u5feb\u901f\u5f00\u59cb\n\u6253\u67b6\u8bc6\u522b\u9759\u6001\u56fe\u6a21\u578b\u83b7\u53d6[https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.zip](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.zip)\u3002\n\u6253\u67b6\u8bc6\u522b[demo](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/fight_demo.mp4)\u3002\n\u9996\u5148\u9700\u8981\u5c06\u4e0b\u8f7d\u597d\u7684\u9759\u6001\u56fe\u6a21\u578b\u89e3\u538b\u5e76\u653e\u5230`inference`\u76ee\u5f55\u4e0b\uff0c\u7136\u540e\u6267\u884c\u4e0b\u9762\u7684\u547d\u4ee4\u5373\u53ef\u76f4\u63a5\u5224\u65ad\u4e00\u4e2a\u7ed9\u5b9a\u7684\u89c6\u9891\u4e2d\u662f\u5426\u5b58\u5728\u6253\u67b6\u884c\u4e3a\uff1a" + }, + { + "comment": "This code is executing a Python script named \"predict.py\" in PaddleVideo's root directory, to predict fight events from a video file named 'fight.avi'. 
It uses the pre-trained pptsm_fight_frames_dense model and sets GPU usage and TensorRT as False.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":30-54", + "content": "```\ncd ${PaddleVideo_root}\npython tools/predict.py --input_file fight.avi \\\n --config pptsm_fight_frames_dense.yaml \\\n --model_file inference/ppTSM/ppTSM.pdmodel \\\n --params_file inference/ppTSM/ppTSM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\n\n## 2 \u6570\u636e\u51c6\u5907\nPP-TSM\u662f\u4e00\u4e2a\u57fa\u4e8e\u89c6\u9891\u7247\u6bb5\u8fdb\u884c\u9884\u6d4b\u7684\u6a21\u578b\u3002\u5728PaddleVideo\u4e2d\uff0c\u8bad\u7ec3\u6570\u636e\u4e3a`.mp4`\u3001`.avi`\u7b49\u683c\u5f0f\u89c6\u9891\u6216\u8005\u662f\u62bd\u5e27\u540e\u7684\u89c6\u9891\u5e27\u5e8f\u5217\uff0c\u6807\u7b7e\u5219\u53ef\u4ee5\u662f`.txt`\u683c\u5f0f\u5b58\u50a8\u7684\u6587\u4ef6\u3002\n\n### 2.1 \u6570\u636e\u96c6\u4e0b\u8f7d\n\u672c\u9879\u76ee\u57fa\u4e8e6\u4e2a\u516c\u5f00\u7684\u6253\u67b6\u3001\u66b4\u529b\u884c\u4e3a\u76f8\u5173\u6570\u636e\u96c6\u5408\u5e76\u540e\u7684\u6570\u636e\u8fdb\u884c\u6a21\u578b\u8bad\u7ec3\u3002\u516c\u5f00\u6570\u636e\u96c6\u5177\u4f53\u4fe1\u606f\u5982\u4e0b\uff1a\n| \u6570\u636e\u96c6 | \u4e0b\u8f7d\u8fde\u63a5 | \u7b80\u4ecb | \u6807\u6ce8 | \u6570\u91cf | \u65f6\u957f |\n| ---- | ---- | ---------- | ---- | ---- | ---------- |\n| Surveillance Camera Fight Dataset| https://github.com/sayibet/fight-detection-surv-dataset | \u88c1\u526a\u89c6\u9891\uff0c\u76d1\u63a7\u89c6\u89d2 | \u89c6\u9891\u7ea7\u522b | \u6253\u67b6\uff1a150\uff1b\u975e\u6253\u67b6\uff1a150 | 2s |\n| A Dataset for Automatic Violence Detection in Videos | https://github.com/airtlab/A-Dataset-for-Automatic-Violence-Detection-in-Videos | \u88c1\u526a\u89c6\u9891\uff0c\u5ba4\u5185\u81ea\u884c\u5f55\u5236 | \u89c6\u9891\u7ea7\u522b | \u66b4\u529b\u884c\u4e3a\uff1a115\u4e2a\u573a\u666f\uff0c2\u4e2a\u673a\u4f4d\uff0c\u5171230 \uff1b\u975e\u66b4\u529b\u884c\u4e3a\uff1a60\u4e2a\u573a\u666f\uff0c2\u4e2a\u673a\u4f4d\uff0c\u5171120 | \u51e0\u79d2\u949f |" + }, + { + "comment": "Code comments:\n- Hockey Fight Detection Dataset: URL, clipped videos, non-realistic scenarios, video level, 500 fight and 500 non-fight videos, 2s duration.\n- Video Fight Detection Dataset: URL, clipped videos, non-realistic scenarios, video level, 100 fights and 101 non-fights, 2s duration.\n- Real Life Violence Situations Dataset: URL, clipped videos, non-realistic scenarios, video level, 1000 fights and 1000 non-fights, a few seconds duration.\n- UBI Abnormal Event Detection Dataset: URL, unclipped videos, surveillance angle, frame level, 216 fights, 784 non-fights, 7,840 frames total, original video durations varying from a few seconds to a few minutes.\n- Extracting rawframes for faster training by running a script in PaddleVideo_root.\n- Split dataset into fight and non-fight videos stored in fight and nofight directories respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":55-74", + "content": "| Hockey Fight Detection Dataset | https://www.kaggle.com/datasets/yassershrief/hockey-fight-vidoes?resource=download | \u88c1\u526a\u89c6\u9891\uff0c\u975e\u771f\u5b9e\u573a\u666f | \u89c6\u9891\u7ea7\u522b | \u6253\u67b6\uff1a500\uff1b\u975e\u6253\u67b6\uff1a500 | 2s |\n| Video Fight Detection Dataset | https://www.kaggle.com/datasets/naveenk903/movies-fight-detection-dataset | \u88c1\u526a\u89c6\u9891\uff0c\u975e\u771f\u5b9e\u573a\u666f | 
\u89c6\u9891\u7ea7\u522b | \u6253\u67b6\uff1a100\uff1b\u975e\u6253\u67b6\uff1a101 | 2s |\n| Real Life Violence Situations Dataset | https://www.kaggle.com/datasets/mohamedmustafa/real-life-violence-situations-dataset | \u88c1\u526a\u89c6\u9891\uff0c\u975e\u771f\u5b9e\u573a\u666f | \u89c6\u9891\u7ea7\u522b | \u66b4\u529b\u884c\u4e3a\uff1a1000\uff1b\u975e\u66b4\u529b\u884c\u4e3a\uff1a1000 | \u51e0\u79d2\u949f |\n| UBI Abnormal Event Detection Dataset| http://socia-lab.di.ubi.pt/EventDetection/ | \u672a\u88c1\u526a\u89c6\u9891\uff0c\u76d1\u63a7\u89c6\u89d2 | \u5e27\u7ea7\u522b | \u6253\u67b6\uff1a216\uff1b\u975e\u6253\u67b6\uff1a784\uff1b\u88c1\u526a\u540e\u4e8c\u6b21\u6807\u6ce8\uff1a\u6253\u67b61976\uff0c\u975e\u6253\u67b61630 | \u539f\u89c6\u9891\u51e0\u79d2\u5230\u51e0\u5206\u949f\u4e0d\u7b49\uff0c\u88c1\u526a\u540e2s |\n\u6253\u67b6\uff08\u66b4\u529b\u884c\u4e3a\uff09\u89c6\u98913956\u4e2a\uff0c\u975e\u6253\u67b6\uff08\u975e\u66b4\u529b\u884c\u4e3a\uff09\u89c6\u98913501\u4e2a\uff0c\u51717457\u4e2a\u89c6\u9891\uff0c\u6bcf\u4e2a\u89c6\u9891\u51e0\u79d2\u949f\u3002\n\n### 2.2 \u89c6\u9891\u62bd\u5e27\n\u4e3a\u4e86\u52a0\u5feb\u8bad\u7ec3\u901f\u5ea6\uff0c\u5c06\u89c6\u9891\u8fdb\u884c\u62bd\u5e27\u3002\n```bash\ncd ${PaddleVideo_root}\npython data/ucf101/extract_rawframes.py dataset/ rawframes/ --level 2 --ext mp4\n```\n\u5176\u4e2d\uff0c\u89c6\u9891\u5b58\u653e\u5728`dataset`\u76ee\u5f55\u4e0b\uff0c\u6253\u67b6\uff08\u66b4\u529b\uff09\u89c6\u9891\u5b58\u653e\u5728`dataset/fight`\u4e2d\uff1b\u975e\u6253\u67b6\uff08\u975e\u66b4\u529b\uff09\u89c6\u9891\u5b58\u653e\u5728`dataset/nofight`\u4e2d\u3002`rawframes`\u76ee\u5f55\u5b58\u653e\u62bd\u53d6\u7684\u89c6\u9891\u5e27\u3002\n\n### 2.3 \u8bad\u7ec3\u96c6\u548c\u9a8c\u8bc1\u96c6\u5212\u5206" + }, + { + "comment": "The code reads data from three datasets: Surveillance Camera Fight Dataset, A Dataset for Automatic Violence Detection in Videos, and UBI Abnormal Event Detection Dataset. It also allows for splitting the data into training and testing sets with an 80:20 ratio. 
The 'get_list' function retrieves the list of files and counts them, while the 'fight_splits' function takes the video dictionary and train percent as inputs to split the data into training and testing sets.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":76-117", + "content": "\u672c\u9879\u76ee\u9a8c\u8bc1\u96c61500\u6761\uff0c\u6765\u81eaSurveillance Camera Fight Dataset\u3001A Dataset for Automatic Violence Detection in Videos\u3001UBI Abnormal Event Detection Dataset\u4e09\u4e2a\u6570\u636e\u96c6\u3002\n\u4e5f\u53ef\u6839\u636e\u4e0b\u9762\u7684\u4ee3\u7801\u5c06\u6570\u636e\u6309\u71670.8:0.2\u7684\u6bd4\u4f8b\u5212\u5206\u6210\u8bad\u7ec3\u96c6\u548c\u6d4b\u8bd5\u96c6\uff1a\n```python\nimport os\nimport glob\nimport random\nimport fnmatch\nimport re\nclass_id = {\n \"nofight\":0,\n \"fight\":1\n}\ndef get_list(path,key_func=lambda x: x[-11:], rgb_prefix='img_', level=1):\n if level == 1:\n frame_folders = glob.glob(os.path.join(path, '*'))\n elif level == 2:\n frame_folders = glob.glob(os.path.join(path, '*', '*'))\n else:\n raise ValueError('level can be only 1 or 2')\n def count_files(directory):\n lst = os.listdir(directory)\n cnt = len(fnmatch.filter(lst, rgb_prefix + '*'))\n return cnt\n # check RGB\n video_dict = {}\n for f in frame_folders:\n cnt = count_files(f)\n k = key_func(f)\n if level==2:\n k = k.split(\"/\")[0]\n video_dict[f]=str(cnt)+\" \"+str(class_id[k])\n return video_dict\ndef fight_splits(video_dict, train_percent=0.8):" + }, + { + "comment": "This code generates two lists, one for training and one for validation, based on a provided video dictionary. It then shuffles the list of videos and splits them into train and val lists. The code also defines a key function depending on the level parameter. Finally, it prints the lengths of both lists, writes them to separate files \"fight_train_list.txt\" and \"fight_val_list.txt\", and calls the fight_splits() function with the video dictionary and train percentage as parameters. 
These two files will contain the labels for training and validation sets, where fight (label 1) and non-fight (label 0) videos are listed separately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":118-159", + "content": " videos = list(video_dict.keys())\n train_num = int(len(videos)*train_percent)\n train_list = []\n val_list = []\n random.shuffle(videos)\n for i in range(train_num):\n train_list.append(videos[i]+\" \"+str(video_dict[videos[i]]))\n for i in range(train_num,len(videos)):\n val_list.append(videos[i]+\" \"+str(video_dict[videos[i]]))\n print(\"train:\",len(train_list),\",val:\",len(val_list))\n with open(\"fight_train_list.txt\",\"w\") as f:\n for item in train_list:\n f.write(item+\"\\n\")\n with open(\"fight_val_list.txt\",\"w\") as f:\n for item in val_list:\n f.write(item+\"\\n\")\nframe_dir = \"rawframes\"\nlevel = 2\ntrain_percent = 0.8\nif level == 2:\n def key_func(x):\n return '/'.join(x.split('/')[-2:])\nelse:\n def key_func(x):\n return x.split('/')[-1]\nvideo_dict = get_list(frame_dir, key_func=key_func, level=level) \nprint(\"number:\",len(video_dict))\nfight_splits(video_dict, train_percent)\n```\n\u6700\u7ec8\u751f\u6210fight_train_list.txt\u548cfight_val_list.txt\u4e24\u4e2a\u6587\u4ef6\u3002\u6253\u67b6\u7684\u6807\u7b7e\u4e3a1\uff0c\u975e\u6253\u67b6\u7684\u6807\u7b7e\u4e3a0\u3002" + }, + { + "comment": "The code defines a function `cut_video` which takes a video path, start and stop frame numbers, and a saved video path. It uses OpenCV to read the input video, determine its FPS, total frames, and size. The function then creates a new VideoWriter object with the specified output file name, fourcc codec, and same FPS as the input video. It writes only the frames between the start and stop frame numbers to the new video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":161-191", + "content": "\n### 2.4 \u89c6\u9891\u88c1\u526a\n\u5bf9\u4e8e\u672a\u88c1\u526a\u7684\u89c6\u9891\uff0c\u9700\u8981\u5148\u8fdb\u884c\u88c1\u526a\u624d\u80fd\u7528\u4e8e\u6a21\u578b\u8bad\u7ec3\uff0c\u8fd9\u4e2a\u7ed9\u51fa\u89c6\u9891\u88c1\u526a\u7684\u51fd\u6570`cut_video`\uff0c\u8f93\u5165\u4e3a\u89c6\u9891\u8def\u5f84\uff0c\u88c1\u526a\u7684\u8d77\u59cb\u5e27\u548c\u7ed3\u675f\u5e27\u4ee5\u53ca\u88c1\u526a\u540e\u7684\u89c6\u9891\u4fdd\u5b58\u8def\u5f84\u3002\n```python\nimport cv2\ndef cut_video(video_path, frameToStart, frametoStop, saved_video_path):\n cap = cv2.VideoCapture(video_path)\n FPS = cap.get(cv2.CAP_PROP_FPS)\n #print(\"FPS:\",FPS)\n TOTAL_FRAME = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) # \u83b7\u53d6\u89c6\u9891\u603b\u5e27\u6570\n #print(\"TOTAL_FRAME:\",TOTAL_FRAME)\n size = (cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT))\n #print(\"size:\",size)\n videoWriter =cv2.VideoWriter(saved_video_path,apiPreference = 0,fourcc = cv2.VideoWriter_fourcc(*'mp4v'),fps=FPS,\n frameSize=(int(size[0]),int(size[1])))\n COUNT = 0\n while True:\n success, frame = cap.read()\n if success:\n COUNT += 1\n if COUNT <= frametoStop and COUNT > frameToStart: # \u9009\u53d6\u8d77\u59cb\u5e27\n videoWriter.write(frame)\n else:\n print(\"cap.read failed!\")\n break\n if COUNT > frametoStop:" + }, + { + "comment": "This code represents the final part of the model training process. The first line is a break statement which implies the end of a loop or condition block. 
Following that, it releases the `cap` and `videoWriter` objects, suggesting they were used for capturing and writing video data respectively. The last line prints out the saved video path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":192-244", + "content": " break\n cap.release()\n videoWriter.release()\n print(saved_video_path)\n```\n\n## 3 \u6a21\u578b\u8bad\u7ec3\n\u4e0b\u8f7d\u9884\u8bad\u7ec3\u6a21\u578b\uff1a\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\n```\n\u6a21\u578b\u8bad\u7ec3\uff1a\n```bash\n# \u5355\u5361\u8bad\u7ec3\ncd ${PaddleVideo_root}\npython main.py --validate -c pptsm_fight_frames_dense.yaml\n```\n```bash\ncd ${PaddleVideo_root}\n# \u591a\u5361\u8bad\u7ec3\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython -B -m paddle.distributed.launch --gpus=\u201c0,1,2,3\u201d \\\n --log_dir=log_pptsm_dense main.py --validate \\\n -c pptsm_fight_frames_dense.yaml\n```\n\n## 4 \u6a21\u578b\u8bc4\u4f30\n\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u4e0b\u8f7d\uff1a[https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.pdparams)\n\u6a21\u578b\u8bc4\u4f30\uff1a\n```bash\ncd ${PaddleVideo_root}\npython main.py --test -c pptsm_fight_frames_dense.yaml \\\n -w ppTSM_fight_best.pdparams\n```\n\u5176\u4e2d`ppTSM_fight_best.pdparams`\u4e3a\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u3002\n\n## 5 \u6a21\u578b\u5bfc\u51fa\n\u5bfc\u51fainference\u6a21\u578b\uff1a\n```bash\ncd ${PaddleVideo_root}\npython tools/export_model.py -c pptsm_fight_frames_dense.yaml \\" + }, + { + "comment": "This code is loading a pre-trained model, \"ppTSM_fight_best.pdparams\", and saving the inference output to the \"inference/ppTSM\" directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FightRecognition/README.md\":245-247", + "content": " -p ppTSM_fight_best.pdparams \\\n -o inference/ppTSM\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/aa748ea4-6901-47a5-af9a-c291ee4adb6a.json b/docs/doc/aa748ea4-6901-47a5-af9a-c291ee4adb6a.json new file mode 100644 index 000000000..afd5a140f --- /dev/null +++ b/docs/doc/aa748ea4-6901-47a5-af9a-c291ee4adb6a.json @@ -0,0 +1,65 @@ +{ + "summary": "This code defines a PaddlePaddle transformer encoder layer for normalization and training, including residual connections, dropout, self-attention mechanism, and position-wise feed-forward networks. It creates a Transformer Encoder with Scaled Dot-Product Attention for NLP tasks.", + "details": [ + { + "comment": "This code defines a function called \"multi_head_attention\" which performs multi-head attention operations on queries, keys, and values. The function takes in additional parameters such as attn_bias, d_key, d_value, d_model. This is part of the Transformer encoder model implementation in PaddlePaddle framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":0-31", + "content": "# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Transformer encoder.\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom functools import partial\nimport paddle\nimport paddle.static as static\ndef multi_head_attention(queries,\n keys,\n values,\n attn_bias,\n d_key,\n d_value,\n d_model," + }, + { + "comment": "This code snippet defines a Multi-Head Attention layer. It takes in queries, keys (optional), and values (optional) as inputs, and performs linear projections on the queries before computing the attention weights. The function __compute_qkv also handles the case when keys or values are None by setting them to be equal to queries if needed. The inputs should all be 3-D tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":32-56", + "content": " n_head=1,\n dropout_rate=0.,\n cache=None,\n param_initializer=None,\n name='multi_head_att'):\n \"\"\"\n Multi-Head Attention. Note that attn_bias is added to the logit before\n computing softmax activiation to mask certain selected positions so that\n they will not considered in attention weights.\n \"\"\"\n keys = queries if keys is None else keys\n values = keys if values is None else values\n if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):\n raise ValueError(\n \"Inputs: quries, keys and values should all be 3-D tensors.\")\n def __compute_qkv(queries, keys, values, n_head, d_key, d_value):\n \"\"\"\n Add linear projection to queries, keys, and values.\n \"\"\"\n q = static.nn.fc(x=queries,\n size=d_key * n_head,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(" + }, + { + "comment": "This code defines a function for the Transformer Encoder layer. It includes functions for multi-head attention, position-wise feed-forward network layers, and splits heads of input tensors. Parameters such as d_key, n_head, and param_initializer are used to define the dimensions and initialization methods for weights. 
The code uses Paddle's static nn library and defines the names for different FC layers within the function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":57-79", + "content": " name=name + '_query_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_query_fc.b_0')\n k = static.nn.fc(x=keys,\n size=d_key * n_head,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_key_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_key_fc.b_0')\n v = static.nn.fc(x=values,\n size=d_value * n_head,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_value_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_value_fc.b_0')\n return q, k, v\n def __split_heads(x, n_head):\n \"\"\"\n Reshape the last dimension of inpunt tensor x so that it becomes two\n dimensions and then transpose. Specifically, input a tensor with shape" + }, + { + "comment": "This code is performing tensor reshaping and transposing operations to split the input tensor into multiple smaller tensors, representing different attention heads. The `__split_heads` function splits the tensor into a shape of [bs, n_head, max_sequence_length, hidden_dim], while the `__combine_heads` function reverses this process by transposing and reshaping the last two dimensions to combine the attention heads back into one dimension.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":80-103", + "content": " [bs, max_sequence_length, n_head * hidden_dim] then output a tensor\n with shape [bs, n_head, max_sequence_length, hidden_dim].\n \"\"\"\n hidden_size = x.shape[-1]\n # The value 0 in shape attr means copying the corresponding dimension\n # size of the input as the output dimension size.\n reshaped = paddle.reshape(\n x=x, shape=[0, 0, n_head, hidden_size // n_head])\n # permuate the dimensions into:\n # [batch_size, n_head, max_sequence_len, hidden_size_per_head]\n return paddle.transpose(x=reshaped, perm=[0, 2, 1, 3])\n def __combine_heads(x):\n \"\"\"\n Transpose and then reshape the last two dimensions of inpunt tensor x\n so that it becomes one dimension, which is reverse to __split_heads.\n \"\"\"\n if len(x.shape) == 3: return x\n if len(x.shape) != 4:\n raise ValueError(\"Input(x) should be a 4-D Tensor.\")\n trans_x = paddle.transpose(x, perm=[0, 2, 1, 3])\n # The value 0 in shape attr means copying the corresponding dimension" + }, + { + "comment": "This code defines a function that performs Scaled Dot-Product Attention. It first scales the query vector by dividing it with the square root of the key dimension, then takes the dot product between scaled query and key matrices after transposing the key matrix. If attention bias is provided, it adds it to the product. It applies softmax activation on the result to get weights, which are optionally dropout masked if a dropout rate is specified. Finally, it computes the output vector by taking the weighted sum of value vectors. 
This function is used in the context of Transformer Encoder layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":104-127", + "content": " # size of the input as the output dimension size.\n return paddle.reshape(\n x=trans_x,\n shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]])\n def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):\n \"\"\"\n Scaled Dot-Product Attention\n \"\"\"\n scaled_q = paddle.scale(x=q, scale=d_key**-0.5)\n product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)\n if attn_bias:\n # product += attn_bias\n product = paddle.add(x=product, y=attn_bias)\n weights = paddle.nn.functional.softmax(x=product)\n if dropout_rate:\n weights = paddle.nn.functional.dropout(weights, p=dropout_rate, mode=\"upscale_in_train\", training=True)\n out = paddle.matmul(x=weights, y=v)\n return out\n q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)\n if cache is not None: # use cache and concat time steps\n # Since the inplace reshape in __split_heads changes the shape of k and" + }, + { + "comment": "This code is reshaping the cache input for the next time step and splitting the inputs into multiple heads. It performs scaled dot product attention, combines the outputs of each head, and projects the result back to the model size using a fully connected layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":128-153", + "content": " # v, which is the cache input for next time step, reshape the cache\n # input from the previous time step first.\n k = cache[\"k\"] = paddle.concat(\n x=[paddle.reshape(\n x=cache[\"k\"], shape=[0, 0, d_model]), k], axis=1)\n v = cache[\"v\"] = paddle.concat(\n x=[paddle.reshape(\n x=cache[\"v\"], shape=[0, 0, d_model]), v], axis=1)\n q = __split_heads(q, n_head)\n k = __split_heads(k, n_head)\n v = __split_heads(v, n_head)\n ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,\n dropout_rate)\n out = __combine_heads(ctx_multiheads)\n # Project back to the model size.\n proj_out = static.nn.fc(x=out,\n size=d_model,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_output_fc.w_0',\n initializer=param_initializer),\n bias_attr=name + '_output_fc.b_0')" + }, + { + "comment": "This code defines the position-wise feed-forward network used in a transformer encoder. It consists of two linear transformations with a ReLU activation applied to each position separately and identically. 
The hidden layer is passed through a dropout if dropout_rate is specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":154-181", + "content": " return proj_out\ndef positionwise_feed_forward(x,\n d_inner_hid,\n d_hid,\n dropout_rate,\n hidden_act,\n param_initializer=None,\n name='ffn'):\n \"\"\"\n Position-wise Feed-Forward Networks.\n This module consists of two linear transformations with a ReLU activation\n in between, which is applied to each position separately and identically.\n \"\"\"\n hidden = static.nn.fc(x=x,\n size=d_inner_hid,\n num_flatten_dims=2,\n activation=hidden_act,\n weight_attr=paddle.ParamAttr(\n name=name + '_fc_0.w_0',\n initializer=param_initializer),\n bias_attr=name + '_fc_0.b_0')\n if dropout_rate:\n hidden = paddle.nn.functional.dropout(\n hidden,\n p=dropout_rate,\n mode=\"upscale_in_train\"," + }, + { + "comment": "This code defines a function for a transformer encoder layer in the PaddleVideo MultimodalVideoTag application. The layer includes a multi-head attention mechanism and a position-wise feed-forward network, with residual connections and layer normalization added before or after these operations, as specified by the process_cmd argument.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":182-207", + "content": " training=True)\n out = static.nn.fc(x=hidden,\n size=d_hid,\n num_flatten_dims=2,\n weight_attr=paddle.ParamAttr(\n name=name + '_fc_1.w_0', initializer=param_initializer),\n bias_attr=name + '_fc_1.b_0')\n return out\ndef pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,\n name=''):\n \"\"\"\n Add residual connection, layer normalization and droput to the out tensor\n optionally according to the value of process_cmd.\n This will be used before or after multi-head attention and position-wise\n feed-forward networks.\n \"\"\"\n for cmd in process_cmd:\n if cmd == \"a\": # add residual connection\n # out = out + prev_out if prev_out else out\n out = paddle.add(x=out, y=prev_out) if prev_out else out\n elif cmd == \"n\": # add layer normalization\n out_dtype = out.dtype\n if out_dtype == \"float16\":\n out = paddle.cast(x=out, dtype=\"float32\")" + }, + { + "comment": "This code is part of a transformer encoder layer implementation in PaddlePaddle. It applies layer normalization, optional float16 casting, and optionally dropout for training. The pre_process_layer and post_process_layer are partial functions used for data pre-processing and post-processing respectively. 
The encoder_layer function takes input, attention bias, and number of heads as inputs to create a transformer encoder layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":208-235", + "content": " out = static.nn.layer_norm(\n out,\n begin_norm_axis=len(out.shape) - 1,\n param_attr=paddle.ParamAttr(\n name=name + '_layer_norm_scale',\n initializer=paddle.nn.initializer.Constant(value=1.)),\n bias_attr=paddle.ParamAttr(\n name=name + '_layer_norm_bias',\n initializer=paddle.nn.initializer.Constant(value=0.)))\n if out_dtype == \"float16\":\n out = paddle.cast(x=out, dtype=\"float16\")\n elif cmd == \"d\": # add dropout\n if dropout_rate:\n out = paddle.nn.functional.dropout(\n out,\n p=dropout_rate,\n dropout_implementation=\"upscale_in_train\",\n training=True)\n return out\npre_process_layer = partial(pre_post_process_layer, None)\npost_process_layer = pre_post_process_layer\ndef encoder_layer(enc_input,\n attn_bias,\n n_head," + }, + { + "comment": "This code defines a transformer encoder layer that stacks multiple layers to form a deep encoder. The encoder consists of a multi-head self-attention mechanism followed by position-wise feed-forward networks, all with residual connections and layer normalization to add dropout.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":236-267", + "content": " d_key,\n d_value,\n d_model,\n d_inner_hid,\n prepostprocess_dropout,\n attention_dropout,\n relu_dropout,\n hidden_act,\n preprocess_cmd=\"n\",\n postprocess_cmd=\"da\",\n param_initializer=None,\n name=''):\n \"\"\"The encoder layers that can be stacked to form a deep encoder.\n This module consits of a multi-head (self) attention followed by\n position-wise feed-forward networks and both the two components companied\n with the post_process_layer to add residual connection, layer normalization\n and droput.\n \"\"\"\n attn_output = multi_head_attention(\n pre_process_layer(\n enc_input,\n preprocess_cmd,\n prepostprocess_dropout,\n name=name + '_pre_att'),\n None,\n None,\n attn_bias,\n d_key,\n d_value,\n d_model,\n n_head,\n attention_dropout," + }, + { + "comment": "This code defines a transformer encoder model. It utilizes an attention mechanism to process input sequences, followed by position-wise feed forward layers. 
The function takes input sequences, attention bias, number of layers, number of heads, and other parameters as inputs and returns the processed output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":268-307", + "content": " param_initializer=param_initializer,\n name=name + '_multi_head_att')\n attn_output = post_process_layer(\n enc_input,\n attn_output,\n postprocess_cmd,\n prepostprocess_dropout,\n name=name + '_post_att')\n ffd_output = positionwise_feed_forward(\n pre_process_layer(\n attn_output,\n preprocess_cmd,\n prepostprocess_dropout,\n name=name + '_pre_ffn'),\n d_inner_hid,\n d_model,\n relu_dropout,\n hidden_act,\n param_initializer=param_initializer,\n name=name + '_ffn')\n return post_process_layer(\n attn_output,\n ffd_output,\n postprocess_cmd,\n prepostprocess_dropout,\n name=name + '_post_ffn')\ndef encoder(enc_input,\n attn_bias,\n n_layer,\n n_head,\n d_key,\n d_value,\n d_model,\n d_inner_hid,\n prepostprocess_dropout,\n attention_dropout,\n relu_dropout,\n hidden_act," + }, + { + "comment": "This code defines a function to create an encoder consisting of multiple layers, where each layer is generated by calling the \"encoder_layer\" function. The encoder takes in input, attention bias, number of heads, dimensionality of keys and values, model dimensions, inner hidden dimensions, and dropout rates for preprocessing and postprocessing. The function applies each layer to the input sequentially, updating the input with each iteration. Finally, it applies a pre-processing layer to the output using specified preprocessing command and prepostprocess_dropout.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py\":308-337", + "content": " preprocess_cmd=\"n\",\n postprocess_cmd=\"da\",\n param_initializer=None,\n name=''):\n \"\"\"\n The encoder is composed of a stack of identical layers returned by calling\n encoder_layer.\n \"\"\"\n for i in range(n_layer):\n enc_output = encoder_layer(\n enc_input,\n attn_bias,\n n_head,\n d_key,\n d_value,\n d_model,\n d_inner_hid,\n prepostprocess_dropout,\n attention_dropout,\n relu_dropout,\n hidden_act,\n preprocess_cmd,\n postprocess_cmd,\n param_initializer=param_initializer,\n name=name + '_layer_' + str(i))\n enc_input = enc_output\n enc_output = pre_process_layer(\n enc_output, preprocess_cmd, prepostprocess_dropout, name=\"post_encoder\")\n return enc_output" + } + ] +} \ No newline at end of file diff --git a/docs/doc/aac9c369-be5b-4511-b19d-174876a181d3.json b/docs/doc/aac9c369-be5b-4511-b19d-174876a181d3.json new file mode 100644 index 000000000..3d7ac7550 --- /dev/null +++ b/docs/doc/aac9c369-be5b-4511-b19d-174876a181d3.json @@ -0,0 +1,15 @@ +{ + "summary": "The code constructs a learning rate scheduler for PaddleVideo's VideoQualityAssessment module, using the PiecewiseDecay method and handling learning rate configurations. It creates an LR scheduler instance based on name and updates num_iters if iter_step is present.", + "details": [ + { + "comment": "This code builds a learning rate scheduler according to the \"OPTIMIZER\" configuration. It uses the PiecewiseDecay method with specified boundaries and values. 
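For reference, the schedule named in that configuration maps onto Paddle's built-in `PiecewiseDecay`; PaddleVideo itself routes construction through its `custom_lr` module, so the stand-alone sketch below is only an approximation of what `build_lr` produces. The boundaries and values are taken from the example configuration quoted in the docstring.

```python
import paddle

sched = paddle.optimizer.lr.PiecewiseDecay(boundaries=[20, 60],
                                           values=[0.00025, 0.000025, 0.0000025])
opt = paddle.optimizer.Momentum(learning_rate=sched,
                                parameters=paddle.nn.Linear(8, 8).parameters())

for epoch in range(80):
    # ... one epoch of training would run here ...
    sched.step()              # stepped per epoch for an epoch-wise schedule
print(sched.get_lr())         # 2.5e-06 once both boundaries have been passed
```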
The learning rate scheduler is always passed into the optimizer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/lr.py\":0-32", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport copy\nimport paddle\nfrom . import custom_lr\ndef build_lr(cfg, num_iters):\n \"\"\"\n Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer.\n In configuration:\n learning_rate:\n name: 'PiecewiseDecay'\n boundaries: [20, 60]\n values: [0.00025, 0.000025, 0.0000025]\n Returns:\n A paddle.optimizer.lr instance.\n \"\"\"" + }, + { + "comment": "This code handles learning rate configuration in PaddleVideo's VideoQualityAssessment module. It checks if the learning rate is a dictionary and modifies it accordingly, then creates an instance of the appropriate LR scheduler based on the specified name. If an iter_step is present, it updates num_iters before removing it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/lr.py\":34-48", + "content": " cfg_copy = cfg.copy()\n #when learning_rate is LRScheduler\n if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'],\n dict):\n cfg_copy['learning_rate'] = build_lr(\n cfg_copy['learning_rate'],\n num_iters) #not support only inner iter_step\n lr_name = cfg_copy.pop('name')\n if cfg_copy.get('iter_step'):\n cfg_copy['num_iters'] = num_iters\n cfg_copy.pop('iter_step')\n return getattr(custom_lr, lr_name)(**cfg_copy)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ab0e18d8-71f3-4fda-9fe4-197ba22746fa.json b/docs/doc/ab0e18d8-71f3-4fda-9fe4-197ba22746fa.json new file mode 100644 index 000000000..2366d9947 --- /dev/null +++ b/docs/doc/ab0e18d8-71f3-4fda-9fe4-197ba22746fa.json @@ -0,0 +1,25 @@ +{ + "summary": "The BMN model, using three modules and the ActivityNet dataset, is trained and inferred for temporal action proposal generation with given commands. The export_model script and predict script are utilized to perform inference, providing logs as examples.", + "details": [ + { + "comment": "The code describes the BMN model, which consists of three modules: Base Module, Temporal Evaluation Module, and Proposal Evaluation Module. 
It uses the ActivityNet dataset for training and provides instructions on how to start the training process using a command.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/bmn.md\":0-34", + "content": "[\u7b80\u4f53\u4e2d\u6587 ](../../../zh-CN/model_zoo/localization/bmn.md) | English\n# BMN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nBMN model contains three modules: Base Module handles the input feature sequence, and out- puts feature sequence shared by the following two modules; Temporal Evaluation Module evaluates starting and ending probabilities of each location in video to generate boundary probability sequences; Proposal Evaluation Module con- tains the BM layer to transfer feature sequence to BM fea- ture map, and contains a series of 3D and 2D convolutional layers to generate BM confidence map.\n

\n[Figure: BMN Overview]\n
\n## Data\nWe use ActivityNet dataset to train this model\uff0cdata preparation please refer to [ActivityNet dataset](../../dataset/ActivityNet.md).\n## Train\nYou can start training by such command\uff1a\n```bash" + }, + { + "comment": "This code is launching a PaddlePaddle distributed localization model named BMN using 4 GPUs and running it on the provided configuration file. It also provides instructions for testing, specifying the required label file and model path, as well as inference commands to export the architecture and parameters files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/bmn.md\":35-67", + "content": "export CUDA_VISIBLE_DEVICES=0,1,2,3\npython -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml\n```\n## Test\nYou can start testing by such command\uff1a\n```bash\npython main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00009.pdparams -o DATASET.test_batch_size=1\n```\n- For now, we only support testing with **single card** and `batch_size=1`.\n- Please download [activity\\_net\\_1\\_3\\_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json) label file and specify the path to `METRIC.ground_truth_filename` in config file.\n- Args `-w` is used to specifiy the model path\uff0cyou can download our model in [BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams)\nTest accuracy in ActivityNet1.3:\n| AR@1 | AR@5 | AR@10 | AR@100 | AUC |\n| :---: | :---: | :---: | :---: | :---: |\n| 33.26 | 49.48 | 56.86 | 75.19 | 67.23% |\n## Inference\n### export inference model\n To get model architecture file `BMN.pdmodel` and parameters file `BMN.pdiparams`, use: " + }, + { + "comment": "The code exports the BMN model and runs inference on a set of feature files, producing output segments and scores. The export_model script requires the configuration file, the PDParams file, and outputs an inference folder. The predict script uses the configuration file, two model files, and a list of input feature files to perform inference. It prints the score and segment for each input, with example logs provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/bmn.md\":69-95", + "content": "```bash\npython3.7 tools/export_model.py -c configs/localization/bmn.yaml \\\n -p data/BMN.pdparams \\\n -o inference/BMN\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_feat.list \\\n --config configs/localization/bmn.yaml \\\n --model_file inference/BMN/BMN.pdmodel \\\n --params_file inference/BMN/BMN.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nBMN Inference results of data/example_feat.npy :\n{'score': 0.7968077063560486, 'segment': [0.0, 122.9877]}\n{'score': 0.49097609519958496, 'segment': [12.423000000000002, 124.23]}\n{'score': 0.21395835280418396, 'segment': [39.7536, 122.9877]}\n{'score': 0.2106524258852005, 'segment': [0.0, 109.3224]}" + }, + { + "comment": "The code snippet represents the inference results of BMN (Boundary-Matching Network) for temporal action proposal generation. These results, containing a score and segment information, are saved in the specified directory. 
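Because the predictor prints plain Python dictionaries like the ones above, only a few lines are needed to filter and rank proposals once collected. The snippet below reuses scores and segments from the example log; the 0.3 threshold is arbitrary and purely illustrative.

```python
proposals = [
    {'score': 0.7968077063560486, 'segment': [0.0, 122.9877]},
    {'score': 0.49097609519958496, 'segment': [12.423000000000002, 124.23]},
    {'score': 0.21395835280418396, 'segment': [39.7536, 122.9877]},
]

# Keep confident proposals, best first.
kept = sorted((p for p in proposals if p['score'] >= 0.3),
              key=lambda p: p['score'], reverse=True)
for p in kept:
    print(f"segment {p['segment']} score {p['score']:.3f}")
```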
The BMN paper reference is provided for further information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/bmn.md\":96-103", + "content": "{'score': 0.06876271963119507, 'segment': [23.6037, 114.2916]}\n```\nInference results are saved in `data/bmn/BMN_INFERENCE_results`. \n## Reference\n- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen." + } + ] +} \ No newline at end of file diff --git a/docs/doc/ab64bc73-0873-454d-b99f-286199ecee89.json b/docs/doc/ab64bc73-0873-454d-b99f-286199ecee89.json new file mode 100644 index 000000000..0a1c5776c --- /dev/null +++ b/docs/doc/ab64bc73-0873-454d-b99f-286199ecee89.json @@ -0,0 +1,30 @@ +{ + "summary": "The code prepares the NTU RGB+D dataset for CTR-GCN through data organization and cleaning, involving obtaining, denoising, and transforming skeleton data using three scripts. The dataset consists of 60 action classes with two splits: Cross-subject and Cross-view.", + "details": [ + { + "comment": "NTU-RGB+D dataset contains 60 action classes and 56,880 video samples for skeleton-based action recognition. It has two splits: Cross-subject and Cross-view. ST-GCN data preparation process introduced in the following sections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ntu-rgbd.md\":0-22", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/ntu-rgbd.md) | English\n# NTU-RGB+D Preparation\n- [Introduction](#Introduction)\n- [ST-GCN Data Prepare](#ST-GCN_Data_Prepare)\n- [CTR-GTCN Data Prepare](#CTR-GCN_Data_Prepare)\n---\n## Introduction\nNTU-RGB+D contains 60 action classes and 56,880 video samples for skeleton-based action recognition. Please refer to its official website[NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) for more details.\nThe dataset contains two splits when dividing the training set and test set. For Cross-subject, the dataset is divided according to character id, with 40320 samples in training set and 16560 samples in test set. For Cross-view, the dataset is divided according to camera division. The samples collected by cameras 2 and 3 are training sets, including 37930 samples, and the samples collected by camera 1 are test sets, including 18960 samples.\n## ST-GCN_Data_Prepare\nST-GCN data prepare preceduce are introducted follow.\n### Download\nWe provide the download link of the p" + }, + { + "comment": "This code describes a processed dataset called NTU-RGB-D, which is approximately 3.1GB in size and requires downloading and unzipping using the command \"tar -zxvf NTU-RGB-D.tar\". The resulting directory structure contains train and val data for both xsub and xview. The code also provides a script called download_dataset.sh to facilitate downloading the dataset from the official website, and shows the file tree structure after successful download.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ntu-rgbd.md\":22-58", + "content": "rocessed dataset [NTU-RGB-D.tar](https://videotag.bj.bcebos.com/Data/NTU-RGB-D.tar)(~3.1G). 
Please download and unzip with ```tar -zxvf NTU-RGB-D.tar ``` , the directory structure is as follows\uff1a\n```txt\n\u2500\u2500\u2500 NTU-RGB-D\n \u251c\u2500\u2500 xsub\n \u2502 \u251c\u2500\u2500 train_data.npy\n \u2502 \u251c\u2500\u2500 train_label.pkl\n \u2502 \u251c\u2500\u2500 val_data.npy\n \u2502 \u2514\u2500\u2500 val_label.pkl\n \u2514\u2500\u2500 xview\n \u251c\u2500\u2500 train_data.npy\n \u251c\u2500\u2500 train_label.pkl\n \u251c\u2500\u2500 val_data.npy\n \u2514\u2500\u2500 val_label.pkl\n```\n> This is a copies from [st-gcn](https://github.com/open-mmlab/mmskeleton/blob/master/doc/SKELETON_DATA.md).\n## CTR-GCN_Data_Prepare\nCTR-GCN data prepare preceduce are introducted follow.\n### Download\nThere is script `download_dataset.sh` to download the dataset from official website [NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) in dictory `data\\ntu-rgb-d`.\n```bash\nsh data/ntu-rgb-d/download_dataset.sh\n```\nFile tree:\n```txt\n\u2500\u2500\u2500 ntu-rgb-d\n \u251c\u2500\u2500 download_dataset.sh\n \u251c\u2500\u2500 nturgb+d_skeletons\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A001.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A002.skeleton" + }, + { + "comment": "The provided code describes the preparation steps for processing the NTU-RGBD dataset to be used by CTR-GCN. It involves running three separate Python scripts in order:\n1. `get_raw_skes_data.py` is responsible for obtaining the skeleton of each performer from the data folders.\n2. `get_raw_denoised_data.py` removes any bad or corrupted skeletons from the dataset.\n3. `seq_transformation.py` transforms the remaining skeletons to the center of the first frame.\nTo follow these steps, navigate to the NTU-RGBD dataset folder and run each script sequentially in your command line interface.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ntu-rgbd.md\":59-92", + "content": " \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A003.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A004.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A005.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A006.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A007.skeleton\n \u2502 \u251c\u2500\u2500 ....\n \u2502\u00a0\u00a0 \u2514\u2500\u2500 S017C003P020R002A060.skeleton\n \u251c\u2500\u2500 get_raw_denoised_data.py\n \u251c\u2500\u2500 get_raw_skes_data.py\n \u251c\u2500\u2500 seq_transformation.py\n \u2514\u2500\u2500 statistics\n \u00a0\u00a0 \u251c\u2500\u2500 camera.txt\n \u00a0\u00a0 \u251c\u2500\u2500 label.txt\n \u00a0\u00a0 \u251c\u2500\u2500 performer.txt\n \u00a0\u00a0 \u251c\u2500\u2500 replication.txt\n \u00a0\u00a0 \u251c\u2500\u2500 setup.txt\n \u00a0\u00a0 \u2514\u2500\u2500 skes_available_name.txt\n```\n### Prepare\nrun follow script, then data will be precessed to the data format need by CTR-GCN.\n> Note\uff1aif make dataset by yourself, please prepare `data/ntu-rgb-d/statistics/skes_available_name.txt`, which is the list of skeletons files that will be precessed.\n```bash\ncd ./data/ntu-rgb-d\n# Get skeleton of each performer\npython get_raw_skes_data.py\n# Remove the bad skeleton\npython get_raw_denoised_data.py\n# Transform the skeleton to the center of the first frame\npython seq_transformation.py" + }, + { + "comment": "The code represents a dataset called \"ntu-rgb-d\" containing skeleton data and associated files for denoising, logging missing skeletons, and 
tracking frames. The dataset is organized into folders including 'nturgb+d_skeletons' containing skeleton files per actor and 'denoised_data' with various log and pickle files related to the denoising process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ntu-rgbd.md\":93-128", + "content": "```\nFile tree:\n```txt\n\u2500\u2500\u2500 ntu-rgb-d\n \u251c\u2500\u2500 download_dataset.sh\n \u251c\u2500\u2500 nturgb+d_skeletons\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A001.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A002.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A003.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A004.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A005.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A006.skeleton\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A007.skeleton\n \u2502 \u251c\u2500\u2500 ....\n \u2502\u00a0\u00a0 \u2514\u2500\u2500 S017C003P020R002A060.skeleton\n \u251c\u2500\u2500 denoised_data\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 actors_info\n \u2502\u00a0\u00a0 \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A024.txt\n \u2502\u00a0\u00a0 \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A025.txt\n \u2502\u00a0\u00a0 \u2502\u00a0\u00a0 \u251c\u2500\u2500 S001C001P001R001A026.txt\n \u2502\u00a0\u00a0 \u2502\u00a0\u00a0 \u251c\u2500\u2500 ....\n \u2502\u00a0\u00a0 \u2502\u00a0\u00a0 \u251c\u2500\u2500 S017C003P020R002A059.txt\n \u2502\u00a0\u00a0 \u2502\u00a0\u00a0 \u2514\u2500\u2500 S017C003P020R002A060.txt\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 denoised_failed_1.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 denoised_failed_2.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 frames_cnt.txt\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 missing_skes_1.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 missing_skes_2.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 missing_skes.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 noise_length.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 noise_motion.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 noise_spread.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 raw_denoised_colors.pkl\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 raw_denoised_joints.pkl" + }, + { + "comment": "This code appears to organize various data files related to a dataset, likely for the NTU RGB+D action recognition benchmark. The directory structure includes raw data, denoised data, and preprocessed data in separate folders (xview and xsub). There are also statistics files and Python scripts for getting raw and denoised data. 
The notes suggest that some of the temporal files can be deleted if the extracted xview and xsub files are available.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ntu-rgbd.md\":129-157", + "content": " \u2502\u00a0\u00a0 \u2514\u2500\u2500 rgb+ske\n \u251c\u2500\u2500 raw_data\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 frames_cnt.txt\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 frames_drop.log\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 frames_drop_skes.pkl\n \u2502\u00a0\u00a0 \u2514\u2500\u2500 raw_skes_data.pkl\n \u251c\u2500\u2500 get_raw_denoised_data.py\n \u251c\u2500\u2500 get_raw_skes_data.py\n \u251c\u2500\u2500 seq_transformation.py\n \u251c\u2500\u2500 statistics\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 camera.txt\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 label.txt\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 performer.txt\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 replication.txt\n \u2502\u00a0\u00a0 \u251c\u2500\u2500 setup.txt\n \u2502\u00a0\u00a0 \u2514\u2500\u2500 skes_available_name.txt\n \u251c\u2500\u2500 xview\n \u2502 \u251c\u2500\u2500 train_data.npy\n \u2502 \u251c\u2500\u2500 train_label.pkl\n \u2502 \u251c\u2500\u2500 val_data.npy\n \u2502 \u2514\u2500\u2500 val_label.pkl\n \u2514\u2500\u2500 xsub\n \u251c\u2500\u2500 train_data.npy\n \u251c\u2500\u2500 train_label.pkl\n \u251c\u2500\u2500 val_data.npy\n \u2514\u2500\u2500 val_label.pkl\n```\n> Note\uff1adictory `denoised_data`\u3001`raw_data`and`nturgb+d_skeletons`, that are temporal files, can be deleted, if extracted `xview` and `xsub`." + } + ] +} \ No newline at end of file diff --git a/docs/doc/ac359f3c-86a3-4578-8965-6ef841b5b557.json b/docs/doc/ac359f3c-86a3-4578-8965-6ef841b5b557.json new file mode 100644 index 000000000..490779d9b --- /dev/null +++ b/docs/doc/ac359f3c-86a3-4578-8965-6ef841b5b557.json @@ -0,0 +1,85 @@ +{ + "summary": "The code establishes paths, defines functions for AVA model in PaddleVideo with OpenCV, creates a video analysis model, extracts frames, predicts label scores, detects humans, performs inference, and identifies spatio-temporal actions.", + "details": [ + { + "comment": "This code is a Python script for the AVA (Action Unit Detection) model in PaddleVideo. It imports necessary libraries, checks for missing dependencies, and sets up paths for model building.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport paddle\nimport os, sys\nimport copy as cp\nimport cv2\nimport math\ntry:\n import ppdet\nexcept ImportError as e:\n print(\n f\"Warning! 
{e}, [paddledet] package and it's dependencies is required for AVA.\"\n )\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom paddlevideo.modeling.builder import build_model\nfrom paddlevideo.utils import get_config" + }, + { + "comment": "This code snippet is a part of the PaddleVideo library. It defines several color schemes and abbreviation functions related to video analysis tasks. The color schemes are used for annotations, while the abbreviation function is for simplifying label names in the AVA dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":32-67", + "content": "from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline\nfrom paddlevideo.metrics.ava_utils import read_labelmap\nimport time\nfrom os import path as osp\nimport numpy as np\nfrom paddlevideo.utils import get_config\nimport pickle\nfrom paddlevideo.utils import (get_logger, load, mkdir, save)\nimport shutil\nFONTFACE = cv2.FONT_HERSHEY_DUPLEX\nFONTSCALE = 0.5\nFONTCOLOR = (255, 255, 255) # BGR, white\nMSGCOLOR = (128, 128, 128) # BGR, gray\nTHICKNESS = 1\nLINETYPE = 1\ndef hex2color(h):\n \"\"\"Convert the 6-digit hex string to tuple of 3 int value (RGB)\"\"\"\n return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))\nplate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'\nplate_blue = plate_blue.split('-')\nplate_blue = [hex2color(h) for h in plate_blue]\nplate_green = '004b23-006400-007200-008000-38b000-70e000'\nplate_green = plate_green.split('-')\nplate_green = [hex2color(h) for h in plate_green]\ndef abbrev(name):\n \"\"\"Get the abbreviation of label name:\n 'take (an object) from (a person)' -> 'take ... from ...'" + }, + { + "comment": "This function visualizes frames with predicted annotations, requiring the number of frames and annotations to be multiples. It asserts that the max_num is less than or equal to the length of the plate used for visualization and ensures that frames are a deep copy before processing. The assertions check if the number of frames is divisible by the number of annotations, and calculates the number of frames per annotation. The function also initializes the annotation variable and stores the image height and width for later use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":68-97", + "content": " \"\"\"\n while name.find('(') != -1:\n st, ed = name.find('('), name.find(')')\n name = name[:st] + '...' + name[ed + 1:]\n return name\n# annotations is pred results\ndef visualize(frames, annotations, plate=plate_blue, max_num=5):\n \"\"\"Visualize frames with predicted annotations.\n Args:\n frames (list[np.ndarray]): Frames for visualization, note that\n len(frames) % len(annotations) should be 0.\n annotations (list[list[tuple]]): The predicted results.\n plate (str): The plate used for visualization. 
Default: plate_blue.\n max_num (int): Max number of labels to visualize for a person box.\n Default: 5\uff0c\u76ee\u524d\u4e0d\u80fd\u5927\u4e8e5.\n Returns:\n list[np.ndarray]: Visualized frames.\n \"\"\"\n assert max_num + 1 <= len(plate)\n plate = [x[::-1] for x in plate]\n frames_ = cp.deepcopy(frames)\n nf, na = len(frames), len(annotations)\n assert nf % na == 0\n nfpa = len(frames) // len(annotations)\n anno = None\n h, w, _ = frames[0].shape\n # proposals\u88ab\u5f52\u4e00\u5316\u9700\u8981\u8fd8\u539f\u771f\u5b9e\u5750\u6807\u503c" + }, + { + "comment": "This code is iterating through annotations and frames, scaling box coordinates based on image size, drawing rectangles around objects in frames using OpenCV, and displaying labels above the rectangles with their corresponding scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":98-124", + "content": " scale_ratio = np.array([w, h, w, h])\n for i in range(na):\n anno = annotations[i]\n if anno is None:\n continue\n for j in range(nfpa):\n ind = i * nfpa + j\n frame = frames_[ind]\n for ann in anno:\n box = ann[0]\n label = ann[1]\n if not len(label):\n continue\n score = ann[2]\n box = (box * scale_ratio).astype(np.int64)\n st, ed = tuple(box[:2]), tuple(box[2:])\n cv2.rectangle(frame, st, ed, plate[0], 2)\n for k, lb in enumerate(label):\n if k >= max_num:\n break\n text = abbrev(lb)\n text = ': '.join([text, str(score[k])])\n location = (0 + st[0], 18 + k * 18 + st[1])\n textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,\n THICKNESS)[0]\n textwidth = textsize[0]" + }, + { + "comment": "This code is part of the \"ava_predict.py\" file in the PaddleVideo library. It defines a function called \"frame_extraction\" that takes a video path and target directory as arguments. The function extracts frames from the given video_path and saves them to the specified target directory. It reads each frame of the video, appends it to the \"frames\" list, writes it to disk using cv2.imwrite, and increments the index for frame naming. The target directory is created if it doesn't exist already. This function handles videos with a maximum length of several hours, as indicated by the FPS (Frames Per Second) value obtained from the video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":125-159", + "content": " diag0 = (location[0] + textwidth, location[1] - 14)\n diag1 = (location[0], location[1] + 2)\n cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)\n cv2.putText(frame, text, location, FONTFACE, FONTSCALE,\n FONTCOLOR, THICKNESS, LINETYPE)\n return frames_\ndef frame_extraction(video_path, target_dir):\n \"\"\"Extract frames given video_path.\n Args:\n video_path (str): The video_path.\n \"\"\"\n if not os.path.exists(target_dir):\n os.makedirs(target_dir, exist_ok=True)\n # Should be able to handle videos up to several hours\n frame_tmpl = osp.join(target_dir, '{:05d}.jpg')\n vid = cv2.VideoCapture(video_path)\n FPS = int(vid.get(5))\n frames = []\n frame_paths = []\n flag, frame = vid.read()\n index = 1\n while flag:\n frames.append(frame)\n frame_path = frame_tmpl.format(index)\n frame_paths.append(frame_path)\n cv2.imwrite(frame_path, frame)\n index += 1" + }, + { + "comment": "This code is for running PaddleVideo inference model. It takes a video file or URL, config file path, and overrides options as input parameters. The model can be finetuned or tested using specified weights. 
The detection model name is also an optional parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":160-190", + "content": " flag, frame = vid.read()\n return frame_paths, frames, FPS\ndef parse_args():\n def str2bool(v):\n return v.lower() in (\"true\", \"t\", \"1\")\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Inference model script\")\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml',\n help='config file path')\n parser.add_argument('--video_path', help='video file/url')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('-w',\n '--weights',\n type=str,\n help='weights for finetuning or testing')\n #detection_model_name\n parser.add_argument('--detection_model_name',\n help='the name of detection model ')" + }, + { + "comment": "This code is parsing arguments for the ava_predict function, including detection model weights path, output filename, predict step size, output step size, and output FPS. The pack_result function combines human detection results with a given result, sorting them by probability size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":191-221", + "content": " # detection_model_weights\n parser.add_argument('--detection_model_weights',\n help='the weights path of detection model ')\n # params for predict\n parser.add_argument('--out-filename',\n default='ava_det_demo.mp4',\n help='output filename')\n parser.add_argument('--predict-stepsize',\n default=8,\n type=int,\n help='give out a prediction per n frames')\n parser.add_argument(\n '--output-stepsize',\n default=4,\n type=int,\n help=('show one frame per n frames in the demo, we should have: '\n 'predict_stepsize % output_stepsize == 0'))\n parser.add_argument('--output-fps',\n default=6,\n type=int,\n help='the fps of demo video output')\n return parser.parse_args()\n# \u4e00\u5e27\u7684\u7ed3\u679c\u3002\u6839\u636e\u6982\u7387\u5927\u5c0f\u8fdb\u884c\u6392\u5e8f\ndef pack_result(human_detection, result):\n \"\"\"Short summary.\n Args:\n human_detection (np.ndarray): Human detection result." + }, + { + "comment": "This function takes the predicted label of each human proposal and returns a tuple containing the human proposal, label name, and label score. 
It also constructs data processing results for frame directory, timestamp, clip length, frame interval, and frames per second.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":222-263", + "content": " result (type): The predicted label of each human proposal.\n Returns:\n tuple: Tuple of human proposal, label name and label score.\n \"\"\"\n results = []\n if result is None:\n return None\n for prop, res in zip(human_detection, result):\n res.sort(key=lambda x: -x[1])\n results.append((prop, [x[0] for x in res], [x[1] for x in res]))\n return results\n# \u6784\u9020\u6570\u636e\u5904\u7406\u9700\u8981\u7684results\ndef get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS):\n result = {}\n result[\"frame_dir\"] = frame_dir\n frame_num = len(os.listdir(frame_dir))\n dir_name = frame_dir.split(\"/\")[-1]\n result[\"video_id\"] = dir_name\n result['timestamp'] = timestamp\n timestamp_str = '{:04d}'.format(timestamp)\n img_key = dir_name + \",\" + timestamp_str\n result['img_key'] = img_key\n result['shot_info'] = (1, frame_num)\n result['fps'] = FPS\n result['suffix'] = '{:05}.jpg'\n result['timestamp_start'] = 1\n result['timestamp_end'] = int(frame_num / result['fps'])\n return result" + }, + { + "comment": "This function performs human detection on a list of frame paths using a specified model and weight file. It uses the trainer object to predict human boxes in each frame, saving the results as text files in the specified output directory. The function then returns a list of paths for these detection results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":266-293", + "content": "def detection_inference(frame_paths, output_dir, model_name, weights_path):\n \"\"\"Detect human boxes given frame paths.\n Args:\n frame_paths (list[str]): The paths of frames to do detection inference.\n Returns:\n list[np.ndarray]: The human detection results.\n \"\"\"\n detection_cfg = ppdet.model_zoo.get_config_file(model_name)\n detection_cfg = ppdet.core.workspace.load_config(detection_cfg)\n detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test')\n detection_trainer.load_weights(weights_path)\n print('Performing Human Detection for each frame')\n detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True)\n print(\"finish object detection\")\n results = []\n for frame_path in frame_paths:\n (file_dir, file_name) = os.path.split(frame_path)\n (file_path, ext) = os.path.splitext(frame_path)\n txt_file_name = file_name.replace(ext, \".txt\")\n txt_path = os.path.join(output_dir, txt_file_name)\n results.append(txt_path)" + }, + { + "comment": "This function reads a detection result file and returns the bounding box proposals (proposals) and corresponding scores for people in the image. It takes the path to the txt file, image height, and image width as input parameters. The function first splits the lines of the file and then checks each line to see if it corresponds to a person detection result. If so, it extracts the score and bounding box coordinates (x1, y1, x2, y2) for that object and adds them to separate lists, scores and proposals. 
Finally, it returns numpy arrays of the extracted proposals and scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":295-333", + "content": " return results\ndef get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr):\n \"\"\"\n \u6839\u636e\u68c0\u6d4b\u7ed3\u679c\u6587\u4ef6\u5f97\u5230\u56fe\u50cf\u4e2d\u4eba\u7684\u68c0\u6d4b\u6846(proposals)\u548c\u7f6e\u4fe1\u5ea6\uff08scores\uff09\n txt_file_path:\u68c0\u6d4b\u7ed3\u679c\u5b58\u653e\u8def\u5f84\n img_h:\u56fe\u50cf\u9ad8\u5ea6\n img_w:\u56fe\u50cf\u5bbd\u5ea6\n \"\"\"\n proposals = []\n scores = []\n with open(txt_file_path, 'r') as detection_file:\n lines = detection_file.readlines()\n for line in lines: # person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375\n items = line.split(\" \")\n if items[0] != 'person': #\u53ea\u8981\u4eba\n continue\n score = items[1]\n if (float)(score) < person_det_score_thr:\n continue\n x1 = (float(items[2])) / img_w\n y1 = ((float)(items[3])) / img_h\n box_w = ((float)(items[4]))\n box_h = ((float)(items[5]))\n x2 = (float(items[2]) + box_w) / img_w\n y2 = (float(items[3]) + box_h) / img_h\n scores.append(score)\n proposals.append([x1, y1, x2, y2])\n return np.array(proposals), np.array(scores)" + }, + { + "comment": "This code function is extracting frames from a video, parsing config files, and setting up processing pipelines for testing. The frame extraction process involves specifying the input video path and output directory for storing frames. It calculates the number of frames in the video and ensures it's not zero. It asserts that clip_len and frame_interval are even numbers to create equal-sized clips. Finally, it calculates the window size based on these parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":336-364", + "content": "@paddle.no_grad()\ndef main(args):\n config = get_config(args.config, show=False) #parse config file\n # extract frames from video\n video_path = args.video_path\n frame_dir = 'tmp_frames'\n frame_paths, frames, FPS = frame_extraction(video_path, frame_dir)\n num_frame = len(frame_paths) #\u89c6\u9891\u79d2\u6570*FPS\n assert num_frame != 0\n print(\"Frame Number\uff1a\", num_frame)\n # \u5e27\u56fe\u50cf\u9ad8\u5ea6\u548c\u5bbd\u5ea6\n h, w, _ = frames[0].shape\n # Get clip_len, frame_interval and calculate center index of each clip\n data_process_pipeline = build_pipeline(config.PIPELINE.test) #\u6d4b\u8bd5\u65f6\u8f93\u51fa\u5904\u7406\u6d41\u6c34\u914d\u7f6e\n clip_len = config.PIPELINE.test.sample['clip_len']\n assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = config.PIPELINE.test.sample['frame_interval']\n # \u6b64\u5904\u5173\u952e\u5e27\u6bcf\u79d2\u53d6\u4e00\u4e2a\n clip_len = config.PIPELINE.test.sample['clip_len']\n assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = config.PIPELINE.test.sample['frame_interval']\n window_size = clip_len * frame_interval\n timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2)," + }, + { + "comment": "This code snippet is parsing timestamps from a file, selecting frames based on those timestamps, loading a label map, constructing a model, and setting its state dictionary. 
The selected frames are passed to the `detection_inference` function which performs inference using the specified detection model with given weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":365-394", + "content": " args.predict_stepsize)\n print(\"timetamps number:\", len(timestamps))\n # get selected frame list according to timestamps\n selected_frame_list = []\n for timestamp in timestamps:\n selected_frame_list.append(frame_paths[timestamp - 1])\n # Load label_map\n label_map_path = config.DATASET.test['label_file']\n categories, class_whitelist = read_labelmap(open(label_map_path))\n label_map = {}\n for item in categories:\n id = item['id']\n name = item['name']\n label_map[id] = name\n # Construct model.\n if config.MODEL.backbone.get('pretrained'):\n config.MODEL.backbone.pretrained = '' # disable pretrain model init\n model = build_model(config.MODEL)\n model.eval()\n state_dicts = load(args.weights)\n model.set_state_dict(state_dicts)\n detection_result_dir = 'tmp_detection'\n detection_model_name = args.detection_model_name\n detection_model_weights = args.detection_model_weights\n detection_txt_list = detection_inference(selected_frame_list," + }, + { + "comment": "This code performs SpatioTemporal Action Detection for each clip. It first retrieves detection results from various txt files, ensuring their lengths match the timestamps. Then, it extracts human detections and predictions for each timestamp using get_detection_result() and get_timestep_result(). If there are no detections in a frame, None values are appended to the lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":395-420", + "content": " detection_result_dir,\n detection_model_name,\n detection_model_weights)\n assert len(detection_txt_list) == len(timestamps)\n print('Performing SpatioTemporal Action Detection for each clip')\n human_detections = []\n predictions = []\n index = 0\n for timestamp, detection_txt_path in zip(timestamps, detection_txt_list):\n proposals, scores = get_detection_result(\n detection_txt_path, h, w,\n (float)(config.DATASET.test['person_det_score_thr']))\n if proposals.shape[0] == 0:\n predictions.append(None)\n human_detections.append(None)\n continue\n human_detections.append(proposals)\n result = get_timestep_result(frame_dir,\n timestamp,\n clip_len,\n frame_interval,\n FPS=FPS)" + }, + { + "comment": "This code prepares input data for a model by converting images, proposals, and shapes to tensors. It then feeds the prepared data into the model in order mode='infer'. 
The output is stored in 'result' and used to generate predictions based on number of proposals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":421-454", + "content": " result[\"proposals\"] = proposals\n result[\"scores\"] = scores\n new_result = data_process_pipeline(result)\n proposals = new_result['proposals']\n img_slow = new_result['imgs'][0]\n img_slow = img_slow[np.newaxis, :]\n img_fast = new_result['imgs'][1]\n img_fast = img_fast[np.newaxis, :]\n proposals = proposals[np.newaxis, :]\n scores = scores[np.newaxis, :]\n img_shape = np.asarray(new_result['img_shape'])\n img_shape = img_shape[np.newaxis, :]\n data = [\n paddle.to_tensor(img_slow, dtype='float32'),\n paddle.to_tensor(img_fast, dtype='float32'),\n paddle.to_tensor(proposals, dtype='float32'), scores,\n paddle.to_tensor(img_shape, dtype='int32')\n ]\n with paddle.no_grad():\n result = model(data, mode='infer')\n result = result[0]\n prediction = []\n person_num = proposals.shape[1]\n # N proposals\n for i in range(person_num):" + }, + { + "comment": "This code performs action score thresholding for each detected person in the video. It appends labels and corresponding scores to a prediction list, then appends the predictions to a list of lists for all detected humans. The code also prints progress updates every 10 iterations, and finally, it creates denser timestamps using an older frame interval.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":455-480", + "content": " prediction.append([])\n # Perform action score thr\n for i in range(len(result)):\n if i + 1 not in class_whitelist:\n continue\n for j in range(person_num):\n if result[i][j, 4] > config.MODEL.head['action_thr']:\n prediction[j].append((label_map[i + 1], result[i][j,\n 4]))\n predictions.append(prediction)\n index = index + 1\n if index % 10 == 0:\n print(index, \"/\", len(timestamps))\n results = []\n for human_detection, prediction in zip(human_detections, predictions):\n results.append(pack_result(human_detection, prediction))\n def dense_timestamps(timestamps, n):\n \"\"\"Make it nx frames.\"\"\"\n old_frame_interval = (timestamps[1] - timestamps[0])\n start = timestamps[0] - old_frame_interval / n * (n - 1) / 2\n new_frame_inds = np.arange(\n len(timestamps) * n) * old_frame_interval / n + start" + }, + { + "comment": "The code reads video frames, performs visualization, and writes the processed frames into a new video file. 
It requires moviepy to be installed for output functionality and deletes temporary files after use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/ava_predict.py\":481-508", + "content": " return new_frame_inds.astype(np.int)\n dense_n = int(args.predict_stepsize / args.output_stepsize) #30\n frames = [\n cv2.imread(frame_paths[i - 1])\n for i in dense_timestamps(timestamps, dense_n)\n ]\n vis_frames = visualize(frames, results)\n try:\n import moviepy.editor as mpy\n except ImportError:\n raise ImportError('Please install moviepy to enable output file')\n vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],\n fps=args.output_fps)\n vid.write_videofile(args.out_filename)\n print(\"finish write !\")\n # delete tmp files and dirs\n shutil.rmtree(frame_dir)\n shutil.rmtree(detection_result_dir)\nif __name__ == '__main__':\n args = parse_args() #\u89e3\u6790\u53c2\u6570\n main(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ad0de3a8-4889-410f-a868-c65bc2f74d46.json b/docs/doc/ad0de3a8-4889-410f-a868-c65bc2f74d46.json new file mode 100644 index 000000000..c2e753bc6 --- /dev/null +++ b/docs/doc/ad0de3a8-4889-410f-a868-c65bc2f74d46.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is from the ActivityNet repo and has been modified to reduce length, possibly for efficiency or better readability. It uses metrics and evaluation methods for action recognition tasks, likely in video analysis applications.", + "details": [ + { + "comment": "This code is from the ActivityNet repo and has been modified to reduce length, possibly for efficiency or better readability. It uses metrics and evaluation methods for action recognition tasks, likely in video analysis applications.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/README.md\":0-1", + "content": "The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet).\nSome unused codes are removed to minimize the length of codes added." + } + ] +} \ No newline at end of file diff --git a/docs/doc/aec3b1b1-2fd6-489b-a358-2174bd3aa0d0.json b/docs/doc/aec3b1b1-2fd6-489b-a358-2174bd3aa0d0.json new file mode 100644 index 000000000..9618d9270 --- /dev/null +++ b/docs/doc/aec3b1b1-2fd6-489b-a358-2174bd3aa0d0.json @@ -0,0 +1,15 @@ +{ + "summary": "The code is for the PaddleVideo application's MA-Net model, supporting testing and training on DAVIS dataset with pretrained models for stage1 and stage1+stage2. It runs \"run_local.sh\" script to execute local environment for the MA-Net model in PaddleVideo.", + "details": [ + { + "comment": "This code is for the PaddleVideo application, specifically Ma-Net, a CVPR2020 paper implementation. It currently supports model testing and training on DAVIS dataset, with inference on any video coming soon. 
Two pretrained models are provided for stage1 and stage1+stage2 training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/README.md\":0-34", + "content": "[\u7b80\u4f53\u4e2d\u6587](README_cn.md) | English\n# Ma-Net\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n## Introduction\nThis is the paddle implementation of the CVPR2020 paper \"[Memory aggregation networks for efficient interactive video object segmentation](https://arxiv.org/abs/2003.13246)\".\n![avatar](images/1836-teaser.gif)\nThis code currently supports model test and model training on DAVIS dataset, and model inference on any given video will be provided in few days.\n## Data\nPlease refer to DAVIS data download and preparation doc [DAVIS-data](dataloaders/DAVIS2017.md)\n## Train and Test\n- You can download [pertained model for stage1](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/DeeplabV3_coco.pdparams) decompress it for stage1 training\u3002\n- You can download [trained model of stage1](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MaNet_davis2017_stage1.pdparams) decompress it for stage2 training directly skipping stage1 training\u3002\n```" + }, + { + "comment": "This code snippet executes the \"run_local.sh\" script, which is used to run the local environment for the MA-Net model in the PaddleVideo application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/README.md\":35-46", + "content": "sh run_local.sh\n```\n- You can download [our model](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MANet_davis2017.pdparams) decompress it for testing.\nTest accuracy in DAVIS2017:\n| J@60 | AUC |\n| :---: | :---: |\n| 0.761 | 0.749 |" + } + ] +} \ No newline at end of file diff --git a/docs/doc/aed52e27-f4f9-4f88-8b68-834cd809a660.json b/docs/doc/aed52e27-f4f9-4f88-8b68-834cd809a660.json new file mode 100644 index 000000000..92db03660 --- /dev/null +++ b/docs/doc/aed52e27-f4f9-4f88-8b68-834cd809a660.json @@ -0,0 +1,10 @@ +{ + "summary": "The code is a Python script that initializes a QApplication object, creates an instance of the BuildGUI class from the QEIVideo module, displays it, and executes the application's event loop. It is likely used to launch a graphical user interface (GUI) for a video processing or analysis application. The author is credited, and the code includes a copyright notice requesting proper attribution if reused.", + "details": [ + { + "comment": "The code is a Python script that initializes a QApplication object, creates an instance of the BuildGUI class from the QEIVideo module, displays it, and executes the application's event loop. It is likely used to launch a graphical user interface (GUI) for a video processing or analysis application. 
The author is credited, and the code includes a copyright notice requesting proper attribution if reused.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/start.py\":0-19", + "content": "# Author: AP-Kai\n# Datetime: 2022/1/7\n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport sys\nfrom QEIVideo.build_gui import BuildGUI\nfrom PyQt5.QtWidgets import QApplication\ndef run():\n app = QApplication(sys.argv)\n demo = BuildGUI()\n demo.show()\n sys.exit(app.exec())\nif __name__ == '__main__':\n run()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/af8e2a48-0b54-431a-92d4-49dbc536be17.json b/docs/doc/af8e2a48-0b54-431a-92d4-49dbc536be17.json new file mode 100644 index 000000000..f6649a9d3 --- /dev/null +++ b/docs/doc/af8e2a48-0b54-431a-92d4-49dbc536be17.json @@ -0,0 +1,25 @@ +{ + "summary": "The code demonstrates how PaddleVideo uses Inversion of Control and Dependency Injection for improved modularity, resolving coupling issues through factory classes and configuration files. It creates class instances based on configs and applies design patterns for dependency injection, using a config file for architecture, dataset, pipeline, and optimizer configurations.", + "details": [ + { + "comment": "This code discusses the use of Inversion of Control (IOC) and Dependency Injection (DI) in PaddleVideo, a framework for video processing. It explains how these techniques help decouple and control the framework, increasing modularity and extensibility. The code demonstrates how traditional class instantiation can lead to coupling issues, and how IOC/DI can solve them by creating factory classes and using configuration files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/config.md\":0-36", + "content": "# Configs design\n---\nThis page shows how PaddleVideo use the basic IOC/DI technology to decouple and control the whole framework. It is flexible to increase modularity of this system and make it extensible. At last, we will explain the details of config yaml and script args.\n## Design\nFirst, when we create a new class, it is common to new a instance like:\n```python\nclass TSM():\n pass\nmodel = TSM(init_attributes)\n```\nwhen more classes are created, the coupling relationship between the calling and called method will increase sharply, obviously, we can create a factory class to solve it, like that:\n```python\nif model_name == \"TSM\":\n model = TSM()\nelif model_name == \"TSN\":\n model = TSN()\nelif ...\n```\nand\n```python\noptimizer_cfg = dict(name:\"MOMENTUM\", params: XXX)\nif optimizer_cfg.name = \"MOMENTUM\":\n optimizer = MOMENTUM(optimizer_cfg.pop(name))\nelif:\n ...\n```\nmore and more conditions have to be created though. like widly used in the Java or other platforms, we apply ```inversion of control``` and ```Dependency Inversion``` to decuople." + }, + { + "comment": "This code demonstrates the implementation of Dependency Injection (DI) using a Register and Builder. The Register provides name-to-object mapping, allowing objects to be registered with a specific name. The Builder facilitates obtaining registered modules by accepting a module's name and returning the corresponding instance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/config.md\":38-88", + "content": "Second, to implenment DI, we build two components:\n- Register, to regist a class\n- Builder, to new an instance\n1. 
Register\nWe implenment a getter and a setter function to map string to an instance.\n[source code](../../paddlevideo/utils/registry.py)\n```python\n#excerpt from source code.\nclass Registry():\n def __init__(self, name):\n self._name = name\n self._obj_map = {}\n #mapping name -> object\n def register(self, obj, name):\n self._obj_map[name] = obj\n #get object\n def get(self, name):\n ret = self._obj_map.get(name)\n return ret\n```\nIt provides name -> object mapping. For example, To register an object:\n```python\n BACKBONES = Registry('backbone')\n class ResNet:\n pass\n BACKBONES.register(ResNet)\n```\nOr, use a decorator\n```python\n BACKBONES = Registry('backbone') #new a Register\n @BACKBONES.register() #regist resnet as a backbone.\n class ResNet:\n pass\n```\n2. Builder\nTo obtain a registed module.\n```python\n # Usage: To build a module.\n backbone_name = \"ResNet\"" + }, + { + "comment": "This code snippet is creating an instance of a class based on its name specified in the configuration file. It applies this design to various components like PIPELINE, BACKBONE, HEAD, LOSS, and METRIC for dependency injection. The attributes in the configuration field match the initialization attributes of the corresponding class. The config file separates architecture, dataset, pipeline, and optimizer configurations, along with global settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/config.md\":89-116", + "content": " b = BACKBONES.get(backbone_name)()\n```\nso that we can new(register) an instance in **where it declared**, not **where it called**, a basic DI sub-system has been created now.\nWe apply this design on many places, such as: PIPELINE, BACKBONE, HEAD, LOSS, METRIC and so on.\nFinally, We build all of the framework components from config yaml which matches the source code one by one, **It means the attributes in a configuration field is same as the init atrributes of the mathced class**, and to indicate a specified class, we always use ```name``` to mark it. like:\n```yaml\nhead:\n name: \"TSMHead\" # class name\n num_classes: 400 # TSMHead class init attributes\n ...\n```\n---\n## config yaml details\nWe separate the config to several parts, in high level:\n- **MODEL:** Architecture configuration, such as HEAD module, BACKBONE module.\n- **DATASET:** DATASET and dataloader configuration.\n- **PIPELINE:** pipeline of processing configuration.\n- **OPTIMIZER:** Optimizer configuration.\nand some unique global configurations, like" + }, + { + "comment": "This code snippet is describing command-line arguments for a training script. The user can switch validate or test mode on/off, provide weights and config paths, and override specific args using \"-o\" option. 
It also mentions the available commands for each argument.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/config.md\":117-130", + "content": "- model_name\n- log_interval\n- epochs\n- resume_epoch\n- log_level\n...\nTraining script args\n- **--validate**: switch validate mode on or not\n- **--test**: switch test mode on or not\n- **--weights**: weights path\n- **-c**: config yaml path\n- **-o**: override args, one can use it like: -o DATASET.batch_size=16" + } + ] +} \ No newline at end of file diff --git a/docs/doc/af8f088d-6df5-47e6-b9d7-5c875125ad33.json b/docs/doc/af8f088d-6df5-47e6-b9d7-5c875125ad33.json new file mode 100644 index 000000000..0ec32f4cc --- /dev/null +++ b/docs/doc/af8f088d-6df5-47e6-b9d7-5c875125ad33.json @@ -0,0 +1,60 @@ +{ + "summary": "This code defines a class for evaluating metrics in video analysis tasks, handling inference mode and performing tagging/classification using a model with functions for metrics update, calculator reset, logging results, and saving/retrieving metrics.", + "details": [ + { + "comment": "This code is importing necessary libraries and initializing a class called Metrics. It appears to be part of a larger module for evaluating metrics, possibly in the context of video analysis or recognition tasks. The code defines an object-oriented structure with methods that will likely handle different types of evaluation tasks based on the input name, mode, and metrics_args parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import unicode_literals\nfrom __future__ import print_function\nfrom __future__ import division\nimport logging\nimport os\nimport io\nimport numpy as np\nimport json\nfrom metrics.youtube8m import eval_util as youtube8m_metrics\nfrom metrics.kinetics import accuracy_metrics as kinetics_metrics\nlogger = logging.getLogger(__name__)\nclass Metrics(object):\n def __init__(self, name, mode, metrics_args):" + }, + { + "comment": "The code defines a class named Youtube8mMetrics that inherits from the Metrics base class. It has methods for calculating and logging metrics, accumulating results, finalizing and logging output, and resetting variables. The Youtube8mMetrics class is initialized with a name, mode, and metrics_args. 
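Stepping back to the Registry/Builder design described in the config.md excerpt above, a self-contained sketch of that pattern is shown below. It is illustrative only and independent of `paddlevideo.utils.registry`; the class and config values are made up for the example.

```python
class Registry:
    """Minimal name -> class mapping, usable directly or as a decorator."""

    def __init__(self, name):
        self._name = name
        self._obj_map = {}

    def register(self, obj=None, name=None):
        def deco(cls):
            self._obj_map[name or cls.__name__] = cls
            return cls
        return deco(obj) if obj is not None else deco

    def get(self, name):
        return self._obj_map[name]


BACKBONES = Registry('backbone')


@BACKBONES.register()
class ResNet:
    def __init__(self, depth=50):
        self.depth = depth


# Builder side: class name and init kwargs come straight from the YAML config.
cfg = {'name': 'ResNet', 'depth': 101}
backbone = BACKBONES.get(cfg.pop('name'))(**cfg)
print(type(backbone).__name__, backbone.depth)   # ResNet 101
```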
The calculate_and_log_out method calculates loss, prediction, and ground truth labels, then calls the youtube8m_metrics.calculate_hit_at_one function to compute the hit at one metric.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":33-68", + "content": " \"\"\"Not implemented\"\"\"\n pass\n def calculate_and_log_out(self, fetch_list, info=''):\n \"\"\"Not implemented\"\"\"\n pass\n def accumulate(self, fetch_list, info=''):\n \"\"\"Not implemented\"\"\"\n pass\n def finalize_and_log_out(self, info='', savedir='./'):\n \"\"\"Not implemented\"\"\"\n pass\n def reset(self):\n \"\"\"Not implemented\"\"\"\n pass\nclass Youtube8mMetrics(Metrics):\n def __init__(self, name, mode, metrics_args):\n self.name = name\n self.mode = mode\n self.num_classes = metrics_args['MODEL']['num_classes']\n self.topk = metrics_args['MODEL']['topk']\n self.calculator = youtube8m_metrics.EvaluationMetrics(\n self.num_classes, self.topk)\n if self.mode == 'infer':\n self.infer_results = []\n def calculate_and_log_out(self, fetch_list, info=''):\n loss = np.mean(np.array(fetch_list[0]))\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label)" + }, + { + "comment": "This function accumulates metrics for a video tagging application. It handles two modes: 'infer' and others. For the 'infer' mode, it gathers predictions for each video, calculates top-k indices, and appends them to a list. For other modes, it takes in loss, prediction, and label arrays, and accumulates metrics using the calculator object. It logs information including loss, Hit@1, precision at equal recall rate (PERR), and gap.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":69-89", + "content": " perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(\n pred, label)\n gap = youtube8m_metrics.calculate_gap(pred, label)\n logger.info(info + ' , loss = {0}, Hit@1 = {1}, PERR = {2}, GAP = {3}'.format(\\\n '%.6f' % loss, '%.2f' % hit_at_one, '%.2f' % perr, '%.2f' % gap))\n def accumulate(self, fetch_list, info=''):\n if self.mode == 'infer':\n predictions = np.array(fetch_list[0])\n video_id = fetch_list[1]\n for i in range(len(predictions)):\n topk_inds = predictions[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds = predictions[i][topk_inds]\n self.infer_results.append(\n (video_id[i], topk_inds.tolist(), preds.tolist()))\n else:\n loss = np.array(fetch_list[0])\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n self.calculator.accumulate(loss, pred, label)" + }, + { + "comment": "This code snippet is part of the VideoTag application and it logs out the final results for each video. It reads a label file, iterates through each item's predictions, matches class ID to class name and probability, and then prints them out. The function can be called with optional parameters to specify the output directory (default: `./data/results`) and the label file path (default: `./label_3396.txt`). 
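The top-k selection summarized above reduces to a couple of NumPy calls; a tiny standalone illustration with made-up scores:

```python
import numpy as np

# Mirror of the argsort-based top-k pattern described above (values are made up).
predictions = np.array([0.05, 0.60, 0.10, 0.25])
topk = 2

topk_inds = predictions.argsort()[-topk:]  # indices of the k largest scores
topk_inds = topk_inds[::-1]                # highest score first
preds = predictions[topk_inds]

print(topk_inds.tolist())  # [1, 3]
print(preds.tolist())      # [0.6, 0.25]
```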
It is designed to run in 'infer' mode only.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":91-112", + "content": " def finalize_and_log_out(self,\n info='',\n savedir='./data/results',\n label_file='./label_3396.txt'):\n if self.mode == 'infer':\n for index, item in enumerate(self.infer_results):\n video_id = item[0]\n print('[========video_id [ {} ] , topk({}) preds: ========]\\n'.\n format(video_id, self.topk))\n f = io.open(label_file, \"r\", encoding=\"utf-8\")\n fl = f.readlines()\n res_list = []\n res_list.append(video_id)\n for i in range(len(item[1])):\n class_id = item[1][i]\n class_prob = item[2][i]\n class_name = fl[class_id].split('\\n')[0]\n print('class_id: {},'.format(class_id), 'class_name:',\n class_name,\n ', probability: {} \\n'.format(class_prob))\n save_dict = {" + }, + { + "comment": "This code snippet appears to be part of a larger program that performs some sort of video tagging or classification. It includes functions to save the result of an inference operation and update metrics after each epoch, as well as a reset function for the calculator. The \"calculator\" object seems to keep track of average hit rate at one, perr, loss, aps, and gap for some type of learning algorithm or model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":113-134", + "content": " \"'class_id\": class_id,\n \"class_name\": class_name,\n \"probability\": class_prob\n }\n res_list.append(save_dict)\n # save infer result into output dir\n with io.open(os.path.join(savedir,\n 'result' + str(index) + '.json'),\n 'w',\n encoding='utf-8') as f:\n f.write(json.dumps(res_list, ensure_ascii=False))\n else:\n epoch_info_dict = self.calculator.get()\n logger.info(info + '\\tavg_hit_at_one: {0},\\tavg_perr: {1},\\tavg_loss :{2},\\taps: {3},\\tgap:{4}'\\\n .format(epoch_info_dict['avg_hit_at_one'], epoch_info_dict['avg_perr'], \\\n epoch_info_dict['avg_loss'], epoch_info_dict['aps'], epoch_info_dict['gap']))\n def reset(self):\n self.calculator.clear()\n if self.mode == 'infer':\n self.infer_results = []" + }, + { + "comment": "Class Kinetics400Metrics is used for calculating and logging metrics, accepting a name, mode, and metrics_args. It stores the topk value from metrics_args, initializes a MetricsCalculator instance with the given name and mode, and maintains an infer_results list if in inference mode. The calculate_and_log_out method takes a fetch_list as input and calculates the mean loss, accuracy for top-1 and top-5 predictions, and logs the information. 
It can be used to accumulate results during inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":137-162", + "content": "class Kinetics400Metrics(Metrics):\n def __init__(self, name, mode, metrics_args):\n self.name = name\n self.mode = mode\n self.topk = metrics_args['MODEL']['topk']\n self.calculator = kinetics_metrics.MetricsCalculator(name, mode.lower())\n if self.mode == 'infer':\n self.infer_results = []\n def calculate_and_log_out(self, fetch_list, info=''):\n if len(fetch_list) == 3:\n loss = fetch_list[0]\n loss = np.mean(np.array(loss))\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n else:\n loss = 0.\n pred = np.array(fetch_list[0])\n label = np.array(fetch_list[1])\n acc1, acc5 = self.calculator.calculate_metrics(loss, pred, label)\n logger.info(info + '\\tLoss: {},\\ttop1_acc: {}, \\ttop5_acc: {}'.format('%.6f' % loss, \\\n '%.2f' % acc1, '%.2f' % acc5))\n return loss\n def accumulate(self, fetch_list, info=''):\n if self.mode == 'infer':" + }, + { + "comment": "This code appears to be a part of a machine learning model's evaluation process. It calculates top predictions and loss values for each video, accumulates them, and then logs out the results. The method \"finalize_and_log_out\" likely concludes the evaluation process and saves or outputs the final results. The code seems to handle both cases where results are available for each video (predictions and losses) and when only predictions and labels are given.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":163-186", + "content": " predictions = np.array(fetch_list[0])\n video_id = fetch_list[1]\n for i in range(len(predictions)):\n topk_inds = predictions[i].argsort()[0 - self.topk:]\n topk_inds = topk_inds[::-1]\n preds = predictions[i][topk_inds]\n self.infer_results.append(\n (video_id[i], topk_inds.tolist(), preds.tolist()))\n else:\n if len(fetch_list) == 3:\n loss = fetch_list[0]\n loss = np.mean(np.array(loss))\n pred = np.array(fetch_list[1])\n label = np.array(fetch_list[2])\n else:\n loss = 0.\n pred = np.array(fetch_list[0])\n label = np.array(fetch_list[1])\n self.calculator.accumulate(loss, pred, label)\n def finalize_and_log_out(self,\n info='',\n savedir='./data/results',\n label_file='./label_3396.txt'):" + }, + { + "comment": "This code is part of a function that iterates over the 'infer_results' list and prints out the video ID, topk predictions for each class, along with their respective probabilities. It reads labels from the 'label_file', appends each prediction to a 'res_list' as a dictionary containing class_id, class_name, and probability, and then continues to the next iteration. 
The label file is read once per video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":187-209", + "content": " if self.mode == 'infer':\n for index, item in enumerate(self.infer_results):\n video_id = item[0]\n print('[========video_id [ {} ] , topk({}) preds: ========]\\n'.\n format(video_id, self.topk))\n f = io.open(label_file, \"r\", encoding=\"utf-8\")\n fl = f.readlines()\n res_list = []\n res_list.append(video_id)\n for i in range(len(item[1])):\n class_id = item[1][i]\n class_prob = item[2][i]\n class_name = fl[class_id].split('\\n')[0]\n print('class_id: {},'.format(class_id), 'class_name:',\n class_name,\n ', probability: {} \\n'.format(class_prob))\n save_dict = {\n \"'class_id\": class_id,\n \"class_name\": class_name,\n \"probability\": class_prob\n }\n res_list.append(save_dict)" + }, + { + "comment": "The code saves the infer results into the specified output directory, finalizes and retrieves computed metrics from a calculator, logs the loss, top1_acc, and top5_acc if in 'train' mode, resets the calculator and list of infer results when resetting, and defines a MetricsNotFoundError exception for missing metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":211-236", + "content": " # save infer result into output dir\n with io.open(os.path.join(savedir,\n 'result' + str(index) + '.json'),\n 'w',\n encoding='utf-8') as f:\n f.write(json.dumps(res_list, ensure_ascii=False))\n else:\n self.calculator.finalize_metrics()\n metrics_dict = self.calculator.get_computed_metrics()\n loss = metrics_dict['avg_loss']\n acc1 = metrics_dict['avg_acc1']\n acc5 = metrics_dict['avg_acc5']\n logger.info(info + '\\tLoss: {},\\ttop1_acc: {}, \\ttop5_acc: {}'.format('%.6f' % loss, \\\n '%.2f' % acc1, '%.2f' % acc5))\n def reset(self):\n self.calculator.reset()\n if self.mode == 'infer':\n self.infer_results = []\nclass MetricsNotFoundError(Exception):\n \"Error: metrics not found\"\n def __init__(self, metrics_name, avail_metrics):\n super(MetricsNotFoundError, self).__init__()" + }, + { + "comment": "This code defines a MetricsZoo class to manage and retrieve metrics. It provides regist() and get() methods for registering and retrieving metrics by name, respectively. The MetricsZoo instance is made singleton via global variable metrics_zoo. 
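A hedged usage sketch for the registry pattern described above, assuming the VideoTag predict directory is on `PYTHONPATH` so that `metrics.metrics_util` is importable; the config values are placeholders, not real settings:

```python
from metrics.metrics_util import get_metrics, MetricsNotFoundError

cfg = {'MODEL': {'num_classes': 400, 'topk': 5}}  # placeholder settings

metric = get_metrics('ATTENTIONLSTM', 'infer', cfg)  # -> Youtube8mMetrics instance
print(type(metric).__name__)

try:
    get_metrics('NOT_REGISTERED', 'infer', cfg)
except MetricsNotFoundError as err:
    print(err)  # the message lists every registered metric name
```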
Youtube8mMetrics are registered under the name \"ATTENTIONLSTM\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":237-277", + "content": " self.metrics_name = metrics_name\n self.avail_metrics = avail_metrics\n def __str__(self):\n msg = \"Metrics {} Not Found.\\nAvailiable metrics:\\n\".format(\n self.metrics_name)\n for metric in self.avail_metrics:\n msg += \" {}\\n\".format(metric)\n return msg\nclass MetricsZoo(object):\n def __init__(self):\n self.metrics_zoo = {}\n def regist(self, name, metrics):\n assert metrics.__base__ == Metrics, \"Unknow model type {}\".format(\n type(metrics))\n self.metrics_zoo[name] = metrics\n def get(self, name, mode, cfg):\n for k, v in self.metrics_zoo.items():\n if k == name:\n return v(name, mode, cfg)\n raise MetricsNotFoundError(name, self.metrics_zoo.keys())\n# singleton metrics_zoo\nmetrics_zoo = MetricsZoo()\ndef regist_metrics(name, metrics):\n metrics_zoo.regist(name, metrics)\ndef get_metrics(name, mode, cfg):\n return metrics_zoo.get(name, mode, cfg)\n# sort by alphabet\nregist_metrics(\"ATTENTIONLSTM\", Youtube8mMetrics)" + }, + { + "comment": "The code registers the \"TSN\" metric with the Kinetics400Metrics class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/metrics_util.py\":278-278", + "content": "regist_metrics(\"TSN\", Kinetics400Metrics)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/af9be0dc-2f2c-4e4e-9b38-9456a5780c3f.json b/docs/doc/af9be0dc-2f2c-4e4e-9b38-9456a5780c3f.json new file mode 100644 index 000000000..68e703002 --- /dev/null +++ b/docs/doc/af9be0dc-2f2c-4e4e-9b38-9456a5780c3f.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports necessary classes and defines the publically accessible '__all__' list, containing the DepthEstimator and BaseEstimator classes.", + "details": [ + { + "comment": "This code imports necessary classes and defines the publically accessible '__all__' list, containing the DepthEstimator and BaseEstimator classes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/__init__.py\":0-3", + "content": "from .base import BaseEstimator\nfrom .depth_estimator import DepthEstimator\n__all__ = ['DepthEstimator', 'BaseEstimator']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/af9fe02c-7c3c-481f-8311-f5310a02ab73.json b/docs/doc/af9fe02c-7c3c-481f-8311-f5310a02ab73.json new file mode 100644 index 000000000..957165c4a --- /dev/null +++ b/docs/doc/af9fe02c-7c3c-481f-8311-f5310a02ab73.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines functions for converting tensors to images, applying masks, normalizing images, and constructing model names. 
It also includes functions for computing foreground and nocare area using OpenCV's dilation operation with optional parameters, returning the 'nocare' along with the original foreground image.", + "details": [ + { + "comment": "ens2image: Converts a tensor to an image by removing dimensions and transposing if necessary.\n\noverlay_mask: Applies a mask on top of an image, allowing for color overlay and background adjustment.\n\nim_normalize: Normalizes an image by scaling pixel values between 0 and 1 based on the range of values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/helpers.py\":0-45", + "content": "import numpy as np\nimport cv2\ndef tens2image(im):\n tmp = np.squeeze(im.numpy())\n if tmp.ndim == 2:\n return tmp\n else:\n return tmp.transpose((1, 2, 0))\ndef overlay_mask(im, ma, color=np.array([255, 0, 0]) / 255.0):\n assert np.max(im) <= 1.0\n ma = ma.astype(np.bool)\n im = im.astype(np.float32)\n alpha = 0.5\n fg = im * alpha + np.ones(\n im.shape) * (1 - alpha) * color # np.array([0,0,255])/255.0\n # Whiten background\n alpha = 1.0\n bg = im.copy()\n bg[ma == 0] = im[ma == 0] * alpha + np.ones(im[ma == 0].shape) * (1 - alpha)\n bg[ma == 1] = fg[ma == 1]\n # [-2:] is s trick to be compatible both with opencv 2 and 3\n contours = cv2.findContours(ma.copy().astype(np.uint8), cv2.RETR_TREE,\n cv2.CHAIN_APPROX_SIMPLE)[-2:]\n cv2.drawContours(bg, contours[0], -1, (0.0, 0.0, 0.0), 1)\n return bg\ndef im_normalize(im):\n \"\"\"\n Normalize image\n \"\"\"\n imn = (im - im.min()) / max((im.max() - im.min()), 1e-8)\n return imn\ndef construct_name(p, prefix):" + }, + { + "comment": "The code defines a function to construct the name of a model by concatenating keys and values from the input dictionary. It also includes two additional functions: one for computing foreground based on a given scribble image, and another for computing a nocare area with optional dilation and size parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/helpers.py\":46-77", + "content": " \"\"\"\n Construct the name of the model\n p: dictionary of parameters\n prefix: the prefix\n name: the name of the model - manually add \".pth\" to follow the convention\n \"\"\"\n name = prefix\n for key in p.keys():\n if (type(p[key]) != tuple) and (type(p[key]) != list):\n name = name + '_' + str(key) + '-' + str(p[key])\n else:\n name = name + '_' + str(key) + '-' + str(p[key][0])\n return name\ndef gt_from_scribble(scr, dilation=11, nocare_area=21):\n # Compute foreground\n if scr.max() == 1:\n kernel_fg = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (dilation, dilation))\n fg = cv2.dilate(scr.astype(np.uint8),\n kernel=kernel_fg).astype(scr.dtype)\n else:\n fg = scr\n # Compute nocare area\n if nocare_area is None:\n nocare = None\n else:\n kernel_nc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,\n (nocare_area, nocare_area))" + }, + { + "comment": "The code uses OpenCV's dilation operation to enhance the background care region by dilating the foreground image with a given kernel. 
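The foreground/no-care construction being summarized here can be checked on a toy array; the kernel sizes below are small stand-ins for the function's defaults (11 and 21):

```python
import numpy as np
import cv2

# Toy check of the dilation-based fg / nocare construction: a single scribbled
# pixel is grown into a foreground blob, and the nocare ring is what an extra
# dilation adds on top of it.
scr = np.zeros((9, 9), dtype=np.uint8)
scr[4, 4] = 1

kernel_fg = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
fg = cv2.dilate(scr, kernel=kernel_fg)

kernel_nc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
nocare = cv2.dilate(fg, kernel=kernel_nc) - fg

print(int(fg.sum()), int(nocare.sum()))  # nocare is non-zero only outside fg
```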
The resulting 'nocare' is then returned along with the original foreground image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/helpers.py\":78-80", + "content": " nocare = cv2.dilate(fg, kernel=kernel_nc) - fg\n return fg, nocare" + } + ] +} \ No newline at end of file diff --git a/docs/doc/afc1c7f3-617b-4fa5-8a8c-2ed81cf76d66.json b/docs/doc/afc1c7f3-617b-4fa5-8a8c-2ed81cf76d66.json new file mode 100644 index 000000000..40895c621 --- /dev/null +++ b/docs/doc/afc1c7f3-617b-4fa5-8a8c-2ed81cf76d66.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is importing two modules, 'base_model' and 'base_trainer', from the current package's subfolders. These modules likely contain the base model and trainer classes for further use in the application.", + "details": [ + { + "comment": "This code is importing two modules, 'base_model' and 'base_trainer', from the current package's subfolders. These modules likely contain the base model and trainer classes for further use in the application.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/__init__.py\":0-1", + "content": "from .base_model import *\nfrom .base_trainer import *" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b04f83e0-4d87-450f-8743-ca5dc71128b5.json b/docs/doc/b04f83e0-4d87-450f-8743-ca5dc71128b5.json new file mode 100644 index 000000000..e7855f60e --- /dev/null +++ b/docs/doc/b04f83e0-4d87-450f-8743-ca5dc71128b5.json @@ -0,0 +1,20 @@ +{ + "summary": "This class defines the BaseDataset for PaddlePaddle, with methods for loading data, preparing training and testing sets, and retrieving samples. It supports list format results due to limitations in Paddle.io.DataLoader.", + "details": [ + { + "comment": "This code is a Python class definition for the BaseDataset, which serves as the base class for all dataset types in PaddlePaddle. It requires subclasses to define load_file method for loading info from index files and provide train and test data using prepare_train and prepare_test methods respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/base.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport numpy as np\nfrom abc import ABC, abstractmethod\nimport paddle\nfrom paddle.io import Dataset\nclass BaseDataset(Dataset, ABC):\n \"\"\"Base class for datasets\n All datasets should subclass it.\n All subclass should overwrite:\n - Method: `load_file`, load info from index file.\n - Method: `prepare_train`, providing train data.\n - Method: `prepare_test`, providing test data." + }, + { + "comment": "This code initializes a base dataset class with file path, pipeline, data prefix, and test mode as arguments. 
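To make the subclassing contract concrete, here is a hypothetical minimal subclass of the `BaseDataset` described in this file. It assumes the PaddleVideo repo root is on `PYTHONPATH`, and the index-file format and result keys (`frame_dir`, `labels`) are illustrative assumptions rather than the library's actual dataset definitions:

```python
import os.path as osp

from paddlevideo.loader.dataset.base import BaseDataset  # assumes repo on PYTHONPATH


class ToyFrameDataset(BaseDataset):
    """Hypothetical dataset whose index file has lines of '<rel_path> <label>'."""

    def load_file(self):
        info = []
        with open(self.file_path) as f:
            for line in f:
                path, label = line.strip().split()
                if self.data_prefix is not None:
                    path = osp.join(self.data_prefix, path)
                # Keys here are assumptions; the pipeline is expected to turn
                # them into the 'imgs'/'labels' fields used by prepare_train.
                info.append(dict(frame_dir=path, labels=int(label)))
        return info
```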
It loads video information from the index file using load_file() method, supports training and validation, but cannot handle dict type results due to Paddle.io limitations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/base.py\":33-58", + "content": " Args:\n file_path (str): index file path.\n pipeline (Sequence XXX)\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): whether to build test dataset. Default: False.\n \"\"\"\n def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):\n super().__init__()\n self.file_path = file_path\n self.data_prefix = osp.realpath(data_prefix) if \\\n data_prefix is not None and osp.isdir(data_prefix) else data_prefix\n self.test_mode = test_mode\n self.pipeline = pipeline\n self.info = self.load_file()\n @abstractmethod\n def load_file(self):\n \"\"\"load the video information from the index file path.\"\"\"\n pass\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)" + }, + { + "comment": "This code defines a dataset class with methods for preparing data for training and testing. The `prepare_train` method returns the input images and labels for training, while the `prepare_test` method does the same for testing. The `__len__` method returns the size of the dataset, and the `__getitem__` method retrieves either a training or testing sample based on the mode. Due to an issue with Paddle.io.DataLoader not supporting dict type retval, the results are converted to list format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/base.py\":59-79", + "content": " #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n #unsqueeze label to list\n return results['imgs'], np.array([results['labels']])\n def __len__(self):\n \"\"\"get the size of the dataset.\"\"\"\n return len(self.info)\n def __getitem__(self, idx):\n \"\"\" Get the sample for either training or testing given index\"\"\"\n if self.test_mode:\n return self.prepare_test(idx)\n else:\n return self.prepare_train(idx)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b05c9015-12d3-4ab9-acb2-2925a7e10b3a.json b/docs/doc/b05c9015-12d3-4ab9-acb2-2925a7e10b3a.json new file mode 100644 index 000000000..e376af77f --- /dev/null +++ b/docs/doc/b05c9015-12d3-4ab9-acb2-2925a7e10b3a.json @@ -0,0 +1,20 @@ +{ + "summary": "This code utilizes the BAIDU CLOUD action to classify videos, extracts features, and performs prediction using a pre-trained model. It saves proposal counts and bounding box results in a JSON file with UTF-8 encoding and indentation.", + "details": [ + { + "comment": "This code is loading a model for video classification using the BAIDU CLOUD action. It first loads the configuration file, then prints the configurations, and finally initializes the InferModel class with the loaded configurations. 
The `video_classify` function takes a video name as input, likely to perform feature extraction or prediction on that video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_bmn.py\":0-48", + "content": "#!./python27-gcc482/bin/python\n# coding: utf-8\n\"\"\"\nBAIDU CLOUD action\n\"\"\"\nimport os\nimport sys\nimport pickle\nimport json\nimport time\nimport shutil\nimport numpy as np\nsys.path.append(\"../predict/action_detect\")\nimport models.bmn_infer as prop_model\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config, print_configs\nimport utils.config_utils as config_utils\nimport logger\nlogger = logger.Logger()\ndef load_model(cfg_file=\"configs/configs.yaml\"):\n \"\"\"\n load_model\n \"\"\"\n logger.info(\"load model ... \")\n global infer_configs\n infer_configs = parse_config(cfg_file)\n print_configs(infer_configs, \"Infer\")\n t0 = time.time()\n global prop_model\n prop_model = prop_model.InferModel(infer_configs)\n t1 = time.time()\n logger.info(\"step0: load model time: {} min\\n\".format((t1 - t0) * 1.0 / 60))\ndef video_classify(video_name):\n \"\"\"\n extract_feature\n \"\"\"\n logger.info('predict ... ')\n logger.info(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")" + }, + { + "comment": "This code extracts features from videos and predicts proposals using a pre-trained model. It loads the necessary configurations, creates feature directories if they don't exist, reads video URLs from a file, processes each video to obtain bounding box results, and saves these results into a list of dictionaries for further analysis or processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_bmn.py\":49-82", + "content": " pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n # step 1: extract feature\n feature_path = video_name.replace(\".mp4\", \".pkl\").replace(\"mp4\", \"features\")\n video_features = pickle.load(open(feature_path, 'rb'))\n # step2: get proposal\n t0 = time.time()\n bmn_results = prop_model.predict(infer_configs, material=video_features)\n t1 = time.time()\n logger.info(np.array(bmn_results).shape)\n logger.info(\"step2: proposal time: {} min\".format((t1 - t0) * 1.0 / 60))\n return bmn_results\nif __name__ == '__main__':\n dataset_dir = \"../datasets/EuroCup2016\"\n if not os.path.exists(dataset_dir + '/feature_bmn'):\n os.mkdir(dataset_dir + '/feature_bmn')\n results = []\n load_model()\n video_url = os.path.join(dataset_dir, 'url.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, k.strip()) for k in lines]\n for line in lines:\n bmn_results = video_classify(line)\n results.append({\n 'video_name': os.path.basename(line).split('.')[0]," + }, + { + "comment": "This code saves the number of proposals and a list of bounding box results for each proposal in a JSON file, formatting it with indentation and using UTF-8 encoding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/extractor/extract_bmn.py\":83-90", + "content": " 'num_proposal': len(bmn_results),\n 'bmn_results': bmn_results\n })\n with open(dataset_dir + '/feature_bmn/prop.json', 'w',\n encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b14fe4fe-b2cb-4ca6-8e59-953f307b7852.json 
b/docs/doc/b14fe4fe-b2cb-4ca6-8e59-953f307b7852.json new file mode 100644 index 000000000..1bde37e51 --- /dev/null +++ b/docs/doc/b14fe4fe-b2cb-4ca6-8e59-953f307b7852.json @@ -0,0 +1,35 @@ +{ + "summary": "The code provides TIPC support for PaddleVideo, offering tutorials on acceleration features, defining naming conventions for testing, ONNX conversion, deployment with Paddle Serving, offline quantized training/inference, and multi-machine multi-GPU training/inference.", + "details": [ + { + "comment": "This code provides an introduction to the PaddleVideo training and inference pipeline certification (TIPC), including a summary of support status for various models and deployment methods. It also mentions that more details on specific acceleration features can be found in tutorials associated with each test tool.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/README.md\":1-29", + "content": "# \u98de\u6868\u8bad\u63a8\u4e00\u4f53\u8ba4\u8bc1\uff08TIPC\uff09\n## 1. \u7b80\u4ecb\n\u98de\u6868\u9664\u4e86\u57fa\u672c\u7684\u6a21\u578b\u8bad\u7ec3\u548c\u9884\u6d4b\uff0c\u8fd8\u63d0\u4f9b\u4e86\u652f\u6301\u591a\u7aef\u591a\u5e73\u53f0\u7684\u9ad8\u6027\u80fd\u63a8\u7406\u90e8\u7f72\u5de5\u5177\u3002\u672c\u6587\u6863\u63d0\u4f9b\u4e86PaddleVideo\u4e2d\u6240\u6709\u6a21\u578b\u7684\u98de\u6868\u8bad\u63a8\u4e00\u4f53\u8ba4\u8bc1 (Training and Inference Pipeline Certification(TIPC)) \u4fe1\u606f\u548c\u6d4b\u8bd5\u5de5\u5177\uff0c\u65b9\u4fbf\u7528\u6237\u67e5\u9605\u6bcf\u79cd\u6a21\u578b\u7684\u8bad\u7ec3\u63a8\u7406\u90e8\u7f72\u6253\u901a\u60c5\u51b5\uff0c\u5e76\u53ef\u4ee5\u8fdb\u884c\u4e00\u952e\u6d4b\u8bd5\u3002\n
\n## 2. \u6c47\u603b\u4fe1\u606f\n\u6253\u901a\u60c5\u51b5\u6c47\u603b\u5982\u4e0b\uff0c\u5df2\u586b\u5199\u7684\u90e8\u5206\u8868\u793a\u53ef\u4ee5\u4f7f\u7528\u672c\u5de5\u5177\u8fdb\u884c\u4e00\u952e\u6d4b\u8bd5\uff0c\u672a\u586b\u5199\u7684\u8868\u793a\u6b63\u5728\u652f\u6301\u4e2d\u3002\n**\u5b57\u6bb5\u8bf4\u660e\uff1a**\n- \u57fa\u7840\u8bad\u7ec3\u9884\u6d4b\uff1a\u5305\u62ec\u6a21\u578b\u8bad\u7ec3\u3001Paddle Inference Python\u9884\u6d4b\u3002\n- \u66f4\u591a\u8bad\u7ec3\u65b9\u5f0f\uff1a\u5305\u62ec\u591a\u673a\u591a\u5361(TODO)\u3001\u6df7\u5408\u7cbe\u5ea6\u3002\n- \u6a21\u578b\u538b\u7f29\uff1a\u5305\u62ec\u88c1\u526a\u3001\u79bb\u7ebf/\u5728\u7ebf\u91cf\u5316(TODO)\u3001\u84b8\u998f(TODO)\u3002\n- \u5176\u4ed6\u9884\u6d4b\u90e8\u7f72\uff1a\u5305\u62ecPaddle Inference C++\u9884\u6d4b\u3001Paddle Serving\u90e8\u7f72\u3001Paddle-Lite\u90e8\u7f72(TODO)\u7b49\u3002\n\u66f4\u8be6\u7ec6\u7684mkldnn\u3001Tensorrt\u7b49\u9884\u6d4b\u52a0\u901f\u76f8\u5173\u529f\u80fd\u7684\u652f\u6301\u60c5\u51b5\u53ef\u4ee5\u67e5\u770b\u5404\u6d4b\u8bd5\u5de5\u5177\u7684[\u66f4\u591a\u6559\u7a0b](#more)\u3002\n| \u7b97\u6cd5\u540d\u79f0 | \u6a21\u578b\u540d\u79f0 | \u6a21\u578b\u7c7b\u578b | \u57fa\u7840
\u8bad\u7ec3\u9884\u6d4b | \u66f4\u591a
\u8bad\u7ec3\u65b9\u5f0f | \u6a21\u578b\u538b\u7f29 | \u5176\u4ed6\u9884\u6d4b\u90e8\u7f72 |\n| :--- | :--- | :----: | :--------: | :---- | :---- | :---- |\n| PP-TSM |pptsm_k400_frames_uniform | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | \u79bb\u7ebf\u91cf\u5316 | Paddle Inference: C++ |\n| PP-TSN |pptsn_k400_videos | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | Paddle Inference: C++ |\n| AGCN |agcn_fsd\t | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | - |\n| STGCN |stgcn_fsd | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | - |\n| TimeSformer |timesformer_k400_videos | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | - |" + }, + { + "comment": "This code snippet introduces the test tool for PaddleVideo, providing an overview of supported models and their respective configurations, as well as the directory structure containing these configuration files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/README.md\":30-54", + "content": "| SlowFast |slowfast | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | - |\n| TSM |tsm_k400_frames | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | - |\n| TSN |tsn_k400_frames | \u52a8\u4f5c\u8bc6\u522b |\u652f\u6301|\u6df7\u5408\u7cbe\u5ea6|-|-|\n| AttentionLSTM |attention_lstm_youtube8m | \u52a8\u4f5c\u8bc6\u522b | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | - |\n| BMN |bmn | \u52a8\u4f5c\u65f6\u95f4\u5b9a\u4f4d | \u652f\u6301 | \u6df7\u5408\u7cbe\u5ea6 | - | - |\n## 3. \u6d4b\u8bd5\u5de5\u5177\u7b80\u4ecb\n### \u76ee\u5f55\u4ecb\u7ecd\n```shell\ntest_tipc/\n\u251c\u2500\u2500 configs/ # \u914d\u7f6e\u6587\u4ef6\u76ee\u5f55\n\u2502 \u251c\u2500\u2500 PP-TSM/\n\u2502 \u2502 \u251c\u2500\u2500 train_infer_python.txt # PP-TSM\u5728Linux\u4e0a\u8fdb\u884cpython\u8bad\u7ec3\u9884\u6d4b\uff08\u57fa\u7840\u8bad\u7ec3\u9884\u6d4b\uff09\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u2502 \u251c\u2500\u2500 serving_infer_cpp.txt # PP-TSM\u5728Linux\u4e0a\u8fdb\u884ccpp serving\u6d4b\u8bd5\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u2502 \u251c\u2500\u2500 train_amp_infer_python.txt # PP-TSM\u5728Linux\u4e0a\u8fdb\u884cpython\u8bad\u7ec3\u9884\u6d4b\uff08\u6df7\u5408\u7cbe\u5ea6\u8bad\u7ec3\u9884\u6d4b\uff09\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u2502 \u251c\u2500\u2500 serving_infer_python.txt # PP-TSM\u5728Linux\u4e0a\u8fdb\u884cpython serving\u9884\u6d4b\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u2502 \u2514\u2500\u2500 train_ptq_infer_python.txt # PP-TSM\u5728Linux\u4e0a\u8fdb\u884c\u79bb\u7ebf\u91cf\u5316\u63a8\u7406\u6d4b\u8bd5\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u251c\u2500\u2500 PP-TSN/\n\u2502 \u2502 \u251c\u2500\u2500 train_infer_python.txt # PP-TSN\u5728Linux\u4e0a\u8fdb\u884cpython\u8bad\u7ec3\u9884\u6d4b\uff08\u57fa\u7840\u8bad\u7ec3\u9884\u6d4b\uff09\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u2502 \u251c\u2500\u2500 paddle2onnx_infer_python.txt # PP-TSN\u5728Linux\u4e0a\u8fdb\u884cPaddle2ONNX\u9884\u6d4b\uff08\u57fa\u7840\u8bad\u7ec3\u9884\u6d4b\uff09\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u2502 \u251c\u2500\u2500 serving_infer_cpp.txt # PP-TSN\u5728Linux\u4e0a\u8fdb\u884ccpp serving\u6d4b\u8bd5\u7684\u914d\u7f6e\u6587\u4ef6\n\u2502 \u2502 \u2514\u2500\u2500 train_amp_infer_python.txt # PP-TSN\u5728Linux\u4e0a\u8fdb\u884cpython\u8bad\u7ec3\u9884\u6d4b\uff08\u6df7\u5408\u7cbe\u5ea6\u8bad\u7ec3\u9884\u6d4b\uff09\u7684\u914d\u7f6e\u6587\u4ef6" + }, + { + "comment": 
"This code represents the directory structure of a PaddleVideo test_tipc project. It includes pre-stored prediction results for various models in the 'results' folder, which are used to compare and verify the precision of the actual predictions. The scripts 'prepare.sh', 'test_train_inference_python.sh', 'test_inference_cpp.sh', and 'compare_results.py' are provided for testing, training, inference using Python or C++, as well as comparing the results with pre-stored data to calculate precision errors. The 'docs' folder contains detailed documentation on TIPC features, while 'test_paddle2onnx.sh' is used to test Paddle to ONNX conversion and inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/README.md\":55-75", + "content": "\u2502 \u251c\u2500\u2500 ...\n\u2502 \u2514\u2500\u2500 ...\n\u251c\u2500\u2500 results/ # \u9884\u5148\u4fdd\u5b58\u7684\u9884\u6d4b\u7ed3\u679c\uff0c\u7528\u4e8e\u548c\u5b9e\u9645\u9884\u6d4b\u7ed3\u679c\u8fdb\u884c\u7cbe\u5ea6\u6bd4\u5bf9\n\u2502 \u251c\u2500\u2500 PP-TSM/\n\u2502 \u2502\t\u251c\u2500\u2500 python_ppvideo_PP-TSM_results_fp16.txt # \u9884\u5b58\u7684PP-TSM\u8bc6\u522b\u8bc6\u522b\u6a21\u578bpython\u9884\u6d4bfp16\u7cbe\u5ea6\u7684\u7ed3\u679c\n\u2502 \u2502\t\u2514\u2500\u2500 python_ppvideo_PP-TSM_results_fp32.txt # \u9884\u5b58\u7684PP-TSM\u8bc6\u522b\u8bc6\u522b\u6a21\u578bpython\u9884\u6d4bfp32\u7cbe\u5ea6\u7684\u7ed3\u679c\n\u2502 \u251c\u2500\u2500 PP-TSN/\n\u2502 \u2502\t\u251c\u2500\u2500 python_ppvideo_PP-TSN_results_fp32.txt # \u9884\u5b58\u7684PP-TSN\u8bc6\u522b\u8bc6\u522b\u6a21\u578bpython\u9884\u6d4bfp16\u7cbe\u5ea6\u7684\u7ed3\u679c\n\u2502 \u2502\t\u2514\u2500\u2500 python_ppvideo_PP-TSN_results_fp32.txt # \u9884\u5b58\u7684PP-TSN\u8bc6\u522b\u8bc6\u522b\u6a21\u578bpython\u9884\u6d4bfp32\u7cbe\u5ea6\u7684\u7ed3\u679c\n\u2502 \u251c\u2500\u2500 PP-TSN_CPP/\n\u2502 \u2502\t\u251c\u2500\u2500 python_ppvideo_PP-TSN_results_fp32.txt # \u9884\u5b58\u7684PP-TSN\u8bc6\u522b\u8bc6\u522b\u6a21\u578bC++\u9884\u6d4bfp16\u7cbe\u5ea6\u7684\u7ed3\u679c\n\u2502 \u2502\t\u2514\u2500\u2500 python_ppvideo_PP-TSN_results_fp32.txt # \u9884\u5b58\u7684PP-TSN\u8bc6\u522b\u8bc6\u522b\u6a21\u578bC++\u9884\u6d4bfp32\u7cbe\u5ea6\u7684\u7ed3\u679c\n\u2502 \u251c\u2500\u2500 ...\n\u2502 \u2514\u2500\u2500 ...\n\u251c\u2500\u2500 prepare.sh # \u5b8c\u6210test_*.sh\u8fd0\u884c\u6240\u9700\u8981\u7684\u6570\u636e\u548c\u6a21\u578b\u4e0b\u8f7d\n\u251c\u2500\u2500 docs/ # \u8be6\u7ec6\u7684TIPC\u5404\u79cd\u529f\u80fd\u6587\u6863\n\u251c\u2500\u2500 test_train_inference_python.sh # \u6d4b\u8bd5python\u8bad\u7ec3\u9884\u6d4b\u7684\u4e3b\u7a0b\u5e8f\n\u251c\u2500\u2500 test_inference_cpp.sh # \u6d4b\u8bd5C++\u9884\u6d4b\u7684\u4e3b\u7a0b\u5e8f\n\u251c\u2500\u2500 test_paddle2onnx.sh # \u6d4b\u8bd5paddle2onnx\u8f6c\u6362\u4e0e\u63a8\u7406\u7684\u4e3b\u7a0b\u5e8f\n\u251c\u2500\u2500 compare_results.py # \u7528\u4e8e\u5bf9\u6bd4log\u4e2d\u7684\u9884\u6d4b\u7ed3\u679c\u4e0eresults\u4e2d\u7684\u9884\u5b58\u7ed3\u679c\u7cbe\u5ea6\u8bef\u5dee\u662f\u5426\u5728\u9650\u5b9a\u8303\u56f4\u5185\n\u2514\u2500\u2500 README.md # \u4ecb\u7ecd\u6587\u6863" + }, + { + "comment": "The code provides an overview of the test process for PaddleVideo's TIPC. It requires running a prepare script and test_*.sh scripts, comparing log files, and specifying model names and parameters using configuration files. 
Testing a single feature takes only two commands, and changing configurations is as simple as replacing the configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/README.md\":76-111", + "content": "```\n### \u6d4b\u8bd5\u6d41\u7a0b\u6982\u8ff0\n\u4f7f\u7528\u672c\u5de5\u5177\uff0c\u53ef\u4ee5\u6d4b\u8bd5\u4e0d\u540c\u529f\u80fd\u7684\u652f\u6301\u60c5\u51b5\uff0c\u4ee5\u53ca\u9884\u6d4b\u7ed3\u679c\u662f\u5426\u5bf9\u9f50\uff0c\u6d4b\u8bd5\u6d41\u7a0b\u6982\u62ec\u5982\u4e0b\uff1a\n
\n1. \u8fd0\u884cprepare.sh\u51c6\u5907\u6d4b\u8bd5\u6240\u9700\u6570\u636e\u548c\u6a21\u578b\uff1b\n2. \u8fd0\u884c\u8981\u6d4b\u8bd5\u7684\u529f\u80fd\u5bf9\u5e94\u7684\u6d4b\u8bd5\u811a\u672c`test_*.sh`\uff0c\u4ea7\u51falog\uff0c\u7531log\u53ef\u4ee5\u770b\u5230\u4e0d\u540c\u914d\u7f6e\u662f\u5426\u8fd0\u884c\u6210\u529f\uff1b\n3. \u7528`compare_results.py`\u5bf9\u6bd4log\u4e2d\u7684\u9884\u6d4b\u7ed3\u679c\u548c\u9884\u5b58\u5728results\u76ee\u5f55\u4e0b\u7684\u7ed3\u679c\uff0c\u5224\u65ad\u9884\u6d4b\u7cbe\u5ea6\u662f\u5426\u7b26\u5408\u9884\u671f\uff08\u5728\u8bef\u5dee\u8303\u56f4\u5185\uff09\u3002\n\u6d4b\u8bd5\u5355\u9879\u529f\u80fd\u4ec5\u9700\u4e24\u884c\u547d\u4ee4\uff0c**\u5982\u9700\u6d4b\u8bd5\u4e0d\u540c\u6a21\u578b/\u529f\u80fd\uff0c\u66ff\u6362\u914d\u7f6e\u6587\u4ef6\u5373\u53ef**\uff0c\u547d\u4ee4\u683c\u5f0f\u5982\u4e0b\uff1a\n```shell\n# \u529f\u80fd\uff1a\u51c6\u5907\u6570\u636e\n# \u683c\u5f0f\uff1abash + \u8fd0\u884c\u811a\u672c + \u53c2\u65701: \u914d\u7f6e\u6587\u4ef6\u9009\u62e9 + \u53c2\u65702: \u6a21\u5f0f\u9009\u62e9\nbash test_tipc/prepare.sh configs/[model_name]/[params_file_name] [Mode]\n# \u529f\u80fd\uff1a\u8fd0\u884c\u6d4b\u8bd5\n# \u683c\u5f0f\uff1abash + \u8fd0\u884c\u811a\u672c + \u53c2\u65701: \u914d\u7f6e\u6587\u4ef6\u9009\u62e9 + \u53c2\u65702: \u6a21\u5f0f\u9009\u62e9\nbash test_tipc/test_train_inference_python.sh configs/[model_name]/[params_file_name] [Mode]\n```\n\u4f8b\u5982\uff0c\u6d4b\u8bd5\u57fa\u672c\u8bad\u7ec3\u9884\u6d4b\u529f\u80fd\u7684`lite_train_lite_infer`\u6a21\u5f0f\uff0c\u8fd0\u884c\uff1a\n```shell\n# \u51c6\u5907\u6570\u636e\nbash test_tipc/prepare.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'\n# \u8fd0\u884c\u6d4b\u8bd5\nbash test_tipc/test_train_inference_python.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer'\n```\n\u5173\u4e8e\u672c\u793a\u4f8b\u547d\u4ee4\u7684\u66f4\u591a\u4fe1\u606f\u53ef\u67e5\u770b[\u57fa\u7840\u8bad\u7ec3\u9884\u6d4b\u4f7f\u7528\u6587\u6863](./docs/test_train_inference_python.md)\u3002\n### \u914d\u7f6e\u6587\u4ef6\u547d\u540d\u89c4\u8303\n\u5728`configs`\u76ee\u5f55\u4e0b\u5b58\u653e\u6240\u6709\u6a21\u578b\u6d4b\u8bd5\u9700\u8981\u7528\u5230\u7684\u914d\u7f6e\u6587\u4ef6\uff0c\u914d\u7f6e\u6587\u4ef6\u7684\u547d\u540d\u9075\u5faa\u5982\u4e0b\u89c4\u8303\uff1a" + }, + { + "comment": "Code defines naming conventions for various training and inference configurations used by PaddleVideo, allowing users to easily identify the desired test scenario based on subdirectories and configuration file names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/README.md\":113-125", + "content": "1. \u57fa\u7840\u8bad\u7ec3\u9884\u6d4b\u914d\u7f6e\u7b80\u5355\u547d\u540d\u4e3a\uff1a`train_infer_python.txt`\uff0c\u8868\u793a**Linux\u73af\u5883\u4e0b\u5355\u673a\u3001\u4e0d\u4f7f\u7528\u6df7\u5408\u7cbe\u5ea6\u8bad\u7ec3+python\u9884\u6d4b**\uff0c\u5176\u5b8c\u6574\u547d\u540d\u5bf9\u5e94`train_linux_gpu_normal_normal_infer_python_linux_gpu_cpu.txt`\uff0c\u7531\u4e8e\u672c\u914d\u7f6e\u6587\u4ef6\u4f7f\u7528\u9891\u7387\u8f83\u9ad8\uff0c\u8fd9\u91cc\u8fdb\u884c\u4e86\u540d\u79f0\u7b80\u5316\u3002\n2. 
\u5176\u4ed6\u5e26\u8bad\u7ec3\u914d\u7f6e\u547d\u540d\u683c\u5f0f\u4e3a\uff1a`train_\u8bad\u7ec3\u786c\u4ef6\u73af\u5883(linux_gpu/linux_dcu/\u2026)_\u662f\u5426\u591a\u673a(fleet/normal)_\u662f\u5426\u6df7\u5408\u7cbe\u5ea6(amp/normal)_\u9884\u6d4b\u6a21\u5f0f(infer/lite/serving/js)_\u8bed\u8a00(cpp/python/java)_\u9884\u6d4b\u786c\u4ef6\u73af\u5883(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`\u3002\u5982\uff0clinux gpu\u4e0b\u591a\u673a\u591a\u5361+\u6df7\u5408\u7cbe\u5ea6\u94fe\u6761\u6d4b\u8bd5\u5bf9\u5e94\u914d\u7f6e `train_linux_gpu_fleet_amp_infer_python_linux_gpu_cpu.txt`\uff0clinux dcu\u4e0b\u57fa\u7840\u8bad\u7ec3\u9884\u6d4b\u5bf9\u5e94\u914d\u7f6e `train_linux_dcu_normal_normal_infer_python_linux_dcu.txt`\u3002\n3. \u4ec5\u9884\u6d4b\u7684\u914d\u7f6e\uff08\u5982serving\u3001lite\u7b49\uff09\u547d\u540d\u683c\u5f0f\uff1a`model_\u8bad\u7ec3\u786c\u4ef6\u73af\u5883(linux_gpu/linux_dcu/\u2026)_\u662f\u5426\u591a\u673a(fleet/normal)_\u662f\u5426\u6df7\u5408\u7cbe\u5ea6(amp/normal)_(infer/lite/serving/js)_\u8bed\u8a00(cpp/python/java)_\u9884\u6d4b\u786c\u4ef6\u73af\u5883(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`\uff0c\u5373\uff0c\u4e0e2\u76f8\u6bd4\uff0c\u4ec5\u7b2c\u4e00\u4e2a\u5b57\u6bb5\u4ecetrain\u6362\u4e3amodel\uff0c\u6d4b\u8bd5\u65f6\u6a21\u578b\u76f4\u63a5\u4e0b\u8f7d\u83b7\u53d6\uff0c\u8fd9\u91cc\u7684\u201c\u8bad\u7ec3\u786c\u4ef6\u73af\u5883\u201d\u8868\u793a\u6240\u6d4b\u8bd5\u7684\u6a21\u578b\u662f\u5728\u54ea\u79cd\u73af\u5883\u4e0b\u8bad\u7ec3\u5f97\u5230\u7684\u3002\n**\u6839\u636e\u4e0a\u8ff0\u547d\u540d\u89c4\u8303\uff0c\u53ef\u4ee5\u76f4\u63a5\u4ece\u5b50\u76ee\u5f55\u540d\u79f0\u548c\u914d\u7f6e\u6587\u4ef6\u540d\u627e\u5230\u9700\u8981\u6d4b\u8bd5\u7684\u573a\u666f\u548c\u529f\u80fd\u5bf9\u5e94\u7684\u914d\u7f6e\u6587\u4ef6\u3002**\n\n## 4. \u5f00\u59cb\u6d4b\u8bd5\n\u5404\u529f\u80fd\u6d4b\u8bd5\u4e2d\u6d89\u53ca\u6df7\u5408\u7cbe\u5ea6\u3001\u88c1\u526a\u3001\u91cf\u5316\u7b49\u8bad\u7ec3\u76f8\u5173\uff0c\u53camkldnn\u3001Tensorrt\u7b49\u591a\u79cd\u9884\u6d4b\u76f8\u5173\u53c2\u6570\u914d\u7f6e\uff0c\u8bf7\u70b9\u51fb\u4e0b\u65b9\u76f8\u5e94\u94fe\u63a5\u4e86\u89e3\u66f4\u591a\u7ec6\u8282\u548c\u4f7f\u7528\u6559\u7a0b\uff1a\n- [test_train_inference_python \u4f7f\u7528](docs/test_train_inference_python.md) \uff1a\u6d4b\u8bd5\u57fa\u4e8ePython\u7684\u6a21\u578b\u8bad\u7ec3\u3001\u8bc4\u4f30\u3001\u63a8\u7406\u7b49\u57fa\u672c\u529f\u80fd\u3002" + }, + { + "comment": "This code provides a brief overview of various test cases available for different functionalities within the PaddleVideo framework. 
The functionalities include testing Python-based mixed precision training, evaluation, and inference; C++-based model inference; converting models to ONNX format for inference; deploying models using Paddle Serving; offline quantized training and inference; and multi-machine multi-GPU training and inference using Python.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/README.md\":126-132", + "content": "- [test_amp_train_inference_python \u4f7f\u7528](docs/test_train_amp_inference_python.md) \uff1a\u6d4b\u8bd5\u57fa\u4e8ePython\u7684**\u6df7\u5408\u7cbe\u5ea6**\u6a21\u578b\u8bad\u7ec3\u3001\u8bc4\u4f30\u3001\u63a8\u7406\u7b49\u57fa\u672c\u529f\u80fd\u3002\n- [test_inference_cpp \u4f7f\u7528](docs/test_inference_cpp.md) \uff1a\u6d4b\u8bd5\u57fa\u4e8eC++\u7684\u6a21\u578b\u63a8\u7406\u529f\u80fd\u3002\n- [test_paddle2onnx \u4f7f\u7528](docs/test_paddle2onnx.md) \uff1a\u6d4b\u8bd5\u57fa\u4e8epython2onnx\u6a21\u578b\u7684\u63a8\u7406\u529f\u80fd\u3002\n- [test_serving_infer_python \u4f7f\u7528](docs/test_serving_infer_python.md) \uff1a\u6d4b\u8bd5\u57fa\u4e8ePaddle Serving\u7684\u670d\u52a1\u5316\u90e8\u7f72\u529f\u80fd\u3002\n- [test_serving_infer_cpp \u4f7f\u7528](docs/test_serving_infer_cpp.md) \uff1a\u6d4b\u8bd5\u57fa\u4e8eC++\u7684\u6a21\u578b\u63a8\u7406\u529f\u80fd\u3002\n- [test_ptq_inference_python \u4f7f\u7528](docs/test_train_ptq_inference_python.md) \uff1a\u6d4b\u8bd5\u79bb\u7ebf\u91cf\u5316\u8bad\u7ec3\u63a8\u7406\u529f\u80fd\u3002\n- [test_train_fleet_inference_python \u4f7f\u7528](./docs/test_train_fleet_inference_python.md)\uff1a\u6d4b\u8bd5\u57fa\u4e8ePython\u7684\u591a\u673a\u591a\u5361\u8bad\u7ec3\u4e0e\u63a8\u7406\u7b49\u57fa\u672c\u529f\u80fd" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b257cd96-eead-4082-bb8e-7698b6df2379.json b/docs/doc/b257cd96-eead-4082-bb8e-7698b6df2379.json new file mode 100644 index 000000000..1222a2a11 --- /dev/null +++ b/docs/doc/b257cd96-eead-4082-bb8e-7698b6df2379.json @@ -0,0 +1,35 @@ +{ + "summary": "The code initializes paths, defines train_info and val_info dictionaries, sets interval variables, processes video frames, draws rectangles around objects based on labels, saves images with label information, and writes data to train.json and val.json files after processing all data from given folders.", + "details": [ + { + "comment": "The code initializes necessary paths and directories for annotation, training, and validation image paths. It creates the required directories if they do not exist. The code defines train_info and val_info as dictionaries containing information about images, annotations, and categories. 
It sets the interval variable for file processing and then iterates through each directory in the given path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/get_image_label.py\":0-52", + "content": "import cv2\nimport os\nimport json\n# please change it to your path\npath = '/workspace/wangqingzhong/Anti_UAV'\nannotation_path = 'annotations'\ntrain_img_path = 'train_imgs'\nval_img_path = 'val_imgs'\nif not os.path.exists(annotation_path):\n os.makedirs(annotation_path)\nif not os.path.exists(train_img_path):\n os.makedirs(train_img_path)\nif not os.path.exists(val_img_path):\n os.makedirs(val_img_path)\ntrain_info = {\n 'images': [],\n 'type':\n 'instances',\n 'annotations': [],\n 'categories': [{\n \"supercategory\": \"none\",\n \"id\": 1,\n \"name\": \"drone\"\n }, {\n \"supercategory\": \"none\",\n \"id\": 2,\n \"name\": \"noise\"\n }]\n}\nval_info = {\n 'images': [],\n 'type':\n 'instances',\n 'annotations': [],\n 'categories': [{\n \"supercategory\": \"none\",\n \"id\": 1,\n \"name\": \"drone\"\n }, {\n \"supercategory\": \"none\",\n \"id\": 2,\n \"name\": \"noise\"\n }]\n}\n# you can change it\ninterval = 5\ndirs = os.listdir(path)\ntrain_img_id = 0\nval_img_id = 0\nfor d in dirs:" + }, + { + "comment": "This code reads an image file and its label from a specified path. It then processes each frame of the video, drawing a rectangle around the object in the frame based on the provided labels. If the object exists, category_id is set to 1; otherwise, it's set to 2. Each processed frame is saved as an image file with its corresponding label information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/get_image_label.py\":53-76", + "content": " if 'new' in d:\n video_file = os.path.join(path, d, 'IR.mp4')\n label_file = os.path.join(path, d, 'IR_label.json')\n labels = json.load(open(label_file, 'r'))\n exits = labels['exist']\n gt_bbox = labels['gt_rect']\n assert len(exits) == len(gt_bbox)\n videocap = cv2.VideoCapture(video_file)\n i = 0\n while True:\n success, frame = videocap.read()\n if success:\n if i % interval == 0:\n img_name = d + '_' + str(i) + '.jpg'\n cv2.imwrite(os.path.join(val_img_path, img_name), frame)\n height, width, depth = frame.shape\n x, y, w, h = gt_bbox[i]\n isexist = exits[i]\n if isexist:\n category_id = 1\n else:\n category_id = 2\n draw_frame = cv2.rectangle(frame, (x, y), (x + w, y + h),\n (0, 255, 0), 2)" + }, + { + "comment": "This code writes an image, creates image information (file name, height, width, and id), and annotation information (area, iscrowd, bbox coordinates, category_id, ignore, image_id, and id). 
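The interval-based frame sampling described above can be restated as a minimal standalone loop; the video path is a placeholder and `interval` mirrors the script's setting of 5:

```python
import cv2

interval = 5                       # keep every 5th frame, as in the script
cap = cv2.VideoCapture("IR.mp4")   # placeholder path
i = kept = 0
while True:
    success, frame = cap.read()
    if not success:
        break
    if i % interval == 0:
        kept += 1  # the real script writes the frame and records its bbox here
    i += 1
cap.release()
print(f"kept {kept} of {i} frames")
```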
It then appends the image and annotation information to the existing val_info data structure.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/get_image_label.py\":77-100", + "content": " img_name_draw = d + '_' + str(i) + 'draw.jpg'\n cv2.imwrite(os.path.join(val_img_path, img_name_draw),\n draw_frame)\n img_info = {\n 'file_name': img_name,\n 'height': float(height),\n 'width': float(width),\n 'id': val_img_id\n }\n ann_info = {\n 'area': float(w) * float(h),\n 'iscrowd': 0,\n 'bbox': [float(x),\n float(y),\n float(w),\n float(h)],\n 'category_id': category_id,\n 'ignore': 0,\n 'image_id': val_img_id,\n 'id': val_img_id + 1\n }\n val_info['images'].append(img_info)\n val_info['annotations'].append(ann_info)" + }, + { + "comment": "Code reads a video and its corresponding label file, then extracts frames based on labels and saves them. If the object exists in the frame, it is labeled as category_id 1, otherwise as category_id 2. The process continues until all frames have been processed or a \"finish\" message is encountered.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/get_image_label.py\":101-127", + "content": " val_img_id += 1\n i += 1\n else:\n print('finish {}'.format(d))\n break\n else:\n video_file = os.path.join(path, d, 'IR.mp4')\n label_file = os.path.join(path, d, 'IR_label.json')\n labels = json.load(open(label_file, 'r'))\n exits = labels['exist']\n gt_bbox = labels['gt_rect']\n assert len(exits) == len(gt_bbox)\n videocap = cv2.VideoCapture(video_file)\n i = 0\n while True:\n success, frame = videocap.read()\n if success:\n if i % interval == 0:\n img_name = d + '_' + str(i) + '.jpg'\n cv2.imwrite(os.path.join(train_img_path, img_name), frame)\n height, width, depth = frame.shape\n x, y, w, h = gt_bbox[i]\n isexist = exits[i]\n if isexist:\n category_id = 1\n else:\n category_id = 2" + }, + { + "comment": "This code draws a rectangle around the detected object in an image, saves the image with the drawn rectangle, and creates two dictionaries (image and annotation information) to be used for training purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/get_image_label.py\":128-150", + "content": " draw_frame = cv2.rectangle(frame, (x, y), (x + w, y + h),\n (0, 255, 0), 2)\n img_name_draw = d + '_' + str(i) + 'draw.jpg'\n cv2.imwrite(os.path.join(train_img_path, img_name_draw),\n draw_frame)\n img_info = {\n 'file_name': img_name,\n 'height': height,\n 'width': width,\n 'id': train_img_id\n }\n ann_info = {\n 'area': float(w) * float(h),\n 'iscrowd': 0,\n 'bbox': [float(x),\n float(y),\n float(w),\n float(h)],\n 'category_id': category_id,\n 'ignore': 0,\n 'image_id': train_img_id,\n 'id': train_img_id + 1" + }, + { + "comment": "Code writes image and annotation information to train.json and val.json files after processing all data from given folders, ending the loop when done.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Anti-UAV/get_image_label.py\":151-163", + "content": " }\n train_info['images'].append(img_info)\n train_info['annotations'].append(ann_info)\n train_img_id += 1\n i += 1\n else:\n print('finish {}'.format(d))\n break\nwith open('annotations/train.json', 'w') as f:\n json.dump(train_info, f)\nwith open('annotations/val.json', 'w') as f:\n json.dump(val_info, f)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b2ee5724-cdb5-4eef-86eb-e640c4d8d31e.json 
b/docs/doc/b2ee5724-cdb5-4eef-86eb-e640c4d8d31e.json new file mode 100644 index 000000000..2598fccca --- /dev/null +++ b/docs/doc/b2ee5724-cdb5-4eef-86eb-e640c4d8d31e.json @@ -0,0 +1,35 @@ +{ + "summary": "This code defines a class for bmn inferencing, initializes a PaddleVideo model using a config file, and detects basketball actions in videos through sliding window techniques. Results are stored and displayed along with inference time.", + "details": [ + { + "comment": "The code defines a class InferModel, which is used for bmn inferencing. It initializes the model using a configuration file and sets properties such as GPU memory, device ID, minimum prediction score threshold, and frame processing thread count.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py\":0-36", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import process_proposal\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"bmn infer\"\"\"\n def __init__(self, cfg, name='BMN'): \n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.nms_thread = cfg[name]['nms_thread']\n self.min_pred_score = cfg[name]['score_thread']\n self.min_frame_thread = cfg['COMMON']['fps']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true" + }, + { + "comment": "This code is for a basketball action detection model using PaddleVideo. It creates a predictor, defines input and output tensors, runs inference, and generates properties based on predictions for start and end times of an action.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py\":37-62", + "content": " config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n self.output3_tensor = self.predictor.get_output_handle(output_names[2])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n output3 = self.output3_tensor.copy_to_cpu()\n return output1, output2, output3\n def generate_props(self, pred_bmn, pred_start, pred_end, max_window=200, min_window=5):\n \"\"\"generate_props\"\"\"" + }, + { + "comment": "This code performs action detection by predicting start and end points, as well as the confidence score for a specific action within a video. It calculates the score_results based on valid start and end indices, taking into account the start and end masks. 
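The zero-copy Paddle Inference plumbing excerpted above can be condensed into one hedged sketch; the model/params file names and the input shape are placeholders, while the API calls themselves mirror the excerpt:

```python
import numpy as np
from paddle.inference import Config, create_predictor

config = Config("bmn.pdmodel", "bmn.pdiparams")  # placeholder file names
config.enable_memory_optim()
config.switch_use_feed_fetch_ops(False)          # enables the zero-copy handles
predictor = create_predictor(config)

input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

dummy = np.zeros((1, 400, 200), dtype="float32")  # shape is an assumption
input_handle.copy_from_cpu(dummy)
predictor.run()
print(output_handle.copy_to_cpu().shape)
```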
The boundary_choose function is used to choose the boundaries of the action from the given score list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py\":63-85", + "content": " video_len = min(pred_bmn.shape[-1], min(pred_start.shape[-1], pred_end.shape[-1]))\n pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :]\n start_mask = self.boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = self.boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_results = []\n for idx in range(min_window, max_window):\n for jdx in range(video_len):\n start_index = jdx\n end_index = start_index + idx\n if end_index < video_len and start_mask[start_index] == 1 and end_mask[end_index] == 1:\n xmin = start_index\n xmax = end_index\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]\n bmn_score = pred_bmn[idx, jdx]\n conf_score = xmin_score * xmax_score * bmn_score\n score_results.append([xmin, xmax, conf_score])\n return score_results\n def boundary_choose(self, score_list):" + }, + { + "comment": "This code defines two functions, \"boundary_choose\" and \"predict\". The \"boundary_choose\" function takes a score list as input and uses it to generate three different arrays for scoring in front, middle, and back positions. It then creates a mask for the highest peak by comparing these three score arrays. Finally, it returns a binary mask representing boundary locations. The \"predict\" function initializes an infer reader, iterates through data from this reader, processes inputs, and features information to generate feature_T and feature_N.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py\":86-110", + "content": " \"\"\"boundary_choose\"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[0] for items in data]\n winds = [items[1] for items in data]\n feat_info = [items[2] for items in data]\n feature_T = feat_info[0][0]\n feature_N = feat_info[0][1]\n inputs = np.array(inputs)" + }, + { + "comment": "The code is calculating the average of multiple model predictions for each sliding window and then dividing it by the total number of windows to get the final prediction. 
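The mask logic summarized here (keep positions that are local peaks or above half the maximum score) can be restated with plain NumPy on a made-up score sequence:

```python
import numpy as np

score_list = np.array([0.1, 0.9, 0.3, 0.2, 0.6, 0.2], dtype="float32")

mask_high = score_list > score_list.max() * 0.5           # above half the max
padded = np.concatenate(([0.0], score_list, [0.0]))        # zero-pad both ends
mask_peak = (padded[1:-1] > padded[:-2]) & (padded[1:-1] > padded[2:])  # local peaks
mask = (mask_high | mask_peak).astype("float32")

print(mask)  # [0. 1. 0. 0. 1. 0.]
```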
These predictions are used to generate proposals, which are further processed based on some parameters like minimum frame threshold, NMS thread, and minimum prediction score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py\":111-130", + "content": " pred_bmn, pred_sta, pred_end = self.infer(inputs)\n if infer_iter == 0:\n sum_pred_bmn = np.zeros((2, feature_N, feature_T))\n sum_pred_sta = np.zeros((feature_T, ))\n sum_pred_end = np.zeros((feature_T, ))\n sum_pred_cnt = np.zeros((feature_T, ))\n for idx, sub_wind in enumerate(winds):\n sum_pred_bmn[:, :, sub_wind[0]: sub_wind[1]] += pred_bmn[idx]\n sum_pred_sta[sub_wind[0]: sub_wind[1]] += pred_sta[idx]\n sum_pred_end[sub_wind[0]: sub_wind[1]] += pred_end[idx]\n sum_pred_cnt[sub_wind[0]: sub_wind[1]] += np.ones((sub_wind[1] - sub_wind[0], ))\n pred_bmn = sum_pred_bmn / sum_pred_cnt\n pred_sta = sum_pred_sta / sum_pred_cnt\n pred_end = sum_pred_end / sum_pred_cnt\n score_result = self.generate_props(pred_bmn, pred_sta, pred_end)\n results = process_proposal(score_result, self.min_frame_thread, self.nms_thread, self.min_pred_score)" + }, + { + "comment": "The code defines a model for action detection, loads configuration file and video features from file paths, predicts the actions using the model, stores results in a dictionary, writes the result to 'results.json', and finally prints the time taken for inference in minutes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py\":132-154", + "content": " return results\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml' \n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n t0 = time.time()\n outputs = model.predict(cfg, video_features)\n t1 = time.time()\n results = {'proposal': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) \n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b39f53e1-fbb8-4712-b30c-f03e86d2006c.json b/docs/doc/b39f53e1-fbb8-4712-b30c-f03e86d2006c.json new file mode 100644 index 000000000..a91d14a93 --- /dev/null +++ b/docs/doc/b39f53e1-fbb8-4712-b30c-f03e86d2006c.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet contains the version information for PaddleVideo library. The version is set to \"0.0.1\" and it includes a copyright notice, license details, and specifies that this file should be used only in compliance with the Apache License, Version 2.0.", + "details": [ + { + "comment": "This code snippet contains the version information for PaddleVideo library. The version is set to \"0.0.1\" and it includes a copyright notice, license details, and specifies that this file should be used only in compliance with the Apache License, Version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/version.py\":0-17", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\n__all__ = [\"paddlevideo_version\"]\npaddlevideo_version = \"0.0.1\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b3fa42ae-a035-4215-ae3a-448d2f68ffca.json b/docs/doc/b3fa42ae-a035-4215-ae3a-448d2f68ffca.json new file mode 100644 index 000000000..06ecd3e7e --- /dev/null +++ b/docs/doc/b3fa42ae-a035-4215-ae3a-448d2f68ffca.json @@ -0,0 +1,30 @@ +{ + "summary": "Creates a data loader class with LRU caching, PaddlePaddle library, and MSRVTT dataset for efficient training data access. Supports refreshing, clearing caches, setting args, creating datasets, printing cache info, and storing dataloader in an instance variable.", + "details": [ + { + "comment": "This code is a function for loading datasets, using LRU caching and PaddlePaddle library. It takes parameters like `use_zeros_for_missing`, `eval_only`, `data_dir`, `text_agg`, `text_feat`, `split_name`, `dataset_name`, and `cls_partition`. It imports MSRVTT dataset for loading specific datasets.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/data_loaders.py\":0-35", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport logging\nimport functools\nfrom pathlib import Path\nfrom typing import Dict, List\nfrom typeguard import typechecked\nfrom zsvision.zs_utils import memcache\nfrom data_loader.MSRVTT_dataset import MSRVTT\nfrom utils import HashableDict, HashableOrderedDict\n@functools.lru_cache(maxsize=64, typed=False)\ndef dataset_loader(\n use_zeros_for_missing: bool,\n eval_only: bool,\n data_dir: str,\n text_agg: str,\n text_feat: str,\n split_name: str,\n dataset_name: str,\n cls_partition: str," + }, + { + "comment": "Function `create_dataset` takes parameters to create an instance of a specific dataset class (MSRVTT in this case) with specified options. 
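One detail behind this loader: functools.lru_cache can only memoize calls whose arguments are hashable, which is why the surrounding code wraps plain dicts in HashableDict / HashableOrderedDict before invoking the cached dataset_loader. The sketch below illustrates that pattern with a simplified, hypothetical FrozenDict wrapper rather than the project's own utility classes.

```python
import functools

class FrozenDict(dict):
    """Minimal hashable dict, for illustration only (the repo uses its own HashableDict)."""
    def __hash__(self):
        return hash(frozenset(self.items()))

@functools.lru_cache(maxsize=64)
def load_split(split_name: str, options: FrozenDict):
    print(f"building split '{split_name}' (expensive work would happen here)")
    return {"split": split_name, **options}

opts = FrozenDict(text_feat="w2v", max_tokens=30)
load_split("train", opts)            # cache miss: the message is printed
load_split("train", opts)            # cache hit: nothing printed, cached result is reused
print(load_split.cache_info())       # hits=1, misses=1, ...
```

The cache_clear() and cache_info() calls seen in the quoted constructor are the standard functools.lru_cache management hooks.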
The function returns the created dataset object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/data_loaders.py\":36-68", + "content": " root_feat_folder: str,\n text_dim: int,\n num_test_captions: int,\n restrict_train_captions: int,\n logger: logging.Logger,\n max_tokens: Dict[str, int],\n raw_input_dims: HashableOrderedDict,\n feat_aggregation: HashableDict,\n):\n print(f\"refreshing cache for {dataset_name} data loader [{split_name}]\")\n kwargs = dict(\n data_dir=Path(data_dir),\n text_dim=text_dim,\n logger=logger,\n eval_only=eval_only,\n text_agg=text_agg,\n text_feat=text_feat,\n max_tokens=max_tokens,\n split_name=split_name,\n cls_partition=cls_partition,\n raw_input_dims=raw_input_dims,\n root_feat_folder=root_feat_folder,\n feat_aggregation=feat_aggregation,\n num_test_captions=num_test_captions,\n use_zeros_for_missing=use_zeros_for_missing,\n restrict_train_captions=restrict_train_captions,\n )\n if dataset_name == \"MSRVTT\":\n dataset = MSRVTT(**kwargs)\n return dataset\nclass ExpertDataLoader:" + }, + { + "comment": "This code is a constructor for a data loader class that takes various parameters like eval_only, use_zeros_for_missing, text_dim, batch_size, etc. It initializes the object and ensures dictionaries are hashable to enable caching, and provides an optional refresh of dataloader and cuda cache.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/data_loaders.py\":70-100", + "content": " @typechecked\n def __init__(\n self,\n eval_only: bool,\n use_zeros_for_missing: bool,\n text_dim: int,\n batch_size: int,\n num_workers: int,\n num_test_captions: int,\n data_dir: str,\n text_agg: str,\n text_feat: str,\n split_name: str,\n dataset_name: str,\n root_feat_folder: str,\n max_tokens: Dict[str, int],\n raw_input_dims: Dict[str, int],\n feat_aggregation: Dict[str, Dict],\n logger: logging.Logger,\n restrict_train_captions: int = 0,\n drop_last: bool = False,\n refresh_lru_cache: bool = False,\n ):\n # Ensure that the dictionaries are hashable to allow use of caching\n raw_input_dims = HashableOrderedDict(raw_input_dims)\n feat_aggregation = HashableDict(feat_aggregation)\n max_tokens = HashableDict(max_tokens)\n if refresh_lru_cache:\n logger.info(\"Explicitly refreshing dataloader and cuda cache\")" + }, + { + "comment": "This code clears dataset and memory caches, sets common arguments for a specific dataset loader function, creates the dataset with these args, prints cache information, and stores the created dataloader in an instance variable.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/data_loaders.py\":101-126", + "content": " dataset_loader.cache_clear()\n memcache.cache_clear()\n common_kwargs = dict(\n logger=logger,\n data_dir=data_dir,\n text_dim=text_dim,\n text_agg=text_agg,\n eval_only=eval_only,\n text_feat=text_feat,\n max_tokens=max_tokens,\n dataset_name=dataset_name,\n split_name=split_name,\n root_feat_folder=root_feat_folder,\n use_zeros_for_missing=use_zeros_for_missing,\n num_test_captions=num_test_captions,\n raw_input_dims=raw_input_dims,\n feat_aggregation=feat_aggregation,\n restrict_train_captions=restrict_train_captions,\n )\n dataset = dataset_loader(cls_partition=\"train\", **common_kwargs)\n x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter\n logger.info(f\"cache info {x}\")\n self.dataloaders = {\"dataset\": dataset}\n self.dataloaders[\"retrieval\"] = 
dataset.get_retrieval_data()" + }, + { + "comment": "This function creates a DataLoader for training data with specified parameters and stores it in the self.dataloaders dictionary. It also logs the number of workers used and sets num_test_captions and dataset_name variables. The __getitem__ method returns the dataloader based on the provided key from the self.dataloaders dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/data_loaders.py\":128-144", + "content": " if not eval_only:\n train_loader = paddle.io.DataLoader(\n dataset=dataset,\n batch_size=batch_size,\n num_workers=num_workers,\n collate_fn=dataset.collate_data,\n drop_last=drop_last,\n shuffle=True,\n )\n self.dataloaders[\"train\"] = train_loader\n logger.info(f\"Loading data loaders with {num_workers} workers\")\n self.num_test_captions = num_test_captions\n self.dataset_name = dataset_name\n def __getitem__(self, key):\n return self.dataloaders[key]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b408b5dd-77b5-4ca8-91d9-e5d409f6e20b.json b/docs/doc/b408b5dd-77b5-4ca8-91d9-e5d409f6e20b.json new file mode 100644 index 000000000..98fb71185 --- /dev/null +++ b/docs/doc/b408b5dd-77b5-4ca8-91d9-e5d409f6e20b.json @@ -0,0 +1,30 @@ +{ + "summary": "This code snippet calculates mean average precision for video tagging in the Youtube-8m dataset using numpy, with functions for accumulation and processing. It allows for clearing the calculator, checking if empty, and retrieving non-interpolated average precision at n for each class.", + "details": [ + { + "comment": "This code calculates the mean average precision for an entire list or top-n ranked items in a video tag application. It imports numpy and provides functions to accumulate, process parts of the ranked list, and finally calculate the mean average precision using peek_map_at_n function. The example usage demonstrates how to use this code with a sample array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py\":0-26", + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate the mean average precision.\nIt provides an interface for calculating mean average precision\nfor an entire list or the top-n ranked items.\nExample usages:\nWe first call the function accumulate many times to process parts of the ranked\nlist. After processing all the parts, we call peek_map_at_n\nto calculate the mean average precision.\n```\nimport random\np = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])" + }, + { + "comment": "The code initializes a numpy array 'a' with random values (0 or 1) for each of the 50 classes. It then creates an instance of MeanAveragePrecisionCalculator class and accumulates the predictions and actuals using the 'accumulate' method. 
Finally, it retrieves the average precision at different recall levels using the 'peek_map_at_n' method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py\":27-57", + "content": "a = np.array([[random.choice([0, 1]) for _ in xrange(50)]\n for _ in xrange(1000)])\n# mean average precision for 50 classes.\ncalculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(\n num_class=50)\ncalculator.accumulate(p, a)\naps = calculator.peek_map_at_n()\n```\n\"\"\"\nimport numpy\nfrom . import average_precision_calculator\nclass MeanAveragePrecisionCalculator(object):\n \"\"\"This class is to calculate mean average precision.\n \"\"\"\n def __init__(self, num_class):\n \"\"\"Construct a calculator to calculate the (macro) average precision.\n Args:\n num_class: A positive Integer specifying the number of classes.\n top_n_array: A list of positive integers specifying the top n for each\n class. The top n in each class will be used to calculate its average\n precision at n.\n The size of the array must be num_class.\n Raises:\n ValueError: An error occurred when num_class is not a positive integer;\n or the top_n_array is not a list of positive integers." + }, + { + "comment": "This code initializes an instance of AveragePrecisionCalculator with a specified number of classes. It appends an instance of AveragePrecisionCalculator to the class member _ap_calculators for each class. The accumulate method takes predictions and actuals as arguments, accumulates prediction scores and ground truth labels, treats any value greater than 0 as positives and negatives otherwise, and optionally takes num_positives as an argument if provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py\":58-78", + "content": " \"\"\"\n if not isinstance(num_class, int) or num_class <= 1:\n raise ValueError(\"num_class must be a positive integer.\")\n self._ap_calculators = [] # member of AveragePrecisionCalculator\n self._num_class = num_class # total number of classes\n for i in range(num_class):\n self._ap_calculators.append(\n average_precision_calculator.AveragePrecisionCalculator())\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n Args:\n predictions: A list of lists storing the prediction scores. The outer\n dimension corresponds to classes.\n actuals: A list of lists storing the ground truth labels. The dimensions\n should correspond to the predictions input. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives: If provided, it is a list of numbers representing the\n number of true positives for each class. 
If not provided, the number of" + }, + { + "comment": "This code initializes a mean average precision calculator, handles accumulating predictions and actuals, allows for clearing the calculator, checks if it's empty, and retrieves non-interpolated average precision at n for each class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py\":79-110", + "content": " true positives will be inferred from the 'actuals' array.\n Raises:\n ValueError: An error occurred when the shape of predictions and actuals\n does not match.\n \"\"\"\n if not num_positives:\n num_positives = [None for i in predictions.shape[1]]\n calculators = self._ap_calculators\n for i in range(len(predictions)):\n calculators[i].accumulate(predictions[i], actuals[i],\n num_positives[i])\n def clear(self):\n for calculator in self._ap_calculators:\n calculator.clear()\n def is_empty(self):\n return ([calculator.heap_size for calculator in self._ap_calculators\n ] == [0 for _ in range(self._num_class)])\n def peek_map_at_n(self):\n \"\"\"Peek the non-interpolated mean average precision at n.\n Returns:\n An array of non-interpolated average precision at n (default 0) for each\n class.\n \"\"\"\n aps = [\n self._ap_calculators[i].peek_ap_at_n()\n for i in range(self._num_class)" + }, + { + "comment": "This code snippet calculates mean average precision for video tagging in the Youtube-8m dataset. It returns a list of average precisions (aps) after processing each chunk of data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py\":111-112", + "content": " ]\n return aps" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b4c114ef-374a-44b4-ad67-bac255580cab.json b/docs/doc/b4c114ef-374a-44b4-ad67-bac255580cab.json new file mode 100644 index 000000000..b9e3859c3 --- /dev/null +++ b/docs/doc/b4c114ef-374a-44b4-ad67-bac255580cab.json @@ -0,0 +1,30 @@ +{ + "summary": "The code defines a BaseHead abstract class for PaddleVideo, introduces a VideoQualityAssessment model with forward function, loss calculation, and accuracy metrics. It also contains functions for label smooth loss and top1/top5 accuracy calculations. An unimplemented function needs to be added based on the comments in the codebase.", + "details": [ + { + "comment": "This code snippet is a part of PaddleVideo library. It imports necessary libraries and defines an abstract base class \"BaseHead\" for video head parts. This class should be subclassed by all heads, which must override the \"init_weights\" method for initializing weights. The class also utilizes logger from paddlevideo to log information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py\":0-35", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..builder import build_loss\nfrom paddlevideo.utils import get_logger, get_dist_info\nlogger = get_logger(\"paddlevideo\")\nclass BaseHead(nn.Layer):\n \"\"\"Base class for head part.\n All head should subclass it.\n All subclass should overwrite:\n - Methods: ```init_weights```, initializing weights." + }, + { + "comment": "This code is defining a base class for a head network in PaddleVideo. It has an initializer that takes the number of classes, input channels, loss configuration, and label smoothing epsilon as arguments. The loss function and other parameters are initialized inside the constructor. It also requires the implementation of an abstract method \"init_weights\" for parameter initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py\":36-66", + "content": " - Methods: ```forward```, forward function.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channels in input feature.\n loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss').\n ls_eps (float): label smoothing epsilon. Default: 0. .\n \"\"\"\n def __init__(\n self,\n num_classes,\n in_channels,\n loss_cfg=dict(\n name=\"CrossEntropyLoss\"\n ), #TODO(shipping): only pass a name or standard build cfg format.\n #multi_class=False, NOTE(shipping): not supported now.\n ls_eps=0.):\n super().__init__()\n self.num_classes = num_classes\n self.in_channels = in_channels\n self.loss_func = build_loss(loss_cfg)\n #self.multi_class = multi_class NOTE(shipping): not supported now\n self.ls_eps = ls_eps\n @abstractmethod\n def init_weights(self):\n \"\"\"Initiate the parameters.\n \"\"\"\n raise NotImplementedError" + }, + { + "comment": "This code defines a base head for the VideoQualityAssessment model. It includes a forward function that is expected to be overridden by subclasses, and a loss function that calculates the loss based on model output (scores) and target (labels). 
The loss function also returns top1 and top5 accuracy if not in validation mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py\":68-95", + "content": " @abstractmethod\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n raise NotImplementedError\n def loss(self, scores, labels, valid_mode=False, **kwargs):\n \"\"\"Calculate the loss accroding to the model output ```scores```,\n and the target ```labels```.\n Args:\n scores (paddle.Tensor): The output of the model.\n labels (paddle.Tensor): The target output of the model.\n Returns:\n losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional).\n \"\"\"\n if len(labels) == 1: #commonly case\n labels = labels[0]\n losses = dict()\n if self.ls_eps != 0. and not valid_mode: # label_smooth\n loss = self.label_smooth_loss(scores, labels, **kwargs)\n else:\n loss = self.loss_func(scores, labels, **kwargs)\n top1, top5 = self.get_acc(scores, labels, valid_mode)\n losses['top1'] = top1" + }, + { + "comment": "This code segment handles mixed-up labels where there are 3 labels (labels_a, labels_b, and lam). It calculates the loss and accuracy for both label sets with or without label smoothing. The final result is stored in a dictionary including top1, top5, and total loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py\":96-119", + "content": " losses['top5'] = top5\n losses['loss'] = loss\n return losses\n elif len(labels) == 3: # mix_up\n labels_a, labels_b, lam = labels\n lam = lam[0] # get lam value\n losses = dict()\n if self.ls_eps != 0:\n loss_a = self.label_smooth_loss(scores, labels_a, **kwargs)\n loss_b = self.label_smooth_loss(scores, labels_b, **kwargs)\n else:\n loss_a = self.loss_func(scores, labels_a, **kwargs)\n loss_b = self.loss_func(scores, labels_a, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n top1a, top5a = self.get_acc(scores, labels_a, valid_mode)\n top1b, top5b = self.get_acc(scores, labels_b, valid_mode)\n top1 = lam * top1a + (1 - lam) * top1b\n top5 = lam * top5a + (1 - lam) * top5b\n losses['top1'] = top1\n losses['top5'] = top5\n losses['loss'] = loss\n return losses\n else:" + }, + { + "comment": "This code contains three functions: \"label_smooth_loss\", \"get_acc\", and an unimplemented function. The \"label_smooth_loss\" calculates the label smooth loss using one-hot encoding, label smoothing, and applies a soft loss function with a specified epsilon value. It also handles the loss calculation for cases where soft labels are needed. The \"get_acc\" function calculates both top1 and top5 accuracy values from input scores and labels. It can handle multi-card validation by performing all-reduce when validating on multiple cards. 
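Both loss variants described in these comments reduce to short formulas: label smoothing redistributes ls_eps of the probability mass uniformly over the classes, and mix-up blends the two per-label losses with weight lam. The numpy sketch below illustrates both outside of Paddle; the class count, ls_eps and lam are example numbers, not values taken from any config.

```python
import numpy as np

num_classes, ls_eps, lam = 5, 0.1, 0.7

# Label smoothing: spread ls_eps of the probability mass over all classes.
label = 2
one_hot = np.eye(num_classes)[label]
smoothed = one_hot * (1.0 - ls_eps) + ls_eps / num_classes
print(smoothed)                 # [0.02, 0.02, 0.92, 0.02, 0.02]

# Mix-up: the total loss is a lam-weighted blend of the two per-label losses.
loss_a, loss_b = 0.8, 1.4       # hypothetical cross-entropy values for labels_a / labels_b
loss = lam * loss_a + (1 - lam) * loss_b
print(loss)                     # 0.98
```

Note that in the quoted non-smoothing branch, loss_b is computed from labels_a rather than labels_b, which looks inconsistent with the mix-up formula above.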
Finally, there is an unimplemented function that should be implemented based on the comments in the codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py\":120-142", + "content": " raise NotImplementedError\n def label_smooth_loss(self, scores, labels, **kwargs):\n \"\"\"label smooth loss\"\"\"\n labels = F.one_hot(labels, self.num_classes)\n labels = F.label_smooth(labels, epsilon=self.ls_eps)\n labels = paddle.squeeze(labels, axis=1)\n loss = self.loss_func(scores, labels, soft_label=True, **kwargs)\n return loss\n def get_acc(self, scores, labels, valid_mode):\n \"\"\"get acc\"\"\"\n top1 = paddle.metric.accuracy(input=scores, label=labels, k=1)\n top5 = paddle.metric.accuracy(input=scores, label=labels, k=5)\n _, world_size = get_dist_info()\n #NOTE(shipping): deal with multi cards validate\n if world_size > 1 and valid_mode: #reduce sum when valid\n top1 = paddle.distributed.all_reduce(\n top1, op=paddle.distributed.ReduceOp.SUM) / world_size\n top5 = paddle.distributed.all_reduce(\n top5, op=paddle.distributed.ReduceOp.SUM) / world_size\n return top1, top5" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b53b1bbf-1f62-413b-a59b-6c457320f2e3.json b/docs/doc/b53b1bbf-1f62-413b-a59b-6c457320f2e3.json new file mode 100644 index 000000000..59fabd3d1 --- /dev/null +++ b/docs/doc/b53b1bbf-1f62-413b-a59b-6c457320f2e3.json @@ -0,0 +1,30 @@ +{ + "summary": "CTR-GCN is a bone-based behavior recognition model using graph convolution, trained and tested with PaddlePaddle framework on the NTU-RGB+D dataset. The code snippet represents top-1 action classification with 99.9988% accuracy.", + "details": [ + { + "comment": "CTR-GCN is a bone-based behavior recognition model using graph convolution on human bone data. It improves accuracy for the task by extracting spatio-temporal features with spatio-temporal graph convolution. Train CTR-GCN on NTU-RGBD data with single GPU and joint modality.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/ctrgcn.md\":0-38", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/ctrgcn.md) | English\n# CTR-GCN\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\n[CTRGCN](https://github.com/Uason-Chen/CTR-GCN.git) is a bone based behavior recognition model proposed by iccv 2021. By applying the changes to the graph convolution of human bone data with topological structure, and using spatio-temporal graph convolution to extract spatio-temporal features for behavior recognition, the accuracy of bone based behavior recognition task is greatly improved.\n
\n## Data\nPlease refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md)\n## Train\n### Train on NTU-RGBD\n- Train CTR-GCN on NTU-RGBD scripts using single gpu\uff1a\n```bash\n# joint modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml --seed 1" + }, + { + "comment": "This code snippet runs the CTR-GCN model on different modalities and datasets, performs training with multiple GPUs, and tests the trained models. It uses the PaddlePaddle framework and provides configurations for the NTU-RGB+D dataset, including joint, bone, and motion modalities. The code can be executed by providing the appropriate command line arguments to specify the model, dataset, and mode (train or test).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/ctrgcn.md\":40-73", + "content": "# bone modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml --seed 1\n# motion modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml --seed 1\n# bone motion modality\npython main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml --seed 1\n```\n- Train CTR-GCN on NTU-RGBD scriptsusing multi gpus:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_ctrgcn main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml\n```\n- config file `ctrgcn_ntucs_joint.yaml` corresponding to the config of CTR-GCN on NTU-RGB+D dataset with cross-subject splits.\n## Test\n### Test on NTU-RGB+D\n- Test scripts\uff1a\n```bash\n# joint modality\npython3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml -w data/CTRGCN_ntucs_joint.pdparams\n# bone modality\npython3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml -w data/CTRGCN_ntucs_bone.pdparams\n# motion modality\npython" + }, + { + "comment": "This code is executing Python scripts for the CTRGCN model, which utilizes config files (-c) and pre-trained weight paths (-w). The accuracy table showcases performance on NTU-RGB+D dataset across different modalities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/ctrgcn.md\":73-89", + "content": "3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml -w data/CTRGCN_ntucs_motion.pdparams\n# bone motion modality\npython3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml -w data/CTRGCN_ntucs_bone_motion.pdparams\n```\n- Specify the config file with `-c`, specify the weight path with `-w`.\nAccuracy on NTU-RGB+D dataset:\n| split | modality | Top-1 | checkpoints |\n| :----: | :----: | :----: | :----: |\n| cross-subject | joint | 89.93 | [CTRGCN_ntucs_joint.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_joint.pdparams) |\n| cross-subject | bone | 85.24 | [CTRGCN_ntucs_bone.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone.pdparams) |\n| cross-subject | motion | 85.33 | [CTRGCN_ntucs_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_motion.pdparams) |\n| cross-subject | bone motion | 84.53 | [CTRGCN_ntucs_bone_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone_motion.pdparams) |" + }, + { + "comment": "This code exports the inference model and performs inference using PaddleVideo's CTRGCN model for action recognition. 
The `export_model.py` script creates the architecture file (CTRGCN.pdmodel) and parameters file (CTRGCN.pdiparams). The `predict.py` script uses these files to perform inference on a given video file, specifying the configuration file for the CTRGCN model. It runs with GPU acceleration (use_gpu=True) and without TensorRT optimization (use_tensorrt=False).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/ctrgcn.md\":92-120", + "content": "## Inference\n### export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \\\n -p data/CTRGCN_ntucs_joint.pdparams \\\n -o inference/CTRGCN\n```\n To get model architecture file `CTRGCN.pdmodel` and parameters file `CTRGCN.pdiparams`, use:\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \\\n --config configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \\\n --model_file inference/CTRGCN_joint/CTRGCN_joint.pdmodel \\\n --params_file inference/CTRGCN_joint/CTRGCN_joint.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example_NTU-RGB-D_sketeton.npy" + }, + { + "comment": "The code snippet represents the top-1 class and its corresponding score in a model's prediction for skeleton-based action recognition. The top-1 class is 4, with a top-1 score of 0.999988317489624. This information can be used to identify the recognized action from multiple options.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/ctrgcn.md\":121-127", + "content": " top-1 class: 4\n top-1 score: 0.999988317489624\n```\n## Reference\n- [Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213), Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b542398e-9d06-4161-b8bd-fc07ee5e983d.json b/docs/doc/b542398e-9d06-4161-b8bd-fc07ee5e983d.json new file mode 100644 index 000000000..1dfc8c0b5 --- /dev/null +++ b/docs/doc/b542398e-9d06-4161-b8bd-fc07ee5e983d.json @@ -0,0 +1,25 @@ +{ + "summary": "The code defines functions for numpy array operations on bounding boxes, including area calculation and intersection-over-union scores useful in computer vision tasks. It computes pairwise IoU scores using numpy arrays by dividing the intersection by the second set's box areas.", + "details": [ + { + "comment": "The code defines functions for performing operations on numpy arrays of bounding boxes. It includes functionality to compute areas and intersection-over-union scores between pairs of boxes. The array holds N boxes and is expected to have shape [N, 4]. These operations are useful in computer vision tasks like object detection and tracking.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py\":0-28", + "content": "# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# ==============================================================================\n\"\"\"Operations for [N, 4] numpy arrays representing bounding boxes.\nExample box operations that are supported:\n * Areas: compute bounding box areas\n * IOU: pairwise intersection-over-union scores\n\"\"\"\nimport numpy as np\ndef area(boxes):\n \"\"\"Computes area of boxes.\n Args:\n boxes: Numpy array with shape [N, 4] holding N boxes" + }, + { + "comment": "Computes box areas by multiplying width and height (lines 30-34).\nCalculates pairwise intersection areas between boxes (lines 36-51).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py\":30-56", + "content": " Returns:\n a numpy array with shape [N*1] representing box areas\n \"\"\"\n return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])\ndef intersection(boxes1, boxes2):\n \"\"\"Compute pairwise intersection areas between boxes.\n Args:\n boxes1: a numpy array with shape [N, 4] holding N boxes\n boxes2: a numpy array with shape [M, 4] holding M boxes\n Returns:\n a numpy array with shape [N*M] representing pairwise intersection area\n \"\"\"\n [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)\n [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)\n all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))\n all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))\n intersect_heights = np.maximum(\n np.zeros(all_pairs_max_ymin.shape),\n all_pairs_min_ymax - all_pairs_max_ymin)\n all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))\n all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))\n intersect_widths = np.maximum(" + }, + { + "comment": "The code defines functions for computing pairwise intersection-over-union (iou) and intersection-over-area (ioa) between box collections. The iou function takes two numpy arrays of boxes, computes their intersection using the intersection function, calculates the union by adding the areas of both boxes and subtracting the intersection, and finally returns the pairwise iou scores. 
The ioa function also takes two numpy arrays of boxes, defines intersection-over-area as the intersection area divided by box2's area, and does not consider symmetry between box1 and box2.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py\":57-89", + "content": " np.zeros(all_pairs_max_xmin.shape),\n all_pairs_min_xmax - all_pairs_max_xmin)\n return intersect_heights * intersect_widths\ndef iou(boxes1, boxes2):\n \"\"\"Computes pairwise intersection-over-union between box collections.\n Args:\n boxes1: a numpy array with shape [N, 4] holding N boxes.\n boxes2: a numpy array with shape [M, 4] holding N boxes.\n Returns:\n a numpy array with shape [N, M] representing pairwise iou scores.\n \"\"\"\n intersect = intersection(boxes1, boxes2)\n area1 = area(boxes1)\n area2 = area(boxes2)\n union = (\n np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) -\n intersect)\n return intersect / union\ndef ioa(boxes1, boxes2):\n \"\"\"Computes pairwise intersection-over-area between box collections.\n Intersection-over-area (ioa) between two boxes box1 and box2 is defined as\n their intersection area over box2's area. Note that ioa is not symmetric,\n that is, IOA(box1, box2) != IOA(box2, box1).\n Args:\n boxes1: a numpy array with shape [N, 4] holding N boxes." + }, + { + "comment": "This code calculates pairwise Intersection over Union (IoU) scores between two sets of bounding boxes represented by numpy arrays. It first computes the intersection of the two sets, then calculates the area of each box in the second set, and finally divides the intersection by the areas to obtain the IoU scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py\":90-97", + "content": " boxes2: a numpy array with shape [M, 4] holding N boxes.\n Returns:\n a numpy array with shape [N, M] representing pairwise ioa scores.\n \"\"\"\n intersect = intersection(boxes1, boxes2)\n areas = np.expand_dims(area(boxes2), axis=0)\n return intersect / areas" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b59019a2-7952-46da-80c1-45bf72d06907.json b/docs/doc/b59019a2-7952-46da-80c1-45bf72d06907.json new file mode 100644 index 000000000..3eab45d0f --- /dev/null +++ b/docs/doc/b59019a2-7952-46da-80c1-45bf72d06907.json @@ -0,0 +1,15 @@ +{ + "summary": "This code is part of the PaddleVideo framework, which provides base classes for various model classes like 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator', 'DepthEstimator', and more. The list contains names of these available base classes within the module.", + "details": [ + { + "comment": "This code is part of the PaddleVideo framework, which provides several base classes for estimators, localizers, partitioners, recognizers, multimodal models, segments, and segmenters. The commented lines describe the license information and the imported classes from different modules within the framework. The '__all__' list contains the names of the base classes available in this module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/__init__.py\":0-23", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .estimators import BaseEstimator, DepthEstimator\nfrom .localizers import BaseLocalizer, BMNLocalizer\nfrom .partitioners import BasePartitioner, TransNetV2Partitioner\nfrom .recognizers import BaseRecognizer, Recognizer2D\nfrom .multimodal import ActBert, BaseMultimodal\nfrom .segment import BaseSegment, CFBI\nfrom .segmenters import MSTCN\n__all__ = [\n 'BaseRecognizer', 'Recognizer2D', 'BaseLocalizer', 'BMNLocalizer'," + }, + { + "comment": "This code snippet contains a list of various model classes, including 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator', 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI', and 'MSTCN'. These models are likely used for different tasks within the PaddleVideo framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/__init__.py\":24-27", + "content": " 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator',\n 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI',\n 'MSTCN'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b5f9fd57-a23f-449a-a23d-75e461cb6d96.json b/docs/doc/b5f9fd57-a23f-449a-a23d-75e461cb6d96.json new file mode 100644 index 000000000..fa034be74 --- /dev/null +++ b/docs/doc/b5f9fd57-a23f-449a-a23d-75e461cb6d96.json @@ -0,0 +1,20 @@ +{ + "summary": "This code provides a list of papers on action recognition and video classification, including TSN, SlowFast Networks, X3D, ECO, 3D ResNet, etc. 
The paper \"Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors\" presents an efficient method for recognizing actions from video sequences using deep convolutional descriptors and trajectory pooling.", + "details": [ + { + "comment": "This code contains a list of useful papers related to action recognition and video classification, including TSN, TSM, SlowFast Networks, Non-local Neural Networks, X3D, ECO, 3D ResNet, TPN, EvaNet, RepFlow, MARS, StNet, and Attention Cluster.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Action Recognition Papers\":0-15", + "content": "Useful Papers on Action Recognition and Video Classification.\nTSN: Temporal Segment Networks: Towards Good Practices for Deep Action Recognition, ECCV 2016\nTSM: Temporal Shift Module for Efficient Video Understanding, ICCV 2019\nSlowFast Networks for Video Recognition, ICCV 2019\nNon-local Neural Networks, CVPR 2018\nA Multigrid Method for Efficiently Training Video Models, CVPR2020\nX3D: Progressive Network Expansion for Efficient Video Recognition, CVPR2020\nECO: Efficient Convolutional Network for Online Video Understanding, ECCV 2018\n3D Resnet: Would Mega-scale Datasets Further Enhance Spatiotemporal 3D CNNs, CVPR 2018\nTPN: Temporal Pyramid Network for Action Recognition, CVPR 2020\nEvaNet: Evolving Space-Time Neural Architectures for Videos, ICCV 2019\nRepFlow: Representation Flow for Action Recognition, CVPR 2019\nMARS: Motion-Augmented RGB Stream for Action Recognition, CVPR 2019\nStNet: Local and Global Spatial-Temporal Modeling for Human Action Recognition, AAAI 2019\nAttention Cluster: Purely Attention Based Local Feature Integration for Video Classification" + }, + { + "comment": "This code contains references to various research papers in the field of action recognition and video classification, highlighting different models and architectures for these tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Action Recognition Papers\":16-27", + "content": "NeXtVLAD: An Efficient Neural Network to Aggregate Frame-level Features for Large-scale Video Classification\nC-TCN: Action localization Model by Baidu, the Champion model of ActivityNet 2018\nNeural Graph Matching Networks for Fewshot 3D Action Recognition - M. Guo et al., ECCV2018. \nTemporal 3D ConvNets using Temporal Transition Layer - A. Diba et al., CVPRW2018. \nTemporal 3D ConvNets: New Architecture and Transfer Learning for Video Classification - A. Diba et al., arXiv2017. \nAttentional Pooling for Action Recognition - R. Girdhar and D. Ramanan, NIPS2017. \nFully Context-Aware Video Prediction - Byeon et al, arXiv2017. \nHidden Two-Stream Convolutional Networks for Action Recognition - Y. Zhu et al, arXiv2017. \nDynamic Image Networks for Action Recognition - H. Bilen et al, CVPR2016. \nLong-term Recurrent Convolutional Networks for Visual Recognition and Description - J. Donahue et al, CVPR2015. \nDescribing Videos by Exploiting Temporal Structure - L. Yao et al, ICCV2015. \nReal-time Action Recognition with Enhanced Motion Vector CNNs - B. Zhang et al, CVPR2016. " + }, + { + "comment": "This code refers to a paper titled \"Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors\" published in CVPR 2015 by authors L. Wang et al. 
This paper presents a method for action recognition using deep convolutional descriptors and trajectory pooling, offering an efficient approach to analyze and recognize actions from video sequences.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Action Recognition Papers\":28-28", + "content": "Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors - L. Wang et al, CVPR2015. " + } + ] +} \ No newline at end of file diff --git a/docs/doc/b637999b-258b-4c27-8b5b-e39d939d9727.json b/docs/doc/b637999b-258b-4c27-8b5b-e39d939d9727.json new file mode 100644 index 000000000..9c4353b22 --- /dev/null +++ b/docs/doc/b637999b-258b-4c27-8b5b-e39d939d9727.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is a part of the PaddleVideo library and contains localizers, which are responsible for handling different types of video localization tasks. The base class \"BaseLocalizer\" serves as a parent class, while specific classes like \"BMNLocalizer\" and \"YOWOLocalizer\" extend it to handle various localization techniques. These localizers may be used in a wide range of applications depending on the required localization approach.", + "details": [ + { + "comment": "This code is a part of the PaddleVideo library and contains localizers, which are responsible for handling different types of video localization tasks. The base class \"BaseLocalizer\" serves as a parent class, while specific classes like \"BMNLocalizer\" and \"YOWOLocalizer\" extend it to handle various localization techniques. These localizers may be used in a wide range of applications depending on the required localization approach.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/__init__.py\":0-18", + "content": "# copyright (c) 2020 paddlepaddle authors. all rights reserved.\n#\n# licensed under the apache license, version 2.0 (the \"license\"\n# you may not use this file except in compliance with the license.\n# you may obtain a copy of the license at\n#\n# http://www.apache.org/licenses/license-2.0\n#\n# unless required by applicable law or agreed to in writing, software\n# distributed under the license is distributed on an \"as is\" basis,\n# without warranties or conditions of any kind, either express or implied.\n# see the license for the specific language governing permissions and\n# limitations under the license.\nfrom .base import BaseLocalizer\nfrom .bmn_localizer import BMNLocalizer\nfrom .yowo_localizer import YOWOLocalizer\n__all__ = ['BaseLocalizer', 'BMNLocalizer', 'YOWOLocalizer']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b67dd6ad-fced-42c9-84df-1df4809132dc.json b/docs/doc/b67dd6ad-fced-42c9-84df-1df4809132dc.json new file mode 100644 index 000000000..84d4161ee --- /dev/null +++ b/docs/doc/b67dd6ad-fced-42c9-84df-1df4809132dc.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports various functions and classes from different modules in the PaddleVideo library. The __all__ list specifies the exported public symbols including MultigridSchedule, get_norm, aggregate_sub_bn_stats, DistributedShortSampler, subn_save, subn_load, and is_eval_epoch.", + "details": [ + { + "comment": "This code imports various functions and classes from different modules in the PaddleVideo library. 
The __all__ list specifies the exported public symbols including MultigridSchedule, get_norm, aggregate_sub_bn_stats, DistributedShortSampler, subn_save, subn_load, and is_eval_epoch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/__init__.py\":0-9", + "content": "from .multigrid import MultigridSchedule\nfrom .batchnorm_helper import get_norm, aggregate_sub_bn_stats\nfrom .short_sampler import DistributedShortSampler\nfrom .save_load_helper import subn_save, subn_load\nfrom .interval_helper import is_eval_epoch\n__all__ = [\n 'MultigridSchedule', 'get_norm', 'aggregate_sub_bn_stats',\n 'DistributedShortSampler', 'subn_save', 'subn_load', 'is_eval_epoch'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b68f2b94-9a8b-4e54-b303-e2f1d8fec70a.json b/docs/doc/b68f2b94-9a8b-4e54-b303-e2f1d8fec70a.json new file mode 100644 index 000000000..666170efd --- /dev/null +++ b/docs/doc/b68f2b94-9a8b-4e54-b303-e2f1d8fec70a.json @@ -0,0 +1,50 @@ +{ + "summary": "The \"ConfigParser\" initializes argument parsers, handles slave mode, and sets directories. It manages config parsing, experiment settings, and data loaders with logging capabilities. The code includes two functions for accessing and modifying values in a nested object using a sequence of keys.", + "details": [ + { + "comment": "This code snippet is the beginning of a Python class, \"ConfigParser,\" which appears to be part of a larger project. The class takes in an \"args\" parameter (possibly command line arguments) and two optional parameters: \"options\" and \"timestamp\". It also has a boolean parameter named \"slave_mode\". The code imports various modules and defines the class but no specific functionality is provided in this section.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":0-34", + "content": "# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport time\nimport paddle\nimport pprint\nimport logging\nfrom typing import Dict\nfrom pathlib import Path\nfrom datetime import datetime\nfrom operator import getitem\nfrom functools import reduce\nfrom mergedeep import Strategy, merge\nfrom zsvision.zs_utils import set_nested_key_val\nfrom typeguard import typechecked\nfrom utils import read_json, write_json\nfrom logger import setup_logging\nclass ConfigParser:\n def __init__(self, args, options='', timestamp=True, slave_mode=False):" + }, + { + "comment": "The code initializes the argument parser, adds options to parse default and custom CLI options, handles slave mode (avoiding reinitializing logger), parses arguments, checks for a config file, and loads the configuration. 
If an evaluation config is specified, it validates the path to the evaluation file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":35-61", + "content": " # slave_mode - when calling the config parser form an existing process, we\n # avoid reinitialising the logger and ignore sys.argv when argparsing.\n # parse default and custom cli options\n for opt in options:\n args.add_argument(*opt.flags, default=None, type=opt.type)\n if slave_mode:\n args = args.parse_args(args=[])\n else:\n args = args.parse_args()\n if args.resume and not slave_mode:\n self.resume = Path(args.resume)\n else:\n msg_no_cfg = \"Config file must be specified\"\n assert args.config is not None, msg_no_cfg\n self.resume = None\n self.cfg_fname = Path(args.config)\n config = self.load_config(self.cfg_fname)\n self._config = _update_config(config, options, args)\n if self._config.get(\"eval_config\", False):\n # validate path to evaluation file\n eval_cfg_path = self._config.get(\"eval_config\")\n msg = f\"eval_config was specified, but `{eval_cfg_path}` does not exist\"" + }, + { + "comment": "The code sets the save directory for the trained model and logs based on whether \"tester\" or \"trainer\" is specified in the configuration. It also creates a timestamp to differentiate experiments, handles slave mode, sets the experiment name using given arguments, and if group_id and group_seed are provided, it generates subdirectories accordingly. Additionally, it checks if the user wants to purge previous experiments with the current config and removes them if true.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":62-87", + "content": " assert Path(self._config.get(\"eval_config\")).exists(), msg\n # set save_dir where trained model and log will be saved.\n if \"tester\" in self.config:\n save_dir = Path(self.config['tester']['save_dir'])\n else:\n save_dir = Path(self.config['trainer']['save_dir'])\n timestamp = datetime.now().strftime(r\"%Y-%m-%d_%H-%M-%S\") if timestamp else \"\"\n if slave_mode:\n timestamp = f\"{timestamp}-eval-worker\"\n exper_name = self.set_exper_name(args, config=config)\n if getattr(args, \"group_id\", False):\n subdir = Path(args.group_id) / f\"seed-{args.group_seed}\" / timestamp\n else:\n subdir = timestamp\n self._save_dir = save_dir / 'models' / exper_name / subdir\n self._log_dir = save_dir / 'log' / exper_name / subdir\n self._exper_name = exper_name\n self._args = args\n # if set, remove all previous experiments with the current config\n if vars(args).get(\"purge_exp_dir\", False):" + }, + { + "comment": "The code is purging directories from a specified directory and then recreates the save_dir and log_dir directories. 
It writes the updated config file to the checkpoint dir, sets up logging if not in slave mode, and assumes that config files are organized into directories with the name of the dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":88-111", + "content": " for dirpath in (self._save_dir, self._log_dir):\n config_dir = dirpath.parent\n existing = list(config_dir.glob(\"*\"))\n print(f\"purging {len(existing)} directories from config_dir...\")\n tic = time.time()\n os.system(f\"rm -rf {config_dir}\")\n print(f\"Finished purge in {time.time() - tic:.3f}s\")\n self.save_dir.mkdir(parents=True, exist_ok=True)\n self.log_dir.mkdir(parents=True, exist_ok=True)\n # save updated config file to the checkpoint dir\n write_json(self.config, self.save_dir / 'config.json')\n # configure logging module\n if not slave_mode:\n self.log_path = setup_logging(self.log_dir)\n self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}\n def set_exper_name(self, args, config):\n # We assume that the config files are organised into directories such that\n # each directory has the name of the dataset.\n dataset_name = self.cfg_fname.parent.stem" + }, + { + "comment": "This code block handles configuration parsing, custom arguments, and some optional settings. It sets the experiment name based on dataset and config file name, then processes custom arguments to set nested keys in the configuration dictionary. The code also checks for disabled data loader workers and restricts training to a single epoch if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":112-133", + "content": " exper_name = f\"{dataset_name}-{self.cfg_fname.stem}\"\n if args.custom_args:\n key_val_lists = args.custom_args.split(\"+\")\n for key_val_pair in key_val_lists:\n print(f\"parsing key-val pair : {key_val_pair}\")\n key, val = key_val_pair.split(\"@\")\n set_nested_key_val(key, val, self._config)\n # remove periods from key names\n key_ = key.replace(\"_.\", \"--\")\n # remove commas from value names\n val = val.replace(\",\", \"--\")\n custom_tag = \"-\".join(key_.split(\".\")[-2:])\n exper_name = f\"{exper_name}-{custom_tag}-{val}\"\n if getattr(args, \"disable_workers\", False):\n print(\"Disabling data loader workers....\")\n config[\"data_loader\"][\"args\"][\"num_workers\"] = 0\n if getattr(args, \"train_single_epoch\", False):\n print(\"Restricting training to a single epoch....\")\n config[\"trainer\"][\"epochs\"] = 1\n config[\"trainer\"][\"save_period\"] = 1" + }, + { + "comment": "This code snippet defines a function that loads and processes a configuration file, initializes an instance of a class with a specific name and arguments, and returns the initialized instance. 
The configuration loading process applies inheritance through a config hierarchy and allows skipping the first saves in training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":134-158", + "content": " config[\"trainer\"][\"skip_first_n_saves\"] = 0\n exper_name = f\"{exper_name}-train-single-epoch\"\n return exper_name\n @staticmethod\n @typechecked\n def load_config(cfg_fname: Path) -> Dict:\n config = read_json(cfg_fname)\n # apply inheritance through config hierarchy\n descendant, ancestors = config, []\n while \"inherit_from\" in descendant:\n parent_config = read_json(Path(descendant[\"inherit_from\"]))\n ancestors.append(parent_config)\n descendant = parent_config\n for ancestor in ancestors:\n merge(ancestor, config, strategy=Strategy.REPLACE)\n config = ancestor\n return config\n def init(self, name, module, *args, **kwargs):\n \"\"\"Finds a function handle with the name given as 'type' in config, and returns\n the instance initialized with corresponding keyword args given as 'args'.\n \"\"\"\n module_name = self[name]['type']\n module_args = dict(self[name]['args'])" + }, + { + "comment": "Function checks if any overwriting kwargs are present in the module_args and raises an error if so. It then updates the module_args with all kwargs, returns a function call using the updated args. The code also provides functionality to access, set, check containment, get keys of the config, and get values with default option.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":159-189", + "content": " msg = (f\"Fail for {module_name}\\n\"\n f\"overwriting kwargs given in config file is not allowed\\n\"\n f\"passed kwargs: {kwargs}\\n\"\n f\"for module_args: {module_args})\")\n assert all([k not in module_args for k in kwargs]), msg\n module_args.update(kwargs)\n return getattr(module, module_name)(*args, **module_args)\n def __getitem__(self, name):\n return self.config[name]\n def __len__(self):\n # NOTE: This is used for boolean checking deep inside ray.tune, so we required it\n # to be defined.\n return len(self.config)\n def __setitem__(self, name, value):\n self.config[name] = value\n def __contains__(self, name):\n return name in self.config\n def get(self, name, default):\n return self.config.get(name, default)\n def keys(self):\n return self.config.keys()\n def get_logger(self, name, verbosity=2):\n msg_verbosity = \"verbosity option {} is invalid. Valid options are {}.\"\n msg_verbosity = msg_verbosity.format(verbosity, self.log_levels.keys())" + }, + { + "comment": "This code snippet defines a class with properties for config, save_dir, and log_dir. It also has methods to iterate over items in the config dictionary and helper functions to update the config with custom CLI options. 
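The CLI-override helpers mentioned here reduce to reading and writing a nested dict by a sequence of keys with `functools.reduce` and `operator.getitem`. A minimal sketch of that technique, assuming a plain dict config:

```python
from functools import reduce
from operator import getitem

def get_by_path(tree, keys):
    """Walk a nested dict/list by a sequence of keys."""
    return reduce(getitem, keys, tree)

def set_by_path(tree, keys, value):
    """Set a leaf value in a nested dict by a sequence of keys."""
    get_by_path(tree, keys[:-1])[keys[-1]] = value

cfg = {"data_loader": {"args": {"num_workers": 4}}}
set_by_path(cfg, ["data_loader", "args", "num_workers"], 0)
assert get_by_path(cfg, ["data_loader", "args", "num_workers"]) == 0
```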
The logger is set based on the verbosity level provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":190-231", + "content": " assert verbosity in self.log_levels, msg_verbosity\n logger = logging.getLogger(name)\n logger.setLevel(self.log_levels[verbosity])\n return logger\n # setting read-only attributes\n @property\n def config(self):\n return self._config\n @property\n def save_dir(self):\n return self._save_dir\n @property\n def log_dir(self):\n return self._log_dir\n def __repr__(self):\n return pprint.PrettyPrinter().pformat(self.__dict__)\n def items(self):\n return self._config.items()\n# helper functions used to update config dict with custom cli options\ndef _update_config(config, options, args):\n for opt in options:\n value = getattr(args, _get_opt_name(opt.flags))\n if value is not None:\n _set_by_path(config, opt.target, value)\n return config\ndef _get_opt_name(flags):\n for flg in flags:\n if flg.startswith('--'):\n return flg.replace('--', '')\n return flags[0].replace('--', '')\ndef _set_by_path(tree, keys, value):" + }, + { + "comment": "This code snippet defines two functions, `_get_by_path` and `set_in_nested`, for accessing and modifying values in a nested object using a sequence of keys. The `_get_by_path` function uses the `reduce` function with `getitem` as the function argument to iterate through the keys and return the nested object's value, while `set_in_nested` sets a new value in a nested object by first accessing the nested object using the provided keys and then setting the final key's value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/parse_config.py\":232-238", + "content": " \"\"\"Set a value in a nested object in tree by sequence of keys.\"\"\"\n _get_by_path(tree, keys[:-1])[keys[-1]] = value\ndef _get_by_path(tree, keys):\n \"\"\"Access a nested object in tree by sequence of keys.\"\"\"\n return reduce(getitem, keys, tree)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b694d4e3-92ea-4eb1-85b6-86ac64c4464b.json b/docs/doc/b694d4e3-92ea-4eb1-85b6-86ac64c4464b.json new file mode 100644 index 000000000..d444b6bf5 --- /dev/null +++ b/docs/doc/b694d4e3-92ea-4eb1-85b6-86ac64c4464b.json @@ -0,0 +1,10 @@ +{ + "summary": "This code provides a link to the Chinese version of the modular design tutorial and the English version, allowing users to switch between languages based on their preference.", + "details": [ + { + "comment": "This code provides a link to the Chinese version of the modular design tutorial and the English version, allowing users to switch between languages based on their preference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/modular_design.md\":0-0", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/tutorials/modular_design.md) | English" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b69c8507-e3c3-4461-a3ec-789e44965073.json b/docs/doc/b69c8507-e3c3-4461-a3ec-789e44965073.json new file mode 100644 index 000000000..fbabe8797 --- /dev/null +++ b/docs/doc/b69c8507-e3c3-4461-a3ec-789e44965073.json @@ -0,0 +1,10 @@ +{ + "summary": "Code reads a JSON file, modifies its format, and writes it back as a new JSON file.", + "details": [ + { + "comment": "Code reads a JSON file, modifies its format, and writes it back as a new JSON file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/gts_format_transfer.py\":0-11", + "content": "import 
json\nwith open('/home/aistudio/work/BMN/Input_for_bmn/label_fixed.json') as f:\n data = json.load(f)\nf.close()\ntarget_format = {'taxonomy': None, 'database': data, 'version': None}\njsonString = json.dumps(target_format, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/work/BMN/Input_for_bmn/label_gts.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b723ccf4-e1a5-4405-9960-ff6582d1c5be.json b/docs/doc/b723ccf4-e1a5-4405-9960-ff6582d1c5be.json new file mode 100644 index 000000000..519ffa371 --- /dev/null +++ b/docs/doc/b723ccf4-e1a5-4405-9960-ff6582d1c5be.json @@ -0,0 +1,35 @@ +{ + "summary": "The Token Shift Transformer is a versatile video classification model utilizing vision transformer and Token Shift Module, trained on UCF-101 dataset with mixed-precision AMP acceleration, and achieves high accuracy with \"BrushingTeeth.avi\".", + "details": [ + { + "comment": "Token Shift Transformer is a video classification model using vision transformer, with a novel Token Shift Module for modeling temporal relations. It offers strong interpretability and flexibility, while being zero-parameter and zero-FLOPs. UCF-101 data preparation guide provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md\":0-35", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/tokenshift_transformer.md) | English\n# Token Shift Transformer\n## Content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nToken Shift Transformer is a video classification model based on vision transformer, which shares merits of strong interpretability, high discriminative power on hyper-scale data, and \ufb02exibility in processing varying length inputs. Token Shift Module is a novel, zero-parameter, zero-FLOPs operator, for modeling temporal relations within each transformer encoder.\n
\n\n
\n## Data\nUCF-101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md)\n## Train\n### UCF-101 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.c" + }, + { + "comment": "This code provides instructions on how to download a pre-trained model and modify a configuration file for training the TokenShift Transformer model on the UCF-101 dataset using PaddlePaddle. It also highlights the need for using mixed-precision training with AMP to accelerate the training process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md\":35-62", + "content": "om/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams\n ```\n2. Open `PaddleVideo/configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"TokenShiftVisionTransformer\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The UCF-101 data set uses 1 card for training, and the start command of the training method is as follows:\n```bash\n# videos data format\npython3 main.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234\n```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n```bash" + }, + { + "comment": "This code snippet is used to train a Token Shift Transformer model on the UCF101 dataset with a video size of 256. The model configuration file is tokShift_transformer_ucf101_256_videos.yaml, and the training is performed using automatic mixed precision (--amp flag). The model will be validated during training, and the best model's test accuracy can be found in the training log using the keyword \"best\". The test mode sampling method is uniform sampling, which differs from the dense sampling used in verification mode during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md\":63-77", + "content": "python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234\n```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.\n## Test\n- The Token Shift Transformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. 
The log example is as follows:\n ```\n Already save the best model (top1 acc)0.9201\n ```\n- Since the sampling method of the Token Shift Transformer model test mode is **uniform** sampling, which is different from the **dense** sampling used in the verification mode during the training process, so the verification index recorded in the training log" + }, + { + "comment": "This code describes a command for testing the best model after training is complete using the TokenShift VisionTransformer on the UCF-101 dataset. The test configuration parameters include backbone, sampling method, num_seg, and target_size to obtain Top-1 accuracy. The checkpoints are available in a shared Google Drive link. Uniform sampling divides timing equally into `num_seg`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md\":77-92", + "content": ", called `topk Acc `, does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n ```bash\n python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --test --seed=1234 -w 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams'\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of UCF-101 are as follows:\n | backbone | sampling method | num_seg | target_size | Top-1 | checkpoints |\n | :----------------: | :-----: | :-----: | :---------: | :----: | :----------------------------------------------------------: |\n | Vision Transformer | Uniform | 8 | 256 | 92.81 | [TokenShiftTransformer.pdparams](https://drive.google.com/drive/folders/1k_TpAqaJZYJE8C5g5pT9phdyk9DrY_XL?usp=sharing) |\n- Uniform sampling: Timing-wise, equal division into `num_seg" + }, + { + "comment": "This code provides instructions for exporting an inference model and using the prediction engine in PaddleVideo's TokenShift Vision Transformer. The first command exports the model structure file (TokenShiftVisionTransformer.pdmodel) and the model weight file (TokenShiftVisionTransformer.pdiparams). The second command uses these files to perform inference on a specific video file (e.g., 'data/BrushingTeeth.avi').", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md\":92-115", + "content": "` segments, 1 frame sampled at the middle of each segment; spatially, sampling at the center. 
1 video sampled 1 clip in total.\n## Inference\n### Export inference model\n```bash\npython3 tools/export_model.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -p 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams'\n```\nThe above command will generate the model structure file `TokenShiftVisionTransformer.pdmodel` and the model weight file `TokenShiftVisionTransformer.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../usage.md#2-infer)\n### Use prediction engine inference\n```bash\npython3 tools/predict.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -i 'data/BrushingTeeth.avi' --model_file ./inference/TokenShiftVisionTransformer.pdmodel --params_file ./inference/TokenShiftVisionTransformer.pdiparams\n```\nThe output example is as follows:\n```\nCurrent video file: data/BrushingTeeth.avi" + }, + { + "comment": "This code snippet is displaying the top-1 category prediction and confidence score for a given video file \"BrushingTeeth.avi\" using Token Shift Transformer model trained on UCF-101 dataset. The predicted top-1 category id is 19, and its corresponding category name is \"brushing_teeth\", with a high confidence of 0.99.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md\":116-124", + "content": "\ttop-1 class: 19\n\ttop-1 score: 0.9959074258804321\n```\nIt can be seen that using the Token Shift Transformer model trained on UCF-101 to predict `data/BrushingTeeth.avi`, the output top1 category id is `19`, and the confidence is 0.99. By consulting the category id and name correspondence table, it can be seen that the predicted category name is `brushing_teeth`.\n## Reference\n- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b74049ae-1829-471d-9215-cc98adbb882d.json b/docs/doc/b74049ae-1829-471d-9215-cc98adbb882d.json new file mode 100644 index 000000000..232741015 --- /dev/null +++ b/docs/doc/b74049ae-1829-471d-9215-cc98adbb882d.json @@ -0,0 +1,35 @@ +{ + "summary": "The YOWOLocalizer extends BaseLocalizer, performs NMS on detected boxes, matches ground truth with predicted boxes based on IoU threshold, and calculates precision, recall, and F-score for YOWO localizer using test step function.", + "details": [ + { + "comment": "Code from PaddleVideo's yowo_localizer.py file defines a YOWOLocalizer class which extends BaseLocalizer and utilizes the backbone function for forwarding image data. It also includes methods forward_net and train_step for processing images in training context.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import LOCALIZERS\nfrom .base import BaseLocalizer\nfrom .yowo_utils import truths_length, nms, get_region_boxes, bbox_iou\n@LOCALIZERS.register()\nclass YOWOLocalizer(BaseLocalizer):\n \"\"\"YOWO Localization framework\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Call backbone forward.\n \"\"\"\n # imgs.shape=[N,C,T,H,W], for YOWO\n preds = self.backbone(imgs)\n return preds\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n x_data = data_batch[0]" + }, + { + "comment": "Function \"forward_net\" is called to perform model's forward pass, and the output of this function call is stored in 'out'. The code then calls another function \"loss\" passing the output from the model (out) and the target data (target). The output from the loss function call is stored in 'loss', along with number of correct predictions ('nCorrect'). A dictionary named 'loss_metrics' is created storing 'loss' and 'nCorrect'. This process is part of training set step. \nThe 'val_step' function performs validation steps like calculating total, proposals, correct and fscore variables using specific values (total = 0.0, proposals = 0.0, correct = 0.0, fscore = 0.0). It also uses an epsilon value of 1e-5 and a nms_thresh and iou_thresh of 0.4 for certain calculations. It calls the model's forward pass (forward_net) to get 'out', then gets all region boxes using get_region_boxes function, then iterates over each box in out, storing them in a list named 'out_boxes'. This process is part of validating step.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py\":33-66", + "content": " target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor\n target.stop_gradient = True\n # call Model forward\n out = self.forward_net(x_data)\n # call Loss forward\n loss, nCorrect = self.loss(out, target)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['nCorrect'] = nCorrect\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n total = 0.0\n proposals = 0.0\n correct = 0.0\n fscore = 0.0\n eps = 1e-5\n nms_thresh = 0.4\n iou_thresh = 0.5\n x_data = data_batch[0]\n target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor\n frame_idx = data_batch[2]\n target.stop_gradient = True\n # call Model forward\n out = self.forward_net(x_data)\n all_boxes = get_region_boxes(out)\n out_boxes = []\n for i in range(out.shape[0]):\n boxes = all_boxes[i]" + }, + { + "comment": "This code performs Non-Maximum Suppression (NMS) on detected boxes, selects confident boxes for further processing, and associates ground truth boxes with predicted boxes based on Intersection over Union (IoU) threshold. It counts correct matches, proposals, total ground truth boxes, and calculates precision. 
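The matching loop summarised above (keep predictions over a confidence threshold, match each ground-truth box to its best-IoU prediction, then derive precision, recall and F-score with a small epsilon) can be illustrated outside the framework. The IoU helper and the thresholds below are simplified stand-ins, not the repository's `bbox_iou`:

```python
def iou_xywh(a, b):
    """IoU of two boxes given as (cx, cy, w, h); a simplified stand-in for bbox_iou."""
    ax1, ay1, ax2, ay2 = a[0]-a[2]/2, a[1]-a[3]/2, a[0]+a[2]/2, a[1]+a[3]/2
    bx1, by1, bx2, by2 = b[0]-b[2]/2, b[1]-b[3]/2, b[0]+b[2]/2, b[1]+b[3]/2
    iw = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    ih = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter = iw * ih
    union = a[2]*a[3] + b[2]*b[3] - inter
    return inter / union if union > 0 else 0.0

def frame_scores(gt_boxes, pred_boxes, conf_thresh=0.25, iou_thresh=0.5, eps=1e-5):
    """gt_boxes: [(x, y, w, h, cls)], pred_boxes: [(x, y, w, h, conf, cls)]."""
    confident = [p for p in pred_boxes if p[4] > conf_thresh]
    correct = 0
    for g in gt_boxes:
        # greedily pick the confident prediction with the highest IoU for this ground truth
        best = max(confident, key=lambda p: iou_xywh(g[:4], p[:4]), default=None)
        if best and iou_xywh(g[:4], best[:4]) > iou_thresh and int(best[5]) == int(g[4]):
            correct += 1
    precision = correct / (len(confident) + eps)
    recall = correct / (len(gt_boxes) + eps)
    fscore = 2 * precision * recall / (precision + recall + eps)
    return precision, recall, fscore

print(frame_scores([(0.5, 0.5, 0.2, 0.2, 3)], [(0.5, 0.5, 0.21, 0.2, 0.9, 3)]))
```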
The precision is calculated by dividing the number of correct matches by the sum of proposals and a small epsilon value to avoid division by zero.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py\":67-89", + "content": " boxes = nms(boxes, nms_thresh)\n out_boxes.append(boxes)\n truths = target[i].reshape([-1, 5])\n num_gts = truths_length(truths)\n total = total + num_gts\n pred_list = []\n for i in range(len(boxes)):\n if boxes[i][4] > 0.25:\n proposals = proposals + 1\n pred_list.append(i)\n for i in range(num_gts):\n box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]]\n best_iou = 0\n best_j = -1\n for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES\n iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False)\n if iou > best_iou:\n best_j = j\n best_iou = iou\n if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]:\n correct = correct + 1\n precision = 1.0 * correct / (proposals + eps)" + }, + { + "comment": "This code defines a localizer for the YOLOv3 model and calculates precision, recall, and F-score using test step function. It initializes variables, processes input data batch, applies non-maximum suppression (NMS) to regions of interest, and returns output metrics including precision, recall, F-score, and frame index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py\":90-124", + "content": " recall = 1.0 * correct / (total + eps)\n fscore = 2.0 * precision * recall / (precision + recall + eps)\n outs = dict()\n outs['precision'] = precision\n outs['recall'] = recall\n outs['fscore'] = fscore\n outs['frame_idx'] = frame_idx\n return outs\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n total = 0.0\n proposals = 0.0\n correct = 0.0\n fscore = 0.0\n eps = 1e-5\n nms_thresh = 0.4\n iou_thresh = 0.5\n x_data = data_batch[0]\n target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor\n frame_idx = data_batch[2]\n target.stop_gradient = True\n # call Model forward\n out = self.forward_net(x_data)\n all_boxes = get_region_boxes(out)\n out_boxes = []\n for i in range(out.shape[0]):\n boxes = all_boxes[i]\n boxes = nms(boxes, nms_thresh)\n out_boxes.append(boxes)\n truths = target[i].reshape([-1, 5])" + }, + { + "comment": "The code computes precision, recall, and F-score for each class of the YOWO localizer. It iterates through ground truth boxes and confident proposal boxes to match them based on Intersection over Union (IoU) threshold. 
It counts correctly matched boxes and total boxes, then calculates precision, recall, and F-score using these values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py\":125-146", + "content": " num_gts = truths_length(truths)\n total = total + num_gts\n pred_list = []\n for i in range(len(boxes)):\n if boxes[i][4] > 0.25:\n proposals = proposals + 1\n pred_list.append(i)\n for i in range(num_gts):\n box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]]\n best_iou = 0\n best_j = -1\n for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES\n iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False)\n if iou > best_iou:\n best_j = j\n best_iou = iou\n if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]:\n correct = correct + 1\n precision = 1.0 * correct / (proposals + eps)\n recall = 1.0 * correct / (total + eps)\n fscore = 2.0 * precision * recall / (precision + recall + eps)" + }, + { + "comment": "This code defines two functions within the yowo_localizer class. The first function, \"infer_step\", takes in a data batch and feeds it into the forward_net to get an output. The second function returns dictionaries for boxes, precision, recall, fscore, and frame_idx as outputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py\":148-160", + "content": " outs = dict()\n outs['boxes'] = out_boxes\n outs['precision'] = precision\n outs['recall'] = recall\n outs['fscore'] = fscore\n outs['frame_idx'] = frame_idx\n return outs\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n out = self.forward_net(data_batch[0])\n return out" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b78583dd-b911-4165-bd81-f7a00b2893c8.json b/docs/doc/b78583dd-b911-4165-bd81-f7a00b2893c8.json new file mode 100644 index 000000000..83ebf8710 --- /dev/null +++ b/docs/doc/b78583dd-b911-4165-bd81-f7a00b2893c8.json @@ -0,0 +1,20 @@ +{ + "summary": "The code initializes a metric class for PaddleVideo, handling batch updates and GPU data to mitigate resampling effects while managing output accumulation, concatenation, top-k accuracy, and logging.", + "details": [ + { + "comment": "This code registers a class called CenterCropMetric as a metric in the PaddleVideo library. It initializes the metric with data_size, batch_size, and log_interval parameters. The rest_data_size is also stored to keep track of remaining samples to be tested.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom typing import List\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom .base import BaseMetric\nfrom .registry import METRIC\nlogger = get_logger(\"paddlevideo\")\n@METRIC.register\nclass CenterCropMetric(BaseMetric):\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n \"\"\"prepare for metrics\n \"\"\"\n super().__init__(data_size, batch_size, log_interval, **kwargs)\n self.rest_data_size = data_size # Number of samples remaining to be tested\n self.all_outputs = []" + }, + { + "comment": "This code is initializing a metric object, allowing for batch updates, and handling data from multiple GPUs to avoid resampling effects when testing with multiple cards.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric.py\":31-54", + "content": " self.all_labels = []\n self.topk = kwargs.get(\"topk\", [1, 5])\n def update(self, batch_id: int, data: List, outputs: paddle.Tensor) -> None:\n \"\"\"update metrics during each iter\n Args:\n batch_id (int): iter id of current batch.\n data (List): list of batched data, such as [inputs, labels]\n outputs (paddle.Tensor): batched outputs from model\n \"\"\"\n labels = data[1]\n if self.world_size > 1:\n labels_gathered = self.gather_from_gpu(labels, concat_axis=0)\n outpus_gathered = self.gather_from_gpu(outputs, concat_axis=0)\n else:\n labels_gathered = labels\n outpus_gathered = outputs\n # Avoid resampling effects when testing with multiple cards\n labels_gathered = labels_gathered[0:min(len(labels_gathered), self.\n rest_data_size)]\n outpus_gathered = outpus_gathered[0:min(len(outpus_gathered), self.\n rest_data_size)]" + }, + { + "comment": "The code is part of a class that seems to be handling batch processing in a machine learning application. It accumulates and concatenates outputs and labels from multiple batches, performs top-k accuracy calculations, and logs the results. 
The log_interval variable controls when progress updates are displayed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/center_crop_metric.py\":55-78", + "content": " self.all_labels.append(labels_gathered)\n self.all_outputs.append(outpus_gathered)\n self.rest_data_size -= outpus_gathered.shape[0]\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{} ...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size)))\n def accumulate(self):\n \"\"\"accumulate, compute, and show metrics when finished all iters.\n \"\"\"\n self.all_outputs = paddle.concat(self.all_outputs, axis=0)\n self.all_labels = paddle.concat(self.all_labels, axis=0)\n result_str = []\n for _k in self.topk:\n topk_val = paddle.metric.accuracy(input=self.all_outputs,\n label=self.all_labels,\n k=_k).item()\n result_str.append(f\"avg_acc{_k}={topk_val}\")\n result_str = \", \".join(result_str)\n logger.info(f\"[TEST] finished, {result_str}\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b79c18a8-79c5-4932-9188-5acc155abc96.json b/docs/doc/b79c18a8-79c5-4932-9188-5acc155abc96.json new file mode 100644 index 000000000..6b7bbf7d5 --- /dev/null +++ b/docs/doc/b79c18a8-79c5-4932-9188-5acc155abc96.json @@ -0,0 +1,10 @@ +{ + "summary": "This script changes directory to \"data/ntu-rgb-d\" and downloads a zip file containing skeleton data for frames 1-17. It then unzips the file, deletes the original, and downloads another zip file named \"statistics.zip\". The script creates a new folder named \"statistics\", extracts its contents into it, and removes the downloaded zip file.", + "details": [ + { + "comment": "This script changes directory to \"data/ntu-rgb-d\" and downloads a zip file containing skeleton data for frames 1-17. It then unzips the file, deletes the original, and downloads another zip file named \"statistics.zip\". The script creates a new folder named \"statistics\", extracts its contents into it, and removes the downloaded zip file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/download_dataset.sh\":0-11", + "content": "cd data/ntu-rgb-d\n# download\nwget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1CUZnBtYwifVXS21yVg62T-vrPVayso5H' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id=1CUZnBtYwifVXS21yVg62T-vrPVayso5H\" -O nturgbd_skeletons_s001_to_s017.zip && rm -rf /tmp/cookies.txt\nunzip nturgbd_skeletons_s001_to_s017.zip && rm -rf nturgbd_skeletons_s001_to_s017.zip\nwget https://videotag.bj.bcebos.com/Data/statistics.zip\nmkdir statistics\nunzip statistics.zip -d statistics/ && rm -rf statistics.zip" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b87949ed-ea6c-4a52-a6a6-5f681a0797a0.json b/docs/doc/b87949ed-ea6c-4a52-a6a6-5f681a0797a0.json new file mode 100644 index 000000000..4ceee445a --- /dev/null +++ b/docs/doc/b87949ed-ea6c-4a52-a6a6-5f681a0797a0.json @@ -0,0 +1,35 @@ +{ + "summary": "YOWO is a single-stage feature extraction network with channel fusion and attention. Pre-trained on UCF101-24, it provides model structure and weight files for prediction with high confidence.", + "details": [ + { + "comment": "YOWO is a single-stage network with 2 branches for spatial and spatio-temporal feature extraction. 
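The accumulate step described here (concatenate the gathered outputs and labels, then report top-k accuracy) can be mimicked with plain NumPy. This is an illustration of the metric itself, not of the class's implementation:

```python
import numpy as np

def topk_accuracy(logits: np.ndarray, labels: np.ndarray, k: int = 1) -> float:
    """Fraction of samples whose true label is among the k highest-scoring classes."""
    topk = np.argsort(logits, axis=1)[:, -k:]          # indices of the k largest scores
    hits = (topk == labels.reshape(-1, 1)).any(axis=1)
    return float(hits.mean())

# toy batch: 3 samples, 5 classes
logits = np.array([[0.1, 0.7, 0.1, 0.05, 0.05],
                   [0.3, 0.2, 0.1, 0.25, 0.15],
                   [0.0, 0.1, 0.1, 0.2, 0.6]])
labels = np.array([1, 3, 4])
print(topk_accuracy(logits, labels, k=1))  # 0.666...: sample 1 misses at top-1
print(topk_accuracy(logits, labels, k=2))  # 1.0: sample 1's label is the 2nd best score
```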
It uses channel fusion and attention mechanism to aggregate features, then performs frame-level detection. UCF101-24 data preparation instructions provided. Pre-trained models like resnext-101-kinetics are needed for training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/yowo.md\":0-35", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/localization/yowo.md) | English\n# YOWO\n## Content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nYOWO is a single-stage network with two branches. One branch extracts spatial features of key frames (i.e., the current frame) via 2D-CNN, while the other branch acquires spatio-temporal features of clips consisting of previous frames via 3D-CNN. To accurately aggregate these features, YOWO uses a channel fusion and attention mechanism that maximizes the inter-channel dependencies. Finally, the fused features are subjected to frame-level detection.\n
\n\n
\n## Data\nUCF101-24 data download and preparation please refer to [UCF101-24 data preparation](../../dataset/ucf24.md)\n## Train\n### UCF101-24 data set training\n#### Download and add pre-trained models\n1. Download the pre-training model [resnext-101-kinetics](https://vide" + }, + { + "comment": "This code provides instructions for downloading and configuring pre-trained models (`darknet.pdparam` and `resnext101_kinetics.pdparams`) for the YOWOLocalizer model in PaddleVideo. The models need to be added under `pretrained_2d:` and `pretrained_3d:` respectively in the `yowo.yaml` file. After that, use the command `python3 main.py -c configs/localization/yowo.yaml --validate --seed=1` to start training on the UCF101-24 dataset using 1 card.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/yowo.md\":35-59", + "content": "otag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams) \u548c [darknet](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam) as Backbone initialization parameters, or download through the wget command\n ```bash\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam\n wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams\n ```\n2. Open `PaddleVideo/configs/localization/yowo.yaml`, and fill in the downloaded weight storage path below `pretrained_2d:` and `pretrained_3d:` respectively\n ```yaml\n MODEL:\n framework: \"YOWOLocalizer\"\n backbone:\n name: \"YOWO\"\n num_class: 24\n pretrained_2d: fill in the path of 2D pre-training model here\n pretrained_3d: fill in the path of 3D pre-training model here\n ```\n#### Start training\n- The UCF101-24 data set uses 1 card for training, and the start command of the training method is as follows:\n ```bash\n python3 main.py -c configs/localization/yowo.yaml --validate --seed=1" + }, + { + "comment": "Enables AMP mixed-precision for faster training, using the command 'python3 main.py --amp -c configs/localization/yowo.yaml --validate --seed=1'. Customize parameters to train or test on different datasets, following the naming format 'model_dataset name_file format_data format_sampling method.yaml'. During training, find 'best' in logs to obtain model test accuracy using Frame-mAP (@ IoU 0.5), which differs from verification fscore used during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/yowo.md\":60-79", + "content": " ```\n- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n python3 main.py --amp -c configs/localization/yowo.yaml --validate --seed=1\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage.\n## Test\n- The YOWO model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. 
The log example is as follows:\n ```\n Already save the best model (fsocre)0.8779\n ```\n- Since the verification index of the YOWO model test mode is **Frame-mAP (@ IoU 0.5)**, which is different from the **fscore** used in the verification mode during the training process, so the v" + }, + { + "comment": "The code snippet shows how to evaluate the YOWO model's performance using a test mode and provides information about the input size, frame-mAP with IoU 0.5, and the checkpoint used for testing on UCF101-24 dataset. Additionally, it demonstrates how to export the inference model for future use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/yowo.md\":79-100", + "content": "erification index recorded in the training log, called `fscore `, does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows:\n ```bash\n python3 main.py -c configs/localization/yowo.yaml --test --seed=1 -w 'output/YOWO/YOWO_epoch_00005.pdparams'\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of UCF101-24 are as follows:\n | Model | 3D-CNN backbone | 2D-CNN backbone | Dataset |Input | Frame-mAP
(@ IoU 0.5) | checkpoints |\n | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: |\n | YOWO | 3D-ResNext-101 | Darknet-19 | UCF101-24 | 16-frames, d=1 | 80.94 | [YOWO.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams) |\n## Inference\n### Export inference model\n```bash\npython3 tools/export_model.py -c configs/localization/yowo.yaml -p 'output/YOWO/YOWO_epoch_00005.pdparams'" + }, + { + "comment": "This code explains how to generate the YOWO model structure file and weight file for prediction. It also provides instructions on how to use the prediction engine for inference using a test video, downloading it if necessary, and saving the results as an image sequence that can be converted into a gif for visualization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/yowo.md\":101-121", + "content": "```\nThe above command will generate the model structure file `YOWO.pdmodel` and the model weight file `YOWO.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Reasoning Method](../../usage.md#2-infer)\n### Use prediction engine inference\n- Download the test video [HorseRiding.avi](https://videotag.bj.bcebos.com/Data/HorseRiding.avi) for a quick experience, or via the wget command. The downloaded video should be placed in the `data/ucf24` directory:\n```bash\nwget -nc https://videotag.bj.bcebos.com/Data/HorseRiding.avi\n```\n- Run the following command for inference:\n```bash\npython3 tools/predict.py -c configs/localization/yowo.yaml -i 'data/ucf24/HorseRiding.avi' --model_file ./inference/YOWO.pdmodel --params_file ./inference/YOWO.pdiparams\n```\n- When inference is over, the prediction results in image form will be saved in the `inference/YOWO_infer` directory. The image sequence can be converted to a gif by running the following command to complete the final visualisation." + }, + { + "comment": "This code is running a visualization script for the YOWO model trained on UCF101-24. It predicts the category of frames in \"data/ucf24/HorseRiding.avi\" as HorseRiding with high confidence (about 0.80).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/localization/yowo.md\":123-137", + "content": "```\npython3 data/ucf24/visualization.py --frames_dir ./inference/YOWO_infer/HorseRiding --duration 0.04\n```\nThe resulting visualization is as follows:\n
\n \"Horse\n
\nIt can be seen that using the YOWO model trained on UCF101-24 to predict `data/ucf24/HorseRiding.avi`, the category of each frame output is HorseRiding with a confidence level of about 0.80.\n## Reference\n- [You Only Watch Once: A Unified CNN Architecture for Real-Time Spatiotemporal Action Localization](https://arxiv.org/pdf/1911.06644.pdf), K\u00f6p\u00fckl\u00fc O, Wei X, Rigoll G." + } + ] +} \ No newline at end of file diff --git a/docs/doc/b8b83e13-cc8e-4883-9d38-44941a776dd2.json b/docs/doc/b8b83e13-cc8e-4883-9d38-44941a776dd2.json new file mode 100644 index 000000000..c56f41b07 --- /dev/null +++ b/docs/doc/b8b83e13-cc8e-4883-9d38-44941a776dd2.json @@ -0,0 +1,20 @@ +{ + "summary": "Decoder neural network layer uses backbone features, 2D convolution, batch normalization, and ReLU activation for class prediction. The `build_decoder` function constructs a decoder network with specified number of classes, backbone architecture, and Batch Normalization implementation.", + "details": [ + { + "comment": "Decoder is a neural network layer that takes in features from backbone and outputs predicted classes. It initializes convolution layers with different input planes based on the specified backbone. It includes batch normalization, ReLU activation, and sequential convolutions for feature extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/decoder.py\":0-31", + "content": "import math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom utils.api import kaiming_normal_\nclass Decoder(nn.Layer):\n def __init__(self, num_classes, backbone, BatchNorm):\n super(Decoder, self).__init__()\n if backbone == 'resnet' or backbone == 'drn' or backbone == 'resnet_edge':\n low_level_inplanes = 256\n elif backbone == 'xception':\n low_level_inplanes = 128\n elif backbone == 'mobilenet':\n low_level_inplanes = 24\n else:\n raise NotImplementedError\n self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False)\n self.bn1 = BatchNorm(48)\n self.relu = nn.ReLU(True)\n self.last_conv = nn.Sequential(\n nn.Conv2D(304,\n 256,\n kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential(),\n nn.Conv2D(256,\n 256," + }, + { + "comment": "Decoder network with 2D convolution, batch normalization, and ReLU activation. Initializes weight using Kaiming Normal distribution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/decoder.py\":32-61", + "content": " kernel_size=3,\n stride=1,\n padding=1,\n bias_attr=False), BatchNorm(256), nn.ReLU(True),\n nn.Sequential())\n self._init_weight()\n def forward(self, x, low_level_feat):\n low_level_feat = self.conv1(low_level_feat)\n low_level_feat = self.bn1(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat((x, low_level_feat), axis=1)\n x = self.last_conv(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)" + }, + { + "comment": "The function `build_decoder` takes parameters `num_classes`, `backbone`, and `BatchNorm` and returns an instance of the `Decoder` class. 
The purpose is to construct a decoder network for the specified number of classes, using the chosen backbone architecture and Batch Normalization implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/decoder.py\":64-65", + "content": "def build_decoder(num_classes, backbone, BatchNorm):\n return Decoder(num_classes, backbone, BatchNorm)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b8c21e07-10b3-4d28-896c-6b6a23de6d2d.json b/docs/doc/b8c21e07-10b3-4d28-896c-6b6a23de6d2d.json new file mode 100644 index 000000000..6bc2020f0 --- /dev/null +++ b/docs/doc/b8c21e07-10b3-4d28-896c-6b6a23de6d2d.json @@ -0,0 +1,20 @@ +{ + "summary": "The DataReader class handles YouTube-8M dataset using LSTM models, pickle for data loading, and supports various Python versions. It sets batch size, shuffles files if training mode is on, and reads video frames with labels/filenames into batches using one-hot encoding.", + "details": [ + { + "comment": "This code is for a DataReader class that handles the youtube-8M dataset. The features are extracted by prior networks, specifically for LSTM models. It uses pickle to load data and BytesIO for compatibility with different python versions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/feature_reader.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport sys\nfrom .reader_utils import DataReader\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport random\npython_ver = sys.version_info\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm\n dataset cfg: num_classes" + }, + { + "comment": "Initializes a feature reader object with specified name, mode and configuration. Sets batch size and file list from the configuration. Reads the file list, removes empty lines and shuffles if in training mode. 
Defines a nested function reader that iterates over each item in the file list, loads corresponding RGB data and labels, converts labels to one-hot format if not in inference mode, and returns them as batch outputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/feature_reader.py\":34-63", + "content": " batch_size\n list\n \"\"\"\n def __init__(self, name, mode, cfg):\n self.name = name\n self.mode = mode\n self.num_classes = cfg.MODEL.num_classes\n # set batch size and file list\n self.batch_size = cfg[mode.upper()]['batch_size']\n self.filelist = cfg[mode.upper()]['filelist']\n self.seg_num = cfg.MODEL.get('seg_num', None)\n def create_reader(self):\n fl = open(self.filelist).readlines()\n fl = [line.strip() for line in fl if line.strip() != '']\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n batch_out = []\n for item in fl:\n fileinfo = item.split(' ')\n filepath = fileinfo[0]\n rgb = np.load(filepath, allow_pickle=True)\n nframes = rgb.shape[0]\n label = [int(i) for i in fileinfo[1:]]\n one_hot_label = make_one_hot(label, self.num_classes)\n if self.mode != 'infer':" + }, + { + "comment": "This code reads video frames and their labels/filenames into batches, using one-hot encoding for label conversion. The make_one_hot function creates a one-hot encoded vector from the given label.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/feature_reader.py\":64-79", + "content": " batch_out.append((rgb, one_hot_label))\n else:\n batch_out.append((rgb, filepath.split('/')[-1]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n return reader\ndef make_one_hot(label, dim=3862):\n one_hot_label = np.zeros(dim)\n one_hot_label = one_hot_label.astype(float)\n for ind in label:\n one_hot_label[int(ind)] = 1\n return one_hot_label" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b8f5865e-b61d-4b37-ad93-afbc4bc13b5c.json b/docs/doc/b8f5865e-b61d-4b37-ad93-afbc4bc13b5c.json new file mode 100644 index 000000000..d4d030f44 --- /dev/null +++ b/docs/doc/b8f5865e-b61d-4b37-ad93-afbc4bc13b5c.json @@ -0,0 +1,25 @@ +{ + "summary": "The code offers a comprehensive table of action recognition and segmentation models with corresponding links for further details, classified by adaptability and network type. It includes various models like AttentionLSTM, MoViNet, ST-GCN, AGCN, 2s-AGCN, CTR-GCN, BMN, MS-TCN, and ASRF, and serves as a table of contents for PaddleVideo model zoo in HTML format.", + "details": [ + { + "comment": "This code provides a table listing action recognition models and their corresponding links for further details. The models listed include PP-TSM, PP-TSN, PP-TimeSformer, TSN, TSM, SlowFast, TimeSformer, VideoSwin, and TokenShift.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/README.md\":0-25", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/model_zoo/README.md) | English\n# Academic algorithms\n## 1. Introduction\nWe implemented action recgonition model and action localization model in this repo.\n## 2. Model list\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n " + }, + { + "comment": "This code is part of a table of contents for an AI model repository. It lists various action recognition and segmentation models, categorized by their features like adaptability, customization, and network type (RNN, Lite, etc.). 
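The `make_one_hot` helper referred to above is a standard multi-hot encoding for multi-label video tags; YouTube-8M uses 3862 classes. A standalone sketch, assuming NumPy:

```python
import numpy as np

def make_multi_hot(label_ids, num_classes=3862):
    """Return a float vector with 1.0 at every labelled class index (YouTube-8M style)."""
    vec = np.zeros(num_classes, dtype=np.float32)
    vec[np.asarray(label_ids, dtype=np.int64)] = 1.0
    return vec

print(make_multi_hot([3, 17], num_classes=20))  # 1.0 at indices 3 and 17, zeros elsewhere
```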
The models include AttentionLSTM, MoViNet, ST-GCN, AGCN, 2s-AGCN, CTR-GCN, BMN, MS-TCN, and ASRF. Each model is linked to its corresponding documentation file in the repository.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/README.md\":26-60", + "content": " \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n " + }, + { + "comment": "This code represents a table of contents with hyperlinks for different models and methods in the PaddleVideo model zoo. It includes categories such as Spatio-temporal motion detection, Multimodal, Video target segmentation, and Monocular depth estimation. Each category has a brief description of its subcategories or models, indicated by hyperlinks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/README.md\":61-99", + "content": " \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n " + }, + { + "comment": "This code represents an empty table cell or row, likely within a HTML table structure.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/README.md\":100-105", + "content": " \n \n \n \n \n
Action recognition method
PP-TSM (PP series)PP-TSN (PP series)PP-TimeSformer (PP series)TSN (2D\u2019)TSM (2D')
SlowFast (3D\u2019)TimeSformer (Transformer')VideoSwin (Transformer\u2019)TokenShift (3D\u2019)AttentionLSTM (RNN\u2018)
MoViNet (Lite\u2018)
Skeleton based action recognition
ST-GCN (Custom\u2019)AGCN (Adaptive')2s-AGCN (Adaptive')CTR-GCN (GCN\u2018)
Sequence action detection method
BMN (One-stage')
temporal segment
MS-TCN ASRF
Spatio-temporal motion detection method
SlowFast+Fast R-CNN\n
Multimodal
ActBERT (Learning')T2VLAD (Retrieval')
Video target segmentation
CFBI (Semi')MA-Net (Supervised')
Monocular depth estimation
ADDS (Unsupervised\u2018)
" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b9e6e2c3-2cdd-4666-a7f4-8180d78457cb.json b/docs/doc/b9e6e2c3-2cdd-4666-a7f4-8180d78457cb.json new file mode 100644 index 000000000..6bdc7b1e5 --- /dev/null +++ b/docs/doc/b9e6e2c3-2cdd-4666-a7f4-8180d78457cb.json @@ -0,0 +1,20 @@ +{ + "summary": "The code introduces a GCN Recognizer model framework for PaddleVideo, classifying images through forward pass definition, training step loss calculation, and validation. A RecognizerGCN model is defined with test_step and infer_step functions.", + "details": [ + { + "comment": "This code defines a GCN Recognizer model framework for PaddleVideo. It has an initialization method that takes arguments for backbone, head, runtime_cfg, and if_top5. The GCN Recognizer is registered with the RECOGNIZERS registry and extends BaseRecognizer class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py\":0-32", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass RecognizerGCN(BaseRecognizer):\n \"\"\"GCN Recognizer model framework.\n \"\"\"\n def __init__(self,\n backbone=None,\n head=None,\n runtime_cfg=None,\n if_top5=True):\n \"\"\"\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature." + }, + { + "comment": "RecognizerGCN is a model that performs image classification. It has a backbone for feature extraction and a head for classification. Forward_net defines the forward pass. Train_step calculates loss and metrics during training, taking into account if_top5 flag. 
Val_step performs validation by forward pass and loss calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py\":33-65", + "content": " is_top5 (bool): Whether to display top-5 accuracy during training/validation steps.\n \"\"\"\n super(RecognizerGCN, self).__init__(backbone, head, runtime_cfg)\n self.if_top5 = if_top5\n def forward_net(self, data):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n feature = self.backbone(data)\n cls_score = self.head(feature)\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n data = data_batch[0]\n label = data_batch[1:]\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss(cls_score, label, if_top5=self.if_top5)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n data = data_batch[0]\n label = data_batch[1:]\n # call forward\n cls_score = self.forward_net(data)\n loss_metrics = self.head.loss(cls_score,\n label," + }, + { + "comment": "The code defines a RecognizerGCN model and provides test_step and infer_step functions to classify data by forwarding it through the network and returning class scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py\":66-86", + "content": " valid_mode=True,\n if_top5=self.if_top5)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n data = data_batch[0]\n # call forward\n cls_score = self.forward_net(data)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n data = data_batch[0]\n # call forward\n cls_score = self.forward_net(data)\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ba1bf883-3c09-41d1-a3a7-fcc67613df5e.json b/docs/doc/ba1bf883-3c09-41d1-a3a7-fcc67613df5e.json new file mode 100644 index 000000000..8d6dfcf0b --- /dev/null +++ b/docs/doc/ba1bf883-3c09-41d1-a3a7-fcc67613df5e.json @@ -0,0 +1,30 @@ +{ + "summary": "The PaddleVideo framework's ASRF segmentation model uses a backbone for feature extraction and head network for classification. It performs forward passes, post-processing, inference, validates using loss and F1@0.50 score, and extracts class outputs for results.", + "details": [ + { + "comment": "Class ASRF is a segmenter model in PaddleVideo framework. It takes arguments like postprocessing_method, boundary_threshold, backbone, head, and loss for initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import SEGMENTERS\nfrom .base import BaseSegmenter\nimport paddle\nimport paddle.nn.functional as F\nfrom .utils import ASRFPostProcessing\n@SEGMENTERS.register()\nclass ASRF(BaseSegmenter):\n \"\"\"ASRF model framework.\"\"\"\n def __init__(self,\n postprocessing_method,\n boundary_threshold,\n backbone=None,\n head=None,\n loss=None):\n super().__init__(backbone=backbone, head=head, loss=loss)\n self.postprocessing_method = postprocessing_method" + }, + { + "comment": "The code defines a model for segmentation, which has a forward function and train step. It uses a backbone for feature extraction and a head network for classification. The train_step calculates loss using the defined loss function if it's not None.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py\":33-66", + "content": " self.boundary_threshold = boundary_threshold\n def forward_net(self, video_feature):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n if self.backbone is not None:\n feature = self.backbone(video_feature)\n else:\n feature = video_feature\n if self.head is not None:\n network_outputs = self.head(feature)\n else:\n network_outputs = None\n return network_outputs\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n feature, label, boundary = data_batch\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1].numpy()\n outputs_boundary_np = outputs_boundary[-1].numpy()\n # caculate loss\n if self.loss is not None:\n output_loss = self.loss(feature, outputs_cls, label,\n outputs_boundary, boundary)\n else:\n output_loss = None" + }, + { + "comment": "The code snippet represents the ASRF model's validation step. It predicts the outputs for the given inputs, calculates loss if applicable, and performs post-processing using ASRFPostProcessing function. 
The function then returns a dictionary of metrics including the 'loss' value and the 'F1@0.50' score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py\":68-99", + "content": " # predict post process\n predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,\n self.postprocessing_method)\n predicted = paddle.squeeze(predicted)\n loss_metrics = dict()\n loss_metrics['loss'] = output_loss\n loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, label)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n feature, label, boundary = data_batch\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1].numpy()\n outputs_boundary_np = outputs_boundary[-1].numpy()\n ## caculate loss\n if self.loss is not None:\n output_loss = self.loss(feature, outputs_cls, label,\n outputs_boundary, boundary)\n else:\n output_loss = None\n # predict post process\n predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np," + }, + { + "comment": "This code is for a model that performs segmentation using ASRF (Adaptive Sparsely Represented Field) method. It consists of functions for forward pass, post processing, and inference steps. The forward_net function takes input features and returns predicted classes and boundaries. The test_step performs testing by calling the forward_net function and applying post-processing to the results. The infer_step performs inference on data_batch using ASRFPostProcessing. It outputs the predicted segmentation, sigmoid-transformed output, and returns them in a dictionary for further processing or evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py\":100-128", + "content": " self.postprocessing_method)\n predicted = paddle.squeeze(predicted)\n outputs_dict = dict()\n outputs_dict['loss'] = output_loss\n outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, label)\n return outputs_dict\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n feature, _, _ = data_batch\n outputs_dict = dict()\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1].numpy()\n outputs_boundary_np = outputs_boundary[-1].numpy()\n # predict post process\n predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np,\n self.postprocessing_method)\n outputs_dict['predict'] = paddle.to_tensor(predicted[0, :])\n outputs_dict['output_np'] = F.sigmoid(outputs_cls[-1])\n return outputs_dict\n def infer_step(self, data_batch):\n \"\"\"Infering setp." 
+ }, + { + "comment": "This code segment performs the forward pass on a feature, then extracts last outputs for class and boundary, applies sigmoid to the last output of class, and returns all in a list as results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py\":129-142", + "content": " \"\"\"\n feature = data_batch[0]\n # call forward\n outputs_cls, outputs_boundary = self.forward_net(feature)\n # transfer data\n outputs_cls_np = outputs_cls[-1]\n outputs_boundary_np = outputs_boundary[-1]\n outputs = [\n outputs_cls_np, outputs_boundary_np,\n F.sigmoid(outputs_cls[-1])\n ]\n return outputs" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ba3194ef-fcc2-4d0d-9a6c-2f6c76db0854.json b/docs/doc/ba3194ef-fcc2-4d0d-9a6c-2f6c76db0854.json new file mode 100644 index 000000000..a0ff6a988 --- /dev/null +++ b/docs/doc/ba3194ef-fcc2-4d0d-9a6c-2f6c76db0854.json @@ -0,0 +1,25 @@ +{ + "summary": "The code introduces a PaddleVideo class in Python for loading and processing video datasets, reading index files, applying transforms, handles corrupted files with retries, and provides error logging during training/validation.", + "details": [ + { + "comment": "This code is a Python class defining a video dataset for action recognition. It loads raw videos and applies specified transforms on them using an index file with multiple lines, each indicating the properties of a video. The code is part of the PaddleVideo library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py\":0-31", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass VideoDataset(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates" + }, + { + "comment": "This code initializes a dataset class, which reads an index file containing paths to video files and their labels. It loads the index file line by line and processes each line to append video information into a list called \"info\". The filename is assumed to have .avi suffix in this case. If there is a data_prefix assigned, it will be added to the filename.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py\":32-57", + "content": " a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. 
code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, **kwargs):\n self.num_retries = num_retries\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n filename, labels = line_split\n #TODO(hj): Required suffix format: may mp4/avi/wmv\n filename = filename + '.avi'\n if self.data_prefix is not None:" + }, + { + "comment": "The code is a part of a video dataset loader. It handles preparing data for training, validation, and testing in a dataset with potential corrupted files. It joins filenames to the prefix, stores them along with labels in a list (info). For training/validation, it tries a set number of times to read each file due to possible corruption, applies a pipeline to the data, logs exceptions if they occur, and tries again with a random index if needed. In testing, it simply returns the prepared data without retries or error handling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py\":58-80", + "content": " filename = osp.join(self.data_prefix, filename)\n info.append(dict(filename=filename, labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. Prepare the data for training/valid given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"TEST. Prepare the data for test given the index.\"\"\"" + }, + { + "comment": "This code attempts to read a video file and catch any exceptions caused by reading corrupted files. It uses a retry mechanism with a maximum number of retries (self.num_retries) to handle potential errors. If an exception occurs, the error is logged, and if there are more retries left, it tries again with a different random index from self.info. 
Once successful, it returns the images and labels as numpy arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py\":81-94", + "content": " #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ba74ccbf-caf5-4b98-8d47-579014659578.json b/docs/doc/ba74ccbf-caf5-4b98-8d47-579014659578.json new file mode 100644 index 000000000..83a8cfdd6 --- /dev/null +++ b/docs/doc/ba74ccbf-caf5-4b98-8d47-579014659578.json @@ -0,0 +1,40 @@ +{ + "summary": "This code defines a ConvBNLayer class and Darknet backbone, performing convolutions, pooling, and reorganization in a neural network. It concatenates results from two branches, applies more convolutions, and returns final output.", + "details": [ + { + "comment": "This code defines a ConvBNLayer class that inherits from nn.Layer and includes a Conv2D layer, Batch Normalization, and other parameters like input_channels, output_channels, filter_size, stride, padding, and name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n input_channels,\n output_channels,\n filter_size,\n stride,\n padding,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = nn.Conv2D(" + }, + { + "comment": "This code defines a convolutional neural network block with batch normalization and leaky ReLU activation. 
The forward function applies the convolution followed by batch normalization, and BasicBlock is a subclass of nn.Layer representing a single block in the model architecture.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py\":32-60", + "content": " in_channels=input_channels,\n out_channels=output_channels,\n kernel_size=filter_size,\n stride=stride,\n padding=padding,\n weight_attr=ParamAttr(name=name + \".conv.weights\"),\n bias_attr=False)\n bn_name = name + \".bn\"\n self._bn = nn.BatchNorm(\n num_channels=output_channels,\n act=\"leaky_relu\",\n param_attr=ParamAttr(name=bn_name + \".scale\"),\n bias_attr=ParamAttr(name=bn_name + \".offset\"),\n moving_mean_name=bn_name + \".mean\",\n moving_variance_name=bn_name + \".var\")\n def forward(self, inputs):\n x = self._conv(inputs)\n x = self._bn(x)\n return x\nclass BasicBlock(nn.Layer):\n def __init__(self, input_channels, output_channels, name=None):\n super(BasicBlock, self).__init__()\n self._conv1 = ConvBNLayer(input_channels=input_channels, output_channels=output_channels, filter_size=[\n 3, 3], stride=1, padding=1, name=name+'.0')" + }, + { + "comment": "Code defines a Darknet backbone with ConvBNLayer and MaxPooling layers, followed by Reorg layer for spatial downsampling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py\":61-91", + "content": " self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)\n self._conv2 = ConvBNLayer(input_channels=output_channels, output_channels=output_channels *\n 2, filter_size=[3, 3], stride=1, padding=1, name=name+'.1')\n self._conv3 = ConvBNLayer(input_channels=output_channels*2, output_channels=output_channels,\n filter_size=[1, 1], stride=1, padding=0, name=name+'.2')\n def forward(self, x):\n x = self._conv1(x)\n x = self._max_pool(x)\n x = self._conv2(x)\n x = self._conv3(x)\n return x\nclass Reorg(nn.Layer):\n def __init__(self, stride=2):\n super(Reorg, self).__init__()\n self.stride = stride\n def forward(self, x):\n stride = self.stride\n assert (x.dim() == 4)\n B = x.shape[0]\n C = x.shape[1]\n H = x.shape[2]\n W = x.shape[3]\n assert (H % stride == 0)\n assert (W % stride == 0)\n ws = stride\n hs = stride\n x = x.reshape([B, C, H // hs, hs, W // ws, ws]" + }, + { + "comment": "This code reshapes the input tensor and performs a sequence of transpose operations to rearrange dimensions. 
The code is part of a Darknet class, which inherits from nn.Layer and contains various ConvBNLayer and BasicBlock instances for building a convolutional neural network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py\":92-114", + "content": " ).transpose([0, 1, 2, 4, 3, 5])\n x = x.reshape([B, C, H // hs * W // ws, hs * ws]\n ).transpose([0, 1, 3, 2])\n x = x.reshape([B, C, hs * ws, H // hs, W // ws]\n ).transpose([0, 2, 1, 3, 4])\n x = x.reshape([B, hs * ws * C, H // hs, W // ws])\n return x\nclass Darknet(nn.Layer):\n def __init__(self, pretrained=None):\n super(Darknet, self).__init__()\n self.pretrained = pretrained\n self._conv1 = ConvBNLayer(\n input_channels=3, output_channels=32, filter_size=3, stride=1, padding=1, name='input')\n self._max_pool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)\n self._basic_block_11 = BasicBlock(\n input_channels=32, output_channels=64, name='1.1')\n self._basic_block_12 = BasicBlock(\n input_channels=64, output_channels=128, name='1.2')\n self._basic_block_13 = BasicBlock(\n input_channels=128, output_channels=256, name='1.3')\n self._conv2 = ConvBNLayer(" + }, + { + "comment": "The code defines a series of ConvBNLayer objects for the Darknet backbone. These layers include upsampling, downsampling, and convolution operations with different filter sizes and strides. The ConvBNLayer class is used to perform convolutions followed by batch normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py\":115-128", + "content": " input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='up1')\n self._conv3 = ConvBNLayer(\n input_channels=512, output_channels=256, filter_size=1, stride=1, padding=0, name='down1')\n self._conv4 = ConvBNLayer(\n input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='2.1')\n self._max_pool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)\n self._conv5 = ConvBNLayer(\n input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='2.2')\n self._conv6 = ConvBNLayer(input_channels=1024, output_channels=512,\n filter_size=1, stride=1, padding=0, name='2.3') # ori\n self._conv7 = ConvBNLayer(\n input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='up2')\n self._conv8 = ConvBNLayer(input_channels=1024, output_channels=512,\n filter_size=1, stride=1, padding=0, name='down2')" + }, + { + "comment": "This code defines a neural network backbone with multiple convolutional layers, batch normalization, and pooling operations. 
The forward method implements the network's processing flow for input images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py\":129-149", + "content": " self._conv9 = ConvBNLayer(\n input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.1')\n self._conv10 = ConvBNLayer(\n input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.2')\n self._conv11 = ConvBNLayer(\n input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.3')\n self._conv12 = ConvBNLayer(\n input_channels=512, output_channels=64, filter_size=1, stride=1, padding=0, name='4.1')\n self._reorg = Reorg()\n self._conv13 = ConvBNLayer(\n input_channels=1280, output_channels=1024, filter_size=3, stride=1, padding=1, name='5.1')\n self._conv14 = nn.Conv2D(1024, 425, kernel_size=1)\n def forward(self, inputs):\n x = self._conv1(inputs)\n x = self._max_pool1(x)\n x = self._basic_block_11(x)\n x = self._basic_block_12(x)\n x = self._basic_block_13(x)\n x = self._conv2(x)\n x = self._conv3(x)" + }, + { + "comment": "The code performs multiple convolutional operations, followed by pooling and reorganization. It concatenates the results of two separate branches, then applies further convolutions before returning the final output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/darknet.py\":150-164", + "content": " ori = self._conv4(x)\n x = self._max_pool2(ori)\n x = self._conv5(x)\n x = self._conv6(x)\n x = self._conv7(x)\n x = self._conv8(x)\n x = self._conv9(x)\n x = self._conv10(x)\n x1 = self._conv11(x)\n x2 = self._conv12(ori)\n x2 = self._reorg(x2)\n x = paddle.concat([x2, x1], 1)\n x = self._conv13(x)\n x = self._conv14(x)\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bb325240-2947-45dd-a561-15c2b11d70d3.json b/docs/doc/bb325240-2947-45dd-a561-15c2b11d70d3.json new file mode 100644 index 000000000..4a083993d --- /dev/null +++ b/docs/doc/bb325240-2947-45dd-a561-15c2b11d70d3.json @@ -0,0 +1,50 @@ +{ + "summary": "The SFMRI_DecodeSampler class is a tool that decodes and samples MRI frames, creating segments based on sampling indices and handling video length constraints. It calculates offsets for 's' and 'f' frame types, determines average durations per segment, and returns an object containing the frame indices.", + "details": [ + { + "comment": "This code snippet is a Python class for the SFMRI_DecodeSampler pipeline, which decodes and samples MRI frames. It uses PIL and SimpleITK packages to read images and relies on OpenCV for image processing. The class is registered in the PIPELINES module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":0-35", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport random\nimport numpy as np\nfrom PIL import Image\ntry:\n import SimpleITK as sitk\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care.\"\n )\nimport cv2\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass SFMRI_DecodeSampler(object):\n \"\"\"\n Sample frames id.\n NOTE: Use PIL to read image here, has diff with CV2\n Args:" + }, + { + "comment": "This code defines a class with methods for creating segments of frames from an MRI image. The constructor takes arguments for the number of segments, length of each segment, and optional parameters for sampling mode. It returns the indexes of sampled frames in each segment. The class also includes a method for getting images from the MRI and storing them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":36-63", + "content": " num_seg(int): number of segments.\n seg_len(int): number of sampled frames in each segment.\n valid_mode(bool): True or False.\n select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode.\n Returns:\n frames_idx: the index of sampled #frames.\n \"\"\"\n def __init__(self,\n num_seg,\n seg_len,\n valid_mode=False,\n select_left=False,\n dense_sample=False,\n linspace_sample=False):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.valid_mode = valid_mode\n self.select_left = select_left\n self.dense_sample = dense_sample\n self.linspace_sample = linspace_sample\n def _get(self, frames_idx_s, frames_idx_f, results):\n frame_dir = results['frame_dir']\n imgs_s = []\n imgs_f = []\n MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))\n for idx in frames_idx_s:" + }, + { + "comment": "This code defines a class that takes in MRI data and returns resized images for sampling. It creates two lists, imgs_s and imgs_f, which contain the resized MRI frames. The results dictionary contains these lists under the 'imgs' key. The __call__ method calculates the average duration of each segment based on frames_len, and generates frame indices for each segment using linspace_sample. 
It does not return any value in this context.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":64-93", + "content": " item = MRI[idx]\n item = cv2.resize(item, (224, 224))\n imgs_s.append(item)\n for idx in frames_idx_f:\n item = MRI[idx]\n item = cv2.resize(item, (224, 224))\n imgs_f.append(item)\n results['imgs'] = [imgs_s, imgs_f]\n return results\n def __call__(self, results):\n \"\"\"\n Args:\n frames_len: length of frames.\n return:\n sampling id.\n \"\"\"\n frames_len = int(results['frames_len'])\n average_dur1 = int(frames_len / self.num_seg[0])\n average_dur2 = int(frames_len / self.num_seg[1])\n frames_idx_s = []\n frames_idx_f = []\n if self.linspace_sample:\n if 'start_idx' in results and 'end_idx' in results:\n offsets_s = np.linspace(results['start_idx'],\n results['end_idx'], self.num_seg[0])\n offsets_f = np.linspace(results['start_idx'],\n results['end_idx'], self.num_seg[1])" + }, + { + "comment": "This code segment handles sampling of frames for video decoding. It sets the offsets for sample positions based on the number of segments specified and ensures they are within the valid frame range. If `select_left` is not set, it further checks if `dense_sample` is enabled in dense sampling mode. For ppTSM, it selects a sample position and calculates the corresponding offsets for each segment using the given formulas.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":94-114", + "content": " else:\n offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0])\n offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1])\n offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64)\n offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64)\n frames_idx_s = list(offsets_s)\n frames_idx_f = list(offsets_f)\n return self._get(frames_idx_s, frames_idx_f, results)\n if not self.select_left:\n if self.dense_sample: # For ppTSM\n if not self.valid_mode: # train\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride1 = 64 // self.num_seg[0]\n t_stride2 = 64 // self.num_seg[1]\n start_idx = 0 if sample_pos == 1 else np.random.randint(\n 0, sample_pos - 1)\n offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[0])]" + }, + { + "comment": "This code calculates the sampling indices for both spatial and frequency domains. It creates two lists, frames_idx_s and frames_idx_f, based on the number of segments in each dimension (self.num_seg[0] and self.num_seg[1]). If the video length is less than 64 frames, it sets a smaller sampling range for both domains. 
The code also includes a backup strategy that uses a list of starting points for sampling if the video length is longer but still shorter than 64 frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":115-134", + "content": " offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[1])]\n frames_idx_s = offsets_s\n frames_idx_f = offsets_f\n else:\n sample_pos = max(1, 1 + frames_len - 64)\n t_stride1 = 64 // self.num_seg[0]\n t_stride2 = 64 // self.num_seg[1]\n start_list = np.linspace(0,\n sample_pos - 1,\n num=10,\n dtype=int)\n offsets_s = []\n offsets_f = []\n for start_idx in start_list.tolist():\n offsets_s += [\n (idx * t_stride1 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[0])\n ]\n for start_idx in start_list.tolist():" + }, + { + "comment": "This code calculates the offsets for segmenting frames and storing them in two lists, `frames_idx_s` and `frames_idx_f`. If `valid_mode` is set, it randomly selects the indices within the constraints of `average_dur1`, otherwise it uses sequential indexing. It also handles cases where `average_dur1` is less than 1 by setting the index to i.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":135-156", + "content": " offsets_f += [\n (idx * t_stride2 + start_idx) % frames_len + 1\n for idx in range(self.num_seg[1])\n ]\n frames_idx_s = offsets_s\n frames_idx_f = offsets_f\n else:\n for i in range(self.num_seg[0]):\n idx = 0\n if not self.valid_mode:\n if average_dur1 >= self.seg_len:\n idx = random.randint(0, average_dur1 - self.seg_len)\n idx += i * average_dur1\n elif average_dur1 >= 1:\n idx += i * average_dur1\n else:\n idx = i\n else:\n if average_dur1 >= self.seg_len:\n idx = (average_dur1 - 1) // 2\n idx += i * average_dur1\n elif average_dur1 >= 1:" + }, + { + "comment": "Code iterates over frames and segments, assigning frame indices based on valid mode and average durations. If valid mode is off, it determines the idx based on average duration 1 and 2, or if in valid mode, it sets the idx to half the remaining average duration 2 minus 1. Finally, it appends frame indices to frames_idx_s list for each segment length.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":157-179", + "content": " idx += i * average_dur1\n else:\n idx = i\n for jj in range(idx, idx + self.seg_len):\n frames_idx_s.append(jj)\n for i in range(self.num_seg[1]):\n idx = 0\n if not self.valid_mode:\n if average_dur2 >= self.seg_len:\n idx = random.randint(0, average_dur2 - self.seg_len)\n idx += i * average_dur2\n elif average_dur2 >= 1:\n idx += i * average_dur2\n else:\n idx = i\n else:\n if average_dur2 >= self.seg_len:\n idx = (average_dur2 - 1) // 2\n idx += i * average_dur2\n elif average_dur2 >= 1:\n idx += i * average_dur2\n else:" + }, + { + "comment": "If not in valid mode, if average duration 2 > 0, generate offsets_s and offsets_f for TSM. If frames_len is greater than num_seg[1], randomly select offsets_s and offsets_f. 
Otherwise, set offsets_s to zeros.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":180-202", + "content": " idx = i\n for jj in range(idx, idx + self.seg_len):\n frames_idx_f.append(jj)\n return self._get(frames_idx_s, frames_idx_f, results)\n else: # for TSM\n if not self.valid_mode:\n if average_dur2 > 0:\n offsets_s = np.multiply(list(range(\n self.num_seg[0])), average_dur1) + np.random.randint(\n average_dur1, size=self.num_seg[0])\n offsets_f = np.multiply(list(range(\n self.num_seg[1])), average_dur2) + np.random.randint(\n average_dur2, size=self.num_seg[1])\n elif frames_len > self.num_seg[1]:\n offsets_s = np.sort(\n np.random.randint(frames_len, size=self.num_seg[0]))\n offsets_f = np.sort(\n np.random.randint(frames_len, size=self.num_seg[1]))\n else:\n offsets_s = np.zeros(shape=(self.num_seg[0], ))" + }, + { + "comment": "This code calculates the offsets for segmenting frames into 's' and 'f' types based on the number of segments specified. If the total number of frames is greater than the specified number of segments, it calculates the average duration per segment for both types ('s' and 'f'). It then creates arrays of frame indices for 's' and 'f' frames using these calculated offsets. Finally, it returns an object by calling a method '_get'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py\":203-223", + "content": " offsets_f = np.zeros(shape=(self.num_seg[1], ))\n else:\n if frames_len > self.num_seg[1]:\n average_dur_float_s = frames_len / self.num_seg[0]\n offsets_s = np.array([\n int(average_dur_float_s / 2.0 + average_dur_float_s * x)\n for x in range(self.num_seg[0])\n ])\n average_dur_float_f = frames_len / self.num_seg[1]\n offsets_f = np.array([\n int(average_dur_float_f / 2.0 + average_dur_float_f * x)\n for x in range(self.num_seg[1])\n ])\n else:\n offsets_s = np.zeros(shape=(self.num_seg[0], ))\n offsets_f = np.zeros(shape=(self.num_seg[1], ))\n frames_idx_s = list(offsets_s)\n frames_idx_f = list(offsets_f)\n return self._get(frames_idx_s, frames_idx_f, results)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bb921512-02e5-4d0d-ad4c-d6d72943394d.json b/docs/doc/bb921512-02e5-4d0d-ad4c-d6d72943394d.json new file mode 100644 index 000000000..5548a4b48 --- /dev/null +++ b/docs/doc/bb921512-02e5-4d0d-ad4c-d6d72943394d.json @@ -0,0 +1,65 @@ +{ + "summary": "This code utilizes PaddleVideo's TimeSformer model for video processing, including a VideoDecoder class to decode mp4 files and handle varying durations. The \"ActionFeatureDecoder\" class handles feature decoding, while the function prepares data for model input and normalizes inputs for PaddlePaddle's video pipeline.", + "details": [ + { + "comment": "This code snippet is part of the PaddleVideo library and it appears to import various packages, define a function \"get_start_end_idx\", and register something into the PIPELINES registry. It seems to handle video clip processing for TimeSformer and other models. The function calculates start and end indices for video clips based on video size, clip size, clip index, and the total number of clips.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\ntry:\n import av\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models.\"\n )\nimport cv2\nimport pickle\nimport decord as de\nimport math\nimport random\nfrom ..registry import PIPELINES\ndef get_start_end_idx(video_size, clip_size, clip_idx, num_clips):\n delta = max(video_size - clip_size, 0)\n if clip_idx == -1: # here" + }, + { + "comment": "This code defines a VideoDecoder class for decoding mp4 files to frames. It takes the file path as input and has additional parameters for time-series applications like TimeSformer. The __call__ method performs the decoding operation, returning a list of numpy arrays representing the decoded frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":32-68", + "content": " # Random temporal sampling.\n start_idx = random.uniform(0, delta)\n else: # ignore\n # Uniformly sample the clip with the given index.\n start_idx = delta * clip_idx / num_clips\n end_idx = start_idx + clip_size - 1\n return start_idx, end_idx\n@PIPELINES.register()\nclass VideoDecoder(object):\n \"\"\"\n Decode mp4 file to frames.\n Args:\n filepath: the file path of mp4 file\n \"\"\"\n def __init__(self,\n backend='cv2',\n mode='train',\n sampling_rate=32,\n num_seg=8,\n num_clips=1,\n target_fps=30):\n self.backend = backend\n # params below only for TimeSformer\n self.mode = mode\n self.sampling_rate = sampling_rate\n self.num_seg = num_seg\n self.num_clips = num_clips\n self.target_fps = target_fps\n def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations.\n return:\n List where each item is a numpy array after decoder." + }, + { + "comment": "This code is part of a video decoding pipeline. It checks the backend and decodes videos using either cv2, decord or pyav depending on the backend specified. 
It reads frames from the video and stores them in 'results' dictionary for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":69-97", + "content": " \"\"\"\n file_path = results['filename']\n results['format'] = 'video'\n results['backend'] = self.backend\n if self.backend == 'cv2':\n cap = cv2.VideoCapture(file_path)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n sampledFrames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty\n if ret == False:\n continue\n img = frame[:, :, ::-1]\n sampledFrames.append(img)\n results['frames'] = sampledFrames\n results['frames_len'] = len(sampledFrames)\n elif self.backend == 'decord':\n container = de.VideoReader(file_path)\n frames_len = len(container)\n results['frames'] = container\n results['frames_len'] = frames_len\n elif self.backend == 'pyav': # for TimeSformer\n if self.mode in [\"train\", \"valid\"]:\n clip_idx = -1\n elif self.mode in [\"test\"]:" + }, + { + "comment": "This code checks if the duration of a video file is None. If it is, it sets decode_all_video to True and calculates video_start_pts and video_end_pts as 0 and infinity respectively. If the duration is not None, it calculates start and end indices for decoding specific clips from the video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":98-124", + "content": " clip_idx = 0\n else:\n raise NotImplementedError\n container = av.open(file_path)\n num_clips = 1 # always be 1\n # decode process\n fps = float(container.streams.video[0].average_rate)\n frames_length = container.streams.video[0].frames\n duration = container.streams.video[0].duration\n if duration is None:\n # If failed to fetch the decoding information, decode the entire video.\n decode_all_video = True\n video_start_pts, video_end_pts = 0, math.inf\n else:\n decode_all_video = False\n start_idx, end_idx = get_start_end_idx(\n frames_length,\n self.sampling_rate * self.num_seg / self.target_fps * fps,\n clip_idx, num_clips)\n timebase = duration / frames_length\n video_start_pts = int(start_idx * timebase)\n video_end_pts = int(end_idx * timebase)" + }, + { + "comment": "This code snippet is part of a video decoding pipeline in PaddleVideo. It seeks to a specific start time of the video stream, then decodes and filters frames based on their start and end points. Frames before the start point are skipped, while frames after the end point are buffered. Finally, it stores the relevant frames in the \"tmp\\_frames\" dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":126-149", + "content": " frames = None\n # If video stream was found, fetch video frames from the video.\n if container.streams.video:\n margin = 1024\n seek_offset = max(video_start_pts - margin, 0)\n container.seek(seek_offset,\n any_frame=False,\n backward=True,\n stream=container.streams.video[0])\n tmp_frames = {}\n buffer_count = 0\n max_pts = 0\n for frame in container.decode(**{\"video\": 0}):\n max_pts = max(max_pts, frame.pts)\n if frame.pts < video_start_pts:\n continue\n if frame.pts <= video_end_pts:\n tmp_frames[frame.pts] = frame\n else:\n buffer_count += 1\n tmp_frames[frame.pts] = frame\n if buffer_count >= 0:\n break" + }, + { + "comment": "This code extracts video frames, sorts them by timestamp, and then converts the frames to RGB format. 
It calculates the start and end indices for a given clip size based on the number of frames and the selected clip size. The results are stored in a dictionary along with additional information such as frame length and indices. If no code is provided for the \"else\" condition, a NotImplementedError will be raised. This class is registered as a pipeline using @PIPELINES.register().", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":150-176", + "content": " video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]\n container.close()\n frames = [frame.to_rgb().to_ndarray() for frame in video_frames]\n clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps\n start_idx, end_idx = get_start_end_idx(\n len(frames), # frame_len\n clip_sz,\n clip_idx if decode_all_video else\n 0, # If decode all video, -1 in train and valid, 0 in test;\n # else, always 0 in train, valid and test, as we has selected clip size frames when decode.\n 1)\n results['frames'] = frames\n results['frames_len'] = len(frames)\n results['start_idx'] = start_idx\n results['end_idx'] = end_idx\n else:\n raise NotImplementedError\n return results\n@PIPELINES.register()\nclass FrameDecoder(object):\n \"\"\"just parse results\n \"\"\"" + }, + { + "comment": "The code defines three pipeline classes for decoding different types of data. The MRIDecoder class sets the format to 'MRI'. The FeatureDecoder class initializes with parameters num_classes, max_len and has_label, then performs feature decode operations on loaded pkl files, parsing them into RGB/audio format, padding as necessary, and returning a list of numpy arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":177-221", + "content": " def __init__(self):\n pass\n def __call__(self, results):\n results['format'] = 'frame'\n return results\n@PIPELINES.register()\nclass MRIDecoder(object):\n \"\"\"just parse results\n \"\"\"\n def __init__(self):\n pass\n def __call__(self, results):\n results['format'] = 'MRI'\n return results\n@PIPELINES.register()\nclass FeatureDecoder(object):\n \"\"\"\n Perform feature decode operations.e.g.youtube8m\n \"\"\"\n def __init__(self, num_classes, max_len=512, has_label=True):\n self.max_len = max_len\n self.num_classes = num_classes\n self.has_label = has_label\n def __call__(self, results):\n \"\"\"\n Perform feature decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n #1. load pkl\n #2. parse to rgb/audio/\n #3. padding\n filepath = results['filename']\n data = pickle.load(open(filepath, 'rb'), encoding='bytes')\n record = data\n nframes = record['nframes'] if 'nframes' in record else record[" + }, + { + "comment": "This code is preparing data for a model. It loads the 'feature' and 'audio' from the record if available, converts them to float type, and cuts them up to the specified number of frames (nframes). If labels are present in the record, it makes one-hot encoding out of them. The data is then dequantized using a method, and results are stored into the 'labels' variable. Finally, three lists (feat_pad_list, feat_len_list, mask_list) are initialized for further data processing. 
The code handles two types of data: 'feature' and 'audio', iterating over them in a range loop.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":222-248", + "content": " b'nframes']\n rgb = record['feature'].astype(\n float) if 'feature' in record else record[b'feature'].astype(float)\n audio = record['audio'].astype(\n float) if 'audio' in record else record[b'audio'].astype(float)\n if self.has_label:\n label = record['label'] if 'label' in record else record[b'label']\n one_hot_label = self.make_one_hot(label, self.num_classes)\n rgb = rgb[0:nframes, :]\n audio = audio[0:nframes, :]\n rgb = self.dequantize(rgb,\n max_quantized_value=2.,\n min_quantized_value=-2.)\n audio = self.dequantize(audio,\n max_quantized_value=2,\n min_quantized_value=-2)\n if self.has_label:\n results['labels'] = one_hot_label.astype(\"float32\")\n feat_pad_list = []\n feat_len_list = []\n mask_list = []\n vitem = [rgb, audio]\n for vi in range(2): #rgb and audio" + }, + { + "comment": "This function pads and dequantizes video features for model input. It first checks the type of feature (video or audio) and prepends 'rgb_' or 'audio_' to the result keys accordingly. Then it pads the feature with zeros to match the max length, creates a mask for the padded feature, and dequantizes the feature from byte format to float format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":249-274", + "content": " if vi == 0:\n prefix = \"rgb_\"\n else:\n prefix = \"audio_\"\n feat = vitem[vi]\n results[prefix + 'len'] = feat.shape[0]\n #feat pad step 1. padding\n feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),\n dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n results[prefix + 'data'] = feat_pad.astype(\"float32\")\n #feat pad step 2. mask\n feat_mask_origin = np.ones(feat.shape, dtype=np.float32)\n feat_mask_add = feat_add\n feat_mask = np.concatenate((feat_mask_origin, feat_mask_add),\n axis=0)\n results[prefix + 'mask'] = feat_mask.astype(\"float32\")\n return results\n def dequantize(self,\n feat_vector,\n max_quantized_value=2.,\n min_quantized_value=-2.):\n \"\"\"\n Dequantize the feature from the byte format to the float format" + }, + { + "comment": "The code defines a class called \"ActionFeatureDecoder\" for feature decoding operations in football actions. It initializes with parameters for the maximum length, number of classes, and whether or not it should handle labels. 
The __call__ method performs the decoding operation on input results and returns a list of numpy arrays after decoding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":275-309", + "content": " \"\"\"\n assert max_quantized_value > min_quantized_value\n quantized_range = max_quantized_value - min_quantized_value\n scalar = quantized_range / 255.0\n bias = (quantized_range / 512.0) + min_quantized_value\n return feat_vector * scalar + bias\n def make_one_hot(self, label, dim=3862):\n one_hot_label = np.zeros(dim)\n one_hot_label = one_hot_label.astype(float)\n for ind in label:\n one_hot_label[int(ind)] = 1\n return one_hot_label\n@PIPELINES.register()\nclass ActionFeatureDecoder(object):\n \"\"\"\n Perform feature decode operations on footballaction\n \"\"\"\n def __init__(self, num_classes, max_len=512, has_label=True):\n self.max_len = max_len\n self.num_classes = num_classes\n self.has_label = has_label\n def __call__(self, results):\n \"\"\"\n Perform feature decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n #1. load pkl\n #2. parse to rgb/audio/" + }, + { + "comment": "The code is reading a pickle file, extracting the rgb image and audio features, label information, and performing some data manipulations. It sets the label to either 0 or 1 randomly if there's more than one in the data, normalizes iou values, and adds padding to the data for further processing. This is used in a video processing pipeline for PaddlePaddle.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":310-337", + "content": " #3. padding\n filepath = results['filename']\n data = pickle.load(open(filepath, 'rb'), encoding='bytes')\n pkl_data = data\n rgb = pkl_data['image_feature'].astype(float)\n audio = pkl_data['audio_feature'].astype(float)\n label_id_info = pkl_data['label_info']\n label_cls = [label_id_info['label']]\n label_one = int(label_cls[0])\n if len(label_cls) > 1:\n label_index = random.randint(0, 1)\n label_one = int(label_cls[label_index])\n iou_norm = float(label_id_info['norm_iou'])\n results['labels'] = np.array([label_one])\n results['iou_norm'] = float(iou_norm)\n vitem = [rgb, audio]\n for vi in range(2): #rgb and audio\n if vi == 0:\n prefix = \"rgb_\"\n else:\n prefix = \"audio_\"\n feat = vitem[vi]\n results[prefix + 'len'] = feat.shape[0]\n #feat pad step 1. padding\n feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1])," + }, + { + "comment": "This code pads the feature data and its corresponding mask for a PaddleVideo pipeline, concatenating them and casting the results to float32 type before storing in the 'results' dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode.py\":338-346", + "content": " dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n results[prefix + 'data'] = feat_pad.astype(\"float32\")\n #feat pad step 2. 
mask\n feat_mask_origin = np.ones(feat.shape, dtype=np.float32)\n feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0)\n results[prefix + 'mask'] = feat_mask.astype(\"float32\")\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bbf3f8f6-d6b5-4fc0-8cfe-4890421b5c77.json b/docs/doc/bbf3f8f6-d6b5-4fc0-8cfe-4890421b5c77.json new file mode 100644 index 000000000..ce29ac878 --- /dev/null +++ b/docs/doc/bbf3f8f6-d6b5-4fc0-8cfe-4890421b5c77.json @@ -0,0 +1,45 @@ +{ + "summary": "The code implements temporal convolutional networks and GCN units in PaddlePaddle, creating a Graph class and AGCN2s graph convolution layer for the NTURGB+D dataset. This involves initializing variables, obtaining adjacency matrix, normalization, and executing convolutions.", + "details": [ + { + "comment": "This code defines a class named \"UnitTCN\" which is a type of layer for temporal convolutional network. It's implemented using PaddlePaddle library and includes methods to define the convolutional layers with specified number of input and output channels, kernel size and stride. The class is registered in the BACKBONES registry of the PaddleVideo module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\nfrom ..registry import BACKBONES\ndef import_class(name):\n components = name.split('.')\n mod = __import__(components[0])\n for comp in components[1:]:\n mod = getattr(mod, comp)\n return mod\nclass UnitTCN(nn.Layer):\n def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):\n super(UnitTCN, self).__init__()\n pad = int((kernel_size - 1) / 2)" + }, + { + "comment": "This code defines a GCN unit class with convolutional layers for learning spatio-temporal features. It uses batch normalization and ReLU activation, allowing the model to learn representations from the input data. 
The GCN unit takes in channels, output channels, adjacency matrix A, coefficient embedding, and number of subsets as parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":32-64", + "content": " self.conv = nn.Conv2D(in_channels,\n out_channels,\n kernel_size=(kernel_size, 1),\n padding=(pad, 0),\n stride=(stride, 1))\n self.bn = nn.BatchNorm2D(out_channels)\n self.relu = nn.ReLU()\n def forward(self, x):\n \" input size : (N*M, C, T, V)\"\n x = self.bn(self.conv(x))\n return x\nclass UnitGCN(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n A,\n coff_embedding=4,\n num_subset=3):\n super(UnitGCN, self).__init__()\n inter_channels = out_channels // coff_embedding\n self.inter_c = inter_channels\n PA = self.create_parameter(shape=A.shape, dtype='float32')\n self.PA = PA\n self.A = paddle.to_tensor(A.astype(np.float32))\n self.num_subset = num_subset\n self.conv_a = nn.LayerList()\n self.conv_b = nn.LayerList()\n self.conv_d = nn.LayerList()" + }, + { + "comment": "This code defines a neural network backbone for the AGCN2S model. It initializes and appends convolutional layers, checks if input and output channels are different to determine whether to add a downsampling layer, and defines softmax, batch normalization, and ReLU activation functions. The forward function performs operations on input data to produce the final output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":65-90", + "content": " for i in range(self.num_subset):\n self.conv_a.append(nn.Conv2D(in_channels, inter_channels, 1))\n self.conv_b.append(nn.Conv2D(in_channels, inter_channels, 1))\n self.conv_d.append(nn.Conv2D(in_channels, out_channels, 1))\n if in_channels != out_channels:\n self.down = nn.Sequential(nn.Conv2D(in_channels, out_channels, 1),\n nn.BatchNorm2D(out_channels))\n else:\n self.down = lambda x: x\n self.bn = nn.BatchNorm2D(out_channels)\n self.soft = nn.Softmax(-2)\n self.relu = nn.ReLU()\n def forward(self, x):\n N, C, T, V = x.shape\n A = self.A + self.PA\n y = None\n for i in range(self.num_subset):\n A1 = paddle.transpose(self.conv_a[i](x),\n perm=[0, 3, 1,\n 2]).reshape([N, V, self.inter_c * T])\n A2 = self.conv_b[i](x).reshape([N, self.inter_c * T, V])\n A1 = self.soft(paddle.matmul(A1, A2) / A1.shape[-1])" + }, + { + "comment": "The code defines a block class for a neural network architecture. It consists of GCN and TCN units in series, followed by a ReLU activation function. The residual connection is either set to zero or equal to the input if not specified, allowing for identity shortcuts within the network. 
The forward method combines the outputs from GCN and TCN with residual connections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":91-120", + "content": " A1 = A1 + A[i]\n A2 = x.reshape([N, C * T, V])\n z = self.conv_d[i](paddle.matmul(A2, A1).reshape([N, C, T, V]))\n y = z + y if y is not None else z\n y = self.bn(y)\n y += self.down(x)\n return self.relu(y)\nclass Block(nn.Layer):\n def __init__(self, in_channels, out_channels, A, stride=1, residual=True):\n super(Block, self).__init__()\n self.gcn1 = UnitGCN(in_channels, out_channels, A)\n self.tcn1 = UnitTCN(out_channels, out_channels, stride=stride)\n self.relu = nn.ReLU()\n if not residual:\n self.residual = lambda x: 0\n elif (in_channels == out_channels) and (stride == 1):\n self.residual = lambda x: x\n else:\n self.residual = UnitTCN(in_channels,\n out_channels,\n kernel_size=1,\n stride=stride)\n def forward(self, x):\n x = self.tcn1(self.gcn1(x)) + self.residual(x)" + }, + { + "comment": "This code defines a Graph class with a fixed number of nodes (25) and connectivity patterns for the NTURGB+D dataset. It initializes self_link, inward, outward, and neighbor variables based on the specified labeling mode ('spatial' by default). The adjacency matrix is obtained using get_adjacency_matrix method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":121-143", + "content": " return self.relu(x)\n# This Graph structure is for the NTURGB+D dataset. If you use a custom dataset, modify num_node and the corresponding graph adjacency structure.\nclass Graph:\n def __init__(self, labeling_mode='spatial'):\n num_node = 25\n self_link = [(i, i) for i in range(num_node)]\n inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),\n (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),\n (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),\n (17, 1), (18, 17), (19, 18), (20, 19), (22, 23),\n (23, 8), (24, 25), (25, 12)]\n inward = [(i - 1, j - 1) for (i, j) in inward_ori_index]\n outward = [(j, i) for (i, j) in inward]\n neighbor = inward + outward\n self.num_node = num_node\n self.self_link = self_link\n self.inward = inward\n self.outward = outward\n self.neighbor = neighbor\n self.A = self.get_adjacency_matrix(labeling_mode)" + }, + { + "comment": "The code defines three functions: `edge2mat()`, `normalize_digraph()`, and `get_spatial_graph()`. `edge2mat()` converts a list of edges into an adjacency matrix. `normalize_digraph()` normalizes a directed graph by computing the in-degree for each node. `get_spatial_graph()` combines the adjacency matrices from self-links, incoming edges, and outgoing edges into one matrix. 
The last function `get_adjacency_matrix()` returns the adjacency matrix depending on the given labeling mode (default or spatial).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":145-175", + "content": " def edge2mat(self, link, num_node):\n A = np.zeros((num_node, num_node))\n for i, j in link:\n A[j, i] = 1\n return A\n def normalize_digraph(self, A):\n Dl = np.sum(A, 0)\n h, w = A.shape\n Dn = np.zeros((w, w))\n for i in range(w):\n if Dl[i] > 0:\n Dn[i, i] = Dl[i]**(-1)\n AD = np.dot(A, Dn)\n return AD\n def get_spatial_graph(self, num_node, self_link, inward, outward):\n I = self.edge2mat(self_link, num_node)\n In = self.normalize_digraph(self.edge2mat(inward, num_node))\n Out = self.normalize_digraph(self.edge2mat(outward, num_node))\n A = np.stack((I, In, Out))\n return A\n def get_adjacency_matrix(self, labeling_mode=None):\n if labeling_mode is None:\n return self.A\n if labeling_mode == 'spatial':\n A = self.get_spatial_graph(self.num_node, self.self_link,\n self.inward, self.outward)\n else:\n raise ValueError()" + }, + { + "comment": "Class AGCN2s defines a neural network layer for graph convolutions. It takes parameters such as number of points, persons, and the type of graph. The code initializes graph adjacency matrix 'A' from the specified graph and creates several Block layers for convolution operations with different parameters and strides. In forward pass, it rearranges the input tensor dimensions and reshapes it before performing graph convolutions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":176-211", + "content": " return A\n@BACKBONES.register()\nclass AGCN2s(nn.Layer):\n def __init__(self,\n num_point=25,\n num_person=2,\n graph='ntu_rgb_d',\n graph_args=dict(),\n in_channels=3):\n super(AGCN2s, self).__init__()\n if graph == 'ntu_rgb_d':\n self.graph = Graph(**graph_args)\n else:\n raise ValueError()\n A = self.graph.A\n self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point)\n self.l1 = Block(in_channels, 64, A, residual=False)\n self.l2 = Block(64, 64, A)\n self.l3 = Block(64, 64, A)\n self.l4 = Block(64, 64, A)\n self.l5 = Block(64, 128, A, stride=2)\n self.l6 = Block(128, 128, A)\n self.l7 = Block(128, 128, A)\n self.l8 = Block(128, 256, A, stride=2)\n self.l9 = Block(256, 256, A)\n self.l10 = Block(256, 256, A)\n def forward(self, x):\n N, C, T, V, M = x.shape\n x = x.transpose([0, 4, 3, 1, 2]).reshape_([N, M * V * C, T])" + }, + { + "comment": "The code performs the following operations: \n1. Applies data normalization to x using self.data_bn.\n2. Reshapes x with dimensions N, M, V, C, and T to (N*M,C,T,V).\n3. Passes x through ten linear layers (l1 to l10) for transformation.\n4. 
Finally, returns the transformed x.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn2s.py\":212-228", + "content": " x = self.data_bn(x)\n x = x.reshape_([N, M, V, C,\n T]).transpose([0, 1, 3, 4,\n 2]).reshape_([N * M, C, T, V])\n x = self.l1(x)\n x = self.l2(x)\n x = self.l3(x)\n x = self.l4(x)\n x = self.l5(x)\n x = self.l6(x)\n x = self.l7(x)\n x = self.l8(x)\n x = self.l9(x)\n x = self.l10(x)\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bc14a46f-d682-47ed-9056-8668543c4710.json b/docs/doc/bc14a46f-d682-47ed-9056-8668543c4710.json new file mode 100644 index 000000000..ccc7f28f1 --- /dev/null +++ b/docs/doc/bc14a46f-d682-47ed-9056-8668543c4710.json @@ -0,0 +1,70 @@ +{ + "summary": "The code introduces CustomWarmupCosineDecay and CustomWarmupPiecewiseDecay schedulers for PaddleVideo, combining warm-up, cosine decay, and piecewise decay. The CustomWarmupAdjustDecay scheduler combines warmup and cosine decay and adjusts based on epoch number.", + "details": [ + { + "comment": "This code defines a custom learning rate scheduler, CustomWarmupCosineDecay, which combines warmup and stepwise-cosine decay for use in PaddleVideo. It is part of the PaddlePaddle framework and can be used to adjust learning rates during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nfrom paddle.optimizer.lr import *\nimport numpy as np\n\"\"\"\nPaddleVideo Learning Rate Schedule:\nYou can use paddle.optimizer.lr\nor define your custom_lr in this file.\n\"\"\"\nclass CustomWarmupCosineDecay(LRScheduler):\n r\"\"\"\n We combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup." + }, + { + "comment": "This code defines a class `CosineAnnealingDecay` that schedules the learning rate for training. It takes parameters like warmup start lr, warmup epochs, cosine base lr, max epoch, num_iters, last_epoch (optional), and verbose (optional). The class initializes these parameters and provides a `step()` method to update the learning rate based on cosine annealing schedule. If verbose is set to True, it will print messages for each update.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":31-53", + "content": " cosine_base_lr (float|int, optional): base learning rate in cosine schedule.\n max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. 
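
Note that l1 through l10 in the quoted forward() are stacked Block layers (each a UnitGCN graph convolution followed by a UnitTCN temporal convolution), not plain linear layers. The layout juggling around data_bn can be followed with a plain NumPy sketch; the tensor sizes below are illustrative only.

    import numpy as np

    # illustrative sizes: batch 2, 3 channels, 16 frames, 25 joints, 2 persons
    N, C, T, V, M = 2, 3, 16, 25, 2
    x = np.random.rand(N, C, T, V, M).astype("float32")

    # fold persons, joints and channels together so BatchNorm1D sees (N, M*V*C, T)
    x = x.transpose(0, 4, 3, 1, 2).reshape(N, M * V * C, T)

    # ... data_bn would run here ...

    # unfold and merge batch with person: every person becomes its own sample
    x = x.reshape(N, M, V, C, T).transpose(0, 1, 3, 4, 2).reshape(N * M, C, T, V)
    print(x.shape)  # (4, 3, 16, 25)
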
Default: ``False`` .\n Returns:\n ``CosineAnnealingDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n cosine_base_lr,\n max_epoch,\n num_iters,\n last_epoch=-1,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.cosine_base_lr = cosine_base_lr\n self.max_epoch = max_epoch\n self.num_iters = num_iters\n #call step() in base class, last_lr/last_epoch/base_lr will be update" + }, + { + "comment": "This code defines a custom learning rate scheduler for PaddleVideo, implementing the CustomWarmupCosineDecay class. The step() method updates the learning rate based on current epoch and calls get_lr() to set the new learning rate. The _lr_func_cosine() function calculates the learning rate using a cosine annealing schedule.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":54-79", + "content": " super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch,\n verbose=verbose)\n def step(self, epoch=None):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if self.last_epoch == -1:\n self.last_epoch += 1\n else:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch):" + }, + { + "comment": "The code defines a custom learning rate (LR) scheduler that includes warmup and stepwise-cosine decay. It first performs a warmup stage with a linear increase in LR from the warmup_start_lr to lr_end over warmup_epochs, then applies cosine annealing for the rest of the epochs, resulting in a learning rate that decreases from the base value according to the cosine function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":80-105", + "content": " return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) +\n 1.0) * 0.5\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr,\n self.max_epoch)\n lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr,\n self.max_epoch)\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n return lr\nclass CustomWarmupPiecewiseDecay(LRScheduler):\n r\"\"\"\n This op combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n warmup_start_lr (float): start learning rate used in warmup stage.\n warmup_epochs (int): the number epochs of warmup.\n step_base_lr (float|int, optional): base learning rate in step schedule." + }, + { + "comment": "This code defines a class `CustomWarmupPiecewiseDecay` which schedules learning rate for training. The class takes parameters like warmup_start_lr, warmup_epochs, step_base_lr, lrs, gamma, steps, max_epoch, num_iters, last_epoch, and verbose. It initializes these parameters in the constructor (__init__). 
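
The warmup-plus-cosine policy just quoted can be reproduced as a plain Python function: the warmup linearly ramps from warmup_start_lr to the cosine value at warmup_epochs, after which the half-cosine takes over. This is a self-contained sketch; the numeric arguments are illustrative, not a PaddleVideo config.

    import math

    def warmup_cosine_lr(cur_epoch, warmup_start_lr, warmup_epochs,
                         cosine_base_lr, max_epoch):
        # same formula as _lr_func_cosine above
        def cosine(e):
            return cosine_base_lr * (math.cos(math.pi * e / max_epoch) + 1.0) * 0.5

        lr = cosine(cur_epoch)
        lr_end = cosine(warmup_epochs)  # value the warmup ramps up to
        if cur_epoch < warmup_epochs:
            alpha = (lr_end - warmup_start_lr) / warmup_epochs
            lr = cur_epoch * alpha + warmup_start_lr
        return lr

    # step() advances last_epoch by 1 / num_iters, so fractional epochs are normal
    for e in (0.0, 0.5, 1.0, 17.5, 34.9):
        print(e, round(warmup_cosine_lr(e, 0.01, 1.0, 0.1, 35), 5))
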
The learning rate is scheduled to decay over time following a piecewise function with warm-up and custom decays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":106-132", + "content": " max_epoch (int): total training epochs.\n num_iters(int): number iterations of each epoch.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .\n Returns:\n ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n warmup_start_lr,\n warmup_epochs,\n step_base_lr,\n lrs,\n gamma,\n steps,\n max_epoch,\n num_iters,\n last_epoch=0,\n verbose=False):\n self.warmup_start_lr = warmup_start_lr\n self.warmup_epochs = warmup_epochs\n self.step_base_lr = step_base_lr\n self.lrs = lrs\n self.gamma = gamma\n self.steps = steps\n self.max_epoch = max_epoch\n self.num_iters = num_iters" + }, + { + "comment": "This code defines a custom learning rate scheduler for the PaddleVideo library. The `step` method updates the learning rate based on the current epoch and returns None. It should be called after `optimizer.step`. If no epoch is specified, it increments the last epoch by the number of iterations divided by the total number of iterations. The last learning rate is stored in `self.last_lr`, and if verbose is set to True, it prints the current epoch, scheduler name, and updated learning rate.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":133-157", + "content": " self.last_epoch = last_epoch\n self.last_lr = self.warmup_start_lr # used in first iter\n self.verbose = verbose\n self._var_name = None\n def step(self, epoch=None, rebuild=False):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if not rebuild:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print(\n 'step Epoch {}: {} set learning rate to {}.self.num_iters={}, 1/self.num_iters={}'\n .format(self.last_epoch, self.__class__.__name__, self.last_lr," + }, + { + "comment": "This function defines a learning rate (lr) policy that varies based on the current epoch, predefined learning rates, base lr, steps, and maximum epoch. It calculates the learning rate for each step using a relative learning rate function and returns it. 
The function also includes a warmup phase where the learning rate gradually increases from 0 to its final value over the specified number of epochs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":158-187", + "content": " self.num_iters, 1 / self.num_iters))\n def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps,\n max_epoch):\n # get step index\n steps = steps + [max_epoch]\n for ind, step in enumerate(steps):\n if cur_epoch < step:\n break\n if self.verbose:\n print(\n '_lr_func_steps_with_relative_lrs, cur_epoch {}: {}, steps {}, ind {}, step{}, max_epoch{}'\n .format(cur_epoch, self.__class__.__name__, steps, ind, step,\n max_epoch))\n return lrs[ind - 1] * base_lr\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n lr = self._lr_func_steps_with_relative_lrs(\n self.last_epoch,\n self.lrs,\n self.step_base_lr,\n self.steps,\n self.max_epoch,\n )\n lr_end = self._lr_func_steps_with_relative_lrs(\n self.warmup_epochs,\n self.lrs,\n self.step_base_lr," + }, + { + "comment": "This code implements a CustomWarmupCosineStepDecay learning rate scheduler, which performs warm up and then applies piecewise decay. The learning rate is determined based on the current epoch, warmup epochs, warmup start and end rates, and the number of steps. A CustomPiecewiseDecay class is also defined, which inherits from PiecewiseDecay and overrides the num_iters parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":188-221", + "content": " self.steps,\n self.max_epoch,\n )\n # Perform warm up.\n if self.last_epoch < self.warmup_epochs:\n lr_start = self.warmup_start_lr\n alpha = (lr_end - lr_start) / self.warmup_epochs\n lr = self.last_epoch * alpha + lr_start\n if self.verbose:\n print(\n 'get_lr, Epoch {}: {}, lr {}, lr_end {}, self.lrs{}, self.step_base_lr{}, self.steps{}, self.max_epoch{}'\n .format(self.last_epoch, self.__class__.__name__, lr, lr_end,\n self.lrs, self.step_base_lr, self.steps,\n self.max_epoch))\n return lr\nclass CustomPiecewiseDecay(PiecewiseDecay):\n def __init__(self, **kargs):\n kargs.pop('num_iters')\n super().__init__(**kargs)\nclass CustomWarmupCosineStepDecay(LRScheduler):\n def __init__(self,\n warmup_iters,\n warmup_ratio=0.1,\n min_lr=0,\n base_lr=3e-5,\n max_epoch=30," + }, + { + "comment": "This function initializes the custom learning rate scheduler. It sets warmup ratio, minimum learning rate, and warmup iterations. The total number of iterations, maximum epochs, base learning rate for cosine annealing, and a regular learning rate are calculated. 
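
The relative-LR lookup quoted above amounts to: append max_epoch to the step boundaries, find the first boundary the current epoch has not yet reached, and scale base_lr by the multiplier of the previous interval. A small stand-alone sketch follows; the boundaries and multipliers are illustrative SlowFast-style values, not a shipped config.

    def steps_with_relative_lrs(cur_epoch, lrs, base_lr, steps, max_epoch):
        bounds = steps + [max_epoch]
        for ind, step in enumerate(bounds):
            if cur_epoch < step:
                break
        return lrs[ind - 1] * base_lr

    base_lr, lrs = 0.1, [1, 0.1, 0.01, 0.001]
    steps, max_epoch = [0, 94, 154, 196], 196
    for e in (10, 100, 160, 190):
        print(e, round(steps_with_relative_lrs(e, lrs, base_lr, steps, max_epoch), 6))
    # 10 -> 0.1, 100 -> 0.01, 160 -> 0.001, 190 -> 0.001
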
The function also defines a helper method 'annealing_cos' for cosine annealing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":222-248", + "content": " last_epoch=-1,\n num_iters=None,\n verbose=False):\n self.warmup_ratio = warmup_ratio\n self.min_lr = min_lr\n self.warmup_epochs = warmup_iters\n self.warmup_iters = warmup_iters * num_iters\n self.cnt_iters = 0\n self.cnt_epoch = 0\n self.num_iters = num_iters\n self.tot_iters = max_epoch * num_iters\n self.max_epoch = max_epoch\n self.cosine_base_lr = base_lr # initial lr for all param groups\n self.regular_lr = self.get_regular_lr()\n super().__init__(last_epoch=last_epoch, verbose=verbose)\n def annealing_cos(self, start, end, factor, weight=1):\n cos_out = math.cos(math.pi * factor) + 1\n return end + 0.5 * weight * (start - end) * cos_out\n def get_regular_lr(self):\n progress = self.cnt_epoch\n max_progress = self.max_epoch\n target_lr = self.min_lr\n return self.annealing_cos(self.cosine_base_lr, target_lr, progress /\n max_progress) # self.cosine_base_lr" + }, + { + "comment": "This code defines a custom learning rate scheduler that combines warmup and stepwise-cosine decay. The get_warmup_lr function calculates the warmup learning rate, while the get_lr function determines whether the current iteration is in the warmup stage or not, returning either the regular learning rate or the warmed-up learning rate. The step function updates the learning rate and counts the number of iterations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":250-281", + "content": " def get_warmup_lr(self, cur_iters):\n k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio)\n warmup_lr = self.regular_lr * (1 - k) # 3e-5 * (1-k)\n return warmup_lr\n def step(self, epoch=None):\n self.regular_lr = self.get_regular_lr()\n self.last_lr = self.get_lr()\n self.cnt_epoch = (self.cnt_iters +\n 1) // self.num_iters # update step with iters\n self.cnt_iters += 1\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def get_lr(self):\n \"\"\"Define lr policy\"\"\"\n cur_iter = self.cnt_iters\n if cur_iter >= self.warmup_iters:\n return self.regular_lr\n else:\n warmup_lr = self.get_warmup_lr(cur_iter)\n return warmup_lr\nclass CustomWarmupAdjustDecay(LRScheduler):\n r\"\"\"\n We combine warmup and stepwise-cosine which is used in slowfast model.\n Args:\n step_base_lr (float): start learning rate used in warmup stage." + }, + { + "comment": "Custom learning rate scheduler with warmup, decay, and boundary steps. Initializes the LR scheduler with step base LR, warmup epochs, decay rate, boundaries, number of iterations (optional), last epoch (optional), and verbosity level (optional).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":282-304", + "content": " warmup_epochs (int): the number epochs of warmup.\n lr_decay_rate (float|int, optional): base learning rate decay rate.\n step (int): step in change learning rate.\n last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.\n verbose (bool, optional): If ``True``, prints a message to stdout for each update. 
Default: ``False`` .\n Returns:\n ``CosineAnnealingDecay`` instance to schedule learning rate.\n \"\"\"\n def __init__(self,\n step_base_lr,\n warmup_epochs,\n lr_decay_rate,\n boundaries,\n num_iters=None,\n last_epoch=-1,\n verbose=False):\n self.step_base_lr = step_base_lr\n self.warmup_epochs = warmup_epochs\n self.lr_decay_rate = lr_decay_rate\n self.boundaries = boundaries\n self.num_iters = num_iters\n #call step() in base class, last_lr/last_epoch/base_lr will be update" + }, + { + "comment": "The code defines a custom learning rate scheduler, CustomWarmupAdjustDecay, which adjusts the learning rate based on epoch number. It initializes the scheduler and provides a step method for updating the learning rate after optimizer.step is called. The get_lr method returns the current learning rate. The last_epoch variable keeps track of the current epoch. If no epoch is specified, it auto-increments from the last_epoch value. If an epoch is provided, the last_epoch is set to that value. Finally, if verbose is True, it prints the current epoch and the learning rate set.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":305-331", + "content": " super(CustomWarmupAdjustDecay, self).__init__(last_epoch=last_epoch,\n verbose=verbose)\n def step(self, epoch=None):\n \"\"\"\n ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .\n The new learning rate will take effect on next ``optimizer.step`` .\n Args:\n epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.\n Returns:\n None\n \"\"\"\n if epoch is None:\n if self.last_epoch == -1:\n self.last_epoch += 1\n else:\n self.last_epoch += 1 / self.num_iters # update step with iters\n else:\n self.last_epoch = epoch\n self.last_lr = self.get_lr()\n if self.verbose:\n print('Epoch {}: {} set learning rate to {}.'.format(\n self.last_epoch, self.__class__.__name__, self.last_lr))\n def get_lr(self):" + }, + { + "comment": "This code calculates the learning rate based on whether the current epoch is within the warmup phase or not. If in warmup, it linearly increases the base learning rate. Otherwise, it applies a decay rate to determine the learning rate.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/solver/custom_lr.py\":332-337", + "content": " if self.last_epoch < self.warmup_epochs:\n lr = self.step_base_lr * (self.last_epoch + 1) / self.warmup_epochs\n else:\n lr = self.step_base_lr * (self.lr_decay_rate**np.sum(\n self.last_epoch >= np.array(self.boundaries)))\n return lr" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bc1deee6-4bc8-430d-bb7d-3f7f654f0040.json b/docs/doc/bc1deee6-4bc8-430d-bb7d-3f7f654f0040.json new file mode 100644 index 000000000..d7a1e62ec --- /dev/null +++ b/docs/doc/bc1deee6-4bc8-430d-bb7d-3f7f654f0040.json @@ -0,0 +1,70 @@ +{ + "summary": "PaddleVideo library contains ResNet TSN model backbones, licensed under Apache 2.0, includes ConvBNLayer, offers modified ResNet with BatchNorm and pooling layers, uses BottleneckBlock for deeper networks, and initializes configurable parameters with BasicBlock. 
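
The CustomWarmupAdjustDecay policy quoted just above reduces to a few lines: a linear ramp up to warmup_epochs, then the base rate decayed once for every boundary the current epoch has passed. A self-contained sketch with illustrative settings:

    import numpy as np

    def warmup_adjust_lr(cur_epoch, step_base_lr, warmup_epochs,
                         lr_decay_rate, boundaries):
        if cur_epoch < warmup_epochs:
            # linear ramp; epoch 0 already gets a fraction of the base lr
            return step_base_lr * (cur_epoch + 1) / warmup_epochs
        # decay once per boundary already passed
        passed = int(np.sum(cur_epoch >= np.array(boundaries)))
        return step_base_lr * lr_decay_rate ** passed

    # e.g. 5 warmup epochs, 10x decay at epochs 30 and 60 (illustrative values)
    for e in (0, 4, 10, 35, 70):
        print(e, warmup_adjust_lr(e, 0.1, 5, 0.1, [30, 60]))
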
The code constructs a ResNet backbone, performs forward pass through network, applies convolutions and pooling, and returns output after passing through each block in the block list using input lists to determine layers and filters while initializing model weights.", + "details": [ + { + "comment": "This code is a part of the PaddleVideo library, which provides model backbones including ResNet Tweaks TSN. It imports necessary modules and defines functions for creating convolutional layers, batch normalization, pooling layers, initializing weights, and loading checkpoints. The code follows the Apache License 2.0 and is distributed under an \"AS IS\" basis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":0-28", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn import Conv2D, BatchNorm\nfrom paddle.nn import MaxPool2D, AvgPool2D\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt" + }, + { + "comment": "This code defines a ConvBNLayer class with an average pooling operation, a convolution layer, and optional tweaks mode. It also initializes a Conv2D layer and sets parameters for weight attributes and learning rates.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":30-57", + "content": "__all__ = [\"ResNetTweaksTSN\"]\nclass ConvBNLayer(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n lr_mult=1.0,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\",\n learning_rate=lr_mult)," + }, + { + "comment": "The code defines a ResNet backbone with Temporal Segment Network (TSN) modifications. It includes a BatchNorm layer for normalization and has a forward function that applies pooling if in tweaks mode, followed by the batch norm and convolution layers. 
The BottleneckBlock class is also defined as a sublayer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":58-88", + "content": " bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._batch_norm = BatchNorm(\n out_channels,\n act=act,\n param_attr=ParamAttr(name=bn_name + '_scale',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + '_offset',\n learning_rate=lr_mult,\n regularizer=L2Decay(0.0)),\n moving_mean_name=bn_name + '_mean',\n moving_variance_name=bn_name + '_variance')\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True," + }, + { + "comment": "This code defines the BottleneckBlock class, which is a layer in ResNet backbone. It consists of three ConvBNLayer instances: conv0, conv1, and conv2. The first one performs a 1x1 convolution, while the second one does a 3x3 convolution with stride. Lastly, the third one executes a 1x1 convolution without activation function. This block is designed to reduce parameters for deeper networks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":89-110", + "content": " if_first=False,\n lr_mult=1.0,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None," + }, + { + "comment": "This code defines a ResNet block with two convolution layers, one optional shortcut connection, and applies ReLU activation after the addition of the branch outputs. The BasicBlock class is used for the basic building block of the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":111-143", + "content": " lr_mult=lr_mult,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n y = F.relu(y)\n return y\nclass BasicBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride," + }, + { + "comment": "This code defines a BasicBlock class with convolutional layers and Batch Normalization. It initializes the block's parameters like stride, convolution layers, and batch normalization. The shortcut connection is optional and depends on the 'shortcut' parameter. 
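
For orientation, a minimal residual block in the same spirit as the BottleneckBlock/BasicBlock summaries above: a 1x1 / 3x3 / 1x1 body, a projection shortcut only when the shape changes, then an element-wise add and ReLU. This is a stripped-down sketch, not the PaddleVideo class itself, which additionally names its parameters via ParamAttr and supports lr_mult and the "tweaks" average-pool path.

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    class TinyBottleneck(nn.Layer):
        def __init__(self, in_ch, mid_ch, stride=1):
            super().__init__()
            out_ch = mid_ch * 4
            self.body = nn.Sequential(
                nn.Conv2D(in_ch, mid_ch, 1, bias_attr=False),
                nn.BatchNorm2D(mid_ch), nn.ReLU(),
                nn.Conv2D(mid_ch, mid_ch, 3, stride=stride, padding=1, bias_attr=False),
                nn.BatchNorm2D(mid_ch), nn.ReLU(),
                nn.Conv2D(mid_ch, out_ch, 1, bias_attr=False),
                nn.BatchNorm2D(out_ch),
            )
            # projection shortcut only when channels or resolution change
            self.short = (None if in_ch == out_ch and stride == 1 else
                          nn.Conv2D(in_ch, out_ch, 1, stride=stride, bias_attr=False))

        def forward(self, x):
            identity = x if self.short is None else self.short(x)
            return F.relu(paddle.add(identity, self.body(x)))

    x = paddle.rand([1, 64, 56, 56])
    print(TinyBottleneck(64, 64, stride=2)(x).shape)  # [1, 256, 28, 28]
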
The 'if_first', 'lr_mult', and 'name' parameters are also provided for customization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":144-166", + "content": " shortcut=True,\n if_first=False,\n lr_mult=1.0,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act='relu',\n lr_mult=lr_mult,\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n lr_mult=lr_mult,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels," + }, + { + "comment": "This code defines a ResNetTweaksTSN backbone for deep learning models. It includes layers such as convolution, shortcut connections, and ReLU activation function. The constructor takes parameters like depth (layers), pretrained model, and learning rate multipliers for different layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":167-202", + "content": " kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n lr_mult=lr_mult,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTweaksTSN(nn.Layer):\n \"\"\"ResNetTweaksTSN backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self,\n layers=50,\n pretrained=None,\n lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]):\n super(ResNetTweaksTSN, self).__init__()\n self.pretrained = pretrained" + }, + { + "comment": "This code initializes a ResNet backbone with different configurations based on the input layer. It checks if the provided layer is supported, asserts the type and length of the learning rate multiplier list, and assigns depth and number of channels for each layer configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":203-231", + "content": " self.layers = layers\n supported_layers = [18, 34, 50, 101, 152, 200]\n assert layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, layers)\n self.lr_mult_list = lr_mult_list\n assert isinstance(\n self.lr_mult_list,\n (list, tuple\n )), \"lr_mult_list should be in (list, tuple) but got {}\".format(\n type(self.lr_mult_list))\n assert len(\n self.lr_mult_list\n ) == 5, \"lr_mult_list length should should be 5 but got {}\".format(\n len(self.lr_mult_list))\n if layers == 18:\n depth = [2, 2, 2, 2]\n elif layers == 34 or layers == 50:\n depth = [3, 4, 6, 3]\n elif layers == 101:\n depth = [3, 4, 23, 3]\n elif layers == 152:\n depth = [3, 8, 36, 3]\n elif layers == 200:\n depth = [3, 12, 48, 3]\n num_channels = [64, 256, 512, 1024\n ] if layers >= 50 else [64, 64, 128, 256]" + }, + { + "comment": "This code defines the first layer of the ResNet backbone, including three ConvBNLayer instances for different operations. The first layer consists of a 2x downsampling convolution, followed by two 1x1 convolutions to reduce dimensionality and apply relu activation. 
Lr_mult ensures that these layers are trained with different learning rates based on their importance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":232-253", + "content": " num_filters = [64, 128, 256, 512]\n self.conv1_1 = ConvBNLayer(in_channels=3,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0],\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='relu',\n lr_mult=self.lr_mult_list[0]," + }, + { + "comment": "This code defines a ResNet backbone with optional Temporal Segment Network (TSN) modifications. It adds BottleneckBlock layers, specifies pooling operations, and handles shortcut connections for blocks 0-56. The number of layers and filters are determined by the provided depth and num_filters lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":254-275", + "content": " name=\"conv1_3\")\n self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if layers in [101, 152, 200] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BottleneckBlock(\n in_channels=num_channels[block]\n if i == 0 else num_filters[block] * 4,\n out_channels=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1," + }, + { + "comment": "This code adds layers to the ResNet backbone model. It uses conditional statements and loops to determine the number of layers added at each block based on a given depth configuration, and applies different configurations for the first block. Layers are added with specific parameters such as in_channels, out_channels, stride, shortcut, if_first flag, and name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":276-295", + "content": " shortcut=shortcut,\n if_first=block == i == 0,\n lr_mult=self.lr_mult_list[block + 1],\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n 'bb_%d_%d' % (block, i),\n BasicBlock(in_channels=num_channels[block]\n if i == 0 else num_filters[block],\n out_channels=num_filters[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name," + }, + { + "comment": "This code initializes a backbone model and handles the loading of pre-trained weights. 
If pre-trained path is specified, it loads the weights; otherwise, it follows specific initialization for Conv2D layers and BatchNorm2d using KaimingNormal and Constant functions respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":296-313", + "content": " lr_mult=self.lr_mult_list[block + 1]))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be\n initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n # XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():" + }, + { + "comment": "This code initializes the weights of convolutional layers without bias and batch normalization layers with constant value 1. It then performs forward pass through the network, applying convolutions and pooling operations. The output is returned after passing through each block in the block list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py\":314-327", + "content": " if isinstance(layer, nn.Conv2D):\n # XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2d_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bc402025-96c3-420c-aaae-0b1fb4e0d10b.json b/docs/doc/bc402025-96c3-420c-aaae-0b1fb4e0d10b.json new file mode 100644 index 000000000..01d2b012b --- /dev/null +++ b/docs/doc/bc402025-96c3-420c-aaae-0b1fb4e0d10b.json @@ -0,0 +1,25 @@ +{ + "summary": "This code imports and registers various models for computer vision, defines functions to build these components based on configuration, and uses a \"build\" function to determine the model type.", + "details": [ + { + "comment": "This code imports necessary modules and registers various types of models for a computer vision application. It also includes licensing information and provides utility functions for model building.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py\":0-18", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT\nfrom ..utils import build\nfrom .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS,\n DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES,\n MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS)" + }, + { + "comment": "The code defines functions for building various components of a video processing model, including backbone, roi extractor, assigner, sampler, head, loss, recognizer, and localizer. These functions use the `build()` method to construct the components based on the given configuration (cfg). BACKBONES, ROI_EXTRACTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, HEADS, LOSSES, RECOGNIZERS, and framework are used as parameters in the `build()` method. The functions repeat twice for each component, which could be a code formatting issue or redundancy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py\":21-72", + "content": "def build_backbone(cfg):\n \"\"\"Build backbone.\"\"\"\n return build(cfg, BACKBONES)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box batch_sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_roi_extractor(cfg):\n \"\"\"Build roi extractor.\"\"\"\n return build(cfg, ROI_EXTRACTORS)\ndef build_assigner(cfg, **default_args):\n \"\"\"Builder of box assigner.\"\"\"\n return build(cfg, BBOX_ASSIGNERS)\ndef build_sampler(cfg, **default_args):\n \"\"\"Builder of box batch_sampler.\"\"\"\n return build(cfg, BBOX_SAMPLERS)\ndef build_head(cfg):\n \"\"\"Build head.\"\"\"\n return build(cfg, HEADS)\ndef build_loss(cfg):\n \"\"\"Build loss.\"\"\"\n return build(cfg, LOSSES)\ndef build_recognizer(cfg):\n \"\"\"Build recognizer.\"\"\"\n return build(cfg, RECOGNIZERS, key='framework')\ndef build_localizer(cfg):\n \"\"\"Build localizer.\"\"\"" + }, + { + "comment": "The code defines several functions that build different models such as recognizer, localizer, partitioner, estimator, and segment. It uses a \"build\" function to determine which model to create based on the provided configuration (cfg). 
The model is built by copying the cfg and checking its value for the key 'framework', then calling the appropriate function to build the desired model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py\":73-115", + "content": " return build(cfg, LOCALIZERS, key='framework')\ndef build_segmentationer(cfg):\n \"\"\"Build detector.\"\"\"\n return build(cfg, SEGMENT, key='framework')\ndef build_partitioner(cfg):\n \"\"\"Build partitioner.\"\"\"\n return build(cfg, PARTITIONERS, key='framework')\ndef build_estimator(cfg):\n \"\"\"Build estimator.\"\"\"\n return build(cfg, ESTIMATORS, key='framework')\ndef build_multimodal(cfg):\n \"\"\"Build multimodal.\"\"\"\n return build(cfg, MULTIMODAL, key='framework')\ndef build_detector(cfg):\n \"\"\"Build multimodal.\"\"\"\n return build(cfg, DETECTORS, key='framework')\ndef build_segment(cfg):\n \"\"\"Build segment.\"\"\"\n return build(cfg, SEGMENT, key='framework')\ndef build_model(cfg, key='framework'):\n cfg_copy = cfg.copy()\n framework_type = cfg_copy.get(key)\n if framework_type in RECOGNIZERS:\n return build_recognizer(cfg)\n elif framework_type in LOCALIZERS:\n return build_localizer(cfg)\n elif framework_type in PARTITIONERS:\n return build_partitioner(cfg)\n elif framework_type in DETECTORS:" + }, + { + "comment": "This code is selecting a function to build a video analysis framework based on the given configuration (cfg) and framework type. If the type matches any of the predefined categories, it returns the corresponding function result. Otherwise, it raises a NotImplementedError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py\":116-124", + "content": " return build_detector(cfg)\n elif framework_type in ESTIMATORS:\n return build_estimator(cfg)\n elif framework_type in MULTIMODAL:\n return build_multimodal(cfg)\n elif framework_type in SEGMENT:\n return build_segment(cfg)\n else:\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bc87640d-9bd7-4ce9-88de-36df7379d29d.json b/docs/doc/bc87640d-9bd7-4ce9-88de-36df7379d29d.json new file mode 100644 index 000000000..ec0171b73 --- /dev/null +++ b/docs/doc/bc87640d-9bd7-4ce9-88de-36df7379d29d.json @@ -0,0 +1,15 @@ +{ + "summary": "The \"build\" function takes a config dictionary and registry, constructs an object from the configuration, checks for required keys, retrieves class from the registry, and returns the instance.", + "details": [ + { + "comment": "This code defines a function named \"build\" that takes a config dictionary and a registry, builds an object from the given configuration dictionary, and returns it. The function asserts that the input is a valid dictionary and checks if the required key exists. It then retrieves the object type from the dictionary and gets the corresponding class from the registry before returning the constructed object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\ndef build(cfg, registry, key='name'):\n \"\"\"Build a module from config dict.\n Args:\n cfg (dict): Config dict. It should at least contain the key.\n registry (XXX): The registry to search the type from.\n key (str): the key.\n Returns:\n obj: The constructed object.\n \"\"\"\n assert isinstance(cfg, dict) and key in cfg\n cfg_copy = cfg.copy()\n obj_type = cfg_copy.pop(key)\n obj_cls = registry.get(obj_type)" + }, + { + "comment": "Checks if an object class is provided, raises a KeyError if not found in the registry, and returns an instance of the found class with provided configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py\":31-34", + "content": " if obj_cls is None:\n raise KeyError('{} is not in the {} registry'.format(\n obj_type, registry.name))\n return obj_cls(**cfg_copy)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bcfb23d4-6122-42c0-867e-bc6b1215ad1c.json b/docs/doc/bcfb23d4-6122-42c0-867e-bc6b1215ad1c.json new file mode 100644 index 000000000..a205d932d --- /dev/null +++ b/docs/doc/bcfb23d4-6122-42c0-867e-bc6b1215ad1c.json @@ -0,0 +1,60 @@ +{ + "summary": "ConvBNLayer combines Conv2D and BatchNorm2D in PaddlePaddle's ResNet class, using BasicBlock and BottleneckBlock with optional shortcut connections. The code dynamically creates layers, initializes weights, performs convolution and pooling operations, for a customizable deep learning model backbone.", + "details": [ + { + "comment": "This code defines the ConvBNLayer class, which is a combination of Conv2D and BatchNorm2D layers. It is part of a PaddlePaddle deep learning model backbone. The class takes arguments for its constructor, suggesting it is customizable or can be initialized with specific parameters. The weight initialization function is also imported to initialize the layer's weights. This could indicate that this class may involve complex neural network layers for image processing tasks like image classification or object detection.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":0-34", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
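
The build() helper quoted above is the usual config-driven factory: pop the lookup key, resolve the class in a registry, and construct it from the remaining keyword arguments. A self-contained sketch follows; DummyRegistry and the toy ResNet class are stand-ins for illustration, not the real paddlevideo Registry.

    class DummyRegistry:
        """Toy stand-in: anything exposing .get() and .name would do."""
        def __init__(self, name):
            self.name, self._map = name, {}

        def register(self, cls):
            self._map[cls.__name__] = cls
            return cls

        def get(self, key):
            return self._map.get(key)

    BACKBONES = DummyRegistry('backbone')

    @BACKBONES.register
    class ResNet:
        def __init__(self, depth, pretrained=None):
            self.depth, self.pretrained = depth, pretrained

    def build(cfg, registry, key='name'):
        assert isinstance(cfg, dict) and key in cfg
        cfg_copy = cfg.copy()
        obj_type = cfg_copy.pop(key)          # e.g. 'ResNet'
        obj_cls = registry.get(obj_type)
        if obj_cls is None:
            raise KeyError('{} is not in the {} registry'.format(obj_type, registry.name))
        return obj_cls(**cfg_copy)            # remaining keys become kwargs

    model = build({'name': 'ResNet', 'depth': 50}, BACKBONES)
    print(type(model).__name__, model.depth)  # ResNet 50
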
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:" + }, + { + "comment": "This code defines a ConvBNLayer class that takes parameters such as in_channels, out_channels, kernel_size, stride (default 1), groups (default 1), activation function (act) and name. It inherits from another class, super(ConvBNLayer, self). It then initializes the Conv2D layer with the provided parameters and is followed by an init_weights method for weight and bias initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":35-57", + "content": " in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size," + }, + { + "comment": "This code defines a Convolutional Neural Network (CNN) layer with optional batch normalization and activation. It is initialized in the ResNet class, which also contains a forward function for feed-forward computation. 
The BottleneckBlock class extends this design to create a bottleneck block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":58-88", + "content": " stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(name=bn_name +\n \"_scale\"),\n bias_attr=ParamAttr(bn_name + \"_offset\"))\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n \"\"\"BottleneckBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels," + }, + { + "comment": "This code defines a BottleneckBlock class with multiple ConvBNLayer instances for the \"branch2a\", \"branch2b\", and \"branch2c\" layers. The BottleneckBlock class is a building block for ResNet architecture in PaddleVideo, used to perform convolutional operations with specific parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":89-112", + "content": " stride,\n shortcut=True,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels," + }, + { + "comment": "This code defines a class for a BasicBlock in a convolutional neural network. It contains a ConvBNLayer, another ConvBNLayer, and an optional shortcut connection. The forward function performs the operations within the block and returns the output after applying ReLU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":113-145", + "content": " out_channels=out_channels * 4,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n \"\"\"BasicBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3," + }, + { + "comment": "This code defines a class with forward function. It initializes layers such as ConvBNLayer, and the shortcut connection depends on the provided 'shortcut'. 
The forward function performs the computations by passing inputs through the conv0 layer, then the conv1 layer, and finally adds the results of the two operations to generate the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":146-173", + "content": " stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)" + }, + { + "comment": "ResNet is a backbone class for creating ResNet models with different depths. It initializes the layers based on the input depth and supports pretrained models. The code defines supported layer sizes, sets up the block size and number of output channels. It creates an instance of ConvBNLayer for the input channel size 3 and output channel size 64.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":174-209", + "content": " y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNet(nn.Layer):\n \"\"\"ResNet backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, pretrained=None):\n super(ResNet, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = [64, 256, 512, 1024]\n out_channels = [64, 128, 256, 512]\n self.conv = ConvBNLayer(in_channels=3,\n out_channels=64," + }, + { + "comment": "This code snippet defines a ResNet model. It includes a convolutional layer with specified parameters, a MaxPool2D layer, and dynamically generates BottleneckBlock layers based on the desired depth. The code also checks for specific layer counts (101 or 152) in certain blocks and sets the corresponding layer names accordingly to differentiate them from other blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":210-231", + "content": " kernel_size=7,\n stride=2,\n act=\"relu\",\n name=\"conv1\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n conv_name,\n BottleneckBlock(\n # NOTE: Be careful! Here is different from TSM model." 
+ }, + { + "comment": "This code defines a ResNet model architecture with Bottleneck and Basic blocks. It dynamically creates the layers based on the input channels, output channels, depth, and stride values defined in the respective lists. The shortcut connection is used to skip layers or not, depending on the block number. Each block is added as a sublayer to the model's layer list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":232-251", + "content": " in_channels=in_channels[block]\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1," + }, + { + "comment": "The code defines a function for initializing the weights of a backbone model. If pretrained weights are specified, it loads them. Otherwise, it uses specific initialization methods for Conv2D and BatchNorm2d layers. It checks if pretrained weights are provided or not and acts accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":252-269", + "content": " shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():" + }, + { + "comment": "The code is defining a forward pass function for the backbone, which performs convolution and pooling operations. It also initializes layer weights based on their type (Conv2D or BatchNorm2D). 
The comments indicate that the input shape has been merged beforehand and reshaping is not necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py\":270-289", + "content": " if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27\n #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n y = self.conv(inputs)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bdc17646-5d44-4a81-aa6c-f1256733ce9a.json b/docs/doc/bdc17646-5d44-4a81-aa6c-f1256733ce9a.json new file mode 100644 index 000000000..58a52418e --- /dev/null +++ b/docs/doc/bdc17646-5d44-4a81-aa6c-f1256733ce9a.json @@ -0,0 +1,10 @@ +{ + "summary": "This line imports the functions and classes from the \"attention_lstm.py\" file in the same directory, allowing for easy access to those components within this module.", + "details": [ + { + "comment": "This line imports the functions and classes from the \"attention_lstm.py\" file in the same directory, allowing for easy access to those components within this module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/attention_lstm/__init__.py\":0-0", + "content": "from .attention_lstm import *" + } + ] +} \ No newline at end of file diff --git a/docs/doc/be92fedf-3fa2-4fde-bed4-4d1ec067a31c.json b/docs/doc/be92fedf-3fa2-4fde-bed4-4d1ec067a31c.json new file mode 100644 index 000000000..4bffe3fb9 --- /dev/null +++ b/docs/doc/be92fedf-3fa2-4fde-bed4-4d1ec067a31c.json @@ -0,0 +1,30 @@ +{ + "summary": "The code introduces the BMNINFReader class for data reading in PaddleVideo's TableTennis app, initializes a table tennis action detection class, creates a dataset, and defines an inference reader function.", + "details": [ + { + "comment": "This code defines a class BMNINFReader for data reading in PaddleVideo's TableTennis application. 
It uses the get_sw_prop function to retrieve valid proposal spans, filters them based on having at least one second of video duration, and returns the filtered proposal list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py\":0-48", + "content": "\"\"\"\n# @File : bmninf_reader.py\n# @Author: macaihong\n# @Date : 2019/12/15\n# @Desc :\n\"\"\"\nimport os\nimport random\nimport pickle\nimport json\nimport numpy as np\nimport multiprocessing\nimport numpy as np\nfrom .reader_utils import DataReader\ndef get_sw_prop(duration, window=200, step=10):\n \"\"\"\n get_sw_prop\n \"\"\"\n pr = []\n local_boxes = []\n for k in np.arange(0, duration - window + step, step):\n start_id = k\n end_id = min(duration, k + window)\n if end_id - start_id < window:\n start_id = end_id - window\n local_boxes = (start_id, end_id)\n pr.append(local_boxes)\n def valid_proposal(duration, span):\n \"\"\"\n valid_proposal\n \"\"\"\n # fileter proposals\n # a valid proposal should have at least one second in the video\n real_span = min(duration, span[1]) - span[0]\n return real_span >= 1\n pr = list(filter(lambda x: valid_proposal(duration, x), pr))\n return pr\nclass BMNINFReader(DataReader):\n \"\"\"\n Data reader for BMN model, which was stored as features extracted by prior networks" + }, + { + "comment": "This code is initializing a class that reads data from BMNINF files for table tennis action detection. The class takes in arguments like name, mode, configuration (cfg), and material. It sets the temporal length of BM map (tscale) and duration scale of BM map (dscale). It also calculates other values such as step size and uses them to reshape feature data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py\":49-71", + "content": " dataset cfg: feat_path, feature path,\n tscale, temporal length of BM map,\n dscale, duration scale of BM map,\n anchor_xmin, anchor_xmax, the range of each point in the feature sequence,\n batch_size, batch size of input data,\n num_threads, number of threads of data processing\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.tscale = cfg[self.name.upper()]['tscale'] # 200\n self.dscale = cfg[self.name.upper()]['dscale'] # 200\n # self.subset = cfg[self.name.upper()]['subset']\n self.tgap = 1. / self.tscale\n self.step = cfg[self.name.upper()]['window_step']\n self.material = material\n src_feature = self.material\n image_feature = src_feature['image_feature']\n # pcm_feature = src_feature['pcm_feature']\n # pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n # print(rgb_feature.shape, audio_feature.shape, pcm_feature.shape)" + }, + { + "comment": "This code is creating a dataset for video analysis. It concatenates image and audio features, sets the duration, window size, and retrieves the list of videos to be analyzed in the dataset. 
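The sliding-window behaviour of `get_sw_prop` summarised here can be checked with a minimal numpy sketch. The window/step defaults mirror the snippet; the example duration is invented.

```python
import numpy as np

def sliding_windows(duration, window=200, step=10):
    """Sketch of the sliding-window proposal generation described above."""
    spans = []
    for k in np.arange(0, duration - window + step, step):
        start, end = int(k), int(min(duration, k + window))
        if end - start < window:        # clamp the last window to the tail
            start = end - window
        spans.append((start, end))
    # keep only proposals that cover at least one second of video
    return [(s, e) for s, e in spans if min(duration, e) - s >= 1]

print(sliding_windows(430))   # [(0, 200), (10, 210), ..., (230, 430)]
```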
The code also handles test and infer modes by setting the number of threads accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py\":72-102", + "content": " # min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n #if min_length == 0:\n # continue\n # image_feature = image_feature[:min_length, :]\n # pcm_feature = pcm_feature[:min_length, :]\n # self.features = np.concatenate((image_feature, pcm_feature), axis=1)\n self.features = image_feature\n self.duration = len(self.features)\n self.window = self.tscale\n self.get_dataset_dict()\n self.get_match_map()\n self.batch_size = cfg[self.name.upper()]['batch_size']\n if (mode == 'test') or (mode == 'infer'):\n self.num_threads = 1 # set num_threads as 1 for test and infer\n def get_dataset_dict(self):\n \"\"\"\n get_dataset_dict\n \"\"\"\n self.video_list = get_sw_prop(self.duration, self.window, self.step)\n def get_match_map(self):\n \"\"\"\n get_match_map\n \"\"\"\n match_map = []\n for idx in range(self.tscale):\n tmp_match_window = []\n xmin = self.tgap * idx" + }, + { + "comment": "The code defines a class with methods to load video features, create a reader for ctcn model inference, and define match_map which seems to be related to table tennis action detection. The load_file method takes a window of feature ids and loads the corresponding video features. The create_reader method returns an inferential reader for the ctcn model. The make_infer_reader method is used to create the reader object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py\":103-132", + "content": " for jdx in range(1, self.tscale + 1):\n xmax = xmin + self.tgap * jdx\n tmp_match_window.append([xmin, xmax])\n match_map.append(tmp_match_window)\n match_map = np.array(match_map)\n match_map = np.transpose(match_map, [1, 0, 2])\n match_map = np.reshape(match_map, [-1, 2])\n self.match_map = match_map\n self.anchor_xmin = [self.tgap * i for i in range(self.tscale)]\n self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]\n def load_file(self, video_wind):\n \"\"\"\n load_file\n \"\"\"\n start_feat_id = video_wind[0]\n end_feat_id = video_wind[1]\n video_feat = self.features[video_wind[0]:video_wind[1]]\n video_feat = video_feat.T\n video_feat = video_feat.astype(\"float32\")\n return video_feat\n def create_reader(self):\n \"\"\"\n reader creator for ctcn model\n \"\"\"\n return self.make_infer_reader()\n def make_infer_reader(self):\n \"\"\"" + }, + { + "comment": "This code defines a reader function for inference that iterates over video files and appends data to a batch. 
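The yield-when-full / flush-at-the-end batching pattern used by the inference reader is easier to see in isolation; a generic sketch (not tied to the BMN reader):

```python
def make_batches(items, batch_size):
    """Generic sketch of the batching pattern: yield full batches, flush the rest."""
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:                      # emit the final partial batch, if any
        yield batch

print(list(make_batches(range(7), 3)))   # [[0, 1, 2], [3, 4, 5], [6]]
```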
It yields the batches when their size reaches the specified batch_size, and at the end of iteration if there's still remaining data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py\":133-153", + "content": " reader for inference\n \"\"\"\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n # for video_name in self.video_list:\n for video_wind in self.video_list:\n video_idx = self.video_list.index(video_wind)\n video_feat = self.load_file(video_wind)\n batch_out.append(\n (video_feat, video_wind, [self.duration, self.dscale]))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n if len(batch_out) > 0:\n yield batch_out\n return reader" + } + ] +} \ No newline at end of file diff --git a/docs/doc/be9b7893-fa30-4e2c-afd6-c05e2ddfbc5c.json b/docs/doc/be9b7893-fa30-4e2c-afd6-c05e2ddfbc5c.json new file mode 100644 index 000000000..75c8091ba --- /dev/null +++ b/docs/doc/be9b7893-fa30-4e2c-afd6-c05e2ddfbc5c.json @@ -0,0 +1,20 @@ +{ + "summary": "MoViNet, a lightweight Google Research video model, improves accuracy using causal convolution and temporal ensembles. PaddleVideo's code includes training/testing info, Kinetics-400 data, inference tools, configuration file, model file, parameter file, GPU usage, TensorRT settings, and example logs for processing videos.", + "details": [ + { + "comment": "MoViNet is a lightweight, efficient video model developed by Google research for online reasoning on video streams. It utilizes causal convolution operator with stream buffer and temporal ensembles to improve accuracy. The code provides details on how to train and test MoViNet using Kinetics-400 data, along with instructions for accessing the training logs to find test accuracy results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/movinet.md\":0-39", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/movinet.md) | English\n# MoViNet\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nMovinet is a mobile video network developed by Google research. It uses causal convolution operator with stream buffer and temporal ensembles to improve accuracy. It is a lightweight and efficient video model that can be used for online reasoning video stream.\n## Data\nPlease refer to Kinetics400 data download and preparation doc [k400-data](../../dataset/K400.md)\n## Train\n- Train MoViNet on kinetics-400 scripts:\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_movinet main.py --validate -c configs/recognition/movinet/movinet_k400_frame.yaml\n```\n## Test\n- For uniform sampling, test accuracy can be found in training-logs by search key word `best`, such as:\n```txt\nAlready save the best model (top1 acc)0.6489" + }, + { + "comment": "This code provides information on testing and inference for the MoViNet model. 
It includes commands for running tests, accessing accuracy results on Kinetics400, exporting inference models, and using the predict tool with example input files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/movinet.md\":40-72", + "content": "```\n- Test scripts:\n```bash\npython3.7 main.py --test -c configs/recognition/movinet/movinet_k400_frame.yaml -w output/MoViNet/MoViNet_best.pdparams\n```\nAccuracy on Kinetics400:\n| Config | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :--------: | :-------: | :-------: | :-----: | :-----: |\n| A0 | Uniform | 50 | 172 | 66.62 | [MoViNetA0_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/MoViNetA0_k400.pdparams) |\n## Inference\n### export inference model\n To get model architecture file `MoViNetA0.pdmodel` and parameters file `MoViNetA0.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/movinet/movinet_k400_frame.yaml \\\n -p data/MoViNetA0_k400.pdparams \\\n -o inference/MoViNetA0\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\" + }, + { + "comment": "The code specifies the configuration file, model file, and parameter file for the MoViNet model in PaddleVideo. It also sets the use of GPU as True and TensorRT as False. The example logs show the video file being processed and the top-1 class and score for video recognition.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/movinet.md\":73-90", + "content": " --config configs/recognition/movinet/movinet_k400_frame.yaml \\\n --model_file inference/MoViNetA0/MoViNet.pdmodel \\\n --params_file inference/MoViNetA0/MoViNet.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.7667049765586853\n```\n## Reference\n- [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bf30a94e-f8ad-448a-a836-7c2845544b79.json b/docs/doc/bf30a94e-f8ad-448a-a836-7c2845544b79.json new file mode 100644 index 000000000..2950a722e --- /dev/null +++ b/docs/doc/bf30a94e-f8ad-448a-a836-7c2845544b79.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file is a part of the PaddleVideo library and contains definitions for different head models (BaseHead, TSNHead, TSMRecHead) used in Video Quality Assessment. It imports these classes from other files within the modeling/heads directory and provides them to be used by other parts of the library.", + "details": [ + { + "comment": "This code file is a part of the PaddleVideo library and contains definitions for different head models (BaseHead, TSNHead, TSMRecHead) used in Video Quality Assessment. It imports these classes from other files within the modeling/heads directory and provides them to be used by other parts of the library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py\":0-20", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .base import BaseHead\nfrom .tsn_head import TSNHead\nfrom .tsm_rec_head import TSMRecHead\n__all__ = ['BaseHead', 'TSNHead', 'TSMRecHead']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bf9cdb75-d2a5-45c0-b919-4d876129c06e.json b/docs/doc/bf9cdb75-d2a5-45c0-b919-4d876129c06e.json new file mode 100644 index 000000000..1d5afd3d6 --- /dev/null +++ b/docs/doc/bf9cdb75-d2a5-45c0-b919-4d876129c06e.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file contains import statements and a list of functions used for image preprocessing in PaddleVideo's EIVideo application. It includes Resize, RandomCrop, RandomHorizontalFlip, ToTensor, and RandomScale transformations specific to the \"manet\" model. These transformations are part of PaddlePaddle's video processing framework.", + "details": [ + { + "comment": "This code file contains import statements and a list of functions used for image preprocessing in PaddleVideo's EIVideo application. It includes Resize, RandomCrop, RandomHorizontalFlip, ToTensor, and RandomScale transformations specific to the \"manet\" model. These transformations are part of PaddlePaddle's video processing framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py\":0-20", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .custom_transforms_f import Resize_manet, RandomCrop_manet, RandomHorizontalFlip_manet, ToTensor_manet, \\\n RandomScale_manet\n__all__ = [\n 'Resize_manet', 'RandomCrop_manet',\n 'RandomHorizontalFlip_manet', 'ToTensor_manet', 'RandomScale_manet',\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bfcf0ae4-ed3d-4567-81a9-0db6efb71708.json b/docs/doc/bfcf0ae4-ed3d-4567-81a9-0db6efb71708.json new file mode 100644 index 000000000..1e249065f --- /dev/null +++ b/docs/doc/bfcf0ae4-ed3d-4567-81a9-0db6efb71708.json @@ -0,0 +1,10 @@ +{ + "summary": "This code loads a JSON file, splits the ground truth sequences (gts) into training and validation sets, and saves them as separate JSON files. It uses the json module for reading and writing JSON data. The original file is labeled 'label_cls14_train.json' and has gts from index 0 to 4 in the validation set, and gts from index 5 onwards in the training set. 
The code also writes a new validation set in '/home/aistudio/data/label_cls14_val.json' with the same fps (25).", + "details": [ + { + "comment": "This code loads a JSON file, splits the ground truth sequences (gts) into training and validation sets, and saves them as separate JSON files. It uses the json module for reading and writing JSON data. The original file is labeled 'label_cls14_train.json' and has gts from index 0 to 4 in the validation set, and gts from index 5 onwards in the training set. The code also writes a new validation set in '/home/aistudio/data/label_cls14_val.json' with the same fps (25).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/val_split.py\":0-18", + "content": "import json\nwith open('/home/aistudio/data/label_cls14_train.json') as f:\n data = json.load(f)\nf.close()\nval = {'gts': data['gts'][0:5], 'fps': 25}\njsonString = json.dumps(val, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/data/label_cls14_val.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()\ntrain = {'gts': data['gts'][5:], 'fps': 25}\njsonString = json.dumps(train, indent=4, ensure_ascii=False)\njsonFile = open('/home/aistudio/data/label_cls14_train.json', 'w')\njsonFile.write(jsonString)\njsonFile.close()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/bff320f1-95c0-46f2-8cc0-af254f2c92ff.json b/docs/doc/bff320f1-95c0-46f2-8cc0-af254f2c92ff.json new file mode 100644 index 000000000..3c0b07ae2 --- /dev/null +++ b/docs/doc/bff320f1-95c0-46f2-8cc0-af254f2c92ff.json @@ -0,0 +1,55 @@ +{ + "summary": "The code introduces a spatial-temporal detection dataset class in PaddleVideo, initializes attributes and evaluation functions, loads records from paths, prepares training data by filtering proposals and annotations, pads elements to fixed lengths, and defines methods for padding 2D/1D features.", + "details": [ + { + "comment": "This code snippet is the AVA dataset class for spatial-temporal detection, which is part of PaddleVideo. It imports necessary modules and registers the dataset in the DATASETS registry. The class inherits from BaseDataset and includes a function ava_evaluate_results for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":0-31", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nimport sys\nimport os\nimport pickle\nfrom datetime import datetime\nfrom ...metrics.ava_utils import ava_evaluate_results\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom collections import defaultdict\n@DATASETS.register()\nclass AVADataset(BaseDataset):\n \"\"\"AVA dataset for spatial temporal detection.\n the dataset loads raw frames, bounding boxes, proposals and applies" + }, + { + "comment": "This code is initializing a class with various parameters for the AvaDataset. 
It sets default values and performs checks on input values, such as ensuring 'person_det_score_thr' falls within 0 to 1 range. The code also initializes instance variables, including custom classes, exclude file path, label file path, proposal file path, and more.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":32-61", + "content": " transformations to return the frame tensors and other information.\n \"\"\"\n _FPS = 30\n def __init__(self,\n pipeline,\n file_path=None,\n exclude_file=None,\n label_file=None,\n suffix='{:05}.jpg',\n proposal_file=None,\n person_det_score_thr=0.9,\n num_classes=81,\n data_prefix=None,\n test_mode=False,\n num_max_proposals=1000,\n timestamp_start=900,\n timestamp_end=1800):\n self.custom_classes = None\n self.exclude_file = exclude_file\n self.label_file = label_file\n self.proposal_file = proposal_file\n assert 0 <= person_det_score_thr <= 1, (\n 'The value of '\n 'person_det_score_thr should in [0, 1]. ')\n self.person_det_score_thr = person_det_score_thr\n self.num_classes = num_classes\n self.suffix = suffix\n self.num_max_proposals = num_max_proposals" + }, + { + "comment": "The code snippet initializes class attributes and checks for proposal file. If the proposal file exists, it loads the proposals; otherwise, it sets them as None. It then filters out invalid indexes if not in test mode. The code also includes a method to load data from a given path using pickle and close the file afterward. Another method parses img_records by extracting bounding boxes, labels, and entity IDs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":62-92", + "content": " self.timestamp_start = timestamp_start\n self.timestamp_end = timestamp_end\n super().__init__(\n file_path,\n pipeline,\n data_prefix,\n test_mode,\n )\n if self.proposal_file is not None:\n self.proposals = self._load(self.proposal_file)\n else:\n self.proposals = None\n if not test_mode:\n valid_indexes = self.filter_exclude_file()\n self.info = self.info = [self.info[i] for i in valid_indexes]\n def _load(self, path):\n f = open(path, 'rb')\n res = pickle.load(f)\n f.close()\n return res\n def parse_img_record(self, img_records):\n bboxes, labels, entity_ids = [], [], []\n while len(img_records) > 0:\n img_record = img_records[0]\n num_img_records = len(img_records)\n selected_records = list(\n filter(\n lambda x: np.array_equal(x['entity_box'], img_record[\n 'entity_box']), img_records))" + }, + { + "comment": "This code is filtering out specific records from the dataset. It checks if the entity box of each record matches with a given img_record's entity box, excluding them if they do. If there are no exclude file information, it includes all the records in valid_indexes. 
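A toy version of the grouping performed by `parse_img_record`: all records that share the same `entity_box` are merged into one multi-hot label vector. The boxes, labels and record values below are invented for illustration; only the class count (81) comes from the code above.

```python
import numpy as np

num_classes = 81   # AVA default used above
records = [        # toy records: the first two share one entity_box
    {"entity_box": np.array([0.1, 0.2, 0.5, 0.9]), "label": 12, "entity_id": 0},
    {"entity_box": np.array([0.1, 0.2, 0.5, 0.9]), "label": 17, "entity_id": 0},
    {"entity_box": np.array([0.6, 0.1, 0.9, 0.8]), "label": 80, "entity_id": 1},
]

boxes, labels, entity_ids = [], [], []
while records:
    ref = records[0]["entity_box"]
    same = [r for r in records if np.array_equal(r["entity_box"], ref)]
    records = [r for r in records if not np.array_equal(r["entity_box"], ref)]
    boxes.append(ref)
    multi_hot = np.zeros(num_classes, dtype=np.float32)
    multi_hot[[r["label"] for r in same]] = 1.0      # all actions of this box
    labels.append(multi_hot)
    entity_ids.append(same[0]["entity_id"])

print(np.stack(boxes).shape, np.stack(labels).shape)   # (2, 4) (2, 81)
```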
Finally, it stacks and returns bboxes, labels, and entity_ids for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":93-121", + "content": " num_selected_records = len(selected_records)\n img_records = list(\n filter(\n lambda x: not np.array_equal(x['entity_box'], img_record[\n 'entity_box']), img_records))\n assert len(img_records) + num_selected_records == num_img_records\n bboxes.append(img_record['entity_box'])\n valid_labels = np.array([\n selected_record['label'] for selected_record in selected_records\n ])\n label = np.zeros(self.num_classes, dtype=np.float32)\n label[valid_labels] = 1.\n labels.append(label)\n entity_ids.append(img_record['entity_id'])\n bboxes = np.stack(bboxes)\n labels = np.stack(labels)\n entity_ids = np.stack(entity_ids)\n return bboxes, labels, entity_ids\n def filter_exclude_file(self):\n valid_indexes = []\n if self.exclude_file is None:\n valid_indexes = list(range(len(self.info)))\n else:\n exclude_video_infos = [" + }, + { + "comment": "The code reads a file, splits each line into video ID, timestamp, and other data. It then checks for any exclusion videos based on the ID and timestamp. If found, it removes that index from the valid_indexes list. Finally, it returns the updated valid_indexes list. The load_file method reads the file, extracts information including video ID, timestamp, entity box, label, and entity ID for each line.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":122-147", + "content": " x.strip().split(',') for x in open(self.exclude_file)\n ]\n for i, video_info in enumerate(self.info):\n valid_indexes.append(i)\n for video_id, timestamp in exclude_video_infos:\n if (video_info['video_id'] == video_id\n and video_info['timestamp'] == int(timestamp)):\n valid_indexes.pop()\n break\n return valid_indexes\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n records_dict_by_img = defaultdict(list)\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split(',')\n video_id = line_split[0]\n timestamp = int(line_split[1])\n img_key = f'{video_id},{timestamp:04d}'\n entity_box = np.array(list(map(float, line_split[2:6])))\n label = int(line_split[6])\n entity_id = int(line_split[7])" + }, + { + "comment": "The code initializes `shot_info` based on the timestamp range and FPS, then creates a `video_info` dictionary containing various video details. It appends this information to the `records_dict_by_img` for each `img_key`. Next, it extracts video ID and timestamp from `img_key`, calls `parse_img_record()`, and stores the resulting bounding boxes, labels, and entity IDs in an `ann` dictionary. 
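Each annotation line is a comma-separated record of video id, timestamp, four box coordinates, an action label and an entity id; parsing one such line could look like the sketch below (the sample line and video id are hypothetical).

```python
import numpy as np

# Hypothetical CSV line: video_id, timestamp, x1, y1, x2, y2, label, entity_id
line = "sample_vid,0902,0.077,0.151,0.283,0.811,80,1"
video_id, ts, *box, label, entity_id = line.strip().split(",")
img_key = f"{video_id},{int(ts):04d}"
entity_box = np.array(list(map(float, box)))
print(img_key, entity_box, int(label), int(entity_id))
# sample_vid,0902 [0.077 0.151 0.283 0.811] 80 1
```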
Finally, it sets the frame directory path and adds a new `video_info` dictionary for each video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":148-169", + "content": " shot_info = (0, (self.timestamp_end - self.timestamp_start) *\n self._FPS)\n video_info = dict(video_id=video_id,\n timestamp=timestamp,\n entity_box=entity_box,\n label=label,\n entity_id=entity_id,\n shot_info=shot_info)\n records_dict_by_img[img_key].append(video_info)\n for img_key in records_dict_by_img:\n video_id, timestamp = img_key.split(',')\n bboxes, labels, entity_ids = self.parse_img_record(\n records_dict_by_img[img_key])\n ann = dict(gt_bboxes=bboxes,\n gt_labels=labels,\n entity_ids=entity_ids)\n frame_dir = video_id\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n video_info = dict(frame_dir=frame_dir," + }, + { + "comment": "The code initializes a video information object with the provided parameters, including the video ID, timestamp, image key, shot info, FPS, and annotations. It then appends this object to a list of video information. The prepare_train method takes an index, creates a copy of the corresponding video information from the list, adds suffix and timestamp information if applicable, and populates proposals with default values if not present in self.proposals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":170-196", + "content": " video_id=video_id,\n timestamp=int(timestamp),\n img_key=img_key,\n shot_info=shot_info,\n fps=self._FPS,\n ann=ann)\n info.append(video_info)\n return info\n def prepare_train(self, idx):\n results = copy.deepcopy(self.info[idx])\n img_key = results['img_key']\n results['suffix'] = self.suffix\n results['timestamp_start'] = self.timestamp_start\n results['timestamp_end'] = self.timestamp_end\n if self.proposals is not None:\n if img_key not in self.proposals:\n results['proposals'] = np.array([[0, 0, 1, 1]])\n results['scores'] = np.array([1])\n else:\n proposals = self.proposals[img_key]\n assert proposals.shape[-1] in [4, 5]\n if proposals.shape[-1] == 5:\n thr = min(self.person_det_score_thr, max(proposals[:, 4]))" + }, + { + "comment": "This code is filtering and padding proposals and annotations for a dataset. It selects positive proposals based on a threshold, limits the number of proposals to the maximum allowed, and assigns the results to different categories. If there are no positive proposals, it simply limits the number and assigns them. After that, it retrieves ground truth bounding boxes, labels, and entity IDs from the 'ann' dictionary. 
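The score-threshold filtering described here (keep boxes whose detector score passes min(person_det_score_thr, best score), then cap the count) can be sketched with numpy; the toy proposals are made up.

```python
import numpy as np

def select_proposals(proposals, det_score_thr=0.9, num_max=1000):
    """Sketch of the filtering above: 5-column proposals carry a detector score;
    keep boxes scoring >= min(threshold, best score), then cap the count."""
    if proposals.shape[-1] == 5:
        thr = min(det_score_thr, proposals[:, 4].max())
        kept = proposals[proposals[:, 4] >= thr][:num_max]
        return kept[:, :4], kept[:, 4]
    return proposals[:num_max], None

props = np.array([[0, 0, 1, 1, 0.95],
                  [0, 0, 1, 1, 0.40]], dtype=np.float32)
boxes, scores = select_proposals(props)
print(boxes.shape, scores)   # (1, 4) [0.95]
```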
Finally, the code pads the proposals, scores, and other elements with zeros to reach a fixed length of 128 using a custom padding function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":197-220", + "content": " positive_inds = (proposals[:, 4] >= thr)\n proposals = proposals[positive_inds]\n proposals = proposals[:self.num_max_proposals]\n results['proposals'] = proposals[:, :4]\n results['scores'] = proposals[:, 4]\n else:\n proposals = proposals[:self.num_max_proposals]\n results['proposals'] = proposals\n ann = results.pop('ann')\n results['gt_bboxes'] = ann['gt_bboxes']\n results['gt_labels'] = ann['gt_labels']\n results['entity_ids'] = ann['entity_ids']\n #ret = self.pipeline(results, \"\")\n ret = self.pipeline(results)\n #padding for dataloader\n len_proposals = ret['proposals'].shape[0]\n len_gt_bboxes = ret['gt_bboxes'].shape[0]\n len_gt_labels = ret['gt_labels'].shape[0]\n len_scores = ret['scores'].shape[0]\n len_entity_ids = ret['entity_ids'].shape[0]\n padding_len = 128\n ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len)" + }, + { + "comment": "This code snippet defines a class with methods for padding 2D and 1D features. The 'my_padding_2d' method takes a feature matrix and pads it with zeros to the maximum length specified, while the 'my_padding_1d' method does the same but for 1D features. These methods are then called in another function to pad various feature matrices before returning them along with other variables.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":221-239", + "content": " ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len)\n ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], padding_len)\n ret['scores'] = self.my_padding_1d(ret['scores'], padding_len)\n ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len)\n return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[\n 'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[\n 'entity_ids'], np.array(\n ret['img_shape'], dtype=int\n ), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids\n def my_padding_2d(self, feat, max_len):\n feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]),\n dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n return feat_pad\n def my_padding_1d(self, feat, max_len):\n feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)" + }, + { + "comment": "The code defines three functions: 'prepare_train', 'prepare_test', and 'evaluate'. The 'prepare_train' function is used to prepare training data given an index, while the 'prepare_test' function returns the same as 'prepare_train'. 
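The zero-padding helpers behave like this minimal numpy sketch (padding length 128, as in the comment above):

```python
import numpy as np

def pad_2d(feat, max_len=128):
    """Zero-pad a [n, d] array to [max_len, d], as in my_padding_2d."""
    pad = np.zeros((max_len - feat.shape[0], feat.shape[1]), dtype=np.float32)
    return np.concatenate([feat, pad], axis=0)

def pad_1d(feat, max_len=128):
    """Zero-pad a [n] array to [max_len], as in my_padding_1d."""
    pad = np.zeros((max_len - feat.shape[0],), dtype=np.float32)
    return np.concatenate([feat, pad], axis=0)

print(pad_2d(np.ones((7, 4), np.float32)).shape)   # (128, 4)
print(pad_1d(np.ones(7, np.float32)).shape)        # (128,)
```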
The 'evaluate' function evaluates the results using 'ava_evaluate_results' by passing various arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/ava_dataset.py\":240-248", + "content": " return feat_pad\n def prepare_test(self, idx):\n return self.prepare_train(idx)\n def evaluate(self, results):\n return ava_evaluate_results(self.info, len(self), results,\n self.custom_classes, self.label_file,\n self.file_path, self.exclude_file)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c009818a-14d8-4300-8db0-70ffcf3fab9d.json b/docs/doc/c009818a-14d8-4300-8db0-70ffcf3fab9d.json new file mode 100644 index 000000000..9af6f96f7 --- /dev/null +++ b/docs/doc/c009818a-14d8-4300-8db0-70ffcf3fab9d.json @@ -0,0 +1,30 @@ +{ + "summary": "This code initializes global profiling variables and defines the ProfilerOptions class for operator-level timing using PaddlePaddle's profiler. It also stops the profiler, checks for exit conditions, and increments _profiler_step_id.", + "details": [ + { + "comment": "This code is setting up a global variable to record the number of calling times for profiler functions and another global variable to avoid parsing from string every time. It also defines the ProfilerOptions class, which can be initialized using a string in the format \"key1=value1;key2=value;key3=value3\". This indicates that the code is part of PaddleVideo's EIVideo application and is related to profiling options and step ID management.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py\":0-28", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nimport paddle\n# A global variable to record the number of calling times for profiler\n# functions. It is used to specify the tracing range of training steps.\n_profiler_step_id = 0\n# A global variable to avoid parsing from string every time.\n_profiler_options = None\nclass ProfilerOptions(object):\n \"\"\"\n Use a string to initialize a ProfilerOptions.\n The string should be in the format: \"key1=value1;key2=value;key3=value3\"." + }, + { + "comment": "The code defines a class \"ProfilerOptions\" which takes in an options string and initializes its attributes. Options can include batch_range, state (CPU/GPU/All), sorted_key (calls/total/max/min/ave), tracer_option (Default/OpDetail/AllOpDetail), profile_path for storing serialized data, and exit_on_finished boolean flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py\":29-51", + "content": " For example:\n \"profile_path=model.profile\"\n \"batch_range=[50, 60]; profile_path=model.profile\"\n \"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile\"\n ProfilerOptions supports following key-value pair:\n batch_range - a integer list, e.g. 
[100, 110].\n state - a string, the optional values are 'CPU', 'GPU' or 'All'.\n sorted_key - a string, the optional values are 'calls', 'total',\n 'max', 'min' or 'ave.\n tracer_option - a string, the optional values are 'Default', 'OpDetail',\n 'AllOpDetail'.\n profile_path - a string, the path to save the serialized profile data,\n which can be used to generate a timeline.\n exit_on_finished - a boolean.\n \"\"\"\n def __init__(self, options_str):\n assert isinstance(options_str, str)\n self._options = {\n 'batch_range': [10, 20],\n 'state': 'All',\n 'sorted_key': 'total'," + }, + { + "comment": "Class for parsing profile options from a string. It stores the batch range, tracer option, exit on finished status, state, sorted key, and profile path as options. The _parse_from_string function sets the values based on specific conditions: if the 'batch_range' is valid, 'exit_on_finished' is set to True if the value matches \"yes\", \"true\", \"t\", or \"1\", and other options are directly assigned from the string. If an option name is not found in the string, it returns None.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py\":52-75", + "content": " 'tracer_option': 'Default',\n 'profile_path': '/tmp/profile',\n 'exit_on_finished': True\n }\n self._parse_from_string(options_str)\n def _parse_from_string(self, options_str):\n for kv in options_str.replace(' ', '').split(';'):\n key, value = kv.split('=')\n if key == 'batch_range':\n value_list = value.replace('[', '').replace(']', '').split(',')\n value_list = list(map(int, value_list))\n if len(value_list) >= 2 and value_list[0] >= 0 and value_list[\n 1] > value_list[0]:\n self._options[key] = value_list\n elif key == 'exit_on_finished':\n self._options[key] = value.lower() in (\"yes\", \"true\", \"t\", \"1\")\n elif key in [\n 'state', 'sorted_key', 'tracer_option', 'profile_path'\n ]:\n self._options[key] = value\n def __getitem__(self, name):\n if self._options.get(name, None) is None:" + }, + { + "comment": "This function enables the operator-level timing using PaddlePaddle's profiler. It initializes the ProfilerOptions with a provided string and increments the global profiler step id. 
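A rough, stand-alone sketch of the "key1=value1;key2=value2" parsing described here; only a few of the option keys are handled and the defaults shown are illustrative, not the full ProfilerOptions class.

```python
def parse_profiler_options(options_str):
    """Sketch of the options-string parsing (subset of keys, illustrative defaults)."""
    options = {"batch_range": [10, 20], "state": "All", "exit_on_finished": True}
    for kv in options_str.replace(" ", "").split(";"):
        key, value = kv.split("=")
        if key == "batch_range":
            lo, hi = map(int, value.strip("[]").split(","))
            if 0 <= lo < hi:               # only accept a valid, increasing range
                options[key] = [lo, hi]
        elif key == "exit_on_finished":
            options[key] = value.lower() in ("yes", "true", "t", "1")
        else:
            options[key] = value
    return options

print(parse_profiler_options("batch_range=[50,60];profile_path=model.profile"))
```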
If the current step matches the start or end of the batch range in the options, it starts or stops the profiler respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py\":76-103", + "content": " raise ValueError(\n \"ProfilerOptions does not have an option named %s.\" % name)\n return self._options[name]\ndef add_profiler_step(options_str=None):\n \"\"\"\n Enable the operator-level timing using PaddlePaddle's profiler.\n The profiler uses a independent variable to count the profiler steps.\n One call of this function is treated as a profiler step.\n Args:\n profiler_options - a string to initialize the ProfilerOptions.\n Default is None, and the profiler is disabled.\n \"\"\"\n if options_str is None:\n return\n global _profiler_step_id\n global _profiler_options\n if _profiler_options is None:\n _profiler_options = ProfilerOptions(options_str)\n if _profiler_step_id == _profiler_options['batch_range'][0]:\n paddle.utils.profiler.start_profiler(_profiler_options['state'],\n _profiler_options['tracer_option'])\n elif _profiler_step_id == _profiler_options['batch_range'][1]:" + }, + { + "comment": "This code snippet stops the profiler, checks if it should exit on finished, and increments _profiler_step_id.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py\":104-109", + "content": " paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],\n _profiler_options['profile_path'])\n if _profiler_options['exit_on_finished']:\n sys.exit(0)\n _profiler_step_id += 1" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c03d34db-0b44-4eaf-b7cd-10271f89d591.json b/docs/doc/c03d34db-0b44-4eaf-b7cd-10271f89d591.json new file mode 100644 index 000000000..9fa815346 --- /dev/null +++ b/docs/doc/c03d34db-0b44-4eaf-b7cd-10271f89d591.json @@ -0,0 +1,75 @@ +{ + "summary": "The code defines the Deep Residual Network (DRN) model and MA-Net architecture in PaddlePaddle, with various configurations and optional pre-trained weights. It also includes low-level feature retention through processing inputs and can be tested using examples.", + "details": [ + { + "comment": "This code defines a class BasicBlock, which is an extension of the nn.Layer class in PaddlePaddle's library. It contains a convolution layer with 3x3 kernel size and optional downsampling using a stride greater than 1. The BasicBlock has an expansion parameter set to 1, indicating no change in the input and output channel dimensions. 
There are pre-trained models available for download from the specified URLs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":0-28", + "content": "import paddle.nn as nn\nimport math\nwebroot = 'https://tigress-web.princeton.edu/~fy/drn/models/'\nmodel_urls = {\n 'resnet50': 'https://download.pypaddle.org/models/resnet50-19c8e357.pth',\n 'drn-c-26': webroot + 'drn_c_26-ddedf421.pth',\n 'drn-c-42': webroot + 'drn_c_42-9d336e8c.pth',\n 'drn-c-58': webroot + 'drn_c_58-0a53a92c.pth',\n 'drn-d-22': webroot + 'drn_d_22-4bd2f8ea.pth',\n 'drn-d-38': webroot + 'drn_d_38-eebb45f0.pth',\n 'drn-d-54': webroot + 'drn_d_54-0e0534ff.pth',\n 'drn-d-105': webroot + 'drn_d_105-12b40979.pth'\n}\ndef conv3x3(in_planes, out_planes, stride=1, padding=1, dilation=1):\n return nn.Conv2D(in_planes, out_planes, kernel_size=3, stride=stride,\n padding=padding, bias_attr=False, dilation=dilation)\nclass BasicBlock(nn.Layer):\n expansion = 1\n def __init__(self, inplanes, planes, stride=1, downsample=None,\n dilation=(1, 1), residual=True, BatchNorm=None):\n super(BasicBlock, self).__init__()\n self.conv1 = conv3x3(inplanes, planes, stride," + }, + { + "comment": "This code defines a residual block with BatchNormalization and ReLU activation, using convolutions and optional downsampling. The Bottleneck class also includes a 1x1 convolution and has an expansion factor of 4.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":29-64", + "content": " padding=dilation[0], dilation=dilation[0])\n self.bn1 = BatchNorm(planes)\n self.relu = nn.ReLU()\n self.conv2 = conv3x3(planes, planes,\n padding=dilation[1], dilation=dilation[1])\n self.bn2 = BatchNorm(planes)\n self.downsample = downsample\n self.stride = stride\n self.residual = residual\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n if self.residual:\n out += residual\n out = self.relu(out)\n return out\nclass Bottleneck(nn.Layer):\n expansion = 4\n def __init__(self, inplanes, planes, stride=1, downsample=None,\n dilation=(1, 1), residual=True, BatchNorm=None):\n super(Bottleneck, self).__init__()\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)" + }, + { + "comment": "This code defines a DRN (Deep Residual Network) model with residual blocks. It includes batch normalization, convolutional layers, and ReLU activation functions. 
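The control flow of the residual forward pass summarised here, boiled down to plain Python with callables standing in for the paddle layers (a sketch of the pattern, not the layer itself):

```python
def block_forward(x, conv1, bn1, conv2, bn2, relu, downsample=None, residual=True):
    """Control flow of a residual block: conv-bn-relu, conv-bn, optional
    downsample on the skip path, add, final relu."""
    identity = x
    out = relu(bn1(conv1(x)))
    out = bn2(conv2(out))
    if downsample is not None:     # reshape the skip path when stride/width change
        identity = downsample(x)
    if residual:
        out = out + identity       # the skip connection
    return relu(out)

ident = lambda v: v
print(block_forward(2.0, ident, ident, ident, ident, lambda v: max(v, 0.0)))   # 4.0
```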
The forward method applies the layers sequentially and performs residual connections if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":65-102", + "content": " self.bn1 = BatchNorm(planes)\n self.conv2 = nn.Conv2D(planes, planes, kernel_size=3, stride=stride,\n padding=dilation[1], bias_attr=False,\n dilation=dilation[1])\n self.bn2 = BatchNorm(planes)\n self.conv3 = nn.Conv2D(planes, planes * 4, kernel_size=1, bias_attr=False)\n self.bn3 = BatchNorm(planes * 4)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n def forward(self, x):\n residual = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass DRN(nn.Layer):\n def __init__(self, block, layers, arch='D',\n channels=(16, 32, 64, 128, 256, 512, 512, 512)," + }, + { + "comment": "This code defines a DRN class that inherits from an unknown base class. It initializes the object with specified number of channels, layers, and architecture type ('C' or 'D'). The constructor creates different layers depending on the architecture: for 'C', it includes convolutional and pooling layers with BatchNorm and ReLU activation; for 'D', it only includes a convolutional layer followed by BatchNorm and ReLU activation, then adds more convolutional layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":103-129", + "content": " BatchNorm=None):\n super(DRN, self).__init__()\n self.inplanes = channels[0]\n self.out_dim = channels[-1]\n self.arch = arch\n if arch == 'C':\n self.conv1 = nn.Conv2D(3, channels[0], kernel_size=7, stride=1,\n padding=3, bias_attr=False)\n self.bn1 = BatchNorm(channels[0])\n self.relu = nn.ReLU()\n self.layer1 = self._make_layer(\n BasicBlock, channels[0], layers[0], stride=1, BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(\n BasicBlock, channels[1], layers[1], stride=2, BatchNorm=BatchNorm)\n elif arch == 'D':\n self.layer0 = nn.Sequential(\n nn.Conv2D(3, channels[0], kernel_size=7, stride=1, padding=3,\n bias_attr=False),\n BatchNorm(channels[0]),\n nn.ReLU()\n )\n self.layer1 = self._make_conv_layers(\n channels[0], layers[0], stride=1, BatchNorm=BatchNorm)" + }, + { + "comment": "The code defines a network architecture with six potential layers (2-6) using the provided block, and two additional layers (7 & 8) if the architecture is 'C'. Each layer has a specific number of channels, layers, and dilation rate. The last three layers can be set to None if their corresponding number of layers is 0. 
Batch Normalization is applied to each layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":130-146", + "content": " self.layer2 = self._make_conv_layers(\n channels[1], layers[1], stride=2, BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block, channels[2], layers[2], stride=2, BatchNorm=BatchNorm)\n self.layer4 = self._make_layer(block, channels[3], layers[3], stride=2, BatchNorm=BatchNorm)\n self.layer5 = self._make_layer(block, channels[4], layers[4],\n dilation=2, new_level=False, BatchNorm=BatchNorm)\n self.layer6 = None if layers[5] == 0 else \\\n self._make_layer(block, channels[5], layers[5], dilation=4,\n new_level=False, BatchNorm=BatchNorm)\n if arch == 'C':\n self.layer7 = None if layers[6] == 0 else \\\n self._make_layer(BasicBlock, channels[6], layers[6], dilation=2,\n new_level=False, residual=False, BatchNorm=BatchNorm)\n self.layer8 = None if layers[7] == 0 else \\\n self._make_layer(BasicBlock, channels[7], layers[7], dilation=1," + }, + { + "comment": "This code defines a network backbone for the MA-Net model in PaddleVideo. It includes layers 1 to 8 with optional activation, residual connections, and batch normalization. The `_init_weight` function initializes weights for convolutional and batch normalization layers, while `_make_layer` creates each layer of the backbone based on the specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":147-169", + "content": " new_level=False, residual=False, BatchNorm=BatchNorm)\n elif arch == 'D':\n self.layer7 = None if layers[6] == 0 else \\\n self._make_conv_layers(channels[6], layers[6], dilation=2, BatchNorm=BatchNorm)\n self.layer8 = None if layers[7] == 0 else \\\n self._make_conv_layers(channels[7], layers[7], dilation=1, BatchNorm=BatchNorm)\n self._init_weight()\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n m.weight.normal_(0, math.sqrt(2. / n))\n elif isinstance(m, nn.BatchNorm2D):\n from manet_paddle.utils.api import fill_\n fill_(m.weight, 1)\n from manet_paddle.utils.api import zero_\n zero_(m.bias)\n def _make_layer(self, block, planes, blocks, stride=1, dilation=1,\n new_level=True, residual=True, BatchNorm=None):" + }, + { + "comment": "This code is creating a network layer with multiple blocks. It checks the stride and dilation to determine if downsampling is required, then constructs a Sequential module of convolutional layers using the provided number of blocks, channels, and convolutions. 
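A quick way to see why the dilated stages keep their spatial resolution: with padding equal to the dilation, a 3x3 convolution preserves the input size, while stride 2 halves it. This is standard convolution arithmetic, shown here only as a sketch.

```python
def conv_out_size(size, kernel=3, stride=1, padding=1, dilation=1):
    """Output spatial size of a convolution (standard formula)."""
    effective = dilation * (kernel - 1) + 1
    return (size + 2 * padding - effective) // stride + 1

print(conv_out_size(64, padding=1, dilation=1))   # 64  (plain 3x3 keeps size)
print(conv_out_size(64, padding=2, dilation=2))   # 64  (dilated, padding == dilation)
print(conv_out_size(64, stride=2))                # 32  (stride-2 downsampling)
```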
The BatchNorm function is an optional parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":170-192", + "content": " assert dilation == 1 or dilation % 2 == 0\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes, planes * block.expansion,\n kernel_size=1, stride=stride, bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = list()\n layers.append(block(\n self.inplanes, planes, stride, downsample,\n dilation=(1, 1) if dilation == 1 else (\n dilation // 2 if new_level else dilation, dilation),\n residual=residual, BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(block(self.inplanes, planes, residual=residual,\n dilation=(dilation, dilation), BatchNorm=BatchNorm))\n return nn.Sequential(*layers)\n def _make_conv_layers(self, channels, convs, stride=1, dilation=1, BatchNorm=None):" + }, + { + "comment": "The code defines a DRN (Deep Residual Network) backbone with multiple layers. It first creates a list of modules containing convolutional layers, batch normalization, and ReLU activation. The `forward` function handles different architectures ('C' or 'D') and processes input through various layers while retaining low-level features. The DRN_A class extends the functionality with more layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":193-233", + "content": " modules = []\n for i in range(convs):\n modules.extend([\n nn.Conv2D(self.inplanes, channels, kernel_size=3,\n stride=stride if i == 0 else 1,\n padding=dilation, bias_attr=False, dilation=dilation),\n BatchNorm(channels),\n nn.ReLU()])\n self.inplanes = channels\n return nn.Sequential(*modules)\n def forward(self, x):\n if self.arch == 'C':\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n elif self.arch == 'D':\n x = self.layer0(x)\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n low_level_feat = x\n x = self.layer4(x)\n x = self.layer5(x)\n if self.layer6 is not None:\n x = self.layer6(x)\n if self.layer7 is not None:\n x = self.layer7(x)\n if self.layer8 is not None:\n x = self.layer8(x)\n return x, low_level_feat\nclass DRN_A(nn.Layer):" + }, + { + "comment": "The code defines a DRN_A class that is a type of backbone network. It has an __init__ method initializing parameters, and includes a Conv2D layer, BatchNorm layer, ReLU activation, MaxPool2D layer, and several _make_layer methods for creating different layers with varying dimensions and strides. 
The _init_weight method is used to initialize the weights of the convolution layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":235-256", + "content": " def __init__(self, block, layers, BatchNorm=None):\n self.inplanes = 64\n super(DRN_A, self).__init__()\n self.out_dim = 512 * block.expansion\n self.conv1 = nn.Conv2D(3, 64, kernel_size=7, stride=2, padding=3,\n bias_attr=False)\n self.bn1 = BatchNorm(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block, 64, layers[0], BatchNorm=BatchNorm)\n self.layer2 = self._make_layer(block, 128, layers[1], stride=2, BatchNorm=BatchNorm)\n self.layer3 = self._make_layer(block, 256, layers[2], stride=1,\n dilation=2, BatchNorm=BatchNorm)\n self.layer4 = self._make_layer(block, 512, layers[3], stride=1,\n dilation=4, BatchNorm=BatchNorm)\n self._init_weight()\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels" + }, + { + "comment": "The code defines a function _make_layer that creates layers of a specified block with the given number of blocks, planes, and stride. It also handles downsampling if needed and initializes the weights for BatchNorm2D layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":257-278", + "content": " m.weight.normal_(0, math.sqrt(2. / n))\n elif isinstance(m, nn.BatchNorm2D):\n from manet_paddle.utils.api import fill_\n fill_(m.weight, 1)\n from manet_paddle.utils.api import zero_\n zero_(m.bias)\n def _make_layer(self, block, planes, blocks, stride=1, dilation=1, BatchNorm=None):\n downsample = None\n if stride != 1 or self.inplanes != planes * block.expansion:\n downsample = nn.Sequential(\n nn.Conv2D(self.inplanes, planes * block.expansion,\n kernel_size=1, stride=stride, bias_attr=False),\n BatchNorm(planes * block.expansion),\n )\n layers = []\n layers.append(block(self.inplanes, planes, stride, downsample, BatchNorm=BatchNorm))\n self.inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(block(self.inplanes, planes,\n dilation=(dilation, dilation, ), BatchNorm=BatchNorm))" + }, + { + "comment": "This code defines three functions: drn_a_50, drn_c_26, and drn_c_42. Each function takes a BatchNorm argument and an optional pretrained flag. The functions return different types of DRN models based on the input arguments. 
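The conv initialisation mentioned above draws weights from a normal distribution with standard deviation sqrt(2 / n), where n = k * k * out_channels; a small numeric check:

```python
import math

def kaiming_normal_std(kernel_size, out_channels):
    """Std-dev used for the normal init of conv weights: sqrt(2 / n)
    with n = kernel_size**2 * out_channels (fan-out style)."""
    n = kernel_size * kernel_size * out_channels
    return math.sqrt(2.0 / n)

print(round(kaiming_normal_std(3, 64), 4))    # 0.0589
print(round(kaiming_normal_std(7, 16), 4))    # 0.0505
```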
If pretrained is True, the code sets the model's state dictionary to a pre-trained model's weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":280-317", + "content": " return nn.Sequential(*layers)\n def forward(self, x):\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.maxpool(x)\n x = self.layer1(x)\n x = self.layer2(x)\n x = self.layer3(x)\n x = self.layer4(x)\n return x\ndef drn_a_50(BatchNorm, pretrained=True):\n model = DRN_A(Bottleneck, [3, 4, 6, 3], BatchNorm=BatchNorm)\n if pretrained:\n import paddlehub as hub\n model.set_state_dict(hub.Module(name=\"resnet50_vd_animals\"))\n return model\ndef drn_c_26(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='C', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-c-26'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_c_42(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-c-42'])" + }, + { + "comment": "Code defines functions for initializing DRN models with different architectures (C, D) and sizes (58, 22, 24). If `pretrained` is True, it loads pre-trained model weights from a URL and removes the last fully connected layer's weight and bias before setting the state dictionary of the model. This allows for custom downstream tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":318-348", + "content": " del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_c_58(BatchNorm, pretrained=True):\n model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-c-58'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_22(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-22'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_24(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 2, 2], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-24'])\n del pretrained['fc.weight']" + }, + { + "comment": "The code defines three functions, drn_d_38, drn_d_40, and drn_d_54, which return instances of the DRN model with different configurations and optional pre-trained weights. 
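The pretrained-loading pattern above amounts to dropping the classifier entries from the checkpoint before calling set_state_dict; a minimal illustration (the checkpoint dict below is fake):

```python
def strip_classifier(state_dict):
    """Drop the ImageNet classifier entries, mirroring the
    del pretrained['fc.weight'] / del pretrained['fc.bias'] pattern above."""
    return {k: v for k, v in state_dict.items() if k not in ("fc.weight", "fc.bias")}

fake_ckpt = {"conv1.weight": "...", "fc.weight": "...", "fc.bias": "..."}
print(list(strip_classifier(fake_ckpt)))   # ['conv1.weight']
```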
If pre-trained weights are specified, it loads them from a URL and deletes 'fc.weight' and 'fc.bias' keys before setting the state dictionary of the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":349-379", + "content": " del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_38(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-38'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_40(BatchNorm, pretrained=True):\n model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-40'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\ndef drn_d_54(BatchNorm, pretrained=True):\n model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-54'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']" + }, + { + "comment": "This code defines a function 'drn_d_105' that returns an instance of the DRN model with specified parameters. It also loads pre-trained weights for the model if 'pretrained' flag is set to True. The example usage at the end creates and tests an instance of the DRN model with specific parameters, using PaddlePaddle library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/drn.py\":380-399", + "content": " model.set_state_dict(pretrained)\n return model\ndef drn_d_105(BatchNorm, pretrained=True):\n model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 1, 1], arch='D', BatchNorm=BatchNorm)\n if pretrained:\n pretrained = model_zoo.load_url(model_urls['drn-d-105'])\n del pretrained['fc.weight']\n del pretrained['fc.bias']\n model.set_state_dict(pretrained)\n return model\nif __name__ == \"__main__\":\n import paddle\n model = drn_a_50(BatchNorm=nn.BatchNorm2D, pretrained=True)\n input = paddle.rand([1, 3, 512, 512])\n output, low_level_feat = model(input)\n print(output.shape)\n print(low_level_feat.shape)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c0d9077b-2b9a-4163-9af7-0ed37ff0075b.json b/docs/doc/c0d9077b-2b9a-4163-9af7-0ed37ff0075b.json new file mode 100644 index 000000000..dcccdcf09 --- /dev/null +++ b/docs/doc/c0d9077b-2b9a-4163-9af7-0ed37ff0075b.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is a Python module for video and frame datasets in PaddleVideo. It includes the VideoDataset class, FrameRecDataset class (from frame_rec module), and defines __all__.", + "details": [ + { + "comment": "This code is a Python module for video and frame datasets in PaddleVideo. It includes the VideoDataset class, FrameRecDataset class (from frame_rec module), and defines __all__.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py\":0-20", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .video import VideoDataset\n#from .frame import FrameDataset\nfrom .frame_rec import FrameRecDataset\n__all__ = ['VideoDataset', 'FrameRecDataset']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c0fee295-486e-411f-81b5-35d6374afee3.json b/docs/doc/c0fee295-486e-411f-81b5-35d6374afee3.json new file mode 100644 index 000000000..1effb5a77 --- /dev/null +++ b/docs/doc/c0fee295-486e-411f-81b5-35d6374afee3.json @@ -0,0 +1,165 @@ +{ + "summary": "This code uses PaddleVideo for video segment matching, ASPP-based deep learning models for object size determination and feature extraction, handles padding, computes distances, prepares data, performs feature selection and masking, and utilizes parallel processing in PaddlePaddle.", + "details": [ + { + "comment": "This code defines a function \"foreground2background\" that takes distance (dis) and object number (obj_num) as inputs. It returns the background distances for each foreground object when obj_num is greater than 1 by concatenating the unsqueezed distance of other objects along axis 1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":0-30", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\ndef foreground2background(dis, obj_num):\n if obj_num == 1:\n return dis\n bg_dis = []\n for i in range(obj_num):\n obj_back = []\n for j in range(obj_num):\n if i == j:\n continue\n obj_back.append(paddle.unsqueeze(dis[j], axis=0))\n obj_back = paddle.concat(x=obj_back, axis=1)" + }, + { + "comment": "This function calculates the pairwise squared L2 distances between tensors x and y, returns them in a matrix d. The function takes x and y as input, which are [n, feature_dim] and [m, feature_dim] respectively. 
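The pairwise-distance helper described here relies on the standard expansion ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 * x_i . y_j, with the squared norms passed in as `x2`/`y2`. A quick NumPy sanity check of that identity (illustrative only, not repository code):

```python
import numpy as np

x = np.random.rand(4, 8).astype("float32")   # [n, feature_dim]
y = np.random.rand(5, 8).astype("float32")   # [m, feature_dim]

x2 = (x ** 2).sum(axis=1)[:, None]           # [n, 1], the role of xs in the helper
y2 = (y ** 2).sum(axis=1)[None, :]           # [1, m], the role of ys
d_fast = x2 + y2 - 2.0 * x @ y.T             # same formula as the Paddle code

d_naive = ((x[:, None, :] - y[None, :, :]) ** 2).sum(axis=-1)
assert np.allclose(d_fast, d_naive, atol=1e-4)
```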
It then performs matrix calculations to compute the pairwise distances and returns d, which is of size [n, m].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":31-67", + "content": " obj_back = paddle.min(x=obj_back, axis=1, keepdim=True)\n bg_dis.append(obj_back)\n bg_dis = paddle.concat(x=bg_dis, axis=0)\n return bg_dis\nWRONG_LABEL_PADDING_DISTANCE = 5e4\n#GLOBAL_DIST_MAP\ndef _pairwise_distances(x, x2, y, y2):\n \"\"\"\n Computes pairwise squared l2 distances between tensors x and y.\n Args:\n x: [n, feature_dim].\n y: [m, feature_dim].\n Returns:\n d: [n, m].\n \"\"\"\n xs = x2\n ys = y2\n xs = paddle.unsqueeze(xs, axis=1)\n ys = paddle.unsqueeze(ys, axis=0)\n d = xs + ys - 2. * paddle.matmul(x, y, transpose_y=True)\n return d\ndef _flattened_pairwise_distances(reference_embeddings, ref_square,\n query_embeddings, query_square):\n \"\"\"\n Calculates flattened tensor of pairwise distances between ref and query.\n Args:\n reference_embeddings: [..., embedding_dim],\n the embedding vectors for the reference frame\n query_embeddings: [..., embedding_dim],\n the embedding vectors for the query frames." + }, + { + "comment": "This code computes pairwise distances between query and reference embeddings, then extracts features for each object using nearest neighbor attention. It takes embedding vectors for the reference frame and query frames as input, along with a mask for pixels not used for matching. The output is a tensor of nearest neighbor features shape [m_chunk, n_objects, n_chunk]. The code also checks the dtype of reference_embeddings to handle float16 data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":68-91", + "content": " Returns:\n dists: [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim]\n \"\"\"\n dists = _pairwise_distances(query_embeddings, query_square,\n reference_embeddings, ref_square)\n return dists\ndef _nn_features_per_object_for_chunk(reference_embeddings, ref_square,\n query_embeddings, query_square,\n wrong_label_mask):\n \"\"\"Extracts features for each object using nearest neighbor attention.\n Args:\n reference_embeddings: [n_chunk, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: [m_chunk, embedding_dim],\n the embedding vectors for the query frames.\n wrong_label_mask: [n_objects, n_chunk],\n the mask for pixels not used for matching.\n Returns:\n nn_features: A float32 tensor of nearest neighbor features of shape\n [m_chunk, n_objects, n_chunk].\n \"\"\"\n if reference_embeddings.dtype == \"float16\":" + }, + { + "comment": "This function calculates the nearest neighbor features per object in chunks to save memory. It takes reference embeddings, query embeddings, and reference labels as inputs. The function first casts the wrong_label_mask based on its type (float16 or float32). Then it calculates pairwise distances between reference and query embeddings. Distances for incorrect matches are set to a specific padding distance using wrong_label_mask. 
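The wrong-label masking described here boils down to adding a very large constant to distances that belong to a different object before taking the minimum, so those entries can never be selected. A toy NumPy illustration (shapes simplified to 2-D; the real tensors carry an extra object axis):

```python
import numpy as np

WRONG_LABEL_PADDING_DISTANCE = 5e4           # same constant as in utils.py
dists = np.array([[1.0, 0.2, 3.0],
                  [0.5, 4.0, 0.1]])          # [query_pixels, reference_pixels]
wrong_label_mask = np.array([[0., 1., 0.],
                             [0., 0., 1.]])  # 1 = reference pixel not of this object

masked = dists + wrong_label_mask * WRONG_LABEL_PADDING_DISTANCE
print(masked.min(axis=1))                    # [1.0, 0.5]: masked entries never win
```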
Finally, it returns the features by taking the minimum value across chunks in each dimension.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":92-112", + "content": " wrong_label_mask = paddle.cast(wrong_label_mask, dtype=\"float16\")\n else:\n wrong_label_mask = paddle.cast(wrong_label_mask, dtype=\"float32\")\n reference_embeddings_key = reference_embeddings\n query_embeddings_key = query_embeddings\n dists = _flattened_pairwise_distances(reference_embeddings_key, ref_square,\n query_embeddings_key, query_square)\n dists = (paddle.unsqueeze(dists, axis=1) +\n paddle.unsqueeze(wrong_label_mask, axis=0) *\n WRONG_LABEL_PADDING_DISTANCE)\n features = paddle.min(dists, axis=2, keepdim=True)\n return features\ndef _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat,\n query_embeddings_flat,\n reference_labels_flat,\n n_chunks):\n \"\"\"Calculates the nearest neighbor features per object in chunks to save mem.\n Uses chunking to bound the memory use." + }, + { + "comment": "This function computes the features for a set of query frames against a reference frame. It takes in embedding vectors for reference and query frames, as well as their respective class labels. The function uses chunking to handle large feature dimensions, with the number of chunks adjustable by the user. It returns a tensor of shape [m, n_objects, n] which represents the features for each query frame.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":113-137", + "content": " Args:\n reference_embeddings_flat: [n, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings_flat: [m, embedding_dim],\n the embedding vectors for the query frames.\n reference_labels_flat: [n, n_objects],\n the class labels of the reference frame.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n Returns:\n nn_features: [m, n_objects, n].\n \"\"\"\n feature_dim, embedding_dim = query_embeddings_flat.shape\n chunk_size = int(np.ceil(float(feature_dim) / n_chunks))\n wrong_label_mask = reference_labels_flat < 0.1\n wrong_label_mask = paddle.transpose(x=wrong_label_mask, perm=[1, 0])\n ref_square = paddle.sum(paddle.pow(reference_embeddings_flat, 2), axis=1)\n query_square = paddle.sum(paddle.pow(query_embeddings_flat, 2), axis=1)\n all_features = []\n for n in range(n_chunks):\n if n_chunks == 1:\n query_embeddings_flat_chunk = query_embeddings_flat" + }, + { + "comment": "This function is performing global matching on query embeddings and reference embeddings. It breaks down the embeddings into chunks, calculates features for each chunk, and concatenates these features to get the final nn_features. The number of chunks is determined by n_chunks, which default to 100. 
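The chunking mentioned here is ordinary blocking over the query rows, so only a slice of the full distance matrix is ever materialised. A rough NumPy sketch of the same control flow (brute-force distances used purely for illustration):

```python
import numpy as np

def chunked_min_dist(query, reference, n_chunks=4):
    chunk_size = int(np.ceil(len(query) / n_chunks))   # mirrors the np.ceil in utils.py
    outputs = []
    for start in range(0, len(query), chunk_size):
        q = query[start:start + chunk_size]
        d = ((q[:, None, :] - reference[None, :, :]) ** 2).sum(-1)
        outputs.append(d.min(axis=1))
    return np.concatenate(outputs, axis=0)

q, r = np.random.rand(10, 4), np.random.rand(7, 4)
assert np.allclose(chunked_min_dist(q, r, n_chunks=3),
                   ((q[:, None] - r[None]) ** 2).sum(-1).min(1))
```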
If n_chunks = 1, it returns the features from the only chunk.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":138-166", + "content": " query_square_chunk = query_square\n chunk_start = 0\n else:\n chunk_start = n * chunk_size\n chunk_end = (n + 1) * chunk_size\n query_square_chunk = query_square[chunk_start:chunk_end]\n if query_square_chunk.shape[0] == 0:\n continue\n query_embeddings_flat_chunk = query_embeddings_flat[\n chunk_start:chunk_end]\n features = _nn_features_per_object_for_chunk(\n reference_embeddings_flat, ref_square, query_embeddings_flat_chunk,\n query_square_chunk, wrong_label_mask)\n all_features.append(features)\n if n_chunks == 1:\n nn_features = all_features[0]\n else:\n nn_features = paddle.concat(all_features, axis=0)\n return nn_features\ndef global_matching(reference_embeddings,\n query_embeddings,\n reference_labels,\n n_chunks=100,\n dis_bias=0.,\n ori_size=None,\n atrous_rate=1," + }, + { + "comment": "This code calculates the distance to the nearest neighbor per object for query_embeddings and reference_embeddings, given class labels and other parameters. It uses chunks to save memory and takes into account the atrous rate of reference_embeddings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":167-185", + "content": " use_float16=True,\n atrous_obj_pixel_num=0):\n \"\"\"\n Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n reference_embeddings: [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: [height, width,\n embedding_dim], the embedding vectors for the query frames.\n reference_labels: [height, width, obj_nums],\n the class labels of the reference frame.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n dis_bias: [n_objects], foreground and background bias\n ori_size: (ori_height, ori_width),\n the original spatial size. If \"None\", (ori_height, ori_width) = (height, width).\n atrous_rate: Integer, the atrous rate of reference_embeddings." + }, + { + "comment": "This code snippet calculates and pads the selected points for spatial pyramid pooling in a segmentation model. It checks if float16 is used, then prepares padding based on the atrous rate. 
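The `selected_points` preparation referred to here keeps one reference pixel per `atrous_rate x atrous_rate` block, which is how large objects are subsampled before matching. A NumPy illustration, assuming the (already padded) height and width are multiples of `atrous_rate`:

```python
import numpy as np

h, w, atrous_rate = 4, 6, 2
sel = np.zeros((h, w)).reshape(h // atrous_rate, atrous_rate,
                               w // atrous_rate, atrous_rate)
sel[:, 0, :, 0] = 1.0                       # mark the top-left pixel of each block
sel = sel.reshape(h, w)
print(sel.astype(int))                      # a stride-2 sub-grid of ones
```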
The resulting tensor of selected points is reshaped to match the input shape before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":186-208", + "content": " use_float16: Bool, if \"True\", use float16 type for matching.\n Returns:\n nn_features: [1, ori_height, ori_width, n_objects, feature_dim].\n \"\"\"\n assert (reference_embeddings.shape[:2] == reference_labels.shape[:2])\n if use_float16:\n query_embeddings = paddle.cast(query_embeddings, dtype=\"float16\")\n reference_embeddings = paddle.cast(reference_embeddings,\n dtype=\"float16\")\n h, w, embedding_dim = query_embeddings.shape\n obj_nums = reference_labels.shape[2]\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n selected_points = paddle.zeros([h + h_pad, w + w_pad])\n selected_points = selected_points.view(\n (h + h_pad) // atrous_rate, atrous_rate, (w + w_pad) // atrous_rate,\n atrous_rate)\n selected_points[:, 0, :, 0] = 1.\n selected_points = paddle.reshape(selected_points,\n [h + h_pad, w + w_pad, 1])[:h, :w]" + }, + { + "comment": "The code is implementing a segmentation method in the PaddleVideo library. It first determines if an object is big or small based on the sum of reference labels. Then, it reshapes the reference embeddings and labels for further processing. It checks if any reference labels are present and returns default values if none are found.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":209-229", + "content": " is_big_obj = (paddle.sum(\n reference_labels,\n axis=(0, 1))) > (atrous_obj_pixel_num * atrous_rate**2)\n reference_labels[:, :,\n is_big_obj] = reference_labels[:, :,\n is_big_obj] * selected_points\n reference_embeddings_flat = paddle.reshape(reference_embeddings,\n [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels, [-1, obj_nums])\n query_embeddings_flat = paddle.reshape(query_embeddings,\n [-1, embedding_dim])\n all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9\n reference_labels_flat = paddle.reshape(\n paddle.masked_select(reference_labels_flat,\n paddle.expand(all_ref_fg, [-1, obj_nums])),\n [-1, obj_nums])\n if reference_labels_flat.shape[0] == 0:\n return paddle.ones([1, h, w, obj_nums, 1])\n reference_embeddings_flat = paddle.reshape(" + }, + { + "comment": "This function performs nearest neighbor feature extraction for video segment matching using reference and query embeddings, reference labels, and other parameters such as number of chunks, displacement bias, original size, and atrous rate. 
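Downstream, the raw per-object distances are combined with the displacement bias and squashed by `(F.sigmoid(d + dis_bias) - 0.5) * 2`, which maps them into (-1, 1). A small numeric look at that mapping, with a NumPy sigmoid standing in for `F.sigmoid`:

```python
import numpy as np

def normalise(dist, bias=0.0):
    return (1.0 / (1.0 + np.exp(-(dist + bias))) - 0.5) * 2.0

print(normalise(np.array([0.0, 2.0, 50.0])))   # approx [0.0, 0.76, 1.0]
```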
It returns the normalized nearest neighbor features in a reshaped format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":230-256", + "content": " paddle.masked_select(reference_embeddings_flat,\n paddle.expand(all_ref_fg, [-1, embedding_dim])),\n [-1, embedding_dim])\n nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n n_chunks)\n nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1])\n nn_features_reshape = (\n F.sigmoid(nn_features_reshape +\n paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2\n #TODO: ori_size is not None\n if use_float16:\n nn_features_reshape = paddle.cast(nn_features_reshape, dtype=\"float32\")\n return nn_features_reshape\ndef global_matching_for_eval(all_reference_embeddings,\n query_embeddings,\n all_reference_labels,\n n_chunks=20,\n dis_bias=0.,\n ori_size=None,\n atrous_rate=1," + }, + { + "comment": "This code calculates the distance to the nearest neighbor per object for query embeddings in a list of reference embeddings, considering potentially subsampled frames. It takes query_embeddings of size [n_query_images, height, width, embedding_dim], all_reference_embeddings and all_reference_labels lists with size [height, width, obj_nums] each, n_chunks, dis_bias, and ori_size as input arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":257-276", + "content": " use_float16=True,\n atrous_obj_pixel_num=0):\n \"\"\"\n Calculates the distance to the nearest neighbor per object.\n For every pixel of query_embeddings calculate the distance to the\n nearest neighbor in the (possibly subsampled) reference_embeddings per object.\n Args:\n all_reference_embeddings: A list of reference_embeddings,\n each with size [height, width, embedding_dim],\n the embedding vectors for the reference frame.\n query_embeddings: [n_query_images, height, width,\n embedding_dim], the embedding vectors for the query frames.\n all_reference_labels: A list of reference_labels,\n each with size [height, width, obj_nums],\n the class labels of the reference frame.\n n_chunks: Integer, the number of chunks to use to save memory\n (set to 1 for no chunking).\n dis_bias: [n_objects], foreground and background bias\n ori_size: (ori_height, ori_width),\n the original spatial size. If \"None\", (ori_height, ori_width) = (height, width)." + }, + { + "comment": "This function is responsible for creating a tensor of reference embeddings and labels for a given set of query embeddings, based on the provided atrous rate. The function first calculates the shape of the input tensors, then initializes empty lists for flat versions of reference embeddings and labels. It then determines the padding needed to match the atrous rate, creates a selection matrix with ones at the selected points, and reshapes it according to the atrous rate. 
Finally, it prepares the tensor for matching by flattening the reference embeddings and labels lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":277-298", + "content": " atrous_rate: Integer, the atrous rate of reference_embeddings.\n use_float16: Bool, if \"True\", use float16 type for matching.\n Returns:\n nn_features: [n_query_images, ori_height, ori_width, n_objects, feature_dim].\n \"\"\"\n h, w, embedding_dim = query_embeddings.shape\n obj_nums = all_reference_labels[0].shape[2]\n all_reference_embeddings_flat = []\n all_reference_labels_flat = []\n ref_num = len(all_reference_labels)\n n_chunks *= ref_num\n if atrous_obj_pixel_num > 0:\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n selected_points = paddle.zeros([h + h_pad, w + w_pad])\n selected_points = paddle.reshape(\n selected_points, [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate])\n selected_points[:, 0, :, 0] = 1.\n selected_points = paddle.reshape(selected_points," + }, + { + "comment": "This code segment appears to be a part of image segmentation or object detection algorithm. It processes reference embeddings and labels, potentially for each detected object in the image. The atrous rate determines if an object is big or small, with larger objects being processed separately by multiplying selected points to corresponding regions in reference_labels. The embeddings are flattened into 1D arrays, as well as reference_labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":299-317", + "content": " [h + h_pad, w + w_pad, 1])[:h, :w]\n for reference_embeddings, reference_labels, idx in zip(\n all_reference_embeddings, all_reference_labels, range(ref_num)):\n if atrous_rate > 1:\n is_big_obj = paddle.sum(\n reference_labels,\n axis=(0, 1)) > (atrous_obj_pixel_num * atrous_rate**2)\n is_big_obj = list(np.array(is_big_obj))\n for j in range(len(is_big_obj)):\n if is_big_obj[j] == True:\n reference_labels[:, :, j:j +\n 1] = reference_labels[:, :, j:j +\n 1] * selected_points\n reference_embeddings_flat = paddle.reshape(reference_embeddings,\n [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels,\n [-1, obj_nums])" + }, + { + "comment": "The code concatenates reference embeddings and labels, then pads them if necessary based on the atrous rate. 
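The "pads them if necessary" step uses the usual round-up-to-a-multiple formula, `pad = (atrous_rate - size % atrous_rate) % atrous_rate`, which is zero when the size already divides evenly. A few quick values:

```python
for h, atrous_rate in [(480, 2), (481, 2), (479, 4)]:
    h_pad = (atrous_rate - h % atrous_rate) % atrous_rate
    print(h, atrous_rate, h_pad, (h + h_pad) % atrous_rate == 0)
# 480 2 0 True / 481 2 1 True / 479 4 1 True
```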
If there is only one reference, it directly selects the first item from all_reference_embeddings and all_reference_labels lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":319-337", + "content": " all_reference_embeddings_flat.append(reference_embeddings_flat)\n all_reference_labels_flat.append(reference_labels_flat)\n reference_embeddings_flat = paddle.concat(\n x=all_reference_embeddings_flat, axis=0)\n reference_labels_flat = paddle.concat(x=all_reference_labels_flat,\n axis=0)\n else:\n if ref_num == 1:\n reference_embeddings, reference_labels = all_reference_embeddings[\n 0], all_reference_labels[0]\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n if h_pad > 0 or w_pad > 0:\n reference_embeddings = F.pad(reference_embeddings,\n [0, h_pad, 0, w_pad, 0, 0])\n reference_labels = F.pad(reference_labels,\n [0, h_pad, 0, w_pad, 0, 0])" + }, + { + "comment": "This code reshapes the reference embeddings and labels to match a specific pattern, then flattens the reference embeddings while preserving their data type and shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":338-355", + "content": " reference_embeddings = paddle.reshape(\n reference_embeddings,\n [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, 32])\n reference_labels = paddle.reshape(\n reference_labels,\n [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, -1])\n reference_embeddings = paddle.reshape(\n reference_embeddings[:, 0, :, 0, :],\n reference_embeddings[:, 0, :, 0, :].shape)\n reference_labels = paddle.reshape(\n reference_labels[:, 0, :, 0, :],\n reference_labels[:, 0, :, 0, :].shape)\n reference_embeddings_flat = paddle.reshape(reference_embeddings,\n [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels,\n [-1, obj_nums])" + }, + { + "comment": "This code segment handles the case where atrous_rate is greater than 1. It pads reference embeddings and labels if needed, then reshapes them to have a shape compatible with Atrous Spatial Pyramid Pooling (ASPP) in deep learning models for image classification or detection tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":356-374", + "content": " else:\n for reference_embeddings, reference_labels, idx in zip(\n all_reference_embeddings, all_reference_labels,\n range(ref_num)):\n if atrous_rate > 1:\n h_pad = (atrous_rate - h % atrous_rate) % atrous_rate\n w_pad = (atrous_rate - w % atrous_rate) % atrous_rate\n if h_pad > 0 or w_pad > 0:\n reference_embeddings = F.pad(reference_embeddings,\n [0, h_pad, 0, w_pad, 0, 0])\n reference_labels = F.pad(reference_labels,\n [0, h_pad, 0, w_pad, 0, 0])\n reference_embeddings = paddle.reshape(\n reference_embeddings,\n [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, -1])\n reference_labels = paddle.reshape(\n reference_labels," + }, + { + "comment": "This code reshapes the reference embeddings and labels into a flattened format, appends them to lists, and then concatenates all the flattened reference embeddings along axis 0. 
This is likely for use in a deep learning model that requires the data in this specific format for training or prediction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":375-393", + "content": " [(h + h_pad) // atrous_rate, atrous_rate,\n (w + w_pad) // atrous_rate, atrous_rate, -1])\n reference_embeddings = paddle.reshape(\n reference_embeddings[:, 0, :, 0, :],\n reference_embeddings[:, 0, :, 0, :].shape)\n reference_labels = paddle.reshape(\n reference_labels[:, 0, :, 0, :],\n reference_labels[:, 0, :, 0, :].shape)\n reference_embeddings_flat = paddle.reshape(\n reference_embeddings, [-1, embedding_dim])\n reference_labels_flat = paddle.reshape(reference_labels,\n [-1, obj_nums])\n all_reference_embeddings_flat.append(reference_embeddings_flat)\n all_reference_labels_flat.append(reference_labels_flat)\n reference_embeddings_flat = paddle.concat(\n all_reference_embeddings_flat, axis=0)" + }, + { + "comment": "This code segment performs feature selection and reshaping of query and reference embeddings for the segment matching process. It concatenates all reference labels, flattens the query embeddings, masks the selected reference labels and embeddings based on a threshold, and finally reshapes them before returning a tensor of ones if no references are found or casting the embeddings to float16 datatype if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":394-414", + "content": " reference_labels_flat = paddle.concat(all_reference_labels_flat,\n axis=0)\n query_embeddings_flat = paddle.reshape(query_embeddings,\n [-1, embedding_dim])\n all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9\n reference_labels_flat = paddle.reshape(\n paddle.masked_select(reference_labels_flat,\n paddle.expand(all_ref_fg, [-1, obj_nums])),\n [-1, obj_nums])\n if reference_labels_flat.shape[0] == 0:\n return paddle.ones([1, h, w, obj_nums, 1])\n reference_embeddings_flat = paddle.reshape(\n paddle.masked_select(reference_embeddings_flat,\n paddle.expand(all_ref_fg, [-1, embedding_dim])),\n [-1, embedding_dim])\n if use_float16:\n query_embeddings_flat = paddle.cast(query_embeddings_flat,\n dtype=\"float16\")\n reference_embeddings_flat = paddle.cast(reference_embeddings_flat," + }, + { + "comment": "This code calculates pairwise squared L2 distances using a local search window, and then computes the nearest neighbor features for each object in image chunks. 
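The threshold-based selection in the chunk above keeps only reference pixels whose labels sum above 0.9 across objects, i.e. pixels actually assigned to some object; everything else is dropped before the expensive matching. A simplified NumPy version of that filtering (the original uses `paddle.masked_select` with expanded masks):

```python
import numpy as np

labels_flat = np.array([[1, 0], [0, 0], [0, 1], [0, 0]], dtype=np.float32)  # [n, obj_nums]
emb_flat = np.arange(8, dtype=np.float32).reshape(4, 2)                     # [n, embedding_dim]

is_fg = labels_flat.sum(axis=1) > 0.9       # same 0.9 threshold as all_ref_fg
print(labels_flat[is_fg])                   # rows 0 and 2 survive
print(emb_flat[is_fg])
```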
The result is reshaped into an appropriate format and can be used for further processing or analysis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":415-441", + "content": " dtype=\"float16\")\n nn_features = _nearest_neighbor_features_per_object_in_chunks(\n reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,\n n_chunks)\n nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1])\n nn_features_reshape = (\n F.sigmoid(nn_features_reshape +\n paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2\n # TODO: ori_size is not None\n if use_float16:\n nn_features_reshape = paddle.cast(nn_features_reshape, dtype=\"float32\")\n return nn_features_reshape\n#LOCAL_DIST_MAP\ndef local_pairwise_distances(x,\n y,\n max_distance=9,\n atrous_rate=1,\n allow_downsample=False):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Use for-loop for saving memory.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim]." + }, + { + "comment": "This function takes in a tensor 'x' and 'y', along with parameters such as max_distance, atrous_rate, and allow_downsample. It returns a distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. If downsampling is allowed, the original height and width are saved and the tensors 'x' and 'y' are reshaped. Then, using bilinear interpolation, 'x' and 'y' are downsampled to half their size while preserving values at borders.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":442-463", + "content": " y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n atrous_rate: Integer, the atrous rate of local matching.\n allow_downsample: Bool, if \"True\", downsample x and y\n with a stride of 2.\n Returns:\n Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n if allow_downsample:\n ori_height = x.shape[0]\n ori_width = x.shape[1]\n x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0)\n y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0)\n down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1)\n x = F.interpolate(x,\n size=down_size,\n mode='bilinear',\n align_corners=True)\n y = F.interpolate(y,\n size=down_size,\n mode='bilinear'," + }, + { + "comment": "This code computes local pairwise distances between the input tensors x and y, accounting for atrous dilation. It first pads y with wrong label padding distance to match the size of x. Then it loops through the range of possible offsets for each pixel and calculates the sum of squared differences between the current pixel and all potential offsets in y. 
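The for-loop variant described here can be pictured as sliding the padded reference map under the query map, one offset at a time, and recording a squared-distance map per offset. A hedged NumPy sketch of that idea, ignoring the atrous rate and optional downsampling, and padding with a large constant so out-of-frame positions never become the minimum:

```python
import numpy as np

def local_window_sq_dists(x, y, max_distance=1, pad_value=5e4):
    h, w, c = x.shape
    y_pad = np.full((h + 2 * max_distance, w + 2 * max_distance, c), pad_value, x.dtype)
    y_pad[max_distance:max_distance + h, max_distance:max_distance + w] = y
    dists = []
    for dy in range(2 * max_distance + 1):
        for dx in range(2 * max_distance + 1):
            shifted = y_pad[dy:dy + h, dx:dx + w]
            dists.append(((x - shifted) ** 2).sum(-1))
    return np.stack(dists, axis=2)          # [h, w, (2*max_distance+1)**2]

print(local_window_sq_dists(np.random.rand(6, 7, 3), np.random.rand(6, 7, 3)).shape)
# (6, 7, 9)
```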
These distances are then stacked along the channel axis before being returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":464-491", + "content": " align_corners=True)\n x = paddle.unsqueeze(paddle.transpose(x, [1, 2, 0]), axis=0)\n y = paddle.unsqueeze(paddle.transpose(y, [1, 2, 0]), axis=0)\n pad_max_distance = max_distance - max_distance % atrous_rate\n # no change pad\n padded_y = F.pad(y, (0, 0, pad_max_distance, pad_max_distance,\n pad_max_distance, pad_max_distance),\n value=WRONG_LABEL_PADDING_DISTANCE)\n height, width, _ = x.shape\n dists = []\n for y in range(2 * pad_max_distance // atrous_rate + 1):\n y_start = y * atrous_rate\n y_end = y_start + height\n y_slice = padded_y[y_start:y_end]\n for x in range(2 * max_distance + 1):\n x_start = x * atrous_rate\n x_end = x_start + width\n offset_y = y_slice[:, x_start:x_end]\n dist = paddle.sum(paddle.pow((x - offset_y), 2), axis=2)\n dists.append(dist)\n dists = paddle.stack(dists, axis=2)\n return dists\ndef local_pairwise_distances_parallel(x," + }, + { + "comment": "This function computes pairwise squared L2 distances using a local search window. It takes two tensors x and y of shape [height, width, feature_dim] as input. The maximum distance (max\\_distance) in pixel coordinates per dimension is considered in the search window. Atrous rate determines the local matching rate. If downsampling is allowed, the function downsamples the tensors with a stride of 2. It returns a float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":492-512", + "content": " y,\n max_distance=9,\n atrous_rate=1,\n allow_downsample=True):\n \"\"\"Computes pairwise squared l2 distances using a local search window.\n Args:\n x: Float32 tensor of shape [height, width, feature_dim].\n y: Float32 tensor of shape [height, width, feature_dim].\n max_distance: Integer, the maximum distance in pixel coordinates\n per dimension which is considered to be in the search window.\n atrous_rate: Integer, the atrous rate of local matching.\n allow_downsample: Bool, if \"True\", downsample x and y\n with a stride of 2.\n Returns:\n Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2].\n \"\"\"\n ori_height, ori_width, _ = x.shape\n x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0)\n y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0)\n if allow_downsample:" + }, + { + "comment": "The code resizes the input tensors x and y to a downsized version of half the original size using bilinear interpolation. 
It then calculates the squared values for x and y, reshapes them, pads the tensors with WRONG_LABEL_PADDING_DISTANCE to match the atrous rate, and assigns them to padded_y and padded_y2 variables.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":513-536", + "content": " down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1)\n x = F.interpolate(x,\n size=down_size,\n mode='bilinear',\n align_corners=True)\n y = F.interpolate(y,\n size=down_size,\n mode='bilinear',\n align_corners=True)\n _, channels, height, width = x.shape\n x2 = paddle.reshape(paddle.sum(paddle.pow(x, 2), axis=1),\n [height, width, 1])\n y2 = paddle.reshape(paddle.sum(paddle.pow(y, 2), axis=1),\n [1, 1, height, width])\n pad_max_distance = max_distance - max_distance % atrous_rate\n # no change pad\n padded_y = F.pad(y, (pad_max_distance, pad_max_distance, pad_max_distance,\n pad_max_distance))\n padded_y2 = F.pad(y2, (pad_max_distance, pad_max_distance, pad_max_distance,\n pad_max_distance),\n value=WRONG_LABEL_PADDING_DISTANCE)" + }, + { + "comment": "This code snippet calculates the distance between embeddings of frames to measure similarity. It takes in two frame embeddings, their corresponding labels, and several optional parameters, including an atomic number for local distances, a size parameter, and whether to allow downsampling. The function uses Paddle's unfold operation to reshape the data, then calculates distances between these reshaped embeddings using a formula that involves matmul (matrix multiplication) operations. The final step is returning the calculated distances.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":538-565", + "content": " offset_y = paddle.transpose(\n paddle.reshape(\n F.unfold(x=padded_y,\n kernel_sizes=[height, width],\n strides=[atrous_rate, atrous_rate]),\n [channels, height * width, -1]), [1, 0, 2])\n offset_y2 = paddle.reshape(\n F.unfold(padded_y2,\n kernel_sizes=[height, width],\n strides=[atrous_rate, atrous_rate]), [height, width, -1])\n x = paddle.transpose(paddle.reshape(x, [channels, height * width, -1]),\n [1, 2, 0])\n dists = x2 + offset_y2 - 2. * paddle.reshape(paddle.matmul(x, offset_y),\n [height, width, -1])\n return dists\ndef local_matching(prev_frame_embedding,\n query_embedding,\n prev_frame_labels,\n dis_bias=0.,\n multi_local_distance=[15],\n ori_size=None,\n atrous_rate=1,\n use_float16=True,\n allow_downsample=True," + }, + { + "comment": "This code computes nearest neighbor features for local matching in video segmentation. It takes embedding vectors, class labels, and a list of maximum distances as input. The function allows downsampling and parallel processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":566-583", + "content": " allow_parallel=True):\n \"\"\"Computes nearest neighbor features while only allowing local matches.\n Args:\n prev_frame_embedding: [height, width, embedding_dim],\n the embedding vectors for the last frame.\n query_embedding: [height, width, embedding_dim],\n the embedding vectors for the query frames.\n prev_frame_labels: [height, width, n_objects],\n the class labels of the previous frame.\n multi_local_distance: A list of Integer,\n a list of maximum distance allowed for local matching.\n ori_size: (ori_height, ori_width),\n the original spatial size. 
If \"None\", (ori_height, ori_width) = (height, width).\n atrous_rate: Integer, the atrous rate of local matching.\n use_float16: Bool, if \"True\", use float16 type for matching.\n allow_downsample: Bool, if \"True\", downsample prev_frame_embedding and query_embedding\n with a stride of 2.\n allow_parallel: Bool, if \"True\", do matching in a parallel way. If \"False\", do matching in" + }, + { + "comment": "This function calculates nearest neighbor features by using local pairwise distances in a parallel manner, with options to cast data types and allow downsampling. It takes query and previous frame embeddings as input, and returns nearest neighbor features of shape [1, height, width, n_objects, 1].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":584-608", + "content": " a for-loop way, which will save GPU memory.\n Returns:\n nn_features: A float32 np.array of nearest neighbor features of shape\n [1, height, width, n_objects, 1].\n \"\"\"\n max_distance = multi_local_distance[-1]\n if ori_size is None:\n height, width = prev_frame_embedding.shape[:2]\n ori_size = (height, width)\n obj_num = prev_frame_labels.shape[2]\n pad = paddle.ones([1]) * WRONG_LABEL_PADDING_DISTANCE\n if use_float16:\n query_embedding = paddle.cast(query_embedding, dtype=\"float16\")\n prev_frame_embedding = paddle.cast(prev_frame_embedding,\n dtype=\"float16\")\n pad = paddle.cast(pad, dtype=\"float16\")\n if allow_parallel:\n d = local_pairwise_distances_parallel(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance,\n atrous_rate=atrous_rate,\n allow_downsample=allow_downsample)" + }, + { + "comment": "This code calculates pairwise distances between query and previous frame embeddings. If the shape of the distances doesn't match the original size, it interpolates labels using nearest neighbor mode. The code then pads the labels with zeros to match the maximum distance considering the atrous rate.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":609-634", + "content": " else:\n d = local_pairwise_distances(query_embedding,\n prev_frame_embedding,\n max_distance=max_distance,\n atrous_rate=atrous_rate,\n allow_downsample=allow_downsample)\n height, width = d.shape[:2]\n labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), 1)\n labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]),\n axis=1)\n if (height, width) != ori_size:\n labels = F.interpolate(labels, size=(height, width), mode='nearest')\n pad_max_distance = max_distance - max_distance % atrous_rate\n atrous_max_distance = pad_max_distance // atrous_rate\n #no change pad\n padded_labels = F.pad(labels, (\n pad_max_distance,\n pad_max_distance,\n pad_max_distance,\n pad_max_distance,\n ),\n mode='constant',\n value=0)" + }, + { + "comment": "This code segment applies atrous spatial pyramid pooling in a PaddlePaddle implementation. 
It creates offset masks, performs element-wise masking, computes minimum distances, and reshapes the data for each local distance level to perform feature extraction at different scales.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":636-661", + "content": " offset_masks = paddle.transpose(\n paddle.reshape(\n F.unfold(padded_labels,\n kernel_sizes=[height, width],\n strides=[atrous_rate, atrous_rate]),\n [obj_num, height, width, -1]), [1, 2, 3, 0]) > 0.9\n d_tiled = paddle.expand(paddle.unsqueeze(\n d, axis=-1), [-1, -1, -1, obj_num]) # h, w, num_local_pos, obj_num\n d_masked = paddle.where(offset_masks, d_tiled, pad)\n dists = paddle.min(d_masked, axis=2)\n multi_dists = [\n paddle.unsqueeze(paddle.transpose(dists, [2, 0, 1]), axis=1)\n ] # n_objects, num_multi_local, h, w\n reshaped_d_masked = paddle.reshape(d_masked, [\n height, width, 2 * atrous_max_distance + 1, 2 * atrous_max_distance + 1,\n obj_num\n ])\n for local_dis in multi_local_distance[:-1]:\n local_dis = local_dis // atrous_rate\n start_idx = atrous_max_distance - local_dis\n end_idx = atrous_max_distance + local_dis + 1\n new_d_masked = paddle.reshape(\n reshaped_d_masked[:, :, start_idx:end_idx, start_idx:end_idx, :]," + }, + { + "comment": "This code performs image segmentation by reshaping and resizing the distance matrix, calculating minimum distances, and applying sigmoid activation. It also handles cases where input size is not the original size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":662-683", + "content": " reshaped_d_masked[:, :, start_idx:end_idx,\n start_idx:end_idx, :].shape)\n new_d_masked = paddle.reshape(new_d_masked,\n [height, width, -1, obj_num])\n new_dists = paddle.min(new_d_masked, axis=2)\n new_dists = paddle.unsqueeze(paddle.transpose(new_dists, [2, 0, 1]),\n axis=1)\n multi_dists.append(new_dists)\n multi_dists = paddle.concat(multi_dists, axis=1)\n multi_dists = (F.sigmoid(multi_dists +\n paddle.reshape(dis_bias, [-1, 1, 1, 1])) - 0.5) * 2\n if use_float16:\n multi_dists = paddle.cast(multi_dists, dtype=\"float32\")\n if (height, width) != ori_size:\n multi_dists = F.interpolate(multi_dists,\n size=ori_size,\n mode='bilinear',\n align_corners=True)\n multi_dists = paddle.transpose(multi_dists, perm=[2, 3, 0, 1])" + }, + { + "comment": "This function calculates the attention heads for each object in a given scene. It takes reference and previous embeddings and labels as input, along with an optional epsilon value. It then computes the positional and negative head values, divides them by their respective counts (positive and negative labels), and returns the resulting attention heads for each object. The epsilon is added to avoid division by zero in the calculations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":684-708", + "content": " multi_dists = paddle.reshape(multi_dists,\n [1, ori_size[0], ori_size[1], obj_num, -1])\n return multi_dists\ndef calculate_attention_head(ref_embedding,\n ref_label,\n prev_embedding,\n prev_label,\n epsilon=1e-5):\n ref_head = ref_embedding * ref_label\n ref_head_pos = paddle.sum(ref_head, axis=(2, 3))\n ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos\n ref_pos_num = paddle.sum(ref_label, axis=(2, 3))\n ref_neg_num = paddle.sum(1. 
- ref_label, axis=(2, 3))\n ref_head_pos = ref_head_pos / (ref_pos_num + epsilon)\n ref_head_neg = ref_head_neg / (ref_neg_num + epsilon)\n prev_head = prev_embedding * prev_label\n prev_head_pos = paddle.sum(prev_head, axis=(2, 3))\n prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos\n prev_pos_num = paddle.sum(prev_label, axis=(2, 3))\n prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3))" + }, + { + "comment": "This code calculates the attention head values for evaluation, where it sums up reference embeddings multiplied by their corresponding labels. It also accounts for positive and negative instances of reference embeddings by subtracting them from total sums. The final total_head is returned as a concatenated matrix.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":709-735", + "content": " prev_head_pos = prev_head_pos / (prev_pos_num + epsilon)\n prev_head_neg = prev_head_neg / (prev_neg_num + epsilon)\n total_head = paddle.concat(\n x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1)\n return total_head\ndef calculate_attention_head_for_eval(ref_embeddings,\n ref_labels,\n prev_embedding,\n prev_label,\n epsilon=1e-5):\n total_ref_head_pos = 0.\n total_ref_head_neg = 0.\n total_ref_pos_num = 0.\n total_ref_neg_num = 0.\n for idx in range(len(ref_embeddings)):\n ref_embedding = ref_embeddings[idx]\n ref_label = ref_labels[idx]\n ref_head = ref_embedding * ref_label\n ref_head_pos = paddle.sum(ref_head, axis=(2, 3))\n ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos\n ref_pos_num = paddle.sum(ref_label, axis=(2, 3))\n ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3))" + }, + { + "comment": "This code calculates and returns a total head value by accumulating reference (ref) head values and previous (prev) head values, then normalizing them. It handles potential zero-division cases with a small epsilon for stability. The resulting total head consists of reference positive (pos), reference negative (neg), previous positive (pos), and previous negative (neg) head components concatenated along axis 1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/utils.py\":736-753", + "content": " total_ref_head_pos = total_ref_head_pos + ref_head_pos\n total_ref_head_neg = total_ref_head_neg + ref_head_neg\n total_ref_pos_num = total_ref_pos_num + ref_pos_num\n total_ref_neg_num = total_ref_neg_num + ref_neg_num\n ref_head_pos = total_ref_head_pos / (total_ref_pos_num + epsilon)\n ref_head_neg = total_ref_head_neg / (total_ref_neg_num + epsilon)\n prev_head = prev_embedding * prev_label\n prev_head_pos = paddle.sum(prev_head, axis=(2, 3))\n prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos\n prev_pos_num = paddle.sum(prev_label, axis=(2, 3))\n prev_neg_num = paddle.sum(1. 
- prev_label, axis=(2, 3))\n prev_head_pos = prev_head_pos / (prev_pos_num + epsilon)\n prev_head_neg = prev_head_neg / (prev_neg_num + epsilon)\n total_head = paddle.concat(\n x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1)\n return total_head" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c151342c-384a-4441-b8d2-e548e4ddbe63.json b/docs/doc/c151342c-384a-4441-b8d2-e548e4ddbe63.json new file mode 100644 index 000000000..8cc4c168e --- /dev/null +++ b/docs/doc/c151342c-384a-4441-b8d2-e548e4ddbe63.json @@ -0,0 +1,40 @@ +{ + "summary": "The Video-Swin-Transformer model achieves SOTA accuracy on Kinetics-400, offering multi-scale modeling, efficient local attention features, and mixed-precision training. Code provides data prep, training, testing, and inference instructions for 8 GPUs, with pre-trained Swin-Transformer models available in PaddleVideo.", + "details": [ + { + "comment": "This is a model card for the Video-Swin-Transformer video classification model, based on Swin Transformer. It utilizes multi-scale modeling and efficient local attention features to achieve SOTA accuracy on Kinetics-400 dataset. The code provides information about data preparation, training, testing, and inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md\":0-32", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/videoswin.md) | English\n# Video-Swin-Transformer Video Classification Model\n## content\n- [Introduction](#Introduction)\n- [Data](#DATA)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nVideo-Swin-Transformer is a video classification model based on Swin Transformer. It utilizes Swin Transformer's multi-scale modeling and efficient local attention characteristics. It currently achieves SOTA accuracy on the Kinetics-400 data set, surpassing the same transformer structure. The TimeSformer model.\n![VideoSwin](../../../images/videoswin.jpg)\n## DATA\nK400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md)\n## Train\n### Kinetics-400 data set training\n#### Download and add pre-trained models\n1. Download the image pre-training model [swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams) as the Backbone initialization parameter, or download it through the wget command" + }, + { + "comment": "This code provides the steps to download a pretrained VideoSwin model, update its configuration file with the downloaded path, and finally start training it on the Kinetics400 dataset using 8 GPUs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md\":34-59", + "content": " ```bash\n wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams # ImageNet pretrained model for VideoSwin_base\n # wget https://videotag.bj.bcebos.com/PaddleVideorelease2.2/swin_small_patch4_window7_224.pdparams # Imagenet pretrained model for VideoSwin_small\n ```\n2. 
Open `configs/recognition/videoswin/videoswin_base_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:`\n ```yaml\n MODEL:\n framework: \"RecognizerTransformer\"\n backbone:\n name: \"SwinTransformer3D\"\n pretrained: fill in the path here\n ```\n#### Start training\n- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows:\n ```bash\n # videos data format\n python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_base main.py --validate -c configs/recognition/video_swin_transformer/videoswin_base_k400_videos.yaml\n ```\n- Turn o" + }, + { + "comment": "The code sets up mixed-precision training with specific flags for faster processing. It also provides command for running the PaddleVideo model, specifically Video-Swin-Transformer, on GPUs and customizable configuration files. The accuracy is verified during training by checking for the \"best\" keyword in the log.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md\":59-74", + "content": "n amp mixed-precision training to speed up the training process. The training start command is as follows:\n ```bash\n export FLAGS_conv_workspace_size_limit=800 # MB\n export FLAGS_cudnn_exhaustive_search=1\n export FLAGS_cudnn_batchnorm_spatial_persistent=1\n # videos data format\n python3.7 -u -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_base main.py --amp --validate -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml\n ```\n- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../../zh-CN/contribute/config.md) for parameter usage.\n## Test\n- The Video-Swin-Transformer model is verified during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows:" + }, + { + "comment": "Code snippet shows how to test the best Video-Swin-Transformer model after training, using a different sampling method (UniformCrop) for improved accuracy. The command provided demonstrates how to execute the test with specific configuration settings and input files, resulting in evaluation metrics on the Kinetics-400 validation dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md\":76-88", + "content": " ```log\n Already save the best model (top1 acc)0.7258\n ```\n- Since the sampling method of the Video-Swin-Transformer model test mode is a bit slower but more accurate **UniformCrop**, which is different from the **CenterCrop** used in the verification mode during the training process, so the verification recorded in the training log The index `topk Acc` does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. 
The command is as follows:\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_videoswin_base main.py --test -c configs/recognition/video_swin_transformer/videoswin_base_k400_videos.yaml -w \"output/VideoSwin_base/VideoSwin_base_best.pdparams\"\n ```\n When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows:\n | backbone | Sampling method | num_seg | target_s" + }, + { + "comment": "The table displays pre-trained model checkpoints for Swin-Transformer in PaddleVideo's model zoo, including the model size, input image size, top-1 accuracy, and corresponding URLs for downloading the pdparams files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md\":88-91", + "content": "ize | Top-1 | checkpoints | pretrain model |\n | :--------------------: | :-------------: | :-----: | :---------: | :---- | :------------------------------------------------------------------------------------------------------------------------: | :----: |\n | Swin-Transformer_base | UniformCrop | 32 | 224 | 82.40 | [SwinTransformer_k400_base.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_base_k400.pdparams) | [swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams) |\n | Swin-Transformer_small | UniformCrop | 32 | 224 | 80.18 | [SwinTransformer_k400_small.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_small_k400.pdparams) | [swin_small_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams) |" + }, + { + "comment": "This code snippet provides instructions for exporting an inference model and using the predictive engine inference in PaddleVideo. The first command generates the necessary files (`.pdmodel` and `.pdiparams`) required for prediction, while the second command performs the actual prediction on a given input video file with specified configuration and model files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md\":93-116", + "content": "## Inference\n### Export inference model\n```bash\npython3.7 tools/export_model.py -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml \\\n -p data/VideoSwin_base_k400.pdparams \\\n -o inference/VideoSwin_base\n```\nThe above command will generate the model structure file `VideoSwin_base.pdmodel` and the model weight file `VideoSwin_base.pdiparams` required for prediction.\n- For the meaning of each parameter, please refer to [Model Inference](../../usage.md#2-infer)\n### Use predictive engine inference\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/videoswin/videoswin_base_k400_videos.yaml \\\n --model_file inference/VideoSwin_base/VideoSwin_base.pdmodel \\\n --params_file inference/VideoSwin_base/VideoSwin_base.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```" + }, + { + "comment": "This code showcases an example of using the Video-Swin-Transformer model trained on Kinetics-400 to predict a video file. 
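A hypothetical helper (not part of PaddleVideo) showing how the predicted class id from `tools/predict.py` could be mapped back to a readable name using the Kinetics-400 label list file referenced in this doc; the `<id> <name>`-per-line layout is an assumption:

```python
# Assumed layout: one "<id> <name>" pair per line; adjust parsing if the real file differs.
def id_to_label(label_file, class_id):
    with open(label_file) as f:
        for line in f:
            idx, name = line.strip().split(maxsplit=1)
            if int(idx) == class_id:
                return name
    return None

# e.g. id_to_label("data/k400/Kinetics-400_label_list.txt", 5) might return "archery"
```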
The output includes the top-1 class and score, and referring to the category id and name correspondence table allows for identifying the predicted category name.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/videoswin.md\":118-130", + "content": "The output example is as follows:\n```log\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9999829530715942\n```\nIt can be seen that using the Video-Swin-Transformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By referring to the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`.\n## Reference\n- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c1634b2d-7ec7-4f71-9305-985386018d58.json b/docs/doc/c1634b2d-7ec7-4f71-9305-985386018d58.json new file mode 100644 index 000000000..0cc528b1c --- /dev/null +++ b/docs/doc/c1634b2d-7ec7-4f71-9305-985386018d58.json @@ -0,0 +1,15 @@ +{ + "summary": "This code registers various model types (backbones, heads, recognizers) using a Registry class for efficient organization and management in a larger model architecture or framework implementation. Registries are created for 'bbox_coder', 'estimator', 'multimodal', and 'segment'.", + "details": [ + { + "comment": "This code is registering different types of models (backbones, heads, recognizers, etc.) using a Registry class from the utils module. The Registry will help in organizing and managing these different model types efficiently. This code snippet seems to be part of a larger model architecture or framework implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/registry.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nBACKBONES = Registry('backbone')\nHEADS = Registry('head')\nRECOGNIZERS = Registry('recognizer')\nSEGMENTERS = Registry('Segmenters')\nLOCALIZERS = Registry('localizer')\nPARTITIONERS = Registry('partitioner')\nLOSSES = Registry('loss')\nROI_EXTRACTORS = Registry('roi_extractor')\nDETECTORS = Registry('detectors')\nBBOX_ASSIGNERS = Registry('bbox_assigner')\nBBOX_SAMPLERS = Registry('bbox_sampler')" + }, + { + "comment": "Registry is created for 'bbox_coder', 'estimator', 'multimodal', and 'segment'. 
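For readers unfamiliar with the pattern, these registries implement the usual register-by-name scheme: a decorator records each class under its name so configs can instantiate components from strings. A generic, self-contained sketch (the real `Registry` class lives in `paddlevideo/utils` and its exact API may differ):

```python
class Registry:
    def __init__(self, name):
        self.name = name
        self._modules = {}

    def register(self, cls):                # used as a decorator on component classes
        self._modules[cls.__name__] = cls
        return cls

    def get(self, name):
        return self._modules[name]

BACKBONES = Registry('backbone')

@BACKBONES.register
class ResNet:                               # placeholder component
    pass

print(BACKBONES.get('ResNet'))              # <class '__main__.ResNet'>
```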
These Registries organize and manage the different types of models or coding methods, allowing for easy access and maintenance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/registry.py\":27-30", + "content": "BBOX_CODERS = Registry('bbox_coder')\nESTIMATORS = Registry('estimator')\nMULTIMODAL = Registry('multimodal')\nSEGMENT = Registry('segment')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c179667d-88cb-428a-9539-25b36f5743ec.json b/docs/doc/c179667d-88cb-428a-9539-25b36f5743ec.json new file mode 100644 index 000000000..feb0ed66c --- /dev/null +++ b/docs/doc/c179667d-88cb-428a-9539-25b36f5743ec.json @@ -0,0 +1,40 @@ +{ + "summary": "The Temporal Shift Module (TSM) is an efficient video understanding technique that balances performance and efficiency, capturing spatial-temporal features. Suitable for both online and offline videos, it focuses on temporal information and has a simple 2-line implementation.", + "details": [ + { + "comment": "This code snippet provides background and motivation for TSM (Temporal Shift Module), a classic model in video understanding proposed by MIT and IBM Watson AI Lab. The TSM aims to balance efficiency and performance while improving the ability to analyze video content in various dimensions. It is related to the Temporal Segment Network (TSN) published by Limin Wang.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md\":0-4", + "content": "# 1. Background&Motivation\nAt present, the video data on the Internet is increasing rapidly, and the time users spend watching short videos and small videos is also increasing rapidly. How to analyze, process and classify the massive video resources quickly and accurately is an urgent problem to be solved. The video understanding technology can analyze the video content in multiple dimensions, understand the video semantics, and automatically classify and label the video, which greatly saves the efficiency of manual audit and costs. At the same time, accurate user recommendation is realized to improve the experience effect.\nIn this paper, we will introduce the classic model **TSM (Temporal Shift Module)** in the field of video understanding, which is proposed by **MIT** and **IBM Watson AI Lab** `Ji Lin, Chuang Gan and Songhan, etc`, to achieve the balance between effeiciency and performance and improve video understanding ability.\nThe most relevant video understanding model to TSM is the **Temporal Segment Network (TSN)** published by Limin Wang" + }, + { + "comment": "This code describes the Temporal Shift Module (TSM), which is a method for efficient video understanding that avoids extra computation by using temporal dimension feature map shift. It is based on the concept of capturing spatial-temporal features, with a focus on temporal information in videos. This approach aims to achieve feature fusion and joint modeling among different frames without adding extra computational overhead compared to TSN.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md\":5-9", + "content": "a series of works represented such as I3D, S3D and P3D, which carry out end-to-end joint spatial-temporal modeling through 3D convolution. Although this series of works can capture spatial-temporal features, compared with TSN, the transition from 2D convolution to 3D convolution inevitably introduces extra computation. 
TSM cleverly uses the idea of temporal dimension feature map shift, theoretically achieving the purpose of feature fusion and joint modeling among different frames with zero extra computing overhead compared with TSN.\n**Paper Address:** [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383v2.pdf)\nLet's have a look at the following example: if the video is played from left to right and then from right to left respectively, the subjects will give different but correct interpretation of the video, indicating that the understanding of the video is strongly dependent on the temporal information of the video. Yes !, It is the motivation why TSM is proposed." + }, + { + "comment": "The code is an introduction to Temporal Shift Module (TSM) in video understanding, highlighting the trade-offs between 2D and 3D CNN methods, and how TSM embeds time displacement into 2D CNN for equivalent performance without additional computation or parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md\":10-20", + "content": "
\nIt looks interesting; next, let's dive into the core modules of TSM.\n# 2. Dark technologies used in TSM\nOn the basis of traditional image analysis, video analysis requires researchers to add structures that model temporal information. At present, 2D CNNs and 3D CNNs are the two most commonly used approaches in video understanding: a 2D CNN requires less computation but loses part of the temporal information, while a 3D CNN works well but is computationally expensive. Faced with this trade-off, Ji Lin, Chuang Gan and Song Han et al. from MIT and IBM Watson AI Lab proposed the Temporal Shift Module (TSM). By embedding the temporal shift module into a 2D CNN, they achieve video understanding ability comparable to a 3D CNN without adding any extra computation or parameters.\n
" + }, + { + "comment": "This code describes the TSM (Temporal Segment Networks) module, which introduces context interaction on the temporal dimension in feature graphs. It does this by moving some channels forward and backward one step in the temporal dimension, filling gaps with zeros. The channel movement allows 2D convolution to extract spatial-temporal information like 3D convolution. This improves model ability in time dimension and has TSM modules suitable for online and offline videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md\":21-26", + "content": "
[figure: TSM temporal shift of feature-map channels (image not preserved)]
\nThe rows and columns of the matrix in the figure above represent the temporal and channel dimensions of the feature map, respectively. In the TSM module, some channels are moved forward one step along the temporal dimension, some channels are moved backward one step along the temporal dimension, and the gaps left by the displacement are filled with zeros. In this way, context interaction along the temporal dimension is introduced into the feature map: the channel shift makes the current frame contain channel information from its two adjacent frames, so a 2D convolution can directly extract spatial-temporal information from the video, just like a 3D convolution.\nThis improves the model's ability to reason along the time dimension. On this basis, the researchers further subdivided the design into a TSM module suitable for online video and a TSM module suitable for offline video.\n
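To make the shift described above concrete, here is a minimal NumPy sketch of the idea (not taken from PaddleVideo; the function name, the tiny 4-frame example, and the one-channel-each-way split are illustrative assumptions):

```python
import numpy as np

def temporal_shift_numpy(feat, fold_div=4):
    """feat: (T, C) features for one spatial location.
    Shift C//fold_div channels forward in time, the next C//fold_div backward,
    keep the remaining channels in place, and zero-fill the gaps."""
    T, C = feat.shape
    fold = C // fold_div
    out = np.zeros_like(feat)
    out[1:, :fold] = feat[:-1, :fold]                  # forward: frame t sees frame t-1
    out[:-1, fold:2 * fold] = feat[1:, fold:2 * fold]  # backward: frame t sees frame t+1
    out[:, 2 * fold:] = feat[:, 2 * fold:]             # untouched channels stay in place
    return out

feats = np.arange(12, dtype=np.float32).reshape(4, 3)  # 4 frames, 3 channels
print(temporal_shift_numpy(feats, fold_div=3))         # note the zero-filled gaps
```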
" + }, + { + "comment": "Bi-Direction TSM module handles past and future spatial and temporal information, suitable for high throughput offline videos. UNI-Direction TSM is more appropriate for low delay online video recognition. Residual TSM performs better than in-place TSM but may affect spatial information extraction. Torch version tsm implementation to follow.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md\":27-39", + "content": "
\nThe bi-directional TSM module can access both past and future spatial-temporal information, which makes it suitable for high-throughput offline video. The uni-directional TSM module, by contrast, only mixes the current frame with past information, so it is the appropriate choice for low-latency online video recognition.\nIn addition, the authors also considered where to insert the TSM module and compared two insertion methods: **Residual TSM** and **in-place TSM**. They found that **Residual TSM** achieves better performance than **in-place TSM**, and explained that **in-place TSM** may harm the extraction of spatial information.\n
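A rough sketch of the two insertion styles is given below, assuming a generic 2D convolution block and the commonly used 1/8 shift ratio; this is an illustration of the idea, not PaddleVideo's actual block definition:

```python
import paddle
import paddle.nn.functional as F

def in_place_tsm(x, conv, num_seg):
    # In-place: the shifted feature replaces the original one before the conv,
    # so part of the current frame's spatial information is overwritten.
    x = F.temporal_shift(x, num_seg, 1.0 / 8)
    return conv(x)

def residual_tsm(x, conv, num_seg):
    # Residual: the shift only happens inside the residual branch;
    # the identity path still carries the unshifted spatial feature.
    return x + conv(F.temporal_shift(x, num_seg, 1.0 / 8))

x = paddle.randn([8, 16, 7, 7])                  # (N*T, C, H, W) with N=1 video, T=8 frames
block = paddle.nn.Conv2D(16, 16, 3, padding=1)   # stand-in for a real residual-branch conv
print(residual_tsm(x, block, num_seg=8).shape)   # [8, 16, 7, 7]
```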
\nTSM module looks **So Easy!!**, the next question is how to implement ?\n# 3. The core codes of TSM\nNow that the principle is clear, let's look at how the code works. First let's have a look the torch version tsm. Unfortunately, the Torch fr" + }, + { + "comment": "The code demonstrates a TSM model implementation in PaddlePaddle framework, allowing users to achieve Temporal Shift Module (TSM) operations without writing additional code. It significantly improves accuracy and efficiency on Something-Something datasets. The provided images visually explain the TSM implementation and the optimized version (TSM OP). Additionally, the documentation refers users to the acceleration documentation for further information on speeding up the model while reducing memory consumption.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md\":39-57", + "content": "amework does not provide an API for TSM, so we will have to do it by ourselves. The code is shown below:\n
[figure: hand-written PyTorch temporal-shift code (screenshot not preserved)]
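Because the screenshot is missing, here is a hedged reconstruction of what such a hand-written shift typically looks like in PyTorch; it follows the description in this tutorial and is not necessarily the exact code that was pictured:

```python
import torch

def manual_temporal_shift(x, n_segment, fold_div=8):
    # x: (N*T, C, H, W) features of a video batch flattened over time.
    nt, c, h, w = x.size()
    x = x.view(nt // n_segment, n_segment, c, h, w)
    fold = c // fold_div
    out = torch.zeros_like(x)
    out[:, 1:, :fold] = x[:, :-1, :fold]                  # shift one step forward in time
    out[:, :-1, fold:2 * fold] = x[:, 1:, fold:2 * fold]  # shift one step backward in time
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # leave the remaining channels alone
    return out.view(nt, c, h, w)
```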
\nThis means that you only need to add four lines of code to TSN's codebase, and you can **double the accuracy on the Something-Something dataset!!** What a simple and efficient model!\nBut...\nthe **paddlepaddle** framework takes the needs of the majority of users into account and already provides a TSM OP, so users can call it easily.\n
[figure: PaddlePaddle temporal_shift OP (screenshot not preserved)]
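For reference, a minimal usage sketch of the built-in operator, mirroring the `F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)` call quoted later in this file; the batch shape and segment count are illustrative assumptions:

```python
import paddle
import paddle.nn.functional as F

num_seg = 8                                        # frames sampled per video (illustrative)
inputs = paddle.randn([2 * num_seg, 64, 56, 56])   # (N*T, C, H, W) for N=2 videos
shifts = F.temporal_shift(inputs, num_seg, 1.0 / num_seg)
print(shifts.shape)                                # same shape as the input: [16, 64, 56, 56]
```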
\nSo you no longer have to achieve it by yourself, **it cab be called directly!!! , it can be called directly!!! , it can be called directly!!!** The important thing must say three times.\nDo you think that it is the end of the this topic? **Too young Too simple !!!**\nWe have also optimized it to increase speed by 5 times while reducing memory consumption. See the acceleration documentation [accelerate.md](./accelerate.md) for more information." + }, + { + "comment": "The provided code demonstrates an implementation of the Temporal Shift Module (TSM) in PaddlePaddle. It only requires two lines of code and uses the `temporal_shift` function from `paddle.nn.functional`. This makes it easy to implement the TSM for efficient video understanding, as referenced by Lin Ji et al. and Limin Wang et al.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/TSM.md\":59-72", + "content": "Let's have a look at how TSM is implemented using **paddlepaddle**:\n`import paddle.nn.functional as F`\n`shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)`\n**Only two lines codes !!!**, isn't it easy ?\n# Reference\n[1] [Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018](https://arxiv.org/pdf/1811.08383v2.pdf).\n[2] [Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoo Tang,and Luc Van Gool. Temporal segment networks for action recognition in videos? In Proceedings of the European Conference on Computer Vision,pages 20\u201336. Springer, 2016](https://arxiv.org/abs/1608.00859)." + } + ] +} \ No newline at end of file diff --git a/docs/doc/c17f64d9-ec50-4c57-8589-cf6f4c226113.json b/docs/doc/c17f64d9-ec50-4c57-8589-cf6f4c226113.json new file mode 100644 index 000000000..ce206df19 --- /dev/null +++ b/docs/doc/c17f64d9-ec50-4c57-8589-cf6f4c226113.json @@ -0,0 +1,15 @@ +{ + "summary": "CrossEntropyLoss is a custom loss function in PaddlePaddle, inheriting from BaseWeightedLoss, for classification tasks. It calculates CrossEntropy loss between scores and labels using F.cross_entropy method and returns the result as a tensor.", + "details": [ + { + "comment": "CrossEntropyLoss is a custom loss function in PaddlePaddle for classification tasks. It inherits from BaseWeightedLoss and has a forward method that takes class scores and labels as input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/cross_entropy_loss.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass CrossEntropyLoss(BaseWeightedLoss):\n \"\"\"Cross Entropy Loss.\"\"\"\n def _forward(self, score, labels, **kwargs):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels.\n kwargs: Any keyword argument to be used to calculate" + }, + { + "comment": "This function calculates the CrossEntropy loss between score and labels, using Paddle's F.cross_entropy method, and returns the result as a tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/cross_entropy_loss.py\":30-35", + "content": " CrossEntropy loss.\n Returns:\n loss (paddle.Tensor): The returned CrossEntropy loss.\n \"\"\"\n loss = F.cross_entropy(score, labels, **kwargs)\n return loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c195ce2b-9608-498c-b992-61b33f2343e1.json b/docs/doc/c195ce2b-9608-498c-b992-61b33f2343e1.json new file mode 100644 index 000000000..b61cc9ccc --- /dev/null +++ b/docs/doc/c195ce2b-9608-498c-b992-61b33f2343e1.json @@ -0,0 +1,25 @@ +{ + "summary": "This PaddleVideo library code initializes a dataset class for action segmentation videos, includes methods to load data for training/validation, and loads video features, labels, and boundaries using a pipeline.", + "details": [ + { + "comment": "The code imports necessary libraries and defines the ASRFDataset class for action segmentation video datasets. It registers this dataset with the DATASETS registry and initializes the dataset with specified file paths and pipeline parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/asrf_dataset.py\":0-37", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport os\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass ASRFDataset(BaseDataset):\n \"\"\"Video dataset for action segmentation.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n feature_path,\n label_path,\n boundary_path,\n **kwargs," + }, + { + "comment": "The code initializes an instance of a dataset class with file paths for labels, boundaries, and features. It defines methods to load index files containing video information and prepare data for training/validation, including loading video features, labels, and boundaries based on the given index.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/asrf_dataset.py\":38-67", + "content": " ):\n super().__init__(file_path, pipeline, **kwargs)\n self.label_path = label_path\n self.boundary_path = boundary_path\n self.feature_path = feature_path\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n file_ptr = open(self.file_path, 'r')\n info = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID: Prepare data for training/valid given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)\n #TODO: check path\n video_feat = np.load(feat_file_path)\n # load label\n file_name = video_name.split('.')[0] + \".npy\"\n label_file_path = os.path.join(self.label_path, file_name)\n label = np.load(label_file_path).astype(np.int64)\n # load boundary\n file_name = video_name.split('.')[0] + \".npy\"" + }, + { + "comment": "The code above is from a dataset loader class in the PaddleVideo library. It loads video features, labels, and boundaries for either training or testing data. The prepare_test function loads video features and labels given an index. 
The code uses numpy to load data from specified file paths and deepcopy the results for further processing by the pipeline function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/asrf_dataset.py\":68-91", + "content": " boundary_file_path = os.path.join(self.boundary_path, file_name)\n boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_label'] = copy.deepcopy(label)\n results['video_boundary'] = copy.deepcopy(boundary)\n results = self.pipeline(results)\n return results['video_feat'], results['video_label'], results['video_boundary']\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n results = {}\n video_name = self.info[idx]\n # load video feature\n file_name = video_name.split('.')[0] + \".npy\"\n feat_file_path = os.path.join(self.feature_path, file_name)\n #TODO: check path\n video_feat = np.load(feat_file_path)\n # load label\n file_name = video_name.split('.')[0] + \".npy\"\n label_file_path = os.path.join(self.label_path, file_name)\n label = np.load(label_file_path).astype(np.int64)" + }, + { + "comment": "This code snippet loads the boundary data for a video, reads it from a file using numpy's load function, and assigns it to a variable named 'boundary'. The code then creates a results dictionary, copies video features, labels, and boundaries into their respective keys in the results dictionary. Finally, it passes this dictionary through a pipeline and returns the video features, labels, and boundaries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/asrf_dataset.py\":93-103", + "content": " # load boundary\n file_name = video_name.split('.')[0] + \".npy\"\n boundary_file_path = os.path.join(self.boundary_path, file_name)\n boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)\n results['video_feat'] = copy.deepcopy(video_feat)\n results['video_label'] = copy.deepcopy(label)\n results['video_boundary'] = copy.deepcopy(boundary)\n results = self.pipeline(results)\n return results['video_feat'], results['video_label'], results['video_boundary']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c22bdd3d-0b55-49b5-906d-f139a46e5ffe.json b/docs/doc/c22bdd3d-0b55-49b5-906d-f139a46e5ffe.json new file mode 100644 index 000000000..e16c40b8b --- /dev/null +++ b/docs/doc/c22bdd3d-0b55-49b5-906d-f139a46e5ffe.json @@ -0,0 +1,30 @@ +{ + "summary": "The code defines a model and calculates loss, F1 score, and edit scores for recognition tasks. It retrieves label start/end times from recognized and ground truth sequences, then iterates through labels to calculate F-score for overlapping segments, updating tp, fp, fn counts and returning the F-score as a float value.", + "details": [ + { + "comment": "This code defines the MSTCNHead class, a head for PaddleVideo's MS-TCN model. It inherits from BaseHead and initializes a CrossEntropyLoss and Mean Squared Error loss function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py\":0-32", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom paddle import ParamAttr\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass MSTCNHead(BaseHead):\n def __init__(self, num_classes, in_channels):\n super().__init__(num_classes, in_channels)\n self.ce = nn.CrossEntropyLoss(ignore_index=-100)\n self.mse = nn.MSELoss(reduction='none')" + }, + { + "comment": "The code defines a class for the MS-TCN head, which calculates loss and F1 score. The forward function returns the input as is. The loss function transposes output tensor, computes cross-entropy (CE) loss, and adds mean squared error (MSE) loss with weight 0.15. The get_F1_score function converts predicted and ground truth to lists, counts correct classifications, and returns F1 score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py\":33-67", + "content": " self.num_classes = num_classes\n # cls score\n self.overlap = 0.5\n def forward(self, x):\n \"\"\"MS-TCN no head\n \"\"\"\n return x\n def loss(self, output, video_gt):\n \"\"\"calculate loss\n \"\"\"\n output_transpose = paddle.transpose(output, [2, 0, 1])\n ce_x = paddle.reshape(output_transpose,\n (output_transpose.shape[0] *\n output_transpose.shape[1], self.num_classes))\n ce_y = video_gt[0, :]\n ce_loss = self.ce(ce_x, ce_y)\n loss = ce_loss\n mse = self.mse(F.log_softmax(output[:, :, 1:], axis=1),\n F.log_softmax(output.detach()[:, :, :-1], axis=1))\n mse = paddle.clip(mse, min=0, max=16)\n mse_loss = 0.15 * paddle.mean(mse)\n loss += mse_loss\n return loss\n def get_F1_score(self, predicted, groundTruth):\n recog_content = list(predicted.numpy())\n gt_content = list(groundTruth[0].numpy())\n # cls score\n correct = 0" + }, + { + "comment": "This code calculates the F1 score based on a given sequence of content and then extracts labels, start times, and end times from frame-wise labels. It iterates through the sequence to determine correct and incorrect elements, as well as false positives and negatives for the F1 score calculation. 
The extracted labels, starts, and ends are stored in separate lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py\":68-104", + "content": " total = 0\n edit = 0\n for i in range(len(gt_content)):\n total += 1\n if gt_content[i] == recog_content[i]:\n correct += 1\n edit_num = self.edit_score(recog_content, gt_content)\n edit += edit_num\n tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap)\n # cls metric\n precision = tp / float(tp + fp)\n recall = tp / float(fp + fn)\n if precision + recall > 0.0:\n f1 = 2.0 * (precision * recall) / (precision + recall)\n else:\n f1 = 0.0\n f1 = np.nan_to_num(f1)\n return f1\n def get_labels_start_end_time(self, frame_wise_labels):\n labels = []\n starts = []\n ends = []\n last_label = frame_wise_labels[0]\n labels.append(frame_wise_labels[0])\n starts.append(0)\n for i in range(len(frame_wise_labels)):\n if frame_wise_labels[i] != last_label:\n labels.append(frame_wise_labels[i])\n starts.append(i)" + }, + { + "comment": "This code defines two functions: \"labels_start_end\" and \"edit_score\". The first function takes in frame-wise labels, starts, and ends and returns the labels, starts, and ends. The second function calculates the edit score between recognized text and ground truth using a dynamic programming approach, specifically Levenshtein distance algorithm. It normalizes the scores if norm is True, and returns the unnormalized score otherwise.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py\":105-136", + "content": " ends.append(i)\n last_label = frame_wise_labels[i]\n ends.append(i + 1)\n return labels, starts, ends\n def levenstein(self, p, y, norm=False):\n m_row = len(p)\n n_col = len(y)\n D = np.zeros([m_row + 1, n_col + 1], np.float)\n for i in range(m_row + 1):\n D[i, 0] = i\n for i in range(n_col + 1):\n D[0, i] = i\n for j in range(1, n_col + 1):\n for i in range(1, m_row + 1):\n if y[j - 1] == p[i - 1]:\n D[i, j] = D[i - 1, j - 1]\n else:\n D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1,\n D[i - 1, j - 1] + 1)\n if norm:\n score = (1 - D[-1, -1] / max(m_row, n_col)) * 100\n else:\n score = D[-1, -1]\n return score\n def edit_score(self, recognized, ground_truth, norm=True):\n P, _, _ = self.get_labels_start_end_time(recognized)\n Y, _, _ = self.get_labels_start_end_time(ground_truth)" + }, + { + "comment": "This code calculates the F-score for overlapping segments of labels in two sequences. It first retrieves the start and end times for each label in the recognized and ground truth sequences, then iterates through each label in the recognized sequence to calculate the intersection and union between the current recognized segment and each segment in the ground truth sequence. 
The code then determines if there is an overlap between the segments, updates the true positive (tp), false positive (fp), and false negative (fn) counts accordingly, and finally returns the F-score as a float value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py\":137-164", + "content": " return self.levenstein(P, Y, norm)\n def f_score(self, recognized, ground_truth, overlap):\n p_label, p_start, p_end = self.get_labels_start_end_time(recognized)\n y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth)\n tp = 0\n fp = 0\n hits = np.zeros(len(y_label))\n for j in range(len(p_label)):\n intersection = np.minimum(p_end[j], y_end) - np.maximum(\n p_start[j], y_start)\n union = np.maximum(p_end[j], y_end) - np.minimum(\n p_start[j], y_start)\n IoU = (1.0 * intersection / union) * (\n [p_label[j] == y_label[x] for x in range(len(y_label))])\n # Get the best scoring segment\n idx = np.array(IoU).argmax()\n if IoU[idx] >= overlap and not hits[idx]:\n tp += 1\n hits[idx] = 1\n else:\n fp += 1\n fn = len(y_label) - sum(hits)\n return float(tp), float(fp), float(fn)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c2429992-e65c-4762-a20d-99a8ffe36e80.json b/docs/doc/c2429992-e65c-4762-a20d-99a8ffe36e80.json new file mode 100644 index 000000000..fd1816002 --- /dev/null +++ b/docs/doc/c2429992-e65c-4762-a20d-99a8ffe36e80.json @@ -0,0 +1,35 @@ +{ + "summary": "This code converts PaddleVideo's JSON files to training data, exports a model for PP-Human, and organizes it in directories suitable for behavior recognition inference.", + "details": [ + { + "comment": "Training behavior recognition model using ST-GCN on PaddleVideo.\nPrepare training data in Numpy format with dimensions (N,C,T,V,M).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/README.md\":0-20", + "content": "# PP-Human \u884c\u4e3a\u8bc6\u522b\u6a21\u578b\n\u5b9e\u65f6\u884c\u4eba\u5206\u6790\u5de5\u5177[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)\u4e2d\u96c6\u6210\u4e86\u57fa\u4e8e\u9aa8\u9abc\u70b9\u7684\u884c\u4e3a\u8bc6\u522b\u6a21\u5757\u3002\u672c\u6587\u6863\u4ecb\u7ecd\u5982\u4f55\u57fa\u4e8e[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo/)\uff0c\u5b8c\u6210\u884c\u4e3a\u8bc6\u522b\u6a21\u578b\u7684\u8bad\u7ec3\u6d41\u7a0b\u3002\n## \u884c\u4e3a\u8bc6\u522b\u6a21\u578b\u8bad\u7ec3\n\u76ee\u524d\u884c\u4e3a\u8bc6\u522b\u6a21\u578b\u4f7f\u7528\u7684\u662f[ST-GCN](https://arxiv.org/abs/1801.07455)\uff0c\u5e76\u5728[PaddleVideo\u8bad\u7ec3\u6d41\u7a0b](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/stgcn.md)\u7684\u57fa\u7840\u4e0a\u4fee\u6539\u9002\u914d\uff0c\u5b8c\u6210\u6a21\u578b\u8bad\u7ec3\u3002\n### \u51c6\u5907\u8bad\u7ec3\u6570\u636e\nSTGCN\u662f\u4e00\u4e2a\u57fa\u4e8e\u9aa8\u9abc\u70b9\u5750\u6807\u5e8f\u5217\u8fdb\u884c\u9884\u6d4b\u7684\u6a21\u578b\u3002\u5728PaddleVideo\u4e2d\uff0c\u8bad\u7ec3\u6570\u636e\u4e3a\u91c7\u7528`.npy`\u683c\u5f0f\u5b58\u50a8\u7684`Numpy`\u6570\u636e\uff0c\u6807\u7b7e\u5219\u53ef\u4ee5\u662f`.npy`\u6216`.pkl`\u683c\u5f0f\u5b58\u50a8\u7684\u6587\u4ef6\u3002\u5bf9\u4e8e\u5e8f\u5217\u6570\u636e\u7684\u7ef4\u5ea6\u8981\u6c42\u4e3a`(N,C,T,V,M)`\u3002\n\u4ee5\u6211\u4eec\u5728PPhuman\u4e2d\u7684\u6a21\u578b\u4e3a\u4f8b\uff0c\u5176\u4e2d\u5177\u4f53\u8bf4\u660e\u5982\u4e0b\uff1a\n| \u7ef4\u5ea6 | \u5927\u5c0f | \u8bf4\u660e |\n| ---- | ---- | ---------- |\n| N | \u4e0d\u5b9a | 
\u6570\u636e\u96c6\u5e8f\u5217\u4e2a\u6570 |\n| C | 2 | \u5173\u952e\u70b9\u5750\u6807\u7ef4\u5ea6\uff0c\u5373(x, y) |\n| T | 50 | \u52a8\u4f5c\u5e8f\u5217\u7684\u65f6\u5e8f\u7ef4\u5ea6\uff08\u5373\u6301\u7eed\u5e27\u6570\uff09|\n| V | 17 | \u6bcf\u4e2a\u4eba\u7269\u5173\u952e\u70b9\u7684\u4e2a\u6570\uff0c\u8fd9\u91cc\u6211\u4eec\u4f7f\u7528\u4e86`COCO`\u6570\u636e\u96c6\u7684\u5b9a\u4e49\uff0c\u5177\u4f53\u53ef\u89c1[\u8fd9\u91cc](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/docs/tutorials/PrepareKeypointDataSet_cn.md#COCO%E6%95%B0%E6%8D%AE%E9%9B%86) |\n| M | 1 | \u4eba\u7269\u4e2a\u6570\uff0c\u8fd9\u91cc\u6211\u4eec\u6bcf\u4e2a\u52a8\u4f5c\u5e8f\u5217\u53ea\u9488\u5bf9\u5355\u4eba\u9884\u6d4b |\n#### 1. \u83b7\u53d6\u5e8f\u5217\u7684\u9aa8\u9abc\u70b9\u5750\u6807\n\u5bf9\u4e8e\u4e00\u4e2a\u5f85\u6807\u6ce8\u7684\u5e8f\u5217\uff08\u8fd9\u91cc\u5e8f\u5217\u6307\u4e00\u4e2a\u52a8\u4f5c\u7247\u6bb5\uff0c\u53ef\u4ee5\u662f\u89c6\u9891\u6216\u6709\u987a\u5e8f\u7684\u56fe\u7247\u96c6\u5408\uff09\u3002\u53ef\u4ee5\u901a\u8fc7\u6a21\u578b\u9884\u6d4b\u6216\u4eba\u5de5\u6807\u6ce8\u7684\u65b9\u5f0f\u83b7\u53d6\u9aa8\u9abc\u70b9\uff08\u4e5f\u79f0\u4e3a\u5173\u952e\u70b9\uff09\u5750\u6807\u3002" + }, + { + "comment": "The code describes the process of preparing data for PP-Human, a human action detection model. It involves obtaining key points from pre-trained models or manual annotations, normalizing the coordinates, setting a uniform sequence length, and saving the data in PaddleVideo compatible format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/README.md\":21-41", + "content": "- \u6a21\u578b\u9884\u6d4b\uff1a\u53ef\u4ee5\u76f4\u63a5\u9009\u7528[PaddleDetection KeyPoint\u6a21\u578b\u7cfb\u5217](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/keypoint) \u6a21\u578b\u5e93\u4e2d\u7684\u6a21\u578b\uff0c\u5e76\u6839\u636e`3\u3001\u8bad\u7ec3\u4e0e\u6d4b\u8bd5 - \u90e8\u7f72\u9884\u6d4b - \u68c0\u6d4b+keypoint top-down\u6a21\u578b\u8054\u5408\u90e8\u7f72`\u4e2d\u7684\u6b65\u9aa4\u83b7\u53d6\u76ee\u6807\u5e8f\u5217\u768417\u4e2a\u5173\u952e\u70b9\u5750\u6807\u3002\n- \u4eba\u5de5\u6807\u6ce8\uff1a\u82e5\u5bf9\u5173\u952e\u70b9\u7684\u6570\u91cf\u6216\u662f\u5b9a\u4e49\u6709\u5176\u4ed6\u9700\u6c42\uff0c\u4e5f\u53ef\u4ee5\u76f4\u63a5\u4eba\u5de5\u6807\u6ce8\u5404\u4e2a\u5173\u952e\u70b9\u7684\u5750\u6807\u4f4d\u7f6e\uff0c\u6ce8\u610f\u5bf9\u4e8e\u88ab\u906e\u6321\u6216\u8f83\u96be\u6807\u6ce8\u7684\u70b9\uff0c\u4ecd\u9700\u8981\u6807\u6ce8\u4e00\u4e2a\u5927\u81f4\u5750\u6807\uff0c\u5426\u5219\u540e\u7eed\u7f51\u7edc\u5b66\u4e60\u8fc7\u7a0b\u4f1a\u53d7\u5230\u5f71\u54cd\u3002\n\u5728\u5b8c\u6210\u9aa8\u9abc\u70b9\u5750\u6807\u7684\u83b7\u53d6\u540e\uff0c\u5efa\u8bae\u6839\u636e\u5404\u4eba\u7269\u7684\u68c0\u6d4b\u6846\u8fdb\u884c\u5f52\u4e00\u5316\u5904\u7406\uff0c\u4ee5\u6d88\u9664\u4eba\u7269\u4f4d\u7f6e\u3001\u5c3a\u5ea6\u7684\u5dee\u5f02\u7ed9\u7f51\u7edc\u5e26\u6765\u7684\u6536\u655b\u96be\u5ea6\uff0c\u8fd9\u4e00\u6b65\u53ef\u4ee5\u53c2\u8003[\u8fd9\u91cc](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/pphuman/pipe_utils.py#L352-L363)\u3002\n#### 2. 
\u7edf\u4e00\u5e8f\u5217\u7684\u65f6\u5e8f\u957f\u5ea6\n\u7531\u4e8e\u5b9e\u9645\u6570\u636e\u4e2d\u6bcf\u4e2a\u52a8\u4f5c\u7684\u957f\u5ea6\u4e0d\u4e00\uff0c\u9996\u5148\u9700\u8981\u6839\u636e\u60a8\u7684\u6570\u636e\u548c\u5b9e\u9645\u573a\u666f\u9884\u5b9a\u65f6\u5e8f\u957f\u5ea6\uff08\u5728PP-Human\u4e2d\u6211\u4eec\u91c7\u752850\u5e27\u4e3a\u4e00\u4e2a\u52a8\u4f5c\u5e8f\u5217\uff09\uff0c\u5e76\u5bf9\u6570\u636e\u505a\u4ee5\u4e0b\u5904\u7406\uff1a\n- \u5b9e\u9645\u957f\u5ea6\u8d85\u8fc7\u9884\u5b9a\u957f\u5ea6\u7684\u6570\u636e\uff0c\u968f\u673a\u622a\u53d6\u4e00\u4e2a50\u5e27\u7684\u7247\u6bb5\n- \u5b9e\u9645\u957f\u5ea6\u4e0d\u8db3\u9884\u5b9a\u957f\u5ea6\u7684\u6570\u636e\uff1a\u88650\uff0c\u76f4\u5230\u6ee1\u8db350\u5e27\n- \u6070\u597d\u7b49\u4e8e\u9884\u5b9a\u957f\u5ea6\u7684\u6570\u636e\uff1a \u65e0\u9700\u5904\u7406\n\u6ce8\u610f\uff1a\u5728\u8fd9\u4e00\u6b65\u5b8c\u6210\u540e\uff0c\u8bf7\u4e25\u683c\u786e\u8ba4\u5904\u7406\u540e\u7684\u6570\u636e\u4ecd\u7136\u5305\u542b\u4e86\u4e00\u4e2a\u5b8c\u6574\u7684\u884c\u4e3a\u52a8\u4f5c\uff0c\u4e0d\u4f1a\u4ea7\u751f\u9884\u6d4b\u4e0a\u7684\u6b67\u4e49\uff0c\u5efa\u8bae\u901a\u8fc7\u53ef\u89c6\u5316\u6570\u636e\u7684\u65b9\u5f0f\u8fdb\u884c\u786e\u8ba4\u3002\n#### 3. \u4fdd\u5b58\u4e3aPaddleVideo\u53ef\u7528\u7684\u6587\u4ef6\u683c\u5f0f\n\u5728\u7ecf\u8fc7\u524d\u4e24\u6b65\u5904\u7406\u540e\uff0c\u6211\u4eec\u5f97\u5230\u4e86\u6bcf\u4e2a\u4eba\u7269\u52a8\u4f5c\u7247\u6bb5\u7684\u6807\u6ce8\uff0c\u6b64\u65f6\u6211\u4eec\u5df2\u6709\u4e00\u4e2a\u5217\u8868`all_kpts`\uff0c\u8fd9\u4e2a\u5217\u8868\u4e2d\u5305\u542b\u591a\u4e2a\u5173\u952e\u70b9\u5e8f\u5217\u7247\u6bb5\uff0c\u5176\u4e2d\u6bcf\u4e00\u4e2a\u7247\u6bb5\u5f62\u72b6\u4e3a(T, V, C) \uff08\u5728\u6211\u4eec\u7684\u4f8b\u5b50\u4e2d\u5373(50, 17, 2)), \u4e0b\u9762\u8fdb\u4e00\u6b65\u5c06\u5176\u8f6c\u5316\u4e3aPaddleVideo\u53ef\u7528\u7684\u683c\u5f0f\u3002\n- \u8c03\u6574\u7ef4\u5ea6\u987a\u5e8f\uff1a \u53ef\u901a\u8fc7`np.transpose`\u548c`np.expand_dims`\u5c06\u6bcf\u4e00\u4e2a\u7247\u6bb5\u7684\u7ef4\u5ea6\u8f6c\u5316\u4e3a(C, T, V, M)\u7684\u683c\u5f0f\u3002\n- \u5c06\u6240\u6709\u7247\u6bb5\u7ec4\u5408\u5e76\u4fdd\u5b58\u4e3a\u4e00\u4e2a\u6587\u4ef6\n\u6ce8\u610f\uff1a\u8fd9\u91cc\u7684`class_id`\u662f`int`\u7c7b\u578b\uff0c\u4e0e\u5176\u4ed6\u5206\u7c7b\u4efb\u52a1\u7c7b\u4f3c\u3002\u4f8b\u5982`0\uff1a\u6454\u5012\uff0c 1\uff1a\u5176\u4ed6`\u3002\n\u81f3\u6b64\uff0c\u6211\u4eec\u5f97\u5230\u4e86\u53ef\u7528\u7684\u8bad\u7ec3\u6570\u636e\uff08`.npy`\uff09\u548c\u5bf9\u5e94\u7684\u6807\u6ce8\u6587\u4ef6\uff08`.pkl`\uff09\u3002" + }, + { + "comment": "This code is downloading pretrained models for keypoint detection using PaddleDetection and then using them to get the keypoint coordinates for an image sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/README.md\":43-59", + "content": "#### \u793a\u4f8b\uff1a\u57fa\u4e8eUR Fall Detection Dataset\u7684\u6454\u5012\u6570\u636e\u5904\u7406\n[UR Fall Detection Dataset](http://fenix.univ.rzeszow.pl/~mkepski/ds/uf.html)\u662f\u4e00\u4e2a\u5305\u542b\u4e86\u4e0d\u540c\u6444\u50cf\u673a\u89c6\u89d2\u53ca\u4e0d\u540c\u4f20\u611f\u5668\u4e0b\u7684\u6454\u5012\u68c0\u6d4b\u6570\u636e\u96c6\u3002\u6570\u636e\u96c6\u672c\u8eab\u5e76\u4e0d\u5305\u542b\u5173\u952e\u70b9\u5750\u6807\u6807\u6ce8\uff0c\u5728\u8fd9\u91cc\u6211\u4eec\u4f7f\u7528\u5e73\u89c6\u89c6\u89d2\uff08camera 
0\uff09\u7684RGB\u56fe\u50cf\u6570\u636e\uff0c\u4ecb\u7ecd\u5982\u4f55\u4f9d\u7167\u4e0a\u9762\u5c55\u793a\u7684\u6b65\u9aa4\u5b8c\u6210\u6570\u636e\u51c6\u5907\u5de5\u4f5c\u3002\n\uff081\uff09\u4f7f\u7528[PaddleDetection\u5173\u952e\u70b9\u6a21\u578b](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/keypoint)\u5b8c\u6210\u5173\u952e\u70b9\u5750\u6807\u7684\u68c0\u6d4b\n```bash\n# current path is under root of PaddleDetection\n# Step 1: download pretrained inference models.\nwget https://bj.bcebos.com/v1/paddledet/models/pipeline/mot_ppyoloe_l_36e_pipeline.zip\nwget https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip\nunzip -d output_inference/ mot_ppyoloe_l_36e_pipeline.zip\nunzip -d output_inference/ dark_hrnet_w32_256x192.zip\n# Step 2: Get the keypoint coordinarys\n# if your data is image sequence\npython deploy/python/det_keypoint_unite_infer.py --det_model_dir=output_inference/mot_ppyoloe_l_36e_pipeline/ --keypoint_model_dir=output_inference/dark_hrnet_w32_256x192 --image_dir={your image directory path} --device=GPU --save_res=True" + }, + { + "comment": "The provided code is a command line instruction for running the PaddleVideo's PPHuman application on video data. It uses pre-trained models to detect human keypoints in the video, resulting in a `det_keypoint_unite_image_results.json` file containing the detection results. These steps are repeated for each segment of UR Fall data. The JSON files are then saved into a specific directory structure with a naming convention based on video and camera IDs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/README.md\":61-82", + "content": "# if your data is video\npython deploy/python/det_keypoint_unite_infer.py --det_model_dir=output_inference/mot_ppyoloe_l_36e_pipeline/ --keypoint_model_dir=output_inference/dark_hrnet_w32_256x192 --video_file={your video file path} --device=GPU --save_res=True\n```\n\u8fd9\u6837\u6211\u4eec\u4f1a\u5f97\u5230\u4e00\u4e2a`det_keypoint_unite_image_results.json`\u7684\u68c0\u6d4b\u7ed3\u679c\u6587\u4ef6\u3002\u5185\u5bb9\u7684\u5177\u4f53\u542b\u4e49\u8bf7\u89c1[\u8fd9\u91cc](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/python/det_keypoint_unite_infer.py#L108)\u3002\n\u8fd9\u91cc\u6211\u4eec\u9700\u8981\u5bf9UR Fall\u4e2d\u7684\u6bcf\u4e00\u6bb5\u6570\u636e\u6267\u884c\u4e0a\u9762\u4ecb\u7ecd\u7684\u6b65\u9aa4\uff0c\u5728\u6bcf\u4e00\u6bb5\u6267\u884c\u5b8c\u6210\u540e\u53ca\u65f6\u5c06\u68c0\u6d4b\u7ed3\u679c\u6587\u4ef6\u59a5\u5584\u4fdd\u5b58\u5230\u4e00\u4e2a\u6587\u4ef6\u5939\u4e2d\u3002\n```bash\nmkdir {root of PaddleVideo}/applications/PPHuman/datasets/annotations\nmv det_keypoint_unite_image_results.json {root of PaddleVideo}/applications/PPHuman/datasets/annotations/det_keypoint_unite_image_results_{video_id}_{camera_id}.json\n```\n\uff082\uff09\u5c06\u5173\u952e\u70b9\u5750\u6807\u8f6c\u5316\u4e3a\u8bad\u7ec3\u6570\u636e\n\u5728\u5b8c\u6210\u4e0a\u8ff0\u6b65\u9aa4\u540e\uff0c\u6211\u4eec\u5f97\u5230\u7684\u9aa8\u9abc\u70b9\u6570\u636e\u5f62\u5f0f\u5982\u4e0b\uff1a\n```\nannotations/\n\u251c\u2500\u2500 det_keypoint_unite_image_results_fall-01-cam0-rgb.json\n\u251c\u2500\u2500 det_keypoint_unite_image_results_fall-02-cam0-rgb.json\n\u251c\u2500\u2500 det_keypoint_unite_image_results_fall-03-cam0-rgb.json\n\u251c\u2500\u2500 det_keypoint_unite_image_results_fall-04-cam0-rgb.json" + }, + { + "comment": "Code snippet represents a list of json files in a PaddleVideo application called 
\"PPHuman\". These JSON files contain image results for different actions. The code suggests using a provided script to convert these data into training data, resulting in two new files: \"train_data.npy\" and \"train_label.pkl\". It mentions that some data preparation steps include parsing the JSON content and organizing the training data. There is a link for more comprehensive data available for download. The code also provides instructions on how to train and test the model using PaddleVideo's main script with specific configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/README.md\":83-113", + "content": " ...\n\u251c\u2500\u2500 det_keypoint_unite_image_results_fall-28-cam0-rgb.json\n\u251c\u2500\u2500 det_keypoint_unite_image_results_fall-29-cam0-rgb.json\n\u2514\u2500\u2500 det_keypoint_unite_image_results_fall-30-cam0-rgb.json\n```\n\u8fd9\u91cc\u4f7f\u7528\u6211\u4eec\u63d0\u4f9b\u7684\u811a\u672c\u76f4\u63a5\u5c06\u6570\u636e\u8f6c\u5316\u4e3a\u8bad\u7ec3\u6570\u636e, \u5f97\u5230\u6570\u636e\u6587\u4ef6`train_data.npy`, \u6807\u7b7e\u6587\u4ef6`train_label.pkl`\u3002\u8be5\u811a\u672c\u6267\u884c\u7684\u5185\u5bb9\u5305\u62ec\u89e3\u6790json\u6587\u4ef6\u5185\u5bb9\u3001\u524d\u8ff0\u6b65\u9aa4\u4e2d\u4ecb\u7ecd\u7684\u6574\u7406\u8bad\u7ec3\u6570\u636e\u53ca\u4fdd\u5b58\u6570\u636e\u6587\u4ef6\u3002\n```bash\n# current path is {root of PaddleVideo}/applications/PPHuman/datasets/\npython prepare_dataset.py\n```\n\u51e0\u70b9\u8bf4\u660e\uff1a\n- UR Fall\u7684\u52a8\u4f5c\u5927\u591a\u662f100\u5e27\u5de6\u53f3\u957f\u5ea6\u5bf9\u5e94\u4e00\u4e2a\u5b8c\u6574\u52a8\u4f5c\uff0c\u4e2a\u522b\u89c6\u9891\u5305\u542b\u4e00\u4e9b\u65e0\u5173\u52a8\u4f5c\uff0c\u53ef\u4ee5\u624b\u5de5\u53bb\u9664\uff0c\u4e5f\u53ef\u4ee5\u88c1\u526a\u4f5c\u4e3a\u8d1f\u6837\u672c\n- \u7edf\u4e00\u5c06\u6570\u636e\u6574\u7406\u4e3a100\u5e27\uff0c\u518d\u62bd\u53d6\u4e3a50\u5e27\uff0c\u4fdd\u8bc1\u52a8\u4f5c\u5b8c\u6574\u6027\n- \u4e0a\u8ff0\u5305\u542b\u6454\u5012\u7684\u52a8\u4f5c\u662f\u6b63\u6837\u672c\uff0c\u5728\u5b9e\u9645\u8bad\u7ec3\u4e2d\u4e5f\u9700\u8981\u4e00\u4e9b\u5176\u4ed6\u7684\u52a8\u4f5c\u6216\u6b63\u5e38\u7ad9\u7acb\u7b49\u4f5c\u4e3a\u8d1f\u6837\u672c\uff0c\u6b65\u9aa4\u540c\u4e0a\uff0c\u4f46\u6ce8\u610flabel\u7684\u7c7b\u578b\u53d61\u3002\n\u8fd9\u91cc\u6211\u4eec\u63d0\u4f9b\u4e86\u6211\u4eec\u5904\u7406\u597d\u7684\u66f4\u5168\u9762\u7684[\u6570\u636e](https://bj.bcebos.com/v1/paddledet/data/PPhuman/fall_data.zip)\uff0c\u5305\u62ec\u5176\u4ed6\u573a\u666f\u4e2d\u7684\u6454\u5012\u53ca\u975e\u6454\u5012\u7684\u52a8\u4f5c\u573a\u666f\u3002\n### \u8bad\u7ec3\u4e0e\u6d4b\u8bd5\n\u5728PaddleVideo\u4e2d\uff0c\u4f7f\u7528\u4ee5\u4e0b\u547d\u4ee4\u5373\u53ef\u5f00\u59cb\u8bad\u7ec3\uff1a\n```bash\n# current path is under root of PaddleVideo\npython main.py -c applications/PPHuman/configs/stgcn_pphuman.yaml\n# \u7531\u4e8e\u6574\u4e2a\u4efb\u52a1\u53ef\u80fd\u8fc7\u62df\u5408,\u5efa\u8bae\u540c\u65f6\u5f00\u542f\u9a8c\u8bc1\u4ee5\u4fdd\u5b58\u6700\u4f73\u6a21\u578b\npython main.py --validate -c applications/PPHuman/configs/stgcn_pphuman.yaml\n```\n\u5728\u8bad\u7ec3\u5b8c\u6210\u540e\uff0c\u91c7\u7528\u4ee5\u4e0b\u547d\u4ee4\u8fdb\u884c\u9884\u6d4b\uff1a\n```bash\npython main.py --test -c applications/PPHuman/configs/stgcn_pphuman.yaml -w output/STGCN/STGCN_best.pdparams" + }, + { + "comment": "The provided code demonstrates the process of exporting a model in PaddleVideo for use in PP-Human. 
It creates the necessary files and renames them according to PP-Human's requirements, resulting in a structured directory that can be used for behavior recognition inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PPHuman/README.md\":114-142", + "content": "```\n### \u5bfc\u51fa\u6a21\u578b\u63a8\u7406\n- \u5728PaddleVideo\u4e2d\uff0c\u901a\u8fc7\u4ee5\u4e0b\u547d\u4ee4\u5b9e\u73b0\u6a21\u578b\u7684\u5bfc\u51fa\uff0c\u5f97\u5230\u6a21\u578b\u7ed3\u6784\u6587\u4ef6`STGCN.pdmodel`\u548c\u6a21\u578b\u6743\u91cd\u6587\u4ef6`STGCN.pdiparams`\uff0c\u5e76\u589e\u52a0\u914d\u7f6e\u6587\u4ef6\uff1a\n```bash\n# current path is under root of PaddleVideo\npython tools/export_model.py -c applications/PPHuman/configs/stgcn_pphuman.yaml \\\n -p output/STGCN/STGCN_best.pdparams \\\n -o output_inference/STGCN\ncp applications/PPHuman/configs/infer_cfg.yml output_inference/STGCN\n# \u91cd\u547d\u540d\u6a21\u578b\u6587\u4ef6\uff0c\u9002\u914dPP-Human\u7684\u8c03\u7528\ncd output_inference/STGCN\nmv STGCN.pdiparams model.pdiparams\nmv STGCN.pdiparams.info model.pdiparams.info\nmv STGCN.pdmodel model.pdmodel\n```\n\u5b8c\u6210\u540e\u7684\u5bfc\u51fa\u6a21\u578b\u76ee\u5f55\u7ed3\u6784\u5982\u4e0b\uff1a\n```\nSTGCN\n\u251c\u2500\u2500 infer_cfg.yml\n\u251c\u2500\u2500 model.pdiparams\n\u251c\u2500\u2500 model.pdiparams.info\n\u251c\u2500\u2500 model.pdmodel\n```\n\u81f3\u6b64\uff0c\u5c31\u53ef\u4ee5\u4f7f\u7528[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)\u8fdb\u884c\u884c\u4e3a\u8bc6\u522b\u7684\u63a8\u7406\u4e86\u3002" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c2f9f009-1d3a-467b-983f-70b52a617326.json b/docs/doc/c2f9f009-1d3a-467b-983f-70b52a617326.json new file mode 100644 index 000000000..3e4c4ce88 --- /dev/null +++ b/docs/doc/c2f9f009-1d3a-467b-983f-70b52a617326.json @@ -0,0 +1,15 @@ +{ + "summary": "The code initializes an ActionDetection instance, loads a model for prediction, processes video URLs by extracting frames and audio, makes predictions on these inputs, and stores the results in a JSON file. The JSON file is written without escaping special characters.", + "details": [ + { + "comment": "The code imports necessary libraries, appends the \"action_detect\" directory to the system path, and initializes an ActionDetection instance. It then loads the model for prediction using a specified configuration file. The code reads a list of video URLs from a file, processes each video by extracting image frames and audio, and makes predictions on these inputs. 
Finally, it stores the results in a JSON file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/predict.py\":1-32", + "content": "import os\nimport sys\nimport json\nsys.path.append('action_detect')\nfrom action import ActionDetection\nif __name__ == '__main__':\n dataset_dir = \"datasets/\"\n model_predict = ActionDetection(cfg_file=\"configs_basketball/configs_basketball.yaml\")\n model_predict.load_model()\n video_url = os.path.join(dataset_dir, 'mp4.list')\n with open(video_url, 'r') as f:\n lines = f.readlines()\n lines = [os.path.join(dataset_dir, \"mp4\", os.path.basename(k.strip())) for k in lines]\n results = []\n for line in lines:\n video_name = line\n print(video_name)\n imgs_path = video_name.replace(\".mp4\", \"\").replace(\"mp4\", \"frames\")\n pcm_path = video_name.replace(\".mp4\", \".pcm\").replace(\"mp4\", \"pcm\")\n bmn_results, action_results = model_predict.infer(imgs_path, pcm_path)\n results.append({'video_name': line,\n 'bmn_results': bmn_results, \n 'action_results': action_results})\n with open('results.json', 'w', encoding='utf-8') as f:" + }, + { + "comment": "Writes JSON-formatted 'results' to file using indentation and without escaping special characters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/predict.py\":33-34", + "content": " data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data) " + } + ] +} \ No newline at end of file diff --git a/docs/doc/c3a1bf5b-7d7c-4c67-9903-6185834c09f9.json b/docs/doc/c3a1bf5b-7d7c-4c67-9903-6185834c09f9.json new file mode 100644 index 000000000..888c45d29 --- /dev/null +++ b/docs/doc/c3a1bf5b-7d7c-4c67-9903-6185834c09f9.json @@ -0,0 +1,55 @@ +{ + "summary": "The code imports modules, defines functions for data processing and splitting, handles evaluation cases, transforms joints, encodes labels, and saves the training/testing sets in suitable formats. It applies translation, alignment, and uses \"split_dataset\" function to create train/test indices before printing 'Done!'.", + "details": [ + { + "comment": "This code imports necessary modules and defines constants for file paths. 
It checks if a directory exists, creates it if not, and defines a function to remove frames with NaN values while logging such occurrences.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":0-33", + "content": "# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/seq_transformation.py\nimport os\nimport os.path as osp\nimport numpy as np\nimport pickle\nimport logging\nfrom sklearn.model_selection import train_test_split\nroot_path = './'\nstat_path = osp.join(root_path, 'statistics')\nsetup_file = osp.join(stat_path, 'setup.txt')\ncamera_file = osp.join(stat_path, 'camera.txt')\nperformer_file = osp.join(stat_path, 'performer.txt')\nreplication_file = osp.join(stat_path, 'replication.txt')\nlabel_file = osp.join(stat_path, 'label.txt')\nskes_name_file = osp.join(stat_path, 'skes_available_name.txt')\ndenoised_path = osp.join(root_path, 'denoised_data')\nraw_skes_joints_pkl = osp.join(denoised_path, 'raw_denoised_joints.pkl')\nframes_file = osp.join(denoised_path, 'frames_cnt.txt')\nsave_path = './'\nif not osp.exists(save_path):\n os.mkdir(save_path)\ndef remove_nan_frames(ske_name, ske_joints, nan_logger):\n num_frames = ske_joints.shape[0]\n valid_frames = []\n for f in range(num_frames):\n if not np.any(np.isnan(ske_joints[f])):" + }, + { + "comment": "The code defines a function \"seq_translation\" that iterates through multiple skeleton joints sequences. It checks for missing frames and calculates the origin point. It returns valid frames only if any are found, or logs nan indices otherwise. The code also handles cases with one or two bodies in the sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":34-62", + "content": " valid_frames.append(f)\n else:\n nan_indices = np.where(np.isnan(ske_joints[f]))[0]\n nan_logger.info('{}\\t{:^5}\\t{}'.format(ske_name, f + 1,\n nan_indices))\n return ske_joints[valid_frames]\ndef seq_translation(skes_joints):\n for idx, ske_joints in enumerate(skes_joints):\n num_frames = ske_joints.shape[0]\n num_bodies = 1 if ske_joints.shape[1] == 75 else 2\n if num_bodies == 2:\n missing_frames_1 = np.where(ske_joints[:, :75].sum(axis=1) == 0)[0]\n missing_frames_2 = np.where(ske_joints[:, 75:].sum(axis=1) == 0)[0]\n cnt1 = len(missing_frames_1)\n cnt2 = len(missing_frames_2)\n i = 0 # get the \"real\" first frame of actor1\n while i < num_frames:\n if np.any(ske_joints[i, :75] != 0):\n break\n i += 1\n origin = np.copy(ske_joints[i, 3:6]) # new origin: joint-2\n for f in range(num_frames):\n if num_bodies == 1:" + }, + { + "comment": "This code is performing sequence transformation for NTU RGB+D dataset. It subtracts origin from joint coordinates and handles missing frames by setting them to zero if there are only two actors. 
It also logs information about skeletons, frames, and joints using a logger.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":63-88", + "content": " ske_joints[f] -= np.tile(origin, 25)\n else: # for 2 actors\n ske_joints[f] -= np.tile(origin, 50)\n if (num_bodies == 2) and (cnt1 > 0):\n ske_joints[missing_frames_1, :75] = np.zeros((cnt1, 75),\n dtype=np.float32)\n if (num_bodies == 2) and (cnt2 > 0):\n ske_joints[missing_frames_2, 75:] = np.zeros((cnt2, 75),\n dtype=np.float32)\n skes_joints[idx] = ske_joints # Update\n return skes_joints\ndef frame_translation(skes_joints, skes_name, frames_cnt):\n nan_logger = logging.getLogger('nan_skes')\n nan_logger.setLevel(logging.INFO)\n nan_logger.addHandler(logging.FileHandler(\"./nan_frames.log\"))\n nan_logger.info('{}\\t{}\\t{}'.format('Skeleton', 'Frame', 'Joints'))\n for idx, ske_joints in enumerate(skes_joints):\n num_frames = ske_joints.shape[0]\n # Calculate the distance between spine base (joint-1) and spine (joint-21)" + }, + { + "comment": "This code aligns all sequences to the same frame length by subtracting the origin (middle of spine joint) from each skeleton joint, normalizing the resulting coordinates based on the distance between the new origin and original origin. It updates the number of valid frames for each sequence and returns the aligned skeleton joints and updated frame counts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":89-117", + "content": " j1 = ske_joints[:, 0:3]\n j21 = ske_joints[:, 60:63]\n dist = np.sqrt(((j1 - j21)**2).sum(axis=1))\n for f in range(num_frames):\n origin = ske_joints[f, 3:\n 6] # new origin: middle of the spine (joint-2)\n if (ske_joints[f, 75:] == 0).all():\n ske_joints[f, :75] = (ske_joints[f, :75] - np.tile(origin, 25)) / \\\n dist[f] + np.tile(origin, 25)\n else:\n ske_joints[f] = (ske_joints[f] - np.tile(origin, 50)) / \\\n dist[f] + np.tile(origin, 50)\n ske_name = skes_name[idx]\n ske_joints = remove_nan_frames(ske_name, ske_joints, nan_logger)\n frames_cnt[idx] = num_frames # update valid number of frames\n skes_joints[idx] = ske_joints\n return skes_joints, frames_cnt\ndef align_frames(skes_joints, frames_cnt):\n \"\"\"\n Align all sequences with the same frame length.\n \"\"\"\n num_skes = len(skes_joints)\n max_num_frames = frames_cnt.max() # 300" + }, + { + "comment": "This code is part of the PaddleVideo library and contains three functions. The first function, `seq_transformation`, takes a list of skeleton joints and transforms them into aligned positions for all frames. It handles cases where there are either one or two bodies. The second function, `one_hot_vector`, converts a list of labels into a one-hot encoded vector. 
Lastly, the third function, `split_train_val`, splits the training set into train and validation sets using a specified method (either 'sklearn' or user-defined) and ratio.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":118-149", + "content": " aligned_skes_joints = np.zeros((num_skes, max_num_frames, 150),\n dtype=np.float32)\n for idx, ske_joints in enumerate(skes_joints):\n num_frames = ske_joints.shape[0]\n num_bodies = 1 if ske_joints.shape[1] == 75 else 2\n if num_bodies == 1:\n aligned_skes_joints[idx, :num_frames] = np.hstack(\n (ske_joints, np.zeros_like(ske_joints)))\n else:\n aligned_skes_joints[idx, :num_frames] = ske_joints\n return aligned_skes_joints\ndef one_hot_vector(labels):\n num_skes = len(labels)\n labels_vector = np.zeros((num_skes, 60))\n for idx, l in enumerate(labels):\n labels_vector[idx, l] = 1\n return labels_vector\ndef split_train_val(train_indices, method='sklearn', ratio=0.05):\n \"\"\"\n Get validation set by splitting data randomly from training set with two methods.\n In fact, I thought these two methods are equal as they got the same performance.\n \"\"\"\n if method == 'sklearn':\n return train_test_split(train_indices," + }, + { + "comment": "This code defines a function to split a dataset into training and validation sets based on the input parameters. It also includes functionality for selecting the validation set from the training set using either sklearn or numpy methods, and saving labels and features (joints positions) for each sequence of each dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":150-175", + "content": " test_size=ratio,\n random_state=10000)\n else:\n np.random.seed(10000)\n np.random.shuffle(train_indices)\n val_num_skes = int(np.ceil(0.05 * len(train_indices)))\n val_indices = train_indices[:val_num_skes]\n train_indices = train_indices[val_num_skes:]\n return train_indices, val_indices\ndef split_dataset(skes_name, skes_joints, label, performer, camera, evaluation,\n save_path):\n train_indices, test_indices = get_indices(performer, camera, evaluation)\n m = 'sklearn' # 'sklearn' or 'numpy'\n # Select validation set from training set\n # train_indices, val_indices = split_train_val(train_indices, m)\n # Save labels and num_frames for each sequence of each data set\n train_labels = label[train_indices]\n test_labels = label[test_indices]\n train_x = skes_joints[train_indices]\n # train_y = one_hot_vector(train_labels)\n test_x = skes_joints[test_indices]\n # test_y = one_hot_vector(test_labels)" + }, + { + "comment": "The code is creating evaluation paths, checking if they exist, and then initializing the paths for train_data.npy, train_label.pkl, val_data.npy, and val_label.pkl files. It reshapes the train and test data and saves them using np.save() function. 
The train and test labels are also saved separately in pickle format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":177-203", + "content": " evaluation_path = osp.join(save_path, evaluation)\n isExists = osp.exists(evaluation_path)\n if not isExists:\n os.makedirs(evaluation_path)\n train_data_save_path = osp.join(evaluation_path, 'train_data.npy')\n train_label_save_path = osp.join(evaluation_path, 'train_label.pkl')\n val_data_save_path = osp.join(evaluation_path, 'val_data.npy')\n val_label_save_path = osp.join(evaluation_path, 'val_label.pkl')\n # reshape data\n N, T, VC = train_x.shape\n train_x = np.reshape(train_x, (N, T, 2, 25, 3))\n train_x = np.transpose(train_x, (0, 4, 1, 3, 2))\n N, T, VC = test_x.shape\n test_x = np.reshape(test_x, (N, T, 2, 25, 3))\n test_x = np.transpose(test_x, (0, 4, 1, 3, 2))\n # save train\n np.save(train_data_save_path, train_x)\n out = [skes_name[train_indices], train_labels]\n with open(train_label_save_path, 'wb') as f:\n pickle.dump(out, f)\n # save test\n np.save(val_data_save_path, test_x)\n out = [skes_name[test_indices], test_labels]\n with open(val_label_save_path, 'wb') as f:" + }, + { + "comment": "This function, `get_indices`, takes performer and camera as inputs and returns the indices of training and test data based on either cross-subject or cross-view evaluation. For cross-subject, it selects train/test IDs, then finds their respective indices in the performer array. Similarly, for cross-view, it selects train/test camera IDs and finds their indices. The code handles both cases and returns the training and test indices separately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":204-234", + "content": " pickle.dump(out, f)\ndef get_indices(performer, camera, evaluation='xsub'):\n test_indices = np.empty(0)\n train_indices = np.empty(0)\n if evaluation == 'xsub': # Cross Subject (Subject IDs)\n train_ids = [\n 1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 25, 27, 28, 31, 34,\n 35, 38\n ]\n test_ids = [\n 3, 6, 7, 10, 11, 12, 20, 21, 22, 23, 24, 26, 29, 30, 32, 33, 36, 37,\n 39, 40\n ]\n # Get indices of test data\n for idx in test_ids:\n temp = np.where(performer == idx)[0] # 0-based index\n test_indices = np.hstack((test_indices, temp)).astype(np.int)\n # Get indices of training data\n for train_id in train_ids:\n temp = np.where(performer == train_id)[0] # 0-based index\n train_indices = np.hstack((train_indices, temp)).astype(np.int)\n else: # Cross View (Camera IDs)\n train_ids = [2, 3]\n test_ids = 1\n # Get indices of test data\n temp = np.where(camera == test_ids)[0] # 0-based index" + }, + { + "comment": "Code reads camera, performer, label, and frames_cnt from respective files. It loads skes_name and skes_joints data from file using pickle. Applies seq_translation and align_frames to skes_joints. Creates test_indices and train_ids by filtering camera ids. 
Returns train_indices and test_indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":235-262", + "content": " test_indices = np.hstack((test_indices, temp)).astype(np.int)\n # Get indices of training data\n for train_id in train_ids:\n temp = np.where(camera == train_id)[0] # 0-based index\n train_indices = np.hstack((train_indices, temp)).astype(np.int)\n return train_indices, test_indices\nif __name__ == '__main__':\n camera = np.loadtxt(camera_file, dtype=np.int) # camera id: 1, 2, 3\n performer = np.loadtxt(performer_file, dtype=np.int) # subject id: 1~40\n label = np.loadtxt(label_file, dtype=np.int) - 1 # action label: 0~59\n frames_cnt = np.loadtxt(frames_file, dtype=np.int) # frames_cnt\n skes_name = np.loadtxt(skes_name_file, dtype=np.string_)\n with open(raw_skes_joints_pkl, 'rb') as fr:\n skes_joints = pickle.load(fr) # a list\n skes_joints = seq_translation(skes_joints)\n skes_joints = align_frames(skes_joints,\n frames_cnt) # aligned to the same frame length\n evaluations = ['xview', 'xsub']\n for evaluation in evaluations:" + }, + { + "comment": "The code is calling the \"split_dataset\" function to process data, likely dividing it into training and testing sets. The input parameters include various file names, labels, performers, cameras, evaluation criteria, and a save path. Once completed, it prints 'Done!'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/data/ntu-rgb-d/seq_transformation.py\":263-265", + "content": " split_dataset(skes_name, skes_joints, label, performer, camera,\n evaluation, save_path)\n print('Done!')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c3e8e8c6-a656-4684-ab01-2fe7572e9672.json b/docs/doc/c3e8e8c6-a656-4684-ab01-2fe7572e9672.json new file mode 100644 index 000000000..38d53fc9a --- /dev/null +++ b/docs/doc/c3e8e8c6-a656-4684-ab01-2fe7572e9672.json @@ -0,0 +1,55 @@ +{ + "summary": "This code calculates retrieval metrics, offers sorting and visualization options, handles tie-breaking efficiently, and computes ranking metrics for input data using NumPy, SciPy, and Matplotlib.", + "details": [ + { + "comment": "This code is computing retrieval metrics from a similarity matrix. It takes two tensors as inputs, sims and query_masks. The sims tensor contains NxM matrix of similarities between embeddings, where x_{i,j} = . The query_masks tensor is optional and is used to mask any missing queries from the dataset. 
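As a toy illustration of the ranking idea summarized above (not the exact T2VLAD implementation), the rank of each query's ground-truth video can be read off the sorted distance rows; the 3x3 similarity matrix below is invented and assumes one caption per video, so the ground-truth distances sit on the diagonal.

```python
import numpy as np

sims = np.array([[0.9, 0.1, 0.2],    # caption 0 vs videos 0..2
                 [0.4, 0.8, 0.3],    # caption 1
                 [0.2, 0.1, 0.7]])   # caption 2
dists = -sims                        # higher similarity -> smaller distance
sorted_dists = np.sort(dists, axis=1)
gt_dists = np.diag(dists)[:, np.newaxis]              # one caption per video
rows, cols = np.where((sorted_dists - gt_dists) == 0)
print(cols)   # [0 0 0] -> every caption ranks its own video first (R@1 = 100%)
```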
It then calculates various retrieval metrics such as average precision score, mean average precision, and other related statistics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":0-29", + "content": "# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport numbers\nimport scipy.stats\nimport numpy as np\nfrom pathlib import Path\nfrom sklearn.metrics import average_precision_score\ndef t2v_metrics(sims, query_masks=None):\n \"\"\"Compute retrieval metrics from a similiarity matrix.\n Args:\n sims (th.Tensor): N x M matrix of similarities between embeddings, where\n x_{i,j} = \n query_masks (th.Tensor): mask any missing queries from the dataset (two videos" + }, + { + "comment": "This function calculates retrieval metrics for a given similarity matrix, and it ensures the matrix has two dimensions. It sorts the distances in the matrix and provides an option to visualize it using matplotlib. The code also computes the ground truth indices for each video, given the number of queries and videos.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":30-57", + "content": " in MSRVTT only have 19, rather than 20 captions)\n Returns:\n (dict[str:float]): retrieval metrics\n \"\"\"\n assert sims.ndim == 2, \"expected a matrix\"\n num_queries, num_vids = sims.shape\n dists = -sims\n sorted_dists = np.sort(dists, axis=1)\n if False:\n import sys\n import matplotlib\n from pathlib import Path\n matplotlib.use(\"Agg\")\n import matplotlib.pyplot as plt\n sys.path.insert(0, str(Path.home() / \"coding/src/zsvision/python\"))\n from zsvision.zs_iterm import zs_dispFig # NOQA\n plt.matshow(dists)\n zs_dispFig()\n import ipdb; ipdb.set_trace()\n # The indices are computed such that they slice out the ground truth distances\n # from the psuedo-rectangular dist matrix\n queries_per_video = num_queries // num_vids\n gt_idx = [[np.ravel_multi_index([ii, jj], (num_queries, num_vids))\n for ii in range(jj * queries_per_video, (jj + 1) * queries_per_video)]\n for jj in range(num_vids)]" + }, + { + "comment": "This section is handling tie-breaking in the similarity matrix, ensuring that it evaluates correctly even when there are ties. It averages over all possible partial orderings implied by the ties for a principled approach. 
This should occur extremely rarely but can distort scores if not handled properly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":58-74", + "content": " gt_idx = np.array(gt_idx)\n gt_dists = dists.reshape(-1)[gt_idx.reshape(-1)]\n gt_dists = gt_dists[:, np.newaxis]\n rows, cols = np.where((sorted_dists - gt_dists) == 0) # find column position of GT\n # --------------------------------\n # NOTE: Breaking ties\n # --------------------------------\n # We sometimes need to break ties (in general, these should occur extremely rarely,\n # but there are pathological cases when they can distort the scores, such as when\n # the similarity matrix is all zeros). Previous implementations (e.g. the t2i\n # evaluation function used\n # here: https://github.com/niluthpol/multimodal_vtt/blob/master/evaluation.py and\n # here: https://github.com/linxd5/VSE_Pytorch/blob/master/evaluation.py#L87) generally\n # break ties \"optimistically\". However, if the similarity matrix is constant this\n # can evaluate to a perfect ranking. A principled option is to average over all\n # possible partial orderings implied by the ties. See # this paper for a discussion:" + }, + { + "comment": "This code is computing information retrieval performance measures efficiently in the presence of tied scores, following McSherry et al. (2008). It handles ties optimistically or by averaging, and checks if the number of unique rows matches the number of queries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":75-97", + "content": " # McSherry, Frank, and Marc Najork,\n # \"Computing information retrieval performance measures efficiently in the presence\n # of tied scores.\" European conference on information retrieval. Springer, Berlin, \n # Heidelberg, 2008.\n # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.145.8892&rep=rep1&type=pdf\n # break_ties = \"optimistically\"\n break_ties = \"averaging\"\n if rows.size > num_queries:\n assert np.unique(rows).size == num_queries, \"issue in metric evaluation\"\n if break_ties == \"optimistically\":\n _, idx = np.unique(rows, return_index=True)\n cols = cols[idx]\n elif break_ties == \"averaging\":\n # fast implementation, based on this code:\n # https://stackoverflow.com/a/49239335\n locs = np.argwhere((sorted_dists - gt_dists) == 0)\n # Find the split indices\n steps = np.diff(locs[:, 0])\n splits = np.nonzero(steps)[0] + 1\n splits = np.insert(splits, 0, 0)" + }, + { + "comment": "This code calculates the average rank of each query by dividing the summed ranks by their respective counts. It also provides a slower, more interpretable version for testing and asserts that the size of the calculated results matches the expected number of queries. 
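A toy sketch of the "averaging" tie-break described above, using hand-made arrays: when a query's ground-truth distance ties with several sorted positions, its rank becomes the mean of the tied column positions rather than the most optimistic one.

```python
import numpy as np

# locs[:, 0] = query row, locs[:, 1] = sorted positions where the GT distance ties
locs = np.array([[0, 0], [0, 1], [0, 2],   # query 0 is tied over ranks 0, 1, 2
                 [1, 4]])                  # query 1 has a unique rank of 4
steps = np.diff(locs[:, 0])
splits = np.insert(np.nonzero(steps)[0] + 1, 0, 0)   # start index of each query's block
summed_cols = np.add.reduceat(locs[:, 1], splits)
counts = np.diff(np.append(splits, locs.shape[0]))
print(summed_cols / counts)   # [1. 4.] -> averaged rank per query
```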
The code includes optional debugging features to verify rank averaging across ties and recover single-query scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":99-121", + "content": " # Compute the result columns\n summed_cols = np.add.reduceat(locs[:, 1], splits)\n counts = np.diff(np.append(splits, locs.shape[0]))\n avg_cols = summed_cols / counts\n if False:\n print(\"Running slower code to verify rank averaging across ties\")\n # slow, but more interpretable version, used for testing\n avg_cols_slow = [np.mean(cols[rows == idx]) for idx in range(num_queries)]\n assert np.array_equal(avg_cols, avg_cols_slow), \"slow vs fast difference\"\n print(\"passed num check\")\n cols = avg_cols\n msg = \"expected ranks to match queries ({} vs {}) \"\n if cols.size != num_queries:\n import ipdb; ipdb.set_trace()\n assert cols.size == num_queries, msg\n if False:\n # overload mask to check that we can recover the scores for single-query\n # retrieval\n print(\"DEBUGGING MODE\")\n query_masks = np.zeros_like(query_masks)\n query_masks[:, 0] = 1 # recover single query score" + }, + { + "comment": "This function computes retrieval metrics from a similarity matrix and handles invalid queries by checking if query_masks are not None, removing invalid queries, updating the number of queries, and returning the results. It also includes a sanity check against old logic for square matrices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":123-147", + "content": " if query_masks is not None:\n # remove invalid queries\n assert query_masks.size == num_queries, \"invalid query mask shape\"\n cols = cols[query_masks.reshape(-1).astype(np.bool)]\n assert cols.size == query_masks.sum(), \"masking was not applied correctly\"\n # update number of queries to account for those that were missing\n num_queries = query_masks.sum()\n if False:\n # sanity check against old logic for square matrices\n gt_dists_old = np.diag(dists)\n gt_dists_old = gt_dists_old[:, np.newaxis]\n _, cols_old = np.where((sorted_dists - gt_dists_old) == 0)\n assert np.array_equal(cols_old, cols), \"new metric doesn't match\"\n return cols2metrics(cols, num_queries)\ndef v2t_metrics(sims, query_masks=None):\n \"\"\"Compute retrieval metrics from a similiarity matrix.\n Args:\n sims (th.Tensor): N x M matrix of similarities between embeddings, where\n x_{i,j} = \n query_masks (th.Tensor): mask any missing captions from the dataset" + }, + { + "comment": "This code calculates retrieval metrics for finding the closest \"GT caption\" in embedding space. It first switches axes of text and video, then applies various operations to compute distances between queries and captions. The code handles missing values by setting them to have a distance of infinity. 
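A minimal sketch of the query-mask filtering described above (the ranks and mask values are invented): masked-out queries are dropped before metrics are computed, and the query count shrinks accordingly.

```python
import numpy as np

cols = np.array([0, 2, 5, 1])          # 0-based rank of the GT item per query
query_masks = np.array([1, 1, 0, 1])   # query 2 is missing from the dataset
valid_cols = cols[query_masks.astype(bool)]
num_queries = int(query_masks.sum())
print(valid_cols, num_queries)         # [0 2 1] 3
```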
The result is a dictionary of retrieval metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":149-179", + "content": " Returns:\n (dict[str:float]): retrieval metrics\n NOTES: We find the closest \"GT caption\" in the style of VSE, which corresponds\n to finding the rank of the closest relevant caption in embedding space:\n github.com/ryankiros/visual-semantic-embedding/blob/master/evaluation.py#L52-L56\n \"\"\"\n # switch axes of text and video\n sims = sims.T\n if False:\n # experiment with toy example\n sims = np.ones((3, 3))\n sims[0, 0] = 2\n sims[1, 1:2] = 2\n sims[2, :] = 2\n query_masks = None\n assert sims.ndim == 2, \"expected a matrix\"\n num_queries, num_caps = sims.shape\n dists = -sims\n caps_per_video = num_caps // num_queries\n break_ties = \"averaging\"\n MISSING_VAL = 1E8\n query_ranks = []\n for ii in range(num_queries):\n row_dists = dists[ii, :]\n if query_masks is not None:\n # Set missing queries to have a distance of infinity. A missing query\n # refers to a query position `n` for a video that had less than `n`" + }, + { + "comment": "The code performs ranking of captions based on distances and handles missing values. It uses distance subtraction instead of argsort for better deterministic results. The code skips rankings of missing captions, and when ties occur, it provides options to break them optimistically or by averaging.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":180-198", + "content": " # captions (for example, a few MSRVTT videos only have 19 queries)\n row_dists[np.logical_not(query_masks.reshape(-1))] = MISSING_VAL\n # NOTE: Using distance subtraction to perform the ranking is easier to make\n # deterministic than using argsort, which suffers from the issue of defining\n # \"stability\" for equal distances. Example of distance subtraction code:\n # github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/train.py\n sorted_dists = np.sort(row_dists)\n min_rank = np.inf\n for jj in range(ii * caps_per_video, (ii + 1) * caps_per_video):\n if row_dists[jj] == MISSING_VAL:\n # skip rankings of missing captions\n continue\n ranks = np.where((sorted_dists - row_dists[jj]) == 0)[0]\n if break_ties == \"optimistically\":\n rank = ranks[0]\n elif break_ties == \"averaging\":\n # NOTE: If there is more than one caption per video, its possible for the" + }, + { + "comment": "This code snippet calculates the average rank of similarities in a matrix and checks if it's lower than the minimum rank. It also includes a sanity check against an older version of the code by comparing the calculated ranks with the diagonal elements of the distance matrix and asserts that they are equal using NumPy's array_equal function. If the assertion fails, it prints a message with the number of differences and uses matplotlib to visualize the distance matrix for debugging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":199-223", + "content": " # method to do \"worse than chance\" in the degenerate case when all\n # similarities are tied. 
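A toy sketch (invented numbers) of the video-to-text idea described above: for one video, rank all captions by distance and keep the best rank among the captions that actually belong to that video.

```python
import numpy as np

row_dists = np.array([0.3, 0.1, 0.5, 0.2])   # distances from one video to 4 captions
own_captions = [2, 3]                        # hypothetical captions belonging to this video
sorted_dists = np.sort(row_dists)
ranks = [np.where(sorted_dists == row_dists[j])[0][0] for j in own_captions]
print(min(ranks))   # 1 -> caption 3 (distance 0.2) is the closest relevant caption
```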
TODO(Samuel): Address this case.\n rank = ranks.mean()\n if rank < min_rank:\n min_rank = rank\n query_ranks.append(min_rank)\n query_ranks = np.array(query_ranks)\n # sanity check against old version of code\n if False:\n sorted_dists = np.sort(dists, axis=1)\n gt_dists_old = np.diag(dists)\n gt_dists_old = gt_dists_old[:, np.newaxis]\n rows_old, cols_old = np.where((sorted_dists - gt_dists_old) == 0)\n if rows_old.size > num_queries:\n _, idx = np.unique(rows_old, return_index=True)\n cols_old = cols_old[idx]\n num_diffs = (1 - (cols_old == query_ranks)).sum()\n msg = f\"new metric doesn't match in {num_diffs} places\"\n assert np.array_equal(cols_old, query_ranks), msg\n # visualise the distance matrix\n import sys\n import matplotlib\n matplotlib.use(\"Agg\")" + }, + { + "comment": "This code is using matplotlib to display a matrix of distances and then calculates various ranking metrics such as R1, R5, R10, MedR and MeanR for the input data. The function cols2metrics takes in two parameters: 'cols', which represents the input data, and 'num_queries', representing the total number of queries. It computes these ranking metrics using numpy and scipy libraries. Finally, it returns a dictionary containing all calculated metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/metric.py\":224-242", + "content": " import matplotlib.pyplot as plt\n sys.path.insert(0, str(Path.home() / \"coding/src/zsvision/python\"))\n from zsvision.zs_iterm import zs_dispFig # NOQA\n plt.matshow(dists)\n zs_dispFig()\n return cols2metrics(query_ranks, num_queries)\ndef cols2metrics(cols, num_queries):\n metrics = {}\n metrics[\"R1\"] = 100 * float(np.sum(cols == 0)) / num_queries\n metrics[\"R5\"] = 100 * float(np.sum(cols < 5)) / num_queries\n metrics[\"R10\"] = 100 * float(np.sum(cols < 10)) / num_queries\n metrics[\"R50\"] = 100 * float(np.sum(cols < 50)) / num_queries\n metrics[\"MedR\"] = np.median(cols) + 1\n metrics[\"MeanR\"] = np.mean(cols) + 1\n stats = [metrics[x] for x in (\"R1\", \"R5\", \"R10\")]\n metrics[\"geometric_mean_R1-R5-R10\"] = scipy.stats.mstats.gmean(stats)\n return metrics" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c415ca52-7b9b-4718-a2eb-06307b263e3a.json b/docs/doc/c415ca52-7b9b-4718-a2eb-06307b263e3a.json new file mode 100644 index 000000000..56844d522 --- /dev/null +++ b/docs/doc/c415ca52-7b9b-4718-a2eb-06307b263e3a.json @@ -0,0 +1,60 @@ +{ + "summary": "The code introduces a trainer class for PaddleVideo's T2VLAD application, managing features like multi-epoch training, monitoring performance metrics and model saving during training. It also manages model checkpoints to prevent storage overload.", + "details": [ + { + "comment": "This code defines a base class for all trainers. It takes in parameters such as model, loss function, metrics to track, optimizer, and configuration. It also initializes the logger and sets up the necessary components for training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":0-32", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
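A self-contained re-creation (with an invented rank list) of the cols2metrics computation summarized above: recall at several cutoffs from 0-based ranks, median and mean rank shifted to 1-based, and a geometric mean of R1/R5/R10.

```python
import numpy as np
import scipy.stats

ranks = np.array([0, 0, 4, 12, 60])   # 0-based rank of the GT item for each query
n = len(ranks)
metrics = {
    "R1": 100.0 * np.sum(ranks == 0) / n,
    "R5": 100.0 * np.sum(ranks < 5) / n,
    "R10": 100.0 * np.sum(ranks < 10) / n,
    "MedR": np.median(ranks) + 1,     # +1 converts to 1-based ranks
    "MeanR": np.mean(ranks) + 1,
}
metrics["geometric_mean_R1-R5-R10"] = scipy.stats.mstats.gmean(
    [metrics["R1"], metrics["R5"], metrics["R10"]])
print(metrics)   # R1=40.0, R5=60.0, R10=60.0, MedR=5.0, MeanR=16.2, ...
```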
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport re\nimport copy\nimport time\nimport paddle\nimport pickle\nimport numpy as np\nfrom pathlib import Path\nfrom abc import abstractmethod\nclass BaseTrainer:\n \"\"\" Base class for all trainers\n \"\"\"\n def __init__(self, model, loss, metrics, optimizer, config, mini_train,\n num_keep_ckpts, skip_tboard):\n self.config = config\n self.logger = config.get_logger(\n 'trainer', config['trainer']['verbosity'])\n self.model = model\n self.loss = loss\n self.metrics = metrics" + }, + { + "comment": "This code is initializing the base trainer object with parameters from a configuration file. It sets optimizer, number of checkpoints to keep, whether to skip TensorBoard logging or not, and overridable properties like skipping the first N saves. It also assigns epochs, save period, monitor mode for model performance evaluation, best score to compare against, starts training from epoch 1, and sets the model directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":33-59", + "content": " self.optimizer = optimizer\n self.num_keep_ckpts = num_keep_ckpts\n self.skip_tboard = skip_tboard or mini_train\n # This property can be overriden in the subclass\n self.skip_first_n_saves = 0\n cfg_trainer = config['trainer']\n self.epochs = cfg_trainer['epochs']\n self.save_period = cfg_trainer['save_period']\n self.monitor = cfg_trainer.get('monitor', 'off')\n self.save_only_best = cfg_trainer.get(\"save_only_best\", True)\n self.val_freq = cfg_trainer['val_freq']\n # configuration to monitor model performance and save best\n if self.monitor == 'off':\n self.mnt_mode = 'off'\n self.mnt_best = 0\n else:\n self.mnt_mode, self.mnt_metric = self.monitor.split()\n assert self.mnt_mode in ['min', 'max']\n self.mnt_best = np.inf if self.mnt_mode == 'min' else -np.inf\n self.early_stop = cfg_trainer.get('early_stop', np.inf)\n self.start_epoch = 1\n self.model_dir = config.save_dir" + }, + { + "comment": "This code defines a base trainer class for PaddleVideo's T2VLAD application. It includes methods to train for multiple epochs, handle resume from a saved state, and log training metrics. The trainer iterates over each epoch and calls the _train_epoch method to perform training logic. If validation frequency is set, it logs results at specified epochs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":61-88", + "content": " self.include_optim_in_save_model = config[\"trainer\"].get(\"include_optim_in_save_model\", 1)\n if config.resume is not None:\n self._resume_model(config.resume)\n @abstractmethod\n def _train_epoch(self, epoch):\n \"\"\"Training logic for an epoch\n :param epoch: Current epoch number\n \"\"\"\n raise NotImplementedError\n def train(self):\n \"\"\"Full training logic. 
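A hedged sketch of the monitor-string parsing described above; the config dict and metric name are stand-ins, not the project's real configuration schema.

```python
import numpy as np

cfg_trainer = {"monitor": "max val_R1", "early_stop": 10}   # hypothetical values
monitor = cfg_trainer.get("monitor", "off")
if monitor == "off":
    mnt_mode, mnt_best = "off", 0
else:
    mnt_mode, mnt_metric = monitor.split()
    assert mnt_mode in ("min", "max")
    mnt_best = np.inf if mnt_mode == "min" else -np.inf   # worst possible starting point
print(mnt_mode, mnt_metric, mnt_best)   # max val_R1 -inf
```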
Responsible for iterating over epochs, early stopping,\n modeling and logging metrics.\n \"\"\"\n for epoch in range(self.start_epoch, self.epochs + 1):\n result, cached_preds = self._train_epoch(epoch)\n if epoch % self.val_freq != 0:\n continue\n # save logged informations into log dict\n log = {'epoch': epoch}\n for key, value in result.items():\n if key == 'metrics':\n log.update({mtr.__name__: value[i]\n for i, mtr in enumerate(self.metrics)})\n elif key == 'val_metrics':" + }, + { + "comment": "The code updates the log with metrics values, handles nested metrics, prints logged information to the screen, and checks if the metric improved for monitoring mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":89-109", + "content": " log.update({'val_' + mtr.__name__: value[i]\n for i, mtr in enumerate(self.metrics)})\n elif key == 'nested_val_metrics':\n # NOTE: currently only supports two layers of nesting\n for subkey, subval in value.items():\n for subsubkey, subsubval in subval.items():\n log[f\"val_{subkey}_{subsubkey}\"] = subsubval\n else:\n log[key] = value\n # print logged informations to the screen\n for key, value in log.items():\n self.logger.info(' {:15s}: {}'.format(str(key), value))\n # eval model according to configured metric, save best # ckpt as trained_model\n not_improved_count = 0\n best = False\n if self.mnt_mode != 'off':\n try:\n # check whether specified metric improved or not, according to\n # specified metric(mnt_metric)" + }, + { + "comment": "This code checks if the performance metric (mnt_metric) has improved and updates the best value accordingly. If the metric is not found, it disables performance monitoring and sets improved to False. It also raises a ValueError asking the user to choose a relevant metric.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":110-127", + "content": " lower = log[self.mnt_metric] <= self.mnt_best\n higher = log[self.mnt_metric] >= self.mnt_best\n improved = (self.mnt_mode == 'min' and lower) or \\\n (self.mnt_mode == 'max' and higher)\n except KeyError:\n msg = \"Warning: Metric '{}' not found, perf monitoring is disabled.\"\n self.logger.warning(msg.format(self.mnt_metric))\n self.mnt_mode = 'off'\n improved = False\n not_improved_count = 0\n raise ValueError(\"Pick a metric that will save models!!!!!!!!\")\n if improved:\n self.mnt_best = log[self.mnt_metric]\n # TODO(Samuel): refactor the code so that we don't move the model\n # off the GPU or duplicate on the GPU (we should be able to safely\n # copy the state dict directly to CPU)\n copy_model = copy.deepcopy(self.model)" + }, + { + "comment": "This code snippet is responsible for early stopping and saving the best model. If validation performance does not improve after a certain number of epochs (early_stop), training stops. The best model is saved if save_only_best is True and only at the end of the epochs. Otherwise, any model that outperforms the current best metric will be saved intermittently.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":128-150", + "content": " self.best_model = {\"epoch\": epoch, \"model\": copy_model}\n not_improved_count = 0\n best = True\n else:\n not_improved_count += 1\n if not_improved_count > self.early_stop:\n self.logger.info(\"Val performance didn\\'t improve for {} epochs. 
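A toy sketch of the improvement check and early-stopping bookkeeping described above; the metric values are invented and the monitoring mode is assumed to be 'max'.

```python
def improved(value, best, mode="max"):
    # "min" mode treats lower values as better, "max" mode treats higher as better
    return value <= best if mode == "min" else value >= best

best, not_improved_count, early_stop = float("-inf"), 0, 2
for epoch, r1 in enumerate([10.0, 12.0, 11.0, 11.5, 11.0, 10.5], start=1):
    if improved(r1, best):
        best, not_improved_count = r1, 0
    else:
        not_improved_count += 1
    if not_improved_count > early_stop:
        print(f"Val performance didn't improve for {early_stop} epochs, stopping at epoch {epoch}")
        break   # triggers at epoch 5 with these toy numbers
```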
\"\n \"Training stops.\".format(self.early_stop))\n break\n if self.save_only_best:\n if epoch == self.epochs:\n best_model = self.best_model\n self.model = best_model[\"model\"]\n print(f\"saving the best model to disk (epoch {epoch})\")\n self._save_model(best_model[\"epoch\"], save_best=True)\n continue\n # If modeling is done intermittently, still save models that outperform\n # the best metric\n # save_best = best and not self.mnt_metric == \"epoch\"\n save_best = True" + }, + { + "comment": "This code snippet is used to control the frequency and conditions of model saving during training. It checks if the current epoch is less than a specified number (`self.skip_first_n_saves`) and if `self.save_only_best` is set to False. If either condition is true, it skips saving the model at that epoch. If both conditions are false or the first condition is false but the second one is true, it saves the model every `self.save_period` epochs when `save_best` is set to True. Additionally, if this epoch's save is considered the best (`best` is True), it logs all predictions for each key in `cached_preds`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":152-169", + "content": " # Due to the fast runtime/slow HDD combination, modeling can dominate\n # the total training time, so we optionally skip models for some of\n # the first epochs\n if epoch < self.skip_first_n_saves and not self.save_only_best:\n msg = f\"Skipping model save at epoch {epoch} <= {self.skip_first_n_saves}\"\n self.logger.info(msg)\n continue\n if epoch % self.save_period == 0 and save_best:\n self._save_model(epoch, save_best=best)\n print(\"This epoch, the save best :{}\".format(best))\n if best:\n for key, cached in cached_preds.items():\n log_dir = Path(self.config.log_dir)\n prediction_path = log_dir / f\"{key}_preds.txt\"\n prediction_logits_path = log_dir / f\"{key}_preds_logits.npy\"\n np.save(prediction_logits_path, cached[\"preds\"])\n gt_logits_path = log_dir / f\"{key}_gt_logits.npy\"" + }, + { + "comment": "Saves the ground-truth labels and predicted classes for each video, writing them to disk in a specified format. It also saves the video names associated with these predictions and logs a message when all preds have been saved.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":170-185", + "content": " np.save(gt_logits_path, cached[\"labels\"].cpu().numpy())\n vid_names = []\n sort_predict = np.argsort(cached[\"preds\"])[:, ::-1]\n with open(str(prediction_path), 'w') as f:\n for kk in range(cached[\"preds\"].shape[0]):\n pred_classes = [str(v) for v in sort_predict[kk, :]]\n vid_name = cached[\"vid_name\"][kk]\n if key == \"test\":\n vid_name = vid_name[kk].split('/')[-1] + '.mp4'\n row = f\"{vid_name} {' '.join(pred_classes)}\"\n print(row, file=f)\n vid_names.append(vid_name)\n save_name_path = log_dir / f\"{key}_vid_name.pkl\"\n with open(save_name_path, 'wb') as f:\n pickle.dump(vid_names, f)\n self.logger.info(f\"All {key} preds saved\")" + }, + { + "comment": "This code is responsible for managing the storage of model checkpoints and purging old or unnecessary models. It keeps track of the number of models to keep (`num_keep_ckpts`) and removes older ones if necessary. 
The `purge_stale_models()` function checks if all the checkpoints follow the expected format, then purges the oldest models by removing them from storage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":186-209", + "content": " self.logger.info(f\"Wrote result to: {str(prediction_path)}\")\n if epoch > self.num_keep_ckpts:\n self.purge_stale_models()\n def purge_stale_models(self):\n \"\"\"Remove models that are no longer neededself.\n NOTE: This function assumes that the `best` model has already been renamed\n to have a format that differs from `model-epoch.pth`\n \"\"\"\n all_ckpts = list(self.model_dir.glob(\"*.pdparams\"))\n found_epoch_ckpts = list(self.model_dir.glob(\"model-epoch*.pdparams\"))\n if len(all_ckpts) <= self.num_keep_ckpts:\n return\n msg = \"Expected at the best model to have been renamed to a different format\"\n if not len(all_ckpts) > len(found_epoch_ckpts):\n print(\"Warning, purging model, but the best epoch was not saved!\")\n # assert len(all_ckpts) > len(found_epoch_ckpts), msg\n # purge the oldest models\n regex = r\".*model-epoch(\\d+)[.pdparams$\"\n epochs = [int(re.search(regex, str(x)).groups()[0]) for x in found_epoch_ckpts]" + }, + { + "comment": "This code snippet is responsible for saving and removing stale models during the training process. It saves model checkpoints at each epoch, keeps a specified number of the most recent ones, and deletes older checkpoints. The _save_model function saves the model state along with its architecture, current epoch, optimizer state if included, and configuration details into a .pdparams file in the specified directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":210-237", + "content": " sorted_ckpts = sorted(list(zip(epochs, found_epoch_ckpts)), key=lambda x: -x[0])\n for epoch, stale_ckpt in sorted_ckpts[self.num_keep_ckpts:]:\n tic = time.time()\n stale_ckpt.unlink()\n msg = f\"removing stale model [epoch {epoch}] [took {time.time() - tic:.2f}s]\"\n self.logger.info(msg)\n def _save_model(self, epoch, save_best=False):\n \"\"\"Saving models\n :param epoch: current epoch number\n :param log: logging information of the epoch\n :param save_best: if True, rename the saved model to 'trained_model.pdparams'\n \"\"\"\n arch = type(self.model).__name__\n state = {\n 'arch': arch,\n 'epoch': epoch,\n 'state_dict': self.model.state_dict(),\n 'monitor_best': self.mnt_best,\n 'config': self.config\n }\n if self.include_optim_in_save_model:\n state[\"optimizer\"] = self.optimizer.state_dict()\n filename = str(self.model_dir /\n 'model-epoch{}.pdparams'.format(epoch))" + }, + { + "comment": "Saves model with optional best model update after training completion. 
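A minimal sketch of the checkpoint layout described above, assuming PaddlePaddle is installed; the Linear layer, optimizer, and file name are stand-ins for the trainer's real model, optimizer, and model_dir.

```python
import paddle

model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())
state = {
    "arch": type(model).__name__,
    "epoch": 3,
    "state_dict": model.state_dict(),
    "monitor_best": 42.0,
    "optimizer": optimizer.state_dict(),   # optional, mirroring include_optim_in_save_model
}
paddle.save(state, "model-epoch3.pdparams")

loaded = paddle.load("model-epoch3.pdparams")
model.set_state_dict(loaded["state_dict"])     # resume the weights
print(loaded["arch"], loaded["epoch"])         # Linear 3
```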
Allows resuming training from a previously saved state.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_trainer.py\":238-257", + "content": " tic = time.time()\n self.logger.info(\"Saving model: {} ...\".format(filename))\n paddle.save(state, filename)\n self.logger.info(f\"Done in {time.time() - tic:.3f}s\")\n if save_best:\n self.logger.info(\"Updating 'best' model: {} ...\".format(filename))\n best_path = str(self.model_dir / 'trained_model.pdparams')\n paddle.save(state, best_path)\n self.logger.info(f\"Done in {time.time() - tic:.3f}s\")\n def _resume_model(self, resume_path):\n \"\"\" Resume from saved models\n :param resume_path: model path to be resumed\n \"\"\"\n resume_path = str(resume_path)\n self.logger.info(\"Loading model: {} ...\".format(resume_path))\n model = paddle.load(resume_path)\n self.model.load_dict(model)\n self.logger.info(f\"model loaded. Resume training from epoch {self.start_epoch}\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c475c2a1-a6b7-4c12-9c4b-6110396a61c1.json b/docs/doc/c475c2a1-a6b7-4c12-9c4b-6110396a61c1.json new file mode 100644 index 000000000..35f76fc3f --- /dev/null +++ b/docs/doc/c475c2a1-a6b7-4c12-9c4b-6110396a61c1.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet appears to be a list of URLs pointing to various MP4 video files. The file names are hashed strings, indicating that the videos may have been previously used for storage or identification purposes.", + "details": [ + { + "comment": "This code snippet appears to be a list of URLs pointing to various MP4 video files. The file names are hashed strings, indicating that the videos may have been previously used for storage or identification purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/url_val.list\":0-4", + "content": "mp4/5572686cb90f440988ded956a60e555d.mp4\nmp4/f6e64ee9b13a4088b24c45c257894c1e.mp4\nmp4/259856b769044b4d8dc94076deb356bf.mp4\nmp4/1f0a0698e38d493988fe42a50f7e8723.mp4\nmp4/8cfb4e605af44055b1576c37eb0e3209.mp4" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c4a979be-ebca-4aaa-80f4-ae5cf72a419b.json b/docs/doc/c4a979be-ebca-4aaa-80f4-ae5cf72a419b.json new file mode 100644 index 000000000..23edcbee1 --- /dev/null +++ b/docs/doc/c4a979be-ebca-4aaa-80f4-ae5cf72a419b.json @@ -0,0 +1,10 @@ +{ + "summary": "This code defines an abstract base class `BaseMetric` for metrics in PaddleVideo's EIVideo application. It initializes the metric object with data size, batch size, and world size from distributed environment. The abstract methods `update()` and `accumulate()` must be implemented by subclasses.", + "details": [ + { + "comment": "This code defines an abstract base class `BaseMetric` for metrics in PaddleVideo's EIVideo application. It initializes the metric object with data size, batch size, and world size from distributed environment. The abstract methods `update()` and `accumulate()` must be implemented by subclasses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
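A self-contained sketch (not the EIVideo class itself) of how such an abstract metric base is typically subclassed: update() accumulates per-batch results and accumulate() reports the final value.

```python
from abc import ABC, abstractmethod

class BaseMetricSketch(ABC):
    def __init__(self, data_size, batch_size, log_interval=1):
        self.data_size = data_size
        self.batch_size = batch_size
        self.log_interval = log_interval

    @abstractmethod
    def update(self, correct, total): ...

    @abstractmethod
    def accumulate(self): ...

class AccuracyMetric(BaseMetricSketch):
    def __init__(self, data_size, batch_size):
        super().__init__(data_size, batch_size)
        self.correct, self.total = 0, 0

    def update(self, correct, total):
        self.correct += correct
        self.total += total

    def accumulate(self):
        return self.correct / max(self.total, 1)

m = AccuracyMetric(data_size=100, batch_size=8)
m.update(6, 8)
m.update(7, 8)
print(m.accumulate())   # 0.8125
```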
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom EIVideo.paddlevideo.utils import get_dist_info\nclass BaseMetric(object):\n def __init__(self, data_size, batch_size, log_interval=1, **kwargs):\n self.data_size = data_size\n self.batch_size = batch_size\n _, self.world_size = get_dist_info()\n self.log_interval = log_interval\n @abstractmethod\n def update(self):\n raise NotImplemented\n @abstractmethod\n def accumulate(self):\n raise NotImplemented" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c50f6cb2-5742-447d-895d-a6e75865037d.json b/docs/doc/c50f6cb2-5742-447d-895d-a6e75865037d.json new file mode 100644 index 000000000..bc1732673 --- /dev/null +++ b/docs/doc/c50f6cb2-5742-447d-895d-a6e75865037d.json @@ -0,0 +1,65 @@ +{ + "summary": "The code introduces a \"KineticsReader\" class to efficiently read Kinetics dataset in mp4 and pkl formats, applying data augmentation for image/video classification tasks. It generates images for multi-threaded processing, and selects frames based on parameters for training or testing mode.", + "details": [ + { + "comment": "This code is from the PaddleVideo library's VideoTag application, specifically the kinetics_reader.py file. It imports necessary modules, defines a VideoRecord class to describe frames information of videos, and includes license and version details. The code seems to be part of a video processing framework for machine learning tasks, potentially in image or video classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":0-40", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport sys\nimport cv2\nimport math\nimport random\nimport functools\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport paddle\nfrom PIL import Image, ImageEnhance\nimport logging\nfrom .reader_utils import DataReader\nlogger = logging.getLogger(__name__)\npython_ver = sys.version_info\nclass VideoRecord(object):\n '''\n define a class method which used to describe the frames information of videos" + }, + { + "comment": "This code defines a class \"KineticsReader\" for reading the Kinetics dataset in two formats: mp4 and pkl. It initializes with a row of data containing the frames' path, number of frames, and label. The class has properties for accessing these data elements. 
The code also specifies dataset configuration options.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":41-78", + "content": " 1. self._data[0] is the frames' path\n 2. self._data[1] is the number of frames\n 3. self._data[2] is the label of frames\n '''\n def __init__(self, row):\n self._data = row\n @property\n def path(self):\n return self._data[0]\n @property\n def num_frames(self):\n return int(self._data[1])\n @property\n def label(self):\n return int(self._data[2])\nclass KineticsReader(DataReader):\n \"\"\"\n Data reader for kinetics dataset of two format mp4 and pkl.\n 1. mp4, the original format of kinetics400\n 2. pkl, the mp4 was decoded previously and stored as pkl\n In both case, load the data, and then get the frame data in the form of numpy and label as an integer.\n dataset cfg: format\n num_classes\n seg_num\n short_size\n target_size\n num_reader_threads\n buf_size\n image_mean\n image_std\n batch_size\n list\n \"\"\"" + }, + { + "comment": "This code initializes an object of the KineticsReader class, which takes in parameters like name, mode, and configuration (cfg). It retrieves various attributes from the configuration, such as number of classes, segmentation information, image sizes, reader threads, buffer size, and random seed. It also sets the mean and standard deviation values for image normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":79-97", + "content": " def __init__(self, name, mode, cfg):\n super(KineticsReader, self).__init__(name, mode, cfg)\n self.format = cfg.MODEL.format\n self.num_classes = self.get_config_from_sec('model', 'num_classes')\n self.seg_num = self.get_config_from_sec('model', 'seg_num')\n self.seglen = self.get_config_from_sec('model', 'seglen')\n self.seg_num = self.get_config_from_sec(mode, 'seg_num', self.seg_num)\n self.short_size = self.get_config_from_sec(mode, 'short_size')\n self.target_size = self.get_config_from_sec(mode, 'target_size')\n self.num_reader_threads = self.get_config_from_sec(\n mode, 'num_reader_threads')\n self.buf_size = self.get_config_from_sec(mode, 'buf_size')\n self.fix_random_seed = self.get_config_from_sec(mode, 'fix_random_seed')\n self.img_mean = np.array(cfg.MODEL.image_mean).reshape(\n [3, 1, 1]).astype(np.float32)\n self.img_std = np.array(cfg.MODEL.image_std).reshape([3, 1, 1]).astype(\n np.float32)" + }, + { + "comment": "This code sets the batch size and file list for a video reader. 
It also ensures random seeds are set, limits the number of reader threads to 1 if fixing random seed, asserts that the filelist exists, creates a video reader object using a provided creator function, and defines a batch_reader generator function to iterate over the reader's output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":98-120", + "content": " # set batch size and file list\n self.batch_size = cfg[mode.upper()]['batch_size']\n self.filelist = cfg[mode.upper()]['filelist']\n if self.fix_random_seed:\n random.seed(0)\n np.random.seed(0)\n self.num_reader_threads = 1\n def create_reader(self):\n assert os.path.exists(self.filelist), \\\n '{} not exist, please check the data list'.format(self.filelist)\n _reader = self._reader_creator(self.filelist, self.mode, seg_num=self.seg_num, seglen = self.seglen, \\\n short_size = self.short_size, target_size = self.target_size, \\\n img_mean = self.img_mean, img_std = self.img_std, \\\n shuffle = (self.mode == 'train'), \\\n num_threads = self.num_reader_threads, \\\n buf_size = self.buf_size, format = self.format)\n def _batch_reader():\n batch_out = []\n for imgs, label in _reader():\n if imgs is None:" + }, + { + "comment": "This code defines a function `_reader_creator` that takes in various parameters and returns another function `decode_mp4`. The returned function reads video frames from MP4 files, extracts labels if necessary, and yields batches of images and labels based on batch size and other specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":121-150", + "content": " continue\n batch_out.append((imgs, label))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n return _batch_reader\n def _reader_creator(self,\n file_list,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n shuffle=False,\n num_threads=1,\n buf_size=1024,\n format='frames'):\n def decode_mp4(sample, mode, seg_num, seglen, short_size, target_size,\n img_mean, img_std):\n sample = sample[0].split(' ')\n mp4_path = sample[0]\n if mode == \"infer\":\n label = mp4_path.split('/')[-1]\n else:\n label = int(sample[1])\n try:" + }, + { + "comment": "This code is defining two functions: `kinetics_reader` and `decode_frames`. The `kinetics_reader` function loads frames from a given MP4 file using `mp4_loader`, applies transformations if necessary, and returns the frames along with their corresponding labels. It also logs an error if the number of frames is less than 1. If an exception occurs during the process, it logs an error message as well. The `decode_frames` function loads frames from a specified directory (specified by the `recode` object) using the `frames_loader` function and returns the frames along with their labels. 
If the number of frames is less than 1, it logs an error; if an exception occurs, it also logs an error.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":151-175", + "content": " imgs = mp4_loader(mp4_path, seg_num, seglen, mode)\n if len(imgs) < 1:\n logger.error('{} frame length {} less than 1.'.format(\n mp4_path, len(imgs)))\n return None, None\n except:\n logger.error('Error when loading {}'.format(mp4_path))\n return None, None\n return imgs_transform(imgs, mode, seg_num, seglen, \\\n short_size, target_size, img_mean, img_std, name = self.name), label\n def decode_frames(sample, mode, seg_num, seglen, short_size,\n target_size, img_mean, img_std):\n recode = VideoRecord(sample[0].split(' '))\n frames_dir_path = recode.path\n if mode == \"infer\":\n label = frames_dir_path\n else:\n label = recode.label\n try:\n imgs = frames_loader(recode, seg_num, seglen, mode)\n if len(imgs) < 1:\n logger.error('{} frame length {} less than 1.'.format(" + }, + { + "comment": "The code snippet is responsible for loading video frames from a specified directory and handling any errors that may occur during the process. It takes the frames directory path, image format (frames or video), segment number, sequence length, short size, target size, image mean, image standard deviation, and name as input parameters. The code also defines a function reader() to read the file list and shuffle its lines if necessary. Based on the specified format (frames or video), it calls the appropriate decoding function (decode_frames or decode_video).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":176-203", + "content": " frames_dir_path, len(imgs)))\n return None, None\n except:\n logger.error('Error when loading {}'.format(frames_dir_path))\n return None, None\n return imgs_transform(imgs,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n name=self.name), label\n def reader_():\n with open(file_list) as flist:\n lines = [line.strip() for line in flist]\n if shuffle:\n random.shuffle(lines)\n for line in lines:\n file_path = line.strip()\n yield [file_path]\n if format == 'frames':\n decode_func = decode_frames\n elif format == 'video':" + }, + { + "comment": "This code selects a specific video format decoder function based on the input format. If the format is not recognized, it raises an error. It then applies transformations to the images using the selected function and returns them with additional functionality for efficient processing with multiple threads.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":204-232", + "content": " decode_func = decode_mp4\n else:\n raise (\"Not implemented format {}\".format(format))\n mapper = functools.partial(decode_func,\n mode=mode,\n seg_num=seg_num,\n seglen=seglen,\n short_size=short_size,\n target_size=target_size,\n img_mean=img_mean,\n img_std=img_std)\n return paddle.reader.decorator.xmap_readers(mapper,\n reader_,\n num_threads,\n buf_size,\n order=True)\ndef imgs_transform(imgs,\n mode,\n seg_num,\n seglen,\n short_size,\n target_size,\n img_mean,\n img_std,\n name=''):" + }, + { + "comment": "This code reads images from a dataset and performs data augmentation by cropping, flipping, and normalization. It also checks if the image dimensions are larger than the target crop size before applying the crop operation. 
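A sketch of the center/random crop described above, applied to a dummy clip shaped (frames, height, width, channels) filled with random values instead of real video frames.

```python
import random
import numpy as np

def group_crop(np_imgs, target_size, is_center=True):
    d, h, w, c = np_imgs.shape
    if is_center:
        h_off = int(round((h - target_size) / 2.0))
        w_off = int(round((w - target_size) / 2.0))
    else:
        h_off = random.randint(0, h - target_size)
        w_off = random.randint(0, w - target_size)
    return np_imgs[:, h_off:h_off + target_size, w_off:w_off + target_size, :]

clip = np.random.rand(8, 256, 340, 3).astype(np.float32)
print(group_crop(clip, 224).shape)                    # (8, 224, 224, 3) center crop
print(group_crop(clip, 224, is_center=False).shape)   # (8, 224, 224, 3) random crop
```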
If in 'train' mode, it randomly crops the image. Otherwise, it centers the crop. The resulting images are then normalized by subtracting the mean pixel values and dividing by standard deviation for feature extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":233-265", + "content": " imgs = group_scale(imgs, short_size)\n np_imgs = np.array([np.array(img).astype('float32') for img in imgs]) #dhwc\n if mode == 'train':\n np_imgs = group_crop(np_imgs, target_size)\n np_imgs = group_random_flip(np_imgs)\n else:\n np_imgs = group_crop(np_imgs, target_size, is_center=True)\n np_imgs = np_imgs.transpose(0, 3, 1, 2) / 255 #dchw\n np_imgs -= img_mean\n np_imgs /= img_std\n return np_imgs\ndef group_crop(np_imgs, target_size, is_center=True):\n d, h, w, c = np_imgs.shape\n th, tw = target_size, target_size\n assert (w >= target_size) and (h >= target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(w, h, target_size)\n if is_center:\n h_off = int(round((h - th) / 2.))\n w_off = int(round((w - tw) / 2.))\n else:\n w_off = random.randint(0, w - tw)\n h_off = random.randint(0, h - th)\n img_crop = np_imgs[:, h_off:h_off + target_size,\n w_off:w_off + target_size, :]\n return img_crop" + }, + { + "comment": "The code defines three functions: \"group_random_flip\" flips the image horizontally with 50% probability, \"group_scale\" resizes images to a specified target size while maintaining aspect ratio, and \"mp4_loader\" loads frames from a video file for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":268-304", + "content": "def group_random_flip(np_imgs):\n prob = random.random()\n if prob < 0.5:\n ret = np_imgs[:, :, ::-1, :]\n return ret\n else:\n return np_imgs\ndef group_scale(imgs, target_size):\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n w, h = img.size\n if (w <= h and w == target_size) or (h <= w and h == target_size):\n resized_imgs.append(img)\n continue\n if w < h:\n ow = target_size\n oh = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n else:\n oh = target_size\n ow = int(target_size * 4.0 / 3.0)\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n return resized_imgs\ndef mp4_loader(filepath, nsample, seglen, mode):\n cap = cv2.VideoCapture(filepath)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n sampledFrames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty" + }, + { + "comment": "This code reads video frames and selects a subset of them based on the provided parameters. It appends each frame in the specified sequence to sampledFrames, calculates average duration, then extracts the required number of frames with a given segment length from the list. 
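A toy sketch of the segment-based sampling just described: split the clip into nsample equal chunks and take one window of seglen frames per chunk, with a random offset in training mode and a centred offset otherwise.

```python
import random

def sample_indices(num_frames, nsample, seglen, train=True):
    average_dur = num_frames // nsample
    indices = []
    for i in range(nsample):
        if average_dur >= seglen:
            offset = random.randint(0, average_dur - seglen) if train \
                else (average_dur - 1) // 2
            idx = i * average_dur + offset
        elif average_dur >= 1:
            idx = i * average_dur
        else:
            idx = i
        indices.extend(j % num_frames for j in range(idx, idx + seglen))
    return indices

print(sample_indices(num_frames=300, nsample=8, seglen=1, train=False))
# [18, 55, 92, 129, 166, 203, 240, 277]
```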
The extracted frames are returned at the end.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":305-339", + "content": " if ret == False:\n continue\n img = frame[:, :, ::-1]\n sampledFrames.append(img)\n average_dur = int(len(sampledFrames) / nsample)\n imgs = []\n for i in range(nsample):\n idx = 0\n if mode == 'train':\n if average_dur >= seglen:\n idx = random.randint(0, average_dur - seglen)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n else:\n if average_dur >= seglen:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + seglen):\n imgbuf = sampledFrames[int(jj % len(sampledFrames))]\n img = Image.fromarray(imgbuf, mode='RGB')\n imgs.append(img)\n return imgs\ndef frames_loader(recode, nsample, seglen, mode):\n imgpath, num_frames = recode.path, recode.num_frames" + }, + { + "comment": "This code calculates the average duration of video frames and then generates a set of images by randomly selecting start points based on the mode (train or test) and segment length. It opens each image file in RGB format, converts it, and adds it to the list of images returned at the end.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/kinetics_reader.py\":340-366", + "content": " average_dur = int(num_frames / nsample)\n imgs = []\n for i in range(nsample):\n idx = 0\n if mode == 'train':\n if average_dur >= seglen:\n idx = random.randint(0, average_dur - seglen)\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n else:\n if average_dur >= seglen:\n idx = (average_dur - 1) // 2\n idx += i * average_dur\n elif average_dur >= 1:\n idx += i * average_dur\n else:\n idx = i\n for jj in range(idx, idx + seglen):\n img = Image.open(\n os.path.join(imgpath,\n 'img_{:05d}.jpg'.format(jj + 1))).convert('RGB')\n imgs.append(img)\n return imgs" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c57980fa-dba6-436c-ba71-56c4d9826ed4.json b/docs/doc/c57980fa-dba6-436c-ba71-56c4d9826ed4.json new file mode 100644 index 000000000..1bf0f13e6 --- /dev/null +++ b/docs/doc/c57980fa-dba6-436c-ba71-56c4d9826ed4.json @@ -0,0 +1,25 @@ +{ + "summary": "The precise_bn.py file in PaddlePaddle's EIVideo module contains a function called do_preciseBN, which recomputes batch normalization stats for improved accuracy by running the model multiple times with input data from the data_loader, updating BN layers with running averages for normalization.", + "details": [ + { + "comment": "The code provided is part of the PaddlePaddle framework for video applications, specifically the EIVideo module. This precise_bn.py file contains a function called do_preciseBN that recomputes and updates batch norm stats to improve accuracy. It does so by running the model with input data from the data_loader multiple times (num_iters) to make BN statistics more precise. The code also includes an import for paddle, itertools, and EIVideo's paddlevideo module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py\":0-29", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport itertools\nfrom EIVideo.paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n\"\"\"\nImplement precise bn, which is useful for improving accuracy.\n\"\"\"\n@paddle.no_grad() # speed up and save CUDA memory\ndef do_preciseBN(model, data_loader, parallel, num_iters=200):\n \"\"\"\n Recompute and update the batch norm stats to make them more precise. During\n training both BN stats and the weight are changing after every iteration, so" + }, + { + "comment": "This code recomputes the batch normalization (BN) statistics with fixed weights for a given model, improving validation accuracy. It computes true average of per-batch mean/variance instead of running average. The code targets specific BN layers in the model and is applied when there are no such layers or if training is not enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py\":30-53", + "content": " the running average can not precisely reflect the actual stats of the\n current model.\n In this function, the BN stats are recomputed with fixed weights, to make\n the running average more precise. Specifically, it computes the true average\n of per-batch mean/variance instead of the running average.\n This is useful to improve validation accuracy.\n Args:\n model: the model whose bn stats will be recomputed\n data_loader: an iterator. Produce data as input to the model\n num_iters: number of iterations to compute the stats.\n Return:\n the model with precise mean and variance in bn layers.\n \"\"\"\n bn_layers_list = [\n m for m in model.sublayers()\n if any((isinstance(m, bn_type)\n for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D,\n paddle.nn.BatchNorm3D))) and m.training\n ]\n if len(bn_layers_list) == 0:\n return\n # moving_mean=moving_mean*momentum+batch_mean*(1.\u2212momentum)\n # we set momentum=0. to get the true mean and variance during forward" + }, + { + "comment": "This code initializes the momentum of batch normalization (BN) layers to 0 and creates lists for running mean and variance. It then trains a model for a specified number of iterations, updating the BN statistics by accumulating the difference between current and running mean/variance. 
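A plain-NumPy sketch of the accumulation rule just described: with momentum forced to 0, each batch exposes its true statistics, and the update running += (batch - running) / (i + 1) keeps an exact average of them, which is what makes the resulting BN stats "precise".

```python
import numpy as np

batch_means = [np.array([0.2, 0.4]), np.array([0.6, 0.8]), np.array([1.0, 1.2])]
running_mean = np.zeros(2)
for i, m in enumerate(batch_means):
    running_mean += (m - running_mean) / (i + 1)   # exact running average
print(running_mean)                    # [0.6 0.8]
print(np.mean(batch_means, axis=0))    # identical: [0.6 0.8]
```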
Finally, it asserts that the correct number of iterations were performed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py\":54-79", + "content": " momentum_actual = [bn._momentum for bn in bn_layers_list]\n for bn in bn_layers_list:\n bn._momentum = 0.\n running_mean = [paddle.zeros_like(bn._mean)\n for bn in bn_layers_list] #pre-ignore\n running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list]\n ind = -1\n for ind, data in enumerate(itertools.islice(data_loader, num_iters)):\n logger.info(\"doing precise BN {} / {}...\".format(ind + 1, num_iters))\n if parallel:\n model._layers.train_step(data)\n else:\n model.train_step(data)\n for i, bn in enumerate(bn_layers_list):\n # Accumulates the bn stats.\n running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1)\n running_var[i] += (bn._variance - running_var[i]) / (ind + 1)\n assert ind == num_iters - 1, (\n \"update_bn_stats is meant to run for {} iterations, but the batch_sampler stops at {} iterations.\"\n .format(num_iters, ind))\n # Sets the precise bn stats." + }, + { + "comment": "This code is iterating through a list of batch normalization (BN) layers, setting their mean and variance values from a separate list, and updating their momentum value. This could be part of a model's training process where it updates the BN layers with running averages for normalization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py\":80-83", + "content": " for i, bn in enumerate(bn_layers_list):\n bn._mean.set_value(running_mean[i])\n bn._variance.set_value(running_var[i])\n bn._momentum = momentum_actual[i]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c5c75f55-929a-4e2b-9a16-b39c42a2c8a9.json b/docs/doc/c5c75f55-929a-4e2b-9a16-b39c42a2c8a9.json new file mode 100644 index 000000000..674532ef1 --- /dev/null +++ b/docs/doc/c5c75f55-929a-4e2b-9a16-b39c42a2c8a9.json @@ -0,0 +1,45 @@ +{ + "summary": "The code processes ground truth data, generates output for bmn, and combines GT data for each frame. It selects video segments, defines instance parameters, converts label data to BMN format, and saves as a numpy array and JSON labeled file.", + "details": [ + { + "comment": "This code reads original ground truth (gts) data for action detection, sets the frame per second (fps), and generates output gts dict for bmn. It processes each sub-item in the gts_data['gts'], extracts the URL, maximum video length, and load features if not already present. 
The code then creates a new dictionary with fps and gts list as output gts data for bmn.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":0-41", + "content": "\"\"\"\nget instance for bmn\n\u4f7f\u7528winds=40\u7684\u6ed1\u7a97\uff0c\u5c06\u6240\u6709\u5b50\u7a97\u53e3\u7684\u957f\u5ea6\u4e4b\u548c\u5c0f\u4e8ewinds\u7684\u8fdb\u884c\u5408\u5e76\n\u5408\u5e76\u540e\uff0c\u7236\u7a97\u53e3\u4ee3\u8868bmn\u8bad\u7ec3\u6570\u636e\uff0c\u5b50\u7a97\u53e3\u4ee3\u8868tsn\u8bad\u7ec3\u6570\u636e\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nbmn_window = 40\ndataset = \"../EuroCup2016\"\nfeat_dir = dataset + '/features'\nout_dir = dataset + '/input_for_bmn'\nlabel_files = {\n 'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'\n}\nglobal fps\ndef gen_gts_for_bmn(gts_data):\n \"\"\"\n @param, gts_data, original gts for action detection\n @return, gts_bmn, output gts dict for bmn\n \"\"\"\n fps = gts_data['fps']\n gts_bmn = {'fps': fps, 'gts': []}\n for sub_item in gts_data['gts']:\n url = sub_item['url']\n max_length = sub_item['total_frames']\n # \u7279\u5f81\u63d0\u53d6\u6ca1\u6709\u83b7\u53d6\u6240\u6709\u5e27\u7279\u5f81\uff0c\u8fd9\u91ccload feature\u83b7\u53d6\u51c6\u786emax_length\n #feat_path = feat_dir + '/' + os.path.basename(url).replace('.mp4', '.pkl')\n #feature_video = pickle.load(open(feat_path, 'rb'))['features']\n #max_length = int(len(feature_video) * 1.0 / fps)\n gts_bmn['gts'].append({\n 'url': url," + }, + { + "comment": "This code is filtering out sub-actions that exceed a specified duration (bmn_window). It then creates a root_action list from the remaining sub-actions. The code also keeps track of the before_id and after_id to create 'gts' dictionary entries, which include the before_id, after_id, and root_actions for each group of actions that do not exceed the bmn_window duration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":42-68", + "content": " 'total_frames': max_length,\n 'root_actions': []\n })\n sub_actions = sub_item['actions']\n # duration > bmn_window\uff0c \u76f4\u63a5\u5220\u9664\n for idx, sub_action in enumerate(sub_actions):\n if sub_action['end_id'] - sub_action['start_id'] > bmn_window:\n sub_actions.pop(idx)\n root_actions = [sub_actions[0]]\n # before_id, \u524d\u4e00\u52a8\u4f5c\u7684\u6700\u540e\u4e00\u5e27\n # after_id, \u540e\u4e00\u52a8\u4f5c\u7684\u7b2c\u4e00\u5e27\n before_id = 0\n for idx in range(1, len(sub_actions)):\n cur_action = sub_actions[idx]\n duration = (cur_action['end_id'] - root_actions[0]['start_id'])\n if duration > bmn_window:\n after_id = cur_action['start_id']\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n before_id = root_actions[-1]['end_id']" + }, + { + "comment": "This function combines ground truth (GT) data for each frame within a bmn_window range. If there is only one action in the bmn_window, it takes that action; otherwise, it considers three actions: first, last, and all. 
It then creates segments based on these actions and returns the combined GT data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":69-101", + "content": " root_actions = [cur_action]\n else:\n root_actions.append(cur_action)\n if idx == len(sub_actions) - 1:\n after_id = max_length\n gts_bmn['gts'][-1]['root_actions'].append({\n 'before_id':\n before_id,\n 'after_id':\n after_id,\n 'actions':\n root_actions\n })\n return gts_bmn\ndef combile_gts(gts_bmn, gts_process, mode):\n \"\"\"\n 1\u3001bmn_window \u8303\u56f4\u5185\u53ea\u6709\u4e00\u4e2a\u52a8\u4f5c\uff0c\u53ea\u53d6\u4e00\u4e2a\u76ee\u6807\u6846\n 2\u3001bmn_window \u8303\u56f4\u5185\u6709\u591a\u4e2a\u52a8\u4f5c\uff0c\u53d6\u4e09\u4e2a\u76ee\u6807\u6846(\u7b2c\u4e00\u4e2a\u52a8\u4f5c\u3001\u6700\u540e\u4e00\u4e2a\u52a8\u4f5c\u3001\u6240\u6709\u52a8\u4f5c)\n \"\"\"\n global fps\n fps = gts_process['fps']\n duration_second = bmn_window * 1.0\n duration_frame = bmn_window * fps\n feature_frame = duration_frame\n for item in gts_process['gts']:\n url = item['url']\n basename = os.path.basename(url).split('.')[0]\n root_actions = item['root_actions']\n for root_action in root_actions:\n segments = []\n # all actions" + }, + { + "comment": "This code appends segments to a list based on the number of actions in root_action. If there is more than one action, it separates the first and last action into their own segments using before_id and after_id values. Finally, it loops through the segments list using a for loop to assign before_id, after_id, and actions to each segment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":102-127", + "content": " segments.append({\n 'actions': root_action['actions'],\n 'before_id': root_action['before_id'],\n 'after_id': root_action['after_id']\n })\n if len(root_action['actions']) > 1:\n # first action\n segments.append({\n 'actions': [root_action['actions'][0]],\n 'before_id':\n root_action['before_id'],\n 'after_id':\n root_action['actions'][1]['start_id']\n })\n # last action\n segments.append({\n 'actions': [root_action['actions'][-1]],\n 'before_id':\n root_action['actions'][-2]['end_id'],\n 'after_id':\n root_action['after_id']\n })\n for segment in segments:\n before_id = segment['before_id']\n after_id = segment['after_id']\n actions = segment['actions']" + }, + { + "comment": "This code selects a random segment of video from a list of actions, assigns a label to it, and stores the segment information in a dictionary. It uses the start and end IDs of each action to determine the range for the random start point and calculates the segment's position relative to the cur_start value. 
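A compact sketch of that window-placement rule, under the assumption that `actions` is the list of actions the crop must cover and `before_id`/`after_id` bound the allowed range (the function name is illustrative):

```python
import random

def place_window(actions, before_id, after_id, bmn_window=40):
    # The crop must start late enough to contain the last action's end,
    # yet early enough to contain the first action's start and stay
    # inside [before_id, after_id].
    box0 = int(max(actions[-1]["end_id"] - bmn_window, before_id))
    box1 = int(min(actions[0]["start_id"], after_id - bmn_window))
    if box0 > box1:
        return None  # no valid placement; the sample is skipped
    cur_start = random.randint(box0, box1)
    return cur_start, cur_start + bmn_window
```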
The code also handles edge cases where the box0 is less than or equal to box1 and creates the annotation dictionary with label and label_name information, as well as the segment duration in seconds.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":128-146", + "content": " box0 = int(max(actions[-1]['end_id'] - bmn_window, before_id))\n box1 = int(min(actions[0]['start_id'], after_id - bmn_window))\n if box0 <= box1:\n cur_start = random.randint(box0, box1)\n cur_end = cur_start + bmn_window\n name = '{}_{}_{}'.format(basename, cur_start, cur_end)\n annotations = []\n for action in actions:\n label = str(1.0 * action['label_ids'][0])\n label_name = action['label_names'][0]\n seg0 = 1.0 * (action['start_id'] - cur_start)\n seg1 = 1.0 * (action['end_id'] - cur_start)\n annotations.append({\n 'segment': [seg0, seg1],\n 'label': label,\n 'label_name': label_name\n })\n gts_bmn[name] = {\n 'duration_second': duration_second," + }, + { + "comment": "The code defines a function `get_instance_for_bmn` that returns a dictionary containing various parameters for an instance, and another function `save_feature_to_numpy` which saves feature data to a file. The features are split into two types: image and pcm, stored in a dictionary named \"feature\" with keys 'image_feature' and 'pcm_feature'. The code then loops through the dictionaries, creating sub-dictionaries for each item with their corresponding start and end indexes, before saving them to a file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":147-177", + "content": " 'duration_frame': duration_frame,\n 'feature_frame': feature_frame,\n 'subset': mode,\n 'annotations': annotations\n }\n return gts_bmn\ndef save_feature_to_numpy(gts_bmn, folder):\n global fps\n print('save feature for bmn ...')\n if not os.path.exists(folder):\n os.mkdir(folder)\n process_gts_bmn = {}\n for item, value in gts_bmn.items():\n basename, start_id, end_id = item.split('_')\n if not basename in process_gts_bmn:\n process_gts_bmn[basename] = []\n process_gts_bmn[basename].append({\n 'name': item,\n 'start': int(start_id),\n 'end': int(end_id)\n })\n for item, values in process_gts_bmn.items():\n feat_path = os.path.join(feat_dir, item + '.pkl')\n print(feat_path)\n feature = pickle.load(open(feat_path, 'rb'))\n image_feature = feature['image_feature']\n pcm_feature = feature['pcm_feature']" + }, + { + "comment": "Reshapes pcm_feature for concatenation, sets min_length based on shorter of two feature arrays, continues if min_length is 0, slices image_feature and pcm_feature to match min_length, concatenates along axis 1 to create feature_video, iterates through values dictionary, creates save_cut_name path, calculates start and end frames in seconds, checks if end frame exceeds length of feature_video, removes key from gts_bmn if end_frame is greater than feature_video length, generates list of feature_video slices within range of start and end frames, converts to numpy array for floating point numbers, saves np_feature_cut as .npy file with name derived from value's 'name'. 
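A minimal sketch of that slicing-and-saving step, assuming per-frame image and audio features of compatible length (all shapes and the fps value are illustrative):

```python
import numpy as np

fps = 5
bmn_window = 40
image_feature = np.random.rand(300, 2048).astype(np.float32)
pcm_feature = np.random.rand(300, 640).astype(np.float32)

# Concatenate visual and audio features per frame, then cut one
# bmn_window-long proposal [start, end) and save it as .npy.
feature_video = np.concatenate((image_feature, pcm_feature), axis=1)
start, end = 20, 20 + bmn_window
cut = feature_video[start * fps:end * fps]
np.save("example_cut.npy", cut.astype(np.float32))
```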
Returns gts_bmn dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":179-204", + "content": " pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640))\n min_length = min(image_feature.shape[0], pcm_feature.shape[0])\n if min_length == 0:\n continue\n image_feature = image_feature[:min_length, :]\n pcm_feature = pcm_feature[:min_length, :]\n feature_video = np.concatenate((image_feature, pcm_feature), axis=1)\n for value in values:\n save_cut_name = os.path.join(folder, value['name'])\n start_frame = (value['start']) * fps\n end_frame = (value['end']) * fps\n if end_frame > len(feature_video):\n del gts_bmn[value['name']]\n continue\n feature_cut = [\n feature_video[i] for i in range(start_frame, end_frame)\n ]\n np_feature_cut = np.array(feature_cut, dtype=np.float32)\n np.save(save_cut_name, np_feature_cut)\n return gts_bmn\nif __name__ == \"__main__\":\n if not os.path.exists(out_dir):\n os.mkdir(out_dir)\n gts_bmn = {}" + }, + { + "comment": "The code is iterating over the label_files, loading JSON data from each file, processing it for BMN format, combining it with existing gts_bmn, and saving the final result as a numpy array and JSON formatted label.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py\":205-215", + "content": " for item, value in label_files.items():\n label_file = os.path.join(dataset, value)\n gts_data = json.load(open(label_file, 'rb'))\n gts_process = gen_gts_for_bmn(gts_data)\n gts_bmn = combile_gts(gts_bmn, gts_process, item)\n gts_bmn = save_feature_to_numpy(gts_bmn, out_dir + '/feature')\n with open(out_dir + '/label.json', 'w', encoding='utf-8') as f:\n data = json.dumps(gts_bmn, indent=4, ensure_ascii=False)\n f.write(data)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c5d6e433-0c24-4fd7-9ca9-aa3327cb1592.json b/docs/doc/c5d6e433-0c24-4fd7-9ca9-aa3327cb1592.json new file mode 100644 index 000000000..59db50ae8 --- /dev/null +++ b/docs/doc/c5d6e433-0c24-4fd7-9ca9-aa3327cb1592.json @@ -0,0 +1,105 @@ +{ + "summary": "The code combines T2VLAD and BERT in a CENet model for video analysis, initializes MOE with Transformer layers, extracts visual features, and uses VLAD for cross-view localization. The function calculates video-text similarity scores, includes batch normalization, global pooling, and availability masking, reshapes weights, normalizes embeddings, computes text-video similarity with weighting, checks for NaN values, and raises ValueError if found.", + "details": [ + { + "comment": "This code snippet is importing necessary libraries and models for the T2VLAD model. It includes copyright and license information, as well as imports from base, net_vlad, paddlenlp, and various other modules. 
The code aims to create a T2VLAD model using PaddlePaddle framework with potential dependencies on BertModel and paddlenlp packages.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":0-33", + "content": "# Copyright 2021 Antoine Miech All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport time\nimport itertools\nimport paddle\nimport numpy as np\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import Tensor\nfrom typing import Optional\nfrom collections import OrderedDict\nfrom base import BaseModel\nfrom model.net_vlad import NetVLAD\ntry:\n from paddlenlp.transformers import BertModel\nexcept ImportError as e:\n print(\n f\"{e}, [paddlenlp] package and it's dependencies is required for T2VLAD.\"" + }, + { + "comment": "The code defines three functions: 'Mish', 'kronecker_prod', and 'drop_nans'. The 'Mish' function implements the mish activation function, which applies the mish formula element-wise. The 'kronecker_prod' function performs a Kronecker product of two tensors along the last dimension. Finally, the 'drop_nans' function removes NaN values from input features, considering any missing indices as containing NaN.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":34-65", + "content": " )\nclass Mish(nn.Layer):\n '''\n Applies the mish function element-wise:\n mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))\n SRC: https://github.com/digantamisra98/Mish/blob/master/Mish/Torch/mish.py\n '''\n def forward(self, input):\n '''\n Forward pass of the function.\n '''\n return input * paddle.tanh(F.softplus(input))\ndef kronecker_prod(t1, t2):\n # kronecker is performed along the last dim\n kron = paddle.bmm(t1.reshape([-1, t1.size(-1)], 1),\n t2.reshape([-1, 1, t2.size(-1)]))\n return kron.reshape[(t1.shape[0], t1.shape[1], -1)]\ndef drop_nans(x, ind, validate_missing):\n \"\"\"Remove nans, which we expect to find at missing indices.\n Args:\n x (paddle.Tensor): features\n ind (paddle.Tensor): binary values denoting whether or not a given feature is present\n validate_missing (bool): whether to validate that the missing location contains a nan.\n Returns:\n (paddle.tensor): the features, with the missing values masked to zero." + }, + { + "comment": "The code defines a CENet model and checks for any NaN values in the input tensor 'x'. It sets missing locations to 0 and raises a ValueError if there are still NaN values after removing them. The model consists of expert_dims, vlad_clusters, feat_aggregation, ce_shared_dim, use_mish, and mimic_ce_dims. 
The text_pooling layer is implemented as NetVLAD for feature extraction if the vlad_clusters[\"text\"] is non-zero.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":66-97", + "content": " \"\"\"\n missing = paddle.nonzero(ind == 0).flatten()\n if missing.numel():\n if validate_missing:\n vals = x[missing[0]]\n assert paddle.isnan(vals.reshape(\n [-1])[0]), \"expected nans at missing locations\"\n #Prevent overwrite of the original tensor\n x_ = x\n x_[missing] = 0\n x = x_\n if paddle.isnan(x).sum() > 0:\n raise ValueError(\"Still find nans after removing it!\")\n return x\nclass CENet(BaseModel):\n def __init__(self, text_dim, expert_dims, vlad_clusters, ghost_clusters,\n feat_aggregation, ce_shared_dim, use_mish, mimic_ce_dims):\n super().__init__()\n self.expert_dims = expert_dims\n self.feat_aggregation = feat_aggregation\n vlad_feat_sizes = {key: val for key, val in vlad_clusters.items()}\n if vlad_clusters[\"text\"] == 0:\n self.text_pooling = nn.Sequential()\n else:\n self.text_pooling = NetVLAD(\n feature_size=text_dim,\n cluster_size=vlad_clusters[\"text\"]," + }, + { + "comment": "The code initializes a model with specified expert dimensions, and handles nan-checks for the experts. It also creates a time estimation start point.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":98-129", + "content": " ghost_clusters=ghost_clusters[\"text\"],\n )\n self.text_bert = BertModel.from_pretrained('bert-base-uncased')\n text_dim = self.text_pooling.out_dim\n self.ce = CEModule(\n text_dim=text_dim,\n expert_dims=expert_dims,\n vlad_feat_sizes=vlad_feat_sizes,\n mimic_ce_dims=mimic_ce_dims,\n use_mish=use_mish,\n same_dim=ce_shared_dim,\n )\n def forward(self,\n experts,\n ind,\n cap_id=None,\n att_mask=None,\n text=None,\n raw_captions=None,\n text_token_mask=None):\n aggregated_experts = OrderedDict()\n # Handle all nan-checks\n for mod in self.expert_dims:\n experts[mod] = drop_nans(x=experts[mod],\n ind=ind[mod],\n validate_missing=True)\n aggregated_experts[mod] = experts[mod]\n start = time.time()" + }, + { + "comment": "This code is reshaping the input text tensor to account for multiple captions per video, applying a pooling operation specific to the chosen text_pooling method (NetVLAD in this case), and then passing the text through a BERT model before performing pooling again. 
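The caption folding described here can be pictured with a few lines of Paddle; the mean pooling below is only a stand-in for the real BERT + NetVLAD pipeline:

```python
import paddle

B, captions_per_video, max_words, text_feat_dim = 2, 3, 16, 768
text = paddle.randn([B, captions_per_video, max_words, text_feat_dim])

# Fold captions into the batch so the pooling op treats each caption
# independently, then unfold back to B x captions_per_video x D.
flat = text.reshape([B * captions_per_video, max_words, text_feat_dim])
pooled = flat.mean(axis=1)              # placeholder for BERT + VLAD pooling
pooled = pooled.reshape([B, captions_per_video, -1])
print(pooled.shape)                     # [2, 3, 768]
```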
The resulting output is shaped according to the required format for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":130-147", + "content": " # When pooling multiple captions for a single video, we treat them as separate\n # members of the minibatch, so the total pooling op does the following:\n # pooling: B x captions_per_video x max_sentence_length x text_feat_dim\n # -> B x captions_per_video (cluster_dim * text_feat_dim)\n B, captions_per_video, max_words, text_feat_dim = text.shape\n text = text.reshape([B * captions_per_video, max_words, text_feat_dim])\n if isinstance(self.text_pooling, NetVLAD):\n kwargs = {\"mask\": text_token_mask}\n else:\n kwargs = {}\n cap_id = cap_id.reshape([B * captions_per_video, -1])\n att_mask = att_mask.reshape([B * captions_per_video, -1])\n att_mask = att_mask.unsqueeze(axis=[1, 2])\n bert_out = self.text_bert(cap_id,\n token_type_ids=None,\n attention_mask=att_mask)\n text = bert_out[0]\n text, _, save_ass = self.text_pooling(text, **kwargs)" + }, + { + "comment": "The given code contains a function that performs multi-head attention, feedforward model implementation, and LayerNorm normalization in Transformer layers. The `nn.MultiHeadAttention` applies the self-attention mechanism, while `nn.Linear` layers are used for linear transformations. Dropout and ReLU activations are also applied to prevent overfitting and introduce nonlinearity respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":148-178", + "content": " text = text.reshape([B, captions_per_video, -1])\n return self.ce(text, aggregated_experts, ind, raw_captions,\n self.text_pooling, start)\ndef _get_clones(module, N):\n return nn.LayerList([copy.deepcopy(module) for i in range(N)])\nclass TransformerLayer(nn.Layer):\n def __init__(self,\n d_model,\n nhead,\n dim_feedforward=2048,\n dropout=0.1,\n activation=\"relu\",\n normalize_before=True):\n super().__init__()\n self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout)\n # Implementation of Feedforward model\n self.linear1 = nn.Linear(d_model, dim_feedforward)\n self.dropout = nn.Dropout(dropout)\n self.linear2 = nn.Linear(dim_feedforward, d_model)\n self.norm1 = nn.LayerNorm(d_model)\n self.norm2 = nn.LayerNorm(d_model)\n self.dropout1 = nn.Dropout(dropout)\n self.dropout2 = nn.Dropout(dropout)\n self.activation = F.relu" + }, + { + "comment": "This code defines a class with three forward functions: `forward_post`, `forward_pre`, and an undefined `forward`. The `forward_post` function applies self-attention to the input source tensor, while the `forward_pre` function normalizes the input source tensor before applying self-attention. Both functions take an optional mask and position embedding for the input tensor. 
The code also includes a class attribute `normalize_before` that determines whether to normalize the input tensor or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":179-206", + "content": " self.normalize_before = normalize_before\n def with_pos_embed(self, tensor, pos: Optional[Tensor]):\n return tensor if pos is None else tensor + pos\n def forward_post(self,\n src,\n src_mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n q = k = self.with_pos_embed(src, pos)\n q = q.transpose([1, 0, 2])\n k = k.transpose([1, 0, 2])\n src = src.transpose([1, 0, 2])\n src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)\n src2 = src2.transpose([1, 0, 2])\n src = src + self.dropout1(src2)\n src = self.norm1(src)\n src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))\n src = src + self.dropout2(src2)\n src = self.norm2(src)\n return src\n def forward_pre(self,\n src,\n src_mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n src2 = self.norm1(src)\n q = k = self.with_pos_embed(src2, pos)" + }, + { + "comment": "The code defines a Transformer class that performs multi-head self-attention and feedforward operations. The class takes an encoder_layer as input and num_layers as parameters, allowing for multiple layers of transformation. The Transformer class has a forward function that can perform the transformations before or after normalization depending on the value of normalize_before flag. The _reset_parameters function is used to reset the parameters of the class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":207-236", + "content": " q = q.transpose([1, 0, 2])\n k = k.transpose([1, 0, 2])\n src2 = src2.transpose([1, 0, 2])\n src2 = self.self_attn(q, key=k, value=src2, attn_mask=src_mask)\n src2 = src2.transpose([1, 0, 2])\n src = src + self.dropout1(src2)\n src2 = self.norm2(src)\n src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))\n src = src + self.dropout2(src2)\n return src\n def forward(self,\n src,\n src_mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n if self.normalize_before:\n return self.forward_pre(src, src_mask, pos)\n return self.forward_post(src, src_mask, pos)\nclass Transformer(nn.Layer):\n def __init__(self, encoder_layer, num_layers, norm=None):\n super().__init__()\n self.layers = _get_clones(encoder_layer, num_layers)\n self.num_layers = num_layers\n self.norm = norm\n self._reset_parameters()\n def _reset_parameters(self):\n for p in self.parameters(): # may have a problem" + }, + { + "comment": "This code defines a CEModule class with expert_dims, modalities, mimic_ce_dims, vlad_feat_sizes, and same_dim parameters. It uses the Mish function for non-linear activation if use_mish is True, otherwise using ReLU. 
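For reference, the Mish activation used here can be checked numerically against its definition, mish(x) = x * tanh(softplus(x)) (a standalone snippet, not part of the model code):

```python
import paddle
import paddle.nn.functional as F

x = paddle.linspace(-3.0, 3.0, 7)
mish = x * paddle.tanh(F.softplus(x))
relu = F.relu(x)                 # the fallback when use_mish is False
print(mish.numpy())
print(relu.numpy())
```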
It also includes a ContextGating object and a VisTransformer boolean.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":237-274", + "content": " if p.dim() > 1:\n nn.initializer.XavierUniform(p)\n def forward(self,\n src,\n mask: Optional[Tensor] = None,\n pos: Optional[Tensor] = None):\n output = src\n for layer in self.layers:\n output = layer(output)\n if self.norm is not None:\n output = self.norm(output)\n return output\nclass CEModule(nn.Layer):\n def __init__(self, expert_dims, text_dim, use_mish, mimic_ce_dims,\n vlad_feat_sizes, same_dim):\n super().__init__()\n modalities = list(expert_dims.keys())\n self.expert_dims = expert_dims\n self.modalities = modalities\n self.mimic_ce_dims = mimic_ce_dims\n self.same_dim = same_dim\n self.use_mish = use_mish\n self.vlad_feat_sizes = vlad_feat_sizes\n self.reduce_dim = 64\n self.moe_cg = ContextGating\n self.vis_transformer = True\n if self.use_mish:\n self.non_lin = Mish()\n else:\n self.non_lin = nn.ReLU()" + }, + { + "comment": "This code is initializing a MOE (Multi-Output Expert) model with specified modalities and dimensions, setting up the linear layer, weights, and batch normalization flags. It also defines the temporal repeat for each modality and calculates the input dimensions based on expert dimensions and temporal repetitions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":276-296", + "content": " num_mods = len(expert_dims)\n self.moe_fc = nn.Linear(text_dim, len(expert_dims))\n self.moe_weights = paddle.ones([1, num_mods]) / num_mods\n # The batch size of the face input can vary (due to missing inputs), so we\n # probably shouldn't use BN on this branch. It's probably fine to leave it\n # n for the corresponding text inputs, (but we should switch to GN)\n use_bns = [True for modality in self.modalities]\n # NOTE: When use_ce is not used, the text features are projected to\n # subspaces of different dimensions. When use_ce is used, they must all\n # be projected to `same_dim` (to allow fusion). The only excpetion is for an\n # ablation in which we mimic the `same_dim` reduction to measure whether this\n # projection influences overall performance.\n self.repeat_temporal = {}\n for mod in modalities:\n self.repeat_temporal[mod] = 1\n in_dims = [\n expert_dims[mod][0] * self.repeat_temporal[mod]" + }, + { + "comment": "The code initializes and prepares model components for modalities, including dimensions for expert features and feature sizes. 
It also creates a transformer if visual transformation is enabled, and sets up feature reducers if cross-entropy loss dims are mimicked.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":297-322", + "content": " for mod in modalities\n ]\n agg_dims = [\n expert_dims[mod][1] * self.repeat_temporal[mod]\n for mod in modalities\n ]\n feat_dims = [\n expert_dims[mod][0] // self.vlad_feat_sizes[mod]\n for mod in modalities\n ]\n if self.vis_transformer:\n num_encoder_layers = 1\n d_model = 768\n nhead = 4\n dim_feedforward = 768\n dropout = 0 #dropout=0.1\n normalize_before = True\n encoder_layer = TransformerLayer(d_model, nhead, dim_feedforward,\n dropout)\n encoder_norm = nn.LayerNorm(d_model) if normalize_before else None\n self.transformers = Transformer(encoder_layer, num_encoder_layers,\n encoder_norm)\n if self.mimic_ce_dims:\n dim_reducers = [ReduceDim(in_dim, same_dim) for in_dim in feat_dims]\n self.video_dim_reduce = nn.LayerList(dim_reducers)" + }, + { + "comment": "The code creates GatedEmbeddingUnit instances for both video and text features of different dimensions, initializes LayerLists to store them as video_GU and text_GU. The compute_moe_weights function calculates softmax weights for multiple captions (K) assigned to the same video, with an assertion for 1-10 modalities. Text is reshaped before applying MOE fully connected layer, then reshaped back to BxKxM shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":324-349", + "content": " gated_vid_embds = [\n GatedEmbeddingUnit(in_dim, same_dim, use_bn=True)\n for in_dim in feat_dims\n ]\n text_out_dims = [same_dim for _ in agg_dims]\n self.video_GU = nn.LayerList(gated_vid_embds)\n gated_text_embds = [\n GatedEmbeddingUnit(text_dim, dim, use_bn=True)\n for dim in text_out_dims\n ]\n self.text_GU = nn.LayerList(gated_text_embds)\n def compute_moe_weights(self, text, ind):\n # compute weights for all captions (including when assigned K captions to\n # the same video)\n B, K, D = text.shape\n M = len(self.modalities)\n msg = f\"expected between 1 and 10 modalities, found {M} ({self.modalities})\"\n assert 1 <= M <= 10, msg\n # Treat each caption independently in the softmax (which runs over modalities)\n text = text.reshape([B * K, D])\n moe_weights = self.moe_fc(text) # BK x D -> BK x M\n moe_weights = F.softmax(moe_weights, axis=1)\n moe_weights = moe_weights.reshape([B, K, M])" + }, + { + "comment": "This code is implementing a method for passing text embeddings through gated units. It first reshapes the input text, then iterates over the modalities and gated units to compute the text embeddings, which are stored in a dictionary. 
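The compute_moe_weights step summarized above amounts to a per-caption softmax over modalities; a toy version with invented shapes:

```python
import paddle
import paddle.nn.functional as F

B, K, D, M = 2, 3, 512, 4           # batch, captions per video, text dim, modalities
text = paddle.randn([B, K, D])
moe_fc = paddle.nn.Linear(D, M)

flat = text.reshape([B * K, D])     # each caption is weighted independently
weights = F.softmax(moe_fc(flat), axis=1)
weights = weights.reshape([B, K, M])
print(weights.sum(axis=-1))         # each set of M weights sums to 1
```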
Finally, it reshapes the result back to its original shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":350-373", + "content": " return moe_weights\n def forward(self, text, experts, ind, raw_captions, vis_vlad, stime):\n \"\"\"Compute joint embeddings and, if requested, a confusion matrix between\n video and text representations in the minibatch.\n Notation: B = batch size, M = number of modalities\n \"\"\"\n # Pass text embeddings through gated units\n text_embd = {}\n # Unroll repeated captions into present minibatch\n B, captions_per_video, feat_dim = text.shape\n text = text.reshape([B * captions_per_video, feat_dim])\n for modality, layer in zip(self.modalities, self.text_GU):\n # NOTE: Due to the batch norm, the gated units are sensitive to passing\n # in a lot of zeroes, so we do the masking step after the forwards pass\n text_ = layer(text)\n # We always assume that text is available for retrieval\n text_ = text_.reshape([B, captions_per_video, -1])\n text_embd[modality] = text_\n text = text.reshape([B, captions_per_video, -1])" + }, + { + "comment": "This code section is performing MOE weights computation and feature extraction for a Multi-Modal Video Analysis task. It excludes specific features to handle NAN values, then computes the MOE weights using text data and reshapes it accordingly. The visual features are extracted for each modality, then all the visual features are concatenated along the dimension. Finally, if vis_transformer is present, it is applied on the extracted visual features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":375-396", + "content": " # vladded nans are handled earlier (during pooling)\n # We also avoid zeroing random features, since this will leak information\n # exclude = list(self.vlad_feat_sizes.keys()) + list(self.random_feats)\n # experts = self.mask_missing_embeddings(experts, ind, exclude=exclude)\n # MOE weights computation + normalization - note that we use the first caption\n # sample to predict the weights\n moe_weights = self.compute_moe_weights(text, ind=ind)\n text_local = text.reshape([B * captions_per_video, -1])\n vis_local = {}\n for modality in self.modalities:\n vis_local[modality] = experts[modality]\n all_vis_feat = []\n if hasattr(self, \"video_dim_reduce\"):\n # Embed all features to a common dimension\n for modality, layer in zip(self.modalities, self.video_dim_reduce):\n all_vis_feat.append(layer(vis_local[modality]))\n all_vis_feat = paddle.concat(all_vis_feat, axis=1)\n if self.vis_transformer:" + }, + { + "comment": "This code performs cross-view video localization by calculating the cross-view confidence matrix using VLAD and MOE weights. It also applies transformers, max pooling, and sharded inner products for each modality. 
The result is a dictionary containing the modalities used in the computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":397-421", + "content": " experts_tensor = all_vis_feat\n experts_tensor = experts_tensor.transpose([1, 0, 2])\n att_out = self.transformers(experts_tensor, mask=None, pos=None)\n all_vis_feat = att_out.transpose([1, 0, 2])\n vis_local, _, save_ass = vis_vlad(all_vis_feat, freeze=True)\n cross_view_conf_matrix_tv = paddle.matmul(text_local, vis_local.t())\n for modality in self.modalities:\n experts[modality] = experts[modality].max(axis=1)\n for modality, layer in zip(self.modalities, self.video_GU):\n experts[modality] = layer(experts[modality])\n cross_view_conf_matrix = sharded_cross_view_inner_product(\n ind=ind,\n vid_embds=experts,\n text_embds=text_embd,\n text_weights=moe_weights,\n subspaces=self.modalities,\n raw_captions=raw_captions,\n )\n cross_view_conf_matrix = 0.5 * cross_view_conf_matrix + 0.5 * cross_view_conf_matrix_tv\n return {\n \"modalities\": self.modalities," + }, + { + "comment": "This code defines several neural network layers: \"GatedEmbeddingUnit\", \"ReduceDim\", and \"ContextGating\". These layers are used for feature extraction, normalization, and context gating in the T2VLAD model. The GatedEmbeddingUnit layer combines a linear transformation and context gating to produce normalized output. The ReduceDim layer reduces the dimension of input features through a linear transformation followed by normalization. The ContextGating layer performs a linear transformation and optional batch normalization for context gating.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":422-455", + "content": " \"cross_view_conf_matrix\": cross_view_conf_matrix,\n }\nclass GatedEmbeddingUnit(nn.Layer):\n def __init__(self, input_dimension, output_dimension, use_bn):\n super(GatedEmbeddingUnit, self).__init__()\n self.fc = nn.Linear(input_dimension, output_dimension)\n self.cg = ContextGating(output_dimension, add_batch_norm=use_bn)\n def forward(self, x):\n x = self.fc(x)\n x = self.cg(x)\n x = F.normalize(x)\n return x\nclass ReduceDim(nn.Layer):\n def __init__(self, input_dimension, output_dimension):\n super(ReduceDim, self).__init__()\n self.fc = nn.Linear(input_dimension, output_dimension)\n def forward(self, x):\n x = self.fc(x)\n x = F.normalize(x, axis=-1)\n return x\nclass ContextGating(nn.Layer):\n def __init__(self, dimension, add_batch_norm=True):\n super(ContextGating, self).__init__()\n self.fc = nn.Linear(dimension, dimension)\n self.add_batch_norm = add_batch_norm\n self.batch_norm = nn.BatchNorm1D(dimension)" + }, + { + "comment": "This code defines a function for computing the similarity matrix between two sets of embeddings, which are divided into smaller shards. The function takes these sharded embeddings and weights for each set, and returns a similarity matrix of size BK x BK. 
The code includes batch normalization and global pooling operations in its forward pass.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":457-484", + "content": " def forward(self, x):\n x1 = self.fc(x)\n if self.add_batch_norm:\n x1 = self.batch_norm(x1)\n x = paddle.concat([x, x1], axis=1)\n return F.glu(x, axis=1)\ndef sharded_cross_view_inner_product(vid_embds,\n text_embds,\n text_weights,\n subspaces,\n ind,\n tol=1E-5,\n raw_captions=None):\n \"\"\"Compute a similarity matrix from sharded vectors.\n Args:\n embds1 (dict[str:paddle.Tensor]): the set of sub-embeddings that, when\n concatenated, form the whole. The ith shard has shape `B x K x F_i`\n (i.e. they can differ in the last dimension).\n embds2 (dict[str:paddle.Tensor]): same format.\n weights2 (paddle.Tensor): weights for the shards in `embds2`.\n Returns:\n (paddle.tensor): similarity matrix of size `BK x BK`.\n NOTE: If multiple captions are provided, we can aggregate their similarities to" + }, + { + "comment": "This code calculates video-text similarity scores and handles the modalities of available experts. It initializes variables for storing similarity scores (sims) and text weights (text_weights). The code also calculates mean and standard deviation for text_weights, and stores these values as mus and stds respectively. Then it creates an availability mask for each modality, marking them either 0 or 1, with the assertion that the mask should only contain 0s or 1s.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":485-506", + "content": " provide a single video-text similarity score.\n \"\"\"\n B = vid_embds[subspaces[0]].shape[0]\n T, num_caps, _ = text_embds[subspaces[0]].shape\n # unroll separate captions onto first dimension and treat them separately\n sims = paddle.zeros([T * num_caps, B])\n text_weights = text_weights.reshape([T * num_caps, -1])\n if True:\n mus = [round(x, 3) for x in text_weights.mean(0).numpy().tolist()]\n stds = [round(x, 3) for x in text_weights.std(0).numpy().tolist()]\n summary = \">>>\"\n for mod, mu, std in zip(subspaces, mus, stds):\n summary += f\"{mod}: {mu} +/- {std} \"\n # mark expert availabilities along the second axis\n available = paddle.ones([1, B, len(subspaces)], dtype=text_weights.dtype)\n for ii, modality in enumerate(subspaces):\n ind[modality] = paddle.to_tensor(ind[modality], dtype='float32')\n available[:, :, ii] = ind[modality]\n msg = \"expected `available` modality mask to only contain 0s or 1s\"\n assert set(paddle.unique(available).cpu().numpy()).issubset(set([0," + }, + { + "comment": "This code reshapes the text_weights and combines them with availabilities to produce a tensor of size T x B x num_experts. It then normalizes these weights by accounting for missing experts. Lastly, it calculates the L2-masses for both video and text embeddings and applies the weights to the corresponding embeddings. 
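Put together, the weighting and similarity computation reduce to a weighted sum of per-modality dot-product matrices; a toy example with invented shapes and two modalities:

```python
import paddle
import paddle.nn.functional as F

T, B, D = 4, 3, 8                      # captions, videos, embedding dim
mods = ["rgb", "audio"]
text = {m: F.normalize(paddle.randn([T, D])) for m in mods}
vid = {m: F.normalize(paddle.randn([B, D])) for m in mods}
w = F.softmax(paddle.randn([T, len(mods)]), axis=1)   # per-caption weights

sims = paddle.zeros([T, B])
for i, m in enumerate(mods):
    sims += w[:, i:i + 1] * paddle.matmul(text[m], vid[m], transpose_y=True)
print(sims.shape)                      # [4, 3]
```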
The code also includes an assertion message to ensure correct shape matching between text_embd_ and text_weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":507-525", + "content": " 1])), msg\n # set the text weights along the first axis and combine with availabilities to\n # produce a tensor\n text_weight_tensor = text_weights.reshape([T * num_caps, 1,\n len(subspaces)]) * available\n # normalise to account for missing experts\n normalising_weights = text_weight_tensor.sum(2).reshape(\n [T * num_caps, B, 1])\n text_weight_tensor = paddle.divide(text_weight_tensor, normalising_weights)\n l2_mass_text, l2_mass_vid = 1, 1\n for idx, modality in enumerate(subspaces):\n vid_embd_ = vid_embds[modality].reshape([B, -1]) / l2_mass_vid\n text_embd_ = text_embds[modality].reshape([T * num_caps, -1])\n msg = \"expected weights to be applied to text embeddings\"\n assert text_embd_.shape[0] == text_weights.shape[0], msg\n text_embd_ = text_embd_ / l2_mass_text\n weighting = text_weight_tensor[:, :, idx]" + }, + { + "comment": "This code calculates the similarity between text and video embeddings, multiplying them together with a weighting factor. It then checks for NaN values in the resulting similarity matrix and raises a ValueError if any are found.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/model/model.py\":526-532", + "content": " sims += weighting * paddle.matmul(text_embd_,\n vid_embd_.t()) # (T x num_caps) x (B)\n if paddle.isnan(sims).sum().item():\n raise ValueError(\"Found nans in similarity matrix!\")\n return sims" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c6695e85-c27f-4792-8511-f4a351c56e99.json b/docs/doc/c6695e85-c27f-4792-8511-f4a351c56e99.json new file mode 100644 index 000000000..656a71bce --- /dev/null +++ b/docs/doc/c6695e85-c27f-4792-8511-f4a351c56e99.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is an import statement for the DeepLab class from the deeplab_manet module and specifies that it should be included in the __all__ list.", + "details": [ + { + "comment": "This code is an import statement for the DeepLab class from the deeplab_manet module and specifies that it should be included in the __all__ list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py\":0-15", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .deeplab_manet import DeepLab\n__all__ = ['DeepLab']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c6dd135c-d0ae-423f-bfaf-4f9c9d471b16.json b/docs/doc/c6dd135c-d0ae-423f-bfaf-4f9c9d471b16.json new file mode 100644 index 000000000..535003728 --- /dev/null +++ b/docs/doc/c6dd135c-d0ae-423f-bfaf-4f9c9d471b16.json @@ -0,0 +1,225 @@ +{ + "summary": "This PyTorch code uses OpenCV for image processing, offers conversion functions and error handling with PaddleVideo. 
It initializes tensors using Xavier/Glorot or Kaiming normal distribution, favoring Torch.nn.init methods over older ones.", + "details": [ + { + "comment": "Imports various modules and defines a type hint for paddle tensor or iterable of tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":0-27", + "content": "from __future__ import absolute_import\nimport json\nimport math\nimport os\nimport pickle\nimport warnings\nimport numpy\nimport numpy as np\nfrom numpy import inf\nfrom paddle import Tensor, concat, reshape, nn\nimport paddle\nfrom typing import Union, Iterable\n# from reprod_log.compare import compute_diff\n# from reprod_log.utils import check_print_diff, np2torch, np2paddle, torch2np, paddle2np\n_tensor_or_tensors = Union[paddle.Tensor, Iterable[paddle.Tensor]]\n_palette = [\n 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128,\n 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0,\n 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191,\n 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24,\n 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30,\n 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37,\n 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43," + }, + { + "comment": "This code consists of a long sequence of integers with no apparent functionality or structure. It may represent an array, list, or range of values used in various parts of the codebase, but without further context, it is impossible to determine the specific purpose or usage for these numbers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":28-40", + "content": " 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49,\n 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56,\n 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62,\n 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68,\n 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75,\n 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81,\n 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87,\n 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94,\n 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100,\n 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105,\n 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110,\n 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115,\n 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120," + }, + { + "comment": "This code likely contains a list of integer values, potentially representing coordinates or other numerical data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":41-53", + "content": " 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125,\n 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130,\n 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135,\n 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140,\n 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 
144, 144, 145,\n 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150,\n 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155,\n 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160,\n 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165,\n 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170,\n 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175,\n 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180,\n 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185," + }, + { + "comment": "This code appears to be a list of integers. It is difficult to provide a brief comment as there seems to be no clear context or purpose for these numbers in this specific location.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":54-66", + "content": " 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190,\n 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195,\n 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200,\n 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205,\n 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210,\n 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215,\n 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220,\n 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225,\n 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230,\n 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235,\n 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240,\n 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245,\n 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250," + }, + { + "comment": "This code defines a function called \"mask\\_damager\" which takes in labels and a probability of blacking out as input. It randomly scales the image using a scale range of (0.8, 1.0, 1.2), generates a random kernel size between 10 to 15, and applies random rotation to the image. 
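A rough sketch of that damaging recipe with OpenCV (kernel size, angle and scale are hard-coded here purely for illustration):

```python
import cv2
import numpy as np

mask = np.zeros((120, 160), np.uint8)
mask[40:80, 60:120] = 1

# Morphological open (or close), then rotate/scale about the image centre.
kernel = np.ones((12, 12), np.uint8)
tmp = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
M = cv2.getRotationMatrix2D((80, 60), 10, 1.2)   # (w/2, h/2), degrees, scale
damaged = cv2.warpAffine(tmp, M, (160, 120), flags=cv2.INTER_NEAREST)
print(int(damaged.sum()))
```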
If a random number is less than the given probability, it sets the final label as black; otherwise, it performs random rotations and scaling on the input labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":67-104", + "content": " 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255,\n 255, 255\n]\n# paddle.set_device('gpu') if paddle.is_compiled_with_cuda() else paddle.set_device('cpu')\nimport paddle\nimport PIL\nimport numbers\nimport numpy as np\nfrom PIL import Image\nfrom paddle.vision.transforms import BaseTransform\nfrom paddle.vision.transforms import functional as F\nimport numpy as np\nfrom scipy.ndimage import interpolation, binary_dilation\ntry:\n from skimage import morphology, transform\nexcept ImportError as e:\n print(\n f\"{e}, [scikit-image] package and it's dependencies is required for EIVideo.\"\n )\nimport paddle\nimport cv2\nimport random\n####\ndef mask_damager(labels=None, p_black=0.2):\n scales = (0.8, 1.0, 1.2)\n kernel_size = random.randint(10, 15)\n kernel = np.ones((kernel_size, kernel_size), np.uint8)\n if random.random() < p_black:\n final_label = paddle.zeros_like(labels)\n final_label = final_label.squeeze().numpy()\n else:\n prot = random.randint(5, 15)\n nrot = random.randint(-15, -5)" + }, + { + "comment": "The code performs morphological operations on an image using OpenCV and then applies a rotation transformation to overlay the segmentation mask onto the RGB image. It uses different colors for different classes in the segmentation mask.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":105-145", + "content": " rots = [prot, nrot, 0]\n rot = rots[random.randint(0, 2)]\n sc = scales[random.randint(0, 2)]\n _, _, h, w = labels.shape\n tmp = labels.squeeze()\n tmp = tmp.unsqueeze(-1)\n tmp = tmp.numpy().astype(np.uint8)\n morph_p = random.random()\n if morph_p < 0.5:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_OPEN, kernel)\n else:\n tmp = cv2.morphologyEx(tmp, cv2.MORPH_CLOSE, kernel)\n tmp = tmp.astype(np.uint8)\n center = (w / 2, h / 2)\n M = cv2.getRotationMatrix2D(center, rot, sc)\n final_label = cv2.warpAffine(tmp, M, (w, h), cv2.INTER_NEAREST)\n return final_label\ncolor_map = [\n [0, 0, 0],\n [255, 127, 0],\n [30, 144, 255],\n [186, 85, 211],\n [255, 105, 180],\n [192, 255, 62],\n [255, 105, 180],\n [50, 255, 255],\n]\ncolor_map_np = np.array(color_map)\ndef overlay_davis(image, mask, alpha=0.5):\n \"\"\" Overlay segmentation on top of RGB image. from davis official\"\"\"\n im_overlay = image.copy()\n mask = mask.astype('uint8')" + }, + { + "comment": "This function takes a list of masks and images, and for each pair, it applies an overlay function to generate an overlay image. It saves these overlay images in the specified directory with filenames corresponding to their original image names. Additionally, it stores the list of overlays as JSON in a file named \"masks.json\". 
The comments suggest that there might be another function to store masks as a list instead of overlays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":146-171", + "content": " colored_mask = color_map_np[mask]\n foreground = image * alpha + (1 - alpha) * colored_mask\n binary_mask = (mask > 0)\n # Compose image\n im_overlay[binary_mask] = foreground[binary_mask]\n countours = binary_dilation(binary_mask) ^ binary_mask\n im_overlay[countours, :] = 0\n return im_overlay.astype(image.dtype)\n# TODO\ndef submit_masks(masks, images, inter_file_path):\n overlays = []\n save_result_path = os.path.join(inter_file_path, 'result')\n os.makedirs(save_result_path, exist_ok=True)\n for imgname, (mask, image) in enumerate(zip(masks, images)):\n overlay = overlay_davis(image, mask)\n overlays.append(overlay.tolist())\n overlay = Image.fromarray(overlay)\n imgname = str(imgname)\n while len(imgname) < 5:\n imgname = '0' + imgname\n overlay.save(os.path.join(save_result_path, imgname + '.png'))\n result = {'overlays': overlays}\n # result = {'masks': masks.tolist()}\n with open(os.path.join(save_result_path, 'masks.json'), 'w') as f:" + }, + { + "comment": "load_video function reads frames from a video file and optionally resizes the frame to match a minimum side length, appending each frame to a list. The function then stacks the frames in the list into a single numpy array and returns it. get_scribbles generates scribble data for 8 labels by iterating through corresponding JSON files and yields the data along with a flag indicating if it is the first label or not. get_images retrieves video images from a specified sequence directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":172-205", + "content": " json.dump(result, f)\ndef load_video(path, min_side=None):\n frame_list = []\n cap = cv2.VideoCapture(path)\n while (cap.isOpened()):\n _, frame = cap.read()\n if frame is None:\n break\n frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n if min_side:\n h, w = frame.shape[:2]\n new_w = (w * min_side // min(w, h))\n new_h = (h * min_side // min(w, h))\n frame = cv2.resize(frame, (new_w, new_h),\n interpolation=cv2.INTER_CUBIC)\n # .transpose([2, 0, 1])\n frame_list.append(frame)\n frames = np.stack(frame_list, axis=0)\n return frames\ndef get_scribbles():\n for i in range(8):\n with open(f'/home/lc/paddlevideo/data/bike-packing/lable/{i + 1}.json'\n ) as f:\n scribbles = json.load(f)\n first_scribble = not i\n yield scribbles, first_scribble\ndef get_images(sequence='bike-packing'):\n img_path = os.path.join('/home/lc/paddlevideo/data', sequence.strip()," + }, + { + "comment": "The code defines two functions: \"load_image\" and \"rough_ROI\". The \"load_image\" function loads an image from a specified directory, sorts the images by file name, reads each image using PIL library, and returns the images as a numpy array. 
The \"rough_ROI\" function receives scribble labels as input, determines the bounding box around each scribble in the batch, applies this bounding box to another mask, and returns the final scribble labels after filtering.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":206-235", + "content": " 'frame')\n img_files = os.listdir(img_path)\n img_files.sort()\n files = []\n for img in img_files:\n img_file = np.array(Image.open(os.path.join(img_path, img)))\n files.append(img_file)\n return np.array(files)\ndef rough_ROI(ref_scribble_labels):\n #### b*1*h*w\n dist = 20\n b, _, h, w = ref_scribble_labels.shape\n filter_ = paddle.zeros_like(ref_scribble_labels)\n to_fill = paddle.zeros_like(ref_scribble_labels)\n for i in range(b):\n no_background = (ref_scribble_labels[i] != -1)\n no_background = no_background.squeeze(0)\n no_b = no_background.nonzero()\n (h_min, w_min) = paddle.min(no_b, 0)\n (h_max, w_max) = paddle.max(no_b, 0)\n filter_[i, 0,\n max(h_min - dist, 0):min(h_max + dist, h - 1),\n max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1\n final_scribble_labels = paddle.where(byte_(filter_), ref_scribble_labels,\n to_fill)\n return final_scribble_labels" + }, + { + "comment": "This code defines a function `load` that loads a pretrained model from a file. It checks if the file exists, then loads either the 'state_dict' or the entire dictionary depending on compatibility with the model keys. The function also filters out 'num_batches_tracked' and 'head.' before assigning the correct values to state_dicts. Finally, it writes both the modified state_dicts and model to separate text files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":238-271", + "content": "import os.path as osp\ndef load(file_name, model, **cfg):\n if not osp.isfile(file_name):\n raise IOError(f'{file_name} not exist')\n try:\n state_dicts_ = paddle.load(file_name)['state_dict']\n except:\n state_dicts_ = paddle.load(file_name)\n state_dicts = {}\n for k in model.keys():\n if 'num_batches_tracked' not in k:\n if ('head.' + k) not in state_dicts_.keys():\n if k not in state_dicts_.keys():\n print(f'model -----{k} -------is not in pretrained')\n else:\n state_dicts[k] = state_dicts_[k]\n else:\n state_dicts[k] = state_dicts_['head.' + k]\n write_dict(state_dicts, 'state_dicts.txt', **cfg)\n write_dict(model, 'model.txt', **cfg)\n return state_dicts\n#####\ndef write_dict(state_dict, file_name, **cfg):\n lines = []\n tot = 0\n for k, v in state_dict.items():\n # \u76ee\u524d\u53ea\u53d1\u73b0\u4e86torch\u548cpaddle\u6a21\u578b\u53c2\u6570\u547d\u540d\u7684\u8fd9\u4e09\u79cd\u4e0d\u4e00\u81f4\n # \u4e0d\u4e00\u81f41\n if 'num_batches_tracked' in k:\n tot += 1" + }, + { + "comment": "The code defines two functions: `damage_masks` and `damage_masks_np`. Both functions take in a labels array, and apply damage to the masks by applying shift, scale, and rotate transformations. The output is returned as a tensor after being transposed. 
The functions are designed for PaddlePaddle and NumPy respectively, and the input must be of shape (batch_size * 1 * h * w).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":272-303", + "content": " continue\n try:\n line = str(k) + '\\t' + str(v.cpu().detach().numpy().shape) + '\\n'\n except:\n line = str(k) + '\\t' + str(v.shape) + '\\n'\n lines.append(line)\n # with open(cfg.get(\"output_dir\", f\"./output/{file_name}\"), 'w') as f:\n # f.writelines(lines)\n # print('%d num_batches_tracked skipped' % tot)\ndef damage_masks(labels, shift=True, scale=True, rotate=True):\n \"\"\"\n Args:\n labels: numpy array (batch_size * 1 * h * w)\n \"\"\"\n bs, _, h, w = labels.shape\n labels = labels.transpose([0, 2, 3, 1])\n labels = labels.numpy()\n final_label = []\n for i in range(bs):\n label = labels[i]\n damaged_label = damage_masks_np(label, shift, scale, rotate)\n final_label.append(damaged_label)\n final_label = np.array(final_label)\n final_label = paddle.to_tensor(final_label)\n final_label = final_label.transpose([0, 3, 1, 2])\n return final_label\ndef damage_masks_np(labels, shift=True, scale=True, rotate=True):\n \"\"\"Performs the actual mask damaging in numpy." + }, + { + "comment": "This function damages the input labels by randomly shifting, scaling, rotating, and dilating the object masks. It first extracts unique labels, then shuffles them before iterating through each unique label to generate a damaged version of the labels. The `_damage_single_object_mask` function is used internally for performing mask damage on a single object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":304-329", + "content": " Args:\n labels: Int32 numpy array of shape (height, width, 1).\n shift: Boolean, whether to damage the masks by shifting.\n scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of labels.\n \"\"\"\n unique_labels = np.unique(labels)\n unique_labels = np.setdiff1d(unique_labels, [0])\n # Shuffle to get random depth ordering when combining together.\n np.random.shuffle(unique_labels)\n damaged_labels = np.zeros_like(labels)\n for l in unique_labels:\n obj_mask = (labels == l)\n damaged_obj_mask = _damage_single_object_mask(obj_mask, shift, scale,\n rotate)\n damaged_labels[damaged_obj_mask] = l\n return damaged_labels\ndef _damage_single_object_mask(mask, shift, scale, rotate):\n \"\"\"Performs mask damaging in numpy for a single object.\n Args:\n mask: Boolean numpy array of shape(height, width, 1)." + }, + { + "comment": "This code appears to be part of a function that damages a mask for a single object by randomly shifting it in numpy. The function takes a Boolean numpy array as input and returns the shifted version of the mask. 
It also includes parameters for scaling, rotation, and dilation, but these operations are not defined in this snippet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":330-360", + "content": " shift: Boolean, whether to damage the masks by shifting.\n scale: Boolean, whether to damage the masks by scaling.\n rotate: Boolean, whether to damage the masks by rotation.\n dilate: Boolean, whether to damage the masks by dilation.\n Returns:\n The damaged version of mask.\n \"\"\"\n if shift:\n mask = _shift_mask(mask)\n if scale:\n mask = _scale_mask(mask)\n if rotate:\n mask = _rotate_mask(mask)\n return mask\ndef _shift_mask(mask, max_shift_factor=0.05):\n \"\"\"Damages a mask for a single object by randomly shifting it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_shift_factor: Float scalar, the maximum factor for random shifting.\n Returns:\n The shifted version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n h = nzy.max() - nzy.min()\n w = nzx.max() - nzx.min()\n size = np.sqrt(h * w)\n offset = np.random.uniform(-size * max_shift_factor,\n size * max_shift_factor, 2)\n shifted_mask = interpolation.shift(np.squeeze(mask, axis=2)," + }, + { + "comment": "The code contains three functions: _shift_mask, _scale_mask, and _rotate_mask. These functions are used to randomly manipulate a binary mask by shifting, scaling, or rotating it for a single object. The purpose is to damage the mask to enhance the robustness of the AI system against different poses or scales of the object.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":361-387", + "content": " offset,\n order=0).astype('bool')[..., np.newaxis]\n return shifted_mask\ndef _scale_mask(mask, scale_amount=0.025):\n \"\"\"Damages a mask for a single object by randomly scaling it in numpy.\n Args:\n mask: Boolean numpy array of shape(height, width, 1).\n scale_amount: Float scalar, the maximum factor for random scaling.\n Returns:\n The scaled version of mask.\n \"\"\"\n nzy, nzx, _ = mask.nonzero()\n cy = 0.5 * (nzy.max() - nzy.min())\n cx = 0.5 * (nzx.max() - nzx.min())\n scale_factor = np.random.uniform(1.0 - scale_amount, 1.0 + scale_amount)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n s = transform.SimilarityTransform(scale=[scale_factor, scale_factor])\n m = (shift + (s + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask\ndef _rotate_mask(mask, max_rot_degrees=3.0):\n \"\"\"Damages a mask for a single object by randomly rotating it in numpy." + }, + { + "comment": "This code defines a function that rotates and scales a binary mask. It first calculates the center coordinates of the mask, then generates a random rotation angle within a specified range, applies the transformation, and inverses it to get the final scaling transformation matrix. The result is a warped version of the mask where pixels above 0.5 are considered as true values. 
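The shift-to-centre / transform / shift-back composition used by `_scale_mask` and `_rotate_mask` can be reproduced in isolation with scikit-image; the sketch below mirrors that pattern on a synthetic mask (shape and angle are illustrative only):

```python
# Minimal scikit-image sketch of the centred similarity-transform trick above.
import numpy as np
from skimage import transform

mask = np.zeros((64, 64, 1), dtype=bool)
mask[20:40, 25:45, 0] = True

cy, cx = 0.5 * mask.shape[0], 0.5 * mask.shape[1]  # rotate about the image centre
shift = transform.SimilarityTransform(translation=[-cx, -cy])
inv_shift = transform.SimilarityTransform(translation=[cx, cy])
rot = transform.SimilarityTransform(rotation=np.deg2rad(3.0))
m = (shift + (rot + inv_shift)).inverse            # same composition as _rotate_mask
rotated = transform.warp(mask, m) > 0.5
print(mask.sum(), rotated.sum())                   # areas stay roughly comparable
```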
Additionally, there's an AverageMeter class that computes and stores average and current value for continuous metrics calculation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":388-421", + "content": " Args:\n mask: Boolean numpy array of shape(height, width, 1).\n max_rot_degrees: Float scalar, the maximum number of degrees to rotate.\n Returns:\n The scaled version of mask.\n \"\"\"\n cy = 0.5 * mask.shape[0]\n cx = 0.5 * mask.shape[1]\n rot_degrees = np.random.uniform(-max_rot_degrees, max_rot_degrees)\n shift = transform.SimilarityTransform(translation=[-cx, -cy])\n inv_shift = transform.SimilarityTransform(translation=[cx, cy])\n r = transform.SimilarityTransform(rotation=np.deg2rad(rot_degrees))\n m = (shift + (r + inv_shift)).inverse\n scaled_mask = transform.warp(mask, m) > 0.5\n return scaled_mask\nclass AverageMeter(object):\n \"\"\"Computes and stores the average and current value\"\"\"\n def __init__(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def reset(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n self.val = val\n self.sum += val * n" + }, + { + "comment": "Code utilities for PaddleVideo: converts label to colormap, converts PyTorch data types to Paddle, fills tensor with a value, sets tensor value to zero, and casts tensor to float32 dtype.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":422-465", + "content": " self.count += n\n self.avg = self.sum / self.count\nimport numpy as np\ndef label2colormap(label):\n m = label.astype(np.uint8)\n r, c = m.shape\n cmap = np.zeros((r, c, 3), dtype=np.uint8)\n cmap[:, :, 0] = (m & 1) << 7 | (m & 8) << 3 | (m & 64) >> 1\n cmap[:, :, 1] = (m & 2) << 6 | (m & 16) << 2 | (m & 128) >> 2\n cmap[:, :, 2] = (m & 4) << 5 | (m & 32) << 1\n return cmap\ndef torch2paddle(data):\n try:\n import torch\n if isinstance(data, dict):\n np_data = {}\n for k, v in data.items():\n np_data[k] = paddle.to_tensor(v.detach().numpy())\n return np_data\n else:\n return paddle.to_tensor(data.detach().numpy())\n except:\n pass\ndef fill_(tensor: Tensor, value):\n return tensor.set_value(paddle.full_like(tensor, value))\ndef zero_(tensor: Tensor):\n return tensor.set_value(paddle.zeros_like(tensor))\ndef float_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='float32')\ndef long_(tensor: Tensor):" + }, + { + "comment": "The code provides three functions for converting tensors to different data types: int64, int32, and bool. The class ToPILImage is used to convert images from Tensor or np.ndarray format to PIL Image. It checks the type of input pic and throws a TypeError if it's not a Tensor or ndarray. If pic has 2 or 3 dimensions, it adds a channel dimension for 2D images. 
If the number of dimensions is not 2 or 3, it raises a ValueError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":466-499", + "content": " return paddle.to_tensor(tensor, dtype='int64')\ndef int_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='int32')\ndef byte_(tensor: Tensor):\n return paddle.to_tensor(tensor, dtype='bool')\nclass ToPILImage(BaseTransform):\n def __init__(self, mode=None, keys=None):\n super(ToPILImage, self).__init__(keys)\n def _apply_image(self, pic):\n \"\"\"\n Args:\n pic (Tensor|np.ndarray): Image to be converted to PIL Image.\n Returns:\n PIL: Converted image.\n \"\"\"\n if not (isinstance(pic, paddle.Tensor) or isinstance(pic, np.ndarray)):\n raise TypeError('pic should be Tensor or ndarray. Got {}.'.format(\n type(pic)))\n elif isinstance(pic, paddle.Tensor):\n if pic.ndimension() not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndimension()))\n elif pic.ndimension() == 2:\n # if 2D image, add channel dimension (CHW)" + }, + { + "comment": "This code is checking the input \"pic\" and adjusting its format to be compatible with the function. It first checks if it's a tensor or ndarray, then ensures that the image is 2D or 3D (adding channels if necessary) before converting it into NumPy ndarray format. Finally, it checks the data type and mode to further adjust the \"pic\" as needed. If any issue arises during this process, it raises an error with a descriptive message.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":500-528", + "content": " pic = pic.unsqueeze(0)\n elif isinstance(pic, np.ndarray):\n if pic.ndim not in {2, 3}:\n raise ValueError(\n 'pic should be 2/3 dimensional. Got {} dimensions.'.format(\n pic.ndim))\n elif pic.ndim == 2:\n # if 2D image, add channel dimension (HWC)\n pic = np.expand_dims(pic, 2)\n npimg = pic\n if isinstance(pic, paddle.Tensor) and \"float\" in str(\n pic.numpy().dtype) and self.mode != 'F':\n pic = pic.mul(255).byte()\n if isinstance(pic, paddle.Tensor):\n npimg = np.transpose(pic.numpy(), (1, 2, 0))\n if not isinstance(npimg, np.ndarray):\n raise TypeError(\n 'Input pic must be a paddle.Tensor or NumPy ndarray, ' +\n 'not {}'.format(type(npimg)))\n if npimg.shape[2] == 1:\n expected_mode = None\n npimg = npimg[:, :, 0]\n if npimg.dtype == np.uint8:\n expected_mode = 'L'" + }, + { + "comment": "This code is validating the input image's data type and dimensions to determine the appropriate mode for the image. It raises a ValueError if the supplied self.mode does not match the expected mode based on the input type, or if the number of channels in the image does not match permitted modes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":529-552", + "content": " elif npimg.dtype == np.int16:\n expected_mode = 'I;16'\n elif npimg.dtype == np.int32:\n expected_mode = 'I'\n elif npimg.dtype == np.float32:\n expected_mode = 'F'\n if self.mode is not None and self.mode != expected_mode:\n raise ValueError(\n \"Incorrect self.mode ({}) supplied for input type {}. 
Should be {}\"\n .format(self.mode, np.dtype, expected_mode))\n self.mode = expected_mode\n elif npimg.shape[2] == 2:\n permitted_2_channel_modes = ['LA']\n if self.mode is not None and self.mode not in permitted_2_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 2D inputs\".format(\n permitted_2_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'LA'\n elif npimg.shape[2] == 4:\n permitted_4_channel_modes = ['RGBA', 'CMYK', 'RGBX']" + }, + { + "comment": "The code snippet is part of a class function that checks the input image mode and data type to determine if it's compatible with the operation. If not, it raises an error or sets the mode accordingly. It also defines a placeholder identity operator class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":553-577", + "content": " if self.mode is not None and self.mode not in permitted_4_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 4D inputs\".format(\n permitted_4_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGBA'\n else:\n permitted_3_channel_modes = ['RGB', 'YCbCr', 'HSV']\n if self.mode is not None and self.mode not in permitted_3_channel_modes:\n raise ValueError(\n \"Only self.modes {} are supported for 3D inputs\".format(\n permitted_3_channel_modes))\n if self.mode is None and npimg.dtype == np.uint8:\n self.mode = 'RGB'\n if self.mode is None:\n raise TypeError('Input type {} is not supported'.format(\n npimg.dtype))\n return Image.fromarray(npimg, mode=self.mode)\nclass Identity(nn.Layer):\n r\"\"\"A placeholder identity operator that is argument-insensitive." + }, + { + "comment": "This code defines a class \"Identity\" with an empty forward function and a convert function that converts dictionary data between Paddle and Torch formats. If 'paddle' is given as the to parameter, it converts numpy arrays in the input dictionary to Paddle tensors. If 'torch' is given, it tries to import torch and converts numpy arrays or leaves unchanged non-numpy elements in the input dictionary to Torch tensors. Dtype can be used to specify a specific data type for tensor conversion.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":579-614", + "content": " Args:\n args: any argument (unused)\n kwargs: any keyword argument (unused)\n \"\"\"\n def __init__(self, *args, **kwargs):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\ndef convert(data: dict, to, dtype=None):\n assert isinstance(data, dict)\n input = {}\n for k, v in data.items():\n if 'paddle' == to:\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = paddle.to_tensor(v.astype(dtype))\n else:\n input[k] = paddle.to_tensor(v)\n else:\n input[k] = v\n elif 'torch' == to:\n try:\n import torch\n if isinstance(v, np.ndarray):\n if dtype is not None:\n input[k] = torch.tensor(v.astype(dtype))\n else:\n input[k] = torch.tensor(v)\n else:\n input[k] = v\n except:\n pass" + }, + { + "comment": "This function clips the gradient norm of an iterable of parameters. The norm is computed over all gradients together, as if they were concatenated into a single vector. Gradients are modified in-place. 
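A hedged usage sketch of this gradient-clipping helper inside a PaddlePaddle training step (the import path is an assumption; the model, data and hyper-parameters are placeholders):

```python
# Illustrative only: clip gradients with the helper above before the optimizer step.
import paddle
from EIVideo.paddlevideo.utils.manet_utils import clip_grad_norm_  # assumed import path

model = paddle.nn.Linear(16, 4)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())

x, y = paddle.randn([8, 16]), paddle.randn([8, 4])
loss = paddle.nn.functional.mse_loss(model(x), y)
loss.backward()

total_norm = clip_grad_norm_(model.parameters(), max_norm=1.0)  # rescales p.grad in place
opt.step()
opt.clear_grad()
print(float(total_norm))
```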
It takes arguments such as iterable of Tensors or a single Tensor that will have gradients normalized, max_norm (float or int) to set the maximum norm of the gradients, norm_type (float or int) for the type of used p-norm, and error_if_nonfinite (bool) to indicate whether an error should be thrown if total norm is nan.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":615-639", + "content": " else:\n if isinstance(v, np.ndarray):\n input[k] = v.astype(to)\n else:\n input[k] = v\n return input\ndef clip_grad_norm_(parameters: _tensor_or_tensors,\n max_norm: float,\n norm_type: float = 2.0,\n error_if_nonfinite: bool = False) -> paddle.Tensor:\n r\"\"\"Clips gradient norm of an iterable of parameters.\n The norm is computed over all gradients together, as if they were\n concatenated into a single vector. Gradients are modified in-place.\n Args:\n parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a\n single Tensor that will have gradients normalized\n max_norm (float or int): max norm of the gradients\n norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for\n infinity norm.\n error_if_nonfinite (bool): if True, an error is thrown if the total\n norm of the gradients from :attr:``parameters`` is ``nan``," + }, + { + "comment": "This function calculates the total norm of parameters (viewed as a single vector) and handles cases where parameters are tensors. It first checks if the parameter is a tensor, then selects only those with non-null gradients, detaches their gradients, and applies different norm types based on the input. If the norm type is infinity, it calculates maximum absolute values for each parameter; otherwise, it calculates the p-norm using provided parameters and detached gradients.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":640-665", + "content": " ``inf``, or ``-inf``. Default: False (will switch to True in the future)\n Returns:\n Total norm of the parameters (viewed as a single vector).\n \"\"\"\n import time\n if isinstance(parameters, paddle.Tensor):\n parameters = [parameters]\n parameters = [p for p in parameters if p.grad is not None]\n detached_grads = [p.grad.detach() for p in parameters]\n max_norm = float(max_norm)\n norm_type = float(norm_type)\n if len(parameters) == 0:\n return paddle.to_tensor(0.)\n # device = paddle.get_device() # parameters[0].grad.device\n if norm_type == inf:\n norms = [p.abs().max() for p in parameters]\n total_norm = norms[0] if len(norms) == 1 else paddle.max(\n paddle.stack(norms))\n else:\n # tik = time.time()\n total_norm = paddle.norm(\n paddle.stack([paddle.norm(g, norm_type) for g in detached_grads]),\n norm_type)\n # total_norm = paddle.norm(paddle.stack([paddle.sqrt(paddle.sum(g*g)) for g in detached_grads]), norm_type) # fixed." + }, + { + "comment": "This code checks if the total norm of gradients from `parameters` is non-finite. If it is, it raises a RuntimeError and suggests setting `error_if_nonfinite=False`. 
Then it calculates the clipping coefficient as max_norm / (total_norm + 1e-6), clamps it to at most 1, and finally multiplies each gradient by the clamped coefficient in place to scale it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":666-681", + "content": "    # print(time.time() - tik)\n    if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),\n                                                total_norm.isinf()):\n        raise RuntimeError(\n            f'The total norm of order {norm_type} for gradients from '\n            '`parameters` is non-finite, so it cannot be clipped. To disable '\n            'this error and scale the gradients by the non-finite norm anyway, '\n            'set `error_if_nonfinite=False`')\n    clip_coef = max_norm / (total_norm + 1e-6)\n    # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so\n    # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization\n    # when the gradients do not reside in CPU memory.\n    clip_coef_clamped = paddle.clip(clip_coef, max=1.0)\n    for i, p in enumerate(parameters):\n        # p.set_value(paddle.multiply(p, clip_coef_clamped))\n        p.grad.set_value(detached_grads[i] * clip_coef_clamped)  # fixed" + }, + { + "comment": "This chunk returns the total norm from `clip_grad_norm_`, keeps a commented-out `max()` helper that would return maximum values and their indices, and defines a `gather()` function that performs torch-style tensor gathering based on provided indices. The functions use the PaddlePaddle library for tensor operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":682-715", + "content": "        # p.grad.detach().mul_(clip_coef_clamped\n    return total_norm\n# def max(a: paddle.Tensor, axis=0, keepdim=True):\n#     \"\"\"ndarray=numpy.array([[1, 2, 3, 4],\n#                         [4, 3, 2, 1],\n#                         [5, 6, 7, 8],\n#                         [8, 7, 6, 5]])\n#     np.where(ndarray == np.max(ndarray))\n#     (array([2, 3]), array([3, 0]))\n#     ndarray[np.where(ndarray == np.max(ndarray))]\n#     array([8, 8])\n#     \"\"\"\n#     max_ = a.max(axis).unsqueeze(-1)\n#     index = paddle.argmax(a, axis=axis, keepdim=keepdim)\n#     max_ = max_.numpy()\n#     index = index.numpy()\n#     # index = paddle.argmax(a, axis=axis, keepdim=keepdim)[-1].flatten()\n#     return max_, index\ndef gather(tmp: paddle.Tensor, ind: paddle.Tensor):\n    shape = tmp.shape\n    tmp = paddle.to_tensor(tmp)\n    ind = paddle.to_tensor(ind)\n    if len(shape) == 2:\n        b = shape[0]\n        return concat([\n            reshape(paddle.gather(tmp[i, :], ind[i, :]), [1, -1])\n            for i in range(b)\n        ],\n                      axis=0)\n    elif len(shape) == 3:" + }, + { + "comment": "This function performs index-based sampling and reshaping operations on tensors of different ranks. It handles tensors whose shape has 2, 3, or 4 dimensions. In the 4-dimensional case, it uses gather and concat functions to rearrange data according to the given indices. 
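A tiny worked example of the 2-D branch of `gather()` (torch-style gathering along the last axis); the input values are hypothetical and only illustrate the expected output, and the import path is an assumption:

```python
# For each row i, gather() picks tmp[i] at the positions given by ind[i].
import paddle
from EIVideo.paddlevideo.utils.manet_utils import gather  # assumed import path

tmp = paddle.to_tensor([[1, 2], [3, 4]])
ind = paddle.to_tensor([[0, 0], [1, 0]])
print(gather(tmp, ind).numpy())  # [[1 1]
                                 #  [4 3]]
```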
The no_grad_* functions are used as wrappers for parts that require the `torch.no_grad()` context manager, because the JIT cannot handle context managers directly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":716-744", + "content": "        out = []\n        for i in range(tmp.shape[0]):\n            _ = paddle.index_sample(tmp[i], ind[i])\n            out.append(_)\n        return paddle.to_tensor(out)\n    elif len(shape) == 4:\n        b, c, d = shape[:3]\n        return concat([\n            reshape(\n                concat([\n                    reshape(\n                        concat([\n                            reshape(\n                                paddle.gather(tmp[i, j, k, :], ind[i, j, k, :]),\n                                [1, -1]) for k in range(d)\n                        ],\n                               axis=0), [1, d, -1]) for j in range(c)\n                ],\n                       axis=0), [1, c, d, -1]) for i in range(b)\n        ],\n                      axis=0)\n    else:\n        pass\n# These no_grad_* functions are necessary as wrappers around the parts of these\n# functions that use `with torch.no_grad()`. The JIT doesn't support context\n# managers, so these need to be implemented as builtins. Using these wrappers\n# lets us keep those builtins small and re-usable." + }, + { + "comment": "This code defines three functions: `_no_grad_uniform_`, `_no_grad_normal_`, and `_no_grad_trunc_normal_`. These functions fill tensors with values from specific distributions inside a `paddle.no_grad()` context, so initialization does not track gradients. The first function generates uniformly distributed values within a defined range. The second generates normally distributed values with a specified mean and standard deviation. The third generates truncated-normal values by combining a uniform draw with the inverse normal CDF.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":745-773", + "content": "def _no_grad_uniform_(tensor, a, b):\n    with paddle.no_grad():\n        tensor.set_value(paddle.uniform(tensor.shape, min=a, max=b))\n    return tensor\ndef _no_grad_normal_(tensor, mean, std):\n    with paddle.no_grad():\n        tensor.set_value(paddle.normal(shape=tensor.shape, mean=mean, std=std))\n    return tensor\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n    from scipy import special\n    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf\n    def norm_cdf(x):\n        # Computes standard normal cumulative distribution function\n        return (1. + math.erf(x / math.sqrt(2.))) / 2.\n    if (mean < a - 2 * std) or (mean > b + 2 * std):\n        warnings.warn(\n            \"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. \"\n            \"The distribution of values may be incorrect.\",\n            stacklevel=2)\n    with paddle.no_grad():\n        # Values are generated by using a truncated uniform distribution and\n        # then using the inverse CDF for the normal distribution." + }, + { + "comment": "This code snippet draws uniform values in a range derived from the normal CDF and then applies the inverse CDF (via `erfinv`) to turn them into truncated-normal samples. It uses the PaddlePaddle (`paddle`) API together with `scipy.special` for its operations. The result is then clamped between the minimum and maximum cutoffs, which are defined by the variables 'a' and 'b', respectively. 
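The inverse-CDF trick described here can be re-derived outside the project with plain numpy/scipy; the following stand-alone sketch (not the project's API) shows why the resulting samples stay inside [a, b]:

```python
# Stand-alone illustration of truncated-normal sampling via the inverse CDF.
import math
import numpy as np
from scipy import special

mean, std, a, b = 0.0, 1.0, -2.0, 2.0
norm_cdf = lambda x: (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

l, u = norm_cdf((a - mean) / std), norm_cdf((b - mean) / std)
x = np.random.uniform(2 * l - 1, 2 * u - 1, size=10000)  # uniform on [2l-1, 2u-1]
x = special.erfinv(x) * std * math.sqrt(2.0) + mean      # inverse normal CDF
x = np.clip(x, a, b)
print(x.min() >= a, x.max() <= b)  # True True
```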
This process ensures that the generated tensor falls within the desired range.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":774-805", + "content": " # Get upper and lower cdf values\n l = norm_cdf((a - mean) / std)\n u = norm_cdf((b - mean) / std)\n # Uniformly fill tensor with values from [l, u], then translate to\n # [2l-1, 2u-1].\n tensor.set_value(\n paddle.uniform(tensor.shape, min=2 * l - 1, max=2 * u - 1))\n # tensor.uniform_(2 * l - 1, 2 * u - 1)\n # Use inverse cdf transform for normal distribution to get truncated\n # standard normal\n # tensor.erfinv_() # paddle \u65e0\n tensor.set_value(special.erfinv(tensor))\n # Transform to proper mean, std\n # tensor.mul_(std * math.sqrt(2.))\n tensor.set_value(tensor.multiply(paddle.to_tensor(std * math.sqrt(2.))))\n tensor.add_(mean)\n # Clamp to ensure it's in the proper range\n tensor.clip_(min=a, max=b)\n return tensor\ndef _no_grad_fill_(tensor, val):\n with paddle.no_grad():\n tensor.set_value(paddle.full_like(tensor, fill_value=val))\n return tensor\ndef _no_grad_zero_(tensor):" + }, + { + "comment": "This function calculates the recommended gain value for a given nonlinearity function. The gain values depend on the function used, with different values assigned to functions like Linear/Identity (1), Sigmoid (1), Tanh (5/3), ReLU (sqrt(2)), Leaky Relu (sqrt((2/(1 + negative_slope^2))), SELU (3/4). The function takes nonlinearity and optional param as arguments, and returns the gain value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":806-832", + "content": " with paddle.no_grad():\n tensor.set_value(paddle.zeros_like(tensor))\n return tensor\ndef calculate_gain(nonlinearity, param=None):\n r\"\"\"Return the recommended gain value for the given nonlinearity function.\n The values are as follows:\n ================= ====================================================\n nonlinearity gain\n ================= ====================================================\n Linear / Identity :math:`1`\n Conv{1,2,3}D :math:`1`\n Sigmoid :math:`1`\n Tanh :math:`\\frac{5}{3}`\n ReLU :math:`\\sqrt{2}`\n Leaky Relu :math:`\\sqrt{\\frac{2}{1 + \\text{negative\\_slope}^2}}`\n SELU :math:`\\frac{3}{4}`\n ================= ====================================================\n Args:\n nonlinearity: the non-linear function (`nn.functional` name)\n param: optional parameter for the non-linear function\n Examples:\n >>> gain = nn.init.calculate_gain('leaky_relu', 0.2) # leaky_relu with negative_slope=0.2" + }, + { + "comment": "This code defines a function to map different nonlinearities (e.g., linear, sigmoid, tanh) to corresponding numerical values or exceptions when an unsupported nonlinearity is provided. 
It handles cases like linear, sigmoid, tanh, relu, leaky_relu, and selu, providing the appropriate values for each.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":833-858", + "content": " \"\"\"\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',\n 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n # True/False are instances of int, hence check above\n negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4 # Value found empirically (https://github.com/pytorch/pytorch/pull/50664)\n else:\n raise ValueError(\"Unsupported nonlinearity {}\".format(nonlinearity))" + }, + { + "comment": "These code snippets define functions to initialize a tensor with values drawn from a uniform or normal distribution. The `uniform_` function fills the input tensor with values from a uniform distribution, while `normal_` initializes it with values from a normal distribution. These functions can be used for various tasks such as initializing weights in a neural network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":861-894", + "content": "def uniform_(tensor: Tensor, a: float = 0., b: float = 1.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the uniform\n distribution :math:`\\mathcal{U}(a, b)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the lower bound of the uniform distribution\n b: the upper bound of the uniform distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.uniform_(w)\n \"\"\"\n return _no_grad_uniform_(tensor, a, b)\ndef normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from the normal\n distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution\n std: the standard deviation of the normal distribution\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.normal_(w)\n \"\"\"\n return _no_grad_normal_(tensor, mean, std)\ndef trunc_normal_(tensor: Tensor,\n mean: float = 0.," + }, + { + "comment": "This code snippet defines a function `trunc_normal_` that fills the input Tensor with values drawn from a truncated normal distribution. The values are within the range [a, b] and the method works best when a <= mean <= b. Additionally, it includes a separate function `constant_` which fills the input Tensor with a constant value val.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":895-922", + "content": " std: float = 1.,\n a: float = -2.,\n b: float = 2.) -> Tensor:\n r\"\"\"Fills the input Tensor with values drawn from a truncated\n normal distribution. 
The values are effectively drawn from the\n normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`\n with values outside :math:`[a, b]` redrawn until they are within\n the bounds. The method used for generating the random values works\n best when :math:`a \\leq \\text{mean} \\leq b`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n mean: the mean of the normal distribution\n std: the standard deviation of the normal distribution\n a: the minimum cutoff value\n b: the maximum cutoff value\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.trunc_normal_(w)\n \"\"\"\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef constant_(tensor: Tensor, val: float) -> Tensor:\n r\"\"\"Fills the input Tensor with the value :math:`\\text{val}`.\n Args:" + }, + { + "comment": "These methods fill a tensor with specific values or an identity matrix for Linear layers. The `constant_()`, `ones_()`, and `zeros_()` functions fill the input tensor with constant, ones, or zeros respectively. The `eye_()` function fills a 2-dimensional tensor with an identity matrix while preserving the identities of inputs in Linear layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":923-965", + "content": " tensor: an n-dimensional `torch.Tensor`\n val: the value to fill the tensor with\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.constant_(w, 0.3)\n \"\"\"\n return _no_grad_fill_(tensor, val)\ndef ones_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `1`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.ones_(w)\n \"\"\"\n return _no_grad_fill_(tensor, 1.)\ndef zeros_(tensor: Tensor) -> Tensor:\n r\"\"\"Fills the input Tensor with the scalar value `0`.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.zeros_(w)\n \"\"\"\n return _no_grad_zero_(tensor)\ndef eye_(tensor):\n r\"\"\"Fills the 2-dimensional input `Tensor` with the identity\n matrix. Preserves the identity of the inputs in `Linear` layers, where as\n many inputs are preserved as possible.\n Args:\n tensor: a 2-dimensional `torch.Tensor`" + }, + { + "comment": "The provided code defines two functions, `eye_` and `dirac_`, that initialize a tensor with specific values. The `eye_` function fills the 2D tensor with an identity matrix, while the `dirac_` function fills a 3D, 4D or 5D tensor with Dirac delta functions. It also takes an optional argument for groups in case of Convolutional layers. Both functions require the input tensor to have specific dimensions and raise a ValueError if not satisfied.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":967-997", + "content": " Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.eye_(w)\n \"\"\"\n if tensor.ndimension() != 2:\n raise ValueError(\"Only tensors with 2 dimensions are supported\")\n with paddle.no_grad():\n tensor.set_value(paddle.eye(*tensor.shape))\n return tensor\ndef dirac_(tensor, groups=1):\n r\"\"\"Fills the {3, 4, 5}-dimensional input `Tensor` with the Dirac\n delta function. Preserves the identity of the inputs in `Convolutional`\n layers, where as many input channels are preserved as possible. 
In case\n of groups>1, each group of channels preserves identity\n Args:\n tensor: a {3, 4, 5}-dimensional `torch.Tensor`\n groups (optional): number of groups in the conv layer (default: 1)\n Examples:\n >>> w = torch.empty(3, 16, 5, 5)\n >>> nn.init.dirac_(w)\n >>> w = torch.empty(3, 24, 5, 5)\n >>> nn.init.dirac_(w, 3)\n \"\"\"\n dimensions = tensor.ndimension()\n if dimensions not in [3, 4, 5]:\n raise ValueError(\n \"Only tensors with 3, 4, or 5 dimensions are supported\")" + }, + { + "comment": "This function initializes a tensor with ones in specific positions based on the provided dimensions (3 for temporal convolution, 4 for spatial convolution, and 5 for volumetric convolution). It checks if dim 0 is divisible by groups and raises an error if not. Then it calculates out_chans_per_grp and min_dim, and finally initializes the tensor using no_grad context manager.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":999-1027", + "content": " sizes = tensor.shape\n if sizes[0] % groups != 0:\n raise ValueError('dim 0 must be divisible by groups')\n out_chans_per_grp = sizes[0] // groups\n min_dim = min(out_chans_per_grp, sizes[1])\n with paddle.no_grad():\n tensor.zero_()\n for g in range(groups):\n for d in range(min_dim):\n if dimensions == 3: # Temporal convolution\n tensor[g * out_chans_per_grp + d, d,\n tensor.shape[2] // 2] = 1\n elif dimensions == 4: # Spatial convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2,\n tensor.shape[3] // 2] = 1\n else: # Volumetric convolution\n tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2,\n tensor.shape[3] // 2, tensor.shape[4] // 2] = 1\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(" + }, + { + "comment": "Function to calculate fan_in and fan_out for tensor with dimensions greater than 2, compute the gain factor for Xavier uniform initialization, fill the input Tensor with values from a uniform distribution according to Glorot & Bengio (2010) method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1028-1059", + "content": " \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"\n )\n num_input_fmaps = tensor.shape[1] # .size(1)\n num_output_fmaps = tensor.shape[0] # .size(0)\n receptive_field_size = 1\n if tensor.dim() > 2:\n for s in tensor.shape[2:]:\n receptive_field_size *= s # fixed\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef LongTensor(x):\n return paddle.to_tensor(x, dtype='int64')\ndef IntTensor(x):\n return paddle.to_tensor(x, dtype='int32')\ndef xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor:\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Understanding the difficulty of training deep feedforward\n neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform\n distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{U}(-a, a)` where\n .. math::\n a = \\text{gain} \\times \\sqrt{\\frac{6}{\\text{fan\\_in} + \\text{fan\\_out}}}" + }, + { + "comment": "This code snippet is for initializing a Tensor with values from a normal distribution, using the Xavier/Glorot initialization method. 
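As a concrete illustration of the fan computation and the Xavier-uniform bound described here, a small worked example with a hypothetical conv-style weight of shape [64, 32, 3, 3]:

```python
# Worked example only; the weight shape is made up for illustration.
import math

out_c, in_c, kh, kw = 64, 32, 3, 3
receptive_field = kh * kw                        # 9
fan_in = in_c * receptive_field                  # 288
fan_out = out_c * receptive_field                # 576
std = 1.0 * math.sqrt(2.0 / (fan_in + fan_out))  # gain = 1.0
bound = math.sqrt(3.0) * std                     # ~0.0833; values drawn from U(-bound, bound)
print(fan_in, fan_out, round(bound, 4))
```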
It derives a standard deviation from the tensor's fan-in and fan-out; the uniform variant converts that into symmetric bounds for uniform filling, while the normal variant (which begins in this chunk) samples from the distribution directly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1061-1088", + "content": "    Also known as Glorot initialization.\n    Args:\n        tensor: an n-dimensional `torch.Tensor`\n        gain: an optional scaling factor\n    Examples:\n        >>> w = torch.empty(3, 5)\n        >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))\n    \"\"\"\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n    a = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation\n    return _no_grad_uniform_(tensor, -a, a)\ndef xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor:\n    r\"\"\"Fills the input `Tensor` with values according to the method\n    described in `Understanding the difficulty of training deep feedforward\n    neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal\n    distribution. The resulting tensor will have values sampled from\n    :math:`\\mathcal{N}(0, \\text{std}^2)` where\n    .. math::\n        \\text{std} = \\text{gain} \\times \\sqrt{\\frac{2}{\\text{fan\\_in} + \\text{fan\\_out}}}\n    Also known as Glorot initialization." + }, + { + "comment": "This snippet ports PyTorch-style initializers to PaddlePaddle: it finishes `xavier_normal_`, adds the `_calculate_correct_fan` helper for choosing between fan-in and fan-out, and begins `kaiming_uniform_`, which takes a tensor, the negative slope `a`, and optional `mode`/`nonlinearity` parameters. The `_calculate_fan_in_and_fan_out` and `_calculate_correct_fan` functions compute the appropriate fan values from the tensor shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1090-1119", + "content": "    Args:\n        tensor: an n-dimensional `torch.Tensor`\n        gain: an optional scaling factor\n    Examples:\n        >>> w = torch.empty(3, 5)\n        >>> nn.init.xavier_normal_(w)\n    \"\"\"\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n    return _no_grad_normal_(tensor, 0., std)\ndef _calculate_correct_fan(tensor, mode):\n    mode = mode.lower()\n    valid_modes = ['fan_in', 'fan_out']\n    if mode not in valid_modes:\n        raise ValueError(\"Mode {} not supported, please use one of {}\".format(\n            mode, valid_modes))\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n    return fan_in if mode == 'fan_in' else fan_out\ndef kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n    r\"\"\"Fills the input `Tensor` with values according to the method\n    described in `Delving deep into rectifiers: Surpassing human-level\n    performance on ImageNet classification` - He, K. et al. (2015), using a\n    uniform distribution. The resulting tensor will have values sampled from" + }, + { + "comment": "This function implements Kaiming (He) uniform initialization: it computes std = gain / sqrt(fan_mode) and fills the tensor uniformly within the bound sqrt(3) * std, i.e. gain times the square root of 3 divided by the square root of the fan value, with the gain chosen from the specified nonlinearity.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1120-1144", + "content": "    :math:`\\mathcal{U}(-\\text{bound}, \\text{bound})` where\n    .. 
math::\n \\text{bound} = \\text{gain} \\times \\sqrt{\\frac{3}{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the\n forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the\n backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)" + }, + { + "comment": "This code initializes a tensor using the Kaiming normal method. It fills the input tensor with values sampled from a normal distribution, where std is calculated based on gain and fan_mode (fan_in by default). This initialization method is often used in neural networks to improve performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1145-1169", + "content": " bound = math.sqrt(\n 3.0) * std # Calculate uniform bounds from standard deviation\n with paddle.no_grad():\n tensor.set_value(paddle.uniform(tensor.shape, min=-bound, max=bound))\n return tensor\ndef kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):\n r\"\"\"Fills the input `Tensor` with values according to the method\n described in `Delving deep into rectifiers: Surpassing human-level\n performance on ImageNet classification` - He, K. et al. (2015), using a\n normal distribution. The resulting tensor will have values sampled from\n :math:`\\mathcal{N}(0, \\text{std}^2)` where\n .. math::\n \\text{std} = \\frac{\\text{gain}}{\\sqrt{\\text{fan\\_mode}}}\n Also known as He initialization.\n Args:\n tensor: an n-dimensional `torch.Tensor`\n a: the negative slope of the rectifier used after this layer (only\n used with ``'leaky_relu'``)\n mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``\n preserves the magnitude of the variance of the weights in the" + }, + { + "comment": "These functions fill a tensor with either a (semi) orthogonal matrix or initialize weights using Kaiming normal distribution. The 'fan_out' and 'nonlinearity' parameters are used for the initialization process. These functions are inspired by research papers, one focusing on orthogonal matrices in deep linear neural networks and another on Kaiming normal distribution for weight initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1170-1195", + "content": " forward pass. 
Choosing ``'fan_out'`` preserves the magnitudes in the\n backwards pass.\n nonlinearity: the non-linear function (`nn.functional` name),\n recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).\n Examples:\n >>> w = torch.empty(3, 5)\n >>> kaiming_normal_(w, mode='fan_out', nonlinearity='relu')\n \"\"\"\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():\n tensor.set_value(paddle.normal(shape=tensor.shape, mean=0, std=std))\n return tensor\ndef orthogonal_(tensor, gain=1):\n r\"\"\"Fills the input `Tensor` with a (semi) orthogonal matrix, as\n described in `Exact solutions to the nonlinear dynamics of learning in deep\n linear neural networks` - Saxe, A. et al. (2013). The input tensor must have\n at least 2 dimensions, and for tensors with more than 2 dimensions the\n trailing dimensions are flattened.\n Args:\n tensor: an n-dimensional `torch.Tensor`, where :math:`n \\geq 2`" + }, + { + "comment": "This function initializes a 2D tensor with values drawn from the standard normal distribution, ensuring that at least a certain sparsity level is maintained. It uses QR factorization and scales the result by a given gain factor if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1196-1232", + "content": " gain: optional scaling factor\n Examples:\n >>> w = torch.empty(3, 5)\n >>> nn.init.orthogonal_(w)\n \"\"\"\n if tensor.ndimension() < 2:\n raise ValueError(\"Only tensors with 2 or more dimensions are supported\")\n rows = tensor.shape[0] # .size(0)\n cols = tensor.numel() // rows\n flattened = tensor.new(rows, cols).normal_(0, 1)\n if rows < cols:\n flattened.t_()\n # Compute the qr factorization\n q, r = paddle.to_tensor(np.linalg.qr(flattened.numpy()))\n # q, r = torch.qr(flattened)\n # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf\n d = paddle.diag(r, 0)\n ph = d.sign()\n q *= ph\n if rows < cols:\n q.t_()\n with paddle.no_grad():\n tensor.view_as(q).copy_(q)\n tensor.mul_(gain)\n return tensor\ndef sparse_(tensor, sparsity, std=0.01):\n r\"\"\"Fills the 2D input `Tensor` as a sparse matrix, where the\n non-zero elements will be drawn from the normal distribution\n :math:`\\mathcal{N}(0, 0.01)`, as described in `Deep learning via" + }, + { + "comment": "This code initializes a 2D torch.Tensor with a specified sparsity and standard deviation by setting some elements to zero while keeping others non-zero. It checks for tensor dimensions, normalizes values, assigns zeroes based on the input sparsity, and is compatible with both PyTorch and PaddlePaddle.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1233-1267", + "content": " Hessian-free optimization` - Martens, J. 
(2010).\n    Args:\n        tensor: an n-dimensional `torch.Tensor`\n        sparsity: The fraction of elements in each column to be set to zero\n        std: the standard deviation of the normal distribution used to generate\n            the non-zero values\n    Examples:\n        >>> w = torch.empty(3, 5)\n        >>> nn.init.sparse_(w, sparsity=0.1)\n    \"\"\"\n    if tensor.ndimension() != 2:\n        raise ValueError(\"Only tensors with 2 dimensions are supported\")\n    rows, cols = tensor.shape\n    num_zeros = int(math.ceil(sparsity * rows))\n    with paddle.no_grad():\n        tensor.normal_(0, std)\n        for col_idx in range(cols):\n            row_indices = paddle.randperm(rows)\n            zero_indices = row_indices[:num_zeros]\n            tensor[zero_indices, col_idx] = 0\n    return tensor\n# for backward compatibility\ndef _make_deprecate(meth):\n    new_name = meth.__name__\n    old_name = new_name[:-1]\n    def deprecated_init(*args, **kwargs):\n        warnings.warn(\n            \"nn.init.{} is now deprecated in favor of nn.init.{}.\".format(" + }, + { + "comment": "This code finishes the `_make_deprecate` helper, which wraps an initializer under its old (un-underscored) name and emits a deprecation warning pointing users to the new torch.nn.init-style name; the commented-out lines show the deprecated aliases that could be generated for each initializer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py\":1268-1294", + "content": "                old_name, new_name),\n            stacklevel=2)\n        return meth(*args, **kwargs)\n    deprecated_init.__doc__ = r\"\"\"\n    {old_name}(...)\n    .. warning::\n        This method is now deprecated in favor of :func:`torch.nn.init.{new_name}`.\n        See :func:`~torch.nn.init.{new_name}` for details.\"\"\".format(\n        old_name=old_name, new_name=new_name)\n    deprecated_init.__name__ = old_name\n    return deprecated_init\n# uniform = _make_deprecate(uniform_)\n# normal = _make_deprecate(normal_)\n# constant = _make_deprecate(constant_)\n# eye = _make_deprecate(eye_)\n# dirac = _make_deprecate(dirac_)\n# xavier_uniform = _make_deprecate(xavier_uniform_)\n# xavier_normal = _make_deprecate(xavier_normal_)\n# kaiming_uniform = _make_deprecate(kaiming_uniform_)\n# kaiming_normal = _make_deprecate(kaiming_normal_)\n# orthogonal = _make_deprecate(orthogonal_)\n# sparse = _make_deprecate(sparse_)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c725aee3-d95d-4fa6-851f-2cc08e6e39cd.json b/docs/doc/c725aee3-d95d-4fa6-851f-2cc08e6e39cd.json new file mode 100644 index 000000000..c850aed90 --- /dev/null +++ b/docs/doc/c725aee3-d95d-4fa6-851f-2cc08e6e39cd.json @@ -0,0 +1,75 @@ +{ + "summary": "This section documents the Linux setup for deploying PaddleVideo models with the C++ prediction demo, recommends Docker, and notes that Windows support is still in progress. It covers compiling OpenCV 3.4.7, obtaining or building the Paddle prediction library, setting the required paths, compiling the video inference code, and the model parameters/configuration options; errors may occur if libcudnn is missing or CUDNN_LIB_DIR is set incorrectly.", + "details": [ + { + "comment": "This section introduces the C++ deployment method of the PaddleVideo model, which offers better performance compared to Python. It provides instructions for setting up a Linux environment with a docker recommendation and mentions that Windows support is under development. 
Additionally, it requires installing extra dependencies like paddledet using pip.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":0-19", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](./readme.md)\n# Server-side C++ prediction\nThis chapter introduces the C++ deployment method of the PaddleVideo model. For the python prediction deployment method, please refer to the **Model Reasoning** chapter of the respective model.\nC++ is better than python in terms of performance calculation. Therefore, in most CPU and GPU deployment scenarios, C++ deployment methods are mostly used. This section will introduce how to configure the C++ environment in the Linux (CPU/GPU) environment and complete it.\nPaddleVideo model deployment.\nBefore getting started, you need to install additional dependencies as follows:\n```bash\npython -m pip install [paddledet](git+https://github.com/LDOUBLEV/AutoLog)\n```\n## 1. Prepare the environment\n- For Linux environment, docker is recommended.\n- Windows environment, currently supports compilation based on `Visual Studio 2019 Community` (TODO)\n* This document mainly introduces the PaddleVideo C++ prediction process based on the Linux environment. If yo" + }, + { + "comment": "This code provides instructions for compiling the OpenCV library and installing FFmpeg to enable normal video reading under Linux. It also mentions a Windows Compilation Tutorial that needs to be completed (TODO).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":19-36", + "content": "u need to perform C++ prediction based on the prediction library under Windows, please refer to [Windows Compilation Tutorial](./docs/windows_vs2019_build.md)(TODO) for the specific compilation method\n* **The purpose of preparing the environment is to get the compiled opencv library and paddle prediction library**.\n### 1.1 Compile opencv library\n* First, you need to download the compressed package compiled from the source code in the Linux environment from the opencv official website, and unzip it into a folder. Take opencv3.4.7 as an example, the download command is as follows:\n ```bash\n cd deploy/cpp_infer\n wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz\n tar -xf 3.4.7.tar.gz\n ```\n After decompression, you can get the decompressed folder of `opencv-3.4.7` in the `deploy/cpp_infer` directory.\n* Install ffmpeg\n Opencv and ffmpeg can read the video normally under linux, otherwise it may encounter the situation that the number of video frames returns to 0 or no video frame can be read" + }, + { + "comment": "This code installs necessary libraries for compiling OpenCV 3.4.7 on Linux, sets the source and installation paths, removes existing build folder, creates a new one, runs cmake to configure build options and libraries to use, and then proceeds with the compilation process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":38-75", + "content": " Using a relatively simple apt installation, the installation command is as follows:\n ```bash\n apt-get update\n apt install libavformat-dev\n apt install libavcodec-dev\n apt install libswresample-dev\n apt install libswscale-dev\n apt install libavutil-dev\n apt install libsdl1.2-dev\n apt-get install ffmpeg\n ```\n* To prepare to compile opencv, first enter the `opencv-3.4.7` folder, and then set the opencv source path `root_path` and the installation path `install_path`. 
The execution command is as follows:\n ```bash\n cd opencv-3.4.7\n root_path=$PWD # That is the absolute path of opencv-3.4.7\n install_path=${root_path}/opencv3\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DCMAKE_INSTALL_PREFIX=${install_path} \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DBUILD_SHARED_LIBS=OFF \\\n -DWITH_IPP=OFF \\\n -DBUILD_IPP_IW=OFF \\\n -DWITH_LAPACK=OFF \\\n -DWITH_EIGEN=OFF \\\n -DCMAKE_INSTALL_LIBDIR=lib64 \\\n -DWITH_ZLIB=ON \\\n -DBUILD_ZLIB=ON \\" + }, + { + "comment": "Configuring and installing OpenCV library with specified options and building the Video inference C++ code using it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":76-109", + "content": " -DWITH_JPEG=ON \\\n -DBUILD_JPEG=ON \\\n -DWITH_PNG=ON \\\n -DBUILD_PNG=ON \\\n -DWITH_TIFF=ON \\\n -DBUILD_TIFF=ON \\\n -DWITH_FFMPEG=ON\n make -j\n make install\n ```\n After the completion of `make install`, opencv header files and library files will be generated in this folder, which will be used to compile the Video inference C++ code later.\n Finally, the installation path `install_path` will be used as the specified path, and a folder of `opencv3` will be obtained. The file structure is shown below.\n ```shell\n opencv-3.4.7/\n \u251c\u2500\u2500 opencv3/\n \u2502 \u251c\u2500\u2500 bin/\n \u2502 \u251c\u2500\u2500 include/\n \u2502 \u251c\u2500\u2500 lib/\n \u2502 \u251c\u2500\u2500 lib64/\n \u2502 \u2514\u2500\u2500 share/\n ```\n### 1.2 Download or compile Paddle prediction library\nThere are two ways to obtain the Paddle prediction library, which will be described in detail below.\n#### 1.2.1 Download and install directly\n* [Paddle prediction library official website](https://paddleinference.paddlepaddle.org.cn/v2.2/user_guides/download_li" + }, + { + "comment": "This code provides instructions for downloading and unzipping the prediction library, or compiling it from source code if you want the latest features. It specifies the appropriate version selection on the official website (paddle version>=2.0.1, 2.2.2 recommended) and the required environment (gcc8.2). The wget command downloads the tgz package, tar extracts it into a subfolder of paddle_inference in the current folder. 
Alternatively, cloning the latest code from Paddle GitHub and compiling from source can be done for accessing the latest prediction library features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":109-122", + "content": "b.html) provides different cuda versions of Linux prediction libraries, you can Check and **select the appropriate prediction library version** on the official website (it is recommended to select the prediction library with paddle version>=2.0.1, and the prediction library of 2.2.2 is recommended).\n* Download and get a `paddle_inference.tgz` compressed package, and then unzip it into a folder, the command is as follows (taking the machine environment as gcc8.2 as an example):\n ```bash\n wget https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz\n tar -xf paddle_inference.tgz\n ```\n Eventually, a subfolder of `paddle_inference/` will be generated in the current folder.\n#### 1.2.2 Prediction library source code compilation\n* If you want to get the latest prediction library features, you can clone the latest code from Paddle github and compile the prediction library from source code.\n* You can refer t" + }, + { + "comment": "This code provides the installation and compilation instructions for Paddle prediction library. The steps involve cloning the Paddle repository, checking out a specific release branch, configuring and building the project with CMake, and finally generating the prediction library by making and making inference_lib_dist. This process is done to ensure that users can obtain the latest and most optimized version of the prediction library for their needs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":122-149", + "content": "o [Paddle prediction library installation and compilation instructions](https://paddleinference.paddlepaddle.org.cn/user_guides/source_compile.html) instructions from github Obtain the Paddle code, and then compile it to generate the latest prediction library. The method of using git to get the code is as follows.\n ```shell\n git clone https://github.com/PaddlePaddle/Paddle.git\n git checkout release/2.2\n ```\n* After entering the Paddle directory, the compilation method is as follows.\n ```shell\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DWITH_CONTRIB=OFF \\\n -DWITH_MKL=ON \\\n -DWITH_MKLDNN=ON \\\n -DWITH_TESTING=OFF \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DWITH_INFERENCE_API_TEST=OFF \\\n -DON_INFER=ON \\\n -DWITH_PYTHON=ON\n make -j\n make inference_lib_dist -j4 # 4\u4e3a\u7f16\u8bd1\u65f6\u4f7f\u7528\u6838\u6570\uff0c\u53ef\u6839\u636e\u673a\u5668\u60c5\u51b5\u81ea\u884c\u4fee\u6539\n ```\n You can refer to [documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/b" + }, + { + "comment": "Step 1: The code describes the generation of several files and folders after a successful compilation process. 
These include `CMakeCache.txt`, `paddle/`, `third_party/`, and `version.txt`.\n\nStep 2: Explains that among these, `paddle` is the C++ library required for prediction, while `version.txt` contains version information of the current prediction library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":149-172", + "content": "uild_and_install_lib_cn.html#congyuanmabianyi) for more introduction of compilation parameter options.\n* After the compilation is complete, you can see the following files and folders are generated under the file `build/paddle_inference_install_dir/`.\n ```\n build/\n \u2514\u2500\u2500 paddle_inference_install_dir/\n \u251c\u2500\u2500 CMakeCache.txt\n \u251c\u2500\u2500 paddle/\n \u251c\u2500\u2500 third_party/\n \u2514\u2500\u2500 version.txt\n ```\n Among them, `paddle` is the Paddle library required for C++ prediction, and `version.txt` contains the version information of the current prediction library.\n## 2. Compile and run the prediction demo\n### 2.1 Export the model as an inference model\n* This step is the same as the export prediction model under the python deployment mode. You can refer to the model prediction chapter of the respective model. Several related inference model files exported are used for model prediction. **Taking PP-TSM as an example**, the directory structure of the derived prediction model is as follows.\n ```\n inference/" + }, + { + "comment": "This code snippet provides instructions for compiling the PaddleVideo C++ prediction demo. First, navigate to the `deploy/cpp_infer` directory. Then, execute the compile command `bash tools/build.sh`. Modify environment paths in `tools/build.sh`, such as OPENCV_DIR, LIB_DIR, CUDA_LIB_DIR, and CUDNN_LIB_DIR to match your system's configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":173-202", + "content": " \u2514\u2500\u2500 ppTSM/\n \u251c\u2500\u2500 ppTSM.pdiparams\n \u251c\u2500\u2500 ppTSM.pdiparamsinfo\n \u2514\u2500\u2500 ppTSM.pdmodel\n ```\n### 2.2 Compile PaddleVideo C++ prediction demo\n* Enter the `deploy/cpp_infer` directory and execute the following compile command\n ```shell\n bash tools/build.sh\n ```\n The addresses of the Paddle C++ prediction library, opencv and other dependent libraries in `tools/build.sh` need to be replaced with the actual addresses on your own machine.\n* Specifically, you need to modify the environment path in `tools/build.sh`, the relevant content is as follows:\n ```shell\n OPENCV_DIR=your_opencv_dir\n LIB_DIR=your_paddle_inference_dir\n CUDA_LIB_DIR=/usr/local/cuda/lib64\n CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/\n ```\n The above parameters are as follows (the following path users can modify according to their own machine conditions)\n `OPENCV_DIR` is the address where opencv is compiled and installed\n `LIB_DIR` is the download (`paddle_inference` folder) or the generated Paddle prediction library address (`build/paddle_inference_install_dir` folder)" + }, + { + "comment": "To enable TensorRT acceleration during prediction, modify the code in `tools/build.sh` by setting `DWITH_GPU=ON`, `DWITH_TENSORRT=ON`, and providing the absolute path to TensorRT using `TENSORRT_DIR`. 
This allows for GPU-accelerated predictions with PaddleVideo's C++ implementation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":203-230", + "content": " `CUDA_LIB_DIR` is the address of the cuda library file, which is `/usr/local/cuda/lib64` in docker\n `CUDNN_LIB_DIR` is the cudnn library file address, which is `/usr/lib/x86_64-linux-gnu/` in docker.\n **If you want to enable TensorRT acceleration during prediction, you need to modify the code at `tools/build.sh`3**\n 1. Set `DWITH_GPU=ON`\n 2. Set `DWITH_TENSORRT=ON`\n 3. Set `TENSORRT_DIR=/path/to/TensorRT-x.x.x.x`\n **The above paths are all absolute paths, do not use relative paths**\n* After the compilation is complete, an executable file named `ppvideo` will be generated in the `cpp_infer/build` folder.\n### 2.3 Run PaddleVideo C++ prediction demo\nOperation mode:\n```bash\n./build/ppvideo [--param1] [--param2] [...]\n```\nAmong them, `mode` is a required parameter, which means the selected function, and the value range is ['rec'], which means **video recognition** (more functions will be added in succession).\n##### 1. Call video recognition:\n```bash\n# run PP-TSM inference\n./build/ppvideo rec \\" + }, + { + "comment": "This code sets the model directory, inference model name, video directory, number of segments, and segment length for PaddleVideo's PP-TSM and PP-TSN inference. Additional parameters include use_gpu, gpu_id, gpu_mem, cpu_threads, enable_mkldnn, use_tensorrt, and precision for customizing the inference process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":231-257", + "content": "--rec_model_dir=../../inference/ppTSM \\\n--inference_model_name=ppTSM \\\n--video_dir=./example_video_dir \\\n--num_seg=8 \\\n--seg_len=1\n# run PP-TSN inference\n./build/ppvideo rec \\\n--rec_model_dir=../../inference/ppTSN \\\n--inference_model_name=ppTSN \\\n--video_dir=./example_video_dir \\\n--num_seg=25 \\\n--seg_len=1\n```\nMore parameters are as follows:\n- General parameters\n | Parameter name | Type | Default parameter | Meaning |\n | ------------- | ---- | --------------- | ------------------------------------------------------------ |\n | use_gpu | bool | false | Whether to use GPU |\n | gpu_id | int | 0 | GPU id, valid when using GPU |\n | gpu_mem | int | 4000 | GPU memory requested |\n | cpu_threads | int | 10 | The number of threads for CPU prediction. When the number of machine cores is sufficient, the larger the value, the faster the prediction speed |\n | enable_mkldnn | bool | false | Whether to use mkldnn library |\n | use_tensorrt | bool | false | Whether to use the tensorrt library |\n | precision | str | \"fp32\" | Use fp32/fp16/uint8 precision to predict |" + }, + { + "comment": "This code provides configuration parameters for video recognition models. The `video_dir` specifies the folder path containing the video to be recognized, while `rec_model_dir` points to the exported prediction model's location. The `inference_model_name` refers to the name of the model used in prediction. `num_seg` and `seg_len` determine the number of video segments and frames per segment respectively. 
`rec_batch_num` indicates the batch size during model prediction, and `char_list_file` stores category labels and names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":258-270", + "content": " | benchmark | bool | true | Whether to enable benchmark during prediction, after enabling it, the configuration, model, time-consuming and other information will be output at the end. |\n- Video recognition model related\n | Parameter name | Type | Default parameter | Meaning |\n | -------------- | ------ | --------------------------------------------- | ------------------------------------ |\n | video_dir | string | \"../example_video_dir\" | The path of the folder where the video to be recognized is stored |\n | rec_model_dir | string | \"\" | The folder path where the exported prediction model is stored |\n | inference_model_name | string | \"ppTSM\" | The name of the model used in the prediction |\n | num_seg | int | 8 | Number of video segments |\n | seg_len | int | 1 | The number of frames extracted in each segment of the video |\n | rec_batch_num | int | 1 | Batch size during model prediction |\n | char_list_file | str | \"../../data/k400/Kinetics-400_label_list.txt\" | The text path for storing all category labels and corresponding names |" + }, + { + "comment": "This code snippet demonstrates the output of the inference process for a sample video. It shows the input video, its class and score. Additionally, it provides information about the runtime device, IR optimization, memory optimization, whether TensorRT is enabled or not, the number of CPU math library threads, and data information such as batch size and input shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":272-288", + "content": "\u200b\tTake the sample video `example01.avi` under example_video_dir as the input video as an example, the final \tscreen will output the detection results as follows.\n```bash\n[./inference/ppTSM]\n[./deploy/cpp_infer/example_video_dir]\ntotal videos num: 1\n./example_video_dir/example01.avi class: 5 archery score: 0.999556\nI1125 08:10:45.834288 13955 autolog.h:50] ----------------------- Config info -----------------------\nI1125 08:10:45.834458 13955 autolog.h:51] runtime_device: cpu\nI1125 08:10:45.834467 13955 autolog.h:52] ir_optim: True\nI1125 08:10:45.834475 13955 autolog.h:53] enable_memory_optim: True\nI1125 08:10:45.834483 13955 autolog.h:54] enable_tensorrt: 0\nI1125 08:10:45.834518 13955 autolog.h:55] enable_mkldnn: False\nI1125 08:10:45.834525 13955 autolog.h:56] cpu_math_library_num_threads: 10\nI1125 08:10:45.834532 13955 autolog.h:57] ----------------------- Data info -----------------------\nI1125 08:10:45.834540 13955 autolog.h:58] batch_size: 1\nI1125 08:10:45.834547 13955 autolog.h:59] input_shape: dynamic" + }, + { + "comment": "The code is displaying information about the model used for inference. It mentions the model name, precision type, and total time spent on inference. Additionally, it provides a breakdown of preprocessing, inference, and post-processing times. 
The error message indicates that the CUDA Deep Neural Network library (libcudnn) is missing or not found during compilation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":289-307", + "content": "I1125 08:10:45.834556 13955 autolog.h:60] data_num: 1\nI1125 08:10:45.834564 13955 autolog.h:61] ----------------------- Model info -----------------------\nI1125 08:10:45.834573 13955 autolog.h:62] model_name: rec\nI1125 08:10:45.834579 13955 autolog.h:63] precision: fp32\nI1125 08:10:45.834586 13955 autolog.h:64] ----------------------- Perf info ------------------------\nI1125 08:10:45.834594 13955 autolog.h:65] Total time spent(ms): 2739\nI1125 08:10:45.834602 13955 autolog.h:67] preprocess_time(ms): 10.6524, inference_time(ms): 1269.55, postprocess_time(ms): 0.009118\n```\n### 3 FAQ\n1. The following error occurred during the compilation of the demo\n ```shell\n make[2]: *** No rule to make target '/usr/lib/x86_64-linux-gn/libcudnn.so', needed by 'ppvideo'. Stop.\n make[2]: *** Waiting for unfinished jobs....\n [ 16%] Building CXX object CMakeFiles/ppvideo.dir/src/main.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/preprocess_op.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/postprocess_op.cpp.o" + }, + { + "comment": "Error: CMakeFiles/ppvideo.dir/all and all target failed with Error 2 due to missing libcudnn.so, possibly caused by incorrect CUDNN_LIB_DIR setting.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme_en.md\":308-315", + "content": " [83%] Building CXX object CMakeFiles/ppvideo.dir/src/utility.cpp.o\n [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/video_rec.cpp.o\n CMakeFiles/Makefile2:95: recipe for target 'CMakeFiles/ppvideo.dir/all' failed\n make[1]: *** [CMakeFiles/ppvideo.dir/all] Error 2\n Makefile:83: recipe for target 'all' failed\n make: *** [all] Error 2\n ````\n It may be that `CUDNN_LIB_DIR` is set incorrectly, resulting in that `libcudnn.so` in this directory cannot be found." + } + ] +} \ No newline at end of file diff --git a/docs/doc/c774b682-2827-4da6-8133-61dbceffd840.json b/docs/doc/c774b682-2827-4da6-8133-61dbceffd840.json new file mode 100644 index 000000000..c569e4fe8 --- /dev/null +++ b/docs/doc/c774b682-2827-4da6-8133-61dbceffd840.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines the InferModel class for inference using a pre-trained PPTSM model, taking config files and performing inference on image data from a specified path, predicting football actions and printing output shape and time taken.", + "details": [ + { + "comment": "The code defines a class called InferModel that initializes and prepares the PPTSM model for inference. It takes a configuration file as input, which includes details such as model files, parameter files, GPU memory, and device ID. The code sets up configurations to optimize GPU memory usage and enable zero-copy operations. 
It then creates a predictor object with these configurations and retrieves the input tensor handle.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/pptsm_infer.py\":0-37", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"pptsm infer\"\"\"\n def __init__(self, cfg, name='PPTSM'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])" + }, + { + "comment": "The code defines a class that performs inference using a pre-trained model. It gets the output names and handles from the predictor, runs inference on input data, and returns the output. The main function reads a configuration file and creates an instance of the InferModel class to perform inference based on the specified config.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/pptsm_infer.py\":39-66", + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[1])\n #self.output_tensor = self.predictor.get_output_handle(output_names[0])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[:-1] for items in data]\n inputs = np.array(inputs)\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n feature_list = np.vstack(feature_list)\n return feature_list\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)" + }, + { + "comment": "This code loads image data from a specific path, uses the model to predict action for each frame, and prints the output shape and time taken for inference. 
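Condensing the InferModel initialization and zero-copy call path described above into a single sketch (the paddle.inference calls mirror the quoted source; the file names, GPU memory figure, and input shape are placeholders, not the repository's configuration):

```python
import numpy as np
from paddle.inference import Config, create_predictor

# Placeholder paths: substitute the exported PP-TSM model and params files.
config = Config("ppTSM.pdmodel", "ppTSM.pdiparams")
config.enable_use_gpu(2000, 0)            # GPU memory pool (MB), device id
config.switch_ir_optim(True)              # graph-level IR optimizations
config.enable_memory_optim()
config.switch_use_feed_fetch_ops(False)   # required for zero-copy tensors

predictor = create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

# Dummy clip batch; the real shape comes from the reader configuration.
clip = np.random.rand(1, 8, 3, 224, 224).astype(np.float32)
input_handle.copy_from_cpu(clip)
predictor.run()
print(output_handle.copy_to_cpu().shape)
```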
It seems to be part of an application related to FootballAction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/models/pptsm_infer.py\":68-77", + "content": " imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/'\n imgs_list = get_images(imgs_path)\n t0 = time.time()\n cfg['PPTSM']['frame_list'] = imgs_list\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c78113cf-3fe3-4d28-a49a-7d5f79595a37.json b/docs/doc/c78113cf-3fe3-4d28-a49a-7d5f79595a37.json new file mode 100644 index 000000000..fb3dad557 --- /dev/null +++ b/docs/doc/c78113cf-3fe3-4d28-a49a-7d5f79595a37.json @@ -0,0 +1,30 @@ +{ + "summary": "The PaddleVideo library's VideoDecoder class decodes mp4 files into frames, handles RGB frames and audio, and provides data with masks. It includes functions for decoding, dequantizing feature vectors, and making one-hot labels.", + "details": [ + { + "comment": "This code is for a VideoDecoder class in the PaddleVideo library. It decodes mp4 files into frames as part of a pipeline. The class takes a file path argument and performs mp4 decode operations using the __call__ method, which processes results returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py\":0-41", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport sys\nfrom io import BytesIO\nimport os\nimport random\nimport numpy as np\nimport pickle\nimport cv2\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass VideoDecoder(object):\n \"\"\"\n Decode mp4 file to frames.\n Args:\n filepath: the file path of mp4 file\n \"\"\"\n def __init__(self):\n pass\n def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations.\n return:" + }, + { + "comment": "This code defines three classes for decoding different types of data: video, frames, and features. The VideoDecoder reads a video file frame by frame, the FrameDecoder parses results as individual frames, and the FeatureDecoder handles feature decode operations like YouTube8M. 
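A stripped-down sketch of the VideoDecoder loop described here, using only OpenCV; the file path is a placeholder:

```python
import cv2

def decode_frames(filepath):
    """Read every frame of an mp4 and return RGB arrays, mirroring VideoDecoder."""
    cap = cv2.VideoCapture(filepath)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    for _ in range(frame_count):
        ret, frame = cap.read()
        if not ret:                      # some containers report unreadable frames
            continue
        frames.append(frame[:, :, ::-1])  # BGR -> RGB, as in the pipeline
    cap.release()
    return {"frames": frames, "frames_len": len(frames), "format": "video"}

# results = decode_frames("example.mp4")  # placeholder path
```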
The results are stored in 'frames', 'frames_len', and 'format' fields respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py\":42-79", + "content": " List where each item is a numpy array after decoder.\n \"\"\"\n #XXX get info from results!!!\n file_path = results['filename']\n cap = cv2.VideoCapture(file_path)\n videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n sampledFrames = []\n for i in range(videolen):\n ret, frame = cap.read()\n # maybe first frame is empty\n if ret == False:\n continue\n img = frame[:, :, ::-1]\n sampledFrames.append(img)\n results['frames'] = sampledFrames\n results['frames_len'] = len(sampledFrames)\n results['format'] = 'video'\n return results\n@PIPELINES.register()\nclass FrameDecoder(object):\n \"\"\"just parse results\n \"\"\"\n def __init__(self):\n pass\n def __call__(self, results):\n results['format'] = 'frame'\n return results\n@PIPELINES.register()\nclass FeatureDecoder(object):\n \"\"\"\n Perform feature decode operations.e.g.youtube8m\n \"\"\"\n def __init__(self, num_classes, max_len=512, has_label=True):" + }, + { + "comment": "This code is part of a decoding pipeline that loads and preprocesses data from a .pkl file. It extracts RGB frames, audio, and labels (if available), performs dequantization, and applies padding as needed. The results are returned as numpy arrays for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py\":80-112", + "content": " self.max_len = max_len\n self.num_classes = num_classes\n self.has_label = has_label\n def __call__(self, results):\n \"\"\"\n Perform feature decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n #1. load pkl\n #2. parse to rgb/audio/\n #3. padding\n filepath = results['filename']\n data = pickle.load(open(filepath, 'rb'), encoding='bytes')\n record = data\n nframes = record[b'nframes']\n rgb = record[b'feature'].astype(float)\n audio = record[b'audio'].astype(float)\n if self.has_label:\n label = record[b'label']\n one_hot_label = self.make_one_hot(label, self.num_classes)\n rgb = rgb[0:nframes, :]\n audio = audio[0:nframes, :]\n rgb = self.dequantize(rgb,\n max_quantized_value=2.,\n min_quantized_value=-2.)\n audio = self.dequantize(audio,\n max_quantized_value=2," + }, + { + "comment": "The code snippet initializes a list of feature paddings, lengths, and masks for both rgb and audio data. It iterates through the two types of data (rgb and audio) to populate the results dictionary with information about each type of data, including its length and padded feature data along with their respective masks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py\":113-138", + "content": " min_quantized_value=-2)\n if self.has_label:\n results['labels'] = one_hot_label.astype(\"float32\")\n feat_pad_list = []\n feat_len_list = []\n mask_list = []\n vitem = [rgb, audio]\n for vi in range(2): #rgb and audio\n if vi == 0:\n prefix = \"rgb_\"\n else:\n prefix = \"audio_\"\n feat = vitem[vi]\n results[prefix + 'len'] = feat.shape[0]\n #feat pad step 1. padding\n feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),\n dtype=np.float32)\n feat_pad = np.concatenate((feat, feat_add), axis=0)\n results[prefix + 'data'] = feat_pad.astype(\"float32\")\n #feat pad step 2. 
mask\n feat_mask_origin = np.ones(feat.shape, dtype=np.float32)\n feat_mask_add = feat_add\n feat_mask = np.concatenate((feat_mask_origin, feat_mask_add),\n axis=0)" + }, + { + "comment": "The code contains functions for decoding, dequantizing feature vectors, and making one-hot labels. The decode function stores the feature mask in a dictionary, the dequantize function scales and translates the quantized values back to float format, and the make_one_hot function creates one-hot encoded labels from given indices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py\":139-164", + "content": " results[prefix + 'mask'] = feat_mask.astype(\"float32\")\n return results\n def dequantize(self,\n feat_vector,\n max_quantized_value=2.,\n min_quantized_value=-2.):\n \"\"\"\n Dequantize the feature from the byte format to the float format\n \"\"\"\n assert max_quantized_value > min_quantized_value\n quantized_range = max_quantized_value - min_quantized_value\n scalar = quantized_range / 255.0\n bias = (quantized_range / 512.0) + min_quantized_value\n return feat_vector * scalar + bias\n def make_one_hot(self, label, dim=3862):\n \"\"\"make one hot\"\"\"\n one_hot_label = np.zeros(dim)\n one_hot_label = one_hot_label.astype(float)\n for ind in label:\n one_hot_label[int(ind)] = 1\n return one_hot_label" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c7831c49-96f7-4f4b-9cd7-6917facb27c9.json b/docs/doc/c7831c49-96f7-4f4b-9cd7-6917facb27c9.json new file mode 100644 index 000000000..ff1d18d77 --- /dev/null +++ b/docs/doc/c7831c49-96f7-4f4b-9cd7-6917facb27c9.json @@ -0,0 +1,20 @@ +{ + "summary": "The code creates a base class for model recognizers in PaddleVideo. It initializes the backbone and head modules' weights, extracts features using the backbone, and performs optional classification. The class provides abstract methods for training, validating, and testing steps to be implemented by subclasses.", + "details": [ + { + "comment": "Base class for recognizers, subclass for train_step, valid_step, test_step. Initializes backbone and head modules with weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py\":0-37", + "content": "\"\"\"\nstart\n\"\"\"\nfrom abc import abstractmethod\nfrom ... import builder\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nclass BaseRecognizer(nn.Layer):\n \"\"\"Base class for recognizers.\n All recognizers should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature.\n \"\"\"\n def __init__(self, backbone=None, head=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n self.head.init_weights()" + }, + { + "comment": "The code defines a base class for model recognizers. It initializes the weights of both backbone and head, extracts features using the backbone, and optionally performs classification using the head if it exists. 
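In the forward pass summarized here, the segment dimension is folded into the batch before the backbone runs; a small paddle sketch of that step with illustrative shapes:

```python
import paddle

imgs = paddle.rand([4, 8, 3, 224, 224])                 # (N, num_segs, C, H, W)
num_segs = imgs.shape[1]
flat = paddle.reshape(imgs, [-1] + list(imgs.shape[2:]))
print(flat.shape)                                        # [32, 3, 224, 224]
# The head later uses num_segs to regroup per-segment scores into per-clip scores.
```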
The method also handles reshaping inputs when necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py\":38-74", + "content": " else:\n self.head = None\n def init_weights(self):\n \"\"\"Initialize the model network weights. \"\"\"\n self.backbone.init_weights(\n ) #TODO: required? while backbone without base class\n self.head.init_weights()\n def extract_feature(self, imgs):\n \"\"\"Extract features through a backbone.\n Args:\n imgs (paddle.Tensor) : The input images.\n Returns:\n feature (paddle.Tensor) : The extracted features.\n \"\"\"\n feature = self.backbone(imgs)\n return feature\n def forward(self, imgs, **kwargs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n batches = imgs.shape[0]\n num_segs = imgs.shape[1]\n imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:]))\n if self.backbone is not None:\n feature = self.extract_feature(imgs)\n else:\n feature = imgs\n if self.head is not None:\n cls_score = self.head(feature, num_segs)\n else:" + }, + { + "comment": "This code defines a base class for recognizer models in PaddleVideo. It provides abstract methods for training, validating, and testing steps, which must be implemented by any subclasses that inherit from this base class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py\":75-96", + "content": " cls_score = None\n return cls_score\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c7cc8d2b-c7d1-425d-9153-51d6e9cad3cd.json b/docs/doc/c7cc8d2b-c7d1-425d-9153-51d6e9cad3cd.json new file mode 100644 index 000000000..79421ccc9 --- /dev/null +++ b/docs/doc/c7cc8d2b-c7d1-425d-9153-51d6e9cad3cd.json @@ -0,0 +1,55 @@ +{ + "summary": "The MultigridSchedule class manages multigrid training schedules, batch sizes, sampling rates, and long cycle updates. The update_long_cycle() function adjusts these parameters based on the epoch in PaddleVideo. It also calculates final learning rate schedules and provides a function for determining long cycle base shape.", + "details": [ + { + "comment": "This code defines a MultigridSchedule class for multigrid training schedule and updates cfg according to multigrid settings. The init_multigrid function takes in configs (cfg) as input, updates it based on multigrid settings, and returns the updated cfg. 
It stores original values of batch size, temporal size, and crop size in cfg's MULTIGRID subsection as global variables for later use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":0-24", + "content": "\"\"\"Functions for multigrid training.\"\"\"\nimport numpy as np\nclass MultigridSchedule(object):\n \"\"\"\n This class defines multigrid training schedule and update cfg accordingly.\n \"\"\"\n def init_multigrid(self, cfg):\n \"\"\"\n Update cfg based on multigrid settings.\n Args:\n cfg (configs): configs that contains training and multigrid specific\n hyperparameters.\n Returns:\n cfg (configs): the updated cfg.\n \"\"\"\n self.schedule = None\n # We may modify cfg.DATASET.batch_size, cfg.PIPELINE.train.decode_sampler.num_frames, and\n # cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] during training, so we store their original\n # value in cfg and use them as global variables.\n cfg.MULTIGRID.default_batch_size = cfg.DATASET.batch_size # total bs,64\n cfg.MULTIGRID.default_temporal_size = cfg.PIPELINE.train.decode_sampler.num_frames # 32\n cfg.MULTIGRID.default_crop_size = cfg.PIPELINE.train.transform[1][" + }, + { + "comment": "The code initializes the multi-grid training schedule for the given configuration (cfg). If a long cycle is enabled, it sets learning rate steps and adjusts them for fine-tuning. It also updates the maximum epoch count based on the schedule.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":25-49", + "content": " 'MultiCrop']['target_size'] # 224\n if cfg.MULTIGRID.LONG_CYCLE:\n self.schedule = self.get_long_cycle_schedule(cfg)\n cfg.OPTIMIZER.learning_rate.steps = [0] + [\n s[-1] for s in self.schedule\n ]\n # Fine-tuning phase.\n cfg.OPTIMIZER.learning_rate.steps[-1] = (\n cfg.OPTIMIZER.learning_rate.steps[-2] +\n cfg.OPTIMIZER.learning_rate.steps[-1]) // 2\n cfg.OPTIMIZER.learning_rate.lrs = [\n cfg.OPTIMIZER.learning_rate.gamma**s[0] * s[1][0]\n for s in self.schedule\n ]\n # Fine-tuning phase.\n cfg.OPTIMIZER.learning_rate.lrs = cfg.OPTIMIZER.learning_rate.lrs[:-1] + [\n cfg.OPTIMIZER.learning_rate.lrs[-2],\n cfg.OPTIMIZER.learning_rate.lrs[-1],\n ]\n cfg.OPTIMIZER.learning_rate.max_epoch = self.schedule[-1][-1]\n elif cfg.MULTIGRID.SHORT_CYCLE:\n cfg.OPTIMIZER.learning_rate.steps = [" + }, + { + "comment": "This function, update_long_cycle(), checks if the long cycle shape should change before every epoch. If it should, it updates cfg accordingly. It takes in configs (cfg) and current epoch index (cur_epoch), and returns the updated cfg and a boolean indicating whether the long cycle shape changed. The function also retrieves the base_b, base_t, and base_s using get_current_long_cycle_shape(). If these values differ from the target size or number of frames in the cfg, it implies that the long cycle shape should change.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":50-73", + "content": " int(s * cfg.MULTIGRID.epoch_factor)\n for s in cfg.OPTIMIZER.learning_rate.steps\n ]\n cfg.OPTIMIZER.learning_rate.max_epoch = int(\n cfg.OPTIMIZER.learning_rate.max_epoch *\n cfg.OPTIMIZER.learning_rate.max_epoch)\n return cfg\n def update_long_cycle(self, cfg, cur_epoch):\n \"\"\"\n Before every epoch, check if long cycle shape should change. 
If it\n should, update cfg accordingly.\n Args:\n cfg (configs): configs that contains training and multigrid specific\n hyperparameters.\n cur_epoch (int): current epoch index.\n Returns:\n cfg (configs): the updated cfg.\n changed (bool): whether to change long cycle shape at this epoch\n \"\"\"\n base_b, base_t, base_s = get_current_long_cycle_shape(\n self.schedule, cur_epoch)\n if base_s != cfg.PIPELINE.train.transform[1]['MultiCrop'][\n 'target_size'] or base_t != cfg.PIPELINE.train.decode_sampler.num_frames:" + }, + { + "comment": "This code sets the number of frames and crop size for the head and transform, adjusts batch size based on multigrid configuration, determines whether to use \"batchnorm\" or \"sub_batchnorm\", and sets the long cycle sampling rate. The output is a message stating if long cycle updates are enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":74-93", + "content": " #NOTE Modify\n # no need to modify, used by pool_size in head, None when multigrid\n # cfg.MODEL.head.num_frames = base_t\n # cfg.MODEL.head.crop_size = base_s\n cfg.PIPELINE.train.decode_sampler.num_frames = base_t\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] = base_s\n cfg.DATASET.batch_size = base_b * cfg.MULTIGRID.default_batch_size #change bs\n bs_factor = (float(cfg.DATASET.batch_size) /\n cfg.MULTIGRID.bn_base_size)\n if bs_factor == 1: #single bs == bn_base_size (== 8)\n cfg.MODEL.backbone.bn_norm_type = \"batchnorm\"\n else:\n cfg.MODEL.backbone.bn_norm_type = \"sub_batchnorm\"\n cfg.MODEL.backbone.bn_num_splits = int(bs_factor)\n cfg.MULTIGRID.long_cycle_sampling_rate = cfg.PIPELINE.train.decode_sampler.sampling_rate * (\n cfg.MULTIGRID.default_temporal_size // base_t)\n print(\"Long cycle updates:\")" + }, + { + "comment": "The code is a function that checks the configuration for certain parameters related to multigrid training. It prints specific values and returns two values: a boolean indicating if the long cycle schedule should be used, and the original config unchanged.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":94-114", + "content": " print(\"\\tbn_norm_type: {}\".format(cfg.MODEL.backbone.bn_norm_type))\n if cfg.MODEL.backbone.bn_norm_type == \"sub_batchnorm\":\n print(\"\\tbn_num_splits: {}\".format(\n cfg.MODEL.backbone.bn_num_splits))\n print(\"\\tTRAIN.batch_size[single card]: {}\".format(\n cfg.DATASET.batch_size))\n print(\"\\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}\".format(\n cfg.PIPELINE.train.decode_sampler.num_frames,\n cfg.MULTIGRID.long_cycle_sampling_rate))\n print(\"\\tDATA.train_crop_size: {}\".format(\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']))\n return cfg, True\n else:\n return cfg, False\n def get_long_cycle_schedule(self, cfg):\n \"\"\"\n Based on multigrid hyperparameters, define the schedule of a long cycle.\n Args:\n cfg (configs): configs that contains training and multigrid specific\n hyperparameters." + }, + { + "comment": "This code calculates the schedule for multi-grid training, iterating over long cycle factor pairs in `cfg.MULTIGRID.long_cycle_factors`. It determines base shapes for each cycle, calculating `base_t` based on `cfg.PIPELINE.train.decode_sampler.num_frames` and `t_factor`, and `base_s` based on target size from `cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']` and `s_factor`. 
It also considers short cycle training flag, `cfg.MULTIGRID.SHORT_CYCLE`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":115-140", + "content": " Returns:\n schedule (list): Specifies a list long cycle base shapes and their\n corresponding training epochs.\n \"\"\"\n steps = cfg.OPTIMIZER.learning_rate.steps\n default_size = float(\n cfg.PIPELINE.train.decode_sampler.num_frames *\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']**\n 2) # 32 * 224 * 224 C*H*W\n default_iters = steps[-1] # 196\n # Get shapes and average batch size for each long cycle shape.\n avg_bs = []\n all_shapes = []\n # for t_factor, s_factor in cfg.MULTIGRID.long_cycle_factors:\n for item in cfg.MULTIGRID.long_cycle_factors:\n t_factor, s_factor = item[\"value\"]\n base_t = int(\n round(cfg.PIPELINE.train.decode_sampler.num_frames * t_factor))\n base_s = int(\n round(\n cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']\n * s_factor))\n if cfg.MULTIGRID.SHORT_CYCLE:" + }, + { + "comment": "This code defines the multigrid training schedule for PaddleVideo. It sets the shapes for different grid levels, converts them to batch sizes, and calculates the average batch size. The code then computes the total number of iterations and generates the multigrid training schedule based on the steps provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":141-168", + "content": " shapes = [\n [\n base_t,\n cfg.MULTIGRID.default_crop_size *\n cfg.MULTIGRID.short_cycle_factors[0],\n ],\n [\n base_t,\n cfg.MULTIGRID.default_crop_size *\n cfg.MULTIGRID.short_cycle_factors[1],\n ],\n [base_t, base_s],\n ] #first two is short_cycle, last is the base long_cycle\n else:\n shapes = [[base_t, base_s]]\n # (T, S) -> (B, T, S)\n shapes = [[\n int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]\n ] for s in shapes]\n avg_bs.append(np.mean([s[0] for s in shapes]))\n all_shapes.append(shapes)\n # Get schedule regardless of cfg.MULTIGRID.epoch_factor.\n total_iters = 0\n schedule = []\n for step_index in range(len(steps) - 1):\n step_epochs = steps[step_index + 1] - steps[step_index]" + }, + { + "comment": "This code calculates the number of iterations for each sequence based on average batch sizes, and then appends the schedule with corresponding step index, shape, and epochs. 
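To make the (T, S) -> (B, T, S) conversion concrete, here is the arithmetic with the quoted defaults (32 frames, 224 crop) and a hypothetical factor pair; the actual factors come from `cfg.MULTIGRID.long_cycle_factors`:

```python
default_t, default_s = 32, 224                       # quoted defaults
default_size = default_t * default_s ** 2            # 32 * 224 * 224 = 1,605,632

t_factor, s_factor = 0.5, 0.5                        # hypothetical long-cycle factors
base_t = int(round(default_t * t_factor))            # 16 frames
base_s = int(round(default_s * s_factor))            # 112-pixel crop
bs_factor = int(round(default_size / (base_t * base_s ** 2)))
print(base_t, base_s, bs_factor)                     # 16 112 8
```

The relative batch size grows by exactly as much as the per-sample cost T*S^2 shrinks, so each long-cycle shape stays close to the default compute budget.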
It also ensures that the fine-tuning phase has the same number of iterations as the rest of the training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":170-190", + "content": " for long_cycle_index, shapes in enumerate(all_shapes):\n #ensure each of 4 sequences run the same num of iters\n cur_epochs = (step_epochs * avg_bs[long_cycle_index] /\n sum(avg_bs))\n # get cur_iters from cur_epochs\n cur_iters = cur_epochs / avg_bs[long_cycle_index]\n total_iters += cur_iters\n schedule.append((step_index, shapes[-1], cur_epochs))\n iter_saving = default_iters / total_iters # ratio between default iters and real iters\n final_step_epochs = cfg.OPTIMIZER.learning_rate.max_epoch - steps[-1]\n # We define the fine-tuning phase to have the same amount of iteration\n # saving as the rest of the training.\n #final_step_epochs / iter_saving make fine-tune having the same iters as training\n ft_epochs = final_step_epochs / iter_saving * avg_bs[-1]\n # schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs))\n schedule.append((step_index + 1, all_shapes[-1][-1], ft_epochs))" + }, + { + "comment": "This code calculates the final learning rate schedule for multigrid training based on a provided schedule, max_epoch, and epoch_factor. It then prints this new schedule. The function get_current_long_cycle_shape takes in this same schedule and current epoch index to return the long cycle base shape for the given epoch.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":192-223", + "content": " # Obtrain final schedule given desired cfg.MULTIGRID.epoch_factor.\n x = (cfg.OPTIMIZER.learning_rate.max_epoch *\n cfg.MULTIGRID.epoch_factor / sum(s[-1] for s in schedule))\n final_schedule = []\n total_epochs = 0\n for s in schedule:\n epochs = s[2] * x\n total_epochs += epochs\n final_schedule.append((s[0], s[1], int(round(total_epochs))))\n print_schedule(final_schedule)\n return final_schedule\ndef print_schedule(schedule):\n \"\"\"\n Log schedule.\n \"\"\"\n print(\n \"Long_cycle_index\\tBase_shape(bs_factor,temporal_size,crop_size)\\tEpochs\"\n )\n for s in schedule:\n print(\"{}\\t\\t\\t{}\\t\\t\\t\\t\\t{}\".format(s[0], s[1], s[2]))\ndef get_current_long_cycle_shape(schedule, epoch):\n \"\"\"\n Given a schedule and epoch index, return the long cycle base shape.\n Args:\n schedule (configs): configs that contains training and multigrid specific\n hyperparameters.\n cur_epoch (int): current epoch index." + }, + { + "comment": "This function returns a list describing the base shape in a long cycle based on the current epoch and a given schedule. 
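A condensed restatement of that lookup, with a hypothetical schedule, shows what the returned shape looks like:

```python
def get_current_long_cycle_shape(schedule, epoch):
    """Condensed restatement: return the first base shape whose end epoch is still ahead."""
    for _, shape, last_epoch in schedule:
        if epoch < last_epoch:
            return shape
    return schedule[-1][1]

# Hypothetical entries: (step_index, [bs_factor, num_frames, crop_size], end_epoch)
schedule = [(0, [8, 8, 112], 30), (1, [2, 16, 224], 60), (2, [1, 32, 224], 90)]
print(get_current_long_cycle_shape(schedule, 10))   # [8, 8, 112]
print(get_current_long_cycle_shape(schedule, 75))   # [1, 32, 224]
print(get_current_long_cycle_shape(schedule, 120))  # past the end -> last shape
```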
It iterates through the schedule, returning the appropriate shape if the current epoch is less than the scheduled value, otherwise it returns the last shape in the schedule.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/utils/multigrid/multigrid.py\":224-232", + "content": " Returns:\n shapes (list): A list describing the base shape in a long cycle:\n [batch size relative to default,\n number of frames, spatial dimension].\n \"\"\"\n for s in schedule:\n if epoch < s[-1]:\n return s[1]\n return schedule[-1][1]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c7eaf2b9-2f77-4f59-abd9-7a2cba8fbd07.json b/docs/doc/c7eaf2b9-2f77-4f59-abd9-7a2cba8fbd07.json new file mode 100644 index 000000000..d3fb87b4e --- /dev/null +++ b/docs/doc/c7eaf2b9-2f77-4f59-abd9-7a2cba8fbd07.json @@ -0,0 +1,50 @@ +{ + "summary": "The ResNet3dSlowOnly class creates a Slowfast pathway in the ResNet3d architecture, reduces channel number, and is registered under BACKBONES. The make_res_layer function builds residual layers with specified spatial_strides, temporal_strides, and dilations for 3D Resnet layers.", + "details": [ + { + "comment": "This code defines a ResNet3dSlowOnly class that extends ResNet3d for creating a Slowfast pathway. It reduces the channel number of the fast pathway by a specified 'channel_ratio'. This model is registered under BACKBONES and accepts the same arguments as ResNet3d.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport warnings\nimport paddle\nimport paddle.nn as nn\nfrom .resnet3d import ResNet3d, ConvBNLayer\nfrom ..registry import BACKBONES\n@BACKBONES.register()\nclass ResNet3dSlowOnly(ResNet3d):\n \"\"\"A pathway of Slowfast based on ResNet3d.\n Args:\n *args (arguments): Arguments same as :class:``ResNet3d``.\n channel_ratio (int): Reduce the channel number of fast pathway\n by ``channel_ratio``, corresponding to ``beta`` in the paper." + }, + { + "comment": "Function `__init__` initializes the ResNet3d object by setting initial values for inplanes and lateral_connections based on provided args and kwargs. 
The function make_res_layer builds a residual layer for Slowfast, taking in various parameters including block type, input planes, output planes, number of blocks, and more.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":30-59", + "content": " Default: 8.\n **kwargs (keyword arguments): Keywords arguments for ResNet3d.\n \"\"\"\n def __init__(self, *args, **kwargs):\n super().__init__(*args, **kwargs)\n self.inplanes = self.base_channels\n self.lateral_connections = []\n for i in range(len(self.stage_blocks)):\n planes = self.base_channels * 2**i\n self.inplanes = planes * self.block.expansion\n def make_res_layer(self,\n block,\n inplanes,\n planes,\n blocks,\n spatial_stride=1,\n temporal_stride=1,\n dilation=1,\n inflate=1,\n inflate_style='3x1x1',\n non_local=0,\n non_local_cfg=dict(),\n conv_cfg=None,\n norm_cfg=None,\n act_cfg=None,\n with_cp=False):\n \"\"\"Build residual layer for Slowfast." + }, + { + "comment": "This function is defining a Residual module with specified parameters including block type, input and output planes, number of residual blocks, spatial and temporal strides, dilation rate, whether to inflate or apply non-local operations for each block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":61-78", + "content": " Args:\n block (nn.Module): Residual module to be built.\n inplanes (int): Number of channels for the input\n feature in each block.\n planes (int): Number of channels for the output\n feature in each block.\n blocks (int): Number of residual blocks.\n spatial_stride (int | Sequence[int]): Spatial strides\n in residual and conv layers. Default: 1.\n temporal_stride (int | Sequence[int]): Temporal strides in\n residual and conv layers. Default: 1.\n dilation (int): Spacing between kernel elements. Default: 1.\n inflate (int | Sequence[int]): Determine whether to inflate\n for each block. Default: 1.\n inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines\n the kernel sizes and padding strides for conv1 and\n conv2 in each block. Default: ``3x1x1``.\n non_local (int | Sequence[int]): Determine whether to apply" + }, + { + "comment": "This function takes in a configuration for building residual layers, including parameters like blocks (number of residual layers to create), inflate (inflation times for the conv layers), non_local (whether to use non-local modules), conv_cfg, norm_cfg, act_cfg, and with_cp (use checkpoint). It asserts that the lengths of inflate and non_local match the number of blocks specified. The function returns a residual layer for the given configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":79-97", + "content": " non-local module in the corresponding block of each stages.\n Default: 0.\n non_local_cfg (dict): Config for non-local module.\n Default: ``dict()``.\n conv_cfg (dict | None): Config for conv layers. Default: None.\n norm_cfg (dict | None): Config for norm layers. Default: None.\n act_cfg (dict | None): Config for activate layers. Default: None.\n with_cp (bool): Use checkpoint or not. 
Using checkpoint will save\n some memory while slowing down the training speed.\n Default: False.\n Returns:\n nn.Module: A residual layer for the given config.\n \"\"\"\n inflate = inflate if not isinstance(inflate,\n int) else (inflate, ) * blocks\n non_local = non_local if not isinstance(non_local,\n int) else (non_local, ) * blocks\n assert len(inflate) == blocks and len(non_local) == blocks" + }, + { + "comment": "This code is creating a downsample layer and appending a block to the layers list. The downsample is created based on whether the current input planes match the expected value or not. If it doesn't match, a convolutional layer with stride is used for downsampling. The block is added to the layers list with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":99-128", + "content": " lateral_inplanes = 0\n if (spatial_stride != 1\n or (inplanes + lateral_inplanes) != planes * block.expansion):\n downsample = ConvBNLayer(\n in_channels=inplanes + lateral_inplanes,\n out_channels=planes * block.expansion,\n kernel_size=1,\n stride=(temporal_stride, spatial_stride, spatial_stride),\n bias=False,\n act=None)\n else:\n downsample = None\n layers = []\n layers.append(\n block(\n inplanes + lateral_inplanes,\n planes,\n spatial_stride,\n temporal_stride,\n dilation,\n downsample,\n inflate=(inflate[0] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[0] == 1),\n non_local_cfg=non_local_cfg,\n conv_cfg=conv_cfg,\n norm_cfg=norm_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp))" + }, + { + "comment": "This code defines a function to create layers for a Resnet3D backbone model in PaddleVideo. It takes in parameters such as planes, blocks, dilation, inflate, inflate_style, non_local, non_local_cfg, conv_cfg, norm_cfg, act_cfg, and with_cp. The function creates layers by appending instances of a block class to a list, and returns them as a sequential model for training or inference. Additionally, there is another function that inflates a 2D conv module to a 3D one, mainly adjusting the inplanes due to lateral connections for fitting the shapes of lateral connection counterparts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":129-156", + "content": " inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(\n inplanes,\n planes,\n 1,\n 1,\n dilation,\n inflate=(inflate[i] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[i] == 1),\n non_local_cfg=non_local_cfg,\n conv_cfg=conv_cfg,\n norm_cfg=norm_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp))\n return nn.Sequential(*layers)\n def _inflate_conv_params(self, conv3d, state_dict_2d, module_name_2d,\n inflated_param_names):\n \"\"\"Inflate a conv module from 2d to 3d.\n The differences of conv modules betweene 2d and 3d in Pathway\n mainly lie in the inplanes due to lateral connections. To fit the\n shapes of the lateral connection counterpart, it will expand" + }, + { + "comment": "This function loads the 2D model's state dictionary into a 3D Conv module, concatenating conv2d parameters and adding zero paddings to match the new shape. 
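Setting aside the lateral-connection channel padding, the core of this inflation is tiling the 2D kernel along a new temporal axis and rescaling by the temporal kernel size; a sketch under that simplification (not the repository's `_inflate_conv_params` itself):

```python
import paddle

def inflate_2d_kernel(weight_2d, kernel_t):
    """Tile a 2D conv kernel along a new temporal axis and rescale by kernel_t.

    weight_2d: tensor of shape (C_out, C_in, kH, kW)
    returns:   tensor of shape (C_out, C_in, kernel_t, kH, kW)
    """
    w = paddle.unsqueeze(weight_2d, axis=2)                # (C_out, C_in, 1, kH, kW)
    w = paddle.tile(w, repeat_times=[1, 1, kernel_t, 1, 1])
    return w / kernel_t                                    # keep responses comparable

w2d = paddle.rand([64, 3, 7, 7])
print(inflate_2d_kernel(w2d, kernel_t=1).shape)            # [64, 3, 1, 7, 7]
```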
The weight shape of the 2D model is retrieved, and if the number of input channels in the 3D model is different from the 2D model, it will raise a warning or return without loading the parameters due to incompatible shapes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":157-179", + "content": " parameters by concatting conv2d parameters and extra zero paddings.\n Args:\n conv3d (nn.Module): The destination conv3d module.\n state_dict_2d (OrderedDict): The state dict of pretrained 2d model.\n module_name_2d (str): The name of corresponding conv module in the\n 2d model.\n inflated_param_names (list[str]): List of parameters that have been\n inflated.\n \"\"\"\n weight_2d_name = module_name_2d + '.weight'\n conv2d_weight = state_dict_2d[weight_2d_name]\n old_shape = conv2d_weight.shape\n new_shape = conv3d.weight.data.shape\n kernel_t = new_shape[2]\n if new_shape[1] != old_shape[1]:\n if new_shape[1] < old_shape[1]:\n warnings.warn(f'The parameter of {module_name_2d} is not'\n 'loaded due to incompatible shapes. ')\n return\n # Inplanes may be different due to lateral connections\n new_channels = new_shape[1] - old_shape[1]" + }, + { + "comment": "The code inflates a 2D convolutional layer into a 3D convolutional layer by padding the weights and copying the bias. It does this for all layers specified in the ResNet3dSlowOnly architecture, with specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":180-209", + "content": " pad_shape = old_shape\n pad_shape = pad_shape[:1] + (new_channels, ) + pad_shape[2:]\n # Expand parameters by concat extra channels\n conv2d_weight = paddle.concat(\n (conv2d_weight, paddle.zeros(pad_shape)), axis=1)\n new_weight = conv2d_weight.data.unsqueeze(2).expand_as(\n conv3d.weight) / kernel_t\n conv3d.weight.data.copy_(new_weight)\n inflated_param_names.append(weight_2d_name)\n if getattr(conv3d, 'bias') is not None:\n bias_2d_name = module_name_2d + '.bias'\n conv3d.bias.data.copy_(state_dict_2d[bias_2d_name])\n inflated_param_names.append(bias_2d_name)\nif __name__ == '__main__':\n net = ResNet3dSlowOnly(\n depth=50,\n in_channels=17,\n base_channels=32,\n conv1_kernel=(1, 7, 7),\n num_stages=3,\n out_indices=[2],\n stage_blocks=[3, 4, 6],\n conv1_stride_s=1,\n pool1_stride_s=1,\n inflate=[0, 1, 1],\n with_pool2=False," + }, + { + "comment": "This code sets the spatial_strides, temporal_strides, and dilations for a 3D Resnet layer, with spatial strides of [2, 2, 2], temporal strides of [1, 1, 2], and dilations of [1, 1, 1].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py\":210-213", + "content": " spatial_strides=[2, 2, 2],\n temporal_strides=[1, 1, 2],\n dilations=[1, 1, 1])\n pass" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c8ac3bc9-9b4c-4e2f-a884-01d4a4db219f.json b/docs/doc/c8ac3bc9-9b4c-4e2f-a884-01d4a4db219f.json new file mode 100644 index 000000000..6e7d57bae --- /dev/null +++ b/docs/doc/c8ac3bc9-9b4c-4e2f-a884-01d4a4db219f.json @@ -0,0 +1,35 @@ +{ + "summary": "The code defines a MobileNetV2 model with InvertedResidual blocks for Ma-Net application, initializing the backbone network and preparing it for forward propagation while applying Kaiming normal initialization to certain layers.", + "details": [ + { + "comment": "This code defines a network layer for MobileNet, including convolution-batch normalization-ReLU6 
operations and an inverted residual block. It utilizes padding and dilation techniques to increase the effective receptive field size of the convolutions. The InvertedResidual class handles stride, dilation, and expand_ratio parameters for the network layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py\":0-32", + "content": "import paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nimport math\nfrom utils.api import kaiming_normal_\ndef conv_bn(inp, oup, stride, BatchNorm):\n return nn.Sequential(nn.Conv2D(inp, oup, 3, stride, 1, bias_attr=False),\n BatchNorm(oup), nn.ReLU6())\ndef fixed_padding(inputs, kernel_size, dilation):\n kernel_size_effective = kernel_size + (kernel_size - 1) * (dilation - 1)\n pad_total = kernel_size_effective - 1\n pad_beg = pad_total // 2\n pad_end = pad_total - pad_beg\n padded_inputs = F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end))\n return padded_inputs\nclass InvertedResidual(nn.Layer):\n def __init__(self, inp, oup, stride, dilation, expand_ratio, BatchNorm):\n super(InvertedResidual, self).__init__()\n self.stride = stride\n assert stride in [1, 2]\n hidden_dim = round(inp * expand_ratio)\n self.use_res_connect = self.stride == 1 and inp == oup\n self.kernel_size = 3\n self.dilation = dilation\n if expand_ratio == 1:" + }, + { + "comment": "This code defines a convolutional neural network layer for MobileNet backbone. It includes parameters such as input and output channels (inp, oup), hidden dimension (hidden_dim), stride, dilation, and whether to use pointwise (pw) or depthwise (dw) convolution. The layer is created using nn.Sequential module and includes BatchNorm and ReLU6 activation functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py\":33-62", + "content": " self.conv = nn.Sequential(\n # dw\n nn.Conv2D(hidden_dim,\n hidden_dim,\n 3,\n stride,\n 0,\n dilation,\n groups=hidden_dim,\n bias_attr=False),\n BatchNorm(hidden_dim),\n nn.ReLU6(),\n # pw-linear\n nn.Conv2D(hidden_dim, oup, 1, 1, 0, 1, 1, bias_attr=False),\n BatchNorm(oup),\n )\n else:\n self.conv = nn.Sequential(\n # pw\n nn.Conv2D(inp, hidden_dim, 1, 1, 0, 1, bias_attr=False),\n BatchNorm(hidden_dim),\n nn.ReLU6(),\n # dw\n nn.Conv2D(hidden_dim,\n hidden_dim,\n 3,\n stride,\n 0,\n dilation,\n groups=hidden_dim," + }, + { + "comment": "This code defines a MobileNetV2 model with InvertedResidual blocks, including convolutional layers, batch normalization, ReLU activation, and optional residual connection. The model takes in an input image of size 3xHxW and outputs a feature map of size oup x (H/stride) x (W/stride). 
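The fixed_padding helper quoted above reduces to a small amount of arithmetic; for example:

```python
def fixed_padding_amount(kernel_size, dilation):
    """Reproduce the padding arithmetic used by fixed_padding."""
    effective = kernel_size + (kernel_size - 1) * (dilation - 1)
    pad_total = effective - 1
    pad_beg = pad_total // 2
    return pad_beg, pad_total - pad_beg

print(fixed_padding_amount(3, 1))  # (1, 1) -- ordinary 3x3 conv
print(fixed_padding_amount(3, 2))  # (2, 2) -- dilated 3x3 covers a 5x5 window
```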
It also supports variable width multiplier to adjust the number of channels for each block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py\":63-98", + "content": " bias_attr=False),\n BatchNorm(hidden_dim),\n nn.ReLU6(),\n # pw-linear\n nn.Conv2D(hidden_dim, oup, 1, 1, 0, 1, bias_attr=False),\n BatchNorm(oup),\n )\n def forward(self, x):\n x_pad = fixed_padding(x, self.kernel_size, dilation=self.dilation)\n if self.use_res_connect:\n x = x + self.conv(x_pad)\n else:\n x = self.conv(x_pad)\n return x\nclass MobileNetV2(nn.Layer):\n def __init__(self,\n output_stride=8,\n BatchNorm=None,\n width_mult=1.,\n pretrained=True):\n super(MobileNetV2, self).__init__()\n block = InvertedResidual\n input_channel = 32\n current_stride = 1\n rate = 1\n interverted_residual_setting = [\n # t, c, n, s\n [1, 16, 1, 1],\n [6, 24, 2, 2],\n [6, 32, 3, 2],\n [6, 64, 4, 2],\n [6, 96, 3, 1],\n [6, 160, 3, 2]," + }, + { + "comment": "This code builds the MobileNet backbone for Ma-Net application. It initializes the first layer with a specific input channel and then iterates through inverted residual blocks, adjusting stride and dilation rate accordingly. The block function is used to build each block, and input channels are updated accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py\":99-126", + "content": " [6, 320, 1, 1],\n ]\n # building first layer\n input_channel = int(input_channel * width_mult)\n self.features = [conv_bn(3, input_channel, 2, BatchNorm)]\n current_stride *= 2\n # building inverted residual blocks\n for t, c, n, s in interverted_residual_setting:\n if current_stride == output_stride:\n stride = 1\n dilation = rate\n rate *= s\n else:\n stride = s\n dilation = 1\n current_stride *= s\n output_channel = int(c * width_mult)\n for i in range(n):\n if i == 0:\n self.features.append(\n block(input_channel, output_channel, stride, dilation,\n t, BatchNorm))\n else:\n self.features.append(\n block(input_channel, output_channel, 1, dilation, t,\n BatchNorm))\n input_channel = output_channel" + }, + { + "comment": "Initializes and prepares the MobileNet backbone network for forward propagation. If pretrained model is specified, loads the pretrained weights from PaddleHub's MobileNet_v2_imagenet. Otherwise, initializes the weights according to the provided configuration. 
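As a quick illustration of the width multiplier applied while those blocks are built (the setting list below is abbreviated from the quoted one):

```python
interverted_residual_setting = [
    # t (expand_ratio), c (channels), n (repeats), s (stride)
    [1, 16, 1, 1],
    [6, 24, 2, 2],
    [6, 32, 3, 2],
]

def scaled_channels(width_mult):
    """Output channels per stage after applying the width multiplier."""
    return [int(c * width_mult) for _, c, _, _ in interverted_residual_setting]

print(scaled_channels(1.0))   # [16, 24, 32]
print(scaled_channels(0.5))   # [8, 12, 16]
```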
The forward function extracts low-level and high-level features by passing the input through separate subsections of the feature extraction network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py\":127-156", + "content": " self.features = nn.Sequential(*self.features)\n self._initialize_weights()\n if pretrained:\n self._load_pretrained_model()\n self.low_level_features = self.features[0:4]\n self.high_level_features = self.features[4:]\n def forward(self, x):\n low_level_feat = self.low_level_features(x)\n x = self.high_level_features(low_level_feat)\n return x, low_level_feat\n def _load_pretrained_model(self):\n import paddlehub as hub\n pretrain_dict = hub.Module(name=\"mobilenet_v2_imagenet\")\n model_dict = {}\n state_dict = self.state_dict()\n for k, v in pretrain_dict.items():\n if k in state_dict:\n model_dict[k] = v\n state_dict.update(model_dict)\n self.set_state_dict(state_dict)\n def _initialize_weights(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n # n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n # m.weight.normal_(0, math.sqrt(2. / n))" + }, + { + "comment": "Code applies Kaiming normal initialization to certain layers (m.weight) and performs batch normalization by filling layer weights with 1 and setting bias to zero for nn.BatchNorm2D instances.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py\":157-162", + "content": " kaiming_normal_(m.weight)\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c92c0d2c-eab9-4a11-92e3-bb38d8c091e8.json b/docs/doc/c92c0d2c-eab9-4a11-92e3-bb38d8c091e8.json new file mode 100644 index 000000000..09ae86110 --- /dev/null +++ b/docs/doc/c92c0d2c-eab9-4a11-92e3-bb38d8c091e8.json @@ -0,0 +1,15 @@ +{ + "summary": "This class defines a base loss function in PaddleVideo, requires subclasses to implement _forward method and supports an optional loss_weight parameter.", + "details": [ + { + "comment": "Base class for loss functions in PaddleVideo, subclasses should override the _forward() method to return normal loss without weights. Contains an optional loss_weight parameter for scaling the final loss value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py\":0-32", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\n#XXX use _forward?? 
or forward??\nclass BaseWeightedLoss(nn.Layer):\n \"\"\"Base class for loss.\n All subclass should overwrite the ``_forward()`` method which returns the\n normal loss without loss weights.\n Args:\n loss_weight (float): Factor scalar multiplied on the loss.\n Default: 1.0.\n \"\"\"\n def __init__(self, loss_weight=1.0):" + }, + { + "comment": "The code defines an abstract base class for a loss function. It initializes the loss weight, requires subclasses to implement the _forward method, and returns the forward pass result multiplied by the loss weight in the forward method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py\":33-50", + "content": " super().__init__()\n self.loss_weight = loss_weight\n @abstractmethod\n def _forward(self, *args, **kwargs):\n pass\n def forward(self, *args, **kwargs):\n \"\"\"Defines the computation performed at every call.\n Args:\n *args: The positional arguments for the corresponding\n loss.\n **kwargs: The keyword arguments for the corresponding\n loss.\n Returns:\n paddle.Tensor: The calculated loss.\n \"\"\"\n return self._forward(*args, **kwargs) * self.loss_weight" + } + ] +} \ No newline at end of file diff --git a/docs/doc/c9b6c38b-cb97-4d36-a421-1e806dc14d11.json b/docs/doc/c9b6c38b-cb97-4d36-a421-1e806dc14d11.json new file mode 100644 index 000000000..4746b3113 --- /dev/null +++ b/docs/doc/c9b6c38b-cb97-4d36-a421-1e806dc14d11.json @@ -0,0 +1,20 @@ +{ + "summary": "This code initializes a model's head, defines modes of operation, and provides abstract methods for training, validation, and inference steps. It serves as a base class for recognizer models in PaddleVideo and raises NotImplementedError if subclasses don't implement these steps.", + "details": [ + { + "comment": "Base class for recognizers: Subclasses should override train_step, valid_step, and test_step methods. Builds backbone and head using builder if provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/base.py\":0-32", + "content": "from abc import abstractmethod\nfrom ... import builder\nimport paddle.nn as nn\nclass BaseRecognizer(nn.Layer):\n \"\"\"Base class for recognizers.\n All recognizers should subclass it.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Classification head to process feature.\n \"\"\"\n def __init__(self, backbone=None, head=None, runtime_cfg=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):" + }, + { + "comment": "This code initializes a model's head, defines the mode of operation (train, valid, test, infer), and provides abstract methods for training and validation steps. 
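To make the dispatch pattern summarized above concrete, here is a framework-free sketch of how a recognizer routes a single call to the step matching the running mode; the real class subclasses `paddle.nn.Layer`, so this is illustrative only.

```python
from abc import ABC, abstractmethod

class RecognizerSketch(ABC):
    def __call__(self, data_batch, mode='infer'):
        # Route one call to the step that matches the running mode.
        if mode == 'train':
            return self.train_step(data_batch)
        elif mode == 'valid':
            return self.val_step(data_batch)
        elif mode == 'test':
            return self.test_step(data_batch)
        elif mode == 'infer':
            return self.infer_step(data_batch)
        raise NotImplementedError(f"unknown mode: {mode}")

    @abstractmethod
    def train_step(self, data_batch): ...
    @abstractmethod
    def val_step(self, data_batch): ...
    @abstractmethod
    def test_step(self, data_batch): ...
    @abstractmethod
    def infer_step(self, data_batch): ...

class DummyRecognizer(RecognizerSketch):
    def train_step(self, data_batch): return {"loss": 0.0}
    def val_step(self, data_batch):   return {"loss": 0.0}
    def test_step(self, data_batch):  return {"top1": 1.0}
    def infer_step(self, data_batch): return data_batch

print(DummyRecognizer()({"frames": []}, mode="test"))  # {'top1': 1.0}
```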
If the mode is 'infer', it saves the inference model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/base.py\":33-65", + "content": " self.head.init_weights()\n else:\n self.head = None\n # Settings when the model is running,\n # such as 'avg_type'\n self.runtime_cfg = runtime_cfg\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):" + }, + { + "comment": "This code snippet in PaddleVideo defines abstract methods for validating, testing, and inferring steps. It serves as a base class for recognizer models and expects subclasses to implement these methods. The NotImplementedError is raised to ensure that subclasses provide their own implementation for these steps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/base.py\":66-80", + "content": " \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ca88b2a2-aef0-4ae4-90c5-f0f306873d19.json b/docs/doc/ca88b2a2-aef0-4ae4-90c5-f0f306873d19.json new file mode 100644 index 000000000..68514a31b --- /dev/null +++ b/docs/doc/ca88b2a2-aef0-4ae4-90c5-f0f306873d19.json @@ -0,0 +1,40 @@ +{ + "summary": "The code extracts audio features using MFCC and STFT for action detection in FootballAction. It includes spectrogram bins conversion, data normalization, and resampling with examples using a WAV file.", + "details": [ + { + "comment": "This code extracts audio features using the Mel-frequency cepstral coefficients (MFCC) method. It defines a function \"frame\" to slice data into frames, another function \"periodic_hann\" for windowing using periodic Hann window, and finally a function \"stft_magnitude\" for computing Short Time Fourier Transform (STFT) magnitude from signal. 
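A small NumPy sketch of the framing and windowing steps described above, using the same stride-trick framing and periodic Hann window as the quoted code; the toy signal and sizes are arbitrary.

```python
import numpy as np

def frame(data, window_length, hop_length):
    # Slice a 1-D signal into overlapping frames using stride tricks (no copy).
    num_frames = 1 + int(np.floor((len(data) - window_length) / hop_length))
    shape = (num_frames, window_length)
    strides = (data.strides[0] * hop_length, data.strides[0])
    return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)

def periodic_hann(window_length):
    # "Periodic" Hann window, as used for STFT analysis.
    return 0.5 - 0.5 * np.cos(2 * np.pi / window_length * np.arange(window_length))

signal = np.arange(10, dtype=np.float64)
frames = frame(signal, window_length=4, hop_length=2)
print(frames.shape)  # (4, 4)
print(frames[1])     # [2. 3. 4. 5.]
windowed = frames * periodic_hann(4)
print(np.abs(np.fft.rfft(windowed, 4)).shape)  # (4, 3) -> STFT magnitude
```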
The code likely uses these functions in combination to extract MFCC features from audio data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py\":0-37", + "content": "\"\"\"\naudio feature extract\n\"\"\"\n# coding: utf-8\nimport os\nimport numpy as np\nimport pickle\nimport mfcc.vgg_params as vgg_params\ndef frame(data, window_length, hop_length):\n \"\"\"\n frame\n \"\"\"\n num_samples = data.shape[0]\n #print(\"window_length , hop_length\", window_length, hop_length)\n #print(\"num_sample = \", num_samples)\n num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))\n #print(\" num_frames = \", num_frames)\n shape = (num_frames, window_length) + data.shape[1:]\n #print(\" shape = \", shape)\n strides = (data.strides[0] * hop_length, ) + data.strides\n #print(\"data.strides = \", data.strides)\n #print(\"strides = \", strides)\n return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)\ndef periodic_hann(window_length):\n \"\"\"\n periodic_hann\n \"\"\"\n return 0.5 - (0.5 *\n np.cos(2 * np.pi / window_length * np.arange(window_length)))\ndef stft_magnitude(signal, fft_length, hop_length=None, window_length=None):\n \"\"\"\n stft_magnitude" + }, + { + "comment": "This code defines functions for converting frequencies from Hertz to Mel scale, and creating a mel spectrum matrix from a spectrogram. It also includes validation checks to ensure lower edge frequency is less than the upper edge frequency. The Mel scale is used in audio processing for approximating human auditory perception of sound.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py\":38-68", + "content": " \"\"\"\n frames = frame(signal, window_length, hop_length)\n window = periodic_hann(window_length)\n windowed_frames = frames * window\n return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))\n_MEL_BREAK_FREQUENCY_HERTZ = 700.0\n_MEL_HIGH_FREQUENCY_Q = 1127.0\ndef hertz_to_mel(frequencies_hertz):\n \"\"\"\n hertz_to_mel\n \"\"\"\n return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz /\n _MEL_BREAK_FREQUENCY_HERTZ))\ndef spectrogram_to_mel_matrix(num_mel_bins=20,\n num_spectrogram_bins=129,\n audio_sample_rate=8000,\n lower_edge_hertz=125.0,\n upper_edge_hertz=3800.0):\n \"\"\"\n spectrogram_to_mel_matrix\n \"\"\"\n nyquist_hertz = audio_sample_rate / 2.\n if lower_edge_hertz >= upper_edge_hertz:\n raise ValueError(\"lower_edge_hertz %.1f >= upper_edge_hertz %.1f\" %\n (lower_edge_hertz, upper_edge_hertz))" + }, + { + "comment": "This function calculates mel-frequency cepstral coefficients (MFCC) from speech audio data. 
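The Hertz-to-mel mapping used by these helpers is a one-liner; the sketch below restates it with the same constants as the quoted code and derives band edges for the default 20 mel bins between 125 Hz and 3800 Hz (values shown are illustrative).

```python
import numpy as np

_MEL_BREAK_FREQUENCY_HERTZ = 700.0
_MEL_HIGH_FREQUENCY_Q = 1127.0

def hertz_to_mel(frequencies_hertz):
    # mel(f) = 1127 * ln(1 + f / 700)
    return _MEL_HIGH_FREQUENCY_Q * np.log(
        1.0 + frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)

# Band edges for 20 mel bins between the default 125 Hz and 3800 Hz edges:
edges = np.linspace(hertz_to_mel(125.0), hertz_to_mel(3800.0), 20 + 2)
print(edges[:3])
```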
It converts spectrogram bins to hertz and mel scales, creates band edges for mel analysis, computes mel weights matrix using triangular interpolation, and sets the first row of the matrix to zero.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py\":69-89", + "content": " spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz,\n num_spectrogram_bins)\n spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)\n band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),\n hertz_to_mel(upper_edge_hertz),\n num_mel_bins + 2)\n mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))\n for i in range(num_mel_bins):\n lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]\n lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /\n (center_mel - lower_edge_mel))\n upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /\n (upper_edge_mel - center_mel))\n mel_weights_matrix[:,\n i] = np.maximum(0.0,\n np.minimum(lower_slope, upper_slope))\n mel_weights_matrix[0, :] = 0.0\n return mel_weights_matrix\ndef log_mel_spectrogram(data," + }, + { + "comment": "This function takes in audio data and parameters such as audio sample rate, window length in seconds, hop length in seconds, and other optional keywords. It calculates the window length samples and hop length samples based on the provided audio sample rate. It then determines the FFT length by taking the next highest power of 2 from the window length samples. Finally, it computes the spectrogram using the STFT (Short-Time Fourier Transform) magnitude with the calculated parameters and returns the resulting spectrogram.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py\":90-110", + "content": " audio_sample_rate=8000,\n log_offset=0.0,\n window_length_secs=0.025,\n hop_length_secs=0.010,\n **kwargs):\n \"\"\"\n log_mel_spectrogram\n \"\"\"\n window_length_samples = int(round(audio_sample_rate * window_length_secs))\n #print(\"audio_sample_rate = \", audio_sample_rate)\n #print(\"window_length_secs = \", window_length_secs)\n #print(\"window_length_sample \", window_length_samples)\n hop_length_samples = int(round(audio_sample_rate * hop_length_secs))\n #print(\"hop_length_samples \", hop_length_samples)\n fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0)))\n #print(\" fft_lengt = \", fft_length)\n spectrogram = stft_magnitude(data,\n fft_length=fft_length,\n hop_length=hop_length_samples,\n window_length=window_length_samples)\n #print(\" spectrogram.shape = \", spectrogram.shape)" + }, + { + "comment": "The code extracts audio features from a WAV file using short-time Fourier transform (STFT) and applies Mel-frequency cepstral coefficients (MFCCs). It reads the WAV file, pads zeros if necessary to match desired window length, scales the data to be between -1 and 1, and then calculates STFT. 
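For clarity, a minimal sketch of how the window, hop, and FFT sizes fall out of the quoted defaults (25 ms window, 10 ms hop, 8 kHz audio); the helper name is made up for illustration.

```python
import numpy as np

def stft_sizes(audio_sample_rate=8000, window_length_secs=0.025,
               hop_length_secs=0.010):
    # Convert window/hop durations to sample counts and round the FFT size
    # up to the next power of two, as the quoted function does.
    window_length_samples = int(round(audio_sample_rate * window_length_secs))
    hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
    fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
    return window_length_samples, hop_length_samples, fft_length

print(stft_sizes())  # (200, 80, 256)
```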
Finally, it computes MFCCs from the spectrogram and returns the log of the result plus a small offset for numerical stability.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py\":111-135", + "content": " mel_spectrogram = np.dot(\n spectrogram,\n spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1],\n audio_sample_rate=audio_sample_rate,\n **kwargs))\n return np.log(mel_spectrogram + log_offset)\ndef wav_to_example(wav_data, sample_rate):\n \"\"\"\n wav_to_example\n \"\"\"\n #sample_rate, wav_data = wavfile.read(wav_file)\n assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype\n #wav_data = wav_data[:16000*30]\n #print(\" wav_data \", wav_data.shape)\n #print(\" wav_data \", wav_data.shape)\n pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS -\n vgg_params.STFT_HOP_LENGTH_SECONDS))\n wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num)))\n wav_data = wav_data_extend\n #print(\" wav_data \", wav_data.shape)\n wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0]\n #print(\" wav_data after convert to -1 1\", wav_data)" + }, + { + "comment": "This code performs feature extraction on audio data for action detection in the FootballAction application. It ensures that the audio data is within specified bounds, applies mean normalization if necessary, resamples to a fixed rate, and then generates log mel spectrogram features. These features are framed into examples at a specific sample rate and window length for use by VGG model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py\":136-156", + "content": " #if wav_data.shape[0] > max_second * sample_rate:\n # wav_data = wav_data[:max_second * sample_rate, :]\n if len(wav_data.shape) > 1:\n wav_data = np.mean(wav_data, axis=1)\n #print(\" wav_data after mean\", wav_data.shape, len(wav_data.shape), wav_data)\n # Resample to the rate assumed by vgg.\n #if sample_rate != vgg_params.SAMPLE_RATE:\n # wav_data = resampy.resample(wav_data, sample_rate, vgg_params.SAMPLE_RATE)\n log_mel = log_mel_spectrogram(\n wav_data,\n audio_sample_rate=vgg_params.SAMPLE_RATE,\n log_offset=vgg_params.LOG_OFFSET,\n window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS,\n hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS,\n num_mel_bins=vgg_params.NUM_MEL_BINS,\n lower_edge_hertz=vgg_params.MEL_MIN_HZ,\n upper_edge_hertz=vgg_params.MEL_MAX_HZ)\n # Frame features into examples.\n features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS\n example_window_length = int(\n round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))" + }, + { + "comment": "The code defines a function that extracts audio features from WAV files. It calculates the hop length based on the example window length and sample rate, and applies a log Mel spectrum to the audio data. It also includes a separate function for extracting examples from PCM files and converting them into examples at a given sample rate. 
The main part of the code demonstrates how to use the functions by reading a WAV file, printing its shape after processing with the feature extraction functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py\":158-181", + "content": " example_hop_length = int(\n round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate))\n log_mel_examples = frame(log_mel,\n window_length=example_window_length,\n hop_length=example_hop_length)\n return log_mel_examples\ndef extract_pcm(pcm_file, sample_rate):\n with open(pcm_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype=np.int16)\n examples = wav_to_example(audio_data, sample_rate)\n return examples\nif __name__ == \"__main__\":\n wav_file = sys.argv[1]\n print(\"wav_file = \", wav_file)\n with open(wav_file, \"rb\") as f:\n pcm_data = f.read()\n audio_data = np.fromstring(pcm_data, dtype = np.int16)\n examples_batch = wav_to_example(audio_data, 16000)\n print(\"examples_batch.shape\", examples_batch.shape) " + } + ] +} \ No newline at end of file diff --git a/docs/doc/caf4379e-498e-4140-bd1c-dcf4eb7d5dc9.json b/docs/doc/caf4379e-498e-4140-bd1c-dcf4eb7d5dc9.json new file mode 100644 index 000000000..c07356ca0 --- /dev/null +++ b/docs/doc/caf4379e-498e-4140-bd1c-dcf4eb7d5dc9.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is for the VideoTag, a large-scale video classification model developed by PaddlePaddle. It uses two stages of modeling - image modeling and sequence learning - to classify videos in large scale scenarios. The model involves data processing, TSN network training for feature extraction, and attention clusters, LSTM, Nextvlad for sequence learning. Results are predicted by combining multiple models, leading to increased accuracy.", + "details": [ + { + "comment": "This code is for the VideoTag, a large-scale video classification model developed by PaddlePaddle. It uses two stages of modeling - image modeling and sequence learning - to classify videos in large scale scenarios. The model involves data processing, TSN network training for feature extraction, and attention clusters, LSTM, Nextvlad for sequence learning. 
Results are predicted by combining multiple models, leading to increased accuracy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/README.md\":0-30", + "content": "# VideoTag \u98de\u6868\u5927\u89c4\u6a21\u89c6\u9891\u5206\u7c7b\u6a21\u578b\n---\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u4f7f\u7528\u65b9\u6cd5](#\u4f7f\u7528\u65b9\u6cd5)\n## \u6a21\u578b\u7b80\u4ecb\n\u98de\u6868\u5927\u89c4\u6a21\u89c6\u9891\u5206\u7c7b\u6a21\u578bVideoTag\u57fa\u4e8e\u767e\u5ea6\u77ed\u89c6\u9891\u4e1a\u52a1\u5343\u4e07\u7ea7\u6570\u636e\uff0c\u652f\u63013000\u4e2a\u6e90\u4e8e\u4ea7\u4e1a\u5b9e\u8df5\u7684\u5b9e\u7528\u6807\u7b7e\uff0c\u5177\u6709\u826f\u597d\u7684\u6cdb\u5316\u80fd\u529b\uff0c\u975e\u5e38\u9002\u7528\u4e8e\u56fd\u5185\u5927\u89c4\u6a21\uff08\u5343\u4e07/\u4ebf/\u5341\u4ebf\u7ea7\u522b\uff09\u77ed\u89c6\u9891\u5206\u7c7b\u573a\u666f\u7684\u5e94\u7528\u3002VideoTag\u91c7\u7528\u4e24\u9636\u6bb5\u5efa\u6a21\u65b9\u5f0f\uff0c\u5373\u56fe\u50cf\u5efa\u6a21\u548c\u5e8f\u5217\u5b66\u4e60\u3002\u7b2c\u4e00\u9636\u6bb5\uff0c\u4f7f\u7528\u5c11\u91cf\u89c6\u9891\u6837\u672c\uff08\u5341\u4e07\u7ea7\u522b\uff09\u8bad\u7ec3\u5927\u89c4\u6a21\u89c6\u9891\u7279\u5f81\u63d0\u53d6\u6a21\u578b(Extractor)\uff1b\u7b2c\u4e8c\u9636\u6bb5\uff0c\u4f7f\u7528\u5343\u4e07\u7ea7\u6570\u636e\u8bad\u7ec3\u9884\u6d4b\u5668(Predictor)\uff0c\u6700\u7ec8\u5b9e\u73b0\u5728\u8d85\u5927\u89c4\u6a21\uff08\u5343\u4e07/\u4ebf/\u5341\u4ebf\u7ea7\u522b\uff09\u77ed\u89c6\u9891\u4e0a\u4ea7\u4e1a\u5e94\u7528\uff0c\u5176\u539f\u7406\u793a\u610f\u5982\u4e0b\u56fe\u6240\u793a\u3002\n

\n\nVideoTag\u6a21\u578b\u793a\u610f\u56fe\n
\n- \u6570\u636e\u5904\u7406\uff1a\u89c6\u9891\u662f\u6309\u7279\u5b9a\u987a\u5e8f\u6392\u5217\u7684\u4e00\u7ec4\u56fe\u50cf\u7684\u96c6\u5408\uff0c\u8fd9\u4e9b\u56fe\u50cf\u4e5f\u79f0\u4e3a\u5e27\u3002\u89c6\u9891\u5206\u7c7b\u4efb\u52a1\u9700\u8981\u5148\u5bf9\u77ed\u89c6\u9891\u8fdb\u884c\u89e3\u7801\uff0c\u7136\u540e\u518d\u5c06\u8f93\u51fa\u7684\u56fe\u50cf\u5e27\u5e8f\u5217\u704c\u5165\u5230VideoTag\u4e2d\u8fdb\u884c\u8bad\u7ec3\u548c\u9884\u6d4b\u3002\n- \u56fe\u50cf\u5efa\u6a21\uff1a\u5148\u4ece\u8bad\u7ec3\u6570\u636e\u4e2d\uff0c\u5bf9\u6bcf\u4e2a\u7c7b\u522b\u5747\u5300\u91c7\u6837\u5c11\u91cf\u6837\u672c\u6570\u636e\uff0c\u6784\u6210\u5341\u4e07\u91cf\u7ea7\u7684\u8bad\u7ec3\u89c6\u9891\u3002\u7136\u540e\u4f7f\u7528TSN\u7f51\u7edc\u8fdb\u884c\u8bad\u7ec3\uff0c\u63d0\u53d6\u6240\u6709\u89c6\u9891\u5e27\u7684TSN\u6a21\u578b\u5206\u7c7b\u5c42\u524d\u4e00\u5c42\u7684\u7279\u5f81\u6570\u636e\u3002\u5728\u8fd9\u4e2a\u8fc7\u7a0b\u4e2d\uff0c\u6bcf\u4e00\u5e27\u90fd\u88ab\u8f6c\u5316\u6210\u76f8\u5e94\u7684\u7279\u5f81\u5411\u91cf\uff0c\u4e00\u6bb5\u89c6\u9891\u88ab\u8f6c\u5316\u6210\u4e00\u4e2a\u7279\u5f81\u5e8f\u5217\u3002\n- \u5e8f\u5217\u5b66\u4e60\uff1a\u91c7\u7528Attention clusters\u3001LSTM\u548cNextvlad\u5bf9\u7279\u5f81\u5e8f\u5217\u8fdb\u884c\u5efa\u6a21\uff0c\u5b66\u4e60\u5404\u4e2a\u7279\u5f81\u4e4b\u95f4\u7684\u7ec4\u5408\u65b9\u5f0f\uff0c\u8fdb\u4e00\u6b65\u63d0\u9ad8\u6a21\u578b\u51c6\u786e\u7387\u3002\u7531\u4e8e\u5e8f\u5217\u5b66\u4e60\u76f8\u6bd4\u4e8e\u56fe\u50cf\u5efa\u6a21\u8017\u65f6\u66f4\u77ed\uff0c\u56e0\u6b64\u53ef\u4ee5\u878d\u5408\u591a\u4e2a\u5177\u6709\u4e92\u8865\u6027\u7684\u5e8f\u5217\u6a21\u578b\u3002\u793a\u4f8b\u4ee3\u7801\u4ec5\u4f7f\u7528Attention\\_LSTM\u7f51\u7edc\u8fdb\u884c\u5e8f\u5217\u7279\u5f81\u9884\u6d4b\u3002\n- \u9884\u6d4b\u7ed3\u679c\uff1a\u878d\u5408\u591a\u4e2a\u6a21\u578b\u7ed3\u679c\u5b9e\u73b0\u89c6\u9891\u5206\u7c7b\uff0c\u8fdb\u4e00\u6b65\u63d0\u9ad8\u5206\u7c7b\u51c6\u786e\u7387\u3002\n## \u4f7f\u7528\u65b9\u6cd5\n- [1. \u5982\u4f55\u8fd0\u884c\u6837\u4f8b\u4ee3\u7801](./Run.md)\n- [2. \u5982\u4f55\u4f7f\u7528\u81ea\u5df1\u7684\u6570\u636e\u8fdb\u884c\u6d4b\u8bd5](./Test.md)\n- [3. \u5982\u4f55\u8fdb\u884c\u6a21\u578bfine-tune](./FineTune.md)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cb151237-0521-4dc7-bd9a-f0e86568e947.json b/docs/doc/cb151237-0521-4dc7-bd9a-f0e86568e947.json new file mode 100644 index 000000000..69195d9c5 --- /dev/null +++ b/docs/doc/cb151237-0521-4dc7-bd9a-f0e86568e947.json @@ -0,0 +1,35 @@ +{ + "summary": "The CAM_Module and YOWO backbone model are for image processing and video classification respectively, using attention mechanism and convolutional layers. The code loads pretrain weights correctly and returns a Paddle Video YOWO model after processing input clips through backbones, CFAM, and convolutional layers.", + "details": [ + { + "comment": "This code is a part of the PaddleVideo library and defines a custom layer called CAM_Module. It takes an input dimension as a parameter, initializes a gamma parameter, and inherits from nn.Layer. The class constructor creates a zero-dimensional tensor as the initial value for gamma using paddle.create_parameter function. This module is used in backbone architectures to enable Channel Attention Mechanism for image processing tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/yowo.py\":0-27", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..registry import BACKBONES\nfrom .darknet import Darknet\nfrom .resnext101 import ResNext101\nimport paddle.nn as nn\nimport paddle\nclass CAM_Module(nn.Layer):\n def __init__(self, in_dim):\n super(CAM_Module, self).__init__()\n self.chanel_in = in_dim\n temp = paddle.zeros([1], dtype='float32')\n self.gamma = paddle.create_parameter(shape=temp.shape, dtype=str(temp.numpy().dtype),\n default_initializer=paddle.nn.initializer.Assign(temp))" + }, + { + "comment": "The code defines a CFAMPBlock layer with a Channel-wise Attention Mechanism. It contains a convolution, batch normalization, and ReLU layers for the attention mechanism, followed by a gamma scaling and channel-wise attention calculation. The forward function performs the attention operation and scales the input using the attention map.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/yowo.py\":28-52", + "content": " self.softmax = nn.Softmax(axis=-1)\n def forward(self, x):\n m_batchsize, C, height, width = x.shape\n proj_query = paddle.reshape(x, [m_batchsize, C, -1])\n proj_key = paddle.transpose(paddle.reshape(\n x, [m_batchsize, C, -1]), perm=[0, 2, 1])\n energy = paddle.bmm(proj_query, proj_key)\n energy_new = paddle.expand_as(paddle.max(\n energy, axis=-1, keepdim=True), energy) - energy\n attention = self.softmax(energy_new)\n proj_value = paddle.reshape(x, [m_batchsize, C, -1])\n out = paddle.bmm(attention, proj_value)\n out = out.reshape([m_batchsize, C, height, width])\n out = self.gamma * out + x\n return out\nclass CFAMBlock(nn.Layer):\n def __init__(self, in_channels, out_channels):\n super(CFAMBlock, self).__init__()\n inter_channels = 1024\n self.conv_bn_relu1 = nn.Sequential(nn.Conv2D(in_channels, inter_channels, kernel_size=1, bias_attr=False),\n nn.BatchNorm2D(inter_channels)," + }, + { + "comment": "This code defines a YOWO backbone model, which is a neural network architecture for video classification tasks. It consists of several convolutional layers followed by batch normalization and ReLU activations. The CAM_Module is also included, which might be a custom attention mechanism. The output channels are adjusted based on the input size. 
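A NumPy sketch of the channel-attention computation performed by `CAM_Module`, following the reshape/affinity/softmax/residual steps summarized above; the real layer is a `paddle.nn.Layer` with a learnable `gamma`, so this is illustrative only.

```python
import numpy as np

def channel_attention(x, gamma=0.0):
    # x: [B, C, H, W]; compute channel-to-channel affinities and reweight.
    b, c, h, w = x.shape
    q = x.reshape(b, c, -1)                     # [B, C, H*W]
    k = q.transpose(0, 2, 1)                    # [B, H*W, C]
    energy = q @ k                              # [B, C, C] channel affinities
    energy_new = energy.max(axis=-1, keepdims=True) - energy
    attention = np.exp(energy_new)
    attention /= attention.sum(axis=-1, keepdims=True)  # softmax over channels
    out = (attention @ q).reshape(b, c, h, w)
    return gamma * out + x                      # residual connection

x = np.random.rand(2, 8, 4, 4).astype(np.float32)
print(channel_attention(x, gamma=0.1).shape)    # (2, 8, 4, 4)
```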
Dropout regularization is applied to prevent overfitting.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/yowo.py\":53-78", + "content": " nn.ReLU())\n self.conv_bn_relu2 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False),\n nn.BatchNorm2D(inter_channels),\n nn.ReLU())\n self.sc = CAM_Module(inter_channels)\n self.conv_bn_relu3 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False),\n nn.BatchNorm2D(inter_channels),\n nn.ReLU())\n self.conv_out = nn.Sequential(nn.Dropout2D(0.1), nn.Conv2D(\n inter_channels, out_channels, 1, bias_attr=True))\n def forward(self, x):\n x = self.conv_bn_relu1(x)\n x = self.conv_bn_relu2(x)\n x = self.sc(x)\n x = self.conv_bn_relu3(x)\n output = self.conv_out(x)\n return output\n@BACKBONES.register()\nclass YOWO(nn.Layer):\n def __init__(self, num_class, pretrained_2d=None, pretrained_3d=None):" + }, + { + "comment": "The code initializes a YOWO model with pre-trained 2D and 3D backbones, loads pre-trained weights if provided for both backbones, and has a method to initialize weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/yowo.py\":79-107", + "content": " super(YOWO, self).__init__()\n self.pretrained_2d = pretrained_2d\n self.pretrained_3d = pretrained_3d\n self.backbone_2d = Darknet()\n self.backbone_3d = ResNext101()\n self.num_ch_2d = 425\n self.num_ch_3d = 2048\n self.num_class = num_class\n self.cfam = CFAMBlock(self.num_ch_2d + self.num_ch_3d, 1024)\n self.conv_final = nn.Conv2D(\n 1024, 5 * (self.num_class + 4 + 1), kernel_size=1, bias_attr=False)\n self.seen = 0\n def init_weights(self):\n if self.pretrained_2d is not None:\n self.backbone_2d = self.load_pretrain_weight(\n self.backbone_2d, self.pretrained_2d)\n if self.pretrained_3d is not None:\n self.backbone_3d = self.load_pretrain_weight(\n self.backbone_3d, self.pretrained_3d)\n def load_pretrain_weight(self, model, weights_path):\n model_dict = model.state_dict()\n param_state_dict = paddle.load(weights_path)\n ignore_weights = set()\n # hack: fit for faster rcnn. Pretrain weights contain prefix of 'backbone'" + }, + { + "comment": "This code is replacing the prefix of 'res5' with 'bbox_head.head' in param_state_dict to load pretrain weights correctly. It then checks if the weight shapes match and adds redundant or unmatched weights to ignore_weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/yowo.py\":108-128", + "content": " # while res5 module is located in bbox_head.head. Replace the prefix of\n # res5 with 'bbox_head.head' to load pretrain weights correctly.\n for k in list(param_state_dict.keys()):\n if 'backbone.res5' in k:\n new_k = k.replace('backbone', 'bbox_head.head')\n if new_k in model_dict.keys():\n value = param_state_dict.pop(k)\n param_state_dict[new_k] = value\n for name, weight in param_state_dict.items():\n if name in model_dict.keys():\n if list(weight.shape) != list(model_dict[name].shape):\n print(\n '{} not used, shape {} unmatched with {} in model.'.format(\n name, weight.shape, list(model_dict[name].shape)))\n ignore_weights.add(name)\n else:\n print('Redundant weight {} and ignore it.'.format(name))\n ignore_weights.add(name)\n for weight in ignore_weights:" + }, + { + "comment": "This function loads model weights from the specified path and returns a Paddle Video YOWO model. 
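The shape-checking part of the weight loading described above boils down to filtering a pretrained state dict against the target model's shapes; the sketch below shows that filtering with plain NumPy arrays standing in for Paddle tensors (parameter names are hypothetical).

```python
import numpy as np

def filter_pretrained(model_dict, pretrained_dict):
    # Keep only pretrained tensors whose name and shape match the target
    # model, mirroring the weight-loading logic summarized above.
    kept, ignored = {}, []
    for name, weight in pretrained_dict.items():
        if name in model_dict and list(weight.shape) == list(model_dict[name].shape):
            kept[name] = weight
        else:
            ignored.append(name)
    return kept, ignored

model = {"conv.w": np.zeros((4, 3, 3, 3)), "fc.w": np.zeros((10, 4))}
pretrain = {"conv.w": np.ones((4, 3, 3, 3)), "fc.w": np.ones((80, 4)),
            "extra.b": np.ones(4)}
kept, ignored = filter_pretrained(model, pretrain)
print(sorted(kept), ignored)  # ['conv.w'] ['fc.w', 'extra.b']
```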
The model's `forward` method takes an input clip, separates it into 3D and 2D representations, passes them through their respective backbones, concatenates them together, and finally feeds it to CFAM and a convolutional layer for processing before returning the output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/yowo.py\":129-149", + "content": " param_state_dict.pop(weight, None)\n model.set_dict(param_state_dict)\n print('Finish loading model weights: {}'.format(weights_path))\n return model\n def forward(self, input):\n x_3d = input # Input clip\n x_2d = input[:, :, -1, :, :] # Last frame of the clip that is read\n x_2d = self.backbone_2d(x_2d)\n x_3d = self.backbone_3d(x_3d)\n x_3d = paddle.squeeze(x_3d, axis=2)\n x = paddle.concat([x_3d, x_2d], axis=1)\n x = self.cfam(x)\n out = self.conv_final(x)\n return out" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cbad7aa4-450b-4d96-94e5-3befd12c571c.json b/docs/doc/cbad7aa4-450b-4d96-94e5-3befd12c571c.json new file mode 100644 index 000000000..4480c587a --- /dev/null +++ b/docs/doc/cbad7aa4-450b-4d96-94e5-3befd12c571c.json @@ -0,0 +1,15 @@ +{ + "summary": "This code provides a guide for using OpenPose to process figure skating action data by converting videos into bone point data, suitable for model training and prediction in the PaddleVideo framework.", + "details": [ + { + "comment": "This code is a guide for processing figure skating action data using openpose, a tool for detecting body skeletons from videos. It includes instructions on how to install and test openpose, as well as specific steps for processing video data with the Skeleton Scripts project.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FigureSkating/README.md\":0-45", + "content": "# \u82b1\u6837\u6ed1\u51b0\u52a8\u4f5c\u8bc6\u522b\n---\n## \u5185\u5bb9\n- [\u89c6\u9891\u6570\u636e\u5904\u7406\u65b9\u6cd5](#\u89c6\u9891\u6570\u636e\u5904\u7406\u65b9\u6cd5)\n- [\u6a21\u578b\u8bad\u7ec3\u9884\u6d4b\u65b9\u6cd5](#\u6a21\u578b\u8bad\u7ec3\u9884\u6d4b\u65b9\u6cd5)\n
\n
\n### \u89c6\u9891\u6570\u636e\u5904\u7406\u65b9\u6cd5\n - \u63d0\u4f9b\u4ece\u89c6\u9891\u4e2d\u63d0\u53d6\u9aa8\u9abc\u70b9\u6570\u636e\u7684\u65b9\u6cd5\uff0c\u65b9\u4fbf\u7528\u6237\u81ea\u884c\u63d0\u53d6\u6570\u636e\u8fdb\u884c\u6d4b\u8bd5\u3002\n \u82b1\u6837\u6ed1\u51b0\u6570\u636e\u63d0\u53d6\u91c7\u7528\u4e86openpose\uff0c\u901a\u8fc7\u5176\u63d0\u4f9b\u7684demo\u6216\u662f\u76f8\u5e94\u7684api\u6765\u5b9e\u73b0\u6570\u636e\u7684\u63d0\u53d6\uff0c\u56e0\u6b64\u9700\u8981\u7528\u6237\u914d\u7f6eopenpose\u73af\u5883\u3002\n \u5982\u4e0b\u662f\u901a\u8fc7\u82b1\u6837\u6ed1\u51b0\u6570\u636e\u96c6\u6784\u5efa\u9879\u76ee[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)\u63d0\u53d6\u9aa8\u9abc\u70b9\u6570\u636e\u65b9\u6cd5\u7684\u5177\u4f53\u4ecb\u7ecd\u3002\n #### step1 \u5b89\u88c5openpose\n - \u53c2\u8003\uff1ahttps://github.com/CMU-Perceptual-Computing-Lab/openpose \n #### step2 \u6d4b\u8bd5openpose\u63d0\u4f9bdemo\n - \u8fd9\u91cc\u901a\u8fc7\u6d4b\u8bd5openpose\u7684demo\u7a0b\u5e8f\u6765\u9a8c\u8bc1\u662f\u5426\u5b89\u88c5\u6210\u529f\u3002\n demo1\uff1a\u68c0\u6d4b\u89c6\u9891\u4e2d\u8eab\u4f53\u9aa8\u9abc\u70b9\uff08\u4ee5linux\u7cfb\u7edf\u4e3a\u4f8b\uff09\uff1a\n ```bash\n ./build/examples/openpose/openpose.bin --video examples_video.avi --write_json output/ --display 0 --render_pose 0\n ```\n \u6267\u884c\u6210\u529f\u4e4b\u540e\u4f1a\u5728output/\u8def\u5f84\u4e0b\u751f\u6210\u89c6\u9891\u6bcf\u4e00\u5e27\u9aa8\u9abc\u70b9\u6570\u636e\u7684json\u6587\u4ef6\u3002\n demo2\uff1a\u68c0\u6d4b\u89c6\u9891\u4e2d\u8eab\u4f53+\u9762\u90e8+\u624b\u90e8\u9aa8\u9abc\u70b9\uff08\u4ee5linux\u7cfb\u7edf\u4e3a\u4f8b\uff09\uff1a\n ```bash\n ./build/examples/openpose/openpose.bin --video examples_video.avi --write_json output/ --display 0 --render_pose 0 --face --hand\n ```\n \u6267\u884c\u6210\u529f\u4e4b\u540e\u4f1a\u5728output/\u8def\u5f84\u4e0b\u751f\u6210\u89c6\u9891\u6bcf\u4e00\u5e27\u8eab\u4f53+\u9762\u90e8+\u624b\u90e8\u9aa8\u9abc\u70b9\u6570\u636e\u7684json\u6587\u4ef6\u3002\n #### step3 \u89c6\u9891\u53ca\u76f8\u5173\u4fe1\u606f\u5904\u7406\n - \u7531\u4e8e[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)\u4e3a\u5236\u4f5c\u82b1\u6837\u6ed1\u51b0\u6570\u636e\u96c6\u6240\u7528\uff0c\u56e0\u6b64\u6b64\u5904\u6b65\u9aa4\u53ef\u80fd\u5b58\u5728\u4e0d\u540c\u7a0b\u5ea6\u8bef\u5dee\uff0c\u5b9e\u9645\u8bf7\u7528\u6237\u81ea\u884c\u8c03\u8bd5\u4ee3\u7801\u3002" + }, + { + "comment": "The code is outlining the steps to convert figure skating videos into bone point data, which can then be used for model training and prediction. This involves specifying the video storage paths, extracting video information, using OpenPose to process the videos, and saving the results as npy files. Finally, users can input these npy files into a model or prediction engine. 
The code is specifically for FigureSkating application in PaddleVideo codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FigureSkating/README.md\":47-91", + "content": " \u5c06\u8981\u8f6c\u5316\u7684\u82b1\u6837\u6ed1\u51b0\u89c6\u9891\u50a8\u5b58\u5230[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)\u7684\u6307\u5b9a\u8def\u5f84\uff08\u53ef\u81ea\u884c\u521b\u5efa\uff09\uff1a\n ```bash\n ./skating2.0/skating63/\n ```\n \u540c\u65f6\u9700\u8981\u7528\u6237\u81ea\u884c\u5b8c\u6210\u5bf9\u89c6\u9891\u4fe1\u606f\u7684\u63d0\u53d6\uff0c\u4fdd\u5b58\u4e3alabel_skating63.csv\u6587\u4ef6\uff0c\u50a8\u5b58\u5230\u5982\u4e0b\u8def\u5f84\u4e2d\uff08\u53ef\u81ea\u884c\u521b\u5efa\uff09\uff1a\n ```bash\n ./skating2.0/skating63/\n ./skating2.0/skating63_openpose_result/\n ```\n label_skating63.csv\u4e2d\u683c\u5f0f\u5982\u4e0b\uff1a\n | \u52a8\u4f5c\u5206\u7c7b | \u89c6\u9891\u6587\u4ef6\u540d | \u89c6\u9891\u5e27\u6570 | \u52a8\u4f5c\u6807\u7b7e |\n | :----: | :----: | :----: | :---- |\n \u6b64\u5904\u7528\u6237\u53ea\u9700\u8981\u8f93\u5165\u89c6\u9891\u6587\u4ef6\u540d\uff08\u65e0\u9700\u540e\u7f00\uff0c\u9ed8\u8ba4\u540e\u7f00\u540d\u4e3a.mp4\uff0c\u5176\u4ed6\u683c\u5f0f\u9700\u81ea\u884c\u66f4\u6539\u4ee3\u7801)\uff0c\u5176\u4ed6\u4e09\u9879\u5b9a\u4e49\u4e3a\u7a7a\u5b57\u7b26\u4e32\u5373\u53ef\uff0c\u4e0d\u540c\u8868\u9879\u4e4b\u95f4\u901a\u8fc7 ',' \u5206\u5272\u3002\n #### step4 \u6267\u884cskating_convert.py:\n - \u6ce8\u610f\uff0c\u8fd9\u4e00\u6b65\u9700\u8981\u6839\u636e\u7528\u6237\u5bf9openpose\u7684\u914d\u7f6e\u8fdb\u884c\u4ee3\u7801\u7684\u66f4\u6539\uff0c\u4e3b\u8981\u4fee\u6539\u9879\u4e3aopenpose\u8def\u5f84\u3001openpose-demo\u8def\u5f84\u7b49\uff0c\u5177\u4f53\u8be6\u89c1\u4ee3\u7801\u3002\n \u672c\u811a\u6b65\u539f\u7406\u662f\u8c03\u7528openpose\u63d0\u4f9b\u7684demo\u63d0\u53d6\u89c6\u9891\u4e2d\u7684\u9aa8\u9abc\u70b9\uff0c\u5e76\u8fdb\u884c\u6570\u636e\u683c\u5f0f\u6e05\u6d17\uff0c\u6700\u540e\u5c06\u6bcf\u4e2a\u89c6\u9891\u7684\u63d0\u53d6\u7ed3\u679c\u7ed3\u679c\u6253\u5305\u6210json\u6587\u4ef6\uff0cjson\u6587\u4ef6\u50a8\u5b58\u5728\u5982\u4e0b\u8def\u5f84\uff1a\n ```bash\n ./skating2.0/skating63_openpose_result/label_skating63_data/\n ```\n #### step5 \u6267\u884cskating_gendata.py:\n \u5c06json\u6587\u4ef6\u6574\u7406\u4e3anpy\u6587\u4ef6\u5e76\u4fdd\u5b58\uff0c\u591a\u4e2a\u89c6\u9891\u6587\u4ef6\u5c06\u4fdd\u5b58\u4e3a\u4e00\u4e2anpy\u6587\u4ef6\uff0c\u4fdd\u5b58\u8def\u5f84\u4e3a\uff1a\n ```bash\n ./skating2.0/skating63_openpose_result/skeleton_file/\n ```\n - \u901a\u8fc7\u4e0a\u8ff0\u6b65\u9aa4\u5c31\u53ef\u4ee5\u5c06\u89c6\u9891\u6570\u636e\u8f6c\u5316\u4e3a\u65e0\u6807\u7b7e\u7684\u9aa8\u9abc\u70b9\u6570\u636e\u3002\n - \u6700\u540e\u7528\u6237\u53ea\u9700\u5c06npy\u6570\u636e\u8f93\u5165\u9001\u5165\u7f51\u7edc\u5f00\u59cb\u6a21\u578b\u6d4b\u8bd5\uff0c\u4ea6\u53ef\u901a\u8fc7\u9884\u6d4b\u5f15\u64ce\u63a8\u7406\u3002\n ### \u6a21\u578b\u8bad\u7ec3\u9884\u6d4b\u65b9\u6cd5\n \u6a21\u578b\u4f7f\u7528\u65b9\u6cd5\u53c2\u8003[ST-GCN\u6a21\u578b\u6587\u6863](../../docs/zh-CN/model_zoo/recognition/stgcn.md)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cc552703-8a7a-499d-bbad-f142b2601dbb.json b/docs/doc/cc552703-8a7a-499d-bbad-f142b2601dbb.json new file mode 100644 index 000000000..6682af630 --- /dev/null +++ b/docs/doc/cc552703-8a7a-499d-bbad-f142b2601dbb.json @@ -0,0 +1,35 @@ +{ + "summary": "This code includes the AttrDict class and functions for managing config files in PaddleVideo. 
These functions handle dictionary creation, config file parsing, recursive printing, and value overriding.", + "details": [ + { + "comment": "This code file contains the definition of a class AttrDict, which is used to handle configurations in the PaddleVideo library. It also sets up a logger for logging information related to PaddleVideo. The __all__ variable holds the list of functions/classes that are exported by this module. This file is part of PaddleVideo's utility package.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py\":0-34", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os\nimport yaml\nfrom paddlevideo.utils.logger import coloring, get_logger, setup_logger\n__all__ = ['get_config']\nlogger = setup_logger(\"./\", name=\"paddlevideo\", level=\"INFO\")\nclass AttrDict(dict):\n \"\"\"Attr Dict\"\"\"\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value" + }, + { + "comment": "This code defines three functions: \"create_attr_dict\", \"parse_config\", and \"print_dict\". The \"create_attr_dict\" function converts specific values in a dictionary to AttrDict objects. The \"parse_config\" function loads a configuration file into an AttrDict object after applying the create_attr_dict function to it. Finally, the \"print_dict\" function recursively prints out the contents of a dictionary, indented based on their relationships.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py\":37-70", + "content": "def create_attr_dict(yaml_config):\n \"\"\"create attr dict\"\"\"\n from ast import literal_eval\n for key, value in yaml_config.items():\n if type(value) is dict:\n yaml_config[key] = value = AttrDict(value)\n if isinstance(value, str):\n try:\n value = literal_eval(value)\n except BaseException:\n pass\n if isinstance(value, AttrDict):\n create_attr_dict(yaml_config[key])\n else:\n yaml_config[key] = value\ndef parse_config(cfg_file):\n \"\"\"Load a config file into AttrDict\"\"\"\n with open(cfg_file, 'r') as fopen:\n yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))\n create_attr_dict(yaml_config)\n return yaml_config\ndef print_dict(d, delimiter=0):\n \"\"\"\n Recursively visualize a dict and\n indenting acrrording by the relationship of keys.\n \"\"\"\n placeholder = \"-\" * 60\n for k, v in sorted(d.items()):\n if isinstance(v, dict):\n logger.info(\"{}{} : \".format(delimiter * \" \", coloring(k," + }, + { + "comment": "This code defines functions to print and check config files. The \"print_config\" function visualizes the config file by printing its content in a structured format, while the \"check_config\" function is currently a placeholder with no implementation. 
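A simplified, standalone version of the attribute-style dictionary described above (the real `AttrDict` also special-cases keys already present in `__dict__`):

```python
class AttrDict(dict):
    # Minimal attribute-style dict: d.key reads and writes d["key"].
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as e:
            raise AttributeError(key) from e

    def __setattr__(self, key, value):
        self[key] = value

cfg = AttrDict({"MODEL": AttrDict({"name": "ppTSM", "num_seg": 8})})
print(cfg.MODEL.name, cfg.MODEL.num_seg)  # ppTSM 8
cfg.epochs = 20
print(cfg["epochs"])                      # 20
```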
The \"override\" function allows recursive replacement of values within a dictionary or list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py\":71-109", + "content": " \"HEADER\")))\n print_dict(v, delimiter + 4)\n elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n logger.info(\"{}{} : \".format(delimiter * \" \",\n coloring(str(k), \"HEADER\")))\n for value in v:\n print_dict(value, delimiter + 4)\n else:\n logger.info(\"{}{} : {}\".format(delimiter * \" \",\n coloring(k, \"HEADER\"),\n coloring(v, \"OKGREEN\")))\n if k.isupper():\n logger.info(placeholder)\ndef print_config(config):\n \"\"\"\n visualize configs\n Arguments:\n config: configs\n \"\"\"\n print_dict(config)\ndef check_config(config):\n \"\"\"\n Check config\n \"\"\"\n pass\ndef override(dl, ks, v):\n \"\"\"\n Recursively replace dict of list\n Args:\n dl(dict or list): dict or list to be replaced\n ks(list): list of keys\n v(str): value to be replaced" + }, + { + "comment": "This code defines a function `override_config` which takes in a config and optionally an options parameter. It checks if the config is either a list or dictionary, and ensures that there are keys in the config. If the config is a list, it uses the `str2num` function to convert the first key into a number and then uses this index to set the corresponding value. If there's only one key, it checks if the index is within range before setting the value. If there are multiple keys, it calls the `override` function with the first key, remaining keys, and value. If the config is a dictionary, it checks if the first key exists in the dictionary. If it doesn't, it logs a warning about a new field being detected. It then sets the value using the first key or calls `override` for subsequent keys.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py\":110-141", + "content": " \"\"\"\n def str2num(v):\n \"\"\"str2num\"\"\"\n try:\n return eval(v)\n except Exception:\n return v\n assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n assert len(ks) > 0, ('lenght of keys should larger than 0')\n if isinstance(dl, list):\n k = str2num(ks[0])\n if len(ks) == 1:\n assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n dl[k] = str2num(v)\n else:\n override(dl[k], ks[1:], v)\n else:\n if len(ks) == 1:\n #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n if not ks[0] in dl:\n logger.warning('A new filed ({}, {}) detected!'.format(ks[0], dl))\n dl[ks[0]] = str2num(v)\n else:\n assert ks[0] in dl, (\n '({}) doesn\\'t exist in {}, a new dict field is invalid'.format(\n ks[0], dl))\n override(dl[ks[0]], ks[1:], v)\ndef override_config(config, options=None):\n \"\"\"" + }, + { + "comment": "This code defines a function that recursively overrides the config with given options. It takes a dictionary (config) and a list of key-value pairs (options) as arguments, and returns the updated config after overriding. The function checks if the options are provided and in the correct format. If so, it splits the key-value pair, extracts the keys and values, and recursively overrides the config with these values. Finally, it returns the updated config. 
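A minimal sketch of the dotted-key override mechanism described above; unlike the real `override`, it skips the `str2num` conversion and the assertion checks, so assigned values stay strings.

```python
def override(container, keys, value):
    # Walk a nested dict/list along dotted keys and set the final entry,
    # converting numeric path segments to list indices.
    key = keys[0]
    if isinstance(container, list):
        key = int(key)
    if len(keys) == 1:
        container[key] = value
    else:
        override(container[key], keys[1:], value)

def apply_overrides(config, options):
    # options look like: "PIPELINE.train.transform.1.ResizeImage.resize_short=300"
    for opt in options:
        key, value = opt.split("=", 1)
        override(config, key.split("."), value)
    return config

cfg = {"epochs": 10,
       "PIPELINE": {"train": {"transform": [
           {"Crop": {}}, {"ResizeImage": {"resize_short": 256}}]}}}
apply_overrides(cfg, ["epochs=20",
                      "PIPELINE.train.transform.1.ResizeImage.resize_short=300"])
print(cfg["epochs"],
      cfg["PIPELINE"]["train"]["transform"][1]["ResizeImage"]["resize_short"])
# 20 300
```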
The code also includes a separate function that reads the config from a file and has optional parameters for overrides and displaying information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py\":142-173", + "content": " Recursively override the config\n Args:\n config(dict): dict to be replaced\n options(list): list of pairs(key0.key1.idx.key2=value)\n such as: [\n epochs=20',\n 'PIPELINE.train.transform.1.ResizeImage.resize_short=300'\n ]\n Returns:\n config(dict): replaced config\n \"\"\"\n if options is not None:\n for opt in options:\n assert isinstance(opt,\n str), (\"option({}) should be a str\".format(opt))\n assert \"=\" in opt, (\n \"option({}) should contain a =\"\n \"to distinguish between key and value\".format(opt))\n pair = opt.split('=')\n assert len(pair) == 2, (\"there can be only a = in the option\")\n key, value = pair\n keys = key.split('.')\n override(config, keys, value)\n return config\ndef get_config(fname, overrides=None, show=True):\n \"\"\"\n Read config from file\n \"\"\"\n assert os.path.exists(fname), ('config file({}) is not exist'.format(fname))" + }, + { + "comment": "This function parses a configuration file, applies any overrides, displays the config if requested, and checks its validity before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py\":174-179", + "content": " config = parse_config(fname)\n override_config(config, overrides)\n if show:\n print_config(config)\n check_config(config)\n return config" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cc63a58f-7f9a-405d-ab67-aa85d3dbc875.json b/docs/doc/cc63a58f-7f9a-405d-ab67-aa85d3dbc875.json new file mode 100644 index 000000000..5acf5000d --- /dev/null +++ b/docs/doc/cc63a58f-7f9a-405d-ab67-aa85d3dbc875.json @@ -0,0 +1,10 @@ +{ + "summary": "This code provides instructions to run a script that executes a series of benchmark tests for the TimeSformer model in PaddleVideo. The provided script, \"run_all.sh\", performs several steps including switching to a specific branch (benchmark_dev), installing dependencies, downloading and uncompressing data, and then running various benchmarks with different parameters using another script named \"run_benchmark.sh\".", + "details": [ + { + "comment": "This code provides instructions to run a script that executes a series of benchmark tests for the TimeSformer model in PaddleVideo. The provided script, \"run_all.sh\", performs several steps including switching to a specific branch (benchmark_dev), installing dependencies, downloading and uncompressing data, and then running various benchmarks with different parameters using another script named \"run_benchmark.sh\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/benchmark/TimeSformer/README.md\":0-13", + "content": "\u6267\u884c\n```bash\nbash ./run_all.sh down_data\n```\n\u5373\u53ef\u8fd0\u884c.\nrun_all.sh\u5185\u90e8\u7684\u6267\u884c\u6b65\u9aa4\uff1a\n1. cd \u5230 ../../ (\u4e5f\u5c31\u662f PaddleVideo \u76ee\u5f55)\n2. \u5207\u6362\u5230benchmark_dev\u5206\u652f\n3. \u5b89\u88c5 PaddleVideo \u6240\u9700\u4f9d\u8d56\n4. cd \u56dePaddleVideo/data/ucf101\n5. wget\u4e0b\u8f7d\u6570\u636e\u96c6\u5e76\u89e3\u538b\u7f29\uff0c\u5e76\u4e0b\u8f7d\u9884\u8bad\u7ec3\u6743\u91cd\u653e\u5230data\u76ee\u5f55\u4e0b\n6. \u518d\u6b21cd \u56de\u5230 ../../ (\u4e5f\u5c31\u662f PaddleVideo \u76ee\u5f55)\n8. 
\u6309\u7167\u4e0d\u540c\u7684\u53c2\u6570\u6267\u884c run_benchmark.sh \u811a\u672c" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ccdce669-2f90-4127-9495-93e9ef0f9b0e.json b/docs/doc/ccdce669-2f90-4127-9495-93e9ef0f9b0e.json new file mode 100644 index 000000000..bfd6ceb44 --- /dev/null +++ b/docs/doc/ccdce669-2f90-4127-9495-93e9ef0f9b0e.json @@ -0,0 +1,20 @@ +{ + "summary": "The BMNDataset class handles video datasets for action localization, initializing with file path, pipeline, and subset information. It loads data, sorts by name, and returns features, ground truth IOU map, and start frame indices for training. The function also prepares test data given an index by processing through the pipeline and returning selected results.", + "details": [ + { + "comment": "This code snippet defines the BMNDataset class for video datasets used in action localization. It imports necessary modules, registers the class with the DATASETS registry, and initializes the dataset with file path, pipeline, and subset information. Logger is also defined for logging purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/bmn_dataset.py\":0-35", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy\nimport json\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass BMNDataset(BaseDataset):\n \"\"\"Video dataset for action localization.\n \"\"\"\n def __init__(\n self,\n file_path,\n pipeline,\n subset,\n **kwargs,\n ):\n self.subset = subset\n super().__init__(file_path, pipeline, **kwargs)" + }, + { + "comment": "This function is loading an index file to get video information and then sorts the data by video name. 
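A standalone sketch of the `load_file` logic summarized above: filter annotations by subset, sort by video name, and attach a running `video_idx`; the annotation dict here is made up for illustration.

```python
annos = {
    "v_B": {"subset": "validation", "duration_second": 120.0},
    "v_A": {"subset": "training",   "duration_second": 100.0},
    "v_C": {"subset": "training",   "duration_second": 90.0},
}

def load_info(annos, subset="training"):
    info = [
        {"video_name": name, "video_info": meta}
        for name, meta in annos.items()
        if subset in meta["subset"]
    ]
    info.sort(key=lambda elem: elem["video_name"])
    for idx, elem in enumerate(info):
        elem["video_idx"] = idx
    return info

print([(e["video_name"], e["video_idx"]) for e in load_info(annos)])
# [('v_A', 0), ('v_C', 1)]
```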
It also adds a video_idx attribute to each element in the list and returns the video features, ground truth IOU map, and start frame indices for training purposes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/bmn_dataset.py\":37-63", + "content": " def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n annos = json.load(open(self.file_path))\n for video_name in annos.keys():\n video_subset = annos[video_name][\"subset\"]\n if self.subset in video_subset:\n info.append(\n dict(\n video_name=video_name,\n video_info=annos[video_name],\n ))\n #sort by video_name\n sort_f = lambda elem: elem['video_name']\n info.sort(key=sort_f)\n #add video_idx to info\n for idx, elem in enumerate(info):\n info[idx]['video_idx'] = idx\n logger.info(\"{} subset video numbers: {}\".format(\n self.subset, len(info)))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID: Prepare data for training/valid given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n return results['video_feat'], results['gt_iou_map'], results['gt_start'],\\" + }, + { + "comment": "This function prepares test data given an index by copying the dataset info, processing it through the pipeline, and returning selected results (video_feat, gt_iou_map, gt_start, gt_end, video_idx).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/bmn_dataset.py\":64-71", + "content": " results['gt_end']\n def prepare_test(self, idx):\n \"\"\"TEST: Prepare the data for test given the index.\"\"\"\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n return results['video_feat'], results['gt_iou_map'], results['gt_start'], \\\n results['gt_end'], results['video_idx']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cd0367d0-1762-4da5-9fc5-3268989e55a1.json b/docs/doc/cd0367d0-1762-4da5-9fc5-3268989e55a1.json new file mode 100644 index 000000000..ee2f858ea --- /dev/null +++ b/docs/doc/cd0367d0-1762-4da5-9fc5-3268989e55a1.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a semi-Video Object Segmentation abstract base class with train_step, valid_step, and test_step methods for different modes (train, valid, test, or infer). Subclasses must implement 4 methods for model training and evaluation.", + "details": [ + { + "comment": "This code defines an abstract base class for semi-Video Object Segmentation. It has three required methods: train_step, valid_step, and test_step. The class also contains backbone, head, and loss modules to extract feature, process feature, and define the loss function respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/base.py\":0-28", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nfrom ... 
import builder\nimport paddle.nn as nn\nclass BaseSegment(nn.Layer):\n \"\"\"Base class for semi-Video Object Segmentation.\n All subclass should overwrite:\n - Methods:``train_step``, supporting to forward when training.\n - Methods:``valid_step``, supporting to forward when validating.\n - Methods:``test_step``, supporting to forward when testing.\n Args:\n backbone (dict): Backbone modules to extract feature.\n head (dict): Head to process feature.\n loss(dict): Loss function." + }, + { + "comment": "This code initializes a segment model by building its backbone, head, and loss based on the provided parameters. The forward method defines how the model runs in different modes (train, valid, test, or infer). If running in train mode, the model performs training operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/base.py\":29-56", + "content": " \"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):\n self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n if loss is not None:\n self.loss = builder.build_loss(loss)\n else:\n self.loss = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py\n \"\"\"\n if mode == 'train':" + }, + { + "comment": "The code defines an abstract class with four methods (train_step, val_step, test_step, and infer_step) that must be implemented by subclasses. The method name is chosen based on the mode input parameter for different phases of model training or evaluation. If an unsupported mode is passed, a NotImplementedError is raised.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/base.py\":57-89", + "content": " return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch, **kwargs):\n \"\"\"Training step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch, **kwargs):\n \"\"\"Validating step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch, **kwargs):\n \"\"\"Test step.\n \"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch, **kwargs):\n \"\"\"Infer step.\n \"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cd86fd79-9925-4ba8-b715-582e518aa42e.json b/docs/doc/cd86fd79-9925-4ba8-b715-582e518aa42e.json new file mode 100644 index 000000000..fcb48811f --- /dev/null +++ b/docs/doc/cd86fd79-9925-4ba8-b715-582e518aa42e.json @@ -0,0 +1,75 @@ +{ + "summary": "The code defines a TSM ResNet backbone class for feature extraction in Temporal Segment Networks, with customizable depth, segments, and pretrained options. 
It applies temporal shift modules and convolutions across various ResNet models (18-152 layers).", + "details": [ + { + "comment": "This code imports necessary libraries, defines a class for a TSM ResNet backbone, and includes functions for loading pre-trained models. The code also contains a license notice and copyright information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport math\nimport sys\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom paddle.regularizer import L2Decay\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils.save_load import load_ckpt\n# Download URL of pretrained model" + }, + { + "comment": "This code defines a class called \"ConvBNLayer\" that combines Conv2D and BatchNorm2D layers, taking input and output channel counts, kernel size, and stride as arguments. It also includes a dictionary of pre-trained model URLs for ResNet variations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":31-53", + "content": "# {\n# \"ResNet50_vd\":\n# \"wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\",\n# \"ResNet101_vd\":\n# \"https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams\",\n# \"ResNet18_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams\",\n# \"ResNet34_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet34_vd_ssld_pretrained.pdparams\",\n# \"ResNet152_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams\",\n# \"ResNet200_vd\":\n# \"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams\",\n# }\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1." + }, + { + "comment": "The code defines a class \"ConvBNLayer\" with parameters such as in_channels, out_channels, kernel_size, stride, groups, is_tweaks_mode, act, and name. It also adds an average pooling layer before the convolution for ResNet-D 1/2 tweak, which works well in practice.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":54-77", + "content": " groups (int): Groups in the Conv2D, Default: 1.\n is_tweaks_mode (bool): switch for tweaks. 
Default: False.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n #ResNet-D 1/2:add a 2\u00d72 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n self._pool2d_avg = AvgPool2D(kernel_size=2,\n stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels," + }, + { + "comment": "This code defines a Convolutional Neural Network (CNN) layer with Batch Normalization and optionally activation function. The layer can have tweaks mode for pooling and average pooling operations. It also includes a forward method for passing inputs through the defined layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":78-104", + "content": " out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(\n out_channels,\n weight_attr=ParamAttr(name=bn_name + \"_scale\",\n regularizer=L2Decay(0.0)),\n bias_attr=ParamAttr(bn_name + \"_offset\", regularizer=L2Decay(0.0)))\n def forward(self, inputs):\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)" + }, + { + "comment": "The code defines a BottleneckBlock class with in_channels, out_channels, stride, shortcut (optional), if_first (boolean), and num_seg as parameters. It initializes ConvBNLayer objects for conv0, conv1, and conv2 layers. The BottleneckBlock is a part of the ResNet architecture with tweaks and TSM.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":105-131", + "content": " return y\nclass BottleneckBlock(nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n if_first=False,\n num_seg=8,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4," + }, + { + "comment": "This code defines a class with an initializer and a forward method. The initializer sets the number of segments (num_seg) and whether to use shortcut connection. 
The forward method reshapes input, pads it based on segment numbers, and likely performs some computations for Temporal Segment Networks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":132-158", + "content": " kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=1,\n is_tweaks_mode=False if if_first else True,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n if paddle.is_compiled_with_custom_device('npu'):\n x = inputs\n seg_num = self.num_seg\n shift_ratio = 1.0 / self.num_seg\n shape = x.shape #[N*T, C, H, W]\n reshape_x = x.reshape(\n (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W]\n pad_x = F.pad(reshape_x, [\n 0,\n 0," + }, + { + "comment": "This code defines a function and a class, both part of a ResNet backbone model. The function takes in an input tensor and applies temporal shifts, convolutions, and shortcut connections to form the output. The BasicBlock class initializes a basic block layer with convolutional layers and a shortcut connection.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":159-191", + "content": " 1,\n 1,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n ]) #[N, T+2, C, H, W]\n c1 = int(shape[1] * shift_ratio)\n c2 = int(shape[1] * 2 * shift_ratio)\n slice1 = pad_x[:, :seg_num, :c1, :, :]\n slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :]\n slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :]\n concat_x = paddle.concat([slice1, slice2, slice3],\n axis=2) #[N, T, C, H, W]\n shifts = concat_x.reshape(shape)\n else:\n shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.leaky_relu(y)\nclass BasicBlock(nn.Layer):\n def __init__(self," + }, + { + "comment": "This code defines a BasicBlock class for ResNet TSM model, with parameters including input and output channels, stride, shortcut flag, and number of segments. It initializes instance variables and creates convolution layers (conv0, conv1) for feature extraction. If not using shortcut connections, it also initializes a short layer for residual connections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":192-216", + "content": " in_channels,\n out_channels,\n stride,\n shortcut=True,\n num_seg=8,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.num_seg = num_seg\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"leaky_relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1," + }, + { + "comment": "The code defines a ResNet TSM backbone model. It has an init function that initializes the model with specified depth, number of segments, and pretrained options. 
The forward function applies temporal shift module, convolution, shortcut connection if applicable, and Leaky ReLU activation for feature extraction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":217-252", + "content": " stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n # add temporal shift module\n shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.leaky_relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTweaksTSM(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, pretrained=None):\n super(ResNetTweaksTSM, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\" + }, + { + "comment": "The code defines a ResNet model with different depths based on the specified number of layers. It initializes the layers, including a 7x7 convolution and multiple 3x3 convolutions, as well as Batch Normalization and Leaky ReLU activation functions. The model structure is determined by the input layer size, with supported layers ranging from 18 to 152.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":253-278", + "content": " \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n #ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n self.conv1_1 = ConvBNLayer(in_channels=3,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='leaky_relu',\n name=\"conv1_1\")\n self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1," + }, + { + "comment": "This code defines a ResNet backbone with TSM tweaks. It includes convolutional layers, Batch Normalization, Leaky ReLU activation functions, and max pooling. The block_list is initialized and the structure of the network adapts depending on the specified layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":279-300", + "content": " act='leaky_relu',\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='leaky_relu',\n name=\"conv1_3\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:\n conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)" + }, + { + "comment": "Iterates through ResNet blocks and Bottleneck blocks, assigning in_channels based on previous block's out_channels. Inserts each block into the block_list. Adjusts shortcut value accordingly. 
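The shift that both forward passes above depend on is easier to see outside the framework: a fixed fraction of channels is moved one step backward in time, another fraction one step forward, and the rest are left alone. Below is a minimal NumPy sketch of that operation under the same [N*T, C, H, W] layout assumed by the quoted code; the function name and toy shapes are illustrative only, and the real model uses paddle.nn.functional.temporal_shift (or the pad/slice/concat branch shown above on NPU).

```python
import numpy as np

def temporal_shift(x, num_seg, shift_ratio):
    """Shift a fraction of channels one step backward/forward in time.

    x: array of shape [N*T, C, H, W]. This is a NumPy sketch of the idea
    behind the quoted code, not PaddleVideo's API.
    """
    nt, c, h, w = x.shape
    n = nt // num_seg
    x = x.reshape(n, num_seg, c, h, w)                     # [N, T, C, H, W]
    fold = int(c * shift_ratio)

    out = np.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                   # shift left: frame t sees t+1
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]   # shift right: frame t sees t-1
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # remaining channels unchanged
    return out.reshape(nt, c, h, w)

# toy check: 2 clips, 8 segments, 16 channels, 4x4 feature maps
y = temporal_shift(np.random.rand(2 * 8, 16, 4, 4).astype("float32"),
                   num_seg=8, shift_ratio=1.0 / 8)
print(y.shape)  # (16, 16, 4, 4)
```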
Initializes in_channels as 64 for specified blocks if depth is not defined.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":301-321", + "content": " bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' %\n (block, i), #same with PaddleClas, for loading pretrain\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name))\n in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n in_channels = [64, 64, 128, 256]\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)" + }, + { + "comment": "This code defines a function for initializing weights in the ResNet TSMBackbone. It loads pre-trained weights if a loading path is specified or uses specific initialization methods otherwise, with Conv2D layers using KaimingNormal and BatchNorm2d layers using Constant initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":322-338", + "content": " basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n num_seg=self.num_seg,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function." + }, + { + "comment": "This function initializes the backbone's weights, with Kaiming Normal distribution for Conv2D layers and constant value 1 for BatchNorm2D. 
The forward function defines how the backbone processes inputs through a series of convolutions and pooling, then passes the result to each block in the block_list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":339-361", + "content": " Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n # no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cea7c6ee-4308-4274-ace8-0fb1637d5d62.json b/docs/doc/cea7c6ee-4308-4274-ace8-0fb1637d5d62.json new file mode 100644 index 000000000..344c0d3c6 --- /dev/null +++ b/docs/doc/cea7c6ee-4308-4274-ace8-0fb1637d5d62.json @@ -0,0 +1,60 @@ +{ + "summary": "The GaussianSmoothing class in PaddlePaddle applies 1D gaussian smoothing for image processing tasks, and the code initializes weights and biases for a neural network layer with Kaiming Uniform method.", + "details": [ + { + "comment": "This code defines a GaussianSmoothing class in PaddlePaddle for applying gaussian smoothing on 1D tensors. It uses depthwise convolution to filter each channel separately, with input and output channels remaining the same. The kernel size can be specified as an integer or sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# https://github.com/yiskw713/asrf/libs/postprocess.py\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nimport math\nclass GaussianSmoothing(nn.Layer):\n \"\"\"\n Apply gaussian smoothing on a 1d tensor.\n Filtering is performed seperately for each channel\n in the input using a depthwise convolution.\n Arguments:\n channels (int, sequence): Number of channels of the input tensors. Output will\n have this number of channels as well.\n kernel_size (int, sequence): Size of the gaussian kernel." + }, + { + "comment": "This code initializes a Gaussian kernel with specified kernel size and sigma. 
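The Gaussian smoothing summarized here amounts to building a 1D Gaussian kernel and sliding it over each channel with reflect padding. A small NumPy sketch of that idea follows; the helper names are invented for illustration, and the quoted Paddle class applies the same kernel through a depthwise conv1d instead.

```python
import math
import numpy as np

def gaussian_kernel(kernel_size=15, sigma=1.0):
    """1D Gaussian kernel, matching the meshgrid/mean/exp construction quoted here."""
    x = np.arange(kernel_size, dtype=np.float32)
    mean = (kernel_size - 1) / 2.0
    return np.exp(-((x - mean) / sigma) ** 2 / 2.0) / (sigma * math.sqrt(2 * math.pi))

def smooth_1d(signal, kernel):
    """Reflect-pad a 1D signal and correlate it with the kernel (one channel at a time)."""
    pad = (len(kernel) - 1) // 2
    padded = np.pad(signal, pad, mode="reflect")
    return np.correlate(padded, kernel, mode="valid")

probs = np.random.rand(64).astype(np.float32)   # e.g. frame-wise boundary probabilities
smoothed = smooth_1d(probs, gaussian_kernel())
print(probs.shape, smoothed.shape)              # (64,) (64,)
```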
The kernel is then applied to the input during forward pass to filter the data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":30-61", + "content": " sigma (float, sequence): Standard deviation of the gaussian kernel.\n \"\"\"\n def __init__(self, kernel_size=15, sigma=1.0):\n super().__init__()\n self.kernel_size = kernel_size\n # The gaussian kernel is the product of the\n # gaussian function of each dimension.\n kernel = 1\n meshgrid = paddle.arange(kernel_size)\n meshgrid = paddle.cast(meshgrid, dtype='float32')\n mean = (kernel_size - 1) / 2\n kernel = kernel / (sigma * math.sqrt(2 * math.pi))\n kernel = kernel * paddle.exp(-(((meshgrid - mean) / sigma)**2) / 2)\n # Make sure sum of values in gaussian kernel equals 1.\n # kernel = kernel / paddle.max(kernel)\n self.kernel = paddle.reshape(kernel, [1, 1, -1])\n def forward(self, inputs):\n \"\"\"\n Apply gaussian filter to input.\n Arguments:\n input (paddle.Tensor): Input to apply gaussian filter on.\n Returns:\n filtered (paddle.Tensor): Filtered output.\n \"\"\"\n _, c, _ = inputs.shape" + }, + { + "comment": "This code defines a convolution operation and an argrelmax function for image processing. The conv1d function performs 1D convolutions on the input tensor, with padding, kernel expansion, and return as output. The argrelmax function calculates the arguments of relative maxima in boundary probability maps, ignoring values below a certain threshold. This code seems to be related to image segmentation or edge detection tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":62-94", + "content": " inputs = F.pad(inputs,\n pad=((self.kernel_size - 1) // 2,\n (self.kernel_size - 1) // 2),\n mode=\"reflect\",\n data_format='NCL')\n kernel = paddle.expand(self.kernel, shape=[c, 1, self.kernel_size])\n return F.conv1d(inputs, weight=kernel, groups=c)\ndef argrelmax(prob, threshold=0.7):\n \"\"\"\n Calculate arguments of relative maxima.\n prob: np.array. boundary probability maps distributerd in [0, 1]\n prob shape is (T)\n ignore the peak whose value is under threshold\n Return:\n Index of peaks for each batch\n \"\"\"\n # ignore the values under threshold\n prob[prob < threshold] = 0.0\n # calculate the relative maxima of boundary maps\n # treat the first frame as boundary\n peak = np.concatenate(\n [\n np.ones((1), dtype=np.bool),\n (prob[:-2] < prob[1:-1]) & (prob[2:] < prob[1:-1]),\n np.zeros((1), dtype=np.bool),\n ],\n axis=0,\n )" + }, + { + "comment": "The code provides functions to convert tensors into probabilities or labels. The 'is_probability' function checks if a tensor is in the form of sigmoid or softmax outputs and returns True/False accordingly. The 'convert2probability' function converts tensors into probabilities based on whether they are sigmoid or softmax outputs. 
Lastly, 'convert2label' function converts tensors (2D or 3D) into labels by either casting to int64 directly for 2D or first converting the tensor to probability and then finding the index of maximum value along the appropriate axis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":96-145", + "content": " peak_idx = np.where(peak)[0].tolist()\n return peak_idx\ndef is_probability(x):\n assert x.ndim == 3\n if x.shape[1] == 1:\n # sigmoid\n if x.min() >= 0 and x.max() <= 1:\n return True\n else:\n return False\n else:\n # softmax\n _sum = np.sum(x, axis=1).astype(np.float32)\n _ones = np.ones_like(_sum, dtype=np.float32)\n return np.allclose(_sum, _ones)\ndef convert2probability(x):\n \"\"\"\n Args: x (N, C, T)\n \"\"\"\n assert x.ndim == 3\n if is_probability(x):\n return x\n else:\n if x.shape[1] == 1:\n # sigmoid\n prob = 1 / (1 + np.exp(-x))\n else:\n # softmax\n prob = np.exp(x) / np.sum(np.exp(x), axis=1)\n return prob.astype(np.float32)\ndef convert2label(x):\n assert x.ndim == 2 or x.ndim == 3\n if x.ndim == 2:\n return x.astype(np.int64)\n else:\n if not is_probability(x):\n x = convert2probability(x)\n label = np.argmax(x, axis=1)\n return label.astype(np.int64)" + }, + { + "comment": "This function refines the segmented action outputs based on boundary predictions, and performs majority vote to decide class labels. The inputs include model output (outputs) for frame-level class prediction, boundary prediction (boundaries), and an optional threshold (boundary_threshold). It converts outputs and boundaries into label and probability format respectively. For each sequence, it finds the indices of maximum boundary values above the threshold, appends the last action end index, then performs majority vote on each interval between adjacent max boundaries. The function returns the final class prediction considering boundaries in a numpy array format (preds).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":148-175", + "content": "def refinement_with_boundary(outputs, boundaries, boundary_threshold):\n \"\"\"\n Get segments which is defined as the span b/w two boundaries,\n and decide their classes by majority vote.\n Args:\n outputs: numpy array. shape (N, C, T)\n the model output for frame-level class prediction.\n boundaries: numpy array. shape (N, 1, T)\n boundary prediction.\n boundary_threshold: the threshold of the size of action segments. float(default=0.7)\n Return:\n preds: np.array. shape (N, T)\n final class prediction considering boundaries.\n \"\"\"\n preds = convert2label(outputs)\n boundaries = convert2probability(boundaries)\n for i, (output, pred, boundary) in enumerate(zip(outputs, preds,\n boundaries)):\n idx = argrelmax(boundary[0, :], threshold=boundary_threshold)\n # add the index of the last action ending\n T = pred.shape[0]\n idx.append(T)\n # majority vote\n for j in range(len(idx) - 1):" + }, + { + "comment": "This code segment performs action segmentation by detecting the majority class in each chunk and relabeling smaller action segments with their previous action segment. It uses numpy's bincount and where functions to find majority classes, and has separate logic for cases with multiple majority classes depending on the dimension of outputs. 
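The refinement just described splits the timeline at boundary peaks and relabels every span with the majority class of the frame-wise predictions. The NumPy sketch below shows only that majority vote, assuming the boundary indices have already been found (for example by the argrelmax helper summarized earlier); the probability-based tie-breaking from the quoted code is omitted for brevity, and the function name is illustrative.

```python
import numpy as np

def majority_vote_refine(frame_labels, boundary_idx):
    """Relabel each span between consecutive boundaries with its majority class.

    frame_labels: int array of shape (T,), frame-wise argmax predictions.
    boundary_idx: sorted segment start indices, e.g. from an argrelmax-style peak picker.
    """
    refined = frame_labels.copy()
    edges = list(boundary_idx) + [len(frame_labels)]   # append the last segment end
    for start, end in zip(edges[:-1], edges[1:]):
        if end <= start:
            continue
        counts = np.bincount(refined[start:end])
        refined[start:end] = counts.argmax()           # ties resolved to the smallest label here
    return refined

pred = np.array([0, 0, 1, 0, 0, 2, 2, 1, 2, 2])
print(majority_vote_refine(pred, boundary_idx=[0, 5]))  # [0 0 0 0 0 2 2 2 2 2]
```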
The results are stored in preds array.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":176-202", + "content": " count = np.bincount(pred[idx[j]:idx[j + 1]])\n modes = np.where(count == count.max())[0]\n if len(modes) == 1:\n mode = modes\n else:\n if outputs.ndim == 3:\n # if more than one majority class exist\n prob_sum_max = 0\n for m in modes:\n prob_sum = output[m, idx[j]:idx[j + 1]].sum()\n if prob_sum_max < prob_sum:\n mode = m\n prob_sum_max = prob_sum\n else:\n # decide first mode when more than one majority class\n # have the same number during oracle experiment\n mode = modes[0]\n preds[i, idx[j]:idx[j + 1]] = mode\n return preds\ndef relabeling(outputs, theta_t):\n \"\"\"\n Relabeling small action segments with their previous action segment\n Args:\n output: the results of action segmentation. (N, T) or (N, C, T)" + }, + { + "comment": "The code defines two functions: \"relabel\" and \"smoothing\". The relabel function takes predicted action segment labels, applies a threshold to merge adjacent segments with overlapping actions, and returns the relabeled output. The smoothing function applies a Gaussian filter to frame-wise action probabilities, resulting in final predictions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":203-241", + "content": " theta_t: the threshold of the size of action segments.\n Return:\n relabeled output. (N, T)\n \"\"\"\n preds = convert2label(outputs)\n for i in range(preds.shape[0]):\n # shape (T,)\n last = preds[i][0]\n cnt = 1\n for j in range(1, preds.shape[1]):\n if last == preds[i][j]:\n cnt += 1\n else:\n if cnt > theta_t:\n cnt = 1\n last = preds[i][j]\n else:\n preds[i][j - cnt:j] = preds[i][j - cnt - 1]\n cnt = 1\n last = preds[i][j]\n if cnt <= theta_t:\n preds[i][j - cnt:j] = preds[i][j - cnt - 1]\n return preds\ndef smoothing(outputs, filter_func):\n \"\"\"\n Smoothing action probabilities with gaussian filter.\n Args:\n outputs: frame-wise action probabilities. (N, C, T)\n Return:\n predictions: final prediction. (N, T)\n \"\"\"\n outputs = convert2probability(outputs)" + }, + { + "comment": "This code is implementing ASRF post-processing for refining action boundary and classification. It takes in outputs_cls (action segmentation results), outputs_boundary (action boundary probability), refinement_method, boundary_threshold, theta_t (threshold of the size of action segments), and kernel_size as arguments. The code applies three processing steps: \"refinement_with_boundary\", \"relabeling\", and \"smoothing\" to refine the predict boundary and classification. It returns the preds output which is a (N, T) shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":242-269", + "content": " outputs = filter_func(paddle.to_tensor(outputs)).numpy()\n preds = convert2label(outputs)\n return preds\ndef ASRFPostProcessing(outputs_cls,\n outputs_boundary,\n refinement_method,\n boundary_threshold=0.7,\n theta_t=15,\n kernel_size=15):\n \"\"\"\n ASRF post processing is to refine action boundary\n Args:\n outputs_cls: the results of action segmentation. (N, T) or (N, C, T)\n outputs_boundary: action boundary probability. (N, 1, T)\n refinement_method: the way of refine predict boundary and classification. str\n boundary_threshold: the threshold of the size of action segments. 
float(default=0.7)\n theta_t: the threshold of the size of action segments. int(default=15)\n kernel_size: Size of the gaussian kernel. int(default=15)\n Return:\n preds output. (N, T)\n \"\"\"\n func = [\n \"refinement_with_boundary\",\n \"relabeling\",\n \"smoothing\"," + }, + { + "comment": "This code segment defines a function that takes an input tensor and calculates the fan-in and fan-out. It also applies different refinement methods to outputs_cls based on the user-specified refinement method. If an invalid method is chosen, it returns a zero tensor. The code includes functions for smoothing, relabeling, and refinement with boundary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":270-300", + "content": " ]\n if refinement_method == \"smoothing\":\n filter_func = GaussianSmoothing(kernel_size)\n preds = smoothing(outputs_cls, filter_func)\n elif refinement_method == \"relabeling\":\n preds = relabeling(outputs_cls, theta_t)\n elif refinement_method == \"refinement_with_boundary\":\n preds = refinement_with_boundary(outputs_cls, outputs_boundary,\n boundary_threshold)\n else:\n preds = np.zeros((1, 1))\n assert refinement_method in func\n return paddle.to_tensor(preds)\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = len(tensor.shape)\n if dimensions < 2:\n raise ValueError(\"Fan in and fan out can not be computed \\\n for tensor with fewer than 2 dimensions\")\n if dimensions == 2: # Linear\n fan_in = tensor.shape[1]\n fan_out = tensor.shape[0]\n else:\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:" + }, + { + "comment": "This code calculates the gain and fan-in/fan-out values for weight initialization in a neural network. It supports different nonlinearities such as 'tanh', 'relu', 'leaky_relu', and 'selu'. The function KaimingUniform_like_torch initializes weights using the Kaiming Uniform method with the specified nonlinearity, fan mode (fan_in or fan_out), and standard deviation of the initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":301-334", + "content": " receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef calculate_gain(nonlinearity=None, a=None):\n if nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if a is not None:\n return math.sqrt(2.0 / (1 + a**2))\n else:\n return math.sqrt(2.0 / (1 + 0.01**2))\n elif nonlinearity == 'selu':\n return 3.0 / 4\n else:\n return 1\ndef KaimingUniform_like_torch(weight_npy,\n mode='fan_in',\n nonlinearity='leaky_relu'):\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)\n if mode == 'fan_in':\n fan_mode = fan_in\n else:\n fan_mode = fan_out\n a = math.sqrt(5.0)\n gain = calculate_gain(nonlinearity=nonlinearity, a=a)\n std = gain / math.sqrt(fan_mode)" + }, + { + "comment": "This code initializes weights and biases for a neural network layer. 
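The Kaiming-uniform initialization summarized in this file reduces to three quantities: a fan value derived from the weight shape, a gain for the chosen nonlinearity, and a uniform bound of sqrt(3) * gain / sqrt(fan). The snippet below works that arithmetic through for a hypothetical 3x3 convolution weight; it restates the formulas quoted here rather than calling PaddleVideo's helpers.

```python
import math
import numpy as np

# Hypothetical Conv2D weight of shape [out_channels, in_channels, kH, kW]
shape = (64, 32, 3, 3)

fan_in = shape[1] * shape[2] * shape[3]          # 32 * 3 * 3 = 288
a = math.sqrt(5.0)                               # leaky_relu slope used in the quoted helper
gain = math.sqrt(2.0 / (1.0 + a ** 2))           # leaky_relu gain
std = gain / math.sqrt(fan_in)
bound = math.sqrt(3.0) * std                     # weights ~ uniform(-bound, bound)

weight = np.random.uniform(-bound, bound, size=shape).astype(np.float32)
bias_bound = 1.0 / math.sqrt(fan_in)             # bias bound derived from fan_in
bias = np.random.uniform(-bias_bound, bias_bound, size=(shape[0],)).astype(np.float32)
print(round(bound, 5), round(bias_bound, 5))
```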
It calculates the fan-in and fan-out, determines bounds based on standard deviation or square root of three times the standard deviation for weights, and uses a uniform distribution within those bounds to initialize the weights and biases.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segmenters/utils.py\":335-342", + "content": " bound = math.sqrt(3.0) * std\n return np.random.uniform(-bound, bound, weight_npy.shape)\ndef init_bias(weight_npy, bias_npy):\n fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy)\n bound = 1.0 / math.sqrt(fan_in)\n return np.random.uniform(-bound, bound, bias_npy.shape)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cfb3e1a6-0fc5-4b0b-8cec-a267353f68aa.json b/docs/doc/cfb3e1a6-0fc5-4b0b-8cec-a267353f68aa.json new file mode 100644 index 000000000..5a405b14d --- /dev/null +++ b/docs/doc/cfb3e1a6-0fc5-4b0b-8cec-a267353f68aa.json @@ -0,0 +1,20 @@ +{ + "summary": "The setup file installs necessary packages for PaddleVideo, supports Python 3.2-3.6, and includes a console script \"ppvideo\" with keywords: \"A treasure chest for video understanding powered by PaddlePaddle.\" The code specifies Python version 3.7 and categorizes the project as a utility for metadata description in setup.py file.", + "details": [ + { + "comment": "This code is a setup file for the PaddleVideo package using setuptools. It specifies the package name, installs required packages from requirements.txt, reads README content, and sets up directory structure.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/setup.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom setuptools import setup\nfrom io import open\nwith open('requirements.txt', encoding=\"utf-8-sig\") as f:\n requirements = f.readlines()\ndef readme():\n with open('docs/en/quick_start.md', encoding=\"utf-8-sig\") as f:\n README = f.read()\n return README\nsetup(\n name='ppvideo', #name of .whl file\n packages=['ppvideo'], #install package name\n package_dir={'ppvideo': ''},\n include_package_data=" + }, + { + "comment": "This code is for setting up a Python package named \"PaddleVideo\" using setup.py. It specifies package details such as its name, version, requirements, description, license, and URL. The package is built with console script \"ppvideo\", and it supports Python 3.2-3.6. 
Keywords: \"A treasure chest for video understanding powered by PaddlePaddle.\"", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/setup.py\":31-52", + "content": " True, #Accept all data files and directories matched by MANIFEST.in\n install_requires=requirements,\n entry_points={\"console_scripts\": [\"ppvideo= ppvideo.tools.wheel:main\"]},\n version='2.3.0',\n license='Apache License 2.0',\n description='Awesome Video toolkits based on PaddlePaddle ',\n long_description=readme(),\n long_description_content_type='text/markdown',\n url='https://github.com/PaddlePaddle/PaddleVideo',\n download_url='https://github.com/PaddlePaddle/PaddleVideo.git',\n keywords=[\n 'A treasure chest for video understanding powered by PaddlePaddle.'\n ],\n classifiers=[\n 'Intended Audience :: Developers', 'Operating System :: OS Independent',\n 'Natural Language :: Chinese (Simplified)',\n 'Programming Language :: Python :: 3',\n 'Programming Language :: Python :: 3.2',\n 'Programming Language :: Python :: 3.3',\n 'Programming Language :: Python :: 3.4',\n 'Programming Language :: Python :: 3.5',\n 'Programming Language :: Python :: 3.6'," + }, + { + "comment": "The code is specifying Python version 3.7 and categorizing the project as a utility for metadata description in setup.py file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/setup.py\":53-55", + "content": " 'Programming Language :: Python :: 3.7', 'Topic :: Utilities'\n ],\n)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d0056eef-03dd-4253-9be2-549523effe6d.json b/docs/doc/d0056eef-03dd-4253-9be2-549523effe6d.json new file mode 100644 index 000000000..0d31db32c --- /dev/null +++ b/docs/doc/d0056eef-03dd-4253-9be2-549523effe6d.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is part of the PaddleVideo library and includes the necessary imports and declarations for the VOSMetric and build_metric functions. It also contains licensing information, specifying that it's under the Apache License, Version 2.0.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library and includes the necessary imports and declarations for the VOSMetric and build_metric functions. It also contains licensing information, specifying that it's under the Apache License, Version 2.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py\":0-19", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .vos_metric import VOSMetric\nfrom .build import build_metric\n__all__ = [\n 'VOSMetric', \"build_metric\"\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d10cc70a-a304-4cad-9ce9-9d6361dcdd2c.json b/docs/doc/d10cc70a-a304-4cad-9ce9-9d6361dcdd2c.json new file mode 100644 index 000000000..e21f2ba77 --- /dev/null +++ b/docs/doc/d10cc70a-a304-4cad-9ce9-9d6361dcdd2c.json @@ -0,0 +1,45 @@ +{ + "summary": "This code imports libraries, defines a merging function and uses Paddlevideo classes for evaluation. It calculates Hit@1, measures video-level precision, averages results to assess model performance. The function computes top-k triplet predictions, raises ValueError if k is not a positive integer, and initializes HitOneMetric class for evaluation metrics in Youtube8m's PaddleVideo module.", + "details": [ + { + "comment": "The code provides functions for evaluating models. It imports necessary libraries, defines a function to merge multiple lists into one, and includes classes for Average Precision Calculator and Mean Average Precision Calculator from the paddlevideo module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":0-28", + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Provides functions to help with evaluating models.\"\"\"\nimport numpy as np\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom ..base import BaseMetric\nfrom ..registry import METRIC\nfrom . import average_precision_calculator as ap_calculator\nfrom . import mean_average_precision_calculator as map_calculator\nlogger = get_logger(\"paddlevideo\")\ndef flatten(l):\n \"\"\" Merges a list of lists into a single list. 
\"\"\"\n return [item for sublist in l for item in sublist]" + }, + { + "comment": "Calculates Hit@1, the fraction of samples with at least one ground truth label in top predictions.\nMeasures video-level annotation precision when retrieving the same number of entities as ground truth.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":31-59", + "content": "def calculate_hit_at_one(predictions, actuals):\n \"\"\"\n Hit@k: indicates the fraction of test samples that contain at least\n one of the ground truth labels in the top k predictions,\n i.e topk.\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average hit at one across the entire batch.\n \"\"\"\n top_prediction = np.argmax(predictions, 1)\n hits = actuals[np.arange(actuals.shape[0]), top_prediction]\n return np.mean(hits)\ndef calculate_precision_at_equal_recall_rate(predictions, actuals):\n \"\"\"\n PERR: measures the video-level annotation precision when we retrieve the same number\n of entities per video as there are in the ground-truth.\n More details please refer to: https://arxiv.org/abs/1609.08675\n Args:\n predictions: Matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'." + }, + { + "comment": "The code calculates the average precision at equal recall rate and global average precision for a batch of videos. It iterates over each video, determines the number of labels, finds the top indices based on predictions, calculates item-wise precision, aggregates these precisions for all videos, and returns the averaged precision as well as the gap score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":60-89", + "content": " actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n Returns:\n float: The average precision at equal recall rate across the entire batch.\n \"\"\"\n aggregated_precision = 0.0\n num_videos = actuals.shape[0]\n for row in np.arange(num_videos):\n num_labels = int(np.sum(actuals[row]))\n top_indices = np.argpartition(predictions[row],\n -num_labels)[-num_labels:]\n item_precision = 0.0\n for label_index in top_indices:\n if predictions[row][label_index] > 0:\n item_precision += actuals[row][label_index]\n item_precision /= top_indices.size\n aggregated_precision += item_precision\n aggregated_precision /= num_videos\n return aggregated_precision\ndef calculate_gap(predictions, actuals, top_k=20):\n \"\"\"\n GAP: the global average precision.\n Only the top_k predictions are taken for each of the videos.\n Args:\n predictions: Matrix containing the outputs of the model." + }, + { + "comment": "This code calculates the global average precision by first extracting the top k predictions for each video, sorted by class. 
It then accumulates these results using an AveragePrecisionCalculator and returns the global average precision.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":90-115", + "content": " Dimensions are 'batch' x 'num_classes'.\n actuals: Matrix containing the ground truth labels.\n Dimensions are 'batch' x 'num_classes'.\n top_k: How many predictions to use per video.\n Returns:\n float: The global average precision.\n \"\"\"\n gap_calculator = ap_calculator.AveragePrecisionCalculator()\n sparse_predictions, sparse_labels, num_positives = top_k_by_class(\n predictions, actuals, top_k)\n gap_calculator.accumulate(flatten(sparse_predictions),\n flatten(sparse_labels), sum(num_positives))\n return gap_calculator.peek_ap_at_n()\ndef top_k_by_class(predictions, labels, k=20):\n \"\"\"Extracts the top k predictions for each video, sorted by class.\n Args:\n predictions: A numpy matrix containing the outputs of the model.\n Dimensions are 'batch' x 'num_classes'.\n k: the top k non-zero entries to preserve in each prediction.\n Returns:\n A tuple (predictions,labels, true_positives). 'predictions' and 'labels'" + }, + { + "comment": "This function takes in a list of lists containing probability predictions and ground truth labels for multiple classes, and calculates top-k triplet predictions based on the given k value. It raises a ValueError if k is not a positive integer. The function then creates empty lists to store output predictions and labels for each class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":116-136", + "content": " are lists of lists of floats. 'true_positives' is a list of scalars. The\n length of the lists are equal to the number of classes. The entries in the\n predictions variable are probability predictions, and\n the corresponding entries in the labels variable are the ground truth for\n those predictions. The entries in 'true_positives' are the number of true\n positives for each class in the ground truth.\n Raises:\n ValueError: An error occurred when the k is not a positive integer.\n \"\"\"\n if k <= 0:\n raise ValueError(\"k must be a positive integer.\")\n k = min(k, predictions.shape[1])\n num_classes = predictions.shape[1]\n prediction_triplets = []\n for video_index in range(predictions.shape[0]):\n prediction_triplets.extend(\n top_k_triplets(predictions[video_index], labels[video_index], k))\n out_predictions = [[] for v in range(num_classes)]\n out_labels = [[] for v in range(num_classes)]\n for triplet in prediction_triplets:" + }, + { + "comment": "This code calculates top-k predictions and labels from given predictions and labels arrays, and then initializes HitOneMetric class to store the evaluation metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":137-166", + "content": " out_predictions[triplet[0]].append(triplet[1])\n out_labels[triplet[0]].append(triplet[2])\n out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)]\n return out_predictions, out_labels, out_true_positives\ndef top_k_triplets(predictions, labels, k=20):\n \"\"\"Get the top_k for a 1-d numpy array. 
Returns a sparse list of tuples in\n (prediction, class) format\"\"\"\n m = len(predictions)\n k = min(k, m)\n indices = np.argpartition(predictions, -k)[-k:]\n return [(index, predictions[index], labels[index]) for index in indices]\n@METRIC.register\nclass HitOneMetric(BaseMetric):\n \"\"\"A class to store the evaluation metrics.\"\"\"\n def __init__(self,\n num_class,\n top_k,\n data_size,\n batch_size,\n log_interval=20):\n \"\"\"Construct an HitOneMetric object to store the evaluation metrics.\"\"\"\n self.hit_at_one = []\n self.perr = []\n self.gap = []\n super().__init__(data_size, batch_size, log_interval)" + }, + { + "comment": "The code defines a HitOneMetric class for evaluating metrics in a video prediction task. The accumulate method calculates mean values of hit_at_one, perr, and gap, and logs the results as information. The clear method resets all metrics to an empty list. The update method updates the metric with each iteration, taking into account multi-card validation using PaddlePaddle's distributed functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":168-192", + "content": " def accumulate(self):\n logger.info(\n '[TEST] finished, hit_at_one = {:.5f}, perr = {:.5f}, gap = {:.5f}'.\n format(np.mean(np.array(self.hit_at_one)),\n np.mean(np.array(self.perr)), np.mean(np.array(self.gap))))\n def clear(self):\n \"\"\"Clear the evaluation metrics and reset the HitOneMetric object.\"\"\"\n self.hit_at_one = []\n self.perr = []\n self.gap = []\n def update(self, batch_id, data, outputs):\n \"\"\"update metrics during each iter\n \"\"\"\n hit_at_one = paddle.to_tensor(outputs['hit_at_one'])\n perr = paddle.to_tensor(outputs['perr'])\n gap = paddle.to_tensor(outputs['gap'])\n # NOTE(shipping): deal with multi cards validate\n if self.world_size > 1:\n hit_at_one = paddle.distributed.all_reduce(\n hit_at_one,\n op=paddle.distributed.ReduceOp.SUM) / self.world_size\n perr = paddle.distributed.all_reduce(\n perr, op=paddle.distributed.ReduceOp.SUM) / self.world_size" + }, + { + "comment": "This code snippet is a part of the Youtube8m evaluation module in PaddleVideo. It calculates the gap between ground truth and prediction for each batch, performs all-reduce on the gap, appends it to the corresponding list. Also, logs information about processing batches during testing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/youtube8m/eval_util.py\":193-204", + "content": " gap = paddle.distributed.all_reduce(\n gap, op=paddle.distributed.ReduceOp.SUM) / self.world_size\n self.hit_at_one.append(hit_at_one.numpy())\n self.perr.append(perr.numpy())\n self.gap.append(gap.numpy())\n # preds ensemble\n if batch_id % self.log_interval == 0:\n logger.info(\"[TEST] Processing batch {}/{}...\".format(\n batch_id,\n self.data_size // (self.batch_size * self.world_size),\n ))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d15fdcdd-2e5b-43be-9fe9-712b6f255d6a.json b/docs/doc/d15fdcdd-2e5b-43be-9fe9-712b6f255d6a.json new file mode 100644 index 000000000..832bb248d --- /dev/null +++ b/docs/doc/d15fdcdd-2e5b-43be-9fe9-712b6f255d6a.json @@ -0,0 +1,25 @@ +{ + "summary": "The code imports libraries and defines an FPN class with three layers, creates a backbone model using convolutional layers and GroupNorm. 
It also defines a \"CFBI\" class that utilizes DeepLab for feature extraction and FPN to combine multi-scale features, returning extracted features at 4x, 8x, 16x scales along with low-level features using a forward function.", + "details": [ + { + "comment": "This code imports necessary libraries and defines a class called FPN, which is an FPN layer in a neural network. It has three layers: toplayer, latlayer1, and latlayer2, each with specific input dimensions and output dimensions. The _make_layer function is used to create these layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/cfbi.py\":0-27", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom .deeplab import DeepLab\nclass FPN(nn.Layer):\n \"\"\"FPN Layer\"\"\"\n def __init__(self, in_dim_4x, in_dim_8x, in_dim_16x, out_dim):\n super(FPN, self).__init__()\n self.toplayer = self._make_layer(in_dim_16x, out_dim)\n self.latlayer1 = self._make_layer(in_dim_8x, out_dim)" + }, + { + "comment": "The code defines a backbone model with two convolutional layers followed by GroupNorm layer. The forward function applies the defined layers to input images of size 4x, 8x, and 16x.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/cfbi.py\":28-53", + "content": " self.latlayer2 = self._make_layer(in_dim_4x, out_dim)\n self.smooth1 = self._make_layer(out_dim,\n out_dim,\n kernel_size=3,\n padding=1)\n self.smooth2 = self._make_layer(out_dim,\n out_dim,\n kernel_size=3,\n padding=1)\n def _make_layer(self, in_dim, out_dim, kernel_size=1, padding=0):\n return nn.Sequential(\n nn.Conv2D(in_dim,\n out_dim,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n bias_attr=False),\n nn.GroupNorm(num_groups=32, num_channels=out_dim))\n def forward(self, x_4x, x_8x, x_16x):\n \"\"\" forward function\"\"\"\n x_16x = self.toplayer(x_16x)\n x_8x = self.latlayer1(x_8x)\n x_4x = self.latlayer2(x_4x)" + }, + { + "comment": "This code defines a class \"CFBI\" which is a backbone model for feature extraction. It utilizes DeepLab as the feature extractor and FPN (Feature Pyramid Network) to combine features from different scales. The input image x is processed through the feature extracter and the output is passed through the fpn to obtain three outputs at 4x, 8x and 16x scales. 
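The FPN merge sketched in this summary follows the standard top-down pattern: project each scale with a lateral layer, upsample the coarser map to the finer map's size, add the two, then smooth. A minimal Paddle sketch of just the upsample-and-add step is shown below with random feature maps; the shapes are illustrative assumptions, not values taken from the model.

```python
import paddle
import paddle.nn.functional as F

# Illustrative lateral outputs at 1/16, 1/8 and 1/4 resolution, all 256 channels.
x_16x = paddle.rand([1, 256, 14, 14])
x_8x = paddle.rand([1, 256, 28, 28])
x_4x = paddle.rand([1, 256, 56, 56])

# Top-down pathway: upsample the coarser map and add it to the finer one.
x_8x = x_8x + F.interpolate(x_16x, size=x_8x.shape[-2:], mode='bilinear', align_corners=True)
x_4x = x_4x + F.interpolate(x_8x, size=x_4x.shape[-2:], mode='bilinear', align_corners=True)

# In the quoted code each merged map then passes through a 3x3 "smooth" conv and ReLU.
print(x_4x.shape, x_8x.shape, x_16x.shape)
```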
These outputs are then interpolated and smoothed before being returned after applying ReLU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/cfbi.py\":55-83", + "content": " x_8x = x_8x + F.interpolate(\n x_16x, size=x_8x.shape[-2:], mode='bilinear', align_corners=True)\n x_4x = x_4x + F.interpolate(\n x_8x, size=x_4x.shape[-2:], mode='bilinear', align_corners=True)\n x_8x = self.smooth1(x_8x)\n x_4x = self.smooth2(x_4x)\n return F.relu(x_4x), F.relu(x_8x), F.relu(x_16x)\n@BACKBONES.register()\nclass CFBI(nn.Layer):\n \"\"\"CFBI plus backbone\"\"\"\n def __init__(self,\n backbone='resnet',\n freeze_bn=True,\n model_aspp_outdim=256,\n in_dim_8x=512,\n model_semantic_embedding_dim=256): #,epsilon=1e-05):\n super(CFBI, self).__init__()\n #self.epsilon = epsilon\n self.feature_extracter = DeepLab(backbone=backbone, freeze_bn=freeze_bn)\n self.fpn = FPN(in_dim_4x=model_aspp_outdim,\n in_dim_8x=in_dim_8x,\n in_dim_16x=model_aspp_outdim,\n out_dim=model_semantic_embedding_dim)\n def forward(self, x):" + }, + { + "comment": "This code defines a forward function that takes an input image and uses the feature_extracter and fpn modules to extract features at different scales. It returns the extracted features at 4x, 8x, and 16x scales along with the low-level features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/cfbi.py\":84-87", + "content": " \"\"\"forward function\"\"\"\n x, aspp_x, low_level, mid_level = self.feature_extracter(x, True)\n x_4x, x_8x, x_16x = self.fpn(x, mid_level, aspp_x)\n return x_4x, x_8x, x_16x, low_level" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d191fe21-0341-4511-a025-55caf07e6fb1.json b/docs/doc/d191fe21-0341-4511-a025-55caf07e6fb1.json new file mode 100644 index 000000000..2c1fbe5e5 --- /dev/null +++ b/docs/doc/d191fe21-0341-4511-a025-55caf07e6fb1.json @@ -0,0 +1,250 @@ +{ + "summary": "This code enhances PaddleVideo's loader with resize operation and augmentation pipeline, enabling diverse data preprocessing. It calculates crop offsets and performs object detection image augmentation using uniform sampling, resizing, and flipping techniques, resizes images, scales by 255.0, concatenates frames, transposes array dimensions, stores results in 'results', and returns arrays.", + "details": [ + { + "comment": "This code is from the PaddleVideo library, specifically the loader module's augmentations pipeline. It defines a Scale class that scales images based on their short side to the given short_size parameter. The fixed_ratio parameter determines whether or not the image should be resized while maintaining its aspect ratio. This class is then registered in PIPELINES for later use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport random\nfrom collections.abc import Sequence\nimport cv2\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nfrom PIL import Image\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass Scale(object):\n \"\"\"\n Scale images.\n Args:\n short_size(float | int): Short size of an image will be scaled to the short_size.\n fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True" + }, + { + "comment": "The code defines a class for resize operations. It takes parameters for short size, fixed ratio (defaults to True), keep ratio, do_round (default False), and backend (default 'pillow'). The class checks if fixed_ratio and keep_ratio can't be true at the same time. It also ensures the backend is either 'pillow' or 'cv2'. The __call__ method performs resize operations on images, taking a Sequence of PIL.Image as input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":34-60", + "content": " do_round(bool): Whether to round up when calculating the zoom ratio. default: False\n backend(str): Choose pillow or cv2 as the graphics processing backend. default: 'pillow'\n \"\"\"\n def __init__(self,\n short_size,\n fixed_ratio=True,\n keep_ratio=None,\n do_round=False,\n backend='pillow'):\n self.short_size = short_size\n assert (fixed_ratio and not keep_ratio) or (not fixed_ratio), \\\n f\"fixed_ratio and keep_ratio cannot be true at the same time\"\n self.fixed_ratio = fixed_ratio\n self.keep_ratio = keep_ratio\n self.do_round = do_round\n assert backend in [\n 'pillow', 'cv2'\n ], f\"Scale's backend must be pillow or cv2, but get {backend}\"\n self.backend = backend\n def __call__(self, results):\n \"\"\"\n Performs resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]" + }, + { + "comment": "This code is responsible for resizing images to a specified size in a PaddleVideo pipeline. It iterates through each image, checks the aspect ratio, and resizes them accordingly before appending to the resized_imgs list. 
If the image is already the correct size, it is directly added to the list without further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":61-87", + "content": " return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n imgs = results['imgs']\n resized_imgs = []\n for i in range(len(imgs)):\n img = imgs[i]\n if isinstance(img, np.ndarray):\n h, w, _ = img.shape\n elif isinstance(img, Image.Image):\n w, h = img.size\n else:\n raise NotImplementedError\n if (w <= h and w == self.short_size) or (h <= w\n and h == self.short_size):\n if self.backend == 'pillow' and not isinstance(\n img, Image.Image):\n img = Image.fromarray(img)\n resized_imgs.append(img)\n continue\n if w <= h:\n ow = self.short_size\n if self.fixed_ratio:\n oh = int(self.short_size * 4.0 / 3.0)\n elif self.keep_ratio is False:\n oh = self.short_size" + }, + { + "comment": "This code calculates the output image size for resizing and maintains aspect ratio if specified. It uses scale_factor to calculate the output height (oh) and width (ow), considering do_round, fixed_ratio, keep_ratio flags and short_size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":88-107", + "content": " else:\n scale_factor = self.short_size / w\n oh = int(h * float(scale_factor) +\n 0.5) if self.do_round else int(h *\n self.short_size / w)\n ow = int(w * float(scale_factor) +\n 0.5) if self.do_round else self.short_size\n else:\n oh = self.short_size\n if self.fixed_ratio:\n ow = int(self.short_size * 4.0 / 3.0)\n elif self.keep_ratio is False:\n ow = self.short_size\n else:\n scale_factor = self.short_size / h\n oh = int(h * float(scale_factor) +\n 0.5) if self.do_round else self.short_size\n ow = int(w * float(scale_factor) +\n 0.5) if self.do_round else int(w *\n self.short_size / h)" + }, + { + "comment": "This code defines an augmentation pipeline for image processing. It resizes images using different backends based on the backend specified and whether the ratio should be preserved or not. The results are then returned as a dictionary with 'imgs' key containing the resized images. Additionally, there is a RandomCrop class which performs random crop operations on images of the specified target size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":108-137", + "content": " if self.backend == 'pillow':\n resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))\n elif self.backend == 'cv2' and (self.keep_ratio is not None):\n resized_imgs.append(\n cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR))\n else:\n resized_imgs.append(\n Image.fromarray(\n cv2.resize(np.asarray(img), (ow, oh),\n interpolation=cv2.INTER_LINEAR)))\n results['imgs'] = resized_imgs\n return results\n@PIPELINES.register()\nclass RandomCrop(object):\n \"\"\"\n Random crop images.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]" + }, + { + "comment": "This code is a part of PaddleVideo's augmentations.py, which applies random cropping to images. It checks if the backend used is 'pyav', and if so, extracts the image dimensions. 
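The output-size rule described above can be written down directly: scale the short side to short_size and either keep the aspect ratio, force a square, or force a fixed 4:3 ratio. Below is a plain-Python sketch of that target-size arithmetic only (the function name is invented); the actual resize call and backend handling stay as in the quoted code.

```python
def scaled_size(w, h, short_size, fixed_ratio=True, keep_ratio=None, do_round=False):
    """Return (ow, oh) for short-side scaling, mirroring the rule described above."""
    def scale(long_side, short_side):
        factor = short_size / short_side
        return int(long_side * factor + 0.5) if do_round else int(long_side * short_size / short_side)

    if w <= h:  # width is the short side
        ow = short_size
        if fixed_ratio:
            oh = int(short_size * 4.0 / 3.0)     # fixed 4:3 ratio
        elif keep_ratio is False:
            oh = short_size                      # square output
        else:
            oh = scale(h, w)                     # preserve aspect ratio
    else:       # height is the short side
        oh = short_size
        if fixed_ratio:
            ow = int(short_size * 4.0 / 3.0)
        elif keep_ratio is False:
            ow = short_size
        else:
            ow = scale(w, h)
    return ow, oh

print(scaled_size(320, 240, 224))  # (298, 224)
```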
If not, it gets the image size from the first image in the list. Then, it asserts that the image dimensions are larger than the target size. Finally, it generates a random crop position and crops each image in the list using these positions. The cropped images are stored in the 'crop_images' list which is returned at the end.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":138-163", + "content": " return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n h, w = imgs.shape[2:]\n else:\n w, h = imgs[0].size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(\n w, h, self.target_size)\n crop_images = []\n if 'backend' in results and results['backend'] == 'pyav':\n x1 = np.random.randint(0, w - tw)\n y1 = np.random.randint(0, h - th)\n crop_images = imgs[:, :, y1:y1 + th, x1:x1 + tw] # [C, T, th, tw]\n else:\n x1 = random.randint(0, w - tw)\n y1 = random.randint(0, h - th)\n for img in imgs:\n if w == tw and h == th:\n crop_images.append(img)\n else:" + }, + { + "comment": "RandomResizedCrop is a pipeline that resizes and crops images randomly with specified area, aspect ratio range, target size, and backend. The method get_crop_bbox takes image shape, area and aspect ratio ranges as input and returns the crop bounding box within the specified range of area and aspect ratio.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":164-196", + "content": " crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = crop_images\n return results\n@PIPELINES.register()\nclass RandomResizedCrop(RandomCrop):\n def __init__(self,\n area_range=(0.08, 1.0),\n aspect_ratio_range=(3 / 4, 4 / 3),\n target_size=224,\n backend='cv2'):\n self.area_range = area_range\n self.aspect_ratio_range = aspect_ratio_range\n self.target_size = target_size\n self.backend = backend\n @staticmethod\n def get_crop_bbox(img_shape,\n area_range,\n aspect_ratio_range,\n max_attempts=10):\n assert 0 < area_range[0] <= area_range[1] <= 1\n assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1]\n img_h, img_w = img_shape\n area = img_h * img_w\n min_ar, max_ar = aspect_ratio_range\n aspect_ratios = np.exp(\n np.random.uniform(np.log(min_ar), np.log(max_ar)," + }, + { + "comment": "This function generates a random crop size based on the aspect ratios and target areas. It then iterates through candidate crop sizes, selecting one that fits within the image bounds. If no suitable crop is found, it falls back to centering a smaller crop. 
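(The sampling strategy just described can be paraphrased outside the class roughly as follows; a simplified sketch with the same centered-square fallback, function name illustrative.)

    import random
    import numpy as np

    def sample_crop_bbox(img_h, img_w, area_range=(0.08, 1.0),
                         ratio_range=(3 / 4, 4 / 3), max_attempts=10):
        area = img_h * img_w
        for _ in range(max_attempts):
            # aspect ratio drawn log-uniformly, target area drawn uniformly
            aspect = np.exp(np.random.uniform(np.log(ratio_range[0]), np.log(ratio_range[1])))
            target = np.random.uniform(*area_range) * area
            crop_w = int(round(np.sqrt(target * aspect)))
            crop_h = int(round(np.sqrt(target / aspect)))
            if crop_w <= img_w and crop_h <= img_h:
                x = random.randint(0, img_w - crop_w)
                y = random.randint(0, img_h - crop_h)
                return x, y, x + crop_w, y + crop_h
        # fallback: centered square of the largest size that fits
        size = min(img_h, img_w)
        x, y = (img_w - size) // 2, (img_h - size) // 2
        return x, y, x + size, y + size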
The function returns the offset coordinates and crop dimensions for the selected crop.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":197-218", + "content": " size=max_attempts))\n target_areas = np.random.uniform(*area_range, size=max_attempts) * area\n candidate_crop_w = np.round(np.sqrt(target_areas *\n aspect_ratios)).astype(np.int32)\n candidate_crop_h = np.round(np.sqrt(target_areas /\n aspect_ratios)).astype(np.int32)\n for i in range(max_attempts):\n crop_w = candidate_crop_w[i]\n crop_h = candidate_crop_h[i]\n if crop_h <= img_h and crop_w <= img_w:\n x_offset = random.randint(0, img_w - crop_w)\n y_offset = random.randint(0, img_h - crop_h)\n return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h\n # Fallback\n crop_size = min(img_h, img_w)\n x_offset = (img_w - crop_size) // 2\n y_offset = (img_h - crop_size) // 2\n return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size\n def __call__(self, results):" + }, + { + "comment": "This code is a part of PaddleVideo library and performs image cropping based on the specified backend. It first retrieves the image dimensions, then applies a crop box to each image according to the defined area range and aspect ratio range. The code handles different backends such as Pillow, OpenCV (cv2), and PyAV. If an unsupported backend is encountered, it raises a NotImplementedError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":219-248", + "content": " imgs = results['imgs']\n if self.backend == 'pillow':\n img_w, img_h = imgs[0].size\n elif self.backend == 'cv2':\n img_h, img_w, _ = imgs[0].shape\n elif self.backend == 'pyav':\n img_h, img_w = imgs.shape[2:] # [cthw]\n else:\n raise NotImplementedError\n left, top, right, bottom = self.get_crop_bbox(\n (img_h, img_w), self.area_range, self.aspect_ratio_range)\n if self.backend == 'pillow':\n img_w, img_h = imgs[0].size\n imgs = [img.crop(left, top, right, bottom) for img in imgs]\n elif self.backend == 'cv2':\n img_h, img_w, _ = imgs[0].shape\n imgs = [img[top:bottom, left:right] for img in imgs]\n elif self.backend == 'pyav':\n img_h, img_w = imgs.shape[2:] # [cthw]\n imgs = imgs[:, :, top:bottom, left:right]\n else:\n raise NotImplementedError\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass CenterCrop(object):" + }, + { + "comment": "This code defines a class for center cropping images. The constructor takes the target size, whether to round the coordinates (True by default), and the backend (default is Pillow). The `__call__` method applies the center crop operation on a list of PIL Image objects, returning a new list with the cropped images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":249-275", + "content": " \"\"\"\n Center crop images.\n Args:\n target_size(int): Center crop a square with the target_size from an image.\n do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area. 
default: True\n \"\"\"\n def __init__(self, target_size, do_round=True, backend='pillow'):\n self.target_size = target_size\n self.do_round = do_round\n self.backend = backend\n def __call__(self, results):\n \"\"\"\n Performs Center crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n ccrop_imgs: List where each item is a PIL.Image after Center crop.\n \"\"\"\n imgs = results['imgs']\n ccrop_imgs = []\n th, tw = self.target_size, self.target_size\n if isinstance(imgs, paddle.Tensor):\n h, w = imgs.shape[-2:]\n x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2\n y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2" + }, + { + "comment": "This function performs center crop on images based on the given target size. It first checks if the image dimensions are larger than the crop size, and then calculates the starting coordinates for cropping. If the backend is Pillow, it uses the crop() method to perform the cropping operation; if the backend is OpenCV (cv2), it slices the image array accordingly. The resulting cropped images are stored in 'ccrop_imgs' list and returned in the 'results' dictionary under the key 'imgs'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":276-296", + "content": " ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw]\n else:\n for img in imgs:\n if self.backend == 'pillow':\n w, h = img.size\n elif self.backend == 'cv2':\n h, w, _ = img.shape\n else:\n raise NotImplementedError\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(\n w, h, self.target_size)\n x1 = int(round(\n (w - tw) / 2.0)) if self.do_round else (w - tw) // 2\n y1 = int(round(\n (h - th) / 2.0)) if self.do_round else (h - th) // 2\n if self.backend == 'cv2':\n ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw])\n elif self.backend == 'pillow':\n ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = ccrop_imgs" + }, + { + "comment": "The MultiScaleCrop class is a pipeline module that randomly crops images with multiple scales, targeting a specific size. It allows adjustable parameters like maximum distortion, fix crop start point, and duplicate candidate crop points for flexibility. 
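(A rough sketch of how the candidate crop sizes and the max_distort constraint interact; the default scale list matches the source, but the helper itself is only an illustration, not the class method.)

    import random

    def sample_crop_pair(image_w, image_h, target,
                         scales=(1.0, 0.875, 0.75, 0.66), max_distort=1):
        base = min(image_w, image_h)
        sizes = [int(base * s) for s in scales]
        # snap sizes within 3 px of the target back to the exact target
        snap = [target if abs(s - target) < 3 else s for s in sizes]
        # only keep (w, h) combinations whose scale indices differ by at most max_distort
        pairs = [(w, h) for i, h in enumerate(snap) for j, w in enumerate(snap)
                 if abs(i - j) <= max_distort]
        return random.choice(pairs)  # (crop_w, crop_h)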
This module is useful in image processing tasks where random cropping can provide more data augmentation and improve model performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":297-324", + "content": " return results\n@PIPELINES.register()\nclass MultiScaleCrop(object):\n \"\"\"\n Random crop images in with multiscale sizes\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n scales(int): List of candidate cropping scales.\n max_distort(int): Maximum allowable deformation combination distance.\n fix_crop(int): Whether to fix the cutting start point.\n allow_duplication(int): Whether to allow duplicate candidate crop starting points.\n more_fix_crop(int): Whether to allow more cutting starting points.\n \"\"\"\n def __init__(\n self,\n target_size, # NOTE: named target size now, but still pass short size in it!\n scales=None,\n max_distort=1,\n fix_crop=True,\n allow_duplication=False,\n more_fix_crop=True,\n backend='pillow'):\n self.target_size = target_size\n self.scales = scales if scales else [1, .875, .75, .66]\n self.max_distort = max_distort" + }, + { + "comment": "This code defines a class for multi-scale cropping of images with Pillow or OpenCV backend. The `__init__` method initializes the instance variables and checks if the provided backend is either 'pillow' or 'cv2'. The `__call__` method performs the actual multi-scale cropping operation on a given list of images, applying random crop offsets to each image with the specified target size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":325-359", + "content": " self.fix_crop = fix_crop\n self.allow_duplication = allow_duplication\n self.more_fix_crop = more_fix_crop\n assert backend in [\n 'pillow', 'cv2'\n ], f\"MultiScaleCrop's backend must be pillow or cv2, but get {backend}\"\n self.backend = backend\n def __call__(self, results):\n \"\"\"\n Performs MultiScaleCrop operations.\n Args:\n imgs: List where wach item is a PIL.Image.\n XXX:\n results:\n \"\"\"\n imgs = results['imgs']\n input_size = [self.target_size, self.target_size]\n im_size = imgs[0].size\n # get random crop offset\n def _sample_crop_size(im_size):\n image_w, image_h = im_size[0], im_size[1]\n base_size = min(image_w, image_h)\n crop_sizes = [int(base_size * x) for x in self.scales]\n crop_h = [\n input_size[1] if abs(x - input_size[1]) < 3 else x\n for x in crop_sizes\n ]\n crop_w = [\n input_size[0] if abs(x - input_size[0]) < 3 else x" + }, + { + "comment": "This code generates a random crop pair from provided sizes, and then applies different cropping locations to the image. If fix_crop is False, it randomly selects an offset for the crop pair within the image boundaries. If fix_crop is True, it calculates four different offsets in a grid pattern using step values based on the image size. 
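(The fixed-crop grid mentioned above boils down to quarter-step offsets; a simplified sketch that skips the allow_duplication checks. Steps may be fractional, as in the source.)

    def fixed_crop_offsets(image_w, image_h, crop_w, crop_h, more_fix_crop=True):
        w_step = (image_w - crop_w) / 4
        h_step = (image_h - crop_h) / 4
        offsets = [(0, 0), (4 * w_step, 0), (0, 4 * h_step),
                   (4 * w_step, 4 * h_step), (2 * w_step, 2 * h_step)]  # corners + center
        if more_fix_crop:
            offsets += [(0, 2 * h_step), (4 * w_step, 2 * h_step),          # center left/right
                        (2 * w_step, 0), (2 * w_step, 4 * h_step),          # upper/lower center
                        (1 * w_step, 1 * h_step), (3 * w_step, 1 * h_step), # quarter positions
                        (1 * w_step, 3 * h_step), (3 * w_step, 3 * h_step)]
        return offsets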
The resulting crops are stored in 'ret'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":360-383", + "content": " for x in crop_sizes\n ]\n pairs = []\n for i, h in enumerate(crop_h):\n for j, w in enumerate(crop_w):\n if abs(i - j) <= self.max_distort:\n pairs.append((w, h))\n crop_pair = random.choice(pairs)\n if not self.fix_crop:\n w_offset = random.randint(0, image_w - crop_pair[0])\n h_offset = random.randint(0, image_h - crop_pair[1])\n else:\n w_step = (image_w - crop_pair[0]) / 4\n h_step = (image_h - crop_pair[1]) / 4\n ret = list()\n ret.append((0, 0)) # upper left\n if self.allow_duplication or w_step != 0:\n ret.append((4 * w_step, 0)) # upper right\n if self.allow_duplication or h_step != 0:\n ret.append((0, 4 * h_step)) # lower left\n if self.allow_duplication or (h_step != 0 and w_step != 0):\n ret.append((4 * w_step, 4 * h_step)) # lower right" + }, + { + "comment": "This code samples random crop sizes and offsets for image augmentation. It appends different cropping positions based on user allowance or specific flag settings, then randomly selects one of these positions. Finally, it crops the image using the selected position and size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":384-404", + "content": " if self.allow_duplication or (h_step != 0 or w_step != 0):\n ret.append((2 * w_step, 2 * h_step)) # center\n if self.more_fix_crop:\n ret.append((0, 2 * h_step)) # center left\n ret.append((4 * w_step, 2 * h_step)) # center right\n ret.append((2 * w_step, 4 * h_step)) # lower center\n ret.append((2 * w_step, 0 * h_step)) # upper center\n ret.append((1 * w_step, 1 * h_step)) # upper left quarter\n ret.append((3 * w_step, 1 * h_step)) # upper right quarter\n ret.append((1 * w_step, 3 * h_step)) # lower left quarter\n ret.append((3 * w_step, 3 * h_step)) # lower righ quarter\n w_offset, h_offset = random.choice(ret)\n return crop_pair[0], crop_pair[1], w_offset, h_offset\n crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size)\n crop_img_group = [\n img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))" + }, + { + "comment": "This code is a PaddleVideo pipeline for image augmentation, specifically performing random flips with a given probability. It resizes and crops images according to the provided input size. If the backend is set to 'pillow', it uses PIL library's resize function; otherwise, it uses OpenCV's resize function. 
The results are stored in the 'imgs' key of the 'results' dictionary, which is then returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":405-439", + "content": " for img in imgs\n ]\n if self.backend == 'pillow':\n ret_img_group = [\n img.resize((input_size[0], input_size[1]), Image.BILINEAR)\n for img in crop_img_group\n ]\n else:\n ret_img_group = [\n Image.fromarray(\n cv2.resize(np.asarray(img),\n dsize=(input_size[0], input_size[1]),\n interpolation=cv2.INTER_LINEAR))\n for img in crop_img_group\n ]\n results['imgs'] = ret_img_group\n return results\n@PIPELINES.register()\nclass RandomFlip(object):\n \"\"\"\n Random Flip images.\n Args:\n p(float): Random flip images with the probability p.\n \"\"\"\n def __init__(self, p=0.5):\n self.p = p\n def __call__(self, results):\n \"\"\"\n Performs random flip operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]" + }, + { + "comment": "This code implements a random image flipping and brightness adjustment in PaddleVideo's pipeline. It takes an image as input, randomly decides whether to flip or keep it intact with probability 'p', and adjusts the brightness if applied. The result is then returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":440-472", + "content": " return:\n flip_imgs: List where each item is a PIL.Image after random flip.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n if isinstance(imgs, paddle.Tensor):\n results['imgs'] = paddle.flip(imgs, axis=[3])\n elif isinstance(imgs[0], np.ndarray):\n results['imgs'] = [cv2.flip(img, 1, img) for img in imgs\n ] # [[h,w,c], [h,w,c], ..., [h,w,c]]\n else:\n results['imgs'] = [\n img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs\n ]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomBrightness(object):\n \"\"\"\n Random Brightness images.\n Args:\n p(float): Random brightness images with the probability p.\n \"\"\"\n def __init__(self, p=0.1, brightness=1):\n self.p = p\n self.brightness = brightness\n def __call__(self, results):\n \"\"\"" + }, + { + "comment": "The code defines two classes, RandomBrightness and RandomSaturation, which perform random operations on image brightness and saturation respectively. The RandomBrightness class applies ColorJitter with a specified brightness level to each image in the list with a certain probability, while the RandomSaturation class adjusts the saturation of images with another probability. 
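(The Random{Brightness,Saturation,Hue} wrappers all follow the same pattern: with probability p, build a single-property ColorJitter and map it over the frames. A dependency-light sketch of that pattern using Pillow's ImageEnhance as a stand-in for ColorJitter; the factor range mimics the usual brightness-jitter convention and is an assumption, not the library code.)

    import random
    from PIL import ImageEnhance

    def maybe_adjust_brightness(imgs, p=0.1, brightness=1.0):
        """With probability p, scale the brightness of every frame by one random factor."""
        if random.random() >= p:
            return imgs
        factor = random.uniform(max(0.0, 1 - brightness), 1 + brightness)
        return [ImageEnhance.Brightness(img).enhance(factor) for img in imgs]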
Both classes are registered as Pipelines for data augmentation in PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":473-507", + "content": " Performs random brightness operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n brightness_imgs: List where each item is a PIL.Image after random brightness.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n transform = ColorJitter(brightness=self.brightness)\n results['imgs'] = [transform(img) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomSaturation(object):\n \"\"\"\n Random Saturation images.\n Args:\n p(float): Random saturation images with the probability p.\n \"\"\"\n def __init__(self, p=0.1, saturation=2):\n self.p = p\n self.saturation = saturation\n def __call__(self, results):\n \"\"\"\n Performs random saturation operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]" + }, + { + "comment": "This code snippet contains two classes: RandomSaturation and RandomHue. Both classes are pipeline transforms for image processing in the PaddleVideo framework. The RandomSaturation class applies random saturation adjustments to images with a certain probability, while the RandomHue class randomly alters hue values of images with another probability. These transforms can be used to augment and enhance the dataset for better model training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":508-545", + "content": " return:\n saturation_imgs: List where each item is a PIL.Image after random saturation.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n transform = ColorJitter(saturation=self.saturation)\n results['imgs'] = [transform(img) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomHue(object):\n \"\"\"\n Random Hue images.\n Args:\n p(float): Random hue images with the probability p.\n \"\"\"\n def __init__(self, p=0.1, hue=0.5):\n self.p = p\n self.hue = hue\n def __call__(self, results):\n \"\"\"\n Performs random hue operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n hue_imgs: List where each item is a PIL.Image after random hue.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:" + }, + { + "comment": "The code defines a pipeline module for data augmentation in PaddleVideo. It includes a RandomGamma class that randomly applies gamma correction to images with a specified probability and gamma value range. The ColorJitter transform is used to apply random changes to the hue of images. The results are stored in a dictionary under the 'imgs' key, either after applying transformations or as is if no transformation is needed. 
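(Gamma adjustment via a 256-entry lookup table, as described in the gamma code below, can be sketched on its own like this; the conversion back to the original image mode is skipped for brevity.)

    from PIL import Image

    def adjust_gamma(img, gamma, gain=1.0):
        """Apply per-channel gamma using PIL's point() lookup table."""
        lut = [int((255 + 1 - 1e-3) * gain * pow(v / 255.0, gamma)) for v in range(256)] * 3
        return img.convert("RGB").point(lut)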
The code also handles adjusting gamma for both numpy arrays and PIL Image objects.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":546-576", + "content": " transform = ColorJitter(hue=self.hue)\n results['imgs'] = [transform(img) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass RandomGamma(object):\n \"\"\"\n Random Gamma images.\n Args:\n p(float): Random gamma images with the probability p.\n gamma (float): Non negative real number, same as `\\\\gamma` in the equation.\n gamma larger than 1 make the shadows darker,\n while gamma smaller than 1 make dark regions lighter.\n \"\"\"\n def __init__(self, p=0.1, gamma=0.2):\n self.p = p\n self.value = [1 - gamma, 1 + gamma]\n self.value[0] = max(self.value[0], 0)\n def _adust_gamma(self, img, gamma, gain=1.0):\n flag = False\n if isinstance(img, np.ndarray):\n flag = True\n img = Image.fromarray(img)\n input_mode = img.mode\n img = img.convert(\"RGB\")\n gamma_map = [\n int((255 + 1 - 1e-3) * gain * pow(ele / 255.0, gamma))" + }, + { + "comment": "This code is defining a pipeline for image augmentation, specifically adjusting gamma values randomly. It checks if a random number falls below the threshold and applies a random gamma adjustment to each image in the input list. If not, it leaves the images unchanged. Finally, it registers an Image2Array class that converts PIL.Image to Numpy array with transposed dimensions from 'dhwc' to 'dchw'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":577-610", + "content": " for ele in range(256)\n ] * 3\n img = img.point(\n gamma_map) # use PIL's point-function to accelerate this part\n img = img.convert(input_mode)\n if flag:\n img = np.array(img)\n return img\n def __call__(self, results):\n \"\"\"\n Performs random gamma operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n gamma_imgs: List where each item is a PIL.Image after random gamma.\n \"\"\"\n imgs = results['imgs']\n v = random.random()\n if v < self.p:\n gamma = random.uniform(self.value[0], self.value[1])\n results['imgs'] = [self._adust_gamma(img, gamma) for img in imgs]\n else:\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass Image2Array(object):\n \"\"\"\n transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'.\n Args:" + }, + { + "comment": "This code is part of a class that performs Image to NumpyArray operations. It initializes with the option to transpose or not, and specifies the data format as either 'tchw' or 'cthw'. The class checks if the backend is 'pyav', then transposes the images accordingly. If 'transpose' is True and 'data_format' is 'tchw', it transposes the images from (0, 3, 1, 2) to (0, 3, 1, 2), resulting in 'tchw'. Otherwise, if 'transpose' is True and 'data_format' is 'cthw', it transposes the images from (3, 0, 1, 2) to (3, 0, 1, 2), resulting in 'cthw'. 
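(In plainer terms, the transpose step takes a stacked [T, H, W, C] array to either [T, C, H, W] ('tchw') or [C, T, H, W] ('cthw'); a small NumPy example with illustrative shapes.)

    import numpy as np

    frames = np.zeros((8, 224, 224, 3), dtype=np.float32)  # [T, H, W, C]
    tchw = frames.transpose(0, 3, 1, 2)                     # [T, C, H, W]
    cthw = frames.transpose(3, 0, 1, 2)                     # [C, T, H, W]
    assert tchw.shape == (8, 3, 224, 224) and cthw.shape == (3, 8, 224, 224)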
The transposed images are then stored back into 'imgs'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":611-637", + "content": " transpose: whether to transpose or not, default True, False for slowfast.\n \"\"\"\n def __init__(self, transpose=True, data_format='tchw'):\n assert data_format in [\n 'tchw', 'cthw'\n ], f\"Target format must in ['tchw', 'cthw'], but got {data_format}\"\n self.transpose = transpose\n self.data_format = data_format\n def __call__(self, results):\n \"\"\"\n Performs Image to NumpyArray operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n np_imgs: Numpy array.\n \"\"\"\n imgs = results['imgs']\n if 'backend' in results and results[\n 'backend'] == 'pyav': # [T,H,W,C] in [0, 1]\n if self.transpose:\n if self.data_format == 'tchw':\n t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw\n else:\n t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw\n results['imgs'] = t_imgs" + }, + { + "comment": "This code is defining a class for normalization in PaddleVideo's loader pipelines. It takes mean and std values as arguments to normalize the image data, and allows for transpose operation depending on data_format. The tensor_shape parameter is optional with default value [3,1,1] for standard usage or [1,1,1,3] for slowfast support. Inplace flag can be set to True to perform in-place operations if desired.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":638-664", + "content": " else:\n t_imgs = np.stack(imgs).astype('float32')\n if self.transpose:\n if self.data_format == 'tchw':\n t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw\n else:\n t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw\n results['imgs'] = t_imgs\n return results\n@PIPELINES.register()\nclass Normalization(object):\n \"\"\"\n Normalization.\n Args:\n mean(Sequence[float]): mean values of different channels.\n std(Sequence[float]): std values of different channels.\n tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3]\n \"\"\"\n def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False):\n if not isinstance(mean, Sequence):\n raise TypeError(\n f'Mean must be list, tuple or np.ndarray, but got {type(mean)}')\n if not isinstance(std, Sequence):\n raise TypeError(\n f'Std must be list, tuple or np.ndarray, but got {type(std)}')" + }, + { + "comment": "This code defines a class for normalizing images. It takes mean and std values as inputs, which are used for image normalization. If inplace is set to False, it converts the input into numpy arrays with appropriate shapes and data types. The __call__ method performs normalization on the given results. If inplace is True, it uses the existing array and avoids making copies. 
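(The non-inplace path amounts to scaling frames to [0, 1] and broadcasting a per-channel mean/std reshaped to tensor_shape; a sketch over [T, C, H, W] data, using the common ImageNet statistics purely as example values.)

    import numpy as np

    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(3, 1, 1)

    frames = np.random.randint(0, 256, size=(8, 3, 224, 224)).astype(np.float32)  # [T, C, H, W]
    normed = (frames / 255.0 - mean) / std  # mean/std broadcast over the channel axis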
The method calculates mean and std values for normalization and applies them to each image in the results using cv2.subtract.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":666-692", + "content": " self.inplace = inplace\n if not inplace:\n self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)\n self.std = np.array(std).reshape(tensor_shape).astype(np.float32)\n else:\n self.mean = np.array(mean, dtype=np.float32)\n self.std = np.array(std, dtype=np.float32)\n def __call__(self, results):\n \"\"\"\n Performs normalization operations.\n Args:\n imgs: Numpy array.\n return:\n np_imgs: Numpy array after normalization.\n \"\"\"\n if self.inplace:\n n = len(results['imgs'])\n h, w, c = results['imgs'][0].shape\n norm_imgs = np.empty((n, h, w, c), dtype=np.float32)\n for i, img in enumerate(results['imgs']):\n norm_imgs[i] = img\n for img in norm_imgs: # [n,h,w,c]\n mean = np.float64(self.mean.reshape(1, -1)) # [1, 3]\n stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3]\n cv2.subtract(img, mean, img)" + }, + { + "comment": "This code applies image normalization and potentially scales the images while preserving aspect ratio, with options for random scaling. This is part of a PaddleVideo pipeline, likely for preprocessing input data before feeding it to a model for training or inference. It can be used with different backends such as \"cv2\" or \"pyav\", and returns the processed image results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":693-721", + "content": " cv2.multiply(img, stdinv, img)\n else:\n imgs = results['imgs']\n norm_imgs = imgs / 255.0\n norm_imgs -= self.mean\n norm_imgs /= self.std\n if 'backend' in results and results['backend'] == 'pyav':\n norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32)\n results['imgs'] = norm_imgs\n return results\n@PIPELINES.register()\nclass JitterScale(object):\n \"\"\"\n Scale image, while the target short size is randomly select between min_size and max_size.\n Args:\n min_size: Lower bound for random sampler.\n max_size: Higher bound for random sampler.\n \"\"\"\n def __init__(self,\n min_size,\n max_size,\n short_cycle_factors=[0.5, 0.7071],\n default_min_size=256):\n self.default_min_size = default_min_size\n self.orig_min_size = self.min_size = min_size\n self.max_size = max_size\n self.short_cycle_factors = short_cycle_factors" + }, + { + "comment": "This code defines a function that performs jitter resize operations. It takes in an image sequence and scales each image based on a random size between min_size and max_size, considering short cycle factors and asserting the minimum length of images. 
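(Stripped of the short-cycle handling, the jittered short-side choice is a single uniform draw followed by an aspect-preserving resize target; a sketch with illustrative default bounds.)

    import math
    import numpy as np

    def jitter_target_size(width, height, min_size=256, max_size=320):
        """Pick a random short side in [min_size, max_size] and return (new_w, new_h)."""
        size = int(round(np.random.uniform(min_size, max_size)))
        if width < height:
            return size, int(math.floor(height / width * size))
        return int(math.floor(width / height * size)), size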
If the backend is pyav, it retrieves height and width separately; otherwise, it gets the size from the first image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":723-748", + "content": " def __call__(self, results):\n \"\"\"\n Performs jitter resize operations.\n Args:\n imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n resized_imgs: List where each item is a PIL.Image after scaling.\n \"\"\"\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.min_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_min_size))\n else:\n self.min_size = self.orig_min_size\n imgs = results['imgs']\n size = int(round(np.random.uniform(self.min_size, self.max_size)))\n assert (len(imgs) >= 1), \\\n \"len(imgs):{} should be larger than 1\".format(len(imgs))\n if 'backend' in results and results['backend'] == 'pyav':\n height, width = imgs.shape[2:]\n else:\n width, height = imgs[0].size" + }, + { + "comment": "This code resizes images to a specified size (width or height equals size). It checks if the image is loaded by PyAV and performs the resize operation using F.interpolate for PyAV-loaded images, otherwise it uses PIL's Image.resize function for other images. The resized images are added to 'imgs' in the results dictionary and returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":749-773", + "content": " if (width <= height and width == size) or (height <= width\n and height == size):\n return results\n new_width = size\n new_height = size\n if width < height:\n new_height = int(math.floor((float(height) / width) * size))\n else:\n new_width = int(math.floor((float(width) / height) * size))\n if 'backend' in results and results['backend'] == 'pyav':\n frames_resize = F.interpolate(imgs,\n size=(new_height, new_width),\n mode=\"bilinear\",\n align_corners=False) # [c,t,h,w]\n else:\n frames_resize = []\n for j in range(len(imgs)):\n img = imgs[j]\n scale_img = img.resize((new_width, new_height), Image.BILINEAR)\n frames_resize.append(scale_img)\n results['imgs'] = frames_resize\n return results" + }, + { + "comment": "This code defines a MultiCenterCrop class that performs center crop, left center crop, and right center crop operations on images. It takes a target size as input and returns the cropped images. The function checks if the image size is larger than the target size before performing the operation. 
If the image size is smaller, it throws an assertion error.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":776-804", + "content": "@PIPELINES.register()\nclass MultiCenterCrop(object):\n \"\"\"\n center crop, left center crop right center crop\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = target_size\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n h, w = imgs.shape[2:]\n else:\n w, h = imgs[0].size\n th, tw = self.target_size, self.target_size\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size\".format(\n w, h, self.target_size)" + }, + { + "comment": "This code is performing image cropping for a specific backend (pyav) and storing the results in three separate lists: crop_imgs_center, crop_imgs_left, and crop_imgs_right. The cropping is done based on the size of the original image compared to the target size, with different crops for center, left, and right areas.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":806-833", + "content": " crop_images = []\n #just for tensor\n crop_imgs_center = []\n crop_imgs_left = []\n crop_imgs_right = []\n if 'backend' in results and results['backend'] == 'pyav':\n #center_corp\n x1 = 0\n if w > self.target_size:\n x1 = int((w - self.target_size) / 2.0)\n y1 = 0\n if h > self.target_size:\n y1 = int((h - self.target_size) / 2.0)\n crop_imgs_center = imgs[:, :, y1:y1 + th,\n x1:x1 + tw].numpy() # [C, T, th, tw]\n #left_crop\n x1 = 0\n y1 = 0\n if h > self.target_size:\n y1 = int((h - self.target_size) / 2.0)\n crop_imgs_left = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy()\n #right_crop\n x1 = 0\n y1 = 0\n if w > self.target_size:\n x1 = w - self.target_size\n if h > self.target_size:\n y1 = int((h - self.target_size) / 2.0)" + }, + { + "comment": "This code defines a MultiCrop pipeline that randomly crops an image into three parts: center, left, and right. 
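(The three deterministic windows used here can be written down directly; a sketch returning (x1, y1) offsets for a square crop of side target, helper name illustrative.)

    def three_crop_offsets(img_w, img_h, target):
        y_center = max(0, (img_h - target) // 2)
        return [
            (max(0, (img_w - target) // 2), y_center),  # center crop
            (0, y_center),                              # left crop
            (max(0, img_w - target), y_center),         # right crop
        ]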
The cropped images are concatenated horizontally and converted to Paddle Tensor before returning the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":834-864", + "content": " crop_imgs_right = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy()\n crop_imgs = np.concatenate(\n (crop_imgs_center, crop_imgs_left, crop_imgs_right), axis=1)\n crop_images = paddle.to_tensor(crop_imgs)\n else:\n x1 = 0\n if w > self.target_size:\n x1 = random.randint(0, w - tw)\n y1 = 0\n if h > self.target_size:\n y1 = random.randint(0, h - th)\n for img in imgs:\n if w == tw and h == th:\n crop_images.append(img)\n else:\n crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))\n results['imgs'] = crop_images\n return results\n@PIPELINES.register()\nclass MultiCrop(object):\n \"\"\"\n Random crop image.\n This operation can perform multi-crop during multi-clip test, as in slowfast model.\n Args:\n target_size(int): Random crop a square with the target_size from an image.\n \"\"\"\n def __init__(self,\n target_size," + }, + { + "comment": "The code initializes an augmentation class with parameters for target size, short cycle factors, default crop size, and test mode. It then defines a __call__ method that performs random cropping operations on images based on the provided parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":865-890", + "content": " default_crop_size=224,\n short_cycle_factors=[0.5, 0.7071],\n test_mode=False):\n self.orig_target_size = self.target_size = target_size\n self.short_cycle_factors = short_cycle_factors\n self.default_crop_size = default_crop_size\n self.test_mode = test_mode\n def __call__(self, results):\n \"\"\"\n Performs random crop operations.\n Args:\n imgs: List where each item is a PIL.Image.\n For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]\n return:\n crop_imgs: List where each item is a PIL.Image after random crop.\n \"\"\"\n imgs = results['imgs']\n spatial_sample_index = results['spatial_sample_index']\n spatial_num_clips = results['spatial_num_clips']\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx in [0, 1]:\n self.target_size = int(\n round(self.short_cycle_factors[short_cycle_idx] *\n self.default_crop_size))" + }, + { + "comment": "This code checks if the image size matches the target size. If it does, it returns the results. If not, it generates crops for multi-crop testing mode or a single crop for non-testing mode based on random offsets. 
The code also handles the case where the target size is determined from a saved value before the call.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":891-913", + "content": " else:\n self.target_size = self.orig_target_size # use saved value before call\n w, h = imgs[0].size\n if w == self.target_size and h == self.target_size:\n return results\n assert (w >= self.target_size) and (h >= self.target_size), \\\n \"image width({}) and height({}) should be larger than crop size({},{})\".format(w, h, self.target_size, self.target_size)\n frames_crop = []\n if not self.test_mode:\n x_offset = random.randint(0, w - self.target_size)\n y_offset = random.randint(0, h - self.target_size)\n else: # multi-crop\n x_gap = int(\n math.ceil((w - self.target_size) / (spatial_num_clips - 1)))\n y_gap = int(\n math.ceil((h - self.target_size) / (spatial_num_clips - 1)))\n if h > w:\n x_offset = int(math.ceil((w - self.target_size) / 2))\n if spatial_sample_index == 0:\n y_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:" + }, + { + "comment": "This code calculates the crop offsets for a set of images based on their size and target size. If the aspect ratio is preserved, it determines the y_offset, otherwise, it calculates the x and y offsets separately for each spatial sample index. The resulting cropped images are stored in frames\\_crop and added to results['imgs']. PackOutput is a pipeline register that takes an alpha argument and is used in slowfast model to get slow pathway from fast pathway based on alpha factor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":914-943", + "content": " y_offset = h - self.target_size\n else:\n y_offset = y_gap * spatial_sample_index\n else:\n y_offset = int(math.ceil((h - self.target_size) / 2))\n if spatial_sample_index == 0:\n x_offset = 0\n elif spatial_sample_index == spatial_num_clips - 1:\n x_offset = w - self.target_size\n else:\n x_offset = x_gap * spatial_sample_index\n for img in imgs:\n nimg = img.crop((x_offset, y_offset, x_offset + self.target_size,\n y_offset + self.target_size))\n frames_crop.append(nimg)\n results['imgs'] = frames_crop\n return results\n@PIPELINES.register()\nclass PackOutput(object):\n \"\"\"\n In slowfast model, we want to get slow pathway from fast pathway based on\n alpha factor.\n Args:\n alpha(int): temporal length of fast/slow\n \"\"\"\n def __init__(self, alpha):\n self.alpha = alpha" + }, + { + "comment": "The code defines a GroupFullResSample pipeline that selects and groups slow and fast pathways from input images. 
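(Deriving the slow pathway from the fast one, as described, is a linspace over the temporal axis followed by a THWC-to-CTHW transpose; a NumPy sketch, function name illustrative.)

    import numpy as np

    def pack_slowfast(fast, alpha=4):
        """fast: [T, H, W, C] frames; returns [slow, fast], each as [C, T', H, W]."""
        t = fast.shape[0]
        idx = np.linspace(0, t - 1, t // alpha).astype("int64")  # subsample every ~alpha-th frame
        slow = fast[idx]
        return [slow.transpose(3, 0, 1, 2), fast.transpose(3, 0, 1, 2)]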
It resizes the pathways to the specified crop_size, performs horizontal flips if flip is True, and stores them in frames_list before updating results['imgs'].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":945-974", + "content": " def __call__(self, results):\n fast_pathway = results['imgs']\n # sample num points between start and end\n slow_idx_start = 0\n slow_idx_end = fast_pathway.shape[0] - 1\n slow_idx_num = fast_pathway.shape[0] // self.alpha\n slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end,\n slow_idx_num).astype(\"int64\")\n slow_pathway = fast_pathway[slow_idxs_select]\n # T H W C -> C T H W.\n slow_pathway = slow_pathway.transpose(3, 0, 1, 2)\n fast_pathway = fast_pathway.transpose(3, 0, 1, 2)\n # slow + fast\n frames_list = [slow_pathway, fast_pathway]\n results['imgs'] = frames_list\n return results\n@PIPELINES.register()\nclass GroupFullResSample(object):\n def __init__(self, crop_size, flip=False):\n self.crop_size = crop_size if not isinstance(crop_size, int) else (\n crop_size, crop_size)\n self.flip = flip\n def __call__(self, results):\n img_group = results['imgs']" + }, + { + "comment": "This code performs image augmentation by creating a list of different crops and flips from the input image group. It calculates the crop size and step sizes, creates offsets for each crop position, iterates over the input images to create normal and flipped crops, and stores them in separate groups before combining them into the oversample_group. Finally, it adds the oversample_group to the results dictionary and returns the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":976-1006", + "content": " image_w, image_h = img_group[0].size\n crop_w, crop_h = self.crop_size\n w_step = (image_w - crop_w) // 4\n h_step = (image_h - crop_h) // 4\n offsets = list()\n offsets.append((0 * w_step, 2 * h_step)) # left\n offsets.append((4 * w_step, 2 * h_step)) # right\n offsets.append((2 * w_step, 2 * h_step)) # center\n oversample_group = list()\n for o_w, o_h in offsets:\n normal_group = list()\n flip_group = list()\n for i, img in enumerate(img_group):\n crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))\n normal_group.append(crop)\n if self.flip:\n flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)\n flip_group.append(flip_crop)\n oversample_group.extend(normal_group)\n if self.flip:\n oversample_group.extend(flip_group)\n results['imgs'] = oversample_group\n return results\n@PIPELINES.register()" + }, + { + "comment": "This code defines a class \"TenCrop\" which crops a given image into 10 cropped images, taking the top-left corner, bottom-left corner, top-right corner, bottom-right corner, and center of the image. It achieves this by using the target size for crop and calculating the width and height steps based on the original image's dimensions. 
The class also includes a __call__ method which takes a results dictionary as input and returns an array of cropped images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1007-1036", + "content": "class TenCrop:\n \"\"\"\n Crop out 5 regions (4 corner points + 1 center point) from the picture,\n and then flip the cropping result to get 10 cropped images, which can make the prediction result more robust.\n Args:\n target_size(int | tuple[int]): (w, h) of target size for crop.\n \"\"\"\n def __init__(self, target_size):\n self.target_size = (target_size, target_size)\n def __call__(self, results):\n imgs = results['imgs']\n img_w, img_h = imgs[0].size\n crop_w, crop_h = self.target_size\n w_step = (img_w - crop_w) // 4\n h_step = (img_h - crop_h) // 4\n offsets = [\n (0, 0),\n (4 * w_step, 0),\n (0, 4 * h_step),\n (4 * w_step, 4 * h_step),\n (2 * w_step, 2 * h_step),\n ]\n img_crops = list()\n for x_offset, y_offset in offsets:\n crop = [\n img.crop(\n (x_offset, y_offset, x_offset + crop_w, y_offset + crop_h))\n for img in imgs\n ]" + }, + { + "comment": "This code is for the \"UniformCrop\" pipeline, which performs uniform spatial sampling on images by selecting three regions: two ends of the long side and the middle position (left/right or top/bottom). The target size can be provided as an integer for square crop or a tuple for specific width and height. It uses either OpenCV or PIL for image manipulation based on the 'backend' argument, which defaults to OpenCV.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1037-1068", + "content": " crop_fliped = [\n timg.transpose(Image.FLIP_LEFT_RIGHT) for timg in crop\n ]\n img_crops.extend(crop)\n img_crops.extend(crop_fliped)\n results['imgs'] = img_crops\n return results\n@PIPELINES.register()\nclass UniformCrop:\n \"\"\"\n Perform uniform spatial sampling on the images,\n select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions.\n Args:\n target_size(int | tuple[int]): (w, h) of target size for crop.\n \"\"\"\n def __init__(self, target_size, backend='cv2'):\n if isinstance(target_size, tuple):\n self.target_size = target_size\n elif isinstance(target_size, int):\n self.target_size = (target_size, target_size)\n else:\n raise TypeError(\n f'target_size must be int or tuple[int], but got {type(target_size)}'\n )\n self.backend = backend\n def __call__(self, results):\n imgs = results['imgs']" + }, + { + "comment": "This code is determining the image offsets for cropping based on the target size and the original image dimensions. It supports two backends: 'pyav' and 'pillow'. If the backend is 'pyav', it extracts the height and width of the image. If the backend is 'pillow', it retrieves the width and height from the first image. 
The code then calculates the step size for cropping based on whether the target size matches the image dimensions or not, and finally constructs a list of offsets to crop the images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1069-1098", + "content": " if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n img_h, img_w = imgs.shape[2:]\n elif self.backend == 'pillow':\n img_w, img_h = imgs[0].size\n else:\n img_h, img_w = imgs[0].shape[:2]\n crop_w, crop_h = self.target_size\n if crop_h == img_h:\n w_step = (img_w - crop_w) // 2\n offsets = [\n (0, 0),\n (w_step * 2, 0),\n (w_step, 0),\n ]\n elif crop_w == img_w:\n h_step = (img_h - crop_h) // 2\n offsets = [\n (0, 0),\n (0, h_step * 2),\n (0, h_step),\n ]\n else:\n raise ValueError(\n f\"img_w({img_w}) == crop_w({crop_w}) or img_h({img_h}) == crop_h({crop_h})\"\n )\n img_crops = []\n if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w]\n for x_offset, y_offset in offsets:\n crop = imgs[:, :, y_offset:y_offset + crop_h," + }, + { + "comment": "The code is defining a pipeline for image augmentation, including cropping and resizing operations. If the backend is 'pillow', it performs cropping using pixel coordinates; otherwise, it uses slice notation to crop images. The results are stored in 'img_crops' and returned as 'results['imgs']'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1099-1126", + "content": " x_offset:x_offset + crop_w]\n img_crops.append(crop)\n img_crops = paddle.concat(img_crops, axis=1)\n else:\n if self.backend == 'pillow':\n for x_offset, y_offset in offsets:\n crop = [\n img.crop((x_offset, y_offset, x_offset + crop_w,\n y_offset + crop_h)) for img in imgs\n ]\n img_crops.extend(crop)\n else:\n for x_offset, y_offset in offsets:\n crop = [\n img[y_offset:y_offset + crop_h,\n x_offset:x_offset + crop_w] for img in imgs\n ]\n img_crops.extend(crop)\n results['imgs'] = img_crops\n return results\n@PIPELINES.register()\nclass GroupResize(object):\n def __init__(self, height, width, scale, K, mode='train'):\n self.height = height\n self.width = width\n self.scale = scale" + }, + { + "comment": "This code initializes a resize transformation for image augmentation in PaddleVideo. The transformations are applied based on the scale and mode ('infer' or 'train') specified. For infer mode, it processes color images by applying resizing to each scale level. 
In train mode, it calculates the K matrix and its inverse for each scale level and stores them in the results dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1127-1155", + "content": " self.resize = {}\n self.K = np.array(K, dtype=np.float32)\n self.mode = mode\n for i in range(self.scale):\n s = 2**i\n self.resize[i] = paddle.vision.transforms.Resize(\n (self.height // s, self.width // s), interpolation='lanczos')\n def __call__(self, results):\n if self.mode == 'infer':\n imgs = results['imgs']\n for k in list(imgs): # (\"color\", 0, -1)\n if \"color\" in k or \"color_n\" in k:\n n, im, _ = k\n for i in range(self.scale):\n imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)])\n else:\n imgs = results['imgs']\n for scale in range(self.scale):\n K = self.K.copy()\n K[0, :] *= self.width // (2**scale)\n K[1, :] *= self.height // (2**scale)\n inv_K = np.linalg.pinv(K)\n imgs[(\"K\", scale)] = K\n imgs[(\"inv_K\", scale)] = inv_K\n for k in list(imgs):" + }, + { + "comment": "This code applies color jitter augmentation to the images by randomly adjusting brightness, contrast, saturation, and hue. The ColorJitter class initializes a colorjitter transform with specified parameters for train mode or test mode. The __call__ method is called on each image in the results dictionary and checks if the color augmentation should be applied. If true, the color jittered image is returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1156-1192", + "content": " if \"color\" in k or \"color_n\" in k:\n n, im, i = k\n for i in range(self.scale):\n imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)])\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass ColorJitter(object):\n \"\"\"Randomly change the brightness, contrast, saturation and hue of an image.\n \"\"\"\n def __init__(self,\n brightness=0,\n contrast=0,\n saturation=0,\n hue=0,\n mode='train',\n p=0.5,\n keys=None):\n self.mode = mode\n self.colorjitter = paddle.vision.transforms.ColorJitter(\n brightness, contrast, saturation, hue)\n self.p = p\n def __call__(self, results):\n \"\"\"\n Args:\n results (PIL Image): Input image.\n Returns:\n PIL Image: Color jittered image.\n \"\"\"\n do_color_aug = random.random() > self.p\n imgs = results['imgs']" + }, + { + "comment": "This code is part of a data augmentation pipeline, specifically handling color and flip transformations for images. It iterates over the 'imgs' dictionary to find and organize color images, applying color jitter if required. Then it removes specific color images based on the mode (\"train\" or \"test\"). Finally, it returns the updated results dictionary with the modified image groupings. 
The GroupRandomFlip class performs random flipping of images with a specified probability.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1193-1226", + "content": " for k in list(imgs):\n f = imgs[k]\n if \"color\" in k or \"color_n\" in k:\n n, im, i = k\n imgs[(n, im, i)] = f\n if do_color_aug:\n imgs[(n + \"_aug\", im, i)] = self.colorjitter(f)\n else:\n imgs[(n + \"_aug\", im, i)] = f\n if self.mode == \"train\":\n for i in results['frame_idxs']:\n del imgs[(\"color\", i, -1)]\n del imgs[(\"color_aug\", i, -1)]\n del imgs[(\"color_n\", i, -1)]\n del imgs[(\"color_n_aug\", i, -1)]\n else:\n for i in results['frame_idxs']:\n del imgs[(\"color\", i, -1)]\n del imgs[(\"color_aug\", i, -1)]\n results['img'] = imgs\n return results\n@PIPELINES.register()\nclass GroupRandomFlip(object):\n def __init__(self, p=0.5):\n self.p = p\n def __call__(self, results):\n imgs = results['imgs']\n do_flip = random.random() > self.p\n if do_flip:" + }, + { + "comment": "This code is part of a machine learning pipeline that processes image data. It first flips left-right some images marked with \"color\" or \"color_n\". Then, it converts certain color and depth images to floats and normalizes them to [0,1] for training. Finally, it returns the updated image dictionary as part of the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1227-1256", + "content": " for k in list(imgs):\n if \"color\" in k or \"color_n\" in k:\n n, im, i = k\n imgs[(n, im,\n i)] = imgs[(n, im,\n i)].transpose(Image.FLIP_LEFT_RIGHT)\n if \"depth_gt\" in imgs:\n imgs['depth_gt'] = np.array(np.fliplr(imgs['depth_gt']))\n results['imgs'] = imgs\n return results\n@PIPELINES.register()\nclass ToArray(object):\n def __init__(self):\n pass\n def __call__(self, results):\n imgs = results['imgs']\n for k in list(imgs):\n if \"color\" in k or \"color_n\" in k or \"color_aug\" in k or \"color_n_aug\" in k:\n n, im, i = k\n imgs[(n, im,\n i)] = np.array(imgs[(n, im, i)]).astype('float32') / 255.0\n imgs[(n, im, i)] = imgs[(n, im, i)].transpose((2, 0, 1))\n if \"depth_gt\" in imgs:\n imgs['depth_gt'] = np.array(imgs['depth_gt']).astype('float32')\n results['imgs'] = imgs" + }, + { + "comment": "This code defines a class called YowoAug for image augmentation. It takes in parameters such as target size, jitter, hue, saturation, exposure, and valid mode. The class has methods to randomly scale the image, distort the image by changing hue, saturation, and exposure levels, and returns the augmented results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1257-1293", + "content": " return results\n@PIPELINES.register()\nclass YowoAug(object):\n def __init__(self, target_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5, valid_mode=False):\n self.shape = (target_size, target_size)\n self.jitter = jitter\n self.hue = hue\n self.saturation = saturation\n self.exposure = exposure\n self.valid_mode = valid_mode\n def _rand_scale(self, s):\n scale = random.uniform(1, s)\n if (random.randint(1, 10000) % 2):\n return scale\n return 1. 
/ scale\n def _distort_image(self, im, hue, sat, val):\n im = im.convert('HSV')\n cs = list(im.split())\n cs[1] = cs[1].point(lambda i: i * sat)\n cs[2] = cs[2].point(lambda i: i * val)\n def _change_hue(x):\n x += hue * 255\n if x > 255:\n x -= 255\n if x < 0:\n x += 255\n return x\n cs[0] = cs[0].point(_change_hue)\n im = Image.merge(im.mode, tuple(cs))\n im = im.convert('RGB')" + }, + { + "comment": "The code snippet defines several functions related to image augmentation and truth detection in an object detection task. The \"constrain_image\" function ensures the image is within a specific range of values. \"random_distort_image\" applies distortion to the input image randomly. \"read_truths_args\" reads the ground truth boxes from a file, scales and transforms them accordingly, and checks if the box scale is smaller than the minimum required scale before adding it to new_truths. Lastly, \"_fill_truth_detection\" fills in the ground truth detection with additional parameters like flip, dx, dy, sx, and sy.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1294-1321", + "content": " # constrain_image(im)\n return im\n def _random_distort_image(self, im, dhue, dsat, dexp):\n res = self._distort_image(im, dhue, dsat, dexp)\n return res\n def _read_truths_args(self, lab_path, min_box_scale):\n truths = np.loadtxt(lab_path)\n truths = np.reshape(truths, (truths.size // 5, 5))\n new_truths = []\n for i in range(truths.shape[0]):\n cx = (truths[i][1] + truths[i][3]) / (2 * 320)\n cy = (truths[i][2] + truths[i][4]) / (2 * 240)\n imgw = (truths[i][3] - truths[i][1]) / 320\n imgh = (truths[i][4] - truths[i][2]) / 240\n truths[i][0] = truths[i][0] - 1\n truths[i][1] = cx\n truths[i][2] = cy\n truths[i][3] = imgw\n truths[i][4] = imgh\n if truths[i][3] < min_box_scale:\n continue\n new_truths.append([truths[i][0], truths[i][1], truths[i][2], truths[i][3], truths[i][4]])\n return np.array(new_truths)\n def _fill_truth_detection(self, labpath, flip, dx, dy, sx, sy):" + }, + { + "comment": "The code resizes and normalizes bounding box coordinates from a loaded label file, adjusts them based on image size scaling factors and offsets, and updates the bounding boxes accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1322-1352", + "content": " max_boxes = 50\n label = np.zeros((max_boxes, 5))\n bs = np.loadtxt(labpath)\n bs = np.reshape(bs, (-1, 5))\n for i in range(bs.shape[0]):\n cx = (bs[i][1] + bs[i][3]) / (2 * 320)\n cy = (bs[i][2] + bs[i][4]) / (2 * 240)\n imgw = (bs[i][3] - bs[i][1]) / 320\n imgh = (bs[i][4] - bs[i][2]) / 240\n bs[i][0] = bs[i][0] - 1\n bs[i][1] = cx\n bs[i][2] = cy\n bs[i][3] = imgw\n bs[i][4] = imgh\n cc = 0\n for i in range(bs.shape[0]):\n x1 = bs[i][1] - bs[i][3] / 2\n y1 = bs[i][2] - bs[i][4] / 2\n x2 = bs[i][1] + bs[i][3] / 2\n y2 = bs[i][2] + bs[i][4] / 2\n x1 = min(0.999, max(0, x1 * sx - dx))\n y1 = min(0.999, max(0, y1 * sy - dy))\n x2 = min(0.999, max(0, x2 * sx - dx))\n y2 = min(0.999, max(0, y2 * sy - dy))\n bs[i][1] = (x1 + x2) / 2\n bs[i][2] = (y1 + y2) / 2\n bs[i][3] = (x2 - x1)" + }, + { + "comment": "This code initializes a list of bounding boxes, applies jitter if not in valid mode (randomly adjusts image size and position), reshapes the list into a single array of bounding boxes, and returns it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1353-1389", + 
"content": " bs[i][4] = (y2 - y1)\n if flip:\n bs[i][1] = 0.999 - bs[i][1]\n if bs[i][3] < 0.001 or bs[i][4] < 0.001:\n continue\n label[cc] = bs[i]\n cc += 1\n if cc >= 50:\n break\n label = np.reshape(label, (-1))\n return label\n def __call__(self, results):\n clip = results['imgs']\n frame_num = len(clip)\n oh = clip[0].height\n ow = clip[0].width\n labpath = results['filename'].replace('jpg', 'txt').replace('rgb-images', 'labels')\n if not self.valid_mode:\n dw = int(ow * self.jitter)\n dh = int(oh * self.jitter)\n pleft = random.randint(-dw, dw)\n pright = random.randint(-dw, dw)\n ptop = random.randint(-dh, dh)\n pbot = random.randint(-dh, dh)\n swidth = ow - pleft - pright\n sheight = oh - ptop - pbot\n sx = float(swidth) / ow\n sy = float(sheight) / oh\n dx = (float(pleft) / ow) / sx" + }, + { + "comment": "This code performs image augmentations and label manipulations. It applies random crop, resize, flip (horizontally), and distortion to the image(s) with a certain probability. The label is either filled from the truth detection or set to zero vector depending on the size of the extracted truth data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1390-1418", + "content": " dy = (float(ptop) / oh) / sy\n flip = random.randint(1, 10000) % 2\n dhue = random.uniform(-self.hue, self.hue)\n dsat = self._rand_scale(self.saturation)\n dexp = self._rand_scale(self.exposure)\n # Augment\n cropped = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in clip]\n sized = [img.resize(self.shape) for img in cropped]\n if flip:\n sized = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in sized]\n clip = [self._random_distort_image(img, dhue, dsat, dexp) for img in sized]\n label = self._fill_truth_detection(labpath, flip, dx, dy, 1. / sx, 1. / sy)\n else:\n label = np.zeros([50 * 5])\n tmp = self._read_truths_args(labpath, 8.0 / clip[0].width).astype('float32')\n tmp = np.reshape(tmp, [-1])\n tsz = tmp.size\n if tsz > 50 * 5:\n label = tmp[0:50 * 5]\n elif tsz > 0:\n label[0:tsz] = tmp" + }, + { + "comment": "Resizes images to a specific shape, converts them to float32 type and scales by 255.0, concatenates frames into a single array, transposes array dimensions, stores image and label arrays in 'results', returns results", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/augmentations.py\":1419-1426", + "content": " clip = [img.resize(self.shape) for img in clip]\n clip = [np.asarray(img).astype('float32') / 255.0 for img in clip]\n clip = np.concatenate(clip, 0).reshape([frame_num, 224, 224, 3])\n clip = np.transpose(clip, [3, 0, 1, 2])\n results['imgs'] = clip\n results['labels'] = label\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d1efb8e7-a9de-40ad-ba64-9fab006ae520.json b/docs/doc/d1efb8e7-a9de-40ad-ba64-9fab006ae520.json new file mode 100644 index 000000000..f86de79d1 --- /dev/null +++ b/docs/doc/d1efb8e7-a9de-40ad-ba64-9fab006ae520.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet is a license notice and import statement for the PaddleVideo library in Python. It sets the copyright, licensing information, and imports the version module from the same directory.", + "details": [ + { + "comment": "This code snippet is a license notice and import statement for the PaddleVideo library in Python. 
It sets the copyright, licensing information, and imports the version module from the same directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/__init__.py\":0-16", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom .version import paddlevideo_version" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d3766e63-d8f2-49e9-90d8-d9a35099b357.json b/docs/doc/d3766e63-d8f2-49e9-90d8-d9a35099b357.json new file mode 100644 index 000000000..e0ca96b27 --- /dev/null +++ b/docs/doc/d3766e63-d8f2-49e9-90d8-d9a35099b357.json @@ -0,0 +1,35 @@ +{ + "summary": "The code sets up libraries, initializes DALI and TSN model, creates a dataloader, builds solver, trains model with optimization steps, logs performance metrics, updates learning rates, supports resuming training/finetuning, and saves states at intervals.", + "details": [ + { + "comment": "This code imports necessary libraries and modules, sets up licenses, and imports functions from other files for model building, solver configuration, and additional utility functions. It also defines a loader for TSN-Dali dataset and functions for input data preparation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_dali.py\":0-24", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport time\nimport os.path as osp\nimport paddle\nfrom ..modeling.builder import build_model\nfrom ..solver import build_lr, build_optimizer\nfrom ..utils import do_preciseBN\nfrom paddlevideo.utils import get_logger, coloring\nfrom paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch,\n save, load, mkdir)\nfrom paddlevideo.loader import TSN_Dali_loader, get_input_data" + }, + { + "comment": "This code snippet initializes and trains a DALI (Data Augmentation and Input Pipeline Library) for the TSN model. 
It first constructs the model, creates a Dali dataloader, builds a solver with specified optimizer and learning rate, and then resumes training from the last checkpoint if provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_dali.py\":25-62", + "content": "\"\"\"\nWe only supported DALI training for TSN model now.\n\"\"\"\ndef train_dali(cfg, weights=None, parallel=True):\n \"\"\"Train model entry\n Args:\n \tcfg (dict): configuration.\n weights (str): weights path for finetuning.\n \tparallel (bool): Whether multi-cards training. Default: True.\n \"\"\"\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DALI_LOADER.get('batch_size', 8)\n places = paddle.set_device('gpu')\n model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n mkdir(output_dir)\n # 1. Construct model\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. Construct dali dataloader\n train_loader = TSN_Dali_loader(cfg.DALI_LOADER).build_dali_reader()\n # 3. Construct solver.\n lr = build_lr(cfg.OPTIMIZER.learning_rate, None)\n optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model)\n # Resume\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(output_dir," + }, + { + "comment": "This code snippet is part of a model training pipeline. It first checks if the resume_epoch is 0 or if weights are provided for finetuning, then loads and sets the corresponding state dictionaries for the model and optimizer. The model is trained for specified epochs, with the option to continue from a previous epoch or start from scratch depending on the resume_epoch and weights inputs. It also records reader time during training loop iterations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_dali.py\":63-87", + "content": " model_name + f\"_epoch_{resume_epoch:05d}\")\n resume_model_dict = load(filename + '.pdparams')\n resume_opt_dict = load(filename + '.pdopt')\n model.set_state_dict(resume_model_dict)\n optimizer.set_state_dict(resume_opt_dict)\n # Finetune:\n if weights:\n assert resume_epoch == 0, f\"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it.\"\n model_dict = load(weights)\n model.set_state_dict(model_dict)\n # 4. Train Model\n for epoch in range(0, cfg.epochs):\n if epoch < resume_epoch:\n logger.info(\n f\"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... \"\n )\n continue\n model.train()\n record_list = build_record(cfg.MODEL)\n tic = time.time()\n for i, data in enumerate(train_loader):\n data = get_input_data(data)\n record_list['reader_time'].update(time.time() - tic)" + }, + { + "comment": "This code is training a model. It performs forward, backward pass, and optimization steps before logging performance metrics and updating learning rates. The model takes input data and calculates outputs in 'train' mode. Then, it calculates the average loss from the outputs. Next, it updates gradients using backward propagation, optimizes the model with step and clears gradients. It records log information such as learning rate and batch time for later analysis. The code also checks if there is an interval in the training to log current metrics and provides an instance per second rate (ips) as performance indicator. 
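The forward/backward/minimize sequence described here is the standard Paddle dynamic-graph update; the NVIDIA DALI (Data Loading Library) reader only changes how batches are produced, not the update steps themselves. The toy sketch below uses a hypothetical linear model and random data to mirror steps 4.1-4.3, and omits logging, precise BN, and LR scheduling.

```python
import paddle

# Toy stand-ins for the model and data; not the PaddleVideo training code.
model = paddle.nn.Linear(16, 4)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01,
                                      parameters=model.parameters())

for step in range(3):
    x = paddle.randn([8, 16])
    y = paddle.randint(0, 4, [8])
    logits = model(x)                                    # 4.1 forward
    loss = paddle.nn.functional.cross_entropy(logits, y)
    loss.backward()                                      # 4.2 backward
    optimizer.step()                                     # 4.3 minimize
    optimizer.clear_grad()
    print(f"step {step}: loss={float(loss):.4f}")
```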
Lastly, it updates learning rates using both iteration steps and epoch steps, based on configuration settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_dali.py\":88-115", + "content": " # 4.1 forward\n outputs = model(data, mode='train')\n # 4.2 backward\n avg_loss = outputs['loss']\n avg_loss.backward()\n # 4.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(optimizer._global_learning_rate(),\n batch_size)\n for name, value in outputs.items():\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"train\", ips)\n # learning rate iter step\n if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step" + }, + { + "comment": "This code chunk performs the following actions:\n1. Checks if learning rate should be updated based on iteration count.\n2. Calculates and logs the training instance speed (ips).\n3. Optionally applies precise Batch Normalization (bn) to improve accuracy.\n4. Saves the model's and optimizer's state every 'save_interval' epochs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_dali.py\":116-140", + "content": " if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n # use precise bn to improve acc\n if cfg.get(\"PRECISEBN\") and (epoch % cfg.PRECISEBN.preciseBN_interval\n == 0 or epoch == cfg.epochs - 1):\n do_preciseBN(\n model, train_loader, parallel,\n min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader)))\n # 5. Save model and optimizer\n if epoch % cfg.get(\"save_interval\", 1) == 0 or epoch == cfg.epochs - 1:\n save(\n optimizer.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch+1:05d}.pdopt\"))\n save(\n model.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch+1:05d}.pdparams\"))" + }, + { + "comment": "This line logs the completion of training a specific model using the \"logger.info\" function, indicating that the training process for the specified \"model_name\" has ended.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_dali.py\":142-142", + "content": " logger.info(f'training {model_name} finished')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d42d08af-e62d-41d7-9e7f-56974ebfd3b8.json b/docs/doc/d42d08af-e62d-41d7-9e7f-56974ebfd3b8.json new file mode 100644 index 000000000..f66a0fe98 --- /dev/null +++ b/docs/doc/d42d08af-e62d-41d7-9e7f-56974ebfd3b8.json @@ -0,0 +1,20 @@ +{ + "summary": "The code introduces a FrozenBatchNorm2d class for static batch normalization and DeepLab class as a neural network backbone, allowing freezing of BatchNorm layers. The code also provides methods to get parameters with different learning rates and creates an instance of the model, evaluates it, generates input data, and outputs its shape.", + "details": [ + { + "comment": "This code defines a FrozenBatchNorm2d class and imports necessary modules. 
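A frozen batch norm layer reduces to a fixed per-channel scale and shift computed from the stored statistics. The sketch below is a simplified stand-in for the quoted class: the name `FrozenBN2D` is made up, and a small epsilon is added for numerical safety.

```python
import paddle

class FrozenBN2D(paddle.nn.Layer):
    """Batch norm with fixed buffers: just a per-channel affine transform."""
    def __init__(self, num_channels):
        super().__init__()
        self.register_buffer("weight", paddle.ones([num_channels]))
        self.register_buffer("bias", paddle.zeros([num_channels]))
        self.register_buffer("running_mean", paddle.zeros([num_channels]))
        self.register_buffer("running_var", paddle.ones([num_channels]))

    def forward(self, x):
        scale = self.weight * paddle.rsqrt(self.running_var + 1e-5)
        bias = self.bias - self.running_mean * scale
        return x * scale.reshape([1, -1, 1, 1]) + bias.reshape([1, -1, 1, 1])

bn = FrozenBN2D(8)
out = bn(paddle.randn([2, 8, 4, 4]))
print(out.shape)   # [2, 8, 4, 4]
```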
The class is used to create a batch normalization layer where the parameters are frozen, meaning they will not be updated during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py\":0-25", + "content": "import paddle\nimport paddle.nn as nn\nfrom ..registry import BACKBONES\nfrom EIVideo.paddlevideo.modeling.backbones.aspp_manet import build_aspp\nfrom EIVideo.paddlevideo.modeling.backbones.decoder_manet import build_decoder\nfrom EIVideo.paddlevideo.modeling.backbones.resnet_manet import build_backbone\nclass FrozenBatchNorm2d(nn.Layer):\n def __init__(self, n):\n super(FrozenBatchNorm2d, self).__init__()\n self.register_buffer(\"weight\", paddle.ones(n))\n self.register_buffer(\"bias\", paddle.zeros(n))\n self.register_buffer(\"running_mean\", paddle.zeros(n))\n self.register_buffer(\"running_var\", paddle.ones(n))\n def forward(self, x):\n if x.dtype == paddle.float16:\n self.weight = self.weight.half()\n self.bias = self.bias.half()\n self.running_mean = self.running_mean.half()\n self.running_var = self.running_var.half()\n scale = self.weight * self.running_var.rsqrt()\n bias = self.bias - self.running_mean * scale\n scale = scale.reshape(1, -1, 1, 1)" + }, + { + "comment": "The code defines a DeepLab class as a neural network backbone, which uses other modules (backbone, ASPP, and decoder) for feature extraction and classification. It takes input and returns output after passing through these modules. The freeze_bn method can be called to freeze the BatchNorm layers if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py\":26-60", + "content": " bias = bias.reshape(1, -1, 1, 1)\n return x * scale + bias\n@BACKBONES.register()\nclass DeepLab(nn.Layer):\n def __init__(self,\n backbone='resnet',\n output_stride=16,\n num_classes=21,\n freeze_bn=False,\n pretrained=None):\n super(DeepLab, self).__init__()\n if backbone == 'drn':\n output_stride = 8\n if freeze_bn == True:\n print(\"Use frozen BN in DeepLab\")\n BatchNorm = FrozenBatchNorm2d\n else:\n BatchNorm = nn.BatchNorm2D\n self.backbone = build_backbone(output_stride, BatchNorm, pretrained)\n self.aspp = build_aspp(backbone, output_stride, BatchNorm)\n self.decoder = build_decoder(num_classes, backbone, BatchNorm)\n def forward(self, input):\n x, low_level_feat = self.backbone(input)\n x = self.aspp(x)\n x = self.decoder(x, low_level_feat)\n return x\n def freeze_bn(self):\n for m in self.sublayers():\n if isinstance(m, nn.BatchNorm2D):" + }, + { + "comment": "This code defines a DeepLab model with backbone options, sets the model to evaluation mode, and provides two methods for getting parameters with different learning rates. 
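The two parameter generators exist so the pretrained backbone can be trained with a smaller learning rate than the newly added modules. A minimal sketch, assuming a hypothetical `TinySeg` model with `backbone` and `head` sublayers:

```python
import paddle
import paddle.nn as nn

class TinySeg(nn.Layer):
    """Hypothetical stand-in for DeepLab: backbone at 1x lr, head at 10x lr."""
    def __init__(self):
        super().__init__()
        self.backbone = nn.Conv2D(3, 8, 3, padding=1)
        self.head = nn.Conv2D(8, 2, 1)

def get_1x_lr_params(model):
    # pretrained backbone parameters: keep the base learning rate
    for p in model.backbone.parameters():
        if not p.stop_gradient:
            yield p

def get_10x_lr_params(model):
    # freshly initialised modules: typically given a larger learning rate
    for p in model.head.parameters():
        if not p.stop_gradient:
            yield p

model = TinySeg()
print(len(list(get_1x_lr_params(model))), len(list(get_10x_lr_params(model))))
```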
The main part of the code creates an instance of the model, evaluates it, generates random input data, and outputs its shape.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py\":61-89", + "content": " m.eval()\n def get_1x_lr_params(self):\n modules = [self.backbone]\n for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p\n def get_10x_lr_params(self):\n modules = [self.aspp, self.decoder]\n for i in range(len(modules)):\n for m in modules[i].named_modules():\n if isinstance(m[1], nn.Conv2D) or isinstance(\n m[1], nn.BatchNorm2D):\n for p in m[1].parameters():\n if p.requires_grad:\n yield p\nif __name__ == \"__main__\":\n model = DeepLab(backbone='resnet', output_stride=16)\n model.eval()\n input = paddle.rand([2, 3, 513, 513])\n output = model(input)\n print(output.shape)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d4bc0edc-611c-42b9-b972-d1e7597de4c6.json b/docs/doc/d4bc0edc-611c-42b9-b972-d1e7597de4c6.json new file mode 100644 index 000000000..3b8102a01 --- /dev/null +++ b/docs/doc/d4bc0edc-611c-42b9-b972-d1e7597de4c6.json @@ -0,0 +1,20 @@ +{ + "summary": "This code allows users to download the Kinetics-400 dataset using two methods and provides a script for extracting frames. The training and validation sets require around 135GB and 2TB of storage space, respectively. Two data categories are included with list file links.", + "details": [ + { + "comment": "This code provides information on downloading the Kinetics-400 dataset, which is commonly used in video tasks. It offers two methods for download: Baidu network disk and script download. The code also directs to official resources like the Kinetics website and ActivityNet repository for further details.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/k400.md\":0-26", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/k400.md) | English\n# Kinetics-400 Preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n- [Frames](#Frames)\n---\n## Introduction\nKinetics-400 is a commonly used benchmark dataset in the video field. Please refer to its official website [Kinetics](https://deepmind.com/research/open-source/kinetics) for details. You can refer to the official address [ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics), and use the download script provided to download the dataset.\n## Download\nConsidering the difficulty of downloading the K400 data set, we provide two download methods: (1) Baidu network disk download (2) Script download\n### Baidu SkyDrive Download\nNetdisk link: https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg\nExtraction code: `ppvi`\n### Script download\n- Download the training set link list file [train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list) and the validation set link list file [val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list)." + }, + { + "comment": "This script downloads training and validation sets from provided links, unzips them, and adds the data paths to respective list files. Due to broken video links, approximately 135GB of storage space is required. 
The frames extracted from videos help in accelerating network training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/k400.md\":28-64", + "content": "Write the download script `download.sh` as follows:\n```bash\nfile=$1\nwhile read line \ndo\n wget \"$line\"\ndone <$file\n```\nDownload training set command:\n```bash\nbash download.sh train_link.list\n```\nDownload verification set command:\n```bash\nbash download.sh val_link.list\n```\n---\n|category | Number of data | list file |\n| :------: | :----------: | :----: |\n|Training set | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)|\n|Validation set | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)|\n- After downloading, unzip and add the data path to list file.\n- Due to the failure of some video link, part of original data is missing. This copies need about 135G of storage space.\n> This copies is only used for academic research. If it is helpful to you, welcome to star [our project](https://github.com/PaddlePaddle/PaddleVideo)\n## Frames\nIn order to speed up the training process of the network, we first extract frames from the video file (K4" + }, + { + "comment": "This code explains how to extract frames from the K400 video file in mp4 format using the \"extract_rawframes.py\" script, and provides the command to execute it with specified paths for videos and raw frames folders, along with level and ext parameters. The extracted frames will be stored in the ./rawframes path, occupying around 2TB of space. The code also mentions two data categories - training set (234619 files) and validation set (19761 files), along with their respective list file links for easy reference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/k400.md\":64-77", + "content": "00 video file is in mp4 format). Compared with the method of network training directly through video files, the method of frames can greatly accelerate the speed of network training\u3002\nEnter the following command to extract the frames of the K400 video file\n```python\npython extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4\n```\nAfter the video file frames are extracted, they will be stored in the specified `./rawframes` path, and the size is about 2T.\n|category | Number of data | list file |\n| :------: | :----------: | :----: |\n|Training set | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)|\n|Validation set | 19761 | [val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)|" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d4cf6c01-e76c-4b45-bddb-e8335224bdb4.json b/docs/doc/d4cf6c01-e76c-4b45-bddb-e8335224bdb4.json new file mode 100644 index 000000000..88e6f2d06 --- /dev/null +++ b/docs/doc/d4cf6c01-e76c-4b45-bddb-e8335224bdb4.json @@ -0,0 +1,50 @@ +{ + "summary": "The FeatureReader class, a DataReader subclass, reads video features using LSTM, attention cluster, and NextVlad models for YouTube-8M dataset. It handles multimodal data loading, exception handling, label manipulation, soft labels generation, and batch input feature creation. A function loads words and their indices from a file into a dictionary.", + "details": [ + { + "comment": "FeatureReader class is a subclass of DataReader, which reads video features from files using Pickle. 
It uses pandas and includes ExtractEmbeddingReader to read Ernie tasks and provides data reader functions for train/test splits.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":0-38", + "content": "\"\"\"\nfeature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\n from io import BytesIO\nimport numpy as np\nimport random\nimport os\nimport traceback\nimport pickle\npython_ver = sys.version_info\nfrom collections import defaultdict\nimport pandas as pd\nfrom .ernie_task_reader import ExtractEmbeddingReader\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):" + }, + { + "comment": "The code initializes a data reader for YouTube-8M dataset, which contains features extracted by prior networks. It supports three models: LSTM, attention cluster, and NextVlad. The constructor takes the name, mode (train or test), and configuration parameters as inputs. It sets the batch size, file list, eigen_file (for NextVlad only), number of segments (num_seg), loss type, and initializes an ExtractEmbeddingReader using a vocab.txt file and maximum sequence length (text_max_len).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":39-66", + "content": " \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad\n dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg):\n \"\"\"\n init\n \"\"\"\n self.name = name\n self.mode = mode\n self.num_classes = cfg.MODEL.num_classes\n # set batch size and file list\n self.batch_size = cfg[mode.upper()]['batch_size']\n self.filelist = cfg[mode.upper()]['filelist']\n self.eigen_file = cfg.MODEL.get('eigen_file', None)\n self.num_seg = cfg.MODEL.get('num_seg', None)\n self.loss_type = cfg.TRAIN['loss_type']\n vocab_file = os.path.join(cfg.TRAIN.ernie_pretrain_dict_path,\n 'vocab.txt')\n self.ernie_reader = ExtractEmbeddingReader(\n vocab_path=vocab_file,\n max_seq_len=cfg.MODEL.text_max_len," + }, + { + "comment": "The code loads a class dictionary and a video file information based on the given configuration. It then creates a reader function that iterates through the URLs, checks if a file exists for each URL, and skips if it doesn't. 
If the file exists, it loads the data (pickle format) using the appropriate pickle version for Python < 3.0 or >= 3.0.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":67-94", + "content": " do_lower_case=True)\n url_title_label_file = cfg[mode.upper()]['url_title_label_file']\n self.class_dict = load_class_file(cfg.MODEL.class_name_file)\n self.url_title_info = load_video_file(url_title_label_file,\n self.class_dict, mode)\n def create_reader(self):\n \"\"\"\n create reader\n \"\"\"\n url_list = list(self.url_title_info.keys())\n if self.mode == 'train':\n random.shuffle(url_list)\n def reader():\n \"\"\"reader\n \"\"\"\n batch_out = []\n for url in url_list:\n try:\n filepath = os.path.join(\n self.filelist,\n url.split('/')[-1].split('.')[0] + '.pkl')\n if os.path.exists(filepath) is False:\n continue\n if python_ver < (3, 0):\n record = pickle.load(open(filepath, 'rb'))\n else:" + }, + { + "comment": "This code reads data from a file, prepares and processes it into various formats. It first loads the record from a file with pickle, then extracts text, RGB image data, and audio data (defaulting to zeroes if no audio is present). The code also generates one-hot encoding for the text using the ernie_reader. It obtains the video data and depending on the mode, assigns labels either as one-hot or softmax based on the loss type specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":95-112", + "content": " record = pickle.load(open(filepath, 'rb'),\n encoding='iso-8859-1')\n text_raw = self.url_title_info[url]['title']\n rgb = record['feature']['image_pkl'].astype(float)\n if record['feature']['audio_pkl'].shape[0] == 0:\n audio_pkl = np.zeros((10, 128))\n audio = audio_pkl.astype(float)\n else:\n audio = record['feature']['audio_pkl'].astype(float)\n text_one_hot = self.ernie_reader.data_generate_from_text(\n str(text_raw))\n video = record['video']\n if self.mode != 'infer':\n label = self.url_title_info[url]['label']\n label = [int(w) for w in label]\n if self.loss_type == 'sigmoid':\n label = make_one_hot(label, self.num_classes)\n elif self.loss_type == 'softmax':" + }, + { + "comment": "This code is part of a data reader for multimodal video tagging. It reads in RGB images, audio clips, and text one-hot vectors, then appends them to a batch. If a label is available, it converts the label to a softmax output; otherwise, it yields the video itself. The code handles exceptions during data loading and allows for inferencing. 
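The batching behaviour described here (skip missing feature files, tolerate corrupt pickles, emit a final short batch) can be captured in a compact generator. The sketch below is illustrative only; `make_reader`, the placeholder URL, and the feature directory are assumptions, not the FeatureReader implementation.

```python
import pickle
import random
from pathlib import Path

def make_reader(urls, feature_dir, batch_size=4, shuffle=True):
    """Return a generator function yielding lists of up to `batch_size` records."""
    def reader():
        order = list(urls)
        if shuffle:
            random.shuffle(order)
        batch = []
        for url in order:
            path = Path(feature_dir) / (Path(url).stem + '.pkl')
            if not path.exists():
                continue
            try:
                with open(path, 'rb') as f:
                    record = pickle.load(f, encoding='iso-8859-1')
            except Exception as e:
                print(f'warning: load {path} failed: {e}')
                continue
            batch.append(record)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:                     # flush the final, possibly short, batch
            yield batch
    return reader

r = make_reader(['http://example.com/videos/a.mp4'], feature_dir='/tmp/features')
print(sum(1 for _ in r()))            # 0 batches when no feature files exist
```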
Configuration values are retrieved using get_config_from_sec function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":113-139", + "content": " label = make_one_soft_hot(label, self.num_classes,\n False)\n batch_out.append((rgb, audio, text_one_hot, label))\n else:\n batch_out.append((rgb, audio, text_one_hot, video))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n print(\"warning: load data {} failed, {}\".format(\n filepath, str(e)))\n traceback.print_exc()\n continue\n# if self.mode == 'infer' and len(batch_out) > 0:\n if len(batch_out) > 0:\n yield batch_out\n return reader\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)" + }, + { + "comment": "This code defines a function load_video_file() that reads a label file in tab-separated format and stores the URLs, titles, and labels into a dictionary called url_info_dict. It also contains another function dequantize(), but this one is not used in the current code block. The load_video_file() function checks for NA values and splits the labels by comma before processing. If 'mode' is set to 'infer', it only stores title information; otherwise, it processes the labels. Finally, it prints the number of processed videos and returns the url_info_dict dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":142-172", + "content": "def load_video_file(label_file, class_dict, mode='train'):\n \"\"\"\n labelfile formate: URL \\t title \\t label1,label2\n return dict\n \"\"\"\n data = pd.read_csv(label_file, sep='\\t', header=None)\n url_info_dict = defaultdict(dict)\n for index, row in data.iterrows():\n url = row[0]\n if url in url_info_dict:\n continue\n if pd.isna(row[1]):\n title = \"\"\n else:\n title = str(row[1])\n if mode == 'infer':\n url_info_dict[url] = {'title': title}\n else:\n if pd.isna(row[2]):\n continue\n labels = row[2].split(',')\n labels_idx = [class_dict[w] for w in labels if w in class_dict]\n if len(labels_idx) < 1:\n continue\n if url not in url_info_dict:\n url_info_dict[url] = {'label': labels_idx, 'title': title}\n print('load video %d' % (len(url_info_dict)))\n return url_info_dict\ndef dequantize(feat_vector, max_quantized_value=2., min_quantized_value=-2.):" + }, + { + "comment": "This code contains a series of functions for handling and manipulating label data. The 'feature_reader' function dequantizes feature values, while the 'label_smmoth' function modifies a one-hot label vector by replacing zeros with a specific smoothness value. 
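For multi-label targets, the soft one-hot vector spreads a small epsilon uniformly over all classes and divides the remaining probability mass among the positive labels. The helper below is a simplified illustrative variant of this idea, not a drop-in replacement for the quoted `make_one_soft_hot`:

```python
import numpy as np

def soft_multi_hot(labels, dim, epsilon=0.1):
    """Spread `epsilon` uniformly over all classes, then share the remaining
    probability mass equally among the positive labels."""
    target = np.full(dim, epsilon / dim, dtype='float64')
    remaining = 1.0 - target.sum()
    for idx in labels:
        target[idx] += remaining / len(labels)
    return target

t = soft_multi_hot([3, 7], dim=10)
print(t.round(3), t.sum())   # positives get ~0.46 each, vector sums to 1.0
```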
The 'make_one_soft_hot' function creates a one-hot soft label based on the input label and applies label smoothing if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":173-211", + "content": " \"\"\"\n Dequantize the feature from the byte format to the float format\n \"\"\"\n assert max_quantized_value > min_quantized_value\n quantized_range = max_quantized_value - min_quantized_value\n scalar = quantized_range / 255.0\n bias = (quantized_range / 512.0) + min_quantized_value\n return feat_vector * scalar + bias\nepsilon = 0.1\nsmmoth_score = (1.0 / float(210)) * epsilon\ndef label_smmoth(label_one_hot_vector):\n \"\"\"\n label_smmoth\n \"\"\"\n global smmoth_score\n for i in range(len(label_one_hot_vector)):\n if label_one_hot_vector[i] == 0:\n label_one_hot_vector[i] = smmoth_score\n return label_one_hot_vector\ndef make_one_soft_hot(label, dim=15, label_smmoth=False):\n \"\"\"\n make_one_soft_hot\n \"\"\"\n one_hot_soft_label = np.zeros(dim)\n one_hot_soft_label = one_hot_soft_label.astype(float)\n # multi-labelis\n # label smmoth\n if label_smmoth:\n one_hot_soft_label = label_smmoth(one_hot_soft_label)\n label_len = len(label)\n prob = (1 - np.sum(one_hot_soft_label)) / float(label_len)" + }, + { + "comment": "This code defines several functions for generating one-hot labels, creating random indices, and getting batch input features for a specific application. It uses numpy arrays for efficient operations and handling multidimensional data. The functions can be used in the context of multimodal video analysis, where labels, text inputs, and other data are processed for further processing or model training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":212-250", + "content": " for ind in label:\n one_hot_soft_label[ind] += prob\n #one_hot_soft_label = label_smmoth(one_hot_soft_label)\n return one_hot_soft_label\ndef make_one_hot(label, dim=15):\n \"\"\"\n make_one_hot\n \"\"\"\n one_hot_soft_label = np.zeros(dim)\n one_hot_soft_label = one_hot_soft_label.astype(float)\n for ind in label:\n one_hot_soft_label[ind] = 1\n return one_hot_soft_label\ndef generate_random_idx(feature_len, num_seg):\n \"\"\"\n generate_random_idx\n \"\"\"\n idxs = []\n stride = float(feature_len) / num_seg\n for i in range(num_seg):\n pos = (i + np.random.random()) * stride\n idxs.append(min(feature_len - 1, int(pos)))\n return idxs\ndef get_batch_ernie_input_feature(reader, texts):\n \"\"\"\n get_batch_ernie_input_feature\n \"\"\"\n result_list = reader.data_generate_from_texts(texts)\n result_trans = []\n for i in range(len(texts)):\n result_trans.append([result_list[0][i],\\\n result_list[1][i],\n result_list[2][i]," + }, + { + "comment": "The code contains a function that loads and returns a dictionary containing words and their corresponding indices from a class file. The function reads the lines of the file, removes any leading or trailing whitespace, splits the line based on tabs, assigns the first element as the word and the second element as the index (if available), then adds these key-value pairs to a dictionary. 
This dictionary is returned as the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py\":251-273", + "content": " result_list[3][i],\n result_list[4][i]])\n return np.array(result_trans)\ndef load_class_file(class_file):\n \"\"\"\n load_class_file\n \"\"\"\n class_lines = open(class_file, 'r', encoding='utf8').readlines()\n class_dict = {}\n for i, line in enumerate(class_lines):\n tmp = line.strip().split('\\t')\n word = tmp[0]\n index = str(i)\n if len(tmp) == 2:\n index = tmp[1]\n class_dict[word] = index\n return class_dict\nif __name__ == '__main__':\n pass" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d4d32b73-4eb8-4bfd-9b8b-cfea23010106.json b/docs/doc/d4d32b73-4eb8-4bfd-9b8b-cfea23010106.json new file mode 100644 index 000000000..689effb9f --- /dev/null +++ b/docs/doc/d4d32b73-4eb8-4bfd-9b8b-cfea23010106.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a 3D Recognizer model and framework in PaddleVideo, with classes and methods for training, validation, and testing. It includes two methods, \"test_step\" and \"infer_step\", used for testing or inferring on limited data batches.", + "details": [ + { + "comment": "This code defines a 3D Recognizer model framework that takes in input images, casts them to float32 type and unsqueeze the first image for dimension alignment. The Recognizer3DMRI class inherits from BaseRecognizer and has a forward_net method for defining how the model should run from input to output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nfrom paddlevideo.utils import get_logger\nimport paddle\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer3DMRI(BaseRecognizer):\n \"\"\"3D Recognizer model framework.\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n imgs[0] = paddle.cast(imgs[0], \"float32\")\n imgs[1] = paddle.cast(imgs[1], \"float32\")\n imgs[0] = imgs[0].unsqueeze(1)" + }, + { + "comment": "This code defines a recognizer3dMRI model in the PaddleVideo framework. It has three methods: train_step, val_step, and test_step for training, validating, and testing the model, respectively. 
In each step, it processes image data batches, calls the forward function to generate class scores using a forward_net, applies sigmoid activation, and calculates losses using the head's loss function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py\":31-64", + "content": " imgs[1] = imgs[1].unsqueeze(1)\n feature = self.backbone(imgs)\n cls_score = self.head(feature)\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score, labels, if_top5=False)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n cls_score = paddle.nn.functional.sigmoid(cls_score)\n loss_metrics = self.head.loss(cls_score,\n labels,\n valid_mode=True,\n if_top5=False)\n return loss_metrics\n def test_step(self, data_batch):" + }, + { + "comment": "This code defines two methods, \"test_step\" and \"infer_step\", which both take a data batch as input and return the class score after calling the forward function in the forward_net object. These steps seem to be used for testing or inferring on a limited subset of the data batch (the first two images).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py\":65-80", + "content": " \"\"\"Test step.\n \"\"\"\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d5186a9d-dd54-42b0-9435-930938996858.json b/docs/doc/d5186a9d-dd54-42b0-9435-930938996858.json new file mode 100644 index 000000000..57a50c0e5 --- /dev/null +++ b/docs/doc/d5186a9d-dd54-42b0-9435-930938996858.json @@ -0,0 +1,60 @@ +{ + "summary": "This function creates mask matrices and BMN class in Paddle.ai, initializes 2D convolutional layers for the BMSN backbone, and defines a video analysis model with layers, activation functions, and returns processed input xp.", + "details": [ + { + "comment": "This function generates a sample mask for a boundary-matching pair. It calculates the number of samples per bin and total samples based on segment bounds, total length, and desired numbers of samples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":0-27", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport numpy as np\nimport paddle\nfrom paddle import ParamAttr\nfrom ..registry import BACKBONES\ndef _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,\n num_sample_perbin):\n \"\"\" generate sample mask for a boundary-matching pair \"\"\"\n plen = float(seg_xmax - seg_xmin)\n plen_sample = plen / (num_sample * num_sample_perbin - 1.0)\n total_samples = [\n seg_xmin + plen_sample * ii" + }, + { + "comment": "This code generates sample masks for each point in a Boundary-Matching Map. It iterates through samples, creates binary vectors for each, and then scales them to obtain the final mask. The resulting masks are stored in an array and returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":28-52", + "content": " for ii in range(num_sample * num_sample_perbin)\n ]\n p_mask = []\n for idx in range(num_sample):\n bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *\n num_sample_perbin]\n bin_vector = np.zeros([tscale])\n for sample in bin_samples:\n sample_upper = math.ceil(sample)\n sample_decimal, sample_down = math.modf(sample)\n if (tscale - 1) >= int(sample_down) >= 0:\n bin_vector[int(sample_down)] += 1 - sample_decimal\n if (tscale - 1) >= int(sample_upper) >= 0:\n bin_vector[int(sample_upper)] += sample_decimal\n bin_vector = 1.0 / num_sample_perbin * bin_vector\n p_mask.append(bin_vector)\n p_mask = np.stack(p_mask, axis=1)\n return p_mask\ndef get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,\n num_sample_perbin):\n \"\"\" generate sample mask for each point in Boundary-Matching Map \"\"\"\n mask_mat = []\n for start_index in range(tscale):" + }, + { + "comment": "This code generates mask matrices for video frames. It iterates over different duration scales and starts from a given start index. For each duration scale, it creates binary masks using interpolation. If the duration is smaller than the total time scale, it adjusts the sample range to include boundaries. Zero paddings are used if the duration exceeds the total time scale. 
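The sample mask is built by linear interpolation: each fractional sample position contributes `1 - frac` to the time step below it and `frac` to the one above, and every bin is averaged over its samples. A compact NumPy rewrite of the quoted helper, with small toy parameters in the demo call:

```python
import math
import numpy as np

def interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample, num_sample_perbin):
    """Per-bin interpolation weights over `tscale` time steps."""
    plen = float(seg_xmax - seg_xmin)
    step = plen / (num_sample * num_sample_perbin - 1.0)
    samples = [seg_xmin + step * i for i in range(num_sample * num_sample_perbin)]
    cols = []
    for b in range(num_sample):
        vec = np.zeros(tscale)
        for s in samples[b * num_sample_perbin:(b + 1) * num_sample_perbin]:
            frac, low_f = math.modf(s)
            low, high = int(low_f), int(math.ceil(s))
            if 0 <= low <= tscale - 1:
                vec[low] += 1.0 - frac
            if 0 <= high <= tscale - 1:
                vec[high] += frac
        cols.append(vec / num_sample_perbin)
    return np.stack(cols, axis=1)                 # shape [tscale, num_sample]

m = interp1d_bin_mask(2.0, 6.0, tscale=10, num_sample=4, num_sample_perbin=3)
print(m.shape, m.sum(axis=0))                     # (10, 4), each column sums to 1
```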
The generated mask vectors are stacked together and reshaped for final output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":53-76", + "content": " mask_mat_vector = []\n for duration_index in range(dscale):\n if start_index + duration_index < tscale:\n p_xmin = start_index\n p_xmax = start_index + duration_index\n center_len = float(p_xmax - p_xmin) + 1\n sample_xmin = p_xmin - center_len * prop_boundary_ratio\n sample_xmax = p_xmax + center_len * prop_boundary_ratio\n p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,\n tscale, num_sample,\n num_sample_perbin)\n else:\n p_mask = np.zeros([tscale, num_sample])\n mask_mat_vector.append(p_mask)\n mask_mat_vector = np.stack(mask_mat_vector, axis=2)\n mask_mat.append(mask_mat_vector)\n mask_mat = np.stack(mask_mat, axis=3)\n mask_mat = mask_mat.astype(np.float32)\n sample_mask = np.reshape(mask_mat, [tscale, -1])\n return sample_mask\ndef init_params(name, in_channels, kernel_size):" + }, + { + "comment": "This code defines a BMN class as a Paddle.ai layer implementing the BMN model for temporal action proposal generation from the paper \"BMN: Boundary-Matching Network for Temporal Action Proposal Generation\". It has parameters tscale, dscale, prop_boundary_ratio, num_sample, and num_sample_perbin which determine the sequence length, max duration length, ratio of expanded temporal region in proposal boundary, number of samples between starting and ending boundaries of each proposal, and number of selected points in each sample respectively. The code also initializes a ParamAttr with Uniform initializer for weight initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":77-102", + "content": " fan_in = in_channels * kernel_size * 1\n k = 1. / math.sqrt(fan_in)\n param_attr = ParamAttr(name=name,\n initializer=paddle.nn.initializer.Uniform(low=-k,\n high=k))\n return param_attr\n@BACKBONES.register()\nclass BMN(paddle.nn.Layer):\n \"\"\"BMN model from\n `\"BMN: Boundary-Matching Network for Temporal Action Proposal Generation\" `_\n Args:\n tscale (int): sequence length, default 100.\n dscale (int): max duration length, default 100.\n prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5.\n num_sample (int): number of samples betweent starting boundary and ending boundary of each propoasl, default 32.\n num_sample_perbin (int): number of selected points in each sample, default 3.\n \"\"\"\n def __init__(\n self,\n tscale,\n dscale,\n prop_boundary_ratio,\n num_sample," + }, + { + "comment": "This code defines the BMN class, which is a backbone model. It initializes parameters and includes convolutional layers with ReLU activation functions for feature extraction. 
The code also includes instance variables for controlling the model's behavior and dimensionality of the hidden states.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":103-136", + "content": " num_sample_perbin,\n feat_dim=400,\n ):\n super(BMN, self).__init__()\n #init config\n self.feat_dim = feat_dim\n self.tscale = tscale\n self.dscale = dscale\n self.prop_boundary_ratio = prop_boundary_ratio\n self.num_sample = num_sample\n self.num_sample_perbin = num_sample_perbin\n self.hidden_dim_1d = 256\n self.hidden_dim_2d = 128\n self.hidden_dim_3d = 512\n # Base Module\n self.b_conv1 = paddle.nn.Conv1D(\n in_channels=self.feat_dim,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4,\n weight_attr=init_params('Base_1_w', self.feat_dim, 3),\n bias_attr=init_params('Base_1_b', self.feat_dim, 3))\n self.b_conv1_act = paddle.nn.ReLU()\n self.b_conv2 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4," + }, + { + "comment": "This code defines a Conv1D block for the BMN model, including an input layer, a temporal evaluation module, and two convolutional layers with ReLU activation functions. The weights and biases are initialized using the 'init_params' function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":137-162", + "content": " weight_attr=init_params('Base_2_w', self.hidden_dim_1d, 3),\n bias_attr=init_params('Base_2_b', self.hidden_dim_1d, 3))\n self.b_conv2_act = paddle.nn.ReLU()\n # Temporal Evaluation Module\n self.ts_conv1 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4,\n weight_attr=init_params('TEM_s1_w', self.hidden_dim_1d, 3),\n bias_attr=init_params('TEM_s1_b', self.hidden_dim_1d, 3))\n self.ts_conv1_act = paddle.nn.ReLU()\n self.ts_conv2 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=1,\n kernel_size=1,\n padding=0,\n groups=1,\n weight_attr=init_params('TEM_s2_w', self.hidden_dim_1d, 1),\n bias_attr=init_params('TEM_s2_b', self.hidden_dim_1d, 1))\n self.ts_conv2_act = paddle.nn.Sigmoid()\n self.te_conv1 = paddle.nn.Conv1D(" + }, + { + "comment": "This code initializes the TEM and PEM modules of a backbone network. It defines several convolutional layers with specific configurations for each module, followed by activation functions. 
The weight and bias attributes are initialized using the init_params function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":163-188", + "content": " in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_1d,\n kernel_size=3,\n padding=1,\n groups=4,\n weight_attr=init_params('TEM_e1_w', self.hidden_dim_1d, 3),\n bias_attr=init_params('TEM_e1_b', self.hidden_dim_1d, 3))\n self.te_conv1_act = paddle.nn.ReLU()\n self.te_conv2 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=1,\n kernel_size=1,\n padding=0,\n groups=1,\n weight_attr=init_params('TEM_e2_w', self.hidden_dim_1d, 1),\n bias_attr=init_params('TEM_e2_b', self.hidden_dim_1d, 1))\n self.te_conv2_act = paddle.nn.Sigmoid()\n #Proposal Evaluation Module\n self.p_conv1 = paddle.nn.Conv1D(\n in_channels=self.hidden_dim_1d,\n out_channels=self.hidden_dim_2d,\n kernel_size=3,\n padding=1,\n groups=1,\n weight_attr=init_params('PEM_1d_w', self.hidden_dim_1d, 3)," + }, + { + "comment": "This code initializes a backbone model for the BMN architecture. It includes convolutional layers, ReLU activations, and a tensor mask for sampling. The model uses 1D, 2D, and 3D convolutions with specific parameters, as well as applies bias attributes to the weights and biases of the convolutions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":189-214", + "content": " bias_attr=init_params('PEM_1d_b', self.hidden_dim_1d, 3))\n self.p_conv1_act = paddle.nn.ReLU()\n # init to speed up\n sample_mask = get_interp1d_mask(self.tscale, self.dscale,\n self.prop_boundary_ratio,\n self.num_sample, self.num_sample_perbin)\n self.sample_mask = paddle.to_tensor(sample_mask)\n self.sample_mask.stop_gradient = True\n self.p_conv3d1 = paddle.nn.Conv3D(\n in_channels=128,\n out_channels=self.hidden_dim_3d,\n kernel_size=(self.num_sample, 1, 1),\n stride=(self.num_sample, 1, 1),\n padding=0,\n weight_attr=ParamAttr(name=\"PEM_3d1_w\"),\n bias_attr=ParamAttr(name=\"PEM_3d1_b\"))\n self.p_conv3d1_act = paddle.nn.ReLU()\n self.p_conv2d1 = paddle.nn.Conv2D(\n in_channels=512,\n out_channels=self.hidden_dim_2d,\n kernel_size=1,\n stride=1,\n padding=0," + }, + { + "comment": "This code initializes a series of 2D convolutional layers with ReLU activation functions for the Batch Multi-Scale Network (BMSN) backbone in PaddleVideo. Each convolutional layer has a specified number of output channels, kernel size, and stride. 
The weights and biases for each layer are defined using ParamAttr.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":215-245", + "content": " weight_attr=ParamAttr(name=\"PEM_2d1_w\"),\n bias_attr=ParamAttr(name=\"PEM_2d1_b\"))\n self.p_conv2d1_act = paddle.nn.ReLU()\n self.p_conv2d2 = paddle.nn.Conv2D(\n in_channels=128,\n out_channels=self.hidden_dim_2d,\n kernel_size=3,\n stride=1,\n padding=1,\n weight_attr=ParamAttr(name=\"PEM_2d2_w\"),\n bias_attr=ParamAttr(name=\"PEM_2d2_b\"))\n self.p_conv2d2_act = paddle.nn.ReLU()\n self.p_conv2d3 = paddle.nn.Conv2D(\n in_channels=128,\n out_channels=self.hidden_dim_2d,\n kernel_size=3,\n stride=1,\n padding=1,\n weight_attr=ParamAttr(name=\"PEM_2d3_w\"),\n bias_attr=ParamAttr(name=\"PEM_2d3_b\"))\n self.p_conv2d3_act = paddle.nn.ReLU()\n self.p_conv2d4 = paddle.nn.Conv2D(\n in_channels=128,\n out_channels=2,\n kernel_size=1,\n stride=1,\n padding=0,\n weight_attr=ParamAttr(name=\"PEM_2d4_w\")," + }, + { + "comment": "The code is defining a backbone model for video analysis. It consists of base, TEM (temporal-inspired module), PEM (position-inspired module), and BM (block-matching module) layers. The layers are sequentially applied to the input data with appropriate activation functions and reshaping operations in between. Finally, it performs matrix multiplication with a sample mask and applies additional convolutions and activations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":246-282", + "content": " bias_attr=ParamAttr(name=\"PEM_2d4_b\"))\n self.p_conv2d4_act = paddle.nn.Sigmoid()\n def init_weights(self):\n pass\n def forward(self, x):\n #Base Module\n x = self.b_conv1(x)\n x = self.b_conv1_act(x)\n x = self.b_conv2(x)\n x = self.b_conv2_act(x)\n #TEM\n xs = self.ts_conv1(x)\n xs = self.ts_conv1_act(xs)\n xs = self.ts_conv2(xs)\n xs = self.ts_conv2_act(xs)\n xs = paddle.squeeze(xs, axis=[1])\n xe = self.te_conv1(x)\n xe = self.te_conv1_act(xe)\n xe = self.te_conv2(xe)\n xe = self.te_conv2_act(xe)\n xe = paddle.squeeze(xe, axis=[1])\n #PEM\n xp = self.p_conv1(x)\n xp = self.p_conv1_act(xp)\n #BM layer\n xp = paddle.matmul(xp, self.sample_mask)\n xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale])\n xp = self.p_conv3d1(xp)\n xp = self.p_conv3d1_act(xp)\n xp = paddle.squeeze(xp, axis=[2])\n xp = self.p_conv2d1(xp)\n xp = self.p_conv2d1_act(xp)" + }, + { + "comment": "This code is part of a neural network backbone model. 
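The BM layer itself is just a matrix product with the precomputed sample mask followed by a reshape. The shape walk-through below uses small toy dimensions (the quoted BMN uses C=128, T=D=100, S=32) and a random tensor standing in for `get_interp1d_mask`:

```python
import paddle

N, C, T, D, S = 2, 16, 20, 20, 8
feat = paddle.randn([N, C, T])                      # output of p_conv1
sample_mask = paddle.randn([T, S * D * T])          # stand-in for the real mask

bm_feat = paddle.matmul(feat, sample_mask)          # [N, C, S*D*T]
bm_feat = paddle.reshape(bm_feat, [N, C, S, D, T])  # boundary-matching feature map
print(bm_feat.shape)                                # [2, 16, 8, 20, 20]
```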
It applies multiple convolution layers with activation functions and returns the processed input xp, along with other variables xs and xe.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/bmn.py\":283-289", + "content": " xp = self.p_conv2d2(xp)\n xp = self.p_conv2d2_act(xp)\n xp = self.p_conv2d3(xp)\n xp = self.p_conv2d3_act(xp)\n xp = self.p_conv2d4(xp)\n xp = self.p_conv2d4_act(xp)\n return xp, xs, xe" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d51c241c-3bc0-4d07-b366-bf5c685b32cf.json b/docs/doc/d51c241c-3bc0-4d07-b366-bf5c685b32cf.json new file mode 100644 index 000000000..a2341db65 --- /dev/null +++ b/docs/doc/d51c241c-3bc0-4d07-b366-bf5c685b32cf.json @@ -0,0 +1,10 @@ +{ + "summary": "The code imports a Registry class from the utils module and initializes the METRIC as an instance of this Registry, designed to store and manage different types of metrics.", + "details": [ + { + "comment": "The code imports a Registry class from the utils module and initializes the METRIC as an instance of this Registry, designed to store and manage different types of metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py\":0-16", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nMETRIC = Registry('metric')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d55dded7-4589-4a37-8eec-5c58124315a8.json b/docs/doc/d55dded7-4589-4a37-8eec-5c58124315a8.json new file mode 100644 index 000000000..7618cd4eb --- /dev/null +++ b/docs/doc/d55dded7-4589-4a37-8eec-5c58124315a8.json @@ -0,0 +1,35 @@ +{ + "summary": "The Python module supports PaddleVideo's VideoTag app, includes a model class for subclassing with base methods, and handles weights, dataloader, pre-trained models, weight file paths, and downloads. It also provides a ModelZoo class for managing models and functions to get/register models.", + "details": [ + { + "comment": "This code is a Python module for the PaddleVideo project's VideoTag application. It imports necessary libraries, sets the storage location for weights, and defines functions for parameter checking and handling exceptions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/model.py\":0-35", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport wget\nimport logging\nimport paddle\nimport paddle.static as static\ntry:\n from configparser import ConfigParser\nexcept:\n from ConfigParser import ConfigParser\nfrom .utils import download, AttrDict\nWEIGHT_DIR = os.path.join(os.path.expanduser('~'), '.paddle', 'weights')\nlogger = logging.getLogger(__name__)\ndef is_parameter(var):\n return isinstance(var, paddle.framework.Parameter)\nclass NotImplementError(Exception):" + }, + { + "comment": "This code defines two custom exceptions, \"NotImplementError\" and \"ModelNotFoundError\", to handle specific model-related issues. The \"ModelBase\" class serves as a base for creating different models with different modes (train, valid, test, infer). The code also checks if the mode input is valid.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/model.py\":36-68", + "content": " \"Error: model function not implement\"\n def __init__(self, model, function):\n super(NotImplementError, self).__init__()\n self.model = model.__class__.__name__\n self.function = function.__name__\n def __str__(self):\n return \"Function {}() is not implemented in model {}\".format(\n self.function, self.model)\nclass ModelNotFoundError(Exception):\n \"Error: model not found\"\n def __init__(self, model_name, avail_models):\n super(ModelNotFoundError, self).__init__()\n self.model_name = model_name\n self.avail_models = avail_models\n def __str__(self):\n msg = \"Model {} Not Found.\\nAvailiable models:\\n\".format(\n self.model_name)\n for model in self.avail_models:\n msg += \" {}\\n\".format(model)\n return msg\nclass ModelBase(object):\n def __init__(self, name, cfg, mode='train'):\n assert mode in ['train', 'valid', 'test', 'infer'], \\\n \"Unknown mode type {}\".format(mode)\n self.name = name" + }, + { + "comment": "The code is a model class that requires subclassing for implementation. It defines various methods such as build_model, build_input, optimizer, outputs, loss, feeds, and fetches. However, the actual implementation of these methods should be provided in the subclass since they are all raising NotImplementedError. 
The weights_info method returns model weight default path and download URL.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/model.py\":69-104", + "content": " self.is_training = (mode == 'train')\n self.mode = mode\n self.cfg = cfg\n self.dataloader = None\n def build_model(self):\n \"build model struct\"\n raise NotImplementError(self, self.build_model)\n def build_input(self, use_dataloader):\n \"build input Variable\"\n raise NotImplementError(self, self.build_input)\n def optimizer(self):\n \"get model optimizer\"\n raise NotImplementError(self, self.optimizer)\n def outputs(self):\n \"get output variable\"\n raise NotImplementError(self, self.outputs)\n def loss(self):\n \"get loss variable\"\n raise NotImplementError(self, self.loss)\n def feeds(self):\n \"get feed inputs list\"\n raise NotImplementError(self, self.feeds)\n def fetches(self):\n \"get fetch list of model\"\n raise NotImplementError(self, self.fetches)\n def weights_info(self):\n \"get model weight default path and download url\"\n raise NotImplementError(self, self.weights_info)" + }, + { + "comment": "This code defines several methods for a model class. The `get_weights` method returns the weight file path, downloading it from Paddle if it doesn't exist. The `dataloader` method returns the dataloader object. The `epoch_num` method returns the train epoch number. The `pretrain_info` method returns the pre-trained base model directory. The `get_pretrain_weights` method returns the weight file path, downloading it from Paddle if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/model.py\":106-138", + "content": " def get_weights(self):\n \"get model weight file path, download weight from Paddle if not exist\"\n path, url = self.weights_info()\n path = os.path.join(WEIGHT_DIR, path)\n if not os.path.isdir(WEIGHT_DIR):\n logger.info('{} not exists, will be created automatically.'.format(\n WEIGHT_DIR))\n os.makedirs(WEIGHT_DIR)\n if os.path.exists(path):\n return path\n logger.info(\"Download weights of {} from {}\".format(self.name, url))\n wget.download(url, path)\n return path\n def dataloader(self):\n return self.dataloader\n def epoch_num(self):\n \"get train epoch num\"\n return self.cfg.TRAIN.epoch\n def pretrain_info(self):\n \"get pretrain base model directory\"\n return (None, None)\n def get_pretrain_weights(self):\n \"get model weight file path, download weight from Paddle if not exist\"\n path, url = self.pretrain_info()\n if not path:\n return None\n path = os.path.join(WEIGHT_DIR, path)" + }, + { + "comment": "The code includes functions for handling model weights. 
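The weight-fetching logic is a simple cache: create the weights directory if needed, reuse a file that is already present, otherwise download it. A minimal sketch, with `get_weights` and the `downloader` callback as illustrative placeholders rather than the VideoTag API:

```python
import os

WEIGHT_DIR = os.path.join(os.path.expanduser('~'), '.paddle', 'weights')

def get_weights(filename, url, downloader=None):
    """Reuse a cached weight file if present, otherwise download it."""
    path = os.path.join(WEIGHT_DIR, filename)
    os.makedirs(WEIGHT_DIR, exist_ok=True)      # create the cache dir if missing
    if os.path.exists(path):
        return path
    print(f'downloading {url} -> {path}')
    if downloader is not None:                  # e.g. wget.download(url, path)
        downloader(url, path)
    return path
```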
It checks if a directory exists, downloads pretrain weights if necessary, loads pretrain and test weights into programs, and retrieves configuration from a config file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/model.py\":139-166", + "content": " if not os.path.isdir(WEIGHT_DIR):\n logger.info('{} not exists, will be created automatically.'.format(\n WEIGHT_DIR))\n os.makedirs(WEIGHT_DIR)\n if os.path.exists(path):\n return path\n logger.info(\"Download pretrain weights of {} from {}\".format(\n self.name, url))\n download(url, path)\n return path\n def load_pretrain_params(self, exe, pretrain, prog):\n logger.info(\"Load pretrain weights from {}\".format(pretrain))\n state_dict = paddle.static.load_program_state(pretrain)\n paddle.static.set_program_state(prog, state_dict)\n def load_test_weights(self, exe, weights, prog):\n params_list = list(filter(is_parameter, prog.list_vars()))\n static.load(prog, weights, executor=exe, var_list=params_list)\n def get_config_from_sec(self, sec, item, default=None):\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ModelZoo(object):" + }, + { + "comment": "This code defines a ModelZoo class for managing different models, allowing registration and retrieval of models based on their names. The get() function returns the model instance with the specified name, while regist() registers new model classes to the ModelZoo. The get_model() and regist_model() functions provide convenient methods to interact with the singleton ModelZoo instance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/model.py\":167-191", + "content": " def __init__(self):\n self.model_zoo = {}\n def regist(self, name, model):\n assert model.__base__ == ModelBase, \"Unknow model type {}\".format(\n type(model))\n self.model_zoo[name] = model\n def get(self, name, cfg, mode='train', is_videotag=False):\n for k, v in self.model_zoo.items():\n if k.upper() == name.upper():\n return v(name, cfg, mode, is_videotag)\n raise ModelNotFoundError(name, self.model_zoo.keys())\n# singleton model_zoo\nmodel_zoo = ModelZoo()\ndef regist_model(name, model):\n model_zoo.regist(name, model)\ndef get_model(name, cfg, mode='train', is_videotag=False):\n return model_zoo.get(name, cfg, mode, is_videotag)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d5630c58-45b2-41fc-b189-8cc4a1c58f4f.json b/docs/doc/d5630c58-45b2-41fc-b189-8cc4a1c58f4f.json new file mode 100644 index 000000000..a4d164551 --- /dev/null +++ b/docs/doc/d5630c58-45b2-41fc-b189-8cc4a1c58f4f.json @@ -0,0 +1,75 @@ +{ + "summary": "The code contains functions for non-maximum suppression, tensor movement, and applying NMS to anchor boxes in images using PaddlePaddle. It transforms YOLOv2 output, generates ground truth targets, calculates IoU, counts instances, updates predictions, and returns masks and transformation parameters for translation, width, and height.", + "details": [ + { + "comment": "This code snippet defines a function `truths_length()` that returns the index of the first occurrence where the second element in 'truths' array is 0. It also defines a function `nms()` that applies Non-Maximum Suppression to filter out bounding boxes based on a given NMS threshold. 
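As a rough sketch of that greedy suppression (plain Python; the real code works on `[cx, cy, w, h, conf, ...]` boxes and zeroes the confidence of suppressed entries rather than building a new list):

```python
def greedy_nms(boxes, nms_thresh, iou_fn):
    """Greedy NMS sketch: boxes are [cx, cy, w, h, conf, ...] lists,
    iou_fn is any pairwise IoU function (bbox_iou with x1y1x2y2=False in the real code)."""
    boxes = sorted(boxes, key=lambda b: b[4], reverse=True)   # highest confidence first
    keep = []
    for box in boxes:
        # keep a box only if it does not overlap an already-kept, higher-confidence box
        if all(iou_fn(box, kept) <= nms_thresh for kept in keep):
            keep.append(box)
    return keep
```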
The code checks if there are any bounding boxes, assigns confidence scores, sorts them in descending order of confidence, and removes overlapping bounding boxes with IoU greater than the NMS threshold.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":0-35", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\nfrom builtins import range as xrange\ndef truths_length(truths):\n for i in range(50):\n if truths[i][1] == 0:\n return i\ndef nms(boxes, nms_thresh):\n if len(boxes) == 0:\n return boxes\n det_confs = paddle.zeros([len(boxes)])\n for i in range(len(boxes)):\n det_confs[i] = 1 - boxes[i][4]\n sortIds = paddle.argsort(det_confs)" + }, + { + "comment": "This code is defining three functions: \"out_boxes\" appears to perform non-maximum suppression on bounding boxes, \"convert2cpu\" converts a tensor from GPU memory to CPU memory as a float32 type, and \"convert2cpu_long\" performs the same operation but as an int64 type. The \"get_region_boxes\" function takes in output from a model and applies non-maximum suppression for each anchor box in the image, using provided anchors and thresholds. This function also reshapes the input to have shape (batch, num_anchors, 5 + num_classes). The code includes assertions to ensure proper input shapes are being used.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":36-66", + "content": " out_boxes = []\n for i in range(len(boxes)):\n box_i = boxes[sortIds[i]]\n if box_i[4] > 0:\n out_boxes.append(box_i)\n for j in range(i + 1, len(boxes)):\n box_j = boxes[sortIds[j]]\n if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh:\n box_j[4] = 0\n return out_boxes\ndef convert2cpu(gpu_matrix):\n float_32_g = gpu_matrix.astype('float32')\n return float_32_g.cpu()\ndef convert2cpu_long(gpu_matrix):\n int_64_g = gpu_matrix.astype('int64')\n return int_64_g.cpu()\ndef get_region_boxes(output, conf_thresh=0.005, num_classes=24,\n anchors=[0.70458, 1.18803, 1.26654, 2.55121, 1.59382,\n 4.08321, 2.30548, 4.94180, 3.52332, 5.91979],\n num_anchors=5, only_objectness=1, validation=False):\n anchor_step = len(anchors) // num_anchors\n if output.dim() == 3:\n output = output.unsqueeze(0)\n batch = output.shape[0]\n assert (output.shape[1] == (5 + num_classes) * num_anchors)" + }, + { + "comment": "This code performs box regression by reshaping the output tensor, creating grids for x and y coordinates, applying sigmoid function to the output, adding grid coordinates to get refined box coordinates. 
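The decoding being described follows the usual YOLO parameterisation; a minimal per-cell sketch (NumPy, grid-cell units, function and argument names chosen for illustration):

```python
import numpy as np

def decode_cell(t_x, t_y, t_w, t_h, grid_x, grid_y, anchor_w, anchor_h):
    """Per-cell YOLO-style decoding sketch, everything expressed in grid units."""
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    bx = sigmoid(t_x) + grid_x       # centre x: sigmoid offset inside the responsible cell
    by = sigmoid(t_y) + grid_y       # centre y
    bw = anchor_w * np.exp(t_w)      # width scales the anchor prior exponentially
    bh = anchor_h * np.exp(t_h)      # height scales the anchor prior
    return bx, by, bw, bh
```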
It also converts anchor widths into a tensor for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":67-93", + "content": " h = output.shape[2]\n w = output.shape[3]\n all_boxes = []\n output = paddle.reshape(\n output, [batch * num_anchors, 5 + num_classes, h * w])\n output = paddle.transpose(output, (1, 0, 2))\n output = paddle.reshape(\n output, [5 + num_classes, batch * num_anchors * h * w])\n grid_x = paddle.linspace(0, w - 1, w)\n grid_x = paddle.tile(grid_x, [h, 1])\n grid_x = paddle.tile(grid_x, [batch * num_anchors, 1, 1])\n grid_x = paddle.reshape(grid_x, [batch * num_anchors * h * w]).cuda()\n grid_y = paddle.linspace(0, h - 1, h)\n grid_y = paddle.tile(grid_y, [w, 1]).t()\n grid_y = paddle.tile(grid_y, [batch * num_anchors, 1, 1])\n grid_y = paddle.reshape(grid_y, [batch * num_anchors * h * w]).cuda()\n sigmoid = nn.Sigmoid()\n xs = sigmoid(output[0]) + grid_x\n ys = sigmoid(output[1]) + grid_y\n anchor_w = paddle.to_tensor(anchors)\n anchor_w = paddle.reshape(anchor_w, [num_anchors, anchor_step])\n anchor_w = paddle.index_select(anchor_w, index=paddle.to_tensor(\n np.array([0]).astype('int32')), axis=1)" + }, + { + "comment": "Code prepares output from a YOLOv2 object detection model, performing necessary reshaping and transformations to obtain the final detections and classifications. It computes the widths (ws) and heights (hs) of the bounding boxes based on the input feature maps, applies sigmoid activation to the fifth output channel for detection confidences, and converts the rest of the outputs to stop_gradient=True tensors for class predictions. The code then performs softmax normalization over class predictions and retrieves the maximum confidence and corresponding class IDs for each bounding box. Finally, it reshapes cls_max_confs to a 1D tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":95-121", + "content": " anchor_h = paddle.to_tensor(anchors)\n anchor_h = paddle.reshape(anchor_h, [num_anchors, anchor_step])\n anchor_h = paddle.index_select(anchor_h, index=paddle.to_tensor(\n np.array([1]).astype('int32')), axis=1)\n anchor_w = paddle.tile(anchor_w, [batch, 1])\n anchor_w = paddle.tile(anchor_w, [1, 1, h * w])\n anchor_w = paddle.reshape(anchor_w, [batch * num_anchors * h * w]).cuda()\n anchor_h = paddle.tile(anchor_h, [batch, 1])\n anchor_h = paddle.tile(anchor_h, [1, 1, h * w])\n anchor_h = paddle.reshape(anchor_h, [batch * num_anchors * h * w]).cuda()\n ws = paddle.exp(output[2]) * anchor_w\n hs = paddle.exp(output[3]) * anchor_h\n det_confs = sigmoid(output[4])\n cls_confs = paddle.to_tensor(output[5:5 + num_classes], stop_gradient=True)\n cls_confs = paddle.transpose(cls_confs, [1, 0])\n s = nn.Softmax()\n cls_confs = paddle.to_tensor(s(cls_confs))\n cls_max_confs = paddle.max(cls_confs, axis=1)\n cls_max_ids = paddle.argmax(cls_confs, axis=1)\n cls_max_confs = paddle.reshape(cls_max_confs, [-1])" + }, + { + "comment": "The code extracts data from a PaddlePaddle tensor and converts it to CPU memory. It then reshapes the data, applies conditions, and stores box coordinates and confidences in lists for each batch. 
The extracted data is used to create bounding boxes for objects detected within the input image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":122-152", + "content": " cls_max_ids = paddle.reshape(cls_max_ids, [-1])\n sz_hw = h * w\n sz_hwa = sz_hw * num_anchors\n det_confs = convert2cpu(det_confs)\n cls_max_confs = convert2cpu(cls_max_confs)\n cls_max_ids = convert2cpu_long(cls_max_ids)\n xs = convert2cpu(xs)\n ys = convert2cpu(ys)\n ws = convert2cpu(ws)\n hs = convert2cpu(hs)\n if validation:\n cls_confs = convert2cpu(cls_confs.reshape([-1, num_classes]))\n for b in range(batch):\n boxes = []\n for cy in range(h):\n for cx in range(w):\n for i in range(num_anchors):\n ind = b * sz_hwa + i * sz_hw + cy * w + cx\n det_conf = det_confs[ind]\n if only_objectness:\n conf = det_confs[ind]\n else:\n conf = det_confs[ind] * cls_max_confs[ind]\n if conf > conf_thresh:\n bcx = xs[ind]\n bcy = ys[ind]\n bw = ws[ind]\n bh = hs[ind]" + }, + { + "comment": "The function `yowo_utils.py` returns a list of boxes with their respective confidences and class ids for each box. It includes only objectness if only_objectness is True, otherwise it also includes the per-class confidences. The function `bbox_iou` calculates the intersection over union between two bounding boxes, considering x1y1x2y2 format where (x1, y1) is the top left corner and (x2, y2) is the bottom right corner.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":153-177", + "content": " cls_max_conf = cls_max_confs[ind]\n cls_max_id = cls_max_ids[ind]\n box = [bcx / w, bcy / h, bw / w, bh / h,\n det_conf, cls_max_conf, cls_max_id]\n if (not only_objectness) and validation:\n for c in range(num_classes):\n tmp_conf = cls_confs[ind][c]\n if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh:\n box.append(tmp_conf)\n box.append(c)\n boxes.append(box)\n all_boxes.append(boxes)\n return all_boxes\ndef bbox_iou(box1, box2, x1y1x2y2=True):\n if x1y1x2y2:\n mx = min(box1[0], box2[0])\n Mx = max(box1[2], box2[2])\n my = min(box1[1], box2[1])\n My = max(box1[3], box2[3])\n w1 = box1[2] - box1[0]\n h1 = box1[3] - box1[1]\n w2 = box2[2] - box2[0]\n h2 = box2[3] - box2[1]" + }, + { + "comment": "The code calculates the intersection-over-union (IOU) between two bounding boxes, which is commonly used in object detection tasks. It first finds the overlapping area by computing the minimum and maximum coordinates of the bounding boxes, then calculates the union of these boxes, and finally returns the intersection over union ratio. 
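An equivalent plain-Python formulation of that corner-format IoU (the quoted code reaches the same result via the width and height of the union extent):

```python
def iou_xyxy(box1, box2):
    """Corner-format IoU sketch: boxes are (x1, y1, x2, y2)."""
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])   # intersection top-left
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])   # intersection bottom-right
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter
    return inter / union if union > 0 else 0.0
```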
This helps in determining if the two bounding boxes represent the same object or not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":178-212", + "content": " else:\n mx = min(float(box1[0] - box1[2] / 2.0),\n float(box2[0] - box2[2] / 2.0))\n Mx = max(float(box1[0] + box1[2] / 2.0),\n float(box2[0] + box2[2] / 2.0))\n my = min(float(box1[1] - box1[3] / 2.0),\n float(box2[1] - box2[3] / 2.0))\n My = max(float(box1[1] + box1[3] / 2.0),\n float(box2[1] + box2[3] / 2.0))\n w1 = box1[2]\n h1 = box1[3]\n w2 = box2[2]\n h2 = box2[3]\n uw = Mx - mx\n uh = My - my\n cw = w1 + w2 - uw\n ch = h1 + h2 - uh\n carea = 0\n if cw <= 0 or ch <= 0:\n return paddle.to_tensor(0.0)\n area1 = w1 * h1\n area2 = w2 * h2\n carea = cw * ch\n uarea = area1 + area2 - carea\n return carea / uarea\ndef bbox_ious(boxes1, boxes2, x1y1x2y2=True):\n if x1y1x2y2:\n mx = paddle.min(boxes1[0], boxes2[0])\n Mx = paddle.max(boxes1[2], boxes2[2])\n my = paddle.min(boxes1[1], boxes2[1])\n My = paddle.max(boxes1[3], boxes2[3])\n w1 = boxes1[2] - boxes1[0]" + }, + { + "comment": "This code calculates the intersection over union (IoU) between two bounding boxes. It first checks if both boxes have valid dimensions, then computes the coordinates of each box and their widths and heights. If the boxes overlap, it calculates the intersection area and union area of the bounding boxes, taking into account non-overlapping areas by setting them to 0 in the case of non-intersection. Finally, it returns the IoU as the intersection area divided by the union area.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":213-240", + "content": " h1 = boxes1[3] - boxes1[1]\n w2 = boxes2[2] - boxes2[0]\n h2 = boxes2[3] - boxes2[1]\n else:\n mx = paddle.min(paddle.stack(\n [boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0], axis=0), axis=0)\n Mx = paddle.max(paddle.stack(\n [boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0], axis=0), axis=0)\n my = paddle.min(paddle.stack(\n [boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0], axis=0), axis=0)\n My = paddle.max(paddle.stack(\n [boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0], axis=0), axis=0)\n w1 = boxes1[2]\n h1 = boxes1[3]\n w2 = boxes2[2]\n h2 = boxes2[3]\n uw = Mx - mx\n uh = My - my\n cw = w1 + w2 - uw\n ch = h1 + h2 - uh\n mask = paddle.cast(cw <= 0, dtype=\"int32\") + \\\n paddle.cast(ch <= 0, dtype=\"int32\") > 0\n area1 = w1 * h1\n area2 = w2 * h2\n carea = cw * ch\n carea[mask] = 0\n uarea = area1 + area2 - carea\n return carea / uarea" + }, + { + "comment": "This function builds ground truth targets for each grid in the image. It iterates over each batch, anchor, height, and width to create confidence, coordinate, and class masks, as well as target coordinates and classes for each anchor box. 
The targets are then concatenated into a single tensor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":243-267", + "content": "# this function works for building the groud truth\ndef build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale,\n sil_thresh):\n # nH, nW here are number of grids in y and x directions (7, 7 here)\n nB = target.shape[0] # batch size\n nA = num_anchors # 5 for our case\n nC = num_classes\n anchor_step = len(anchors) // num_anchors\n conf_mask = paddle.ones([nB, nA, nH, nW]) * noobject_scale\n coord_mask = paddle.zeros([nB, nA, nH, nW])\n cls_mask = paddle.zeros([nB, nA, nH, nW])\n tx = paddle.zeros([nB, nA, nH, nW])\n ty = paddle.zeros([nB, nA, nH, nW])\n tw = paddle.zeros([nB, nA, nH, nW])\n th = paddle.zeros([nB, nA, nH, nW])\n tconf = paddle.zeros([nB, nA, nH, nW])\n tcls = paddle.zeros([nB, nA, nH, nW])\n # for each grid there are nA anchors\n # nAnchors is the number of anchor for one image\n nAnchors = nA * nH * nW\n nPixels = nH * nW\n # for each image\n for b in xrange(nB):\n # get all anchor boxes in one image" + }, + { + "comment": "This code calculates the IoU (Intersection over Union) between predicted and ground truth boxes for each anchor. It uses a loop to iterate through 50 time steps, breaks if no target is available at the current time step, and calculates the bbox_ious function using cur_pred_boxes and cur_gt_boxes for IoU calculation. The highest IoU value is stored in cur_ious for each anchor.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":268-287", + "content": " # (4 * nAnchors)\n cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t()\n # initialize iou score for each anchor\n cur_ious = paddle.zeros([nAnchors])\n for t in xrange(50):\n # for each anchor 4 coordinate parameters, already in the coordinate system for the whole image\n # this loop is for anchors in each image\n # for each anchor 5 parameters are available (class, x, y, w, h)\n if target[b][t * 5 + 1] == 0:\n break\n gx = target[b][t * 5 + 1] * nW\n gy = target[b][t * 5 + 2] * nH\n gw = target[b][t * 5 + 3] * nW\n gh = target[b][t * 5 + 4] * nH\n # groud truth boxes\n cur_gt_boxes = paddle.tile(paddle.to_tensor(\n [gx, gy, gw, gh], dtype='float32').t(), [nAnchors, 1]).t()\n # bbox_ious is the iou value between orediction and groud truth\n cur_ious = paddle.max(\n paddle.stack([cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)], axis=0), axis=0)" + }, + { + "comment": "This code calculates the IoU (Intersection over Union) between predicted bounding boxes and ground truth bounding boxes, and applies a mask to the confidences based on this IoU. It also counts the number of ground truth instances (nGT) and correct detections (nCorrect). 
The target values are ratios multiplied by the width and height of the output feature maps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":288-314", + "content": " # if iou > a given threshold, it is seen as it includes an object\n # conf_mask[b][cur_ious>sil_thresh] = 0\n conf_mask_t = paddle.reshape(conf_mask, [nB, -1])\n conf_mask_t[b, cur_ious > sil_thresh] = 0\n conf_mask_tt = paddle.reshape(conf_mask_t[b], [nA, nH, nW])\n conf_mask[b] = conf_mask_tt\n # number of ground truth\n nGT = 0\n nCorrect = 0\n for b in xrange(nB):\n # anchors for one batch (at least batch size, and for some specific classes, there might exist more than one anchor)\n for t in xrange(50):\n if target[b][t * 5 + 1] == 0:\n break\n nGT = nGT + 1\n best_iou = 0.0\n best_n = -1\n min_dist = 10000\n # the values saved in target is ratios\n # times by the width and height of the output feature maps nW and nH\n gx = target[b][t * 5 + 1] * nW\n gy = target[b][t * 5 + 2] * nH\n gi = int(gx)\n gj = int(gy)\n gw = target[b][t * 5 + 3] * nW" + }, + { + "comment": "This code iterates over anchor boxes, calculates IoU with ground truth boxes and selects the best matching one. It then updates the corresponding prediction box for that image and marks it as valid in coord_mask and cls_mask matrices.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":315-337", + "content": " gh = target[b][t * 5 + 4] * nH\n gt_box = [0, 0, gw, gh]\n for n in xrange(nA):\n # get anchor parameters (2 values)\n aw = anchors[anchor_step * n]\n ah = anchors[anchor_step * n + 1]\n anchor_box = [0, 0, aw, ah]\n # only consider the size (width and height) of the anchor box\n iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False)\n # get the best anchor form with the highest iou\n if iou > best_iou:\n best_iou = iou\n best_n = n\n # then we determine the parameters for an anchor (4 values together)\n gt_box = [gx, gy, gw, gh]\n # find corresponding prediction box\n pred_box = pred_boxes[b * nAnchors +\n best_n * nPixels + gj * nW + gi]\n # only consider the best anchor box, for each image\n coord_mask[b, best_n, gj, gi] = 1\n cls_mask[b, best_n, gj, gi] = 1" + }, + { + "comment": "The code calculates object position, size, and confidence for each detected object. 
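A compact sketch of the per-object target encoding described here (grid-cell units; `anchor_w`/`anchor_h` are the matched anchor's prior size):

```python
import math

def encode_target(gx, gy, gw, gh, gi, gj, anchor_w, anchor_h, iou):
    """Per-object target encoding sketch for the matched anchor at cell (gi, gj)."""
    tx = gx - gi                     # fractional offset of the centre inside the cell
    ty = gy - gj
    tw = math.log(gw / anchor_w)     # log ratio of the box size to the anchor prior
    th = math.log(gh / anchor_h)
    tconf = iou                      # confidence target is the IoU with the prediction
    return tx, ty, tw, th, tconf
```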
It then counts the number of correct detections by checking if the IOU is greater than 0.5.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":339-356", + "content": " # in this cell of the output feature map, there exists an object\n conf_mask[b, best_n, gj, gi] = object_scale\n tx[b, best_n, gj, gi] = paddle.cast(\n target[b][t * 5 + 1] * nW - gi, dtype='float32')\n ty[b, best_n, gj, gi] = paddle.cast(\n target[b][t * 5 + 2] * nH - gj, dtype='float32')\n tw[b, best_n, gj, gi] = math.log(\n gw / anchors[anchor_step * best_n])\n th[b, best_n, gj, gi] = math.log(\n gh / anchors[anchor_step * best_n + 1])\n iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou\n # confidence equals to iou of the corresponding anchor\n tconf[b, best_n, gj, gi] = paddle.cast(iou, dtype='float32')\n tcls[b, best_n, gj, gi] = paddle.cast(\n target[b][t * 5], dtype='float32')\n # if ious larger than 0.5, we justify it as a correct prediction\n if iou > 0.5:\n nCorrect = nCorrect + 1" + }, + { + "comment": "The function returns the ground truth values (nGT), correct predictions (nCorrect), and corresponding masks for coordinates, confidence, and class labels, as well as the transformation parameters for translation, width, and height.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py\":357-358", + "content": " # true values are returned\n return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d60d1fcc-23d3-4128-930f-5277c3778174.json b/docs/doc/d60d1fcc-23d3-4128-930f-5277c3778174.json new file mode 100644 index 000000000..f58139cc2 --- /dev/null +++ b/docs/doc/d60d1fcc-23d3-4128-930f-5277c3778174.json @@ -0,0 +1,25 @@ +{ + "summary": "The code constructs a PaddleVideo pipeline for preprocessing data and builds a data loader for distributed model training, handling variable batch sizes and using mix_collate_fn to collate data.", + "details": [ + { + "comment": "The code is building a pipeline for PaddleVideo. It imports necessary libraries and classes, utilizes function to build the pipeline, logs information using get_logger from paddlevideo.utils, and adheres to Apache License 2.0. The purpose of this code seems to be related to data preprocessing and possibly model training in a distributed environment.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/builder.py\":0-28", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport signal\nimport os\nimport paddle\nfrom paddle.io import DataLoader, DistributedBatchSampler\nfrom .registry import DATASETS, PIPELINES\nfrom ..utils.build_utils import build\nfrom .pipelines.compose import Compose\nfrom paddlevideo.utils import get_logger\nfrom paddlevideo.utils.multigrid import DistributedShortSampler\nimport numpy as np\nlogger = get_logger(\"paddlevideo\")\ndef build_pipeline(cfg):\n \"\"\"Build pipeline." + }, + { + "comment": "build_dataset: Builds a dataset using provided config dictionary, building pipeline first.\nbuild_batch_pipeline: Constructs the batch pipeline using config from the PIPELINES module.\nbuild_dataloader: Creates Paddle Dataloader object using specified parameters and dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/builder.py\":29-73", + "content": " Args:\n cfg (dict): root config dict.\n \"\"\"\n if cfg == None:\n return\n return Compose(cfg)\ndef build_dataset(cfg):\n \"\"\"Build dataset.\n Args:\n cfg (dict): root config dict.\n Returns:\n dataset: dataset.\n \"\"\"\n #XXX: ugly code here!\n cfg_dataset, cfg_pipeline = cfg\n cfg_dataset.pipeline = build_pipeline(cfg_pipeline)\n dataset = build(cfg_dataset, DATASETS, key=\"format\")\n return dataset\ndef build_batch_pipeline(cfg):\n batch_pipeline = build(cfg, PIPELINES)\n return batch_pipeline\ndef build_dataloader(dataset,\n batch_size,\n num_workers,\n places,\n shuffle=True,\n drop_last=True,\n multigrid=False,\n collate_fn_cfg=None,\n **kwargs):\n \"\"\"Build Paddle Dataloader.\n XXX explain how the dataloader work!\n Args:\n dataset (paddle.dataset): A PaddlePaddle dataset object.\n batch_size (int): batch size on single card." + }, + { + "comment": "This code appears to be part of a data loading and processing function for a machine learning or deep learning model. It uses a sampler to manage the data, with options for shuffling and dropping the last batch if needed. The mix_collate_fn function is defined to collate the data in a specific way using a pipeline built from collate_fn_cfg.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/builder.py\":74-95", + "content": " num_worker (int): num_worker\n shuffle(bool): whether to shuffle the data at every epoch.\n \"\"\"\n if multigrid:\n sampler = DistributedShortSampler(dataset,\n batch_sizes=batch_size,\n shuffle=True,\n drop_last=True)\n else:\n sampler = DistributedBatchSampler(dataset,\n batch_size=batch_size,\n shuffle=shuffle,\n drop_last=drop_last)\n #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix.\n # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to:\n # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] 
as using numpy.transpose.\n def mix_collate_fn(batch):\n pipeline = build_batch_pipeline(collate_fn_cfg)\n batch = pipeline(batch)\n slots = []" + }, + { + "comment": "This code appears to create a data loader that can handle batches of varying lengths. It iterates through each batch and organizes the items into slots based on their length, either creating a new slot for longer items or appending them to existing ones. The DataLoader class is then instantiated with this collate_fn for processing the dataset, using the provided parameters. Additionally, signal handlers are set up to handle SIGINT and SIGTERM signals to terminate the process group if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/builder.py\":96-131", + "content": " for items in batch:\n for i, item in enumerate(items):\n if len(slots) < len(items):\n slots.append([item])\n else:\n slots[i].append(item)\n return [np.stack(slot, axis=0) for slot in slots]\n #if collate_fn_cfg is not None:\n #ugly code here. collate_fn is mix op config\n # collate_fn = mix_collate_fn(collate_fn_cfg)\n data_loader = DataLoader(\n dataset,\n batch_sampler=sampler,\n places=places,\n num_workers=num_workers,\n collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,\n return_list=True,\n **kwargs)\n return data_loader\ndef term_mp(sig_num, frame):\n \"\"\" kill all child processes\n \"\"\"\n pid = os.getpid()\n pgid = os.getpgid(os.getpid())\n logger.info(\"main proc {} exit, kill process group \" \"{}\".format(pid, pgid))\n os.killpg(pgid, signal.SIGKILL)\n return\nsignal.signal(signal.SIGINT, term_mp)\nsignal.signal(signal.SIGTERM, term_mp)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d63d5d70-0864-4989-9f57-975f9beffc92.json b/docs/doc/d63d5d70-0864-4989-9f57-975f9beffc92.json new file mode 100644 index 000000000..573f698bc --- /dev/null +++ b/docs/doc/d63d5d70-0864-4989-9f57-975f9beffc92.json @@ -0,0 +1,20 @@ +{ + "summary": "This code initializes an InferModel class for \"PPTSM\" model inference using PaddlePaddle and GPU, performs inference, predicts feature lists from inputs, retrieves image files, assigns them to the model, prints output shapes, calculates prediction time.", + "details": [ + { + "comment": "This code initializes an instance of the InferModel class for a specific model named \"PPTSM\". It takes in a configuration file (cfg) and sets up the necessary parameters for the model's inference process. 
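The setup it performs mirrors the standard Paddle Inference zero-copy flow; a condensed, standalone sketch (the file names, GPU memory size and input shape below are placeholders, not values taken from the config):

```python
import numpy as np
from paddle.inference import Config, create_predictor

# Placeholder model files -- substitute the exported PP-TSM inference model.
config = Config("ppTSM.pdmodel", "ppTSM.pdiparams")
config.enable_use_gpu(8000, 0)             # gpu_mem (MB), device_id
config.switch_ir_optim(True)
config.enable_memory_optim()
config.switch_use_feed_fetch_ops(False)    # zero-copy input/output handles

predictor = create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_handle = predictor.get_output_handle(predictor.get_output_names()[-1])

dummy = np.random.rand(1, 8, 3, 224, 224).astype("float32")   # placeholder N,T,C,H,W batch
input_handle.copy_from_cpu(dummy)
predictor.run()
print(output_handle.copy_to_cpu().shape)
```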
The class uses PaddlePaddle library to create a predictor, which handles input and output data processing and enables GPU memory optimization for faster computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/pptsm_infer.py\":0-37", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"pptsm infer\"\"\"\n def __init__(self, cfg, name='PPTSM'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy\n config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])" + }, + { + "comment": "This code is for an InferModel class that performs inference on video frames using a pre-trained model. It uses the get_output_names and get_output_handle methods from the predictor to specify the desired output tensor. The infer method takes input data, runs the inference, and returns the output tensor. The predict method reads input data from a specified directory or config file, applies inference on frames, and returns a feature list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/pptsm_infer.py\":39-67", + "content": " output_names = self.predictor.get_output_names()\n self.output_tensor = self.predictor.get_output_handle(output_names[1])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output = self.output_tensor.copy_to_cpu()\n return output\n def predict(self, infer_config):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name, 'infer', infer_config)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[:-1] for items in data]\n inputs = np.array(inputs)\n output = self.infer(inputs)\n feature_list.append(np.squeeze(output))\n feature_list = np.vstack(feature_list)\n return feature_list\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/'" + }, + { + "comment": "This code retrieves image files from a specified path, assigns them to a model for inference and prints the resulting shape of the outputs. 
It also calculates and displays the time taken for the prediction process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/pptsm_infer.py\":68-76", + "content": " imgs_list = get_images(imgs_path)\n t0 = time.time()\n cfg['PPTSM']['frame_list'] = imgs_list\n outputs = model.predict(cfg)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n print(outputs.shape)\n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d7d9f6ff-e8e8-4f41-846a-989012f2c111.json b/docs/doc/d7d9f6ff-e8e8-4f41-846a-989012f2c111.json new file mode 100644 index 000000000..53d759776 --- /dev/null +++ b/docs/doc/d7d9f6ff-e8e8-4f41-846a-989012f2c111.json @@ -0,0 +1,50 @@ +{ + "summary": "The code defines a function for loading annotations and includes helper functions, iterates through label ranges and thresholds to find the best combination of IOU and score threshold for evaluating basketball actions, calculates evaluation results, updates best scores, and prints these best scores along with the evaluation results.", + "details": [ + { + "comment": "This code defines a function called `load_gts()` which loads ground truth annotations (gts) for video evaluation. It imports necessary modules, sets up global variables like fps and mode, and utilizes a JSON file to map labels to their indices. The gts data is stored in a dictionary with 'fps' and 'gts' keys, where 'fps' stores the frame rate and 'gts' stores individual annotations for each video. Each annotation has a 'mode' key indicating whether it's from training or validation set.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":0-35", + "content": "\"\"\"\nget instance for lstm\n\u6839\u636egts\u8ba1\u7b97\u6bcf\u4e2aproposal_bmn\u7684iou\u3001ioa\u3001label\u7b49\u4fe1\u606f\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport io\nsys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding = 'utf-8')\ndataset = \"datasets/\"\nlabel_index_file = './configs_basketball/index_label_basketball_6.json'\neval_datasets = ['EuroCup2016']\nlabel_files = {'train': 'label_cls6_train.json',\n 'validation': 'label_cls6_val.json'}\nglobal fps, mode\nlabel_index = json.load(open(label_index_file, 'rb'))\ndef load_gts():\n global fps\n gts_data = {'fps': 0, 'gts': {}}\n for eval_data in eval_datasets:\n for item, value in label_files.items():\n label_file = '{}/{}/{}'.format(dataset, eval_data, value)\n gts = json.load(open(label_file, 'rb'))\n gts_data['fps'] = gts['fps']\n fps = gts['fps']\n for gt in gts['gts']:\n gt['mode'] = item\n basename = '{}/{}/mp4/{}'.format(dataset, eval_data, os.path.basename(gt['url']))" + }, + { + "comment": "This code snippet defines three functions: \"get_gt\", \"computeIoU\", and \"convert_proposal\". The \"get_gt\" function takes a baseline name and returns the ground truth (GT) for that specific baseline. The \"computeIoU\" function calculates the intersection over union (IoU) between two events. 
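A minimal sketch of those two temporal overlap measures (the label/basename equality check in the real function is omitted here):

```python
def temporal_iou(e1, e2):
    """Interval IoU sketch for two events with 'start'/'end' fields (seconds)."""
    inter = max(0.0, min(e1["end"], e2["end"]) - max(e1["start"], e2["start"]))
    union = (e1["end"] - e1["start"]) + (e2["end"] - e2["start"]) - inter
    return inter / union if union > 0 else 0.0

def temporal_ioa(e1, e2):
    """Intersection over the duration of e2 -- the variant used outside proposal mode."""
    inter = max(0.0, min(e1["end"], e2["end"]) - max(e1["start"], e2["start"]))
    dur2 = e2["end"] - e2["start"]
    return inter / dur2 if dur2 > 0 else 0.0
```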
Lastly, the \"convert_proposal\" function converts event proposals into ground truths based on their scores, threshold, and frame rates.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":36-66", + "content": " gts_data['gts'][basename] = gt\n return gts_data['gts']\ndef computeIoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']):\n return 0.\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 - inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n if not mode == 'proposal':\n iou = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou\ndef convert_proposal(boxes, basename, score_threshold=0.01):\n boxes = sorted(boxes, key=lambda x:float(x['score']), reverse=True)\n res = []\n for box in boxes:\n if not float(box['score']) >= score_threshold:\n continue\n res.append({'basename': basename,\n 'start': int(float(box['start']) / fps),\n 'end': int(float(box['end']) / fps),\n 'label': 0})" + }, + { + "comment": "The code defines a function `convert_classify` that takes in boxes, base name, iou threshold, and score threshold. It sorts the boxes based on their classify score and iou score in descending order. The function then loops over each box, checks if the box meets the iou and score thresholds, and appends to a list named 'res' with necessary details such as basename, start time converted to frame number, end time converted to frame number, and label id. The code also has another function `convert_groundtruth` which takes in boxes, base name, and phase (optional). It iterates over the label ids of each box and appends a dictionary to the list 'res' with necessary details such as basename, start id, and label.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":67-92", + "content": " return res\ndef convert_classify(boxes, basename, iou_threshold, score_threshold):\n boxes = sorted(boxes, key=lambda x:(float(x['classify_score']), float(x['iou_score'])), reverse=True)\n def convert_time_to_frame(time_type):\n return int(time_type)\n h, m, s = time_type.split(':')\n return int(h) * 3600 + int(m) * 60 + int(s)\n res = []\n for box in boxes:\n if not (box['iou_score'] >= iou_threshold and\n box['classify_score'] >= score_threshold):\n continue\n res.append({'basename': basename,\n 'start': convert_time_to_frame(box['start_time']),\n 'end': convert_time_to_frame(box['end_time']),\n 'label': box['label_id']})\n return res\ndef convert_groundtruth(boxes, basename, phase=None):\n res = []\n for box in boxes:\n for item in box['label_ids']:\n label = 0 if phase == 'proposal' else item\n res.append({'basename': basename,\n 'start': box['start_id']," + }, + { + "comment": "This code contains four functions: `evaluation`, `print_result`, `print_head`, and `print_head`. These functions calculate and print the evaluation results for a set of detected boxes (res_boxes) against the ground truth boxes (gts_boxes). The code also calculates various metrics such as precision, recall, hit properties, and number of instances. 
It uses label ranges, IoU thresholds, and can show intermediate IoU values if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":93-119", + "content": " 'end': box['end_id'],\n 'label': label})\n return res\ndef print_head(iou):\n print(\"\\nioa = {:.1f}\".format(iou))\n res_str = ''\n for item in ['label_name']:\n res_str += '{:<12s}'.format(item)\n for item in ['label_id', 'precision', 'recall', 'hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10s}'.format(item)\n print(res_str)\ndef print_result(res_dict, label='avg'):\n if label == 'avg':\n res_str = '{:<22s}'.format(str(label))\n else:\n res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)], str(label), chr(12288))\n for item in ['prec', 'recall']:\n res_str += '{:<10.4f}'.format(res_dict[item])\n for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10d}'.format(res_dict[item])\n print(res_str)\ndef evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = False):\n iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \\\n for gtsId in gts_boxes]" + }, + { + "comment": "This code calculates evaluation metrics for detected objects based on their Intersection over Union (IoU) with ground truth objects. It iterates through different IoU thresholds and label ranges to compute hit proportion, number of propositions, hit GTs, and number of ground truth objects for each threshold and label. The results are stored in a dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":120-143", + "content": " iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes)))\n hit_map_prop_total = np.max(iou_map, axis=1)\n hit_map_index_total = np.argmax(iou_map, axis=1)\n res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']\n for iou_threshold in iou_range:\n if show_sub:\n print_head(iou_threshold)\n iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total])\n average_results = {}\n for label_id in label_range:\n sub_results = {}\n label_prop = np.array([k['label'] == label_id for k in res_boxes])\n label_gts = np.array([k['label'] == label_id for k in gts_boxes])\n sub_results['num_prop'] = sum(label_prop)\n sub_results['num_gts'] = sum(label_gts)\n if sub_results['num_prop'] == 0:\n hit_prop_index = []\n else:\n hit_prop_index = label_prop & iou_prop\n sub_results['hit_prop'] = sum(hit_prop_index)\n sub_results['hit_gts'] = len(set(hit_map_index_total[hit_prop_index]))" + }, + { + "comment": "This code calculates precision and recall values for sub-results and average results in a classification task. It handles cases where the number of true positives, true negatives, false positives or false negatives is zero by assigning precision and recall as 0. 
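The guarded metric computation reduces to something like the following sketch (names are illustrative; `hit_prop`/`num_prop`/`hit_gts`/`num_gts` correspond to the counters described above):

```python
def prf(hit_prop, num_prop, hit_gts, num_gts):
    """Guarded precision/recall/F1 sketch over the hit counters described above."""
    prec = hit_prop / num_prop if num_prop else 0.0    # proposals that hit a ground truth
    recall = hit_gts / num_gts if num_gts else 0.0     # ground truths covered by a proposal
    f1 = 0.0 if prec + recall == 0 else 2 * prec * recall / (prec + recall)
    return prec, recall, f1
```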
The code outputs average values only for labels with a range greater than one.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":145-160", + "content": " sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \\\n else sub_results['hit_prop'] * 1.0 / sub_results['num_prop']\n sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \\\n else sub_results['hit_gts'] * 1.0 / sub_results['num_gts']\n if show_sub:\n print_result(sub_results, label=label_id)\n for item in res_dict:\n if not item in average_results:\n average_results[item] = 0\n average_results[item] += sub_results[item]\n if len(label_range) == 1: # proposal \u4e0d\u9700\u8981\u8f93\u51faaverage\u503c\n continue\n average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \\\n else average_results['hit_prop'] * 1.0 / average_results['num_prop']\n average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \\\n else average_results['hit_gts'] * 1.0 / average_results['num_gts']" + }, + { + "comment": "This code calculates the F1 score for a set of predictions and ground truth data. If 'show_sub' is True, it prints the average results. It then calculates the F1 score based on precision and recall values. The function returns the average results containing precision, recall, and F1 score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":161-188", + "content": " if show_sub:\n print_result(average_results)\n average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \\\n else 2 * average_results['prec'] * average_results['recall'] / \\\n (average_results['prec'] + average_results['recall'])\n return average_results\ndef get_eval_results(predicts, gts_data, phase, iou_threshold = 0.3, score_threshold = 0.3, show_sub = False):\n global mode\n mode = phase\n res_boxes = []\n gts_boxes = []\n for ped_data in predicts:\n basename = ped_data['video_name']\n # eval sub data\n such_eval = False\n for eval_name in eval_datasets:\n if eval_name in basename:\n such_eval = True\n break\n if not such_eval:\n continue\n gts = gts_data[basename]['actions']\n if phase == 'proposal':\n res_boxes.extend(convert_proposal(ped_data['bmn_results'], basename, score_threshold))" + }, + { + "comment": "The code is evaluating the performance of a video action detection model. It extends the ground truth boxes for proposals and classifies them based on IOU and score thresholds. 
It then performs evaluation using these results and displays the best F1 score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":189-217", + "content": " gts_boxes.extend(convert_groundtruth(gts, basename, phase='proposal'))\n label_range = [0]\n iou_range = np.arange(0.1, 1, 0.1)\n else:\n res_boxes.extend(convert_classify(ped_data['action_results'], basename, iou_threshold, score_threshold))\n gts_boxes.extend(convert_groundtruth(gts, basename))\n label_range = range(1, len(label_index))\n iou_range = np.arange(0.5, 0.6, 0.1)\n eval_results = evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = show_sub)\n return eval_results\nif __name__ == \"__main__\":\n result_file = sys.argv[1]\n predicts = json.load(open(result_file, 'r', encoding='utf-8'))\n gts_data = load_gts()\n get_eval_results(predicts, gts_data, 'proposal', \n score_threshold = 0.03,\n show_sub = True)\n #get_eval_results(predicts, gts_data, 'actions')\n best_F1 = -0.1\n best_res = {}\n best_iou_threshold = 0.\n best_score_threshold = 0.\n for iou_threshold in np.arange(0.1, 0.9, 0.1):" + }, + { + "comment": "This code is iterating through different score thresholds to find the best combination of IOU and score threshold for evaluating basketball actions. It calculates evaluation results for each threshold, updating the best scores accordingly. Finally, it prints these best scores and displays the evaluation results using a function called print_result().", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/eval.py\":218-236", + "content": " for score_threshold in np.arange(0.1, 1, 0.1):\n avg_res = get_eval_results(predicts, gts_data, 'actions', \n iou_threshold = iou_threshold,\n score_threshold = score_threshold,\n show_sub = False)\n if best_F1 < avg_res['F1']:\n best_F1 = avg_res['F1']\n best_res = avg_res\n best_iou_threshold = iou_threshold\n best_score_threshold = score_threshold\n print(\"best iou threshold = {:.1f}\".format(best_iou_threshold))\n print(\"best score threshold = {:.1f}\".format(best_score_threshold))\n print('best F1 score = {:.4f}'.format(best_F1))\n print_head(0.5)\n print_result(best_res)\n get_eval_results(predicts, gts_data, 'actions', iou_threshold = best_iou_threshold,\n score_threshold = best_score_threshold,\n show_sub = True)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d8181f92-5168-4dcf-996f-09bcce0cd488.json b/docs/doc/d8181f92-5168-4dcf-996f-09bcce0cd488.json new file mode 100644 index 000000000..590ade063 --- /dev/null +++ b/docs/doc/d8181f92-5168-4dcf-996f-09bcce0cd488.json @@ -0,0 +1,75 @@ +{ + "summary": "This code performs data augmentation using resizing, cropping, and scaling/rotating transformations with cv2 libraries, offering fixed or random options. It also initializes segmentation variables, computes dilated areas, generates edge masks, and handles various data types for PaddlePaddle's video object detection task.", + "details": [ + { + "comment": "The code defines a Resize class that rescales images in a sample to the given output size. It accepts either an integer for uniform resizing or a tuple for specific dimensions. If the output size is not compatible with the image aspect ratio, it will be scaled proportionally. 
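A sketch of the per-key resize the transform applies, assuming a sample dict like the one described (cubic interpolation for images, nearest-neighbour for label maps so class ids are not blended):

```python
import cv2

def resize_sample(sample, new_h, new_w):
    """Resize every array in the sample dict: cubic for images, nearest for label maps."""
    out = {}
    for key, arr in sample.items():
        if "meta" in key:                      # metadata is passed through untouched
            out[key] = arr
            continue
        interp = cv2.INTER_CUBIC if key in ("img1", "img2", "ref_img") else cv2.INTER_NEAREST
        out[key] = cv2.resize(arr, dsize=(new_w, new_h), interpolation=interp)
    return out
```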
This class can process samples with one or more images (e.g., 'img1' and 'img2').", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":0-34", + "content": "import os\nimport random\nimport cv2\nimport numpy as np\nimport paddle\nfrom PIL import Image\nimport dataloaders.helpers as helpers\nfrom davisinteractive.utils.operations import bresenham\nfrom paddle.vision.transforms import functional as F\ncv2.setNumThreads(0)\nNEW_BRANCH = True\nclass Resize(object):\n \"\"\"Rescale the image in a sample to a given size.\n Args:\n output_size (tuple or int): Desired output size. If tuple, output is\n matched to output_size. If int, smaller of image edges is matched\n to output_size keeping aspect ratio the same.\n \"\"\"\n def __init__(self, output_size):\n assert isinstance(output_size, (int, tuple))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)\n else:\n self.output_size = output_size\n # self.seg_interpolation = cv2.INTER_CUBIC if is_continuous else cv2.INTER_NEAREST\n # self.fix = fix\n def __call__(self, sample):\n img1 = sample['img1']\n # img2 = sample['img2']" + }, + { + "comment": "This code is a custom transform that resizes images in a sample to a specific output size. It checks if the current image size matches the desired output size, and if not, it uses cv2.resize() function to resize each image in the sample while maintaining aspect ratio for specified elements (img1, img2, ref_img) using INTER_CUBIC interpolation and others using INTER_NEAREST. It returns the modified sample with images resized according to the output size specified. The RandomCrop class is used to crop an image randomly to a given output size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":35-68", + "content": " # ref_img=sample['ref_img']\n h, w = img1.shape[:2]\n if self.output_size == (h, w):\n return sample\n else:\n new_h, new_w = self.output_size\n new_h, new_w = int(new_h), int(new_w)\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, dsize=(new_w, new_h), interpolation=flagval)\n sample[elem] = tmp\n return sample\nclass RandomCrop(object):\n \"\"\"Crop randomly the image in a sample.\n Args:\n output_size (tuple or int): Desired output size. If int, square crop\n is made.\n \"\"\"\n def __init__(self, output_size, step=None):\n assert isinstance(output_size, (int, tuple))\n if isinstance(output_size, int):\n self.output_size = (output_size, output_size)" + }, + { + "comment": "This code is part of a custom transform for image cropping. It takes an input sample, selects a random top and left position to crop the image, and checks if the corresponding reference scribble label has enough unique elements to proceed. If not, it continues selecting new positions until it finds one with enough unique elements in the scribble label. 
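A simplified sketch of that retrying crop (the `max_tries` bound is an addition for illustration; it assumes the crop size does not exceed the image size):

```python
import numpy as np

def random_crop_with_object(sample, new_h, new_w, max_tries=50):
    """Retrying crop sketch: resample the window until the reference scribble label
    inside it contains more than one value (assumes new_h <= H and new_w <= W)."""
    h, w = sample["img1"].shape[:2]
    top = left = 0
    for _ in range(max_tries):                 # max_tries is an illustrative bound
        top = np.random.randint(0, h - new_h + 1)
        left = np.random.randint(0, w - new_w + 1)
        window = sample["ref_scribble_label"][top:top + new_h, left:left + new_w]
        if len(np.unique(window)) > 1:         # crop contains scribble and background
            break
    return {k: (v if "meta" in k else v[top:top + new_h, left:left + new_w])
            for k, v in sample.items()}
```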
The step variable keeps track of how many times this loop has been executed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":69-97", + "content": " else:\n assert len(output_size) == 2\n self.output_size = output_size\n self.step = step\n def __call__(self, sample):\n image = sample['img1']\n h, w = image.shape[:2]\n new_h, new_w = self.output_size\n new_h = h if new_h >= h else new_h\n new_w = w if new_w >= w else new_w\n is_contain_obj = False\n if self.step is None:\n while not is_contain_obj:\n # step += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = sample['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(new_ref_scribble_label)) == 1:\n continue\n else:\n for elem in sample.keys():\n if 'meta' in elem:" + }, + { + "comment": "This code is randomly selecting a region in the image and adjusting it to the new size while ensuring that there is at least one object present in the cropped region. It then updates the corresponding image and label based on this new crop.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":98-123", + "content": " continue\n tmp = sample[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]\n sample[elem] = tmp\n break\n else:\n st = 0\n while not is_contain_obj and st < self.step:\n st += 1\n top = np.random.randint(0, h - new_h + 1)\n left = np.random.randint(0, w - new_w + 1)\n ref_scribble_label = sample['ref_scribble_label']\n new_ref_scribble_label = ref_scribble_label[top:top + new_h,\n left:left + new_w]\n if len(np.unique(\n new_ref_scribble_label)) == 1 or st < self.step - 1:\n continue\n else:\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n tmp = tmp[top:top + new_h, left:left + new_w]" + }, + { + "comment": "The code defines a class called \"ScaleNRotate\" which applies scaling and rotation transformations to images and their corresponding ground truth. It takes two possible arguments for rotations and scales, either as tuples or lists. If the argument is a tuple, it randomly selects a rotation and scale within the defined range. If the argument is a list, it applies one of the fixed possible rotations and scales from the provided list. The code also initializes the instance variables \"rots\" and \"scales\" based on the input arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":124-153", + "content": " sample[elem] = tmp\n break\n return sample\nclass ScaleNRotate(object):\n \"\"\"Scale (zoom-in, zoom-out) and Rotate the image and the ground truth.\n Args:\n two possibilities:\n 1. rots (tuple): (minimum, maximum) rotation angle\n scales (tuple): (minimum, maximum) scale\n 2. 
rots [list]: list of fixed possible rotation angles\n scales [list]: list of fixed possible scales\n \"\"\"\n def __init__(self, rots=(-30, 30), scales=(.75, 1.25)):\n assert (isinstance(rots, type(scales)))\n self.rots = rots\n self.scales = scales\n def __call__(self, sample):\n if type(self.rots) == tuple:\n # Continuous range of scales and rotations\n rot = (self.rots[1] - self.rots[0]) * random.random() - \\\n (self.rots[1] - self.rots[0]) / 2\n sc = (self.scales[1] - self.scales[0]) * random.random() - \\\n (self.scales[1] - self.scales[0]) / 2 + 1\n elif type(self.rots) == list:" + }, + { + "comment": "This code applies random scaling, rotation, and warping to an image and its corresponding metadata. It selects a random scale and rotation from predefined ranges for each element in the sample, adjusting the shape of the image and preserving its center point. The cv2.warpAffine function is used to perform the transformation, using interpolation flags based on whether the original image contains only 0s and 1s or not. Finally, it returns the transformed sample.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":154-188", + "content": " # Fixed range of scales and rotations\n rot = self.rots[random.randint(0, len(self.rots))]\n sc = self.scales[random.randint(0, len(self.scales))]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n h, w = tmp.shape[:2]\n center = (w / 2, h / 2)\n assert (center != 0) # Strange behaviour warpAffine\n M = cv2.getRotationMatrix2D(center, rot, sc)\n if ((tmp == 0) | (tmp == 1)).all():\n flagval = cv2.INTER_NEAREST\n else:\n flagval = cv2.INTER_CUBIC\n tmp = cv2.warpAffine(tmp, M, (w, h), flags=flagval)\n sample[elem] = tmp\n return sample\nclass RandomScale(object):\n \"\"\"Randomly resize the image and the ground truth to specified scales.\n Args:\n scales (list): the list of scales\n \"\"\"\n def __init__(self, scales=[0.75, 1, 1.25]):\n self.scales = scales\n def __call__(self, sample):" + }, + { + "comment": "The code includes classes for resizing, horizontally flipping, and subtracting the mean image from input data. The resizing function adjusts image size based on a randomly chosen scale from a fixed range. The RandomHorizontalFlip class flips images with a probability of 0.5. 
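For reference, the scale-and-rotate warp described a little earlier boils down to a sketch like this (`is_label` stands in for the binary-mask check that selects nearest-neighbour interpolation in the quoted code):

```python
import cv2

def scale_and_rotate(img, rot_deg, scale, is_label=False):
    """Rotate and zoom about the image centre, as the transform does for each key."""
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w / 2, h / 2), rot_deg, scale)   # 2x3 affine matrix
    interp = cv2.INTER_NEAREST if is_label else cv2.INTER_CUBIC   # keep label ids discrete
    return cv2.warpAffine(img, M, (w, h), flags=interp)
```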
The SubtractMeanImage class subtracts a pre-calculated mean image from input images, presumably to normalize pixel values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":190-228", + "content": " # Fixed range of scales\n sc = self.scales[random.randint(0, len(self.scales) - 1)]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if elem == 'img1' or elem == 'img2' or elem == 'ref_img':\n flagval = cv2.INTER_CUBIC\n else:\n flagval = cv2.INTER_NEAREST\n tmp = cv2.resize(tmp, None, fx=sc, fy=sc, interpolation=flagval)\n sample[elem] = tmp\n return sample\nclass RandomHorizontalFlip(object):\n \"\"\"Horizontally flip the given image and ground truth randomly with a probability of 0.5.\"\"\"\n def __init__(self, prob):\n self.p = prob\n def __call__(self, sample):\n if random.random() < self.p:\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n tmp = cv2.flip(tmp, flipCode=1)\n sample[elem] = tmp\n return sample\nclass SubtractMeanImage(object):" + }, + { + "comment": "This code defines two classes, 'SubtractMeanImage' and 'CustomScribbleInteractive'. The former subtracts the mean from each image in a sample to normalize them. The latter initializes an object for custom scribble interactive functionality with parameters like scribbles, first frame, dilation, nocare_area, bresenham, use_previous_mask, and previous_mask_path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":229-260", + "content": " def __init__(self, mean, change_channels=False):\n self.mean = mean\n self.change_channels = change_channels\n def __call__(self, sample):\n for elem in sample.keys():\n if 'image' in elem:\n if self.change_channels:\n sample[elem] = sample[elem][:, :, [2, 1, 0]]\n sample[elem] = np.subtract(\n sample[elem], np.array(self.mean, dtype=np.float32))\n return sample\n def __str__(self):\n return 'SubtractMeanImage' + str(self.mean)\nclass CustomScribbleInteractive(object):\n def __init__(self,\n scribbles,\n first_frame,\n dilation=9,\n nocare_area=None,\n bresenham=True,\n use_previous_mask=False,\n previous_mask_path=None):\n self.scribbles = scribbles\n self.dilation = dilation\n self.nocare_area = nocare_area\n self.bresenham = bresenham\n self.first_frame = first_frame" + }, + { + "comment": "This code initializes variables for segmentation mask, no-care area, and scribbles. It iterates over the scribbles of a specific frame and determines whether the scribble is foreground or background based on the object ID. 
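A rough sketch of rasterising one scribble path into a dilated binary mask (the dilation here stands in for `helpers.gt_from_scribble`, whose exact behaviour is not shown in this excerpt):

```python
import cv2
import numpy as np

def scribble_to_mask(points_norm, im_size, dilation=9):
    """Rasterise one scribble path (normalised (x, y) points) into a dilated binary mask.
    The dilation step stands in for helpers.gt_from_scribble, which is not shown here."""
    h, w = im_size
    mask = np.zeros((h, w), dtype=np.uint8)
    pts = np.round(np.array(points_norm) * np.array([w, h])).astype(int)
    mask[pts[:, 1] - 1, pts[:, 0] - 1] = 1                  # mark the sampled path pixels
    kernel = np.ones((dilation, dilation), dtype=np.uint8)
    return cv2.dilate(mask, kernel)                         # thicken the thin scribble
```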
The Bresenham algorithm is applied if specified in the configuration to generate all points for each scribble.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":261-287", + "content": " self.use_previous_mask = use_previous_mask\n self.previous_mask_path = previous_mask_path\n def __call__(self, sample):\n meta = sample['meta']\n frame_num = int(meta['frame_id'])\n im_size = meta['im_size']\n # Initialize gt to zeros, no-care areas to ones\n scr_gt = np.zeros(im_size)\n scr_nocare = np.ones(im_size)\n mask = np.zeros(im_size)\n mask_neg = np.zeros(im_size)\n # Get all the scribbles for the current frame\n for scribble in self.scribbles[frame_num]:\n points_scribble = np.round(\n np.array(scribble['path']) * np.array(\n (im_size[1], im_size[0]))).astype(int)\n if self.bresenham and len(points_scribble) > 1:\n all_points = bresenham(points_scribble)\n else:\n all_points = points_scribble\n # Check if scribble is of same id to mark as foreground, otherwise as background\n if scribble['object_id'] == meta['obj_id']:" + }, + { + "comment": "This code segment appears to be responsible for generating ground truth (GT) masks from human-drawn scribbles. If the first frame is encountered, it computes dilated foreground and background masks along with a no-care area. It also excludes negative examples from the training set. The mask and nocare_area are computed based on the conditions in the code snippet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":288-309", + "content": " mask[all_points[:, 1] - 1, all_points[:, 0] - 1] = 1\n else:\n mask_neg[all_points[:, 1] - 1, all_points[:, 0] - 1] = 1\n if self.nocare_area is None:\n nz = np.where(mask > 0)\n nocare_area = int(.5 * np.sqrt(\n (nz[0].max() - nz[0].min()) * (nz[1].max() - nz[1].min())))\n else:\n nocare_area = 100\n # In case we are reading the first human annotation round\n if frame_num == self.first_frame:\n # Compute dilated foreground, background, and no-care area\n scr_gt, scr_nocare = helpers.gt_from_scribble(\n mask, dilation=self.dilation, nocare_area=nocare_area)\n scr_gt_neg, _ = helpers.gt_from_scribble(mask_neg,\n dilation=self.dilation,\n nocare_area=None)\n # Negative examples included in the training\n scr_gt[scr_gt_neg > 0] = 0\n scr_nocare[scr_gt_neg > 0] = 0" + }, + { + "comment": "This code computes dilated foreground, background, and no-care area for annotation rounds generated by the robot. It first generates scr_gt_extra and scr_gt_neg using the gt_from_scribble function from helpers module. Then it ignores pixels that are not foreground if use_previous_mask is False. Else, it reads a previous mask image, converts it into float32 format and assigns pixel values greater than 0.8*255 to 1. These computations will be used in the subsequent operations of the code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":311-329", + "content": " # For annotation rounds generated by the robot\n else:\n # Compute dilated foreground, background, and no-care area\n scr_gt_extra, _ = helpers.gt_from_scribble(mask,\n dilation=self.dilation,\n nocare_area=None)\n scr_gt_neg, _ = helpers.gt_from_scribble(mask_neg,\n dilation=self.dilation,\n nocare_area=None)\n # Ignore pixels that are not foreground\n if not self.use_previous_mask:\n scr_nocare_extra = 1. 
- scr_gt_extra\n else:\n scr_nocare_extra = \\\n (cv2.imread(os.path.join(self.previous_mask_path, meta['seq_name'], str(meta['obj_id']),\n meta['frame_id'] + '.png'), 0) > 0.8 * 255).astype(np.float32)\n # Negative examples included in training" + }, + { + "comment": "This code is part of a data loader in the Ma-Net application. It transforms image and mask data for PaddlePaddle's video object detection task. The code handles scribble ground truth (scribble_gt) and scribble void pixels (scribble_void_pixels), applying necessary adjustments to ensure correct formatting and values. It then uses the ToTensor class to convert ndarrays in samples to tensors, handling color axis swapping due to differences between numpy and PaddlePaddle image formats.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":330-365", + "content": " scr_gt_extra[scr_gt_neg > 0] = 0\n scr_nocare_extra[scr_gt_neg > 0] = 0\n scr_gt = np.maximum(scr_gt, scr_gt_extra)\n scr_nocare_extra[scr_gt > 0] = 0\n scr_nocare = np.minimum(scr_nocare, scr_nocare_extra)\n sample['scribble_gt'] = scr_gt\n sample['scribble_void_pixels'] = scr_nocare\n return sample\nclass ToTensor(object):\n \"\"\"Convert ndarrays in sample to Tensors.\"\"\"\n def __call__(self, sample):\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if tmp.ndim == 2:\n tmp = tmp[:, :, np.newaxis]\n else:\n tmp = tmp / 255.\n tmp -= (0.485, 0.456, 0.406)\n tmp /= (0.229, 0.224, 0.225)\n # swap color axis because\n # numpy image: H x W x C\n # paddle image: C X H X W\n tmp = tmp.transpose([2, 0, 1])\n sample[elem] = paddle.to_tensor(tmp)\n return sample" + }, + { + "comment": "The code defines two classes, `GenerateEdge` and `GenerateEdge_2`, which generate edge masks from the input samples. The edge masks are generated based on whether there is a \"label2\" or \"ref_frame_gt\" present in the sample. If these labels are present, a kernel is applied to create an edge mask, which is then added to the sample. If they are not present, a RuntimeError is raised.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":368-404", + "content": "class GenerateEdge(object):\n \"\"\"\n \"\"\"\n def __init__(self, edgesize=1):\n self.edgesize = edgesize\n def __call__(self, sample):\n \"\"\"\n \"\"\"\n if \"label2\" in sample:\n label2 = sample['label2']\n kernel_size = 2 * self.edgesize + 1\n maskedge = np.zeros_like(label2)\n maskedge[np.where(label2[:, 1:] != label2[:, :-1])] = 1\n maskedge[np.where(label2[1:, :] != label2[:-1, :])] = 1\n maskedge = cv2.dilate(\n maskedge, np.ones((kernel_size, kernel_size), dtype=np.uint8))\n sample[\"edge_mask\"] = maskedge\n else:\n raise RuntimeError(\n \"We need parsing mask to generate the edge mask.\")\n return sample\nclass GenerateEdge_2(object):\n \"\"\"\n \"\"\"\n def __init__(self, edgesize=1):\n self.edgesize = edgesize\n def __call__(self, sample):\n \"\"\"\n \"\"\"\n if \"ref_frame_gt\" in sample:\n label2 = sample['ref_frame_gt']\n kernel_size = 2 * self.edgesize + 1" + }, + { + "comment": "This code checks if a parsing mask is provided. If it is, it creates an edge mask by comparing the labels horizontally and vertically. It then dilates the resulting mask using cv2.dilate and assigns it to sample[\"edge_mask\"]. 
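The edge-mask construction used by both `GenerateEdge` classes boils down to a neighbour comparison followed by a dilation; a self-contained sketch of that idea:

```python
import cv2
import numpy as np

def edge_mask_from_label(label, edgesize=1):
    # Mark pixels whose right or bottom neighbour carries a different label id,
    # then thicken the 1-pixel contour with a (2*edgesize+1) square kernel.
    edge = np.zeros_like(label, dtype=np.uint8)
    edge[:, 1:][label[:, 1:] != label[:, :-1]] = 1
    edge[1:, :][label[1:, :] != label[:-1, :]] = 1
    kernel = np.ones((2 * edgesize + 1, 2 * edgesize + 1), dtype=np.uint8)
    return cv2.dilate(edge, kernel)

label = np.zeros((8, 8), dtype=np.uint8)
label[2:6, 2:6] = 1           # a small square object
print(edge_mask_from_label(label))
```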
If no parsing mask is provided, it raises a RuntimeError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py\":405-415", + "content": " maskedge = np.zeros_like(label2)\n maskedge[np.where(label2[:, 1:] != label2[:, :-1])] = 1\n maskedge[np.where(label2[1:, :] != label2[:-1, :])] = 1\n maskedge = cv2.dilate(\n maskedge, np.ones((kernel_size, kernel_size), dtype=np.uint8))\n sample[\"edge_mask\"] = maskedge\n else:\n raise RuntimeError(\n \"We need parsing mask to generate the edge mask.\")\n return sample" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d853bea9-87b8-4332-a755-4f5ce7f250cc.json b/docs/doc/d853bea9-87b8-4332-a755-4f5ce7f250cc.json new file mode 100644 index 000000000..655b08e9b --- /dev/null +++ b/docs/doc/d853bea9-87b8-4332-a755-4f5ce7f250cc.json @@ -0,0 +1,15 @@ +{ + "summary": "The code defines a RandomIdentitySampler class that randomly samples N identities and K instances from a dataset, generating a random sample of identities and instances with the ability to replace or not while selecting new instances.", + "details": [ + { + "comment": "This code defines a RandomIdentitySampler class that randomly samples N identities and then K instances from a given dataset, resulting in a batch size of N*K. It imports necessary libraries and initializes class variables. The __iter__ method generates a random permutation of the identities.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/samplers.py\":0-30", + "content": "from __future__ import absolute_import\nfrom collections import defaultdict\nimport numpy as np\nimport paddle\nfrom paddle.io import Sampler\nclass RandomIdentitySampler(Sampler):\n \"\"\"\n Randomly sample N identities, then for each identity,\n randomly sample K instances, therefore batch size is N*K.\n Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/data/sampler.py.\n Args:\n data_source (Dataset): dataset to sample from.\n num_instances (int): number of instances per identity.\n \"\"\"\n def __init__(self, sample_list, num_instances=1):\n self.sample_list = sample_list\n self.num_instances = num_instances\n self.index_dic = defaultdict(list)\n for index, tmp_dic in enumerate(self.sample_list):\n pid = tmp_dic['seq_name']\n self.index_dic[pid].append(index)\n self.pids = list(self.index_dic.keys())\n self.num_identities = len(self.pids)\n def __iter__(self):\n indices = np.random.permutation(self.num_identities)" + }, + { + "comment": "This code generates a random sample of identities and instances from a given list of indices. It checks if the length of the current index is greater than or equal to the number of desired instances, and then chooses either to replace or not while selecting new instances. The selected instances are stored in a list and returned as an iterator. 
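The N-identities-times-K-instances pattern implemented by `RandomIdentitySampler` can be reproduced in a few lines of plain NumPy; the toy `sample_list` below is purely illustrative:

```python
from collections import defaultdict

import numpy as np

# Toy sample list: each entry only needs a 'seq_name' key, as in the sampler.
sample_list = [{'seq_name': f'video_{i % 4}'} for i in range(20)]
num_instances = 2  # K

index_dic = defaultdict(list)
for idx, item in enumerate(sample_list):
    index_dic[item['seq_name']].append(idx)

batch_indices = []
for pid in np.random.permutation(list(index_dic.keys())):
    pool = index_dic[pid]
    # Sample K instances per identity; fall back to replacement if the pool is small.
    replace = len(pool) < num_instances
    batch_indices.extend(np.random.choice(pool, size=num_instances, replace=replace))

print(len(batch_indices))  # 4 identities * 2 instances = 8 indices
```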
The method also provides the total number of samples by multiplying the number of identities with the number of instances.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/dataloaders/samplers.py\":31-41", + "content": " ret = []\n for i in indices:\n pid = self.pids[i]\n t = self.index_dic[pid]\n replace = False if len(t) >= self.num_instances else True\n t = np.random.choice(t, size=self.num_instances, replace=replace)\n ret.extend(t)\n return iter(ret)\n def __len__(self):\n return self.num_identities * self.num_instances" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d92244f7-c7ba-4764-99cd-90f2e2eadbbd.json b/docs/doc/d92244f7-c7ba-4764-99cd-90f2e2eadbbd.json new file mode 100644 index 000000000..28746bcb4 --- /dev/null +++ b/docs/doc/d92244f7-c7ba-4764-99cd-90f2e2eadbbd.json @@ -0,0 +1,20 @@ +{ + "summary": "The Compose class combines registry-based pipeline components like decode functions, sample functions, and transforms to apply transformations flexibly on dictionary or list inputs. It includes a workaround for old format configuration files.", + "details": [ + { + "comment": "This code defines the Compose class, which composes multiple pipelines such as decode functions, sample functions, and transforms. It uses the PIPELINES registry for registration and builds pipelines based on input configurations. The code also handles temporary list-type configuration for flexibility.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Sequence\nfrom ..registry import PIPELINES\nimport traceback\nfrom ...utils import build\nfrom ...utils import get_logger\n@PIPELINES.register()\nclass Compose(object):\n \"\"\"\n Composes several pipelines(include decode func, sample func, and transforms) together.\n Note: To deal with ```list``` type cfg temporaray, like:\n transform:\n - Crop: # A list\n attribute: 10" + }, + { + "comment": "The code defines a Compose class that takes a list of transforms and composes them sequentially. It checks if the input is a dictionary or a list, builds the transform modules using build function from PIPELINES, and appends them to the pipelines list. 
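The old-format workaround mentioned here simply folds a one-key list entry into a flat dict whose `name` field selects the registered pipeline; a sketch with a hypothetical `Crop` transform and attribute:

```python
# One list entry as it would come out of an old-style YAML config:
#   transform:
#     - Crop:
#         target_size: 224
t = {'Crop': {'target_size': 224}}

# Flatten it into the dict form that build(temp_dict, PIPELINES) expects.
temp_dict = dict(name=list(t.keys())[0])
for sub_cfg in t.values():
    if sub_cfg is not None:
        temp_dict.update(sub_cfg)

print(temp_dict)  # {'name': 'Crop', 'target_size': 224}
```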
The code also includes an ugly workaround for dealing with old format configuration files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py\":31-58", + "content": " - Resize: # A list\n attribute: 20\n every key of list will pass as the key name to build a module.\n XXX: will be improved in the future.\n Args:\n pipelines (list): List of transforms to compose.\n Returns:\n A compose object which is callable, __call__ for this Compose\n object will call each given :attr:`transforms` sequencely.\n \"\"\"\n def __init__(self, pipelines):\n #assert isinstance(pipelines, Sequence)\n self.pipelines = []\n for p in pipelines.values():\n if isinstance(p, dict):\n p = build(p, PIPELINES)\n self.pipelines.append(p)\n elif isinstance(p, list):\n for t in p:\n #XXX: to deal with old format cfg, ugly code here!\n temp_dict = dict(name=list(t.keys())[0])\n for all_sub_t in t.values():\n if all_sub_t is not None:\n temp_dict.update(all_sub_t) \n t = build(temp_dict, PIPELINES)" + }, + { + "comment": "The code defines a class with a `__call__` method and an append function for adding pipelines. The `__call__` method applies transformations to input data by iterating over the pipelines. If any pipeline fails, it logs the error and raises an exception. Pipelines can be either callable or dictionaries, but if not, a TypeError is raised.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py\":59-75", + "content": " self.pipelines.append(t)\n elif callable(p):\n self.pipelines.append(p)\n else:\n raise TypeError(f'pipelines must be callable or a dict,'\n f'but got {type(p)}')\n def __call__(self, data):\n for p in self.pipelines:\n try:\n data = p(data)\n except Exception as e:\n stack_info = traceback.format_exc()\n logger = get_logger(\"paddlevideo\")\n logger.info(\"fail to perform transform [{}] with error: \"\n \"{} and stack:\\n{}\".format(p, e, str(stack_info)))\n raise e\n return data" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d9632cda-1d99-44f9-bc5b-a3bddef2f276.json b/docs/doc/d9632cda-1d99-44f9-bc5b-a3bddef2f276.json new file mode 100644 index 000000000..0c8be1a98 --- /dev/null +++ b/docs/doc/d9632cda-1d99-44f9-bc5b-a3bddef2f276.json @@ -0,0 +1,75 @@ +{ + "summary": "This Python class initializes the CFBI model in PaddleVideo library for image segmentation and video processing using AI techniques, with instance-level attention via previous frame embeddings and labels.", + "details": [ + { + "comment": "This code is a Python class for the CFBI model in the PaddleVideo library. It initializes the model and inherits from the BaseSegment class, allowing it to use other classes like backbone, head, and loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":0-29", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom .utils import foreground2background, global_matching_for_eval, local_matching, calculate_attention_head_for_eval\nfrom ...registry import SEGMENT\nfrom .base import BaseSegment\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@SEGMENT.register()\nclass CFBI(BaseSegment):\n \"\"\"CFBI model framework.\"\"\"\n def __init__(self, backbone=None, head=None, loss=None):\n super().__init__(backbone, head, loss)" + }, + { + "comment": "This code defines a class with a `test_step` method that performs testing on input data. It initializes some parameters and returns None if there is no previous embedding. The backbone function extracts multiple frame embeddings, which are stored in the `current_frame_embedding` list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":30-55", + "content": " x1 = paddle.zeros([3, 1, 1, 1])\n self.bg_bias = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.fg_bias = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.epsilon = 1e-05\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\n \"\"\"\n self.test_mode = True\n ref_embeddings, ref_masks, prev_embedding, prev_mask, current_frame, pred_size, gt_ids = data_batch\n current_frame_embedding_4x, current_frame_embedding_8x, current_frame_embedding_16x, \\\n current_low_level = self.backbone(current_frame)\n current_frame_embedding = [\n current_frame_embedding_4x, current_frame_embedding_8x,\n current_frame_embedding_16x\n ]\n if prev_embedding is None:\n return None, current_frame_embedding" + }, + { + "comment": "The code is in PaddleVideo framework, and it contains an else block that executes when a condition is not met. The function first defines the shape of current_frame_embedding_4x. It then processes reference embeddings, previous embeddings, and current frame embedding with other parameters such as masks and IDs. It interpolates predictions and concatenates them along the specified axis. 
Finally, it applies softmax to all_pred on the specified axis before returning both all_pred and current_frame_embedding.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":56-83", + "content": " else:\n bs, c, h, w = current_frame_embedding_4x.shape\n tmp_dic, _ = self.before_seghead_process(\n ref_embeddings,\n prev_embedding,\n current_frame_embedding,\n ref_masks,\n prev_mask,\n gt_ids,\n current_low_level=current_low_level,\n )\n all_pred = []\n for i in range(bs):\n pred = tmp_dic[i]\n pred = F.interpolate(pred,\n size=[pred_size[0], pred_size[1]],\n mode='bilinear',\n align_corners=True)\n all_pred.append(pred)\n all_pred = paddle.concat(all_pred, axis=0)\n all_pred = F.softmax(all_pred, axis=1)\n return all_pred, current_frame_embedding\n def before_seghead_process(self,\n ref_frame_embeddings=None,\n previous_frame_embeddings=None," + }, + { + "comment": "The code initializes various constants and variables for the segmentation head process. It includes settings for matching, atroous rates, and parallel processing, as well as defining arrays for scale reference frame labels and previous frame labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":84-107", + "content": " current_frame_embeddings=None,\n ref_frame_labels=None,\n previous_frame_mask=None,\n gt_ids=None,\n current_low_level=None):\n \"\"\" process befor segmentation head\"\"\"\n TEST_GLOBAL_MATCHING_CHUNK = [4, 1, 1]\n TEST_GLOBAL_ATROUS_RATE = [2, 1, 1]\n TRAIN_LOCAL_ATROUS_RATE = [2, 1, 1]\n TEST_LOCAL_ATROUS_RATE = [2, 1, 1]\n MODEL_FLOAT16_MATCHING = False\n TEST_GLOBAL_MATCHING_MIN_PIXEL = 100\n MODEL_MULTI_LOCAL_DISTANCE = [[4, 8, 12, 16, 20, 24],\n [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]]\n TRAIN_LOCAL_PARALLEL = True\n TEST_LOCAL_PARALLEL = True\n MODEL_MATCHING_BACKGROUND = True\n MODEL_SEMANTIC_MATCHING_DIM = [32, 64, 128]\n dic_tmp = []\n boards = {}\n scale_ref_frame_labels = []\n scale_previous_frame_labels = []\n for current_frame_embedding in current_frame_embeddings:" + }, + { + "comment": "Resizing ref_frame_label and previous_frame_mask to match current frame size for nearest mode interpolation in PaddleVideo model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":108-126", + "content": " bs, c, h, w = current_frame_embedding.shape\n if not self.test_mode:\n raise NotImplementedError\n else:\n ref_frame_embeddings = list(zip(*ref_frame_embeddings))\n all_scale_ref_frame_label = []\n for ref_frame_label in ref_frame_labels:\n scale_ref_frame_label = paddle.cast(F.interpolate(\n paddle.cast(ref_frame_label, dtype=\"float32\"),\n size=(h, w),\n mode='nearest'),\n dtype=\"int32\")\n all_scale_ref_frame_label.append(scale_ref_frame_label)\n scale_ref_frame_labels.append(all_scale_ref_frame_label)\n scale_previous_frame_label = paddle.cast(F.interpolate(\n paddle.cast(previous_frame_mask, dtype=\"float32\"),\n size=(h, w),\n mode='nearest'),\n dtype=\"int32\")" + }, + { + "comment": "The code is iterating over the input data and for each frame, it prepares the current_frame_embedding and previous_frame_embedding by reshaping, unsqueezing, and extracting the specific frames from their respective arrays. 
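The final resize-and-normalise step of `test_step` is easy to reproduce in isolation; the tensor shapes below are invented for illustration only:

```python
import paddle
import paddle.nn.functional as F

# Hypothetical per-sample logits: [batch, num_objects, h/4, w/4]
pred = paddle.rand([1, 3, 120, 216])
pred_size = (480, 864)  # original frame resolution

# Upsample back to frame resolution, then turn logits into per-object probabilities.
pred = F.interpolate(pred, size=[pred_size[0], pred_size[1]],
                     mode='bilinear', align_corners=True)
probs = F.softmax(pred, axis=1)
print(probs.shape)  # [1, 3, 480, 864]
```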
It then adds these embeddings to separate lists for later use in calculating attention scores and computing cross-entropy loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":127-143", + "content": " scale_previous_frame_labels.append(scale_previous_frame_label)\n for n in range(bs):\n ref_obj_ids = paddle.reshape(\n paddle.cast(paddle.arange(0,\n np.array(gt_ids)[n] + 1),\n dtype=\"int32\"), [-1, 1, 1, 1])\n obj_num = ref_obj_ids.shape[0]\n low_level_feat = paddle.unsqueeze(current_low_level[n], axis=0)\n all_CE_input = []\n all_attention_head = []\n for scale_idx, current_frame_embedding, ref_frame_embedding, previous_frame_embedding, \\\n scale_ref_frame_label, scale_previous_frame_label in zip(range(3), \\\n current_frame_embeddings, ref_frame_embeddings, previous_frame_embeddings, \\\n scale_ref_frame_labels, scale_previous_frame_labels):\n #Prepare\n seq_current_frame_embedding = current_frame_embedding[n]\n seq_prev_frame_embedding = previous_frame_embedding[n]" + }, + { + "comment": "This code calculates the distance bias for each frame in a sequence and prepares it for matching. It checks if the object ID is greater than 0, then assigns the corresponding background or foreground distance bias. It also transposes the current frame embedding for matching in case it's not in test mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":144-163", + "content": " seq_previous_frame_label = paddle.cast(\n (paddle.cast(scale_previous_frame_label[n], dtype=\"int32\")\n == ref_obj_ids),\n dtype=\"float32\")\n if np.array(gt_ids)[n] > 0:\n dis_bias = paddle.concat([\n paddle.unsqueeze(self.bg_bias[scale_idx], axis=0),\n paddle.expand(\n paddle.unsqueeze(self.fg_bias[scale_idx], axis=0),\n [np.array(gt_ids)[n], -1, -1, -1])\n ],\n axis=0)\n else:\n dis_bias = paddle.unsqueeze(self.bg_bias[scale_idx], axis=0)\n #Global FG map\n matching_dim = MODEL_SEMANTIC_MATCHING_DIM[scale_idx]\n seq_current_frame_embedding_for_matching = paddle.transpose(\n seq_current_frame_embedding[:matching_dim], [1, 2, 0])\n if not self.test_mode:" + }, + { + "comment": "The code raises a NotImplementedError if the condition is met and otherwise creates variables for storing reference frame embeddings, labels, and sequence-specific values. It then iterates through the provided labels and embeddings to prepare them for use in the model's segmentation process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":164-183", + "content": " raise NotImplementedError\n else:\n all_scale_ref_frame_label = scale_ref_frame_label\n all_ref_frame_embedding = ref_frame_embedding\n all_reference_embeddings = []\n all_reference_labels = []\n seq_ref_frame_labels = []\n count = 0\n for idx in range(len(all_scale_ref_frame_label)):\n ref_frame_embedding = all_ref_frame_embedding[idx]\n scale_ref_frame_label = all_scale_ref_frame_label[idx]\n seq_ref_frame_embedding = ref_frame_embedding[n]\n seq_ref_frame_embedding = paddle.transpose(\n seq_ref_frame_embedding, [1, 2, 0])\n seq_ref_frame_label = paddle.cast(\n (paddle.cast(scale_ref_frame_label[n],\n dtype=\"int32\") == ref_obj_ids),\n dtype=\"float32\")" + }, + { + "comment": "This code appears to be part of a computer vision model. It is appending reference frame labels, transposing them, and adding the reference embeddings to a list. 
Then it calls a function called \"global_matching_fg\" with the reference embeddings, query embeddings, reference labels, number of chunks, distance bias, and atrous rate as arguments. The function is likely used for global matching evaluation in the context of this model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":184-199", + "content": " seq_ref_frame_labels.append(seq_ref_frame_label)\n seq_ref_frame_label = paddle.transpose(\n paddle.squeeze(seq_ref_frame_label, axis=1),\n [1, 2, 0])\n all_reference_embeddings.append(\n seq_ref_frame_embedding[:, :, :matching_dim])\n all_reference_labels.append(seq_ref_frame_label)\n global_matching_fg = global_matching_for_eval(\n all_reference_embeddings=all_reference_embeddings,\n query_embeddings=\n seq_current_frame_embedding_for_matching,\n all_reference_labels=all_reference_labels,\n n_chunks=TEST_GLOBAL_MATCHING_CHUNK[scale_idx],\n dis_bias=dis_bias,\n atrous_rate=TEST_GLOBAL_ATROUS_RATE[scale_idx],\n use_float16=MODEL_FLOAT16_MATCHING," + }, + { + "comment": "This code block prepares input for a local matching function to compare previous and current frames. It transposes the embeddings and labels, sets atrous rate based on test mode, and uses float16 if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":200-215", + "content": " atrous_obj_pixel_num=TEST_GLOBAL_MATCHING_MIN_PIXEL)\n # Local FG map\n seq_prev_frame_embedding_for_matching = paddle.transpose(\n seq_prev_frame_embedding[:matching_dim], [1, 2, 0])\n seq_previous_frame_label_for_matching = paddle.transpose(\n paddle.squeeze(seq_previous_frame_label, axis=1), [1, 2, 0])\n local_matching_fg = local_matching(\n prev_frame_embedding=seq_prev_frame_embedding_for_matching,\n query_embedding=seq_current_frame_embedding_for_matching,\n prev_frame_labels=seq_previous_frame_label_for_matching,\n multi_local_distance=MODEL_MULTI_LOCAL_DISTANCE[scale_idx],\n dis_bias=dis_bias,\n atrous_rate=TRAIN_LOCAL_ATROUS_RATE[scale_idx] if\n not self.test_mode else TEST_LOCAL_ATROUS_RATE[scale_idx],\n use_float16=MODEL_FLOAT16_MATCHING," + }, + { + "comment": "This code performs pixel-level matching and global/local background subtraction for image segmentation. It transposes and squeezes the global and local matching results, concatenates them with previous frame labels, and if using background modeling, computes global and local background maps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":216-236", + "content": " allow_downsample=False,\n allow_parallel=TRAIN_LOCAL_PARALLEL\n if not self.test_mode else TEST_LOCAL_PARALLEL)\n #Aggregate Pixel-level Matching\n to_cat_global_matching_fg = paddle.transpose(\n paddle.squeeze(global_matching_fg, axis=0), [2, 3, 0, 1])\n to_cat_local_matching_fg = paddle.transpose(\n paddle.squeeze(local_matching_fg, axis=0), [2, 3, 0, 1])\n all_to_cat = [\n to_cat_global_matching_fg, to_cat_local_matching_fg,\n seq_previous_frame_label\n ]\n #Global and Local BG map\n if MODEL_MATCHING_BACKGROUND:\n to_cat_global_matching_bg = foreground2background(\n to_cat_global_matching_fg,\n np.array(gt_ids)[n] + 1)\n reshaped_prev_nn_feature_n = paddle.unsqueeze(\n paddle.transpose(to_cat_local_matching_fg," + }, + { + "comment": "This code segment appears to be part of a computer vision model that handles segmentation for video frames. 
It seems to be working with object instances, their previous and current frame embeddings, and global/local matching backgrounds. The code is performing reshaping operations and expansions on tensors, and calculating local and global matching backgrounds for the current frame's object instance. Overall, it appears to be a complex segment of a larger AI-based video processing pipeline.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":237-255", + "content": " [0, 2, 3, 1]),\n axis=1)\n to_cat_local_matching_bg = foreground2background(\n reshaped_prev_nn_feature_n,\n np.array(gt_ids)[n] + 1)\n to_cat_local_matching_bg = paddle.squeeze(paddle.transpose(\n to_cat_local_matching_bg, [0, 4, 2, 3, 1]),\n axis=-1)\n all_to_cat += [\n to_cat_local_matching_bg, to_cat_global_matching_bg\n ]\n to_cat_current_frame_embedding = paddle.expand(\n paddle.unsqueeze(current_frame_embedding[n], axis=0),\n [obj_num, -1, -1, -1])\n to_cat_prev_frame_embedding = paddle.expand(\n paddle.unsqueeze(previous_frame_embedding[n], axis=0),\n [obj_num, -1, -1, -1])\n to_cat_prev_frame_embedding_fg = to_cat_prev_frame_embedding * seq_previous_frame_label" + }, + { + "comment": "This code calculates attention for instance-level using previous frame embeddings and labels. It concatenates current, previous frame embedding (for foreground and background), and then applies attention on all frames in non-test mode. In test mode, it raises a NotImplementedError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":256-278", + "content": " to_cat_prev_frame_embedding_bg = to_cat_prev_frame_embedding * (\n 1 - seq_previous_frame_label)\n all_to_cat += [\n to_cat_current_frame_embedding,\n to_cat_prev_frame_embedding_fg,\n to_cat_prev_frame_embedding_bg\n ]\n CE_input = paddle.concat(all_to_cat, axis=1)\n #Instance-level Attention\n if not self.test_mode:\n raise NotImplementedError\n else:\n attention_head = calculate_attention_head_for_eval(\n all_ref_frame_embedding,\n seq_ref_frame_labels,\n paddle.expand(\n paddle.unsqueeze(previous_frame_embedding[n],\n axis=0), [obj_num, -1, -1, -1]),\n seq_previous_frame_label,\n epsilon=self.epsilon)\n all_CE_input.append(CE_input)" + }, + { + "comment": "This code snippet is part of a machine learning model. It appends the \"attention_head\" to the list \"all_attention_head\", then passes the combined inputs along with \"low_level_feat\" to a \"head\" function, and appends its output to \"dic_tmp\". Finally, it returns both \"dic_tmp\" and \"boards\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/segment/cfbi.py\":279-285", + "content": " all_attention_head.append(attention_head)\n #Collaborative Ensembler\n pred = self.head(all_CE_input, all_attention_head, low_level_feat)\n dic_tmp.append(pred)\n return dic_tmp, boards" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d983f2b4-946d-4f7d-ab91-3cebf598f1b5.json b/docs/doc/d983f2b4-946d-4f7d-ab91-3cebf598f1b5.json new file mode 100644 index 000000000..808986174 --- /dev/null +++ b/docs/doc/d983f2b4-946d-4f7d-ab91-3cebf598f1b5.json @@ -0,0 +1,35 @@ +{ + "summary": "The \"InferModel\" class is a GPU-optimized inference function for generating action boundaries in videos using propositions and scoring functions. 
It calculates running averages of predictions, predicts video features, and saves results as proposals in 'results.json'.", + "details": [ + { + "comment": "The code defines a class called \"InferModel\" which implements the bmn infer function. It initializes the model with specified configuration and enables GPU usage if available. The class takes in a config file that specifies model, parameters files, GPU memory, device ID, thread count for nms, minimum prediction score threshold, and frame thread count. The code also switches on IR optimizations and enables memory optimization for efficient execution of the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py\":0-38", + "content": "\"\"\"\nppTSM InferModel\n\"\"\"\nimport sys\nimport numpy as np\nimport json\nimport pickle\nimport time\nsys.path.append('../')\nfrom utils.preprocess import get_images\nfrom utils.config_utils import parse_config\nfrom utils.process_result import process_proposal\nimport reader\nfrom paddle.inference import Config\nfrom paddle.inference import create_predictor\nclass InferModel(object):\n \"\"\"bmn infer\"\"\"\n def __init__(self, cfg, name='BMN'):\n name = name.upper()\n self.name = name\n model_file = cfg[name]['model_file']\n params_file = cfg[name]['params_file']\n gpu_mem = cfg[name]['gpu_mem']\n device_id = cfg[name]['device_id']\n self.nms_thread = cfg[name]['nms_thread']\n self.min_pred_score = cfg[name]['score_thread']\n self.min_frame_thread = cfg['COMMON']['fps']\n # model init\n config = Config(model_file, params_file)\n config.enable_use_gpu(gpu_mem, device_id)\n config.switch_ir_optim(True) # default true\n config.enable_memory_optim()\n # use zero copy" + }, + { + "comment": "The code initializes a predictor, sets input and output tensors for inference, and defines an \"infer\" method to perform inference. The \"generate_props\" function takes predictions, start and end timestamps, and generates properties based on the given parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py\":39-64", + "content": " config.switch_use_feed_fetch_ops(False)\n self.predictor = create_predictor(config)\n input_names = self.predictor.get_input_names()\n self.input_tensor = self.predictor.get_input_handle(input_names[0])\n output_names = self.predictor.get_output_names()\n self.output1_tensor = self.predictor.get_output_handle(output_names[0])\n self.output2_tensor = self.predictor.get_output_handle(output_names[1])\n self.output3_tensor = self.predictor.get_output_handle(output_names[2])\n def infer(self, input):\n \"\"\"infer\"\"\"\n self.input_tensor.copy_from_cpu(input)\n self.predictor.run()\n output1 = self.output1_tensor.copy_to_cpu()\n output2 = self.output2_tensor.copy_to_cpu()\n output3 = self.output3_tensor.copy_to_cpu()\n return output1, output2, output3\n def generate_props(self,\n pred_bmn,\n pred_start,\n pred_end,\n max_window=200,\n min_window=5):" + }, + { + "comment": "This code generates propositions for action boundaries in a video. It iterates through possible window sizes to find valid start and end indices, checks if start and end masks match, then computes the confidence score based on boundary scores and BNM score. 
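The proposal scoring loop in `generate_props` pairs candidate start/end positions and multiplies the three confidences; a toy NumPy sketch with random scores and a small window range:

```python
import numpy as np

T = 50                                   # temporal length
pred_start = np.random.rand(T)           # boundary probabilities
pred_end = np.random.rand(T)
pred_bmn = np.random.rand(T, T)          # pred_bmn[duration, start_index]
start_mask = (pred_start > 0.5).astype(float)
end_mask = (pred_end > 0.5).astype(float)

proposals = []
for duration in range(5, 20):            # min_window .. max_window
    for start in range(T):
        end = start + duration
        if end < T and start_mask[start] == 1 and end_mask[end] == 1:
            conf = pred_start[start] * pred_end[end] * pred_bmn[duration, start]
            proposals.append([start, end, conf])
print(len(proposals))
```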
The results are stored as a list of [xmin, xmax, confidence] values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py\":65-86", + "content": " \"\"\"generate_props\"\"\"\n video_len = min(pred_bmn.shape[-1],\n min(pred_start.shape[-1], pred_end.shape[-1]))\n pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :]\n start_mask = self.boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = self.boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_results = []\n for idx in range(min_window, max_window):\n for jdx in range(video_len):\n start_index = jdx\n end_index = start_index + idx\n if end_index < video_len and start_mask[\n start_index] == 1 and end_mask[end_index] == 1:\n xmin = start_index\n xmax = end_index\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]\n bmn_score = pred_bmn[idx, jdx]\n conf_score = xmin_score * xmax_score * bmn_score\n score_results.append([xmin, xmax, conf_score])" + }, + { + "comment": "The code defines three functions: \"action_detect.models.bmn_infer.py\" contains the \"score_results\", \"boundary_choose\", and \"predict\" functions. The \"score_results\" function returns a list of scores for each action. The \"boundary_choose\" function determines boundary scores based on peak, front, and back scores. It uses masks to identify relevant positions in the score list. Finally, the \"predict\" function initializes an infer reader, iterates through data, and gathers input data for prediction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py\":87-111", + "content": " return score_results\n def boundary_choose(self, score_list):\n \"\"\"boundary_choose\"\"\"\n max_score = max(score_list)\n mask_high = (score_list > max_score * 0.5)\n score_list = list(score_list)\n score_middle = np.array([0.0] + score_list + [0.0])\n score_front = np.array([0.0, 0.0] + score_list)\n score_back = np.array(score_list + [0.0, 0.0])\n mask_peak = ((score_middle > score_front) & (score_middle > score_back))\n mask_peak = mask_peak[1:-1]\n mask = (mask_high | mask_peak).astype('float32')\n return mask\n def predict(self, infer_config, material):\n \"\"\"predict\"\"\"\n infer_reader = reader.get_reader(self.name,\n 'infer',\n infer_config,\n material=material)\n feature_list = []\n for infer_iter, data in enumerate(infer_reader()):\n inputs = [items[0] for items in data]\n winds = [items[1] for items in data]" + }, + { + "comment": "This code performs a running average of predictions from a series of windows. It calculates the sum of each prediction for each window, divides it by the count of non-zero frames in that window, and stores the results in `sum_pred_bmn`, `sum_pred_sta`, and `sum_pred_end`. 
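The sliding-window averaging described here just accumulates per-frame sums and counts across overlapping windows; a simplified 1-D sketch (window and stride sizes are illustrative):

```python
import numpy as np

T, win, stride = 200, 64, 32             # illustrative sizes
sum_pred = np.zeros(T)
cnt = np.zeros(T)

for start in range(0, T - win + 1, stride):
    window_pred = np.random.rand(win)    # stand-in for one window's inference output
    sum_pred[start:start + win] += window_pred
    cnt[start:start + win] += 1

avg_pred = sum_pred / np.maximum(cnt, 1)  # per-frame running average
```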
Finally, it divides these sums by the corresponding counts to get the final predictions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py\":112-134", + "content": " feat_info = [items[2] for items in data]\n feature_T = feat_info[0][0]\n feature_N = feat_info[0][1]\n inputs = np.array(inputs)\n pred_bmn, pred_sta, pred_end = self.infer(inputs)\n if infer_iter == 0:\n sum_pred_bmn = np.zeros((2, feature_N, feature_T))\n sum_pred_sta = np.zeros((feature_T, ))\n sum_pred_end = np.zeros((feature_T, ))\n sum_pred_cnt = np.zeros((feature_T, ))\n for idx, sub_wind in enumerate(winds):\n sum_pred_bmn[:, :, sub_wind[0]:sub_wind[1]] += pred_bmn[idx]\n sum_pred_sta[sub_wind[0]:sub_wind[1]] += pred_sta[idx]\n sum_pred_end[sub_wind[0]:sub_wind[1]] += pred_end[idx]\n sum_pred_cnt[sub_wind[0]:sub_wind[1]] += np.ones(\n (sub_wind[1] - sub_wind[0], ))\n pred_bmn = sum_pred_bmn / sum_pred_cnt\n pred_sta = sum_pred_sta / sum_pred_cnt\n pred_end = sum_pred_end / sum_pred_cnt" + }, + { + "comment": "The code initializes an instance of the InferModel class from the given configuration file. It then predicts the video features by calling the model's predict method, passing the video features as input and returns the results in the form of proposals. The output is then saved to a JSON file named 'results.json'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py\":136-163", + "content": " score_result = self.generate_props(pred_bmn, pred_sta, pred_end)\n results = process_proposal(score_result, self.min_frame_thread,\n self.nms_thread, self.min_pred_score)\n return results\nif __name__ == \"__main__\":\n cfg_file = '/home/work/inference/configs/configs.yaml'\n cfg = parse_config(cfg_file)\n model = InferModel(cfg)\n imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238'\n # feature\n feature_path = imgs_path.replace(\"frames\", \"features\") + '.pkl'\n video_features = pickle.load(open(feature_path, 'rb'))\n t0 = time.time()\n outputs = model.predict(cfg, video_features)\n # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32))\n t1 = time.time()\n results = {'proposal': outputs}\n with open('results.json', 'w', encoding='utf-8') as f:\n data = json.dumps(results, indent=4, ensure_ascii=False)\n f.write(data)\n print('cost time = {} min'.format((t1 - t0) / 60.0))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d9841904-8c07-4146-a303-83733a517a10.json b/docs/doc/d9841904-8c07-4146-a303-83733a517a10.json new file mode 100644 index 000000000..399f4553e --- /dev/null +++ b/docs/doc/d9841904-8c07-4146-a303-83733a517a10.json @@ -0,0 +1,125 @@ +{ + "summary": "The code introduces a simplified 3D ResNet model in PaddleVideo, allowing for configurable parameters and options for non-local blocks and dilation values. The model is initialized with inflated 2D params, constructs layers, and can utilize pretrained weights.", + "details": [ + { + "comment": "The code is defining a function that creates a ConvBNLayer, which is a combination of convolution, normalization, and activation layers. It simplifies the usage of these layers in a convolutional neural network model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport warnings\nimport collections\nfrom itertools import repeat\nimport paddle\nfrom paddle import nn\ndef _ntuple(n):\n def parse(x):\n if isinstance(x, collections.abc.Iterable):\n return tuple(x)\n return tuple(repeat(x, n))\n return parse\n_triple = _ntuple(3)\nclass ConvBNLayer(nn.Layer):\n \"\"\"A conv block that bundles conv/norm/activation layers.\n This block simplifies the usage of convolution layers, which are commonly" + }, + { + "comment": "This code defines a Conv2D layer with additional features including automatic bias setting, spectral norm support, and more padding modes. It is used in building convolutional layers, normalization layers, and activation layers for ResNet3D backbones in PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":37-55", + "content": " used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).\n It is based upon three build methods: `build_conv_layer()`,\n `build_norm_layer()` and `build_activation_layer()`.\n Besides, we add some additional features in this module.\n 1. Automatically set `bias` of the conv layer.\n 2. Spectral norm is supported.\n 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only\n supports zero and circular padding, and we add \"reflect\" padding mode.\n Args:\n in_channels (int): Number of channels in the input feature map.\n Same as that in ``nn._ConvNd``.\n out_channels (int): Number of channels produced by the convolution.\n Same as that in ``nn._ConvNd``.\n kernel_size (int | tuple[int]): Size of the convolving kernel.\n Same as that in ``nn._ConvNd``.\n stride (int | tuple[int]): Stride of the convolution.\n Same as that in ``nn._ConvNd``." + }, + { + "comment": "This code defines a ConvBNLayer class, which is a 3D convolutional layer followed by batch normalization. It takes parameters such as in_channels, out_channels, kernel_size, padding, stride, dilation, groups, act (activation function), and bias. The constructor initializes the Conv3D layer and BatchNorm3D with the specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":56-88", + "content": " padding (int | tuple[int]): Zero-padding added to both sides of\n the input. Same as that in ``nn._ConvNd``.\n dilation (int | tuple[int]): Spacing between kernel elements.\n Same as that in ``nn._ConvNd``.\n groups (int): Number of blocked connections from input channels to\n output channels. 
Same as that in ``nn._ConvNd``.\n \"\"\"\n def __init__(\n self,\n in_channels,\n out_channels,\n kernel_size,\n padding=0,\n stride=1,\n dilation=1,\n groups=1,\n act=None,\n bias=None,\n ):\n super(ConvBNLayer, self).__init__()\n self._conv = nn.Conv3D(\n in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=padding,\n dilation=dilation,\n groups=groups,\n bias_attr=bias)\n self._batch_norm = nn.BatchNorm3D(out_channels, momentum=0.1)" + }, + { + "comment": "The code defines a Bottleneck3d class, which represents a bottleneck block for ResNet3D. It takes in input channels (inplanes), output channels (planes), spatial and temporal stride, dilation rate, downsample layer, inflate flag, and inflate style as arguments. The class has an act variable to store the activation function and defines a forward method that performs convolutions, batch normalization, and activation if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":89-114", + "content": " self.act = act\n if act is not None:\n self._act_op = nn.ReLU()\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self.act is not None:\n y = self._act_op(y)\n return y\nclass Bottleneck3d(nn.Layer):\n \"\"\"Bottleneck 3d block for ResNet3D.\n Args:\n inplanes (int): Number of channels for the input in first conv3d layer.\n planes (int): Number of channels produced by some norm/conv3d layers.\n spatial_stride (int): Spatial stride in the conv3d layer. Default: 1.\n temporal_stride (int): Temporal stride in the conv3d layer. Default: 1.\n dilation (int): Spacing between kernel elements. Default: 1.\n downsample (nn.Module | None): Downsample layer. Default: None.\n inflate (bool): Whether to inflate kernel. Default: True.\n inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the\n kernel sizes and padding strides for conv1 and conv2 in each block." + }, + { + "comment": "The code defines a ResNet3D block with various configurations including the number of input and output planes, spatial and temporal stride, dilation rate, downsampling method, inflation settings, and whether to apply non-local modules or not. The default configuration includes convolution, norm, and activation layers, as well as an option for using checkpoint to save memory at the cost of training speed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":115-139", + "content": " Default: '3x1x1'.\n non_local (bool): Determine whether to apply non-local module in this\n block. Default: False.\n non_local_cfg (dict): Config for non-local module. Default: ``dict()``.\n conv_cfg (dict): Config dict for convolution layer.\n Default: ``dict(type='Conv3d')``.\n norm_cfg (dict): Config for norm layers. required keys are ``type``,\n Default: ``dict(type='BN3d')``.\n act_cfg (dict): Config dict for activation layer.\n Default: ``dict(type='ReLU')``.\n with_cp (bool): Use checkpoint or not. Using checkpoint will save some\n memory while slowing down the training speed. 
Default: False.\n \"\"\"\n expansion = 4\n def __init__(self,\n inplanes,\n planes,\n spatial_stride=1,\n temporal_stride=1,\n dilation=1,\n downsample=None,\n inflate=True,\n inflate_style='3x1x1',\n non_local=False," + }, + { + "comment": "This code initializes an instance of a 3D ResNet backbone model with specified parameters, including planes, spatial and temporal strides, dilation, inflate style, norm and conv configurations, whether to use non-local blocks, and more. It sets various attributes based on the input and instantiates a Conv3d layer for the first block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":140-170", + "content": " non_local_cfg=dict(),\n conv_cfg=dict(type='Conv3d'),\n norm_cfg=dict(type='BN3d'),\n act_cfg=dict(type='ReLU'),\n with_cp=False):\n super().__init__()\n assert inflate_style in ['3x1x1', '3x3x3']\n self.inplanes = inplanes\n self.planes = planes\n self.spatial_stride = spatial_stride\n self.temporal_stride = temporal_stride\n self.dilation = dilation\n self.inflate = inflate\n self.inflate_style = inflate_style\n self.norm_cfg = norm_cfg\n self.conv_cfg = conv_cfg\n self.act_cfg = act_cfg\n self.with_cp = with_cp\n self.non_local = non_local\n self.non_local_cfg = non_local_cfg\n self.conv1_stride_s = 1\n self.conv2_stride_s = spatial_stride\n self.conv1_stride_t = 1\n self.conv2_stride_t = temporal_stride\n if self.inflate:\n if inflate_style == '3x1x1':\n conv1_kernel_size = (3, 1, 1)\n conv1_padding = (1, 0, 0)" + }, + { + "comment": "Code is setting up convolutional layers for a ResNet3D model. It creates ConvBNLayer instances with different kernel sizes and padding based on the dilation value. These layers are used for temporal, spatial, and spatial dimensions depending on the dilation value provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":171-197", + "content": " conv2_kernel_size = (1, 3, 3)\n conv2_padding = (0, dilation, dilation)\n else:\n conv1_kernel_size = (1, 1, 1)\n conv1_padding = (0, 0, 0)\n conv2_kernel_size = (3, 3, 3)\n conv2_padding = (1, dilation, dilation)\n else:\n conv1_kernel_size = (1, 1, 1)\n conv1_padding = (0, 0, 0)\n conv2_kernel_size = (1, 3, 3)\n conv2_padding = (0, dilation, dilation)\n self.conv1 = ConvBNLayer(\n in_channels=inplanes,\n out_channels=planes,\n kernel_size=conv1_kernel_size,\n stride=(self.conv1_stride_t, self.conv1_stride_s,\n self.conv1_stride_s),\n padding=conv1_padding,\n bias=False,\n act='relu')\n self.conv2 = ConvBNLayer(\n in_channels=planes,\n out_channels=planes,\n kernel_size=conv2_kernel_size,\n stride=(self.conv2_stride_t, self.conv2_stride_s," + }, + { + "comment": "This code defines a ResNet3D block with ConvBNLayer, downsample layer, and ReLU activation. The forward method applies the layers sequentially, optionally performs downsampling, and adds the identity connection before returning the output. 
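The kernel/padding choices driven by `inflate` and `inflate_style` reduce to a small lookup; the sketch below mirrors the branches shown above:

```python
def bottleneck_kernels(inflate=True, inflate_style='3x1x1', dilation=1):
    """Return (conv1_kernel, conv1_padding, conv2_kernel, conv2_padding)."""
    if inflate and inflate_style == '3x1x1':
        # Temporal kernel lives in conv1, spatial kernel in conv2.
        return (3, 1, 1), (1, 0, 0), (1, 3, 3), (0, dilation, dilation)
    if inflate:  # '3x3x3'
        return (1, 1, 1), (0, 0, 0), (3, 3, 3), (1, dilation, dilation)
    # No inflation: purely spatial 2D-style kernels.
    return (1, 1, 1), (0, 0, 0), (1, 3, 3), (0, dilation, dilation)

print(bottleneck_kernels())
print(bottleneck_kernels(inflate_style='3x3x3'))
```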
Non-local blocks can be applied if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":198-238", + "content": " self.conv2_stride_s),\n padding=conv2_padding,\n dilation=(1, dilation, dilation),\n bias=False,\n act='relu')\n self.conv3 = ConvBNLayer(\n in_channels=planes,\n out_channels=planes * self.expansion,\n kernel_size=1,\n bias=False,\n act=None,\n )\n self.downsample = downsample\n self.relu = nn.ReLU()\n def forward(self, x):\n \"\"\"Defines the computation performed at every call.\"\"\"\n def _inner_forward(x):\n \"\"\"Forward wrapper for utilizing checkpoint.\"\"\"\n identity = x\n out = self.conv1(x)\n out = self.conv2(out)\n out = self.conv3(out)\n if self.downsample is not None:\n identity = self.downsample(x)\n out = out + identity\n return out\n out = _inner_forward(x)\n out = self.relu(out)\n if self.non_local:\n out = self.non_local_block(out)\n return out" + }, + { + "comment": "The code defines a ResNet 3D backbone, with options for depth (18, 34, 50, 101, or 152), pretrained model name, number of stages for each res layer, loading of pretrained 2D model, input channel features, output feature indices, number of stages, and spatial and temporal strides.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":241-262", + "content": "class ResNet3d(nn.Layer):\n \"\"\"ResNet 3d backbone.\n Args:\n depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.\n pretrained (str | None): Name of pretrained model.\n stage_blocks (tuple | None): Set number of stages for each res layer.\n Default: None.\n pretrained2d (bool): Whether to load pretrained 2D model.\n Default: True.\n in_channels (int): Channel num of input features. Default: 3.\n base_channels (int): Channel num of stem output features. Default: 64.\n out_indices (Sequence[int]): Indices of output feature. Default: (3, ).\n num_stages (int): Resnet stages. Default: 4.\n spatial_strides (Sequence[int]):\n Spatial strides of residual blocks of each stage.\n Default: ``(1, 2, 2, 2)``.\n temporal_strides (Sequence[int]):\n Temporal strides of residual blocks of each stage.\n Default: ``(1, 1, 1, 1)``.\n dilations (Sequence[int]): Dilation of each stage.\n Default: ``(1, 1, 1, 1)``." + }, + { + "comment": "This code defines the parameters for ResNet3D backbone model including kernel sizes, stride values, and inflation dimensions. It also sets the default configuration for convolutional layers and normalization layers. The inflation style determines the kernel sizes for conv1 and conv2 in each block based on the given string input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":263-281", + "content": " conv1_kernel (Sequence[int]): Kernel size of the first conv layer.\n Default: ``(3, 7, 7)``.\n conv1_stride_s (int): Spatial stride of the first conv layer.\n Default: 2.\n conv1_stride_t (int): Temporal stride of the first conv layer.\n Default: 1.\n pool1_stride_s (int): Spatial stride of the first pooling layer.\n Default: 2.\n pool1_stride_t (int): Temporal stride of the first pooling layer.\n Default: 1.\n with_pool2 (bool): Whether to use pool2. Default: True.\n inflate (Sequence[int]): Inflate Dims of each block.\n Default: (1, 1, 1, 1).\n inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the\n kernel sizes and padding strides for conv1 and conv2 in each block.\n Default: '3x1x1'.\n conv_cfg (dict): Config for conv layers. 
required keys are ``type``\n Default: ``dict(type='Conv3d')``.\n norm_cfg (dict): Config for norm layers. required keys are ``type`` and" + }, + { + "comment": "This code defines the parameters and architecture settings for ResNet3D model in PaddleVideo. It includes options such as backbone type, stages, activation layer, normalization mode, checkpoint usage, non-local module application, and residual block initialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":282-301", + "content": " ``requires_grad``.\n Default: ``dict(type='BN3d', requires_grad=True)``.\n act_cfg (dict): Config dict for activation layer.\n Default: ``dict(type='ReLU', inplace=True)``.\n norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze\n running stats (mean and var). Default: False.\n with_cp (bool): Use checkpoint or not. Using checkpoint will save some\n memory while slowing down the training speed. Default: False.\n non_local (Sequence[int]): Determine whether to apply non-local module\n in the corresponding block of each stages. Default: (0, 0, 0, 0).\n non_local_cfg (dict): Config for non-local module. Default: ``dict()``.\n zero_init_residual (bool):\n Whether to use zero initialization for residual block,\n Default: True.\n kwargs (dict, optional): Key arguments for \"make_res_layer\".\n \"\"\"\n arch_settings = {\n 50: (Bottleneck3d, (3, 4, 6, 3)),\n 101: (Bottleneck3d, (3, 4, 23, 3))," + }, + { + "comment": "This code defines a ResNet3D backbone model with customizable parameters such as depth, stage blocks, and more. It uses Bottleneck3d layers and allows for pre-trained 2D weights usage. The model is designed for processing 4D data (spatial and temporal dimensions).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":302-330", + "content": " 152: (Bottleneck3d, (3, 8, 36, 3))\n }\n def __init__(self,\n depth,\n stage_blocks=None,\n pretrained2d=True,\n in_channels=3,\n num_stages=4,\n base_channels=64,\n out_indices=(3, ),\n spatial_strides=(1, 2, 2, 2),\n temporal_strides=(1, 1, 1, 1),\n dilations=(1, 1, 1, 1),\n conv1_kernel=(3, 7, 7),\n conv1_stride_s=2,\n conv1_stride_t=1,\n pool1_stride_s=2,\n pool1_stride_t=1,\n with_pool1=True,\n with_pool2=True,\n inflate=(1, 1, 1, 1),\n inflate_style='3x1x1',\n conv_cfg=dict(type='Conv3d'),\n norm_cfg=dict(type='BN3d', requires_grad=True),\n act_cfg=dict(type='ReLU', inplace=True),\n norm_eval=False,\n with_cp=False,\n non_local=(0, 0, 0, 0)," + }, + { + "comment": "This function is initializing a ResNet3D model with specified depth, input channels, base channels, number of stages, stage blocks, output indices, spatial and temporal strides, dilations, and convolution kernel parameters. It raises an error if the provided depth does not match any of the known configurations or if the output indices exceed the number of stages. If the stage_blocks are specified, it also checks that their length matches the number of stages. 
The class inherits from a superclass.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":331-356", + "content": " non_local_cfg=dict(),\n zero_init_residual=True,\n **kwargs):\n super().__init__()\n if depth not in self.arch_settings:\n raise KeyError(f'invalid depth {depth} for resnet')\n self.depth = depth\n self.pretrained2d = pretrained2d\n self.in_channels = in_channels\n self.base_channels = base_channels\n self.num_stages = num_stages\n assert 1 <= num_stages <= 4\n self.stage_blocks = stage_blocks\n self.out_indices = out_indices\n assert max(out_indices) < num_stages\n self.spatial_strides = spatial_strides\n self.temporal_strides = temporal_strides\n self.dilations = dilations\n assert len(spatial_strides) == len(temporal_strides) == len(\n dilations) == num_stages\n if self.stage_blocks is not None:\n assert len(self.stage_blocks) == num_stages\n self.conv1_kernel = conv1_kernel\n self.conv1_stride_s = conv1_stride_s\n self.conv1_stride_t = conv1_stride_t" + }, + { + "comment": "This code sets various attributes for a ResNet3D model. It initializes strides, determines if pooling layers are used in certain stages, inflates stages based on input, and configures convolutional, normalization, and activation settings. It also defines the block architecture and stage blocks according to the provided depth. Finally, it creates stem and residual layers based on the configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":357-386", + "content": " self.pool1_stride_s = pool1_stride_s\n self.pool1_stride_t = pool1_stride_t\n self.with_pool1 = with_pool1\n self.with_pool2 = with_pool2\n self.stage_inflations = _ntuple(num_stages)(inflate)\n self.non_local_stages = _ntuple(num_stages)(non_local)\n self.inflate_style = inflate_style\n self.conv_cfg = conv_cfg\n self.norm_cfg = norm_cfg\n self.act_cfg = act_cfg\n self.norm_eval = norm_eval\n self.with_cp = with_cp\n self.zero_init_residual = zero_init_residual\n self.block, stage_blocks = self.arch_settings[depth]\n if self.stage_blocks is None:\n self.stage_blocks = stage_blocks[:num_stages]\n self.inplanes = self.base_channels\n self.non_local_cfg = non_local_cfg\n self._make_stem_layer()\n self.res_layers = []\n for i, num_blocks in enumerate(self.stage_blocks):\n spatial_stride = spatial_strides[i]\n temporal_stride = temporal_strides[i]\n dilation = dilations[i]" + }, + { + "comment": "This code defines a function that adds ResNet3D layers with specified block, input and output planes, number of blocks, spatial and temporal strides, dilation, norm/conv cfg, non-local stages, inflations, style, and with_cp. 
It updates inplanes and feat_dim accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":387-411", + "content": " planes = self.base_channels * 2**i\n res_layer = self.make_res_layer(\n self.block,\n self.inplanes,\n planes,\n num_blocks,\n spatial_stride=spatial_stride,\n temporal_stride=temporal_stride,\n dilation=dilation,\n norm_cfg=self.norm_cfg,\n conv_cfg=self.conv_cfg,\n act_cfg=self.act_cfg,\n non_local=self.non_local_stages[i],\n non_local_cfg=self.non_local_cfg,\n inflate=self.stage_inflations[i],\n inflate_style=self.inflate_style,\n with_cp=with_cp,\n **kwargs)\n self.inplanes = planes * self.block.expansion\n layer_name = f'layer{i + 1}'\n self.add_sublayer(layer_name, res_layer)\n self.res_layers.append(layer_name)\n self.feat_dim = self.block.expansion * self.base_channels * 2**(\n len(self.stage_blocks) - 1)" + }, + { + "comment": "The function \"make_res_layer\" builds a residual layer for ResNet3D. It takes parameters such as block, inplanes, planes, blocks, spatial_stride, temporal_stride, and other optional settings like non_local, norm_cfg, act_cfg, conv_cfg, with_cp to create the residual module. The function constructs the layer based on the input arguments and returns it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":413-439", + "content": " @staticmethod\n def make_res_layer(block,\n inplanes,\n planes,\n blocks,\n spatial_stride=1,\n temporal_stride=1,\n dilation=1,\n inflate=1,\n inflate_style='3x1x1',\n non_local=0,\n non_local_cfg=dict(),\n norm_cfg=None,\n act_cfg=None,\n conv_cfg=None,\n with_cp=False,\n **kwargs):\n \"\"\"Build residual layer for ResNet3D.\n Args:\n block (nn.Module): Residual module to be built.\n inplanes (int): Number of channels for the input feature\n in each block.\n planes (int): Number of channels for the output feature\n in each block.\n blocks (int): Number of residual blocks.\n spatial_stride (int | Sequence[int]): Spatial strides in" + }, + { + "comment": "This function defines the ResNet3D backbone model, allowing customization through parameters such as residual and conv layers, temporal stride, dilation, inflate, inflate_style, non_local modules, conv_cfg, norm_cfg, and act_cfg. Default values are provided for each parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":440-456", + "content": " residual and conv layers. Default: 1.\n temporal_stride (int | Sequence[int]): Temporal strides in\n residual and conv layers. Default: 1.\n dilation (int): Spacing between kernel elements. Default: 1.\n inflate (int | Sequence[int]): Determine whether to inflate\n for each block. Default: 1.\n inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines\n the kernel sizes and padding strides for conv1 and conv2\n in each block. Default: '3x1x1'.\n non_local (int | Sequence[int]): Determine whether to apply\n non-local module in the corresponding block of each stages.\n Default: 0.\n non_local_cfg (dict): Config for non-local module.\n Default: ``dict()``.\n conv_cfg (dict | None): Config for norm layers. Default: None.\n norm_cfg (dict | None): Config for norm layers. Default: None.\n act_cfg (dict | None): Config for activate layers. Default: None." + }, + { + "comment": "This function creates a residual layer based on the given configuration. 
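As a side note on the channel bookkeeping in the stage loop quoted above, the following standalone sketch (plain Python, no Paddle required; the numbers assume the depth-50 setting with `stage_blocks=(3, 4, 6, 3)`, `base_channels=64` and a Bottleneck expansion factor of 4, as listed in the summarized code) traces how `planes`, `inplanes` and `feat_dim` evolve per stage:

```python
# Standalone illustration of the per-stage channel bookkeeping described above.
# Assumed depth-50 configuration: stage_blocks=(3, 4, 6, 3), base_channels=64,
# Bottleneck expansion factor 4.
base_channels = 64
expansion = 4
stage_blocks = (3, 4, 6, 3)
spatial_strides = (1, 2, 2, 2)

inplanes = base_channels
for i, num_blocks in enumerate(stage_blocks):
    planes = base_channels * 2 ** i          # bottleneck width of this stage
    out_channels = planes * expansion        # width after the expansion conv
    print(f"layer{i + 1}: in={inplanes:4d} planes={planes:4d} "
          f"out={out_channels:4d} blocks={num_blocks} s_stride={spatial_strides[i]}")
    inplanes = out_channels                  # the next stage consumes this width

feat_dim = expansion * base_channels * 2 ** (len(stage_blocks) - 1)
print("feat_dim =", feat_dim)                # 2048 for the depth-50 setting
```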
It uses inflation and non-local blocks for the specified number of blocks, and optionally adds downsampling if there is a change in spatial or in/out planes. The output is a neural network module (nn.Module).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":457-480", + "content": " with_cp (bool | None): Use checkpoint or not. Using checkpoint\n will save some memory while slowing down the training speed.\n Default: False.\n Returns:\n nn.Module: A residual layer for the given config.\n \"\"\"\n inflate = inflate if not isinstance(inflate,\n int) else (inflate, ) * blocks\n non_local = non_local if not isinstance(non_local,\n int) else (non_local, ) * blocks\n assert len(inflate) == blocks and len(non_local) == blocks\n downsample = None\n if spatial_stride != 1 or inplanes != planes * block.expansion:\n downsample = ConvBNLayer(\n in_channels=inplanes,\n out_channels=planes * block.expansion,\n kernel_size=1,\n stride=(temporal_stride, spatial_stride, spatial_stride),\n bias=False,\n act=None)\n layers = []\n layers.append(" + }, + { + "comment": "The code defines a ResNet3D architecture with multiple blocks, each with configurable parameters such as spatial and temporal stride, dilation, downsample, inflate style, non-local operation, norm/conv configuration, activation function, and whether to include channel pruning. The inplanes are updated based on the expansion factor of the block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":481-508", + "content": " block(\n inplanes,\n planes,\n spatial_stride=spatial_stride,\n temporal_stride=temporal_stride,\n dilation=dilation,\n downsample=downsample,\n inflate=(inflate[0] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[0] == 1),\n non_local_cfg=non_local_cfg,\n norm_cfg=norm_cfg,\n conv_cfg=conv_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp,\n **kwargs))\n inplanes = planes * block.expansion\n for i in range(1, blocks):\n layers.append(\n block(\n inplanes,\n planes,\n spatial_stride=1,\n temporal_stride=1,\n dilation=dilation,\n inflate=(inflate[i] == 1),\n inflate_style=inflate_style,\n non_local=(non_local[i] == 1)," + }, + { + "comment": "This code defines a function to inflate a 3D convolutional neural network module from a pre-trained 2D model. It takes the destination conv3d module, state dict of the 2D model, name of the corresponding conv module in the 2D model, and list of inflated parameters as inputs. 
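A rough numpy illustration of that inflation step (a shape-level sketch of the idea only, not the library routine itself): the 2-D kernel gains a temporal axis, is repeated `kernel_t` times and divided by `kernel_t`, so a static clip initially produces the same response the 2-D filter gave a single frame.

```python
import numpy as np

# Illustrative only: mimics the 2D -> 3D weight inflation described above
# (add a temporal axis, broadcast to kernel_t, divide by kernel_t).
conv2d_weight = np.random.randn(64, 3, 7, 7).astype("float32")  # (out, in, h, w)
kernel_t = 3                                                     # temporal kernel size

inflated = np.repeat(conv2d_weight[:, :, None, :, :], kernel_t, axis=2) / kernel_t
print(inflated.shape)  # (64, 3, 3, 7, 7) -> (out, in, t, h, w)

# Sanity check: summing over the temporal axis recovers the original 2D kernel,
# so the inflated network starts out equivalent to the 2D model on static input.
assert np.allclose(inflated.sum(axis=2), conv2d_weight, atol=1e-5)
```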
The function extracts the weight from the 2D model's state dict and reshapes it to fit the 3D convolution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":509-536", + "content": " non_local_cfg=non_local_cfg,\n norm_cfg=norm_cfg,\n conv_cfg=conv_cfg,\n act_cfg=act_cfg,\n with_cp=with_cp,\n **kwargs))\n return nn.Sequential(*layers)\n @staticmethod\n def _inflate_conv_params(conv3d, state_dict_2d, module_name_2d,\n inflated_param_names):\n \"\"\"Inflate a conv module from 2d to 3d.\n Args:\n conv3d (nn.Module): The destination conv3d module.\n state_dict_2d (OrderedDict): The state dict of pretrained 2d model.\n module_name_2d (str): The name of corresponding conv module in the\n 2d model.\n inflated_param_names (list[str]): List of parameters that have been\n inflated.\n \"\"\"\n weight_2d_name = module_name_2d + '.weight'\n conv2d_weight = state_dict_2d[weight_2d_name]\n kernel_t = conv3d.weight.data.shape[2]\n new_weight = conv2d_weight.data.unsqueeze(2).expand_as(" + }, + { + "comment": "This code inflates 2D convolutional and Batch Normalization (BN) parameters to 3D for a ResNet3D backbone. It copies the weights and biases, if present, from the 2D state dictionary to their corresponding 3D modules and updates the list of inflated parameter names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":537-560", + "content": " conv3d.weight) / kernel_t\n conv3d.weight.data.copy_(new_weight)\n inflated_param_names.append(weight_2d_name)\n if getattr(conv3d, 'bias') is not None:\n bias_2d_name = module_name_2d + '.bias'\n conv3d.bias.data.copy_(state_dict_2d[bias_2d_name])\n inflated_param_names.append(bias_2d_name)\n @staticmethod\n def _inflate_bn_params(bn3d, state_dict_2d, module_name_2d,\n inflated_param_names):\n \"\"\"Inflate a norm module from 2d to 3d.\n Args:\n bn3d (nn.Module): The destination bn3d module.\n state_dict_2d (OrderedDict): The state dict of pretrained 2d model.\n module_name_2d (str): The name of corresponding bn module in the\n 2d model.\n inflated_param_names (list[str]): List of parameters that have been\n inflated.\n \"\"\"\n for param_name, param in bn3d.named_parameters():\n param_2d_name = f'{module_name_2d}.{param_name}'" + }, + { + "comment": "This code snippet is from the PaddleVideo library, specifically the ResNet3D backbone. It is loading and inflating parameters from a state dictionary, ensuring compatibility between 2D and 3D parameter shapes. The function _make_stem_layer constructs a stem layer consisting of a convolution, normalization, activation, and pooling module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":561-585", + "content": " param_2d = state_dict_2d[param_2d_name]\n if param.data.shape != param_2d.shape:\n warnings.warn(f'The parameter of {module_name_2d} is not'\n 'loaded due to incompatible shapes. 
')\n return\n param.data.copy_(param_2d)\n inflated_param_names.append(param_2d_name)\n for param_name, param in bn3d.named_buffers():\n param_2d_name = f'{module_name_2d}.{param_name}'\n # some buffers like num_batches_tracked may not exist in old\n # checkpoints\n if param_2d_name in state_dict_2d:\n param_2d = state_dict_2d[param_2d_name]\n param.data.copy_(param_2d)\n inflated_param_names.append(param_2d_name)\n def _make_stem_layer(self):\n \"\"\"Construct the stem layers consists of a conv+norm+act module and a\n pooling layer.\"\"\"\n self.conv1 = ConvBNLayer(\n in_channels=self.in_channels,\n out_channels=self.base_channels," + }, + { + "comment": "This code is initializing a ResNet3D model with convolutional and pooling layers. The convolution layer has specified kernel size, stride, padding, and uses ReLU activation function. The max pooling layer has varying sizes for temporal, spatial dimensions. This model also includes optional pool1 and can be initialized with pretrained weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":586-619", + "content": " kernel_size=self.conv1_kernel,\n stride=(self.conv1_stride_t, self.conv1_stride_s,\n self.conv1_stride_s),\n padding=tuple([(k - 1) // 2 for k in _triple(self.conv1_kernel)]),\n bias=False,\n act=\"relu\")\n self.maxpool = nn.MaxPool3D(\n kernel_size=(1, 3, 3),\n stride=(self.pool1_stride_t, self.pool1_stride_s,\n self.pool1_stride_s),\n padding=(0, 1, 1))\n self.pool2 = nn.MaxPool3D(kernel_size=(2, 1, 1), stride=(2, 1, 1))\n @staticmethod\n def _init_weights(self, pretrained=None):\n pass\n def init_weights(self, pretrained=None):\n self._init_weights(self, pretrained)\n def forward(self, x):\n \"\"\"Defines the computation performed at every call.\n Args:\n x (torch.Tensor): The input data.\n Returns:\n torch.Tensor: The feature of the input\n samples extracted by the backbone.\n \"\"\"\n x = self.conv1(x)\n if self.with_pool1:" + }, + { + "comment": "This code defines a ResNet-3D backbone model with residual blocks, max pooling layers, and optionally a second pooling layer. The train function sets the model to training mode and evaluates batch normalization layers if self.norm_eval is True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/resnet3d.py\":620-640", + "content": " x = self.maxpool(x)\n outs = []\n for i, layer_name in enumerate(self.res_layers):\n res_layer = getattr(self, layer_name)\n x = res_layer(x)\n if i == 0 and self.with_pool2:\n x = self.pool2(x)\n if i in self.out_indices:\n outs.append(x)\n if len(outs) == 1:\n return outs[0]\n return tuple(outs)\n def train(self, mode=True):\n \"\"\"Set the optimization status when training.\"\"\"\n super().train()\n if mode and self.norm_eval:\n for m in self.modules():\n if isinstance(m, paddle.nn._BatchNormBase):\n m.eval()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/da8df420-ae41-4731-8dc2-585281aa76be.json b/docs/doc/da8df420-ae41-4731-8dc2-585281aa76be.json new file mode 100644 index 000000000..308c13973 --- /dev/null +++ b/docs/doc/da8df420-ae41-4731-8dc2-585281aa76be.json @@ -0,0 +1,35 @@ +{ + "summary": "The code deploys Paddle Serving, a model serving framework in PaddleVideo, through Docker on Linux with GPU and CPU options. 
It uses paddle-video-deploy for model conversion and includes client scripts, environment setup, and troubleshooting for missing libraries.", + "details": [ + { + "comment": "This code snippet provides instructions for deploying Paddle Serving, a model serving framework, as part of the PaddleVideo codebase. It explains that this is done through Docker and covers GPU-accelerated and CPU-based installations, including the necessary package installation commands. The code assumes Linux platform, and does not support Windows at present.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme.md\":0-31", + "content": "\u7b80\u4f53\u4e2d\u6587 | [English](./readme_en.md)\n# \u6a21\u578b\u670d\u52a1\u5316\u90e8\u7f72\n## \u7b80\u4ecb\n[Paddle Serving](https://github.com/PaddlePaddle/Serving) \u65e8\u5728\u5e2e\u52a9\u6df1\u5ea6\u5b66\u4e60\u5f00\u53d1\u8005\u8f7b\u677e\u90e8\u7f72\u5728\u7ebf\u9884\u6d4b\u670d\u52a1\uff0c\u652f\u6301\u4e00\u952e\u90e8\u7f72\u5de5\u4e1a\u7ea7\u7684\u670d\u52a1\u80fd\u529b\u3001\u5ba2\u6237\u7aef\u548c\u670d\u52a1\u7aef\u4e4b\u95f4\u9ad8\u5e76\u53d1\u548c\u9ad8\u6548\u901a\u4fe1\u3001\u5e76\u652f\u6301\u591a\u79cd\u7f16\u7a0b\u8bed\u8a00\u5f00\u53d1\u5ba2\u6237\u7aef\u3002\n\u8be5\u90e8\u5206\u4ee5 HTTP \u9884\u6d4b\u670d\u52a1\u90e8\u7f72\u4e3a\u4f8b\uff0c\u4ecb\u7ecd\u600e\u6837\u5728 PaddleVideo \u4e2d\u4f7f\u7528 PaddleServing \u90e8\u7f72\u6a21\u578b\u670d\u52a1\u3002\u76ee\u524d\u53ea\u652f\u6301 Linux \u5e73\u53f0\u90e8\u7f72\uff0c\u6682\u4e0d\u652f\u6301 Windows \u5e73\u53f0\u3002\n## Serving \u5b89\u88c5\nServing \u5b98\u7f51\u63a8\u8350\u4f7f\u7528 docker \u5b89\u88c5\u5e76\u90e8\u7f72 Serving \u73af\u5883\u3002\u9996\u5148\u9700\u8981\u62c9\u53d6 docker \u73af\u5883\u5e76\u521b\u5efa\u57fa\u4e8e Serving \u7684 docker\u3002\n```bash\n# \u542f\u52a8GPU docker\ndocker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel\nnvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash\nnvidia-docker exec -it test bash\n# \u542f\u52a8CPU docker\ndocker pull paddlepaddle/serving:0.7.0-devel\ndocker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash\ndocker exec -it test bash\n```\n\u8fdb\u5165 docker \u540e\uff0c\u9700\u8981\u5b89\u88c5 Serving \u76f8\u5173\u7684 python \u5305\u3002\n```bash\npython3.7 -m pip install paddle-serving-client==0.7.0\npython3.7 -m pip install paddle-serving-app==0.7.0\n#\u82e5\u4e3aCPU\u90e8\u7f72\u73af\u5883:\npython3.7 -m pip install paddle-serving-server==0.7.0 # CPU\npython3.7 -m pip install paddlepaddle==2.2.0 # CPU" + }, + { + "comment": "This code provides instructions for installing different versions of PaddleServing Server with various GPU environments and specifies the required pip commands. It also mentions an alternative method to speed up the installation process by changing the source. 
Furthermore, it highlights how to convert a PP-TSM inference model into Serving format for deploying behavior recognition service.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme.md\":33-63", + "content": "#\u82e5\u4e3aGPU\u90e8\u7f72\u73af\u5883\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6\npython3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2\n#\u5176\u4ed6GPU\u73af\u5883\u9700\u8981\u786e\u8ba4\u73af\u5883\u518d\u9009\u62e9\u6267\u884c\u54ea\u4e00\u6761\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6\npython3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8\n```\n* \u5982\u679c\u5b89\u88c5\u901f\u5ea6\u592a\u6162\uff0c\u53ef\u4ee5\u901a\u8fc7 `-i https://pypi.tuna.tsinghua.edu.cn/simple` \u66f4\u6362\u6e90\uff0c\u52a0\u901f\u5b89\u88c5\u8fc7\u7a0b\u3002\n* \u66f4\u591a\u73af\u5883\u548c\u5bf9\u5e94\u7684\u5b89\u88c5\u5305\u8be6\u89c1\uff1ahttps://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md\n## \u884c\u4e3a\u8bc6\u522b\u670d\u52a1\u90e8\u7f72\n### \u6a21\u578b\u8f6c\u6362\n\u4f7f\u7528 PaddleServing \u505a\u670d\u52a1\u5316\u90e8\u7f72\u65f6\uff0c\u9700\u8981\u5c06\u4fdd\u5b58\u7684 inference \u6a21\u578b\u8f6c\u6362\u4e3a Serving \u6a21\u578b\u3002\u4e0b\u9762\u4ee5 PP-TSM \u6a21\u578b\u4e3a\u4f8b\uff0c\u4ecb\u7ecd\u5982\u4f55\u90e8\u7f72\u884c\u4e3a\u8bc6\u522b\u670d\u52a1\u3002\n- \u4e0b\u8f7d PP-TSM \u63a8\u7406\u6a21\u578b\u5e76\u8f6c\u6362\u4e3a Serving \u6a21\u578b\uff1a\n ```bash\n # \u8fdb\u5165PaddleVideo\u76ee\u5f55\n cd PaddleVideo\n # \u4e0b\u8f7d\u63a8\u7406\u6a21\u578b\u5e76\u89e3\u538b\u5230./inference\u4e0b\n mkdir ./inference\n pushd ./inference\n wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip\n unzip ppTSM.zip\n popd\n # \u8f6c\u6362\u6210 Serving \u6a21\u578b\n pushd deploy/cpp_serving\n python3.7 -m paddle_serving_client.convert \\" + }, + { + "comment": "This code is using PaddleVideo's paddle-video-deploy to convert a model and store it in the \"inference/ppTSM\" directory. It specifies the model filename as \"ppTSM.pdmodel\", the params filename as \"ppTSM.pdiparams\", and generates serving_server and serving_client files in the \"ppTSM\" folder. 
The converted model will be saved in the \"deploy/cpp_serving\" directory, with the client configuration stored in the \"ppTSM_serving_client\" and server configurations stored in the \"ppTSM_serving_server\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme.md\":64-80", + "content": " --dirname ../../inference/ppTSM \\\n --model_filename ppTSM.pdmodel \\\n --params_filename ppTSM.pdiparams \\\n --serving_server ./ppTSM_serving_server \\\n --serving_client ./ppTSM_serving_client\n popd\n ```\n | \u53c2\u6570 | \u7c7b\u578b | \u9ed8\u8ba4\u503c | \u63cf\u8ff0 |\n | ----------------- | ---- | ------------------ | ------------------------------------------------------------ |\n | `dirname` | str | - | \u9700\u8981\u8f6c\u6362\u7684\u6a21\u578b\u6587\u4ef6\u5b58\u50a8\u8def\u5f84\uff0cProgram\u7ed3\u6784\u6587\u4ef6\u548c\u53c2\u6570\u6587\u4ef6\u5747\u4fdd\u5b58\u5728\u6b64\u76ee\u5f55\u3002 |\n | `model_filename` | str | None | \u5b58\u50a8\u9700\u8981\u8f6c\u6362\u7684\u6a21\u578bInference Program\u7ed3\u6784\u7684\u6587\u4ef6\u540d\u79f0\u3002\u5982\u679c\u8bbe\u7f6e\u4e3aNone\uff0c\u5219\u4f7f\u7528 `__model__` \u4f5c\u4e3a\u9ed8\u8ba4\u7684\u6587\u4ef6\u540d |\n | `params_filename` | str | None | \u5b58\u50a8\u9700\u8981\u8f6c\u6362\u7684\u6a21\u578b\u6240\u6709\u53c2\u6570\u7684\u6587\u4ef6\u540d\u79f0\u3002\u5f53\u4e14\u4ec5\u5f53\u6240\u6709\u6a21\u578b\u53c2\u6570\u88ab\u4fdd>\u5b58\u5728\u4e00\u4e2a\u5355\u72ec\u7684\u4e8c\u8fdb\u5236\u6587\u4ef6\u4e2d\uff0c\u5b83\u624d\u9700\u8981\u88ab\u6307\u5b9a\u3002\u5982\u679c\u6a21\u578b\u53c2\u6570\u662f\u5b58\u50a8\u5728\u5404\u81ea\u5206\u79bb\u7684\u6587\u4ef6\u4e2d\uff0c\u8bbe\u7f6e\u5b83\u7684\u503c\u4e3aNone |\n | `serving_server` | str | `\"serving_server\"` | \u8f6c\u6362\u540e\u7684\u6a21\u578b\u6587\u4ef6\u548c\u914d\u7f6e\u6587\u4ef6\u7684\u5b58\u50a8\u8def\u5f84\u3002\u9ed8\u8ba4\u503c\u4e3aserving_server |\n | `serving_client` | str | `\"serving_client\"` | \u8f6c\u6362\u540e\u7684\u5ba2\u6237\u7aef\u914d\u7f6e\u6587\u4ef6\u5b58\u50a8\u8def\u5f84\u3002\u9ed8\u8ba4\u503c\u4e3aserving_client |\n- \u63a8\u7406\u6a21\u578b\u8f6c\u6362\u5b8c\u6210\u540e\uff0c\u4f1a\u5728`deploy/cpp_serving`\u6587\u4ef6\u5939\u4e0b\u751f\u6210 `ppTSM_serving_client` \u548c `ppTSM_serving_server` \u4e24\u4e2a\u6587\u4ef6\u5939\uff0c\u5177\u5907\u5982\u4e0b\u683c\u5f0f\uff1a" + }, + { + "comment": "This code snippet shows the necessary changes to be made in `serving_client_conf.prototxt` and `serving_server_conf.prototxt` files after getting the model file. The purpose is to rename `alias_name` as 'outputs' for `fetch_var`. 
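The rename is a small manual edit applied to both generated prototxt files; a hypothetical helper such as the one below (not part of the repository, shown only to make the edit concrete) would change `alias_name` only inside `fetch_var` blocks:

```python
import re

def rename_fetch_alias(path: str, new_alias: str = "outputs") -> None:
    """Set alias_name inside fetch_var blocks of a Serving prototxt to new_alias."""
    out_lines, in_fetch = [], False
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("fetch_var"):
                in_fetch = True
            elif in_fetch and line.strip() == "}":
                in_fetch = False
            if in_fetch and "alias_name" in line:
                line = re.sub(r'alias_name:\s*".*"', f'alias_name: "{new_alias}"', line)
            out_lines.append(line)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(out_lines)

# Hypothetical usage, assuming the paths produced by the conversion step above:
for cfg in ("ppTSM_serving_client/serving_client_conf.prototxt",
            "ppTSM_serving_server/serving_server_conf.prototxt"):
    rename_fetch_alias(cfg)
```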
This allows Serving to support different models without modifying code during deployment.\n\nThe modified `serving_server_conf.prototxt` file shows a feed variable and a fetch variable with their respective names, alias names, shapes, and types.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme.md\":81-117", + "content": " ```bash\n PaddleVideo/deploy/cpp_serving\n \u251c\u2500\u2500 ppTSM_serving_client\n \u2502 \u251c\u2500\u2500 serving_client_conf.prototxt\n \u2502 \u2514\u2500\u2500 serving_client_conf.stream.prototxt\n \u2514\u2500\u2500 ppTSM_serving_server\n \u251c\u2500\u2500 ppTSM.pdiparams\n \u251c\u2500\u2500 ppTSM.pdmodel\n \u251c\u2500\u2500 serving_server_conf.prototxt\n \u2514\u2500\u2500 serving_server_conf.stream.prototxt\n ```\n \u5f97\u5230\u6a21\u578b\u6587\u4ef6\u4e4b\u540e\uff0c\u9700\u8981\u5206\u522b\u4fee\u6539 `ppTSM_serving_client` \u4e0b\u7684 `serving_client_conf.prototxt` \u548c `ppTSM_serving_server` \u4e0b\u7684 `serving_server_conf.prototxt`\uff0c\u5c06\u4e24\u4efd\u6587\u4ef6\u4e2d`fetch_var` \u4e0b\u7684 `alias_name` \u5747\u6539\u4e3a `outputs`\n **\u5907\u6ce8**: Serving \u4e3a\u4e86\u517c\u5bb9\u4e0d\u540c\u6a21\u578b\u7684\u90e8\u7f72\uff0c\u63d0\u4f9b\u4e86\u8f93\u5165\u8f93\u51fa\u91cd\u547d\u540d\u7684\u529f\u80fd\u3002\u8fd9\u6837\uff0c\u4e0d\u540c\u7684\u6a21\u578b\u5728\u63a8\u7406\u90e8\u7f72\u65f6\uff0c\u53ea\u9700\u8981\u4fee\u6539\u914d\u7f6e\u6587\u4ef6\u7684`alias_name`\u5373\u53ef\uff0c\u65e0\u9700\u4fee\u6539\u4ee3\u7801\u5373\u53ef\u5b8c\u6210\u63a8\u7406\u90e8\u7f72\u3002\n \u4fee\u6539\u540e\u7684`serving_server_conf.prototxt`\u5982\u4e0b\u6240\u793a:\n ```yaml\n feed_var {\n name: \"data_batch_0\"\n alias_name: \"data_batch_0\"\n is_lod_tensor: false\n feed_type: 1\n shape: 8\n shape: 3\n shape: 224\n shape: 224\n }\n fetch_var {\n name: \"linear_2.tmp_1\"\n alias_name: \"outputs\"\n is_lod_tensor: false\n fetch_type: 1\n shape: 400\n }\n ```\n### \u670d\u52a1\u90e8\u7f72\u548c\u8bf7\u6c42\n`cpp_serving` \u76ee\u5f55\u5305\u542b\u4e86\u542f\u52a8 pipeline \u670d\u52a1\u3001C++ serving\u670d\u52a1\u548c\u53d1\u9001\u9884\u6d4b\u8bf7\u6c42\u7684\u4ee3\u7801\uff0c\u5177\u4f53\u5305\u62ec\uff1a" + }, + { + "comment": "This code provides the instructions to set up and run a C++ serving server, send requests from a client script, and interpret the results. 
It also includes a script to install the required environment and troubleshoot common issues such as missing libraries.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme.md\":118-157", + "content": " ```bash\n run_cpp_serving.sh # \u542f\u52a8C++ serving server\u7aef\u7684\u811a\u672c\n pipeline_http_client.py # client\u7aef\u53d1\u9001\u6570\u636e\u5e76\u83b7\u53d6\u9884\u6d4b\u7ed3\u679c\u7684\u811a\u672c\n paddle_env_install.sh # \u5b89\u88c5C++ serving\u73af\u5883\u811a\u672c\n preprocess_ops.py # \u5b58\u653e\u9884\u5904\u7406\u51fd\u6570\u7684\u6587\u4ef6\n ```\n#### C++ Serving\n- \u8fdb\u5165\u5de5\u4f5c\u76ee\u5f55\uff1a\n ```bash\n cd deploy/cpp_serving\n ```\n- \u542f\u52a8\u670d\u52a1\uff1a\n ```bash\n # \u5728\u540e\u53f0\u542f\u52a8\uff0c\u8fc7\u7a0b\u4e2d\u6253\u5370\u8f93\u51fa\u7684\u65e5\u5fd7\u4f1a\u91cd\u5b9a\u5411\u4fdd\u5b58\u5230nohup.txt\u4e2d\uff0c\u53ef\u4ee5\u4f7f\u7528tailf nohup.txt\u67e5\u770b\u8f93\u51fa\n bash run_cpp_serving.sh\n ```\n- \u53d1\u9001\u8bf7\u6c42\u5e76\u83b7\u53d6\u7ed3\u679c\uff1a\n ```bash\n python3.7 serving_client.py \\\n -n PPTSM \\\n -c ./ppTSM_serving_client/serving_client_conf.prototxt \\\n --input_file=../../data/example.avi\n ```\n\u6210\u529f\u8fd0\u884c\u540e\uff0c\u6a21\u578b\u9884\u6d4b\u7684\u7ed3\u679c\u4f1a\u6253\u5370\u5728 cmd \u7a97\u53e3\u4e2d\uff0c\u7ed3\u679c\u5982\u4e0b\uff1a\n ```bash\n I0510 04:33:00.110025 37097 naming_service_thread.cpp:202] brpc::policy::ListNamingService(\"127.0.0.1:9993\"): added 1\n I0510 04:33:01.904764 37097 general_model.cpp:490] [client]logid=0,client_cost=1640.96ms,server_cost=1623.21ms.\n {'class_id': '[5]', 'prob': '[0.9907387495040894]'}\n ```\n**\u5982\u679c\u8fc7\u7a0b\u4e2d\u62a5\u9519\u663e\u793a\u627e\u4e0d\u5230libnvinfer.so.6\uff0c\u53ef\u4ee5\u6267\u884c\u811a\u672c`paddle_env_install.sh`\u5b89\u88c5\u76f8\u5173\u73af\u5883**\n ```bash\n bash paddle_env_install.sh\n ```\n## FAQ\n**Q1**\uff1a \u53d1\u9001\u8bf7\u6c42\u540e\u6ca1\u6709\u7ed3\u679c\u8fd4\u56de\u6216\u8005\u63d0\u793a\u8f93\u51fa\u89e3\u7801\u62a5\u9519" + }, + { + "comment": "Avoid setting proxies when starting the service and sending requests. Disable proxies by using \"unset https_proxy\" and \"unset http_proxy\" commands beforehand.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/readme.md\":159-163", + "content": "**A1**\uff1a \u542f\u52a8\u670d\u52a1\u548c\u53d1\u9001\u8bf7\u6c42\u65f6\u4e0d\u8981\u8bbe\u7f6e\u4ee3\u7406\uff0c\u53ef\u4ee5\u5728\u542f\u52a8\u670d\u52a1\u524d\u548c\u53d1\u9001\u8bf7\u6c42\u524d\u5173\u95ed\u4ee3\u7406\uff0c\u5173\u95ed\u4ee3\u7406\u7684\u547d\u4ee4\u662f\uff1a\n```\nunset https_proxy\nunset http_proxy\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/db149562-1b92-4a87-8389-fd7a3addc401.json b/docs/doc/db149562-1b92-4a87-8389-fd7a3addc401.json new file mode 100644 index 000000000..a1c99d29a --- /dev/null +++ b/docs/doc/db149562-1b92-4a87-8389-fd7a3addc401.json @@ -0,0 +1,15 @@ +{ + "summary": "The code defines a `DrawFrame` class, which is a QWidget for drawing paths and responding to mouse events. It is used in conjunction with other classes such as `DemoUI`. The 'export' function converts painter's polygon to fill polygon and is triggered by the 'start_btn'. The code initializes the application and starts the event loop.", + "details": [ + { + "comment": "This code defines a class called DrawFrame, which is a QWidget that can be drawn on. 
It overrides the paintEvent method to draw paths using a QPainter, and responds to mouse events for line drawing. The class takes a painter object in its constructor, suggesting it could be used in conjunction with other classes or methods.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/demo.py\":0-35", + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport sys\nfrom PyQt5.QtWidgets import QApplication, QMainWindow, QFrame, QWidget\nfrom PyQt5.QtGui import QPainter, QPixmap, QPen, QColor, QPainterPath\nfrom PyQt5.QtCore import Qt, QPoint\nfrom PyQt5 import QtCore, QtGui, QtWidgets\nfrom QEIVideo.ui.demo import Ui_MainWindow as DemoUIRoot\nclass DrawFrame(QWidget):\n def __init__(self, painter, *args, **kwargs):\n super(DrawFrame, self).__init__(*args, **kwargs)\n self.painter = painter\n def paintEvent(self, event):\n painter = QPainter(self)\n pen = QPen(QColor(\"orange\"))\n pen.setWidth(5)\n pen.setCapStyle(Qt.RoundCap)\n pen.setJoinStyle(Qt.RoundJoin)\n painter.setPen(pen)\n painter.drawPath(self.painter)\n def mousePressEvent(self, event):\n self.painter.moveTo(event.pos())\n self.update()\n def mouseMoveEvent(self, event):\n self.painter.lineTo(event.pos())\n self.update()" + }, + { + "comment": "This code initializes a `DemoUI` class with a `DrawFrame` object that draws video frames. The `export` function converts the painter's polygon to a fill polygon and is triggered by the 'start_btn'. The code also sets up the application, creates an instance of the `DemoUI` class, and starts the event loop.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/demo.py\":38-61", + "content": "class DemoUI(QMainWindow, DemoUIRoot):\n def __init__(self):\n super(DemoUI, self).__init__()\n self.setupUi(self)\n self.painter = QPainterPath()\n self.draw_frame = DrawFrame(self.painter, self.video_frame)\n self.draw_frame.setGeometry(QtCore.QRect(0, 10, 751, 301))\n self.draw_frame.setObjectName(\"draw_frame\")\n self.draw_frame.raise_()\n self.draw_frame.setAttribute(QtCore.Qt.WA_TranslucentBackground)\n self.start_btn.clicked.connect(self.export)\n def export(self):\n a = self.painter.toFillPolygon()\n pass\nif __name__ == '__main__':\n app = QApplication(sys.argv)\n gui_class = DemoUI()\n gui_class.show()\n sys.exit(app.exec_())" + } + ] +} \ No newline at end of file diff --git a/docs/doc/db437a82-93f4-4ebd-bc38-4234e0ea8457.json b/docs/doc/db437a82-93f4-4ebd-bc38-4234e0ea8457.json new file mode 100644 index 000000000..03284e14f --- /dev/null +++ b/docs/doc/db437a82-93f4-4ebd-bc38-4234e0ea8457.json @@ -0,0 +1,30 @@ +{ + "summary": "The code enables image resizing, flipping, multi-scale segmentation in PaddleVideo's pipeline, with metadata addition and normalization. It performs image normalization and transposition before storing the result in a samples data structure.", + "details": [ + { + "comment": "This code is for PaddleVideo's segmentation pipeline. It includes the class definition MultiRestrictSize, which can be used with minimum and maximum size limits, flipping option, and multiple scales for image resizing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation.py\":0-31", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nfrom PIL import Image\nimport copy\nimport cv2\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass MultiRestrictSize(object):\n def __init__(self,\n min_size=None,\n max_size=800,\n flip=False,\n multi_scale=[1.3]):\n self.min_size = min_size\n self.max_size = max_size\n self.multi_scale = multi_scale\n self.flip = flip" + }, + { + "comment": "This code is a function that applies image segmentation using multi-scale technique. It scales the input image based on a fixed range of scales and aligns short or long edges to meet minimum or maximum size requirements, respectively. The scaled images are stored in a list for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation.py\":32-64", + "content": " assert ((min_size is None)) or ((max_size is None))\n def __call__(self, sample):\n samples = []\n image = sample['current_img']\n h, w = image.shape[:2]\n for scale in self.multi_scale:\n # Fixed range of scales\n sc = None\n # Align short edge\n if not (self.min_size is None):\n if h > w:\n short_edge = w\n else:\n short_edge = h\n if short_edge > self.min_size:\n sc = float(self.min_size) / short_edge\n else:\n if h > w:\n long_edge = h\n else:\n long_edge = w\n if long_edge > self.max_size:\n sc = float(self.max_size) / long_edge\n if sc is None:\n new_h = h\n new_w = w\n else:\n new_h = sc * h\n new_w = sc * w\n new_h = int(new_h * scale)\n new_w = int(new_w * scale)" + }, + { + "comment": "Code resizes input images to a multiple of 16x16, appends samples with matching metadata, and optionally flips the image if enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation.py\":66-91", + "content": " if (new_h - 1) % 16 != 0:\n new_h = int(np.around((new_h - 1) / 16.) * 16 + 1)\n if (new_w - 1) % 16 != 0:\n new_w = int(np.around((new_w - 1) / 16.) * 16 + 1)\n if new_h == h and new_w == w:\n samples.append(sample)\n else:\n new_sample = {}\n for elem in sample.keys():\n if 'meta' in elem:\n new_sample[elem] = sample[elem]\n continue\n tmp = sample[elem]\n if 'label' in elem:\n new_sample[elem] = sample[elem]\n continue\n else:\n flagval = cv2.INTER_CUBIC\n tmp = cv2.resize(tmp,\n dsize=(new_w, new_h),\n interpolation=flagval)\n new_sample[elem] = tmp\n samples.append(new_sample)\n if self.flip:" + }, + { + "comment": "This code segment is from the PaddleVideo library, specifically in the loader/pipelines/segmentation.py file. It appears to be a function that adds flipped image data to a list of samples, after normalizing each image by dividing it by 255 and subtracting (0.485, 0.456, 0.406). 
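For reference, that per-image normalization is the usual ImageNet mean/std scaling followed by an HWC-to-CHW transpose; a minimal numpy sketch, illustrative only:

```python
import numpy as np

# Illustrative sketch of the per-image normalization described above:
# scale to [0, 1], subtract the ImageNet mean, divide by the ImageNet std,
# then move channels first (HWC -> CHW).
img = np.random.randint(0, 256, size=(480, 854, 3)).astype("float32")

img /= 255.0
img -= np.array([0.485, 0.456, 0.406], dtype="float32")
img /= np.array([0.229, 0.224, 0.225], dtype="float32")
img = img.transpose((2, 0, 1))

print(img.shape)  # (3, 480, 854)
```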
This function is part of the MultiNorm pipeline registered in the PIPELINES module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation.py\":92-123", + "content": " now_sample = samples[-1]\n new_sample = {}\n for elem in now_sample.keys():\n if 'meta' in elem:\n new_sample[elem] = now_sample[elem].copy()\n new_sample[elem]['flip'] = True\n continue\n tmp = now_sample[elem]\n tmp = tmp[:, ::-1].copy()\n new_sample[elem] = tmp\n samples.append(new_sample)\n return samples\n@PIPELINES.register()\nclass MultiNorm(object):\n def __call__(self, samples):\n for idx in range(len(samples)):\n sample = samples[idx]\n for elem in sample.keys():\n if 'meta' in elem:\n continue\n tmp = sample[elem]\n if tmp is None:\n continue\n if tmp.ndim == 2:\n tmp = tmp[:, :, np.newaxis]\n else:\n tmp = tmp / 255.\n tmp -= (0.485, 0.456, 0.406)" + }, + { + "comment": "This code segment performs image normalization and transposition before storing the result in a dictionary-like samples data structure. It divides each RGB channel value by the average RGB values, then transposes the image channels. Finally, it adds the transformed image to the samples dictionary for the given index and element.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/segmentation.py\":124-129", + "content": " tmp /= (0.229, 0.224, 0.225)\n tmp = tmp.transpose((2, 0, 1))\n samples[idx][elem] = tmp\n return samples" + } + ] +} \ No newline at end of file diff --git a/docs/doc/db83d6c4-7d6c-4325-8507-33f19b81fd7b.json b/docs/doc/db83d6c4-7d6c-4325-8507-33f19b81fd7b.json new file mode 100644 index 000000000..103ac462f --- /dev/null +++ b/docs/doc/db83d6c4-7d6c-4325-8507-33f19b81fd7b.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines Distillation Entropy Loss and KL divergence loss classes, implementing CrossEntropy loss for single/triple labels and KL divergence respectively, with optional weighted average and activation functions.", + "details": [ + { + "comment": "Defines a Distillation Entropy Loss class, which inherits from BaseWeightedLoss and takes score and labels as input for its forward function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/distillation_loss.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\n@LOSSES.register()\nclass DistillationCELoss(BaseWeightedLoss):\n \"\"\"Distillation Entropy Loss.\"\"\"\n def _forward(self, score, labels, **kwargs):\n \"\"\"Forward function.\n Args:\n score (paddle.Tensor): The class score.\n labels (paddle.Tensor): The ground truth labels." + }, + { + "comment": "The code defines a loss function that calculates CrossEntropy loss and supports both single and triple labels. 
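The triple-label branch described next is the mixup/VideoMix-style blend of two cross-entropy terms; as a standalone numeric sketch with made-up logits (illustrative only, plain numpy rather than the Paddle ops used in the file):

```python
import numpy as np

def softmax_ce(logits, label):
    """Plain cross-entropy for a single example, used only for illustration."""
    logits = logits - logits.max()
    log_probs = logits - np.log(np.exp(logits).sum())
    return -log_probs[label]

logits = np.array([2.0, 0.5, -1.0])   # made-up class scores
label_a, label_b, lam = 0, 2, 0.7     # the two mixed labels and the mix ratio

loss = lam * softmax_ce(logits, label_a) + (1 - lam) * softmax_ce(logits, label_b)
print(round(float(loss), 4))
```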
For single label, it directly calculates the CrossEntropy loss. For triple labels, it first calculates two separate CrossEntropy losses, then combines them with a weighted average based on a given lambda value (lam). The DistillationDMLLoss class implements this behavior and also handles the act parameter for specifying different activation functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/distillation_loss.py\":30-59", + "content": " kwargs: Any keyword argument to be used to calculate\n CrossEntropy loss.\n Returns:\n loss (paddle.Tensor): The returned CrossEntropy loss.\n \"\"\"\n if len(labels) == 1:\n label = labels[0]\n loss = F.cross_entropy(score, label, **kwargs)\n # Deal with VideoMix\n elif len(labels) == 3:\n label_a, label_b, lam = labels\n loss_a = F.cross_entropy(score, label_a, **kwargs)\n loss_b = F.cross_entropy(score, label_b, **kwargs)\n loss = lam * loss_a + (1 - lam) * loss_b\n loss = paddle.mean(loss) #lam shape is bs\n return loss\n@LOSSES.register()\nclass DistillationDMLLoss(BaseWeightedLoss):\n \"\"\"\n DistillationDMLLoss\n \"\"\"\n def __init__(self, act=\"softmax\", eps=1e-12, **kargs):\n super().__init__(**kargs)\n if act is not None:\n assert act in [\"softmax\", \"sigmoid\"]\n if act == \"softmax\":\n self.act = nn.Softmax(axis=-1)\n elif act == \"sigmoid\":" + }, + { + "comment": "This code defines a class for implementing the Kullback-Leibler (KL) divergence loss. The constructor takes an optional activation function and epsilon for numerical stability. The _kldiv method calculates the KL divergence between two vectors, while the _forward method applies the activation function if provided and computes the final loss by averaging the KL divergences in both directions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/distillation_loss.py\":60-78", + "content": " self.act = nn.Sigmoid()\n else:\n self.act = None\n self.eps = eps\n def _kldiv(self, x, target):\n class_num = x.shape[-1]\n cost = target * paddle.log(\n (target + self.eps) / (x + self.eps)) * class_num\n return cost\n def _forward(self, x, target):\n if self.act is not None:\n x = self.act(x)\n target = self.act(target)\n loss = self._kldiv(x, target) + self._kldiv(target, x)\n loss = loss / 2\n loss = paddle.mean(loss)\n return loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/dbfff6f8-70f2-4512-89a1-d45becd0a54f.json b/docs/doc/dbfff6f8-70f2-4512-89a1-d45becd0a54f.json new file mode 100644 index 000000000..1f1163479 --- /dev/null +++ b/docs/doc/dbfff6f8-70f2-4512-89a1-d45becd0a54f.json @@ -0,0 +1,15 @@ +{ + "summary": "This Python function builds a module from a config dictionary, checks its validity, retrieves an object class from a registry, and returns an instance with optional parameters.", + "details": [ + { + "comment": "This code snippet is a Python function that builds a module from a config dictionary. It checks if the input is a dictionary and verifies if the specified key exists. Then it makes a copy of the dictionary and removes the specified key, returning the constructed object. The registry is used to search for the type of the module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py\":0-29", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\ndef build(cfg, registry, key='name'):\n \"\"\"Build a module from config dict.\n Args:\n cfg (dict): Config dict. It should at least contain the key.\n registry (XXX): The registry to search the type from.\n key (str): the key.\n Returns:\n obj: The constructed object.\n \"\"\"\n assert isinstance(cfg, dict) and key in cfg\n cfg_copy = cfg.copy()\n obj_type = cfg_copy.pop(key)" + }, + { + "comment": "The code retrieves an object class from a registry based on the provided \"obj_type\", and if not found, raises a KeyError with an informative message. It then returns an instance of the retrieved class with optional configuration parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py\":31-35", + "content": " obj_cls = registry.get(obj_type)\n if obj_cls is None:\n raise KeyError('{} is not in the {} registry'.format(\n obj_type, registry.name))\n return obj_cls(**cfg_copy)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/dc0c9c03-7070-47aa-8803-6519cf11b045.json b/docs/doc/dc0c9c03-7070-47aa-8803-6519cf11b045.json new file mode 100644 index 000000000..e7a433add --- /dev/null +++ b/docs/doc/dc0c9c03-7070-47aa-8803-6519cf11b045.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is part of the PaddleVideo library, specifically defining detectors. It imports three detector classes (BaseDetector, FastRCNN, and TwoStageDetector) from its local directory and lists them in __all__. The comment at the beginning establishes copyright information and licensing.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library, specifically defining detectors. It imports three detector classes (BaseDetector, FastRCNN, and TwoStageDetector) from its local directory and lists them in __all__. The comment at the beginning establishes copyright information and licensing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/detectors/__init__.py\":0-16", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseDetector\nfrom .fast_rcnn import FastRCNN\nfrom .two_stage import TwoStageDetector\n__all__ = ['BaseDetector', 'TwoStageDetector', 'FastRCNN']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/dd6a35fd-c4e2-4773-a81c-ce934d329392.json b/docs/doc/dd6a35fd-c4e2-4773-a81c-ce934d329392.json new file mode 100644 index 000000000..34fea5b21 --- /dev/null +++ b/docs/doc/dd6a35fd-c4e2-4773-a81c-ce934d329392.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines a `ReaderZoo` class with functions for registering and retrieving readers based on their name, mode, and configuration. A custom exception class is defined for reader not found errors.", + "details": [ + { + "comment": "Importing necessary libraries, defining custom exception class for reader not found error.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/reader_utils.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport pickle\nimport cv2\nimport numpy as np\nimport random\nclass ReaderNotFoundError(Exception):\n \"Error: reader not found\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)" + }, + { + "comment": "The code defines a DataReader class for video input and a ReaderZoo class for registering and retrieving readers. The DataReader class has an init method for setting the model name, mode, and configuration, as well as a create_reader method that must be implemented by subclasses. 
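The registration flow is a straightforward name-to-class registry; the toy sketch below mirrors it with hypothetical names (the real module registers DataReader subclasses in a module-level `reader_zoo` singleton):

```python
# Minimal, self-contained sketch of the registry pattern described above.
# Names here are illustrative; the real module registers DataReader subclasses.
class ToyReader:
    def __init__(self, name, mode, cfg):
        self.name, self.mode, self.cfg = name, mode, cfg

    def create_reader(self):
        return iter([])  # a real reader would yield decoded video samples

registry = {}

def regist(name, reader_cls):
    registry[name] = reader_cls

def get(name, mode, cfg):
    if name not in registry:
        raise KeyError(f"Reader {name} not found, available: {list(registry)}")
    return registry[name](name, mode, cfg)

regist("TSN", ToyReader)
reader = get("TSN", "infer", cfg={"seg_num": 8})
print(type(reader).__name__, reader.mode)
```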
The ReaderZoo class registers readers using the regist method and retrieves them based on name, mode, and configuration with the get method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/reader_utils.py\":31-69", + "content": " for reader in self.avail_readers:\n msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"data reader for video input\"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"Not implemented\"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n def __init__(self):\n self.reader_zoo = {}\n def regist(self, name, reader):\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg):\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg)\n raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo" + }, + { + "comment": "This code defines a class `ReaderZoo` and provides two functions `regist_reader` and `get_reader`. The `ReaderZoo` is used to register different types of readers and retrieve them based on their name, mode, and configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/reader/reader_utils.py\":70-79", + "content": "reader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg):\n reader_model = reader_zoo.get(name, mode, cfg)\n return reader_model.create_reader()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ddf9a8cb-3fe1-4111-bdaa-5eb4ec613f53.json b/docs/doc/ddf9a8cb-3fe1-4111-bdaa-5eb4ec613f53.json new file mode 100644 index 000000000..8e3f4ec12 --- /dev/null +++ b/docs/doc/ddf9a8cb-3fe1-4111-bdaa-5eb4ec613f53.json @@ -0,0 +1,15 @@ +{ + "summary": "This code imports modules, defines decompressing and downloading functions, ensures directory existence, deletes downloaded files post-decompression, and includes an AttrDict class for attribute access.", + "details": [ + { + "comment": "The code imports necessary modules and defines functions for decompressing and downloading files. It also ensures a directory exists before attempting to download a file, then deletes the downloaded file after decompression.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/utils.py\":0-35", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n#Licensed under the Apache License, Version 2.0 (the \"License\");\n#you may not use this file except in compliance with the License.\n#You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n#Unless required by applicable law or agreed to in writing, software\n#distributed under the License is distributed on an \"AS IS\" BASIS,\n#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#See the License for the specific language governing permissions and\n#limitations under the License.\nimport os\nimport wget\nimport tarfile\n__all__ = ['decompress', 'download', 'AttrDict']\ndef decompress(path):\n t = tarfile.open(path)\n t.extractall(path=os.path.split(path)[0])\n t.close()\n os.remove(path)\ndef download(url, path):\n weight_dir = os.path.split(path)[0]\n if not os.path.exists(weight_dir):\n os.makedirs(weight_dir)\n path = path + \".tar.gz\"\n wget.download(url, path)\n decompress(path)" + }, + { + "comment": "This code defines an AttrDict class, which is a subclass of dict with additional getattr and setattr methods for accessing and modifying its elements as attributes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/utils.py\":38-46", + "content": "class AttrDict(dict):\n def __getattr__(self, key):\n return self[key]\n def __setattr__(self, key, value):\n if key in self.__dict__:\n self.__dict__[key] = value\n else:\n self[key] = value" + } + ] +} \ No newline at end of file diff --git a/docs/doc/de118e7c-a70a-4755-8475-975d2312c0be.json b/docs/doc/de118e7c-a70a-4755-8475-975d2312c0be.json new file mode 100644 index 000000000..c7e6cbe9f --- /dev/null +++ b/docs/doc/de118e7c-a70a-4755-8475-975d2312c0be.json @@ -0,0 +1,25 @@ +{ + "summary": "The code introduces FrameRecDataset class for PaddleVideo, loading raw frames and applying transformations. Another class reads index files, initializes base class with parameters, and handles missing frame file exceptions during training/validation.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library and defines a FrameRecDataset class for action recognition. It loads raw frames from frame files, applies specified transform operations to them, and registers the dataset with the DATASETS registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py\":0-31", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass FrameRecDataset(BaseDataset):\n \"\"\"Rawframe dataset for action recognition.\n The dataset loads raw frames from frame files, and apply specified transform operatation them.\n The ind" + }, + { + "comment": "This code defines a class that loads index files containing video information. The class takes an index file path, pipeline, data prefix (optional), test mode (optional) and suffix (optional) as arguments. It initializes the base class with these parameters and then has a method load_file() to read the index file and get the video information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py\":31-61", + "content": "ecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.\n Example of an index file:\n .. code-block:: txt\n file_path-1 150 1\n file_path-2 160 1\n file_path-3 170 2\n file_path-4 180 2\n Args:\n file_path (str): Path to the index file.\n pipeline(XXX):\n data_prefix (str): directory path of the data. Default: None.\n test_mode (bool): Whether to bulid the test dataset. Default: False.\n suffix (str): suffix of file. Default: 'img_{:05}.jpg'.\n \"\"\"\n def __init__(self,\n file_path,\n pipeline,\n num_retries=5,\n data_prefix=None,\n test_mode=False,\n suffix='img_{:05}.jpg'):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, data_prefix, test_mode)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"" + }, + { + "comment": "This code reads a file and parses each line into frame path, directory, number of frames, and labels. It returns a list of dictionaries containing this information. The \"prepare_train\" function tries to prepare the frames for training/validation multiple times in case an exception occurs while reading the frames files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py\":62-87", + "content": " info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n mp4_path, frame_dir, frames_len, labels = line_split\n if self.data_prefix is not None:\n frame_dir = osp.join(self.data_prefix, frame_dir)\n info.append(\n dict(frame_dir=frame_dir,\n suffix=self.suffix,\n frames_len=frames_len,\n labels=float(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"Prepare the frames for training/valid given index. 
\"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(" + }, + { + "comment": "The code handles exceptions caused by reading missing frames files. It attempts to load the frames multiple times if there are errors, and keeps track of the number of retries. If an error occurs, it logs the error message and continues with a different index from the dataset until it successfully loads the frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py\":88-109", + "content": " \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):\n \"\"\"Prepare the frames for test given index. \"\"\"\n #Try to catch Exception caused by reading missing frames files\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['frame_dir'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])" + } + ] +} \ No newline at end of file diff --git a/docs/doc/de134f00-e75e-4048-9479-4f34324ea55d.json b/docs/doc/de134f00-e75e-4048-9479-4f34324ea55d.json new file mode 100644 index 000000000..519cdb75f --- /dev/null +++ b/docs/doc/de134f00-e75e-4048-9479-4f34324ea55d.json @@ -0,0 +1,20 @@ +{ + "summary": "This code parses and extracts expressions, specifies reduction type (print/sum/mean), discards line parts, and enables debug mode. It defines functions to parse arguments, log messages, validate/extract data, and performs calculations on a list of numerical tuples based on user-defined parameters in the main function.", + "details": [ + { + "comment": "This code parses arguments for validating and extracting expressions, specifying reduction type (print/sum/mean), discarding line parts, and enabling debug mode.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/extract_loss.py\":0-27", + "content": "import sys\nimport argparse\nimport re\ndef parameter_parser():\n parser = argparse.ArgumentParser(description=\"Support Args:\")\n parser.add_argument(\"-v\",\n \"--valid-expr\",\n type=str,\n default=\"*\",\n help=\"when not match, the line will discard.\")\n parser.add_argument(\"-e\",\n \"--extract-expr\",\n type=str,\n default=\"^{%s}$,\",\n help=\"the extract expr for the loss: loss {%f}\")\n parser.add_argument(\"-r\",\n \"--reduction-expr\",\n type=str,\n default=\"print\",\n help=\"print | sum | mean\")\n parser.add_argument(\"-n\",\n \"--discard\",\n type=int,\n default=0,\n help=\"while reduction, discard [0:n] and [-n:]\")\n parser.add_argument(\"-d\", \"--debug\", type=bool, default=False, help=\"debug\")" + }, + { + "comment": "The code defines functions to parse arguments, log messages, and validate or extract data from a given line. 
The \"is_valid\" function checks if the input line matches a specific expression or wildcard, while the \"extract\" function uses regular expressions to parse a specified type of data (float, int, or string) from a given line.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/extract_loss.py\":28-70", + "content": " return parser.parse_args()\nargs = parameter_parser()\ndef log(*inp, **kargs):\n if args.debug:\n print(*inp, **kargs)\ndef is_valid(line, valid_expr):\n if valid_expr == \"*\": return True\n if valid_expr in line: return True\n return False\ndef extract(line, extract_expr):\n \"\"\"\n return tuple, the output will be\n \"\"\"\n log(\"Extract_expression is : \", extract_expr)\n x = re.findall(\"\\{%(.)\\}\", extract_expr)\n assert len(x) == 1, \"Must exist a {%d} | {%f} | {%s} \"\n t = x[0]\n type_converter = {\n 'f': float,\n 'i': int,\n 's': str,\n }\n type_extracter = {\n \"f\": r'(-?\\\\d+\\\\.\\\\d+)',\n \"i\": r'(-?\\\\d+)',\n \"s\": r'(.*?)',\n }\n log(type_extracter[t])\n pattern = re.sub(\"\\{%(.)\\}\", type_extracter[t], extract_expr, 1)\n log(\"Created Pattern is: \", pattern)\n x = re.findall(pattern, line)\n if len(x) == 0: return None\n assert len(x) == 1, f\"Multi Match for `{extract_expr}` in line: \\n{line}\"\n log(\"Find in line: \", x[0].strip())\n return type_converter[t](x[0].strip())" + }, + { + "comment": "This code defines a function 'action' which performs calculations on a list of numerical tuples and prints the result based on the given action. The main function reads input lines, validates them, extracts values, and passes the resulting tuple list to the 'action' function based on user-defined parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/extract_loss.py\":73-101", + "content": "def action(tuple_list, action):\n # discard the warm up\n if args.discard > 0:\n tuple_list = tuple_list[args.discard:]\n tuple_list = tuple_list[:-args.discard]\n # do action for each item\n if action == \"sum\":\n print(sum(tuple_list))\n if action == \"mean\":\n if len(tuple_list) == 0: print(\"null\")\n else: print(sum(tuple_list) / len(tuple_list))\n if action == \"print\":\n for item in tuple_list:\n print(item)\ndef main():\n current_step = 0\n tuple_list = []\n for line in sys.stdin:\n line = line.strip()\n if is_valid(line, args.valid_expr):\n ret = extract(line, args.extract_expr)\n if ret: tuple_list.append(ret)\n action(tuple_list, args.reduction_expr)\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/de1d48ad-e588-4563-9abd-adea83542eb6.json b/docs/doc/de1d48ad-e588-4563-9abd-adea83542eb6.json new file mode 100644 index 000000000..9cc582c17 --- /dev/null +++ b/docs/doc/de1d48ad-e588-4563-9abd-adea83542eb6.json @@ -0,0 +1,10 @@ +{ + "summary": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler for the logger. The handler is configured to log INFO level messages and uses a specific log format and date format.", + "details": [ + { + "comment": "This code defines a custom logger class for the news stripper application. It checks if the 'logs' directory exists, creates it if not, and sets up a file handler for the logger. 
The handler is configured to log INFO level messages and uses a specific log format and date format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/logger.py\":0-22", + "content": "\"\"\"\nlogger\n\"\"\"\nimport os\nimport logging\nclass Logger(logging.Logger):\n \"\"\"Customized logger for news stripper\n \"\"\"\n def __init__(self):\n super(Logger, self).__init__(self)\n if not os.path.exists('logs'):\n os.mkdir('logs')\n handler = logging.FileHandler(\"logs/action_detect.log\")\n # handler.setLevel(logging.DEBUG)\n handler.setLevel(logging.INFO)\n format = \"%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s\"\n datefmt = \"%y-%m-%d %H:%M:%S\"\n formatter = logging.Formatter(format, datefmt)\n handler.setFormatter(formatter)\n self.addHandler(handler)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/dec208dc-4d3b-4719-9553-f1d29184b1b2.json b/docs/doc/dec208dc-4d3b-4719-9553-f1d29184b1b2.json new file mode 100644 index 000000000..2b5f299b0 --- /dev/null +++ b/docs/doc/dec208dc-4d3b-4719-9553-f1d29184b1b2.json @@ -0,0 +1,10 @@ +{ + "summary": "This code file is part of the PaddleVideo library and contains the initialization, base class (BaseMultimodal), and a specific multimodal model (ActBert). It also mentions licensing information and a link to access it. The __all__ variable lists the available modules for importing from this file.", + "details": [ + { + "comment": "This code file is part of the PaddleVideo library and contains the initialization, base class (BaseMultimodal), and a specific multimodal model (ActBert). It also mentions licensing information and a link to access it. The __all__ variable lists the available modules for importing from this file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/multimodal/__init__.py\":0-15", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom .base import BaseMultimodal\nfrom .actbert import ActBert\n__all__ = ['BaseMultimodal', 'ActBert']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/dff18248-7c27-421b-ad0f-4520190aaf15.json b/docs/doc/dff18248-7c27-421b-ad0f-4520190aaf15.json new file mode 100644 index 000000000..834746f59 --- /dev/null +++ b/docs/doc/dff18248-7c27-421b-ad0f-4520190aaf15.json @@ -0,0 +1,40 @@ +{ + "summary": "The code defines AssignResult class, initializes MaxIoUAssignerAVA, assigns GT boxes to bboxes using max IOU method and handles multi-class cases. It's registered at BBOX_ASSIGNERS.", + "details": [ + { + "comment": "This code defines a class called \"AssignResult\" for storing the assigned results, including number of gts, ground truth indexes, maximum overlaps, and labels if available. It also includes a method called \"add_gt_\" to add ground truth as assigned results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py\":0-26", + "content": "# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport numpy as np\nfrom ..registry import BBOX_ASSIGNERS\nfrom ..bbox_utils import bbox_overlaps\nclass AssignResult():\n def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):\n self.num_gts = num_gts\n self.gt_inds = gt_inds\n self.max_overlaps = max_overlaps\n self.labels = labels\n def add_gt_(self, gt_labels):\n \"\"\"Add ground truth as assigned results. \"\"\"" + }, + { + "comment": "This code initializes a MaxIoUAssignerAVA object by setting the self_inds and gt_inds attributes using paddle.arange and paddle.squeeze functions, concatenating them with paddle.concat function. It also sets max_overlaps attribute by concatenating gt_label_ones and max_overlaps_squeeze, and updates labels attribute if not None. The class is then registered at BBOX_ASSIGNERS with the decorator @BBOX_ASSIGNERS.register().", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py\":27-48", + "content": " self_inds = paddle.arange(1, len(gt_labels) + 1, dtype=\"int32\")\n gt_inds_squeeze = paddle.squeeze(self.gt_inds, axis=0)\n self.gt_inds = paddle.concat([self_inds, gt_inds_squeeze])\n gt_label_ones = paddle.full((len(gt_labels), ), 1, dtype='float32')\n max_overlaps_squeeze = paddle.squeeze(self.max_overlaps, axis=0)\n self.max_overlaps = paddle.concat([gt_label_ones, max_overlaps_squeeze])\n if self.labels is not None:\n self.labels = paddle.concat([gt_labels, self.labels])\n@BBOX_ASSIGNERS.register()\nclass MaxIoUAssignerAVA():\n \"\"\"Assign a corresponding gt bbox or background to each bbox. \"\"\"\n def __init__(self,\n pos_iou_thr,\n neg_iou_thr,\n min_pos_iou=.0,\n gt_max_assign_all=True,\n ignore_iof_thr=-1,\n ignore_wrt_candidates=True,\n match_low_quality=True,\n gpu_assign_thr=-1,\n iou_calculator=dict(type='BboxOverlaps2D')):" + }, + { + "comment": "The code defines a class that assigns ground truth (GT) boxes to bboxes. It takes in bboxes and GT bboxes as input, and returns the assignment result. The function assign_wrt_overlaps calculates assigned_gt_inds based on the overlaps of bboxes with gts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py\":49-74", + "content": " self.pos_iou_thr = pos_iou_thr\n self.neg_iou_thr = neg_iou_thr\n self.min_pos_iou = min_pos_iou\n self.gt_max_assign_all = gt_max_assign_all\n self.ignore_iof_thr = ignore_iof_thr\n self.ignore_wrt_candidates = ignore_wrt_candidates\n self.gpu_assign_thr = gpu_assign_thr\n self.match_low_quality = match_low_quality\n def assign(self, \n bboxes, \n gt_bboxes, \n gt_labels=None):\n \"\"\"Assign gt to bboxes. \"\"\"\n overlaps = bbox_overlaps(gt_bboxes, bboxes)\n assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)\n return assign_result\n def assign_wrt_overlaps(self, overlaps, gt_labels=None):\n \"\"\"Assign w.r.t. the overlaps of bboxes with gts. 
\"\"\"\n num_gts, num_bboxes = overlaps.shape[0], overlaps.shape[1]\n # 1. assign -1\n assigned_gt_inds = paddle.full((num_bboxes, ), -1, dtype='int32')\n # for each anchor, which gt best overlaps with it\n # for each anchor, the max iou of all gts" + }, + { + "comment": "This code assigns positive and negative labels to anchors based on their IoU with ground truth boxes. If the max IoU is above a certain threshold, it's considered positive. If it's below another threshold, it's negative. This process helps determine which anchor best overlaps with each ground truth box.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py\":75-92", + "content": " max_overlaps, argmax_overlaps = paddle.topk(overlaps, k=1, axis=0)\n # for each gt, which anchor best overlaps with it\n # for each gt, the max iou of all proposals\n gt_max_overlaps, gt_argmax_overlaps = paddle.topk(overlaps, k=1, axis=1) \n # 2. assign negative: below the negative inds are set to be 0\n match_labels = paddle.full(argmax_overlaps.shape, -1, dtype='int32')\n match_labels = paddle.where(max_overlaps < self.neg_iou_thr,\n paddle.zeros_like(match_labels), match_labels)\n # 3. assign positive: above positive IoU threshold\n argmax_overlaps_int32 = paddle.cast(argmax_overlaps, 'int32')\n match_labels = paddle.where(max_overlaps >= self.pos_iou_thr,\n argmax_overlaps_int32 + 1, match_labels)\n assigned_gt_inds = match_labels\n if self.match_low_quality:\n # Low-quality matching will overwirte the assigned_gt_inds\n # assigned in Step 3. Thus, the assigned gt might not be the" + }, + { + "comment": "This code iterates over each ground truth (GT) bounding box, and if the IOU with a detection is above the minimum allowed position IOU, it checks whether all overlapping detections should be assigned to this GT. It creates a tensor of boolean values representing the assignment for each detection and GT pair. This is done by comparing the overlaps matrix and gt_max_overlaps, then reshaping and replacing match labels accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py\":93-108", + "content": " # best one for prediction.\n # For example, if bbox A has 0.9 and 0.8 iou with GT bbox\n # 1 & 2, bbox 1 will be assigned as the best target for bbox A\n # in step 3. However, if GT bbox 2's gt_argmax_overlaps = A,\n # bbox A's assigned_gt_inds will be overwritten to be bbox B.\n # This might be the reason that it is not used in ROI Heads.\n for i in range(num_gts):\n if gt_max_overlaps.numpy()[i] >= self.min_pos_iou:\n if self.gt_max_assign_all:\n equal_x_np = overlaps[i, :].numpy()\n equal_y_np = gt_max_overlaps[i].numpy()\n max_iou_inds = np.equal(equal_x_np, equal_y_np)\n max_iou_inds = paddle.to_tensor(max_iou_inds)\n max_iou_inds = paddle.reshape( max_iou_inds, [1,max_iou_inds.shape[0]] )\n match_labels_gts = paddle.full(max_iou_inds.shape, i+1, dtype='int32')\n match_labels = paddle.where(max_iou_inds, match_labels_gts, match_labels)" + }, + { + "comment": "This code assigns ground truth (GT) labels and indices for maximum IOU Assigner in the AVA dataset. It handles both multi-class cases with multiple classes per label. If there is a match, GT indices are assigned, otherwise, it assigns index + 1. 
Finally, it considers the multi-class case by asserting the existence of more than one class and assigns zeros to the initial labels array before updating them based on the selected gt_labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py\":109-125", + "content": " assigned_gt_inds = match_labels\n else:\n assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1\n if gt_labels is not None:\n # consider multi-class case (AVA)\n assert len(gt_labels[0]) > 1\n assigned_labels = paddle.full([num_bboxes, len(gt_labels[0])], 0, dtype='float32')\n assigned_gt_inds_reshape = assigned_gt_inds.reshape([assigned_gt_inds.shape[1]])\n pos_inds = paddle.nonzero( assigned_gt_inds_reshape , as_tuple=False)\n pos_inds_num = float(paddle.numel(pos_inds))\n if pos_inds_num > 0:\n pos_inds = paddle.squeeze(pos_inds, axis = 1 )\n assigned_gt_inds_squeeze = paddle.squeeze(assigned_gt_inds, axis=0)\n assigned_gt_inds_select = paddle.index_select(assigned_gt_inds_squeeze, pos_inds) - 1\n gt_labels_select = paddle.index_select(gt_labels, assigned_gt_inds_select)\n A = assigned_gt_inds_squeeze" + }, + { + "comment": "This code snippet is part of a max IOU assigner implementation in PaddleVideo. It assigns labels to objects based on the maximum IoU (intersection over union) threshold. If there's only one object, it assigns the ground truth index if the overlap is greater than 0, otherwise sets it to 0. For multiple objects, it uses a where statement to select the max IOU assignment. The assigned labels are then returned as part of the AssignResult.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py\":126-147", + "content": " X = assigned_gt_inds_squeeze - 1\n Y = paddle.zeros_like(X)\n if A.shape[0]==1:\n if float(A) > 0:\n T=X\n else:\n T=Y\n else:\n T = paddle.where(A>0, X, Y)\n S = paddle.index_select(gt_labels, T)\n AE = paddle.expand(A, [S.shape[1], A.shape[0]]) \n AET = paddle.transpose(AE, perm=[1, 0])\n R = paddle.where(AET>0, S, assigned_labels) \n assigned_labels = R\n else:\n assigned_labels = None\n ret = AssignResult(\n num_gts,\n assigned_gt_inds,\n max_overlaps,\n labels=assigned_labels)\n return ret" + } + ] +} \ No newline at end of file diff --git a/docs/doc/dff1d35e-1cfe-4b4e-aeb2-cc058c1f0e6f.json b/docs/doc/dff1d35e-1cfe-4b4e-aeb2-cc058c1f0e6f.json new file mode 100644 index 000000000..db0179523 --- /dev/null +++ b/docs/doc/dff1d35e-1cfe-4b4e-aeb2-cc058c1f0e6f.json @@ -0,0 +1,10 @@ +{ + "summary": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "details": [ + { + "comment": "This code block appears to be a comment section at the beginning of the file, indicating the author's name, date of creation, copyright information, and request for proper citation in case of reuse. The code seems to belong to PaddleVideo's EIVideo application, specifically within QEIVideo's gui module.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/gui/__init__.py\":0-3", + "content": "# Author: Acer Zhang\n# Datetime: 2022/1/6 \n# Copyright belongs to the author.\n# Please indicate the source for reprinting." 
+ } + ] +} \ No newline at end of file diff --git a/docs/doc/e016e8ac-9cdd-4db3-9ef2-8c5fcbd417c6.json b/docs/doc/e016e8ac-9cdd-4db3-9ef2-8c5fcbd417c6.json new file mode 100644 index 000000000..0b5ebae2e --- /dev/null +++ b/docs/doc/e016e8ac-9cdd-4db3-9ef2-8c5fcbd417c6.json @@ -0,0 +1,10 @@ +{ + "summary": "This line imports all functions and classes from the \"trainer\" module in the same package.", + "details": [ + { + "comment": "This line imports all functions and classes from the \"trainer\" module in the same package.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/__init__.py\":0-0", + "content": "from .trainer import *" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e07625a1-98f7-4d70-bfa2-89cc8328f863.json b/docs/doc/e07625a1-98f7-4d70-bfa2-89cc8328f863.json new file mode 100644 index 000000000..f7cf05e69 --- /dev/null +++ b/docs/doc/e07625a1-98f7-4d70-bfa2-89cc8328f863.json @@ -0,0 +1,10 @@ +{ + "summary": "This script runs PaddleVideo server using either PP-TSM or PP-TSN models on different ports. It uses the paddle_serving_server module and is executed as a background process with nohup command.", + "details": [ + { + "comment": "This script runs PaddleVideo server using either PP-TSM or PP-TSN models on different ports. It uses the paddle_serving_server module and is executed as a background process with nohup command.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/run_cpp_serving.sh\":0-9", + "content": "## sample script\n# run paddlevideo server with PP-TSM:\nnohup python3.7 -m paddle_serving_server.serve \\\n--model ./ppTSM_serving_server \\\n--port 9993 &\n## run paddlevideo server with PP-TSN:\n# nohup python3.7 -m paddle_serving_server.serve \\\n# --model ./ppTSN_serving_server \\\n# --port 9993 &" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e0d2abc9-e634-493e-ae40-fbf5b535b19c.json b/docs/doc/e0d2abc9-e634-493e-ae40-fbf5b535b19c.json new file mode 100644 index 000000000..9f4a62451 --- /dev/null +++ b/docs/doc/e0d2abc9-e634-493e-ae40-fbf5b535b19c.json @@ -0,0 +1,30 @@ +{ + "summary": "This code provides URLs to download 13 EuroCup2016 video files from BCEBOS cloud storage for potential analysis or training data.", + "details": [ + { + "comment": "List of EuroCup2016 dataset video URLs for download.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list\":0-10", + "content": 
"https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/63e51df254d2402fac703b6c4fdb4ea9.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/76b5f7ee28d942988c6b224bfac136bd.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/250b88724acf40dbb6d7e8ccb400ef38.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/c9516c903de3416c97dae91a59e968d7.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/e1982c90cdd74abaacc4d0692070b400.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/1be705a8f67648da8ec4b4296fa80895.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/de23c0b2be3a4eb1990c5c657061fb29.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2754615de6e64c4fb95ce1a8095dc1c1.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/299fe30d8f3b4a45b89313fe31f9f3c0.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/22e89747689e4f7e83e3620620c93269.mp4" + }, + { + "comment": "This code provides URLs to download various video files from a specified location for the EuroCup2016 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list\":11-21", + "content": "https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2ceb6c549fc64305a06a75acb355642b.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/719b0a4bcb1f461eabb152298406b861.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/259856b769044b4d8dc94076deb356bf.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d0bd3eab1e794f0f9501c353a6d37827.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/19eb47cc736240d6b2dd930ab69da839.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4435b708af6d48519a6b726144147d51.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/ea16ad2a020643529e257bd6cb11b3c3.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/eeebffbd4ec74222a9c2d0775d79b689.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8cfb4e605af44055b1576c37eb0e3209.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6bca62b57cc449c6935f0b17f28d06be.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/70cfc31e520840b2afca458f93a01ce4.mp4" + }, + { + "comment": "Lists URLs of 13 EuroCup2016 .mp4 video files hosted on BCEBOS cloud storage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list\":22-32", + "content": 
"https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6496960935e845578e391a5916739752.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d6d25403a4bb4784aecff5f21fd00dc5.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/3e23d452a082403391f8abfb87bf2fb4.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4c5d9d9af4f044c4a68d134061dc264f.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6994844c64b44c26b935cee9604bef0a.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d6322cb95f6a4402ac80432b561abd5d.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2c8b5587083a4784a51622e4fec87ccd.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5faa60d70ed141de8560110e840f2048.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/45d08bc5cb0f424f9ed9d7874eb561cd.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6630aaf0e32146088d0b624e9288f071.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f2edbee29c1b4966b3a410260f78fbe3.mp4" + }, + { + "comment": "This code lists URLs for video files belonging to the EuroCup2016 dataset, stored on a specific BCEBOS server.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list\":33-43", + "content": "https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f24116fdd6a54214991db32f7dddef67.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/0265731a0c6f4a9398c88db8e3d4a3bc.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/02d2de09997f4215b06e3b00ff0502a0.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/9c231896c56a43f291a5e190949f4333.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4afbbf9afcd44dfea45b044117cccb48.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/745db97a080d4f44b450dc17a2bcf069.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5933d0ce17854483b81a318d7d45a34e.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d2cfef2da9f84237a6950c7f6659655c.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5572686cb90f440988ded956a60e555d.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8962ac5a332346e180c79d701ae0a175.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f6e64ee9b13a4088b24c45c257894c1e.mp4" + }, + { + "comment": "This code contains a list of URLs for EuroCup2016 video files stored in \"paddle-model-ecology.bj.bcebos.com/data/EuroCup2016\". 
Each URL represents an MP4 file related to the event, potentially used for analysis or training data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list\":44-48", + "content": "https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f6ed2b612b3d43baa0726be8b14ebe7c.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8ab7b0cba5744eb3b6fb10003dfda383.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/1f0a0698e38d493988fe42a50f7e8723.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/737fdb054ca141f2a45013c1740dd0a0.mp4\nhttps://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/bab63a9bcf204e4b99c4a887a01bfd60.mp4" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e11e8e99-e1c2-42f5-9d8e-2c3f4c4baba4.json b/docs/doc/e11e8e99-e1c2-42f5-9d8e-2c3f4c4baba4.json new file mode 100644 index 000000000..573b233b6 --- /dev/null +++ b/docs/doc/e11e8e99-e1c2-42f5-9d8e-2c3f4c4baba4.json @@ -0,0 +1,10 @@ +{ + "summary": "This script sets the paths for OpenCV, PaddlePaddle inference, CUDA, cuDNN, and TensorRT directories. It clears existing build directory, creates a new one, navigates to it, runs cmake with specified options, and then compiles the project using 'make -j' command.", + "details": [ + { + "comment": "This script sets the paths for OpenCV, PaddlePaddle inference, CUDA, cuDNN, and TensorRT directories. It clears existing build directory, creates a new one, navigates to it, runs cmake with specified options, and then compiles the project using 'make -j' command.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/tools/build.sh\":0-21", + "content": "OPENCV_DIR=your_opencv_dir\nLIB_DIR=your_paddle_inference_dir\nCUDA_LIB_DIR=your_cuda_lib_dir\nCUDNN_LIB_DIR=your_cudnn_lib_dir\nTENSORRT_DIR=your_tensorRT_dir\nBUILD_DIR=build\nrm -rf ${BUILD_DIR}\nmkdir ${BUILD_DIR}\ncd ${BUILD_DIR}\ncmake .. \\\n -DPADDLE_LIB=${LIB_DIR} \\\n -DWITH_MKL=ON \\\n -DWITH_GPU=OFF \\\n -DWITH_STATIC_LIB=OFF \\\n -DWITH_TENSORRT=OFF \\\n -DOPENCV_DIR=${OPENCV_DIR} \\\n -DCUDNN_LIB=${CUDNN_LIB_DIR} \\\n -DCUDA_LIB=${CUDA_LIB_DIR} \\\n -DTENSORRT_DIR=${TENSORRT_DIR} \\\nmake -j" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e124873b-e846-4561-baea-8915df68a541.json b/docs/doc/e124873b-e846-4561-baea-8915df68a541.json new file mode 100644 index 000000000..6baf26742 --- /dev/null +++ b/docs/doc/e124873b-e846-4561-baea-8915df68a541.json @@ -0,0 +1,25 @@ +{ + "summary": "VideoDataset is a subclass of BaseDataset that loads and processes raw videos, using an index file containing video information. It handles corrupted files with retries and error logging. The `prepare_train` and `prepare_test` methods return image data and labels for training and testing respectively.", + "details": [ + { + "comment": "This code is for VideoDataset class, a subclass of BaseDataset, that loads raw videos and applies specified transforms. It uses index file with multiple lines where each line indicates information about videos in the dataset. The class is registered within the DATASETS registry and logger is initialized.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/video.py\":0-30", + "content": "# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport copy\nimport random\nimport numpy as np\nfrom ..registry import DATASETS\nfrom .base import BaseDataset\nfrom ...utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@DATASETS.register()\nclass VideoDataset(BaseDataset):\n \"\"\"Video dataset for action recognition\n The dataset loads raw videos and apply specified transforms on them.\n The index file is a file with multiple lines, and each line indicates" + }, + { + "comment": "This code initializes a new class for loading index file data, which contains video information. The index file has path and label entries separated by whitespace. The load_file method reads the index file to retrieve filename and labels for each video.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/video.py\":31-56", + "content": " a sample video with the filepath and label, which are split with a whitesapce.\n Example of a inde file:\n .. code-block:: txt\n path/000.mp4 1\n path/001.mp4 1\n path/002.mp4 2\n path/003.mp4 2\n Args:\n file_path(str): Path to the index file.\n pipeline(XXX): A sequence of data transforms.\n **kwargs: Keyword arguments for ```BaseDataset```.\n \"\"\"\n def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):\n self.num_retries = num_retries\n self.suffix = suffix\n super().__init__(file_path, pipeline, **kwargs)\n def load_file(self):\n \"\"\"Load index file to get video information.\"\"\"\n info = []\n with open(self.file_path, 'r') as fin:\n for line in fin:\n line_split = line.strip().split()\n filename, labels = line_split\n #TODO(hj): Required suffix format: may mp4/avi/wmv\n filename = filename + self.suffix" + }, + { + "comment": "This code defines a class with methods to prepare data for training and testing. It handles potential corrupted video files by retrying if an exception occurs, and logs the error message along with the number of retries. The `prepare_train` method returns image data (`imgs`) and corresponding labels from the given index in the dataset. Similarly, the `prepare_test` method returns image data and labels for testing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/video.py\":57-79", + "content": " if self.data_prefix is not None:\n filename = osp.join(self.data_prefix, filename)\n info.append(dict(filename=filename, labels=int(labels)))\n return info\n def prepare_train(self, idx):\n \"\"\"TRAIN & VALID. 
Prepare the data for training/valid given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])\n def prepare_test(self, idx):" + }, + { + "comment": "This code attempts to load a video file and prepare the data for testing. It handles potential exceptions caused by corrupted files by retrying multiple times. If an exception occurs, it logs an error message and retries with another randomly selected file index. The function returns the images and labels from the successfully loaded video file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/video.py\":80-94", + "content": " \"\"\"TEST. Prepare the data for test given the index.\"\"\"\n #Try to catch Exception caused by reading corrupted video file\n for ir in range(self.num_retries):\n try:\n results = copy.deepcopy(self.info[idx])\n results = self.pipeline(results)\n except Exception as e:\n #logger.info(e)\n if ir < self.num_retries - 1:\n logger.info(\n \"Error when loading {}, have {} trys, will try again\".\n format(results['filename'], ir))\n idx = random.randint(0, len(self.info) - 1)\n continue\n return results['imgs'], np.array([results['labels']])" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e18d8052-04a5-4da4-9819-5fc4c1383835.json b/docs/doc/e18d8052-04a5-4da4-9819-5fc4c1383835.json new file mode 100644 index 000000000..256466aef --- /dev/null +++ b/docs/doc/e18d8052-04a5-4da4-9819-5fc4c1383835.json @@ -0,0 +1,25 @@ +{ + "summary": "The code defines a 1D recognizer model in PaddleVideo, processing both image and audio data for training, validation, testing, and inference. It includes forward pass, loss computation, metrics calculations and handles RGB and audio data batches.", + "details": [ + { + "comment": "This code defines a 1D recognizer model framework in PaddleVideo. It includes the forward_net function to define how the model trains from input to output and the train_step function for training steps. The data batch contains rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, and labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py\":0-28", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\n@RECOGNIZERS.register()\nclass Recognizer1D(BaseRecognizer):\n \"\"\"1D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n lstm_logit, lstm_output = self.head(imgs)\n return lstm_logit, lstm_output\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels = data_batch" + }, + { + "comment": "The code defines a recognizer1d model that processes both image and audio data. It includes methods for forward pass, validation, testing, and inference steps. In the forward pass, it takes input images and calculates logits and output from the LSTM network. The loss is then computed based on these logits and labels, and metrics such as hit_at_one, perr, and gap are calculated using the output and labels. The validation and testing steps perform similar calculations to those in the training step. In the inference step, only image and audio data are processed to produce output for each input.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py\":29-60", + "content": " imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward\n lstm_logit, lstm_output = self.forward_net(imgs)\n loss = self.head.loss(lstm_logit, labels)\n hit_at_one, perr, gap = self.head.metric(lstm_output, labels)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['hit_at_one'] = hit_at_one\n loss_metrics['perr'] = perr\n loss_metrics['gap'] = gap\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n return self.train_step(data_batch)\n def infer_step(self, data_batch):\n \"\"\"Infering setp.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch\n imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward" + }, + { + "comment": "This code defines a 1D recognizer model framework, which includes a forward_net function to define how the model trains from input to output and a train_step function for the training process. 
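The general shape of that training-step contract can be sketched as follows; this is plain Python with placeholder callables (`forward_fn`, `loss_fn`, `metric_fn`), not the PaddleVideo classes themselves:

```python
def train_step(data_batch, forward_fn, loss_fn, metric_fn):
    """Minimal sketch: unpack a batch, run the forward pass, return a dict of loss metrics."""
    *inputs, labels = data_batch                 # e.g. RGB and audio features plus labels
    logits, output = forward_fn(inputs)          # head produces logits and pooled output
    loss = loss_fn(logits, labels)
    top1, top5 = metric_fn(output, labels)
    return {"loss": loss, "top1": top1, "top5": top5}
```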
It takes in data batches, including both RGB and audio data, and outputs loss metrics including loss, top1, and top5.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py\":61-90", + "content": " lstm_logit, _ = self.forward_net(imgs)\n return lstm_logit\n@RECOGNIZERS.register()\nclass RecognizerAction(BaseRecognizer):\n \"\"\"1D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n lstm_logit, lstm_output = self.head(imgs)\n return lstm_logit, lstm_output\n def train_step(self, data_batch):\n \"\"\"Training step.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels, labels_iou = data_batch\n imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward\n output_logit, output_iou = self.forward_net(imgs)\n loss = self.head.loss(output_logit, output_iou, labels, labels_iou)\n top1, top5 = self.head.metric(output_logit, labels)\n loss_metrics = dict()\n loss_metrics['loss'] = loss\n loss_metrics['top1'] = top1\n loss_metrics['top5'] = top5\n return loss_metrics" + }, + { + "comment": "The code contains three methods: `val_step`, `test_step`, and `infer_step`. These steps perform validating, testing, and inference, respectively. In all three cases, the data batch is passed to the `train_step` method, suggesting a shared implementation between these steps. The `infer_step` specifically expects certain types of data: RGB data with length and mask, as well as audio data with its respective length and mask, in a tuple format. It then processes this data using `forward_net`, returning output logits and IOU values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py\":92-110", + "content": " def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n return self.train_step(data_batch)\n def test_step(self, data_batch):\n \"\"\"Testing setp.\n \"\"\"\n return self.train_step(data_batch)\n def infer_step(self, data_batch):\n \"\"\"Infering setp.\n \"\"\"\n rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch\n imgs = [(rgb_data, rgb_len, rgb_mask),\n (audio_data, audio_len, audio_mask)]\n # call forward\n output_logit, output_iou = self.forward_net(imgs)\n return output_logit, output_iou" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e2249165-137e-4134-a239-582b14c701ed.json b/docs/doc/e2249165-137e-4134-a239-582b14c701ed.json new file mode 100644 index 000000000..1d1d6620d --- /dev/null +++ b/docs/doc/e2249165-137e-4134-a239-582b14c701ed.json @@ -0,0 +1,20 @@ +{ + "summary": "The Figure Skating Dataset offers 30 fps competition videos with Open Pose key points, and includes train_data, train_label, test_A_data, and test_B_data, downloadable from the competition's homepage. RGB datasets unavailable due to copyright reasons.", + "details": [ + { + "comment": "Figure Skating Dataset provides video materials from Figure Skating Championships (2017-2018) standardized to 30 frames per second and 1080 * 720 image size. It uses Open Pose for key points extraction and saves data in .npy format. The dataset includes train_data, train_label, test_A_data, and test_B_data with respective counts. 
Train_label can be read using np.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/fsd.md\":0-25", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/fsd.md) | English\n# Figure Skating Dataset\n- [Introduction](#Introduction)\n- [Download](#Download)\n---\n## Introduction\nIn figure skating, compared with other sports, human posture and trajectory show the characteristics of strong complexity, which is helpful to the research of fine-grained action recognition tasks.\nFor FSD Dataset, all video materials are collected from the Figure Skating Championships from 2017 to 2018. The frame rate of the video is uniformly standardized to 30 frames per second, and the image size is 1080 * 720 to ensure the relative consistency of the dataset. After that, we use the 2D pose estimation algorithm Open Pose to extract frame by frame key points from the video, and finally save the data in `.npy` format.\nThe directory structure of training dataset and test dataset is as follows:\n```txt\ntrain_data.npy # 2922\ntrain_label.npy # 2922\ntest_A_data.npy # 628\ntest_B_data.npy # 634\n```\n`train_label.npy` can be read using `np." + }, + { + "comment": "This code describes the structure and meaning of a tensor in the dataset, with dimensions N (number of samples), C (coordinates and confidence of joint points), T (duration of action), V (number of joint points), and M (number of athletes). It also includes an example image of a skeleton to illustrate the joint points' positions. The data can be downloaded from the competition homepage after registration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/fsd.md\":25-46", + "content": "load()`, each element is an integer variable with a value between 0-29, representing the label of the action. `data.npy` can be read using `np.load()`, return a tensor with the shape of `N\u00d7C\u00d7T\u00d7V\u00d7M`, the specific meaning of each dimension is as follows:\n| Dimension | Size | Meaning\t| Notes |\n| :---- | :----: | :----: | :---- |\n| N\t| N\t| Number of samples | - |\n| C | 3\t| The coordinates and confidence of each joint point respectively |\trescale to -1~1 |\n| T\t| 1500 |\t The duration of the action\t| The actual length of some actions may be less than 1500, in such case we will pad 0 to ensure the unity of T dimension. |\n| V |\t25 | Number of joint points |\tSee the skeleton example below for the meaning of specific joint points. |\n| M |\t1\t| Number of athletes\t| - |\nskeleton example\uff1a\n
(figure: skeleton diagram of the 25 joint points)
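As a quick how-to for the `.npy` layout described above, the files can be inspected with NumPy; the file names come from the directory listing earlier in this page, and the printed shapes follow the N×C×T×V×M table:

```python
import numpy as np

data = np.load("train_data.npy")     # shape (N, C, T, V, M) = (2922, 3, 1500, 25, 1)
labels = np.load("train_label.npy")  # N integer action labels in the range 0-29

print(data.shape, labels.shape)
print("first sample label:", int(labels[0]))
# C holds (x, y, confidence) per joint, rescaled to -1~1; shorter clips are zero-padded to T=1500.
```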
\n## Download\nYou can get the download link after registering on the [competition homepage](https://www.datafountain.cn/competitions/519)." + }, + { + "comment": "This code provides dataset information for a PaddleVideo English document. It mentions the Train set with its data and label file URLs, while noting that RGB datasets are not available due to copyright reasons. TestA's data and labels will be provided soon.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/fsd.md\":48-54", + "content": "| Set | Data | Label\t|\n| :---- | :----: | :----: |\n| Train\t| [train_data.npy](https://videotag.bj.bcebos.com/Data/FSD_train_data.npy)\t| [train_label.npy](https://videotag.bj.bcebos.com/Data/FSD_train_label.npy) |\n| TestA\t| comming soon\t| comming soon |\n> RGB datasets would not be provided for copyright reasons." + } + ] +} \ No newline at end of file diff --git a/docs/doc/e24cd3d1-be52-44ab-a1fb-b58bc82552de.json b/docs/doc/e24cd3d1-be52-44ab-a1fb-b58bc82552de.json new file mode 100644 index 000000000..650f6fdc6 --- /dev/null +++ b/docs/doc/e24cd3d1-be52-44ab-a1fb-b58bc82552de.json @@ -0,0 +1,55 @@ +{ + "summary": "This code performs object detection in computer vision tasks and evaluates table tennis action predictions, computing evaluation metrics to optimize F1 scores. The best performing combination is stored for future use.", + "details": [ + { + "comment": "This code is importing necessary libraries and defining global variables. It loads ground truth data from specified labels for different datasets and evaluation splits, setting frame per second (fps) values as well. It uses these loaded gts to calculate proposal-box related information based on ground truth sequence.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":0-40", + "content": "\"\"\"\nget instance for lstm\n\u6839\u636egts\u8ba1\u7b97\u6bcf\u4e2aproposal_bmn\u7684iou\u3001ioa\u3001label\u7b49\u4fe1\u606f\n\"\"\"\nimport os\nimport sys\nimport json\nimport random\nimport pickle\nimport numpy as np\nimport io\nsys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')\ndataset = \"/home/work/datasets\"\nlabel_index_file = './configs/index_label_football_7.json'\neval_datasets = ['EuroCup2016']\nlabel_files = {\n 'train': 'label_cls8_train.json',\n 'validation': 'label_cls8_val.json'\n}\nglobal fps, mode\nlabel_index = json.load(open(label_index_file, 'rb'))\ndef load_gts():\n global fps\n gts_data = {'fps': 0, 'gts': {}}\n for eval_data in eval_datasets:\n for item, value in label_files.items():\n label_file = '{}/{}/{}'.format(dataset, eval_data, value)\n gts = json.load(open(label_file, 'rb'))\n gts_data['fps'] = gts['fps']\n fps = gts['fps']\n for gt in gts['gts']:\n gt['mode'] = item\n basename = '{}/{}/mp4/{}'.format(dataset, eval_data,\n os.path.basename(gt['url']))" + }, + { + "comment": "This code defines a function to compute the intersection of union (IoU) between two intervals, and another function that converts a list of proposals into final detections based on a score threshold. The computed IoU is used to filter out unwanted proposals, and only keep those with high confidence scores. 
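To make the interval-overlap computation concrete, here is a small self-contained sketch of 1-D temporal IoU between two segments; the toy segments are invented, and the repo's `computeIoU` additionally checks that labels and clip names match and can fall back to IoA:

```python
def temporal_iou(a, b):
    """IoU of two time segments given as dicts with 'start'/'end' in seconds."""
    inter = max(0.0, min(a["end"], b["end"]) - max(a["start"], b["start"]))
    union = (a["end"] - a["start"]) + (b["end"] - b["start"]) - inter
    return inter / union if union > 0 else 0.0

print(temporal_iou({"start": 10, "end": 20}, {"start": 15, "end": 25}))  # -> 0.333...
```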
This can be useful for object detection tasks in computer vision applications.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":41-72", + "content": " gts_data['gts'][basename] = gt\n return gts_data['gts']\ndef computeIoU(e1, e2):\n \"\"\"\n clc iou and ioa\n \"\"\"\n if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']):\n return 0.\n area1 = e1[\"end\"] - e1[\"start\"]\n area2 = e2[\"end\"] - e2[\"start\"]\n x1 = np.maximum(e1[\"start\"], e2[\"start\"])\n x2 = np.minimum(e1[\"end\"], e2[\"end\"])\n inter = np.maximum(0.0, x2 - x1)\n iou = 0.0 if (area1 + area2 -\n inter) == 0 else inter * 1.0 / (area1 + area2 - inter)\n if not mode == 'proposal':\n iou = 0.0 if area2 == 0 else inter * 1.0 / area2\n return iou\ndef convert_proposal(boxes, basename, score_threshold=0.01):\n boxes = sorted(boxes, key=lambda x: float(x['score']), reverse=True)\n res = []\n for box in boxes:\n if not float(box['score']) >= score_threshold:\n continue\n res.append({\n 'basename': basename,\n 'start': int(float(box['start']) / fps),\n 'end': int(float(box['end']) / fps)," + }, + { + "comment": "The code contains three functions: 'convert_classify', 'convert_groundtruth', and 'convert_time_to_frame'. The 'convert_classify' function sorts boxes based on their classify and iou scores, then appends qualified boxes to a result list. 'convert_groundtruth' appends box labels to the result list based on the phase parameter. The 'convert_time_to_frame' function converts time strings to frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":73-107", + "content": " 'label': 0\n })\n return res\ndef convert_classify(boxes, basename, iou_threshold, score_threshold):\n boxes = sorted(boxes,\n key=lambda x:\n (float(x['classify_score']), float(x['iou_score'])),\n reverse=True)\n def convert_time_to_frame(time_type):\n return int(time_type)\n h, m, s = time_type.split(':')\n return int(h) * 3600 + int(m) * 60 + int(s)\n res = []\n for box in boxes:\n if not (box['iou_score'] >= iou_threshold\n and box['classify_score'] >= score_threshold):\n continue\n res.append({\n 'basename': basename,\n 'start': convert_time_to_frame(box['start_time']),\n 'end': convert_time_to_frame(box['end_time']),\n 'label': box['label_id']\n })\n return res\ndef convert_groundtruth(boxes, basename, phase=None):\n res = []\n for box in boxes:\n for item in box['label_ids']:\n label = 0 if phase == 'proposal' else item" + }, + { + "comment": "This code contains three functions: \"res.append\" appends a dictionary to a list with information about video frames, \"print_head\" prints headers for table output, and \"print_result\" prints the evaluation results of the model in a formatted way. 
The code is likely part of an image classification or object detection algorithm that evaluates the performance of the model on a set of video frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":108-141", + "content": " res.append({\n 'basename': basename,\n 'start': box['start_id'],\n 'end': box['end_id'],\n 'label': label\n })\n return res\ndef print_head(iou):\n print(\"\\nioa = {:.1f}\".format(iou))\n res_str = ''\n for item in ['label_name']:\n res_str += '{:<12s}'.format(item)\n for item in [\n 'label_id', 'precision', 'recall', 'hit_prop', 'num_prop',\n 'hit_gts', 'num_gts'\n ]:\n res_str += '{:<10s}'.format(item)\n print(res_str)\ndef print_result(res_dict, label='avg'):\n if label == 'avg':\n res_str = '{:<22s}'.format(str(label))\n else:\n res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)],\n str(label), chr(12288))\n for item in ['prec', 'recall']:\n res_str += '{:<10.4f}'.format(res_dict[item])\n for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']:\n res_str += '{:<10d}'.format(res_dict[item])\n print(res_str)" + }, + { + "comment": "Function `evaluation` takes in lists of predicted boxes (`res_boxes`) and ground truth boxes (`gts_boxes`), along with IOU and label ranges. It computes intersection over union (IoU) between each predicted box and ground truth box, creating a map of IoUs. The map is then reshaped into a 2D array for easier computation. The function calculates the maximum IoU per row in the map, and the index of this maximum value. It also loops through label and IOU ranges to calculate various statistics for subsets of labels and IOU thresholds. If `show_sub` is True, it prints a header indicating the current subset being evaluated. If there are no predicted boxes for a particular label in the current iteration, the function skips that iteration without computing results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":144-165", + "content": "def evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub=False):\n iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \\\n for gtsId in gts_boxes]\n iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes)))\n hit_map_prop_total = np.max(iou_map, axis=1)\n hit_map_index_total = np.argmax(iou_map, axis=1)\n res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']\n for iou_threshold in iou_range:\n if show_sub:\n print_head(iou_threshold)\n iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total])\n average_results = {}\n for label_id in label_range:\n sub_results = {}\n label_prop = np.array([k['label'] == label_id for k in res_boxes])\n label_gts = np.array([k['label'] == label_id for k in gts_boxes])\n sub_results['num_prop'] = sum(label_prop)\n sub_results['num_gts'] = sum(label_gts)\n if sub_results['num_prop'] == 0:" + }, + { + "comment": "The code calculates precision and recall scores for a set of results. It checks if there are any hits, then calculates the hit properties and ground truths. If show_sub is True, it prints the subresults for each label. 
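A compact sketch of the precision/recall/F1 bookkeeping described here, using made-up hit counts for a single label at one IoU threshold:

```python
# Toy counts (invented): matched predictions / all predictions, recovered gts / all gts.
hit_prop, num_prop = 18, 25
hit_gts, num_gts = 16, 20

precision = hit_prop / num_prop if num_prop else 0.0
recall = hit_gts / num_gts if num_gts else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
print(f"prec={precision:.4f} recall={recall:.4f} F1={f1:.4f}")
```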
The average results are also updated based on these calculations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":166-185", + "content": " hit_prop_index = []\n else:\n hit_prop_index = label_prop & iou_prop\n sub_results['hit_prop'] = sum(hit_prop_index)\n sub_results['hit_gts'] = len(\n set(hit_map_index_total[hit_prop_index]))\n sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \\\n else sub_results['hit_prop'] * 1.0 / sub_results['num_prop']\n sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \\\n else sub_results['hit_gts'] * 1.0 / sub_results['num_gts']\n if show_sub:\n print_result(sub_results, label=label_id)\n for item in res_dict:\n if not item in average_results:\n average_results[item] = 0\n average_results[item] += sub_results[item]\n if len(label_range) == 1: # proposal \u4e0d\u9700\u8981\u8f93\u51faaverage\u503c\n continue\n average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \\" + }, + { + "comment": "This code calculates average precision and recall for a table tennis prediction model. It returns an F1 score, considers IOU and score thresholds, handles different phases, and optionally prints the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":186-209", + "content": " else average_results['hit_prop'] * 1.0 / average_results['num_prop']\n average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \\\n else average_results['hit_gts'] * 1.0 / average_results['num_gts']\n if show_sub:\n print_result(average_results)\n average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \\\n else 2 * average_results['prec'] * average_results['recall'] / \\\n (average_results['prec'] + average_results['recall'])\n return average_results\ndef get_eval_results(predicts,\n gts_data,\n phase,\n iou_threshold=0.3,\n score_threshold=0.3,\n show_sub=False):\n global mode\n mode = phase\n res_boxes = []\n gts_boxes = []\n for ped_data in predicts:\n basename = ped_data['video_name']" + }, + { + "comment": "This code evaluates the performance of a video analysis model for table tennis. It determines if the data is an evaluation dataset and then extends the results and ground truth boxes based on the phase (proposal or classification). It sets label and iou thresholds for evaluation and finally calculates the evaluation results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":211-238", + "content": " # eval sub data\n such_eval = False\n for eval_name in eval_datasets:\n if eval_name in basename:\n such_eval = True\n break\n if not such_eval:\n continue\n gts = gts_data[basename]['actions']\n if phase == 'proposal':\n res_boxes.extend(\n convert_proposal(ped_data['bmn_results'], basename,\n score_threshold))\n gts_boxes.extend(\n convert_groundtruth(gts, basename, phase='proposal'))\n label_range = [0]\n iou_range = np.arange(0.1, 1, 0.1)\n else:\n res_boxes.extend(\n convert_classify(ped_data['action_results'], basename,\n iou_threshold, score_threshold))\n gts_boxes.extend(convert_groundtruth(gts, basename))\n label_range = range(1, len(label_index))\n iou_range = np.arange(0.5, 0.6, 0.1)\n eval_results = evaluation(res_boxes,\n gts_boxes," + }, + { + "comment": "This code evaluates the performance of table tennis action predictions. 
It takes in predicted results and ground truth data, then computes evaluation metrics for different IOU and score thresholds. The best performing combination is stored for future reference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":239-269", + "content": " label_range,\n iou_range,\n show_sub=show_sub)\n return eval_results\nif __name__ == \"__main__\":\n result_file = sys.argv[1]\n predicts = json.load(open(result_file, 'r', encoding='utf-8'))\n gts_data = load_gts()\n get_eval_results(predicts,\n gts_data,\n 'proposal',\n score_threshold=0.03,\n show_sub=True)\n #get_eval_results(predicts, gts_data, 'actions')\n best_F1 = -0.1\n best_res = {}\n best_iou_threshold = 0.\n best_score_threshold = 0.\n for iou_threshold in np.arange(0.1, 0.9, 0.1):\n for score_threshold in np.arange(0.1, 1, 0.1):\n avg_res = get_eval_results(predicts,\n gts_data,\n 'actions',\n iou_threshold=iou_threshold,\n score_threshold=score_threshold,\n show_sub=False)" + }, + { + "comment": "This code snippet is optimizing the iou and score thresholds for better F1 scores, then printing them and using the best values to get evaluation results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/TableTennis/predict/eval.py\":270-286", + "content": " if best_F1 < avg_res['F1']:\n best_F1 = avg_res['F1']\n best_res = avg_res\n best_iou_threshold = iou_threshold\n best_score_threshold = score_threshold\n print(\"best iou threshold = {:.1f}\".format(best_iou_threshold))\n print(\"best score threshold = {:.1f}\".format(best_score_threshold))\n print('best F1 score = {:.4f}'.format(best_F1))\n print_head(0.5)\n print_result(best_res)\n get_eval_results(predicts,\n gts_data,\n 'actions',\n iou_threshold=best_iou_threshold,\n score_threshold=best_score_threshold,\n show_sub=True)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e24e448a-f4f4-4034-a077-c81c2af43313.json b/docs/doc/e24e448a-f4f4-4034-a077-c81c2af43313.json new file mode 100644 index 000000000..29fe6c32d --- /dev/null +++ b/docs/doc/e24e448a-f4f4-4034-a077-c81c2af43313.json @@ -0,0 +1,30 @@ +{ + "summary": "This code compares PaddleVideo's speed with popular frameworks, highlighting Slowfast's 2x faster speed and evaluates action segmentation model performance on Breakfast dataset. Tested on V100 GPU with batch size 2.", + "details": [ + { + "comment": "This code provides a benchmark comparison of PaddleVideo with other popular frameworks and official releases in terms of speed. It specifies the environment, hardware, and software used for the experiments. The statistics include average training time and training speed measured in instances per second (ips). The dataset is prepared according to a specific method to ensure fairness in the comparison.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/benchmark.md\":0-26", + "content": "[\u7b80\u4f53\u4e2d\u6587](../zh-CN/benchmark.md) | English\n# Benchmark\nWe compare our results with some popular frameworks and official releases in terms of speed.\n## Environment\n### Hardware\n- 8 NVIDIA Tesla V100 (16G) GPUs\n- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz\n### Software\n- Python 3.7\n- PaddlePaddle2.0\n- CUDA 10.1\n- CUDNN 7.6.3\n- NCCL 2.1.15\n- GCC 8.2.0\n## Experiments and Statistics\nThe statistic is the average training time, including data processing and model training time, and the training speed is measured with ips(instance per second). 
Note that we skip the first 50 iters as they may contain the device warmup time.\nHere we compare PaddleVideo with the other video understanding toolkits in the same data and model settings.\nTo ensure the fairness of the comparison, the comparison experiments were conducted under the same hardware environment and using the same dataset. The dataset we used is generated by the [data preparation](dataset/k400.md), and in each model setting, the same data preprocessing methods are applied to make sure the same feature input." + }, + { + "comment": "This table compares the inference performance (ips) of various video understanding models using PaddleVideo. It shows the batch size, number of GPUs used, and ips for each model. Slowfast model stands out for its 2x faster speed compared to counterparts. TSM and TSN have higher ips than others, but the reference implementation is not available.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/benchmark.md\":28-44", + "content": "Significant improvement can be observed when comparing with other video understanding framework as shown in the table below, Especially the [Slowfast](../../configs/recognition/slowfast/slowfast.yaml) model is nearly 2x faster than the counterparts.\n## Results\n### Recognizers\n| Model | batch size x gpus | PaddleVideo(ips) | Reference(ips) | MMAction2 (ips) | PySlowFast (ips)|\n| :------: | :-------------------:|:---------------:|:---------------: | :---------------: |:---------------: |\n| [TSM](../../configs/recognition/tsm/tsm.yaml) | 16x8 | 58.1 | 46.04(temporal-shift-module) | To do | X |\n| [PPTSM](../../configs/recognition/tsm/pptsm.yaml) | 16x8 | 57.6 | X | X | X |\n| [TSN](../../configs/recognition/tsn/tsn.yaml) | 16x8 | 841.1 | To do (tsn-pytorch) | To do | X |\n| [Slowfast](../../configs/recognition/slowfast/slowfast.yaml)| 16x8 | 99.5 | X | To do | 43.2 |\n| [Attention_LSTM](../../configs/recognition/attention_lstm/attention_lstm.yaml) | 128x8 | 112.6 | X | X | X |\n### Localizers" + }, + { + "comment": "This code provides a comparison of performance and accuracy between classical and popular sequential action segmentation models, with metrics such as F1@0.5, model names, Flops(M), Params(M), and test/inference times for different batch sizes. 
It is part of a repository that aims to compare these models using the Breakfast dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/benchmark.md\":46-63", + "content": "| Model | PaddleVideo(ips) |MMAction2 (ips) |BMN(boundary matching network) (ips)|\n| :--- | :---------------: | :-------------------------------------: | :-------------------------------------: |\n| [BMN](../../configs/localization/bmn.yaml) | 43.84 | x | x |\n### Segmenters\nThis repo provides performance and accuracy comparison between classical and popular sequential action segmentation models\n| Model | Metrics | Value | Flops(M) |Params(M) | test time(ms) bs=1 | test time(ms) bs=2 | inference time(ms) bs=1 | inference time(ms) bs=2 |\n| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |\n| MS-TCN | F1@0.5 | 38.8% | 791.360 | 0.8 | 170 | - | 10.68 | - |\n| ASRF | F1@0.5 | 55.7% | 1,283.328 | 1.3 | 190 | - | 16.34 | - |\n* Model: model name, for example: PP-TSM\n* Metrics: Fill in the indicators used in the model test, and the data set used is **breakfast**\n* Value: Fill in the value corresponding to the metrics index, and generally keep two decimal places\n* Flops(M): The floating-" + }, + { + "comment": "This code is describing the performance measurements for a PaddleVideo model. It calculates the model parameters (M), test time, and inference time with specific batch sizes and input tensor shapes. The test data used is \"breakfast\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/benchmark.md\":63-67", + "content": "point computation required for one forward operation of the model can be called `paddlevideo/tools/summary.py`script calculation (different models may need to be modified slightly), keep one decimal place, and measure it with data **input tensor with shape of (1, 2048, 1000)**\n* Params(M): The model parameter quantity, together with flops, will be calculated by the script, and one decimal place will be reserved\n* test time(ms) bs=1: When the python script starts the batchsize = 1 test, the time required for a sample is kept to two decimal places. The data set used in the test is **breakfast**.\n* test time(ms) bs=2: When the python script starts the batchsize = 2 test, the time required for a sample is kept to two decimal places. The sequential action segmentation model is generally a full convolution network, so the batch of training, testing and reasoning_ Size is 1. The data set used in the test is **breakfast**.\n* inference time(ms) bs=1: When the reasoning model is tested with GPU (def" + }, + { + "comment": "The code states that the reasoning model is tested on a GPU (default V100) with batch size 2. The time required for a sample is reserved to two decimal places, and the dataset used for this particular reasoning process is \"breakfast\". Additionally, it mentions that the sequential action segmentation model is generally a full convolution network, which typically has a batch size of 1 during training, testing, and reasoning.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/benchmark.md\":67-68", + "content": "ault V100) with batchsize = 1, the time required for a sample is reserved to two decimal places. The dataset used for reasoning is **breakfast**.\n* inference time(ms) bs=2: When the reasoning model is tested with GPU (default V100) with batchsize = 1, the time required for a sample is reserved to two decimal places. 
The sequential action segmentation model is generally a full convolution network, so the batch of training, testing and reasoning_ Size is 1. The dataset used for reasoning is **breakfast**." + } + ] +} \ No newline at end of file diff --git a/docs/doc/e3428502-31fd-4c16-ae7e-815bce30744d.json b/docs/doc/e3428502-31fd-4c16-ae7e-815bce30744d.json new file mode 100644 index 000000000..60d4c970c --- /dev/null +++ b/docs/doc/e3428502-31fd-4c16-ae7e-815bce30744d.json @@ -0,0 +1,10 @@ +{ + "summary": "Class MoViNetHead extends BaseHead and registers itself with the HEADS registry. It initializes without any specific parameters and its forward function simply returns input 'x' without any modifications.", + "details": [ + { + "comment": "Class MoViNetHead extends BaseHead and registers itself with the HEADS registry. It initializes without any specific parameters and its forward function simply returns input 'x' without any modifications.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/movinet_head.py\":0-14", + "content": "import collections.abc\ncontainer_abcs = collections.abc\nfrom ..registry import HEADS\nfrom .base import BaseHead\nfrom ..builder import build_loss\n@HEADS.register()\nclass MoViNetHead(BaseHead):\n def __init__(self):\n super().__init__()\n def forward(self, x, *args):\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e371deba-14da-422e-a073-d06fb490edd5.json b/docs/doc/e371deba-14da-422e-a073-d06fb490edd5.json new file mode 100644 index 000000000..a5729ac31 --- /dev/null +++ b/docs/doc/e371deba-14da-422e-a073-d06fb490edd5.json @@ -0,0 +1,20 @@ +{ + "summary": "The code trains a multimodal video classification model using PaddlePaddle 2.0, incorporating text, video image, and audio data for tagging in multimodal scenarios. It focuses on training, evaluation, optimization, and use, with performance improvements through post-processing networks, faster training speeds, and stability enhancements. Three related papers are referenced: Attention Clusters for video classification, YouTube-8M as a large-scale benchmark, and Ernie's knowledge integration for enhanced representation.", + "details": [ + { + "comment": "This code is for training a multimodal video classification model using PaddlePaddle 2.0, which combines text, video image, and audio data for tagging in multimodal scenarios. 
The provided feature files and label information are used for training and prediction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/README.md\":0-36", + "content": "# MutimodalVideoTag \u591a\u6a21\u6001\u89c6\u9891\u5206\u7c7b\u6a21\u578b\n---\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u8bc4\u4f30](#\u6a21\u578b\u8bc4\u4f30)\n- [\u6a21\u578b\u63a8\u7406](#\u6a21\u578b\u63a8\u7406)\n- [\u6a21\u578b\u4f18\u5316](#\u6a21\u578b\u4f18\u5316)\n- [\u6a21\u578b\u90e8\u7f72](#\u6a21\u578b\u90e8\u7f72)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n## \u6a21\u578b\u7b80\u4ecb\n\u8be5\u4ee3\u7801\u5e93\u7528\u4e8e\u591a\u6a21\u6001\u573a\u666f\u4e0b\u89c6\u9891\u5206\u7c7b\u4efb\u52a1\uff0c\u57fa\u4e8epaddle2.0\u7248\u672c\u5f00\u53d1\uff0c\u6a21\u578b\u57fa\u4e8e\u771f\u5b9e\u77ed\u89c6\u9891\u4e1a\u52a1\u6570\u636e\uff0c\u878d\u5408\u6587\u672c\u3001\u89c6\u9891\u56fe\u50cf\u3001\u97f3\u9891\u4e09\u79cd\u6a21\u6001\u8fdb\u884c\u89c6\u9891\u591a\u6a21\u6807\u7b7e\u5206\u7c7b\uff0c\u76f8\u6bd4\u7eaf\u89c6\u9891\u56fe\u50cf\u7279\u5f81\uff0c\u663e\u8457\u63d0\u5347\u9ad8\u5c42\u8bed\u4e49\u6807\u7b7e\u6548\u679c\u3002\u5176\u539f\u7406\u793a\u610f\u5982\u4e0b\u56fe\u6240\u793a\u3002\n

\nMutimodalVideoTag 多模态视频分类模型示意图\n
\n- \u6570\u636e\u5904\u7406\uff1a\u5206\u522b\u5bf9\u89c6\u9891\u4e09\u4e2a\u6a21\u6001\u7684\u6570\u636e\u8fdb\u884c\u5904\u7406\uff0c\u5bf9\u89c6\u9891\u8fdb\u884c\u62bd\u5e27\uff0c\u83b7\u5f97\u56fe\u50cf\u5e8f\u5217\uff1b\u62bd\u53d6\u89c6\u9891\u7684\u97f3\u9891pcm \u6587\u4ef6\uff1b\u6536\u96c6\u89c6\u9891\u6807\u9898\uff0c\u7b80\u5355\u8fdb\u884c\u6587\u672c\u957f\u5ea6\u622a\u65ad\uff0c\u4e00\u822c\u53d650\u4e2a\u5b57\u3002\n- \u7279\u5f81\u62bd\u53d6\uff1a\u4f7f\u7528\u9884\u8bad\u7ec3\u7684 ResNet \u5bf9\u56fe\u50cf\u62bd\u53d6\u9ad8\u5c42\u8bed\u4e49\u7279\u5f81\uff1b\u4f7f\u7528\u9884\u8bad\u7ec3\u7684VGGish\u7f51\u7edc\u62bd\u53d6\u97f3\u9891\u7279\u5f81\uff1b\u6587\u672c\u65b9\u9762\u4f7f\u7528[ERNIE 1.0](https://github.com/PaddlePaddle/ERNIE)\u62bd\u53d6\u6587\u672c\u7279\u5f81\uff0c\u65e0\u9700\u9884\u5148\u62bd\u53d6\uff0c\u652f\u6301\u89c6\u9891\u5206\u7c7b\u6a21\u578bfinetune\n- \u5e8f\u5217\u5b66\u4e60\uff1a\u5206\u522b\u4f7f\u7528\u72ec\u7acb\u7684LSTM \u5bf9\u56fe\u50cf\u7279\u5f81\u548c\u97f3\u9891\u7279\u5f81\u8fdb\u884c\u5e8f\u5217\u5b66\u4e60\uff0c\u6587\u672c\u65b9\u9762\u9884\u8bad\u7ec3\u6a21\u578b\u5bf9\u5b57\u7b26\u5e8f\u5217\u8fdb\u884c\u5efa\u6a21\uff0c\u5728ernie \u540e\u63a5\u5165\u4e00\u4e2atextcnn \u7f51\u7edc\u505a\u4e0b\u6e38\u4efb\u52a1\u7684\u8fc1\u79fb\u5b66\u4e60\u3002\n- \u591a\u6a21\u878d\u5408\uff1a\u6587\u672c\u5177\u6709\u663e\u5f0f\u7684\u9ad8\u5c42\u8bed\u4e49\u4fe1\u606f\uff0c\u5c06\u6587\u672c\u7279\u5f81\u5f15\u5165\u5230LSTM pooling \u8fc7\u7a0b\u6307\u5bfc\u56fe\u50cf\u548c\u97f3\u9891\u65f6\u5e8f\u6743\u91cd\u5206\u914d\uff0c\u8fdb\u884c\u4ea4\u53c9\u878d\u5408\uff0c\u6700\u540e\u5c06\u6587\u672c\u3001\u97f3\u9891\u3001\u89c6\u9891\u7279\u5f81\u62fc\u63a5\u3002\n- \u9884\u6d4b\u7ed3\u679c\uff1a\u5206\u7c7b\u5668\u9009\u7528sigmoid \u591a\u6807\u7b7e\u5206\u7c7b\u5668\uff0c\u652f\u6301\u89c6\u9891\u591a\u6807\u7b7e\u8f93\u51fa\u3002\n## \u6570\u636e\u51c6\u5907\n\u6570\u636e\u65b9\u9762\u63d0\u4f9b\u5df2\u7ecf\u62bd\u53d6\u597d\u56fe\u50cf\u3001\u97f3\u9891\u7279\u5f81\u7684\u7279\u5f81\u6587\u4ef6\uff0c\u4ee5\u53ca\u6807\u9898\u548c\u6807\u7b7e\u4fe1\u606f\uff0c\u6a21\u578b\u65b9\u9762\u63d0\u4f9b\u8bad\u7ec3\u597dcheckpoint \u6587\u4ef6\uff0c\u53ef\u8fdb\u884cfinetune\u3001\u6a21\u578b\u8bc4\u4f30\u3001\u9884\u6d4b\u3002\n```\nsh download.sh\n```\n\u6570\u636e\u6587\u4ef6\u5305\u62ec\u62bd\u53d6\u597d\u7279\u5f81\u7684\u6587\u4ef6\u5939 `feature_files`\uff0c\u4ee5\u53ca\u8bb0\u5f55\u5212\u5206\u7684txt \u6587\u4ef6\uff0c\u683c\u5f0f\u5982\u4e0b\n```\n\u6587\u4ef6\u540d \\t \u6807\u9898 \\t \u6807\u7b7e\n18e9bf08a2fc7eaa4ee9215ab42ea827.mp4 \u53ee\u53ee\u6765\u81ea\u8096\u5b87\u6881\u8096\u5b87\u6881rainco\u7684\u7279\u522b\u8d77\u5e8a\u94c3\u58f0 \u62cd\u4eba-\u5e05\u54e5,\u62cd\u4eba-\u79c0\u7279\u6548,\u660e\u661f\u5468\u8fb9-\u5176\u4ed6\u660e\u661f\u5468\u8fb9" + }, + { + "comment": "This code is related to the PaddleVideo/applications/MultimodalVideoTag project, which focuses on training, evaluating, optimizing, and using a multimodal model for video tagging. The code snippet provides an overview of the steps involved in this process.\n\nTraining involves adjustable parameters like 'ernie_freeze' (for controlling whether text feature extraction from Ernie network should be fine-tuned) and 'lstm_pool_mode' (for controlling LSTM sequence pooling method). 
The training script is executed with 'sh train.sh'.\n\nEvaluation of the model on a test set is performed using 'sh eval_and_save_model.sh', which also supports converting checkpoint models into inference models with a 'save\\_only' option.\n\nInference, executed by 'sh inference.sh', uses the previously obtained inference model to make predictions, storing results in JSON format. The threshold for multi-label output can be adjusted using the 'conf/conf.txt' file's 'threshold' parameter.\n\nFinally, the code snippet mentions that experimentation has shown better performance for certain models and techniques in the text branch of the model, specifically mentioning gains from utilizing a post-processing network, faster training speeds, and improved stability. The table also shows how different combinations of these changes affect Hit@1 and Hit@2 metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/README.md\":37-64", + "content": "```\n## \u6a21\u578b\u8bad\u7ec3\n\u6a21\u578b\u8bad\u7ec3\u8fc7\u7a0b\u6709\u5982\u4e0b\u53ef\u8c03\u6a21\u5f0f\uff0c\u53ef\u5728\u6839\u636e\u6570\u636e\u96c6\u60c5\u51b5\u8fdb\u884c\u8c03\u6574\uff0c\u5728`conf/conf.txt` \u6587\u4ef6\u4e2d\n- ernie_freeze: \u7528\u4e8e\u63a7\u5236\u6587\u672c\u63d0\u7279\u5f81\u7684ernie \u7f51\u7edc\u662f\u5426\u8fdb\u884cfinetune\uff0c\u56e0\u4e3aernie \u590d\u6742\u5ea6\u8fdc\u5927\u4e8e\u56fe\u50cf\u3001\u89c6\u9891\u5e8f\u5217\u5b66\u4e60\u7f51\u7edc\uff0c\u56e0\u6b64\u5728\u67d0\u4e9b\u6570\u636e\u96c6\u4e0a\u4e0d\u597d\u8bad\u7ec3\u3002\n- lstm_pool_mode: \u7528\u4e8e\u63a7\u5236lstm \u5e8f\u5217\u6c60\u5316\u7684\u65b9\u5f0f\uff0c\u9ed8\u8ba4\u662f\"text_guide\"\u8868\u793a\u5229\u7528\u6587\u672c\u52a0\u5f3a\u6c60\u5316\u6ce8\u610f\u529b\u6743\u91cd\uff0c\u5982\u679c\u8bbe\u7f6e\u4e3a\u7a7a\uff0c\u5219\u9ed8\u8ba4\u4e3a\u81ea\u6ce8\u610f\u529b\u7684\u6743\u91cd\u3002\n```\nsh train.sh \n```\n## \u6a21\u578b\u8bc4\u4f30\n\u6a21\u578b\u5bf9\u6d4b\u8bd5\u96c6\u8fdb\u884c\u8bc4\u4f30\uff0c\u540c\u65f6\u652f\u6301\u5c06checkpoint \u6a21\u578b\u8f6c\u4e3ainference \u6a21\u578b\uff0c \u53ef\u7528\u53c2\u6570'save_only' \u9009\u9879\u63a7\u5236\uff0c\u8bbe\u7f6e\u5373\u53ea\u7528\u4e8e\u505a\u6a21\u578b\u8f6c\u6362\uff0c\u5f97\u5230inference \u6a21\u578b\n```\nsh eval_and_save_model.sh\n```\n## \u6a21\u578b\u63a8\u7406\n\u901a\u8fc7\u4e0a\u4e00\u6b65\u5f97\u5230\u7684inference \u6a21\u578b\u8fdb\u884c\u9884\u6d4b\uff0c\u7ed3\u679c\u9ed8\u8ba4\u9608\u503c\u4e3a0.5\uff0c\u5b58\u50a8\u5230json \u6587\u4ef6\u4e2d\uff0c\u5728`conf/conf.txt` \u6587\u4ef6 `threshold` \u53c2\u6570\u8fdb\u884c\u63a7\u5236\u591a\u6807\u7b7e\u8f93\u51fa\u7684\u9608\u503c\u3002\n```\nsh inference.sh\n```\n## \u6a21\u578b\u4f18\u5316\n\u6a21\u578b\u65b9\u9762\uff0c\u4e3b\u8981\u5728\u6587\u672c\u5206\u652f\u8fdb\u884c\u4e86\u5b9e\u9a8c\uff0c\u5b9e\u9a8c\u7ed3\u679c\u663e\u793aERNIE \u5728\u591a\u5206\u652f\u4e0b\u4e0d\u5fae\u8c03\uff0c\u800c\u662f\u4f7f\u7528\u540e\u7f6e\u7f51\u7edc\u8fdb\u884c\u5fae\u8c03\uff0c\u8bad\u7ec3\u901f\u5ea6\u5feb\uff0c\u4e14\u7a33\u5b9a\uff0c\u540c\u65f6attention \u65b9\u9762\u4f7f\u7528\u6587\u672c\u4fe1\u606f\u589e\u5f3a\u56fe\u50cf\u3001\u97f3\u9891\u7684attention \u5b66\u4e60\u80fd\u4e00\u5b9a\u7a0b\u5ea6\u63d0\u5347\u6a21\u578b\u6548\u679c\u3002\n| \u6a21\u578b | Hit@1 | Hit@2 |\n| ------------------------------------------------------------ | ----- | ----- |\n| \u6587\u672c\u5206\u652fERNIE \u4e0dfinetune +self-attention | 71.07 | 83.72 |\n| \u6587\u672c\u5206\u652fERNIE 
\u4e0dfinetune +textcnn finetune + self-attention | 72.66 | 85.01 |\n| \u6587\u672c\u5206\u652fERNIE \u4e0dfinetune +extcnn finetune + text-guide-attention | 73.29 | 85.59 |" + }, + { + "comment": "The code is providing information about model deployment and referencing three related papers. The first paper introduces Attention Clusters for video classification, the second one presents YouTube-8M as a large-scale classification benchmark, and the third paper discusses Ernie's knowledge integration for enhanced representation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/README.md\":66-76", + "content": "## \u6a21\u578b\u90e8\u7f72\n
\n## \u53c2\u8003\u8bba\u6587\n- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen\n- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan\n- [Ernie: Enhanced representation through knowledge integration](https://arxiv.org/abs/1904.09223), Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Chen, Xuyi and Zhang, Han and Tian, Xin and Zhu, Danxiang and Tian, Hao and Wu, Hua" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e37bfe44-05e0-44a9-b61d-6353ad324cd8.json b/docs/doc/e37bfe44-05e0-44a9-b61d-6353ad324cd8.json new file mode 100644 index 000000000..94ff4972d --- /dev/null +++ b/docs/doc/e37bfe44-05e0-44a9-b61d-6353ad324cd8.json @@ -0,0 +1,15 @@ +{ + "summary": "The script updates a configuration file for NPU use, disables MKLDNN on non-x86_64, sets Python to 3.9 for NPU support, and changes the execution script from \"gpu\" to \"npu\". It executes a bash script using eval with the command stored in variable 'cmd'.", + "details": [ + { + "comment": "This script modifies a configuration file to use NPU instead of GPU, disables MKLDNN on non-x86_64 environments, and updates the Python version to 3.9 for NPU backend support. It also changes the execution script from using \"gpu\" to \"npu\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python_npu.sh\":0-38", + "content": "#!/bin/bash\nsource test_tipc/common_func.sh\nfunction readlinkf() {\n perl -MCwd -e \"print Cwd::abs_path shift\" \"$1\";\n}\nfunction func_parser_config() {\n strs=$1\n IFS=\" \"\n array=(${strs})\n tmp=${array[2]}\n echo ${tmp}\n}\nBASEDIR=$(dirname \"$0\")\nREPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../)\nFILENAME=$1\n# disable mkldnn on non x86_64 env\narch=$(uname -i)\nif [ $arch != \"x86_64\" ]; then\n sed -i \"s/--enable_mkldnn:True|False/--enable_mkldnn:False/g\" $FILENAME\n sed -i \"s/--enable_mkldnn:True/--enable_mkldnn:False/g\" $FILENAME\nfi\n# change gpu to npu in tipc txt configs\nsed -i \"s/use_gpu/use_npu/g\" $FILENAME\n# disable benchmark as AutoLog required nvidia-smi command\nsed -i \"s/--enable_benchmark:True/--enable_benchmark:False/g\" $FILENAME\n# python has been updated to version 3.9 for npu backend\nsed -i \"s/python3.7/python3.9/g\" $FILENAME\ndataline=`cat $FILENAME`\n# change gpu to npu in execution script\nsed -i \"s/\\\"gpu\\\"/\\\"npu\\\"/g\" test_tipc/test_train_inference_python.sh\n# pass parameters to test_train_inference_python.sh" + }, + { + "comment": "The code is executing a bash script, storing the command in variable 'cmd', printing its execution status, and then running it using eval.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/test_tipc/test_train_inference_python_npu.sh\":39-41", + "content": "cmd=\"bash test_tipc/test_train_inference_python.sh ${FILENAME} $2\"\necho -e \"\\033[1;32m Started to run command: ${cmd}! 
\\033[0m\"\neval $cmd" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e3c1398d-5459-42c5-89bd-7091b4244fc2.json b/docs/doc/e3c1398d-5459-42c5-89bd-7091b4244fc2.json new file mode 100644 index 000000000..606bc787a --- /dev/null +++ b/docs/doc/e3c1398d-5459-42c5-89bd-7091b4244fc2.json @@ -0,0 +1,60 @@ +{ + "summary": "This code utilizes PaddleVideo library to train a video quality assessment model with GPU support, parallel processing, and distributed training. It includes data loaders, solvers, optimization, logging, and validation for efficient model training.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library for video quality assessment. It imports necessary modules, and uses builders to construct data loaders, datasets, models, solvers, and metrics. It also includes utilities for logging and batch processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":0-27", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport time\nimport os.path as osp\nimport paddle\nimport paddle.distributed.fleet as fleet\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..modeling.builder import build_model\nfrom ..solver import build_lr, build_optimizer\nfrom ..metrics import build_metric\nfrom ..utils import do_preciseBN\nfrom paddlevideo.utils import get_logger, coloring\nfrom paddlevideo.utils import (AverageMeter, build_rec_record, log_batch," + }, + { + "comment": "This function trains a model with specified configuration. It uses GPU for computation, and allows for parallel processing if multiple GPUs are available. Optionally, it performs validation during training and can also be used for fleet-based distributed training. The trained model's output directory is defined in the configuration file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":28-60", + "content": " log_epoch, save, load, mkdir)\n#from paddlevideo.metrics import QualityMetric\nimport numpy as np\nfrom scipy import stats\ndef train_model(cfg,\n weights=None,\n parallel=True,\n validate=True,\n amp=False,\n fleet=False):\n \"\"\"Train model entry\n Args:\n \tcfg (dict): configuration.\n weights (str): weights path for finetuning.\n \tparallel (bool): Whether multi-cards training. Default: True.\n validate (bool): Whether to do evaluation. 
Default: False.\n \"\"\"\n if fleet:\n fleet.init(is_collective=True)\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DATASET.get('batch_size', 8)\n valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size)\n places = paddle.set_device('gpu')\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", \"./output/model_name/\")" + }, + { + "comment": "Code snippet creates a directory, builds the model based on configuration, and sets up data loaders for training and validation datasets. It also handles parallelization and distributed model usage if specified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":61-88", + "content": " mkdir(output_dir)\n # 1. Construct model\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n if fleet:\n model = paddle.distributed_model(model)\n # 2. Construct dataset and dataloader\n train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))\n train_dataloader_setting = dict(batch_size=batch_size,\n num_workers=num_workers,\n collate_fn_cfg=cfg.get('MIX', None),\n places=places)\n train_loader = build_dataloader(train_dataset, **train_dataloader_setting)\n if validate:\n valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))\n validate_dataloader_setting = dict(\n batch_size=valid_batch_size,\n num_workers=num_workers,\n places=places,\n drop_last=False,\n shuffle=cfg.DATASET.get(\n 'shuffle_valid',\n False) #NOTE: attention lstm need shuffle valid data." + }, + { + "comment": "Building a valid data loader, constructing a solver with specified optimizer and learning rate, resuming training from a previous epoch or finetuning the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":89-112", + "content": " )\n valid_loader = build_dataloader(valid_dataset,\n **validate_dataloader_setting)\n # 3. Construct solver.\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(cfg.OPTIMIZER,\n lr,\n parameter_list=model.parameters())\n if fleet:\n optimizer = fleet.distributed_optimizer(optimizer)\n # Resume\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(output_dir,\n model_name + \"_epoch_{}\".format(resume_epoch))\n resume_model_dict = load(filename + '.pdparams')\n resume_opt_dict = load(filename + '.pdopt')\n model.set_state_dict(resume_model_dict)\n optimizer.set_state_dict(resume_opt_dict)\n # Finetune:\n if weights:\n assert resume_epoch == 0, \"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it.\"" + }, + { + "comment": "The code loads a model and sets its state dict. Then, it proceeds to train the model if not in resume phase. It builds record_list for metrics calculation and iterates through data from train loader to forward pass and calculate metrics. If AMP is enabled, auto-casting is used during training steps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":113-146", + "content": " model_dict = load(weights)\n model.set_state_dict(model_dict)\n # 4. 
Train Model\n ###AMP###\n if amp:\n scaler = paddle.amp.GradScaler(init_loss_scaling=1024)\n best = 0.\n max_SROCC = 0\n max_PLCC = 0\n Metric = build_metric(cfg.METRIC)\n for epoch in range(0, cfg.epochs):\n if epoch < resume_epoch:\n logger.info(\n \"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... \"\n )\n continue\n model.train()\n record_list = build_rec_record(cfg.MODEL)\n tic = time.time()\n train_output = []\n train_label = []\n for i, data in enumerate(train_loader):\n record_list['reader_time'].update(time.time() - tic)\n # 4.1 forward\n ###AMP###\n if amp:\n with paddle.amp.auto_cast(\n custom_black_list={\"temporal_shift\", \"reduce_mean\"}):\n if parallel:\n outputs = model._layers.train_step(data)" + }, + { + "comment": "This code handles the model training step for video quality assessment. It uses the model's `train_step` function to calculate outputs and labels, then extends them to the train_output and train_label lists respectively. The average loss is calculated and scaled before its backward pass. Finally, it performs optimization by minimizing the scaler and clearing gradients.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":147-170", + "content": " ## required for DataParallel, will remove in next version\n model._reducer.prepare_for_backward(\n list(model._find_varbase(outputs)))\n else:\n outputs = model.train_step(data)\n train_output.extend(outputs['output'])\n train_label.extend(outputs['label'])\n avg_loss = outputs['loss']\n scaled = scaler.scale(avg_loss)\n scaled.backward()\n # keep prior to 2.0 design\n scaler.minimize(optimizer, scaled)\n optimizer.clear_grad()\n else:\n if parallel:\n outputs = model._layers.train_step(data)\n ## required for DataParallel, will remove in next version\n model._reducer.prepare_for_backward(\n list(model._find_varbase(outputs)))\n else:\n outputs = model.train_step(data)" + }, + { + "comment": "Code snippet performs backward propagation, optimizes model parameters, logs training progress and learning rate, updates metrics, and logs information at specified intervals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":172-197", + "content": " train_output.extend(outputs['output'])\n train_label.extend(outputs['label'])\n # 4.2 backward\n avg_loss = outputs['loss']\n avg_loss.backward()\n # 4.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(optimizer._global_learning_rate(),\n batch_size)\n for name, value in outputs.items():\n if name == 'output' or name == 'label':\n continue\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"train\", ips)\n # learning rate iter step" + }, + { + "comment": "This code is part of a training process for a video quality assessment model. It checks if the learning rate should be updated by an iterative step, then updates it accordingly. The code calculates the train_SROCC and train_PLCC metrics to track progress, logs this information, and evaluates the model's performance on a separate validation dataset. 
A record of the training process is maintained to monitor batch time and other relevant statistics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":198-228", + "content": " if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step\n if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n train_PLCC, train_SROCC = Metric.accumulate_train(\n train_output, train_label)\n logger.info(\"train_SROCC={}\".format(train_SROCC))\n logger.info(\"train_PLCC={}\".format(train_PLCC))\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n eval_output = []\n eval_label = []\n def evaluate(best, max_SROCC, max_PLCC):\n \"\"\"evaluate\"\"\"\n model.eval()\n record_list = build_rec_record(cfg.MODEL)\n record_list.pop('lr')\n tic = time.time()\n for i, data in enumerate(valid_loader):\n if parallel:\n outputs = model._layers.val_step(data)" + }, + { + "comment": "This code is part of a model's validation step during training. It collects outputs and labels from the model, updates logging records, and logs validation metrics such as SROCC and PLCC. If these metrics are greater than the previous maximum values, it updates the max values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":229-253", + "content": " else:\n outputs = model.val_step(data)\n eval_output.extend(outputs['output'])\n eval_label.extend(outputs['label'])\n # log_record\n for name, value in outputs.items():\n if name == 'output' or name == 'label':\n continue\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"val\", ips)\n eval_PLCC, eval_SROCC = Metric.accumulate_train(\n eval_output, eval_label)\n logger.info(\"val_SROCC={}\".format(eval_SROCC))\n logger.info(\"val_PLCC={}\".format(eval_PLCC))\n if max_SROCC <= eval_SROCC and max_PLCC <= eval_PLCC:" + }, + { + "comment": "This code snippet is responsible for storing the best optimizer and model states, logging instance per second (ips) during validation phase, and optionally performing precise batch normalization if configuration allows. 
It returns the best parameters, maximum SROCC, and maximum PLCC values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":254-275", + "content": " max_SROCC = eval_SROCC\n max_PLCC = eval_PLCC\n logger.info(\"max_SROCC={}\".format(max_SROCC))\n logger.info(\"max_PLCC={}\".format(max_PLCC))\n save(optimizer.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdopt\"))\n save(model.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdparams\"))\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"val\", ips)\n return best, max_SROCC, max_PLCC\n # use precise bn to improve acc\n if cfg.get(\"PRECISEBN\") and (epoch % cfg.PRECISEBN.preciseBN_interval\n == 0 or epoch == cfg.epochs - 1):\n do_preciseBN(\n model, train_loader, parallel,\n min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader)))" + }, + { + "comment": "This code block performs validation and model saving in a training process. It validates the model every 'val_interval' epochs or on the last epoch, and saves optimizer and model states every 'save_interval' epochs or on the last epoch. The logger then informs that training is finished.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py\":277-294", + "content": " # 5. Validation\n if validate and (epoch % cfg.get(\"val_interval\", 1) == 0\n or epoch == cfg.epochs - 1):\n with paddle.no_grad():\n best, max_SROCC, max_PLCC = evaluate(best, max_SROCC, max_PLCC)\n # 6. Save model\n if epoch % cfg.get(\"save_interval\", 1) == 0 or epoch == cfg.epochs - 1:\n save(\n optimizer.state_dict(),\n osp.join(output_dir,\n model_name + \"_epoch_{}.pdopt\".format(epoch)))\n save(\n model.state_dict(),\n osp.join(output_dir,\n model_name + \"_epoch_{}.pdparams\".format(epoch)))\n logger.info('training {model_name} finished')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e48129d1-3eac-49d2-9ff5-4c31f24a68b1.json b/docs/doc/e48129d1-3eac-49d2-9ff5-4c31f24a68b1.json new file mode 100644 index 000000000..749355998 --- /dev/null +++ b/docs/doc/e48129d1-3eac-49d2-9ff5-4c31f24a68b1.json @@ -0,0 +1,35 @@ +{ + "summary": "The code creates a PyQt5 video player UI with QGraphicsView, QFrame button, and main window layout. It includes interactive buttons, sliders, QProgressBar, QLabel, and tabs in the tab widget for status and configuration.", + "details": [ + { + "comment": "The code is a form implementation generated by PyQt5 UI code generator. It defines the `Ui_MainWindow` class which has a method `setupUi` that sets up the properties and widgets of the `MainWindow`. The main window has a central widget, containing a video frame, with dimensions 761x361 pixels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/ui/demo.py\":0-24", + "content": "# -*- coding: utf-8 -*-\n# Form implementation generated from reading ui file '/Users/zhanghongji/PycharmProjects/EIVideo/resources/QT/demo.ui'\n#\n# Created by: PyQt5 UI code generator 5.15.6\n#\n# WARNING: Any manual changes made to this file will be lost when pyuic5 is\n# run again. 
Do not edit this file unless you know what you are doing.\nfrom PyQt5 import QtCore, QtGui, QtWidgets\nclass Ui_MainWindow(object):\n def setupUi(self, MainWindow):\n MainWindow.setObjectName(\"MainWindow\")\n MainWindow.resize(800, 486)\n MainWindow.setMinimumSize(QtCore.QSize(800, 486))\n MainWindow.setMaximumSize(QtCore.QSize(800, 486))\n self.centralwidget = QtWidgets.QWidget(MainWindow)\n self.centralwidget.setObjectName(\"centralwidget\")\n self.video_frame = QtWidgets.QFrame(self.centralwidget)\n self.video_frame.setGeometry(QtCore.QRect(20, 20, 761, 361))\n self.video_frame.setFrameShape(QtWidgets.QFrame.StyledPanel)\n self.video_frame.setFrameShadow(QtWidgets.QFrame.Raised)" + }, + { + "comment": "This code sets up the user interface elements for a video player. It creates a QGraphicsView for displaying video frames, and a QFrame with a horizontal layout widget containing a PushButton for opening videos. The QGraphicsView is set to take up most of the video frame, while the QFrame and its widgets sit at the bottom.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/ui/demo.py\":25-40", + "content": " self.video_frame.setObjectName(\"video_frame\")\n self.graphicsView = QtWidgets.QGraphicsView(self.video_frame)\n self.graphicsView.setGeometry(QtCore.QRect(0, 0, 761, 321))\n self.graphicsView.setObjectName(\"graphicsView\")\n self.frame_2 = QtWidgets.QFrame(self.video_frame)\n self.frame_2.setGeometry(QtCore.QRect(0, 320, 761, 41))\n self.frame_2.setFrameShape(QtWidgets.QFrame.StyledPanel)\n self.frame_2.setFrameShadow(QtWidgets.QFrame.Raised)\n self.frame_2.setObjectName(\"frame_2\")\n self.horizontalLayoutWidget = QtWidgets.QWidget(self.frame_2)\n self.horizontalLayoutWidget.setGeometry(QtCore.QRect(-1, -1, 761, 41))\n self.horizontalLayoutWidget.setObjectName(\"horizontalLayoutWidget\")\n self.horizontalLayout = QtWidgets.QHBoxLayout(self.horizontalLayoutWidget)\n self.horizontalLayout.setContentsMargins(0, 0, 0, 0)\n self.horizontalLayout.setObjectName(\"horizontalLayout\")\n self.open_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)" + }, + { + "comment": "Creates UI buttons and a slider for video player interaction, sets their object names.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/ui/demo.py\":41-57", + "content": " self.open_btn.setObjectName(\"open_btn\")\n self.horizontalLayout.addWidget(self.open_btn)\n self.save_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.save_btn.setObjectName(\"save_btn\")\n self.horizontalLayout.addWidget(self.save_btn)\n self.horizontalSlider = QtWidgets.QSlider(self.horizontalLayoutWidget)\n self.horizontalSlider.setOrientation(QtCore.Qt.Horizontal)\n self.horizontalSlider.setObjectName(\"horizontalSlider\")\n self.horizontalLayout.addWidget(self.horizontalSlider)\n self.select_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.select_btn.setObjectName(\"select_btn\")\n self.horizontalLayout.addWidget(self.select_btn)\n self.clean_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.clean_btn.setObjectName(\"clean_btn\")\n self.horizontalLayout.addWidget(self.clean_btn)\n self.start_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget)\n self.start_btn.setObjectName(\"start_btn\")" + }, + { + "comment": "The code is creating a GUI layout for a video player application. 
It adds widgets to a main window, sets the geometry and styling of some elements, and creates a tabbed interface with two labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/ui/demo.py\":58-76", + "content": " self.horizontalLayout.addWidget(self.start_btn)\n self.draw_frame = QtWidgets.QFrame(self.video_frame)\n self.draw_frame.setGeometry(QtCore.QRect(0, 10, 751, 301))\n self.draw_frame.setFrameShape(QtWidgets.QFrame.StyledPanel)\n self.draw_frame.setFrameShadow(QtWidgets.QFrame.Raised)\n self.draw_frame.setObjectName(\"draw_frame\")\n self.menu_tab = QtWidgets.QTabWidget(self.centralwidget)\n self.menu_tab.setGeometry(QtCore.QRect(20, 380, 761, 81))\n self.menu_tab.setObjectName(\"menu_tab\")\n self.tab = QtWidgets.QWidget()\n self.tab.setObjectName(\"tab\")\n self.act_label = QtWidgets.QLabel(self.tab)\n self.act_label.setEnabled(True)\n self.act_label.setGeometry(QtCore.QRect(10, 30, 71, 21))\n self.act_label.setObjectName(\"act_label\")\n self.act_info_label = QtWidgets.QLabel(self.tab)\n self.act_info_label.setEnabled(True)\n self.act_info_label.setGeometry(QtCore.QRect(80, 30, 81, 21))\n self.act_info_label.setObjectName(\"act_info_label\")" + }, + { + "comment": "Code snippet creates a QProgressBar and QLabel, sets their properties and positions, adds them to the tab widget, and sets tabs for the main window.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/ui/demo.py\":77-96", + "content": " self.act_progressbar = QtWidgets.QProgressBar(self.tab)\n self.act_progressbar.setGeometry(QtCore.QRect(170, 32, 521, 21))\n self.act_progressbar.setProperty(\"value\", 24)\n self.act_progressbar.setObjectName(\"act_progressbar\")\n self.label_3 = QtWidgets.QLabel(self.tab)\n self.label_3.setEnabled(True)\n self.label_3.setGeometry(QtCore.QRect(680, 30, 60, 21))\n self.label_3.setLayoutDirection(QtCore.Qt.LeftToRight)\n self.label_3.setAlignment(QtCore.Qt.AlignRight|QtCore.Qt.AlignTrailing|QtCore.Qt.AlignVCenter)\n self.label_3.setObjectName(\"label_3\")\n self.menu_tab.addTab(self.tab, \"\")\n self.tab_2 = QtWidgets.QWidget()\n self.tab_2.setObjectName(\"tab_2\")\n self.menu_tab.addTab(self.tab_2, \"\")\n MainWindow.setCentralWidget(self.centralwidget)\n self.statusbar = QtWidgets.QStatusBar(MainWindow)\n self.statusbar.setObjectName(\"statusbar\")\n MainWindow.setStatusBar(self.statusbar)\n self.retranslateUi(MainWindow)" + }, + { + "comment": "This code is a part of the user interface (UI) definition for a MainWindow in the QEIVideo application. It sets the window title and button texts, translates strings using QtCore.QCoreApplication.translate, and updates tab labels using self.menu_tab.setTabText. The UI consists of several tabs: one for displaying the current status, another for configuring attributes. 
The code also connects slots to signals in this MainWindow class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/QEIVideo/ui/demo.py\":97-112", + "content": " self.menu_tab.setCurrentIndex(0)\n QtCore.QMetaObject.connectSlotsByName(MainWindow)\n def retranslateUi(self, MainWindow):\n _translate = QtCore.QCoreApplication.translate\n MainWindow.setWindowTitle(_translate(\"MainWindow\", \"MainWindow\"))\n self.open_btn.setText(_translate(\"MainWindow\", \"\u6253\u5f00\u89c6\u9891\"))\n self.save_btn.setText(_translate(\"MainWindow\", \"\u4fdd\u5b58\u6807\u6ce8\"))\n self.select_btn.setText(_translate(\"MainWindow\", \"\u9009\u62e9\u76ee\u6807\"))\n self.clean_btn.setText(_translate(\"MainWindow\", \"\u6e05\u7a7a\u76ee\u6807\"))\n self.start_btn.setText(_translate(\"MainWindow\", \"\u5f00\u59cb\u63a8\u7406\"))\n self.act_label.setText(_translate(\"MainWindow\", \"\u5f53\u524d\u72b6\u6001\uff1a\"))\n self.act_info_label.setText(_translate(\"MainWindow\", \"-------------\"))\n self.label_3.setText(_translate(\"MainWindow\", \"12%\"))\n self.menu_tab.setTabText(self.menu_tab.indexOf(self.tab), _translate(\"MainWindow\", \"\u72b6\u6001\"))\n self.menu_tab.setTabText(self.menu_tab.indexOf(self.tab_2), _translate(\"MainWindow\", \"\u5c5e\u6027\u914d\u7f6e\"))" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e4afe476-f448-4c8f-b18d-130062cc65aa.json b/docs/doc/e4afe476-f448-4c8f-b18d-130062cc65aa.json new file mode 100644 index 000000000..615675e58 --- /dev/null +++ b/docs/doc/e4afe476-f448-4c8f-b18d-130062cc65aa.json @@ -0,0 +1,15 @@ +{ + "summary": "The code imports various dataset classes from PaddleVideo library for video understanding tasks, and adds them to the `__all__` list for accessibility. These datasets include VideoDataset, FrameDataset, and more, with licensing information provided.", + "details": [ + { + "comment": "This code is importing various dataset classes from different modules in the PaddleVideo library. These datasets are used for video understanding tasks, such as action recognition, activity classification, and video captioning. The code also includes licensing information and mentions that these datasets can be accessed on an \"AS IS\" basis.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/__init__.py\":0-24", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .actbert_dataset import ActBertDataset\nfrom .ava_dataset import AVADataset\nfrom .bmn_dataset import BMNDataset\nfrom .davis_dataset import DavisDataset\nfrom .feature import FeatureDataset\nfrom .frame import FrameDataset, FrameDataset_Sport\nfrom .MRI import MRIDataset\nfrom .MRI_SlowFast import SFMRIDataset\nfrom .msrvtt import MSRVTTDataset\nfrom .actbert_dataset import ActBertDataset\nfrom .asrf_dataset import ASRFDataset" + }, + { + "comment": "This code imports several dataset classes and adds them to the `__all__` list, making them accessible within this module. The datasets include VideoDataset, FrameDataset, SFVideoDataset, BMNDataset, FeatureDataset, SkeletonDataset, AVADataset, MonoDataset, MSRVTTDataset, ActBertDataset, DavisDataset, MRIDataset, SFMRIDataset, FrameDataset_Sport, MSTCNDataset, ASRFDataset, UCF101SkeletonDataset, and UCF24Dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/dataset/__init__.py\":25-40", + "content": "from .ms_tcn_dataset import MSTCNDataset\nfrom .oxford import MonoDataset\nfrom .skeleton import SkeletonDataset\nfrom .slowfast_video import SFVideoDataset\nfrom .video import VideoDataset\nfrom .ucf101_skeleton import UCF101SkeletonDataset\nfrom .ucf24_dataset import UCF24Dataset\n__all__ = [\n 'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset',\n 'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset',\n 'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset',\n 'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset',\n 'UCF101SkeletonDataset', 'UCF24Dataset'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e4c769ba-4090-47ea-a65c-3c5b9e860a4d.json b/docs/doc/e4c769ba-4090-47ea-a65c-3c5b9e860a4d.json new file mode 100644 index 000000000..678077383 --- /dev/null +++ b/docs/doc/e4c769ba-4090-47ea-a65c-3c5b9e860a4d.json @@ -0,0 +1,20 @@ +{ + "summary": "This code downloads, extracts, and organizes UCF101 dataset into separate folders with training and validation sets, representing a file hierarchy for easy access.", + "details": [ + { + "comment": "This code provides instructions on how to download UCF101 dataset annotations and videos, extract frames from the video files, and generate file path lists for both the original videos and extracted frames. The code also includes commands to execute these steps using provided Python scripts.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf101.md\":0-39", + "content": "# UCF101\u6570\u636e\u51c6\u5907\nUCF101\u6570\u636e\u7684\u76f8\u5173\u51c6\u5907\u3002\u4e3b\u8981\u5305\u62ecUCF101\u7684video\u6587\u4ef6\u4e0b\u8f7d\uff0cvideo\u6587\u4ef6\u63d0\u53d6frames\uff0c\u4ee5\u53ca\u751f\u6210\u6587\u4ef6\u7684\u8def\u5f84list\u3002\n---\n## 1. 
\u6570\u636e\u4e0b\u8f7d\nUCF101\u6570\u636e\u7684\u8be6\u7ec6\u4fe1\u606f\u53ef\u4ee5\u53c2\u8003\u7f51\u7ad9[UCF101](https://www.crcv.ucf.edu/data/UCF101.php)\u3002 \u4e3a\u4e86\u65b9\u4fbf\u7528\u6237\u4f7f\u7528\uff0c\u6211\u4eec\u63d0\u4f9b\u4e86UCF101\u6570\u636e\u7684annotations\u6587\u4ef6\u548cvideos\u6587\u4ef6\u7684\u4e0b\u8f7d\u811a\u672c\u3002\n### \u4e0b\u8f7dannotations\u6587\u4ef6\n\u9996\u5148\uff0c\u8bf7\u786e\u4fdd\u5728`./data/dataset/ucf101/`\u76ee\u5f55\u4e0b\uff0c\u8f93\u5165\u5982\u4e0bUCF101\u6570\u636e\u96c6\u7684\u6807\u6ce8\u6587\u4ef6\u7684\u547d\u4ee4\u3002\n```shell\nbash download_annotations.sh\n```\n### \u4e0b\u8f7dUCF101\u7684\u89c6\u9891\u6587\u4ef6\n\u540c\u6837\u9700\u8981\u786e\u4fdd\u5728`./data/dataset/ucf101/`\u76ee\u5f55\u4e0b\uff0c\u8f93\u5165\u4e0b\u8ff0\u547d\u4ee4\u4e0b\u8f7d\u89c6\u9891\u6587\u4ef6\n```shell\nbash download_videos.sh\n```\n\u4e0b\u8f7d\u5b8c\u6210\u540e\u89c6\u9891\u6587\u4ef6\u4f1a\u5b58\u50a8\u5728`./data/dataset/ucf101/videos/`\u6587\u4ef6\u5939\u4e0b\uff0c\u89c6\u9891\u6587\u4ef6\u5927\u5c0f\u4e3a6.8G\u3002\n---\n## 2. \u63d0\u53d6\u89c6\u9891\u6587\u4ef6\u7684frames\n\u4e3a\u4e86\u52a0\u901f\u7f51\u7edc\u7684\u8bad\u7ec3\u8fc7\u7a0b\uff0c\u6211\u4eec\u9996\u5148\u5bf9\u89c6\u9891\u6587\u4ef6\uff08ucf101\u89c6\u9891\u6587\u4ef6\u4e3aavi\u683c\u5f0f\uff09\u63d0\u53d6\u5e27 (frames)\u3002\u76f8\u5bf9\u4e8e\u76f4\u63a5\u901a\u8fc7\u89c6\u9891\u6587\u4ef6\u8fdb\u884c\u7f51\u7edc\u8bad\u7ec3\u7684\u65b9\u5f0f\uff0cframes\u7684\u65b9\u5f0f\u80fd\u591f\u52a0\u5feb\u7f51\u7edc\u8bad\u7ec3\u7684\u901f\u5ea6\u3002\n\u76f4\u63a5\u8f93\u5165\u5982\u4e0b\u547d\u4ee4\uff0c\u5373\u53ef\u63d0\u53d6ucf101\u89c6\u9891\u6587\u4ef6\u7684frames\n``` python\npython extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext avi\n```\n\u89c6\u9891\u6587\u4ef6frames\u63d0\u53d6\u5b8c\u6210\u540e\uff0c\u4f1a\u5b58\u50a8\u5728`./rawframes`\u6587\u4ef6\u5939\u4e0b\uff0c\u5927\u5c0f\u4e3a56G\u3002\n---\n## 3. \u751f\u6210frames\u6587\u4ef6\u548c\u89c6\u9891\u6587\u4ef6\u7684\u8def\u5f84list\n\u751f\u6210\u89c6\u9891\u6587\u4ef6\u7684\u8def\u5f84list\uff0c\u8f93\u5165\u5982\u4e0b\u547d\u4ee4\n```python\npython build_ucf101_file_list.py videos/ --level 2 --format videos --out_list_path ./\n```\n\u751f\u6210frames\u6587\u4ef6\u7684\u8def\u5f84list\uff0c\u8f93\u5165\u5982\u4e0b\u547d\u4ee4\uff1a\n```python\npython build_ucf101_file_list.py rawframes/ --level 2 --format rawframes --out_list_path ./" + }, + { + "comment": "This code is describing the organization of files for UCF101 dataset, specifying paths and formats. It organizes videos or frames into separate folders based on their categories and splits them into training and validation sets. 
The annotations folder contains information about each video or frame, while the dataset folder stores the generated path lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf101.md\":40-80", + "content": "```\n**\u53c2\u6570\u8bf4\u660e**\n`videos/` \u6216\u8005 `rawframes/` \uff1a \u8868\u793a\u89c6\u9891\u6216\u8005frames\u6587\u4ef6\u7684\u5b58\u50a8\u8def\u5f84\n`--level 2` \uff1a \u8868\u793a\u6587\u4ef6\u7684\u5b58\u50a8\u7ed3\u6784\n`--format`\uff1a \u8868\u793a\u662f\u9488\u5bf9\u89c6\u9891\u8fd8\u662fframes\u751f\u6210\u8def\u5f84list\n`--out_list_path `\uff1a \u8868\u793a\u751f\u7684\u8def\u5f84list\u6587\u4ef6\u5b58\u50a8\u4f4d\u7f6e\n# \u4ee5\u4e0a\u6b65\u9aa4\u5b8c\u6210\u540e\uff0c\u6587\u4ef6\u7ec4\u7ec7\u5f62\u5f0f\u5982\u4e0b\u6240\u793a\n```\n\u251c\u2500\u2500 data\n| \u251c\u2500\u2500 dataset\n| \u2502 \u251c\u2500\u2500 ucf101\n| \u2502 \u2502 \u251c\u2500\u2500 ucf101_{train,val}_split_{1,2,3}_rawframes.txt\n| \u2502 \u2502 \u251c\u2500\u2500 ucf101_{train,val}_split_{1,2,3}_videos.txt\n| \u2502 \u2502 \u251c\u2500\u2500 annotations\n| \u2502 \u2502 \u251c\u2500\u2500 videos\n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 ApplyEyeMakeup\n| \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 v_ApplyEyeMakeup_g01_c01.avi\n| \n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 YoYo\n| \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 v_YoYo_g25_c05.avi\n| \u2502 \u2502 \u251c\u2500\u2500 rawframes\n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 ApplyEyeMakeup\n| \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 v_ApplyEyeMakeup_g01_c01\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 img_00001.jpg\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 img_00002.jpg\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 ...\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 flow_x_00001.jpg\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 flow_x_00002.jpg\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 ...\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 flow_y_00001.jpg\n| \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 flow_y_00002.jpg\n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 ...\n| \u2502 \u2502 \u2502 \u251c\u2500\u2500 YoYo" + }, + { + "comment": "Code represents a file hierarchy in the UCF101 dataset, where each folder inside \"dataset\" corresponds to a video category and contains clips (e.g., v_YoYo_g01_c01).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/ucf101.md\":81-85", + "content": "| \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 v_YoYo_g01_c01\n| \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 ...\n| \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 v_YoYo_g25_c05\n```" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e503dca0-1546-4303-8a8e-4e2ba7b82c35.json b/docs/doc/e503dca0-1546-4303-8a8e-4e2ba7b82c35.json new file mode 100644 index 000000000..217464e53 --- /dev/null +++ b/docs/doc/e503dca0-1546-4303-8a8e-4e2ba7b82c35.json @@ -0,0 +1,10 @@ +{ + "summary": "Updating PaddleVideo's EIVideo on GitHub: pushing and pulling development branches, splitting and rejoining code.", + "details": [ + { + "comment": "Updating PaddleVideo's EIVideo on GitHub: pushing and pulling development branches, splitting and rejoining code.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/resources/cmd\":0-3", + "content": "# \u66f4\u65b0PaddleVideo\u4e0a\u7684EIVideo\ngit subtree push --prefix=applications/EIVideo/ 
https://github.com/QPT-Family/EIVideo \u5f00\u53d1\u5206\u652f\ngit subtree pull --prefix=applications/EIVideo/ https://github.com/QPT-Family/EIVideo \u5f00\u53d1\u5206\u652f --squash\ngit subtree split --rejoin --prefix=applications/EIVideo/ --branch \u5f00\u53d1\u5206\u652f" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e50b2b3e-3714-41e3-8c30-d1fb8613eb3a.json b/docs/doc/e50b2b3e-3714-41e3-8c30-d1fb8613eb3a.json new file mode 100644 index 000000000..35639f613 --- /dev/null +++ b/docs/doc/e50b2b3e-3714-41e3-8c30-d1fb8613eb3a.json @@ -0,0 +1,55 @@ +{ + "summary": "The Python code's `AveragePrecisionCalculator` class calculates interpolated average precision, supports large datasets, and handles sparse prediction scores and ground truth labels for classification tasks.", + "details": [ + { + "comment": "This code is for a Python class that calculates the interpolated average precision (IAP) of ranked items in a list. It follows the definition provided in the given link and can be used as a static function to directly calculate IAP from a short ranked list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":0-22", + "content": "# Copyright 2016 Google Inc. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS-IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Calculate or keep track of the interpolated average precision.\nIt provides an interface for calculating interpolated average precision for an\nentire list or the top-n ranked items. For the definition of the\n(non-)interpolated average precision:\nhttp://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf\nExample usages:\n1) Use it as a static function call to directly calculate average precision for\na short ranked list in the memory." + }, + { + "comment": "The code creates an instance of the AveragePrecisionCalculator class and uses its accumulate method to process parts of a ranked list that cannot be stored in memory or observed at once. After processing all parts, it uses the peek_interpolated_ap_at_n method to get the interpolated average precision at a given number of elements. The code also imports heapq and random modules for priority queue and random number generation respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":24-54", + "content": "```\nimport random\np = np.array([random.random() for _ in xrange(10)])\na = np.array([random.choice([0, 1]) for _ in xrange(10)])\nap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)\n```\n2) Use it as an object for long ranked list that cannot be stored in memory or\nthe case where partial predictions can be observed at a time (Tensorflow\npredictions). In this case, we first call the function accumulate many times\nto process parts of the ranked list. 
After processing all the parts, we call\npeek_interpolated_ap_at_n.\n```\np1 = np.array([random.random() for _ in xrange(5)])\na1 = np.array([random.choice([0, 1]) for _ in xrange(5)])\np2 = np.array([random.random() for _ in xrange(5)])\na2 = np.array([random.choice([0, 1]) for _ in xrange(5)])\n# interpolated average precision at 10 using 1000 break points\ncalculator = average_precision_calculator.AveragePrecisionCalculator(10)\ncalculator.accumulate(p1, a1)\ncalculator.accumulate(p2, a2)\nap3 = calculator.peek_ap_at_n()\n```\n\"\"\"\nimport heapq\nimport random\nimport numbers" + }, + { + "comment": "This code defines a class `AveragePrecisionCalculator` that calculates average precision and average precision at n for a single label. It takes a `top_n` argument to specify the average precision at n or uses all provided data points if None. The class maintains a max heap of (prediction, actual) and provides a `heap_size` property to get the heap size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":56-85", + "content": "import numpy\nclass AveragePrecisionCalculator(object):\n \"\"\"Calculate the average precision and average precision at n.\"\"\"\n def __init__(self, top_n=None):\n \"\"\"Construct an AveragePrecisionCalculator to calculate average precision.\n This class is used to calculate the average precision for a single label.\n Args:\n top_n: A positive Integer specifying the average precision at n, or\n None to use all provided data points.\n Raises:\n ValueError: An error occurred when the top_n is not a positive integer.\n \"\"\"\n if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None):\n raise ValueError(\"top_n must be a positive integer or None.\")\n self._top_n = top_n # average precision at n\n self._total_positives = 0 # total number of positives have seen\n self._heap = [] # max heap of (prediction, actual)\n @property\n def heap_size(self):\n \"\"\"Gets the heap size maintained in the class.\"\"\"\n return len(self._heap)\n @property" + }, + { + "comment": "This function accumulates prediction scores and ground truth labels, allowing for the calculation of average precision after the call. The function requires both predictions and actuals to have the same shape. If inputs are incomplete, you can provide 'num_positives' to accurately track recall.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":86-107", + "content": " def num_accumulated_positives(self):\n \"\"\"Gets the number of positive samples that have been accumulated.\"\"\"\n return self._total_positives\n def accumulate(self, predictions, actuals, num_positives=None):\n \"\"\"Accumulate the predictions and their ground truth labels.\n After the function call, we may call peek_ap_at_n to actually calculate\n the average precision.\n Note predictions and actuals must have the same shape.\n Args:\n predictions: a list storing the prediction scores.\n actuals: a list storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n num_positives = If the 'predictions' and 'actuals' inputs aren't complete,\n then it's possible some true positives were missed in them. 
In that case,\n you can provide 'num_positives' in order to accurately track recall.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match." + }, + { + "comment": "This code snippet is a class method that checks the shape compatibility of 'predictions' and 'actuals', verifies if 'num_positives' is a nonzero number, calculates the total positives, and populates a heap. It also ensures the correctness of the predictions by comparing them to the actuals and updating the heap accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":108-133", + "content": " \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if not num_positives is None:\n if not isinstance(num_positives,\n numbers.Number) or num_positives < 0:\n raise ValueError(\n \"'num_positives' was provided but it wan't a nonzero number.\"\n )\n if not num_positives is None:\n self._total_positives += num_positives\n else:\n self._total_positives += numpy.size(numpy.where(actuals > 0))\n topk = self._top_n\n heap = self._heap\n for i in range(numpy.size(predictions)):\n if topk is None or len(heap) < topk:\n heapq.heappush(heap, (predictions[i], actuals[i]))\n else:\n if predictions[i] > heap[0][0]: # heap[0] is the smallest\n heapq.heappop(heap)\n heapq.heappush(heap, (predictions[i], actuals[i]))" + }, + { + "comment": "This code is part of a class that calculates average precision for video tagging. It includes methods to clear accumulated predictions, peek the non-interpolated average precision at a specific point (n), and calculate the non-interpolated average precision using prediction and actual scores arrays.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":135-165", + "content": " def clear(self):\n \"\"\"Clear the accumulated predictions.\"\"\"\n self._heap = []\n self._total_positives = 0\n def peek_ap_at_n(self):\n \"\"\"Peek the non-interpolated average precision at n.\n Returns:\n The non-interpolated average precision at n (default 0).\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n \"\"\"\n if self.heap_size <= 0:\n return 0\n predlists = numpy.array(list(zip(*self._heap)))\n ap = self.ap_at_n(predlists[0],\n predlists[1],\n n=self._top_n,\n total_num_positives=self._total_positives)\n return ap\n @staticmethod\n def ap(predictions, actuals):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives." + }, + { + "comment": "This code calculates the non-interpolated average precision at 'n' in a list. It takes sparse prediction scores and ground truth labels as input, with any value larger than 0 treated as positives. 
It also allows specifying the total number of positive items in the list, which can be used for calculation if provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":167-191", + "content": " Returns:\n The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when the format of the input is not the\n numpy 1-D array or the shape of predictions and actuals does not match.\n \"\"\"\n return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None)\n @staticmethod\n def ap_at_n(predictions, actuals, n=20, total_num_positives=None):\n \"\"\"Calculate the non-interpolated average precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n actuals: a numpy 1-D array storing the ground truth labels. Any value\n larger than 0 will be treated as positives, otherwise as negatives.\n n: the top n items to be considered in ap@n.\n total_num_positives : (optionally) you can specify the number of total\n positive\n in the list. If specified, it will be used in calculation.\n Returns:" + }, + { + "comment": "The code defines a function that calculates the average precision at a specific rank, n. It checks the shape of predictions and actuals arrays and if n is positive integer or None. If any error occurs, it raises ValueError. The code also shuffles the predictions and actuals to avoid overestimating the average precision.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":192-219", + "content": " The non-interpolated average precision at n.\n If n is larger than the length of the ranked list,\n the average precision will be returned.\n Raises:\n ValueError: An error occurred when\n 1) the format of the input is not the numpy 1-D array;\n 2) the shape of predictions and actuals does not match;\n 3) the input n is not a positive integer.\n \"\"\"\n if len(predictions) != len(actuals):\n raise ValueError(\n \"the shape of predictions and actuals does not match.\")\n if n is not None:\n if not isinstance(n, int) or n <= 0:\n raise ValueError(\"n must be 'None' or a positive integer.\"\n \" It was '%s'.\" % n)\n ap = 0.0\n predictions = numpy.array(predictions)\n actuals = numpy.array(actuals)\n # add a shuffler to avoid overestimating the ap\n predictions, actuals = AveragePrecisionCalculator._shuffle(\n predictions, actuals)\n sortidx = sorted(range(len(predictions))," + }, + { + "comment": "This code calculates the average precision of a classification task by first shuffling the predictions and actuals, then iterating through the sorted list to calculate the precision at each recall step. 
It handles cases where total_num_positives is given or automatically calculated.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":220-255", + "content": " key=lambda k: predictions[k],\n reverse=True)\n if total_num_positives is None:\n numpos = numpy.size(numpy.where(actuals > 0))\n else:\n numpos = total_num_positives\n if numpos == 0:\n return 0\n if n is not None:\n numpos = min(numpos, n)\n delta_recall = 1.0 / numpos\n poscount = 0.0\n # calculate the ap\n r = len(sortidx)\n if n is not None:\n r = min(r, n)\n for i in range(r):\n if actuals[sortidx[i]] > 0:\n poscount += 1\n ap += poscount / (i + 1) * delta_recall\n return ap\n @staticmethod\n def _shuffle(predictions, actuals):\n random.seed(0)\n suffidx = random.sample(range(len(predictions)), len(predictions))\n predictions = predictions[suffidx]\n actuals = actuals[suffidx]\n return predictions, actuals\n @staticmethod\n def _zero_one_normalize(predictions, epsilon=1e-7):" + }, + { + "comment": "This function normalizes the predictions to a range of 0.0 to 1.0 by subtracting the minimum prediction and dividing by the maximum denominator (prediction difference) with an optional epsilon value to prevent division by zero.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py\":256-273", + "content": " \"\"\"Normalize the predictions to the range between 0.0 and 1.0.\n For some predictions like SVM predictions, we need to normalize them before\n calculate the interpolated average precision. The normalization will not\n change the rank in the original list and thus won't change the average\n precision.\n Args:\n predictions: a numpy 1-D array storing the sparse prediction scores.\n epsilon: a small constant to avoid denominator being zero.\n Returns:\n The normalized prediction.\n \"\"\"\n denominator = numpy.max(predictions) - numpy.min(predictions)\n ret = (predictions - numpy.min(predictions)) / numpy.max(\n denominator, epsilon)\n return ret" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e575fca7-8d39-4ac0-a79a-138261037f33.json b/docs/doc/e575fca7-8d39-4ac0-a79a-138261037f33.json new file mode 100644 index 000000000..4f0a0225e --- /dev/null +++ b/docs/doc/e575fca7-8d39-4ac0-a79a-138261037f33.json @@ -0,0 +1,10 @@ +{ + "summary": "This code snippet is importing the paddlevideo_version from the version module. This suggests that this file is serving as an initialization point for the PaddleVideo library, potentially setting up necessary imports or defining constants and functions to be used throughout the library.", + "details": [ + { + "comment": "This code snippet is importing the paddlevideo_version from the version module. This suggests that this file is serving as an initialization point for the PaddleVideo library, potentially setting up necessary imports or defining constants and functions to be used throughout the library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/__init__.py\":0-14", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .version import paddlevideo_version" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e6df6256-e18b-4ad5-98b8-5e8f9ec0824e.json b/docs/doc/e6df6256-e18b-4ad5-98b8-5e8f9ec0824e.json new file mode 100644 index 000000000..71318063c --- /dev/null +++ b/docs/doc/e6df6256-e18b-4ad5-98b8-5e8f9ec0824e.json @@ -0,0 +1,50 @@ +{ + "summary": "This code initializes a PaddlePaddle model for video training, sets up feeds and outputs, configures loss and optimizer, builds the model, prepares programs, trains, logs, and saves it. The main function handles arguments, checks save directory, and executes the training process.", + "details": [ + { + "comment": "The code imports necessary libraries and modules, enables static mode for PaddlePaddle, initializes a model (AttentionLstmErnie), defines train_with_pyreader function, and handles config file operations. It follows the Apache License 2.0 and provides information for obtaining the license.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":0-33", + "content": "\"\"\"\ntrain main\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport argparse\nimport logging\nimport numpy as np\nimport paddle\npaddle.enable_static()\nimport paddle.static as static\nfrom accuracy_metrics import MetricsCalculator\nfrom datareader import get_reader\nfrom config import print_configs, merge_configs, parse_config\nfrom models.attention_lstm_ernie import AttentionLstmErnie\nfrom utils import init_pretraining_params, train_with_pyreader" + }, + { + "comment": "This code sets up the logging configuration and parses command-line arguments for training a video model using Paddle Video. The default model name is 'BaiduNet', and the config file path is 'configs/conf.txt'. 
It also allows setting the batch size, learning rate, and pretrain weights through command-line flags.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":36-70", + "content": "logging.root.handlers = []\nFORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'\nlogging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)\nlogger = logging.getLogger(__name__)\ndef parse_args():\n \"\"\"parse_args\n \"\"\"\n parser = argparse.ArgumentParser(\"Paddle Video train script\")\n parser.add_argument(\n '--model_name',\n type=str,\n default='BaiduNet',\n help='name of model to train.')\n parser.add_argument(\n '--config',\n type=str,\n default='configs/conf.txt',\n help='path to config file of model')\n parser.add_argument(\n '--batch_size',\n type=int,\n default=None,\n help='training batch size. None to use config file setting.')\n parser.add_argument(\n '--learning_rate',\n type=float,\n default=None,\n help='learning rate use for training. None to use config file setting.')\n parser.add_argument(\n '--pretrain',\n type=str,\n default=None,\n help='path to pretrain weights. None to use default weights path in ~/.paddle/weights.'" + }, + { + "comment": "This code snippet from the PaddleVideo library's MultimodalVideoTag application defines command line argument options for training. Options include resuming training, GPU usage, disabling pyreader, memory optimization during training, epoch number, validation interval, and saving directory.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":71-104", + "content": " )\n parser.add_argument(\n '--resume',\n type=str,\n default=None,\n help='path to resume training based on previous checkpoints. 
'\n 'None for not resuming any checkpoints.')\n parser.add_argument(\n '--use_gpu', type=bool, default=True, help='default use gpu.')\n parser.add_argument(\n '--no_use_pyreader',\n action='store_true',\n default=False,\n help='whether to use pyreader')\n parser.add_argument(\n '--no_memory_optimize',\n action='store_true',\n default=False,\n help='whether to use memory optimize in train')\n parser.add_argument(\n '--epoch_num',\n type=int,\n default=0,\n help='epoch number, 0 for read from config file')\n parser.add_argument(\n '--valid_interval',\n type=int,\n default=1,\n help='validation epoch interval, 0 for no validation.')\n parser.add_argument(\n '--save_dir',\n type=str,\n default='checkpoints',\n help='directory name to save train snapshoot')" + }, + { + "comment": "This code defines command-line arguments for the mini-batch interval to log and the save filename, parses the configuration file, creates train and valid models based on the model name and configurations, sets the maximum number of training steps, and prepares static programs for building the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":105-135", + "content": " parser.add_argument(\n '--log_interval',\n type=int,\n default=10,\n help='mini-batch interval to log.')\n parser.add_argument(\n '--save_log_name',\n type=str,\n default='train_val',\n help='save to tensorboard filename recommand model name.')\n args = parser.parse_args()\n return args\ndef train(args):\n \"\"\"train main\n \"\"\"\n # parse config\n config = parse_config(args.config)\n train_config = merge_configs(config, 'train', vars(args))\n valid_config = merge_configs(config, 'valid', vars(args))\n print_configs(train_config, 'Train')\n train_model = AttentionLstmErnie(args.model_name, train_config, mode='train')\n valid_model = AttentionLstmErnie(args.model_name, valid_config, mode='valid')\n max_train_steps = train_config.TRAIN.epoch * train_config.TRAIN.num_samples // train_config.TRAIN.batch_size\n print('max train steps %d' % (max_train_steps))\n # build model\n startup = static.Program()\n train_prog = static.Program()\n with static.program_guard(train_prog, startup):" + }, + { + "comment": "This code snippet prepares the model for training by setting up feeds, outputs, loss, and optimizer. 
It also enables memory optimization if specified by arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":136-159", + "content": " paddle.disable_static()\n train_model.build_input(use_pyreader=True)\n train_model.build_model()\n # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label\n train_feeds = train_model.feeds()\n train_feeds[-1].persistable = True\n # for the output of classification model, has the form [pred]\n train_outputs = train_model.outputs()\n for output in train_outputs:\n output.persistable = True\n train_loss = train_model.loss()\n train_loss.persistable = True\n # outputs, loss, label should be fetched, so set persistable to be true\n optimizer = train_model.optimizer()\n optimizer.minimize(train_loss)\n train_pyreader = train_model.pyreader()\n paddle.enable_static()\n if not args.no_memory_optimize:\n paddle.distributed.transpiler.memory_optimize(train_prog)\n valid_prog = static.Program()\n with static.program_guard(valid_prog, startup):\n paddle.disable_static()" + }, + { + "comment": "The code is building the model, setting up executor and place (CPU or GPU), checking if resume weights exist to load them if necessary, and initializing pre-trained parameters for Ernie model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":160-189", + "content": " valid_model.build_input(True)\n valid_model.build_model()\n valid_feeds = valid_model.feeds()\n valid_outputs = valid_model.outputs()\n valid_loss = valid_model.loss()\n valid_pyreader = valid_model.pyreader()\n paddle.enable_static()\n place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()\n exe = static.Executor(place)\n exe.run(startup)\n if args.resume:\n # if resume weights is given, load resume weights directly\n assert os.path.exists(args.resume), \\\n \"Given resume weight dir {} not exist.\".format(args.resume)\n def if_exist(var):\n \"\"\"if_exist\n \"\"\"\n return os.path.exists(os.path.join(args.resume, var.name))\n print('resuming ,,,,,,,,,,,,,,')\n paddle.fluid.io.load_persistables(\n exe, '', main_program=train_prog, filename=args.resume)\n else:\n # load ernie pretrain model\n init_pretraining_params(exe,\n train_config.TRAIN.ernie_pretrain_dict_path," + }, + { + "comment": "Loading pre-trained weights if provided, enabling inplace for faster execution and creating compiled programs with data parallelism for both training and validation programs. 
If not using PyReader and GPU is enabled, it sets the device list to use CUDA places.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":190-212", + "content": " main_program=train_prog)\n # if not in resume mode, load pretrain weights\n # this pretrain may be only audio or video\n if args.pretrain:\n assert os.path.exists(args.pretrain), \\\n \"Given pretrain weight dir {} not exist.\".format(args.pretrain)\n if args.pretrain:\n train_model.load_test_weights_file(exe, args.pretrain, train_prog, place)\n build_strategy = paddle.static.BuildStrategy()\n build_strategy.enable_inplace = True\n compiled_train_prog = static.CompiledProgram(\n train_prog).with_data_parallel(loss_name=train_loss.name,\n build_strategy=build_strategy)\n compiled_valid_prog = static.CompiledProgram(\n valid_prog).with_data_parallel(share_vars_from=compiled_train_prog,\n build_strategy=build_strategy)\n # get reader\n bs_denominator = 1\n if (not args.no_use_pyreader) and args.use_gpu:\n dev_list = static.cuda_places()" + }, + { + "comment": "This code sets the batch size for training and validation based on the length of the development list. It initializes train and valid readers with these batch sizes, decorates them with specified places, and creates MetricsCalculator objects to get metrics for training and validation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":213-230", + "content": " bs_denominator = len(dev_list)\n train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /\n bs_denominator)\n valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /\n bs_denominator)\n train_reader = get_reader(args.model_name.upper(), 'train', train_config)\n valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)\n exe_places = static.cuda_places() if args.use_gpu else static.cpu_places()\n train_pyreader.decorate_sample_list_generator(train_reader,\n places=exe_places)\n valid_pyreader.decorate_sample_list_generator(valid_reader,\n places=exe_places)\n # get metrics\n train_metrics = MetricsCalculator(args.model_name.upper(), 'train', train_config)\n valid_metrics = MetricsCalculator(args.model_name.upper(), 'valid', valid_config)\n # print(\"****************************valid_metrics\", valid_metrics.get())" + }, + { + "comment": "The code initializes training and validation fetch lists, sets the number of epochs based on argument or model default, then trains the model using the specified executor, programs, feeds, and fetch lists. It also handles logging intervals, valid intervals, save directory, and save model name. 
The main function parses arguments, checks if the save directory exists, and calls the train function to execute the training process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py\":231-262", + "content": " train_fetch_list = [train_loss.name] + [x.name for x in train_outputs\n ] + [train_feeds[-1].name]\n valid_fetch_list = [valid_loss.name] + [x.name for x in valid_outputs\n ] + [valid_feeds[-1].name]\n epochs = args.epoch_num or train_model.epoch_num()\n train_with_pyreader(\n exe,\n train_prog,\n compiled_train_prog,\n train_pyreader,\n train_fetch_list,\n train_metrics,\n epochs=epochs,\n log_interval=args.log_interval,\n valid_interval=args.valid_interval,\n save_dir=args.save_dir,\n save_model_name=args.model_name,\n test_exe=compiled_valid_prog,\n test_pyreader=valid_pyreader,\n test_fetch_list=valid_fetch_list,\n test_metrics=valid_metrics)\nif __name__ == \"__main__\":\n args = parse_args()\n logger.info(args)\n if not os.path.exists(args.save_dir):\n os.makedirs(args.save_dir)\n train(args)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e73084d8-adcc-4bfe-bfb7-f533358c0599.json b/docs/doc/e73084d8-adcc-4bfe-bfb7-f533358c0599.json new file mode 100644 index 000000000..ff6fef9e8 --- /dev/null +++ b/docs/doc/e73084d8-adcc-4bfe-bfb7-f533358c0599.json @@ -0,0 +1,65 @@ +{ + "summary": "FocalLoss optimizes hard examples in object detection, while YowoLoss and RegionLoss use softmax encoding. Code prepares input with reshaping, sigmoid activation, and anchor parameters. The code calculates YOLOv3-style losses for bounding box location, confidence, and classification on GPU.", + "details": [ + { + "comment": "This code snippet defines a FocalLoss class that implements the Focal Loss criterion. It is used for dense object detection and aims to reduce the classification loss for well-classified samples, focusing more on hard examples. The formula for the loss is given as -\u03b1(1-softmax(x)[class])^\u03b3 * log(softmax(x)[class]).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nfrom paddle.static import Variable\nfrom ..registry import LOSSES\nfrom .base import BaseWeightedLoss\nfrom ..framework.localizers.yowo_utils import build_targets\nclass FocalLoss(nn.Layer):\n \"\"\"\n This criterion is a implemenation of Focal Loss, which is proposed in\n Focal Loss for Dense Object Detection.\n Loss(x, class) = - \\alpha (1-softmax(x)[class])^gamma \\log(softmax(x)[class])" + }, + { + "comment": "FocalLoss is a criterion that takes in alpha, gamma, and size_average as arguments. It averages losses across observations for each minibatch by default but can sum the losses if size_average is set to False. 
Alpha is either a tensor or variable, and gamma should be greater than 0, reducing relative loss for well-classified examples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":32-54", + "content": " The losses are averaged across observations for each minibatch.\n Args:\n alpha(1D Tensor, Variable) : the scalar factor for this criterion\n gamma(float, double) : gamma > 0; reduces the relative loss for well-classi\ufb01ed examples (p > .5),\n putting more focus on hard, misclassi\ufb01ed examples\n size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch.\n However, if the field size_average is set to False, the losses are\n instead summed for each minibatch.\n \"\"\"\n def __init__(self, class_num, alpha=None, gamma=2, size_average=True):\n super(FocalLoss, self).__init__()\n if alpha is None:\n self.alpha = paddle.ones(\n [class_num, 1])\n self.alpha.stop_gradient = False\n else:\n if isinstance(alpha, Variable):\n self.alpha = alpha\n else:" + }, + { + "comment": "This code defines a class for the Yowo loss function. The constructor sets various attributes like alpha, gamma, class_num, size_average, and stop_gradient. The forward method calculates the loss using softmax, one-hot encoding, and other operations. If inputs or self.alpha are not in GPU, it transfers them to the GPU. It then computes the batch_loss and finally returns either average or sum depending on size_average attribute.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":55-86", + "content": " self.alpha = (alpha)\n self.alpha.stop_gradient = False\n self.gamma = gamma\n self.class_num = class_num\n self.size_average = size_average\n def forward(self, inputs, targets):\n N = inputs.shape[0]\n C = inputs.shape[1]\n P = F.softmax(inputs, axis=1)\n tmp = numpy.zeros((N, C))\n class_mask = paddle.to_tensor(tmp, place=inputs.place)\n class_mask.stop_gradient = False\n ids = paddle.reshape(targets, [-1, 1])\n class_mask = F.one_hot(ids.squeeze(-1), class_mask.shape[1])\n if \"Place\" not in str(inputs.place) and \"Place\" not in str(self.alpha.place):\n self.alpha = self.alpha.cuda()\n alpha = self.alpha[paddle.reshape(ids.detach(), [-1])]\n probs = paddle.reshape((P * class_mask).sum(1), [-1, 1])\n log_p = probs.log()\n batch_loss = -alpha * (paddle.pow((1 - probs), self.gamma)) * log_p\n if self.size_average:\n loss = batch_loss.mean()\n else:\n loss = batch_loss.sum()" + }, + { + "comment": "This code defines a RegionLoss class that inherits from BaseWeightedLoss. It takes parameters such as num_classes, anchors, num_anchors, object_scale, noobject_scale, class_scale, and coord_scale. The class initializes an instance of FocalLoss and sets a threshold. 
The forward method computes the loss between output and target tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":87-111", + "content": " return loss\n@LOSSES.register()\nclass RegionLoss(BaseWeightedLoss):\n # for our model anchors has 10 values and number of anchors is 5\n # parameters: 24, 10 float values, 24, 5\n def __init__(self, num_classes, anchors, num_anchors, object_scale, noobject_scale, class_scale, coord_scale):\n super().__init__()\n self.num_classes = num_classes\n self.anchors = [float(x) for x in anchors]\n self.num_anchors = num_anchors\n self.anchor_step = len(self.anchors) // self.num_anchors # each anchor has 2 parameters\n self.object_scale = object_scale\n self.noobject_scale = noobject_scale\n self.class_scale = class_scale\n self.coord_scale = coord_scale\n self.focalloss = FocalLoss(class_num=self.num_classes, gamma=2, size_average=False)\n self.thresh = 0.6\n def convert2cpu(self, gpu_matrix):\n # return paddle.to_tensor((gpu_matrix.shape), dtype=\"float32\").copy_(gpu_matrix)\n return gpu_matrix.cpu()\n def forward(self, output, target):" + }, + { + "comment": "This code reshapes the output tensor for each anchor's parameters and applies sigmoid activation to the transformed tensor. The output tensor is of shape B*A*(4+1+num_classes)*H*W, which represents the coordinates (tx, ty), width, height, confidence score, and class probabilities for each anchor box in the image grid. By applying sigmoid activation functions to tx and ty, the code scales the anchor's parameter values between 0 and 1, preparing them for the subsequent operations in the YOLOv4 model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":112-136", + "content": " # output : B*A*(4+1+num_classes)*H*W 8*5*29*24*24\n # B: number of batches\n # A: number of anchors\n # 4: 4 parameters for each bounding box\n # 1: confidence score\n # num_classes\n # H: height of the image (in grids)\n # W: width of the image (in grids)\n # for each grid cell, there are A*(4+1+num_classes) parameters\n nB = output.detach().shape[0] # batch\n nA = self.num_anchors # anchor_num\n nC = self.num_classes\n nH = output.detach().shape[2]\n nW = output.detach().shape[3]\n # resize the output (all parameters for each anchor can be reached)\n output = paddle.reshape(output, [nB, nA, (5 + nC), nH, nW])\n # anchor's parameter tx\n x = F.sigmoid(\n paddle.reshape(paddle.index_select(output, paddle.to_tensor([0], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW]))\n x.stop_gradient = False\n # anchor's parameter ty\n y = F.sigmoid(" + }, + { + "comment": "The code reshapes and assigns stop_gradient to output slices of the tensor \"output\" corresponding to anchor parameters (paddle, w, h) and a confidence score (conf), as well as class labels (cls). 
All are assigned stop_gradient=False.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":137-154", + "content": " paddle.reshape(paddle.index_select(output, paddle.to_tensor([1], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW]))\n y.stop_gradient = False\n # anchor's parameter tw\n w = paddle.reshape(paddle.index_select(output, paddle.to_tensor([2], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW])\n w.stop_gradient = False\n # anchor's parameter th\n h = paddle.reshape(paddle.index_select(output, paddle.to_tensor([3], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW])\n h.stop_gradient = False\n # confidence score for each anchor\n conf = F.sigmoid(\n paddle.reshape(paddle.index_select(output, paddle.to_tensor([4], dtype='int64').cuda(), axis=2),\n [nB, nA, nH, nW]))\n conf.stop_gradient = False\n # anchor's parameter class label\n cls = paddle.index_select(output, paddle.linspace(5, 5 + nC - 1, nC, 'int64').cuda(), axis=2)" + }, + { + "comment": "This code resizes the data structure to have a class label for each anchor, initializes prediction boxes, and creates grid coordinates for localization. It uses PaddlePaddle's linear algebra functions like paddle.reshape, paddle.transpose, and paddle.linspace. The code aims to prepare the input data for object detection model training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":155-168", + "content": " cls.stop_gradient = False\n # resize the data structure so that for every anchor there is a class label in the last dimension\n cls = paddle.reshape(paddle.transpose(paddle.reshape(cls, [nB * nA, nC, nH * nW]), [0, 2, 1]),\n [nB * nA * nH * nW, nC])\n # for the prediction of localization of each bounding box, there exist 4 parameters (tx, ty, tw, th)\n # pred_boxes = torch.cuda.FloatTensor(4, nB*nA*nH*nW)\n pred_boxes = paddle.zeros([4, nB * nA * nH * nW], dtype='float32').cuda()\n # tx and ty\n grid_x = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nW - 1, nW), [nH, 1]), [nB * nA, 1, 1]),\n [nB * nA * nH * nW]).cuda()\n grid_y = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nH - 1, nH), [nW, 1]).t(), [nB * nA, 1, 1]),\n [nB * nA * nH * nW]).cuda()\n # for each anchor there are anchor_step variables (with the structure num_anchor*anchor_step)" + }, + { + "comment": "This code is preparing anchor width and height values for the YOWO loss function. 
It reshapes the anchors, index selects the width and height values, tiles them to match grid dimensions, and assigns the prediction of bounding box localization for each grid cell.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":169-180", + "content": " # for each row(anchor), the first variable is anchor's width, second is anchor's height\n # pw and ph\n anchor_w = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]),\n paddle.to_tensor([0], dtype='int64'), axis=1).cuda()\n anchor_h = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]),\n paddle.to_tensor([1], dtype='int64'), axis=1).cuda()\n # for each pixel (grid) repeat the above process (obtain width and height of each grid)\n anchor_w = paddle.reshape(paddle.tile(paddle.tile(anchor_w, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW])\n anchor_h = paddle.reshape(paddle.tile(paddle.tile(anchor_h, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW])\n # prediction of bounding box localization\n # x.data and y.data: top left corner of the anchor\n # grid_x, grid_y: tx and ty predictions made by yowo" + }, + { + "comment": "This code reshapes and casts input tensors, calculates predicted bounding box coordinates based on input features, and calls a function to build targets for the model. It then reshapes and transposes the predicted boxes tensor before passing it to the build_targets function. The function is part of the YOLOv3-style loss calculation in PaddleVideo.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":182-198", + "content": " x_data = paddle.reshape(x.detach(), [-1])\n y_data = paddle.reshape(y.detach(), [-1])\n w_data = paddle.reshape(w.detach(), [-1])\n h_data = paddle.reshape(h.detach(), [-1])\n pred_boxes[0] = paddle.cast(x_data, dtype='float32') + paddle.cast(grid_x, dtype='float32') # bx\n pred_boxes[1] = paddle.cast(y_data, dtype='float32') + paddle.cast(grid_y, dtype='float32') # by\n pred_boxes[2] = paddle.exp(paddle.cast(w_data, dtype='float32')) * paddle.cast(anchor_w, dtype='float32') # bw\n pred_boxes[3] = paddle.exp(paddle.cast(h_data, dtype='float32')) * paddle.cast(anchor_h, dtype='float32') # bh\n # the size -1 is inferred from other dimensions\n # pred_boxes (nB*nA*nH*nW, 4)\n pred_boxes = self.convert2cpu(\n paddle.cast(paddle.reshape(paddle.transpose(pred_boxes, (1, 0)), [-1, 4]), dtype='float32'))\n nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes,\n " + }, + { + "comment": "This code is setting up a loss function for object detection. It takes in target values, anchors, number of anchors (nA), number of classes (nC), and the image dimensions (nH, nW). The noobject_scale and object_scale variables control how the loss is applied depending on whether an object is present or not. The cls_mask variable filters out proposals with low box confidence scores. The final predictions are kept if their confidence score is greater than 0.25. 
The tensor tx is moved to the GPU (cuda).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":198-209", + "content": " target.detach(),\n self.anchors, nA,\n nC, \\\n nH, nW,\n self.noobject_scale,\n self.object_scale,\n self.thresh)\n cls_mask = (cls_mask == 1)\n # keep those with high box confidence scores (greater than 0.25) as our final predictions\n nProposals = int((conf > 0.25).sum().detach().item())\n tx = (tx).cuda()" + }, + { + "comment": "This code is moving variables to the GPU and setting their gradient flags to False. Then, it calculates losses for bounding box location, prediction confidence, and classification separately using SmoothL1Loss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":210-236", + "content": " tx.stop_gradient = False\n ty = ty.cuda()\n ty.stop_gradient = False\n tw = tw.cuda()\n tw.stop_gradient = False\n th = th.cuda()\n th.stop_gradient = False\n tconf = tconf.cuda()\n tconf.stop_gradient = False\n tcls = paddle.reshape(tcls, [-1]).astype('int64')[paddle.reshape(cls_mask, [-1])].cuda()\n tcls.stop_gradient = False\n coord_mask = coord_mask.cuda()\n coord_mask.stop_gradient = False\n conf_mask = conf_mask.cuda().sqrt()\n coord_mask.stop_gradient = False\n cls_mask = paddle.tile(paddle.reshape(cls_mask, [-1, 1]), [1, nC]).cuda()\n cls_mask.stop_gradient = False\n cls = paddle.reshape(cls[cls_mask], [-1, nC])\n # losses between predictions and targets (ground truth)\n # In total 6 aspects are considered as losses:\n # 4 for bounding box location, 2 for prediction confidence and classification seperately\n L1_loss = nn.SmoothL1Loss(reduction='sum')\n loss_x = self.coord_scale * L1_loss(paddle.cast(x, dtype=\"float32\") * coord_mask, tx * coord_mask) / 2.0" + }, + { + "comment": "This code calculates the loss for an object detection model, consisting of L1_loss for coordinates (x, y, w, h) and MSELoss for confidence. It applies focal loss for classification with a gamma value of 2, sums all losses together, and returns the total loss and count of correct predictions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/yowo_loss.py\":237-248", + "content": " loss_y = self.coord_scale * L1_loss(paddle.cast(y, dtype=\"float32\") * coord_mask, ty * coord_mask) / 2.0\n loss_w = self.coord_scale * L1_loss(paddle.cast(w * coord_mask, dtype=\"float32\"), tw * coord_mask) / 2.0\n loss_h = self.coord_scale * L1_loss(paddle.cast(h * coord_mask, dtype=\"float32\"), th * coord_mask) / 2.0\n loss_conf = nn.MSELoss(reduction='sum')(paddle.cast(conf, dtype=\"float32\") * conf_mask, tconf * conf_mask) / 2.0\n # try focal loss with gamma = 2\n loss_cls = self.class_scale * self.focalloss(cls, tcls)\n # sum of loss\n loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls\n return loss, nCorrect" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e792b9c3-5b22-449e-b47c-ea429581ee3e.json b/docs/doc/e792b9c3-5b22-449e-b47c-ea429581ee3e.json new file mode 100644 index 000000000..627df0e29 --- /dev/null +++ b/docs/doc/e792b9c3-5b22-449e-b47c-ea429581ee3e.json @@ -0,0 +1,70 @@ +{ + "summary": "This code defines a ResNet model with TSM backbone, implementing the ResNet-C architecture, consisting of multiple blocks and configurable layer numbers. 
It initializes weights based on pretrained values and returns output after processing through all blocks.", + "details": [ + { + "comment": "This code defines a class called ConvBNLayer which is a layer consisting of Conv2D (convolutional) and BatchNorm2D layers. It appears to be part of a larger neural network model, likely used for feature extraction or classification tasks. The class also imports other useful modules such as Linear, Dropout, MaxPool2D, AvgPool2D from paddle.nn, and uses weight_init_ function from utils.save_load module to initialize layer weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":0-33", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport numpy as np\nimport math\nimport sys\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D,\n AvgPool2D)\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nfrom ..weight_init import weight_init_\nfrom ...utils.save_load import load_ckpt\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer." + }, + { + "comment": "This code defines a class for a ConvBNLayer, which includes a convolution layer followed by batch normalization and activation. It has optional parameters for tweaks mode, activation function, and name. Weight and bias initialization are handled in the init_weights method.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":35-59", + "content": " Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n is_tweaks_mode (bool): switch for tweaks. Default: False.\n act (str): Indicate activation after BatchNorm2D layer.\n name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n is_tweaks_mode=False,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self.is_tweaks_mode = is_tweaks_mode\n self._pool2d_avg = AvgPool2D(kernel_size=2," + }, + { + "comment": "This code defines a ResNet TSM backbone with stride, padding, and ceil_mode. It also includes a convolution layer, batch normalization, and activation function. 
The forward function takes inputs and processes them through the defined layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":60-84", + "content": " stride=2,\n padding=0,\n ceil_mode=True)\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n weight_attr=ParamAttr(name=name + \"_weights\"),\n bias_attr=False)\n if name == \"conv1\":\n bn_name = \"bn_\" + name\n else:\n bn_name = \"bn\" + name[3:]\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels,\n weight_attr=ParamAttr(name=bn_name +\n \"_scale\"),\n bias_attr=ParamAttr(bn_name + \"_offset\"))\n def forward(self, inputs):" + }, + { + "comment": "The code defines a BottleneckBlock class which is a layer in the ResNet model. This block consists of two 3x3 convolutional layers, each followed by batch normalization and ReLU activation. The input channels, output channels, stride, and other parameters are defined for this block. This structure helps in reducing the number of parameters while preserving or even improving accuracy in deep neural networks like ResNet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":85-113", + "content": " \"\"\"forward\"\"\"\n if self.is_tweaks_mode:\n inputs = self._pool2d_avg(inputs)\n y = self._conv(inputs)\n y = self._batch_norm(y)\n if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BottleneckBlock(nn.Layer):\n \"\"\"BottleneckBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n if_first=False,\n num_seg=8,\n name=None):\n super(BottleneckBlock, self).__init__()\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=3," + }, + { + "comment": "This code defines a custom layer in the ResNet-D model, with optional 2x2 pooling before convolution. The layer includes several ConvBNLayer components and a shortcut connection. If `short` is True, it adds a 2x2 average pooling layer before the convolution, whose stride is changed to 1. This works well in practice for ResNet-D 2/2.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":114-139", + "content": " stride=stride,\n act=\"relu\",\n name=name + \"_branch2b\")\n self.conv2 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n act=None,\n name=name + \"_branch2c\")\n if not shortcut:\n self.short = ConvBNLayer(\n in_channels=in_channels,\n out_channels=out_channels * 4,\n kernel_size=1,\n stride=\n 1, #ResNet-D 2/2:add a 2\u00d72 average pooling layer with a stride of 2 before the convolution,\n # whose stride is changed to 1, works well in practice.\n is_tweaks_mode=False if if_first else True,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n self.num_seg = num_seg\n def forward(self, inputs):\n \"\"\"forward\"\"\"" + }, + { + "comment": "This code defines a BasicBlock class with convolutional layers, Batch Normalization, and ReLU activation functions. It also includes an optional shortcut connection. 
The method within the class performs temporal shifting on inputs before passing through convolutional layers and adding to the shortcut connection if applicable. Finally, it applies ReLU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":140-169", + "content": " shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)\n y = self.conv0(shifts)\n conv1 = self.conv1(y)\n conv2 = self.conv2(conv1)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(x=short, y=conv2)\n return F.relu(y)\nclass BasicBlock(nn.Layer):\n \"\"\"BasicBlock\"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n stride,\n shortcut=True,\n name=None):\n super(BasicBlock, self).__init__()\n self.stride = stride\n self.conv0 = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=3,\n stride=stride,\n act=\"relu\",\n name=name + \"_branch2a\")\n self.conv1 = ConvBNLayer(in_channels=out_channels,\n out_channels=out_channels," + }, + { + "comment": "This code defines a ResNet TSM backbone model with optional shortcut connections. It includes convolutional layers, batch normalization, and a forward function for computation. The depth of the resnet model is specified as an argument, along with optional pretrained weights.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":170-205", + "content": " kernel_size=3,\n act=None,\n name=name + \"_branch2b\")\n if not shortcut:\n self.short = ConvBNLayer(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n stride=stride,\n name=name + \"_branch1\")\n self.shortcut = shortcut\n def forward(self, inputs):\n \"\"\"forward\"\"\"\n y = self.conv0(inputs)\n conv1 = self.conv1(y)\n if self.shortcut:\n short = inputs\n else:\n short = self.short(inputs)\n y = paddle.add(short, conv1)\n y = F.relu(y)\n return y\n@BACKBONES.register()\nclass ResNetTweaksTSM(nn.Layer):\n \"\"\"ResNet TSM backbone.\n Args:\n depth (int): Depth of resnet model.\n pretrained (str): pretrained model. Default: None.\n \"\"\"\n def __init__(self, depth, num_seg=8, pretrained=None):" + }, + { + "comment": "This code initializes an instance of ResNetTweaksTSM and sets its parameters, including the layer depth and number of segments. It also checks if the input layer is supported and assigns the corresponding depth based on the specified layer type. 
The code defines a ConvBNLayer for the first convolutional layer with 3x3 kernel and relu activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":206-234", + "content": " super(ResNetTweaksTSM, self).__init__()\n self.pretrained = pretrained\n self.layers = depth\n self.num_seg = num_seg\n supported_layers = [18, 34, 50, 101, 152]\n assert self.layers in supported_layers, \\\n \"supported layers are {} but input layer is {}\".format(\n supported_layers, self.layers)\n if self.layers == 18:\n depth = [2, 2, 2, 2]\n elif self.layers == 34 or self.layers == 50:\n depth = [3, 4, 6, 3]\n elif self.layers == 101:\n depth = [3, 4, 23, 3]\n elif self.layers == 152:\n depth = [3, 8, 36, 3]\n in_channels = 64\n out_channels = [64, 128, 256, 512]\n #ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n self.conv1_1 = ConvBNLayer(in_channels=3,\n out_channels=32,\n kernel_size=3,\n stride=2,\n act='relu',\n name=\"conv1_1\")" + }, + { + "comment": "This code defines a ResNet model with Temporal Segment Network (TSM) backbone. It includes convolutional layers, batch normalization, ReLU activation, max pooling, and multiple blocks for the ResNet structure. The number of layers can be configured as 50, 101, or 152, affecting the block's properties.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":235-257", + "content": " self.conv1_2 = ConvBNLayer(in_channels=32,\n out_channels=32,\n kernel_size=3,\n stride=1,\n act='relu',\n name=\"conv1_2\")\n self.conv1_3 = ConvBNLayer(in_channels=32,\n out_channels=64,\n kernel_size=3,\n stride=1,\n act='relu',\n name=\"conv1_3\")\n self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.block_list = []\n if self.layers >= 50:\n for block in range(len(depth)):\n shortcut = False\n for i in range(depth[block]):\n if self.layers in [101, 152] and block == 2:\n if i == 0:\n conv_name = \"res\" + str(block + 2) + \"a\"\n else:" + }, + { + "comment": "This code defines a ResNet model with Temporal Segment Network (TSM) backbone. It creates blocks of BottleneckBlock layers and appends them to the block list based on input and output channel numbers, stride values, and other parameters. The code handles both bottleneck and standard blocks and keeps track of shortcut connections for each block.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":258-277", + "content": " conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n else:\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n bottleneck_block = self.add_sublayer(\n 'bb_%d_%d' %\n (block, i), #same with PaddleClas, for loading pretrain\n BottleneckBlock(\n in_channels=in_channels\n if i == 0 else out_channels[block] * 4,\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n num_seg=self.num_seg,\n shortcut=shortcut,\n if_first=block == i == 0,\n name=conv_name))\n in_channels = out_channels[block] * 4\n self.block_list.append(bottleneck_block)\n shortcut = True\n else:\n for block in range(len(depth)):" + }, + { + "comment": "The code defines a function to initialize the weights of the ResNet backbone. 
If a pretrained loading path is provided, it loads the weights from that path; otherwise, it follows specific weight initialization methods for Conv2D layers in the backbone.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":278-296", + "content": " shortcut = False\n for i in range(depth[block]):\n conv_name = \"res\" + str(block + 2) + chr(97 + i)\n basic_block = self.add_sublayer(\n conv_name,\n BasicBlock(in_channels=in_channels[block]\n if i == 0 else out_channels[block],\n out_channels=out_channels[block],\n stride=2 if i == 0 and block != 0 else 1,\n shortcut=shortcut,\n name=conv_name))\n self.block_list.append(basic_block)\n shortcut = True\n def init_weights(self):\n \"\"\"Initiate the parameters.\n Note:\n 1. when indicate pretrained loading path, will load it to initiate backbone.\n 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D lay" + }, + { + "comment": "This code initializes the backbone network for a video quality assessment model. It checks if pretrained weights are provided and loads them if available, or initializes the layers using Kaiming Normal for convolutional layers and constant value of 1 for batch normalization layers. The forward function defines how the backbone is executed on inputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":296-316", + "content": "er will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function.\n Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html\n \"\"\"\n #XXX: check bias!!! check pretrained!!!\n if isinstance(self.pretrained, str) and self.pretrained.strip() != \"\":\n load_ckpt(self, self.pretrained)\n elif self.pretrained is None or self.pretrained.strip() == \"\":\n for layer in self.sublayers():\n if isinstance(layer, nn.Conv2D):\n #XXX: no bias\n weight_init_(layer, 'KaimingNormal')\n elif isinstance(layer, nn.BatchNorm2D):\n weight_init_(layer, 'Constant', value=1)\n def forward(self, inputs):\n \"\"\"Define how the backbone is going to run.\n \"\"\"\n #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase,\n # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27" + }, + { + "comment": "This code implements the ResNet-C architecture, which uses three 3x3 convolutions and one 7x7 convolution in the first layer, followed by max pooling and multiple blocks. 
The output is returned after processing through all the blocks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py\":317-327", + "content": " #y = paddle.reshape(\n # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]])\n ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv\n y = self.conv1_1(inputs)\n y = self.conv1_2(y)\n y = self.conv1_3(y)\n y = self.pool2D_max(y)\n for block in self.block_list:\n y = block(y)\n return y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e878569f-fa06-4db7-b67c-3dc1f40f3333.json b/docs/doc/e878569f-fa06-4db7-b67c-3dc1f40f3333.json new file mode 100644 index 000000000..97e8e7074 --- /dev/null +++ b/docs/doc/e878569f-fa06-4db7-b67c-3dc1f40f3333.json @@ -0,0 +1,85 @@ +{ + "summary": "The code defines a SeparableConv2d class and layers for convolutional layers, initializes an AlignedXception network with skip connections, ReLU activations, and separable convolutions for feature extraction in the backbone architecture, and utilizes pre-trained weights for image classification tasks.", + "details": [ + { + "comment": "The code defines a `SeparableConv2d` class which extends the `nn.Layer` class and implements a separable convolutional layer with optional batch normalization (`BatchNorm`) and fixed padding applied using the `fixed_padding()` function. It has input channels (`inplanes`), output channels (`planes`), kernel size, stride, dilation rate, whether to use bias or not, and an optional BatchNorm layer as parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":0-33", + "content": "import math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\ndef fixed_padding(inputs, kernel_size, dilation):\n kernel_size_effective = kernel_size + (kernel_size - 1) * (dilation - 1)\n pad_total = kernel_size_effective - 1\n pad_beg = pad_total // 2\n pad_end = pad_total - pad_beg\n padded_inputs = F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end))\n return padded_inputs\nclass SeparableConv2d(nn.Layer):\n def __init__(self,\n inplanes,\n planes,\n kernel_size=3,\n stride=1,\n dilation=1,\n bias=False,\n BatchNorm=None):\n super(SeparableConv2d, self).__init__()\n self.conv1 = nn.Conv2D(inplanes,\n inplanes,\n kernel_size,\n stride,\n 0,\n dilation,\n groups=inplanes,\n bias=bias)" + }, + { + "comment": "The code defines a block layer that consists of convolutional layers, batch normalization, and optional skip connections. It initializes the block layer with specified parameters such as input planes, output planes, number of repetitions, stride, dilation rate, and whether it's the last block or not. 
The forward method performs fixed padding on the input, applies the convolution operation, batch normalization, and finally the pointwise convolution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":34-66", + "content": " self.bn = BatchNorm(inplanes)\n self.pointwise = nn.Conv2D(inplanes, planes, 1, 1, 0, 1, 1, bias=bias)\n def forward(self, x):\n x = fixed_padding(x,\n self.conv1._kernel_size[0],\n dilation=self.conv1.dilation[0])\n x = self.conv1(x)\n x = self.bn(x)\n x = self.pointwise(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n inplanes,\n planes,\n reps,\n stride=1,\n dilation=1,\n BatchNorm=None,\n start_with_relu=True,\n grow_first=True,\n is_last=False):\n super(Block, self).__init__()\n if planes != inplanes or stride != 1:\n self.skip = nn.Conv2D(inplanes,\n planes,\n 1,\n stride=stride,\n bias_attr=False)\n self.skipbn = BatchNorm(planes)" + }, + { + "comment": "The code creates a backbone network with xception structure. It initializes the skip connection, adds a ReLU activation function, and appends layers of separable convolutions with batch normalization. The number of reps determines the number of such layers. If grow_first is True, it starts with a growth block; otherwise, it ends with one.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":67-101", + "content": " else:\n self.skip = None\n self.relu = nn.ReLU()\n rep = []\n filters = inplanes\n if grow_first:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(inplanes,\n planes,\n 3,\n 1,\n dilation,\n BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n filters = planes\n for i in range(reps - 1):\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(filters,\n filters,\n 3,\n 1,\n dilation,\n BatchNorm=BatchNorm))\n rep.append(BatchNorm(filters))\n if not grow_first:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(inplanes,\n planes," + }, + { + "comment": "This code defines a class for the AlignedXception network. It uses separable convolutions with batch normalization and optionally applies ReLU activations at different stages. The function forward performs inference by adding input skip connections and applying batch normalization to skip connections if present.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":102-143", + "content": " 3,\n 1,\n dilation,\n BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n if stride != 1:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(planes, planes, 3, 2, BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n if stride == 1 and is_last:\n rep.append(self.relu)\n rep.append(\n SeparableConv2d(planes, planes, 3, 1, BatchNorm=BatchNorm))\n rep.append(BatchNorm(planes))\n if not start_with_relu:\n rep = rep[1:]\n self.rep = nn.Sequential(*rep)\n def forward(self, inp):\n x = self.rep(inp)\n if self.skip is not None:\n skip = self.skip(inp)\n skip = self.skipbn(skip)\n else:\n skip = inp\n x = x + skip\n return x\nclass AlignedXception(nn.Layer):\n \"\"\"\n Modified Alighed Xception\n \"\"\"\n def __init__(self, output_stride, BatchNorm, pretrained=True):" + }, + { + "comment": "This code initializes an AlignedXception network. 
It sets parameters based on the output_stride, defines convolutional layers and batch normalization for entry flow, and instantiates two blocks with specified dimensions and repetitions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":144-174", + "content": " super(AlignedXception, self).__init__()\n if output_stride == 16:\n entry_block3_stride = 2\n middle_block_dilation = 1\n exit_block_dilations = (1, 2)\n elif output_stride == 8:\n entry_block3_stride = 1\n middle_block_dilation = 2\n exit_block_dilations = (2, 4)\n else:\n raise NotImplementedError\n # Entry flow\n self.conv1 = nn.Conv2D(3, 32, 3, stride=2, padding=1, bias_attr=False)\n self.bn1 = BatchNorm(32)\n self.relu = nn.ReLU()\n self.conv2 = nn.Conv2D(32, 64, 3, stride=1, padding=1, bias_attr=False)\n self.bn2 = BatchNorm(64)\n self.block1 = Block(64,\n 128,\n reps=2,\n stride=2,\n BatchNorm=BatchNorm,\n start_with_relu=False)\n self.block2 = Block(128,\n 256,\n reps=2,\n stride=2," + }, + { + "comment": "The code defines the Xception backbone network, consisting of blocks for feature extraction. The first block (entry_block) has 3x3 convolutions and BatchNorm. The block3 has two repetitions with a stride and is the last block. Middle blocks (block4 and block5) have three repetitions with dilation applied to the filter. All blocks use BatchNorm, start with ReLU activation, and grow first with subsequent layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":175-200", + "content": " BatchNorm=BatchNorm,\n start_with_relu=False,\n grow_first=True)\n self.block3 = Block(256,\n 728,\n reps=2,\n stride=entry_block3_stride,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True,\n is_last=True)\n # Middle flow\n self.block4 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block5 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation," + }, + { + "comment": "The code defines several blocks (block4 to block8) using the Block class with specific parameters for number of input and output channels, repetitions, stride, dilation, BatchNorm implementation, starting with ReLU activation, and growing first. These blocks are used in a Xception network for image classification or detection tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":201-224", + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block6 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block7 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block8 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation," + }, + { + "comment": "The code defines three consecutive blocks (block9, block10, and block11) in a neural network architecture. Each block takes input and output channels of 728, with 3 repetitions of convolution and batch normalization layers, and an increasing dilation factor (middle_block_dilation). 
All blocks start with ReLU activation and grow the number of filters first.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":225-248", + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block9 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block10 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block11 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation," + }, + { + "comment": "The code initializes four block instances, each with 728 input and output channels, performing a series of convolutions with 3 repetitions, a stride of 1, dilation determined by middle_block_dilation, using BatchNorm for normalization, starting with ReLU activation, and growing the first layer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":249-272", + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block12 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block13 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block14 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation," + }, + { + "comment": "The code initializes three blocks, each with 728 input and output channels, repeating the process 3 times, and applying BatchNormalization, starting with ReLU activation, and growing first. These blocks are part of the Xception network in the Ma-Net application for image classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":273-296", + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block15 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block16 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block17 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation," + }, + { + "comment": "The code defines several blocks (block17 to block20) using the Block class. Each block has a specific number of input and output channels, repetitions, stride, dilation rate, BatchNorm type, start with relu, and grow first parameters. 
The last block (block20) connects its output to the next layer in the network.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":297-322", + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block18 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n self.block19 = Block(728,\n 728,\n reps=3,\n stride=1,\n dilation=middle_block_dilation,\n BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=True)\n # Exit flow\n self.block20 = Block(728,\n 1024,\n reps=2,\n stride=1,\n dilation=exit_block_dilations[0]," + }, + { + "comment": "This code defines a series of SeparableConv2d layers with associated BatchNorm layers in an Xception network. The layers have increasing output dimensions and are used for feature extraction and image classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":323-347", + "content": " BatchNorm=BatchNorm,\n start_with_relu=True,\n grow_first=False,\n is_last=True)\n self.conv3 = SeparableConv2d(1024,\n 1536,\n 3,\n stride=1,\n dilation=exit_block_dilations[1],\n BatchNorm=BatchNorm)\n self.bn3 = BatchNorm(1536)\n self.conv4 = SeparableConv2d(1536,\n 1536,\n 3,\n stride=1,\n dilation=exit_block_dilations[1],\n BatchNorm=BatchNorm)\n self.bn4 = BatchNorm(1536)\n self.conv5 = SeparableConv2d(1536,\n 2048,\n 3,\n stride=1," + }, + { + "comment": "The code defines a neural network model with 16 blocks and Batch Normalization. It initializes the weights, has an option to load pre-trained weights, and includes ReLU activation functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":348-389", + "content": " dilation=exit_block_dilations[1],\n BatchNorm=BatchNorm)\n self.bn5 = BatchNorm(2048)\n # Init weights\n self._init_weight()\n # Load pretrained model\n if pretrained:\n self._load_pretrained_model()\n def forward(self, x):\n # Entry flow\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu(x)\n x = self.block1(x)\n # add relu here\n x = self.relu(x)\n low_level_feat = x\n x = self.block2(x)\n x = self.block3(x)\n # Middle flow\n x = self.block4(x)\n x = self.block5(x)\n x = self.block6(x)\n x = self.block7(x)\n x = self.block8(x)\n x = self.block9(x)\n x = self.block10(x)\n x = self.block11(x)\n x = self.block12(x)\n x = self.block13(x)\n x = self.block14(x)\n x = self.block15(x)\n x = self.block16(x)" + }, + { + "comment": "The code defines a neural network model, initializes its weights, and has methods for processing input and loading pre-trained models. The Xception model is used in image classification tasks. It consists of several convolutional layers with batch normalization and ReLU activation functions. The _init_weight method sets up the initial weights for the convolutional layers using a normal distribution. 
The _load_pretrained_model method allows loading a pre-trained Xception model from the PaddleHub library, which can be useful when transferring knowledge from an existing dataset to a new task.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":390-426", + "content": " x = self.block17(x)\n x = self.block18(x)\n x = self.block19(x)\n # Exit flow\n x = self.block20(x)\n x = self.relu(x)\n x = self.conv3(x)\n x = self.bn3(x)\n x = self.relu(x)\n x = self.conv4(x)\n x = self.bn4(x)\n x = self.relu(x)\n x = self.conv5(x)\n x = self.bn5(x)\n x = self.relu(x)\n return x, low_level_feat\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels\n m.weight.normal_(0, math.sqrt(2. / n))\n elif isinstance(m, nn.BatchNorm2D):\n from utils.api import fill_\n fill_(m.weight, 1)\n from utils.api import zero_\n zero_(m.bias)\n def _load_pretrained_model(self):\n import paddlehub as hub\n pretrain_dict = hub.Module(name=\"xception71_imagenet\")\n model_dict = {}\n state_dict = self.state_dict()" + }, + { + "comment": "The code iterates through the pre-trained dictionary, updating specific keys in the model_dict. It handles 'pointwise' layers by unsqueezing the input, and adjusts keys starting with 'block11', 'block12', or 'bn3' by replacing their suffixes to match corresponding blocks. This is likely a method for adapting pre-trained weights to match the target network's structure.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":428-446", + "content": " for k, v in pretrain_dict.items():\n if k in model_dict:\n if 'pointwise' in k:\n v = v.unsqueeze(-1).unsqueeze(-1)\n if k.startswith('block11'):\n model_dict[k] = v\n model_dict[k.replace('block11', 'block12')] = v\n model_dict[k.replace('block11', 'block13')] = v\n model_dict[k.replace('block11', 'block14')] = v\n model_dict[k.replace('block11', 'block15')] = v\n model_dict[k.replace('block11', 'block16')] = v\n model_dict[k.replace('block11', 'block17')] = v\n model_dict[k.replace('block11', 'block18')] = v\n model_dict[k.replace('block11', 'block19')] = v\n elif k.startswith('block12'):\n model_dict[k.replace('block12', 'block20')] = v\n elif k.startswith('bn3'):\n model_dict[k] = v\n model_dict[k.replace('bn3', 'bn4')] = v" + }, + { + "comment": "This code snippet renames 'conv4' and 'bn4' parameters to 'conv5' and 'bn5', respectively, before updating the model dictionary. The final state dictionary is then set as the model's state dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/networks/backbone/xception.py\":447-454", + "content": " elif k.startswith('conv4'):\n model_dict[k.replace('conv4', 'conv5')] = v\n elif k.startswith('bn4'):\n model_dict[k.replace('bn4', 'bn5')] = v\n else:\n model_dict[k] = v\n state_dict.update(model_dict)\n self.set_state_dict(state_dict)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e881da45-26b7-4628-a8c3-aaa333d4076c.json b/docs/doc/e881da45-26b7-4628-a8c3-aaa333d4076c.json new file mode 100644 index 000000000..e6ffa3bcc --- /dev/null +++ b/docs/doc/e881da45-26b7-4628-a8c3-aaa333d4076c.json @@ -0,0 +1,35 @@ +{ + "summary": "This code initializes layer weights in PaddlePaddle, applying truncated normal or other initializations like Gaussian and Kaiming uniform. 
It adjusts for different modes and supports numpy arrays and Paddle tensors.", + "details": [ + { + "comment": "This code defines a function that initializes the weights of a PaddlePaddle layer using specified functions. It can also set bias values and is compatible with numpy arrays and Paddle tensors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/weight_init.py\":0-35", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn.initializer as init\nimport numpy as np\nfrom scipy import special\ndef weight_init_(layer,\n func,\n weight_name=None,\n bias_name=None,\n bias_value=0.0,\n **kwargs):\n \"\"\"\n In-place params init function.\n Usage:\n .. code-block:: python\n import paddle\n import numpy as np\n data = np.ones([3, 4], dtype='float32')" + }, + { + "comment": "Code initializes a Linear layer, applies truncated normal initialization to its weights with specified mean and std deviation, and optionally changes the weight name. If the layer has bias, it initializes the bias with a constant value and optionally changes the bias name. The _no_grad_trunc_normal_ function is used internally by nn.init.trunc_normal_.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/weight_init.py\":36-65", + "content": " linear = paddle.nn.Linear(4, 4)\n input = paddle.to_tensor(data)\n print(linear.weight)\n linear(input)\n weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1)\n print(linear.weight)\n \"\"\"\n if hasattr(layer, 'weight') and layer.weight is not None:\n getattr(init, func)(**kwargs)(layer.weight)\n if weight_name is not None:\n # override weight name\n layer.weight.name = weight_name\n if hasattr(layer, 'bias') and layer.bias is not None:\n init.Constant(bias_value)(layer.bias)\n if bias_name is not None:\n # override bias name\n layer.bias.name = bias_name\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n def norm_cdf(x):\n # Computes standard normal cumulative distribution function\n return (1. + math.erf(x / math.sqrt(2.))) / 2.\n if (mean < a - 2 * std) or (mean > b + 2 * std):\n print(\"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. \"\n \"The distribution of values may be incorrect.\")" + }, + { + "comment": "This code generates weights for a tensor following a truncated Gaussian distribution. 
It computes the lower and upper bounds, uniformly fills the tensor with values between these bounds, transforms them to a standard Gaussian distribution, adjusts the mean and standard deviation, clamps the values within the original bounds, and sets the tensor's value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/weight_init.py\":67-97", + "content": " with paddle.no_grad():\n # Values are generated by using a truncated uniform distribution and\n # then using the inverse CDF for the normal distribution.\n # Get upper and lower cdf values\n l = norm_cdf((a - mean) / std)\n u = norm_cdf((b - mean) / std)\n # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1].\n tmp = np.random.uniform(2 * l - 1, 2 * u - 1,\n size=list(tensor.shape)).astype(np.float32)\n # Use inverse cdf transform for normal distribution to get truncated\n # standard normal\n tmp = special.erfinv(tmp)\n # Transform to proper mean, std\n tmp *= (std * math.sqrt(2.0))\n tmp += mean\n # Clamp to ensure it's in the proper range\n tmp = np.clip(tmp, a, b)\n tensor.set_value(paddle.to_tensor(tmp))\n return tensor\ndef _calculate_fan_in_and_fan_out(tensor):\n dimensions = tensor.dim()\n if dimensions < 2:\n raise ValueError(\n \"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions\"" + }, + { + "comment": "This code initializes weights in a convolutional layer using either truncated normal or Kaiming uniform initialization. It calculates the fan-in and fan-out based on input and output feature maps, receptive field size, and optionally adjusts for different modes. The `trunc_normal_` function generates random values within specific bounds using truncated normal distribution, while `kaiming_normal_` sets weights using Kaiming uniform initialization with an optional nonlinearity parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/weight_init.py\":98-129", + "content": " )\n num_input_fmaps = tensor.shape[1]\n num_output_fmaps = tensor.shape[0]\n receptive_field_size = 1\n if tensor.dim() > 2:\n receptive_field_size = tensor[0][0].numel()\n fan_in = num_input_fmaps * receptive_field_size\n fan_out = num_output_fmaps * receptive_field_size\n return fan_in, fan_out\ndef trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):\n return _no_grad_trunc_normal_(tensor, mean, std, a, b)\ndef kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'):\n def _calculate_correct_fan(tensor, mode):\n mode = mode.lower()\n valid_modes = ['fan_in', 'fan_out']\n if mode not in valid_modes:\n raise ValueError(\n \"Mode {} not supported, please use one of {}\".format(\n mode, valid_modes))\n fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)\n return fan_in if mode == 'fan_in' else fan_out\n def calculate_gain(nonlinearity, param=None):\n linear_fns = [\n 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d'," + }, + { + "comment": "This function initializes the weights of a neural network layer with respect to the nonlinearity used. 
It returns different values depending on the nonlinearity type, calculates the fan for each layer and then applies normal initialization using Paddle's Normal initializer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/weight_init.py\":130-155", + "content": " 'conv_transpose2d', 'conv_transpose3d'\n ]\n if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n return 1\n elif nonlinearity == 'tanh':\n return 5.0 / 3\n elif nonlinearity == 'relu':\n return math.sqrt(2.0)\n elif nonlinearity == 'leaky_relu':\n if param is None:\n negative_slope = 0.01\n elif not isinstance(param, bool) and isinstance(\n param, int) or isinstance(param, float):\n negative_slope = param\n else:\n raise ValueError(\n \"negative_slope {} not a valid number\".format(param))\n return math.sqrt(2.0 / (1 + negative_slope**2))\n else:\n raise ValueError(\"Unsupported nonlinearity {}\".format(nonlinearity))\n fan = _calculate_correct_fan(tensor, mode)\n gain = calculate_gain(nonlinearity, a)\n std = gain / math.sqrt(fan)\n with paddle.no_grad():\n paddle.nn.initializer.Normal(0, std)(tensor)" + }, + { + "comment": "Initializes a tensor with specified values and returns it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/weight_init.py\":156-156", + "content": " return tensor" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e8aaf0b3-2c5a-4019-bb0c-9710ba03a18f.json b/docs/doc/e8aaf0b3-2c5a-4019-bb0c-9710ba03a18f.json new file mode 100644 index 000000000..a0538fd26 --- /dev/null +++ b/docs/doc/e8aaf0b3-2c5a-4019-bb0c-9710ba03a18f.json @@ -0,0 +1,15 @@ +{ + "summary": "This class offers static methods to read dictionary files and perform utility operations related to PaddleVideo, including functions for file handling, image manipulation, value indexing, and frame sampling.", + "details": [ + { + "comment": "Utility class for PaddleVideo containing static methods to read dictionary files and perform various utility operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/utility.h\":0-39", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#pragma once\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \"opencv2/opencv.hpp\"\nnamespace PaddleVideo\n{\n class Utility\n {\n public:\n static std::vector ReadDict(const std::string &path);" + }, + { + "comment": "The code contains several utility functions. It has a function to get all files in a directory, another for rotating and cropping images based on bounding boxes, a template function for finding the index of maximum value in a range, and one for sampling frames from a video file. 
All these belong to the PaddleVideo namespace.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/include/utility.h\":41-53", + "content": " static void GetAllFiles(const char *dir_name, std::vector &all_inputs);\n static cv::Mat GetRotateCropImage(const cv::Mat &srcimage, std::vector> box);\n template inline static size_t argmax(ForwardIterator first, ForwardIterator last)\n {\n return std::distance(first, std::max_element(first, last));\n }\n static std::vector SampleFramesFromVideo(const std::string &VideoPath, const int &num_seg, const int &seg_len);\n };\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e9b1554b-f961-40dd-bcee-c0ce8ece8f10.json b/docs/doc/e9b1554b-f961-40dd-bcee-c0ce8ece8f10.json new file mode 100644 index 000000000..5ad0ca6d7 --- /dev/null +++ b/docs/doc/e9b1554b-f961-40dd-bcee-c0ce8ece8f10.json @@ -0,0 +1,45 @@ +{ + "summary": "The PaddleVideo library's function compresses predictions based on query masks and similarity scores. The code initializes a Paddle model, prepares data loaders, sets evaluation mode, processes samples, calculates metrics, evaluates models, and runs the \"evaluation\" function.", + "details": [ + { + "comment": "This code is part of the PaddleVideo library and contains a function named `compress_predictions`. It imports necessary libraries, defines function parameters, and utilizes various modules from the PaddleVideo library. The function compresses predictions based on query masks and similarity scores (`sims`) with optional top k values. It is type checked for data integrity.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":0-32", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport copy\nimport random\nimport paddle\nimport logging\nimport argparse\nimport numpy as np\nimport model.model as module_arch\nimport model.metric as module_metric\nimport data_loader.data_loaders as module_data\nfrom typing import Tuple\nfrom pathlib import Path\nfrom typeguard import typechecked\nfrom mergedeep import Strategy, merge\nfrom parse_config import ConfigParser\nfrom trainer.trainer import verbose, ctxt_mgr\nfrom utils.util import compute_dims, compute_trn_config\n@typechecked\ndef compress_predictions(query_masks: np.ndarray, sims: np.ndarray, topk: int = 10):" + }, + { + "comment": "Code validates input shapes, ensuring that sims and query_masks represent the same number of videos and queries. It asserts the correct dimensions for sims and query_masks to ensure compatibility in further computations, preventing potential errors.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":33-50", + "content": " \"\"\"We store the indices of the top-k predictions, rather than the full similarity\n matrix, to reduce storage requirements.\n NOTE: The similarity matrix contains `num_queries x num_videos` elements, where\n `num_queries = num_videos x max_num_queries_per_video`. 
We first mask out\n locations in the similarity matrix that correspond to invalid queries (these are\n produced by videos with fewer than `max_num_queries_per_video` descriptions).\n \"\"\"\n # validate the input shapes\n assert query_masks.ndim == 2, \"Expected query_masks to be a matrix\"\n query_num_videos, query_max_per_video = query_masks.shape\n sims_queries, sims_num_videos = sims.shape\n msg = (f\"Expected sims and query masks to represent the same number of videos \"\n f\"(found {sims_num_videos} v {query_num_videos}\")\n assert query_num_videos == sims_num_videos, msg\n msg = (f\"Expected sims and query masks to represent the same number of queries \"\n f\"(found {sims_queries} v {query_num_videos * query_max_per_video}\")" + }, + { + "comment": "This code defines a function that takes a configuration, logger, and model path as input, returns a tuple containing a Paddle.js layer model and an ExpertDataLoader object for training data. The function first computes the expert dimensions and raw input dimensions based on the provided config, then initializes the train data loaders using the same config and returns the model and data loader tuple. The code also handles cases where some features might be missing by allowing the use of zeros to fill in such gaps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":51-83", + "content": " assert query_max_per_video * query_num_videos == sims_queries, msg\n valid_sims = sims[query_masks.flatten().astype(np.bool)]\n ranks = np.argsort(-valid_sims, axis=1)\n return ranks[:, :topk]\n@typechecked\ndef get_model_and_data_loaders(\n config: ConfigParser,\n logger: logging.Logger,\n model_path: Path,\n) -> Tuple[paddle.nn.Layer, module_data.ExpertDataLoader]:\n expert_dims, raw_input_dims = compute_dims(config)\n trn_config = compute_trn_config(config)\n data_loaders = config.init(\n name='data_loader',\n module=module_data,\n logger=logger,\n raw_input_dims=raw_input_dims,\n text_feat=config[\"experts\"][\"text_feat\"],\n text_dim=config[\"experts\"][\"text_dim\"],\n text_agg=config[\"experts\"][\"text_agg\"],\n use_zeros_for_missing=config[\"experts\"].get(\"use_zeros_for_missing\", False),\n eval_only=True,\n )\n model = config.init(\n name='arch',\n module=module_arch,\n expert_dims=expert_dims,\n text_dim=config[\"experts\"][\"text_dim\"]," + }, + { + "comment": "This code is initializing a model and preparing it for evaluation. It loads a checkpoint from the specified model path, creates a data loader, and performs an evaluation with the given configuration. 
The random seed is set to ensure reproducibility of results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":84-115", + "content": " ce_shared_dim=config[\"experts\"].get(\"ce_shared_dim\", None),\n feat_aggregation=config[\"data_loader\"][\"args\"][\"feat_aggregation\"],\n )\n model_path = config._args.resume\n logger.info(f\"Loading checkpoint: {model_path} ...\")\n checkpoint = paddle.load(model_path)\n state_dict = checkpoint\n if config['n_gpu'] > 1:\n model = paddle.DataParallel(model)\n model.load_dict(state_dict)\n return model, data_loaders\ndef evaluation(config, logger=None, trainer=None):\n if logger is None:\n logger = config.get_logger('test')\n if getattr(config._args, \"eval_from_training_config\", False):\n eval_conf = copy.deepcopy(config)\n merge(eval_conf._config, config[\"eval_settings\"], strategy=Strategy.REPLACE)\n config = eval_conf\n logger.info(\"Running evaluation with configuration:\")\n logger.info(config)\n # Set the random initial seeds\n seed = config[\"seed\"]\n logger.info(f\"Setting experiment random seed to {seed}\")\n random.seed(seed)\n np.random.seed(seed)" + }, + { + "comment": "The code snippet initializes the Paddle model, data loaders, and sets the model to evaluation mode. It also prepares the retrieval dataset by checking for nan values and making temporary copies of relevant data elements based on their shape. The code then determines the number of video batches and text batches based on the dataset size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":116-145", + "content": " paddle.seed(seed)\n model, data_loaders = get_model_and_data_loaders(\n config=config,\n logger=logger,\n model_path=Path(config._args.resume),\n )\n logger.info(model)\n metrics = [getattr(module_metric, met) for met in config['metrics']]\n # prepare model for testing. Note that some datasets fail to fit the retrieval\n # set on the GPU, so we run them on the CPU\n model.eval()\n with paddle.no_grad():\n samples, meta = data_loaders[\"retrieval\"]\n #import pdb; pdb.set_trace()\n # To use the nan-checks safely, we need make temporary copies of the data\n all_text_num = samples['text'].shape[0]\n text_keys = ['text', 'cap_id', 'att_mask', 'text_token_mask']\n chk = 100\n tck = 100 \n if samples['text'].shape[0] % chk == 0:\n vid_batch = samples['text'].shape[0] // chk\n else:\n vid_batch = samples['text'].shape[0] // chk + 1\n if samples['text'].shape[0] % tck == 0:\n text_batch = samples['text'].shape[0] // tck" + }, + { + "comment": "This code slices samples into sub-samples and processes them for multiple videos. It then concatenates the processed results along axis 1, storing each result in the list \"sub_sims\". This process is repeated for a batch of text and video samples. 
The code also includes progress printing and utilizes context management to run model operations efficiently.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":146-166", + "content": " else: \n text_batch = samples['text'].shape[0] // tck + 1\n sub_sims = []\n for idx in range(text_batch):\n if idx % 5 == 0:\n print(idx,'/',text_batch)\n sub_samples = {}\n for key in text_keys:\n sub_samples.update({key: samples[key][idx*tck:idx*tck+tck]})\n subsub_sims = []\n for vid in range(vid_batch):\n sub_samples['experts'] = {}\n sub_samples['ind'] = {}\n for expert in samples['experts'].keys():\n sub_samples['experts'][expert] = samples['experts'][expert][vid*chk:vid*chk+chk]\n sub_samples['ind'][expert] = samples['ind'][expert][vid*chk:vid*chk+chk]\n with ctxt_mgr(sub_samples) as valid:\n output = model(**valid)\n subsub_sims.append(output[\"cross_view_conf_matrix\"].cpu())\n subsub_sims = paddle.concat(subsub_sims, axis=1)\n sub_sims.append(subsub_sims)" + }, + { + "comment": "This code calculates metrics for a dataset, concatenates sub-similarities, converts to numpy array, iterates through metrics and computes results for each metric using sims and query_masks. The results are logged for further analysis and information display.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":167-189", + "content": " sub_sims = paddle.concat(sub_sims, axis=0)\n sims = paddle.to_tensor(sub_sims, dtype='float32').numpy()\n dataset = data_loaders.dataset_name\n nested_metrics = {}\n for metric in metrics:\n metric_name = metric.__name__\n res = metric(sims, query_masks=meta[\"query_masks\"])\n verbose(epoch=0, metrics=res, name=dataset, mode=metric_name)\n if trainer is not None:\n if not trainer.mini_train:\n trainer.writer.set_step(step=0, mode=\"val\")\n # avoid tensboard folding by prefixing\n metric_name_ = f\"test_{metric_name}\"\n trainer.log_metrics(res, metric_name=metric_name_, mode=\"val\")\n nested_metrics[metric_name] = res\n log = {}\n for subkey, subval in nested_metrics.items():\n for subsubkey, subsubval in subval.items():\n log[f\"test_{subkey}_{subsubkey}\"] = subsubval\n for key, value in log.items():\n logger.info(\" {:15s}: {}\".format(str(key), value))" + }, + { + "comment": "This code sets up argument parsing and configuration loading for evaluation. 
It checks if a model checkpoint is specified via --resume flag, then merges the main config file with eval_settings (if provided), finally calling the \"evaluation\" function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/test.py\":192-205", + "content": "if __name__ == '__main__':\n args = argparse.ArgumentParser(description='PyTorch Template')\n args.add_argument('--config', default=None, type=str, help=\"config file path\")\n args.add_argument('--resume', default=None, help='path to checkpoint for evaluation')\n args.add_argument('--eval_from_training_config', action=\"store_true\",\n help=\"if true, evaluate directly from a training config file.\")\n args.add_argument(\"--custom_args\", help=\"qualified key,val pairs\")\n eval_config = ConfigParser(args)\n cfg_msg = \"For evaluation, a model checkpoint must be specified via the --resume flag\"\n assert eval_config._args.resume, cfg_msg\n if eval_config._config.get(\"eval_settings\", False):\n merge(eval_config._config, eval_config[\"eval_settings\"], strategy=Strategy.REPLACE)\n evaluation(eval_config)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ea007cde-8602-4b41-ad03-a2e699e98c59.json b/docs/doc/ea007cde-8602-4b41-ad03-a2e699e98c59.json new file mode 100644 index 000000000..811becda5 --- /dev/null +++ b/docs/doc/ea007cde-8602-4b41-ad03-a2e699e98c59.json @@ -0,0 +1,80 @@ +{ + "summary": "The code introduces new layers, initializes Bottleneck with GCT, defines Convolutional Feature Fusion Block and Atrous Spatial Pyramid Pooling modules. The CollaborativeEnsemblerMS class is a neural network architecture with multiple input dimensions, transformer stages, convolutional layers, ReLU activation, and outputs foreground/background logits using ASPP modules.", + "details": [ + { + "comment": "This code defines a class for the IA_gate layer, which is a part of a computer vision model. It has an input and output dimension and includes a linear layer and a forward function. The forward function calculates the activation (a) by applying a tanh function to the linear layer's output and then unsqueezing it along the axis for multiplication. The result is used in the model's computation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nclass IA_gate(nn.Layer):\n def __init__(self, in_dim, out_dim):\n super(IA_gate, self).__init__()\n self.IA = nn.Linear(in_dim, out_dim)\n def forward(self, x, IA_head):\n a = self.IA(IA_head)\n a = 1. + paddle.tanh(a)\n a = paddle.unsqueeze(paddle.unsqueeze(a, axis=-1), axis=-1)" + }, + { + "comment": "This code defines a GCT layer, which is a type of normalization layer for neural networks. 
It initializes parameters alpha, gamma and beta with specific shapes and default values. The layer also takes in an input x, applies the mode 'l2' operation (pow) on it, and returns the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":32-64", + "content": " x = a * x\n return x\nclass GCT(nn.Layer):\n def __init__(self, num_channels, epsilon=1e-5, mode='l2', after_relu=False):\n super(GCT, self).__init__()\n x1 = paddle.zeros([1, num_channels, 1, 1])\n x2 = paddle.ones([1, num_channels, 1, 1])\n self.alpha = paddle.create_parameter(\n shape=x2.shape,\n dtype=x2.dtype,\n default_initializer=nn.initializer.Assign(x2))\n self.alpha.stop_gradient = False\n self.gamma = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.gamma.stop_gradient = False\n self.beta = paddle.create_parameter(\n shape=x1.shape,\n dtype=x1.dtype,\n default_initializer=nn.initializer.Assign(x1))\n self.beta.stop_gradient = False\n self.epsilon = epsilon\n self.mode = mode\n self.after_relu = after_relu\n def forward(self, x):\n if self.mode == 'l2':\n embedding = paddle.pow(" + }, + { + "comment": "The code initializes a Bottleneck layer in the PaddleVideo model, with GCT and convolutional layers for feature extraction. It also includes adjustable normalization and activation based on the mode parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":65-94", + "content": " paddle.sum(paddle.pow(x, 2), axis=[2, 3], keepdim=True) +\n self.epsilon, 0.5) * self.alpha\n norm = self.gamma / paddle.pow(\n (paddle.mean(paddle.pow(embedding, 2), axis=1, keepdim=True) +\n self.epsilon), 0.5)\n elif self.mode == 'l1':\n if not self.after_relu:\n _x = paddle.abs(x)\n else:\n _x = x\n embedding = paddle.sum(_x, axis=(2, 3), keepdim=True) * self.alpha\n norm = self.gamma / (paddle.mean(\n paddle.abs(embedding), axis=1, keepdim=True) + self.epsilon)\n else:\n print('Unknown mode!')\n exit()\n gate = 1. + paddle.tanh(embedding * norm + self.beta)\n return x * gate\nclass Bottleneck(nn.Layer):\n def __init__(self, inplanes, outplanes, stride=1, dilation=1):\n super(Bottleneck, self).__init__()\n expansion = 4\n planes = int(outplanes / expansion)\n self.GCT1 = GCT(inplanes)\n self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)" + }, + { + "comment": "This code defines a neural network layer that includes batch normalization and convolutional layers, as well as ReLU activation. It has the option for downsampling if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":95-118", + "content": " self.bn1 = nn.GroupNorm(num_groups=32, num_channels=planes)\n self.conv2 = nn.Conv2D(planes,\n planes,\n kernel_size=3,\n stride=stride,\n dilation=dilation,\n padding=dilation,\n bias_attr=False)\n self.bn2 = nn.GroupNorm(num_groups=32, num_channels=planes)\n self.conv3 = nn.Conv2D(planes,\n planes * expansion,\n kernel_size=1,\n bias_attr=False)\n self.bn3 = nn.GroupNorm(num_groups=32, num_channels=planes * expansion)\n self.relu = nn.ReLU()\n if stride != 1 or inplanes != planes * expansion:\n downsample = nn.Sequential(\n nn.Conv2D(inplanes,\n planes * expansion,\n kernel_size=1,\n stride=stride,\n bias_attr=False)," + }, + { + "comment": "Code initializes a module with 3 Conv2D layers and BatchNorm2D layers. 
It also includes a GroupNorm layer if num_groups and num_channels are specified, otherwise sets downsample to None. Initializes sublayers and applies Kaiming Normal initialization. Forward function performs convolutions, adds residual connection if applicable, and applies ReLU activation. _ASPPModule has GCT and AtrousConv2D layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":119-159", + "content": " nn.GroupNorm(num_groups=32, num_channels=planes * expansion),\n )\n else:\n downsample = None\n self.downsample = downsample\n self.stride = stride\n self.dilation = dilation\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n def forward(self, x):\n residual = x\n out = self.GCT1(x)\n out = self.conv1(out)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)\n if self.downsample is not None:\n residual = self.downsample(x)\n out += residual\n out = self.relu(out)\n return out\nclass _ASPPModule(nn.Layer):\n def __init__(self, inplanes, planes, kernel_size, padding, dilation):\n super(_ASPPModule, self).__init__()\n self.GCT = GCT(inplanes)\n self.atrous_conv = nn.Conv2D(inplanes," + }, + { + "comment": "The code defines a Convolutional Feature Fusion Block (CFFB) and an Atrous Spatial Pyramid Pooling (ASPP) module. The CFFB consists of group convolution, batch normalization, and ReLU activation layers. The ASPP module has four pathways with different dilation rates, each followed by a group convolution, batch normalization, and ReLU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":160-192", + "content": " planes,\n kernel_size=kernel_size,\n stride=1,\n padding=padding,\n dilation=dilation,\n bias_attr=False)\n self.bn = nn.GroupNorm(num_groups=int(planes / 4), num_channels=planes)\n self.relu = nn.ReLU()\n self._init_weight()\n def forward(self, x):\n x = self.GCT(x)\n x = self.atrous_conv(x)\n x = self.bn(x)\n return self.relu(x)\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)\nclass ASPP(nn.Layer):\n def __init__(self):\n super(ASPP, self).__init__()\n inplanes = 512\n dilations = [1, 6, 12, 18]" + }, + { + "comment": "This code initializes four ASPPModules and a global average pooling layer in the CFBI head model for feature extraction and pooling. 
The ASPPModules have different dilation rates based on the specified dilations list, while the global_avg_pool performs adaptive averaging and convolution to extract global features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":194-217", + "content": " self.aspp1 = _ASPPModule(inplanes,\n 128,\n 1,\n padding=0,\n dilation=dilations[0])\n self.aspp2 = _ASPPModule(inplanes,\n 128,\n 3,\n padding=dilations[1],\n dilation=dilations[1])\n self.aspp3 = _ASPPModule(inplanes,\n 128,\n 3,\n padding=dilations[2],\n dilation=dilations[2])\n self.aspp4 = _ASPPModule(inplanes,\n 128,\n 3,\n padding=dilations[3],\n dilation=dilations[3])\n self.global_avg_pool = nn.Sequential(\n nn.AdaptiveAvgPool2D((1, 1)),\n nn.Conv2D(inplanes, 128, 1, stride=1, bias_attr=False), nn.ReLU())" + }, + { + "comment": "The code initializes a class with multiple layers for feature extraction and processing, using Conv2D, GroupNorm, ReLU activation functions, and Global Average Pooling. The forward function combines features from different ASPP modules and passes them through GCT, convolution, batch normalization, and ReLU for final output. Initializes the weight of each layer with specific initializers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":219-250", + "content": " self.GCT = GCT(640)\n self.conv1 = nn.Conv2D(640, 256, 1, bias_attr=False)\n self.bn1 = nn.GroupNorm(num_groups=32, num_channels=256)\n self.relu = nn.ReLU()\n self._init_weight()\n def forward(self, x):\n x1 = self.aspp1(x)\n x2 = self.aspp2(x)\n x3 = self.aspp3(x)\n x4 = self.aspp4(x)\n x5 = self.global_avg_pool(x)\n x5 = F.interpolate(x5,\n size=x4.shape[2:],\n mode='bilinear',\n align_corners=True)\n x = paddle.concat([x1, x2, x3, x4, x5], axis=1)\n x = self.GCT(x)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n return x\n def _init_weight(self):\n for m in self.sublayers():\n if isinstance(m, nn.Conv2D):\n nn.initializer.KaimingNormal()\n elif isinstance(m, nn.GroupNorm):\n m.weight.data = nn.initializer.Constant(1)\n m.bias.data = nn.initializer.Constant(0)" + }, + { + "comment": "The code defines a CollaborativeEnsemblerMS class within the PaddleVideo framework. It has multiple input dimensions (4x, 8x, and 16x) for semantic embedding, local distance, and attention dimension. 
The class also includes an instance of ReLU activation function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":253-278", + "content": "@HEADS.register()\nclass CollaborativeEnsemblerMS(nn.Layer):\n def __init__(\n self,\n model_semantic_embedding_dim=256,\n model_multi_local_distance=[[4, 8, 12, 16, 20, 24],\n [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]],\n model_head_embedding_dim=256,\n model_refine_channels=64,\n model_low_level_inplanes=256,\n ):\n super(CollaborativeEnsemblerMS, self).__init__()\n in_dim_4x = model_semantic_embedding_dim * 3 + 3 + 2 * len(\n model_multi_local_distance[0])\n in_dim_8x = model_semantic_embedding_dim * 3 + 3 + 2 * len(\n model_multi_local_distance[1])\n in_dim_16x = model_semantic_embedding_dim * 3 + 3 + 2 * len(\n model_multi_local_distance[2])\n attention_dim = model_semantic_embedding_dim * 4\n embed_dim = model_head_embedding_dim\n refine_dim = model_refine_channels\n low_level_dim = model_low_level_inplanes\n IA_in_dim = attention_dim\n self.relu = nn.ReLU()" + }, + { + "comment": "This code initializes multiple layers for different stages of a transformer model. Each stage consists of several IA_gate and Bottleneck layers, with varying input and output dimensions. The stages progressively increase the embedding dimension, incorporating additional inputs along the way.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":280-305", + "content": " # stage 1\n self.S1_IA1 = IA_gate(IA_in_dim, in_dim_4x)\n self.S1_layer1 = Bottleneck(in_dim_4x, embed_dim)\n self.S1_IA2 = IA_gate(IA_in_dim, embed_dim)\n self.S1_layer2 = Bottleneck(embed_dim, embed_dim, 1, 2)\n # stage2\n self.S2_IA1 = IA_gate(IA_in_dim, embed_dim)\n self.S2_layer1 = Bottleneck(embed_dim, embed_dim * 2, 2)\n self.S2_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_8x)\n self.S2_layer2 = Bottleneck(embed_dim * 2 + in_dim_8x, embed_dim * 2, 1,\n 2)\n self.S2_IA3 = IA_gate(IA_in_dim, embed_dim * 2)\n self.S2_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4)\n # stage3\n self.S3_IA1 = IA_gate(IA_in_dim, embed_dim * 2)\n self.S3_layer1 = Bottleneck(embed_dim * 2, embed_dim * 2, 2)\n self.S3_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_16x)\n self.S3_layer2 = Bottleneck(embed_dim * 2 + in_dim_16x, embed_dim * 2,\n 1, 2)" + }, + { + "comment": "This code is defining various components of a model for feature extraction and fusion. It includes IA_gate, Bottleneck, GCT, ASPP, nn.Conv2D, GroupNorm, ReLU layers and their configurations. 
The model has separate modules for encoding and decoding stages to process low-level and high-level features respectively.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":307-331", + "content": " self.S3_IA3 = IA_gate(IA_in_dim, embed_dim * 2)\n self.S3_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4)\n self.ASPP_IA = IA_gate(IA_in_dim, embed_dim * 2)\n self.ASPP = ASPP()\n # Decoder\n self.GCT_sc = GCT(low_level_dim + embed_dim)\n self.conv_sc = nn.Conv2D(low_level_dim + embed_dim,\n refine_dim,\n 1,\n bias_attr=False)\n self.bn_sc = nn.GroupNorm(num_groups=int(refine_dim / 4),\n num_channels=refine_dim)\n self.relu = nn.ReLU()\n self.IA10 = IA_gate(IA_in_dim, embed_dim + refine_dim)\n self.conv1 = nn.Conv2D(embed_dim + refine_dim,\n int(embed_dim / 2),\n kernel_size=3,\n padding=1,\n bias_attr=False)\n self.bn1 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2))\n self.IA11 = IA_gate(IA_in_dim, int(embed_dim / 2))" + }, + { + "comment": "This code defines a neural network architecture for a computer vision task. It includes convolutional layers, batch normalization, and linear layers. The forward function applies these operations to input features at different scales (4x, 8x, 16x) and concatenates the results. The KaimingNormal initialization is used to set the weights of the convolution layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":332-359", + "content": " self.conv2 = nn.Conv2D(int(embed_dim / 2),\n int(embed_dim / 2),\n kernel_size=3,\n padding=1,\n bias_attr=False)\n self.bn2 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2))\n # Output\n self.IA_final_fg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1)\n self.IA_final_bg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1)\n self.conv_sc.weight.data = nn.initializer.KaimingNormal()\n self.conv1.weight.data = nn.initializer.KaimingNormal()\n self.conv2.weight.data = nn.initializer.KaimingNormal()\n def forward(self, all_x, all_IA_head=None, low_level_feat=None):\n x_4x, x_8x, x_16x = all_x\n IA_head = all_IA_head[0]\n # stage 1\n x = self.S1_IA1(x_4x, IA_head)\n x = self.S1_layer1(x)\n x = self.S1_IA2(x, IA_head)\n x = self.S1_layer2(x)\n low_level_feat = paddle.concat(\n [paddle.expand(low_level_feat, [x.shape[0], -1, -1, -1]), x]," + }, + { + "comment": "This code defines a neural network architecture for instance segmentation. It consists of multiple stages and an ASPP (Atrous Spatial Pyramid Pooling) module. The IA_logit function is used to output foreground and background logits. 
The final output, 'pred', is the instance segmentation prediction after applying background augmentation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":360-400", + "content": " axis=1)\n # stage 2\n x = self.S2_IA1(x, IA_head)\n x = self.S2_layer1(x)\n x = paddle.concat([x, x_8x], axis=1)\n x = self.S2_IA2(x, IA_head)\n x = self.S2_layer2(x)\n x = self.S2_IA3(x, IA_head)\n x = self.S2_layer3(x)\n # stage 3\n x = self.S3_IA1(x, IA_head)\n x = self.S3_layer1(x)\n x = paddle.concat([x, x_16x], axis=1)\n x = self.S3_IA2(x, IA_head)\n x = self.S3_layer2(x)\n x = self.S3_IA3(x, IA_head)\n x = self.S3_layer3(x)\n # ASPP + Decoder\n x = self.ASPP_IA(x, IA_head)\n x = self.ASPP(x)\n x = self.decoder(x, low_level_feat, IA_head)\n fg_logit = self.IA_logit(x, IA_head, self.IA_final_fg)\n bg_logit = self.IA_logit(x, IA_head, self.IA_final_bg)\n pred = self.augment_background_logit(fg_logit, bg_logit)\n return pred\n def IA_logit(self, x, IA_head, IA_final):\n n, c, h, w = x.shape\n x = paddle.reshape(x, [1, n * c, h, w])\n IA_output = IA_final(IA_head)" + }, + { + "comment": "The code defines two functions: `IA_head` and `decoder`. The `IA_head` function takes an input, applies a convolution with a weight and a bias, and reshapes the output. The `decoder` function combines an input image and a low-level feature, passes it through several convolutional layers with batch normalization and ReLU activation, then applies two IA heads.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":401-432", + "content": " IA_weight = IA_output[:, :c]\n IA_bias = IA_output[:, -1]\n IA_weight = paddle.reshape(IA_weight, [n, c, 1, 1])\n IA_bias = paddle.reshape(IA_bias, [-1])\n logit = paddle.reshape(\n F.conv2d(x, weight=IA_weight, bias=IA_bias, groups=n), [n, 1, h, w])\n return logit\n def decoder(self, x, low_level_feat, IA_head):\n x = F.interpolate(x,\n size=low_level_feat.shape[2:],\n mode='bicubic',\n align_corners=True)\n low_level_feat = self.GCT_sc(low_level_feat)\n low_level_feat = self.conv_sc(low_level_feat)\n low_level_feat = self.bn_sc(low_level_feat)\n low_level_feat = self.relu(low_level_feat)\n x = paddle.concat([x, low_level_feat], axis=1)\n x = self.IA10(x, IA_head)\n x = self.conv1(x)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.IA11(x, IA_head)\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu(x)\n return x" + }, + { + "comment": "This function takes two logits, fg_logit and bg_logit, and augments the absolute background logit by using relative background logits from all foreground objects. If there are more than one foreground object, it calculates the minimum of their relative background logits, pads with zeros to match the number of original background logits, concatenates, and adds this augmented background logit to the original fg_logit. 
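The background-augmentation logic summarised above is easy to trace on toy arrays. The NumPy sketch below mirrors the min / pad / concat / transpose sequence of `augment_background_logit`; the shapes (3 objects, 1x4x4 logit maps) are invented for illustration and are not taken from the repository.

```python
import numpy as np

obj_num = 3
fg_logit = np.random.randn(obj_num, 1, 4, 4).astype("float32")  # toy foreground logits
bg_logit = np.random.randn(obj_num, 1, 4, 4).astype("float32")  # toy background logits

pred = fg_logit
if obj_num > 1:
    rel_bg = bg_logit[1:obj_num]                    # relative background of each foreground object
    aug_bg = rel_bg.min(axis=0, keepdims=True)      # hardest relative background
    pad = np.zeros((obj_num - 1,) + aug_bg.shape[1:], dtype="float32")
    aug_bg = np.concatenate([aug_bg, pad], axis=0)  # only the absolute-background slot is augmented
    pred = pred + aug_bg
pred = pred.transpose(1, 0, 2, 3)                   # -> [1, obj_num, 4, 4]
```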
The output is then transposed before being returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/cfbi_head.py\":434-447", + "content": " def augment_background_logit(self, fg_logit, bg_logit):\n # We augment the logit of absolute background by using the relative background logit of all the\n # foreground objects.\n obj_num = fg_logit.shape[0]\n pred = fg_logit\n if obj_num > 1:\n bg_logit = bg_logit[1:obj_num, :, :, :]\n aug_bg_logit = paddle.min(bg_logit, axis=0, keepdim=True)\n pad = paddle.expand(paddle.zeros(aug_bg_logit.shape),\n [obj_num - 1, -1, -1, -1])\n aug_bg_logit = paddle.concat([aug_bg_logit, pad], axis=0)\n pred = pred + aug_bg_logit\n pred = paddle.transpose(pred, [1, 0, 2, 3])\n return pred" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ea45c3af-2eae-4f13-bebd-3fb7fb98a4f4.json b/docs/doc/ea45c3af-2eae-4f13-bebd-3fb7fb98a4f4.json new file mode 100644 index 000000000..244576794 --- /dev/null +++ b/docs/doc/ea45c3af-2eae-4f13-bebd-3fb7fb98a4f4.json @@ -0,0 +1,30 @@ +{ + "summary": "This code retrieves images, converts them to JSON format and stores object locations. It also processes videos, creating JSON annotations on frames with functions for video loading/saving, image resizing, mask processing, and PNG saving.", + "details": [ + { + "comment": "Code imports necessary libraries and defines two functions. `get_images` retrieves image files from a specified sequence, sorts them, and returns as a numpy array. `json2frame` reads a JSON file and converts its overlays into Image objects in a list format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/api.py\":0-38", + "content": "# Author: AP-Kai\n# Datetime: 2022/1/10\n# Copyright belongs to the author.\n# Please indicate the source for reprinting.\nimport json\nimport os\nfrom collections import OrderedDict\nimport cv2\nimport numpy as np\nfrom PIL import Image\nfrom EIVideo.paddlevideo.utils.manet_utils import overlay_davis\nfrom EIVideo import TEMP_JSON_SAVE_PATH, TEMP_JSON_FINAL_PATH\ndef get_images(sequence='bike-packing'):\n img_path = os.path.join('data', sequence.strip(), 'frame')\n img_files = os.listdir(img_path)\n img_files.sort()\n files = []\n for img in img_files:\n img_file = np.array(Image.open(os.path.join(img_path, img)))\n files.append(img_file)\n return np.array(files)\ndef json2frame(path):\n print(\"now turn masks.json to frames\", path)\n with open(path, 'r', encoding='utf-8') as f:\n res = f.read()\n a = json.loads(res)\n b = a.get('overlays')\n b_array = np.array(b)\n frame_list = []\n for i in range(0, len(b_array)):\n im = Image.fromarray(np.uint8(b_array[i]))" + }, + { + "comment": "The code converts a PNG image to JSON format. It opens the image using PIL, converts it to grayscale, and stores the unique object IDs found in the image. For each object ID, it finds its corresponding locations in the image and adds them as paths to the pframe (an OrderedDict). Finally, it appends the pframes to a list called pframes. 
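The `IA_logit` helper shown in the preceding entries implements a per-object dynamic classifier: each object's 1x1 convolution weight and bias are predicted from `IA_head` and applied in a single grouped convolution. A minimal, self-contained sketch with made-up sizes (n objects, c channels) looks like this:

```python
import paddle
import paddle.nn.functional as F

n, c, h, w = 2, 8, 4, 4                       # toy values: objects, channels, spatial size
x = paddle.randn([n, c, h, w])
IA_output = paddle.randn([n, c + 1])          # per-object conv weight (c) + bias (1)

IA_weight = paddle.reshape(IA_output[:, :c], [n, c, 1, 1])
IA_bias = paddle.reshape(IA_output[:, -1], [-1])

x_grouped = paddle.reshape(x, [1, n * c, h, w])
logit = F.conv2d(x_grouped, weight=IA_weight, bias=IA_bias, groups=n)
logit = paddle.reshape(logit, [n, 1, h, w])   # one logit map per object
```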
The function returns this list of pframes when complete.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/api.py\":39-66", + "content": " im = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)\n im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)\n # im = np.array(b_array[i]).astype(\"uint8\")\n # im = im.transpose((2, 0, 1))\n # im = cv2.merge(im)\n frame_list.append(im)\n return frame_list\ndef png2json(image_path, sliderframenum, save_json_path):\n image = Image.open(image_path) # \u7528PIL\u4e2d\u7684Image.open\u6253\u5f00\u56fe\u50cf\n image = image.convert('P')\n image_arr = np.array(image) # \u8f6c\u5316\u6210numpy\u6570\u7ec4\n image_arr = image_arr.astype(\"float32\")\n r1 = np.argwhere(image_arr == 1) # tuple\n pframes = []\n # i -> object id\n for i in range(1, len(np.unique(image_arr))):\n pframe = OrderedDict()\n pframe['path'] = []\n # Find object id in image_arr\n r1 = np.argwhere(image_arr == i) # tuple\n r1 = r1.astype(\"float32\")\n # Add path to pframe\n for j in range(0, len(r1)):\n r1[j][0] = r1[j][0] / 480.0\n r1[j][1] = r1[j][1] / 910.0\n # r1[j] = np.around(r1[j], decimals=16)" + }, + { + "comment": "This code is related to video processing, specifically for saving and loading videos. It creates a JSON file with scribble annotations on frames. The \"load_video\" function reads the video frames and converts them to RGB format if necessary. It also supports optional minimum side parameter for resizing frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/api.py\":67-100", + "content": " pframe['path'].append(r1[j].tolist())\n # Add object id, start_time, stop_time\n pframe['object_id'] = i\n pframe['start_time'] = sliderframenum\n pframe['stop_time'] = sliderframenum\n # Add pframe to pframes\n pframes.append(pframe)\n dic = OrderedDict()\n dic['scribbles'] = []\n for i in range(0, int(100)):\n if i == sliderframenum:\n # Add value to frame[]\n dic['scribbles'].append(pframes)\n else:\n dic['scribbles'].append([])\n json_str = json.dumps(dic)\n with open(save_json_path, 'w') as json_file:\n json_file.write(json_str)\ndef load_video(video_path, min_side=None):\n frame_list = []\n # ToDo To AP-kai: \u662f\u4e0d\u662f\u8f7b\u677e\u5e72\u6389\u4e86m.video_path\uff1f\n cap = cv2.VideoCapture(video_path)\n # ToDo To AP-kai: while (cap.isOpened()): -> \u4e0d\u5fc5\u591a\u5199\u4e2a\u62ec\u53f7\u54c8\n while cap.isOpened():\n _, frame = cap.read()\n if frame is None:\n break\n frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n if min_side:\n h, w = frame.shape[:2]" + }, + { + "comment": "Code chunk resizes images, appends them to a list, stacks the frames into an array and returns both. It also handles loading data from TEMP_JSON_SAVE_PATH and yields scribbles with a boolean flag for the first one. 
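The path-building step of `png2json` shown above boils down to collecting the pixel coordinates of each object id and normalising them by a fixed 480x910 resolution. A toy NumPy sketch, with a fabricated 4x4 mask standing in for the real annotation image:

```python
import numpy as np
from collections import OrderedDict

mask = np.zeros((4, 4), dtype="float32")
mask[1, 2] = 1                                    # pretend object id 1 covers a single pixel

pframes = []
for obj_id in range(1, len(np.unique(mask))):
    coords = np.argwhere(mask == obj_id).astype("float32")
    coords[:, 0] /= 480.0                         # row -> relative height, as in png2json
    coords[:, 1] /= 910.0                         # col -> relative width
    pframes.append(OrderedDict(path=coords.tolist(), object_id=obj_id,
                               start_time=0, stop_time=0))
```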
The last function processes masks by overlaying them onto images, saves them as PNGs in the specified save path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/api.py\":101-129", + "content": " new_w = (w * min_side // min(w, h))\n new_h = (h * min_side // min(w, h))\n frame = cv2.resize(frame, (new_w, new_h),\n interpolation=cv2.INTER_CUBIC)\n # .transpose([2, 0, 1])\n frame_list.append(frame)\n frames = np.stack(frame_list, axis=0)\n return frames, frame_list\ndef get_scribbles():\n # os.makedirs(TEMP_JSON_SAVE_PATH, exist_ok=True)\n with open(TEMP_JSON_SAVE_PATH) as f:\n print(\"load TEMP_JSON_SAVE_PATH success\")\n scribbles = json.load(f)\n first_scribble = True\n yield scribbles, first_scribble\ndef submit_masks(save_path, masks, images):\n overlays = []\n for img_name, (mask, image) in enumerate(zip(masks, images)):\n overlay = overlay_davis(image, mask)\n overlays.append(overlay.tolist())\n overlay = Image.fromarray(overlay)\n img_name = str(img_name)\n while len(img_name) < 5:\n img_name = '0' + img_name\n overlay.save(os.path.join(save_path, img_name + '.png'))" + }, + { + "comment": "This code is saving a dictionary of overlays to a JSON file. It was previously also saving a list of masks, but that functionality has been commented out. The dictionary contains the overlays and the resulting JSON will be written to the specified temporary path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/api.py\":130-133", + "content": " result = {'overlays': overlays}\n # result = {'masks': masks.tolist()}\n with open(TEMP_JSON_FINAL_PATH, 'w') as f:\n json.dump(result, f)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/eb9035c9-4dfc-4366-87b0-d019e7c33732.json b/docs/doc/eb9035c9-4dfc-4366-87b0-d019e7c33732.json new file mode 100644 index 000000000..7e5c566e7 --- /dev/null +++ b/docs/doc/eb9035c9-4dfc-4366-87b0-d019e7c33732.json @@ -0,0 +1,15 @@ +{ + "summary": "The code defines an abstract base class for all PaddleVideo models, requiring implementation of forward method and including a trainable parameter count in __str__ output. It also imports libraries, checks stop_gradient flag, and calculates parameter shapes.", + "details": [ + { + "comment": "This code defines an abstract base class for all models in the PaddleVideo application. It requires derived classes to implement the `forward` method and includes a `__str__` method that prints the model with the number of trainable parameters. The code also imports necessary libraries, checks for stop_gradient flag on parameters, and calculates product of shape for each parameter.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_model.py\":0-35", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport numpy as np\nimport paddle.nn as nn\nfrom abc import abstractmethod\nclass BaseModel(nn.Layer):\n \"\"\"\n Base class for all models\n \"\"\"\n @abstractmethod\n def forward(self, *inputs):\n \"\"\"\n Forward pass logic\n :return: Model output\n \"\"\"\n raise NotImplementedError\n def __str__(self):\n \"\"\"\n Model prints with number of trainable parameters\n \"\"\"\n model_parameters = filter(lambda p: p.stop_gradient==False, self.parameters())\n params = sum([np.prod(p.shape) for p in model_parameters])" + }, + { + "comment": "The code returns a string containing the superclass's __str__ method, followed by the number of trainable parameters in the current instance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/base/base_model.py\":36-36", + "content": " return super().__str__() + f\"\\nTrainable parameters: {params}\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ebac55f6-d77b-4017-8599-98910e1050b9.json b/docs/doc/ebac55f6-d77b-4017-8599-98910e1050b9.json new file mode 100644 index 000000000..fd1ec623d --- /dev/null +++ b/docs/doc/ebac55f6-d77b-4017-8599-98910e1050b9.json @@ -0,0 +1,25 @@ +{ + "summary": "A PaddlePaddle TSN head class for video quality assessment tasks is defined, implementing adaptive average pooling, linear transformation, dropout, and taking input number of classes and feature channels as arguments. The forward pass function applies these operations and produces classification scores, operating on tensors of dimensions N, num_seg, and num_class, with softmax activation.", + "details": [ + { + "comment": "TSNHead: PaddlePaddle Temporal Segment Network head class for video quality assessment tasks. Implements adaptive average pooling, linear transformation, dropout, and takes input number of classes and input feature channels as arguments. Registered in the HEADS registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
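The parameter count reported by `BaseModel.__str__` above can be reproduced on any Paddle layer; the sketch below uses a plain `nn.Linear` as a stand-in for a concrete `BaseModel` subclass.

```python
import numpy as np
import paddle.nn as nn

layer = nn.Linear(16, 4)                                    # stand-in model
trainable = [p for p in layer.parameters() if not p.stop_gradient]
n_params = sum(int(np.prod(p.shape)) for p in trainable)
print(f"Trainable parameters: {n_params}")                  # 16*4 weights + 4 biases = 68
```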
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nfrom paddle.nn import AdaptiveAvgPool2D, Linear, Dropout\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nimport paddle.nn.functional as F\n@HEADS.register()\nclass TSNHead(BaseHead):\n \"\"\"TSN Head.\n Args:\n num_classes (int): The number of classes to be classified.\n in_channels (int): The number of channles in input feature.\n loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss')." + }, + { + "comment": "This code defines a class for an image classification head with dropout and global average pooling. It initializes the class with specified parameters, such as number of classes, input channels, loss configuration, drop ratio, standard deviation for initialization, and data format. The class also includes methods for initializing weights in the fully connected (FC) layer using normal distribution.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py\":31-63", + "content": " drop_ratio(float): drop ratio. Default: 0.4.\n std(float): Std(Scale) value in normal initilizar. Default: 0.01.\n kwargs (dict, optional): Any keyword argument to initialize.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n drop_ratio=0.4,\n std=0.01,\n data_format=\"NCHW\",\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.drop_ratio = drop_ratio\n self.std = std\n #NOTE: global pool performance\n self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format)\n if self.drop_ratio != 0:\n self.dropout = Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = Linear(self.in_channels, self.num_classes)\n def init_weights(self):\n \"\"\"Initiate the FC layer parameters\"\"\"\n weight_init_(self.fc,\n 'Normal'," + }, + { + "comment": "This code defines a forward pass function for a neural network head. It applies average pooling, optionally applies dropout, and performs a series of reshaping and fully connected layer operations to produce classification scores. Dropout is applied if not None, and the softmax activation (NOTE: remove) is used in the original code. 
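The reshape/mean sequence described for `TSNHead.forward` is a segment-consensus average over `num_seg` snippets. A toy walk-through with invented sizes (the real `in_channels` depends on the backbone):

```python
import paddle

N, num_seg, in_channels, num_classes = 2, 8, 2048, 5
x = paddle.randn([N * num_seg, in_channels, 1, 1])     # output of AdaptiveAvgPool2D

x = paddle.reshape(x, [-1, num_seg, x.shape[1]])       # [N, num_seg, in_channels]
x = paddle.mean(x, axis=1)                             # [N, in_channels] after segment consensus
x = paddle.reshape(x, [-1, in_channels])
score = paddle.nn.Linear(in_channels, num_classes)(x)  # [N, num_classes]
```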
The code operates on tensors of various dimensions, with N representing the number of input samples, num_seg representing the number of segments or regions for each sample, and num_class representing the number of classes being classified.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py\":64-95", + "content": " 'fc_0.w_0',\n 'fc_0.b_0',\n mean=0.,\n std=self.std)\n def forward(self, x, num_seg):\n \"\"\"Define how the head is going to run.\n Args:\n x (paddle.Tensor): The input data.\n num_segs (int): Number of segments.\n Returns:\n score: (paddle.Tensor) The classification scores for input samples.\n \"\"\"\n #XXX: check dropout location!\n # [N * num_segs, in_channels, 7, 7]\n x = self.avgpool2d(x)\n # [N * num_segs, in_channels, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N * num_seg, in_channels, 1, 1]\n x = paddle.reshape(x, [-1, num_seg, x.shape[1]])\n # [N, num_seg, in_channels]\n x = paddle.mean(x, axis=1)\n # [N, 1, in_channels]\n x = paddle.reshape(x, shape=[-1, self.in_channels])\n # [N, in_channels]\n score = self.fc(x)\n # [N, num_class]\n #score = F.softmax(score) #NOTE remove" + }, + { + "comment": "This line returns the calculated score as output from the function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py\":96-96", + "content": " return score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ebad8e11-686c-45de-be4f-69becec66ed9.json b/docs/doc/ebad8e11-686c-45de-be4f-69becec66ed9.json new file mode 100644 index 000000000..e9cc7502d --- /dev/null +++ b/docs/doc/ebad8e11-686c-45de-be4f-69becec66ed9.json @@ -0,0 +1,15 @@ +{ + "summary": "This code introduces T2VLAD, a text video retrieval model by Baidu. It provides data preparation, training, and testing steps on MSR-VTT dataset, along with performance metrics and checkpoint information in PaddleVideo.", + "details": [ + { + "comment": "This code provides an introduction to the T2VLAD model, a text video retrieval model proposed by Baidu at CVPR 2021. It explains how to prepare data, train the model, and test it on the MSR-VTT dataset. 
The code also includes instructions for installing dependencies and running the necessary commands.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/README.md\":0-59", + "content": "[English](./README_en.md) | \u7b80\u4f53\u4e2d\u6587\n# T2VLAD: \u57fa\u4e8e\u5c40\u90e8\u5168\u5c40\u5bf9\u9f50\u7684\u6587\u672c\u89c6\u9891\u68c0\u7d22\n---\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u6d4b\u8bd5](#\u6a21\u578b\u6d4b\u8bd5)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n\u5728\u5f00\u59cb\u4f7f\u7528\u4e4b\u524d\uff0c\u60a8\u9700\u8981\u6309\u7167\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5\u989d\u5916\u7684\u4f9d\u8d56\u5305\uff1a\n```bash\npython -m pip install paddlenlp\n```\n\u540c\u65f6\u786e\u4fddpaddle\u7248\u672c\u4e3a2.2.2\u3002\n## \u6a21\u578b\u7b80\u4ecb\nT2VLAD\u662f\u767e\u5ea6\u5728CVPR2021\u63d0\u51fa\u7684\u6587\u672c\u89c6\u9891\u68c0\u7d22\u6a21\u578b\u3002\u6587\u672c\u89c6\u9891\u68c0\u7d22\u662f\u4e00\u9879\u5177\u6709\u6311\u6218\u7684\u4efb\u52a1\uff0c\u65e8\u5728\u57fa\u4e8e\u81ea\u7136\u8bed\u8a00\u5904\u7406\u63cf\u8ff0\u641c\u7d22\u76f8\u5173\u89c6\u9891\u5185\u5bb9\u3002\u8fd9\u4e2a\u95ee\u9898\u7684\u5173\u952e\u662f\u5728\u8054\u5408\u5d4c\u5165\u7a7a\u95f4\u4e2d\u6d4b\u91cf\u6587\u672c-\u89c6\u9891\u7684\u76f8\u4f3c\u6027\u3002T2VLAD\u8bbe\u8ba1\u4e86\u4e00\u79cd\u6709\u6548\u7684\u5168\u5c40-\u5c40\u90e8\u5bf9\u9f50\u65b9\u6cd5\uff0c\u5728\u4e09\u4e2a\u6807\u51c6\u7684\u6587\u672c\u89c6\u9891\u68c0\u7d22\u57fa\u51c6\u4e0a\u53d6\u5f97\u4e86\u4e00\u81f4\u7684\u6539\u8fdb\uff0c\u5e76\u4ee5\u660e\u663e\u7684\u4f18\u52bf\u8d85\u8d8a\u4e86\u6700\u5148\u8fdb\u7684\u6280\u672f\u3002\n
\n## \u6570\u636e\u51c6\u5907\nMSR-VTT\u6570\u636e\u4e0b\u8f7d\u53ca\u51c6\u5907\u8bf7\u53c2\u8003 [MSR-VTT\u6570\u636e\u51c6\u5907](../../docs/zh-CN/dataset/msrvtt.md)\n## \u6a21\u578b\u8bad\u7ec3\n### MSR-VTT\u6570\u636e\u96c6\u8bad\u7ec3\n\u4e0b\u8f7d\u6570\u636e\u5e76\u6dfb\u52a0\u5230 `data/MSRVTT` \u6587\u4ef6\u5939\u4e0b\u3002\n#### \u5f00\u59cb\u8bad\u7ec3\n- \u8bad\u7ec3\u542f\u52a8\u547d\u4ee4\u5982\u4e0b:\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 train.py --config ./configs/msrvtt_transformers.json\n```\nT2VLAD\u5728\u8bad\u7ec3\u65f6\u4f7f\u7528\u4e86Ranger\u4f18\u5316\u5668\uff0c\u8fd9\u91cc\u6211\u4eec\u6682\u65f6\u6ca1\u6709\u652f\u6301Ranger\u4f18\u5316\u5668\u5230\u7684\u5b9e\u73b0\uff0c\u76ee\u524d\u53ef\u4ee5\u4f7f\u7528AdamW\u4f18\u5316\u5668\u6765\u5b8c\u6210\u8bad\u7ec3\u3002\n## \u6a21\u578b\u6d4b\u8bd5\n- \u5bf9\u4e0b\u6e38\u4efb\u52a1\uff1a\u6587\u672c-\u89c6\u9891\u68c0\u7d22\uff0c\u5728MSR-VTT\u6570\u636e\u96c6\u4e0a\u8bc4\u4f30\u6027\u80fd\uff0c\u8bc4\u4f30\u811a\u672c\u542f\u52a8\u65b9\u5f0f\u5982\u4e0b\uff1a\n```bash\nexport CUDA_VISIBLE_DEVICES=0\npython3.7 test.py --config ./configs/msrvtt_transformers.json --resume ./T2VLAD_msrvtt.pdparams\n```\nMSR-VTT\u6570\u636e\u96c6\u6d4b\u8bd5\u7cbe\u5ea6:" + }, + { + "comment": "This code provides performance metrics and checkpoint information for a T2VLAD model in PaddleVideo. The Text-to-Video R@1, R@5, R@10, and Median R values are shown, along with the corresponding checkpoints' links. Video-to-Text R@1, R@5, R@10, and Median R values are also given. The reference paper for T2VLAD is provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/README.md\":60-74", + "content": "Text $\\rightarrow$ Video\n| R@1 | R@5 | R@10 | Median R | checkpoints |\n| :--: | :--: | :--: | :------: | :----------------------------------------------------------: |\n| 29.5 | 59.0 | 70.1 | 4 | [T2VLAD.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/T2VLAD_msrvtt.pdparams) |\nVideo $\\rightarrow$ Text\n| R@1 | R@5 | R@10 | Median R |\n| :--: | :--: | :--: | :------: |\n| 26.1 | 54.7 | 68.1 | 4 |\n## \u53c2\u8003\u8bba\u6587\n- [T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval\n](https://arxiv.org/pdf/2104.10054.pdf), Xiaohan Wang, Linchao Zhu, Yi Yang" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ebe347ec-d419-4e14-bda2-c3e934c9aa94.json b/docs/doc/ebe347ec-d419-4e14-bda2-c3e934c9aa94.json new file mode 100644 index 000000000..afaab8fa0 --- /dev/null +++ b/docs/doc/ebe347ec-d419-4e14-bda2-c3e934c9aa94.json @@ -0,0 +1,15 @@ +{ + "summary": "The code installs TensorRT and sets up Go environment for PaddleVideo C++ serving, checking CUDA version, downloading SSL libraries, and installing necessary packages.", + "details": [ + { + "comment": "This script installs TensorRT based on the detected CUDA version, and installs necessary libraries for PaddleVideo. It checks the CUDA version, downloads the corresponding TensorRT package, extracts it to /usr/local, and copies relevant include and lib files to their respective directories. 
If no CUDA version is found, it displays a message stating no Cuda Found and no need to install TensorRT.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/paddle_env_install.sh\":0-21", + "content": "unset GREP_OPTIONS\nfunction install_trt(){\n CUDA_VERSION=$(nvcc --version | egrep -o \"V[0-9]+.[0-9]+\" | cut -c2-)\n if [ $CUDA_VERSION == \"10.2\" ]; then\n wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT6-cuda10.2-cudnn7.tar.gz --no-check-certificate\n tar -zxf TensorRT6-cuda10.2-cudnn7.tar.gz -C /usr/local\n cp -rf /usr/local/TensorRT-6.0.1.8/include/* /usr/include/ && cp -rf /usr/local/TensorRT-6.0.1.8/lib/* /usr/lib/\n rm -rf TensorRT6-cuda10.2-cudnn7.tar.gz\n elif [ $CUDA_VERSION == \"11.2\" ]; then\n wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz --no-check-certificate\n tar -zxf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz -C /usr/local\n cp -rf /usr/local/TensorRT-8.0.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.0.3.4/lib/* /usr/lib/\n rm -rf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz\n else\n echo \"No Cuda Found, no need to install TensorRT\"\n fi\n}\nfunction env_install()\n{\n apt install -y libcurl4-openssl-dev libbz2-dev" + }, + { + "comment": "This code is installing necessary packages and setting up Go environment for PaddleVideo C++ serving. It downloads SSL libraries, installs Go 1.15.12, sets GOROOT and GOPATH variables, and installs the trt package.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_serving/paddle_env_install.sh\":22-34", + "content": " wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && tar xf centos_ssl.tar && rm -rf centos_ssl.tar && mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so\n rm -rf /usr/local/go && wget -qO- https://paddle-ci.gz.bcebos.com/go1.15.12.linux-amd64.tar.gz | \\\n tar -xz -C /usr/local && \\\n mkdir /root/go && \\\n mkdir /root/go/bin && \\\n mkdir /root/go/src && \\\n echo \"GOROOT=/usr/local/go\" >> /root/.bashrc && \\\n echo \"GOPATH=/root/go\" >> /root/.bashrc && \\\n echo \"PATH=/usr/local/go/bin:/root/go/bin:$PATH\" >> /root/.bashrc\n install_trt\n}\nenv_install" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ec1b8f6c-47a8-4ec9-b8df-9b4258268a4e.json b/docs/doc/ec1b8f6c-47a8-4ec9-b8df-9b4258268a4e.json new file mode 100644 index 000000000..65287e0dd --- /dev/null +++ b/docs/doc/ec1b8f6c-47a8-4ec9-b8df-9b4258268a4e.json @@ -0,0 +1,30 @@ +{ + "summary": "The code defines a PaddlePaddle GCN class with convolutional blocks for temporal sequences, utilizing layers such as batch normalization and residual connections. It also presents a custom AGCN backbone model for graph convolution tasks using adaptive graph convolutions.", + "details": [ + { + "comment": "The code is defining a GCN (Graph Convolutional Network) class within the PaddlePaddle framework. It takes in channel dimensions, output channel dimensions, vertex numbers and stride as parameters for its constructor. 
The class has one convolution layer with kernel size of 1 and stride of 1.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn.py\":0-26", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ..registry import BACKBONES\nclass GCN(nn.Layer):\n def __init__(self, in_channels, out_channels, vertex_nums=25, stride=1):\n super(GCN, self).__init__()\n self.conv1 = nn.Conv2D(in_channels=in_channels,\n out_channels=3 * out_channels,\n kernel_size=1,\n stride=1)" + }, + { + "comment": "The code defines a convolutional block for processing temporal sequences with 3D spatial-temporal convolutions. The block applies multiple convolution layers, batch normalization, and transposes the dimensions to perform feature extraction from the input sequence. It is parameterized by the number of channels, output channels, vertex numbers, temporal size, and a flag for residual connections.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn.py\":27-56", + "content": " self.conv2 = nn.Conv2D(in_channels=vertex_nums * 3,\n out_channels=vertex_nums,\n kernel_size=1)\n def forward(self, x):\n # x --- N,C,T,V\n x = self.conv1(x) # N,3C,T,V\n N, C, T, V = x.shape\n x = paddle.reshape(x, [N, C // 3, 3, T, V]) # N,C,3,T,V\n x = paddle.transpose(x, perm=[0, 1, 2, 4, 3]) # N,C,3,V,T\n x = paddle.reshape(x, [N, C // 3, 3 * V, T]) # N,C,3V,T\n x = paddle.transpose(x, perm=[0, 2, 1, 3]) # N,3V,C,T\n x = self.conv2(x) # N,V,C,T\n x = paddle.transpose(x, perm=[0, 2, 3, 1]) # N,C,T,V\n return x\nclass Block(paddle.nn.Layer):\n def __init__(self,\n in_channels,\n out_channels,\n vertex_nums=25,\n temporal_size=9,\n stride=1,\n residual=True):\n super(Block, self).__init__()\n self.residual = residual\n self.out_channels = out_channels\n self.bn_res = nn.BatchNorm2D(out_channels)" + }, + { + "comment": "This code initializes a convolutional residual block with a Graph Convolutional Network (GCN) and Temporal Convolutional Network (TCN). The conv_res is a 1x1 convolution, gcn is a GCN layer, and tcn is a TCN layer. In the forward pass, if residual is True, the input goes through the conv_res layer before being passed to the gcn layer, then the tcn layer. 
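The reshape/transpose chain in `GCN.forward` above only shuffles axes; tracing it with toy sizes makes the "3V" trick visible. Here C is the channel count after `conv1`, i.e. 3 x out_channels, and all numbers are invented:

```python
import paddle

N, C, T, V = 2, 12, 30, 25
x = paddle.randn([N, C, T, V])

x = paddle.reshape(x, [N, C // 3, 3, T, V])      # N, C', 3, T, V
x = paddle.transpose(x, perm=[0, 1, 2, 4, 3])    # N, C', 3, V, T
x = paddle.reshape(x, [N, C // 3, 3 * V, T])     # N, C', 3V, T
x = paddle.transpose(x, perm=[0, 2, 1, 3])       # N, 3V, C', T
print(x.shape)                                   # [2, 75, 4, 30]
```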
The output is either the sum of the input and the residual (if residual is True) or just the output of the GCN layer, which is then passed through a ReLU activation function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn.py\":57-83", + "content": " self.conv_res = nn.Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=1,\n stride=(stride, 1))\n self.gcn = GCN(in_channels=in_channels,\n out_channels=out_channels,\n vertex_nums=vertex_nums)\n self.tcn = nn.Sequential(\n nn.BatchNorm2D(out_channels),\n nn.ReLU(),\n nn.Conv2D(in_channels=out_channels,\n out_channels=out_channels,\n kernel_size=(temporal_size, 1),\n padding=((temporal_size - 1) // 2, 0),\n stride=(stride, 1)),\n nn.BatchNorm2D(out_channels),\n )\n def forward(self, x):\n if self.residual:\n y = self.conv_res(x)\n y = self.bn_res(y)\n x = self.gcn(x)\n x = self.tcn(x)\n out = x + y if self.residual else x\n out = F.relu(out)\n return out" + }, + { + "comment": "The code defines a class AGCN (Adaptive Graph Convolutional Network) as a subclass of nn.Layer, which is an improved version of ST-GCN for graph convolution tasks using adaptive graph convolutions. The model architecture consists of several Block layers with varying in_channels and out_channels, and downsampling is performed with stride=2.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn.py\":86-109", + "content": "@BACKBONES.register()\nclass AGCN(nn.Layer):\n \"\"\"\n AGCN model improves the performance of ST-GCN using\n Adaptive Graph Convolutional Networks.\n Args:\n in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2.\n \"\"\"\n def __init__(self, in_channels=2, **kwargs):\n super(AGCN, self).__init__()\n self.data_bn = nn.BatchNorm1D(25 * 2)\n self.agcn = nn.Sequential(\n Block(in_channels=in_channels,\n out_channels=64,\n residual=False,\n **kwargs), Block(in_channels=64, out_channels=64, **kwargs),\n Block(in_channels=64, out_channels=64, **kwargs),\n Block(in_channels=64, out_channels=64, **kwargs),\n Block(in_channels=64, out_channels=128, stride=2, **kwargs),\n Block(in_channels=128, out_channels=128, **kwargs),\n Block(in_channels=128, out_channels=128, **kwargs),\n Block(in_channels=128, out_channels=256, stride=2, **kwargs),\n Block(in_channels=256, out_channels=256, **kwargs)," + }, + { + "comment": "This code defines a custom backbone for AGCN model with a block of 256 in_channels and out_channels, followed by an adaptive average pooling layer. 
The forward function performs data normalization, transposes the shape, reshapes it, applies the AGCN layer, pools it to size (1,1), and finally reshapes and averages along one axis before returning the result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/agcn.py\":110-127", + "content": " Block(in_channels=256, out_channels=256, **kwargs))\n self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1))\n def forward(self, x):\n # data normalization\n N, C, T, V, M = x.shape\n x = x.transpose((0, 4, 1, 2, 3)) # N, M, C, T, V\n x = x.reshape((N * M, C, T, V))\n x = self.agcn(x)\n x = self.pool(x) # NM,C,T,V --> NM,C,1,1\n C = x.shape[1]\n x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/eda0853e-c8c2-4ba8-a88b-b068e5097a4f.json b/docs/doc/eda0853e-c8c2-4ba8-a88b-b068e5097a4f.json new file mode 100644 index 000000000..d02600107 --- /dev/null +++ b/docs/doc/eda0853e-c8c2-4ba8-a88b-b068e5097a4f.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines ReaderNotFoundError and ReaderZoo classes for video input data readers, offering a singleton reader_zoo and functions to register and get specific readers. The get_reader function returns the reader instance based on name, mode, configuration, and material, while raising ReaderNotFoundError if not found.", + "details": [ + { + "comment": "This code defines a class \"ReaderNotFoundError\" for handling reader not found exceptions with the possibility to specify the unavailable reader name and available readers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/reader_utils.py\":0-33", + "content": "\"\"\"\nreader_util\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nclass ReaderNotFoundError(Exception):\n \"\"\"\n \"Error: reader not found\"\n \"\"\"\n def __init__(self, reader_name, avail_readers):\n super(ReaderNotFoundError, self).__init__()\n self.reader_name = reader_name\n self.avail_readers = avail_readers\n def __str__(self):\n msg = \"Reader {} Not Found.\\nAvailiable readers:\\n\".format(\n self.reader_name)" + }, + { + "comment": "This code defines classes for video input data readers and a reader zoo. The DataReader class initializes with a model name, mode, and configuration. It has methods to create readers (not implemented) and get config from sections. 
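The data layout handled by `AGCN.forward` above folds the person dimension M into the batch before the blocks and averages it back out after pooling. A self-contained sketch with invented sizes, where the 256-channel pooled feature is only a stand-in for the real block output:

```python
import paddle

N, C, T, V, M = 2, 2, 50, 25, 2                 # clips, coords, frames, joints, persons (toy values)
x = paddle.randn([N, C, T, V, M])

x = x.transpose((0, 4, 1, 2, 3)).reshape((N * M, C, T, V))     # fold persons into the batch
feat = paddle.randn([N * M, 256, 1, 1])                        # stand-in for blocks + AdaptiveAvgPool2D
feat = paddle.reshape(feat, (N, M, 256, 1, 1)).mean(axis=1)    # average persons -> [N, 256, 1, 1]
```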
The ReaderZoo class manages registered readers in a zoo, allowing easy access and usage of different reader types for video input data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/reader_utils.py\":34-82", + "content": " for reader in self.avail_readers:\n msg += \" {}\\n\".format(reader)\n return msg\nclass DataReader(object):\n \"\"\"\n data reader for video input\n \"\"\"\n def __init__(self, model_name, mode, cfg):\n self.name = model_name\n self.mode = mode\n self.cfg = cfg\n def create_reader(self):\n \"\"\"\n Not implemented\n \"\"\"\n pass\n def get_config_from_sec(self, sec, item, default=None):\n \"\"\"\n get_config_from_sec\n \"\"\"\n if sec.upper() not in self.cfg:\n return default\n return self.cfg[sec.upper()].get(item, default)\nclass ReaderZoo(object):\n \"\"\"\n ReaderZoo\n \"\"\"\n def __init__(self):\n \"\"\"\n __init__\n \"\"\"\n self.reader_zoo = {}\n def regist(self, name, reader):\n \"\"\"\n regist\n \"\"\"\n assert reader.__base__ == DataReader, \"Unknow model type {}\".format(\n type(reader))\n self.reader_zoo[name] = reader\n def get(self, name, mode, cfg, material=None):" + }, + { + "comment": "This code defines a singleton reader_zoo and provides functions for registering readers and getting a specific reader. The get_reader function returns the created reader instance based on the provided name, mode, configuration (cfg), and material (if any). If the reader is not found, it raises ReaderNotFoundError with available reader names as information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/BasketballAction/predict/action_detect/reader/reader_utils.py\":83-108", + "content": " \"\"\"\n get\n \"\"\"\n for k, v in self.reader_zoo.items():\n if k == name:\n return v(name, mode, cfg, material)\n raise ReaderNotFoundError(name, self.reader_zoo.keys())\n# singleton reader_zoo\nreader_zoo = ReaderZoo()\ndef regist_reader(name, reader):\n \"\"\"\n regist_reader\n \"\"\"\n reader_zoo.regist(name, reader)\ndef get_reader(name, mode, cfg, material=None):\n \"\"\"\n get_reader\n \"\"\"\n reader_model = reader_zoo.get(name, mode, cfg, material)\n return reader_model.create_reader()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ee479cee-44cb-4434-bee5-a847ff5495a3.json b/docs/doc/ee479cee-44cb-4434-bee5-a847ff5495a3.json new file mode 100644 index 000000000..a06e1f788 --- /dev/null +++ b/docs/doc/ee479cee-44cb-4434-bee5-a847ff5495a3.json @@ -0,0 +1,20 @@ +{ + "summary": "The AGCN2sHead class is a head for the AGCN2s model in PaddleVideo, with input arguments defining channels, classes, people, and dropout ratio. It registers under HEADS registry, inherits from BaseHead class, initializes base class, sets instance variables, creates a linear layer, and reshapes input for forward pass. The code takes the input tensor x, averages along axes, passes through a fully connected layer (self.fc) to produce output.", + "details": [ + { + "comment": "The code defines the AGCN2sHead class, a head for the AGCN2s model in PaddleVideo. It has input feature channels, number of classes, number of people, and dropout ratio as arguments. This head is registered under HEADS registry and inherits from BaseHead class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/agcn2s_head.py\":0-31", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom .base import BaseHead\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\n@HEADS.register()\nclass AGCN2sHead(BaseHead):\n \"\"\"\n Head for AGCN2s model.\n Args:\n in_channels: int, input feature channels. Default: 64.\n num_classes: int, output the number of classes.\n M: int, number of people.\n drop_out: float, dropout ratio of layer. Default: 0." + }, + { + "comment": "Class constructor takes in_channels, num_classes, and M as parameters, initializes base class, sets instance variables, creates a linear layer with specified weights using paddle's Normal initializer, and reshapes input for forward pass.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/agcn2s_head.py\":32-55", + "content": " \"\"\"\n def __init__(self, in_channels=64, num_classes=10, M=2, **kwargs):\n super().__init__(num_classes, in_channels, **kwargs)\n self.in_channels = in_channels\n self.M = M\n weight_attr = paddle.ParamAttr(\n name=\"linear_weight\",\n initializer=paddle.nn.initializer.Normal(mean=0.0,\n std=math.sqrt(\n 2. / num_classes)))\n self.fc = nn.Linear(self.in_channels * 4,\n self.num_classes,\n weight_attr=weight_attr)\n def forward(self, x):\n \"\"\"Define how the head is going to run.\n \"\"\"\n assert x.shape[\n 0] % self.M == 0, f'The first dimension of the output must be an integer multiple of the number of people M, but recieved shape[0]={x.shape[0]}, M={self.M}'\n # N*M,C,T,V\n N = x.shape[0] // self.M\n c_new = x.shape[1]\n x = x.reshape([N, self.M, c_new, -1])" + }, + { + "comment": "This code takes the input tensor x, averages it along the third and first axes respectively, then passes it through a fully connected layer (self.fc) to produce an output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/agcn2s_head.py\":56-58", + "content": " x = x.mean(3).mean(1)\n return self.fc(x)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ee4e7b22-14bb-46d9-ac33-7bc46cdb0c38.json b/docs/doc/ee4e7b22-14bb-46d9-ac33-7bc46cdb0c38.json new file mode 100644 index 000000000..ebd57ecb4 --- /dev/null +++ b/docs/doc/ee4e7b22-14bb-46d9-ac33-7bc46cdb0c38.json @@ -0,0 +1,40 @@ +{ + "summary": "The PaddleVideo class in PaddlePipelines uses PIL and skimage for image decoding operations. It accepts parameters such as scales, side map, and backend. This class can be used with datasets like KITTI and KITTI ODOM, supporting the retrieval of image paths and resizing of depth images. The code organizes results into a dictionary structure, processes image data based on 'train' or 'val', retrieves color images, adjusts intrinsics for depth estimation, stores results in the 'imgs' dictionary, and adds processed 'imgs' to 'results'.", + "details": [ + { + "comment": "This code is a Python class for decoding images, registered with the PADDLEPIPELINES module. 
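The `ReaderZoo` / `regist_reader` / `get_reader` pattern documented just above is a plain string-keyed registry. A stripped-down, self-contained toy version (names shortened; these are not the repository classes):

```python
class ToyReader:
    def __init__(self, name, mode, cfg):
        self.name, self.mode, self.cfg = name, mode, cfg

class ToyZoo:
    def __init__(self):
        self.readers = {}

    def regist(self, name, reader_cls):
        self.readers[name] = reader_cls

    def get(self, name, *args):
        if name not in self.readers:
            raise KeyError(f"Reader {name} not found, available: {list(self.readers)}")
        return self.readers[name](name, *args)

zoo = ToyZoo()
zoo.regist("ToyReader", ToyReader)
reader = zoo.get("ToyReader", "infer", {})      # analogous to get_reader(name, mode, cfg)
```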
It uses the PIL and skimage libraries, and is part of the PaddleVideo package in PaddlePaddle. The class takes in a dataset and frame_idxs as parameters for image decoding operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport numpy as np\nimport PIL.Image as pil\ntry:\n import skimage.transform\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS.\"\n )\nfrom PIL import Image\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass ImageDecoder(object):\n \"\"\"Decode Image\n \"\"\"\n def __init__(self,\n dataset,\n frame_idxs," + }, + { + "comment": "This code defines a class for image decoding pipelines, accepting parameters such as number of scales, side map, full resolution shape, image extension, and backend. It also includes methods for loading images using the PIL library and retrieving image paths based on the dataset. The class is intended to be used for decoding images from specific datasets like KITTI and KITTI ODOM.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py\":37-65", + "content": " num_scales,\n side_map,\n full_res_shape,\n img_ext,\n backend='cv2'):\n self.backend = backend\n self.dataset = dataset\n self.frame_idxs = frame_idxs\n self.num_scales = num_scales\n self.side_map = side_map\n self.full_res_shape = full_res_shape\n self.img_ext = img_ext\n def _pil_loader(self, path):\n with open(path, 'rb') as f:\n with Image.open(f) as img:\n return img.convert('RGB')\n def get_color(self, folder, frame_index, side):\n color = self._pil_loader(\n self.get_image_path(self.dataset, folder, frame_index, side))\n return color\n def get_image_path(self, dataset, folder, frame_index, side):\n if dataset == \"kitti\":\n f_str = \"{:010d}{}\".format(frame_index, self.img_ext)\n image_path = os.path.join(self.data_path, folder, f_str)\n elif dataset == \"kitti_odom\":\n f_str = \"{:06d}{}\".format(frame_index, self.img_ext)" + }, + { + "comment": "This code defines a class with two methods: `get_image_path` and `get_depth`. The first method returns the path of an image based on the dataset, folder, frame index, and side. If the dataset is \"kitti_depth\", it constructs the path using frame index and extension. The second method retrieves depth data for a given dataset, folder, frame index, and side. 
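The dataset-specific file naming in `get_image_path` comes down to zero-padded frame indices; a quick look with illustrative values:

```python
frame_index, img_ext = 42, ".png"
kitti_name = "{:010d}{}".format(frame_index, img_ext)      # '0000000042.png'  (kitti / kitti_depth)
kitti_odom_name = "{:06d}{}".format(frame_index, img_ext)  # '000042.png'      (kitti_odom)
```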
It uses the \"kitii_depth\" dataset, constructs the path to the depth file, opens the image, resizes it, and converts it into a float32 array divided by 256.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py\":66-88", + "content": " image_path = os.path.join(self.data_path,\n \"sequences/{:02d}\".format(int(folder)),\n \"image_{}\".format(self.side_map[side]),\n f_str)\n elif dataset == \"kitti_depth\":\n f_str = \"{:010d}{}\".format(frame_index, self.img_ext)\n image_path = os.path.join(\n self.data_path, folder,\n \"image_0{}/data\".format(self.side_map[side]), f_str)\n return image_path\n def get_depth(self, dataset, folder, frame_index, side):\n if dataset == \"kitii_depth\":\n f_str = \"{:010d}.png\".format(frame_index)\n depth_path = os.path.join(\n self.data_path, folder,\n \"proj_depth/groundtruth/image_0{}\".format(self.side_map[side]),\n f_str)\n depth_gt = pil.open(depth_path)\n depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST)\n depth_gt = np.array(depth_gt).astype(np.float32) / 256" + }, + { + "comment": "This function reads the depth image from file and resizes it to the desired shape, ensuring that it is a 16-bit depth map. It also checks if the maximum value exceeds 255, asserting that this is not an 8-bit image. The final output is a resized depth_gt with dimensions self.full_res_shape[::-1]. The function returns this resized depth_gt after performing any necessary operations for mp4 decode operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py\":90-115", + "content": " else:\n f_str = \"{:010d}{}\".format(frame_index, self.img_ext)\n depth_path = os.path.join(self.data_path, folder + '_gt', f_str)\n img_file = Image.open(depth_path)\n depth_png = np.array(img_file, dtype=int)\n img_file.close()\n # make sure we have a proper 16bit depth map here.. not 8bit!\n assert np.max(depth_png) > 255, \\\n \"np.max(depth_png)={}, path={}\".format(np.max(depth_png), depth_path)\n depth_gt = depth_png.astype(np.float) / 256.\n depth_gt = depth_gt[160:960 - 160, :]\n depth_gt = skimage.transform.resize(depth_gt,\n self.full_res_shape[::-1],\n order=0,\n preserve_range=True,\n mode='constant')\n return depth_gt\n def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations." + }, + { + "comment": "This code handles the decoding of images and organizes the results into a dictionary structure. It checks if the mode is set to 'infer', where it opens an image in RGB format, stores it in the results dictionary under 'imgs' key, and returns the results. 
If the mode is not set or is 'train', it sets up necessary variables for organizing data based on day or night folders and whether the folder is real or fake.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py\":116-148", + "content": " return:\n List where each item is a numpy array after decoder.\n \"\"\"\n if results.get('mode', None) == 'infer':\n imgs = {}\n imgs[(\"color\", 0,\n -1)] = Image.open(results[\"filename\"]).convert(\"RGB\")\n results['imgs'] = imgs\n return results\n self.data_path = results['data_path']\n results['backend'] = self.backend\n imgs = {}\n results['frame_idxs'] = self.frame_idxs\n results['num_scales'] = self.num_scales\n file_name = results['filename']\n folder = results['folder']\n frame_index = results['frame_index']\n line = file_name.split('/')\n istrain = folder.split('_')[1]\n if 'mode' not in results:\n results['mode'] = istrain\n results['day_or_night'] = folder.split('_')[0]\n if istrain == \"train\":\n if folder[0] == 'd':\n folder2 = folder + '_fake_night'\n flag = 0\n else:\n folder2 = folder + '_fake_day'" + }, + { + "comment": "This code is setting up image files for decoding from a given folder and folder2 based on the frame indexes. It also considers whether the images are for the left or right side, identified by 'r' and 'l'. The flag variable is used to check if there's a change in side. If the folder name does not contain 'train', it executes something else (not shown in this code snippet).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py\":149-178", + "content": " tmp = folder\n folder = folder2\n folder2 = tmp\n flag = 1\n if len(line) == 3:\n side = line[2]\n else:\n side = None\n results['side'] = side\n for i in self.frame_idxs:\n if i == \"s\":\n other_side = {\"r\": \"l\", \"l\": \"r\"}[side]\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index, other_side)\n imgs[(\"color_n\", i,\n -1)] = self.get_color(folder2, frame_index,\n other_side)\n else:\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index + i, side)\n imgs[(\"color_n\", i,\n -1)] = self.get_color(folder2, frame_index + i, side)\n istrain = folder.split('_')[1]\n if istrain != 'train':\n if flag:" + }, + { + "comment": "The code checks the 'train' or 'val' flag and processes image data accordingly. It retrieves color images based on 'frame_idxs', adjusts intrinsics for depth estimation, and stores results in 'imgs' dictionary. 
The processed 'imgs' is then added to 'results' before returning it.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_image.py\":179-205", + "content": " depth_gt = self.get_depth(folder2, frame_index, side)\n else:\n depth_gt = self.get_depth(folder, frame_index, side)\n imgs[\"depth_gt\"] = np.expand_dims(depth_gt, 0)\n elif istrain == 'val':\n if len(line) == 3:\n side = line[2]\n else:\n side = None\n for i in self.frame_idxs:\n if i == \"s\":\n other_side = {\"r\": \"l\", \"l\": \"r\"}[side]\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index, other_side)\n else:\n imgs[(\"color\", i,\n -1)] = self.get_color(folder, frame_index + i, side)\n # adjusting intrinsics to match each scale in the pyramid\n depth_gt = self.get_depth(self.dataset, folder, frame_index, side)\n imgs[\"depth_gt\"] = np.expand_dims(depth_gt, 0)\n results['imgs'] = imgs\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ee9b6955-c8bc-401b-9325-c73bd62ce982.json b/docs/doc/ee9b6955-c8bc-401b-9325-c73bd62ce982.json new file mode 100644 index 000000000..170eff514 --- /dev/null +++ b/docs/doc/ee9b6955-c8bc-401b-9325-c73bd62ce982.json @@ -0,0 +1,40 @@ +{ + "summary": "The code introduces a Non-Maximum Suppression function for bounding boxes and defines the AVARoIHead class, an object detection layer performing ROI alignment with bbox loss calculation, image assignment & sampling, and result returning. The simple_test function tests detection without augmentation.", + "details": [ + { + "comment": "This code defines the function bbox2result, which takes in bounding box coordinates (bboxes), labels, number of classes (num_classes), image shape (img_shape) and a threshold value (thr). The function returns a list of numpy arrays representing the detection results. If there are no detections (i.e., bboxes is empty), it returns an empty list of zeros for each class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py\":0-28", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nfrom .. import builder\nfrom ..registry import HEADS\ndef bbox2result(bboxes, labels, num_classes, img_shape, thr=0.01):\n \"\"\"Convert detection results to a list of numpy arrays. \"\"\"\n if len(bboxes) == 0:\n return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32))\n else:\n bboxes = bboxes[0]\n labels = labels\n img_shape_np = img_shape" + }, + { + "comment": "This code performs Non-Maximum Suppression (NMS) on bounding boxes and scores to filter out overlapping regions. 
It iterates through each class, selects bounding boxes and their corresponding scores that are above a certain threshold for each class, and appends them to the result list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py\":29-58", + "content": " img_h, img_w = img_shape_np[0][0], img_shape_np[0][1]\n img_w = paddle.cast(img_w, dtype='int32')\n img_h = paddle.cast(img_h, dtype='int32')\n bboxes[:, 0::2] /= img_w\n bboxes[:, 1::2] /= img_h\n # We only handle multilabel now\n assert labels.shape[-1] > 1\n scores = labels # rename\n thr = (thr, ) * num_classes if isinstance(thr, float) else thr\n assert scores.shape[1] == num_classes\n assert len(thr) == num_classes\n result = []\n for i in range(num_classes - 1):\n #step1. \u5bf9\u8be5\u7c7b, \u6bcf\u4e2abbox\u7684\u5f97\u5206\u662f\u5426\u5927\u4e8e\u9608\u503c\n where = scores[:, i + 1] > thr[i + 1]\n where = paddle.nonzero(where) # index\n bboxes_select = paddle.index_select(x=bboxes, index=where)\n bboxes_select = bboxes_select[:, :4]\n scores_select = paddle.index_select(x=scores, index=where)\n scores_select = scores_select[:, i + 1:i + 2]\n result.append(\n #\u5bf9\u4e8estep1\u4e2d\u5f97\u5206\u5927\u4e8e\u9608\u503c\u7684bbox(\u53ef\u80fd\u4e3a\u7a7a), \u5c06bbox\u53ca\u5728\u8be5\u7c7b\u7684score\u653e\u5165result\u5217\u8868." + }, + { + "comment": "The code defines a class named AVARoIHead, which is a PaddlePaddle layer for object detection. It initializes the assigner and sampler, and optionally initializes the bbox_head (bounding box regression head). The method init_assigner_sampler initializes the bbox_assigner and bbox_sampler from the passed arguments. The method init_bbox_head initializes the bounding box regression head if the bbox_head is provided. This class registers with HEADS, which may be a registry or a list of defined classes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py\":59-92", + "content": " paddle.concat((bboxes_select, scores_select), axis=1).numpy())\n return result\n@HEADS.register()\nclass AVARoIHead(nn.Layer):\n def __init__(self,\n assigner,\n sampler,\n pos_weight=1.0,\n action_thr=0.0,\n bbox_roi_extractor=None,\n bbox_head=None,\n train_cfg=None,\n test_cfg=None):\n super().__init__()\n self.assigner = assigner\n self.sampler = sampler\n self.pos_weight = pos_weight\n self.action_thr = action_thr\n self.init_assigner_sampler()\n if bbox_head is not None:\n self.init_bbox_head(bbox_roi_extractor, bbox_head)\n def init_assigner_sampler(self):\n \"\"\"Initialize assigner and sampler.\"\"\"\n self.bbox_assigner = None\n self.bbox_sampler = None\n self.bbox_assigner = builder.build_assigner(self.assigner)\n self.bbox_sampler = builder.build_sampler(self.sampler, context=self)\n def init_bbox_head(self, bbox_roi_extractor, bbox_head):" + }, + { + "comment": "This code initializes the bbox_head and defines the _bbox_forward function for feature extraction and prediction, as well as the _bbox_forward_train function for training purposes. 
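The per-class selection step of `bbox2result` shown above keeps, for every non-background class, only the boxes whose score clears that class's threshold. A runnable toy version that mirrors the quoted nonzero / index_select / concat calls, with random scores and invented sizes:

```python
import paddle

num_classes, num_boxes, thr = 4, 6, 0.5
bboxes = paddle.rand([num_boxes, 4])
scores = paddle.rand([num_boxes, num_classes])      # multilabel scores, column 0 = background

result = []
for i in range(num_classes - 1):
    keep = paddle.nonzero(scores[:, i + 1] > thr)                   # indices above the class threshold
    boxes_i = paddle.index_select(bboxes, index=keep)[:, :4]
    scores_i = paddle.index_select(scores, index=keep)[:, i + 1:i + 2]
    result.append(paddle.concat([boxes_i, scores_i], axis=1).numpy())
```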
It also handles situations where ROI's width or height equals 0 by correcting the roi_align operation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py\":93-113", + "content": " \"\"\"Initialize ``bbox_head``\"\"\"\n self.bbox_roi_extractor = builder.build_roi_extractor(\n bbox_roi_extractor)\n self.bbox_head = builder.build_head(bbox_head)\n def _bbox_forward(self, x, rois, rois_num):\n bbox_feat = self.bbox_roi_extractor(x, rois, rois_num)\n cls_score, bbox_pred = self.bbox_head(\n bbox_feat, rois, rois_num\n ) #deal with: when roi's width or height = 0 , roi_align is wrong\n bbox_results = dict(cls_score=cls_score,\n bbox_pred=bbox_pred,\n bbox_feats=bbox_feat)\n return bbox_results\n def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels):\n \"\"\"Run forward function and calculate loss for box head in training.\"\"\"\n rois = [res.bboxes for res in sampling_results]\n rois_num = [res.bboxes.shape[0] for res in sampling_results]\n bbox_results = self._bbox_forward(x, rois, rois_num)\n bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes," + }, + { + "comment": "The code defines a ROI head that calculates the bbox loss and performs assignment and sampling for each image in a batch. It takes input images, proposal list, ground truth bounding boxes, and labels as parameters and returns results containing loss_bbox.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py\":114-133", + "content": " gt_labels, self.pos_weight)\n loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_targets)\n bbox_results.update(loss_bbox=loss_bbox)\n return bbox_results\n def train_step(self, x, img_metas, proposal_list, gt_bboxes, gt_labels):\n #1. assign gts and sample proposals\n num_imgs = len(img_metas[0])\n sampling_results = []\n for i in range(num_imgs):\n assign_result = self.bbox_assigner.assign(proposal_list[i],\n gt_bboxes[i],\n gt_labels[i])\n sampling_result = self.bbox_sampler.sample(assign_result,\n proposal_list[i],\n gt_bboxes[i],\n gt_labels[i])\n sampling_results.append(sampling_result)\n #2. forward and loss" + }, + { + "comment": "The code contains two main functions, \"simple_test\" and \"_bbox_forward_train\". The former is for testing the model in simple test mode by taking input x, proposal list, img_shape, and rescale flag. It calculates det_bboxes and det_labels using the function \"simple_test_bboxes\". Then it uses bbox2result to convert det_bboxes and det_labels into bbox_results. The latter function takes input x, sampling results, gt_bboxes, and gt_labels to calculate bbox results and losses. 
It updates the losses dictionary with \"loss_bbox\" and returns the losses.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py\":134-157", + "content": " bbox_results = self._bbox_forward_train(x, sampling_results, gt_bboxes,\n gt_labels)\n losses = dict()\n losses.update(bbox_results['loss_bbox'])\n return losses\n def simple_test(self, x, proposal_list, img_shape, rescale=False):\n x_shape = x[0].shape\n #assert x_shape[0] == 1, 'only accept 1 sample at test mode'\n det_bboxes, det_labels = self.simple_test_bboxes(x,\n img_shape,\n proposal_list,\n self.action_thr,\n rescale=rescale)\n bbox_results = bbox2result(det_bboxes, det_labels,\n self.bbox_head.num_classes, img_shape,\n self.action_thr)\n return [bbox_results]\n def simple_test_bboxes(self,\n x," + }, + { + "comment": "This function tests only detection bboxes without augmentation. It takes input x, proposals, action_thr, and rescale as parameters. It creates rois and rois_num from the proposals. It then calls _bbox_forward to get cls_score. It sets crop_quadruple and flip to False. Finally, it calls bbox_head's get_det_bboxes to return det_bboxes and det_labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/roi_head.py\":158-176", + "content": " img_shape,\n proposals,\n action_thr,\n rescale=False):\n \"\"\"Test only det bboxes without augmentation.\"\"\"\n rois = [proposals]\n rois_num = [rois[0].shape[0]]\n bbox_results = self._bbox_forward(x, rois, rois_num)\n cls_score = bbox_results['cls_score']\n crop_quadruple = np.array([0, 0, 1, 1])\n flip = False\n det_bboxes, det_labels = self.bbox_head.get_det_bboxes(\n rois,\n cls_score,\n img_shape,\n flip=flip,\n crop_quadruple=crop_quadruple)\n return det_bboxes, det_labels" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f056e460-dd90-4b51-8857-d2e4ff737d5c.json b/docs/doc/f056e460-dd90-4b51-8857-d2e4ff737d5c.json new file mode 100644 index 000000000..a90dfa203 --- /dev/null +++ b/docs/doc/f056e460-dd90-4b51-8857-d2e4ff737d5c.json @@ -0,0 +1,10 @@ +{ + "summary": "This code provides a guide for testing the pre-trained VideoTag model on custom data. It covers preparing test data and running inference using Python's videotag_test.py script. The video file input formats supported are mp4, mkv, and webm. Inference is performed on 300 uniformly sampled frames per video. GPU acceleration can be enabled with the --use\\_gpu flag.", + "details": [ + { + "comment": "This code provides a guide for testing the pre-trained VideoTag model on custom data. It covers preparing test data and running inference using Python's videotag_test.py script. The video file input formats supported are mp4, mkv, and webm. Inference is performed on 300 uniformly sampled frames per video. 
GPU acceleration can be enabled with the --use\\_gpu flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/Test.md\":0-30", + "content": "# \u9884\u8bad\u7ec3\u6a21\u578b\u81ea\u6d4b\u6307\u5357\n## \u5185\u5bb9\n\u53c2\u8003\u672c\u6587\u6863\uff0c\u60a8\u53ef\u4ee5\u5feb\u901f\u6d4b\u8bd5VideoTag\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u5728\u81ea\u5df1\u4e1a\u52a1\u6570\u636e\u4e0a\u7684\u9884\u6d4b\u6548\u679c\u3002\n\u4e3b\u8981\u5185\u5bb9\u5305\u62ec:\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u63a8\u65ad](#\u6a21\u578b\u63a8\u65ad)\n## \u6570\u636e\u51c6\u5907\n\u5728\u6570\u636e\u51c6\u5907\u9636\u6bb5\uff0c\u60a8\u9700\u8981\u51c6\u5907\u597d\u81ea\u5df1\u7684\u6d4b\u8bd5\u6570\u636e\uff0c\u5e76\u5728video\\_tag/data/VideoTag\\_test.list\u6587\u4ef6\u4e2d\u6307\u5b9a\u5f85\u63a8\u65ad\u7684\u6d4b\u8bd5\u6587\u4ef6\u8def\u5f84\uff0c\u5185\u5bb9\u683c\u5f0f\u5982\u4e0b:\n```\nmy_video_path/my_video_file1.mp4\nmy_video_path/my_video_file2.mp4\n...\n```\n## \u6a21\u578b\u63a8\u65ad\n\u6a21\u578b\u63a8\u65ad\u7684\u542f\u52a8\u65b9\u5f0f\u5982\u4e0b\uff1a\n python videotag_test.py\n- \u76ee\u524d\u652f\u6301\u7684\u89c6\u9891\u6587\u4ef6\u8f93\u5165\u683c\u5f0f\u4e3a\uff1amp4\u3001mkv\u548cwebm\u683c\u5f0f\uff1b\n- \u6a21\u578b\u4f1a\u4ece\u8f93\u5165\u7684\u89c6\u9891\u6587\u4ef6\u4e2d*\u5747\u5300\u62bd\u53d6300\u5e27*\u7528\u4e8e\u9884\u6d4b\u3002\u5bf9\u4e8e\u8f83\u957f\u7684\u89c6\u9891\u6587\u4ef6\uff0c\u5efa\u8bae\u5148\u622a\u53d6\u6709\u6548\u90e8\u5206\u8f93\u5165\u6a21\u578b\u4ee5\u63d0\u9ad8\u9884\u6d4b\u901f\u5ea6\uff1b\n- \u901a\u8fc7--use\\_gpu\u53c2\u6570\u53ef\u6307\u5b9a\u662f\u5426\u4f7f\u7528gpu\u8fdb\u884c\u63a8\u65ad\uff0c\u9ed8\u8ba4\u4f7f\u7528gpu\u3002\u5bf9\u4e8e10s\u5de6\u53f3\u7684\u77ed\u89c6\u9891\u6587\u4ef6\uff0cgpu\u63a8\u65ad\u65f6\u95f4\u7ea6\u4e3a4s\uff1b\n- \u901a\u8fc7--filelist\u53ef\u6307\u5b9a\u8f93\u5165list\u6587\u4ef6\u8def\u5f84\uff0c\u9ed8\u8ba4\u4e3avideo\\_tag/data/VideoTag\\_test.list\u3002" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f0663793-8f98-4731-9883-073b01aa42c6.json b/docs/doc/f0663793-8f98-4731-9883-073b01aa42c6.json new file mode 100644 index 000000000..6a1d6fbb6 --- /dev/null +++ b/docs/doc/f0663793-8f98-4731-9883-073b01aa42c6.json @@ -0,0 +1,20 @@ +{ + "summary": "This code lists three research papers on action detection and localization in videos: Fast Action Proposals, Bag-of-fragments, and Action Localization through Context Walk.", + "details": [ + { + "comment": "This code is a list of useful temporal action detection papers, each with their respective authors and conference/journal they were published in.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Temporal Action Detection Papers\":0-11", + "content": "Usefull Temporal Action Detection Papers. \n Rethinking the Faster R-CNN Architecture for Temporal Action Localization - Yu-Wei Chao et al., CVPR2018\n Weakly Supervised Action Localization by Sparse Temporal Pooling Network - Phuc Nguyen et al., CVPR 2018\n Temporal Deformable Residual Networks for Action Segmentation in Videos - P. Lei and S. Todrovic., CVPR2018.\n End-to-End, Single-Stream Temporal Action Detection in Untrimmed Videos - Shayamal Buch et al., BMVC 2017\n Cascaded Boundary Regression for Temporal Action Detection - Jiyang Gao et al., BMVC 2017\n Temporal Tessellation: A Unified Approach for Video Analysis - Kaufman et al., ICCV2017. 
\n Temporal Action Detection with Structured Segment Networks - Y. Zhao et al., ICCV2017. \n Temporal Context Network for Activity Localization in Videos - X. Dai et al., ICCV2017.\n Detecting the Moment of Completion: Temporal Models for Localising Action Completion - F. Heidarivincheh et al., arXiv2017.\n CDC: Convolutional-De-" + }, + { + "comment": "This code contains references to various papers on temporal action detection, localization, and understanding in untrimmed videos. It includes papers from different authors and years, with some including PyTorch implementation and project web links.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Temporal Action Detection Papers\":11-20", + "content": "Convolutional Networks for Precise Temporal Action Localization in Untrimmed Videos - Z. Shou et al, CVPR2017.\n SST: Single-Stream Temporal Action Proposals - S. Buch et al, CVPR2017.\n R-C3D: Region Convolutional 3D Network for Temporal Activity Detection - H. Xu et al, arXiv2017. [code] [project web] [PyTorch]\n DAPs: Deep Action Proposals for Action Understanding - V. Escorcia et al, ECCV2016. \n Online Action Detection using Joint Classification-Regression Recurrent Neural Networks - Y. Li et al, ECCV2016. \n Temporal Action Localization in Untrimmed Videos via Multi-stage CNNs - Z. Shou et al, CVPR2016. \n Fast Temporal Activity Proposals for Efficient Detection of Human Actions in Untrimmed Videos - F. Heilbron et al, CVPR2016. \n Actionness Estimation Using Hybrid Fully Convolutional Networks - L. Wang et al, CVPR2016. \n Learning Activity Progression in LSTMs for Activity Detection and Early Detection - S. Ma et al, CVPR2016.\n End-to-end Learning of Action Detection from Frame Glimpses in Videos - S. Yeung et al, CVPR2016. " + }, + { + "comment": "This code provides references to three research papers related to action detection and localization in videos: \n1. Fast Action Proposals by Yu & Yuan (CVPR2015), \n2. Bag-of-fragments by Mettes et al. (ICMR2015), and \n3. Action Localization through Context Walk by Soomro et al. (ICCV2015).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/tutorials/Temporal Action Detection Papers\":21-23", + "content": " Fast Action Proposals for Human Action Detection and Search - G. Yu and J. Yuan, CVPR2015. \n Bag-of-fragments: Selecting and encoding video fragments for event detection and recounting - P. Mettes et al, ICMR2015.\n Action localization in videos through context walk - K. Soomro et al, ICCV2015." + } + ] +} \ No newline at end of file diff --git a/docs/doc/f080dcdf-69e5-4f71-94ad-a6005467e31e.json b/docs/doc/f080dcdf-69e5-4f71-94ad-a6005467e31e.json new file mode 100644 index 000000000..f547a1519 --- /dev/null +++ b/docs/doc/f080dcdf-69e5-4f71-94ad-a6005467e31e.json @@ -0,0 +1,65 @@ +{ + "summary": "This code imports libraries, defines functions for data processing and categorizing experts, adjusts input features, ensures Tensor format, and includes utility functions.", + "details": [ + { + "comment": "This code imports necessary libraries and defines several functions. The 'filter_cmd_args' function removes specified keys from a list of command arguments while preserving the order. The 'set_seeds' function sets seeds for randomization libraries, ensuring consistent results. 
The 'memory_summary' function provides a summary of virtual memory usage using the 'psutil' library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":0-49", + "content": "\"\"\"\nExclude from autoreload\n%aimport -util.utils\n\"\"\"\nimport os\nimport json\nimport random\nfrom pathlib import Path\nfrom datetime import datetime\nfrom typing import List\nfrom itertools import repeat\nfrom collections import OrderedDict\nimport numpy as np\nimport paddle\nimport psutil\nimport humanize\nfrom PIL import Image\nfrom typeguard import typechecked\n@typechecked\ndef filter_cmd_args(cmd_args: List[str], remove: List[str]) -> List[str]:\n drop = []\n for key in remove:\n if key not in cmd_args:\n continue\n pos = cmd_args.index(key)\n drop.append(pos)\n if len(cmd_args) > (pos + 1) and not cmd_args[pos + 1].startswith(\"--\"):\n drop.append(pos + 1)\n for pos in reversed(drop):\n cmd_args.pop(pos)\n return cmd_args\n@typechecked\ndef set_seeds(seed: int):\n \"\"\"Set seeds for randomisation libraries.\n Args:\n seed: the seed value\n \"\"\"\n random.seed(seed)\n np.random.seed(seed)\n paddle.seed(seed)\ndef memory_summary():\n vmem = psutil.virtual_memory()\n msg = (" + }, + { + "comment": "This code defines three functions. The first function, `print_memory`, prints the current system memory usage in a readable format. The second function, `flatten_dict`, recursively flattens nested dictionaries into a single-level dictionary. The third function, `expert_tensor_storage`, categorizes experts based on their temporal configurations into fixed, variable, and flaky sets.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":50-75", + "content": " f\">>> Currently using {vmem.percent}% of system memory \"\n f\"{humanize.naturalsize(vmem.used)}/{humanize.naturalsize(vmem.available)}\"\n )\n print(msg)\ndef flatten_dict(x, keysep=\"-\"):\n flat_dict = {}\n for key, val in x.items():\n if isinstance(val, dict):\n flat_subdict = flatten_dict(val)\n flat_dict.update({f\"{key}{keysep}{subkey}\": subval\n for subkey, subval in flat_subdict.items()})\n else:\n flat_dict.update({key: val})\n return flat_dict\ndef expert_tensor_storage(experts, feat_aggregation):\n expert_storage = {\"fixed\": set(), \"variable\": set(), \"flaky\": set()}\n # fixed_sz_experts, variable_sz_experts, flaky_experts = set(), set(), set()\n for expert, config in feat_aggregation.items():\n if config[\"temporal\"] in {\"vlad\", \"fixed_seg\"}:\n expert_storage[\"variable\"].add(expert)\n elif config[\"temporal\"] in {\"avg\", \"max\", \"avg-max\", \"max-avg\", \"avg-max-ent\", \n \"max-avg-ent\"}:" + }, + { + "comment": "This code snippet contains a function that takes in an expert and its configuration, adds it to the appropriate storage based on its temporal strategy, and handles flaky experts. 
It also defines two utility functions - read_json for parsing JSON files and path2str for converting pathlib objects to strings for serialization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":76-102", + "content": " expert_storage[\"fixed\"].add(expert)\n else:\n raise ValueError(f\"unknown temporal strategy: {config['temporal']}\")\n # some \"flaky\" experts are only available for a fraction of videos - we need\n # to pass this information (in the form of indices) into the network for any\n # experts present in the current dataset\n if config.get(\"flaky\", False):\n expert_storage[\"flaky\"].add(expert)\n # we only allocate storage for experts used by the current dataset\n for key, value in expert_storage.items():\n expert_storage[key] = value.intersection(set(experts))\n return expert_storage\ndef read_json(fname):\n with fname.open('rt') as handle:\n return json.load(handle, object_hook=OrderedDict)\ndef path2str(x):\n \"\"\"Recursively convert pathlib objects to strings to enable serialization\"\"\"\n for key, val in x.items():\n if isinstance(val, dict):\n path2str(val)\n elif isinstance(val, Path):\n x[key] = str(val)" + }, + { + "comment": "This code includes a function for writing JSON data, an infinite loop wrapper for data loaders, two classes for hashable dictionaries, a function to compute training configuration from a given config file, and a function to compute dimensions from the same config file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":105-142", + "content": "def write_json(content, fname, paths2strs=False):\n if paths2strs:\n path2str(content)\n with fname.open('wt') as handle:\n json.dump(content, handle, indent=4, sort_keys=False)\ndef inf_loop(data_loader):\n ''' wrapper function for endless data loader. '''\n for loader in repeat(data_loader):\n yield from loader\nclass HashableDict(dict):\n def __hash__(self):\n return hash(frozenset(self))\nclass HashableOrderedDict(dict):\n def __hash__(self):\n return hash(frozenset(self))\ndef compute_trn_config(config, logger=None):\n trn_config = {}\n feat_agg = config[\"data_loader\"][\"args\"][\"feat_aggregation\"]\n for static_expert in feat_agg.keys():\n if static_expert in feat_agg:\n if \"trn_seg\" in feat_agg[static_expert].keys():\n trn_config[static_expert] = feat_agg[static_expert][\"trn_seg\"]\n return trn_config\ndef compute_dims(config, logger=None):\n if logger is None:\n logger = config.get_logger('utils')\n experts = config[\"experts\"]" + }, + { + "comment": "This code is organizing modalities, extracting expert settings and dimensions for different modalities like face, features_scene, features_s3d, and features_flow. It also checks if any feature should be dropped and sorts them accordingly. 
Finally, it assigns the input and output dimensions based on the modality and temporal aggregation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":143-164", + "content": " # TODO(Samuel): clean up the logic since it's a little convoluted\n ordered = sorted(config[\"experts\"][\"modalities\"])\n if experts[\"drop_feats\"]:\n to_drop = experts[\"drop_feats\"].split(\",\")\n logger.info(f\"dropping: {to_drop}\")\n ordered = [x for x in ordered if x not in to_drop]\n feat_agg = config[\"data_loader\"][\"args\"][\"feat_aggregation\"]\n dims = []\n arch_args = config[\"arch\"][\"args\"]\n vlad_clusters = arch_args[\"vlad_clusters\"]\n for expert in ordered:\n temporal = feat_agg[expert][\"temporal\"]\n if expert == \"face\":\n in_dim, out_dim = experts[\"face_dim\"], experts[\"face_dim\"]\n elif expert == \"features_scene\" and temporal == \"vlad\":\n in_dim, out_dim = 2208 * vlad_clusters[\"features_scene\"], 2208\n elif expert == \"features_s3d\" and temporal == \"vlad\":\n in_dim, out_dim = 1024 * vlad_clusters[\"features_s3d\"], 1024\n elif expert == \"features_flow\" and temporal == \"vlad\":\n in_dim, out_dim = 1024 * vlad_clusters[\"features_flow\"], 1024" + }, + { + "comment": "This code snippet is determining the input and output dimensions based on the expert type and temporal method used. It sets the input dimension by multiplying vlad_clusters value with respective constants, and the output dimension remains constant for each expert type and temporal method combination.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":165-180", + "content": " elif expert == \"features_rgb\" and temporal == \"vlad\":\n in_dim, out_dim = 2048 * vlad_clusters[\"features_rgb\"], 2048\n elif expert == \"features_ocr\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"features_ocr\"], 300\n elif expert == \"features_face\" and temporal == \"vlad\":\n in_dim, out_dim = 512 * vlad_clusters[\"features_face\"], 512\n elif expert == \"features_speech\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"features_speech\"], 300\n elif expert == \"features_audio\" and temporal == \"vlad\":\n in_dim, out_dim = 128 * vlad_clusters[\"features_audio\"], 128\n elif expert == \"audio\" and temporal == \"vlad\":\n in_dim, out_dim = 128 * vlad_clusters[\"audio\"], 128\n elif expert == \"audio\" and temporal == \"vlad\":\n in_dim, out_dim = 128 * vlad_clusters[\"audio\"], 128\n elif expert == \"speech\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"speech\"], 300" + }, + { + "comment": "This code block assigns the input and output dimensions for different experts (e.g., OCR, detection, detection-sem, openpose) based on their respective configurations and cluster settings. 
It also considers aggregation types like avg or max pooling.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":181-201", + "content": " elif expert == \"ocr\" and temporal == \"vlad\":\n in_dim, out_dim = 300 * vlad_clusters[\"ocr\"], 300\n elif expert == \"detection\":\n # allow for avg pooling\n det_clusters = arch_args[\"vlad_clusters\"].get(\"detection\", 1)\n in_dim, out_dim = 1541 * det_clusters, 1541\n elif expert == \"detection-sem\":\n if config[\"data_loader\"][\"args\"].get(\"spatial_feats\", False):\n base = 300 + 16\n else:\n base = 300 + 5\n det_clusters = arch_args[\"vlad_clusters\"].get(\"detection-sem\", 1)\n in_dim, out_dim = base * det_clusters, base\n elif expert == \"openpose\":\n base = 54\n det_clusters = arch_args[\"vlad_clusters\"].get(\"openpose\", 1)\n in_dim, out_dim = base * det_clusters, base\n else:\n common_dim = feat_agg[expert][\"feat_dims\"][feat_agg[expert][\"type\"]]\n # account for aggregation of multilpe forms (e.g. avg + max pooling)\n common_dim = common_dim * len(feat_agg[expert][\"temporal\"].split(\"-\"))" + }, + { + "comment": "This code configures the expert dimensions for a machine learning model. It checks if certain conditions are met, such as disabling VLAD for text with single tokens and using averaging only with text using single tokens. To avoid dependencies between dataloader and model architecture, it creates a second copy of expert dimensions accounting for the number of VLAD clusters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":202-225", + "content": " in_dim, out_dim = common_dim, common_dim\n # For the CE architecture, we need to project all features to a common\n # dimensionality\n if arch_args.get(\"mimic_ce_dims\", False):\n out_dim = experts[\"ce_shared_dim\"]\n dims.append((expert, (in_dim, out_dim)))\n expert_dims = OrderedDict(dims)\n if vlad_clusters[\"text\"] == 0:\n msg = \"vlad can only be disabled for text with single tokens\"\n assert config[\"data_loader\"][\"args\"][\"max_tokens\"][\"text\"] == 1, msg\n if config[\"experts\"][\"text_agg\"] == \"avg\":\n msg = \"averaging can only be performed with text using single tokens\"\n assert config[\"arch\"][\"args\"][\"vlad_clusters\"][\"text\"] == 0\n assert config[\"data_loader\"][\"args\"][\"max_tokens\"][\"text\"] == 1\n # To remove the dependency of dataloader on the model architecture, we create a\n # second copy of the expert dimensions which accounts for the number of vlad\n # clusters\n raw_input_dims = OrderedDict()\n for expert, dim_pair in expert_dims.items():" + }, + { + "comment": "This code is adjusting the dimensionality of input features for different expert models and ensuring they are in Tensor format. 
It also provides utility functions like Timer for measuring time durations and tensor2im to convert Tensors into numpy images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":226-257", + "content": " raw_dim = dim_pair[0]\n if expert in {\"audio\", \"speech\", \"ocr\", \"detection\", \"detection-sem\", \"openpose\", \"features_audio\", \"features_speech\", \"features_face\", \"features_ocr\", \"features_rgb\", \"features_flow\", \"features_s3d\", \"features_scene\",\n \"speech.mozilla.0\"}:\n if feat_agg[expert][\"temporal\"] == \"vlad\":\n raw_dim = raw_dim // vlad_clusters.get(expert, 1)\n raw_input_dims[expert] = raw_dim\n return expert_dims, raw_input_dims\ndef ensure_tensor(x):\n if not isinstance(x, paddle.Tensor): #if not isinstance(x, torch.Tensor):\n x = paddle.to_tensor(x) # x = torch.from_numpy(x)\n return x\nclass Timer:\n def __init__(self):\n self.cache = datetime.now()\n def check(self):\n now = datetime.now()\n duration = now - self.cache\n self.cache = now\n return duration.total_seconds()\n def reset(self):\n self.cache = datetime.now()\ndef tensor2im(input_image, imtype=np.uint8):\n \"\"\"\"Converts a Tensor array into a numpy image array." + }, + { + "comment": "The function normalizes and converts the input image tensor array to a numpy array. It also handles different data types and saves the numpy image to disk.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":259-283", + "content": " Parameters:\n input_image (tensor) -- the input image tensor array\n imtype (type) -- the desired type of the converted numpy array\n \"\"\"\n if not isinstance(input_image, np.ndarray):\n if isinstance(input_image, paddle.Tensor): #if isinstance(input_image, torch.Tensor): # get the data from a variable\n image_tensor = input_image #image_tensor = input_image.data\n else:\n return input_image\n # convert it into a numpy array\n image_numpy = image_tensor[0].cpu().float().numpy()\n if image_numpy.shape[0] == 1: # grayscale to RGB\n image_numpy = np.tile(image_numpy, (3, 1, 1))\n # post-processing: tranpose and scaling\n image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0\n else: # if it is a numpy array, do nothing\n image_numpy = input_image\n return image_numpy.astype(imtype)\ndef save_image(image_numpy, image_path):\n \"\"\"Save a numpy image to the disk\n Parameters:\n image_numpy (numpy array) -- input numpy array" + }, + { + "comment": "util.py contains several utility functions:\n1. \"image_to_path\" converts an image numpy array to a PIL Image and saves it at the given path.\n2. \"print_numpy\" prints statistics (mean, min, max, median, std) of a numpy array if specified.\n3. \"mkdirs\" creates empty directories if they don't exist, accepting either a list of paths or a single path.\n4. 
\"mkdir\" is a helper function for \"mkdirs,\" creating a single directory if it doesn't exist.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":284-320", + "content": " image_path (str) -- the path of the image\n \"\"\"\n image_pil = Image.fromarray(image_numpy)\n image_pil.save(image_path)\ndef print_numpy(x, val=True, shp=False):\n \"\"\"Print the mean, min, max, median, std, and size of a numpy array\n Parameters:\n val (bool) -- if print the values of the numpy array\n shp (bool) -- if print the shape of the numpy array\n \"\"\"\n x = x.astype(np.float64)\n if shp:\n print('shape,', x.shape)\n if val:\n x = x.flatten()\n print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % (\n np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x)))\ndef mkdirs(paths):\n \"\"\"create empty directories if they don't exist\n Parameters:\n paths (str list) -- a list of directory paths\n \"\"\"\n if isinstance(paths, list) and not isinstance(paths, str):\n for path in paths:\n mkdir(path)\n else:\n mkdir(paths)\ndef mkdir(path):\n \"\"\"create a single empty directory if it didn't exist" + }, + { + "comment": "This function creates a directory if it does not exist at the given path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/utils/util.py\":322-326", + "content": " Parameters:\n path (str) -- a single directory path\n \"\"\"\n if not os.path.exists(path):\n os.makedirs(path)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f0bc3308-bc79-4065-ad74-396d26f0a5ed.json b/docs/doc/f0bc3308-bc79-4065-ad74-396d26f0a5ed.json new file mode 100644 index 000000000..764b1b613 --- /dev/null +++ b/docs/doc/f0bc3308-bc79-4065-ad74-396d26f0a5ed.json @@ -0,0 +1,20 @@ +{ + "summary": "Recognizer3D defines a model framework with forward_net, train/val_step for training and validation in recognition models. It handles image processing based on backbone and calculates loss metrics. The code defines test_step and infer_step methods for testing and inference.", + "details": [ + { + "comment": "Recognizer3D is a 3D Recognizer model framework, which defines how the model runs from input to output. It includes forward_net method for model execution and train_step method for training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3d.py\":0-32", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer3D(BaseRecognizer):\n \"\"\"3D Recognizer model framework.\n \"\"\"\n def forward_net(self, imgs):\n \"\"\"Define how the model is going to run, from input to output.\n \"\"\"\n feature = self.backbone(imgs)\n cls_score = self.head(feature)\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Training step." 
+ }, + { + "comment": "The code is defining two methods, `train_step` and `val_step`, which are used for training and validation steps respectively in a recognition model. If the backbone of the model is 'ResNet3dSlowOnly', it reshapes the images to have a specific dimension before processing. For other backbones, it separates the images and labels from the data batch accordingly. Both methods then forward the images through the `forward_net` and calculate loss metrics with or without validation mode depending on the step type. The final output is the loss metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3d.py\":33-63", + "content": " \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n labels = data_batch[1:]\n if imgs.dim() == 6:\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n else:\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n \"\"\"Validating setp.\n \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n labels = data_batch[1:]\n if imgs.dim() == 6:\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n else:\n imgs = data_batch[0:2]\n labels = data_batch[2:]\n # call forward\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics" + }, + { + "comment": "The code defines two methods: `test_step` and `infer_step`. In the `test_step`, if the backbone is a 'ResNet3dSlowOnly', it reshapes the input images, then calls the forward pass to get class scores. Otherwise, it takes the first two elements of the data batch for inference. The `infer_step` follows similar logic but without the condition on backbone type. Both methods return the class scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3d.py\":65-92", + "content": " def test_step(self, data_batch):\n \"\"\"Test step.\n \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n if imgs.dim() == 6:\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n else:\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Infer step.\n \"\"\"\n if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly':\n imgs = data_batch[0]\n # call forward\n imgs = imgs.reshape([-1] + imgs.shape[2:])\n cls_score = self.forward_net(imgs)\n else:\n imgs = data_batch[0:2]\n # call forward\n cls_score = self.forward_net(imgs)\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f15cf972-cd8a-425f-8be9-c63d25acdc96.json b/docs/doc/f15cf972-cd8a-425f-8be9-c63d25acdc96.json new file mode 100644 index 000000000..89fe1a113 --- /dev/null +++ b/docs/doc/f15cf972-cd8a-425f-8be9-c63d25acdc96.json @@ -0,0 +1,20 @@ +{ + "summary": "This Python script serves models in PaddleVideo framework, parses command-line arguments, and sends video data via HTTP requests using argparse, json, and requests libraries. It converts videos to numpy arrays, encodes as base64 strings, and sends to specific URL endpoints.", + "details": [ + { + "comment": "The code is a Python script that parses command-line arguments and defines functions for model serving in the PaddleVideo framework. 
It imports necessary libraries, including argparse for argument handling, json for data manipulation, requests for HTTP communication, and utils module for converting video to numpy format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_http_client.py\":0-29", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport json\nimport requests\nfrom utils import numpy_to_base64, parse_file_paths, video_to_numpy\ndef parse_args():\n # general params\n parser = argparse.ArgumentParser(\"PaddleVideo Web Serving model script\")\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/PP-TSM.yaml',\n help='serving config file path')" + }, + { + "comment": "This code defines command-line arguments for port number and input file path or directory, parses the arguments, and uses them to send video data to a server via HTTP requests. It decodes videos into frames as numpy arrays, encodes them to base64 strings, generates dictionaries with keys \"key\" and \"value\", and sends the data to a specific URL endpoint.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_http_client.py\":30-61", + "content": " parser.add_argument('-ptn',\n '--port_number',\n type=int,\n default=18080,\n help='http port number')\n parser.add_argument('-i',\n '--input_file',\n type=str,\n help='input file path or directory path')\n return parser.parse_args()\nif __name__ == \"__main__\":\n args = parse_args()\n url = f\"http://127.0.0.1:{args.port_number}/video/prediction\"\n files_list = parse_file_paths(args.input_file)\n for file_path in files_list:\n # decoding video and get stacked frames as ndarray\n decoded_frames = video_to_numpy(file_path=file_path)\n # encode ndarray to base64 string for transportation.\n decoded_frames_base64 = numpy_to_base64(decoded_frames)\n # generate dict & convert to json.\n data = {\n \"key\": [\"frames\", \"frames_shape\"],\n \"value\": [decoded_frames_base64,\n str(decoded_frames.shape)]" + }, + { + "comment": "This code snippet sends a POST request to the specified URL with the data in JSON format. It uses Python's requests library to transport the data and waits for 100 seconds for a response. 
The response is then printed as JSON.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/python_serving/pipeline_http_client.py\":62-69", + "content": " }\n data = json.dumps(data)\n # transport to server & get get results.\n r = requests.post(url=url, data=data, timeout=100)\n # print result\n print(r.json())" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f1ef72ce-3ab3-47c7-b807-14cad6064db5.json b/docs/doc/f1ef72ce-3ab3-47c7-b807-14cad6064db5.json new file mode 100644 index 000000000..c663921fb --- /dev/null +++ b/docs/doc/f1ef72ce-3ab3-47c7-b807-14cad6064db5.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is importing Registry classes from the \"utils\" module and creating four different registries named PIPELINES, DATASETS, SAMPLERS, BATCH_SAMPLERS, and DATALOADERS. These registries will be used for organizing and managing various functionalities in the PaddleVideo framework.", + "details": [ + { + "comment": "This code is importing Registry classes from the \"utils\" module and creating four different registries named PIPELINES, DATASETS, SAMPLERS, BATCH_SAMPLERS, and DATALOADERS. These registries will be used for organizing and managing various functionalities in the PaddleVideo framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py\":0-20", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom ..utils import Registry\nPIPELINES = Registry(\"pipeline\")\nDATASETS = Registry(\"datasets\")\nSAMPLERS = Registry(\"sampler\")\nBATCH_SAMPLERS = Registry(\"batch_sampler\")\nDATALOADERS = Registry(\"dataloader\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f310e506-75d8-42f7-b153-4159889d74ab.json b/docs/doc/f310e506-75d8-42f7-b153-4159889d74ab.json new file mode 100644 index 000000000..445ad51c2 --- /dev/null +++ b/docs/doc/f310e506-75d8-42f7-b153-4159889d74ab.json @@ -0,0 +1,75 @@ +{ + "summary": "The code defines a VisionTransformer class, ToShiftVIT model and TokenShiftVisionTransformer for image processing with attention blocks, positional embeddings, dropout and normalization layers. It also supports pretrained checkpoints.", + "details": [ + { + "comment": "The code defines a class for VisionTransformer backbones and imports necessary libraries. It includes functions like `to_2tuple` and `drop_path` for processing input data and implementing drop path operation, respectively. The code also handles initialization of zero and one constants.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":0-36", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom collections.abc import Callable\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import trunc_normal_\n__all__ = ['VisionTransformer']\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\ndef to_2tuple(x):\n return tuple([x] * 2)\ndef drop_path(x, drop_prob=0., training=False):" + }, + { + "comment": "Code implements Drop Paths (Stochastic Depth) for residual blocks. The function applies dropout probabilistically, and the class `DropPath` handles it during forward pass. `Identity` class serves as an identity mapping.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":37-64", + "content": " \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n # issuecomment-532968956 ...\n See discussion: https://github.com/tensorflow/tpu/issues/494\n \"\"\"\n if drop_prob == 0. or not training:\n return x\n keep_prob = paddle.to_tensor(1 - drop_prob)\n shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)\n random_tensor = paddle.floor(random_tensor) # binarize\n output = x.divide(keep_prob) * random_tensor\n return output\nclass DropPath(nn.Layer):\n \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n \"\"\"\n def __init__(self, drop_prob=None):\n super(DropPath, self).__init__()\n self.drop_prob = drop_prob\n def forward(self, x):\n return drop_path(x, self.drop_prob, self.training)\nclass Identity(nn.Layer):" + }, + { + "comment": "This code defines three classes: Identity, Mlp, and Attention. Identity is a simple class that returns its input unchanged. Mlp stands for Multi-Layer Perceptron and defines a feedforward neural network layer with optional hidden layers. The Attention class implements a self-attention mechanism commonly used in transformer models. 
It initializes the necessary parameters and applies dropout to the input and output of the attention operation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":65-103", + "content": " def __init__(self):\n super(Identity, self).__init__()\n def forward(self, input):\n return input\nclass Mlp(nn.Layer):\n def __init__(self,\n in_features,\n hidden_features=None,\n out_features=None,\n act_layer=nn.GELU,\n drop=0.0):\n super().__init__()\n out_features = out_features or in_features\n hidden_features = hidden_features or in_features\n self.fc1 = nn.Linear(in_features, hidden_features)\n self.act = act_layer()\n self.fc2 = nn.Linear(hidden_features, out_features)\n self.drop = nn.Dropout(drop)\n def forward(self, x):\n x = self.fc1(x)\n x = self.act(x)\n x = self.drop(x)\n x = self.fc2(x)\n x = self.drop(x)\n return x\nclass Attention(nn.Layer):\n def __init__(self,\n dim,\n num_heads=8,\n qkv_bias=False,\n qk_scale=None,\n attn_drop=0.0,\n proj_drop=0.0):" + }, + { + "comment": "The code defines a class representing a self-attention module, with parameters like dimension (dim), number of heads (num_heads), and optional bias for the QKV linear layer (qkv_bias). The class initializes these attributes and defines its forward function to compute attention scores and output.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":104-137", + "content": " super().__init__()\n self.num_heads = num_heads\n head_dim = dim // num_heads\n self.scale = qk_scale or head_dim**-0.5\n self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n self.proj = nn.Linear(dim, dim)\n self.proj_drop = nn.Dropout(proj_drop)\n self.attn_drop = nn.Dropout(attn_drop)\n def forward(self, x):\n N, C = x.shape[1:]\n qkv = self.qkv(x).reshape(\n (-1, N, 3, self.num_heads, C // self.num_heads)).transpose(\n (2, 0, 3, 1, 4))\n q, k, v = qkv[0], qkv[1], qkv[2]\n attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n attn = nn.functional.softmax(attn, axis=-1)\n attn = self.attn_drop(attn)\n x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n x = self.proj(x)\n x = self.proj_drop(x)\n return x\nclass Block(nn.Layer):\n def __init__(self,\n dim,\n num_heads,\n mlp_ratio=4.0,\n qkv_bias=False," + }, + { + "comment": "The code above initializes an object with multiple parameters such as num_segments, fold_div, norm_layer, and attention_type. It also creates a norm1 layer based on the type of norm_layer provided (either a string or a Callable). 
If norm_layer is a string, it uses eval() to call the specified class, otherwise if it's a Callable, it directly initializes the layer with that function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":138-163", + "content": " qk_scale=None,\n drop=0.0,\n attn_drop=0.0,\n drop_path=0.1,\n act_layer=nn.GELU,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_segments = 8,\n fold_div = 4):\n #attention_type='divided_space_time',\n super().__init__()\n self.n_seg = num_segments #ckk\n self.foldP_div = fold_div #ckk\n #self.attention_type = attention_type\n if isinstance(norm_layer, str):\n self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale," + }, + { + "comment": "This code initializes the Temporal Attention parameters for the model. If the attention_type is 'divided_space_time', it creates a temporal normalization layer and an attention layer, as well as a fully connected layer for the temporal branch of the model. Drop paths are used for stochastic depth to reduce overfitting.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":164-185", + "content": " attn_drop=attn_drop,\n proj_drop=drop)\n # Temporal Attention Parameters\n '''\n if self.attention_type == 'divided_space_time':\n if isinstance(norm_layer, str):\n self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.temporal_norm1 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n self.temporal_attn = Attention(dim,\n num_heads=num_heads,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n attn_drop=attn_drop,\n proj_drop=drop)\n self.temporal_fc = nn.Linear(dim, dim)\n '''\n # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here" + }, + { + "comment": "This code initializes a ToShift ViT model. It creates a drop path layer, normalization layer, and MLP based on the given parameters. The `shuift_tk` function performs token shifting by reshaping the input, creating a mask with stop gradient attribute, and element-wise adding it to the original input. This helps in improving the model's performance by reducing the effect of irrelevant tokens during training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":186-212", + "content": " self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n if isinstance(norm_layer, str):\n self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)\n elif isinstance(norm_layer, Callable):\n self.norm2 = norm_layer(dim, epsilon=epsilon)\n else:\n raise TypeError(\n \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n mlp_hidden_dim = int(dim * mlp_ratio)\n self.mlp = Mlp(in_features=dim,\n hidden_features=mlp_hidden_dim,\n act_layer=act_layer,\n drop=drop)\n # token_shift\n def shuift_tk(self, x):\n t = self.n_seg\n bt, n, c = x.shape\n b = bt // t\n x = x.reshape([b, t, n, c]) #B T N C\n fold = c // self.foldP_div\n out = paddle.zeros_like(x)\n out.stop_gradient = True\n # print(\"#### fold \", fold)\n # print(out.shape)\n # print(x[:, 1:, 0, :fold].unsqueeze(2).shape)\n # print(out[:, :-1, 0:1, :fold].shape)" + }, + { + "comment": "This code defines a \"ToshiftVIT\" class, which appears to be a custom backbone for a Vision Transformer model. It includes a forward function that applies shift and drop path operations on the input, as well as a PatchEmbed class for image-to-patch embedding. The ToshiftVIT class also has an unknown \"shuift_tk\" function that seems to be used in the forward pass.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":213-244", + "content": " # exit(0)\n out[:, :-1, 0, :fold] = x[:, 1:, 0, :fold] # shift left\n out[:, 1:, 0, fold:2*fold] = x[:,:-1:, 0, fold:2*fold]\n out[:, :, 1:, :2*fold] = x[:, :, 1:, :2*fold]\n out[:, :, :, 2*fold:] = x[:, :, :, 2*fold:]\n return out.reshape([bt, n, c])\n def forward(self, x):\n x = self.shuift_tk(x)\n x = x + self.drop_path(self.attn(self.norm1(x)))\n x = self.shuift_tk(x)\n x = x + self.drop_path(self.mlp(self.norm2(x)))\n return x\nclass PatchEmbed(nn.Layer):\n \"\"\" Image to Patch Embedding\n \"\"\"\n def __init__(self,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768):\n super().__init__()\n img_size = to_2tuple(img_size)\n patch_size = to_2tuple(patch_size)\n num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //\n patch_size[0])\n self.img_size = img_size\n self.patch_size = patch_size" + }, + { + "comment": "This code defines a TokenShiftVisionTransformer class, which is a type of Vision Transformer model that supports patch input. The class has an initialization function where it sets the number of patches and creates a projection layer. It also includes a forward function for processing input data through the model. The assert statement ensures the input image size matches the expected model dimensions. 
The @BACKBONES.register() decorator registers the model with other backbones in the codebase.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":245-277", + "content": " self.num_patches = num_patches\n self.proj = nn.Conv2D(in_channels,\n embed_dim,\n kernel_size=patch_size,\n stride=patch_size)\n def forward(self, x):\n B, C, T, H, W = x.shape\n assert H == self.img_size[0] and W == self.img_size[1], \\\n f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n x = x.transpose((0, 2, 1, 3, 4))\n x = x.reshape([-1, C, H, W])\n x = self.proj(x)\n W = x.shape[-1]\n x = x.flatten(2).transpose((0, 2, 1))\n return x, T, W\n@BACKBONES.register()\nclass TokenShiftVisionTransformer(nn.Layer):\n \"\"\" Vision Transformer with support for patch input\n \"\"\"\n def __init__(self,\n pretrained=None,\n img_size=224,\n patch_size=16,\n in_channels=3,\n embed_dim=768,\n depth=12,\n num_heads=12,\n mlp_ratio=4," + }, + { + "comment": "This code is initializing a class for the Toshift ViT backbone. It sets parameters such as pretrained, num_seg, attention_type, embed_dim, and others. It creates PatchEmbed and positional embeddings (cls_token and pos_embed). The code also calculates the number of patches.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":278-304", + "content": " qkv_bias=False,\n qk_scale=None,\n drop_rate=0,\n attn_drop_rate=0.,\n drop_path_rate=0.1,\n norm_layer='nn.LayerNorm',\n epsilon=1e-5,\n num_seg=8,\n attention_type='divided_space_time',\n **args):\n super().__init__()\n self.pretrained = pretrained\n self.num_seg = num_seg\n self.attention_type = attention_type\n self.num_features = self.embed_dim = embed_dim\n self.patch_embed = PatchEmbed(img_size=img_size,\n patch_size=patch_size,\n in_channels=in_channels,\n embed_dim=embed_dim)\n num_patches = self.patch_embed.num_patches\n # Positional Embeddings\n self.cls_token = self.create_parameter(shape=(1, 1, embed_dim),\n default_initializer=zeros_)\n self.pos_embed = self.create_parameter(shape=(1, num_patches + 1," + }, + { + "comment": "This code initializes and sets up the parameters for a Transformer-based backbone model. It creates positional embeddings, dropout layers, and a list of transformer blocks with specified dimensions, numbers of heads, ratios, biases, scale factors, drop rates, attn drop rates, and drop path rates. These parameters are used to build the network architecture for processing data in downstream tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":305-329", + "content": " embed_dim),\n default_initializer=zeros_)\n self.pos_drop = nn.Dropout(p=drop_rate)\n if self.attention_type != 'space_only':\n self.time_embed = self.create_parameter(shape=(1, num_seg,\n embed_dim),\n default_initializer=zeros_)\n self.time_drop = nn.Dropout(p=drop_rate)\n self.add_parameter(\"pos_embed\", self.pos_embed)\n self.add_parameter(\"cls_token\", self.cls_token)\n dpr = np.linspace(0, drop_path_rate, depth)\n self.blocks = nn.LayerList([\n Block(dim=embed_dim,\n num_heads=num_heads,\n mlp_ratio=mlp_ratio,\n qkv_bias=qkv_bias,\n qk_scale=qk_scale,\n drop=drop_rate,\n attn_drop=attn_drop_rate,\n drop_path=dpr[i],\n norm_layer=norm_layer," + }, + { + "comment": "The code initializes a Toshift_VIT model with the specified number of segments and depth. 
It sets the attention type to 'divided_space_time' for certain blocks. The model's weights are then initialized using truncated normal distribution and the provided function, and any temporal FC layers in the respective block are set to zero. If a pretrained checkpoint is provided, it will be loaded.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":330-359", + "content": " epsilon=epsilon,\n num_segments= self.num_seg\n ) for i in range(depth)\n #attention_type=self.attention_type\n ])\n self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n trunc_normal_(self.pos_embed, std=0.02)\n trunc_normal_(self.cls_token, std=0.02)\n self.apply(self._init_fn)\n if self.attention_type == 'divided_space_time':\n i = 0\n for m in self.blocks.sublayers(include_self=True):\n m_str = str(m)\n if 'Block' in m_str:\n if i > 0:\n zeros_(m.temporal_fc.weight)\n zeros_(m.temporal_fc.bias)\n i += 1\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if isinstance(\n self.pretrained, str\n ) and self.pretrained.strip() != \"\": # load pretrained weights\n load_ckpt(self," + }, + { + "comment": "The code initializes a TOShiftViT model, defines an initialization function for the layers, and defines a forward_features function to process input feature maps. The function takes the number of patches from the patch embedding layer, expands the class token, concatenates it with the features, and applies positional embeddings if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":360-385", + "content": " self.pretrained,\n num_patches=self.patch_embed.num_patches,\n num_seg=self.num_seg,\n attention_type=self.attention_type)\n def _init_fn(self, m):\n if isinstance(m, nn.Linear):\n trunc_normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.LayerNorm):\n ones_(m.weight)\n zeros_(m.bias)\n def forward_features(self, x):\n # B = x.shape[0]\n B = paddle.shape(x)[0]\n x, T, W = self.patch_embed(x) # [BT,nH*nW,F]\n cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]\n x = paddle.concat((cls_tokens, x), axis=1)\n pos_interp = (x.shape[1] != self.pos_embed.shape[1])\n if pos_interp:\n pos_embed = self.pos_embed\n cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)\n other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(\n (0, 2, 1))" + }, + { + "comment": "This function reshapes and interpolates a positional embedding, adding it to the input if specified. 
It then performs positional dropout before passing through attention blocks and normalization layers, finally returning the forward pass of features.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/toshift_vit.py\":386-412", + "content": " P = int(other_pos_embed.shape[2]**0.5)\n H = x.shape[1] // W\n other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P])\n new_pos_embed = F.interpolate(other_pos_embed,\n size=(H, W),\n mode='nearest')\n new_pos_embed = new_pos_embed.flatten(2)\n new_pos_embed = new_pos_embed.transpose((0, 2, 1))\n new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed),\n axis=1)\n x = x + new_pos_embed\n else:\n x = x + self.pos_embed\n x = self.pos_drop(x)\n # Attention blocks\n for blk in self.blocks:\n x = blk(x)\n x = self.norm(x)\n return x[:, 0] # [B, embed_dim] -> [B*T, embed_dim]\n def forward(self, x):\n x = self.forward_features(x)\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f3177cee-e9ca-4e0b-9977-8a67079bc9c6.json b/docs/doc/f3177cee-e9ca-4e0b-9977-8a67079bc9c6.json new file mode 100644 index 000000000..1db256d89 --- /dev/null +++ b/docs/doc/f3177cee-e9ca-4e0b-9977-8a67079bc9c6.json @@ -0,0 +1,20 @@ +{ + "summary": "This code trains PaddleVideo models, imports libraries, defines command line arguments, and supports distributed training/testing based on --test argument.", + "details": [ + { + "comment": "This code snippet is the beginning of a Python script for PaddleVideo, specifically for training models. It imports necessary libraries and modules, defines a function to parse command line arguments, and sets up the argument parser.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/main.py\":0-29", + "content": "\"\"\"\n# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport paddle\nimport argparse\nfrom paddlevideo.utils import get_config\nfrom paddlevideo.tasks import train_model, test_model\nfrom paddlevideo.utils import get_dist_info\ndef parse_args():\n \"\"\"parse_args\"\"\"\n parser = argparse.ArgumentParser(\"PaddleVideo train script\")\n parser.add_argument('-c',\n '--config',\n type=str,\n default='configs/example.yaml'," + }, + { + "comment": "This code uses the ArgumentParser class to define and parse command-line arguments for a video quality assessment application. 
It allows specifying config file paths, overriding config options, testing a model, using DALI for training speedup, multigrid training, weights for finetuning or testing, and whether to use distributed training via fleet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/main.py\":30-51", + "content": " help='config file path')\n parser.add_argument('-o',\n '--override',\n action='append',\n default=[],\n help='config options to be overridden')\n parser.add_argument('--test',\n action='store_true',\n help='whether to test a model')\n parser.add_argument('--train_dali',\n action='store_true',\n help='whether to use dali to speed up training')\n parser.add_argument('--multigrid',\n action='store_true',\n help='whether to use multigrid training')\n parser.add_argument('-w',\n '--weights',\n type=str,\n help='weights for finetuning or testing')\n parser.add_argument('--fleet',\n action='store_true',\n help='whether to use fleet run distributed training')" + }, + { + "comment": "This code defines command line arguments for training and testing models, and initializes distributed parallel environment if necessary. Then it calls appropriate functions based on the --test argument value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/main.py\":52-87", + "content": " parser.add_argument('--amp',\n action='store_true',\n help='whether to open amp training.')\n parser.add_argument(\n '--validate',\n action='store_true',\n help='whether to evaluate the checkpoint during training')\n args = parser.parse_args()\n return args\ndef main():\n \"\"\"main\"\"\"\n args = parse_args()\n cfg = get_config(args.config, overrides=args.override)\n _, world_size = get_dist_info()\n parallel = world_size != 1\n if parallel:\n paddle.distributed.init_parallel_env()\n if args.test:\n test_model(cfg, weights=args.weights, parallel=parallel)\n else:\n train_model(cfg,\n weights=args.weights,\n parallel=parallel,\n validate=args.validate,\n fleet=args.fleet,\n amp=args.amp)\nif __name__ == '__main__':\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f3c38d5a-f9fd-434e-a15e-72f8972e53fe.json b/docs/doc/f3c38d5a-f9fd-434e-a15e-72f8972e53fe.json new file mode 100644 index 000000000..2e4f4aca4 --- /dev/null +++ b/docs/doc/f3c38d5a-f9fd-434e-a15e-72f8972e53fe.json @@ -0,0 +1,20 @@ +{ + "summary": "The code defines an attention-based LSTM feature reader for the FootballAction application in PaddleVideo, handling data reading from the youtube-8M dataset. It reads features from image, audio, and pcm_lists, concatenates them, yields batches, and continues even if exceptions occur.", + "details": [ + { + "comment": "This code is for an attention-based LSTM feature reader, used in the FootballAction application of PaddleVideo. It imports necessary libraries, handles potential import errors, and defines a class called FeatureReader which inherits from DataReader to handle data reading specifically for the youtube-8M dataset that contains features extracted by prior networks. The code is licensed under Apache 2.0 license.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/feature_reader.py\":0-32", + "content": "\"\"\"\nattention-lstm feature reader\n\"\"\"\n# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\ntry:\n import cPickle as pickle\n from cStringIO import StringIO\nexcept ImportError:\n import pickle\nimport numpy as np\nimport random\nimport code\nfrom .reader_utils import DataReader\nclass FeatureReader(DataReader):\n \"\"\"\n Data reader for youtube-8M dataset, which was stored as features extracted by prior networks\n This is for the three models: lstm, attention cluster, nextvlad" + }, + { + "comment": "This code initializes a feature reader object for the FootballAction application, taking in parameters such as name, mode, and configuration (cfg). It then creates a reader function that iterates through proposal features and extracts relevant data based on start and end IDs. The extracted features are stored in batch_out.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/feature_reader.py\":34-70", + "content": " dataset cfg: num_classes\n batch_size\n list\n NextVlad only: eigen_file\n \"\"\"\n def __init__(self, name, mode, cfg, material=None):\n self.name = name\n self.mode = mode\n self.batch_size = cfg[self.name.upper()]['batch_size']\n self.feature = material['feature']\n self.proposal = material['proposal']\n self.fps = 5\n def create_reader(self):\n \"\"\"\n create_reader\n \"\"\"\n image_feature_list = self.feature['image_feature']\n audio_feature_list = self.feature['audio_feature']\n pcm_feature_list = self.feature['pcm_feature']\n pcm_feature_list = pcm_feature_list.reshape((pcm_feature_list.shape[0] * 5, 640))\n fl = self.proposal\n if self.mode == 'train':\n random.shuffle(fl)\n def reader():\n \"\"\"\n reader\n \"\"\"\n batch_out = []\n for prop_info in fl:\n start_id = int(prop_info['start'])\n end_id = int(prop_info['end'])" + }, + { + "comment": "This code reads features from image, audio, and pcm_feature lists based on start_id and end_id. It concatenates the image and pcm_features along axis=1. If batch_size is reached, it yields the batch and resets the batch_out list. 
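The slicing-and-batching pattern summarised here reduces to a small generator; the sketch below is a generic stand-in (the helper name `batch_proposals` and the toy data are illustrative, not taken from feature_reader.py).

```python
import numpy as np

def batch_proposals(proposals, image_feats, audio_feats, batch_size, fps=5):
    """Yield lists of (image_feature, audio_feature, proposal) tuples.

    Image features are indexed per frame, audio features per second,
    mirroring the start/end slicing described above. Only full batches
    are yielded in this sketch.
    """
    batch = []
    for prop in proposals:
        start, end = int(prop["start"]), int(prop["end"])
        image_feature = image_feats[start:end]
        audio_feature = audio_feats[start // fps:end // fps]
        batch.append((image_feature, audio_feature, prop))
        if len(batch) == batch_size:
            yield batch
            batch = []

# toy usage: 100 frames of 2048-d image features, 20 s of 128-d audio features
props = [{"start": i, "end": i + 10} for i in range(0, 90, 10)]
for b in batch_proposals(props, np.zeros((100, 2048)), np.zeros((20, 128)), batch_size=4):
    print(len(b))
```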
The code continues even if an exception occurs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/predict/action_detect/reader/feature_reader.py\":71-85", + "content": " bmn_score = float(prop_info['score'])\n try:\n image_feature = image_feature_list[start_id: end_id]\n audio_feature = audio_feature_list[int(start_id / self.fps): int(end_id / self.fps)]\n pcm_feature = pcm_feature_list[start_id: end_id]\n # image_feature = np.concatenate((image_feature, pcm_feature), axis=1)\n batch_out.append((image_feature, audio_feature, 0, prop_info))\n if len(batch_out) == self.batch_size:\n yield batch_out\n batch_out = []\n except Exception as e:\n continue\n return reader" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f5b5215c-464b-4ef3-bbce-265eda1cb10b.json b/docs/doc/f5b5215c-464b-4ef3-bbce-265eda1cb10b.json new file mode 100644 index 000000000..e5c4e01c1 --- /dev/null +++ b/docs/doc/f5b5215c-464b-4ef3-bbce-265eda1cb10b.json @@ -0,0 +1,15 @@ +{ + "summary": "This is a shell script for PaddlePaddle, setting CUDA_VISIBLE_DEVICES and executing commands like training, testing, and exporting models using tsm architecture. It also mentions running predict.py on example.avi with model files and disabling benchmarking for \"example\" model with 8 segments.", + "details": [ + { + "comment": "The code is a shell script that sets the CUDA_VISIBLE_DEVICES environment variable and executes PaddlePaddle commands for training, testing, finetuning, resuming and exporting/predicting models. The commands use specific configurations (yaml files) for recognition tasks using tsm architecture. It mentions the file paths where necessary, such as the log directory, model weights, and output directories.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/run.sh\":0-18", + "content": "export CUDA_VISIBLE_DEVICES=0\n# run training\npython3.7 -B -m paddle.distributed.launch --gpus=\"0\" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/tsm/pptsm_regression.yaml\n# run testing\n#python3.7 -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_pptsm main.py -c configs/recognition/tsm/pptsm_regression.yaml --test --weights=output/model_name/ppTSM_best.pdparams\n#finetune\n#python3 -m paddle.distributed.launch --gpus=\"0,1,2,3\" main.py --amp -c ./configs/recognition/tsm/pptsm_regression.yaml --validate --weights=./output/model_name/ppTSM_best.pdparams\n#resume\n#python3 -m paddle.distributed.launch --gpus=\"0,1,2,3\" main.py --amp -c ./configs/recognition/tsm/pptsm_regression.yaml --validate -o resume_epoch=2\n# export_models script\n# just use `example` as example, please replace to real name.\n#python3.7 tools/export_model.py -c configs/example.yaml -p output/model_name/ppTSM_best.pdparams -o ./inference\n# predict script\n# just use `example` as example, please replace to real name." 
+ }, + { + "comment": "Running predict.py script on example.avi with specified model files and disabling benchmarking for the \"example\" model with 8 segments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/run.sh\":19-19", + "content": "#python3.7 tools/predict.py -v example.avi --model_file \"./inference/example.pdmodel\" --params_file \"./inference/example.pdiparams\" --enable_benchmark=False --model=\"example\" --num_seg=8" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f626dc61-6e8a-4e48-96eb-e4cb181d2383.json b/docs/doc/f626dc61-6e8a-4e48-96eb-e4cb181d2383.json new file mode 100644 index 000000000..e280ffa43 --- /dev/null +++ b/docs/doc/f626dc61-6e8a-4e48-96eb-e4cb181d2383.json @@ -0,0 +1,40 @@ +{ + "summary": "This code uses PaddlePaddle for training, imports modules, defines logging functions, and trains with a dataloader, iterating over batches to track progress, log metrics, profile performance, handle errors, and save model progress.", + "details": [ + { + "comment": "This code snippet is likely part of a larger program that uses the PaddlePaddle framework. It imports several modules, defines a function called `log_lr_and_step()`, and sets up a logger object. The purpose of this particular block may be to handle logging learning rate values and tracking the training step during the model's optimization process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport sys\nimport time\nimport numpy as np\nimport paddle\nimport paddle.static as static\nimport paddle.profiler as profiler\nimport logging\nimport shutil\nlogger = logging.getLogger(__name__)\ndef log_lr_and_step():\n try:\n # In optimizers, if learning_rate is set as constant, lr_var\n # name is 'learning_rate_0', and iteration counter is not\n # recorded. If learning_rate is set as decayed values from" + }, + { + "comment": "This code retrieves the learning rate and learning rate counter from the global scope. 
It prints their values, but handles potential exceptions if they cannot be found or accessed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py\":32-56", + "content": " # learning_rate_scheduler, lr_var name is 'learning_rate',\n # and iteration counter is recorded with name '@LR_DECAY_COUNTER@',\n # better impliment is required here\n lr_var = static.global_scope().find_var(\"learning_rate\")\n if not lr_var:\n lr_var = static.global_scope().find_var(\"learning_rate_0\")\n lr = np.array(lr_var.get_tensor())\n lr_count = '[-]'\n lr_count_var = static.global_scope().find_var(\"@LR_DECAY_COUNTER@\")\n if lr_count_var:\n lr_count = np.array(lr_count_var.get_tensor())\n logger.info(\n \"------- learning rate {}, learning rate counter {} -----\".format(\n np.array(lr), np.array(lr_count)))\n except:\n logger.warn(\"Unable to get learning_rate and LR_DECAY_COUNTER.\")\ndef test_with_dataloader(exe,\n compiled_test_prog,\n test_dataloader,\n test_fetch_list,\n test_metrics,\n log_interval=0," + }, + { + "comment": "This code defines a function to train a model with a dataloader. It takes an executor, training program, compiled training program, train dataloader, train fetch list, and train metrics as inputs. The function iterates over the dataloader, runs the training program for each data batch, accumulates metrics, logs intermediate results if specified, and finalizes and logs out when finished.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py\":57-79", + "content": " save_model_name=''):\n if not test_dataloader:\n logger.error(\"[TEST] get dataloader failed.\")\n test_metrics.reset()\n test_iter = 0\n for data in test_dataloader():\n test_outs = exe.run(compiled_test_prog,\n fetch_list=test_fetch_list,\n feed=data)\n test_metrics.accumulate(test_outs)\n if log_interval > 0 and test_iter % log_interval == 0:\n test_metrics.calculate_and_log_out(test_outs, \\\n info = '[TEST] test_iter {} '.format(test_iter))\n test_iter += 1\n test_metrics.finalize_and_log_out(\"[TEST] Finish\")\ndef train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \\\n train_fetch_list, train_metrics, epochs = 10, \\\n log_interval = 0, valid_interval = 0, save_dir = './', \\\n num_trainers = 1, trainer_id = 0, \\\n save_model_name = 'model', fix_random_seed = False, \\" + }, + { + "comment": "This code initializes variables and starts a loop over epochs. 
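The per-iteration timing described in this summary amounts to stamping the clock around each step; a minimal sketch follows, where the `run_step` callable is a hypothetical stand-in for the executor call, not a real PaddlePaddle API.

```python
import time

def timed_epoch(data_iter, run_step, log_interval=10):
    """Run one epoch, recording the wall-clock time of every step."""
    periods = []
    cur = time.time()
    for i, batch in enumerate(data_iter):
        run_step(batch)
        periods.append(time.time() - cur)
        if log_interval > 0 and i % log_interval == 0:
            print(f"iter {i}, time {periods[-1]:.4f}s")
        cur = time.time()
    return periods

# toy usage: the "model step" just sleeps for a millisecond
periods = timed_epoch(range(25), run_step=lambda b: time.sleep(0.001))
print(f"average step time (excluding first): {sum(periods[1:]) / len(periods[1:]):.4f}s")
```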
Inside the loop, it logs the learning rate and step, iterates through the training dataloader, runs the compiled program with fetched data, records the time taken for each iteration, and optionally uses profiler tools for benchmarking.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py\":80-108", + "content": " compiled_test_prog = None, test_dataloader = None, \\\n test_fetch_list = None, test_metrics = None, \\\n is_profiler = None, profiler_path = None):\n if not train_dataloader:\n logger.error(\"[TRAIN] get dataloader failed.\")\n epoch_periods = []\n train_loss = 0\n # NOTE: profiler tools, used for benchmark\n if is_profiler:\n prof = profiler.Profiler()\n for epoch in range(epochs):\n log_lr_and_step()\n train_iter = 0\n epoch_periods = []\n cur_time = time.time()\n for data in train_dataloader():\n if is_profiler and train_iter == log_interval:\n prof.start()\n train_outs = exe.run(compiled_train_prog,\n fetch_list=train_fetch_list,\n feed=data)\n period = time.time() - cur_time\n epoch_periods.append(period)\n timeStamp = time.time()\n localTime = time.localtime(timeStamp)" + }, + { + "comment": "This code segment tracks the training progress of a video analysis algorithm. It logs and calculates metrics at specified intervals, profiles performance if desired, and saves model progress after each epoch. If no iterations are executed, it alerts and exits with an error message to check data reader.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py\":109-134", + "content": " strTime = time.strftime(\"%Y-%m-%d %H:%M:%S\", localTime)\n if log_interval > 0 and (train_iter % log_interval == 0):\n train_metrics.calculate_and_log_out(train_outs, \\\n info = '[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime, epoch, train_iter, period))\n train_iter += 1\n cur_time = time.time()\n if is_profiler:\n prof.step()\n if train_iter == log_interval + 5:\n prof.stop()\n prof.export(path=profiler_path, format=\"json\")\n return\n if len(epoch_periods) < 1:\n logger.info(\n 'No iteration was executed, please check the data reader')\n sys.exit(1)\n logger.info(\n '[TRAIN] Epoch {} training finished, average time: {}'.format(\n epoch, np.mean(epoch_periods[1:])))\n if trainer_id == 0:\n save_model(exe, train_prog, save_dir, save_model_name,\n \"_epoch{}\".format(epoch))" + }, + { + "comment": "The code is checking if it's time to test the program, saving the model if it's trainer 0, and fixing the random seed for debugging. It also saves the model with a specified name in the given directory. 
The code appears to be part of a larger deep learning training process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py\":135-158", + "content": " if compiled_test_prog and valid_interval > 0 and (\n epoch + 1) % valid_interval == 0:\n test_with_dataloader(exe, compiled_test_prog, test_dataloader,\n test_fetch_list, test_metrics, log_interval,\n save_model_name)\n if trainer_id == 0:\n save_model(exe, train_prog, save_dir, save_model_name)\n #when fix_random seed for debug\n if fix_random_seed:\n cards = os.environ.get('CUDA_VISIBLE_DEVICES')\n gpu_num = len(cards.split(\",\"))\n print(\"kpis\\ttrain_cost_card{}\\t{}\".format(gpu_num, train_loss))\n print(\"kpis\\ttrain_speed_card{}\\t{}\".format(gpu_num,\n np.mean(epoch_periods)))\ndef save_model(exe, program, save_dir, model_name, postfix=''):\n \"\"\"save paramters and optimizer related varaibles\"\"\"\n if not os.path.isdir(save_dir):\n os.makedirs(save_dir)\n saved_model_name = model_name + postfix\n paddle.static.save(program, os.path.join(save_dir, saved_model_name))" + }, + { + "comment": "The code snippet seems to be incomplete as it only contains a single line, which is the return statement. Without seeing the context or surrounding code, it's difficult to provide an accurate comment. Can you please provide more information or additional lines of code?", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/utils/train_utils.py\":160-160", + "content": " return" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f63ffb7f-05fc-4a28-97a0-f967428fef40.json b/docs/doc/f63ffb7f-05fc-4a28-97a0-f967428fef40.json new file mode 100644 index 000000000..7a7a9ceac --- /dev/null +++ b/docs/doc/f63ffb7f-05fc-4a28-97a0-f967428fef40.json @@ -0,0 +1,40 @@ +{ + "summary": "This code presents PP-TSM, an optimized TSM model for action recognition on UCF101 and Kinetics-400 datasets using PaddlePaddle and ResNet101 as backbone. It offers pre-trained models for video classification inference and predicts 'archery' as top1 class for 'example.avi'.", + "details": [ + { + "comment": "This code describes the PP-TSM model, an optimized version of TSM for action recognition. It significantly improves accuracy in UCF101 and Kinetics-400 datasets without increasing parameters. Two sampling methods are used, Dense and Uniform, with respective top1 accuracies shown.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md\":0-30", + "content": "[\u7b80\u4f53\u4e2d\u6587](../../../zh-CN/model_zoo/recognition/pp-tsm.md) | English\n# PP-TSM\n---\n## Contents\n- [Introduction](#Introduction)\n- [Data](#Data)\n- [Train](#Train)\n- [Test](#Test)\n- [Inference](#Inference)\n- [Reference](#Reference)\n## Introduction\nWe optimized TSM model and proposed **PP-TSM** in this repo. Without increasing the number of parameters, the accuracy of TSM was significantly improved in UCF101 and Kinetics-400 datasets. 
Please refer to [**Tricks on PP-TSM**](https://zhuanlan.zhihu.com/p/382134297) for more details.\n| Version | Sampling method | Top1 |\n| :------ | :----------: | :----: |\n| Ours (distill) | Dense | **76.16** |\n| Ours | Dense | 75.69 |\n| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Dense | 74.55 |\n| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Dense | 74.1 |\n| Version | Sampling method | Top1 |\n| :------ | :----------: | :----: |\n| Ours (distill) | Uniform | **75.11** |\n| Ours | Uniform | 74.54 |\n| [mmaction" + }, + { + "comment": "Code snippet provides a guide for training TSM model on Kinetics-400 and UCF101 datasets. It explains how to download the pre-trained ResNet50_vd_ssld_v2 model, specifies the configuration file modification required, and provides links to related data preparation documents.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md\":30-63", + "content": "2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Uniform | 71.90 |\n| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Uniform | 71.16 |\n## Data\nPlease refer to Kinetics400 data download and preparation doc [k400-data](../../dataset/K400.md)\nPlease refer to UCF101 data download and preparation doc [ucf101-data](../../dataset/ucf101.md)\n## Train\n### Train on kinetics-400\n#### download pretrain-model\nPlease download [ResNet50_vd_ssld_v2](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as pretraind model:\n```bash\nwget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams\n```\nand add path to `MODEL.framework.backbone.pretrained` in config file as\uff1a\n```yaml\nMODEL:\n framework: \"Recognizer2D\"\n backbone:\n name: \"ResNetTweaksTSM\"\n pretrained: your weight path\n```\n- If use ResNet101 as backbone, please download [ResNet101_vd_ssld_pretrained." 
+ }, + { + "comment": "Loading pretrained model \"pdparams\" from the provided link.\nStarting training for PP-TSM on kinetics-400 using specified scripts and configurations.\nUsing AMP to speed up training.\nTraining with dense sampling also available.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md\":63-89", + "content": "pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams) as pretraind model.\n#### Start training\n- Train PP-TSM on kinetics-400 scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml\n```\n- Train PP-TSM on kinetics-400 video data using scripts:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml\n```\n- AMP is useful for speeding up training:\n```bash\nexport FLAGS_conv_workspace_size_limit=800 #MB\nexport FLAGS_cudnn_exhaustive_search=1\nexport FLAGS_cudnn_batchnorm_spatial_persistent=1\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml\n```\n- Train PP-TSM on kinetics-400 with dense sampling:" + }, + { + "comment": "This code is used to train and test the PP-TSM model on Kinetics-400 dataset. The training process utilizes PaddlePaddle distributed launch, with GPUs 0-7 for execution. It uses ResNet101 as backbone and dense sampling method for training. To obtain test accuracy, a separate script is used, specifying the configuration file and weight file path. The code also displays accuracy metrics in terms of backbone, distillation, sampling method, number of segments, target size, and top-1 accuracy for the Kinetics400 dataset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md\":91-121", + "content": "```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml\n```\n- Train PP-TSM on kinetics-400 with ResNet101 as backbone using dense sampling:\n```bash\npython3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml\n```\n## Test\n- For uniform sampling, test accuracy can be found in training-logs by search key word `best`, such as:\n```txt\nAlready save the best model (top1 acc)0.7454\n```\n- For dense sampling, test accuracy can be obtained using scripts:\n```bash\npython3 main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml -w output/ppTSM/ppTSM_best.pdparams\n```\nAccuracy on Kinetics400:\n| backbone | distill | Sampling method | num_seg | target_size | Top-1 | checkpoints |\n| :------: | :----------: | :----: | :----: | :----: | :----: | :---- |\n| ResNet50 | False | Uniform" + }, + { + "comment": "This code is a table of pre-trained models for PaddlePaddle Temporal Shift Module (ppTSM) with different configurations. Models are based on ResNet50 and ResNet101 architectures, using both uniform and dense distillation methods. They have different parameters, input sizes, and accuracy levels. 
The pdparams files are the pre-trained model weights available for download from specified URLs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md\":121-126", + "content": " | 8 | 224 | 74.54 | [ppTSM_k400_uniform.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams) |\n| ResNet50 | False | Dense | 8 | 224 | 75.69 | [ppTSM_k400_dense.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense.pdparams) |\n| ResNet50 | True | Uniform | 8 | 224 | 75.11 | [ppTSM_k400_uniform_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) |\n| ResNet50 | True | Dense | 8 | 224 | 76.16 | [ppTSM_k400_dense_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) |\n| ResNet101 | True | Uniform | 8 | 224 | 76.35 | [ppTSM_k400_uniform_distill_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_uniform_distill_r101.pdparams) |\n| ResNet101 | False | Dense | 8 | 224 | 77.15 | [ppTSM_k400_dense_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_dense_r101.pdparams) |" + }, + { + "comment": "This code exports the PPTSM model for inference and demonstrates how to use it for video classification. It requires the user to run two commands: one to export the model architecture file (ppTSM.pdmodel) and parameters file (ppTSM.pdiparams), and another to use the model for prediction on a video file (example.avi). The predicted output includes the top-1 class and its corresponding score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md\":128-158", + "content": "## Inference\n### export inference model\n To get model architecture file `ppTSM.pdmodel` and parameters file `ppTSM.pdiparams`, use:\n```bash\npython3.7 tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n -p data/ppTSM_k400_uniform.pdparams \\\n -o inference/ppTSM\n```\n- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n### infer\n```bash\npython3.7 tools/predict.py --input_file data/example.avi \\\n --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n --model_file inference/ppTSM/ppTSM.pdmodel \\\n --params_file inference/ppTSM/ppTSM.pdiparams \\\n --use_gpu=True \\\n --use_tensorrt=False\n```\nexample of logs:\n```\nCurrent video file: data/example.avi\n\ttop-1 class: 5\n\ttop-1 score: 0.9907386302947998" + }, + { + "comment": "The code retrieves the class name from class id and a map file, then shows that the top1 prediction of 'data/example.avi' is 'archery'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/model_zoo/recognition/pp-tsm.md\":159-166", + "content": "```\nwe can get the class name using class id and map file `data/k400/Kinetics-400_label_list.txt`. 
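A small sketch of that id-to-name lookup is shown below; it assumes each line of the label list holds "class_id class_name", which may differ from the actual file layout, so the real file should be checked before use.

```python
def load_label_map(path):
    """Parse a label list where each line is '<class_id> <class_name>'."""
    label_map = {}
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            idx, name = line.strip().split(maxsplit=1)
            label_map[int(idx)] = name
    return label_map

# hypothetical usage with the Kinetics-400 label list
# label_map = load_label_map("data/k400/Kinetics-400_label_list.txt")
# print(label_map[5])  # expected to print "archery" for the example video
```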
The top1 prediction of `data/example.avi` is `archery`.\n## Reference\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han\n- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f7463aec-0911-4b21-8ae9-c3076545d82b.json b/docs/doc/f7463aec-0911-4b21-8ae9-c3076545d82b.json new file mode 100644 index 000000000..9e391dc9a --- /dev/null +++ b/docs/doc/f7463aec-0911-4b21-8ae9-c3076545d82b.json @@ -0,0 +1,70 @@ +{ + "summary": "This code defines custom loss functions for video modeling, including TMSE and GSTMSE, with the ActionSegmentationLoss class applying various criteria like regression, classification, and temporal segmentation losses.", + "details": [ + { + "comment": "The code defines a class TMSE, which is a temporal MSE loss function. It's inspired by the MS-TCN method proposed in CVPR2019 for action segmentation tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":0-31", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# https://github.com/yiskw713/asrf/libs/loss_fn/__init__.py\nimport numpy as np\nimport pandas as pd\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport sys\nimport os\nfrom ..registry import LOSSES\nclass TMSE(nn.Layer):\n \"\"\"\n Temporal MSE Loss Function\n Proposed in Y. A. Farha et al. MS-TCN: Multi-Stage Temporal Convolutional Network for ActionSegmentation in CVPR2019\n arXiv: https://arxiv.org/pdf/1903.01945.pdf" + }, + { + "comment": "The code defines two classes: ASRF_Loss and GaussianSimilarityTMSE. The first class represents the Average Symmetric Ranking Forest Loss, while the second class is a Temporal MSE Loss Function with Gaussian Similarity Weighting. Both classes inherit from nn.Layer and have an __init__ method for initialization, as well as a forward method for calculating losses. 
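The temporal-smoothing idea behind TMSE can be written compactly; the numpy sketch below follows the MS-TCN paper's description (clipped MSE between log-probabilities of neighbouring frames, with the softmax over the class axis) and is a standalone illustration, not the paddle implementation quoted in these entries.

```python
import numpy as np

def log_softmax(x, axis):
    x = x - x.max(axis=axis, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

def temporal_mse(pred, threshold=4.0):
    """Clipped MSE between log-probabilities of neighbouring frames.

    pred: (C, T) class scores for one video. Penalises abrupt changes
    between frame t-1 and frame t, capped at threshold**2.
    """
    logp = log_softmax(pred, axis=0)
    diff = (logp[:, 1:] - logp[:, :-1]) ** 2
    return np.clip(diff, 0, threshold ** 2).mean()

print(temporal_mse(np.random.randn(10, 50)))
```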
The ASRF_Loss class uses an MSELoss function to calculate loss between predicted and ground truth frames, while the GaussianSimilarityTMSE class calculates temporal MSE with Gaussian similarity weighting.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":32-65", + "content": " \"\"\"\n def __init__(self, threshold=4, ignore_index=255):\n super().__init__()\n self.threshold = threshold\n self.ignore_index = ignore_index\n self.mse = nn.MSELoss(reduction=\"none\")\n def forward(self, preds, gts):\n total_loss = 0.0\n batch_size = preds.shape[0]\n for pred, gt in zip(preds, gts):\n pred = paddle.gather(pred,\n paddle.nonzero(gt != self.ignore_index)[:, 0])\n loss = self.mse(F.log_softmax(pred[:, 1:], axis=1),\n F.log_softmax(pred[:, :-1], axis=1))\n loss = paddle.clip(loss, min=0, max=self.threshold**2)\n total_loss += paddle.mean(loss)\n return total_loss / batch_size\nclass GaussianSimilarityTMSE(nn.Layer):\n \"\"\"\n Temporal MSE Loss Function with Gaussian Similarity Weighting\n \"\"\"\n def __init__(self, threshold=4, sigma=1.0, ignore_index=255):\n super().__init__()\n self.threshold = threshold\n self.ignore_index = ignore_index" + }, + { + "comment": "This code calculates a temporal MSE loss weighted by Gaussian similarity. It uses Paddle's nn.MSELoss function, with reduction set to \"none\". The forward method takes in the model's predictions (before softmax), ground truth and similarity index as inputs. It loops through each input, performs non-zero checks for gt != ignore_index, then gathers the relevant rows from the predicted values. It calculates gaussian similarity using the gathered data and the given sigma value. The calculated loss is returned.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":66-91", + "content": " self.mse = nn.MSELoss(reduction=\"none\")\n self.sigma = sigma\n def forward(self, preds, gts, sim_index):\n \"\"\"\n Args:\n preds: the output of model before softmax. (N, C, T)\n gts: Ground Truth. (N, T)\n sim_index: similarity index. (N, C, T)\n Return:\n the value of Temporal MSE weighted by Gaussian Similarity.\n \"\"\"\n total_loss = 0.0\n batch_size = preds.shape[0]\n for pred, gt, sim in zip(preds, gts, sim_index):\n pred = paddle.gather(pred,\n paddle.nonzero(gt != self.ignore_index)[:, 0],\n axis=1)\n sim = paddle.gather(sim,\n paddle.nonzero(gt != self.ignore_index)[:, 0],\n axis=1)\n # calculate gaussian similarity\n diff = sim[:, 1:] - sim[:, :-1]\n similarity = paddle.exp(\n (-1 * paddle.norm(diff, axis=0)) / (2 * self.sigma**2))" + }, + { + "comment": "This code defines a class \"ASRF_Loss\" for calculating ASRF loss using temporal MSE and Gaussian similarity weighting. 
It also defines a class \"FocalLoss\" for focal loss calculation using CrossEntropyLoss with custom gamma and alpha parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":93-125", + "content": " # calculate temporal mse\n loss = self.mse(F.log_softmax(pred[:, 1:], axis=1),\n F.log_softmax(pred[:, :-1], axis=1))\n loss = paddle.clip(loss, min=0, max=self.threshold**2)\n # gaussian similarity weighting\n loss = similarity * loss\n total_loss += paddle.mean(loss)\n return total_loss / batch_size\nclass FocalLoss(nn.Layer):\n def __init__(self,\n weight=None,\n size_average=True,\n batch_average=True,\n ignore_index=255,\n gamma=2.0,\n alpha=0.25):\n super().__init__()\n self.gamma = gamma\n self.alpha = alpha\n self.batch_average = batch_average\n self.criterion = nn.CrossEntropyLoss(weight=weight,\n ignore_index=ignore_index,\n size_average=size_average)\n def forward(self, logit, target):\n n, _, _ = logit.size()" + }, + { + "comment": "This code defines an ActionSegmentationLoss class, which is a loss function for action segmentation tasks. It allows the user to choose from various loss functions including Cross Entropy Loss (CE), Focal Loss, Temporal MSE (TMSE), and Gaussian Similarity TMSE (GSTMSE). The user can specify parameters such as num_classes, file_path, label_path, ce, focal, tmse, gstmse, weight, threshold, ignore_index, ce_weight, focal_weight, and tmse_weight. The class initializes the chosen loss functions and calculates the overall loss based on user inputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":127-166", + "content": " logpt = -self.criterion(logit, target.long())\n pt = paddle.exp(logpt)\n if self.alpha is not None:\n logpt *= self.alpha\n loss = -((1 - pt)**self.gamma) * logpt\n if self.batch_average:\n loss /= n\n return loss\nclass ActionSegmentationLoss(nn.Layer):\n \"\"\"\n Loss Function for Action Segmentation\n You can choose the below loss functions and combine them.\n - Cross Entropy Loss (CE)\n - Focal Loss\n - Temporal MSE (TMSE)\n - Gaussian Similarity TMSE (GSTMSE)\n \"\"\"\n def __init__(self,\n num_classes,\n file_path,\n label_path,\n ce=True,\n focal=True,\n tmse=False,\n gstmse=False,\n weight=None,\n threshold=4.,\n ignore_index=255,\n ce_weight=1.0,\n focal_weight=1.0,\n tmse_weight=0.15,\n gstmse_weight=0.15):\n super().__init__()" + }, + { + "comment": "The code initializes criterions and weights for different loss functions based on the provided parameters. It adds CrossEntropyLoss, FocalLoss, TMSE, and GaussianSimilarityTMSE to self.criterions list, and their corresponding weights to self.weights list. The weight parameter determines whether class weights are used in CrossEntropyLoss. 
Ignore_index is added for all loss functions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":167-197", + "content": " self.criterions = []\n self.weights = []\n self.num_classes = num_classes\n self.file_path = file_path\n self.label_path = label_path\n if weight:\n class_weight = self.get_class_weight()\n else:\n class_weight = None\n if ce:\n self.criterions.append(\n nn.CrossEntropyLoss(weight=class_weight,\n ignore_index=ignore_index))\n self.weights.append(ce_weight)\n if focal:\n self.criterions.append(FocalLoss(ignore_index=ignore_index))\n self.weights.append(focal_weight)\n if tmse:\n self.criterions.append(\n TMSE(threshold=threshold, ignore_index=ignore_index))\n self.weights.append(tmse_weight)\n if gstmse:\n self.criterions.append(\n GaussianSimilarityTMSE(threshold=threshold,\n ignore_index=ignore_index))\n self.weights.append(gstmse_weight)" + }, + { + "comment": "The code snippet loads file information from a given path and calculates class weights for CrossEntropy loss function, based on the method described in the Eigen and Fergus paper. It reads file names and their corresponding labels, and stores them as lists for later use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":199-220", + "content": " if len(self.criterions) == 0:\n print(\"You have to choose at least one loss function.\")\n sys.exit(1)\n def get_class_weight(self):\n \"\"\"\n Class weight for CrossEntropy\n Class weight is calculated in the way described in:\n D. Eigen and R. Fergus, \u201cPredicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture,\u201d in ICCV,\n openaccess: https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Eigen_Predicting_Depth_Surface_ICCV_2015_paper.pdf\n \"\"\"\n # load file list\n file_ptr = open(self.file_path, 'r')\n info = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n nums = [0 for i in range(self.num_classes)]\n for i in range(len(info)):\n video_name = info[i]\n file_name = video_name.split('.')[0] + \".npy\"\n label_file_path = os.path.join(self.label_path, file_name)\n label = np.load(label_file_path).astype(np.int64)" + }, + { + "comment": "This code defines a class that calculates class weights based on the frequency of occurrence in labels. It also includes a forward function for applying different loss functions to predictions and ground truths, with associated weights. 
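Median-frequency balancing, as referenced above, is easy to state in numpy; the sketch below uses a toy label array as a stand-in for the per-video .npy files.

```python
import numpy as np

def median_frequency_weights(labels, num_classes):
    """Class weights w_c = median(freq) / freq_c (Eigen & Fergus style).

    Assumes every class occurs at least once in `labels`.
    """
    counts = np.bincount(labels, minlength=num_classes).astype("float64")
    freq = counts / counts.sum()
    return np.median(freq) / freq

labels = np.array([0] * 80 + [1] * 15 + [2] * 5)   # an imbalanced toy label set
print(median_frequency_weights(labels, num_classes=3))   # rare classes get larger weights
```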
The criterion types include GaussianSimilarityTMSE and nn.CrossEntropyLoss.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":221-247", + "content": " num, cnt = np.unique(label, return_counts=True)\n for n, c in zip(num, cnt):\n nums[n] += c\n class_num = paddle.to_tensor(nums, dtype=\"float32\")\n total = class_num.sum().item()\n frequency = class_num / total\n median = paddle.median(frequency)\n class_weight = median / frequency\n return class_weight\n def forward(self, preds, gts, sim_index):\n \"\"\"\n Args:\n preds: paddle.float (N, C, T).\n gts: paddle.int64 (N, T).\n sim_index: paddle.float (N, C', T).\n \"\"\"\n loss = 0.0\n for criterion, weight in zip(self.criterions, self.weights):\n if isinstance(criterion, GaussianSimilarityTMSE):\n loss += weight * criterion(preds, gts, sim_index)\n elif isinstance(criterion, nn.CrossEntropyLoss):\n preds_t = paddle.transpose(preds, perm=[0, 2, 1])\n loss += weight * criterion(preds_t, gts)\n else:\n loss += weight * criterion(preds, gts)" + }, + { + "comment": "This class defines a boundary regression loss function, which combines different loss types such as Binary Cross Entropy Loss (bce), Mean Squared Error (mse) and Focal Loss (focal). It initializes with file_path, label_path, bce, focal, mse, weight and pos_weight parameters. The get_pos_weight method retrieves a position weight depending on the norm parameter. If at least one loss function is chosen, the criterions list is created. If not, it prints an error message and exits the program.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":249-290", + "content": " return loss\nclass BoundaryRegressionLoss(nn.Layer):\n \"\"\"\n Boundary Regression Loss\n bce: Binary Cross Entropy Loss for Boundary Prediction\n mse: Mean Squared Error\n \"\"\"\n def __init__(self,\n file_path,\n label_path,\n bce=True,\n focal=False,\n mse=False,\n weight=None,\n pos_weight=None):\n super().__init__()\n self.criterions = []\n self.file_path = file_path\n self.label_path = label_path\n pos_weight = self.get_pos_weight()\n if bce:\n self.criterions.append(\n nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight))\n if focal:\n self.criterions.append(FocalLoss())\n if mse:\n self.criterions.append(nn.MSELoss())\n if len(self.criterions) == 0:\n print(\"You have to choose at least one loss function.\")\n sys.exit(1)\n def get_pos_weight(self, norm=None):\n \"\"\"" + }, + { + "comment": "This code calculates the positive weight for binary cross-entropy with logits loss. It loads file information from a given path, counts the number of positive and negative samples, then calculates the ratio of positive samples to total samples. The positive weight is set as the reciprocal of this ratio. 
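The pos_weight computation described here reduces to one ratio; a sketch with toy boundary labels (illustrative only):

```python
import numpy as np

def boundary_pos_weight(boundary_labels):
    """pos_weight = 1 / (fraction of positive frames), for BCE-with-logits style losses."""
    labels = np.asarray(boundary_labels)
    pos_ratio = (labels == 1).sum() / labels.size
    return 1.0 / pos_ratio

# boundaries are rare: 3 positive frames out of 100
print(boundary_pos_weight([1, 1, 1] + [0] * 97))   # ~33.3
```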
If a normalization factor is provided, it divides the positive weight by this factor before returning it in float32 tensor format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":291-320", + "content": " pos_weight for binary cross entropy with logits loss\n pos_weight is defined as reciprocal of ratio of positive samples in the dataset\n \"\"\"\n # load file list\n file_ptr = open(self.file_path, 'r')\n info = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n n_classes = 2 # boundary or not\n nums = [0 for i in range(n_classes)]\n for i in range(len(info)):\n video_name = info[i]\n file_name = video_name.split('.')[0] + \".npy\"\n label_file_path = os.path.join(self.label_path, file_name)\n label = np.load(label_file_path).astype(np.int64)\n num, cnt = np.unique(label, return_counts=True)\n for n, c in zip(num, cnt):\n nums[n] += c\n pos_ratio = nums[1] / sum(nums)\n pos_weight = 1 / pos_ratio\n if norm is not None:\n pos_weight /= norm\n return paddle.to_tensor(pos_weight, dtype=\"float32\")\n def forward(self, preds, gts):\n \"\"\"\n Args:" + }, + { + "comment": "The ASRFLoss class is an ASR (Automatic Speech Recognition) loss function implemented with various criteria for prediction and ground truth. It uses different weights for CE, Focal, TMSE, GST MSE, BCE, and BR LFocal losses depending on the input parameters. The function returns the average loss across all criterions and samples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":321-358", + "content": " preds: paddle.float (N, 1, T).\n gts: paddle.float (N, 1, T).\n \"\"\"\n loss = 0.0\n batch_size = float(preds.shape[0])\n for criterion in self.criterions:\n for pred, gt in zip(preds, gts):\n loss += criterion(pred, gt)\n return loss / batch_size\n@LOSSES.register()\nclass ASRFLoss(nn.Layer):\n def __init__(self,\n lambda_bound_loss,\n num_classes,\n file_path,\n label_path,\n boundary_path,\n ce=True,\n asl_focal=True,\n tmse=False,\n gstmse=False,\n asl_weight=None,\n threshold=4.,\n ignore_index=255,\n ce_weight=1.0,\n focal_weight=1.0,\n tmse_weight=0.15,\n gstmse_weight=0.15,\n bce=True,\n brl_focal=False,\n mse=False,\n brl_weight=None):\n super().__init__()" + }, + { + "comment": "This code initializes an ActionSegmentationLoss object with specified parameters for classification loss, focal loss, and temporal segmentation losses. It also takes weights and file paths as inputs to optimize the model's performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":359-372", + "content": " self.criterion_cls = ActionSegmentationLoss(ce=ce,\n focal=asl_focal,\n tmse=tmse,\n gstmse=gstmse,\n weight=asl_weight,\n threshold=threshold,\n ignore_index=ignore_index,\n ce_weight=ce_weight,\n focal_weight=focal_weight,\n tmse_weight=tmse_weight,\n gstmse_weight=gstmse_weight,\n file_path=file_path,\n label_path=label_path,\n num_classes=num_classes)" + }, + { + "comment": "This code defines a custom loss function for a video modeling framework. It initializes a boundary regression loss criterion and takes a weighted average of classification and boundary losses. 
The forward method calculates the total loss by summing weighted classification and boundary losses, and returns the final loss value.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/losses/asrf_loss.py\":373-400", + "content": " self.criterion_boundary = BoundaryRegressionLoss(\n bce=bce,\n focal=brl_focal,\n mse=mse,\n weight=brl_weight,\n file_path=file_path,\n label_path=boundary_path)\n self.lambda_bound_loss = lambda_bound_loss\n def forward(self, x, output_cls, label, outputs_boundary, boundary):\n loss = 0.0\n if isinstance(output_cls, list):\n n = len(output_cls)\n for out in output_cls:\n loss += self.criterion_cls(out, label, x) / n\n else:\n loss += self.criterion_cls(output_cls, label, x)\n if isinstance(outputs_boundary, list):\n n = len(outputs_boundary)\n for out in outputs_boundary:\n loss += self.lambda_bound_loss * self.criterion_boundary(\n out, boundary) / n\n else:\n loss += self.lambda_bound_loss * self.criterion_boundary(\n outputs_boundary, boundary)\n return loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f76e78d8-071f-4bc5-b451-fa90ab75cd5d.json b/docs/doc/f76e78d8-071f-4bc5-b451-fa90ab75cd5d.json new file mode 100644 index 000000000..077879b17 --- /dev/null +++ b/docs/doc/f76e78d8-071f-4bc5-b451-fa90ab75cd5d.json @@ -0,0 +1,65 @@ +{ + "summary": "The code utilizes PaddlePaddle library for video retrieval, trains a model, handles memory-efficient sample copies, calculates metrics, logs progress, and visualizes ranking if available. Mean Average Precision is computed, results stored, and single test caption checked during each epoch.", + "details": [ + { + "comment": "This code is part of a larger program using the PaddlePaddle library for video retrieval. It defines a verbose function to display training metrics and a context manager to handle temporary copies of retrieval samples.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":0-30", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport paddle\nimport numpy as np\nfrom base import BaseTrainer\nfrom utils import memory_summary\nfrom contextlib import contextmanager\ndef verbose(epoch, metrics, mode, name=\"TEST\"):\n r1, r5, r10, r50 = metrics[\"R1\"], metrics[\"R5\"], metrics[\"R10\"], metrics[\"R50\"]\n msg = f\"[{mode}]{name:s} epoch {epoch}, R@1: {r1:.1f}\"\n msg += f\", R@5: {r5:.1f}, R@10 {r10:.1f}, R@50 {r50:.1f}\"\n msg += f\"MedR: {metrics['MedR']:g}, MeanR: {metrics['MeanR']:.1f}\"\n print(msg)\n@contextmanager\ndef ctxt_mgr(samples):\n \"\"\"Provide a context for managing temporary, cloned copies of retrieval" + }, + { + "comment": "This function creates a copy of the \"experts\" tensor from the input samples and replaces it in the samples dictionary. It also includes other relevant tensors and allows for evaluation without modifying the original samples. 
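The clone-then-discard pattern summarised here is a standard use of contextlib; the generic sketch below uses a dict of numpy arrays in place of the trainer's expert tensors.

```python
from contextlib import contextmanager
import numpy as np

@contextmanager
def temporary_copies(samples):
    """Yield a dict whose array values are copies, so callers can mutate
    them (for example for NaN checks) without touching the originals."""
    copies = {key: np.copy(val) for key, val in samples.items()}
    try:
        yield copies
    finally:
        del copies   # scratch copies are dropped once the block exits

original = {"experts": np.ones(4), "ind": np.zeros(4)}
with temporary_copies(original) as tmp:
    tmp["experts"][:] = np.nan          # scratch edits stay local
print(original["experts"])              # still all ones
```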
The copied samples are yielded, and then deleted after use to avoid memory leaks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":31-65", + "content": " sample tensors.\n The rationale here is that to use nan-checking in the model (to validate the\n positions of missing experts), we need to modify the underlying tensors. This\n function lets the evaluation code run (and modify) temporary copies, without\n modifying the originals.\n \"\"\"\n exp_dict = samples[\"experts\"].items()\n experts = {key: val.clone() for key, val in exp_dict}\n samples_ = {\n \"experts\": experts,\n \"ind\": samples[\"ind\"],\n \"text\": samples[\"text\"],\n \"cap_id\": samples[\"cap_id\"],\n \"att_mask\": samples[\"att_mask\"],\n }\n if \"text_token_mask\" in samples:\n samples_[\"text_token_mask\"] = samples[\"text_token_mask\"]\n try:\n yield samples_\n finally:\n del samples_\nclass Trainer(BaseTrainer):\n \"\"\"\n Trainer class\n Note:\n Inherited from BaseTrainer.\n \"\"\"\n def __init__(self, model, loss, metrics, optimizer, config, data_loaders,\n lr_scheduler, visualizer, skip_first_n_saves,\n include_optim_in_save_model, force_cpu_val, cache_targets=set()," + }, + { + "comment": "This code defines a class for training a model with specific configurations, data loaders, learning rate scheduler, and more. It initializes the necessary attributes and provides a method for performing training during an epoch. The `_train_epoch` method performs training logic for an epoch and returns a log containing all relevant information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":66-88", + "content": " num_keep_ckpts=3, mini_train=False, val_freq=1, skip_tboard=False):\n super().__init__(model, loss, metrics, optimizer, config, mini_train=mini_train,\n skip_tboard=skip_tboard, num_keep_ckpts=num_keep_ckpts)\n self.config = config\n self.cache_targets = cache_targets\n self.data_loaders = data_loaders\n self.lr_scheduler = lr_scheduler\n self.mini_train = mini_train\n self.len_epoch = len(self.data_loaders[\"train\"])\n self.log_step = int(np.sqrt(data_loaders[\"train\"].batch_size))\n self.visualizer = visualizer\n self.force_cpu_val = force_cpu_val\n self.val_freq = val_freq\n self.skip_first_n_saves = skip_first_n_saves\n self.include_optim_in_save_model = include_optim_in_save_model\n self.seen = {\"train\": 0, \"val\": 0}\n def _train_epoch(self, epoch):\n \"\"\"\n Training logic for an epoch\n :param epoch: Current training epoch.\n :return: A log that contains all information you want to save." + }, + { + "comment": "This code trains a model and computes the loss for each batch of data in the train loader. The loss is then backpropagated, the optimizer steps, and gradients are cleared before moving on to the next batch. The batch size is also tracked as part of the seen data count.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":90-116", + "content": " Note:\n If you have additional information to record, for example:\n > additional_log = {\"x\": x, \"y\": y}\n merge it with log before return. 
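Chunking the query and gallery axes, as described here, keeps memory bounded while still producing the full similarity matrix; the numpy sketch below shows the tiling pattern, with a plain dot-product scorer standing in for the model's cross-view confidence matrix.

```python
import numpy as np

def chunked_similarity(text_emb, video_emb, text_chunk=50, vid_chunk=50):
    """Assemble the full (num_text, num_video) similarity matrix in tiles."""
    rows = []
    for t0 in range(0, len(text_emb), text_chunk):
        t = text_emb[t0:t0 + text_chunk]
        cols = [t @ video_emb[v0:v0 + vid_chunk].T
                for v0 in range(0, len(video_emb), vid_chunk)]
        rows.append(np.concatenate(cols, axis=1))
    return np.concatenate(rows, axis=0)

text_emb = np.random.randn(120, 256)
video_emb = np.random.randn(130, 256)
sims = chunked_similarity(text_emb, video_emb)
print(sims.shape, np.allclose(sims, text_emb @ video_emb.T))   # (120, 130) True
```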
i.e.\n > log = {**log, **additional_log}\n > return log\n The metrics in log must have the key 'metrics'.\n \"\"\"\n total_loss = 0\n self.model.train()\n memory_summary()\n for batch_idx, minibatch in enumerate(self.data_loaders[\"train\"]):\n output = self.model(**minibatch)\n if \"retrieval\" in self.data_loaders.dataloaders:\n loss = self.loss(output[\"cross_view_conf_matrix\"])\n else:\n loss = self.loss(x=output[\"class_preds\"], target=labels)\n loss.backward()\n self.optimizer.step()\n self.optimizer.clear_grad()\n sample_key = list(minibatch[\"experts\"].keys())[0]\n batch_size = minibatch[\"experts\"][sample_key].shape[0]\n self.seen[\"train\"] += batch_size" + }, + { + "comment": "Training loop for a machine learning model, logging progress and validating after certain epochs. Performs validation metrics calculation and updates learning rate with a scheduler.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":118-149", + "content": " total_loss += loss.item()\n if batch_idx % self.log_step == 0:\n prog = self._progress(batch_idx)\n self.logger.info(f\"Train Epoch: {epoch} {prog} Loss: {loss.item():.6f}\")\n if batch_idx == self.len_epoch or (self.mini_train and batch_idx > 3):\n break\n log = {'loss': total_loss / self.len_epoch}\n if epoch % self.val_freq == 0:\n nested_log, cached_preds = self._valid_epoch(epoch)\n log.update(nested_log)\n else:\n nested_log, cached_preds = {}, None\n self.logger.info(f\"skipping val for epoch: {epoch}\")\n self.lr_scheduler.step()\n self.logger.info(f\"LR {self.lr_scheduler.get_lr()}\")\n return log, cached_preds\n def _valid_epoch(self, epoch):\n \"\"\"Validate model after an epoch of training and store results to disk.\n Args:\n epoch (int): the current epoch\n Returns:\n A log that contains information about validation\n NOTE: The validation metrics in log must have the key 'val_metrics'." + }, + { + "comment": "This code is initializing the model in evaluation mode, creating a dictionary to store cached predictions, and retrieving data from dataloaders. It also checks if there are too many queries and adjusts batch size accordingly. The text_keys variable stores keys for text-related data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":150-170", + "content": " \"\"\"\n self.model.eval()\n cached_preds = {key: {\"vid_name\": [], \"preds\": [], \"labels\": []}\n for key in self.cache_targets}\n with paddle.no_grad():\n if \"retrieval\" in self.data_loaders.dataloaders:\n samples, meta = self.data_loaders[\"retrieval\"]\n sample_key = list(samples[\"experts\"].keys())[0]\n batch_size = samples[\"experts\"][sample_key].shape[0]\n self.seen[\"val\"] += batch_size\n num_queries = samples[\"text\"].shape[0] * samples[\"text\"].shape[1]\n safe_queries = 1\n text_keys = ['text', 'cap_id', 'att_mask', 'text_token_mask']\n if num_queries > safe_queries:\n chk = 50\n tck = 50\n if samples['text'].shape[0] % chk == 0:\n vid_batch = samples['text'].shape[0] // chk\n else:\n vid_batch = samples['text'].shape[0] // chk + 1" + }, + { + "comment": "This code segment calculates the number of batches for 'text' and iterates through each batch. It then creates sub-samples and subsub-sims for further processing. This seems to be part of a machine learning model training process, possibly using video data with experts and indicators as additional features. 
The progress is printed every 5 batches.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":171-189", + "content": " if samples['text'].shape[0] % tck == 0:\n text_batch = samples['text'].shape[0] // tck\n else:\n text_batch = samples['text'].shape[0] // tck + 1\n sub_sims = []\n for idx in range(text_batch):\n if idx % 5 == 0:\n print(idx,'/',text_batch)\n sub_samples = {}\n for key in text_keys:\n sub_samples.update({key: samples[key][idx*tck:idx*tck+tck]})\n subsub_sims = []\n for vid in range(vid_batch):\n sub_samples['experts'] = {}\n sub_samples['ind'] = {} \n for expert in samples['experts'].keys():\n sub_samples['experts'][expert] = samples['experts'][expert][vid*chk:vid*chk+chk]\n sub_samples['ind'][expert] = samples['ind'][expert][vid*chk:vid*chk+chk]" + }, + { + "comment": "This code appears to be part of a machine learning model training process. It uses PaddlePaddle, a deep learning framework, to calculate similarity metrics (sims) between samples or sub-samples, then concatenates them based on the given condition (if sub_samples exists). If no sub_samples exist, it directly calculates sims from the samples. The code then samples the loss using only the first query for each video and reshapes the sims tensor accordingly before passing it to a loss function (self.loss). Finally, the dataset name is captured in the variable \"dataset\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":190-208", + "content": " with ctxt_mgr(sub_samples) as xx:\n output = self.model(**xx)\n subsub_sims.append(output[\"cross_view_conf_matrix\"].cpu())\n subsub_sims = paddle.concat(subsub_sims, axis=1)\n sub_sims.append(subsub_sims)\n sims = paddle.concat(sub_sims, axis=0)\n sims = paddle.to_tensor(sims, dtype='float32').cpu().numpy()\n else:\n with ctxt_mgr(samples) as xx:\n output = self.model(**xx)\n sims = paddle.to_tensor(output[\"cross_view_conf_matrix\"], dtype='float32').cpu().numpy()\n # sample the loss (using only the first query for each video)\n queries_per_vid = meta[\"query_masks\"].shape[1]\n sims_ = paddle.to_tensor(sims).reshape([-1, queries_per_vid, sims.shape[-1]])\n loss = self.loss(sims_[:, 0, :])\n dataset = self.data_loaders.dataset_name" + }, + { + "comment": "The code is calculating metrics such as Mean Average Precision (mAP) for each epoch and storing the results in a dictionary named nested_metrics. If mAP is calculated, it prints the value. It also calls a verbose function that takes the current epoch, metrics values, dataset name, and metric name as parameters. The code checks if there is only one test caption available (num_test_captions == 1) and if raw_captions exist. 
If so, it visualizes the ranking by calling a visualizer function passing simulation scores (sims), meta data, current epoch, and nested_metrics as arguments.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":209-227", + "content": " nested_metrics = {}\n for metric in self.metrics:\n metric_name = metric.__name__\n res = metric(sims, query_masks=meta[\"query_masks\"])\n if metric_name == \"mean_average_precision\":\n print(f\"Epoch: {epoch}, mean AP: {res['mAP']}\")\n else:\n verbose(epoch=epoch, metrics=res, name=dataset, mode=metric_name)\n nested_metrics[metric_name] = res\n # TODO(Samuel) disabled visualisation for now, simple to add in later\n num_test_caps = self.data_loaders.num_test_captions\n if num_test_caps == 1 and meta[\"raw_captions\"] is not None:\n if self.visualizer is not None:\n self.visualizer.visualize_ranking(\n sims=sims,\n meta=meta,\n epoch=epoch,\n nested_metrics=nested_metrics," + }, + { + "comment": "Iterating over validation data, calculates metrics for each batch, logs progress during iteration. If cache_targets includes \"val\", stores predictions and labels in cached_preds dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":228-248", + "content": " )\n return {\"nested_val_metrics\": nested_metrics}, cached_preds\n elif \"val\" in self.data_loaders.dataloaders:\n metrics = [x() for x in self.metrics]\n for batch_idx, minibatch in enumerate(self.data_loaders[\"val\"]):\n labels = minibatch.pop(\"labels\")\n vid_name = minibatch.pop(\"vid_name\")\n output = self.model(**minibatch)\n if \"val\" in self.cache_targets:\n cached_preds[\"val\"][\"vid_name\"].append(vid_name)\n cached_preds[\"val\"][\"preds\"].append(output[\"class_preds\"])\n for metric in metrics:\n metric.add(output=output[\"class_preds\"], target=labels)\n if batch_idx % self.log_step == 0:\n prog = self._progress(batch_idx)\n self.logger.info(f\"Val Epoch: {epoch} {prog}\")\n nested_metrics = {}\n for metric in metrics:" + }, + { + "comment": "This code checks if the metric has a \"topk\" attribute, then creates a dictionary of top-k values and assigns it to \"res\". If not supported, raises a ValueError. It adds the accuracy metric to nested_metrics. The code then creates a nested dictionary for cache targets other than \"val\", and iterates through each data loader in self.data_loaders[\"tiny\"]. For each batch, it checks if labels are present, appends them to cached_preds with corresponding vid name and model predictions. 
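The top-k handling described above assumes metric objects exposing `add`, `value`, and a `topk` attribute. A hypothetical accumulator illustrating that interface (not the actual PaddleVideo metric implementation):

```python
import paddle

class TopKAccuracy:
    """Hypothetical accumulator matching the add()/value()/topk interface."""
    def __init__(self, topk=(1, 5)):
        self.topk = topk
        self.correct = [0.0] * len(topk)
        self.total = 0

    def add(self, output, target):
        # output: [N, num_classes] scores, target: [N] integer labels
        max_k = max(self.topk)
        pred = paddle.argsort(output, axis=1, descending=True)[:, :max_k]
        hits = (pred == target.reshape([-1, 1])).astype('float32')  # [N, max_k]
        for i, k in enumerate(self.topk):
            self.correct[i] += float(hits[:, :k].sum())
        self.total += output.shape[0]

    def value(self):
        return [c / max(self.total, 1) for c in self.correct]

# usage mirroring the loop above:
# res = {f"top{k}": v for k, v in zip(metric.topk, metric.value())}
```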
Finally, it aggregates all cached predictions for the specified target(s).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":249-266", + "content": " if hasattr(metric, \"topk\"):\n res = {f\"top{key}\": val for key, val in\n zip(metric.topk, metric.value())}\n nested_metrics[\"accuracy\"] = res\n else:\n raise ValueError(f\"unsupported mettric: {type(metric)}\")\n nested = {\"nested_val_metrics\": nested_metrics}\n for target in self.cache_targets - {\"val\"}:\n for batch_idx, minibatch in enumerate(self.data_loaders[\"tiny\"]):\n if \"labels\" in minibatch:\n cached_preds[target][\"labels\"].append(minibatch.pop(\"labels\"))\n cached_preds[target][\"vid_name\"].append(minibatch.pop(\"vid_name\"))\n output = self.model(**minibatch)\n cached_preds[target][\"preds\"].append(output[\"class_preds\"])\n # aggregate all cached predictions\n for target in self.cache_targets:" + }, + { + "comment": "The code defines two functions: _compute_nested_preds and _progress. The first function computes nested predictions from cached predictions for a given target, while the second one returns a progress message based on the current batch index and total number of samples or epoch length.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/trainer/trainer.py\":267-279", + "content": " for key, val in cached_preds[target].items():\n cached_preds[key] = paddle.concat(val).cpu().numpy()\n return nested, cached_preds\n def _progress(self, batch_idx):\n base = '[{}/{} ({:.0f}%)]'\n if hasattr(self.data_loaders, 'n_samples'):\n current = batch_idx * self.data_loaders.batch_size\n total = self.data_loaders.n_samples\n else:\n current = batch_idx\n total = self.len_epoch\n return base.format(current, total, 100.0 * current / total)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f78e1ab0-c730-4610-aada-6b2232160b79.json b/docs/doc/f78e1ab0-c730-4610-aada-6b2232160b79.json new file mode 100644 index 000000000..9806255a8 --- /dev/null +++ b/docs/doc/f78e1ab0-c730-4610-aada-6b2232160b79.json @@ -0,0 +1,20 @@ +{ + "summary": "This code loads weights from a checkpoint file into a model, defines functions for saving, loading, and creating directories, using Paddle's save and load methods.", + "details": [ + { + "comment": "This code snippet is part of PaddleVideo's Video Quality Assessment application. It loads a checkpoint file into the provided model. If the weight path file does not exist, it will not be loaded, and the method returns immediately without any action. The function uses the \"os\" and \"tqdm\" libraries for file operations and progress bars, respectively. It also utilizes Paddle's \"set_state_dict\" method to load the model's parameters from the checkpoint file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py\":0-36", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nimport os\nimport os.path as osp\nimport time\nimport pickle\nfrom tqdm import tqdm\nimport paddle\nfrom paddlevideo.utils import get_logger\nfrom paddlevideo.utils import main_only\n#XXX(shipping): maybe need load N times because of different cards have different params.\n@main_only\ndef load_ckpt(model, weight_path):\n \"\"\"\n load_ckpt\n \"\"\"\n #model.set_state_dict(state_dict)\n if not osp.isfile(weight_path):" + }, + { + "comment": "This code snippet is attempting to load weights from a checkpoint file into a model. It first raises an error if the provided path does not correspond to a valid checkpoint file. The logger variable is assigned for logging purposes. Then, it uses Paddle's paddle.load() function to load the state dictionaries from the specified weight_path.\n\nThe code then initializes an empty dictionary 'tmp' and calculates the total length of the model's state dictionary. It creates a local key name list by iterating through state_dicts. \n\nNext, it uses tqdm to create a progress bar for displaying the loading process. For each item in the model's state dictionary, it checks if it exists in the loaded state dictionaries and assigns the corresponding value to 'tmp'. It also updates the description of the progress bar with the current name being loaded. Finally, upon completion, it sets a final description indicating that all weights have been loaded successfully.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py\":37-62", + "content": " raise IOError('{weight_path} is not a checkpoint file')\n #state_dicts = load(weight_path)\n logger = get_logger(\"paddlevideo\")\n state_dicts = paddle.load(weight_path)\n tmp = {}\n total_len = len(model.state_dict())\n localkeyname = [i for i in state_dicts]\n with tqdm(total=total_len,\n position=1,\n bar_format='{desc}',\n desc=\"Loading weights\") as desc:\n #for item in tqdm(model.state_dict(), total=total_len, position=0):\n for i, item in enumerate(\n tqdm(model.state_dict(), total=total_len, position=0)):\n name = item\n desc.set_description('Loading %s' % name)\n print(\"model name is {}, correspoding local name is {}\".format(\n name, localkeyname[i]))\n #tmp[name] = state_dicts[name]\n tmp[name] = state_dicts[localkeyname[i]]\n time.sleep(0.01)\n ret_str = \"loading {:<20d} weights completed.\".format(\n len(model.state_dict()))\n desc.set_description(ret_str)" + }, + { + "comment": "This code defines functions for saving, loading, and creating directories. The \"save\" function uses Paddle's save method to store an object at a specified path. The \"load\" function checks if the file exists before returning its contents using Paddle's load method. 
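A condensed sketch of the positional key-matching load described above, assuming the checkpoint stores parameters in the same order as `model.state_dict()` (the progress-bar and logging details of the original are omitted):

```python
import os.path as osp
import paddle

def load_ckpt_by_position(model, weight_path):
    """Copy checkpoint tensors onto model parameters by position (sketch)."""
    if not osp.isfile(weight_path):
        raise IOError(f'{weight_path} is not a checkpoint file')
    state_dicts = paddle.load(weight_path)
    local_keys = list(state_dicts)          # keys as stored in the checkpoint
    remapped = {}
    for i, name in enumerate(model.state_dict()):
        # the i-th model parameter takes the i-th checkpoint entry
        remapped[name] = state_dicts[local_keys[i]]
    model.set_state_dict(remapped)
```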
Lastly, the \"mkdir\" function creates a directory at the specified location, handling errors that may occur when training with multiple GPUs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py\":63-86", + "content": " model.set_state_dict(tmp)\ndef mkdir(dir):\n \"\"\"mkdir\"\"\"\n if not os.path.exists(dir):\n # avoid error when train with multiple gpus\n try:\n os.makedirs(dir)\n except:\n pass\n@main_only\ndef save(obj, path):\n \"\"\"save\"\"\"\n paddle.save(obj, path)\ndef load(file_name):\n \"\"\"load\"\"\"\n if not osp.isfile(file_name):\n raise IOError('{file_name} not exist')\n return paddle.load(file_name)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f796ef44-34e0-4d10-b159-03f772bc8352.json b/docs/doc/f796ef44-34e0-4d10-b159-03f772bc8352.json new file mode 100644 index 000000000..abf3668fe --- /dev/null +++ b/docs/doc/f796ef44-34e0-4d10-b159-03f772bc8352.json @@ -0,0 +1,55 @@ +{ + "summary": "This code deploys PaddleVideo models with C++, supports optional settings and displays inference results, but encounters an error searching for 'libcudnn.so' due to incorrect/missing CUDNN_LIB_DIR setting.", + "details": [ + { + "comment": "Explanation of the code: This is an introduction to deploying PaddleVideo models using C++. It provides instructions on setting up a Linux environment and compiling OpenCV and PaddlePaddle libraries for model prediction. The code also mentions the need to install additional dependencies and provides commands for downloading, extracting, and compiling the OpenCV library. Additionally, it notes that Windows support is currently under development (TODO) and requires Visual Studio 2019 Community for compilation (TODO).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":0-44", + "content": "[English](./readme_en.md) | \u7b80\u4f53\u4e2d\u6587\n# \u670d\u52a1\u5668\u7aefC++\u9884\u6d4b\n\u672c\u7ae0\u8282\u4ecb\u7ecdPaddleVideo\u6a21\u578b\u7684\u7684C++\u90e8\u7f72\u65b9\u6cd5\uff0cpython\u9884\u6d4b\u90e8\u7f72\u65b9\u6cd5\u8bf7\u53c2\u8003\u5404\u81ea\u6a21\u578b\u7684**\u6a21\u578b\u63a8\u7406**\u7ae0\u8282\u3002\nC++\u5728\u6027\u80fd\u8ba1\u7b97\u4e0a\u4f18\u4e8epython\uff0c\u56e0\u6b64\uff0c\u5728\u5927\u591a\u6570CPU\u3001GPU\u90e8\u7f72\u573a\u666f\uff0c\u591a\u91c7\u7528C++\u7684\u90e8\u7f72\u65b9\u5f0f\uff0c\u672c\u8282\u5c06\u4ecb\u7ecd\u5982\u4f55\u5728Linux\uff08CPU/GPU\uff09\u73af\u5883\u4e0b\u914d\u7f6eC++\u73af\u5883\u5e76\u5b8c\u6210\nPaddleVideo\u6a21\u578b\u90e8\u7f72\u3002\n\u5728\u5f00\u59cb\u4f7f\u7528\u4e4b\u524d\uff0c\u60a8\u9700\u8981\u6309\u7167\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5\u989d\u5916\u7684\u4f9d\u8d56\u5305\uff1a\n```bash\npython -m pip install git+https://github.com/LDOUBLEV/AutoLog\n```\n## 1. 
\u51c6\u5907\u73af\u5883\n- Linux\u73af\u5883\uff0c\u63a8\u8350\u4f7f\u7528docker\u3002\n- Windows\u73af\u5883\uff0c\u76ee\u524d\u652f\u6301\u57fa\u4e8e`Visual Studio 2019 Community`\u8fdb\u884c\u7f16\u8bd1\uff08TODO\uff09\n* \u8be5\u6587\u6863\u4e3b\u8981\u4ecb\u7ecd\u57fa\u4e8eLinux\u73af\u5883\u7684PaddleVideo C++\u9884\u6d4b\u6d41\u7a0b\uff0c\u5982\u679c\u9700\u8981\u5728Windows\u4e0b\u57fa\u4e8e\u9884\u6d4b\u5e93\u8fdb\u884cC++\u9884\u6d4b\uff0c\u5177\u4f53\u7f16\u8bd1\u65b9\u6cd5\u8bf7\u53c2\u8003[Windows\u4e0b\u7f16\u8bd1\u6559\u7a0b](./docs/windows_vs2019_build.md)\uff08TODO\uff09\n* **\u51c6\u5907\u73af\u5883\u7684\u76ee\u7684\u662f\u5f97\u5230\u7f16\u8bd1\u597d\u7684opencv\u5e93\u4e0epaddle\u9884\u6d4b\u5e93**\u3002\n### 1.1 \u7f16\u8bd1opencv\u5e93\n* \u9996\u5148\u9700\u8981\u4eceopencv\u5b98\u7f51\u4e0a\u4e0b\u8f7d\u5728Linux\u73af\u5883\u4e0b\u6e90\u7801\u7f16\u8bd1\u7684\u538b\u7f29\u5305\uff0c\u5e76\u89e3\u538b\u6210\u6587\u4ef6\u5939\u3002\u4ee5opencv3.4.7\u4e3a\u4f8b\uff0c\u4e0b\u8f7d\u547d\u4ee4\u5982\u4e0b\uff1a\n ```bash\n cd deploy/cpp_infer\n wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz\n tar -xf 3.4.7.tar.gz\n ```\n \u89e3\u538b\u5b8c\u6bd5\u540e\u5728`deploy/cpp_infer`\u76ee\u5f55\u4e0b\u53ef\u4ee5\u5f97\u5230\u89e3\u538b\u51fa\u7684`opencv-3.4.7`\u7684\u6587\u4ef6\u5939\u3002\n* \u5b89\u88c5ffmpeg\n opencv\u914d\u5408ffmpeg\u624d\u80fd\u5728linux\u4e0b\u6b63\u5e38\u8bfb\u53d6\u89c6\u9891\uff0c\u5426\u5219\u53ef\u80fd\u9047\u5230\u89c6\u9891\u5e27\u6570\u8fd4\u56de\u4e3a0\u6216\u65e0\u6cd5\u8bfb\u53d6\u4efb\u4f55\u89c6\u9891\u5e27\u7684\u60c5\u51b5\n \u91c7\u7528\u8f83\u4e3a\u7b80\u5355\u7684apt\u5b89\u88c5\uff0c\u5b89\u88c5\u547d\u4ee4\u5982\u4e0b\uff1a\n ```bash\n apt-get update\n apt install libavformat-dev\n apt install libavcodec-dev" + }, + { + "comment": "Preparing to compile OpenCV, enter the `opencv-3.4.7` directory and set `root_path` and `install_path`. Remove existing `build` folder, create a new one, navigate into it, run cmake commands with specified options, make and install. Results in an `opencv3` folder with header files and libraries for C++ video inference code compilation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":45-90", + "content": " apt install libswresample-dev\n apt install libswscale-dev\n apt install libavutil-dev\n apt install libsdl1.2-dev\n apt-get install ffmpeg\n ```\n* \u51c6\u5907\u7f16\u8bd1opencv\uff0c\u9996\u5148\u8fdb\u5165`opencv-3.4.7`\u7684\u6587\u4ef6\u5939\uff0c\u7136\u540e\u8bbe\u7f6eopencv\u6e90\u7801\u8def\u5f84`root_path`\u4ee5\u53ca\u5b89\u88c5\u8def\u5f84`install_path`\u3002\u6267\u884c\u547d\u4ee4\u5982\u4e0b\uff1a\n ```bash\n cd opencv-3.4.7\n root_path=$PWD # \u5f53\u524d\u6240\u5728\u8def\u5f84\u5373\u4e3aopencv-3.4.7\u7684\u7edd\u5bf9\u8def\u5f84\n install_path=${root_path}/opencv3\n rm -rf build\n mkdir build\n cd build\n cmake .. 
\\\n -DCMAKE_INSTALL_PREFIX=${install_path} \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DBUILD_SHARED_LIBS=OFF \\\n -DWITH_IPP=OFF \\\n -DBUILD_IPP_IW=OFF \\\n -DWITH_LAPACK=OFF \\\n -DWITH_EIGEN=OFF \\\n -DCMAKE_INSTALL_LIBDIR=lib64 \\\n -DWITH_ZLIB=ON \\\n -DBUILD_ZLIB=ON \\\n -DWITH_JPEG=ON \\\n -DBUILD_JPEG=ON \\\n -DWITH_PNG=ON \\\n -DBUILD_PNG=ON \\\n -DWITH_TIFF=ON \\\n -DBUILD_TIFF=ON \\\n -DWITH_FFMPEG=ON\n make -j\n make install\n ```\n `make install`\u5b8c\u6210\u4e4b\u540e\uff0c\u4f1a\u5728\u8be5\u6587\u4ef6\u5939\u4e0b\u751f\u6210opencv\u5934\u6587\u4ef6\u548c\u5e93\u6587\u4ef6\uff0c\u7528\u4e8e\u540e\u9762\u7684Video\u63a8\u7406C++\u4ee3\u7801\u7f16\u8bd1\u3002\n \u6700\u7ec8\u4f1a\u4ee5\u5b89\u88c5\u8def\u5f84`install_path`\u4e3a\u6307\u5b9a\u8def\u5f84\uff0c\u5f97\u5230\u4e00\u4e2a`opencv3`\u7684\u6587\u4ef6\u5939\uff0c\u5176\u6587\u4ef6\u7ed3\u6784\u5982\u4e0b\u6240\u793a\u3002" + }, + { + "comment": "In this code snippet, the user is provided with two methods to obtain Paddle prediction library. The first method involves directly downloading a pre-compiled version of the library from the official website based on the desired CUDA version and OS architecture. The second method involves cloning the latest source code from Paddle's GitHub repository and compiling it manually for the most recent features. The code also provides sample commands to download and extract a pre-compiled library or clone the Paddle source code using 'wget' and 'tar' commands.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":92-124", + "content": " ```shell\n opencv-3.4.7/\n \u251c\u2500\u2500 opencv3/ # \u5b89\u88c5\u5728opencv3\u76ee\u5f55\u4e0b\n \u2502 \u251c\u2500\u2500 bin/\n \u2502 \u251c\u2500\u2500 include/\n \u2502 \u251c\u2500\u2500 lib/\n \u2502 \u251c\u2500\u2500 lib64/\n \u2502 \u2514\u2500\u2500 share/\n ```\n### 1.2 \u4e0b\u8f7d\u6216\u8005\u7f16\u8bd1Paddle\u9884\u6d4b\u5e93\n\u67092\u79cd\u65b9\u5f0f\u83b7\u53d6Paddle\u9884\u6d4b\u5e93\uff0c\u4e0b\u9762\u8fdb\u884c\u8be6\u7ec6\u4ecb\u7ecd\u3002\n#### 1.2.1 \u76f4\u63a5\u4e0b\u8f7d\u5b89\u88c5\n* [Paddle\u9884\u6d4b\u5e93\u5b98\u7f51](https://paddleinference.paddlepaddle.org.cn/v2.2/user_guides/download_lib.html) \u4e0a\u63d0\u4f9b\u4e86\u4e0d\u540ccuda\u7248\u672c\u7684Linux\u9884\u6d4b\u5e93\uff0c\u53ef\u4ee5\u5728\u5b98\u7f51\u67e5\u770b\u5e76**\u9009\u62e9\u5408\u9002\u7684\u9884\u6d4b\u5e93\u7248\u672c**\uff08\u5efa\u8bae\u9009\u62e9paddle\u7248\u672c>=2.0.1\u7248\u672c\u7684\u9884\u6d4b\u5e93\uff0c\u63a8\u8350\u4f7f\u75282.2.2\u7684\u9884\u6d4b\u5e93\uff09\u3002\n* \u4e0b\u8f7d\u5f97\u5230\u4e00\u4e2a`paddle_inference.tgz`\u538b\u7f29\u5305\uff0c\u7136\u540e\u5c06\u5b83\u89e3\u538b\u6210\u6587\u4ef6\u5939\uff0c\u547d\u4ee4\u5982\u4e0b(\u4ee5\u673a\u5668\u73af\u5883\u4e3agcc8.2\u4e3a\u4f8b)\uff1a\n ```bash\n wget https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz\n tar -xf paddle_inference.tgz\n ```\n \u6700\u7ec8\u4f1a\u5728\u5f53\u524d\u7684\u6587\u4ef6\u5939\u4e2d\u751f\u6210`paddle_inference/`\u7684\u5b50\u6587\u4ef6\u5939\u3002\n#### 1.2.2 \u9884\u6d4b\u5e93\u6e90\u7801\u7f16\u8bd1\n* \u5982\u679c\u5e0c\u671b\u83b7\u53d6\u6700\u65b0\u9884\u6d4b\u5e93\u7279\u6027\uff0c\u53ef\u4ee5\u4ecePaddle github\u4e0a\u514b\u9686\u6700\u65b0\u4ee3\u7801\uff0c\u6e90\u7801\u7f16\u8bd1\u9884\u6d4b\u5e93\u3002\n* 
\u53ef\u4ee5\u53c2\u8003[Paddle\u9884\u6d4b\u5e93\u5b89\u88c5\u7f16\u8bd1\u8bf4\u660e](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi) \u7684\u8bf4\u660e\uff0c\u4ecegithub\u4e0a\u83b7\u53d6Paddle\u4ee3\u7801\uff0c\u7136\u540e\u8fdb\u884c\u7f16\u8bd1\uff0c\u751f\u6210\u6700\u65b0\u7684\u9884\u6d4b\u5e93\u3002\u4f7f\u7528git\u83b7\u53d6\u4ee3\u7801\u65b9\u6cd5\u5982\u4e0b\u3002\n ```shell" + }, + { + "comment": "The provided code demonstrates how to compile the Paddle inference API library from the source code. It explains the steps for cloning and entering the Paddle repository, setting build parameters, compiling the library using make, and creating a build directory. The comments also mention where to find more information about build parameter options and what files are generated after compilation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":125-169", + "content": " git clone https://github.com/PaddlePaddle/Paddle.git\n git checkout release/2.2\n ```\n* \u8fdb\u5165Paddle\u76ee\u5f55\u540e\uff0c\u7f16\u8bd1\u65b9\u6cd5\u5982\u4e0b\u3002\n ```shell\n rm -rf build\n mkdir build\n cd build\n cmake .. \\\n -DWITH_CONTRIB=OFF \\\n -DWITH_MKL=ON \\\n -DWITH_MKLDNN=ON \\\n -DWITH_TESTING=OFF \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DWITH_INFERENCE_API_TEST=OFF \\\n -DON_INFER=ON \\\n -DWITH_PYTHON=ON\n make -j4\n make inference_lib_dist -j4 # 4\u4e3a\u7f16\u8bd1\u65f6\u4f7f\u7528\u6838\u6570\uff0c\u53ef\u6839\u636e\u673a\u5668\u60c5\u51b5\u81ea\u884c\u4fee\u6539\n ```\n \u66f4\u591a\u7f16\u8bd1\u53c2\u6570\u9009\u9879\u4ecb\u7ecd\u53ef\u4ee5\u53c2\u8003[\u6587\u6863\u8bf4\u660e](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi)\u3002\n* \u7f16\u8bd1\u5b8c\u6210\u4e4b\u540e\uff0c\u53ef\u4ee5\u5728`build/paddle_inference_install_dir/`\u6587\u4ef6\u4e0b\u770b\u5230\u751f\u6210\u4e86\u4ee5\u4e0b\u6587\u4ef6\u53ca\u6587\u4ef6\u5939\u3002\n ```bash\n build/\n \u2514\u2500\u2500 paddle_inference_install_dir/\n \u251c\u2500\u2500 CMakeCache.txt\n \u251c\u2500\u2500 paddle/\n \u251c\u2500\u2500 third_party/\n \u2514\u2500\u2500 version.txt\n ```\n \u5176\u4e2d`paddle`\u5c31\u662fC++\u9884\u6d4b\u6240\u9700\u7684Paddle\u5e93\uff0c`version.txt`\u4e2d\u5305\u542b\u5f53\u524d\u9884\u6d4b\u5e93\u7684\u7248\u672c\u4fe1\u606f\u3002\n## 2. \u7f16\u8bd1\u5e76\u8fd0\u884c\u9884\u6d4bdemo\n### 2.1 \u5c06\u6a21\u578b\u5bfc\u51fa\u4e3ainference model\n* \u8be5\u6b65\u9aa4\u4e0epython\u90e8\u7f72\u65b9\u5f0f\u4e0b\u7684\u5bfc\u51fa\u9884\u6d4b\u6a21\u578b\u76f8\u540c\uff0c\u53ef\u4ee5\u53c2\u8003\u5404\u81ea\u6a21\u578b\u7684\u6a21\u578b\u9884\u6d4b\u7ae0\u8282\u3002\u5bfc\u51fa\u7684\u51e0\u4e2a\u76f8\u5173inference model\u6587\u4ef6\u7528\u4e8e\u6a21\u578b\u9884\u6d4b\u3002**\u4ee5PP-TSM\u4e3a\u4f8b**\uff0c\u5bfc\u51fa\u9884\u6d4b\u6a21\u578b\u7684\u76ee\u5f55\u7ed3\u6784\u5982\u4e0b\u3002" + }, + { + "comment": "This code is providing instructions to compile the PaddleVideo C++ prediction demo for an inference model. Users need to navigate to the \"deploy/cpp_infer\" directory and execute the `bash tools/build.sh` command. 
They must also modify the `tools/build.sh` script with their specific openCV, Paddle Inference, CUDA library, and CUDNN library directories before running the build script.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":171-212", + "content": " ```\n inference/\n \u2514\u2500\u2500 ppTSM/\n \u251c\u2500\u2500 ppTSM.pdiparams\n \u251c\u2500\u2500 ppTSM.pdiparamsinfo\n \u2514\u2500\u2500 ppTSM.pdmodel\n ```\n### 2.2 \u7f16\u8bd1PaddleVideo C++\u9884\u6d4bdemo\n* \u8fdb\u5165\u5230`deploy/cpp_infer`\u76ee\u5f55\u4e0b\uff0c\u6267\u884c\u4ee5\u4e0b\u7f16\u8bd1\u547d\u4ee4\n ```shell\n bash tools/build.sh\n ```\n `tools/build.sh`\u4e2d\u7684Paddle C++\u9884\u6d4b\u5e93\u3001opencv\u7b49\u5176\u4ed6\u4f9d\u8d56\u5e93\u7684\u5730\u5740\u9700\u8981\u6362\u6210\u81ea\u5df1\u673a\u5668\u4e0a\u7684\u5b9e\u9645\u5730\u5740\u3002\n* \u5177\u4f53\u5730\uff0c\u9700\u8981\u4fee\u6539`tools/build.sh`\u4e2d\u7684\u73af\u5883\u8def\u5f84\uff0c\u76f8\u5173\u5185\u5bb9\u5982\u4e0b\uff1a\n ```shell\n OPENCV_DIR=your_opencv_dir\n LIB_DIR=your_paddle_inference_dir\n CUDA_LIB_DIR=your_cuda_lib_dir\n CUDNN_LIB_DIR=your_cudnn_lib_dir\n ```\n \u4e0a\u8ff0\u53c2\u6570\u5982\u4e0b(\u4ee5\u4e0b\u8def\u5f84\u7528\u6237\u53ef\u6839\u636e\u81ea\u5df1\u673a\u5668\u7684\u60c5\u51b5\u5bf9\u5e94\u4fee\u6539)\n ```bash\n OPENCV_DIR=/path/to/opencv3\n LIB_DIR=/path/to/paddle_inference\n CUDA_LIB_DIR=/usr/local/cuda/lib64\n CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/\n ```\n `OPENCV_DIR`\u4e3aopencv\u7f16\u8bd1\u5b89\u88c5\u7684\u5730\u5740\n `LIB_DIR`\u4e3a\u4e0b\u8f7d(`paddle_inference`\u6587\u4ef6\u5939)\u6216\u8005\u7f16\u8bd1\u751f\u6210\u7684Paddle\u9884\u6d4b\u5e93\u5730\u5740(`build/paddle_inference_install_dir`\u6587\u4ef6\u5939)\n `CUDA_LIB_DIR`\u4e3acuda\u5e93\u6587\u4ef6\u5730\u5740\uff0c\u5728docker\u4e2d\u4e3a`/usr/local/cuda/lib64`\n `CUDNN_LIB_DIR`\u4e3acudnn\u5e93\u6587\u4ef6\u5730\u5740\uff0c\u5728docker\u4e2d\u4e3a`/usr/lib/x86_64-linux-gnu/`\u3002\n **\u5982\u679c\u5e0c\u671b\u9884\u6d4b\u65f6\u5f00\u542fTensorRT\u52a0\u901f\u529f\u80fd\uff0c\u90a3\u4e48\u8fd8\u9700\u8981\u4fee\u6539`tools/build.sh`3\u5904\u4ee3\u7801**" + }, + { + "comment": "This code sets the necessary environment variables and provides instructions for running PaddleVideo's C++ prediction demo. It supports video recognition mode with optional parameters such as model directory, inference model name, video directory, number of segments, and segment length. Users can choose from PP-TSM or PP-TSN models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":213-258", + "content": " 1. \u8bbe\u7f6e`DWITH_GPU=ON`\n 2. \u8bbe\u7f6e`DWITH_TENSORRT=ON`\n 3. \u8bbe\u7f6e`TENSORRT_DIR=/path/to/TensorRT-x.x.x.x`\n **\u4ee5\u4e0a\u8def\u5f84\u90fd\u5199\u7edd\u5bf9\u8def\u5f84\uff0c\u4e0d\u8981\u5199\u76f8\u5bf9\u8def\u5f84**\n* \u7f16\u8bd1\u5b8c\u6210\u4e4b\u540e\uff0c\u4f1a\u5728`cpp_infer/build`\u6587\u4ef6\u5939\u4e0b\u751f\u6210\u4e00\u4e2a\u540d\u4e3a`ppvideo`\u7684\u53ef\u6267\u884c\u6587\u4ef6\u3002\n### 2.3 \u8fd0\u884cPaddleVideo C++\u9884\u6d4bdemo\n\u8fd0\u884c\u65b9\u5f0f\uff1a\n```bash\n./build/ppvideo [--param1] [--param2] [...]\n```\n\u5176\u4e2d\uff0c`mode`\u4e3a\u5fc5\u9009\u53c2\u6570\uff0c\u8868\u793a\u9009\u62e9\u7684\u529f\u80fd\uff0c\u53d6\u503c\u8303\u56f4['rec']\uff0c\u8868\u793a**\u89c6\u9891\u8bc6\u522b**\uff08\u66f4\u591a\u529f\u80fd\u4f1a\u9646\u7eed\u52a0\u5165\uff09\u3002\n##### 1. 
\u8c03\u7528\u89c6\u9891\u8bc6\u522b\uff1a\n```bash\n# \u8c03\u7528PP-TSM\u8bc6\u522b\n./build/ppvideo rec \\\n--rec_model_dir=../../inference/ppTSM \\\n--inference_model_name=ppTSM \\\n--video_dir=./example_video_dir \\\n--num_seg=8 \\\n--seg_len=1\n# \u8c03\u7528PP-TSN\u8bc6\u522b\n./build/ppvideo rec \\\n--rec_model_dir=../../inference/ppTSN \\\n--inference_model_name=ppTSN \\\n--video_dir=./example_video_dir \\\n--num_seg=25 \\\n--seg_len=1\n```\n\u66f4\u591a\u53c2\u6570\u5982\u4e0b\uff1a\n- \u901a\u7528\u53c2\u6570\n | \u53c2\u6570\u540d\u79f0 | \u7c7b\u578b | \u9ed8\u8ba4\u53c2\u6570 | \u610f\u4e49 |\n | ------------- | ---- | --------------- | ------------------------------------------------------------ |\n | use_gpu | bool | false | \u662f\u5426\u4f7f\u7528GPU |" + }, + { + "comment": "This code snippet defines various parameters for video recognition model execution. It specifies GPU ID, requested GPU memory, CPU thread count for faster predictions on machines with sufficient cores, boolean values to enable mkldnn and tensorrt libraries, precision type for predictions (fp32/fp16/uint8), and a flag to start benchmarking during prediction. The video recognition model parameters include the path to the folder containing videos to be recognized.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":259-272", + "content": " | gpu_id | int | 0 | GPU id\uff0c\u4f7f\u7528GPU\u65f6\u6709\u6548 |\n | gpu_mem | int | 4000 | \u7533\u8bf7\u7684GPU\u5185\u5b58 |\n | cpu_threads | int | 10 | CPU\u9884\u6d4b\u65f6\u7684\u7ebf\u7a0b\u6570\uff0c\u5728\u673a\u5668\u6838\u6570\u5145\u8db3\u7684\u60c5\u51b5\u4e0b\uff0c\u8be5\u503c\u8d8a\u5927\uff0c\u9884\u6d4b\u901f\u5ea6\u8d8a\u5feb |\n | enable_mkldnn | bool | false | \u662f\u5426\u4f7f\u7528mkldnn\u5e93 |\n | use_tensorrt | bool | false | \u662f\u5426\u4f7f\u7528tensorrt\u5e93 |\n | precision | str | \"fp32\" | \u4f7f\u7528fp32/fp16/uint8\u7cbe\u5ea6\u6765\u9884\u6d4b |\n | benchmark | bool | true | \u9884\u6d4b\u65f6\u662f\u5426\u5f00\u542fbenchmark\uff0c\u5f00\u542f\u540e\u4f1a\u5728\u6700\u540e\u8f93\u51fa\u914d\u7f6e\u3001\u6a21\u578b\u3001\u8017\u65f6\u7b49\u4fe1\u606f\u3002 |\n- \u89c6\u9891\u8bc6\u522b\u6a21\u578b\u76f8\u5173\n | \u53c2\u6570\u540d\u79f0 | \u7c7b\u578b | \u9ed8\u8ba4\u53c2\u6570 | \u610f\u4e49 |\n | -------------- | ------ | --------------------------------------------- | ------------------------------------ |\n | video_dir | string | \"../example_video_dir\" | \u5b58\u653e\u5c06\u8981\u8bc6\u522b\u7684\u89c6\u9891\u7684\u6587\u4ef6\u5939\u8def\u5f84 |" + }, + { + "comment": "The code is configuring the model directory path, inference model name, number of video segments, length of each segment, batch size for prediction, and the file path containing class labels and names. 
An example input video is used to demonstrate how the code outputs the detected results on the screen, including video file, classification, and confidence score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":273-288", + "content": " | rec_model_dir | string | \"\" | \u5b58\u653e\u5bfc\u51fa\u7684\u9884\u6d4b\u6a21\u578b\u7684\u6587\u4ef6\u5939\u8def\u5f84 |\n | inference_model_name | string | \"ppTSM\" | \u9884\u6d4b\u6a21\u578b\u7684\u540d\u79f0 |\n | num_seg | int | 8 | \u89c6\u9891\u5206\u6bb5\u7684\u6bb5\u6570 |\n | seg_len | int | 1 | \u89c6\u9891\u6bcf\u6bb5\u62bd\u53d6\u7684\u5e27\u6570 |\n | rec_batch_num | int | 1 | \u6a21\u578b\u9884\u6d4b\u65f6\u7684batch size |\n | char_list_file | str | \"../../data/k400/Kinetics-400_label_list.txt\" | \u5b58\u653e\u6240\u6709\u7c7b\u522b\u6807\u53f7\u548c\u5bf9\u5e94\u540d\u5b57\u7684\u6587\u672c\u8def\u5f84 |\n\u200b\t\u4ee5example_video_dir\u4e0b\u7684\u6837\u4f8b\u89c6\u9891`example01.avi`\u4e3a\u8f93\u5165\u89c6\u9891\u4e3a\u4f8b\uff0c\u6700\u7ec8\u5c4f\u5e55\u4e0a\u4f1a\u8f93\u51fa\u68c0\u6d4b\u7ed3\u679c\u5982\u4e0b\u3002\n```bash\n[./inference/ppTSM]\n[./deploy/cpp_infer/example_video_dir]\ntotal videos num: 1\n./example_video_dir/example01.avi class: 5 archery score: 0.999556\nI1125 08:10:45.834288 13955 autolog.h:50] ----------------------- Config info -----------------------\nI1125 08:10:45.834458 13955 autolog.h:51] runtime_device: cpu" + }, + { + "comment": "This code configures the inference engine with options for optimizing IR, memory optimization, TensorRT and MKLDNN support. It also sets the number of CPU threads, displays data information (batch size, input shape, data count), model name and precision, and logs the total time spent for inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":289-303", + "content": "I1125 08:10:45.834467 13955 autolog.h:52] ir_optim: True\nI1125 08:10:45.834475 13955 autolog.h:53] enable_memory_optim: True\nI1125 08:10:45.834483 13955 autolog.h:54] enable_tensorrt: 0\nI1125 08:10:45.834518 13955 autolog.h:55] enable_mkldnn: False\nI1125 08:10:45.834525 13955 autolog.h:56] cpu_math_library_num_threads: 10\nI1125 08:10:45.834532 13955 autolog.h:57] ----------------------- Data info -----------------------\nI1125 08:10:45.834540 13955 autolog.h:58] batch_size: 1\nI1125 08:10:45.834547 13955 autolog.h:59] input_shape: dynamic\nI1125 08:10:45.834556 13955 autolog.h:60] data_num: 1\nI1125 08:10:45.834564 13955 autolog.h:61] ----------------------- Model info -----------------------\nI1125 08:10:45.834573 13955 autolog.h:62] model_name: rec\nI1125 08:10:45.834579 13955 autolog.h:63] precision: fp32\nI1125 08:10:45.834586 13955 autolog.h:64] ----------------------- Perf info ------------------------\nI1125 08:10:45.834594 13955 autolog.h:65] Total time spent(ms): 2739\nI1125 08:10:45.834602 13955 au" + }, + { + "comment": "This code snippet displays the preprocess time, inference time, and postprocess time for a certain task. It shows that the inference time is 1269.55ms and the postprocess time is 0.009118ms. The error message indicates a problem with finding the 'libcudnn.so' library due to an incorrect or missing CUDNN_LIB_DIR setting.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/readme.md\":303-323", + "content": "tolog.h:67] preprocess_time(ms): 10.6524, inference_time(ms): 1269.55, postprocess_time(ms): 0.009118\n```\n### 3 FAQ\n1. 
\u7f16\u8bd1demo\u8fc7\u7a0b\u4e2d\u51fa\u73b0\u4ee5\u4e0b\u9519\u8bef\n ```shell\n make[2]: *** No rule to make target '/usr/lib/x86_64-linux-gn/libcudnn.so', needed by 'ppvideo'. Stop.\n make[2]: *** Waiting for unfinished jobs....\n [ 16%] Building CXX object CMakeFiles/ppvideo.dir/src/main.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/preprocess_op.cpp.o\n [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/postprocess_op.cpp.o\n [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/utility.cpp.o\n [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/video_rec.cpp.o\n CMakeFiles/Makefile2:95: recipe for target 'CMakeFiles/ppvideo.dir/all' failed\n make[1]: *** [CMakeFiles/ppvideo.dir/all] Error 2\n Makefile:83: recipe for target 'all' failed\n make: *** [all] Error 2\n ```\n \u53ef\u80fd\u662f`CUDNN_LIB_DIR`\u8bbe\u7f6e\u7684\u4e0d\u5bf9\uff0c\u5bfc\u81f4\u627e\u4e0d\u5230\u8be5\u76ee\u5f55\u4e0b\u7684`libcudnn.so`\u3002" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f79d64da-db70-4480-9ad6-0bddaa5c4dde.json b/docs/doc/f79d64da-db70-4480-9ad6-0bddaa5c4dde.json new file mode 100644 index 000000000..6542de605 --- /dev/null +++ b/docs/doc/f79d64da-db70-4480-9ad6-0bddaa5c4dde.json @@ -0,0 +1,65 @@ +{ + "summary": "The code prepares the environment for training PaddleVideo models, builds a multigrid configuration, handles device and parallelism, trains the model, optimizes it using specified optimizer, logs progress/learning rate updates, evaluates performance, saves state, and saves model & optimizer.", + "details": [ + { + "comment": "The code snippet is the opening section of the file \"train_multigrid.py\" within the PaddleVideo library. It starts by declaring copyright, licensing information, and importing necessary modules. It also includes functions to build datasets, models, loaders, solvers, and utilities for logging, saving, and loading model parameters and progress. This section sets up the environment for training video models in the PaddleVideo framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":0-26", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport time\nimport os.path as osp\nimport paddle\nimport paddle.distributed as dist\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..modeling.builder import build_model\nfrom ..solver import build_lr, build_optimizer\nfrom ..utils import do_preciseBN\nfrom paddlevideo.utils import get_logger, coloring\nfrom paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch,\n save, load, mkdir)" + }, + { + "comment": "This function constructs data loaders for training a model with the multigrid approach. It takes several arguments including configuration (cfg), places to distribute the data, whether to use precise batch normalization (precise_bn), number of iterations for precise BN (num_iters_precise_bn), and world size. 
If precise BN is enabled, it adjusts the number of samples in the training dataset, creates a separate loader for precise BN, and sets the adjusted number of samples back to None. If not, it sets the precise BN loader to None. The code also checks if a short cycle multigrid approach is being used.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":27-49", + "content": "from paddlevideo.utils.multigrid import MultigridSchedule, aggregate_sub_bn_stats, subn_load, subn_save, is_eval_epoch\ndef construct_loader(cfg, places, validate, precise_bn, num_iters_precise_bn,\n world_size):\n batch_size = cfg.DATASET.get('batch_size', 2)\n train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))\n precise_bn_dataloader_setting = dict(\n batch_size=batch_size,\n num_workers=cfg.DATASET.get('num_workers', 0),\n places=places,\n )\n if precise_bn:\n cfg.DATASET.train.num_samples_precise_bn = num_iters_precise_bn * batch_size * world_size\n precise_bn_dataset = build_dataset((cfg.DATASET.train,\n cfg.PIPELINE.train))\n precise_bn_loader = build_dataloader(precise_bn_dataset,\n **precise_bn_dataloader_setting)\n cfg.DATASET.train.num_samples_precise_bn = None\n else:\n precise_bn_loader = None\n if cfg.MULTIGRID.SHORT_CYCLE:" + }, + { + "comment": "The code adjusts the batch size in a short cycle schedule based on target image size, multi-grid factors and default crop size. It then sets up a train_dataloader with these batch sizes and other parameters. If validate is True, it also builds a valid_dataset and valid_dataloader with the given configurations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":50-76", + "content": " # get batch size list in short cycle schedule\n bs_factor = [\n int(\n round((float(cfg.PIPELINE.train.transform[1]['MultiCrop'][\n 'target_size']) / (s * cfg.MULTIGRID.default_crop_size))\n **2)) for s in cfg.MULTIGRID.short_cycle_factors\n ]\n batch_sizes = [\n batch_size * bs_factor[0],\n batch_size * bs_factor[1],\n batch_size,\n ]\n train_dataloader_setting = dict(\n batch_size=batch_sizes,\n multigrid=True,\n num_workers=cfg.DATASET.get('num_workers', 0),\n places=places,\n )\n else:\n train_dataloader_setting = precise_bn_dataloader_setting\n train_loader = build_dataloader(train_dataset, **train_dataloader_setting)\n if validate:\n valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))\n validate_dataloader_setting = dict(\n batch_size=batch_size,\n num_workers=cfg.DATASET.get('num_workers', 0)," + }, + { + "comment": "This code is creating training and validation data loaders for a PaddleVideo model. It also builds the model, and if parallelization is enabled, it wraps the model with Paddle's DataParallel API to distribute computation across multiple GPUs. 
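The short-cycle batch sizes above follow a simple rule: when the crop is scaled down by a factor `s`, the batch grows by roughly `(target_size / (s * default_crop_size))**2`. A small worked sketch with illustrative numbers (the factors and sizes are examples, not values from a shipped config):

```python
def short_cycle_batch_sizes(base_bs, target_size, default_crop_size, factors):
    """Return per-cycle batch sizes for the short-cycle multigrid schedule."""
    bs_factor = [
        int(round((float(target_size) / (s * default_crop_size)) ** 2))
        for s in factors
    ]
    return [base_bs * bs_factor[0], base_bs * bs_factor[1], base_bs]

# e.g. with target_size=224, default_crop_size=224 and factors=[0.5, 0.7071],
# the half-resolution cycle trains with roughly 4x the base batch size:
print(short_cycle_batch_sizes(8, 224, 224, [0.5, 0.7071]))  # [32, 16, 8]
```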
The function returns the trained model, its optimizer, and the various data loaders required for training and validation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":77-109", + "content": " places=places,\n drop_last=False,\n shuffle=False)\n valid_loader = build_dataloader(valid_dataset,\n **validate_dataloader_setting)\n else:\n valid_loader = None\n return train_loader, valid_loader, precise_bn_loader\ndef build_trainer(cfg, places, parallel, validate, precise_bn,\n num_iters_precise_bn, world_size):\n \"\"\"\n Build training model and its associated tools, including optimizer,\n dataloaders and meters.\n Args:\n cfg (CfgNode): configs.\n Returns:\n model: training model.\n optimizer: optimizer.\n train_loader: training data loader.\n val_loader: validatoin data loader.\n precise_bn_loader: training data loader for computing\n precise BN.\n \"\"\"\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n train_loader, valid_loader, precise_bn_loader = \\\n construct_loader(cfg,\n places," + }, + { + "comment": "This code initializes a multigrid training configuration and builds the model, learning rate, optimizer, and loaders for training, validation, and precise Batch Normalization. It also includes an optional multigrid schedule for long or short cycles if specified in the configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":110-145", + "content": " validate,\n precise_bn,\n num_iters_precise_bn,\n world_size,\n )\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model)\n return (\n model,\n lr,\n optimizer,\n train_loader,\n valid_loader,\n precise_bn_loader,\n )\ndef train_model_multigrid(cfg, world_size=1, validate=True):\n \"\"\"Train model entry\n Args:\n \tcfg (dict): configuration.\n \tparallel (bool): Whether multi-card training. Default: True\n validate (bool): Whether to do evaluation. Default: False.\n \"\"\"\n # Init multigrid.\n multigrid = None\n if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:\n multigrid = MultigridSchedule()\n cfg = multigrid.init_multigrid(cfg)\n if cfg.MULTIGRID.LONG_CYCLE:\n cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)\n multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule]" + }, + { + "comment": "This code sets the device (npu, xpu or gpu) based on configuration and creates the model, dataloaders for training, validation, and precise BN if needed. It also initializes a logger and handles distributed training using parallel models and dataloaders.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":147-178", + "content": " parallel = world_size != 1\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DATASET.get('batch_size', 2)\n if cfg.get('use_npu', False):\n places = paddle.set_device('npu')\n elif cfg.get('use_xpu', False):\n places = paddle.set_device('xpu')\n else:\n places = paddle.set_device('gpu')\n model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n mkdir(output_dir)\n local_rank = dist.ParallelEnv().local_rank\n precise_bn = cfg.get(\"PRECISEBN\")\n num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN\n # 1. Construct model\n model = build_model(cfg.MODEL)\n if parallel:\n model = paddle.DataParallel(model)\n # 2. 
Construct dataloader\n train_loader, valid_loader, precise_bn_loader = \\\n construct_loader(cfg,\n places,\n validate,\n precise_bn,\n num_iters_precise_bn,\n world_size,\n )" + }, + { + "comment": "Constructing the optimizer, resuming training from a previous checkpoint if specified in the config file, and updating the long cycle configuration for multi-grid training.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":180-209", + "content": " # 3. Construct optimizer\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(\n cfg.OPTIMIZER, lr, parameter_list=model.parameters())\n # Resume\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(\n output_dir,\n model_name + str(local_rank) + '_' + f\"{resume_epoch:05d}\")\n subn_load(model, filename, optimizer)\n # 4. Train Model\n best = 0.\n total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor)\n for epoch in range(total_epochs):\n if epoch < resume_epoch:\n logger.info(\n f\"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... \"\n )\n continue\n if cfg.MULTIGRID.LONG_CYCLE:\n cfg, changed = multigrid.update_long_cycle(cfg, epoch)\n if changed:\n logger.info(\"====== Rebuild model/optimizer/loader =====\")\n (\n model,\n lr," + }, + { + "comment": "The code builds a trainer with specified configurations, optimizer, train and validation loaders. It loads checkpoints if the epoch is not zero and updates the learning rate for the next epoch before training the model on the given data.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":210-234", + "content": " optimizer,\n train_loader,\n valid_loader,\n precise_bn_loader,\n ) = build_trainer(cfg, places, parallel, validate, precise_bn,\n num_iters_precise_bn, world_size)\n #load checkpoint after re-build model\n if epoch != 0:\n #epoch no need to -1, haved add 1 when save\n filename = osp.join(\n output_dir,\n model_name + str(local_rank) + '_' + f\"{(epoch):05d}\")\n subn_load(model, filename, optimizer)\n #update lr last epoch, not to use saved params\n lr.last_epoch = epoch\n lr.step(rebuild=True)\n model.train()\n record_list = build_record(cfg.MODEL)\n tic = time.time()\n for i, data in enumerate(train_loader):\n record_list['reader_time'].update(time.time() - tic)\n # 4.1 forward\n outputs = model(data, mode='train')" + }, + { + "comment": "Performing backward pass, optimizing using given optimizer, logging progress, and updating learning rate in both iteration step and epoch step.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":235-261", + "content": " # 4.2 backward\n avg_loss = outputs['loss']\n avg_loss.backward()\n # 4.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(\n float(optimizer._global_learning_rate()), batch_size)\n for name, value in outputs.items():\n record_list[name].update(float(value), batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, total_epochs, \"train\", ips)\n # learning rate iter step\n if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step\n if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()" + }, + { 
+ "comment": "This code snippet evaluates the model's performance during training and updates the record list with new values. It also logs the progress at certain intervals, displaying the number of instances processed per second (ips). The function 'evaluate' is called to perform this evaluation for each data batch in the valid_loader, updating the record list accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":263-287", + "content": " ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n def evaluate(best):\n model.eval()\n record_list = build_record(cfg.MODEL)\n record_list.pop('lr')\n tic = time.time()\n for i, data in enumerate(valid_loader):\n outputs = model(data, mode='valid')\n # log_record\n for name, value in outputs.items():\n record_list[name].update(float(value), batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, total_epochs, \"val\",\n ips)" + }, + { + "comment": "The code calculates the instantaneous processing speed (ips) and checks if a new best performance has been achieved. It then logs this information. If it's an evaluation epoch, it performs precise batch normalization, aggregates sub-batch normalization stats, and validates the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":289-312", + "content": " ips = \"ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"val\", ips)\n best_flag = False\n if record_list.get('top1') and record_list['top1'].avg > best:\n best = record_list['top1'].avg\n best_flag = True\n return best, best_flag\n # use precise bn to improve acc\n if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):\n logger.info(f\"do precise BN in {epoch+1} ...\")\n do_preciseBN(model, precise_bn_loader, parallel,\n min(num_iters_precise_bn, len(precise_bn_loader)))\n # aggregate sub_BN stats\n logger.info(\"Aggregate sub_BatchNorm stats...\")\n aggregate_sub_bn_stats(model)\n # 5. Validation\n if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):\n logger.info(f\"eval in {epoch+1} ...\")" + }, + { + "comment": "The code saves the best model if it outperforms previous results, and periodically saves the current model parameters during training. It uses the evaluate function to measure performance, the save function to store state dictionaries, and the subn_save function for saving models and optimizers at certain epochs. The logger is used for informative messages about saving and training completion.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train_multigrid.py\":313-334", + "content": " with paddle.no_grad():\n best, save_best_flag = evaluate(best)\n # save best\n if save_best_flag:\n save(optimizer.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdopt\"))\n save(model.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdparams\"))\n logger.info(\n f\"Already save the best model (top1 acc){int(best * 10000) / 10000}\"\n )\n # 6. 
Save model and optimizer\n if is_eval_epoch(\n cfg, epoch,\n total_epochs, multigrid.schedule) or epoch % cfg.get(\n \"save_interval\", 10) == 0 or epoch in multi_save_epoch:\n logger.info(\"[Save parameters] ======\")\n subn_save(output_dir, model_name + str(local_rank) + '_', epoch + 1,\n model, optimizer)\n logger.info(f'training {model_name} finished')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f7bc72bd-4574-4ec4-9c02-69b414705b8b.json b/docs/doc/f7bc72bd-4574-4ec4-9c02-69b414705b8b.json new file mode 100644 index 000000000..afde59e57 --- /dev/null +++ b/docs/doc/f7bc72bd-4574-4ec4-9c02-69b414705b8b.json @@ -0,0 +1,95 @@ +{ + "summary": "The code utilizes PaddlePaddle's Fleet API for distributed training, defines models/metrics, and uses AMP to speed up gradient descent via DataParallel. It logs performance data, evaluates using PaddleVideo, saves the best model/optimizer, and periodically saves state during training.", + "details": [ + { + "comment": "The code imports necessary libraries, defines functions to build data loaders, datasets, models, and metrics using a builder pattern. It also includes functions for logging progress and saving results. The code is licensed under the Apache License 2.0, and it might be part of a larger framework or application dealing with video analysis tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":0-26", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os.path as osp\nimport time\nimport paddle\nimport paddle.amp as amp\nimport paddle.distributed as dist\nimport paddle.distributed.fleet as fleet\nfrom paddlevideo.utils import (add_profiler_step, build_record, get_logger,\n load, log_batch, log_epoch, mkdir, save)\nfrom ..loader.builder import build_dataloader, build_dataset\nfrom ..metrics.ava_utils import collect_results_cpu\nfrom ..modeling.builder import build_model" + }, + { + "comment": "The code defines a train_model function for training the model using given configuration (cfg). It takes optional arguments like weights path, parallel training flag, validation enablement, automatic mixed precision usage, and more.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":27-50", + "content": "from ..solver import build_lr, build_optimizer\nfrom ..utils import do_preciseBN\ndef train_model(cfg,\n weights=None,\n parallel=True,\n validate=True,\n use_amp=False,\n amp_level=None,\n max_iters=None,\n use_fleet=False,\n profiler_options=None):\n \"\"\"Train model entry\n Args:\n cfg (dict): configuration.\n weights (str, optional): weights path for finetuning. Defaults to None.\n parallel (bool, optional): whether multi-cards training. Defaults to True.\n validate (bool, optional): whether to do evaluation. Defaults to True.\n use_amp (bool, optional): whether to use automatic mixed precision during training. 
Defaults to False.\n amp_level (str, optional): amp optmization level, must be 'O1' or 'O2' when use_amp is True. Defaults to None.\n max_iters (int, optional): max running iters in an epoch. Defaults to None.\n use_fleet (bool, optional): whether to use fleet. Defaults to False." + }, + { + "comment": "This code sets up gradient accumulation and global batch size for distributed training using PaddlePaddle's Fleet API. It retrieves batch and validation batch sizes from the configuration, then checks if gradient accumulation is enabled and the world size of the distributed setup. If so, it calculates the global batch size based on these settings and asserts that global_batch_size is greater than the current batch size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":51-74", + "content": " profiler_options (str, optional): configuration for the profiler function. Defaults to None.\n \"\"\"\n if use_fleet:\n fleet.init(is_collective=True)\n logger = get_logger(\"paddlevideo\")\n batch_size = cfg.DATASET.get('batch_size', 8)\n valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size)\n # gradient accumulation settings\n use_gradient_accumulation = cfg.get('GRADIENT_ACCUMULATION', None)\n if use_gradient_accumulation and dist.get_world_size() >= 1:\n global_batch_size = cfg.GRADIENT_ACCUMULATION.get(\n 'global_batch_size', None)\n num_gpus = dist.get_world_size()\n assert isinstance(\n global_batch_size, int\n ), f\"global_batch_size must be int, but got {type(global_batch_size)}\"\n assert batch_size <= global_batch_size, \\\n f\"global_batch_size({global_batch_size}) must not be less than batch_size({batch_size})\"\n cur_global_batch_size = batch_size * num_gpus # The number of batches calculated by all GPUs at one time" + }, + { + "comment": "The code ensures the global batch size is divisible by cur_global_batch_size, sets the number of iterations needed to reach the global batch size, and sets the device type (NPU, XPU, or GPU) based on config values. It also allows for setting the number of workers for training and validation data loading.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":75-95", + "content": " assert global_batch_size % cur_global_batch_size == 0, \\\n f\"The global batchsize({global_batch_size}) must be divisible by cur_global_batch_size({cur_global_batch_size})\"\n cfg.GRADIENT_ACCUMULATION[\n \"num_iters\"] = global_batch_size // cur_global_batch_size\n # The number of iterations required to reach the global batchsize\n logger.info(\n f\"Using gradient accumulation training strategy, \"\n f\"global_batch_size={global_batch_size}, \"\n f\"num_gpus={num_gpus}, \"\n f\"num_accumulative_iters={cfg.GRADIENT_ACCUMULATION.num_iters}\")\n if cfg.get('use_npu', False):\n places = paddle.set_device('npu')\n elif cfg.get('use_xpu', False):\n places = paddle.set_device('xpu')\n else:\n places = paddle.set_device('gpu')\n # default num worker: 0, which means no subprocess will be created\n num_workers = cfg.DATASET.get('num_workers', 0)\n valid_num_workers = cfg.DATASET.get('valid_num_workers', num_workers)" + }, + { + "comment": "Code snippet builds a model, creates dataset and dataloader for training and validation, and optionally converts the model to static using Paddle.jit.to_static(). 
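The gradient-accumulation bookkeeping described above reduces to one division plus two sanity checks. A standalone sketch (variable names mirror the excerpt; `dist.get_world_size()` is replaced by an explicit `num_gpus` argument):

```python
def accumulation_iters(global_batch_size, batch_size, num_gpus):
    """Number of micro-batches to accumulate before each optimizer step."""
    assert isinstance(global_batch_size, int), \
        f"global_batch_size must be int, but got {type(global_batch_size)}"
    assert batch_size <= global_batch_size
    cur_global_batch_size = batch_size * num_gpus  # samples per step across all GPUs
    assert global_batch_size % cur_global_batch_size == 0
    return global_batch_size // cur_global_batch_size

# e.g. a global batch of 512 on 4 GPUs with per-GPU batch 16 -> accumulate 8 iters
print(accumulation_iters(512, 16, 4))  # 8
```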
It saves the output in the specified directory and logs if @to_static is applied successfully.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":96-123", + "content": " model_name = cfg.model_name\n output_dir = cfg.get(\"output_dir\", f\"./output/{model_name}\")\n mkdir(output_dir)\n # 1. Construct model\n model = build_model(cfg.MODEL)\n if cfg.get('to_static', False):\n specs = None\n model = paddle.jit.to_static(model, input_spec=specs)\n logger.info(\n \"Successfully to apply @to_static with specs: {}\".format(specs))\n # 2. Construct dataset and dataloader for training and evaluation\n train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train))\n train_dataloader_setting = dict(\n batch_size=batch_size,\n num_workers=num_workers,\n collate_fn_cfg=cfg.get('MIX', None),\n places=places)\n train_loader = build_dataloader(train_dataset, **train_dataloader_setting)\n if validate:\n valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid))\n validate_dataloader_setting = dict(\n batch_size=valid_batch_size,\n num_workers=valid_num_workers,\n places=places," + }, + { + "comment": "This code is setting up a training process for the PaddleVideo framework. It first creates train and validation dataloaders with specified settings, then constructs a learning rate scheduler and optimizer based on provided configurations. Optionally, it converts model parameters to fp16 using AMP if needed.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":124-149", + "content": " drop_last=False,\n shuffle=cfg.DATASET.get(\n 'shuffle_valid',\n False) # NOTE: attention_LSTM needs to shuffle valid data.\n )\n valid_loader = build_dataloader(valid_dataset,\n **validate_dataloader_setting)\n # 3. Construct learning rate scheduler(lr) and optimizer\n lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))\n optimizer = build_optimizer(\n cfg.OPTIMIZER, lr, model=model, use_amp=use_amp, amp_level=amp_level)\n # 4. Construct scalar and convert parameters for amp(optional)\n if use_amp:\n scaler = amp.GradScaler(\n init_loss_scaling=2.0**16,\n incr_every_n_steps=2000,\n decr_every_n_nan_or_inf=1)\n # convert model parameters to fp16 when amp_level is O2(pure fp16)\n model, optimizer = amp.decorate(\n models=model,\n optimizers=optimizer,\n level=amp_level,\n master_weight=True,\n save_dtype=None)" + }, + { + "comment": "The code checks if training in amp mode or fp32 mode. If in amp mode, it asserts that the amp_level is not None and logs the current level. If in fp32 mode, it asserts that amp_level is None and logs the mode. It then handles optional resume and finetuning steps if specified by loading model weights from a file, setting the model state dictionary to the loaded dictionary, and logging the checkpoint used.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":150-171", + "content": " # NOTE: save_dtype is set to float32 now.\n logger.info(f\"Training in amp mode, amp_level={amp_level}.\")\n else:\n assert amp_level is None, f\"amp_level must be None when training in fp32 mode, but got {amp_level}.\"\n logger.info(\"Training in fp32 mode.\")\n # 5. 
Resume(optional)\n resume_epoch = cfg.get(\"resume_epoch\", 0)\n if resume_epoch:\n filename = osp.join(output_dir,\n model_name + f\"_epoch_{resume_epoch:05d}\")\n resume_model_dict = load(filename + '.pdparams')\n resume_opt_dict = load(filename + '.pdopt')\n model.set_state_dict(resume_model_dict)\n optimizer.set_state_dict(resume_opt_dict)\n logger.info(\"Resume from checkpoint: {}\".format(filename))\n # 6. Finetune(optional)\n if weights:\n assert resume_epoch == 0, f\"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it.\"\n model_dict = load(weights)\n model.set_state_dict(model_dict)" + }, + { + "comment": "The code finetunes a model from a specified checkpoint. It optionally parallelizes the training process using Paddle's DataParallel API and Fleet distributed computing for further optimization. The code trains the model for a specified number of epochs, continuing from a previous resume_epoch if needed. Performance information is collected when profiler options are activated.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":172-203", + "content": " logger.info(\"Finetune from checkpoint: {}\".format(weights))\n # 7. Parallelize(optional)\n if parallel:\n model = paddle.DataParallel(model)\n if use_fleet:\n model = fleet.distributed_model(model)\n optimizer = fleet.distributed_optimizer(optimizer)\n # 8. Train Model\n best = 0.0\n for epoch in range(0, cfg.epochs):\n if epoch < resume_epoch:\n logger.info(\n f\"| epoch: [{epoch + 1}] <= resume_epoch: [{resume_epoch}], continue...\"\n )\n continue\n model.train()\n record_list = build_record(cfg.MODEL)\n tic = time.time()\n for i, data in enumerate(train_loader):\n \"\"\"Next two line of code only used in test_tipc,\n ignore it most of the time\"\"\"\n if max_iters is not None and i >= max_iters:\n break\n record_list['reader_time'].update(time.time() - tic)\n # Collect performance information when profiler_options is activate\n add_profiler_step(profiler_options)" + }, + { + "comment": "Applies Automatic Mixed Precision (AMP) for faster training, calculates average loss, performs gradient accumulation, and scales backpropagation to reduce memory usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":205-228", + "content": " # 8.1 forward\n # AMP #\n if use_amp:\n with amp.auto_cast(\n custom_black_list={\"reduce_mean\", \"conv3d\"},\n level=amp_level):\n outputs = model(data, mode='train')\n avg_loss = outputs['loss']\n if use_gradient_accumulation:\n # clear grad at when epoch begins\n if i == 0:\n optimizer.clear_grad()\n # Loss normalization\n avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters\n # Loss scaling\n scaled = scaler.scale(avg_loss)\n # 8.2 backward\n scaled.backward()\n # 8.3 minimize\n if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0:\n scaler.minimize(optimizer, scaled)\n optimizer.clear_grad()\n else: # general case\n # Loss scaling" + }, + { + "comment": "This code calculates the average loss, scales it if necessary, performs backward pass, and applies gradient descent to minimize the loss. 
If gradient accumulation is used, the gradients are cleared at the start of each epoch and after every accumulated number of iterations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":229-252", + "content": " scaled = scaler.scale(avg_loss)\n # 8.2 backward\n scaled.backward()\n # 8.3 minimize\n scaler.minimize(optimizer, scaled)\n optimizer.clear_grad()\n else:\n outputs = model(data, mode='train')\n avg_loss = outputs['loss']\n if use_gradient_accumulation:\n # clear grad at when epoch begins\n if i == 0:\n optimizer.clear_grad()\n # Loss normalization\n avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters\n # 8.2 backward\n avg_loss.backward()\n # 8.3 minimize\n if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0:\n optimizer.step()\n optimizer.clear_grad()\n else: # general case\n # 8.2 backward\n avg_loss.backward()" + }, + { + "comment": "Optimizer step and gradient clearance followed by logging records, updating logs, calculating instantaneous performance (ips), determining progress and estimated time of arrival (eta), and calling log_batch function.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":253-276", + "content": " # 8.3 minimize\n optimizer.step()\n optimizer.clear_grad()\n # log record\n record_list['lr'].update(optimizer.get_lr(), batch_size)\n for name, value in outputs.items():\n if name in record_list:\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec,\".format(\n batch_size / record_list[\"batch_time\"].val)\n cur_progress = ((i + 1) + epoch * len(train_loader)) / (\n len(train_loader) * cfg.epochs)\n eta = int(record_list[\"batch_time\"].sum *\n (1 - cur_progress) / cur_progress + 0.5)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"train\", ips,\n eta)\n # learning rate iter step" + }, + { + "comment": "This code snippet is from the PaddleVideo library and it contains code for training a model. It uses an optimizer with a learning rate that can be stepped based on whether it's an iterative step or not. After performing an epoch, it logs the average instances per second processed. The code then evaluates the model by setting it to evaluation mode and collecting test results using a record list. It also records the time taken for testing in 'tic'.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":277-305", + "content": " if cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n # learning rate epoch step\n if not cfg.OPTIMIZER.learning_rate.get(\"iter_step\"):\n lr.step()\n ips = \"avg_ips: {:.5f} instance/sec.\".format(\n batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"train\", ips)\n def evaluate(best):\n model.eval()\n results = []\n record_list = build_record(cfg.MODEL)\n record_list.pop('lr')\n tic = time.time()\n if parallel:\n rank = dist.get_rank()\n # single_gpu_test and multi_gpu_test\n for i, data in enumerate(valid_loader):\n \"\"\"Next two line of code only used in test_tipc,\n ignore it most of the time\"\"\"\n if max_iters is not None and i >= max_iters:\n break\n if use_amp:\n with amp.auto_cast(" + }, + { + "comment": "This code snippet is from the PaddleVideo library and appears to be handling model training for a specific framework. 
It calculates outputs, updates records for non-FastRCNN models, logs batch information, and handles FastRCNN-specific operations. The code also includes functionality for updating batch time and logging progress at regular intervals.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":306-329", + "content": " custom_black_list={\"reduce_mean\", \"conv3d\"},\n level=amp_level):\n outputs = model(data, mode='valid')\n else:\n outputs = model(data, mode='valid')\n if cfg.MODEL.framework == \"FastRCNN\":\n results.extend(outputs)\n # log_record\n if cfg.MODEL.framework != \"FastRCNN\":\n for name, value in outputs.items():\n if name in record_list:\n record_list[name].update(value, batch_size)\n record_list['batch_time'].update(time.time() - tic)\n tic = time.time()\n if i % cfg.get(\"log_interval\", 10) == 0:\n ips = \"ips: {:.5f} instance/sec.\".format(\n valid_batch_size / record_list[\"batch_time\"].val)\n log_batch(record_list, i, epoch + 1, cfg.epochs, \"val\", ips)\n if cfg.MODEL.framework == \"FastRCNN\":" + }, + { + "comment": "Code section checks if parallel processing is enabled, collects results for CPU, and evaluates the dataset. It calculates average instance processing speed and logs it. If using specific models like FastRCNN or YOWOLocalizer, compares current performance metrics with the best values achieved so far and returns them along with a flag indicating if a new best value was found.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":330-350", + "content": " if parallel:\n results = collect_results_cpu(results, len(valid_dataset))\n if not parallel or (parallel and rank == 0):\n eval_res = valid_dataset.evaluate(results)\n for name, value in eval_res.items():\n record_list[name].update(value, valid_batch_size)\n ips = \"avg_ips: {:.5f} instance/sec.\".format(\n valid_batch_size * record_list[\"batch_time\"].count /\n record_list[\"batch_time\"].sum)\n log_epoch(record_list, epoch + 1, \"val\", ips)\n best_flag = False\n if cfg.MODEL.framework == \"FastRCNN\" and (not parallel or\n (parallel and rank == 0)):\n if record_list[\"mAP@0.5IOU\"].val > best:\n best = record_list[\"mAP@0.5IOU\"].val\n best_flag = True\n return best, best_flag\n if cfg.MODEL.framework == \"YOWOLocalizer\" and (not parallel or" + }, + { + "comment": "This code is updating the best value and flag based on various metrics (fscore, hit_at_one, top1, rmse, F1@0.50) in a parallel setting with rank 0. 
It also checks if using precise batch normalization improves accuracy every 'preciseBN_interval' epochs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":351-372", + "content": " (parallel and rank == 0)):\n if record_list[\"fscore\"].avg > best:\n best = record_list[\"fscore\"].avg\n best_flag = True\n return best, best_flag\n # forbest2, cfg.MODEL.framework != \"FastRCNN\":\n for top_flag in ['hit_at_one', 'top1', 'rmse', \"F1@0.50\"]:\n if record_list.get(top_flag):\n if top_flag != 'rmse' and record_list[top_flag].avg > best:\n best = record_list[top_flag].avg\n best_flag = True\n elif top_flag == 'rmse' and (\n best == 0.0 or record_list[top_flag].avg < best):\n best = record_list[top_flag].avg\n best_flag = True\n return best, best_flag\n # use precise bn to improve acc\n if cfg.get(\"PRECISEBN\") and (\n epoch % cfg.PRECISEBN.preciseBN_interval == 0" + }, + { + "comment": "This code block is responsible for the precise Batch Normalization and validation steps in a deep learning training process. It applies PreciseBN for specific number of iterations, performs validation every \"val_interval\" epochs or at the last epoch, saves best model state if validation accuracy improves, and handles model saving differently depending on the framework used (Distillation vs others).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":373-394", + "content": " or epoch == cfg.epochs - 1):\n do_preciseBN(model, train_loader, parallel,\n min(cfg.PRECISEBN.num_iters_preciseBN,\n len(train_loader)), use_amp, amp_level)\n # 9. Validation\n if validate and (epoch % cfg.get(\"val_interval\", 1) == 0\n or epoch == cfg.epochs - 1):\n with paddle.no_grad():\n best, save_best_flag = evaluate(best)\n # save best\n if save_best_flag:\n save(optimizer.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdopt\"))\n save_student_model_flag = True if \"Distillation\" in cfg.MODEL.framework else False\n save(\n model.state_dict(),\n osp.join(output_dir, model_name + \"_best.pdparams\"),\n save_student_model=save_student_model_flag)\n if model_name == \"AttentionLstm\":\n logger.info(\n f\"Already save the best model (hit_at_one){best}\")" + }, + { + "comment": "This code block checks the current model framework and logs the metric used to identify the best model saved, followed by saving the best model and optimizer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":395-416", + "content": " elif cfg.MODEL.framework == \"FastRCNN\":\n logger.info(\n f\"Already save the best model (mAP@0.5IOU){int(best * 10000) / 10000}\"\n )\n elif cfg.MODEL.framework == \"DepthEstimator\":\n logger.info(\n f\"Already save the best model (rmse){int(best * 10000) / 10000}\"\n )\n elif cfg.MODEL.framework in ['MSTCN', 'ASRF']:\n logger.info(\n f\"Already save the best model (F1@0.50){int(best * 10000) / 10000}\"\n )\n elif cfg.MODEL.framework in ['YOWOLocalizer']:\n logger.info(\n f\"Already save the best model (fsocre){int(best * 10000) / 10000}\"\n )\n else:\n logger.info(\n f\"Already save the best model (top1 acc){int(best * 10000) / 10000}\"\n )\n # 10. Save model and optimizer" + }, + { + "comment": "This code saves the optimizer and model state dictionaries at specific intervals during training. The optimizer state is saved with a .pdopt extension and the model state is saved with a .pdparams extension. 
This occurs if the current epoch is either divisible by the save_interval or is the final epoch, to preserve progress during training. Finally, it logs that training for the specified model has finished.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/tasks/train.py\":417-425", + "content": " if epoch % cfg.get(\"save_interval\", 1) == 0 or epoch == cfg.epochs - 1:\n save(optimizer.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch + 1:05d}.pdopt\"))\n save(model.state_dict(),\n osp.join(output_dir,\n model_name + f\"_epoch_{epoch + 1:05d}.pdparams\"))\n logger.info(f'training {model_name} finished')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f907552f-8df7-4c75-ab4a-cc979ff2cd10.json b/docs/doc/f907552f-8df7-4c75-ab4a-cc979ff2cd10.json new file mode 100644 index 000000000..0496ae484 --- /dev/null +++ b/docs/doc/f907552f-8df7-4c75-ab4a-cc979ff2cd10.json @@ -0,0 +1,10 @@ +{ + "summary": "The code is a part of PaddleVideo's Video Quality Assessment application. It defines and registers several types of models including backbones, heads, recognizers, localizers, and losses using a registry system for easier management and organization.", + "details": [ + { + "comment": "The code is a part of PaddleVideo's Video Quality Assessment application. It defines and registers several types of models including backbones, heads, recognizers, localizers, and losses using a registry system for easier management and organization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py\":0-22", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nfrom ..utils import Registry\nBACKBONES = Registry('backbone')\nHEADS = Registry('head')\nRECOGNIZERS = Registry('recognizer')\nLOCALIZERS = Registry('localizer')\nLOSSES = Registry('loss')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f9491c07-0bd8-4d14-b486-087bb091307f.json b/docs/doc/f9491c07-0bd8-4d14-b486-087bb091307f.json new file mode 100644 index 000000000..3573d99b3 --- /dev/null +++ b/docs/doc/f9491c07-0bd8-4d14-b486-087bb091307f.json @@ -0,0 +1,20 @@ +{ + "summary": "YouTube-8M is a large video classification dataset containing over 8 million URLs and covers more than 3800 knowledge graph entities. The code splits the pkl files into smaller files for easier processing.", + "details": [ + { + "comment": "English | [\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/youtube8m.md)\n# YouTube-8M Data Preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n- [Conversion](#Conversion)\n## Introduction\nYouTube-8M is a large-scale video classification data set, containing more than 8 million video URLs. The tag system covers more than 3800 knowledge graph entities. 
One video corresponds to multiple tags (3-4 on average) and is labeled by machine.\n**The length of each video is between 120s and 500s\nDue to the large amount of video data, the image classification model was used to extract frame-level features in advance, and PCA was used to reduce the dimensionality of the features to obtain multi-frame 1024-dimensional features. Similarly, the audio model was used to obtain multi-frame 128-dimensional features. Audio characteristics. **\n> The dataset used here is the updated YouTube-8M data set in 2018 (May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features).\n## Download\n1. Create a new directory for storing features (take the PaddleVideo directory as an example)", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/youtube8m.md\":0-19", + "content": "English | [\u7b80\u4f53\u4e2d\u6587](../../zh-CN/dataset/youtube8m.md)\n# YouTube-8M Data Preparation\n- [Introduction](#Introduction)\n- [Download](#Download)\n- [Conversion](#Conversion)\n## Introduction\nYouTube-8M is a large-scale video classification data set, containing more than 8 million video URLs. The tag system covers more than 3800 knowledge graph entities. One video corresponds to multiple tags (3-4 on average) and is labeled by machine.\n**The length of each video is between 120s and 500s\nDue to the large amount of video data, the image classification model was used to extract frame-level features in advance, and PCA was used to reduce the dimensionality of the features to obtain multi-frame 1024-dimensional features. Similarly, the audio model was used to obtain multi-frame 128-dimensional features. Audio characteristics. **\n> The dataset used here is the updated YouTube-8M data set in 2018 (May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features).\n## Download\n1. Create a new directory for storing features (take the PaddleVideo directory as an example)" + }, + { + "comment": "Creates a frame directory, downloads the training and validation sets to it using curl, installs TensorFlow for reading TFRecord data, then converts the TFRecord files to pickle format for PaddlePaddle usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/youtube8m.md\":20-43", + "content": " ```bash\n cd data/yt8m\n mkdir frame\n cd frame\n ```\n2. Download the training and validation set to the frame folder\n ```bash\n curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python\n curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python\n ```\n The download process is shown in the figure\n ![image](https://user-images.githubusercontent.com/23737287/140709613-1e2d6ec0-a82e-474d-b220-7803065b0153.png)\n After the data download is complete, you will get 3844 training data files and 3844 verification data files (TFRecord format)\n## Conversion\n1. Install tensorflow to read tfrecord data\n ```bash\n python3.7 -m pip install tensorflow-gpu==1.14.0\n ```\n2. Convert the downloaded TFRecord file into a pickle file for PaddlePaddle to use\n ```bash\n cd .. # From the frame directory back to the yt8m directory\n python3.7 tf2pkl.py ./frame ./pkl_frame/ # Convert train*.tfrecord and validate*.tfrecord in the frame folder to pkl format" + }, + { + "comment": "This code generates a single pkl file path set and splits the pkl files into smaller files based on given file lists. 
It first writes the paths of \"train*.pkl\" and \"validate*.pkl\" to \"train.list\" and \"val.list\" respectively. Then, it uses the \"split_yt8m.py\" script to split each \"train*.pkl\" into multiple \"train*_split*.pkl\" files and each \"validate*.pkl\" into multiple \"validate*_split*.pkl\" files. Finally, it rewrites the paths of the smaller pkl files back into \"train.list\" and \"val.list\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/english_documents/dataset/youtube8m.md\":44-55", + "content": " ```\n3. Generate a single pkl file path set, and split pkl into multiple small pkl files based on this file, and generate the final split pkl file path required\n ```bash\n ls pkl_frame/train*.pkl> train.list # Write the path of train*.pkl to train.list\n ls pkl_frame/validate*.pkl> val.list # Write the path of validate*.pkl into val.list\n python3.7 split_yt8m.py train.list # Split each train*.pkl into multiple train*_split*.pkl\n python3.7 split_yt8m.py val.list # Split each validate*.pkl into multiple validate*_split*.pkl\n ls pkl_frame/train*_split*.pkl> train.list # Rewrite the path of train*_split*.pkl into train.list\n ls pkl_frame/validate*_split*.pkl> val.list # Rewrite the path of validate*_split*.pkl into val.list\n ``` " + } + ] +} \ No newline at end of file diff --git a/docs/doc/f94edaed-272f-46c5-92f6-29e73f7a0e9a.json b/docs/doc/f94edaed-272f-46c5-92f6-29e73f7a0e9a.json new file mode 100644 index 000000000..d8debcdcc --- /dev/null +++ b/docs/doc/f94edaed-272f-46c5-92f6-29e73f7a0e9a.json @@ -0,0 +1,15 @@ +{ + "summary": "The code imports various metrics from different modules for video analysis and evaluation, including AVAMetric, VOSMetric, BMNMetric, MSRVTTMetric, SkeletonMetric, TransNetV2Metric, DepthMetric, CenterCropMetric, MultiCropMetric, HitOneMetric, and SegmentationMetric. It also imports the METRIC registry for managing these metrics.", + "details": [ + { + "comment": "The code is importing various metrics from different metric classes for video analysis and evaluation. It includes BMNMetric, MSRVTTMetric, SkeletonMetric, TransNetV2Metric, DepthMetric, CenterCropMetric, MultiCropMetric, HitOneMetric, and SegmentationMetric. The METRIC registry is also imported for managing these metrics.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/__init__.py\":0-24", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .bmn_metric import BMNMetric\nfrom .build import build_metric\nfrom .center_crop_metric import CenterCropMetric\nfrom .depth_metric import DepthMetric\nfrom .msrvtt_metric import MSRVTTMetric\nfrom .multi_crop_metric import MultiCropMetric\nfrom .registry import METRIC\nfrom .skeleton_metric import SkeletonMetric\nfrom .transnetv2_metric import TransNetV2Metric\nfrom .youtube8m.eval_util import HitOneMetric\nfrom .segmentation_metric import SegmentationMetric" + }, + { + "comment": "This code imports various metrics from different modules and adds them to the __all__ list for easy access, including AVAMetric, VOSMetric, CenterCropMetric_MRI, YOWOMetric, METRIC, build_metric, MultiCropMetric, BMNMetric, CenterCropMetric, SkeletonMetric, HitOneMetric, TransNetV2Metric, DepthMetric, MSRVTTMetric.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/__init__.py\":25-35", + "content": "from .ava_metric import AVAMetric\nfrom .vos_metric import VOSMetric\nfrom .center_crop_metric_MRI import CenterCropMetric_MRI\nfrom .yowo_metric import YOWOMetric\n__all__ = [\n 'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric',\n 'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric',\n 'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI','AVAMetric',\n 'SegmentationMetric', 'YOWOMetric'\n]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f9ca0655-5981-4b00-b49a-1a8ad20ed2ac.json b/docs/doc/f9ca0655-5981-4b00-b49a-1a8ad20ed2ac.json new file mode 100644 index 000000000..f788caaaa --- /dev/null +++ b/docs/doc/f9ca0655-5981-4b00-b49a-1a8ad20ed2ac.json @@ -0,0 +1,20 @@ +{ + "summary": "This code introduces PaddleVideo's model compression using PaddleSlim, demonstrates PP-TSM quantized model prediction and pruning methods, providing recommendations for hyperparameters when using quantized training with pre-trained models.", + "details": [ + { + "comment": "This code provides an introduction to the slim functionality of PaddleVideo's model compression using PaddleSlim. 
It explains the purpose and benefits of model quantization and pruning, and how to use PaddleSlim for PaddleVideo model compression.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme.md\":1-43", + "content": "## Slim\u529f\u80fd\u4ecb\u7ecd\n\u590d\u6742\u7684\u6a21\u578b\u6709\u5229\u4e8e\u63d0\u9ad8\u6a21\u578b\u7684\u6027\u80fd\uff0c\u4f46\u4e5f\u5bfc\u81f4\u6a21\u578b\u4e2d\u5b58\u5728\u4e00\u5b9a\u5197\u4f59\u3002\u6b64\u90e8\u5206\u63d0\u4f9b\u7cbe\u7b80\u6a21\u578b\u7684\u529f\u80fd\uff0c\u5305\u62ec\u4e24\u90e8\u5206\uff1a\u6a21\u578b\u91cf\u5316\uff08\u91cf\u5316\u8bad\u7ec3\u3001\u79bb\u7ebf\u91cf\u5316\uff09\u3001\u6a21\u578b\u526a\u679d\u3002\n\u5176\u4e2d\u6a21\u578b\u91cf\u5316\u5c06\u5168\u7cbe\u5ea6\u7f29\u51cf\u5230\u5b9a\u70b9\u6570\u51cf\u5c11\u8fd9\u79cd\u5197\u4f59\uff0c\u8fbe\u5230\u51cf\u5c11\u6a21\u578b\u8ba1\u7b97\u590d\u6742\u5ea6\uff0c\u63d0\u9ad8\u6a21\u578b\u63a8\u7406\u6027\u80fd\u7684\u76ee\u7684\u3002\n\u6a21\u578b\u91cf\u5316\u53ef\u4ee5\u5728\u57fa\u672c\u4e0d\u635f\u5931\u6a21\u578b\u7684\u7cbe\u5ea6\u7684\u60c5\u51b5\u4e0b\uff0c\u5c06FP32\u7cbe\u5ea6\u7684\u6a21\u578b\u53c2\u6570\u8f6c\u6362\u4e3aInt8\u7cbe\u5ea6\uff0c\u51cf\u5c0f\u6a21\u578b\u53c2\u6570\u5927\u5c0f\u5e76\u52a0\u901f\u8ba1\u7b97\uff0c\u4f7f\u7528\u91cf\u5316\u540e\u7684\u6a21\u578b\u5728\u79fb\u52a8\u7aef\u7b49\u90e8\u7f72\u65f6\u66f4\u5177\u5907\u901f\u5ea6\u4f18\u52bf\u3002\n\u6a21\u578b\u526a\u679d\u5c06CNN\u4e2d\u4e0d\u91cd\u8981\u7684\u5377\u79ef\u6838\u88c1\u526a\u6389\uff0c\u51cf\u5c11\u6a21\u578b\u53c2\u6570\u91cf\uff0c\u4ece\u800c\u964d\u4f4e\u6a21\u578b\u8ba1\u7b97\u590d\u6742\u5ea6\u3002\n\u672c\u6559\u7a0b\u5c06\u4ecb\u7ecd\u5982\u4f55\u4f7f\u7528\u98de\u6868\u6a21\u578b\u538b\u7f29\u5e93PaddleSlim\u505aPaddleVideo\u6a21\u578b\u7684\u538b\u7f29\u3002\n[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) \u96c6\u6210\u4e86\u6a21\u578b\u526a\u679d\u3001\u91cf\u5316\uff08\u5305\u62ec\u91cf\u5316\u8bad\u7ec3\u548c\u79bb\u7ebf\u91cf\u5316\uff09\u3001\u84b8\u998f\u548c\u795e\u7ecf\u7f51\u7edc\u641c\u7d22\u7b49\u591a\u79cd\u4e1a\u754c\u5e38\u7528\u4e14\u9886\u5148\u7684\u6a21\u578b\u538b\u7f29\u529f\u80fd\uff0c\u5982\u679c\u60a8\u611f\u5174\u8da3\uff0c\u53ef\u4ee5\u5173\u6ce8\u5e76\u4e86\u89e3\u3002\n\u5728\u5f00\u59cb\u672c\u6559\u7a0b\u4e4b\u524d\uff0c\u5efa\u8bae\u5148\u4e86\u89e3[PaddleVideo\u6a21\u578b\u7684\u8bad\u7ec3\u65b9\u6cd5](../../docs/zh-CN/usage.md)\u4ee5\u53ca[PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/latest/index.html)\n## \u5feb\u901f\u5f00\u59cb\n\u5f53\u8bad\u7ec3\u51fa\u4e00\u4e2a\u6a21\u578b\u540e\uff0c\u5982\u679c\u5e0c\u671b\u8fdb\u4e00\u6b65\u7684\u538b\u7f29\u6a21\u578b\u5927\u5c0f\u5e76\u52a0\u901f\u9884\u6d4b\uff0c\u53ef\u4f7f\u7528\u91cf\u5316\u6216\u8005\u526a\u679d\u7684\u65b9\u6cd5\u538b\u7f29\u6a21\u578b\u3002\n\u6a21\u578b\u538b\u7f29\u4e3b\u8981\u5305\u62ec\u4e94\u4e2a\u6b65\u9aa4\uff1a\n1. \u5b89\u88c5 PaddleSlim\n2. \u51c6\u5907\u8bad\u7ec3\u597d\u7684\u6a21\u578b\n3. \u6a21\u578b\u538b\u7f29\n4. \u5bfc\u51fa\u91cf\u5316\u63a8\u7406\u6a21\u578b\n5. \u91cf\u5316\u6a21\u578b\u9884\u6d4b\u90e8\u7f72\n### 1. 
\u5b89\u88c5PaddleSlim\n* \u53ef\u4ee5\u901a\u8fc7pip install\u7684\u65b9\u5f0f\u8fdb\u884c\u5b89\u88c5\u3002\n```bash\npython3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple\n```\n* \u5982\u679c\u83b7\u53d6PaddleSlim\u7684\u6700\u65b0\u7279\u6027\uff0c\u53ef\u4ee5\u4ece\u6e90\u7801\u5b89\u88c5\u3002\n```bash\ngit clone https://github.com/PaddlePaddle/PaddleSlim.git\ncd Paddleslim\npython3.7 setup.py install\n```\n### 2. \u51c6\u5907\u8bad\u7ec3\u597d\u7684\u6a21\u578b\nPaddleVideo\u63d0\u4f9b\u4e86\u4e00\u7cfb\u5217\u8bad\u7ec3\u597d\u7684[\u6a21\u578b](../../docs/zh-CN/model_zoo/README.md)\uff0c\u5982\u679c\u5f85\u91cf\u5316\u7684\u6a21\u578b\u4e0d\u5728\u5217\u8868\u4e2d\uff0c\u9700\u8981\u6309\u7167[\u5e38\u89c4\u8bad\u7ec3](../../docs/zh-CN/usage.md)\u65b9\u6cd5\u5f97\u5230\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u3002" + }, + { + "comment": "This code snippet explains the process of offline quantization in PaddleVideo for model compression. It mentions that the code is located in `deploy/slim/quant_post_static.py`. The snippet also details the steps involved in offline quantization, including using a pre-trained model and specifying the quantization strategy in a configuration file. The process generates an output directory with `__model__` and `__params__` files that can be used for deployment without re-exporting the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme.md\":45-90", + "content": "### 3. \u6a21\u578b\u538b\u7f29\n\u8fdb\u5165PaddleVideo\u6839\u76ee\u5f55\n```bash\ncd PaddleVideo\n```\n\u79bb\u7ebf\u91cf\u5316\u4ee3\u7801\u4f4d\u4e8e`deploy/slim/quant_post_static.py`\u3002\n#### 3.1 \u6a21\u578b\u91cf\u5316\n\u91cf\u5316\u8bad\u7ec3\u5305\u62ec\u79bb\u7ebf\u91cf\u5316\u8bad\u7ec3\u548c\u5728\u7ebf\u91cf\u5316\u8bad\u7ec3(TODO)\uff0c\u5728\u7ebf\u91cf\u5316\u8bad\u7ec3\u6548\u679c\u66f4\u597d\uff0c\u9700\u52a0\u8f7d\u9884\u8bad\u7ec3\u6a21\u578b\uff0c\u5728\u5b9a\u4e49\u597d\u91cf\u5316\u7b56\u7565\u540e\u5373\u53ef\u5bf9\u6a21\u578b\u8fdb\u884c\u91cf\u5316\u3002\n##### 3.1.1 \u5728\u7ebf\u91cf\u5316\u8bad\u7ec3\nTODO\n##### 3.1.2 \u79bb\u7ebf\u91cf\u5316\n**\u6ce8\u610f**\uff1a\u76ee\u524d\u79bb\u7ebf\u91cf\u5316\uff0c\u5fc5\u987b\u4f7f\u7528\u5df2\u7ecf\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u5bfc\u51fa\u7684`inference model`\u8fdb\u884c\u91cf\u5316\u3002\u4e00\u822c\u6a21\u578b\u5bfc\u51fa`inference model`\u53ef\u53c2\u8003[\u6559\u7a0b](../../docs/zh-CN/usage.md#5-\u6a21\u578b\u63a8\u7406).\n\u4e00\u822c\u6765\u8bf4\uff0c\u79bb\u7ebf\u91cf\u5316\u635f\u5931\u6a21\u578b\u7cbe\u5ea6\u8f83\u591a\u3002\n\u4ee5PP-TSM\u6a21\u578b\u4e3a\u4f8b\uff0c\u751f\u6210`inference model`\u540e\uff0c\u79bb\u7ebf\u91cf\u5316\u8fd0\u884c\u65b9\u5f0f\u5982\u4e0b\n```bash\n# \u4e0b\u8f7d\u5e76\u89e3\u538b\u51fa\u5c11\u91cf\u6570\u636e\u7528\u4e8e\u79bb\u7ebf\u91cf\u5316\u7684\u6821\u51c6\npushd ./data/k400\nwget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar\ntar -xf k400_rawframes_small.tar\npopd\n# \u7136\u540e\u8fdb\u5165deploy/slim\u76ee\u5f55\u4e0b\ncd deploy/slim\n# \u6267\u884c\u79bb\u7ebf\u91cf\u5316\u547d\u4ee4\npython3.7 quant_post_static.py \\\n-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml 
\\\n--use_gpu=True\n```\n\u9664`use_gpu`\u5916\uff0c\u6240\u6709\u7684\u91cf\u5316\u73af\u5883\u53c2\u6570\u90fd\u5728`pptsm_k400_frames_uniform_quantization.yaml`\u6587\u4ef6\u4e2d\u8fdb\u884c\u914d\u7f6e\n\u5176\u4e2d`inference_model_dir`\u8868\u793a\u4e0a\u4e00\u6b65\u5bfc\u51fa\u7684`inference model`\u76ee\u5f55\u8def\u5f84\uff0c`quant_output_dir`\u8868\u793a\u91cf\u5316\u6a21\u578b\u7684\u8f93\u51fa\u76ee\u5f55\u8def\u5f84\n\u6267\u884c\u6210\u529f\u540e\uff0c\u5728`quant_output_dir`\u7684\u76ee\u5f55\u4e0b\u751f\u6210\u4e86`__model__`\u6587\u4ef6\u548c`__params__`\u6587\u4ef6\uff0c\u8fd9\u4e8c\u8005\u7528\u4e8e\u5b58\u50a8\u751f\u6210\u7684\u79bb\u7ebf\u91cf\u5316\u6a21\u578b\n\u7c7b\u4f3c`inference model`\u7684\u4f7f\u7528\u65b9\u6cd5\uff0c\u63a5\u4e0b\u6765\u53ef\u4ee5\u76f4\u63a5\u7528\u8fd9\u4e24\u4e2a\u6587\u4ef6\u8fdb\u884c\u9884\u6d4b\u90e8\u7f72\uff0c\u65e0\u9700\u518d\u91cd\u65b0\u5bfc\u51fa\u6a21\u578b\u3002" + }, + { + "comment": "This code snippet demonstrates the usage of PP-TSM quantized model for prediction in PaddleVideo. It directs the user to navigate into the PaddleVideo directory and then executes a python script with specific parameters such as input file, configuration file, model files, and flags for GPU and TensorRT utilization. The output shows the recognized top-1 class and score. The code also mentions additional information on how to prune models, export them, and deploy them in Python or C++ settings by referring to separate documentation sections. It provides recommendations for training hyperparameters when using quantized training with pre-trained models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/slim/readme.md\":92-132", + "content": "```bash\n# \u4f7f\u7528PP-TSM\u79bb\u7ebf\u91cf\u5316\u6a21\u578b\u8fdb\u884c\u9884\u6d4b\n# \u56de\u5230PaddleVideo\u76ee\u5f55\u4e0b\ncd ../../\n# \u4f7f\u7528\u91cf\u5316\u6a21\u578b\u8fdb\u884c\u9884\u6d4b\npython3.7 tools/predict.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \\\n--model_file ./inference/ppTSM/quant_model/__model__ \\\n--params_file ./inference/ppTSM/quant_model/__params__ \\\n--use_gpu=True \\\n--use_tensorrt=False\n```\n\u8f93\u51fa\u5982\u4e0b\uff1a\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9997928738594055\n```\n#### 3.2 \u6a21\u578b\u526a\u679d\nTODO\n### 4. \u5bfc\u51fa\u6a21\u578b\nTODO\n### 5. 
\u6a21\u578b\u90e8\u7f72\n\u4e0a\u8ff0\u6b65\u9aa4\u5bfc\u51fa\u7684\u6a21\u578b\u53ef\u4ee5\u901a\u8fc7PaddleLite\u7684opt\u6a21\u578b\u8f6c\u6362\u5de5\u5177\u5b8c\u6210\u6a21\u578b\u8f6c\u6362\u3002\n\u6a21\u578b\u90e8\u7f72\u7684\u53ef\u53c2\u8003\n[Serving Python\u90e8\u7f72](../python_serving/readme.md)\n[Serving C++\u90e8\u7f72](../cpp_serving/readme.md)\n## \u8bad\u7ec3\u8d85\u53c2\u6570\u5efa\u8bae\n* \u91cf\u5316\u8bad\u7ec3\u65f6\uff0c\u5efa\u8bae\u52a0\u8f7d\u5e38\u89c4\u8bad\u7ec3\u5f97\u5230\u7684\u9884\u8bad\u7ec3\u6a21\u578b\uff0c\u52a0\u901f\u91cf\u5316\u8bad\u7ec3\u6536\u655b\u3002\n* \u91cf\u5316\u8bad\u7ec3\u65f6\uff0c\u5efa\u8bae\u521d\u59cb\u5b66\u4e60\u7387\u4fee\u6539\u4e3a\u5e38\u89c4\u8bad\u7ec3\u7684`1/20~1/10`\uff0c\u540c\u65f6\u5c06\u8bad\u7ec3epoch\u6570\u4fee\u6539\u4e3a\u5e38\u89c4\u8bad\u7ec3\u7684`1/5~1/2`\uff0c\u5b66\u4e60\u7387\u7b56\u7565\u65b9\u9762\uff0c\u52a0\u4e0aWarmup\uff0c\u5176\u4ed6\u914d\u7f6e\u4fe1\u606f\u4e0d\u5efa\u8bae\u4fee\u6539\u3002" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f9e68dc7-a04f-4c73-b284-687a44ba1c23.json b/docs/doc/f9e68dc7-a04f-4c73-b284-687a44ba1c23.json new file mode 100644 index 000000000..12f744af3 --- /dev/null +++ b/docs/doc/f9e68dc7-a04f-4c73-b284-687a44ba1c23.json @@ -0,0 +1,205 @@ +{ + "summary": "The code imports necessary libraries, registers object detection backbones, performs vector transformations, defines network creation functions, computes depth prediction, initializes PaddleVideo backbone, includes Project3D layer, calculates SSIM loss, and creates a deep learning model for image processing with ResNet V1.5, DepthDecoder, and PoseDecoder classes. The pose estimation model supports diverse inputs, handles day/night scenarios, computes parameters, generates warped images, and selects data based on conditions.", + "details": [ + { + "comment": "This code imports necessary libraries, defines constants, and registers backbones in a PaddlePaddle's object detection model library. It also includes comments for licensing and copyright information as well as function definitions for weight initialization and calculating fan-in and fan-out of layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":0-29", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nfrom collections import OrderedDict\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import BatchNorm2D, Conv2D\nfrom paddle.nn.initializer import Constant, Normal\nfrom paddle.vision.models import ResNet\nfrom ...utils import load_ckpt\nfrom ..registry import BACKBONES\nfrom ..weight_init import kaiming_normal_, _calculate_fan_in_and_fan_out\nzeros_ = Constant(value=0.)" + }, + { + "comment": "Code snippet defines three functions - \"disp_to_depth\" for converting network's sigmoid output into depth prediction, \"gram_matrix\" for computing the Gram matrix of feature maps and \"convt_bn_relu\" for creating a convolution layer with batch normalization and ReLU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":30-66", + "content": "ones_ = Constant(value=1.)\nnormal_ = Normal(mean=0, std=1e-3)\ndef disp_to_depth(disp, min_depth, max_depth):\n \"\"\"Convert network's sigmoid output into depth prediction\n The formula for this conversion is given in the 'additional considerations'\n section of the paper.\n \"\"\"\n min_disp = 1 / max_depth\n max_disp = 1 / min_depth\n scaled_disp = min_disp + (max_disp - min_disp) * disp\n depth = 1 / scaled_disp\n return scaled_disp, depth\ndef gram_matrix(y):\n (b, ch, h, w) = y.shape\n features = y.reshape([b, ch, w * h])\n features_t = paddle.transpose(features, [0, 2, 1])\n gram = features.bmm(features_t) / (ch * h * w)\n return gram\ndef convt_bn_relu(in_channels,\n out_channels,\n kernel_size,\n stride=1,\n padding=0,\n output_padding=0,\n bn=True,\n relu=True):\n bias = not bn\n layers = []\n layers.append(\n nn.Conv2DTranspose(in_channels,\n out_channels," + }, + { + "comment": "The code defines a function for creating a convolutional transpose layer, adding batch normalization and Leaky ReLU activation if specified. It also includes weight initialization for the created layers. 
The second function converts network's (axisangle, translation) output into a 4x4 matrix based on parameters and an optional invert flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":67-103", + "content": " kernel_size,\n stride,\n padding,\n output_padding,\n bias_attr=bias))\n if bn:\n layers.append(nn.BatchNorm2D(out_channels))\n if relu:\n layers.append(nn.LeakyReLU(0.2))\n layers = nn.Sequential(*layers)\n # initialize the weights\n for m in layers.sublayers(include_self=True):\n if isinstance(m, nn.Conv2DTranspose):\n normal_(m.weight)\n if m.bias is not None:\n zeros_(m.bias)\n elif isinstance(m, nn.BatchNorm2D):\n ones_(m.weight)\n zeros_(m.bias)\n return layers\ndef transformation_from_parameters(axisangle, translation, invert=False):\n \"\"\"Convert the network's (axisangle, translation) output into a 4x4 matrix\n \"\"\"\n R = rot_from_axisangle(axisangle)\n t = translation.clone()\n if invert:\n R = R.transpose([0, 2, 1])\n t *= -1\n T = get_translation_matrix(t)\n if invert:" + }, + { + "comment": "get_translation_matrix: Converts translation vector to a 4x4 transformation matrix.\nrot_from_axisangle: Converts axis-angle rotation into a 4x4 transformation matrix.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":104-150", + "content": " M = paddle.matmul(R, T)\n else:\n M = paddle.matmul(T, R)\n return M\ndef get_translation_matrix(translation_vector):\n \"\"\"Convert a translation vector into a 4x4 transformation matrix\n \"\"\"\n t = translation_vector.reshape([-1, 3, 1])\n gather_object = paddle.stack([\n paddle.zeros([\n translation_vector.shape[0],\n ], paddle.float32),\n paddle.ones([\n translation_vector.shape[0],\n ], paddle.float32),\n paddle.squeeze(t[:, 0], axis=-1),\n paddle.squeeze(t[:, 1], axis=-1),\n paddle.squeeze(t[:, 2], axis=-1),\n ])\n gather_index = paddle.to_tensor([\n [1],\n [0],\n [0],\n [2],\n [0],\n [1],\n [0],\n [3],\n [0],\n [0],\n [1],\n [4],\n [0],\n [0],\n [0],\n [1],\n ])\n T = paddle.gather_nd(gather_object, gather_index)\n T = T.reshape([4, 4, -1]).transpose((2, 0, 1))\n return T\ndef rot_from_axisangle(vec):\n \"\"\"Convert an axisangle rotation into a 4x4 transformation matrix" + }, + { + "comment": "This code performs rotation operations on a 3D vector 'vec'. It calculates the angle and axis of rotation, then applies trigonometry to compute rotation matrices. 
Finally, it gathers transformed vectors using stacked tensor operations.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":151-187", + "content": " (adapted from https://github.com/Wallacoloo/printipi)\n Input 'vec' has to be Bx1x3\n \"\"\"\n angle = paddle.norm(vec, 2, 2, True)\n axis = vec / (angle + 1e-7)\n ca = paddle.cos(angle)\n sa = paddle.sin(angle)\n C = 1 - ca\n x = axis[..., 0].unsqueeze(1)\n y = axis[..., 1].unsqueeze(1)\n z = axis[..., 2].unsqueeze(1)\n xs = x * sa\n ys = y * sa\n zs = z * sa\n xC = x * C\n yC = y * C\n zC = z * C\n xyC = x * yC\n yzC = y * zC\n zxC = z * xC\n gather_object = paddle.stack([\n paddle.squeeze(x * xC + ca, axis=(-1, -2)),\n paddle.squeeze(xyC - zs, axis=(-1, -2)),\n paddle.squeeze(zxC + ys, axis=(-1, -2)),\n paddle.squeeze(xyC + zs, axis=(-1, -2)),\n paddle.squeeze(y * yC + ca, axis=(-1, -2)),\n paddle.squeeze(yzC - xs, axis=(-1, -2)),\n paddle.squeeze(zxC - ys, axis=(-1, -2)),\n paddle.squeeze(yzC + xs, axis=(-1, -2)),\n paddle.squeeze(z * zC + ca, axis=(-1, -2)),\n paddle.ones([\n vec.shape[0],\n ], dtype=paddle.float32)," + }, + { + "comment": "Code defines three functions: \"get_rot\", \"upsample\", and \"get_smooth_loss\". get_rot performs a gather operation on a tensor, reshapes the result, then transposes it. upsample interpolates an input tensor by doubling its size. get_smooth_loss computes the smoothness loss for disparity images using gradients of disparities and color image edges.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":188-230", + "content": " paddle.zeros([\n vec.shape[0],\n ], dtype=paddle.float32)\n ])\n gather_index = paddle.to_tensor([\n [0],\n [1],\n [2],\n [10],\n [3],\n [4],\n [5],\n [10],\n [6],\n [7],\n [8],\n [10],\n [10],\n [10],\n [10],\n [9],\n ])\n rot = paddle.gather_nd(gather_object, gather_index)\n rot = rot.reshape([4, 4, -1]).transpose((2, 0, 1))\n return rot\ndef upsample(x):\n \"\"\"Upsample input tensor by a factor of 2\n \"\"\"\n return F.interpolate(x, scale_factor=2, mode=\"nearest\")\ndef get_smooth_loss(disp, img):\n \"\"\"Computes the smoothness loss for a disparity image\n The color image is used for edge-aware smoothness\n \"\"\"\n grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])\n grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])\n grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]),\n 1,\n keepdim=True)" + }, + { + "comment": "This code defines functions for creating convolutional layers and a ResNet model with multiple input images. The functions include 3x3 and 1x1 convolutions, along with a function that constructs the ResNet model itself. 
The ResNet model can handle multiple input images by combining gradients from each image channel.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":231-263", + "content": " grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]),\n 1,\n keepdim=True)\n grad_disp_x *= paddle.exp(-grad_img_x)\n grad_disp_y *= paddle.exp(-grad_img_y)\n return grad_disp_x.mean() + grad_disp_y.mean()\ndef conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):\n \"\"\"3x3 convolution with padding\"\"\"\n return nn.Conv2D(in_planes,\n out_planes,\n kernel_size=3,\n stride=stride,\n padding=dilation,\n groups=groups,\n bias_attr=False,\n dilation=dilation)\ndef conv1x1(in_planes, out_planes, stride=1):\n \"\"\"1x1 convolution\"\"\"\n return nn.Conv2D(in_planes,\n out_planes,\n kernel_size=1,\n stride=stride,\n bias_attr=False)\ndef resnet_multiimage_input(num_layers, num_input_images=1):\n \"\"\"Constructs a ResNet model." + }, + { + "comment": "This code defines a function that creates a ResNet model with multiple image inputs. The model takes in the number of resnet layers (18 or 50), whether to use pretrained weights, and the number of input frames to stack. It then creates blocks based on the layer type and number of layers provided, and initializes the model's weights. The ConvBlock class performs a convolution followed by ELU activation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":264-293", + "content": " Args:\n num_layers (int): Number of resnet layers. Must be 18 or 50\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n num_input_images (int): Number of frames stacked as input\n \"\"\"\n assert num_layers in [18, 50], \"Can only run with 18 or 50 layer resnet\"\n blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers]\n block_type = {18: BasicBlock, 50: Bottleneck}[num_layers]\n model = ResNetMultiImageInput(block_type,\n num_layers,\n blocks,\n num_input_images=num_input_images)\n model.init_weights()\n return model\nclass ConvBlock(nn.Layer):\n \"\"\"Layer to perform a convolution followed by ELU\n \"\"\"\n def __init__(self, in_channels, out_channels):\n super(ConvBlock, self).__init__()\n self.conv = Conv3x3(in_channels, out_channels)\n self.nonlin = nn.ELU()\n def forward(self, x):\n out = self.conv(x)\n out = self.nonlin(out)" + }, + { + "comment": "Conv3x3 is a layer that pads and convolves the input.\nBackprojectDepth transforms a depth image into a point cloud.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":294-329", + "content": " return out\nclass Conv3x3(nn.Layer):\n \"\"\"Layer to pad and convolve input\n \"\"\"\n def __init__(self, in_channels, out_channels, use_refl=True):\n super(Conv3x3, self).__init__()\n if use_refl:\n self.pad = nn.Pad2D(1, mode='reflect')\n else:\n self.pad = nn.Pad2D(1)\n self.conv = nn.Conv2D(int(in_channels), int(out_channels), 3)\n def forward(self, x):\n out = self.pad(x)\n out = self.conv(out)\n return out\nclass BackprojectDepth(nn.Layer):\n \"\"\"Layer to transform a depth image into a point cloud\n \"\"\"\n def __init__(self, batch_size, height, width):\n super(BackprojectDepth, self).__init__()\n self.batch_size = batch_size\n self.height = height\n self.width = width\n meshgrid = np.meshgrid(range(self.width),\n range(self.height),\n indexing='xy')\n id_coords = np.stack(meshgrid, axis=0).astype(np.float32)\n self.id_coords = 
self.create_parameter(shape=list(id_coords.shape)," + }, + { + "comment": "This code creates and initializes parameters for a backbone in PaddleVideo, specifically for the ID and pixel coordinates. It sets stop_gradient on these parameters so they are not updated during backpropagation. The code uses Paddle operations like unsqueeze, stack, tile, and concat for parameter creation and manipulation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":330-354", + "content": " dtype=paddle.float32)\n self.id_coords.set_value(id_coords)\n self.add_parameter(\"id_coords\", self.id_coords)\n self.id_coords.stop_gradient = True\n self.ones = self.create_parameter(\n shape=[self.batch_size, 1, self.height * self.width],\n default_initializer=ones_)\n self.add_parameter(\"ones\", self.ones)\n self.ones.stop_gradient = True\n pix_coords = paddle.unsqueeze(\n paddle.stack([\n self.id_coords[0].reshape([\n -1,\n ]), self.id_coords[1].reshape([\n -1,\n ])\n ], 0), 0)\n pix_coords = pix_coords.tile([batch_size, 1, 1])\n pix_coords = paddle.concat([pix_coords, self.ones], 1)\n self.pix_coords = self.create_parameter(shape=list(pix_coords.shape), )\n self.pix_coords.set_value(pix_coords)\n self.add_parameter(\"pix_coords\", self.pix_coords)\n self.pix_coords.stop_gradient = True" + }, + { + "comment": "The code defines a Project3D layer that projects 3D points into a camera with intrinsics K and at position T. It includes the forward pass, initialization, and required parameters such as batch_size, height, and width. The forward function calculates camera projection points by multiplying the intrinsic matrix K with the translation matrix T, then projects the points to pixel coordinates.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":356-384", + "content": " def forward(self, depth, inv_K):\n cam_points = paddle.matmul(inv_K[:, :3, :3], self.pix_coords)\n cam_points = depth.reshape([self.batch_size, 1, -1]) * cam_points\n cam_points = paddle.concat([cam_points, self.ones], 1)\n return cam_points\n\nclass Project3D(nn.Layer):\n \"\"\"Layer which projects 3D points into a camera with intrinsics K and at position T\n \"\"\"\n def __init__(self, batch_size, height, width, eps=1e-7):\n super(Project3D, self).__init__()\n self.batch_size = batch_size\n self.height = height\n self.width = width\n self.eps = eps\n def forward(self, points, K, T):\n P = paddle.matmul(K, T)[:, :3, :]\n cam_points = paddle.matmul(P, points)\n pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) +\n self.eps)\n pix_coords = pix_coords.reshape(\n [self.batch_size, 2, self.height, self.width])\n pix_coords = pix_coords.transpose([0, 2, 3, 1])" + }, + { + "comment": "The code defines a function `pix_coords` that normalizes pixel coordinates and a class `SSIM` for computing the Structural Similarity Index (SSIM) loss between two images. 
It initializes variables for mean, variance pooling, and applies padding to input images before calculating SSIM loss using provided formulas.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":385-416", + "content": " pix_coords[..., 0] /= self.width - 1\n pix_coords[..., 1] /= self.height - 1\n pix_coords = (pix_coords - 0.5) * 2\n return pix_coords\nclass SSIM(nn.Layer):\n \"\"\"Layer to compute the SSIM loss between a pair of images\n \"\"\"\n def __init__(self):\n super(SSIM, self).__init__()\n self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False)\n self.refl = nn.Pad2D(1, mode='reflect')\n self.C1 = 0.01**2\n self.C2 = 0.03**2\n def forward(self, x, y):\n x = self.refl(x)\n y = self.refl(y)\n mu_x = self.mu_x_pool(x)\n mu_y = self.mu_y_pool(y)\n sigma_x = self.sig_x_pool(x**2) - mu_x**2\n sigma_y = self.sig_y_pool(y**2) - mu_y**2\n sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y" + }, + { + "comment": "The code defines a ResNet model with multiple input images. It includes a convolution layer, batch normalization, ReLU activation, and max pooling for initial processing. The class \"ResNetMultiImageInput\" inherits from the base \"ResNet\" class and can handle different numbers of input images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":418-440", + "content": " SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)\n SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2)\n return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1)\nclass ResNetMultiImageInput(ResNet):\n \"\"\"Constructs a resnet model with varying number of input images.\n Adapted from https://github.com/pypaddle/vision/blob/master/paddlevision/models/resnet.py\n \"\"\"\n def __init__(self, block, depth, layers, num_input_images=1):\n super(ResNetMultiImageInput, self).__init__(block, depth)\n self.inplanes = 64\n self.conv1 = nn.Conv2D(num_input_images * 3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)\n self.bn1 = nn.BatchNorm2D(64)\n self.relu = nn.ReLU()\n self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n self.layer1 = self._make_layer(block, 64, layers[0])" + }, + { + "comment": "The code defines a model architecture with multiple layers, including ConvBNLayer. It initializes the weights of these layers using specific methods and constraints for convolutional and batch normalization layers. 
This is typically done to improve performance and stability in deep learning models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":441-465", + "content": " self.layer2 = self._make_layer(block, 128, layers[1], stride=2)\n self.layer3 = self._make_layer(block, 256, layers[2], stride=2)\n self.layer4 = self._make_layer(block, 512, layers[3], stride=2)\n def init_weights(self):\n for layer in self.sublayers(include_self=True):\n if isinstance(layer, nn.Conv2D):\n kaiming_normal_(layer.weight,\n mode='fan_out',\n nonlinearity='relu')\n elif isinstance(layer, nn.BatchNorm2D):\n ones_(layer.weight)\n zeros_(layer.bias)\nclass ConvBNLayer(nn.Layer):\n \"\"\"Conv2D and BatchNorm2D layer.\n Args:\n in_channels (int): Number of channels for the input.\n out_channels (int): Number of channels for the output.\n kernel_size (int): Kernel size.\n stride (int): Stride in the Conv2D layer. Default: 1.\n groups (int): Groups in the Conv2D, Default: 1.\n act (str): Indicate activation after BatchNorm2D layer." + }, + { + "comment": "The `ConvBNLayer` class is a custom layer that consists of a convolution operation and batch normalization. It initializes the Conv2D layer and BatchNorm2D layer with specified parameters, and applies them sequentially in the forward pass.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":466-496", + "content": " name (str): the name of an instance of ConvBNLayer.\n Note: weight and bias initialization include initialize values\n and name the restored parameters, values initialization\n are explicit declared in the ```init_weights``` method.\n \"\"\"\n def __init__(self,\n in_channels,\n out_channels,\n kernel_size,\n stride=1,\n groups=1,\n act=None,\n name=None):\n super(ConvBNLayer, self).__init__()\n self._conv = Conv2D(in_channels=in_channels,\n out_channels=out_channels,\n kernel_size=kernel_size,\n stride=stride,\n padding=(kernel_size - 1) // 2,\n groups=groups,\n bias_attr=False)\n self._act = act\n self._batch_norm = BatchNorm2D(out_channels)\n def forward(self, inputs):\n y = self._conv(inputs)\n y = self._batch_norm(y)" + }, + { + "comment": "The code defines a class called `BasicBlock` which is an instance of the `nn.Layer` class, and initializes it with parameters such as `inplanes`, `planes`, `stride`, `downsample`, `groups`, `base_width`, `dilation`, and `norm_layer`. It also performs some checks to ensure that certain values match the block's requirements, and then initializes specific layers like `conv1`, `bn1`, and `relu` accordingly. 
The code also handles cases where `stride` is not equal to 1 by downsampling the input through both `self.conv1` and `self.downsample`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":497-527", + "content": " if self._act:\n y = getattr(paddle.nn.functional, self._act)(y)\n return y\nclass BasicBlock(nn.Layer):\n expansion = 1\n def __init__(self,\n inplanes,\n planes,\n stride=1,\n downsample=None,\n groups=1,\n base_width=64,\n dilation=1,\n norm_layer=None):\n super(BasicBlock, self).__init__()\n if norm_layer is None:\n norm_layer = nn.BatchNorm2D\n if groups != 1 or base_width != 64:\n raise ValueError(\n 'BasicBlock only supports groups=1 and base_width=64')\n if dilation > 1:\n raise NotImplementedError(\n \"Dilation > 1 not supported in BasicBlock\")\n # Both self.conv1 and self.downsample layers downsample the input when stride != 1\n self.conv1 = conv3x3(inplanes, planes, stride)\n self.bn1 = norm_layer(planes)\n self.relu = nn.ReLU()\n self.conv2 = conv3x3(planes, planes)" + }, + { + "comment": "The code defines a Bottleneck layer with stride at the 3x3 convolution (self.conv2) for ResNet V1.5, improving accuracy according to sources like \"Deep residual learning for image recognition\" and \"NVIDIA: ResNet_50_v1_5_for_PyTorch\". The Bottleneck layer has an expansion of 4, and its class initializes inplanes, planes, and other parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":528-562", + "content": " self.bn2 = norm_layer(planes)\n self.downsample = downsample\n self.stride = stride\n def forward(self, x):\n identity = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n if self.downsample is not None:\n identity = self.downsample(x)\n out += identity\n out = self.relu(out)\n return out\nclass Bottleneck(nn.Layer):\n # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)\n # while original implementation places the stride at the first 1x1 convolution(self.conv1)\n # according to \"Deep residual learning for image recognition\"https://arxiv.org/abs/1512.03385.\n # This variant is also known as ResNet V1.5 and improves accuracy according to\n # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.\n expansion = 4\n def __init__(self,\n inplanes,\n planes," + }, + { + "comment": "The code defines a Bottleneck class for a convolutional neural network. It has multiple layers of 1x1 and 3x3 convolutions, with batch normalization and ReLU activation functions. 
The class also supports downsampling and stride configuration options.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":563-596", + "content": " stride=1,\n downsample=None,\n groups=1,\n base_width=64,\n dilation=1,\n norm_layer=None):\n super(Bottleneck, self).__init__()\n if norm_layer is None:\n norm_layer = nn.BatchNorm2D\n width = int(planes * (base_width / 64.)) * groups\n self.conv1 = conv1x1(inplanes, width)\n self.bn1 = norm_layer(width)\n self.conv2 = conv3x3(width, width, stride, groups, dilation)\n self.bn2 = norm_layer(width)\n self.conv3 = conv1x1(width, planes * self.expansion)\n self.bn3 = norm_layer(planes * self.expansion)\n self.relu = nn.ReLU()\n self.downsample = downsample\n self.stride = stride\n def forward(self, x):\n identity = x\n out = self.conv1(x)\n out = self.bn1(out)\n out = self.relu(out)\n out = self.conv2(out)\n out = self.bn2(out)\n out = self.relu(out)\n out = self.conv3(out)\n out = self.bn3(out)" + }, + { + "comment": "The code defines a class called DepthDecoder. It takes in parameters such as number of channels, scales, output channel count, and use_skips. The class initializes various attributes like num_output_channels, use_skips, upsample_mode, and scale. It also creates an OrderedDict named 'convs' which stores ConvBlock instances based on the given parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":598-630", + "content": " if self.downsample is not None:\n identity = self.downsample(x)\n out += identity\n out = self.relu(out)\n return out\nclass DepthDecoder(nn.Layer):\n def __init__(self,\n num_ch_enc,\n scales=range(4),\n num_output_channels=1,\n use_skips=True):\n super(DepthDecoder, self).__init__()\n self.num_output_channels = num_output_channels\n self.use_skips = use_skips\n self.upsample_mode = 'nearest'\n self.scales = scales\n self.num_ch_enc = num_ch_enc\n self.num_ch_dec = np.array([16, 32, 64, 128, 256])\n # decoder\n self.convs = OrderedDict()\n for i in range(4, -1, -1):\n # upconv_0\n num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i +\n 1]\n num_ch_out = self.num_ch_dec[i]\n self.convs[(\"upconv\", i, 0)] = ConvBlock(num_ch_in, num_ch_out)" + }, + { + "comment": "Code defines a convolutional network architecture for image decoding. It uses ConvBlock layers and Conv3x3 layers in the decoder section. The input features are upsampled and combined with previous encoder outputs at each stage, and then passed through convolution layers. 
The results are stored in 'outputs' dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":632-659", + "content": " # upconv_1\n num_ch_in = self.num_ch_dec[i]\n if self.use_skips and i > 0:\n num_ch_in += self.num_ch_enc[i - 1]\n num_ch_out = self.num_ch_dec[i]\n self.convs[(\"upconv\", i, 1)] = ConvBlock(num_ch_in, num_ch_out)\n for s in self.scales:\n self.convs[(\"dispconv\", s)] = Conv3x3(self.num_ch_dec[s],\n self.num_output_channels)\n self.decoder = nn.LayerList(list(self.convs.values()))\n self.sigmoid = nn.Sigmoid()\n def forward(self, input_features):\n outputs = {}\n # decoder\n x = input_features[-1]\n for i in range(4, -1, -1):\n x = self.convs[(\"upconv\", i, 0)](x)\n x = [upsample(x)]\n if self.use_skips and i > 0:\n x += [input_features[i - 1]]\n x = paddle.concat(x, 1)\n x = self.convs[(\"upconv\", i, 1)](x)\n if i in self.scales:\n outputs[(\"disp\", i)] = self.sigmoid(self.convs[(\"dispconv\"," + }, + { + "comment": "The PoseDecoder class in this code is a neural network layer that uses convolutional layers to predict pose for a given number of frames. It takes in the number of input channels, the number of input features, and an optional parameter for the number of frames to predict. The layer contains three convolution layers with different parameters for each.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":660-685", + "content": " i)](x))\n return outputs\nclass PoseDecoder(nn.Layer):\n def __init__(self,\n num_ch_enc,\n num_input_features,\n num_frames_to_predict_for=None,\n stride=1):\n super(PoseDecoder, self).__init__()\n self.num_ch_enc = num_ch_enc\n self.num_input_features = num_input_features\n if num_frames_to_predict_for is None:\n num_frames_to_predict_for = num_input_features - 1\n self.num_frames_to_predict_for = num_frames_to_predict_for\n self.convs = OrderedDict()\n self.convs[(\"squeeze\")] = nn.Conv2D(self.num_ch_enc[-1], 256, 1)\n self.convs[(\"pose\", 0)] = nn.Conv2D(num_input_features * 256, 256, 3,\n stride, 1)\n self.convs[(\"pose\", 1)] = nn.Conv2D(256, 256, 3, stride, 1)\n self.convs[(\"pose\", 2)] = nn.Conv2D(256, 6 * num_frames_to_predict_for,\n 1)" + }, + { + "comment": "The code defines a class \"Adds\" with a forward function that performs feature extraction and concatenation, followed by convolution and activation operations. 
The code also includes a class \"ResnetEncoder\" which is a Pypaddle implementation of a ResNet encoder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":687-724", + "content": " self.relu = nn.ReLU()\n self.net = nn.LayerList(list(self.convs.values()))\n def forward(self, input_features):\n last_features = [f[-1] for f in input_features]\n cat_features = [\n self.relu(self.convs[\"squeeze\"](f)) for f in last_features\n ]\n cat_features = paddle.concat(cat_features, 1)\n out = cat_features\n for i in range(3):\n out = self.convs[(\"pose\", i)](out)\n if i != 2:\n out = self.relu(out)\n out = out.mean(3).mean(2)\n out = 0.01 * out.reshape([-1, self.num_frames_to_predict_for, 1, 6])\n axisangle = out[..., :3]\n translation = out[..., 3:]\n return axisangle, translation\nclass ResnetEncoder(nn.Layer):\n \"\"\"Pypaddle module for a resnet encoder\n \"\"\"\n def __init__(self, num_layers, pretrained=False, num_input_images=1):\n super(ResnetEncoder, self).__init__()\n self.num_ch_enc = np.array([64, 64, 128, 256, 512])\n resnets = {\n 18: paddle.vision.models.resnet18," + }, + { + "comment": "The code defines a function that creates a ResNet backbone model with specified layers and checks if the input has multiple images. It uses pretrained weights, adds a convolutional layer to the output of the ResNet, and scales certain channels based on the number of layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":725-752", + "content": " 34: paddle.vision.models.resnet34,\n 50: paddle.vision.models.resnet50,\n 101: paddle.vision.models.resnet101,\n 152: paddle.vision.models.resnet152\n }\n if num_layers not in resnets:\n raise ValueError(\n \"{} is not a valid number of resnet layers\".format(num_layers))\n if num_input_images > 1:\n self.encoder = resnet_multiimage_input(num_layers, pretrained,\n num_input_images)\n else:\n self.encoder = resnets[num_layers](pretrained)\n if num_layers > 34:\n self.num_ch_enc[1:] *= 4\n ######################################\n # night public first conv\n ######################################\n self.conv1 = nn.Conv2D(3,\n 64,\n kernel_size=7,\n stride=2,\n padding=3,\n bias_attr=False)" + }, + { + "comment": "This code initializes a network backbone with shared and private encoders for day and night, as well as a shared decoder. 
It uses BatchNorm2D, ReLU activation, Conv2D layers, and sets up convolutional blocks for the encoders and decoder.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":753-775", + "content": " self.bn1 = nn.BatchNorm2D(64)\n self.relu = nn.ReLU() # NOTE\n self.conv_shared = nn.Conv2D(512, 64, kernel_size=1)\n ##########################################\n # private source encoder, day\n ##########################################\n self.encoder_day = resnets[num_layers](pretrained)\n self.conv_diff_day = nn.Conv2D(\n 512, 64, kernel_size=1) # no bn after conv, so bias=true\n ##########################################\n # private target encoder, night\n ##########################################\n self.encoder_night = resnets[num_layers](pretrained)\n self.conv_diff_night = nn.Conv2D(512, 64, kernel_size=1)\n ######################################\n # shared decoder (small decoder), use a simple de-conv to upsample the features with no skip connection\n ######################################\n self.convt5 = convt_bn_relu(in_channels=512,\n out_channels=256," + }, + { + "comment": "This code defines a series of convolutional layers with batch normalization and ReLU activation functions. The layers have different numbers of input and output channels, as well as identical kernel sizes, strides, padding, and output padding values. These layers likely form part of a deep learning model for image processing or analysis tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":776-796", + "content": " kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convt4 = convt_bn_relu(in_channels=256,\n out_channels=128,\n kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convt3 = convt_bn_relu(in_channels=128,\n out_channels=64,\n kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convt2 = convt_bn_relu(in_channels=64,\n out_channels=64,\n kernel_size=3,\n stride=2,\n padding=1," + }, + { + "comment": "The code defines a class with an initializer for two ConvT blocks and a convolutional layer. The forward function is used for training, where it subtracts 0.45 and divides by 0.225 from the input image to normalize it, and if the 'is_night' parameter is 'day', it passes this normalized image through the day encoder blocks of the model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":797-816", + "content": " output_padding=1)\n self.convt1 = convt_bn_relu(in_channels=64,\n out_channels=64,\n kernel_size=3,\n stride=2,\n padding=1,\n output_padding=1)\n self.convtf = nn.Conv2D(64, 3, kernel_size=1, stride=1, padding=0)\n def forward(self, input_image, is_night):\n if self.training:\n result = []\n input_data = (input_image - 0.45) / 0.225\n if is_night == 'day':\n # source private encoder, day\n private_feature = self.encoder_day.conv1(input_data)\n private_feature = self.encoder_day.bn1(private_feature)\n private_feature = self.encoder_day.relu(private_feature)\n private_feature = self.encoder_day.maxpool(private_feature)\n private_feature = self.encoder_day.layer1(private_feature)" + }, + { + "comment": "The code is processing the input data through a day or night specific encoder, applying convolutions, batch normalization, ReLU activation, and max pooling. 
It then appends the resulting private code and gram matrix to the 'result' list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":817-833", + "content": " private_feature = self.encoder_day.layer2(private_feature)\n private_feature = self.encoder_day.layer3(private_feature)\n private_feature = self.encoder_day.layer4(private_feature)\n private_code = self.conv_diff_day(private_feature)\n private_gram = gram_matrix(private_feature)\n result.append(private_code)\n result.append(private_gram)\n elif is_night == 'night':\n # target private encoder, night\n private_feature = self.encoder_night.conv1(input_data)\n private_feature = self.encoder_night.bn1(private_feature)\n private_feature = self.encoder_night.relu(private_feature)\n private_feature = self.encoder_night.maxpool(private_feature)\n private_feature = self.encoder_night.layer1(private_feature)\n private_feature = self.encoder_night.layer2(private_feature)\n private_feature = self.encoder_night.layer3(private_feature)" + }, + { + "comment": "This code defines a model with two branches: one for day and one for night. It extracts features from the input image, applies different layers depending on whether it's day or night, and appends them to a list of features. Finally, it calculates a shared code for training using the last feature extracted.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":834-860", + "content": " private_feature = self.encoder_night.layer4(private_feature)\n private_code = self.conv_diff_night(private_feature)\n private_gram = gram_matrix(private_feature)\n result.append(private_code)\n result.append(private_gram)\n # shared encoder\n self.features = []\n x = (input_image - 0.45) / 0.225\n if is_night == 'day':\n x = self.encoder.conv1(x)\n x = self.encoder.bn1(x)\n self.features.append(self.encoder.relu(x))\n else:\n x = self.conv1(x)\n x = self.bn1(x)\n self.features.append(self.relu(x))\n self.features.append(\n self.encoder.layer1(self.encoder.maxpool(self.features[-1])))\n self.features.append(self.encoder.layer2(self.features[-1]))\n self.features.append(self.encoder.layer3(self.features[-1]))\n self.features.append(self.encoder.layer4(self.features[-1]))\n if self.training:\n shared_code = self.conv_shared(self.features[-1])" + }, + { + "comment": "This code defines a ResnetEncoder_pose class, which is a Pypaddle module for a resnet encoder. It initializes the number of layers and whether pre-trained weights are used. The code then defines several convolutional layers (convt1 to convt5) for processing feature maps. If pretrained is set to True, the method returns the features. 
Otherwise, it appends the processed feature maps to a result list and returns the features and result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":861-888", + "content": " shared_gram = gram_matrix(self.features[-1])\n result.append(shared_code) # use this to calculate loss of diff\n result.append(shared_gram)\n result.append(\n self.features[-1]) # use this to calculate loss of similarity\n union_code = private_feature + self.features[-1]\n rec_code = self.convt5(union_code)\n rec_code = self.convt4(rec_code)\n rec_code = self.convt3(rec_code)\n rec_code = self.convt2(rec_code)\n rec_code = self.convt1(rec_code)\n rec_code = self.convtf(rec_code)\n result.append(rec_code)\n return self.features, result\n else:\n return self.features\nclass ResnetEncoder_pose(nn.Layer):\n \"\"\"Pypaddle module for a resnet encoder\n \"\"\"\n def __init__(self, num_layers, pretrained=False, num_input_images=1):\n super(ResnetEncoder_pose, self).__init__()\n self.num_ch_enc = np.array([64, 64, 128, 256, 512])\n resnets = {" + }, + { + "comment": "This code defines a ResNet backbone model with different layers (18, 34, 50, 101, 152) and handles multi-image input cases. The encoder is initialized based on the specified number of layers, and adjusts the number of channels for layers larger than 34. The forward function extracts features from an input image through a series of ResNet layers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":889-916", + "content": " 18: paddle.vision.models.resnet18,\n 34: paddle.vision.models.resnet34,\n 50: paddle.vision.models.resnet50,\n 101: paddle.vision.models.resnet101,\n 152: paddle.vision.models.resnet152\n }\n if num_layers not in resnets:\n raise ValueError(\n \"{} is not a valid number of resnet layers\".format(num_layers))\n if num_input_images > 1:\n self.encoder = resnet_multiimage_input(num_layers, num_input_images)\n else:\n self.encoder = resnets[num_layers](pretrained)\n if num_layers > 34:\n self.num_ch_enc[1:] *= 4\n def forward(self, input_image):\n features = []\n x = (input_image - 0.45) / 0.225\n x = self.encoder.conv1(x)\n x = self.encoder.bn1(x)\n features.append(self.encoder.relu(x))\n features.append(self.encoder.layer1(self.encoder.maxpool(features[-1])))\n features.append(self.encoder.layer2(features[-1]))\n features.append(self.encoder.layer3(features[-1]))" + }, + { + "comment": "This code defines the class `ADDS_DepthNet`, which is a depth estimation network, with parameters such as number of layers, frame IDs, input size, batch size, etc. It inherits from `nn.Layer` and has methods to encode poses and features. 
The class also registers itself at `BACKBONES`.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":917-948", + "content": " features.append(self.encoder.layer4(features[-1]))\n return features\n@BACKBONES.register()\nclass ADDS_DepthNet(nn.Layer):\n def __init__(self,\n num_layers=18,\n frame_ids=[0, -1, 1],\n height=256,\n width=512,\n batch_size=6,\n pose_model_input=\"pairs\",\n use_stereo=False,\n only_depth_encoder=False,\n pretrained=None,\n scales=[0, 1, 2, 3],\n min_depth=0.1,\n max_depth=100.0,\n pose_model_type='separate_resnet',\n v1_multiscale=False,\n predictive_mask=False,\n disable_automasking=False):\n super(ADDS_DepthNet, self).__init__()\n self.num_layers = num_layers\n self.height = height\n self.width = width\n self.batch_size = batch_size\n self.frame_ids = frame_ids\n self.pose_model_input = pose_model_input\n self.use_stereo = use_stereo" + }, + { + "comment": "The code initializes the model parameters and instances, including whether to only use the depth encoder (only_depth_encoder), if pre-trained weights are used (pretrained), and the scales for the depth decoding (scales). It also determines the number of input frames needed for both depth and pose prediction based on the provided inputs. The code creates instances of DepthDecoder, ResnetEncoder, and ResnetEncoder_pose depending on the model configuration.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":949-971", + "content": " self.only_depth_encoder = only_depth_encoder\n self.pretrained = pretrained\n self.scales = scales\n self.pose_model_type = pose_model_type\n self.predictive_mask = predictive_mask\n self.disable_automasking = disable_automasking\n self.v1_multiscale = v1_multiscale\n self.min_depth = min_depth\n self.max_depth = max_depth\n self.num_input_frames = len(self.frame_ids)\n self.num_pose_frames = 2 if self.pose_model_input == \"pairs\" else self.num_input_frames\n assert self.frame_ids[0] == 0, \"frame_ids must start with 0\"\n self.use_pose_net = not (self.use_stereo and self.frame_ids == [0])\n self.encoder = ResnetEncoder(self.num_layers)\n if not self.only_depth_encoder:\n self.depth = DepthDecoder(self.encoder.num_ch_enc, self.scales)\n if self.use_pose_net and not self.only_depth_encoder:\n if self.pose_model_type == \"separate_resnet\":\n self.pose_encoder = ResnetEncoder_pose(" + }, + { + "comment": "The code initializes a backbone model by defining its layers and scales, then initializing the weights of convolutional layers using Kaiming normalization and uniform initialization for bias. 
This backbone model is designed for handling pose estimation tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":972-995", + "content": " self.num_layers, num_input_images=self.num_pose_frames)\n self.pose = PoseDecoder(self.pose_encoder.num_ch_enc,\n num_input_features=1,\n num_frames_to_predict_for=2)\n self.backproject_depth = {}\n self.project_3d = {}\n for scale in self.scales:\n h = self.height // (2**scale)\n w = self.width // (2**scale)\n self.backproject_depth[scale] = BackprojectDepth(\n self.batch_size, h, w)\n self.project_3d[scale] = Project3D(batch_size, h, w)\n def init_weights(self):\n \"\"\"First init model's weight\"\"\"\n for m in self.sublayers(include_self=True):\n if isinstance(m, nn.Conv2D):\n kaiming_normal_(m.weight, a=math.sqrt(5))\n if m.bias is not None:\n fan_in, _ = _calculate_fan_in_and_fan_out(m.weight)\n bound = 1 / math.sqrt(fan_in)\n uniform_ = paddle.nn.initializer.Uniform(-bound, bound)" + }, + { + "comment": "This code defines a forward function for a backbone model. It applies the encoder to inputs and uses the depth module to extract features. If pose prediction is enabled, it adds poses to the output dictionary, generates images, and stores frame IDs and scales in the outputs dictionary. This function handles both day and night scenarios.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":996-1018", + "content": " uniform_(m.bias)\n \"\"\"Second, if provide pretrained ckpt, load it\"\"\"\n if self.pretrained: # load pretrained weights\n load_ckpt(self, self.pretrained)\n def forward(self, inputs, day_or_night='day'):\n if self.training:\n features, result = self.encoder(inputs[\"color_aug\", 0, 0], 'day')\n features_night, result_night = self.encoder(\n inputs[(\"color_n_aug\", 0, 0)], 'night')\n outputs = self.depth(features)\n outputs_night = self.depth(features_night)\n if self.use_pose_net and not self.only_depth_encoder:\n outputs.update(self.predict_poses(inputs, 'day'))\n outputs_night.update(self.predict_poses(inputs, 'night'))\n self.generate_images_pred(inputs, outputs, 'day')\n self.generate_images_pred(inputs, outputs_night, 'night')\n outputs['frame_ids'] = self.frame_ids\n outputs['scales'] = self.scales\n outputs['result'] = result" + }, + { + "comment": "This code handles both dictionary and non-dictionary inputs for a model. If the input is a dictionary, it selects the 'color' input and processes accordingly. It uses an encoder to extract features from the input, then passes those features through a depth function to get predictions. 
The predictions are converted to depth format, and the final outputs include pred_disp and gt (ground truth) for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":1019-1043", + "content": " outputs['result_night'] = result_night\n outputs_night['frame_ids'] = self.frame_ids\n outputs_night['scales'] = self.scales\n outputs['outputs_night'] = outputs_night\n else:\n if isinstance(inputs, dict):\n input_color = inputs[(\"color\", 0, 0)]\n features = self.encoder(input_color, day_or_night[0])\n outputs = self.depth(features)\n pred_disp, _ = disp_to_depth(outputs[(\"disp\", 0)],\n self.min_depth, self.max_depth)\n pred_disp = pred_disp[:, 0].numpy()\n outputs['pred_disp'] = np.squeeze(pred_disp)\n outputs['gt'] = np.squeeze(inputs['depth_gt'].numpy())\n else:\n input_color = inputs\n features = self.encoder(input_color, day_or_night)\n outputs = self.depth(features)\n pred_disp, _ = disp_to_depth(outputs[(\"disp\", 0)],\n self.min_depth, self.max_depth)" + }, + { + "comment": "This code is defining a function to predict poses between input frames for monocular sequences. It takes inputs as parameters and checks if the number of pose frames is 2. If so, it applies different treatments based on whether it's night or day. For night, it uses color_n_aug; for day, it uses color_aug. Then, it iterates through the frame IDs, excluding 's', and prepares inputs accordingly. The pose model type is \"separate_resnet\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":1045-1073", + "content": " pred_disp = pred_disp[:, 0]\n outputs = paddle.squeeze(pred_disp)\n return outputs\n def predict_poses(self, inputs, is_night):\n \"\"\"Predict poses between input frames for monocular sequences.\n \"\"\"\n outputs = {}\n if self.num_pose_frames == 2:\n if is_night:\n pose_feats = {\n f_i: inputs[\"color_n_aug\", f_i, 0]\n for f_i in self.frame_ids\n }\n else:\n pose_feats = {\n f_i: inputs[\"color_aug\", f_i, 0]\n for f_i in self.frame_ids\n }\n for f_i in self.frame_ids[1:]:\n if f_i != \"s\":\n if f_i < 0:\n pose_inputs = [pose_feats[f_i], pose_feats[0]]\n else:\n pose_inputs = [pose_feats[0], pose_feats[f_i]]\n if self.pose_model_type == \"separate_resnet\":\n pose_inputs = [" + }, + { + "comment": "This code segment defines a function that calculates pose, axisangle, translation, and camera transformation parameters for an image. It takes input from the \"pose_encoder\" function, combines them, and assigns the results to specific positions in the \"outputs\" dictionary. If the frame ID is negative, it inverts the calculated matrix. 
The code also initializes a nested loop over different scales and generates warped color images for a given batch of inputs.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":1074-1095", + "content": " self.pose_encoder(paddle.concat(pose_inputs,\n axis=1))\n ]\n axisangle, translation = self.pose(pose_inputs)\n outputs[(\"axisangle\", 0, f_i)] = axisangle\n outputs[(\"translation\", 0, f_i)] = translation\n # Invert the matrix if the frame id is negative\n outputs[(\"cam_T_cam\", 0,\n f_i)] = transformation_from_parameters(\n axisangle[:, 0],\n translation[:, 0],\n invert=(f_i < 0))\n return outputs\n def generate_images_pred(self, inputs, outputs, is_night):\n \"\"\"Generate the warped (reprojected) color images for a minibatch.\n Generated images are saved into the `outputs` dictionary.\n \"\"\"\n _, _, height, width = inputs['color', 0, 0].shape\n for scale in self.scales:" + }, + { + "comment": "The code interpolates the displacement output based on the scale, and if multiscale is not enabled, it performs bilinear interpolation to match the input size. It then converts the displacement into depth using disp_to_depth function. Depth and its corresponding scale are added to the outputs. For each frame ID in the list, it retrieves camera transformation matrix T, backprojects depth to 3D coordinates, projects them onto image plane using project_3d, and adds the resulting pixel coordinates to the outputs. If is_night is True, it modifies the color_n input's stop_gradient attribute.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":1096-1121", + "content": " disp = outputs[(\"disp\", scale)]\n if self.v1_multiscale:\n source_scale = scale\n else:\n disp = F.interpolate(disp, [height, width],\n mode=\"bilinear\",\n align_corners=False)\n source_scale = 0\n _, depth = disp_to_depth(disp, self.min_depth, self.max_depth)\n outputs[(\"depth\", 0, scale)] = depth\n for i, frame_id in enumerate(self.frame_ids[1:]):\n T = outputs[(\"cam_T_cam\", 0, frame_id)]\n cam_points = self.backproject_depth[source_scale](\n depth, inputs[(\"inv_K\", source_scale)])\n pix_coords = self.project_3d[source_scale](\n cam_points, inputs[(\"K\", source_scale)], T)\n outputs[(\"sample\", frame_id, scale)] = pix_coords\n if is_night:\n inputs[(\"color_n\", frame_id,\n source_scale)].stop_gradient = False" + }, + { + "comment": "This code performs grid sampling on a tensor and assigns the result to a specific location in the outputs dictionary based on frame_id and scale. If disable_automasking is True, it also creates an identity mask for night scenes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":1122-1141", + "content": " outputs[(\"color\", frame_id,\n scale)] = paddle.nn.functional.grid_sample(\n inputs[(\"color_n\", frame_id, source_scale)],\n outputs[(\"sample\", frame_id, scale)],\n padding_mode=\"border\",\n align_corners=False)\n else:\n inputs[(\"color\", frame_id,\n source_scale)].stop_gradient = False\n outputs[(\"color\", frame_id,\n scale)] = paddle.nn.functional.grid_sample(\n inputs[(\"color\", frame_id, source_scale)],\n outputs[(\"sample\", frame_id, scale)],\n padding_mode=\"border\",\n align_corners=False)\n if not self.disable_automasking:\n if is_night:\n outputs[(\"color_identity\", frame_id, scale)] = \\" + }, + { + "comment": "This code is selecting the input data from a dictionary based on specific conditions. 
If the frame_id and source_scale match, it assigns the value to \"color_n\". Otherwise, it assigns the value of \"color\" input to \"color_identity\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/backbones/adds.py\":1142-1145", + "content": " inputs[(\"color_n\", frame_id, source_scale)]\n else:\n outputs[(\"color_identity\", frame_id, scale)] = \\\n inputs[(\"color\", frame_id, source_scale)]" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fa8044e1-d121-43f2-ace0-5c4af43dbfdf.json b/docs/doc/fa8044e1-d121-43f2-ace0-5c4af43dbfdf.json new file mode 100644 index 000000000..ac335e91d --- /dev/null +++ b/docs/doc/fa8044e1-d121-43f2-ace0-5c4af43dbfdf.json @@ -0,0 +1,20 @@ +{ + "summary": "Recognizer2D is a 2D model in PaddleVideo for video analysis. It requires num_segs and includes functions for processing, training/validating, and testing the model. The Recognizer2D class defines forward_net and infer_step methods for classification scores.", + "details": [ + { + "comment": "Recognizer2D is a 2D recognizer model framework in PaddleVideo, inheriting from BaseRecognizer. It requires the number of segments (num_segs) which can be obtained from the shape of input images. The forward_net function performs image recognition using this model framework.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer2d.py\":0-26", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom ...registry import RECOGNIZERS\nfrom .base import BaseRecognizer\nimport paddle\nfrom paddlevideo.utils import get_logger\nlogger = get_logger(\"paddlevideo\")\n@RECOGNIZERS.register()\nclass Recognizer2D(BaseRecognizer):\n \"\"\"2D recognizer model framework.\"\"\"\n def forward_net(self, imgs):\n # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method.\n num_segs = imgs.shape[\n 1] # imgs.shape=[N,T,C,H,W], for most commonly case" + }, + { + "comment": "The code defines a recognizer2D model for video analysis. It consists of three main parts: the forward_net function that processes images, the train_step function for training the model using input data, and the val_step and test_step functions for validating and testing the trained model respectively. The forward_net function reshapes the images and passes them through a backbone network if one is defined, then to a head network if one is defined as well. It returns the classification scores. The train_step calculates the loss metrics using the provided labels, while the val_step does the same but in validation mode. 
The test_step computes the loss metrics without providing any labels.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer2d.py\":27-59", + "content": " imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:]))\n if self.backbone is not None:\n feature = self.backbone(imgs)\n else:\n feature = imgs\n if self.head is not None:\n cls_score = self.head(feature, num_segs)\n else:\n cls_score = None\n return cls_score\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels)\n return loss_metrics\n def val_step(self, data_batch):\n imgs = data_batch[0]\n labels = data_batch[1:]\n cls_score = self.forward_net(imgs)\n loss_metrics = self.head.loss(cls_score, labels, valid_mode=True)\n return loss_metrics\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n # NOTE: (s" + }, + { + "comment": "The code defines a Recognizer2D class with two methods: forward_net and infer_step. The forward_net method takes in images (imgs) and returns the classification scores (cls_score). The infer_step method is used for testing and follows the same process as forward_net to return cls_score.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/recognizers/recognizer2d.py\":59-68", + "content": "hipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n imgs = data_batch[0]\n cls_score = self.forward_net(imgs)\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fa96bb1c-e294-4472-97b3-2fb3e373dc5e.json b/docs/doc/fa96bb1c-e294-4472-97b3-2fb3e373dc5e.json new file mode 100644 index 000000000..927a1696e --- /dev/null +++ b/docs/doc/fa96bb1c-e294-4472-97b3-2fb3e373dc5e.json @@ -0,0 +1,295 @@ +{ + "summary": "The code utilizes PaddleVideo for video inference, including preprocessing steps and various action recognition techniques. It also offers classes for human detection and pose estimation which can be used for classification or object detection tasks in videos with NMS and label/probability display.", + "details": [ + { + "comment": "This code block is an import and error handling section for various Python libraries such as imageio, matplotlib, and json. It also contains license information and warning messages for required packages.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport json\nimport os\nimport shutil\nimport sys\nfrom typing import List\nimport pickle\nimport cv2\ntry:\n import imageio\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [imageio] package and it's dependencies is required for VideoSwin.\"\n )\ntry:\n import matplotlib as mpl\n import matplotlib.cm as cm\nexcept ImportError as e:\n print(\n f\"Warning! {e}, [matplotlib] package and it's dependencies is required for ADDS.\"" + }, + { + "comment": "This code imports necessary libraries, defines a directory path, appends the directory to the system path, and imports classes and functions from various modules within the PaddleVideo framework. It also includes abstract methods for building pipelines and metrics, as well as utility functions for model segmentation and post-processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":34-57", + "content": " )\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nimport pandas\nfrom PIL import Image\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\nfrom abc import abstractmethod\nfrom paddlevideo.loader.builder import build_pipeline\nfrom paddlevideo.loader.pipelines import (\n AutoPadding, CenterCrop, DecodeSampler, FeatureDecoder, FrameDecoder,\n GroupResize, Image2Array, ImageDecoder, JitterScale, MultiCrop,\n Normalization, PackOutput, Sampler, SamplerPkl, Scale, SkeletonNorm,\n TenCrop, ToArray, UniformCrop, VideoDecoder, SegmentationSampler,\n SketeonCropSample, MultiCenterCrop, SketeonCropSample, UniformSampleFrames,\n PoseDecode, PoseCompact, Resize, CenterCrop_V2, GeneratePoseTarget,\n FormatShape, Collect)\nfrom paddlevideo.metrics.ava_utils import read_labelmap\nfrom paddlevideo.metrics.bmn_metric import boundary_choose, soft_nms\nfrom paddlevideo.utils import Registry, build, get_config\nfrom paddlevideo.modeling.framework.segmenters.utils import ASRFPostProcessing" + }, + { + "comment": "This code imports functions from the \"ava_predict\" and \"yowo_utils\" modules. It defines a function called \"build_inference_helper\" which uses the \"Registry\" class to build an inference helper object. The base class for this object is defined as \"Base_Inference_helper\". 
This class has an initializer that takes arguments for number of segmentations, length of each segmentation, short size, target size, and top k.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":59-85", + "content": "from tools.ava_predict import (detection_inference, frame_extraction,\n get_detection_result, get_timestep_result,\n pack_result, visualize)\nfrom paddlevideo.modeling.framework.localizers.yowo_utils import nms, get_region_boxes\nINFERENCE = Registry('inference')\ndef build_inference_helper(cfg):\n return build(cfg, INFERENCE)\nclass Base_Inference_helper():\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=256,\n target_size=224,\n top_k=1):\n \"\"\"Base_Inference_helper\n Args:\n num_seg (int, optional): number of segmentations of an sliced input video. Defaults to 8.\n seg_len (int, optional): length of each segmentation. Defaults to 1.\n short_size (int, optional): short size of input video. Defaults to 256.\n target_size (int, optional): size of cropped video. Defaults to 224.\n top_k (int, optional): select topk result in outputs. Defaults to 1." + }, + { + "comment": "This code defines an abstract class with a preprocess method and a concrete implementation of the preprocess_batch method. The class has attributes for the number of segments, segment length, short size, target size, and top k. The preprocess_batch method processes each input file in a list of file paths and concatenates the processed data into batches. The input files are stored in the self.input\\_file attribute.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":86-120", + "content": " \"\"\"\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n @abstractmethod\n def preprocess(self, input_file: str):\n \"\"\"preprocess abstractmethod\n Args:\n input_file (str): input file path.\n \"\"\"\n pass\n def preprocess_batch(self, file_list: List[str]) -> List[np.ndarray]:\n \"\"\"preprocess for file list\n Args:\n file_list (List[str]): file pathes in an list, [path1, path2, ...].\n Returns:\n List[np.ndarray]: batched inputs data, [data_batch[0], data_batch[1], ...].\n \"\"\"\n batched_inputs = []\n for file in file_list:\n inputs = self.preprocess(file)\n batched_inputs.append(inputs)\n batched_inputs = [\n np.concatenate([item[i] for item in batched_inputs])\n for i in range(len(batched_inputs[0]))\n ]\n self.input_file = file_list\n return batched_inputs" + }, + { + "comment": "This function postprocesses output scores from a model, accepting batched output scores as input. It checks if the input file is a list and reshapes the output array accordingly. The code applies softmax to each individual output tensor along the last axis, then iterates over the number of inputs (N) to generate class predictions. Classes are sorted based on their scores, and the results are stored in a list for further use.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":122-146", + "content": " def postprocess(self,\n output: np.ndarray,\n print_output: bool = True,\n return_result: bool = False):\n \"\"\"postprocess\n Args:\n output (np.ndarray): batched output scores, shape of (batch_size, class_num).\n print_output (bool, optional): whether to print result. 
Defaults to True.\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]\n output = output.mean(axis=1) # [N, C]\n output = F.softmax(paddle.to_tensor(output), axis=-1).numpy()\n results_list = []\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]" + }, + { + "comment": "This code is creating a helper class for inference tasks. It takes input files, performs video classification using the PaddleVideo framework, and returns top-k class results for each video file. The class also has options to print output and return results as a list. The user can customize the number of segments, segment length, short side size, target size, and top-k values for the classification.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":147-175", + "content": " scores = output[i, classes]\n topk_class = classes[:self.top_k]\n topk_scores = scores[:self.top_k]\n result = {\n \"video_id\": self.input_file[i],\n \"topk_class\": topk_class,\n \"topk_scores\": topk_scores\n }\n results_list.append(result)\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n print(\"\\ttop-{0} class: {1}\".format(self.top_k, topk_class))\n print(\"\\ttop-{0} score: {1}\".format(self.top_k, topk_scores))\n if return_result:\n return results_list\n@INFERENCE.register()\nclass ppTSM_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=256,\n target_size=224,\n top_k=1):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size" + }, + { + "comment": "This code defines a class that takes in an input file path, applies several image preprocessing operations such as decoding, sampling, resizing, cropping, and normalization, then returns the processed image data in a list. The class also initializes some parameters like the number of segments, segment length, short size for resizing, target size for cropping, and top k value. The code is part of PaddleVideo library.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":176-210", + "content": " self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n ops = [\n VideoDecoder(backend=\"decord\"),\n Sampler(self.num_seg, self.seg_len, valid_mode=True),\n Scale(self.short_size),\n CenterCrop(self.target_size),\n Image2Array(),\n Normalization(img_mean, img_std)\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass ppTSN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=25,\n seg_len=1,\n short_size=256,\n target_size=224,\n top_k=1):" + }, + { + "comment": "This code snippet initializes a class object with several parameters (num_seg, seg_len, short_size, target_size, top_k) and defines a preprocess method. 
The preprocess method takes an input file path, performs various operations on the image using different ops such as VideoDecoder, Sampler, Scale, TenCrop, Image2Array, Normalization in sequence, and returns an array of processed images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":211-244", + "content": " self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n img_mean = [0.485, 0.456, 0.406]\n img_std = [0.229, 0.224, 0.225]\n ops = [\n VideoDecoder(backend=\"decord\"),\n Sampler(self.num_seg,\n self.seg_len,\n valid_mode=True,\n select_left=True),\n Scale(self.short_size,\n fixed_ratio=True,\n do_round=True,\n backend='cv2'),\n TenCrop(self.target_size),\n Image2Array(),\n Normalization(img_mean, img_std)\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()" + }, + { + "comment": "This function serves as a helper class for BMN inference and handles preprocessing of input files. It loads and preprocesses the features from the specified file path, converts them to float32 type, and returns the result in a list format. The postprocess function takes outputs as input, assuming it is a list containing predicted BMN, start time, and end time values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":245-277", + "content": " return [res]\n@INFERENCE.register()\nclass BMN_Inference_helper(Base_Inference_helper):\n def __init__(self, feat_dim, dscale, tscale, result_path):\n self.feat_dim = feat_dim\n self.dscale = dscale\n self.tscale = tscale\n self.result_path = result_path\n if not os.path.isdir(self.result_path):\n os.makedirs(self.result_path)\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n file_info = json.load(open(input_file))\n self.feat_path = file_info['feat_path']\n self.video_duration = file_info['duration_second']\n feat = np.load(self.feat_path).astype('float32').T\n res = np.expand_dims(feat, axis=0).copy()\n return [res]\n def postprocess(self, outputs, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n pred_bm, pred_start, pred_end = outputs" + }, + { + "comment": "This code defines a function _gen_props that calculates snippet xmin and xmax values, generates start and end masks from pred_start and pred_end, and initializes score_vector_list. 
It iterates over the dscale and tscale to determine start and end indices, checks if valid indices are found, and assigns xmin, xmax, xmin_score, and xmax_score accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":278-301", + "content": " self._gen_props(pred_bm, pred_start[0], pred_end[0], print_output)\n def _gen_props(self, pred_bm, pred_start, pred_end, print_output):\n snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)]\n snippet_xmaxs = [\n 1.0 / self.tscale * i for i in range(1, self.tscale + 1)\n ]\n pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :]\n start_mask = boundary_choose(pred_start)\n start_mask[0] = 1.\n end_mask = boundary_choose(pred_end)\n end_mask[-1] = 1.\n score_vector_list = []\n for idx in range(self.dscale):\n for jdx in range(self.tscale):\n start_index = jdx\n end_index = start_index + idx\n if end_index < self.tscale and start_mask[\n start_index] == 1 and end_mask[end_index] == 1:\n xmin = snippet_xmins[start_index]\n xmax = snippet_xmaxs[end_index]\n xmin_score = pred_start[start_index]\n xmax_score = pred_end[end_index]" + }, + { + "comment": "This code performs non-maximum suppression (NMS) on bounding box predictions, selects top-5 predictions for each video feature path, and stores the results in a dictionary. It also prints the top-5 predictions if `print_output` is enabled.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":302-327", + "content": " bm_score = pred_bm[idx, jdx]\n conf_score = xmin_score * xmax_score * bm_score\n score_vector_list.append([xmin, xmax, conf_score])\n cols = [\"xmin\", \"xmax\", \"score\"]\n score_vector_list = np.stack(score_vector_list)\n df = pandas.DataFrame(score_vector_list, columns=cols)\n result_dict = {}\n proposal_list = []\n df = soft_nms(df, alpha=0.4, t1=0.55, t2=0.9)\n for idx in range(min(100, len(df))):\n tmp_prop={\"score\":df.score.values[idx], \\\n \"segment\":[max(0,df.xmin.values[idx])*self.video_duration, \\\n min(1,df.xmax.values[idx])*self.video_duration]}\n proposal_list.append(tmp_prop)\n result_dict[self.feat_path] = proposal_list\n # print top-5 predictions\n if print_output:\n print(\"Current video file: {0} :\".format(self.feat_path))\n for pred in proposal_list[:5]:\n print(pred)\n # save result" + }, + { + "comment": "This code defines a class called TokenShift_Inference_helper, which extends Base_Inference_helper. It has several parameters for customizing the inference process and includes a preprocess method that reads an input file and returns results as a dictionary. 
The results are then written to a JSON file named \"bmn_results_inference.json\".", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":328-361", + "content": " outfile = open(\n os.path.join(self.result_path, \"bmn_results_inference.json\"), \"w\")\n json.dump(result_dict, outfile)\n@INFERENCE.register()\nclass TokenShift_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=256,\n target_size=256,\n top_k=1,\n mean=[0.5, 0.5, 0.5],\n std=[0.5, 0.5, 0.5]):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n self.mean = mean\n self.std = std\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [\n VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg)," + }, + { + "comment": "The code creates a series of data processing operations to preprocess input images for the TimeSformer model. It initializes an instance of TimeSformer_Inference_helper with specified parameters, then applies these operations in order on the input image, resulting in a final tensor ready for model inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":362-394", + "content": " Sampler(self.num_seg, self.seg_len, valid_mode=True),\n Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]),\n Image2Array(data_format='cthw'),\n JitterScale(self.short_size, self.short_size),\n MultiCenterCrop(self.target_size)\n ]\n for op in ops:\n results = op(results)\n # [N,C,Tx3,H,W]\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass TimeSformer_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=8,\n seg_len=1,\n short_size=224,\n target_size=224,\n top_k=1,\n mean=[0.45, 0.45, 0.45],\n std=[0.225, 0.225, 0.225]):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n self.mean = mean\n self.std = std\n def preprocess(self, input_file):" + }, + { + "comment": "This code defines a function that reads an input file, applies a series of operations to it, and returns the processed data. The operations include video decoding, sampling, normalization, image conversion, jitter scaling, and uniform cropping. 
The result is a tensor in the shape [N,C,Tx3,H,W], where N is the number of segments, C is the number of channels, Tx3 is the number of frames, H is the height, and W is the width.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":395-426", + "content": " \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [\n VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg),\n Sampler(self.num_seg,\n self.seg_len,\n valid_mode=True,\n linspace_sample=True),\n Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]),\n Image2Array(data_format='cthw'),\n JitterScale(self.short_size, self.short_size),\n UniformCrop(self.target_size)\n ]\n for op in ops:\n results = op(results)\n # [N,C,Tx3,H,W]\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass VideoSwin_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=4,\n seg_len=32,\n frame_interval=2," + }, + { + "comment": "This code defines a class for video preprocessing, taking input file path as parameter. It checks if the file exists and stores the filename in results dictionary. The class uses Decord backend for video decoding and Sampler to sample frames based on specified parameters.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":427-457", + "content": " short_size=224,\n target_size=224,\n top_k=1,\n mean=[123.675, 116.28, 103.53],\n std=[58.395, 57.12, 57.375]):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.frame_interval = frame_interval\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n self.mean = mean\n self.std = std\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n self.input_file = input_file\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [\n VideoDecoder(backend='decord', mode='valid'),\n Sampler(num_seg=self.num_seg,\n frame_interval=self.frame_interval,\n seg_len=self.seg_len,\n valid_mode=True,\n use_pil=False)," + }, + { + "comment": "The code preprocesses images by resizing, cropping, normalizing, and converting to arrays. It also provides a postprocessing function that handles outputs for multiple input files if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":458-488", + "content": " Scale(short_size=self.short_size,\n fixed_ratio=False,\n keep_ratio=True,\n backend='cv2',\n do_round=True),\n CenterCrop(target_size=224, backend='cv2'),\n Normalization(mean=self.mean,\n std=self.std,\n tensor_shape=[3, 1, 1, 1],\n inplace=True),\n Image2Array(data_format='cthw')\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n def postprocess(self, output, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]" + }, + { + "comment": "This code snippet is part of a function that extracts the top k classes and their corresponding scores from an output tensor. 
It first performs mean pooling along the axis 1 to reshape the tensor to [N, C] format, where N is the number of images and C is the number of channels. Then, it iterates over each image and finds the indexes of top k classes by performing argument partition and sorting them based on their scores. Finally, it prints out these results for each image if the print_output flag is set to True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":489-515", + "content": " output = output.mean(axis=1) # [N, C]\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n@INFERENCE.register()\nclass VideoSwin_TableTennis_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=1,\n seg_len=32,\n short_size=256,\n target_size=224,\n top_k=1):\n self.num_seg = num_seg\n self.seg_len = seg_len\n self.short_size = short_size\n self.target_size = target_size\n self.top_k = top_k\n def preprocess(self, input_file):" + }, + { + "comment": "This code defines a function that takes an input file, reads frames from it, applies various transformations including decoding, sampling, scaling, cropping, and normalization, and finally converts the resulting images to a numpy array. It uses the PaddleVideo library and has parameters for short_size, target_size, and num_seg.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":516-541", + "content": " \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'frame_dir': input_file, 'suffix': 'img_{:05}.jpg'}\n img_mean = [123.675, 116.28, 103.53]\n img_std = [58.395, 57.12, 57.375]\n ops = [\n FrameDecoder(),\n SamplerPkl(num_seg=self.num_seg,\n seg_len=self.seg_len,\n backend='cv2',\n valid_mode=True),\n Scale(short_size=self.short_size,\n fixed_ratio=False,\n keep_ratio=True,\n backend='cv2',\n do_round=True),\n UniformCrop(target_size=self.target_size, backend='cv2'),\n Normalization(mean=img_mean,\n std=img_std,\n tensor_shape=[3, 1, 1, 1],\n inplace=True),\n Image2Array(data_format='cthw')" + }, + { + "comment": "The code snippet is adding text to a video. It creates directories, loads or captures frames from the video, and extracts important information like frame length, FPS, and frame width. 
The code then calls other functions to manipulate images and add text to each frame before storing or displaying the final result.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":542-572", + "content": " ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['imgs'], axis=0).copy()\n return [res]\n def add_text_to_video(\n self,\n video_path,\n output_dir=\"applications/TableTennis/ActionRecognition/results\",\n text=None):\n os.makedirs(output_dir, exist_ok=True)\n if video_path.endswith('.pkl'):\n try:\n import cPickle as pickle\n from cStringIO import StringIO\n except ImportError:\n import pickle\n from io import BytesIO\n from PIL import Image\n data_loaded = pickle.load(open(video_path, 'rb'), encoding='bytes')\n _, _, frames = data_loaded\n frames_len = len(frames)\n else:\n videoCapture = cv2.VideoCapture()\n videoCapture.open(video_path)\n fps = videoCapture.get(cv2.CAP_PROP_FPS)\n frame_width = int(videoCapture.get(cv2.CAP_PROP_FRAME_WIDTH))" + }, + { + "comment": "The code reads the video frames and resizes them, then converts to RGB format. If the file is a .pkl file, it opens the image from binary data. It also adds text to each frame using cv2.putText. The code appends each frame in RGB format to a list, and finally, releases the videoCapture object, closes all windows, and saves the resulting GIF with a specific filename.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":573-594", + "content": " frame_height = int(videoCapture.get(cv2.CAP_PROP_FRAME_HEIGHT))\n frames_len = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT)\n print(\"fps=\", int(fps), \"frames=\", int(frames_len), \"scale=\",\n f\"{frame_height}x{frame_width}\")\n frames_rgb_list = []\n for i in range(int(frames_len)):\n if video_path.endswith('.pkl'):\n frame = np.array(\n Image.open(BytesIO(frames[i])).convert(\"RGB\").resize(\n (240, 135)))[:, :, ::-1].astype('uint8')\n else:\n _, frame = videoCapture.read()\n frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX,\n 1.0, (0, 0, 255), 2)\n frames_rgb_list.append(frame[:, :, ::-1]) # bgr to rgb\n if not video_path.endswith('.pkl'):\n videoCapture.release()\n cv2.destroyAllWindows()\n output_filename = os.path.basename(video_path)\n output_filename = output_filename.split('.')[0] + '.gif'" + }, + { + "comment": "The function `postprocess` takes an output list and processes it according to the specified parameters. It ensures that the shape of the input matches with the number of files in the input_file list, then calculates class scores for each video file. If print_output is True, it will print the current video file being processed. 
Finally, if save_gif is True, it creates a GIF using the frames_rgb_list and saves it to the specified output directory with the filename mentioned in the function call.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":595-619", + "content": " imageio.mimsave(f'{output_dir}/{output_filename}',\n frames_rgb_list,\n 'GIF',\n duration=0.00085)\n def postprocess(self, output, print_output=True, save_gif=True):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]\n output = output.mean(axis=1) # [N, C]\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))" + }, + { + "comment": "This code is a part of PaddleVideo's utils.py file, specifically the SlowFast_Inference_helper class, which handles video inference using the SlowFast model. The class has attributes for number of frames, sampling rate, target size, alpha value, and top k classes to display. It contains methods like preprocess, add_text_to_video, and infer. In this section, it displays the top-1 class and score for each frame in a video and adds text annotations to the first frame of the video if save_gif is set to True.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":620-650", + "content": " for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n if save_gif:\n self.add_text_to_video(\n self.input_file[0],\n text=f\"{str(classes[0])} {float(scores[0]):.5f}\")\n@INFERENCE.register()\nclass SlowFast_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_frames=32,\n sampling_rate=2,\n target_size=256,\n alpha=8,\n top_k=1):\n self.num_frames = num_frames\n self.sampling_rate = sampling_rate\n self.target_size = target_size\n self.alpha = alpha\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {" + }, + { + "comment": "This code defines a function for preprocessing and postprocessing video frames. It initializes parameters like filename, sampling rate, target size, and normalization values. The function applies a series of operations to the input image, such as decoding, jitter scaling, cropping, converting to array format, normalizing pixel values, and packing the output. 
Finally, it expands the result along an axis and returns the processed frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":651-681", + "content": " 'filename': input_file,\n 'temporal_sample_index': 0,\n 'spatial_sample_index': 0,\n 'temporal_num_clips': 1,\n 'spatial_num_clips': 1\n }\n img_mean = [0.45, 0.45, 0.45]\n img_std = [0.225, 0.225, 0.225]\n ops = [\n DecodeSampler(self.num_frames, self.sampling_rate, test_mode=True),\n JitterScale(self.target_size, self.target_size),\n MultiCrop(self.target_size),\n Image2Array(transpose=False),\n Normalization(img_mean, img_std, tensor_shape=[1, 1, 1, 3]),\n PackOutput(self.alpha),\n ]\n for op in ops:\n results = op(results)\n res = []\n for item in results['imgs']:\n res.append(np.expand_dims(item, axis=0).copy())\n return res\n def postprocess(self, output, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file," + }, + { + "comment": "This function reshapes the output tensor based on the number of input files, then calculates top classes and scores for each file. If print_output is True, it prints the top classes and scores for each video file. The output is from a STGCN (Spatio-Temporal Graph Convolutional Network) inference process.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":682-705", + "content": " ]\n output = output[0] # [B, num_cls]\n N = len(self.input_file)\n if output.shape[0] != N:\n output = output.reshape([N] + [output.shape[0] // N] +\n list(output.shape[1:])) # [N, T, C]\n output = output.mean(axis=1) # [N, C]\n # output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() # done in it's head\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n@INFERENCE.register()\nclass STGCN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_channels," + }, + { + "comment": "This code defines a class `CTRGCN_Inference_helper` that preprocesses data for CTRGCN inference. It takes input file path as parameter and returns processed data as list. The preprocessing includes applying auto-padding, skeleton normalization operations on the input data. 
The window size, vertex numbers, person numbers can be specified during initialization of the class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":706-739", + "content": " window_size,\n vertex_nums,\n person_nums,\n top_k=1):\n self.num_channels = num_channels\n self.window_size = window_size\n self.vertex_nums = vertex_nums\n self.person_nums = person_nums\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n data = np.load(input_file)\n results = {'data': data}\n ops = [AutoPadding(window_size=self.window_size), SkeletonNorm()]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['data'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass CTRGCN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_channels=3,\n vertex_nums=25,\n person_nums=2,\n window_size=64," + }, + { + "comment": "This code defines a class for preprocessing data and applying operations. It has an `__init__` method to initialize the window size, number of channels, vertex numbers, person numbers, and top k. The `preprocess` method takes a file path, asserts that it exists, loads the data, applies operations defined in ops, expands dimensions, and returns the processed data. It also registers this class for inference using the @INFERENCE decorator.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":740-774", + "content": " p_interval=[0.95],\n top_k=1):\n self.window_size = window_size\n self.p_interval = p_interval\n self.num_channels = num_channels\n self.vertex_nums = vertex_nums\n self.person_nums = person_nums\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n data = np.load(input_file)\n results = {'data': data}\n ops = [\n SketeonCropSample(window_size=self.window_size,\n p_interval=self.p_interval)\n ]\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['data'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass AGCN2s_Inference_helper(Base_Inference_helper):\n def __init__(self,\n window_size=300,\n num_channels=3,\n vertex_nums=25," + }, + { + "comment": "This code defines a class for preprocessing input data and an inference helper class for MSTCN. It initializes the class with parameters like window size, number of channels, vertex numbers, and top k. The `preprocess` method loads data from a file path and returns it as a list. 
The `MSTCN_Inference_helper` registers itself to be used by INFERENCE.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":775-806", + "content": " person_nums=2,\n top_k=1):\n self.window_size = window_size\n self.num_channels = num_channels\n self.vertex_nums = vertex_nums\n self.person_nums = person_nums\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n data = np.load(input_file)\n results = {'data': data}\n res = np.expand_dims(results['data'], axis=0).copy()\n return [res]\n@INFERENCE.register()\nclass MSTCN_Inference_helper(Base_Inference_helper):\n def __init__(self, num_channels, actions_map_file_path, feature_path=None):\n self.num_channels = num_channels\n file_ptr = open(actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])" + }, + { + "comment": "The code defines a class with methods to handle video feature files. It initializes the feature path and creates an empty list for file names. The `get_process_file` method reads the input text file, checks if each file exists, appends file paths to `self.file_name_list`, and returns a list of files. The `preprocess` method loads a feature file into data, creates a dictionary with 'video_feat' key, and returns it as output_list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":808-839", + "content": " self.feature_path = feature_path\n self.file_name_list = []\n def get_process_file(self, input_file_txt):\n with open(input_file_txt, 'r') as file_ptr:\n info = file_ptr.read().split('\\n')[:-1]\n files = []\n for video_name in info:\n if self.feature_path is not None:\n file_name = video_name.split('.')[0] + \".npy\"\n input_file = os.path.join(self.feature_path, file_name)\n else:\n input_file = video_name\n assert os.path.isfile(\n input_file) is not None, \"{0} not exists\".format(input_file)\n files.append(input_file)\n self.file_name_list.append(input_file.split('/')[-1].split('.')[0])\n return files\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, feature file list txt path\n return: list\n \"\"\"\n output_list = []\n data = np.load(input_file)\n results = {'video_feat': data, 'video_gt': None}\n ops = []" + }, + { + "comment": "The code processes video features, performs post-processing by creating a directory if it doesn't exist, appends the processed output to the output list and then creates separate text files for each result in the output list. 
The text files contain the recognized actions and are saved in the specified directory with corresponding filenames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":840-866", + "content": " for op in ops:\n results = op(results)\n res = np.expand_dims(results['video_feat'], axis=0).copy()\n output_list.append(res)\n return output_list\n def postprocess(self, output, print_output=True):\n reslut_path = os.path.join(\"./inference/infer_results/\")\n if not os.path.isdir(reslut_path):\n os.makedirs(reslut_path)\n output = [output]\n for outputs in output:\n output_np = outputs[0]\n recognition = []\n for i in range(output_np.shape[0]):\n recognition = np.concatenate((recognition, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(output_np[i])]\n ]))\n recog_content = list(recognition)\n recog_content = [line + \"\\n\" for line in recog_content]\n filename = self.file_name_list.pop(0)\n write_path = os.path.join(reslut_path, filename + \".txt\")\n f = open(write_path, \"w\")" + }, + { + "comment": "This code initializes an instance of the ASRF_Inference_helper class, which takes in parameters such as num_channels, actions_map_file_path, postprocessing_method, boundary_threshold, and feature_path. It reads the actions map file, splits the lines into separate action names and their corresponding indices, and stores them in a dictionary called self.actions_dict. The code also creates an empty list called self.file_name_list. Additionally, it defines another function called get_process_file that takes input_file_txt as a parameter and reads its content to store information for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":867-897", + "content": " f.writelines(recog_content)\n f.close()\n print(\"result write in : \" + write_path)\n@INFERENCE.register()\nclass ASRF_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_channels,\n actions_map_file_path,\n postprocessing_method,\n boundary_threshold,\n feature_path=None):\n self.num_channels = num_channels\n file_ptr = open(actions_map_file_path, 'r')\n actions = file_ptr.read().split('\\n')[:-1]\n file_ptr.close()\n self.actions_dict = dict()\n for a in actions:\n self.actions_dict[a.split()[1]] = int(a.split()[0])\n self.postprocessing_method = postprocessing_method\n self.boundary_threshold = boundary_threshold\n self.feature_path = feature_path\n self.file_name_list = []\n def get_process_file(self, input_file_txt):\n with open(input_file_txt, 'r') as file_ptr:\n info = file_ptr.read().split('\\n')[:-1]\n files = []" + }, + { + "comment": "The code defines a class with methods for loading feature files, preprocessing data, and post-processing results. The `load_features` method reads the feature file list, checks if each input file exists, and stores their names in `file_name_list`. The `preprocess` method loads the features from a specified input file, applies transformations defined by `ops`, and returns a processed output. 
The `postprocess` method saves the final results to the specified result path.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":898-931", + "content": " for video_name in info:\n if self.feature_path is not None:\n file_name = video_name.split('.')[0] + \".npy\"\n input_file = os.path.join(self.feature_path, file_name)\n else:\n input_file = video_name\n assert os.path.isfile(\n input_file) is not None, \"{0} not exists\".format(input_file)\n files.append(input_file)\n self.file_name_list.append(input_file.split('/')[-1].split('.')[0])\n return files\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, feature file list txt path\n return: list\n \"\"\"\n output_list = []\n data = np.load(input_file)\n results = {'video_feat': data, 'video_gt': None}\n ops = []\n for op in ops:\n results = op(results)\n res = np.expand_dims(results['video_feat'], axis=0).copy()\n output_list.append(res)\n return output_list\n def postprocess(self, output, print_output=True):\n reslut_path = os.path.join(\"./inference/infer_results/\")" + }, + { + "comment": "The code is creating a directory if it doesn't exist, then processing and storing video outputs into separate text files based on the actions detected. It uses a dictionary to match action values with corresponding labels. The processed output is written into a new file for each video, using the populated file name list as references.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":932-958", + "content": " if not os.path.isdir(reslut_path):\n os.makedirs(reslut_path)\n output = [output]\n for outputs in output:\n outputs_cls_np = outputs[0]\n outputs_boundary_np = outputs[1]\n output_np = ASRFPostProcessing(\n outputs_cls_np,\n outputs_boundary_np,\n self.postprocessing_method,\n boundary_threshold=self.boundary_threshold).numpy()[0, :]\n recognition = []\n for i in range(output_np.shape[0]):\n recognition = np.concatenate((recognition, [\n list(self.actions_dict.keys())[list(\n self.actions_dict.values()).index(output_np[i])]\n ]))\n recog_content = list(recognition)\n recog_content = [line + \"\\n\" for line in recog_content]\n filename = self.file_name_list.pop(0)\n write_path = os.path.join(reslut_path, filename + \".txt\")\n f = open(write_path, \"w\")\n f.writelines(recog_content)" + }, + { + "comment": "This code defines a class `AttentionLSTM_Inference_helper` that initializes attributes for processing data, and has a method `preprocess()` to process input file. 
The method applies feature decoding operations on the input file, stores results in dictionary format, and returns the result as a list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":959-992", + "content": " f.close()\n print(\"result write in : \" + write_path)\n@INFERENCE.register()\nclass AttentionLSTM_Inference_helper(Base_Inference_helper):\n def __init__(\n self,\n num_classes, #Optional, the number of classes to be classified.\n feature_num,\n feature_dims,\n embedding_size,\n lstm_size,\n top_k=1):\n self.num_classes = num_classes\n self.feature_num = feature_num\n self.feature_dims = feature_dims\n self.embedding_size = embedding_size\n self.lstm_size = lstm_size\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {'filename': input_file}\n ops = [FeatureDecoder(num_classes=self.num_classes, has_label=False)]\n for op in ops:\n results = op(results)\n res = []" + }, + { + "comment": "This code snippet defines a function and a class for video inference using the TransNetV2 model. The function takes input frames, processes them by dividing into windows of 100 frames, padding first/last window, and returns the results as a list of arrays representing data, lengths, and masks for 'rgb' and 'audio' modalities. The class initializes an instance with specified parameters for image size, number of channels, threshold value, output path, and visualization flag.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":993-1021", + "content": " for modality in ['rgb', 'audio']:\n res.append(\n np.expand_dims(results[f'{modality}_data'], axis=0).copy())\n res.append(\n np.expand_dims(results[f'{modality}_len'], axis=0).copy())\n res.append(\n np.expand_dims(results[f'{modality}_mask'], axis=0).copy())\n return res\n@INFERENCE.register()\nclass TransNetV2_Inference_helper():\n def __init__(self,\n num_frames,\n height,\n width,\n num_channels,\n threshold=0.5,\n output_path=None,\n visualize=True):\n self._input_size = (height, width, num_channels)\n self.output_path = output_path\n self.len_frames = 0\n self.threshold = threshold\n self.visualize = visualize\n def input_iterator(self, frames):\n # return windows of size 100 where the first/last 25 frames are from the previous/next batch\n # the first and last window must be padded by copies of the first and last frame of the video" + }, + { + "comment": "This code is part of a function that takes in an input file and preprocesses it. It imports the 'ffmpeg' library, checks if it exists or not, and then proceeds with the data processing operations. 
The code calculates the number of padded frames based on the total number of frames, concatenates the start frame, frames, and end frame into a single array, and then iteratively yields batches of 100 elements from this array as an iterator for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1022-1050", + "content": " no_padded_frames_start = 25\n no_padded_frames_end = 25 + 50 - (\n len(frames) % 50 if len(frames) % 50 != 0 else 50) # 25 - 74\n start_frame = np.expand_dims(frames[0], 0)\n end_frame = np.expand_dims(frames[-1], 0)\n padded_inputs = np.concatenate([start_frame] * no_padded_frames_start +\n [frames] +\n [end_frame] * no_padded_frames_end, 0)\n ptr = 0\n while ptr + 100 <= len(padded_inputs):\n out = padded_inputs[ptr:ptr + 100]\n out = out.astype(np.float32)\n ptr += 50\n yield out[np.newaxis]\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: iterator\n \"\"\"\n try:\n import ffmpeg\n except ImportError as e:\n print(\n f\"Warning! {e}, [ffmpeg-python] package and it's dependencies is required for TransNetV2.\"\n )\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(" + }, + { + "comment": "The code initializes a video input and extracts frames from it. It then reshapes the frames into a 3D array and stores them for further processing. The `input_iterator` function returns an iterator over these frames. The `predictions_to_scenes` function takes predictions, converts them to binary format (0 or 1), and iterates through them to identify scene changes based on consecutive 0's and 1's.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1051-1073", + "content": " input_file)\n self.input_file = input_file\n self.filename = os.path.splitext(os.path.split(self.input_file)[1])[0]\n video_stream, err = ffmpeg.input(\n self.input_file).output(\"pipe:\",\n format=\"rawvideo\",\n pix_fmt=\"rgb24\",\n s=\"48x27\").run(capture_stdout=True,\n capture_stderr=True)\n self.frames = np.frombuffer(video_stream,\n np.uint8).reshape([-1, 27, 48, 3])\n self.len_frames = len(self.frames)\n return self.input_iterator(self.frames)\n def predictions_to_scenes(self, predictions):\n predictions = (predictions > self.threshold).astype(np.uint8)\n scenes = []\n t, t_prev, start = -1, 0, 0\n for i, t in enumerate(predictions):\n if t_prev == 1 and t == 0:\n start = i\n if t_prev == 0 and t == 1 and i != 0:" + }, + { + "comment": "The code above is part of a video processing tool. It appends the start and end frames of a scene to a list, skips scenes with no changes in predictions, pads frames to ensure even widths, and then flattens the scene lists into an array. 
The `visualize_predictions` function takes a sequence of frames and predictions, pads them to match lengths, and splits the frames into a grid based on width.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1074-1102", + "content": " scenes.append([start, i])\n t_prev = t\n if t == 0:\n scenes.append([start, i])\n # just fix if all predictions are 1\n if len(scenes) == 0:\n return np.array([[0, len(predictions) - 1]], dtype=np.int32)\n return np.array(scenes, dtype=np.int32)\n def visualize_predictions(self, frames, predictions):\n from PIL import Image, ImageDraw\n if isinstance(predictions, np.ndarray):\n predictions = [predictions]\n ih, iw, ic = frames.shape[1:]\n width = 25\n # pad frames so that length of the video is divisible by width\n # pad frames also by len(predictions) pixels in width in order to show predictions\n pad_with = width - len(frames) % width if len(\n frames) % width != 0 else 0\n frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)),\n (0, 0)])\n predictions = [np.pad(x, (0, pad_with)) for x in predictions]\n height = len(frames) // width" + }, + { + "comment": "The code takes in a list of predictions and reshapes them into an image. It then iterates over the frames and predictions, drawing lines to visualize multiple predictions per frame. Finally, it returns the processed image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1104-1132", + "content": " img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic])\n img = np.concatenate(np.split(\n np.concatenate(np.split(img, height), axis=2)[0], width),\n axis=2)[0, :-1]\n img = Image.fromarray(img)\n draw = ImageDraw.Draw(img)\n # iterate over all frames\n for i, pred in enumerate(zip(*predictions)):\n x, y = i % width, i // width\n x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1\n # we can visualize multiple predictions per single frame\n for j, p in enumerate(pred):\n color = [0, 0, 0]\n color[(j + 1) % 3] = 255\n value = round(p * (ih - 1))\n if value != 0:\n draw.line((x + j, y, x + j, y - value),\n fill=tuple(color),\n width=1)\n return img\n def postprocess(self, outputs, print_output=True):\n \"\"\"\n output: list\n \"\"\"" + }, + { + "comment": "This code generates predictions for single and all frames. It extracts logits from outputs, applies sigmoid function to convert them into probabilities, and stores the results in a list. Finally, it concatenates the lists of single and all frame predictions for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1133-1148", + "content": " predictions = []\n for output in outputs:\n single_frame_logits, all_frames_logits = output\n single_frame_pred = F.sigmoid(paddle.to_tensor(single_frame_logits))\n all_frames_pred = F.sigmoid(paddle.to_tensor(all_frames_logits))\n predictions.append((single_frame_pred.numpy()[0, 25:75, 0],\n all_frames_pred.numpy()[0, 25:75, 0]))\n single_frame_pred = np.concatenate(\n [single_ for single_, all_ in predictions])\n all_frames_pred = np.concatenate(\n [all_ for single_, all_ in predictions])\n single_frame_predictions, all_frame_predictions = single_frame_pred[:\n self\n .\n len_frames], all_frames_pred[:\n self" + }, + { + "comment": "The code takes in single-frame and all-frame predictions, converts them into shot boundary scenes, and then optionally prints the output. If an output path is provided and it doesn't exist, it creates the directory. 
It then stacks the two prediction arrays horizontally, saves the frame predictions file with formatted floats, and saves the scene file with formatted integers.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1149-1168", + "content": " .\n len_frames]\n scenes = self.predictions_to_scenes(single_frame_predictions)\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file))\n print(\"\\tShot Boundarys: {0}\".format(scenes))\n if self.output_path:\n if not os.path.exists(self.output_path):\n os.makedirs(self.output_path)\n predictions = np.stack(\n [single_frame_predictions, all_frame_predictions], 1)\n predictions_file = os.path.join(self.output_path,\n self.filename + \"_predictions.txt\")\n np.savetxt(predictions_file, predictions, fmt=\"%.6f\")\n scenes_file = os.path.join(self.output_path,\n self.filename + \"_scenes.txt\")\n np.savetxt(scenes_file, scenes, fmt=\"%d\")" + }, + { + "comment": "This code initializes an ADDS_Inference_helper object with various parameters such as frame indices, number of scales, side map, height, width, full resolution shape, number of channels, image extension, and K. The visualize feature is also included to display predictions on saved images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1170-1200", + "content": " if self.visualize:\n pil_image = self.visualize_predictions(\n self.frames,\n predictions=(single_frame_predictions,\n all_frame_predictions))\n image_file = os.path.join(self.output_path,\n self.filename + \"_vis.png\")\n pil_image.save(image_file)\n@INFERENCE.register()\nclass ADDS_Inference_helper(Base_Inference_helper):\n def __init__(self,\n frame_idxs=[0],\n num_scales=4,\n side_map={\n \"2\": 2,\n \"3\": 3,\n \"l\": 2,\n \"r\": 3\n },\n height=256,\n width=512,\n full_res_shape=None,\n num_channels=None,\n img_ext=\".png\",\n K=None):\n self.frame_idxs = frame_idxs\n self.num_scales = num_scales\n self.side_map = side_map" + }, + { + "comment": "The code defines a class with attributes 'full_res_shape', 'img_ext', 'height', 'width', and 'K'. It also has a method 'preprocess' that takes an input file path, checks if the file exists, and returns a list. The preprocess method uses three operations: ImageDecoder, GroupResize, and ToArray(). These operations are applied in sequence to preprocess the image data from the given input file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1201-1236", + "content": " self.full_res_shape = full_res_shape\n self.img_ext = img_ext\n self.height = height\n self.width = width\n self.K = K\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n results = {\n 'filename': input_file,\n 'mode': 'infer',\n 'day_or_night': 'day',\n }\n ops = [\n ImageDecoder(\n backend='pil',\n dataset='kitti',\n frame_idxs=self.frame_idxs,\n num_scales=self.num_scales,\n side_map=self.side_map,\n full_res_shape=self.full_res_shape,\n img_ext=self.img_ext,\n ),\n GroupResize(\n height=self.height,\n width=self.width,\n K=self.K,\n scale=1,\n mode='infer',\n ),\n ToArray()," + }, + { + "comment": "This function processes a list of outputs and performs post-processing operations on each output. 
It checks if the input file is a single item or a list, then iterates over the outputs to extract depth maps, optionally prints information about each input image and saves the associated depth map as an image file in a specified directory. The code also converts the depth maps to PNG format before saving them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1237-1263", + "content": " ]\n for op in ops:\n results = op(results)\n res = results['imgs'][('color', 0, 0)]\n res = np.expand_dims(res, axis=0).copy()\n return [res]\n def postprocess(self, output, print_output, save_dir='data/'):\n \"\"\"\n output: list\n \"\"\"\n if not isinstance(self.input_file, list):\n self.input_file = [\n self.input_file,\n ]\n print(len(output))\n N = len(self.input_file)\n for i in range(N):\n pred_depth = output[i] # [H, W]\n if print_output:\n print(\"Current input image: {0}\".format(self.input_file[i]))\n file_name = os.path.basename(self.input_file[i]).split('.')[0]\n save_path = os.path.join(save_dir,\n file_name + \"_depth\" + \".png\")\n pred_depth_color = self._convertPNG(pred_depth)\n pred_depth_color.save(save_path)\n print(f\"pred depth image saved to: {save_path}\")" + }, + { + "comment": "This code defines a function `_convertPNG` that converts an image to PNG format after resizing, normalizing, and color mapping. The class `AVA_SlowFast_FastRCNN_Inference_helper` initializes with various parameters for detection model inference and output settings.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1265-1290", + "content": " def _convertPNG(self, image_numpy):\n disp_resized = cv2.resize(image_numpy, (1280, 640))\n disp_resized_np = disp_resized\n vmax = np.percentile(disp_resized_np, 95)\n normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)\n mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')\n colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *\n 255).astype(np.uint8)\n im = Image.fromarray(colormapped_im)\n return im\n@INFERENCE.register()\nclass AVA_SlowFast_FastRCNN_Inference_helper(Base_Inference_helper):\n def __init__(self,\n detection_model_name,\n detection_model_weights,\n config_file_path,\n predict_stepsize=8,\n output_stepsize=4,\n output_fps=6,\n out_filename='ava_det_demo.mp4',\n num_frames=32,\n alpha=4,\n target_size=256):\n self.detection_model_name = detection_model_name" + }, + { + "comment": "The code is initializing some parameters and then extracting frames from the input video file for further processing. It builds a pipeline configuration for testing, sets clip length, and calculates center indices of each clip. 
The extracted frames will be used for object detection or other tasks in subsequent steps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1291-1320", + "content": " self.detection_model_weights = detection_model_weights\n self.config = get_config(config_file_path,\n show=False) #parse config file\n self.predict_stepsize = predict_stepsize\n self.output_stepsize = output_stepsize\n self.output_fps = output_fps\n self.out_filename = out_filename\n self.num_frames = num_frames\n self.alpha = alpha\n self.target_size = target_size\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n \"\"\"\n frame_dir = 'tmp_frames'\n self.frame_paths, frames, FPS = frame_extraction(input_file, frame_dir)\n num_frame = len(self.frame_paths) #\u89c6\u9891\u79d2\u6570*FPS\n assert num_frame != 0\n # \u5e27\u56fe\u50cf\u9ad8\u5ea6\u548c\u5bbd\u5ea6\n h, w, _ = frames[0].shape\n # Get clip_len, frame_interval and calculate center index of each clip\n data_process_pipeline = build_pipeline(\n self.config.PIPELINE.test) #\u6d4b\u8bd5\u65f6\u8f93\u51fa\u5904\u7406\u6d41\u6c34\u914d\u7f6e\n clip_len = self.config.PIPELINE.test.sample['clip_len']" + }, + { + "comment": "The code asserts for an even clip_len and frame_interval, calculates window size, generates timestamps for selecting frames, creates a list of selected frames, reads label map from file and assigns categories to a dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1321-1343", + "content": " assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = self.config.PIPELINE.test.sample['frame_interval']\n # \u6b64\u5904\u5173\u952e\u5e27\u6bcf\u79d2\u53d6\u4e00\u4e2a\n clip_len = self.config.PIPELINE.test.sample['clip_len']\n assert clip_len % 2 == 0, 'We would like to have an even clip_len'\n frame_interval = self.config.PIPELINE.test.sample['frame_interval']\n window_size = clip_len * frame_interval\n timestamps = np.arange(window_size // 2,\n (num_frame + 1 - window_size // 2),\n self.predict_stepsize)\n selected_frame_list = []\n for timestamp in timestamps:\n selected_frame_list.append(self.frame_paths[timestamp - 1])\n # Load label_map\n label_map_path = self.config.DATASET.test['label_file']\n self.categories, self.class_whitelist = read_labelmap(\n open(label_map_path))\n label_map = {}\n for item in self.categories:\n id = item['id']" + }, + { + "comment": "This code is initializing a label map, running object detection inference on a list of frames, and extracting detection results for each timestamp. 
It then processes these results by getting proposals and scores for each frame, and checks if there are any detections (if not, it proceeds).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1344-1368", + "content": " name = item['name']\n label_map[id] = name\n self.label_map = label_map\n detection_result_dir = 'tmp_detection'\n detection_model_name = self.detection_model_name\n detection_model_weights = self.detection_model_weights\n detection_txt_list = detection_inference(selected_frame_list,\n detection_result_dir,\n detection_model_name,\n detection_model_weights)\n assert len(detection_txt_list) == len(timestamps)\n human_detections = []\n data_list = []\n person_num_list = []\n for timestamp, detection_txt_path in zip(timestamps,\n detection_txt_list):\n proposals, scores = get_detection_result(\n detection_txt_path, h, w,\n (float)(self.config.DATASET.test['person_det_score_thr']))\n if proposals.shape[0] == 0:" + }, + { + "comment": "This code is part of a data processing pipeline in PaddleVideo. It appends proposals and scores to the result dictionary, reshapes tensors for image and proposal inputs, and converts images and proposal lists to Paddle Tensors for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1369-1399", + "content": " #person_num_list.append(0)\n human_detections.append(None)\n continue\n human_detections.append(proposals)\n result = get_timestep_result(frame_dir,\n timestamp,\n clip_len,\n frame_interval,\n FPS=FPS)\n result[\"proposals\"] = proposals\n result[\"scores\"] = scores\n new_result = data_process_pipeline(result)\n proposals = new_result['proposals']\n img_slow = new_result['imgs'][0]\n img_slow = img_slow[np.newaxis, :]\n img_fast = new_result['imgs'][1]\n img_fast = img_fast[np.newaxis, :]\n proposals = proposals[np.newaxis, :]\n scores = scores[np.newaxis, :]\n img_shape = np.asarray(new_result['img_shape'])\n img_shape = img_shape[np.newaxis, :]\n data = [\n paddle.to_tensor(img_slow, dtype='float32')," + }, + { + "comment": "This code defines a class with methods to create and post-process human detections. It takes in various directories as input, and outputs lists of data and predictions. The preprocess method converts image, proposals, and shape into tensors, and appends the number of people and data list for each frame. 
The postprocess method takes output from the model and checks if human_detections is None for each timestamp, then adds predictions to a list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1400-1432", + "content": " paddle.to_tensor(img_fast, dtype='float32'),\n paddle.to_tensor(proposals, dtype='float32'),\n paddle.to_tensor(img_shape, dtype='int32')\n ]\n person_num = proposals.shape[1]\n person_num_list.append(person_num)\n data_list.append(data)\n self.human_detections = human_detections\n self.person_num_list = person_num_list\n self.timestamps = timestamps\n self.frame_dir = frame_dir\n self.detection_result_dir = detection_result_dir\n return data_list\n def postprocess(self, outputs, print_output=True):\n \"\"\"\n output: list\n \"\"\"\n predictions = []\n assert len(self.person_num_list) == len(outputs)\n #print(\"*** self.human_detections\",len( self.human_detections))\n #print(\"*** outputs\",len( outputs))\n index = 0\n for t_index in range(len(self.timestamps)):\n if self.human_detections[t_index] is None:\n predictions.append(None)" + }, + { + "comment": "This code iterates over human detections and their corresponding outputs. If a detection is None, it appends a None value to the predictions list. It then iterates through the result array for each class, checking if the action score exceeds the specified threshold. For each valid action score, it adds the class label and score to the prediction list. Finally, it appends the prediction list to the predictions list.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1433-1463", + "content": " continue\n human_detection = self.human_detections[t_index]\n output = outputs[index]\n result = output #\u957f\u5ea6\u4e3a\u7c7b\u522b\u4e2a\u6570\uff0c\u4e0d\u5305\u542b\u80cc\u666f\n person_num = self.person_num_list[index]\n index = index + 1\n prediction = []\n if human_detection is None:\n predictions.append(None)\n continue\n # N proposals\n for i in range(person_num):\n prediction.append([])\n # Perform action score thr\n for i in range(len(result)): # for class\n if i + 1 not in self.class_whitelist:\n continue\n for j in range(person_num):\n if result[i][j, 4] > self.config.MODEL.head['action_thr']:\n prediction[j].append(\n (self.label_map[i + 1], result[i][j, 4]\n )) # label_map is a dict, label index start from 1\n predictions.append(prediction)" + }, + { + "comment": "Code snippet reads frames from specific paths, performs human detections and predictions, and densely samples timestamps to create a sequence of images. 
It then visualizes these images and attempts to import moviepy library for output file creation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1465-1489", + "content": " results = []\n for human_detection, prediction in zip(self.human_detections,\n predictions):\n results.append(pack_result(human_detection, prediction))\n def dense_timestamps(timestamps, n):\n \"\"\"Make it nx frames.\"\"\"\n old_frame_interval = (timestamps[1] - timestamps[0])\n start = timestamps[0] - old_frame_interval / n * (n - 1) / 2\n new_frame_inds = np.arange(\n len(timestamps) * n) * old_frame_interval / n + start\n return new_frame_inds.astype(np.int)\n dense_n = int(self.predict_stepsize / self.output_stepsize) #30\n frames = [\n cv2.imread(self.frame_paths[i - 1])\n for i in dense_timestamps(self.timestamps, dense_n)\n ]\n vis_frames = visualize(frames, results)\n try:\n import moviepy.editor as mpy\n except ImportError:\n raise ImportError('Please install moviepy to enable output file')" + }, + { + "comment": "This code snippet defines a class PoseC3D_Inference_helper that handles image processing and inference for pose estimation. It includes methods for preprocessing, such as loading data from file, defining keypoint indices for left and right body parts, and applying various operations like frame sampling, pose decoding, and compacting the pose results. The code also demonstrates error handling by checking if input files exist before processing them, and performs cleanup of temporary directories after writing video files.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1491-1522", + "content": " vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],\n fps=self.output_fps)\n vid.write_videofile(self.out_filename)\n print(\"finish write !\")\n # delete tmp files and dirs\n shutil.rmtree(self.frame_dir)\n shutil.rmtree(self.detection_result_dir)\n@INFERENCE.register()\nclass PoseC3D_Inference_helper(Base_Inference_helper):\n def __init__(self, top_k=1):\n self.top_k = top_k\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n with open(input_file, 'rb') as f:\n data = pickle.load(f)\n self.input_file = input_file\n left_kp = [1, 3, 5, 7, 9, 11, 13, 15]\n right_kp = [2, 4, 6, 8, 10, 12, 14, 16]\n ops = [\n UniformSampleFrames(clip_len=48, num_clips=10, test_mode=True),\n PoseDecode(),\n PoseCompact(hw_ratio=1., allow_imgpad=True)," + }, + { + "comment": "The code appears to be a part of a PaddleVideo tool that performs image preprocessing, resizing, cropping, and pose estimation. It uses PaddlePaddle library functions such as Resize, CenterCrop_V2, GeneratePoseTarget, FormatShape, Collect, and F.softmax for various operations. 
The code also calculates the number of segments and performs post-processing on output results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1523-1547", + "content": " Resize(scale=(-1, 56)),\n CenterCrop_V2(crop_size=56),\n GeneratePoseTarget(sigma=0.6,\n use_score=True,\n with_kp=True,\n with_limb=False,\n double=True,\n left_kp=left_kp,\n right_kp=right_kp),\n FormatShape(input_format='NCTHW'),\n Collect(keys=['imgs', 'label'], meta_keys=[])\n ]\n for op in ops:\n results = op(data)\n results = [results[0][np.newaxis, :, :, :, :, :]]\n self.num_segs = results[0].shape[1]\n return results\n def postprocess(self, outputs, print_output=True):\n batch_size = outputs[0].shape[0]\n cls_score = outputs[0].reshape(\n [batch_size // self.num_segs, self.num_segs, outputs[0].shape[-1]])\n output = F.softmax(paddle.to_tensor(cls_score),\n axis=2).mean(axis=1).numpy()" + }, + { + "comment": "This code snippet is a part of YOWO_Inference_helper class in PaddleVideo. It initializes the class with parameters such as num_seg, target_size, nms_thresh, conf_thresh_valid, mean, and std. The class seems to be used for image classification or object detection tasks, based on the presence of top-k classes and scores.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1548-1573", + "content": " N = len(self.input_file)\n for i in range(N):\n classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]\n classes = classes[np.argsort(-output[i, classes])]\n scores = output[i, classes]\n if print_output:\n print(\"Current video file: {0}\".format(self.input_file[i]))\n for j in range(self.top_k):\n print(\"\\ttop-{0} class: {1}\".format(j + 1, classes[j]))\n print(\"\\ttop-{0} score: {1}\".format(j + 1, scores[j]))\n@INFERENCE.register()\nclass YOWO_Inference_helper(Base_Inference_helper):\n def __init__(self,\n num_seg=16,\n target_size=224,\n nms_thresh=0.5,\n conf_thresh_valid=0.5,\n mean=[0.4345, 0.4051, 0.3775],\n std=[0.2768, 0.2713, 0.2737]):\n self.num_seg = num_seg\n self.target_size = target_size\n self.nms_thresh = nms_thresh\n self.conf_thresh_valid = conf_thresh_valid" + }, + { + "comment": "This code is initializing a preprocess function for video input. It checks if the input file exists, then uses OpenCV to read frames from the video file. The function populates a queue with initial frames, adds new frames, and resizes them using interpolation. 
Finally, it converts images to CHW order while keeping BGR values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1574-1605", + "content": " self.mean = mean\n self.std = std\n def preprocess(self, input_file):\n \"\"\"\n input_file: str, file path\n return: list\n \"\"\"\n assert os.path.isfile(input_file) is not None, \"{0} not exists\".format(\n input_file)\n cap = cv2.VideoCapture(input_file)\n queue = []\n inputs = []\n frames = []\n while (cap.isOpened()):\n ret, frame = cap.read()\n if ret == False:\n break\n if len(queue) <= 0: # At initialization, populate queue with initial frame\n for i in range(self.num_seg):\n queue.append(frame)\n # Add the read frame to last and pop out the oldest one\n queue.append(frame)\n queue.pop(0)\n # Resize images\n imgs = [cv2.resize(img, (self.target_size, self.target_size), interpolation=cv2.INTER_LINEAR) for img in\n queue]\n # Convert image to CHW keeping BGR order.\n imgs = [img.transpose([2, 0, 1]) for img in imgs]" + }, + { + "comment": "The code normalizes the image values to [0, 1] range and reshapes them into a specific format. It then concatenates the images to form a single array and expands dimensions as necessary before appending it to the inputs list. The postprocess function takes outputs, frames, frame, and filename as input and returns labels for classification tasks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1607-1637", + "content": " # Image [0, 255] -> [0, 1].\n imgs = [img / 255.0 for img in imgs]\n imgs = [\n np.ascontiguousarray(\n img.reshape((3, imgs[0].shape[1], imgs[0].shape[2]))\n ).astype(np.float32)\n for img in imgs\n ]\n # Concat list of images to single ndarray.\n imgs = np.concatenate(\n [np.expand_dims(img, axis=1) for img in imgs], axis=1\n )\n imgs = np.ascontiguousarray(imgs)\n imgs = np.expand_dims(imgs, axis=0)\n imgs = np.expand_dims(imgs, axis=0)\n inputs.append(imgs)\n frames.append(queue[-1])\n return inputs, frames\n def postprocess(self, outputs, frame, filename, save_img=True):\n \"\"\"\n outputs: list\n frames: list\n \"\"\"\n labels = [\n \"Basketball\", \"BasketballDunk\", \"Biking\", \"CliffDiving\", \"CricketBowling\",\n \"Diving\", \"Fencing\", \"FloorGymnastics\", \"GolfSwing\", \"HorseRiding\"," + }, + { + "comment": "This code appears to be involved in object detection and recognition. It applies Non-Maximum Suppression (NMS) to the predicted bounding boxes to filter out redundant detections, calculates the adjusted coordinates for each box, and extracts the classification confidence scores for each class of the detected objects. 
The specific activity being detected or the model architecture used is not specified in this code snippet.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1638-1659", + "content": " \"IceDancing\", \"LongJump\", \"PoleVault\", \"RopeClimbing\", \"SalsaSpin\",\n \"SkateBoarding\", \"Skiing\", \"Skijet\", \"SoccerJuggling\", \"Surfing\",\n \"TennisSwing\", \"TrampolineJumping\", \"VolleyballSpiking\", \"WalkingWithDog\"]\n nms_thresh = 0.5\n font = cv2.FONT_HERSHEY_SIMPLEX\n for out in outputs:\n out = paddle.to_tensor(out)\n preds = []\n all_boxes = get_region_boxes(out)\n for i in range(out.shape[0]):\n boxes = all_boxes[i]\n boxes = nms(boxes, nms_thresh)\n for box in boxes:\n x1 = round(float(box[0] - box[2] / 2.0) * 320.0)\n y1 = round(float(box[1] - box[3] / 2.0) * 240.0)\n x2 = round(float(box[0] + box[2] / 2.0) * 320.0)\n y2 = round(float(box[1] + box[3] / 2.0) * 240.0)\n det_conf = float(box[4])\n for j in range((len(box) - 5) // 2):\n cls_conf = float(box[5 + 2 * j].item())" + }, + { + "comment": "This code is part of a video object detection system. It calculates the probability (prob) of detections based on confidence (det_conf) and class confidence (cls_conf). The detections are stored in preds list. If the probability is below 0.4, the loop breaks. Then it draws rectangles around detected objects on the frame using their coordinates from preds[0], colors them green, and displays text with object label and probability using cv2.putText(). Finally, it saves the processed frame as a .jpg image named after filename.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/tools/utils.py\":1660-1669", + "content": " prob = det_conf * cls_conf\n preds.append([[x1, y1, x2, y2], prob, labels[int(box[6])]])\n for _, dets in enumerate(preds):\n if dets[1] < 0.4:\n break\n text = dets[2] + ' ' + '{:.2f}'.format(dets[1])\n cv2.rectangle(frame, (dets[0][0], dets[0][1]), (dets[0][2], dets[0][3]), (0, 255, 0), 2)\n cv2.putText(frame, text, (dets[0][0] + 3, dets[0][1] - 5 - 10 * _), font, 0.5, (0, 255, 0), 2)\n cv2.imwrite('{}.jpg'.format(filename), frame)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fc12e4fe-cfd4-4ba1-b47a-fa2b0843d027.json b/docs/doc/fc12e4fe-cfd4-4ba1-b47a-fa2b0843d027.json new file mode 100644 index 000000000..c608065f9 --- /dev/null +++ b/docs/doc/fc12e4fe-cfd4-4ba1-b47a-fa2b0843d027.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports all modules and functions from the \"tsn\" subdirectory within the current package, allowing easy access to those components in other parts of the code. This is commonly used for modularity and organization in larger projects.", + "details": [ + { + "comment": "This code imports all modules and functions from the \"tsn\" subdirectory within the current package, allowing easy access to those components in other parts of the code. This is commonly used for modularity and organization in larger projects.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoTag/models/tsn/__init__.py\":0-0", + "content": "from .tsn import *" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fc9c3324-d108-479a-aba6-5d4fb5a89d52.json b/docs/doc/fc9c3324-d108-479a-aba6-5d4fb5a89d52.json new file mode 100644 index 000000000..97375a671 --- /dev/null +++ b/docs/doc/fc9c3324-d108-479a-aba6-5d4fb5a89d52.json @@ -0,0 +1,20 @@ +{ + "summary": "This code utilizes Paddle2ONNX to convert PP-TSN model, and demonstrates prediction using ONNX engine. 
Environment setup involves installing necessary packages and downloading the inference model for conversion & prediction. The code generates output for video files with top-1 class and score.", + "details": [ + { + "comment": "This code describes how to convert a PaddlePaddle (PP-TSN) model into an ONNX model and predict using the ONNX engine. It requires environment preparation by installing Paddle2ONNX and ONNXRuntime. Afterward, PP-TSN inference model should be downloaded for conversion and prediction.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/readme_en.md\":0-27", + "content": "# paddle2onnx model conversion and prediction\nThis chapter describes how the PP-TSN model is transformed into an ONNX model and predicted based on the ONNX engine.\n## 1. Environment preparation\nNeed to prepare Paddle2ONNX model conversion environment, and ONNX model prediction environment.\nPaddle2ONNX supports converting the PaddlePaddle model format to the ONNX model format. The operator currently supports exporting ONNX Opset 9~11 stably, and some Paddle operators support lower ONNX Opset conversion.\nFor more details, please refer to [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md)\n- Install Paddle2ONNX\n```bash\npython3.7 -m pip install paddle2onnx\n```\n- Install ONNXRuntime\n```bash\n# It is recommended to install version 1.9.0, and the version number can be changed according to the environment\npython3.7 -m pip install onnxruntime==1.9.0\n```\n## 2. Model conversion\n- PP-TSN inference model download\n ```bash\n # Download the inference model to the PaddleVideo/inference/ppTSN/ directory" + }, + { + "comment": "The provided code is for model conversion and prediction using Paddle2ONNX. First, it downloads an inference model from a URL, decompresses it, and then converts the Paddle inference model to ONNX format. Finally, it executes an example prediction using the converted ONNX model.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/readme_en.md\":28-60", + "content": " mkdir -p ./inference\n wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip\n # Decompress the inference model\n pushd ./inference\n unzip ppTSN.zip\n popd\n ```\n- Model conversion\n Convert Paddle inference models to ONNX format models using Paddle2ONNX:\n ```bash\n paddle2onnx \\\n --model_dir=./inference/ppTSN \\\n --model_filename=ppTSN.pdmodel \\\n --params_filename=ppTSN.pdiparams \\\n --save_file=./inference/ppTSN/ppTSN.onnx \\\n --opset_version=10 \\\n --enable_onnx_checker=True\n ```\nAfter execution, you can find that a model file `ppTSN.onnx` in ONNX format is generated in the `./inference/ppTSN` directory\n## 3. onnx prediction\nNext, you can use the ONNX format model for prediction, which is similar to the paddle prediction model\nExecute the following command:\n```bash\npython3.7 deploy/paddle2onnx/predict_onnx.py \\\n--input_file data/example.avi \\\n--config configs/recognition/pptsn/pptsn_k400_videos.yaml \\\n--onnx_file=./inference/ppTSN/ppTSN.onnx" + }, + { + "comment": "This code demonstrates how to generate an output for a video file using PaddleVideo. 
The top-1 class and score are displayed, which can be verified with the result of Paddle inference.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/paddle2onnx/readme_en.md\":61-69", + "content": "```\nThe result is as follows:\n```bash\nCurrent video file: data/example.avi\n top-1 class: 5\n top-1 score: 0.9998553991317749\n```\nIt can be verified that the result is completely consistent with the prediction result of Paddle inference" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fd74e93d-2298-44b2-9f58-9e3e7162ad1b.json b/docs/doc/fd74e93d-2298-44b2-9f58-9e3e7162ad1b.json new file mode 100644 index 000000000..03719c98f --- /dev/null +++ b/docs/doc/fd74e93d-2298-44b2-9f58-9e3e7162ad1b.json @@ -0,0 +1,35 @@ +{ + "summary": "This code defines MSR-Vtt dataset paths, performs type checking, and loads features for a specific expert, handling aggregation, caching, and saving raw captions. It also checks text features, verifies format, size, and number of test captions, calculates missing queries, and raises errors for incorrect query mask sum.", + "details": [ + { + "comment": "The code snippet is part of the MSRVTT class in the PaddleVideo library. It defines a dataset for MSR-Vtt, a large-scale video description dataset. The dataset_paths method returns the paths to train and test data splits. This method is typechecked to ensure that input types match expected data structures.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py\":0-28", + "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nimport copy\nfrom pathlib import Path\nfrom utils import memory_summary\nfrom typeguard import typechecked\nfrom typing import Dict, Union, List\nfrom base.base_dataset import BaseDataset\nfrom zsvision.zs_utils import memcache, concat_features\nclass MSRVTT(BaseDataset):\n @staticmethod\n @typechecked\n def dataset_paths() -> Dict[str, Union[str, List[str], Path, Dict]]:\n subset_paths = {}\n split_name = \"jsfusion\"\n train_list_path = \"train_list_jsfusion.txt\"\n test_list_path = \"val_list_jsfusion.txt\"" + }, + { + "comment": "This code defines the data split paths for training and validation sets, as well as custom feature paths for different types of features. The JSFusion test caption indices path is also specified to reproduce a specific evaluation subset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py\":29-45", + "content": " # NOTE: The JSFusion split (referred to as 1k-A in the paper) uses all\n # videos, but randomly samples a single caption per video from the test\n # set for evaluation. 
To reproduce this evaluation, we use the indices\n # of the test captions, and restrict to this subset during eval.\n js_test_cap_idx_path = \"jsfusion_val_caption_idx.pkl\"\n subset_paths[split_name] = {\"train\": train_list_path, \"val\": test_list_path}\n custom_paths = {\n \"features_audio\": [\"mmt_feats/features.audio.pkl\"],\n \"features_flow\": [\"mmt_feats/features.flow_agg.pkl\"],\n \"features_rgb\": [\"mmt_feats/features.rgb_agg.pkl\"],\n \"features_scene\": [\"mmt_feats/features.scene.pkl\"],\n \"features_face\": [\"mmt_feats/features.face_agg.pkl\"],\n \"features_ocr\": [\"mmt_feats/features.ocr.pkl\"],\n \"features_s3d\": [\"mmt_feats/features.s3d.pkl\"],\n \"features_speech\": [\"mmt_feats/features.speech.pkl\"],\n }\n text_feat_paths = {" + }, + { + "comment": "This code is loading features from the MSRVTT dataset. It defines paths for text features and raw captions, and then updates a dictionary with custom paths, subset list paths, text feature paths, raw caption path, and JS test caption index path. The load_features method retrieves these paths and loads the features accordingly.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py\":46-70", + "content": " \"openai\": \"w2v_MSRVTT_openAIGPT.pickle\",\n }\n text_feat_paths = {key: Path(\"aggregated_text_feats\") / fname\n for key, fname in text_feat_paths.items()}\n feature_info = {\n \"custom_paths\": custom_paths,\n \"subset_list_paths\": subset_paths,\n \"text_feat_paths\": text_feat_paths,\n \"raw_captions_path\": \"raw-captions.pkl\",\n \"js_test_cap_idx_path\": js_test_cap_idx_path,\n }\n return feature_info\n def load_features(self):\n root_feat = Path(self.root_feat)\n feat_names = {}\n custom_path_key = \"custom_paths\"\n feat_names.update(self.paths[custom_path_key])\n features = {}\n for expert, rel_names in feat_names.items():\n if expert not in self.ordered_experts:\n continue\n feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])\n if len(feat_paths) == 1:\n features[expert] = memcache(feat_paths[0])" + }, + { + "comment": "This code is handling feature aggregation for a specific expert. It checks if the aggregation method is \"concat\" and then concatenates the features based on the given axis. If not, it throws an error message. The code also caches information about the concatenated features, copies the features for each split, and stores them in the 'features' dictionary. Finally, it saves raw captions using memcache and updates self.raw_captions.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py\":71-88", + "content": " else:\n # support multiple forms of feature (e.g. max and avg pooling). 
For\n # now, we only support direct concatenation\n msg = f\"{expert}: Only direct concatenation of muliple feats is possible\"\n print(f\"Concatenating aggregates for {expert}....\")\n is_concat = self.feat_aggregation[expert][\"aggregate\"] == \"concat\"\n self.log_assert(is_concat, msg=msg)\n axis = self.feat_aggregation[expert][\"aggregate-axis\"]\n x = concat_features.cache_info() # pylint: disable=no-value-for-parameter\n print(f\"concat cache info: {x}\")\n features_ = concat_features(feat_paths, axis=axis)\n memory_summary()\n # Make separate feature copies for each split to allow in-place filtering\n features[expert] = copy.deepcopy(features_)\n self.features = features\n self.raw_captions = memcache(root_feat / self.paths[\"raw_captions_path\"])" + }, + { + "comment": "This code retrieves text features from the cache and checks if they belong to a specific training set. It also verifies that the train text features are in the expected format (a list with length 19 or 20).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py\":89-107", + "content": " text_feat_path = root_feat / self.paths[\"text_feat_paths\"][self.text_feat]\n self.text_features = memcache(text_feat_path)\n if self.restrict_train_captions:\n # hash the video names to avoid O(n) lookups in long lists\n train_list = set(self.partition_lists[\"train\"])\n for key, val in self.text_features.items():\n if key not in train_list:\n continue\n if not self.split_name == \"full-test\":\n # Note that we do not perform this sanity check for the full-test\n # split, because the text features in the cached dataset will\n # already have been cropped to the specified\n # `resstrict_train_captions`\n expect = {19, 20}\n msg = f\"expected train text feats as lists with length {expect}\"\n has_expected_feats = isinstance(val, list) and len(val) in expect\n self.log_assert(has_expected_feats, msg=msg)" + }, + { + "comment": "The code checks if the number of test captions is set to 20 and verifies that the corresponding validation list size matches expected values. It calculates the missing queries based on the validation list size and raises a ValueError for unrecognized test sets. 
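To make the "concat" aggregation branch described above concrete, here is a hedged, self-contained sketch (plain NumPy, hypothetical helper name) of what concatenating multiple cached feature dicts for one expert along a chosen axis amounts to; the actual loader delegates this to `concat_features` from `zsvision` with result caching.

```python
# Illustrative only: merge several {video_id: ndarray} feature dicts for one expert
# by concatenating the per-video arrays along a given axis.
import numpy as np

def concat_expert_features(feature_dicts, axis=1):
    merged = {}
    for video_id in feature_dicts[0]:
        merged[video_id] = np.concatenate(
            [d[video_id] for d in feature_dicts], axis=axis)
    return merged

rgb_max = {"video0": np.ones((1, 2048))}
rgb_avg = {"video0": np.zeros((1, 2048))}
print(concat_expert_features([rgb_max, rgb_avg], axis=1)["video0"].shape)  # (1, 4096)
```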
The code asserts that the difference between query mask sum and its size should be equal to the number of missing queries, with an error message if not correct.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py\":109-125", + "content": " # restrict to the first N captions (deterministic)\n self.text_features[key] = val[:self.restrict_train_captions]\n self.summary_stats()\n def sanity_checks(self):\n if self.num_test_captions == 20:\n if len(self.partition_lists[\"val\"]) == 2990:\n missing = 6\n elif len(self.partition_lists[\"val\"]) == 1000:\n missing = 2\n elif len(self.partition_lists[\"val\"]) == 497:\n missing = 0\n else:\n raise ValueError(\"unrecognised test set\")\n msg = \"Expected to find two missing queries in MSRVTT for full eval\"\n correct_missing = self.query_masks.sum() == self.query_masks.size - missing\n self.log_assert(correct_missing, msg=msg)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fda7878c-b567-4c38-aaa8-71eeca454894.json b/docs/doc/fda7878c-b567-4c38-aaa8-71eeca454894.json new file mode 100644 index 000000000..4bd4a1077 --- /dev/null +++ b/docs/doc/fda7878c-b567-4c38-aaa8-71eeca454894.json @@ -0,0 +1,145 @@ +{ + "summary": "PaddleVideo's \"object_detection_evaluation\" module offers `ObjectDetectionEvaluator` for evaluating object detection outcomes, including metrics like mAP and mean correct localization, considering IOU threshold. It handles AVA dataset and computes AVA metrics for average precision and mean average precision.", + "details": [ + { + "comment": "The provided code is a part of the \"object_detection_evaluation\" module in the PaddleVideo library. This module provides a class called \"ObjectDetectionEvaluation\" that manages ground truth information for object detection datasets, computes frequently used metrics like Precision, Recall, and CorLoc from detection results. The class supports adding ground truth information sequentially and various operations for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":0-20", + "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# =============================================================================\n\"\"\"object_detection_evaluation module.\nObjectDetectionEvaluation is a class which manages ground truth information of\na object detection dataset, and computes frequently used detection metrics such\nas Precision, Recall, CorLoc of the provided detection results.\nIt supports the following operations:\n1) Add ground truth information of images sequentially." + }, + { + "comment": "This code defines an abstract class `DetectionEvaluator` for evaluating object detection results. It takes categories as input and allows adding single ground truth and detected image information. After adding all the data, it can be evaluated to get a metrics dictionary. 
This evaluation is done on numpy boxes and box lists.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":21-57", + "content": "2) Add detection result of images sequentially.\n3) Evaluate detection metrics on already inserted detection results.\n4) Write evaluation result into a pickle file for future processing or\n visualization.\nNote: This module operates on numpy boxes and box lists.\n\"\"\"\nimport collections\nimport logging\nfrom abc import ABCMeta, abstractmethod\nimport numpy as np\nfrom . import metrics, per_image_evaluation, standard_fields\nclass DetectionEvaluator:\n \"\"\"Interface for object detection evalution classes.\n Example usage of the Evaluator:\n ------------------------------\n evaluator = DetectionEvaluator(categories)\n # Detections and groundtruth for image 1.\n evaluator.add_single_groundtruth_image_info(...)\n evaluator.add_single_detected_image_info(...)\n # Detections and groundtruth for image 2.\n evaluator.add_single_groundtruth_image_info(...)\n evaluator.add_single_detected_image_info(...)\n metrics_dict = evaluator.evaluate()\n \"\"\"\n __metaclass__ = ABCMeta\n def __init__(self, categories):" + }, + { + "comment": "This code defines a class constructor that takes categories as input and provides two abstract methods for adding ground truth and detected image information for evaluation. The categories are used to uniquely identify different objects in the images.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":58-85", + "content": " \"\"\"Constructor.\n Args:\n categories: A list of dicts, each of which has the following keys -\n 'id': (required) an integer id uniquely identifying this\n category.\n 'name': (required) string representing category name e.g.,\n 'cat', 'dog'.\n \"\"\"\n self._categories = categories\n @abstractmethod\n def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):\n \"\"\"Adds groundtruth for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n groundtruth_dict: A dictionary of groundtruth numpy arrays required\n for evaluations.\n \"\"\"\n @abstractmethod\n def add_single_detected_image_info(self, image_id, detections_dict):\n \"\"\"Adds detections for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n detections_dict: A dictionary of detection numpy arrays required" + }, + { + "comment": "This code defines an `ObjectDetectionEvaluator` class that evaluates object detection results. It takes categories, IOU threshold, options for evaluating corner localizations and masks. 
The `evaluate()` method returns a dictionary of metrics, while the `clear()` method clears the state for a new evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":86-119", + "content": " for evaluation.\n \"\"\"\n @abstractmethod\n def evaluate(self):\n \"\"\"Evaluates detections and returns a dictionary of metrics.\"\"\"\n @abstractmethod\n def clear(self):\n \"\"\"Clears the state to prepare for a fresh evaluation.\"\"\"\nclass ObjectDetectionEvaluator(DetectionEvaluator):\n \"\"\"A class to evaluate detections.\"\"\"\n def __init__(\n self,\n categories,\n matching_iou_threshold=0.5,\n evaluate_corlocs=False,\n metric_prefix=None,\n use_weighted_mean_ap=False,\n evaluate_masks=False,\n ):\n \"\"\"Constructor.\n Args:\n categories: A list of dicts, each of which has the following keys -\n 'id': (required) an integer id uniquely identifying this\n category.\n 'name': (required) string representing category name e.g.,\n 'cat', 'dog'.\n matching_iou_threshold: IOU threshold to use for matching\n groundtruth boxes to detection boxes." + }, + { + "comment": "This code is initializing the ObjectDetectionEvaluator class, which evaluates object detection performance. It takes in optional parameters for corloc scores, metric prefix, and weighted mean AP computation. It checks if category IDs are 1-indexed and raises a ValueError if not.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":120-138", + "content": " evaluate_corlocs: (optional) boolean which determines if corloc\n scores are to be returned or not.\n metric_prefix: (optional) string prefix for metric name; if None,\n no prefix is used.\n use_weighted_mean_ap: (optional) boolean which determines if the\n mean average precision is computed directly from the scores and\n tp_fp_labels of all classes.\n evaluate_masks: If False, evaluation will be performed based on\n boxes. If True, mask evaluation will be performed instead.\n Raises:\n ValueError: If the category ids are not 1-indexed.\n \"\"\"\n super(ObjectDetectionEvaluator, self).__init__(categories)\n self._num_classes = max([cat['id'] for cat in categories])\n if min(cat['id'] for cat in categories) < 1:\n raise ValueError('Classes should be 1-indexed.')\n self._matching_iou_threshold = matching_iou_threshold\n self._use_weighted_mean_ap = use_weighted_mean_ap" + }, + { + "comment": "This code is initializing an object detection evaluation module, specifically for the Aggregated Average Precision metric. The module takes in parameters such as the number of ground truth classes, matching IOU threshold, and a label offset. It also adds a single image's ground truth information for evaluation purposes. 
The function expects an image ID and a dictionary containing ground truth boxes information.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":139-159", + "content": " self._label_id_offset = 1\n self._evaluate_masks = evaluate_masks\n self._evaluation = ObjectDetectionEvaluation(\n num_groundtruth_classes=self._num_classes,\n matching_iou_threshold=self._matching_iou_threshold,\n use_weighted_mean_ap=self._use_weighted_mean_ap,\n label_id_offset=self._label_id_offset,\n )\n self._image_ids = set([])\n self._evaluate_corlocs = evaluate_corlocs\n self._metric_prefix = (metric_prefix + '_') if metric_prefix else ''\n def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):\n \"\"\"Adds groundtruth for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n groundtruth_dict: A dictionary containing -\n standard_fields.InputDataFields.groundtruth_boxes: float32\n numpy array of shape [num_boxes, 4] containing `num_boxes`\n groundtruth boxes of the format [ymin, xmin, ymax, xmax] in" + }, + { + "comment": "The code is defining the `add_groundtruth` function that takes in an image ID and a groundtruth dictionary. It checks if the image ID has been added before, and raises a ValueError if it has. The groundtruth dictionary should contain 'boxes', 'groundtruth_classes', 'groundtruth_difficult' (optional), and 'groundtruth_instance_masks' (if difficult instances). If the groundtruth is valid, it adds the information to the _image_ids set and initializes corresponding arrays for that image ID. If instance masks are not in the groundtruth dictionary, it raises a ValueError.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":160-178", + "content": " absolute image coordinates.\n standard_fields.InputDataFields.groundtruth_classes: integer\n numpy array of shape [num_boxes] containing 1-indexed\n groundtruth classes for the boxes.\n standard_fields.InputDataFields.groundtruth_difficult: Optional\n length M numpy boolean array denoting whether a ground\n truth box is a difficult instance or not. This field is\n optional to support the case that no boxes are difficult.\n standard_fields.InputDataFields.groundtruth_instance_masks:\n Optional numpy array of shape [num_boxes, height, width]\n with values in {0, 1}.\n Raises:\n ValueError: On adding groundtruth for an image more than once. Will\n also raise error if instance masks are not in groundtruth\n dictionary.\n \"\"\"\n if image_id in self._image_ids:\n raise ValueError(" + }, + { + "comment": "This code is checking if an image with a specific id already exists. If not, it retrieves the groundtruth classes and difficult labels from the dictionary, either from existing keys or by setting them to None if not present or empty. 
It also checks if the image_id is already added to avoid duplicates.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":179-197", + "content": " 'Image with id {} already added.'.format(image_id))\n groundtruth_classes = (\n groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_classes] -\n self._label_id_offset)\n # If the key is not present in the groundtruth_dict or the array is\n # empty (unless there are no annotations for the groundtruth on this\n # image) use values from the dictionary or insert None otherwise.\n if (standard_fields.InputDataFields.groundtruth_difficult\n in groundtruth_dict.keys()) and (groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_difficult].size\n or\n not groundtruth_classes.size):\n groundtruth_difficult = groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_difficult]\n else:\n groundtruth_difficult = None\n if not len(self._image_ids) % 1000:" + }, + { + "comment": "This code block checks if the ground truth difficult flag is specified for an image and raises a warning if not. It then adds single ground truth image information, including bounding boxes, class labels, and mask (if available), to the evaluation object. This allows for evaluating the performance of the object detection model on the given image.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":198-218", + "content": " logging.warn(('image %s does not have groundtruth difficult '\n 'flag specified'), image_id)\n groundtruth_masks = None\n if self._evaluate_masks:\n if (standard_fields.InputDataFields.groundtruth_instance_masks\n not in groundtruth_dict):\n raise ValueError(\n 'Instance masks not in groundtruth dictionary.')\n groundtruth_masks = groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_instance_masks]\n self._evaluation.add_single_ground_truth_image_info(\n image_key=image_id,\n groundtruth_boxes=groundtruth_dict[\n standard_fields.InputDataFields.groundtruth_boxes],\n groundtruth_class_labels=groundtruth_classes,\n groundtruth_is_difficult_list=groundtruth_difficult,\n groundtruth_masks=groundtruth_masks,\n )\n self._image_ids.update([image_id])\n def add_single_detected_image_info(self, image_id, detections_dict):" + }, + { + "comment": "This code snippet adds detections for a single image to be used in evaluation. It takes in image_id and a dictionary containing detection boxes, scores, classes, and masks as input. The detection boxes are represented by a float32 numpy array of shape [num_boxes, 4] with the format [ymin, xmin, ymax, xmax] in absolute image coordinates. 
Detection scores and classes are integer numpy arrays representing the scores and classes for each box respectively, while detection masks are represented by a uint8 numpy array of shape [num_boxes, height, width].", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":219-235", + "content": " \"\"\"Adds detections for a single image to be used for evaluation.\n Args:\n image_id: A unique string/integer identifier for the image.\n detections_dict: A dictionary containing -\n standard_fields.DetectionResultFields.detection_boxes: float32\n numpy array of shape [num_boxes, 4] containing `num_boxes`\n detection boxes of the format [ymin, xmin, ymax, xmax] in\n absolute image coordinates.\n standard_fields.DetectionResultFields.detection_scores: float32\n numpy array of shape [num_boxes] containing detection\n scores for the boxes.\n standard_fields.DetectionResultFields.detection_classes:\n integer numpy array of shape [num_boxes] containing\n 1-indexed detection classes for the boxes.\n standard_fields.DetectionResultFields.detection_masks: uint8\n numpy array of shape [num_boxes, height, width] containing" + }, + { + "comment": "This code block retrieves detection classes and masks from the \"detections_dict\" dictionary. If _evaluate_Masks is True, it checks if detection masks are present in detections_dict. If not, it raises a ValueError. Then, it adds single detected image information to _evaluation using detected boxes, scores, and (optionally) detection masks.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":236-258", + "content": " `num_boxes` masks of values ranging between 0 and 1.\n Raises:\n ValueError: If detection masks are not in detections dictionary.\n \"\"\"\n detection_classes = (\n detections_dict[\n standard_fields.DetectionResultFields.detection_classes] -\n self._label_id_offset)\n detection_masks = None\n if self._evaluate_masks:\n if (standard_fields.DetectionResultFields.detection_masks\n not in detections_dict):\n raise ValueError(\n 'Detection masks not in detections dictionary.')\n detection_masks = detections_dict[\n standard_fields.DetectionResultFields.detection_masks]\n self._evaluation.add_single_detected_image_info(\n image_key=image_id,\n detected_boxes=detections_dict[\n standard_fields.DetectionResultFields.detection_boxes],\n detected_scores=detections_dict[\n standard_fields.DetectionResultFields.detection_scores]," + }, + { + "comment": "This code is related to object detection evaluation in the AVA dataset. The `create_category_index` function creates a dictionary of COCO compatible categories, keyed by category id. The `evaluate` function computes the evaluation results, returning a dictionary of metrics including summary_metrics. 
The code also uses `detection_classes` and `detection_masks` for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":259-289", + "content": " detected_class_labels=detection_classes,\n detected_masks=detection_masks,\n )\n def create_category_index(self, categories):\n \"\"\"Creates dictionary of COCO compatible categories keyed by category\n id.\n Args:\n categories: a list of dicts, each of which has the following keys:\n 'id': (required) an integer id uniquely identifying this\n category.\n 'name': (required) string representing category name\n e.g., 'cat', 'dog', 'pizza'.\n Returns:\n category_index: a dict containing the same entries as categories,\n but keyed by the 'id' field of each category.\n \"\"\"\n category_index = {}\n for cat in categories:\n category_index[cat['id']] = cat\n return category_index\n def evaluate(self):\n \"\"\"Compute evaluation result.\n Returns:\n A dictionary of metrics with the following fields -\n 1. summary_metrics:" + }, + { + "comment": "This code calculates the mean average precision (mAP) and optionally, the mean correct localization score (meanCorLoc), at a specified IOU threshold for object detection evaluation. It creates metrics under different categories using category-specific results. The calculated values are then stored in the pascal_metrics dictionary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":290-314", + "content": " 'Precision/mAP@IOU': mean average\n precision at the specified IOU threshold\n 2. per_category_ap: category specific results with keys of the form\n 'PerformanceByCategory/mAP@IOU/category'\n \"\"\"\n (\n per_class_ap,\n mean_ap,\n _,\n _,\n per_class_corloc,\n mean_corloc,\n ) = self._evaluation.evaluate()\n metric = f'mAP@{self._matching_iou_threshold}IOU'\n pascal_metrics = {self._metric_prefix + metric: mean_ap}\n if self._evaluate_corlocs:\n pascal_metrics[self._metric_prefix +\n 'Precision/meanCorLoc@{}IOU'.format(\n self._matching_iou_threshold)] = mean_corloc\n category_index = self.create_category_index(self._categories)\n for idx in range(per_class_ap.size):\n if idx + self._label_id_offset in category_index:\n display_name = (" + }, + { + "comment": "This code calculates average precision (AP) and optional correct localization (CorLoc) metrics for object detection by category. It appends these metrics to the pascal_metrics dictionary based on the matching IOU threshold and category names from the category_index. 
The clear() function resets the evaluation state for a new evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":315-337", + "content": " self._metric_prefix +\n 'PerformanceByCategory/AP@{}IOU/{}'.format(\n self._matching_iou_threshold,\n category_index[idx + self._label_id_offset]['name'],\n ))\n pascal_metrics[display_name] = per_class_ap[idx]\n # Optionally add CorLoc metrics.classes\n if self._evaluate_corlocs: #False\n display_name = (\n self._metric_prefix +\n 'PerformanceByCategory/CorLoc@{}IOU/{}'.format(\n self._matching_iou_threshold,\n category_index[idx +\n self._label_id_offset]['name'],\n ))\n pascal_metrics[display_name] = per_class_corloc[idx]\n return pascal_metrics\n def clear(self):\n \"\"\"Clears the state to prepare for a fresh evaluation.\"\"\"\n self._evaluation = ObjectDetectionEvaluation(" + }, + { + "comment": "The given code is a part of an object detection evaluation module in PaddleVideo. It defines classes and functions to evaluate detections using PASCAL metrics. The ObjectDetectionEvaluator class initializes with categories, matching IoU threshold, evaluating corlocs flag, and use weighted mean AP flag. PascalDetectionEvaluator is a subclass of ObjectDetectionEvaluator specifically for PASCAL evaluation. The code also defines the ObjectDetectionEvalMetrics namedtuple which includes average_precisions, mean_ap, precisions, recalls, corlocs, and mean_corloc attributes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":338-374", + "content": " num_groundtruth_classes=self._num_classes,\n matching_iou_threshold=self._matching_iou_threshold,\n use_weighted_mean_ap=self._use_weighted_mean_ap,\n label_id_offset=self._label_id_offset,\n )\n self._image_ids.clear()\nclass PascalDetectionEvaluator(ObjectDetectionEvaluator):\n \"\"\"A class to evaluate detections using PASCAL metrics.\"\"\"\n def __init__(self, categories, matching_iou_threshold=0.5):\n super(PascalDetectionEvaluator, self).__init__(\n categories,\n matching_iou_threshold=matching_iou_threshold,\n evaluate_corlocs=False,\n use_weighted_mean_ap=False,\n )\nObjectDetectionEvalMetrics = collections.namedtuple(\n 'ObjectDetectionEvalMetrics',\n [\n 'average_precisions',\n 'mean_ap',\n 'precisions',\n 'recalls',\n 'corlocs',\n 'mean_corloc',\n ],\n)\nclass ObjectDetectionEvaluation:\n \"\"\"Internal implementation of Pascal object detection metrics.\"\"\"\n def __init__(" + }, + { + "comment": "This function initializes the necessary attributes for object detection evaluation. It requires 'self', number of ground truth classes, matching and nms iou thresholds, maximum output boxes, whether to use weighted mean AP, label offset, and sets up dictionaries to store ground truth information. 
It also initializes counters for the number of instances and images per class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":375-401", + "content": " self,\n num_groundtruth_classes,\n matching_iou_threshold=0.5,\n nms_iou_threshold=1.0,\n nms_max_output_boxes=10000,\n use_weighted_mean_ap=False,\n label_id_offset=0,\n ):\n if num_groundtruth_classes < 1:\n raise ValueError(\n 'Need at least 1 groundtruth class for evaluation.')\n self.per_image_eval = per_image_evaluation.PerImageEvaluation(\n num_groundtruth_classes=num_groundtruth_classes,\n matching_iou_threshold=matching_iou_threshold,\n )\n self.num_class = num_groundtruth_classes\n self.use_weighted_mean_ap = use_weighted_mean_ap\n self.label_id_offset = label_id_offset\n self.groundtruth_boxes = {}\n self.groundtruth_class_labels = {}\n self.groundtruth_masks = {}\n self.groundtruth_is_difficult_list = {}\n self.groundtruth_is_group_of_list = {}\n self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int)\n self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int)" + }, + { + "comment": "This code initializes the detection variables and provides functions to clear detections, add single ground truth image info, and perform evaluation. The average precision per class is initialized with nan values, and these functions manage the data for object detection evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":403-429", + "content": " self._initialize_detections()\n def _initialize_detections(self):\n self.detection_keys = set()\n self.scores_per_class = [[] for _ in range(self.num_class)]\n self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)]\n self.num_images_correctly_detected_per_class = np.zeros(self.num_class)\n self.average_precision_per_class = np.empty(\n self.num_class, dtype=float)\n self.average_precision_per_class.fill(np.nan)\n self.precisions_per_class = []\n self.recalls_per_class = []\n self.corloc_per_class = np.ones(self.num_class, dtype=float)\n def clear_detections(self):\n self._initialize_detections()\n def add_single_ground_truth_image_info(\n self,\n image_key,\n groundtruth_boxes,\n groundtruth_class_labels,\n groundtruth_is_difficult_list=None,\n groundtruth_is_group_of_list=None,\n groundtruth_masks=None,\n ):\n \"\"\"Adds groundtruth for a single image to be used for evaluation." + }, + { + "comment": "The function takes in an image identifier, ground truth boxes coordinates, class labels for the boxes, a boolean array denoting difficult instances, and another boolean array for group-of boxes. It calculates average precision and recall for object detection using these inputs. 
The function also supports cases where no boxes are difficult or groups-of.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":431-446", + "content": " Args:\n image_key: A unique string/integer identifier for the image.\n groundtruth_boxes: float32 numpy array of shape [num_boxes, 4]\n containing `num_boxes` groundtruth boxes of the format\n [ymin, xmin, ymax, xmax] in absolute image coordinates.\n groundtruth_class_labels: integer numpy array of shape [num_boxes]\n containing 0-indexed groundtruth classes for the boxes.\n groundtruth_is_difficult_list: A length M numpy boolean array\n denoting whether a ground truth box is a difficult instance or\n not. To support the case that no boxes are difficult, it is by\n default set as None.\n groundtruth_is_group_of_list: A length M numpy boolean array\n denoting whether a ground truth box is a group-of box or not.\n To support the case that no boxes are groups-of, it is by\n default set as None.\n groundtruth_masks: uint8 numpy array of shape" + }, + { + "comment": "This function adds the ground truth boxes, class labels, and masks to the database for a given image key. If the groundtruth_is_difficult_list or groundtruth_is_group_of_list are None, it creates them with default values. It stores these lists as well in the database for the specified image key.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":447-466", + "content": " [num_boxes, height, width] containing `num_boxes` groundtruth\n masks. The mask values range from 0 to 1.\n \"\"\"\n if image_key in self.groundtruth_boxes:\n logging.warn(('image %s has already been added to the ground '\n 'truth database.'), image_key)\n return\n self.groundtruth_boxes[image_key] = groundtruth_boxes\n self.groundtruth_class_labels[image_key] = groundtruth_class_labels\n self.groundtruth_masks[image_key] = groundtruth_masks\n if groundtruth_is_difficult_list is None:\n num_boxes = groundtruth_boxes.shape[0]\n groundtruth_is_difficult_list = np.zeros(num_boxes, dtype=bool)\n self.groundtruth_is_difficult_list[\n image_key] = groundtruth_is_difficult_list.astype(dtype=bool)\n if groundtruth_is_group_of_list is None:\n num_boxes = groundtruth_boxes.shape[0]\n groundtruth_is_group_of_list = np.zeros(num_boxes, dtype=bool)\n self.groundtruth_is_group_of_list[" + }, + { + "comment": "This function adds detections for a single image to be used for evaluation. It requires an image key, detected boxes, detected scores, and detected class labels as input. The detected boxes should be in the format [ymin, xmin, ymax, xmax] and the detected scores and detected class labels should be numpy arrays of the specified shapes. The function calls a _update_ground_truth_statistics method with groundtruth class labels, difficult list, and group of list as input. 
This method updates the ground truth statistics for evaluation.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":467-492", + "content": " image_key] = groundtruth_is_group_of_list.astype(dtype=bool)\n self._update_ground_truth_statistics(\n groundtruth_class_labels,\n groundtruth_is_difficult_list.astype(dtype=bool),\n groundtruth_is_group_of_list.astype(dtype=bool),\n )\n def add_single_detected_image_info(\n self,\n image_key,\n detected_boxes,\n detected_scores,\n detected_class_labels,\n detected_masks=None,\n ):\n \"\"\"Adds detections for a single image to be used for evaluation.\n Args:\n image_key: A unique string/integer identifier for the image.\n detected_boxes: float32 numpy array of shape [num_boxes, 4]\n containing `num_boxes` detection boxes of the format\n [ymin, xmin, ymax, xmax] in absolute image coordinates.\n detected_scores: float32 numpy array of shape [num_boxes]\n containing detection scores for the boxes.\n detected_class_labels: integer numpy array of shape [num_boxes]" + }, + { + "comment": "This function creates a numpy array of detection masks based on detected boxes, scores and class labels. It raises a ValueError if the lengths of these lists are not equal. If an image key already exists in the detection keys list, it logs a warning message and returns without adding the image to the database.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":493-515", + "content": " containing 0-indexed detection classes for the boxes.\n detected_masks: np.uint8 numpy array of shape\n [num_boxes, height, width] containing `num_boxes` detection\n masks with values ranging between 0 and 1.\n Raises:\n ValueError: if the number of boxes, scores and class labels differ\n in length.\n \"\"\"\n if len(detected_boxes) != len(detected_scores) or len(\n detected_boxes) != len(detected_class_labels):\n raise ValueError(\n 'detected_boxes, detected_scores and '\n 'detected_class_labels should all have same lengths. Got'\n '[%d, %d, %d]' % len(detected_boxes),\n len(detected_scores),\n len(detected_class_labels),\n )\n if image_key in self.detection_keys:\n logging.warn(('image %s has already been added to the ground '\n 'truth database.'), image_key)\n return" + }, + { + "comment": "This code is initializing ground truth values for object detection evaluation. If an image key exists in the ground truth boxes dictionary, it retrieves the corresponding ground truth values (boxes, class labels, masks) and removes them from their respective dictionaries to avoid memory overflow. If no image key exists, it initializes empty arrays or None values for the ground truth values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":517-535", + "content": " self.detection_keys.add(image_key)\n if image_key in self.groundtruth_boxes:\n groundtruth_boxes = self.groundtruth_boxes[image_key]\n groundtruth_class_labels = self.groundtruth_class_labels[image_key]\n # Masks are popped instead of look up. 
The reason is that we do not\n # want to keep all masks in memory which can cause memory overflow.\n groundtruth_masks = self.groundtruth_masks.pop(image_key)\n groundtruth_is_difficult_list = self.groundtruth_is_difficult_list[\n image_key]\n groundtruth_is_group_of_list = self.groundtruth_is_group_of_list[\n image_key]\n else:\n groundtruth_boxes = np.empty(shape=[0, 4], dtype=float)\n groundtruth_class_labels = np.array([], dtype=int)\n if detected_masks is None:\n groundtruth_masks = None\n else:\n groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float)\n groundtruth_is_difficult_list = np.array([], dtype=bool)" + }, + { + "comment": "This code is part of the PaddleVideo library and it computes object detection metrics. It takes in detected boxes, scores, class labels, ground truth boxes, class labels, masks etc., and calculates true positive and false positive labels for each image. The computed values are then stored per class in separate lists (scores_per_class and tp_fp_labels_per_class). Additionally, the function updates ground truth statistics by appending new ground truth class labels and difficult list to existing ones.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":536-560", + "content": " groundtruth_is_group_of_list = np.array([], dtype=bool)\n (\n scores,\n tp_fp_labels,\n ) = self.per_image_eval.compute_object_detection_metrics(\n detected_boxes=detected_boxes,\n detected_scores=detected_scores,\n detected_class_labels=detected_class_labels,\n groundtruth_boxes=groundtruth_boxes,\n groundtruth_class_labels=groundtruth_class_labels,\n groundtruth_is_difficult_list=groundtruth_is_difficult_list,\n groundtruth_is_group_of_list=groundtruth_is_group_of_list,\n detected_masks=detected_masks,\n groundtruth_masks=groundtruth_masks,\n )\n for i in range(self.num_class):\n if scores[i].shape[0] > 0:\n self.scores_per_class[i].append(scores[i])\n self.tp_fp_labels_per_class[i].append(tp_fp_labels[i])\n def _update_ground_truth_statistics(\n self,\n groundtruth_class_labels,\n groundtruth_is_difficult_list," + }, + { + "comment": "This function updates ground truth statistics for object detection by counting instances, excluding difficult boxes and treating them as normal ones for CorLoc computations. It iterates through class indices to determine the number of instances for each class label, excluding difficult or group-of boxes.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":561-582", + "content": " groundtruth_is_group_of_list,\n ):\n \"\"\"Update grouth truth statitistics.\n 1. Difficult boxes are ignored when counting the number of ground truth\n instances as done in Pascal VOC devkit.\n 2. 
Difficult boxes are treated as normal boxes when computing CorLoc\n related statitistics.\n Args:\n groundtruth_class_labels: An integer numpy array of length M,\n representing M class labels of object instances in ground truth\n groundtruth_is_difficult_list: A boolean numpy array of length M\n denoting whether a ground truth box is a difficult instance or\n not\n groundtruth_is_group_of_list: A boolean numpy array of length M\n denoting whether a ground truth box is a group-of box or not\n \"\"\"\n for class_index in range(self.num_class):\n num_gt_instances = np.sum(groundtruth_class_labels[\n ~groundtruth_is_difficult_list\n & ~groundtruth_is_group_of_list] == class_index)" + }, + { + "comment": "The code calculates average precision, mean average precision, precisions, recalls, and CorLoc scores for object detection evaluation. It checks if any ground truth instances exist for each class and returns a named tuple with evaluation results. If there are classes with no ground truth examples, it prints a warning message.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":583-604", + "content": " self.num_gt_instances_per_class[class_index] += num_gt_instances\n if np.any(groundtruth_class_labels == class_index):\n self.num_gt_imgs_per_class[class_index] += 1\n def evaluate(self):\n \"\"\"Compute evaluation result.\n Returns:\n A named tuple with the following fields -\n average_precision: float numpy array of average precision for\n each class.\n mean_ap: mean average precision of all classes, float scalar\n precisions: List of precisions, each precision is a float numpy\n array\n recalls: List of recalls, each recall is a float numpy array\n corloc: numpy float array\n mean_corloc: Mean CorLoc score for each class, float scalar\n \"\"\"\n if (self.num_gt_instances_per_class == 0).any():\n print(\n 'The following classes have no ground truth examples: %s',\n np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) +" + }, + { + "comment": "This code is part of a class that performs object detection evaluation using Average Vehicle Accuracy (AVA) metrics. It checks for the number of ground truth instances per class and concatenates scores and true positive/false positive labels per class. If weighted mean average precision (AP) calculation is enabled, it appends the scores and labels to the total arrays. 
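For intuition, the precision/recall and average-precision computation referenced here can be sketched generically as follows (a simplified stand-in, not the module's `metrics.compute_precision_recall` / `compute_average_precision` implementation): detections are ranked by score, cumulative TP/FP counts trace the precision-recall curve, and AP is the area under its monotone envelope.

```python
# Generic sketch of score-ranked precision/recall and VOC-style average precision.
import numpy as np

def precision_recall(scores, tp_labels, num_gt):
    order = np.argsort(-scores)                       # rank detections by confidence
    tp = np.cumsum(tp_labels[order].astype(float))    # cumulative true positives
    fp = np.cumsum((~tp_labels[order]).astype(float)) # cumulative false positives
    recall = tp / max(num_gt, 1)
    precision = tp / np.maximum(tp + fp, np.finfo(float).eps)
    return precision, recall

def average_precision(precision, recall):
    p = np.concatenate(([0.0], precision, [0.0]))
    r = np.concatenate(([0.0], recall, [1.0]))
    for i in range(len(p) - 2, -1, -1):               # monotone precision envelope
        p[i] = max(p[i], p[i + 1])
    idx = np.where(r[1:] != r[:-1])[0]                # points where recall changes
    return float(np.sum((r[idx + 1] - r[idx]) * p[idx + 1]))

prec, rec = precision_recall(np.array([0.9, 0.8, 0.3]),
                             np.array([True, False, True]), num_gt=2)
print(round(average_precision(prec, rec), 4))         # 0.8333
```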
The code uses the compute_precision_recall function from the metrics module to calculate precision and recall values.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":605-628", + "content": " self.label_id_offset, \"self.detection_keys:\",self.detection_keys\n )\n if self.use_weighted_mean_ap:\n all_scores = np.array([], dtype=float)\n all_tp_fp_labels = np.array([], dtype=bool)\n for class_index in range(self.num_class):\n if self.num_gt_instances_per_class[class_index] == 0:\n continue\n if not self.scores_per_class[class_index]:\n scores = np.array([], dtype=float)\n tp_fp_labels = np.array([], dtype=bool)\n else:\n scores = np.concatenate(self.scores_per_class[class_index])\n tp_fp_labels = np.concatenate(\n self.tp_fp_labels_per_class[class_index])\n if self.use_weighted_mean_ap:\n all_scores = np.append(all_scores, scores)\n all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels)\n precision, recall = metrics.compute_precision_recall(\n scores,\n tp_fp_labels," + }, + { + "comment": "This function calculates average precision and correlation localization for object detection evaluation. It stores the precision, recall, average precision per class, and correlation localization per class. If weighted mean AP is enabled, it computes precision, recall, mean AP, and mean correlation localization.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":629-650", + "content": " self.num_gt_instances_per_class[class_index],\n )\n self.precisions_per_class.append(precision)\n self.recalls_per_class.append(recall)\n average_precision = metrics.compute_average_precision(\n precision, recall)\n self.average_precision_per_class[class_index] = average_precision\n self.corloc_per_class = metrics.compute_cor_loc(\n self.num_gt_imgs_per_class,\n self.num_images_correctly_detected_per_class,\n )\n if self.use_weighted_mean_ap:\n num_gt_instances = np.sum(self.num_gt_instances_per_class)\n precision, recall = metrics.compute_precision_recall(\n all_scores, all_tp_fp_labels, num_gt_instances)\n mean_ap = metrics.compute_average_precision(precision, recall)\n else:\n mean_ap = np.nanmean(self.average_precision_per_class)\n mean_corloc = np.nanmean(self.corloc_per_class)\n return ObjectDetectionEvalMetrics(" + }, + { + "comment": "This code snippet appears to be part of a class function that returns several evaluation metrics for object detection. The metrics include average precision per class, mean average precision, precisions and recalls per class, and mean corloc values. These metrics are commonly used in evaluating object detection models' performance.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py\":651-657", + "content": " self.average_precision_per_class,\n mean_ap,\n self.precisions_per_class,\n self.recalls_per_class,\n self.corloc_per_class,\n mean_corloc,\n )" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fdb59e4b-8fde-4c7a-8847-ac96f52a9f6d.json b/docs/doc/fdb59e4b-8fde-4c7a-8847-ac96f52a9f6d.json new file mode 100644 index 000000000..82d3e87bc --- /dev/null +++ b/docs/doc/fdb59e4b-8fde-4c7a-8847-ac96f52a9f6d.json @@ -0,0 +1,25 @@ +{ + "summary": "This code defines an I3D classification head in PaddleVideo with options for loss, pooling type, dropout ratio and initialization standard deviation. 
It performs adaptive average pooling, dropout, linear layer, and has a learning rate of 10.0.", + "details": [ + { + "comment": "This code snippet imports necessary libraries and defines a class called \"I3DHead\" which is a classification head for I3D models. It takes in arguments like the number of classes to be classified, the input channel size, and configuration for building loss. The code is part of PaddleVideo library and registered with HEADS registry.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/i3d_head.py\":0-30", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom ..registry import HEADS\nfrom ..weight_init import weight_init_\nfrom .base import BaseHead\n@HEADS.register()\nclass I3DHead(BaseHead):\n \"\"\"Classification head for I3D.\n Args:\n num_classes (int): Number of classes to be classified.\n in_channels (int): Number of channels in input feature.\n loss_cls (dict): Config for building loss." + }, + { + "comment": "Class constructor for a head, with optional parameters for loss configuration, pooling type in spatial dimension, dropout ratio, and standard deviation for initialization. Initializes the base class, sets attributes, and optionally adds a Dropout layer if drop_ratio is non-zero.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/i3d_head.py\":31-58", + "content": " Default: dict(name='CrossEntropyLoss')\n spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.\n drop_ratio (float): Probability of dropout layer. Default: 0.5.\n std (float): Std value for Initiation. 
Default: 0.01.\n kwargs (dict, optional): Any keyword argument to be used to initialize\n the head.\n \"\"\"\n def __init__(self,\n num_classes,\n in_channels,\n loss_cfg=dict(name='CrossEntropyLoss'),\n spatial_type='avg',\n drop_ratio=0.5,\n std=0.01,\n **kwargs):\n super().__init__(num_classes, in_channels, loss_cfg, **kwargs)\n self.spatial_type = spatial_type\n self.drop_ratio = drop_ratio\n self.stdv = std\n if self.drop_ratio != 0:\n self.dropout = nn.Dropout(p=self.drop_ratio)\n else:\n self.dropout = None\n self.fc = nn.Linear(\n self.in_channels,\n self.num_classes," + }, + { + "comment": "Function: I3D Head\nPurpose: To process and classify the extracted features from an I3D network\nKey Operations: Adaptive average pooling, dropout, and a linear layer for classification\nLearning Rate: Set to 10.0 for weights and bias parameters", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/i3d_head.py\":59-90", + "content": " weight_attr=ParamAttr(learning_rate=10.0),\n bias_attr=ParamAttr(learning_rate=10.0),\n )\n if self.spatial_type == 'avg':\n # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels.\n self.avg_pool = nn.AdaptiveAvgPool3D((1, 1, 1))\n else:\n self.avg_pool = None\n def init_weights(self):\n \"\"\"Initiate the parameters from scratch.\"\"\"\n weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)\n def forward(self, x):\n \"\"\"Defines the computation performed at every call.\n Args:\n x (torch.Tensor): The input data.\n Returns:\n torch.Tensor: The classification scores for input samples.\n \"\"\"\n # [N, in_channels, 4, 7, 7]\n if self.avg_pool is not None:\n x = self.avg_pool(x)\n # [N, in_channels, 1, 1, 1]\n if self.dropout is not None:\n x = self.dropout(x)\n # [N, in_channels, 1, 1, 1]\n N = paddle.shape(x)[0]\n x = x.reshape([N, -1])" + }, + { + "comment": "This code snippet represents the output layer of a classification head in PaddleVideo. It takes input 'x' and passes it through 'self.fc', which is presumably a fully connected (FC) layer, producing 'cls_score'. The result is then returned as the final classification score for each sample. The shape of the output is [N, num_classes], where N is the batch size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/heads/i3d_head.py\":91-94", + "content": " # [N, in_channels]\n cls_score = self.fc(x)\n # [N, num_classes]\n return cls_score" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fe050e44-807f-4ebe-8ee9-e38632bda6fa.json b/docs/doc/fe050e44-807f-4ebe-8ee9-e38632bda6fa.json new file mode 100644 index 000000000..63ef9871f --- /dev/null +++ b/docs/doc/fe050e44-807f-4ebe-8ee9-e38632bda6fa.json @@ -0,0 +1,10 @@ +{ + "summary": "This code imports base and recognizer2d classes from the same directory and adds them to the __all__ list for access. It also includes a copyright notice, license information, and disclaimer.", + "details": [ + { + "comment": "This code imports base and recognizer2d classes from the same directory and adds them to the __all__ list for access. It also includes a copyright notice, license information, and disclaimer.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py\":0-18", + "content": "\"\"\"\n# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\"\"\"\nfrom .base import BaseRecognizer\nfrom .recognizer2d import Recognizer2D\n__all__ = ['BaseRecognizer', 'Recognizer2D']" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fe14e8ee-bdf8-47b0-be0f-8c97328a5783.json b/docs/doc/fe14e8ee-bdf8-47b0-be0f-8c97328a5783.json new file mode 100644 index 000000000..5d9d63e21 --- /dev/null +++ b/docs/doc/fe14e8ee-bdf8-47b0-be0f-8c97328a5783.json @@ -0,0 +1,25 @@ +{ + "summary": "The code processes video data, extracts action instances, creates frames and stores them as pickle files for a dataset, with potential data splitting for training and validation.", + "details": [ + { + "comment": "This code is processing video data by extracting positive and negative action instances. Positive action instances are frames corresponding to annotated action intervals, while negative action instances are randomly selected frames from non-action intervals. The code reads JSON files containing labels and frame information, then processes each item by appending the start and end times of the action intervals. The length of positive action intervals is used to determine the start time for negative action intervals, with a minimum duration constraint between them.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py\":0-37", + "content": "\"\"\"\nget instance for tsn\npositive: \u6807\u6ce8\u540e\u7684\u52a8\u4f5c\u533a\u95f4\uff0c\u4e00\u4e2a\u533a\u95f4\u6240\u6709frames\u751f\u6210\u4e00\u4e2apkl\nnegative: \u6807\u6ce8\u540e\u7684\u975e\u52a8\u4f5c\u533a\u95f4\uff0c\u968f\u673a\u53d6N\u4e2a\u533a\u95f4\u751f\u6210N\u4e2apkl\uff0c\u6bcf\u4e2a\u533a\u95f4\u957f\u5ea6\u7b49\u4e8e\u6700\u8fd1\u7684\u524d\u4e00\u4e2a\u52a8\u4f5c\u533a\u95f4\u7684\u957f\u5ea6\n\"\"\"\nimport os\nimport json\nimport numpy as np\nimport random\nimport pickle\nfrom concurrent import futures\ndataset = \"../EuroCup2016\"\nframes_dir = dataset + '/frames'\nlabel_files = {'train': 'label_cls8_train.json', 'val': 'label_cls8_val.json'}\ndef process(item, fps, save_folder):\n actions_pos = []\n actions_neg = []\n url = item['url']\n print(url)\n basename = os.path.basename(url).split('.')[0]\n actions = item['actions']\n # pos\n for action in actions:\n actions_pos.append({\n 'label': action['label_ids'],\n 'start': action['start_id'] * fps,\n 'end': action['end_id'] * fps\n })\n # neg\n for idx, pos in enumerate(actions_pos):\n if idx == len(actions_pos) - 1:\n break\n len_pos = pos['end'] - pos['start']\n duration_start = [pos['end'], actions_pos[idx + 1]['start'] - len_pos]\n if duration_start[1] - duration_start[0] < 3:" + }, + { + "comment": "Code is iterating over frames and creating positive (label=1) and negative (label=0) action instances. It randomly sets the start frame, calculates end frame, appends to 'actions_pos' or 'actions_neg'. Then concatenates both lists, loops through items in the list, extracts start/end frames, label, and iterates over frames range to read images and store them in 'frames'. 
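The negative-sampling rule described in this comment can be restated compactly: each negative clip copies the length of the preceding action and is drawn at random from the gap between that action's end and the next action's start, skipping gaps shorter than a small threshold. A hedged restatement (hypothetical helper name, same constants as the quoted script):

```python
# Illustrative restatement of the negative-interval sampling used above.
import random

def sample_negatives(prev_action, next_action, num_samples=2, min_gap=3):
    length = prev_action["end"] - prev_action["start"]
    lo, hi = prev_action["end"], next_action["start"] - length
    if hi - lo < min_gap:            # gap too small to fit a negative clip
        return []
    return [{"label": [0], "start": s, "end": s + length}
            for s in (random.randint(lo, hi) for _ in range(num_samples))]

print(sample_negatives({"start": 100, "end": 150}, {"start": 400, "end": 430}))
```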
It forms an output file name using base name, start and end frame numbers, and labels, then writes frames to a .pkl file.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py\":38-64", + "content": " continue\n for k in range(1, 3):\n start_frame = random.randint(duration_start[0], duration_start[1])\n end_frame = start_frame + len_pos\n actions_neg.append({\n 'label': [0],\n 'start': start_frame,\n 'end': end_frame\n })\n # save pkl\n for item in np.concatenate((actions_pos, actions_neg), axis=0):\n start = item['start']\n end = item['end']\n label = item['label']\n label_str = str(label[0])\n if len(item['label']) == 2:\n label_str = label_str + '-' + str(label[1])\n frames = []\n for ii in range(start, end + 1):\n img = os.path.join(frames_dir, basename, '%08d.jpg' % ii)\n with open(img, 'rb') as f:\n data = f.read()\n frames.append(data)\n # print(label_str)\n outname = '%s/%s_%08d_%08d_%s.pkl' % (save_folder, basename, start, end,\n label_str)\n with open(outname, 'wb') as f:" + }, + { + "comment": "The code is creating instances for a dataset, processing data using multiprocessing, and saving them as pickle files. It also generates train and val lists of pickle files for further usage.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py\":65-95", + "content": " pickle.dump((basename, label, frames), f, -1)\ndef gen_instance_pkl(label_data, save_folder):\n fps = label_data['fps']\n gts = label_data['gts']\n with futures.ProcessPoolExecutor(max_workers=10) as executer:\n fs = [executer.submit(process, gt, fps, save_folder) for gt in gts]\n #for gt in gts:\n # process(gt, fps, save_folder)\nif __name__ == \"__main__\":\n for item, value in label_files.items():\n save_folder = os.path.join(dataset, 'input_for_pptsm', item)\n if not os.path.exists(save_folder):\n os.makedirs(save_folder)\n label_file = os.path.join(dataset, value)\n label_data = json.load(open(label_file, 'rb'))\n gen_instance_pkl(label_data, save_folder)\n # gen train val list\n #data_dir = '../EuroCup2016/input_for_pptsm/'\n data_dir = os.path.abspath(os.path.join(dataset, 'input_for_pptsm'))\n os.system('find ' + data_dir + '/train -name \"*.pkl\" > ' + data_dir +\n '/train.list')\n os.system('find ' + data_dir + '/val -name \"*.pkl\" > ' + data_dir +" + }, + { + "comment": "This line of code is likely specifying a file path for a validation list ('val.list') which could be used in the context of data splitting or model evaluation on a separate dataset subset.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py\":96-96", + "content": " '/val.list')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fe7a66ca-368e-4787-ac41-61bb6031d076.json b/docs/doc/fe7a66ca-368e-4787-ac41-61bb6031d076.json new file mode 100644 index 000000000..ab3f4565c --- /dev/null +++ b/docs/doc/fe7a66ca-368e-4787-ac41-61bb6031d076.json @@ -0,0 +1,10 @@ +{ + "summary": "The code defines a class called AverageMeter which computes and stores the average and current value. It has four attributes: val, avg, sum, and count. 
The reset function resets all values to 0, while the update function takes a value (val) and updates the running average based on the number of samples (n).", + "details": [ + { + "comment": "The code defines a class called AverageMeter which computes and stores the average and current value. It has four attributes: val, avg, sum, and count. The reset function resets all values to 0, while the update function takes a value (val) and updates the running average based on the number of samples (n).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/Ma-Net/utils/meters.py\":0-21", + "content": "from __future__ import absolute_import\nclass AverageMeter(object):\n \"\"\"Computes and stores the average and current value\"\"\"\n def __init__(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def reset(self):\n self.val = 0\n self.avg = 0\n self.sum = 0\n self.count = 0\n def update(self, val, n=1):\n self.val = val\n self.sum += val * n\n self.count += n\n self.avg = self.sum / self.count" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ff13c874-3963-4a5b-8ad0-9c4109589c9f.json b/docs/doc/ff13c874-3963-4a5b-8ad0-9c4109589c9f.json new file mode 100644 index 000000000..9eb29125d --- /dev/null +++ b/docs/doc/ff13c874-3963-4a5b-8ad0-9c4109589c9f.json @@ -0,0 +1,20 @@ +{ + "summary": "The code creates a PaddleVideo BaseEstimator class, inheriting from nn.Layer and utilizing builder for backbone construction. It initializes weights, registers the class, and sets forward modes for validation, testing, and inference, with abstract methods that must be implemented by subclasses.", + "details": [ + { + "comment": "This code is defining a base class for an estimator in PaddleVideo. It inherits from nn.Layer, uses builder to construct the backbone if specified, and initializes the weights of the backbone if it has an init_weights method. The ESTIMATORS registry is used to register this BaseEstimator class.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/base.py\":0-33", + "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nfrom abc import abstractmethod\nimport paddle\nimport paddle.nn as nn\nfrom paddlevideo.modeling.registry import ESTIMATORS\nfrom paddlevideo.utils import get_logger\nfrom ... import builder\nlogger = get_logger(\"paddlevideo\")\n@ESTIMATORS.register()\nclass BaseEstimator(nn.Layer):\n \"\"\"BaseEstimator\n \"\"\"\n def __init__(self, backbone=None, head=None):\n super().__init__()\n if backbone is not None:\n self.backbone = builder.build_backbone(backbone)\n if hasattr(self.backbone, 'init_weights'):" + }, + { + "comment": "The code initializes the backbone and head components of a model depending on their availability. It then defines four forward modes (train, valid, test, infer) to execute the model accordingly. 
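For orientation, a schematic subclass is sketched below; the class name and loss wiring are purely illustrative (the repository's real estimators implement these hooks differently), and it assumes the `BaseEstimator`/`ESTIMATORS` interfaces shown in this file:

```python
import paddle
from paddlevideo.modeling.registry import ESTIMATORS
from paddlevideo.modeling.framework.estimators.base import BaseEstimator


@ESTIMATORS.register()
class ToyEstimator(BaseEstimator):
    """Hypothetical estimator showing which abstract hooks a subclass must fill in."""

    def train_step(self, data_batch):
        imgs, labels = data_batch
        preds = self.head(self.backbone(imgs))
        return {'loss': paddle.nn.functional.cross_entropy(preds, labels)}

    def val_step(self, data_batch):
        # Reuse the training computation for validation in this sketch.
        return self.train_step(data_batch)

    def test_step(self, data_batch):
        imgs, _ = data_batch
        return self.head(self.backbone(imgs))

    def infer_step(self, data_batch):
        return self.head(self.backbone(data_batch[0]))
```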
The train_step abstract method must be implemented separately.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/base.py\":34-65", + "content": " self.backbone.init_weights()\n else:\n self.backbone = None\n if head is not None:\n self.head_name = head.name\n self.head = builder.build_head(head)\n if hasattr(self.head, 'init_weights'):\n self.head.init_weights()\n else:\n self.head = None\n def forward(self, data_batch, mode='infer'):\n \"\"\"\n 1. Define how the model is going to run, from input to output.\n 2. Console of train, valid, test or infer step\n \"\"\"\n if mode == 'train':\n return self.train_step(data_batch)\n elif mode == 'valid':\n return self.val_step(data_batch)\n elif mode == 'test':\n return self.test_step(data_batch)\n elif mode == 'infer':\n return self.infer_step(data_batch)\n else:\n raise NotImplementedError\n @abstractmethod\n def train_step(self, data_batch):\n \"\"\"Define how the model is going to train, from input to output.\n \"\"\"" + }, + { + "comment": "This code defines abstract methods for model validation, testing, and inference steps. It raises a NotImplementedError to ensure subclasses must implement these methods.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/modeling/framework/estimators/base.py\":66-81", + "content": " raise NotImplementedError\n @abstractmethod\n def val_step(self, data_batch):\n \"\"\"Define how the model is going to valid, from input to output.\"\"\"\n raise NotImplementedError\n @abstractmethod\n def test_step(self, data_batch):\n \"\"\"Define how the model is going to test, from input to output.\"\"\"\n raise NotImplementedError\n @abstractmethod\n def infer_step(self, data_batch):\n \"\"\"Define how the model is going to infer, from input to output.\"\"\"\n raise NotImplementedError" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ff189f5d-9b37-4d8c-8994-fadb9831d5b6.json b/docs/doc/ff189f5d-9b37-4d8c-8994-fadb9831d5b6.json new file mode 100644 index 000000000..f4e4b1ad4 --- /dev/null +++ b/docs/doc/ff189f5d-9b37-4d8c-8994-fadb9831d5b6.json @@ -0,0 +1,25 @@ +{ + "summary": "The code initializes a pre-trained PaddleVideo model for PP-Care using TSM and ResNet50 weights, executes the application, and provides accuracy metrics while referencing relevant research papers on video understanding.", + "details": [ + { + "comment": "Introduction to video models for 3DMRI, data preparation, model training, testing, inference details, and references. Install SimpleITK dependency. Uses PaddleVideo models for 3DMRI classification. Dataset includes PD and Con cases; train/test split is 300:78. Format as *.nii or *.nii.gz. 
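Because the README only fixes the on-disk layout, a minimal loader sketch may help make the formats concrete. It assumes the `path label` text format described there and uses the SimpleITK dependency the README asks to install; the list-file name is hypothetical:

```python
import SimpleITK as sitk


def read_info_file(list_path):
    """Parse the `path label` lines of the dataset info file."""
    samples = []
    with open(list_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            path, label = line.rsplit(' ', 1)
            samples.append((path, int(label)))
    return samples


def load_mri_volume(nii_path):
    """Load a *.nii / *.nii.gz volume into a (depth, height, width) numpy array."""
    image = sitk.ReadImage(nii_path)
    return sitk.GetArrayFromImage(image)


if __name__ == '__main__':
    for path, label in read_info_file('train_list.txt'):  # hypothetical file name
        volume = load_mri_volume(path)
        print(path, label, volume.shape)
```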
Downloaded from a Baidu link.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PP-Care/Readme.md\":0-54", + "content": "# Video models for 3DMRI\n## \u5185\u5bb9\n- [\u6a21\u578b\u7b80\u4ecb](#\u6a21\u578b\u7b80\u4ecb)\n- [\u6570\u636e\u51c6\u5907](#\u6570\u636e\u51c6\u5907)\n- [\u6a21\u578b\u8bad\u7ec3](#\u6a21\u578b\u8bad\u7ec3)\n- [\u6a21\u578b\u6d4b\u8bd5](#\u6a21\u578b\u6d4b\u8bd5)\n- [\u6a21\u578b\u63a8\u7406](#\u6a21\u578b\u63a8\u7406)\n- [\u5b9e\u73b0\u7ec6\u8282](#\u5b9e\u73b0\u7ec6\u8282)\n- [\u53c2\u8003\u8bba\u6587](#\u53c2\u8003\u8bba\u6587)\n\u5728\u5f00\u59cb\u4f7f\u7528\u4e4b\u524d\uff0c\u60a8\u9700\u8981\u6309\u7167\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5\u989d\u5916\u7684\u4f9d\u8d56\u5305\uff1a\n```bash\npython -m pip install SimpleITK\n```\n## \u6a21\u578b\u7b80\u4ecb\n\u76ee\u524d\u5bf9\u4e8e\u533b\u5b663D\u6570\u636e\u5982MRI\uff0c\u5e76\u65e0\u592a\u597d\u7684\u5904\u7406\u624b\u6bb5\uff0c\u5927\u591a\u65702D\u6a21\u578b\u65e0\u6cd5\u83b7\u5f973D\u7a7a\u95f4\u5c42\u9762\u7684\u7279\u5f81\uff0c\u800c\u5e38\u7528\u76843D\u6a21\u578b\u53c8\u9700\u8981\u8f83\u5927\u7684\u8ba1\u7b97\u6210\u672c\u3002\u800c\u540c\u65f6\uff0c3D\u533b\u5b66\u6570\u636e\u4e0e\u5e38\u89c1\u7684\u89c6\u9891\u6570\u636e\u6709\u4e00\u5b9a\u76f8\u4f3c\u4e4b\u5904\uff0c\u6211\u4eec\u5c1d\u8bd5\u4e86\u901a\u8fc7PaddleVideo\u4e2d\u7684\u5e38\u89c1\u6a21\u578b\u89e3\u51b3\u533b\u5b663DMRI\u6570\u636e\u7684\u5206\u7c7b\u95ee\u9898\uff0c\u83b7\u5f97\u4e86\u8f83\u597d\u7684\u7ed3\u679c\u3002\u76ee\u524d\u652f\u6301PP-TSN\u3001PP-TSM\u3001Slowfast\u548cTimesformer\u5bf93DMRI\u7684\u76f4\u63a5\u8bad\u7ec3\u3002\n## \u6570\u636e\u51c6\u5907\n\u6570\u636e\u96c6\u5305\u62ec\u5e15\u91d1\u68ee\u60a3\u8005(PD)\u4e0e\u6b63\u5e38(Con)\u4e24\u79cd\u7c7b\u578b\u5171378\u4e2acase\uff0c\u8bad\u7ec3\u96c6\uff1a\u6d4b\u8bd5\u96c6=300\uff1a78\uff0c\u4f7f\u7528\u6570\u636e\u5747\u4e3a\u516c\u5f00\u6570\u636e\u96c6\uff0c\u5305\u62ec*neurocon*, *taowu*, *PPMI*\u548c*OASIS-1*\uff08\u7ecf\u8fc7\u9009\u53d6\uff09\uff0c\u5e76\u7ecf\u8fc7\u4e00\u5b9a\u683c\u5f0f\u8f6c\u6362\uff0c\u6570\u636e\u6700\u540e\u7684\u683c\u5f0f\u5747\u4e3a*name.nii*\u6216*name.nii.gz*\uff0c\u8def\u5f84\u4e0elabel\u4fe1\u606f\u901a\u8fc7txt\u6587\u4ef6\u4fdd\u5b58\uff0c\u6570\u636e\u96c6\u53ef\u4ee5\u901a\u8fc7\u767e\u5ea6\u7f51\u76d8\u4e0b\u8f7d\uff1a[\u4e0b\u8f7d\u94fe\u63a5](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug)\n- \u6570\u636e\u96c6label\u683c\u5f0f\n```\n{\n \"0\": \"Con\",\n \"1\": \"PD\"\n}\n```\n- \u6570\u636e\u96c6\u4fe1\u606f\u6587\u4ef6\u683c\u5f0f\n```\n{\n path1 label1\n path2 label2\n ...\n}\n```\n- \u6570\u636e\u4fdd\u5b58\u683c\u5f0f\n```\n{\n |-- datasets\n |-- neurocon\n |-- taowu\n |-- PPMI\n |-- OASIS-1\n}\n```\n## \u6a21\u578b\u8bad\u7ec3\n#### \u4e0b\u8f7d\u5e76\u6dfb\u52a0\u9884\u8bad\u7ec3\u6a21\u578b\n1. \u5bf9\u4e8ePP-TSN\u4e0ePP-TSM\uff0c\u9664\u4e86\u53ef\u4ee5\u4f7f\u7528ImageNet1000\u4e0a\u8bad\u7ec3\u597d\u7684\u9884\u8bad\u7ec3\u6a21\u578b\uff08\u89c1[PP-TSN\u9884\u8bad\u7ec3\u6a21\u578b](../../../docs/zh-CN/model_zoo/recognition/pp-tsn.md)\u4e0e[PP-" + }, + { + "comment": "This code provides instructions on how to initialize a pre-trained model for PaddleVideo's PP-Care application. It mentions downloading a pre-trained TSM model, initializing the backbone with ResNet50 weights trained on MRI data, and filling in the weight path in the YAML configuration file. 
The code also explains how to train and test the model using specific commands for PP-TSN_MRI as an example.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PP-Care/Readme.md\":54-80", + "content": "TSM\u9884\u8bad\u7ec3\u6a21\u578b](../../../docs/zh-CN/model_zoo/recognition/pp-tsm.md))\uff0c\u4e5f\u53ef\u4ee5\u4f7f\u7528\u5728MRI\u6570\u636e\u96c6\u4e0a\u9884\u8bad\u7ec3\u7684ResNet50\u6743\u91cd\u5ea7\u4f4dBackbone\u521d\u59cb\u5316\u53c2\u6570\uff0c\u901a\u8fc7\u767e\u5ea6\u7f51\u76d8\u4e0b\u8f7d: [\u4e0b\u8f7d\u94fe\u63a5](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug)\u3002\u5bf9\u4e8eSlowfast\u4e0eTimeSformer\uff0c\u76ee\u524d\u53ea\u652f\u6301\u662f\u4f7f\u7528\u81ea\u7136\u6570\u636e\u96c6\u7684\u9884\u8bad\u7ec3\u6a21\u578b\uff0c\u89c1[Slowfast\u9884\u8bad\u7ec3\u6a21\u578b](../../../docs/zh-CN/model_zoo/recognition/slowfast.md)\u4e0e[Timesformer\u9884\u8bad\u7ec3\u6a21\u578b](../../../docs/zh-CN/model_zoo/recognition/timesformer.md)\n2. \u6253\u5f00`PaddleVideo/applications/PP-Care/configs/XXX.yaml`\uff0c\u5c06\u4e0b\u8f7d\u597d\u7684\u6743\u91cd\u8def\u5f84\u586b\u5199\u5230\u4e0b\u65b9`pretrained:`\u4e4b\u540e\uff0c\u4ee5pptsn_MRI\u4e3a\u4f8b\n ```yaml\n MODEL:\n framework: \"RecognizerMRI\"\n backbone:\n name: \"ResNetTSN_MRI\"\n pretrained: \u5c06\u8def\u5f84\u586b\u5199\u5230\u6b64\u5904\n ```\n#### \u5f00\u59cb\u8bad\u7ec3\n- \u8bad\u7ec3\u4f7f\u7528\u663e\u5361\u6570\u91cf\u4e0e\u8f93\u51fa\u8def\u5f84\u7b49\u4fe1\u606f\u5747\u53ef\u4ee5\u9009\u62e9\uff0c\u4ee5PP-TSN_MRI\u76844\u5361\u8bad\u7ec3\u4e3a\u4f8b\uff0c\u8bad\u7ec3\u542f\u52a8\u547d\u4ee4\u5982\u4e0b\n ```bash\n python3.7 -B -m paddle.distributed.launch --gpus=\"0,1,2,3\" --log_dir=log_pptsn_MRI main.py --validate -c applications/PP-Care/configs/pptsn_MRI.yaml\n ```\n## \u6a21\u578b\u6d4b\u8bd5\n\u7531\u4e8e\u5404\u6a21\u578b\u5747\u5b58\u5728\u968f\u673a\u91c7\u6837\u90e8\u5206\uff0c\u4e14\u91c7\u6837\u65b9\u5f0f\u5b58\u5728\u4e0d\u540c\uff0c\u6240\u4ee5\u8bad\u7ec3\u65e5\u5fd7\u4e2d\u8bb0\u5f55\u7684\u9a8c\u8bc1\u6307\u6807`topk Acc`\u4e0d\u4ee3\u8868\u6700\u7ec8\u7684\u6d4b\u8bd5\u5206\u6570\uff0c\u56e0\u6b64\u5728\u8bad\u7ec3\u5b8c\u6210\u4e4b\u540e\u53ef\u4ee5\u7528\u6d4b\u8bd5\u6a21\u5f0f\u5bf9\u6700\u597d\u7684\u6a21\u578b\u8fdb\u884c\u6d4b\u8bd5\u83b7\u53d6\u6700\u7ec8\u7684\u6307\u6807\uff0c\u4ee5PP-TSN_MRI\u4e3a\u4f8b\uff0c\u547d\u4ee4\u5982\u4e0b\uff1a\n```bash\npython3.7 -B -m paddle.distributed.laun" + }, + { + "comment": "The given code executes a PaddleVideo application, PP-Care, using specific configurations and trained model weights. It tests the ResNet50 backbone with PP-TSN and PP-TSM heads on 3DMRI validation data and reports accuracy metrics. 
The optimized models can be downloaded from a Baidu disk link provided.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PP-Care/Readme.md\":80-105", + "content": "ch --gpus=\"0,1,2,3\" --log_dir=log_pptsn_MRI main.py --test -c applications/PP-Care/configs/pptsn_MRI.yaml -w \"output/ppTSN_MRI/ppTSN_MRI_best.pdparams\"\n```\n\u5f53\u6d4b\u8bd5\u914d\u7f6e\u91c7\u7528.yaml\u4e2d\u53c2\u6570\u65f6\uff0c\u57283DMRI\u6570\u636e\u7684validation\u6570\u636e\u96c6\u4e0a\u7684\u6d4b\u8bd5\u6307\u6807\u5982\u4e0b\uff1a\n| backbone | head | Acc |\n| :----------------: | :----------: | :---: |\n| ResNet50 | PP-TSN | 91.07 |\n| ResNet50 | PP-TSM | 90.83 |\n| 3DResNet50 | Slowfast | 91.07 |\n| Vision Transformer | Timesformer | 88.33 |\n\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u53ef\u4ee5\u901a\u8fc7\u767e\u5ea6\u7f51\u76d8\u4e0b\u8f7d\uff1a[\u4e0b\u8f7d\u94fe\u63a5](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug)\n## \u6a21\u578b\u4f18\u5316\n\u5728\u5b9e\u9645\u4f7f\u7528\u4e2d\uff0c\u53ef\u4ee5\u5c1d\u8bd5\u6a21\u578b\u4f18\u5316\u7b56\u7565\n- \u53ef\u4ee5\u6839\u636eMRI\u6570\u636e\u5206\u5e03\uff0c\u8c03\u6574\u91c7\u6837\u7387\n- \u672c\u6a21\u578b\u76ee\u524d\u672a\u52a0\u5165\u8fc7\u591a\u7684\u6570\u636e\u9884\u5904\u7406\u7b56\u7565\uff0c\u9488\u5bf9\u4e0d\u540c\u6570\u636e\u7279\u6027\uff0c\u5728\u672c\u6a21\u578b\u57fa\u7840\u4e0a\u52a0\u5165\u4e00\u5b9a\u7684\u9884\u5904\u7406\u624b\u6bb5\u53ef\u80fd\u4f1a\u4f7f\u7ed3\u679c\u7ee7\u7eed\u63d0\u5347\n- \u7531\u4e8e\u6570\u636e\u91cf\u4e0e\u4efb\u52a1\u96be\u5ea6\u9650\u5236\uff0c\u672c\u6a21\u578b\u76ee\u524d\u5728\u51c6\u786e\u7387\u4e0a\u7684\u8868\u73b0\u4e0e3DResNet\u5e76\u65e0\u663e\u8457\u533a\u522b\uff0c\u4f46\u5bf9\u4e8e\u65f6\u95f4\u4e0e\u7a7a\u95f4\u7684\u9700\u6c42\u5747\u8fdc\u5c0f\u4e8e3D\u6a21\u578b\n## \u53c2\u8003\u8bba\u6587\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han" + }, + { + "comment": "This code snippet contains the references for various important research papers related to video understanding using neural networks. These papers cover topics like knowledge distillation, slow-fast networks, and efficient training methods for video models.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/applications/PP-Care/Readme.md\":106-109", + "content": "- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean\n- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al.\n- [A Multigrid Method for Efficiently Training Video Models](https://arxiv.org/abs/1912.00998), Chao-Yuan Wu, Ross Girshick, et al.\n- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ffd105f7-d1d3-4e7a-9576-c6cc2185a1ed.json b/docs/doc/ffd105f7-d1d3-4e7a-9576-c6cc2185a1ed.json new file mode 100644 index 000000000..a06fe2057 --- /dev/null +++ b/docs/doc/ffd105f7-d1d3-4e7a-9576-c6cc2185a1ed.json @@ -0,0 +1,25 @@ +{ + "summary": "The code imports libraries, defines a DecodeSampler class for video decoding, and initializes parameters. 
It then decodes video frames, clips the index, retrieves frames, converts them to images using PIL library, and stores the images in a list.", + "details": [ + { + "comment": "This code imports necessary libraries, defines the DecodeSampler class for faster decoding and sampling of video data using 'decord', and registers it with the PIPELINES registry. It is used in the slowfast model and takes arguments such as num_frames, sampling_rate, and target_fps.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler.py\":0-29", + "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport random\nimport numpy as np\nfrom PIL import Image\nimport decord as de\nfrom ..registry import PIPELINES\n@PIPELINES.register()\nclass DecodeSampler(object):\n \"\"\"\n We use 'decord' for decode and sampling, which is faster than opencv.\n This is used in slowfast model.\n Args:\n num_frames(int): the number of frames we want to sample.\n sampling_rate(int): sampling rate for video data.\n target_fps(int): desired fps, default 30" + }, + { + "comment": "This code initializes a class with parameters for sampling video frames, and determines the start and end indices for each clip based on test mode (random or uniform).", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler.py\":30-54", + "content": " test_mode(bool): whether test or train/valid. In slowfast, we use multicrop when test.\n \"\"\"\n def __init__(self,\n num_frames,\n sampling_rate,\n default_sampling_rate=2,\n target_fps=30,\n test_mode=False):\n self.num_frames = num_frames\n self.orig_sampling_rate = self.sampling_rate = sampling_rate\n self.default_sampling_rate = default_sampling_rate\n self.target_fps = target_fps\n self.test_mode = test_mode\n def get_start_end_idx(self, video_size, clip_size, clip_idx,\n temporal_num_clips):\n delta = max(video_size - clip_size, 0)\n if not self.test_mode:\n # Random temporal sampling.\n start_idx = random.uniform(0, delta)\n else:\n # Uniformly sample the clip with the given index.\n start_idx = delta * clip_idx / temporal_num_clips\n end_idx = start_idx + clip_size - 1\n return start_idx, end_idx" + }, + { + "comment": "This function performs mp4 decode operations and returns a list of numpy arrays after decoding. 
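To make the sampling arithmetic concrete, here is a trimmed-down sketch of the same idea using decord directly. It mirrors, but is not identical to, the pipeline code shown here (it uses the centered, non-random start index and assumes a local mp4 path):

```python
import numpy as np
import decord as de
from PIL import Image


def sample_frames(filepath, num_frames=32, sampling_rate=2, target_fps=30):
    """Uniformly sample `num_frames` frames from one clip, as DecodeSampler does."""
    vr = de.VideoReader(filepath)
    video_len = len(vr)
    fps = vr.get_avg_fps()
    # Clip length in source-video frames, rescaled to the target fps.
    clip_size = num_frames * sampling_rate * fps / target_fps
    start = max(video_len - clip_size, 0) / 2.0        # center the clip (non-random variant)
    index = np.linspace(start, start + clip_size - 1, num_frames).astype('int64')
    index = np.clip(index, 0, video_len - 1)
    frames = vr.get_batch(index).asnumpy()             # (num_frames, H, W, 3) uint8
    return [Image.fromarray(f, mode='RGB') for f in frames]


# imgs = sample_frames('data/example.mp4')  # hypothetical path
```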
It considers the short_cycle_idx to adjust the sampling rate, takes the filepath and temporal parameters from results, initializes a VideoReader object, calculates clip size, gets start and end indices for video clipping based on these values, and finally creates an index list for the decoded frames.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler.py\":56-80", + "content": " def __call__(self, results):\n \"\"\"\n Perform mp4 decode operations.\n return:\n List where each item is a numpy array after decoder.\n \"\"\"\n short_cycle_idx = results.get('short_cycle_idx')\n if short_cycle_idx:\n self.sampling_rate = random.randint(self.default_sampling_rate,\n self.orig_sampling_rate)\n filepath = results['filename']\n temporal_sample_index = results['temporal_sample_index']\n temporal_num_clips = results['temporal_num_clips']\n vr = de.VideoReader(filepath)\n videolen = len(vr)\n fps = vr.get_avg_fps()\n clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps\n start_idx, end_idx = self.get_start_end_idx(videolen, clip_size,\n temporal_sample_index,\n temporal_num_clips)\n index = np.linspace(start_idx, end_idx, self.num_frames).astype(\"int64\")" + }, + { + "comment": "This code segment is responsible for decoding and preparing image frames from a video. It clips the index value to ensure it falls within the valid range, retrieves the corresponding batch of frames using get_batch function, converts these frames into an array, and then loops through the array to convert each frame into an image using the PIL library's Image.fromarray method. The resulting images are stored in a list which is then assigned to 'results'['imgs'] before the function returns the results.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/paddlevideo/loader/pipelines/decode_sampler.py\":81-92", + "content": " index = np.clip(index, 0, videolen)\n frames_select = vr.get_batch(index) #1 for buffer\n # dearray_to_img\n np_frames = frames_select.asnumpy()\n frames_select_list = []\n for i in range(np_frames.shape[0]):\n imgbuf = np_frames[i]\n frames_select_list.append(Image.fromarray(imgbuf, mode='RGB'))\n results['imgs'] = frames_select_list\n return results" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fffdaf50-3d31-4f87-bb69-f4d87abb409e.json b/docs/doc/fffdaf50-3d31-4f87-bb69-f4d87abb409e.json new file mode 100644 index 000000000..1b25b3ec9 --- /dev/null +++ b/docs/doc/fffdaf50-3d31-4f87-bb69-f4d87abb409e.json @@ -0,0 +1,30 @@ +{ + "summary": "This code normalizes, scales, and converts images for inference using a ten-crop technique within the PaddleVideo library's implementation of pre-processing operations.", + "details": [ + { + "comment": "This code includes necessary headers for OpenCV and Paddle API libraries, defines the Permute class which runs a permutation operation on input images and outputs data in float format.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/preprocess_op.cpp\":0-35", + "content": "// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n#include \"opencv2/core.hpp\"\n#include \"opencv2/imgcodecs.hpp\"\n#include \"opencv2/imgproc.hpp\"\n#include \"paddle_api.h\"\n#include \"paddle_inference_api.h\"\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \nnamespace PaddleVideo\n{\n void Permute::Run(const cv::Mat *im, float *data)" + }, + { + "comment": "This code block is for preprocessing images before inference. It extracts each channel from the image and performs normalization on them separately, then scales the values and merges them back into a single image. The normalization is done by subtracting the mean and dividing by the scale factor for each channel. If scaling is required, it also converts the image data type to float. Afterwards, it resizes the image if necessary.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/preprocess_op.cpp\":36-65", + "content": " {\n int rh = im->rows;\n int rw = im->cols;\n int rc = im->channels();\n for (int i = 0; i < rc; ++i)\n {\n // Extract the i-th channel of im and write it into the array with (data + i * rh * rw) as the starting address\n cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), rc - 1 - i);\n }\n }\n void Normalize::Run(cv::Mat *im, const std::vector &mean,\n const std::vector &scale, const bool is_scale)\n {\n double e = 1.0;\n if (is_scale)\n {\n e /= 255.0;\n }\n (*im).convertTo(*im, CV_32FC3, e);\n std::vector bgr_channels(3);\n cv::split(*im, bgr_channels);\n for (auto i = 0; i < bgr_channels.size(); i++)\n {\n bgr_channels[i].convertTo(bgr_channels[i], CV_32FC1, 1.0 / scale[i], (0.0 - mean[i]) / scale[i]);\n }\n cv::merge(bgr_channels, *im);\n }\n void Scale::Run(const cv::Mat &img, cv::Mat &resize_img, bool use_tensorrt, const int &short_size)" + }, + { + "comment": "This function performs image resizing and cropping operations based on the input image size and target crop size. If the image size is larger than or equal to the target crop size, it resizes the image to fit within the specified bounds and crops the center of the resized image with dimensions equal to the target crop size. 
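The same scale-then-center-crop logic, expressed in Python with OpenCV for readability; this is an illustrative equivalent of the C++ ops described here, not the deployed inference code:

```python
import cv2


def scale_short_side(img, short_size=256):
    """Resize so the shorter side equals `short_size`, keeping the aspect ratio."""
    h, w = img.shape[:2]
    if min(h, w) == short_size:
        return img
    if w < h:
        ow, oh = short_size, int(h * short_size / w)
    else:
        oh, ow = short_size, int(w * short_size / h)
    return cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR)


def center_crop(img, target_size=224):
    """Crop a target_size x target_size patch from the image center."""
    h, w = img.shape[:2]
    if h < target_size or w < target_size:
        raise ValueError('image width/height must be >= crop size')
    y1 = (h - target_size) // 2
    x1 = (w - target_size) // 2
    return img[y1:y1 + target_size, x1:x1 + target_size]


# img = cv2.imread('frame.jpg')               # hypothetical input frame
# out = center_crop(scale_short_side(img))
```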
If the image size is smaller than the target crop size, it prints an error message stating that the image width and height should be larger than the crop size.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/preprocess_op.cpp\":66-103", + "content": " {\n int h = img.rows;\n int w = img.cols;\n if ((w <= h && w == short_size) || (h <= w && h == short_size))\n {\n img.copyTo(resize_img);\n }\n else\n {\n int oh, ow;\n if (w < h)\n {\n ow = short_size;\n oh = h * ow / w;\n }\n else\n {\n oh = short_size;\n ow = w * oh / h;\n }\n cv::resize(img, resize_img, cv::Size(ow, oh), 0.0f, 0.0f, cv::INTER_LINEAR);\n }\n }\n void CenterCrop::Run(const cv::Mat &img, cv::Mat &crop_img, bool use_tensorrt, const int &target_size)\n {\n int h = img.rows;\n int w = img.cols;\n int crop_h = target_size;\n int crop_w = target_size;\n if (w < crop_w || h < crop_h)\n {\n printf(\"[Error] image width (%d) and height (%d) should be larger than crop size (%d)\",\n w, h, target_size);\n }\n else\n {\n int x1 = (w - crop_w) / 2;" + }, + { + "comment": "This code applies a ten-crop technique to input image by extracting 5 pairs of horizontally and vertically cropped images from the original one. These cropped images are stored in a vector for further processing.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/preprocess_op.cpp\":104-131", + "content": " int y1 = (h - crop_h) / 2;\n crop_img = img(cv::Rect(x1, y1, crop_w, crop_h));\n }\n }\n void TenCrop::Run(const cv::Mat &img, std::vector &crop_imgs, const int &begin_index, bool use_tensorrt, const int &target_size)\n {\n int h = img.rows;\n int w = img.cols;\n int crop_h = target_size;\n int crop_w = target_size;\n int w_step = (w - crop_w) / 4;\n int h_step = (h - crop_h) / 4;\n pairoffsets[5] =\n {\n {0, 0},\n {4 * w_step, 0},\n {0, 4 * h_step},\n {4 * w_step, 4 * h_step},\n {2 * w_step, 2 * h_step}\n };\n for (int i = 0; i < 5; ++i)\n {\n const int &j = i * 2;\n const int &x1 = offsets[i].first;\n const int &y1 = offsets[i].second;\n crop_imgs[begin_index + j] = img(cv::Rect(x1, y1, crop_w, crop_h)); // cropped\n cv::flip(img(cv::Rect(x1, y1, crop_w, crop_h)), crop_imgs[begin_index + j + 1], 0); // cropped" + }, + { + "comment": "This code is a part of the PaddleVideo library. It appears to be inside a class called PaddleVideo, and it seems that this code block is responsible for implementing some sort of pre-processing operation or function. This operation might involve processing video frames before they are sent through an AI model for inference. The code also includes namespaces, which are used to organize the code into logical groups or modules.", + "location": "\"/media/root/Prima/works/PaddleVideo/docs/src/deploy/cpp_infer/src/preprocess_op.cpp\":132-134", + "content": " }\n }\n} // namespace PaddleVideo" + } + ] +} \ No newline at end of file diff --git a/docs/en/dataset/k400.md b/docs/en/dataset/k400.md deleted file mode 100644 index 539735513..000000000 --- a/docs/en/dataset/k400.md +++ /dev/null @@ -1,78 +0,0 @@ -[简体中文](../../zh-CN/dataset/k400.md) | English - -# Kinetics-400 Preparation - -- [Introduction](#Introduction) -- [Download](#Download) -- [Frames](#Frames) - ---- - - -## Introduction - -Kinetics-400 is a commonly used benchmark dataset in the video field. Please refer to its official website [Kinetics](https://deepmind.com/research/open-source/kinetics) for details. 
You can refer to the official address [ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics), and use the download script provided to download the dataset. - -## Download - -Considering the difficulty of downloading the K400 data set, we provide two download methods: (1) Baidu network disk download (2) Script download - -### Baidu SkyDrive Download - -Netdisk link: https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg -Extraction code: `ppvi` - -### Script download - -- Download the training set link list file [train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list) and the validation set link list file [val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list). - -Write the download script `download.sh` as follows: - -```bash -file=$1 - -while read line -do - wget "$line" -done <$file -``` - -Download training set command: -```bash -bash download.sh train_link.list -``` - -Download verification set command: -```bash -bash download.sh val_link.list -``` - ---- - -|category | Number of data | list file | -| :------: | :----------: | :----: | -|Training set | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)| -|Validation set | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)| - -- After downloading, unzip and add the data path to list file. - -- Due to the failure of some video link, part of original data is missing. This copies need about 135G of storage space. - -> This copies is only used for academic research. If it is helpful to you, welcome to star [our project](https://github.com/PaddlePaddle/PaddleVideo) - - -## Frames -In order to speed up the training process of the network, we first extract frames from the video file (K400 video file is in mp4 format). Compared with the method of network training directly through video files, the method of frames can greatly accelerate the speed of network training。 - -Enter the following command to extract the frames of the K400 video file - -```python -python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 -``` - -After the video file frames are extracted, they will be stored in the specified `./rawframes` path, and the size is about 2T. - -|category | Number of data | list file | -| :------: | :----------: | :----: | -|Training set | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)| -|Validation set | 19761 | [val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)| diff --git a/docs/en/dataset/youtube8m.md b/docs/en/dataset/youtube8m.md deleted file mode 100644 index 77c686042..000000000 --- a/docs/en/dataset/youtube8m.md +++ /dev/null @@ -1,56 +0,0 @@ -English | [简体中文](../../zh-CN/dataset/youtube8m.md) - -# YouTube-8M Data Preparation - -- [Introduction](#Introduction) -- [Download](#Download) -- [Conversion](#Conversion) - - -## Introduction - -YouTube-8M is a large-scale video classification data set, containing more than 8 million video URLs. The tag system covers more than 3800 knowledge graph entities. One video corresponds to multiple tags (3-4 on average) and is labeled by machine. - -**The length of each video is between 120s and 500s -Due to the large amount of video data, the image classification model was used to extract frame-level features in advance, and PCA was used to reduce the dimensionality of the features to obtain multi-frame 1024-dimensional features. 
Similarly, the audio model was used to obtain multi-frame 128-dimensional features. Audio characteristics. ** -> The dataset used here is the updated YouTube-8M data set in 2018 (May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features). - - -## Download -1. Create a new directory for storing features (take the PaddleVideo directory as an example) - ```bash - cd data/yt8m - mkdir frame - cd frame - ``` -2. Download the training and validation set to the frame folder - ```bash - curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python - curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python - ``` - The download process is shown in the figure - ![image](https://user-images.githubusercontent.com/23737287/140709613-1e2d6ec0-a82e-474d-b220-7803065b0153.png) - - After the data download is complete, you will get 3844 training data files and 3844 verification data files (TFRecord format) - -## Conversion -1. Install tensorflow to read tfrecord data - ```bash - python3.7 -m pip install tensorflow-gpu==1.14.0 - ``` -2. Convert the downloaded TFRecord file into a pickle file for PaddlePaddle to use - ```bash - cd .. # From the frame directory back to the yt8m directory - python3.7 tf2pkl.py ./frame ./pkl_frame/ # Convert train*.tfrecord and validate*.tfrecord in the frame folder to pkl format - ``` -3. Generate a single pkl file path set, and split pkl into multiple small pkl files based on this file, and generate the final split pkl file path required - ```bash - ls pkl_frame/train*.pkl> train.list # Write the path of train*.pkl to train.list - ls pkl_frame/validate*.pkl> val.list # Write the path of validate*.pkl into val.list - - python3.7 split_yt8m.py train.list # Split each train*.pkl into multiple train*_split*.pkl - python3.7 split_yt8m.py val.list # Split each validate*.pkl into multiple validate*_split*.pkl - - ls pkl_frame/train*_split*.pkl> train.list # Rewrite the path of train*_split*.pkl into train.list - ls pkl_frame/validate*_split*.pkl> val.list # Rewrite the path of validate*_split*.pkl into val.list - ``` diff --git a/docs/github-markdown.css b/docs/github-markdown.css new file mode 100755 index 000000000..96a4f29e6 --- /dev/null +++ b/docs/github-markdown.css @@ -0,0 +1,1197 @@ +@media (prefers-color-scheme: dark) { + + .markdown-body, + [data-theme="dark"] { + /*dark*/ + color-scheme: dark; + --color-prettylights-syntax-comment: #8b949e; + --color-prettylights-syntax-constant: #79c0ff; + --color-prettylights-syntax-entity: #d2a8ff; + --color-prettylights-syntax-storage-modifier-import: #c9d1d9; + --color-prettylights-syntax-entity-tag: #7ee787; + --color-prettylights-syntax-keyword: #ff7b72; + --color-prettylights-syntax-string: #a5d6ff; + --color-prettylights-syntax-variable: #ffa657; + --color-prettylights-syntax-brackethighlighter-unmatched: #f85149; + --color-prettylights-syntax-invalid-illegal-text: #f0f6fc; + --color-prettylights-syntax-invalid-illegal-bg: #8e1519; + --color-prettylights-syntax-carriage-return-text: #f0f6fc; + --color-prettylights-syntax-carriage-return-bg: #b62324; + --color-prettylights-syntax-string-regexp: #7ee787; + --color-prettylights-syntax-markup-list: #f2cc60; + --color-prettylights-syntax-markup-heading: #1f6feb; + --color-prettylights-syntax-markup-italic: #c9d1d9; + --color-prettylights-syntax-markup-bold: #c9d1d9; + --color-prettylights-syntax-markup-deleted-text: #ffdcd7; + --color-prettylights-syntax-markup-deleted-bg: #67060c; + 
--color-prettylights-syntax-markup-inserted-text: #aff5b4; + --color-prettylights-syntax-markup-inserted-bg: #033a16; + --color-prettylights-syntax-markup-changed-text: #ffdfb6; + --color-prettylights-syntax-markup-changed-bg: #5a1e02; + --color-prettylights-syntax-markup-ignored-text: #c9d1d9; + --color-prettylights-syntax-markup-ignored-bg: #1158c7; + --color-prettylights-syntax-meta-diff-range: #d2a8ff; + --color-prettylights-syntax-brackethighlighter-angle: #8b949e; + --color-prettylights-syntax-sublimelinter-gutter-mark: #484f58; + --color-prettylights-syntax-constant-other-reference-link: #a5d6ff; + --color-fg-default: #e6edf3; + --color-fg-muted: #848d97; + --color-fg-subtle: #6e7681; + --color-canvas-default: #0d1117; + --color-canvas-subtle: #161b22; + --color-border-default: #30363d; + --color-border-muted: #21262d; + --color-neutral-muted: rgba(110, 118, 129, 0.4); + --color-accent-fg: #2f81f7; + --color-accent-emphasis: #1f6feb; + --color-success-fg: #3fb950; + --color-success-emphasis: #238636; + --color-attention-fg: #d29922; + --color-attention-emphasis: #9e6a03; + --color-attention-subtle: rgba(187, 128, 9, 0.15); + --color-danger-fg: #f85149; + --color-danger-emphasis: #da3633; + --color-done-fg: #a371f7; + --color-done-emphasis: #8957e5; + } +} + +@media (prefers-color-scheme: light) { + + .markdown-body, + [data-theme="light"] { + /*light*/ + color-scheme: light; + --color-prettylights-syntax-comment: #57606a; + --color-prettylights-syntax-constant: #0550ae; + --color-prettylights-syntax-entity: #6639ba; + --color-prettylights-syntax-storage-modifier-import: #24292f; + --color-prettylights-syntax-entity-tag: #116329; + --color-prettylights-syntax-keyword: #cf222e; + --color-prettylights-syntax-string: #0a3069; + --color-prettylights-syntax-variable: #953800; + --color-prettylights-syntax-brackethighlighter-unmatched: #82071e; + --color-prettylights-syntax-invalid-illegal-text: #f6f8fa; + --color-prettylights-syntax-invalid-illegal-bg: #82071e; + --color-prettylights-syntax-carriage-return-text: #f6f8fa; + --color-prettylights-syntax-carriage-return-bg: #cf222e; + --color-prettylights-syntax-string-regexp: #116329; + --color-prettylights-syntax-markup-list: #3b2300; + --color-prettylights-syntax-markup-heading: #0550ae; + --color-prettylights-syntax-markup-italic: #24292f; + --color-prettylights-syntax-markup-bold: #24292f; + --color-prettylights-syntax-markup-deleted-text: #82071e; + --color-prettylights-syntax-markup-deleted-bg: #ffebe9; + --color-prettylights-syntax-markup-inserted-text: #116329; + --color-prettylights-syntax-markup-inserted-bg: #dafbe1; + --color-prettylights-syntax-markup-changed-text: #953800; + --color-prettylights-syntax-markup-changed-bg: #ffd8b5; + --color-prettylights-syntax-markup-ignored-text: #eaeef2; + --color-prettylights-syntax-markup-ignored-bg: #0550ae; + --color-prettylights-syntax-meta-diff-range: #8250df; + --color-prettylights-syntax-brackethighlighter-angle: #57606a; + --color-prettylights-syntax-sublimelinter-gutter-mark: #8c959f; + --color-prettylights-syntax-constant-other-reference-link: #0a3069; + --color-fg-default: #1F2328; + --color-fg-muted: #656d76; + --color-fg-subtle: #6e7781; + --color-canvas-default: #ffffff; + --color-canvas-subtle: #f6f8fa; + --color-border-default: #d0d7de; + --color-border-muted: hsla(210, 18%, 87%, 1); + --color-neutral-muted: rgba(175, 184, 193, 0.2); + --color-accent-fg: #0969da; + --color-accent-emphasis: #0969da; + --color-success-fg: #1a7f37; + --color-success-emphasis: #1f883d; + 
--color-attention-fg: #9a6700; + --color-attention-emphasis: #9a6700; + --color-attention-subtle: #fff8c5; + --color-danger-fg: #d1242f; + --color-danger-emphasis: #cf222e; + --color-done-fg: #8250df; + --color-done-emphasis: #8250df; + } +} + +.markdown-body { + -ms-text-size-adjust: 100%; + -webkit-text-size-adjust: 100%; + margin: 0; + color: var(--color-fg-default); + background-color: var(--color-canvas-default); + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji"; + font-size: 16px; + line-height: 1.5; + word-wrap: break-word; +} + +.markdown-body .octicon { + display: inline-block; + fill: currentColor; + vertical-align: text-bottom; +} + +.markdown-body h1:hover .anchor .octicon-link:before, +.markdown-body h2:hover .anchor .octicon-link:before, +.markdown-body h3:hover .anchor .octicon-link:before, +.markdown-body h4:hover .anchor .octicon-link:before, +.markdown-body h5:hover .anchor .octicon-link:before, +.markdown-body h6:hover .anchor .octicon-link:before { + width: 16px; + height: 16px; + content: ' '; + display: inline-block; + background-color: currentColor; + -webkit-mask-image: url("data:image/svg+xml,"); + mask-image: url("data:image/svg+xml,"); +} + +.markdown-body details, +.markdown-body figcaption, +.markdown-body figure { + display: block; +} + +.markdown-body summary { + display: list-item; +} + +.markdown-body [hidden] { + display: none !important; +} + +.markdown-body a { + background-color: transparent; + color: var(--color-accent-fg); + text-decoration: none; +} + +.markdown-body abbr[title] { + border-bottom: none; + -webkit-text-decoration: underline dotted; + text-decoration: underline dotted; +} + +.markdown-body b, +.markdown-body strong { + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body dfn { + font-style: italic; +} + +.markdown-body h1 { + margin: .67em 0; + font-weight: var(--base-text-weight-semibold, 600); + padding-bottom: .3em; + font-size: 2em; + border-bottom: 1px solid var(--color-border-muted); +} + +.markdown-body mark { + background-color: var(--color-attention-subtle); + color: var(--color-fg-default); +} + +.markdown-body small { + font-size: 90%; +} + +.markdown-body sub, +.markdown-body sup { + font-size: 75%; + line-height: 0; + position: relative; + vertical-align: baseline; +} + +.markdown-body sub { + bottom: -0.25em; +} + +.markdown-body sup { + top: -0.5em; +} + +.markdown-body img { + border-style: none; + max-width: 100%; + box-sizing: content-box; + background-color: var(--color-canvas-default); +} + +.markdown-body code, +.markdown-body kbd, +.markdown-body pre, +.markdown-body samp { + font-family: monospace; + font-size: 1em; +} + +.markdown-body figure { + margin: 1em 40px; +} + +.markdown-body hr { + box-sizing: content-box; + overflow: hidden; + background: transparent; + border-bottom: 1px solid var(--color-border-muted); + height: .25em; + padding: 0; + margin: 24px 0; + background-color: var(--color-border-default); + border: 0; +} + +.markdown-body input { + font: inherit; + margin: 0; + overflow: visible; + font-family: inherit; + font-size: inherit; + line-height: inherit; +} + +.markdown-body [type=button], +.markdown-body [type=reset], +.markdown-body [type=submit] { + -webkit-appearance: button; + appearance: button; +} + +.markdown-body [type=checkbox], +.markdown-body [type=radio] { + box-sizing: border-box; + padding: 0; +} + +.markdown-body [type=number]::-webkit-inner-spin-button, 
+.markdown-body [type=number]::-webkit-outer-spin-button { + height: auto; +} + +.markdown-body [type=search]::-webkit-search-cancel-button, +.markdown-body [type=search]::-webkit-search-decoration { + -webkit-appearance: none; + appearance: none; +} + +.markdown-body ::-webkit-input-placeholder { + color: inherit; + opacity: .54; +} + +.markdown-body ::-webkit-file-upload-button { + -webkit-appearance: button; + appearance: button; + font: inherit; +} + +.markdown-body a:hover { + text-decoration: underline; +} + +.markdown-body ::placeholder { + color: var(--color-fg-subtle); + opacity: 1; +} + +.markdown-body hr::before { + display: table; + content: ""; +} + +.markdown-body hr::after { + display: table; + clear: both; + content: ""; +} + +.markdown-body table { + border-spacing: 0; + border-collapse: collapse; + display: block; + width: max-content; + max-width: 100%; + overflow: auto; +} + +.markdown-body td, +.markdown-body th { + padding: 0; +} + +.markdown-body details summary { + cursor: pointer; +} + +.markdown-body details:not([open])>*:not(summary) { + display: none !important; +} + +.markdown-body a:focus, +.markdown-body [role=button]:focus, +.markdown-body input[type=radio]:focus, +.markdown-body input[type=checkbox]:focus { + outline: 2px solid var(--color-accent-fg); + outline-offset: -2px; + box-shadow: none; +} + +.markdown-body a:focus:not(:focus-visible), +.markdown-body [role=button]:focus:not(:focus-visible), +.markdown-body input[type=radio]:focus:not(:focus-visible), +.markdown-body input[type=checkbox]:focus:not(:focus-visible) { + outline: solid 1px transparent; +} + +.markdown-body a:focus-visible, +.markdown-body [role=button]:focus-visible, +.markdown-body input[type=radio]:focus-visible, +.markdown-body input[type=checkbox]:focus-visible { + outline: 2px solid var(--color-accent-fg); + outline-offset: -2px; + box-shadow: none; +} + +.markdown-body a:not([class]):focus, +.markdown-body a:not([class]):focus-visible, +.markdown-body input[type=radio]:focus, +.markdown-body input[type=radio]:focus-visible, +.markdown-body input[type=checkbox]:focus, +.markdown-body input[type=checkbox]:focus-visible { + outline-offset: 0; +} + +.markdown-body kbd { + display: inline-block; + padding: 3px 5px; + font: 11px ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + line-height: 10px; + color: var(--color-fg-default); + vertical-align: middle; + background-color: var(--color-canvas-subtle); + border: solid 1px var(--color-neutral-muted); + border-bottom-color: var(--color-neutral-muted); + border-radius: 6px; + box-shadow: inset 0 -1px 0 var(--color-neutral-muted); +} + +.markdown-body h1, +.markdown-body h2, +.markdown-body h3, +.markdown-body h4, +.markdown-body h5, +.markdown-body h6 { + margin-top: 24px; + margin-bottom: 16px; + font-weight: var(--base-text-weight-semibold, 600); + line-height: 1.25; +} + +.markdown-body h2 { + font-weight: var(--base-text-weight-semibold, 600); + padding-bottom: .3em; + font-size: 1.5em; + border-bottom: 1px solid var(--color-border-muted); +} + +.markdown-body h3 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: 1.25em; +} + +.markdown-body h4 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: 1em; +} + +.markdown-body h5 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: .875em; +} + +.markdown-body h6 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: .85em; + color: var(--color-fg-muted); +} + +.markdown-body p { + 
margin-top: 0; + margin-bottom: 10px; +} + +.markdown-body blockquote { + margin: 0; + padding: 0 1em; + color: var(--color-fg-muted); + border-left: .25em solid var(--color-border-default); +} + +.markdown-body ul, +.markdown-body ol { + margin-top: 0; + margin-bottom: 0; + padding-left: 2em; +} + +.markdown-body ol ol, +.markdown-body ul ol { + list-style-type: lower-roman; +} + +.markdown-body ul ul ol, +.markdown-body ul ol ol, +.markdown-body ol ul ol, +.markdown-body ol ol ol { + list-style-type: lower-alpha; +} + +.markdown-body dd { + margin-left: 0; +} + +.markdown-body tt, +.markdown-body code, +.markdown-body samp { + font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + font-size: 12px; +} + +.markdown-body pre { + margin-top: 0; + margin-bottom: 0; + font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + font-size: 12px; + word-wrap: normal; +} + +.markdown-body .octicon { + display: inline-block; + overflow: visible !important; + vertical-align: text-bottom; + fill: currentColor; +} + +.markdown-body input::-webkit-outer-spin-button, +.markdown-body input::-webkit-inner-spin-button { + margin: 0; + -webkit-appearance: none; + appearance: none; +} + +.markdown-body .mr-2 { + margin-right: var(--base-size-8, 8px) !important; +} + +.markdown-body::before { + display: table; + content: ""; +} + +.markdown-body::after { + display: table; + clear: both; + content: ""; +} + +.markdown-body>*:first-child { + margin-top: 0 !important; +} + +.markdown-body>*:last-child { + margin-bottom: 0 !important; +} + +.markdown-body a:not([href]) { + color: inherit; + text-decoration: none; +} + +.markdown-body .absent { + color: var(--color-danger-fg); +} + +.markdown-body .anchor { + float: left; + padding-right: 4px; + margin-left: -20px; + line-height: 1; +} + +.markdown-body .anchor:focus { + outline: none; +} + +.markdown-body p, +.markdown-body blockquote, +.markdown-body ul, +.markdown-body ol, +.markdown-body dl, +.markdown-body table, +.markdown-body pre, +.markdown-body details { + margin-top: 0; + margin-bottom: 16px; +} + +.markdown-body blockquote>:first-child { + margin-top: 0; +} + +.markdown-body blockquote>:last-child { + margin-bottom: 0; +} + +.markdown-body h1 .octicon-link, +.markdown-body h2 .octicon-link, +.markdown-body h3 .octicon-link, +.markdown-body h4 .octicon-link, +.markdown-body h5 .octicon-link, +.markdown-body h6 .octicon-link { + color: var(--color-fg-default); + vertical-align: middle; + visibility: hidden; +} + +.markdown-body h1:hover .anchor, +.markdown-body h2:hover .anchor, +.markdown-body h3:hover .anchor, +.markdown-body h4:hover .anchor, +.markdown-body h5:hover .anchor, +.markdown-body h6:hover .anchor { + text-decoration: none; +} + +.markdown-body h1:hover .anchor .octicon-link, +.markdown-body h2:hover .anchor .octicon-link, +.markdown-body h3:hover .anchor .octicon-link, +.markdown-body h4:hover .anchor .octicon-link, +.markdown-body h5:hover .anchor .octicon-link, +.markdown-body h6:hover .anchor .octicon-link { + visibility: visible; +} + +.markdown-body h1 tt, +.markdown-body h1 code, +.markdown-body h2 tt, +.markdown-body h2 code, +.markdown-body h3 tt, +.markdown-body h3 code, +.markdown-body h4 tt, +.markdown-body h4 code, +.markdown-body h5 tt, +.markdown-body h5 code, +.markdown-body h6 tt, +.markdown-body h6 code { + padding: 0 .2em; + font-size: inherit; +} + +.markdown-body summary h1, +.markdown-body summary h2, +.markdown-body summary h3, 
+.markdown-body summary h4, +.markdown-body summary h5, +.markdown-body summary h6 { + display: inline-block; +} + +.markdown-body summary h1 .anchor, +.markdown-body summary h2 .anchor, +.markdown-body summary h3 .anchor, +.markdown-body summary h4 .anchor, +.markdown-body summary h5 .anchor, +.markdown-body summary h6 .anchor { + margin-left: -40px; +} + +.markdown-body summary h1, +.markdown-body summary h2 { + padding-bottom: 0; + border-bottom: 0; +} + +.markdown-body ul.no-list, +.markdown-body ol.no-list { + padding: 0; + list-style-type: none; +} + +.markdown-body ol[type="a s"] { + list-style-type: lower-alpha; +} + +.markdown-body ol[type="A s"] { + list-style-type: upper-alpha; +} + +.markdown-body ol[type="i s"] { + list-style-type: lower-roman; +} + +.markdown-body ol[type="I s"] { + list-style-type: upper-roman; +} + +.markdown-body ol[type="1"] { + list-style-type: decimal; +} + +.markdown-body div>ol:not([type]) { + list-style-type: decimal; +} + +.markdown-body ul ul, +.markdown-body ul ol, +.markdown-body ol ol, +.markdown-body ol ul { + margin-top: 0; + margin-bottom: 0; +} + +.markdown-body li>p { + margin-top: 16px; +} + +.markdown-body li+li { + margin-top: .25em; +} + +.markdown-body dl { + padding: 0; +} + +.markdown-body dl dt { + padding: 0; + margin-top: 16px; + font-size: 1em; + font-style: italic; + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body dl dd { + padding: 0 16px; + margin-bottom: 16px; +} + +.markdown-body table th { + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body table th, +.markdown-body table td { + padding: 6px 13px; + border: 1px solid var(--color-border-default); +} + +.markdown-body table td>:last-child { + margin-bottom: 0; +} + +.markdown-body table tr { + background-color: var(--color-canvas-default); + border-top: 1px solid var(--color-border-muted); +} + +.markdown-body table tr:nth-child(2n) { + background-color: var(--color-canvas-subtle); +} + +.markdown-body table img { + background-color: transparent; +} + +.markdown-body img[align=right] { + padding-left: 20px; +} + +.markdown-body img[align=left] { + padding-right: 20px; +} + +.markdown-body .emoji { + max-width: none; + vertical-align: text-top; + background-color: transparent; +} + +.markdown-body span.frame { + display: block; + overflow: hidden; +} + +.markdown-body span.frame>span { + display: block; + float: left; + width: auto; + padding: 7px; + margin: 13px 0 0; + overflow: hidden; + border: 1px solid var(--color-border-default); +} + +.markdown-body span.frame span img { + display: block; + float: left; +} + +.markdown-body span.frame span span { + display: block; + padding: 5px 0 0; + clear: both; + color: var(--color-fg-default); +} + +.markdown-body span.align-center { + display: block; + overflow: hidden; + clear: both; +} + +.markdown-body span.align-center>span { + display: block; + margin: 13px auto 0; + overflow: hidden; + text-align: center; +} + +.markdown-body span.align-center span img { + margin: 0 auto; + text-align: center; +} + +.markdown-body span.align-right { + display: block; + overflow: hidden; + clear: both; +} + +.markdown-body span.align-right>span { + display: block; + margin: 13px 0 0; + overflow: hidden; + text-align: right; +} + +.markdown-body span.align-right span img { + margin: 0; + text-align: right; +} + +.markdown-body span.float-left { + display: block; + float: left; + margin-right: 13px; + overflow: hidden; +} + +.markdown-body span.float-left span { + margin: 13px 0 0; +} + 
+.markdown-body span.float-right { + display: block; + float: right; + margin-left: 13px; + overflow: hidden; +} + +.markdown-body span.float-right>span { + display: block; + margin: 13px auto 0; + overflow: hidden; + text-align: right; +} + +.markdown-body code, +.markdown-body tt { + padding: .2em .4em; + margin: 0; + font-size: 85%; + white-space: break-spaces; + background-color: var(--color-neutral-muted); + border-radius: 6px; +} + +.markdown-body code br, +.markdown-body tt br { + display: none; +} + +.markdown-body del code { + text-decoration: inherit; +} + +.markdown-body samp { + font-size: 85%; +} + +.markdown-body pre code { + font-size: 100%; +} + +.markdown-body pre>code { + padding: 0; + margin: 0; + word-break: normal; + white-space: pre; + background: transparent; + border: 0; +} + +.markdown-body .highlight { + margin-bottom: 16px; +} + +.markdown-body .highlight pre { + margin-bottom: 0; + word-break: normal; +} + +.markdown-body .highlight pre, +.markdown-body pre { + padding: 16px; + overflow: auto; + font-size: 85%; + line-height: 1.45; + color: var(--color-fg-default); + background-color: var(--color-canvas-subtle); + border-radius: 6px; +} + +.markdown-body pre code, +.markdown-body pre tt { + display: inline; + max-width: auto; + padding: 0; + margin: 0; + overflow: visible; + line-height: inherit; + word-wrap: normal; + background-color: transparent; + border: 0; +} + +.markdown-body .csv-data td, +.markdown-body .csv-data th { + padding: 5px; + overflow: hidden; + font-size: 12px; + line-height: 1; + text-align: left; + white-space: nowrap; +} + +.markdown-body .csv-data .blob-num { + padding: 10px 8px 9px; + text-align: right; + background: var(--color-canvas-default); + border: 0; +} + +.markdown-body .csv-data tr { + border-top: 0; +} + +.markdown-body .csv-data th { + font-weight: var(--base-text-weight-semibold, 600); + background: var(--color-canvas-subtle); + border-top: 0; +} + +.markdown-body [data-footnote-ref]::before { + content: "["; +} + +.markdown-body [data-footnote-ref]::after { + content: "]"; +} + +.markdown-body .footnotes { + font-size: 12px; + color: var(--color-fg-muted); + border-top: 1px solid var(--color-border-default); +} + +.markdown-body .footnotes ol { + padding-left: 16px; +} + +.markdown-body .footnotes ol ul { + display: inline-block; + padding-left: 16px; + margin-top: 16px; +} + +.markdown-body .footnotes li { + position: relative; +} + +.markdown-body .footnotes li:target::before { + position: absolute; + top: -8px; + right: -8px; + bottom: -8px; + left: -24px; + pointer-events: none; + content: ""; + border: 2px solid var(--color-accent-emphasis); + border-radius: 6px; +} + +.markdown-body .footnotes li:target { + color: var(--color-fg-default); +} + +.markdown-body .footnotes .data-footnote-backref g-emoji { + font-family: monospace; +} + +.markdown-body .pl-c { + color: var(--color-prettylights-syntax-comment); +} + +.markdown-body .pl-c1, +.markdown-body .pl-s .pl-v { + color: var(--color-prettylights-syntax-constant); +} + +.markdown-body .pl-e, +.markdown-body .pl-en { + color: var(--color-prettylights-syntax-entity); +} + +.markdown-body .pl-smi, +.markdown-body .pl-s .pl-s1 { + color: var(--color-prettylights-syntax-storage-modifier-import); +} + +.markdown-body .pl-ent { + color: var(--color-prettylights-syntax-entity-tag); +} + +.markdown-body .pl-k { + color: var(--color-prettylights-syntax-keyword); +} + +.markdown-body .pl-s, +.markdown-body .pl-pds, +.markdown-body .pl-s .pl-pse .pl-s1, +.markdown-body .pl-sr, 
+.markdown-body .pl-sr .pl-cce, +.markdown-body .pl-sr .pl-sre, +.markdown-body .pl-sr .pl-sra { + color: var(--color-prettylights-syntax-string); +} + +.markdown-body .pl-v, +.markdown-body .pl-smw { + color: var(--color-prettylights-syntax-variable); +} + +.markdown-body .pl-bu { + color: var(--color-prettylights-syntax-brackethighlighter-unmatched); +} + +.markdown-body .pl-ii { + color: var(--color-prettylights-syntax-invalid-illegal-text); + background-color: var(--color-prettylights-syntax-invalid-illegal-bg); +} + +.markdown-body .pl-c2 { + color: var(--color-prettylights-syntax-carriage-return-text); + background-color: var(--color-prettylights-syntax-carriage-return-bg); +} + +.markdown-body .pl-sr .pl-cce { + font-weight: bold; + color: var(--color-prettylights-syntax-string-regexp); +} + +.markdown-body .pl-ml { + color: var(--color-prettylights-syntax-markup-list); +} + +.markdown-body .pl-mh, +.markdown-body .pl-mh .pl-en, +.markdown-body .pl-ms { + font-weight: bold; + color: var(--color-prettylights-syntax-markup-heading); +} + +.markdown-body .pl-mi { + font-style: italic; + color: var(--color-prettylights-syntax-markup-italic); +} + +.markdown-body .pl-mb { + font-weight: bold; + color: var(--color-prettylights-syntax-markup-bold); +} + +.markdown-body .pl-md { + color: var(--color-prettylights-syntax-markup-deleted-text); + background-color: var(--color-prettylights-syntax-markup-deleted-bg); +} + +.markdown-body .pl-mi1 { + color: var(--color-prettylights-syntax-markup-inserted-text); + background-color: var(--color-prettylights-syntax-markup-inserted-bg); +} + +.markdown-body .pl-mc { + color: var(--color-prettylights-syntax-markup-changed-text); + background-color: var(--color-prettylights-syntax-markup-changed-bg); +} + +.markdown-body .pl-mi2 { + color: var(--color-prettylights-syntax-markup-ignored-text); + background-color: var(--color-prettylights-syntax-markup-ignored-bg); +} + +.markdown-body .pl-mdr { + font-weight: bold; + color: var(--color-prettylights-syntax-meta-diff-range); +} + +.markdown-body .pl-ba { + color: var(--color-prettylights-syntax-brackethighlighter-angle); +} + +.markdown-body .pl-sg { + color: var(--color-prettylights-syntax-sublimelinter-gutter-mark); +} + +.markdown-body .pl-corl { + text-decoration: underline; + color: var(--color-prettylights-syntax-constant-other-reference-link); +} + +.markdown-body g-emoji { + display: inline-block; + min-width: 1ch; + font-family: "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; + font-size: 1em; + font-style: normal !important; + font-weight: var(--base-text-weight-normal, 400); + line-height: 1; + vertical-align: -0.075em; +} + +.markdown-body g-emoji img { + width: 1em; + height: 1em; +} + +.markdown-body .task-list-item { + list-style-type: none; +} + +.markdown-body .task-list-item label { + font-weight: var(--base-text-weight-normal, 400); +} + +.markdown-body .task-list-item.enabled label { + cursor: pointer; +} + +.markdown-body .task-list-item+.task-list-item { + margin-top: 4px; +} + +.markdown-body .task-list-item .handle { + display: none; +} + +.markdown-body .task-list-item-checkbox { + margin: 0 .2em .25em -1.4em; + vertical-align: middle; +} + +.markdown-body .contains-task-list:dir(rtl) .task-list-item-checkbox { + margin: 0 -1.6em .25em .2em; +} + +.markdown-body .contains-task-list { + position: relative; +} + +.markdown-body .contains-task-list:hover .task-list-item-convert-container, +.markdown-body .contains-task-list:focus-within .task-list-item-convert-container { + 
display: block; + width: auto; + height: 24px; + overflow: visible; + clip: auto; +} + +.markdown-body ::-webkit-calendar-picker-indicator { + filter: invert(50%); +} + +.markdown-body .markdown-alert { + padding: var(--base-size-8) var(--base-size-16); + margin-bottom: 16px; + color: inherit; + border-left: .25em solid var(--color-border-default); +} + +.markdown-body .markdown-alert>:first-child { + margin-top: 0; +} + +.markdown-body .markdown-alert>:last-child { + margin-bottom: 0; +} + +.markdown-body .markdown-alert .markdown-alert-title { + display: flex; + font-weight: var(--base-text-weight-medium, 500); + align-items: center; + line-height: 1; +} + +.markdown-body .markdown-alert.markdown-alert-note { + border-left-color: var(--color-accent-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-note .markdown-alert-title { + color: var(--color-accent-fg); +} + +.markdown-body .markdown-alert.markdown-alert-important { + border-left-color: var(--color-done-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-important .markdown-alert-title { + color: var(--color-done-fg); +} + +.markdown-body .markdown-alert.markdown-alert-warning { + border-left-color: var(--color-attention-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-warning .markdown-alert-title { + color: var(--color-attention-fg); +} + +.markdown-body .markdown-alert.markdown-alert-tip { + border-left-color: var(--color-success-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-tip .markdown-alert-title { + color: var(--color-success-fg); +} + +.markdown-body .markdown-alert.markdown-alert-caution { + border-left-color: var(--color-danger-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-caution .markdown-alert-title { + color: var(--color-danger-fg); +} \ No newline at end of file diff --git a/docs/images/BMN.png b/docs/images/BMN.png deleted file mode 100644 index ea0519812..000000000 Binary files a/docs/images/BMN.png and /dev/null differ diff --git a/docs/images/FootballAction.gif b/docs/images/FootballAction.gif deleted file mode 100644 index 244725176..000000000 Binary files a/docs/images/FootballAction.gif and /dev/null differ diff --git a/docs/images/SlowFast.png b/docs/images/SlowFast.png deleted file mode 100644 index 9b7db836f..000000000 Binary files a/docs/images/SlowFast.png and /dev/null differ diff --git a/docs/images/VideoTag.gif b/docs/images/VideoTag.gif deleted file mode 100644 index e60ddfbe4..000000000 Binary files a/docs/images/VideoTag.gif and /dev/null differ diff --git a/docs/images/acc_vps.jpeg b/docs/images/acc_vps.jpeg deleted file mode 100644 index 3b42cdd2d..000000000 Binary files a/docs/images/acc_vps.jpeg and /dev/null differ diff --git a/docs/images/actbert.png b/docs/images/actbert.png deleted file mode 100644 index 40b21e2c5..000000000 Binary files a/docs/images/actbert.png and /dev/null differ diff --git a/docs/images/action_classification.png b/docs/images/action_classification.png deleted file mode 100644 index 13e7f698c..000000000 Binary files a/docs/images/action_classification.png and /dev/null differ diff --git a/docs/images/action_detection.png b/docs/images/action_detection.png deleted file mode 100644 index 9ddbd6234..000000000 Binary files a/docs/images/action_detection.png and /dev/null differ diff --git a/docs/images/action_framework.png b/docs/images/action_framework.png deleted file mode 100644 index 7cc33271a..000000000 Binary files a/docs/images/action_framework.png and /dev/null differ diff --git 
a/docs/images/agcn2s.png b/docs/images/agcn2s.png deleted file mode 100644 index f6628eaa2..000000000 Binary files a/docs/images/agcn2s.png and /dev/null differ diff --git a/docs/images/agcn2s_result.png b/docs/images/agcn2s_result.png deleted file mode 100644 index f3e818765..000000000 Binary files a/docs/images/agcn2s_result.png and /dev/null differ diff --git a/docs/images/application.png b/docs/images/application.png deleted file mode 100644 index 777240898..000000000 Binary files a/docs/images/application.png and /dev/null differ diff --git a/docs/images/asrf.png b/docs/images/asrf.png deleted file mode 100644 index 3f49edde2..000000000 Binary files a/docs/images/asrf.png and /dev/null differ diff --git a/docs/images/cfbi.png b/docs/images/cfbi.png deleted file mode 100644 index cc34629ff..000000000 Binary files a/docs/images/cfbi.png and /dev/null differ diff --git a/docs/images/classic_model.png b/docs/images/classic_model.png deleted file mode 100644 index 21e849e12..000000000 Binary files a/docs/images/classic_model.png and /dev/null differ diff --git a/docs/images/contribute/001_fork.png b/docs/images/contribute/001_fork.png deleted file mode 100644 index 50a920dc0..000000000 Binary files a/docs/images/contribute/001_fork.png and /dev/null differ diff --git a/docs/images/contribute/002_clone.png b/docs/images/contribute/002_clone.png deleted file mode 100644 index 484e24f43..000000000 Binary files a/docs/images/contribute/002_clone.png and /dev/null differ diff --git a/docs/images/contribute/003_precommit.png b/docs/images/contribute/003_precommit.png deleted file mode 100644 index 067fb75dd..000000000 Binary files a/docs/images/contribute/003_precommit.png and /dev/null differ diff --git a/docs/images/contribute/004_pr.png b/docs/images/contribute/004_pr.png deleted file mode 100644 index 489141610..000000000 Binary files a/docs/images/contribute/004_pr.png and /dev/null differ diff --git a/docs/images/ctrgcn.jpg b/docs/images/ctrgcn.jpg deleted file mode 100644 index 899da3639..000000000 Binary files a/docs/images/ctrgcn.jpg and /dev/null differ diff --git a/docs/images/features.png b/docs/images/features.png deleted file mode 100644 index 56d1e41d4..000000000 Binary files a/docs/images/features.png and /dev/null differ diff --git a/docs/images/features_en.png b/docs/images/features_en.png deleted file mode 100644 index 5495d0e5d..000000000 Binary files a/docs/images/features_en.png and /dev/null differ diff --git a/docs/images/home.gif b/docs/images/home.gif deleted file mode 100644 index 1335bb00d..000000000 Binary files a/docs/images/home.gif and /dev/null differ diff --git a/docs/images/horse_riding.gif b/docs/images/horse_riding.gif deleted file mode 100644 index c52a3cf98..000000000 Binary files a/docs/images/horse_riding.gif and /dev/null differ diff --git a/docs/images/i3d_compare.jpg b/docs/images/i3d_compare.jpg deleted file mode 100644 index 548e9daab..000000000 Binary files a/docs/images/i3d_compare.jpg and /dev/null differ diff --git a/docs/images/i3d_expand.jpg b/docs/images/i3d_expand.jpg deleted file mode 100644 index c183e1cb2..000000000 Binary files a/docs/images/i3d_expand.jpg and /dev/null differ diff --git a/docs/images/i3d_expriment1.jpg b/docs/images/i3d_expriment1.jpg deleted file mode 100644 index daee822d8..000000000 Binary files a/docs/images/i3d_expriment1.jpg and /dev/null differ diff --git a/docs/images/i3d_expriment2.jpg b/docs/images/i3d_expriment2.jpg deleted file mode 100644 index 499188e00..000000000 Binary files 
a/docs/images/i3d_expriment2.jpg and /dev/null differ diff --git a/docs/images/joinus.PNG b/docs/images/joinus.PNG deleted file mode 100644 index 00da92eda..000000000 Binary files a/docs/images/joinus.PNG and /dev/null differ diff --git a/docs/images/mstcn.PNG b/docs/images/mstcn.PNG deleted file mode 100644 index 354b53088..000000000 Binary files a/docs/images/mstcn.PNG and /dev/null differ diff --git a/docs/images/multimodality.png b/docs/images/multimodality.png deleted file mode 100644 index 22c4f3b2e..000000000 Binary files a/docs/images/multimodality.png and /dev/null differ diff --git a/docs/images/oxford_image.png b/docs/images/oxford_image.png deleted file mode 100644 index 5c1f09094..000000000 Binary files a/docs/images/oxford_image.png and /dev/null differ diff --git a/docs/images/oxford_image_depth.png b/docs/images/oxford_image_depth.png deleted file mode 100644 index ad74126b5..000000000 Binary files a/docs/images/oxford_image_depth.png and /dev/null differ diff --git a/docs/images/residual_tsm.png b/docs/images/residual_tsm.png deleted file mode 100644 index c7e1dcb45..000000000 Binary files a/docs/images/residual_tsm.png and /dev/null differ diff --git a/docs/images/skeleton_example.png b/docs/images/skeleton_example.png deleted file mode 100644 index 701603b25..000000000 Binary files a/docs/images/skeleton_example.png and /dev/null differ diff --git a/docs/images/slowfast_network.jpg b/docs/images/slowfast_network.jpg deleted file mode 100644 index 8ce3b9e95..000000000 Binary files a/docs/images/slowfast_network.jpg and /dev/null differ diff --git a/docs/images/slowfast_structure.jpg b/docs/images/slowfast_structure.jpg deleted file mode 100644 index 17b955efe..000000000 Binary files a/docs/images/slowfast_structure.jpg and /dev/null differ diff --git a/docs/images/st-gcn.png b/docs/images/st-gcn.png deleted file mode 100644 index a52c4277d..000000000 Binary files a/docs/images/st-gcn.png and /dev/null differ diff --git a/docs/images/temporal.png b/docs/images/temporal.png deleted file mode 100644 index 20cde2e95..000000000 Binary files a/docs/images/temporal.png and /dev/null differ diff --git a/docs/images/timesformer_attention_arch.png b/docs/images/timesformer_attention_arch.png deleted file mode 100644 index 4d331f12c..000000000 Binary files a/docs/images/timesformer_attention_arch.png and /dev/null differ diff --git a/docs/images/timesformer_attention_visualize.png b/docs/images/timesformer_attention_visualize.png deleted file mode 100644 index d7546ede2..000000000 Binary files a/docs/images/timesformer_attention_visualize.png and /dev/null differ diff --git a/docs/images/tokenshift_structure.png b/docs/images/tokenshift_structure.png deleted file mode 100644 index e20b32a45..000000000 Binary files a/docs/images/tokenshift_structure.png and /dev/null differ diff --git a/docs/images/torch_tsm.png b/docs/images/torch_tsm.png deleted file mode 100644 index d4fde0cca..000000000 Binary files a/docs/images/torch_tsm.png and /dev/null differ diff --git a/docs/images/transnetv2.png b/docs/images/transnetv2.png deleted file mode 100644 index 8b48e8c6b..000000000 Binary files a/docs/images/transnetv2.png and /dev/null differ diff --git a/docs/images/tsm_architecture.png b/docs/images/tsm_architecture.png deleted file mode 100644 index 286792350..000000000 Binary files a/docs/images/tsm_architecture.png and /dev/null differ diff --git a/docs/images/tsm_intr.png b/docs/images/tsm_intr.png deleted file mode 100644 index c8e32e732..000000000 Binary files a/docs/images/tsm_intr.png 
and /dev/null differ diff --git a/docs/images/tsm_op.png b/docs/images/tsm_op.png deleted file mode 100644 index dc8532575..000000000 Binary files a/docs/images/tsm_op.png and /dev/null differ diff --git a/docs/images/tsn_architecture.png b/docs/images/tsn_architecture.png deleted file mode 100644 index d605f089d..000000000 Binary files a/docs/images/tsn_architecture.png and /dev/null differ diff --git a/docs/images/tsn_input.jpg b/docs/images/tsn_input.jpg deleted file mode 100644 index 391179c55..000000000 Binary files a/docs/images/tsn_input.jpg and /dev/null differ diff --git a/docs/images/tsn_structure.jpg b/docs/images/tsn_structure.jpg deleted file mode 100644 index f7d1ddb9e..000000000 Binary files a/docs/images/tsn_structure.jpg and /dev/null differ diff --git a/docs/images/user_group.png b/docs/images/user_group.png deleted file mode 100644 index e3dbfb33b..000000000 Binary files a/docs/images/user_group.png and /dev/null differ diff --git a/docs/images/videodata.png b/docs/images/videodata.png deleted file mode 100644 index f1400fec4..000000000 Binary files a/docs/images/videodata.png and /dev/null differ diff --git a/docs/images/videoswin.jpg b/docs/images/videoswin.jpg deleted file mode 100644 index 1d16dbb05..000000000 Binary files a/docs/images/videoswin.jpg and /dev/null differ diff --git a/docs/images/yowo.jpg b/docs/images/yowo.jpg deleted file mode 100644 index de4ebefe8..000000000 Binary files a/docs/images/yowo.jpg and /dev/null differ diff --git a/docs/index.html b/docs/index.html new file mode 100755 index 000000000..d1154b4d3 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,1250 @@ + + + + + + + + + + Search Code By Comment + + + + + + + + + + + + + + + + + + + + + + + +
+ Document index of:
    + + + + + + \ No newline at end of file diff --git a/docs/metadata.json b/docs/metadata.json new file mode 100644 index 000000000..0ea7b26dc --- /dev/null +++ b/docs/metadata.json @@ -0,0 +1,3265 @@ +{ + "url": { + "full": "https://github.com/PaddlePaddle/PaddleVideo", + "partial": "PaddlePaddle/PaddleVideo" + }, + "file_mapping": { + "0": { + "filepath": "/MANIFEST.in", + "entry_id": 0, + "language_id": "text" + }, + "1": { + "filepath": "/README.md", + "entry_id": 4, + "language_id": "markdown" + }, + "2": { + "filepath": "/README_en.md", + "entry_id": 12, + "language_id": "markdown" + }, + "3": { + "filepath": "/__init__.py", + "entry_id": 20, + "language_id": "python" + }, + "4": { + "filepath": "/applications/AbnormalActionDetection/README.md", + "entry_id": 24, + "language_id": "plain-text" + }, + "5": { + "filepath": "/applications/Anti-UAV/README.md", + "entry_id": 32, + "language_id": "markdown" + }, + "6": { + "filepath": "/applications/Anti-UAV/get_image_label.py", + "entry_id": 40, + "language_id": "python" + }, + "7": { + "filepath": "/applications/BasketballAction/README.md", + "entry_id": 54, + "language_id": "plain-text" + }, + "8": { + "filepath": "/applications/BasketballAction/predict/action_detect/action.py", + "entry_id": 78, + "language_id": "python" + }, + "9": { + "filepath": "/applications/BasketballAction/predict/action_detect/logger.py", + "entry_id": 92, + "language_id": "python" + }, + "10": { + "filepath": "/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py", + "entry_id": 96, + "language_id": "python" + }, + "11": { + "filepath": "/applications/BasketballAction/predict/action_detect/mfcc/model_config.py", + "entry_id": 110, + "language_id": "python" + }, + "12": { + "filepath": "/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py", + "entry_id": 116, + "language_id": "python" + }, + "13": { + "filepath": "/applications/BasketballAction/predict/action_detect/models/audio_infer.py", + "entry_id": 122, + "language_id": "python" + }, + "14": { + "filepath": "/applications/BasketballAction/predict/action_detect/models/bmn_infer.py", + "entry_id": 130, + "language_id": "python" + }, + "15": { + "filepath": "/applications/BasketballAction/predict/action_detect/models/lstm_infer.py", + "entry_id": 144, + "language_id": "python" + }, + "16": { + "filepath": "/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py", + "entry_id": 158, + "language_id": "python" + }, + "17": { + "filepath": "/applications/BasketballAction/predict/action_detect/reader/__init__.py", + "entry_id": 166, + "language_id": "python" + }, + "18": { + "filepath": "/applications/BasketballAction/predict/action_detect/reader/audio_reader.py", + "entry_id": 170, + "language_id": "python" + }, + "19": { + "filepath": "/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py", + "entry_id": 178, + "language_id": "python" + }, + "20": { + "filepath": "/applications/BasketballAction/predict/action_detect/reader/feature_reader.py", + "entry_id": 190, + "language_id": "python" + }, + "21": { + "filepath": "/applications/BasketballAction/predict/action_detect/reader/reader_utils.py", + "entry_id": 198, + "language_id": "python" + }, + "22": { + "filepath": "/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py", + "entry_id": 206, + "language_id": "python" + }, + "23": { + "filepath": "/applications/BasketballAction/predict/action_detect/utils/config_utils.py", + "entry_id": 232, + 
"language_id": "python" + }, + "24": { + "filepath": "/applications/BasketballAction/predict/action_detect/utils/preprocess.py", + "entry_id": 240, + "language_id": "python" + }, + "25": { + "filepath": "/applications/BasketballAction/predict/action_detect/utils/process_result.py", + "entry_id": 244, + "language_id": "python" + }, + "26": { + "filepath": "/applications/BasketballAction/predict/eval.py", + "entry_id": 256, + "language_id": "python" + }, + "27": { + "filepath": "/applications/BasketballAction/predict/predict.py", + "entry_id": 276, + "language_id": "python" + }, + "28": { + "filepath": "/applications/EIVideo/EIVideo/README.MD", + "entry_id": 282, + "language_id": "markdown" + }, + "29": { + "filepath": "/applications/EIVideo/EIVideo/__init__.py", + "entry_id": 286, + "language_id": "python" + }, + "30": { + "filepath": "/applications/EIVideo/EIVideo/api.py", + "entry_id": 290, + "language_id": "python" + }, + "31": { + "filepath": "/applications/EIVideo/EIVideo/main.py", + "entry_id": 302, + "language_id": "python" + }, + "32": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/__init__.py", + "entry_id": 312, + "language_id": "python" + }, + "33": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py", + "entry_id": 316, + "language_id": "python" + }, + "34": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py", + "entry_id": 320, + "language_id": "python" + }, + "35": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py", + "entry_id": 332, + "language_id": "python" + }, + "36": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py", + "entry_id": 336, + "language_id": "python" + }, + "37": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py", + "entry_id": 344, + "language_id": "python" + }, + "38": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py", + "entry_id": 360, + "language_id": "python" + }, + "39": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py", + "entry_id": 364, + "language_id": "python" + }, + "40": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py", + "entry_id": 368, + "language_id": "python" + }, + "41": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py", + "entry_id": 372, + "language_id": "python" + }, + "42": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py", + "entry_id": 376, + "language_id": "python" + }, + "43": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py", + "entry_id": 380, + "language_id": "python" + }, + "44": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py", + "entry_id": 410, + "language_id": "python" + }, + "45": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py", + "entry_id": 416, + "language_id": "python" + }, + "46": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py", + "entry_id": 420, + "language_id": "python" + }, + "47": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py", + "entry_id": 432, + "language_id": "python" + }, + "48": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py", + "entry_id": 440, + "language_id": "python" + }, + "49": { + "filepath": 
"/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py", + "entry_id": 448, + "language_id": "python" + }, + "50": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py", + "entry_id": 466, + "language_id": "python" + }, + "51": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py", + "entry_id": 476, + "language_id": "python" + }, + "52": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py", + "entry_id": 480, + "language_id": "python" + }, + "53": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py", + "entry_id": 484, + "language_id": "python" + }, + "54": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py", + "entry_id": 492, + "language_id": "python" + }, + "55": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py", + "entry_id": 534, + "language_id": "python" + }, + "56": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py", + "entry_id": 608, + "language_id": "python" + }, + "57": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py", + "entry_id": 612, + "language_id": "python" + }, + "58": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py", + "entry_id": 618, + "language_id": "python" + }, + "59": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py", + "entry_id": 632, + "language_id": "python" + }, + "60": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py", + "entry_id": 636, + "language_id": "python" + }, + "61": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py", + "entry_id": 642, + "language_id": "python" + }, + "62": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py", + "entry_id": 646, + "language_id": "python" + }, + "63": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/config.py", + "entry_id": 652, + "language_id": "python" + }, + "64": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py", + "entry_id": 666, + "language_id": "python" + }, + "65": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py", + "entry_id": 670, + "language_id": "python" + }, + "66": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py", + "entry_id": 680, + "language_id": "python" + }, + "67": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py", + "entry_id": 770, + "language_id": "python" + }, + "68": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py", + "entry_id": 780, + "language_id": "python" + }, + "69": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/record.py", + "entry_id": 792, + "language_id": "python" + }, + "70": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py", + "entry_id": 806, + "language_id": "python" + }, + "71": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py", + "entry_id": 814, + "language_id": "python" + }, + "72": { + "filepath": "/applications/EIVideo/EIVideo/paddlevideo/version.py", + "entry_id": 830, + "language_id": "python" + }, + "73": { + "filepath": "/applications/EIVideo/EIVideo/setup.py", + "entry_id": 834, + "language_id": "python" + }, + "74": { + "filepath": 
"/applications/EIVideo/EIVideo/version.py", + "entry_id": 838, + "language_id": "python" + }, + "75": { + "filepath": "/applications/EIVideo/QEIVideo/__init__.py", + "entry_id": 842, + "language_id": "python" + }, + "76": { + "filepath": "/applications/EIVideo/QEIVideo/build_gui.py", + "entry_id": 846, + "language_id": "python" + }, + "77": { + "filepath": "/applications/EIVideo/QEIVideo/gui/__init__.py", + "entry_id": 860, + "language_id": "python" + }, + "78": { + "filepath": "/applications/EIVideo/QEIVideo/gui/demo.py", + "entry_id": 864, + "language_id": "python" + }, + "79": { + "filepath": "/applications/EIVideo/QEIVideo/gui/ui_main_window.py", + "entry_id": 870, + "language_id": "python" + }, + "80": { + "filepath": "/applications/EIVideo/QEIVideo/start.py", + "entry_id": 888, + "language_id": "python" + }, + "81": { + "filepath": "/applications/EIVideo/QEIVideo/tools/__init__.py", + "entry_id": 892, + "language_id": "python" + }, + "82": { + "filepath": "/applications/EIVideo/QEIVideo/ui/__init__.py", + "entry_id": 896, + "language_id": "python" + }, + "83": { + "filepath": "/applications/EIVideo/QEIVideo/ui/demo.py", + "entry_id": 900, + "language_id": "python" + }, + "84": { + "filepath": "/applications/EIVideo/QEIVideo/version.py", + "entry_id": 914, + "language_id": "python" + }, + "85": { + "filepath": "/applications/EIVideo/QEIVideo/widget/PaintBoard.py", + "entry_id": 918, + "language_id": "python" + }, + "86": { + "filepath": "/applications/EIVideo/README.md", + "entry_id": 926, + "language_id": "plain-text" + }, + "87": { + "filepath": "/applications/EIVideo/resources/QT/demo.ui", + "entry_id": 938, + "language_id": "text" + }, + "88": { + "filepath": "/applications/EIVideo/resources/cmd", + "entry_id": 952, + "language_id": "text" + }, + "89": { + "filepath": "/applications/FightRecognition/README.md", + "entry_id": 956, + "language_id": "plain-text" + }, + "90": { + "filepath": "/applications/FigureSkating/README.md", + "entry_id": 974, + "language_id": "markdown" + }, + "91": { + "filepath": "/applications/FootballAction/README.md", + "entry_id": 980, + "language_id": "markdown" + }, + "92": { + "filepath": "/applications/FootballAction/checkpoints/download.sh", + "entry_id": 1006, + "language_id": "shell" + }, + "93": { + "filepath": "/applications/FootballAction/datasets/EuroCup2016/dataset_url.list", + "entry_id": 1010, + "language_id": "text" + }, + "94": { + "filepath": "/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh", + "entry_id": 1022, + "language_id": "shell" + }, + "95": { + "filepath": "/applications/FootballAction/datasets/EuroCup2016/url.list", + "entry_id": 1034, + "language_id": "text" + }, + "96": { + "filepath": "/applications/FootballAction/datasets/EuroCup2016/url_val.list", + "entry_id": 1040, + "language_id": "text" + }, + "97": { + "filepath": "/applications/FootballAction/datasets/script/get_frames_pcm.py", + "entry_id": 1044, + "language_id": "python" + }, + "98": { + "filepath": "/applications/FootballAction/datasets/script/get_instance_for_bmn.py", + "entry_id": 1050, + "language_id": "python" + }, + "99": { + "filepath": "/applications/FootballAction/datasets/script/get_instance_for_lstm.py", + "entry_id": 1068, + "language_id": "python" + }, + "100": { + "filepath": "/applications/FootballAction/datasets/script/get_instance_for_pptsm.py", + "entry_id": 1082, + "language_id": "python" + }, + "101": { + "filepath": "/applications/FootballAction/extractor/extract_bmn.py", + "entry_id": 1092, + "language_id": "python" + }, 
+ "102": { + "filepath": "/applications/FootballAction/extractor/extract_feat.py", + "entry_id": 1100, + "language_id": "python" + }, + "103": { + "filepath": "/applications/FootballAction/predict/action_detect/action.py", + "entry_id": 1108, + "language_id": "python" + }, + "104": { + "filepath": "/applications/FootballAction/predict/action_detect/logger.py", + "entry_id": 1122, + "language_id": "python" + }, + "105": { + "filepath": "/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py", + "entry_id": 1126, + "language_id": "python" + }, + "106": { + "filepath": "/applications/FootballAction/predict/action_detect/mfcc/model_config.py", + "entry_id": 1142, + "language_id": "python" + }, + "107": { + "filepath": "/applications/FootballAction/predict/action_detect/mfcc/vgg_params.py", + "entry_id": 1148, + "language_id": "python" + }, + "108": { + "filepath": "/applications/FootballAction/predict/action_detect/models/audio_infer.py", + "entry_id": 1154, + "language_id": "python" + }, + "109": { + "filepath": "/applications/FootballAction/predict/action_detect/models/bmn_infer.py", + "entry_id": 1162, + "language_id": "python" + }, + "110": { + "filepath": "/applications/FootballAction/predict/action_detect/models/lstm_infer.py", + "entry_id": 1176, + "language_id": "python" + }, + "111": { + "filepath": "/applications/FootballAction/predict/action_detect/models/pptsm_infer.py", + "entry_id": 1190, + "language_id": "python" + }, + "112": { + "filepath": "/applications/FootballAction/predict/action_detect/reader/__init__.py", + "entry_id": 1198, + "language_id": "python" + }, + "113": { + "filepath": "/applications/FootballAction/predict/action_detect/reader/audio_reader.py", + "entry_id": 1202, + "language_id": "python" + }, + "114": { + "filepath": "/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py", + "entry_id": 1210, + "language_id": "python" + }, + "115": { + "filepath": "/applications/FootballAction/predict/action_detect/reader/feature_reader.py", + "entry_id": 1222, + "language_id": "python" + }, + "116": { + "filepath": "/applications/FootballAction/predict/action_detect/reader/reader_utils.py", + "entry_id": 1230, + "language_id": "python" + }, + "117": { + "filepath": "/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py", + "entry_id": 1238, + "language_id": "python" + }, + "118": { + "filepath": "/applications/FootballAction/predict/action_detect/utils/config_utils.py", + "entry_id": 1264, + "language_id": "python" + }, + "119": { + "filepath": "/applications/FootballAction/predict/action_detect/utils/preprocess.py", + "entry_id": 1272, + "language_id": "python" + }, + "120": { + "filepath": "/applications/FootballAction/predict/action_detect/utils/process_result.py", + "entry_id": 1276, + "language_id": "python" + }, + "121": { + "filepath": "/applications/FootballAction/predict/eval.py", + "entry_id": 1288, + "language_id": "python" + }, + "122": { + "filepath": "/applications/FootballAction/predict/predict.py", + "entry_id": 1308, + "language_id": "python" + }, + "123": { + "filepath": "/applications/Ma-Net/README.md", + "entry_id": 1314, + "language_id": "markdown" + }, + "124": { + "filepath": "/applications/Ma-Net/README_cn.md", + "entry_id": 1320, + "language_id": "markdown" + }, + "125": { + "filepath": "/applications/Ma-Net/config.py", + "entry_id": 1324, + "language_id": "python" + }, + "126": { + "filepath": "/applications/Ma-Net/dataloaders/DAVIS2017.md", + "entry_id": 1336, + "language_id": 
"markdown" + }, + "127": { + "filepath": "/applications/Ma-Net/dataloaders/DAVIS2017_cn.md", + "entry_id": 1340, + "language_id": "markdown" + }, + "128": { + "filepath": "/applications/Ma-Net/dataloaders/custom_transforms_f.py", + "entry_id": 1344, + "language_id": "python" + }, + "129": { + "filepath": "/applications/Ma-Net/dataloaders/davis_2017_f.py", + "entry_id": 1374, + "language_id": "python" + }, + "130": { + "filepath": "/applications/Ma-Net/dataloaders/helpers.py", + "entry_id": 1426, + "language_id": "python" + }, + "131": { + "filepath": "/applications/Ma-Net/dataloaders/samplers.py", + "entry_id": 1434, + "language_id": "python" + }, + "132": { + "filepath": "/applications/Ma-Net/networks/IntVOS.py", + "entry_id": 1440, + "language_id": "python" + }, + "133": { + "filepath": "/applications/Ma-Net/networks/aspp.py", + "entry_id": 1518, + "language_id": "python" + }, + "134": { + "filepath": "/applications/Ma-Net/networks/backbone/__init__.py", + "entry_id": 1528, + "language_id": "python" + }, + "135": { + "filepath": "/applications/Ma-Net/networks/backbone/drn.py", + "entry_id": 1532, + "language_id": "python" + }, + "136": { + "filepath": "/applications/Ma-Net/networks/backbone/mobilenet.py", + "entry_id": 1562, + "language_id": "python" + }, + "137": { + "filepath": "/applications/Ma-Net/networks/backbone/resnet.py", + "entry_id": 1576, + "language_id": "python" + }, + "138": { + "filepath": "/applications/Ma-Net/networks/backbone/xception.py", + "entry_id": 1594, + "language_id": "python" + }, + "139": { + "filepath": "/applications/Ma-Net/networks/decoder.py", + "entry_id": 1628, + "language_id": "python" + }, + "140": { + "filepath": "/applications/Ma-Net/networks/deeplab.py", + "entry_id": 1636, + "language_id": "python" + }, + "141": { + "filepath": "/applications/Ma-Net/networks/loss.py", + "entry_id": 1644, + "language_id": "python" + }, + "142": { + "filepath": "/applications/Ma-Net/run.sh", + "entry_id": 1662, + "language_id": "shell" + }, + "143": { + "filepath": "/applications/Ma-Net/test.py", + "entry_id": 1668, + "language_id": "python" + }, + "144": { + "filepath": "/applications/Ma-Net/train_stage1.py", + "entry_id": 1720, + "language_id": "python" + }, + "145": { + "filepath": "/applications/Ma-Net/train_stage2.py", + "entry_id": 1760, + "language_id": "python" + }, + "146": { + "filepath": "/applications/Ma-Net/utils/api.py", + "entry_id": 1820, + "language_id": "python" + }, + "147": { + "filepath": "/applications/Ma-Net/utils/mask_damaging.py", + "entry_id": 1878, + "language_id": "python" + }, + "148": { + "filepath": "/applications/Ma-Net/utils/meters.py", + "entry_id": 1892, + "language_id": "python" + }, + "149": { + "filepath": "/applications/Ma-Net/utils/utils.py", + "entry_id": 1896, + "language_id": "python" + }, + "150": { + "filepath": "/applications/MultimodalVideoTag/README.md", + "entry_id": 1900, + "language_id": "markdown" + }, + "151": { + "filepath": "/applications/MultimodalVideoTag/download.sh", + "entry_id": 1908, + "language_id": "shell" + }, + "152": { + "filepath": "/applications/MultimodalVideoTag/eval_and_save_model.sh", + "entry_id": 1912, + "language_id": "shell" + }, + "153": { + "filepath": "/applications/MultimodalVideoTag/inference.sh", + "entry_id": 1916, + "language_id": "shell" + }, + "154": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py", + "entry_id": 1920, + "language_id": "python" + }, + "155": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/config.py", + "entry_id": 
1934, + "language_id": "python" + }, + "156": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py", + "entry_id": 1940, + "language_id": "python" + }, + "157": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py", + "entry_id": 1944, + "language_id": "python" + }, + "158": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py", + "entry_id": 1970, + "language_id": "python" + }, + "159": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py", + "entry_id": 1990, + "language_id": "python" + }, + "160": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py", + "entry_id": 1998, + "language_id": "python" + }, + "161": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py", + "entry_id": 2028, + "language_id": "python" + }, + "162": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/inference.py", + "entry_id": 2042, + "language_id": "python" + }, + "163": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py", + "entry_id": 2056, + "language_id": "python" + }, + "164": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/models/ernie.py", + "entry_id": 2090, + "language_id": "python" + }, + "165": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py", + "entry_id": 2110, + "language_id": "python" + }, + "166": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/train.py", + "entry_id": 2136, + "language_id": "python" + }, + "167": { + "filepath": "/applications/MultimodalVideoTag/scenario_lib/utils.py", + "entry_id": 2156, + "language_id": "python" + }, + "168": { + "filepath": "/applications/MultimodalVideoTag/train.sh", + "entry_id": 2174, + "language_id": "shell" + }, + "169": { + "filepath": "/applications/PP-Care/Readme.md", + "entry_id": 2178, + "language_id": "markdown" + }, + "170": { + "filepath": "/applications/PPHuman/README.md", + "entry_id": 2188, + "language_id": "markdown" + }, + "171": { + "filepath": "/applications/PPHuman/datasets/prepare_dataset.py", + "entry_id": 2202, + "language_id": "python" + }, + "172": { + "filepath": "/applications/README.md", + "entry_id": 2210, + "language_id": "plain-text" + }, + "173": { + "filepath": "/applications/T2VLAD/README.md", + "entry_id": 2214, + "language_id": "markdown" + }, + "174": { + "filepath": "/applications/T2VLAD/README_en.md", + "entry_id": 2220, + "language_id": "markdown" + }, + "175": { + "filepath": "/applications/T2VLAD/base/__init__.py", + "entry_id": 2228, + "language_id": "python" + }, + "176": { + "filepath": "/applications/T2VLAD/base/base_dataset.py", + "entry_id": 2232, + "language_id": "python" + }, + "177": { + "filepath": "/applications/T2VLAD/base/base_model.py", + "entry_id": 2280, + "language_id": "python" + }, + "178": { + "filepath": "/applications/T2VLAD/base/base_trainer.py", + "entry_id": 2286, + "language_id": "python" + }, + "179": { + "filepath": "/applications/T2VLAD/data/download_features.sh", + "entry_id": 2310, + "language_id": "shell" + }, + "180": { + "filepath": "/applications/T2VLAD/data_loader/MSRVTT_dataset.py", + "entry_id": 2314, + "language_id": "python" + }, + "181": { + "filepath": "/applications/T2VLAD/data_loader/data_loaders.py", + "entry_id": 2328, + "language_id": "python" + }, + "182": { + "filepath": "/applications/T2VLAD/logger/__init__.py", + 
"entry_id": 2340, + "language_id": "python" + }, + "183": { + "filepath": "/applications/T2VLAD/logger/log_parser.py", + "entry_id": 2344, + "language_id": "python" + }, + "184": { + "filepath": "/applications/T2VLAD/logger/logger.py", + "entry_id": 2356, + "language_id": "python" + }, + "185": { + "filepath": "/applications/T2VLAD/model/loss.py", + "entry_id": 2360, + "language_id": "python" + }, + "186": { + "filepath": "/applications/T2VLAD/model/metric.py", + "entry_id": 2370, + "language_id": "python" + }, + "187": { + "filepath": "/applications/T2VLAD/model/model.py", + "entry_id": 2392, + "language_id": "python" + }, + "188": { + "filepath": "/applications/T2VLAD/model/net_vlad.py", + "entry_id": 2434, + "language_id": "python" + }, + "189": { + "filepath": "/applications/T2VLAD/model/text.py", + "entry_id": 2446, + "language_id": "python" + }, + "190": { + "filepath": "/applications/T2VLAD/parse_config.py", + "entry_id": 2458, + "language_id": "python" + }, + "191": { + "filepath": "/applications/T2VLAD/test.py", + "entry_id": 2478, + "language_id": "python" + }, + "192": { + "filepath": "/applications/T2VLAD/train.py", + "entry_id": 2496, + "language_id": "python" + }, + "193": { + "filepath": "/applications/T2VLAD/trainer/__init__.py", + "entry_id": 2510, + "language_id": "python" + }, + "194": { + "filepath": "/applications/T2VLAD/trainer/trainer.py", + "entry_id": 2514, + "language_id": "python" + }, + "195": { + "filepath": "/applications/T2VLAD/utils/__init__.py", + "entry_id": 2540, + "language_id": "python" + }, + "196": { + "filepath": "/applications/T2VLAD/utils/util.py", + "entry_id": 2544, + "language_id": "python" + }, + "197": { + "filepath": "/applications/TableTennis/ActionRecognition/README.md", + "entry_id": 2570, + "language_id": "markdown" + }, + "198": { + "filepath": "/applications/TableTennis/datasets/script/submission_format_transfer.py", + "entry_id": 2578, + "language_id": "python" + }, + "199": { + "filepath": "/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py", + "entry_id": 2584, + "language_id": "python" + }, + "200": { + "filepath": "/applications/TableTennis/fix_bad_label.py", + "entry_id": 2592, + "language_id": "python" + }, + "201": { + "filepath": "/applications/TableTennis/get_instance_for_bmn.py", + "entry_id": 2596, + "language_id": "python" + }, + "202": { + "filepath": "/applications/TableTennis/gts_format_transfer.py", + "entry_id": 2614, + "language_id": "python" + }, + "203": { + "filepath": "/applications/TableTennis/predict/action_detect/action.py", + "entry_id": 2618, + "language_id": "python" + }, + "204": { + "filepath": "/applications/TableTennis/predict/action_detect/logger.py", + "entry_id": 2634, + "language_id": "python" + }, + "205": { + "filepath": "/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py", + "entry_id": 2638, + "language_id": "python" + }, + "206": { + "filepath": "/applications/TableTennis/predict/action_detect/mfcc/model_config.py", + "entry_id": 2654, + "language_id": "python" + }, + "207": { + "filepath": "/applications/TableTennis/predict/action_detect/mfcc/vgg_params.py", + "entry_id": 2660, + "language_id": "python" + }, + "208": { + "filepath": "/applications/TableTennis/predict/action_detect/models/audio_infer.py", + "entry_id": 2666, + "language_id": "python" + }, + "209": { + "filepath": "/applications/TableTennis/predict/action_detect/models/bmn_infer.py", + "entry_id": 2674, + "language_id": "python" + }, + "210": { + "filepath": 
"/applications/TableTennis/predict/action_detect/models/lstm_infer.py", + "entry_id": 2688, + "language_id": "python" + }, + "211": { + "filepath": "/applications/TableTennis/predict/action_detect/models/pptsm_infer.py", + "entry_id": 2702, + "language_id": "python" + }, + "212": { + "filepath": "/applications/TableTennis/predict/action_detect/reader/__init__.py", + "entry_id": 2710, + "language_id": "python" + }, + "213": { + "filepath": "/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py", + "entry_id": 2714, + "language_id": "python" + }, + "214": { + "filepath": "/applications/TableTennis/predict/action_detect/reader/feature_reader.py", + "entry_id": 2726, + "language_id": "python" + }, + "215": { + "filepath": "/applications/TableTennis/predict/action_detect/reader/reader_utils.py", + "entry_id": 2734, + "language_id": "python" + }, + "216": { + "filepath": "/applications/TableTennis/predict/action_detect/utils/config_utils.py", + "entry_id": 2742, + "language_id": "python" + }, + "217": { + "filepath": "/applications/TableTennis/predict/action_detect/utils/preprocess.py", + "entry_id": 2750, + "language_id": "python" + }, + "218": { + "filepath": "/applications/TableTennis/predict/action_detect/utils/process_result.py", + "entry_id": 2754, + "language_id": "python" + }, + "219": { + "filepath": "/applications/TableTennis/predict/eval.py", + "entry_id": 2766, + "language_id": "python" + }, + "220": { + "filepath": "/applications/TableTennis/predict/predict.py", + "entry_id": 2788, + "language_id": "python" + }, + "221": { + "filepath": "/applications/TableTennis/val_split.py", + "entry_id": 2794, + "language_id": "python" + }, + "222": { + "filepath": "/applications/VideoQualityAssessment/README.md", + "entry_id": 2798, + "language_id": "plain-text" + }, + "223": { + "filepath": "/applications/VideoQualityAssessment/main.py", + "entry_id": 2810, + "language_id": "python" + }, + "224": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/__init__.py", + "entry_id": 2818, + "language_id": "python" + }, + "225": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py", + "entry_id": 2822, + "language_id": "python" + }, + "226": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/builder.py", + "entry_id": 2826, + "language_id": "python" + }, + "227": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py", + "entry_id": 2836, + "language_id": "python" + }, + "228": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py", + "entry_id": 2840, + "language_id": "python" + }, + "229": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py", + "entry_id": 2848, + "language_id": "python" + }, + "230": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py", + "entry_id": 2858, + "language_id": "python" + }, + "231": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py", + "entry_id": 2868, + "language_id": "python" + }, + "232": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py", + "entry_id": 2874, + "language_id": "python" + }, + "233": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py", + "entry_id": 2910, + "language_id": "python" + }, + "234": { + "filepath": 
"/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py", + "entry_id": 2918, + "language_id": "python" + }, + "235": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py", + "entry_id": 2930, + "language_id": "python" + }, + "236": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py", + "entry_id": 2938, + "language_id": "python" + }, + "237": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/loader/registry.py", + "entry_id": 2948, + "language_id": "python" + }, + "238": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py", + "entry_id": 2952, + "language_id": "python" + }, + "239": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/metrics/base.py", + "entry_id": 2956, + "language_id": "python" + }, + "240": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/metrics/build.py", + "entry_id": 2962, + "language_id": "python" + }, + "241": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py", + "entry_id": 2966, + "language_id": "python" + }, + "242": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py", + "entry_id": 2974, + "language_id": "python" + }, + "243": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py", + "entry_id": 2978, + "language_id": "python" + }, + "244": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py", + "entry_id": 2984, + "language_id": "python" + }, + "245": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py", + "entry_id": 2988, + "language_id": "python" + }, + "246": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", + "entry_id": 3012, + "language_id": "python" + }, + "247": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py", + "entry_id": 3040, + "language_id": "python" + }, + "248": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py", + "entry_id": 3046, + "language_id": "python" + }, + "249": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py", + "entry_id": 3050, + "language_id": "python" + }, + "250": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py", + "entry_id": 3054, + "language_id": "python" + }, + "251": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py", + "entry_id": 3062, + "language_id": "python" + }, + "252": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py", + "entry_id": 3068, + "language_id": "python" + }, + "253": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py", + "entry_id": 3072, + "language_id": "python" + }, + "254": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py", + "entry_id": 3084, + "language_id": "python" + }, + "255": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py", + "entry_id": 3098, + "language_id": "python" + }, + "256": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py", + "entry_id": 3108, + "language_id": "python" + }, + 
"257": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py", + "entry_id": 3112, + "language_id": "python" + }, + "258": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py", + "entry_id": 3118, + "language_id": "python" + }, + "259": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py", + "entry_id": 3124, + "language_id": "python" + }, + "260": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py", + "entry_id": 3130, + "language_id": "python" + }, + "261": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py", + "entry_id": 3134, + "language_id": "python" + }, + "262": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py", + "entry_id": 3140, + "language_id": "python" + }, + "263": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py", + "entry_id": 3144, + "language_id": "python" + }, + "264": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/solver/lr.py", + "entry_id": 3162, + "language_id": "python" + }, + "265": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py", + "entry_id": 3168, + "language_id": "python" + }, + "266": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py", + "entry_id": 3176, + "language_id": "python" + }, + "267": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/tasks/test.py", + "entry_id": 3180, + "language_id": "python" + }, + "268": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/tasks/train.py", + "entry_id": 3188, + "language_id": "python" + }, + "269": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py", + "entry_id": 3212, + "language_id": "python" + }, + "270": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py", + "entry_id": 3216, + "language_id": "python" + }, + "271": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/config.py", + "entry_id": 3222, + "language_id": "python" + }, + "272": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py", + "entry_id": 3236, + "language_id": "python" + }, + "273": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/logger.py", + "entry_id": 3242, + "language_id": "python" + }, + "274": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py", + "entry_id": 3252, + "language_id": "python" + }, + "275": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/record.py", + "entry_id": 3262, + "language_id": "python" + }, + "276": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/registry.py", + "entry_id": 3274, + "language_id": "python" + }, + "277": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py", + "entry_id": 3282, + "language_id": "python" + }, + "278": { + "filepath": "/applications/VideoQualityAssessment/paddlevideo/version.py", + "entry_id": 3290, + "language_id": "python" + }, + "279": { + "filepath": "/applications/VideoQualityAssessment/run.sh", + "entry_id": 3294, + "language_id": "shell" + }, + "280": { + "filepath": "/applications/VideoQualityAssessment/save_model.sh", + "entry_id": 3300, + "language_id": "shell" + }, + "281": { + "filepath": 
"/applications/VideoQualityAssessment/setup.py", + "entry_id": 3304, + "language_id": "python" + }, + "282": { + "filepath": "/applications/VideoTag/FineTune.md", + "entry_id": 3312, + "language_id": "markdown" + }, + "283": { + "filepath": "/applications/VideoTag/README.md", + "entry_id": 3326, + "language_id": "plain-text" + }, + "284": { + "filepath": "/applications/VideoTag/Run.md", + "entry_id": 3330, + "language_id": "markdown" + }, + "285": { + "filepath": "/applications/VideoTag/Test.md", + "entry_id": 3338, + "language_id": "markdown" + }, + "286": { + "filepath": "/applications/VideoTag/eval.py", + "entry_id": 3342, + "language_id": "python" + }, + "287": { + "filepath": "/applications/VideoTag/metrics/__init__.py", + "entry_id": 3354, + "language_id": "python" + }, + "288": { + "filepath": "/applications/VideoTag/metrics/kinetics/accuracy_metrics.py", + "entry_id": 3358, + "language_id": "python" + }, + "289": { + "filepath": "/applications/VideoTag/metrics/metrics_util.py", + "entry_id": 3368, + "language_id": "python" + }, + "290": { + "filepath": "/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py", + "entry_id": 3392, + "language_id": "python" + }, + "291": { + "filepath": "/applications/VideoTag/metrics/youtube8m/eval_util.py", + "entry_id": 3414, + "language_id": "python" + }, + "292": { + "filepath": "/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py", + "entry_id": 3434, + "language_id": "python" + }, + "293": { + "filepath": "/applications/VideoTag/models/__init__.py", + "entry_id": 3446, + "language_id": "python" + }, + "294": { + "filepath": "/applications/VideoTag/models/attention_lstm/__init__.py", + "entry_id": 3450, + "language_id": "python" + }, + "295": { + "filepath": "/applications/VideoTag/models/attention_lstm/attention_lstm.py", + "entry_id": 3454, + "language_id": "python" + }, + "296": { + "filepath": "/applications/VideoTag/models/attention_lstm/lstm_attention.py", + "entry_id": 3470, + "language_id": "python" + }, + "297": { + "filepath": "/applications/VideoTag/models/model.py", + "entry_id": 3478, + "language_id": "python" + }, + "298": { + "filepath": "/applications/VideoTag/models/tsn/__init__.py", + "entry_id": 3492, + "language_id": "python" + }, + "299": { + "filepath": "/applications/VideoTag/models/tsn/tsn.py", + "entry_id": 3496, + "language_id": "python" + }, + "300": { + "filepath": "/applications/VideoTag/models/tsn/tsn_res_model.py", + "entry_id": 3512, + "language_id": "python" + }, + "301": { + "filepath": "/applications/VideoTag/models/utils.py", + "entry_id": 3526, + "language_id": "python" + }, + "302": { + "filepath": "/applications/VideoTag/predict.py", + "entry_id": 3532, + "language_id": "python" + }, + "303": { + "filepath": "/applications/VideoTag/reader/__init__.py", + "entry_id": 3546, + "language_id": "python" + }, + "304": { + "filepath": "/applications/VideoTag/reader/feature_reader.py", + "entry_id": 3550, + "language_id": "python" + }, + "305": { + "filepath": "/applications/VideoTag/reader/kinetics_reader.py", + "entry_id": 3558, + "language_id": "python" + }, + "306": { + "filepath": "/applications/VideoTag/reader/reader_utils.py", + "entry_id": 3584, + "language_id": "python" + }, + "307": { + "filepath": "/applications/VideoTag/train.py", + "entry_id": 3592, + "language_id": "python" + }, + "308": { + "filepath": "/applications/VideoTag/tsn_extractor.py", + "entry_id": 3612, + "language_id": "python" + }, + "309": { + "filepath": 
"/applications/VideoTag/utils/config_utils.py", + "entry_id": 3626, + "language_id": "python" + }, + "310": { + "filepath": "/applications/VideoTag/utils/train_utils.py", + "entry_id": 3634, + "language_id": "python" + }, + "311": { + "filepath": "/applications/VideoTag/utils/utility.py", + "entry_id": 3650, + "language_id": "python" + }, + "312": { + "filepath": "/applications/VideoTag/videotag_test.py", + "entry_id": 3658, + "language_id": "python" + }, + "313": { + "filepath": "/benchmark/TimeSformer/README.md", + "entry_id": 3680, + "language_id": "plain-text" + }, + "314": { + "filepath": "/benchmark/TimeSformer/run_all.sh", + "entry_id": 3684, + "language_id": "shell" + }, + "315": { + "filepath": "/benchmark/TimeSformer/run_benchmark.sh", + "entry_id": 3692, + "language_id": "shell" + }, + "316": { + "filepath": "/data/50salads/prepare_asrf_data.py", + "entry_id": 3700, + "language_id": "python" + }, + "317": { + "filepath": "/data/50salads/transform_segmentation_label.py", + "entry_id": 3710, + "language_id": "python" + }, + "318": { + "filepath": "/data/ntu-rgb-d/download_dataset.sh", + "entry_id": 3726, + "language_id": "shell" + }, + "319": { + "filepath": "/data/ntu-rgb-d/get_raw_denoised_data.py", + "entry_id": 3730, + "language_id": "python" + }, + "320": { + "filepath": "/data/ntu-rgb-d/get_raw_skes_data.py", + "entry_id": 3768, + "language_id": "python" + }, + "321": { + "filepath": "/data/ntu-rgb-d/seq_transformation.py", + "entry_id": 3782, + "language_id": "python" + }, + "322": { + "filepath": "/deploy/cpp_infer/external-cmake/auto-log.cmake", + "entry_id": 3804, + "language_id": "cmake" + }, + "323": { + "filepath": "/deploy/cpp_infer/include/postprocess_op.h", + "entry_id": 3808, + "language_id": "c" + }, + "324": { + "filepath": "/deploy/cpp_infer/include/preprocess_op.h", + "entry_id": 3814, + "language_id": "header" + }, + "325": { + "filepath": "/deploy/cpp_infer/include/utility.h", + "entry_id": 3820, + "language_id": "header" + }, + "326": { + "filepath": "/deploy/cpp_infer/include/video_rec.h", + "entry_id": 3826, + "language_id": "c" + }, + "327": { + "filepath": "/deploy/cpp_infer/readme.md", + "entry_id": 3836, + "language_id": "markdown" + }, + "328": { + "filepath": "/deploy/cpp_infer/readme_en.md", + "entry_id": 3858, + "language_id": "markdown" + }, + "329": { + "filepath": "/deploy/cpp_infer/src/main.cpp", + "entry_id": 3888, + "language_id": "c++" + }, + "330": { + "filepath": "/deploy/cpp_infer/src/postprocess_op.cpp", + "entry_id": 3904, + "language_id": "c++" + }, + "331": { + "filepath": "/deploy/cpp_infer/src/preprocess_op.cpp", + "entry_id": 3910, + "language_id": "c++" + }, + "332": { + "filepath": "/deploy/cpp_infer/src/utility.cpp", + "entry_id": 3922, + "language_id": "c++" + }, + "333": { + "filepath": "/deploy/cpp_infer/src/video_rec.cpp", + "entry_id": 3938, + "language_id": "c++" + }, + "334": { + "filepath": "/deploy/cpp_infer/tools/build.sh", + "entry_id": 3964, + "language_id": "shell" + }, + "335": { + "filepath": "/deploy/cpp_serving/paddle_env_install.sh", + "entry_id": 3968, + "language_id": "shell" + }, + "336": { + "filepath": "/deploy/cpp_serving/preprocess_ops.py", + "entry_id": 3974, + "language_id": "python" + }, + "337": { + "filepath": "/deploy/cpp_serving/readme.md", + "entry_id": 3984, + "language_id": "markdown" + }, + "338": { + "filepath": "/deploy/cpp_serving/readme_en.md", + "entry_id": 3998, + "language_id": "markdown" + }, + "339": { + "filepath": "/deploy/cpp_serving/run_cpp_serving.sh", + "entry_id": 4016, + 
"language_id": "shell" + }, + "340": { + "filepath": "/deploy/cpp_serving/serving_client.py", + "entry_id": 4020, + "language_id": "python" + }, + "341": { + "filepath": "/deploy/paddle2onnx/predict_onnx.py", + "entry_id": 4028, + "language_id": "python" + }, + "342": { + "filepath": "/deploy/paddle2onnx/readme.md", + "entry_id": 4042, + "language_id": "markdown" + }, + "343": { + "filepath": "/deploy/paddle2onnx/readme_en.md", + "entry_id": 4048, + "language_id": "markdown" + }, + "344": { + "filepath": "/deploy/python_serving/pipeline_http_client.py", + "entry_id": 4056, + "language_id": "python" + }, + "345": { + "filepath": "/deploy/python_serving/pipeline_rpc_client.py", + "entry_id": 4064, + "language_id": "python" + }, + "346": { + "filepath": "/deploy/python_serving/readme.md", + "entry_id": 4072, + "language_id": "markdown" + }, + "347": { + "filepath": "/deploy/python_serving/readme_en.md", + "entry_id": 4086, + "language_id": "markdown" + }, + "348": { + "filepath": "/deploy/python_serving/recognition_web_service.py", + "entry_id": 4106, + "language_id": "python" + }, + "349": { + "filepath": "/deploy/python_serving/utils.py", + "entry_id": 4122, + "language_id": "python" + }, + "350": { + "filepath": "/deploy/slim/quant_post_static.py", + "entry_id": 4130, + "language_id": "python" + }, + "351": { + "filepath": "/deploy/slim/readme.md", + "entry_id": 4142, + "language_id": "markdown" + }, + "352": { + "filepath": "/deploy/slim/readme_en.md", + "entry_id": 4150, + "language_id": "markdown" + }, + "353": { + "filepath": "/english_documents/benchmark.md", + "entry_id": 4164, + "language_id": "markdown" + }, + "354": { + "filepath": "/english_documents/dataset/AVA.md", + "entry_id": 4176, + "language_id": "markdown" + }, + "355": { + "filepath": "/english_documents/dataset/ActivityNet.md", + "entry_id": 4186, + "language_id": "markdown" + }, + "356": { + "filepath": "/english_documents/dataset/Oxford_RobotCar.md", + "entry_id": 4196, + "language_id": "markdown" + }, + "357": { + "filepath": "/english_documents/dataset/README.md", + "entry_id": 4216, + "language_id": "markdown" + }, + "358": { + "filepath": "/english_documents/dataset/SegmentationDataset.md", + "entry_id": 4224, + "language_id": "markdown" + }, + "359": { + "filepath": "/english_documents/dataset/fsd.md", + "entry_id": 4228, + "language_id": "markdown" + }, + "360": { + "filepath": "/english_documents/dataset/k400.md", + "entry_id": 4236, + "language_id": "markdown" + }, + "361": { + "filepath": "/english_documents/dataset/msrvtt.md", + "entry_id": 4244, + "language_id": "markdown" + }, + "362": { + "filepath": "/english_documents/dataset/ntu-rgbd.md", + "entry_id": 4252, + "language_id": "markdown" + }, + "363": { + "filepath": "/english_documents/dataset/ucf101.md", + "entry_id": 4264, + "language_id": "markdown" + }, + "364": { + "filepath": "/english_documents/dataset/ucf24.md", + "entry_id": 4272, + "language_id": "markdown" + }, + "365": { + "filepath": "/english_documents/dataset/youtube8m.md", + "entry_id": 4280, + "language_id": "markdown" + }, + "366": { + "filepath": "/english_documents/install.md", + "entry_id": 4288, + "language_id": "markdown" + }, + "367": { + "filepath": "/english_documents/model_zoo/README.md", + "entry_id": 4294, + "language_id": "plain-text" + }, + "368": { + "filepath": "/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md", + "entry_id": 4304, + "language_id": "markdown" + }, + "369": { + "filepath": "/english_documents/model_zoo/estimation/adds.md", + "entry_id": 
4316, + "language_id": "markdown" + }, + "370": { + "filepath": "/english_documents/model_zoo/localization/bmn.md", + "entry_id": 4332, + "language_id": "markdown" + }, + "371": { + "filepath": "/english_documents/model_zoo/localization/yowo.md", + "entry_id": 4342, + "language_id": "markdown" + }, + "372": { + "filepath": "/english_documents/model_zoo/multimodal/actbert.md", + "entry_id": 4356, + "language_id": "markdown" + }, + "373": { + "filepath": "/english_documents/model_zoo/partition/transnetv2.md", + "entry_id": 4364, + "language_id": "markdown" + }, + "374": { + "filepath": "/english_documents/model_zoo/recognition/agcn.md", + "entry_id": 4372, + "language_id": "markdown" + }, + "375": { + "filepath": "/english_documents/model_zoo/recognition/agcn2s.md", + "entry_id": 4382, + "language_id": "markdown" + }, + "376": { + "filepath": "/english_documents/model_zoo/recognition/attention_lstm.md", + "entry_id": 4396, + "language_id": "markdown" + }, + "377": { + "filepath": "/english_documents/model_zoo/recognition/ctrgcn.md", + "entry_id": 4406, + "language_id": "markdown" + }, + "378": { + "filepath": "/english_documents/model_zoo/recognition/movinet.md", + "entry_id": 4418, + "language_id": "markdown" + }, + "379": { + "filepath": "/english_documents/model_zoo/recognition/posec3d.md", + "entry_id": 4426, + "language_id": "markdown" + }, + "380": { + "filepath": "/english_documents/model_zoo/recognition/pp-timesformer.md", + "entry_id": 4436, + "language_id": "markdown" + }, + "381": { + "filepath": "/english_documents/model_zoo/recognition/pp-tsm.md", + "entry_id": 4454, + "language_id": "markdown" + }, + "382": { + "filepath": "/english_documents/model_zoo/recognition/pp-tsn.md", + "entry_id": 4470, + "language_id": "markdown" + }, + "383": { + "filepath": "/english_documents/model_zoo/recognition/slowfast.md", + "entry_id": 4486, + "language_id": "markdown" + }, + "384": { + "filepath": "/english_documents/model_zoo/recognition/stgcn.md", + "entry_id": 4498, + "language_id": "markdown" + }, + "385": { + "filepath": "/english_documents/model_zoo/recognition/timesformer.md", + "entry_id": 4508, + "language_id": "markdown" + }, + "386": { + "filepath": "/english_documents/model_zoo/recognition/tokenshift_transformer.md", + "entry_id": 4524, + "language_id": "markdown" + }, + "387": { + "filepath": "/english_documents/model_zoo/recognition/tsm.md", + "entry_id": 4538, + "language_id": "markdown" + }, + "388": { + "filepath": "/english_documents/model_zoo/recognition/tsn.md", + "entry_id": 4558, + "language_id": "markdown" + }, + "389": { + "filepath": "/english_documents/model_zoo/recognition/tsn_dali.md", + "entry_id": 4574, + "language_id": "markdown" + }, + "390": { + "filepath": "/english_documents/model_zoo/recognition/videoswin.md", + "entry_id": 4582, + "language_id": "markdown" + }, + "391": { + "filepath": "/english_documents/model_zoo/segmentation/asrf.md", + "entry_id": 4598, + "language_id": "markdown" + }, + "392": { + "filepath": "/english_documents/model_zoo/segmentation/cfbi.md", + "entry_id": 4612, + "language_id": "markdown" + }, + "393": { + "filepath": "/english_documents/model_zoo/segmentation/mstcn.md", + "entry_id": 4618, + "language_id": "markdown" + }, + "394": { + "filepath": "/english_documents/quick_start.md", + "entry_id": 4630, + "language_id": "markdown" + }, + "395": { + "filepath": "/english_documents/tools.md", + "entry_id": 4644, + "language_id": "markdown" + }, + "396": { + "filepath": "/english_documents/tutorials/Action Recognition Datasets", + 
"entry_id": 4648, + "language_id": "text" + }, + "397": { + "filepath": "/english_documents/tutorials/Action Recognition Papers", + "entry_id": 4652, + "language_id": "text" + }, + "398": { + "filepath": "/english_documents/tutorials/Spatio-Temporal Action Detection Papers", + "entry_id": 4660, + "language_id": "text" + }, + "399": { + "filepath": "/english_documents/tutorials/TSM.md", + "entry_id": 4668, + "language_id": "markdown" + }, + "400": { + "filepath": "/english_documents/tutorials/Temporal Action Detection Papers", + "entry_id": 4684, + "language_id": "text" + }, + "401": { + "filepath": "/english_documents/tutorials/accelerate.md", + "entry_id": 4692, + "language_id": "markdown" + }, + "402": { + "filepath": "/english_documents/tutorials/config.md", + "entry_id": 4696, + "language_id": "markdown" + }, + "403": { + "filepath": "/english_documents/tutorials/customized_usage.md", + "entry_id": 4706, + "language_id": "markdown" + }, + "404": { + "filepath": "/english_documents/tutorials/demos", + "entry_id": 4710, + "language_id": "text" + }, + "405": { + "filepath": "/english_documents/tutorials/deployment.md", + "entry_id": 4714, + "language_id": "markdown" + }, + "406": { + "filepath": "/english_documents/tutorials/modular_design.md", + "entry_id": 4720, + "language_id": "markdown" + }, + "407": { + "filepath": "/english_documents/tutorials/pp-tsm.md", + "entry_id": 4724, + "language_id": "markdown" + }, + "408": { + "filepath": "/english_documents/tutorials/summarize.md", + "entry_id": 4730, + "language_id": "markdown" + }, + "409": { + "filepath": "/english_documents/usage.md", + "entry_id": 4756, + "language_id": "markdown" + }, + "410": { + "filepath": "/main.py", + "entry_id": 4770, + "language_id": "python" + }, + "411": { + "filepath": "/paddlevideo/__init__.py", + "entry_id": 4782, + "language_id": "python" + }, + "412": { + "filepath": "/paddlevideo/loader/__init__.py", + "entry_id": 4786, + "language_id": "python" + }, + "413": { + "filepath": "/paddlevideo/loader/builder.py", + "entry_id": 4790, + "language_id": "python" + }, + "414": { + "filepath": "/paddlevideo/loader/dali_loader.py", + "entry_id": 4800, + "language_id": "python" + }, + "415": { + "filepath": "/paddlevideo/loader/dataset/MRI.py", + "entry_id": 4820, + "language_id": "python" + }, + "416": { + "filepath": "/paddlevideo/loader/dataset/MRI_SlowFast.py", + "entry_id": 4832, + "language_id": "python" + }, + "417": { + "filepath": "/paddlevideo/loader/dataset/__init__.py", + "entry_id": 4844, + "language_id": "python" + }, + "418": { + "filepath": "/paddlevideo/loader/dataset/actbert_dataset.py", + "entry_id": 4850, + "language_id": "python" + }, + "419": { + "filepath": "/paddlevideo/loader/dataset/asrf_dataset.py", + "entry_id": 4858, + "language_id": "python" + }, + "420": { + "filepath": "/paddlevideo/loader/dataset/ava_dataset.py", + "entry_id": 4868, + "language_id": "python" + }, + "421": { + "filepath": "/paddlevideo/loader/dataset/base.py", + "entry_id": 4890, + "language_id": "python" + }, + "422": { + "filepath": "/paddlevideo/loader/dataset/bmn_dataset.py", + "entry_id": 4898, + "language_id": "python" + }, + "423": { + "filepath": "/paddlevideo/loader/dataset/davis_dataset.py", + "entry_id": 4906, + "language_id": "python" + }, + "424": { + "filepath": "/paddlevideo/loader/dataset/feature.py", + "entry_id": 4922, + "language_id": "python" + }, + "425": { + "filepath": "/paddlevideo/loader/dataset/frame.py", + "entry_id": 4930, + "language_id": "python" + }, + "426": { + "filepath": 
"/paddlevideo/loader/dataset/ms_tcn_dataset.py", + "entry_id": 4946, + "language_id": "python" + }, + "427": { + "filepath": "/paddlevideo/loader/dataset/msrvtt.py", + "entry_id": 4956, + "language_id": "python" + }, + "428": { + "filepath": "/paddlevideo/loader/dataset/oxford.py", + "entry_id": 4974, + "language_id": "python" + }, + "429": { + "filepath": "/paddlevideo/loader/dataset/skeleton.py", + "entry_id": 4980, + "language_id": "python" + }, + "430": { + "filepath": "/paddlevideo/loader/dataset/slowfast_video.py", + "entry_id": 4988, + "language_id": "python" + }, + "431": { + "filepath": "/paddlevideo/loader/dataset/ucf101_skeleton.py", + "entry_id": 5002, + "language_id": "python" + }, + "432": { + "filepath": "/paddlevideo/loader/dataset/ucf24_dataset.py", + "entry_id": 5010, + "language_id": "python" + }, + "433": { + "filepath": "/paddlevideo/loader/dataset/video.py", + "entry_id": 5018, + "language_id": "python" + }, + "434": { + "filepath": "/paddlevideo/loader/pipelines/__init__.py", + "entry_id": 5028, + "language_id": "python" + }, + "435": { + "filepath": "/paddlevideo/loader/pipelines/anet_pipeline.py", + "entry_id": 5038, + "language_id": "python" + }, + "436": { + "filepath": "/paddlevideo/loader/pipelines/augmentations.py", + "entry_id": 5052, + "language_id": "python" + }, + "437": { + "filepath": "/paddlevideo/loader/pipelines/augmentations_ava.py", + "entry_id": 5152, + "language_id": "python" + }, + "438": { + "filepath": "/paddlevideo/loader/pipelines/compose.py", + "entry_id": 5204, + "language_id": "python" + }, + "439": { + "filepath": "/paddlevideo/loader/pipelines/decode.py", + "entry_id": 5212, + "language_id": "python" + }, + "440": { + "filepath": "/paddlevideo/loader/pipelines/decode_image.py", + "entry_id": 5238, + "language_id": "python" + }, + "441": { + "filepath": "/paddlevideo/loader/pipelines/decode_sampler.py", + "entry_id": 5254, + "language_id": "python" + }, + "442": { + "filepath": "/paddlevideo/loader/pipelines/decode_sampler_MRI.py", + "entry_id": 5264, + "language_id": "python" + }, + "443": { + "filepath": "/paddlevideo/loader/pipelines/mix.py", + "entry_id": 5284, + "language_id": "python" + }, + "444": { + "filepath": "/paddlevideo/loader/pipelines/multimodal.py", + "entry_id": 5294, + "language_id": "python" + }, + "445": { + "filepath": "/paddlevideo/loader/pipelines/sample.py", + "entry_id": 5324, + "language_id": "python" + }, + "446": { + "filepath": "/paddlevideo/loader/pipelines/sample_ava.py", + "entry_id": 5354, + "language_id": "python" + }, + "447": { + "filepath": "/paddlevideo/loader/pipelines/sample_ucf24.py", + "entry_id": 5384, + "language_id": "python" + }, + "448": { + "filepath": "/paddlevideo/loader/pipelines/segmentation.py", + "entry_id": 5392, + "language_id": "python" + }, + "449": { + "filepath": "/paddlevideo/loader/pipelines/segmentation_pipline.py", + "entry_id": 5404, + "language_id": "python" + }, + "450": { + "filepath": "/paddlevideo/loader/pipelines/skeleton_pipeline.py", + "entry_id": 5410, + "language_id": "python" + }, + "451": { + "filepath": "/paddlevideo/loader/registry.py", + "entry_id": 5524, + "language_id": "python" + }, + "452": { + "filepath": "/paddlevideo/metrics/ActivityNet/__init__.py", + "entry_id": 5528, + "language_id": "python" + }, + "453": { + "filepath": "/paddlevideo/metrics/ActivityNet/anet_prop.py", + "entry_id": 5532, + "language_id": "python" + }, + "454": { + "filepath": "/paddlevideo/metrics/__init__.py", + "entry_id": 5564, + "language_id": "python" + }, + "455": { + 
"filepath": "/paddlevideo/metrics/ava_evaluation/README.md", + "entry_id": 5570, + "language_id": "markdown" + }, + "456": { + "filepath": "/paddlevideo/metrics/ava_evaluation/metrics.py", + "entry_id": 5574, + "language_id": "python" + }, + "457": { + "filepath": "/paddlevideo/metrics/ava_evaluation/np_box_list.py", + "entry_id": 5588, + "language_id": "python" + }, + "458": { + "filepath": "/paddlevideo/metrics/ava_evaluation/np_box_ops.py", + "entry_id": 5600, + "language_id": "python" + }, + "459": { + "filepath": "/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py", + "entry_id": 5610, + "language_id": "python" + }, + "460": { + "filepath": "/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py", + "entry_id": 5668, + "language_id": "python" + }, + "461": { + "filepath": "/paddlevideo/metrics/ava_evaluation/standard_fields.py", + "entry_id": 5710, + "language_id": "python" + }, + "462": { + "filepath": "/paddlevideo/metrics/ava_metric.py", + "entry_id": 5724, + "language_id": "python" + }, + "463": { + "filepath": "/paddlevideo/metrics/ava_utils.py", + "entry_id": 5734, + "language_id": "python" + }, + "464": { + "filepath": "/paddlevideo/metrics/base.py", + "entry_id": 5764, + "language_id": "python" + }, + "465": { + "filepath": "/paddlevideo/metrics/bmn_metric.py", + "entry_id": 5770, + "language_id": "python" + }, + "466": { + "filepath": "/paddlevideo/metrics/build.py", + "entry_id": 5794, + "language_id": "python" + }, + "467": { + "filepath": "/paddlevideo/metrics/center_crop_metric.py", + "entry_id": 5798, + "language_id": "python" + }, + "468": { + "filepath": "/paddlevideo/metrics/center_crop_metric_MRI.py", + "entry_id": 5806, + "language_id": "python" + }, + "469": { + "filepath": "/paddlevideo/metrics/depth_metric.py", + "entry_id": 5814, + "language_id": "python" + }, + "470": { + "filepath": "/paddlevideo/metrics/msrvtt_metric.py", + "entry_id": 5822, + "language_id": "python" + }, + "471": { + "filepath": "/paddlevideo/metrics/multi_crop_metric.py", + "entry_id": 5830, + "language_id": "python" + }, + "472": { + "filepath": "/paddlevideo/metrics/recall.py", + "entry_id": 5842, + "language_id": "python" + }, + "473": { + "filepath": "/paddlevideo/metrics/registry.py", + "entry_id": 5850, + "language_id": "python" + }, + "474": { + "filepath": "/paddlevideo/metrics/segmentation_metric.py", + "entry_id": 5854, + "language_id": "python" + }, + "475": { + "filepath": "/paddlevideo/metrics/skeleton_metric.py", + "entry_id": 5882, + "language_id": "python" + }, + "476": { + "filepath": "/paddlevideo/metrics/transnetv2_metric.py", + "entry_id": 5892, + "language_id": "python" + }, + "477": { + "filepath": "/paddlevideo/metrics/ucf24_utils.py", + "entry_id": 5906, + "language_id": "python" + }, + "478": { + "filepath": "/paddlevideo/metrics/vos_metric.py", + "entry_id": 5964, + "language_id": "python" + }, + "479": { + "filepath": "/paddlevideo/metrics/youtube8m/average_precision_calculator.py", + "entry_id": 5994, + "language_id": "python" + }, + "480": { + "filepath": "/paddlevideo/metrics/youtube8m/eval_util.py", + "entry_id": 6016, + "language_id": "python" + }, + "481": { + "filepath": "/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py", + "entry_id": 6034, + "language_id": "python" + }, + "482": { + "filepath": "/paddlevideo/metrics/yowo_metric.py", + "entry_id": 6046, + "language_id": "python" + }, + "483": { + "filepath": "/paddlevideo/modeling/__init__.py", + "entry_id": 6054, + "language_id": "python" + }, + "484": { + 
"filepath": "/paddlevideo/modeling/assigners/__init__.py", + "entry_id": 6060, + "language_id": "python" + }, + "485": { + "filepath": "/paddlevideo/modeling/assigners/max_iou_assigner_ava.py", + "entry_id": 6064, + "language_id": "python" + }, + "486": { + "filepath": "/paddlevideo/modeling/backbones/__init__.py", + "entry_id": 6080, + "language_id": "python" + }, + "487": { + "filepath": "/paddlevideo/modeling/backbones/actbert.py", + "entry_id": 6088, + "language_id": "python" + }, + "488": { + "filepath": "/paddlevideo/modeling/backbones/adds.py", + "entry_id": 6186, + "language_id": "python" + }, + "489": { + "filepath": "/paddlevideo/modeling/backbones/agcn.py", + "entry_id": 6268, + "language_id": "python" + }, + "490": { + "filepath": "/paddlevideo/modeling/backbones/agcn2s.py", + "entry_id": 6280, + "language_id": "python" + }, + "491": { + "filepath": "/paddlevideo/modeling/backbones/asrf.py", + "entry_id": 6298, + "language_id": "python" + }, + "492": { + "filepath": "/paddlevideo/modeling/backbones/bmn.py", + "entry_id": 6306, + "language_id": "python" + }, + "493": { + "filepath": "/paddlevideo/modeling/backbones/cfbi.py", + "entry_id": 6330, + "language_id": "python" + }, + "494": { + "filepath": "/paddlevideo/modeling/backbones/ctrgcn.py", + "entry_id": 6340, + "language_id": "python" + }, + "495": { + "filepath": "/paddlevideo/modeling/backbones/darknet.py", + "entry_id": 6378, + "language_id": "python" + }, + "496": { + "filepath": "/paddlevideo/modeling/backbones/deeplab.py", + "entry_id": 6394, + "language_id": "python" + }, + "497": { + "filepath": "/paddlevideo/modeling/backbones/movinet.py", + "entry_id": 6426, + "language_id": "python" + }, + "498": { + "filepath": "/paddlevideo/modeling/backbones/ms_tcn.py", + "entry_id": 6472, + "language_id": "python" + }, + "499": { + "filepath": "/paddlevideo/modeling/backbones/pptsm_mv2.py", + "entry_id": 6484, + "language_id": "python" + }, + "500": { + "filepath": "/paddlevideo/modeling/backbones/pptsm_mv3.py", + "entry_id": 6508, + "language_id": "python" + }, + "501": { + "filepath": "/paddlevideo/modeling/backbones/pptsm_v2.py", + "entry_id": 6540, + "language_id": "python" + }, + "502": { + "filepath": "/paddlevideo/modeling/backbones/resnet.py", + "entry_id": 6570, + "language_id": "python" + }, + "503": { + "filepath": "/paddlevideo/modeling/backbones/resnet3d.py", + "entry_id": 6594, + "language_id": "python" + }, + "504": { + "filepath": "/paddlevideo/modeling/backbones/resnet3d_slowonly.py", + "entry_id": 6644, + "language_id": "python" + }, + "505": { + "filepath": "/paddlevideo/modeling/backbones/resnet_slowfast.py", + "entry_id": 6664, + "language_id": "python" + }, + "506": { + "filepath": "/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py", + "entry_id": 6724, + "language_id": "python" + }, + "507": { + "filepath": "/paddlevideo/modeling/backbones/resnet_tsm.py", + "entry_id": 6784, + "language_id": "python" + }, + "508": { + "filepath": "/paddlevideo/modeling/backbones/resnet_tsm_MRI.py", + "entry_id": 6812, + "language_id": "python" + }, + "509": { + "filepath": "/paddlevideo/modeling/backbones/resnet_tsn_MRI.py", + "entry_id": 6840, + "language_id": "python" + }, + "510": { + "filepath": "/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py", + "entry_id": 6868, + "language_id": "python" + }, + "511": { + "filepath": "/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py", + "entry_id": 6898, + "language_id": "python" + }, + "512": { + "filepath": "/paddlevideo/modeling/backbones/resnext101.py", + 
"entry_id": 6926, + "language_id": "python" + }, + "513": { + "filepath": "/paddlevideo/modeling/backbones/stgcn.py", + "entry_id": 6942, + "language_id": "python" + }, + "514": { + "filepath": "/paddlevideo/modeling/backbones/swin_transformer.py", + "entry_id": 6968, + "language_id": "python" + }, + "515": { + "filepath": "/paddlevideo/modeling/backbones/toshift_vit.py", + "entry_id": 7024, + "language_id": "python" + }, + "516": { + "filepath": "/paddlevideo/modeling/backbones/transnetv2.py", + "entry_id": 7054, + "language_id": "python" + }, + "517": { + "filepath": "/paddlevideo/modeling/backbones/vit.py", + "entry_id": 7110, + "language_id": "python" + }, + "518": { + "filepath": "/paddlevideo/modeling/backbones/vit_tweaks.py", + "entry_id": 7146, + "language_id": "python" + }, + "519": { + "filepath": "/paddlevideo/modeling/backbones/yowo.py", + "entry_id": 7184, + "language_id": "python" + }, + "520": { + "filepath": "/paddlevideo/modeling/bbox_utils.py", + "entry_id": 7198, + "language_id": "python" + }, + "521": { + "filepath": "/paddlevideo/modeling/builder.py", + "entry_id": 7232, + "language_id": "python" + }, + "522": { + "filepath": "/paddlevideo/modeling/framework/__init__.py", + "entry_id": 7242, + "language_id": "python" + }, + "523": { + "filepath": "/paddlevideo/modeling/framework/detectors/__init__.py", + "entry_id": 7248, + "language_id": "python" + }, + "524": { + "filepath": "/paddlevideo/modeling/framework/detectors/base.py", + "entry_id": 7252, + "language_id": "python" + }, + "525": { + "filepath": "/paddlevideo/modeling/framework/detectors/fast_rcnn.py", + "entry_id": 7258, + "language_id": "python" + }, + "526": { + "filepath": "/paddlevideo/modeling/framework/detectors/two_stage.py", + "entry_id": 7264, + "language_id": "python" + }, + "527": { + "filepath": "/paddlevideo/modeling/framework/estimators/__init__.py", + "entry_id": 7280, + "language_id": "python" + }, + "528": { + "filepath": "/paddlevideo/modeling/framework/estimators/base.py", + "entry_id": 7284, + "language_id": "python" + }, + "529": { + "filepath": "/paddlevideo/modeling/framework/estimators/depth_estimator.py", + "entry_id": 7292, + "language_id": "python" + }, + "530": { + "filepath": "/paddlevideo/modeling/framework/localizers/__init__.py", + "entry_id": 7300, + "language_id": "python" + }, + "531": { + "filepath": "/paddlevideo/modeling/framework/localizers/base.py", + "entry_id": 7304, + "language_id": "python" + }, + "532": { + "filepath": "/paddlevideo/modeling/framework/localizers/bmn_localizer.py", + "entry_id": 7312, + "language_id": "python" + }, + "533": { + "filepath": "/paddlevideo/modeling/framework/localizers/yowo_localizer.py", + "entry_id": 7318, + "language_id": "python" + }, + "534": { + "filepath": "/paddlevideo/modeling/framework/localizers/yowo_utils.py", + "entry_id": 7332, + "language_id": "python" + }, + "535": { + "filepath": "/paddlevideo/modeling/framework/multimodal/__init__.py", + "entry_id": 7362, + "language_id": "python" + }, + "536": { + "filepath": "/paddlevideo/modeling/framework/multimodal/actbert.py", + "entry_id": 7366, + "language_id": "python" + }, + "537": { + "filepath": "/paddlevideo/modeling/framework/multimodal/base.py", + "entry_id": 7374, + "language_id": "python" + }, + "538": { + "filepath": "/paddlevideo/modeling/framework/partitioners/__init__.py", + "entry_id": 7382, + "language_id": "python" + }, + "539": { + "filepath": "/paddlevideo/modeling/framework/partitioners/base.py", + "entry_id": 7386, + "language_id": "python" + }, + "540": { + 
"filepath": "/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py", + "entry_id": 7394, + "language_id": "python" + }, + "541": { + "filepath": "/paddlevideo/modeling/framework/recognizers/__init__.py", + "entry_id": 7402, + "language_id": "python" + }, + "542": { + "filepath": "/paddlevideo/modeling/framework/recognizers/base.py", + "entry_id": 7408, + "language_id": "python" + }, + "543": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer1d.py", + "entry_id": 7416, + "language_id": "python" + }, + "544": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer2d.py", + "entry_id": 7426, + "language_id": "python" + }, + "545": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer3d.py", + "entry_id": 7434, + "language_id": "python" + }, + "546": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py", + "entry_id": 7442, + "language_id": "python" + }, + "547": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py", + "entry_id": 7450, + "language_id": "python" + }, + "548": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizerMRI.py", + "entry_id": 7470, + "language_id": "python" + }, + "549": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py", + "entry_id": 7478, + "language_id": "python" + }, + "550": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py", + "entry_id": 7486, + "language_id": "python" + }, + "551": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py", + "entry_id": 7494, + "language_id": "python" + }, + "552": { + "filepath": "/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py", + "entry_id": 7504, + "language_id": "python" + }, + "553": { + "filepath": "/paddlevideo/modeling/framework/segment/__init__.py", + "entry_id": 7514, + "language_id": "python" + }, + "554": { + "filepath": "/paddlevideo/modeling/framework/segment/base.py", + "entry_id": 7518, + "language_id": "python" + }, + "555": { + "filepath": "/paddlevideo/modeling/framework/segment/cfbi.py", + "entry_id": 7526, + "language_id": "python" + }, + "556": { + "filepath": "/paddlevideo/modeling/framework/segment/utils.py", + "entry_id": 7556, + "language_id": "python" + }, + "557": { + "filepath": "/paddlevideo/modeling/framework/segmenters/__init__.py", + "entry_id": 7622, + "language_id": "python" + }, + "558": { + "filepath": "/paddlevideo/modeling/framework/segmenters/asrf.py", + "entry_id": 7626, + "language_id": "python" + }, + "559": { + "filepath": "/paddlevideo/modeling/framework/segmenters/base.py", + "entry_id": 7638, + "language_id": "python" + }, + "560": { + "filepath": "/paddlevideo/modeling/framework/segmenters/ms_tcn.py", + "entry_id": 7648, + "language_id": "python" + }, + "561": { + "filepath": "/paddlevideo/modeling/framework/segmenters/utils.py", + "entry_id": 7656, + "language_id": "python" + }, + "562": { + "filepath": "/paddlevideo/modeling/heads/__init__.py", + "entry_id": 7680, + "language_id": "python" + }, + "563": { + "filepath": "/paddlevideo/modeling/heads/adds_head.py", + "entry_id": 7686, + "language_id": "python" + }, + "564": { + "filepath": "/paddlevideo/modeling/heads/agcn2s_head.py", + "entry_id": 7700, + "language_id": "python" + }, + "565": { + "filepath": "/paddlevideo/modeling/heads/asrf_head.py", + "entry_id": 7708, + "language_id": "python" + }, + "566": { + "filepath": 
"/paddlevideo/modeling/heads/attention_lstm_head.py", + "entry_id": 7724, + "language_id": "python" + }, + "567": { + "filepath": "/paddlevideo/modeling/heads/base.py", + "entry_id": 7750, + "language_id": "python" + }, + "568": { + "filepath": "/paddlevideo/modeling/heads/bbox_head.py", + "entry_id": 7766, + "language_id": "python" + }, + "569": { + "filepath": "/paddlevideo/modeling/heads/cfbi_head.py", + "entry_id": 7788, + "language_id": "python" + }, + "570": { + "filepath": "/paddlevideo/modeling/heads/ctrgcn_head.py", + "entry_id": 7820, + "language_id": "python" + }, + "571": { + "filepath": "/paddlevideo/modeling/heads/i3d_head.py", + "entry_id": 7828, + "language_id": "python" + }, + "572": { + "filepath": "/paddlevideo/modeling/heads/movinet_head.py", + "entry_id": 7838, + "language_id": "python" + }, + "573": { + "filepath": "/paddlevideo/modeling/heads/ms_tcn_head.py", + "entry_id": 7842, + "language_id": "python" + }, + "574": { + "filepath": "/paddlevideo/modeling/heads/pptimesformer_head.py", + "entry_id": 7854, + "language_id": "python" + }, + "575": { + "filepath": "/paddlevideo/modeling/heads/pptsm_head.py", + "entry_id": 7862, + "language_id": "python" + }, + "576": { + "filepath": "/paddlevideo/modeling/heads/pptsn_head.py", + "entry_id": 7872, + "language_id": "python" + }, + "577": { + "filepath": "/paddlevideo/modeling/heads/roi_extractor.py", + "entry_id": 7882, + "language_id": "python" + }, + "578": { + "filepath": "/paddlevideo/modeling/heads/roi_head.py", + "entry_id": 7888, + "language_id": "python" + }, + "579": { + "filepath": "/paddlevideo/modeling/heads/single_straight3d.py", + "entry_id": 7904, + "language_id": "python" + }, + "580": { + "filepath": "/paddlevideo/modeling/heads/slowfast_head.py", + "entry_id": 7912, + "language_id": "python" + }, + "581": { + "filepath": "/paddlevideo/modeling/heads/stgcn_head.py", + "entry_id": 7924, + "language_id": "python" + }, + "582": { + "filepath": "/paddlevideo/modeling/heads/timesformer_head.py", + "entry_id": 7930, + "language_id": "python" + }, + "583": { + "filepath": "/paddlevideo/modeling/heads/token_shift_head.py", + "entry_id": 7938, + "language_id": "python" + }, + "584": { + "filepath": "/paddlevideo/modeling/heads/transnetv2_head.py", + "entry_id": 7946, + "language_id": "python" + }, + "585": { + "filepath": "/paddlevideo/modeling/heads/tsm_head.py", + "entry_id": 7952, + "language_id": "python" + }, + "586": { + "filepath": "/paddlevideo/modeling/heads/tsn_head.py", + "entry_id": 7962, + "language_id": "python" + }, + "587": { + "filepath": "/paddlevideo/modeling/losses/__init__.py", + "entry_id": 7970, + "language_id": "python" + }, + "588": { + "filepath": "/paddlevideo/modeling/losses/actbert_loss.py", + "entry_id": 7976, + "language_id": "python" + }, + "589": { + "filepath": "/paddlevideo/modeling/losses/asrf_loss.py", + "entry_id": 7984, + "language_id": "python" + }, + "590": { + "filepath": "/paddlevideo/modeling/losses/base.py", + "entry_id": 8012, + "language_id": "python" + }, + "591": { + "filepath": "/paddlevideo/modeling/losses/bmn_loss.py", + "entry_id": 8018, + "language_id": "python" + }, + "592": { + "filepath": "/paddlevideo/modeling/losses/cross_entropy_loss.py", + "entry_id": 8034, + "language_id": "python" + }, + "593": { + "filepath": "/paddlevideo/modeling/losses/depth_loss.py", + "entry_id": 8040, + "language_id": "python" + }, + "594": { + "filepath": "/paddlevideo/modeling/losses/distillation_loss.py", + "entry_id": 8062, + "language_id": "python" + }, + "595": { + 
"filepath": "/paddlevideo/modeling/losses/transnetv2_loss.py", + "entry_id": 8070, + "language_id": "python" + }, + "596": { + "filepath": "/paddlevideo/modeling/losses/yowo_loss.py", + "entry_id": 8078, + "language_id": "python" + }, + "597": { + "filepath": "/paddlevideo/modeling/registry.py", + "entry_id": 8104, + "language_id": "python" + }, + "598": { + "filepath": "/paddlevideo/modeling/samplers/__init__.py", + "entry_id": 8110, + "language_id": "python" + }, + "599": { + "filepath": "/paddlevideo/modeling/samplers/random_sampler.py", + "entry_id": 8114, + "language_id": "python" + }, + "600": { + "filepath": "/paddlevideo/modeling/weight_init.py", + "entry_id": 8128, + "language_id": "python" + }, + "601": { + "filepath": "/paddlevideo/solver/__init__.py", + "entry_id": 8142, + "language_id": "python" + }, + "602": { + "filepath": "/paddlevideo/solver/custom_lr.py", + "entry_id": 8146, + "language_id": "python" + }, + "603": { + "filepath": "/paddlevideo/solver/lr.py", + "entry_id": 8174, + "language_id": "python" + }, + "604": { + "filepath": "/paddlevideo/solver/optimizer.py", + "entry_id": 8180, + "language_id": "python" + }, + "605": { + "filepath": "/paddlevideo/tasks/__init__.py", + "entry_id": 8194, + "language_id": "python" + }, + "606": { + "filepath": "/paddlevideo/tasks/test.py", + "entry_id": 8198, + "language_id": "python" + }, + "607": { + "filepath": "/paddlevideo/tasks/train.py", + "entry_id": 8206, + "language_id": "python" + }, + "608": { + "filepath": "/paddlevideo/tasks/train_dali.py", + "entry_id": 8244, + "language_id": "python" + }, + "609": { + "filepath": "/paddlevideo/tasks/train_multigrid.py", + "entry_id": 8258, + "language_id": "python" + }, + "610": { + "filepath": "/paddlevideo/utils/__init__.py", + "entry_id": 8284, + "language_id": "python" + }, + "611": { + "filepath": "/paddlevideo/utils/build_utils.py", + "entry_id": 8288, + "language_id": "python" + }, + "612": { + "filepath": "/paddlevideo/utils/config.py", + "entry_id": 8294, + "language_id": "python" + }, + "613": { + "filepath": "/paddlevideo/utils/dist_utils.py", + "entry_id": 8308, + "language_id": "python" + }, + "614": { + "filepath": "/paddlevideo/utils/logger.py", + "entry_id": 8312, + "language_id": "python" + }, + "615": { + "filepath": "/paddlevideo/utils/multigrid/__init__.py", + "entry_id": 8322, + "language_id": "python" + }, + "616": { + "filepath": "/paddlevideo/utils/multigrid/batchnorm_helper.py", + "entry_id": 8326, + "language_id": "python" + }, + "617": { + "filepath": "/paddlevideo/utils/multigrid/interval_helper.py", + "entry_id": 8340, + "language_id": "python" + }, + "618": { + "filepath": "/paddlevideo/utils/multigrid/multigrid.py", + "entry_id": 8344, + "language_id": "python" + }, + "619": { + "filepath": "/paddlevideo/utils/multigrid/save_load_helper.py", + "entry_id": 8366, + "language_id": "python" + }, + "620": { + "filepath": "/paddlevideo/utils/multigrid/short_sampler.py", + "entry_id": 8386, + "language_id": "python" + }, + "621": { + "filepath": "/paddlevideo/utils/precise_bn.py", + "entry_id": 8400, + "language_id": "python" + }, + "622": { + "filepath": "/paddlevideo/utils/profiler.py", + "entry_id": 8410, + "language_id": "python" + }, + "623": { + "filepath": "/paddlevideo/utils/record.py", + "entry_id": 8422, + "language_id": "python" + }, + "624": { + "filepath": "/paddlevideo/utils/registry.py", + "entry_id": 8438, + "language_id": "python" + }, + "625": { + "filepath": "/paddlevideo/utils/save_load.py", + "entry_id": 8446, + "language_id": "python" + 
}, + "626": { + "filepath": "/paddlevideo/version.py", + "entry_id": 8472, + "language_id": "python" + }, + "627": { + "filepath": "/run.sh", + "entry_id": 8476, + "language_id": "shell" + }, + "628": { + "filepath": "/setup.py", + "entry_id": 8488, + "language_id": "python" + }, + "629": { + "filepath": "/test_tipc/README.md", + "entry_id": 8496, + "language_id": "plain-text" + }, + "630": { + "filepath": "/test_tipc/benchmark_train.sh", + "entry_id": 8510, + "language_id": "shell" + }, + "631": { + "filepath": "/test_tipc/common_func.sh", + "entry_id": 8536, + "language_id": "shell" + }, + "632": { + "filepath": "/test_tipc/compare_results.py", + "entry_id": 8542, + "language_id": "python" + }, + "633": { + "filepath": "/test_tipc/extract_loss.py", + "entry_id": 8558, + "language_id": "python" + }, + "634": { + "filepath": "/test_tipc/prepare.sh", + "entry_id": 8566, + "language_id": "shell" + }, + "635": { + "filepath": "/test_tipc/test_inference_cpp.sh", + "entry_id": 8620, + "language_id": "shell" + }, + "636": { + "filepath": "/test_tipc/test_paddle2onnx.sh", + "entry_id": 8642, + "language_id": "shell" + }, + "637": { + "filepath": "/test_tipc/test_ptq_inference_python.sh", + "entry_id": 8652, + "language_id": "shell" + }, + "638": { + "filepath": "/test_tipc/test_serving_infer_cpp.sh", + "entry_id": 8668, + "language_id": "shell" + }, + "639": { + "filepath": "/test_tipc/test_serving_infer_python.sh", + "entry_id": 8680, + "language_id": "shell" + }, + "640": { + "filepath": "/test_tipc/test_train_dy2static_python.sh", + "entry_id": 8690, + "language_id": "shell" + }, + "641": { + "filepath": "/test_tipc/test_train_inference_python.sh", + "entry_id": 8698, + "language_id": "shell" + }, + "642": { + "filepath": "/test_tipc/test_train_inference_python_npu.sh", + "entry_id": 8742, + "language_id": "shell" + }, + "643": { + "filepath": "/test_tipc/test_train_inference_python_xpu.sh", + "entry_id": 8748, + "language_id": "shell" + }, + "644": { + "filepath": "/tools/__init__.py", + "entry_id": 8754, + "language_id": "python" + }, + "645": { + "filepath": "/tools/ava_predict.py", + "entry_id": 8758, + "language_id": "python" + }, + "646": { + "filepath": "/tools/export_model.py", + "entry_id": 8792, + "language_id": "python" + }, + "647": { + "filepath": "/tools/predict.py", + "entry_id": 8812, + "language_id": "python" + }, + "648": { + "filepath": "/tools/summary.py", + "entry_id": 8838, + "language_id": "python" + }, + "649": { + "filepath": "/tools/utils.py", + "entry_id": 8846, + "language_id": "python" + }, + "650": { + "filepath": "/tools/wheel.py", + "entry_id": 8964, + "language_id": "python" + } + }, + "project_name": "PaddleVideo", + "split_count": 90 +} \ No newline at end of file diff --git a/docs/metadata_title.json b/docs/metadata_title.json new file mode 100644 index 000000000..c8054d78f --- /dev/null +++ b/docs/metadata_title.json @@ -0,0 +1 @@ +{"split_count": 15} \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 000000000..02313e39c --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,3919 @@ + + + + + + + + https://james4ever0.github.io/PaddleVideo?q=/MANIFEST.in + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/README_en.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + 
https://james4ever0.github.io/PaddleVideo?q=/applications/AbnormalActionDetection/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/Anti-UAV/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/Anti-UAV/get_image_label.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/action.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/logger.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/mfcc/model_config.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/models/audio_infer.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/models/bmn_infer.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/models/lstm_infer.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/reader/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/reader/audio_reader.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/reader/feature_reader.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/reader/reader_utils.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/utils/config_utils.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/utils/preprocess.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/action_detect/utils/process_result.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/eval.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + 
https://james4ever0.github.io/PaddleVideo?q=/applications/BasketballAction/predict/predict.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/README.MD + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/api.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/main.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py + 2023-12-28T09:21:02+00:00 + 
1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/config.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/record.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/paddlevideo/version.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/setup.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/EIVideo/version.py + 
2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/build_gui.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/gui/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/gui/demo.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/gui/ui_main_window.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/start.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/tools/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/ui/__init__.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/ui/demo.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/version.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/QEIVideo/widget/PaintBoard.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/resources/QT/demo.ui + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/EIVideo/resources/cmd + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FightRecognition/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FigureSkating/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/checkpoints/download.sh + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/datasets/EuroCup2016/dataset_url.list + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/datasets/EuroCup2016/url.list + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/datasets/EuroCup2016/url_val.list + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/datasets/script/get_frames_pcm.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/datasets/script/get_instance_for_bmn.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/applications/FootballAction/datasets/script/get_instance_for_lstm.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + 
[Sitemap <url> entries, one per repository file, each of the form
<loc>https://james4ever0.github.io/PaddleVideo?q=/<file path></loc> with
<lastmod>2023-12-28T09:21:02+00:00</lastmod> and <priority>1.00</priority>.
This portion covers the files under applications/ (FootballAction, Ma-Net,
MultimodalVideoTag, PP-Care, PPHuman, T2VLAD, TableTennis,
VideoQualityAssessment, VideoTag), benchmark/TimeSformer/, data/, deploy/,
english_documents/, main.py, paddlevideo/, run.sh, setup.py, test_tipc/,
and tools/.]
2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/tools/export_model.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/tools/predict.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/tools/summary.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/tools/utils.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo?q=/tools/wheel.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/PaddleVideo/tree.html?full=true + 2023-12-28T09:21:02+00:00 + 1.00 + + + \ No newline at end of file diff --git a/docs/src/MANIFEST.in b/docs/src/MANIFEST.in new file mode 100644 index 000000000..3bfb26412 --- /dev/null +++ b/docs/src/MANIFEST.in @@ -0,0 +1,9 @@ +include LICENSE +include README.md +include tools/__init__.py +include tools/utils.py +include tools/ava_predict.py +include tools/wheel.py +include data/k400/Kinetics-400_label_list.txt + +recursive-include paddlevideo/ *.py *.txt diff --git a/docs/src/README.md b/docs/src/README.md new file mode 100644 index 000000000..733eb54af --- /dev/null +++ b/docs/src/README.md @@ -0,0 +1,75 @@ +[English](README_en.md) | 中文 + +# PaddleVideo + +![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![paddle version](https://img.shields.io/badge/PaddlePaddle-2.3.1-blue) + +## 简介 + +PaddleVideo旨在打造一套丰富、领先且实用的Video工具库,旨在帮助开发者更好的进行视频领域的学术研究和产业实践。 + +
    + +## 近期更新 + +- 开源视频标注工具🌟[BILS](./docs/zh-CN/annotation_tools.md),欢迎下载安装包体验~ +- 发布轻量化行为识别模型**🔥[PP-TSMv2](./docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md)**, Kinetics-400精度75.16%,25fps的10s视频cpu推理时间仅需456ms.各模型性能对比[benchmark](./docs/zh-CN/benchmark.md). +- 新增[知识蒸馏](./docs/zh-CN/distillation.md)功能. +- 新增基于transformer的行为识别模型[TokenShift](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md). +- 新增基于骨骼点的行为识别模型[2s-ACGN](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/agcn2s.md)、[CTR-GCN](./docs/zh-CN/model_zoo/recognition/ctrgcn.md). +- 新增单阶段时空动作检测模型[YOWO](./docs/zh-CN/model_zoo/localization/yowo.md). + + +👀 🌟 **《产业级视频技术与应用案例》系列课程回放链接**: https://aistudio.baidu.com/aistudio/course/introduce/6742 🌟 + +​ 💖 **欢迎大家扫码入群讨论** 💖 +
+ [微信交流群二维码]
    + +- 添加成功后回复【视频】加入交流群 + +## 特性 + +支持多种Video相关前沿算法,在此基础上打造产业级特色模型[PP-TSM](docs/zh-CN/model_zoo/recognition/pp-tsm.md)和[PP-TSMv2](docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md),并打通数据生产、模型训练、压缩、预测部署全流程。 + +
    + +## 快速开始 + +- 一行命令快速使用: [快速开始](./docs/zh-CN/quick_start.md) + +## 场景应用 + +PaddleVideo场景应用覆盖体育、互联网、工业、医疗行业,在PP-TSM的基础能力之上,以案例的形式展示利用场景数据微调、模型优化方法、数据增广等内容,为开发者实际落地提供示范与启发。详情可查看[应用](./applications/)。 + +## 文档教程 + +- [快速开始](./docs/zh-CN/quick_start.md) +- [安装说明](./docs/zh-CN/install.md) +- [训练/测试/推理全流程使用指南](./docs/zh-CN/usage.md) +- [PP-TSM行为识别🔥](./docs/zh-CN/model_zoo/recognition/pp-tsm.md) + - [模型库](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#7) + - [模型训练](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#4) + - [模型压缩](./deploy/slim/) + - [模型量化](./deploy/slim/readme.md) + - [知识蒸馏](./docs/zh-CN/distillation.md) + - [推理部署](./deploy/) + - [基于Python预测引擎推理](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#62) + - [基于C++预测引擎推理](./deploy/cpp_infer/readme.md) + - [服务端部署](./deploy/python_serving/readme.md) + - [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md) + - [Benchmark](./docs/zh-CN/benchmark.md) +- [前沿算法与模型](./docs/zh-CN/model_zoo/README.md)🚀 +- [数据集](./docs/zh-CN/dataset/README.md) +- [场景应用](./applications/README.md) +- [数据标注](./docs/zh-CN/annotation_tools.md) +- [赛事支持](./docs/zh-CN/competition.md) +- [贡献代码](./docs/zh-CN/contribute/README.md) + +## 许可证书 + +本项目的发布受[Apache 2.0 license](LICENSE)许可认证。 diff --git a/docs/src/README_en.md b/docs/src/README_en.md new file mode 100644 index 000000000..61a335a82 --- /dev/null +++ b/docs/src/README_en.md @@ -0,0 +1,65 @@ +[简体中文](README.md) | English + +# PaddleVideo + +![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![paddle version](https://img.shields.io/badge/PaddlePaddle-2.0-blue) + +## Introduction + +PaddleVideo is a toolset for video tasks prepared for the industry and academia. This repository provides examples and best practice guildelines for exploring deep learning algorithm in the scene of video area. + +
+ +## Update: + +- Release **🔥[PP-TSMv2](./docs/zh-CN/model_zoo/recognition/pp-tsm.md)**, a lightweight action recognition model: top-1 accuracy on Kinetics-400 is 74.38%, and CPU inference on a 10s, 25fps video takes only 433ms. See the [benchmark](./docs/zh-CN/benchmark.md). +- Add the [Knowledge Distillation](./docs/zh-CN/distillation.md) framework code. +- Add the [TokenShift](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md), [2s-AGCN](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/agcn2s.md) and [CTR-GCN](./docs/zh-CN/model_zoo/recognition/ctrgcn.md) models. + + 💖 **Welcome to scan the QR code and join the group discussion** 💖 + +
+ [WeChat group QR code]
+ +- Scan the QR code below with your WeChat and reply "video" to join the official technical exchange group. We look forward to your participation. + +## Features +PaddleVideo supports a variety of cutting-edge video algorithms. On top of them, it develops the industrial featured models/solutions [PP-TSM](docs/zh-CN/model_zoo/recognition/pp-tsm.md) and [PP-TSMv2](docs/zh-CN/model_zoo/recognition/pp-tsm.md), and covers the whole pipeline of data production, model training, compression, inference and deployment. + +
    + +## Quick Start + +- One line of code quick use: [Quick Start](./docs/zh-CN/quick_start.md) + +## Tutorials + + +- [Quick Start](./docs/zh-CN/quick_start.md) +- [Installation](./docs/zh-CN/install.md) +- [Usage](./docs/zh-CN/usage.md) +- [PP-TSM🔥](./docs/zh-CN/model_zoo/recognition/pp-tsm.md) + - [Model Zoo](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#7) + - [Model training](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#4) + - [Model Compression](./deploy/slim/) + - [Model Quantization](./deploy/slim/readme.md) + - [Knowledge Distillation](./docs/zh-CN/distillation.md) + - [Inference and Deployment](./deploy/) + - [Python Inference](./docs/zh-CN/model_zoo/recognition/pp-tsm.md#62) + - [C++ Inference](./deploy/cpp_infer/readme.md) + - [Serving](./deploy/python_serving/readme.md) + - [Paddle2ONNX](./deploy/paddle2onnx/readme.md) + - [Benchmark](./docs/zh-CN/benchmark.md) +- [Academic algorithms](./docs/en/model_zoo/README.md)🚀 +- [Datasets](./docs/en/dataset/README.md) +- [Data Annotation](./applications/BILS) +- [Contribute](./docs/zh-CN/contribute/README.md) + +## License + +PaddleVideo is released under the [Apache 2.0 license](LICENSE). diff --git a/docs/src/__init__.py b/docs/src/__init__.py new file mode 100644 index 000000000..c4f28815b --- /dev/null +++ b/docs/src/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['PaddleVideo'] +from .tools import PaddleVideo \ No newline at end of file diff --git a/docs/src/applications/AbnormalActionDetection/README.md b/docs/src/applications/AbnormalActionDetection/README.md new file mode 100644 index 000000000..77935131c --- /dev/null +++ b/docs/src/applications/AbnormalActionDetection/README.md @@ -0,0 +1,153 @@ +# 异常行为识别 + +## 内容 +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型评估](#模型评估) +- [模型推理](#模型推理) +- [模型部署](#模型部署) +- [参考论文](#参考论文) + + +## 模型简介 +该代码库用于异常行为检测, 基于paddle2.2版本开发,结合PaddleVideo中的SlowFast+FasterRCNN模型实现7个异常行为的检测。 +主要框架如下: +
+ [SlowFast + FasterRCNN 异常行为检测框架示意图]
    + +AIStudio项目: [基于时空信息的异常行为检测](https://aistudio.baidu.com/aistudio/projectdetail/3431613) + +## 数据准备 + +### Step1 稀疏抽取视频帧 +首先稀疏抽取视频帧用于检测每帧中人的位置: + +``` +cd data/ava/script && bash extract_video_frames.sh abnormal_action_videos abnormal_action_frames 2 +``` + +* 第一个参数abnormal_action_videos:被抽帧的视频根目录; +* 第二个参数abnormal_action_frames:抽取的视频帧存放目录; +* 第三个参数2:抽帧帧率。 + +### Step2 目标检测 +用成熟的可检测人的目标检测模型检测上述步骤抽得的视频帧中的人。如PaddleDetection套件中的基于coco数据集训练得到的[PP-YOLOv2](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/ppyolo)模型。 + +### Step3 生成pkl文件 +将上述步骤得到的每个视频帧的检测结果进行转化,得到SlowFast_FasterRCNN模型需要的输入格式。注意我们只需要人的检测结果,其他目标不需要。 +SlowFast_FasterRCNN模型需要的proposals是pkl格式文件,该文件以字典形式存储检测结果,字典的key是视频帧的索引(video_id+frame_id拼接得到),value是一个list,每个元素是检测得到的人的位置信息和置信度。 + +``` +{ + 打架,0001: + [[0.036 0.098 0.55 0.979 0.995518] # x1,y1,x2,y2,score + [0.443 0.04 0.99 0.989 0.977824]] +} +``` + +### Step4 密集抽取视频帧 +对视频数据进行密集抽帧。 +SlowFast_FasterRCNN输入的视频帧是密集帧,因此需要再次对视频进行抽帧。具体命令如下: +``` +cd data/ava/script && bash extract_video_frames.sh abnormal_action_videos abnormal_action_frames_30fps 30 +``` + +具体参数同步骤1,只不过次数抽帧率为30fps。 + +### Step5 准备标签数据 +标签数据以pbtxt文件个数存储,本案例具体如下(注意行为标签id从1开始): +``` +item { + name: "挥棍" + id: 1 +} +item { + name: "打架" + id: 2 +} +item { + name: "踢东西" + id: 3 +} +item { + name: "追逐" + id: 4 +} +item { + name: "争吵" + id: 5 +} +item { + name: "快速奔跑" + id: 6 +} +item { + name: "摔倒" + id: 7 +} +``` + +## 模型训练 +异常行为检测模型基于在AVA数据集上训练得到模型进行迁移学习。具体训练命令如下: +``` +python main.py --validate -w AVA_SlowFast_FastRcnn_best.pdparams \ + -c configs/abnoraml_action.yaml +``` + + - w 预训练模型路径 + - c 配置文件路径 + +## 模型评估 +``` +python main.py --test \ + -w abnormal_action_SlowFast_FastRcnn.pdparams \ + -c configs/abnoraml_action.yaml +``` + +## 模型推理 +基于动态图的推理: +``` +python tools/ava_predict.py \ + -c configs/abnoraml_action.yaml \ + -w abnormal_action_SlowFast_FastRcnn.pdparams \ + --video_path data/wave_9.mp4 \ + --detection_model_name 'faster_rcnn/faster_rcnn_r50_fpn_1x_coco' \ + --detection_model_weights 'faster_rcnn_r50_fpn_1x_coco.pdparams' +``` + +- video_path 视频路径 +- detection_model_name 检测模型名称 +- detection_model_weights 检测模型权重路径 + +基于静态图模型进行推理: + +导出模型,动态图模型转换为静态图模型: + +``` +python tools/export_model.py \ + -c configs/abnoraml_action.yaml \ + -o inference_output \ + -p abnormal_action_SlowFast_FastRcnn.pdparams +``` + +- o 导出模型存放文件夹 +- p 被导出模型路径 + +基于导出的模型做推理: +``` +python tools/predict.py \ + -c configs/abnoraml_action.yaml \ + --input_file "data/wave_9.mp4" \ + --model_file "inference_output/abnormal_action_SlowFast_FastRcnn.pdmodel" \ + --params_file "inference_output/abnormal_action_SlowFast_FastRcnn.pdiparams" \ + --use_gpu=True \ + --use_tensorrt=False +``` + +## 模型部署 +请参考[Paddle Inference示例](https://paddle-inference.readthedocs.io/en/latest/quick_start/python_demo.html) + +## 参考论文 +- [SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf), Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, Kaiming He diff --git a/docs/src/applications/Anti-UAV/README.md b/docs/src/applications/Anti-UAV/README.md new file mode 100644 index 000000000..c55903c4a --- /dev/null +++ b/docs/src/applications/Anti-UAV/README.md @@ -0,0 +1,39 @@ +# Paddle-Anti-UAV +Anti-UAV base on PaddleDetection + +## Background +UAVs are very popular and we can see them in many public spaces, such as parks and playgrounds. Most people use UAVs for taking photos. +However, many areas like airport forbiden UAVs since they are potentially dangerous. In this case, we need to detect the flying UAVs in +these areas. 
+ +In this repository, we show how to train a detection model using [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection). + +## Data preparation +The dataset can be found [here](https://anti-uav.github.io/dataset/). We directly download the ```test-dev``` split, composed of 140 videos, +to train the detection model. +* Download the ```test-dev``` dataset. +* Run `unzip Anti_UAV_test_dev.zip -d Anti_UAV`. +* Run `python get_image_label.py`. In this step, you may change the path to the videos and the value of `interval`. + +After the above steps, you will get an MSCOCO-style dataset for object detection. + +## Install PaddleDetection +Please refer to this [link](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/docs/tutorials/INSTALL.md). + +We use `python=3.7`, `Paddle=2.2.1`, `CUDA=10.2`. + +## Train PP-YOLO +We use [PP-YOLO](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.3/configs/ppyolo) as the detector. +* Run `git clone https://github.com/PaddlePaddle/PaddleDetection.git`. Note that this step is already done when you install PaddleDetection. +* Move the anti-UAV dataset to `dataset`. +* Move `anti_uav.yml` to `configs/datasets`, move `ppyolo_r50vd_dcn_1x_antiuav.yml` to `configs/ppyolo` and move `ppyolo_r50vd_dcn_antiuav.yml` +to `configs/ppyolo/_base`. +* Keep the value of `anchors` in `configs/ppyolo/_base/ppyolo_reader.yml` the same as in `ppyolo_r50vd_dcn_antiuav.yml`. +* Run `python -m paddle.distributed.launch --log_dir=./ppyolo_dygraph/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_antiuav.yml &>ppyolo_dygraph.log 2>&1 &`. +Note that you may change the arguments, such as `batch_size` and `gpus`. + +## Inference +Please refer to the inference section of this [page](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/docs/tutorials/GETTING_STARTED.md). You can simply switch the configuration file and trained model to your own files.
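+ +A typical command has the following form. This is only a sketch: the config file is the one named in this README, while the `weights` path and the test image are placeholders that you should replace with your own files.
+```
+# Visualize detections of the trained anti-UAV PP-YOLO model on a single image (paths are placeholders).
+python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_antiuav.yml \
+    -o weights=output/ppyolo_r50vd_dcn_1x_antiuav/model_final \
+    --infer_img=path/to/your_image.jpg
+```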
+ +![](https://github.com/qingzwang/Paddle-Anti-UAV/blob/main/demo1.gif) +![](https://github.com/qingzwang/Paddle-Anti-UAV/blob/main/demo.gif) diff --git a/docs/src/applications/Anti-UAV/get_image_label.py b/docs/src/applications/Anti-UAV/get_image_label.py new file mode 100644 index 000000000..f084d3010 --- /dev/null +++ b/docs/src/applications/Anti-UAV/get_image_label.py @@ -0,0 +1,164 @@ +import cv2 +import os +import json + +# please change it to your path +path = '/workspace/wangqingzhong/Anti_UAV' +annotation_path = 'annotations' +train_img_path = 'train_imgs' +val_img_path = 'val_imgs' +if not os.path.exists(annotation_path): + os.makedirs(annotation_path) +if not os.path.exists(train_img_path): + os.makedirs(train_img_path) +if not os.path.exists(val_img_path): + os.makedirs(val_img_path) + +train_info = { + 'images': [], + 'type': + 'instances', + 'annotations': [], + 'categories': [{ + "supercategory": "none", + "id": 1, + "name": "drone" + }, { + "supercategory": "none", + "id": 2, + "name": "noise" + }] +} +val_info = { + 'images': [], + 'type': + 'instances', + 'annotations': [], + 'categories': [{ + "supercategory": "none", + "id": 1, + "name": "drone" + }, { + "supercategory": "none", + "id": 2, + "name": "noise" + }] +} + +# you can change it +interval = 5 +dirs = os.listdir(path) +train_img_id = 0 +val_img_id = 0 +for d in dirs: + if 'new' in d: + video_file = os.path.join(path, d, 'IR.mp4') + label_file = os.path.join(path, d, 'IR_label.json') + labels = json.load(open(label_file, 'r')) + exits = labels['exist'] + gt_bbox = labels['gt_rect'] + assert len(exits) == len(gt_bbox) + videocap = cv2.VideoCapture(video_file) + i = 0 + while True: + success, frame = videocap.read() + if success: + if i % interval == 0: + img_name = d + '_' + str(i) + '.jpg' + cv2.imwrite(os.path.join(val_img_path, img_name), frame) + height, width, depth = frame.shape + x, y, w, h = gt_bbox[i] + isexist = exits[i] + if isexist: + category_id = 1 + else: + category_id = 2 + draw_frame = cv2.rectangle(frame, (x, y), (x + w, y + h), + (0, 255, 0), 2) + img_name_draw = d + '_' + str(i) + 'draw.jpg' + cv2.imwrite(os.path.join(val_img_path, img_name_draw), + draw_frame) + + img_info = { + 'file_name': img_name, + 'height': float(height), + 'width': float(width), + 'id': val_img_id + } + ann_info = { + 'area': float(w) * float(h), + 'iscrowd': 0, + 'bbox': [float(x), + float(y), + float(w), + float(h)], + 'category_id': category_id, + 'ignore': 0, + 'image_id': val_img_id, + 'id': val_img_id + 1 + } + val_info['images'].append(img_info) + val_info['annotations'].append(ann_info) + val_img_id += 1 + i += 1 + else: + print('finish {}'.format(d)) + break + else: + video_file = os.path.join(path, d, 'IR.mp4') + label_file = os.path.join(path, d, 'IR_label.json') + labels = json.load(open(label_file, 'r')) + exits = labels['exist'] + gt_bbox = labels['gt_rect'] + assert len(exits) == len(gt_bbox) + videocap = cv2.VideoCapture(video_file) + i = 0 + while True: + success, frame = videocap.read() + if success: + if i % interval == 0: + img_name = d + '_' + str(i) + '.jpg' + cv2.imwrite(os.path.join(train_img_path, img_name), frame) + height, width, depth = frame.shape + x, y, w, h = gt_bbox[i] + isexist = exits[i] + if isexist: + category_id = 1 + else: + category_id = 2 + draw_frame = cv2.rectangle(frame, (x, y), (x + w, y + h), + (0, 255, 0), 2) + img_name_draw = d + '_' + str(i) + 'draw.jpg' + cv2.imwrite(os.path.join(train_img_path, img_name_draw), + draw_frame) + + img_info = { + 'file_name': img_name, + 
'height': height, + 'width': width, + 'id': train_img_id + } + ann_info = { + 'area': float(w) * float(h), + 'iscrowd': 0, + 'bbox': [float(x), + float(y), + float(w), + float(h)], + 'category_id': category_id, + 'ignore': 0, + 'image_id': train_img_id, + 'id': train_img_id + 1 + } + train_info['images'].append(img_info) + train_info['annotations'].append(ann_info) + train_img_id += 1 + i += 1 + else: + print('finish {}'.format(d)) + break + +with open('annotations/train.json', 'w') as f: + json.dump(train_info, f) +with open('annotations/val.json', 'w') as f: + json.dump(val_info, f) diff --git a/docs/src/applications/BasketballAction/README.md b/docs/src/applications/BasketballAction/README.md new file mode 100644 index 000000000..59bf4e308 --- /dev/null +++ b/docs/src/applications/BasketballAction/README.md @@ -0,0 +1,389 @@ +# 篮球动作检测模型 + + +## 内容 +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型评估](#模型评估) +- [模型推理](#模型推理) +- [模型优化](#模型优化) +- [模型部署](#模型部署) +- [参考论文](#参考论文) + + +## 模型简介 +该代码库用于篮球动作检测+识别, 基于paddle2.0版本开发,结合PaddleVideo中的ppTSM, BMN, attentionLSTM的多个视频模型进行视频时空二阶段检测算法。 +主要分为如下几步 + - 特征抽取 + - 图像特性,ppTSM + - 音频特征,Vggsound + - proposal提取,BMN + - LSTM,动作分类 + 回归 + + +## 数据准备 +数据集处理代码 +``` +参考https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/datasets +``` + +- 数据集label格式 +``` +{ + "0": "背景", + "1": "回放", + "2": "进球-三分球", + "3": "进球-两分球", + "4": "进球-扣篮", + "5": "罚球", + "6": "跳球" +} +``` + +- 数据集gts处理, 将原始标注数据处理成如下json格式 +``` +{ + 'fps': 5, + 'gts': [ + { + 'url': 'xxx.mp4', + 'total_frames': 6341, + 'actions': [ + { + "label_ids": [6], + "label_names": ["跳球"], + "start_id": 395, + "end_id": 399 + }, + ... + ] + }, + ... + ] +} +``` + +- 数据集抽帧, 由mp4, 得到frames和pcm, 这里需要添加ffmpeg环境 +``` +cd datasets/script && python get_frames_pcm.py +``` + +- 数据预处理后保存格式如下 +``` + |-- datasets # 训练数据集和处理脚本 + |-- basketball # xx数据集 + |-- mp4 # 原始视频.mp4 + |-- frames # 图像帧, fps=5, '.jpg'格式 + |-- pcm # 音频pcm, 音频采样率16000,采用通道数1 + |-- url.list # 视频列表 + |-- label_train.json # 训练集原始gts + |-- label_val.json # 验证集原始gts +``` + + +## 模型训练 +代码参考足球动作检测:https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction + +将该代码库的文件夹 [datasets](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/datasets),[extractor](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/extractor),[train_lstm](https://github.com/PaddlePaddle/PaddleVideo/tree/application/FootballAction/train_lstm), 拷贝到本代码库复用。 + + - image 采样频率fps=5,如果有些动作时间较短,可以适当提高采样频率 + - BMN windows=200,即40s,所以测试自己的数据时,视频时长需大于40s + +### 基础镜像 +``` +docker pull tmtalgo/paddleaction:action-detection-v2 +``` + +### step1 ppTSM训练 +我们提供了篮球数据训练的模型,参考checkpoints_basketball。如果使用提供的pptsm模型,可直接跳过下边的pptsm训练数据处理和训练步骤。 +如果需要在自己的数据上训练,ppTSM训练代码为:https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0 +ppTSM文档参考:https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/pp-tsm.md + +#### step1.1 ppTSM 训练数据处理 +由frames结合gts生成训练所需要的正负样本 +``` +cd ${BasketballAction} +cd datasets/script && python get_instance_for_tsn.py + +# 文件名按照如下格式 +'{}_{}_{}_{}'.format(video_basename, start_id, end_id, label) +``` +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- basketball # xx数据集 + |-- input_for_tsn # tsn/tsm训练的数据 +``` + +#### step1.2 ppTSM模型训练 +``` +# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0 +cd ${PaddleVideo} +# 修改config.yaml参数修改为 ${BasketballAcation}/configs_train/pptsm_basketball.yaml +python -B -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + 
--log_dir=$save_dir/logs \ + main.py \ + --validate \ + -c {BasketballAcation}/configs_train/pptsm_basketball.yaml \ + -o output_dir=$save_dir +``` + +#### step1.3 ppTSM模型转为预测模式 +``` +# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0 +$cd {PaddleVideo} +python tools/export_model.py -c ${BasketballAcation}/configs_train/pptsm_basketball.yaml \ + -p ${pptsm_train_dir}/checkpoints/models_pptsm/ppTSM_epoch_00057.pdparams \ + -o {BasketballAcation}/checkpoints/ppTSM +``` + +#### step1.4 基于ppTSM视频特征提取 +image and audio特征提取,保存到datasets features文件夹下 +``` +cd ${BasketballAcation} +cd extractor && python extract_feat.py +# 特征维度, image(2048) + audio(1024) + pcm(640) +# 特征保存格式如下,将如下dict保存在pkl格式,用于接下来的BMN训练 +video_features = {'image_feature': np_image_features, + 'audio_feature': np_audio_features + 'pcm_feature': np_pcm_features} +``` +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- basketball # xx数据集 + |-- features # 视频的图像+音频特征 +``` + + +### step2 BMN训练 +BMN训练代码为:https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0 +BMN文档参考:https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/localization/bmn.md + +#### step2.1 BMN训练数据处理 +用于提取二分类的proposal,windows=40,根据gts和特征得到BMN训练所需要的数据集 +``` +cd ${BasketballAcation} +cd datasets/script && python get_instance_for_bmn.py +# 数据格式 +{ + "719b0a4bcb1f461eabb152298406b861_753_793": { + "duration_second": 40.0, + "duration_frame": 200, + "feature_frame": 200, + "subset": "train", + "annotations": [ + { + "segment": [ + 15.0, + 22.0 + ], + "label": "6.0", + "label_name": "跳球" + } + ] + }, + ... +} +``` +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- basketball # xx数据集 + |-- input_for_bmn # bmn训练的proposal +``` + +#### step2.2 BMN模型训练 +``` +# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0 +cd ${PaddleVideo} +# 修改config.yaml参数修为${BasketballAcation}/configs_train/bmn_basketball.yaml +python -B -m paddle.distributed.launch \ + --gpus="0,1" \ + --log_dir=$out_dir/logs \ + main.py \ + --validate \ + -c ${BasketballAcation}/configs_train/bmn_basketball.yaml \ + -o output_dir=$out_dir +``` + +#### step2.3 BMN模型转为预测模式 +``` +# https://github.com/PaddlePaddle/PaddleVideo/tree/release/2.0 +${PaddleVideo} +python tools/export_model.py -c $${BasketballAcation}/configs_train/bmn_basketball.yaml \ + -p ${bmn_train_dir}/checkpoints/models_bmn/bmn_epoch16.pdparams \ + -o {BasketballAcation}/checkpoints/BMN +``` + +#### step2.4 BMN模型预测 +得到动作proposal信息: start_id, end_id, score +``` +cd ${BasketballAcation} +cd extractor && python extract_bmn.py +# 数据格式 +[ + { + "video_name": "c9516c903de3416c97dae91a59e968d7", + "num_proposal": 5534, + "bmn_results": [ + { + "start": 7850.0, + "end": 7873.0, + "score": 0.77194699622342 + }, + { + "start": 4400.0, + "end": 4443.0, + "score": 0.7663803287641536 + }, + ... + ] + }, + ... 
+] +``` +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- basketball # xx数据集 + |-- feature_bmn + |-- prop.json # bmn 预测结果 +``` + +### step3 LSTM训练 +LSTM训练代码为:train_lstm + +#### step3.1 LSTM训练数据处理 +将BMN得到的proposal截断并处理成LSTM训练所需数据集 +``` +cd ${BasketballAcation} +cd datasets/script && python get_instance_for_lstm.py +# 数据格式1,label_info +{ + "fps": 5, + "results": [ + { + "url": "https://xxx.mp4", + "mode": "train", # train or validation + "total_frames": 6128, + "num_gts": 93, + "num_proposals": 5043, + "proposal_actions": [ + { + "label": 6, + "norm_iou": 0.7575757575757576, + "norm_ioa": 0.7575757575757576, + "norm_start": -0.32, + "proposal": { + "start": 5011, + "end": 5036, + "score": 0.7723643666324231 + }, + "hit_gts": { + "label_ids": [ + 6 + ], + "label_names": [ + "跳球" + ], + "start_id": 5003, + "end_id": 5036 + } + }, + ... + }, + ... +} +# 数据格式2,LSTM训练所需要的feature +{ + 'features': np.array(feature_hit, dtype=np.float32), # TSM audio and pcm 特征, 可根据需求选择组合 + 'feature_fps': 5, # fps = 5 + 'label_info': {'norm_iou': 0.5, 'label': 3, ...}, # 数据格式1中的'proposal_actions' + 'video_name': 'c9516c903de3416c97dae91a59e968d7' # video_name +} +# 数据格式3,LSTM训练所需label.txt +'{} {}'.format(filename, label) +``` +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- basketball # xx数据集 + |-- input_for_lstm # LSTM训练数据集 +``` + +#### step3.2 LSTM训练 +``` +#conf.yaml修改为 ${BasketballAcation}/configs_train/lstm_basketball.yaml +cd ${BasketballAcation} +python -u scenario_lib/train.py \ + --model_name=ActionNet \ + --config=${BasketballAcation}/configs_train/lstm_basketball.yaml \ + --save_dir=${out_dir}"/models_lstm/" \ + --log_interval=5 \ + --valid_interval=1 +``` + +#### step3.3 LSTM模型转为预测模式 +``` +${BasketballAcation} +python tools/export_model.py -c ${BasketballAction}/train_lstm/conf/conf.yaml \ + -p ${lstm_train_dir}/checkpoints/models_lstm/bmn_epoch29.pdparams \ + -o {BasketballAcation}/checkpoints/LSTM +``` + + +## 模型推理 +测试数据格式,可参考使用样例 +``` +wget https://videotag.bj.bcebos.com/Applications/basketball/datasets.tar.gz +``` +测试模型,可使用我们提供的模型 +``` +wget https://videotag.bj.bcebos.com/Applications/basketball/checkpoints_basketball.tar.gz +``` +运行预测代码 +``` +cd ${BasketballAction} +cd predict +# 如果使用自己训练的模型,请将各训练过程中转换的inference模型放到predict库 +# cp -rf ../checkpoints checkpoints_basketball +python predict.py +``` +产出文件 +``` +${BasketballAction}/predict/results.json +``` + + +## 模型评估 +``` +cd ${BasketballAction} +cd predict +python eval.py results.json +``` + + +## 模型优化 +在实际使用场景中可根据视频内容尝试优化策略 +- 可根据动作运动速度,调整抽帧采样率,本代码默认为fps=5 +- 统计动作的时间分布,调整bmn采样窗口 +- 根据图像和音频的关联程度,调整图像和音频特征的融合方式:本代码将图像特征和音频在时间维度对齐,融合后再进入模型训练。也可尝试分别模型训练后,加权融合等 +- 本代码的解决方案也可用于其他动作检测。变换场景后,图像特征重新训练效果更好。音频特征采用的VGGSound训练,如果使用场景仍为生活场景,可直接复用。 + + +## 模型部署 +本代码解决方案在动作的检测和召回指标F1-score=80.14% +
    + + +## 参考论文 + +- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han +- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen. +- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen +- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan diff --git a/docs/src/applications/BasketballAction/predict/action_detect/action.py b/docs/src/applications/BasketballAction/predict/action_detect/action.py new file mode 100644 index 000000000..4bca40ad5 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/action.py @@ -0,0 +1,175 @@ +#!./python27-gcc482/bin/python +# coding: utf-8 +""" +BAIDU CLOUD action +""" + +import os +import sys +import pickle +import json +import time +import functools + +import numpy as np + +from utils.preprocess import get_images +from utils.config_utils import parse_config, print_configs +import mfcc.feature_extractor as mfcc_extractor + +import models.pptsm_infer as image_model +import models.audio_infer as audio_model +import models.bmn_infer as prop_model +import models.lstm_infer as classify_model + +import logger +logger = logger.Logger() + +def record_time_info(func): + """decorator func to log cost time for func + """ + @functools.wraps(func) + def timer(*args): + """log cost time for func + """ + logger.info("function [{}] processing ...".format(func.__name__)) + start_time = time.time() + retval = func(*args) + cost_time = round(time.time() - start_time, 5) + logger.info("function [{}] run time: {:.2f} min".format(func.__name__, cost_time / 60)) + return retval + return timer + + +class ActionDetection(object): + """ModelPredict""" + def __init__(self, cfg_file="configs/configs.yaml"): + cfg = parse_config(cfg_file) + self.configs = cfg + print_configs(self.configs, "Infer") + + name = 'COMMON' + self.DEBUG = cfg[name]['DEBUG'] + self.BMN_ONLY = cfg[name]['BMN_ONLY'] + self.LSTM_ONLY = cfg[name]['LSTM_ONLY'] + self.PCM_ONLY = cfg[name]['PCM_ONLY'] + if self.LSTM_ONLY: + self.prop_dict = {} + for dataset in ['EuroCup2016']: + prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset) + json_data = json.load(open(prop_json, 'r')) + for item in json_data: + basename = prop_json.replace('feature_bmn/prop.json', 'mp4') + basename = basename + '/' + item['video_name'] + '.mp4' + self.prop_dict[basename] = item['bmn_results'] + + + @record_time_info + def load_model(self): + """ + load_model + """ + if not self.DEBUG: + self.image_model = image_model.InferModel(self.configs) + if not self.PCM_ONLY: + self.audio_model = audio_model.InferModel(self.configs) + + if not self.LSTM_ONLY: + self.prop_model = prop_model.InferModel(self.configs) + + if not self.BMN_ONLY: + self.classify_model = classify_model.InferModel(self.configs) + + logger.info("==> Action Detection prepared.") + + @record_time_info + def infer(self, imgs_path, pcm_path, fps=5): + """ + extract_feature + """ + print("imgs_path = ", imgs_path) + self.imgs_path = imgs_path + self.pcm_path = pcm_path + self.configs['COMMON']['fps'] = fps + + logger.info("==> input 
video {}".format(os.path.basename(self.imgs_path))) + + # step 1: extract feature + video_features = self.extract_feature() + + # step2: get proposal + bmn_results = self.extract_proposal(video_features) + + # step3: classify + material = {'feature': video_features, 'proposal': bmn_results} + action_results = self.video_classify(material) + + return bmn_results, action_results + + @record_time_info + def video_classify(self, material): + """video classify""" + if self.BMN_ONLY: + return [] + action_results = self.classify_model.predict(self.configs, material=material) + logger.info('action shape {}'.format(np.array(action_results).shape)) + return action_results + + @record_time_info + def extract_proposal(self, video_features): + """extract proposal""" + if self.LSTM_ONLY: + basename = self.imgs_path.replace('frames', 'mp4') + '.mp4' + bmn_results = self.prop_dict[basename] + return bmn_results + bmn_results = self.prop_model.predict(self.configs, material=video_features) + logger.info('proposal shape {}'.format(np.array(bmn_results).shape)) + return bmn_results + + @record_time_info + def extract_feature(self): + """extract feature""" + if not self.DEBUG: + image_path_list = get_images(self.imgs_path) + self.configs['PPTSM']['frame_list'] = image_path_list + self.configs['AUDIO']['pcm_file'] = self.pcm_path + image_features = self.image_model.predict(self.configs) + if self.PCM_ONLY: + sample_rate = self.configs['AUDIO']['sample_rate'] + pcm_features = mfcc_extractor.extract_pcm(self.pcm_path, sample_rate) + audio_features = [] + else: + audio_features, pcm_features = self.audio_model.predict(self.configs) + + np_image_features = np.array(image_features, dtype=np.float32) + np_audio_features = np.array(audio_features, dtype=np.float32) + np_pcm_features = np.array(pcm_features, dtype=np.float32) + + video_features = {'image_feature': np_image_features, + 'audio_feature': np_audio_features, + 'pcm_feature': np_pcm_features} + else: + feature_path = self.imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + logger.info("feature shape {} {} {}".format(video_features['image_feature'].shape, + video_features['audio_feature'].shape, + video_features['pcm_feature'].shape)) + + return video_features + +if __name__ == '__main__': + + model_predict = ActionDetection(cfg_file="../configs/configs.yaml") + model_predict.load_model() + + imgs_path = "/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895" + pcm_path = "/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm" + + bmn_results, action_results = model_predict.infer(imgs_path, pcm_path) + results = {'bmn_results': bmn_results, 'action_results': action_results} + + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + diff --git a/docs/src/applications/BasketballAction/predict/action_detect/logger.py b/docs/src/applications/BasketballAction/predict/action_detect/logger.py new file mode 100644 index 000000000..b03348721 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/logger.py @@ -0,0 +1,24 @@ +""" +logger +""" +import os +import logging + +class Logger(logging.Logger): + """Customized logger for news stripper + """ + def __init__(self): + super(Logger, self).__init__(self) + if not os.path.exists('logs'): + os.mkdir('logs') + handler = logging.FileHandler("logs/action_detect.log") + # handler.setLevel(logging.DEBUG) + 
handler.setLevel(logging.INFO) + + format = "%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s" + datefmt = "%y-%m-%d %H:%M:%S" + + formatter = logging.Formatter(format, datefmt) + handler.setFormatter(formatter) + self.addHandler(handler) + diff --git a/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py b/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py new file mode 100755 index 000000000..43b110046 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/mfcc/feature_extractor.py @@ -0,0 +1,158 @@ +""" +audio feature extract +""" +# coding: utf-8 +import os +import numpy as np +import pickle +import mfcc.vgg_params as vgg_params + + +def frame(data, window_length, hop_length): + """ + frame + """ + num_samples = data.shape[0] + num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) + shape = (num_frames, window_length) + data.shape[1:] + strides = (data.strides[0] * hop_length, ) + data.strides + return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) + + +def periodic_hann(window_length): + """ + periodic_hann + """ + return 0.5 - (0.5 * + np.cos(2 * np.pi / window_length * np.arange(window_length))) + + +def stft_magnitude(signal, fft_length, hop_length=None, window_length=None): + """ + stft_magnitude + """ + frames = frame(signal, window_length, hop_length) + window = periodic_hann(window_length) + windowed_frames = frames * window + return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) + + +_MEL_BREAK_FREQUENCY_HERTZ = 700.0 +_MEL_HIGH_FREQUENCY_Q = 1127.0 + + +def hertz_to_mel(frequencies_hertz): + """ + hertz_to_mel + """ + return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz / + _MEL_BREAK_FREQUENCY_HERTZ)) + + +def spectrogram_to_mel_matrix(num_mel_bins=20, + num_spectrogram_bins=129, + audio_sample_rate=8000, + lower_edge_hertz=125.0, + upper_edge_hertz=3800.0): + """ + spectrogram_to_mel_matrix + """ + nyquist_hertz = audio_sample_rate / 2. 
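+    # Validate the band edges, then build a (num_spectrogram_bins, num_mel_bins) matrix of triangular mel weights.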
+ if lower_edge_hertz >= upper_edge_hertz: + raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % + (lower_edge_hertz, upper_edge_hertz)) + spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, + num_spectrogram_bins) + spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) + band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), + hertz_to_mel(upper_edge_hertz), + num_mel_bins + 2) + mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) + for i in range(num_mel_bins): + lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] + lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / + (center_mel - lower_edge_mel)) + upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / + (upper_edge_mel - center_mel)) + mel_weights_matrix[:, + i] = np.maximum(0.0, + np.minimum(lower_slope, upper_slope)) + mel_weights_matrix[0, :] = 0.0 + return mel_weights_matrix + + +def log_mel_spectrogram(data, + audio_sample_rate=8000, + log_offset=0.0, + window_length_secs=0.025, + hop_length_secs=0.010, + **kwargs): + """ + log_mel_spectrogram + """ + window_length_samples = int(round(audio_sample_rate * window_length_secs)) + hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) + fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0))) + spectrogram = stft_magnitude(data, + fft_length=fft_length, + hop_length=hop_length_samples, + window_length=window_length_samples) + mel_spectrogram = np.dot( + spectrogram, + spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1], + audio_sample_rate=audio_sample_rate, + **kwargs)) + + return np.log(mel_spectrogram + log_offset) + + +def wav_to_example(wav_data, sample_rate): + """ + wav_to_example + """ + assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype + pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS - + vgg_params.STFT_HOP_LENGTH_SECONDS)) + wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num))) + wav_data = wav_data_extend + wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0] + if len(wav_data.shape) > 1: + wav_data = np.mean(wav_data, axis=1) + log_mel = log_mel_spectrogram( + wav_data, + audio_sample_rate=vgg_params.SAMPLE_RATE, + log_offset=vgg_params.LOG_OFFSET, + window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS, + hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS, + num_mel_bins=vgg_params.NUM_MEL_BINS, + lower_edge_hertz=vgg_params.MEL_MIN_HZ, + upper_edge_hertz=vgg_params.MEL_MAX_HZ) + # Frame features into examples. 
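+    # With a 20 ms STFT hop, the 1.00 s example window below corresponds to 50 log-mel frames per example.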
+ features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS + example_window_length = int( + round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) + + example_hop_length = int( + round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) + log_mel_examples = frame(log_mel, + window_length=example_window_length, + hop_length=example_hop_length) + return log_mel_examples + + +def extract_pcm(pcm_file, sample_rate): + with open(pcm_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + examples = wav_to_example(audio_data, sample_rate) + return examples + + +if __name__ == "__main__": + wav_file = sys.argv[1] + print("wav_file = ", wav_file) + with open(wav_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype = np.int16) + examples_batch = wav_to_example(audio_data, 16000) + print("examples_batch.shape", examples_batch.shape) diff --git a/docs/src/applications/BasketballAction/predict/action_detect/mfcc/model_config.py b/docs/src/applications/BasketballAction/predict/action_detect/mfcc/model_config.py new file mode 100644 index 000000000..194365ece --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/mfcc/model_config.py @@ -0,0 +1,51 @@ +""" +audio model config +""" +import numpy as np + +import mfcc.feature_extractor as feature_extractor + + +class ModelAudio(object): + """ + modelAudio + """ + def __init__(self, configs, use_gpu=1): + self.use_gpu = use_gpu + + self.audio_fps = configs.COMMON.fps + self.audio_feat_scale = configs.TSN.audio_scale + self.sample_rate = 16000 + + def predict_slice(self, wav_data, sample_rate): + """ + audio predict + """ + examples_batch = feature_extractor.wav_to_example( + wav_data, sample_rate)[0] + return examples_batch + + def predict_audio(self, audio_file): + """ + predict_audio + """ + audio_feature_list = [] + # read pcm + sample_rate = self.sample_rate + try: + with open(audio_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + audio_status = "audio load success" + except Exception as e: + audio_data = [] + audio_status = "audio load failed" + step = 1 + len_video = int(len(audio_data) / sample_rate) + print(len_video) + for i in range(0, len_video, step): + audio_data_part = audio_data[i * sample_rate:(i + step) * + sample_rate] + feature_audio = self.predict_slice(audio_data_part, sample_rate) + audio_feature_list.append(feature_audio) + return audio_feature_list diff --git a/docs/src/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py b/docs/src/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py new file mode 100755 index 000000000..0a9951961 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/mfcc/vgg_params.py @@ -0,0 +1,37 @@ +"""Global parameters for the VGGish model. +See vggish_slim.py for more information. +""" + +# Architectural constants. +NUM_FRAMES = 50 # Frames in input mel-spectrogram patch. +NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. +EMBEDDING_SIZE = 128 # Size of embedding layer. + +# Hyperparameters used in feature and example generation. +SAMPLE_RATE = 16000 +STFT_WINDOW_LENGTH_SECONDS = 0.040 +STFT_HOP_LENGTH_SECONDS = 0.020 +NUM_MEL_BINS = NUM_BANDS +MEL_MIN_HZ = 125 +MEL_MAX_HZ = 7500 +LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 
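+# Note: with the 40 ms window / 20 ms hop above, a 1.00 s example window holds 50 frames (NUM_FRAMES); the "96 10ms frames" comment below is inherited from the original VGGish defaults.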
+EXAMPLE_WINDOW_SECONDS = 1.00 # Each example contains 96 10ms frames +EXAMPLE_HOP_SECONDS = 1.00 # with zero overlap. + +# Parameters used for embedding postprocessing. +PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' +PCA_MEANS_NAME = 'pca_means' +QUANTIZE_MIN_VAL = -2.0 +QUANTIZE_MAX_VAL = +2.0 + +# Hyperparameters used in training. +INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. +LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. +ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. + +# Names of ops, tensors, and features. +INPUT_OP_NAME = 'vggish/input_features' +INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' +OUTPUT_OP_NAME = 'vggish/embedding' +OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' +AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' diff --git a/docs/src/applications/BasketballAction/predict/action_detect/models/audio_infer.py b/docs/src/applications/BasketballAction/predict/action_detect/models/audio_infer.py new file mode 100644 index 000000000..7b19c90ed --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/models/audio_infer.py @@ -0,0 +1,80 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """audio infer""" + def __init__(self, cfg, name='AUDIO'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output_tensor = self.predictor.get_output_handle(output_names[0]) + + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + return output + + + def predict(self, infer_config): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config) + feature_list = [] + pcm_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = np.array(data, dtype = 'float32') + output = self.infer(inputs) + feature_list.append(np.squeeze(output)) + pcm_list.append(inputs) + feature_values = np.vstack(feature_list) + pcm_values = np.vstack(pcm_list) + return feature_values, pcm_values + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm' + t0 = time.time() + cfg['AUDIO']['pcm_file'] = pcm_path + outputs = model.predict(cfg) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + t1 = time.time() + + print(outputs.shape) + print(outputs[0]) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py 
b/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py new file mode 100644 index 000000000..9e739d1af --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/models/bmn_infer.py @@ -0,0 +1,155 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import json +import pickle +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config +from utils.process_result import process_proposal + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """bmn infer""" + def __init__(self, cfg, name='BMN'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + self.nms_thread = cfg[name]['nms_thread'] + self.min_pred_score = cfg[name]['score_thread'] + self.min_frame_thread = cfg['COMMON']['fps'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output1_tensor = self.predictor.get_output_handle(output_names[0]) + self.output2_tensor = self.predictor.get_output_handle(output_names[1]) + self.output3_tensor = self.predictor.get_output_handle(output_names[2]) + + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output1 = self.output1_tensor.copy_to_cpu() + output2 = self.output2_tensor.copy_to_cpu() + output3 = self.output3_tensor.copy_to_cpu() + return output1, output2, output3 + + + def generate_props(self, pred_bmn, pred_start, pred_end, max_window=200, min_window=5): + """generate_props""" + video_len = min(pred_bmn.shape[-1], min(pred_start.shape[-1], pred_end.shape[-1])) + pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :] + start_mask = self.boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = self.boundary_choose(pred_end) + end_mask[-1] = 1. 
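+        # Enumerate candidate proposals: every (start, end) pair whose duration lies in [min_window, max_window) and whose boundaries pass the start/end masks, scored as start_score * end_score * BMN confidence.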
+ score_results = [] + for idx in range(min_window, max_window): + for jdx in range(video_len): + start_index = jdx + end_index = start_index + idx + if end_index < video_len and start_mask[start_index] == 1 and end_mask[end_index] == 1: + xmin = start_index + xmax = end_index + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bmn_score = pred_bmn[idx, jdx] + conf_score = xmin_score * xmax_score * bmn_score + score_results.append([xmin, xmax, conf_score]) + return score_results + + + def boundary_choose(self, score_list): + """boundary_choose""" + max_score = max(score_list) + mask_high = (score_list > max_score * 0.5) + score_list = list(score_list) + score_middle = np.array([0.0] + score_list + [0.0]) + score_front = np.array([0.0, 0.0] + score_list) + score_back = np.array(score_list + [0.0, 0.0]) + mask_peak = ((score_middle > score_front) & (score_middle > score_back)) + mask_peak = mask_peak[1:-1] + mask = (mask_high | mask_peak).astype('float32') + return mask + + + def predict(self, infer_config, material): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material) + feature_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = [items[0] for items in data] + winds = [items[1] for items in data] + feat_info = [items[2] for items in data] + feature_T = feat_info[0][0] + feature_N = feat_info[0][1] + + inputs = np.array(inputs) + pred_bmn, pred_sta, pred_end = self.infer(inputs) + + if infer_iter == 0: + sum_pred_bmn = np.zeros((2, feature_N, feature_T)) + sum_pred_sta = np.zeros((feature_T, )) + sum_pred_end = np.zeros((feature_T, )) + sum_pred_cnt = np.zeros((feature_T, )) + + for idx, sub_wind in enumerate(winds): + sum_pred_bmn[:, :, sub_wind[0]: sub_wind[1]] += pred_bmn[idx] + sum_pred_sta[sub_wind[0]: sub_wind[1]] += pred_sta[idx] + sum_pred_end[sub_wind[0]: sub_wind[1]] += pred_end[idx] + sum_pred_cnt[sub_wind[0]: sub_wind[1]] += np.ones((sub_wind[1] - sub_wind[0], )) + + pred_bmn = sum_pred_bmn / sum_pred_cnt + pred_sta = sum_pred_sta / sum_pred_cnt + pred_end = sum_pred_end / sum_pred_cnt + + score_result = self.generate_props(pred_bmn, pred_sta, pred_end) + results = process_proposal(score_result, self.min_frame_thread, self.nms_thread, self.min_pred_score) + + return results + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238' + + # feature + feature_path = imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + t0 = time.time() + outputs = model.predict(cfg, video_features) + t1 = time.time() + + results = {'proposal': outputs} + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py b/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py new file mode 100644 index 000000000..a3acb674e --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/models/lstm_infer.py @@ -0,0 +1,145 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import json +import pickle +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import 
parse_config +from utils.process_result import get_action_result + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """lstm infer""" + def __init__(self, cfg, name='ACTION'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + self.topk = cfg[name]['topk'] + self.frame_offset = cfg[name]['nms_offset'] + self.nms_thread = cfg[name]['nms_thread'] + self.cls_thread = cfg[name]['classify_score_thread'] + self.iou_thread = cfg[name]['iou_score_thread'] + + self.label_map_file = cfg['COMMON']['label_dic'] + self.fps = cfg['COMMON']['fps'] + self.nms_id = 5 + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input1_tensor = self.predictor.get_input_handle(input_names[0]) + #self.input2_tensor = self.predictor.get_input_handle(input_names[1]) + + output_names = self.predictor.get_output_names() + self.output1_tensor = self.predictor.get_output_handle(output_names[0]) + self.output2_tensor = self.predictor.get_output_handle(output_names[1]) + + + def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None): + """infer""" + self.input1_tensor.copy_from_cpu(input1_arr) + self.input1_tensor.set_lod(input1_lod) + if not input2_arr is None: + self.input2_tensor.copy_from_cpu(input2_arr) + self.input2_tensor.set_lod(input2_lod) + self.predictor.run() + output1 = self.output1_tensor.copy_to_cpu() + output2 = self.output2_tensor.copy_to_cpu() + # print(output.shape) + return output1, output2 + + def pre_process(self, input): + """pre process""" + input_arr = [] + input_lod = [0] + start_lod = 0 + end_lod = 0 + for sub_item in input: + end_lod = start_lod + len(sub_item) + input_lod.append(end_lod) + input_arr.extend(sub_item) + start_lod = end_lod + input_arr = np.array(input_arr) + return input_arr, [input_lod] + + def predict(self, infer_config, material): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material) + results = [] + for infer_iter, data in enumerate(infer_reader()): + video_id = [[items[-2], items[-1]] for items in data] + input1 = [items[0] for items in data] + input1_arr, input1_lod = self.pre_process(input1) + output1, output2 = self.infer(input1_arr, input1_lod) + + predictions_id = output1 + predictions_iou = output2 + for i in range(len(predictions_id)): + topk_inds = predictions_id[i].argsort()[0 - self.topk:] + topk_inds = topk_inds[::-1] + preds_id = predictions_id[i][topk_inds] + preds_iou = predictions_iou[i][0] + results.append((video_id[i], preds_id.tolist(), topk_inds.tolist(), preds_iou.tolist())) + + predict_result = get_action_result(results, self.label_map_file, self.fps, + self.cls_thread, self.iou_thread, + self.nms_id, self.nms_thread, self.frame_offset) + return predict_result + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + # proposal total + prop_dict = {} + for dataset in ['EuroCup2016', 'WorldCup2018']: + prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset) + json_data = 
json.load(open(prop_json, 'r')) + for item in json_data: + basename = prop_json.replace('feature_bmn/prop.json', 'mp4') + basename = basename + '/' + item['video_name'] + '.mp4' + prop_dict[basename] = item['bmn_results'] + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238' + + # feature + feature_path = imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + # proposal + basename = imgs_path.replace('frames', 'mp4') + '.mp4' + bmn_results = prop_dict[basename] + + material = {'feature': video_features, 'proposal': bmn_results} + + t0 = time.time() + outputs = model.predict(cfg, material) + t1 = time.time() + results = {'actions': outputs} + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py b/docs/src/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py new file mode 100644 index 000000000..63f31367a --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/models/pptsm_infer.py @@ -0,0 +1,83 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """pptsm infer""" + def __init__(self, cfg, name='PPTSM'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + print("output_names = ", output_names) + #self.output_tensor = self.predictor.get_output_handle(output_names[1]) + self.output_tensor = self.predictor.get_output_handle(output_names[0]) + + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + return output + + + def predict(self, infer_config): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config) + feature_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = [items[:-1] for items in data] + inputs = np.array(inputs) + output = self.infer(inputs) + #print("inputs", inputs.shape) + #print("outputs", output.shape) + feature_list.append(np.squeeze(output)) + feature_list = np.vstack(feature_list) + return feature_list + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/' + imgs_list = get_images(imgs_path) + t0 = time.time() + cfg['PPTSM']['frame_list'] = imgs_list + outputs = model.predict(cfg) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 
224).astype(np.float32)) + t1 = time.time() + + print(outputs.shape) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/BasketballAction/predict/action_detect/reader/__init__.py b/docs/src/applications/BasketballAction/predict/action_detect/reader/__init__.py new file mode 100644 index 000000000..547b2d6bb --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/reader/__init__.py @@ -0,0 +1,15 @@ +""" +read map for model +""" +from reader.reader_utils import regist_reader, get_reader +import reader.tsminf_reader as tsminf_reader +import reader.audio_reader as audio_reader +import reader.bmninf_reader as bmninf_reader +import reader.feature_reader as feature_reader + +# regist reader, sort by alphabet +regist_reader("TSM", tsminf_reader.TSMINFReader) +regist_reader("PPTSM", tsminf_reader.TSMINFReader) +regist_reader("AUDIO", audio_reader.AudioReader) +regist_reader("BMN", bmninf_reader.BMNINFReader) +regist_reader("ACTION", feature_reader.FeatureReader) diff --git a/docs/src/applications/BasketballAction/predict/action_detect/reader/audio_reader.py b/docs/src/applications/BasketballAction/predict/action_detect/reader/audio_reader.py new file mode 100644 index 000000000..2e1f1d28f --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/reader/audio_reader.py @@ -0,0 +1,78 @@ +""" +audio reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
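The three *_infer.py modules above share one Paddle Inference pattern: build a Config from the exported model and params files, enable GPU and zero-copy I/O, then move data through input and output handles. A minimal sketch of that pattern follows; the file paths, memory size and input shape are placeholders, the real values come from the YAML config.

import numpy as np
from paddle.inference import Config, create_predictor

config = Config("model.pdmodel", "model.pdiparams")   # placeholder paths
config.enable_use_gpu(2000, 0)                        # gpu_mem in MB, device_id
config.switch_ir_optim(True)
config.enable_memory_optim()
config.switch_use_feed_fetch_ops(False)               # zero-copy tensors
predictor = create_predictor(config)

input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

input_handle.copy_from_cpu(np.random.rand(1, 8, 3, 224, 224).astype("float32"))  # placeholder batch
predictor.run()
feature = output_handle.copy_to_cpu()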
+ +import sys +import os +import _pickle as cPickle +#from .reader_utils import DataReader +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO +import numpy as np +import random +import code + +from .reader_utils import DataReader +import mfcc.feature_extractor as feature_extractor + +class AudioReader(DataReader): + """ + Data reader for youtube-8M dataset, which was stored as features extracted by prior networks + This is for the three models: lstm, attention cluster, nextvlad + + dataset cfg: num_classes + batch_size + list + NextVlad only: eigen_file + """ + + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + + # set batch size and file list + self.sample_rate = cfg[self.name.upper()]['sample_rate'] + self.batch_size = cfg[self.name.upper()]['batch_size'] + self.pcm_file = cfg[self.name.upper()]['pcm_file'] + self.material = material + + def create_reader(self): + """create_reader""" + with open(self.pcm_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + examples = feature_extractor.wav_to_example(audio_data, self.sample_rate) + # print(examples.shape) + + def reader(): + """reader""" + batch_out = [] + batch_out_pre = [] + + for audio in examples: + # batch_out.append([audio]) + batch_out.append(audio) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + if len(batch_out) > 0: + yield batch_out + + return reader diff --git a/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py b/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py new file mode 100644 index 000000000..afc15b886 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/reader/bmninf_reader.py @@ -0,0 +1,151 @@ +""" +# @File : bmninf_reader.py +# @Author: macaihong +# @Date : 2019/12/15 +# @Desc : +""" + +import os +import random +import pickle +import json +import numpy as np +import multiprocessing + +import numpy as np + +from .reader_utils import DataReader + + +def get_sw_prop(duration, window=200, step=10): + """ + get_sw_prop + """ + pr = [] + local_boxes = [] + for k in np.arange(0, duration - window + step, step): + start_id = k + end_id = min(duration, k + window) + if end_id - start_id < window: + start_id = end_id - window + local_boxes = (start_id, end_id) + pr.append(local_boxes) + + def valid_proposal(duration, span): + """ + valid_proposal + """ + # fileter proposals + # a valid proposal should have at least one second in the video + real_span = min(duration, span[1]) - span[0] + return real_span >= 1 + + pr = list(filter(lambda x: valid_proposal(duration, x), pr)) + return pr + + +class BMNINFReader(DataReader): + """ + Data reader for BMN model, which was stored as features extracted by prior networks + dataset cfg: feat_path, feature path, + tscale, temporal length of BM map, + dscale, duration scale of BM map, + anchor_xmin, anchor_xmax, the range of each point in the feature sequence, + batch_size, batch size of input data, + num_threads, number of threads of data processing + """ + + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + self.tscale = cfg[self.name.upper()]['tscale'] # 200 + self.dscale = cfg[self.name.upper()]['dscale'] # 200 + self.tgap = 1. 
/ self.tscale + self.step = cfg[self.name.upper()]['window_step'] + + self.material = material + src_feature = self.material + + image_feature = src_feature['image_feature'] + pcm_feature = src_feature['pcm_feature'] + pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640)) + min_length = min(image_feature.shape[0], pcm_feature.shape[0]) + image_feature = image_feature[:min_length, :] + pcm_feature = pcm_feature[:min_length, :] + self.features = np.concatenate((image_feature, pcm_feature), axis=1) + + self.duration = len(self.features) + self.window = self.tscale + + self.get_dataset_dict() + self.get_match_map() + + self.batch_size = cfg[self.name.upper()]['batch_size'] + if (mode == 'test') or (mode == 'infer'): + self.num_threads = 1 # set num_threads as 1 for test and infer + + def get_dataset_dict(self): + """ + get_dataset_dict + """ + self.video_list = get_sw_prop(self.duration, self.window, self.step) + + def get_match_map(self): + """ + get_match_map + """ + match_map = [] + for idx in range(self.tscale): + tmp_match_window = [] + xmin = self.tgap * idx + for jdx in range(1, self.tscale + 1): + xmax = xmin + self.tgap * jdx + tmp_match_window.append([xmin, xmax]) + match_map.append(tmp_match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + self.match_map = match_map + self.anchor_xmin = [self.tgap * i for i in range(self.tscale)] + self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)] + + + def load_file(self, video_wind): + """ + load_file + """ + start_feat_id = video_wind[0] + end_feat_id = video_wind[1] + video_feat = self.features[video_wind[0]: video_wind[1]] + video_feat = video_feat.T + video_feat = video_feat.astype("float32") + return video_feat + + def create_reader(self): + """ + reader creator for ctcn model + """ + return self.make_infer_reader() + + def make_infer_reader(self): + """ + reader for inference + """ + def reader(): + """ + reader + """ + batch_out = [] + # for video_name in self.video_list: + for video_wind in self.video_list: + video_idx = self.video_list.index(video_wind) + video_feat = self.load_file(video_wind) + batch_out.append((video_feat, video_wind, [self.duration, self.dscale])) + + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + if len(batch_out) > 0: + yield batch_out + + return reader diff --git a/docs/src/applications/BasketballAction/predict/action_detect/reader/feature_reader.py b/docs/src/applications/BasketballAction/predict/action_detect/reader/feature_reader.py new file mode 100644 index 000000000..a2e74fe8d --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/reader/feature_reader.py @@ -0,0 +1,87 @@ +""" +attention-lstm feature reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
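BMNINFReader above slides a fixed-length window over the concatenated image and pcm features; get_sw_prop produces those (start, end) windows and, near the end of the video, shifts the start back so every window keeps the full length. A usage sketch with a made-up feature length:

from reader.bmninf_reader import get_sw_prop  # as defined above

windows = get_sw_prop(duration=450, window=200, step=10)  # 450 is an illustrative length
# -> [(0, 200), (10, 210), ..., (250, 450)]; every window spans exactly `window` feature
#    steps, and valid_proposal() drops windows with less than one step inside the video.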
+ +import sys +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle +import numpy as np +import random +import code + +from .reader_utils import DataReader + +class FeatureReader(DataReader): + """ + Data reader for youtube-8M dataset, which was stored as features extracted by prior networks + This is for the three models: lstm, attention cluster, nextvlad + + dataset cfg: num_classes + batch_size + list + NextVlad only: eigen_file + """ + + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + self.batch_size = cfg[self.name.upper()]['batch_size'] + + self.feature = material['feature'] + self.proposal = material['proposal'] + self.fps = 5 + + def create_reader(self): + """ + create_reader + """ + image_feature_list = self.feature['image_feature'] + audio_feature_list = self.feature['audio_feature'] + pcm_feature_list = self.feature['pcm_feature'] + pcm_feature_list = pcm_feature_list.reshape((pcm_feature_list.shape[0] * 5, 640)) + + fl = self.proposal + + if self.mode == 'train': + random.shuffle(fl) + + def reader(): + """ + reader + """ + batch_out = [] + for prop_info in fl: + start_id = int(prop_info['start']) + end_id = int(prop_info['end']) + bmn_score = float(prop_info['score']) + try: + image_feature = image_feature_list[start_id: end_id] + audio_feature = audio_feature_list[int(start_id / self.fps): int(end_id / self.fps)] + pcm_feature = pcm_feature_list[start_id: end_id] + + image_feature = np.concatenate((image_feature, pcm_feature), axis=1) + + batch_out.append((image_feature, audio_feature, 0, prop_info)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + except Exception as e: + continue + return reader + diff --git a/docs/src/applications/BasketballAction/predict/action_detect/reader/reader_utils.py b/docs/src/applications/BasketballAction/predict/action_detect/reader/reader_utils.py new file mode 100644 index 000000000..f76b5d38d --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/reader/reader_utils.py @@ -0,0 +1,109 @@ +""" +reader_util +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
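FeatureReader above builds one LSTM sample per BMN proposal: the frame-level image and pcm features are sliced with the proposal's start/end frame ids, the second-level audio features with start/fps .. end/fps (fps is fixed to 5 here), and image and pcm features are concatenated along the feature axis. A rough sketch of that slicing; the feature widths are hypothetical, only the 640-d pcm width comes from the reshape above.

import numpy as np

fps = 5
image_feature_list = np.random.rand(1000, 2048).astype("float32")  # hypothetical: one row per frame
pcm_feature_list = np.random.rand(1000, 640).astype("float32")     # reshaped to 640-d as above
audio_feature_list = np.random.rand(200, 128).astype("float32")    # hypothetical: one row per second

prop_info = {"start": 150, "end": 250, "score": 0.8}
start_id, end_id = int(prop_info["start"]), int(prop_info["end"])

image_feature = np.concatenate(
    (image_feature_list[start_id:end_id], pcm_feature_list[start_id:end_id]), axis=1)
audio_feature = audio_feature_list[start_id // fps:end_id // fps]
sample = (image_feature, audio_feature, 0, prop_info)  # 0 is the label placeholder, as in create_reader()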
+ +import random +import numpy as np + + +class ReaderNotFoundError(Exception): + """ + "Error: reader not found" + """ + + def __init__(self, reader_name, avail_readers): + super(ReaderNotFoundError, self).__init__() + self.reader_name = reader_name + self.avail_readers = avail_readers + + def __str__(self): + msg = "Reader {} Not Found.\nAvailiable readers:\n".format( + self.reader_name) + for reader in self.avail_readers: + msg += " {}\n".format(reader) + return msg + + +class DataReader(object): + """ + data reader for video input + """ + + def __init__(self, model_name, mode, cfg): + self.name = model_name + self.mode = mode + self.cfg = cfg + + def create_reader(self): + """ + Not implemented + """ + pass + + def get_config_from_sec(self, sec, item, default=None): + """ + get_config_from_sec + """ + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + +class ReaderZoo(object): + """ + ReaderZoo + """ + def __init__(self): + """ + __init__ + """ + self.reader_zoo = {} + + def regist(self, name, reader): + """ + regist + """ + assert reader.__base__ == DataReader, "Unknow model type {}".format( + type(reader)) + self.reader_zoo[name] = reader + + def get(self, name, mode, cfg, material=None): + """ + get + """ + for k, v in self.reader_zoo.items(): + if k == name: + return v(name, mode, cfg, material) + raise ReaderNotFoundError(name, self.reader_zoo.keys()) + + +# singleton reader_zoo +reader_zoo = ReaderZoo() + + +def regist_reader(name, reader): + """ + regist_reader + """ + reader_zoo.regist(name, reader) + + +def get_reader(name, mode, cfg, material=None): + """ + get_reader + """ + reader_model = reader_zoo.get(name, mode, cfg, material) + return reader_model.create_reader() diff --git a/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py b/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py new file mode 100644 index 000000000..241ba4bc6 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/reader/tsminf_reader.py @@ -0,0 +1,366 @@ +""" +tsn frame reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import random +import functools +import concurrent.futures +import multiprocessing + +import numpy as np +import paddle +from PIL import Image, ImageEnhance + +from .reader_utils import DataReader + + +class TSMINFReader(DataReader): + """ + Data reader for video dataset of jpg folder. 
+ """ + def __init__(self, name, mode, cfg, material=None): + super(TSMINFReader, self).__init__(name, mode, cfg) + name = name.upper() + self.num_seg = cfg[name]['num_seg'] + self.seglen = cfg[name]['seglen'] + self.short_size = cfg[name]['short_size'] + self.target_size = cfg[name]['target_size'] + self.batch_size = cfg[name]['batch_size'] + self.reader_threads = cfg[name]['reader_threads'] + self.buf_size = cfg[name]['buf_size'] + self.video_path = cfg[name]['frame_list'] + + self.img_mean = np.array(cfg[name]['image_mean']).reshape( + [3, 1, 1]).astype(np.float32) + self.img_std = np.array(cfg[name]['image_std']).reshape( + [3, 1, 1]).astype(np.float32) + + self.material = material + + def create_reader(self): + """ + batch loader for TSN + """ + _reader = self._inference_reader_creator_longvideo( + self.video_path, + self.mode, + num_seg=self.num_seg, + seglen=self.seglen, + short_size=self.short_size, + target_size=self.target_size, + img_mean=self.img_mean, + img_std=self.img_std, + num_threads=self.reader_threads, + buf_size=self.buf_size) + + def _batch_reader(): + batch_out = [] + for imgs, label in _reader(): + if imgs is None: + continue + batch_out.append((imgs, label)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + if len(batch_out) > 1: + yield batch_out[:-1] + + return _batch_reader + + def _inference_reader_creator_longvideo(self, video_path, mode, num_seg, + seglen, short_size, target_size, + img_mean, img_std, num_threads, + buf_size): + """ + inference reader for video + """ + def reader(): + """ + reader + """ + def image_buf(image_id_path_buf): + """ + image_buf reader + """ + try: + img_path = image_id_path_buf[1] + img = Image.open(img_path).convert("RGB") + image_id_path_buf[2] = img + except: + image_id_path_buf[2] = None + + frame_len = len(video_path) + read_thread_num = num_seg + for i in range(0, frame_len, read_thread_num): + image_list_part = video_path[i:i + read_thread_num] + image_id_path_buf_list = [] + for k in range(len(image_list_part)): + image_id_path_buf_list.append([k, image_list_part[k], None]) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=read_thread_num) as executor: + executor.map( + lambda image_id_path_buf: image_buf(image_id_path_buf), + image_id_path_buf_list) + imgs_seg_list = [x[2] for x in image_id_path_buf_list] + + # add the fault-tolerant for bad image + for k in range(len(image_id_path_buf_list)): + img_buf = image_id_path_buf_list[k][2] + pad_id = 1 + while pad_id < num_seg and img_buf is None: + img_buf = imgs_seg_list[(k + pad_id) % num_seg][2] + if img_buf is None: + print("read img erro from {} to {}".format( + i, i + read_thread_num)) + exit(0) + else: + imgs_seg_list[k] = img_buf + for pad_id in range(len(imgs_seg_list), num_seg): + imgs_seg_list.append(imgs_seg_list[-1]) + yield imgs_seg_list + + + def inference_imgs_transform(imgs_list, mode, num_seg, seglen, short_size,\ + target_size, img_mean, img_std): + """ + inference_imgs_transform + """ + imgs_ret = imgs_transform(imgs_list, mode, num_seg, seglen, + short_size, target_size, img_mean, + img_std) + label_ret = 0 + + return imgs_ret, label_ret + + mapper = functools.partial(inference_imgs_transform, + mode=mode, + num_seg=num_seg, + seglen=seglen, + short_size=short_size, + target_size=target_size, + img_mean=img_mean, + img_std=img_std) + + return paddle.reader.xmap_readers(mapper, + reader, + num_threads, + buf_size, + order=True) + + +def imgs_transform(imgs, + mode, + num_seg, + seglen, + short_size, + target_size, + 
img_mean, + img_std, + name=''): + """ + imgs_transform + """ + imgs = group_scale(imgs, short_size) + + if mode == 'train': + if name == "TSM": + imgs = group_multi_scale_crop(imgs, short_size) + imgs = group_random_crop(imgs, target_size) + imgs = group_random_flip(imgs) + else: + imgs = group_center_crop(imgs, target_size) + + np_imgs = (np.array(imgs[0]).astype('float32').transpose( + (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255 + for i in range(len(imgs) - 1): + img = (np.array(imgs[i + 1]).astype('float32').transpose( + (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255 + np_imgs = np.concatenate((np_imgs, img)) + imgs = np_imgs + imgs -= img_mean + imgs /= img_std + imgs = np.reshape(imgs, (num_seg, seglen * 3, target_size, target_size)) + + return imgs + +def group_multi_scale_crop(img_group, target_size, scales=None, \ + max_distort=1, fix_crop=True, more_fix_crop=True): + """ + group_multi_scale_crop + """ + scales = scales if scales is not None else [1, .875, .75, .66] + input_size = [target_size, target_size] + + im_size = img_group[0].size + + # get random crop offset + def _sample_crop_size(im_size): + """ + _sample_crop_size + """ + image_w, image_h = im_size[0], im_size[1] + + base_size = min(image_w, image_h) + crop_sizes = [int(base_size * x) for x in scales] + crop_h = [ + input_size[1] if abs(x - input_size[1]) < 3 else x + for x in crop_sizes + ] + crop_w = [ + input_size[0] if abs(x - input_size[0]) < 3 else x + for x in crop_sizes + ] + + pairs = [] + for i, h in enumerate(crop_h): + for j, w in enumerate(crop_w): + if abs(i - j) <= max_distort: + pairs.append((w, h)) + + crop_pair = random.choice(pairs) + if not fix_crop: + w_offset = random.randint(0, image_w - crop_pair[0]) + h_offset = random.randint(0, image_h - crop_pair[1]) + else: + w_step = (image_w - crop_pair[0]) / 4 + h_step = (image_h - crop_pair[1]) / 4 + + ret = list() + ret.append((0, 0)) # upper left + if w_step != 0: + ret.append((4 * w_step, 0)) # upper right + if h_step != 0: + ret.append((0, 4 * h_step)) # lower left + if h_step != 0 and w_step != 0: + ret.append((4 * w_step, 4 * h_step)) # lower right + if h_step != 0 or w_step != 0: + ret.append((2 * w_step, 2 * h_step)) # center + + if more_fix_crop: + ret.append((0, 2 * h_step)) # center left + ret.append((4 * w_step, 2 * h_step)) # center right + ret.append((2 * w_step, 4 * h_step)) # lower center + ret.append((2 * w_step, 0 * h_step)) # upper center + + ret.append((1 * w_step, 1 * h_step)) # upper left quarter + ret.append((3 * w_step, 1 * h_step)) # upper right quarter + ret.append((1 * w_step, 3 * h_step)) # lower left quarter + ret.append((3 * w_step, 3 * h_step)) # lower righ quarter + + w_offset, h_offset = random.choice(ret) + crop_info = { + 'crop_w': crop_pair[0], + 'crop_h': crop_pair[1], + 'offset_w': w_offset, + 'offset_h': h_offset + } + + return crop_info + + crop_info = _sample_crop_size(im_size) + crop_w = crop_info['crop_w'] + crop_h = crop_info['crop_h'] + offset_w = crop_info['offset_w'] + offset_h = crop_info['offset_h'] + crop_img_group = [ + img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) + for img in img_group + ] + ret_img_group = [ + img.resize((input_size[0], input_size[1]), Image.BILINEAR) + for img in crop_img_group + ] + + return ret_img_group + + +def group_random_crop(img_group, target_size): + """ + group_random_crop + """ + w, h = img_group[0].size + th, tw = target_size, target_size + + assert (w >= target_size) and (h >= target_size), \ + "image width({}) and 
height({}) should be larger than crop size".format(w, h) + + out_images = [] + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + + for img in img_group: + if w == tw and h == th: + out_images.append(img) + else: + out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + + return out_images + + +def group_random_flip(img_group): + """ + group_random_flip + """ + v = random.random() + if v < 0.5: + ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] + return ret + else: + return img_group + + +def group_center_crop(img_group, target_size): + """ + group_center_crop + """ + img_crop = [] + for img in img_group: + w, h = img.size + th, tw = target_size, target_size + assert (w >= target_size) and (h >= target_size), \ + "image width({}) and height({}) should be larger than crop size".format(w, h) + x1 = int(round((w - tw) / 2.)) + y1 = int(round((h - th) / 2.)) + img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th))) + + return img_crop + + +def group_scale(imgs, target_size): + """ + group_scale + """ + resized_imgs = [] + for i in range(len(imgs)): + img = imgs[i] + w, h = img.size + if (w <= h and w == target_size) or (h <= w and h == target_size): + resized_imgs.append(img) + continue + + if w < h: + ow = target_size + oh = int(target_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + else: + oh = target_size + ow = int(target_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + + return resized_imgs diff --git a/docs/src/applications/BasketballAction/predict/action_detect/utils/config_utils.py b/docs/src/applications/BasketballAction/predict/action_detect/utils/config_utils.py new file mode 100644 index 000000000..e5db92b0d --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/utils/config_utils.py @@ -0,0 +1,80 @@ +""" +config_utils +""" +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
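parse_config below loads a YAML file into nested AttrDict objects, so the sections read throughout the predictors above (COMMON, PPTSM, BMN, ACTION) can be accessed as dict items or as attributes. A small usage sketch; the path matches the one used by predict.py, and the keys shown are ones the code above actually reads.

from utils.config_utils import parse_config, print_configs

cfg = parse_config("configs_basketball/configs_basketball.yaml")
print_configs(cfg, "Infer")

fps = cfg["COMMON"]["fps"]         # item access, as in the InferModel classes
model_file = cfg.PPTSM.model_file  # attribute access also works thanks to AttrDict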
+ +import yaml +import ast + +import logger + +logger = logger.Logger() + +CONFIG_SECS = [ + 'train', + 'valid', + 'test', + 'infer', +] + +class AttrDict(dict): + """ + AttrDict + """ + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + import yaml + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader)) + create_attr_dict(yaml_config) + return yaml_config + + +def create_attr_dict(yaml_config): + """create_attr_dict""" + for key, value in yaml_config.items(): + if isinstance(value, dict): + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = ast.literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + return + + +def print_configs(cfg, mode): + """print_configs""" + logger.info("---------------- {:>5} Arguments ----------------".format( + mode)) + for sec, sec_items in cfg.items(): + logger.info("{}:".format(sec)) + for k, v in sec_items.items(): + logger.info(" {}:{}".format(k, v)) + logger.info("-------------------------------------------------") diff --git a/docs/src/applications/BasketballAction/predict/action_detect/utils/preprocess.py b/docs/src/applications/BasketballAction/predict/action_detect/utils/preprocess.py new file mode 100644 index 000000000..d14aaf1ee --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/utils/preprocess.py @@ -0,0 +1,36 @@ +""" extract frames and pcm""" +import os +import sys +import shutil + + +def ffmpeg_frames(mp4_addr, frame_out_folder, fps=5): + """ffmpeg_frames""" + if os.path.exists(frame_out_folder): + shutil.rmtree(frame_out_folder) + os.makedirs(frame_out_folder) + cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (mp4_addr, fps, frame_out_folder, '%08d') + os.system(cmd) + + +def ffmpeg_pcm(mp4_addr, save_file_name): + """ffmpeg_pcm""" + cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \ + % (mp4_addr, save_file_name) + os.system(cmd) + + +def ffmpeg_mp4(mp4_url, mp4_addr): + """ffmpeg_mp4""" + cmd = "wget %s -O %s -q" % (mp4_url, mp4_addr) + print ("cmd = ", cmd) + os.system(cmd) + + +def get_images(image_path): + """get_images""" + images = sorted(os.listdir(image_path)) + images = images + images_path_list = [image_path + '/' + im for im in images] + return images_path_list + diff --git a/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py b/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py new file mode 100644 index 000000000..164869696 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/action_detect/utils/process_result.py @@ -0,0 +1,144 @@ +""" +# @File : process_result.py +# @Author: macaihong +# @Date : 2019/12/15 +# @Desc : +""" + +import sys +import os +import re +import numpy as np +import pickle +import json +import logger + +logger = logger.Logger() + + +def get_data_res(label_map, data, topk): + """get_data_res""" + sum_vid = len(data) + video_result = [] + for i in range(sum_vid): + vid_name = data[i][0][0] + # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa + feature_start_id = float(data[i][0][1]['start']) + feature_end_id = float(data[i][0][1]['end']) + 
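        # Layout of `data`, as produced by lstm_infer.InferModel.predict():
        # data[i][0] is the (id, proposal) pair, data[i][1] the top-k classification
        # scores, data[i][2] the top-k label ids and data[i][3] the iou score, so
        # data[i][0][1] is the BMN proposal dict with frame-level 'start', 'end' and 'score'.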
feature_stage1_score = data[i][0][1]['score'] + predict_res = [] + for k in range(topk): + score_top = data[i][1][k] + labelid_top = data[i][2][k] + label_iou = data[i][3] + labelname_top = label_map[str(labelid_top)] + video_result.append([feature_start_id, feature_end_id, labelid_top, labelname_top, score_top, label_iou]) + return video_result + + +def base_nms(bboxes, thresh, delta=0, nms_id=2): + """ + One-dimensional non-maximal suppression + :param bboxes: [[vid, label, st, ed, score, ...], ...] + :param thresh: + :return: + """ + """ + t1 = bboxes[:, 0] + t2 = bboxes[:, 1] + scores = bboxes[:, nms_id] + """ + + t1 = np.array([max(0, x[0] - delta) for x in bboxes]) + t2 = np.array([x[1] + delta for x in bboxes]) + scores = np.array([x[nms_id] for x in bboxes]) + + durations = t2 - t1 + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + tt1 = np.maximum(t1[i], t1[order[1:]]) + tt2 = np.minimum(t2[i], t2[order[1:]]) + intersection = tt2 - tt1 + IoU = intersection / (durations[i] + durations[order[1:]] - intersection).astype(float) + + inds = np.where(IoU <= thresh)[0] + order = order[inds + 1] + return [bboxes[i] for i in keep] + + +def process_proposal(source_prop_box, min_frame_thread=5, nms_thresh=0.7, score_thresh=0.01): + """process_video_prop""" + prop_box = [] + for items in source_prop_box: + start_frame = float(items[0]) + end_frame = float(items[1]) + score = float(items[2]) + if end_frame - start_frame < min_frame_thread or score < score_thresh: + continue + prop_box.append([start_frame, end_frame, score]) + + prop_box_keep = base_nms(prop_box, nms_thresh) + + prop_res = [] + for res in prop_box_keep: + prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]}) + + return prop_res + + +def process_video_classify(video_prop, fps, score_thread, iou_thread, \ + nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0): + """process_video_classify""" + prop_filter = [] + for item in video_prop: + if item[2] == backgroundid: + continue + prop_filter.append(item) + + # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True) + prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id) + prop_filter = sorted(prop_filter, key=lambda x: x[0]) + + video_results = [] + for item in prop_filter: + start_sec = item[0] / fps + end_sec = item[1] / fps + + start_id_frame = item[0] + end_id_frame = item[1] + # start_time = "%02d:%02d:%02d" % ((start_id_frame / fps) / 3600, \ + # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60) + # end_time = "%02d:%02d:%02d" % ((end_id_frame / fps) / 3600, \ + # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60) + start_time = int(start_id_frame / fps) + end_time = int(end_id_frame / fps) + + label_id = item[2] + label_name = item[3] + label_classify_score = item[4] + label_iou_score = item[5] + if label_classify_score > score_thread and label_iou_score > iou_thread: + video_results.append({"start_time": start_time, + "end_time": end_time, + "label_id": label_id, + "label_name": label_name, + "classify_score": label_classify_score, + "iou_score": label_iou_score}) + + return video_results + + +def get_action_result(result_info, label_map_file, fps, score_thread=0, \ + iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1): + """get_action_result""" + + label_map = json.load(open(label_map_file, 'r', encoding='utf-8')) + + org_result = get_data_res(label_map, result_info, topk) + nms_result = process_video_classify(org_result, fps, 
score_thread, iou_thread, nms_id, nms_thread, frame_offset) + + return nms_result diff --git a/docs/src/applications/BasketballAction/predict/eval.py b/docs/src/applications/BasketballAction/predict/eval.py new file mode 100644 index 000000000..f7fe5705b --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/eval.py @@ -0,0 +1,238 @@ +""" +get instance for lstm +根据gts计算每个proposal_bmn的iou、ioa、label等信息 +""" +import os +import sys +import json +import random +import pickle +import numpy as np + +import io +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding = 'utf-8') + +dataset = "datasets/" + +label_index_file = './configs_basketball/index_label_basketball_6.json' +eval_datasets = ['EuroCup2016'] +label_files = {'train': 'label_cls6_train.json', + 'validation': 'label_cls6_val.json'} + +global fps, mode +label_index = json.load(open(label_index_file, 'rb')) + +def load_gts(): + global fps + gts_data = {'fps': 0, 'gts': {}} + for eval_data in eval_datasets: + for item, value in label_files.items(): + label_file = '{}/{}/{}'.format(dataset, eval_data, value) + gts = json.load(open(label_file, 'rb')) + gts_data['fps'] = gts['fps'] + fps = gts['fps'] + for gt in gts['gts']: + gt['mode'] = item + basename = '{}/{}/mp4/{}'.format(dataset, eval_data, os.path.basename(gt['url'])) + gts_data['gts'][basename] = gt + return gts_data['gts'] + + +def computeIoU(e1, e2): + """ + clc iou and ioa + """ + if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']): + return 0. + area1 = e1["end"] - e1["start"] + area2 = e2["end"] - e2["start"] + x1 = np.maximum(e1["start"], e2["start"]) + x2 = np.minimum(e1["end"], e2["end"]) + inter = np.maximum(0.0, x2 - x1) + iou = 0.0 if (area1 + area2 - inter) == 0 else inter * 1.0 / (area1 + area2 - inter) + if not mode == 'proposal': + iou = 0.0 if area2 == 0 else inter * 1.0 / area2 + return iou + + +def convert_proposal(boxes, basename, score_threshold=0.01): + boxes = sorted(boxes, key=lambda x:float(x['score']), reverse=True) + res = [] + for box in boxes: + if not float(box['score']) >= score_threshold: + continue + res.append({'basename': basename, + 'start': int(float(box['start']) / fps), + 'end': int(float(box['end']) / fps), + 'label': 0}) + return res + +def convert_classify(boxes, basename, iou_threshold, score_threshold): + boxes = sorted(boxes, key=lambda x:(float(x['classify_score']), float(x['iou_score'])), reverse=True) + def convert_time_to_frame(time_type): + return int(time_type) + h, m, s = time_type.split(':') + return int(h) * 3600 + int(m) * 60 + int(s) + res = [] + for box in boxes: + if not (box['iou_score'] >= iou_threshold and + box['classify_score'] >= score_threshold): + continue + res.append({'basename': basename, + 'start': convert_time_to_frame(box['start_time']), + 'end': convert_time_to_frame(box['end_time']), + 'label': box['label_id']}) + return res + +def convert_groundtruth(boxes, basename, phase=None): + res = [] + for box in boxes: + for item in box['label_ids']: + label = 0 if phase == 'proposal' else item + res.append({'basename': basename, + 'start': box['start_id'], + 'end': box['end_id'], + 'label': label}) + return res +def print_head(iou): + print("\nioa = {:.1f}".format(iou)) + res_str = '' + for item in ['label_name']: + res_str += '{:<12s}'.format(item) + for item in ['label_id', 'precision', 'recall', 'hit_prop', 'num_prop', 'hit_gts', 'num_gts']: + res_str += '{:<10s}'.format(item) + print(res_str) + +def print_result(res_dict, label='avg'): + if label == 'avg': + res_str = 
'{:<22s}'.format(str(label)) + else: + res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)], str(label), chr(12288)) + + for item in ['prec', 'recall']: + res_str += '{:<10.4f}'.format(res_dict[item]) + for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']: + res_str += '{:<10d}'.format(res_dict[item]) + print(res_str) + +def evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = False): + iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \ + for gtsId in gts_boxes] + iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes))) + hit_map_prop_total = np.max(iou_map, axis=1) + hit_map_index_total = np.argmax(iou_map, axis=1) + + res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts'] + + for iou_threshold in iou_range: + if show_sub: + print_head(iou_threshold) + + iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total]) + average_results = {} + for label_id in label_range: + sub_results = {} + label_prop = np.array([k['label'] == label_id for k in res_boxes]) + label_gts = np.array([k['label'] == label_id for k in gts_boxes]) + sub_results['num_prop'] = sum(label_prop) + sub_results['num_gts'] = sum(label_gts) + if sub_results['num_prop'] == 0: + hit_prop_index = [] + else: + hit_prop_index = label_prop & iou_prop + sub_results['hit_prop'] = sum(hit_prop_index) + sub_results['hit_gts'] = len(set(hit_map_index_total[hit_prop_index])) + + sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \ + else sub_results['hit_prop'] * 1.0 / sub_results['num_prop'] + sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \ + else sub_results['hit_gts'] * 1.0 / sub_results['num_gts'] + if show_sub: + print_result(sub_results, label=label_id) + for item in res_dict: + if not item in average_results: + average_results[item] = 0 + average_results[item] += sub_results[item] + if len(label_range) == 1: # proposal 不需要输出average值 + continue + average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \ + else average_results['hit_prop'] * 1.0 / average_results['num_prop'] + average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \ + else average_results['hit_gts'] * 1.0 / average_results['num_gts'] + if show_sub: + print_result(average_results) + + average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \ + else 2 * average_results['prec'] * average_results['recall'] / \ + (average_results['prec'] + average_results['recall']) + return average_results + +def get_eval_results(predicts, gts_data, phase, iou_threshold = 0.3, score_threshold = 0.3, show_sub = False): + global mode + mode = phase + res_boxes = [] + gts_boxes = [] + for ped_data in predicts: + basename = ped_data['video_name'] + + # eval sub data + such_eval = False + for eval_name in eval_datasets: + if eval_name in basename: + such_eval = True + break + if not such_eval: + continue + + gts = gts_data[basename]['actions'] + if phase == 'proposal': + res_boxes.extend(convert_proposal(ped_data['bmn_results'], basename, score_threshold)) + gts_boxes.extend(convert_groundtruth(gts, basename, phase='proposal')) + label_range = [0] + iou_range = np.arange(0.1, 1, 0.1) + else: + res_boxes.extend(convert_classify(ped_data['action_results'], basename, iou_threshold, score_threshold)) + gts_boxes.extend(convert_groundtruth(gts, basename)) + label_range = range(1, len(label_index)) + iou_range = np.arange(0.5, 0.6, 0.1) + + eval_results = evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = show_sub) + + return eval_results + + 
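computeIoU above works on 1-D temporal segments: intersection over union of the two [start, end] ranges for the proposal phase, and intersection over the ground-truth span (IoA) for the classification phase. A standalone check of the arithmetic with two made-up segments:

# Hypothetical prediction and ground truth of the same label on the same video, in seconds.
pred = {"start": 10, "end": 20}
gt = {"start": 15, "end": 30}

inter = max(0.0, min(pred["end"], gt["end"]) - max(pred["start"], gt["start"]))  # 5
union = (pred["end"] - pred["start"]) + (gt["end"] - gt["start"]) - inter        # 10 + 15 - 5 = 20
iou = inter / union                       # 0.25, the proposal-phase score
ioa = inter / (gt["end"] - gt["start"])   # 5 / 15 = 0.33..., the classification-phase score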
+if __name__ == "__main__": + result_file = sys.argv[1] + predicts = json.load(open(result_file, 'r', encoding='utf-8')) + gts_data = load_gts() + + get_eval_results(predicts, gts_data, 'proposal', + score_threshold = 0.03, + show_sub = True) + #get_eval_results(predicts, gts_data, 'actions') + + best_F1 = -0.1 + best_res = {} + best_iou_threshold = 0. + best_score_threshold = 0. + for iou_threshold in np.arange(0.1, 0.9, 0.1): + for score_threshold in np.arange(0.1, 1, 0.1): + avg_res = get_eval_results(predicts, gts_data, 'actions', + iou_threshold = iou_threshold, + score_threshold = score_threshold, + show_sub = False) + if best_F1 < avg_res['F1']: + best_F1 = avg_res['F1'] + best_res = avg_res + best_iou_threshold = iou_threshold + best_score_threshold = score_threshold + print("best iou threshold = {:.1f}".format(best_iou_threshold)) + print("best score threshold = {:.1f}".format(best_score_threshold)) + print('best F1 score = {:.4f}'.format(best_F1)) + print_head(0.5) + print_result(best_res) + + get_eval_results(predicts, gts_data, 'actions', iou_threshold = best_iou_threshold, + score_threshold = best_score_threshold, + show_sub = True) + diff --git a/docs/src/applications/BasketballAction/predict/predict.py b/docs/src/applications/BasketballAction/predict/predict.py new file mode 100644 index 000000000..168d34125 --- /dev/null +++ b/docs/src/applications/BasketballAction/predict/predict.py @@ -0,0 +1,35 @@ + +import os +import sys +import json + +sys.path.append('action_detect') +from action import ActionDetection + +if __name__ == '__main__': + dataset_dir = "datasets/" + + model_predict = ActionDetection(cfg_file="configs_basketball/configs_basketball.yaml") + model_predict.load_model() + + video_url = os.path.join(dataset_dir, 'mp4.list') + with open(video_url, 'r') as f: + lines = f.readlines() + lines = [os.path.join(dataset_dir, "mp4", os.path.basename(k.strip())) for k in lines] + + results = [] + for line in lines: + video_name = line + print(video_name) + + imgs_path = video_name.replace(".mp4", "").replace("mp4", "frames") + pcm_path = video_name.replace(".mp4", ".pcm").replace("mp4", "pcm") + + bmn_results, action_results = model_predict.infer(imgs_path, pcm_path) + results.append({'video_name': line, + 'bmn_results': bmn_results, + 'action_results': action_results}) + + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) diff --git a/docs/src/applications/EIVideo/EIVideo/README.MD b/docs/src/applications/EIVideo/EIVideo/README.MD new file mode 100644 index 000000000..6dc0fe4b9 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/README.MD @@ -0,0 +1,15 @@ +# 交互式视频智能标注工具 - CLI(Command Line Interface) + +在开始使用之前,您需要按照以下命令安装额外的依赖包: +```bash +python -m pip install scikit-image +``` + +## 推理运行方式 +```shell + +C:\Python\Python37\python.exe main.py --test -c E:/PaddlePaddle_Project/EIVideo/resources/backend/configs/manet.yaml -w E:/PaddlePaddle_Project/EIVideo/resources/backend/model/save_step_80000.pdparams +C:\Python\Python37\python.exe resources/backend/main.py --test -c E:/PaddlePaddle_Project/EIVideo/resources/backend/configs/manet.yaml -w E:/PaddlePaddle_Project/EIVideo/resources/backend/model/save_step_80000.pdparams +``` +## 参考文档 +[manet](docs/zh-CN/manet.md) \ No newline at end of file diff --git a/docs/src/applications/EIVideo/EIVideo/__init__.py b/docs/src/applications/EIVideo/EIVideo/__init__.py new file mode 100644 index 000000000..17f7630ec --- /dev/null +++ 
b/docs/src/applications/EIVideo/EIVideo/__init__.py @@ -0,0 +1,16 @@ +# Author: Acer Zhang +# Datetime: 2022/1/6 +# Copyright belongs to the author. +# Please indicate the source for reprinting. + +import os +from EIVideo.version import __version__ + +EI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__)) +TEMP_IMG_SAVE_PATH = "./temp.png" +TEMP_JSON_SAVE_PATH = "./save.json" +TEMP_JSON_FINAL_PATH = "./final.json" + + +def join_root_path(path: str): + return os.path.join(EI_VIDEO_ROOT, path) diff --git a/docs/src/applications/EIVideo/EIVideo/api.py b/docs/src/applications/EIVideo/EIVideo/api.py new file mode 100644 index 000000000..009673cba --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/api.py @@ -0,0 +1,134 @@ +# Author: AP-Kai +# Datetime: 2022/1/10 +# Copyright belongs to the author. +# Please indicate the source for reprinting. + + +import json +import os +from collections import OrderedDict +import cv2 +import numpy as np +from PIL import Image + +from EIVideo.paddlevideo.utils.manet_utils import overlay_davis +from EIVideo import TEMP_JSON_SAVE_PATH, TEMP_JSON_FINAL_PATH + + +def get_images(sequence='bike-packing'): + img_path = os.path.join('data', sequence.strip(), 'frame') + img_files = os.listdir(img_path) + img_files.sort() + files = [] + for img in img_files: + img_file = np.array(Image.open(os.path.join(img_path, img))) + files.append(img_file) + return np.array(files) + + +def json2frame(path): + print("now turn masks.json to frames", path) + with open(path, 'r', encoding='utf-8') as f: + res = f.read() + a = json.loads(res) + b = a.get('overlays') + b_array = np.array(b) + frame_list = [] + + for i in range(0, len(b_array)): + im = Image.fromarray(np.uint8(b_array[i])) + im = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR) + im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + # im = np.array(b_array[i]).astype("uint8") + # im = im.transpose((2, 0, 1)) + # im = cv2.merge(im) + frame_list.append(im) + return frame_list + + +def png2json(image_path, sliderframenum, save_json_path): + image = Image.open(image_path) # 用PIL中的Image.open打开图像 + image = image.convert('P') + image_arr = np.array(image) # 转化成numpy数组 + image_arr = image_arr.astype("float32") + r1 = np.argwhere(image_arr == 1) # tuple + pframes = [] + # i -> object id + for i in range(1, len(np.unique(image_arr))): + pframe = OrderedDict() + pframe['path'] = [] + # Find object id in image_arr + r1 = np.argwhere(image_arr == i) # tuple + r1 = r1.astype("float32") + # Add path to pframe + for j in range(0, len(r1)): + r1[j][0] = r1[j][0] / 480.0 + r1[j][1] = r1[j][1] / 910.0 + # r1[j] = np.around(r1[j], decimals=16) + pframe['path'].append(r1[j].tolist()) + # Add object id, start_time, stop_time + pframe['object_id'] = i + pframe['start_time'] = sliderframenum + pframe['stop_time'] = sliderframenum + # Add pframe to pframes + pframes.append(pframe) + + dic = OrderedDict() + dic['scribbles'] = [] + for i in range(0, int(100)): + if i == sliderframenum: + # Add value to frame[] + dic['scribbles'].append(pframes) + else: + dic['scribbles'].append([]) + + json_str = json.dumps(dic) + with open(save_json_path, 'w') as json_file: + json_file.write(json_str) + + +def load_video(video_path, min_side=None): + frame_list = [] + # ToDo To AP-kai: 是不是轻松干掉了m.video_path? 
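    # The loop below decodes frames until cap.read() returns None, converts BGR to RGB
    # and, when min_side is given, rescales each frame so its shorter side equals
    # min_side while keeping the aspect ratio; all frames are finally stacked into one
    # (num_frames, H, W, 3) ndarray.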
+ cap = cv2.VideoCapture(video_path) + # ToDo To AP-kai: while (cap.isOpened()): -> 不必多写个括号哈 + while cap.isOpened(): + _, frame = cap.read() + if frame is None: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + if min_side: + h, w = frame.shape[:2] + new_w = (w * min_side // min(w, h)) + new_h = (h * min_side // min(w, h)) + frame = cv2.resize(frame, (new_w, new_h), + interpolation=cv2.INTER_CUBIC) + # .transpose([2, 0, 1]) + frame_list.append(frame) + frames = np.stack(frame_list, axis=0) + return frames, frame_list + + +def get_scribbles(): + # os.makedirs(TEMP_JSON_SAVE_PATH, exist_ok=True) + with open(TEMP_JSON_SAVE_PATH) as f: + print("load TEMP_JSON_SAVE_PATH success") + scribbles = json.load(f) + first_scribble = True + yield scribbles, first_scribble + + +def submit_masks(save_path, masks, images): + overlays = [] + for img_name, (mask, image) in enumerate(zip(masks, images)): + overlay = overlay_davis(image, mask) + overlays.append(overlay.tolist()) + overlay = Image.fromarray(overlay) + img_name = str(img_name) + while len(img_name) < 5: + img_name = '0' + img_name + overlay.save(os.path.join(save_path, img_name + '.png')) + result = {'overlays': overlays} + # result = {'masks': masks.tolist()} + with open(TEMP_JSON_FINAL_PATH, 'w') as f: + json.dump(result, f) diff --git a/docs/src/applications/EIVideo/EIVideo/main.py b/docs/src/applications/EIVideo/EIVideo/main.py new file mode 100644 index 000000000..f691e7616 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/main.py @@ -0,0 +1,116 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless requifFred by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
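api.py above provides the pieces of the interactive round trip: png2json converts a scribble mask into the save.json that get_scribbles later yields, submit_masks writes the predicted overlays to final.json, and json2frame turns that file back into displayable frames. A hedged usage sketch; the scribble image and min_side value are placeholders, and it assumes the manet test pipeline invoked by main.py is what calls submit_masks to produce final.json.

from EIVideo import TEMP_JSON_SAVE_PATH, TEMP_JSON_FINAL_PATH
from EIVideo.api import png2json, load_video, json2frame
from EIVideo.main import main

png2json("scribble.png", sliderframenum=0, save_json_path=TEMP_JSON_SAVE_PATH)  # hypothetical PNG
frames, frame_list = load_video("example/example1.mp4", min_side=480)

main(video_path="example/example1.mp4", save_path="./output")  # runs test_model with the scribbles

overlaid = json2frame(TEMP_JSON_FINAL_PATH)  # frames with the predicted masks drawn on top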
+import argparse +import random + +import numpy as np +import paddle + +from EIVideo.paddlevideo.tasks import (test_model) +from EIVideo.paddlevideo.utils import get_config, get_dist_info +from EIVideo import EI_VIDEO_ROOT, join_root_path + +DEF_CONFIG_FILE_PATH = join_root_path("configs/manet.yaml") +DEF_PARAMS_FILE_PATH = join_root_path("model/default_manet.pdparams") + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleVideo train script") + parser.add_argument('-c', + '--config', + type=str, + default=DEF_CONFIG_FILE_PATH, + help='config file path') + parser.add_argument('-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument('--test', + action='store_true', + help='whether to test a model') + parser.add_argument('--train_dali', + action='store_true', + help='whether to use dali to speed up training') + parser.add_argument('--multigrid', + action='store_true', + help='whether to use multigrid training') + parser.add_argument('-w', + '--weights', + type=str, + default=DEF_PARAMS_FILE_PATH, + help='weights for finetuning or testing') + parser.add_argument('--fleet', + action='store_true', + help='whether to use fleet run distributed training') + parser.add_argument('--amp', + action='store_true', + help='whether to open amp training.') + parser.add_argument( + '--validate', + action='store_true', + help='whether to evaluate the checkpoint during training') + parser.add_argument( + '--seed', + type=int, + default=None, + help='fixed all random seeds when the program is running') + parser.add_argument( + '--max_iters', + type=int, + default=None, + help='max iterations when training(this argonly used in test_tipc)') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format ' + '\"key1=value1;key2=value2;key3=value3\".') + parser.add_argument('--use_npu', + type=bool, + default=False, + help='whether use npu.') + + args = parser.parse_args() + return args + + +def main(**kwargs): + args = parse_args() + cfg = get_config(args.config, overrides=args.override) + # ToDo To AP-kai: 下面这行代码目的是更新配置,这样的话我们调用main(use_npu = Ture),这时cfg.use_npu就是Ture了 + for key, value in kwargs.items(): + cfg.__setattr__(key, value) + + # set seed if specified + seed = args.seed + if seed is not None: + assert isinstance( + seed, + int), f"seed must be a integer when specified, but got {seed}" + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + _, world_size = get_dist_info() + parallel = world_size != 1 + if parallel: + paddle.distributed.init_parallel_env() + final = test_model(cfg, weights=args.weights, parallel=parallel) + return final + + +if __name__ == '__main__': + main(video_path='example/example1.mp4', save_path='./output') diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/__init__.py new file mode 100644 index 000000000..8b03acf29 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .version import paddlevideo_version diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py new file mode 100644 index 000000000..232be145d --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .builder import build_batch_pipeline +from .pipelines.compose import Compose + +__all__ = [ + 'build_batch_pipeline','Compose' +] + diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py new file mode 100644 index 000000000..0e920aa25 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/builder.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import signal +import os +import paddle +from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler + +from .pipelines.compose import Compose +from .registry import DATASETS, PIPELINES, DATALOADERS, BATCH_SAMPLERS, SAMPLERS +from ..utils import get_logger +from ..utils.build_utils import build +import numpy as np + +logger = get_logger("paddlevideo") + + +def build_pipeline(cfg): + """Build pipeline. + Args: + cfg (dict): root config dict. + """ + if cfg == None: + return + return Compose(cfg) + + +def build_dataset(cfg): + """Build dataset. + Args: + cfg (dict): root config dict. + + Returns: + dataset: dataset. + """ + # XXX: ugly code here! + cfg_dataset, cfg_pipeline = cfg + cfg_dataset.pipeline = build_pipeline(cfg_pipeline) + dataset = build(cfg_dataset, DATASETS, key="format") + return dataset + + +def build_sampler(cfg): + """Build batch_sampler. + Args: + cfg (dict): root config dict. + + Returns: + batch_sampler: batch_sampler. 
+ """ + sampler = build(cfg, SAMPLERS) + return sampler + + +def build_batch_pipeline(cfg): + batch_pipeline = build(cfg, PIPELINES) + return batch_pipeline + + +def build_custom_dataloader(cfg): + custom_dataloader = build(cfg, DATALOADERS, key='dataloader') + return custom_dataloader + + +def build_dataloader(dataset, + batch_size, + num_workers, + places=None, + shuffle=True, + drop_last=True, + multigrid=False, + collate_fn_cfg=None, + **kwargs): + """Build Paddle Dataloader. + + XXX explain how the batch_sampler work! + + Args: + dataset (paddle.dataset): A PaddlePaddle dataset object. + batch_size (int): batch size on single card. + num_worker (int): num_worker + shuffle(bool): whether to shuffle the data at every epoch. + """ + + if not kwargs.get('sampler'): + batch_sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + else: + sampler = build_sampler(kwargs['sampler']) + batch_sampler = BatchSampler(dataset, + sampler=sampler, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + kwargs.update({'batch_sampler': batch_sampler}) + + # NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix. + + # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to: + # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose. + + def mix_collate_fn(batch): + pipeline = build_batch_pipeline(collate_fn_cfg) + batch = pipeline(batch) + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + # if collate_fn_cfg is not None: + # ugly code here. collate_fn is mix op config + # collate_fn = mix_collate_fn(collate_fn_cfg) + + data_loader = DataLoader( + dataset, + places=places, + num_workers=num_workers, + collate_fn=mix_collate_fn if collate_fn_cfg is not None else None, + **kwargs) + + return data_loader + + +def term_mp(sig_num, frame): + """ kill all child processes + """ + pid = os.getpid() + pgid = os.getpgid(os.getpid()) + logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid)) + os.killpg(pgid, signal.SIGKILL) + return + + +signal.signal(signal.SIGINT, term_mp) +signal.signal(signal.SIGTERM, term_mp) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py new file mode 100644 index 000000000..647989115 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
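When a `collate_fn_cfg` is given, `build_dataloader` above recollates each batch slot-wise so that every field is stacked across samples before being returned. A minimal reproduction of that recollation logic on toy data (the batch pipeline step itself is omitted here):

```python
import numpy as np


def recollate(batch):
    """Slot-wise recollation as in mix_collate_fn:
    [(img, label), (img, label), ...] -> [stacked_imgs, stacked_labels]."""
    slots = []
    for items in batch:
        for i, item in enumerate(items):
            if len(slots) < len(items):
                slots.append([item])
            else:
                slots[i].append(item)
    return [np.stack(slot, axis=0) for slot in slots]


# Two toy samples, each an (image, label) pair.
batch = [(np.zeros((3, 8, 8)), 0), (np.ones((3, 8, 8)), 1)]
imgs, labels = recollate(batch)
print(imgs.shape, labels.shape)   # (2, 3, 8, 8) (2,)
```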
+ +from .custom_transforms_f import Resize_manet, RandomCrop_manet, RandomHorizontalFlip_manet, ToTensor_manet, \ + RandomScale_manet + +__all__ = [ + 'Resize_manet', 'RandomCrop_manet', + 'RandomHorizontalFlip_manet', 'ToTensor_manet', 'RandomScale_manet', +] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py new file mode 100644 index 000000000..76eb4ed4d --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/compose.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Sequence +from ..registry import PIPELINES +import traceback +from ...utils import build +from ...utils import get_logger + + +@PIPELINES.register() +class Compose(object): + """ + Composes several pipelines(include decode func, sample func, and transforms) together. + + Note: To deal with ```list``` type cfg temporaray, like: + + transform: + - Crop: # A list + attribute: 10 + - Resize: # A list + attribute: 20 + + every key of list will pass as the key name to build a module. + XXX: will be improved in the future. + + Args: + pipelines (list): List of transforms to compose. + Returns: + A compose object which is callable, __call__ for this Compose + object will call each given :attr:`transforms` sequencely. + """ + def __init__(self, pipelines): + #assert isinstance(pipelines, Sequence) + self.pipelines = [] + for p in pipelines.values(): + if isinstance(p, dict): + p = build(p, PIPELINES) + self.pipelines.append(p) + elif isinstance(p, list): + for t in p: + #XXX: to deal with old format cfg, ugly code here! 
+ temp_dict = dict(name=list(t.keys())[0]) + for all_sub_t in t.values(): + if all_sub_t is not None: + temp_dict.update(all_sub_t) + + t = build(temp_dict, PIPELINES) + self.pipelines.append(t) + elif callable(p): + self.pipelines.append(p) + else: + raise TypeError(f'pipelines must be callable or a dict,' + f'but got {type(p)}') + def __call__(self, data): + for p in self.pipelines: + try: + data = p(data) + except Exception as e: + stack_info = traceback.format_exc() + logger = get_logger("paddlevideo") + logger.info("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(p, e, str(stack_info))) + raise e + return data diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py new file mode 100644 index 000000000..c2fc50633 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/pipelines/custom_transforms_f.py @@ -0,0 +1,220 @@ +import os +import random +import cv2 +import numpy as np +import paddle +from PIL import Image +from davisinteractive.utils.operations import bresenham + +from ..registry import PIPELINES + +cv2.setNumThreads(0) +NEW_BRANCH = True + + +@PIPELINES.register() +class RandomScale_manet(object): + """Randomly resize the image and the ground truth to specified scales. + Args: + scales (list): the list of scales + """ + def __init__(self, scales=[0.75, 1, 1.25]): + self.scales = scales + + def __call__(self, sample): + + # Fixed range of scales + sc = self.scales[random.randint(0, len(self.scales) - 1)] + + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + + if elem == 'img1' or elem == 'img2' or elem == 'ref_img': + flagval = cv2.INTER_CUBIC + else: + flagval = cv2.INTER_NEAREST + + tmp = cv2.resize(tmp, None, fx=sc, fy=sc, interpolation=flagval) + + sample[elem] = tmp + + return sample + + +@PIPELINES.register() +class Resize_manet(object): + """Rescale the image in a results to a given size. + + Args: + output_size (tuple or int): Desired output size. If tuple, output is + matched to output_size. If int, smaller of image edges is matched + to output_size keeping aspect ratio the same. + """ + def __init__(self, output_size): + assert isinstance(output_size, (int, list)) + if isinstance(output_size, int): + self.output_size = (output_size, output_size) + else: + self.output_size = output_size + + # self.seg_interpolation = cv2.INTER_CUBIC if is_continuous else cv2.INTER_NEAREST + # self.fix = fix + + def __call__(self, results): + img1 = results['img1'] + h, w = img1.shape[:2] + if self.output_size == (h, w): + return results + + else: + new_h, new_w = self.output_size + new_h, new_w = int(new_h), int(new_w) + for elem in results.keys(): + if 'meta' in elem: + continue + tmp = results[elem] + if elem == 'img1' or elem == 'img2' or elem == 'ref_img': + flagval = cv2.INTER_CUBIC + else: + flagval = cv2.INTER_NEAREST + + tmp = cv2.resize(tmp, dsize=(new_w, new_h), interpolation=flagval) + results[elem] = tmp + return results + + +@PIPELINES.register() +class RandomCrop_manet(object): + """Crop randomly the image in a results. + + Args: + output_size (tuple or int): Desired output size. If int, square crop + is made. 
+ """ + def __init__(self, output_size, step=None): + assert isinstance(output_size, (int, list)) + if isinstance(output_size, int): + self.output_size = (output_size, output_size) + else: + assert len(output_size) == 2 + self.output_size = output_size + self.step = step + + def __call__(self, results): + + image = results['img1'] + h, w = image.shape[:2] + new_h, new_w = self.output_size + + new_h = h if new_h >= h else new_h + new_w = w if new_w >= w else new_w + is_contain_obj = False + + # while (not is_contain_obj) and (step < 5): + if self.step is None: + while not is_contain_obj: + # step += 1 + top = np.random.randint(0, h - new_h + 1) + left = np.random.randint(0, w - new_w + 1) + ref_scribble_label = results['ref_scribble_label'] + new_ref_scribble_label = ref_scribble_label[top:top + new_h, + left:left + new_w] + if len(np.unique(new_ref_scribble_label)) == 1: + continue + else: + + for elem in results.keys(): + if 'meta' in elem: + continue + + tmp = results[elem] + tmp = tmp[top:top + new_h, left:left + new_w] + results[elem] = tmp + break + else: + st = 0 + while not is_contain_obj and st < self.step: + st += 1 + top = np.random.randint(0, h - new_h + 1) + left = np.random.randint(0, w - new_w + 1) + ref_scribble_label = results['ref_scribble_label'] + new_ref_scribble_label = ref_scribble_label[top:top + new_h, + left:left + new_w] + if len(np.unique( + new_ref_scribble_label)) == 1 or st < self.step - 1: + continue + else: + + for elem in results.keys(): + if 'meta' in elem: + continue + + tmp = results[elem] + tmp = tmp[top:top + new_h, left:left + new_w] + results[elem] = tmp + break + + return results + + +@PIPELINES.register() +class RandomHorizontalFlip_manet(object): + """Horizontally flip the given image and ground truth randomly with a probability of 0.5.""" + def __init__(self, prob): + self.p = prob + + def __call__(self, results): + + if random.random() < self.p: + for elem in results.keys(): + if 'meta' in elem: + continue + tmp = results[elem] + tmp = cv2.flip(tmp, flipCode=1) + results[elem] = tmp + + return results + + +@PIPELINES.register() +class ToTensor_manet(object): + """Convert ndarrays in results to Tensors.""" + def __call__(self, results): + + for elem in results.keys(): + if 'meta' in elem: + continue + tmp = results[elem] + + if tmp.ndim == 2: + tmp = tmp[:, :, np.newaxis] + else: + tmp = tmp / 255. + tmp -= (0.485, 0.456, 0.406) + tmp /= (0.229, 0.224, 0.225) + tmp = tmp.transpose([2, 0, 1]) + results[elem] = paddle.to_tensor(tmp) + return results + + +def gt_from_scribble(scr, dilation=11, nocare_area=21): + # Compute foreground + if scr.max() == 1: + kernel_fg = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (dilation, dilation)) + fg = cv2.dilate(scr.astype(np.uint8), + kernel=kernel_fg).astype(scr.dtype) + else: + fg = scr + + # Compute nocare area + if nocare_area is None: + nocare = None + else: + kernel_nc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (nocare_area, nocare_area)) + nocare = cv2.dilate(fg, kernel=kernel_nc) - fg + + return fg, nocare diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py new file mode 100644 index 000000000..0af97a1ef --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/loader/registry.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
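`ToTensor_manet` above scales 3-channel inputs to [0, 1], normalises them with the ImageNet mean and std, and reorders HWC to CHW, while 2-D label maps only gain a channel axis. The same arithmetic in plain NumPy, on a synthetic image and without the final `paddle.to_tensor` call:

```python
import numpy as np

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def to_chw_normalised(img_hwc_uint8):
    """Mirror of the image branch in ToTensor_manet, minus the tensor conversion."""
    x = img_hwc_uint8.astype(np.float64) / 255.0
    x -= IMAGENET_MEAN
    x /= IMAGENET_STD
    return x.transpose(2, 0, 1)           # HWC -> CHW


img = np.full((4, 4, 3), 128, dtype=np.uint8)
out = to_chw_normalised(img)
print(out.shape)                           # (3, 4, 4)
print(round(float(out[0, 0, 0]), 3))       # (128/255 - 0.485) / 0.229 ~ 0.074
```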
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +PIPELINES = Registry("pipeline") +DATASETS = Registry("datasets") +SAMPLERS = Registry("sampler") +BATCH_SAMPLERS = Registry("batch_sampler") +DATALOADERS = Registry("dataloader") diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py new file mode 100644 index 000000000..844dc3d61 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .vos_metric import VOSMetric +from .build import build_metric + +__all__ = [ + 'VOSMetric', "build_metric" +] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py new file mode 100644 index 000000000..06302597b --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/base.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from EIVideo.paddlevideo.utils import get_dist_info + + +class BaseMetric(object): + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + self.data_size = data_size + self.batch_size = batch_size + _, self.world_size = get_dist_info() + self.log_interval = log_interval + + @abstractmethod + def update(self): + raise NotImplemented + + @abstractmethod + def accumulate(self): + raise NotImplemented diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py new file mode 100644 index 000000000..82e4b5026 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/build.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
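`BaseMetric` above fixes the contract a metric follows: it is built with `data_size` and `batch_size`, gathers state in `update()` once per iteration, and reports in `accumulate()` after the last one. A framework-free sketch of that shape using a hypothetical running-accuracy metric (not a class that exists in this repo):

```python
class RunningAccuracy:
    """Follows the BaseMetric update()/accumulate() contract shown above."""

    def __init__(self, data_size, batch_size, log_interval=1):
        self.data_size = data_size
        self.batch_size = batch_size
        self.log_interval = log_interval
        self.correct = 0
        self.total = 0

    def update(self, batch_id, preds, labels):
        # Called once per iteration with the outputs for that batch.
        self.correct += sum(int(p == l) for p, l in zip(preds, labels))
        self.total += len(labels)
        if batch_id % self.log_interval == 0:
            print(f"batch {batch_id}: acc so far {self.correct / self.total:.3f}")

    def accumulate(self):
        # Called once after the final iteration.
        return self.correct / max(self.total, 1)


m = RunningAccuracy(data_size=4, batch_size=2)
m.update(0, preds=[1, 0], labels=[1, 1])
m.update(1, preds=[1, 1], labels=[1, 1])
print(m.accumulate())   # 0.75
```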
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import METRIC +from ..utils import build + + +def build_metric(cfg): + return build(cfg, METRIC) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py new file mode 100644 index 000000000..221444023 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/registry.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +METRIC = Registry('metric') diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py new file mode 100644 index 000000000..758564508 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/metrics/vos_metric.py @@ -0,0 +1,279 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
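`build_metric` above simply hands the config to the shared `build()` helper, which looks the class up in the `METRIC` registry and instantiates it with the remaining options. The lookup key of the real `build()` is not shown in this diff, so the sketch below assumes a `name` field purely for illustration, with a minimal stand-in registry:

```python
class Registry:
    """Minimal stand-in for the utils.Registry used above."""

    def __init__(self, name):
        self.name = name
        self._map = {}

    def register(self, cls):
        self._map[cls.__name__] = cls
        return cls

    def get(self, name):
        return self._map[name]


METRIC = Registry('metric')


@METRIC.register
class DummyMetric:
    def __init__(self, data_size, batch_size):
        self.data_size, self.batch_size = data_size, batch_size


def build_metric(cfg):
    cfg = dict(cfg)
    cls = METRIC.get(cfg.pop('name'))   # assumed lookup key, see note above
    return cls(**cfg)


metric = build_metric({'name': 'DummyMetric', 'data_size': 30, 'batch_size': 1})
print(type(metric).__name__, metric.data_size)   # DummyMetric 30
```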
+ +import os +import paddle +import zipfile +import time +from PIL import Image + +from paddle.io import DataLoader + +from .registry import METRIC +from .base import BaseMetric +from EIVideo.paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class VOSMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + result_root, + zip_dir, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.video_num = 0 + self.total_time = 0 + self.total_frame = 0 + self.total_sfps = 0 + self.total_video_num = data_size + self.count = 0 + self.result_root = result_root + self.zip_dir = zip_dir + + def update(self, batch_id, data, model): + """update metrics during each iter + """ + self.video_num += 1 + seq_dataset = data + seq_name = seq_dataset.seq_name + + logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name, + self.video_num, + self.total_video_num)) + seq_dataloader = DataLoader(seq_dataset, + return_list=True, + batch_size=1, + shuffle=False, + num_workers=0) + seq_total_time = 0 + seq_total_frame = 0 + ref_embeddings = [] + ref_masks = [] + prev_embedding = [] + prev_mask = [] + with paddle.no_grad(): + for frame_idx, samples in enumerate(seq_dataloader): + time_start = time.time() + all_preds = [] + join_label = None + for aug_idx in range(len(samples)): + if len(ref_embeddings) <= aug_idx: + ref_embeddings.append([]) + ref_masks.append([]) + prev_embedding.append(None) + prev_mask.append(None) + + sample = samples[aug_idx] + ref_emb = ref_embeddings[aug_idx] + ref_m = ref_masks[aug_idx] + prev_emb = prev_embedding[aug_idx] + prev_m = prev_mask[aug_idx] + + current_img = sample['current_img'] + if 'current_label' in sample.keys(): + current_label = sample['current_label'] + current_label = paddle.to_tensor(current_label) + else: + current_label = None + + obj_num = sample['meta']['obj_num'] + imgname = sample['meta']['current_name'] + ori_height = sample['meta']['height'] + ori_width = sample['meta']['width'] + current_img = current_img + obj_num = obj_num + bs, _, h, w = current_img.shape + data_batch = [ + ref_emb, ref_m, prev_emb, prev_m, current_img, + [ori_height, ori_width], obj_num + ] + + all_pred, current_embedding = model(data_batch, + mode='test') + + if frame_idx == 0: + if current_label is None: + logger.info( + "No first frame label in Seq {}.".format( + seq_name)) + ref_embeddings[aug_idx].append(current_embedding) + ref_masks[aug_idx].append(current_label) + + prev_embedding[aug_idx] = current_embedding + prev_mask[aug_idx] = current_label + else: + if sample['meta']['flip']: #False + all_pred = self.flip_tensor(all_pred, 3) + # In YouTube-VOS, not all the objects appear in the first frame for the first time. Thus, we + # have to introduce new labels for new objects, if necessary. 
+ if not sample['meta']['flip'] and not ( + current_label is None) and join_label is None: + join_label = paddle.cast(current_label, + dtype='int64') + all_preds.append(all_pred) + if current_label is not None: + ref_embeddings[aug_idx].append(current_embedding) + prev_embedding[aug_idx] = current_embedding + + if frame_idx > 0: + all_preds = paddle.concat(all_preds, axis=0) + all_preds = paddle.mean( + all_preds, axis=0) #average results if augmentation + pred_label = paddle.argmax(all_preds, axis=0) + if join_label is not None: + join_label = paddle.squeeze(paddle.squeeze(join_label, + axis=0), + axis=0) + keep = paddle.cast((join_label == 0), dtype="int64") + pred_label = pred_label * keep + join_label * (1 - + keep) + pred_label = pred_label + current_label = paddle.reshape( + pred_label, shape=[1, 1, ori_height, ori_width]) + flip_pred_label = self.flip_tensor(pred_label, 1) + flip_current_label = paddle.reshape( + flip_pred_label, shape=[1, 1, ori_height, ori_width]) + + for aug_idx in range(len(samples)): + if join_label is not None: + if samples[aug_idx]['meta']['flip']: + ref_masks[aug_idx].append(flip_current_label) + else: + ref_masks[aug_idx].append(current_label) + if samples[aug_idx]['meta']['flip']: + prev_mask[aug_idx] = flip_current_label + else: + prev_mask[ + aug_idx] = current_label #update prev_mask + + one_frametime = time.time() - time_start + seq_total_time += one_frametime + seq_total_frame += 1 + obj_num = float(obj_num) + logger.info('Frame: {}, Obj Num: {}, Time: {}'.format( + imgname[0], obj_num, one_frametime)) + self.save_mask( + pred_label, + os.path.join(self.result_root, seq_name, + imgname[0].split('.')[0] + '.png')) + else: + one_frametime = time.time() - time_start + seq_total_time += one_frametime + logger.info('Ref Frame: {}, Time: {}'.format( + imgname[0], one_frametime)) + + del (ref_embeddings) + del (ref_masks) + del (prev_embedding) + del (prev_mask) + del (seq_dataset) + del (seq_dataloader) + + seq_avg_time_per_frame = seq_total_time / seq_total_frame + self.total_time += seq_total_time + self.total_frame += seq_total_frame + total_avg_time_per_frame = self.total_time / self.total_frame + self.total_sfps += seq_avg_time_per_frame + avg_sfps = self.total_sfps / (batch_id + 1) + logger.info("Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}".format( + seq_name, 1. / seq_avg_time_per_frame, + 1. / total_avg_time_per_frame, 1. 
/ avg_sfps)) + + def flip_tensor(self, tensor, dim=0): + inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1), + dtype="int64") + tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim) + return tensor + + def save_mask(self, mask_tensor, path): + _palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, + 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, + 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, + 64, 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64, + 128, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, + 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, + 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, + 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, + 49, 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, + 55, 55, 56, 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, + 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, + 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, + 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, + 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, + 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89, + 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94, + 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, + 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, + 104, 105, 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, + 109, 109, 109, 110, 110, 110, 111, 111, 111, 112, 112, 112, 113, + 113, 113, 114, 114, 114, 115, 115, 115, 116, 116, 116, 117, 117, + 117, 118, 118, 118, 119, 119, 119, 120, 120, 120, 121, 121, 121, + 122, 122, 122, 123, 123, 123, 124, 124, 124, 125, 125, 125, 126, + 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, + 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, + 135, 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, + 139, 139, 140, 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, + 143, 144, 144, 144, 145, 145, 145, 146, 146, 146, 147, 147, 147, + 148, 148, 148, 149, 149, 149, 150, 150, 150, 151, 151, 151, 152, + 152, 152, 153, 153, 153, 154, 154, 154, 155, 155, 155, 156, 156, + 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160, 160, 160, + 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, + 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, + 169, 170, 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, + 174, 174, 174, 175, 175, 175, 176, 176, 176, 177, 177, 177, 178, + 178, 178, 179, 179, 179, 180, 180, 180, 181, 181, 181, 182, 182, + 182, 183, 183, 183, 184, 184, 184, 185, 185, 185, 186, 186, 186, + 187, 187, 187, 188, 188, 188, 189, 189, 189, 190, 190, 190, 191, + 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195, 195, + 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, + 200, 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, + 204, 204, 205, 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, + 208, 209, 209, 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, + 213, 213, 213, 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, + 217, 217, 218, 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, + 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225, 225, 225, + 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, + 230, 230, 231, 
231, 231, 232, 232, 232, 233, 233, 233, 234, 234, + 234, 235, 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, + 239, 239, 239, 240, 240, 240, 241, 241, 241, 242, 242, 242, 243, + 243, 243, 244, 244, 244, 245, 245, 245, 246, 246, 246, 247, 247, + 247, 248, 248, 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, + 252, 252, 252, 253, 253, 253, 254, 254, 254, 255, 255, 255 + ] + mask = mask_tensor.cpu().numpy().astype('uint8') + mask = Image.fromarray(mask).convert('P') + mask.putpalette(_palette) + mask.save(path) + + def zip_folder(self, source_folder, zip_dir): + f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED) + pre_len = len(os.path.dirname(source_folder)) + for dirpath, dirnames, filenames in os.walk(source_folder): + for filename in filenames: + pathfile = os.path.join(dirpath, filename) + arcname = pathfile[pre_len:].strip(os.path.sep) + f.write(pathfile, arcname) + f.close() + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + self.zip_folder(self.result_root, self.zip_dir) + logger.info('Save result to {}.'.format(self.zip_dir)) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py new file mode 100644 index 000000000..7d5ddcd63 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .backbones import DeepLab +from .builder import (build_backbone, build_head, build_localizer, build_loss, + build_recognizer) +from .heads import IntVOS +from .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES, + PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) +from .weight_init import kaiming_normal_, trunc_normal_, weight_init_ + +__all__ = [ + 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS', + 'LOSSES', 'build_recognizer', 'build_localizer', 'build_head', + 'build_backbone', 'build_loss', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_', + 'weight_init_', 'DeepLab', 'IntVOS' +] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py new file mode 100644 index 000000000..2a715feb3 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
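`save_mask` above (in `vos_metric.py`) stores each predicted label map as a palettised PNG: the integer mask becomes a `P`-mode image and `_palette` supplies the per-index colours, matching the DAVIS/YouTube-VOS annotation format. A tiny standalone version using only the first four colours of that palette:

```python
import numpy as np
from PIL import Image

# First four entries of the DAVIS-style palette used above (flattened RGB triplets).
palette = [0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0]

mask = np.array([[0, 1], [2, 3]], dtype=np.uint8)      # toy 2x2 label map
im = Image.fromarray(mask).convert('P')
im.putpalette(palette + [0] * (768 - len(palette)))    # pad palette to 256 colours
im.save('toy_mask.png')
print(Image.open('toy_mask.png').mode)   # 'P': indices are stored, palette maps them to RGB
```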
+# See the License for the specific language governing permissions and +# limitations under the License. +from .deeplab_manet import DeepLab + +__all__ = ['DeepLab'] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py new file mode 100644 index 000000000..819c43bc1 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/aspp_manet.py @@ -0,0 +1,124 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_ + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation, + BatchNorm): + super(_ASPPModule, self).__init__() + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = BatchNorm(planes) + self.relu = nn.ReLU(True) + + self._init_weight() + + def forward(self, x): + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + from EIVideo.paddlevideo.utils.manet_utils import fill_ + fill_(m.weight, 1) + from EIVideo.paddlevideo.utils.manet_utils import zero_ + zero_(m.bias) + + +class ASPP(nn.Layer): + def __init__(self, backbone, output_stride, BatchNorm): + super(ASPP, self).__init__() + if backbone == 'drn': + inplanes = 512 + elif backbone == 'mobilenet': + inplanes = 320 + else: + inplanes = 2048 + if output_stride == 16: + dilations = [1, 6, 12, 18] + elif output_stride == 8: + dilations = [1, 12, 24, 36] + else: + raise NotImplementedError + + self.aspp1 = _ASPPModule(inplanes, + 256, + 1, + padding=0, + dilation=dilations[0], + BatchNorm=BatchNorm) + self.aspp2 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.aspp3 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.aspp4 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False), + BatchNorm(256), nn.ReLU()) + self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False) + self.bn1 = BatchNorm(256) + self.relu = nn.ReLU(True) + self.dropout = nn.Dropout(0.1) + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat((x1, x2, x3, x4, x5), axis=1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return x + return self.dropout(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + # n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + # m.weight.normal_(0, math.sqrt(2. 
/ n)) + kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + from EIVideo.paddlevideo.utils.manet_utils import fill_ + fill_(m.weight, 1) + from EIVideo.paddlevideo.utils.manet_utils import zero_ + zero_(m.bias) + + +def build_aspp(backbone, output_stride, BatchNorm): + return ASPP(backbone, output_stride, BatchNorm) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py new file mode 100644 index 000000000..9f80dd41d --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/decoder_manet.py @@ -0,0 +1,65 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_ + + +class Decoder(nn.Layer): + def __init__(self, num_classes, backbone, BatchNorm): + super(Decoder, self).__init__() + if backbone == 'resnet' or backbone == 'drn' or backbone == 'resnet_edge': + low_level_inplanes = 256 + elif backbone == 'xception': + low_level_inplanes = 128 + elif backbone == 'mobilenet': + low_level_inplanes = 24 + else: + raise NotImplementedError + + self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False) + self.bn1 = BatchNorm(48) + self.relu = nn.ReLU(True) + self.last_conv = nn.Sequential( + nn.Conv2D(304, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(True), + nn.Sequential(), + nn.Conv2D(256, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(True), + nn.Sequential()) + self._init_weight() + + def forward(self, x, low_level_feat): + low_level_feat = self.conv1(low_level_feat) + low_level_feat = self.bn1(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat((x, low_level_feat), axis=1) + x = self.last_conv(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + from EIVideo.paddlevideo.utils.manet_utils import fill_ + fill_(m.weight, 1) + from EIVideo.paddlevideo.utils.manet_utils import zero_ + zero_(m.bias) + + +def build_decoder(num_classes, backbone, BatchNorm): + return Decoder(num_classes, backbone, BatchNorm) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py new file mode 100644 index 000000000..d188dc143 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/deeplab_manet.py @@ -0,0 +1,90 @@ +import paddle +import paddle.nn as nn + +from ..registry import BACKBONES +from EIVideo.paddlevideo.modeling.backbones.aspp_manet import build_aspp +from EIVideo.paddlevideo.modeling.backbones.decoder_manet import build_decoder +from EIVideo.paddlevideo.modeling.backbones.resnet_manet import build_backbone + + +class FrozenBatchNorm2d(nn.Layer): + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", paddle.ones(n)) + self.register_buffer("bias", paddle.zeros(n)) + self.register_buffer("running_mean", paddle.zeros(n)) + self.register_buffer("running_var", paddle.ones(n)) + + def forward(self, x): + if x.dtype == paddle.float16: + self.weight = self.weight.half() + self.bias = self.bias.half() + 
self.running_mean = self.running_mean.half() + self.running_var = self.running_var.half() + scale = self.weight * self.running_var.rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + return x * scale + bias + + +@BACKBONES.register() +class DeepLab(nn.Layer): + def __init__(self, + backbone='resnet', + output_stride=16, + num_classes=21, + freeze_bn=False, + pretrained=None): + super(DeepLab, self).__init__() + if backbone == 'drn': + output_stride = 8 + if freeze_bn == True: + print("Use frozen BN in DeepLab") + BatchNorm = FrozenBatchNorm2d + else: + BatchNorm = nn.BatchNorm2D + + self.backbone = build_backbone(output_stride, BatchNorm, pretrained) + self.aspp = build_aspp(backbone, output_stride, BatchNorm) + self.decoder = build_decoder(num_classes, backbone, BatchNorm) + + + def forward(self, input): + x, low_level_feat = self.backbone(input) + x = self.aspp(x) + x = self.decoder(x, low_level_feat) + return x + + def freeze_bn(self): + for m in self.sublayers(): + if isinstance(m, nn.BatchNorm2D): + m.eval() + + def get_1x_lr_params(self): + modules = [self.backbone] + for i in range(len(modules)): + for m in modules[i].named_modules(): + if isinstance(m[1], nn.Conv2D) or isinstance( + m[1], nn.BatchNorm2D): + for p in m[1].parameters(): + if p.requires_grad: + yield p + + def get_10x_lr_params(self): + modules = [self.aspp, self.decoder] + for i in range(len(modules)): + for m in modules[i].named_modules(): + if isinstance(m[1], nn.Conv2D) or isinstance( + m[1], nn.BatchNorm2D): + for p in m[1].parameters(): + if p.requires_grad: + yield p + + +if __name__ == "__main__": + model = DeepLab(backbone='resnet', output_stride=16) + model.eval() + input = paddle.rand([2, 3, 513, 513]) + output = model(input) + print(output.shape) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py new file mode 100644 index 000000000..2a490956d --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/backbones/resnet_manet.py @@ -0,0 +1,245 @@ +import paddle.nn as nn +# from reprod_log.utils import paddle2np + +from EIVideo.paddlevideo.utils.manet_utils import fill_, zero_ + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + BatchNorm=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = BatchNorm(planes) + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = BatchNorm(planes) + self.conv3 = nn.Conv2D(planes, + planes * 4, + kernel_size=1, + bias_attr=False) + self.bn3 = BatchNorm(planes * 4) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + self.dilation = dilation + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Layer): + def __init__(self, + block, + layers, + output_stride, + BatchNorm, + pretrained=None): + self.inplanes = 64 + super(ResNet, self).__init__() + blocks 
= [1, 2, 4] + if output_stride == 16: + strides = [1, 2, 2, 1] + dilations = [1, 1, 1, 2] + elif output_stride == 8: + strides = [1, 2, 1, 1] + dilations = [1, 1, 2, 4] + else: + raise NotImplementedError + + # Modules + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = BatchNorm(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, + 64, + layers[0], + stride=strides[0], + dilation=dilations[0], + BatchNorm=BatchNorm) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=strides[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=strides[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.layer4 = self._make_MG_unit(block, + 512, + blocks=blocks, + stride=strides[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + self.init_weight() + + + + def _make_layer(self, + block, + planes, + blocks, + stride=1, + dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, dilation, downsample, + BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + dilation=dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def _make_MG_unit(self, + block, + planes, + blocks, + stride=1, + dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, + planes, + stride, + dilation=blocks[0] * dilation, + downsample=downsample, + BatchNorm=BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, len(blocks)): + layers.append( + block(self.inplanes, + planes, + stride=1, + dilation=blocks[i] * dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def forward(self, input): + x = self.conv1(input) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + low_level_feat = x + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x, low_level_feat + + def init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fill_(m.weight, 1) + elif isinstance(m, nn.BatchNorm2D): + fill_(m.weight, 1) + zero_(m.bias) + return self.sublayers() + + + + +def ResNet101(output_stride, BatchNorm, pretrained=None): + """Constructs a ResNet-101 model. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 23, 3], + output_stride, + BatchNorm, + pretrained=pretrained) + return model + + +def build_backbone(output_stride, BatchNorm, pretrained): + return ResNet101(output_stride, BatchNorm, pretrained) + + +if __name__ == "__main__": + import paddle + + model = ResNet101(BatchNorm=nn.BatchNorm2D, + pretrained=True, + output_stride=8) + input = paddle.rand([1, 3, 512, 512]) + output, low_level_feat = model(input) + print(output.shape) + print(low_level_feat.shape) + import json + + with open('output.txt', 'w') as f: + json.dump(output.tolist(), f) + with open('low_level_feat.txt', 'w') as f: + json.dump(low_level_feat.tolist(), f) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py new file mode 100644 index 000000000..d131ca48e --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/builder.py @@ -0,0 +1,125 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT +from ..utils import build +from .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS, + DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES, + MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) + + +def build_backbone(cfg): + """Build backbone.""" + return build(cfg, BACKBONES) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box batch_sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box batch_sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_head(cfg): + """Build head.""" + return build(cfg, HEADS) + + +def build_loss(cfg): + """Build loss.""" + return build(cfg, LOSSES) + + +def build_recognizer(cfg): + """Build recognizer.""" + return build(cfg, RECOGNIZERS, key='framework') + + +def build_localizer(cfg): + """Build localizer.""" + return build(cfg, LOCALIZERS, key='framework') + + +def build_segmentationer(cfg): + """Build detector.""" + return build(cfg, SEGMENT, key='framework') + + +def build_partitioner(cfg): + """Build partitioner.""" + return build(cfg, PARTITIONERS, key='framework') + + +def build_estimator(cfg): + """Build estimator.""" + return build(cfg, ESTIMATORS, key='framework') + + +def build_multimodal(cfg): + """Build 
multimodal.""" + return build(cfg, MULTIMODAL, key='framework') + + +def build_detector(cfg): + """Build multimodal.""" + return build(cfg, DETECTORS, key='framework') + + +def build_segment(cfg): + """Build segment.""" + return build(cfg, SEGMENT, key='framework') + + +def build_model(cfg, key='framework'): + cfg_copy = cfg.copy() + framework_type = cfg_copy.get(key) + if framework_type in RECOGNIZERS: + return build_recognizer(cfg) + elif framework_type in LOCALIZERS: + return build_localizer(cfg) + elif framework_type in PARTITIONERS: + return build_partitioner(cfg) + elif framework_type in DETECTORS: + return build_detector(cfg) + elif framework_type in ESTIMATORS: + return build_estimator(cfg) + elif framework_type in MULTIMODAL: + return build_multimodal(cfg) + elif framework_type in SEGMENT: + return build_segment(cfg) + else: + raise NotImplementedError diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py new file mode 100644 index 000000000..a7e528a80 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .segment import BaseSegment, Manet + +__all__ = ['BaseSegment', + 'Manet' +] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py new file mode 100644 index 000000000..8db3adf82 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .base import BaseSegment +from .manet_stage1 import Manet + +__all__ = [ + 'BaseSegment', + 'Manet', +] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py new file mode 100644 index 000000000..b5bb53945 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/base.py @@ -0,0 +1,95 @@ + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
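`build_model` above dispatches on the `framework` field of the model config, trying each registry in turn, so a config whose framework is `Manet` reaches `build_segment` (Manet is registered in `SEGMENT` further down in this diff). A toy illustration of that membership-based dispatch, with plain dicts standing in for the real `Registry` objects:

```python
# Toy registries: name -> constructor. The real ones are Registry instances.
RECOGNIZERS = {}
SEGMENT = {'Manet': lambda cfg: f"Manet segment model ({cfg['backbone']})"}


def build_model(cfg, key='framework'):
    framework = cfg[key]
    if framework in RECOGNIZERS:
        return RECOGNIZERS[framework](cfg)
    if framework in SEGMENT:
        return SEGMENT[framework](cfg)
    raise NotImplementedError(framework)


print(build_model({'framework': 'Manet', 'backbone': 'DeepLab'}))
# Manet segment model (DeepLab)
```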
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseSegment(nn.Layer): + """Base class for semi-Video Object Segmentation. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head to process feature. + loss(dict): Loss function. + """ + def __init__(self, backbone=None, head=None, loss=None): + super().__init__() + if backbone != None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head != None: + self.head_name = head.name + if head.name == 'IntVOS': + head.update({'feature_extracter': self.backbone}) + self.head = builder.build_head(head) + else: + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + if loss != None: + self.loss = builder.build_loss(loss) + else: + self.loss = None + + def forward(self, data_batch, mode='infer', **kwargs): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch, **kwargs) + elif mode == 'valid': + return self.val_step(data_batch, **kwargs) + elif mode == 'test': + return self.test_step(data_batch, **kwargs) + elif mode == 'infer': + return self.infer_step(data_batch, **kwargs) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py new file mode 100644 index 000000000..875c508c1 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/framework/segment/manet_stage1.py @@ -0,0 +1,417 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
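`BaseSegment` above is mostly a router: `forward(data_batch, mode=...)` forwards to `train_step`, `val_step`, `test_step` or `infer_step`, and a concrete framework such as `Manet` in the next file overrides whichever steps it needs. The dispatch pattern in isolation, with a stand-in base class so the sketch needs neither paddle nor the builder module:

```python
from abc import abstractmethod


class SegmentBaseSketch:
    """Stand-in for BaseSegment: only the mode-based dispatch is reproduced."""

    def forward(self, data_batch, mode='infer', **kwargs):
        steps = {
            'train': self.train_step,
            'valid': self.val_step,
            'test': self.test_step,
            'infer': self.infer_step,
        }
        if mode not in steps:
            raise NotImplementedError(mode)
        return steps[mode](data_batch, **kwargs)

    @abstractmethod
    def train_step(self, data_batch, **kwargs): ...

    @abstractmethod
    def val_step(self, data_batch, **kwargs): ...

    @abstractmethod
    def test_step(self, data_batch, **kwargs): ...

    @abstractmethod
    def infer_step(self, data_batch, **kwargs): ...


class ToySegment(SegmentBaseSketch):
    def test_step(self, data_batch, **kwargs):
        return f"ran test on {len(data_batch)} items"


print(ToySegment().forward([1, 2, 3], mode='test'))   # ran test on 3 items
```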
+# See the License for the specific language governing permissions and +# limitations under the License. +from EIVideo.paddlevideo.loader.builder import build_pipeline + +from EIVideo.paddlevideo.loader.pipelines import ToTensor_manet + +import os +import timeit +import paddle +from PIL import Image +from davisinteractive.utils.scribbles import scribbles2mask, annotated_frames +from paddle import nn + +from EIVideo.paddlevideo.utils import load +from EIVideo.paddlevideo.utils.manet_utils import float_, _palette, damage_masks, long_, write_dict, rough_ROI +from EIVideo.api import load_video, get_scribbles, submit_masks + +from ...builder import build_model +from ...registry import SEGMENT +from .base import BaseSegment + + +# if cfg.MODEL.framework == "Manet": +# cfg_helper = {"knns": 1, +# "is_save_image": True} +# cfg.update(cfg_helper) +# build_model(cfg['MODEL']).test_step(**cfg, +# weights=weights, +# parallel=False) +# return + + +@SEGMENT.register() +class Manet(BaseSegment): + def __init__(self, backbone=None, head=None, **cfg): + super().__init__(backbone, head, **cfg) + + def train_step(self, data_batch, step, **cfg): + pass + + def val_step(self, data_batch, **kwargs): + pass + + def infer_step(self, data_batch, **kwargs): + """Define how the model is going to test, from input to output.""" + pass + + def test_step(self, weights, parallel=True, is_save_image=True, **cfg): + # 1. Construct model. + cfg['MODEL'].head.pretrained = '' + cfg['MODEL'].head.test_mode = True + model = build_model(cfg['MODEL']) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct data. + sequence = cfg["video_path"].split('/')[-1].split('.')[0] + obj_nums = 1 + images, _ = load_video(cfg["video_path"], 480) + print("stage1 load_video success") + # [195, 389, 238, 47, 244, 374, 175, 399] + # .shape: (502, 480, 600, 3) + report_save_dir = cfg.get("output_dir", + f"./output/{cfg['model_name']}") + if not os.path.exists(report_save_dir): + os.makedirs(report_save_dir) + # Configuration used in the challenges + max_nb_interactions = 8 # Maximum number of interactions + # Interactive parameters + model.eval() + + state_dicts_ = load(weights)['state_dict'] + state_dicts = {} + for k, v in state_dicts_.items(): + if 'num_batches_tracked' not in k: + state_dicts['head.' + k] = v + if ('head.' 
+ k) not in model.state_dict().keys(): + print(f'pretrained -----{k} -------is not in model') + write_dict(state_dicts, 'model_for_infer.txt', **cfg) + model.set_state_dict(state_dicts) + inter_file = open( + os.path.join( + cfg.get("output_dir", f"./output/{cfg['model_name']}"), + 'inter_file.txt'), 'w') + seen_seq = False + + with paddle.no_grad(): + + # Get the current iteration scribbles + for scribbles, first_scribble in get_scribbles(): + t_total = timeit.default_timer() + f, h, w = images.shape[:3] + if 'prev_label_storage' not in locals().keys(): + prev_label_storage = paddle.zeros([f, h, w]) + if len(annotated_frames(scribbles)) == 0: + final_masks = prev_label_storage + # ToDo To AP-kai: save_path传过来了 + submit_masks(cfg["save_path"], final_masks.numpy(), images) + continue + + # if no scribbles return, keep masks in previous round + start_annotated_frame = annotated_frames(scribbles)[0] + pred_masks = [] + pred_masks_reverse = [] + + if first_scribble: # If in the first round, initialize memories + n_interaction = 1 + eval_global_map_tmp_dic = {} + local_map_dics = ({}, {}) + total_frame_num = f + + else: + n_interaction += 1 + inter_file.write(sequence + ' ' + 'interaction' + + str(n_interaction) + ' ' + 'frame' + + str(start_annotated_frame) + '\n') + + if first_scribble: # if in the first round, extract pixel embbedings. + if not seen_seq: + seen_seq = True + inter_turn = 1 + embedding_memory = [] + places = paddle.set_device('cpu') + + for imgs in images: + if cfg['PIPELINE'].get('test'): + imgs = paddle.to_tensor([ + build_pipeline(cfg['PIPELINE'].test)({ + 'img1': + imgs + })['img1'] + ]) + else: + imgs = paddle.to_tensor([imgs]) + if parallel: + for c in model.children(): + frame_embedding = c.head.extract_feature( + imgs) + else: + frame_embedding = model.head.extract_feature( + imgs) + embedding_memory.append(frame_embedding) + + del frame_embedding + + embedding_memory = paddle.concat(embedding_memory, 0) + _, _, emb_h, emb_w = embedding_memory.shape + ref_frame_embedding = embedding_memory[ + start_annotated_frame] + ref_frame_embedding = ref_frame_embedding.unsqueeze(0) + else: + inter_turn += 1 + ref_frame_embedding = embedding_memory[ + start_annotated_frame] + ref_frame_embedding = ref_frame_embedding.unsqueeze(0) + + else: + ref_frame_embedding = embedding_memory[ + start_annotated_frame] + ref_frame_embedding = ref_frame_embedding.unsqueeze(0) + ######## + scribble_masks = scribbles2mask(scribbles, (emb_h, emb_w)) + scribble_label = scribble_masks[start_annotated_frame] + scribble_sample = {'scribble_label': scribble_label} + scribble_sample = ToTensor_manet()(scribble_sample) + # print(ref_frame_embedding, ref_frame_embedding.shape) + scribble_label = scribble_sample['scribble_label'] + + scribble_label = scribble_label.unsqueeze(0) + model_name = cfg['model_name'] + output_dir = cfg.get("output_dir", f"./output/{model_name}") + inter_file_path = os.path.join( + output_dir, sequence, 'interactive' + str(n_interaction), + 'turn' + str(inter_turn)) + if is_save_image: + ref_scribble_to_show = scribble_label.squeeze().numpy() + im_ = Image.fromarray( + ref_scribble_to_show.astype('uint8')).convert('P', ) + im_.putpalette(_palette) + ref_img_name = str(start_annotated_frame) + + if not os.path.exists(inter_file_path): + os.makedirs(inter_file_path) + im_.save( + os.path.join(inter_file_path, + 'inter_' + ref_img_name + '.png')) + if first_scribble: + prev_label = None + prev_label_storage = paddle.zeros([f, h, w]) + else: + prev_label = 
prev_label_storage[start_annotated_frame] + prev_label = prev_label.unsqueeze(0).unsqueeze(0) + # check if no scribbles. + if not first_scribble and paddle.unique( + scribble_label).shape[0] == 1: + print( + 'not first_scribble and paddle.unique(scribble_label).shape[0] == 1' + ) + print(paddle.unique(scribble_label)) + final_masks = prev_label_storage + submit_masks(cfg["save_path"], final_masks.numpy(), images) + continue + + ###inteaction segmentation head + if parallel: + for c in model.children(): + tmp_dic, local_map_dics = c.head.int_seghead( + ref_frame_embedding=ref_frame_embedding, + ref_scribble_label=scribble_label, + prev_round_label=prev_label, + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + frame_num=[start_annotated_frame], + first_inter=first_scribble) + else: + tmp_dic, local_map_dics = model.head.int_seghead( + ref_frame_embedding=ref_frame_embedding, + ref_scribble_label=scribble_label, + prev_round_label=prev_label, + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + frame_num=[start_annotated_frame], + first_inter=first_scribble) + pred_label = tmp_dic[sequence] + pred_label = nn.functional.interpolate(pred_label, + size=(h, w), + mode='bilinear', + align_corners=True) + pred_label = paddle.argmax(pred_label, axis=1) + pred_masks.append(float_(pred_label)) + # np.unique(pred_label) + # array([0], dtype=int64) + prev_label_storage[start_annotated_frame] = float_( + pred_label[0]) + + if is_save_image: # save image + pred_label_to_save = pred_label.squeeze(0).numpy() + im = Image.fromarray( + pred_label_to_save.astype('uint8')).convert('P', ) + im.putpalette(_palette) + imgname = str(start_annotated_frame) + while len(imgname) < 5: + imgname = '0' + imgname + if not os.path.exists(inter_file_path): + os.makedirs(inter_file_path) + im.save(os.path.join(inter_file_path, imgname + '.png')) + ####################################### + if first_scribble: + scribble_label = rough_ROI(scribble_label) + + ############################## + ref_prev_label = pred_label.unsqueeze(0) + prev_label = pred_label.unsqueeze(0) + prev_embedding = ref_frame_embedding + for ii in range(start_annotated_frame + 1, total_frame_num): + current_embedding = embedding_memory[ii] + current_embedding = current_embedding.unsqueeze(0) + prev_label = prev_label + if parallel: + for c in model.children(): + tmp_dic, eval_global_map_tmp_dic, local_map_dics = c.head.prop_seghead( + ref_frame_embedding, + prev_embedding, + current_embedding, + scribble_label, + prev_label, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + k_nearest_neighbors=cfg['knns'], + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + start_annotated_frame=start_annotated_frame, + frame_num=[ii], + dynamic_seghead=c.head.dynamic_seghead) + else: + tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.head.prop_seghead( + ref_frame_embedding, + prev_embedding, + current_embedding, + scribble_label, + prev_label, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + k_nearest_neighbors=cfg['knns'], + global_map_tmp_dic=eval_global_map_tmp_dic, + 
local_map_dics=local_map_dics, + interaction_num=n_interaction, + start_annotated_frame=start_annotated_frame, + frame_num=[ii], + dynamic_seghead=model.head.dynamic_seghead) + pred_label = tmp_dic[sequence] + pred_label = nn.functional.interpolate(pred_label, + size=(h, w), + mode='bilinear', + align_corners=True) + pred_label = paddle.argmax(pred_label, axis=1) + pred_masks.append(float_(pred_label)) + prev_label = pred_label.unsqueeze(0) + prev_embedding = current_embedding + prev_label_storage[ii] = float_(pred_label[0]) + if is_save_image: + pred_label_to_save = pred_label.squeeze(0).numpy() + im = Image.fromarray( + pred_label_to_save.astype('uint8')).convert('P', ) + im.putpalette(_palette) + imgname = str(ii) + while len(imgname) < 5: + imgname = '0' + imgname + if not os.path.exists(inter_file_path): + os.makedirs(inter_file_path) + im.save(os.path.join(inter_file_path, + imgname + '.png')) + ####################################### + prev_label = ref_prev_label + prev_embedding = ref_frame_embedding + ####### + # Propagation <- + for ii in range(start_annotated_frame): + current_frame_num = start_annotated_frame - 1 - ii + current_embedding = embedding_memory[current_frame_num] + current_embedding = current_embedding.unsqueeze(0) + prev_label = prev_label + if parallel: + for c in model.children(): + tmp_dic, eval_global_map_tmp_dic, local_map_dics = c.head.prop_seghead( + ref_frame_embedding, + prev_embedding, + current_embedding, + scribble_label, + prev_label, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + k_nearest_neighbors=cfg['knns'], + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + start_annotated_frame=start_annotated_frame, + frame_num=[current_frame_num], + dynamic_seghead=c.head.dynamic_seghead) + else: + tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.head.prop_seghead( + ref_frame_embedding, + prev_embedding, + current_embedding, + scribble_label, + prev_label, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + k_nearest_neighbors=cfg['knns'], + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + start_annotated_frame=start_annotated_frame, + frame_num=[current_frame_num], + dynamic_seghead=model.head.dynamic_seghead) + pred_label = tmp_dic[sequence] + pred_label = nn.functional.interpolate(pred_label, + size=(h, w), + mode='bilinear', + align_corners=True) + + pred_label = paddle.argmax(pred_label, axis=1) + pred_masks_reverse.append(float_(pred_label)) + prev_label = pred_label.unsqueeze(0) + prev_embedding = current_embedding + #### + prev_label_storage[current_frame_num] = float_( + pred_label[0]) + ### + if is_save_image: + pred_label_to_save = pred_label.squeeze(0).numpy() + im = Image.fromarray( + pred_label_to_save.astype('uint8')).convert('P', ) + im.putpalette(_palette) + imgname = str(current_frame_num) + while len(imgname) < 5: + imgname = '0' + imgname + if not os.path.exists(inter_file_path): + os.makedirs(inter_file_path) + im.save(os.path.join(inter_file_path, + imgname + '.png')) + pred_masks_reverse.reverse() + pred_masks_reverse.extend(pred_masks) + final_masks = paddle.concat(pred_masks_reverse, 0) + submit_masks(cfg["save_path"], final_masks.numpy(), images) + + t_end = timeit.default_timer() + print('Total time for single interaction: ' + + str(t_end 
- t_total)) + inter_file.close() + return None \ No newline at end of file diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py new file mode 100644 index 000000000..32dbe0096 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/IntVOS.py @@ -0,0 +1,893 @@ +import numpy as np +import paddle +import paddle.nn as nn + +import paddle.nn.functional as F +from EIVideo.paddlevideo.utils.manet_utils import int_, float_, long_, load +from EIVideo.paddlevideo.utils.manet_utils import kaiming_normal_ + +#############################################################GLOBAL_DIST_MAP + +MODEL_UNFOLD = True +WRONG_LABEL_PADDING_DISTANCE = 1e20 + + +def _pairwise_distances(x, y, ys=None): + """Computes pairwise squared l2 distances between tensors x and y. + Args: + x: Tensor of shape [n, feature_dim]. + y: Tensor of shape [m, feature_dim]. + Returns: + Float32 distances tensor of shape [n, m]. + """ + + xs = paddle.sum(x * x, 1) + xs = xs.unsqueeze(1) + if ys is None: + ys = paddle.sum(y * y, 1) + ys = ys.unsqueeze(0) + else: + ys = ys + d = xs + ys - 2. * paddle.matmul(x, paddle.t(y)) + return d, ys + + +################## +def _flattened_pairwise_distances(reference_embeddings, query_embeddings, ys): + """Calculates flattened tensor of pairwise distances between ref and query. + Args: + reference_embeddings: Tensor of shape [..., embedding_dim], + the embedding vectors for the reference frame + query_embeddings: Tensor of shape [n_query_images, height, width, + embedding_dim], the embedding vectors for the query frames. + Returns: + A distance tensor of shape [reference_embeddings.size / embedding_dim, + query_embeddings.size / embedding_dim] + """ + embedding_dim = query_embeddings.shape[-1] + reference_embeddings = reference_embeddings.reshape([-1, embedding_dim]) + first_dim = -1 + query_embeddings = query_embeddings.reshape([first_dim, embedding_dim]) + dists, ys = _pairwise_distances(query_embeddings, reference_embeddings, ys) + return dists, ys + + +def _nn_features_per_object_for_chunk(reference_embeddings, query_embeddings, + wrong_label_mask, k_nearest_neighbors, + ys): + """Extracts features for each object using nearest neighbor attention. + Args: + reference_embeddings: Tensor of shape [n_chunk, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: Tensor of shape [m_chunk, embedding_dim], the embedding + vectors for the query frames. + wrong_label_mask: + k_nearest_neighbors: Integer, the number of nearest neighbors to use. + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [m_chunk, n_objects, feature_dim]. 
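+        Note:
+            ``dists`` starts as an [m_chunk, n_chunk] matrix of squared distances and is
+            broadcast against ``wrong_label_mask`` ([n_objects, n_chunk]), which pushes
+            distances to reference pixels of other objects up by
+            WRONG_LABEL_PADDING_DISTANCE. The per-object reduction (min for k = 1, mean
+            of the k smallest distances otherwise) therefore yields feature_dim == 1.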
+ """ + # reference_embeddings_key = reference_embeddings + # query_embeddings_key = query_embeddings + dists, ys = _flattened_pairwise_distances(reference_embeddings, + query_embeddings, ys) + + dists = (paddle.unsqueeze(dists, 1) + + paddle.unsqueeze(float_(wrong_label_mask), 0) * + WRONG_LABEL_PADDING_DISTANCE) + if k_nearest_neighbors == 1: + features = paddle.min(dists, 2, keepdim=True) + else: + dists, _ = paddle.topk(-dists, k=k_nearest_neighbors, axis=2) + dists = -dists + valid_mask = (dists < WRONG_LABEL_PADDING_DISTANCE) + masked_dists = dists * valid_mask.float() + pad_dist = paddle.max(masked_dists, axis=2, keepdim=True)[0].tile( + (1, 1, masked_dists.shape[-1])) + dists = paddle.where(valid_mask, dists, pad_dist) + # take mean of distances + features = paddle.mean(dists, axis=2, keepdim=True) + + return features, ys + + +### +def _selected_pixel(ref_labels_flat, ref_emb_flat): + index_list = paddle.arange(len(ref_labels_flat)) + index_list = index_list + index_ = paddle.masked_select(index_list, ref_labels_flat != -1) + + index_ = long_(index_) + ref_labels_flat = paddle.index_select(ref_labels_flat, index_, 0) + ref_emb_flat = paddle.index_select(ref_emb_flat, index_, 0) + + return ref_labels_flat, ref_emb_flat + + +### + + +def _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat, + query_embeddings_flat, + reference_labels_flat, + ref_obj_ids, + k_nearest_neighbors, + n_chunks, **cfg): + """Calculates the nearest neighbor features per object in chunks to save mem. + Uses chunking to bound the memory use. + Args: + reference_embeddings_flat: Tensor of shape [n, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings_flat: Tensor of shape [m, embedding_dim], the embedding + vectors for the query frames. + reference_labels_flat: Tensor of shape [n], the class labels of the + reference frame. + ref_obj_ids: int tensor of unique object ids in the reference labels. + k_nearest_neighbors: Integer, the number of nearest neighbors to use. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [m, n_objects, feature_dim]. 
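+        Note:
+            Only the query axis is chunked: each of the n_chunks pieces holds at most
+            ceil(m / n_chunks) query rows, so the largest distance matrix materialised
+            at once is [chunk_size, n]. The reference squared norms (``ys``) are
+            computed on the first chunk and reused for all later chunks.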
+ """ + + # reference_embeddings_flat = reference_embeddings_flat.cpu() + # query_embeddings_flat = query_embeddings_flat.cpu() + # reference_labels_flat = reference_labels_flat.cpu() + # ref_obj_ids = ref_obj_ids.cpu() + + chunk_size = int_( + np.ceil((float_(query_embeddings_flat.shape[0]) / n_chunks).numpy())) + if cfg.get('test_mode'): + reference_labels_flat, reference_embeddings_flat = _selected_pixel( + reference_labels_flat, reference_embeddings_flat) + wrong_label_mask = (reference_labels_flat != paddle.unsqueeze( + ref_obj_ids, 1)) + all_features = [] + for n in range(n_chunks): + if n == 0: + ys = None + if n_chunks == 1: + query_embeddings_flat_chunk = query_embeddings_flat + else: + chunk_start = n * chunk_size + chunk_end = (n + 1) * chunk_size + query_embeddings_flat_chunk = query_embeddings_flat[ + chunk_start:chunk_end] + features, ys = _nn_features_per_object_for_chunk( + reference_embeddings_flat, query_embeddings_flat_chunk, + wrong_label_mask, k_nearest_neighbors, ys) + all_features.append(features) + if n_chunks == 1: + nn_features = all_features[0] + else: + nn_features = paddle.concat(all_features, axis=0) + return nn_features + + +def nearest_neighbor_features_per_object(reference_embeddings, + query_embeddings, + reference_labels, + k_nearest_neighbors, + gt_ids=None, + n_chunks=100, + **cfg): + """Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + reference_embeddings: Tensor of shape [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: Tensor of shape [n_query_images, height, width, + embedding_dim], the embedding vectors for the query frames. + reference_labels: Tensor of shape [height, width, 1], the class labels of + the reference frame. + max_neighbors_per_object: Integer, the maximum number of candidates + for the nearest neighbor query per object after subsampling, + or 0 for no subsampling. + k_nearest_neighbors: Integer, the number of nearest neighbors to use. + gt_ids: Int tensor of shape [n_objs] of the sorted unique ground truth + ids in the first frame. If None, it will be derived from + reference_labels. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [n_query_images, height, width, n_objects, feature_dim]. + gt_ids: An int32 tensor of the unique sorted object ids present + in the reference labels. 
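+        Note:
+            In this implementation ``query_embeddings`` is a single frame of shape
+            [height, width, embedding_dim] (the body unpacks ``h, w, _``), and when
+            ``gt_ids`` is given as a scalar object count it is expanded to
+            ``paddle.arange(0, gt_ids + 1)`` before the per-object matching.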
+ """ + # reference_embeddings = reference_embeddings.detach().cpu() + # query_embeddings = query_embeddings.detach().cpu() + # reference_labels = reference_labels.detach().cpu() + + assert (reference_embeddings.shape[:2] == reference_labels.shape[:2]) + h, w, _ = query_embeddings.shape + reference_labels_flat = reference_labels.reshape([-1]) + if gt_ids is None: + ref_obj_ids = paddle.unique(reference_labels_flat)[-1] + ref_obj_ids = np.arange(0, ref_obj_ids + 1) + gt_ids = paddle.to_tensor(ref_obj_ids) + gt_ids = int_(gt_ids) + else: + gt_ids = int_(paddle.arange(0, gt_ids + 1)) + + embedding_dim = query_embeddings.shape[-1] + query_embeddings_flat = query_embeddings.reshape([-1, embedding_dim]) + reference_embeddings_flat = reference_embeddings.reshape( + [-1, embedding_dim]) + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, + reference_labels_flat, gt_ids, k_nearest_neighbors, n_chunks, **cfg) + nn_features_dim = nn_features.shape[-1] + nn_features = nn_features.reshape( + [1, h, w, gt_ids.shape[0], nn_features_dim]) + return nn_features.cuda(), gt_ids + + +########################################################################LOCAL_DIST_MAP + + +def local_pairwise_distances2(x, y, max_distance=9): + """Computes pairwise squared l2 distances using a local search window. + Naive implementation using map_fn. + Used as a slow fallback for when correlation_cost is not available. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. + y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + Returns: + Float32 distances tensor of shape + [height, width, (2 * max_distance + 1) ** 2]. + """ + ori_h, ori_w, _ = x.shape + x = paddle.transpose(x, [2, 0, 1]).unsqueeze(0) + x = F.avg_pool2d(x, (2, 2), (2, 2)) + y = paddle.transpose(y, [2, 0, 1]).unsqueeze(0) + y = F.avg_pool2d(y, (2, 2), (2, 2)) + + _, channels, height, width = x.shape + padding_val = 1e20 + padded_y = F.pad(y, + (max_distance, max_distance, max_distance, max_distance), + mode='constant', + value=padding_val) + offset_y = F.unfold(padded_y, kernel_sizes=[height, width]).reshape( + [1, channels, height, width, -1]) + x = x.reshape([1, channels, height, width, 1]) + minus = x - offset_y + dists = paddle.sum(paddle.multiply(minus, minus), + axis=1).reshape([1, height, width, + -1]).transpose([0, 3, 1, 2]) + dists = (paddle.nn.functional.sigmoid(dists) - 0.5) * 2 + dists = F.interpolate(dists, + size=[ori_h, ori_w], + mode='bilinear', + align_corners=True) + dists = dists.squeeze(0).transpose([1, 2, 0]) + return dists + + +def local_previous_frame_nearest_neighbor_features_per_object( + prev_frame_embedding, + query_embedding, + prev_frame_labels, + gt_ids, + max_distance=12): + """Computes nearest neighbor features while only allowing local matches. + Args: + prev_frame_embedding: Tensor of shape [height, width, embedding_dim], + the embedding vectors for the last frame. + query_embedding: Tensor of shape [height, width, embedding_dim], + the embedding vectors for the query frames. + prev_frame_labels: Tensor of shape [height, width, 1], the class labels of + the previous frame. + gt_ids: Int Tensor of shape [n_objs] of the sorted unique ground truth + ids in the first frame. + max_distance: Integer, the maximum distance allowed for local matching. 
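+        Note:
+            Distances come from ``local_pairwise_distances2``: both embeddings are
+            2x average-pooled, each pixel is compared only inside a
+            (2 * max_distance + 1) ** 2 search window, and the sigmoid-rescaled
+            result is bilinearly resized back to the input resolution.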
+ Returns: + nn_features: A float32 np.array of nearest neighbor features of shape + [1, height, width, n_objects, 1]. + """ + # print(query_embedding.shape, prev_frame_embedding.shape) + # print(query_embedding.place, prev_frame_embedding.place) + # query_embedding = query_embedding.cpu() + # prev_frame_embedding = prev_frame_embedding.cpu() + # prev_frame_labels = prev_frame_labels.cpu() + # print(prev_frame_labels.place, prev_frame_embedding.place, query_embedding.place) + + d = local_pairwise_distances2(query_embedding, + prev_frame_embedding, + max_distance=max_distance) + height, width = prev_frame_embedding.shape[:2] + + if MODEL_UNFOLD: + + labels = float_(prev_frame_labels).transpose([2, 0, 1]).unsqueeze(0) + padded_labels = F.pad(labels, ( + 2 * max_distance, + 2 * max_distance, + 2 * max_distance, + 2 * max_distance, + )) + offset_labels = F.unfold(padded_labels, + kernel_sizes=[height, width], + strides=[2, + 2]).reshape([height, width, -1, 1]) + offset_masks = paddle.equal( + offset_labels, + float_(gt_ids).unsqueeze(0).unsqueeze(0).unsqueeze(0)) + else: + + masks = paddle.equal(prev_frame_labels, + gt_ids.unsqueeze(0).unsqueeze(0)) + padded_masks = nn.functional.pad(masks, ( + 0, + 0, + max_distance, + max_distance, + max_distance, + max_distance, + )) + offset_masks = [] + for y_start in range(2 * max_distance + 1): + y_end = y_start + height + masks_slice = padded_masks[y_start:y_end] + for x_start in range(2 * max_distance + 1): + x_end = x_start + width + offset_mask = masks_slice[:, x_start:x_end] + offset_masks.append(offset_mask) + offset_masks = paddle.stack(offset_masks, axis=2) + + d_tiled = d.unsqueeze(-1).tile((1, 1, 1, gt_ids.shape[0])) + pad = paddle.ones_like(d_tiled) + d_masked = paddle.where(offset_masks, d_tiled, pad) + dists = paddle.min(d_masked, axis=2) + dists = dists.reshape([1, height, width, gt_ids.shape[0], 1]) + + return dists + + +############################################################## + + +################# +class _res_block(nn.Layer): + def __init__(self, in_dim, out_dim, **cfg): + super(_res_block, self).__init__() + self.conv1 = nn.Conv2D(in_dim, + out_dim, + kernel_size=3, + stride=1, + padding=1) + self.relu1 = nn.ReLU() + self.bn1 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom']) + self.conv2 = nn.Conv2D(out_dim, + out_dim, + kernel_size=3, + stride=1, + padding=1) + self.relu2 = nn.ReLU() + self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom']) + + def forward(self, x): + res = x + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + x += res + return x + + +#################### +class IntSegHead(nn.Layer): + def __init__(self, in_dim, emb_dim, **cfg): + super(IntSegHead, self).__init__() + self.conv1 = nn.Conv2D(in_dim, + emb_dim, + kernel_size=7, + stride=1, + padding=3) + self.bn1 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg['train_bn_mom']) + self.relu1 = nn.ReLU(True) + self.res1 = _res_block(emb_dim, emb_dim, **cfg) + self.res2 = _res_block(emb_dim, emb_dim, **cfg) + self.conv2 = nn.Conv2D(256, + emb_dim, + kernel_size=3, + stride=1, + padding=1) + self.bn2 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg['train_bn_mom']) + self.relu2 = nn.ReLU(True) + self.conv3 = nn.Conv2D(emb_dim, 1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.res1(x) + x = self.res2(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + x = self.conv3(x) + return x + + +class 
_split_separable_conv2d(nn.Layer): + def __init__(self, in_dim, out_dim, kernel_size=7, **cfg): + super(_split_separable_conv2d, self).__init__() + self.conv1 = nn.Conv2D(in_dim, + in_dim, + kernel_size=kernel_size, + stride=1, + padding=int((kernel_size - 1) / 2), + groups=in_dim) + self.relu1 = nn.ReLU(True) + self.bn1 = paddle.nn.BatchNorm2D(in_dim, momentum=cfg['train_bn_mom']) + self.conv2 = nn.Conv2D(in_dim, out_dim, kernel_size=1, stride=1) + self.relu2 = nn.ReLU(True) + self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg['train_bn_mom']) + kaiming_normal_(self.conv1.weight, mode='fan_out', nonlinearity='relu') + kaiming_normal_(self.conv2.weight, mode='fan_out', nonlinearity='relu') + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + return x + + +class DynamicSegHead(nn.Layer): + def __init__(self, in_dim, embed_dim, **cfg): + super(DynamicSegHead, self).__init__() + self.layer1 = _split_separable_conv2d(in_dim, embed_dim, **cfg) + self.layer2 = _split_separable_conv2d(embed_dim, embed_dim, **cfg) + self.layer3 = _split_separable_conv2d(embed_dim, embed_dim, **cfg) + self.layer4 = _split_separable_conv2d(embed_dim, embed_dim, **cfg) + self.conv = nn.Conv2D(embed_dim, 1, 1, 1) + kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu') + + def forward(self, x): + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.conv(x) + return x + + +from ..registry import HEADS +""" +覆盖原理 +class c1: + def __init__(self): + self.a = 1 + + +class c2(c1): + def __init__(self): + super(c2, self).__init__() + self.a = 2 + + +c = c2() +print(c.a) + +""" + + +@HEADS.register() +class IntVOS(nn.Layer): + def __init__(self, feature_extracter, **cfg): + super(IntVOS, self).__init__() + self.feature_extracter = feature_extracter ##embedding extractor + self.feature_extracter.cls_conv = nn.Sequential() + self.feature_extracter.upsample4 = nn.Sequential() + self.semantic_embedding = None + self.seperate_conv = nn.Conv2D(cfg['model_aspp_outdim'], + cfg['model_aspp_outdim'], + kernel_size=3, + stride=1, + padding=1, + groups=cfg['model_aspp_outdim']) + self.bn1 = paddle.nn.BatchNorm2D(cfg['model_aspp_outdim'], + momentum=cfg['train_bn_mom']) + self.relu1 = nn.ReLU(True) + self.embedding_conv = nn.Conv2D(cfg['model_aspp_outdim'], + cfg['model_semantic_embedding_dim'], 1, + 1) + self.relu2 = nn.ReLU(True) + self.bn2 = paddle.nn.BatchNorm2D(cfg['model_semantic_embedding_dim'], + momentum=cfg['train_bn_mom']) + self.semantic_embedding = nn.Sequential(*[ + self.seperate_conv, self.bn1, self.relu1, self.embedding_conv, + self.bn2, self.relu2 + ]) + + for m in self.semantic_embedding: + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + + self.dynamic_seghead = DynamicSegHead( + in_dim=cfg['model_semantic_embedding_dim'] + 3, + embed_dim=cfg['model_head_embedding_dim'], + **cfg) # propagation segm head + if cfg['model_useintseg']: + self.inter_seghead = IntSegHead( + in_dim=cfg['model_semantic_embedding_dim'] + 3, + emb_dim=cfg['model_head_embedding_dim'], + **cfg) + else: + self.inter_seghead = DynamicSegHead( + in_dim=cfg['model_semantic_embedding_dim'] + 2, + embed_dim=cfg['model_head_embedding_dim'], + **cfg) # interaction segm head + self.pretrained = cfg.get('pretrained', None) + self.cfg = cfg + + def init_weights(self): + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + 
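+            # Only runs when a non-empty ``pretrained`` path is configured; test_step
+            # clears ``head.pretrained`` and loads the full checkpoint itself instead.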
self.set_state_dict(load(self.pretrained, self.state_dict())) + print('loaded pretrained model') + + def loss(self, **kwargs): + return self.loss_func(**kwargs) + + def forward(self, + x=None, + ref_scribble_label=None, + previous_frame_mask=None, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=None, + gt_ids=None, + k_nearest_neighbors=1, + global_map_tmp_dic=None, + local_map_dics=None, + interaction_num=None, + start_annotated_frame=None, + frame_num=None): + + x = self.extract_feature(x) + # print('extract_feature:', x.mean().item()) + ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split( + x, num_or_sections=3, axis=0) + + if global_map_tmp_dic is None: + dic = self.prop_seghead( + ref_frame_embedding, + previous_frame_embedding, + current_frame_embedding, + ref_scribble_label, + previous_frame_mask, + normalize_nearest_neighbor_distances, + use_local_map, + seq_names, + gt_ids, + k_nearest_neighbors, + global_map_tmp_dic, + local_map_dics, + interaction_num, + start_annotated_frame, + frame_num, + self.dynamic_seghead, + ) + return dic + + else: + dic, global_map_tmp_dic = self.prop_seghead( + ref_frame_embedding, + previous_frame_embedding, + current_frame_embedding, + ref_scribble_label, + previous_frame_mask, + normalize_nearest_neighbor_distances, + use_local_map, + seq_names, + gt_ids, + k_nearest_neighbors, + global_map_tmp_dic, + local_map_dics, + interaction_num, + start_annotated_frame, + frame_num, + self.dynamic_seghead, + ) + return dic, global_map_tmp_dic + + def extract_feature(self, x): + x = self.feature_extracter(x) + x = self.semantic_embedding(x) + return x + + def prop_seghead( + self, + ref_frame_embedding=None, + previous_frame_embedding=None, + current_frame_embedding=None, + ref_scribble_label=None, + previous_frame_mask=None, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=None, + gt_ids=None, + k_nearest_neighbors=1, + global_map_tmp_dic=None, + local_map_dics=None, + interaction_num=None, + start_annotated_frame=None, + frame_num=None, + dynamic_seghead=None, + ): + """return: feature_embedding,global_match_map,local_match_map,previous_frame_mask""" + ############### + cfg = self.cfg + global_map_tmp_dic = global_map_tmp_dic + dic_tmp = {} + bs, c, h, w = current_frame_embedding.shape + if cfg.get('test_mode'): + scale_ref_scribble_label = float_(ref_scribble_label) + else: + scale_ref_scribble_label = paddle.nn.functional.interpolate( + float_(ref_scribble_label), size=(h, w), mode='nearest') + scale_ref_scribble_label = int_(scale_ref_scribble_label) + scale_previous_frame_label = paddle.nn.functional.interpolate( + float_(previous_frame_mask), size=(h, w), mode='nearest') + scale_previous_frame_label = int_(scale_previous_frame_label) + for n in range(bs): + seq_current_frame_embedding = current_frame_embedding[n] + seq_ref_frame_embedding = ref_frame_embedding[n] + seq_prev_frame_embedding = previous_frame_embedding[n] + seq_ref_frame_embedding = seq_ref_frame_embedding.transpose( + [1, 2, 0]) + seq_current_frame_embedding = seq_current_frame_embedding.transpose( + [1, 2, 0]) + seq_ref_scribble_label = scale_ref_scribble_label[n].transpose( + [1, 2, 0]) + #########Global Map + nn_features_n, ref_obj_ids = nearest_neighbor_features_per_object( + reference_embeddings=seq_ref_frame_embedding, + query_embeddings=seq_current_frame_embedding, + reference_labels=seq_ref_scribble_label, + k_nearest_neighbors=k_nearest_neighbors, + gt_ids=gt_ids[n], + n_chunks=10) + if 
normalize_nearest_neighbor_distances: + nn_features_n = (paddle.nn.functional.sigmoid(nn_features_n) - + 0.5) * 2 + + # print(nn_features_n) + + ### + if global_map_tmp_dic is not None: ###when testing, use global map memory + if seq_names[n] not in global_map_tmp_dic: + global_map_tmp_dic[seq_names[n]] = paddle.ones_like( + nn_features_n).tile([1000, 1, 1, 1, 1]) + nn_features_n = paddle.where( + nn_features_n <= global_map_tmp_dic[seq_names[n]][ + frame_num[n]].unsqueeze(0), nn_features_n, + global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze( + 0)) + + # print('detach 1') + # print(nn_features_n.shape) + # nn_features_n = nn_features_n.detach() + global_map_tmp_dic[seq_names[n]][ + frame_num[n]] = nn_features_n.detach()[0] + + #########################Local dist map + seq_prev_frame_embedding = seq_prev_frame_embedding.transpose( + [1, 2, 0]) + seq_previous_frame_label = scale_previous_frame_label[n].transpose( + [1, 2, 0]) + + if use_local_map: + prev_frame_nn_features_n = local_previous_frame_nearest_neighbor_features_per_object( + prev_frame_embedding=seq_prev_frame_embedding, + query_embedding=seq_current_frame_embedding, + prev_frame_labels=seq_previous_frame_label, + gt_ids=ref_obj_ids, + max_distance=cfg['model_max_local_distance']) + else: + prev_frame_nn_features_n, _ = nearest_neighbor_features_per_object( + reference_embeddings=seq_prev_frame_embedding, + query_embeddings=seq_current_frame_embedding, + reference_labels=seq_previous_frame_label, + k_nearest_neighbors=k_nearest_neighbors, + gt_ids=gt_ids[n], + n_chunks=20) + prev_frame_nn_features_n = ( + paddle.nn.functional.sigmoid(prev_frame_nn_features_n) - + 0.5) * 2 + + # print(prev_frame_nn_features_n.mean().item(), prev_frame_nn_features_n.shape, interaction_num) # o + ############# + if local_map_dics is not None: ##When testing, use local map memory + local_map_tmp_dic, local_map_dist_dic = local_map_dics + if seq_names[n] not in local_map_dist_dic: + print(seq_names[n], 'not in local_map_dist_dic') + local_map_dist_dic[seq_names[n]] = paddle.zeros(1000, 9) + if seq_names[n] not in local_map_tmp_dic: + print(seq_names[n], 'not in local_map_tmp_dic') + local_map_tmp_dic[seq_names[n]] = paddle.zeros_like( + prev_frame_nn_features_n).unsqueeze(0).tile( + [1000, 9, 1, 1, 1, 1]) + # print(local_map_dist_dic[seq_names[n]].shape) + # print('detach 2') + # prev_frame_nn_features_n = prev_frame_nn_features_n.detach() + local_map_dist_dic[seq_names[n]][ + frame_num[n], interaction_num - + 1] = 1.0 / (abs(frame_num[n] - start_annotated_frame) + ) # bugs fixed. + local_map_tmp_dic[seq_names[n]][ + frame_num[n], + interaction_num - 1] = prev_frame_nn_features_n.squeeze( + 0).detach() # bugs fixed. 
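+                # The score stored above is 1 / |frame_num - start_annotated_frame|, so a
+                # larger value means that interaction's annotated frame lies closer to the
+                # current frame. The branch below keeps the local map of whichever
+                # interaction (current or previous) was annotated closer.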
+ if interaction_num == 1: + prev_frame_nn_features_n = local_map_tmp_dic[seq_names[n]][ + frame_num[n]][interaction_num - 1] + prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze( + 0) + else: + if local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 1] > \ + local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 2]: + prev_frame_nn_features_n = local_map_tmp_dic[ + seq_names[n]][frame_num[n]][interaction_num - 1] + prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze( + 0) + else: + prev_frame_nn_features_n = local_map_tmp_dic[ + seq_names[n]][frame_num[n]][interaction_num - 2] + prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze( + 0) + + local_map_dics = (local_map_tmp_dic, local_map_dist_dic) + + to_cat_previous_frame = ( + float_(seq_previous_frame_label) == float_(ref_obj_ids) + ) # float comparision? + + to_cat_current_frame_embedding = current_frame_embedding[ + n].unsqueeze(0).tile((ref_obj_ids.shape[0], 1, 1, 1)) + + to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose( + [2, 3, 0, 1]) + to_cat_previous_frame = float_( + to_cat_previous_frame.unsqueeze(-1).transpose([2, 3, 0, 1])) + to_cat_prev_frame_nn_feature_n = prev_frame_nn_features_n.squeeze( + 0).transpose([2, 3, 0, 1]) + to_cat = paddle.concat( + (to_cat_current_frame_embedding, to_cat_nn_feature_n, + to_cat_prev_frame_nn_feature_n, to_cat_previous_frame), 1) + pred_ = dynamic_seghead(to_cat) + pred_ = pred_.transpose([1, 0, 2, 3]) + dic_tmp[seq_names[n]] = pred_ + + if global_map_tmp_dic is None: + return dic_tmp + else: + if local_map_dics is None: + return dic_tmp, global_map_tmp_dic + else: + return dic_tmp, global_map_tmp_dic, local_map_dics + + def int_seghead(self, + ref_frame_embedding=None, + ref_scribble_label=None, + prev_round_label=None, + normalize_nearest_neighbor_distances=True, + global_map_tmp_dic=None, + local_map_dics=None, + interaction_num=None, + seq_names=None, + gt_ids=None, + k_nearest_neighbors=1, + frame_num=None, + first_inter=True): + dic_tmp = {} + bs, c, h, w = ref_frame_embedding.shape + scale_ref_scribble_label = paddle.nn.functional.interpolate( + float_(ref_scribble_label), size=(h, w), mode='nearest') + scale_ref_scribble_label = int_(scale_ref_scribble_label) + if not first_inter: + scale_prev_round_label = paddle.nn.functional.interpolate( + float_(prev_round_label), size=(h, w), mode='nearest') + scale_prev_round_label = int_(scale_prev_round_label) + n_chunks = 500 + for n in range(bs): + + gt_id = paddle.arange(0, gt_ids[n] + 1) + + gt_id = int_(gt_id) + + seq_ref_frame_embedding = ref_frame_embedding[n] + + ########################Local dist map + seq_ref_frame_embedding = paddle.transpose(seq_ref_frame_embedding, + [1, 2, 0]) + seq_ref_scribble_label = paddle.transpose( + scale_ref_scribble_label[n], [1, 2, 0]) + nn_features_n = local_previous_frame_nearest_neighbor_features_per_object( + prev_frame_embedding=seq_ref_frame_embedding, + query_embedding=seq_ref_frame_embedding, + prev_frame_labels=seq_ref_scribble_label, + gt_ids=gt_id, + max_distance=self.cfg['model_max_local_distance']) + + ####### + ######################Global map update + if seq_names[n] not in global_map_tmp_dic: + global_map_tmp_dic[seq_names[n]] = paddle.ones_like( + nn_features_n).tile([1000, 1, 1, 1, 1]) + nn_features_n_ = paddle.where( + nn_features_n <= + global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0), + nn_features_n, + global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0)) + + ### + + ### + # print('detach 3') + # 
nn_features_n_ = nn_features_n_.detach() + global_map_tmp_dic[seq_names[n]][ + frame_num[n]] = nn_features_n_.detach()[0] + ##################Local map update + if local_map_dics is not None: + local_map_tmp_dic, local_map_dist_dic = local_map_dics + if seq_names[n] not in local_map_dist_dic: + local_map_dist_dic[seq_names[n]] = paddle.zeros([1000, 9]) + if seq_names[n] not in local_map_tmp_dic: + local_map_tmp_dic[seq_names[n]] = paddle.ones_like( + nn_features_n).unsqueeze(0).tile([1000, 9, 1, 1, 1, 1]) + local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num + - 1] = 0 + + local_map_dics = (local_map_tmp_dic, local_map_dist_dic) + + ################## + to_cat_current_frame_embedding = ref_frame_embedding[n].unsqueeze( + 0).tile((gt_id.shape[0], 1, 1, 1)) + to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose( + [2, 3, 0, 1]) + + to_cat_scribble_mask_to_cat = ( + float_(seq_ref_scribble_label) == float_(gt_id) + ) # float comparision? + to_cat_scribble_mask_to_cat = float_( + to_cat_scribble_mask_to_cat.unsqueeze(-1).transpose( + [2, 3, 0, 1])) + if not first_inter: + seq_prev_round_label = scale_prev_round_label[n].transpose( + [1, 2, 0]) + + to_cat_prev_round_to_cat = ( + float_(seq_prev_round_label) == float_(gt_id) + ) # float comparision? + to_cat_prev_round_to_cat = float_( + to_cat_prev_round_to_cat.unsqueeze(-1).transpose( + [2, 3, 0, 1])) + else: + to_cat_prev_round_to_cat = paddle.zeros_like( + to_cat_scribble_mask_to_cat) + to_cat_prev_round_to_cat[0] = 1. + + to_cat = paddle.concat( + (to_cat_current_frame_embedding, to_cat_scribble_mask_to_cat, + to_cat_prev_round_to_cat), 1) + + pred_ = self.inter_seghead(to_cat) + pred_ = pred_.transpose([1, 0, 2, 3]) + dic_tmp[seq_names[n]] = pred_ + if local_map_dics is None: + return dic_tmp + else: + return dic_tmp, local_map_dics diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py new file mode 100644 index 000000000..5a98b6451 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/heads/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .IntVOS import IntVOS + +__all__ = ['IntVOS' +] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py new file mode 100644 index 000000000..48babf5a3 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/registry.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +BACKBONES = Registry('backbone') +HEADS = Registry('head') +RECOGNIZERS = Registry('recognizer') +LOCALIZERS = Registry('localizer') +PARTITIONERS = Registry('partitioner') +SEGMENT = Registry('segmentation') +LOSSES = Registry('loss') +ROI_EXTRACTORS = Registry('roi_extractor') +DETECTORS = Registry('detectors') +BBOX_ASSIGNERS = Registry('bbox_assigner') +BBOX_SAMPLERS = Registry('bbox_sampler') +BBOX_CODERS = Registry('bbox_coder') +ESTIMATORS = Registry('estimator') +MULTIMODAL = Registry('multimodal') +SEGMENT = Registry('segment') diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py new file mode 100644 index 000000000..479129eb6 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/modeling/weight_init.py @@ -0,0 +1,158 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn.initializer as init +import numpy as np +from scipy import special + + +def weight_init_(layer, + func, + weight_name=None, + bias_name=None, + bias_value=0.0, + **kwargs): + """ + In-place params init function. + Usage: + .. code-block:: python + + import paddle + import numpy as np + + data = np.ones([3, 4], dtype='float32') + linear = paddle.nn.Linear(4, 4) + input = paddle.to_tensor(data) + print(linear.weight) + linear(input) + + weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1) + print(linear.weight) + """ + + if hasattr(layer, 'weight') and layer.weight is not None: + getattr(init, func)(**kwargs)(layer.weight) + if weight_name is not None: + # override weight name + layer.weight.name = weight_name + + if hasattr(layer, 'bias') and layer.bias is not None: + init.Constant(bias_value)(layer.bias) + if bias_name is not None: + # override bias name + layer.bias.name = bias_name + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.") + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. 
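+        # Concretely: with l = norm_cdf((a - mean) / std) and u = norm_cdf((b - mean) / std),
+        # a uniform sample in [2l - 1, 2u - 1] mapped through erfinv(.) * std * sqrt(2) + mean
+        # follows N(mean, std) truncated to [a, b]; the final clip only guards against
+        # floating-point spill at the boundaries.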
+ # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1]. + tmp = np.random.uniform(2 * l - 1, 2 * u - 1, + size=list(tensor.shape)).astype(np.float32) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tmp = special.erfinv(tmp) + + # Transform to proper mean, std + tmp *= (std * math.sqrt(2.0)) + tmp += mean + + # Clamp to ensure it's in the proper range + tmp = np.clip(tmp, a, b) + tensor.set_value(paddle.to_tensor(tmp)) + + return tensor + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = tensor.dim() + if dimensions < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'): + def _calculate_correct_fan(tensor, mode): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError( + "Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + return fan_in if mode == 'fan_in' else fan_out + + def calculate_gain(nonlinearity, param=None): + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + else: + raise ValueError( + "Unsupported nonlinearity {}".format(nonlinearity)) + + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + with paddle.no_grad(): + paddle.nn.initializer.Normal(0, std)(tensor) + return tensor diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py new file mode 100644 index 000000000..45d1d0c09 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/__init__.py @@ -0,0 +1,19 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .test import test_model + +__all__ = [ + 'test_model', +] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py new file mode 100644 index 000000000..c92bb1293 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/tasks/test.py @@ -0,0 +1,39 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from EIVideo.paddlevideo.utils import get_logger, load +from ..loader.builder import build_dataloader, build_dataset +from ..metrics import build_metric +from ..modeling.builder import build_model +from ..modeling.framework import Manet + +logger = get_logger("paddlevideo") + + +@paddle.no_grad() +def test_model(cfg, weights, parallel=True): + """Test model entry + + Args: + cfg (dict): configuration. + weights (str): weights path to load. + parallel (bool): Whether to do multi-cards testing. Default: True. + + """ + if cfg.MODEL.framework == "Manet": + cfg_helper = {"knns": 1, "is_save_image": True} + cfg.update(cfg_helper) + final = Manet().test_step(**cfg, weights=weights, parallel=False) + return final diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py new file mode 100644 index 000000000..d18561d76 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import Registry +from .build_utils import build +from .config import * +from .logger import setup_logger, coloring, get_logger +from .record import AverageMeter, build_record, log_batch, log_epoch +from .dist_utils import get_dist_info, main_only +from .save_load import save, load, load_ckpt, mkdir +from .precise_bn import do_preciseBN +from .profiler import add_profiler_step +__all__ = ['Registry', 'build'] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py new file mode 100644 index 000000000..73c0ca46b --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/build_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def build(cfg, registry, key='name'): + """Build a module from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key. + registry (XXX): The registry to search the type from. + key (str): the key. + Returns: + obj: The constructed object. + """ + + assert isinstance(cfg, dict) and key in cfg + + cfg_copy = cfg.copy() + obj_type = cfg_copy.pop(key) + + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError('{} is not in the {} registry'.format( + obj_type, registry.name)) + return obj_cls(**cfg_copy) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py new file mode 100644 index 000000000..9db59bd5e --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/config.py @@ -0,0 +1,174 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import yaml +from EIVideo.paddlevideo.utils.logger import coloring, setup_logger + +__all__ = ['get_config'] + +logger = setup_logger("./", name="paddlevideo", level="INFO") + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + placeholder = "-" * 60 + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", coloring(k, + "HEADER"))) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", + coloring(str(k), "HEADER"))) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", + coloring(k, "HEADER"), + coloring(v, "OKGREEN"))) + + if k.isupper(): + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + print_dict(config) + + +def check_config(config): + """ + Check config + """ + pass + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + logger.warning('A new filed ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + assert ks[0] in dl, ( + '({}) doesn\'t exist in {}, a new dict field is invalid'.format( + ks[0], dl)) + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + epochs=20', + 'PIPELINE.train.transform.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert isinstance(opt, + str), ("option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + + return config + + +def get_config(fname, overrides=None, show=True): + """ + Read config from file + """ + assert os.path.exists(fname), ('config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + check_config(config) + return config diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py new file mode 100644 index 000000000..7659e88c1 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/dist_utils.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools + +import paddle +import paddle.distributed as dist + +def get_dist_info(): + world_size = dist.get_world_size() + rank = dist.get_rank() + return rank, world_size + +def main_only(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + return wrapper diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py new file mode 100644 index 000000000..e9791b89b --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/logger.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import datetime + +from paddle.distributed import ParallelEnv + + + +Color = { + 'RED': '\033[31m', + 'HEADER': '\033[35m', # deep purple + 'PURPLE': '\033[95m', # purple + 'OKBLUE': '\033[94m', + 'OKGREEN': '\033[92m', + 'WARNING': '\033[93m', + 'FAIL': '\033[91m', + 'ENDC': '\033[0m' +} + + +def coloring(message, color="OKGREEN"): + assert color in Color.keys() + if os.environ.get('COLORING', True): + return Color[color] + str(message) + Color["ENDC"] + else: + return message + + +logger_initialized = [] + + +def setup_logger(output=None, name="paddlevideo", level="INFO"): + """ + Initialize the paddlevideo logger and set its verbosity level to "INFO". + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. 
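+        level (str): logging verbosity, either "INFO" (default) or "DEBUG".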
+ name (str): the root module name of this logger + Returns: + logging.Logger: a logger + """ + def time_zone(sec, fmt): + real_time = datetime.datetime.now() + return real_time.timetuple() + logging.Formatter.converter = time_zone + + logger = logging.getLogger(name) + if level == "INFO": + logger.setLevel(logging.INFO) + elif level=="DEBUG": + logger.setLevel(logging.DEBUG) + logger.propagate = False + + if level == "DEBUG": + plain_formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + else: + plain_formatter = logging.Formatter( + "[%(asctime)s] %(message)s", + datefmt="%m/%d %H:%M:%S") + # stdout logging: master only + local_rank = ParallelEnv().local_rank + if local_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = plain_formatter + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, ".log.txt") + if local_rank > 0: + filename = filename + ".rank{}".format(local_rank) + + # PathManager.mkdirs(os.path.dirname(filename)) + os.makedirs(os.path.dirname(filename), exist_ok=True) + + # fh = logging.StreamHandler(_cached_log_stream(filename) + fh = logging.FileHandler(filename, mode='a') + fh.setLevel(logging.DEBUG) + fh.setFormatter(plain_formatter) + logger.addHandler(fh) + logger_initialized.append(name) + return logger + + +def get_logger(name, output=None): + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + return setup_logger(name=name, output=name) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py new file mode 100644 index 000000000..99986101e --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/manet_utils.py @@ -0,0 +1,1295 @@ +from __future__ import absolute_import + +import json +import math +import os +import pickle +import warnings + +import numpy +import numpy as np +from numpy import inf +from paddle import Tensor, concat, reshape, nn +import paddle + +from typing import Union, Iterable + +# from reprod_log.compare import compute_diff +# from reprod_log.utils import check_print_diff, np2torch, np2paddle, torch2np, paddle2np + +_tensor_or_tensors = Union[paddle.Tensor, Iterable[paddle.Tensor]] +_palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128, + 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0, + 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191, + 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24, + 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, + 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, + 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, + 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, + 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, + 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62, + 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, + 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, + 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, + 81, 82, 82, 82, 83, 83, 83, 
84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, + 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, + 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, + 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, + 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110, + 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115, + 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120, + 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125, + 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130, + 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, + 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140, + 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145, + 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150, + 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155, + 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160, + 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, + 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, + 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175, + 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180, + 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185, + 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190, + 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195, + 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, + 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205, + 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210, + 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215, + 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220, + 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225, + 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, + 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, + 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240, + 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245, + 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250, + 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255, + 255, 255 +] + +# paddle.set_device('gpu') if paddle.is_compiled_with_cuda() else paddle.set_device('cpu') + +import paddle +import PIL +import numbers +import numpy as np +from PIL import Image +from paddle.vision.transforms import BaseTransform +from paddle.vision.transforms import functional as F + +import numpy as np +from scipy.ndimage import interpolation, binary_dilation +try: + from skimage import morphology, transform +except ImportError as e: + print( + f"{e}, [scikit-image] package and it's dependencies is required for EIVideo." 
+ ) +import paddle +import cv2 +import random + + +#### +def mask_damager(labels=None, p_black=0.2): + scales = (0.8, 1.0, 1.2) + kernel_size = random.randint(10, 15) + kernel = np.ones((kernel_size, kernel_size), np.uint8) + if random.random() < p_black: + final_label = paddle.zeros_like(labels) + final_label = final_label.squeeze().numpy() + else: + prot = random.randint(5, 15) + nrot = random.randint(-15, -5) + rots = [prot, nrot, 0] + rot = rots[random.randint(0, 2)] + + sc = scales[random.randint(0, 2)] + _, _, h, w = labels.shape + tmp = labels.squeeze() + + tmp = tmp.unsqueeze(-1) + tmp = tmp.numpy().astype(np.uint8) + morph_p = random.random() + if morph_p < 0.5: + tmp = cv2.morphologyEx(tmp, cv2.MORPH_OPEN, kernel) + else: + tmp = cv2.morphologyEx(tmp, cv2.MORPH_CLOSE, kernel) + + tmp = tmp.astype(np.uint8) + center = (w / 2, h / 2) + M = cv2.getRotationMatrix2D(center, rot, sc) + final_label = cv2.warpAffine(tmp, M, (w, h), cv2.INTER_NEAREST) + + return final_label + + +color_map = [ + [0, 0, 0], + [255, 127, 0], + [30, 144, 255], + [186, 85, 211], + [255, 105, 180], + [192, 255, 62], + [255, 105, 180], + [50, 255, 255], +] + +color_map_np = np.array(color_map) + + +def overlay_davis(image, mask, alpha=0.5): + """ Overlay segmentation on top of RGB image. from davis official""" + im_overlay = image.copy() + mask = mask.astype('uint8') + colored_mask = color_map_np[mask] + foreground = image * alpha + (1 - alpha) * colored_mask + binary_mask = (mask > 0) + # Compose image + im_overlay[binary_mask] = foreground[binary_mask] + countours = binary_dilation(binary_mask) ^ binary_mask + im_overlay[countours, :] = 0 + return im_overlay.astype(image.dtype) + + +# TODO +def submit_masks(masks, images, inter_file_path): + overlays = [] + save_result_path = os.path.join(inter_file_path, 'result') + os.makedirs(save_result_path, exist_ok=True) + for imgname, (mask, image) in enumerate(zip(masks, images)): + overlay = overlay_davis(image, mask) + overlays.append(overlay.tolist()) + overlay = Image.fromarray(overlay) + imgname = str(imgname) + while len(imgname) < 5: + imgname = '0' + imgname + overlay.save(os.path.join(save_result_path, imgname + '.png')) + result = {'overlays': overlays} + # result = {'masks': masks.tolist()} + with open(os.path.join(save_result_path, 'masks.json'), 'w') as f: + json.dump(result, f) + + +def load_video(path, min_side=None): + frame_list = [] + cap = cv2.VideoCapture(path) + while (cap.isOpened()): + _, frame = cap.read() + if frame is None: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + if min_side: + h, w = frame.shape[:2] + new_w = (w * min_side // min(w, h)) + new_h = (h * min_side // min(w, h)) + frame = cv2.resize(frame, (new_w, new_h), + interpolation=cv2.INTER_CUBIC) + # .transpose([2, 0, 1]) + frame_list.append(frame) + frames = np.stack(frame_list, axis=0) + return frames + + +def get_scribbles(): + for i in range(8): + with open(f'/home/lc/paddlevideo/data/bike-packing/lable/{i + 1}.json' + ) as f: + scribbles = json.load(f) + first_scribble = not i + yield scribbles, first_scribble + + +def get_images(sequence='bike-packing'): + img_path = os.path.join('/home/lc/paddlevideo/data', sequence.strip(), + 'frame') + img_files = os.listdir(img_path) + img_files.sort() + files = [] + for img in img_files: + img_file = np.array(Image.open(os.path.join(img_path, img))) + files.append(img_file) + return np.array(files) + + +def rough_ROI(ref_scribble_labels): + #### b*1*h*w + dist = 20 + b, _, h, w = ref_scribble_labels.shape + filter_ = 
paddle.zeros_like(ref_scribble_labels) + to_fill = paddle.zeros_like(ref_scribble_labels) + for i in range(b): + no_background = (ref_scribble_labels[i] != -1) + no_background = no_background.squeeze(0) + + no_b = no_background.nonzero() + (h_min, w_min) = paddle.min(no_b, 0) + (h_max, w_max) = paddle.max(no_b, 0) + filter_[i, 0, + max(h_min - dist, 0):min(h_max + dist, h - 1), + max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1 + + final_scribble_labels = paddle.where(byte_(filter_), ref_scribble_labels, + to_fill) + return final_scribble_labels + + +import os.path as osp + + +def load(file_name, model, **cfg): + if not osp.isfile(file_name): + raise IOError(f'{file_name} not exist') + try: + state_dicts_ = paddle.load(file_name)['state_dict'] + except: + state_dicts_ = paddle.load(file_name) + state_dicts = {} + for k in model.keys(): + if 'num_batches_tracked' not in k: + if ('head.' + k) not in state_dicts_.keys(): + if k not in state_dicts_.keys(): + print(f'model -----{k} -------is not in pretrained') + else: + state_dicts[k] = state_dicts_[k] + else: + state_dicts[k] = state_dicts_['head.' + k] + write_dict(state_dicts, 'state_dicts.txt', **cfg) + write_dict(model, 'model.txt', **cfg) + return state_dicts + + +##### +def write_dict(state_dict, file_name, **cfg): + lines = [] + tot = 0 + for k, v in state_dict.items(): + # 目前只发现了torch和paddle模型参数命名的这三种不一致 + # 不一致1 + if 'num_batches_tracked' in k: + tot += 1 + continue + try: + line = str(k) + '\t' + str(v.cpu().detach().numpy().shape) + '\n' + except: + line = str(k) + '\t' + str(v.shape) + '\n' + lines.append(line) + # with open(cfg.get("output_dir", f"./output/{file_name}"), 'w') as f: + # f.writelines(lines) + # print('%d num_batches_tracked skipped' % tot) + + +def damage_masks(labels, shift=True, scale=True, rotate=True): + """ + Args: + labels: numpy array (batch_size * 1 * h * w) + """ + bs, _, h, w = labels.shape + labels = labels.transpose([0, 2, 3, 1]) + labels = labels.numpy() + final_label = [] + for i in range(bs): + label = labels[i] + damaged_label = damage_masks_np(label, shift, scale, rotate) + final_label.append(damaged_label) + final_label = np.array(final_label) + final_label = paddle.to_tensor(final_label) + final_label = final_label.transpose([0, 3, 1, 2]) + return final_label + + +def damage_masks_np(labels, shift=True, scale=True, rotate=True): + """Performs the actual mask damaging in numpy. + Args: + labels: Int32 numpy array of shape (height, width, 1). + shift: Boolean, whether to damage the masks by shifting. + scale: Boolean, whether to damage the masks by scaling. + rotate: Boolean, whether to damage the masks by rotation. + dilate: Boolean, whether to damage the masks by dilation. + Returns: + The damaged version of labels. + """ + unique_labels = np.unique(labels) + unique_labels = np.setdiff1d(unique_labels, [0]) + # Shuffle to get random depth ordering when combining together. + np.random.shuffle(unique_labels) + damaged_labels = np.zeros_like(labels) + for l in unique_labels: + obj_mask = (labels == l) + damaged_obj_mask = _damage_single_object_mask(obj_mask, shift, scale, + rotate) + damaged_labels[damaged_obj_mask] = l + return damaged_labels + + +def _damage_single_object_mask(mask, shift, scale, rotate): + """Performs mask damaging in numpy for a single object. + Args: + mask: Boolean numpy array of shape(height, width, 1). + shift: Boolean, whether to damage the masks by shifting. + scale: Boolean, whether to damage the masks by scaling. 
+ rotate: Boolean, whether to damage the masks by rotation. + dilate: Boolean, whether to damage the masks by dilation. + Returns: + The damaged version of mask. + """ + if shift: + mask = _shift_mask(mask) + if scale: + mask = _scale_mask(mask) + if rotate: + mask = _rotate_mask(mask) + return mask + + +def _shift_mask(mask, max_shift_factor=0.05): + """Damages a mask for a single object by randomly shifting it in numpy. + Args: + mask: Boolean numpy array of shape(height, width, 1). + max_shift_factor: Float scalar, the maximum factor for random shifting. + Returns: + The shifted version of mask. + """ + nzy, nzx, _ = mask.nonzero() + h = nzy.max() - nzy.min() + w = nzx.max() - nzx.min() + size = np.sqrt(h * w) + offset = np.random.uniform(-size * max_shift_factor, + size * max_shift_factor, 2) + shifted_mask = interpolation.shift(np.squeeze(mask, axis=2), + offset, + order=0).astype('bool')[..., np.newaxis] + return shifted_mask + + +def _scale_mask(mask, scale_amount=0.025): + """Damages a mask for a single object by randomly scaling it in numpy. + Args: + mask: Boolean numpy array of shape(height, width, 1). + scale_amount: Float scalar, the maximum factor for random scaling. + Returns: + The scaled version of mask. + """ + nzy, nzx, _ = mask.nonzero() + cy = 0.5 * (nzy.max() - nzy.min()) + cx = 0.5 * (nzx.max() - nzx.min()) + scale_factor = np.random.uniform(1.0 - scale_amount, 1.0 + scale_amount) + shift = transform.SimilarityTransform(translation=[-cx, -cy]) + inv_shift = transform.SimilarityTransform(translation=[cx, cy]) + s = transform.SimilarityTransform(scale=[scale_factor, scale_factor]) + m = (shift + (s + inv_shift)).inverse + scaled_mask = transform.warp(mask, m) > 0.5 + return scaled_mask + + +def _rotate_mask(mask, max_rot_degrees=3.0): + """Damages a mask for a single object by randomly rotating it in numpy. + Args: + mask: Boolean numpy array of shape(height, width, 1). + max_rot_degrees: Float scalar, the maximum number of degrees to rotate. + Returns: + The scaled version of mask. 
+ """ + cy = 0.5 * mask.shape[0] + cx = 0.5 * mask.shape[1] + rot_degrees = np.random.uniform(-max_rot_degrees, max_rot_degrees) + shift = transform.SimilarityTransform(translation=[-cx, -cy]) + inv_shift = transform.SimilarityTransform(translation=[cx, cy]) + r = transform.SimilarityTransform(rotation=np.deg2rad(rot_degrees)) + m = (shift + (r + inv_shift)).inverse + scaled_mask = transform.warp(mask, m) > 0.5 + return scaled_mask + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +import numpy as np + + +def label2colormap(label): + m = label.astype(np.uint8) + r, c = m.shape + cmap = np.zeros((r, c, 3), dtype=np.uint8) + cmap[:, :, 0] = (m & 1) << 7 | (m & 8) << 3 | (m & 64) >> 1 + cmap[:, :, 1] = (m & 2) << 6 | (m & 16) << 2 | (m & 128) >> 2 + cmap[:, :, 2] = (m & 4) << 5 | (m & 32) << 1 + return cmap + + +def torch2paddle(data): + try: + import torch + if isinstance(data, dict): + np_data = {} + for k, v in data.items(): + np_data[k] = paddle.to_tensor(v.detach().numpy()) + return np_data + else: + return paddle.to_tensor(data.detach().numpy()) + except: + pass + + +def fill_(tensor: Tensor, value): + return tensor.set_value(paddle.full_like(tensor, value)) + + +def zero_(tensor: Tensor): + return tensor.set_value(paddle.zeros_like(tensor)) + + +def float_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='float32') + + +def long_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='int64') + + +def int_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='int32') + + +def byte_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='bool') + + +class ToPILImage(BaseTransform): + def __init__(self, mode=None, keys=None): + super(ToPILImage, self).__init__(keys) + + def _apply_image(self, pic): + """ + Args: + pic (Tensor|np.ndarray): Image to be converted to PIL Image. + Returns: + PIL: Converted image. + """ + if not (isinstance(pic, paddle.Tensor) or isinstance(pic, np.ndarray)): + raise TypeError('pic should be Tensor or ndarray. Got {}.'.format( + type(pic))) + + elif isinstance(pic, paddle.Tensor): + if pic.ndimension() not in {2, 3}: + raise ValueError( + 'pic should be 2/3 dimensional. Got {} dimensions.'.format( + pic.ndimension())) + + elif pic.ndimension() == 2: + # if 2D image, add channel dimension (CHW) + pic = pic.unsqueeze(0) + + elif isinstance(pic, np.ndarray): + if pic.ndim not in {2, 3}: + raise ValueError( + 'pic should be 2/3 dimensional. 
Got {} dimensions.'.format( + pic.ndim)) + + elif pic.ndim == 2: + # if 2D image, add channel dimension (HWC) + pic = np.expand_dims(pic, 2) + + npimg = pic + if isinstance(pic, paddle.Tensor) and "float" in str( + pic.numpy().dtype) and self.mode != 'F': + pic = pic.mul(255).byte() + if isinstance(pic, paddle.Tensor): + npimg = np.transpose(pic.numpy(), (1, 2, 0)) + + if not isinstance(npimg, np.ndarray): + raise TypeError( + 'Input pic must be a paddle.Tensor or NumPy ndarray, ' + + 'not {}'.format(type(npimg))) + + if npimg.shape[2] == 1: + expected_mode = None + npimg = npimg[:, :, 0] + if npimg.dtype == np.uint8: + expected_mode = 'L' + elif npimg.dtype == np.int16: + expected_mode = 'I;16' + elif npimg.dtype == np.int32: + expected_mode = 'I' + elif npimg.dtype == np.float32: + expected_mode = 'F' + if self.mode is not None and self.mode != expected_mode: + raise ValueError( + "Incorrect self.mode ({}) supplied for input type {}. Should be {}" + .format(self.mode, np.dtype, expected_mode)) + self.mode = expected_mode + + elif npimg.shape[2] == 2: + permitted_2_channel_modes = ['LA'] + if self.mode is not None and self.mode not in permitted_2_channel_modes: + raise ValueError( + "Only self.modes {} are supported for 2D inputs".format( + permitted_2_channel_modes)) + + if self.mode is None and npimg.dtype == np.uint8: + self.mode = 'LA' + + elif npimg.shape[2] == 4: + permitted_4_channel_modes = ['RGBA', 'CMYK', 'RGBX'] + if self.mode is not None and self.mode not in permitted_4_channel_modes: + raise ValueError( + "Only self.modes {} are supported for 4D inputs".format( + permitted_4_channel_modes)) + + if self.mode is None and npimg.dtype == np.uint8: + self.mode = 'RGBA' + else: + permitted_3_channel_modes = ['RGB', 'YCbCr', 'HSV'] + if self.mode is not None and self.mode not in permitted_3_channel_modes: + raise ValueError( + "Only self.modes {} are supported for 3D inputs".format( + permitted_3_channel_modes)) + if self.mode is None and npimg.dtype == np.uint8: + self.mode = 'RGB' + + if self.mode is None: + raise TypeError('Input type {} is not supported'.format( + npimg.dtype)) + + return Image.fromarray(npimg, mode=self.mode) + + +class Identity(nn.Layer): + r"""A placeholder identity operator that is argument-insensitive. + + Args: + args: any argument (unused) + kwargs: any keyword argument (unused) + """ + def __init__(self, *args, **kwargs): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +def convert(data: dict, to, dtype=None): + assert isinstance(data, dict) + input = {} + for k, v in data.items(): + + if 'paddle' == to: + if isinstance(v, np.ndarray): + if dtype is not None: + input[k] = paddle.to_tensor(v.astype(dtype)) + else: + input[k] = paddle.to_tensor(v) + else: + input[k] = v + elif 'torch' == to: + try: + import torch + if isinstance(v, np.ndarray): + if dtype is not None: + input[k] = torch.tensor(v.astype(dtype)) + else: + input[k] = torch.tensor(v) + else: + input[k] = v + except: + pass + else: + if isinstance(v, np.ndarray): + input[k] = v.astype(to) + else: + input[k] = v + return input + + +def clip_grad_norm_(parameters: _tensor_or_tensors, + max_norm: float, + norm_type: float = 2.0, + error_if_nonfinite: bool = False) -> paddle.Tensor: + r"""Clips gradient norm of an iterable of parameters. + + The norm is computed over all gradients together, as if they were + concatenated into a single vector. Gradients are modified in-place. 
+ + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + error_if_nonfinite (bool): if True, an error is thrown if the total + norm of the gradients from :attr:``parameters`` is ``nan``, + ``inf``, or ``-inf``. Default: False (will switch to True in the future) + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + import time + if isinstance(parameters, paddle.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + detached_grads = [p.grad.detach() for p in parameters] + + max_norm = float(max_norm) + norm_type = float(norm_type) + if len(parameters) == 0: + return paddle.to_tensor(0.) + # device = paddle.get_device() # parameters[0].grad.device + if norm_type == inf: + norms = [p.abs().max() for p in parameters] + total_norm = norms[0] if len(norms) == 1 else paddle.max( + paddle.stack(norms)) + else: + # tik = time.time() + total_norm = paddle.norm( + paddle.stack([paddle.norm(g, norm_type) for g in detached_grads]), + norm_type) + # total_norm = paddle.norm(paddle.stack([paddle.sqrt(paddle.sum(g*g)) for g in detached_grads]), norm_type) # fixed. + # print(time.time() - tik) + if error_if_nonfinite and paddle.logical_or(total_norm.isnan(), + total_norm.isinf()): + raise RuntimeError( + f'The total norm of order {norm_type} for gradients from ' + '`parameters` is non-finite, so it cannot be clipped. To disable ' + 'this error and scale the gradients by the non-finite norm anyway, ' + 'set `error_if_nonfinite=False`') + clip_coef = max_norm / (total_norm + 1e-6) + # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so + # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization + # when the gradients do not reside in CPU memory. 
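+    # The clamped coefficient is then written back into every p.grad via
+    # set_value(), the Paddle counterpart of torch's in-place
+    # p.grad.detach().mul_(clip_coef_clamped).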
+ clip_coef_clamped = paddle.clip(clip_coef, max=1.0) + for i, p in enumerate(parameters): + # p.set_value(paddle.multiply(p, clip_coef_clamped)) + p.grad.set_value(detached_grads[i] * clip_coef_clamped) # fixed + # p.grad.detach().mul_(clip_coef_clamped + return total_norm + + +# def max(a: paddle.Tensor, axis=0, keepdim=True): +# """ndarray=numpy.array([[1, 2, 3, 4], +# [4, 3, 2, 1], +# [5, 6, 7, 8], +# [8, 7, 6, 5]]) +# np.where(ndarray == np.max(ndarray)) +# (array([2, 3]), array([3, 0])) +# ndarray[np.where(ndarray == np.max(ndarray))] +# array([8, 8]) +# """ +# max_ = a.max(axis).unsqueeze(-1) +# index = paddle.argmax(a, axis=axis, keepdim=keepdim) +# max_ = max_.numpy() +# index = index.numpy() +# # index = paddle.argmax(a, axis=axis, keepdim=keepdim)[-1].flatten() +# return max_, index + + +def gather(tmp: paddle.Tensor, ind: paddle.Tensor): + shape = tmp.shape + tmp = paddle.to_tensor(tmp) + ind = paddle.to_tensor(ind) + if len(shape) == 2: + b = shape[0] + return concat([ + reshape(paddle.gather(tmp[i, :], ind[i, :]), [1, -1]) + for i in range(b) + ], + axis=0) + elif len(shape) == 3: + out = [] + for i in range(tmp.shape[0]): + _ = paddle.index_sample(tmp[i], ind[i]) + out.append(_) + return paddle.to_tensor(out) + elif len(shape) == 4: + b, c, d = shape[:3] + return concat([ + reshape( + concat([ + reshape( + concat([ + reshape( + paddle.gather(tmp[i, j, k, :], ind[i, j, k, :]), + [1, -1]) for k in range(d) + ], + axis=0), [1, d, -1]) for j in range(c) + ], + axis=0), [1, c, d, -1]) for i in range(b) + ], + axis=0) + else: + pass + + +# These no_grad_* functions are necessary as wrappers around the parts of these +# functions that use `with torch.no_grad()`. The JIT doesn't support context +# managers, so these need to be implemented as builtins. Using these wrappers +# lets us keep those builtins small and re-usable. +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value(paddle.uniform(tensor.shape, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean, std): + with paddle.no_grad(): + tensor.set_value(paddle.normal(shape=tensor.shape, mean=mean, std=std)) + return tensor + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + from scipy import special + + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. 
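+        # erfinv() then maps this uniform sample back through the inverse
+        # normal CDF: multiplying by std * sqrt(2) and adding `mean` turns it
+        # into a sample from N(mean, std^2) truncated to [a, b].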
+ tensor.set_value( + paddle.uniform(tensor.shape, min=2 * l - 1, max=2 * u - 1)) + # tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + # tensor.erfinv_() # paddle 无 + tensor.set_value(special.erfinv(tensor)) + + # Transform to proper mean, std + # tensor.mul_(std * math.sqrt(2.)) + tensor.set_value(tensor.multiply(paddle.to_tensor(std * math.sqrt(2.)))) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clip_(min=a, max=b) + return tensor + + +def _no_grad_fill_(tensor, val): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, fill_value=val)) + return tensor + + +def _no_grad_zero_(tensor): + with paddle.no_grad(): + tensor.set_value(paddle.zeros_like(tensor)) + return tensor + + +def calculate_gain(nonlinearity, param=None): + r"""Return the recommended gain value for the given nonlinearity function. + The values are as follows: + + ================= ==================================================== + nonlinearity gain + ================= ==================================================== + Linear / Identity :math:`1` + Conv{1,2,3}D :math:`1` + Sigmoid :math:`1` + Tanh :math:`\frac{5}{3}` + ReLU :math:`\sqrt{2}` + Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}` + SELU :math:`\frac{3}{4}` + ================= ==================================================== + + Args: + nonlinearity: the non-linear function (`nn.functional` name) + param: optional parameter for the non-linear function + + Examples: + >>> gain = nn.init.calculate_gain('leaky_relu', 0.2) # leaky_relu with negative_slope=0.2 + """ + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 # Value found empirically (https://github.com/pytorch/pytorch/pull/50664) + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def uniform_(tensor: Tensor, a: float = 0., b: float = 1.) -> Tensor: + r"""Fills the input Tensor with values drawn from the uniform + distribution :math:`\mathcal{U}(a, b)`. + + Args: + tensor: an n-dimensional `torch.Tensor` + a: the lower bound of the uniform distribution + b: the upper bound of the uniform distribution + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.uniform_(w) + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> Tensor: + r"""Fills the input Tensor with values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. 
+ + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.normal_(w) + """ + return _no_grad_normal_(tensor, mean, std) + + +def trunc_normal_(tensor: Tensor, + mean: float = 0., + std: float = 1., + a: float = -2., + b: float = 2.) -> Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def constant_(tensor: Tensor, val: float) -> Tensor: + r"""Fills the input Tensor with the value :math:`\text{val}`. + + Args: + tensor: an n-dimensional `torch.Tensor` + val: the value to fill the tensor with + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.constant_(w, 0.3) + """ + return _no_grad_fill_(tensor, val) + + +def ones_(tensor: Tensor) -> Tensor: + r"""Fills the input Tensor with the scalar value `1`. + + Args: + tensor: an n-dimensional `torch.Tensor` + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.ones_(w) + """ + return _no_grad_fill_(tensor, 1.) + + +def zeros_(tensor: Tensor) -> Tensor: + r"""Fills the input Tensor with the scalar value `0`. + + Args: + tensor: an n-dimensional `torch.Tensor` + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.zeros_(w) + """ + return _no_grad_zero_(tensor) + + +def eye_(tensor): + r"""Fills the 2-dimensional input `Tensor` with the identity + matrix. Preserves the identity of the inputs in `Linear` layers, where as + many inputs are preserved as possible. + + Args: + tensor: a 2-dimensional `torch.Tensor` + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.eye_(w) + """ + if tensor.ndimension() != 2: + raise ValueError("Only tensors with 2 dimensions are supported") + + with paddle.no_grad(): + tensor.set_value(paddle.eye(*tensor.shape)) + return tensor + + +def dirac_(tensor, groups=1): + r"""Fills the {3, 4, 5}-dimensional input `Tensor` with the Dirac + delta function. Preserves the identity of the inputs in `Convolutional` + layers, where as many input channels are preserved as possible. 
In case + of groups>1, each group of channels preserves identity + + Args: + tensor: a {3, 4, 5}-dimensional `torch.Tensor` + groups (optional): number of groups in the conv layer (default: 1) + Examples: + >>> w = torch.empty(3, 16, 5, 5) + >>> nn.init.dirac_(w) + >>> w = torch.empty(3, 24, 5, 5) + >>> nn.init.dirac_(w, 3) + """ + dimensions = tensor.ndimension() + if dimensions not in [3, 4, 5]: + raise ValueError( + "Only tensors with 3, 4, or 5 dimensions are supported") + + sizes = tensor.shape + + if sizes[0] % groups != 0: + raise ValueError('dim 0 must be divisible by groups') + + out_chans_per_grp = sizes[0] // groups + min_dim = min(out_chans_per_grp, sizes[1]) + + with paddle.no_grad(): + tensor.zero_() + + for g in range(groups): + for d in range(min_dim): + if dimensions == 3: # Temporal convolution + tensor[g * out_chans_per_grp + d, d, + tensor.shape[2] // 2] = 1 + elif dimensions == 4: # Spatial convolution + tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2, + tensor.shape[3] // 2] = 1 + else: # Volumetric convolution + tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2, + tensor.shape[3] // 2, tensor.shape[4] // 2] = 1 + return tensor + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = tensor.dim() + if dimensions < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + num_input_fmaps = tensor.shape[1] # .size(1) + num_output_fmaps = tensor.shape[0] # .size(0) + receptive_field_size = 1 + if tensor.dim() > 2: + for s in tensor.shape[2:]: + receptive_field_size *= s # fixed + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def LongTensor(x): + return paddle.to_tensor(x, dtype='int64') + + +def IntTensor(x): + return paddle.to_tensor(x, dtype='int32') + + +def xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor: + r"""Fills the input `Tensor` with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform + distribution. The resulting tensor will have values sampled from + :math:`\mathcal{U}(-a, a)` where + + .. math:: + a = \text{gain} \times \sqrt{\frac{6}{\text{fan\_in} + \text{fan\_out}}} + + Also known as Glorot initialization. + + Args: + tensor: an n-dimensional `torch.Tensor` + gain: an optional scaling factor + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu')) + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + + return _no_grad_uniform_(tensor, -a, a) + + +def xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor: + r"""Fills the input `Tensor` with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal + distribution. The resulting tensor will have values sampled from + :math:`\mathcal{N}(0, \text{std}^2)` where + + .. math:: + \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan\_in} + \text{fan\_out}}} + + Also known as Glorot initialization. 
+ + Args: + tensor: an n-dimensional `torch.Tensor` + gain: an optional scaling factor + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.xavier_normal_(w) + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + + return _no_grad_normal_(tensor, 0., std) + + +def _calculate_correct_fan(tensor, mode): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + return fan_in if mode == 'fan_in' else fan_out + + +def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'): + r"""Fills the input `Tensor` with values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification` - He, K. et al. (2015), using a + uniform distribution. The resulting tensor will have values sampled from + :math:`\mathcal{U}(-\text{bound}, \text{bound})` where + + .. math:: + \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + tensor: an n-dimensional `torch.Tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. + nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu') + """ + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + bound = math.sqrt( + 3.0) * std # Calculate uniform bounds from standard deviation + with paddle.no_grad(): + tensor.set_value(paddle.uniform(tensor.shape, min=-bound, max=bound)) + return tensor + + +def kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'): + r"""Fills the input `Tensor` with values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification` - He, K. et al. (2015), using a + normal distribution. The resulting tensor will have values sampled from + :math:`\mathcal{N}(0, \text{std}^2)` where + + .. math:: + \text{std} = \frac{\text{gain}}{\sqrt{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + tensor: an n-dimensional `torch.Tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. + nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). 
+ + Examples: + >>> w = torch.empty(3, 5) + >>> kaiming_normal_(w, mode='fan_out', nonlinearity='relu') + """ + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + with paddle.no_grad(): + tensor.set_value(paddle.normal(shape=tensor.shape, mean=0, std=std)) + return tensor + + +def orthogonal_(tensor, gain=1): + r"""Fills the input `Tensor` with a (semi) orthogonal matrix, as + described in `Exact solutions to the nonlinear dynamics of learning in deep + linear neural networks` - Saxe, A. et al. (2013). The input tensor must have + at least 2 dimensions, and for tensors with more than 2 dimensions the + trailing dimensions are flattened. + + Args: + tensor: an n-dimensional `torch.Tensor`, where :math:`n \geq 2` + gain: optional scaling factor + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.orthogonal_(w) + """ + if tensor.ndimension() < 2: + raise ValueError("Only tensors with 2 or more dimensions are supported") + + rows = tensor.shape[0] # .size(0) + cols = tensor.numel() // rows + flattened = tensor.new(rows, cols).normal_(0, 1) + + if rows < cols: + flattened.t_() + + # Compute the qr factorization + q, r = paddle.to_tensor(np.linalg.qr(flattened.numpy())) + # q, r = torch.qr(flattened) + # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf + d = paddle.diag(r, 0) + ph = d.sign() + q *= ph + + if rows < cols: + q.t_() + + with paddle.no_grad(): + tensor.view_as(q).copy_(q) + tensor.mul_(gain) + return tensor + + +def sparse_(tensor, sparsity, std=0.01): + r"""Fills the 2D input `Tensor` as a sparse matrix, where the + non-zero elements will be drawn from the normal distribution + :math:`\mathcal{N}(0, 0.01)`, as described in `Deep learning via + Hessian-free optimization` - Martens, J. (2010). + + Args: + tensor: an n-dimensional `torch.Tensor` + sparsity: The fraction of elements in each column to be set to zero + std: the standard deviation of the normal distribution used to generate + the non-zero values + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.sparse_(w, sparsity=0.1) + """ + if tensor.ndimension() != 2: + raise ValueError("Only tensors with 2 dimensions are supported") + + rows, cols = tensor.shape + num_zeros = int(math.ceil(sparsity * rows)) + + with paddle.no_grad(): + tensor.normal_(0, std) + for col_idx in range(cols): + row_indices = paddle.randperm(rows) + zero_indices = row_indices[:num_zeros] + tensor[zero_indices, col_idx] = 0 + return tensor + + +# for backward compatibility +def _make_deprecate(meth): + new_name = meth.__name__ + old_name = new_name[:-1] + + def deprecated_init(*args, **kwargs): + warnings.warn( + "nn.init.{} is now deprecated in favor of nn.init.{}.".format( + old_name, new_name), + stacklevel=2) + return meth(*args, **kwargs) + + deprecated_init.__doc__ = r""" + {old_name}(...) + + .. warning:: + This method is now deprecated in favor of :func:`torch.nn.init.{new_name}`. 
+ + See :func:`~torch.nn.init.{new_name}` for details.""".format( + old_name=old_name, new_name=new_name) + deprecated_init.__name__ = old_name + return deprecated_init + + +# uniform = _make_deprecate(uniform_) +# normal = _make_deprecate(normal_) +# constant = _make_deprecate(constant_) +# eye = _make_deprecate(eye_) +# dirac = _make_deprecate(dirac_) +# xavier_uniform = _make_deprecate(xavier_uniform_) +# xavier_normal = _make_deprecate(xavier_normal_) +# kaiming_uniform = _make_deprecate(kaiming_uniform_) +# kaiming_normal = _make_deprecate(kaiming_normal_) +# orthogonal = _make_deprecate(orthogonal_) +# sparse = _make_deprecate(sparse_) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py new file mode 100644 index 000000000..7bb8de043 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/precise_bn.py @@ -0,0 +1,84 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import itertools + +from EIVideo.paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") +""" +Implement precise bn, which is useful for improving accuracy. +""" + + +@paddle.no_grad() # speed up and save CUDA memory +def do_preciseBN(model, data_loader, parallel, num_iters=200): + """ + Recompute and update the batch norm stats to make them more precise. During + training both BN stats and the weight are changing after every iteration, so + the running average can not precisely reflect the actual stats of the + current model. + In this function, the BN stats are recomputed with fixed weights, to make + the running average more precise. Specifically, it computes the true average + of per-batch mean/variance instead of the running average. + This is useful to improve validation accuracy. + Args: + model: the model whose bn stats will be recomputed + data_loader: an iterator. Produce data as input to the model + num_iters: number of iterations to compute the stats. + Return: + the model with precise mean and variance in bn layers. + """ + bn_layers_list = [ + m for m in model.sublayers() + if any((isinstance(m, bn_type) + for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D, + paddle.nn.BatchNorm3D))) and m.training + ] + if len(bn_layers_list) == 0: + return + + # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum) + # we set momentum=0. to get the true mean and variance during forward + momentum_actual = [bn._momentum for bn in bn_layers_list] + for bn in bn_layers_list: + bn._momentum = 0. 
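+    # The loop below then averages these per-batch statistics over num_iters
+    # batches (an online mean); the original momentum values are restored at
+    # the end.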
+ + running_mean = [paddle.zeros_like(bn._mean) + for bn in bn_layers_list] #pre-ignore + running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list] + + ind = -1 + for ind, data in enumerate(itertools.islice(data_loader, num_iters)): + logger.info("doing precise BN {} / {}...".format(ind + 1, num_iters)) + if parallel: + model._layers.train_step(data) + else: + model.train_step(data) + + for i, bn in enumerate(bn_layers_list): + # Accumulates the bn stats. + running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1) + running_var[i] += (bn._variance - running_var[i]) / (ind + 1) + + assert ind == num_iters - 1, ( + "update_bn_stats is meant to run for {} iterations, but the batch_sampler stops at {} iterations." + .format(num_iters, ind)) + + # Sets the precise bn stats. + for i, bn in enumerate(bn_layers_list): + bn._mean.set_value(running_mean[i]) + bn._variance.set_value(running_var[i]) + bn._momentum = momentum_actual[i] diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py new file mode 100644 index 000000000..04201aa26 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + """ + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. 
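+          If True, sys.exit(0) is called once the profiling batch range
+          has finished (default: True).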
+ """ + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + """ + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + """ + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py new file mode 100644 index 000000000..c52d30c3e --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/record.py @@ -0,0 +1,157 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
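+
+# Metric recording helpers: AverageMeter plus build_record / log_batch /
+# log_epoch, used to accumulate and log per-batch and per-epoch training
+# metrics.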
+ +from collections import OrderedDict + +import paddle + +from .logger import coloring, get_logger + +logger = get_logger("paddlevideo") + +__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch'] + + +def build_record(cfg): + framework_type = cfg.get('framework', '') + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("lr", AverageMeter('lr', 'f', need_avg=False)), + ] + if 'Recognizer1D' in framework_type: #TODO: required specify str in framework + record_list.append(("hit_at_one", AverageMeter("hit_at_one", '.5f'))) + record_list.append(("perr", AverageMeter("perr", '.5f'))) + record_list.append(("gap", AverageMeter("gap", '.5f'))) + elif 'Recognizer' in framework_type: + record_list.append(("top1", AverageMeter("top1", '.5f'))) + record_list.append(("top5", AverageMeter("top5", '.5f'))) + elif 'FastRCNN' in framework_type: + record_list.append( + ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f'))) + record_list.append( + ("prec@thr=0.5", AverageMeter("prec@thr=0.5", '.5f'))) + record_list.append(("recall@top3", AverageMeter("recall@top3", '.5f'))) + record_list.append(("prec@top3", AverageMeter("prec@top3", '.5f'))) + record_list.append(("recall@top5", AverageMeter("recall@top5", '.5f'))) + record_list.append(("prec@top5", AverageMeter("prec@top5", '.5f'))) + record_list.append(("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f'))) + elif 'DepthEstimator' in cfg.framework: + record_list.append(("abs_rel", AverageMeter("abs_rel", '.5f'))) + record_list.append(("sq_rel", AverageMeter("sq_rel", '.5f'))) + record_list.append(("rmse", AverageMeter("rmse", '.5f'))) + record_list.append(("rmse_log", AverageMeter("rmse_log", '.5f'))) + record_list.append(("a1", AverageMeter("a1", '.5f'))) + record_list.append(("a2", AverageMeter("a2", '.5f'))) + record_list.append(("a3", AverageMeter("a3", '.5f'))) + record_list.append(("losses_day", AverageMeter("losses_day", '.5f'))) + record_list.append( + ("losses_night", AverageMeter("losses_night", '.5f'))) + + record_list.append(("batch_time", AverageMeter('batch_cost', '.5f'))) + record_list.append(("reader_time", AverageMeter('reader_cost', '.5f'))) + record_list = OrderedDict(record_list) + return record_list + + +class AverageMeter(object): + """ + Computes and stores the average and current value + """ + def __init__(self, name='', fmt='f', need_avg=True): + self.name = name + self.fmt = fmt + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + if isinstance(val, paddle.Tensor): + val = float(val) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self) + + @property + def total_minute(self): + return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60, + self=self) + + @property + def mean(self): + return '{self.name}_avg: {self.avg:{self.fmt}}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}'.format(self=self) + + +def log_batch(metric_list, + batch_id, + epoch_id, + total_epoch, + mode, + ips, + tot_step=None, + max_iters=None): + batch_cost = str(metric_list['batch_time'].value) + ' sec,' + reader_cost = str(metric_list['reader_time'].value) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + 
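+            # timing meters are skipped here; batch_cost / reader_cost are logged separately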
metric_values.append(metric_list[m].value) + metric_str = ' '.join([str(v) for v in metric_values]) + if max_iters: + epoch_str = "iter:[{:>3d}/{:<3d}]".format(tot_step, max_iters) + else: + epoch_str = "epoch:[{:>3d}/{:<3d}]".format(epoch_id, total_epoch) + step_str = "{:s} step:{:<4d}".format(mode, batch_id) + + logger.info("{:s} {:s} {:s} {:s} {:s} {}".format( + coloring(epoch_str, "HEADER") if batch_id == 0 else epoch_str, + coloring(step_str, "PURPLE"), coloring(metric_str, 'OKGREEN'), + coloring(batch_cost, "OKGREEN"), coloring(reader_cost, 'OKGREEN'), + ips)) + + +def log_epoch(metric_list, epoch, mode, ips): + batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,' + reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,' + batch_sum = str(metric_list['batch_time'].total) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + metric_values.append(metric_list[m].mean) + metric_str = ' '.join([str(v) for v in metric_values]) + + end_epoch_str = "END epoch:{:<3d}".format(epoch) + + logger.info("{:s} {:s} {:s} {:s} {:s} {:s} {}".format( + coloring(end_epoch_str, "RED"), coloring(mode, "PURPLE"), + coloring(metric_str, "OKGREEN"), coloring(batch_cost, "OKGREEN"), + coloring(reader_cost, "OKGREEN"), coloring(batch_sum, "OKGREEN"), ips)) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py new file mode 100644 index 000000000..81b76bd51 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/registry.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Registry(object): + """ + The registry that provides name -> object mapping, to support third-party users' custom modules. + + To register an object: + + .. code-block:: python + + BACKBONES = Registry('backbone') + @BACKBONES.register() + class ResNet: + pass + Or: + .. code-block:: python + + BACKBONES = Registry('backbone') + class ResNet: + pass + BACKBONES.register(ResNet) + + Usage: To build a module. + + .. code-block:: python + backbone_name = "ResNet" + b = BACKBONES.get(backbone_name)() + + """ + def __init__(self, name): + """ + Args: + name (str): the name of this registry + """ + self._name = name + self._obj_map = {} + + def __contains__(self, key): + return self._obj_map.get(key) is not None + + def _do_register(self, name, obj): + assert ( + name not in self._obj_map + ), "An object named '{}' was already registered in '{}' registry!".format( + name, self._name) + self._obj_map[name] = obj + + def register(self, obj=None, name=None): + """ + Register the given object under the the name `obj.__name__`. + Can be used as either a decorator or not. See docstring of this class for usage. 
+ """ + if obj is None: + # used as a decorator + def deco(func_or_class, name=name): + if name is None: + name = func_or_class.__name__ + self._do_register(name, func_or_class) + return func_or_class + + return deco + + # used as a function call + if name is None: + name = obj.__name__ + self._do_register(name, obj) + + def get(self, name): + """Get the registry record. + + Args: + name (str): The class name. + + Returns: + ret: The class. + """ + ret = self._obj_map.get(name) + if ret is None: + raise KeyError( + "No object named '{}' found in '{}' registry!".format( + name, self._name)) + + return ret diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py new file mode 100644 index 000000000..9ca9cb708 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/utils/save_load.py @@ -0,0 +1,182 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import os.path as osp +import time + +from tqdm import tqdm +import paddle +import paddle.nn.functional as F +from EIVideo.paddlevideo.utils import get_logger +from EIVideo.paddlevideo.utils import main_only + + +def pretrain_vit_param_trans(model, state_dicts, num_patches, seg_num, + attention_type): + """ + Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model + """ + if 'head' + '.weight' in state_dicts: + del state_dicts['head' + '.weight'] + if 'head' + '.bias' in state_dicts: + del state_dicts['head' + '.bias'] + + total_len = len(model.state_dict()) + if num_patches + 1 != state_dicts['pos_embed'].shape[1]: + pos_embed = state_dicts['pos_embed'] + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, + 1:, :].unsqueeze(0).unsqueeze(1).transpose( + (0, 1, 3, 2)) + new_pos_embed = F.interpolate(other_pos_embed, + size=(other_pos_embed.shape[-2], + num_patches), + mode='nearest') + new_pos_embed = new_pos_embed.squeeze(0).transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1) + state_dicts['pos_embed'] = new_pos_embed + time.sleep(0.01) + + if 'time_embed' in state_dicts and seg_num != state_dicts[ + 'time_embed'].shape[1]: + time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(time_embed.shape[-2], seg_num), + mode='nearest') + state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose( + (0, 2, 1)) + time.sleep(0.01) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + if attention_type == 'divided_space_time': + new_state_dicts = state_dicts.copy() + for key in tqdm(state_dicts): + if 'blocks' in key and 'attn' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('attn', 'temporal_attn') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + 
new_state_dicts[new_key] = state_dicts[new_key] + if 'blocks' in key and 'norm1' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('norm1', 'temporal_norm1') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + new_state_dicts[new_key] = state_dicts[new_key] + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return new_state_dicts + + +def pretrain_resnet18_param_trans(model, loaded_dict): + encoder_dict = model.encoder.state_dict() + pose_encoder_dict = model.pose_encoder.state_dict() + + names = ['encoder.', 'encoder_day.', 'encoder_night.'] + for name in names: + for key, value in loaded_dict.items(): + key = str(name + key) + if key in encoder_dict: + encoder_dict[key] = value + + num_input_images = 2 + loaded_dict['conv1.weight'] = paddle.concat( + [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images + + for name, value in loaded_dict.items(): + name = str('encoder.' + name) + if name in pose_encoder_dict: + pose_encoder_dict[name] = value + + return encoder_dict, pose_encoder_dict + + +#XXX(shipping): maybe need load N times because of different cards have different params. +@main_only +def load_ckpt(model, weight_path, **kargs): + """ + 1. Load pre-trained model parameters + 2. Extract and convert from the pre-trained model to the parameters + required by the existing model + 3. Load the converted parameters of the existing model + """ + #model.set_state_dict(state_dict) + + if not osp.isfile(weight_path): + raise IOError(f'{weight_path} is not a checkpoint file') + #state_dicts = load(weight_path) + + logger = get_logger("paddlevideo") + state_dicts = paddle.load(weight_path) + if 'ResnetEncoder' in str(model): + encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans( + model, state_dicts) + tmp = model.state_dict() + tmp.update( + {'backbone.encoder.' + k: v + for (k, v) in encoder_dict.items()}) + tmp.update({ + 'backbone.pose_encoder.' + k: v + for (k, v) in pose_encoder_dict.items() + }) + elif "VisionTransformer" in str(model): # For TimeSformer case + tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'], + kargs['seg_num'], + kargs['attention_type']) + else: + tmp = {} + total_len = len(model.state_dict()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for item in tqdm(model.state_dict(), total=total_len, position=0): + name = item + desc.set_description('Loading %s' % name) + if name not in state_dicts: # Convert from non-parallel model + if str('backbone.' + name) in state_dicts: + tmp[name] = state_dicts['backbone.' 
+ name] + else: # Convert from parallel model + tmp[name] = state_dicts[name] + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + model.set_state_dict(tmp) + + +def mkdir(dir): + if not os.path.exists(dir): + # avoid error when train with multiple gpus + try: + os.makedirs(dir) + except: + pass + + +@main_only +def save(obj, path): + paddle.save(obj, path) + + +def load(file_name): + if not osp.isfile(file_name): + raise IOError(f'{file_name} not exist') + return paddle.load(file_name) diff --git a/docs/src/applications/EIVideo/EIVideo/paddlevideo/version.py b/docs/src/applications/EIVideo/EIVideo/paddlevideo/version.py new file mode 100644 index 000000000..b5b7f481f --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/paddlevideo/version.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["paddlevideo_version"] +paddlevideo_version = "0.0.1" diff --git a/docs/src/applications/EIVideo/EIVideo/setup.py b/docs/src/applications/EIVideo/EIVideo/setup.py new file mode 100644 index 000000000..7174baeeb --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/setup.py @@ -0,0 +1,4 @@ +# Author: Acer Zhang +# Datetime: 2022/1/11 +# Copyright belongs to the author. +# Please indicate the source for reprinting. diff --git a/docs/src/applications/EIVideo/EIVideo/version.py b/docs/src/applications/EIVideo/EIVideo/version.py new file mode 100644 index 000000000..1d2c58584 --- /dev/null +++ b/docs/src/applications/EIVideo/EIVideo/version.py @@ -0,0 +1,6 @@ +# Author: Acer Zhang +# Datetime: 2022/1/11 +# Copyright belongs to the author. +# Please indicate the source for reprinting. + +__version__ = "0.1a" diff --git a/docs/src/applications/EIVideo/QEIVideo/__init__.py b/docs/src/applications/EIVideo/QEIVideo/__init__.py new file mode 100644 index 000000000..53e6390b0 --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/__init__.py @@ -0,0 +1,13 @@ +# Author: Acer Zhang +# Datetime: 2022/1/6 +# Copyright belongs to the author. +# Please indicate the source for reprinting. + +import os + +QEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__)) + +import os +from QEIVideo.version import __version__ + +QEI_VIDEO_ROOT = os.path.abspath(os.path.dirname(__file__)) diff --git a/docs/src/applications/EIVideo/QEIVideo/build_gui.py b/docs/src/applications/EIVideo/QEIVideo/build_gui.py new file mode 100644 index 000000000..477505acb --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/build_gui.py @@ -0,0 +1,151 @@ +# Author: Acer Zhang +# Datetime:2022/1/11 +# Copyright belongs to the author. +# Please indicate the source for reprinting. 
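A small, hypothetical round trip through the checkpoint helpers added in `save_load.py` above; the output directory and tensor values are made up.

```python
import paddle
from EIVideo.paddlevideo.utils.save_load import mkdir, save, load

mkdir("./output")                           # tolerates races when several workers call it
ckpt = {"step": 100, "w": paddle.ones([2, 2])}
save(ckpt, "./output/demo.pdparams")        # @main_only: only the rank-0 process writes
restored = load("./output/demo.pdparams")   # raises IOError if the file does not exist
print(restored["step"])
```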
+import json +import os + +import numpy as np +from PIL import Image + +from PyQt5 import QtCore, QtWidgets +from PyQt5.QtGui import * +from PyQt5.QtWidgets import * +from PyQt5.QtCore import * +import cv2 + +from EIVideo.api import json2frame, png2json, load_video +from EIVideo.main import main +# ToDo To AP-kai: 这是定义前端临时保存用于推理的json的地点之类的,因为是固定的,所以声明为全局常量是最好的 +from EIVideo import TEMP_JSON_SAVE_PATH, TEMP_IMG_SAVE_PATH, TEMP_JSON_FINAL_PATH + +from QEIVideo.gui.ui_main_window import Ui_MainWindow + + +class BuildGUI(QMainWindow, Ui_MainWindow): + def __init__(self): + super(BuildGUI, self).__init__() + # ToDo To AP-kai: 这里定义当前选择的视频路径的占位符,相当于全局变量 + self.select_video_path = None + # ToDo To AP-kai: 未来为用户提供个保存路径的入口哈,这里先随意定义了个路径 + self.save_path = "./result" + os.makedirs(self.save_path, exist_ok=True) + self.setupUi(self) + + def infer(self): + self.label.setText("Start infer") + self.progressBar.setProperty("value", 0) + image = self.paintBoard.get_content_as_q_image() + image.save(TEMP_IMG_SAVE_PATH) + print(self.slider_frame_num) + self.progressBar.setProperty("value", 25) + # ToDo To AP-kai:相同的文件路径,直接定义一个常量就好 + png2json(TEMP_IMG_SAVE_PATH, self.slider_frame_num, TEMP_JSON_SAVE_PATH) + self.progressBar.setProperty("value", 50) + # ToDo To AP-kai:打印的信息,需要注意首字母大写 + # ToDo To AP-kai: 此处传入保存路径以及当前选择的视频路径,最后会在manet_stage1.py里通过cfg来传入 + out = main(video_path=self.select_video_path, save_path=self.save_path) + print('Infer ok') + self.progressBar.setProperty("value", 75) + self.all_frames = json2frame(TEMP_JSON_FINAL_PATH) + print("Success get submit_masks") + self.open_frame() + self.progressBar.setProperty("value", 100) + self.label.setText("Infer succeed") + + def btn_func(self, btn): + if btn == self.playbtn: + self.label.setText("Play video") + if self.progress_slider.value() == self.cap.get(7) - 1: + self.slider_frame_num = 0 + self.progress_slider.setValue(self.slider_frame_num) + self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7))) + self.timer_camera = QTimer() # 定义定时器 + self.timer_camera.start(1000 / self.cap.get(cv2.CAP_PROP_FPS)) + self.slider_frame_num = self.progress_slider.value() + self.timer_camera.timeout.connect(self.open_frame) + + elif btn == self.pushButton_2: + self.label.setText("Stop video") + self.slot_stop() + + elif btn == self.pushButton_4: + self.label.setText("Choose video") + self.select_video_path, _ = QFileDialog.getOpenFileName(self.frame, "Open", "", "*.mp4;;All Files(*)") + print("-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-") + print("Select video file path:\t" + self.select_video_path) + # ToDo To AP-kai:下断点来看一下,如果不选择的时候返回值是什么样的,然后再做判断,目前这个if没有生效 + if self.select_video_path != "": + self.cap = cv2.VideoCapture(self.select_video_path) + # 存所有frame + self.save_temp_frame() + print("save temp frame done") + self.progress_slider.setRange(0, self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + self.slider_frame_num = 0 + self.open_frame() + + # ToDo To AP-kai: 未来这个地方增加提示框,告诉他没有选择文件 + + def on_cbtn_eraser_clicked(self): + self.label.setText("Eraser On") + if self.cbtn_Eraser.isChecked(): + self.paintBoard.EraserMode = True # 进入橡皮擦模式 + else: + self.paintBoard.EraserMode = False # 退出橡皮擦模式 + + def fill_color_list(self, combo_box): + index_black = 0 + index = 0 + for color in self.colorList: + if color == "black": + index_black = index + index += 1 + pix = QPixmap(70, 20) + pix.fill(QColor(color)) + combo_box.addItem(QIcon(pix), None) + combo_box.setIconSize(QSize(70, 20)) + combo_box.setSizeAdjustPolicy(QComboBox.AdjustToContents) + + 
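+        # select the 'black' swatch recorded above as the default pen colour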
combo_box.setCurrentIndex(index_black) + + def on_pen_color_change(self): + self.label.setText("Change pen color") + color_index = self.comboBox_penColor.currentIndex() + color_str = self.colorList[color_index] + + self.paintBoard.change_pen_color(color_str) + + # 拖拽进度条 + def update_video_position_func(self): + self.label.setText("Change slider position") + self.slider_frame_num = self.progress_slider.value() + self.slot_stop() + self.open_frame() + self.progress_slider.setValue(self.slider_frame_num) + self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7))) + + def save_temp_frame(self): + _, self.all_frames = load_video(self.select_video_path, 480) + + def slot_stop(self): + if self.cap != []: + self.timer_camera.stop() # 停止计时器 + else: + # ToDo To AP-kai: QMessageBox.warning没有返回值,这里我把Warming = QMessageBox.warning的Warming删去了 + QMessageBox.warning(self, "Warming", "Push the left upper corner button to Quit.", + QMessageBox.Yes) + + def open_frame(self): + self.progress_slider.setValue(self.slider_frame_num) + self.slider_frame_num = self.progress_slider.value() + self.frame = self.all_frames[self.slider_frame_num] + frame = self.frame + height, width, bytes_per_component = frame.shape + bytes_per_line = bytes_per_component * width + q_image = QImage(frame.data, width, height, bytes_per_line, + QImage.Format_RGB888).scaled(self.picturelabel.width(), self.picturelabel.height()) + self.picturelabel.setPixmap(QPixmap.fromImage(q_image)) + self.slider_frame_num = self.slider_frame_num + 1 + self.time_label.setText('{}/{}'.format(self.slider_frame_num, self.cap.get(7))) + if self.progress_slider.value() == self.cap.get(7) - 1: + self.slot_stop() diff --git a/docs/src/applications/EIVideo/QEIVideo/gui/__init__.py b/docs/src/applications/EIVideo/QEIVideo/gui/__init__.py new file mode 100644 index 000000000..1b0d211a2 --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/gui/__init__.py @@ -0,0 +1,4 @@ +# Author: Acer Zhang +# Datetime: 2022/1/6 +# Copyright belongs to the author. +# Please indicate the source for reprinting. diff --git a/docs/src/applications/EIVideo/QEIVideo/gui/demo.py b/docs/src/applications/EIVideo/QEIVideo/gui/demo.py new file mode 100644 index 000000000..b9573fafe --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/gui/demo.py @@ -0,0 +1,62 @@ +# Author: Acer Zhang +# Datetime: 2022/1/6 +# Copyright belongs to the author. +# Please indicate the source for reprinting. 
+import sys + +from PyQt5.QtWidgets import QApplication, QMainWindow, QFrame, QWidget +from PyQt5.QtGui import QPainter, QPixmap, QPen, QColor, QPainterPath +from PyQt5.QtCore import Qt, QPoint +from PyQt5 import QtCore, QtGui, QtWidgets + +from QEIVideo.ui.demo import Ui_MainWindow as DemoUIRoot + + +class DrawFrame(QWidget): + def __init__(self, painter, *args, **kwargs): + super(DrawFrame, self).__init__(*args, **kwargs) + self.painter = painter + + def paintEvent(self, event): + painter = QPainter(self) + pen = QPen(QColor("orange")) + pen.setWidth(5) + pen.setCapStyle(Qt.RoundCap) + pen.setJoinStyle(Qt.RoundJoin) + painter.setPen(pen) + painter.drawPath(self.painter) + + def mousePressEvent(self, event): + self.painter.moveTo(event.pos()) + self.update() + + def mouseMoveEvent(self, event): + self.painter.lineTo(event.pos()) + + self.update() + + +class DemoUI(QMainWindow, DemoUIRoot): + def __init__(self): + super(DemoUI, self).__init__() + self.setupUi(self) + + self.painter = QPainterPath() + self.draw_frame = DrawFrame(self.painter, self.video_frame) + self.draw_frame.setGeometry(QtCore.QRect(0, 10, 751, 301)) + self.draw_frame.setObjectName("draw_frame") + self.draw_frame.raise_() + self.draw_frame.setAttribute(QtCore.Qt.WA_TranslucentBackground) + + self.start_btn.clicked.connect(self.export) + + def export(self): + a = self.painter.toFillPolygon() + pass + + +if __name__ == '__main__': + app = QApplication(sys.argv) + gui_class = DemoUI() + gui_class.show() + sys.exit(app.exec_()) diff --git a/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py b/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py new file mode 100644 index 000000000..5c9627eb6 --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/gui/ui_main_window.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +# Form implementation generated from reading ui file 'GUI.ui' +# +# Created by: PyQt5 UI code generator 5.15.2 +# +# WARNING: Any manual changes made to this file will be lost when pyuic5 is +# run again. Do not edit this file unless you know what you are doing. 
+from PyQt5 import QtCore, QtWidgets +from PyQt5.QtGui import * +from PyQt5.QtWidgets import * +from PyQt5.QtCore import * +from QEIVideo.widget.PaintBoard import PaintBoard + + + +class Ui_MainWindow(object): + def setupUi(self, MainWindow): + MainWindow.setObjectName("EIVideo") + MainWindow.resize(1101, 751) + self.centralwidget = QtWidgets.QWidget(MainWindow) + self.centralwidget.setObjectName("centralwidget") + self.frame = QtWidgets.QFrame(self.centralwidget) + self.frame.setGeometry(QtCore.QRect(20, 20, 1271, 771)) + self.frame.setFrameShadow(QtWidgets.QFrame.Raised) + self.frame.setObjectName("frame") + + self.cap = [] + self.all_frames = [] + + self.fps = None + self.timer = QTimer(self.frame) + self.time_label = QLabel('--/--', self.frame) + + self.progress_slider = QSlider(self.frame) + self.progress_slider.setEnabled(True) + self.progress_slider.setOrientation(Qt.Horizontal) + self.progress_slider.setFixedWidth(710) + self.progress_slider.setFixedHeight(20) + self.progress_slider.setSingleStep(1) # 设置变化步长 + self.progress_slider.setValue(0) + self.progress_slider.sliderReleased.connect(self.update_video_position_func) # 拖拽进度条 + + self.picturelabel = QtWidgets.QLabel(self.frame) + self.picturelabel.setGeometry(30, 30, 810, 458) + self.picturelabel.setText("") + self.picturelabel.setObjectName("picturelabel") + + self.paintBoard = PaintBoard(self.frame) + self.paintBoard.setGeometry(30, 30, 810, 458) + + self.cbtn_Eraser = QCheckBox("橡皮擦") + self.cbtn_Eraser.setParent(self.frame) + self.cbtn_Eraser.move(950, 40) + self.cbtn_Eraser.clicked.connect(self.on_cbtn_eraser_clicked) + self.btn_Clear = QPushButton("清空画板") + self.btn_Clear.setParent(self.frame) # 设置父对象为本界面 + self.btn_Clear.move(950, 60) + self.btn_Clear.clicked.connect(self.paintBoard.clear) + self.label_penColor = QLabel(self.frame) + self.label_penColor.setText("画笔颜色") + self.label_penColor.move(990, 100) + # 获取颜色列表(字符串类型) + self.colorList = QColor.colorNames() + self.comboBox_penColor = QComboBox(self.frame) + self.fill_color_list(self.comboBox_penColor) # 用各种颜色填充下拉列表 + self.comboBox_penColor.move(1080, 80) + self.comboBox_penColor.currentIndexChanged.connect( + self.on_pen_color_change) # 关联下拉列表的当前索引变更信号与函数on_PenColorChange + + self.helplabel = QLabel() + self.helplabel.setText("Hi,Welcome to use EIVideo\n" + "This is a guide for EIVideo,\n" + "please check\n" + "1. Choose 'Add' for a video\n" + "2. Click 'Play' to start playing\n" + "3. At this point, all functions \n" + "are unlocked\n" + "4. 
Paint and enjoy it!\n") + + self.widget2 = QtWidgets.QWidget(self.frame) + self.widget2.setGeometry(860, 60, 200, 300) + self.widget2.setObjectName("widget2") + self.rightLayout = QtWidgets.QVBoxLayout(self.widget2) + self.rightLayout.setContentsMargins(0, 0, 0, 0) + self.rightLayout.setObjectName("rightLayout") + self.rightLayout.addWidget(self.helplabel) + self.rightLayout.addSpacing(50) + self.rightLayout.addWidget(self.cbtn_Eraser) + self.rightLayout.addWidget(self.btn_Clear) + self.colorLayout = QtWidgets.QHBoxLayout(self.widget2) + self.colorLayout.setContentsMargins(0, 0, 0, 0) + self.colorLayout.setObjectName('colorLayout') + self.colorLayout.addWidget(self.label_penColor) + self.colorLayout.addWidget(self.comboBox_penColor) + self.rightLayout.addLayout(self.colorLayout) + + + + # pushButton_6 -> GO + self.pushButton_6 = QtWidgets.QPushButton(self.frame) + self.pushButton_6.setGeometry(870, 600, 150, 90) + self.pushButton_6.setObjectName("pushButton_6") + self.pushButton_6.clicked.connect(self.infer) + + self.widget1 = QtWidgets.QWidget(self.frame) + self.widget1.move(60, 520) + self.widget1.setObjectName("widget1") + self.barLayout = QtWidgets.QVBoxLayout(self.widget1) + self.barLayout.setContentsMargins(0, 0, 0, 0) + self.barLayout.setObjectName("barLayout") + self.horizontalLayout = QtWidgets.QHBoxLayout(self.widget1) + self.horizontalLayout.setContentsMargins(0, 0, 0, 0) + self.horizontalLayout.setObjectName("horizontalLayout") + self.timeLayout = QtWidgets.QHBoxLayout(self.widget1) + self.timeLayout.setContentsMargins(0, 0, 0, 0) + self.timeLayout.setObjectName("horizontalLayout") + + self.playbtn = QtWidgets.QPushButton(self.widget1) + self.playbtn.setObjectName("playbtn") + self.playbtn.clicked.connect(lambda: self.btn_func(self.playbtn)) + self.horizontalLayout.addWidget(self.playbtn) + self.pushButton_2 = QtWidgets.QPushButton(self.widget1) + self.pushButton_2.setObjectName("pushButton_2") + self.pushButton_2.clicked.connect(lambda: self.btn_func(self.pushButton_2)) + self.horizontalLayout.addWidget(self.pushButton_2) + self.pushButton_4 = QtWidgets.QPushButton(self.widget1) + self.pushButton_4.setObjectName("pushButton_4") + self.pushButton_4.clicked.connect(lambda: self.btn_func(self.pushButton_4)) + self.horizontalLayout.addWidget(self.pushButton_4) + + self.timeLayout.addWidget(self.progress_slider) + self.timeLayout.addWidget(self.time_label) + self.barLayout.addSpacing(20) + self.barLayout.addLayout(self.timeLayout) + self.barLayout.addSpacing(30) + self.barLayout.addLayout(self.horizontalLayout) + + self.splitter = QtWidgets.QSplitter(self.frame) + self.splitter.setGeometry(QtCore.QRect(71, 670, 750, 20)) + self.splitter.setOrientation(QtCore.Qt.Horizontal) + self.splitter.setObjectName("splitter") + self.label = QtWidgets.QLabel(self.splitter) + self.label.setObjectName("label") + self.progressBar = QtWidgets.QProgressBar(self.splitter) + self.progressBar.setProperty("value", 0) + self.progressBar.setObjectName("progressBar") + MainWindow.setCentralWidget(self.centralwidget) + self.menubar = QtWidgets.QMenuBar(MainWindow) + self.menubar.setGeometry(QtCore.QRect(0, 0, 1327, 23)) + self.menubar.setObjectName("menubar") + MainWindow.setMenuBar(self.menubar) + self.statusbar = QtWidgets.QStatusBar(MainWindow) + self.statusbar.setObjectName("statusbar") + MainWindow.setStatusBar(self.statusbar) + + self.retranslateUi(MainWindow) + QtCore.QMetaObject.connectSlotsByName(MainWindow) + + def retranslateUi(self, MainWindow): + _translate = QtCore.QCoreApplication.translate + 
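+        # apply the user-visible captions to the widgets created in setupUi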
MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow")) + self.pushButton_6.setText(_translate("MainWindow", "GO")) + self.playbtn.setText(_translate("MainWindow", "Play")) + self.pushButton_2.setText(_translate("MainWindow", "Stop")) + self.pushButton_4.setText(_translate("MainWindow", "Add")) + self.label.setText(_translate("MainWindow", "Hi, This is EIVideo")) + + diff --git a/docs/src/applications/EIVideo/QEIVideo/start.py b/docs/src/applications/EIVideo/QEIVideo/start.py new file mode 100644 index 000000000..fe8d3785a --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/start.py @@ -0,0 +1,20 @@ +# Author: AP-Kai +# Datetime: 2022/1/7 +# Copyright belongs to the author. +# Please indicate the source for reprinting. + + +import sys +from QEIVideo.build_gui import BuildGUI +from PyQt5.QtWidgets import QApplication + + +def run(): + app = QApplication(sys.argv) + demo = BuildGUI() + demo.show() + sys.exit(app.exec()) + + +if __name__ == '__main__': + run() diff --git a/docs/src/applications/EIVideo/QEIVideo/tools/__init__.py b/docs/src/applications/EIVideo/QEIVideo/tools/__init__.py new file mode 100644 index 000000000..1b0d211a2 --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/tools/__init__.py @@ -0,0 +1,4 @@ +# Author: Acer Zhang +# Datetime: 2022/1/6 +# Copyright belongs to the author. +# Please indicate the source for reprinting. diff --git a/docs/src/applications/EIVideo/QEIVideo/ui/__init__.py b/docs/src/applications/EIVideo/QEIVideo/ui/__init__.py new file mode 100644 index 000000000..1b0d211a2 --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/ui/__init__.py @@ -0,0 +1,4 @@ +# Author: Acer Zhang +# Datetime: 2022/1/6 +# Copyright belongs to the author. +# Please indicate the source for reprinting. diff --git a/docs/src/applications/EIVideo/QEIVideo/ui/demo.py b/docs/src/applications/EIVideo/QEIVideo/ui/demo.py new file mode 100644 index 000000000..2985ec2ce --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/ui/demo.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +# Form implementation generated from reading ui file '/Users/zhanghongji/PycharmProjects/EIVideo/resources/QT/demo.ui' +# +# Created by: PyQt5 UI code generator 5.15.6 +# +# WARNING: Any manual changes made to this file will be lost when pyuic5 is +# run again. Do not edit this file unless you know what you are doing. 
+ + +from PyQt5 import QtCore, QtGui, QtWidgets + + +class Ui_MainWindow(object): + def setupUi(self, MainWindow): + MainWindow.setObjectName("MainWindow") + MainWindow.resize(800, 486) + MainWindow.setMinimumSize(QtCore.QSize(800, 486)) + MainWindow.setMaximumSize(QtCore.QSize(800, 486)) + self.centralwidget = QtWidgets.QWidget(MainWindow) + self.centralwidget.setObjectName("centralwidget") + self.video_frame = QtWidgets.QFrame(self.centralwidget) + self.video_frame.setGeometry(QtCore.QRect(20, 20, 761, 361)) + self.video_frame.setFrameShape(QtWidgets.QFrame.StyledPanel) + self.video_frame.setFrameShadow(QtWidgets.QFrame.Raised) + self.video_frame.setObjectName("video_frame") + self.graphicsView = QtWidgets.QGraphicsView(self.video_frame) + self.graphicsView.setGeometry(QtCore.QRect(0, 0, 761, 321)) + self.graphicsView.setObjectName("graphicsView") + self.frame_2 = QtWidgets.QFrame(self.video_frame) + self.frame_2.setGeometry(QtCore.QRect(0, 320, 761, 41)) + self.frame_2.setFrameShape(QtWidgets.QFrame.StyledPanel) + self.frame_2.setFrameShadow(QtWidgets.QFrame.Raised) + self.frame_2.setObjectName("frame_2") + self.horizontalLayoutWidget = QtWidgets.QWidget(self.frame_2) + self.horizontalLayoutWidget.setGeometry(QtCore.QRect(-1, -1, 761, 41)) + self.horizontalLayoutWidget.setObjectName("horizontalLayoutWidget") + self.horizontalLayout = QtWidgets.QHBoxLayout(self.horizontalLayoutWidget) + self.horizontalLayout.setContentsMargins(0, 0, 0, 0) + self.horizontalLayout.setObjectName("horizontalLayout") + self.open_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget) + self.open_btn.setObjectName("open_btn") + self.horizontalLayout.addWidget(self.open_btn) + self.save_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget) + self.save_btn.setObjectName("save_btn") + self.horizontalLayout.addWidget(self.save_btn) + self.horizontalSlider = QtWidgets.QSlider(self.horizontalLayoutWidget) + self.horizontalSlider.setOrientation(QtCore.Qt.Horizontal) + self.horizontalSlider.setObjectName("horizontalSlider") + self.horizontalLayout.addWidget(self.horizontalSlider) + self.select_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget) + self.select_btn.setObjectName("select_btn") + self.horizontalLayout.addWidget(self.select_btn) + self.clean_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget) + self.clean_btn.setObjectName("clean_btn") + self.horizontalLayout.addWidget(self.clean_btn) + self.start_btn = QtWidgets.QPushButton(self.horizontalLayoutWidget) + self.start_btn.setObjectName("start_btn") + self.horizontalLayout.addWidget(self.start_btn) + self.draw_frame = QtWidgets.QFrame(self.video_frame) + self.draw_frame.setGeometry(QtCore.QRect(0, 10, 751, 301)) + self.draw_frame.setFrameShape(QtWidgets.QFrame.StyledPanel) + self.draw_frame.setFrameShadow(QtWidgets.QFrame.Raised) + self.draw_frame.setObjectName("draw_frame") + self.menu_tab = QtWidgets.QTabWidget(self.centralwidget) + self.menu_tab.setGeometry(QtCore.QRect(20, 380, 761, 81)) + self.menu_tab.setObjectName("menu_tab") + self.tab = QtWidgets.QWidget() + self.tab.setObjectName("tab") + self.act_label = QtWidgets.QLabel(self.tab) + self.act_label.setEnabled(True) + self.act_label.setGeometry(QtCore.QRect(10, 30, 71, 21)) + self.act_label.setObjectName("act_label") + self.act_info_label = QtWidgets.QLabel(self.tab) + self.act_info_label.setEnabled(True) + self.act_info_label.setGeometry(QtCore.QRect(80, 30, 81, 21)) + self.act_info_label.setObjectName("act_info_label") + self.act_progressbar = QtWidgets.QProgressBar(self.tab) + 
self.act_progressbar.setGeometry(QtCore.QRect(170, 32, 521, 21)) + self.act_progressbar.setProperty("value", 24) + self.act_progressbar.setObjectName("act_progressbar") + self.label_3 = QtWidgets.QLabel(self.tab) + self.label_3.setEnabled(True) + self.label_3.setGeometry(QtCore.QRect(680, 30, 60, 21)) + self.label_3.setLayoutDirection(QtCore.Qt.LeftToRight) + self.label_3.setAlignment(QtCore.Qt.AlignRight|QtCore.Qt.AlignTrailing|QtCore.Qt.AlignVCenter) + self.label_3.setObjectName("label_3") + self.menu_tab.addTab(self.tab, "") + self.tab_2 = QtWidgets.QWidget() + self.tab_2.setObjectName("tab_2") + self.menu_tab.addTab(self.tab_2, "") + MainWindow.setCentralWidget(self.centralwidget) + self.statusbar = QtWidgets.QStatusBar(MainWindow) + self.statusbar.setObjectName("statusbar") + MainWindow.setStatusBar(self.statusbar) + + self.retranslateUi(MainWindow) + self.menu_tab.setCurrentIndex(0) + QtCore.QMetaObject.connectSlotsByName(MainWindow) + + def retranslateUi(self, MainWindow): + _translate = QtCore.QCoreApplication.translate + MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow")) + self.open_btn.setText(_translate("MainWindow", "打开视频")) + self.save_btn.setText(_translate("MainWindow", "保存标注")) + self.select_btn.setText(_translate("MainWindow", "选择目标")) + self.clean_btn.setText(_translate("MainWindow", "清空目标")) + self.start_btn.setText(_translate("MainWindow", "开始推理")) + self.act_label.setText(_translate("MainWindow", "当前状态:")) + self.act_info_label.setText(_translate("MainWindow", "-------------")) + self.label_3.setText(_translate("MainWindow", "12%")) + self.menu_tab.setTabText(self.menu_tab.indexOf(self.tab), _translate("MainWindow", "状态")) + self.menu_tab.setTabText(self.menu_tab.indexOf(self.tab_2), _translate("MainWindow", "属性配置")) diff --git a/docs/src/applications/EIVideo/QEIVideo/version.py b/docs/src/applications/EIVideo/QEIVideo/version.py new file mode 100644 index 000000000..1d2c58584 --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/version.py @@ -0,0 +1,6 @@ +# Author: Acer Zhang +# Datetime: 2022/1/11 +# Copyright belongs to the author. +# Please indicate the source for reprinting. 
+ +__version__ = "0.1a" diff --git a/docs/src/applications/EIVideo/QEIVideo/widget/PaintBoard.py b/docs/src/applications/EIVideo/QEIVideo/widget/PaintBoard.py new file mode 100644 index 000000000..020d6a0d9 --- /dev/null +++ b/docs/src/applications/EIVideo/QEIVideo/widget/PaintBoard.py @@ -0,0 +1,106 @@ +from PyQt5.QtWidgets import QWidget +from PyQt5.Qt import QPixmap, QPainter, QPoint, QPaintEvent, QMouseEvent, QPen, \ + QColor, QSize +from PyQt5.QtCore import Qt + + +class PaintBoard(QWidget): + + def __init__(self, parent=None): + ''' + Constructor + ''' + super().__init__(parent) + + self.__init_data() # 先初始化数据,再初始化界面 + self.__init_view() + + def __init_data(self): + + self.__size = QSize(810, 458) + + # 新建QPixmap作为画板,尺寸为__size + self.__board = QPixmap(self.__size) + self.__board.fill(Qt.transparent) # 用透明填充画板 + + self.__IsEmpty = True # 默认为空画板 + self.EraserMode = False # 默认为禁用橡皮擦模式 + + self.__lastPos = QPoint(0, 0) # 上一次鼠标位置 + self.__currentPos = QPoint(0, 0) # 当前的鼠标位置 + + self.__painter = QPainter() # 新建绘图工具 + + self.__thickness = 15 # 默认画笔粗细为10px + self.__penColor = QColor("black") # 设置默认画笔颜色为黑色 + self.__colorList = QColor.colorNames() # 获取颜色列表 + + def __init_view(self): + # 设置界面的尺寸为__size + self.setFixedSize(self.__size) + + def clear(self): + # 清空画板 + # self.__board.fill(Qt.white) + self.__board = QPixmap(self.__size) + self.__board.fill(Qt.transparent) # 用透明填充画板 + + self.update() + self.__IsEmpty = True + + def change_pen_color(self, color="black"): + # 改变画笔颜色 + # rgbaColor = QColor(255, 255, 0, 100) + self.__penColor = QColor(color) + + def change_pen_thickness(self, thickness=10): + # 改变画笔粗细 + self.__thickness = thickness + + def is_empty(self): + # 返回画板是否为空 + return self.__IsEmpty + + def get_content_as_q_image(self): + # 获取画板内容(返回QImage) + image = self.__board.toImage() + return image + + def paintEvent(self, paint_event): + # 绘图事件 + # 绘图时必须使用QPainter的实例,此处为__painter + # 绘图在begin()函数与end()函数间进行 + # begin(param)的参数要指定绘图设备,即把图画在哪里 + # drawPixmap用于绘制QPixmap类型的对象 + self.__painter.begin(self) + # 0,0为绘图的左上角起点的坐标,__board即要绘制的图 + self.__painter.drawPixmap(0, 0, self.__board) + self.__painter.end() + + def mousePressEvent(self, mouse_event): + # 鼠标按下时,获取鼠标的当前位置保存为上一次位置 + self.__currentPos = mouse_event.pos() + self.__lastPos = self.__currentPos + + def mouseMoveEvent(self, mouse_event): + # 鼠标移动时,更新当前位置,并在上一个位置和当前位置间画线 + self.__currentPos = mouse_event.pos() + self.__painter.begin(self.__board) + + if self.EraserMode == False: + # 非橡皮擦模式 + self.__painter.setPen(QPen(self.__penColor, self.__thickness)) # 设置画笔颜色,粗细 + else: + # 橡皮擦模式下画笔为纯白色,粗细为10 + self.__painter.setPen(QPen(Qt.transparent, 10)) + + # 画线 + # print(self.__lastPos + self.__currentPos) + self.__painter.drawLine(self.__lastPos, self.__currentPos) + self.__painter.end() + self.__lastPos = self.__currentPos + + self.update() # 更新显示 + + def mouseReleaseEvent(self, mouseEvent): + self.__IsEmpty = False # 画板不再为空 diff --git a/docs/src/applications/EIVideo/README.md b/docs/src/applications/EIVideo/README.md new file mode 100644 index 000000000..c88b596a3 --- /dev/null +++ b/docs/src/applications/EIVideo/README.md @@ -0,0 +1,124 @@ +# EIVideo - 交互式智能视频标注工具 + +[![Downloads](https://static.pepy.tech/personalized-badge/eivideo?period=total&units=international_system&left_color=grey&right_color=orange&left_text=EIVideo%20User)](https://pepy.tech/project/eivideo) 
+[![Downloads](https://static.pepy.tech/personalized-badge/qeivideo?period=total&units=international_system&left_color=grey&right_color=orange&left_text=QEIVideo%20User)](https://pepy.tech/project/qeivideo) +![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/QPT-Family/EIVideo?include_prereleases) +![GitHub forks](https://img.shields.io/github/forks/QPT-Family/EIVideo) +![GitHub Repo stars](https://img.shields.io/github/stars/QPT-Family/EIVideo) +![GitHub](https://img.shields.io/github/license/QPT-Family/EIVideo) +![](https://img.shields.io/badge/%E6%B7%B1%E5%BA%A6%E9%80%82%E9%85%8D->Win7-9cf) + +--- + + +
+<!-- 图片 (image placeholder) -->
+
    + +EIVideo,基于百度飞桨MA-Net交互式视频分割模型打造的交互式**智能视频**标注工具箱,只需简单标注几帧,即可完成全视频标注,若自动标注结果未达要求还可通过多次和视频交互而不断提升视频分割质量,直至对分割质量满意。 + +戳 -> 了解相关[技术文章&模型原理](等待微信公众号) + +
+<!-- 图片 (image placeholder) -->
+
    + +> 为了更好的解放双手,我们还提供了图形化界面工具QEIVideo,通过它我们可以不使用繁杂的命令方式来完成视频的智能标注工作。 + +--- + +### README目录 + +- [EAP - The Early Access Program 早期访问计划](#eap---the-early-access-program-早期访问计划) +- [使用方式](#使用方式) + - [安装&运行](#安装运行) + - [QPT包 - 适合无Python基础用户](#qpt包---适合无python基础用户) + - [标准Python包 - 适合普通Python开发者](#标准python包---适合普通python开发者) + - [开发版本 - 适合高阶开发者进行开发/社区贡献](#开发版本---适合高阶开发者进行开发社区贡献) +- [(Q)EIVideo产品规划安排](#qeivideo产品规划安排) +- [开源协议](#开源协议) + +--- + +### EAP - The Early Access Program 早期访问计划 + +> Warning 当前图形化界面QEIVideo处于**极其初阶**的...建设阶段,并不能保证程序稳定性。 + +
<!-- 图片 (image placeholder) -->
    + +当您选择使用QEIVideo作为图形化界面时,即可视为同意使用“可能会存在大量体验不佳”的EAP产品。 + +同样,您可选择借助基于[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo) 实现的 +交互式视频标注模型[EIVideo](https://github.com/QPT-Family/EIVideo/EIVideo) 进行二次开发,在此之上也可完成您需要的自定义图形化界面,后续也将提供二次开发指南。 + +
<!-- 图片 (image placeholder) -->
    + + +> 如果您愿意参与到EIVideo或QEIVideo的建设中来,欢迎您与PMC取得联系 -> WX:GT_ZhangAcer + +## 使用方式 +### 安装&运行 +#### QPT包 - 适合无Python基础用户 +自动化配置相关Python环境,但仅支持Windows7/10/11操作系统,且不对盗版Windows7做任何适配。 +下载地址:暂未上传 +> 自动化部署工具由[QPT - 自动封装工具](https://github.com/QPT-Family/QPT) 支持 + +#### 标准Python包 - 适合普通Python开发者 +* 国际方式: + ```shell + python -m pip install eivideo + python qeivideo + ``` +* 国内推荐: + ```shell + python -m pip install eivideo -i https://mirrors.bfsu.edu.cn/pypi/web/simple + python qeivideo + ``` +> 上述命令仅适用于常规情况,若您安装了多个Python或修改了相关开发工具与配置,请自行修改相关命令使其符合您的开发环境。 + +#### 开发版本 - 适合高阶开发者进行开发/社区贡献 + +* 国际方式: + ```shell + git clone https://github.com/QPT-Family/EIVideo.git + python -m pip install -r requirements.txt + ``` +* 国内推荐: + ```shell + # 请勿用于Push!!! + git clone https://hub.fastgit.org/QPT-Family/EIVideo.git + python -m pip install -r requirements.txt -i https://mirrors.bfsu.edu.cn/pypi/web/simple + ``` +* 运行程序 + ```shell + # 进入工作目录 + cd 此处填写EIVideo所在的目录的绝对路径,且该目录下拥有EIVideo与QEIVideo两文件夹。 + # 运行 + python QEIVideo/start.py + + # 如运行时无法找到对应包,可选择下述方式添加环境变量来调整索引次序后执行python + # Windows + set PYTHONPATH=$pwd:$PYTHONPATH + # Linux + export PYTHONPATH=$pwd:$PYTHONPATH + ``` + +> 上述命令仅适用于常规情况,若您安装了多个Python或修改了相关开发工具与配置,请自行修改相关命令使其符合您的开发环境。 + +## (Q)EIVideo产品规划安排 +> 由于QEIVideo由飞桨开源社区学生爱好者构成,所以在项目的产出过程中将会以学习为主进行开源贡献,如您原因与我们一同建设,我们也将非常欢迎~ +
<!-- 图片 (image placeholder) -->
    + +- [x] EIVideo与Demo版QEIVideo发布0.1.0Alpha版本 +- [ ] 完善QEIVideo,丰富基础标注功能,于Q1升级至1.0Alpha版本 +- [ ] 回归QEIVideo稳定性,于Q2完成1.0正式版本发版 +- [ ] 增加视频目标检测、分类任务的交互式标注功能。 + +### 开源协议 +本项目使用GNU LESSER GENERAL PUBLIC LICENSE(LGPL)开源协议。 +> 因所使用的模型与数据集等原因,本项目中任一代码、参数均不可直接进行商用,如需商用请与我们取得联系。 + +### 引用来源 +1. EIVideo模型以及相关源码、论文与项目 - [PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo) +2. 部分表情包来源 - [甘城なつき](https://www.pixiv.net/users/3036679) + diff --git a/docs/src/applications/EIVideo/resources/QT/demo.ui b/docs/src/applications/EIVideo/resources/QT/demo.ui new file mode 100644 index 000000000..b11250624 --- /dev/null +++ b/docs/src/applications/EIVideo/resources/QT/demo.ui @@ -0,0 +1,236 @@ + + + MainWindow + + + + 0 + 0 + 800 + 486 + + + + + 800 + 486 + + + + + 800 + 486 + + + + MainWindow + + + + + + 20 + 20 + 761 + 361 + + + + QFrame::StyledPanel + + + QFrame::Raised + + + + + 0 + 0 + 761 + 321 + + + + + + + 0 + 320 + 761 + 41 + + + + QFrame::StyledPanel + + + QFrame::Raised + + + + + -1 + -1 + 761 + 41 + + + + + + + 打开视频 + + + + + + + 保存标注 + + + + + + + Qt::Horizontal + + + + + + + 选择目标 + + + + + + + 清空目标 + + + + + + + 开始推理 + + + + + + + + + + 0 + 10 + 751 + 301 + + + + QFrame::StyledPanel + + + QFrame::Raised + + + + + + + 20 + 380 + 761 + 81 + + + + 0 + + + + 状态 + + + + true + + + + 10 + 30 + 71 + 21 + + + + 当前状态: + + + + + true + + + + 80 + 30 + 81 + 21 + + + + ------------- + + + + + + 170 + 32 + 521 + 21 + + + + 24 + + + + + true + + + + 680 + 30 + 60 + 21 + + + + Qt::LeftToRight + + + 12% + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + + + + 属性配置 + + + + + + + + + diff --git a/docs/src/applications/EIVideo/resources/cmd b/docs/src/applications/EIVideo/resources/cmd new file mode 100644 index 000000000..c21d88afb --- /dev/null +++ b/docs/src/applications/EIVideo/resources/cmd @@ -0,0 +1,4 @@ +# 更新PaddleVideo上的EIVideo +git subtree push --prefix=applications/EIVideo/ https://github.com/QPT-Family/EIVideo 开发分支 +git subtree pull --prefix=applications/EIVideo/ https://github.com/QPT-Family/EIVideo 开发分支 --squash +git subtree split --rejoin --prefix=applications/EIVideo/ --branch 开发分支 \ No newline at end of file diff --git a/docs/src/applications/FightRecognition/README.md b/docs/src/applications/FightRecognition/README.md new file mode 100644 index 000000000..e82e4f650 --- /dev/null +++ b/docs/src/applications/FightRecognition/README.md @@ -0,0 +1,249 @@ +# 打架识别模型 + +## 内容 +- [1 快速开始](#快速开始) +- [2 数据准备](#数据准备) + - [2.1 数据集下载](#数据集下载) + - [2.2 视频抽帧](#视频抽帧) + - [2.3 训练集和验证集划分](#训练集和验证集划分) + - [2.4 视频裁剪](#视频裁剪) +- [3 模型训练](#模型训练) +- [4 模型评估](#模型评估) +- [5 模型导出](#模型导出) + + +实时行人分析工具[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)中集成了视频分类的打架识别模块。本文档介绍如何基于[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo/),完成打架识别模型的训练流程。 + +目前打架识别模型使用的是[PP-TSM](https://github.com/PaddlePaddle/PaddleVideo/blob/63c88a435e98c6fcaf353429d2df6cc24b8113ba/docs/zh-CN/model_zoo/recognition/pp-tsm.md),并在PP-TSM视频分类模型训练流程的基础上修改适配,完成模型训练。 + +请先参考[使用说明](https://github.com/XYZ-916/PaddleVideo/blob/develop/docs/zh-CN/usage.md)了解PaddleVideo模型库的使用。 + + + +## 1 快速开始 + +打架识别静态图模型获取[https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.zip](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.zip)。 + +打架识别[demo](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/fight_demo.mp4)。 + +首先需要将下载好的静态图模型解压并放到`inference`目录下,然后执行下面的命令即可直接判断一个给定的视频中是否存在打架行为: + +``` +cd ${PaddleVideo_root} +python tools/predict.py --input_file fight.avi \ + --config 
pptsm_fight_frames_dense.yaml \ + --model_file inference/ppTSM/ppTSM.pdmodel \ + --params_file inference/ppTSM/ppTSM.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + + + +## 2 数据准备 + +PP-TSM是一个基于视频片段进行预测的模型。在PaddleVideo中,训练数据为`.mp4`、`.avi`等格式视频或者是抽帧后的视频帧序列,标签则可以是`.txt`格式存储的文件。 + + +### 2.1 数据集下载 + +本项目基于6个公开的打架、暴力行为相关数据集合并后的数据进行模型训练。公开数据集具体信息如下: + +| 数据集 | 下载连接 | 简介 | 标注 | 数量 | 时长 | +| ---- | ---- | ---------- | ---- | ---- | ---------- | +| Surveillance Camera Fight Dataset| https://github.com/sayibet/fight-detection-surv-dataset | 裁剪视频,监控视角 | 视频级别 | 打架:150;非打架:150 | 2s | +| A Dataset for Automatic Violence Detection in Videos | https://github.com/airtlab/A-Dataset-for-Automatic-Violence-Detection-in-Videos | 裁剪视频,室内自行录制 | 视频级别 | 暴力行为:115个场景,2个机位,共230 ;非暴力行为:60个场景,2个机位,共120 | 几秒钟 | +| Hockey Fight Detection Dataset | https://www.kaggle.com/datasets/yassershrief/hockey-fight-vidoes?resource=download | 裁剪视频,非真实场景 | 视频级别 | 打架:500;非打架:500 | 2s | +| Video Fight Detection Dataset | https://www.kaggle.com/datasets/naveenk903/movies-fight-detection-dataset | 裁剪视频,非真实场景 | 视频级别 | 打架:100;非打架:101 | 2s | +| Real Life Violence Situations Dataset | https://www.kaggle.com/datasets/mohamedmustafa/real-life-violence-situations-dataset | 裁剪视频,非真实场景 | 视频级别 | 暴力行为:1000;非暴力行为:1000 | 几秒钟 | +| UBI Abnormal Event Detection Dataset| http://socia-lab.di.ubi.pt/EventDetection/ | 未裁剪视频,监控视角 | 帧级别 | 打架:216;非打架:784;裁剪后二次标注:打架1976,非打架1630 | 原视频几秒到几分钟不等,裁剪后2s | + +打架(暴力行为)视频3956个,非打架(非暴力行为)视频3501个,共7457个视频,每个视频几秒钟。 + + +### 2.2 视频抽帧 + +为了加快训练速度,将视频进行抽帧。 + +```bash +cd ${PaddleVideo_root} +python data/ucf101/extract_rawframes.py dataset/ rawframes/ --level 2 --ext mp4 +``` +其中,视频存放在`dataset`目录下,打架(暴力)视频存放在`dataset/fight`中;非打架(非暴力)视频存放在`dataset/nofight`中。`rawframes`目录存放抽取的视频帧。 + + +### 2.3 训练集和验证集划分 + +本项目验证集1500条,来自Surveillance Camera Fight Dataset、A Dataset for Automatic Violence Detection in Videos、UBI Abnormal Event Detection Dataset三个数据集。 + +也可根据下面的代码将数据按照0.8:0.2的比例划分成训练集和测试集: + +```python +import os +import glob +import random +import fnmatch +import re + +class_id = { + "nofight":0, + "fight":1 +} + +def get_list(path,key_func=lambda x: x[-11:], rgb_prefix='img_', level=1): + if level == 1: + frame_folders = glob.glob(os.path.join(path, '*')) + elif level == 2: + frame_folders = glob.glob(os.path.join(path, '*', '*')) + else: + raise ValueError('level can be only 1 or 2') + + def count_files(directory): + lst = os.listdir(directory) + cnt = len(fnmatch.filter(lst, rgb_prefix + '*')) + return cnt + + # check RGB + video_dict = {} + for f in frame_folders: + cnt = count_files(f) + k = key_func(f) + if level==2: + k = k.split("/")[0] + + video_dict[f]=str(cnt)+" "+str(class_id[k]) + + return video_dict + +def fight_splits(video_dict, train_percent=0.8): + videos = list(video_dict.keys()) + + train_num = int(len(videos)*train_percent) + + train_list = [] + val_list = [] + + random.shuffle(videos) + + for i in range(train_num): + train_list.append(videos[i]+" "+str(video_dict[videos[i]])) + for i in range(train_num,len(videos)): + val_list.append(videos[i]+" "+str(video_dict[videos[i]])) + + print("train:",len(train_list),",val:",len(val_list)) + + with open("fight_train_list.txt","w") as f: + for item in train_list: + f.write(item+"\n") + + with open("fight_val_list.txt","w") as f: + for item in val_list: + f.write(item+"\n") + +frame_dir = "rawframes" +level = 2 +train_percent = 0.8 + +if level == 2: + def key_func(x): + return '/'.join(x.split('/')[-2:]) +else: + def key_func(x): + return x.split('/')[-1] 
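+# key_func maps a frame-folder path to the key used for the class-id lookup:
+# with level == 2 it keeps "class_name/video_name" (get_list later strips the
+# video part), with level == 1 only the folder name itself is used.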
+ +video_dict = get_list(frame_dir, key_func=key_func, level=level) +print("number:",len(video_dict)) + +fight_splits(video_dict, train_percent) +``` + +最终生成fight_train_list.txt和fight_val_list.txt两个文件。打架的标签为1,非打架的标签为0。 + + +### 2.4 视频裁剪 +对于未裁剪的视频,需要先进行裁剪才能用于模型训练,这个给出视频裁剪的函数`cut_video`,输入为视频路径,裁剪的起始帧和结束帧以及裁剪后的视频保存路径。 + +```python + +import cv2 + +def cut_video(video_path, frameToStart, frametoStop, saved_video_path): + cap = cv2.VideoCapture(video_path) + FPS = cap.get(cv2.CAP_PROP_FPS) + #print("FPS:",FPS) + + TOTAL_FRAME = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) # 获取视频总帧数 + #print("TOTAL_FRAME:",TOTAL_FRAME) + size = (cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + #print("size:",size) + videoWriter =cv2.VideoWriter(saved_video_path,apiPreference = 0,fourcc = cv2.VideoWriter_fourcc(*'mp4v'),fps=FPS, + frameSize=(int(size[0]),int(size[1]))) + + COUNT = 0 + while True: + success, frame = cap.read() + if success: + COUNT += 1 + if COUNT <= frametoStop and COUNT > frameToStart: # 选取起始帧 + videoWriter.write(frame) + else: + print("cap.read failed!") + break + if COUNT > frametoStop: + break + + cap.release() + videoWriter.release() + + print(saved_video_path) +``` + + +## 3 模型训练 +下载预训练模型: +```bash +wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams +``` + +模型训练: +```bash +# 单卡训练 +cd ${PaddleVideo_root} +python main.py --validate -c pptsm_fight_frames_dense.yaml +``` + +```bash +cd ${PaddleVideo_root} +# 多卡训练 +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python -B -m paddle.distributed.launch --gpus=“0,1,2,3” \ + --log_dir=log_pptsm_dense main.py --validate \ + -c pptsm_fight_frames_dense.yaml +``` + + +## 4 模型评估 + +训练好的模型下载:[https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_fight.pdparams) + +模型评估: +```bash +cd ${PaddleVideo_root} +python main.py --test -c pptsm_fight_frames_dense.yaml \ + -w ppTSM_fight_best.pdparams +``` + +其中`ppTSM_fight_best.pdparams`为训练好的模型。 + + +## 5 模型导出 + +导出inference模型: + +```bash +cd ${PaddleVideo_root} +python tools/export_model.py -c pptsm_fight_frames_dense.yaml \ + -p ppTSM_fight_best.pdparams \ + -o inference/ppTSM +``` + diff --git a/docs/src/applications/FigureSkating/README.md b/docs/src/applications/FigureSkating/README.md new file mode 100644 index 000000000..5f7af394b --- /dev/null +++ b/docs/src/applications/FigureSkating/README.md @@ -0,0 +1,92 @@ +# 花样滑冰动作识别 + +--- +## 内容 + +- [视频数据处理方法](#视频数据处理方法) +- [模型训练预测方法](#模型训练预测方法) + + +
    +
    + +### 视频数据处理方法 + + - 提供从视频中提取骨骼点数据的方法,方便用户自行提取数据进行测试。 + + 花样滑冰数据提取采用了openpose,通过其提供的demo或是相应的api来实现数据的提取,因此需要用户配置openpose环境。 + 如下是通过花样滑冰数据集构建项目[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)提取骨骼点数据方法的具体介绍。 + + #### step1 安装openpose + + - 参考:https://github.com/CMU-Perceptual-Computing-Lab/openpose + + #### step2 测试openpose提供demo + + - 这里通过测试openpose的demo程序来验证是否安装成功。 + + demo1:检测视频中身体骨骼点(以linux系统为例): + + ```bash + ./build/examples/openpose/openpose.bin --video examples_video.avi --write_json output/ --display 0 --render_pose 0 + ``` + + 执行成功之后会在output/路径下生成视频每一帧骨骼点数据的json文件。 + + demo2:检测视频中身体+面部+手部骨骼点(以linux系统为例): + + ```bash + ./build/examples/openpose/openpose.bin --video examples_video.avi --write_json output/ --display 0 --render_pose 0 --face --hand + ``` + + 执行成功之后会在output/路径下生成视频每一帧身体+面部+手部骨骼点数据的json文件。 + + #### step3 视频及相关信息处理 + + - 由于[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)为制作花样滑冰数据集所用,因此此处步骤可能存在不同程度误差,实际请用户自行调试代码。 + + 将要转化的花样滑冰视频储存到[Skeleton Scripts](https://github.com/HaxiSnake/skeleton_scripts)的指定路径(可自行创建): + ```bash + ./skating2.0/skating63/ + ``` + + 同时需要用户自行完成对视频信息的提取,保存为label_skating63.csv文件,储存到如下路径中(可自行创建): + + ```bash + ./skating2.0/skating63/ + ./skating2.0/skating63_openpose_result/ + ``` + + label_skating63.csv中格式如下: + + | 动作分类 | 视频文件名 | 视频帧数 | 动作标签 | + | :----: | :----: | :----: | :---- | + + 此处用户只需要输入视频文件名(无需后缀,默认后缀名为.mp4,其他格式需自行更改代码),其他三项定义为空字符串即可,不同表项之间通过 ',' 分割。 + + #### step4 执行skating_convert.py: + + - 注意,这一步需要根据用户对openpose的配置进行代码的更改,主要修改项为openpose路径、openpose-demo路径等,具体详见代码。 + + 本脚步原理是调用openpose提供的demo提取视频中的骨骼点,并进行数据格式清洗,最后将每个视频的提取结果结果打包成json文件,json文件储存在如下路径: + + ```bash + ./skating2.0/skating63_openpose_result/label_skating63_data/ + ``` + + #### step5 执行skating_gendata.py: + + 将json文件整理为npy文件并保存,多个视频文件将保存为一个npy文件,保存路径为: + + ```bash + ./skating2.0/skating63_openpose_result/skeleton_file/ + ``` + + - 通过上述步骤就可以将视频数据转化为无标签的骨骼点数据。 + + - 最后用户只需将npy数据输入送入网络开始模型测试,亦可通过预测引擎推理。 + + + ### 模型训练预测方法 + + 模型使用方法参考[ST-GCN模型文档](../../docs/zh-CN/model_zoo/recognition/stgcn.md) diff --git a/docs/src/applications/FootballAction/README.md b/docs/src/applications/FootballAction/README.md new file mode 100644 index 000000000..faf129574 --- /dev/null +++ b/docs/src/applications/FootballAction/README.md @@ -0,0 +1,513 @@ +# 足球动作检测模型 + + +## 内容 +- [1. 模型简介](#1-模型简介) +- [2. 环境准备](#2-环境准备) +- [3. 数据准备](#3-数据准备) + - [3.1 数据集简介](#31-数据集简介) + - [3.2 数据集下载](#32-数据集下载) + - [3.3 数据预处理](#33-数据预处理) +- [4. 快速体验](#4-快速体验) +- [5. 进阶使用](#5-进阶使用) + - [5.1 模型训练](#51-模型训练) + - [5.2 模型推理](#52-模型推理) + - [5.3 模型评估](#53-模型评估) + - [5.4 模型优化](#54-模型优化) + - [5.5 模型部署](#55-模型部署) +- [6. 参考论文](#6-参考论文) + + +## 1. 模型简介 + +FootballAction是基于PaddleVideo实现的足球动作检测算法,用于从足球比赛视频中定位出精彩动作片段发生的起止时间和对应的动作类别。可以定位的足球动作类型包括8种,分别为: +```txt +背景、进球、角球、任意球、黄牌、红牌、换人、界外球 +``` + +我们提出的方案结合PP-TSM、BMN和AttentionLSTM三个模型,图像和音频两种模态进行动作检测,算法整体流程共分为以下三步: + - 特征抽取 + - 图像特性:PP-TSM + - 音频特征:VGGish + - proposal提取:BMN + - 动作分类 + 回归:AttentionLSTM + + +AIStudio项目: [基于PP-TSM+BMN+AttentionLSTM实现足球精彩时刻剪辑](https://aistudio.baidu.com/aistudio/projectdetail/3473391?channelType=0&channel=0) + + +## 2. 环境准备 + +- PaddleVideo模型库依赖安装请参考 [安装说明](../../docs/zh-CN/install.md) + + +## 3. 
数据准备 + + +### 3.1 数据集简介 + +数据集来自欧洲杯2016,共49个足球视频,其中训练集44个,验证集5个。 + +- 数据集label格式 +``` +{ + "0": "背景", + "1": "进球", + "2": "角球", + "3": "任意球", + "4": "黄牌", + "5": "红牌", + "6": "换人", + "7": "界外球", +} +``` + +- 数据集标注文件: +```txt +datasets/EuroCup2016/label_cls8_train.json +datasets/EuroCup2016/label_cls8_val.json +``` + +- 数据集gts处理, 将原始标注数据处理成如下json格式 +``` +{ + 'fps': 5, + 'gts': [ + { + 'url': 'xxx.mp4', + 'total_frames': 6341, + 'actions': [ + { + "label_ids": [7], + "label_names": ["界外球"], + "start_id": 395, + "end_id": 399 + }, + ... + ] + }, + ... + ] +} +``` + + +### 3.2 数据集下载 + +数据集下载链接: [dataset_url.list](./datasets/EuroCup2016/dataset_url.list) + +可使用如下脚本下载: +``` +cd datasets/EuroCup2016 && sh download_dataset.sh +``` + + +### 3.3 数据预处理 + +- 数据集抽帧, 由mp4, 得到frames和pcm, 这里需要添加ffmpeg环境 +``` +cd datasets/script && python get_frames_pcm.py +``` + + +经过以上步骤,得到的代码结构如下所示: + +``` +|-- FootballAction + |-- checkpoints # 模型存放路径 + |-- datasets # 数据集和数据处理脚本 + |-- EuroCup2016 # 数据存放路径 + |-- feature_bmn # bmn提取到的proposal + |-- features # image和audio特征, image fps=5, audio 每秒(1024) + |-- input_for_bmn # bmn训练的输入数据,widows=40 + |-- input_for_lstm # lstm训练的输入数据 + |-- input_for_pptsm # pptsm训练的数据数据 + |-- mp4 # 原始视频.mp4 + |-- frames # 图像帧, fps=5, '.jpg'格式 + |-- pcm # 音频pcm, 音频采样率16000,采用通道数1 + |-- url.list # 视频列表 + |-- url_val.list # 视频列表 + |-- label_cls8_train.json # 训练集原始gts + |-- label_cls8_val.json # 验证集原始gts + |-- label.json # 动作label + |-- script # 数据集处理脚本 + |-- predict # 模型预测代码 + |-- extractor # 特征提取脚本 + |-- train_lstm # lstm训练代码 + |-- train_proposal # pptsm、bmn训练代码 + |-- configs # pptsm、bmn配置文件 +``` + + +## 4. 快速体验 + +首先,通过以下命令,下载训练好的模型文件: +```bash +cd checkpoints +sh download.sh +``` + +运行预测代码: +``` +cd ${FootballAction_root}/predict && python predict.py +``` +产出文件:results.json + + + +## 5. 进阶使用 + + +### 5.1 模型训练 + +采样方式: +- image 采样频率fps=5,如果有些动作时间较短,可以适当提高采样频率 +- BMN windows=200,即40s,所以测试自己的数据时,视频时长需大于40s + +请先参考[使用说明](../../docs/zh-CN/usage.md)了解PaddleVideo模型库的使用。 + +#### step1 PP-TSM训练 + +PP-TSM模型使用文档参考[PP-TSM](../../docs/zh-CN/model_zoo/recognition/pp-tsm.md) + +##### step1.1 PP-TSM 训练数据处理 + +使用如下命令结合frames和gts生成训练所需要的正负样本: +```bash +cd datasets/script && python get_instance_for_pptsm.py +``` + +完成该步骤后,数据存储位置 +``` + |-- datasets # 数据集和数据处理脚本 + |-- EuroCup2016 # 数据存放路径 + |-- input_for_pptsm # pptsm训练的数据 +``` + +文件按照如下格式命名: +``` +'{}_{}_{}_{}'.format(video_basename, start_id, end_id, label) +``` + +##### step1.2 PP-TSM模型训练 +训练启动命令如下: +```bash +cd ${FootballAction_root} +cd ../.. 
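+# 注:此处假设 FootballAction 位于 PaddleVideo/applications/ 目录下,向上两级即回到 PaddleVideo 根目录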
#进入PaddleVideo目录下 + +python -B -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + --log_dir=./football/logs_pptsm \ + main.py \ + --validate \ + -c applications/FootballAction/train_proposal/configs/pptsm_football_v2.0.yaml \ + -o output_dir=./football/pptsm +``` + +我们也提供了训练好的PP-TSM模型,下载链接已在快速体验章节中给出。 + +##### step1.3 导出PP-TSM推理模型 +在转为预测模式前,需要修改 `PaddleVideo/paddlevideo/modeling/framework/recognizers/recognizer2d.py` 文件,将 init 和 infer_step 函数分别更新为如下代码: + +```python + def __init__(self, backbone=None, head=None): + super().__init__(backbone=backbone, head=head) + self.avgpool2d = paddle.nn.AdaptiveAvgPool2D((1, 1), data_format='NCHW') + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:])) + feature = self.backbone(imgs) + feat = self.avgpool2d(feature) + return feat +``` +再执行如下命令: + +```bash +cd ${PaddleVideo_root} +python tools/export_model.py -c applications/FootballAction/train_proposal/configs/pptsm_football_v2.0.yaml \ + -p ./football/pptsm/ppTSM_best.pdparams \ + -o ./football/inference_model +``` + +##### step1.4 基于PP-TSM的视频特征提取 + +将 `PaddleVideo/applications/FootballAction/predict/action_detect/models/pptsm_infer.py` 文件中41行的 +```python +self.output_tensor = self.predictor.get_output_handle(output_names[1]) +``` +替换为 +```python +self.output_tensor = self.predictor.get_output_handle(output_names[0]) +``` + + +使用如下命令进行image和audio特征的提取,默认使用下载的模型进行特征提取,如果使用自己数据训练的模型,请注意修改配置文件中模型的文件路径: +```bash +cd ${FootballAcation} +cd extractor && python extract_feat.py +``` + +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- EuroCup2016 # 数据集 + |-- features # 视频的图像+音频特征 +``` + + +推理特征以pkl文件保存,格式如下: +```txt +# 特征维度, image(2048) + audio(1024) +video_features = {'image_feature': np_image_features, + 'audio_feature': np_audio_features} +``` +此特征接下来会用于BMN模型的训练。 + + +#### step2 BMN训练 + +BMN模型使用文档参考[BMN](../../docs/zh-CN/model_zoo/localization/bmn.md) + +##### step2.1 BMN训练数据处理 +使用如下命令得到BMN训练所需要的数据集,默认使用windows=40,根据gts和特征得到训练所需的proposal: +```bash +cd FootballAction/datasets/script && python get_instance_for_bmn.py +``` + +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- EuroCup2016 # 数据集 + |-- input_for_bmn # bmn训练的proposal + |-- feature + |-- label.json +``` + +特征文件保存在`label.json`文件中,数据格式如下: +```txt +{ + "719b0a4bcb1f461eabb152298406b861_753_793": { + "duration_second": 40.0, + "duration_frame": 200, + "feature_frame": 200, + "subset": "train", + "annotations": [ + { + "segment": [ + 15.0, + 22.0 + ], + "label": "3.0", + "label_name": "任意球" + } + ] + }, + ... 
+} +``` + +##### step2.2 BMN模型训练 +训练启动命令如下: +```bash +python -B -m paddle.distributed.launch \ + --gpus="0,1" \ + --log_dir=./football/logs_bmn \ + main.py \ + --validate \ + -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml \ + -o output_dir=./football/bmn +``` + +我们也提供了训练好的BMN模型,下载链接已在快速体验章节中给出。 + +##### step2.3 导出BMN推理模型 +模型导出命令如下: +```bash +python tools/export_model.py -c applications/FootballAction/train_proposal/configs/bmn_football_v2.0.yaml \ + -p ./football/bmn/BMN_epoch_00016.pdparams \ + -o ./football/inference_model +``` + +##### step2.4 BMN模型预测 +使用如下命令进行预测,得到动作proposal信息: start_id, end_id, score。如果使用自己数据训练的模型,请注意修改配置文件中模型的文件路径: +``` +cd extractor && python extract_bmn.py +``` + +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- EuroCup2016 # 数据集 + |-- feature_bmn + |-- prop.json # bmn 预测结果 +``` + +预测结果数据格式如下: +```txt +[ + { + "video_name": "c9516c903de3416c97dae91a59e968d7", + "num_proposal": 5534, + "bmn_results": [ + { + "start": 7850.0, + "end": 7873.0, + "score": 0.77194699622342 + }, + { + "start": 4400.0, + "end": 4443.0, + "score": 0.7663803287641536 + }, + ... + ] + }, + ... +] +``` + +#### step3 LSTM训练 + +AttentionLSTM模型使用文档参考[AttentionLSTM](../../docs/zh-CN/model_zoo/localization/bmn.md),此处我们对原始对AttentionLSTM模型进行了改进,包括: + +1. 不同模态特征在LSTM中使用不同的hiddne_size +2. 加入了一个回归分支用于回归iou +3. 模型中加入了BN层抑制过拟合 + + +##### step3.1 LSTM训练数据处理 +将BMN得到的proposal截断并处理成LSTM训练所需数据集。同理,注意数据集文件修改路径。 +``` +cd datasets/script && python get_instance_for_lstm.py +``` + +完成该步骤后,数据存储位置 +``` + |-- datasets # 训练数据集和处理脚本 + |-- EuroCup2016 # 数据集 + |-- input_for_lstm # lstm训练的proposal + ├── feature # 特征 + ├── label_info.json # 标签信息 + ├── train.txt # 训练文件列表 + └── val.txt # 测试文件列表 +``` + +- `label_info.json`数据格式如下: +``` +{ + "fps": 5, + "results": [ + { + "url": "https://xxx.mp4", + "mode": "train", # train or validation + "total_frames": 6128, + "num_gts": 93, + "num_proposals": 5043, + "proposal_actions": [ + { + "label": 6, + "norm_iou": 0.7575757575757576, + "norm_ioa": 0.7575757575757576, + "norm_start": -0.32, + "proposal": { + "start": 5011, + "end": 5036, + "score": 0.7723643666324231 + }, + "hit_gts": { + "label_ids": [ + 6 + ], + "label_names": [ + "换人" + ], + "start_id": 5003, + "end_id": 5036 + } + }, + ... + }, + ... 
+} +``` + +- LSTM训练所需要的feature数据格式如下: +``` +{ + 'features': np.array(feature_hit, dtype=np.float32), # iamge和audio 特征 + 'feature_fps': 5, # fps = 5 + 'label_info': {'norm_iou': 0.5, 'label': 3, ...}, # 数据格式1中的'proposal_actions' + 'video_name': 'c9516c903de3416c97dae91a59e968d7' # video_name +} +``` + +- LSTM训练所需文件列表数据格式如下: +``` +'{} {}'.format(filename, label) +``` + +##### step3.2 LSTM训练 + +训练启动命令如下: + +```bash +python -B -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + --log_dir=./football/logs_lstm \ + main.py \ + --validate \ + -c applications/FootballAction/train_proposal/configs/lstm_football.yaml \ + -o output_dir=./football/lstm +``` + +##### step3.3 导出LSTM推理模型 + +模型导出命令如下: +```bash +python tools/export_model.py -c applications/FootballAction/train_proposal/configs/lstm_football.yaml \ + -p ./football/lstm/AttentionLSTM_best.pdparams \ + -o ./football/inference_model +``` + + +### 5.2 模型推理 + +运行预测代码 +``` +cd predict && python predict.py +``` +- 默认使用我们提供的于训练文件进行预测,如使用个人训练的模型文件,请对应修改[配置文件](./predict/configs/configs.yaml)中的参数路径 +- 产出文件:results.json + + + +### 5.3 模型评估 + +``` +# 包括bmn proposal 评估和最终action评估 +cd predict && python eval.py results.json +``` + + +### 5.4 模型优化 + +- 基础特征模型(图像)替换为PP-TSM,准确率由84%提升到94% +- 基础特征模型(音频)没变动 +- 准确率提升,precision和recall均有大幅提升,F1-score从0.57提升到0.82 + + + +### 5.5 模型部署 + +本代码解决方案在动作的检测和召回指标F1-score=82% + + + +### 6. 参考论文 + +- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han +- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen. +- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen +- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan diff --git a/docs/src/applications/FootballAction/checkpoints/download.sh b/docs/src/applications/FootballAction/checkpoints/download.sh new file mode 100644 index 000000000..594866525 --- /dev/null +++ b/docs/src/applications/FootballAction/checkpoints/download.sh @@ -0,0 +1,18 @@ +# audio +wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/audio.tar +# pptsm +wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/pptsm.tar +# bmn +wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/bmn.tar +# lstm +wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/FootballAction/lstm.tar + +tar -xvf audio.tar +tar -xvf pptsm.tar +tar -xvf bmn.tar +tar -xvf lstm.tar + +rm -f audio.tar +rm -f pptsm.tar +rm -f bmn.tar +rm -f lstm.tar diff --git a/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list b/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list new file mode 100644 index 000000000..429914dc1 --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/EuroCup2016/dataset_url.list @@ -0,0 +1,49 @@ +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/63e51df254d2402fac703b6c4fdb4ea9.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/76b5f7ee28d942988c6b224bfac136bd.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/250b88724acf40dbb6d7e8ccb400ef38.mp4 
+https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/c9516c903de3416c97dae91a59e968d7.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/e1982c90cdd74abaacc4d0692070b400.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/1be705a8f67648da8ec4b4296fa80895.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/de23c0b2be3a4eb1990c5c657061fb29.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2754615de6e64c4fb95ce1a8095dc1c1.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/299fe30d8f3b4a45b89313fe31f9f3c0.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/22e89747689e4f7e83e3620620c93269.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2ceb6c549fc64305a06a75acb355642b.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/719b0a4bcb1f461eabb152298406b861.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/259856b769044b4d8dc94076deb356bf.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d0bd3eab1e794f0f9501c353a6d37827.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/19eb47cc736240d6b2dd930ab69da839.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4435b708af6d48519a6b726144147d51.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/ea16ad2a020643529e257bd6cb11b3c3.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/eeebffbd4ec74222a9c2d0775d79b689.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8cfb4e605af44055b1576c37eb0e3209.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6bca62b57cc449c6935f0b17f28d06be.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/70cfc31e520840b2afca458f93a01ce4.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6496960935e845578e391a5916739752.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d6d25403a4bb4784aecff5f21fd00dc5.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/3e23d452a082403391f8abfb87bf2fb4.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4c5d9d9af4f044c4a68d134061dc264f.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6994844c64b44c26b935cee9604bef0a.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d6322cb95f6a4402ac80432b561abd5d.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/2c8b5587083a4784a51622e4fec87ccd.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5faa60d70ed141de8560110e840f2048.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/45d08bc5cb0f424f9ed9d7874eb561cd.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/6630aaf0e32146088d0b624e9288f071.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f2edbee29c1b4966b3a410260f78fbe3.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f24116fdd6a54214991db32f7dddef67.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/0265731a0c6f4a9398c88db8e3d4a3bc.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/02d2de09997f4215b06e3b00ff0502a0.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/9c231896c56a43f291a5e190949f4333.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/4afbbf9afcd44dfea45b044117cccb48.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/745db97a080d4f44b450dc17a2bcf069.mp4 
+https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5933d0ce17854483b81a318d7d45a34e.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/d2cfef2da9f84237a6950c7f6659655c.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/5572686cb90f440988ded956a60e555d.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8962ac5a332346e180c79d701ae0a175.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f6e64ee9b13a4088b24c45c257894c1e.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/f6ed2b612b3d43baa0726be8b14ebe7c.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/8ab7b0cba5744eb3b6fb10003dfda383.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/1f0a0698e38d493988fe42a50f7e8723.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/737fdb054ca141f2a45013c1740dd0a0.mp4 +https://paddle-model-ecology.bj.bcebos.com/data/EuroCup2016/bab63a9bcf204e4b99c4a887a01bfd60.mp4 diff --git a/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh b/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh new file mode 100644 index 000000000..180a4be80 --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/EuroCup2016/download_dataset.sh @@ -0,0 +1,51 @@ +mkdir mp4 +cd mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/63e51df254d2402fac703b6c4fdb4ea9.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/76b5f7ee28d942988c6b224bfac136bd.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/250b88724acf40dbb6d7e8ccb400ef38.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/c9516c903de3416c97dae91a59e968d7.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/e1982c90cdd74abaacc4d0692070b400.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/1be705a8f67648da8ec4b4296fa80895.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/de23c0b2be3a4eb1990c5c657061fb29.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2754615de6e64c4fb95ce1a8095dc1c1.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/299fe30d8f3b4a45b89313fe31f9f3c0.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/22e89747689e4f7e83e3620620c93269.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2ceb6c549fc64305a06a75acb355642b.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/719b0a4bcb1f461eabb152298406b861.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/259856b769044b4d8dc94076deb356bf.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d0bd3eab1e794f0f9501c353a6d37827.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/19eb47cc736240d6b2dd930ab69da839.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4435b708af6d48519a6b726144147d51.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/ea16ad2a020643529e257bd6cb11b3c3.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/eeebffbd4ec74222a9c2d0775d79b689.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8cfb4e605af44055b1576c37eb0e3209.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6bca62b57cc449c6935f0b17f28d06be.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/70cfc31e520840b2afca458f93a01ce4.mp4 +wget 
https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6496960935e845578e391a5916739752.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d6d25403a4bb4784aecff5f21fd00dc5.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/3e23d452a082403391f8abfb87bf2fb4.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4c5d9d9af4f044c4a68d134061dc264f.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6994844c64b44c26b935cee9604bef0a.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d6322cb95f6a4402ac80432b561abd5d.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/2c8b5587083a4784a51622e4fec87ccd.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5faa60d70ed141de8560110e840f2048.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/45d08bc5cb0f424f9ed9d7874eb561cd.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/6630aaf0e32146088d0b624e9288f071.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f2edbee29c1b4966b3a410260f78fbe3.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f24116fdd6a54214991db32f7dddef67.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/0265731a0c6f4a9398c88db8e3d4a3bc.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/02d2de09997f4215b06e3b00ff0502a0.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/9c231896c56a43f291a5e190949f4333.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/4afbbf9afcd44dfea45b044117cccb48.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/745db97a080d4f44b450dc17a2bcf069.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5933d0ce17854483b81a318d7d45a34e.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/d2cfef2da9f84237a6950c7f6659655c.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/5572686cb90f440988ded956a60e555d.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8962ac5a332346e180c79d701ae0a175.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f6e64ee9b13a4088b24c45c257894c1e.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/f6ed2b612b3d43baa0726be8b14ebe7c.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/8ab7b0cba5744eb3b6fb10003dfda383.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/1f0a0698e38d493988fe42a50f7e8723.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/737fdb054ca141f2a45013c1740dd0a0.mp4 +wget https://bj.bcebos.com/v1/tmt-pub/datasets/EuroCup2016/bab63a9bcf204e4b99c4a887a01bfd60.mp4 \ No newline at end of file diff --git a/docs/src/applications/FootballAction/datasets/EuroCup2016/url.list b/docs/src/applications/FootballAction/datasets/EuroCup2016/url.list new file mode 100644 index 000000000..ddff158d6 --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/EuroCup2016/url.list @@ -0,0 +1,49 @@ +mp4/63e51df254d2402fac703b6c4fdb4ea9.mp4 +mp4/76b5f7ee28d942988c6b224bfac136bd.mp4 +mp4/250b88724acf40dbb6d7e8ccb400ef38.mp4 +mp4/c9516c903de3416c97dae91a59e968d7.mp4 +mp4/e1982c90cdd74abaacc4d0692070b400.mp4 +mp4/1be705a8f67648da8ec4b4296fa80895.mp4 +mp4/de23c0b2be3a4eb1990c5c657061fb29.mp4 +mp4/2754615de6e64c4fb95ce1a8095dc1c1.mp4 +mp4/299fe30d8f3b4a45b89313fe31f9f3c0.mp4 +mp4/6cc7db52c5ef4e70b401a5e00d8dd67a.mp4 +mp4/22e89747689e4f7e83e3620620c93269.mp4 +mp4/2ceb6c549fc64305a06a75acb355642b.mp4 +mp4/719b0a4bcb1f461eabb152298406b861.mp4 +mp4/259856b769044b4d8dc94076deb356bf.mp4 
+mp4/d0bd3eab1e794f0f9501c353a6d37827.mp4 +mp4/19eb47cc736240d6b2dd930ab69da839.mp4 +mp4/4435b708af6d48519a6b726144147d51.mp4 +mp4/ea16ad2a020643529e257bd6cb11b3c3.mp4 +mp4/eeebffbd4ec74222a9c2d0775d79b689.mp4 +mp4/8cfb4e605af44055b1576c37eb0e3209.mp4 +mp4/6bca62b57cc449c6935f0b17f28d06be.mp4 +mp4/70cfc31e520840b2afca458f93a01ce4.mp4 +mp4/6496960935e845578e391a5916739752.mp4 +mp4/d6d25403a4bb4784aecff5f21fd00dc5.mp4 +mp4/3e23d452a082403391f8abfb87bf2fb4.mp4 +mp4/4c5d9d9af4f044c4a68d134061dc264f.mp4 +mp4/6994844c64b44c26b935cee9604bef0a.mp4 +mp4/d6322cb95f6a4402ac80432b561abd5d.mp4 +mp4/2c8b5587083a4784a51622e4fec87ccd.mp4 +mp4/5faa60d70ed141de8560110e840f2048.mp4 +mp4/45d08bc5cb0f424f9ed9d7874eb561cd.mp4 +mp4/6630aaf0e32146088d0b624e9288f071.mp4 +mp4/f2edbee29c1b4966b3a410260f78fbe3.mp4 +mp4/f24116fdd6a54214991db32f7dddef67.mp4 +mp4/0265731a0c6f4a9398c88db8e3d4a3bc.mp4 +mp4/02d2de09997f4215b06e3b00ff0502a0.mp4 +mp4/9c231896c56a43f291a5e190949f4333.mp4 +mp4/4afbbf9afcd44dfea45b044117cccb48.mp4 +mp4/745db97a080d4f44b450dc17a2bcf069.mp4 +mp4/5933d0ce17854483b81a318d7d45a34e.mp4 +mp4/d2cfef2da9f84237a6950c7f6659655c.mp4 +mp4/5572686cb90f440988ded956a60e555d.mp4 +mp4/8962ac5a332346e180c79d701ae0a175.mp4 +mp4/f6e64ee9b13a4088b24c45c257894c1e.mp4 +mp4/f6ed2b612b3d43baa0726be8b14ebe7c.mp4 +mp4/8ab7b0cba5744eb3b6fb10003dfda383.mp4 +mp4/1f0a0698e38d493988fe42a50f7e8723.mp4 +mp4/737fdb054ca141f2a45013c1740dd0a0.mp4 +mp4/bab63a9bcf204e4b99c4a887a01bfd60.mp4 diff --git a/docs/src/applications/FootballAction/datasets/EuroCup2016/url_val.list b/docs/src/applications/FootballAction/datasets/EuroCup2016/url_val.list new file mode 100644 index 000000000..c401f3174 --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/EuroCup2016/url_val.list @@ -0,0 +1,5 @@ +mp4/5572686cb90f440988ded956a60e555d.mp4 +mp4/f6e64ee9b13a4088b24c45c257894c1e.mp4 +mp4/259856b769044b4d8dc94076deb356bf.mp4 +mp4/1f0a0698e38d493988fe42a50f7e8723.mp4 +mp4/8cfb4e605af44055b1576c37eb0e3209.mp4 diff --git a/docs/src/applications/FootballAction/datasets/script/get_frames_pcm.py b/docs/src/applications/FootballAction/datasets/script/get_frames_pcm.py new file mode 100644 index 000000000..286f47aa6 --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/script/get_frames_pcm.py @@ -0,0 +1,54 @@ +""" +get frames and pcm from video +""" +import os +from concurrent import futures + +dataset = "../EuroCup2016" +url_list = os.path.join(dataset, 'url.list') +dst_frames = os.path.join(dataset, 'frames') +dst_pcm = os.path.join(dataset, 'pcm') +if not os.path.exists(dst_frames): + os.mkdir(dst_frames) +if not os.path.exists(dst_pcm): + os.mkdir(dst_pcm) + + +def extract_frames(video_name, out_folder, fps=5): + if os.path.exists(out_folder): + os.system('rm -rf ' + out_folder + '/*') + os.system('rm -rf ' + out_folder) + os.makedirs(out_folder) + cmd = 'ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (video_name, fps, + out_folder, '%08d') + os.system(cmd) + + +def extract_pcm(video_name, file_name_pcm): + cmd = 'ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' % ( + video_name, file_name_pcm) + os.system(cmd) + + +def process(line): + print(line) + mp4_name = os.path.join(dataset, line) + basename = os.path.basename(line).split('.')[0] + folder_frame = os.path.join(dst_frames, basename) + filename_pcm = os.path.join(dst_pcm, basename + '.pcm') + # extract + extract_frames(mp4_name, folder_frame) + extract_pcm(mp4_name, filename_pcm) + + +if __name__ == "__main__": + with open(url_list, 'r') as f: + lines = 
f.readlines() + lines = [k.strip() for k in lines] + + # multi thread + with futures.ProcessPoolExecutor(max_workers=10) as executer: + fs = [executer.submit(process, line) for line in lines] + #for line in lines: + # process(line) + print("done") diff --git a/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py b/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py new file mode 100644 index 000000000..5d348492f --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/script/get_instance_for_bmn.py @@ -0,0 +1,216 @@ +""" +get instance for bmn +使用winds=40的滑窗,将所有子窗口的长度之和小于winds的进行合并 +合并后,父窗口代表bmn训练数据,子窗口代表tsn训练数据 +""" +import os +import sys +import json +import random +import pickle +import numpy as np + +bmn_window = 40 +dataset = "../EuroCup2016" +feat_dir = dataset + '/features' +out_dir = dataset + '/input_for_bmn' +label_files = { + 'train': 'label_cls8_train.json', + 'validation': 'label_cls8_val.json' +} + +global fps + + +def gen_gts_for_bmn(gts_data): + """ + @param, gts_data, original gts for action detection + @return, gts_bmn, output gts dict for bmn + """ + fps = gts_data['fps'] + gts_bmn = {'fps': fps, 'gts': []} + for sub_item in gts_data['gts']: + url = sub_item['url'] + + max_length = sub_item['total_frames'] + # 特征提取没有获取所有帧特征,这里load feature获取准确max_length + #feat_path = feat_dir + '/' + os.path.basename(url).replace('.mp4', '.pkl') + #feature_video = pickle.load(open(feat_path, 'rb'))['features'] + #max_length = int(len(feature_video) * 1.0 / fps) + + gts_bmn['gts'].append({ + 'url': url, + 'total_frames': max_length, + 'root_actions': [] + }) + sub_actions = sub_item['actions'] + # duration > bmn_window, 直接删除 + for idx, sub_action in enumerate(sub_actions): + if sub_action['end_id'] - sub_action['start_id'] > bmn_window: + sub_actions.pop(idx) + + root_actions = [sub_actions[0]] + # before_id, 前一动作的最后一帧 + # after_id, 后一动作的第一帧 + before_id = 0 + for idx in range(1, len(sub_actions)): + cur_action = sub_actions[idx] + duration = (cur_action['end_id'] - root_actions[0]['start_id']) + if duration > bmn_window: + after_id = cur_action['start_id'] + gts_bmn['gts'][-1]['root_actions'].append({ + 'before_id': + before_id, + 'after_id': + after_id, + 'actions': + root_actions + }) + before_id = root_actions[-1]['end_id'] + root_actions = [cur_action] + else: + root_actions.append(cur_action) + if idx == len(sub_actions) - 1: + after_id = max_length + gts_bmn['gts'][-1]['root_actions'].append({ + 'before_id': + before_id, + 'after_id': + after_id, + 'actions': + root_actions + }) + return gts_bmn + + +def combile_gts(gts_bmn, gts_process, mode): + """ + 1、bmn_window 范围内只有一个动作,只取一个目标框 + 2、bmn_window 范围内有多个动作,取三个目标框(第一个动作、最后一个动作、所有动作) + """ + global fps + fps = gts_process['fps'] + duration_second = bmn_window * 1.0 + duration_frame = bmn_window * fps + feature_frame = duration_frame + for item in gts_process['gts']: + url = item['url'] + basename = os.path.basename(url).split('.')[0] + root_actions = item['root_actions'] + for root_action in root_actions: + segments = [] + # all actions + segments.append({ + 'actions': root_action['actions'], + 'before_id': root_action['before_id'], + 'after_id': root_action['after_id'] + }) + if len(root_action['actions']) > 1: + # first action + segments.append({ + 'actions': [root_action['actions'][0]], + 'before_id': + root_action['before_id'], + 'after_id': + root_action['actions'][1]['start_id'] + }) + # last action + segments.append({ + 'actions': [root_action['actions'][-1]], + 
'before_id': + root_action['actions'][-2]['end_id'], + 'after_id': + root_action['after_id'] + }) + for segment in segments: + before_id = segment['before_id'] + after_id = segment['after_id'] + actions = segment['actions'] + box0 = int(max(actions[-1]['end_id'] - bmn_window, before_id)) + box1 = int(min(actions[0]['start_id'], after_id - bmn_window)) + if box0 <= box1: + cur_start = random.randint(box0, box1) + cur_end = cur_start + bmn_window + name = '{}_{}_{}'.format(basename, cur_start, cur_end) + annotations = [] + for action in actions: + label = str(1.0 * action['label_ids'][0]) + label_name = action['label_names'][0] + seg0 = 1.0 * (action['start_id'] - cur_start) + seg1 = 1.0 * (action['end_id'] - cur_start) + annotations.append({ + 'segment': [seg0, seg1], + 'label': label, + 'label_name': label_name + }) + gts_bmn[name] = { + 'duration_second': duration_second, + 'duration_frame': duration_frame, + 'feature_frame': feature_frame, + 'subset': mode, + 'annotations': annotations + } + + return gts_bmn + + +def save_feature_to_numpy(gts_bmn, folder): + global fps + print('save feature for bmn ...') + if not os.path.exists(folder): + os.mkdir(folder) + process_gts_bmn = {} + for item, value in gts_bmn.items(): + basename, start_id, end_id = item.split('_') + if not basename in process_gts_bmn: + process_gts_bmn[basename] = [] + process_gts_bmn[basename].append({ + 'name': item, + 'start': int(start_id), + 'end': int(end_id) + }) + + for item, values in process_gts_bmn.items(): + feat_path = os.path.join(feat_dir, item + '.pkl') + print(feat_path) + feature = pickle.load(open(feat_path, 'rb')) + image_feature = feature['image_feature'] + pcm_feature = feature['pcm_feature'] + + pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640)) + min_length = min(image_feature.shape[0], pcm_feature.shape[0]) + if min_length == 0: + continue + image_feature = image_feature[:min_length, :] + pcm_feature = pcm_feature[:min_length, :] + feature_video = np.concatenate((image_feature, pcm_feature), axis=1) + for value in values: + save_cut_name = os.path.join(folder, value['name']) + start_frame = (value['start']) * fps + end_frame = (value['end']) * fps + if end_frame > len(feature_video): + del gts_bmn[value['name']] + continue + feature_cut = [ + feature_video[i] for i in range(start_frame, end_frame) + ] + np_feature_cut = np.array(feature_cut, dtype=np.float32) + np.save(save_cut_name, np_feature_cut) + return gts_bmn + + +if __name__ == "__main__": + if not os.path.exists(out_dir): + os.mkdir(out_dir) + gts_bmn = {} + for item, value in label_files.items(): + label_file = os.path.join(dataset, value) + gts_data = json.load(open(label_file, 'rb')) + gts_process = gen_gts_for_bmn(gts_data) + gts_bmn = combile_gts(gts_bmn, gts_process, item) + + gts_bmn = save_feature_to_numpy(gts_bmn, out_dir + '/feature') + + with open(out_dir + '/label.json', 'w', encoding='utf-8') as f: + data = json.dumps(gts_bmn, indent=4, ensure_ascii=False) + f.write(data) diff --git a/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py b/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py new file mode 100644 index 000000000..10bf4b60c --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/script/get_instance_for_lstm.py @@ -0,0 +1,172 @@ +""" +get instance for lstm +根据gts计算每个proposal_bmn的iou、ioa、label等信息 +""" +import os +import sys +import json +import random +import pickle +import numpy as np + +dataset = "../EuroCup2016" +feat_dir = dataset + 
'/features' +prop_file = dataset + '/feature_bmn/prop.json' +out_dir = dataset + '/input_for_lstm' +label_files = { + 'train': 'label_cls8_train.json', + 'validation': 'label_cls8_val.json' +} + + +def IoU(e1, e2): + """ + clc iou and ioa + """ + area1 = e1["end"] - e1["start"] + area2 = e2["end"] - e2["start"] + x1 = np.maximum(e1["start"], e2["start"]) + x2 = np.minimum(e1["end"], e2["end"]) + inter = np.maximum(0.0, x2 - x1) + iou = 0.0 if (area1 + area2 - + inter) == 0 else inter * 1.0 / (area1 + area2 - inter) + ioa = 0.0 if area2 == 0 else inter * 1.0 / area2 + return iou, ioa + + +def clc_iou_of_proposal(proposal, gts): + hit_gts = {} + label = 0 + norm_start = 0. + hit = False + for gt in gts: + e1 = {'start': proposal['start'], 'end': proposal['end']} + e2 = {'start': gt['start_id'], 'end': gt['end_id']} + iou, ioa = IoU(e1, e2) + if iou > 0: + hit = True + hit_gts = gt + label = hit_gts['label_ids'][0] + norm_start = (gt['start_id'] - proposal['start']) * 1.0 / ( + proposal['end'] - proposal['start']) + break + res = { + 'label': label, + 'norm_iou': iou, + 'norm_ioa': ioa, + 'norm_start': norm_start, + 'proposal': proposal, + 'hit_gts': hit_gts + } + return res + + +def get_bmn_info(gts_data, proposal_data, res_bmn, mode, score_threshold=0.01): + """ + @param, gts_data, original gts for action detection + @param, proposal_data, proposal actions from bmn + @param, mode, train or validation + @return, None. + """ + fps = gts_data['fps'] + res_bmn['fps'] = fps + for gts_item in gts_data['gts']: + url = gts_item['url'] + print(url) + max_length = gts_item['total_frames'] + + video_name = os.path.basename(url).split('.')[0] + if not video_name in proposal_data: + continue + + gts_actions = gts_item['actions'] + prop_actions = proposal_data[video_name] + + res_bmn['results'].append({ + 'url': url, + 'mode': mode, + 'total_frames': max_length, + 'num_gts': len(gts_actions), + 'num_proposals': len(prop_actions), + 'proposal_actions': [] + }) + for proposal in prop_actions: + if proposal['score'] < score_threshold: + continue + proposal['start'] = int(proposal['start'] * 1.0 / fps) + proposal['end'] = int(proposal['end'] * 1.0 / fps) + gts_info = clc_iou_of_proposal(proposal, gts_actions) + res_bmn['results'][-1]['proposal_actions'].append(gts_info) + + return res_bmn + + +def save_feature(label_info, out_dir): + print('save feature ...') + fps = label_info['fps'] + out_feature_dir = out_dir + '/feature' + out_feature_dir = os.path.abspath(out_feature_dir) + if not os.path.exists(out_feature_dir): + os.mkdir(out_feature_dir) + fid_train = open(out_dir + '/train.txt', 'w') + fid_val = open(out_dir + '/val.txt', 'w') + for res in label_info['results']: + basename = os.path.basename(res['url']).split('.')[0] + print(basename, res['num_proposals']) + mode = res['mode'] + fid = fid_train if mode == 'train' else fid_val + feature_path = os.path.join(feat_dir, basename + '.pkl') + feature_data = pickle.load(open(feature_path, 'rb')) + image_feature = feature_data['image_feature'] + audio_feature = feature_data['audio_feature'] + max_len_audio = len(audio_feature) + for proposal in res['proposal_actions']: + label = proposal['label'] + start_id = proposal['proposal']['start'] + end_id = proposal['proposal']['end'] + # get hit feature + image_feature_hit = image_feature[start_id * fps:end_id * fps] + audio_feature_hit = audio_feature[min(start_id, max_len_audio + ):min(end_id, max_len_audio)] + + # save + anno_info = { + 'image_feature': np.array(image_feature_hit, dtype=np.float32), + 
'audio_feature': np.array(audio_feature_hit, dtype=np.float32), + 'feature_fps': fps, + 'label_info': proposal, + 'video_name': basename + } + save_name = '{}/{}_{}_{}.pkl'.format(out_feature_dir, basename, + start_id, end_id) + with open(save_name, 'wb') as f: + pickle.dump(anno_info, f, protocol=pickle.HIGHEST_PROTOCOL) + fid.write('{} {}\n'.format(save_name, label)) + + fid_train.close() + fid_val.close() + print('done!') + + +if __name__ == "__main__": + if not os.path.exists(out_dir): + os.mkdir(out_dir) + prop_data = json.load(open(prop_file, 'rb')) + proposal_data = {} + for item in prop_data: + proposal_data[os.path.basename( + item['video_name'])] = item['bmn_results'] + + # get label info + res_bmn = {'fps': 0, 'results': []} + for item, value in label_files.items(): + label_file = os.path.join(dataset, value) + gts_data = json.load(open(label_file, 'rb')) + res_bmn = get_bmn_info(gts_data, proposal_data, res_bmn, item) + + with open(out_dir + '/label_info.json', 'w', encoding='utf-8') as f: + data = json.dumps(res_bmn, indent=4, ensure_ascii=False) + f.write(data) + + # save feature + save_feature(res_bmn, out_dir) diff --git a/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py b/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py new file mode 100644 index 000000000..30e3f637e --- /dev/null +++ b/docs/src/applications/FootballAction/datasets/script/get_instance_for_pptsm.py @@ -0,0 +1,97 @@ +""" +get instance for tsn +positive: 标注后的动作区间,一个区间所有frames生成一个pkl +negative: 标注后的非动作区间,随机取N个区间生成N个pkl,每个区间长度等于最近的前一个动作区间的长度 +""" +import os +import json +import numpy as np +import random +import pickle +from concurrent import futures + +dataset = "../EuroCup2016" +frames_dir = dataset + '/frames' +label_files = {'train': 'label_cls8_train.json', 'val': 'label_cls8_val.json'} + + +def process(item, fps, save_folder): + actions_pos = [] + actions_neg = [] + url = item['url'] + print(url) + basename = os.path.basename(url).split('.')[0] + actions = item['actions'] + # pos + for action in actions: + actions_pos.append({ + 'label': action['label_ids'], + 'start': action['start_id'] * fps, + 'end': action['end_id'] * fps + }) + # neg + for idx, pos in enumerate(actions_pos): + if idx == len(actions_pos) - 1: + break + len_pos = pos['end'] - pos['start'] + duration_start = [pos['end'], actions_pos[idx + 1]['start'] - len_pos] + if duration_start[1] - duration_start[0] < 3: + continue + for k in range(1, 3): + start_frame = random.randint(duration_start[0], duration_start[1]) + end_frame = start_frame + len_pos + actions_neg.append({ + 'label': [0], + 'start': start_frame, + 'end': end_frame + }) + # save pkl + for item in np.concatenate((actions_pos, actions_neg), axis=0): + start = item['start'] + end = item['end'] + label = item['label'] + label_str = str(label[0]) + if len(item['label']) == 2: + label_str = label_str + '-' + str(label[1]) + frames = [] + for ii in range(start, end + 1): + img = os.path.join(frames_dir, basename, '%08d.jpg' % ii) + with open(img, 'rb') as f: + data = f.read() + frames.append(data) + # print(label_str) + outname = '%s/%s_%08d_%08d_%s.pkl' % (save_folder, basename, start, end, + label_str) + with open(outname, 'wb') as f: + pickle.dump((basename, label, frames), f, -1) + + +def gen_instance_pkl(label_data, save_folder): + fps = label_data['fps'] + gts = label_data['gts'] + with futures.ProcessPoolExecutor(max_workers=10) as executer: + fs = [executer.submit(process, gt, fps, save_folder) for gt in gts] + + 
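+    # Note: the commented-out loop below is a serial alternative to the process pool above;
+    # uncomment it to process one video at a time when debugging.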
#for gt in gts: + # process(gt, fps, save_folder) + + +if __name__ == "__main__": + for item, value in label_files.items(): + save_folder = os.path.join(dataset, 'input_for_pptsm', item) + if not os.path.exists(save_folder): + os.makedirs(save_folder) + + label_file = os.path.join(dataset, value) + label_data = json.load(open(label_file, 'rb')) + + gen_instance_pkl(label_data, save_folder) + + # gen train val list + #data_dir = '../EuroCup2016/input_for_pptsm/' + data_dir = os.path.abspath(os.path.join(dataset, 'input_for_pptsm')) + + os.system('find ' + data_dir + '/train -name "*.pkl" > ' + data_dir + + '/train.list') + os.system('find ' + data_dir + '/val -name "*.pkl" > ' + data_dir + + '/val.list') diff --git a/docs/src/applications/FootballAction/extractor/extract_bmn.py b/docs/src/applications/FootballAction/extractor/extract_bmn.py new file mode 100644 index 000000000..191a34cdc --- /dev/null +++ b/docs/src/applications/FootballAction/extractor/extract_bmn.py @@ -0,0 +1,91 @@ +#!./python27-gcc482/bin/python +# coding: utf-8 +""" +BAIDU CLOUD action +""" + +import os +import sys +import pickle +import json +import time +import shutil + +import numpy as np + +sys.path.append("../predict/action_detect") +import models.bmn_infer as prop_model +from utils.preprocess import get_images +from utils.config_utils import parse_config, print_configs +import utils.config_utils as config_utils + +import logger + +logger = logger.Logger() + + +def load_model(cfg_file="configs/configs.yaml"): + """ + load_model + """ + logger.info("load model ... ") + global infer_configs + infer_configs = parse_config(cfg_file) + print_configs(infer_configs, "Infer") + + t0 = time.time() + global prop_model + prop_model = prop_model.InferModel(infer_configs) + t1 = time.time() + logger.info("step0: load model time: {} min\n".format((t1 - t0) * 1.0 / 60)) + + +def video_classify(video_name): + """ + extract_feature + """ + logger.info('predict ... 
') + logger.info(video_name) + imgs_path = video_name.replace(".mp4", "").replace("mp4", "frames") + pcm_path = video_name.replace(".mp4", ".pcm").replace("mp4", "pcm") + + # step 1: extract feature + + feature_path = video_name.replace(".mp4", ".pkl").replace("mp4", "features") + video_features = pickle.load(open(feature_path, 'rb')) + + # step2: get proposal + t0 = time.time() + bmn_results = prop_model.predict(infer_configs, material=video_features) + t1 = time.time() + logger.info(np.array(bmn_results).shape) + logger.info("step2: proposal time: {} min".format((t1 - t0) * 1.0 / 60)) + + return bmn_results + + +if __name__ == '__main__': + dataset_dir = "../datasets/EuroCup2016" + if not os.path.exists(dataset_dir + '/feature_bmn'): + os.mkdir(dataset_dir + '/feature_bmn') + results = [] + + load_model() + + video_url = os.path.join(dataset_dir, 'url.list') + with open(video_url, 'r') as f: + lines = f.readlines() + lines = [os.path.join(dataset_dir, k.strip()) for k in lines] + + for line in lines: + bmn_results = video_classify(line) + results.append({ + 'video_name': os.path.basename(line).split('.')[0], + 'num_proposal': len(bmn_results), + 'bmn_results': bmn_results + }) + + with open(dataset_dir + '/feature_bmn/prop.json', 'w', + encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) diff --git a/docs/src/applications/FootballAction/extractor/extract_feat.py b/docs/src/applications/FootballAction/extractor/extract_feat.py new file mode 100644 index 000000000..03e2e91e4 --- /dev/null +++ b/docs/src/applications/FootballAction/extractor/extract_feat.py @@ -0,0 +1,100 @@ +#!./python27-gcc482/bin/python +# coding: utf-8 +""" +BAIDU CLOUD action +""" + +import os +import sys +import pickle +import json +import time +import shutil + +import numpy as np + +sys.path.append("../predict/action_detect") +import models.pptsm_infer as image_model +import models.audio_infer as audio_model + +from utils.preprocess import get_images +from utils.config_utils import parse_config, print_configs +import utils.config_utils as config_utils + +import logger + +logger = logger.Logger() + + +def load_model(cfg_file="configs/configs.yaml"): + """ + load_model + """ + logger.info("load model ... ") + global infer_configs + infer_configs = parse_config(cfg_file) + print_configs(infer_configs, "Infer") + + t0 = time.time() + global image_model, audio_model + image_model = image_model.InferModel(infer_configs) + audio_model = audio_model.InferModel(infer_configs) + t1 = time.time() + logger.info("step0: load model time: {} min\n".format((t1 - t0) * 1.0 / 60)) + + +def video_classify(video_name): + """ + extract_feature + """ + logger.info('predict ... 
') + logger.info(video_name) + imgs_path = video_name.replace(".mp4", "").replace("mp4", "frames") + pcm_path = video_name.replace(".mp4", ".pcm").replace("mp4", "pcm") + + # step 1: extract feature + t0 = time.time() + image_path_list = get_images(imgs_path) + infer_configs['PPTSM']['frame_list'] = image_path_list + infer_configs['AUDIO']['pcm_file'] = pcm_path + image_features = image_model.predict(infer_configs) + audio_features, pcm_features = audio_model.predict(infer_configs) + + np_image_features = np.array(image_features, dtype=np.float32) + np_audio_features = np.array(audio_features, dtype=np.float32) + np_pcm_features = np.array(pcm_features, dtype=np.float32) + t1 = time.time() + + logger.info('{} {} {}'.format(np_image_features.shape, + np_audio_features.shape, + np_pcm_features.shape)) + logger.info("step1: feature extract time: {} min".format( + (t1 - t0) * 1.0 / 60)) + video_features = { + 'image_feature': np_image_features, + 'audio_feature': np_audio_features, + 'pcm_feature': np_pcm_features + } + + # save feature + feature_path = video_name.replace(".mp4", ".pkl").replace("mp4", "features") + feat_pkl_str = pickle.dumps(video_features, + protocol=pickle.HIGHEST_PROTOCOL) + with open(feature_path, 'wb') as fout: + fout.write(feat_pkl_str) + + +if __name__ == '__main__': + dataset_dir = "../datasets/EuroCup2016" + if not os.path.exists(dataset_dir + '/features'): + os.mkdir(dataset_dir + '/features') + + load_model() + + video_url = os.path.join(dataset_dir, 'url.list') + with open(video_url, 'r') as f: + lines = f.readlines() + lines = [os.path.join(dataset_dir, k.strip()) for k in lines] + + for line in lines: + video_classify(line) diff --git a/docs/src/applications/FootballAction/predict/action_detect/action.py b/docs/src/applications/FootballAction/predict/action_detect/action.py new file mode 100644 index 000000000..6f4775f38 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/action.py @@ -0,0 +1,174 @@ +#!./python27-gcc482/bin/python +# coding: utf-8 +""" +BAIDU CLOUD action +""" + +import os +import sys +import pickle +import json +import time +import functools + +import numpy as np + +from utils.preprocess import get_images +from utils.config_utils import parse_config, print_configs +import mfcc.feature_extractor as mfcc_extractor + +import models.pptsm_infer as image_model +import models.audio_infer as audio_model +import models.bmn_infer as prop_model +import models.lstm_infer as classify_model + +import logger +logger = logger.Logger() + +def record_time_info(func): + """decorator func to log cost time for func + """ + @functools.wraps(func) + def timer(*args): + """log cost time for func + """ + logger.info("function [{}] processing ...".format(func.__name__)) + start_time = time.time() + retval = func(*args) + cost_time = round(time.time() - start_time, 5) + logger.info("function [{}] run time: {:.2f} min".format(func.__name__, cost_time / 60)) + return retval + return timer + + +class ActionDetection(object): + """ModelPredict""" + def __init__(self, cfg_file="configs/configs.yaml"): + cfg = parse_config(cfg_file) + self.configs = cfg + print_configs(self.configs, "Infer") + + name = 'COMMON' + self.DEBUG = cfg[name]['DEBUG'] + self.BMN_ONLY = cfg[name]['BMN_ONLY'] + self.LSTM_ONLY = cfg[name]['LSTM_ONLY'] + self.PCM_ONLY = cfg[name]['PCM_ONLY'] + if self.LSTM_ONLY: + self.prop_dict = {} + for dataset in ['EuroCup2016']: + prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset) + json_data = 
json.load(open(prop_json, 'r')) + for item in json_data: + basename = prop_json.replace('feature_bmn/prop.json', 'mp4') + basename = basename + '/' + item['video_name'] + '.mp4' + self.prop_dict[basename] = item['bmn_results'] + + + @record_time_info + def load_model(self): + """ + load_model + """ + if not self.DEBUG: + self.image_model = image_model.InferModel(self.configs) + if not self.PCM_ONLY: + self.audio_model = audio_model.InferModel(self.configs) + + if not self.LSTM_ONLY: + self.prop_model = prop_model.InferModel(self.configs) + + if not self.BMN_ONLY: + self.classify_model = classify_model.InferModel(self.configs) + + logger.info("==> Action Detection prepared.") + + @record_time_info + def infer(self, imgs_path, pcm_path, fps=5): + """ + extract_feature + """ + self.imgs_path = imgs_path + self.pcm_path = pcm_path + self.configs['COMMON']['fps'] = fps + + logger.info("==> input video {}".format(os.path.basename(self.imgs_path))) + + # step 1: extract feature + video_features = self.extract_feature() + + # step2: get proposal + bmn_results = self.extract_proposal(video_features) + + # step3: classify + material = {'feature': video_features, 'proposal': bmn_results} + action_results = self.video_classify(material) + + return bmn_results, action_results + + @record_time_info + def video_classify(self, material): + """video classify""" + if self.BMN_ONLY: + return [] + action_results = self.classify_model.predict(self.configs, material=material) + logger.info('action shape {}'.format(np.array(action_results).shape)) + return action_results + + @record_time_info + def extract_proposal(self, video_features): + """extract proposal""" + if self.LSTM_ONLY: + basename = self.imgs_path.replace('frames', 'mp4') + '.mp4' + bmn_results = self.prop_dict[basename] + return bmn_results + bmn_results = self.prop_model.predict(self.configs, material=video_features) + logger.info('proposal shape {}'.format(np.array(bmn_results).shape)) + return bmn_results + + @record_time_info + def extract_feature(self): + """extract feature""" + if not self.DEBUG: + image_path_list = get_images(self.imgs_path) + self.configs['PPTSM']['frame_list'] = image_path_list + self.configs['AUDIO']['pcm_file'] = self.pcm_path + image_features = self.image_model.predict(self.configs) + if self.PCM_ONLY: + sample_rate = self.configs['AUDIO']['sample_rate'] + pcm_features = mfcc_extractor.extract_pcm(self.pcm_path, sample_rate) + audio_features = [] + else: + audio_features, pcm_features = self.audio_model.predict(self.configs) + + np_image_features = np.array(image_features, dtype=np.float32) + np_audio_features = np.array(audio_features, dtype=np.float32) + np_pcm_features = np.array(pcm_features, dtype=np.float32) + + video_features = {'image_feature': np_image_features, + 'audio_feature': np_audio_features, + 'pcm_feature': np_pcm_features} + else: + feature_path = self.imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + logger.info("feature shape {} {} {}".format(video_features['image_feature'].shape, + video_features['audio_feature'].shape, + video_features['pcm_feature'].shape)) + + return video_features + +if __name__ == '__main__': + + model_predict = ActionDetection(cfg_file="../configs/configs.yaml") + model_predict.load_model() + + imgs_path = "/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895" + pcm_path = "/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm" + + bmn_results, action_results = 
model_predict.infer(imgs_path, pcm_path) + results = {'bmn_results': bmn_results, 'action_results': action_results} + + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + diff --git a/docs/src/applications/FootballAction/predict/action_detect/logger.py b/docs/src/applications/FootballAction/predict/action_detect/logger.py new file mode 100644 index 000000000..b03348721 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/logger.py @@ -0,0 +1,24 @@ +""" +logger +""" +import os +import logging + +class Logger(logging.Logger): + """Customized logger for news stripper + """ + def __init__(self): + super(Logger, self).__init__(self) + if not os.path.exists('logs'): + os.mkdir('logs') + handler = logging.FileHandler("logs/action_detect.log") + # handler.setLevel(logging.DEBUG) + handler.setLevel(logging.INFO) + + format = "%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s" + datefmt = "%y-%m-%d %H:%M:%S" + + formatter = logging.Formatter(format, datefmt) + handler.setFormatter(formatter) + self.addHandler(handler) + diff --git a/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py b/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py new file mode 100755 index 000000000..07c1027a2 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/mfcc/feature_extractor.py @@ -0,0 +1,182 @@ +""" +audio feature extract +""" +# coding: utf-8 +import os +import numpy as np +import pickle +import mfcc.vgg_params as vgg_params + + +def frame(data, window_length, hop_length): + """ + frame + """ + num_samples = data.shape[0] + #print("window_length , hop_length", window_length, hop_length) + #print("num_sample = ", num_samples) + num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) + #print(" num_frames = ", num_frames) + shape = (num_frames, window_length) + data.shape[1:] + #print(" shape = ", shape) + strides = (data.strides[0] * hop_length, ) + data.strides + #print("data.strides = ", data.strides) + #print("strides = ", strides) + return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) + + +def periodic_hann(window_length): + """ + periodic_hann + """ + return 0.5 - (0.5 * + np.cos(2 * np.pi / window_length * np.arange(window_length))) + + +def stft_magnitude(signal, fft_length, hop_length=None, window_length=None): + """ + stft_magnitude + """ + frames = frame(signal, window_length, hop_length) + window = periodic_hann(window_length) + windowed_frames = frames * window + return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) + + +_MEL_BREAK_FREQUENCY_HERTZ = 700.0 +_MEL_HIGH_FREQUENCY_Q = 1127.0 + + +def hertz_to_mel(frequencies_hertz): + """ + hertz_to_mel + """ + return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz / + _MEL_BREAK_FREQUENCY_HERTZ)) + + +def spectrogram_to_mel_matrix(num_mel_bins=20, + num_spectrogram_bins=129, + audio_sample_rate=8000, + lower_edge_hertz=125.0, + upper_edge_hertz=3800.0): + """ + spectrogram_to_mel_matrix + """ + nyquist_hertz = audio_sample_rate / 2. 
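+    # Validate the requested mel band range, then build a (num_spectrogram_bins, num_mel_bins)
+    # matrix of triangular mel filters below.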
+ if lower_edge_hertz >= upper_edge_hertz: + raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % + (lower_edge_hertz, upper_edge_hertz)) + spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, + num_spectrogram_bins) + spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) + band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), + hertz_to_mel(upper_edge_hertz), + num_mel_bins + 2) + mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) + for i in range(num_mel_bins): + lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] + lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / + (center_mel - lower_edge_mel)) + upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / + (upper_edge_mel - center_mel)) + mel_weights_matrix[:, + i] = np.maximum(0.0, + np.minimum(lower_slope, upper_slope)) + mel_weights_matrix[0, :] = 0.0 + return mel_weights_matrix + + +def log_mel_spectrogram(data, + audio_sample_rate=8000, + log_offset=0.0, + window_length_secs=0.025, + hop_length_secs=0.010, + **kwargs): + """ + log_mel_spectrogram + """ + window_length_samples = int(round(audio_sample_rate * window_length_secs)) + #print("audio_sample_rate = ", audio_sample_rate) + #print("window_length_secs = ", window_length_secs) + #print("window_length_sample ", window_length_samples) + hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) + #print("hop_length_samples ", hop_length_samples) + fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0))) + #print(" fft_lengt = ", fft_length) + spectrogram = stft_magnitude(data, + fft_length=fft_length, + hop_length=hop_length_samples, + window_length=window_length_samples) + #print(" spectrogram.shape = ", spectrogram.shape) + mel_spectrogram = np.dot( + spectrogram, + spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1], + audio_sample_rate=audio_sample_rate, + **kwargs)) + + return np.log(mel_spectrogram + log_offset) + + +def wav_to_example(wav_data, sample_rate): + """ + wav_to_example + """ + #sample_rate, wav_data = wavfile.read(wav_file) + assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype + #wav_data = wav_data[:16000*30] + #print(" wav_data ", wav_data.shape) + #print(" wav_data ", wav_data.shape) + pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS - + vgg_params.STFT_HOP_LENGTH_SECONDS)) + wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num))) + wav_data = wav_data_extend + #print(" wav_data ", wav_data.shape) + wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0] + #print(" wav_data after convert to -1 1", wav_data) + #if wav_data.shape[0] > max_second * sample_rate: + # wav_data = wav_data[:max_second * sample_rate, :] + if len(wav_data.shape) > 1: + wav_data = np.mean(wav_data, axis=1) + #print(" wav_data after mean", wav_data.shape, len(wav_data.shape), wav_data) + # Resample to the rate assumed by vgg. + #if sample_rate != vgg_params.SAMPLE_RATE: + # wav_data = resampy.resample(wav_data, sample_rate, vgg_params.SAMPLE_RATE) + log_mel = log_mel_spectrogram( + wav_data, + audio_sample_rate=vgg_params.SAMPLE_RATE, + log_offset=vgg_params.LOG_OFFSET, + window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS, + hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS, + num_mel_bins=vgg_params.NUM_MEL_BINS, + lower_edge_hertz=vgg_params.MEL_MIN_HZ, + upper_edge_hertz=vgg_params.MEL_MAX_HZ) + # Frame features into examples. 
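+    # One log-mel frame is produced every STFT_HOP_LENGTH_SECONDS, so the example window
+    # and hop sizes below are converted from seconds to frame counts.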
+ features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS + example_window_length = int( + round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) + + example_hop_length = int( + round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) + log_mel_examples = frame(log_mel, + window_length=example_window_length, + hop_length=example_hop_length) + return log_mel_examples + + +def extract_pcm(pcm_file, sample_rate): + with open(pcm_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + examples = wav_to_example(audio_data, sample_rate) + return examples + + +if __name__ == "__main__": + wav_file = sys.argv[1] + print("wav_file = ", wav_file) + with open(wav_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype = np.int16) + examples_batch = wav_to_example(audio_data, 16000) + print("examples_batch.shape", examples_batch.shape) diff --git a/docs/src/applications/FootballAction/predict/action_detect/mfcc/model_config.py b/docs/src/applications/FootballAction/predict/action_detect/mfcc/model_config.py new file mode 100644 index 000000000..194365ece --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/mfcc/model_config.py @@ -0,0 +1,51 @@ +""" +audio model config +""" +import numpy as np + +import mfcc.feature_extractor as feature_extractor + + +class ModelAudio(object): + """ + modelAudio + """ + def __init__(self, configs, use_gpu=1): + self.use_gpu = use_gpu + + self.audio_fps = configs.COMMON.fps + self.audio_feat_scale = configs.TSN.audio_scale + self.sample_rate = 16000 + + def predict_slice(self, wav_data, sample_rate): + """ + audio predict + """ + examples_batch = feature_extractor.wav_to_example( + wav_data, sample_rate)[0] + return examples_batch + + def predict_audio(self, audio_file): + """ + predict_audio + """ + audio_feature_list = [] + # read pcm + sample_rate = self.sample_rate + try: + with open(audio_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + audio_status = "audio load success" + except Exception as e: + audio_data = [] + audio_status = "audio load failed" + step = 1 + len_video = int(len(audio_data) / sample_rate) + print(len_video) + for i in range(0, len_video, step): + audio_data_part = audio_data[i * sample_rate:(i + step) * + sample_rate] + feature_audio = self.predict_slice(audio_data_part, sample_rate) + audio_feature_list.append(feature_audio) + return audio_feature_list diff --git a/docs/src/applications/FootballAction/predict/action_detect/mfcc/vgg_params.py b/docs/src/applications/FootballAction/predict/action_detect/mfcc/vgg_params.py new file mode 100755 index 000000000..0a9951961 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/mfcc/vgg_params.py @@ -0,0 +1,37 @@ +"""Global parameters for the VGGish model. +See vggish_slim.py for more information. +""" + +# Architectural constants. +NUM_FRAMES = 50 # Frames in input mel-spectrogram patch. +NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. +EMBEDDING_SIZE = 128 # Size of embedding layer. + +# Hyperparameters used in feature and example generation. +SAMPLE_RATE = 16000 +STFT_WINDOW_LENGTH_SECONDS = 0.040 +STFT_HOP_LENGTH_SECONDS = 0.020 +NUM_MEL_BINS = NUM_BANDS +MEL_MIN_HZ = 125 +MEL_MAX_HZ = 7500 +LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. +EXAMPLE_WINDOW_SECONDS = 1.00 # Each example contains 96 10ms frames +EXAMPLE_HOP_SECONDS = 1.00 # with zero overlap. 
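The MFCC front end above mirrors a VGGish-style pipeline: 16 kHz PCM is framed with a 40 ms window / 20 ms hop, turned into a 64-bin log-mel spectrogram, and grouped into 1-second examples. A minimal usage sketch, assuming it is run from the `action_detect` directory so the `mfcc` package is importable (the synthetic PCM is only for a shape check):

```python
import numpy as np
import mfcc.feature_extractor as feature_extractor  # assumes cwd = .../predict/action_detect

# 3 seconds of fake 16 kHz int16 PCM, just to exercise the pipeline
sample_rate = 16000
pcm = np.random.randint(-32768, 32767, size=3 * sample_rate, dtype=np.int16)

examples = feature_extractor.wav_to_example(pcm, sample_rate)
# With the vgg_params.py settings (1 s example window, 20 ms hop, 64 mel bins)
# this should give one (50, 64) log-mel patch per second, i.e. (3, 50, 64) here.
print(examples.shape)
```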
+ +# Parameters used for embedding postprocessing. +PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' +PCA_MEANS_NAME = 'pca_means' +QUANTIZE_MIN_VAL = -2.0 +QUANTIZE_MAX_VAL = +2.0 + +# Hyperparameters used in training. +INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. +LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. +ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. + +# Names of ops, tensors, and features. +INPUT_OP_NAME = 'vggish/input_features' +INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' +OUTPUT_OP_NAME = 'vggish/embedding' +OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' +AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' diff --git a/docs/src/applications/FootballAction/predict/action_detect/models/audio_infer.py b/docs/src/applications/FootballAction/predict/action_detect/models/audio_infer.py new file mode 100644 index 000000000..7b19c90ed --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/models/audio_infer.py @@ -0,0 +1,80 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """audio infer""" + def __init__(self, cfg, name='AUDIO'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output_tensor = self.predictor.get_output_handle(output_names[0]) + + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + return output + + + def predict(self, infer_config): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config) + feature_list = [] + pcm_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = np.array(data, dtype = 'float32') + output = self.infer(inputs) + feature_list.append(np.squeeze(output)) + pcm_list.append(inputs) + feature_values = np.vstack(feature_list) + pcm_values = np.vstack(pcm_list) + return feature_values, pcm_values + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm' + t0 = time.time() + cfg['AUDIO']['pcm_file'] = pcm_path + outputs = model.predict(cfg) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + t1 = time.time() + + print(outputs.shape) + print(outputs[0]) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py b/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py new file mode 100644 index 000000000..963f75669 --- /dev/null +++ 
b/docs/src/applications/FootballAction/predict/action_detect/models/bmn_infer.py @@ -0,0 +1,156 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import json +import pickle +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config +from utils.process_result import process_proposal + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """bmn infer""" + def __init__(self, cfg, name='BMN'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + self.nms_thread = cfg[name]['nms_thread'] + self.min_pred_score = cfg[name]['score_thread'] + self.min_frame_thread = cfg['COMMON']['fps'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output1_tensor = self.predictor.get_output_handle(output_names[0]) + self.output2_tensor = self.predictor.get_output_handle(output_names[1]) + self.output3_tensor = self.predictor.get_output_handle(output_names[2]) + + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output1 = self.output1_tensor.copy_to_cpu() + output2 = self.output2_tensor.copy_to_cpu() + output3 = self.output3_tensor.copy_to_cpu() + return output1, output2, output3 + + + def generate_props(self, pred_bmn, pred_start, pred_end, max_window=200, min_window=5): + """generate_props""" + video_len = min(pred_bmn.shape[-1], min(pred_start.shape[-1], pred_end.shape[-1])) + pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :] + start_mask = self.boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = self.boundary_choose(pred_end) + end_mask[-1] = 1. 
+ score_results = [] + for idx in range(min_window, max_window): + for jdx in range(video_len): + start_index = jdx + end_index = start_index + idx + if end_index < video_len and start_mask[start_index] == 1 and end_mask[end_index] == 1: + xmin = start_index + xmax = end_index + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bmn_score = pred_bmn[idx, jdx] + conf_score = xmin_score * xmax_score * bmn_score + score_results.append([xmin, xmax, conf_score]) + return score_results + + + def boundary_choose(self, score_list): + """boundary_choose""" + max_score = max(score_list) + mask_high = (score_list > max_score * 0.5) + score_list = list(score_list) + score_middle = np.array([0.0] + score_list + [0.0]) + score_front = np.array([0.0, 0.0] + score_list) + score_back = np.array(score_list + [0.0, 0.0]) + mask_peak = ((score_middle > score_front) & (score_middle > score_back)) + mask_peak = mask_peak[1:-1] + mask = (mask_high | mask_peak).astype('float32') + return mask + + + def predict(self, infer_config, material): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material) + feature_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = [items[0] for items in data] + winds = [items[1] for items in data] + feat_info = [items[2] for items in data] + feature_T = feat_info[0][0] + feature_N = feat_info[0][1] + + inputs = np.array(inputs) + pred_bmn, pred_sta, pred_end = self.infer(inputs) + + if infer_iter == 0: + sum_pred_bmn = np.zeros((2, feature_N, feature_T)) + sum_pred_sta = np.zeros((feature_T, )) + sum_pred_end = np.zeros((feature_T, )) + sum_pred_cnt = np.zeros((feature_T, )) + + for idx, sub_wind in enumerate(winds): + sum_pred_bmn[:, :, sub_wind[0]: sub_wind[1]] += pred_bmn[idx] + sum_pred_sta[sub_wind[0]: sub_wind[1]] += pred_sta[idx] + sum_pred_end[sub_wind[0]: sub_wind[1]] += pred_end[idx] + sum_pred_cnt[sub_wind[0]: sub_wind[1]] += np.ones((sub_wind[1] - sub_wind[0], )) + + pred_bmn = sum_pred_bmn / sum_pred_cnt + pred_sta = sum_pred_sta / sum_pred_cnt + pred_end = sum_pred_end / sum_pred_cnt + + score_result = self.generate_props(pred_bmn, pred_sta, pred_end) + results = process_proposal(score_result, self.min_frame_thread, self.nms_thread, self.min_pred_score) + + return results + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238' + + # feature + feature_path = imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + t0 = time.time() + outputs = model.predict(cfg, video_features) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + t1 = time.time() + + results = {'proposal': outputs} + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py b/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py new file mode 100644 index 000000000..acb387422 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/models/lstm_infer.py @@ -0,0 +1,152 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import json +import pickle +import time + +sys.path.append('../') 
+from utils.preprocess import get_images +from utils.config_utils import parse_config +from utils.process_result import get_action_result + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """lstm infer""" + def __init__(self, cfg, name='ACTION'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + self.topk = cfg[name]['topk'] + self.frame_offset = cfg[name]['nms_offset'] + self.nms_thread = cfg[name]['nms_thread'] + self.cls_thread = cfg[name]['classify_score_thread'] + self.iou_thread = cfg[name]['iou_score_thread'] + + self.label_map_file = cfg['COMMON']['label_dic'] + self.fps = cfg['COMMON']['fps'] + self.nms_id = 5 + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input1_tensor = self.predictor.get_input_handle(input_names[0]) + self.input2_tensor = self.predictor.get_input_handle(input_names[1]) + + output_names = self.predictor.get_output_names() + self.output1_tensor = self.predictor.get_output_handle(output_names[0]) + self.output2_tensor = self.predictor.get_output_handle(output_names[1]) + + + def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None): + """infer""" + self.input1_tensor.copy_from_cpu(input1_arr) + self.input1_tensor.set_lod(input1_lod) + if not input2_arr is None: + self.input2_tensor.copy_from_cpu(input2_arr) + self.input2_tensor.set_lod(input2_lod) + self.predictor.run() + output1 = self.output1_tensor.copy_to_cpu() + output2 = self.output2_tensor.copy_to_cpu() + # print(output.shape) + return output1, output2 + + def pre_process(self, input): + """pre process""" + input_arr = [] + input_lod = [0] + start_lod = 0 + end_lod = 0 + for sub_item in input: + end_lod = start_lod + len(sub_item) + input_lod.append(end_lod) + input_arr.extend(sub_item) + start_lod = end_lod + input_arr = np.array(input_arr) + # print(input_arr.shape) + # print([input_lod]) + return input_arr, [input_lod] + + def predict(self, infer_config, material): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config, material=material) + results = [] + for infer_iter, data in enumerate(infer_reader()): + video_id = [[items[-2], items[-1]] for items in data] + input1 = [items[0] for items in data] + input2 = [items[1] for items in data] + input1_arr, input1_lod = self.pre_process(input1) + input2_arr, input2_lod = self.pre_process(input2) + output1, output2 = self.infer(input1_arr, input1_lod, input2_arr, input2_lod) + # output1, output2 = self.infer(input1_arr, input1_lod) + + predictions_id = output1 + predictions_iou = output2 + for i in range(len(predictions_id)): + topk_inds = predictions_id[i].argsort()[0 - self.topk:] + topk_inds = topk_inds[::-1] + preds_id = predictions_id[i][topk_inds] + preds_iou = predictions_iou[i][0] + results.append((video_id[i], preds_id.tolist(), topk_inds.tolist(), preds_iou.tolist())) + + predict_result = get_action_result(results, self.label_map_file, self.fps, + self.cls_thread, self.iou_thread, + self.nms_id, self.nms_thread, self.frame_offset) + return predict_result + + +if __name__ == "__main__": + 
cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + # proposal total + prop_dict = {} + for dataset in ['EuroCup2016', 'WorldCup2018']: + prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format(dataset) + json_data = json.load(open(prop_json, 'r')) + for item in json_data: + basename = prop_json.replace('feature_bmn/prop.json', 'mp4') + basename = basename + '/' + item['video_name'] + '.mp4' + prop_dict[basename] = item['bmn_results'] + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238' + + # feature + feature_path = imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + # proposal + basename = imgs_path.replace('frames', 'mp4') + '.mp4' + bmn_results = prop_dict[basename] + + material = {'feature': video_features, 'proposal': bmn_results} + + t0 = time.time() + outputs = model.predict(cfg, material) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + # print(outputs.shape) + t1 = time.time() + results = {'actions': outputs} + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/FootballAction/predict/action_detect/models/pptsm_infer.py b/docs/src/applications/FootballAction/predict/action_detect/models/pptsm_infer.py new file mode 100644 index 000000000..639cfab9b --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/models/pptsm_infer.py @@ -0,0 +1,78 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """pptsm infer""" + def __init__(self, cfg, name='PPTSM'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output_tensor = self.predictor.get_output_handle(output_names[1]) + #self.output_tensor = self.predictor.get_output_handle(output_names[0]) + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + return output + + def predict(self, infer_config): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config) + feature_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = [items[:-1] for items in data] + inputs = np.array(inputs) + output = self.infer(inputs) + feature_list.append(np.squeeze(output)) + feature_list = np.vstack(feature_list) + return feature_list + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = 
InferModel(cfg) + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/' + imgs_list = get_images(imgs_path) + t0 = time.time() + cfg['PPTSM']['frame_list'] = imgs_list + outputs = model.predict(cfg) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + t1 = time.time() + + print(outputs.shape) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/FootballAction/predict/action_detect/reader/__init__.py b/docs/src/applications/FootballAction/predict/action_detect/reader/__init__.py new file mode 100644 index 000000000..547b2d6bb --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/reader/__init__.py @@ -0,0 +1,15 @@ +""" +read map for model +""" +from reader.reader_utils import regist_reader, get_reader +import reader.tsminf_reader as tsminf_reader +import reader.audio_reader as audio_reader +import reader.bmninf_reader as bmninf_reader +import reader.feature_reader as feature_reader + +# regist reader, sort by alphabet +regist_reader("TSM", tsminf_reader.TSMINFReader) +regist_reader("PPTSM", tsminf_reader.TSMINFReader) +regist_reader("AUDIO", audio_reader.AudioReader) +regist_reader("BMN", bmninf_reader.BMNINFReader) +regist_reader("ACTION", feature_reader.FeatureReader) diff --git a/docs/src/applications/FootballAction/predict/action_detect/reader/audio_reader.py b/docs/src/applications/FootballAction/predict/action_detect/reader/audio_reader.py new file mode 100644 index 000000000..2e1f1d28f --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/reader/audio_reader.py @@ -0,0 +1,78 @@ +""" +audio reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
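Each InferModel above (audio, BMN, LSTM, ppTSM) wraps the same Paddle Inference idiom: build a `Config` from the exported model/params files, enable GPU and IR/memory optimizations, switch off feed/fetch ops for zero-copy I/O, then move data through input/output handles. A stripped-down sketch of that shared pattern (the model paths and input shape are placeholders, not files shipped with the repo):

```python
import numpy as np
from paddle.inference import Config, create_predictor

def run_zero_copy(model_file, params_file, inputs, use_gpu=True):
    """One forward pass through an exported Paddle model using zero-copy handles."""
    config = Config(model_file, params_file)
    if use_gpu:
        config.enable_use_gpu(2000, 0)   # memory pool size (MB), device id
    else:
        config.disable_gpu()
    config.switch_ir_optim(True)
    config.enable_memory_optim()
    config.switch_use_feed_fetch_ops(False)  # required for copy_from_cpu / copy_to_cpu
    predictor = create_predictor(config)

    input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

    input_handle.copy_from_cpu(inputs.astype('float32'))
    predictor.run()
    return output_handle.copy_to_cpu()

# hypothetical usage; adjust paths and shape to the exported model
# feats = run_zero_copy('ppTSM.pdmodel', 'ppTSM.pdiparams', np.random.rand(1, 8, 3, 224, 224))
```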
+ +import sys +import os +import _pickle as cPickle +#from .reader_utils import DataReader +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO +import numpy as np +import random +import code + +from .reader_utils import DataReader +import mfcc.feature_extractor as feature_extractor + +class AudioReader(DataReader): + """ + Data reader for youtube-8M dataset, which was stored as features extracted by prior networks + This is for the three models: lstm, attention cluster, nextvlad + + dataset cfg: num_classes + batch_size + list + NextVlad only: eigen_file + """ + + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + + # set batch size and file list + self.sample_rate = cfg[self.name.upper()]['sample_rate'] + self.batch_size = cfg[self.name.upper()]['batch_size'] + self.pcm_file = cfg[self.name.upper()]['pcm_file'] + self.material = material + + def create_reader(self): + """create_reader""" + with open(self.pcm_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + examples = feature_extractor.wav_to_example(audio_data, self.sample_rate) + # print(examples.shape) + + def reader(): + """reader""" + batch_out = [] + batch_out_pre = [] + + for audio in examples: + # batch_out.append([audio]) + batch_out.append(audio) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + if len(batch_out) > 0: + yield batch_out + + return reader diff --git a/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py b/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py new file mode 100644 index 000000000..a076f2bfe --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/reader/bmninf_reader.py @@ -0,0 +1,155 @@ +""" +# @File : bmninf_reader.py +# @Author: macaihong +# @Date : 2019/12/15 +# @Desc : +""" + +import os +import random +import pickle +import json +import numpy as np +import multiprocessing + +import numpy as np + +from .reader_utils import DataReader + + +def get_sw_prop(duration, window=200, step=10): + """ + get_sw_prop + """ + pr = [] + local_boxes = [] + for k in np.arange(0, duration - window + step, step): + start_id = k + end_id = min(duration, k + window) + if end_id - start_id < window: + start_id = end_id - window + local_boxes = (start_id, end_id) + pr.append(local_boxes) + + def valid_proposal(duration, span): + """ + valid_proposal + """ + # fileter proposals + # a valid proposal should have at least one second in the video + real_span = min(duration, span[1]) - span[0] + return real_span >= 1 + + pr = list(filter(lambda x: valid_proposal(duration, x), pr)) + return pr + + +class BMNINFReader(DataReader): + """ + Data reader for BMN model, which was stored as features extracted by prior networks + dataset cfg: feat_path, feature path, + tscale, temporal length of BM map, + dscale, duration scale of BM map, + anchor_xmin, anchor_xmax, the range of each point in the feature sequence, + batch_size, batch size of input data, + num_threads, number of threads of data processing + """ + + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + self.tscale = cfg[self.name.upper()]['tscale'] # 200 + self.dscale = cfg[self.name.upper()]['dscale'] # 200 + # self.subset = cfg[self.name.upper()]['subset'] + self.tgap = 1. 
/ self.tscale + self.step = cfg[self.name.upper()]['window_step'] + + self.material = material + src_feature = self.material + + image_feature = src_feature['image_feature'] + pcm_feature = src_feature['pcm_feature'] + pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640)) + # print(rgb_feature.shape, audio_feature.shape, pcm_feature.shape) + min_length = min(image_feature.shape[0], pcm_feature.shape[0]) + #if min_length == 0: + # continue + image_feature = image_feature[:min_length, :] + pcm_feature = pcm_feature[:min_length, :] + self.features = np.concatenate((image_feature, pcm_feature), axis=1) + + self.duration = len(self.features) + self.window = self.tscale + + self.get_dataset_dict() + self.get_match_map() + + self.batch_size = cfg[self.name.upper()]['batch_size'] + if (mode == 'test') or (mode == 'infer'): + self.num_threads = 1 # set num_threads as 1 for test and infer + + def get_dataset_dict(self): + """ + get_dataset_dict + """ + self.video_list = get_sw_prop(self.duration, self.window, self.step) + + def get_match_map(self): + """ + get_match_map + """ + match_map = [] + for idx in range(self.tscale): + tmp_match_window = [] + xmin = self.tgap * idx + for jdx in range(1, self.tscale + 1): + xmax = xmin + self.tgap * jdx + tmp_match_window.append([xmin, xmax]) + match_map.append(tmp_match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + self.match_map = match_map + self.anchor_xmin = [self.tgap * i for i in range(self.tscale)] + self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)] + + + def load_file(self, video_wind): + """ + load_file + """ + start_feat_id = video_wind[0] + end_feat_id = video_wind[1] + video_feat = self.features[video_wind[0]: video_wind[1]] + video_feat = video_feat.T + video_feat = video_feat.astype("float32") + return video_feat + + def create_reader(self): + """ + reader creator for ctcn model + """ + return self.make_infer_reader() + + def make_infer_reader(self): + """ + reader for inference + """ + def reader(): + """ + reader + """ + batch_out = [] + # for video_name in self.video_list: + for video_wind in self.video_list: + video_idx = self.video_list.index(video_wind) + video_feat = self.load_file(video_wind) + batch_out.append((video_feat, video_wind, [self.duration, self.dscale])) + + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + if len(batch_out) > 0: + yield batch_out + + return reader diff --git a/docs/src/applications/FootballAction/predict/action_detect/reader/feature_reader.py b/docs/src/applications/FootballAction/predict/action_detect/reader/feature_reader.py new file mode 100644 index 000000000..4e406f739 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/reader/feature_reader.py @@ -0,0 +1,87 @@ +""" +attention-lstm feature reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
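BMNINFReader slices the long feature sequence into overlapping windows with `get_sw_prop` before inference, and bmn_infer.py later averages the per-window predictions back together (the `sum_pred_*` accumulators). A quick illustration of the windowing, assuming PaddlePaddle is installed and the `reader` package is importable from the `action_detect` directory; the window of 200 matches the BMN tscale noted above, while the stride of 10 is just an illustrative value:

```python
from reader.bmninf_reader import get_sw_prop

# 430 feature steps, 200-step windows, stride 10
windows = get_sw_prop(430, window=200, step=10)
print(len(windows))             # 24 windows
print(windows[0], windows[-1])  # (0, 200) ... (230, 430)
```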
+ +import sys +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle +import numpy as np +import random +import code + +from .reader_utils import DataReader + +class FeatureReader(DataReader): + """ + Data reader for youtube-8M dataset, which was stored as features extracted by prior networks + This is for the three models: lstm, attention cluster, nextvlad + + dataset cfg: num_classes + batch_size + list + NextVlad only: eigen_file + """ + + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + self.batch_size = cfg[self.name.upper()]['batch_size'] + + self.feature = material['feature'] + self.proposal = material['proposal'] + self.fps = 5 + + def create_reader(self): + """ + create_reader + """ + image_feature_list = self.feature['image_feature'] + audio_feature_list = self.feature['audio_feature'] + pcm_feature_list = self.feature['pcm_feature'] + pcm_feature_list = pcm_feature_list.reshape((pcm_feature_list.shape[0] * 5, 640)) + + fl = self.proposal + + if self.mode == 'train': + random.shuffle(fl) + + def reader(): + """ + reader + """ + batch_out = [] + for prop_info in fl: + start_id = int(prop_info['start']) + end_id = int(prop_info['end']) + bmn_score = float(prop_info['score']) + try: + image_feature = image_feature_list[start_id: end_id] + audio_feature = audio_feature_list[int(start_id / self.fps): int(end_id / self.fps)] + pcm_feature = pcm_feature_list[start_id: end_id] + + # image_feature = np.concatenate((image_feature, pcm_feature), axis=1) + + batch_out.append((image_feature, audio_feature, 0, prop_info)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + except Exception as e: + continue + return reader + diff --git a/docs/src/applications/FootballAction/predict/action_detect/reader/reader_utils.py b/docs/src/applications/FootballAction/predict/action_detect/reader/reader_utils.py new file mode 100644 index 000000000..f76b5d38d --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/reader/reader_utils.py @@ -0,0 +1,109 @@ +""" +reader_util +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import numpy as np + + +class ReaderNotFoundError(Exception): + """ + "Error: reader not found" + """ + + def __init__(self, reader_name, avail_readers): + super(ReaderNotFoundError, self).__init__() + self.reader_name = reader_name + self.avail_readers = avail_readers + + def __str__(self): + msg = "Reader {} Not Found.\nAvailiable readers:\n".format( + self.reader_name) + for reader in self.avail_readers: + msg += " {}\n".format(reader) + return msg + + +class DataReader(object): + """ + data reader for video input + """ + + def __init__(self, model_name, mode, cfg): + self.name = model_name + self.mode = mode + self.cfg = cfg + + def create_reader(self): + """ + Not implemented + """ + pass + + def get_config_from_sec(self, sec, item, default=None): + """ + get_config_from_sec + """ + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + +class ReaderZoo(object): + """ + ReaderZoo + """ + def __init__(self): + """ + __init__ + """ + self.reader_zoo = {} + + def regist(self, name, reader): + """ + regist + """ + assert reader.__base__ == DataReader, "Unknow model type {}".format( + type(reader)) + self.reader_zoo[name] = reader + + def get(self, name, mode, cfg, material=None): + """ + get + """ + for k, v in self.reader_zoo.items(): + if k == name: + return v(name, mode, cfg, material) + raise ReaderNotFoundError(name, self.reader_zoo.keys()) + + +# singleton reader_zoo +reader_zoo = ReaderZoo() + + +def regist_reader(name, reader): + """ + regist_reader + """ + reader_zoo.regist(name, reader) + + +def get_reader(name, mode, cfg, material=None): + """ + get_reader + """ + reader_model = reader_zoo.get(name, mode, cfg, material) + return reader_model.create_reader() diff --git a/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py b/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py new file mode 100644 index 000000000..9886d5424 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/reader/tsminf_reader.py @@ -0,0 +1,358 @@ +""" +tsn frame reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import random +import functools +import concurrent.futures +import multiprocessing + +import numpy as np +import paddle +from PIL import Image, ImageEnhance + +from .reader_utils import DataReader + + +class TSMINFReader(DataReader): + """ + Data reader for video dataset of jpg folder. 
+ """ + + def __init__(self, name, mode, cfg, material=None): + super(TSMINFReader, self).__init__(name, mode, cfg) + name = name.upper() + self.seg_num = cfg[name]['seg_num'] + self.seglen = cfg[name]['seglen'] + self.short_size = cfg[name]['short_size'] + self.target_size = cfg[name]['target_size'] + self.batch_size = cfg[name]['batch_size'] + self.reader_threads = cfg[name]['reader_threads'] + self.buf_size = cfg[name]['buf_size'] + self.video_path = cfg[name]['frame_list'] + + self.img_mean = np.array(cfg[name]['image_mean']).reshape([3, 1, 1]).astype(np.float32) + self.img_std = np.array(cfg[name]['image_std']).reshape([3, 1, 1]).astype(np.float32) + + self.material = material + + def create_reader(self): + """ + batch loader for TSN + """ + _reader = self._inference_reader_creator_longvideo( + self.video_path, + self.mode, + seg_num=self.seg_num, + seglen=self.seglen, + short_size=self.short_size, + target_size=self.target_size, + img_mean=self.img_mean, + img_std=self.img_std, + num_threads = self.reader_threads, + buf_size = self.buf_size) + + def _batch_reader(): + batch_out = [] + for imgs, label in _reader(): + if imgs is None: + continue + batch_out.append((imgs, label)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + if len(batch_out) > 1: + yield batch_out[:-1] + + return _batch_reader + + + def _inference_reader_creator_longvideo(self, video_path, mode, seg_num, seglen, + short_size, target_size, img_mean, img_std, num_threads, buf_size): + """ + inference reader for video + """ + def reader(): + """ + reader + """ + def image_buf(image_id_path_buf): + """ + image_buf reader + """ + try: + img_path = image_id_path_buf[1] + img = Image.open(img_path).convert("RGB") + image_id_path_buf[2] = img + except: + image_id_path_buf[2] = None + + frame_len = len(video_path) + read_thread_num = seg_num + for i in range(0, frame_len, read_thread_num): + image_list_part = video_path[i: i + read_thread_num] + image_id_path_buf_list = [] + for k in range(len(image_list_part)): + image_id_path_buf_list.append([k, image_list_part[k], None]) + + + with concurrent.futures.ThreadPoolExecutor(max_workers=read_thread_num) as executor: + executor.map(lambda image_id_path_buf: image_buf(image_id_path_buf), image_id_path_buf_list) + imgs_seg_list = [x[2] for x in image_id_path_buf_list] + + # add the fault-tolerant for bad image + for k in range(len(image_id_path_buf_list)): + img_buf = image_id_path_buf_list[k][2] + pad_id = 1 + while pad_id < seg_num and img_buf is None: + img_buf = imgs_seg_list[(k + pad_id)%seg_num][2] + if img_buf is None: + logger.info("read img erro from {} to {}".format(i, i + read_thread_num)) + exit(0) + else: + imgs_seg_list[k] = img_buf + for pad_id in range(len(imgs_seg_list), seg_num): + imgs_seg_list.append(imgs_seg_list[-1]) + yield imgs_seg_list + + + def inference_imgs_transform(imgs_list, mode, seg_num, seglen, short_size,\ + target_size, img_mean, img_std): + """ + inference_imgs_transform + """ + imgs_ret = imgs_transform(imgs_list, mode, seg_num, seglen, short_size, + target_size, img_mean, img_std) + label_ret = 0 + + return imgs_ret, label_ret + + mapper = functools.partial( + inference_imgs_transform, + mode=mode, + seg_num=seg_num, + seglen=seglen, + short_size=short_size, + target_size=target_size, + img_mean=img_mean, + img_std=img_std) + + return paddle.reader.xmap_readers(mapper, reader, num_threads, buf_size, order=True) + + +def imgs_transform(imgs, + mode, + seg_num, + seglen, + short_size, + target_size, + img_mean, + 
img_std, + name=''): + """ + imgs_transform + """ + imgs = group_scale(imgs, short_size) + + if mode == 'train': + if name == "TSM": + imgs = group_multi_scale_crop(imgs, short_size) + imgs = group_random_crop(imgs, target_size) + imgs = group_random_flip(imgs) + else: + imgs = group_center_crop(imgs, target_size) + + np_imgs = (np.array(imgs[0]).astype('float32').transpose( + (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255 + for i in range(len(imgs) - 1): + img = (np.array(imgs[i + 1]).astype('float32').transpose( + (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255 + np_imgs = np.concatenate((np_imgs, img)) + imgs = np_imgs + imgs -= img_mean + imgs /= img_std + imgs = np.reshape(imgs, (seg_num, seglen * 3, target_size, target_size)) + + return imgs + +def group_multi_scale_crop(img_group, target_size, scales=None, \ + max_distort=1, fix_crop=True, more_fix_crop=True): + """ + group_multi_scale_crop + """ + scales = scales if scales is not None else [1, .875, .75, .66] + input_size = [target_size, target_size] + + im_size = img_group[0].size + + # get random crop offset + def _sample_crop_size(im_size): + """ + _sample_crop_size + """ + image_w, image_h = im_size[0], im_size[1] + + base_size = min(image_w, image_h) + crop_sizes = [int(base_size * x) for x in scales] + crop_h = [ + input_size[1] if abs(x - input_size[1]) < 3 else x + for x in crop_sizes + ] + crop_w = [ + input_size[0] if abs(x - input_size[0]) < 3 else x + for x in crop_sizes + ] + + pairs = [] + for i, h in enumerate(crop_h): + for j, w in enumerate(crop_w): + if abs(i - j) <= max_distort: + pairs.append((w, h)) + + crop_pair = random.choice(pairs) + if not fix_crop: + w_offset = random.randint(0, image_w - crop_pair[0]) + h_offset = random.randint(0, image_h - crop_pair[1]) + else: + w_step = (image_w - crop_pair[0]) / 4 + h_step = (image_h - crop_pair[1]) / 4 + + ret = list() + ret.append((0, 0)) # upper left + if w_step != 0: + ret.append((4 * w_step, 0)) # upper right + if h_step != 0: + ret.append((0, 4 * h_step)) # lower left + if h_step != 0 and w_step != 0: + ret.append((4 * w_step, 4 * h_step)) # lower right + if h_step != 0 or w_step != 0: + ret.append((2 * w_step, 2 * h_step)) # center + + if more_fix_crop: + ret.append((0, 2 * h_step)) # center left + ret.append((4 * w_step, 2 * h_step)) # center right + ret.append((2 * w_step, 4 * h_step)) # lower center + ret.append((2 * w_step, 0 * h_step)) # upper center + + ret.append((1 * w_step, 1 * h_step)) # upper left quarter + ret.append((3 * w_step, 1 * h_step)) # upper right quarter + ret.append((1 * w_step, 3 * h_step)) # lower left quarter + ret.append((3 * w_step, 3 * h_step)) # lower righ quarter + + w_offset, h_offset = random.choice(ret) + crop_info = { + 'crop_w': crop_pair[0], + 'crop_h': crop_pair[1], + 'offset_w': w_offset, + 'offset_h': h_offset + } + + return crop_info + + crop_info = _sample_crop_size(im_size) + crop_w = crop_info['crop_w'] + crop_h = crop_info['crop_h'] + offset_w = crop_info['offset_w'] + offset_h = crop_info['offset_h'] + crop_img_group = [ + img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) + for img in img_group + ] + ret_img_group = [ + img.resize((input_size[0], input_size[1]), Image.BILINEAR) + for img in crop_img_group + ] + + return ret_img_group + + +def group_random_crop(img_group, target_size): + """ + group_random_crop + """ + w, h = img_group[0].size + th, tw = target_size, target_size + + assert (w >= target_size) and (h >= target_size), \ + "image width({}) and height({}) should 
be larger than crop size".format(w, h) + + out_images = [] + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + + for img in img_group: + if w == tw and h == th: + out_images.append(img) + else: + out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + + return out_images + + +def group_random_flip(img_group): + """ + group_random_flip + """ + v = random.random() + if v < 0.5: + ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] + return ret + else: + return img_group + + +def group_center_crop(img_group, target_size): + """ + group_center_crop + """ + img_crop = [] + for img in img_group: + w, h = img.size + th, tw = target_size, target_size + assert (w >= target_size) and (h >= target_size), \ + "image width({}) and height({}) should be larger than crop size".format(w, h) + x1 = int(round((w - tw) / 2.)) + y1 = int(round((h - th) / 2.)) + img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th))) + + return img_crop + + +def group_scale(imgs, target_size): + """ + group_scale + """ + resized_imgs = [] + for i in range(len(imgs)): + img = imgs[i] + w, h = img.size + if (w <= h and w == target_size) or (h <= w and h == target_size): + resized_imgs.append(img) + continue + + if w < h: + ow = target_size + oh = int(target_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + else: + oh = target_size + ow = int(target_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + + return resized_imgs + diff --git a/docs/src/applications/FootballAction/predict/action_detect/utils/config_utils.py b/docs/src/applications/FootballAction/predict/action_detect/utils/config_utils.py new file mode 100644 index 000000000..e5db92b0d --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/utils/config_utils.py @@ -0,0 +1,80 @@ +""" +config_utils +""" +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
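At inference time TSMINFReader scales each frame group to `short_size`, center-crops to `target_size`, normalizes with the configured mean/std, and packs the clip as (seg_num, seglen*3, H, W) for ppTSM. A small shape check of that path; the mean/std values below are the usual ImageNet statistics and are only an assumption here — the real values come from configs.yaml:

```python
import numpy as np
from PIL import Image
from reader.tsminf_reader import imgs_transform  # assumes cwd = .../predict/action_detect

seg_num, seglen, short_size, target_size = 8, 1, 256, 224
img_mean = np.array([0.485, 0.456, 0.406]).reshape([3, 1, 1]).astype(np.float32)
img_std = np.array([0.229, 0.224, 0.225]).reshape([3, 1, 1]).astype(np.float32)

# 8 dummy frames standing in for one TSM segment group
frames = [Image.new('RGB', (340, 256)) for _ in range(seg_num * seglen)]
clip = imgs_transform(frames, 'infer', seg_num, seglen, short_size,
                      target_size, img_mean, img_std)
print(clip.shape)  # (8, 3, 224, 224), ready to be batched for ppTSM
```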
+ +import yaml +import ast + +import logger + +logger = logger.Logger() + +CONFIG_SECS = [ + 'train', + 'valid', + 'test', + 'infer', +] + +class AttrDict(dict): + """ + AttrDict + """ + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + import yaml + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader)) + create_attr_dict(yaml_config) + return yaml_config + + +def create_attr_dict(yaml_config): + """create_attr_dict""" + for key, value in yaml_config.items(): + if isinstance(value, dict): + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = ast.literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + return + + +def print_configs(cfg, mode): + """print_configs""" + logger.info("---------------- {:>5} Arguments ----------------".format( + mode)) + for sec, sec_items in cfg.items(): + logger.info("{}:".format(sec)) + for k, v in sec_items.items(): + logger.info(" {}:{}".format(k, v)) + logger.info("-------------------------------------------------") diff --git a/docs/src/applications/FootballAction/predict/action_detect/utils/preprocess.py b/docs/src/applications/FootballAction/predict/action_detect/utils/preprocess.py new file mode 100644 index 000000000..d14aaf1ee --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/utils/preprocess.py @@ -0,0 +1,36 @@ +""" extract frames and pcm""" +import os +import sys +import shutil + + +def ffmpeg_frames(mp4_addr, frame_out_folder, fps=5): + """ffmpeg_frames""" + if os.path.exists(frame_out_folder): + shutil.rmtree(frame_out_folder) + os.makedirs(frame_out_folder) + cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % (mp4_addr, fps, frame_out_folder, '%08d') + os.system(cmd) + + +def ffmpeg_pcm(mp4_addr, save_file_name): + """ffmpeg_pcm""" + cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \ + % (mp4_addr, save_file_name) + os.system(cmd) + + +def ffmpeg_mp4(mp4_url, mp4_addr): + """ffmpeg_mp4""" + cmd = "wget %s -O %s -q" % (mp4_url, mp4_addr) + print ("cmd = ", cmd) + os.system(cmd) + + +def get_images(image_path): + """get_images""" + images = sorted(os.listdir(image_path)) + images = images + images_path_list = [image_path + '/' + im for im in images] + return images_path_list + diff --git a/docs/src/applications/FootballAction/predict/action_detect/utils/process_result.py b/docs/src/applications/FootballAction/predict/action_detect/utils/process_result.py new file mode 100644 index 000000000..164869696 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/action_detect/utils/process_result.py @@ -0,0 +1,144 @@ +""" +# @File : process_result.py +# @Author: macaihong +# @Date : 2019/12/15 +# @Desc : +""" + +import sys +import os +import re +import numpy as np +import pickle +import json +import logger + +logger = logger.Logger() + + +def get_data_res(label_map, data, topk): + """get_data_res""" + sum_vid = len(data) + video_result = [] + for i in range(sum_vid): + vid_name = data[i][0][0] + # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa + feature_start_id = float(data[i][0][1]['start']) + feature_end_id = float(data[i][0][1]['end']) + 
feature_stage1_score = data[i][0][1]['score'] + predict_res = [] + for k in range(topk): + score_top = data[i][1][k] + labelid_top = data[i][2][k] + label_iou = data[i][3] + labelname_top = label_map[str(labelid_top)] + video_result.append([feature_start_id, feature_end_id, labelid_top, labelname_top, score_top, label_iou]) + return video_result + + +def base_nms(bboxes, thresh, delta=0, nms_id=2): + """ + One-dimensional non-maximal suppression + :param bboxes: [[vid, label, st, ed, score, ...], ...] + :param thresh: + :return: + """ + """ + t1 = bboxes[:, 0] + t2 = bboxes[:, 1] + scores = bboxes[:, nms_id] + """ + + t1 = np.array([max(0, x[0] - delta) for x in bboxes]) + t2 = np.array([x[1] + delta for x in bboxes]) + scores = np.array([x[nms_id] for x in bboxes]) + + durations = t2 - t1 + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + tt1 = np.maximum(t1[i], t1[order[1:]]) + tt2 = np.minimum(t2[i], t2[order[1:]]) + intersection = tt2 - tt1 + IoU = intersection / (durations[i] + durations[order[1:]] - intersection).astype(float) + + inds = np.where(IoU <= thresh)[0] + order = order[inds + 1] + return [bboxes[i] for i in keep] + + +def process_proposal(source_prop_box, min_frame_thread=5, nms_thresh=0.7, score_thresh=0.01): + """process_video_prop""" + prop_box = [] + for items in source_prop_box: + start_frame = float(items[0]) + end_frame = float(items[1]) + score = float(items[2]) + if end_frame - start_frame < min_frame_thread or score < score_thresh: + continue + prop_box.append([start_frame, end_frame, score]) + + prop_box_keep = base_nms(prop_box, nms_thresh) + + prop_res = [] + for res in prop_box_keep: + prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]}) + + return prop_res + + +def process_video_classify(video_prop, fps, score_thread, iou_thread, \ + nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0): + """process_video_classify""" + prop_filter = [] + for item in video_prop: + if item[2] == backgroundid: + continue + prop_filter.append(item) + + # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True) + prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id) + prop_filter = sorted(prop_filter, key=lambda x: x[0]) + + video_results = [] + for item in prop_filter: + start_sec = item[0] / fps + end_sec = item[1] / fps + + start_id_frame = item[0] + end_id_frame = item[1] + # start_time = "%02d:%02d:%02d" % ((start_id_frame / fps) / 3600, \ + # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60) + # end_time = "%02d:%02d:%02d" % ((end_id_frame / fps) / 3600, \ + # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60) + start_time = int(start_id_frame / fps) + end_time = int(end_id_frame / fps) + + label_id = item[2] + label_name = item[3] + label_classify_score = item[4] + label_iou_score = item[5] + if label_classify_score > score_thread and label_iou_score > iou_thread: + video_results.append({"start_time": start_time, + "end_time": end_time, + "label_id": label_id, + "label_name": label_name, + "classify_score": label_classify_score, + "iou_score": label_iou_score}) + + return video_results + + +def get_action_result(result_info, label_map_file, fps, score_thread=0, \ + iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1): + """get_action_result""" + + label_map = json.load(open(label_map_file, 'r', encoding='utf-8')) + + org_result = get_data_res(label_map, result_info, topk) + nms_result = process_video_classify(org_result, fps, 
score_thread, iou_thread, nms_id, nms_thread, frame_offset) + + return nms_result diff --git a/docs/src/applications/FootballAction/predict/eval.py b/docs/src/applications/FootballAction/predict/eval.py new file mode 100644 index 000000000..2f6632486 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/eval.py @@ -0,0 +1,239 @@ +""" +get instance for lstm +根据gts计算每个proposal_bmn的iou、ioa、label等信息 +""" +import os +import sys +import json +import random +import pickle +import numpy as np + +import io +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding = 'utf-8') + +dataset = "/home/work/datasets" + +label_index_file = './configs/index_label_football_8.json' +eval_datasets = ['EuroCup2016'] +label_files = {'train': 'label_cls8_train.json', + 'validation': 'label_cls8_val.json'} + +global fps, mode +label_index = json.load(open(label_index_file, 'rb')) + +def load_gts(): + global fps + gts_data = {'fps': 0, 'gts': {}} + for eval_data in eval_datasets: + for item, value in label_files.items(): + label_file = '{}/{}/{}'.format(dataset, eval_data, value) + gts = json.load(open(label_file, 'rb')) + gts_data['fps'] = gts['fps'] + fps = gts['fps'] + for gt in gts['gts']: + gt['mode'] = item + basename = '{}/{}/mp4/{}'.format(dataset, eval_data, os.path.basename(gt['url'])) + gts_data['gts'][basename] = gt + return gts_data['gts'] + + +def computeIoU(e1, e2): + """ + clc iou and ioa + """ + if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']): + return 0. + area1 = e1["end"] - e1["start"] + area2 = e2["end"] - e2["start"] + x1 = np.maximum(e1["start"], e2["start"]) + x2 = np.minimum(e1["end"], e2["end"]) + inter = np.maximum(0.0, x2 - x1) + iou = 0.0 if (area1 + area2 - inter) == 0 else inter * 1.0 / (area1 + area2 - inter) + if not mode == 'proposal': + iou = 0.0 if area2 == 0 else inter * 1.0 / area2 + return iou + + +def convert_proposal(boxes, basename, score_threshold=0.01): + boxes = sorted(boxes, key=lambda x:float(x['score']), reverse=True) + res = [] + for box in boxes: + if not float(box['score']) >= score_threshold: + continue + res.append({'basename': basename, + 'start': int(float(box['start']) / fps), + 'end': int(float(box['end']) / fps), + 'label': 0}) + return res + +def convert_classify(boxes, basename, iou_threshold, score_threshold): + boxes = sorted(boxes, key=lambda x:(float(x['classify_score']), float(x['iou_score'])), reverse=True) + def convert_time_to_frame(time_type): + return int(time_type) + h, m, s = time_type.split(':') + return int(h) * 3600 + int(m) * 60 + int(s) + res = [] + for box in boxes: + if not (box['iou_score'] >= iou_threshold and + box['classify_score'] >= score_threshold): + continue + res.append({'basename': basename, + 'start': convert_time_to_frame(box['start_time']), + 'end': convert_time_to_frame(box['end_time']), + 'label': box['label_id']}) + return res + +def convert_groundtruth(boxes, basename, phase=None): + res = [] + for box in boxes: + for item in box['label_ids']: + label = 0 if phase == 'proposal' else item + res.append({'basename': basename, + 'start': box['start_id'], + 'end': box['end_id'], + 'label': label}) + return res +def print_head(iou): + print("\nioa = {:.1f}".format(iou)) + res_str = '' + for item in ['label_name']: + res_str += '{:<12s}'.format(item) + for item in ['label_id', 'precision', 'recall', 'hit_prop', 'num_prop', 'hit_gts', 'num_gts']: + res_str += '{:<10s}'.format(item) + print(res_str) + +def print_result(res_dict, label='avg'): + if label == 'avg': + res_str = 
'{:<22s}'.format(str(label)) + else: + res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)], str(label), chr(12288)) + + for item in ['prec', 'recall']: + res_str += '{:<10.4f}'.format(res_dict[item]) + for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']: + res_str += '{:<10d}'.format(res_dict[item]) + print(res_str) + +def evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = False): + iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \ + for gtsId in gts_boxes] + iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes))) + hit_map_prop_total = np.max(iou_map, axis=1) + hit_map_index_total = np.argmax(iou_map, axis=1) + + res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts'] + + for iou_threshold in iou_range: + if show_sub: + print_head(iou_threshold) + + iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total]) + average_results = {} + for label_id in label_range: + sub_results = {} + label_prop = np.array([k['label'] == label_id for k in res_boxes]) + label_gts = np.array([k['label'] == label_id for k in gts_boxes]) + sub_results['num_prop'] = sum(label_prop) + sub_results['num_gts'] = sum(label_gts) + if sub_results['num_prop'] == 0: + hit_prop_index = [] + else: + hit_prop_index = label_prop & iou_prop + sub_results['hit_prop'] = sum(hit_prop_index) + sub_results['hit_gts'] = len(set(hit_map_index_total[hit_prop_index])) + + sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \ + else sub_results['hit_prop'] * 1.0 / sub_results['num_prop'] + sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \ + else sub_results['hit_gts'] * 1.0 / sub_results['num_gts'] + if show_sub: + print_result(sub_results, label=label_id) + for item in res_dict: + if not item in average_results: + average_results[item] = 0 + average_results[item] += sub_results[item] + if len(label_range) == 1: # proposal 不需要输出average值 + continue + average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \ + else average_results['hit_prop'] * 1.0 / average_results['num_prop'] + average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \ + else average_results['hit_gts'] * 1.0 / average_results['num_gts'] + if show_sub: + print_result(average_results) + + average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \ + else 2 * average_results['prec'] * average_results['recall'] / \ + (average_results['prec'] + average_results['recall']) + return average_results + +def get_eval_results(predicts, gts_data, phase, iou_threshold = 0.3, score_threshold = 0.3, show_sub = False): + global mode + mode = phase + res_boxes = [] + gts_boxes = [] + for ped_data in predicts: + basename = ped_data['video_name'] + + # eval sub data + such_eval = False + for eval_name in eval_datasets: + if eval_name in basename: + such_eval = True + break + if not such_eval: + continue + + gts = gts_data[basename]['actions'] + if phase == 'proposal': + res_boxes.extend(convert_proposal(ped_data['bmn_results'], basename, score_threshold)) + gts_boxes.extend(convert_groundtruth(gts, basename, phase='proposal')) + label_range = [0] + iou_range = np.arange(0.1, 1, 0.1) + else: + res_boxes.extend(convert_classify(ped_data['action_results'], basename, iou_threshold, score_threshold)) + gts_boxes.extend(convert_groundtruth(gts, basename)) + label_range = range(1, len(label_index)) + iou_range = np.arange(0.5, 0.6, 0.1) + + eval_results = evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub = show_sub) + + return eval_results + + 
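process_result.py post-processes proposals with a 1-D temporal NMS: spans shorter than `min_frame_thread` or below `score_thresh` are dropped, then overlapping spans are suppressed by temporal IoU. A tiny worked example, assuming it is run from the `action_detect` directory so `utils` is importable:

```python
from utils.process_result import process_proposal

# [start_frame, end_frame, score] triples, e.g. at 5 fps
raw_props = [
    [0, 40, 0.90],     # kept (highest score)
    [5, 45, 0.80],     # temporal IoU with the first span is ~0.78 > 0.7 -> suppressed
    [100, 130, 0.70],  # disjoint in time -> kept
    [0, 3, 0.95],      # shorter than min_frame_thread -> filtered out before NMS
]
props = process_proposal(raw_props, min_frame_thread=5, nms_thresh=0.7, score_thresh=0.01)
print(props)
# [{'start': 0.0, 'end': 40.0, 'score': 0.9}, {'start': 100.0, 'end': 130.0, 'score': 0.7}]
```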
+if __name__ == "__main__": + result_file = sys.argv[1] + predicts = json.load(open(result_file, 'r', encoding='utf-8')) + gts_data = load_gts() + + get_eval_results(predicts, gts_data, 'proposal', + score_threshold = 0.03, + show_sub = True) + #get_eval_results(predicts, gts_data, 'actions') + + best_F1 = -0.1 + best_res = {} + best_iou_threshold = 0. + best_score_threshold = 0. + for iou_threshold in np.arange(0.1, 0.9, 0.1): + for score_threshold in np.arange(0.1, 1, 0.1): + avg_res = get_eval_results(predicts, gts_data, 'actions', + iou_threshold = iou_threshold, + score_threshold = score_threshold, + show_sub = False) + if best_F1 < avg_res['F1']: + best_F1 = avg_res['F1'] + best_res = avg_res + best_iou_threshold = iou_threshold + best_score_threshold = score_threshold + print("best iou threshold = {:.1f}".format(best_iou_threshold)) + print("best score threshold = {:.1f}".format(best_score_threshold)) + print('best F1 score = {:.4f}'.format(best_F1)) + print_head(0.5) + print_result(best_res) + + get_eval_results(predicts, gts_data, 'actions', iou_threshold = best_iou_threshold, + score_threshold = best_score_threshold, + show_sub = True) + + diff --git a/docs/src/applications/FootballAction/predict/predict.py b/docs/src/applications/FootballAction/predict/predict.py new file mode 100644 index 000000000..4812056a2 --- /dev/null +++ b/docs/src/applications/FootballAction/predict/predict.py @@ -0,0 +1,37 @@ +import os +import sys +import json + +sys.path.append('action_detect') +from action import ActionDetection + +if __name__ == '__main__': + #dataset_dir = "/workspace/PaddleVideo/applications/FootballAction/datasets/EuroCup2016" + dataset_dir = "../datasets/EuroCup2016" + + model_predict = ActionDetection(cfg_file="./configs/configs.yaml") + model_predict.load_model() + + video_url = os.path.join(dataset_dir, 'url_val.list') + with open(video_url, 'r') as f: + lines = f.readlines() + lines = [os.path.join(dataset_dir, k.strip()) for k in lines] + + results = [] + for line in lines: + video_name = line + print(video_name) + + imgs_path = video_name.replace(".mp4", "").replace("mp4", "frames") + pcm_path = video_name.replace(".mp4", ".pcm").replace("mp4", "pcm") + + bmn_results, action_results = model_predict.infer(imgs_path, pcm_path) + results.append({ + 'video_name': line, + 'bmn_results': bmn_results, + 'action_results': action_results + }) + + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) diff --git a/docs/src/applications/Ma-Net/README.md b/docs/src/applications/Ma-Net/README.md new file mode 100644 index 000000000..6b741a29a --- /dev/null +++ b/docs/src/applications/Ma-Net/README.md @@ -0,0 +1,47 @@ +[简体中文](README_cn.md) | English + +# Ma-Net + +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) + + + + +## Introduction + +This is the Paddle implementation of the CVPR2020 paper "[Memory aggregation networks for efficient interactive video object segmentation](https://arxiv.org/abs/2003.13246)". + +![avatar](images/1836-teaser.gif) + +This code currently supports model testing and model training on the DAVIS dataset; model inference on any given video will be provided in a few days.
+ + + +## Data + +Please refer to the DAVIS data download and preparation doc: [DAVIS-data](dataloaders/DAVIS2017.md) + +## Train and Test +- You can download the [pretrained model for stage1](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/DeeplabV3_coco.pdparams) and decompress it for stage1 training. + +- You can download the [trained model of stage1](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MaNet_davis2017_stage1.pdparams) and decompress it to start stage2 training directly, skipping stage1 training. + +``` +sh run_local.sh +``` + +- You can download [our model](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MANet_davis2017.pdparams) and decompress it for testing. + + + +Test accuracy on DAVIS2017: + +| J@60 | AUC | +| :---: | :---: | +| 0.761 | 0.749 | diff --git a/docs/src/applications/Ma-Net/README_cn.md b/docs/src/applications/Ma-Net/README_cn.md new file mode 100644 index 000000000..78d4d1c83 --- /dev/null +++ b/docs/src/applications/Ma-Net/README_cn.md @@ -0,0 +1,46 @@ +[English](README.md) | 简体中文 + +# Ma-Net视频切分模型 + +## 内容 + +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型测试](#模型测试) +- [模型推理](#模型推理) + + + + +## 模型简介 + +这是CVPR2020论文"[Memory aggregation networks for efficient interactive video object segmentation](https://arxiv.org/abs/2003.13246)"的Paddle实现。 + +![avatar](images/1836-teaser.gif) + +此代码目前支持在 DAVIS 数据集上进行模型测试和模型训练,并且将在之后提供对任何给定视频的模型推理。 + + +## 数据准备 + +DAVIS数据下载及准备请参考[DAVIS2017数据准备](dataloaders/DAVIS2017_cn.md) + + +## 模型训练与测试 +- 您可以下载[paddle版本的stage1预训练模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/DeeplabV3_coco.pdparams) 解压缩它以用于训练的第一阶段。 + +- 您可以下载[stage1训练结果模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MaNet_davis2017_stage1.pdparams) 解压缩它以直接训练的第二阶段跳过第一阶段的训练。 + + ```bash + sh run.sh + ``` + +- 您可以下载[我们的模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MANet_davis2017.pdparams) 解压缩它以用于测试。 + + +在 DAVIS2017上的测试精度: + +| J@60 | AUC | +| :---: | :---: | +| 0.761 | 0.749 | diff --git a/docs/src/applications/Ma-Net/config.py b/docs/src/applications/Ma-Net/config.py new file mode 100644 index 000000000..d584c273f --- /dev/null +++ b/docs/src/applications/Ma-Net/config.py @@ -0,0 +1,96 @@ +import paddle +import argparse +import os +import sys +import cv2 +import time + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +parser = argparse.ArgumentParser(description='intvos config') +parser.add_argument('--ROOT_DIR', + type=str, + default=os.path.abspath( + os.path.join(os.path.dirname("__file__")))) +parser.add_argument('--EXP_NAME', type=str, default='deeplabv3+coco') +parser.add_argument('--SAVE_RESULT_DIR', type=str, default='../afs/result/') +parser.add_argument('--SAVE_VOS_RESULT_DIR', type=str, default='') +parser.add_argument('--NUM_WORKER', type=int, default=4) +parser.add_argument('--KNNS', type=int, default=1) +parser.add_argument('--PRETRAINED_MODEL', + type=str, + default='./model_best.pth.tar') +parser.add_argument( + '--RESULT_ROOT', + type=str, + default=os.path.join('../afs/vos_result/result_total_80000')) +######DATA_CONFIG +parser.add_argument('--DATA_NAME', type=str, default='COCO2017') +parser.add_argument('--DATA_AUG', type=str2bool, default=True) +parser.add_argument('--DATA_WORKERS', type=int, default=4) +parser.add_argument('--DATA_RESCALE', type=int, default=416) +parser.add_argument('--DATA_RANDOMCROP',
type=int, default=416) +parser.add_argument('--DATA_RANDOMROTATION', type=int, default=0) +parser.add_argument('--DATA_RANDOM_H', type=int, default=10) +parser.add_argument('--DATA_RANDOM_S', type=int, default=10) +parser.add_argument('--DATA_RANDOM_V', type=int, default=10) +parser.add_argument('--DATA_RANDOMFLIP', type=float, default=0.5) +parser.add_argument('--DATA_ROOT', type=str, default='../data/DAVIS') + +######MODEL_CONFIG +parser.add_argument('--MODEL_NAME', type=str, default='deeplabv3plus') +parser.add_argument('--MODEL_BACKBONE', type=str, default='res101_atrous') +parser.add_argument('--MODEL_OUTPUT_STRIDE', type=int, default=16) +parser.add_argument('--MODEL_ASPP_OUTDIM', type=int, default=256) +parser.add_argument('--MODEL_SHORTCUT_DIM', type=int, default=48) +parser.add_argument('--MODEL_SHORTCUT_KERNEL', type=int, default=1) +parser.add_argument('--MODEL_NUM_CLASSES', type=int, default=21) +parser.add_argument('--MODEL_SEMANTIC_EMBEDDING_DIM', type=int, default=100) +parser.add_argument('--MODEL_HEAD_EMBEDDING_DIM', type=int, default=256) +parser.add_argument('--MODEL_LOCAL_DOWNSAMPLE', type=str2bool, default=True) +parser.add_argument('--MODEL_MAX_LOCAL_DISTANCE', type=int, default=12) +parser.add_argument('--MODEL_SELECT_PERCENT', type=float, default=0.8) +parser.add_argument('--MODEL_USEIntSeg', type=str2bool, default=False) + +######TRAIN_CONFIG +parser.add_argument('--TRAIN_LR', type=float, default=0.0007) +parser.add_argument('--TRAIN_LR_GAMMA', type=float, default=0.1) +parser.add_argument('--TRAIN_MOMENTUM', type=float, default=0.9) +parser.add_argument('--TRAIN_WEIGHT_DECAY', type=float, default=0.00004) +parser.add_argument('--TRAIN_POWER', type=float, default=0.9) +parser.add_argument('--TRAIN_BATCH_SIZE', type=int, default=2) +parser.add_argument('--TRAIN_SHUFFLE', type=str2bool, default=True) +parser.add_argument('--TRAIN_CLIP_GRAD_NORM', type=float, default=5.) +parser.add_argument('--TRAIN_MINEPOCH', type=int, default=9) +parser.add_argument('--TRAIN_TOTAL_STEPS', type=int, default=101000) +parser.add_argument('--TRAIN_LOSS_LAMBDA', type=int, default=0) +parser.add_argument('--TRAIN_TBLOG', type=str2bool, default=False) +parser.add_argument('--TRAIN_BN_MOM', type=float, + default=0.9997) # fixed. difs between paddle and torch. +parser.add_argument('--TRAIN_TOP_K_PERCENT_PIXELS', type=float, default=0.15) +parser.add_argument('--TRAIN_HARD_MINING_STEP', type=int, default=50000) +parser.add_argument('--TRAIN_LR_STEPSIZE', type=int, default=2000) +parser.add_argument('--TRAIN_INTER_USE_TRUE_RESULT', + type=str2bool, + default=True) +parser.add_argument('--TRAIN_RESUME_DIR', type=str, default='') + +parser.add_argument('--LOG_DIR', type=str, default=os.path.join('./log')) + +parser.add_argument('--TEST_CHECKPOINT', + type=str, + default='save_step_100000.pth') +parser.add_argument('--TEST_MODE', type=str2bool, default=False) + +cfg = parser.parse_args() +cfg.TRAIN_EPOCHS = int(200000 * cfg.TRAIN_BATCH_SIZE / 60.) 
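All of the flags above are parsed at import time into a module-level namespace, so downstream modules consume the configuration simply with `from config import cfg` (the dataloaders below do exactly this), and any value can be overridden on the command line of whichever script triggers the import. A small usage sketch; the training script name in the final comment is only a placeholder:

```python
# Reading the parsed configuration from another Ma-Net module.
# Note: parse_args() runs when config.py is imported, so unrecognised CLI
# flags passed to the importing script will raise an argparse error.
from config import cfg

print(cfg.DATA_ROOT)         # '../data/DAVIS' unless overridden
print(cfg.TRAIN_BATCH_SIZE)  # 2 by default
print(cfg.TRAIN_EPOCHS)      # derived above from TRAIN_BATCH_SIZE

# Overriding values from the shell (script name is illustrative only):
#   python train_stage1.py --DATA_ROOT /data/DAVIS --TRAIN_BATCH_SIZE 4 --DATA_AUG no
```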
diff --git a/docs/src/applications/Ma-Net/dataloaders/DAVIS2017.md b/docs/src/applications/Ma-Net/dataloaders/DAVIS2017.md new file mode 100644 index 000000000..d3202331b --- /dev/null +++ b/docs/src/applications/Ma-Net/dataloaders/DAVIS2017.md @@ -0,0 +1,27 @@ +[简体中文](../../zh-CN/dataset/DAVIS2017.md) | English + +# DAVIS2017 Data Preparation + +## 1.Data Download + +Download [DAVIS2017](https://data.vision.ee.ethz.ch/csergi/share/davis/DAVIS-2017-trainval-480p.zip) and [scribbles](https://data.vision.ee.ethz.ch/csergi/share/DAVIS-Interactive/DAVIS-2017-scribbles-trainval.zip) into one folder. Please refer to [DAVIS](https://davischallenge.org/davis2017/code.html). + +If you need the file "DAVIS2017/ImageSets/2017/v_a_l_instances.txt", please refer to the link [google]( https://drive.google.com/file/d/1aLPaQ_5lyAi3Lk3d2fOc_xewSrfcrQlc/view?usp=sharing) + +## 2.Folder Structure + +In the context of the whole project (for Ma-Net only), the folder structure will look like: + +```shell +PaddleVideo +├── configs +├── paddlevideo +├── docs +├── tools +├── data +│ └── DAVIS2017 +│ │ ├── Annotations +│ │ ├── ImageSets +│ │ ├── JPEGImages +│ │ └── Scribbles +``` diff --git a/docs/src/applications/Ma-Net/dataloaders/DAVIS2017_cn.md b/docs/src/applications/Ma-Net/dataloaders/DAVIS2017_cn.md new file mode 100644 index 000000000..018e94835 --- /dev/null +++ b/docs/src/applications/Ma-Net/dataloaders/DAVIS2017_cn.md @@ -0,0 +1,27 @@ +[English](../../en/dataset/DAVIS2017.md) | 简体中文 + +# DAVIS2017 数据集准备 + +## 1.数据下载 + +下载 [DAVIS2017](https://data.vision.ee.ethz.ch/csergi/share/davis/DAVIS-2017-trainval-480p.zip) 和 [scribbles](https://data.vision.ee.ethz.ch/csergi/share/DAVIS-Interactive/DAVIS-2017-scribbles-trainval.zip)到同一个文件夹中。请参阅[DAVIS](https://davischallenge.org/davis2017/code.html). + +如果您需要文件"DAVIS2017/ImageSets/2017/v_a_l_instances.txt",请参阅[google](https://drive.google.com/file/d/1aLPaQ_5lyAi3Lk3d2fOc_xewSrfcrQlc/view?usp=sharing)链接 + +## 2.目录结构 + +整个项目(Ma-Net)的目录结构如下所示: + +```shell +PaddleVideo +├── configs +├── paddlevideo +├── docs +├── tools +├── data +│ └── DAVIS2017 +│ │ ├── Annotations +│ │ ├── ImageSets +│ │ ├── JPEGImages +│ │ └── Scribbles +``` diff --git a/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py b/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py new file mode 100644 index 000000000..3a2890983 --- /dev/null +++ b/docs/src/applications/Ma-Net/dataloaders/custom_transforms_f.py @@ -0,0 +1,416 @@ +import os +import random +import cv2 +import numpy as np +import paddle +from PIL import Image +import dataloaders.helpers as helpers +from davisinteractive.utils.operations import bresenham +from paddle.vision.transforms import functional as F + +cv2.setNumThreads(0) +NEW_BRANCH = True + + +class Resize(object): + """Rescale the image in a sample to a given size. + + Args: + output_size (tuple or int): Desired output size. If tuple, output is + matched to output_size. If int, smaller of image edges is matched + to output_size keeping aspect ratio the same. 
+ """ + def __init__(self, output_size): + assert isinstance(output_size, (int, tuple)) + if isinstance(output_size, int): + self.output_size = (output_size, output_size) + else: + self.output_size = output_size + + # self.seg_interpolation = cv2.INTER_CUBIC if is_continuous else cv2.INTER_NEAREST + # self.fix = fix + + def __call__(self, sample): + img1 = sample['img1'] + # img2 = sample['img2'] + # ref_img=sample['ref_img'] + h, w = img1.shape[:2] + if self.output_size == (h, w): + return sample + + else: + new_h, new_w = self.output_size + new_h, new_w = int(new_h), int(new_w) + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + if elem == 'img1' or elem == 'img2' or elem == 'ref_img': + flagval = cv2.INTER_CUBIC + else: + flagval = cv2.INTER_NEAREST + + tmp = cv2.resize(tmp, dsize=(new_w, new_h), interpolation=flagval) + sample[elem] = tmp + + return sample + + +class RandomCrop(object): + """Crop randomly the image in a sample. + + Args: + output_size (tuple or int): Desired output size. If int, square crop + is made. + """ + def __init__(self, output_size, step=None): + assert isinstance(output_size, (int, tuple)) + if isinstance(output_size, int): + self.output_size = (output_size, output_size) + else: + assert len(output_size) == 2 + self.output_size = output_size + self.step = step + + def __call__(self, sample): + + image = sample['img1'] + h, w = image.shape[:2] + new_h, new_w = self.output_size + + new_h = h if new_h >= h else new_h + new_w = w if new_w >= w else new_w + is_contain_obj = False + + if self.step is None: + while not is_contain_obj: + # step += 1 + top = np.random.randint(0, h - new_h + 1) + left = np.random.randint(0, w - new_w + 1) + ref_scribble_label = sample['ref_scribble_label'] + new_ref_scribble_label = ref_scribble_label[top:top + new_h, + left:left + new_w] + if len(np.unique(new_ref_scribble_label)) == 1: + continue + else: + + for elem in sample.keys(): + if 'meta' in elem: + continue + + tmp = sample[elem] + tmp = tmp[top:top + new_h, left:left + new_w] + sample[elem] = tmp + break + else: + st = 0 + while not is_contain_obj and st < self.step: + st += 1 + top = np.random.randint(0, h - new_h + 1) + left = np.random.randint(0, w - new_w + 1) + ref_scribble_label = sample['ref_scribble_label'] + new_ref_scribble_label = ref_scribble_label[top:top + new_h, + left:left + new_w] + if len(np.unique( + new_ref_scribble_label)) == 1 or st < self.step - 1: + continue + else: + + for elem in sample.keys(): + if 'meta' in elem: + continue + + tmp = sample[elem] + tmp = tmp[top:top + new_h, left:left + new_w] + sample[elem] = tmp + break + + return sample + + +class ScaleNRotate(object): + """Scale (zoom-in, zoom-out) and Rotate the image and the ground truth. + Args: + two possibilities: + 1. rots (tuple): (minimum, maximum) rotation angle + scales (tuple): (minimum, maximum) scale + 2. 
rots [list]: list of fixed possible rotation angles + scales [list]: list of fixed possible scales + """ + def __init__(self, rots=(-30, 30), scales=(.75, 1.25)): + assert (isinstance(rots, type(scales))) + self.rots = rots + self.scales = scales + + def __call__(self, sample): + + if type(self.rots) == tuple: + # Continuous range of scales and rotations + rot = (self.rots[1] - self.rots[0]) * random.random() - \ + (self.rots[1] - self.rots[0]) / 2 + + sc = (self.scales[1] - self.scales[0]) * random.random() - \ + (self.scales[1] - self.scales[0]) / 2 + 1 + elif type(self.rots) == list: + # Fixed range of scales and rotations + rot = self.rots[random.randint(0, len(self.rots))] + sc = self.scales[random.randint(0, len(self.scales))] + + for elem in sample.keys(): + if 'meta' in elem: + continue + + tmp = sample[elem] + + h, w = tmp.shape[:2] + center = (w / 2, h / 2) + assert (center != 0) # Strange behaviour warpAffine + M = cv2.getRotationMatrix2D(center, rot, sc) + + if ((tmp == 0) | (tmp == 1)).all(): + flagval = cv2.INTER_NEAREST + else: + flagval = cv2.INTER_CUBIC + tmp = cv2.warpAffine(tmp, M, (w, h), flags=flagval) + + sample[elem] = tmp + + return sample + + +class RandomScale(object): + """Randomly resize the image and the ground truth to specified scales. + Args: + scales (list): the list of scales + """ + def __init__(self, scales=[0.75, 1, 1.25]): + self.scales = scales + + def __call__(self, sample): + + # Fixed range of scales + sc = self.scales[random.randint(0, len(self.scales) - 1)] + + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + + if elem == 'img1' or elem == 'img2' or elem == 'ref_img': + flagval = cv2.INTER_CUBIC + else: + flagval = cv2.INTER_NEAREST + + tmp = cv2.resize(tmp, None, fx=sc, fy=sc, interpolation=flagval) + + sample[elem] = tmp + + return sample + + +class RandomHorizontalFlip(object): + """Horizontally flip the given image and ground truth randomly with a probability of 0.5.""" + def __init__(self, prob): + self.p = prob + + def __call__(self, sample): + + if random.random() < self.p: + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + tmp = cv2.flip(tmp, flipCode=1) + sample[elem] = tmp + + return sample + + +class SubtractMeanImage(object): + def __init__(self, mean, change_channels=False): + self.mean = mean + self.change_channels = change_channels + + def __call__(self, sample): + for elem in sample.keys(): + if 'image' in elem: + if self.change_channels: + sample[elem] = sample[elem][:, :, [2, 1, 0]] + sample[elem] = np.subtract( + sample[elem], np.array(self.mean, dtype=np.float32)) + return sample + + def __str__(self): + return 'SubtractMeanImage' + str(self.mean) + + +class CustomScribbleInteractive(object): + def __init__(self, + scribbles, + first_frame, + dilation=9, + nocare_area=None, + bresenham=True, + use_previous_mask=False, + previous_mask_path=None): + + self.scribbles = scribbles + self.dilation = dilation + self.nocare_area = nocare_area + self.bresenham = bresenham + self.first_frame = first_frame + self.use_previous_mask = use_previous_mask + self.previous_mask_path = previous_mask_path + + def __call__(self, sample): + meta = sample['meta'] + frame_num = int(meta['frame_id']) + + im_size = meta['im_size'] + + # Initialize gt to zeros, no-care areas to ones + scr_gt = np.zeros(im_size) + scr_nocare = np.ones(im_size) + mask = np.zeros(im_size) + mask_neg = np.zeros(im_size) + + # Get all the scribbles for the current frame + for scribble in 
self.scribbles[frame_num]: + points_scribble = np.round( + np.array(scribble['path']) * np.array( + (im_size[1], im_size[0]))).astype(int) + if self.bresenham and len(points_scribble) > 1: + all_points = bresenham(points_scribble) + else: + all_points = points_scribble + + # Check if scribble is of same id to mark as foreground, otherwise as background + if scribble['object_id'] == meta['obj_id']: + mask[all_points[:, 1] - 1, all_points[:, 0] - 1] = 1 + else: + mask_neg[all_points[:, 1] - 1, all_points[:, 0] - 1] = 1 + if self.nocare_area is None: + nz = np.where(mask > 0) + nocare_area = int(.5 * np.sqrt( + (nz[0].max() - nz[0].min()) * (nz[1].max() - nz[1].min()))) + else: + nocare_area = 100 + + # In case we are reading the first human annotation round + if frame_num == self.first_frame: + # Compute dilated foreground, background, and no-care area + scr_gt, scr_nocare = helpers.gt_from_scribble( + mask, dilation=self.dilation, nocare_area=nocare_area) + scr_gt_neg, _ = helpers.gt_from_scribble(mask_neg, + dilation=self.dilation, + nocare_area=None) + + # Negative examples included in the training + scr_gt[scr_gt_neg > 0] = 0 + scr_nocare[scr_gt_neg > 0] = 0 + + # For annotation rounds generated by the robot + else: + # Compute dilated foreground, background, and no-care area + scr_gt_extra, _ = helpers.gt_from_scribble(mask, + dilation=self.dilation, + nocare_area=None) + scr_gt_neg, _ = helpers.gt_from_scribble(mask_neg, + dilation=self.dilation, + nocare_area=None) + + # Ignore pixels that are not foreground + if not self.use_previous_mask: + scr_nocare_extra = 1. - scr_gt_extra + else: + scr_nocare_extra = \ + (cv2.imread(os.path.join(self.previous_mask_path, meta['seq_name'], str(meta['obj_id']), + meta['frame_id'] + '.png'), 0) > 0.8 * 255).astype(np.float32) + + # Negative examples included in training + scr_gt_extra[scr_gt_neg > 0] = 0 + scr_nocare_extra[scr_gt_neg > 0] = 0 + + scr_gt = np.maximum(scr_gt, scr_gt_extra) + scr_nocare_extra[scr_gt > 0] = 0 + scr_nocare = np.minimum(scr_nocare, scr_nocare_extra) + + sample['scribble_gt'] = scr_gt + sample['scribble_void_pixels'] = scr_nocare + + return sample + + +class ToTensor(object): + """Convert ndarrays in sample to Tensors.""" + def __call__(self, sample): + + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + + if tmp.ndim == 2: + tmp = tmp[:, :, np.newaxis] + else: + tmp = tmp / 255. 
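+                # normalise below with the ImageNet channel mean and std (hard-coded for 3-channel images) before the HWC -> CHW transpose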
+ tmp -= (0.485, 0.456, 0.406) + tmp /= (0.229, 0.224, 0.225) + + # swap color axis because + # numpy image: H x W x C + # paddle image: C X H X W + + tmp = tmp.transpose([2, 0, 1]) + sample[elem] = paddle.to_tensor(tmp) + return sample + + +class GenerateEdge(object): + """ + """ + def __init__(self, edgesize=1): + self.edgesize = edgesize + + def __call__(self, sample): + """ + """ + if "label2" in sample: + label2 = sample['label2'] + kernel_size = 2 * self.edgesize + 1 + maskedge = np.zeros_like(label2) + + maskedge[np.where(label2[:, 1:] != label2[:, :-1])] = 1 + maskedge[np.where(label2[1:, :] != label2[:-1, :])] = 1 + maskedge = cv2.dilate( + maskedge, np.ones((kernel_size, kernel_size), dtype=np.uint8)) + sample["edge_mask"] = maskedge + else: + raise RuntimeError( + "We need parsing mask to generate the edge mask.") + return sample + + +class GenerateEdge_2(object): + """ + """ + def __init__(self, edgesize=1): + self.edgesize = edgesize + + def __call__(self, sample): + """ + """ + if "ref_frame_gt" in sample: + label2 = sample['ref_frame_gt'] + kernel_size = 2 * self.edgesize + 1 + maskedge = np.zeros_like(label2) + + maskedge[np.where(label2[:, 1:] != label2[:, :-1])] = 1 + maskedge[np.where(label2[1:, :] != label2[:-1, :])] = 1 + maskedge = cv2.dilate( + maskedge, np.ones((kernel_size, kernel_size), dtype=np.uint8)) + sample["edge_mask"] = maskedge + else: + raise RuntimeError( + "We need parsing mask to generate the edge mask.") + return sample diff --git a/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py b/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py new file mode 100644 index 000000000..ef16a14c4 --- /dev/null +++ b/docs/src/applications/Ma-Net/dataloaders/davis_2017_f.py @@ -0,0 +1,672 @@ +from __future__ import division +import json +import os +import shutil +import numpy as np +import paddle, cv2 +from random import choice +from paddle.io import Dataset +import json +from PIL import Image +from davisinteractive.utils.scribbles import scribbles2mask, annotated_frames +import sys + +sys.path.append("..") +from config import cfg +import time + + +class DAVIS2017_Test_Manager(): + def __init__(self, + split='val', + root=cfg.DATA_ROOT, + transform=None, + rgb=False, + seq_name=None): + self.split = split + self.db_root_dir = root + + self.rgb = rgb + self.transform = transform + self.seq_name = seq_name + + def get_image(self, idx): + frame_name = str(idx) + while len(frame_name) != 5: + frame_name = '0' + frame_name + imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/', + str(self.seq_name), frame_name + '.jpg') + img = cv2.imread(imgpath) + img = np.array(img, dtype=np.float32) + sample = {'img': img} + if self.transform is not None: + sample = self.transform(sample) + return sample + + +class DAVIS2017_Feature_Extract(Dataset): + def __init__(self, + split='val', + root=cfg.DATA_ROOT, + transform=None, + rgb=False, + seq_name=None): + self.split = split + self.db_root_dir = root + + self.rgb = rgb + self.transform = transform + self.seq_name = seq_name + self.img_list = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'JPEGImages/480p/', + str(seq_name)))) + + def __len__(self): + return len(self.img_list) + + def __getitem__(self, idx): + img = self.img_list[idx] + imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/', + str(self.seq_name), img) + current_img = cv2.imread(imgpath) + current_img = np.array(current_img, dtype=np.float32) + h, w, _ = current_img.shape + sample = {'img1': current_img} + sample['meta'] = { + 
'seq_name': self.seq_name, + 'h_w': (h, w), + 'img_path': imgpath + } + if self.transform is not None: + sample = self.transform(sample) + return sample + + +class DAVIS2017_VOS_Test(Dataset): + """ + """ + def __init__(self, + split='val', + root=cfg.DATA_ROOT, + transform=None, + rgb=False, + result_root=None, + seq_name=None): + self.split = split + self.db_root_dir = root + self.result_root = result_root + self.rgb = rgb + self.transform = transform + self.seq_name = seq_name + self.seq_list_file = os.path.join( + self.db_root_dir, 'ImageSets', '2017', + '_'.join(self.split) + '_instances.txt') + + self.seqs = [] + for splt in self.split: + with open( + os.path.join(self.db_root_dir, 'ImageSets', '2017', + self.split + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + self.seqs.extend(seqs_tmp) + + if not self._check_preprocess(): + self._preprocess() + + assert self.seq_name in self.seq_dict.keys( + ), '{} not in {} set.'.format(self.seq_name, '_'.join(self.split)) + names_img = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'JPEGImages/480p/', + str(seq_name)))) + img_list = list( + map(lambda x: os.path.join('JPEGImages/480p/', str(seq_name), x), + names_img)) + name_label = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'Annotations/480p/', + str(seq_name)))) + labels = list( + map(lambda x: os.path.join('Annotations/480p/', str(seq_name), x), + name_label)) + + if not os.path.isfile( + os.path.join(self.result_root, seq_name, name_label[0])): + if not os.path.exists(os.path.join(self.result_root, seq_name)): + os.makedirs(os.path.join(self.result_root, seq_name)) + + shutil.copy( + os.path.join(self.db_root_dir, labels[0]), + os.path.join(self.result_root, seq_name, name_label[0])) + else: + shutil.copy( + os.path.join(self.db_root_dir, labels[0]), + os.path.join(self.result_root, seq_name, name_label[0])) + self.first_img = names_img[0] + self.first_label = name_label[0] + self.img_list = names_img[1:] + + def __len__(self): + return len(self.img_list) + + def __getitem__(self, idx): + + img = self.img_list[idx] + imgpath = os.path.join(self.db_root_dir, 'JPEGImages/480p/', + str(self.seq_name), img) + + num_frame = int(img.split('.')[0]) + ref_img = os.path.join(self.db_root_dir, 'JPEGImages/480p/', + str(self.seq_name), self.first_img) + prev_frame = num_frame - 1 + prev_frame = str(prev_frame) + while len(prev_frame) != 5: + prev_frame = '0' + prev_frame + prev_img = os.path.join(self.db_root_dir, 'JPEGImages/480p/', + str(self.seq_name), + prev_frame + '.' + img.split('.')[-1]) + + current_img = cv2.imread(imgpath) + current_img = np.array(current_img, dtype=np.float32) + + ref_img = cv2.imread(ref_img) + ref_img = np.array(ref_img, dtype=np.float32) + + prev_img = cv2.imread(prev_img) + prev_img = np.array(prev_img, dtype=np.float32) + + ref_label = os.path.join(self.db_root_dir, 'Annotations/480p/', + str(self.seq_name), self.first_label) + ref_label = Image.open(ref_label) + ref_label = np.array(ref_label, dtype=np.uint8) + + prev_label = os.path.join( + self.result_root, str(self.seq_name), + prev_frame + '.' 
+ self.first_label.split('.')[-1]) + prev_label = Image.open(prev_label) + prev_label = np.array(prev_label, dtype=np.uint8) + + obj_num = self.seq_dict[self.seq_name][-1] + sample = { + 'ref_img': ref_img, + 'prev_img': prev_img, + 'current_img': current_img, + 'ref_label': ref_label, + 'prev_label': prev_label + } + sample['meta'] = { + 'seq_name': self.seq_name, + 'frame_num': num_frame, + 'obj_num': obj_num, + 'current_name': img + } + if self.transform is not None: + sample = self.transform(sample) + return sample + + def _check_preprocess(self): + _seq_list_file = self.seq_list_file + if not os.path.isfile(_seq_list_file): + return False + else: + self.seq_dict = json.load(open(self.seq_list_file, 'r')) + return True + + def _preprocess(self): + self.seq_dict = {} + for seq in self.seqs: + # Read object masks and get number of objects + name_label = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'Annotations/480p/', seq))) + label_path = os.path.join(self.db_root_dir, 'Annotations/480p/', + seq, name_label[0]) + _mask = np.array(Image.open(label_path)) + _mask_ids = np.unique(_mask) + n_obj = _mask_ids[-1] + + self.seq_dict[seq] = list(range(1, n_obj + 1)) + + with open(self.seq_list_file, 'w') as outfile: + outfile.write('{{\n\t"{:s}": {:s}'.format( + self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]]))) + for ii in range(1, len(self.seqs)): + outfile.write(',\n\t"{:s}": {:s}'.format( + self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]]))) + outfile.write('\n}\n') + + print('Preprocessing finished') + + +class DAVIS2017_VOS_Train(Dataset): + """DAVIS2017 dataset for training + + Return: imgs: N*2*3*H*W,label: N*2*1*H*W, seq-name: N, frame_num:N + """ + def __init__(self, + split='train', + root=cfg.DATA_ROOT, + transform=None, + rgb=False): + self.split = split + self.db_root_dir = root + self.rgb = rgb + self.transform = transform + self.seq_list_file = os.path.join( + self.db_root_dir, 'ImageSets', '2017', + '_'.join(self.split) + '_instances.txt') + self.seqs = [] + for splt in self.split: + with open( + os.path.join(self.db_root_dir, 'ImageSets', '2017', + self.split + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + self.seqs.extend(seqs_tmp) + self.imglistdic = {} + if not self._check_preprocess(): + self._preprocess() + self.sample_list = [] + for seq_name in self.seqs: + images = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'JPEGImages/480p/', + seq_name.strip()))) + images_path = list( + map( + lambda x: os.path.join('JPEGImages/480p/', seq_name.strip(), + x), images)) + lab = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'Annotations/480p/', + seq_name.strip()))) + lab_path = list( + map( + lambda x: os.path.join('Annotations/480p/', seq_name.strip( + ), x), lab)) + self.imglistdic[seq_name] = (images, lab) + + def __len__(self): + return len(self.seqs) + + def __getitem__(self, idx): + seqname = self.seqs[idx] + imagelist, lablist = self.imglistdic[seqname] + prev_img = np.random.choice(imagelist[:-1], 1) + prev_img = prev_img[0] + frame_num = int(prev_img.split('.')[0]) + 1 + next_frame = str(frame_num) + while len(next_frame) != 5: + next_frame = '0' + next_frame + + ###############################Processing two adjacent frames and labels + img2path = os.path.join('JPEGImages/480p/', seqname, + next_frame + '.' 
+ prev_img.split('.')[-1]) + img2 = cv2.imread(os.path.join(self.db_root_dir, img2path)) + img2 = np.array(img2, dtype=np.float32) + + imgpath = os.path.join('JPEGImages/480p/', seqname, prev_img) + img1 = cv2.imread(os.path.join(self.db_root_dir, imgpath)) + img1 = np.array(img1, dtype=np.float32) + ############### + labelpath = os.path.join( + 'Annotations/480p/', seqname, + prev_img.split('.')[0] + '.' + lablist[0].split('.')[-1]) + label1 = Image.open(os.path.join(self.db_root_dir, labelpath)) + label2path = os.path.join('Annotations/480p/', seqname, + next_frame + '.' + lablist[0].split('.')[-1]) + label2 = Image.open(os.path.join(self.db_root_dir, label2path)) + + label1 = np.array(label1, dtype=np.uint8) + label2 = np.array(label2, dtype=np.uint8) + + ################### + ref_img = np.random.choice(imagelist, 1) + ref_img = ref_img[0] + ref_img_name = ref_img + ref_scribble_label = Image.open( + os.path.join( + self.db_root_dir, 'Annotations/480p/', seqname, + ref_img_name.split('.')[0] + '.' + lablist[0].split('.')[-1])) + ref_scribble_label = np.array(ref_scribble_label, dtype=np.uint8) + + while len(np.unique(ref_scribble_label)) < self.seq_dict[seqname][ + -1] + 1 or ref_img == prev_img or ref_img == ( + next_frame + '.' + prev_img.split('.')[-1]): + ref_img = np.random.choice(imagelist, 1) + ref_img = ref_img[0] + ref_img_name = ref_img + ref_scribble_label = Image.open( + os.path.join( + self.db_root_dir, 'Annotations/480p/', seqname, + ref_img_name.split('.')[0] + '.' + + lablist[0].split('.')[-1])) + ref_scribble_label = np.array(ref_scribble_label, dtype=np.int64) + ref_img = os.path.join('JPEGImages/480p/', seqname, ref_img) + ref_img = cv2.imread(os.path.join(self.db_root_dir, ref_img)) + ref_img = np.array(ref_img, dtype=np.float32) + #### + ################### + if self.rgb: + img1 = img1[:, :, [2, 1, 0]] + img2 = img2[:, :, [2, 1, 0]] + ref_img = ref_img[:, :, [2, 1, 0]] + obj_num = self.seq_dict[seqname][-1] + + sample = { + 'ref_img': ref_img, + 'img1': img1, + 'img2': img2, + 'ref_scribble_label': ref_scribble_label, + 'label1': label1, + 'label2': label2 + } + + sample['meta'] = { + 'seq_name': seqname, + 'frame_num': frame_num, + 'obj_num': obj_num + } + if self.transform is not None: + sample = self.transform(sample) + sample['ref_scribble_label'] = paddle.to_tensor( + sample['ref_scribble_label'], dtype='int64') + sample['label1'] = paddle.to_tensor(sample['label1'], dtype='int64') + sample['label2'] = paddle.to_tensor(sample['label2'], dtype='int64') + return sample + + ######################## + + def _check_preprocess(self): + _seq_list_file = self.seq_list_file + if not os.path.isfile(_seq_list_file): + return False + else: + self.seq_dict = json.load(open(self.seq_list_file, 'r')) + return True + + def _preprocess(self): + self.seq_dict = {} + for seq in self.seqs: + # Read object masks and get number of objects + name_label = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'Annotations/480p/', seq))) + label_path = os.path.join(self.db_root_dir, 'Annotations/480p/', + seq, name_label[0]) + _mask = np.array(Image.open(label_path)) + _mask_ids = np.unique(_mask) + n_obj = _mask_ids[-1] + + self.seq_dict[seq] = list(range(1, n_obj + 1)) + + with open(self.seq_list_file, 'w') as outfile: + outfile.write('{{\n\t"{:s}": {:s}'.format( + self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]]))) + for ii in range(1, len(self.seqs)): + outfile.write(',\n\t"{:s}": {:s}'.format( + self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]]))) + 
outfile.write('\n}\n') + + print('Preprocessing finished') + + +class DAVIS2017_Train(Dataset): + """DAVIS2017 dataset for training + + Return: imgs: N*2*3*H*W,label: N*2*1*H*W, seq-name: N, frame_num:N + """ + def __init__(self, + split='train', + root=cfg.DATA_ROOT, + transform=None, + rgb=False): + self.split = split + self.db_root_dir = root + self.rgb = rgb + self.transform = transform + self.seq_list_file = os.path.join( + self.db_root_dir, 'ImageSets', '2017', + '_'.join(self.split) + '_instances.txt') + self.seqs = [] + for splt in self.split: + with open( + os.path.join(self.db_root_dir, 'ImageSets', '2017', + self.split + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + self.seqs.extend(seqs_tmp) + + if not self._check_preprocess(): + self._preprocess() + self.sample_list = [] + for seq_name in self.seqs: + images = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'JPEGImages/480p/', + seq_name.strip()))) + images_path = list( + map( + lambda x: os.path.join('JPEGImages/480p/', seq_name.strip(), + x), images)) + lab = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'Annotations/480p/', + seq_name.strip()))) + lab_path = list( + map( + lambda x: os.path.join('Annotations/480p/', seq_name.strip( + ), x), lab)) + + for img_path, label_path in zip(images_path[:-1], lab_path[:-1]): + tmp_dic = { + 'img': img_path, + 'label': label_path, + 'seq_name': seq_name, + 'frame_num': img_path.split('/')[-1].split('.')[0] + } + self.sample_list.append(tmp_dic) + + def __len__(self): + return len(self.sample_list) + + def __getitem__(self, idx): + tmp_sample = self.sample_list[idx] + imgpath = tmp_sample['img'] + labelpath = tmp_sample['label'] + seqname = tmp_sample['seq_name'] + frame_num = int(tmp_sample['frame_num']) + 1 + + next_frame = str(frame_num) + while len(next_frame) != 5: + next_frame = '0' + next_frame + ###############################Processing two adjacent frames and labels + img2path = os.path.join('JPEGImages/480p/', seqname, + next_frame + '.' + imgpath.split('.')[-1]) + img2 = cv2.imread(os.path.join(self.db_root_dir, img2path)) + img2 = np.array(img2, dtype=np.float32) + + img1 = cv2.imread(os.path.join(self.db_root_dir, imgpath)) + img1 = np.array(img1, dtype=np.float32) + ############### + label1 = Image.open(os.path.join(self.db_root_dir, labelpath)) + label2path = os.path.join('Annotations/480p/', seqname, + next_frame + '.' 
+ labelpath.split('.')[-1]) + label2 = Image.open(os.path.join(self.db_root_dir, label2path)) + + label1 = np.array( + label1, dtype=np.int32 + ) # fixed, uint8->int32, because layers.stack does not support uint8 + label2 = np.array( + label2, dtype=np.int32 + ) # fixed, uint8->int32, because layers.stack does not support uint8 + ################### + ref_tmp_dic = self.ref_frame_dic[seqname] + ref_img = ref_tmp_dic['ref_frame'] + ref_scribble_label = ref_tmp_dic['scribble_label'] + ref_img = cv2.imread(os.path.join(self.db_root_dir, ref_img)) + ref_img = np.array(ref_img, dtype=np.float32) + ref_frame_gt = ref_tmp_dic['ref_frame_gt'] + ref_frame_gt = Image.open(os.path.join(self.db_root_dir, ref_frame_gt)) + ref_frame_gt = np.array( + ref_frame_gt, dtype=np.int32 + ) # fixed, uint8->int32, because layers.stack does not support uint8 + ref_frame_num = ref_tmp_dic['ref_frame_num'] + + ################### + if self.rgb: + img1 = img1[:, :, [2, 1, 0]] + img2 = img2[:, :, [2, 1, 0]] + ref_img = ref_img[:, :, [2, 1, 0]] + obj_num = self.seq_dict[seqname][-1] + sample = { + 'ref_img': ref_img, + 'img1': img1, + 'img2': img2, + 'ref_scribble_label': ref_scribble_label, + 'label1': label1, + 'label2': label2, + 'ref_frame_gt': ref_frame_gt + } + if 'prev_round_label' in ref_tmp_dic: + prev_round_label = ref_tmp_dic['prev_round_label'] + prev_round_label = prev_round_label.squeeze() + prev_round_label = prev_round_label.numpy() + sample = { + 'ref_img': ref_img, + 'img1': img1, + 'img2': img2, + 'ref_scribble_label': ref_scribble_label, + 'label1': label1, + 'label2': label2, + 'ref_frame_gt': ref_frame_gt, + 'prev_round_label': prev_round_label + } + + sample['meta'] = { + 'seq_name': seqname, + 'frame_num': frame_num, + 'obj_num': obj_num, + 'ref_frame_num': ref_frame_num + } + if self.transform is not None: + sample = self.transform(sample) + + return sample + + def update_ref_frame_and_label(self, + round_scribble=None, + frame_num=None, + prev_round_label_dic=None): + ##########Update reference frame and scribbles + for seq in self.seqs: + scribble = round_scribble[seq] + if frame_num is None: + scr_frame = annotated_frames(scribble)[0] + else: + scr_frame = frame_num[seq] + scr_frame = int(scr_frame) + scr_f = str(scr_frame) + while len(scr_f) != 5: + scr_f = '0' + scr_f + ref_frame_path = os.path.join('JPEGImages/480p', seq, + scr_f + '.jpg') + ####################### + ref_frame_gt = os.path.join('Annotations/480p/', seq, + scr_f + '.png') + ######################### + ref_tmp = cv2.imread(os.path.join(self.db_root_dir, ref_frame_path)) + h_, w_ = ref_tmp.shape[:2] + scribble_masks = scribbles2mask(scribble, (h_, w_)) + if frame_num is None: + + scribble_label = scribble_masks[scr_frame] + else: + scribble_label = scribble_masks[0] + self.ref_frame_dic[seq] = { + 'ref_frame': ref_frame_path, + 'scribble_label': scribble_label, + 'ref_frame_gt': ref_frame_gt, + 'ref_frame_num': scr_frame + } + if prev_round_label_dic is not None: + self.ref_frame_dic[seq] = { + 'ref_frame': ref_frame_path, + 'scribble_label': scribble_label, + 'ref_frame_gt': ref_frame_gt, + 'ref_frame_num': scr_frame, + 'prev_round_label': prev_round_label_dic[seq] + } + + def init_ref_frame_dic(self): + self.ref_frame_dic = {} + scribbles_path = os.path.join(self.db_root_dir, 'Scribbles') + for seq in self.seqs: + selected_json = np.random.choice( + ['001.json', '002.json', '003.json'], 1) + selected_json = selected_json[0] + scribble = os.path.join(self.db_root_dir, 'Scribbles', seq, + selected_json) + with open(scribble) 
as f: + scribble = json.load(f) + # print(scribble) + scr_frame = annotated_frames(scribble)[0] + scr_f = str(scr_frame) + while len(scr_f) != 5: + scr_f = '0' + scr_f + + ref_frame_path = os.path.join('JPEGImages/480p', seq, + scr_f + '.jpg') + ref_tmp = cv2.imread( + os.path.join(self.db_root_dir, ref_frame_path)) + h_, w_ = ref_tmp.shape[:2] + scribble_masks = scribbles2mask(scribble, (h_, w_)) + ######################## + ref_frame_gt = os.path.join('Annotations/480p/', seq, + scr_f + '.png') + ######################## + + scribble_label = scribble_masks[scr_frame] + self.ref_frame_dic[seq] = { + 'ref_frame': ref_frame_path, + 'scribble_label': scribble_label, + 'ref_frame_gt': ref_frame_gt, + 'ref_frame_num': scr_frame + } + + ######################## + + def _check_preprocess(self): + _seq_list_file = self.seq_list_file + if not os.path.isfile(_seq_list_file): + return False + else: + self.seq_dict = json.load(open(self.seq_list_file, 'r')) + return True + + def _preprocess(self): + self.seq_dict = {} + for seq in self.seqs: + # Read object masks and get number of objects + name_label = np.sort( + os.listdir( + os.path.join(self.db_root_dir, 'Annotations/480p/', seq))) + label_path = os.path.join(self.db_root_dir, 'Annotations/480p/', + seq, name_label[0]) + _mask = np.array(Image.open(label_path)) + _mask_ids = np.unique(_mask) + n_obj = _mask_ids[-1] + + self.seq_dict[seq] = list(range(1, n_obj + 1)) + + with open(self.seq_list_file, 'w') as outfile: + outfile.write('{{\n\t"{:s}": {:s}'.format( + self.seqs[0], json.dumps(self.seq_dict[self.seqs[0]]))) + for ii in range(1, len(self.seqs)): + outfile.write(',\n\t"{:s}": {:s}'.format( + self.seqs[ii], json.dumps(self.seq_dict[self.seqs[ii]]))) + outfile.write('\n}\n') + + print('Preprocessing finished') diff --git a/docs/src/applications/Ma-Net/dataloaders/helpers.py b/docs/src/applications/Ma-Net/dataloaders/helpers.py new file mode 100644 index 000000000..2bef5a84f --- /dev/null +++ b/docs/src/applications/Ma-Net/dataloaders/helpers.py @@ -0,0 +1,81 @@ +import numpy as np +import cv2 + + +def tens2image(im): + tmp = np.squeeze(im.numpy()) + if tmp.ndim == 2: + return tmp + else: + return tmp.transpose((1, 2, 0)) + + +def overlay_mask(im, ma, color=np.array([255, 0, 0]) / 255.0): + assert np.max(im) <= 1.0 + + ma = ma.astype(np.bool) + im = im.astype(np.float32) + + alpha = 0.5 + + fg = im * alpha + np.ones( + im.shape) * (1 - alpha) * color # np.array([0,0,255])/255.0 + + # Whiten background + alpha = 1.0 + bg = im.copy() + bg[ma == 0] = im[ma == 0] * alpha + np.ones(im[ma == 0].shape) * (1 - alpha) + bg[ma == 1] = fg[ma == 1] + + # [-2:] is s trick to be compatible both with opencv 2 and 3 + contours = cv2.findContours(ma.copy().astype(np.uint8), cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE)[-2:] + cv2.drawContours(bg, contours[0], -1, (0.0, 0.0, 0.0), 1) + + return bg + + +def im_normalize(im): + """ + Normalize image + """ + imn = (im - im.min()) / max((im.max() - im.min()), 1e-8) + return imn + + +def construct_name(p, prefix): + """ + Construct the name of the model + p: dictionary of parameters + prefix: the prefix + name: the name of the model - manually add ".pth" to follow the convention + """ + name = prefix + for key in p.keys(): + if (type(p[key]) != tuple) and (type(p[key]) != list): + name = name + '_' + str(key) + '-' + str(p[key]) + else: + name = name + '_' + str(key) + '-' + str(p[key][0]) + return name + + +def gt_from_scribble(scr, dilation=11, nocare_area=21): + + # Compute foreground + if scr.max() == 1: + 
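+        # binary scribble: dilate it with an elliptical (dilation x dilation) kernel to form the foreground estimate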
kernel_fg = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (dilation, dilation)) + fg = cv2.dilate(scr.astype(np.uint8), + kernel=kernel_fg).astype(scr.dtype) + else: + fg = scr + + # Compute nocare area + if nocare_area is None: + nocare = None + else: + kernel_nc = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (nocare_area, nocare_area)) + nocare = cv2.dilate(fg, kernel=kernel_nc) - fg + + return fg, nocare diff --git a/docs/src/applications/Ma-Net/dataloaders/samplers.py b/docs/src/applications/Ma-Net/dataloaders/samplers.py new file mode 100644 index 000000000..c50260b33 --- /dev/null +++ b/docs/src/applications/Ma-Net/dataloaders/samplers.py @@ -0,0 +1,42 @@ +from __future__ import absolute_import +from collections import defaultdict +import numpy as np + +import paddle +from paddle.io import Sampler + + +class RandomIdentitySampler(Sampler): + """ + Randomly sample N identities, then for each identity, + randomly sample K instances, therefore batch size is N*K. + + Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/data/sampler.py. + + Args: + data_source (Dataset): dataset to sample from. + num_instances (int): number of instances per identity. + """ + def __init__(self, sample_list, num_instances=1): + self.sample_list = sample_list + self.num_instances = num_instances + self.index_dic = defaultdict(list) + for index, tmp_dic in enumerate(self.sample_list): + pid = tmp_dic['seq_name'] + self.index_dic[pid].append(index) + self.pids = list(self.index_dic.keys()) + self.num_identities = len(self.pids) + + def __iter__(self): + indices = np.random.permutation(self.num_identities) + ret = [] + for i in indices: + pid = self.pids[i] + t = self.index_dic[pid] + replace = False if len(t) >= self.num_instances else True + t = np.random.choice(t, size=self.num_instances, replace=replace) + ret.extend(t) + return iter(ret) + + def __len__(self): + return self.num_identities * self.num_instances diff --git a/docs/src/applications/Ma-Net/networks/IntVOS.py b/docs/src/applications/Ma-Net/networks/IntVOS.py new file mode 100644 index 000000000..aa526b65b --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/IntVOS.py @@ -0,0 +1,927 @@ +import os +import numpy as np +import paddle +import paddle.nn as nn +import sys + +sys.path.append("..") + +from config import cfg +import time +import paddle.nn.functional as F +from utils.api import int_, float_, long_ +from utils.api import kaiming_normal_ + +#############################################################GLOBAL_DIST_MAP +MODEL_UNFOLD = True +WRONG_LABEL_PADDING_DISTANCE = 1e20 + + +def _pairwise_distances(x, y, ys=None): + """Computes pairwise squared l2 distances between tensors x and y. + Args: + x: Tensor of shape [n, feature_dim]. + y: Tensor of shape [m, feature_dim]. + Returns: + Float32 distances tensor of shape [n, m]. + """ + + xs = paddle.sum(x * x, 1) + xs = xs.unsqueeze(1) + if ys is None: + ys = paddle.sum(y * y, 1) + ys = ys.unsqueeze(0) + else: + ys = ys + d = xs + ys - 2. * paddle.matmul(x, paddle.t(y)) + return d, ys + + +################## +def _flattened_pairwise_distances(reference_embeddings, query_embeddings, ys): + """Calculates flattened tensor of pairwise distances between ref and query. + Args: + reference_embeddings: Tensor of shape [..., embedding_dim], + the embedding vectors for the reference frame + query_embeddings: Tensor of shape [n_query_images, height, width, + embedding_dim], the embedding vectors for the query frames. 
+ Returns: + A distance tensor of shape [reference_embeddings.size / embedding_dim, + query_embeddings.size / embedding_dim] + """ + embedding_dim = query_embeddings.shape[-1] + reference_embeddings = reference_embeddings.reshape([-1, embedding_dim]) + first_dim = -1 + query_embeddings = query_embeddings.reshape([first_dim, embedding_dim]) + dists, ys = _pairwise_distances(query_embeddings, reference_embeddings, ys) + return dists, ys + + +def _nn_features_per_object_for_chunk(reference_embeddings, query_embeddings, + wrong_label_mask, k_nearest_neighbors, + ys): + """Extracts features for each object using nearest neighbor attention. + Args: + reference_embeddings: Tensor of shape [n_chunk, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: Tensor of shape [m_chunk, embedding_dim], the embedding + vectors for the query frames. + wrong_label_mask: + k_nearest_neighbors: Integer, the number of nearest neighbors to use. + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [m_chunk, n_objects, feature_dim]. + """ + # reference_embeddings_key = reference_embeddings + # query_embeddings_key = query_embeddings + dists, ys = _flattened_pairwise_distances(reference_embeddings, + query_embeddings, ys) + + dists = (paddle.unsqueeze(dists, 1) + + paddle.unsqueeze(float_(wrong_label_mask), 0) * + WRONG_LABEL_PADDING_DISTANCE) + if k_nearest_neighbors == 1: + features = paddle.min(dists, 2, keepdim=True) + else: + dists, _ = paddle.topk(-dists, k=k_nearest_neighbors, axis=2) + dists = -dists + valid_mask = (dists < WRONG_LABEL_PADDING_DISTANCE) + masked_dists = dists * valid_mask.float() + pad_dist = paddle.max(masked_dists, axis=2, keepdim=True)[0].tile( + (1, 1, masked_dists.shape[-1])) + dists = paddle.where(valid_mask, dists, pad_dist) + # take mean of distances + features = paddle.mean(dists, axis=2, keepdim=True) + + return features, ys + + +### +def _selected_pixel(ref_labels_flat, ref_emb_flat): + index_list = paddle.arange(len(ref_labels_flat)) + index_list = index_list + index_ = paddle.masked_select(index_list, ref_labels_flat != -1) + + index_ = long_(index_) + ref_labels_flat = paddle.index_select(ref_labels_flat, index_, 0) + ref_emb_flat = paddle.index_select(ref_emb_flat, index_, 0) + + return ref_labels_flat, ref_emb_flat + + +### + + +def _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + ref_obj_ids, k_nearest_neighbors, n_chunks): + """Calculates the nearest neighbor features per object in chunks to save mem. + Uses chunking to bound the memory use. + Args: + reference_embeddings_flat: Tensor of shape [n, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings_flat: Tensor of shape [m, embedding_dim], the embedding + vectors for the query frames. + reference_labels_flat: Tensor of shape [n], the class labels of the + reference frame. + ref_obj_ids: int tensor of unique object ids in the reference labels. + k_nearest_neighbors: Integer, the number of nearest neighbors to use. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [m, n_objects, feature_dim]. 
+ """ + + chunk_size = int_( + np.ceil((float_(query_embeddings_flat.shape[0]) / n_chunks).numpy())) + if cfg.TEST_MODE: + reference_labels_flat, reference_embeddings_flat = _selected_pixel( + reference_labels_flat, reference_embeddings_flat) + wrong_label_mask = (reference_labels_flat != paddle.unsqueeze( + ref_obj_ids, 1)) + all_features = [] + for n in range(n_chunks): + if n == 0: + ys = None + if n_chunks == 1: + query_embeddings_flat_chunk = query_embeddings_flat + else: + chunk_start = n * chunk_size + chunk_end = (n + 1) * chunk_size + query_embeddings_flat_chunk = query_embeddings_flat[ + chunk_start:chunk_end] + features, ys = _nn_features_per_object_for_chunk( + reference_embeddings_flat, query_embeddings_flat_chunk, + wrong_label_mask, k_nearest_neighbors, ys) + all_features.append(features) + if n_chunks == 1: + nn_features = all_features[0] + else: + nn_features = paddle.concat(all_features, axis=0) + return nn_features + + +def nearest_neighbor_features_per_object(reference_embeddings, + query_embeddings, + reference_labels, + k_nearest_neighbors, + gt_ids=None, + n_chunks=100): + """Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + reference_embeddings: Tensor of shape [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: Tensor of shape [n_query_images, height, width, + embedding_dim], the embedding vectors for the query frames. + reference_labels: Tensor of shape [height, width, 1], the class labels of + the reference frame. + max_neighbors_per_object: Integer, the maximum number of candidates + for the nearest neighbor query per object after subsampling, + or 0 for no subsampling. + k_nearest_neighbors: Integer, the number of nearest neighbors to use. + gt_ids: Int tensor of shape [n_objs] of the sorted unique ground truth + ids in the first frame. If None, it will be derived from + reference_labels. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [n_query_images, height, width, n_objects, feature_dim]. + gt_ids: An int32 tensor of the unique sorted object ids present + in the reference labels. 
+ """ + + assert (reference_embeddings.shape[:2] == reference_labels.shape[:2]) + h, w, _ = query_embeddings.shape + reference_labels_flat = reference_labels.reshape([-1]) + if gt_ids is None: + ref_obj_ids = paddle.unique(reference_labels_flat)[-1] + ref_obj_ids = np.arange(0, ref_obj_ids + 1) + gt_ids = paddle.to_tensor(ref_obj_ids) + gt_ids = int_(gt_ids) + else: + gt_ids = int_(paddle.arange(0, gt_ids + 1)) + + embedding_dim = query_embeddings.shape[-1] + query_embeddings_flat = query_embeddings.reshape([-1, embedding_dim]) + reference_embeddings_flat = reference_embeddings.reshape( + [-1, embedding_dim]) + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + gt_ids, k_nearest_neighbors, n_chunks) + nn_features_dim = nn_features.shape[-1] + nn_features = nn_features.reshape( + [1, h, w, gt_ids.shape[0], nn_features_dim]) + return nn_features.cuda(), gt_ids + + +########################################################################LOCAL_DIST_MAP +def local_pairwise_distances(x, y, max_distance=9): + """Computes pairwise squared l2 distances using a local search window. + Optimized implementation using correlation_cost. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. + y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + Returns: + Float32 distances tensor of shape + [height, width, (2 * max_distance + 1) ** 2]. + + """ + if cfg.MODEL_LOCAL_DOWNSAMPLE: + ##### + ori_h, ori_w, _ = x.shape + x = x.transpose([2, 0, 1]).unsqueeze(0) + + x = F.avg_pool2d(x, (2, 2), (2, 2)) + y = y.transpose([2, 0, 1]).unsqueeze(0) + y = F.avg_pool2d(y, (2, 2), (2, 2)) + + x = x.squeeze(0).transpose([1, 2, 0]) + y = y.squeeze(0).transpose([1, 2, 0]) + corr = cross_correlate(x, y, max_distance=max_distance) + xs = paddle.sum(x * x, 2, keepdim=True) + + ys = paddle.sum(y * y, 2, keepdim=True) + ones_ys = paddle.ones_like(ys) + ys = cross_correlate(ones_ys, ys, max_distance=max_distance) + d = xs + ys - 2 * corr + # Boundary should be set to Inf. + tmp = paddle.zeros_like(d) + boundary = paddle.equal( + cross_correlate(ones_ys, ones_ys, max_distance=max_distance), 0) + d = paddle.where(boundary, tmp.fill_(float_('inf')), d) + d = (paddle.nn.functional.sigmoid(d) - 0.5) * 2 + d = d.transpose([2, 0, 1]).unsqueeze(0) + d = F.interpolate(d, + size=(ori_h, ori_w), + mode='bilinear', + align_corners=True) + d = d.squeeze(0).transpose([1, 2, 0]) + else: + corr = cross_correlate(x, y, max_distance=max_distance) + xs = paddle.sum(x * x, 2, keepdim=True) + + ys = paddle.sum(y * y, 2, keepdim=True) + ones_ys = paddle.ones_like(ys) + ys = cross_correlate(ones_ys, ys, max_distance=max_distance) + d = xs + ys - 2 * corr + # Boundary should be set to Inf. + tmp = paddle.zeros_like(d) + boundary = paddle.equal( + cross_correlate(ones_ys, ones_ys, max_distance=max_distance), 0) + d = paddle.where(boundary, tmp.fill_(float_('inf')), d) + return d + + +def local_pairwise_distances2(x, y, max_distance=9): + """Computes pairwise squared l2 distances using a local search window. + Naive implementation using map_fn. + Used as a slow fallback for when correlation_cost is not available. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. + y: Float32 tensor of shape [height, width, feature_dim]. 
+      max_distance: Integer, the maximum distance in pixel coordinates
+        per dimension which is considered to be in the search window.
+    Returns:
+      Float32 distances tensor of shape
+        [height, width, (2 * max_distance + 1) ** 2].
+    """
+    if cfg.MODEL_LOCAL_DOWNSAMPLE:
+        ori_h, ori_w, _ = x.shape
+        x = paddle.transpose(x, [2, 0, 1]).unsqueeze(0)
+        x = F.avg_pool2d(x, (2, 2), (2, 2))
+        y = paddle.transpose(y, [2, 0, 1]).unsqueeze(0)
+        y = F.avg_pool2d(y, (2, 2), (2, 2))
+
+        _, channels, height, width = x.shape
+        padding_val = 1e20
+        padded_y = F.pad(
+            y, (max_distance, max_distance, max_distance, max_distance),
+            mode='constant',
+            value=padding_val)
+        offset_y = F.unfold(padded_y, kernel_sizes=[height, width]).reshape(
+            [1, channels, height, width, -1])
+        x = x.reshape([1, channels, height, width, 1])
+        minus = x - offset_y
+        dists = paddle.sum(paddle.multiply(minus, minus),
+                           axis=1).reshape([1, height, width,
+                                            -1]).transpose([0, 3, 1, 2])
+        dists = (paddle.nn.functional.sigmoid(dists) - 0.5) * 2
+        dists = F.interpolate(dists,
+                              size=[ori_h, ori_w],
+                              mode='bilinear',
+                              align_corners=True)
+        dists = dists.squeeze(0).transpose([1, 2, 0])
+
+    else:
+        padding_val = 1e20
+        padded_y = nn.functional.pad(
+            y, (0, 0, max_distance, max_distance, max_distance, max_distance),
+            mode='constant',
+            value=padding_val)
+        height, width, _ = x.shape
+        dists = []
+        for y_start in range(2 * max_distance + 1):
+            y_end = y_start + height
+            y_slice = padded_y[y_start:y_end]
+            for x_start in range(2 * max_distance + 1):
+                x_end = x_start + width
+                offset_y = y_slice[:, x_start:x_end]
+                # paddle.sum / paddle.stack take `axis`, not `dim`
+                dist = paddle.sum(paddle.pow((x - offset_y), 2), axis=2)
+                dists.append(dist)
+        dists = paddle.stack(dists, axis=2)
+
+    return dists
+
+
+# Stub standing in for an external spatial correlation sampler op; with this
+# placeholder cross_correlate() below cannot run as-is, and
+# local_pairwise_distances2 is used as the slow fallback (see its docstring).
+class SpatialCorrelationSampler:
+    pass
+
+
+def cross_correlate(x, y, max_distance=9):
+    """Efficiently computes the cross correlation of x and y.
+    Optimized implementation using correlation_cost.
+    Note that we do not normalize by the feature dimension.
+    Args:
+      x: Float32 tensor of shape [height, width, feature_dim].
+      y: Float32 tensor of shape [height, width, feature_dim].
+      max_distance: Integer, the maximum distance in pixel coordinates
+        per dimension which is considered to be in the search window.
+    Returns:
+      Float32 tensor of shape [height, width, (2 * max_distance + 1) ** 2].
+    """
+    corr_op = SpatialCorrelationSampler(kernel_size=1,
+                                        patch_size=2 * max_distance + 1,
+                                        stride=1,
+                                        dilation_patch=1,
+                                        padding=0)
+
+    xs = x.transpose([2, 0, 1])
+    xs = paddle.unsqueeze(xs, 0)
+    ys = y.transpose([2, 0, 1])
+    ys = paddle.unsqueeze(ys, 0)
+    corr = corr_op(xs, ys)
+    bs, _, _, hh, ww = corr.shape
+    corr = corr.reshape([bs, -1, hh, ww])
+    corr = paddle.squeeze(corr, 0)
+    corr = corr.transpose([1, 2, 0])
+    return corr
+
+
+def local_previous_frame_nearest_neighbor_features_per_object(
+        prev_frame_embedding,
+        query_embedding,
+        prev_frame_labels,
+        gt_ids,
+        max_distance=12):
+    """Computes nearest neighbor features while only allowing local matches.
+    Args:
+      prev_frame_embedding: Tensor of shape [height, width, embedding_dim],
+        the embedding vectors for the last frame.
+      query_embedding: Tensor of shape [height, width, embedding_dim],
+        the embedding vectors for the query frames.
+      prev_frame_labels: Tensor of shape [height, width, 1], the class labels of
+        the previous frame.
+      gt_ids: Int Tensor of shape [n_objs] of the sorted unique ground truth
+        ids in the first frame.
+      max_distance: Integer, the maximum distance allowed for local matching.
+ Returns: + nn_features: A float32 np.array of nearest neighbor features of shape + [1, height, width, n_objects, 1]. + """ + + d = local_pairwise_distances2(query_embedding, + prev_frame_embedding, + max_distance=max_distance) + height, width = prev_frame_embedding.shape[:2] + + if MODEL_UNFOLD: + + labels = float_(prev_frame_labels).transpose([2, 0, 1]).unsqueeze(0) + padded_labels = F.pad(labels, ( + 2 * max_distance, + 2 * max_distance, + 2 * max_distance, + 2 * max_distance, + )) + offset_labels = F.unfold(padded_labels, + kernel_sizes=[height, width], + strides=[2, 2]).reshape([height, width, -1, 1]) + offset_masks = paddle.equal( + offset_labels, + float_(gt_ids).unsqueeze(0).unsqueeze(0).unsqueeze(0)) + else: + + masks = paddle.equal(prev_frame_labels, + gt_ids.unsqueeze(0).unsqueeze(0)) + padded_masks = nn.functional.pad(masks, ( + 0, + 0, + max_distance, + max_distance, + max_distance, + max_distance, + )) + offset_masks = [] + for y_start in range(2 * max_distance + 1): + y_end = y_start + height + masks_slice = padded_masks[y_start:y_end] + for x_start in range(2 * max_distance + 1): + x_end = x_start + width + offset_mask = masks_slice[:, x_start:x_end] + offset_masks.append(offset_mask) + offset_masks = paddle.stack(offset_masks, axis=2) + + d_tiled = d.unsqueeze(-1).tile((1, 1, 1, gt_ids.shape[0])) + pad = paddle.ones_like(d_tiled) + d_masked = paddle.where(offset_masks, d_tiled, pad) + dists = paddle.min(d_masked, axis=2) + dists = dists.reshape([1, height, width, gt_ids.shape[0], 1]) + + return dists + + +############################################################## + + +################# +class _res_block(nn.Layer): + def __init__(self, in_dim, out_dim): + super(_res_block, self).__init__() + self.conv1 = nn.Conv2D(in_dim, + out_dim, + kernel_size=3, + stride=1, + padding=1) + self.relu1 = nn.ReLU() + self.bn1 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM) + self.conv2 = nn.Conv2D(out_dim, + out_dim, + kernel_size=3, + stride=1, + padding=1) + self.relu2 = nn.ReLU() + self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM) + + def forward(self, x): + res = x + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + x += res + return x + + +#################### +class IntSegHead(nn.Layer): + def __init__(self, + in_dim=(cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3), + emb_dim=cfg.MODEL_HEAD_EMBEDDING_DIM): + super(IntSegHead, self).__init__() + self.conv1 = nn.Conv2D(in_dim, + emb_dim, + kernel_size=7, + stride=1, + padding=3) + self.bn1 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg.TRAIN_BN_MOM) + self.relu1 = nn.ReLU(True) + self.res1 = _res_block(emb_dim, emb_dim) + self.res2 = _res_block(emb_dim, emb_dim) + self.conv2 = nn.Conv2D(256, emb_dim, kernel_size=3, stride=1, padding=1) + self.bn2 = paddle.nn.BatchNorm2D(emb_dim, momentum=cfg.TRAIN_BN_MOM) + self.relu2 = nn.ReLU(True) + self.conv3 = nn.Conv2D(emb_dim, 1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.res1(x) + x = self.res2(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + x = self.conv3(x) + return x + + +class _split_separable_conv2d(nn.Layer): + def __init__(self, in_dim, out_dim, kernel_size=7): + super(_split_separable_conv2d, self).__init__() + self.conv1 = nn.Conv2D(in_dim, + in_dim, + kernel_size=kernel_size, + stride=1, + padding=int((kernel_size - 1) / 2), + groups=in_dim) + self.relu1 = nn.ReLU(True) + self.bn1 = paddle.nn.BatchNorm2D(in_dim, 
momentum=cfg.TRAIN_BN_MOM) + self.conv2 = nn.Conv2D(in_dim, out_dim, kernel_size=1, stride=1) + self.relu2 = nn.ReLU(True) + self.bn2 = paddle.nn.BatchNorm2D(out_dim, momentum=cfg.TRAIN_BN_MOM) + kaiming_normal_(self.conv1.weight, mode='fan_out', nonlinearity='relu') + kaiming_normal_(self.conv2.weight, mode='fan_out', nonlinearity='relu') + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + return x + + +class DynamicSegHead(nn.Layer): + def __init__(self, + in_dim=(cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3), + embed_dim=cfg.MODEL_HEAD_EMBEDDING_DIM, + kernel_size=1): + super(DynamicSegHead, self).__init__() + self.layer1 = _split_separable_conv2d(in_dim, embed_dim) + self.layer2 = _split_separable_conv2d(embed_dim, embed_dim) + self.layer3 = _split_separable_conv2d(embed_dim, embed_dim) + self.layer4 = _split_separable_conv2d(embed_dim, embed_dim) + self.conv = nn.Conv2D(embed_dim, 1, 1, 1) + kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu') + + def forward(self, x): + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.conv(x) + return x + + +################## + + +############### +class IntVOS(nn.Layer): + def __init__(self, cfg, feature_extracter): + super(IntVOS, self).__init__() + self.feature_extracter = feature_extracter ##embedding extractor + self.feature_extracter.cls_conv = nn.Sequential() + self.feature_extracter.upsample4 = nn.Sequential() + self.semantic_embedding = None + self.seperate_conv = nn.Conv2D(cfg.MODEL_ASPP_OUTDIM, + cfg.MODEL_ASPP_OUTDIM, + kernel_size=3, + stride=1, + padding=1, + groups=cfg.MODEL_ASPP_OUTDIM) + self.bn1 = paddle.nn.BatchNorm2D(cfg.MODEL_ASPP_OUTDIM, + momentum=cfg.TRAIN_BN_MOM) + self.relu1 = nn.ReLU(True) + self.embedding_conv = nn.Conv2D(cfg.MODEL_ASPP_OUTDIM, + cfg.MODEL_SEMANTIC_EMBEDDING_DIM, 1, 1) + self.relu2 = nn.ReLU(True) + self.bn2 = paddle.nn.BatchNorm2D(cfg.MODEL_SEMANTIC_EMBEDDING_DIM, + momentum=cfg.TRAIN_BN_MOM) + self.semantic_embedding = nn.Sequential(*[ + self.seperate_conv, self.bn1, self.relu1, self.embedding_conv, + self.bn2, self.relu2 + ]) + + for m in self.semantic_embedding: + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + + self.dynamic_seghead = DynamicSegHead() # propagation segm head + if cfg.MODEL_USEIntSeg: + self.inter_seghead = IntSegHead( + in_dim=cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3) + else: + self.inter_seghead = DynamicSegHead( + in_dim=cfg.MODEL_SEMANTIC_EMBEDDING_DIM + + 2) # interaction segm head + + def forward(self, + x=None, + ref_scribble_label=None, + previous_frame_mask=None, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=None, + gt_ids=None, + k_nearest_neighbors=1, + global_map_tmp_dic=None, + local_map_dics=None, + interaction_num=None, + start_annotated_frame=None, + frame_num=None): + + x = self.extract_feature(x) + # print('extract_feature:', x.mean().item()) + ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split( + x, num_or_sections=3, axis=0) + + if global_map_tmp_dic is None: + dic = self.prop_seghead( + ref_frame_embedding, previous_frame_embedding, + current_frame_embedding, ref_scribble_label, + previous_frame_mask, normalize_nearest_neighbor_distances, + use_local_map, seq_names, gt_ids, k_nearest_neighbors, + global_map_tmp_dic, local_map_dics, interaction_num, + start_annotated_frame, frame_num, self.dynamic_seghead) + 
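+            # dic maps each sequence name to propagation logits of shape
+            # [1, n_objects, h, w] produced by dynamic_seghead (see prop_seghead).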
return dic + + else: + dic, global_map_tmp_dic = self.prop_seghead( + ref_frame_embedding, previous_frame_embedding, + current_frame_embedding, ref_scribble_label, + previous_frame_mask, normalize_nearest_neighbor_distances, + use_local_map, seq_names, gt_ids, k_nearest_neighbors, + global_map_tmp_dic, local_map_dics, interaction_num, + start_annotated_frame, frame_num, self.dynamic_seghead) + return dic, global_map_tmp_dic + + def extract_feature(self, x): + x = self.feature_extracter(x) + x = self.semantic_embedding(x) + return x + + def prop_seghead(self, + ref_frame_embedding=None, + previous_frame_embedding=None, + current_frame_embedding=None, + ref_scribble_label=None, + previous_frame_mask=None, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=None, + gt_ids=None, + k_nearest_neighbors=1, + global_map_tmp_dic=None, + local_map_dics=None, + interaction_num=None, + start_annotated_frame=None, + frame_num=None, + dynamic_seghead=None): + """return: feature_embedding,global_match_map,local_match_map,previous_frame_mask""" + ############### + + global_map_tmp_dic = global_map_tmp_dic + dic_tmp = {} + bs, c, h, w = current_frame_embedding.shape + if cfg.TEST_MODE: + scale_ref_scribble_label = float_(ref_scribble_label) + else: + scale_ref_scribble_label = paddle.nn.functional.interpolate( + float_(ref_scribble_label), size=(h, w), mode='nearest') + scale_ref_scribble_label = int_(scale_ref_scribble_label) + scale_previous_frame_label = paddle.nn.functional.interpolate( + float_(previous_frame_mask), size=(h, w), mode='nearest') + # print(scale_previous_frame_label.sum()) # xx + # print(previous_frame_mask.sum().item()) # xx + scale_previous_frame_label = int_(scale_previous_frame_label) + # print(scale_previous_frame_label.sum().item()) # xx + for n in range(bs): + seq_current_frame_embedding = current_frame_embedding[n] + seq_ref_frame_embedding = ref_frame_embedding[n] + seq_prev_frame_embedding = previous_frame_embedding[n] + + seq_ref_frame_embedding = seq_ref_frame_embedding.transpose( + [1, 2, 0]) + seq_current_frame_embedding = seq_current_frame_embedding.transpose( + [1, 2, 0]) + seq_ref_scribble_label = scale_ref_scribble_label[n].transpose( + [1, 2, 0]) + #########Global Map + nn_features_n, ref_obj_ids = nearest_neighbor_features_per_object( + reference_embeddings=seq_ref_frame_embedding, + query_embeddings=seq_current_frame_embedding, + reference_labels=seq_ref_scribble_label, + k_nearest_neighbors=k_nearest_neighbors, + gt_ids=gt_ids[n], + n_chunks=10) + if normalize_nearest_neighbor_distances: + nn_features_n = (paddle.nn.functional.sigmoid(nn_features_n) - + 0.5) * 2 + + if global_map_tmp_dic is not None: ###when testing, use global map memory + if seq_names[n] not in global_map_tmp_dic: + global_map_tmp_dic[seq_names[n]] = paddle.ones_like( + nn_features_n).tile([104, 1, 1, 1, 1]) + nn_features_n = paddle.where( + nn_features_n <= + global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0), + nn_features_n, + global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0)) + + global_map_tmp_dic[seq_names[n]][ + frame_num[n]] = nn_features_n.detach()[0] + + #########################Local dist map + seq_prev_frame_embedding = seq_prev_frame_embedding.transpose( + [1, 2, 0]) + seq_previous_frame_label = scale_previous_frame_label[n].transpose( + [1, 2, 0]) + + if use_local_map: + prev_frame_nn_features_n = local_previous_frame_nearest_neighbor_features_per_object( + prev_frame_embedding=seq_prev_frame_embedding, + 
                    query_embedding=seq_current_frame_embedding,
+                    prev_frame_labels=seq_previous_frame_label,
+                    gt_ids=ref_obj_ids,
+                    max_distance=cfg.MODEL_MAX_LOCAL_DISTANCE)
+            else:
+                prev_frame_nn_features_n, _ = nearest_neighbor_features_per_object(
+                    reference_embeddings=seq_prev_frame_embedding,
+                    query_embeddings=seq_current_frame_embedding,
+                    reference_labels=seq_previous_frame_label,
+                    k_nearest_neighbors=k_nearest_neighbors,
+                    gt_ids=gt_ids[n],
+                    n_chunks=20)
+                prev_frame_nn_features_n = (
+                    paddle.nn.functional.sigmoid(prev_frame_nn_features_n) -
+                    0.5) * 2
+
+
+# print(prev_frame_nn_features_n.mean().item(), prev_frame_nn_features_n.shape, interaction_num)  # o
+#############
+            if local_map_dics is not None:  ##When testing, use local map memory
+                local_map_tmp_dic, local_map_dist_dic = local_map_dics
+                if seq_names[n] not in local_map_dist_dic:
+                    print(seq_names[n], 'not in local_map_dist_dic')
+                    # paddle.zeros expects the shape as a list/tuple
+                    local_map_dist_dic[seq_names[n]] = paddle.zeros([104, 9])
+                if seq_names[n] not in local_map_tmp_dic:
+                    print(seq_names[n], 'not in local_map_tmp_dic')
+                    local_map_tmp_dic[seq_names[n]] = paddle.zeros_like(
+                        prev_frame_nn_features_n).unsqueeze(0).tile(
+                            [104, 9, 1, 1, 1, 1])
+                local_map_dist_dic[seq_names[n]][
+                    frame_num[n], interaction_num -
+                    1] = 1.0 / (abs(frame_num[n] - start_annotated_frame)
+                                )  # bugs fixed.
+                local_map_tmp_dic[seq_names[n]][
+                    frame_num[n],
+                    interaction_num - 1] = prev_frame_nn_features_n.squeeze(
+                        0).detach()  # bugs fixed.
+                if interaction_num == 1:
+                    prev_frame_nn_features_n = local_map_tmp_dic[seq_names[n]][
+                        frame_num[n]][interaction_num - 1]
+                    prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(
+                        0)
+                else:
+                    if local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 1] > \
+                            local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - 2]:
+                        prev_frame_nn_features_n = local_map_tmp_dic[
+                            seq_names[n]][frame_num[n]][interaction_num - 1]
+                        prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(
+                            0)
+                    else:
+                        prev_frame_nn_features_n = local_map_tmp_dic[
+                            seq_names[n]][frame_num[n]][interaction_num - 2]
+                        prev_frame_nn_features_n = prev_frame_nn_features_n.unsqueeze(
+                            0)
+
+                local_map_dics = (local_map_tmp_dic, local_map_dist_dic)
+
+            to_cat_previous_frame = (
+                float_(seq_previous_frame_label) == float_(ref_obj_ids)
+            )  # float comparison?
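+            # The per-object input to dynamic_seghead is built below by
+            # concatenating, along the channel axis: the current-frame embedding
+            # [n_obj, C, h, w], the global map [n_obj, 1, h, w], the local map
+            # [n_obj, 1, h, w] and the previous-frame mask [n_obj, 1, h, w],
+            # i.e. C + 3 channels, matching DynamicSegHead's default
+            # in_dim = cfg.MODEL_SEMANTIC_EMBEDDING_DIM + 3.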
+ + to_cat_current_frame_embedding = current_frame_embedding[ + n].unsqueeze(0).tile((ref_obj_ids.shape[0], 1, 1, 1)) + + to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose( + [2, 3, 0, 1]) + to_cat_previous_frame = float_( + to_cat_previous_frame.unsqueeze(-1).transpose([2, 3, 0, 1])) + to_cat_prev_frame_nn_feature_n = prev_frame_nn_features_n.squeeze( + 0).transpose([2, 3, 0, 1]) + to_cat = paddle.concat( + (to_cat_current_frame_embedding, to_cat_nn_feature_n, + to_cat_prev_frame_nn_feature_n, to_cat_previous_frame), 1) + pred_ = dynamic_seghead(to_cat) + pred_ = pred_.transpose([1, 0, 2, 3]) + dic_tmp[seq_names[n]] = pred_ + + if global_map_tmp_dic is None: + return dic_tmp + else: + if local_map_dics is None: + return dic_tmp, global_map_tmp_dic + else: + return dic_tmp, global_map_tmp_dic, local_map_dics + + def int_seghead(self, + ref_frame_embedding=None, + ref_scribble_label=None, + prev_round_label=None, + normalize_nearest_neighbor_distances=True, + global_map_tmp_dic=None, + local_map_dics=None, + interaction_num=None, + seq_names=None, + gt_ids=None, + k_nearest_neighbors=1, + frame_num=None, + first_inter=True): + dic_tmp = {} + bs, c, h, w = ref_frame_embedding.shape + scale_ref_scribble_label = paddle.nn.functional.interpolate( + float_(ref_scribble_label), size=(h, w), mode='nearest') + scale_ref_scribble_label = int_(scale_ref_scribble_label) + if not first_inter: + scale_prev_round_label = paddle.nn.functional.interpolate( + float_(prev_round_label), size=(h, w), mode='nearest') + scale_prev_round_label = int_(scale_prev_round_label) + n_chunks = 500 + for n in range(bs): + + gt_id = paddle.arange(0, gt_ids[n] + 1) + + gt_id = int_(gt_id) + + seq_ref_frame_embedding = ref_frame_embedding[n] + + ########################Local dist map + seq_ref_frame_embedding = paddle.transpose(seq_ref_frame_embedding, + [1, 2, 0]) + seq_ref_scribble_label = paddle.transpose( + scale_ref_scribble_label[n], [1, 2, 0]) + nn_features_n = local_previous_frame_nearest_neighbor_features_per_object( + prev_frame_embedding=seq_ref_frame_embedding, + query_embedding=seq_ref_frame_embedding, + prev_frame_labels=seq_ref_scribble_label, + gt_ids=gt_id, + max_distance=cfg.MODEL_MAX_LOCAL_DISTANCE) + + ####### + ######################Global map update + if seq_names[n] not in global_map_tmp_dic: + global_map_tmp_dic[seq_names[n]] = paddle.ones_like( + nn_features_n).tile([104, 1, 1, 1, 1]) + nn_features_n_ = paddle.where( + nn_features_n <= + global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0), + nn_features_n, + global_map_tmp_dic[seq_names[n]][frame_num[n]].unsqueeze(0)) + + ### + + ### + global_map_tmp_dic[seq_names[n]][ + frame_num[n]] = nn_features_n_.detach()[0] + ##################Local map update + if local_map_dics is not None: + local_map_tmp_dic, local_map_dist_dic = local_map_dics + if seq_names[n] not in local_map_dist_dic: + local_map_dist_dic[seq_names[n]] = paddle.zeros([104, 9]) + if seq_names[n] not in local_map_tmp_dic: + local_map_tmp_dic[seq_names[n]] = paddle.ones_like( + nn_features_n).unsqueeze(0).tile([104, 9, 1, 1, 1, 1]) + local_map_dist_dic[seq_names[n]][frame_num[n]][interaction_num - + 1] = 0 + + local_map_dics = (local_map_tmp_dic, local_map_dist_dic) + + ################## + to_cat_current_frame_embedding = ref_frame_embedding[n].unsqueeze( + 0).tile((gt_id.shape[0], 1, 1, 1)) + to_cat_nn_feature_n = nn_features_n.squeeze(0).transpose( + [2, 3, 0, 1]) + + to_cat_scribble_mask_to_cat = ( + float_(seq_ref_scribble_label) == float_(gt_id) + ) # float 
comparision? + to_cat_scribble_mask_to_cat = float_( + to_cat_scribble_mask_to_cat.unsqueeze(-1).transpose( + [2, 3, 0, 1])) + if not first_inter: + seq_prev_round_label = scale_prev_round_label[n].transpose( + [1, 2, 0]) + + to_cat_prev_round_to_cat = ( + float_(seq_prev_round_label) == float_(gt_id) + ) # float comparision? + to_cat_prev_round_to_cat = float_( + to_cat_prev_round_to_cat.unsqueeze(-1).transpose( + [2, 3, 0, 1])) + else: + to_cat_prev_round_to_cat = paddle.zeros_like( + to_cat_scribble_mask_to_cat) + to_cat_prev_round_to_cat[0] = 1. + + to_cat = paddle.concat( + (to_cat_current_frame_embedding, to_cat_scribble_mask_to_cat, + to_cat_prev_round_to_cat), 1) + + pred_ = self.inter_seghead(to_cat) + pred_ = pred_.transpose([1, 0, 2, 3]) + dic_tmp[seq_names[n]] = pred_ + if local_map_dics is None: + return dic_tmp + else: + return dic_tmp, local_map_dics diff --git a/docs/src/applications/Ma-Net/networks/aspp.py b/docs/src/applications/Ma-Net/networks/aspp.py new file mode 100644 index 000000000..a4f289f77 --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/aspp.py @@ -0,0 +1,123 @@ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from utils.api import kaiming_normal_ + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation, + BatchNorm): + super(_ASPPModule, self).__init__() + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = BatchNorm(planes) + self.relu = nn.ReLU(True) + + self._init_weight() + + def forward(self, x): + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + from utils.api import fill_ + fill_(m.weight, 1) + from utils.api import zero_ + zero_(m.bias) + + +class ASPP(nn.Layer): + def __init__(self, backbone, output_stride, BatchNorm): + super(ASPP, self).__init__() + if backbone == 'drn': + inplanes = 512 + elif backbone == 'mobilenet': + inplanes = 320 + else: + inplanes = 2048 + if output_stride == 16: + dilations = [1, 6, 12, 18] + elif output_stride == 8: + dilations = [1, 12, 24, 36] + else: + raise NotImplementedError + + self.aspp1 = _ASPPModule(inplanes, + 256, + 1, + padding=0, + dilation=dilations[0], + BatchNorm=BatchNorm) + self.aspp2 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.aspp3 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.aspp4 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False), + BatchNorm(256), nn.ReLU()) + self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False) + self.bn1 = BatchNorm(256) + self.relu = nn.ReLU(True) + self.dropout = nn.Dropout(0.1) + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat((x1, x2, x3, x4, x5), axis=1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return x + return self.dropout(x) + + def 
_init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + from utils.api import fill_ + fill_(m.weight, 1) + from utils.api import zero_ + zero_(m.bias) + + +def build_aspp(backbone, output_stride, BatchNorm): + return ASPP(backbone, output_stride, BatchNorm) diff --git a/docs/src/applications/Ma-Net/networks/backbone/__init__.py b/docs/src/applications/Ma-Net/networks/backbone/__init__.py new file mode 100644 index 000000000..ea7dd8f5f --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/backbone/__init__.py @@ -0,0 +1,14 @@ +from networks.backbone import resnet, xception, drn, mobilenet + + +def build_backbone(backbone, output_stride, BatchNorm): + if backbone == 'resnet': + return resnet.ResNet101(output_stride, BatchNorm) + elif backbone == 'xception': + return xception.AlignedXception(output_stride, BatchNorm) + elif backbone == 'drn': + return drn.drn_d_54(BatchNorm) + elif backbone == 'mobilenet': + return mobilenet.MobileNetV2(output_stride, BatchNorm) + else: + raise NotImplementedError diff --git a/docs/src/applications/Ma-Net/networks/backbone/drn.py b/docs/src/applications/Ma-Net/networks/backbone/drn.py new file mode 100644 index 000000000..18d764ef3 --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/backbone/drn.py @@ -0,0 +1,400 @@ +import paddle.nn as nn +import math + +webroot = 'https://tigress-web.princeton.edu/~fy/drn/models/' + +model_urls = { + 'resnet50': 'https://download.pypaddle.org/models/resnet50-19c8e357.pth', + 'drn-c-26': webroot + 'drn_c_26-ddedf421.pth', + 'drn-c-42': webroot + 'drn_c_42-9d336e8c.pth', + 'drn-c-58': webroot + 'drn_c_58-0a53a92c.pth', + 'drn-d-22': webroot + 'drn_d_22-4bd2f8ea.pth', + 'drn-d-38': webroot + 'drn_d_38-eebb45f0.pth', + 'drn-d-54': webroot + 'drn_d_54-0e0534ff.pth', + 'drn-d-105': webroot + 'drn_d_105-12b40979.pth' +} + + +def conv3x3(in_planes, out_planes, stride=1, padding=1, dilation=1): + return nn.Conv2D(in_planes, out_planes, kernel_size=3, stride=stride, + padding=padding, bias_attr=False, dilation=dilation) + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, + dilation=(1, 1), residual=True, BatchNorm=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride, + padding=dilation[0], dilation=dilation[0]) + self.bn1 = BatchNorm(planes) + self.relu = nn.ReLU() + self.conv2 = conv3x3(planes, planes, + padding=dilation[1], dilation=dilation[1]) + self.bn2 = BatchNorm(planes) + self.downsample = downsample + self.stride = stride + self.residual = residual + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + if self.residual: + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, + dilation=(1, 1), residual=True, BatchNorm=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = BatchNorm(planes) + self.conv2 = nn.Conv2D(planes, planes, kernel_size=3, stride=stride, + padding=dilation[1], bias_attr=False, + dilation=dilation[1]) + self.bn2 = BatchNorm(planes) + self.conv3 = nn.Conv2D(planes, planes * 4, kernel_size=1, bias_attr=False) + self.bn3 = BatchNorm(planes * 4) 
+ self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class DRN(nn.Layer): + + def __init__(self, block, layers, arch='D', + channels=(16, 32, 64, 128, 256, 512, 512, 512), + BatchNorm=None): + super(DRN, self).__init__() + self.inplanes = channels[0] + self.out_dim = channels[-1] + self.arch = arch + + if arch == 'C': + self.conv1 = nn.Conv2D(3, channels[0], kernel_size=7, stride=1, + padding=3, bias_attr=False) + self.bn1 = BatchNorm(channels[0]) + self.relu = nn.ReLU() + + self.layer1 = self._make_layer( + BasicBlock, channels[0], layers[0], stride=1, BatchNorm=BatchNorm) + self.layer2 = self._make_layer( + BasicBlock, channels[1], layers[1], stride=2, BatchNorm=BatchNorm) + + elif arch == 'D': + self.layer0 = nn.Sequential( + nn.Conv2D(3, channels[0], kernel_size=7, stride=1, padding=3, + bias_attr=False), + BatchNorm(channels[0]), + nn.ReLU() + ) + + self.layer1 = self._make_conv_layers( + channels[0], layers[0], stride=1, BatchNorm=BatchNorm) + self.layer2 = self._make_conv_layers( + channels[1], layers[1], stride=2, BatchNorm=BatchNorm) + + self.layer3 = self._make_layer(block, channels[2], layers[2], stride=2, BatchNorm=BatchNorm) + self.layer4 = self._make_layer(block, channels[3], layers[3], stride=2, BatchNorm=BatchNorm) + self.layer5 = self._make_layer(block, channels[4], layers[4], + dilation=2, new_level=False, BatchNorm=BatchNorm) + self.layer6 = None if layers[5] == 0 else \ + self._make_layer(block, channels[5], layers[5], dilation=4, + new_level=False, BatchNorm=BatchNorm) + + if arch == 'C': + self.layer7 = None if layers[6] == 0 else \ + self._make_layer(BasicBlock, channels[6], layers[6], dilation=2, + new_level=False, residual=False, BatchNorm=BatchNorm) + self.layer8 = None if layers[7] == 0 else \ + self._make_layer(BasicBlock, channels[7], layers[7], dilation=1, + new_level=False, residual=False, BatchNorm=BatchNorm) + elif arch == 'D': + self.layer7 = None if layers[6] == 0 else \ + self._make_conv_layers(channels[6], layers[6], dilation=2, BatchNorm=BatchNorm) + self.layer8 = None if layers[7] == 0 else \ + self._make_conv_layers(channels[7], layers[7], dilation=1, BatchNorm=BatchNorm) + + self._init_weight() + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2D): + from manet_paddle.utils.api import fill_ + fill_(m.weight, 1) + from manet_paddle.utils.api import zero_ + zero_(m.bias) + + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1, + new_level=True, residual=True, BatchNorm=None): + assert dilation == 1 or dilation % 2 == 0 + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = list() + layers.append(block( + self.inplanes, planes, stride, downsample, + dilation=(1, 1) if dilation == 1 else ( + dilation // 2 if new_level else dilation, dilation), + residual=residual, BatchNorm=BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, residual=residual, + dilation=(dilation, dilation), BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def _make_conv_layers(self, channels, convs, stride=1, dilation=1, BatchNorm=None): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2D(self.inplanes, channels, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias_attr=False, dilation=dilation), + BatchNorm(channels), + nn.ReLU()]) + self.inplanes = channels + return nn.Sequential(*modules) + + def forward(self, x): + if self.arch == 'C': + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + elif self.arch == 'D': + x = self.layer0(x) + + x = self.layer1(x) + x = self.layer2(x) + + x = self.layer3(x) + low_level_feat = x + + x = self.layer4(x) + x = self.layer5(x) + + if self.layer6 is not None: + x = self.layer6(x) + + if self.layer7 is not None: + x = self.layer7(x) + + if self.layer8 is not None: + x = self.layer8(x) + + return x, low_level_feat + + +class DRN_A(nn.Layer): + + def __init__(self, block, layers, BatchNorm=None): + self.inplanes = 64 + super(DRN_A, self).__init__() + self.out_dim = 512 * block.expansion + self.conv1 = nn.Conv2D(3, 64, kernel_size=7, stride=2, padding=3, + bias_attr=False) + self.bn1 = BatchNorm(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0], BatchNorm=BatchNorm) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, BatchNorm=BatchNorm) + self.layer3 = self._make_layer(block, 256, layers[2], stride=1, + dilation=2, BatchNorm=BatchNorm) + self.layer4 = self._make_layer(block, 512, layers[3], stride=1, + dilation=4, BatchNorm=BatchNorm) + + self._init_weight() + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2D): + from manet_paddle.utils.api import fill_ + fill_(m.weight, 1) + from manet_paddle.utils.api import zero_ + zero_(m.bias) + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1, BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, BatchNorm=BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, + dilation=(dilation, dilation, ), BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + return x + +def drn_a_50(BatchNorm, pretrained=True): + model = DRN_A(Bottleneck, [3, 4, 6, 3], BatchNorm=BatchNorm) + + if pretrained: + import paddlehub as hub + model.set_state_dict(hub.Module(name="resnet50_vd_animals")) + return model + + +def drn_c_26(BatchNorm, pretrained=True): + model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='C', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-c-26']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_c_42(BatchNorm, pretrained=True): + model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-c-42']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_c_58(BatchNorm, pretrained=True): + model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-c-58']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_d_22(BatchNorm, pretrained=True): + model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='D', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-d-22']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_d_24(BatchNorm, pretrained=True): + model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 2, 2], arch='D', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-d-24']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_d_38(BatchNorm, pretrained=True): + model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-d-38']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_d_40(BatchNorm, pretrained=True): + model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-d-40']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_d_54(BatchNorm, pretrained=True): + model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], 
arch='D', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-d-54']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + + +def drn_d_105(BatchNorm, pretrained=True): + model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 1, 1], arch='D', BatchNorm=BatchNorm) + if pretrained: + pretrained = model_zoo.load_url(model_urls['drn-d-105']) + del pretrained['fc.weight'] + del pretrained['fc.bias'] + model.set_state_dict(pretrained) + return model + +if __name__ == "__main__": + import paddle + model = drn_a_50(BatchNorm=nn.BatchNorm2D, pretrained=True) + input = paddle.rand([1, 3, 512, 512]) + output, low_level_feat = model(input) + print(output.shape) + print(low_level_feat.shape) diff --git a/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py b/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py new file mode 100644 index 000000000..affe97991 --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/backbone/mobilenet.py @@ -0,0 +1,163 @@ +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +import math +from utils.api import kaiming_normal_ + + +def conv_bn(inp, oup, stride, BatchNorm): + return nn.Sequential(nn.Conv2D(inp, oup, 3, stride, 1, bias_attr=False), + BatchNorm(oup), nn.ReLU6()) + + +def fixed_padding(inputs, kernel_size, dilation): + kernel_size_effective = kernel_size + (kernel_size - 1) * (dilation - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + padded_inputs = F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end)) + return padded_inputs + + +class InvertedResidual(nn.Layer): + def __init__(self, inp, oup, stride, dilation, expand_ratio, BatchNorm): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = round(inp * expand_ratio) + self.use_res_connect = self.stride == 1 and inp == oup + self.kernel_size = 3 + self.dilation = dilation + + if expand_ratio == 1: + self.conv = nn.Sequential( + # dw + nn.Conv2D(hidden_dim, + hidden_dim, + 3, + stride, + 0, + dilation, + groups=hidden_dim, + bias_attr=False), + BatchNorm(hidden_dim), + nn.ReLU6(), + # pw-linear + nn.Conv2D(hidden_dim, oup, 1, 1, 0, 1, 1, bias_attr=False), + BatchNorm(oup), + ) + else: + self.conv = nn.Sequential( + # pw + nn.Conv2D(inp, hidden_dim, 1, 1, 0, 1, bias_attr=False), + BatchNorm(hidden_dim), + nn.ReLU6(), + # dw + nn.Conv2D(hidden_dim, + hidden_dim, + 3, + stride, + 0, + dilation, + groups=hidden_dim, + bias_attr=False), + BatchNorm(hidden_dim), + nn.ReLU6(), + # pw-linear + nn.Conv2D(hidden_dim, oup, 1, 1, 0, 1, bias_attr=False), + BatchNorm(oup), + ) + + def forward(self, x): + x_pad = fixed_padding(x, self.kernel_size, dilation=self.dilation) + if self.use_res_connect: + x = x + self.conv(x_pad) + else: + x = self.conv(x_pad) + return x + + +class MobileNetV2(nn.Layer): + def __init__(self, + output_stride=8, + BatchNorm=None, + width_mult=1., + pretrained=True): + super(MobileNetV2, self).__init__() + block = InvertedResidual + input_channel = 32 + current_stride = 1 + rate = 1 + interverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # building first layer + input_channel = int(input_channel * width_mult) + self.features = [conv_bn(3, input_channel, 2, BatchNorm)] + current_stride *= 2 + # building inverted residual blocks + for t, c, n, s in 
interverted_residual_setting: + if current_stride == output_stride: + stride = 1 + dilation = rate + rate *= s + else: + stride = s + dilation = 1 + current_stride *= s + output_channel = int(c * width_mult) + for i in range(n): + if i == 0: + self.features.append( + block(input_channel, output_channel, stride, dilation, + t, BatchNorm)) + else: + self.features.append( + block(input_channel, output_channel, 1, dilation, t, + BatchNorm)) + input_channel = output_channel + self.features = nn.Sequential(*self.features) + self._initialize_weights() + + if pretrained: + self._load_pretrained_model() + + self.low_level_features = self.features[0:4] + self.high_level_features = self.features[4:] + + def forward(self, x): + low_level_feat = self.low_level_features(x) + x = self.high_level_features(low_level_feat) + return x, low_level_feat + + def _load_pretrained_model(self): + import paddlehub as hub + pretrain_dict = hub.Module(name="mobilenet_v2_imagenet") + model_dict = {} + state_dict = self.state_dict() + for k, v in pretrain_dict.items(): + if k in state_dict: + model_dict[k] = v + state_dict.update(model_dict) + self.set_state_dict(state_dict) + + def _initialize_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + # n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + # m.weight.normal_(0, math.sqrt(2. / n)) + kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + from utils.api import fill_ + fill_(m.weight, 1) + from utils.api import zero_ + zero_(m.bias) diff --git a/docs/src/applications/Ma-Net/networks/backbone/resnet.py b/docs/src/applications/Ma-Net/networks/backbone/resnet.py new file mode 100644 index 000000000..310acbcb7 --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/backbone/resnet.py @@ -0,0 +1,239 @@ +import math +import paddle.nn as nn +# from reprod_log.utils import paddle2np +import paddle + +from utils.api import normal_, fill_, zero_ + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + BatchNorm=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = BatchNorm(planes) + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = BatchNorm(planes) + self.conv3 = nn.Conv2D(planes, + planes * 4, + kernel_size=1, + bias_attr=False) + self.bn3 = BatchNorm(planes * 4) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + self.dilation = dilation + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Layer): + def __init__(self, + block, + layers, + output_stride, + BatchNorm, + pretrained=False): + self.inplanes = 64 + super(ResNet, self).__init__() + blocks = [1, 2, 4] + if output_stride == 16: + strides = [1, 2, 2, 1] + dilations = [1, 1, 1, 2] + elif output_stride == 8: + strides = [1, 2, 1, 1] + dilations = [1, 1, 2, 4] + else: + raise NotImplementedError + + # Modules + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = BatchNorm(64) + self.relu = nn.ReLU() + self.maxpool = 
nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, + 64, + layers[0], + stride=strides[0], + dilation=dilations[0], + BatchNorm=BatchNorm) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=strides[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=strides[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.layer4 = self._make_MG_unit(block, + 512, + blocks=blocks, + stride=strides[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + # self.layer4 = self._make_layer(block, 512, layers[3], stride=strides[3], dilation=dilations[3], BatchNorm=BatchNorm) + self._init_weight() + + if pretrained: + self._load_pretrained_model() + + def _make_layer(self, + block, + planes, + blocks, + stride=1, + dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, dilation, downsample, + BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + dilation=dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def _make_MG_unit(self, + block, + planes, + blocks, + stride=1, + dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, + planes, + stride, + dilation=blocks[0] * dilation, + downsample=downsample, + BatchNorm=BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, len(blocks)): + layers.append( + block(self.inplanes, + planes, + stride=1, + dilation=blocks[i] * dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def forward(self, input): + # print('input:', input.mean().item()) + x = self.conv1(input) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + low_level_feat = x + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x, low_level_feat + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fill_(m.weight, 1) + # normal_(m.weight, 0, math.sqrt(2. / n)) + elif isinstance(m, nn.BatchNorm2D): + fill_(m.weight, 1) + zero_(m.bias) + return self.sublayers() + + def _load_pretrained_model(self): + # TODO + pretrain_dict = paddle.load( + '/home/lc/manet/manet_paddle/model_best.pdparams.tar') + model_dict = {} + state_dict = self.state_dict() + for k, v in pretrain_dict.items(): + if k in state_dict: + model_dict[k] = v + state_dict.update(model_dict) + self.set_state_dict(state_dict) + + +def ResNet101(output_stride, BatchNorm, pretrained=False): + """Constructs a ResNet-101 model. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 23, 3], + output_stride, + BatchNorm, + pretrained=pretrained) + return model diff --git a/docs/src/applications/Ma-Net/networks/backbone/xception.py b/docs/src/applications/Ma-Net/networks/backbone/xception.py new file mode 100644 index 000000000..e5dfb562b --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/backbone/xception.py @@ -0,0 +1,455 @@ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def fixed_padding(inputs, kernel_size, dilation): + kernel_size_effective = kernel_size + (kernel_size - 1) * (dilation - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + padded_inputs = F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end)) + return padded_inputs + + +class SeparableConv2d(nn.Layer): + def __init__(self, + inplanes, + planes, + kernel_size=3, + stride=1, + dilation=1, + bias=False, + BatchNorm=None): + super(SeparableConv2d, self).__init__() + + self.conv1 = nn.Conv2D(inplanes, + inplanes, + kernel_size, + stride, + 0, + dilation, + groups=inplanes, + bias=bias) + self.bn = BatchNorm(inplanes) + self.pointwise = nn.Conv2D(inplanes, planes, 1, 1, 0, 1, 1, bias=bias) + + def forward(self, x): + x = fixed_padding(x, + self.conv1._kernel_size[0], + dilation=self.conv1.dilation[0]) + x = self.conv1(x) + x = self.bn(x) + x = self.pointwise(x) + return x + + +class Block(nn.Layer): + def __init__(self, + inplanes, + planes, + reps, + stride=1, + dilation=1, + BatchNorm=None, + start_with_relu=True, + grow_first=True, + is_last=False): + super(Block, self).__init__() + + if planes != inplanes or stride != 1: + self.skip = nn.Conv2D(inplanes, + planes, + 1, + stride=stride, + bias_attr=False) + self.skipbn = BatchNorm(planes) + else: + self.skip = None + + self.relu = nn.ReLU() + rep = [] + + filters = inplanes + if grow_first: + rep.append(self.relu) + rep.append( + SeparableConv2d(inplanes, + planes, + 3, + 1, + dilation, + BatchNorm=BatchNorm)) + rep.append(BatchNorm(planes)) + filters = planes + + for i in range(reps - 1): + rep.append(self.relu) + rep.append( + SeparableConv2d(filters, + filters, + 3, + 1, + dilation, + BatchNorm=BatchNorm)) + rep.append(BatchNorm(filters)) + + if not grow_first: + rep.append(self.relu) + rep.append( + SeparableConv2d(inplanes, + planes, + 3, + 1, + dilation, + BatchNorm=BatchNorm)) + rep.append(BatchNorm(planes)) + + if stride != 1: + rep.append(self.relu) + rep.append( + SeparableConv2d(planes, planes, 3, 2, BatchNorm=BatchNorm)) + rep.append(BatchNorm(planes)) + + if stride == 1 and is_last: + rep.append(self.relu) + rep.append( + SeparableConv2d(planes, planes, 3, 1, BatchNorm=BatchNorm)) + rep.append(BatchNorm(planes)) + + if not start_with_relu: + rep = rep[1:] + + self.rep = nn.Sequential(*rep) + + def forward(self, inp): + x = self.rep(inp) + + if self.skip is not None: + skip = self.skip(inp) + skip = self.skipbn(skip) + else: + skip = inp + + x = x + skip + + return x + + +class AlignedXception(nn.Layer): + """ + Modified Alighed Xception + """ + def __init__(self, output_stride, BatchNorm, pretrained=True): + super(AlignedXception, self).__init__() + + if output_stride == 16: + entry_block3_stride = 2 + middle_block_dilation = 1 + exit_block_dilations = (1, 2) + elif output_stride == 8: + entry_block3_stride = 1 + middle_block_dilation = 2 + exit_block_dilations = (2, 4) + else: + raise NotImplementedError + + # Entry flow + 
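+        # output_stride=16: block3 downsamples (stride 2) and the exit flow uses
+        # dilations (1, 2); output_stride=8: block3 keeps stride 1 and the middle
+        # and exit flows use dilations 2 and (2, 4) to preserve spatial resolution.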
self.conv1 = nn.Conv2D(3, 32, 3, stride=2, padding=1, bias_attr=False) + self.bn1 = BatchNorm(32) + self.relu = nn.ReLU() + + self.conv2 = nn.Conv2D(32, 64, 3, stride=1, padding=1, bias_attr=False) + self.bn2 = BatchNorm(64) + + self.block1 = Block(64, + 128, + reps=2, + stride=2, + BatchNorm=BatchNorm, + start_with_relu=False) + self.block2 = Block(128, + 256, + reps=2, + stride=2, + BatchNorm=BatchNorm, + start_with_relu=False, + grow_first=True) + self.block3 = Block(256, + 728, + reps=2, + stride=entry_block3_stride, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True, + is_last=True) + + # Middle flow + self.block4 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block5 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block6 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block7 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block8 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block9 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block10 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block11 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block12 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block13 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block14 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block15 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block16 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block17 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block18 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + self.block19 = Block(728, + 728, + reps=3, + stride=1, + dilation=middle_block_dilation, + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=True) + + # Exit flow + self.block20 = Block(728, + 1024, + reps=2, + stride=1, + dilation=exit_block_dilations[0], + BatchNorm=BatchNorm, + start_with_relu=True, + grow_first=False, + is_last=True) + + self.conv3 = SeparableConv2d(1024, + 1536, + 3, + stride=1, + dilation=exit_block_dilations[1], + BatchNorm=BatchNorm) + self.bn3 = BatchNorm(1536) + + self.conv4 = SeparableConv2d(1536, + 1536, + 3, + stride=1, + 
dilation=exit_block_dilations[1], + BatchNorm=BatchNorm) + self.bn4 = BatchNorm(1536) + + self.conv5 = SeparableConv2d(1536, + 2048, + 3, + stride=1, + dilation=exit_block_dilations[1], + BatchNorm=BatchNorm) + self.bn5 = BatchNorm(2048) + + # Init weights + self._init_weight() + + # Load pretrained model + if pretrained: + self._load_pretrained_model() + + def forward(self, x): + # Entry flow + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.block1(x) + # add relu here + x = self.relu(x) + low_level_feat = x + x = self.block2(x) + x = self.block3(x) + + # Middle flow + x = self.block4(x) + x = self.block5(x) + x = self.block6(x) + x = self.block7(x) + x = self.block8(x) + x = self.block9(x) + x = self.block10(x) + x = self.block11(x) + x = self.block12(x) + x = self.block13(x) + x = self.block14(x) + x = self.block15(x) + x = self.block16(x) + x = self.block17(x) + x = self.block18(x) + x = self.block19(x) + + # Exit flow + x = self.block20(x) + x = self.relu(x) + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.conv4(x) + x = self.bn4(x) + x = self.relu(x) + + x = self.conv5(x) + x = self.bn5(x) + x = self.relu(x) + + return x, low_level_feat + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, nn.BatchNorm2D): + from utils.api import fill_ + fill_(m.weight, 1) + from utils.api import zero_ + zero_(m.bias) + + def _load_pretrained_model(self): + import paddlehub as hub + pretrain_dict = hub.Module(name="xception71_imagenet") + model_dict = {} + state_dict = self.state_dict() + + for k, v in pretrain_dict.items(): + if k in model_dict: + if 'pointwise' in k: + v = v.unsqueeze(-1).unsqueeze(-1) + if k.startswith('block11'): + model_dict[k] = v + model_dict[k.replace('block11', 'block12')] = v + model_dict[k.replace('block11', 'block13')] = v + model_dict[k.replace('block11', 'block14')] = v + model_dict[k.replace('block11', 'block15')] = v + model_dict[k.replace('block11', 'block16')] = v + model_dict[k.replace('block11', 'block17')] = v + model_dict[k.replace('block11', 'block18')] = v + model_dict[k.replace('block11', 'block19')] = v + elif k.startswith('block12'): + model_dict[k.replace('block12', 'block20')] = v + elif k.startswith('bn3'): + model_dict[k] = v + model_dict[k.replace('bn3', 'bn4')] = v + elif k.startswith('conv4'): + model_dict[k.replace('conv4', 'conv5')] = v + elif k.startswith('bn4'): + model_dict[k.replace('bn4', 'bn5')] = v + else: + model_dict[k] = v + state_dict.update(model_dict) + self.set_state_dict(state_dict) diff --git a/docs/src/applications/Ma-Net/networks/decoder.py b/docs/src/applications/Ma-Net/networks/decoder.py new file mode 100644 index 000000000..f4bb19f0e --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/decoder.py @@ -0,0 +1,66 @@ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from utils.api import kaiming_normal_ + + +class Decoder(nn.Layer): + def __init__(self, num_classes, backbone, BatchNorm): + super(Decoder, self).__init__() + if backbone == 'resnet' or backbone == 'drn' or backbone == 'resnet_edge': + low_level_inplanes = 256 + elif backbone == 'xception': + low_level_inplanes = 128 + elif backbone == 'mobilenet': + low_level_inplanes = 24 + else: + raise NotImplementedError + + self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, 
bias_attr=False) + self.bn1 = BatchNorm(48) + self.relu = nn.ReLU(True) + self.last_conv = nn.Sequential( + nn.Conv2D(304, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(True), + nn.Sequential(), + nn.Conv2D(256, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(True), + nn.Sequential()) + self._init_weight() + + def forward(self, x, low_level_feat): + low_level_feat = self.conv1(low_level_feat) + low_level_feat = self.bn1(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat((x, low_level_feat), axis=1) + x = self.last_conv(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + from utils.api import fill_ + fill_(m.weight, 1) + from utils.api import zero_ + zero_(m.bias) + + +def build_decoder(num_classes, backbone, BatchNorm): + return Decoder(num_classes, backbone, BatchNorm) diff --git a/docs/src/applications/Ma-Net/networks/deeplab.py b/docs/src/applications/Ma-Net/networks/deeplab.py new file mode 100644 index 000000000..fe8015733 --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/deeplab.py @@ -0,0 +1,81 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from networks.aspp import build_aspp +from networks.decoder import build_decoder +from networks.backbone import build_backbone + + +class FrozenBatchNorm2d(nn.Layer): + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", paddle.ones(n)) + self.register_buffer("bias", paddle.zeros(n)) + self.register_buffer("running_mean", paddle.zeros(n)) + self.register_buffer("running_var", paddle.ones(n)) + + def forward(self, x): + if x.dtype == paddle.float16: + self.weight = self.weight.half() + self.bias = self.bias.half() + self.running_mean = self.running_mean.half() + self.running_var = self.running_var.half() + scale = self.weight * self.running_var.rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + return x * scale + bias + + +class DeepLab(nn.Layer): + def __init__(self, + backbone='resnet', + output_stride=16, + num_classes=21, + sync_bn=True, + freeze_bn=False): + super(DeepLab, self).__init__() + if backbone == 'drn': + output_stride = 8 + if freeze_bn == True: + print("Use frozen BN in DeepLab") + BatchNorm = FrozenBatchNorm2d + else: + BatchNorm = nn.BatchNorm2D + + self.backbone = build_backbone(backbone, output_stride, BatchNorm) + self.aspp = build_aspp(backbone, output_stride, BatchNorm) + self.decoder = build_decoder(num_classes, backbone, BatchNorm) + + def forward(self, input): + x, low_level_feat = self.backbone(input) + x = self.aspp(x) + x = self.decoder(x, low_level_feat) + + return x + + def freeze_bn(self): + for m in self.sublayers(): + if isinstance(m, nn.BatchNorm2D): + m.eval() + + def get_1x_lr_params(self): + modules = [self.backbone] + for i in range(len(modules)): + for m in modules[i].named_modules(): + if isinstance(m[1], nn.Conv2D) or isinstance( + m[1], nn.BatchNorm2D): + for p in m[1].parameters(): + if p.requires_grad: + yield p + + def get_10x_lr_params(self): + modules = [self.aspp, self.decoder] + for i in range(len(modules)): + for m in modules[i].named_modules(): + if isinstance(m[1], nn.Conv2D) or isinstance( + 
m[1], nn.BatchNorm2D): + for p in m[1].parameters(): + if p.requires_grad: + yield p diff --git a/docs/src/applications/Ma-Net/networks/loss.py b/docs/src/applications/Ma-Net/networks/loss.py new file mode 100644 index 000000000..e818b8c42 --- /dev/null +++ b/docs/src/applications/Ma-Net/networks/loss.py @@ -0,0 +1,153 @@ +import paddle +import paddle.nn as nn +import os + + +class Added_BCEWithLogitsLoss(nn.Layer): + def __init__(self, + top_k_percent_pixels=None, + hard_example_mining_step=100000): + super(Added_BCEWithLogitsLoss, self).__init__() + self.top_k_percent_pixels = top_k_percent_pixels + if top_k_percent_pixels is not None: + assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1) + self.hard_example_mining_step = hard_example_mining_step + if self.top_k_percent_pixels == None: + self.bceloss = nn.BCEWithLogitsLoss(reduction='mean') + else: + self.bceloss = nn.BCEWithLogitsLoss(reduction='none') + + def forward(self, dic_tmp, y, step): + final_loss = 0 + for seq_name in dic_tmp.keys(): + pred_logits = dic_tmp[seq_name] + gts = y[seq_name] + if self.top_k_percent_pixels == None: + final_loss += self.bceloss(pred_logits, gts) + else: + # Only compute the loss for top k percent pixels. + # First, compute the loss for all pixels. Note we do not put the loss + # to loss_collection and set reduction = None to keep the shape. + num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3]) + pred_logits = pred_logits.view( + -1, pred_logits.shape[1], + pred_logits.shape[2] * pred_logits.shape[3]) + gts = gts.view(-1, gts.shape[1], gts.shape[2] * gts.shape[3]) + pixel_losses = self.bceloss(pred_logits, gts) + if self.hard_example_mining_step == 0: + top_k_pixels = int(self.top_k_percent_pixels * num_pixels) + else: + ratio = min(1.0, + step / float(self.hard_example_mining_step)) + top_k_pixels = int((ratio * self.top_k_percent_pixels + + (1.0 - ratio)) * num_pixels) + _, top_k_indices = paddle.topk(pixel_losses, + k=top_k_pixels, + axis=2) + + final_loss += nn.BCEWithLogitsLoss(weight=top_k_indices, + reduction='mean')( + pred_logits, gts) + return final_loss + + +class Added_CrossEntropyLoss(nn.Layer): + def __init__(self, + top_k_percent_pixels=None, + hard_example_mining_step=100000): + super(Added_CrossEntropyLoss, self).__init__() + self.top_k_percent_pixels = top_k_percent_pixels + if top_k_percent_pixels is not None: + assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1) + self.hard_example_mining_step = hard_example_mining_step + if self.top_k_percent_pixels == None: + self.celoss = nn.CrossEntropyLoss(ignore_index=255, + reduction='mean') + else: + self.celoss = nn.CrossEntropyLoss(ignore_index=255, + reduction='none') + + def forward(self, dic_tmp, y, step): + final_loss = 0 + for seq_name in dic_tmp.keys(): + pred_logits = dic_tmp[seq_name] + gts = y[seq_name] + if self.top_k_percent_pixels == None: + final_loss += self.celoss(pred_logits, gts) + else: + # Only compute the loss for top k percent pixels. + # First, compute the loss for all pixels. Note we do not put the loss + # to loss_collection and set reduction = None to keep the shape. 
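# ----------------------------------------------------------------------------
# Illustrative aside (not part of this patch): the lines that follow implement
# hard-example mining -- only the top-k highest-loss pixels keep a gradient,
# and k is annealed from 100% of the pixels down to `top_k_percent_pixels`
# over `hard_example_mining_step` optimizer steps. A minimal standalone sketch
# of that selection (tensor shape is made up for the example):
import paddle

def topk_pixel_loss(pixel_losses, top_k_percent_pixels, step, ramp_steps):
    """pixel_losses: [1, num_pixels] per-pixel losses computed with reduction='none'."""
    num_pixels = float(pixel_losses.shape[1])
    if ramp_steps == 0:
        top_k = int(top_k_percent_pixels * num_pixels)
    else:
        # start from all pixels and anneal towards the target percentage
        ratio = min(1.0, step / float(ramp_steps))
        top_k = int((ratio * top_k_percent_pixels + (1.0 - ratio)) * num_pixels)
    top_k_loss, _ = paddle.topk(pixel_losses, k=top_k, axis=1)
    return paddle.mean(top_k_loss)

# e.g. topk_pixel_loss(paddle.rand([1, 416 * 416]), 0.15, step=50000, ramp_steps=100000)
# ----------------------------------------------------------------------------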
+ num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3]) + pred_logits = pred_logits.reshape([ + pred_logits.shape[1], + pred_logits.shape[2] * pred_logits.shape[3] + ]).transpose([1, 0]) + gts = gts.reshape([gts.shape[1] * gts.shape[2]]) + pixel_losses = self.celoss(pred_logits, gts).reshape([1, -1]) + if self.hard_example_mining_step == 0: + top_k_pixels = int(self.top_k_percent_pixels * num_pixels) + else: + ratio = min(1.0, + step / float(self.hard_example_mining_step)) + top_k_pixels = int((ratio * self.top_k_percent_pixels + + (1.0 - ratio)) * num_pixels) + top_k_loss, top_k_indices = paddle.topk(pixel_losses, + k=top_k_pixels, + axis=1) + + final_loss += paddle.mean(top_k_loss) + return final_loss + + +class AddedEdge_CrossEntropyLoss(nn.Layer): + def __init__(self, + top_k_percent_pixels=None, + hard_example_mining_step=100000): + super(AddedEdge_CrossEntropyLoss, self).__init__() + self.top_k_percent_pixels = top_k_percent_pixels + if top_k_percent_pixels is not None: + assert (top_k_percent_pixels > 0 and top_k_percent_pixels < 1) + self.hard_example_mining_step = hard_example_mining_step + self.celoss = None + + def forward(self, pred_logits, gts, step): + pos_num = paddle.sum(gts == 1, dtype='float32') + neg_num = paddle.sum(gts == 0, dtype='float32') + + weight_pos = neg_num / (pos_num + neg_num) + weight_neg = pos_num / (pos_num + neg_num) + weights = paddle.to_tensor([weight_neg, weight_pos]) + if self.top_k_percent_pixels == None: + sig_pred_logits = paddle.nn.functional.sigmoid(pred_logits) + self.bceloss = nn.BCEWithLogitsLoss(pos_weight=weight_pos, + reduction='mean') + if paddle.sum(gts) == 0: + dcloss = 0 + else: + dcloss = (paddle.sum(sig_pred_logits * sig_pred_logits) + + paddle.sum(gts * gts)) / ( + paddle.sum(2 * sig_pred_logits * gts) + 1e-5) + final_loss = 0.1 * self.bceloss(pred_logits, gts) + dcloss + else: + self.celoss = nn.CrossEntropyLoss(weight=weights, + ignore_index=255, + reduction='none') + num_pixels = float(pred_logits.shape[2] * pred_logits.shape[3]) + pred_logits = pred_logits.reshape([ + -1, pred_logits.shape[1], + pred_logits.shape[2] * pred_logits.shape[3]]) + gts = gts.reshape([-1, gts.shape[2] * gts.shape[3]]) + pixel_losses = self.celoss(pred_logits, gts) + if self.hard_example_mining_step == 0: + top_k_pixels = int(self.top_k_percent_pixels * num_pixels) + else: + ratio = min(1.0, step / float(self.hard_example_mining_step)) + top_k_pixels = int((ratio * self.top_k_percent_pixels + + (1.0 - ratio)) * num_pixels) + top_k_loss, top_k_indices = paddle.topk(pixel_losses, + k=top_k_pixels, + axis=1) + + final_loss = paddle.mean(top_k_loss) + return final_loss diff --git a/docs/src/applications/Ma-Net/run.sh b/docs/src/applications/Ma-Net/run.sh new file mode 100644 index 000000000..2d7d1e3e4 --- /dev/null +++ b/docs/src/applications/Ma-Net/run.sh @@ -0,0 +1,15 @@ +PRETRAIN_MODEL='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/DeeplabV3_coco.pdparams' +VOS_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/MaNet_davis2017_stage1.pdparams' +#VOS_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/stage1' +INT_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/MANet_davis2017.pdparams' +#INT_SAVE_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/stage2' +INT_RESULT_DIR='/home/lc/PaddleVideo/applications/Ma-Net/saved_model/result' +RESCALE=416 +RANDOMCROP=416 +DATA_ROOT='/home/lc/PaddleVideo/data/DAVIS' +echo 'Stage1 training' +CUDA_VISIBLE_DEVICES=3 python train_stage1.py 
--SAVE_RESULT_DIR $VOS_SAVE_RESULT_DIR --PRETRAINED_MODEL $PRETRAIN_MODEL --DATA_ROOT $DATA_ROOT --TRAIN_BATCH_SIZE 2 --DATA_RESCALE $RESCALE --DATA_RANDOMCROP $RANDOMCROP --TRAIN_LR 0.0007 --MODEL_MAX_LOCAL_DISTANCE 12 +echo 'Stage2 training' +python train_stage2.py --SAVE_RESULT_DIR $INT_SAVE_RESULT_DIR --SAVE_VOS_RESULT_DIR $VOS_SAVE_RESULT_DIR --DATA_ROOT $DATA_ROOT --DATA_RESCALE $RESCALE --DATA_RANDOMCROP $RANDOMCROP --PRETRAINED_MODEL $PRETRAIN_MODEL +echo 'Testing' +python test.py --DATA_ROOT $DATA_ROOT --SAVE_RESULT_DIR $INT_SAVE_RESULT_DIR --RESULT_ROOT $INT_RESULT_DIR --MODEL_USEIntSeg False --TEST_MODE True diff --git a/docs/src/applications/Ma-Net/test.py b/docs/src/applications/Ma-Net/test.py new file mode 100644 index 000000000..77df57921 --- /dev/null +++ b/docs/src/applications/Ma-Net/test.py @@ -0,0 +1,525 @@ +import cv2 +import os +import json + +import paddle +from PIL import Image +import timeit +import numpy as np +from paddle.vision import transforms + +from dataloaders.davis_2017_f import DAVIS2017_Feature_Extract +import dataloaders.custom_transforms_f as tr +from davisinteractive.session import DavisInteractiveSession +from networks.deeplab import DeepLab +from networks.IntVOS import IntVOS +import time +from davisinteractive.utils.scribbles import scribbles2mask, annotated_frames +from config import cfg +from paddle import nn +from paddle.io import DataLoader + +from utils.api import float_, byte_ + + +@paddle.no_grad() +def main(): + paddle.set_device("gpu:0") + total_frame_num_dic = {} + ################# + seqs = [] + with open(os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017', + 'val' + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + seqs.extend(seqs_tmp) + h_w_dic = {} + for seq_name in seqs: + images = np.sort( + os.listdir( + os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/', + seq_name.strip()))) + total_frame_num_dic[seq_name] = len(images) + im_ = cv2.imread( + os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/', seq_name, + '00000.jpg')) + im_ = np.array(im_, dtype=np.float32) + hh_, ww_ = im_.shape[:2] + h_w_dic[seq_name] = (hh_, ww_) + _seq_list_file = os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017', + 'v_a_l' + '_instances.txt') + seq_dict = json.load(open(_seq_list_file, 'r')) + ################## + seq_imgnum_dict_ = {} + seq_imgnum_dict = os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017', + 'val_imgnum.txt') + if os.path.isfile(seq_imgnum_dict): + + seq_imgnum_dict_ = json.load(open(seq_imgnum_dict, 'r')) + else: + for seq in os.listdir(os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/')): + seq_imgnum_dict_[seq] = len( + os.listdir(os.path.join(cfg.DATA_ROOT, 'JPEGImages/480p/', + seq))) + with open(seq_imgnum_dict, 'w') as f: + json.dump(seq_imgnum_dict_, f) + + ################## + + is_save_image = False # Save the predicted masks + report_save_dir = cfg.RESULT_ROOT + save_res_dir = cfg.SAVE_RESULT_DIR # changed to path + if not os.path.exists(cfg.RESULT_ROOT): + os.makedirs(cfg.RESULT_ROOT) + # Configuration used in the challenges + max_nb_interactions = 8 # Maximum number of interactions + max_time_per_interaction = 30 # Maximum time per interaction per object + # Total time available to interact with a sequence and an initial set of scribbles + max_time = max_nb_interactions * max_time_per_interaction # Maximum time per object + # Interactive parameters + subset = 'val' + host = 'localhost' # 'localhost' for subsets train and val. 
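# ----------------------------------------------------------------------------
# Illustrative aside (not part of this patch): the evaluation below is driven
# by the DAVIS interactive protocol. Stripped of the model-specific details,
# the session loop used further down has roughly this shape; the
# `predict_masks` helper is hypothetical and stands in for the scribble ->
# segmentation -> propagation pipeline implemented in the code that follows.
#
#     with DavisInteractiveSession(host=host,
#                                  davis_root=cfg.DATA_ROOT,
#                                  subset=subset,
#                                  report_save_dir=report_save_dir,
#                                  max_nb_interactions=max_nb_interactions,
#                                  max_time=max_time,
#                                  metric_to_optimize='J') as sess:
#         while sess.next():
#             sequence, scribbles, first_scribble = sess.get_scribbles(only_last=True)
#             masks = predict_masks(sequence, scribbles)  # hypothetical helper
#             sess.submit_masks(masks)
#         report = sess.get_report()
# ----------------------------------------------------------------------------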
+ + feature_extracter = DeepLab(backbone='resnet', freeze_bn=False) + model = IntVOS(cfg, feature_extracter) + print('model loading...') + + saved_model_dict = save_res_dir + pretrained_dict = paddle.load(saved_model_dict) + load_network(model, pretrained_dict) + + print(f'model loading from {saved_model_dict} finished!') + model.eval() + inter_file = open(os.path.join(cfg.RESULT_ROOT, 'inter_file.txt'), 'w') + resized_h, resized_w = 480, 854 + ############################### + composed_transforms = transforms.Compose( + [tr.Resize((resized_h, resized_w)), + tr.ToTensor()]) + ############################### + + seen_seq = [] + n = 0 + max_n = 1 + with DavisInteractiveSession(host=host, + davis_root=cfg.DATA_ROOT, + subset=subset, + report_save_dir=report_save_dir, + max_nb_interactions=max_nb_interactions, + max_time=max_time, + metric_to_optimize='J') as sess: + while sess.next(): + t_total = timeit.default_timer() + # Get the current iteration scribbles + + sequence, scribbles, first_scribble = sess.get_scribbles( + only_last=True) + h, w = h_w_dic[sequence] + if 'prev_label_storage' not in locals().keys(): + prev_label_storage = paddle.zeros( + [104, h, w]) # because the maximum length of frames is 104. + print(sequence) + h, w = h_w_dic[sequence] + if len( + annotated_frames(scribbles) + ) == 0: # if no scribbles return, keep masks in previous round + + final_masks = prev_label_storage[:seq_imgnum_dict_[sequence]] + sess.submit_masks(final_masks.numpy()) + else: + + start_annotated_frame = annotated_frames(scribbles)[0] + + pred_masks = [] + pred_masks_reverse = [] + + if first_scribble: # If in the first round, initialize memories + n_interaction = 1 + eval_global_map_tmp_dic = {} + local_map_dics = ({}, {}) + total_frame_num = total_frame_num_dic[sequence] + obj_nums = seq_dict[sequence][-1] + + else: + n_interaction += 1 + ## + inter_file.write(sequence + ' ' + 'interaction' + + str(n_interaction) + ' ' + 'frame' + + str(start_annotated_frame) + '\n') + ## + + ##########################Reference image process + + if first_scribble: # if in the first round, extract pixel embbedings. 
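# Note: in this first-scribble branch every frame of the sequence is pushed
# through model.extract_feature() once (in DataLoader batches of 14 frames)
# and the results are concatenated into `embedding_memory`. Later interaction
# rounds and both propagation loops below only index into that cache, so the
# backbone runs a single time per sequence.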
+ if sequence not in seen_seq: + inter_turn = 1 + + seen_seq.append(sequence) + embedding_memory = [] + test_dataset = DAVIS2017_Feature_Extract( + root=cfg.DATA_ROOT, + transform=composed_transforms, + seq_name=sequence) + testloader = DataLoader(test_dataset, + batch_size=14, + shuffle=False, + num_workers=cfg.NUM_WORKER) + for ii, sample in enumerate(testloader): + imgs = sample['img1'] + frame_embedding = model.extract_feature(imgs) + embedding_memory.append(frame_embedding) + del frame_embedding + + embedding_memory = paddle.concat(embedding_memory, 0) + _, _, emb_h, emb_w = embedding_memory.shape + ref_frame_embedding = embedding_memory[ + start_annotated_frame] + ref_frame_embedding = ref_frame_embedding.unsqueeze(0) + + else: + inter_turn += 1 + ref_frame_embedding = embedding_memory[ + start_annotated_frame] + ref_frame_embedding = ref_frame_embedding.unsqueeze(0) + + else: + ref_frame_embedding = embedding_memory[ + start_annotated_frame] + ref_frame_embedding = ref_frame_embedding.unsqueeze(0) + ######## + scribble_masks = scribbles2mask(scribbles, (emb_h, emb_w)) + scribble_label = scribble_masks[start_annotated_frame] + scribble_sample = {'scribble_label': scribble_label} + scribble_sample = tr.ToTensor()(scribble_sample) + # print(ref_frame_embedding, ref_frame_embedding.shape) + scribble_label = scribble_sample['scribble_label'] + + scribble_label = scribble_label.unsqueeze(0) + + ###### + if is_save_image: + ref_scribble_to_show = scribble_label.squeeze().numpy() + im_ = Image.fromarray( + ref_scribble_to_show.astype('uint8')).convert('P', ) + im_.putpalette(_palette) + ref_img_name = str(start_annotated_frame) + + if not os.path.exists( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))): + os.makedirs( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))) + im_.save( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn), + 'inter_' + ref_img_name + '.png')) + + scribble_label = scribble_label + + ####### + if first_scribble: + + prev_label = None + prev_label_storage = paddle.zeros([104, h, w]) + prev_label_storage = prev_label_storage + else: + prev_label = prev_label_storage[start_annotated_frame] + prev_label = prev_label.unsqueeze(0).unsqueeze(0) + if not first_scribble and paddle.unique( + scribble_label).shape[0] == 1: + final_masks = prev_label_storage[: + seq_imgnum_dict_[sequence]] + sess.submit_masks(final_masks.numpy()) + + else: ###inteaction segmentation head + print('inteaction segmentation head') + tmp_dic, local_map_dics = model.int_seghead( + ref_frame_embedding=ref_frame_embedding, + ref_scribble_label=scribble_label, + prev_round_label=prev_label, + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + frame_num=[start_annotated_frame], + first_inter=first_scribble) + pred_label = tmp_dic[sequence] + pred_label = nn.functional.interpolate(pred_label, + size=(h, w), + mode='bilinear', + align_corners=True) + pred_label = paddle.argmax(pred_label, axis=1) + pred_masks.append(float_(pred_label)) + prev_label_storage[start_annotated_frame] = float_( + pred_label[0]) + + if is_save_image: # save image + pred_label_to_save = pred_label.squeeze(0).numpy() + im = Image.fromarray( + pred_label_to_save.astype('uint8')).convert('P', ) + im.putpalette(_palette) + imgname = 
str(start_annotated_frame) + while len(imgname) < 5: + imgname = '0' + imgname + if not os.path.exists( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))): + os.makedirs( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))) + im.save( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn), + imgname + '.png')) + ####################################### + if first_scribble: + scribble_label = rough_ROI(scribble_label) + + ############################## + ref_prev_label = pred_label.unsqueeze(0) + prev_label = pred_label.unsqueeze(0) + prev_embedding = ref_frame_embedding + #### Propagation -> + for ii in range(start_annotated_frame + 1, total_frame_num): + current_embedding = embedding_memory[ii] + current_embedding = current_embedding.unsqueeze(0) + prev_label = prev_label + tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.prop_seghead( + ref_frame_embedding, + prev_embedding, + current_embedding, + scribble_label, + prev_label, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + k_nearest_neighbors=cfg.KNNS, + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + start_annotated_frame=start_annotated_frame, + frame_num=[ii], + dynamic_seghead=model.dynamic_seghead) + pred_label = tmp_dic[sequence] + + pred_label = nn.functional.interpolate( + pred_label, + size=(h, w), + mode='bilinear', + align_corners=True) + + pred_label = paddle.argmax(pred_label, axis=1) + pred_masks.append(float_(pred_label)) + prev_label = pred_label.unsqueeze(0) + prev_embedding = current_embedding + prev_label_storage[ii] = float_(pred_label[0]) + #### + if is_save_image: + pred_label_to_save = pred_label.squeeze(0).numpy() + im = Image.fromarray( + pred_label_to_save.astype('uint8')).convert( + 'P', ) + im.putpalette(_palette) + imgname = str(ii) + while len(imgname) < 5: + imgname = '0' + imgname + if not os.path.exists( + os.path.join( + cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))): + os.makedirs( + os.path.join( + cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))) + im.save( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn), + imgname + '.png')) + ####################################### + prev_label = ref_prev_label + prev_embedding = ref_frame_embedding + ####### + # Propagation <- + for ii in range(start_annotated_frame): + current_frame_num = start_annotated_frame - 1 - ii + current_embedding = embedding_memory[current_frame_num] + current_embedding = current_embedding.unsqueeze(0) + prev_label = prev_label + tmp_dic, eval_global_map_tmp_dic, local_map_dics = model.prop_seghead( + ref_frame_embedding, + prev_embedding, + current_embedding, + scribble_label, + prev_label, + normalize_nearest_neighbor_distances=True, + use_local_map=True, + seq_names=[sequence], + gt_ids=paddle.to_tensor([obj_nums]), + k_nearest_neighbors=cfg.KNNS, + global_map_tmp_dic=eval_global_map_tmp_dic, + local_map_dics=local_map_dics, + interaction_num=n_interaction, + start_annotated_frame=start_annotated_frame, + frame_num=[current_frame_num], + dynamic_seghead=model.dynamic_seghead) + pred_label = tmp_dic[sequence] + pred_label = nn.functional.interpolate( + pred_label, + 
size=(h, w), + mode='bilinear', + align_corners=True) + + pred_label = paddle.argmax(pred_label, axis=1) + pred_masks_reverse.append(float_(pred_label)) + prev_label = pred_label.unsqueeze(0) + prev_embedding = current_embedding + #### + prev_label_storage[current_frame_num] = float_( + pred_label[0]) + ### + if is_save_image: + pred_label_to_save = pred_label.squeeze(0).numpy() + im = Image.fromarray( + pred_label_to_save.astype('uint8')).convert( + 'P', ) + im.putpalette(_palette) + imgname = str(current_frame_num) + while len(imgname) < 5: + imgname = '0' + imgname + if not os.path.exists( + os.path.join( + cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))): + os.makedirs( + os.path.join( + cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn))) + im.save( + os.path.join(cfg.RESULT_ROOT, sequence, + 'interactive' + str(n_interaction), + 'turn' + str(inter_turn), + imgname + '.png')) + pred_masks_reverse.reverse() + pred_masks_reverse.extend(pred_masks) + final_masks = paddle.concat(pred_masks_reverse, 0) + sess.submit_masks(final_masks.numpy()) + + if inter_turn == 3 and n_interaction == 8: + del eval_global_map_tmp_dic + del local_map_dics + del embedding_memory + del prev_label_storage + t_end = timeit.default_timer() + print('Total time for single interaction: ' + str(t_end - t_total)) + report = sess.get_report() + summary = sess.get_global_summary( + save_file=os.path.join(report_save_dir, 'summary.json')) + inter_file.close() + + +def rough_ROI(ref_scribble_labels): + dist = 20 + b, _, h, w = ref_scribble_labels.shape + filter_ = paddle.zeros_like(ref_scribble_labels) + to_fill = paddle.zeros_like(ref_scribble_labels) + for i in range(b): + no_background = (ref_scribble_labels[i] != -1) + no_background = no_background.squeeze(0) + + no_b = no_background.nonzero() + (h_min, w_min) = paddle.min(no_b, 0) + (h_max, w_max) = paddle.max(no_b, 0) + + filter_[i, 0, + max(h_min - dist, 0):min(h_max + dist, h - 1), + max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1 + + final_scribble_labels = paddle.where(byte_(filter_), ref_scribble_labels, + to_fill) + return final_scribble_labels + + +def load_network(net, pretrained_dict): + model_dict = net.state_dict() + # 1. filter out unnecessary keys + f_pretrained_dict = {} + for k, v in pretrained_dict.items(): + if k in model_dict: + f_pretrained_dict[k] = v + else: + print(k) + + print(len(model_dict.keys()), len(pretrained_dict.keys())) + + # 2. 
overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + net.set_state_dict(model_dict) + + +_palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128, + 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0, + 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191, + 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24, + 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, + 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, + 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, + 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, + 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, + 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62, + 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, + 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, + 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, + 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, + 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, + 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, + 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, + 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110, + 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115, + 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120, + 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125, + 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130, + 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, + 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140, + 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145, + 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150, + 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155, + 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160, + 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, + 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, + 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175, + 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180, + 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185, + 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190, + 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195, + 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, + 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205, + 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210, + 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215, + 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220, + 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225, + 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, + 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, + 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240, + 240, 240, 241, 241, 
241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245, + 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250, + 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255, + 255, 255 +] + +if __name__ == '__main__': + main() diff --git a/docs/src/applications/Ma-Net/train_stage1.py b/docs/src/applications/Ma-Net/train_stage1.py new file mode 100644 index 000000000..fcc10ac3a --- /dev/null +++ b/docs/src/applications/Ma-Net/train_stage1.py @@ -0,0 +1,429 @@ +import cv2 +import paddle +import paddle.nn as nn +import os +import numpy as np +from paddle.io import DataLoader +import paddle.optimizer as optim +from paddle.vision import transforms +from dataloaders.davis_2017_f import DAVIS2017_VOS_Train, DAVIS2017_VOS_Test +import dataloaders.custom_transforms_f as tr +from dataloaders.samplers import RandomIdentitySampler +from networks.deeplab import DeepLab +from networks.IntVOS import IntVOS +from networks.loss import Added_BCEWithLogitsLoss, Added_CrossEntropyLoss +from config import cfg +from utils.api import float_, clip_grad_norm_, int_, long_ +from utils.meters import AverageMeter +from utils.mask_damaging import damage_masks +from utils.utils import label2colormap +from PIL import Image +import scipy.misc as sm +import time +# import logging +paddle.disable_static() + +paddle.device.set_device('gpu:0') + + +class Manager(object): + def __init__(self, + use_gpu=True, + time_budget=None, + save_result_dir=cfg.SAVE_RESULT_DIR, + pretrained=True, + interactive_test=False, + freeze_bn=False): + + self.save_res_dir = save_result_dir + self.time_budget = time_budget + self.feature_extracter = DeepLab(backbone='resnet', freeze_bn=freeze_bn) + if pretrained: + pretrained_dict = paddle.load(cfg.PRETRAINED_MODEL) + # pretrained_dict = np.load(cfg.PRETRAINED_MODEL, allow_pickle=True).item() + pretrained_dict = pretrained_dict['state_dict'] + self.load_network(self.feature_extracter, pretrained_dict) + print('load pretrained model successfully.') + self.model = IntVOS(cfg, self.feature_extracter) + self.use_gpu = use_gpu + if use_gpu: + self.model = self.model + + def train(self, + damage_initial_previous_frame_mask=True, + lossfunc='cross_entropy', + model_resume=False): + ################### + self.model.train() + running_loss = AverageMeter() + running_time = AverageMeter() + + param_list = [{ + 'params': self.model.feature_extracter.parameters() + }, { + 'params': self.model.semantic_embedding.parameters() + }, { + 'params': self.model.dynamic_seghead.parameters() + }] + + ######## + clip = paddle.nn.ClipGradByGlobalNorm( + clip_norm=cfg.TRAIN_CLIP_GRAD_NORM) + # clip = None + optimizer = optim.Momentum(parameters=param_list, + learning_rate=cfg.TRAIN_LR, + momentum=cfg.TRAIN_MOMENTUM, + weight_decay=cfg.TRAIN_WEIGHT_DECAY, + use_nesterov=True, + grad_clip=clip) + + self.param_list = param_list + + ################### + + composed_transforms = transforms.Compose([ + tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP), + tr.RandomScale(), + tr.RandomCrop((cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP), 5), + tr.Resize(cfg.DATA_RESCALE), + tr.ToTensor() + ]) + print('dataset processing...') + train_dataset = DAVIS2017_VOS_Train(root=cfg.DATA_ROOT, + transform=composed_transforms) + + trainloader = DataLoader( + train_dataset, + collate_fn=None, + batch_size=cfg.TRAIN_BATCH_SIZE, + shuffle=True, + num_workers=8, + ) + print('dataset processing finished.') + if lossfunc == 'bce': + criterion = Added_BCEWithLogitsLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS, + 
cfg.TRAIN_HARD_MINING_STEP) + elif lossfunc == 'cross_entropy': + criterion = Added_CrossEntropyLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS, + cfg.TRAIN_HARD_MINING_STEP) + else: + print( + 'unsupported loss funciton. Please choose from [cross_entropy,bce]' + ) + + max_itr = cfg.TRAIN_TOTAL_STEPS + + step = 0 + + if model_resume: + saved_model_ = os.path.join(self.save_res_dir, cfg.TRAIN_RESUME_DIR) + + saved_model_ = paddle.load(saved_model_) + self.model = self.load_network(self.model, saved_model_) + step = int(cfg.RESUME_DIR.split('.')[0].split('_')[-1]) + print('resume from step {}'.format(step)) + + while step < cfg.TRAIN_TOTAL_STEPS: + if step > 100001: + break + t1 = time.time() + if step > 0: + running_time.update(time.time() - t1) + print( + f'{time.asctime()}: new epoch starts. last epoch time: {running_time.avg:.3f} s.', + ) + + for ii, sample in enumerate(trainloader): + now_lr = self._adjust_lr(optimizer, step, max_itr) + + if step >= max_itr: + step += 1 + break + + ref_imgs = sample['ref_img'] # batch_size * 3 * h * w + img1s = sample['img1'] + img2s = sample['img2'] + ref_scribble_labels = sample[ + 'ref_scribble_label'] # batch_size * 1 * h * w + label1s = sample['label1'] + label2s = sample['label2'] + seq_names = sample['meta']['seq_name'] + obj_nums = sample['meta']['obj_num'] + + bs, _, h, w = img2s.shape + inputs = paddle.concat((ref_imgs, img1s, img2s), 0) + if damage_initial_previous_frame_mask: + try: + label1s = damage_masks(label1s) + except: + label1s = label1s + print('damage_error') + + ########## + if self.use_gpu: + inputs = inputs + ref_scribble_labels = ref_scribble_labels + label1s = label1s + label2s = label2s + + ########## + + tmp_dic = self.model(inputs, + ref_scribble_labels, + label1s, + use_local_map=True, + seq_names=seq_names, + gt_ids=obj_nums, + k_nearest_neighbors=cfg.KNNS) + + label_and_obj_dic = {} + label_dic = {} + for i, seq_ in enumerate(seq_names): + label_and_obj_dic[seq_] = (label2s[i], obj_nums[i]) + for seq_ in tmp_dic.keys(): + tmp_pred_logits = tmp_dic[seq_] + tmp_pred_logits = nn.functional.interpolate( + tmp_pred_logits, + size=(h, w), + mode='bilinear', + align_corners=True) + tmp_dic[seq_] = tmp_pred_logits + + label_tmp, obj_num = label_and_obj_dic[seq_] + obj_ids = np.arange(1, obj_num + 1) + obj_ids = paddle.to_tensor(obj_ids) + obj_ids = int_(obj_ids) + if lossfunc == 'bce': + label_tmp = label_tmp.transpose([1, 2, 0]) + label = (float_(label_tmp) == float_(obj_ids)) + label = label.unsqueeze(-1).transpose([3, 2, 0, 1]) + label_dic[seq_] = float_(label) + elif lossfunc == 'cross_entropy': + label_dic[seq_] = long_(label_tmp) + + loss = criterion(tmp_dic, label_dic, step) + loss = loss / bs + optimizer.clear_grad() + loss.backward() + + optimizer.step() + + running_loss.update(loss.item(), bs) + ##############Visulization during training + if step % 50 == 0: + print(time.asctime(), end='\t') + log = 'step:{},now_lr:{} ,loss:{:.4f}({:.4f})'.format( + step, now_lr, running_loss.val, running_loss.avg) + print(log) + # logging.info(log) + + show_ref_img = ref_imgs.numpy()[0] + show_img1 = img1s.numpy()[0] + show_img2 = img2s.numpy()[0] + + mean = np.array([[[0.485]], [[0.456]], [[0.406]]]) + sigma = np.array([[[0.229]], [[0.224]], [[0.225]]]) + + show_ref_img = show_ref_img * sigma + mean + show_img1 = show_img1 * sigma + mean + show_img2 = show_img2 * sigma + mean + + show_gt = label2s[0] + + show_gt = show_gt.squeeze(0).numpy() + show_gtf = label2colormap(show_gt).transpose((2, 0, 1)) + + show_preds = tmp_dic[seq_names[0]] + 
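# ----------------------------------------------------------------------------
# Illustrative aside (not part of this patch): the visualization code below
# turns the per-object logits into a single integer label map before
# colorizing it with label2colormap. For the 'cross_entropy' branch that is
# just a channel-wise argmax; a minimal standalone sketch (shapes are made up
# for the example):
import paddle

def logits_to_label_map(logits):
    """logits: [num_objects, H, W] float tensor -> [H, W] integer label map."""
    return paddle.argmax(logits, axis=0)

# e.g. logits_to_label_map(paddle.rand([3, 416, 416]))  # values in {0, 1, 2}
# ----------------------------------------------------------------------------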
show_preds = nn.functional.interpolate(show_preds, + size=(h, w), + mode='bilinear', + align_corners=True) + show_preds = show_preds.squeeze(0) + if lossfunc == 'bce': + show_preds = (paddle.nn.functional.sigmoid(show_preds) > + 0.5) + show_preds_s = paddle.zeros((h, w)) + for i in range(show_preds.size(0)): + show_preds_s[show_preds[i]] = i + 1 + elif lossfunc == 'cross_entropy': + show_preds_s = paddle.argmax(show_preds, axis=0) + show_preds_s = show_preds_s.numpy() + show_preds_sf = label2colormap(show_preds_s).transpose( + (2, 0, 1)) + + pix_acc = np.sum(show_preds_s == show_gt) / (h * w) + + ###########TODO + if step % 20000 == 0 and step != 0: + self.save_network(self.model, step) + + step += 1 + + def test_VOS(self, use_gpu=True): + seqs = [] + + with open( + os.path.join(cfg.DATA_ROOT, 'ImageSets', '2017', + 'val' + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + seqs.extend(seqs_tmp) + print('model loading...') + saved_model_dict = os.path.join(self.save_res_dir, cfg.TEST_CHECKPOINT) + pretrained_dict = paddle.load(saved_model_dict) + self.model = self.load_network(self.model, pretrained_dict) + print('model load finished') + + self.model.eval() + with paddle.no_grad(): + for seq_name in seqs: + print('prcessing seq:{}'.format(seq_name)) + test_dataset = DAVIS2017_VOS_Test(root=cfg.DATA_ROOT, + transform=tr.ToTensor(), + result_root=cfg.RESULT_ROOT, + seq_name=seq_name) + test_dataloader = DataLoader(test_dataset, + batch_size=1, + shuffle=False, + num_workers=0) + if not os.path.exists(os.path.join(cfg.RESULT_ROOT, seq_name)): + os.makedirs(os.path.join(cfg.RESULT_ROOT, seq_name)) + time_start = time.time() + for ii, sample in enumerate(test_dataloader): + ref_img = sample['ref_img'] + prev_img = sample['prev_img'] + current_img = sample['current_img'] + ref_label = sample['ref_label'] + prev_label = sample['prev_label'] + obj_num = sample['meta']['obj_num'] + seqnames = sample['meta']['seq_name'] + imgname = sample['meta']['current_name'] + bs, _, h, w = current_img.shape + + inputs = paddle.concat((ref_img, prev_img, current_img), 0) + if use_gpu: + inputs = inputs + ref_label = ref_label + prev_label = prev_label + + ################ + t1 = time.time() + tmp = self.model.extract_feature(inputs) + ref_frame_embedding, previous_frame_embedding, current_frame_embedding = paddle.split( + tmp, num_or_sections=3, axis=0) + t2 = time.time() + print('feature_extracter time:{}'.format(t2 - t1)) + tmp_dic = self.model.prop_seghead( + ref_frame_embedding, previous_frame_embedding, + current_frame_embedding, ref_label, prev_label, True, + seqnames, obj_num, cfg.KNNS, self.model.dynamic_seghead) + t3 = time.time() + print('after time:{}'.format(t3 - t2)) + + ####################### + pred_label = tmp_dic[seq_name] + pred_label = nn.functional.interpolate(pred_label, + size=(h, w), + mode='bilinear', + align_corners=True) + + pred_label = paddle.argmax(pred_label, axis=1) + pred_label = pred_label.squeeze(0) + pred_label = pred_label.numpy() + im = Image.fromarray(pred_label.astype('uint8')).convert( + 'P', ) + im.putpalette(_palette) + im.save( + os.path.join(cfg.RESULT_ROOT, seq_name, + imgname[0].split('.')[0] + '.png')) + one_frametime = time.time() + print('seq name:{} frame:{} time:{}'.format( + seq_name, imgname[0], one_frametime - time_start)) + time_start = time.time() + + def load_network(self, net, pretrained_dict): + + # pretrained_dict = pretrained_dict + model_dict = net.state_dict() + # 1. 
filter out unnecessary keys + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() if k in model_dict + } + # 2. overwrite entries in the existing state dict + # for k in model_dict: + # if k not in pretrained_dict: + # print(k, 'not in loaded weights.') + + model_dict.update(pretrained_dict) + net.set_state_dict(model_dict) + return net + + def save_network(self, net, step): + save_path = self.save_res_dir + + if not os.path.exists(save_path): + os.makedirs(save_path) + save_file = 'save_step_%s.pth' % (step) + paddle.save(net.state_dict(), os.path.join(save_path, save_file)) + + def _adjust_lr(self, optimizer, itr, max_itr): + now_lr = cfg.TRAIN_LR * (1 - itr / (max_itr + 1))**cfg.TRAIN_POWER + optimizer._param_groups[0]['lr'] = now_lr + return now_lr + + +_palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128, + 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0, + 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191, + 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24, + 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, + 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, + 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, + 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, + 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, + 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62, + 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, + 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, + 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, + 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, + 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, + 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, + 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, + 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110, + 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115, + 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120, + 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125, + 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130, + 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, + 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140, + 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145, + 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150, + 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155, + 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160, + 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, + 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, + 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175, + 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180, + 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185, + 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190, + 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195, + 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 
199, 199, 199, 200, + 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205, + 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, 209, 210, + 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215, + 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220, + 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225, + 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, + 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, + 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240, + 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245, + 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250, + 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255, + 255, 255 +] + +manager = Manager() + +manager.train() diff --git a/docs/src/applications/Ma-Net/train_stage2.py b/docs/src/applications/Ma-Net/train_stage2.py new file mode 100644 index 000000000..0f1d0f491 --- /dev/null +++ b/docs/src/applications/Ma-Net/train_stage2.py @@ -0,0 +1,612 @@ +import cv2 +import paddle +import paddle.nn as nn +import os +import numpy as np +# from paddle.io import DataLoader +import paddle.optimizer as optim +from paddle.vision import transforms +from dataloaders.davis_2017_f import DAVIS2017_Train +import dataloaders.custom_transforms_f as tr +from dataloaders.samplers import RandomIdentitySampler +from networks.deeplab import DeepLab +from networks.IntVOS import IntVOS +from networks.loss import Added_BCEWithLogitsLoss, Added_CrossEntropyLoss +from config import cfg +from utils.api import float_, long_, byte_ +from utils.meters import AverageMeter +from utils.mask_damaging import damage_masks, mask_damager +from utils.utils import label2colormap +from PIL import Image +import random +import scipy.misc as sm +import time +import davisinteractive.robot.interactive_robot as interactive_robot + +paddle.disable_static() +paddle.device.set_device("gpu:0") + + +class DataLoader(paddle.io.DataLoader): + def __init__(self, + dataset, + batch_size=1, + shuffle=False, + sampler=None, + batch_sampler=None, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + timeout=0, + worker_init_fn=None, + multiprocessing_context=None, + generator=None): + if isinstance(dataset[0], (tuple, list)): + return_list = True + else: + return_list = False + + super().__init__(dataset, + feed_list=None, + places=None, + return_list=return_list, + batch_sampler=batch_sampler, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=collate_fn, + num_workers=num_workers, + use_buffer_reader=True, + use_shared_memory=False, + timeout=timeout, + worker_init_fn=worker_init_fn) + if sampler is not None: + self.batch_sampler.sampler = sampler + + +class Manager(object): + def __init__(self, + use_gpu=True, + time_budget=None, + save_result_dir=cfg.SAVE_RESULT_DIR, + pretrained=True, + interactive_test=False): + + self.save_res_dir = save_result_dir + self.time_budget = time_budget + self.feature_extracter = DeepLab(backbone='resnet') + + if pretrained: + pretrained_dict = paddle.load(cfg.PRETRAINED_MODEL) + pretrained_dict = pretrained_dict['state_dict'] + self.load_network(self.feature_extracter, pretrained_dict) + print('load pretrained model successfully.') + self.model = IntVOS(cfg, self.feature_extracter) + model_filename = cfg.SAVE_VOS_RESULT_DIR + pd = paddle.load(model_filename) + + 
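# ----------------------------------------------------------------------------
# Illustrative aside (not part of this patch): load_network() below performs a
# partial, non-strict restore -- only checkpoint keys that already exist in
# the target model's state dict are copied over; everything else is ignored.
# A minimal standalone version of that pattern:
import paddle

def load_partial_state(layer, checkpoint_state):
    """Copy matching keys from checkpoint_state into layer, skip the rest."""
    model_state = layer.state_dict()
    matched = {k: v for k, v in checkpoint_state.items() if k in model_state}
    model_state.update(matched)
    layer.set_state_dict(model_state)
    return layer

# e.g. load_partial_state(model, paddle.load('stage1.pdparams'))  # example path
# ----------------------------------------------------------------------------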
self.load_network(self.model, pd) + + print('load stage 1 model from', model_filename) + self.use_gpu = use_gpu + if use_gpu: + self.model = self.model + + ################################## + def train(self, + damage_initial_previous_frame_mask=True, + lossfunc='cross_entropy', + model_resume=False, + eval_total=False, + init_prev=False): + ################### + interactor = interactive_robot.InteractiveScribblesRobot() + self.model.train() + running_loss = AverageMeter() + optimizer = optim.Momentum(parameters=[{ + 'params': + self.model.inter_seghead.parameters() + }], + learning_rate=cfg.TRAIN_LR, + momentum=cfg.TRAIN_MOMENTUM, + weight_decay=cfg.TRAIN_WEIGHT_DECAY) + + ################### + + composed_transforms = transforms.Compose([ + tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP), + tr.RandomScale(), + tr.RandomCrop((cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP), 10), + tr.Resize(cfg.DATA_RESCALE), + tr.ToTensor() + ]) + print('dataset processing...') + train_dataset = DAVIS2017_Train(root=cfg.DATA_ROOT, + transform=composed_transforms) + train_list = train_dataset.seqs + + print('dataset processing finished.') + if lossfunc == 'bce': + criterion = Added_BCEWithLogitsLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS, + cfg.TRAIN_HARD_MINING_STEP) + elif lossfunc == 'cross_entropy': + criterion = Added_CrossEntropyLoss(cfg.TRAIN_TOP_K_PERCENT_PIXELS, + cfg.TRAIN_HARD_MINING_STEP) + else: + print( + 'unsupported loss funciton. Please choose from [cross_entropy,bce]' + ) + + max_itr = cfg.TRAIN_TOTAL_STEPS + + step = 0 + round_ = 3 + epoch_per_round = 30 + if model_resume: + saved_model_ = os.path.join(self.save_res_dir, + 'save_step_75000.pth') + + saved_model_ = paddle.load(saved_model_) + self.model = self.load_network(self.model, saved_model_) + step = 75000 + print('resume from step {}'.format(step)) + + while step < cfg.TRAIN_TOTAL_STEPS: + + if step > 80001: + break + + for r in range(round_): + if r == 0: #### r==0: Train the interaction branch in the first round + print('start new') + global_map_tmp_dic = {} + train_dataset.transform = transforms.Compose([ + tr.RandomHorizontalFlip(cfg.DATA_RANDOMFLIP), + tr.RandomScale(), + tr.RandomCrop( + (cfg.DATA_RANDOMCROP, cfg.DATA_RANDOMCROP)), + tr.Resize(cfg.DATA_RESCALE), + tr.ToTensor() + ]) + train_dataset.init_ref_frame_dic() + + trainloader = DataLoader(train_dataset, + sampler=RandomIdentitySampler( + train_dataset.sample_list), + shuffle=False, + batch_size=cfg.TRAIN_BATCH_SIZE, + num_workers=0) + print('round:{} start'.format(r)) + print(len(train_dataset)) + print(len(trainloader)) + + for epoch in range(epoch_per_round): + + for ii, sample in enumerate(trainloader): + now_lr = self._adjust_lr(optimizer, step, max_itr) + ref_imgs = sample['ref_img'] # batch_size * 3 * h * w + ref_scribble_labels = sample[ + 'ref_scribble_label'] # batch_size * 1 * h * w + seq_names = sample['meta']['seq_name'] + obj_nums = sample['meta']['obj_num'] + ref_frame_nums = sample['meta']['ref_frame_num'] + ref_frame_gts = sample['ref_frame_gt'] + bs, _, h, w = ref_imgs.shape + ########## + if self.use_gpu: + inputs = ref_imgs + + ref_scribble_labels = ref_scribble_labels + ref_frame_gts = ref_frame_gts + ########## + with paddle.no_grad(): + self.model.feature_extracter.eval() + self.model.semantic_embedding.eval() + ref_frame_embedding = self.model.extract_feature( + inputs) + if r == 0: + first_inter = True + + tmp_dic = self.model.int_seghead( + ref_frame_embedding=ref_frame_embedding, + ref_scribble_label=ref_scribble_labels, + prev_round_label=None, + 
normalize_nearest_neighbor_distances=True, + global_map_tmp_dic={}, + seq_names=seq_names, + gt_ids=obj_nums, + k_nearest_neighbors=cfg.KNNS, + frame_num=ref_frame_nums, + first_inter=first_inter) + else: + first_inter = False + prev_round_label = sample['prev_round_label'] + prev_round_label = prev_round_label + tmp_dic = self.model.int_seghead( + ref_frame_embedding=ref_frame_embedding, + ref_scribble_label=ref_scribble_labels, + prev_round_label=prev_round_label, + normalize_nearest_neighbor_distances=True, + global_map_tmp_dic={}, + seq_names=seq_names, + gt_ids=obj_nums, + k_nearest_neighbors=cfg.KNNS, + frame_num=ref_frame_nums, + first_inter=first_inter) + label_and_obj_dic = {} + label_dic = {} + for i, seq_ in enumerate(seq_names): + label_and_obj_dic[seq_] = (ref_frame_gts[i], + obj_nums[i]) + for seq_ in tmp_dic.keys(): + tmp_pred_logits = tmp_dic[seq_] + tmp_pred_logits = nn.functional.interpolate( + tmp_pred_logits, + size=(h, w), + mode='bilinear', + align_corners=True) + tmp_dic[seq_] = tmp_pred_logits + + label_tmp, obj_num = label_and_obj_dic[seq_] + obj_ids = np.arange(0, obj_num + 1) + obj_ids = paddle.to_tensor(obj_ids) + obj_ids = paddle.to_tensor(obj_ids, dtype='int64') + if lossfunc == 'bce': + label_tmp = label_tmp.permute(1, 2, 0) + label = (float_(label_tmp) == float_(obj_ids)) + label = label.unsqueeze(-1).permute(3, 2, 0, 1) + label_dic[seq_] = float_(label) + elif lossfunc == 'cross_entropy': + label_dic[seq_] = long_(label_tmp) + + loss = criterion(tmp_dic, label_dic, step) + loss = loss / bs + optimizer.clear_grad() + loss.backward() + optimizer.step() + + running_loss.update(loss.item(), bs) + if step % 50 == 0: + print( + 'step:{},now_lr:{} ,loss:{:.4f}({:.4f})'.format( + step, now_lr, running_loss.val, + running_loss.avg)) + + show_ref_img = ref_imgs.numpy()[0] + + mean = np.array([[[0.485]], [[0.456]], [[0.406]]]) + sigma = np.array([[[0.229]], [[0.224]], [[0.225]]]) + + show_ref_img = show_ref_img * sigma + mean + + show_gt = ref_frame_gts[0].squeeze(0).numpy() + show_gtf = label2colormap(show_gt).transpose( + (2, 0, 1)) + show_scrbble = ref_scribble_labels[0].squeeze( + 0).numpy() + show_scrbble = label2colormap( + show_scrbble).transpose((2, 0, 1)) + if r != 0: + show_prev_round_label = prev_round_label[ + 0].squeeze(0).numpy() + show_prev_round_label = label2colormap( + show_prev_round_label).transpose((2, 0, 1)) + else: + show_prev_round_label = np.zeros_like(show_gt) + + show_prev_round_label = label2colormap( + show_prev_round_label).transpose((2, 0, 1)) + + ########## + show_preds = tmp_dic[seq_names[0]] + show_preds = nn.functional.interpolate( + show_preds, + size=(h, w), + mode='bilinear', + align_corners=True) + show_preds = show_preds.squeeze(0) + if lossfunc == 'bce': + show_preds = show_preds[1:] + + show_preds = ( + paddle.nn.functional.sigmoid(show_preds) > + 0.5) + marker = paddle.argmax(show_preds, axis=0) + show_preds_s = paddle.zeros((h, w)) + for i in range(show_preds.size(0)): + tmp_mask = (marker + == i) & (show_preds[i] > 0.5) + show_preds_s[tmp_mask] = i + 1 + elif lossfunc == 'cross_entropy': + show_preds_s = paddle.argmax(show_preds, axis=0) + show_preds_s = show_preds_s.numpy() + show_preds_sf = label2colormap( + show_preds_s).transpose((2, 0, 1)) + + pix_acc = np.sum(show_preds_s == show_gt) / (h * w) + + ###########TODO + if step % 20000 == 0 and step != 0: + self.save_network(self.model, step) + + step += 1 + + print('trainset evaluating...') + print('*' * 100) + + if cfg.TRAIN_INTER_USE_TRUE_RESULT: + if r != round_ - 1: + 
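# Note: when cfg.TRAIN_INTER_USE_TRUE_RESULT is set, the block below runs
# between interaction rounds (all but the last): the current model predictions
# are generated for every training sequence, the DAVIS interactive robot
# (interactor.interact) synthesizes fresh scribbles by comparing those
# predictions with the ground truth, and
# train_dataset.update_ref_frame_and_label() swaps in the new reference
# frames, scribbles and previous-round labels before training resumes.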
if r == 0: + prev_round_label_dic = {} + self.model.eval() + with paddle.no_grad(): + round_scribble = {} + + frame_num_dic = {} + train_dataset.transform = transforms.Compose( + [tr.Resize(cfg.DATA_RESCALE), + tr.ToTensor()]) + trainloader = DataLoader( + train_dataset, + sampler=RandomIdentitySampler( + train_dataset.sample_list), + shuffle=False, + batch_size=1, + num_workers=0) + for ii, sample in enumerate(trainloader): + ref_imgs = sample[ + 'ref_img'] # batch_size * 3 * h * w + img1s = sample['img1'] + img2s = sample['img2'] + ref_scribble_labels = sample[ + 'ref_scribble_label'] # batch_size * 1 * h * w + label1s = sample['label1'] + label2s = sample['label2'] + seq_names = sample['meta']['seq_name'] + obj_nums = sample['meta']['obj_num'] + frame_nums = sample['meta']['frame_num'] + bs, _, h, w = img2s.shape + inputs = paddle.concat((ref_imgs, img1s, img2s), + 0) + if r == 0: + ref_scribble_labels = self.rough_ROI( + ref_scribble_labels) + print(seq_names[0]) + label1s_tocat = None + for i in range(bs): + l = label1s[i] + l = l.unsqueeze(0) + l = mask_damager(l, 0.0) + l = paddle.to_tensor(l) + + l = l.unsqueeze(0).unsqueeze(0) + + if label1s_tocat is None: + label1s_tocat = float_(l) + else: + label1s_tocat = paddle.concat( + (label1s_tocat, float_(l)), 0) + + label1s = label1s_tocat + if self.use_gpu: + inputs = inputs + ref_scribble_labels = ref_scribble_labels + label1s = label1s + + tmp_dic, global_map_tmp_dic = self.model( + inputs, + ref_scribble_labels, + label1s, + seq_names=seq_names, + gt_ids=obj_nums, + k_nearest_neighbors=cfg.KNNS, + global_map_tmp_dic=global_map_tmp_dic, + frame_num=frame_nums) + pred_label = tmp_dic[ + seq_names[0]].detach().cpu() + pred_label = nn.functional.interpolate( + pred_label, + size=(h, w), + mode='bilinear', + align_corners=True) + pred_label = paddle.argmax(pred_label, axis=1) + pred_label = pred_label.unsqueeze(0) + try: + pred_label = damage_masks(pred_label) + except: + pred_label = pred_label + pred_label = pred_label.squeeze(0) + round_scribble[ + seq_names[0]] = interactor.interact( + seq_names[0], pred_label.numpy(), + float_(label2s).squeeze(0).numpy(), + obj_nums) + frame_num_dic[seq_names[0]] = frame_nums[0] + pred_label = pred_label.unsqueeze(0) + img_ww = Image.open( + os.path.join(cfg.DATA_ROOT, + 'JPEGImages/480p/', + seq_names[0], '00000.jpg')) + img_ww = np.array(img_ww) + or_h, or_w = img_ww.shape[:2] + pred_label = paddle.nn.functional.interpolate( + float_(pred_label), (or_h, or_w), + mode='nearest') + prev_round_label_dic[ + seq_names[0]] = pred_label.squeeze(0) + train_dataset.update_ref_frame_and_label( + round_scribble, frame_num_dic, prev_round_label_dic) + + print(f'round {r}', 'trainset evaluating finished!') + print('*' * 100) + self.model.train() + print('updating ref frame and label') + + train_dataset.transform = composed_transforms + print('updating ref frame and label finished!') + + else: + if r != round_ - 1: + round_scribble = {} + + if r == 0: + prev_round_label_dic = {} + frame_num_dic = {} + train_dataset.transform = tr.ToTensor() + trainloader = DataLoader(train_dataset, + sampler=RandomIdentitySampler( + train_dataset.sample_list), + shuffle=False, + batch_size=1, + num_workers=0) + + self.model.eval() + with paddle.no_grad(): + for ii, sample in enumerate(trainloader): + ref_imgs = sample[ + 'ref_img'] # batch_size * 3 * h * w + img1s = sample['img1'] + img2s = sample['img2'] + ref_scribble_labels = sample[ + 'ref_scribble_label'] # batch_size * 1 * h * w + label1s = sample['label1'] + label2s = 
sample['label2'] + seq_names = sample['meta']['seq_name'] + obj_nums = sample['meta']['obj_num'] + frame_nums = sample['meta']['frame_num'] + bs, _, h, w = img2s.shape + + print(seq_names[0]) + label2s_ = mask_damager(label2s, 0.1) + round_scribble[ + seq_names[0]] = interactor.interact( + seq_names[0], + np.expand_dims(label2s_, axis=0), + float_(label2s).squeeze(0).numpy(), + obj_nums) + label2s__ = paddle.to_tensor(label2s_) + + frame_num_dic[seq_names[0]] = frame_nums[0] + prev_round_label_dic[seq_names[0]] = label2s__ + + print(f'round {r}', 'trainset evaluating finished!') + print('*' * 100) + print('updating ref frame and label') + + train_dataset.update_ref_frame_and_label( + round_scribble, frame_num_dic, prev_round_label_dic) + self.model.train() + train_dataset.transform = composed_transforms + print('updating ref frame and label finished!') + + ############################################# + + def rough_ROI(self, ref_scribble_labels): + #### b*1*h*w + dist = 15 + b, _, h, w = ref_scribble_labels.shape + filter_ = paddle.zeros_like(ref_scribble_labels) + to_fill = paddle.zeros_like(ref_scribble_labels) + for i in range(b): + no_background = (ref_scribble_labels[i] != -1) + no_background = no_background.squeeze(0) + + no_b = no_background.nonzero() + h_min, w_min = paddle.min(no_b, 0) # fixed + h_max, w_max = paddle.max(no_b, 0) # fixed + + filter_[i, 0, + max(h_min - dist, 0):min(h_max + dist, h - 1), + max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1 + + final_scribble_labels = paddle.where(byte_(filter_), + ref_scribble_labels, + to_fill) # uint8_ fixed. + return final_scribble_labels + + def load_network(self, net, pretrained_dict): + + # pretrained_dict = pretrained_dict + model_dict = net.state_dict() + # 1. filter out unnecessary keys + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() if k in model_dict + } + # 2. 
overwrite entries in the existing state dict + # for k in model_dict: + # if k not in pretrained_dict: + # print(k, 'not in loaded weights.') + + model_dict.update(pretrained_dict) + net.set_state_dict(model_dict) + return net + + def save_network(self, net, step): + save_path = self.save_res_dir + + if not os.path.exists(save_path): + os.makedirs(save_path) + save_file = 'save_step_%s.pth' % (step) + paddle.save(net.state_dict(), os.path.join(save_path, save_file)) + + def _adjust_lr(self, optimizer, itr, max_itr): + now_lr = cfg.TRAIN_LR * (1 - itr / (max_itr + 1))**cfg.TRAIN_POWER + optimizer._param_groups[0]['lr'] = now_lr + return now_lr + + +_palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128, + 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, 128, 0, 64, 0, + 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, 0, 128, 64, 0, 0, 191, + 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, 22, 22, 23, 23, 23, 24, 24, + 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, + 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, + 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, + 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, + 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, + 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62, + 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, + 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, + 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, + 81, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, + 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, + 94, 94, 95, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, + 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, + 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, 110, + 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115, + 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 120, + 120, 120, 121, 121, 121, 122, 122, 122, 123, 123, 123, 124, 124, 124, 125, + 125, 125, 126, 126, 126, 127, 127, 127, 128, 128, 128, 129, 129, 129, 130, + 130, 130, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, + 135, 135, 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140, + 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, 144, 145, + 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, 149, 149, 149, 150, + 150, 150, 151, 151, 151, 152, 152, 152, 153, 153, 153, 154, 154, 154, 155, + 155, 155, 156, 156, 156, 157, 157, 157, 158, 158, 158, 159, 159, 159, 160, + 160, 160, 161, 161, 161, 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, + 165, 165, 166, 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, + 170, 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, 175, + 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, 179, 179, 180, + 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, 183, 184, 184, 184, 185, + 185, 185, 186, 186, 186, 187, 187, 187, 188, 188, 188, 189, 189, 189, 190, + 190, 190, 191, 191, 191, 192, 192, 192, 193, 193, 193, 194, 194, 194, 195, + 195, 195, 196, 196, 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, + 200, 200, 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205, + 205, 205, 206, 206, 206, 
207, 207, 207, 208, 208, 208, 209, 209, 209, 210, + 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215, + 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220, + 220, 220, 221, 221, 221, 222, 222, 222, 223, 223, 223, 224, 224, 224, 225, + 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, + 230, 230, 231, 231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, + 235, 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, 240, + 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, 244, 244, 245, + 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250, + 250, 250, 251, 251, 251, 252, 252, 252, 253, 253, 253, 254, 254, 254, 255, + 255, 255 +] + +manager = Manager() +manager.train() diff --git a/docs/src/applications/Ma-Net/utils/api.py b/docs/src/applications/Ma-Net/utils/api.py new file mode 100644 index 000000000..bb9d84349 --- /dev/null +++ b/docs/src/applications/Ma-Net/utils/api.py @@ -0,0 +1,857 @@ +import math +import warnings + +import numpy +import numpy as np +from numpy import inf +from paddle import Tensor, concat, reshape, nn +import paddle + +from typing import Union, Iterable + +_tensor_or_tensors = Union[paddle.Tensor, Iterable[paddle.Tensor]] + +import paddle +import PIL +import numbers +import numpy as np +from PIL import Image +from paddle.vision.transforms import BaseTransform +from paddle.vision.transforms import functional as F + + +def torch2paddle(data): + try: + import torch + if isinstance(data, dict): + np_data = {} + for k, v in data.items(): + np_data[k] = paddle.to_tensor(v.detach().numpy()) + return np_data + else: + return paddle.to_tensor(data.detach().numpy()) + except: + pass + + +def fill_(tensor: Tensor, value): + return tensor.set_value(paddle.full_like(tensor, value)) + + +def zero_(tensor: Tensor): + return tensor.set_value(paddle.zeros_like(tensor)) + + +def float_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='float32') + + +def long_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='int64') + + +def int_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='int32') + + +def byte_(tensor: Tensor): + return paddle.to_tensor(tensor, dtype='bool') + + +class ToPILImage(BaseTransform): + def __init__(self, mode=None, keys=None): + super(ToPILImage, self).__init__(keys) + + def _apply_image(self, pic): + """ + Args: + pic (Tensor|np.ndarray): Image to be converted to PIL Image. + Returns: + PIL: Converted image. + """ + if not (isinstance(pic, paddle.Tensor) or isinstance(pic, np.ndarray)): + raise TypeError('pic should be Tensor or ndarray. Got {}.'.format( + type(pic))) + + elif isinstance(pic, paddle.Tensor): + if pic.ndimension() not in {2, 3}: + raise ValueError( + 'pic should be 2/3 dimensional. Got {} dimensions.'.format( + pic.ndimension())) + + elif pic.ndimension() == 2: + # if 2D image, add channel dimension (CHW) + pic = pic.unsqueeze(0) + + elif isinstance(pic, np.ndarray): + if pic.ndim not in {2, 3}: + raise ValueError( + 'pic should be 2/3 dimensional. 
Got {} dimensions.'.format( + pic.ndim)) + + elif pic.ndim == 2: + # if 2D image, add channel dimension (HWC) + pic = np.expand_dims(pic, 2) + + npimg = pic + if isinstance(pic, paddle.Tensor) and "float" in str( + pic.numpy().dtype) and self.mode != 'F': + pic = pic.mul(255).byte() + if isinstance(pic, paddle.Tensor): + npimg = np.transpose(pic.numpy(), (1, 2, 0)) + + if not isinstance(npimg, np.ndarray): + raise TypeError( + 'Input pic must be a paddle.Tensor or NumPy ndarray, ' + + 'not {}'.format(type(npimg))) + + if npimg.shape[2] == 1: + expected_mode = None + npimg = npimg[:, :, 0] + if npimg.dtype == np.uint8: + expected_mode = 'L' + elif npimg.dtype == np.int16: + expected_mode = 'I;16' + elif npimg.dtype == np.int32: + expected_mode = 'I' + elif npimg.dtype == np.float32: + expected_mode = 'F' + if self.mode is not None and self.mode != expected_mode: + raise ValueError( + "Incorrect self.mode ({}) supplied for input type {}. Should be {}" + .format(self.mode, np.dtype, expected_mode)) + self.mode = expected_mode + + elif npimg.shape[2] == 2: + permitted_2_channel_modes = ['LA'] + if self.mode is not None and self.mode not in permitted_2_channel_modes: + raise ValueError( + "Only self.modes {} are supported for 2D inputs".format( + permitted_2_channel_modes)) + + if self.mode is None and npimg.dtype == np.uint8: + self.mode = 'LA' + + elif npimg.shape[2] == 4: + permitted_4_channel_modes = ['RGBA', 'CMYK', 'RGBX'] + if self.mode is not None and self.mode not in permitted_4_channel_modes: + raise ValueError( + "Only self.modes {} are supported for 4D inputs".format( + permitted_4_channel_modes)) + + if self.mode is None and npimg.dtype == np.uint8: + self.mode = 'RGBA' + else: + permitted_3_channel_modes = ['RGB', 'YCbCr', 'HSV'] + if self.mode is not None and self.mode not in permitted_3_channel_modes: + raise ValueError( + "Only self.modes {} are supported for 3D inputs".format( + permitted_3_channel_modes)) + if self.mode is None and npimg.dtype == np.uint8: + self.mode = 'RGB' + + if self.mode is None: + raise TypeError('Input type {} is not supported'.format( + npimg.dtype)) + + return Image.fromarray(npimg, mode=self.mode) + + +class Identity(nn.Layer): + r"""A placeholder identity operator that is argument-insensitive. + + Args: + args: any argument (unused) + kwargs: any keyword argument (unused) + """ + def __init__(self, *args, **kwargs): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +def convert(data: dict, to, dtype=None): + assert isinstance(data, dict) + input = {} + for k, v in data.items(): + + if 'paddle' == to: + if isinstance(v, np.ndarray): + if dtype is not None: + input[k] = paddle.to_tensor(v.astype(dtype)) + else: + input[k] = paddle.to_tensor(v) + else: + input[k] = v + elif 'torch' == to: + try: + import torch + if isinstance(v, np.ndarray): + if dtype is not None: + input[k] = torch.tensor(v.astype(dtype)) + else: + input[k] = torch.tensor(v) + else: + input[k] = v + except: + pass + else: + if isinstance(v, np.ndarray): + input[k] = v.astype(to) + else: + input[k] = v + return input + + +def clip_grad_norm_(parameters: _tensor_or_tensors, + max_norm: float, + norm_type: float = 2.0, + error_if_nonfinite: bool = False) -> paddle.Tensor: + r"""Clips gradient norm of an iterable of parameters. + + The norm is computed over all gradients together, as if they were + concatenated into a single vector. Gradients are modified in-place. 
+ + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + error_if_nonfinite (bool): if True, an error is thrown if the total + norm of the gradients from :attr:``parameters`` is ``nan``, + ``inf``, or ``-inf``. Default: False (will switch to True in the future) + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + import time + if isinstance(parameters, paddle.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + detached_grads = [p.grad.detach() for p in parameters] + + max_norm = float(max_norm) + norm_type = float(norm_type) + if len(parameters) == 0: + return paddle.to_tensor(0.) + if norm_type == inf: + norms = [p.abs().max() for p in parameters] + total_norm = norms[0] if len(norms) == 1 else paddle.max( + paddle.stack(norms)) + else: + total_norm = paddle.norm( + paddle.stack([paddle.norm(g, norm_type) for g in detached_grads]), + norm_type) + if error_if_nonfinite and paddle.logical_or(total_norm.isnan(), + total_norm.isinf()): + raise RuntimeError( + f'The total norm of order {norm_type} for gradients from ' + '`parameters` is non-finite, so it cannot be clipped. To disable ' + 'this error and scale the gradients by the non-finite norm anyway, ' + 'set `error_if_nonfinite=False`') + clip_coef = max_norm / (total_norm + 1e-6) + # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so + # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization + # when the gradients do not reside in CPU memory. + clip_coef_clamped = paddle.clip(clip_coef, max=1.0) + for i, p in enumerate(parameters): + p.grad.set_value(detached_grads[i] * clip_coef_clamped) # fixed + return total_norm + + +def max(a: paddle.Tensor, axis=0, keepdim=True): + """ndarray=numpy.array([[1, 2, 3, 4], + [4, 3, 2, 1], + [5, 6, 7, 8], + [8, 7, 6, 5]]) + np.where(ndarray == np.max(ndarray)) + (array([2, 3]), array([3, 0])) + ndarray[np.where(ndarray == np.max(ndarray))] + array([8, 8]) + """ + max_ = a.max(axis).unsqueeze(-1) + index = paddle.argmax(a, axis=axis, keepdim=keepdim) + max_ = max_.numpy() + index = index.numpy() + # index = paddle.argmax(a, axis=axis, keepdim=keepdim)[-1].flatten() + return max_, index + + +def gather(tmp: paddle.Tensor, ind: paddle.Tensor): + shape = tmp.shape + tmp = paddle.to_tensor(tmp) + ind = paddle.to_tensor(ind) + if len(shape) == 2: + b = shape[0] + return concat([ + reshape(paddle.gather(tmp[i, :], ind[i, :]), [1, -1]) + for i in range(b) + ], + axis=0) + elif len(shape) == 3: + out = [] + for i in range(tmp.shape[0]): + _ = paddle.index_sample(tmp[i], ind[i]) + out.append(_) + return paddle.to_tensor(out) + elif len(shape) == 4: + b, c, d = shape[:3] + return concat([ + reshape( + concat([ + reshape( + concat([ + reshape( + paddle.gather(tmp[i, j, k, :], ind[i, j, k, :]), + [1, -1]) for k in range(d) + ], + axis=0), [1, d, -1]) for j in range(c) + ], + axis=0), [1, c, d, -1]) for i in range(b) + ], + axis=0) + else: + pass + + +# These no_grad_* functions are necessary as wrappers around the parts of these +# functions that use `with torch.no_grad()`. The JIT doesn't support context +# managers, so these need to be implemented as builtins. Using these wrappers +# lets us keep those builtins small and re-usable. 
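The `clip_grad_norm_` port above mirrors `torch.nn.utils.clip_grad_norm_`: it computes one global norm over all gradients and rescales them in place whenever that norm exceeds `max_norm`, returning the pre-clipping norm. A minimal usage sketch, not part of the original file, assuming the helpers are importable as `utils.api`; Paddle's built-in `paddle.nn.ClipGradByGlobalNorm` attached to the optimizer is the idiomatic alternative:

```python
import paddle
from utils.api import clip_grad_norm_  # assumed import path for the file above

model = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())

x = paddle.randn([8, 4])
loss = (model(x) ** 2).mean()
loss.backward()

# Rescale every gradient in place so their joint L2 norm is at most 1.0.
total_norm = clip_grad_norm_(model.parameters(), max_norm=1.0)
print(float(total_norm))  # global gradient norm before clipping

opt.step()
opt.clear_grad()
```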
+def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value(paddle.uniform(tensor.shape, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean, std): + with paddle.no_grad(): + tensor.set_value(paddle.normal(shape=tensor.shape, mean=mean, std=std)) + return tensor + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + from scipy import special + + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.set_value( + paddle.uniform(tensor.shape, min=2 * l - 1, max=2 * u - 1)) + # tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.set_value(special.erfinv(tensor)) + + # Transform to proper mean, std + tensor.set_value(tensor.multiply(paddle.to_tensor(std * math.sqrt(2.)))) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clip_(min=a, max=b) + return tensor + + +def _no_grad_fill_(tensor, val): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, fill_value=val)) + return tensor + + +def _no_grad_zero_(tensor): + with paddle.no_grad(): + tensor.set_value(paddle.zeros_like(tensor)) + return tensor + + +def calculate_gain(nonlinearity, param=None): + r"""Return the recommended gain value for the given nonlinearity function. 
+ The values are as follows: + + ================= ==================================================== + nonlinearity gain + ================= ==================================================== + Linear / Identity :math:`1` + Conv{1,2,3}D :math:`1` + Sigmoid :math:`1` + Tanh :math:`\frac{5}{3}` + ReLU :math:`\sqrt{2}` + Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}` + SELU :math:`\frac{3}{4}` + ================= ==================================================== + + Args: + nonlinearity: the non-linear function (`nn.functional` name) + param: optional parameter for the non-linear function + + Examples: + >>> gain = nn.init.calculate_gain('leaky_relu', 0.2) # leaky_relu with negative_slope=0.2 + """ + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 # Value found empirically (https://github.com/pytorch/pytorch/pull/50664) + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def uniform_(tensor: Tensor, a: float = 0., b: float = 1.) -> Tensor: + r"""Fills the input Tensor with values drawn from the uniform + distribution :math:`\mathcal{U}(a, b)`. + + Args: + tensor: an n-dimensional `torch.Tensor` + a: the lower bound of the uniform distribution + b: the upper bound of the uniform distribution + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.uniform_(w) + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> Tensor: + r"""Fills the input Tensor with values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.normal_(w) + """ + return _no_grad_normal_(tensor, mean, std) + + +def trunc_normal_(tensor: Tensor, + mean: float = 0., + std: float = 1., + a: float = -2., + b: float = 2.) -> Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def constant_(tensor: Tensor, val: float) -> Tensor: + r"""Fills the input Tensor with the value :math:`\text{val}`. 
+ + Args: + tensor: an n-dimensional `torch.Tensor` + val: the value to fill the tensor with + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.constant_(w, 0.3) + """ + return _no_grad_fill_(tensor, val) + + +def ones_(tensor: Tensor) -> Tensor: + r"""Fills the input Tensor with the scalar value `1`. + + Args: + tensor: an n-dimensional `torch.Tensor` + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.ones_(w) + """ + return _no_grad_fill_(tensor, 1.) + + +def zeros_(tensor: Tensor) -> Tensor: + r"""Fills the input Tensor with the scalar value `0`. + + Args: + tensor: an n-dimensional `torch.Tensor` + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.zeros_(w) + """ + return _no_grad_zero_(tensor) + + +def eye_(tensor): + r"""Fills the 2-dimensional input `Tensor` with the identity + matrix. Preserves the identity of the inputs in `Linear` layers, where as + many inputs are preserved as possible. + + Args: + tensor: a 2-dimensional `torch.Tensor` + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.eye_(w) + """ + if tensor.ndimension() != 2: + raise ValueError("Only tensors with 2 dimensions are supported") + + with paddle.no_grad(): + tensor.set_value(paddle.eye(*tensor.shape)) + return tensor + + +def dirac_(tensor, groups=1): + r"""Fills the {3, 4, 5}-dimensional input `Tensor` with the Dirac + delta function. Preserves the identity of the inputs in `Convolutional` + layers, where as many input channels are preserved as possible. In case + of groups>1, each group of channels preserves identity + + Args: + tensor: a {3, 4, 5}-dimensional `torch.Tensor` + groups (optional): number of groups in the conv layer (default: 1) + Examples: + >>> w = torch.empty(3, 16, 5, 5) + >>> nn.init.dirac_(w) + >>> w = torch.empty(3, 24, 5, 5) + >>> nn.init.dirac_(w, 3) + """ + dimensions = tensor.ndimension() + if dimensions not in [3, 4, 5]: + raise ValueError( + "Only tensors with 3, 4, or 5 dimensions are supported") + + sizes = tensor.shape + + if sizes[0] % groups != 0: + raise ValueError('dim 0 must be divisible by groups') + + out_chans_per_grp = sizes[0] // groups + min_dim = min(out_chans_per_grp, sizes[1]) + + with paddle.no_grad(): + tensor.zero_() + + for g in range(groups): + for d in range(min_dim): + if dimensions == 3: # Temporal convolution + tensor[g * out_chans_per_grp + d, d, + tensor.shape[2] // 2] = 1 + elif dimensions == 4: # Spatial convolution + tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2, + tensor.shape[3] // 2] = 1 + else: # Volumetric convolution + tensor[g * out_chans_per_grp + d, d, tensor.shape[2] // 2, + tensor.shape[3] // 2, tensor.shape[4] // 2] = 1 + return tensor + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = tensor.dim() + if dimensions < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + num_input_fmaps = tensor.shape[1] # .size(1) + num_output_fmaps = tensor.shape[0] # .size(0) + receptive_field_size = 1 + if tensor.dim() > 2: + for s in tensor.shape[2:]: + receptive_field_size *= s # fixed + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def LongTensor(x): + return paddle.to_tensor(x, dtype='int64') + + +def IntTensor(x): + return paddle.to_tensor(x, dtype='int32') + + +def xavier_uniform_(tensor: Tensor, gain: float = 1.) 
-> Tensor: + r"""Fills the input `Tensor` with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform + distribution. The resulting tensor will have values sampled from + :math:`\mathcal{U}(-a, a)` where + + .. math:: + a = \text{gain} \times \sqrt{\frac{6}{\text{fan\_in} + \text{fan\_out}}} + + Also known as Glorot initialization. + + Args: + tensor: an n-dimensional `torch.Tensor` + gain: an optional scaling factor + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu')) + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + + return _no_grad_uniform_(tensor, -a, a) + + +def xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor: + r"""Fills the input `Tensor` with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal + distribution. The resulting tensor will have values sampled from + :math:`\mathcal{N}(0, \text{std}^2)` where + + .. math:: + \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan\_in} + \text{fan\_out}}} + + Also known as Glorot initialization. + + Args: + tensor: an n-dimensional `torch.Tensor` + gain: an optional scaling factor + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.xavier_normal_(w) + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + + return _no_grad_normal_(tensor, 0., std) + + +def _calculate_correct_fan(tensor, mode): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + return fan_in if mode == 'fan_in' else fan_out + + +def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'): + r"""Fills the input `Tensor` with values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification` - He, K. et al. (2015), using a + uniform distribution. The resulting tensor will have values sampled from + :math:`\mathcal{U}(-\text{bound}, \text{bound})` where + + .. math:: + \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + tensor: an n-dimensional `torch.Tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. + nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). 
+ + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu') + """ + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + bound = math.sqrt( + 3.0) * std # Calculate uniform bounds from standard deviation + with paddle.no_grad(): + tensor.set_value(paddle.uniform(tensor.shape, min=-bound, max=bound)) + return tensor + + +def kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'): + r"""Fills the input `Tensor` with values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification` - He, K. et al. (2015), using a + normal distribution. The resulting tensor will have values sampled from + :math:`\mathcal{N}(0, \text{std}^2)` where + + .. math:: + \text{std} = \frac{\text{gain}}{\sqrt{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + tensor: an n-dimensional `torch.Tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. + nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). + + Examples: + >>> w = torch.empty(3, 5) + >>> kaiming_normal_(w, mode='fan_out', nonlinearity='relu') + """ + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + with paddle.no_grad(): + tensor.set_value(paddle.normal(shape=tensor.shape, mean=0, std=std)) + return tensor + + +def orthogonal_(tensor, gain=1): + r"""Fills the input `Tensor` with a (semi) orthogonal matrix, as + described in `Exact solutions to the nonlinear dynamics of learning in deep + linear neural networks` - Saxe, A. et al. (2013). The input tensor must have + at least 2 dimensions, and for tensors with more than 2 dimensions the + trailing dimensions are flattened. + + Args: + tensor: an n-dimensional `torch.Tensor`, where :math:`n \geq 2` + gain: optional scaling factor + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.orthogonal_(w) + """ + if tensor.ndimension() < 2: + raise ValueError("Only tensors with 2 or more dimensions are supported") + + rows = tensor.shape[0] # .size(0) + cols = tensor.numel() // rows + flattened = tensor.new(rows, cols).normal_(0, 1) + + if rows < cols: + flattened.t_() + + # Compute the qr factorization + q, r = paddle.to_tensor(np.linalg.qr(flattened.numpy())) + # q, r = torch.qr(flattened) + # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf + d = paddle.diag(r, 0) + ph = d.sign() + q *= ph + + if rows < cols: + q.t_() + + with paddle.no_grad(): + tensor.view_as(q).copy_(q) + tensor.mul_(gain) + return tensor + + +def sparse_(tensor, sparsity, std=0.01): + r"""Fills the 2D input `Tensor` as a sparse matrix, where the + non-zero elements will be drawn from the normal distribution + :math:`\mathcal{N}(0, 0.01)`, as described in `Deep learning via + Hessian-free optimization` - Martens, J. (2010). 
+ + Args: + tensor: an n-dimensional `torch.Tensor` + sparsity: The fraction of elements in each column to be set to zero + std: the standard deviation of the normal distribution used to generate + the non-zero values + + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.sparse_(w, sparsity=0.1) + """ + if tensor.ndimension() != 2: + raise ValueError("Only tensors with 2 dimensions are supported") + + rows, cols = tensor.shape + num_zeros = int(math.ceil(sparsity * rows)) + + with paddle.no_grad(): + tensor.normal_(0, std) + for col_idx in range(cols): + row_indices = paddle.randperm(rows) + zero_indices = row_indices[:num_zeros] + tensor[zero_indices, col_idx] = 0 + return tensor + + +# for backward compatibility +def _make_deprecate(meth): + new_name = meth.__name__ + old_name = new_name[:-1] + + def deprecated_init(*args, **kwargs): + warnings.warn( + "nn.init.{} is now deprecated in favor of nn.init.{}.".format( + old_name, new_name), + stacklevel=2) + return meth(*args, **kwargs) + + deprecated_init.__doc__ = r""" + {old_name}(...) + + .. warning:: + This method is now deprecated in favor of :func:`torch.nn.init.{new_name}`. + + See :func:`~torch.nn.init.{new_name}` for details.""".format( + old_name=old_name, new_name=new_name) + deprecated_init.__name__ = old_name + return deprecated_init diff --git a/docs/src/applications/Ma-Net/utils/mask_damaging.py b/docs/src/applications/Ma-Net/utils/mask_damaging.py new file mode 100644 index 000000000..12480294b --- /dev/null +++ b/docs/src/applications/Ma-Net/utils/mask_damaging.py @@ -0,0 +1,170 @@ +import numpy as np +from scipy.ndimage import interpolation +try: + from skimage import morphology, transform +except ImportError as e: + print( + f"{e}, [scikit-image] package and it's dependencies is required for MA-Net." + ) +import paddle +import cv2 +import random + + +#### +def mask_damager(labels=None, p_black=0.2): + scales = (0.8, 1.0, 1.2) + kernel_size = random.randint(10, 15) + kernel = np.ones((kernel_size, kernel_size), np.uint8) + if random.random() < p_black: + final_label = paddle.zeros_like(labels) + final_label = final_label.squeeze().numpy() + else: + prot = random.randint(5, 15) + nrot = random.randint(-15, -5) + rots = [prot, nrot, 0] + rot = rots[random.randint(0, 2)] + + sc = scales[random.randint(0, 2)] + _, _, h, w = labels.shape + tmp = labels.squeeze() + + tmp = tmp.unsqueeze(-1) + tmp = tmp.numpy().astype(np.uint8) + morph_p = random.random() + if morph_p < 0.5: + tmp = cv2.morphologyEx(tmp, cv2.MORPH_OPEN, kernel) + else: + tmp = cv2.morphologyEx(tmp, cv2.MORPH_CLOSE, kernel) + + tmp = tmp.astype(np.uint8) + center = (w / 2, h / 2) + M = cv2.getRotationMatrix2D(center, rot, sc) + final_label = cv2.warpAffine(tmp, M, (w, h), cv2.INTER_NEAREST) + + return final_label + + +##### + + +def damage_masks(labels, shift=True, scale=True, rotate=True): + """ + Args: + labels: numpy array (batch_size * 1 * h * w) + """ + bs, _, h, w = labels.shape + labels = labels.transpose([0, 2, 3, 1]) + labels = labels.numpy() + final_label = [] + for i in range(bs): + label = labels[i] + damaged_label = damage_masks_np(label, shift, scale, rotate) + final_label.append(damaged_label) + final_label = np.array(final_label) + final_label = paddle.to_tensor(final_label) + final_label = final_label.transpose([0, 3, 1, 2]) + return final_label + + +def damage_masks_np(labels, shift=True, scale=True, rotate=True): + """Performs the actual mask damaging in numpy. + Args: + labels: Int32 numpy array of shape (height, width, 1). 
+ shift: Boolean, whether to damage the masks by shifting. + scale: Boolean, whether to damage the masks by scaling. + rotate: Boolean, whether to damage the masks by rotation. + dilate: Boolean, whether to damage the masks by dilation. + Returns: + The damaged version of labels. + """ + unique_labels = np.unique(labels) + unique_labels = np.setdiff1d(unique_labels, [0]) + # Shuffle to get random depth ordering when combining together. + np.random.shuffle(unique_labels) + damaged_labels = np.zeros_like(labels) + for l in unique_labels: + obj_mask = (labels == l) + damaged_obj_mask = _damage_single_object_mask(obj_mask, shift, scale, + rotate) + damaged_labels[damaged_obj_mask] = l + return damaged_labels + + +def _damage_single_object_mask(mask, shift, scale, rotate): + """Performs mask damaging in numpy for a single object. + Args: + mask: Boolean numpy array of shape(height, width, 1). + shift: Boolean, whether to damage the masks by shifting. + scale: Boolean, whether to damage the masks by scaling. + rotate: Boolean, whether to damage the masks by rotation. + dilate: Boolean, whether to damage the masks by dilation. + Returns: + The damaged version of mask. + """ + if shift: + mask = _shift_mask(mask) + if scale: + mask = _scale_mask(mask) + if rotate: + mask = _rotate_mask(mask) + return mask + + +def _shift_mask(mask, max_shift_factor=0.05): + """Damages a mask for a single object by randomly shifting it in numpy. + Args: + mask: Boolean numpy array of shape(height, width, 1). + max_shift_factor: Float scalar, the maximum factor for random shifting. + Returns: + The shifted version of mask. + """ + nzy, nzx, _ = mask.nonzero() + h = nzy.max() - nzy.min() + w = nzx.max() - nzx.min() + size = np.sqrt(h * w) + offset = np.random.uniform(-size * max_shift_factor, + size * max_shift_factor, 2) + shifted_mask = interpolation.shift(np.squeeze(mask, axis=2), + offset, + order=0).astype('bool')[..., np.newaxis] + return shifted_mask + + +def _scale_mask(mask, scale_amount=0.025): + """Damages a mask for a single object by randomly scaling it in numpy. + Args: + mask: Boolean numpy array of shape(height, width, 1). + scale_amount: Float scalar, the maximum factor for random scaling. + Returns: + The scaled version of mask. + """ + nzy, nzx, _ = mask.nonzero() + cy = 0.5 * (nzy.max() - nzy.min()) + cx = 0.5 * (nzx.max() - nzx.min()) + scale_factor = np.random.uniform(1.0 - scale_amount, 1.0 + scale_amount) + shift = transform.SimilarityTransform(translation=[-cx, -cy]) + inv_shift = transform.SimilarityTransform(translation=[cx, cy]) + s = transform.SimilarityTransform(scale=[scale_factor, scale_factor]) + m = (shift + (s + inv_shift)).inverse + scaled_mask = transform.warp(mask, m) > 0.5 + return scaled_mask + + +def _rotate_mask(mask, max_rot_degrees=3.0): + """Damages a mask for a single object by randomly rotating it in numpy. + Args: + mask: Boolean numpy array of shape(height, width, 1). + max_rot_degrees: Float scalar, the maximum number of degrees to rotate. + Returns: + The scaled version of mask. 
+ """ + cy = 0.5 * mask.shape[0] + cx = 0.5 * mask.shape[1] + rot_degrees = np.random.uniform(-max_rot_degrees, max_rot_degrees) + shift = transform.SimilarityTransform(translation=[-cx, -cy]) + inv_shift = transform.SimilarityTransform(translation=[cx, cy]) + r = transform.SimilarityTransform(rotation=np.deg2rad(rot_degrees)) + m = (shift + (r + inv_shift)).inverse + scaled_mask = transform.warp(mask, m) > 0.5 + return scaled_mask diff --git a/docs/src/applications/Ma-Net/utils/meters.py b/docs/src/applications/Ma-Net/utils/meters.py new file mode 100644 index 000000000..c5cca45e1 --- /dev/null +++ b/docs/src/applications/Ma-Net/utils/meters.py @@ -0,0 +1,22 @@ +from __future__ import absolute_import + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count diff --git a/docs/src/applications/Ma-Net/utils/utils.py b/docs/src/applications/Ma-Net/utils/utils.py new file mode 100644 index 000000000..a52f6cfa7 --- /dev/null +++ b/docs/src/applications/Ma-Net/utils/utils.py @@ -0,0 +1,12 @@ +import numpy as np + + +def label2colormap(label): + + m = label.astype(np.uint8) + r, c = m.shape + cmap = np.zeros((r, c, 3), dtype=np.uint8) + cmap[:, :, 0] = (m & 1) << 7 | (m & 8) << 3 | (m & 64) >> 1 + cmap[:, :, 1] = (m & 2) << 6 | (m & 16) << 2 | (m & 128) >> 2 + cmap[:, :, 2] = (m & 4) << 5 | (m & 32) << 1 + return cmap diff --git a/docs/src/applications/MultimodalVideoTag/README.md b/docs/src/applications/MultimodalVideoTag/README.md new file mode 100644 index 000000000..eef56f1e2 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/README.md @@ -0,0 +1,77 @@ +# MutimodalVideoTag 多模态视频分类模型 +--- +## 内容 +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型评估](#模型评估) +- [模型推理](#模型推理) +- [模型优化](#模型优化) +- [模型部署](#模型部署) +- [参考论文](#参考论文) + + +## 模型简介 + +该代码库用于多模态场景下视频分类任务,基于paddle2.0版本开发,模型基于真实短视频业务数据,融合文本、视频图像、音频三种模态进行视频多模标签分类,相比纯视频图像特征,显著提升高层语义标签效果。其原理示意如下图所示。 + +

+Figure: MultimodalVideoTag multimodal video classification model architecture diagram
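The bullet list that follows walks through the pipeline (frame/audio/text features, per-modality LSTMs, text-guided attention pooling, concatenation, sigmoid multi-label head). As a rough illustration of the text-guided fusion idea only, here is a self-contained sketch; it is not the repository's implementation, and the feature dimensions and class count are made up for illustration:

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class TextGuidedPool(nn.Layer):
    """Pool a per-frame feature sequence with attention weights conditioned on the text feature."""
    def __init__(self, seq_dim, text_dim, hidden=128):
        super().__init__()
        self.proj_seq = nn.Linear(seq_dim, hidden)
        self.proj_text = nn.Linear(text_dim, hidden)
        self.score = nn.Linear(hidden, 1)

    def forward(self, seq_feat, text_feat):
        # seq_feat: [B, T, seq_dim] (e.g. LSTM outputs), text_feat: [B, text_dim]
        h = paddle.tanh(self.proj_seq(seq_feat) + self.proj_text(text_feat).unsqueeze(1))
        attn = F.softmax(self.score(h), axis=1)        # [B, T, 1] time-step weights
        return (attn * seq_feat).sum(axis=1)           # [B, seq_dim] pooled vector

# Toy shapes: 2 clips, 10 frames of 2048-d image features, 10 windows of 128-d audio
# features, one 128-d text vector per clip; 25 classes is an arbitrary number.
video = paddle.randn([2, 10, 2048])
audio = paddle.randn([2, 10, 128])
text = paddle.randn([2, 128])

video_vec = TextGuidedPool(2048, 128)(video, text)
audio_vec = TextGuidedPool(128, 128)(audio, text)
fused = paddle.concat([video_vec, audio_vec, text], axis=-1)   # late fusion by concatenation
probs = F.sigmoid(nn.Linear(fused.shape[-1], 25)(fused))       # independent per-label scores
```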

    + +- 数据处理:分别对视频三个模态的数据进行处理,对视频进行抽帧,获得图像序列;抽取视频的音频pcm 文件;收集视频标题,简单进行文本长度截断,一般取50个字。 +- 特征抽取:使用预训练的 ResNet 对图像抽取高层语义特征;使用预训练的VGGish网络抽取音频特征;文本方面使用[ERNIE 1.0](https://github.com/PaddlePaddle/ERNIE)抽取文本特征,无需预先抽取,支持视频分类模型finetune +- 序列学习:分别使用独立的LSTM 对图像特征和音频特征进行序列学习,文本方面预训练模型对字符序列进行建模,在ernie 后接入一个textcnn 网络做下游任务的迁移学习。 +- 多模融合:文本具有显式的高层语义信息,将文本特征引入到LSTM pooling 过程指导图像和音频时序权重分配,进行交叉融合,最后将文本、音频、视频特征拼接。 +- 预测结果:分类器选用sigmoid 多标签分类器,支持视频多标签输出。 + +## 数据准备 +数据方面提供已经抽取好图像、音频特征的特征文件,以及标题和标签信息,模型方面提供训练好checkpoint 文件,可进行finetune、模型评估、预测。 +``` +sh download.sh +``` +数据文件包括抽取好特征的文件夹 `feature_files`,以及记录划分的txt 文件,格式如下 +``` +文件名 \t 标题 \t 标签 +18e9bf08a2fc7eaa4ee9215ab42ea827.mp4 叮叮来自肖宇梁肖宇梁rainco的特别起床铃声 拍人-帅哥,拍人-秀特效,明星周边-其他明星周边 +``` + +## 模型训练 +模型训练过程有如下可调模式,可在根据数据集情况进行调整,在`conf/conf.txt` 文件中 +- ernie_freeze: 用于控制文本提特征的ernie 网络是否进行finetune,因为ernie 复杂度远大于图像、视频序列学习网络,因此在某些数据集上不好训练。 +- lstm_pool_mode: 用于控制lstm 序列池化的方式,默认是"text_guide"表示利用文本加强池化注意力权重,如果设置为空,则默认为自注意力的权重。 + +``` +sh train.sh +``` +## 模型评估 +模型对测试集进行评估,同时支持将checkpoint 模型转为inference 模型, 可用参数'save_only' 选项控制,设置即只用于做模型转换,得到inference 模型 +``` +sh eval_and_save_model.sh +``` +## 模型推理 +通过上一步得到的inference 模型进行预测,结果默认阈值为0.5,存储到json 文件中,在`conf/conf.txt` 文件 `threshold` 参数进行控制多标签输出的阈值。 +``` +sh inference.sh +``` +## 模型优化 +模型方面,主要在文本分支进行了实验,实验结果显示ERNIE 在多分支下不微调,而是使用后置网络进行微调,训练速度快,且稳定,同时attention 方面使用文本信息增强图像、音频的attention 学习能一定程度提升模型效果。 + +| 模型 | Hit@1 | Hit@2 | +| ------------------------------------------------------------ | ----- | ----- | +| 文本分支ERNIE 不finetune +self-attention | 71.07 | 83.72 | +| 文本分支ERNIE 不finetune +textcnn finetune + self-attention | 72.66 | 85.01 | +| 文本分支ERNIE 不finetune +extcnn finetune + text-guide-attention | 73.29 | 85.59 | + +## 模型部署 + +
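Deployment reuses the post-processing described in the inference section above: every class whose sigmoid score clears `threshold` (0.5 by default in `conf/conf.txt`) is kept and written to a JSON file. A minimal self-contained sketch of that step, with made-up scores (the real ones come from the exported inference model):

```python
import json
import numpy as np

def postprocess(video_name, probs, class_names, threshold=0.5):
    """Keep every label whose score clears the threshold (multi-label output)."""
    keep = np.where(probs >= threshold)[0]
    return {"video": video_name,
            "labels": [{"class_name": class_names[i], "score": float(probs[i])} for i in keep]}

class_names = ["拍人-帅哥", "拍人-秀特效", "明星周边-其他明星周边"]
probs = np.array([0.91, 0.62, 0.18])          # hypothetical sigmoid outputs for one clip
results = [postprocess("18e9bf08a2fc7eaa4ee9215ab42ea827.mp4", probs, class_names)]

with open("output.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
```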
    + + +## 参考论文 +- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen +- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan +- [Ernie: Enhanced representation through knowledge integration](https://arxiv.org/abs/1904.09223), Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Chen, Xuyi and Zhang, Han and Tian, Xin and Zhu, Danxiang and Tian, Hao and Wu, Hua diff --git a/docs/src/applications/MultimodalVideoTag/download.sh b/docs/src/applications/MultimodalVideoTag/download.sh new file mode 100644 index 000000000..8ae76a0c6 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/download.sh @@ -0,0 +1,11 @@ +# download ernie 1.0 model +wget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/model_pretrained_ernie.tar.gz +tar -xzvf model_pretrained_ernie.tar.gz + +# download pretrain model +wget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/checkpoints_save.tar.gz +tar -xzvf checkpoints_save.tar.gz + +# download test dataset +wget https://videotag.bj.bcebos.com/Applications/MultimodalVideoTag/datasets.tar.gz +tar -xzvf datasets.tar.gz diff --git a/docs/src/applications/MultimodalVideoTag/eval_and_save_model.sh b/docs/src/applications/MultimodalVideoTag/eval_and_save_model.sh new file mode 100755 index 000000000..6ecd57e53 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/eval_and_save_model.sh @@ -0,0 +1,13 @@ +# eval sh +export CUDA_VISIBLE_DEVICES=0 +export FLAGS_eager_delete_tensor_gb=0.0 +export FLAGS_sync_nccl_allreduce=1 +export FLAGS_fast_eager_deletion_mode=1 +export FLAGS_fraction_of_gpu_memory_to_use=0.5 +export FLAGS_reallocate_gpu_memory_in_mb=0 +export FLAGS_memory_fraction_of_eager_deletion=1 +python scenario_lib/eval_and_save_model.py --model_name=AttentionLstmErnie \ +--config=./conf/conf.txt \ +--save_model_param_dir=checkpoints_save \ +--save_inference_model=inference_models_save \ +# --save_only diff --git a/docs/src/applications/MultimodalVideoTag/inference.sh b/docs/src/applications/MultimodalVideoTag/inference.sh new file mode 100755 index 000000000..8d490cf54 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/inference.sh @@ -0,0 +1,12 @@ +# inference sh +export CUDA_VISIBLE_DEVICES=0 +export FLAGS_eager_delete_tensor_gb=0.0 +export FLAGS_sync_nccl_allreduce=1 +export FLAGS_fast_eager_deletion_mode=1 +export FLAGS_fraction_of_gpu_memory_to_use=0.5 +export FLAGS_reallocate_gpu_memory_in_mb=0 +export FLAGS_memory_fraction_of_eager_deletion=1 +python scenario_lib/inference.py --model_name=AttentionLstmErnie \ +--config=./conf/conf.txt \ +--save_inference_model=inference_models_save \ +--output='output.json' diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py new file mode 100755 index 000000000..21539ed9c --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/accuracy_metrics.py @@ -0,0 +1,160 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import division + +import numpy as np +import logging + +logger = logging.getLogger(__name__) + + +class MetricsCalculator(): + """ + MetricsCalculator + """ + def __init__(self, name, mode, metrics_args): + """ + init + """ + self.name = name + self.mode = mode # 'train', 'val', 'test' + self.acc_dict = {} + self.top_n_list = metrics_args.MODEL.top_n + self.num_classes = metrics_args.MODEL.num_classes + self.reset() + + def reset(self): + """ + reset + """ + logger.info('Resetting {} metrics...'.format(self.mode)) + for topk in self.top_n_list: + self.acc_dict['avg_acc%d' % (topk)] = 0.0 + self.aggr_loss = 0.0 + self.aggr_batch_size = 0 + + def finalize_metrics(self): + """finalize_metrics + """ + for key, value in self.acc_dict.items(): + self.acc_dict[key] = value / self.aggr_batch_size + self.aggr_loss = self.aggr_loss / self.aggr_batch_size + + def get_computed_metrics(self): + """get_computed_metrics + """ + acc_dict = {} + for key, value in self.acc_dict.items(): + acc_dict[key] = value / self.aggr_batch_size + aggr_loss = self.aggr_loss / self.aggr_batch_size + + return acc_dict, aggr_loss + + def accumulate(self, loss, softmax, labels): + """accumulate + """ + cur_batch_size = softmax.shape[0] + # if returned loss is None for e.g. test, just set loss to be 0. + if loss is None: + cur_loss = 0. + else: + cur_loss = np.mean(np.array(loss)) # + self.aggr_batch_size += cur_batch_size + self.aggr_loss += cur_loss * cur_batch_size + + for top_k in self.top_n_list: + self.acc_dict['avg_acc%d' % + (top_k)] += cur_batch_size * compute_topk_accuracy( + softmax, labels, top_k=top_k) * 100. 
+ return + + def finalize_and_log_out(self, info=''): + """finalize_and_log_out + """ + metrics_dict, loss = self.get_computed_metrics() + acc_str = [] + for name, value in metrics_dict.items(): + acc_str.append('{}:{},'.format('%s' % name, '%.2f' % value)) + acc_str = '\t'.join(acc_str) + logger.info(info + + '\tLoss: {},\t{}'.format('%.6f' % loss, '%s' % acc_str)) + return + + +def compute_topk_correct_hits_multilabel(top_k, preds, labels): + '''Compute the number of corret hits''' + batch_size = preds.shape[0] + top_k_preds = np.zeros((batch_size, 10), dtype=np.float32) + for i in range(batch_size): + top_k_preds[i, :] = np.argsort(-preds[i, :])[:10] + correctness = np.zeros(batch_size, dtype=np.float32) + for i in range(batch_size): + correc_sum = 0 + for label_id in range(len(labels[i])): + label_hit = labels[i][label_id] + if label_hit == 0 or label_hit < 0.1: + continue + if label_id in top_k_preds[i, :top_k].astype(np.int32).tolist(): + # correc_sum += 1 + correc_sum = 1 + break + correctness[i] = correc_sum + correct_hits = sum(correctness) + return correct_hits + + +def compute_topk_correct_hits(top_k, preds, labels): + '''Compute the number of corret hits''' + batch_size = preds.shape[0] + + top_k_preds = np.zeros((batch_size, top_k), dtype=np.float32) + for i in range(batch_size): + top_k_preds[i, :] = np.argsort(-preds[i, :])[:top_k] + + correctness = np.zeros(batch_size, dtype=np.int32) + for i in range(batch_size): + if labels[i] in top_k_preds[i, :].astype(np.int32).tolist(): + correctness[i] = 1 + correct_hits = sum(correctness) + + return correct_hits + + +def compute_topk_accuracy(softmax, labels, top_k): + """compute_topk_accuracy + """ + computed_metrics = {} + assert labels.shape[0] == softmax.shape[0], "Batch size mismatch." + aggr_batch_size = labels.shape[0] + # aggr_top_k_correct_hits = compute_topk_correct_hits(top_k, softmax, labels) + aggr_top_k_correct_hits = compute_topk_correct_hits_multilabel( + top_k, softmax, labels) + # normalize results + computed_metrics = \ + float(aggr_top_k_correct_hits) / aggr_batch_size + + return computed_metrics + + +if __name__ == "__main__": + pred = np.array([[0.5, 0.2, 0.3, 0, 0]]) + label = np.array([[0.5, 0.5, 0, 0, 0]]) + print('pred: ', pred) + print('label: ', label) + print('Top 1 hits', compute_topk_correct_hits_multilabel(1, pred, label)) + print('Top 5 hits', compute_topk_correct_hits_multilabel(5, pred, label)) diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/config.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/config.py new file mode 100755 index 000000000..751895c6e --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/config.py @@ -0,0 +1,71 @@ +""" +config parser +""" + +try: + from configparser import ConfigParser +except BaseException: + from ConfigParser import ConfigParser + +from utils import AttrDict + +import logging +logger = logging.getLogger(__name__) + +CONFIG_SECS = [ + 'train', + 'valid', + 'test', + 'infer', +] + + +def parse_config(cfg_file): + """parse_config + """ + parser = ConfigParser() + cfg = AttrDict() + parser.read(cfg_file) + for sec in parser.sections(): + sec_dict = AttrDict() + for k, v in parser.items(sec): + try: + v = eval(v) + except BaseException: + pass + setattr(sec_dict, k, v) + setattr(cfg, sec.upper(), sec_dict) + + return cfg + + +def merge_configs(cfg, sec, args_dict): + """merge_configs + """ + assert sec in CONFIG_SECS, "invalid config section {}".format(sec) + sec_dict = getattr(cfg, sec.upper()) + for k, v in 
args_dict.items(): + if v is None: + continue + # try: + # if hasattr(sec_dict, k): + # setattr(sec_dict, k, v) + # except BaseException: + # pass + if k in sec_dict: + setattr(sec_dict, k, v) + return cfg + +def print_configs(cfg, mode): + """print_configs + """ + logger.info("---------------- {:>5} Arguments ----------------".format(mode)) + for sec, sec_items in cfg.items(): + if isinstance(sec_items, dict) is True: + logger.info("{}:".format(sec)) + for k, v in sec_items.items(): + logger.info(" {}:{}".format(k, v)) + else: + logger.info("{}:{}".format(sec, sec_items)) + + logger.info("-------------------------------------------------") diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py new file mode 100755 index 000000000..270d628e3 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/__init__.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# coding=utf-8 +""" +Copyright 2021 Baidu.com, Inc. All Rights Reserved +Description: +Authors: wanghewei(wanghewei@baidu.com) +LastEditors: wanghewei(wanghewei@baidu.com) +Date: 2021-11-26 16:31:59 +""" +from .reader_utils import regist_reader, get_reader +from .feature_reader import FeatureReader +# regist reader, sort by alphabet +regist_reader("ATTENTIONLSTMERNIE", FeatureReader) diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py new file mode 100755 index 000000000..e8d49a29b --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/ernie_task_reader.py @@ -0,0 +1,334 @@ +""" +ernie reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
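The config helpers above (`parse_config`, `merge_configs`, `print_configs`) turn the INI-style `conf/conf.txt` into nested attribute dicts, with overrides applied only to keys that already exist in the matching section. A small usage sketch, not from the repo, assuming the module is importable as `scenario_lib.config`:

```python
from scenario_lib.config import parse_config, merge_configs, print_configs

cfg = parse_config('conf/conf.txt')    # each [section] becomes an upper-cased AttrDict
cfg = merge_configs(cfg, 'train', {    # overrides: None values and unknown keys are ignored
    'batch_size': 16,
    'filelist': None,
})
print_configs(cfg, 'Train')            # logs every section/key pair
print(cfg.TRAIN.batch_size)            # -> 16
```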
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import sys +import os +import json +import random +import logging +import numpy as np +import six +from io import open +from collections import namedtuple + +from .tokenization import FullTokenizer, convert_to_unicode + +log = logging.getLogger(__name__) + +if six.PY3: + import io + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + + +def csv_reader(fd, delimiter='\t'): + """csv_reader + """ + def gen(): + """gen + """ + for i in fd: + yield i.rstrip('\n').split(delimiter) + + return gen() + + +class BaseReader(object): + """BaseReader + """ + def __init__(self, + vocab_path, + label_map_config=None, + max_seq_len=512, + do_lower_case=True, + in_tokens=False, + is_inference=False, + random_seed=None, + tokenizer="FullTokenizer", + is_classify=True, + is_regression=False, + for_cn=True, + task_id=0): + self.max_seq_len = max_seq_len + self.tokenizer = FullTokenizer(vocab_file=vocab_path, + do_lower_case=do_lower_case) + self.vocab = self.tokenizer.vocab + self.pad_id = self.vocab["[PAD]"] + self.cls_id = self.vocab["[CLS]"] + self.sep_id = self.vocab["[SEP]"] + self.in_tokens = in_tokens + self.is_inference = is_inference + self.for_cn = for_cn + self.task_id = task_id + + np.random.seed(random_seed) + + self.is_classify = is_classify + self.is_regression = is_regression + self.current_example = 0 + self.current_epoch = 0 + self.num_examples = 0 + + if label_map_config: + with open(label_map_config, encoding='utf8') as f: + self.label_map = json.load(f) + else: + self.label_map = None + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + def _convert_example_to_record(self, example, max_seq_length, tokenizer): + """Converts a single `Example` into a single `Record`.""" + + text_a = convert_to_unicode(example.text_a) + tokens_a = tokenizer.tokenize(text_a) + tokens_b = None + + has_text_b = False + if isinstance(example, dict): + has_text_b = "text_b" in example.keys() + else: + has_text_b = "text_b" in example._fields + + if has_text_b: + text_b = convert_to_unicode(example.text_b) + tokens_b = tokenizer.tokenize(text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT/ERNIE is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . 
[SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + text_type_ids = [] + tokens.append("[CLS]") + text_type_ids.append(0) + for token in tokens_a: + tokens.append(token) + text_type_ids.append(0) + tokens.append("[SEP]") + text_type_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + text_type_ids.append(1) + tokens.append("[SEP]") + text_type_ids.append(1) + + token_ids = tokenizer.convert_tokens_to_ids(tokens) + position_ids = list(range(len(token_ids))) + + if self.is_inference: + Record = namedtuple('Record', + ['token_ids', 'text_type_ids', 'position_ids']) + record = Record(token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids) + else: + if self.label_map: + label_id = self.label_map[example.label] + else: + label_id = example.label + + Record = namedtuple('Record', [ + 'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid' + ]) + + qid = None + if "qid" in example._fields: + qid = example.qid + + record = Record(token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + label_id=label_id, + qid=qid) + return record + + def _prepare_batch_data(self, examples, batch_size, phase=None): + """generate batch records""" + batch_records, max_len = [], 0 + for index, example in enumerate(examples): + if phase == "train": + self.current_example = index + record = self._convert_example_to_record(example, self.max_seq_len, + self.tokenizer) + max_len = max(max_len, len(record.token_ids)) + if self.in_tokens: + to_append = (len(batch_records) + 1) * max_len <= batch_size + else: + to_append = len(batch_records) < batch_size + if to_append: + batch_records.append(record) + else: + yield self._pad_batch_records(batch_records) + batch_records, max_len = [record], len(record.token_ids) + + if batch_records: + yield self._pad_batch_records(batch_records) + + +class ExtractEmbeddingReader(BaseReader): + """ + data prepare for getting erine embedding + """ + def _pad_batch_records(self, batch_records): + """ + 对字标号,位置标号特征进行固定长度补全 + batch_records 包含多条文本的标号 + return [字标号列表,文本类型列表,位置特征列表,任务标号列表,掩码列表] + """ + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [ + record.text_type_ids for record in batch_records + ] + batch_position_ids = [record.position_ids for record in batch_records] + + # padding + padded_token_ids, input_mask, seq_lens = pad_batch_data( + batch_token_ids, + pad_idx=self.pad_id, + return_input_mask=True, + return_seq_lens=True, + max_len=self.max_seq_len) + padded_text_type_ids = pad_batch_data(batch_text_type_ids, + pad_idx=self.pad_id, + max_len=self.max_seq_len) + padded_position_ids = pad_batch_data(batch_position_ids, + pad_idx=self.pad_id, + max_len=self.max_seq_len) + padded_task_ids = np.ones_like(padded_token_ids, + dtype="int64") * self.task_id + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + 
padded_task_ids, input_mask + ] + return return_list + + def data_generate_from_text(self, text): + """ + trans text to idx + input single text + return 5*maxlen*1 + """ + Example = namedtuple('Example', ['text_a', 'label']) + example = Example(text, 0) + records = [ + self._convert_example_to_record(example, self.max_seq_len, + self.tokenizer) + ] + pad_records = self._pad_batch_records(records) + text_one_hot = np.concatenate(pad_records, axis=0).astype('int64') + return text_one_hot + + +def pad_batch_data(insts, + pad_idx=0, + max_len=None, + return_pos=False, + return_input_mask=False, + return_max_len=False, + return_num_token=False, + return_seq_lens=False): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + if max_len is None: + max_len = max(len(inst) for inst in insts) + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. + + inst_data = np.array( + [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] + + # position data + if return_pos: + inst_pos = np.array([ + list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) + for inst in insts + ]) + + return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] + + if return_input_mask: + # This is used to avoid attention on paddings. + input_mask_data = np.array( + [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts]) + input_mask_data = np.expand_dims(input_mask_data, axis=-1) + return_list += [input_mask_data.astype("float32")] + + if return_max_len: + return_list += [max_len] + + if return_num_token: + num_token = 0 + for inst in insts: + num_token += len(inst) + return_list += [num_token] + + if return_seq_lens: + seq_lens = np.array([len(inst) for inst in insts]) + return_list += [seq_lens.astype("int64").reshape([-1])] + + return return_list if len(return_list) > 1 else return_list[0] diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py new file mode 100755 index 000000000..f97ea7874 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/feature_reader.py @@ -0,0 +1,274 @@ +""" +feature reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
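+# Sketch of the per-sample data layout this reader expects, inferred from the
+# loading code below (field names and shapes are assumptions drawn from how the
+# record is accessed, not a documented format):
+#
+#   record = pickle.load(open('<video_id>.pkl', 'rb'))
+#   record['feature']['image_pkl']   # frame-level RGB features, roughly (num_frames, feat_dim)
+#   record['feature']['audio_pkl']   # audio features, roughly (num_clips, 128); may be empty
+#   record['video']                  # video id, carried through in infer mode
+#
+# The url_title_label_file is tab-separated: URL \t title \t label1,label2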
+ +import sys +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO +import numpy as np +import random +import os +import traceback +import pickle +python_ver = sys.version_info +from collections import defaultdict + +import pandas as pd + +from .ernie_task_reader import ExtractEmbeddingReader +from .reader_utils import DataReader + + +class FeatureReader(DataReader): + """ + Data reader for youtube-8M dataset, which was stored as features extracted by prior networks + This is for the three models: lstm, attention cluster, nextvlad + + dataset cfg: num_classes + batch_size + list + NextVlad only: eigen_file + """ + def __init__(self, name, mode, cfg): + """ + init + """ + self.name = name + self.mode = mode + self.num_classes = cfg.MODEL.num_classes + + # set batch size and file list + self.batch_size = cfg[mode.upper()]['batch_size'] + self.filelist = cfg[mode.upper()]['filelist'] + self.eigen_file = cfg.MODEL.get('eigen_file', None) + self.num_seg = cfg.MODEL.get('num_seg', None) + self.loss_type = cfg.TRAIN['loss_type'] + vocab_file = os.path.join(cfg.TRAIN.ernie_pretrain_dict_path, + 'vocab.txt') + self.ernie_reader = ExtractEmbeddingReader( + vocab_path=vocab_file, + max_seq_len=cfg.MODEL.text_max_len, + do_lower_case=True) + url_title_label_file = cfg[mode.upper()]['url_title_label_file'] + self.class_dict = load_class_file(cfg.MODEL.class_name_file) + self.url_title_info = load_video_file(url_title_label_file, + self.class_dict, mode) + + def create_reader(self): + """ + create reader + """ + url_list = list(self.url_title_info.keys()) + if self.mode == 'train': + random.shuffle(url_list) + + def reader(): + """reader + """ + batch_out = [] + for url in url_list: + try: + filepath = os.path.join( + self.filelist, + url.split('/')[-1].split('.')[0] + '.pkl') + if os.path.exists(filepath) is False: + continue + if python_ver < (3, 0): + record = pickle.load(open(filepath, 'rb')) + else: + record = pickle.load(open(filepath, 'rb'), + encoding='iso-8859-1') + text_raw = self.url_title_info[url]['title'] + rgb = record['feature']['image_pkl'].astype(float) + if record['feature']['audio_pkl'].shape[0] == 0: + audio_pkl = np.zeros((10, 128)) + audio = audio_pkl.astype(float) + else: + audio = record['feature']['audio_pkl'].astype(float) + text_one_hot = self.ernie_reader.data_generate_from_text( + str(text_raw)) + video = record['video'] + if self.mode != 'infer': + label = self.url_title_info[url]['label'] + label = [int(w) for w in label] + if self.loss_type == 'sigmoid': + label = make_one_hot(label, self.num_classes) + elif self.loss_type == 'softmax': + label = make_one_soft_hot(label, self.num_classes, + False) + batch_out.append((rgb, audio, text_one_hot, label)) + else: + batch_out.append((rgb, audio, text_one_hot, video)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + except Exception as e: + print("warning: load data {} failed, {}".format( + filepath, str(e))) + traceback.print_exc() + continue + + +# if self.mode == 'infer' and len(batch_out) > 0: + if len(batch_out) > 0: + yield batch_out + + return reader + + def get_config_from_sec(self, sec, item, default=None): + """get_config_from_sec + """ + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + +def load_video_file(label_file, class_dict, mode='train'): + """ + labelfile formate: URL \t title \t label1,label2 + return dict + """ + data = pd.read_csv(label_file, sep='\t', 
header=None) + url_info_dict = defaultdict(dict) + for index, row in data.iterrows(): + url = row[0] + if url in url_info_dict: + continue + if pd.isna(row[1]): + title = "" + else: + title = str(row[1]) + if mode == 'infer': + url_info_dict[url] = {'title': title} + else: + if pd.isna(row[2]): + continue + labels = row[2].split(',') + labels_idx = [class_dict[w] for w in labels if w in class_dict] + if len(labels_idx) < 1: + continue + if url not in url_info_dict: + url_info_dict[url] = {'label': labels_idx, 'title': title} + print('load video %d' % (len(url_info_dict))) + return url_info_dict + + +def dequantize(feat_vector, max_quantized_value=2., min_quantized_value=-2.): + """ + Dequantize the feature from the byte format to the float format + """ + + assert max_quantized_value > min_quantized_value + quantized_range = max_quantized_value - min_quantized_value + scalar = quantized_range / 255.0 + bias = (quantized_range / 512.0) + min_quantized_value + + return feat_vector * scalar + bias + + +epsilon = 0.1 +smmoth_score = (1.0 / float(210)) * epsilon + + +def label_smmoth(label_one_hot_vector): + """ + label_smmoth + """ + global smmoth_score + for i in range(len(label_one_hot_vector)): + if label_one_hot_vector[i] == 0: + label_one_hot_vector[i] = smmoth_score + return label_one_hot_vector + + +def make_one_soft_hot(label, dim=15, label_smmoth=False): + """ + make_one_soft_hot + """ + one_hot_soft_label = np.zeros(dim) + one_hot_soft_label = one_hot_soft_label.astype(float) + # multi-labelis + # label smmoth + if label_smmoth: + one_hot_soft_label = label_smmoth(one_hot_soft_label) + label_len = len(label) + prob = (1 - np.sum(one_hot_soft_label)) / float(label_len) + for ind in label: + one_hot_soft_label[ind] += prob + #one_hot_soft_label = label_smmoth(one_hot_soft_label) + return one_hot_soft_label + + +def make_one_hot(label, dim=15): + """ + make_one_hot + """ + one_hot_soft_label = np.zeros(dim) + one_hot_soft_label = one_hot_soft_label.astype(float) + for ind in label: + one_hot_soft_label[ind] = 1 + return one_hot_soft_label + + +def generate_random_idx(feature_len, num_seg): + """ + generate_random_idx + """ + idxs = [] + stride = float(feature_len) / num_seg + for i in range(num_seg): + pos = (i + np.random.random()) * stride + idxs.append(min(feature_len - 1, int(pos))) + return idxs + + +def get_batch_ernie_input_feature(reader, texts): + """ + get_batch_ernie_input_feature + """ + result_list = reader.data_generate_from_texts(texts) + result_trans = [] + for i in range(len(texts)): + result_trans.append([result_list[0][i],\ + result_list[1][i], + result_list[2][i], + result_list[3][i], + result_list[4][i]]) + return np.array(result_trans) + + +def load_class_file(class_file): + """ + load_class_file + """ + class_lines = open(class_file, 'r', encoding='utf8').readlines() + class_dict = {} + for i, line in enumerate(class_lines): + tmp = line.strip().split('\t') + word = tmp[0] + index = str(i) + if len(tmp) == 2: + index = tmp[1] + class_dict[word] = index + return class_dict + + +if __name__ == '__main__': + pass diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py new file mode 100755 index 000000000..153295074 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/reader_utils.py @@ -0,0 +1,91 @@ +""" +reader utils +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class ReaderNotFoundError(Exception): + "Error: reader not found" + + def __init__(self, reader_name, avail_readers): + super(ReaderNotFoundError, self).__init__() + self.reader_name = reader_name + self.avail_readers = avail_readers + + def __str__(self): + msg = "Reader {} Not Found.\nAvailiable readers:\n".format( + self.reader_name) + for reader in self.avail_readers: + msg += " {}\n".format(reader) + return msg + + +class DataReader(object): + """data reader for video input""" + + def __init__(self, model_name, mode, cfg): + self.name = model_name + self.mode = mode + self.cfg = cfg + + def create_reader(self): + """Not implemented""" + pass + + def get_config_from_sec(self, sec, item, default=None): + """get_config_from_sec + """ + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + +class ReaderZoo(object): + """ReaderZoo + """ + def __init__(self): + self.reader_zoo = {} + + def regist(self, name, reader): + """regist + """ + assert reader.__base__ == DataReader, "Unknow model type {}".format( + type(reader)) + self.reader_zoo[name] = reader + + def get(self, name, mode, cfg): + """get + """ + for k, v in self.reader_zoo.items(): + if k == name: + return v(name, mode, cfg) + raise ReaderNotFoundError(name, self.reader_zoo.keys()) + + +# singleton reader_zoo +reader_zoo = ReaderZoo() + + +def regist_reader(name, reader): + """regist_reader + """ + reader_zoo.regist(name, reader) + + +def get_reader(name, mode, cfg): + """get_reader + """ + reader_model = reader_zoo.get(name, mode, cfg) + return reader_model.create_reader() diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py new file mode 100755 index 000000000..52c8024ba --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/datareader/tokenization.py @@ -0,0 +1,441 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
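+# Vocabulary file format accepted by load_vocab() below (a sketch inferred from
+# the parsing code, not a formal spec): one token per line, optionally followed
+# by a tab and an explicit integer id; if the id column is missing, the line
+# number is used as the id, e.g.
+#
+#   [PAD]            -> id 0 (line number)
+#   [CLS]            -> id 1 (line number)
+#   [SEP]<TAB>2      -> id 2 (explicit)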
+"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +from io import open + +import collections +import unicodedata +import six + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, encoding='utf8') as fin: + for num, line in enumerate(fin): + items = convert_to_unicode(line.strip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = items[1] if len(items) == 2 else num + token = token.strip() + vocab[token] = int(index) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + """convert_tokens_to_ids + """ + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + """convert_ids_to_tokens + """ + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + """init + """ + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + """tokenize + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """convert_tokens_to_ids + """ + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + """convert_ids_to_tokens + """ + return convert_by_vocab(self.inv_vocab, ids) + + +class CharTokenizer(object): + """Runs end-to-end 
tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + """tokenize + """ + split_tokens = [] + for token in text.lower().split(" "): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """convert_tokens_to_ids + """ + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + """convert_ids_to_tokens + """ + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
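+    # Concretely, the four ASCII ranges checked below cover:
+    #   33-47   -> ! " # $ % & ' ( ) * + , - . /
+    #   58-64   -> : ; < = > ? @
+    #   91-96   -> [ \ ] ^ _ `
+    #   123-126 -> { | } ~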
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + + def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _is_whitespace(c): + """_is_whitespace + """ + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + output = [] + buff = "" + for char in text: + cp = ord(char) + if _is_chinese_char(cp) or _is_whitespace(char): + if buff != "": + output.append(buff) + buff = "" + output.append(char) + else: + buff += char + + if buff != "": + output.append(buff) + + return output diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py new file mode 100755 index 000000000..cc5300a09 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/eval_and_save_model.py @@ -0,0 +1,159 @@ +""" +eval main +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time +import argparse +import logging +import pickle + +import numpy as np +import paddle +paddle.enable_static() +import paddle.static as static + +from accuracy_metrics import MetricsCalculator +from datareader import get_reader +from config import parse_config, merge_configs, print_configs +from models.attention_lstm_ernie import AttentionLstmErnie +from utils import test_with_pyreader + + +def parse_args(): + """parse_args + """ + parser = argparse.ArgumentParser("Paddle Video evaluate script") + parser.add_argument('--model_name', + type=str, + default='BaiduNet', + help='name of model to train.') + parser.add_argument('--config', + type=str, + default='configs/conf.txt', + help='path to config file of model') + parser.add_argument( + '--pretrain', + type=str, + default=None, + help= + 'path to pretrain weights. 
None to use default weights path in ~/.paddle/weights.' + ) + parser.add_argument('--output', type=str, default=None, help='output path') + parser.add_argument('--use_gpu', + type=bool, + default=True, + help='default use gpu.') + parser.add_argument('--save_model_param_dir', + type=str, + default=None, + help='checkpoint path') + parser.add_argument('--save_inference_model', + type=str, + default=None, + help='save inference path') + parser.add_argument('--save_only', + action='store_true', + default=False, + help='only save model, do not evaluate model') + args = parser.parse_args() + return args + + +def evaluate(args): + """evaluate + """ + # parse config + config = parse_config(args.config) + valid_config = merge_configs(config, 'valid', vars(args)) + print_configs(valid_config, 'Valid') + + # build model + valid_model = AttentionLstmErnie(args.model_name, + valid_config, + mode='valid') + startup = static.Program() + valid_prog = static.default_main_program().clone(for_test=True) + with static.program_guard(valid_prog, startup): + paddle.disable_static() + valid_model.build_input(True) + valid_model.build_model() + valid_feeds = valid_model.feeds() + valid_outputs = valid_model.outputs() + valid_loss = valid_model.loss() + valid_pyreader = valid_model.pyreader() + paddle.enable_static() + + place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + exe.run(startup) + compiled_valid_prog = static.CompiledProgram(valid_prog) + + # load weights + assert os.path.exists(args.save_model_param_dir), \ + "Given save weight dir {} not exist.".format(args.save_model_param_dir) + valid_model.load_test_weights_file(exe, args.save_model_param_dir, + valid_prog, place) + + if args.save_inference_model: + save_model_params(exe, valid_prog, valid_model, + args.save_inference_model) + + if args.save_only is True: + print('save model only, exit') + return + + # get reader + bs_denominator = 1 + valid_config.VALID.batch_size = int(valid_config.VALID.batch_size / + bs_denominator) + valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config) + + # get metrics + valid_metrics = MetricsCalculator(args.model_name.upper(), 'valid', + valid_config) + valid_fetch_list = [valid_loss.name] + [x.name for x in valid_outputs + ] + [valid_feeds[-1].name] + # get reader + exe_places = static.cuda_places() if args.use_gpu else static.cpu_places() + valid_pyreader.decorate_sample_list_generator(valid_reader, + places=exe_places) + + test_loss, metrics_dict_test = test_with_pyreader(exe, compiled_valid_prog, + valid_pyreader, + valid_fetch_list, + valid_metrics) + test_acc1 = metrics_dict_test['avg_acc1'] + print(test_loss) + print(test_acc1) + + +def save_model_params(exe, program, model_object, save_dir): + """save_model_params + """ + feeded_var_names = [var.name for var in model_object.feeds()][:-1] + static.save_inference_model(dirname=save_dir, + feeded_var_names=feeded_var_names, + main_program=program, + target_vars=model_object.outputs(), + executor=exe, + model_filename='model', + params_filename='params') + +if __name__ == "__main__": + args = parse_args() + evaluate(args) diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py new file mode 100644 index 000000000..7546252bd --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/inference.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# coding=utf-8 +""" +infer model +""" +import sys +import os 
+import numpy as np +import json +import pickle +import argparse +import time + +import numpy as np +import paddle + +from datareader import get_reader +from config import merge_configs, parse_config, print_configs + + +def parse_args(): + """parse_args + """ + parser = argparse.ArgumentParser("Paddle Video infer script") + parser.add_argument('--model_name', + type=str, + default='BaiduNet', + help='name of model to train.') + parser.add_argument('--config', + type=str, + default='configs/conf.txt', + help='path to config file of model') + parser.add_argument('--output', type=str, default=None, help='output path') + parser.add_argument('--use_gpu', + type=bool, + default=True, + help='default use gpu.') + parser.add_argument('--save_inference_model', + type=str, + default=None, + help='save inference path') + args = parser.parse_args() + return args + +class InferModel(object): + """lstm infer""" + def __init__(self, cfg, name='ACTION'): + name = name.upper() + self.name = name + self.threshold = cfg.INFER.threshold + self.cfg = cfg + self.label_map = load_class_file(cfg.MODEL.class_name_file) + + + def load_inference_model(self, model_dir, use_gpu=True): + """model_init + """ + model_file = os.path.join(model_dir, "model") + params_file = os.path.join(model_dir, "params") + config = paddle.inference.Config(model_file, params_file) + if use_gpu: + config.enable_use_gpu(1024) + else: + config.disable_gpu() + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = paddle.inference.create_predictor(config) + # build input tensor and output tensor + self.build_input_output() + + def build_input_output(self): + """build_input_output + """ + input_names = self.predictor.get_input_names() + # input + self.input_rgb_tensor = self.predictor.get_input_handle(input_names[0]) + self.input_audio_tensor = self.predictor.get_input_handle(input_names[1]) + self.input_text_tensor = self.predictor.get_input_handle(input_names[2]) + + # output + output_names = self.predictor.get_output_names() + self.output_tensor = self.predictor.get_output_handle(output_names[0]) + + def preprocess_for_lod_data(self, input): + """pre process""" + input_arr = [] + input_lod = [0] + start_lod = 0 + end_lod = 0 + for sub_item in input: + end_lod = start_lod + len(sub_item) + input_lod.append(end_lod) + input_arr.extend(sub_item) + start_lod = end_lod + input_arr = np.array(input_arr) + return input_arr, [input_lod] + + def predict(self): + """predict""" + infer_reader = get_reader(self.name, 'infer', self.cfg) + probs = [] + video_ids = [] + label_map_inverse = {value: key for key, value in self.label_map.items()} + for infer_iter, data in enumerate(infer_reader()): + # video_id = [[items[-2], items[-1]] for items in data] + rgb = [items[0] for items in data] + audio = [items[1] for items in data] + text = np.array([items[2] for items in data]) + videos = np.array([items[3] for items in data]) + + rgb_arr, rgb_lod = self.preprocess_for_lod_data(rgb) + audio_arr, audio_lod = self.preprocess_for_lod_data(audio) + + self.input_rgb_tensor.copy_from_cpu(rgb_arr.astype('float32')) + self.input_rgb_tensor.set_lod(rgb_lod) + + self.input_audio_tensor.copy_from_cpu(audio_arr.astype('float32')) + self.input_audio_tensor.set_lod(audio_lod) + + self.input_text_tensor.copy_from_cpu(text.astype('int64')) + + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + probs.extend(list(output)) + video_ids.extend(videos) + assert 
len(video_ids) == len(probs) + result = [] + for video_id, prob in zip(video_ids, probs): + label_idx = list(np.where(prob >= self.threshold)[0]) + result.append({ + "video_id": video_id, + "labels": [ + (label_map_inverse[str(idx)], float(prob[idx])) for idx in label_idx + ] + }) + return result + + +def load_class_file(class_file): + """ + load_class_file + """ + class_lines = open(class_file, 'r', encoding='utf8').readlines() + class_dict = {} + for i, line in enumerate(class_lines): + tmp = line.strip().split('\t') + word = tmp[0] + index = str(i) + if len(tmp) == 2: + index = tmp[1] + class_dict[word] = index + return class_dict + + +def infer(args): + """ + infer main + """ + config = parse_config(args.config) + infer_config = merge_configs(config, 'infer', vars(args)) + print_configs(infer_config, 'infer') + infer_obj = InferModel(infer_config, name=args.model_name) + infer_obj.load_inference_model(args.save_inference_model, use_gpu=args.use_gpu) + rt = infer_obj.predict() + if args.output: + with open(args.output, 'w') as f: + json.dump(rt, f, ensure_ascii=False, indent=4) + +if __name__ == "__main__": + args = parse_args() + infer(args) diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py new file mode 100755 index 000000000..a28aa0d4e --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/models/attention_lstm_ernie.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python +# coding=utf-8 +""" +attention lstm add ernie model +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
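+# High-level sketch of the network assembled below (a reading aid, not a spec):
+#   - image feature sequence -> FC -> bidirectional dynamic_lstm -> attention pooling
+#   - audio feature sequence -> FC -> bidirectional dynamic_lstm -> attention pooling
+#   - text token ids         -> ERNIE encoder -> TextCNN pooling
+#   The three pooled vectors are concatenated and fed to a final FC head
+#   (softmax or sigmoid, depending on TRAIN.loss_type).
+# The text input (feature_input[2]) stacks five ERNIE arrays along axis 1, in the
+# order produced by ExtractEmbeddingReader._pad_batch_records: token ids,
+# text-type ids, position ids, task ids, input mask.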
+ +import os +import paddle +import paddle.static as static + +from .ernie import ErnieConfig, ErnieModel + + +class AttentionLstmErnie(object): + """ + Base on scenario-classify (image + audio), add text information + use ERNIE to extract text feature + """ + def __init__(self, name, cfg, mode='train'): + self.cfg = cfg + self.name = name + self.mode = mode + self.py_reader = None + self.get_config() + + def get_config(self): + """get_config + """ + # get model configs + self.feature_num = self.cfg.MODEL.feature_num + self.feature_names = self.cfg.MODEL.feature_names + self.feature_dims = self.cfg.MODEL.feature_dims + self.feature_dtypes = self.cfg.MODEL.feature_dtypes + self.feature_lod_level = self.cfg.MODEL.feature_lod_level + self.num_classes = self.cfg.MODEL.num_classes + self.embedding_size = self.cfg.MODEL.embedding_size + self.lstm_size_img = self.cfg.MODEL.lstm_size_img + self.lstm_size_audio = self.cfg.MODEL.lstm_size_audio + self.ernie_freeze = self.cfg.MODEL.ernie_freeze + self.lstm_pool_mode = self.cfg.MODEL.lstm_pool_mode + self.drop_rate = self.cfg.MODEL.drop_rate + self.loss_type = self.cfg.TRAIN.loss_type + self.ernie_pretrain_dict_path = self.cfg.TRAIN.ernie_pretrain_dict_path + + # get mode configs + self.batch_size = self.get_config_from_sec(self.mode, 'batch_size', 1) + self.num_gpus = self.get_config_from_sec(self.mode, 'num_gpus', 1) + + if self.mode == 'train': + self.learning_rate = self.get_config_from_sec( + 'train', 'learning_rate', 1e-3) + self.weight_decay = self.get_config_from_sec( + 'train', 'weight_decay', 8e-4) + self.num_samples = self.get_config_from_sec( + 'train', 'num_samples', 5000000) + self.decay_epochs = self.get_config_from_sec( + 'train', 'decay_epochs', [5]) + self.decay_gamma = self.get_config_from_sec( + 'train', 'decay_gamma', 0.1) + + def get_config_from_sec(self, sec, item, default=None): + """get_config_from_sec""" + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + def build_input(self, use_pyreader): + """ + build input + """ + self.feature_input = [] + for name, dim, dtype, lod_level in zip(self.feature_names, + self.feature_dims, + self.feature_dtypes, + self.feature_lod_level): + self.feature_input.append( + static.data(shape=dim, + lod_level=lod_level, + dtype=dtype, + name=name)) + self.label_input = static.data(shape=[self.num_classes], + dtype='float32', + name='label') + + self.py_reader = paddle.fluid.io.PyReader(feed_list=self.feature_input + + [self.label_input], + capacity=1024, + iterable=True) + + def ernie_encoder(self): + """ + text feature extractor + """ + ernie_config = ErnieConfig( + os.path.join(self.ernie_pretrain_dict_path, 'ernie_config.json')) + if self.mode != 'train': + ernie_config['attention_probs_dropout_prob'] = 0.0 + ernie_config['hidden_dropout_prob'] = 0.0 + + src_ids = self.feature_input[2][:, 0] + sent_ids = self.feature_input[2][:, 1] + position_ids = self.feature_input[2][:, 2] + task_ids = self.feature_input[2][:, 3] + input_mask = self.feature_input[2][:, 4].astype('float32') + ernie = ErnieModel(src_ids=src_ids, + position_ids=position_ids, + sentence_ids=sent_ids, + task_ids=task_ids, + input_mask=input_mask, + config=ernie_config) + enc_out = ernie.get_sequence_output() + # to Freeze ERNIE param + if self.ernie_freeze is True: + enc_out.stop_gradient = True + # ernie cnn + enc_out_cnn = ernie.get_sequence_textcnn_output(enc_out, input_mask) + + enc_out_cnn_drop = paddle.nn.functional.dropout(enc_out_cnn, p=self.drop_rate, 
training=(self.mode=='train')) + return enc_out_cnn_drop + + def build_model(self): + """build_model + """ + # ---------------- transfer from old paddle --------------- + # get image,audio,text feature + video_input_tensor = self.feature_input[0] + audio_input_tensor = self.feature_input[1] + self.ernie_feature = self.ernie_encoder() + + # ------image------ + lstm_forward_fc = static.nn.fc(x=video_input_tensor, + size=self.lstm_size_img * 4, + activation=None, + bias_attr=False) + lstm_forward, _ = paddle.fluid.layers.dynamic_lstm(input=lstm_forward_fc, + size=self.lstm_size_img * + 4, + is_reverse=False, + use_peepholes=True) + + lsmt_backward_fc = static.nn.fc(x=video_input_tensor, + size=self.lstm_size_img * 4, + activation=None, + bias_attr=None) + lstm_backward, _ = paddle.fluid.layers.dynamic_lstm(input=lsmt_backward_fc, + size=self.lstm_size_img * + 4, + is_reverse=True, + use_peepholes=True) + + lstm_forward_img = paddle.concat( + x=[lstm_forward, lstm_backward], axis=1) + + lstm_dropout = paddle.nn.functional.dropout(lstm_forward_img, p=self.drop_rate, training=(self.mode=='train')) + if self.lstm_pool_mode == 'text_guide': + lstm_weight = self.attention_weight_by_feature_seq2seq_attention( + self.ernie_feature, lstm_dropout, self.lstm_size_img * 2) + else: + lstm_weight = static.nn.fc(x=lstm_dropout, + size=1, + activation='sequence_softmax', + bias_attr=None) + scaled = paddle.multiply(x=lstm_dropout, + y=lstm_weight) + self.lstm_pool = paddle.static.nn.sequence_pool(input=scaled, + pool_type='sum') + # ------audio------ + lstm_forward_fc_audio = static.nn.fc( + x=audio_input_tensor, + size=self.lstm_size_audio * 4, + activation=None, + bias_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(coeff=0.0), + initializer=paddle.nn.initializer.Normal(std=0.0))) + lstm_forward_audio, _ = paddle.fluid.layers.dynamic_lstm( + input=lstm_forward_fc_audio, + size=self.lstm_size_audio * 4, + is_reverse=False, + use_peepholes=True) + + lsmt_backward_fc_audio = static.nn.fc(x=audio_input_tensor, + size=self.lstm_size_audio * 4, + activation=None, + bias_attr=False) + lstm_backward_audio, _ = paddle.fluid.layers.dynamic_lstm( + input=lsmt_backward_fc_audio, + size=self.lstm_size_audio * 4, + is_reverse=True, + use_peepholes=True) + + lstm_forward_audio = paddle.concat( + x=[lstm_forward_audio, lstm_backward_audio], axis=1) + + lstm_dropout_audio = paddle.nn.functional.dropout(lstm_forward_audio, p=self.drop_rate, training=(self.mode=='train')) + if self.lstm_pool_mode == 'text_guide': + lstm_weight_audio = self.attention_weight_by_feature_seq2seq_attention( + self.ernie_feature, lstm_dropout_audio, + self.lstm_size_audio * 2) + else: + lstm_weight_audio = static.nn.fc(x=lstm_dropout_audio, + size=1, + activation='sequence_softmax', + bias_attr=None) + scaled_audio = paddle.multiply(x=lstm_dropout_audio, + y=lstm_weight_audio) + self.lstm_pool_audio = paddle.static.nn.sequence_pool(input=scaled_audio, + pool_type='sum') + + lstm_concat = paddle.concat( + x=[self.lstm_pool, self.lstm_pool_audio, self.ernie_feature], + axis=1, + name='final_concat') + + # lstm_concat = self.add_bn(lstm_concat) + if self.loss_type == 'softmax': + self.fc = static.nn.fc(x=lstm_concat, + size=self.num_classes, + activation='softmax') + elif self.loss_type == 'sigmoid': + self.fc = static.nn.fc(x=lstm_concat, + size=self.num_classes, + activation=None) + self.logit = self.fc + self.fc = paddle.nn.functional.sigmoid(self.fc) + + self.network_outputs = [self.fc] + + def 
attention_weight_by_feature_seq2seq_attention( + self, + text_feature, + sequence_feature, + sequence_feature_dim, + name_prefix="seq2seq_attention"): + """ + caculate weight by feature + Neural Machine Translation by Jointly Learning to Align and Translate + """ + text_feature_expand = paddle.static.nn.sequence_expand(text_feature, + sequence_feature, + ref_level=0) + sequence_text_concat = paddle.concat( + x=[sequence_feature, text_feature_expand], + axis=-1, + name='video_text_concat') + energy = static.nn.fc(x=sequence_text_concat, + size=sequence_feature_dim, + activation='tanh', + name=name_prefix + "_tanh_fc") + weight_vector = static.nn.fc(x=energy, + size=1, + activation='sequence_softmax', + bias_attr=None, + name=name_prefix + "_softmax_fc") + return weight_vector + + def add_bn(self, lstm_concat): + """ + v2.5 add drop out and batch norm + """ + input_fc_proj = static.nn.fc( + x=lstm_concat, + size=8192, + activation=None, + bias_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(coeff=0.0), + initializer=paddle.nn.initializer.Normal(std=0.0))) + input_fc_proj_bn = paddle.static.nn.batch_norm( + input=input_fc_proj, + act="relu", + is_test=(not self.mode == 'train')) + input_fc_proj_dropout = paddle.nn.functional.dropout( + input_fc_proj_bn, + p=self.drop_rate, + training=(self.mode=='train')) + input_fc_hidden = static.nn.fc( + x=input_fc_proj_dropout, + size=4096, + activation=None, + bias_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(coeff=0.0), + initializer=paddle.nn.initializer.Normal(std=0.0))) + input_fc_hidden_bn = paddle.static.nn.batch_norm( + input=input_fc_hidden, + act="relu", + is_test=(not self.mode == 'train')) + input_fc_hidden_dropout = paddle.nn.functional.dropout( + input_fc_hidden_bn, + p=self.drop_rate, + training=(self.mode=='train')) + return input_fc_hidden_dropout + + def optimizer(self): + """ + optimizer + """ + assert self.mode == 'train', "optimizer only can be get in train mode" + values = [ + self.learning_rate * (self.decay_gamma ** i) + for i in range(len(self.decay_epochs) + 1) + ] + iter_per_epoch = self.num_samples / self.batch_size + boundaries = [e * iter_per_epoch for e in self.decay_epochs] + return paddle.optimizer.RMSProp( + learning_rate=paddle.optimizer.lr.PiecewiseDecay(values=values, + boundaries=boundaries), + centered=True, + weight_decay=paddle.regularizer.L2Decay( + coeff=self.weight_decay)) + + def softlabel_cross_entropy_loss(self): + """ + softlabel_cross_entropy_loss + """ + assert self.mode != 'infer', "invalid loss calculationg in infer mode" + ''' + cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \ + label=self.label_input) + ''' + cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \ + label=self.label_input, + soft_label=True) + + cost = paddle.sum(x=cost, axis=-1) + sum_cost = paddle.sum(x=cost) + self.loss_ = paddle.scale(sum_cost, + scale=self.num_gpus, + bias_after_scale=False) + + return self.loss_ + + def sigmoid_cross_entropy_loss(self): + """ + sigmoid_cross_entropy_loss + """ + assert self.mode != 'infer', "invalid loss calculationg in infer mode" + cost = paddle.nn.functional.binary_cross_entropy(input=self.logit,\ + label=self.label_input, reduction=None) + + cost = paddle.sum(x=cost, axis=-1) + sum_cost = paddle.sum(x=cost) + self.loss_ = paddle.scale(sum_cost, + scale=self.num_gpus, + bias_after_scale=False) + + return self.loss_ + + def loss(self): + """ + loss + """ + if self.loss_type == 'sigmoid': + return 
self.sigmoid_cross_entropy_loss() + else: + return self.softlabel_cross_entropy_loss() + + def outputs(self): + """ + get outputs + """ + return self.network_outputs + + def feeds(self): + """ + get feeds + """ + return self.feature_input if self.mode == 'infer' else self.feature_input + [ + self.label_input + ] + + def pyreader(self): + """pyreader""" + return self.py_reader + + def epoch_num(self): + """get train epoch num""" + return self.cfg.TRAIN.epoch + + def load_test_weights_file(self, exe, weights, prog, place): + """ + load_test_weights_file + """ + load_vars = [x for x in prog.list_vars() \ + if isinstance(x, paddle.framework.Parameter)] + static.load_vars(exe, + dirname=weights, + vars=load_vars, + filename="param") diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py new file mode 100755 index 000000000..84861c1e9 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/models/ernie.py @@ -0,0 +1,250 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Ernie model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import json +import six +import logging +import paddle +import paddle.static as static +from io import open + +from .transformer_encoder import encoder, pre_process_layer + +log = logging.getLogger(__name__) + +class ErnieConfig(object): + """ + Erine model config + """ + def __init__(self, config_path): + """ + init + """ + self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + """ + parse config + """ + try: + with open(config_path, 'r', encoding='utf8') as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing Ernie model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + """ + get item + """ + return self._config_dict.get(key, None) + def __setitem__(self, key, value): + """ + set item + """ + self._config_dict[key] = value + + def print_config(self): + """ + print config + """ + for arg, value in sorted(six.iteritems(self._config_dict)): + log.info('%s: %s' % (arg, value)) + log.info('------------------------------------------------') + + +class ErnieModel(object): + """ + ERINE Model + """ + def __init__(self, + src_ids, + position_ids, + sentence_ids, + task_ids, + input_mask, + config, + weight_sharing=True, + use_fp16=False): + """ + init model + """ + self._emb_size = config['hidden_size'] + self._n_layer = config['num_hidden_layers'] + self._n_head = config['num_attention_heads'] + self._voc_size = config['vocab_size'] + self._max_position_seq_len = config['max_position_embeddings'] + if config['sent_type_vocab_size']: + self._sent_types = config['sent_type_vocab_size'] + else: + self._sent_types = 
config['type_vocab_size'] + + self._use_task_id = config['use_task_id'] + if self._use_task_id: + self._task_types = config['task_type_vocab_size'] + self._hidden_act = config['hidden_act'] + self._prepostprocess_dropout = config['hidden_dropout_prob'] + self._attention_dropout = config['attention_probs_dropout_prob'] + self._weight_sharing = weight_sharing + + self._word_emb_name = "word_embedding" + self._pos_emb_name = "pos_embedding" + self._sent_emb_name = "sent_embedding" + self._task_emb_name = "task_embedding" + self._dtype = "float16" if use_fp16 else "float32" + self._emb_dtype = "float32" + + # Initialize all weigths by truncated normal initializer, and all biases + # will be initialized by constant zero by default. + self._param_initializer = paddle.nn.initializer.TruncatedNormal( + std=config['initializer_range']) + + self._build_model(src_ids, position_ids, sentence_ids, task_ids, + input_mask) + + def _build_model(self, src_ids, position_ids, sentence_ids, task_ids, + input_mask): + """ + build model + """ + # padding id in vocabulary must be set to 0 + emb_out = static.nn.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._emb_dtype, + param_attr=paddle.ParamAttr( + name=self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) + + position_emb_out = static.nn.embedding( + input=position_ids, + size=[self._max_position_seq_len, self._emb_size], + dtype=self._emb_dtype, + param_attr=paddle.ParamAttr( + name=self._pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = static.nn.embedding( + sentence_ids, + size=[self._sent_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=paddle.ParamAttr( + name=self._sent_emb_name, initializer=self._param_initializer)) + + # emb_out = emb_out + position_emb_out + # emb_out = emb_out + sent_emb_out + emb_out = paddle.add(x=emb_out, y=position_emb_out) + emb_out = paddle.add(x=emb_out, y=sent_emb_out) + + if self._use_task_id: + task_emb_out = static.nn.embedding( + task_ids, + size=[self._task_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=paddle.ParamAttr( + name=self._task_emb_name, + initializer=self._param_initializer)) + + emb_out = emb_out + task_emb_out + + emb_out = pre_process_layer( + emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') + + if self._dtype == "float16": + emb_out = paddle.cast(x=emb_out, dtype=self._dtype) + input_mask = paddle.cast(x=input_mask, dtype=self._dtype) + self_attn_mask = paddle.matmul( + x=input_mask, y=input_mask, transpose_y=True) + + self_attn_mask = paddle.scale( + x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) + n_head_self_attn_mask = paddle.stack( + x=[self_attn_mask] * self._n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + self._enc_out = encoder( + enc_input=emb_out, + attn_bias=n_head_self_attn_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd="", + postprocess_cmd="dan", + param_initializer=self._param_initializer, + name='encoder') + if self._dtype == "float16": + self._enc_out = paddle.cast( + x=self._enc_out, dtype=self._emb_dtype) + + + def get_sequence_output(self): + """ + get sequence output + """ + return self._enc_out + + def 
get_sequence_textcnn_output(self, sequence_feature, input_mask): + """ + get sequence output + """ + seq_len = paddle.sum(x=input_mask, axis=[1, 2]) + seq_len = paddle.cast(seq_len, 'int64') + sequence_feature = paddle.static.nn.sequence_unpad(sequence_feature, seq_len) + + return self.textcnn(sequence_feature) + + def get_pooled_output(self): + """Get the first feature of each sequence for classification""" + next_sent_feat = paddle.slice( + input=self._enc_out, axes=[1], starts=[0], ends=[1]) + next_sent_feat = static.nn.fc( + x=next_sent_feat, + size=self._emb_size, + activation="tanh", + weight_attr=paddle.ParamAttr( + name="pooled_fc.w_0", initializer=self._param_initializer), + bias_attr="pooled_fc.b_0") + return next_sent_feat + + def textcnn(self, feature, name='text_cnn'): + """ + TextCNN sequence feature extraction + """ + win_sizes = [2, 3, 4] + hid_dim = 256 + convs = [] + for win_size in win_sizes: + conv_h = paddle.fluid.nets.sequence_conv_pool(input=feature, + num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max") + convs.append(conv_h) + convs_out = paddle.concat(x=convs, axis=1) + return convs_out diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py new file mode 100755 index 000000000..a0264bd9a --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/models/transformer_encoder.py @@ -0,0 +1,338 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Transformer encoder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from functools import partial +import paddle +import paddle.static as static + + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + param_initializer=None, + name='multi_head_att'): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + keys = queries if keys is None else keys + values = keys if values is None else values + + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. 
+ """ + q = static.nn.fc(x=queries, + size=d_key * n_head, + num_flatten_dims=2, + weight_attr=paddle.ParamAttr( + name=name + '_query_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_query_fc.b_0') + k = static.nn.fc(x=keys, + size=d_key * n_head, + num_flatten_dims=2, + weight_attr=paddle.ParamAttr( + name=name + '_key_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_key_fc.b_0') + v = static.nn.fc(x=values, + size=d_value * n_head, + num_flatten_dims=2, + weight_attr=paddle.ParamAttr( + name=name + '_value_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_value_fc.b_0') + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = paddle.reshape( + x=x, shape=[0, 0, n_head, hidden_size // n_head]) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return paddle.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = paddle.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + return paddle.reshape( + x=trans_x, + shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]]) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = paddle.scale(x=q, scale=d_key**-0.5) + product = paddle.matmul(x=scaled_q, y=k, transpose_y=True) + if attn_bias: + # product += attn_bias + product = paddle.add(x=product, y=attn_bias) + weights = paddle.nn.functional.softmax(x=product) + if dropout_rate: + weights = paddle.nn.functional.dropout(weights, p=dropout_rate, mode="upscale_in_train", training=True) + out = paddle.matmul(x=weights, y=v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + + if cache is not None: # use cache and concat time steps + # Since the inplace reshape in __split_heads changes the shape of k and + # v, which is the cache input for next time step, reshape the cache + # input from the previous time step first. + k = cache["k"] = paddle.concat( + x=[paddle.reshape( + x=cache["k"], shape=[0, 0, d_model]), k], axis=1) + v = cache["v"] = paddle.concat( + x=[paddle.reshape( + x=cache["v"], shape=[0, 0, d_model]), v], axis=1) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. 
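+    # Shape walk-through (assuming batch size B and sequence length L):
+    #   q/k/v after __split_heads:           [B, n_head, L, d_key] (d_value for v)
+    #   scaled_dot_product_attention output: [B, n_head, L, d_value]
+    #   __combine_heads output:              [B, L, n_head * d_value]
+    # The fully-connected layer below maps the concatenated heads back to d_model.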
+ proj_out = static.nn.fc(x=out, + size=d_model, + num_flatten_dims=2, + weight_attr=paddle.ParamAttr( + name=name + '_output_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_output_fc.b_0') + return proj_out + + +def positionwise_feed_forward(x, + d_inner_hid, + d_hid, + dropout_rate, + hidden_act, + param_initializer=None, + name='ffn'): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = static.nn.fc(x=x, + size=d_inner_hid, + num_flatten_dims=2, + activation=hidden_act, + weight_attr=paddle.ParamAttr( + name=name + '_fc_0.w_0', + initializer=param_initializer), + bias_attr=name + '_fc_0.b_0') + if dropout_rate: + hidden = paddle.nn.functional.dropout( + hidden, + p=dropout_rate, + mode="upscale_in_train", + training=True) + out = static.nn.fc(x=hidden, + size=d_hid, + num_flatten_dims=2, + weight_attr=paddle.ParamAttr( + name=name + '_fc_1.w_0', initializer=param_initializer), + bias_attr=name + '_fc_1.b_0') + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., + name=''): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + # out = out + prev_out if prev_out else out + out = paddle.add(x=out, y=prev_out) if prev_out else out + elif cmd == "n": # add layer normalization + out_dtype = out.dtype + if out_dtype == "float16": + out = paddle.cast(x=out, dtype="float32") + out = static.nn.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=paddle.ParamAttr( + name=name + '_layer_norm_scale', + initializer=paddle.nn.initializer.Constant(value=1.)), + bias_attr=paddle.ParamAttr( + name=name + '_layer_norm_bias', + initializer=paddle.nn.initializer.Constant(value=0.))) + if out_dtype == "float16": + out = paddle.cast(x=out, dtype="float16") + elif cmd == "d": # add dropout + if dropout_rate: + out = paddle.nn.functional.dropout( + out, + p=dropout_rate, + dropout_implementation="upscale_in_train", + training=True) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. 
+ """ + attn_output = multi_head_attention( + pre_process_layer( + enc_input, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_att'), + None, + None, + attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + param_initializer=param_initializer, + name=name + '_multi_head_att') + attn_output = post_process_layer( + enc_input, + attn_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_att') + ffd_output = positionwise_feed_forward( + pre_process_layer( + attn_output, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_ffn'), + d_inner_hid, + d_model, + relu_dropout, + hidden_act, + param_initializer=param_initializer, + name=name + '_ffn') + return post_process_layer( + attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_ffn') + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") + + return enc_output diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py new file mode 100755 index 000000000..8764fd516 --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/train.py @@ -0,0 +1,263 @@ +""" +train main +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
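+# Rough flow of this script: parse the config plus CLI overrides, build static
+# train/valid programs around AttentionLstmErnie, load ERNIE pretraining
+# parameters (or resume from a checkpoint), compile the programs with data
+# parallelism, and hand the epoch loop over to utils.train_with_pyreader.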
+ +import os +import sys +import time +import argparse +import logging + + +import numpy as np +import paddle +paddle.enable_static() +import paddle.static as static + +from accuracy_metrics import MetricsCalculator +from datareader import get_reader +from config import print_configs, merge_configs, parse_config +from models.attention_lstm_ernie import AttentionLstmErnie +from utils import init_pretraining_params, train_with_pyreader + + +logging.root.handlers = [] +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def parse_args(): + """parse_args + """ + parser = argparse.ArgumentParser("Paddle Video train script") + parser.add_argument( + '--model_name', + type=str, + default='BaiduNet', + help='name of model to train.') + parser.add_argument( + '--config', + type=str, + default='configs/conf.txt', + help='path to config file of model') + parser.add_argument( + '--batch_size', + type=int, + default=None, + help='training batch size. None to use config file setting.') + parser.add_argument( + '--learning_rate', + type=float, + default=None, + help='learning rate use for training. None to use config file setting.') + parser.add_argument( + '--pretrain', + type=str, + default=None, + help='path to pretrain weights. None to use default weights path in ~/.paddle/weights.' + ) + parser.add_argument( + '--resume', + type=str, + default=None, + help='path to resume training based on previous checkpoints. ' + 'None for not resuming any checkpoints.') + parser.add_argument( + '--use_gpu', type=bool, default=True, help='default use gpu.') + parser.add_argument( + '--no_use_pyreader', + action='store_true', + default=False, + help='whether to use pyreader') + parser.add_argument( + '--no_memory_optimize', + action='store_true', + default=False, + help='whether to use memory optimize in train') + parser.add_argument( + '--epoch_num', + type=int, + default=0, + help='epoch number, 0 for read from config file') + parser.add_argument( + '--valid_interval', + type=int, + default=1, + help='validation epoch interval, 0 for no validation.') + parser.add_argument( + '--save_dir', + type=str, + default='checkpoints', + help='directory name to save train snapshoot') + parser.add_argument( + '--log_interval', + type=int, + default=10, + help='mini-batch interval to log.') + parser.add_argument( + '--save_log_name', + type=str, + default='train_val', + help='save to tensorboard filename recommand model name.') + args = parser.parse_args() + return args + + +def train(args): + """train main + """ + # parse config + config = parse_config(args.config) + train_config = merge_configs(config, 'train', vars(args)) + valid_config = merge_configs(config, 'valid', vars(args)) + print_configs(train_config, 'Train') + train_model = AttentionLstmErnie(args.model_name, train_config, mode='train') + valid_model = AttentionLstmErnie(args.model_name, valid_config, mode='valid') + + max_train_steps = train_config.TRAIN.epoch * train_config.TRAIN.num_samples // train_config.TRAIN.batch_size + print('max train steps %d' % (max_train_steps)) + # build model + startup = static.Program() + train_prog = static.Program() + with static.program_guard(train_prog, startup): + paddle.disable_static() + train_model.build_input(use_pyreader=True) + train_model.build_model() + # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label + train_feeds = train_model.feeds() + 
train_feeds[-1].persistable = True + # for the output of classification model, has the form [pred] + train_outputs = train_model.outputs() + for output in train_outputs: + output.persistable = True + train_loss = train_model.loss() + train_loss.persistable = True + # outputs, loss, label should be fetched, so set persistable to be true + optimizer = train_model.optimizer() + optimizer.minimize(train_loss) + train_pyreader = train_model.pyreader() + paddle.enable_static() + + if not args.no_memory_optimize: + paddle.distributed.transpiler.memory_optimize(train_prog) + + valid_prog = static.Program() + with static.program_guard(valid_prog, startup): + paddle.disable_static() + valid_model.build_input(True) + valid_model.build_model() + valid_feeds = valid_model.feeds() + valid_outputs = valid_model.outputs() + valid_loss = valid_model.loss() + valid_pyreader = valid_model.pyreader() + paddle.enable_static() + + place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + exe.run(startup) + + if args.resume: + # if resume weights is given, load resume weights directly + assert os.path.exists(args.resume), \ + "Given resume weight dir {} not exist.".format(args.resume) + + def if_exist(var): + """if_exist + """ + return os.path.exists(os.path.join(args.resume, var.name)) + + print('resuming ,,,,,,,,,,,,,,') + paddle.fluid.io.load_persistables( + exe, '', main_program=train_prog, filename=args.resume) + + else: + # load ernie pretrain model + init_pretraining_params(exe, + train_config.TRAIN.ernie_pretrain_dict_path, + main_program=train_prog) + # if not in resume mode, load pretrain weights + # this pretrain may be only audio or video + if args.pretrain: + assert os.path.exists(args.pretrain), \ + "Given pretrain weight dir {} not exist.".format(args.pretrain) + if args.pretrain: + train_model.load_test_weights_file(exe, args.pretrain, train_prog, place) + + build_strategy = paddle.static.BuildStrategy() + build_strategy.enable_inplace = True + + compiled_train_prog = static.CompiledProgram( + train_prog).with_data_parallel(loss_name=train_loss.name, + build_strategy=build_strategy) + compiled_valid_prog = static.CompiledProgram( + valid_prog).with_data_parallel(share_vars_from=compiled_train_prog, + build_strategy=build_strategy) + + # get reader + bs_denominator = 1 + if (not args.no_use_pyreader) and args.use_gpu: + dev_list = static.cuda_places() + bs_denominator = len(dev_list) + train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size / + bs_denominator) + valid_config.VALID.batch_size = int(valid_config.VALID.batch_size / + bs_denominator) + train_reader = get_reader(args.model_name.upper(), 'train', train_config) + valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config) + + exe_places = static.cuda_places() if args.use_gpu else static.cpu_places() + train_pyreader.decorate_sample_list_generator(train_reader, + places=exe_places) + valid_pyreader.decorate_sample_list_generator(valid_reader, + places=exe_places) + + # get metrics + train_metrics = MetricsCalculator(args.model_name.upper(), 'train', train_config) + valid_metrics = MetricsCalculator(args.model_name.upper(), 'valid', valid_config) + # print("****************************valid_metrics", valid_metrics.get()) + train_fetch_list = [train_loss.name] + [x.name for x in train_outputs + ] + [train_feeds[-1].name] + valid_fetch_list = [valid_loss.name] + [x.name for x in valid_outputs + ] + [valid_feeds[-1].name] + + epochs = args.epoch_num or train_model.epoch_num() + 
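+    # train_with_pyreader (scenario_lib/utils.py) drives the epoch loop: it logs
+    # every `log_interval` steps, validates every `valid_interval` epochs, saves
+    # a checkpoint whenever validation acc@1 improves, and stops early once the
+    # non-improvement counter (EARLY_STOP_NUM = 20) is exhausted.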
+ train_with_pyreader( + exe, + train_prog, + compiled_train_prog, + train_pyreader, + train_fetch_list, + train_metrics, + epochs=epochs, + log_interval=args.log_interval, + valid_interval=args.valid_interval, + save_dir=args.save_dir, + save_model_name=args.model_name, + test_exe=compiled_valid_prog, + test_pyreader=valid_pyreader, + test_fetch_list=valid_fetch_list, + test_metrics=valid_metrics) + +if __name__ == "__main__": + args = parse_args() + logger.info(args) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + train(args) diff --git a/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py b/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py new file mode 100755 index 000000000..b90a4601b --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/scenario_lib/utils.py @@ -0,0 +1,218 @@ +""" +utils +""" +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time +import traceback +import logging +import shutil + +import numpy as np +import paddle +import paddle.static as static +import static as static + + +logger = logging.getLogger(__name__) + + +def test_with_pyreader(exe, + compiled_test_prog, + test_pyreader, + test_fetch_list, + test_metrics, + log_interval=0): + """test_with_pyreader + """ + if not test_pyreader: + logger.error("[TEST] get pyreader failed.") + test_metrics.reset() + test_iter = 0 + label_all = [] + pred_all = [] + try: + for data in test_pyreader(): + test_outs = exe.run(compiled_test_prog, + fetch_list=test_fetch_list, + feed=data) + loss = np.array(test_outs[0]) + pred = np.array(test_outs[1]) + label = np.array(test_outs[-1]) + pred_all.extend(pred) + label_all.extend(label) + test_metrics.accumulate(loss, pred, label) + test_iter += 1 + test_metrics.finalize_and_log_out("[TEST] Finish") + except Exception as e: + logger.warn( + "[TEST] fail to execute test or calculate metrics: {}".format(e)) + traceback.print_exc() + metrics_dict, test_loss = test_metrics.get_computed_metrics() + metrics_dict['label_all'] = label_all + metrics_dict['pred_all'] = pred_all + return test_loss, metrics_dict + + +def train_with_pyreader(exe, train_prog, compiled_train_prog, train_pyreader, + train_fetch_list, train_metrics, epochs=10, + log_interval=0, valid_interval=0, + save_dir='./', save_model_name='model', + test_exe=None, test_pyreader=None, + test_fetch_list=None, test_metrics=None): + """train_with_pyreader + """ + if not train_pyreader: + logger.error("[TRAIN] get pyreader failed.") + EARLY_STOP_NUM = 20 + early_stop = EARLY_STOP_NUM + global_iter = 0 + train_iter = 0 + iter_all = 0 + best_test_acc1 = 0 + + for epoch in range(epochs): + lr = static.global_scope().find_var("learning_rate").get_tensor() + logger.info( + "------- learning rate {}, learning rate counter -----".format( + np.array(lr))) + if early_stop < 0: + logger.info('Earyly Stop !!!') + break + train_metrics.reset() + global_iter += train_iter + epoch_periods = [] + for 
data in train_pyreader(): + try: + cur_time = time.time() + train_outs = exe.run(compiled_train_prog, + fetch_list=train_fetch_list, + feed=data) + iter_all += 1 + period = time.time() - cur_time + epoch_periods.append(period) + loss = np.array(train_outs[0]) + pred = np.array(train_outs[1]) + label = np.array(train_outs[-1]) + train_metrics.accumulate(loss, pred, label) + if log_interval > 0 and (train_iter % log_interval == 0): + # eval here + train_metrics.finalize_and_log_out( + info='[TRAIN] Epoch {} iter {} everage: '.format(epoch, train_iter)) + train_iter += 1 + except Exception as e: + logger.info( + "[TRAIN] Epoch {}, iter {} data training failed: {}". + format(epoch, train_iter, str(e))) + if len(epoch_periods) < 1: + logger.info( + 'No iteration was executed, please check the data reader') + sys.exit(1) + + logger.info( + '[TRAIN] Epoch {} training finished, average time: {}'.format( + epoch, np.mean(epoch_periods))) + train_metrics.finalize_and_log_out( \ + info='[TRAIN] Finished ... Epoch {} all iters average: '.format(epoch)) + + # save models of min loss in best acc epochs + if test_exe and valid_interval > 0 and (epoch + + 1) % valid_interval == 0: + # metrics_dict,loss = train_metrics.calculator.get_computed_metrics() + loss, metrics_dict_test = test_with_pyreader( + exe, test_exe, test_pyreader, test_fetch_list, test_metrics, + log_interval) + test_acc1 = metrics_dict_test['avg_acc1'] + if test_acc1 > best_test_acc1: + best_test_acc1 = test_acc1 + save_model(exe, train_prog, save_dir, save_model_name, + "_epoch{}_acc{}".format(epoch, best_test_acc1)) + early_stop = EARLY_STOP_NUM + else: + early_stop -= 1 + + +def save_model(exe, program, save_dir, model_name, postfix=None): + """save_model + """ + model_path = os.path.join(save_dir, model_name + postfix) + if os.path.isdir(model_path): + shutil.rmtree(model_path) + # fluid.io.save_persistables(exe, model_path, main_program=program) + save_vars = [x for x in program.list_vars() \ + if isinstance(x, paddle.framework.Parameter)] + + static.save_vars(exe, + dirname=model_path, + main_program=program, + vars=save_vars, + filename="param") + + +def save_model_persist(exe, program, save_dir, model_name, postfix=None): + """save_model""" + model_path = os.path.join(save_dir, model_name + postfix) + if os.path.isdir(model_path): + shutil.rmtree(model_path) + paddle.fluid.io.save_persistables(exe, + save_dir, + main_program=program, + filename=model_path) + + +def init_pretraining_params(exe, + pretraining_params_path, + main_program, + use_fp16=False): + """ + init pretrain_params + """ + assert os.path.exists(pretraining_params_path + ), "[%s] cann't be found." 
% pretraining_params_path + + def existed_params(var): + """ + Load existed params + """ + if not isinstance(var, paddle.framework.Parameter): + return False + flag = os.path.exists(os.path.join(pretraining_params_path, var.name)) + return flag + + static.load_vars(exe, + pretraining_params_path, + main_program=main_program, + predicate=existed_params) + logger.info( + "Load pretraining parameters from {}.".format(pretraining_params_path)) + + +class AttrDict(dict): + """AttrDict + """ + def __getattr__(self, key): + """getter + """ + return self[key] + + def __setattr__(self, key, value): + """setter + """ + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value diff --git a/docs/src/applications/MultimodalVideoTag/train.sh b/docs/src/applications/MultimodalVideoTag/train.sh new file mode 100755 index 000000000..05db2585e --- /dev/null +++ b/docs/src/applications/MultimodalVideoTag/train.sh @@ -0,0 +1,13 @@ +export CUDA_VISIBLE_DEVICES=0,1 +export FLAGS_eager_delete_tensor_gb=0.0 +export FLAGS_sync_nccl_allreduce=1 +export FLAGS_fast_eager_deletion_mode=1 +export FLAGS_fraction_of_gpu_memory_to_use=0.5 +export FLAGS_reallocate_gpu_memory_in_mb=0 +export FLAGS_memory_fraction_of_eager_deletion=1 +python scenario_lib/train.py --model_name=AttentionLstmErnie \ +--config=./conf/conf.txt \ +--log_interval=20 \ +--valid_interval=1 \ +--save_dir=checkpoints_save_new/ \ +--pretrain=checkpoints_save/ diff --git a/docs/src/applications/PP-Care/Readme.md b/docs/src/applications/PP-Care/Readme.md new file mode 100644 index 000000000..61498fe96 --- /dev/null +++ b/docs/src/applications/PP-Care/Readme.md @@ -0,0 +1,110 @@ +# Video models for 3DMRI + +## 内容 + +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型测试](#模型测试) +- [模型推理](#模型推理) +- [实现细节](#实现细节) +- [参考论文](#参考论文) + +在开始使用之前,您需要按照以下命令安装额外的依赖包: +```bash +python -m pip install SimpleITK +``` + +## 模型简介 + +目前对于医学3D数据如MRI,并无太好的处理手段,大多数2D模型无法获得3D空间层面的特征,而常用的3D模型又需要较大的计算成本。而同时,3D医学数据与常见的视频数据有一定相似之处,我们尝试了通过PaddleVideo中的常见模型解决医学3DMRI数据的分类问题,获得了较好的结果。目前支持PP-TSN、PP-TSM、Slowfast和Timesformer对3DMRI的直接训练。 + +## 数据准备 + +数据集包括帕金森患者(PD)与正常(Con)两种类型共378个case,训练集:测试集=300:78,使用数据均为公开数据集,包括*neurocon*, *taowu*, *PPMI*和*OASIS-1*(经过选取),并经过一定格式转换,数据最后的格式均为*name.nii*或*name.nii.gz*,路径与label信息通过txt文件保存,数据集可以通过百度网盘下载:[下载链接](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug) +- 数据集label格式 +``` +{ + "0": "Con", + "1": "PD" +} +``` +- 数据集信息文件格式 +``` +{ + path1 label1 + path2 label2 + ... +} +``` +- 数据保存格式 +``` +{ + |-- datasets + |-- neurocon + |-- taowu + |-- PPMI + |-- OASIS-1 +} +``` + +## 模型训练 + +#### 下载并添加预训练模型 + +1. 对于PP-TSN与PP-TSM,除了可以使用ImageNet1000上训练好的预训练模型(见[PP-TSN预训练模型](../../../docs/zh-CN/model_zoo/recognition/pp-tsn.md)与[PP-TSM预训练模型](../../../docs/zh-CN/model_zoo/recognition/pp-tsm.md)),也可以使用在MRI数据集上预训练的ResNet50权重座位Backbone初始化参数,通过百度网盘下载: [下载链接](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug)。对于Slowfast与TimeSformer,目前只支持是使用自然数据集的预训练模型,见[Slowfast预训练模型](../../../docs/zh-CN/model_zoo/recognition/slowfast.md)与[Timesformer预训练模型](../../../docs/zh-CN/model_zoo/recognition/timesformer.md) + + +2. 
打开`PaddleVideo/applications/PP-Care/configs/XXX.yaml`,将下载好的权重路径填写到下方`pretrained:`之后,以pptsn_MRI为例 + + ```yaml + MODEL: + framework: "RecognizerMRI" + backbone: + name: "ResNetTSN_MRI" + pretrained: 将路径填写到此处 + ``` + +#### 开始训练 + +- 训练使用显卡数量与输出路径等信息均可以选择,以PP-TSN_MRI的4卡训练为例,训练启动命令如下 + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_pptsn_MRI main.py --validate -c applications/PP-Care/configs/pptsn_MRI.yaml + ``` + +## 模型测试 + +由于各模型均存在随机采样部分,且采样方式存在不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,以PP-TSN_MRI为例,命令如下: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_pptsn_MRI main.py --test -c applications/PP-Care/configs/pptsn_MRI.yaml -w "output/ppTSN_MRI/ppTSN_MRI_best.pdparams" +``` + +当测试配置采用.yaml中参数时,在3DMRI数据的validation数据集上的测试指标如下: + +| backbone | head | Acc | +| :----------------: | :----------: | :---: | +| ResNet50 | PP-TSN | 91.07 | +| ResNet50 | PP-TSM | 90.83 | +| 3DResNet50 | Slowfast | 91.07 | +| Vision Transformer | Timesformer | 88.33 | + +训练好的模型可以通过百度网盘下载:[下载链接](https://pan.baidu.com/s/1eIsHHqnkKNG5x9CGjRONEA?pwd=avug) + + +## 模型优化 +在实际使用中,可以尝试模型优化策略 +- 可以根据MRI数据分布,调整采样率 +- 本模型目前未加入过多的数据预处理策略,针对不同数据特性,在本模型基础上加入一定的预处理手段可能会使结果继续提升 +- 由于数据量与任务难度限制,本模型目前在准确率上的表现与3DResNet并无显著区别,但对于时间与空间的需求均远小于3D模型 + + +## 参考论文 + +- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang +- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han +- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean +- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al. +- [A Multigrid Method for Efficiently Training Video Models](https://arxiv.org/abs/1912.00998), Chao-Yuan Wu, Ross Girshick, et al. +- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani diff --git a/docs/src/applications/PPHuman/README.md b/docs/src/applications/PPHuman/README.md new file mode 100644 index 000000000..24659c9f3 --- /dev/null +++ b/docs/src/applications/PPHuman/README.md @@ -0,0 +1,143 @@ +# PP-Human 行为识别模型 + +实时行人分析工具[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)中集成了基于骨骼点的行为识别模块。本文档介绍如何基于[PaddleVideo](https://github.com/PaddlePaddle/PaddleVideo/),完成行为识别模型的训练流程。 + +## 行为识别模型训练 +目前行为识别模型使用的是[ST-GCN](https://arxiv.org/abs/1801.07455),并在[PaddleVideo训练流程](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/stgcn.md)的基础上修改适配,完成模型训练。 + +### 准备训练数据 +STGCN是一个基于骨骼点坐标序列进行预测的模型。在PaddleVideo中,训练数据为采用`.npy`格式存储的`Numpy`数据,标签则可以是`.npy`或`.pkl`格式存储的文件。对于序列数据的维度要求为`(N,C,T,V,M)`。 + +以我们在PPhuman中的模型为例,其中具体说明如下: +| 维度 | 大小 | 说明 | +| ---- | ---- | ---------- | +| N | 不定 | 数据集序列个数 | +| C | 2 | 关键点坐标维度,即(x, y) | +| T | 50 | 动作序列的时序维度(即持续帧数)| +| V | 17 | 每个人物关键点的个数,这里我们使用了`COCO`数据集的定义,具体可见[这里](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/docs/tutorials/PrepareKeypointDataSet_cn.md#COCO%E6%95%B0%E6%8D%AE%E9%9B%86) | +| M | 1 | 人物个数,这里我们每个动作序列只针对单人预测 | + +#### 1. 
获取序列的骨骼点坐标 +对于一个待标注的序列(这里序列指一个动作片段,可以是视频或有顺序的图片集合)。可以通过模型预测或人工标注的方式获取骨骼点(也称为关键点)坐标。 +- 模型预测:可以直接选用[PaddleDetection KeyPoint模型系列](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/keypoint) 模型库中的模型,并根据`3、训练与测试 - 部署预测 - 检测+keypoint top-down模型联合部署`中的步骤获取目标序列的17个关键点坐标。 +- 人工标注:若对关键点的数量或是定义有其他需求,也可以直接人工标注各个关键点的坐标位置,注意对于被遮挡或较难标注的点,仍需要标注一个大致坐标,否则后续网络学习过程会受到影响。 + +在完成骨骼点坐标的获取后,建议根据各人物的检测框进行归一化处理,以消除人物位置、尺度的差异给网络带来的收敛难度,这一步可以参考[这里](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/pphuman/pipe_utils.py#L352-L363)。 + +#### 2. 统一序列的时序长度 +由于实际数据中每个动作的长度不一,首先需要根据您的数据和实际场景预定时序长度(在PP-Human中我们采用50帧为一个动作序列),并对数据做以下处理: +- 实际长度超过预定长度的数据,随机截取一个50帧的片段 +- 实际长度不足预定长度的数据:补0,直到满足50帧 +- 恰好等于预定长度的数据: 无需处理 + +注意:在这一步完成后,请严格确认处理后的数据仍然包含了一个完整的行为动作,不会产生预测上的歧义,建议通过可视化数据的方式进行确认。 + +#### 3. 保存为PaddleVideo可用的文件格式 +在经过前两步处理后,我们得到了每个人物动作片段的标注,此时我们已有一个列表`all_kpts`,这个列表中包含多个关键点序列片段,其中每一个片段形状为(T, V, C) (在我们的例子中即(50, 17, 2)), 下面进一步将其转化为PaddleVideo可用的格式。 +- 调整维度顺序: 可通过`np.transpose`和`np.expand_dims`将每一个片段的维度转化为(C, T, V, M)的格式。 +- 将所有片段组合并保存为一个文件 + +注意:这里的`class_id`是`int`类型,与其他分类任务类似。例如`0:摔倒, 1:其他`。 + +至此,我们得到了可用的训练数据(`.npy`)和对应的标注文件(`.pkl`)。 + +#### 示例:基于UR Fall Detection Dataset的摔倒数据处理 +[UR Fall Detection Dataset](http://fenix.univ.rzeszow.pl/~mkepski/ds/uf.html)是一个包含了不同摄像机视角及不同传感器下的摔倒检测数据集。数据集本身并不包含关键点坐标标注,在这里我们使用平视视角(camera 0)的RGB图像数据,介绍如何依照上面展示的步骤完成数据准备工作。 + +(1)使用[PaddleDetection关键点模型](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/keypoint)完成关键点坐标的检测 +```bash +# current path is under root of PaddleDetection + +# Step 1: download pretrained inference models. +wget https://bj.bcebos.com/v1/paddledet/models/pipeline/mot_ppyoloe_l_36e_pipeline.zip +wget https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip +unzip -d output_inference/ mot_ppyoloe_l_36e_pipeline.zip +unzip -d output_inference/ dark_hrnet_w32_256x192.zip + +# Step 2: Get the keypoint coordinarys + +# if your data is image sequence +python deploy/python/det_keypoint_unite_infer.py --det_model_dir=output_inference/mot_ppyoloe_l_36e_pipeline/ --keypoint_model_dir=output_inference/dark_hrnet_w32_256x192 --image_dir={your image directory path} --device=GPU --save_res=True + +# if your data is video +python deploy/python/det_keypoint_unite_infer.py --det_model_dir=output_inference/mot_ppyoloe_l_36e_pipeline/ --keypoint_model_dir=output_inference/dark_hrnet_w32_256x192 --video_file={your video file path} --device=GPU --save_res=True +``` +这样我们会得到一个`det_keypoint_unite_image_results.json`的检测结果文件。内容的具体含义请见[这里](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/python/det_keypoint_unite_infer.py#L108)。 + +这里我们需要对UR Fall中的每一段数据执行上面介绍的步骤,在每一段执行完成后及时将检测结果文件妥善保存到一个文件夹中。 +```bash + +mkdir {root of PaddleVideo}/applications/PPHuman/datasets/annotations +mv det_keypoint_unite_image_results.json {root of PaddleVideo}/applications/PPHuman/datasets/annotations/det_keypoint_unite_image_results_{video_id}_{camera_id}.json +``` + +(2)将关键点坐标转化为训练数据 + + +在完成上述步骤后,我们得到的骨骼点数据形式如下: +``` +annotations/ +├── det_keypoint_unite_image_results_fall-01-cam0-rgb.json +├── det_keypoint_unite_image_results_fall-02-cam0-rgb.json +├── det_keypoint_unite_image_results_fall-03-cam0-rgb.json +├── det_keypoint_unite_image_results_fall-04-cam0-rgb.json + ... 
+├── det_keypoint_unite_image_results_fall-28-cam0-rgb.json +├── det_keypoint_unite_image_results_fall-29-cam0-rgb.json +└── det_keypoint_unite_image_results_fall-30-cam0-rgb.json +``` +这里使用我们提供的脚本直接将数据转化为训练数据, 得到数据文件`train_data.npy`, 标签文件`train_label.pkl`。该脚本执行的内容包括解析json文件内容、前述步骤中介绍的整理训练数据及保存数据文件。 +```bash +# current path is {root of PaddleVideo}/applications/PPHuman/datasets/ + +python prepare_dataset.py +``` +几点说明: +- UR Fall的动作大多是100帧左右长度对应一个完整动作,个别视频包含一些无关动作,可以手工去除,也可以裁剪作为负样本 +- 统一将数据整理为100帧,再抽取为50帧,保证动作完整性 +- 上述包含摔倒的动作是正样本,在实际训练中也需要一些其他的动作或正常站立等作为负样本,步骤同上,但注意label的类型取1。 + +这里我们提供了我们处理好的更全面的[数据](https://bj.bcebos.com/v1/paddledet/data/PPhuman/fall_data.zip),包括其他场景中的摔倒及非摔倒的动作场景。 + +### 训练与测试 +在PaddleVideo中,使用以下命令即可开始训练: +```bash +# current path is under root of PaddleVideo +python main.py -c applications/PPHuman/configs/stgcn_pphuman.yaml + +# 由于整个任务可能过拟合,建议同时开启验证以保存最佳模型 +python main.py --validate -c applications/PPHuman/configs/stgcn_pphuman.yaml +``` + +在训练完成后,采用以下命令进行预测: +```bash +python main.py --test -c applications/PPHuman/configs/stgcn_pphuman.yaml -w output/STGCN/STGCN_best.pdparams +``` + +### 导出模型推理 + +- 在PaddleVideo中,通过以下命令实现模型的导出,得到模型结构文件`STGCN.pdmodel`和模型权重文件`STGCN.pdiparams`,并增加配置文件: +```bash +# current path is under root of PaddleVideo +python tools/export_model.py -c applications/PPHuman/configs/stgcn_pphuman.yaml \ + -p output/STGCN/STGCN_best.pdparams \ + -o output_inference/STGCN + +cp applications/PPHuman/configs/infer_cfg.yml output_inference/STGCN + +# 重命名模型文件,适配PP-Human的调用 +cd output_inference/STGCN +mv STGCN.pdiparams model.pdiparams +mv STGCN.pdiparams.info model.pdiparams.info +mv STGCN.pdmodel model.pdmodel +``` +完成后的导出模型目录结构如下: +``` +STGCN +├── infer_cfg.yml +├── model.pdiparams +├── model.pdiparams.info +├── model.pdmodel +``` + +至此,就可以使用[PP-Human](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/pphuman)进行行为识别的推理了。 diff --git a/docs/src/applications/PPHuman/datasets/prepare_dataset.py b/docs/src/applications/PPHuman/datasets/prepare_dataset.py new file mode 100644 index 000000000..81d6f1fe6 --- /dev/null +++ b/docs/src/applications/PPHuman/datasets/prepare_dataset.py @@ -0,0 +1,98 @@ +import os +import json +import numpy as np +import pickle +""" + This python script is used to convert keypoint results of UR FALL dataset + for training by PaddleVideo +""" + + +def self_norm(kpt, bbox): + # kpt: (2, T, 17, 1), bbox: (T, 4) + tl = bbox[:, 0:2] + wh = bbox[:, 2:] + tl = np.expand_dims(np.transpose(tl, (1, 0)), (2, 3)) + wh = np.expand_dims(np.transpose(wh, (1, 0)), (2, 3)) + + res = (kpt - tl) / wh + res *= np.expand_dims(np.array([[384.], [512.]]), (2, 3)) + return res + + +def convert_to_ppvideo(all_kpts, all_scores, all_bbox): + # shape of all_kpts is (T, 17, 2) + keypoint = np.expand_dims(np.transpose(all_kpts, [2, 0, 1]), + -1) #(2, T, 17, 1) + keypoint = self_norm(keypoint, all_bbox) + + scores = all_scores + if keypoint.shape[1] > 100: + frame_start = (keypoint.shape[1] - 100) // 2 + keypoint = keypoint[:, frame_start:frame_start + 100:2, :, :] + scores = all_scores[frame_start:frame_start + 100:2, :, :] + elif keypoint.shape[1] < 100: + keypoint = np.concatenate([ + keypoint, + np.zeros((2, 100 - keypoint.shape[1], 17, 1), dtype=keypoint.dtype) + ], 1)[:, ::2, :, :] + scores = np.concatenate([ + all_scores, + np.zeros((100 - all_scores.shape[0], 17, 1), dtype=keypoint.dtype) + ], 0)[::2, :, :] + else: + keypoint = keypoint[:, ::2, :, :] + scores = scores[::2, :, :] + return keypoint, scores + + +def decode_json_path(json_path): 
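+    # Parse one det_keypoint_unite_image_results_*.json file: each record is
+    # [frame_id, bboxes, keypoints]; frames that do not contain exactly one
+    # detected person are skipped. convert_to_ppvideo then bbox-normalizes the
+    # keypoints and crops/pads the sequence to 100 frames before taking every
+    # second frame, so the returned array has shape (2, 50, 17, 1), plus scores.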
+ content = json.load(open(json_path)) + content = sorted(content, key=lambda x: x[0]) + all_kpts = [] + all_score = [] + all_bbox = [] + for annos in content: + bboxes = annos[1] + kpts = annos[2][0] + frame_id = annos[0] + + if len(bboxes) != 1: + continue + kpt_res = [] + kpt_score = [] + for kpt in kpts[0]: + x, y, score = kpt + kpt_res.append([x, y]) + kpt_score.append([score]) + all_kpts.append(np.array(kpt_res)) + all_score.append(np.array(kpt_score)) + all_bbox.append([ + bboxes[0][0], bboxes[0][1], bboxes[0][2] - bboxes[0][0], + bboxes[0][3] - bboxes[0][1] + ]) + all_kpts_np = np.array(all_kpts) + all_score_np = np.array(all_score) + all_bbox_np = np.array(all_bbox) + video_anno, scores = convert_to_ppvideo(all_kpts_np, all_score_np, + all_bbox_np) + + return video_anno, scores + + +if __name__ == '__main__': + all_keypoints = [] + all_labels = [[], []] + all_scores = [] + for i, path in enumerate(os.listdir("annotations")): + video_anno, score = decode_json_path(os.path.join("annotations", path)) + + all_keypoints.append(video_anno) + all_labels[0].append(str(i)) + all_labels[1].append(0) #label 0 means falling + all_scores.append(score) + all_data = np.stack(all_keypoints, 0) + all_score_data = np.stack(all_scores, 0) + np.save(f"train_data.npy", all_data) + pickle.dump(all_labels, open(f"train_label.pkl", "wb")) + np.save("kptscore_data.npy", all_score_data) diff --git a/docs/src/applications/README.md b/docs/src/applications/README.md new file mode 100644 index 000000000..baf0f8a7b --- /dev/null +++ b/docs/src/applications/README.md @@ -0,0 +1,18 @@ +# 应用案例 + +## 1. 概览 + +| Applications | Descriptions | +| :--------------- | :-------- | +| [FootballAction](./FootballAction) | 足球动作检测方案| +| [BasketballAction](./BasketballAction) | 篮球动作检测方案 | +| [TableTennis](./TableTennis) | 乒乓球动作识别方案| +| [FigureSkating](./FigureSkating) | 花样滑冰动作识别方案| +| [VideoTag](./VideoTag) | 3000类大规模视频分类方案 | +| [MultimodalVideoTag](./MultimodalVideoTag) | 多模态视频分类方案| +| [VideoQualityAssessment](.s/VideoQualityAssessment) | 视频质量评估方案| +| [PP-Care](./PP-Care) | 3DMRI医疗图像识别方案 | +| [EIVideo](./EIVideo) | 视频交互式分割工具| +| [Anti-UAV](./Anti-UAV) |无人机检测方案| +| [AbnormalActionDetection](./AbnormalActionDetection) |异常行为检测方案| +| [PP-Human](./PPHuman) | 行人分析场景动作识别方案 | diff --git a/docs/src/applications/T2VLAD/README.md b/docs/src/applications/T2VLAD/README.md new file mode 100644 index 000000000..9ca65b9f5 --- /dev/null +++ b/docs/src/applications/T2VLAD/README.md @@ -0,0 +1,75 @@ +[English](./README_en.md) | 简体中文 + +# T2VLAD: 基于局部全局对齐的文本视频检索 + +--- +## 内容 + +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型测试](#模型测试) +- [参考论文](#参考论文) + +在开始使用之前,您需要按照以下命令安装额外的依赖包: +```bash +python -m pip install paddlenlp +``` +同时确保paddle版本为2.2.2。 + +## 模型简介 + +T2VLAD是百度在CVPR2021提出的文本视频检索模型。文本视频检索是一项具有挑战的任务,旨在基于自然语言处理描述搜索相关视频内容。这个问题的关键是在联合嵌入空间中测量文本-视频的相似性。T2VLAD设计了一种有效的全局-局部对齐方法,在三个标准的文本视频检索基准上取得了一致的改进,并以明显的优势超越了最先进的技术。 + +
    + + +## 数据准备 + +MSR-VTT数据下载及准备请参考 [MSR-VTT数据准备](../../docs/zh-CN/dataset/msrvtt.md) + +## 模型训练 + +### MSR-VTT数据集训练 + +下载数据并添加到 `data/MSRVTT` 文件夹下。 + +#### 开始训练 + +- 训练启动命令如下: + +```bash +export CUDA_VISIBLE_DEVICES=0 +python3.7 train.py --config ./configs/msrvtt_transformers.json +``` + +T2VLAD在训练时使用了Ranger优化器,这里我们暂时没有支持Ranger优化器到的实现,目前可以使用AdamW优化器来完成训练。 + + +## 模型测试 + +- 对下游任务:文本-视频检索,在MSR-VTT数据集上评估性能,评估脚本启动方式如下: + +```bash +export CUDA_VISIBLE_DEVICES=0 +python3.7 test.py --config ./configs/msrvtt_transformers.json --resume ./T2VLAD_msrvtt.pdparams +``` + +MSR-VTT数据集测试精度: +Text $\rightarrow$ Video +| R@1 | R@5 | R@10 | Median R | checkpoints | +| :--: | :--: | :--: | :------: | :----------------------------------------------------------: | +| 29.5 | 59.0 | 70.1 | 4 | [T2VLAD.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/T2VLAD_msrvtt.pdparams) | + +Video $\rightarrow$ Text +| R@1 | R@5 | R@10 | Median R | +| :--: | :--: | :--: | :------: | +| 26.1 | 54.7 | 68.1 | 4 | + + +## 参考论文 + +- [T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval +](https://arxiv.org/pdf/2104.10054.pdf), Xiaohan Wang, Linchao Zhu, Yi Yang diff --git a/docs/src/applications/T2VLAD/README_en.md b/docs/src/applications/T2VLAD/README_en.md new file mode 100644 index 000000000..0c46a5884 --- /dev/null +++ b/docs/src/applications/T2VLAD/README_en.md @@ -0,0 +1,69 @@ +[简体中文](./README.md) | English + +# T2VLAD + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Reference](#Reference) + +Before getting started, you need to install additional dependencies as follows: +```bash +python -m pip install paddlenlp +``` + +## Introduction +T2VLAD is proposed by Baidu in CVPR2021 for text-video retrieval. Text-video retrieval is a challenging task that aims to search relevant video contents based on natural language descriptions. The key to this problem is to measure text- video similarities in a joint embedding space. T2VLAD designs an efficient global-local alignment method. This model achieves consistent improvements on three standard text-video retrieval benchmarks and outperform the state- of-the-art by a clear margin. + +
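+Once texts and videos are embedded in the joint space, retrieval reduces to ranking a similarity matrix, and R@K / Median R follow from the rank of each ground-truth pair. The sketch below is illustrative only (plain NumPy, random inputs, hypothetical names such as `retrieval_metrics`); it is not the evaluation code shipped in this repository.
+
+```python
+import numpy as np
+
+def retrieval_metrics(text_emb, video_emb, ks=(1, 5, 10)):
+    """Text-to-video metrics for paired embeddings (row i of each matrix match)."""
+    # Cosine similarity: one row per text query, one column per video.
+    t = text_emb / np.linalg.norm(text_emb, axis=1, keepdims=True)
+    v = video_emb / np.linalg.norm(video_emb, axis=1, keepdims=True)
+    sims = t @ v.T
+    # 0-based rank of the ground-truth video for every query.
+    order = np.argsort(-sims, axis=1)
+    gt_rank = np.array([int(np.where(order[i] == i)[0][0]) for i in range(len(order))])
+    metrics = {f"R@{k}": 100.0 * float(np.mean(gt_rank < k)) for k in ks}
+    metrics["Median R"] = float(np.median(gt_rank) + 1)
+    return metrics
+
+# Shapes only; real features come from the trained T2VLAD encoders.
+print(retrieval_metrics(np.random.rand(8, 256), np.random.rand(8, 256)))
+```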
    + + +## Data +Please refer to MSR-VTT data download and preparation doc [MSR-VTT data](../../docs/en/dataset/msrvtt.md) + +## Train +### Train on MSR-VTT +Download data then move to `data/MSRVTT` folder. + +#### Start training + +- Train T2VLAD on MSRVTT scripts: + +```bash +export CUDA_VISIBLE_DEVICES=0 +python3.7 train.py --config ./configs/msrvtt_transformers.json +``` + +T2VLAD uses the Ranger optimizer during training. We haven't supported the implementation of Ranger optimizer, for now, the AdamW optimizer can be used to complete the training. + + +## Test + +- Evaluation performs on downstream task, i.e. text-video clip retrieval on MSR-VTT dataset, test accuracy can be obtained using scripts: + +```bash +export CUDA_VISIBLE_DEVICES=0 +python3.7 test.py --config ./configs/msrvtt_transformers.json --resume ./T2VLAD_msrvtt.pdparams +``` + +Accuracy on MSR-VTT: +Text $\rightarrow$ Video +| R@1 | R@5 | R@10 | Median R | checkpoints | +| :--: | :--: | :--: | :------: | :----------------------------------------------------------: | +| 29.5 | 59.0 | 70.1 | 4 | [T2VLAD.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/T2VLAD_msrvtt.pdparams) | + +Video $\rightarrow$ Text +| R@1 | R@5 | R@10 | Median R | +| :--: | :--: | :--: | :------: | +| 26.1 | 54.7 | 68.1 | 4 | + +## Reference + +- [T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval +](https://arxiv.org/pdf/2104.10054.pdf), Xiaohan Wang, Linchao Zhu, Yi Yang diff --git a/docs/src/applications/T2VLAD/base/__init__.py b/docs/src/applications/T2VLAD/base/__init__.py new file mode 100644 index 000000000..d9d437ad7 --- /dev/null +++ b/docs/src/applications/T2VLAD/base/__init__.py @@ -0,0 +1,2 @@ +from .base_model import * +from .base_trainer import * diff --git a/docs/src/applications/T2VLAD/base/base_dataset.py b/docs/src/applications/T2VLAD/base/base_dataset.py new file mode 100644 index 000000000..877b7364e --- /dev/null +++ b/docs/src/applications/T2VLAD/base/base_dataset.py @@ -0,0 +1,562 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import time +import json +import random +import paddle +import inspect +import logging +import functools +import data_loader + +import numpy as np +import pickle as pkl + +from pathlib import Path +from abc import abstractmethod +from typing import Dict, Union +from numpy.random import randint +from typeguard import typechecked +from collections import OrderedDict +from zsvision.zs_utils import memcache +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"{e}, [paddlenlp] package and it's dependencies is required for T2VLAD." 
+ ) +from utils import ensure_tensor, expert_tensor_storage + +# For SLURM usage, buffering makes it difficult to see events as they happen, so we set +# the global print statement to enforce flushing +print = functools.partial(print, flush=True) + + +class BaseDataset(paddle.io.Dataset): + @staticmethod + @abstractmethod + @typechecked + def dataset_paths() -> Dict[str, Union[Path, str]]: + """Generates a datastructure containing all the paths required to load features + """ + raise NotImplementedError + + @abstractmethod + def sanity_checks(self): + """Run sanity checks on loaded data + """ + raise NotImplementedError + + @abstractmethod + def load_features(self): + """Load features from disk + """ + raise NotImplementedError + + @typechecked + def __init__( + self, + data_dir: Path, + eval_only: bool, + use_zeros_for_missing: bool, + text_agg: str, + text_feat: str, + split_name: str, + cls_partition: str, + root_feat_folder: str, + text_dim: int, + num_test_captions: int, + restrict_train_captions: int, + max_tokens: Dict[str, int], + logger: logging.Logger, + raw_input_dims: Dict[str, int], + feat_aggregation: Dict[str, Dict], + ): + self.eval_only = eval_only + self.logger = logger + self.text_feat = text_feat + self.data_dir = data_dir + self.text_dim = text_dim + self.restrict_train_captions = restrict_train_captions + self.max_tokens = max_tokens + self.cls_partition = cls_partition + self.num_test_captions = num_test_captions + self.feat_aggregation = feat_aggregation + self.root_feat = data_dir / root_feat_folder + self.experts = set(raw_input_dims.keys()) + self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + # This attributes can be overloaded by different datasets, so it must be set + # before the `load_features() method call` + self.restrict_test_captions = None + self.text_features = None + self.label_features = None + self.video_labels = None + self.raw_captions = None + self.features = None + + self.word2int = json.load(open('word2int.json')) + + # Use a single caption per video when forming training minibatches (different + # captions from the same video may still be used across different minibatches) + self.captions_per_video = 1 + + self.ordered_experts = list(raw_input_dims.keys()) + + # Training and test lists are set by dataset-specific subclasses + self.partition_lists = {} + self.configure_train_test_splits(split_name=split_name) + + # All retrieval-based tasks use a single dataloader (and handle the retrieval + # data separately), whereas for classification we use one dataloader for + # training and one for validation. 
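+        # Note: training minibatches are drawn from the "train" list through
+        # __getitem__ / collate_data, while the whole "val" list is pre-assembled
+        # below into fixed-size retrieval tensors (one slot per video, with up to
+        # num_test_captions tokenized query sentences each).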
+ self.logger.info("The current task is retrieval") + self.sample_list = self.partition_lists["train"] + self.num_samples = len(self.sample_list) + num_val = len(self.partition_lists["val"]) + + self.raw_input_dims = raw_input_dims + + # we store default paths to enable visualisations (this can be overloaded by + # dataset-specific classes) + self.video_path_retrieval = [ + f"videos/{x}.mp4" for x in self.partition_lists["val"] + ] + + # NOTE: We use nans rather than zeros to indicate missing faces, unless we wish + # to test single modality strength, which requires passing zeroed features for + # missing videos + if use_zeros_for_missing: + self.MISSING_VAL = 0 + else: + self.MISSING_VAL = np.nan + + # load the dataset-specific features into memory + self.load_features() + + if text_agg == "avg": + self.logger.info("averaging the text features...") + for key, val in self.text_features.items(): + self.text_features[key] = [ + np.mean(x, 0, keepdims=1) for x in val + ] + self.logger.info("finished averaging the text features") + + self.trn_config = {} + self.raw_config = {} + self.tensor_storage = expert_tensor_storage(self.experts, + self.feat_aggregation) + for static_expert in self.tensor_storage["fixed"]: + if static_expert in self.feat_aggregation: + if "trn_seg" in self.feat_aggregation[static_expert].keys(): + self.trn_config[static_expert] = \ + self.feat_aggregation[static_expert]["trn_seg"] + if "raw" in self.feat_aggregation[static_expert]["temporal"]: + self.raw_config[static_expert] = 1 + + retrieval = { + expert: np.zeros( + (num_val, self.max_tokens[expert], raw_input_dims[expert])) + for expert in self.tensor_storage["variable"] + } + retrieval.update({ + expert: np.zeros((num_val, raw_input_dims[expert])) + for expert in self.tensor_storage["fixed"] + }) + self.retrieval = retrieval + self.test_ind = { + expert: paddle.ones([num_val]) + for expert in self.experts + } + self.raw_captions_retrieval = [None] * num_val + + # avoid evaluation on missing queries + self.query_masks = np.zeros((num_val, num_test_captions)) + self.text_token_mask = np.zeros((num_val, num_test_captions)) + self.text_retrieval = np.zeros((num_val, self.num_test_captions, + self.max_tokens["text"], self.text_dim)) + self.cap_retrieval = paddle.zeros( + [num_val, self.num_test_captions, self.max_tokens["text"]], + dtype='int64' + ) #self.cap_retrieval = th.zeros((num_val, self.num_test_captions, self.max_tokens["text"])) + self.att_retrieval = paddle.zeros( + [num_val, self.num_test_captions, self.max_tokens["text"]], + dtype='int64' + ) #self.att_retrieval = th.zeros((num_val, self.num_test_captions, self.max_tokens["text"])) + + save_cap = [] + for ii, video_name in enumerate(self.partition_lists["val"]): + + self.raw_captions_retrieval[ii] = self.raw_captions[video_name] + for expert in self.tensor_storage["fixed"].intersection( + self.experts): + feats = self.features[expert][video_name] + drop = self.has_missing_values(feats) + self.test_ind[expert][ii] = not drop + self.retrieval[expert][ii] = feats + if drop: + self.retrieval[expert][ii][:] = self.MISSING_VAL + if self.feat_aggregation[expert].get("binarise", False): + keep = np.logical_not( + np.isnan(self.retrieval[expert][:, 0, 0])) + marker = np.ones_like(self.retrieval[expert][keep]) + self.retrieval[expert][keep] = marker + + for expert in self.tensor_storage["variable"].intersection( + self.experts): + feats = self.features[expert][video_name] + drop = self.has_missing_values(feats) + self.test_ind[expert][ii] = not drop + if drop: + 
self.retrieval[expert][ii][:] = self.MISSING_VAL + if self.feat_aggregation[expert].get("binarise", False): + keep = np.logical_not( + np.isnan(self.retrieval[expert][:, 0, 0])) + marker = np.ones_like(self.retrieval[expert][keep]) + self.retrieval[expert][keep] = marker + if self.test_ind[expert][ii]: + keep = min(self.max_tokens[expert], len(feats)) + self.retrieval[expert][ii, :keep, :] = feats[:keep] + + candidates_sentences = self.text_features[video_name] + if self.restrict_test_captions is not None: + keep_sent_idx = self.restrict_test_captions[video_name] + candidates_sentences = [candidates_sentences[keep_sent_idx]] + + self.query_masks[ii, :len(candidates_sentences)] = 1 + + for test_caption_idx in range(self.num_test_captions): + if len(candidates_sentences) <= test_caption_idx: + break + keep = min(len(candidates_sentences[test_caption_idx]), + self.max_tokens["text"]) + self.text_token_mask[ii, test_caption_idx] = keep + sent = self.raw_captions_retrieval[ii][test_caption_idx] + sent = " ".join(sent) + sent = sent.strip() + encoded_dict = self.tokenizer.__call__( + sent, + max_seq_len=self.max_tokens["text"], + pad_to_max_seq_len=True, + return_attention_mask=True, + truncation_strategy='longest_first') + cap_ids = paddle.to_tensor(encoded_dict['input_ids']) + attention_mask = paddle.to_tensor( + encoded_dict['attention_mask']) + save_cap.append(sent) + self.cap_retrieval[ii, test_caption_idx, :] = cap_ids + self.att_retrieval[ii, test_caption_idx, :] = attention_mask + if ii % 500 == 0 and test_caption_idx == 0: + msg = ( + f"{ii}/{len(self.partition_lists['val'])} will evaluate " + f"sentence {test_caption_idx} out of " + f"{len(candidates_sentences)} (has {keep} words) " + f"{video_name}") + self.logger.info(msg) + text_feats = candidates_sentences[test_caption_idx][:keep] + if text_feats.shape[0] == 0: + text_feats = 0 + raise ValueError("empty text features!") + self.text_retrieval[ii, test_caption_idx, :keep, :] = text_feats + with open('run_cap.pkl', 'wb') as f: + pkl.dump(save_cap, f) + self.sanity_checks() + + def configure_train_test_splits(self, split_name): + """Partition the datset into train/val/test splits. 
+ + Args: + split_name (str): the name of the split + """ + self.paths = type(self).dataset_paths() + print("loading training/val splits....") + tic = time.time() + for subset, path in self.paths["subset_list_paths"][split_name].items(): + root_feat = Path(self.root_feat) + subset_list_path = root_feat / path + if subset == "train" and self.eval_only: + rows = [] + else: + with open(subset_list_path) as f: + rows = f.read().splitlines() + self.partition_lists[subset] = rows + print("done in {:.3f}s".format(time.time() - tic)) + self.split_name = split_name + + def collate_data(self, data): + batch_size = len(data) + tensors = {} + for expert in self.tensor_storage["fixed"]: + if expert in self.trn_config.keys(): + tensors[expert] = paddle.to_tensor( + np.zeros((batch_size, self.trn_config[expert], + self.raw_input_dims[expert]))) + else: + tensors[expert] = paddle.to_tensor( + np.zeros((batch_size, self.raw_input_dims[expert]))) + + # Track which indices of each modality are available in the present batch + ind = { + expert: paddle.to_tensor(np.zeros(batch_size)) + for expert in self.experts + } + tensors.update({ + expert: paddle.to_tensor( + np.zeros((batch_size, self.max_tokens[expert], + self.raw_input_dims[expert]))) + for expert in self.tensor_storage["variable"] + }) + + text_tensor = paddle.to_tensor( + np.zeros((batch_size, self.captions_per_video, + self.max_tokens["text"], self.text_dim))) + text_token_mask = paddle.to_tensor( + np.zeros((batch_size, self.captions_per_video))) + text_cap_id = paddle.zeros([batch_size, self.max_tokens["text"]], + dtype='int64') + text_att_mask = paddle.zeros([batch_size, self.max_tokens["text"]], + dtype='int64') + + for ii, _ in enumerate(data): + datum = data[ii] + for expert in self.experts: + ind[expert][ii] = datum[f"{expert}_ind"] + for expert in self.tensor_storage["fixed"]: + tensors[expert][ii] = datum[expert] + for expert in self.tensor_storage["variable"]: + if ind[expert][ii]: + keep = min(len(datum[expert]), self.max_tokens[expert]) + if keep: + tensors[expert][ii, :keep, :] = datum[expert][:keep] + else: + tensors[expert][ii, :, :] = self.MISSING_VAL + + text = datum["text"] + cap_id = datum["cap_id"] + att_mask = datum["att_mask"] + text_cap_id[ii, :] = paddle.to_tensor(cap_id) + text_att_mask[ii, :] = paddle.to_tensor(att_mask) + for jj in range(self.captions_per_video): + keep = min(len(text[jj]), self.max_tokens["text"]) + text_tensor[ii, jj, :keep, :] = text[jj][:keep] + text_token_mask[ii, jj] = keep + + ind = {key: ensure_tensor(val) for key, val in ind.items()} + experts = OrderedDict( + (expert, paddle.to_tensor(tensors[expert], dtype='float32')) + for expert in self.ordered_experts) + + for expert in self.experts: + if self.feat_aggregation[expert].get("binarise", False): + replace = np.logical_not(paddle.isnan(experts[expert][:, 0, 0])) + experts[expert][replace] = paddle.ones_like( + experts[expert][replace]) + + minibatch = {"experts": experts, "ind": ind} + minibatch["text"] = paddle.to_tensor(text_tensor, dtype='float32') + minibatch["cap_id"] = paddle.to_tensor(text_cap_id, dtype='int64') + minibatch["att_mask"] = paddle.to_tensor(text_att_mask, dtype='int64') + minibatch["text_token_mask"] = paddle.to_tensor(text_token_mask) + return minibatch + + def process_sent(self, sent, max_words, EOS: int = 1, UNK: int = 2): + # set EOS=1, UNK=2 by default, consistent with file 'word2int.json'. 
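+        # Hypothetical example: with max_words=6 and sent=["a", "red", "car"],
+        # any word missing from word2int maps to UNK (2), and the result is
+        # padded with EOS (1) to a fixed length, e.g. tokens=[7, 2, 45, 1, 1, 1]
+        # and tokens_len=3.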
+ tokens = [self.word2int.get(w, UNK) for w in sent] + tokens = tokens[:max_words] + tokens_len = len(tokens) + tokens = np.array(tokens + [EOS] * (max_words - tokens_len)) + return tokens, tokens_len + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + if idx < self.num_samples: + vid = self.sample_list[idx] + features = {} + for expert in self.experts: + if expert not in self.trn_config.keys(): + if expert in self.raw_config.keys(): + features[expert] = np.mean(self.features[expert][vid], + axis=0) + else: + features[expert] = self.features[expert][vid] + else: + raw_frame_feats = self.features[expert][vid] + new_length = 1 + num_frames = raw_frame_feats.shape[0] + avg_duration = ((num_frames - new_length + 1) // + self.trn_config[expert]) + assert avg_duration > 0, "average duration must be positive" + if avg_duration > 0: + # maybe we could change to use average for each tiny segment + # seems like use everything per iter + offsets = np.multiply( + list(range(self.trn_config[expert])), avg_duration) + offsets += randint(avg_duration, + size=self.trn_config[expert]) + new_frame_feats = np.zeros( + (self.trn_config[expert], raw_frame_feats.shape[1])) + for idx, xx in enumerate(offsets): + new_frame_feats[idx, :] = raw_frame_feats[xx, :] + msg = "returning a wrong feature != segment num" + assert new_frame_feats.shape[0] == self.trn_config[ + expert], msg + features[expert] = new_frame_feats + + ind = {} + for expert in self.ordered_experts: + if expert in self.tensor_storage["flaky"]: + ind[expert] = not self.has_missing_values(features[expert]) + else: + ind[expert] = 1 + + # Handle some inconsistencies between how the text features are stored + text = self.text_features[vid] + if isinstance(text, list): + pick = np.random.choice(len(text), size=self.captions_per_video) + sent = self.raw_captions[vid][pick[0]] + sent = " ".join(sent) + sent = sent.strip() + + text = np.array(text)[pick] + encoded_dict = self.tokenizer.__call__( + sent, + max_seq_len=self.max_tokens["text"], + pad_to_max_seq_len=True, + return_attention_mask=True, + truncation_strategy='longest_first') + cap_id = encoded_dict['input_ids'] + token_type_ids = encoded_dict['token_type_ids'] + attention_mask = encoded_dict['attention_mask'] + else: + pick = None + text = np.random.choice(text, size=self.captions_per_video) + + # Return both the missing indices as well as the tensors + sample = {"text": text} + sample.update({"cap_id": cap_id}) + sample.update({"att_mask": attention_mask}) + sample.update({f"{key}_ind": val for key, val in ind.items()}) + sample.update(features) + return sample + + def get_retrieval_data(self): + experts = OrderedDict( + (expert, paddle.to_tensor(self.retrieval[expert], dtype='float32')) + for expert in self.ordered_experts) + retrieval_data = { + "text": + paddle.to_tensor(ensure_tensor(self.text_retrieval), + dtype='float32'), + "experts": + experts, + "cap_id": + paddle.to_tensor(self.cap_retrieval, dtype='int64'), + "att_mask": + paddle.to_tensor(self.att_retrieval, dtype='int64'), + "ind": + self.test_ind, + "text_token_mask": + paddle.to_tensor(self.text_token_mask) + } + meta = { + "query_masks": self.query_masks, + "raw_captions": self.raw_captions_retrieval, + "paths": self.video_path_retrieval, + } + return retrieval_data, meta + + def has_missing_values(self, x): + return isinstance(x, float) and np.isnan(x) + + def visual_feat_paths(self, model_spec, tag=None): + """Canonical path lookup for visual features + """ + if model_spec not in 
self.ordered_experts: + self.logger.info( + f"Skipping load for {model_spec} (feature not requested)") + return f"SKIPPED-{model_spec}" + + feat_type, model_name, _ = model_spec.split(".") + aggs = self.feat_aggregation[model_spec] + base = f"aggregated_{feat_type.replace('-', '_')}" + required = ("fps", "pixel_dim", "stride") + fps, pixel_dim, stride = [aggs.get(x, None) for x in required] + if feat_type in {"facecrops", "faceboxes"}: + base = f"{base}_{fps}fps_{pixel_dim}px_stride{stride}" + elif feat_type not in {"ocr", "speech", "audio"}: + base = f"{base}_{fps}fps_{pixel_dim}px_stride{stride}" + + for option in "offset", "inner_stride": + if aggs.get(option, None) is not None: + base += f"_{option}{aggs[option]}" + + feat_paths = [] + for agg in aggs["temporal"].split("-"): + fname = f"{model_name}-{agg}" + if aggs["type"] == "logits": + fname = f"{fname}-logits" + if tag is not None: + fname += f"-{tag}" + feat_paths.append(Path(base) / f"{fname}.pickle") + return feat_paths + + def log_assert(self, bool_, msg="", verbose=True): + """Use assertions that will be written to the logs. This is a recipe from: + http://code.activestate.com/recipes/577074-logging-asserts/ + """ + try: + assert bool_, msg + except AssertionError: + # construct an exception message from the code of the calling frame + last_stackframe = inspect.stack()[-2] + source_file, line_no, func = last_stackframe[1:4] + source = f"Traceback (most recent call last):\n" + \ + f" File {source_file}, line {line_no}, in {func}\n" + if verbose: + # include more lines than that where the statement was made + source_code = open(source_file).readlines() + source += "".join(source_code[line_no - 3:line_no + 1]) + else: + source += last_stackframe[-2][0].strip() + self.logger.debug(f"{msg}\n{source}") + raise AssertionError(f"{msg}\n{source}") + + def summary_stats(self): + """Report basic statistics about feature availability and variable lengths + across the different subsets of the data. + """ + self.logger.info("Computing feature stats...") + queries = self.ordered_experts + ["text"] + for subset, keep in self.partition_lists.items(): + keep = set(keep) + print(f"Summary for {subset}") + for expert in queries: + if expert in self.features: + feats = self.features[expert] + else: + feats = self.text_features + vals = [feats[key] for key in keep] + missing = 0 + sizes = [] + for val in vals: + if self.has_missing_values(val): + missing += 1 + else: + sizes.append(len(val)) + if sizes: + stat_str = (f"min: {np.min(sizes):4}, " + f"max: {np.max(sizes):4}, " + f"mean: {np.mean(sizes):.1f}") + print( + f"{subset}: missing: {missing:4}, {stat_str} {expert}") diff --git a/docs/src/applications/T2VLAD/base/base_model.py b/docs/src/applications/T2VLAD/base/base_model.py new file mode 100644 index 000000000..29af41c76 --- /dev/null +++ b/docs/src/applications/T2VLAD/base/base_model.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
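As a concrete illustration of the directory naming assembled by `visual_feat_paths` above, here is a small self-contained sketch; the `model_spec` and aggregation settings are hypothetical and not taken from the repo's configs.

```python
# Sketch only: hypothetical spec/settings, mirroring the string formatting in
# visual_feat_paths for a non-face, non-ocr/speech/audio expert without
# offset/inner_stride options.
from pathlib import Path

model_spec = "imagenet.senet154.0"                       # assumed spec name
aggs = {"fps": 25, "pixel_dim": 256, "stride": 1,
        "temporal": "avg-max", "type": "embed"}          # assumed aggregation settings

feat_type, model_name, _ = model_spec.split(".")
base = (f"aggregated_{feat_type.replace('-', '_')}"
        f"_{aggs['fps']}fps_{aggs['pixel_dim']}px_stride{aggs['stride']}")
feat_paths = [Path(base) / f"{model_name}-{agg}.pickle"
              for agg in aggs["temporal"].split("-")]
# -> aggregated_imagenet_25fps_256px_stride1/senet154-avg.pickle
#    aggregated_imagenet_25fps_256px_stride1/senet154-max.pickle
```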
+ +import numpy as np +import paddle.nn as nn + +from abc import abstractmethod + +class BaseModel(nn.Layer): + """ + Base class for all models + """ + @abstractmethod + def forward(self, *inputs): + """ + Forward pass logic + + :return: Model output + """ + raise NotImplementedError + + def __str__(self): + """ + Model prints with number of trainable parameters + """ + model_parameters = filter(lambda p: p.stop_gradient==False, self.parameters()) + params = sum([np.prod(p.shape) for p in model_parameters]) + return super().__str__() + f"\nTrainable parameters: {params}" diff --git a/docs/src/applications/T2VLAD/base/base_trainer.py b/docs/src/applications/T2VLAD/base/base_trainer.py new file mode 100644 index 000000000..31ef04a55 --- /dev/null +++ b/docs/src/applications/T2VLAD/base/base_trainer.py @@ -0,0 +1,258 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import re +import copy +import time +import paddle +import pickle +import numpy as np +from pathlib import Path +from abc import abstractmethod + +class BaseTrainer: + """ Base class for all trainers + """ + def __init__(self, model, loss, metrics, optimizer, config, mini_train, + num_keep_ckpts, skip_tboard): + self.config = config + self.logger = config.get_logger( + 'trainer', config['trainer']['verbosity']) + + self.model = model + self.loss = loss + self.metrics = metrics + self.optimizer = optimizer + self.num_keep_ckpts = num_keep_ckpts + self.skip_tboard = skip_tboard or mini_train + + # This property can be overriden in the subclass + self.skip_first_n_saves = 0 + + cfg_trainer = config['trainer'] + self.epochs = cfg_trainer['epochs'] + self.save_period = cfg_trainer['save_period'] + self.monitor = cfg_trainer.get('monitor', 'off') + self.save_only_best = cfg_trainer.get("save_only_best", True) + self.val_freq = cfg_trainer['val_freq'] + # configuration to monitor model performance and save best + if self.monitor == 'off': + self.mnt_mode = 'off' + self.mnt_best = 0 + else: + self.mnt_mode, self.mnt_metric = self.monitor.split() + assert self.mnt_mode in ['min', 'max'] + + self.mnt_best = np.inf if self.mnt_mode == 'min' else -np.inf + self.early_stop = cfg_trainer.get('early_stop', np.inf) + + self.start_epoch = 1 + + self.model_dir = config.save_dir + + self.include_optim_in_save_model = config["trainer"].get("include_optim_in_save_model", 1) + if config.resume is not None: + self._resume_model(config.resume) + + @abstractmethod + def _train_epoch(self, epoch): + """Training logic for an epoch + + :param epoch: Current epoch number + """ + raise NotImplementedError + + def train(self): + """Full training logic. Responsible for iterating over epochs, early stopping, + modeling and logging metrics. 
+ """ + for epoch in range(self.start_epoch, self.epochs + 1): + result, cached_preds = self._train_epoch(epoch) + + if epoch % self.val_freq != 0: + continue + # save logged informations into log dict + log = {'epoch': epoch} + for key, value in result.items(): + if key == 'metrics': + log.update({mtr.__name__: value[i] + for i, mtr in enumerate(self.metrics)}) + elif key == 'val_metrics': + log.update({'val_' + mtr.__name__: value[i] + for i, mtr in enumerate(self.metrics)}) + elif key == 'nested_val_metrics': + # NOTE: currently only supports two layers of nesting + for subkey, subval in value.items(): + for subsubkey, subsubval in subval.items(): + log[f"val_{subkey}_{subsubkey}"] = subsubval + else: + log[key] = value + + # print logged informations to the screen + for key, value in log.items(): + self.logger.info(' {:15s}: {}'.format(str(key), value)) + + # eval model according to configured metric, save best # ckpt as trained_model + not_improved_count = 0 + best = False + if self.mnt_mode != 'off': + try: + # check whether specified metric improved or not, according to + # specified metric(mnt_metric) + lower = log[self.mnt_metric] <= self.mnt_best + higher = log[self.mnt_metric] >= self.mnt_best + improved = (self.mnt_mode == 'min' and lower) or \ + (self.mnt_mode == 'max' and higher) + except KeyError: + msg = "Warning: Metric '{}' not found, perf monitoring is disabled." + self.logger.warning(msg.format(self.mnt_metric)) + self.mnt_mode = 'off' + improved = False + not_improved_count = 0 + raise ValueError("Pick a metric that will save models!!!!!!!!") + + if improved: + self.mnt_best = log[self.mnt_metric] + # TODO(Samuel): refactor the code so that we don't move the model + # off the GPU or duplicate on the GPU (we should be able to safely + # copy the state dict directly to CPU) + copy_model = copy.deepcopy(self.model) + self.best_model = {"epoch": epoch, "model": copy_model} + not_improved_count = 0 + best = True + else: + not_improved_count += 1 + + if not_improved_count > self.early_stop: + self.logger.info("Val performance didn\'t improve for {} epochs. 
" + "Training stops.".format(self.early_stop)) + break + + if self.save_only_best: + if epoch == self.epochs: + best_model = self.best_model + self.model = best_model["model"] + print(f"saving the best model to disk (epoch {epoch})") + self._save_model(best_model["epoch"], save_best=True) + continue + + # If modeling is done intermittently, still save models that outperform + # the best metric + # save_best = best and not self.mnt_metric == "epoch" + save_best = True + + # Due to the fast runtime/slow HDD combination, modeling can dominate + # the total training time, so we optionally skip models for some of + # the first epochs + if epoch < self.skip_first_n_saves and not self.save_only_best: + msg = f"Skipping model save at epoch {epoch} <= {self.skip_first_n_saves}" + self.logger.info(msg) + continue + + if epoch % self.save_period == 0 and save_best: + self._save_model(epoch, save_best=best) + print("This epoch, the save best :{}".format(best)) + if best: + for key, cached in cached_preds.items(): + log_dir = Path(self.config.log_dir) + prediction_path = log_dir / f"{key}_preds.txt" + prediction_logits_path = log_dir / f"{key}_preds_logits.npy" + np.save(prediction_logits_path, cached["preds"]) + gt_logits_path = log_dir / f"{key}_gt_logits.npy" + np.save(gt_logits_path, cached["labels"].cpu().numpy()) + vid_names = [] + sort_predict = np.argsort(cached["preds"])[:, ::-1] + with open(str(prediction_path), 'w') as f: + for kk in range(cached["preds"].shape[0]): + pred_classes = [str(v) for v in sort_predict[kk, :]] + vid_name = cached["vid_name"][kk] + if key == "test": + vid_name = vid_name[kk].split('/')[-1] + '.mp4' + row = f"{vid_name} {' '.join(pred_classes)}" + print(row, file=f) + vid_names.append(vid_name) + save_name_path = log_dir / f"{key}_vid_name.pkl" + with open(save_name_path, 'wb') as f: + pickle.dump(vid_names, f) + self.logger.info(f"All {key} preds saved") + self.logger.info(f"Wrote result to: {str(prediction_path)}") + + if epoch > self.num_keep_ckpts: + self.purge_stale_models() + + def purge_stale_models(self): + """Remove models that are no longer neededself. 
+ + NOTE: This function assumes that the `best` model has already been renamed + to have a format that differs from `model-epoch.pth` + """ + all_ckpts = list(self.model_dir.glob("*.pdparams")) + found_epoch_ckpts = list(self.model_dir.glob("model-epoch*.pdparams")) + if len(all_ckpts) <= self.num_keep_ckpts: + return + + msg = "Expected at the best model to have been renamed to a different format" + if not len(all_ckpts) > len(found_epoch_ckpts): + print("Warning, purging model, but the best epoch was not saved!") + # assert len(all_ckpts) > len(found_epoch_ckpts), msg + + # purge the oldest models + regex = r".*model-epoch(\d+)[.pdparams$" + epochs = [int(re.search(regex, str(x)).groups()[0]) for x in found_epoch_ckpts] + sorted_ckpts = sorted(list(zip(epochs, found_epoch_ckpts)), key=lambda x: -x[0]) + + for epoch, stale_ckpt in sorted_ckpts[self.num_keep_ckpts:]: + tic = time.time() + stale_ckpt.unlink() + msg = f"removing stale model [epoch {epoch}] [took {time.time() - tic:.2f}s]" + self.logger.info(msg) + + def _save_model(self, epoch, save_best=False): + """Saving models + + :param epoch: current epoch number + :param log: logging information of the epoch + :param save_best: if True, rename the saved model to 'trained_model.pdparams' + """ + arch = type(self.model).__name__ + state = { + 'arch': arch, + 'epoch': epoch, + 'state_dict': self.model.state_dict(), + 'monitor_best': self.mnt_best, + 'config': self.config + } + if self.include_optim_in_save_model: + state["optimizer"] = self.optimizer.state_dict() + + filename = str(self.model_dir / + 'model-epoch{}.pdparams'.format(epoch)) + tic = time.time() + self.logger.info("Saving model: {} ...".format(filename)) + paddle.save(state, filename) + self.logger.info(f"Done in {time.time() - tic:.3f}s") + if save_best: + self.logger.info("Updating 'best' model: {} ...".format(filename)) + best_path = str(self.model_dir / 'trained_model.pdparams') + paddle.save(state, best_path) + self.logger.info(f"Done in {time.time() - tic:.3f}s") + + def _resume_model(self, resume_path): + """ Resume from saved models + + :param resume_path: model path to be resumed + """ + resume_path = str(resume_path) + self.logger.info("Loading model: {} ...".format(resume_path)) + model = paddle.load(resume_path) + self.model.load_dict(model) + self.logger.info(f"model loaded. Resume training from epoch {self.start_epoch}") diff --git a/docs/src/applications/T2VLAD/data/download_features.sh b/docs/src/applications/T2VLAD/data/download_features.sh new file mode 100644 index 000000000..3d325fd48 --- /dev/null +++ b/docs/src/applications/T2VLAD/data/download_features.sh @@ -0,0 +1,9 @@ +mkdir MSRVTT +cd MSRVTT +wget https://videotag.bj.bcebos.com/Data/MSRVTT/aggregated_text_feats.tar +wget https://videotag.bj.bcebos.com/Data/MSRVTT/mmt_feats.tar +wget https://videotag.bj.bcebos.com/Data/MSRVTT/raw-captions.pkl +wget https://videotag.bj.bcebos.com/Data/MSRVTT/train_list_jsfusion.txt +wget https://videotag.bj.bcebos.com/Data/MSRVTT/val_list_jsfusion.txt +tar -xvf aggregated_text_feats.tar +tar -xvf mmt_feats.tar diff --git a/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py b/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py new file mode 100644 index 000000000..fe41d3ea6 --- /dev/null +++ b/docs/src/applications/T2VLAD/data_loader/MSRVTT_dataset.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
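Before the MSRVTT-specific loader, a minimal sketch of what `_save_model` above writes to disk and how such a checkpoint could be read back; the path below is hypothetical, while the keys mirror the `state` dict assembled in `_save_model`.

```python
# Minimal sketch: hypothetical checkpoint path; keys follow the `state` dict
# built in BaseTrainer._save_model above.
import paddle

ckpt = paddle.load("saved/models/example_run/trained_model.pdparams")
print(ckpt["arch"], ckpt["epoch"], ckpt["monitor_best"])
weights = ckpt["state_dict"]          # per-layer parameters from model.state_dict()
optim_state = ckpt.get("optimizer")   # only present when include_optim_in_save_model is set
```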
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import copy + +from pathlib import Path +from utils import memory_summary +from typeguard import typechecked +from typing import Dict, Union, List +from base.base_dataset import BaseDataset +from zsvision.zs_utils import memcache, concat_features + +class MSRVTT(BaseDataset): + @staticmethod + @typechecked + def dataset_paths() -> Dict[str, Union[str, List[str], Path, Dict]]: + subset_paths = {} + split_name = "jsfusion" + train_list_path = "train_list_jsfusion.txt" + test_list_path = "val_list_jsfusion.txt" + # NOTE: The JSFusion split (referred to as 1k-A in the paper) uses all + # videos, but randomly samples a single caption per video from the test + # set for evaluation. To reproduce this evaluation, we use the indices + # of the test captions, and restrict to this subset during eval. + js_test_cap_idx_path = "jsfusion_val_caption_idx.pkl" + subset_paths[split_name] = {"train": train_list_path, "val": test_list_path} + custom_paths = { + "features_audio": ["mmt_feats/features.audio.pkl"], + "features_flow": ["mmt_feats/features.flow_agg.pkl"], + "features_rgb": ["mmt_feats/features.rgb_agg.pkl"], + "features_scene": ["mmt_feats/features.scene.pkl"], + "features_face": ["mmt_feats/features.face_agg.pkl"], + "features_ocr": ["mmt_feats/features.ocr.pkl"], + "features_s3d": ["mmt_feats/features.s3d.pkl"], + "features_speech": ["mmt_feats/features.speech.pkl"], + } + text_feat_paths = { + "openai": "w2v_MSRVTT_openAIGPT.pickle", + } + text_feat_paths = {key: Path("aggregated_text_feats") / fname + for key, fname in text_feat_paths.items()} + feature_info = { + "custom_paths": custom_paths, + "subset_list_paths": subset_paths, + "text_feat_paths": text_feat_paths, + "raw_captions_path": "raw-captions.pkl", + "js_test_cap_idx_path": js_test_cap_idx_path, + } + return feature_info + + def load_features(self): + root_feat = Path(self.root_feat) + feat_names = {} + custom_path_key = "custom_paths" + feat_names.update(self.paths[custom_path_key]) + features = {} + for expert, rel_names in feat_names.items(): + if expert not in self.ordered_experts: + continue + feat_paths = tuple([root_feat / rel_name for rel_name in rel_names]) + if len(feat_paths) == 1: + features[expert] = memcache(feat_paths[0]) + else: + # support multiple forms of feature (e.g. max and avg pooling). 
For + # now, we only support direct concatenation + msg = f"{expert}: Only direct concatenation of muliple feats is possible" + print(f"Concatenating aggregates for {expert}....") + is_concat = self.feat_aggregation[expert]["aggregate"] == "concat" + self.log_assert(is_concat, msg=msg) + axis = self.feat_aggregation[expert]["aggregate-axis"] + x = concat_features.cache_info() # pylint: disable=no-value-for-parameter + print(f"concat cache info: {x}") + features_ = concat_features(feat_paths, axis=axis) + memory_summary() + + # Make separate feature copies for each split to allow in-place filtering + features[expert] = copy.deepcopy(features_) + + self.features = features + self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) + text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] + self.text_features = memcache(text_feat_path) + + if self.restrict_train_captions: + # hash the video names to avoid O(n) lookups in long lists + train_list = set(self.partition_lists["train"]) + for key, val in self.text_features.items(): + if key not in train_list: + continue + + if not self.split_name == "full-test": + # Note that we do not perform this sanity check for the full-test + # split, because the text features in the cached dataset will + # already have been cropped to the specified + # `resstrict_train_captions` + expect = {19, 20} + msg = f"expected train text feats as lists with length {expect}" + has_expected_feats = isinstance(val, list) and len(val) in expect + self.log_assert(has_expected_feats, msg=msg) + + # restrict to the first N captions (deterministic) + self.text_features[key] = val[:self.restrict_train_captions] + self.summary_stats() + + def sanity_checks(self): + if self.num_test_captions == 20: + if len(self.partition_lists["val"]) == 2990: + missing = 6 + elif len(self.partition_lists["val"]) == 1000: + missing = 2 + elif len(self.partition_lists["val"]) == 497: + missing = 0 + else: + raise ValueError("unrecognised test set") + msg = "Expected to find two missing queries in MSRVTT for full eval" + correct_missing = self.query_masks.sum() == self.query_masks.size - missing + self.log_assert(correct_missing, msg=msg) diff --git a/docs/src/applications/T2VLAD/data_loader/data_loaders.py b/docs/src/applications/T2VLAD/data_loader/data_loaders.py new file mode 100644 index 000000000..fe64e6192 --- /dev/null +++ b/docs/src/applications/T2VLAD/data_loader/data_loaders.py @@ -0,0 +1,145 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
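The relative paths returned by `dataset_paths()` above are expected to live under the feature root populated by `data/download_features.sh`. A rough, non-exhaustive sketch of that layout and of querying the path dictionary:

```python
# Layout inferred from dataset_paths() and download_features.sh (not exhaustive):
#
#   MSRVTT/
#     train_list_jsfusion.txt
#     val_list_jsfusion.txt
#     raw-captions.pkl
#     aggregated_text_feats/w2v_MSRVTT_openAIGPT.pickle
#     mmt_feats/features.audio.pkl, features.flow_agg.pkl, features.rgb_agg.pkl,
#               features.scene.pkl, features.face_agg.pkl, features.ocr.pkl,
#               features.s3d.pkl, features.speech.pkl
from data_loader.MSRVTT_dataset import MSRVTT

paths = MSRVTT.dataset_paths()
print(paths["subset_list_paths"]["jsfusion"])
# {'train': 'train_list_jsfusion.txt', 'val': 'val_list_jsfusion.txt'}
```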
+ +import paddle +import logging +import functools +from pathlib import Path +from typing import Dict, List + +from typeguard import typechecked +from zsvision.zs_utils import memcache + +from data_loader.MSRVTT_dataset import MSRVTT +from utils import HashableDict, HashableOrderedDict + + + +@functools.lru_cache(maxsize=64, typed=False) +def dataset_loader( + use_zeros_for_missing: bool, + eval_only: bool, + data_dir: str, + text_agg: str, + text_feat: str, + split_name: str, + dataset_name: str, + cls_partition: str, + root_feat_folder: str, + text_dim: int, + num_test_captions: int, + restrict_train_captions: int, + logger: logging.Logger, + max_tokens: Dict[str, int], + raw_input_dims: HashableOrderedDict, + feat_aggregation: HashableDict, +): + print(f"refreshing cache for {dataset_name} data loader [{split_name}]") + kwargs = dict( + data_dir=Path(data_dir), + text_dim=text_dim, + logger=logger, + eval_only=eval_only, + text_agg=text_agg, + text_feat=text_feat, + max_tokens=max_tokens, + split_name=split_name, + cls_partition=cls_partition, + raw_input_dims=raw_input_dims, + root_feat_folder=root_feat_folder, + feat_aggregation=feat_aggregation, + num_test_captions=num_test_captions, + use_zeros_for_missing=use_zeros_for_missing, + restrict_train_captions=restrict_train_captions, + ) + if dataset_name == "MSRVTT": + dataset = MSRVTT(**kwargs) + return dataset + + +class ExpertDataLoader: + + @typechecked + def __init__( + self, + eval_only: bool, + use_zeros_for_missing: bool, + text_dim: int, + batch_size: int, + num_workers: int, + num_test_captions: int, + data_dir: str, + text_agg: str, + text_feat: str, + split_name: str, + dataset_name: str, + root_feat_folder: str, + max_tokens: Dict[str, int], + raw_input_dims: Dict[str, int], + feat_aggregation: Dict[str, Dict], + logger: logging.Logger, + restrict_train_captions: int = 0, + drop_last: bool = False, + refresh_lru_cache: bool = False, + ): + + # Ensure that the dictionaries are hashable to allow use of caching + raw_input_dims = HashableOrderedDict(raw_input_dims) + feat_aggregation = HashableDict(feat_aggregation) + max_tokens = HashableDict(max_tokens) + + if refresh_lru_cache: + logger.info("Explicitly refreshing dataloader and cuda cache") + dataset_loader.cache_clear() + memcache.cache_clear() + + common_kwargs = dict( + logger=logger, + data_dir=data_dir, + text_dim=text_dim, + text_agg=text_agg, + eval_only=eval_only, + text_feat=text_feat, + max_tokens=max_tokens, + dataset_name=dataset_name, + split_name=split_name, + root_feat_folder=root_feat_folder, + use_zeros_for_missing=use_zeros_for_missing, + num_test_captions=num_test_captions, + raw_input_dims=raw_input_dims, + feat_aggregation=feat_aggregation, + restrict_train_captions=restrict_train_captions, + ) + + dataset = dataset_loader(cls_partition="train", **common_kwargs) + x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter + logger.info(f"cache info {x}") + self.dataloaders = {"dataset": dataset} + self.dataloaders["retrieval"] = dataset.get_retrieval_data() + + if not eval_only: + train_loader = paddle.io.DataLoader( + dataset=dataset, + batch_size=batch_size, + num_workers=num_workers, + collate_fn=dataset.collate_data, + drop_last=drop_last, + shuffle=True, + ) + self.dataloaders["train"] = train_loader + + logger.info(f"Loading data loaders with {num_workers} workers") + self.num_test_captions = num_test_captions + self.dataset_name = dataset_name + + def __getitem__(self, key): + return self.dataloaders[key] diff --git 
a/docs/src/applications/T2VLAD/logger/__init__.py b/docs/src/applications/T2VLAD/logger/__init__.py new file mode 100644 index 000000000..086cb2371 --- /dev/null +++ b/docs/src/applications/T2VLAD/logger/__init__.py @@ -0,0 +1,2 @@ +from .logger import * +from .log_parser import * \ No newline at end of file diff --git a/docs/src/applications/T2VLAD/logger/log_parser.py b/docs/src/applications/T2VLAD/logger/log_parser.py new file mode 100644 index 000000000..73fa355a3 --- /dev/null +++ b/docs/src/applications/T2VLAD/logger/log_parser.py @@ -0,0 +1,104 @@ +import re +import scipy.stats +import logging +import numpy as np +from collections import defaultdict + + +def log_summary(logger, log_path, eval_mode="test_run", fixed_num_epochs=None): + """Extract performace statistics from experiment log files. + + Args: + logger (logger): reference to primary logging instance + log_path (Path): the path to the log file + eval_mode (str): the method use to collect the statistics. Can be one of: + `test_run`, `fixed_num_epochs` or `geometric_mean` + + NOTE: The `eval_mode` argument differs by dataset: for datasets which provide a + validation set, we use validation set performance to complete a single test run. For + datasets where no validation set is available, we aim to match prior work by either + fixing the number of training epochs, or selecting directly from validation set + performance (Details can be found in the supplementary material of the paper.) + """ + with open(str(log_path), "r") as f: + log = f.read().splitlines() + + # keep track of the random seed used for the part of the logfile being processed + current_seed = None + + # Regex tag for finding the seed + seed_tag = "Setting experiment random seed to" + + if eval_mode == "test_run": + subset = "test" + else: + subset = "val" + + for mode in "t2v", "v2t": + logger.info("") + logger.info("----------------------------------------------------") + logger.info(f"[{mode}] loaded log file with {len(log)} lines....") + logger.info("----------------------------------------------------") + + # Search for the following metrics + scores = { + "R1": defaultdict(list), + "R5": defaultdict(list), + "R10": defaultdict(list), + "R50": defaultdict(list), + "MedR": defaultdict(list), + "MeanR": defaultdict(list), + } + + for row in log: + if seed_tag in row: + # Search for the log file entry describing the current random seed + match = re.search(seed_tag + " (\d+)$", row) # NOQA + assert len(match.groups()) == 1, "expected a single regex match" + current_seed = match.groups()[0] + + if f"{subset}_{mode}_metrics" in row: + tokens = row.split(" ") + for key in scores: + tag = f"{subset}_{mode}_metrics_{key}:" + if tag in tokens: + pos = tokens.index(tag) + 1 + val = tokens[pos] + val = float(val) + assert current_seed is not None, "failed to determine the seed" + scores[key][current_seed].append(val) + + agg_scores = {"R1": [], "R5": [], "R10": [], "R50": [], "MedR": [], "MeanR": []} + + # compute the best performance for a single epoch (i.e. 
sharing the same model + # to compute all stats) + geometric_stats = defaultdict(list) + best_epochs = {} + if eval_mode == "geometric_mean": + raise NotImplementedError("Need to fix this for new log format") + consider = ["R1", "R5", "R10"] + seeds = list(scores["R1"].keys()) + for seed in seeds: + for metric, subdict in scores.items(): + if metric in consider: + geometric_stats[seed].append(subdict[seed]) + gms_raw = np.array(geometric_stats[seed]) + geo_means = scipy.stats.mstats.gmean(gms_raw, axis=0) + best_epochs[seed] = np.argmax(geo_means) + + for metric, subdict in scores.items(): + for seed, values in subdict.items(): + if eval_mode == "test_run": + stat = values[0] + elif eval_mode == "fixed_num_epochs": + stat = values[fixed_num_epochs - 1] + elif "LSMDC" in log_path and eval_mode == "geometric_mean": + stat = values[best_epochs[seed]] + else: + raise ValueError(f"unrecognised eval_mode: {eval_mode}") + agg_scores[metric].append(stat) + + if eval_mode == "fixed_num_epochs": + logger.info(f"Reporting stats with fixed training length: {fixed_num_epochs}") + for metric, values in agg_scores.items(): + logger.info(f"{metric}: {np.mean(values):.1f}, {np.std(values, ddof=1):.1f}") diff --git a/docs/src/applications/T2VLAD/logger/logger.py b/docs/src/applications/T2VLAD/logger/logger.py new file mode 100644 index 000000000..b2fbebd31 --- /dev/null +++ b/docs/src/applications/T2VLAD/logger/logger.py @@ -0,0 +1,25 @@ +import os +import logging +import logging.config +from pathlib import Path +from utils import read_json + + +def setup_logging(save_dir, log_config='logger/logger_config.json', + default_level=logging.INFO): + """Setup logging configuration.""" + print(os.getcwd()) + log_config = Path(log_config) + print(f"log config: {log_config} exists: {log_config.exists()}") + if log_config.is_file(): + config = read_json(log_config) + # modify logging paths based on run config + for _, handler in config['handlers'].items(): + if 'filename' in handler: + handler['filename'] = str(save_dir / handler['filename']) + + logging.config.dictConfig(config) + else: + print(f"Warning: logging configuration file is not found in {log_config}.") + logging.basicConfig(level=default_level) + return config["handlers"]["info_file_handler"]["filename"] diff --git a/docs/src/applications/T2VLAD/model/loss.py b/docs/src/applications/T2VLAD/model/loss.py new file mode 100644 index 000000000..263fda15d --- /dev/null +++ b/docs/src/applications/T2VLAD/model/loss.py @@ -0,0 +1,102 @@ +"""This module contains an implementation of the max margin ranking loss, slightly +modified from this code: +https://github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/loss.py + +The modification is the `fix_norm` conditional, which removes zero terms from the +diagonal when performing the averaging calculation. + +Original licence below. +""" +# Copyright 2021 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
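A minimal usage sketch tying the two logger utilities together; the output directory is hypothetical, and it assumes the stock `logger/logger_config.json` exists and defines an `info_file_handler`.

```python
# Usage sketch: setup_logging rewrites the config's file-handler paths to live
# under save_dir and returns the resolved info-level log file, which
# log_summary can then aggregate into retrieval statistics.
import logging
from pathlib import Path
from logger import setup_logging, log_summary

save_dir = Path("saved/log/example_run")     # assumed output directory
save_dir.mkdir(parents=True, exist_ok=True)
info_log = setup_logging(save_dir)

log_summary(logging.getLogger("summary"), log_path=Path(info_log), eval_mode="test_run")
```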
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +def cosine_sim(im, s): + '''cosine similarity between all the image and sentence pairs + ''' + inner_prod = im.mm(s.t()) + im_norm = paddle.sqrt((im ** 2).sum(axis=1).reshape([-1, 1]) + 1e-18) + s_norm = paddle.sqrt((s ** 2).sum(axis=1).reshape([-1, 1]) + 1e-18) + sim = inner_prod / (im_norm * s_norm) + return sim + +class ContrastiveLoss(nn.Layer): + '''compute contrastive loss + ''' + def __init__(self, margin=0, max_violation=True, direction='bi', topk=1): + '''Args: + direction: i2t for negative sentence, t2i for negative image, bi for both + ''' + super().__init__() + self.margin = margin + self.max_violation = max_violation + self.direction = direction + self.topk = topk + + def forward(self, scores, margin=None, average_batch=True): + ''' + Args: + scores: image-sentence score matrix, (batch, batch) + the same row of im and s are positive pairs, different rows are negative pairs + ''' + + if margin is None: + margin = self.margin + + batch_size = scores.shape[0] + diagonal = paddle.diagonal(scores).reshape([batch_size, 1]) + # mask to clear diagonals which are positive pairs + pos_masks = paddle.eye(batch_size).astype('bool') + + batch_topk = min(batch_size, self.topk) + if self.direction == 'i2t' or self.direction == 'bi': + d1 = diagonal.expand_as(scores) # same collumn for im2s (negative sentence) + # compare every diagonal score to scores in its collumn + # caption retrieval + cost_s = (margin + scores - d1).clip(min=0) + cost_s[pos_masks] = 0 + if self.max_violation: + cost_s, _ = paddle.topk(cost_s, batch_topk, axis=1) + cost_s = cost_s / batch_topk + if average_batch: + cost_s = cost_s / batch_size + else: + if average_batch: + cost_s = cost_s / (batch_size * (batch_size - 1)) + cost_s = paddle.sum(cost_s) + + if self.direction == 't2i' or self.direction == 'bi': + d2 = diagonal.t().expand_as(scores) # same row for s2im (negative image) + # compare every diagonal score to scores in its row + cost_im = (margin + scores - d2).clip(min=0) + cost_im[pos_masks] = 0 + if self.max_violation: + cost_im, _ = paddle.topk(cost_im, batch_topk, axis=0) + cost_im = cost_im / batch_topk + if average_batch: + cost_im = cost_im / batch_size + else: + if average_batch: + cost_im = cost_im / (batch_size * (batch_size - 1)) + cost_im = paddle.sum(cost_im) + + if self.direction == 'i2t': + return cost_s + elif self.direction == 't2i': + return cost_im + else: + return cost_s + cost_im diff --git a/docs/src/applications/T2VLAD/model/metric.py b/docs/src/applications/T2VLAD/model/metric.py new file mode 100644 index 000000000..0db6eeb4b --- /dev/null +++ b/docs/src/applications/T2VLAD/model/metric.py @@ -0,0 +1,243 @@ +# Copyright 2021 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
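A toy sketch of the contrastive loss defined above, run on random embeddings purely to show the calling convention; the resulting number carries no meaning.

```python
# Toy usage sketch of cosine_sim + ContrastiveLoss above; random tensors stand
# in for real video/text embeddings (same row = positive pair).
import paddle
from model.loss import cosine_sim, ContrastiveLoss

paddle.seed(0)
im = paddle.randn([8, 256])    # 8 "video" embeddings
s = paddle.randn([8, 256])     # 8 matching "sentence" embeddings

scores = cosine_sim(im, s)     # 8 x 8 similarity matrix
criterion = ContrastiveLoss(margin=0.2, max_violation=True, direction="bi")
loss = criterion(scores)
print(float(loss))
```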
+ +import math +import paddle +import numbers +import scipy.stats +import numpy as np + +from pathlib import Path +from sklearn.metrics import average_precision_score + +def t2v_metrics(sims, query_masks=None): + """Compute retrieval metrics from a similiarity matrix. + + Args: + sims (th.Tensor): N x M matrix of similarities between embeddings, where + x_{i,j} = + query_masks (th.Tensor): mask any missing queries from the dataset (two videos + in MSRVTT only have 19, rather than 20 captions) + + Returns: + (dict[str:float]): retrieval metrics + """ + assert sims.ndim == 2, "expected a matrix" + num_queries, num_vids = sims.shape + dists = -sims + sorted_dists = np.sort(dists, axis=1) + + if False: + import sys + import matplotlib + from pathlib import Path + matplotlib.use("Agg") + import matplotlib.pyplot as plt + sys.path.insert(0, str(Path.home() / "coding/src/zsvision/python")) + from zsvision.zs_iterm import zs_dispFig # NOQA + plt.matshow(dists) + zs_dispFig() + import ipdb; ipdb.set_trace() + + # The indices are computed such that they slice out the ground truth distances + # from the psuedo-rectangular dist matrix + queries_per_video = num_queries // num_vids + gt_idx = [[np.ravel_multi_index([ii, jj], (num_queries, num_vids)) + for ii in range(jj * queries_per_video, (jj + 1) * queries_per_video)] + for jj in range(num_vids)] + gt_idx = np.array(gt_idx) + gt_dists = dists.reshape(-1)[gt_idx.reshape(-1)] + gt_dists = gt_dists[:, np.newaxis] + rows, cols = np.where((sorted_dists - gt_dists) == 0) # find column position of GT + + # -------------------------------- + # NOTE: Breaking ties + # -------------------------------- + # We sometimes need to break ties (in general, these should occur extremely rarely, + # but there are pathological cases when they can distort the scores, such as when + # the similarity matrix is all zeros). Previous implementations (e.g. the t2i + # evaluation function used + # here: https://github.com/niluthpol/multimodal_vtt/blob/master/evaluation.py and + # here: https://github.com/linxd5/VSE_Pytorch/blob/master/evaluation.py#L87) generally + # break ties "optimistically". However, if the similarity matrix is constant this + # can evaluate to a perfect ranking. A principled option is to average over all + # possible partial orderings implied by the ties. See # this paper for a discussion: + # McSherry, Frank, and Marc Najork, + # "Computing information retrieval performance measures efficiently in the presence + # of tied scores." European conference on information retrieval. Springer, Berlin, + # Heidelberg, 2008. 
+ # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.145.8892&rep=rep1&type=pdf + + # break_ties = "optimistically" + break_ties = "averaging" + + if rows.size > num_queries: + assert np.unique(rows).size == num_queries, "issue in metric evaluation" + if break_ties == "optimistically": + _, idx = np.unique(rows, return_index=True) + cols = cols[idx] + elif break_ties == "averaging": + # fast implementation, based on this code: + # https://stackoverflow.com/a/49239335 + locs = np.argwhere((sorted_dists - gt_dists) == 0) + + # Find the split indices + steps = np.diff(locs[:, 0]) + splits = np.nonzero(steps)[0] + 1 + splits = np.insert(splits, 0, 0) + + # Compute the result columns + summed_cols = np.add.reduceat(locs[:, 1], splits) + counts = np.diff(np.append(splits, locs.shape[0])) + avg_cols = summed_cols / counts + if False: + print("Running slower code to verify rank averaging across ties") + # slow, but more interpretable version, used for testing + avg_cols_slow = [np.mean(cols[rows == idx]) for idx in range(num_queries)] + assert np.array_equal(avg_cols, avg_cols_slow), "slow vs fast difference" + print("passed num check") + cols = avg_cols + + msg = "expected ranks to match queries ({} vs {}) " + if cols.size != num_queries: + import ipdb; ipdb.set_trace() + assert cols.size == num_queries, msg + + if False: + # overload mask to check that we can recover the scores for single-query + # retrieval + print("DEBUGGING MODE") + query_masks = np.zeros_like(query_masks) + query_masks[:, 0] = 1 # recover single query score + + if query_masks is not None: + # remove invalid queries + assert query_masks.size == num_queries, "invalid query mask shape" + cols = cols[query_masks.reshape(-1).astype(np.bool)] + assert cols.size == query_masks.sum(), "masking was not applied correctly" + # update number of queries to account for those that were missing + num_queries = query_masks.sum() + + if False: + # sanity check against old logic for square matrices + gt_dists_old = np.diag(dists) + gt_dists_old = gt_dists_old[:, np.newaxis] + _, cols_old = np.where((sorted_dists - gt_dists_old) == 0) + assert np.array_equal(cols_old, cols), "new metric doesn't match" + + return cols2metrics(cols, num_queries) + + +def v2t_metrics(sims, query_masks=None): + """Compute retrieval metrics from a similiarity matrix. + + Args: + sims (th.Tensor): N x M matrix of similarities between embeddings, where + x_{i,j} = + query_masks (th.Tensor): mask any missing captions from the dataset + + Returns: + (dict[str:float]): retrieval metrics + + NOTES: We find the closest "GT caption" in the style of VSE, which corresponds + to finding the rank of the closest relevant caption in embedding space: + github.com/ryankiros/visual-semantic-embedding/blob/master/evaluation.py#L52-L56 + """ + # switch axes of text and video + sims = sims.T + + if False: + # experiment with toy example + sims = np.ones((3, 3)) + sims[0, 0] = 2 + sims[1, 1:2] = 2 + sims[2, :] = 2 + query_masks = None + + assert sims.ndim == 2, "expected a matrix" + num_queries, num_caps = sims.shape + dists = -sims + caps_per_video = num_caps // num_queries + break_ties = "averaging" + + MISSING_VAL = 1E8 + query_ranks = [] + for ii in range(num_queries): + row_dists = dists[ii, :] + if query_masks is not None: + # Set missing queries to have a distance of infinity. 
A missing query + # refers to a query position `n` for a video that had less than `n` + # captions (for example, a few MSRVTT videos only have 19 queries) + row_dists[np.logical_not(query_masks.reshape(-1))] = MISSING_VAL + + # NOTE: Using distance subtraction to perform the ranking is easier to make + # deterministic than using argsort, which suffers from the issue of defining + # "stability" for equal distances. Example of distance subtraction code: + # github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/train.py + sorted_dists = np.sort(row_dists) + + min_rank = np.inf + for jj in range(ii * caps_per_video, (ii + 1) * caps_per_video): + if row_dists[jj] == MISSING_VAL: + # skip rankings of missing captions + continue + ranks = np.where((sorted_dists - row_dists[jj]) == 0)[0] + if break_ties == "optimistically": + rank = ranks[0] + elif break_ties == "averaging": + # NOTE: If there is more than one caption per video, its possible for the + # method to do "worse than chance" in the degenerate case when all + # similarities are tied. TODO(Samuel): Address this case. + rank = ranks.mean() + if rank < min_rank: + min_rank = rank + query_ranks.append(min_rank) + query_ranks = np.array(query_ranks) + + # sanity check against old version of code + if False: + sorted_dists = np.sort(dists, axis=1) + gt_dists_old = np.diag(dists) + gt_dists_old = gt_dists_old[:, np.newaxis] + rows_old, cols_old = np.where((sorted_dists - gt_dists_old) == 0) + if rows_old.size > num_queries: + _, idx = np.unique(rows_old, return_index=True) + cols_old = cols_old[idx] + num_diffs = (1 - (cols_old == query_ranks)).sum() + msg = f"new metric doesn't match in {num_diffs} places" + assert np.array_equal(cols_old, query_ranks), msg + + # visualise the distance matrix + import sys + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + sys.path.insert(0, str(Path.home() / "coding/src/zsvision/python")) + from zsvision.zs_iterm import zs_dispFig # NOQA + plt.matshow(dists) + zs_dispFig() + + return cols2metrics(query_ranks, num_queries) + +def cols2metrics(cols, num_queries): + metrics = {} + metrics["R1"] = 100 * float(np.sum(cols == 0)) / num_queries + metrics["R5"] = 100 * float(np.sum(cols < 5)) / num_queries + metrics["R10"] = 100 * float(np.sum(cols < 10)) / num_queries + metrics["R50"] = 100 * float(np.sum(cols < 50)) / num_queries + metrics["MedR"] = np.median(cols) + 1 + metrics["MeanR"] = np.mean(cols) + 1 + stats = [metrics[x] for x in ("R1", "R5", "R10")] + metrics["geometric_mean_R1-R5-R10"] = scipy.stats.mstats.gmean(stats) + return metrics diff --git a/docs/src/applications/T2VLAD/model/model.py b/docs/src/applications/T2VLAD/model/model.py new file mode 100644 index 000000000..a2e9fc99a --- /dev/null +++ b/docs/src/applications/T2VLAD/model/model.py @@ -0,0 +1,533 @@ +# Copyright 2021 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
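To make the rank-to-metric conversion in `cols2metrics` concrete, a tiny worked example with hypothetical ranks:

```python
# cols holds the 0-based rank of the ground-truth item for each query, so with
# four hypothetical queries ranked 0, 0, 4 and 30:
import numpy as np
from model.metric import cols2metrics

metrics = cols2metrics(np.array([0, 0, 4, 30]), num_queries=4)
# R1 = 50.0 (2/4 queries at rank 0), R5 = 75.0, R10 = 75.0, R50 = 100.0,
# MedR = median([0, 0, 4, 30]) + 1 = 3.0, MeanR = 8.5 + 1 = 9.5
print(metrics)
```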
+ +import copy +import time +import itertools + +import paddle +import numpy as np +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle import Tensor +from typing import Optional +from collections import OrderedDict + +from base import BaseModel +from model.net_vlad import NetVLAD +try: + from paddlenlp.transformers import BertModel +except ImportError as e: + print( + f"{e}, [paddlenlp] package and it's dependencies is required for T2VLAD." + ) + + +class Mish(nn.Layer): + ''' + Applies the mish function element-wise: + mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) + SRC: https://github.com/digantamisra98/Mish/blob/master/Mish/Torch/mish.py + ''' + def forward(self, input): + ''' + Forward pass of the function. + ''' + return input * paddle.tanh(F.softplus(input)) + + +def kronecker_prod(t1, t2): + # kronecker is performed along the last dim + kron = paddle.bmm(t1.reshape([-1, t1.size(-1)], 1), + t2.reshape([-1, 1, t2.size(-1)])) + return kron.reshape[(t1.shape[0], t1.shape[1], -1)] + + +def drop_nans(x, ind, validate_missing): + """Remove nans, which we expect to find at missing indices. + Args: + x (paddle.Tensor): features + ind (paddle.Tensor): binary values denoting whether or not a given feature is present + validate_missing (bool): whether to validate that the missing location contains a nan. + + Returns: + (paddle.tensor): the features, with the missing values masked to zero. + """ + + missing = paddle.nonzero(ind == 0).flatten() + if missing.numel(): + if validate_missing: + vals = x[missing[0]] + assert paddle.isnan(vals.reshape( + [-1])[0]), "expected nans at missing locations" + #Prevent overwrite of the original tensor + x_ = x + x_[missing] = 0 + x = x_ + if paddle.isnan(x).sum() > 0: + raise ValueError("Still find nans after removing it!") + return x + + +class CENet(BaseModel): + def __init__(self, text_dim, expert_dims, vlad_clusters, ghost_clusters, + feat_aggregation, ce_shared_dim, use_mish, mimic_ce_dims): + super().__init__() + self.expert_dims = expert_dims + self.feat_aggregation = feat_aggregation + + vlad_feat_sizes = {key: val for key, val in vlad_clusters.items()} + + if vlad_clusters["text"] == 0: + self.text_pooling = nn.Sequential() + else: + self.text_pooling = NetVLAD( + feature_size=text_dim, + cluster_size=vlad_clusters["text"], + ghost_clusters=ghost_clusters["text"], + ) + self.text_bert = BertModel.from_pretrained('bert-base-uncased') + text_dim = self.text_pooling.out_dim + + self.ce = CEModule( + text_dim=text_dim, + expert_dims=expert_dims, + vlad_feat_sizes=vlad_feat_sizes, + mimic_ce_dims=mimic_ce_dims, + use_mish=use_mish, + same_dim=ce_shared_dim, + ) + + def forward(self, + experts, + ind, + cap_id=None, + att_mask=None, + text=None, + raw_captions=None, + text_token_mask=None): + aggregated_experts = OrderedDict() + + # Handle all nan-checks + for mod in self.expert_dims: + experts[mod] = drop_nans(x=experts[mod], + ind=ind[mod], + validate_missing=True) + aggregated_experts[mod] = experts[mod] + + start = time.time() + # When pooling multiple captions for a single video, we treat them as separate + # members of the minibatch, so the total pooling op does the following: + # pooling: B x captions_per_video x max_sentence_length x text_feat_dim + # -> B x captions_per_video (cluster_dim * text_feat_dim) + B, captions_per_video, max_words, text_feat_dim = text.shape + text = text.reshape([B * captions_per_video, max_words, text_feat_dim]) + if isinstance(self.text_pooling, NetVLAD): + kwargs = {"mask": 
text_token_mask} + else: + kwargs = {} + cap_id = cap_id.reshape([B * captions_per_video, -1]) + att_mask = att_mask.reshape([B * captions_per_video, -1]) + att_mask = att_mask.unsqueeze(axis=[1, 2]) + bert_out = self.text_bert(cap_id, + token_type_ids=None, + attention_mask=att_mask) + text = bert_out[0] + text, _, save_ass = self.text_pooling(text, **kwargs) + text = text.reshape([B, captions_per_video, -1]) + + return self.ce(text, aggregated_experts, ind, raw_captions, + self.text_pooling, start) + + +def _get_clones(module, N): + return nn.LayerList([copy.deepcopy(module) for i in range(N)]) + + +class TransformerLayer(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=True): + super().__init__() + self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = F.relu + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + q = q.transpose([1, 0, 2]) + k = k.transpose([1, 0, 2]) + src = src.transpose([1, 0, 2]) + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask) + src2 = src2.transpose([1, 0, 2]) + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, + src, + src_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + q = q.transpose([1, 0, 2]) + k = k.transpose([1, 0, 2]) + src2 = src2.transpose([1, 0, 2]) + src2 = self.self_attn(q, key=k, value=src2, attn_mask=src_mask) + src2 = src2.transpose([1, 0, 2]) + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, + src, + src_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, pos) + return self.forward_post(src, src_mask, pos) + + +class Transformer(nn.Layer): + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): # may have a problem + if p.dim() > 1: + nn.initializer.XavierUniform(p) + + def forward(self, + src, + mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer(output) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class CEModule(nn.Layer): + def __init__(self, expert_dims, text_dim, use_mish, mimic_ce_dims, + vlad_feat_sizes, same_dim): + super().__init__() + + modalities = list(expert_dims.keys()) + self.expert_dims = expert_dims + self.modalities = 
modalities + self.mimic_ce_dims = mimic_ce_dims + self.same_dim = same_dim + self.use_mish = use_mish + self.vlad_feat_sizes = vlad_feat_sizes + self.reduce_dim = 64 + self.moe_cg = ContextGating + self.vis_transformer = True + + if self.use_mish: + self.non_lin = Mish() + else: + self.non_lin = nn.ReLU() + + num_mods = len(expert_dims) + self.moe_fc = nn.Linear(text_dim, len(expert_dims)) + self.moe_weights = paddle.ones([1, num_mods]) / num_mods + + # The batch size of the face input can vary (due to missing inputs), so we + # probably shouldn't use BN on this branch. It's probably fine to leave it + # n for the corresponding text inputs, (but we should switch to GN) + use_bns = [True for modality in self.modalities] + + # NOTE: When use_ce is not used, the text features are projected to + # subspaces of different dimensions. When use_ce is used, they must all + # be projected to `same_dim` (to allow fusion). The only excpetion is for an + # ablation in which we mimic the `same_dim` reduction to measure whether this + # projection influences overall performance. + + self.repeat_temporal = {} + for mod in modalities: + self.repeat_temporal[mod] = 1 + + in_dims = [ + expert_dims[mod][0] * self.repeat_temporal[mod] + for mod in modalities + ] + agg_dims = [ + expert_dims[mod][1] * self.repeat_temporal[mod] + for mod in modalities + ] + feat_dims = [ + expert_dims[mod][0] // self.vlad_feat_sizes[mod] + for mod in modalities + ] + if self.vis_transformer: + num_encoder_layers = 1 + d_model = 768 + nhead = 4 + dim_feedforward = 768 + dropout = 0 #dropout=0.1 + normalize_before = True + encoder_layer = TransformerLayer(d_model, nhead, dim_feedforward, + dropout) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.transformers = Transformer(encoder_layer, num_encoder_layers, + encoder_norm) + + if self.mimic_ce_dims: + dim_reducers = [ReduceDim(in_dim, same_dim) for in_dim in feat_dims] + self.video_dim_reduce = nn.LayerList(dim_reducers) + + gated_vid_embds = [ + GatedEmbeddingUnit(in_dim, same_dim, use_bn=True) + for in_dim in feat_dims + ] + text_out_dims = [same_dim for _ in agg_dims] + self.video_GU = nn.LayerList(gated_vid_embds) + gated_text_embds = [ + GatedEmbeddingUnit(text_dim, dim, use_bn=True) + for dim in text_out_dims + ] + self.text_GU = nn.LayerList(gated_text_embds) + + def compute_moe_weights(self, text, ind): + # compute weights for all captions (including when assigned K captions to + # the same video) + B, K, D = text.shape + M = len(self.modalities) + msg = f"expected between 1 and 10 modalities, found {M} ({self.modalities})" + assert 1 <= M <= 10, msg + + # Treat each caption independently in the softmax (which runs over modalities) + text = text.reshape([B * K, D]) + + moe_weights = self.moe_fc(text) # BK x D -> BK x M + moe_weights = F.softmax(moe_weights, axis=1) + moe_weights = moe_weights.reshape([B, K, M]) + return moe_weights + + def forward(self, text, experts, ind, raw_captions, vis_vlad, stime): + """Compute joint embeddings and, if requested, a confusion matrix between + video and text representations in the minibatch. 
+ + Notation: B = batch size, M = number of modalities + """ + + # Pass text embeddings through gated units + text_embd = {} + + # Unroll repeated captions into present minibatch + B, captions_per_video, feat_dim = text.shape + text = text.reshape([B * captions_per_video, feat_dim]) + for modality, layer in zip(self.modalities, self.text_GU): + # NOTE: Due to the batch norm, the gated units are sensitive to passing + # in a lot of zeroes, so we do the masking step after the forwards pass + text_ = layer(text) + + # We always assume that text is available for retrieval + text_ = text_.reshape([B, captions_per_video, -1]) + text_embd[modality] = text_ + text = text.reshape([B, captions_per_video, -1]) + + # vladded nans are handled earlier (during pooling) + # We also avoid zeroing random features, since this will leak information + # exclude = list(self.vlad_feat_sizes.keys()) + list(self.random_feats) + # experts = self.mask_missing_embeddings(experts, ind, exclude=exclude) + + # MOE weights computation + normalization - note that we use the first caption + # sample to predict the weights + moe_weights = self.compute_moe_weights(text, ind=ind) + text_local = text.reshape([B * captions_per_video, -1]) + + vis_local = {} + for modality in self.modalities: + vis_local[modality] = experts[modality] + + all_vis_feat = [] + if hasattr(self, "video_dim_reduce"): + # Embed all features to a common dimension + for modality, layer in zip(self.modalities, self.video_dim_reduce): + all_vis_feat.append(layer(vis_local[modality])) + all_vis_feat = paddle.concat(all_vis_feat, axis=1) + + if self.vis_transformer: + experts_tensor = all_vis_feat + experts_tensor = experts_tensor.transpose([1, 0, 2]) + att_out = self.transformers(experts_tensor, mask=None, pos=None) + all_vis_feat = att_out.transpose([1, 0, 2]) + + vis_local, _, save_ass = vis_vlad(all_vis_feat, freeze=True) + cross_view_conf_matrix_tv = paddle.matmul(text_local, vis_local.t()) + + for modality in self.modalities: + experts[modality] = experts[modality].max(axis=1) + + for modality, layer in zip(self.modalities, self.video_GU): + experts[modality] = layer(experts[modality]) + + cross_view_conf_matrix = sharded_cross_view_inner_product( + ind=ind, + vid_embds=experts, + text_embds=text_embd, + text_weights=moe_weights, + subspaces=self.modalities, + raw_captions=raw_captions, + ) + cross_view_conf_matrix = 0.5 * cross_view_conf_matrix + 0.5 * cross_view_conf_matrix_tv + return { + "modalities": self.modalities, + "cross_view_conf_matrix": cross_view_conf_matrix, + } + + +class GatedEmbeddingUnit(nn.Layer): + def __init__(self, input_dimension, output_dimension, use_bn): + super(GatedEmbeddingUnit, self).__init__() + self.fc = nn.Linear(input_dimension, output_dimension) + self.cg = ContextGating(output_dimension, add_batch_norm=use_bn) + + def forward(self, x): + x = self.fc(x) + x = self.cg(x) + x = F.normalize(x) + return x + + +class ReduceDim(nn.Layer): + def __init__(self, input_dimension, output_dimension): + super(ReduceDim, self).__init__() + self.fc = nn.Linear(input_dimension, output_dimension) + + def forward(self, x): + x = self.fc(x) + x = F.normalize(x, axis=-1) + return x + + +class ContextGating(nn.Layer): + def __init__(self, dimension, add_batch_norm=True): + super(ContextGating, self).__init__() + self.fc = nn.Linear(dimension, dimension) + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1D(dimension) + + def forward(self, x): + x1 = self.fc(x) + if self.add_batch_norm: + x1 = self.batch_norm(x1) + x = 
paddle.concat([x, x1], axis=1) + return F.glu(x, axis=1) + + +def sharded_cross_view_inner_product(vid_embds, + text_embds, + text_weights, + subspaces, + ind, + tol=1E-5, + raw_captions=None): + """Compute a similarity matrix from sharded vectors. + + Args: + embds1 (dict[str:paddle.Tensor]): the set of sub-embeddings that, when + concatenated, form the whole. The ith shard has shape `B x K x F_i` + (i.e. they can differ in the last dimension). + embds2 (dict[str:paddle.Tensor]): same format. + weights2 (paddle.Tensor): weights for the shards in `embds2`. + + Returns: + (paddle.tensor): similarity matrix of size `BK x BK`. + + NOTE: If multiple captions are provided, we can aggregate their similarities to + provide a single video-text similarity score. + """ + B = vid_embds[subspaces[0]].shape[0] + T, num_caps, _ = text_embds[subspaces[0]].shape + + # unroll separate captions onto first dimension and treat them separately + sims = paddle.zeros([T * num_caps, B]) + text_weights = text_weights.reshape([T * num_caps, -1]) + if True: + mus = [round(x, 3) for x in text_weights.mean(0).numpy().tolist()] + stds = [round(x, 3) for x in text_weights.std(0).numpy().tolist()] + summary = ">>>" + for mod, mu, std in zip(subspaces, mus, stds): + summary += f"{mod}: {mu} +/- {std} " + + # mark expert availabilities along the second axis + available = paddle.ones([1, B, len(subspaces)], dtype=text_weights.dtype) + for ii, modality in enumerate(subspaces): + ind[modality] = paddle.to_tensor(ind[modality], dtype='float32') + available[:, :, ii] = ind[modality] + msg = "expected `available` modality mask to only contain 0s or 1s" + assert set(paddle.unique(available).cpu().numpy()).issubset(set([0, + 1])), msg + # set the text weights along the first axis and combine with availabilities to + # produce a tensor + text_weight_tensor = text_weights.reshape([T * num_caps, 1, + len(subspaces)]) * available + # normalise to account for missing experts + normalising_weights = text_weight_tensor.sum(2).reshape( + [T * num_caps, B, 1]) + text_weight_tensor = paddle.divide(text_weight_tensor, normalising_weights) + + l2_mass_text, l2_mass_vid = 1, 1 + + for idx, modality in enumerate(subspaces): + vid_embd_ = vid_embds[modality].reshape([B, -1]) / l2_mass_vid + text_embd_ = text_embds[modality].reshape([T * num_caps, -1]) + msg = "expected weights to be applied to text embeddings" + assert text_embd_.shape[0] == text_weights.shape[0], msg + text_embd_ = text_embd_ / l2_mass_text + weighting = text_weight_tensor[:, :, idx] + sims += weighting * paddle.matmul(text_embd_, + vid_embd_.t()) # (T x num_caps) x (B) + + if paddle.isnan(sims).sum().item(): + raise ValueError("Found nans in similarity matrix!") + + return sims diff --git a/docs/src/applications/T2VLAD/model/net_vlad.py b/docs/src/applications/T2VLAD/model/net_vlad.py new file mode 100644 index 000000000..99ef7a193 --- /dev/null +++ b/docs/src/applications/T2VLAD/model/net_vlad.py @@ -0,0 +1,100 @@ +"""NetVLAD implementation. +""" +# Copyright 2021 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
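The similarity fusion in `sharded_cross_view_inner_product` above reduces to a weighted sum of per-expert similarity matrices, with the softmaxed MoE weights renormalised over the experts that are actually present for each video. A stripped-down numpy sketch of that idea (all shapes and values invented):

```python
# Stripped-down sketch of the weighted fusion above: two hypothetical experts,
# 3 captions vs 2 videos; the second video is missing the "audio" expert.
import numpy as np

sims = {"rgb": np.random.rand(3, 2), "audio": np.random.rand(3, 2)}  # per-expert caption-video sims
text_weights = np.array([[0.7, 0.3]] * 3)      # softmaxed MoE weights, one row per caption
available = np.array([[1.0, 1.0],              # expert availability per video
                      [1.0, 0.0]])

w = text_weights[:, None, :] * available[None, :, :]   # 3 x 2 x 2
w = w / w.sum(axis=2, keepdims=True)                    # renormalise over available experts
fused = sum(w[:, :, i] * sims[m] for i, m in enumerate(["rgb", "audio"]))  # 3 x 2
```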
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import numpy as np +import paddle.nn as nn +import paddle.nn.functional as F + + +class NetVLAD(nn.Layer): + def __init__(self, cluster_size, feature_size, ghost_clusters=0, + add_batch_norm=True): + super().__init__() + + self.feature_size = feature_size + self.cluster_size = cluster_size + self.ghost_clusters = ghost_clusters + + init_sc = (1 / math.sqrt(feature_size)) + init_sc = paddle.to_tensor(init_sc) + clusters = cluster_size + ghost_clusters + + # The `clusters` weights are the `(w,b)` in the paper + self.clusters = paddle.create_parameter([feature_size, clusters], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([feature_size, clusters]) * init_sc)) + self.batch_norm1 = nn.BatchNorm1D(clusters) if add_batch_norm else None + self.batch_norm2 = nn.BatchNorm1D(clusters) if add_batch_norm else None + # The `clusters2` weights are the visual words `c_k` in the paper + self.clusters1 = paddle.create_parameter([1, feature_size, cluster_size], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([1, feature_size, cluster_size]) * init_sc)) + self.clusters2 = paddle.create_parameter([1, feature_size, cluster_size], dtype='float32', default_initializer=nn.initializer.Assign(paddle.randn([1, feature_size, cluster_size]) * init_sc)) + self.out_dim = self.cluster_size * feature_size + + def sanity_checks(self, x): + """Catch any nans in the inputs/clusters""" + if paddle.isnan(paddle.sum(x)): + raise ValueError("nan inputs") + if paddle.isnan(self.clusters[0][0]): + raise ValueError("nan clusters") + + def forward(self, x, freeze=False, mask=None): + """Aggregates feature maps into a fixed size representation. In the following + notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size. 
+ + Args: + x (th.Tensor): B x N x D + + Returns: + (th.Tensor): B x DK + """ + self.sanity_checks(x) + max_sample = x.shape[1] + x = x.reshape([-1, self.feature_size]) # B x N x D -> BN x D + + if freeze == True: + clusters = self.clusters.detach() + clusters2 = self.clusters1 + batch_norm = self.batch_norm1 + else: + clusters = self.clusters + clusters2 = self.clusters2 + batch_norm = self.batch_norm2 + + assignment = paddle.matmul(x, clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G) + if batch_norm: + assignment = batch_norm(assignment) + + assignment = F.softmax(assignment, axis=1) # BN x (K+G) -> BN x (K+G) + save_ass = assignment.reshape([-1, max_sample, self.cluster_size+1]) + + assignment = assignment[:, :self.cluster_size] + assignment = assignment.reshape([-1, max_sample, self.cluster_size]) # -> B x N x K + a_sum = paddle.sum(assignment, axis=1, keepdim=True) # B x N x K -> B x 1 x K + a = a_sum * self.clusters2 + assignment = assignment.transpose([0, 2, 1]) # B x N x K -> B x K x N + + x = x.reshape([-1, max_sample, self.feature_size]) # BN x D -> B x N x D + vlad = paddle.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D + vlad = vlad.transpose([0, 2, 1]) # -> B x D x K + vlad = vlad - a + + # L2 intra norm + vlad_ = F.normalize(vlad) + + # flattening + L2 norm + vlad = vlad_.reshape([-1, self.cluster_size * self.feature_size]) # -> B x DK + vlad = F.normalize(vlad) + return vlad, vlad_, save_ass # B x DK \ No newline at end of file diff --git a/docs/src/applications/T2VLAD/model/text.py b/docs/src/applications/T2VLAD/model/text.py new file mode 100644 index 000000000..fbf32ab1c --- /dev/null +++ b/docs/src/applications/T2VLAD/model/text.py @@ -0,0 +1,148 @@ +"""This module defines the TextEmbedding interface for converting video descriptions and +queries into embeddings. +""" +import zipfile +import functools +from abc import abstractmethod +from pathlib import Path + +import numpy as np +import paddle +import gensim +import requests +import transformers +from typeguard import typechecked +from zsvision.zs_utils import BlockTimer + +from model.s3dg import S3D + +class TextEmbedding: + def __init__(self, model, dim: int): + self.model = model + self.dim = dim + #self.device = None + + @abstractmethod + def text2vec(self, text: str) -> np.ndarray: + """Convert a string of text into an embedding. + + Args: + text: the content to be embedded + + Returns: + (d x n) array, where d is the dimensionality of the embedding and `n` is the + number of words that were successfully parsed from the text string. + + NOTE: For some text embedding models (such as word2vec), not all words are + converted to vectors (e.g. certain kinds of stop words) - these are dropped from + the output. 
+ """ + raise NotImplementedError + + #@typechecked + #def set_device(self, device: torch.device): + # self.model = self.model.to(device) + # self.device = device + + +@functools.lru_cache(maxsize=64, typed=False) +def load_w2v_model_from_cache( + w2v_weights: Path, +) -> gensim.models.keyedvectors.Word2VecKeyedVectors: + with BlockTimer("Loading w2v from disk"): + model = gensim.models.KeyedVectors.load_word2vec_format( + fname=w2v_weights, + binary=True, + ) + return model + + +@typechecked +def fetch_model(url: str, weights_path: Path): + weights_path.parent.mkdir(exist_ok=True, parents=True) + with BlockTimer(f"Fetching weights {url} -> {weights_path}"): + resp = requests.get(url, verify=False) + with open(weights_path, "wb") as f: + f.write(resp.content) + + +class W2VEmbedding(TextEmbedding): + """This model embeds text using the google-released implementation of the word2vec + model introduced in: + + Mikolov, T., Sutskever, I., Chen, K., Corrado, G. S., & Dean, J. (2013). + Distributed representations of words and phrases and their compositionality. + In Advances in neural information processing systems (pp. 3111-3119). + + For words that are present in the w2v vocabulary, a 300-dimensional embedding is + produced via a lookup table. + """ + @typechecked + def __init__( + self, + dim: int, + mirror: str, + weights_path: Path, + fetch_weights: bool = True, + ): + if not weights_path.exists(): + if fetch_weights: + fetch_model(url=mirror, weights_path=weights_path) + else: + raise ValueError(f"w2v weights missing at {weights_path}") + + model = load_w2v_model_from_cache(weights_path) + super().__init__(model=model, dim=dim) + + @typechecked + def text2vec(self, text: str) -> np.ndarray: + # convert the text string to tokens that can be processed by w2v. We handle + # 'a' as a special case. + tokens = [x for x in text.split(" ") if x != "a" and x in self.model.vocab] + + embeddings = [] + for token in tokens: + embeddings.append(self.model.get_vector(token)) + embeddings = np.array(embeddings) + # For empty sequences, we use zeros with the dimensionality of the features on + # the second dimension (this is the format expected by the CE codebase) + if embeddings.size == 0: + embeddings = np.zeros((0, self.dim)) + return embeddings + + #@typechecked + #def set_device(self, device: torch.device): + # msg = f"w2v only supports CPU-based execution found {device.type}" + # assert device.type == "cpu", msg + + +class OpenAI_GPT(TextEmbedding): + """This model produces 768-embeddings using a pretrained GPT model, introduced + in the paper: + + Radford, A., Narasimhan, K., Salimans, T., & Sutskever, I. (2018). 
+ Improving language understanding by generative pre-training, + https://cdn.openai.com/research-covers/language-unsupervised/language_understanding + _paper.pdf + """ + + def __init__(self): + self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained("openai-gpt") + model = transformers.OpenAIGPTModel.from_pretrained("openai-gpt") + model.eval() + super().__init__(model=model) + + @typechecked + def text2vec(self, text: str) -> np.ndarray: + tokenized_text = self.tokenizer.tokenize(text) + + # Convert token to vocabulary indices + indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) + tokens_tensor = paddle.to_tensor(indexed_tokens, dtype='int64') #tokens_tensor = torch.LongTensor([indexed_tokens]).to(self.model.device) + + with paddle.no_grad(): + hidden_states = self.model(tokens_tensor) + embeddings = hidden_states[0].numpy() + return embeddings.squeeze(0) + + diff --git a/docs/src/applications/T2VLAD/parse_config.py b/docs/src/applications/T2VLAD/parse_config.py new file mode 100644 index 000000000..c952b9d61 --- /dev/null +++ b/docs/src/applications/T2VLAD/parse_config.py @@ -0,0 +1,239 @@ +# Copyright 2021 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import paddle +import pprint +import logging +from typing import Dict +from pathlib import Path +from datetime import datetime +from operator import getitem +from functools import reduce + +from mergedeep import Strategy, merge +from zsvision.zs_utils import set_nested_key_val +from typeguard import typechecked + +from utils import read_json, write_json +from logger import setup_logging + + +class ConfigParser: + def __init__(self, args, options='', timestamp=True, slave_mode=False): + # slave_mode - when calling the config parser form an existing process, we + # avoid reinitialising the logger and ignore sys.argv when argparsing. + + # parse default and custom cli options + for opt in options: + args.add_argument(*opt.flags, default=None, type=opt.type) + + if slave_mode: + args = args.parse_args(args=[]) + else: + args = args.parse_args() + + if args.resume and not slave_mode: + self.resume = Path(args.resume) + else: + msg_no_cfg = "Config file must be specified" + assert args.config is not None, msg_no_cfg + self.resume = None + self.cfg_fname = Path(args.config) + + config = self.load_config(self.cfg_fname) + self._config = _update_config(config, options, args) + + if self._config.get("eval_config", False): + # validate path to evaluation file + eval_cfg_path = self._config.get("eval_config") + msg = f"eval_config was specified, but `{eval_cfg_path}` does not exist" + assert Path(self._config.get("eval_config")).exists(), msg + + # set save_dir where trained model and log will be saved. 
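+        # checkpoints are written to <save_dir>/models/<exper_name>/<subdir> and logs to <save_dir>/log/<exper_name>/<subdir>, where <subdir> is the timestamp (nested under a group id and seed when --group_id is supplied)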
+ if "tester" in self.config: + save_dir = Path(self.config['tester']['save_dir']) + else: + save_dir = Path(self.config['trainer']['save_dir']) + timestamp = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S") if timestamp else "" + + if slave_mode: + timestamp = f"{timestamp}-eval-worker" + + exper_name = self.set_exper_name(args, config=config) + + if getattr(args, "group_id", False): + subdir = Path(args.group_id) / f"seed-{args.group_seed}" / timestamp + else: + subdir = timestamp + + self._save_dir = save_dir / 'models' / exper_name / subdir + self._log_dir = save_dir / 'log' / exper_name / subdir + self._exper_name = exper_name + self._args = args + + # if set, remove all previous experiments with the current config + if vars(args).get("purge_exp_dir", False): + for dirpath in (self._save_dir, self._log_dir): + config_dir = dirpath.parent + existing = list(config_dir.glob("*")) + print(f"purging {len(existing)} directories from config_dir...") + tic = time.time() + os.system(f"rm -rf {config_dir}") + print(f"Finished purge in {time.time() - tic:.3f}s") + + self.save_dir.mkdir(parents=True, exist_ok=True) + self.log_dir.mkdir(parents=True, exist_ok=True) + + # save updated config file to the checkpoint dir + write_json(self.config, self.save_dir / 'config.json') + + # configure logging module + if not slave_mode: + self.log_path = setup_logging(self.log_dir) + + self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} + + def set_exper_name(self, args, config): + # We assume that the config files are organised into directories such that + # each directory has the name of the dataset. + dataset_name = self.cfg_fname.parent.stem + exper_name = f"{dataset_name}-{self.cfg_fname.stem}" + if args.custom_args: + key_val_lists = args.custom_args.split("+") + for key_val_pair in key_val_lists: + print(f"parsing key-val pair : {key_val_pair}") + key, val = key_val_pair.split("@") + set_nested_key_val(key, val, self._config) + # remove periods from key names + key_ = key.replace("_.", "--") + # remove commas from value names + val = val.replace(",", "--") + custom_tag = "-".join(key_.split(".")[-2:]) + exper_name = f"{exper_name}-{custom_tag}-{val}" + + if getattr(args, "disable_workers", False): + print("Disabling data loader workers....") + config["data_loader"]["args"]["num_workers"] = 0 + + if getattr(args, "train_single_epoch", False): + print("Restricting training to a single epoch....") + config["trainer"]["epochs"] = 1 + config["trainer"]["save_period"] = 1 + config["trainer"]["skip_first_n_saves"] = 0 + exper_name = f"{exper_name}-train-single-epoch" + return exper_name + + @staticmethod + @typechecked + def load_config(cfg_fname: Path) -> Dict: + config = read_json(cfg_fname) + # apply inheritance through config hierarchy + descendant, ancestors = config, [] + while "inherit_from" in descendant: + parent_config = read_json(Path(descendant["inherit_from"])) + ancestors.append(parent_config) + descendant = parent_config + for ancestor in ancestors: + merge(ancestor, config, strategy=Strategy.REPLACE) + config = ancestor + return config + + def init(self, name, module, *args, **kwargs): + """Finds a function handle with the name given as 'type' in config, and returns + the instance initialized with corresponding keyword args given as 'args'. 
+ """ + module_name = self[name]['type'] + module_args = dict(self[name]['args']) + msg = (f"Fail for {module_name}\n" + f"overwriting kwargs given in config file is not allowed\n" + f"passed kwargs: {kwargs}\n" + f"for module_args: {module_args})") + assert all([k not in module_args for k in kwargs]), msg + module_args.update(kwargs) + return getattr(module, module_name)(*args, **module_args) + + def __getitem__(self, name): + return self.config[name] + + def __len__(self): + # NOTE: This is used for boolean checking deep inside ray.tune, so we required it + # to be defined. + return len(self.config) + + def __setitem__(self, name, value): + self.config[name] = value + + def __contains__(self, name): + return name in self.config + + def get(self, name, default): + return self.config.get(name, default) + + def keys(self): + return self.config.keys() + + def get_logger(self, name, verbosity=2): + msg_verbosity = "verbosity option {} is invalid. Valid options are {}." + msg_verbosity = msg_verbosity.format(verbosity, self.log_levels.keys()) + assert verbosity in self.log_levels, msg_verbosity + logger = logging.getLogger(name) + logger.setLevel(self.log_levels[verbosity]) + return logger + + # setting read-only attributes + @property + def config(self): + return self._config + + @property + def save_dir(self): + return self._save_dir + + @property + def log_dir(self): + return self._log_dir + + def __repr__(self): + return pprint.PrettyPrinter().pformat(self.__dict__) + + def items(self): + return self._config.items() + + +# helper functions used to update config dict with custom cli options +def _update_config(config, options, args): + for opt in options: + value = getattr(args, _get_opt_name(opt.flags)) + if value is not None: + _set_by_path(config, opt.target, value) + return config + + +def _get_opt_name(flags): + for flg in flags: + if flg.startswith('--'): + return flg.replace('--', '') + return flags[0].replace('--', '') + + +def _set_by_path(tree, keys, value): + """Set a value in a nested object in tree by sequence of keys.""" + _get_by_path(tree, keys[:-1])[keys[-1]] = value + + +def _get_by_path(tree, keys): + """Access a nested object in tree by sequence of keys.""" + return reduce(getitem, keys, tree) diff --git a/docs/src/applications/T2VLAD/test.py b/docs/src/applications/T2VLAD/test.py new file mode 100644 index 000000000..d1ce9e454 --- /dev/null +++ b/docs/src/applications/T2VLAD/test.py @@ -0,0 +1,206 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import copy +import random +import paddle +import logging +import argparse + +import numpy as np +import model.model as module_arch +import model.metric as module_metric +import data_loader.data_loaders as module_data + +from typing import Tuple +from pathlib import Path +from typeguard import typechecked +from mergedeep import Strategy, merge +from parse_config import ConfigParser +from trainer.trainer import verbose, ctxt_mgr +from utils.util import compute_dims, compute_trn_config + +@typechecked +def compress_predictions(query_masks: np.ndarray, sims: np.ndarray, topk: int = 10): + """We store the indices of the top-k predictions, rather than the full similarity + matrix, to reduce storage requirements. + + NOTE: The similarity matrix contains `num_queries x num_videos` elements, where + `num_queries = num_videos x max_num_queries_per_video`. We first mask out + locations in the similarity matrix that correspond to invalid queries (these are + produced by videos with fewer than `max_num_queries_per_video` descriptions). + """ + + # validate the input shapes + assert query_masks.ndim == 2, "Expected query_masks to be a matrix" + query_num_videos, query_max_per_video = query_masks.shape + sims_queries, sims_num_videos = sims.shape + msg = (f"Expected sims and query masks to represent the same number of videos " + f"(found {sims_num_videos} v {query_num_videos}") + assert query_num_videos == sims_num_videos, msg + msg = (f"Expected sims and query masks to represent the same number of queries " + f"(found {sims_queries} v {query_num_videos * query_max_per_video}") + assert query_max_per_video * query_num_videos == sims_queries, msg + + valid_sims = sims[query_masks.flatten().astype(np.bool)] + ranks = np.argsort(-valid_sims, axis=1) + return ranks[:, :topk] + + +@typechecked +def get_model_and_data_loaders( + config: ConfigParser, + logger: logging.Logger, + model_path: Path, +) -> Tuple[paddle.nn.Layer, module_data.ExpertDataLoader]: + expert_dims, raw_input_dims = compute_dims(config) + trn_config = compute_trn_config(config) + + data_loaders = config.init( + name='data_loader', + module=module_data, + logger=logger, + raw_input_dims=raw_input_dims, + text_feat=config["experts"]["text_feat"], + text_dim=config["experts"]["text_dim"], + text_agg=config["experts"]["text_agg"], + use_zeros_for_missing=config["experts"].get("use_zeros_for_missing", False), + eval_only=True, + ) + + model = config.init( + name='arch', + module=module_arch, + expert_dims=expert_dims, + text_dim=config["experts"]["text_dim"], + ce_shared_dim=config["experts"].get("ce_shared_dim", None), + feat_aggregation=config["data_loader"]["args"]["feat_aggregation"], + ) + model_path = config._args.resume + logger.info(f"Loading checkpoint: {model_path} ...") + checkpoint = paddle.load(model_path) + state_dict = checkpoint + if config['n_gpu'] > 1: + model = paddle.DataParallel(model) + model.load_dict(state_dict) + + return model, data_loaders + + +def evaluation(config, logger=None, trainer=None): + + if logger is None: + logger = config.get_logger('test') + + if getattr(config._args, "eval_from_training_config", False): + eval_conf = copy.deepcopy(config) + merge(eval_conf._config, config["eval_settings"], strategy=Strategy.REPLACE) + config = eval_conf + + logger.info("Running evaluation with configuration:") + logger.info(config) + + # Set the random initial seeds + seed = config["seed"] + logger.info(f"Setting experiment random seed to {seed}") + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + 
model, data_loaders = get_model_and_data_loaders( + config=config, + logger=logger, + model_path=Path(config._args.resume), + ) + logger.info(model) + + metrics = [getattr(module_metric, met) for met in config['metrics']] + + # prepare model for testing. Note that some datasets fail to fit the retrieval + # set on the GPU, so we run them on the CPU + model.eval() + + with paddle.no_grad(): + samples, meta = data_loaders["retrieval"] + #import pdb; pdb.set_trace() + # To use the nan-checks safely, we need make temporary copies of the data + all_text_num = samples['text'].shape[0] + text_keys = ['text', 'cap_id', 'att_mask', 'text_token_mask'] + chk = 100 + tck = 100 + + if samples['text'].shape[0] % chk == 0: + vid_batch = samples['text'].shape[0] // chk + else: + vid_batch = samples['text'].shape[0] // chk + 1 + if samples['text'].shape[0] % tck == 0: + text_batch = samples['text'].shape[0] // tck + else: + text_batch = samples['text'].shape[0] // tck + 1 + sub_sims = [] + for idx in range(text_batch): + if idx % 5 == 0: + print(idx,'/',text_batch) + sub_samples = {} + for key in text_keys: + sub_samples.update({key: samples[key][idx*tck:idx*tck+tck]}) + subsub_sims = [] + for vid in range(vid_batch): + sub_samples['experts'] = {} + sub_samples['ind'] = {} + for expert in samples['experts'].keys(): + sub_samples['experts'][expert] = samples['experts'][expert][vid*chk:vid*chk+chk] + sub_samples['ind'][expert] = samples['ind'][expert][vid*chk:vid*chk+chk] + with ctxt_mgr(sub_samples) as valid: + output = model(**valid) + subsub_sims.append(output["cross_view_conf_matrix"].cpu()) + subsub_sims = paddle.concat(subsub_sims, axis=1) + sub_sims.append(subsub_sims) + sub_sims = paddle.concat(sub_sims, axis=0) + sims = paddle.to_tensor(sub_sims, dtype='float32').numpy() + dataset = data_loaders.dataset_name + + nested_metrics = {} + for metric in metrics: + metric_name = metric.__name__ + res = metric(sims, query_masks=meta["query_masks"]) + verbose(epoch=0, metrics=res, name=dataset, mode=metric_name) + if trainer is not None: + if not trainer.mini_train: + trainer.writer.set_step(step=0, mode="val") + # avoid tensboard folding by prefixing + metric_name_ = f"test_{metric_name}" + trainer.log_metrics(res, metric_name=metric_name_, mode="val") + nested_metrics[metric_name] = res + + log = {} + for subkey, subval in nested_metrics.items(): + for subsubkey, subsubval in subval.items(): + log[f"test_{subkey}_{subsubkey}"] = subsubval + for key, value in log.items(): + logger.info(" {:15s}: {}".format(str(key), value)) + + +if __name__ == '__main__': + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('--config', default=None, type=str, help="config file path") + args.add_argument('--resume', default=None, help='path to checkpoint for evaluation') + args.add_argument('--eval_from_training_config', action="store_true", + help="if true, evaluate directly from a training config file.") + args.add_argument("--custom_args", help="qualified key,val pairs") + eval_config = ConfigParser(args) + + cfg_msg = "For evaluation, a model checkpoint must be specified via the --resume flag" + assert eval_config._args.resume, cfg_msg + if eval_config._config.get("eval_settings", False): + merge(eval_config._config, eval_config["eval_settings"], strategy=Strategy.REPLACE) + evaluation(eval_config) diff --git a/docs/src/applications/T2VLAD/train.py b/docs/src/applications/T2VLAD/train.py new file mode 100644 index 000000000..9e093b221 --- /dev/null +++ 
b/docs/src/applications/T2VLAD/train.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import time +import copy +import socket +import paddle +import argparse +import warnings + +import numpy as np +import model.loss as module_loss +import model.model as module_arch +import model.metric as module_metric +import data_loader.data_loaders as module_data + + +from pathlib import Path +from utils import set_seeds +from trainer import Trainer +from test import evaluation +from mergedeep import merge, Strategy +from parse_config import ConfigParser +from logger.log_parser import log_summary +from utils import compute_dims, compute_trn_config + +def run_exp(config): + warnings.filterwarnings('ignore') + logger = config.get_logger('train') + + expert_dims, raw_input_dims = compute_dims(config, logger) + trn_config = compute_trn_config(config) + + if config._args.group_seed: + seeds = [int(config._args.group_seed)] + else: + seeds = [int(x) for x in config._args.seeds.split(",")] + + for ii, seed in enumerate(seeds): + tic = time.time() + logger.info(f"{ii + 1}/{len(seeds)} Setting experiment random seed to {seed}") + set_seeds(seed) + config["seed"] = seed + + model = config.init( + name='arch', + module=module_arch, + expert_dims=expert_dims, + text_dim=config["experts"]["text_dim"], + ce_shared_dim=config["experts"].get("ce_shared_dim", None), + feat_aggregation=config["data_loader"]["args"]["feat_aggregation"], + ) + logger.info(model) + + data_loaders = config.init( + name='data_loader', + module=module_data, + logger=logger, + raw_input_dims=raw_input_dims, + text_feat=config["experts"]["text_feat"], + text_dim=config["experts"]["text_dim"], + text_agg=config["experts"]["text_agg"], + use_zeros_for_missing=config["experts"].get("use_zeros_for_missing", False), + eval_only=False, + ) + + loss = config.init(name="loss", module=module_loss) + metrics = [getattr(module_metric, met) for met in config['metrics']] + + lr_scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.0001, step_size=5, gamma=0.9) + optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler, weight_decay=1e-4, parameters=model.parameters(), grad_clip=paddle.nn.ClipGradByGlobalNorm(2)) + + trainer = Trainer( + model, + loss, + metrics, + optimizer, + config=config, + data_loaders=data_loaders, + lr_scheduler=lr_scheduler, + mini_train=config._args.mini_train, + visualizer=None, + val_freq=config["trainer"].get("val_freq", 1), + force_cpu_val=config.get("force_cpu_val", False), + skip_first_n_saves=config["trainer"].get("skip_first_n_saves", 0), + include_optim_in_save_model=config["trainer"].get("include_optim_in_save_model", 1), + cache_targets=set(config.get("cache_targets", [])), + ) + trainer.train() + best_model_path = config.save_dir / "trained_model.pdparams" + duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic)) + logger.info(f"Training took {duration}") + + # If multiple runs were conducted, report relevant statistics + if len(seeds) > 1: + log_summary( + logger=logger, + log_path=config.log_path, + 
eval_mode=config["eval_mode"], + fixed_num_epochs=config["trainer"]["epochs"], + ) + print(f"Log file stored at {config.log_path}") + + # Report the location of the "best" model of the final seeded run (here + # "best" corresponds to the model with the highest geometric mean over the + # R@1, R@5 and R@10 metrics when a validation set is used, or simply the final + # epoch of training for fixed-length schedules). + print(f"The best performing model can be found at {str(best_model_path)}") + + +def main(): + args = argparse.ArgumentParser(description='Main entry point for training') + args.add_argument('--config', help='config file path') + args.add_argument('--resume', help='path to latest model (default: None)') + args.add_argument('--mini_train', action="store_true") + args.add_argument('--group_id', help="if supplied, group these experiments") + args.add_argument('--disable_workers', action="store_true") + args.add_argument('--refresh_lru_cache', action="store_true") + args.add_argument('--train_single_epoch', action="store_true") + args.add_argument('--purge_exp_dir', action="store_true", + help="remove all previous experiments with the given config") + args.add_argument("--dbg", default="ipdb.set_trace") + args.add_argument("--custom_args", help="qualified key,val pairs") + + # Seeds can either be passed directly as a comma separated list at the command line, + # or individually for separate experiments as a group (used for slurm experiments) + seed_args = args.add_mutually_exclusive_group() + seed_args.add_argument('--seeds', default="0", help="comma separated list of seeds") + seed_args.add_argument('--group_seed', help="seed for group member") + args = ConfigParser(args) + os.environ["PYTHONBREAKPOINT"] = args._args.dbg + args["data_loader"]["args"]["refresh_lru_cache"] = args._args.refresh_lru_cache + msg = (f"Expected the number of training epochs ({args['trainer']['epochs']})" + f"to exceed the save period ({args['trainer']['save_period']}), otherwise" + " no checkpoints will be saved.") + assert args["trainer"]["epochs"] >= args["trainer"]["save_period"], msg + run_exp(config=args) + + +if __name__ == '__main__': + main() diff --git a/docs/src/applications/T2VLAD/trainer/__init__.py b/docs/src/applications/T2VLAD/trainer/__init__.py new file mode 100644 index 000000000..5c0a8a4a9 --- /dev/null +++ b/docs/src/applications/T2VLAD/trainer/__init__.py @@ -0,0 +1 @@ +from .trainer import * diff --git a/docs/src/applications/T2VLAD/trainer/trainer.py b/docs/src/applications/T2VLAD/trainer/trainer.py new file mode 100644 index 000000000..55f1d5011 --- /dev/null +++ b/docs/src/applications/T2VLAD/trainer/trainer.py @@ -0,0 +1,280 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
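Both `test.py` above and the trainer below avoid materialising the full query-by-video similarity matrix in a single forward pass: captions and videos are split into chunks of `tck`/`chk` items, a sub-matrix is computed for each pair of chunks, and the pieces are concatenated. The sketch below shows only that tiling pattern; `blockwise_sims` and `score_fn` are invented placeholders, whereas the real code calls the model on dict-valued minibatches:

```python
# Illustrative sketch of the block-wise evaluation used in test.py/trainer.py.
import paddle

def blockwise_sims(text_feats, video_feats, score_fn, tck=50, chk=50):
    rows = []
    for t0 in range(0, text_feats.shape[0], tck):        # loop over caption chunks
        cols = []
        for v0 in range(0, video_feats.shape[0], chk):   # loop over video chunks
            cols.append(score_fn(text_feats[t0:t0 + tck],
                                 video_feats[v0:v0 + chk]))
        rows.append(paddle.concat(cols, axis=1))          # stitch the video axis
    return paddle.concat(rows, axis=0)                    # stitch the caption axis

# dummy usage: plain dot products between random embeddings
texts, vids = paddle.randn([230, 16]), paddle.randn([120, 16])
sims = blockwise_sims(texts, vids,
                      lambda t, v: paddle.matmul(t, v, transpose_y=True))
print(sims.shape)   # [230, 120]
```

The chunk sizes (50 or 100 in the code here) only trade memory for a few extra forward passes; as long as the model scores each text-video pair independently, the concatenated result should match scoring everything at once.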
+ +import paddle +import numpy as np + +from base import BaseTrainer +from utils import memory_summary +from contextlib import contextmanager + + +def verbose(epoch, metrics, mode, name="TEST"): + r1, r5, r10, r50 = metrics["R1"], metrics["R5"], metrics["R10"], metrics["R50"] + msg = f"[{mode}]{name:s} epoch {epoch}, R@1: {r1:.1f}" + msg += f", R@5: {r5:.1f}, R@10 {r10:.1f}, R@50 {r50:.1f}" + msg += f"MedR: {metrics['MedR']:g}, MeanR: {metrics['MeanR']:.1f}" + print(msg) + + +@contextmanager +def ctxt_mgr(samples): + """Provide a context for managing temporary, cloned copies of retrieval + sample tensors. + + The rationale here is that to use nan-checking in the model (to validate the + positions of missing experts), we need to modify the underlying tensors. This + function lets the evaluation code run (and modify) temporary copies, without + modifying the originals. + """ + + exp_dict = samples["experts"].items() + experts = {key: val.clone() for key, val in exp_dict} + samples_ = { + "experts": experts, + "ind": samples["ind"], + "text": samples["text"], + "cap_id": samples["cap_id"], + "att_mask": samples["att_mask"], + } + if "text_token_mask" in samples: + samples_["text_token_mask"] = samples["text_token_mask"] + try: + yield samples_ + finally: + del samples_ + + +class Trainer(BaseTrainer): + """ + Trainer class + + Note: + Inherited from BaseTrainer. + """ + def __init__(self, model, loss, metrics, optimizer, config, data_loaders, + lr_scheduler, visualizer, skip_first_n_saves, + include_optim_in_save_model, force_cpu_val, cache_targets=set(), + num_keep_ckpts=3, mini_train=False, val_freq=1, skip_tboard=False): + super().__init__(model, loss, metrics, optimizer, config, mini_train=mini_train, + skip_tboard=skip_tboard, num_keep_ckpts=num_keep_ckpts) + self.config = config + self.cache_targets = cache_targets + self.data_loaders = data_loaders + self.lr_scheduler = lr_scheduler + self.mini_train = mini_train + self.len_epoch = len(self.data_loaders["train"]) + self.log_step = int(np.sqrt(data_loaders["train"].batch_size)) + self.visualizer = visualizer + self.force_cpu_val = force_cpu_val + self.val_freq = val_freq + self.skip_first_n_saves = skip_first_n_saves + self.include_optim_in_save_model = include_optim_in_save_model + self.seen = {"train": 0, "val": 0} + + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Current training epoch. + :return: A log that contains all information you want to save. + + Note: + If you have additional information to record, for example: + > additional_log = {"x": x, "y": y} + merge it with log before return. i.e. + > log = {**log, **additional_log} + > return log + + The metrics in log must have the key 'metrics'. 
+ """ + total_loss = 0 + self.model.train() + memory_summary() + + for batch_idx, minibatch in enumerate(self.data_loaders["train"]): + output = self.model(**minibatch) + if "retrieval" in self.data_loaders.dataloaders: + loss = self.loss(output["cross_view_conf_matrix"]) + else: + loss = self.loss(x=output["class_preds"], target=labels) + + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + sample_key = list(minibatch["experts"].keys())[0] + batch_size = minibatch["experts"][sample_key].shape[0] + self.seen["train"] += batch_size + + total_loss += loss.item() + + if batch_idx % self.log_step == 0: + prog = self._progress(batch_idx) + self.logger.info(f"Train Epoch: {epoch} {prog} Loss: {loss.item():.6f}") + + if batch_idx == self.len_epoch or (self.mini_train and batch_idx > 3): + break + + log = {'loss': total_loss / self.len_epoch} + if epoch % self.val_freq == 0: + nested_log, cached_preds = self._valid_epoch(epoch) + log.update(nested_log) + else: + nested_log, cached_preds = {}, None + self.logger.info(f"skipping val for epoch: {epoch}") + + self.lr_scheduler.step() + + self.logger.info(f"LR {self.lr_scheduler.get_lr()}") + return log, cached_preds + + def _valid_epoch(self, epoch): + """Validate model after an epoch of training and store results to disk. + + Args: + epoch (int): the current epoch + + Returns: + A log that contains information about validation + + NOTE: The validation metrics in log must have the key 'val_metrics'. + """ + self.model.eval() + cached_preds = {key: {"vid_name": [], "preds": [], "labels": []} + for key in self.cache_targets} + + with paddle.no_grad(): + if "retrieval" in self.data_loaders.dataloaders: + samples, meta = self.data_loaders["retrieval"] + sample_key = list(samples["experts"].keys())[0] + batch_size = samples["experts"][sample_key].shape[0] + self.seen["val"] += batch_size + num_queries = samples["text"].shape[0] * samples["text"].shape[1] + safe_queries = 1 + text_keys = ['text', 'cap_id', 'att_mask', 'text_token_mask'] + if num_queries > safe_queries: + chk = 50 + tck = 50 + if samples['text'].shape[0] % chk == 0: + vid_batch = samples['text'].shape[0] // chk + else: + vid_batch = samples['text'].shape[0] // chk + 1 + if samples['text'].shape[0] % tck == 0: + text_batch = samples['text'].shape[0] // tck + else: + text_batch = samples['text'].shape[0] // tck + 1 + + sub_sims = [] + for idx in range(text_batch): + if idx % 5 == 0: + print(idx,'/',text_batch) + sub_samples = {} + for key in text_keys: + sub_samples.update({key: samples[key][idx*tck:idx*tck+tck]}) + subsub_sims = [] + for vid in range(vid_batch): + sub_samples['experts'] = {} + sub_samples['ind'] = {} + for expert in samples['experts'].keys(): + sub_samples['experts'][expert] = samples['experts'][expert][vid*chk:vid*chk+chk] + sub_samples['ind'][expert] = samples['ind'][expert][vid*chk:vid*chk+chk] + with ctxt_mgr(sub_samples) as xx: + output = self.model(**xx) + subsub_sims.append(output["cross_view_conf_matrix"].cpu()) + + subsub_sims = paddle.concat(subsub_sims, axis=1) + sub_sims.append(subsub_sims) + + sims = paddle.concat(sub_sims, axis=0) + sims = paddle.to_tensor(sims, dtype='float32').cpu().numpy() + else: + with ctxt_mgr(samples) as xx: + output = self.model(**xx) + sims = paddle.to_tensor(output["cross_view_conf_matrix"], dtype='float32').cpu().numpy() + + # sample the loss (using only the first query for each video) + queries_per_vid = meta["query_masks"].shape[1] + sims_ = paddle.to_tensor(sims).reshape([-1, queries_per_vid, sims.shape[-1]]) 
+ loss = self.loss(sims_[:, 0, :]) + dataset = self.data_loaders.dataset_name + nested_metrics = {} + for metric in self.metrics: + metric_name = metric.__name__ + res = metric(sims, query_masks=meta["query_masks"]) + if metric_name == "mean_average_precision": + print(f"Epoch: {epoch}, mean AP: {res['mAP']}") + else: + verbose(epoch=epoch, metrics=res, name=dataset, mode=metric_name) + nested_metrics[metric_name] = res + + # TODO(Samuel) disabled visualisation for now, simple to add in later + num_test_caps = self.data_loaders.num_test_captions + if num_test_caps == 1 and meta["raw_captions"] is not None: + if self.visualizer is not None: + self.visualizer.visualize_ranking( + sims=sims, + meta=meta, + epoch=epoch, + nested_metrics=nested_metrics, + ) + return {"nested_val_metrics": nested_metrics}, cached_preds + + elif "val" in self.data_loaders.dataloaders: + metrics = [x() for x in self.metrics] + for batch_idx, minibatch in enumerate(self.data_loaders["val"]): + labels = minibatch.pop("labels") + vid_name = minibatch.pop("vid_name") + output = self.model(**minibatch) + if "val" in self.cache_targets: + cached_preds["val"]["vid_name"].append(vid_name) + cached_preds["val"]["preds"].append(output["class_preds"]) + + for metric in metrics: + metric.add(output=output["class_preds"], target=labels) + if batch_idx % self.log_step == 0: + prog = self._progress(batch_idx) + self.logger.info(f"Val Epoch: {epoch} {prog}") + + nested_metrics = {} + for metric in metrics: + if hasattr(metric, "topk"): + res = {f"top{key}": val for key, val in + zip(metric.topk, metric.value())} + nested_metrics["accuracy"] = res + else: + raise ValueError(f"unsupported mettric: {type(metric)}") + nested = {"nested_val_metrics": nested_metrics} + + for target in self.cache_targets - {"val"}: + for batch_idx, minibatch in enumerate(self.data_loaders["tiny"]): + if "labels" in minibatch: + cached_preds[target]["labels"].append(minibatch.pop("labels")) + cached_preds[target]["vid_name"].append(minibatch.pop("vid_name")) + output = self.model(**minibatch) + cached_preds[target]["preds"].append(output["class_preds"]) + + # aggregate all cached predictions + for target in self.cache_targets: + for key, val in cached_preds[target].items(): + cached_preds[key] = paddle.concat(val).cpu().numpy() + return nested, cached_preds + + def _progress(self, batch_idx): + base = '[{}/{} ({:.0f}%)]' + if hasattr(self.data_loaders, 'n_samples'): + current = batch_idx * self.data_loaders.batch_size + total = self.data_loaders.n_samples + else: + current = batch_idx + total = self.len_epoch + return base.format(current, total, 100.0 * current / total) diff --git a/docs/src/applications/T2VLAD/utils/__init__.py b/docs/src/applications/T2VLAD/utils/__init__.py new file mode 100644 index 000000000..46d3a156a --- /dev/null +++ b/docs/src/applications/T2VLAD/utils/__init__.py @@ -0,0 +1 @@ +from .util import * diff --git a/docs/src/applications/T2VLAD/utils/util.py b/docs/src/applications/T2VLAD/utils/util.py new file mode 100644 index 000000000..7a5899577 --- /dev/null +++ b/docs/src/applications/T2VLAD/utils/util.py @@ -0,0 +1,327 @@ +""" +Exclude from autoreload +%aimport -util.utils +""" +import os +import json +import random +from pathlib import Path +from datetime import datetime +from typing import List +from itertools import repeat +from collections import OrderedDict + +import numpy as np +import paddle +import psutil +import humanize +from PIL import Image +from typeguard import typechecked + + +@typechecked +def 
filter_cmd_args(cmd_args: List[str], remove: List[str]) -> List[str]: + drop = [] + for key in remove: + if key not in cmd_args: + continue + pos = cmd_args.index(key) + drop.append(pos) + if len(cmd_args) > (pos + 1) and not cmd_args[pos + 1].startswith("--"): + drop.append(pos + 1) + for pos in reversed(drop): + cmd_args.pop(pos) + return cmd_args + + +@typechecked +def set_seeds(seed: int): + """Set seeds for randomisation libraries. + + Args: + seed: the seed value + """ + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + +def memory_summary(): + vmem = psutil.virtual_memory() + msg = ( + f">>> Currently using {vmem.percent}% of system memory " + f"{humanize.naturalsize(vmem.used)}/{humanize.naturalsize(vmem.available)}" + ) + print(msg) + + +def flatten_dict(x, keysep="-"): + flat_dict = {} + for key, val in x.items(): + if isinstance(val, dict): + flat_subdict = flatten_dict(val) + flat_dict.update({f"{key}{keysep}{subkey}": subval + for subkey, subval in flat_subdict.items()}) + else: + flat_dict.update({key: val}) + return flat_dict + + +def expert_tensor_storage(experts, feat_aggregation): + expert_storage = {"fixed": set(), "variable": set(), "flaky": set()} + # fixed_sz_experts, variable_sz_experts, flaky_experts = set(), set(), set() + for expert, config in feat_aggregation.items(): + if config["temporal"] in {"vlad", "fixed_seg"}: + expert_storage["variable"].add(expert) + elif config["temporal"] in {"avg", "max", "avg-max", "max-avg", "avg-max-ent", + "max-avg-ent"}: + expert_storage["fixed"].add(expert) + else: + raise ValueError(f"unknown temporal strategy: {config['temporal']}") + # some "flaky" experts are only available for a fraction of videos - we need + # to pass this information (in the form of indices) into the network for any + # experts present in the current dataset + if config.get("flaky", False): + expert_storage["flaky"].add(expert) + + # we only allocate storage for experts used by the current dataset + for key, value in expert_storage.items(): + expert_storage[key] = value.intersection(set(experts)) + return expert_storage + + +def read_json(fname): + with fname.open('rt') as handle: + return json.load(handle, object_hook=OrderedDict) + + +def path2str(x): + """Recursively convert pathlib objects to strings to enable serialization""" + for key, val in x.items(): + if isinstance(val, dict): + path2str(val) + elif isinstance(val, Path): + x[key] = str(val) + + +def write_json(content, fname, paths2strs=False): + if paths2strs: + path2str(content) + with fname.open('wt') as handle: + json.dump(content, handle, indent=4, sort_keys=False) + + +def inf_loop(data_loader): + ''' wrapper function for endless data loader. 
''' + for loader in repeat(data_loader): + yield from loader + + +class HashableDict(dict): + def __hash__(self): + return hash(frozenset(self)) + + +class HashableOrderedDict(dict): + def __hash__(self): + return hash(frozenset(self)) + + +def compute_trn_config(config, logger=None): + trn_config = {} + feat_agg = config["data_loader"]["args"]["feat_aggregation"] + for static_expert in feat_agg.keys(): + if static_expert in feat_agg: + if "trn_seg" in feat_agg[static_expert].keys(): + trn_config[static_expert] = feat_agg[static_expert]["trn_seg"] + return trn_config + + +def compute_dims(config, logger=None): + if logger is None: + logger = config.get_logger('utils') + + experts = config["experts"] + # TODO(Samuel): clean up the logic since it's a little convoluted + ordered = sorted(config["experts"]["modalities"]) + + if experts["drop_feats"]: + to_drop = experts["drop_feats"].split(",") + logger.info(f"dropping: {to_drop}") + ordered = [x for x in ordered if x not in to_drop] + + feat_agg = config["data_loader"]["args"]["feat_aggregation"] + dims = [] + arch_args = config["arch"]["args"] + vlad_clusters = arch_args["vlad_clusters"] + for expert in ordered: + temporal = feat_agg[expert]["temporal"] + if expert == "face": + in_dim, out_dim = experts["face_dim"], experts["face_dim"] + elif expert == "features_scene" and temporal == "vlad": + in_dim, out_dim = 2208 * vlad_clusters["features_scene"], 2208 + elif expert == "features_s3d" and temporal == "vlad": + in_dim, out_dim = 1024 * vlad_clusters["features_s3d"], 1024 + elif expert == "features_flow" and temporal == "vlad": + in_dim, out_dim = 1024 * vlad_clusters["features_flow"], 1024 + elif expert == "features_rgb" and temporal == "vlad": + in_dim, out_dim = 2048 * vlad_clusters["features_rgb"], 2048 + elif expert == "features_ocr" and temporal == "vlad": + in_dim, out_dim = 300 * vlad_clusters["features_ocr"], 300 + elif expert == "features_face" and temporal == "vlad": + in_dim, out_dim = 512 * vlad_clusters["features_face"], 512 + elif expert == "features_speech" and temporal == "vlad": + in_dim, out_dim = 300 * vlad_clusters["features_speech"], 300 + elif expert == "features_audio" and temporal == "vlad": + in_dim, out_dim = 128 * vlad_clusters["features_audio"], 128 + elif expert == "audio" and temporal == "vlad": + in_dim, out_dim = 128 * vlad_clusters["audio"], 128 + elif expert == "audio" and temporal == "vlad": + in_dim, out_dim = 128 * vlad_clusters["audio"], 128 + elif expert == "speech" and temporal == "vlad": + in_dim, out_dim = 300 * vlad_clusters["speech"], 300 + elif expert == "ocr" and temporal == "vlad": + in_dim, out_dim = 300 * vlad_clusters["ocr"], 300 + elif expert == "detection": + # allow for avg pooling + det_clusters = arch_args["vlad_clusters"].get("detection", 1) + in_dim, out_dim = 1541 * det_clusters, 1541 + elif expert == "detection-sem": + if config["data_loader"]["args"].get("spatial_feats", False): + base = 300 + 16 + else: + base = 300 + 5 + det_clusters = arch_args["vlad_clusters"].get("detection-sem", 1) + in_dim, out_dim = base * det_clusters, base + elif expert == "openpose": + base = 54 + det_clusters = arch_args["vlad_clusters"].get("openpose", 1) + in_dim, out_dim = base * det_clusters, base + else: + common_dim = feat_agg[expert]["feat_dims"][feat_agg[expert]["type"]] + # account for aggregation of multilpe forms (e.g. 
avg + max pooling) + common_dim = common_dim * len(feat_agg[expert]["temporal"].split("-")) + in_dim, out_dim = common_dim, common_dim + + # For the CE architecture, we need to project all features to a common + # dimensionality + if arch_args.get("mimic_ce_dims", False): + out_dim = experts["ce_shared_dim"] + + dims.append((expert, (in_dim, out_dim))) + expert_dims = OrderedDict(dims) + + if vlad_clusters["text"] == 0: + msg = "vlad can only be disabled for text with single tokens" + assert config["data_loader"]["args"]["max_tokens"]["text"] == 1, msg + + if config["experts"]["text_agg"] == "avg": + msg = "averaging can only be performed with text using single tokens" + assert config["arch"]["args"]["vlad_clusters"]["text"] == 0 + assert config["data_loader"]["args"]["max_tokens"]["text"] == 1 + + # To remove the dependency of dataloader on the model architecture, we create a + # second copy of the expert dimensions which accounts for the number of vlad + # clusters + raw_input_dims = OrderedDict() + for expert, dim_pair in expert_dims.items(): + raw_dim = dim_pair[0] + if expert in {"audio", "speech", "ocr", "detection", "detection-sem", "openpose", "features_audio", "features_speech", "features_face", "features_ocr", "features_rgb", "features_flow", "features_s3d", "features_scene", + "speech.mozilla.0"}: + if feat_agg[expert]["temporal"] == "vlad": + raw_dim = raw_dim // vlad_clusters.get(expert, 1) + raw_input_dims[expert] = raw_dim + + return expert_dims, raw_input_dims + + +def ensure_tensor(x): + if not isinstance(x, paddle.Tensor): #if not isinstance(x, torch.Tensor): + x = paddle.to_tensor(x) # x = torch.from_numpy(x) + return x + + +class Timer: + def __init__(self): + self.cache = datetime.now() + + def check(self): + now = datetime.now() + duration = now - self.cache + self.cache = now + return duration.total_seconds() + + def reset(self): + self.cache = datetime.now() + + +def tensor2im(input_image, imtype=np.uint8): + """"Converts a Tensor array into a numpy image array. 
+ + Parameters: + input_image (tensor) -- the input image tensor array + imtype (type) -- the desired type of the converted numpy array + """ + if not isinstance(input_image, np.ndarray): + if isinstance(input_image, paddle.Tensor): #if isinstance(input_image, torch.Tensor): # get the data from a variable + image_tensor = input_image #image_tensor = input_image.data + else: + return input_image + # convert it into a numpy array + image_numpy = image_tensor[0].cpu().float().numpy() + if image_numpy.shape[0] == 1: # grayscale to RGB + image_numpy = np.tile(image_numpy, (3, 1, 1)) + # post-processing: tranpose and scaling + image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0 + else: # if it is a numpy array, do nothing + image_numpy = input_image + return image_numpy.astype(imtype) + + +def save_image(image_numpy, image_path): + """Save a numpy image to the disk + + Parameters: + image_numpy (numpy array) -- input numpy array + image_path (str) -- the path of the image + """ + image_pil = Image.fromarray(image_numpy) + image_pil.save(image_path) + + +def print_numpy(x, val=True, shp=False): + """Print the mean, min, max, median, std, and size of a numpy array + + Parameters: + val (bool) -- if print the values of the numpy array + shp (bool) -- if print the shape of the numpy array + """ + x = x.astype(np.float64) + if shp: + print('shape,', x.shape) + if val: + x = x.flatten() + print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % ( + np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x))) + + +def mkdirs(paths): + """create empty directories if they don't exist + + Parameters: + paths (str list) -- a list of directory paths + """ + if isinstance(paths, list) and not isinstance(paths, str): + for path in paths: + mkdir(path) + else: + mkdir(paths) + + +def mkdir(path): + """create a single empty directory if it didn't exist + + Parameters: + path (str) -- a single directory path + """ + if not os.path.exists(path): + os.makedirs(path) diff --git a/docs/src/applications/TableTennis/ActionRecognition/README.md b/docs/src/applications/TableTennis/ActionRecognition/README.md new file mode 100644 index 000000000..bd60bc8ef --- /dev/null +++ b/docs/src/applications/TableTennis/ActionRecognition/README.md @@ -0,0 +1,98 @@ +# 乒乓球动作识别模型 + + +## 内容 +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型推理](#模型推理) +- [模型优化](#模型优化) +- [模型部署](#模型部署) +- [参考论文](#参考论文) + +在开始使用之前,您需要按照以下命令安装额外的依赖包: +```bash +python -m pip install imageio +``` + +## 模型简介 +该代码库用于乒乓球动作识别, 基于paddle2.2版本开发,结合PaddleVideo中的VideoSwinTransformer模型,对给定的乒乓球视频进行动作分类。 +主要分为如下几步 + - 图像特征抽取,SwinTransformer3D + - 动作分类,I3DHead + + +## 数据准备 + +TODO + +## 模型训练 +主要代码来自VideoSwin模型:[VideoSwin](../../../docs/zh-CN/model_zoo/recognition/videoswin.md) + +1. 使用VideoSwin在K400上的预训练模型基础上进行finetune,因此首先下载K400的预训练模型并放置到`data`目录下 + ```bash + wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams + ``` + +2. 
使用`TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml`配置文件进行训练 + 训练启动命令如下: + ```bash + # 单卡 + python3.7 -u main.py --amp --validate -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml + # 多卡 + python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_tabletennis main.py --amp --validate -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml + ``` + +## 模型评估 + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_tabletennis main.py --test -c configs/recognition/video_swin_transformer/videoswin_tabletennis.yaml -w "output/VideoSwin_TableTennis/VideoSwin_TableTennis_best.pdparams" +``` + +## 模型推理 + +我们提供了一个在乒乓球数据集上训练好的模型以及一个乒乓球样例的视频pkl文件,以供测试 +``` +wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_tennis.pdparams # 下载乒乓球数据集上训练好的模型 +wget -P data/ https://videotag.bj.bcebos.com/Data/example_tennis.pkl # 下载乒乓球样例输入视频pkl文件 +``` + +### 导出推理模型 +``` +python3.7 tools/export_model.py -c applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml \ + -p output/VideoSwin_TableTennis/VideoSwin_TableTennis_best.pdparams \ + -o inference/VideoSwin_TableTennis +``` +上述命令会根据传入的`.pdparams`模型,在`inference/VideoSwin_TableTennis`文件夹下生成推理模型,主要包括3个文件:`VideoSwin_TableTennis.pdiparams`、`VideoSwin_TableTennis.pdmodel`、`VideoSwin_TableTennis.info` + +### 使用推理模型 +测试文件使用`.pkl`文件,其包含了已抽取的用于预测的乒乓球视频帧。 +运行预测代码 +```bash +python3.7 tools/predict.py --input_file data/example_tennis_7.pkl \ + --config applications/TableTennis/ActionRecognition/configs/videoswin_tabletennis.yaml \ + --model_file inference/VideoSwin_TableTennis/VideoSwin_TableTennis.pdmodel \ + --params_file inference/VideoSwin_TableTennis/VideoSwin_TableTennis.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` +执行以上命令会产出一个原视频叠加预测结果文本(Top1类别+概率)的gif图片,保存在本目录的results文件夹下,gif文件名与输入的pkl文件名相同。 +效果如下图: + +![example_7.gif](results/example_tennis_7.gif) + + +## 模型优化 +在实际使用场景中可根据视频内容尝试优化策略 +- 可根据动作持续时间的长短,调整采样的段数num_seg和段内采样的帧数seg_len +- 可以根据数据集大小调整模型训练的超参数,包括权重衰减、DropOut概率、学习率、更换优化器等,以获得更优的结果。 +- 本代码的backbone部分可以作为视频特征提取模块,代替其它的动作识别backbone,以获得表征能力更强的视频特征,以提升整体任务的精度。 + + +## 模型部署 +TODO + + +## 参考论文 + +- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei diff --git a/docs/src/applications/TableTennis/datasets/script/submission_format_transfer.py b/docs/src/applications/TableTennis/datasets/script/submission_format_transfer.py new file mode 100644 index 000000000..ecd3487c2 --- /dev/null +++ b/docs/src/applications/TableTennis/datasets/script/submission_format_transfer.py @@ -0,0 +1,64 @@ +import json +import math + +with open('/workspace/bianjiang03/DATA/Output_for_bmn/prop.json') as f: + data = json.load(f) +f.close() + +transferred = dict() + +# 25 fps for all videos +fps = 25 + +for item in data: + temp = [] + for seg in item['bmn_results']: + temp_dict = { + 'score': seg['score'], + 'segment': + [round(seg['start'] / fps, 2), + round(seg['end'] / fps, 2)] + } + temp.append(temp_dict) + transferred[item['video_name']] = temp + +target_format = { + 'version': 'A-test', + 'results': transferred, + 'external_data': {} +} + +jsonString = json.dumps(target_format, indent=4, ensure_ascii=False) +jsonFile = open('/workspace/bianjiang03/DATA/Output_for_bmn/submission.json', + 'w') +jsonFile.write(jsonString) +jsonFile.close() + +# target format +# { +# "version": NA, +# "results": { +# "name_of_clip_1": [ +# { +# "score": 
0.64, +# "segment": [2.33,3.15] +# }, +# { +# "score": 0.77, +# "segment": [7.64, 7.84] +# } +# ], +# "name_of_clip_2": [ +# { +# "score": 0.84, +# "segment": [9.73,10.15] +# }, +# { +# "score": 0.87, +# "segment": [17.11, 17.84] +# } +# ], +# ... +# } +# "external_data": {} +# } diff --git a/docs/src/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py b/docs/src/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py new file mode 100644 index 000000000..9da8e12a5 --- /dev/null +++ b/docs/src/applications/TableTennis/extractor/extract_bmn_for_tabletennis.py @@ -0,0 +1,93 @@ +#!./python27-gcc482/bin/python +# coding: utf-8 +""" +BAIDU CLOUD action +""" + +import os +import sys +import pickle +import json +import time +import shutil + +import numpy as np + +sys.path.append( + "/workspace/bianjiang03/App_TableTennis/PaddleVideo/FootballAction/predict/action_detect" +) +import models.bmn_infer as prop_model +from utils.preprocess import get_images +from utils.config_utils import parse_config, print_configs +import utils.config_utils as config_utils + +import logger + +logger = logger.Logger() + + +def load_model(cfg_file="configs/configs.yaml"): + """ + load_model + """ + logger.info("load model ... ") + global infer_configs + infer_configs = parse_config(cfg_file) + print_configs(infer_configs, "Infer") + + t0 = time.time() + global prop_model + prop_model = prop_model.InferModel(infer_configs) + t1 = time.time() + logger.info("step0: load model time: {} min\n".format((t1 - t0) * 1.0 / 60)) + + +def video_classify(video_name, dataset_dir): + """ + extract_feature + """ + logger.info('predict ... ') + logger.info(video_name) + + # step 1: extract feature + + feature_path = dataset_dir + video_name + video_features = pickle.load(open(feature_path, 'rb')) + print('===video_features===', video_name) + + # step2: get proposal + t0 = time.time() + bmn_results = prop_model.predict(infer_configs, material=video_features) + t1 = time.time() + logger.info(np.array(bmn_results).shape) + logger.info("step2: proposal time: {} min".format((t1 - t0) * 1.0 / 60)) + + return bmn_results + + +if __name__ == '__main__': + dataset_dir = '/workspace/bianjiang03/DATA/Features_competition_test_A/' + output_dir = '/workspace/bianjiang03/DATA' + if not os.path.exists(output_dir + '/Output_for_bmn'): + os.mkdir(output_dir + '/Output_for_bmn') + results = [] + + load_model() + + directory = os.fsencode(dataset_dir) + + for file in os.listdir(directory): + filename = os.fsdecode(file) + bmn_results = video_classify(filename, dataset_dir) + results.append({ + 'video_name': filename.split('.pkl')[0], + 'num_proposal': len(bmn_results), + 'bmn_results': bmn_results + }) + + with open(output_dir + '/Output_for_bmn/prop.json', 'w', + encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + + print('Done with the inference!') diff --git a/docs/src/applications/TableTennis/fix_bad_label.py b/docs/src/applications/TableTennis/fix_bad_label.py new file mode 100644 index 000000000..3d99ed142 --- /dev/null +++ b/docs/src/applications/TableTennis/fix_bad_label.py @@ -0,0 +1,37 @@ +import copy +import json +import re +import os + +url = '/home/aistudio/work/BMN/Input_for_bmn/feature/' +directory = os.fsencode(url) +count = 0 +target_set = [] + +for file in os.listdir(directory): + filename = os.fsdecode(file) + target_name = filename.split('.npy')[0] + target_set.append(target_name) + count += 1 +print('Feature size:', len(target_set)) + +with 
open('/home/aistudio/work/BMN/Input_for_bmn/label.json') as f: + data = json.load(f) + +delet_set = [] +for key in data.keys(): + if not key in target_set: + delet_set.append(key) + +print('(Label) Original size:', len(data)) +print('(Label) Deleted size:', len(delet_set)) + +for item in delet_set: + data.pop(item, None) + +print('(Label) Fixed size:', len(data)) + +jsonString = json.dumps(data, indent=4, ensure_ascii=False) +jsonFile = open('/home/aistudio/work/BMN/Input_for_bmn/label_fixed.json', 'w') +jsonFile.write(jsonString) +jsonFile.close() diff --git a/docs/src/applications/TableTennis/get_instance_for_bmn.py b/docs/src/applications/TableTennis/get_instance_for_bmn.py new file mode 100644 index 000000000..40e681e70 --- /dev/null +++ b/docs/src/applications/TableTennis/get_instance_for_bmn.py @@ -0,0 +1,227 @@ +""" +get instance for bmn +使用winds=8的滑窗,将所有子窗口的长度之和小于winds的进行合并 +合并后,父窗口代表bmn训练数据,子窗口代表tsn训练数据 +""" +import os +import sys +import json +import random +import pickle +import numpy as np +import math + +# for table tennis +bmn_window = 8 +dataset = "/home/aistudio/work/BMN/" +feat_dir = dataset + '/Features_example' +out_dir = dataset + '/Input_for_bmn' +label_files = { + 'train': 'label_cls14_small_train.json', + 'validation': 'label_cls14_small_test.json' +} + +global fps + + +def gen_gts_for_bmn(gts_data): + """ + @param, gts_data, original gts for action detection + @return, gts_bmn, output gts dict for bmn + """ + fps = gts_data['fps'] + gts_bmn = {'fps': fps, 'gts': []} + for sub_item in gts_data['gts']: + url = sub_item['url'] + + max_length = sub_item['total_frames'] + + gts_bmn['gts'].append({ + 'url': url, + 'total_frames': max_length, + 'root_actions': [] + }) + sub_actions = sub_item['actions'] + # 跳过没有动作的片段 + if len(sub_actions) == 0: + continue + # duration > bmn_window, 动作持续时间大于bmn_windows,直接删除 + for idx, sub_action in enumerate(sub_actions): + if sub_action['end_id'] - sub_action['start_id'] > bmn_window: + sub_actions.pop(idx) + + # 【滑动窗口,把每一个视频里的动作片段提取出来】 + root_actions = [sub_actions[0]] + # before_id, 前一动作的最后一帧 + # after_id, 后一动作的第一帧 + before_id = 0 + for idx in range(1, len(sub_actions)): + cur_action = sub_actions[idx] + duration = (cur_action['end_id'] - root_actions[0]['start_id']) + if duration > bmn_window: # windows只能包住一个动作就包,包不住就包多个 + after_id = cur_action['start_id'] + gts_bmn['gts'][-1]['root_actions'].append({ + 'before_id': + before_id, + 'after_id': + after_id, + 'actions': + root_actions + }) + before_id = root_actions[-1]['end_id'] #更新滑窗 + root_actions = [cur_action] + else: + root_actions.append(cur_action) + if idx == len(sub_actions) - 1: + after_id = max_length + gts_bmn['gts'][-1]['root_actions'].append({ + 'before_id': + before_id, + 'after_id': + after_id, + 'actions': + root_actions + }) + + return gts_bmn + + +def combile_gts(gts_bmn, gts_process, mode): + """ + 1、bmn_window 范围内只有一个动作,只取一个目标框 + 2、bmn_window 范围内有多个动作,取三个目标框(第一个动作、最后一个动作、所有动作) + """ + global fps + fps = gts_process['fps'] + duration_second = bmn_window * 1.0 + duration_frame = bmn_window * fps + feature_frame = duration_frame + for item in gts_process['gts']: + url = item['url'] + basename = os.path.basename(url).split('.')[0] + root_actions = item['root_actions'] + # 把每一个视频里的动作片段提取出来 + for root_action in root_actions: + segments = [] + # all actions + segments.append({ + 'actions': root_action['actions'], + 'before_id': root_action['before_id'], + 'after_id': root_action['after_id'] + }) + if len(root_action['actions']) > 1: #如果有多个动作,则第一个动作和最后一个动作,额外添加一次 + # first 
action + segments.append({ + 'actions': [root_action['actions'][0]], + 'before_id': + root_action['before_id'], + 'after_id': + root_action['actions'][1]['start_id'] + }) + # last action + segments.append({ + 'actions': [root_action['actions'][-1]], + 'before_id': + root_action['actions'][-2]['end_id'], + 'after_id': + root_action['after_id'] + }) + + # 把动作片段处理成window size大小,以适配BMN输入 + for segment in segments: + before_id = segment['before_id'] + after_id = segment['after_id'] + actions = segment['actions'] + # before_id到after_id太长了,从里面取window_size帧,要先确定一个起始点,然后动作都要包住 + box0 = max(actions[-1]['end_id'] - bmn_window, + before_id) #确定起始点 + box1 = min(actions[0]['start_id'], + after_id - bmn_window) #确实起始点 + if box0 <= box1: # 一次检查 + if int(box0) - int(box1) == 0: + cur_start = box0 + else: + box0 = math.ceil(box0) + box1 = int(box1) + cur_start = random.randint(box0, box1) + cur_end = cur_start + bmn_window + cur_start = round(cur_start, 2) + cur_end = round(cur_end, 2) + name = '{}_{}_{}'.format(basename, cur_start, cur_end) + annotations = [] + for action in actions: + label = str(1.0 * action['label_ids'][0]) + label_name = action['label_names'][0] + seg0 = 1.0 * round((action['start_id'] - cur_start), + 2) #存储的是到开始位置(时间: s)的距离 + seg1 = 1.0 * round((action['end_id'] - cur_start), 2) + annotations.append({ + 'segment': [seg0, seg1], + 'label': label, + 'label_name': label_name + }) + gts_bmn[name] = { + 'duration_second': duration_second, + 'duration_frame': duration_frame, + 'feature_frame': feature_frame, + 'subset': mode, + 'annotations': annotations + } + + return gts_bmn + + +def save_feature_to_numpy(gts_bmn, folder): + global fps + print('save feature for bmn ...') + if not os.path.exists(folder): + os.mkdir(folder) + process_gts_bmn = {} + miss = 0 + for item, value in gts_bmn.items(): + # split to rsplit 针对文件命名修改 + basename, start_id, end_id = item.rsplit('_', 2) + if not basename in process_gts_bmn: + process_gts_bmn[basename] = [] + process_gts_bmn[basename].append({ + 'name': item, + 'start': float(start_id), + 'end': float(end_id) + }) + for item, values in process_gts_bmn.items(): + feat_path = os.path.join(feat_dir, item + '.pkl') + feature_video = pickle.load(open(feat_path, 'rb'))['image_feature'] + for value in values: + save_cut_name = os.path.join(folder, value['name']) + a, b, c = save_cut_name.rsplit('_', 2) + if float(b) > 360: + print(b) + start_frame = round(value['start'] * fps) + end_frame = round(value['end'] * fps) + if end_frame > len(feature_video): + miss += 1 + continue + feature_cut = [ + feature_video[i] for i in range(start_frame, end_frame) + ] + np_feature_cut = np.array(feature_cut, dtype=np.float32) + np.save(save_cut_name, np_feature_cut) + + print('miss number (broken sample):', miss) + + +if __name__ == "__main__": + if not os.path.exists(out_dir): + os.mkdir(out_dir) + gts_bmn = {} + for item, value in label_files.items(): + label_file = os.path.join(dataset, value) + gts_data = json.load(open(label_file, 'rb')) + gts_process = gen_gts_for_bmn(gts_data) + gts_bmn = combile_gts(gts_bmn, gts_process, item) + + with open(out_dir + '/label.json', 'w', encoding='utf-8') as f: + data = json.dumps(gts_bmn, indent=4, ensure_ascii=False) + f.write(data) + + save_feature_to_numpy(gts_bmn, out_dir + '/feature') diff --git a/docs/src/applications/TableTennis/gts_format_transfer.py b/docs/src/applications/TableTennis/gts_format_transfer.py new file mode 100644 index 000000000..ad1c51ecf --- /dev/null +++ 
b/docs/src/applications/TableTennis/gts_format_transfer.py @@ -0,0 +1,12 @@ +import json + +with open('/home/aistudio/work/BMN/Input_for_bmn/label_fixed.json') as f: + data = json.load(f) +f.close() + +target_format = {'taxonomy': None, 'database': data, 'version': None} + +jsonString = json.dumps(target_format, indent=4, ensure_ascii=False) +jsonFile = open('/home/aistudio/work/BMN/Input_for_bmn/label_gts.json', 'w') +jsonFile.write(jsonString) +jsonFile.close() diff --git a/docs/src/applications/TableTennis/predict/action_detect/action.py b/docs/src/applications/TableTennis/predict/action_detect/action.py new file mode 100644 index 000000000..c1776981f --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/action.py @@ -0,0 +1,186 @@ +#!./python27-gcc482/bin/python +# coding: utf-8 +""" +BAIDU CLOUD action +""" + +import os +import sys +import pickle +import json +import time +import functools + +import numpy as np + +from utils.preprocess import get_images +from utils.config_utils import parse_config, print_configs +import mfcc.feature_extractor as mfcc_extractor + +import models.pptsm_infer as image_model +import models.audio_infer as audio_model +import models.bmn_infer as prop_model +import models.lstm_infer as classify_model + +import logger + +logger = logger.Logger() + + +def record_time_info(func): + """decorator func to log cost time for func + """ + @functools.wraps(func) + def timer(*args): + """log cost time for func + """ + logger.info("function [{}] processing ...".format(func.__name__)) + start_time = time.time() + retval = func(*args) + cost_time = round(time.time() - start_time, 5) + logger.info("function [{}] run time: {:.2f} min".format( + func.__name__, cost_time / 60)) + return retval + + return timer + + +class ActionDetection(object): + """ModelPredict""" + def __init__(self, cfg_file="configs/configs.yaml"): + cfg = parse_config(cfg_file) + self.configs = cfg + print_configs(self.configs, "Infer") + + name = 'COMMON' + self.DEBUG = cfg[name]['DEBUG'] + self.BMN_ONLY = cfg[name]['BMN_ONLY'] + self.LSTM_ONLY = cfg[name]['LSTM_ONLY'] + self.PCM_ONLY = cfg[name]['PCM_ONLY'] + if self.LSTM_ONLY: + self.prop_dict = {} + for dataset in ['EuroCup2016']: + prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format( + dataset) + json_data = json.load(open(prop_json, 'r')) + for item in json_data: + basename = prop_json.replace('feature_bmn/prop.json', 'mp4') + basename = basename + '/' + item['video_name'] + '.mp4' + self.prop_dict[basename] = item['bmn_results'] + + @record_time_info + def load_model(self): + """ + load_model + """ + if not self.DEBUG: + self.image_model = image_model.InferModel(self.configs) + if not self.PCM_ONLY: + self.audio_model = audio_model.InferModel(self.configs) + + if not self.LSTM_ONLY: + self.prop_model = prop_model.InferModel(self.configs) + + if not self.BMN_ONLY: + self.classify_model = classify_model.InferModel(self.configs) + + logger.info("==> Action Detection prepared.") + + @record_time_info + def infer(self, imgs_path, pcm_path, fps=5): + """ + extract_feature + """ + self.imgs_path = imgs_path + self.pcm_path = pcm_path + self.configs['COMMON']['fps'] = fps + + logger.info("==> input video {}".format(os.path.basename( + self.imgs_path))) + + # step 1: extract feature + video_features = self.extract_feature() + + # step2: get proposal + bmn_results = self.extract_proposal(video_features) + + # step3: classify + material = {'feature': video_features, 'proposal': bmn_results} + action_results = 
self.video_classify(material) + + return bmn_results, action_results + + @record_time_info + def video_classify(self, material): + """video classify""" + if self.BMN_ONLY: + return [] + action_results = self.classify_model.predict(self.configs, + material=material) + logger.info('action shape {}'.format(np.array(action_results).shape)) + return action_results + + @record_time_info + def extract_proposal(self, video_features): + """extract proposal""" + if self.LSTM_ONLY: + basename = self.imgs_path.replace('frames', 'mp4') + '.mp4' + bmn_results = self.prop_dict[basename] + return bmn_results + bmn_results = self.prop_model.predict(self.configs, + material=video_features) + logger.info('proposal shape {}'.format(np.array(bmn_results).shape)) + return bmn_results + + @record_time_info + def extract_feature(self): + """extract feature""" + if not self.DEBUG: + image_path_list = get_images(self.imgs_path) + self.configs['PPTSM']['frame_list'] = image_path_list + self.configs['AUDIO']['pcm_file'] = self.pcm_path + image_features = self.image_model.predict(self.configs) + if self.PCM_ONLY: + sample_rate = self.configs['AUDIO']['sample_rate'] + pcm_features = mfcc_extractor.extract_pcm( + self.pcm_path, sample_rate) + audio_features = [] + else: + audio_features, pcm_features = self.audio_model.predict( + self.configs) + + np_image_features = np.array(image_features, dtype=np.float32) + np_audio_features = np.array(audio_features, dtype=np.float32) + np_pcm_features = np.array(pcm_features, dtype=np.float32) + + video_features = { + 'image_feature': np_image_features, + 'audio_feature': np_audio_features, + 'pcm_feature': np_pcm_features + } + else: + feature_path = self.imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + logger.info("feature shape {} {} {}".format( + video_features['image_feature'].shape, + video_features['audio_feature'].shape, + video_features['pcm_feature'].shape)) + + return video_features + + +if __name__ == '__main__': + + model_predict = ActionDetection(cfg_file="../configs/configs.yaml") + model_predict.load_model() + + imgs_path = "/home/work/datasets/EuroCup2016/frames/1be705a8f67648da8ec4b4296fa80895" + pcm_path = "/home/work/datasets/EuroCup2016/pcm/1be705a8f67648da8ec4b4296fa80895.pcm" + + bmn_results, action_results = model_predict.infer(imgs_path, pcm_path) + results = {'bmn_results': bmn_results, 'action_results': action_results} + + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) diff --git a/docs/src/applications/TableTennis/predict/action_detect/logger.py b/docs/src/applications/TableTennis/predict/action_detect/logger.py new file mode 100644 index 000000000..5d1c3bb9a --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/logger.py @@ -0,0 +1,24 @@ +""" +logger +""" +import os +import logging + + +class Logger(logging.Logger): + """Customized logger for news stripper + """ + def __init__(self): + super(Logger, self).__init__(self) + if not os.path.exists('logs'): + os.mkdir('logs') + handler = logging.FileHandler("logs/action_detect.log") + # handler.setLevel(logging.DEBUG) + handler.setLevel(logging.INFO) + + format = "%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d %(message)s" + datefmt = "%y-%m-%d %H:%M:%S" + + formatter = logging.Formatter(format, datefmt) + handler.setFormatter(formatter) + self.addHandler(handler) diff --git 
a/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py b/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py new file mode 100755 index 000000000..505f923cc --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/mfcc/feature_extractor.py @@ -0,0 +1,183 @@ +""" +audio feature extract +""" +# coding: utf-8 +import os +import numpy as np +import pickle +import mfcc.vgg_params as vgg_params +import sys + + +def frame(data, window_length, hop_length): + """ + frame + """ + num_samples = data.shape[0] + #print("window_length , hop_length", window_length, hop_length) + #print("num_sample = ", num_samples) + num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) + #print(" num_frames = ", num_frames) + shape = (num_frames, window_length) + data.shape[1:] + #print(" shape = ", shape) + strides = (data.strides[0] * hop_length, ) + data.strides + #print("data.strides = ", data.strides) + #print("strides = ", strides) + return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) + + +def periodic_hann(window_length): + """ + periodic_hann + """ + return 0.5 - (0.5 * + np.cos(2 * np.pi / window_length * np.arange(window_length))) + + +def stft_magnitude(signal, fft_length, hop_length=None, window_length=None): + """ + stft_magnitude + """ + frames = frame(signal, window_length, hop_length) + window = periodic_hann(window_length) + windowed_frames = frames * window + return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) + + +_MEL_BREAK_FREQUENCY_HERTZ = 700.0 +_MEL_HIGH_FREQUENCY_Q = 1127.0 + + +def hertz_to_mel(frequencies_hertz): + """ + hertz_to_mel + """ + return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + (frequencies_hertz / + _MEL_BREAK_FREQUENCY_HERTZ)) + + +def spectrogram_to_mel_matrix(num_mel_bins=20, + num_spectrogram_bins=129, + audio_sample_rate=8000, + lower_edge_hertz=125.0, + upper_edge_hertz=3800.0): + """ + spectrogram_to_mel_matrix + """ + nyquist_hertz = audio_sample_rate / 2. 
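+    # The matrix built below maps a linear-frequency spectrogram onto
+    # num_mel_bins triangular filters spaced evenly on the mel scale.
+    # hertz_to_mel() uses 1127 * ln(1 + f / 700), so 1000 Hz maps to roughly
+    # 1000 mel. With SAMPLE_RATE = 16000 the Nyquist frequency is 8000 Hz,
+    # which is why MEL_MAX_HZ (7500) in vgg_params stays below it.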
+ if lower_edge_hertz >= upper_edge_hertz: + raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % + (lower_edge_hertz, upper_edge_hertz)) + spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, + num_spectrogram_bins) + spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) + band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), + hertz_to_mel(upper_edge_hertz), + num_mel_bins + 2) + mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) + for i in range(num_mel_bins): + lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] + lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / + (center_mel - lower_edge_mel)) + upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / + (upper_edge_mel - center_mel)) + mel_weights_matrix[:, + i] = np.maximum(0.0, + np.minimum(lower_slope, upper_slope)) + mel_weights_matrix[0, :] = 0.0 + return mel_weights_matrix + + +def log_mel_spectrogram(data, + audio_sample_rate=8000, + log_offset=0.0, + window_length_secs=0.025, + hop_length_secs=0.010, + **kwargs): + """ + log_mel_spectrogram + """ + window_length_samples = int(round(audio_sample_rate * window_length_secs)) + #print("audio_sample_rate = ", audio_sample_rate) + #print("window_length_secs = ", window_length_secs) + #print("window_length_sample ", window_length_samples) + hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) + #print("hop_length_samples ", hop_length_samples) + fft_length = 2**int(np.ceil(np.log(window_length_samples) / np.log(2.0))) + #print(" fft_lengt = ", fft_length) + spectrogram = stft_magnitude(data, + fft_length=fft_length, + hop_length=hop_length_samples, + window_length=window_length_samples) + #print(" spectrogram.shape = ", spectrogram.shape) + mel_spectrogram = np.dot( + spectrogram, + spectrogram_to_mel_matrix(num_spectrogram_bins=spectrogram.shape[1], + audio_sample_rate=audio_sample_rate, + **kwargs)) + + return np.log(mel_spectrogram + log_offset) + + +def wav_to_example(wav_data, sample_rate): + """ + wav_to_example + """ + #sample_rate, wav_data = wavfile.read(wav_file) + assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype + #wav_data = wav_data[:16000*30] + #print(" wav_data ", wav_data.shape) + #print(" wav_data ", wav_data.shape) + pad_zero_num = int(sample_rate * (vgg_params.STFT_WINDOW_LENGTH_SECONDS - + vgg_params.STFT_HOP_LENGTH_SECONDS)) + wav_data_extend = np.hstack((wav_data, np.zeros(pad_zero_num))) + wav_data = wav_data_extend + #print(" wav_data ", wav_data.shape) + wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0] + #print(" wav_data after convert to -1 1", wav_data) + #if wav_data.shape[0] > max_second * sample_rate: + # wav_data = wav_data[:max_second * sample_rate, :] + if len(wav_data.shape) > 1: + wav_data = np.mean(wav_data, axis=1) + #print(" wav_data after mean", wav_data.shape, len(wav_data.shape), wav_data) + # Resample to the rate assumed by vgg. + #if sample_rate != vgg_params.SAMPLE_RATE: + # wav_data = resampy.resample(wav_data, sample_rate, vgg_params.SAMPLE_RATE) + log_mel = log_mel_spectrogram( + wav_data, + audio_sample_rate=vgg_params.SAMPLE_RATE, + log_offset=vgg_params.LOG_OFFSET, + window_length_secs=vgg_params.STFT_WINDOW_LENGTH_SECONDS, + hop_length_secs=vgg_params.STFT_HOP_LENGTH_SECONDS, + num_mel_bins=vgg_params.NUM_MEL_BINS, + lower_edge_hertz=vgg_params.MEL_MIN_HZ, + upper_edge_hertz=vgg_params.MEL_MAX_HZ) + # Frame features into examples. 
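+    # With STFT_HOP_LENGTH_SECONDS = 0.020 the log-mel features arrive at
+    # 1 / 0.020 = 50 frames per second, so EXAMPLE_WINDOW_SECONDS = 1.0 and
+    # EXAMPLE_HOP_SECONDS = 1.0 yield non-overlapping examples of shape
+    # (NUM_FRAMES, NUM_MEL_BINS) = (50, 64); the returned array is
+    # (num_examples, 50, 64).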
+ features_sample_rate = 1.0 / vgg_params.STFT_HOP_LENGTH_SECONDS + example_window_length = int( + round(vgg_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) + + example_hop_length = int( + round(vgg_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) + log_mel_examples = frame(log_mel, + window_length=example_window_length, + hop_length=example_hop_length) + return log_mel_examples + + +def extract_pcm(pcm_file, sample_rate): + with open(pcm_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + examples = wav_to_example(audio_data, sample_rate) + return examples + + +if __name__ == "__main__": + wav_file = sys.argv[1] + print("wav_file = ", wav_file) + with open(wav_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + examples_batch = wav_to_example(audio_data, 16000) + print("examples_batch.shape", examples_batch.shape) diff --git a/docs/src/applications/TableTennis/predict/action_detect/mfcc/model_config.py b/docs/src/applications/TableTennis/predict/action_detect/mfcc/model_config.py new file mode 100644 index 000000000..194365ece --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/mfcc/model_config.py @@ -0,0 +1,51 @@ +""" +audio model config +""" +import numpy as np + +import mfcc.feature_extractor as feature_extractor + + +class ModelAudio(object): + """ + modelAudio + """ + def __init__(self, configs, use_gpu=1): + self.use_gpu = use_gpu + + self.audio_fps = configs.COMMON.fps + self.audio_feat_scale = configs.TSN.audio_scale + self.sample_rate = 16000 + + def predict_slice(self, wav_data, sample_rate): + """ + audio predict + """ + examples_batch = feature_extractor.wav_to_example( + wav_data, sample_rate)[0] + return examples_batch + + def predict_audio(self, audio_file): + """ + predict_audio + """ + audio_feature_list = [] + # read pcm + sample_rate = self.sample_rate + try: + with open(audio_file, "rb") as f: + pcm_data = f.read() + audio_data = np.fromstring(pcm_data, dtype=np.int16) + audio_status = "audio load success" + except Exception as e: + audio_data = [] + audio_status = "audio load failed" + step = 1 + len_video = int(len(audio_data) / sample_rate) + print(len_video) + for i in range(0, len_video, step): + audio_data_part = audio_data[i * sample_rate:(i + step) * + sample_rate] + feature_audio = self.predict_slice(audio_data_part, sample_rate) + audio_feature_list.append(feature_audio) + return audio_feature_list diff --git a/docs/src/applications/TableTennis/predict/action_detect/mfcc/vgg_params.py b/docs/src/applications/TableTennis/predict/action_detect/mfcc/vgg_params.py new file mode 100755 index 000000000..0a9951961 --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/mfcc/vgg_params.py @@ -0,0 +1,37 @@ +"""Global parameters for the VGGish model. +See vggish_slim.py for more information. +""" + +# Architectural constants. +NUM_FRAMES = 50 # Frames in input mel-spectrogram patch. +NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. +EMBEDDING_SIZE = 128 # Size of embedding layer. + +# Hyperparameters used in feature and example generation. +SAMPLE_RATE = 16000 +STFT_WINDOW_LENGTH_SECONDS = 0.040 +STFT_HOP_LENGTH_SECONDS = 0.020 +NUM_MEL_BINS = NUM_BANDS +MEL_MIN_HZ = 125 +MEL_MAX_HZ = 7500 +LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. +EXAMPLE_WINDOW_SECONDS = 1.00 # Each example contains 96 10ms frames +EXAMPLE_HOP_SECONDS = 1.00 # with zero overlap. 
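+# Note: the "96 10ms frames" wording above appears to be carried over from
+# the stock VGGish defaults (0.025 s window / 0.010 s hop). With the
+# 0.040 / 0.020 values used here, each 1-second example holds 50 frames at a
+# 20 ms hop, which is what NUM_FRAMES = 50 above refers to.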
+ +# Parameters used for embedding postprocessing. +PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' +PCA_MEANS_NAME = 'pca_means' +QUANTIZE_MIN_VAL = -2.0 +QUANTIZE_MAX_VAL = +2.0 + +# Hyperparameters used in training. +INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. +LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. +ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. + +# Names of ops, tensors, and features. +INPUT_OP_NAME = 'vggish/input_features' +INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' +OUTPUT_OP_NAME = 'vggish/embedding' +OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' +AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' diff --git a/docs/src/applications/TableTennis/predict/action_detect/models/audio_infer.py b/docs/src/applications/TableTennis/predict/action_detect/models/audio_infer.py new file mode 100644 index 000000000..f50b7efa5 --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/models/audio_infer.py @@ -0,0 +1,78 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """audio infer""" + def __init__(self, cfg, name='AUDIO'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output_tensor = self.predictor.get_output_handle(output_names[0]) + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + return output + + def predict(self, infer_config): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config) + feature_list = [] + pcm_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = np.array(data, dtype='float32') + output = self.infer(inputs) + feature_list.append(np.squeeze(output)) + pcm_list.append(inputs) + feature_values = np.vstack(feature_list) + pcm_values = np.vstack(pcm_list) + return feature_values, pcm_values + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + pcm_path = '/home/work/datasets/WorldCup2018/pcm/6e577252c4004961ac7caa738a52c238.pcm' + t0 = time.time() + cfg['AUDIO']['pcm_file'] = pcm_path + outputs = model.predict(cfg) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + t1 = time.time() + + print(outputs.shape) + print(outputs[0]) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py b/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py new file mode 100644 index 000000000..dce2fcdd6 --- /dev/null +++ 
b/docs/src/applications/TableTennis/predict/action_detect/models/bmn_infer.py @@ -0,0 +1,164 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import json +import pickle +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config +from utils.process_result import process_proposal + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """bmn infer""" + def __init__(self, cfg, name='BMN'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + self.nms_thread = cfg[name]['nms_thread'] + self.min_pred_score = cfg[name]['score_thread'] + self.min_frame_thread = cfg['COMMON']['fps'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output1_tensor = self.predictor.get_output_handle(output_names[0]) + self.output2_tensor = self.predictor.get_output_handle(output_names[1]) + self.output3_tensor = self.predictor.get_output_handle(output_names[2]) + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output1 = self.output1_tensor.copy_to_cpu() + output2 = self.output2_tensor.copy_to_cpu() + output3 = self.output3_tensor.copy_to_cpu() + return output1, output2, output3 + + def generate_props(self, + pred_bmn, + pred_start, + pred_end, + max_window=200, + min_window=5): + """generate_props""" + video_len = min(pred_bmn.shape[-1], + min(pred_start.shape[-1], pred_end.shape[-1])) + pred_bmn = pred_bmn[0, :, :] * pred_bmn[1, :, :] + start_mask = self.boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = self.boundary_choose(pred_end) + end_mask[-1] = 1. 
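+        # Every (start, end) pair whose start passes start_mask, whose end
+        # passes end_mask and whose length lies in [min_window, max_window)
+        # becomes a candidate proposal scored as
+        # start_score * end_score * bmn_score. The raw [xmin, xmax, score]
+        # triples collected here are thresholded and NMS-suppressed later by
+        # process_proposal().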
+ score_results = [] + for idx in range(min_window, max_window): + for jdx in range(video_len): + start_index = jdx + end_index = start_index + idx + if end_index < video_len and start_mask[ + start_index] == 1 and end_mask[end_index] == 1: + xmin = start_index + xmax = end_index + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bmn_score = pred_bmn[idx, jdx] + conf_score = xmin_score * xmax_score * bmn_score + score_results.append([xmin, xmax, conf_score]) + return score_results + + def boundary_choose(self, score_list): + """boundary_choose""" + max_score = max(score_list) + mask_high = (score_list > max_score * 0.5) + score_list = list(score_list) + score_middle = np.array([0.0] + score_list + [0.0]) + score_front = np.array([0.0, 0.0] + score_list) + score_back = np.array(score_list + [0.0, 0.0]) + mask_peak = ((score_middle > score_front) & (score_middle > score_back)) + mask_peak = mask_peak[1:-1] + mask = (mask_high | mask_peak).astype('float32') + return mask + + def predict(self, infer_config, material): + """predict""" + infer_reader = reader.get_reader(self.name, + 'infer', + infer_config, + material=material) + feature_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = [items[0] for items in data] + winds = [items[1] for items in data] + feat_info = [items[2] for items in data] + feature_T = feat_info[0][0] + feature_N = feat_info[0][1] + + inputs = np.array(inputs) + pred_bmn, pred_sta, pred_end = self.infer(inputs) + + if infer_iter == 0: + sum_pred_bmn = np.zeros((2, feature_N, feature_T)) + sum_pred_sta = np.zeros((feature_T, )) + sum_pred_end = np.zeros((feature_T, )) + sum_pred_cnt = np.zeros((feature_T, )) + + for idx, sub_wind in enumerate(winds): + sum_pred_bmn[:, :, sub_wind[0]:sub_wind[1]] += pred_bmn[idx] + sum_pred_sta[sub_wind[0]:sub_wind[1]] += pred_sta[idx] + sum_pred_end[sub_wind[0]:sub_wind[1]] += pred_end[idx] + sum_pred_cnt[sub_wind[0]:sub_wind[1]] += np.ones( + (sub_wind[1] - sub_wind[0], )) + + pred_bmn = sum_pred_bmn / sum_pred_cnt + pred_sta = sum_pred_sta / sum_pred_cnt + pred_end = sum_pred_end / sum_pred_cnt + + score_result = self.generate_props(pred_bmn, pred_sta, pred_end) + results = process_proposal(score_result, self.min_frame_thread, + self.nms_thread, self.min_pred_score) + + return results + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238' + + # feature + feature_path = imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + t0 = time.time() + outputs = model.predict(cfg, video_features) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + t1 = time.time() + + results = {'proposal': outputs} + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py b/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py new file mode 100644 index 000000000..90685feff --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/models/lstm_infer.py @@ -0,0 +1,158 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import json +import pickle +import time + +sys.path.append('../') 
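+# The LSTM classifier consumes variable-length proposal features as Paddle
+# LoD (level-of-detail) tensors: pre_process() below flattens a list of
+# sequences into one array and builds the offset list passed to set_lod().
+# For example, three proposals with 8, 5 and 12 feature rows give
+# input_lod = [[0, 8, 13, 25]].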
+from utils.preprocess import get_images +from utils.config_utils import parse_config +from utils.process_result import get_action_result + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """lstm infer""" + def __init__(self, cfg, name='ACTION'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + self.topk = cfg[name]['topk'] + self.frame_offset = cfg[name]['nms_offset'] + self.nms_thread = cfg[name]['nms_thread'] + self.cls_thread = cfg[name]['classify_score_thread'] + self.iou_thread = cfg[name]['iou_score_thread'] + + self.label_map_file = cfg['COMMON']['label_dic'] + self.fps = cfg['COMMON']['fps'] + self.nms_id = 5 + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input1_tensor = self.predictor.get_input_handle(input_names[0]) + self.input2_tensor = self.predictor.get_input_handle(input_names[1]) + + output_names = self.predictor.get_output_names() + self.output1_tensor = self.predictor.get_output_handle(output_names[0]) + self.output2_tensor = self.predictor.get_output_handle(output_names[1]) + + def infer(self, input1_arr, input1_lod, input2_arr=None, input2_lod=None): + """infer""" + self.input1_tensor.copy_from_cpu(input1_arr) + self.input1_tensor.set_lod(input1_lod) + if not input2_arr is None: + self.input2_tensor.copy_from_cpu(input2_arr) + self.input2_tensor.set_lod(input2_lod) + self.predictor.run() + output1 = self.output1_tensor.copy_to_cpu() + output2 = self.output2_tensor.copy_to_cpu() + # print(output.shape) + return output1, output2 + + def pre_process(self, input): + """pre process""" + input_arr = [] + input_lod = [0] + start_lod = 0 + end_lod = 0 + for sub_item in input: + end_lod = start_lod + len(sub_item) + input_lod.append(end_lod) + input_arr.extend(sub_item) + start_lod = end_lod + input_arr = np.array(input_arr) + # print(input_arr.shape) + # print([input_lod]) + return input_arr, [input_lod] + + def predict(self, infer_config, material): + """predict""" + infer_reader = reader.get_reader(self.name, + 'infer', + infer_config, + material=material) + results = [] + for infer_iter, data in enumerate(infer_reader()): + video_id = [[items[-2], items[-1]] for items in data] + input1 = [items[0] for items in data] + input2 = [items[1] for items in data] + input1_arr, input1_lod = self.pre_process(input1) + input2_arr, input2_lod = self.pre_process(input2) + output1, output2 = self.infer(input1_arr, input1_lod, input2_arr, + input2_lod) + # output1, output2 = self.infer(input1_arr, input1_lod) + + predictions_id = output1 + predictions_iou = output2 + for i in range(len(predictions_id)): + topk_inds = predictions_id[i].argsort()[0 - self.topk:] + topk_inds = topk_inds[::-1] + preds_id = predictions_id[i][topk_inds] + preds_iou = predictions_iou[i][0] + results.append((video_id[i], preds_id.tolist(), + topk_inds.tolist(), preds_iou.tolist())) + + predict_result = get_action_result(results, self.label_map_file, + self.fps, self.cls_thread, + self.iou_thread, self.nms_id, + self.nms_thread, self.frame_offset) + return predict_result + + +if __name__ == 
"__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + # proposal total + prop_dict = {} + for dataset in ['EuroCup2016', 'WorldCup2018']: + prop_json = '/home/work/datasets/{}/feature_bmn/prop.json'.format( + dataset) + json_data = json.load(open(prop_json, 'r')) + for item in json_data: + basename = prop_json.replace('feature_bmn/prop.json', 'mp4') + basename = basename + '/' + item['video_name'] + '.mp4' + prop_dict[basename] = item['bmn_results'] + + imgs_path = '/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238' + + # feature + feature_path = imgs_path.replace("frames", "features") + '.pkl' + video_features = pickle.load(open(feature_path, 'rb')) + + # proposal + basename = imgs_path.replace('frames', 'mp4') + '.mp4' + bmn_results = prop_dict[basename] + + material = {'feature': video_features, 'proposal': bmn_results} + + t0 = time.time() + outputs = model.predict(cfg, material) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + # print(outputs.shape) + t1 = time.time() + results = {'actions': outputs} + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) + + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/TableTennis/predict/action_detect/models/pptsm_infer.py b/docs/src/applications/TableTennis/predict/action_detect/models/pptsm_infer.py new file mode 100644 index 000000000..58cf95707 --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/models/pptsm_infer.py @@ -0,0 +1,77 @@ +""" +ppTSM InferModel +""" +import sys +import numpy as np +import time + +sys.path.append('../') +from utils.preprocess import get_images +from utils.config_utils import parse_config + +import reader +from paddle.inference import Config +from paddle.inference import create_predictor + + +class InferModel(object): + """pptsm infer""" + def __init__(self, cfg, name='PPTSM'): + name = name.upper() + self.name = name + model_file = cfg[name]['model_file'] + params_file = cfg[name]['params_file'] + gpu_mem = cfg[name]['gpu_mem'] + device_id = cfg[name]['device_id'] + + # model init + config = Config(model_file, params_file) + config.enable_use_gpu(gpu_mem, device_id) + config.switch_ir_optim(True) # default true + config.enable_memory_optim() + + # use zero copy + config.switch_use_feed_fetch_ops(False) + self.predictor = create_predictor(config) + + input_names = self.predictor.get_input_names() + self.input_tensor = self.predictor.get_input_handle(input_names[0]) + + output_names = self.predictor.get_output_names() + self.output_tensor = self.predictor.get_output_handle(output_names[1]) + + def infer(self, input): + """infer""" + self.input_tensor.copy_from_cpu(input) + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + return output + + def predict(self, infer_config): + """predict""" + infer_reader = reader.get_reader(self.name, 'infer', infer_config) + feature_list = [] + for infer_iter, data in enumerate(infer_reader()): + inputs = [items[:-1] for items in data] + inputs = np.array(inputs) + output = self.infer(inputs) + feature_list.append(np.squeeze(output)) + feature_list = np.vstack(feature_list) + return feature_list + + +if __name__ == "__main__": + cfg_file = '/home/work/inference/configs/configs.yaml' + cfg = parse_config(cfg_file) + model = InferModel(cfg) + + imgs_path = 
'/home/work/datasets/WorldCup2018/frames/6e577252c4004961ac7caa738a52c238/' + imgs_list = get_images(imgs_path) + t0 = time.time() + cfg['PPTSM']['frame_list'] = imgs_list + outputs = model.predict(cfg) + # outputs = model.infer(np.random.rand(32, 8, 3, 224, 224).astype(np.float32)) + t1 = time.time() + + print(outputs.shape) + print('cost time = {} min'.format((t1 - t0) / 60.0)) diff --git a/docs/src/applications/TableTennis/predict/action_detect/reader/__init__.py b/docs/src/applications/TableTennis/predict/action_detect/reader/__init__.py new file mode 100644 index 000000000..c4cc42a9e --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/reader/__init__.py @@ -0,0 +1,15 @@ +""" +read map for model +""" +from reader.reader_utils import regist_reader, get_reader +# import reader.tsminf_reader as tsminf_reader +# import reader.audio_reader as audio_reader +import reader.bmninf_reader as bmninf_reader +import reader.feature_reader as feature_reader + +# regist reader, sort by alphabet +# regist_reader("TSM", tsminf_reader.TSMINFReader) +# regist_reader("PPTSM", tsminf_reader.TSMINFReader) +# regist_reader("AUDIO", audio_reader.AudioReader) +regist_reader("BMN", bmninf_reader.BMNINFReader) +regist_reader("ACTION", feature_reader.FeatureReader) diff --git a/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py b/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py new file mode 100644 index 000000000..112c9cb3f --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/reader/bmninf_reader.py @@ -0,0 +1,154 @@ +""" +# @File : bmninf_reader.py +# @Author: macaihong +# @Date : 2019/12/15 +# @Desc : +""" + +import os +import random +import pickle +import json +import numpy as np +import multiprocessing + +import numpy as np + +from .reader_utils import DataReader + + +def get_sw_prop(duration, window=200, step=10): + """ + get_sw_prop + """ + pr = [] + local_boxes = [] + for k in np.arange(0, duration - window + step, step): + start_id = k + end_id = min(duration, k + window) + if end_id - start_id < window: + start_id = end_id - window + local_boxes = (start_id, end_id) + pr.append(local_boxes) + + def valid_proposal(duration, span): + """ + valid_proposal + """ + # fileter proposals + # a valid proposal should have at least one second in the video + real_span = min(duration, span[1]) - span[0] + return real_span >= 1 + + pr = list(filter(lambda x: valid_proposal(duration, x), pr)) + return pr + + +class BMNINFReader(DataReader): + """ + Data reader for BMN model, which was stored as features extracted by prior networks + dataset cfg: feat_path, feature path, + tscale, temporal length of BM map, + dscale, duration scale of BM map, + anchor_xmin, anchor_xmax, the range of each point in the feature sequence, + batch_size, batch size of input data, + num_threads, number of threads of data processing + """ + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + self.tscale = cfg[self.name.upper()]['tscale'] # 200 + self.dscale = cfg[self.name.upper()]['dscale'] # 200 + # self.subset = cfg[self.name.upper()]['subset'] + self.tgap = 1. 
/ self.tscale + self.step = cfg[self.name.upper()]['window_step'] + + self.material = material + src_feature = self.material + + image_feature = src_feature['image_feature'] + # pcm_feature = src_feature['pcm_feature'] + # pcm_feature = pcm_feature.reshape((pcm_feature.shape[0] * 5, 640)) + # print(rgb_feature.shape, audio_feature.shape, pcm_feature.shape) + # min_length = min(image_feature.shape[0], pcm_feature.shape[0]) + #if min_length == 0: + # continue + # image_feature = image_feature[:min_length, :] + # pcm_feature = pcm_feature[:min_length, :] + # self.features = np.concatenate((image_feature, pcm_feature), axis=1) + self.features = image_feature + self.duration = len(self.features) + self.window = self.tscale + + self.get_dataset_dict() + self.get_match_map() + + self.batch_size = cfg[self.name.upper()]['batch_size'] + if (mode == 'test') or (mode == 'infer'): + self.num_threads = 1 # set num_threads as 1 for test and infer + + def get_dataset_dict(self): + """ + get_dataset_dict + """ + self.video_list = get_sw_prop(self.duration, self.window, self.step) + + def get_match_map(self): + """ + get_match_map + """ + match_map = [] + for idx in range(self.tscale): + tmp_match_window = [] + xmin = self.tgap * idx + for jdx in range(1, self.tscale + 1): + xmax = xmin + self.tgap * jdx + tmp_match_window.append([xmin, xmax]) + match_map.append(tmp_match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + self.match_map = match_map + self.anchor_xmin = [self.tgap * i for i in range(self.tscale)] + self.anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)] + + def load_file(self, video_wind): + """ + load_file + """ + start_feat_id = video_wind[0] + end_feat_id = video_wind[1] + video_feat = self.features[video_wind[0]:video_wind[1]] + video_feat = video_feat.T + video_feat = video_feat.astype("float32") + return video_feat + + def create_reader(self): + """ + reader creator for ctcn model + """ + return self.make_infer_reader() + + def make_infer_reader(self): + """ + reader for inference + """ + def reader(): + """ + reader + """ + batch_out = [] + # for video_name in self.video_list: + for video_wind in self.video_list: + video_idx = self.video_list.index(video_wind) + video_feat = self.load_file(video_wind) + batch_out.append( + (video_feat, video_wind, [self.duration, self.dscale])) + + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + if len(batch_out) > 0: + yield batch_out + + return reader diff --git a/docs/src/applications/TableTennis/predict/action_detect/reader/feature_reader.py b/docs/src/applications/TableTennis/predict/action_detect/reader/feature_reader.py new file mode 100644 index 000000000..f46dd3f41 --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/reader/feature_reader.py @@ -0,0 +1,91 @@ +""" +attention-lstm feature reader +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle +import numpy as np +import random +import code + +from .reader_utils import DataReader + + +class FeatureReader(DataReader): + """ + Data reader for youtube-8M dataset, which was stored as features extracted by prior networks + This is for the three models: lstm, attention cluster, nextvlad + + dataset cfg: num_classes + batch_size + list + NextVlad only: eigen_file + """ + def __init__(self, name, mode, cfg, material=None): + self.name = name + self.mode = mode + self.batch_size = cfg[self.name.upper()]['batch_size'] + + self.feature = material['feature'] + self.proposal = material['proposal'] + self.fps = 5 + + def create_reader(self): + """ + create_reader + """ + image_feature_list = self.feature['image_feature'] + audio_feature_list = self.feature['audio_feature'] + pcm_feature_list = self.feature['pcm_feature'] + pcm_feature_list = pcm_feature_list.reshape( + (pcm_feature_list.shape[0] * 5, 640)) + + fl = self.proposal + + if self.mode == 'train': + random.shuffle(fl) + + def reader(): + """ + reader + """ + batch_out = [] + for prop_info in fl: + start_id = int(prop_info['start']) + end_id = int(prop_info['end']) + bmn_score = float(prop_info['score']) + try: + image_feature = image_feature_list[start_id:end_id] + audio_feature = audio_feature_list[int(start_id / self.fps + ):int(end_id / + self.fps)] + pcm_feature = pcm_feature_list[start_id:end_id] + + # image_feature = np.concatenate((image_feature, pcm_feature), axis=1) + + batch_out.append( + (image_feature, audio_feature, 0, prop_info)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + except Exception as e: + continue + + return reader diff --git a/docs/src/applications/TableTennis/predict/action_detect/reader/reader_utils.py b/docs/src/applications/TableTennis/predict/action_detect/reader/reader_utils.py new file mode 100644 index 000000000..5acff1778 --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/reader/reader_utils.py @@ -0,0 +1,107 @@ +""" +reader_util +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import numpy as np + + +class ReaderNotFoundError(Exception): + """ + "Error: reader not found" + """ + def __init__(self, reader_name, avail_readers): + super(ReaderNotFoundError, self).__init__() + self.reader_name = reader_name + self.avail_readers = avail_readers + + def __str__(self): + msg = "Reader {} Not Found.\nAvailiable readers:\n".format( + self.reader_name) + for reader in self.avail_readers: + msg += " {}\n".format(reader) + return msg + + +class DataReader(object): + """ + data reader for video input + """ + def __init__(self, model_name, mode, cfg): + self.name = model_name + self.mode = mode + self.cfg = cfg + + def create_reader(self): + """ + Not implemented + """ + pass + + def get_config_from_sec(self, sec, item, default=None): + """ + get_config_from_sec + """ + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + +class ReaderZoo(object): + """ + ReaderZoo + """ + def __init__(self): + """ + __init__ + """ + self.reader_zoo = {} + + def regist(self, name, reader): + """ + regist + """ + assert reader.__base__ == DataReader, "Unknow model type {}".format( + type(reader)) + self.reader_zoo[name] = reader + + def get(self, name, mode, cfg, material=None): + """ + get + """ + for k, v in self.reader_zoo.items(): + if k == name: + return v(name, mode, cfg, material) + raise ReaderNotFoundError(name, self.reader_zoo.keys()) + + +# singleton reader_zoo +reader_zoo = ReaderZoo() + + +def regist_reader(name, reader): + """ + regist_reader + """ + reader_zoo.regist(name, reader) + + +def get_reader(name, mode, cfg, material=None): + """ + get_reader + """ + reader_model = reader_zoo.get(name, mode, cfg, material) + return reader_model.create_reader() diff --git a/docs/src/applications/TableTennis/predict/action_detect/utils/config_utils.py b/docs/src/applications/TableTennis/predict/action_detect/utils/config_utils.py new file mode 100644 index 000000000..a5f3f5f3b --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/utils/config_utils.py @@ -0,0 +1,81 @@ +""" +config_utils +""" +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
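+# parse_config() below loads a YAML file into nested AttrDict objects, so a
+# section can be read either like a dict (cfg['BMN']['tscale']) or via
+# attribute access (cfg.COMMON.fps). String values that look like Python
+# literals are converted with ast.literal_eval by create_attr_dict().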
+ +import yaml +import ast + +import logger + +logger = logger.Logger() + +CONFIG_SECS = [ + 'train', + 'valid', + 'test', + 'infer', +] + + +class AttrDict(dict): + """ + AttrDict + """ + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + import yaml + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader)) + create_attr_dict(yaml_config) + return yaml_config + + +def create_attr_dict(yaml_config): + """create_attr_dict""" + for key, value in yaml_config.items(): + if isinstance(value, dict): + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = ast.literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + return + + +def print_configs(cfg, mode): + """print_configs""" + logger.info( + "---------------- {:>5} Arguments ----------------".format(mode)) + for sec, sec_items in cfg.items(): + logger.info("{}:".format(sec)) + for k, v in sec_items.items(): + logger.info(" {}:{}".format(k, v)) + logger.info("-------------------------------------------------") diff --git a/docs/src/applications/TableTennis/predict/action_detect/utils/preprocess.py b/docs/src/applications/TableTennis/predict/action_detect/utils/preprocess.py new file mode 100644 index 000000000..1451df1fe --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/utils/preprocess.py @@ -0,0 +1,36 @@ +""" extract frames and pcm""" +import os +import sys +import shutil + + +def ffmpeg_frames(mp4_addr, frame_out_folder, fps=5): + """ffmpeg_frames""" + if os.path.exists(frame_out_folder): + shutil.rmtree(frame_out_folder) + os.makedirs(frame_out_folder) + cmd = './src/utils/ffmpeg -v 0 -i %s -r %d -q 0 %s/%s.jpg' % ( + mp4_addr, fps, frame_out_folder, '%08d') + os.system(cmd) + + +def ffmpeg_pcm(mp4_addr, save_file_name): + """ffmpeg_pcm""" + cmd = './src/utils/ffmpeg -y -i %s -acodec pcm_s16le -f s16le -ac 1 -ar 16000 %s -v 0' \ + % (mp4_addr, save_file_name) + os.system(cmd) + + +def ffmpeg_mp4(mp4_url, mp4_addr): + """ffmpeg_mp4""" + cmd = "wget %s -O %s -q" % (mp4_url, mp4_addr) + print("cmd = ", cmd) + os.system(cmd) + + +def get_images(image_path): + """get_images""" + images = sorted(os.listdir(image_path)) + images = images + images_path_list = [image_path + '/' + im for im in images] + return images_path_list diff --git a/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py b/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py new file mode 100644 index 000000000..b7e351264 --- /dev/null +++ b/docs/src/applications/TableTennis/predict/action_detect/utils/process_result.py @@ -0,0 +1,155 @@ +""" +# @File : process_result.py +# @Author: macaihong +# @Date : 2019/12/15 +# @Desc : +""" + +import sys +import os +import re +import numpy as np +import pickle +import json +import logger + +logger = logger.Logger() + + +def get_data_res(label_map, data, topk): + """get_data_res""" + sum_vid = len(data) + video_result = [] + for i in range(sum_vid): + vid_name = data[i][0][0] + # true_label predict_start predict_end predict_score predict_len gt_iou gt_start gt_ioa + feature_start_id = float(data[i][0][1]['start']) + feature_end_id = float(data[i][0][1]['end']) + feature_stage1_score = 
data[i][0][1]['score'] + predict_res = [] + for k in range(topk): + score_top = data[i][1][k] + labelid_top = data[i][2][k] + label_iou = data[i][3] + labelname_top = label_map[str(labelid_top)] + video_result.append([ + feature_start_id, feature_end_id, labelid_top, labelname_top, + score_top, label_iou + ]) + return video_result + + +def base_nms(bboxes, thresh, delta=0, nms_id=2): + """ + One-dimensional non-maximal suppression + :param bboxes: [[vid, label, st, ed, score, ...], ...] + :param thresh: + :return: + """ + """ + t1 = bboxes[:, 0] + t2 = bboxes[:, 1] + scores = bboxes[:, nms_id] + """ + + t1 = np.array([max(0, x[0] - delta) for x in bboxes]) + t2 = np.array([x[1] + delta for x in bboxes]) + scores = np.array([x[nms_id] for x in bboxes]) + + durations = t2 - t1 + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + tt1 = np.maximum(t1[i], t1[order[1:]]) + tt2 = np.minimum(t2[i], t2[order[1:]]) + intersection = tt2 - tt1 + IoU = intersection / (durations[i] + durations[order[1:]] - + intersection).astype(float) + + inds = np.where(IoU <= thresh)[0] + order = order[inds + 1] + return [bboxes[i] for i in keep] + + +def process_proposal(source_prop_box, + min_frame_thread=5, + nms_thresh=0.7, + score_thresh=0.01): + """process_video_prop""" + prop_box = [] + for items in source_prop_box: + start_frame = float(items[0]) + end_frame = float(items[1]) + score = float(items[2]) + if end_frame - start_frame < min_frame_thread or score < score_thresh: + continue + prop_box.append([start_frame, end_frame, score]) + + prop_box_keep = base_nms(prop_box, nms_thresh) + + prop_res = [] + for res in prop_box_keep: + prop_res.append({'start': res[0], 'end': res[1], 'score': res[2]}) + + return prop_res + + +def process_video_classify(video_prop, fps, score_thread, iou_thread, \ + nms_id=5, nms_thread=0.01, nms_delta=10, backgroundid=0): + """process_video_classify""" + prop_filter = [] + for item in video_prop: + if item[2] == backgroundid: + continue + prop_filter.append(item) + + # prop_filter = sorted(prop_filter, key=lambda x: x[nms_id], reverse=True) + prop_filter = base_nms(prop_filter, nms_thread, nms_delta, nms_id) + prop_filter = sorted(prop_filter, key=lambda x: x[0]) + + video_results = [] + for item in prop_filter: + start_sec = item[0] / fps + end_sec = item[1] / fps + + start_id_frame = item[0] + end_id_frame = item[1] + # start_time = "%02d:%02d:%02d" % ((start_id_frame / fps) / 3600, \ + # ((start_id_frame / fps) % 3600) / 60, (start_id_frame / fps) % 60) + # end_time = "%02d:%02d:%02d" % ((end_id_frame / fps) / 3600, \ + # ((end_id_frame / fps) % 3600) / 60, (end_id_frame / fps) % 60) + start_time = int(start_id_frame / fps) + end_time = int(end_id_frame / fps) + + label_id = item[2] + label_name = item[3] + label_classify_score = item[4] + label_iou_score = item[5] + if label_classify_score > score_thread and label_iou_score > iou_thread: + video_results.append({ + "start_time": start_time, + "end_time": end_time, + "label_id": label_id, + "label_name": label_name, + "classify_score": label_classify_score, + "iou_score": label_iou_score + }) + + return video_results + + +def get_action_result(result_info, label_map_file, fps, score_thread=0, \ + iou_thread=0, nms_id=5, nms_thread=0.01, frame_offset=10, topk=1): + """get_action_result""" + + label_map = json.load(open(label_map_file, 'r', encoding='utf-8')) + + org_result = get_data_res(label_map, result_info, topk) + nms_result = process_video_classify(org_result, fps, 
score_thread, + iou_thread, nms_id, nms_thread, + frame_offset) + + return nms_result diff --git a/docs/src/applications/TableTennis/predict/eval.py b/docs/src/applications/TableTennis/predict/eval.py new file mode 100644 index 000000000..5455f935c --- /dev/null +++ b/docs/src/applications/TableTennis/predict/eval.py @@ -0,0 +1,287 @@ +""" +get instance for lstm +根据gts计算每个proposal_bmn的iou、ioa、label等信息 +""" +import os +import sys +import json +import random +import pickle +import numpy as np + +import io + +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +dataset = "/home/work/datasets" + +label_index_file = './configs/index_label_football_7.json' +eval_datasets = ['EuroCup2016'] +label_files = { + 'train': 'label_cls8_train.json', + 'validation': 'label_cls8_val.json' +} + +global fps, mode +label_index = json.load(open(label_index_file, 'rb')) + + +def load_gts(): + global fps + gts_data = {'fps': 0, 'gts': {}} + for eval_data in eval_datasets: + for item, value in label_files.items(): + label_file = '{}/{}/{}'.format(dataset, eval_data, value) + gts = json.load(open(label_file, 'rb')) + gts_data['fps'] = gts['fps'] + fps = gts['fps'] + for gt in gts['gts']: + gt['mode'] = item + basename = '{}/{}/mp4/{}'.format(dataset, eval_data, + os.path.basename(gt['url'])) + gts_data['gts'][basename] = gt + return gts_data['gts'] + + +def computeIoU(e1, e2): + """ + clc iou and ioa + """ + if not (e1['label'] == e2['label'] and e1['basename'] == e2['basename']): + return 0. + area1 = e1["end"] - e1["start"] + area2 = e2["end"] - e2["start"] + x1 = np.maximum(e1["start"], e2["start"]) + x2 = np.minimum(e1["end"], e2["end"]) + inter = np.maximum(0.0, x2 - x1) + iou = 0.0 if (area1 + area2 - + inter) == 0 else inter * 1.0 / (area1 + area2 - inter) + if not mode == 'proposal': + iou = 0.0 if area2 == 0 else inter * 1.0 / area2 + return iou + + +def convert_proposal(boxes, basename, score_threshold=0.01): + boxes = sorted(boxes, key=lambda x: float(x['score']), reverse=True) + res = [] + for box in boxes: + if not float(box['score']) >= score_threshold: + continue + res.append({ + 'basename': basename, + 'start': int(float(box['start']) / fps), + 'end': int(float(box['end']) / fps), + 'label': 0 + }) + return res + + +def convert_classify(boxes, basename, iou_threshold, score_threshold): + boxes = sorted(boxes, + key=lambda x: + (float(x['classify_score']), float(x['iou_score'])), + reverse=True) + + def convert_time_to_frame(time_type): + return int(time_type) + h, m, s = time_type.split(':') + return int(h) * 3600 + int(m) * 60 + int(s) + + res = [] + for box in boxes: + if not (box['iou_score'] >= iou_threshold + and box['classify_score'] >= score_threshold): + continue + res.append({ + 'basename': basename, + 'start': convert_time_to_frame(box['start_time']), + 'end': convert_time_to_frame(box['end_time']), + 'label': box['label_id'] + }) + return res + + +def convert_groundtruth(boxes, basename, phase=None): + res = [] + for box in boxes: + for item in box['label_ids']: + label = 0 if phase == 'proposal' else item + res.append({ + 'basename': basename, + 'start': box['start_id'], + 'end': box['end_id'], + 'label': label + }) + return res + + +def print_head(iou): + print("\nioa = {:.1f}".format(iou)) + res_str = '' + for item in ['label_name']: + res_str += '{:<12s}'.format(item) + for item in [ + 'label_id', 'precision', 'recall', 'hit_prop', 'num_prop', + 'hit_gts', 'num_gts' + ]: + res_str += '{:<10s}'.format(item) + print(res_str) + + +def print_result(res_dict, 
label='avg'): + if label == 'avg': + res_str = '{:<22s}'.format(str(label)) + else: + res_str = '{0:{2}<6s}{1:<10s}'.format(label_index[str(label)], + str(label), chr(12288)) + + for item in ['prec', 'recall']: + res_str += '{:<10.4f}'.format(res_dict[item]) + for item in ['hit_prop', 'num_prop', 'hit_gts', 'num_gts']: + res_str += '{:<10d}'.format(res_dict[item]) + print(res_str) + + +def evaluation(res_boxes, gts_boxes, label_range, iou_range, show_sub=False): + iou_map = [computeIoU(resId, gtsId) for resId in res_boxes \ + for gtsId in gts_boxes] + iou_map = np.array(iou_map).reshape((len(res_boxes), len(gts_boxes))) + hit_map_prop_total = np.max(iou_map, axis=1) + hit_map_index_total = np.argmax(iou_map, axis=1) + + res_dict = ['hit_prop', 'num_prop', 'hit_gts', 'num_gts'] + + for iou_threshold in iou_range: + if show_sub: + print_head(iou_threshold) + + iou_prop = np.array([k >= iou_threshold for k in hit_map_prop_total]) + average_results = {} + for label_id in label_range: + sub_results = {} + label_prop = np.array([k['label'] == label_id for k in res_boxes]) + label_gts = np.array([k['label'] == label_id for k in gts_boxes]) + sub_results['num_prop'] = sum(label_prop) + sub_results['num_gts'] = sum(label_gts) + if sub_results['num_prop'] == 0: + hit_prop_index = [] + else: + hit_prop_index = label_prop & iou_prop + sub_results['hit_prop'] = sum(hit_prop_index) + sub_results['hit_gts'] = len( + set(hit_map_index_total[hit_prop_index])) + + sub_results['prec'] = 0.0 if sub_results['num_prop'] == 0 \ + else sub_results['hit_prop'] * 1.0 / sub_results['num_prop'] + sub_results['recall'] = 0.0 if sub_results['num_gts'] == 0 \ + else sub_results['hit_gts'] * 1.0 / sub_results['num_gts'] + if show_sub: + print_result(sub_results, label=label_id) + for item in res_dict: + if not item in average_results: + average_results[item] = 0 + average_results[item] += sub_results[item] + if len(label_range) == 1: # proposal 不需要输出average值 + continue + average_results['prec'] = 0.0 if average_results['num_prop'] == 0 \ + else average_results['hit_prop'] * 1.0 / average_results['num_prop'] + average_results['recall'] = 0.0 if average_results['num_gts'] == 0 \ + else average_results['hit_gts'] * 1.0 / average_results['num_gts'] + if show_sub: + print_result(average_results) + + average_results['F1'] = 0.0 if (average_results['prec'] + average_results['recall'] == 0) \ + else 2 * average_results['prec'] * average_results['recall'] / \ + (average_results['prec'] + average_results['recall']) + return average_results + + +def get_eval_results(predicts, + gts_data, + phase, + iou_threshold=0.3, + score_threshold=0.3, + show_sub=False): + global mode + mode = phase + res_boxes = [] + gts_boxes = [] + for ped_data in predicts: + basename = ped_data['video_name'] + + # eval sub data + such_eval = False + for eval_name in eval_datasets: + if eval_name in basename: + such_eval = True + break + if not such_eval: + continue + + gts = gts_data[basename]['actions'] + if phase == 'proposal': + res_boxes.extend( + convert_proposal(ped_data['bmn_results'], basename, + score_threshold)) + gts_boxes.extend( + convert_groundtruth(gts, basename, phase='proposal')) + label_range = [0] + iou_range = np.arange(0.1, 1, 0.1) + else: + res_boxes.extend( + convert_classify(ped_data['action_results'], basename, + iou_threshold, score_threshold)) + gts_boxes.extend(convert_groundtruth(gts, basename)) + label_range = range(1, len(label_index)) + iou_range = np.arange(0.5, 0.6, 0.1) + + eval_results = evaluation(res_boxes, + gts_boxes, 
+ label_range, + iou_range, + show_sub=show_sub) + + return eval_results + + +if __name__ == "__main__": + result_file = sys.argv[1] + predicts = json.load(open(result_file, 'r', encoding='utf-8')) + gts_data = load_gts() + + get_eval_results(predicts, + gts_data, + 'proposal', + score_threshold=0.03, + show_sub=True) + #get_eval_results(predicts, gts_data, 'actions') + + best_F1 = -0.1 + best_res = {} + best_iou_threshold = 0. + best_score_threshold = 0. + for iou_threshold in np.arange(0.1, 0.9, 0.1): + for score_threshold in np.arange(0.1, 1, 0.1): + avg_res = get_eval_results(predicts, + gts_data, + 'actions', + iou_threshold=iou_threshold, + score_threshold=score_threshold, + show_sub=False) + if best_F1 < avg_res['F1']: + best_F1 = avg_res['F1'] + best_res = avg_res + best_iou_threshold = iou_threshold + best_score_threshold = score_threshold + print("best iou threshold = {:.1f}".format(best_iou_threshold)) + print("best score threshold = {:.1f}".format(best_score_threshold)) + print('best F1 score = {:.4f}'.format(best_F1)) + print_head(0.5) + print_result(best_res) + + get_eval_results(predicts, + gts_data, + 'actions', + iou_threshold=best_iou_threshold, + score_threshold=best_score_threshold, + show_sub=True) diff --git a/docs/src/applications/TableTennis/predict/predict.py b/docs/src/applications/TableTennis/predict/predict.py new file mode 100644 index 000000000..7407cad0b --- /dev/null +++ b/docs/src/applications/TableTennis/predict/predict.py @@ -0,0 +1,36 @@ +import os +import sys +import json + +sys.path.append('action_detect') +from action import ActionDetection + +if __name__ == '__main__': + dataset_dir = "/home/work/datasets/EuroCup2016" + + model_predict = ActionDetection(cfg_file="./configs/configs.yaml") + model_predict.load_model() + + video_url = os.path.join(dataset_dir, 'url_val.list') + with open(video_url, 'r') as f: + lines = f.readlines() + lines = [os.path.join(dataset_dir, k.strip()) for k in lines] + + results = [] + for line in lines: + video_name = line + print(video_name) + + imgs_path = video_name.replace(".mp4", "").replace("mp4", "frames") + pcm_path = video_name.replace(".mp4", ".pcm").replace("mp4", "pcm") + + bmn_results, action_results = model_predict.infer(imgs_path, pcm_path) + results.append({ + 'video_name': line, + 'bmn_results': bmn_results, + 'action_results': action_results + }) + + with open('results.json', 'w', encoding='utf-8') as f: + data = json.dumps(results, indent=4, ensure_ascii=False) + f.write(data) diff --git a/docs/src/applications/TableTennis/val_split.py b/docs/src/applications/TableTennis/val_split.py new file mode 100644 index 000000000..1a1e8e6ac --- /dev/null +++ b/docs/src/applications/TableTennis/val_split.py @@ -0,0 +1,19 @@ +import json + +with open('/home/aistudio/data/label_cls14_train.json') as f: + data = json.load(f) +f.close() + +val = {'gts': data['gts'][0:5], 'fps': 25} + +jsonString = json.dumps(val, indent=4, ensure_ascii=False) +jsonFile = open('/home/aistudio/data/label_cls14_val.json', 'w') +jsonFile.write(jsonString) +jsonFile.close() + +train = {'gts': data['gts'][5:], 'fps': 25} + +jsonString = json.dumps(train, indent=4, ensure_ascii=False) +jsonFile = open('/home/aistudio/data/label_cls14_train.json', 'w') +jsonFile.write(jsonString) +jsonFile.close() diff --git a/docs/src/applications/VideoQualityAssessment/README.md b/docs/src/applications/VideoQualityAssessment/README.md new file mode 100644 index 000000000..7b475e027 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/README.md @@ 
-0,0 +1,190 @@
+# Video Quality Assessment Model
+---
+## Contents
+
+- [Introduction](#introduction)
+- [Data Preparation](#data-preparation)
+- [Training](#training)
+- [Testing](#testing)
+- [Optimization](#optimization)
+- [Deployment](#deployment)
+- [References](#references)
+
+
+## Introduction
+
+This codebase is developed mainly on paddle 2.1. It is a no-reference video quality assessment method built by modifying the ppTSM network, which judges the quality of a video by reading its frames.
+
+Based on an understanding of the video content, it can automatically analyze content quality and help select the best key frame or key clip as the video cover, improving click-through conversion and user experience.
+
+The project currently supports single-GPU and multi-GPU environments on Linux.
+
+## Data Preparation
+
+```
+The data comes from the public KonVid-150k dataset, 153842 UGC videos in total: 152265 in the training set (KonVid-150k-A) and 1577 in the validation set (KonVid-150k-B).
+A sample dataset and the official dataset URLs: datasets/dataset_url.list
+The annotation files are train.txt and eval.txt under dataset.
+```
+
+## Training
+
+Environment requirements:
+
+- PaddlePaddle >= 2.1.0
+- Python >= 3.7
+- PaddleX >= 2.0.0
+
+- CUDA >= 10.1
+- cuDNN >= 7.6.4
+- nccl >= 2.1.2
+
+Install the Python dependencies:
+
+The Python dependencies are listed in [requirements.txt](https://github.com/PaddlePaddle/PaddleVideo/blob/master/requirements.txt) and can be installed with:
+
+```
+python3.7 -m pip install --upgrade pip
+pip3.7 install --upgrade -r requirements.txt
+```
+
+Use `paddle.distributed.launch` to start the training and testing script (`main.py`); it makes multi-GPU training and testing easier to launch. Alternatively, run `./run.sh` directly:
+
+```shell
+sh run.sh
+```
+All standard launch commands are collected in `run.sh`; pick the one you want to run.
+
+Start training as follows. `paddle.distributed.launch` selects the GPU cards via `gpus`, and `--validate` enables evaluation during training.
+
+```bash
+# PaddleVideo launches multi-GPU, multi-process training via launch
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    --log_dir=log_pptsm \
+    main.py \
+    --amp \
+    --validate \
+    -c ./configs/recognition/tsm/pptsm_regression.yaml
+```
+
+Here `-c` specifies the path of the config file; training settings can be changed in that file, or overridden on the command line with `-o`:
+
+```bash
+python -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    main.py \
+    -c ./configs/recognition/tsm/pptsm_regression.yaml \
+    --validate \
+    -o DATASET.batch_size=16
+```
+`-o` specifies the parameters to modify or add; for example, `-o DATASET.batch_size=16` changes the batch size to 16.
+
+Running the commands above prints the training log, saved under ./log by default, e.g. `worker.0`, `worker.1` ...; each worker log file corresponds to the output of one card.
+
+[train stage] prints the current time, current epoch / total epochs, current batch id, metrics, time cost, ips, etc.:
+
+
+    [11/16 04:40:37] epoch:[ 1/1 ] train step:100 loss: 5.31382 lr: 0.000250 batch_cost: 0.73082 sec, reader_cost: 0.38075 sec, ips: 5.47330 instance/sec.
+
+
+[eval stage] prints the current time, current epoch / total epochs, current batch id, metrics, time cost, ips, etc.:
+
+
+    [11/16 04:40:37] epoch:[ 1/1 ] val step:0 loss: 4.42741 batch_cost: 1.37882 sec, reader_cost: 0.00000 sec, ips: 2.90104 instance/sec.
+
+
+[end of epoch] prints the current time, learning rate, metrics, time cost, ips, etc.:
+
+
+    [11/16 04:40:37] lr=0.00012487
+    [11/16 04:40:37] train_SROCC=0.4456697876616565
+    [11/16 04:40:37] train_PLCC=0.48071880604403616
+    [11/16 04:40:37] END epoch:1 val loss_avg: 5.21620 avg_batch_cost: 0.04321 sec, avg_reader_cost: 0.00000 sec, batch_cost_sum: 112.69575 sec, avg_ips: 8.41203 instance/sec.
+
+
+When the current epoch gives the best evaluation result so far, the best accuracy is printed:
+
+    [11/16 04:40:57] max_SROCC=0.7116468111328617
+    [11/16 04:40:57] max_PLCC=0.733503995526737
+
+### Resume Training
+
+If the training job is interrupted, training can be resumed from the checkpoint files (optimizer/learning-rate state and model checkpoint).
+Specify `-o resume_epoch`, which means training resumes from epoch `resume_epoch`.
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    main.py \
+    --amp \
+    -c ./configs/recognition/tsm/pptsm_regression.yaml \
+    --validate \
+    -o resume_epoch=5
+
+```
+
+### Finetuning
+
+To finetune the model on a custom dataset, specify `--weights` to load the pretrained model.
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    main.py \
+    --amp \
+    -c ./configs/recognition/tsm/pptsm_regression.yaml \
+    --validate \
+    --weights=./output/model_name/ppTSM_best.pdparams
+```
+
+PaddleVideo automatically **skips loading** parameters whose shapes do not match.
+
+## Testing
+
+Specify `--test` to switch to test mode, and `--weights` to load the pretrained model.
+
+```bash
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    main.py \
+    -c ./configs/recognition/tsm/pptsm_regression.yaml \
+    --test \
+    --weights=./output/model_name/ppTSM_best.pdparams
+```
+
+## Optimization
+
+In real scenarios the following strategies can be tried, depending on video quality and resolution.
+
+- Replacing the RandomCrop:224 operation with original-resolution input raises accuracy from SROCC=0.8176, PLCC=0.8361 to SROCC=0.8617, PLCC=0.8910. A comparison of models and augmentation settings is shown in the table below.
+
+  | Model  | Augmentation                                | val_SROCC | val_PLCC |
+  | :----: | :-----------------------------------------: | :-------: | :------: |
+  | GSTVQA | original-resolution input                   | 0.7932    | 0.8006   |
+  | ppTSM  | train--RandomCrop=224 val--center_crop=224  | 0.8176    | 0.8361   |
+  | ppTSM  | train--RandomCrop=512 val--center_crop=512  | 0.8603    | 0.8822   |
+  | ppTSM  | original-resolution input                   | 0.8617    | 0.8910   |
+
+
+
+- Since the aspect ratio of videos in the target scenarios is mostly 16:9 or 4:3, and to avoid the distortion introduced by non-uniform scaling, an input size of (224x3)x(224x2)=672x448 can be used to make fuller use of the limited input resolution.
+
+## Deployment
+
+On the official validation set (KonVid-150k-B), this solution reaches SROCC=0.8176, PLCC=0.8361.
+
+## References
+
+- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han
+
+- [Quality Assessment of In-the-Wild Videos](https://dl.acm.org/citation.cfm?doid=3343031.3351028), Dingquan Li, Tingting Jiang, and Ming Jiang
+
diff --git a/docs/src/applications/VideoQualityAssessment/main.py b/docs/src/applications/VideoQualityAssessment/main.py
new file mode 100644
index 000000000..1d68e173c
--- /dev/null
+++ b/docs/src/applications/VideoQualityAssessment/main.py
@@ -0,0 +1,88 @@
+"""
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +import paddle +import argparse +from paddlevideo.utils import get_config +from paddlevideo.tasks import train_model, test_model +from paddlevideo.utils import get_dist_info + + +def parse_args(): + """parse_args""" + parser = argparse.ArgumentParser("PaddleVideo train script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument('-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument('--test', + action='store_true', + help='whether to test a model') + parser.add_argument('--train_dali', + action='store_true', + help='whether to use dali to speed up training') + parser.add_argument('--multigrid', + action='store_true', + help='whether to use multigrid training') + parser.add_argument('-w', + '--weights', + type=str, + help='weights for finetuning or testing') + parser.add_argument('--fleet', + action='store_true', + help='whether to use fleet run distributed training') + parser.add_argument('--amp', + action='store_true', + help='whether to open amp training.') + + parser.add_argument( + '--validate', + action='store_true', + help='whether to evaluate the checkpoint during training') + + args = parser.parse_args() + return args + + +def main(): + """main""" + args = parse_args() + cfg = get_config(args.config, overrides=args.override) + + _, world_size = get_dist_info() + parallel = world_size != 1 + if parallel: + paddle.distributed.init_parallel_env() + + if args.test: + test_model(cfg, weights=args.weights, parallel=parallel) + else: + train_model(cfg, + weights=args.weights, + parallel=parallel, + validate=args.validate, + fleet=args.fleet, + amp=args.amp) + + +if __name__ == '__main__': + main() diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/__init__.py new file mode 100644 index 000000000..49a4637b2 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/__init__.py @@ -0,0 +1,17 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .version import paddlevideo_version diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py new file mode 100644 index 000000000..a3f8fa24c --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/__init__.py @@ -0,0 +1,21 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .builder import build_dataset, build_dataloader, build_batch_pipeline +from .dataset import VideoDataset +__all__ = [ + 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset' +] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py new file mode 100644 index 000000000..6dc1d74bd --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/builder.py @@ -0,0 +1,126 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import signal +import os +import paddle +from paddle.io import DataLoader, DistributedBatchSampler +from .registry import DATASETS, PIPELINES +from ..utils.build_utils import build +from .pipelines.compose import Compose +from paddlevideo.utils import get_logger +import numpy as np + +logger = get_logger("paddlevideo") + + +def build_pipeline(cfg): + """Build pipeline. + Args: + cfg (dict): root config dict. + """ + return Compose(cfg) + + +def build_dataset(cfg): + """Build dataset. + Args: + cfg (dict): root config dict. + + Returns: + dataset: dataset. + """ + #XXX: ugly code here! + cfg_dataset, cfg_pipeline = cfg + cfg_dataset.pipeline = build_pipeline(cfg_pipeline) + dataset = build(cfg_dataset, DATASETS, key="format") + return dataset + + +def build_batch_pipeline(cfg): + """build batch pipeline""" + batch_pipeline = build(cfg, PIPELINES) + return batch_pipeline + + +def build_dataloader(dataset, + batch_size, + num_workers, + places, + shuffle=True, + drop_last=True, + multigrid=False, + collate_fn_cfg=None, + **kwargs): + """Build Paddle Dataloader. + + XXX explain how the dataloader work! + + Args: + dataset (paddle.dataset): A PaddlePaddle dataset object. + batch_size (int): batch size on single card. + num_worker (int): num_worker + shuffle(bool): whether to shuffle the data at every epoch. + """ + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + + #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix. + # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to: + # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose. 
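+    # mix_collate_fn below first runs the configured mix operator (e.g. Mixup
+    # or Cutmix) on the raw batch, then performs that recollation with
+    # np.stack, so each field of the batch becomes a single numpy array.
+    # When no collate_fn_cfg is given, the DataLoader's default collation is used.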
+ + def mix_collate_fn(batch): + """mix collate fn""" + pipeline = build_batch_pipeline(collate_fn_cfg) + batch = pipeline(batch) + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + #if collate_fn_cfg is not None: + #ugly code here. collate_fn is mix op config + # collate_fn = mix_collate_fn(collate_fn_cfg) + + data_loader = DataLoader( + dataset, + batch_sampler=sampler, + places=places, + num_workers=num_workers, + collate_fn=mix_collate_fn if collate_fn_cfg is not None else None, + return_list=True, + **kwargs) + + return data_loader + + +def term_mp(sig_num, frame): + """ kill all child processes + """ + pid = os.getpid() + pgid = os.getpgid(os.getpid()) + logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid)) + os.killpg(pgid, signal.SIGKILL) + return + + +signal.signal(signal.SIGINT, term_mp) +signal.signal(signal.SIGTERM, term_mp) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py new file mode 100644 index 000000000..2de277e1d --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/__init__.py @@ -0,0 +1,21 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .video import VideoDataset +#from .frame import FrameDataset +from .frame_rec import FrameRecDataset + +__all__ = ['VideoDataset', 'FrameRecDataset'] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py new file mode 100644 index 000000000..9400aca56 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/base.py @@ -0,0 +1,83 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os.path as osp +import copy +import numpy as np +from abc import ABC, abstractmethod + +import paddle +from paddle.io import Dataset + + +class BaseDataset(Dataset, ABC): + """Base class for datasets + + All datasets should subclass it. + All subclass should overwrite: + + - Method: `load_file`, load info from index file. + - Method: `prepare_train`, providing train data. 
+    - Method: `prepare_test`, providing test data.
+
+    Args:
+        file_path (str): index file path.
+        pipeline (Sequence XXX)
+        data_prefix (str): directory path of the data. Default: None.
+        test_mode (bool): whether to build test dataset. Default: False.
+
+    """
+    def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):
+
+        super().__init__()
+        self.file_path = file_path
+        self.data_prefix = osp.realpath(data_prefix) if \
+            data_prefix is not None and osp.isdir(data_prefix) else data_prefix
+        self.test_mode = test_mode
+        self.pipeline = pipeline
+        self.info = self.load_file()
+
+    @abstractmethod
+    def load_file(self):
+        """load the video information from the index file path."""
+        pass
+
+    def prepare_train(self, idx):
+        """TRAIN & VALID. Prepare the data for training/valid given the index."""
+        #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
+        results = copy.deepcopy(self.info[idx])
+        results = self.pipeline(results)
+        #unsqueeze label to list
+        return results['imgs'], np.array([results['labels']])
+
+    def prepare_test(self, idx):
+        """TEST: Prepare the data for test given the index."""
+        #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
+        results = copy.deepcopy(self.info[idx])
+        results = self.pipeline(results)
+        #unsqueeze label to list
+        return results['imgs'], np.array([results['labels']])
+
+    def __len__(self):
+        """get the size of the dataset."""
+        return len(self.info)
+
+    def __getitem__(self, idx):
+        """ Get the sample for either training or testing given index"""
+        if self.test_mode:
+            return self.prepare_test(idx)
+        else:
+            return self.prepare_train(idx)
diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py
new file mode 100644
index 000000000..de2dab63a
--- /dev/null
+++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/frame_rec.py
@@ -0,0 +1,110 @@
+"""
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class FrameRecDataset(BaseDataset):
+    """Rawframe dataset for action recognition.
+    The dataset loads raw frames from frame files, and applies the specified transform operations on them.
+    The index file is a text file with multiple lines, and each line indicates the frame directory of a video, the total number of frames of the video, and its label, separated by a whitespace.
+    Example of an index file:
+
+    .. code-block:: txt
+
+        file_path-1 150 1
+        file_path-2 160 1
+        file_path-3 170 2
+        file_path-4 180 2
+
+    Args:
+        file_path (str): Path to the index file.
+        pipeline(XXX):
+        data_prefix (str): directory path of the data. Default: None.
+        test_mode (bool): Whether to build the test dataset. Default: False.
+        suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
+
+    """
+    def __init__(self,
+                 file_path,
+                 pipeline,
+                 num_retries=5,
+                 data_prefix=None,
+                 test_mode=False,
+                 suffix='img_{:05}.jpg'):
+        self.num_retries = num_retries
+        self.suffix = suffix
+        super().__init__(file_path, pipeline, data_prefix, test_mode)
+
+    def load_file(self):
+        """Load index file to get video information."""
+        info = []
+        with open(self.file_path, 'r') as fin:
+            for line in fin:
+                line_split = line.strip().split()
+                mp4_path, frame_dir, frames_len, labels = line_split
+
+                if self.data_prefix is not None:
+                    frame_dir = osp.join(self.data_prefix, frame_dir)
+                info.append(
+                    dict(frame_dir=frame_dir,
+                         suffix=self.suffix,
+                         frames_len=frames_len,
+                         labels=float(labels)))
+        return info
+
+    def prepare_train(self, idx):
+        """Prepare the frames for training/valid given index. """
+        #Try to catch Exception caused by reading missing frames files
+        for ir in range(self.num_retries):
+            try:
+                results = copy.deepcopy(self.info[idx])
+                results = self.pipeline(results)
+            except Exception as e:
+                logger.info(e)
+                if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+                idx = random.randint(0, len(self.info) - 1)
+                continue
+            return results['imgs'], np.array([results['labels']])
+
+    def prepare_test(self, idx):
+        """Prepare the frames for test given index. """
+        #Try to catch Exception caused by reading missing frames files
+        for ir in range(self.num_retries):
+            try:
+                results = copy.deepcopy(self.info[idx])
+                results = self.pipeline(results)
+            except Exception as e:
+                logger.info(e)
+                if ir < self.num_retries - 1:
+                    logger.info(
+                        "Error when loading {}, have tried {} times, will try again".
+                        format(results['frame_dir'], ir))
+                idx = random.randint(0, len(self.info) - 1)
+                continue
+            return results['imgs'], np.array([results['labels']])
diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py
new file mode 100644
index 000000000..289e526f5
--- /dev/null
+++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/dataset/video.py
@@ -0,0 +1,95 @@
+"""
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import os.path as osp
+import copy
+import random
+import numpy as np
+
+from ..registry import DATASETS
+from .base import BaseDataset
+from ...utils import get_logger
+logger = get_logger("paddlevideo")
+
+
+@DATASETS.register()
+class VideoDataset(BaseDataset):
+    """Video dataset for action recognition
+    The dataset loads raw videos and applies specified transforms on them.
+    The index file is a file with multiple lines, and each line indicates
+    a sample video with the filepath and label, which are separated by a whitespace.
+    Example of an index file:
+    ..
code-block:: txt + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + def __init__(self, file_path, pipeline, num_retries=5, **kwargs): + self.num_retries = num_retries + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + filename, labels = line_split + #TODO(hj): Required suffix format: may mp4/avi/wmv + filename = filename + '.avi' + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + info.append(dict(filename=filename, labels=int(labels))) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py new file mode 100644 index 000000000..b1fd63e31 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/__init__.py @@ -0,0 +1,50 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from .augmentations import ( + Scale, + RandomCrop, + CenterCrop, + RandomFlip, + Image2Array, + Normalization, + JitterScale, + MultiCrop, + PackOutput, +) + +from .compose import Compose +from .decode import VideoDecoder, FrameDecoder +from .sample import Sampler +from .mix import Mixup, Cutmix + +__all__ = [ + 'Scale', + 'RandomCrop', + 'CenterCrop', + 'RandomFlip', + 'Image2Array', + 'Normalization', + 'Compose', + 'VideoDecoder', + 'FrameDecoder', + 'Sampler', + 'Mixup', + 'Cutmix', + 'JitterScale', + 'MultiCrop', + 'PackOutput', +] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py new file mode 100644 index 000000000..bee6b3281 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/augmentations.py @@ -0,0 +1,498 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import random +import numpy as np +import math +from PIL import Image +from ..registry import PIPELINES +from collections.abc import Sequence + + +@PIPELINES.register() +class Scale(object): + """ + Scale images. + Args: + short_size(float | int): Short size of an image will be scaled to the short_size. + """ + def __init__(self, short_size): + self.short_size = short_size + + def __call__(self, results): + """ + Performs resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + imgs = results['imgs'] + resized_imgs = [] + for i in range(len(imgs)): + img = imgs[i] + w, h = img.size + if (w <= h and w == self.short_size) or (h <= w + and h == self.short_size): + resized_imgs.append(img) + continue + + if w < h: + ow = self.short_size + oh = int(self.short_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + else: + oh = self.short_size + ow = int(self.short_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + results['imgs'] = resized_imgs + return results + + +@PIPELINES.register() +class RandomCrop(object): + """ + Random crop images. + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + w, h = imgs[0].size + th, tw = self.target_size, self.target_size + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size {}".format( + w, h, self.target_size) + + crop_images = [] + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + + for img in imgs: + if w == tw and h == th: + crop_images.append(img) + else: + crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = crop_images + return results + + +@PIPELINES.register() +class CenterCrop(object): + """ + Center crop images. + Args: + target_size(int): Center crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs Center crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + ccrop_imgs: List where each item is a PIL.Image after Center crop. + """ + imgs = results['imgs'] + ccrop_imgs = [] + for img in imgs: + w, h = img.size + th, tw = self.target_size, self.target_size + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size {}".format( + w, h, self.target_size) + x1 = int(round((w - tw) / 2.)) + y1 = int(round((h - th) / 2.)) + ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = ccrop_imgs + return results + + +@PIPELINES.register() +class MultiScaleCrop(object): + def __init__( + self, + target_size, #NOTE: named target size now, but still pass short size in it! + scales=None, + max_distort=1, + fix_crop=True, + more_fix_crop=True): + self.target_size = target_size + self.scales = scales if scales else [1, .875, .75, .66] + self.max_distort = max_distort + self.fix_crop = fix_crop + self.more_fix_crop = more_fix_crop + + def __call__(self, results): + """ + Performs MultiScaleCrop operations. + Args: + imgs: List where wach item is a PIL.Image. 
+ XXX: + results: + + """ + imgs = results['imgs'] + + input_size = [self.target_size, self.target_size] + + im_size = imgs[0].size + + # get random crop offset + def _sample_crop_size(im_size): + image_w, image_h = im_size[0], im_size[1] + + base_size = min(image_w, image_h) + crop_sizes = [int(base_size * x) for x in self.scales] + crop_h = [ + input_size[1] if abs(x - input_size[1]) < 3 else x + for x in crop_sizes + ] + crop_w = [ + input_size[0] if abs(x - input_size[0]) < 3 else x + for x in crop_sizes + ] + + pairs = [] + for i, h in enumerate(crop_h): + for j, w in enumerate(crop_w): + if abs(i - j) <= self.max_distort: + pairs.append((w, h)) + crop_pair = random.choice(pairs) + if not self.fix_crop: + w_offset = random.randint(0, image_w - crop_pair[0]) + h_offset = random.randint(0, image_h - crop_pair[1]) + else: + w_step = (image_w - crop_pair[0]) / 4 + h_step = (image_h - crop_pair[1]) / 4 + + ret = list() + ret.append((0, 0)) # upper left + if w_step != 0: + ret.append((4 * w_step, 0)) # upper right + if h_step != 0: + ret.append((0, 4 * h_step)) # lower left + if h_step != 0 and w_step != 0: + ret.append((4 * w_step, 4 * h_step)) # lower right + if h_step != 0 or w_step != 0: + ret.append((2 * w_step, 2 * h_step)) # center + + if self.more_fix_crop: + ret.append((0, 2 * h_step)) # center left + ret.append((4 * w_step, 2 * h_step)) # center right + ret.append((2 * w_step, 4 * h_step)) # lower center + ret.append((2 * w_step, 0 * h_step)) # upper center + + ret.append((1 * w_step, 1 * h_step)) # upper left quarter + ret.append((3 * w_step, 1 * h_step)) # upper right quarter + ret.append((1 * w_step, 3 * h_step)) # lower left quarter + ret.append((3 * w_step, 3 * h_step)) # lower righ quarter + + w_offset, h_offset = random.choice(ret) + + return crop_pair[0], crop_pair[1], w_offset, h_offset + + crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size) + crop_img_group = [ + img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) + for img in imgs + ] + ret_img_group = [ + img.resize((input_size[0], input_size[1]), Image.BILINEAR) + for img in crop_img_group + ] + results['imgs'] = ret_img_group + return results + + +@PIPELINES.register() +class RandomFlip(object): + """ + Random Flip images. + Args: + p(float): Random flip images with the probability p. + """ + def __init__(self, p=0.5): + self.p = p + + def __call__(self, results): + """ + Performs random flip operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + flip_imgs: List where each item is a PIL.Image after random flip. + """ + imgs = results['imgs'] + v = random.random() + if v < self.p: + results['imgs'] = [ + img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs + ] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class Image2Array(object): + """ + transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'. + Args: + transpose: whether to transpose or not, default True, False for slowfast. + """ + def __init__(self, transpose=True): + self.transpose = transpose + + def __call__(self, results): + """ + Performs Image to NumpyArray operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + np_imgs: Numpy array. 
+ """ + imgs = results['imgs'] + np_imgs = (np.stack(imgs)).astype('float32') + if self.transpose: + np_imgs = np_imgs.transpose(0, 3, 1, 2) #nchw + results['imgs'] = np_imgs + return results + + +@PIPELINES.register() +class Normalization(object): + """ + Normalization. + Args: + mean(Sequence[float]): mean values of different channels. + std(Sequence[float]): std values of different channels. + tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3] + """ + def __init__(self, mean, std, tensor_shape=[3, 1, 1]): + if not isinstance(mean, Sequence): + raise TypeError( + 'Mean must be list, tuple or np.ndarray, but got {type(mean)}') + if not isinstance(std, Sequence): + raise TypeError( + 'Std must be list, tuple or np.ndarray, but got {type(std)}') + self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32) + self.std = np.array(std).reshape(tensor_shape).astype(np.float32) + + def __call__(self, results): + """ + Performs normalization operations. + Args: + imgs: Numpy array. + return: + np_imgs: Numpy array after normalization. + """ + imgs = results['imgs'] + norm_imgs = imgs / 255. + norm_imgs -= self.mean + norm_imgs /= self.std + results['imgs'] = norm_imgs + return results + + +@PIPELINES.register() +class JitterScale(object): + """ + Scale image, while the target short size is randomly select between min_size and max_size. + Args: + min_size: Lower bound for random sampler. + max_size: Higher bound for random sampler. + """ + def __init__(self, + min_size, + max_size, + short_cycle_factors=[0.5, 0.7071], + default_min_size=256): + self.default_min_size = default_min_size + self.orig_min_size = self.min_size = min_size + self.max_size = max_size + self.short_cycle_factors = short_cycle_factors + + def __call__(self, results): + """ + Performs jitter resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.min_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_min_size)) + else: + self.min_size = self.orig_min_size + + imgs = results['imgs'] + size = int(round(np.random.uniform(self.min_size, self.max_size))) + assert (len(imgs) >= 1) , \ + "len(imgs):{} should be larger than 1".format(len(imgs)) + width, height = imgs[0].size + if (width <= height and width == size) or (height <= width + and height == size): + return results + + new_width = size + new_height = size + if width < height: + new_height = int(math.floor((float(height) / width) * size)) + else: + new_width = int(math.floor((float(width) / height) * size)) + + frames_resize = [] + for j in range(len(imgs)): + img = imgs[j] + scale_img = img.resize((new_width, new_height), Image.BILINEAR) + frames_resize.append(scale_img) + + results['imgs'] = frames_resize + return results + + +@PIPELINES.register() +class MultiCrop(object): + """ + Random crop image. + This operation can perform multi-crop during multi-clip test, as in slowfast model. + Args: + target_size(int): Random crop a square with the target_size from an image. 
+ """ + def __init__(self, + target_size, + default_crop_size=224, + short_cycle_factors=[0.5, 0.7071], + test_mode=False): + self.orig_target_size = self.target_size = target_size + self.short_cycle_factors = short_cycle_factors + self.default_crop_size = default_crop_size + self.test_mode = test_mode + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. + """ + imgs = results['imgs'] + spatial_sample_index = results['spatial_sample_index'] + spatial_num_clips = results['spatial_num_clips'] + + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.target_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_crop_size)) + else: + self.target_size = self.orig_target_size # use saved value before call + + w, h = imgs[0].size + if w == self.target_size and h == self.target_size: + return results + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size({},{})".format(w, h, self.target_size, self.target_size) + frames_crop = [] + if not self.test_mode: + x_offset = random.randint(0, w - self.target_size) + y_offset = random.randint(0, h - self.target_size) + else: #multi-crop + x_gap = int( + math.ceil((w - self.target_size) / (spatial_num_clips - 1))) + y_gap = int( + math.ceil((h - self.target_size) / (spatial_num_clips - 1))) + if h > w: + x_offset = int(math.ceil((w - self.target_size) / 2)) + if spatial_sample_index == 0: + y_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + y_offset = h - self.target_size + else: + y_offset = y_gap * spatial_sample_index + else: + y_offset = int(math.ceil((h - self.target_size) / 2)) + if spatial_sample_index == 0: + x_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + x_offset = w - self.target_size + else: + x_offset = x_gap * spatial_sample_index + + for img in imgs: + nimg = img.crop((x_offset, y_offset, x_offset + self.target_size, + y_offset + self.target_size)) + frames_crop.append(nimg) + results['imgs'] = frames_crop + return results + + +@PIPELINES.register() +class PackOutput(object): + """ + In slowfast model, we want to get slow pathway from fast pathway based on + alpha factor. + Args: + alpha(int): temporal length of fast/slow + """ + def __init__(self, alpha): + self.alpha = alpha + + def __call__(self, results): + fast_pathway = results['imgs'] + + # sample num points between start and end + slow_idx_start = 0 + slow_idx_end = fast_pathway.shape[0] - 1 + slow_idx_num = fast_pathway.shape[0] // self.alpha + slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end, + slow_idx_num).astype("int64") + slow_pathway = fast_pathway[slow_idxs_select] + + # T H W C -> C T H W. 
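+        # (the slow pathway keeps only len(frames) // alpha uniformly spaced
+        # frames selected above; both pathways are converted to channel-first
+        # clips and returned as the [slow, fast] list expected by SlowFast)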
+ slow_pathway = slow_pathway.transpose(3, 0, 1, 2) + fast_pathway = fast_pathway.transpose(3, 0, 1, 2) + + # slow + fast + frames_list = [slow_pathway, fast_pathway] + results['imgs'] = frames_list + return results diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py new file mode 100644 index 000000000..ef0c1d008 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/compose.py @@ -0,0 +1,79 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from collections.abc import Sequence +from ..registry import PIPELINES +import traceback +from ...utils import build +from ...utils import get_logger + + +@PIPELINES.register() +class Compose(object): + """ + Composes several pipelines(include decode func, sample func, and transforms) together. + + Note: To deal with ```list``` type cfg temporaray, like: + + transform: + - Crop: # A list + attribute: 10 + - Resize: # A list + attribute: 20 + + every key of list will pass as the key name to build a module. + XXX: will be improved in the future. + + Args: + pipelines (list): List of transforms to compose. + Returns: + A compose object which is callable, __call__ for this Compose + object will call each given :attr:`transforms` sequencely. + """ + def __init__(self, pipelines): + #assert isinstance(pipelines, Sequence) + self.pipelines = [] + for p in pipelines.values(): + if isinstance(p, dict): + p = build(p, PIPELINES) + self.pipelines.append(p) + elif isinstance(p, list): + for t in p: + #XXX: to deal with old format cfg, ugly code here! + temp_dict = dict(name=list(t.keys())[0]) + for all_sub_t in t.values(): + if all_sub_t is not None: + temp_dict.update(all_sub_t) + + t = build(temp_dict, PIPELINES) + self.pipelines.append(t) + elif callable(p): + self.pipelines.append(p) + else: + raise TypeError('pipelines must be callable or a dict,' + 'but got {type(p)}') + def __call__(self, data): + """call""" + for p in self.pipelines: + try: + data = p(data) + except Exception as e: + stack_info = traceback.format_exc() + logger = get_logger("paddlevideo") + logger.info("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(p, e, str(stack_info))) + raise e + return data diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py new file mode 100644 index 000000000..b8e749aff --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/decode.py @@ -0,0 +1,165 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import sys +from io import BytesIO +import os +import random + +import numpy as np +import pickle +import cv2 + +from ..registry import PIPELINES + + +@PIPELINES.register() +class VideoDecoder(object): + """ + Decode mp4 file to frames. + Args: + filepath: the file path of mp4 file + """ + def __init__(self): + pass + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. + """ + #XXX get info from results!!! + file_path = results['filename'] + cap = cv2.VideoCapture(file_path) + videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + sampledFrames = [] + for i in range(videolen): + ret, frame = cap.read() + # maybe first frame is empty + if ret == False: + continue + img = frame[:, :, ::-1] + sampledFrames.append(img) + results['frames'] = sampledFrames + results['frames_len'] = len(sampledFrames) + results['format'] = 'video' + return results + + +@PIPELINES.register() +class FrameDecoder(object): + """just parse results + """ + def __init__(self): + pass + + def __call__(self, results): + results['format'] = 'frame' + return results + + +@PIPELINES.register() +class FeatureDecoder(object): + """ + Perform feature decode operations.e.g.youtube8m + """ + def __init__(self, num_classes, max_len=512, has_label=True): + self.max_len = max_len + self.num_classes = num_classes + self.has_label = has_label + + def __call__(self, results): + """ + Perform feature decode operations. + return: + List where each item is a numpy array after decoder. + """ + #1. load pkl + #2. parse to rgb/audio/ + #3. padding + + filepath = results['filename'] + data = pickle.load(open(filepath, 'rb'), encoding='bytes') + + record = data + nframes = record[b'nframes'] + rgb = record[b'feature'].astype(float) + audio = record[b'audio'].astype(float) + if self.has_label: + label = record[b'label'] + one_hot_label = self.make_one_hot(label, self.num_classes) + + rgb = rgb[0:nframes, :] + audio = audio[0:nframes, :] + + rgb = self.dequantize(rgb, + max_quantized_value=2., + min_quantized_value=-2.) + audio = self.dequantize(audio, + max_quantized_value=2, + min_quantized_value=-2) + + if self.has_label: + results['labels'] = one_hot_label.astype("float32") + + feat_pad_list = [] + feat_len_list = [] + mask_list = [] + vitem = [rgb, audio] + for vi in range(2): #rgb and audio + if vi == 0: + prefix = "rgb_" + else: + prefix = "audio_" + feat = vitem[vi] + results[prefix + 'len'] = feat.shape[0] + #feat pad step 1. padding + feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + results[prefix + 'data'] = feat_pad.astype("float32") + #feat pad step 2. 
mask + feat_mask_origin = np.ones(feat.shape, dtype=np.float32) + feat_mask_add = feat_add + feat_mask = np.concatenate((feat_mask_origin, feat_mask_add), + axis=0) + results[prefix + 'mask'] = feat_mask.astype("float32") + + return results + + def dequantize(self, + feat_vector, + max_quantized_value=2., + min_quantized_value=-2.): + """ + Dequantize the feature from the byte format to the float format + """ + + assert max_quantized_value > min_quantized_value + quantized_range = max_quantized_value - min_quantized_value + scalar = quantized_range / 255.0 + bias = (quantized_range / 512.0) + min_quantized_value + + return feat_vector * scalar + bias + + def make_one_hot(self, label, dim=3862): + """make one hot""" + one_hot_label = np.zeros(dim) + one_hot_label = one_hot_label.astype(float) + for ind in label: + one_hot_label[int(ind)] = 1 + return one_hot_label diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py new file mode 100644 index 000000000..fe7a8073b --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/mix.py @@ -0,0 +1,91 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import random +import numpy as np +from ..registry import PIPELINES + + +@PIPELINES.register() +class Mixup(object): + """ + Mixup operator. + Args: + alpha(float): alpha value. + """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + lams = np.array([lam] * bs, dtype=np.float32) + imgs = lam * imgs + (1 - lam) * imgs[idx] + return list(zip(imgs, labels, labels[idx], lams)) + + +@PIPELINES.register() +class Cutmix(object): + """ Cutmix operator + Args: + alpha(float): alpha value. + """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def rand_bbox(self, size, lam): + """ rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam) + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + + bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam) + imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2] + lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) / + (imgs.shape[-2] * imgs.shape[-1])) + lams = np.array([lam] * bs, dtype=np.float32) + + return list(zip(imgs, labels, labels[idx], lams)) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py new file mode 100644 index 000000000..6990f9eb0 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/pipelines/sample.py @@ -0,0 +1,102 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import random +from PIL import Image +from ..registry import PIPELINES +import os +import numpy as np + +@PIPELINES.register() +class Sampler(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + mode(str): 'train', 'valid' + Returns: + frames_idx: the index of sampled #frames. + """ + + def __init__(self, num_seg, seg_len, valid_mode=False): + self.num_seg = num_seg + self.seg_len = seg_len + self.valid_mode = valid_mode + + + def _get(self, frames_idx, results): + data_format =results['format'] + + if data_format == "frame": + frame_dir = results['frame_dir'] + imgs = [] + for idx in frames_idx: + img = Image.open(os.path.join(frame_dir, results['suffix'].format(idx))).convert('RGB') + imgs.append(img) + + elif data_format == "video": + + frames = np.array(results['frames']) + imgs = [] + for idx in frames_idx: + imgbuf = frames[idx] + img = Image.fromarray(imgbuf, mode='RGB') + imgs.append(img) + else: + raise NotImplementedError + results['imgs'] = imgs + return results + + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. 
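# Illustrative sketch (this rand_bbox is a hypothetical numpy re-expression of
# the method above, not a drop-in replacement): Cutmix pastes a random box from
# a permuted batch, then recomputes lam from the area actually kept. Note that
# np.int, used above, was removed in NumPy 1.24; plain int() is the safe
# spelling. Mixup, by contrast, blends whole images and keeps lam as drawn.

import numpy as np

def rand_bbox(w, h, lam, rng):
    cut_rat = np.sqrt(1.0 - lam)
    cut_w, cut_h = int(w * cut_rat), int(h * cut_rat)
    cx, cy = rng.integers(w), rng.integers(h)                # uniform box centre
    x1, y1 = np.clip(cx - cut_w // 2, 0, w), np.clip(cy - cut_h // 2, 0, h)
    x2, y2 = np.clip(cx + cut_w // 2, 0, w), np.clip(cy + cut_h // 2, 0, h)
    return x1, y1, x2, y2

rng = np.random.default_rng(0)
w = h = 32
x1, y1, x2, y2 = rand_bbox(w, h, lam=0.7, rng=rng)
lam_adj = 1 - (x2 - x1) * (y2 - y1) / (w * h)                # fraction of original pixels kept
assert 0.0 <= lam_adj <= 1.0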
+ """ + frames_len = int(results['frames_len']) + average_dur = int(int(frames_len) / self.num_seg) + frames_idx = [] + for i in range(self.num_seg): + idx = 0 + if not self.valid_mode: + if average_dur >= self.seg_len: + idx = random.randint(0, average_dur - self.seg_len) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: # average_dur = 0 + idx = i % frames_len + else: + if average_dur >= self.seg_len: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i % frames_len + for jj in range(idx, idx+self.seg_len): + if results['format'] == 'video': + frames_idx.append(int(jj%frames_len)) + elif results['format'] == 'frame': + #frame from 000001 + frames_idx.append(jj+1) + else: + raise NotImplementedError + + return self._get(frames_idx, results) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/registry.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/registry.py new file mode 100644 index 000000000..88948acd0 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/loader/registry.py @@ -0,0 +1,20 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from ..utils import Registry + +PIPELINES = Registry("pipeline") +DATASETS = Registry("datasets") diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py new file mode 100644 index 000000000..ee41ae860 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/__init__.py @@ -0,0 +1,23 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .registry import METRIC +from .build import build_metric +from .quality_metric import QuqlityMetric + +__all__ = [ + 'METRIC', 'build_metric', 'QuqlityMetric' +] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/base.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/base.py new file mode 100644 index 000000000..b6e41bbce --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/base.py @@ -0,0 +1,39 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +""" + +from abc import abstractmethod +import numpy as np +import paddle +from paddlevideo.utils import get_dist_info + +from .registry import METRIC + + +class BaseMetric(object): + """Base Metric""" + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + self.data_size = data_size + self.batch_size = batch_size + _, self.world_size = get_dist_info() + self.log_interval = log_interval + + @abstractmethod + def update(self): + """update""" + raise NotImplementedError + + @abstractmethod + def accumulate(self): + """accumulate""" + raise NotImplementedError diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/build.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/build.py new file mode 100644 index 000000000..852fe1514 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/build.py @@ -0,0 +1,23 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .registry import METRIC +from ..utils import build + + +def build_metric(cfg): + """build metric""" + return build(cfg, METRIC) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py new file mode 100644 index 000000000..a4c50ad15 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/quality_metric.py @@ -0,0 +1,73 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
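# Illustrative sketch (MiniRegistry and DummyMetric are hypothetical, not the
# project's Registry/build utilities): METRIC, PIPELINES, BACKBONES and the
# rest all follow the same pattern, a name -> class map filled by a decorator,
# with build(cfg, registry) instantiating the class named by cfg['name'] (or
# another key such as 'framework') from the remaining config entries:

class MiniRegistry:
    def __init__(self, name):
        self.name, self._map = name, {}

    def register(self, cls):                  # used as a decorator
        self._map[cls.__name__] = cls
        return cls

    def build(self, cfg):
        cfg = dict(cfg)
        return self._map[cfg.pop('name')](**cfg)

METRICS = MiniRegistry('metric')

@METRICS.register
class DummyMetric:
    def __init__(self, data_size, batch_size):
        self.data_size, self.batch_size = data_size, batch_size

m = METRICS.build({'name': 'DummyMetric', 'data_size': 100, 'batch_size': 8})
assert isinstance(m, DummyMetric) and m.batch_size == 8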
+""" + +import numpy as np +import paddle +from paddle.hapi.model import _all_gather + +from scipy import stats + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger +logger = get_logger("paddlevideo") + + +@METRIC.register +class QuqlityMetric(BaseMetric): + """CenterCropQualityMetric""" + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.output = [] + self.label = [] + self.y_pred = np.zeros(data_size) + self.y_test = np.zeros(data_size) + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + labels = data[1] + + predict_output = paddle.tolist(outputs) + predict_label = paddle.tolist(labels) + predict_output_len = len(predict_output) + for i in range(predict_output_len): + self.output.append(predict_output[i][0]) + self.label.append(predict_label[i][0]) + + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + test_output_np = np.array(self.output) + test_label_np = np.array(self.label) + PLCC = stats.pearsonr(test_output_np, test_label_np)[0] + SROCC = stats.spearmanr(test_output_np, test_label_np)[0] + + logger.info('[TEST] finished, PLCC= {}, SROCC= {} '.format(PLCC, SROCC)) + + def accumulate_train(self, output, label): + """accumulate_train""" + output_np = np.array(output) + label_np = np.array(label) + PLCC = stats.pearsonr(output_np, label_np)[0] + SROCC = stats.spearmanr(output_np, label_np)[0] + return PLCC, SROCC + diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py new file mode 100644 index 000000000..24c74262e --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/metrics/registry.py @@ -0,0 +1,19 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from ..utils import Registry + +METRIC = Registry('metric') diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py new file mode 100644 index 000000000..a9c4e1a06 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/__init__.py @@ -0,0 +1,45 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .backbones import ResNet +from .builder import (build_backbone, build_head, build_recognizer, + build_localizer, build_loss) +from .heads import BaseHead, TSNHead, TSMRecHead +from .losses import SmoothL1Loss, L1Loss +from .framework.recognizers import BaseRecognizer, recognizer2d +from .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS +from .weight_init import weight_init_ + +__all__ = [ + 'BACKBONES', + 'HEADS', + 'RECOGNIZERS', + 'LOCALIZERS', + 'LOSSES', + 'build_recognizer', + 'build_localizer', + 'build_head', + 'build_backbone', + 'build_loss', + 'ResNet', + 'TSNHead', + 'BaseHead', + 'TSMRecHead', + 'BaseRecognizer', + 'Recognizer2d', + 'SmoothL1Loss', + 'L1Loss', +] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py new file mode 100644 index 000000000..aa9e591c8 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/__init__.py @@ -0,0 +1,20 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .resnet import ResNet +from .resnet_tweaks_tsm import ResNetTweaksTSM + +__all__ = ['ResNet', 'ResNetTweaksTSM'] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py new file mode 100644 index 000000000..a03b38c4b --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet.py @@ -0,0 +1,290 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import numpy as np +import math + +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. + + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels, + weight_attr=ParamAttr(name=bn_name + + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset")) + + def forward(self, inputs): + """forward""" + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + """BottleneckBlock""" + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + """forward""" + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + """BasicBlock""" + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = 
shortcut + + def forward(self, inputs): + """forward""" + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNet(nn.Layer): + """ResNet backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, depth, pretrained=None): + super(ResNet, self).__init__() + self.pretrained = pretrained + self.layers = depth + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = [64, 256, 512, 1024] + out_channels = [64, 128, 256, 512] + + self.conv = ConvBNLayer(in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + # NOTE: Be careful! Here is different from TSM model. + in_channels=in_channels[block] + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
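# Illustrative sketch (stage_widths is a hypothetical helper, not part of the
# backbone): the depth argument above only selects how many residual blocks go
# into each of the four stages, and bottleneck stages (depth >= 50) multiply
# their output width by 4. The resulting (blocks, output channels) layout:

blocks_per_stage = {18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3],
                    101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}
out_channels = [64, 128, 256, 512]

def stage_widths(layers):
    expansion = 4 if layers >= 50 else 1       # BottleneckBlock vs BasicBlock
    return [(n, c * expansion) for n, c in zip(blocks_per_stage[layers], out_channels)]

print(stage_widths(50))    # [(3, 256), (4, 512), (6, 1024), (3, 2048)]
print(stage_widths(18))    # [(2, 64), (2, 128), (2, 256), (2, 512)]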
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + y = self.conv(inputs) + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py new file mode 100644 index 000000000..d6af04eb9 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py @@ -0,0 +1,328 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import numpy as np +import math + +import sys +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils.save_load import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + is_tweaks_mode (bool): switch for tweaks. Default: False. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. 
+ + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels, + weight_attr=ParamAttr(name=bn_name + + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset")) + + def forward(self, inputs): + """forward""" + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + """BottleneckBlock""" + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + num_seg=8, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride= + 1, #ResNet-D 2/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. + is_tweaks_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + """forward""" + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + """BasicBlock""" + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + """forward""" + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTweaksTSM(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, depth, num_seg=8, pretrained=None): + super(ResNetTweaksTSM, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + #ResNet-C: use three 3x3 conv, replace, one 7x7 conv + self.conv1_1 = ConvBNLayer(in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % + (block, i), #same with PaddleClas, for loading pretrain + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py new file mode 100644 index 000000000..3007a5d56 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/builder.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS +from ..utils import build + + +def build_backbone(cfg): + """Build backbone.""" + return build(cfg, BACKBONES) + + +def build_head(cfg): + """Build head.""" + return build(cfg, HEADS) + + +def build_loss(cfg): + """Build loss.""" + return build(cfg, LOSSES) + + +def build_recognizer(cfg): + """Build recognizer.""" + return build(cfg, RECOGNIZERS, key='framework') + + +def build_localizer(cfg): + """Build localizer.""" + return build(cfg, LOCALIZERS, key='framework') + + +def build_model(cfg): + cfg_copy = cfg.copy() + framework_type = cfg_copy.get('framework') + if framework_type in RECOGNIZERS: + return build_recognizer(cfg) + elif framework_type in LOCALIZERS: + return build_localizer(cfg) + else: + raise NotImplementedError diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py new file mode 100644 index 000000000..f7977905e --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/__init__.py @@ -0,0 +1,22 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from .recognizers import BaseRecognizer, Recognizer2D + +__all__ = [ + 'BaseRecognizer', + 'Recognizer2D', +] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py new file mode 100644 index 000000000..95e4b1fb2 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/__init__.py @@ -0,0 +1,19 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +""" + +from .base import BaseRecognizer +from .recognizer2d import Recognizer2D + + +__all__ = ['BaseRecognizer', 'Recognizer2D'] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py new file mode 100644 index 000000000..aa13090de --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/base.py @@ -0,0 +1,97 @@ +""" +start +""" + +from abc import abstractmethod +from ... import builder +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class BaseRecognizer(nn.Layer): + """Base class for recognizers. + + All recognizers should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + + """ + def __init__(self, backbone=None, head=None): + + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + self.head.init_weights() + else: + self.head = None + + + def init_weights(self): + """Initialize the model network weights. """ + + self.backbone.init_weights( + ) #TODO: required? while backbone without base class + self.head.init_weights() + + def extract_feature(self, imgs): + """Extract features through a backbone. + + Args: + imgs (paddle.Tensor) : The input images. + + Returns: + feature (paddle.Tensor) : The extracted features. + """ + feature = self.backbone(imgs) + return feature + + def forward(self, imgs, **kwargs): + """Define how the model is going to run, from input to output. + """ + batches = imgs.shape[0] + num_segs = imgs.shape[1] + imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:])) + + if self.backbone is not None: + feature = self.extract_feature(imgs) + else: + feature = imgs + if self.head is not None: + cls_score = self.head(feature, num_segs) + else: + cls_score = None + + + return cls_score + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. 
+ """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py new file mode 100644 index 000000000..c33f21640 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/framework/recognizers/recognizer2d.py @@ -0,0 +1,52 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +""" + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer2D(BaseRecognizer): + """2D recognizer model framework.""" + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + #NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method. + + #labels = labels.squeeze() + #XXX: unsqueeze label to [label] ? + + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + #NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + imgs = data_batch[0] + cls_score = self(imgs) + return cls_score diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py new file mode 100644 index 000000000..011c71e03 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/__init__.py @@ -0,0 +1,21 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from .base import BaseHead +from .tsn_head import TSNHead +from .tsm_rec_head import TSMRecHead + +__all__ = ['BaseHead', 'TSNHead', 'TSMRecHead'] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py new file mode 100644 index 000000000..379cccb9c --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/base.py @@ -0,0 +1,143 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import numpy as np +from abc import abstractmethod + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..builder import build_loss +from paddlevideo.utils import get_logger, get_dist_info + +logger = get_logger("paddlevideo") + + +class BaseHead(nn.Layer): + """Base class for head part. + + All head should subclass it. + All subclass should overwrite: + + - Methods: ```init_weights```, initializing weights. + - Methods: ```forward```, forward function. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channels in input feature. + loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss'). + ls_eps (float): label smoothing epsilon. Default: 0. . + + """ + def __init__( + self, + num_classes, + in_channels, + loss_cfg=dict( + name="CrossEntropyLoss" + ), #TODO(shipping): only pass a name or standard build cfg format. + #multi_class=False, NOTE(shipping): not supported now. + ls_eps=0.): + + super().__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.loss_func = build_loss(loss_cfg) + #self.multi_class = multi_class NOTE(shipping): not supported now + self.ls_eps = ls_eps + + @abstractmethod + def init_weights(self): + """Initiate the parameters. + """ + raise NotImplementedError + + @abstractmethod + def forward(self, x): + """Define how the head is going to run. + """ + raise NotImplementedError + + def loss(self, scores, labels, valid_mode=False, **kwargs): + """Calculate the loss accroding to the model output ```scores```, + and the target ```labels```. + + Args: + scores (paddle.Tensor): The output of the model. + labels (paddle.Tensor): The target output of the model. + + Returns: + losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional). + + """ + if len(labels) == 1: #commonly case + labels = labels[0] + losses = dict() + if self.ls_eps != 0. 
and not valid_mode: # label_smooth + loss = self.label_smooth_loss(scores, labels, **kwargs) + else: + loss = self.loss_func(scores, labels, **kwargs) + + top1, top5 = self.get_acc(scores, labels, valid_mode) + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + return losses + elif len(labels) == 3: # mix_up + labels_a, labels_b, lam = labels + lam = lam[0] # get lam value + losses = dict() + + if self.ls_eps != 0: + loss_a = self.label_smooth_loss(scores, labels_a, **kwargs) + loss_b = self.label_smooth_loss(scores, labels_b, **kwargs) + else: + loss_a = self.loss_func(scores, labels_a, **kwargs) + loss_b = self.loss_func(scores, labels_a, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + top1a, top5a = self.get_acc(scores, labels_a, valid_mode) + top1b, top5b = self.get_acc(scores, labels_b, valid_mode) + top1 = lam * top1a + (1 - lam) * top1b + top5 = lam * top5a + (1 - lam) * top5b + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + return losses + else: + raise NotImplementedError + + def label_smooth_loss(self, scores, labels, **kwargs): + """label smooth loss""" + labels = F.one_hot(labels, self.num_classes) + labels = F.label_smooth(labels, epsilon=self.ls_eps) + labels = paddle.squeeze(labels, axis=1) + loss = self.loss_func(scores, labels, soft_label=True, **kwargs) + return loss + + def get_acc(self, scores, labels, valid_mode): + """get acc""" + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + top5 = paddle.metric.accuracy(input=scores, label=labels, k=5) + _, world_size = get_dist_info() + #NOTE(shipping): deal with multi cards validate + if world_size > 1 and valid_mode: #reduce sum when valid + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / world_size + top5 = paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) / world_size + + return top1, top5 diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py new file mode 100644 index 000000000..ae08693b5 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsm_rec_head.py @@ -0,0 +1,153 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import math +import paddle +import paddle.nn.functional as F +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout +from .base import BaseHead +from .tsn_head import TSNHead +from ..registry import HEADS + +from ..weight_init import weight_init_ + + +@HEADS.register() +class TSMRecHead(TSNHead): + """ TSM Rec Head + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.8. + std(float): Std(Scale) value in normal initilizar. 
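# Illustrative sketch (l1 is a hypothetical placeholder for self.loss_func,
# scalars instead of tensors): for mix-up batches the head receives
# (labels_a, labels_b, lam), and the intended combination is
# lam * loss(scores, labels_a) + (1 - lam) * loss(scores, labels_b);
# top-1/top-5 accuracies are blended with the same weights.

def l1(pred, target):
    return abs(pred - target)

lam = 0.3
scores, labels_a, labels_b = 2.0, 1.0, 4.0
loss = lam * l1(scores, labels_a) + (1 - lam) * l1(scores, labels_b)
print(loss)    # 0.3 * 1 + 0.7 * 2 = 1.7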
Default: 0.001. + kwargs (dict, optional): Any keyword argument to initialize. + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='L1Loss'), + drop_ratio=0.8, + std=0.01, + data_format="NCHW", + **kwargs): + + super().__init__(num_classes, + in_channels, + loss_cfg, + drop_ratio=drop_ratio, + std=std, + data_format=data_format, + **kwargs) + + self.stdv = 1.0 / math.sqrt(self.in_channels * 1.0) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'Uniform', + 'fc_0.w_0', + 'fc_0.b_0', + low=-self.stdv, + high=self.stdv) + self.fc.bias.learning_rate = 2.0 + self.fc.bias.regularizer = paddle.regularizer.L2Decay(0.) + + def forward(self, x, num_seg): + """Define how the head is going to run. + + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_seg, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, 1, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #m = paddle.nn.Sigmoid() + #score = m(score) + return score + + def loss(self, scores, labels, valid_mode=False, **kwargs): + """Calculate the loss accroding to the model output ```scores```, + and the target ```labels```. + + Args: + scores (paddle.Tensor): The output of the model. + labels (paddle.Tensor): The target output of the model. + + Returns: + losses (dict): A dict containing field 'loss'(mandatory). 
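# Illustrative sketch (numpy in place of the Paddle ops, toy shapes): the
# regression head mirrors TSNHead's forward pass, global-average-pool each
# frame's feature map, average over the num_seg frames of a clip, then a single
# FC layer maps in_channels to num_classes (1 for a quality score):

import numpy as np

N, num_seg, C = 2, 8, 2048
x = np.random.rand(N * num_seg, C, 7, 7).astype(np.float32)     # backbone output
x = x.mean(axis=(2, 3))                         # AdaptiveAvgPool2D((1, 1)) -> (N * num_seg, C)
x = x.reshape(N, num_seg, C).mean(axis=1)       # consensus over segments -> (N, C)
score = x @ np.zeros((C, 1), dtype=np.float32)  # Linear(in_channels, 1) stand-in
assert score.shape == (N, 1)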
+ + """ + if len(labels) == 1: #commonly case + output = [] + label = [] + labels = labels[0] + losses = dict() + loss = self.loss_func(scores, labels, **kwargs) + + score_list = paddle.tolist(scores) + label_list = paddle.tolist(labels) + score_list_len = len(score_list) + for i in range(score_list_len): + output.append(score_list[i][0]) + label.append(label_list[i][0]) + losses['loss'] = loss + losses['output'] = output + losses['label'] = label + return losses + elif len(labels) == 3: + labels_a, labels_b, lam = labels + labels_a = paddle.cast(labels_a, dtype='float32') + labels_b = paddle.cast(labels_b, dtype='float32') + lam = lam[0] # get lam value + losses = dict() + + if self.ls_eps != 0: + loss_a = self.label_smooth_loss(scores, labels_a, **kwargs) + loss_b = self.label_smooth_loss(scores, labels_b, **kwargs) + else: + loss_a = self.loss_func(scores, labels_a, **kwargs) + loss_b = self.loss_func(scores, labels_a, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + + losses['loss'] = loss + losses['output'] = output + losses['label'] = label + return losses + else: + raise NotImplementedError + + def label_smooth_loss(self, scores, labels, **kwargs): + """label smooth loss""" + labels = F.label_smooth(labels, epsilon=self.ls_eps) + labels = paddle.squeeze(labels, axis=1) + loss = self.loss_func(scores, labels, **kwargs) + return loss diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py new file mode 100644 index 000000000..7ca1a43dc --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/heads/tsn_head.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ +import paddle.nn.functional as F + + +@HEADS.register() +class TSNHead(BaseHead): + """TSN Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.4. + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. 
+ + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + drop_ratio=0.4, + std=0.01, + data_format="NCHW", + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.drop_ratio = drop_ratio + self.std = std + + #NOTE: global pool performance + self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format) + + if self.drop_ratio != 0: + self.dropout = Dropout(p=self.drop_ratio) + else: + self.dropout = None + + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'Normal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0., + std=self.std) + + def forward(self, x, num_seg): + """Define how the head is going to run. + + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + #XXX: check dropout location! + + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_seg, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, 1, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #score = F.softmax(score) #NOTE remove + return score diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py new file mode 100644 index 000000000..ba45f3c65 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/__init__.py @@ -0,0 +1,21 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .base import BaseWeightedLoss +from .smooth_l1_loss import SmoothL1Loss +from .l1_loss import L1Loss + +__all__ = ['SmoothL1Loss', 'L1Loss'] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py new file mode 100644 index 000000000..b34ac4422 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/base.py @@ -0,0 +1,51 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from abc import abstractmethod +import paddle +import paddle.nn as nn + +#XXX use _forward?? or forward?? +class BaseWeightedLoss(nn.Layer): + """Base class for loss. + + All subclass should overwrite the ``_forward()`` method which returns the + normal loss without loss weights. + + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Default: 1.0. + """ + + def __init__(self, loss_weight=1.0): + super().__init__() + self.loss_weight = loss_weight + + @abstractmethod + def _forward(self, *args, **kwargs): + pass + + def forward(self, *args, **kwargs): + """Defines the computation performed at every call. + Args: + *args: The positional arguments for the corresponding + loss. + **kwargs: The keyword arguments for the corresponding + loss. + Returns: + paddle.Tensor: The calculated loss. + """ + return self._forward(*args, **kwargs) * self.loss_weight diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py new file mode 100644 index 000000000..61272d2d0 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/l1_loss.py @@ -0,0 +1,38 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class L1Loss(BaseWeightedLoss): + """L1 Loss.""" + def _forward(self, score, labels): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. + Returns: + loss (paddle.Tensor): The returned L1 loss. + """ + + labels = labels.astype(score.dtype) + loss = F.l1_loss(score, labels) + return loss diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py new file mode 100644 index 000000000..5ded6e7e3 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/losses/smooth_l1_loss.py @@ -0,0 +1,39 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
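# --- Illustrative sketch (editor's example, not part of the original patch) ---
# How the BaseWeightedLoss/_forward pattern above is meant to be extended. The MSELoss
# subclass and the loss_weight value are assumptions for demonstration only (the patch
# itself registers L1Loss and SmoothL1Loss), and the import assumes the application's
# paddlevideo package is on PYTHONPATH.
import paddle
import paddle.nn.functional as F

from paddlevideo.modeling.losses import BaseWeightedLoss


class MSELoss(BaseWeightedLoss):
    """Toy subclass: only _forward is overridden, the weighting stays in the base class."""
    def _forward(self, score, labels):
        labels = labels.astype(score.dtype)
        return F.mse_loss(score, labels)


loss_fn = MSELoss(loss_weight=0.5)             # forward() multiplies _forward() by 0.5
score = paddle.to_tensor([[0.8], [0.3]])
labels = paddle.to_tensor([[1.0], [0.0]])
print(float(loss_fn(score, labels)))           # 0.5 * mean((score - labels) ** 2)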
+""" + +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class SmoothL1Loss(BaseWeightedLoss): + """smooth L1 Loss.""" + def _forward(self, score, labels): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. + Returns: + loss (paddle.Tensor): The returned smooth L1 Loss. + """ + + labels = labels.astype(score.dtype) + loss = F.smooth_l1_loss(score, labels) + + return loss diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py new file mode 100644 index 000000000..8dcd6a897 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/registry.py @@ -0,0 +1,23 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from ..utils import Registry + +BACKBONES = Registry('backbone') +HEADS = Registry('head') +RECOGNIZERS = Registry('recognizer') +LOCALIZERS = Registry('localizer') +LOSSES = Registry('loss') diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py new file mode 100644 index 000000000..ae3a670bb --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/modeling/weight_init.py @@ -0,0 +1,55 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import numpy as np +import paddle.nn.initializer as init + + +def weight_init_(layer, + func, + weight_name=None, + bias_name=None, + bias_value=0.0, + **kwargs): + """ + In-place params init function. + Usage: + .. 
code-block:: python + + import paddle + import numpy as np + + data = np.ones([3, 4], dtype='float32') + linear = paddle.nn.Linear(4, 4) + input = paddle.to_tensor(data) + print(linear.weight) + linear(input) + + weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1) + print(linear.weight) + """ + + if hasattr(layer, 'weight') and layer.weight is not None: + getattr(init, func)(**kwargs)(layer.weight) + if weight_name is not None: + # override weight name + layer.weight.name = weight_name + + if hasattr(layer, 'bias') and layer.bias is not None: + init.Constant(bias_value)(layer.bias) + if bias_name is not None: + # override bias name + layer.bias.name = bias_name diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py new file mode 100644 index 000000000..ed48d380c --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/__init__.py @@ -0,0 +1,17 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +from .optimizer import build_optimizer +from .lr import build_lr diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py new file mode 100644 index 000000000..93afe9703 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/custom_lr.py @@ -0,0 +1,201 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import math +from paddle.optimizer.lr import * + +""" +PaddleVideo Learning Rate Schedule: +You can use paddle.optimizer.lr +or define your custom_lr in this file. +""" + + +class CustomWarmupCosineDecay(LRScheduler): + """ + We combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + cosine_base_lr (float|int, optional): base learning rate in cosine schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . 
+ Returns: + ``CosineAnnealingDecay`` instance to schedule learning rate. + """ + def __init__(self, + warmup_start_lr, + warmup_epochs, + cosine_base_lr, + max_epoch, + num_iters, + last_epoch=-1, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.cosine_base_lr = cosine_base_lr + self.max_epoch = max_epoch + self.num_iters = num_iters + #call step() in base class, last_lr/last_epoch/base_lr will be update + super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch, + verbose=verbose) + + def step(self, epoch=None): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if self.last_epoch == -1: + self.last_epoch += 1 + else: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch): + """start to cosine""" + return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) + + 1.0) * 0.5 + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr, + self.max_epoch) + lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr, + self.max_epoch) + + # Perform warm up. + if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + return lr + + +class CustomWarmupPiecewiseDecay(LRScheduler): + """ + This op combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + step_base_lr (float|int, optional): base learning rate in step schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate. + """ + def __init__(self, + warmup_start_lr, + warmup_epochs, + step_base_lr, + lrs, + gamma, + steps, + max_epoch, + num_iters, + last_epoch=0, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.step_base_lr = step_base_lr + self.lrs = lrs + self.gamma = gamma + self.steps = steps + self.max_epoch = max_epoch + self.num_iters = num_iters + self.last_epoch = last_epoch + self.last_lr = self.warmup_start_lr # used in first iter + self.verbose = verbose + self._var_name = None + + def step(self, epoch=None, rebuild=False): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. 
+ Returns: + None + """ + if epoch is None: + if not rebuild: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps, + max_epoch): + """lr func steps with relative lrs""" + # get step index + steps = steps + [max_epoch] + for ind, step in enumerate(steps): + if cur_epoch < step: + break + + return lrs[ind - 1] * base_lr + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_steps_with_relative_lrs( + self.last_epoch, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + lr_end = self._lr_func_steps_with_relative_lrs( + self.warmup_epochs, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + + # Perform warm up. + if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + return lr + + +class CustomPiecewiseDecay(PiecewiseDecay): + """CustomPiecewiseDecay""" + def __init__(self, **kargs): + """start""" + kargs.pop('num_iters') + super().__init__(**kargs) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/lr.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/lr.py new file mode 100644 index 000000000..62c2c4625 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/lr.py @@ -0,0 +1,49 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import copy +import paddle +from . import custom_lr + + +def build_lr(cfg, num_iters): + """ + Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer. + In configuration: + learning_rate: + name: 'PiecewiseDecay' + boundaries: [20, 60] + values: [0.00025, 0.000025, 0.0000025] + + + Returns: + A paddle.optimizer.lr instance. + """ + + cfg_copy = cfg.copy() + + #when learning_rate is LRScheduler + if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'], + dict): + cfg_copy['learning_rate'] = build_lr( + cfg_copy['learning_rate'], + num_iters) #not support only inner iter_step + + lr_name = cfg_copy.pop('name') + if cfg_copy.get('iter_step'): + cfg_copy['num_iters'] = num_iters + cfg_copy.pop('iter_step') + + return getattr(custom_lr, lr_name)(**cfg_copy) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py new file mode 100644 index 000000000..9feaf777c --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/solver/optimizer.py @@ -0,0 +1,79 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
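# --- Illustrative sketch (editor's example, not part of the original patch) ---
# How build_lr above turns an OPTIMIZER.learning_rate config into a per-iteration
# scheduler. The config values and num_iters are assumptions chosen only to show the
# warmup-then-cosine behaviour; the import assumes the application's paddlevideo
# package is on PYTHONPATH.
from paddlevideo.solver import build_lr

lr_cfg = dict(name='CustomWarmupCosineDecay',
              iter_step=True,           # build_lr pops this and injects num_iters
              warmup_start_lr=0.0005,
              warmup_epochs=5,
              cosine_base_lr=0.01,
              max_epoch=30)
lr = build_lr(lr_cfg, num_iters=100)    # num_iters is usually len(train_loader)

for epoch in range(30):
    for _ in range(100):
        lr.step()                       # advances last_epoch by 1 / num_iters
    print(epoch, lr.get_lr())           # linear warmup for 5 epochs, then cosine decay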
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import copy
+import paddle
+
+
+def build_optimizer(cfg, lr_scheduler, parameter_list=None):
+    """
+    Build an optimizer and a learning rate scheduler to optimize the parameters according to the ```OPTIMIZER``` field in the configuration.
+
+    In configuration:
+    OPTIMIZER:
+        name: Momentum
+        momentum: 0.9
+        weight_decay: 0.001
+
+    or
+
+    OPTIMIZER:
+        name: Momentum
+        momentum: 0.9
+        weight_decay:
+            name: "L1"
+            value: 0.001
+
+    A Momentum optimizer will be applied to optimize the network and an L1Decay regularizer will be applied to avoid overfitting.
+
+    OPTIMIZER:
+        name: Adam
+        weight_decay:
+            name: "L2"
+            value: 0.001
+
+    An Adam optimizer will be applied to optimize the network and an L2Decay regularizer will be applied to avoid overfitting.
+
+    Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details.
+
+    Args:
+        cfg (dict): optimizer configuration.
+        lr_scheduler: learning rate scheduler.
+        parameter_list (list): parameters to be optimized.
+
+    Returns:
+        optimizer (paddle.optimizer): paddle optimizer.
+
+    """
+    cfg_copy = cfg.copy()
+    #XXX check none and illegal cfg!!!
+    opt_name = cfg_copy.pop('name')
+    # deal with weight decay
+    weight_decay = cfg_copy.get('weight_decay')
+    if weight_decay:
+        if isinstance(weight_decay, float):
+            # a plain float is already a valid decay coefficient, keep it as-is
+            pass
+        elif 'L1' in weight_decay.get('name').upper():
+            cfg_copy['weight_decay'] = weight_decay.get('value')
+        elif 'L2' in weight_decay.get('name').upper():
+            cfg_copy['weight_decay'] = paddle.regularizer.L2Decay(
+                weight_decay.get('value'))
+        else:
+            raise ValueError(
+                "Unsupported weight_decay config: {}".format(weight_decay))
+
+    cfg_copy.pop('learning_rate')
+
+    return getattr(paddle.optimizer, opt_name)(lr_scheduler,
+                                               parameters=parameter_list,
+                                               **cfg_copy)
diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py
new file mode 100644
index 000000000..53aa1f9b8
--- /dev/null
+++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/__init__.py
@@ -0,0 +1,20 @@
+"""
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
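# --- Illustrative sketch (editor's example, not part of the original patch) ---
# Building a Momentum optimizer with L2 weight decay from a config dict, as
# build_optimizer above expects. The toy model and numeric values are assumptions for
# demonstration; the import assumes the application's paddlevideo package is on
# PYTHONPATH.
import paddle

from paddlevideo.solver import build_optimizer

OPTIMIZER = dict(name='Momentum',
                 momentum=0.9,
                 learning_rate=0.01,    # popped: the rate comes from the scheduler below
                 weight_decay=dict(name='L2', value=1e-4))

model = paddle.nn.Linear(16, 1)
lr = paddle.optimizer.lr.PiecewiseDecay(boundaries=[20, 60],
                                        values=[0.00025, 0.000025, 0.0000025])
optimizer = build_optimizer(OPTIMIZER, lr, parameter_list=model.parameters())
print(type(optimizer).__name__)         # Momentum, with an L2Decay regularizer attached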
+""" + +from .train import train_model +from .test import test_model + +__all__ = ['train_model', 'test_model'] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/test.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/test.py new file mode 100644 index 000000000..df44c2580 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/test.py @@ -0,0 +1,80 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import paddle +from paddlevideo.utils import get_logger +from ..loader.builder import build_dataloader, build_dataset +from ..metrics import build_metric +from ..modeling.builder import build_model +from paddlevideo.utils import load +import time + + +logger = get_logger("paddlevideo") + + +@paddle.no_grad() +def test_model(cfg, weights, parallel=True): + """Test model entry + + Args: + cfg (dict): configuration. + weights (str): weights path to load. + parallel (bool): Whether to do multi-cards testing. Default: True. + + """ + + # 1. Construct model. + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dataset and dataloader. + cfg.DATASET.test.test_mode = True + dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test)) + batch_size = cfg.DATASET.get("test_batch_size", 1) + places = paddle.set_device('gpu') + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + dataloader_setting = dict(batch_size=batch_size, + num_workers=num_workers, + places=places, + drop_last=False, + shuffle=False) + + data_loader = build_dataloader(dataset, **dataloader_setting) + + model.eval() + + state_dicts = load(weights) + model.set_state_dict(state_dicts) + + # add params to metrics + cfg.METRIC.data_size = len(dataset) + cfg.METRIC.batch_size = batch_size + + Metric = build_metric(cfg.METRIC) + + for batch_id, data in enumerate(data_loader): + if parallel: + outputs = model._layers.test_step(data) + else: + outputs = model.test_step(data) + Metric.update(batch_id, data, outputs) + + Metric.accumulate() + + diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py new file mode 100644 index 000000000..0c7a4079e --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/tasks/train.py @@ -0,0 +1,295 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import time +import os.path as osp + +import paddle +import paddle.distributed.fleet as fleet +from ..loader.builder import build_dataloader, build_dataset +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..metrics import build_metric +from ..utils import do_preciseBN +from paddlevideo.utils import get_logger, coloring +from paddlevideo.utils import (AverageMeter, build_rec_record, log_batch, + log_epoch, save, load, mkdir) +#from paddlevideo.metrics import QualityMetric +import numpy as np +from scipy import stats + + +def train_model(cfg, + weights=None, + parallel=True, + validate=True, + amp=False, + fleet=False): + """Train model entry + + Args: + cfg (dict): configuration. + weights (str): weights path for finetuning. + parallel (bool): Whether multi-cards training. Default: True. + validate (bool): Whether to do evaluation. Default: False. + + """ + if fleet: + fleet.init(is_collective=True) + + logger = get_logger("paddlevideo") + batch_size = cfg.DATASET.get('batch_size', 8) + valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size) + places = paddle.set_device('gpu') + + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + model_name = cfg.model_name + output_dir = cfg.get("output_dir", "./output/model_name/") + mkdir(output_dir) + + # 1. Construct model + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + if fleet: + model = paddle.distributed_model(model) + + # 2. Construct dataset and dataloader + train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train)) + train_dataloader_setting = dict(batch_size=batch_size, + num_workers=num_workers, + collate_fn_cfg=cfg.get('MIX', None), + places=places) + + train_loader = build_dataloader(train_dataset, **train_dataloader_setting) + if validate: + valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid)) + validate_dataloader_setting = dict( + batch_size=valid_batch_size, + num_workers=num_workers, + places=places, + drop_last=False, + shuffle=cfg.DATASET.get( + 'shuffle_valid', + False) #NOTE: attention lstm need shuffle valid data. + ) + valid_loader = build_dataloader(valid_dataset, + **validate_dataloader_setting) + + # 3. Construct solver. + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer(cfg.OPTIMIZER, + lr, + parameter_list=model.parameters()) + if fleet: + optimizer = fleet.distributed_optimizer(optimizer) + # Resume + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join(output_dir, + model_name + "_epoch_{}".format(resume_epoch)) + resume_model_dict = load(filename + '.pdparams') + resume_opt_dict = load(filename + '.pdopt') + model.set_state_dict(resume_model_dict) + optimizer.set_state_dict(resume_opt_dict) + + # Finetune: + if weights: + assert resume_epoch == 0, "Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it." + model_dict = load(weights) + model.set_state_dict(model_dict) + + # 4. Train Model + ###AMP### + if amp: + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + best = 0. 
+ max_SROCC = 0 + max_PLCC = 0 + Metric = build_metric(cfg.METRIC) + for epoch in range(0, cfg.epochs): + if epoch < resume_epoch: + logger.info( + "| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... " + ) + continue + model.train() + record_list = build_rec_record(cfg.MODEL) + tic = time.time() + train_output = [] + train_label = [] + + for i, data in enumerate(train_loader): + record_list['reader_time'].update(time.time() - tic) + + # 4.1 forward + ###AMP### + if amp: + with paddle.amp.auto_cast( + custom_black_list={"temporal_shift", "reduce_mean"}): + if parallel: + outputs = model._layers.train_step(data) + ## required for DataParallel, will remove in next version + model._reducer.prepare_for_backward( + list(model._find_varbase(outputs))) + else: + outputs = model.train_step(data) + + train_output.extend(outputs['output']) + train_label.extend(outputs['label']) + + avg_loss = outputs['loss'] + scaled = scaler.scale(avg_loss) + scaled.backward() + # keep prior to 2.0 design + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + + else: + if parallel: + outputs = model._layers.train_step(data) + ## required for DataParallel, will remove in next version + model._reducer.prepare_for_backward( + list(model._find_varbase(outputs))) + else: + outputs = model.train_step(data) + + train_output.extend(outputs['output']) + train_label.extend(outputs['label']) + # 4.2 backward + avg_loss = outputs['loss'] + avg_loss.backward() + # 4.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + record_list['lr'].update(optimizer._global_learning_rate(), + batch_size) + for name, value in outputs.items(): + if name == 'output' or name == 'label': + continue + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + train_PLCC, train_SROCC = Metric.accumulate_train( + train_output, train_label) + logger.info("train_SROCC={}".format(train_SROCC)) + logger.info("train_PLCC={}".format(train_PLCC)) + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + eval_output = [] + eval_label = [] + + def evaluate(best, max_SROCC, max_PLCC): + """evaluate""" + model.eval() + record_list = build_rec_record(cfg.MODEL) + record_list.pop('lr') + tic = time.time() + + for i, data in enumerate(valid_loader): + + if parallel: + outputs = model._layers.val_step(data) + else: + outputs = model.val_step(data) + eval_output.extend(outputs['output']) + eval_label.extend(outputs['label']) + + # log_record + for name, value in outputs.items(): + if name == 'output' or name == 'label': + continue + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "val", ips) + + eval_PLCC, eval_SROCC = Metric.accumulate_train( + eval_output, eval_label) + 
logger.info("val_SROCC={}".format(eval_SROCC)) + logger.info("val_PLCC={}".format(eval_PLCC)) + + if max_SROCC <= eval_SROCC and max_PLCC <= eval_PLCC: + max_SROCC = eval_SROCC + max_PLCC = eval_PLCC + logger.info("max_SROCC={}".format(max_SROCC)) + logger.info("max_PLCC={}".format(max_PLCC)) + save(optimizer.state_dict(), + osp.join(output_dir, model_name + "_best.pdopt")) + save(model.state_dict(), + osp.join(output_dir, model_name + "_best.pdparams")) + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "val", ips) + + return best, max_SROCC, max_PLCC + + # use precise bn to improve acc + if cfg.get("PRECISEBN") and (epoch % cfg.PRECISEBN.preciseBN_interval + == 0 or epoch == cfg.epochs - 1): + do_preciseBN( + model, train_loader, parallel, + min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader))) + + # 5. Validation + if validate and (epoch % cfg.get("val_interval", 1) == 0 + or epoch == cfg.epochs - 1): + with paddle.no_grad(): + best, max_SROCC, max_PLCC = evaluate(best, max_SROCC, max_PLCC) + + # 6. Save model + if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1: + save( + optimizer.state_dict(), + osp.join(output_dir, + model_name + "_epoch_{}.pdopt".format(epoch))) + save( + model.state_dict(), + osp.join(output_dir, + model_name + "_epoch_{}.pdparams".format(epoch))) + + logger.info('training {model_name} finished') diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py new file mode 100644 index 000000000..fe58285e0 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/__init__.py @@ -0,0 +1,25 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .registry import Registry +from .build_utils import build +from .config import * +from .logger import setup_logger, coloring, get_logger +from .record import AverageMeter, build_record, build_rec_record, log_batch, log_epoch +from .dist_utils import get_dist_info, main_only +from .save_load import save, load, load_ckpt, mkdir +from .precise_bn import do_preciseBN +__all__ = ['Registry', 'build'] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py new file mode 100644 index 000000000..c8ed1cbe4 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/build_utils.py @@ -0,0 +1,36 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +def build(cfg, registry, key='name'): + """Build a module from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key. + registry (XXX): The registry to search the type from. + key (str): the key. + Returns: + obj: The constructed object. + """ + + assert isinstance(cfg, dict) and key in cfg + + cfg_copy = cfg.copy() + obj_type = cfg_copy.pop(key) + + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError('{} is not in the {} registry'.format( + obj_type, registry.name)) + return obj_cls(**cfg_copy) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py new file mode 100644 index 000000000..b98bb447c --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/config.py @@ -0,0 +1,180 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +import yaml +from paddlevideo.utils.logger import coloring, get_logger, setup_logger + +__all__ = ['get_config'] + +logger = setup_logger("./", name="paddlevideo", level="INFO") + + +class AttrDict(dict): + """Attr Dict""" + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def create_attr_dict(yaml_config): + """create attr dict""" + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + placeholder = "-" * 60 + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", coloring(k, + "HEADER"))) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", + coloring(str(k), "HEADER"))) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", + coloring(k, "HEADER"), + coloring(v, "OKGREEN"))) + + if k.isupper(): + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + print_dict(config) + + +def check_config(config): + """ + Check config + """ + pass + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + def str2num(v): + """str2num""" + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + logger.warning('A new filed ({}, {}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + assert ks[0] in dl, ( + '({}) doesn\'t exist in {}, a new dict field is invalid'.format( + ks[0], dl)) + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + epochs=20', + 'PIPELINE.train.transform.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert isinstance(opt, + str), ("option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + + return config + + +def get_config(fname, overrides=None, show=True): + """ + Read config from file + """ + assert os.path.exists(fname), ('config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + check_config(config) + return config diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py new file mode 100644 index 000000000..ebfdba653 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/dist_utils.py @@ -0,0 +1,36 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import functools + +import paddle +import paddle.distributed as dist + +def get_dist_info(): + """get_dist_info""" + world_size = dist.get_world_size() + rank = dist.get_rank() + return rank, world_size + +def main_only(func): + """main_only""" + @functools.wraps(func) + def wrapper(*args, **kwargs): + """wrapper""" + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + return wrapper diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py new file mode 100644 index 000000000..f4f6116c8 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/logger.py @@ -0,0 +1,117 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import logging +import os +import sys +import datetime + +from paddle.distributed import ParallelEnv + + + +Color = { + 'RED': '\033[31m', + 'HEADER': '\033[35m', # deep purple + 'PURPLE': '\033[95m', # purple + 'OKBLUE': '\033[94m', + 'OKGREEN': '\033[92m', + 'WARNING': '\033[93m', + 'FAIL': '\033[91m', + 'ENDC': '\033[0m' +} + + +def coloring(message, color="OKGREEN"): + """coloring""" + assert color in Color.keys() + if os.environ.get('COLORING', True): + return Color[color] + str(message) + Color["ENDC"] + else: + return message + + +logger_initialized = [] + + +def setup_logger(output=None, name="paddlevideo", level="INFO"): + """ + Initialize the paddlevideo logger and set its verbosity level to "INFO". + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. 
+ name (str): the root module name of this logger + Returns: + logging.Logger: a logger + """ + def time_zone(sec, fmt): + real_time = datetime.datetime.now() + return real_time.timetuple() + logging.Formatter.converter = time_zone + + logger = logging.getLogger(name) + if level == "INFO": + logger.setLevel(logging.INFO) + elif level=="DEBUG": + logger.setLevel(logging.DEBUG) + logger.propagate = False + + if level == "DEBUG": + plain_formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + else: + plain_formatter = logging.Formatter( + "[%(asctime)s] %(message)s", + datefmt="%m/%d %H:%M:%S") + # stdout logging: master only + local_rank = ParallelEnv().local_rank + if local_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = plain_formatter + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, ".log.txt") + if local_rank > 0: + filename = filename + ".rank{}".format(local_rank) + + # PathManager.mkdirs(os.path.dirname(filename)) + os.makedirs(os.path.dirname(filename), exist_ok=True) + + # fh = logging.StreamHandler(_cached_log_stream(filename) + fh = logging.FileHandler(filename, mode='a') + fh.setLevel(logging.DEBUG) + fh.setFormatter(plain_formatter) + logger.addHandler(fh) + logger_initialized.append(name) + return logger + + +def get_logger(name, output=None): + """get logger""" + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + return setup_logger(name=name, output=name) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py new file mode 100644 index 000000000..3f80517f8 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/precise_bn.py @@ -0,0 +1,84 @@ +""" +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import paddle +import itertools + +from paddlevideo.utils import get_logger +logger = get_logger("paddlevideo") +""" +Implement precise bn, which is useful for improving accuracy. +""" + + +def do_preciseBN(model, data_loader, parallel, num_iters=200): + """ + Recompute and update the batch norm stats to make them more precise. During + training both BN stats and the weight are changing after every iteration, so + the running average can not precisely reflect the actual stats of the + current model. + In this function, the BN stats are recomputed with fixed weights, to make + the running average more precise. Specifically, it computes the true average + of per-batch mean/variance instead of the running average. + This is useful to improve validation accuracy. + Args: + model: the model whose bn stats will be recomputed + data_loader: an iterator. 
Produce data as input to the model + num_iters: number of iterations to compute the stats. + Return: + the model with precise mean and variance in bn layers. + """ + bn_layers_list = [ + m for m in model.sublayers() + if any((isinstance(m, bn_type) + for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D, + paddle.nn.BatchNorm3D))) and m.training + ] + if len(bn_layers_list) == 0: + return + + # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum) + # we set momentum=0. to get the true mean and variance during forward + momentum_actual = [bn._momentum for bn in bn_layers_list] + for bn in bn_layers_list: + bn._momentum = 0. + + running_mean = [paddle.zeros_like(bn._mean) + for bn in bn_layers_list] #pre-ignore + running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list] + + ind = -1 + for ind, data in enumerate(itertools.islice(data_loader, num_iters)): + logger.info("doing precise BN {} / {}...".format(ind + 1, num_iters)) + if parallel: + model._layers.train_step(data) + else: + model.train_step(data) + + for i, bn in enumerate(bn_layers_list): + # Accumulates the bn stats. + running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1) + running_var[i] += (bn._variance - running_var[i]) / (ind + 1) + + assert ind == num_iters - 1, ( + "update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations." + .format(num_iters, ind)) + + # Sets the precise bn stats. + for i, bn in enumerate(bn_layers_list): + bn._mean.set_value(running_mean[i]) + bn._variance.set_value(running_var[i]) + bn._momentum = momentum_actual[i] diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py new file mode 100644 index 000000000..64e9c4448 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/record.py @@ -0,0 +1,122 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
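# --- Illustrative sketch (editor's example, not part of the original patch) ---
# The cumulative-mean update used by do_preciseBN above: after k batches the running
# value equals the plain average of the per-batch statistics, which is what replaces
# the momentum-based BN estimates. The per-batch means below are made-up numbers.
batch_means = [0.2, 0.4, 0.9, 0.5]

running_mean = 0.0
for ind, m in enumerate(batch_means):
    running_mean += (m - running_mean) / (ind + 1)   # same update as in do_preciseBN

print(running_mean)        # 0.5 == sum(batch_means) / len(batch_means)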
+ +import paddle +from collections import OrderedDict +from .logger import get_logger, coloring +logger = get_logger("paddlevideo") + +__all__ = ['AverageMeter', 'build_record', 'build_rec_record', 'log_batch', 'log_epoch'] + + +def build_record(cfg): + framework_type = cfg.get('framework') + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("lr", AverageMeter('lr', 'f', need_avg=False)), + ] + if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework + record_list.append(("hit_at_one", AverageMeter("hit_at_one", '.5f'))) + record_list.append(("perr", AverageMeter("perr", '.5f'))) + record_list.append(("gap", AverageMeter("gap", '.5f'))) + elif 'Recognizer' in cfg.framework: + record_list.append(("top1", AverageMeter("top1", '.5f'))) + record_list.append(("top5", AverageMeter("top5", '.5f'))) + + record_list.append(("batch_time", AverageMeter('elapse', '.3f'))) + record_list.append(("reader_time", AverageMeter('reader', '.3f'))) + record_list = OrderedDict(record_list) + return record_list + +def build_rec_record(cfg): + """build rec record""" + framework_type = cfg.get('framework') + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("lr", AverageMeter('lr', 'f', need_avg=False)), + ] + if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework + record_list.append(("hit_at_one", AverageMeter("hit_at_one", '.5f'))) + record_list.append(("perr", AverageMeter("perr", '.5f'))) + record_list.append(("gap", AverageMeter("gap", '.5f'))) + + record_list.append(("batch_time", AverageMeter('elapse', '.3f'))) + record_list.append(("reader_time", AverageMeter('reader', '.3f'))) + record_list = OrderedDict(record_list) + return record_list + +class AverageMeter(object): + """ + Computes and stores the average and current value + """ + def __init__(self, name='', fmt='f', need_avg=True): + self.name = name + self.fmt = fmt + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + if isinstance(val, paddle.Tensor): + val = float(val) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self) + + @property + def total_minute(self): + return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60, + self=self) + + @property + def mean(self): + return '{self.name}_avg: {self.avg:{self.fmt}}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}'.format(self=self) + + +def log_batch(metric_list, batch_id, epoch_id, total_epoch, mode, ips): + metric_str = ' '.join([str(m.value) for m in metric_list.values()]) + epoch_str = "epoch:[{:>3d}/{:<3d}]".format(epoch_id, total_epoch) + step_str = "{:s} step:{:<4d}".format(mode, batch_id) + logger.info("{:s} {:s} {:s}s {}".format( + coloring(epoch_str, "HEADER") if batch_id == 0 else epoch_str, + coloring(step_str, "PURPLE"), coloring(metric_str, 'OKGREEN'), ips)) + + +def log_epoch(metric_list, epoch, mode, ips): + metric_avg = ' '.join([str(m.mean) for m in metric_list.values()] + + [metric_list['batch_time'].total]) + + end_epoch_str = "END epoch:{:<3d}".format(epoch) + + logger.info("{:s} {:s} {:s}s {}".format(coloring(end_epoch_str, "RED"), + coloring(mode, "PURPLE"), + coloring(metric_avg, "OKGREEN"), + ips)) diff --git 
a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/registry.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/registry.py new file mode 100644 index 000000000..44c5c98d8 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/registry.py @@ -0,0 +1,98 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +class Registry(object): + """ + The registry that provides name -> object mapping, to support third-party users' custom modules. + + To register an object: + + .. code-block:: python + + BACKBONES = Registry('backbone') + @BACKBONES.register() + class ResNet: + pass + Or: + .. code-block:: python + + BACKBONES = Registry('backbone') + class ResNet: + pass + BACKBONES.register(ResNet) + + Usage: To build a module. + + .. code-block:: python + backbone_name = "ResNet" + b = BACKBONES.get(backbone_name)() + + """ + def __init__(self, name): + """ + Args: + name (str): the name of this registry + """ + self._name = name + self._obj_map = {} + + def __contains__(self, key): + return self._obj_map.get(key) is not None + + def _do_register(self, name, obj): + """do register""" + assert ( + name not in self._obj_map + ), "An object named '{}' was already registered in '{}' registry!".format( + name, self._name) + self._obj_map[name] = obj + + def register(self, obj=None, name=None): + """ + Register the given object under the the name `obj.__name__`. + Can be used as either a decorator or not. See docstring of this class for usage. + """ + if obj is None: + # used as a decorator + def deco(func_or_class, name=name): + if name is None: + name = func_or_class.__name__ + self._do_register(name, func_or_class) + return func_or_class + + return deco + + # used as a function call + if name is None: + name = obj.__name__ + self._do_register(name, obj) + + def get(self, name): + """Get the registry record. + + Args: + name (str): The class name. + + Returns: + ret: The class. + """ + ret = self._obj_map.get(name) + if ret is None: + raise KeyError( + "No object named '{}' found in '{}' registry!".format( + name, self._name)) + + return ret diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py new file mode 100644 index 000000000..523155344 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/utils/save_load.py @@ -0,0 +1,87 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
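# --- Illustrative sketch (editor's example, not part of the original patch) ---
# Registering a class and constructing it from a config dict with the Registry/build
# pair defined in this patch (utils/registry.py and utils/build_utils.py). DummyHead
# and the config values are assumptions for demonstration; the import assumes the
# application's paddlevideo package is on PYTHONPATH.
from paddlevideo.utils import Registry, build

HEADS = Registry('head')


@HEADS.register()
class DummyHead:
    def __init__(self, num_classes, in_channels):
        self.num_classes = num_classes
        self.in_channels = in_channels


cfg = dict(name='DummyHead', num_classes=1, in_channels=2048)
head = build(cfg, HEADS)    # pops 'name', looks it up in the registry, calls DummyHead(**cfg)
print(type(head).__name__, head.num_classes)   # DummyHead 1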
+# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +import os.path as osp +import time + +import pickle +from tqdm import tqdm +import paddle + +from paddlevideo.utils import get_logger +from paddlevideo.utils import main_only + + +#XXX(shipping): maybe need load N times because of different cards have different params. +@main_only +def load_ckpt(model, weight_path): + """ + load_ckpt + """ + #model.set_state_dict(state_dict) + + if not osp.isfile(weight_path): + raise IOError('{weight_path} is not a checkpoint file') + #state_dicts = load(weight_path) + + logger = get_logger("paddlevideo") + state_dicts = paddle.load(weight_path) + tmp = {} + total_len = len(model.state_dict()) + localkeyname = [i for i in state_dicts] + + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + #for item in tqdm(model.state_dict(), total=total_len, position=0): + for i, item in enumerate( + tqdm(model.state_dict(), total=total_len, position=0)): + name = item + desc.set_description('Loading %s' % name) + print("model name is {}, correspoding local name is {}".format( + name, localkeyname[i])) + #tmp[name] = state_dicts[name] + tmp[name] = state_dicts[localkeyname[i]] + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + model.set_state_dict(tmp) + + +def mkdir(dir): + """mkdir""" + if not os.path.exists(dir): + # avoid error when train with multiple gpus + try: + os.makedirs(dir) + except: + pass + + +@main_only +def save(obj, path): + """save""" + paddle.save(obj, path) + + +def load(file_name): + """load""" + if not osp.isfile(file_name): + raise IOError('{file_name} not exist') + return paddle.load(file_name) diff --git a/docs/src/applications/VideoQualityAssessment/paddlevideo/version.py b/docs/src/applications/VideoQualityAssessment/paddlevideo/version.py new file mode 100644 index 000000000..50266f5e0 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/paddlevideo/version.py @@ -0,0 +1,18 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +__all__ = ["paddlevideo_version"] +paddlevideo_version = "0.0.1" diff --git a/docs/src/applications/VideoQualityAssessment/run.sh b/docs/src/applications/VideoQualityAssessment/run.sh new file mode 100644 index 000000000..f7f5c49c5 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/run.sh @@ -0,0 +1,20 @@ +export CUDA_VISIBLE_DEVICES=0 + +# run training +python3.7 -B -m paddle.distributed.launch --gpus="0" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/tsm/pptsm_regression.yaml + +# run testing +#python3.7 -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_pptsm main.py -c configs/recognition/tsm/pptsm_regression.yaml --test --weights=output/model_name/ppTSM_best.pdparams + +#finetune +#python3 -m paddle.distributed.launch --gpus="0,1,2,3" main.py --amp -c ./configs/recognition/tsm/pptsm_regression.yaml --validate --weights=./output/model_name/ppTSM_best.pdparams + +#resume +#python3 -m paddle.distributed.launch --gpus="0,1,2,3" main.py --amp -c ./configs/recognition/tsm/pptsm_regression.yaml --validate -o resume_epoch=2 +# export_models script +# just use `example` as example, please replace to real name. +#python3.7 tools/export_model.py -c configs/example.yaml -p output/model_name/ppTSM_best.pdparams -o ./inference + +# predict script +# just use `example` as example, please replace to real name. +#python3.7 tools/predict.py -v example.avi --model_file "./inference/example.pdmodel" --params_file "./inference/example.pdiparams" --enable_benchmark=False --model="example" --num_seg=8 diff --git a/docs/src/applications/VideoQualityAssessment/save_model.sh b/docs/src/applications/VideoQualityAssessment/save_model.sh new file mode 100644 index 000000000..5cf6fbdff --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/save_model.sh @@ -0,0 +1,5 @@ +python tools/export_model.py \ + -c ./configs/recognition/tsm/pptsm.yaml \ + -p ./output/ppTSM/ppTSM_best.pdparams \ + -o ./inference/ \ + --num_seg=32 diff --git a/docs/src/applications/VideoQualityAssessment/setup.py b/docs/src/applications/VideoQualityAssessment/setup.py new file mode 100644 index 000000000..477013b71 --- /dev/null +++ b/docs/src/applications/VideoQualityAssessment/setup.py @@ -0,0 +1,57 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from setuptools import setup +from io import open + +with open('requirements.txt', encoding="utf-8-sig") as f: + requirements = f.readlines() + +def readme(): + """readme""" + with open('docs/en/whl_en.md', encoding="utf-8-sig") as f: + README = f.read() + return README + + +setup( + name='paddlevideo', #name of .whl file + packages=['ppvideo'], #install package name + package_dir={'ppvideo': ''}, + include_package_data=True, #Accept all data files and directories matched by MANIFEST.in + install_requires=requirements, + entry_points={"console_scripts": ["ppvideo= ppvideo.tools.paddlevideo_clas:main"]}, + version='0.0.1', + license='Apache License 2.0', + description='Awesome Video toolkits based on PaddlePaddle ', + long_description=readme(), + long_description_content_type='text/markdown', + url='https://github.com/PaddlePaddle/PaddleVideo', + download_url='https://github.com/PaddlePaddle/PaddleVideo.git', + keywords=[ + 'A treasure chest for video understanding powered by PaddlePaddle.' + ], + classifiers=[ + 'Intended Audience :: Developers', 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Topic :: Utilities' + ],) diff --git a/docs/src/applications/VideoTag/FineTune.md b/docs/src/applications/VideoTag/FineTune.md new file mode 100644 index 000000000..f0ce9ed48 --- /dev/null +++ b/docs/src/applications/VideoTag/FineTune.md @@ -0,0 +1,206 @@ +# 模型微调指南 + +--- +## 内容 +参考本文档,您可以使用自己的训练数据在VideoTag预训练模型上进行fine-tune,训练出自己的模型。 + +文档内容包括: +- [原理解析](#原理解析) +- [对AttentionLSTM模型进行微调](#对AttentionLSTM模型进行微调) +- [对TSN模型进行微调](#对TSN模型进行微调) +- [扩展内容](#扩展内容) +- [参考论文](#参考论文) + + +## 原理解析 +VideoTag采用两阶段建模方式,由两个模型组成: TSN + AttentionLSTM。 + +Temporal Segment Network (TSN) 是经典的基于2D-CNN的视频分类模型。该模型通过稀疏采样视频帧的方式,在捕获视频时序信息的同时降低了计算量。详细内容请参考论文[Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859) +AttentionLSTM以视频的特征向量作为输入,采用双向长短时记忆网络(LSTM)对所有帧特征进行编码,并增加Attention层,将每个时刻的隐状态输出与自适应权重线性加权得到最终分类向量。详细内容请参考论文[AttentionCluster](https://arxiv.org/abs/1711.09550) + +VideoTag训练时分两个阶段: 第一阶段使用少量视频样本(十万级别)训练大规模视频特征提取模型(TSN);第二阶段使用千万级数据训练预测器(AttentionLSTM)。 + +VideoTag预测时也分两个阶段: 第一阶段以视频文件作为输入,经过去除了全连接层以及损失函数层的TSN网络后得到输出特征向量;第二阶段以TSN网络输出的特征向量作为输入,经过AttentionLSTM后得到最终的分类结果。 + +基于我们的预模型,您可以使用自己的训练数据进行fine-tune: + +- [对AttentionLSTM模型进行微调](#对AttentionLSTM模型进行微调) +- [对TSN模型进行微调](#对TSN模型进行微调) + + +## 对AttentionLSTM模型进行微调 +AttentionLSTM以视频特征作为输入,显存占用少,训练速度较TSN更快,因此推荐优先对AttentionLSTM模型进行微调。输入视频首先经过TSN预训练模型提取特征向量,然后将特征向量作为训练输入数据,微调AttentionLSTM模型。 + +### TSN预模型提取特征向量 + +#### 数据准备 + +- 预训练权重下载: 参考[样例代码运行指南-数据准备-预训练权重下载](./Run.md) + +- 准备训练数据: 准备好待训练的视频数据,并在video\_tag/data/TsnExtractor.list文件中指定待训练的文件路径,内容格式如下: + +``` +my_video_path/my_video_file1.mp4 +my_video_path/my_video_file2.mp4 +... +``` + +#### 特征提取 +特征提取脚本如下: + +``` +python tsn_extractor.py --model_name=TSN --config=./configs/tsn.yaml --weights=./weights/tsn.pdparams +``` + +- 通过--weights可指定TSN权重参数的存储路径,默认为video\_tag/weights/tsn.pdparams + +- 通过--save\_dir可指定特征向量保存路径,默认为video\_tag/data/tsn\_features,不同输入视频的特征向量提取结果分文件保存在不同的npy文件中,目录形式为: + +``` +video_tag + ├──data + ├──tsn_features + ├── my_feature_file1.npy + ├── my_feature_file2.npy + ... 
+``` +- tsn提取的特征向量维度为```帧数*特征维度```,默认为300 * 2048。 + +### AttentionLSTM模型Fine-tune + +#### 数据准备 +VideoTag中的AttentionLSTM以TSN模型提取的特征向量作为输入。在video\_tag/data/dataset/attention\_lstm/train.list文件中指定待训练的文件路径和对应的标签,内容格式如下: + +``` +my_feature_path/my_feature_file1.npy label1 label2 +my_feature_path/my_feature_file2.npy label1 +... +``` +- 一个输入视频可以有多个标签,标签索引为整型数据,文件名与标签之间、多个标签之间以一个空格分隔; + +- 标签索引与标签名称的之间的对应关系以list文件指定,可参考VideoTag用到的label_3396.txt文件构造,行索引对应标签索引; + +- 验证集、测试集以及预测数据集的构造方式同训练集类似,仅需要在video\_tag/data/attention\_lstm/目录下对应的list文件中指定相关文件路径/标签即可。 + +#### 模型训练 +使用VideoTag中的AttentionLSTM预模型进行fine-tune训练脚本如下: +``` +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python train.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --pretrain=./weights/attention_lstm +``` + +- AttentionLSTM模型默认使用8卡训练,总的batch size数是1024。若使用单卡训练,请修改环境变量,脚本如下: +``` +export CUDA_VISIBLE_DEVICES=0 +python train.py --model_name=AttentionLSTM --config=./configs/attention_lstm-single.yaml --pretrain=./weights/attention_lstm +``` + +- 请确保训练样本数大于batch_size数 + +- 通过--pretrain参数可指定AttentionLSTM预训练模型的路径,默认为./weights/attention\_lstm; + +- 模型相关配置写在video_tag/configs/attention\_lstm.yaml文件中,可以方便的调节各项超参数; + +- 通过--save_dir参数可指定训练模型参数的保存路径,默认为./data/checkpoints; + +#### 模型评估 +可用如下方式进行模型评估: +``` +python eval.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --weights=./data/checkpoints/AttentionLSTM_epoch9.pdparams +``` +- 通过--weights参数可指定评估需要的权重,默认为./data/checkpoints/AttentionLSTM_epoch9.pdparams; + +- 评估结果以log的形式直接打印输出GAP、Hit@1等精度指标。 + +#### 模型推断 +可用如下方式进行模型推断: +``` +python predict.py --model_name=AttentionLSTM --config=./configs/attention_lstm.yaml --weights=./data/checkpoints/AttentionLSTM_epoch9.pdparams +``` + +- 通过--weights参数可指定推断需要的权重,默认为./data/checkpoints/AttentionLSTM_epoch9.pdparams; + +- 通过--label_file参数指定标签文件,请根据自己的数据修改,默认为./label_3396.txt; + +- 预测结果会以日志形式打印出来,同时也保存在json文件中,通过--save_dir参数可指定预测结果保存路径,默认为./data/predict_results/attention_lstm。 + + +## 对TSN模型进行微调 +VideoTag中使用的TSN模型以mp4文件为输入,backbone为ResNet101。 + +### 数据准备 + +准备好训练视频文件后,在video\_tag/data/dataset/tsn/train.list文件中指定待训练的文件路径和对应的标签即可,内容格式如下: + +``` +my_video_path/my_video_file1.mp4 label1 +my_video_path/my_video_file2.mp4 label2 +... 
+``` +- 一个输入视频只能有一个标签,标签索引为整型数据,标签索引与文件名之间以一个空格分隔; + +- 验证集、测试集以及预测数据集的构造方式同训练集类似,仅需要在video\_tag/data/dataset/tsn目录下对应的list文件中指定相关文件路径/标签即可。 + +#### 模型训练 +使用VideoTag中的TSN预模型进行fine-tune训练脚本如下: +``` +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python train.py --model_name=TSN --config=./configs/tsn.yaml --pretrain=./weights/tsn +``` + +- TSN模型默认使用8卡训练,总的batch size数是256。若使用单卡训练,请修改环境变量,脚本如下: +``` +export CUDA_VISIBLE_DEVICES=0 +python train.py --model_name=TSN --config=./configs/tsn-single.yaml --pretrain=./weights/tsn +``` + +- 通过--pretrain参数可指定TSN预训练模型的路径,示例为./weights/tsn; + +- 模型相关配置写在video_tag/configs/tsn.yaml文件中,可以方便的调节各项超参数; + +- 通过--save_dir参数可指定训练模型参数的保存路径,默认为./data/checkpoints; + +#### 模型评估 +可用如下方式进行模型评估: +``` +python eval.py --model_name=TSN --config=./configs/tsn.yaml --weights=./data/checkpoints/TSN_epoch44.pdparams +``` + +- 通过--weights参数可指定评估需要的权重,示例为./data/checkpoints/TSN_epoch44.pdparams; + +- 评估结果以log的形式直接打印输出TOP1_ACC、TOP5_ACC等精度指标。 + +#### 模型推断 +可用如下方式进行模型推断: +``` +python predict.py --model_name=TSN --config=./configs/tsn.yaml --weights=./data/checkpoints/TSN_epoch44.pdparams --save_dir=./data/predict_results/tsn/ +``` + +- 通过--weights参数可指定推断需要的权重,示例为./data/checkpoints/TSN_epoch44.pdparams; + +- 通过--label_file参数指定标签文件,请根据自己的数据修改,默认为./label_3396.txt; + +- 预测结果会以日志形式打印出来,同时也保存在json文件中,通过--save_dir参数可指定预测结果保存路径,示例为./data/predict_results/tsn。 + +### 训练加速 +TSN模型默认以mp4的视频文件作为输入,训练时需要先对视频文件解码,再将解码后的数据送入网络进行训练,如果视频文件很大,这个过程将会很耗时。 + +为加速训练,可以先将视频解码成图片,然后保存下来,训练时直接根据索引读取帧图片作为输入,加快训练过程。 + +- 数据准备: 首先将视频解码,存成帧图片;然后生成帧图片的文件路径列表。实现过程可参考[ucf-101数据准备](../../../../dygraph/tsn/data/dataset/ucf101/README.md) + +- 修改配置文件: 修改配置文件./config/tsn.yaml,其中MODEL.format值改为"frames",不同模式下的filelist值改为对应的帧图片文件list。 + + +## 扩展内容 + +- 更多关于TSN模型的内容可参考PaddleCV视频库[TSN视频分类模型](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/video/models/tsn/README.md)。 + +- 更多关于AttentionLSTM模型的内容可参考PaddleCV视频库[AttentionLSTM视频分类模型](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/video/models/attention_lstm)。 + + +## 参考论文 + +- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool + +- [Beyond Short Snippets: Deep Networks for Video Classification](https://arxiv.org/abs/1503.08909) Joe Yue-Hei Ng, Matthew Hausknecht, Sudheendra Vijayanarasimhan, Oriol Vinyals, Rajat Monga, George Toderici diff --git a/docs/src/applications/VideoTag/README.md b/docs/src/applications/VideoTag/README.md new file mode 100644 index 000000000..b33c3666c --- /dev/null +++ b/docs/src/applications/VideoTag/README.md @@ -0,0 +1,31 @@ +# VideoTag 飞桨大规模视频分类模型 + +--- +## 内容 + +- [模型简介](#模型简介) +- [使用方法](#使用方法) + + +## 模型简介 + +飞桨大规模视频分类模型VideoTag基于百度短视频业务千万级数据,支持3000个源于产业实践的实用标签,具有良好的泛化能力,非常适用于国内大规模(千万/亿/十亿级别)短视频分类场景的应用。VideoTag采用两阶段建模方式,即图像建模和序列学习。第一阶段,使用少量视频样本(十万级别)训练大规模视频特征提取模型(Extractor);第二阶段,使用千万级数据训练预测器(Predictor),最终实现在超大规模(千万/亿/十亿级别)短视频上产业应用,其原理示意如下图所示。 + +
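+
+In addition to the schematic below, the two-stage flow can be summarized with a minimal numpy sketch. This is a rough illustration only, not the real models: the random arrays stand in for the TSN extractor and the AttentionLSTM predictor, and only the shapes follow the defaults mentioned elsewhere in these docs (300 sampled frames, 2048-d TSN features, 3396 tags, top-20 output).
+
+```
+import numpy as np
+
+# Stage 1 (TSN extractor): 300 uniformly sampled frames -> one 2048-d feature per frame.
+# A random array stands in for the features the real extractor saves as an .npy file.
+frame_features = np.random.rand(300, 2048).astype("float32")
+
+# Stage 2 (AttentionLSTM predictor): frame-feature sequence -> 3396 tag probabilities.
+# A random projection stands in for the trained predictor; only the shapes are meaningful.
+rng = np.random.default_rng(0)
+w = rng.standard_normal((2048, 3396)).astype("float32")
+logits = frame_features.mean(axis=0) @ w        # pool over time, score every tag
+probs = 1.0 / (1.0 + np.exp(-logits))           # sigmoid -> multi-label probabilities
+
+top20 = np.argsort(-probs)[:20]                 # the kind of top-20 list printed by videotag_test.py
+print(top20[:5], probs[top20[:5]])
+```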

+VideoTag模型示意图 (VideoTag model schematic)
+
    + +- 数据处理:视频是按特定顺序排列的一组图像的集合,这些图像也称为帧。视频分类任务需要先对短视频进行解码,然后再将输出的图像帧序列灌入到VideoTag中进行训练和预测。 + +- 图像建模:先从训练数据中,对每个类别均匀采样少量样本数据,构成十万量级的训练视频。然后使用TSN网络进行训练,提取所有视频帧的TSN模型分类层前一层的特征数据。在这个过程中,每一帧都被转化成相应的特征向量,一段视频被转化成一个特征序列。 + +- 序列学习:采用Attention clusters、LSTM和Nextvlad对特征序列进行建模,学习各个特征之间的组合方式,进一步提高模型准确率。由于序列学习相比于图像建模耗时更短,因此可以融合多个具有互补性的序列模型。示例代码仅使用Attention\_LSTM网络进行序列特征预测。 + +- 预测结果:融合多个模型结果实现视频分类,进一步提高分类准确率。 + + +## 使用方法 +- [1. 如何运行样例代码](./Run.md) +- [2. 如何使用自己的数据进行测试](./Test.md) +- [3. 如何进行模型fine-tune](./FineTune.md) diff --git a/docs/src/applications/VideoTag/Run.md b/docs/src/applications/VideoTag/Run.md new file mode 100644 index 000000000..3ffee7f18 --- /dev/null +++ b/docs/src/applications/VideoTag/Run.md @@ -0,0 +1,109 @@ +# 样例代码运行指南 + +--- +## 内容 +参考本文档,您可以快速熟悉VideoTag的使用方法,观察VideoTag的预训练模型在示例视频上的预测结果。 + +文档内容包括: +- [安装说明](#安装说明) +- [数据准备](#数据准备) +- [模型推断](#模型推断) + + +## 安装说明 + +### 环境依赖: + +``` + CUDA >= 9.0 + cudnn >= 7.5 +``` + +### 依赖安装: + +- 1.7.0 <= PaddlePaddle版本 <= 2.0.0: pip install paddlepaddle-gpu==1.8.4.post97 -i https://mirror.baidu.com/pypi/simple +- opencv版本 >= 4.1.0: pip install opencv-python==4.2.0.32 + +## 数据准备 + +### 预训练权重下载 + +我们提供了[TSN](https://videotag.bj.bcebos.com/video_tag_tsn.tar)和[AttentionLSTM](https://videotag.bj.bcebos.com/video_tag_lstm.tar)预训练权重,请在video\_tag目录下新建weights目录,并将下载解压后的参数文件放在weights目录下: + +``` + mkdir weights + cd weights + wget https://videotag.bj.bcebos.com/video_tag_tsn.tar + wget https://videotag.bj.bcebos.com/video_tag_lstm.tar + tar -zxvf video_tag_tsn.tar + tar -zxvf video_tag_lstm.tar + rm video_tag_tsn.tar -rf + rm video_tag_lstm.tar -rf + mv video_tag_tsn/* . + mv attention_lstm/* . + rm video_tag_tsn/ -rf + rm attention_lstm -rf +``` + +所得目录结构如下: + +``` +video_tag + ├──weights + ├── attention_lstm.pdmodel + ├── attention_lstm.pdopt + ├── attention_lstm.pdparams + ├── tsn.pdmodel + ├── tsn.pdopt + └── tsn.pdparams +``` + +### 示例视频下载 + +我们提供了[样例视频](https://videotag.bj.bcebos.com/mp4.tar)方便用户测试,请下载后解压,并将视频文件放置在video\_tag/data/mp4目录下: + +``` + cd data/ + wget https://videotag.bj.bcebos.com/mp4.tar + tar -zxvf mp4.tar + rm mp4.tar -rf +``` + +所得目录结构如下: + +``` +video_tag + ├──data + ├── mp4 + ├── 1.mp4 + ├── 2.mp4 + └── ... +``` + +## 模型推断 + +模型推断的启动方式如下: + + python videotag_test.py + +- 预测结果会以日志方式打印,示例如下: +``` +[========video_id [ data/mp4/1.mp4 ] , topk(20) preds: ========] +class_id: 3110, class_name: 训练 , probability: 0.97730666399 +class_id: 2159, class_name: 蹲 , probability: 0.945082366467 +... +[========video_id [ data/mp4/2.mp4 ] , topk(20) preds: ========] +class_id: 2773, class_name: 舞蹈 , probability: 0.850423932076 +class_id: 1128, class_name: 表演艺术 , probability: 0.0446354188025 +... +``` + +- 通过--save\_dir可指定预测结果存储路径,默认为video\_tag/data/VideoTag\_results,不同输入视频的预测结果分文件保存在不同的json文件中,文件的内容格式为: + +``` + [file_path, + {"class_name": class_name1, "probability": probability1, "class_id": class_id1}, + {"class_name": class_name2, "probability": probability2, "class_id": class_id2}, + ... + ] +``` diff --git a/docs/src/applications/VideoTag/Test.md b/docs/src/applications/VideoTag/Test.md new file mode 100644 index 000000000..155f7634d --- /dev/null +++ b/docs/src/applications/VideoTag/Test.md @@ -0,0 +1,31 @@ +# 预训练模型自测指南 + +## 内容 +参考本文档,您可以快速测试VideoTag的预训练模型在自己业务数据上的预测效果。 + +主要内容包括: +- [数据准备](#数据准备) +- [模型推断](#模型推断) + +## 数据准备 + +在数据准备阶段,您需要准备好自己的测试数据,并在video\_tag/data/VideoTag\_test.list文件中指定待推断的测试文件路径,内容格式如下: +``` +my_video_path/my_video_file1.mp4 +my_video_path/my_video_file2.mp4 +... 
+``` + +## 模型推断 + +模型推断的启动方式如下: + + python videotag_test.py + +- 目前支持的视频文件输入格式为:mp4、mkv和webm格式; + +- 模型会从输入的视频文件中*均匀抽取300帧*用于预测。对于较长的视频文件,建议先截取有效部分输入模型以提高预测速度; + +- 通过--use\_gpu参数可指定是否使用gpu进行推断,默认使用gpu。对于10s左右的短视频文件,gpu推断时间约为4s; + +- 通过--filelist可指定输入list文件路径,默认为video\_tag/data/VideoTag\_test.list。 diff --git a/docs/src/applications/VideoTag/eval.py b/docs/src/applications/VideoTag/eval.py new file mode 100644 index 000000000..d0bde90c5 --- /dev/null +++ b/docs/src/applications/VideoTag/eval.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import time +import logging +import argparse +import ast +import paddle +import paddle.static as static + +from utils.config_utils import * +import models +from reader import get_reader +from metrics import get_metrics +from utils.utility import check_cuda +from utils.utility import check_version + +logging.root.handlers = [] +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_name', + type=str, + default='AttentionCluster', + help='name of model to train.') + parser.add_argument('--config', + type=str, + default='configs/attention_cluster.txt', + help='path to config file of model') + parser.add_argument( + '--batch_size', + type=int, + default=None, + help='test batch size. 
None to use config file setting.') + parser.add_argument('--use_gpu', + type=ast.literal_eval, + default=True, + help='default use gpu.') + parser.add_argument( + '--weights', + type=str, + default='./data/checkpoints/AttentionLSTM_epoch9.pdparams', + help='weight path.') + parser.add_argument( + '--save_dir', + type=str, + default=os.path.join('data', 'evaluate_results'), + help='output dir path, default to use ./data/evaluate_results') + parser.add_argument('--log_interval', + type=int, + default=1, + help='mini-batch interval to log.') + args = parser.parse_args() + return args + + +def test(args): + # parse config + config = parse_config(args.config) + test_config = merge_configs(config, 'test', vars(args)) + print_configs(test_config, "Test") + use_dali = test_config['TEST'].get('use_dali', False) + + # build model + test_model = models.get_model(args.model_name, test_config, mode='test') + test_model.build_input(use_dataloader=False) + test_model.build_model() + test_feeds = test_model.feeds() + test_fetch_list = test_model.fetches() + + place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + + exe.run(static.default_startup_program()) + + if args.weights: + assert os.path.exists( + args.weights), "Given weight dir {} not exist.".format(args.weights) + weights = args.weights or test_model.get_weights() + + logger.info('load test weights from {}'.format(weights)) + + test_model.load_test_weights(exe, weights, static.default_main_program()) + + # get reader and metrics + test_reader = get_reader(args.model_name.upper(), 'test', test_config) + test_metrics = get_metrics(args.model_name.upper(), 'test', test_config) + + test_feeder = paddle.fluid.DataFeeder(place=place, feed_list=test_feeds) + + epoch_period = [] + for test_iter, data in enumerate(test_reader()): + cur_time = time.time() + test_outs = exe.run(fetch_list=test_fetch_list, + feed=test_feeder.feed(data)) + period = time.time() - cur_time + epoch_period.append(period) + test_metrics.accumulate(test_outs) + + # metric here + if args.log_interval > 0 and test_iter % args.log_interval == 0: + info_str = '[EVAL] Batch {}'.format(test_iter) + test_metrics.calculate_and_log_out(test_outs, info_str) + + if not os.path.isdir(args.save_dir): + os.makedirs(args.save_dir) + test_metrics.finalize_and_log_out("[EVAL] eval finished. ", args.save_dir) + + +if __name__ == "__main__": + args = parse_args() + # check whether the installed paddle is compiled with GPU + check_cuda(args.use_gpu) + check_version() + logger.info(args) + + test(args) diff --git a/docs/src/applications/VideoTag/metrics/__init__.py b/docs/src/applications/VideoTag/metrics/__init__.py new file mode 100644 index 000000000..0d1df762b --- /dev/null +++ b/docs/src/applications/VideoTag/metrics/__init__.py @@ -0,0 +1 @@ +from .metrics_util import get_metrics diff --git a/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py b/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py new file mode 100644 index 000000000..fd187db5b --- /dev/null +++ b/docs/src/applications/VideoTag/metrics/kinetics/accuracy_metrics.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import division + +import numpy as np +import datetime +import logging + +logger = logging.getLogger(__name__) + + +class MetricsCalculator(): + def __init__(self, name, mode): + self.name = name + self.mode = mode # 'train', 'val', 'test' + self.reset() + + def reset(self): + logger.info('Resetting {} metrics...'.format(self.mode)) + self.aggr_acc1 = 0.0 + self.aggr_acc5 = 0.0 + self.aggr_loss = 0.0 + self.aggr_batch_size = 0 + + def finalize_metrics(self): + self.avg_acc1 = self.aggr_acc1 / self.aggr_batch_size + self.avg_acc5 = self.aggr_acc5 / self.aggr_batch_size + self.avg_loss = self.aggr_loss / self.aggr_batch_size + + def get_computed_metrics(self): + json_stats = {} + json_stats['avg_loss'] = self.avg_loss + json_stats['avg_acc1'] = self.avg_acc1 + json_stats['avg_acc5'] = self.avg_acc5 + return json_stats + + def calculate_metrics(self, loss, softmax, labels): + accuracy1 = compute_topk_accuracy(softmax, labels, top_k=1) * 100. + accuracy5 = compute_topk_accuracy(softmax, labels, top_k=5) * 100. + return accuracy1, accuracy5 + + def accumulate(self, loss, softmax, labels): + cur_batch_size = softmax.shape[0] + # if returned loss is None for e.g. test, just set loss to be 0. + if loss is None: + cur_loss = 0. + else: + cur_loss = np.mean(np.array(loss)) # + self.aggr_batch_size += cur_batch_size + self.aggr_loss += cur_loss * cur_batch_size + + accuracy1 = compute_topk_accuracy(softmax, labels, top_k=1) * 100. + accuracy5 = compute_topk_accuracy(softmax, labels, top_k=5) * 100. + self.aggr_acc1 += accuracy1 * cur_batch_size + self.aggr_acc5 += accuracy5 * cur_batch_size + + return + + +# ---------------------------------------------- +# other utils +# ---------------------------------------------- +def compute_topk_correct_hits(top_k, preds, labels): + '''Compute the number of corret hits''' + batch_size = preds.shape[0] + + top_k_preds = np.zeros((batch_size, top_k), dtype=np.float32) + for i in range(batch_size): + top_k_preds[i, :] = np.argsort(-preds[i, :])[:top_k] + + correctness = np.zeros(batch_size, dtype=np.int32) + for i in range(batch_size): + if labels[i] in top_k_preds[i, :].astype(np.int32).tolist(): + correctness[i] = 1 + correct_hits = sum(correctness) + + return correct_hits + + +def compute_topk_accuracy(softmax, labels, top_k): + + computed_metrics = {} + + assert labels.shape[0] == softmax.shape[0], "Batch size mismatch." + aggr_batch_size = labels.shape[0] + aggr_top_k_correct_hits = compute_topk_correct_hits(top_k, softmax, labels) + + # normalize results + computed_metrics = \ + float(aggr_top_k_correct_hits) / aggr_batch_size + + return computed_metrics diff --git a/docs/src/applications/VideoTag/metrics/metrics_util.py b/docs/src/applications/VideoTag/metrics/metrics_util.py new file mode 100644 index 000000000..2e264ed93 --- /dev/null +++ b/docs/src/applications/VideoTag/metrics/metrics_util.py @@ -0,0 +1,279 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import division + +import logging + +import os +import io +import numpy as np +import json +from metrics.youtube8m import eval_util as youtube8m_metrics +from metrics.kinetics import accuracy_metrics as kinetics_metrics + +logger = logging.getLogger(__name__) + + +class Metrics(object): + def __init__(self, name, mode, metrics_args): + """Not implemented""" + pass + + def calculate_and_log_out(self, fetch_list, info=''): + """Not implemented""" + pass + + def accumulate(self, fetch_list, info=''): + """Not implemented""" + pass + + def finalize_and_log_out(self, info='', savedir='./'): + """Not implemented""" + pass + + def reset(self): + """Not implemented""" + pass + + +class Youtube8mMetrics(Metrics): + def __init__(self, name, mode, metrics_args): + self.name = name + self.mode = mode + self.num_classes = metrics_args['MODEL']['num_classes'] + self.topk = metrics_args['MODEL']['topk'] + self.calculator = youtube8m_metrics.EvaluationMetrics( + self.num_classes, self.topk) + if self.mode == 'infer': + self.infer_results = [] + + def calculate_and_log_out(self, fetch_list, info=''): + loss = np.mean(np.array(fetch_list[0])) + pred = np.array(fetch_list[1]) + label = np.array(fetch_list[2]) + hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label) + perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate( + pred, label) + gap = youtube8m_metrics.calculate_gap(pred, label) + logger.info(info + ' , loss = {0}, Hit@1 = {1}, PERR = {2}, GAP = {3}'.format(\ + '%.6f' % loss, '%.2f' % hit_at_one, '%.2f' % perr, '%.2f' % gap)) + + def accumulate(self, fetch_list, info=''): + if self.mode == 'infer': + predictions = np.array(fetch_list[0]) + video_id = fetch_list[1] + for i in range(len(predictions)): + topk_inds = predictions[i].argsort()[0 - self.topk:] + topk_inds = topk_inds[::-1] + preds = predictions[i][topk_inds] + self.infer_results.append( + (video_id[i], topk_inds.tolist(), preds.tolist())) + else: + loss = np.array(fetch_list[0]) + pred = np.array(fetch_list[1]) + label = np.array(fetch_list[2]) + self.calculator.accumulate(loss, pred, label) + + def finalize_and_log_out(self, + info='', + savedir='./data/results', + label_file='./label_3396.txt'): + if self.mode == 'infer': + for index, item in enumerate(self.infer_results): + video_id = item[0] + print('[========video_id [ {} ] , topk({}) preds: ========]\n'. 
+ format(video_id, self.topk)) + + f = io.open(label_file, "r", encoding="utf-8") + fl = f.readlines() + res_list = [] + res_list.append(video_id) + for i in range(len(item[1])): + class_id = item[1][i] + class_prob = item[2][i] + class_name = fl[class_id].split('\n')[0] + print('class_id: {},'.format(class_id), 'class_name:', + class_name, + ', probability: {} \n'.format(class_prob)) + save_dict = { + "'class_id": class_id, + "class_name": class_name, + "probability": class_prob + } + res_list.append(save_dict) + + # save infer result into output dir + with io.open(os.path.join(savedir, + 'result' + str(index) + '.json'), + 'w', + encoding='utf-8') as f: + f.write(json.dumps(res_list, ensure_ascii=False)) + else: + epoch_info_dict = self.calculator.get() + logger.info(info + '\tavg_hit_at_one: {0},\tavg_perr: {1},\tavg_loss :{2},\taps: {3},\tgap:{4}'\ + .format(epoch_info_dict['avg_hit_at_one'], epoch_info_dict['avg_perr'], \ + epoch_info_dict['avg_loss'], epoch_info_dict['aps'], epoch_info_dict['gap'])) + + def reset(self): + self.calculator.clear() + if self.mode == 'infer': + self.infer_results = [] + + +class Kinetics400Metrics(Metrics): + def __init__(self, name, mode, metrics_args): + self.name = name + self.mode = mode + self.topk = metrics_args['MODEL']['topk'] + self.calculator = kinetics_metrics.MetricsCalculator(name, mode.lower()) + if self.mode == 'infer': + self.infer_results = [] + + def calculate_and_log_out(self, fetch_list, info=''): + if len(fetch_list) == 3: + loss = fetch_list[0] + loss = np.mean(np.array(loss)) + pred = np.array(fetch_list[1]) + label = np.array(fetch_list[2]) + else: + loss = 0. + pred = np.array(fetch_list[0]) + label = np.array(fetch_list[1]) + acc1, acc5 = self.calculator.calculate_metrics(loss, pred, label) + logger.info(info + '\tLoss: {},\ttop1_acc: {}, \ttop5_acc: {}'.format('%.6f' % loss, \ + '%.2f' % acc1, '%.2f' % acc5)) + return loss + + def accumulate(self, fetch_list, info=''): + if self.mode == 'infer': + predictions = np.array(fetch_list[0]) + video_id = fetch_list[1] + for i in range(len(predictions)): + topk_inds = predictions[i].argsort()[0 - self.topk:] + topk_inds = topk_inds[::-1] + preds = predictions[i][topk_inds] + self.infer_results.append( + (video_id[i], topk_inds.tolist(), preds.tolist())) + else: + if len(fetch_list) == 3: + loss = fetch_list[0] + loss = np.mean(np.array(loss)) + pred = np.array(fetch_list[1]) + label = np.array(fetch_list[2]) + else: + loss = 0. + pred = np.array(fetch_list[0]) + label = np.array(fetch_list[1]) + self.calculator.accumulate(loss, pred, label) + + def finalize_and_log_out(self, + info='', + savedir='./data/results', + label_file='./label_3396.txt'): + if self.mode == 'infer': + for index, item in enumerate(self.infer_results): + video_id = item[0] + print('[========video_id [ {} ] , topk({}) preds: ========]\n'. 
+ format(video_id, self.topk)) + + f = io.open(label_file, "r", encoding="utf-8") + fl = f.readlines() + res_list = [] + res_list.append(video_id) + for i in range(len(item[1])): + class_id = item[1][i] + class_prob = item[2][i] + class_name = fl[class_id].split('\n')[0] + print('class_id: {},'.format(class_id), 'class_name:', + class_name, + ', probability: {} \n'.format(class_prob)) + save_dict = { + "'class_id": class_id, + "class_name": class_name, + "probability": class_prob + } + res_list.append(save_dict) + + # save infer result into output dir + with io.open(os.path.join(savedir, + 'result' + str(index) + '.json'), + 'w', + encoding='utf-8') as f: + f.write(json.dumps(res_list, ensure_ascii=False)) + else: + self.calculator.finalize_metrics() + metrics_dict = self.calculator.get_computed_metrics() + loss = metrics_dict['avg_loss'] + acc1 = metrics_dict['avg_acc1'] + acc5 = metrics_dict['avg_acc5'] + logger.info(info + '\tLoss: {},\ttop1_acc: {}, \ttop5_acc: {}'.format('%.6f' % loss, \ + '%.2f' % acc1, '%.2f' % acc5)) + + def reset(self): + self.calculator.reset() + if self.mode == 'infer': + self.infer_results = [] + + +class MetricsNotFoundError(Exception): + "Error: metrics not found" + + def __init__(self, metrics_name, avail_metrics): + super(MetricsNotFoundError, self).__init__() + self.metrics_name = metrics_name + self.avail_metrics = avail_metrics + + def __str__(self): + msg = "Metrics {} Not Found.\nAvailiable metrics:\n".format( + self.metrics_name) + for metric in self.avail_metrics: + msg += " {}\n".format(metric) + return msg + + +class MetricsZoo(object): + def __init__(self): + self.metrics_zoo = {} + + def regist(self, name, metrics): + assert metrics.__base__ == Metrics, "Unknow model type {}".format( + type(metrics)) + self.metrics_zoo[name] = metrics + + def get(self, name, mode, cfg): + for k, v in self.metrics_zoo.items(): + if k == name: + return v(name, mode, cfg) + raise MetricsNotFoundError(name, self.metrics_zoo.keys()) + + +# singleton metrics_zoo +metrics_zoo = MetricsZoo() + + +def regist_metrics(name, metrics): + metrics_zoo.regist(name, metrics) + + +def get_metrics(name, mode, cfg): + return metrics_zoo.get(name, mode, cfg) + + +# sort by alphabet +regist_metrics("ATTENTIONLSTM", Youtube8mMetrics) +regist_metrics("TSN", Kinetics400Metrics) diff --git a/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py b/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py new file mode 100644 index 000000000..5e8a71cae --- /dev/null +++ b/docs/src/applications/VideoTag/metrics/youtube8m/average_precision_calculator.py @@ -0,0 +1,274 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate or keep track of the interpolated average precision. + +It provides an interface for calculating interpolated average precision for an +entire list or the top-n ranked items. 
For the definition of the +(non-)interpolated average precision: +http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf + +Example usages: +1) Use it as a static function call to directly calculate average precision for +a short ranked list in the memory. + +``` +import random + +p = np.array([random.random() for _ in xrange(10)]) +a = np.array([random.choice([0, 1]) for _ in xrange(10)]) + +ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a) +``` + +2) Use it as an object for long ranked list that cannot be stored in memory or +the case where partial predictions can be observed at a time (Tensorflow +predictions). In this case, we first call the function accumulate many times +to process parts of the ranked list. After processing all the parts, we call +peek_interpolated_ap_at_n. +``` +p1 = np.array([random.random() for _ in xrange(5)]) +a1 = np.array([random.choice([0, 1]) for _ in xrange(5)]) +p2 = np.array([random.random() for _ in xrange(5)]) +a2 = np.array([random.choice([0, 1]) for _ in xrange(5)]) + +# interpolated average precision at 10 using 1000 break points +calculator = average_precision_calculator.AveragePrecisionCalculator(10) +calculator.accumulate(p1, a1) +calculator.accumulate(p2, a2) +ap3 = calculator.peek_ap_at_n() +``` +""" + +import heapq +import random +import numbers + +import numpy + + +class AveragePrecisionCalculator(object): + """Calculate the average precision and average precision at n.""" + def __init__(self, top_n=None): + """Construct an AveragePrecisionCalculator to calculate average precision. + + This class is used to calculate the average precision for a single label. + + Args: + top_n: A positive Integer specifying the average precision at n, or + None to use all provided data points. + + Raises: + ValueError: An error occurred when the top_n is not a positive integer. + """ + if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None): + raise ValueError("top_n must be a positive integer or None.") + + self._top_n = top_n # average precision at n + self._total_positives = 0 # total number of positives have seen + self._heap = [] # max heap of (prediction, actual) + + @property + def heap_size(self): + """Gets the heap size maintained in the class.""" + return len(self._heap) + + @property + def num_accumulated_positives(self): + """Gets the number of positive samples that have been accumulated.""" + return self._total_positives + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + After the function call, we may call peek_ap_at_n to actually calculate + the average precision. + Note predictions and actuals must have the same shape. + + Args: + predictions: a list storing the prediction scores. + actuals: a list storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives = If the 'predictions' and 'actuals' inputs aren't complete, + then it's possible some true positives were missed in them. In that case, + you can provide 'num_positives' in order to accurately track recall. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. 
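+        Example (illustrative): accumulate(numpy.array([0.9, 0.3]), numpy.array([1, 0])) records one positive and pushes both (prediction, actual) pairs onto the internal heap.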
+ """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if not num_positives is None: + if not isinstance(num_positives, + numbers.Number) or num_positives < 0: + raise ValueError( + "'num_positives' was provided but it wan't a nonzero number." + ) + + if not num_positives is None: + self._total_positives += num_positives + else: + self._total_positives += numpy.size(numpy.where(actuals > 0)) + topk = self._top_n + heap = self._heap + + for i in range(numpy.size(predictions)): + if topk is None or len(heap) < topk: + heapq.heappush(heap, (predictions[i], actuals[i])) + else: + if predictions[i] > heap[0][0]: # heap[0] is the smallest + heapq.heappop(heap) + heapq.heappush(heap, (predictions[i], actuals[i])) + + def clear(self): + """Clear the accumulated predictions.""" + self._heap = [] + self._total_positives = 0 + + def peek_ap_at_n(self): + """Peek the non-interpolated average precision at n. + + Returns: + The non-interpolated average precision at n (default 0). + If n is larger than the length of the ranked list, + the average precision will be returned. + """ + if self.heap_size <= 0: + return 0 + predlists = numpy.array(list(zip(*self._heap))) + + ap = self.ap_at_n(predlists[0], + predlists[1], + n=self._top_n, + total_num_positives=self._total_positives) + return ap + + @staticmethod + def ap(predictions, actuals): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. + """ + return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None) + + @staticmethod + def ap_at_n(predictions, actuals, n=20, total_num_positives=None): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + n: the top n items to be considered in ap@n. + total_num_positives : (optionally) you can specify the number of total + positive + in the list. If specified, it will be used in calculation. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when + 1) the format of the input is not the numpy 1-D array; + 2) the shape of predictions and actuals does not match; + 3) the input n is not a positive integer. + """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if n is not None: + if not isinstance(n, int) or n <= 0: + raise ValueError("n must be 'None' or a positive integer." + " It was '%s'." 
% n) + + ap = 0.0 + + predictions = numpy.array(predictions) + actuals = numpy.array(actuals) + + # add a shuffler to avoid overestimating the ap + predictions, actuals = AveragePrecisionCalculator._shuffle( + predictions, actuals) + sortidx = sorted(range(len(predictions)), + key=lambda k: predictions[k], + reverse=True) + + if total_num_positives is None: + numpos = numpy.size(numpy.where(actuals > 0)) + else: + numpos = total_num_positives + + if numpos == 0: + return 0 + + if n is not None: + numpos = min(numpos, n) + delta_recall = 1.0 / numpos + poscount = 0.0 + + # calculate the ap + r = len(sortidx) + if n is not None: + r = min(r, n) + for i in range(r): + if actuals[sortidx[i]] > 0: + poscount += 1 + ap += poscount / (i + 1) * delta_recall + return ap + + @staticmethod + def _shuffle(predictions, actuals): + random.seed(0) + suffidx = random.sample(range(len(predictions)), len(predictions)) + predictions = predictions[suffidx] + actuals = actuals[suffidx] + return predictions, actuals + + @staticmethod + def _zero_one_normalize(predictions, epsilon=1e-7): + """Normalize the predictions to the range between 0.0 and 1.0. + + For some predictions like SVM predictions, we need to normalize them before + calculate the interpolated average precision. The normalization will not + change the rank in the original list and thus won't change the average + precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + epsilon: a small constant to avoid denominator being zero. + + Returns: + The normalized prediction. + """ + denominator = numpy.max(predictions) - numpy.min(predictions) + ret = (predictions - numpy.min(predictions)) / numpy.max( + denominator, epsilon) + return ret diff --git a/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py b/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py new file mode 100644 index 000000000..5a78d1f3b --- /dev/null +++ b/docs/src/applications/VideoTag/metrics/youtube8m/eval_util.py @@ -0,0 +1,244 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provides functions to help with evaluating models.""" +import datetime +import numpy + +from . import mean_average_precision_calculator as map_calculator +from . import average_precision_calculator as ap_calculator + + +def flatten(l): + """ Merges a list of lists into a single list. """ + return [item for sublist in l for item in sublist] + + +def calculate_hit_at_one(predictions, actuals): + """Performs a local (numpy) calculation of the hit at one. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average hit at one across the entire batch. 
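+    Example (illustrative): predictions=[[0.2, 0.7, 0.1]] with actuals=[[0, 1, 0]] gives a hit at one of 1.0, since the highest-scoring class of the only video is a ground-truth label.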
+ """ + top_prediction = numpy.argmax(predictions, 1) + hits = actuals[numpy.arange(actuals.shape[0]), top_prediction] + return numpy.average(hits) + + +def calculate_precision_at_equal_recall_rate(predictions, actuals): + """Performs a local (numpy) calculation of the PERR. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average precision at equal recall rate across the entire batch. + """ + aggregated_precision = 0.0 + num_videos = actuals.shape[0] + for row in numpy.arange(num_videos): + num_labels = int(numpy.sum(actuals[row])) + top_indices = numpy.argpartition(predictions[row], + -num_labels)[-num_labels:] + item_precision = 0.0 + for label_index in top_indices: + if predictions[row][label_index] > 0: + item_precision += actuals[row][label_index] + item_precision /= top_indices.size + aggregated_precision += item_precision + aggregated_precision /= num_videos + return aggregated_precision + + +def calculate_gap(predictions, actuals, top_k=20): + """Performs a local (numpy) calculation of the global average precision. + + Only the top_k predictions are taken for each of the videos. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + top_k: How many predictions to use per video. + + Returns: + float: The global average precision. + """ + gap_calculator = ap_calculator.AveragePrecisionCalculator() + sparse_predictions, sparse_labels, num_positives = top_k_by_class( + predictions, actuals, top_k) + gap_calculator.accumulate(flatten(sparse_predictions), + flatten(sparse_labels), sum(num_positives)) + return gap_calculator.peek_ap_at_n() + + +def top_k_by_class(predictions, labels, k=20): + """Extracts the top k predictions for each video, sorted by class. + + Args: + predictions: A numpy matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + k: the top k non-zero entries to preserve in each prediction. + + Returns: + A tuple (predictions,labels, true_positives). 'predictions' and 'labels' + are lists of lists of floats. 'true_positives' is a list of scalars. The + length of the lists are equal to the number of classes. The entries in the + predictions variable are probability predictions, and + the corresponding entries in the labels variable are the ground truth for + those predictions. The entries in 'true_positives' are the number of true + positives for each class in the ground truth. + + Raises: + ValueError: An error occurred when the k is not a positive integer. 
+ """ + if k <= 0: + raise ValueError("k must be a positive integer.") + k = min(k, predictions.shape[1]) + num_classes = predictions.shape[1] + prediction_triplets = [] + for video_index in range(predictions.shape[0]): + prediction_triplets.extend( + top_k_triplets(predictions[video_index], labels[video_index], k)) + out_predictions = [[] for v in range(num_classes)] + out_labels = [[] for v in range(num_classes)] + for triplet in prediction_triplets: + out_predictions[triplet[0]].append(triplet[1]) + out_labels[triplet[0]].append(triplet[2]) + out_true_positives = [numpy.sum(labels[:, i]) for i in range(num_classes)] + + return out_predictions, out_labels, out_true_positives + + +def top_k_triplets(predictions, labels, k=20): + """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in + (prediction, class) format""" + m = len(predictions) + k = min(k, m) + indices = numpy.argpartition(predictions, -k)[-k:] + return [(index, predictions[index], labels[index]) for index in indices] + + +class EvaluationMetrics(object): + """A class to store the evaluation metrics.""" + def __init__(self, num_class, top_k): + """Construct an EvaluationMetrics object to store the evaluation metrics. + + Args: + num_class: A positive integer specifying the number of classes. + top_k: A positive integer specifying how many predictions are considered per video. + + Raises: + ValueError: An error occurred when MeanAveragePrecisionCalculator cannot + not be constructed. + """ + self.sum_hit_at_one = 0.0 + self.sum_perr = 0.0 + self.sum_loss = 0.0 + self.map_calculator = map_calculator.MeanAveragePrecisionCalculator( + num_class) + self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator() + self.top_k = top_k + self.num_examples = 0 + + #def accumulate(self, predictions, labels, loss): + def accumulate(self, loss, predictions, labels): + """Accumulate the metrics calculated locally for this mini-batch. + + Args: + predictions: A numpy matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + labels: A numpy matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + loss: A numpy array containing the loss for each sample. + + Returns: + dictionary: A dictionary storing the metrics for the mini-batch. + + Raises: + ValueError: An error occurred when the shape of predictions and actuals + does not match. + """ + batch_size = labels.shape[0] + mean_hit_at_one = calculate_hit_at_one(predictions, labels) + mean_perr = calculate_precision_at_equal_recall_rate( + predictions, labels) + mean_loss = numpy.mean(loss) + + # Take the top 20 predictions. + sparse_predictions, sparse_labels, num_positives = top_k_by_class( + predictions, labels, self.top_k) + self.map_calculator.accumulate(sparse_predictions, sparse_labels, + num_positives) + self.global_ap_calculator.accumulate(flatten(sparse_predictions), + flatten(sparse_labels), + sum(num_positives)) + + self.num_examples += batch_size + self.sum_hit_at_one += mean_hit_at_one * batch_size + self.sum_perr += mean_perr * batch_size + self.sum_loss += mean_loss * batch_size + + return { + "hit_at_one": mean_hit_at_one, + "perr": mean_perr, + "loss": mean_loss + } + + def get(self): + """Calculate the evaluation metrics for the whole epoch. + + Raises: + ValueError: If no examples were accumulated. + + Returns: + dictionary: a dictionary storing the evaluation metrics for the epoch. The + dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss, and + aps (default nan). 
+ """ + if self.num_examples <= 0: + raise ValueError("total_sample must be positive.") + avg_hit_at_one = self.sum_hit_at_one / self.num_examples + avg_perr = self.sum_perr / self.num_examples + avg_loss = self.sum_loss / self.num_examples + + aps = self.map_calculator.peek_map_at_n() + gap = self.global_ap_calculator.peek_ap_at_n() + + epoch_info_dict = {} + return { + "avg_hit_at_one": avg_hit_at_one, + "avg_perr": avg_perr, + "avg_loss": avg_loss, + "aps": aps, + "gap": gap + } + + def clear(self): + """Clear the evaluation metrics and reset the EvaluationMetrics object.""" + self.sum_hit_at_one = 0.0 + self.sum_perr = 0.0 + self.sum_loss = 0.0 + self.map_calculator.clear() + self.global_ap_calculator.clear() + self.num_examples = 0 diff --git a/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py b/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py new file mode 100644 index 000000000..bf26db25e --- /dev/null +++ b/docs/src/applications/VideoTag/metrics/youtube8m/mean_average_precision_calculator.py @@ -0,0 +1,113 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate the mean average precision. + +It provides an interface for calculating mean average precision +for an entire list or the top-n ranked items. + +Example usages: +We first call the function accumulate many times to process parts of the ranked +list. After processing all the parts, we call peek_map_at_n +to calculate the mean average precision. + +``` +import random + +p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)]) +a = np.array([[random.choice([0, 1]) for _ in xrange(50)] + for _ in xrange(1000)]) + +# mean average precision for 50 classes. +calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator( + num_class=50) +calculator.accumulate(p, a) +aps = calculator.peek_map_at_n() +``` +""" + +import numpy +from . import average_precision_calculator + + +class MeanAveragePrecisionCalculator(object): + """This class is to calculate mean average precision. + """ + def __init__(self, num_class): + """Construct a calculator to calculate the (macro) average precision. + + Args: + num_class: A positive Integer specifying the number of classes. + top_n_array: A list of positive integers specifying the top n for each + class. The top n in each class will be used to calculate its average + precision at n. + The size of the array must be num_class. + + Raises: + ValueError: An error occurred when num_class is not a positive integer; + or the top_n_array is not a list of positive integers. 
+ """ + if not isinstance(num_class, int) or num_class <= 1: + raise ValueError("num_class must be a positive integer.") + + self._ap_calculators = [] # member of AveragePrecisionCalculator + self._num_class = num_class # total number of classes + for i in range(num_class): + self._ap_calculators.append( + average_precision_calculator.AveragePrecisionCalculator()) + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + Args: + predictions: A list of lists storing the prediction scores. The outer + dimension corresponds to classes. + actuals: A list of lists storing the ground truth labels. The dimensions + should correspond to the predictions input. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives: If provided, it is a list of numbers representing the + number of true positives for each class. If not provided, the number of + true positives will be inferred from the 'actuals' array. + + Raises: + ValueError: An error occurred when the shape of predictions and actuals + does not match. + """ + if not num_positives: + num_positives = [None for i in predictions.shape[1]] + + calculators = self._ap_calculators + for i in range(len(predictions)): + calculators[i].accumulate(predictions[i], actuals[i], + num_positives[i]) + + def clear(self): + for calculator in self._ap_calculators: + calculator.clear() + + def is_empty(self): + return ([calculator.heap_size for calculator in self._ap_calculators + ] == [0 for _ in range(self._num_class)]) + + def peek_map_at_n(self): + """Peek the non-interpolated mean average precision at n. + + Returns: + An array of non-interpolated average precision at n (default 0) for each + class. + """ + aps = [ + self._ap_calculators[i].peek_ap_at_n() + for i in range(self._num_class) + ] + return aps diff --git a/docs/src/applications/VideoTag/models/__init__.py b/docs/src/applications/VideoTag/models/__init__.py new file mode 100644 index 000000000..4a3adbbfb --- /dev/null +++ b/docs/src/applications/VideoTag/models/__init__.py @@ -0,0 +1,7 @@ +from .model import regist_model, get_model +from .attention_lstm import AttentionLSTM +from .tsn import TSN + +# regist models, sort by alphabet +regist_model("AttentionLSTM", AttentionLSTM) +regist_model("TSN", TSN) diff --git a/docs/src/applications/VideoTag/models/attention_lstm/__init__.py b/docs/src/applications/VideoTag/models/attention_lstm/__init__.py new file mode 100644 index 000000000..cb872f0e4 --- /dev/null +++ b/docs/src/applications/VideoTag/models/attention_lstm/__init__.py @@ -0,0 +1 @@ +from .attention_lstm import * diff --git a/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py b/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py new file mode 100644 index 000000000..17d3f35e2 --- /dev/null +++ b/docs/src/applications/VideoTag/models/attention_lstm/attention_lstm.py @@ -0,0 +1,180 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+#See the License for the specific language governing permissions and +#limitations under the License. + + +from ..model import ModelBase +from .lstm_attention import LSTMAttentionModel + +import logging +import paddle +import paddle.static as static +logger = logging.getLogger(__name__) + +__all__ = ["AttentionLSTM"] + + +class AttentionLSTM(ModelBase): + def __init__(self, name, cfg, mode='train', is_videotag=False): + super(AttentionLSTM, self).__init__(name, cfg, mode) + self.is_videotag = is_videotag + self.get_config() + + def get_config(self): + # get model configs + self.feature_names = self.cfg.MODEL.feature_names + self.feature_dims = self.cfg.MODEL.feature_dims + self.num_classes = self.cfg.MODEL.num_classes + self.embedding_size = self.cfg.MODEL.embedding_size + self.lstm_size = self.cfg.MODEL.lstm_size + self.drop_rate = self.cfg.MODEL.drop_rate + + # get mode configs + self.batch_size = self.get_config_from_sec(self.mode, 'batch_size', 1) + self.num_gpus = self.get_config_from_sec(self.mode, 'num_gpus', 1) + + if self.mode == 'train': + self.learning_rate = self.get_config_from_sec( + 'train', 'learning_rate', 1e-3) + self.weight_decay = self.get_config_from_sec( + 'train', 'weight_decay', 8e-4) + self.num_samples = self.get_config_from_sec('train', 'num_samples', + 5000000) + self.decay_epochs = self.get_config_from_sec( + 'train', 'decay_epochs', [5]) + self.decay_gamma = self.get_config_from_sec('train', 'decay_gamma', + 0.1) + + def build_input(self, use_dataloader): + self.feature_input = [] + for name, dim in zip(self.feature_names, self.feature_dims): + self.feature_input.append( + static.data(shape=[None, dim], + lod_level=1, + dtype='float32', + name=name)) + if self.mode != 'infer': + self.label_input = static.data(shape=[None, self.num_classes], + dtype='float32', + name='label') + else: + self.label_input = None + if use_dataloader: + assert self.mode != 'infer', \ + 'dataloader is not recommendated when infer, please set use_dataloader to be false.' 
+ self.dataloader = paddle.io.DataLoader.from_generator( + feed_list=self.feature_input + [self.label_input], + capacity=8, + iterable=True) + + def build_model(self): + att_outs = [] + for i, (input_dim, + feature) in enumerate(zip(self.feature_dims, + self.feature_input)): + att = LSTMAttentionModel(input_dim, self.embedding_size, + self.lstm_size, self.drop_rate) + att_out = att.forward(feature, is_training=(self.mode == 'train')) + att_outs.append(att_out) + if len(att_outs) > 1: + out = paddle.concat(x=att_outs, axis=1) + else: + out = att_outs[0] # video only, without audio in videoTag + + fc1 = static.nn.fc( + x=out, + size=8192, + activation='relu', + bias_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(coeff=0.0), + initializer=paddle.nn.initializer.Normal(std=0.0)), + name='fc1') + fc2 = static.nn.fc( + x=fc1, + size=4096, + activation='tanh', + bias_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(coeff=0.0), + initializer=paddle.nn.initializer.Normal(std=0.0)), + name='fc2') + + self.logit = static.nn.fc(x=fc2, size=self.num_classes, activation=None, \ + bias_attr=paddle.ParamAttr(regularizer=paddle.regularizer.L2Decay(coeff=0.0), + initializer=paddle.nn.initializer.Normal(std=0.0)), name='output') + + self.output = paddle.nn.functional.sigmoid(self.logit) + + def optimizer(self): + assert self.mode == 'train', "optimizer can only be built in train mode" + values = [ + self.learning_rate * (self.decay_gamma**i) + for i in range(len(self.decay_epochs) + 1) + ] + iter_per_epoch = self.num_samples / self.batch_size + boundaries = [e * iter_per_epoch for e in self.decay_epochs] + return paddle.optimizer.RMSProp( + learning_rate=paddle.optimizer.lr.PiecewiseDecay(values=values, + boundaries=boundaries), + centered=True, + weight_decay=paddle.regularizer.L2Decay(coeff=self.weight_decay)) + + def loss(self): + assert self.mode != 'infer', "invalid loss calculation in infer mode" + cost = paddle.nn.functional.binary_cross_entropy( + input=self.logit, label=self.label_input, reduction='none') + cost = paddle.sum(x=cost, axis=-1) + sum_cost = paddle.sum(x=cost) + self.loss_ = paddle.scale(sum_cost, + scale=self.num_gpus, + bias_after_scale=False) + return self.loss_ + + def outputs(self): + return [self.output, self.logit] + + def feeds(self): + return self.feature_input if self.mode == 'infer' else self.feature_input + [ + self.label_input + ] + + def fetches(self): + if self.mode == 'train' or self.mode == 'valid': + losses = self.loss() + fetch_list = [losses, self.output, self.label_input] + elif self.mode == 'test': + losses = self.loss() + fetch_list = [losses, self.output, self.label_input] + elif self.mode == 'infer': + fetch_list = [self.output] + else: + raise NotImplementedError('mode {} not implemented'.format( + self.mode)) + + return fetch_list + + def weights_info(self): + return None, None + + def load_pretrain_params(self, exe, pretrain, prog): + logger.info( + "Load pretrain weights from {}, exclude fc layer.".format(pretrain)) + + state_dict = paddle.static.load_program_state(pretrain) + dict_keys = list(state_dict.keys()) + for name in dict_keys: + if "fc_0" in name: + del state_dict[name] + logger.info( + 'Delete {} from pretrained parameters. Do not load it'.
+ format(name)) + paddle.static.set_program_state(prog, state_dict) diff --git a/docs/src/applications/VideoTag/models/attention_lstm/lstm_attention.py b/docs/src/applications/VideoTag/models/attention_lstm/lstm_attention.py new file mode 100644 index 000000000..2862f4b27 --- /dev/null +++ b/docs/src/applications/VideoTag/models/attention_lstm/lstm_attention.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import numpy as np +import paddle +import paddle.static as static + + +class LSTMAttentionModel(object): + """LSTM Attention Model""" + def __init__(self, + bias_attr, + embedding_size=512, + lstm_size=1024, + drop_rate=0.5): + self.lstm_size = lstm_size + self.embedding_size = embedding_size + self.drop_rate = drop_rate + + def forward(self, input, is_training): + input_fc = static.nn.fc( + x=input, + size=self.embedding_size, + activation='tanh', + bias_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(coeff=0.0), + initializer=paddle.nn.initializer.Normal(std=0.0)), + name='rgb_fc') + + lstm_forward_fc = static.nn.fc( + x=input_fc, + size=self.lstm_size * 4, + activation=None, + bias_attr=False, # video_tag + name='rgb_fc_forward') + + lstm_forward, _ = paddle.fluid.layers.dynamic_lstm(input=lstm_forward_fc, + size=self.lstm_size * 4, + is_reverse=False, + name='rgb_lstm_forward') + + lsmt_backward_fc = static.nn.fc( + x=input_fc, + size=self.lstm_size * 4, + activation=None, + bias_attr=False, #video_tag + name='rgb_fc_backward') + + lstm_backward, _ = paddle.fluid.layers.dynamic_lstm(input=lsmt_backward_fc, + size=self.lstm_size * 4, + is_reverse=True, + name='rgb_lstm_backward') + + lstm_concat = paddle.concat(x=[lstm_forward, lstm_backward], + axis=1) + + lstm_dropout = paddle.nn.functional.dropout2d(x=lstm_concat, + p=self.drop_rate, + training=is_training) + + lstm_weight = static.nn.fc( + x=lstm_dropout, + size=1, + activation='sequence_softmax', + bias_attr=False, #video_tag + name='rgb_weight') + + scaled = paddle.multiply(x=lstm_dropout, + y=lstm_weight) + lstm_pool = paddle.static.nn.sequence_pool(input=scaled, pool_type='sum') + + return lstm_pool diff --git a/docs/src/applications/VideoTag/models/model.py b/docs/src/applications/VideoTag/models/model.py new file mode 100644 index 000000000..bfcdccea8 --- /dev/null +++ b/docs/src/applications/VideoTag/models/model.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
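The `LSTMAttentionModel` above scores each timestep of the concatenated forward/backward LSTM outputs with a size-1 fc, turns the scores into softmax weights over time, and sums the weighted outputs. A minimal NumPy sketch of that pooling arithmetic (the sizes `T`, `D` and the random weight vector `w` are illustrative stand-ins, not values from the code):

```
import numpy as np

T, D = 6, 8                                # T timesteps, D = 2 * lstm_size after concat
rng = np.random.default_rng(0)
lstm_out = rng.standard_normal((T, D))     # stand-in for the concatenated LSTM outputs
w = rng.standard_normal((D, 1))            # stand-in for the learned 'rgb_weight' fc

scores = lstm_out @ w                      # one scalar score per timestep, shape (T, 1)
weights = np.exp(scores - scores.max())    # softmax over the time axis
weights /= weights.sum()
pooled = (lstm_out * weights).sum(axis=0)  # attention-weighted sum over time, shape (D,)
print(pooled.shape)
```

The real graph expresses the same steps with LoD sequence ops (the `sequence_softmax` activation and `sequence_pool` with sum pooling), which is what lets it handle variable-length videos in a single batch.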
+ +import os +import wget +import logging +import paddle +import paddle.static as static +try: + from configparser import ConfigParser +except: + from ConfigParser import ConfigParser + +from .utils import download, AttrDict + +WEIGHT_DIR = os.path.join(os.path.expanduser('~'), '.paddle', 'weights') + +logger = logging.getLogger(__name__) + + +def is_parameter(var): + return isinstance(var, paddle.framework.Parameter) + + +class NotImplementError(Exception): + "Error: model function not implement" + + def __init__(self, model, function): + super(NotImplementError, self).__init__() + self.model = model.__class__.__name__ + self.function = function.__name__ + + def __str__(self): + return "Function {}() is not implemented in model {}".format( + self.function, self.model) + + +class ModelNotFoundError(Exception): + "Error: model not found" + + def __init__(self, model_name, avail_models): + super(ModelNotFoundError, self).__init__() + self.model_name = model_name + self.avail_models = avail_models + + def __str__(self): + msg = "Model {} Not Found.\nAvailiable models:\n".format( + self.model_name) + for model in self.avail_models: + msg += " {}\n".format(model) + return msg + + +class ModelBase(object): + def __init__(self, name, cfg, mode='train'): + assert mode in ['train', 'valid', 'test', 'infer'], \ + "Unknown mode type {}".format(mode) + self.name = name + self.is_training = (mode == 'train') + self.mode = mode + self.cfg = cfg + self.dataloader = None + + def build_model(self): + "build model struct" + raise NotImplementError(self, self.build_model) + + def build_input(self, use_dataloader): + "build input Variable" + raise NotImplementError(self, self.build_input) + + def optimizer(self): + "get model optimizer" + raise NotImplementError(self, self.optimizer) + + def outputs(self): + "get output variable" + raise NotImplementError(self, self.outputs) + + def loss(self): + "get loss variable" + raise NotImplementError(self, self.loss) + + def feeds(self): + "get feed inputs list" + raise NotImplementError(self, self.feeds) + + def fetches(self): + "get fetch list of model" + raise NotImplementError(self, self.fetches) + + def weights_info(self): + "get model weight default path and download url" + raise NotImplementError(self, self.weights_info) + + def get_weights(self): + "get model weight file path, download weight from Paddle if not exist" + path, url = self.weights_info() + path = os.path.join(WEIGHT_DIR, path) + if not os.path.isdir(WEIGHT_DIR): + logger.info('{} not exists, will be created automatically.'.format( + WEIGHT_DIR)) + os.makedirs(WEIGHT_DIR) + if os.path.exists(path): + return path + + logger.info("Download weights of {} from {}".format(self.name, url)) + wget.download(url, path) + return path + + def dataloader(self): + return self.dataloader + + def epoch_num(self): + "get train epoch num" + return self.cfg.TRAIN.epoch + + def pretrain_info(self): + "get pretrain base model directory" + return (None, None) + + def get_pretrain_weights(self): + "get model weight file path, download weight from Paddle if not exist" + path, url = self.pretrain_info() + if not path: + return None + + path = os.path.join(WEIGHT_DIR, path) + if not os.path.isdir(WEIGHT_DIR): + logger.info('{} not exists, will be created automatically.'.format( + WEIGHT_DIR)) + os.makedirs(WEIGHT_DIR) + if os.path.exists(path): + return path + + logger.info("Download pretrain weights of {} from {}".format( + self.name, url)) + download(url, path) + return path + + def load_pretrain_params(self, exe, 
pretrain, prog): + logger.info("Load pretrain weights from {}".format(pretrain)) + state_dict = paddle.static.load_program_state(pretrain) + paddle.static.set_program_state(prog, state_dict) + + def load_test_weights(self, exe, weights, prog): + params_list = list(filter(is_parameter, prog.list_vars())) + static.load(prog, weights, executor=exe, var_list=params_list) + + def get_config_from_sec(self, sec, item, default=None): + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + +class ModelZoo(object): + def __init__(self): + self.model_zoo = {} + + def regist(self, name, model): + assert model.__base__ == ModelBase, "Unknow model type {}".format( + type(model)) + self.model_zoo[name] = model + + def get(self, name, cfg, mode='train', is_videotag=False): + for k, v in self.model_zoo.items(): + if k.upper() == name.upper(): + return v(name, cfg, mode, is_videotag) + raise ModelNotFoundError(name, self.model_zoo.keys()) + + +# singleton model_zoo +model_zoo = ModelZoo() + + +def regist_model(name, model): + model_zoo.regist(name, model) + + +def get_model(name, cfg, mode='train', is_videotag=False): + return model_zoo.get(name, cfg, mode, is_videotag) diff --git a/docs/src/applications/VideoTag/models/tsn/__init__.py b/docs/src/applications/VideoTag/models/tsn/__init__.py new file mode 100644 index 000000000..bd57d2687 --- /dev/null +++ b/docs/src/applications/VideoTag/models/tsn/__init__.py @@ -0,0 +1 @@ +from .tsn import * diff --git a/docs/src/applications/VideoTag/models/tsn/tsn.py b/docs/src/applications/VideoTag/models/tsn/tsn.py new file mode 100644 index 000000000..e67653d2e --- /dev/null +++ b/docs/src/applications/VideoTag/models/tsn/tsn.py @@ -0,0 +1,165 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
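`ModelZoo` above is a plain name-to-class registry with case-insensitive lookup, populated at import time by `regist_model` in `models/__init__.py` and queried through `get_model`. A condensed, Paddle-free sketch of the same pattern (the `_Zoo` and `DummyModel` names are illustrative only):

```
class _Zoo(object):
    def __init__(self):
        self._registry = {}          # model name -> model class

    def regist(self, name, cls):
        self._registry[name] = cls

    def get(self, name, *args, **kwargs):
        for key, cls in self._registry.items():
            if key.upper() == name.upper():   # lookup ignores case, like ModelZoo.get
                return cls(*args, **kwargs)
        raise KeyError("model {} not registered".format(name))


class DummyModel(object):
    def __init__(self, cfg=None):
        self.cfg = cfg


zoo = _Zoo()
zoo.regist("AttentionLSTM", DummyModel)
model = zoo.get("attentionlstm", cfg={"num_classes": 3396})
print(type(model).__name__)   # DummyModel
```

This is why `train.py` and `predict.py` can pass `args.model_name` straight through without worrying about its capitalization.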
+ + +from ..model import ModelBase +from .tsn_res_model import TSN_ResNet + +import logging +import paddle +import paddle.static as static +logger = logging.getLogger(__name__) + +__all__ = ["TSN"] + + +class TSN(ModelBase): + def __init__(self, name, cfg, mode='train', is_videotag=False): + super(TSN, self).__init__(name, cfg, mode=mode) + self.is_videotag = is_videotag + self.get_config() + + def get_config(self): + self.num_classes = self.get_config_from_sec('model', 'num_classes') + self.seg_num = self.get_config_from_sec('model', 'seg_num') + self.seglen = self.get_config_from_sec('model', 'seglen') + self.image_mean = self.get_config_from_sec('model', 'image_mean') + self.image_std = self.get_config_from_sec('model', 'image_std') + self.num_layers = self.get_config_from_sec('model', 'num_layers') + + self.num_epochs = self.get_config_from_sec('train', 'epoch') + self.total_videos = self.get_config_from_sec('train', 'total_videos') + self.base_learning_rate = self.get_config_from_sec( + 'train', 'learning_rate') + self.learning_rate_decay = self.get_config_from_sec( + 'train', 'learning_rate_decay') + self.l2_weight_decay = self.get_config_from_sec('train', + 'l2_weight_decay') + self.momentum = self.get_config_from_sec('train', 'momentum') + + self.seg_num = self.get_config_from_sec(self.mode, 'seg_num', + self.seg_num) + self.target_size = self.get_config_from_sec(self.mode, 'target_size') + self.batch_size = self.get_config_from_sec(self.mode, 'batch_size') + + def build_input(self, use_dataloader=True): + image_shape = [3, self.target_size, self.target_size] + image_shape[0] = image_shape[0] * self.seglen + image_shape = [None, self.seg_num] + image_shape + self.use_dataloader = use_dataloader + + image = static.data(name='image', shape=image_shape, dtype='float32') + if self.mode != 'infer': + label = static.data(name='label', shape=[None, 1], dtype='int64') + else: + label = None + + if use_dataloader: + assert self.mode != 'infer', \ + 'dataloader is not recommendated when infer, please set use_dataloader to be false.' 
+ self.dataloader = paddle.io.DataLoader.from_generator( + feed_list=[image, label], capacity=4, iterable=True) + + self.feature_input = [image] + self.label_input = label + + def create_model_args(self): + cfg = {} + cfg['layers'] = self.num_layers + cfg['class_dim'] = self.num_classes + cfg['seg_num'] = self.seg_num + return cfg + + def build_model(self): + cfg = self.create_model_args() + videomodel = TSN_ResNet(layers=cfg['layers'], + seg_num=cfg['seg_num'], + is_training=(self.mode == 'train'), + is_extractor=self.is_videotag) + out = videomodel.net(input=self.feature_input[0], + class_dim=cfg['class_dim']) + self.network_outputs = [out] + + def optimizer(self): + assert self.mode == 'train', "optimizer only can be get in train mode" + epoch_points = [self.num_epochs / 3, self.num_epochs * 2 / 3] + total_videos = self.total_videos + step = int(total_videos / self.batch_size + 1) + bd = [e * step for e in epoch_points] + base_lr = self.base_learning_rate + lr_decay = self.learning_rate_decay + lr = [base_lr, base_lr * lr_decay, base_lr * lr_decay * lr_decay] + l2_weight_decay = self.l2_weight_decay + momentum = self.momentum + optimizer = paddle.optimizer.Momentum( + learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, + values=lr), + momentum=momentum, + weight_decay=paddle.regularizer.L2Decay(coeff=l2_weight_decay)) + + return optimizer + + def loss(self): + assert self.mode != 'infer', "invalid loss calculationg in infer mode" + cost = paddle.nn.functional.cross_entropy(input=self.network_outputs[0], \ + label=self.label_input, ignore_index=-1) + self.loss_ = paddle.mean(x=cost) + return self.loss_ + + def outputs(self): + return self.network_outputs + + def feeds(self): + return self.feature_input if self.mode == 'infer' else self.feature_input + [ + self.label_input + ] + + def fetches(self): + if self.mode == 'train' or self.mode == 'valid': + losses = self.loss() + fetch_list = [losses, self.network_outputs[0], self.label_input] + elif self.mode == 'test': + losses = self.loss() + fetch_list = [losses, self.network_outputs[0], self.label_input] + elif self.mode == 'infer': + fetch_list = self.network_outputs + else: + raise NotImplementedError('mode {} not implemented'.format( + self.mode)) + + return fetch_list + + def pretrain_info(self): + return None, None + + def weights_info(self): + return None + + def load_pretrain_params(self, exe, pretrain, prog): + def is_parameter(var): + return isinstance(var, paddle.framework.Parameter) + + logger.info( + "Load pretrain weights from {}, exclude fc layer.".format(pretrain)) + + print("===pretrain===", pretrain) + state_dict = paddle.static.load_program_state(pretrain) + dict_keys = list(state_dict.keys()) + # remove fc layer when pretrain, because the number of classes in final fc may not match + for name in dict_keys: + if "fc_0" in name: + del state_dict[name] + print('Delete {} from pretrained parameters. Do not load it'. + format(name)) + paddle.static.set_program_state(prog, state_dict) diff --git a/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py b/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py new file mode 100644 index 000000000..3472c6b5a --- /dev/null +++ b/docs/src/applications/VideoTag/models/tsn/tsn_res_model.py @@ -0,0 +1,161 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import time +import sys +import paddle +import paddle.static as static +import math + + +class TSN_ResNet(): + def __init__(self, + layers=50, + seg_num=7, + is_training=True, + is_extractor=False): + self.layers = layers + self.seg_num = seg_num + self.is_training = is_training + self.is_extractor = is_extractor + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + conv = paddle.static.nn.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + param_attr=paddle.ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + return paddle.static.nn.batch_norm( + input=conv, + act=act, + is_test=(not self.is_training), + param_attr=paddle.ParamAttr(name=bn_name + "_scale"), + bias_attr=paddle.ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + '_variance') + + def shortcut(self, input, ch_out, stride, name): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride, name=name) + else: + return input + + def bottleneck_block(self, input, num_filters, stride, name): + conv0 = self.conv_bn_layer(input=input, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + conv1 = self.conv_bn_layer(input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + conv2 = self.conv_bn_layer(input=conv1, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + + short = self.shortcut(input, + num_filters * 4, + stride, + name=name + "_branch1") + + return paddle.add(x=short, y=conv2) + + def net(self, input, class_dim=101): + layers = self.layers + seg_num = self.seg_num + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + # reshape input + channels = input.shape[2] + short_size = input.shape[3] + input = paddle.reshape( + x=input, shape=[-1, channels, short_size, short_size]) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer(input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name='conv1') + conv = paddle.nn.functional.max_pool2d(x=conv, + kernel_size=3, + stride=2, + padding=1) + + for block in range(len(depth)): + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + name=conv_name) + + pool = paddle.nn.functional.adaptive_avg_pool2d(x=conv, 
output_size=1) + + feature = paddle.reshape(x=pool, + shape=[-1, seg_num, pool.shape[1]]) + if self.is_extractor: + out = feature + else: + out = paddle.mean(x=feature, axis=1) + + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = static.nn.fc( + x=out, + size=class_dim, + activation='softmax', + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Uniform(low=-stdv, high=stdv))) + return out diff --git a/docs/src/applications/VideoTag/models/utils.py b/docs/src/applications/VideoTag/models/utils.py new file mode 100644 index 000000000..348079428 --- /dev/null +++ b/docs/src/applications/VideoTag/models/utils.py @@ -0,0 +1,47 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import wget +import tarfile + +__all__ = ['decompress', 'download', 'AttrDict'] + + +def decompress(path): + t = tarfile.open(path) + t.extractall(path=os.path.split(path)[0]) + t.close() + os.remove(path) + + +def download(url, path): + weight_dir = os.path.split(path)[0] + if not os.path.exists(weight_dir): + os.makedirs(weight_dir) + + path = path + ".tar.gz" + wget.download(url, path) + decompress(path) + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value diff --git a/docs/src/applications/VideoTag/predict.py b/docs/src/applications/VideoTag/predict.py new file mode 100644 index 000000000..1e26c6782 --- /dev/null +++ b/docs/src/applications/VideoTag/predict.py @@ -0,0 +1,171 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
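At the end of `TSN_ResNet.net()` above, the pooled per-frame features are reshaped back into segments and either returned as-is (`is_extractor=True`, the VideoTag feature-extraction path) or averaged over segments before the final softmax fc. A NumPy sketch of that segment consensus, with illustrative sizes:

```
import numpy as np

N, seg_num, F = 2, 7, 2048    # batch, segments, backbone feature dim (illustrative)
rng = np.random.default_rng(0)
frame_feat = rng.standard_normal((N * seg_num, F))   # pooled features, frames flattened into the batch

segment_feat = frame_feat.reshape(N, seg_num, F)     # what is_extractor=True hands back
video_feat = segment_feat.mean(axis=1)               # segment average fed to the classification fc
print(segment_feat.shape, video_feat.shape)          # (2, 7, 2048) (2, 2048)
```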
+ +import os +import sys +import time +import logging +import argparse +import ast +import numpy as np +import paddle +import paddle.static as static +try: + import cPickle as pickle +except: + import pickle + +from utils.config_utils import * +import models +from reader import get_reader +from metrics import get_metrics +from utils.utility import check_cuda +from utils.utility import check_version + +logging.root.handlers = [] +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.DEBUG, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_name', + type=str, + default='AttentionCluster', + help='name of model to train.') + parser.add_argument('--config', + type=str, + default='configs/attention_cluster.txt', + help='path to config file of model') + parser.add_argument('--use_gpu', + type=ast.literal_eval, + default=True, + help='default use gpu.') + parser.add_argument( + '--weights', + type=str, + default='./data/checkpoints/AttentionLSTM_epoch9.pdparams', + help='weight path.') + parser.add_argument('--batch_size', + type=int, + default=1, + help='sample number in a batch for inference.') + parser.add_argument('--filelist', + type=str, + default=None, + help='path to inferenece data file lists file.') + parser.add_argument('--log_interval', + type=int, + default=1, + help='mini-batch interval to log.') + parser.add_argument('--infer_topk', + type=int, + default=20, + help='topk predictions to restore.') + parser.add_argument('--save_dir', + type=str, + default=os.path.join('data', 'predict_results', + 'attention_lstm'), + help='directory to store results') + parser.add_argument('--video_path', + type=str, + default=None, + help='directory to store results') + parser.add_argument('--label_file', + type=str, + default='label_3396.txt', + help='chinese label file path') + args = parser.parse_args() + return args + + +def infer(args): + # parse config + config = parse_config(args.config) + infer_config = merge_configs(config, 'infer', vars(args)) + print_configs(infer_config, "Infer") + infer_model = models.get_model(args.model_name, infer_config, mode='infer') + infer_model.build_input(use_dataloader=False) + infer_model.build_model() + infer_feeds = infer_model.feeds() + infer_outputs = infer_model.outputs() + + place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + + exe.run(static.default_startup_program()) + + filelist = args.filelist or infer_config.INFER.filelist + filepath = args.video_path or infer_config.INFER.get('filepath', '') + if filepath != '': + assert os.path.exists(filepath), "{} not exist.".format(filepath) + else: + assert os.path.exists(filelist), "{} not exist.".format(filelist) + + # get infer reader + infer_reader = get_reader(args.model_name.upper(), 'infer', infer_config) + + if args.weights: + assert os.path.exists( + args.weights), "Given weight dir {} not exist.".format(args.weights) + # if no weight files specified, download weights from paddle + weights = args.weights or infer_model.get_weights() + + infer_model.load_test_weights(exe, weights, static.default_main_program()) + + infer_feeder = paddle.fluid.DataFeeder(place=place, feed_list=infer_feeds) + fetch_list = infer_model.fetches() + + infer_metrics = get_metrics(args.model_name.upper(), 'infer', infer_config) + infer_metrics.reset() + + periods = [] + cur_time = time.time() + for infer_iter, data in 
enumerate(infer_reader()): + data_feed_in = [items[:-1] for items in data] + video_id = [items[-1] for items in data] + infer_outs = exe.run(fetch_list=fetch_list, + feed=infer_feeder.feed(data_feed_in)) + infer_result_list = [item for item in infer_outs] + [video_id] + + prev_time = cur_time + cur_time = time.time() + period = cur_time - prev_time + periods.append(period) + + infer_metrics.accumulate(infer_result_list) + + if args.log_interval > 0 and infer_iter % args.log_interval == 0: + logger.info('Processed {} samples'.format( + (infer_iter + 1) * len(video_id))) + + logger.info('[INFER] infer finished. average time: {}'.format( + np.mean(periods))) + + if not os.path.isdir(args.save_dir): + os.makedirs(args.save_dir) + + infer_metrics.finalize_and_log_out(savedir=args.save_dir, + label_file=args.label_file) + + +if __name__ == "__main__": + args = parse_args() + # check whether the installed paddle is compiled with GPU + check_cuda(args.use_gpu) + check_version() + logger.info(args) + + infer(args) diff --git a/docs/src/applications/VideoTag/reader/__init__.py b/docs/src/applications/VideoTag/reader/__init__.py new file mode 100644 index 000000000..3d814a62d --- /dev/null +++ b/docs/src/applications/VideoTag/reader/__init__.py @@ -0,0 +1,7 @@ +from .reader_utils import regist_reader, get_reader +from .feature_reader import FeatureReader +from .kinetics_reader import KineticsReader + +# regist reader, sort by alphabet +regist_reader("ATTENTIONLSTM", FeatureReader) +regist_reader("TSN", KineticsReader) diff --git a/docs/src/applications/VideoTag/reader/feature_reader.py b/docs/src/applications/VideoTag/reader/feature_reader.py new file mode 100644 index 000000000..3be921f6d --- /dev/null +++ b/docs/src/applications/VideoTag/reader/feature_reader.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
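`predict.py` above leaves the final report to `infer_metrics.finalize_and_log_out(savedir=..., label_file=...)`, whose implementation is not part of this hunk. As a purely hypothetical illustration of the kind of top-k post-processing that `--infer_topk` and `--label_file` imply, assuming `probs` is one video's sigmoid output and `label_list` holds one tag per line of the label file:

```
import numpy as np

num_classes, infer_topk = 3396, 20
rng = np.random.default_rng(0)
probs = rng.random(num_classes)                 # stand-in for one row of the model's sigmoid output

# label_list would normally be read from --label_file (one tag per line), e.g.:
# label_list = [line.strip() for line in open("label_3396.txt", encoding="utf8")]
label_list = ["tag_{}".format(i) for i in range(num_classes)]   # placeholder tags

top_idx = np.argsort(probs)[::-1][:infer_topk]  # indices of the k highest scores
results = [(label_list[i], float(probs[i])) for i in top_idx]
print(results[:3])
```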
+ +import sys +from .reader_utils import DataReader +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO +import numpy as np +import random + +python_ver = sys.version_info + + +class FeatureReader(DataReader): + """ + Data reader for youtube-8M dataset, which was stored as features extracted by prior networks + This is for the three models: lstm + + dataset cfg: num_classes + batch_size + list + """ + def __init__(self, name, mode, cfg): + self.name = name + self.mode = mode + self.num_classes = cfg.MODEL.num_classes + + # set batch size and file list + self.batch_size = cfg[mode.upper()]['batch_size'] + self.filelist = cfg[mode.upper()]['filelist'] + self.seg_num = cfg.MODEL.get('seg_num', None) + + def create_reader(self): + fl = open(self.filelist).readlines() + fl = [line.strip() for line in fl if line.strip() != ''] + if self.mode == 'train': + random.shuffle(fl) + + def reader(): + batch_out = [] + for item in fl: + fileinfo = item.split(' ') + filepath = fileinfo[0] + rgb = np.load(filepath, allow_pickle=True) + nframes = rgb.shape[0] + label = [int(i) for i in fileinfo[1:]] + one_hot_label = make_one_hot(label, self.num_classes) + + if self.mode != 'infer': + batch_out.append((rgb, one_hot_label)) + else: + batch_out.append((rgb, filepath.split('/')[-1])) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + + return reader + + +def make_one_hot(label, dim=3862): + one_hot_label = np.zeros(dim) + one_hot_label = one_hot_label.astype(float) + for ind in label: + one_hot_label[int(ind)] = 1 + return one_hot_label diff --git a/docs/src/applications/VideoTag/reader/kinetics_reader.py b/docs/src/applications/VideoTag/reader/kinetics_reader.py new file mode 100644 index 000000000..bb3f47ae0 --- /dev/null +++ b/docs/src/applications/VideoTag/reader/kinetics_reader.py @@ -0,0 +1,367 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import cv2 +import math +import random +import functools +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO +import numpy as np +import paddle + +from PIL import Image, ImageEnhance +import logging + +from .reader_utils import DataReader + +logger = logging.getLogger(__name__) +python_ver = sys.version_info + + +class VideoRecord(object): + ''' + define a class method which used to describe the frames information of videos + 1. self._data[0] is the frames' path + 2. self._data[1] is the number of frames + 3. self._data[2] is the label of frames + ''' + def __init__(self, row): + self._data = row + + @property + def path(self): + return self._data[0] + + @property + def num_frames(self): + return int(self._data[1]) + + @property + def label(self): + return int(self._data[2]) + + +class KineticsReader(DataReader): + """ + Data reader for kinetics dataset of two format mp4 and pkl. + 1. 
mp4, the original format of kinetics400 + 2. pkl, the mp4 was decoded previously and stored as pkl + In both case, load the data, and then get the frame data in the form of numpy and label as an integer. + dataset cfg: format + num_classes + seg_num + short_size + target_size + num_reader_threads + buf_size + image_mean + image_std + batch_size + list + """ + def __init__(self, name, mode, cfg): + super(KineticsReader, self).__init__(name, mode, cfg) + self.format = cfg.MODEL.format + self.num_classes = self.get_config_from_sec('model', 'num_classes') + self.seg_num = self.get_config_from_sec('model', 'seg_num') + self.seglen = self.get_config_from_sec('model', 'seglen') + + self.seg_num = self.get_config_from_sec(mode, 'seg_num', self.seg_num) + self.short_size = self.get_config_from_sec(mode, 'short_size') + self.target_size = self.get_config_from_sec(mode, 'target_size') + self.num_reader_threads = self.get_config_from_sec( + mode, 'num_reader_threads') + self.buf_size = self.get_config_from_sec(mode, 'buf_size') + self.fix_random_seed = self.get_config_from_sec(mode, 'fix_random_seed') + + self.img_mean = np.array(cfg.MODEL.image_mean).reshape( + [3, 1, 1]).astype(np.float32) + self.img_std = np.array(cfg.MODEL.image_std).reshape([3, 1, 1]).astype( + np.float32) + # set batch size and file list + self.batch_size = cfg[mode.upper()]['batch_size'] + self.filelist = cfg[mode.upper()]['filelist'] + + if self.fix_random_seed: + random.seed(0) + np.random.seed(0) + self.num_reader_threads = 1 + + def create_reader(self): + assert os.path.exists(self.filelist), \ + '{} not exist, please check the data list'.format(self.filelist) + _reader = self._reader_creator(self.filelist, self.mode, seg_num=self.seg_num, seglen = self.seglen, \ + short_size = self.short_size, target_size = self.target_size, \ + img_mean = self.img_mean, img_std = self.img_std, \ + shuffle = (self.mode == 'train'), \ + num_threads = self.num_reader_threads, \ + buf_size = self.buf_size, format = self.format) + + def _batch_reader(): + batch_out = [] + for imgs, label in _reader(): + if imgs is None: + continue + batch_out.append((imgs, label)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + + return _batch_reader + + def _reader_creator(self, + file_list, + mode, + seg_num, + seglen, + short_size, + target_size, + img_mean, + img_std, + shuffle=False, + num_threads=1, + buf_size=1024, + format='frames'): + def decode_mp4(sample, mode, seg_num, seglen, short_size, target_size, + img_mean, img_std): + sample = sample[0].split(' ') + mp4_path = sample[0] + if mode == "infer": + label = mp4_path.split('/')[-1] + else: + label = int(sample[1]) + try: + imgs = mp4_loader(mp4_path, seg_num, seglen, mode) + if len(imgs) < 1: + logger.error('{} frame length {} less than 1.'.format( + mp4_path, len(imgs))) + return None, None + except: + logger.error('Error when loading {}'.format(mp4_path)) + return None, None + + return imgs_transform(imgs, mode, seg_num, seglen, \ + short_size, target_size, img_mean, img_std, name = self.name), label + + def decode_frames(sample, mode, seg_num, seglen, short_size, + target_size, img_mean, img_std): + recode = VideoRecord(sample[0].split(' ')) + frames_dir_path = recode.path + if mode == "infer": + label = frames_dir_path + else: + label = recode.label + + try: + imgs = frames_loader(recode, seg_num, seglen, mode) + if len(imgs) < 1: + logger.error('{} frame length {} less than 1.'.format( + frames_dir_path, len(imgs))) + return None, None + except: + 
logger.error('Error when loading {}'.format(frames_dir_path)) + return None, None + + return imgs_transform(imgs, + mode, + seg_num, + seglen, + short_size, + target_size, + img_mean, + img_std, + name=self.name), label + + def reader_(): + with open(file_list) as flist: + lines = [line.strip() for line in flist] + if shuffle: + random.shuffle(lines) + for line in lines: + file_path = line.strip() + yield [file_path] + + if format == 'frames': + decode_func = decode_frames + elif format == 'video': + decode_func = decode_mp4 + else: + raise ("Not implemented format {}".format(format)) + + mapper = functools.partial(decode_func, + mode=mode, + seg_num=seg_num, + seglen=seglen, + short_size=short_size, + target_size=target_size, + img_mean=img_mean, + img_std=img_std) + + return paddle.reader.decorator.xmap_readers(mapper, + reader_, + num_threads, + buf_size, + order=True) + + +def imgs_transform(imgs, + mode, + seg_num, + seglen, + short_size, + target_size, + img_mean, + img_std, + name=''): + imgs = group_scale(imgs, short_size) + + np_imgs = np.array([np.array(img).astype('float32') for img in imgs]) #dhwc + + if mode == 'train': + np_imgs = group_crop(np_imgs, target_size) + np_imgs = group_random_flip(np_imgs) + else: + np_imgs = group_crop(np_imgs, target_size, is_center=True) + + np_imgs = np_imgs.transpose(0, 3, 1, 2) / 255 #dchw + np_imgs -= img_mean + np_imgs /= img_std + + return np_imgs + + +def group_crop(np_imgs, target_size, is_center=True): + d, h, w, c = np_imgs.shape + th, tw = target_size, target_size + assert (w >= target_size) and (h >= target_size), \ + "image width({}) and height({}) should be larger than crop size".format(w, h, target_size) + + if is_center: + h_off = int(round((h - th) / 2.)) + w_off = int(round((w - tw) / 2.)) + else: + w_off = random.randint(0, w - tw) + h_off = random.randint(0, h - th) + + img_crop = np_imgs[:, h_off:h_off + target_size, + w_off:w_off + target_size, :] + return img_crop + + +def group_random_flip(np_imgs): + prob = random.random() + if prob < 0.5: + ret = np_imgs[:, :, ::-1, :] + return ret + else: + return np_imgs + + +def group_scale(imgs, target_size): + resized_imgs = [] + for i in range(len(imgs)): + img = imgs[i] + w, h = img.size + if (w <= h and w == target_size) or (h <= w and h == target_size): + resized_imgs.append(img) + continue + + if w < h: + ow = target_size + oh = int(target_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + else: + oh = target_size + ow = int(target_size * 4.0 / 3.0) + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + + return resized_imgs + + +def mp4_loader(filepath, nsample, seglen, mode): + cap = cv2.VideoCapture(filepath) + videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + sampledFrames = [] + for i in range(videolen): + ret, frame = cap.read() + # maybe first frame is empty + if ret == False: + continue + img = frame[:, :, ::-1] + sampledFrames.append(img) + average_dur = int(len(sampledFrames) / nsample) + imgs = [] + for i in range(nsample): + idx = 0 + if mode == 'train': + if average_dur >= seglen: + idx = random.randint(0, average_dur - seglen) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= seglen: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + + for jj in range(idx, idx + seglen): + imgbuf = sampledFrames[int(jj % len(sampledFrames))] + img = Image.fromarray(imgbuf, mode='RGB') + 
imgs.append(img) + + return imgs + + +def frames_loader(recode, nsample, seglen, mode): + imgpath, num_frames = recode.path, recode.num_frames + average_dur = int(num_frames / nsample) + imgs = [] + for i in range(nsample): + idx = 0 + if mode == 'train': + if average_dur >= seglen: + idx = random.randint(0, average_dur - seglen) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= seglen: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + + for jj in range(idx, idx + seglen): + img = Image.open( + os.path.join(imgpath, + 'img_{:05d}.jpg'.format(jj + 1))).convert('RGB') + imgs.append(img) + return imgs diff --git a/docs/src/applications/VideoTag/reader/reader_utils.py b/docs/src/applications/VideoTag/reader/reader_utils.py new file mode 100644 index 000000000..f6a9ef5eb --- /dev/null +++ b/docs/src/applications/VideoTag/reader/reader_utils.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import pickle +import cv2 +import numpy as np +import random + + +class ReaderNotFoundError(Exception): + "Error: reader not found" + + def __init__(self, reader_name, avail_readers): + super(ReaderNotFoundError, self).__init__() + self.reader_name = reader_name + self.avail_readers = avail_readers + + def __str__(self): + msg = "Reader {} Not Found.\nAvailiable readers:\n".format( + self.reader_name) + for reader in self.avail_readers: + msg += " {}\n".format(reader) + return msg + + +class DataReader(object): + """data reader for video input""" + def __init__(self, model_name, mode, cfg): + self.name = model_name + self.mode = mode + self.cfg = cfg + + def create_reader(self): + """Not implemented""" + pass + + def get_config_from_sec(self, sec, item, default=None): + if sec.upper() not in self.cfg: + return default + return self.cfg[sec.upper()].get(item, default) + + +class ReaderZoo(object): + def __init__(self): + self.reader_zoo = {} + + def regist(self, name, reader): + assert reader.__base__ == DataReader, "Unknow model type {}".format( + type(reader)) + self.reader_zoo[name] = reader + + def get(self, name, mode, cfg): + for k, v in self.reader_zoo.items(): + if k == name: + return v(name, mode, cfg) + raise ReaderNotFoundError(name, self.reader_zoo.keys()) + + +# singleton reader_zoo +reader_zoo = ReaderZoo() + + +def regist_reader(name, reader): + reader_zoo.regist(name, reader) + + +def get_reader(name, mode, cfg): + reader_model = reader_zoo.get(name, mode, cfg) + return reader_model.create_reader() diff --git a/docs/src/applications/VideoTag/train.py b/docs/src/applications/VideoTag/train.py new file mode 100644 index 000000000..ec5d91ba4 --- /dev/null +++ b/docs/src/applications/VideoTag/train.py @@ -0,0 +1,212 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import argparse +import ast +import logging +import paddle +import paddle.static as static + +from utils.train_utils import train_with_dataloader +import models +from utils.config_utils import * +from reader import get_reader +from metrics import get_metrics +from utils.utility import check_cuda +from utils.utility import check_version + +logging.root.handlers = [] +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser("Paddle Video train script") + parser.add_argument('--model_name', + type=str, + default='AttentionCluster', + help='name of model to train.') + parser.add_argument('--config', + type=str, + default='configs/attention_cluster.txt', + help='path to config file of model') + parser.add_argument( + '--batch_size', + type=int, + default=None, + help='training batch size. None to use config file setting.') + parser.add_argument( + '--learning_rate', + type=float, + default=None, + help='learning rate use for training. None to use config file setting.') + parser.add_argument('--pretrain', + type=str, + default=None, + help='path to pretrain weights.') + parser.add_argument('--use_gpu', + type=ast.literal_eval, + default=True, + help='default use gpu.') + parser.add_argument('--no_memory_optimize', + action='store_true', + default=False, + help='whether to use memory optimize in train') + parser.add_argument('--epoch', + type=int, + default=None, + help='epoch number, 0 for read from config file') + parser.add_argument('--valid_interval', + type=int, + default=1, + help='validation epoch interval, 0 for no validation.') + parser.add_argument('--save_dir', + type=str, + default=os.path.join('data', 'checkpoints'), + help='directory name to save train snapshoot') + parser.add_argument('--log_interval', + type=int, + default=1, + help='mini-batch interval to log.') + parser.add_argument('--fix_random_seed', + type=ast.literal_eval, + default=False, + help='If set True, enable continuous evaluation job.') + args = parser.parse_args() + return args + + +def train(args): + # parse config + config = parse_config(args.config) + train_config = merge_configs(config, 'train', vars(args)) + valid_config = merge_configs(config, 'valid', vars(args)) + print_configs(train_config, 'Train') + train_model = models.get_model(args.model_name, train_config, mode='train') + valid_model = models.get_model(args.model_name, valid_config, mode='valid') + + # build model + startup = static.Program() + train_prog = static.Program() + if args.fix_random_seed: + startup.random_seed = 1000 + train_prog.random_seed = 1000 + with static.program_guard(train_prog, startup): + with paddle.utils.unique_name.guard(): + train_model.build_input(use_dataloader=True) + train_model.build_model() + # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label + 
train_feeds = train_model.feeds() + train_fetch_list = train_model.fetches() + train_loss = train_fetch_list[0] + optimizer = train_model.optimizer() + optimizer.minimize(train_loss) + train_dataloader = train_model.dataloader() + + valid_prog = static.Program() + with static.program_guard(valid_prog, startup): + with paddle.utils.unique_name.guard(): + valid_model.build_input(use_dataloader=True) + valid_model.build_model() + valid_feeds = valid_model.feeds() + valid_fetch_list = valid_model.fetches() + valid_dataloader = valid_model.dataloader() + + place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + exe.run(startup) + + if args.pretrain: + train_model.load_pretrain_params(exe, args.pretrain, train_prog) + + build_strategy = static.BuildStrategy() + build_strategy.enable_inplace = True + + exec_strategy = static.ExecutionStrategy() + + compiled_train_prog = static.CompiledProgram( + train_prog).with_data_parallel(loss_name=train_loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + compiled_valid_prog = static.CompiledProgram( + valid_prog).with_data_parallel(share_vars_from=compiled_train_prog, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + # get reader + bs_denominator = 1 + if args.use_gpu: + # check number of GPUs + gpus = os.getenv("CUDA_VISIBLE_DEVICES", "") + if gpus == "": + pass + else: + gpus = gpus.split(",") + num_gpus = len(gpus) + assert num_gpus == train_config.TRAIN.num_gpus, \ + "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \ + "shoud be the same as that " \ + "set in {}({})".format( + num_gpus, args.config, train_config.TRAIN.num_gpus) + bs_denominator = train_config.TRAIN.num_gpus + + train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size / + bs_denominator) + valid_config.VALID.batch_size = int(valid_config.VALID.batch_size / + bs_denominator) + train_reader = get_reader(args.model_name.upper(), 'train', train_config) + valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config) + + # get metrics + train_metrics = get_metrics(args.model_name.upper(), 'train', train_config) + valid_metrics = get_metrics(args.model_name.upper(), 'valid', valid_config) + + epochs = args.epoch or train_model.epoch_num() + + exe_places = static.cuda_places() if args.use_gpu else static.cpu_places() + train_dataloader.set_sample_list_generator(train_reader, places=exe_places) + valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places) + + train_with_dataloader(exe, + train_prog, + compiled_train_prog, + train_dataloader, + train_fetch_list, + train_metrics, + epochs=epochs, + log_interval=args.log_interval, + valid_interval=args.valid_interval, + save_dir=args.save_dir, + save_model_name=args.model_name, + fix_random_seed=args.fix_random_seed, + compiled_test_prog=compiled_valid_prog, + test_dataloader=valid_dataloader, + test_fetch_list=valid_fetch_list, + test_metrics=valid_metrics) + + +if __name__ == "__main__": + args = parse_args() + # check whether the installed paddle is compiled with GPU + check_cuda(args.use_gpu) + check_version() + logger.info(args) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + train(args) diff --git a/docs/src/applications/VideoTag/tsn_extractor.py b/docs/src/applications/VideoTag/tsn_extractor.py new file mode 100644 index 000000000..ae5bd0943 --- /dev/null +++ b/docs/src/applications/VideoTag/tsn_extractor.py @@ -0,0 +1,158 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import time +import logging +import argparse +import ast +import numpy as np +import paddle +import paddle.static as static +try: + import cPickle as pickle +except: + import pickle + +from utils.config_utils import * +import models +from reader import get_reader +from metrics import get_metrics +from utils.utility import check_cuda +from utils.utility import check_version + +logging.root.handlers = [] +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.DEBUG, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_name', + type=str, + default='AttentionCluster', + help='name of model to train.') + parser.add_argument('--config', + type=str, + default='configs/attention_cluster.txt', + help='path to config file of model') + parser.add_argument('--use_gpu', + type=ast.literal_eval, + default=True, + help='default use gpu.') + parser.add_argument( + '--weights', + type=str, + default=None, + help= + 'weight path, None to automatically download weights provided by Paddle.' + ) + parser.add_argument('--batch_size', + type=int, + default=1, + help='sample number in a batch for inference.') + parser.add_argument('--filelist', + type=str, + default='./data/TsnExtractor.list', + help='path to inferenece data file lists file.') + parser.add_argument('--log_interval', + type=int, + default=1, + help='mini-batch interval to log.') + parser.add_argument('--infer_topk', + type=int, + default=20, + help='topk predictions to restore.') + parser.add_argument('--save_dir', + type=str, + default=os.path.join('data', 'tsn_features'), + help='directory to store tsn feature results') + parser.add_argument('--video_path', + type=str, + default=None, + help='directory to store results') + args = parser.parse_args() + return args + + +def infer(args): + # parse config + config = parse_config(args.config) + infer_config = merge_configs(config, 'infer', vars(args)) + print_configs(infer_config, "Infer") + infer_model = models.get_model(args.model_name, + infer_config, + mode='infer', + is_videotag=True) + infer_model.build_input(use_dataloader=False) + infer_model.build_model() + infer_feeds = infer_model.feeds() + infer_outputs = infer_model.outputs() + + place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + + exe.run(static.default_startup_program()) + + filelist = args.filelist or infer_config.INFER.filelist + filepath = args.video_path or infer_config.INFER.get('filepath', '') + if filepath != '': + assert os.path.exists(filepath), "{} not exist.".format(filepath) + else: + assert os.path.exists(filelist), "{} not exist.".format(filelist) + + # get infer reader + infer_reader = get_reader(args.model_name.upper(), 'infer', infer_config) + + if args.weights: + assert os.path.exists( + args.weights), "Given weight dir {} not exist.".format(args.weights) 
+ # if no weight files specified, download weights from paddle + weights = args.weights or infer_model.get_weights() + + infer_model.load_test_weights(exe, weights, static.default_main_program()) + + infer_feeder = paddle.fluid.DataFeeder(place=place, feed_list=infer_feeds) + fetch_list = infer_model.fetches() + + infer_metrics = get_metrics(args.model_name.upper(), 'infer', infer_config) + infer_metrics.reset() + + if not os.path.isdir(args.save_dir): + os.makedirs(args.save_dir) + + for infer_iter, data in enumerate(infer_reader()): + data_feed_in = [items[:-1] for items in data] + video_id = [items[-1] for items in data] + bs = len(video_id) + feature_outs = exe.run(fetch_list=fetch_list, + feed=infer_feeder.feed(data_feed_in)) + for i in range(bs): + filename = video_id[i].split('/')[-1][:-4] + np.save(os.path.join(args.save_dir, filename + '.npy'), + feature_outs[0][i]) #shape: seg_num*feature_dim + + logger.info("Feature extraction End~") + + +if __name__ == "__main__": + args = parse_args() + # check whether the installed paddle is compiled with GPU + check_cuda(args.use_gpu) + check_version() + logger.info(args) + + infer(args) diff --git a/docs/src/applications/VideoTag/utils/config_utils.py b/docs/src/applications/VideoTag/utils/config_utils.py new file mode 100644 index 000000000..647e541d6 --- /dev/null +++ b/docs/src/applications/VideoTag/utils/config_utils.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
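+# Small helpers around the YAML configs used by VideoTag: parse_config() loads a
+# config file into an AttrDict, merge_configs() overrides one section (train /
+# valid / test / infer) with any non-None command-line arguments, and
+# print_configs() logs the merged result. A typical call sequence, mirroring how
+# the extractor/predictor scripts in this application use these helpers, is:
+#
+#     cfg = parse_config(args.config)
+#     infer_cfg = merge_configs(cfg, 'infer', vars(args))
+#     print_configs(infer_cfg, 'Infer')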
+ +import yaml +from .utility import AttrDict +import logging +logger = logging.getLogger(__name__) + +CONFIG_SECS = [ + 'train', + 'valid', + 'test', + 'infer', +] + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + import yaml + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.Loader)) + create_attr_dict(yaml_config) + return yaml_config + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + return + + +def merge_configs(cfg, sec, args_dict): + assert sec in CONFIG_SECS, "invalid config section {}".format(sec) + sec_dict = getattr(cfg, sec.upper()) + for k, v in args_dict.items(): + if v is None: + continue + try: + if hasattr(sec_dict, k): + setattr(sec_dict, k, v) + except: + pass + return cfg + + +def print_configs(cfg, mode): + logger.info( + "---------------- {:>5} Arguments ----------------".format(mode)) + for sec, sec_items in cfg.items(): + logger.info("{}:".format(sec)) + for k, v in sec_items.items(): + logger.info(" {}:{}".format(k, v)) + logger.info("-------------------------------------------------") diff --git a/docs/src/applications/VideoTag/utils/train_utils.py b/docs/src/applications/VideoTag/utils/train_utils.py new file mode 100644 index 000000000..dc90ecf83 --- /dev/null +++ b/docs/src/applications/VideoTag/utils/train_utils.py @@ -0,0 +1,161 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time +import numpy as np +import paddle +import paddle.static as static +import paddle.profiler as profiler +import logging +import shutil + +logger = logging.getLogger(__name__) + + +def log_lr_and_step(): + try: + # In optimizers, if learning_rate is set as constant, lr_var + # name is 'learning_rate_0', and iteration counter is not + # recorded. 
If learning_rate is set as decayed values from + # learning_rate_scheduler, lr_var name is 'learning_rate', + # and iteration counter is recorded with name '@LR_DECAY_COUNTER@', + # better impliment is required here + lr_var = static.global_scope().find_var("learning_rate") + if not lr_var: + lr_var = static.global_scope().find_var("learning_rate_0") + lr = np.array(lr_var.get_tensor()) + + lr_count = '[-]' + lr_count_var = static.global_scope().find_var("@LR_DECAY_COUNTER@") + if lr_count_var: + lr_count = np.array(lr_count_var.get_tensor()) + logger.info( + "------- learning rate {}, learning rate counter {} -----".format( + np.array(lr), np.array(lr_count))) + except: + logger.warn("Unable to get learning_rate and LR_DECAY_COUNTER.") + + +def test_with_dataloader(exe, + compiled_test_prog, + test_dataloader, + test_fetch_list, + test_metrics, + log_interval=0, + save_model_name=''): + if not test_dataloader: + logger.error("[TEST] get dataloader failed.") + test_metrics.reset() + test_iter = 0 + + for data in test_dataloader(): + test_outs = exe.run(compiled_test_prog, + fetch_list=test_fetch_list, + feed=data) + test_metrics.accumulate(test_outs) + if log_interval > 0 and test_iter % log_interval == 0: + test_metrics.calculate_and_log_out(test_outs, \ + info = '[TEST] test_iter {} '.format(test_iter)) + test_iter += 1 + test_metrics.finalize_and_log_out("[TEST] Finish") + + +def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \ + train_fetch_list, train_metrics, epochs = 10, \ + log_interval = 0, valid_interval = 0, save_dir = './', \ + num_trainers = 1, trainer_id = 0, \ + save_model_name = 'model', fix_random_seed = False, \ + compiled_test_prog = None, test_dataloader = None, \ + test_fetch_list = None, test_metrics = None, \ + is_profiler = None, profiler_path = None): + if not train_dataloader: + logger.error("[TRAIN] get dataloader failed.") + epoch_periods = [] + train_loss = 0 + + # NOTE: profiler tools, used for benchmark + if is_profiler: + prof = profiler.Profiler() + for epoch in range(epochs): + log_lr_and_step() + + train_iter = 0 + epoch_periods = [] + + cur_time = time.time() + for data in train_dataloader(): + if is_profiler and train_iter == log_interval: + prof.start() + + train_outs = exe.run(compiled_train_prog, + fetch_list=train_fetch_list, + feed=data) + period = time.time() - cur_time + epoch_periods.append(period) + timeStamp = time.time() + localTime = time.localtime(timeStamp) + strTime = time.strftime("%Y-%m-%d %H:%M:%S", localTime) + if log_interval > 0 and (train_iter % log_interval == 0): + train_metrics.calculate_and_log_out(train_outs, \ + info = '[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime, epoch, train_iter, period)) + train_iter += 1 + cur_time = time.time() + + if is_profiler: + prof.step() + if train_iter == log_interval + 5: + prof.stop() + prof.export(path=profiler_path, format="json") + return + + if len(epoch_periods) < 1: + logger.info( + 'No iteration was executed, please check the data reader') + sys.exit(1) + + logger.info( + '[TRAIN] Epoch {} training finished, average time: {}'.format( + epoch, np.mean(epoch_periods[1:]))) + + if trainer_id == 0: + save_model(exe, train_prog, save_dir, save_model_name, + "_epoch{}".format(epoch)) + if compiled_test_prog and valid_interval > 0 and ( + epoch + 1) % valid_interval == 0: + test_with_dataloader(exe, compiled_test_prog, test_dataloader, + test_fetch_list, test_metrics, log_interval, + save_model_name) + + if trainer_id == 0: + save_model(exe, 
train_prog, save_dir, save_model_name) + #when fix_random seed for debug + if fix_random_seed: + cards = os.environ.get('CUDA_VISIBLE_DEVICES') + gpu_num = len(cards.split(",")) + print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, train_loss)) + print("kpis\ttrain_speed_card{}\t{}".format(gpu_num, + np.mean(epoch_periods))) + + +def save_model(exe, program, save_dir, model_name, postfix=''): + """save paramters and optimizer related varaibles""" + if not os.path.isdir(save_dir): + os.makedirs(save_dir) + saved_model_name = model_name + postfix + + paddle.static.save(program, os.path.join(save_dir, saved_model_name)) + + return diff --git a/docs/src/applications/VideoTag/utils/utility.py b/docs/src/applications/VideoTag/utils/utility.py new file mode 100644 index 000000000..1cc56517a --- /dev/null +++ b/docs/src/applications/VideoTag/utils/utility.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import signal +import logging +import paddle + +__all__ = ['AttrDict'] + +logger = logging.getLogger(__name__) + + +def _term(sig_num, addition): + print('current pid is %s, group id is %s' % (os.getpid(), os.getpgrp())) + os.killpg(os.getpgid(os.getpid()), signal.SIGKILL) + + +signal.signal(signal.SIGTERM, _term) +signal.signal(signal.SIGINT, _term) + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + +def check_cuda(use_cuda, err = \ + "\nYou can not set use_gpu = True in the model because you are using paddlepaddle-cpu.\n \ + Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_gpu = False to run models on CPU.\n" + ): + try: + if use_cuda == True and paddle.is_compiled_with_cuda() == False: + print(err) + sys.exit(1) + except Exception as e: + pass + + +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + paddle.utils.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) diff --git a/docs/src/applications/VideoTag/videotag_test.py b/docs/src/applications/VideoTag/videotag_test.py new file mode 100644 index 000000000..f41d8560c --- /dev/null +++ b/docs/src/applications/VideoTag/videotag_test.py @@ -0,0 +1,238 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import sys +import time +import logging +import argparse +import ast +import numpy as np +import paddle +import paddle.static as static + +from utils.config_utils import * +import models +from reader import get_reader +from metrics import get_metrics +from utils.utility import check_cuda +from utils.utility import check_version + +logging.root.handlers = [] +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--extractor_config', + type=str, + default='configs/tsn.yaml', + help='path to config file of model') + parser.add_argument('--extractor_name', + type=str, + default='TSN', + help='extractor model name, default TSN') + parser.add_argument('--predictor_config', + '--pconfig', + type=str, + default='configs/attention_lstm.yaml', + help='path to config file of model') + parser.add_argument( + '--predictor_name', + '--pname', + type=str, + default='AttentionLSTM', + help='predictor model name, as AttentionLSTM, AttentionCluster, NEXTVLAD' + ) + parser.add_argument('--use_gpu', + type=ast.literal_eval, + default=True, + help='default use gpu.') + parser.add_argument('--extractor_weights', + type=str, + default='weights/tsn', + help='extractor weight path') + parser.add_argument('--predictor_weights', + '--pweights', + type=str, + default='weights/attention_lstm', + help='predictor weight path') + parser.add_argument('--filelist', + type=str, + default='./data/VideoTag_test.list', + help='path of video data, multiple video') + parser.add_argument('--save_dir', + type=str, + default='data/VideoTag_results', + help='output file path') + parser.add_argument('--label_file', + type=str, + default='label_3396.txt', + help='chinese label file path') + + args = parser.parse_args() + return args + + +def main(): + """ + Video classification model of 3000 Chinese tags. + videotag_extractor_prdictor (as videotag_TSN_AttentionLSTM) + two stages in our model: + 1. extract feature from input video(mp4 format) using extractor + 2. predict classification results from extracted feature using predictor + we implement this using two name scopes, ie. extractor_scope and predictor_scope. 
+ """ + + if not os.path.isdir(args.save_dir): + os.makedirs(args.save_dir) + extractor_config = parse_config(args.extractor_config) + extractor_infer_config = merge_configs(extractor_config, 'infer', + vars(args)) + extractor_start_time = time.time() + extractor_scope = paddle.static.Scope() + with static.scope_guard(extractor_scope): + extractor_startup_prog = static.Program() + extractor_main_prog = static.Program() + with static.program_guard(extractor_main_prog, extractor_startup_prog): + paddle.disable_static() + # build model + extractor_model = models.get_model(args.extractor_name, + extractor_infer_config, + mode='infer', + is_videotag=True) + extractor_model.build_input(use_dataloader=False) + extractor_model.build_model() + extractor_feeds = extractor_model.feeds() + extractor_fetch_list = extractor_model.fetches() + + place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + + exe.run(extractor_startup_prog) + + logger.info('load extractor weights from {}'.format( + args.extractor_weights)) + + extractor_model.load_pretrain_params(exe, + args.extractor_weights, + extractor_main_prog) + + # get reader and metrics + extractor_reader = get_reader(args.extractor_name, 'infer', + extractor_infer_config) + extractor_feeder = paddle.fluid.DataFeeder(place=place, + feed_list=extractor_feeds) + + feature_list = [] + file_list = [] + for idx, data in enumerate(extractor_reader()): + file_id = [item[-1] for item in data] + feed_data = [item[:-1] for item in data] + feature_out = exe.run(fetch_list=extractor_fetch_list, + feed=extractor_feeder.feed(feed_data)) + feature_list.append(feature_out[0]) #get out from list + file_list.append(file_id) + logger.info( + '========[Stage 1 Sample {} ] Extractor finished======'. 
+ format(idx)) + paddle.enable_static() + extractor_end_time = time.time() + print('extractor_time', extractor_end_time - extractor_start_time) + + predictor_config = parse_config(args.predictor_config) + predictor_infer_config = merge_configs(predictor_config, 'infer', + vars(args)) + + # get Predictor input from Extractor output + predictor_feed_list = [] + for i in range(len(feature_list)): + feature_out = feature_list[i] + if args.predictor_name == "AttentionCluster": + extractor_seg_num = extractor_infer_config.INFER.seg_num + predictor_seg_num = predictor_infer_config.MODEL.seg_num + idxs = [] + stride = float(extractor_seg_num) / predictor_seg_num + for j in range(predictor_seg_num): + pos = (j + np.random.random()) * stride + idxs.append(min(extractor_seg_num - 1, int(pos))) + extractor_feature = feature_out[:, idxs, :].astype( + float) # get from bs dim + else: + extractor_feature = feature_out.astype(float) + predictor_feed_data = [extractor_feature] + predictor_feed_list.append((predictor_feed_data, file_list[i])) + + predictor_start_time = time.time() + predictor_scope = paddle.static.Scope() + with static.scope_guard(predictor_scope): + predictor_startup_prog = static.Program() + predictor_main_prog = static.Program() + with static.program_guard(predictor_main_prog, predictor_startup_prog): + paddle.disable_static() + # parse config + predictor_model = models.get_model(args.predictor_name, + predictor_infer_config, + mode='infer') + predictor_model.build_input(use_dataloader=False) + predictor_model.build_model() + predictor_feeds = predictor_model.feeds() + + exe.run(predictor_startup_prog) + + logger.info('load predictor weights from {}'.format( + args.predictor_weights)) + predictor_model.load_test_weights(exe, args.predictor_weights, + predictor_main_prog) + + predictor_feeder = paddle.fluid.DataFeeder(place=place, + feed_list=predictor_feeds) + predictor_fetch_list = predictor_model.fetches() + predictor_metrics = get_metrics(args.predictor_name.upper(), + 'infer', predictor_infer_config) + predictor_metrics.reset() + + for idx, data in enumerate(predictor_feed_list): + file_id = data[1] + predictor_feed_data = data[0] + final_outs = exe.run( + fetch_list=predictor_fetch_list, + feed=predictor_feeder.feed(predictor_feed_data)) + logger.info( + '=======[Stage 2 Sample {} ] Predictor finished========' + .format(idx)) + final_result_list = [item + for item in final_outs] + [file_id] + + predictor_metrics.accumulate(final_result_list) + predictor_metrics.finalize_and_log_out( + savedir=args.save_dir, label_file=args.label_file) + paddle.enable_static() + predictor_end_time = time.time() + print('predictor_time', predictor_end_time - predictor_start_time) + + +if __name__ == '__main__': + start_time = time.time() + args = parse_args() + print(args) + check_cuda(args.use_gpu) + check_version() + logger.info(args) + main() + end_time = time.time() + period = end_time - start_time + print('[INFER] infer finished. cost time: {}'.format(period)) diff --git a/docs/src/benchmark/TimeSformer/README.md b/docs/src/benchmark/TimeSformer/README.md new file mode 100644 index 000000000..d2bb20204 --- /dev/null +++ b/docs/src/benchmark/TimeSformer/README.md @@ -0,0 +1,14 @@ +执行 +```bash +bash ./run_all.sh down_data +``` +即可运行. + +run_all.sh内部的执行步骤: +1. cd 到 ../../ (也就是 PaddleVideo 目录) +2. 切换到benchmark_dev分支 +3. 安装 PaddleVideo 所需依赖 +4. cd 回PaddleVideo/data/ucf101 +5. wget下载数据集并解压缩,并下载预训练权重放到data目录下 +6. 再次cd 回到 ../../ (也就是 PaddleVideo 目录) +8. 
按照不同的参数执行 run_benchmark.sh 脚本 diff --git a/docs/src/benchmark/TimeSformer/run_all.sh b/docs/src/benchmark/TimeSformer/run_all.sh new file mode 100644 index 000000000..c7646f361 --- /dev/null +++ b/docs/src/benchmark/TimeSformer/run_all.sh @@ -0,0 +1,57 @@ +# 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.2-cudnn7 paddle=2.1.2 py=37 +# 执行目录:需说明 +sed -i '/set\ -xe/d' run_benchmark.sh +cd ../../ # cd到PaddleVideo项目根目录下 +git checkout benchmark_dev +log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # benchmark系统指定该参数,不需要跑profile时,log_path指向存speed的目录 + +# 1 安装该模型需要的依赖 (如需开启优化策略请注明) +python -m pip install -r requirements.txt + +# 2 拷贝该模型需要数据、预训练模型 +unalias cp +cp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs1.yaml configs/recognition/timesformer/ +cp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs1_mp.yaml configs/recognition/timesformer/ +cp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs14.yaml configs/recognition/timesformer/ +cp -f benchmark/TimeSformer/timesformer_ucf101_videos_benchmark_bs14_mp.yaml configs/recognition/timesformer/ +if [ ! -f "data/ucf101/trainlist_benchmark_mp.txt" ]; then + wget -P data/ucf101/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/trainlist_benchmark_mp.txt +fi +wget -P data/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams +alias cp='cp -i' + +cd data/ucf101 # 进入PaddleVideo/data/ucf101 +if [ $1 = "down_data" ];then + wget --no-check-certificate "https://www.crcv.ucf.edu/data/UCF101/UCF101.rar" # 下载训练数据 + unrar x UCF101.rar # 解压 + mv ./UCF-101 ./videos # 重命名文件夹为./videos + rm -rf ./UCF101.rar +else # 使用本地数据 + rm -rf videos + ln -s ${data_path}/dygraph_data/TSM/ucf101/videos ./videos +fi +cd ../../ # 返回PaddleVideo + +# 3 批量运行(如不方便批量,1,2需放到单个模型中) + +model_mode_list=(TimeSformer) +fp_item_list=(fp32 fp16) +bs_item_list=(1) # 14 +for model_mode in ${model_mode_list[@]}; do + for fp_item in ${fp_item_list[@]}; do + for bs_item in ${bs_item_list[@]} + do + run_mode=sp + log_name=video_${model_mode}_${run_mode}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 + echo "index is speed, 1gpus, begin, ${log_name}" + CUDA_VISIBLE_DEVICES=0 bash benchmark/${model_mode}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 + sleep 60 + + run_mode=mp + log_name=video_${model_mode}_${run_mode}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 + echo "index is speed, 8gpus, run_mode is multi_process, begin, ${log_name}" + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/${model_mode}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 + sleep 60 + done + done +done diff --git a/docs/src/benchmark/TimeSformer/run_benchmark.sh b/docs/src/benchmark/TimeSformer/run_benchmark.sh new file mode 100644 index 000000000..fd50dbde1 --- /dev/null +++ b/docs/src/benchmark/TimeSformer/run_benchmark.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +set -xe +# 运行示例:CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} +# 参数说明 +function _set_params(){ + run_mode=${1:-"sp"} # 单卡sp|多卡mp + batch_size=${2:-"1"} + fp_item=${3:-"fp32"} # fp32|fp16 + model_item=${4:-"model_item"} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 +# 添加benchmark日志解析所需参数 + base_batch_size=${batch_size} + mission_name="视频分类" + direction_id="0" + ips_unit="instance/sec" + skip_steps=10 # 解析日志,有些模型前几个step耗时长,需要跳过 
(必填) + keyword="ips:" # 解析日志,筛选出数据所在行的关键字 (必填) + index="1" + model_name=${model_item}_bs${batch_size}_${fp_item} + +# 以下不用修改 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices} +} +function _train(){ + echo "Train on ${num_gpu_devices} GPUs" + echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" + + case ${run_mode} in + sp) + if [ ${fp_item} == 'fp32' ]; then + train_cmd="python -u main.py -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}.yaml" + elif [ ${fp_item} == 'fp16' ]; then + train_cmd="python -u main.py --amp -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}.yaml" + else + echo "choose fp_item(fp32 or fp16)" + exit 1 + fi;; + mp) + rm -rf ./mylog + if [ ${fp_item} == 'fp32' ]; then + train_cmd="python -u -B -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES --log_dir=./mylog main.py \ + -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}_mp.yaml" + log_parse_file="mylog/workerlog.0" + elif [ ${fp_item} == 'fp16' ]; then + train_cmd="python -u -B -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES --log_dir=./mylog main.py --amp \ + -c configs/recognition/timesformer/timesformer_ucf101_videos_benchmark_bs${batch_size}_mp.yaml" + log_parse_file="mylog/workerlog.0" + else + echo "choose fp_item(fp32 or fp16)" + exit 1 + fi;; + *) echo "choose run_mode(sp or mp)"; exit 1; + esac +# 以下不用修改 + timeout 15m ${train_cmd} > ${log_file} 2>&1 + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + export job_fail_flag=1 + else + echo -e "${model_name}, SUCCESS" + export job_fail_flag=0 + fi + kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + + if [ $run_mode = "mp" -a -d mylog ]; then + rm ${log_file} + cp mylog/workerlog.0 ${log_file} + fi +} + +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 +_set_params $@ +# _train # 如果只想产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 + diff --git a/docs/src/data/50salads/prepare_asrf_data.py b/docs/src/data/50salads/prepare_asrf_data.py new file mode 100644 index 000000000..249faff8a --- /dev/null +++ b/docs/src/data/50salads/prepare_asrf_data.py @@ -0,0 +1,113 @@ +import argparse +import glob +import os +import sys +from typing import Dict +import numpy as np + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +dataset_names = ["50salads", "breakfast", "gtea"] + + +def get_class2id_map(dataset: str, + dataset_dir: str = "./dataset") -> Dict[str, int]: + """ + Args: + dataset: 50salads, gtea, breakfast + dataset_dir: the path to the datset directory + """ + + assert (dataset in dataset_names + ), "You have to choose 50salads, gtea or breakfast as dataset." 
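+    # mapping.txt stores one "<id> <action_name>" pair per line; it is parsed
+    # below into an {action_name: id} dict.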
+ + with open(os.path.join(dataset_dir, "{}/mapping.txt".format(dataset)), + "r") as f: + actions = f.read().split("\n")[:-1] + + class2id_map = dict() + for a in actions: + class2id_map[a.split()[1]] = int(a.split()[0]) + + return class2id_map + + +def get_arguments() -> argparse.Namespace: + """ + parse all the arguments from command line inteface + return a list of parsed arguments + """ + + parser = argparse.ArgumentParser( + description="convert ground truth txt files to numpy array") + parser.add_argument( + "--dataset_dir", + type=str, + default="./dataset", + help="path to a dataset directory (default: ./dataset)", + ) + + return parser.parse_args() + + +def main() -> None: + args = get_arguments() + + datasets = ["50salads", "gtea", "breakfast", "baseball"] + + for dataset in datasets: + # make directory for saving ground truth numpy arrays + cls_save_dir = os.path.join(args.dataset_dir, dataset, "gt_arr") + if not os.path.exists(cls_save_dir): + os.mkdir(cls_save_dir) + + # make directory for saving ground truth numpy arrays + boundary_save_dir = os.path.join(args.dataset_dir, dataset, + "gt_boundary_arr") + if not os.path.exists(boundary_save_dir): + os.mkdir(boundary_save_dir) + + # class to index mapping + class2id_map = get_class2id_map(dataset, dataset_dir=args.dataset_dir) + + gt_dir = os.path.join(args.dataset_dir, dataset, "groundTruth") + gt_paths = glob.glob(os.path.join(gt_dir, "*.txt")) + + for gt_path in gt_paths: + # the name of ground truth text file + gt_name = os.path.relpath(gt_path, gt_dir) + + with open(gt_path, "r") as f: + gt = f.read().split("\n")[:-1] + + gt_array = np.zeros(len(gt)) + for i in range(len(gt)): + gt_array[i] = class2id_map[gt[i]] + + # save array + np.save(os.path.join(cls_save_dir, gt_name[:-4] + ".npy"), gt_array) + + # the name of ground truth text file + gt_name = os.path.relpath(gt_path, gt_dir) + + with open(gt_path, "r") as f: + gt = f.read().split("\n")[:-1] + + # define the frame where new action starts as boundary frame + boundary = np.zeros(len(gt)) + last = gt[0] + boundary[0] = 1 + for i in range(1, len(gt)): + if last != gt[i]: + boundary[i] = 1 + last = gt[i] + + # save array + np.save(os.path.join(boundary_save_dir, gt_name[:-4] + ".npy"), + boundary) + + print("Done") + + +if __name__ == "__main__": + main() diff --git a/docs/src/data/50salads/transform_segmentation_label.py b/docs/src/data/50salads/transform_segmentation_label.py new file mode 100644 index 000000000..2b7c3c726 --- /dev/null +++ b/docs/src/data/50salads/transform_segmentation_label.py @@ -0,0 +1,195 @@ +import json +import numpy as np +import argparse +import os + +from tqdm import tqdm + + +def generate_mapping_list_txt(action_dict, out_path): + out_txt_file_path = os.path.join(out_path, "mapping.txt") + f = open(out_txt_file_path, "w", encoding='utf-8') + for key, action_name in action_dict.items(): + str_str = str(key) + " " + action_name + "\n" + f.write(str_str) + # add None + str_str = str(len(action_dict)) + " None" + "\n" + f.write(str_str) + f.close() + + +def segmentation_convert_localization_label(prefix_data_path, out_path, + action_dict, fps): + label_path = os.path.join(prefix_data_path) + label_txt_name_list = os.listdir(label_path) + + labels_dict = {} + labels_dict["fps"] = fps + labels_list = [] + for label_name in tqdm(label_txt_name_list, desc='label convert:'): + label_dict = {} + label_dict["url"] = label_name.split(".")[0] + ".mp4" + label_txt_path = os.path.join(prefix_data_path, label_name) + + with open(label_txt_path, "r", 
encoding='utf-8') as f: + gt = f.read().split("\n")[:-1] + label_dict["total_frames"] = len(gt) + + boundary_index_list = [0] + before_action_name = gt[0] + for index in range(1, len(gt)): + if before_action_name != gt[index]: + boundary_index_list.append(index) + before_action_name = gt[index] + actions_list = [] + for index in range(len(boundary_index_list) - 1): + if gt[boundary_index_list[index]] != "None": + action_name = gt[boundary_index_list[index]] + start_sec = float(boundary_index_list[index]) / float(fps) + end_sec = float(boundary_index_list[index + 1] - 1) / float(fps) + action_id = action_dict[action_name] + label_action_dict = {} + label_action_dict["label_names"] = action_name + label_action_dict["start_id"] = start_sec + label_action_dict["end_id"] = end_sec + label_action_dict["label_ids"] = [action_id] + actions_list.append(label_action_dict) + + label_dict["actions"] = actions_list + labels_list.append(label_dict) + labels_dict["gts"] = labels_list + output_path = os.path.join(out_path, "output.json") + f = open(output_path, "w", encoding='utf-8') + f.write(json.dumps(labels_dict, indent=4)) + f.close() + + +def generate_action_dict(label): + action_dict = {} + for gt in label["gts"]: + for action in gt["actions"]: + label_id = action["label_ids"][0] + label_name = action["label_names"][0] + action_dict[label_id] = label_name + + return action_dict + + +def load_action_dict(data_path): + mapping_txt_path = os.path.join(data_path, "mapping.txt") + with open(mapping_txt_path, "r", encoding='utf-8') as f: + actions = f.read().split("\n")[:-1] + + class2id_map = dict() + for a in actions: + class2id_map[a.split()[1]] = int(a.split()[0]) + + return class2id_map + + +def localization_convert_segmentation_label(label, prefix_data_path, out_path): + path = os.path.join(out_path, "groundTruth") + isExists = os.path.exists(path) + if not isExists: + os.makedirs(path) + print(path + ' 创建成功') + else: + print(path + ' 目录已存在') + + fps = float(label["fps"]) + video_list = [] + for gt in tqdm(label["gts"], desc='label convert:'): + video_name = gt["url"].split(".")[0] + data_path = os.path.join(prefix_data_path, video_name + ".pkl") + video_list.append(video_name + ".txt") + feature = np.load(data_path, allow_pickle=True)["image_feature"] + + num_feture = feature.shape[0] + seg_label = ["None"] * (num_feture) + for action in gt["actions"]: + start_id = action["start_id"] + end_id = action["end_id"] + + label_name = action["label_names"] + + start_index = int(np.floor(start_id * fps)) + end_index = int(np.floor(end_id * fps)) + 1 + + if end_index < num_feture - 1: + seg_label[start_index:end_index] = label_name * (end_index - + start_index) + elif start_index < num_feture - 1: + seg_label[start_index:] = label_name * (num_feture - + start_index) + else: + pass + + if len(seg_label) != num_feture: + seg_label = seg_label[:num_feture] + out_txt_file_path = os.path.join(out_path, "groundTruth", + video_name + ".txt") + str = '\n' + f = open(out_txt_file_path, "w", encoding='utf-8') + f.write(str.join(seg_label) + str) + f.close() + out_txt_file_path = os.path.join(out_path, "train_list.txt") + str = '\n' + f = open(out_txt_file_path, "w", encoding='utf-8') + f.write(str.join(video_list) + str) + f.close() + + +def main(): + args = get_arguments() + + if args.mode in ["segmentation", "localization"]: + if args.mode == "segmentation": + with open(args.label_path, 'r', encoding='utf-8') as json_file: + label = json.load(json_file) + action_dict = generate_action_dict(label) + 
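+            # Note on --mode: it names the target format. "segmentation" converts the
+            # localization-style JSON at label_path into per-frame groundTruth txt files
+            # (plus mapping.txt), while "localization" converts segmentation txt labels
+            # back into an output.json with per-action start/end times.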
generate_mapping_list_txt(action_dict, args.out_path) + localization_convert_segmentation_label(label, args.data_path, + args.out_path) + + elif args.mode == "localization": + action_dict = load_action_dict(args.label_path) + segmentation_convert_localization_label(args.data_path, + args.out_path, + action_dict, + fps=25.0) + + else: + raise NotImplementedError + + +def get_arguments(): + """ + parse all the arguments from command line inteface + return a list of parsed arguments + """ + + parser = argparse.ArgumentParser( + description="convert segmentation and localization label") + parser.add_argument("label_path", type=str, help="path of a label file") + parser.add_argument( + "data_path", + type=str, + help="path of video feature or segmentation label txt.", + ) + parser.add_argument( + "out_path", + type=str, + help="path of output file.", + ) + parser.add_argument( + "--mode", + type=str, + default="segmentation", + help="Convert segmentation label or localization label.", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + main() diff --git a/docs/src/data/ntu-rgb-d/download_dataset.sh b/docs/src/data/ntu-rgb-d/download_dataset.sh new file mode 100644 index 000000000..bcdadf06d --- /dev/null +++ b/docs/src/data/ntu-rgb-d/download_dataset.sh @@ -0,0 +1,12 @@ +cd data/ntu-rgb-d + +# download +wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1CUZnBtYwifVXS21yVg62T-vrPVayso5H' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1CUZnBtYwifVXS21yVg62T-vrPVayso5H" -O nturgbd_skeletons_s001_to_s017.zip && rm -rf /tmp/cookies.txt + +unzip nturgbd_skeletons_s001_to_s017.zip && rm -rf nturgbd_skeletons_s001_to_s017.zip + +wget https://videotag.bj.bcebos.com/Data/statistics.zip + +mkdir statistics + +unzip statistics.zip -d statistics/ && rm -rf statistics.zip diff --git a/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py b/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py new file mode 100644 index 000000000..be5457bca --- /dev/null +++ b/docs/src/data/ntu-rgb-d/get_raw_denoised_data.py @@ -0,0 +1,471 @@ +# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/get_raw_denoised_data.py + +import os +import os.path as osp +import numpy as np +import pickle +import logging + +root_path = './' +raw_data_file = osp.join(root_path, 'raw_data', 'raw_skes_data.pkl') +save_path = osp.join(root_path, 'denoised_data') + +if not osp.exists(save_path): + os.mkdir(save_path) + +rgb_ske_path = osp.join(save_path, 'rgb+ske') +if not osp.exists(rgb_ske_path): + os.mkdir(rgb_ske_path) + +actors_info_dir = osp.join(save_path, 'actors_info') +if not osp.exists(actors_info_dir): + os.mkdir(actors_info_dir) + +missing_count = 0 +noise_len_thres = 11 +noise_spr_thres1 = 0.8 +noise_spr_thres2 = 0.69754 +noise_mot_thres_lo = 0.089925 +noise_mot_thres_hi = 2 + +noise_len_logger = logging.getLogger('noise_length') +noise_len_logger.setLevel(logging.INFO) +noise_len_logger.addHandler( + logging.FileHandler(osp.join(save_path, 'noise_length.log'))) +noise_len_logger.info('{:^20}\t{:^17}\t{:^8}\t{}'.format( + 'Skeleton', 'bodyID', 'Motion', 'Length')) + +noise_spr_logger = logging.getLogger('noise_spread') +noise_spr_logger.setLevel(logging.INFO) +noise_spr_logger.addHandler( + logging.FileHandler(osp.join(save_path, 'noise_spread.log'))) +noise_spr_logger.info('{:^20}\t{:^17}\t{:^8}\t{:^8}'.format( + 'Skeleton', 
'bodyID', 'Motion', 'Rate')) + +noise_mot_logger = logging.getLogger('noise_motion') +noise_mot_logger.setLevel(logging.INFO) +noise_mot_logger.addHandler( + logging.FileHandler(osp.join(save_path, 'noise_motion.log'))) +noise_mot_logger.info('{:^20}\t{:^17}\t{:^8}'.format('Skeleton', 'bodyID', + 'Motion')) + +fail_logger_1 = logging.getLogger('noise_outliers_1') +fail_logger_1.setLevel(logging.INFO) +fail_logger_1.addHandler( + logging.FileHandler(osp.join(save_path, 'denoised_failed_1.log'))) + +fail_logger_2 = logging.getLogger('noise_outliers_2') +fail_logger_2.setLevel(logging.INFO) +fail_logger_2.addHandler( + logging.FileHandler(osp.join(save_path, 'denoised_failed_2.log'))) + +missing_skes_logger = logging.getLogger('missing_frames') +missing_skes_logger.setLevel(logging.INFO) +missing_skes_logger.addHandler( + logging.FileHandler(osp.join(save_path, 'missing_skes.log'))) +missing_skes_logger.info('{:^20}\t{}\t{}'.format('Skeleton', 'num_frames', + 'num_missing')) + +missing_skes_logger1 = logging.getLogger('missing_frames_1') +missing_skes_logger1.setLevel(logging.INFO) +missing_skes_logger1.addHandler( + logging.FileHandler(osp.join(save_path, 'missing_skes_1.log'))) +missing_skes_logger1.info('{:^20}\t{}\t{}\t{}\t{}\t{}'.format( + 'Skeleton', 'num_frames', 'Actor1', 'Actor2', 'Start', 'End')) + +missing_skes_logger2 = logging.getLogger('missing_frames_2') +missing_skes_logger2.setLevel(logging.INFO) +missing_skes_logger2.addHandler( + logging.FileHandler(osp.join(save_path, 'missing_skes_2.log'))) +missing_skes_logger2.info('{:^20}\t{}\t{}\t{}'.format('Skeleton', 'num_frames', + 'Actor1', 'Actor2')) + + +def denoising_by_length(ske_name, bodies_data): + """ + Denoising data based on the frame length for each bodyID. + Filter out the bodyID which length is less or equal than the predefined threshold. + + """ + noise_info = str() + new_bodies_data = bodies_data.copy() + for (bodyID, body_data) in new_bodies_data.items(): + length = len(body_data['interval']) + if length <= noise_len_thres: + noise_info += 'Filter out: %s, %d (length).\n' % (bodyID, length) + noise_len_logger.info('{}\t{}\t{:.6f}\t{:^6d}'.format( + ske_name, bodyID, body_data['motion'], length)) + del bodies_data[bodyID] + if noise_info != '': + noise_info += '\n' + + return bodies_data, noise_info + + +def get_valid_frames_by_spread(points): + """ + Find the valid (or reasonable) frames (index) based on the spread of X and Y. + + :param points: joints or colors + """ + num_frames = points.shape[0] + valid_frames = [] + for i in range(num_frames): + x = points[i, :, 0] + y = points[i, :, 1] + if (x.max() - x.min()) <= noise_spr_thres1 * (y.max() - y.min()): # 0.8 + valid_frames.append(i) + return valid_frames + + +def denoising_by_spread(ske_name, bodies_data): + """ + Denoising data based on the spread of Y value and X value. + Filter out the bodyID which the ratio of noisy frames is higher than the predefined + threshold. + + bodies_data: contains at least 2 bodyIDs + """ + noise_info = str() + denoised_by_spr = False # mark if this sequence has been processed by spread. 
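+    # Work on a shallow copy so entries can be deleted from bodies_data while
+    # iterating.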
+ + new_bodies_data = bodies_data.copy() + # for (bodyID, body_data) in bodies_data.items(): + for (bodyID, body_data) in new_bodies_data.items(): + if len(bodies_data) == 1: + break + valid_frames = get_valid_frames_by_spread(body_data['joints'].reshape( + -1, 25, 3)) + num_frames = len(body_data['interval']) + num_noise = num_frames - len(valid_frames) + if num_noise == 0: + continue + + ratio = num_noise / float(num_frames) + motion = body_data['motion'] + if ratio >= noise_spr_thres2: # 0.69754 + del bodies_data[bodyID] + denoised_by_spr = True + noise_info += 'Filter out: %s (spread rate >= %.2f).\n' % ( + bodyID, noise_spr_thres2) + noise_spr_logger.info('%s\t%s\t%.6f\t%.6f' % + (ske_name, bodyID, motion, ratio)) + else: # Update motion + joints = body_data['joints'].reshape(-1, 25, 3)[valid_frames] + body_data['motion'] = min( + motion, np.sum(np.var(joints.reshape(-1, 3), axis=0))) + noise_info += '%s: motion %.6f -> %.6f\n' % (bodyID, motion, + body_data['motion']) + # TODO: Consider removing noisy frames for each bodyID + + if noise_info != '': + noise_info += '\n' + + return bodies_data, noise_info, denoised_by_spr + + +def denoising_by_motion(ske_name, bodies_data, bodies_motion): + """ + Filter out the bodyID which motion is out of the range of predefined interval + + """ + # Sort bodies based on the motion, return a list of tuples + # bodies_motion = sorted(bodies_motion.items(), key=lambda x, y: cmp(x[1], y[1]), reverse=True) + bodies_motion = sorted(bodies_motion.items(), + key=lambda x: x[1], + reverse=True) + + # Reserve the body data with the largest motion + denoised_bodies_data = [(bodies_motion[0][0], + bodies_data[bodies_motion[0][0]])] + noise_info = str() + + for (bodyID, motion) in bodies_motion[1:]: + if (motion < noise_mot_thres_lo) or (motion > noise_mot_thres_hi): + noise_info += 'Filter out: %s, %.6f (motion).\n' % (bodyID, motion) + noise_mot_logger.info('{}\t{}\t{:.6f}'.format( + ske_name, bodyID, motion)) + else: + denoised_bodies_data.append((bodyID, bodies_data[bodyID])) + if noise_info != '': + noise_info += '\n' + + return denoised_bodies_data, noise_info + + +def denoising_bodies_data(bodies_data): + """ + Denoising data based on some heuristic methods, not necessarily correct for all samples. + + Return: + denoised_bodies_data (list): tuple: (bodyID, body_data). + """ + ske_name = bodies_data['name'] + bodies_data = bodies_data['data'] + + # Step 1: Denoising based on frame length. + bodies_data, noise_info_len = denoising_by_length(ske_name, bodies_data) + + if len(bodies_data) == 1: # only has one bodyID left after step 1 + return bodies_data.items(), noise_info_len + + # Step 2: Denoising based on spread. 
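+    # A frame is treated as noisy when its X spread exceeds noise_spr_thres1 (0.8)
+    # times its Y spread; a body is dropped once the noisy-frame ratio reaches
+    # noise_spr_thres2 (0.69754), otherwise only its motion value is capped using
+    # the remaining valid frames.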
+ bodies_data, noise_info_spr, denoised_by_spr = denoising_by_spread( + ske_name, bodies_data) + + if len(bodies_data) == 1: + return bodies_data.items(), noise_info_len + noise_info_spr + + bodies_motion = dict() # get body motion + for (bodyID, body_data) in bodies_data.items(): + bodies_motion[bodyID] = body_data['motion'] + # Sort bodies based on the motion + # bodies_motion = sorted(bodies_motion.items(), key=lambda x, y: cmp(x[1], y[1]), reverse=True) + bodies_motion = sorted(bodies_motion.items(), + key=lambda x: x[1], + reverse=True) + denoised_bodies_data = list() + for (bodyID, _) in bodies_motion: + denoised_bodies_data.append((bodyID, bodies_data[bodyID])) + + return denoised_bodies_data, noise_info_len + noise_info_spr + + # TODO: Consider denoising further by integrating motion method + + # if denoised_by_spr: # this sequence has been denoised by spread + # bodies_motion = sorted(bodies_motion.items(), lambda x, y: cmp(x[1], y[1]), reverse=True) + # denoised_bodies_data = list() + # for (bodyID, _) in bodies_motion: + # denoised_bodies_data.append((bodyID, bodies_data[bodyID])) + # return denoised_bodies_data, noise_info + + # Step 3: Denoising based on motion + # bodies_data, noise_info = denoising_by_motion(ske_name, bodies_data, bodies_motion) + + # return bodies_data, noise_info + + +def get_one_actor_points(body_data, num_frames): + """ + Get joints and colors for only one actor. + For joints, each frame contains 75 X-Y-Z coordinates. + For colors, each frame contains 25 x 2 (X, Y) coordinates. + """ + joints = np.zeros((num_frames, 75), dtype=np.float32) + colors = np.ones((num_frames, 1, 25, 2), dtype=np.float32) * np.nan + start, end = body_data['interval'][0], body_data['interval'][-1] + joints[start:end + 1] = body_data['joints'].reshape(-1, 75) + colors[start:end + 1, 0] = body_data['colors'] + + return joints, colors + + +def remove_missing_frames(ske_name, joints, colors): + """ + Cut off missing frames which all joints positions are 0s + + For the sequence with 2 actors' data, also record the number of missing frames for + actor1 and actor2, respectively (for debug). + """ + num_frames = joints.shape[0] + num_bodies = colors.shape[1] # 1 or 2 + + if num_bodies == 2: # DEBUG + missing_indices_1 = np.where(joints[:, :75].sum(axis=1) == 0)[0] + missing_indices_2 = np.where(joints[:, 75:].sum(axis=1) == 0)[0] + cnt1 = len(missing_indices_1) + cnt2 = len(missing_indices_2) + + start = 1 if 0 in missing_indices_1 else 0 + end = 1 if num_frames - 1 in missing_indices_1 else 0 + if max(cnt1, cnt2) > 0: + if cnt1 > cnt2: + info = '{}\t{:^10d}\t{:^6d}\t{:^6d}\t{:^5d}\t{:^3d}'.format( + ske_name, num_frames, cnt1, cnt2, start, end) + missing_skes_logger1.info(info) + else: + info = '{}\t{:^10d}\t{:^6d}\t{:^6d}'.format( + ske_name, num_frames, cnt1, cnt2) + missing_skes_logger2.info(info) + + # Find valid frame indices that the data is not missing or lost + # For two-subjects action, this means both data of actor1 and actor2 is missing. 
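+    # Frames whose full joint vector sums to zero are removed from `joints`; the
+    # matching entries in `colors` are kept but set to NaN.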
+ valid_indices = np.where(joints.sum(axis=1) != 0)[0] # 0-based index + missing_indices = np.where(joints.sum(axis=1) == 0)[0] + num_missing = len(missing_indices) + + if num_missing > 0: # Update joints and colors + joints = joints[valid_indices] + colors[missing_indices] = np.nan + global missing_count + missing_count += 1 + missing_skes_logger.info('{}\t{:^10d}\t{:^11d}'.format( + ske_name, num_frames, num_missing)) + + return joints, colors + + +def get_bodies_info(bodies_data): + bodies_info = '{:^17}\t{}\t{:^8}\n'.format('bodyID', 'Interval', 'Motion') + for (bodyID, body_data) in bodies_data.items(): + start, end = body_data['interval'][0], body_data['interval'][-1] + bodies_info += '{}\t{:^8}\t{:f}\n'.format(bodyID, str([start, end]), + body_data['motion']) + + return bodies_info + '\n' + + +def get_two_actors_points(bodies_data): + """ + Get the first and second actor's joints positions and colors locations. + + # Arguments: + bodies_data (dict): 3 key-value pairs: 'name', 'data', 'num_frames'. + bodies_data['data'] is also a dict, while the key is bodyID, the value is + the corresponding body_data which is also a dict with 4 keys: + - joints: raw 3D joints positions. Shape: (num_frames x 25, 3) + - colors: raw 2D color locations. Shape: (num_frames, 25, 2) + - interval: a list which records the frame indices. + - motion: motion amount + + # Return: + joints, colors. + """ + ske_name = bodies_data['name'] + label = int(ske_name[-2:]) + num_frames = bodies_data['num_frames'] + bodies_info = get_bodies_info(bodies_data['data']) + + bodies_data, noise_info = denoising_bodies_data( + bodies_data) # Denoising data + bodies_info += noise_info + + bodies_data = list(bodies_data) + if len(bodies_data) == 1: # Only left one actor after denoising + if label >= 50: # DEBUG: Denoising failed for two-subjects action + fail_logger_2.info(ske_name) + + bodyID, body_data = bodies_data[0] + joints, colors = get_one_actor_points(body_data, num_frames) + bodies_info += 'Main actor: %s' % bodyID + else: + if label < 50: # DEBUG: Denoising failed for one-subject action + fail_logger_1.info(ske_name) + + joints = np.zeros((num_frames, 150), dtype=np.float32) + colors = np.ones((num_frames, 2, 25, 2), dtype=np.float32) * np.nan + + bodyID, actor1 = bodies_data[0] # the 1st actor with largest motion + start1, end1 = actor1['interval'][0], actor1['interval'][-1] + joints[start1:end1 + 1, :75] = actor1['joints'].reshape(-1, 75) + colors[start1:end1 + 1, 0] = actor1['colors'] + actor1_info = '{:^17}\t{}\t{:^8}\n'.format('Actor1', 'Interval', 'Motion') + \ + '{}\t{:^8}\t{:f}\n'.format(bodyID, str([start1, end1]), actor1['motion']) + del bodies_data[0] + + actor2_info = '{:^17}\t{}\t{:^8}\n'.format('Actor2', 'Interval', + 'Motion') + start2, end2 = [0, 0] # initial interval for actor2 (virtual) + + while len(bodies_data) > 0: + bodyID, actor = bodies_data[0] + start, end = actor['interval'][0], actor['interval'][-1] + if min(end1, end) - max(start1, + start) <= 0: # no overlap with actor1 + joints[start:end + 1, :75] = actor['joints'].reshape(-1, 75) + colors[start:end + 1, 0] = actor['colors'] + actor1_info += '{}\t{:^8}\t{:f}\n'.format( + bodyID, str([start, end]), actor['motion']) + # Update the interval of actor1 + start1 = min(start, start1) + end1 = max(end, end1) + elif min(end2, end) - max(start2, + start) <= 0: # no overlap with actor2 + joints[start:end + 1, 75:] = actor['joints'].reshape(-1, 75) + colors[start:end + 1, 1] = actor['colors'] + actor2_info += '{}\t{:^8}\t{:f}\n'.format( + bodyID, 
str([start, end]), actor['motion']) + # Update the interval of actor2 + start2 = min(start, start2) + end2 = max(end, end2) + del bodies_data[0] + + bodies_info += ('\n' + actor1_info + '\n' + actor2_info) + + with open(osp.join(actors_info_dir, ske_name + '.txt'), 'w') as fw: + fw.write(bodies_info + '\n') + + return joints, colors + + +def get_raw_denoised_data(): + """ + Get denoised data (joints positions and color locations) from raw skeleton sequences. + + For each frame of a skeleton sequence, an actor's 3D positions of 25 joints represented + by an 2D array (shape: 25 x 3) is reshaped into a 75-dim vector by concatenating each + 3-dim (x, y, z) coordinates along the row dimension in joint order. Each frame contains + two actor's joints positions constituting a 150-dim vector. If there is only one actor, + then the last 75 values are filled with zeros. Otherwise, select the main actor and the + second actor based on the motion amount. Each 150-dim vector as a row vector is put into + a 2D numpy array where the number of rows equals the number of valid frames. All such + 2D arrays are put into a list and finally the list is serialized into a cPickle file. + + For the skeleton sequence which contains two or more actors (mostly corresponds to the + last 11 classes), the filename and actors' information are recorded into log files. + For better understanding, also generate RGB+skeleton videos for visualization. + """ + + with open(raw_data_file, 'rb') as fr: # load raw skeletons data + raw_skes_data = pickle.load(fr) + + num_skes = len(raw_skes_data) + print('Found %d available skeleton sequences.' % num_skes) + + raw_denoised_joints = [] + raw_denoised_colors = [] + frames_cnt = [] + + for (idx, bodies_data) in enumerate(raw_skes_data): + ske_name = bodies_data['name'] + print('Processing %s' % ske_name) + num_bodies = len(bodies_data['data']) + + if num_bodies == 1: # only 1 actor + num_frames = bodies_data['num_frames'] + body_data = list(bodies_data['data'].values())[0] + joints, colors = get_one_actor_points(body_data, num_frames) + else: # more than 1 actor, select two main actors + joints, colors = get_two_actors_points(bodies_data) + # Remove missing frames + joints, colors = remove_missing_frames(ske_name, joints, colors) + num_frames = joints.shape[0] # Update + # Visualize selected actors' skeletons on RGB videos. 
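+        # Each sequence contributes a `joints` array (75 values per actor and frame)
+        # and a `colors` array (25 (x, y) locations per actor and frame); both are
+        # accumulated here and pickled once the loop finishes.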
+ + raw_denoised_joints.append(joints) + raw_denoised_colors.append(colors) + frames_cnt.append(num_frames) + + if (idx + 1) % 1000 == 0: + print('Processed: %.2f%% (%d / %d), ' % \ + (100.0 * (idx + 1) / num_skes, idx + 1, num_skes) + \ + 'Missing count: %d' % missing_count) + + raw_skes_joints_pkl = osp.join(save_path, 'raw_denoised_joints.pkl') + with open(raw_skes_joints_pkl, 'wb') as f: + pickle.dump(raw_denoised_joints, f, pickle.HIGHEST_PROTOCOL) + + raw_skes_colors_pkl = osp.join(save_path, 'raw_denoised_colors.pkl') + with open(raw_skes_colors_pkl, 'wb') as f: + pickle.dump(raw_denoised_colors, f, pickle.HIGHEST_PROTOCOL) + + frames_cnt = np.array(frames_cnt, dtype=np.int) + np.savetxt(osp.join(save_path, 'frames_cnt.txt'), frames_cnt, fmt='%d') + + print('Saved raw denoised positions of {} frames into {}'.format( + np.sum(frames_cnt), raw_skes_joints_pkl)) + print('Found %d files that have missing data' % missing_count) + + +if __name__ == '__main__': + get_raw_denoised_data() diff --git a/docs/src/data/ntu-rgb-d/get_raw_skes_data.py b/docs/src/data/ntu-rgb-d/get_raw_skes_data.py new file mode 100644 index 000000000..3cd2912e5 --- /dev/null +++ b/docs/src/data/ntu-rgb-d/get_raw_skes_data.py @@ -0,0 +1,157 @@ +# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/get_raw_skes_data.py + +import os.path as osp +import os +import numpy as np +import pickle +import logging + + +def get_raw_bodies_data(skes_path, ske_name, frames_drop_skes, + frames_drop_logger): + """ + Get raw bodies data from a skeleton sequence. + + Each body's data is a dict that contains the following keys: + - joints: raw 3D joints positions. Shape: (num_frames x 25, 3) + - colors: raw 2D color locations. Shape: (num_frames, 25, 2) + - interval: a list which stores the frame indices of this body. + - motion: motion amount (only for the sequence with 2 or more bodyIDs). + + Return: + a dict for a skeleton sequence with 3 key-value pairs: + - name: the skeleton filename. + - data: a dict which stores raw data of each body. + - num_frames: the number of valid frames. 
+ """ + ske_file = osp.join(skes_path, ske_name + '.skeleton') + assert osp.exists(ske_file), 'Error: Skeleton file %s not found' % ske_file + # Read all data from .skeleton file into a list (in string format) + print('Reading data from %s' % ske_file[-29:]) + with open(ske_file, 'r') as fr: + str_data = fr.readlines() + + num_frames = int(str_data[0].strip('\r\n')) + frames_drop = [] + bodies_data = dict() + valid_frames = -1 # 0-based index + current_line = 1 + + for f in range(num_frames): + num_bodies = int(str_data[current_line].strip('\r\n')) + current_line += 1 + + if num_bodies == 0: # no data in this frame, drop it + frames_drop.append(f) # 0-based index + continue + + valid_frames += 1 + joints = np.zeros((num_bodies, 25, 3), dtype=np.float32) + colors = np.zeros((num_bodies, 25, 2), dtype=np.float32) + + for b in range(num_bodies): + bodyID = str_data[current_line].strip('\r\n').split()[0] + current_line += 1 + num_joints = int(str_data[current_line].strip('\r\n')) # 25 joints + current_line += 1 + + for j in range(num_joints): + temp_str = str_data[current_line].strip('\r\n').split() + joints[b, j, :] = np.array(temp_str[:3], dtype=np.float32) + colors[b, j, :] = np.array(temp_str[5:7], dtype=np.float32) + current_line += 1 + + if bodyID not in bodies_data: # Add a new body's data + body_data = dict() + body_data['joints'] = joints[b] # ndarray: (25, 3) + body_data['colors'] = colors[b, + np.newaxis] # ndarray: (1, 25, 2) + body_data['interval'] = [valid_frames + ] # the index of the first frame + else: # Update an already existed body's data + body_data = bodies_data[bodyID] + # Stack each body's data of each frame along the frame order + body_data['joints'] = np.vstack( + (body_data['joints'], joints[b])) + body_data['colors'] = np.vstack( + (body_data['colors'], colors[b, np.newaxis])) + pre_frame_idx = body_data['interval'][-1] + body_data['interval'].append(pre_frame_idx + + 1) # add a new frame index + + bodies_data[bodyID] = body_data # Update bodies_data + + num_frames_drop = len(frames_drop) + assert num_frames_drop < num_frames, \ + 'Error: All frames data (%d) of %s is missing or lost' % (num_frames, ske_name) + if num_frames_drop > 0: + frames_drop_skes[ske_name] = np.array(frames_drop, dtype=np.int) + frames_drop_logger.info('{}: {} frames missed: {}\n'.format( + ske_name, num_frames_drop, frames_drop)) + + # Calculate motion (only for the sequence with 2 or more bodyIDs) + if len(bodies_data) > 1: + for body_data in bodies_data.values(): + body_data['motion'] = np.sum(np.var(body_data['joints'], axis=0)) + + return { + 'name': ske_name, + 'data': bodies_data, + 'num_frames': num_frames - num_frames_drop + } + + +def get_raw_skes_data(): + + skes_name = np.loadtxt(skes_name_file, dtype=str) + + num_files = skes_name.size + print('Found %d available skeleton files.' 
% num_files) + + raw_skes_data = [] + frames_cnt = np.zeros(num_files, dtype=np.int) + + for (idx, ske_name) in enumerate(skes_name): + bodies_data = get_raw_bodies_data(skes_path, ske_name, frames_drop_skes, + frames_drop_logger) + raw_skes_data.append(bodies_data) + frames_cnt[idx] = bodies_data['num_frames'] + if (idx + 1) % 1000 == 0: + print('Processed: %.2f%% (%d / %d)' % \ + (100.0 * (idx + 1) / num_files, idx + 1, num_files)) + + with open(save_data_pkl, 'wb') as fw: + pickle.dump(raw_skes_data, fw, pickle.HIGHEST_PROTOCOL) + np.savetxt(osp.join(save_path, 'raw_data', 'frames_cnt.txt'), + frames_cnt, + fmt='%d') + + print('Saved raw bodies data into %s' % save_data_pkl) + print('Total frames: %d' % np.sum(frames_cnt)) + + with open(frames_drop_pkl, 'wb') as fw: + pickle.dump(frames_drop_skes, fw, pickle.HIGHEST_PROTOCOL) + + +if __name__ == '__main__': + save_path = './' + + skes_path = '../ntu-rgb-d/nturgb+d_skeletons/' + stat_path = osp.join(save_path, 'statistics') + if not osp.exists('./raw_data'): + os.makedirs('./raw_data') + + skes_name_file = osp.join(stat_path, 'skes_available_name.txt') + save_data_pkl = osp.join(save_path, 'raw_data', 'raw_skes_data.pkl') + frames_drop_pkl = osp.join(save_path, 'raw_data', 'frames_drop_skes.pkl') + + frames_drop_logger = logging.getLogger('frames_drop') + frames_drop_logger.setLevel(logging.INFO) + frames_drop_logger.addHandler( + logging.FileHandler(osp.join(save_path, 'raw_data', 'frames_drop.log'))) + frames_drop_skes = dict() + + get_raw_skes_data() + + with open(frames_drop_pkl, 'wb') as fw: + pickle.dump(frames_drop_skes, fw, pickle.HIGHEST_PROTOCOL) diff --git a/docs/src/data/ntu-rgb-d/seq_transformation.py b/docs/src/data/ntu-rgb-d/seq_transformation.py new file mode 100644 index 000000000..952845289 --- /dev/null +++ b/docs/src/data/ntu-rgb-d/seq_transformation.py @@ -0,0 +1,266 @@ +# ref: https://github.com/Uason-Chen/CTR-GCN/blob/main/data/ntu/seq_transformation.py + +import os +import os.path as osp +import numpy as np +import pickle +import logging +from sklearn.model_selection import train_test_split + +root_path = './' +stat_path = osp.join(root_path, 'statistics') +setup_file = osp.join(stat_path, 'setup.txt') +camera_file = osp.join(stat_path, 'camera.txt') +performer_file = osp.join(stat_path, 'performer.txt') +replication_file = osp.join(stat_path, 'replication.txt') +label_file = osp.join(stat_path, 'label.txt') +skes_name_file = osp.join(stat_path, 'skes_available_name.txt') + +denoised_path = osp.join(root_path, 'denoised_data') +raw_skes_joints_pkl = osp.join(denoised_path, 'raw_denoised_joints.pkl') +frames_file = osp.join(denoised_path, 'frames_cnt.txt') + +save_path = './' + +if not osp.exists(save_path): + os.mkdir(save_path) + + +def remove_nan_frames(ske_name, ske_joints, nan_logger): + num_frames = ske_joints.shape[0] + valid_frames = [] + + for f in range(num_frames): + if not np.any(np.isnan(ske_joints[f])): + valid_frames.append(f) + else: + nan_indices = np.where(np.isnan(ske_joints[f]))[0] + nan_logger.info('{}\t{:^5}\t{}'.format(ske_name, f + 1, + nan_indices)) + + return ske_joints[valid_frames] + + +def seq_translation(skes_joints): + for idx, ske_joints in enumerate(skes_joints): + num_frames = ske_joints.shape[0] + num_bodies = 1 if ske_joints.shape[1] == 75 else 2 + if num_bodies == 2: + missing_frames_1 = np.where(ske_joints[:, :75].sum(axis=1) == 0)[0] + missing_frames_2 = np.where(ske_joints[:, 75:].sum(axis=1) == 0)[0] + cnt1 = len(missing_frames_1) + cnt2 = len(missing_frames_2) + + i = 0 # 
get the "real" first frame of actor1 + while i < num_frames: + if np.any(ske_joints[i, :75] != 0): + break + i += 1 + + origin = np.copy(ske_joints[i, 3:6]) # new origin: joint-2 + + for f in range(num_frames): + if num_bodies == 1: + ske_joints[f] -= np.tile(origin, 25) + else: # for 2 actors + ske_joints[f] -= np.tile(origin, 50) + + if (num_bodies == 2) and (cnt1 > 0): + ske_joints[missing_frames_1, :75] = np.zeros((cnt1, 75), + dtype=np.float32) + + if (num_bodies == 2) and (cnt2 > 0): + ske_joints[missing_frames_2, 75:] = np.zeros((cnt2, 75), + dtype=np.float32) + + skes_joints[idx] = ske_joints # Update + + return skes_joints + + +def frame_translation(skes_joints, skes_name, frames_cnt): + nan_logger = logging.getLogger('nan_skes') + nan_logger.setLevel(logging.INFO) + nan_logger.addHandler(logging.FileHandler("./nan_frames.log")) + nan_logger.info('{}\t{}\t{}'.format('Skeleton', 'Frame', 'Joints')) + + for idx, ske_joints in enumerate(skes_joints): + num_frames = ske_joints.shape[0] + # Calculate the distance between spine base (joint-1) and spine (joint-21) + j1 = ske_joints[:, 0:3] + j21 = ske_joints[:, 60:63] + dist = np.sqrt(((j1 - j21)**2).sum(axis=1)) + + for f in range(num_frames): + origin = ske_joints[f, 3: + 6] # new origin: middle of the spine (joint-2) + if (ske_joints[f, 75:] == 0).all(): + ske_joints[f, :75] = (ske_joints[f, :75] - np.tile(origin, 25)) / \ + dist[f] + np.tile(origin, 25) + else: + ske_joints[f] = (ske_joints[f] - np.tile(origin, 50)) / \ + dist[f] + np.tile(origin, 50) + + ske_name = skes_name[idx] + ske_joints = remove_nan_frames(ske_name, ske_joints, nan_logger) + frames_cnt[idx] = num_frames # update valid number of frames + skes_joints[idx] = ske_joints + + return skes_joints, frames_cnt + + +def align_frames(skes_joints, frames_cnt): + """ + Align all sequences with the same frame length. + + """ + num_skes = len(skes_joints) + max_num_frames = frames_cnt.max() # 300 + aligned_skes_joints = np.zeros((num_skes, max_num_frames, 150), + dtype=np.float32) + + for idx, ske_joints in enumerate(skes_joints): + num_frames = ske_joints.shape[0] + num_bodies = 1 if ske_joints.shape[1] == 75 else 2 + if num_bodies == 1: + aligned_skes_joints[idx, :num_frames] = np.hstack( + (ske_joints, np.zeros_like(ske_joints))) + else: + aligned_skes_joints[idx, :num_frames] = ske_joints + + return aligned_skes_joints + + +def one_hot_vector(labels): + num_skes = len(labels) + labels_vector = np.zeros((num_skes, 60)) + for idx, l in enumerate(labels): + labels_vector[idx, l] = 1 + + return labels_vector + + +def split_train_val(train_indices, method='sklearn', ratio=0.05): + """ + Get validation set by splitting data randomly from training set with two methods. + In fact, I thought these two methods are equal as they got the same performance. 
+ + """ + if method == 'sklearn': + return train_test_split(train_indices, + test_size=ratio, + random_state=10000) + else: + np.random.seed(10000) + np.random.shuffle(train_indices) + val_num_skes = int(np.ceil(0.05 * len(train_indices))) + val_indices = train_indices[:val_num_skes] + train_indices = train_indices[val_num_skes:] + return train_indices, val_indices + + +def split_dataset(skes_name, skes_joints, label, performer, camera, evaluation, + save_path): + train_indices, test_indices = get_indices(performer, camera, evaluation) + m = 'sklearn' # 'sklearn' or 'numpy' + # Select validation set from training set + # train_indices, val_indices = split_train_val(train_indices, m) + + # Save labels and num_frames for each sequence of each data set + train_labels = label[train_indices] + test_labels = label[test_indices] + + train_x = skes_joints[train_indices] + # train_y = one_hot_vector(train_labels) + test_x = skes_joints[test_indices] + # test_y = one_hot_vector(test_labels) + + evaluation_path = osp.join(save_path, evaluation) + isExists = osp.exists(evaluation_path) + if not isExists: + os.makedirs(evaluation_path) + + train_data_save_path = osp.join(evaluation_path, 'train_data.npy') + train_label_save_path = osp.join(evaluation_path, 'train_label.pkl') + val_data_save_path = osp.join(evaluation_path, 'val_data.npy') + val_label_save_path = osp.join(evaluation_path, 'val_label.pkl') + + # reshape data + N, T, VC = train_x.shape + train_x = np.reshape(train_x, (N, T, 2, 25, 3)) + train_x = np.transpose(train_x, (0, 4, 1, 3, 2)) + + N, T, VC = test_x.shape + test_x = np.reshape(test_x, (N, T, 2, 25, 3)) + test_x = np.transpose(test_x, (0, 4, 1, 3, 2)) + # save train + np.save(train_data_save_path, train_x) + out = [skes_name[train_indices], train_labels] + with open(train_label_save_path, 'wb') as f: + pickle.dump(out, f) + # save test + np.save(val_data_save_path, test_x) + out = [skes_name[test_indices], test_labels] + with open(val_label_save_path, 'wb') as f: + pickle.dump(out, f) + + +def get_indices(performer, camera, evaluation='xsub'): + test_indices = np.empty(0) + train_indices = np.empty(0) + + if evaluation == 'xsub': # Cross Subject (Subject IDs) + train_ids = [ + 1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 25, 27, 28, 31, 34, + 35, 38 + ] + test_ids = [ + 3, 6, 7, 10, 11, 12, 20, 21, 22, 23, 24, 26, 29, 30, 32, 33, 36, 37, + 39, 40 + ] + + # Get indices of test data + for idx in test_ids: + temp = np.where(performer == idx)[0] # 0-based index + test_indices = np.hstack((test_indices, temp)).astype(np.int) + + # Get indices of training data + for train_id in train_ids: + temp = np.where(performer == train_id)[0] # 0-based index + train_indices = np.hstack((train_indices, temp)).astype(np.int) + else: # Cross View (Camera IDs) + train_ids = [2, 3] + test_ids = 1 + # Get indices of test data + temp = np.where(camera == test_ids)[0] # 0-based index + test_indices = np.hstack((test_indices, temp)).astype(np.int) + + # Get indices of training data + for train_id in train_ids: + temp = np.where(camera == train_id)[0] # 0-based index + train_indices = np.hstack((train_indices, temp)).astype(np.int) + + return train_indices, test_indices + + +if __name__ == '__main__': + camera = np.loadtxt(camera_file, dtype=np.int) # camera id: 1, 2, 3 + performer = np.loadtxt(performer_file, dtype=np.int) # subject id: 1~40 + label = np.loadtxt(label_file, dtype=np.int) - 1 # action label: 0~59 + + frames_cnt = np.loadtxt(frames_file, dtype=np.int) # frames_cnt + skes_name = 
np.loadtxt(skes_name_file, dtype=np.string_) + + with open(raw_skes_joints_pkl, 'rb') as fr: + skes_joints = pickle.load(fr) # a list + + skes_joints = seq_translation(skes_joints) + + skes_joints = align_frames(skes_joints, + frames_cnt) # aligned to the same frame length + + evaluations = ['xview', 'xsub'] + for evaluation in evaluations: + split_dataset(skes_name, skes_joints, label, performer, camera, + evaluation, save_path) + print('Done!') diff --git a/docs/src/deploy/cpp_infer/external-cmake/auto-log.cmake b/docs/src/deploy/cpp_infer/external-cmake/auto-log.cmake new file mode 100644 index 000000000..9be9c2fb3 --- /dev/null +++ b/docs/src/deploy/cpp_infer/external-cmake/auto-log.cmake @@ -0,0 +1,12 @@ +find_package(Git REQUIRED) +include(FetchContent) + +set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}/third-party") + +FetchContent_Declare( + extern_Autolog + PREFIX autolog + GIT_REPOSITORY https://github.com/LDOUBLEV/AutoLog.git + GIT_TAG main +) +FetchContent_MakeAvailable(extern_Autolog) diff --git a/docs/src/deploy/cpp_infer/include/postprocess_op.h b/docs/src/deploy/cpp_infer/include/postprocess_op.h new file mode 100644 index 000000000..d250432b4 --- /dev/null +++ b/docs/src/deploy/cpp_infer/include/postprocess_op.h @@ -0,0 +1,43 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "include/utility.h" + + +namespace PaddleVideo +{ + + class Softmax + { + public: + virtual void Inplace_Run(const std::vector::iterator &_begin, const std::vector::iterator &_end); + virtual std::vector Run(const std::vector::iterator &_begin, const std::vector::iterator &_end); + }; + +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/include/preprocess_op.h b/docs/src/deploy/cpp_infer/include/preprocess_op.h new file mode 100644 index 000000000..2c3979372 --- /dev/null +++ b/docs/src/deploy/cpp_infer/include/preprocess_op.h @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
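+
+// This header declares the image pre-processing operators used by the C++
+// inference demo: Normalize (per-channel mean/std normalization), Permute
+// (HWC BGR image to planar CHW RGB buffer), Scale (short-side resize),
+// CenterCrop, and TenCrop (multi-view cropping used by ppTSN). The
+// implementations are in src/preprocess_op.cpp.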
+ +#pragma once + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; +using namespace paddle; + +namespace PaddleVideo +{ + + class Normalize + { + public: + virtual void Run(cv::Mat *im, const std::vector &mean, + const std::vector &scale, const bool is_scale = true); + }; + + // RGB -> CHW + class Permute + { + public: + virtual void Run(const cv::Mat *img, float *data); + }; + + class Scale + { + public: + virtual void Run(const cv::Mat &img, cv::Mat &resize_img, + bool use_tensorrt = false, + const int &short_size = 256); + }; + + class CenterCrop + { + public: + virtual void Run(const cv::Mat &img, cv::Mat &crop_img, + bool use_tensorrt = false, + const int &target_size = 224); + }; + + class TenCrop + { + public: + virtual void Run(const cv::Mat &img, std::vector &crop_frames, + const int &begin_index, + bool use_tensorrt = false, + const int &target_size = 224); + }; +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/include/utility.h b/docs/src/deploy/cpp_infer/include/utility.h new file mode 100644 index 000000000..d26d8175d --- /dev/null +++ b/docs/src/deploy/cpp_infer/include/utility.h @@ -0,0 +1,54 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/opencv.hpp" + +namespace PaddleVideo +{ + + class Utility + { + public: + static std::vector ReadDict(const std::string &path); + + static void GetAllFiles(const char *dir_name, std::vector &all_inputs); + + static cv::Mat GetRotateCropImage(const cv::Mat &srcimage, std::vector> box); + + template inline static size_t argmax(ForwardIterator first, ForwardIterator last) + { + return std::distance(first, std::max_element(first, last)); + } + + static std::vector SampleFramesFromVideo(const std::string &VideoPath, const int &num_seg, const int &seg_len); + }; + +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/include/video_rec.h b/docs/src/deploy/cpp_infer/include/video_rec.h new file mode 100644 index 000000000..c1ac2abfe --- /dev/null +++ b/docs/src/deploy/cpp_infer/include/video_rec.h @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" +#include "paddle_inference_api.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +using namespace paddle_infer; + +namespace PaddleVideo +{ + + class VideoRecognizer + { + public: + explicit VideoRecognizer(const std::string &model_dir, const std::string &inference_model_name, const bool &use_gpu, const int &num_seg, + const int &rec_batch_num, const int &gpu_id, + const int &gpu_mem, const int &cpu_math_library_num_threads, + const bool &use_mkldnn, const std::string &label_path, + const bool &use_tensorrt, const std::string &precision, const std::vector &_mean = {0.406, 0.456, 0.485}, + const std::vector &_scale = {0.225, 0.224, 0.229}) + { + this->inference_model_name = inference_model_name; + this->use_gpu_ = use_gpu; + this->num_seg = num_seg; + this->rec_batch_num = rec_batch_num; + this->gpu_id_ = gpu_id; + this->gpu_mem_ = gpu_mem; + this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; + this->use_mkldnn_ = use_mkldnn; + this->use_tensorrt_ = use_tensorrt; + this->precision_ = precision; + this->mean_ = _mean; + this->scale_ = _scale; + this->label_list_ = Utility::ReadDict(label_path); + LoadModel(model_dir); + } + + // Load Paddle inference model + void LoadModel(const std::string &model_dir); + + void Run(const std::vector &frames_batch_path, const std::vector > &frames_batch, std::vector *times); + + private: + std::string inference_model_name; + std::shared_ptr predictor_; + + bool use_gpu_ = false; + int gpu_id_ = 0; + + int rec_batch_num = 1; + int gpu_mem_ = 4000; + int cpu_math_library_num_threads_ = 4; + bool use_mkldnn_ = false; + int num_seg = 8; + std::vector label_list_; + std::vector mean_ = {0.406, 0.456, 0.485}; + std::vector scale_ = {0.225, 0.224, 0.229}; + bool is_scale_ = true; + bool use_tensorrt_ = false; + std::string precision_ = "fp32"; + + // Instantiate pre-process operation object(s) + Scale scale_op_; + + CenterCrop centercrop_op_; + TenCrop tencrop_op_; + + Normalize normalize_op_; + Permute permute_op_; + + // Instantiate post-process operation object(s) + Softmax softmax_op_; + + }; // class VideoRecognizer + +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/readme.md b/docs/src/deploy/cpp_infer/readme.md new file mode 100644 index 000000000..1bd1f1dec --- /dev/null +++ b/docs/src/deploy/cpp_infer/readme.md @@ -0,0 +1,324 @@ +[English](./readme_en.md) | 简体中文 + +# 服务器端C++预测 + +本章节介绍PaddleVideo模型的的C++部署方法,python预测部署方法请参考各自模型的**模型推理**章节。 +C++在性能计算上优于python,因此,在大多数CPU、GPU部署场景,多采用C++的部署方式,本节将介绍如何在Linux(CPU/GPU)环境下配置C++环境并完成 +PaddleVideo模型部署。 + +在开始使用之前,您需要按照以下命令安装额外的依赖包: +```bash +python -m pip install git+https://github.com/LDOUBLEV/AutoLog +``` + +## 1. 
准备环境 + +- Linux环境,推荐使用docker。 + +- Windows环境,目前支持基于`Visual Studio 2019 Community`进行编译(TODO) + +* 该文档主要介绍基于Linux环境的PaddleVideo C++预测流程,如果需要在Windows下基于预测库进行C++预测,具体编译方法请参考[Windows下编译教程](./docs/windows_vs2019_build.md)(TODO) +* **准备环境的目的是得到编译好的opencv库与paddle预测库**。 + +### 1.1 编译opencv库 + +* 首先需要从opencv官网上下载在Linux环境下源码编译的压缩包,并解压成文件夹。以opencv3.4.7为例,下载命令如下: + + ```bash + cd deploy/cpp_infer + wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz + tar -xf 3.4.7.tar.gz + ``` + + 解压完毕后在`deploy/cpp_infer`目录下可以得到解压出的`opencv-3.4.7`的文件夹。 + +* 安装ffmpeg + + opencv配合ffmpeg才能在linux下正常读取视频,否则可能遇到视频帧数返回为0或无法读取任何视频帧的情况 + + 采用较为简单的apt安装,安装命令如下: + + ```bash + apt-get update + + apt install libavformat-dev + apt install libavcodec-dev + apt install libswresample-dev + apt install libswscale-dev + apt install libavutil-dev + apt install libsdl1.2-dev + + apt-get install ffmpeg + ``` + +* 准备编译opencv,首先进入`opencv-3.4.7`的文件夹,然后设置opencv源码路径`root_path`以及安装路径`install_path`。执行命令如下: + + ```bash + cd opencv-3.4.7 + + root_path=$PWD # 当前所在路径即为opencv-3.4.7的绝对路径 + install_path=${root_path}/opencv3 + + rm -rf build + mkdir build + cd build + + cmake .. \ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON \ + -DWITH_FFMPEG=ON + + make -j + make install + ``` + + `make install`完成之后,会在该文件夹下生成opencv头文件和库文件,用于后面的Video推理C++代码编译。 + + 最终会以安装路径`install_path`为指定路径,得到一个`opencv3`的文件夹,其文件结构如下所示。 + + ```shell + opencv-3.4.7/ + ├── opencv3/ # 安装在opencv3目录下 + │ ├── bin/ + │ ├── include/ + │ ├── lib/ + │ ├── lib64/ + │ └── share/ + ``` + +### 1.2 下载或者编译Paddle预测库 + +有2种方式获取Paddle预测库,下面进行详细介绍。 + + +#### 1.2.1 直接下载安装 + +* [Paddle预测库官网](https://paddleinference.paddlepaddle.org.cn/v2.2/user_guides/download_lib.html) 上提供了不同cuda版本的Linux预测库,可以在官网查看并**选择合适的预测库版本**(建议选择paddle版本>=2.0.1版本的预测库,推荐使用2.2.2的预测库)。 + +* 下载得到一个`paddle_inference.tgz`压缩包,然后将它解压成文件夹,命令如下(以机器环境为gcc8.2为例): + + ```bash + wget https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz + tar -xf paddle_inference.tgz + ``` + + 最终会在当前的文件夹中生成`paddle_inference/`的子文件夹。 + +#### 1.2.2 预测库源码编译 +* 如果希望获取最新预测库特性,可以从Paddle github上克隆最新代码,源码编译预测库。 +* 可以参考[Paddle预测库安装编译说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi) 的说明,从github上获取Paddle代码,然后进行编译,生成最新的预测库。使用git获取代码方法如下。 + + ```shell + git clone https://github.com/PaddlePaddle/Paddle.git + git checkout release/2.2 + ``` + +* 进入Paddle目录后,编译方法如下。 + + ```shell + rm -rf build + mkdir build + cd build + + cmake .. 
\ + -DWITH_CONTRIB=OFF \ + -DWITH_MKL=ON \ + -DWITH_MKLDNN=ON \ + -DWITH_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_INFERENCE_API_TEST=OFF \ + -DON_INFER=ON \ + -DWITH_PYTHON=ON + make -j4 + make inference_lib_dist -j4 # 4为编译时使用核数,可根据机器情况自行修改 + ``` + + 更多编译参数选项介绍可以参考[文档说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi)。 + + +* 编译完成之后,可以在`build/paddle_inference_install_dir/`文件下看到生成了以下文件及文件夹。 + + ```bash + build/ + └── paddle_inference_install_dir/ + ├── CMakeCache.txt + ├── paddle/ + ├── third_party/ + └── version.txt + ``` + + 其中`paddle`就是C++预测所需的Paddle库,`version.txt`中包含当前预测库的版本信息。 + +## 2. 编译并运行预测demo + +### 2.1 将模型导出为inference model + +* 该步骤与python部署方式下的导出预测模型相同,可以参考各自模型的模型预测章节。导出的几个相关inference model文件用于模型预测。**以PP-TSM为例**,导出预测模型的目录结构如下。 + + ``` + inference/ + └── ppTSM/ + ├── ppTSM.pdiparams + ├── ppTSM.pdiparamsinfo + └── ppTSM.pdmodel + ``` + + +### 2.2 编译PaddleVideo C++预测demo + +* 进入到`deploy/cpp_infer`目录下,执行以下编译命令 + + ```shell + bash tools/build.sh + ``` + + `tools/build.sh`中的Paddle C++预测库、opencv等其他依赖库的地址需要换成自己机器上的实际地址。 + +* 具体地,需要修改`tools/build.sh`中的环境路径,相关内容如下: + + ```shell + OPENCV_DIR=your_opencv_dir + LIB_DIR=your_paddle_inference_dir + CUDA_LIB_DIR=your_cuda_lib_dir + CUDNN_LIB_DIR=your_cudnn_lib_dir + ``` + + 上述参数如下(以下路径用户可根据自己机器的情况对应修改) + + ```bash + OPENCV_DIR=/path/to/opencv3 + LIB_DIR=/path/to/paddle_inference + CUDA_LIB_DIR=/usr/local/cuda/lib64 + CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/ + ``` + + `OPENCV_DIR`为opencv编译安装的地址 + `LIB_DIR`为下载(`paddle_inference`文件夹)或者编译生成的Paddle预测库地址(`build/paddle_inference_install_dir`文件夹) + `CUDA_LIB_DIR`为cuda库文件地址,在docker中为`/usr/local/cuda/lib64` + `CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。 + **如果希望预测时开启TensorRT加速功能,那么还需要修改`tools/build.sh`3处代码** + 1. 设置`DWITH_GPU=ON` + 2. 设置`DWITH_TENSORRT=ON` + 3. 设置`TENSORRT_DIR=/path/to/TensorRT-x.x.x.x` + + **以上路径都写绝对路径,不要写相对路径** + + +* 编译完成之后,会在`cpp_infer/build`文件夹下生成一个名为`ppvideo`的可执行文件。 + + +### 2.3 运行PaddleVideo C++预测demo + +运行方式: + +```bash +./build/ppvideo [--param1] [--param2] [...] +``` + +其中,`mode`为必选参数,表示选择的功能,取值范围['rec'],表示**视频识别**(更多功能会陆续加入)。 + + +##### 1. 
调用视频识别: +```bash +# 调用PP-TSM识别 +./build/ppvideo rec \ +--rec_model_dir=../../inference/ppTSM \ +--inference_model_name=ppTSM \ +--video_dir=./example_video_dir \ +--num_seg=8 \ +--seg_len=1 + +# 调用PP-TSN识别 +./build/ppvideo rec \ +--rec_model_dir=../../inference/ppTSN \ +--inference_model_name=ppTSN \ +--video_dir=./example_video_dir \ +--num_seg=25 \ +--seg_len=1 +``` +更多参数如下: + +- 通用参数 + + | 参数名称 | 类型 | 默认参数 | 意义 | + | ------------- | ---- | --------------- | ------------------------------------------------------------ | + | use_gpu | bool | false | 是否使用GPU | + | gpu_id | int | 0 | GPU id,使用GPU时有效 | + | gpu_mem | int | 4000 | 申请的GPU内存 | + | cpu_threads | int | 10 | CPU预测时的线程数,在机器核数充足的情况下,该值越大,预测速度越快 | + | enable_mkldnn | bool | false | 是否使用mkldnn库 | + | use_tensorrt | bool | false | 是否使用tensorrt库 | + | precision | str | "fp32" | 使用fp32/fp16/uint8精度来预测 | + | benchmark | bool | true | 预测时是否开启benchmark,开启后会在最后输出配置、模型、耗时等信息。 | + + +- 视频识别模型相关 + + | 参数名称 | 类型 | 默认参数 | 意义 | + | -------------- | ------ | --------------------------------------------- | ------------------------------------ | + | video_dir | string | "../example_video_dir" | 存放将要识别的视频的文件夹路径 | + | rec_model_dir | string | "" | 存放导出的预测模型的文件夹路径 | + | inference_model_name | string | "ppTSM" | 预测模型的名称 | + | num_seg | int | 8 | 视频分段的段数 | + | seg_len | int | 1 | 视频每段抽取的帧数 | + | rec_batch_num | int | 1 | 模型预测时的batch size | + | char_list_file | str | "../../data/k400/Kinetics-400_label_list.txt" | 存放所有类别标号和对应名字的文本路径 | + +​ 以example_video_dir下的样例视频`example01.avi`为输入视频为例,最终屏幕上会输出检测结果如下。 + +```bash +[./inference/ppTSM] +[./deploy/cpp_infer/example_video_dir] +total videos num: 1 +./example_video_dir/example01.avi class: 5 archery score: 0.999556 +I1125 08:10:45.834288 13955 autolog.h:50] ----------------------- Config info ----------------------- +I1125 08:10:45.834458 13955 autolog.h:51] runtime_device: cpu +I1125 08:10:45.834467 13955 autolog.h:52] ir_optim: True +I1125 08:10:45.834475 13955 autolog.h:53] enable_memory_optim: True +I1125 08:10:45.834483 13955 autolog.h:54] enable_tensorrt: 0 +I1125 08:10:45.834518 13955 autolog.h:55] enable_mkldnn: False +I1125 08:10:45.834525 13955 autolog.h:56] cpu_math_library_num_threads: 10 +I1125 08:10:45.834532 13955 autolog.h:57] ----------------------- Data info ----------------------- +I1125 08:10:45.834540 13955 autolog.h:58] batch_size: 1 +I1125 08:10:45.834547 13955 autolog.h:59] input_shape: dynamic +I1125 08:10:45.834556 13955 autolog.h:60] data_num: 1 +I1125 08:10:45.834564 13955 autolog.h:61] ----------------------- Model info ----------------------- +I1125 08:10:45.834573 13955 autolog.h:62] model_name: rec +I1125 08:10:45.834579 13955 autolog.h:63] precision: fp32 +I1125 08:10:45.834586 13955 autolog.h:64] ----------------------- Perf info ------------------------ +I1125 08:10:45.834594 13955 autolog.h:65] Total time spent(ms): 2739 +I1125 08:10:45.834602 13955 autolog.h:67] preprocess_time(ms): 10.6524, inference_time(ms): 1269.55, postprocess_time(ms): 0.009118 +``` + +### 3 FAQ + +1. 编译demo过程中出现以下错误 + + ```shell + make[2]: *** No rule to make target '/usr/lib/x86_64-linux-gn/libcudnn.so', needed by 'ppvideo'. Stop. + make[2]: *** Waiting for unfinished jobs.... 
+ [ 16%] Building CXX object CMakeFiles/ppvideo.dir/src/main.cpp.o
+ [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/preprocess_op.cpp.o
+ [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/postprocess_op.cpp.o
+ [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/utility.cpp.o
+ [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/video_rec.cpp.o
+ CMakeFiles/Makefile2:95: recipe for target 'CMakeFiles/ppvideo.dir/all' failed
+ make[1]: *** [CMakeFiles/ppvideo.dir/all] Error 2
+ Makefile:83: recipe for target 'all' failed
+ make: *** [all] Error 2
+ ```
+ 可能是`CUDNN_LIB_DIR`设置的不对,导致找不到该目录下的`libcudnn.so`。
diff --git a/docs/src/deploy/cpp_infer/readme_en.md b/docs/src/deploy/cpp_infer/readme_en.md
new file mode 100644
index 000000000..1752c260f
--- /dev/null
+++ b/docs/src/deploy/cpp_infer/readme_en.md
@@ -0,0 +1,316 @@
+English | [简体中文](./readme.md)
+
+# Server-side C++ prediction
+
+This chapter introduces the C++ deployment method for PaddleVideo models. For the Python prediction deployment method, please refer to the **Model Inference** chapter of the respective model.
+C++ usually outperforms Python in compute-intensive workloads, so C++ deployment is the common choice in most CPU and GPU deployment scenarios. This section introduces how to set up the C++ environment on Linux (CPU/GPU) and complete the deployment of PaddleVideo models.
+
+Before getting started, you need to install the additional dependencies as follows:
+```bash
+python -m pip install git+https://github.com/LDOUBLEV/AutoLog
+```
+
+## 1. Prepare the environment
+
+- Linux environment: docker is recommended.
+
+- Windows environment: compilation based on `Visual Studio 2019 Community` is currently supported (TODO)
+
+* This document mainly introduces the PaddleVideo C++ prediction process based on the Linux environment. If you need to perform C++ prediction with the prediction library under Windows, please refer to the [Windows Compilation Tutorial](./docs/windows_vs2019_build.md) (TODO) for the specific compilation method.
+* **The purpose of preparing the environment is to obtain the compiled opencv library and the paddle prediction library**.
+
+### 1.1 Compile opencv library
+
+* First, download the source package to be compiled under the Linux environment from the opencv official website, and extract it into a folder. Take opencv3.4.7 as an example, the download command is as follows:
+
+  ```bash
+  cd deploy/cpp_infer
+  wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz
+  tar -xf 3.4.7.tar.gz
+  ```
+
+  After extraction, the `opencv-3.4.7` folder is available in the `deploy/cpp_infer` directory.
+
+* Install ffmpeg
+
+  OpenCV needs ffmpeg to read videos correctly under Linux; without it, the reported frame count may be 0 or no video frames can be read at all.
+
+  A relatively simple way is to install via apt; the commands are as follows:
+
+  ```bash
+  apt-get update
+
+  apt install libavformat-dev
+  apt install libavcodec-dev
+  apt install libswresample-dev
+  apt install libswscale-dev
+  apt install libavutil-dev
+  apt install libsdl1.2-dev
+
+  apt-get install ffmpeg
+  ```
+
+* To prepare for compiling opencv, first enter the `opencv-3.4.7` folder, then set the opencv source path `root_path` and the installation path `install_path`.
The execution command is as follows: + + ```bash + cd opencv-3.4.7 + + root_path=$PWD # That is the absolute path of opencv-3.4.7 + install_path=${root_path}/opencv3 + + rm -rf build + mkdir build + cd build + + cmake .. \ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON \ + -DWITH_FFMPEG=ON + + make -j + make install + ``` + + After the completion of `make install`, opencv header files and library files will be generated in this folder, which will be used to compile the Video inference C++ code later. + + Finally, the installation path `install_path` will be used as the specified path, and a folder of `opencv3` will be obtained. The file structure is shown below. + + ```shell + opencv-3.4.7/ + ├── opencv3/ + │ ├── bin/ + │ ├── include/ + │ ├── lib/ + │ ├── lib64/ + │ └── share/ + ``` + +### 1.2 Download or compile Paddle prediction library + +There are two ways to obtain the Paddle prediction library, which will be described in detail below. + + +#### 1.2.1 Download and install directly + +* [Paddle prediction library official website](https://paddleinference.paddlepaddle.org.cn/v2.2/user_guides/download_lib.html) provides different cuda versions of Linux prediction libraries, you can Check and **select the appropriate prediction library version** on the official website (it is recommended to select the prediction library with paddle version>=2.0.1, and the prediction library of 2.2.2 is recommended). + +* Download and get a `paddle_inference.tgz` compressed package, and then unzip it into a folder, the command is as follows (taking the machine environment as gcc8.2 as an example): + + ```bash + wget https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz + tar -xf paddle_inference.tgz + ``` + + Eventually, a subfolder of `paddle_inference/` will be generated in the current folder. + +#### 1.2.2 Prediction library source code compilation +* If you want to get the latest prediction library features, you can clone the latest code from Paddle github and compile the prediction library from source code. +* You can refer to [Paddle prediction library installation and compilation instructions](https://paddleinference.paddlepaddle.org.cn/user_guides/source_compile.html) instructions from github Obtain the Paddle code, and then compile it to generate the latest prediction library. The method of using git to get the code is as follows. + + ```shell + git clone https://github.com/PaddlePaddle/Paddle.git + git checkout release/2.2 + ``` + +* After entering the Paddle directory, the compilation method is as follows. + + ```shell + rm -rf build + mkdir build + cd build + + cmake .. \ + -DWITH_CONTRIB=OFF \ + -DWITH_MKL=ON \ + -DWITH_MKLDNN=ON \ + -DWITH_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_INFERENCE_API_TEST=OFF \ + -DON_INFER=ON \ + -DWITH_PYTHON=ON + make -j + make inference_lib_dist -j4 # 4为编译时使用核数,可根据机器情况自行修改 + ``` + + You can refer to [documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi) for more introduction of compilation parameter options. 
+ + +* After the compilation is complete, you can see the following files and folders are generated under the file `build/paddle_inference_install_dir/`. + + ``` + build/ + └── paddle_inference_install_dir/ + ├── CMakeCache.txt + ├── paddle/ + ├── third_party/ + └── version.txt + ``` + + Among them, `paddle` is the Paddle library required for C++ prediction, and `version.txt` contains the version information of the current prediction library. + +## 2. Compile and run the prediction demo + +### 2.1 Export the model as an inference model + +* This step is the same as the export prediction model under the python deployment mode. You can refer to the model prediction chapter of the respective model. Several related inference model files exported are used for model prediction. **Taking PP-TSM as an example**, the directory structure of the derived prediction model is as follows. + + ``` + inference/ + └── ppTSM/ + ├── ppTSM.pdiparams + ├── ppTSM.pdiparamsinfo + └── ppTSM.pdmodel + ``` + + +### 2.2 Compile PaddleVideo C++ prediction demo + +* Enter the `deploy/cpp_infer` directory and execute the following compile command + + ```shell + bash tools/build.sh + ``` + + The addresses of the Paddle C++ prediction library, opencv and other dependent libraries in `tools/build.sh` need to be replaced with the actual addresses on your own machine. + +* Specifically, you need to modify the environment path in `tools/build.sh`, the relevant content is as follows: + + ```shell + OPENCV_DIR=your_opencv_dir + LIB_DIR=your_paddle_inference_dir + CUDA_LIB_DIR=/usr/local/cuda/lib64 + CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/ + ``` + + The above parameters are as follows (the following path users can modify according to their own machine conditions) + + `OPENCV_DIR` is the address where opencv is compiled and installed + `LIB_DIR` is the download (`paddle_inference` folder) or the generated Paddle prediction library address (`build/paddle_inference_install_dir` folder) + `CUDA_LIB_DIR` is the address of the cuda library file, which is `/usr/local/cuda/lib64` in docker + `CUDNN_LIB_DIR` is the cudnn library file address, which is `/usr/lib/x86_64-linux-gnu/` in docker. + **If you want to enable TensorRT acceleration during prediction, you need to modify the code at `tools/build.sh`3** + 1. Set `DWITH_GPU=ON` + 2. Set `DWITH_TENSORRT=ON` + 3. Set `TENSORRT_DIR=/path/to/TensorRT-x.x.x.x` + + **The above paths are all absolute paths, do not use relative paths** + +* After the compilation is complete, an executable file named `ppvideo` will be generated in the `cpp_infer/build` folder. + + +### 2.3 Run PaddleVideo C++ prediction demo + +Operation mode: + +```bash +./build/ppvideo [--param1] [--param2] [...] +``` + +Among them, `mode` is a required parameter, which means the selected function, and the value range is ['rec'], which means **video recognition** (more functions will be added in succession). + + +##### 1. 
Call video recognition: + +```bash +# run PP-TSM inference +./build/ppvideo rec \ +--rec_model_dir=../../inference/ppTSM \ +--inference_model_name=ppTSM \ +--video_dir=./example_video_dir \ +--num_seg=8 \ +--seg_len=1 + +# run PP-TSN inference +./build/ppvideo rec \ +--rec_model_dir=../../inference/ppTSN \ +--inference_model_name=ppTSN \ +--video_dir=./example_video_dir \ +--num_seg=25 \ +--seg_len=1 +``` +More parameters are as follows: + +- General parameters + + | Parameter name | Type | Default parameter | Meaning | + | ------------- | ---- | --------------- | ------------------------------------------------------------ | + | use_gpu | bool | false | Whether to use GPU | + | gpu_id | int | 0 | GPU id, valid when using GPU | + | gpu_mem | int | 4000 | GPU memory requested | + | cpu_threads | int | 10 | The number of threads for CPU prediction. When the number of machine cores is sufficient, the larger the value, the faster the prediction speed | + | enable_mkldnn | bool | false | Whether to use mkldnn library | + | use_tensorrt | bool | false | Whether to use the tensorrt library | + | precision | str | "fp32" | Use fp32/fp16/uint8 precision to predict | + | benchmark | bool | true | Whether to enable benchmark during prediction, after enabling it, the configuration, model, time-consuming and other information will be output at the end. | + +- Video recognition model related + + | Parameter name | Type | Default parameter | Meaning | + | -------------- | ------ | --------------------------------------------- | ------------------------------------ | + | video_dir | string | "../example_video_dir" | The path of the folder where the video to be recognized is stored | + | rec_model_dir | string | "" | The folder path where the exported prediction model is stored | + | inference_model_name | string | "ppTSM" | The name of the model used in the prediction | + | num_seg | int | 8 | Number of video segments | + | seg_len | int | 1 | The number of frames extracted in each segment of the video | + | rec_batch_num | int | 1 | Batch size during model prediction | + | char_list_file | str | "../../data/k400/Kinetics-400_label_list.txt" | The text path for storing all category labels and corresponding names | + +​ Take the sample video `example01.avi` under example_video_dir as the input video as an example, the final screen will output the detection results as follows. 
+ +```bash +[./inference/ppTSM] +[./deploy/cpp_infer/example_video_dir] +total videos num: 1 +./example_video_dir/example01.avi class: 5 archery score: 0.999556 +I1125 08:10:45.834288 13955 autolog.h:50] ----------------------- Config info ----------------------- +I1125 08:10:45.834458 13955 autolog.h:51] runtime_device: cpu +I1125 08:10:45.834467 13955 autolog.h:52] ir_optim: True +I1125 08:10:45.834475 13955 autolog.h:53] enable_memory_optim: True +I1125 08:10:45.834483 13955 autolog.h:54] enable_tensorrt: 0 +I1125 08:10:45.834518 13955 autolog.h:55] enable_mkldnn: False +I1125 08:10:45.834525 13955 autolog.h:56] cpu_math_library_num_threads: 10 +I1125 08:10:45.834532 13955 autolog.h:57] ----------------------- Data info ----------------------- +I1125 08:10:45.834540 13955 autolog.h:58] batch_size: 1 +I1125 08:10:45.834547 13955 autolog.h:59] input_shape: dynamic +I1125 08:10:45.834556 13955 autolog.h:60] data_num: 1 +I1125 08:10:45.834564 13955 autolog.h:61] ----------------------- Model info ----------------------- +I1125 08:10:45.834573 13955 autolog.h:62] model_name: rec +I1125 08:10:45.834579 13955 autolog.h:63] precision: fp32 +I1125 08:10:45.834586 13955 autolog.h:64] ----------------------- Perf info ------------------------ +I1125 08:10:45.834594 13955 autolog.h:65] Total time spent(ms): 2739 +I1125 08:10:45.834602 13955 autolog.h:67] preprocess_time(ms): 10.6524, inference_time(ms): 1269.55, postprocess_time(ms): 0.009118 +``` + +### 3 FAQ + +1. The following error occurred during the compilation of the demo + + ```shell + make[2]: *** No rule to make target '/usr/lib/x86_64-linux-gn/libcudnn.so', needed by 'ppvideo'. Stop. + make[2]: *** Waiting for unfinished jobs.... + [ 16%] Building CXX object CMakeFiles/ppvideo.dir/src/main.cpp.o + [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/preprocess_op.cpp.o + [ 50%] Building CXX object CMakeFiles/ppvideo.dir/src/postprocess_op.cpp.o + [83%] Building CXX object CMakeFiles/ppvideo.dir/src/utility.cpp.o + [ 83%] Building CXX object CMakeFiles/ppvideo.dir/src/video_rec.cpp.o + CMakeFiles/Makefile2:95: recipe for target 'CMakeFiles/ppvideo.dir/all' failed + make[1]: *** [CMakeFiles/ppvideo.dir/all] Error 2 + Makefile:83: recipe for target 'all' failed + make: *** [all] Error 2 + ```` + It may be that `CUDNN_LIB_DIR` is set incorrectly, resulting in that `libcudnn.so` in this directory cannot be found. diff --git a/docs/src/deploy/cpp_infer/src/main.cpp b/docs/src/deploy/cpp_infer/src/main.cpp new file mode 100644 index 000000000..e91ea63ca --- /dev/null +++ b/docs/src/deploy/cpp_infer/src/main.cpp @@ -0,0 +1,173 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
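+
+// Entry point of the C++ inference demo. Command-line flags are parsed with
+// gflags, the videos under --video_dir are collected with cv::glob, frames are
+// sampled from each video by Utility::SampleFramesFromVideo, recognition is
+// run batch by batch through VideoRecognizer::Run, and an AutoLog benchmark
+// report is printed when --benchmark is enabled.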
+ +#include "glog/logging.h" +#include "omp.h" +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include "auto_log/autolog.h" + +// general parameters +DEFINE_bool(use_gpu, false, "Infering with GPU or CPU."); +DEFINE_int32(gpu_id, 0, "Device id of GPU to execute."); +DEFINE_int32(gpu_mem, 4000, "GPU id when infering with GPU."); +DEFINE_int32(cpu_threads, 10, "Num of threads with CPU."); +DEFINE_bool(enable_mkldnn, false, "Whether use mkldnn with CPU."); +DEFINE_bool(use_tensorrt, false, "Whether use tensorrt."); +DEFINE_string(precision, "fp32", "Precision be one of fp32/fp16/int8."); +DEFINE_bool(benchmark, true, "Whether to log and report benchmark information during inference."); + + +// video recognition related +DEFINE_string(video_dir, "", "Dir of input video(s)."); +DEFINE_string(rec_model_dir, "../example_video_dir", "Path of video rec inference model."); +DEFINE_string(inference_model_name, "ppTSM", "The name of the model used in the prediction."); +DEFINE_int32(num_seg, 8, "number of frames input to model, which are extracted from a video."); +DEFINE_int32(seg_len, 1, "number of frames from a segment."); +DEFINE_int32(rec_batch_num, 1, "rec_batch_num."); +DEFINE_string(char_list_file, "../../data/k400/Kinetics-400_label_list.txt", "Path of dictionary."); + + +using namespace std; +using namespace cv; +using namespace PaddleVideo; + + +static bool PathExists(const std::string& path) +{ +#ifdef _WIN32 + struct _stat buffer; + return (_stat(path.c_str(), &buffer) == 0); +#else + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + + +int main_rec(std::vector &cv_all_video_names) +{ + std::vector time_info = {0, 0, 0}; // Statement time statistics vector + VideoRecognizer rec(FLAGS_rec_model_dir, FLAGS_inference_model_name, FLAGS_use_gpu, FLAGS_num_seg, + FLAGS_rec_batch_num, FLAGS_gpu_id, + FLAGS_gpu_mem, FLAGS_cpu_threads, + FLAGS_enable_mkldnn, FLAGS_char_list_file, + FLAGS_use_tensorrt, FLAGS_precision); // Instantiate a video recognition object + + int batch_num = FLAGS_rec_batch_num; + for (int i = 0, n = cv_all_video_names.size(); i < n; i += batch_num) // Process each video + { + int start_idx = i; + int end_idx = min(i + batch_num, n); + std::vector > frames_batch; + for (int j = start_idx; j < end_idx; ++j) + { + std::vector frames = Utility::SampleFramesFromVideo(cv_all_video_names[i], FLAGS_num_seg, FLAGS_seg_len); + frames_batch.emplace_back(frames); + } + std::vector rec_times; // Initialization time consumption statistics + + // Take the read several video frames and send them to the run method of the recognition class to predict + rec.Run(std::vector(cv_all_video_names.begin() + start_idx, cv_all_video_names.begin() + end_idx), frames_batch, &rec_times); + + time_info[0] += rec_times[0]; + time_info[1] += rec_times[1]; + time_info[2] += rec_times[2]; + } + if (FLAGS_benchmark) + { + AutoLogger autolog("rec", + FLAGS_use_gpu, + FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, + FLAGS_cpu_threads, + FLAGS_rec_batch_num, + "dynamic", + FLAGS_precision, + time_info, + cv_all_video_names.size()); // Generate detailed information on the run + autolog.report(); // Print running details + } + + return 0; +} + + +void check_params(char* mode) +{ + if (strcmp(mode, "rec") == 0) + { + std::cout << "[" << FLAGS_rec_model_dir << "]" << std::endl; + std::cout << "[" << 
FLAGS_video_dir << "]" << std::endl;
+        if (FLAGS_rec_model_dir.empty() || FLAGS_video_dir.empty())
+        {
+            std::cout << "Usage[rec]: ./ppvideo --rec_model_dir=/PATH/TO/REC_INFERENCE_MODEL/ "
+                      << "--video_dir=/PATH/TO/INPUT/VIDEO/" << std::endl;
+            exit(1);
+        }
+    }
+    if (FLAGS_precision != "fp32" && FLAGS_precision != "fp16" && FLAGS_precision != "int8")
+    {
+        cout << "precision should be 'fp32'(default), 'fp16' or 'int8'. " << endl;
+        exit(1);
+    }
+}
+
+
+int main(int argc, char **argv)
+{
+    if (argc <= 1 || (strcmp(argv[1], "rec") != 0)) // Get user input and check it
+    {
+        std::cout << "Please choose one mode of [rec] !" << std::endl;
+        return -1;
+    }
+    std::cout << "mode: " << argv[1] << endl; // Print the requested inference task type
+
+    // Parse command-line flags
+    google::ParseCommandLineFlags(&argc, &argv, true);
+    check_params(argv[1]);
+
+    if (!PathExists(FLAGS_video_dir)) // Check whether the video directory exists
+    {
+        std::cerr << "[ERROR] video path does not exist! video_dir: " << FLAGS_video_dir << endl;
+        exit(1);
+    }
+
+    std::vector cv_all_video_names; // Store all video paths
+
+    cv::glob(FLAGS_video_dir, cv_all_video_names); // Search all videos under FLAGS_video_dir, save in cv_all_video_names
+    std::cout << "total videos num: " << cv_all_video_names.size() << endl; // Print the number of videos found
+
+    if (strcmp(argv[1], "rec") == 0)
+    {
+        return main_rec(cv_all_video_names); // Run recognition on all collected videos
+    }
+    return 0;
+}
diff --git a/docs/src/deploy/cpp_infer/src/postprocess_op.cpp b/docs/src/deploy/cpp_infer/src/postprocess_op.cpp
new file mode 100644
index 000000000..fdc6c5a6e
--- /dev/null
+++ b/docs/src/deploy/cpp_infer/src/postprocess_op.cpp
@@ -0,0 +1,50 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
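+
+// Softmax post-processing applied to the model logits. Both variants subtract
+// the maximum logit before exponentiation for numerical stability;
+// Inplace_Run overwrites the input range with probabilities, while Run leaves
+// the input untouched and returns the probabilities as a new vector.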
+ +#include + +namespace PaddleVideo +{ + void Softmax::Inplace_Run(const std::vector::iterator &_begin, const std::vector::iterator &_end) + { + const float max_value = *std::max_element(_begin, _end); + float denominator = 0.0f; + for (auto it = _begin; it != _end; ++it) + { + *it = std::exp((*it) - max_value); + denominator += (*it); + } + for (auto it = _begin; it != _end; ++it) + { + *it /= denominator; + } + } + std::vector Softmax::Run(const std::vector::iterator &_begin, const std::vector::iterator &_end) + { + std::vector prob(_begin, _end); + const float max_value = *std::max_element(prob.begin(), prob.end()); + float denominator = 0.0f; + for (auto it = _begin, it_p = prob.begin(); it != _end; ++it, ++it_p) + { + (*it_p) = std::exp((*it) - max_value); + denominator += (*it_p); + } + for (auto it = prob.begin(); it != prob.end(); ++it) + { + (*it) /= denominator; + } + return prob; + } + +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/src/preprocess_op.cpp b/docs/src/deploy/cpp_infer/src/preprocess_op.cpp new file mode 100644 index 000000000..951f7b02c --- /dev/null +++ b/docs/src/deploy/cpp_infer/src/preprocess_op.cpp @@ -0,0 +1,135 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
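+
+// Implementations of the pre-processing operators declared in preprocess_op.h:
+// Permute copies an HWC image into a planar CHW float buffer while reversing
+// the channel order (BGR -> RGB); Normalize optionally scales pixels to [0, 1]
+// and then applies (x - mean) / std per channel; Scale resizes the short side
+// of the image to short_size; CenterCrop takes a central target_size crop;
+// TenCrop produces the four corner crops plus the center crop and a flipped
+// copy of each.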
+ +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" +#include "paddle_inference_api.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +namespace PaddleVideo +{ + + void Permute::Run(const cv::Mat *im, float *data) + { + int rh = im->rows; + int rw = im->cols; + int rc = im->channels(); + for (int i = 0; i < rc; ++i) + { + // Extract the i-th channel of im and write it into the array with (data + i * rh * rw) as the starting address + cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), rc - 1 - i); + } + } + + void Normalize::Run(cv::Mat *im, const std::vector &mean, + const std::vector &scale, const bool is_scale) + { + double e = 1.0; + if (is_scale) + { + e /= 255.0; + } + (*im).convertTo(*im, CV_32FC3, e); + std::vector bgr_channels(3); + cv::split(*im, bgr_channels); + for (auto i = 0; i < bgr_channels.size(); i++) + { + bgr_channels[i].convertTo(bgr_channels[i], CV_32FC1, 1.0 / scale[i], (0.0 - mean[i]) / scale[i]); + } + cv::merge(bgr_channels, *im); + } + + void Scale::Run(const cv::Mat &img, cv::Mat &resize_img, bool use_tensorrt, const int &short_size) + { + int h = img.rows; + int w = img.cols; + if ((w <= h && w == short_size) || (h <= w && h == short_size)) + { + img.copyTo(resize_img); + } + else + { + int oh, ow; + if (w < h) + { + ow = short_size; + oh = h * ow / w; + } + else + { + oh = short_size; + ow = w * oh / h; + } + cv::resize(img, resize_img, cv::Size(ow, oh), 0.0f, 0.0f, cv::INTER_LINEAR); + } + } + + void CenterCrop::Run(const cv::Mat &img, cv::Mat &crop_img, bool use_tensorrt, const int &target_size) + { + int h = img.rows; + int w = img.cols; + int crop_h = target_size; + int crop_w = target_size; + if (w < crop_w || h < crop_h) + { + printf("[Error] image width (%d) and height (%d) should be larger than crop size (%d)", + w, h, target_size); + } + else + { + int x1 = (w - crop_w) / 2; + int y1 = (h - crop_h) / 2; + crop_img = img(cv::Rect(x1, y1, crop_w, crop_h)); + } + } + + void TenCrop::Run(const cv::Mat &img, std::vector &crop_imgs, const int &begin_index, bool use_tensorrt, const int &target_size) + { + int h = img.rows; + int w = img.cols; + int crop_h = target_size; + int crop_w = target_size; + int w_step = (w - crop_w) / 4; + int h_step = (h - crop_h) / 4; + pairoffsets[5] = + { + {0, 0}, + {4 * w_step, 0}, + {0, 4 * h_step}, + {4 * w_step, 4 * h_step}, + {2 * w_step, 2 * h_step} + }; + for (int i = 0; i < 5; ++i) + { + const int &j = i * 2; + const int &x1 = offsets[i].first; + const int &y1 = offsets[i].second; + crop_imgs[begin_index + j] = img(cv::Rect(x1, y1, crop_w, crop_h)); // cropped + cv::flip(img(cv::Rect(x1, y1, crop_w, crop_h)), crop_imgs[begin_index + j + 1], 0); // cropped + } + } +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/src/utility.cpp b/docs/src/deploy/cpp_infer/src/utility.cpp new file mode 100644 index 000000000..b95988ff1 --- /dev/null +++ b/docs/src/deploy/cpp_infer/src/utility.cpp @@ -0,0 +1,192 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +namespace PaddleVideo +{ + + std::vector Utility::ReadDict(const std::string &path) + { + std::ifstream in(path); + std::string line; + std::vector m_vec; + if (in) + { + while (getline(in, line)) + { + m_vec.push_back(line); + } + } + else + { + std::cout << "no such label file: " << path << ", exit the program..." + << std::endl; + exit(1); + } + return m_vec; // Use fstream to read the category list and return with vector + } + + void Utility::GetAllFiles(const char *dir_name, std::vector &all_inputs) + { + if (NULL == dir_name) + { + std::cout << " dir_name is null ! " << std::endl; + return; + } + struct stat s; + lstat(dir_name, &s); + if (!S_ISDIR(s.st_mode)) + { + std::cout << "dir_name is not a valid directory !" << std::endl; + all_inputs.push_back(dir_name); + return; + } + else + { + struct dirent *filename; // return value for readdir() + DIR *dir; // return value for opendir() + dir = opendir(dir_name); + if (NULL == dir) + { + std::cout << "Can not open dir " << dir_name << std::endl; + return; + } + std::cout << "Successfully opened the dir !" << std::endl; + while ((filename = readdir(dir)) != NULL) + { + if (strcmp(filename->d_name, ".") == 0 || + strcmp(filename->d_name, "..") == 0) + continue; + // img_dir + std::string("/") + all_inputs[0]; + all_inputs.push_back(dir_name + std::string("/") + + std::string(filename->d_name)); + } + } + } + + cv::Mat Utility::GetRotateCropImage(const cv::Mat &srcimage, std::vector> box) + { + cv::Mat image; + srcimage.copyTo(image); + std::vector> points = box; + + int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]}; + int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]}; + int left = int(*std::min_element(x_collect, x_collect + 4)); + int right = int(*std::max_element(x_collect, x_collect + 4)); + int top = int(*std::min_element(y_collect, y_collect + 4)); + int bottom = int(*std::max_element(y_collect, y_collect + 4)); + + cv::Mat img_crop; + image(cv::Rect(left, top, right - left, bottom - top)).copyTo(img_crop); + + for (int i = 0; i < points.size(); i++) + { + points[i][0] -= left; + points[i][1] -= top; + } + + int img_crop_width = int(sqrt(pow(points[0][0] - points[1][0], 2) + + pow(points[0][1] - points[1][1], 2))); + int img_crop_height = int(sqrt(pow(points[0][0] - points[3][0], 2) + + pow(points[0][1] - points[3][1], 2))); + + cv::Point2f pts_std[4]; + pts_std[0] = cv::Point2f(0., 0.); + pts_std[1] = cv::Point2f(img_crop_width, 0.); + pts_std[2] = cv::Point2f(img_crop_width, img_crop_height); + pts_std[3] = cv::Point2f(0.f, img_crop_height); + + cv::Point2f pointsf[4]; + pointsf[0] = cv::Point2f(points[0][0], points[0][1]); + pointsf[1] = cv::Point2f(points[1][0], points[1][1]); + pointsf[2] = cv::Point2f(points[2][0], points[2][1]); + pointsf[3] = cv::Point2f(points[3][0], points[3][1]); + + cv::Mat M = cv::getPerspectiveTransform(pointsf, pts_std); + + cv::Mat dst_img; + cv::warpPerspective(img_crop, dst_img, M, + cv::Size(img_crop_width, img_crop_height), + cv::BORDER_REPLICATE); + + if (float(dst_img.rows) >= 
float(dst_img.cols) * 1.5) + { + cv::Mat srcCopy = cv::Mat(dst_img.rows, dst_img.cols, dst_img.depth()); + cv::transpose(dst_img, srcCopy); + cv::flip(srcCopy, srcCopy, 0); + return srcCopy; + } + else + { + return dst_img; + } + } + + std::vector Utility::SampleFramesFromVideo(const std::string &VideoPath, const int &num_seg, const int &seg_len) + { + cv::VideoCapture capture(VideoPath); // Create a video object + if (!capture.isOpened()) + { + printf("[Error] video cannot be opened, please check the video [%s]\n", VideoPath.c_str()); + capture.release(); + exit(1); + } + + int frames_len = capture.get(cv::CAP_PROP_FRAME_COUNT); // Get the total number of video frames + + int average_dur = int(frames_len / num_seg); + std::vector frames_idx; + + for (int i = 0; i < num_seg; ++i) + { + int idx = 0; + if (average_dur >= seg_len) + { + idx = (average_dur - 1) / 2; + idx += i * average_dur; + } + else if (average_dur >= 1) + { + idx += i * average_dur; + } + else + { + idx = i; + } + for (int j = idx; j < idx + seg_len; ++j) + { + frames_idx.emplace_back(j % frames_len); + } + } + std::vector sampled_frames; + cv::Mat frame; // Create an object for storing sampled frames + for (int i = 0; i < num_seg; ++i) + { + const int &frame_idx = frames_idx[i]; + capture.set(cv::CAP_PROP_POS_FRAMES, frame_idx); // Set to frame_idx frame + capture >> frame; + sampled_frames.push_back(frame); + } + capture.release(); // Release the video object + return sampled_frames; + } +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/src/video_rec.cpp b/docs/src/deploy/cpp_infer/src/video_rec.cpp new file mode 100644 index 000000000..5f7e19b97 --- /dev/null +++ b/docs/src/deploy/cpp_infer/src/video_rec.cpp @@ -0,0 +1,304 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace PaddleVideo +{ + void VideoRecognizer::Run(const std::vector &frames_batch_path, const std::vector > &frames_batch, std::vector *times) + { + // Copy parameters to the function + int real_batch_num = frames_batch.size(); + + std::vector srcframes(real_batch_num * this->num_seg, cv::Mat()); + + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + frames_batch[i][j].copyTo(srcframes[i * this->num_seg + j]); + } + } + + auto preprocess_start = std::chrono::steady_clock::now(); + /* Preprocess */ + std::vector resize_frames; + std::vector crop_frames; + std::vector input; + int num_views = 1; + + if (this->inference_model_name == "ppTSM") + { + num_views = 1; + // 1. Scale + resize_frames = std::vector(real_batch_num * this->num_seg, cv::Mat()); + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + this->scale_op_.Run(srcframes[i * this->num_seg + j], resize_frames[i * this->num_seg + j], this->use_tensorrt_, 256); + } + } + + // 2. 
CenterCrop + crop_frames = std::vector(real_batch_num * num_views * this->num_seg, cv::Mat()); + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + this->centercrop_op_.Run(resize_frames[i * this->num_seg + j], crop_frames[i * this->num_seg + j], this->use_tensorrt_, 224); + } + } + + // 3. Normalization(inplace operation) + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + for (int k = 0; k < num_views; ++k) + { + this->normalize_op_.Run(&crop_frames[i * num_views * this->num_seg + j * num_views + k], this->mean_, this->scale_, this->is_scale_); + } + } + } + + // 4. Image2Array + int rh = crop_frames[0].rows; + int rw = crop_frames[0].cols; + int rc = crop_frames[0].channels(); + input = std::vector(real_batch_num * num_views * this->num_seg * crop_frames[0].rows * crop_frames[0].cols * rc, 0.0f); + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + for (int k = 0; k < num_views; ++k) + { + this->permute_op_.Run(&crop_frames[i * num_views * this->num_seg + j * num_views + k], input.data() + (i * num_views * this->num_seg + j * num_views + k) * (rh * rw * rc)); + } + } + } + } + else if(this->inference_model_name == "ppTSN") + { + num_views = 10; + // 1. Scale + resize_frames = std::vector(real_batch_num * this->num_seg, cv::Mat()); + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + this->scale_op_.Run(srcframes[i * this->num_seg + j], resize_frames[i * this->num_seg + j], this->use_tensorrt_, 256); + } + } + + // 2. TenCrop + crop_frames = std::vector(real_batch_num * this->num_seg * num_views, cv::Mat()); + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + this->tencrop_op_.Run(resize_frames[i * this->num_seg + j], crop_frames, (i * this->num_seg + j) * num_views, this->use_tensorrt_, 224); + } + } + + // 3. Normalization(inplace operation) + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + for (int k = 0; k < num_views; ++k) + { + this->normalize_op_.Run(&crop_frames[i * this->num_seg * num_views + j * num_views + k], this->mean_, this->scale_, this->is_scale_); + } + } + } + + // 4. 
Image2Array + int rh = crop_frames[0].rows; + int rw = crop_frames[0].cols; + int rc = crop_frames[0].channels(); + input = std::vector(real_batch_num * this->num_seg * num_views * crop_frames[0].rows * crop_frames[0].cols * rc, 0.0f); + for (int i = 0; i < real_batch_num; ++i) + { + for (int j = 0; j < this->num_seg; ++j) + { + for (int k = 0; k < num_views; ++k) + { + this->permute_op_.Run(&crop_frames[i * this->num_seg * num_views + j * num_views + k], input.data() + (i * this->num_seg * num_views + j * num_views + k) * (rh * rw * rc)); + } + } + } + } + else + { + throw "[Error] Not implemented yet"; + } + auto preprocess_end = std::chrono::steady_clock::now(); + + /* Inference */ + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputHandle(input_names[0]); + input_t->Reshape({real_batch_num * num_views * this->num_seg, 3, crop_frames[0].rows, crop_frames[0].cols}); + auto inference_start = std::chrono::steady_clock::now(); + input_t->CopyFromCpu(input.data()); + this->predictor_->Run(); // Use the inference library to predict + + std::vector predict_batch; + auto output_names = this->predictor_->GetOutputNames(); + auto output_t = this->predictor_->GetOutputHandle(output_names[0]); + auto predict_shape = output_t->shape(); + + // Get the number of class + int class_num = predict_shape[1]; + + int out_numel = std::accumulate(predict_shape.begin(), predict_shape.end(), 1, std::multiplies()); + predict_batch.resize(out_numel); // NxC + output_t->CopyToCpu(predict_batch.data()); // Copy the model output to predict_batch + + // Convert output (logits) into probabilities + for (int i = 0; i < real_batch_num; ++i) + { + this->softmax_op_.Inplace_Run(predict_batch.begin() + i * class_num, predict_batch.begin() + (i + 1) * class_num); + } + + auto inference_end = std::chrono::steady_clock::now(); + + // output decode + auto postprocess_start = std::chrono::steady_clock::now(); + std::vector str_res; + std::vectorscores; + + for (int i = 0; i < real_batch_num; ++i) + { + int argmax_idx = int(Utility::argmax(predict_batch.begin() + i * class_num, predict_batch.begin() + (i + 1) * class_num)); + float score = predict_batch[argmax_idx]; + scores.push_back(score); + str_res.push_back(this->label_list_[argmax_idx]); + } + auto postprocess_end = std::chrono::steady_clock::now(); + for (int i = 0; i < str_res.size(); i++) + { + std::cout << frames_batch_path[i] << "\tclass: " << str_res[i] << "\tscore: " << scores[i] << endl; + } + + std::chrono::duration preprocess_diff = preprocess_end - preprocess_start; + times->push_back(double(preprocess_diff.count() * 1000)); + std::chrono::duration inference_diff = inference_end - inference_start; + times->push_back(double(inference_diff.count() * 1000)); + std::chrono::duration postprocess_diff = postprocess_end - postprocess_start; + times->push_back(double(postprocess_diff.count() * 1000)); + } + + void VideoRecognizer::LoadModel(const std::string &model_dir) + { + // AnalysisConfig config; + paddle_infer::Config config; + config.SetModel(model_dir + "/" + this->inference_model_name + ".pdmodel", + model_dir + "/" + this->inference_model_name + ".pdiparams"); + + if (this->use_gpu_) + { + config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); + if (this->use_tensorrt_) + { + auto precision = paddle_infer::Config::Precision::kFloat32; + if (this->precision_ == "fp16") + { + precision = paddle_infer::Config::Precision::kHalf; + } + else if (this->precision_ == "int8") + { + precision = 
paddle_infer::Config::Precision::kInt8; + } + + if (this->inference_model_name == "ppTSM" || this->inference_model_name == "TSM") + { + config.EnableTensorRtEngine( + 1 << 30, // workspaceSize + this->rec_batch_num * this->num_seg * 1, // maxBatchSize + 3, // minSubgraphSize + precision, // precision + false,// useStatic + false //useCalibMode + ); + } + else if(this->inference_model_name == "ppTSN" || this->inference_model_name == "TSN") + { + config.EnableTensorRtEngine( + 1 << 30, + this->rec_batch_num * this->num_seg * 10, + 3, // minSubgraphSize + precision,// precision + false,// useStatic + false //useCalibMode + ); + } + else + { + config.EnableTensorRtEngine( + 1 << 30, // workspaceSize + this->rec_batch_num, // maxBatchSize + 3, // minSubgraphSize + precision,// precision + false,// useStatic + false //useCalibMode + ); + } + + std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::endl; + + /* some model dose not suppport dynamic shape with TRT, deactivate it by default */ + + // std::map > min_input_shape = + // { + // {"data_batch_0", {1, this->num_seg, 3, 1, 1}} + // }; + // std::map > max_input_shape = + // { + // {"data_batch_0", {1, this->num_seg, 3, 256, 256}} + // }; + // std::map > opt_input_shape = + // { + // {"data_batch_0", {this->rec_batch_num, this->num_seg, 3, 224, 224}} + // }; + + // config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + // opt_input_shape); + } + } + else + { + config.DisableGpu(); + if (this->use_mkldnn_) + { + config.EnableMKLDNN(); + // cache 10 different shapes for mkldnn to avoid memory leak + config.SetMkldnnCacheCapacity(10); + } + config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); + } + + config.SwitchUseFeedFetchOps(false); + // true for multiple input + config.SwitchSpecifyInputNames(true); + + config.SwitchIrOptim(true); + + config.EnableMemoryOptim(); + config.DisableGlogInfo(); + + this->predictor_ = CreatePredictor(config); + } + +} // namespace PaddleVideo diff --git a/docs/src/deploy/cpp_infer/tools/build.sh b/docs/src/deploy/cpp_infer/tools/build.sh new file mode 100644 index 000000000..c04ede091 --- /dev/null +++ b/docs/src/deploy/cpp_infer/tools/build.sh @@ -0,0 +1,22 @@ +OPENCV_DIR=your_opencv_dir +LIB_DIR=your_paddle_inference_dir +CUDA_LIB_DIR=your_cuda_lib_dir +CUDNN_LIB_DIR=your_cudnn_lib_dir +TENSORRT_DIR=your_tensorRT_dir + +BUILD_DIR=build +rm -rf ${BUILD_DIR} +mkdir ${BUILD_DIR} +cd ${BUILD_DIR} +cmake .. 
\ + -DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=ON \ + -DWITH_GPU=OFF \ + -DWITH_STATIC_LIB=OFF \ + -DWITH_TENSORRT=OFF \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DCUDNN_LIB=${CUDNN_LIB_DIR} \ + -DCUDA_LIB=${CUDA_LIB_DIR} \ + -DTENSORRT_DIR=${TENSORRT_DIR} \ + +make -j diff --git a/docs/src/deploy/cpp_serving/paddle_env_install.sh b/docs/src/deploy/cpp_serving/paddle_env_install.sh new file mode 100644 index 000000000..3f062027b --- /dev/null +++ b/docs/src/deploy/cpp_serving/paddle_env_install.sh @@ -0,0 +1,35 @@ +unset GREP_OPTIONS + +function install_trt(){ + CUDA_VERSION=$(nvcc --version | egrep -o "V[0-9]+.[0-9]+" | cut -c2-) + if [ $CUDA_VERSION == "10.2" ]; then + wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT6-cuda10.2-cudnn7.tar.gz --no-check-certificate + tar -zxf TensorRT6-cuda10.2-cudnn7.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT-6.0.1.8/include/* /usr/include/ && cp -rf /usr/local/TensorRT-6.0.1.8/lib/* /usr/lib/ + rm -rf TensorRT6-cuda10.2-cudnn7.tar.gz + elif [ $CUDA_VERSION == "11.2" ]; then + wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz --no-check-certificate + tar -zxf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT-8.0.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.0.3.4/lib/* /usr/lib/ + rm -rf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz + else + echo "No Cuda Found, no need to install TensorRT" + fi +} + +function env_install() +{ + apt install -y libcurl4-openssl-dev libbz2-dev + wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && tar xf centos_ssl.tar && rm -rf centos_ssl.tar && mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so + rm -rf /usr/local/go && wget -qO- https://paddle-ci.gz.bcebos.com/go1.15.12.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/go && \ + mkdir /root/go/bin && \ + mkdir /root/go/src && \ + echo "GOROOT=/usr/local/go" >> /root/.bashrc && \ + echo "GOPATH=/root/go" >> /root/.bashrc && \ + echo "PATH=/usr/local/go/bin:/root/go/bin:$PATH" >> /root/.bashrc + install_trt +} + +env_install diff --git a/docs/src/deploy/cpp_serving/preprocess_ops.py b/docs/src/deploy/cpp_serving/preprocess_ops.py new file mode 100644 index 000000000..88ec9d691 --- /dev/null +++ b/docs/src/deploy/cpp_serving/preprocess_ops.py @@ -0,0 +1,126 @@ +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, "../../"))) + +from paddlevideo.loader.pipelines import (CenterCrop, Image2Array, + Normalization, Sampler, Scale, + VideoDecoder, TenCrop) +import numpy as np +from typing import Dict, Tuple, List, Callable + +VALID_MODELS = ["PPTSM", "PPTSN"] + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, "../../"))) + +from paddlevideo.loader.pipelines import (CenterCrop, Image2Array, + Normalization, Sampler, Scale, + VideoDecoder, TenCrop) +import numpy as np +from typing import Dict, Tuple, List, Callable + +VALID_MODELS = ["PPTSM", "PPTSN"] + +class Compose: + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img): + for t in self.transforms: + 
img = t(img) + return img + + +def np_softmax(x: np.ndarray, axis: int = 0) -> np.ndarray: + """softmax function + + Args: + x (np.ndarray): logits + axis (int): axis + + Returns: + np.ndarray: probs + """ + x -= np.max(x, axis=axis, keepdims=True) + x = np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True) + return x + + +def preprocess_PPTSM(video_path: str) -> Tuple[Dict[str, np.ndarray], List]: + """preprocess + + Args: + video_path (str): input video path + + Returns: + Tuple[Dict[str, np.ndarray], List]: feed and fetch + """ + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + seq = Compose([ + VideoDecoder(), + Sampler(8, 1, valid_mode=True), + Scale(256), + CenterCrop(224), + Image2Array(), + Normalization(img_mean, img_std) + ]) + results = {"filename": video_path} + results = seq(results) + tmp_inp = np.expand_dims(results["imgs"], axis=0) # [b,t,c,h,w] + tmp_inp = np.expand_dims(tmp_inp, axis=0) # [1,b,t,c,h,w] + feed = {"data_batch_0": tmp_inp} + fetch = ["outputs"] + return feed, fetch + + +def preprocess_PPTSN(video_path: str) -> Tuple[Dict[str, np.ndarray], List]: + """preprocess + + Args: + video_path (str): input video path + + Returns: + Tuple[Dict[str, np.ndarray], List]: feed and fetch + """ + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + seq = Compose([ + VideoDecoder(), + Sampler(25, 1, valid_mode=True, select_left=True), + Scale(256, fixed_ratio=True, do_round=True, backend='cv2'), + TenCrop(224), + Image2Array(), + Normalization(img_mean, img_std) + ]) + results = {"filename": video_path} + results = seq(results) + tmp_inp = np.expand_dims(results["imgs"], axis=0) # [b,t,c,h,w] + tmp_inp = np.expand_dims(tmp_inp, axis=0) # [1,b,t,c,h,w] + feed = {"data_batch_0": tmp_inp} + fetch = ["outputs"] + return feed, fetch + + +def get_preprocess_func(model_name: str) -> Callable: + """get preprocess function by model_name + + Args: + model_name (str): model's name, must in `VALID_MODELS` + + + Returns: + Callable: preprocess function corresponding to model name + """ + if model_name == "PPTSM": + return preprocess_PPTSM + elif model_name == "PPTSN": + return preprocess_PPTSN + else: + raise ValueError( + f"model_name must in {VALID_MODELS}, but got model_name") diff --git a/docs/src/deploy/cpp_serving/readme.md b/docs/src/deploy/cpp_serving/readme.md new file mode 100644 index 000000000..0172e7260 --- /dev/null +++ b/docs/src/deploy/cpp_serving/readme.md @@ -0,0 +1,164 @@ +简体中文 | [English](./readme_en.md) +# 模型服务化部署 + +## 简介 + +[Paddle Serving](https://github.com/PaddlePaddle/Serving) 旨在帮助深度学习开发者轻松部署在线预测服务,支持一键部署工业级的服务能力、客户端和服务端之间高并发和高效通信、并支持多种编程语言开发客户端。 + +该部分以 HTTP 预测服务部署为例,介绍怎样在 PaddleVideo 中使用 PaddleServing 部署模型服务。目前只支持 Linux 平台部署,暂不支持 Windows 平台。 + +## Serving 安装 +Serving 官网推荐使用 docker 安装并部署 Serving 环境。首先需要拉取 docker 环境并创建基于 Serving 的 docker。 + +```bash +# 启动GPU docker +docker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel +nvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash +nvidia-docker exec -it test bash + +# 启动CPU docker +docker pull paddlepaddle/serving:0.7.0-devel +docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash +docker exec -it test bash +``` + +进入 docker 后,需要安装 Serving 相关的 python 包。 +```bash +python3.7 -m pip install paddle-serving-client==0.7.0 +python3.7 -m pip install paddle-serving-app==0.7.0 + +#若为CPU部署环境: +python3.7 -m pip install paddle-serving-server==0.7.0 # CPU +python3.7 -m pip install paddlepaddle==2.2.0 # CPU + 
+#若为GPU部署环境 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6 +python3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2 + +#其他GPU环境需要确认环境再选择执行哪一条 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8 +``` + +* 如果安装速度太慢,可以通过 `-i https://pypi.tuna.tsinghua.edu.cn/simple` 更换源,加速安装过程。 + +* 更多环境和对应的安装包详见:https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md + +## 行为识别服务部署 +### 模型转换 +使用 PaddleServing 做服务化部署时,需要将保存的 inference 模型转换为 Serving 模型。下面以 PP-TSM 模型为例,介绍如何部署行为识别服务。 +- 下载 PP-TSM 推理模型并转换为 Serving 模型: + + ```bash + # 进入PaddleVideo目录 + cd PaddleVideo + # 下载推理模型并解压到./inference下 + mkdir ./inference + pushd ./inference + wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip + unzip ppTSM.zip + popd + + # 转换成 Serving 模型 + pushd deploy/cpp_serving + python3.7 -m paddle_serving_client.convert \ + --dirname ../../inference/ppTSM \ + --model_filename ppTSM.pdmodel \ + --params_filename ppTSM.pdiparams \ + --serving_server ./ppTSM_serving_server \ + --serving_client ./ppTSM_serving_client + popd + ``` + + | 参数 | 类型 | 默认值 | 描述 | + | ----------------- | ---- | ------------------ | ------------------------------------------------------------ | + | `dirname` | str | - | 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。 | + | `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 | + | `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保>存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None | + | `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server | + | `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client | + +- 推理模型转换完成后,会在`deploy/cpp_serving`文件夹下生成 `ppTSM_serving_client` 和 `ppTSM_serving_server` 两个文件夹,具备如下格式: + ```bash + PaddleVideo/deploy/cpp_serving + ├── ppTSM_serving_client + │ ├── serving_client_conf.prototxt + │ └── serving_client_conf.stream.prototxt + └── ppTSM_serving_server + ├── ppTSM.pdiparams + ├── ppTSM.pdmodel + ├── serving_server_conf.prototxt + └── serving_server_conf.stream.prototxt + ``` + 得到模型文件之后,需要分别修改 `ppTSM_serving_client` 下的 `serving_client_conf.prototxt` 和 `ppTSM_serving_server` 下的 `serving_server_conf.prototxt`,将两份文件中`fetch_var` 下的 `alias_name` 均改为 `outputs` + + **备注**: Serving 为了兼容不同模型的部署,提供了输入输出重命名的功能。这样,不同的模型在推理部署时,只需要修改配置文件的`alias_name`即可,无需修改代码即可完成推理部署。 + 修改后的`serving_server_conf.prototxt`如下所示: + + ```yaml + feed_var { + name: "data_batch_0" + alias_name: "data_batch_0" + is_lod_tensor: false + feed_type: 1 + shape: 8 + shape: 3 + shape: 224 + shape: 224 + } + fetch_var { + name: "linear_2.tmp_1" + alias_name: "outputs" + is_lod_tensor: false + fetch_type: 1 + shape: 400 + } + ``` +### 服务部署和请求 +`cpp_serving` 目录包含了启动 pipeline 服务、C++ serving服务和发送预测请求的代码,具体包括: + ```bash + run_cpp_serving.sh # 启动C++ serving server端的脚本 + pipeline_http_client.py # client端发送数据并获取预测结果的脚本 + paddle_env_install.sh # 安装C++ serving环境脚本 + preprocess_ops.py # 存放预处理函数的文件 + ``` +#### C++ Serving +- 进入工作目录: + ```bash + cd deploy/cpp_serving + ``` + +- 启动服务: + ```bash + # 在后台启动,过程中打印输出的日志会重定向保存到nohup.txt中,可以使用tailf nohup.txt查看输出 + bash run_cpp_serving.sh + ``` + +- 发送请求并获取结果: + ```bash + python3.7 serving_client.py \ + -n PPTSM \ + -c ./ppTSM_serving_client/serving_client_conf.prototxt \ + --input_file=../../data/example.avi + ``` +成功运行后,模型预测的结果会打印在 cmd 窗口中,结果如下: + 
+ ```bash + I0510 04:33:00.110025 37097 naming_service_thread.cpp:202] brpc::policy::ListNamingService("127.0.0.1:9993"): added 1 + I0510 04:33:01.904764 37097 general_model.cpp:490] [client]logid=0,client_cost=1640.96ms,server_cost=1623.21ms. + {'class_id': '[5]', 'prob': '[0.9907387495040894]'} + ``` +**如果过程中报错显示找不到libnvinfer.so.6,可以执行脚本`paddle_env_install.sh`安装相关环境** + ```bash + bash paddle_env_install.sh + ``` + + +## FAQ +**Q1**: 发送请求后没有结果返回或者提示输出解码报错 + +**A1**: 启动服务和发送请求时不要设置代理,可以在启动服务前和发送请求前关闭代理,关闭代理的命令是: +``` +unset https_proxy +unset http_proxy +``` diff --git a/docs/src/deploy/cpp_serving/readme_en.md b/docs/src/deploy/cpp_serving/readme_en.md new file mode 100644 index 000000000..c731ec00b --- /dev/null +++ b/docs/src/deploy/cpp_serving/readme_en.md @@ -0,0 +1,165 @@ +English | [简体中文](./readme.md) + +# Model service deployment + +## Introduction + +[Paddle Serving](https://github.com/PaddlePaddle/Serving) aims to help deep learning developers easily deploy online prediction services, support one-click deployment of industrial-grade service capabilities, high concurrency between client and server Efficient communication and support for developing clients in multiple programming languages. + +This section takes the HTTP prediction service deployment as an example to introduce how to use PaddleServing to deploy the model service in PaddleVideo. Currently, only Linux platform deployment is supported, and Windows platform is not currently supported. + +## Serving installation +The Serving official website recommends using docker to install and deploy the Serving environment. First, you need to pull the docker environment and create a Serving-based docker. + +```bash +# start GPU docker +docker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel +nvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash +nvidia-docker exec -it test bash + +# start CPU docker +docker pull paddlepaddle/serving:0.7.0-devel +docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash +docker exec -it test bash +``` + +After entering docker, you need to install Serving-related python packages. +```bash +python3.7 -m pip install paddle-serving-client==0.7.0 +python3.7 -m pip install paddle-serving-app==0.7.0 + +#If it is a CPU deployment environment: +python3.7 -m pip install paddle-serving-server==0.7.0 #CPU +python3.7 -m pip install paddlepaddle==2.2.0 # CPU + +#If it is a GPU deployment environment +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6 +python3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2 + +#Other GPU environments need to confirm the environment and then choose which one to execute +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8 +``` + +* If the installation speed is too slow, you can change the source through `-i https://pypi.tuna.tsinghua.edu.cn/simple` to speed up the installation process. + +* For more environment and corresponding installation packages, see: https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md + +## Action recognition service deployment +### Model conversion +When using PaddleServing for service deployment, you need to convert the saved inference model into a Serving model. 
The following uses the PP-TSM model as an example to introduce how to deploy the action recognition service. +- Download PP-TSM inference model and convert to Serving model: + ```bash + # Enter PaddleVideo directory + cd PaddleVideo + + # Download the inference model and extract it to ./inference + mkdir ./inference + pushd ./inference + wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip + unzip ppTSM.zip + popd + + # Convert to Serving model + pushd deploy/cpp_serving + python3.7 -m paddle_serving_client.convert \ + --dirname ../../inference/ppTSM \ + --model_filename ppTSM.pdmodel \ + --params_filename ppTSM.pdiparams \ + --serving_server ./ppTSM_serving_server \ + --serving_client ./ppTSM_serving_client + popd + ``` + + | parameter | type | default value | description | + | ----------------- | ---- | ------------------ | ------- -------------------------------------------------- --- | + | `dirname` | str | - | The storage path of the model file to be converted. The program structure file and parameter file are saved in this directory. | + | `model_filename` | str | None | The name of the file storing the model Inference Program structure that needs to be converted. If set to None, use `__model__` as the default filename | + | `params_filename` | str | None | File name where all parameters of the model to be converted are stored. It needs to be specified if and only if all model parameters are stored in a single binary file. If the model parameters are stored in separate files, set it to None | + | `serving_server` | str | `"serving_server"` | The storage path of the converted model files and configuration files. Default is serving_server | + | `serving_client` | str | `"serving_client"` | The converted client configuration file storage path. Default is serving_client | + +- After the inference model conversion is completed, two folders, `ppTSM_serving_client` and `ppTSM_serving_server` will be generated under the `deploy/cpp_serving` folder, with the following formats: + ```bash + PaddleVideo/deploy/cpp_serving + ├── ppTSM_serving_client + │ ├── serving_client_conf.prototxt + │ └── serving_client_conf.stream.prototxt + └── ppTSM_serving_server + ├── ppTSM.pdiparams + ├── ppTSM.pdmodel + ├── serving_server_conf.prototxt + └── serving_server_conf.stream.prototxt + ``` + After getting the model file, you need to modify `serving_client_conf.prototxt` under `ppTSM_serving_client` and `serving_server_conf.prototxt` under `ppTSM_serving_server` respectively, and change `alias_name` under `fetch_var` in both files to `outputs` + + **Remarks**: In order to be compatible with the deployment of different models, Serving provides the function of input and output renaming. In this way, when different models are inferred and deployed, they only need to modify the `alias_name` of the configuration file, and the inference deployment can be completed without modifying the code. 
+ The modified `serving_server_conf.prototxt` looks like this: + + ```yaml + feed_var { + name: "data_batch_0" + alias_name: "data_batch_0" + is_lod_tensor: false + feed_type: 1 + shape: 8 + shape: 3 + shape: 224 + shape: 224 + } + fetch_var { + name: "linear_2.tmp_1" + alias_name: "outputs" + is_lod_tensor: false + fetch_type: 1 + shape: 400 + } + ``` +### Service deployment and requests +The `cpp_serving` directory contains the code for starting the pipeline service, the C++ serving service and sending the prediction request, including: + ```bash + run_cpp_serving.sh # Start the script on the C++ serving server side + pipeline_http_client.py # The script on the client side to send data and get the prediction results + paddle_env_install.sh # Install C++ serving environment script + preprocess_ops.py # file to store preprocessing functions + ``` +#### C++ Serving +- Go to the working directory: + ```bash + cd deploy/cpp_serving + ``` + +- Start the service: + ```bash + # Start in the background, the logs printed during the process will be redirected and saved to nohup.txt + bash run_cpp_serving.sh + ``` + +- Send the request and get the result: +```bash +python3.7 serving_client.py \ +-n PPTSM \ +-c ./ppTSM_serving_client/serving_client_conf.prototxt \ +--input_file=../../data/example.avi +``` +After a successful run, the results of the model prediction will be printed in the cmd window, and the results are as follows: + + ```bash + I0510 04:33:00.110025 37097 naming_service_thread.cpp:202] brpc::policy::ListNamingService("127.0.0.1:9993"): added 1 + I0510 04:33:01.904764 37097 general_model.cpp:490] [client]logid=0,client_cost=1640.96ms,server_cost=1623.21ms. + {'class_id': '[5]', 'prob': '[0.9907387495040894]'} + ``` +**If an error is reported during the process and it shows that libnvinfer.so.6 cannot be found, you can execute the script `paddle_env_install.sh` to install the relevant environment** + ```bash + bash paddle_env_install.sh + ``` + + +## FAQ +**Q1**: No result is returned after the request is sent or an output decoding error is prompted + +**A1**: Do not set the proxy when starting the service and sending the request. You can close the proxy before starting the service and sending the request. The command to close the proxy is: +``` +unset https_proxy +unset http_proxy +``` diff --git a/docs/src/deploy/cpp_serving/run_cpp_serving.sh b/docs/src/deploy/cpp_serving/run_cpp_serving.sh new file mode 100644 index 000000000..24cf7cfe4 --- /dev/null +++ b/docs/src/deploy/cpp_serving/run_cpp_serving.sh @@ -0,0 +1,10 @@ +## sample script +# run paddlevideo server with PP-TSM: +nohup python3.7 -m paddle_serving_server.serve \ +--model ./ppTSM_serving_server \ +--port 9993 & + +## run paddlevideo server with PP-TSN: +# nohup python3.7 -m paddle_serving_server.serve \ +# --model ./ppTSN_serving_server \ +# --port 9993 & diff --git a/docs/src/deploy/cpp_serving/serving_client.py b/docs/src/deploy/cpp_serving/serving_client.py new file mode 100644 index 000000000..3917d9316 --- /dev/null +++ b/docs/src/deploy/cpp_serving/serving_client.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from typing import Any, Dict + +import numpy as np +from paddle_serving_client import Client + +from preprocess_ops import get_preprocess_func, np_softmax + + +def postprocess(fetch_map: Dict[str, np.ndarray]) -> Dict[str, Any]: + """postprocess + + Args: + fetch_map (Dict[str, np.ndarray]): raw prediction + + Returns: + Dict[str, Any]: postprocessed prediction + """ + score_list = fetch_map["outputs"] # [b,num_classes] + fetch_dict = {"class_id": [], "prob": []} + for score in score_list: + score = np_softmax(score, axis=0) + score = score.tolist() + max_score = max(score) + fetch_dict["class_id"].append(score.index(max_score)) + fetch_dict["prob"].append(max_score) + + fetch_dict["class_id"] = str(fetch_dict["class_id"]) + fetch_dict["prob"] = str(fetch_dict["prob"]) + return fetch_dict + + +def parse_args(): + # general params + parser = argparse.ArgumentParser("PaddleVideo CPP Serving model script") + parser.add_argument("-n", + "--name", + type=str, + default="PPTSM", + help="model's name, such as PPTSM, PPTSN...") + parser.add_argument( + "-c", + "--config", + type=str, + help="serving client config file(serving_client_conf.prototxt) path") + parser.add_argument("--url", + type=str, + default="127.0.0.1:9993", + help="url to access cpp serving") + parser.add_argument("--logid", type=int, default="10000", help="log id") + parser.add_argument("--input_file", + type=str, + default="../../data/example.avi", + help="input video file") + return parser.parse_args() + + +if __name__ == "__main__": + # parse args + args = parse_args() + url = args.url + logid = args.logid + input_file_path = args.input_file + model_name = args.name + + # get preprocess by model name + preprocess = get_preprocess_func(model_name) + + # initialize client object & connect + client = Client() + client.load_client_config(args.config) + client.connect([url]) + + # preprocess + feed, fetch = preprocess(input_file_path) + + # send data & get prediction from server + fetch_map = client.predict(feed=feed, fetch=fetch) + + # postprocess & output + result = postprocess(fetch_map) + print(result) diff --git a/docs/src/deploy/paddle2onnx/predict_onnx.py b/docs/src/deploy/paddle2onnx/predict_onnx.py new file mode 100644 index 000000000..47a223cd4 --- /dev/null +++ b/docs/src/deploy/paddle2onnx/predict_onnx.py @@ -0,0 +1,171 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
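+#
+# Usage sketch (paths as documented in deploy/paddle2onnx/readme.md; adjust the
+# config/model paths if your setup differs):
+#   python3.7 deploy/paddle2onnx/predict_onnx.py \
+#       --input_file data/example.avi \
+#       --config configs/recognition/pptsn/pptsn_k400_videos.yaml \
+#       --onnx_file=./inference/ppTSN/ppTSN.onnx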
+ +import argparse +import os +import sys +from os import path as osp + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../tools'))) + +from utils import build_inference_helper, get_config + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument("-i", "--input_file", type=str, help="input file path") + parser.add_argument("--onnx_file", type=str, help="onnx model file path") + + # params for onnx predict + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--use_gpu", + type=str2bool, + default=False, + help="set to False when using onnx") + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--enable_benchmark", + type=str2bool, + default=False, + help="set to False when using onnx") + parser.add_argument("--cpu_threads", type=int, default=4) + + return parser.parse_args() + + +def create_onnx_predictor(args, cfg=None): + import onnxruntime as ort + onnx_file = args.onnx_file + config = ort.SessionOptions() + if args.use_gpu: + raise ValueError( + "onnx inference now only supports cpu! please set `use_gpu` to False." + ) + else: + config.intra_op_num_threads = args.cpu_threads + if args.ir_optim: + config.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + predictor = ort.InferenceSession(onnx_file, sess_options=config) + return config, predictor + + +def parse_file_paths(input_path: str) -> list: + if osp.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [osp.join(input_path, file) for file in files] + return files + + +def main(): + """predict using onnx model + """ + args = parse_args() + cfg = get_config(args.config, show=False) + + model_name = cfg.model_name + + print(f"Inference model({model_name})...") + InferenceHelper = build_inference_helper(cfg.INFERENCE) + + inference_config, predictor = create_onnx_predictor(args) + + # get input_tensor and output_tensor + input_names = predictor.get_inputs()[0].name + output_names = predictor.get_outputs()[0].name + + # get the absolute file path(s) to be processed + files = parse_file_paths(args.input_file) + if args.enable_benchmark: + test_video_num = 12 + num_warmup = 3 + # instantiate auto log + try: + import auto_log + except ImportError as e: + print(f"{e}, [git+https://github.com/LDOUBLEV/AutoLog] " + f"package and it's dependencies is required for " + f"python-inference when enable_benchmark=True.") + pid = os.getpid() + autolog = auto_log.AutoLogger( + model_name=cfg.model_name, + model_precision=args.precision, + batch_size=args.batch_size, + data_shape="dynamic", + save_path="./output/auto_log.lpg", + inference_config=inference_config, + pids=pid, + process_name=None, + gpu_ids=None, + time_keys=['preprocess_time', 'inference_time', 'postprocess_time'], + warmup=num_warmup) + files = [args.input_file for _ in range(test_video_num + num_warmup)] + + # Inferencing process + batch_num = args.batch_size + for st_idx in range(0, len(files), batch_num): + ed_idx = min(st_idx + batch_num, len(files)) + + # auto log start + if 
args.enable_benchmark: + autolog.times.start() + + # Pre process batched input + batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx]) + + # get pre process time cost + if args.enable_benchmark: + autolog.times.stamp() + + # run inference + batched_outputs = predictor.run( + output_names=[output_names], + input_feed={input_names: batched_inputs[0]}) + + # get inference process time cost + if args.enable_benchmark: + autolog.times.stamp() + + InferenceHelper.postprocess(batched_outputs, not args.enable_benchmark) + + # get post process time cost + if args.enable_benchmark: + autolog.times.end(stamp=True) + + # time.sleep(0.01) # sleep for T4 GPU + + # report benchmark log if enabled + if args.enable_benchmark: + autolog.report() + + +if __name__ == "__main__": + main() diff --git a/docs/src/deploy/paddle2onnx/readme.md b/docs/src/deploy/paddle2onnx/readme.md new file mode 100644 index 000000000..04bb9e77d --- /dev/null +++ b/docs/src/deploy/paddle2onnx/readme.md @@ -0,0 +1,70 @@ +# paddle2onnx 模型转化与预测 + +本章节介绍 PP-TSN 模型如何转化为 ONNX 模型,并基于 ONNX 引擎预测。 + +## 1. 环境准备 + +需要准备 Paddle2ONNX 模型转化环境,和 ONNX 模型预测环境。 + +Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式,算子目前稳定支持导出 ONNX Opset 9~11,部分Paddle算子支持更低的ONNX Opset转换。 +更多细节可参考 [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md) + +- 安装 Paddle2ONNX +```bash +python3.7 -m pip install paddle2onnx +``` + +- 安装 ONNXRuntime +```bash +# 建议安装 1.9.0 版本,可根据环境更换版本号 +python3.7 -m pip install onnxruntime==1.9.0 +``` + +## 2. 模型转换 + +- PP-TSN inference模型下载 + + ```bash + # 下载inference模型到PaddleVideo/inference/ppTSN/ 目录下 + mkdir -p ./inference + wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip + + # 解压inference模型 + pushd ./inference + unzip ppTSN.zip + popd + ``` + +- 模型转换 + + 使用 Paddle2ONNX 将 Paddle inference模型转换为 ONNX 格式模型: + + ```bash + paddle2onnx \ + --model_dir=./inference/ppTSN \ + --model_filename=ppTSN.pdmodel \ + --params_filename=ppTSN.pdiparams \ + --save_file=./inference/ppTSN/ppTSN.onnx \ + --opset_version=10 \ + --enable_onnx_checker=True + ``` +执行完毕后,可以发现 `./inference/ppTSN` 目录下生成了一个 ONNX 格式的模型文件 `ppTSN.onnx` + +## 3. onnx 预测 + +接下来就可以用 ONNX 格式模型进行预测,其用法与paddle 预测模型类似 +执行如下命令: +```bash +python3.7 deploy/paddle2onnx/predict_onnx.py \ +--input_file data/example.avi \ +--config configs/recognition/pptsn/pptsn_k400_videos.yaml \ +--onnx_file=./inference/ppTSN/ppTSN.onnx +``` + +结果如下: +```bash +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.9998553991317749 +``` +可以验证该结果与Paddle inference的预测结果完全一致 diff --git a/docs/src/deploy/paddle2onnx/readme_en.md b/docs/src/deploy/paddle2onnx/readme_en.md new file mode 100644 index 000000000..6fe67726f --- /dev/null +++ b/docs/src/deploy/paddle2onnx/readme_en.md @@ -0,0 +1,70 @@ +# paddle2onnx model conversion and prediction + +This chapter describes how the PP-TSN model is transformed into an ONNX model and predicted based on the ONNX engine. + +## 1. Environment preparation + +Need to prepare Paddle2ONNX model conversion environment, and ONNX model prediction environment. + +Paddle2ONNX supports converting the PaddlePaddle model format to the ONNX model format. The operator currently supports exporting ONNX Opset 9~11 stably, and some Paddle operators support lower ONNX Opset conversion. 
+For more details, please refer to [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md) + +- Install Paddle2ONNX +```bash +python3.7 -m pip install paddle2onnx +``` + +- Install ONNXRuntime +```bash +# It is recommended to install version 1.9.0, and the version number can be changed according to the environment +python3.7 -m pip install onnxruntime==1.9.0 +``` + +## 2. Model conversion + +- PP-TSN inference model download + + ```bash + # Download the inference model to the PaddleVideo/inference/ppTSN/ directory + mkdir -p ./inference + wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip + + # Decompress the inference model + pushd ./inference + unzip ppTSN.zip + popd + ``` + +- Model conversion + + Convert Paddle inference models to ONNX format models using Paddle2ONNX: + + ```bash + paddle2onnx \ + --model_dir=./inference/ppTSN \ + --model_filename=ppTSN.pdmodel \ + --params_filename=ppTSN.pdiparams \ + --save_file=./inference/ppTSN/ppTSN.onnx \ + --opset_version=10 \ + --enable_onnx_checker=True + ``` +After execution, you can find that a model file `ppTSN.onnx` in ONNX format is generated in the `./inference/ppTSN` directory + +## 3. onnx prediction + +Next, you can use the ONNX format model for prediction, which is similar to the paddle prediction model +Execute the following command: +```bash +python3.7 deploy/paddle2onnx/predict_onnx.py \ +--input_file data/example.avi \ +--config configs/recognition/pptsn/pptsn_k400_videos.yaml \ +--onnx_file=./inference/ppTSN/ppTSN.onnx +``` + +The result is as follows: +```bash +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.9998553991317749 +``` +It can be verified that the result is completely consistent with the prediction result of Paddle inference diff --git a/docs/src/deploy/python_serving/pipeline_http_client.py b/docs/src/deploy/python_serving/pipeline_http_client.py new file mode 100644 index 000000000..5a604b622 --- /dev/null +++ b/docs/src/deploy/python_serving/pipeline_http_client.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
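+#
+# Usage sketch (as documented in deploy/python_serving/readme.md):
+#   python3.7 pipeline_http_client.py -i ../../data/example.avi
+# Each request posted below is a JSON dict with keys "frames" (a base64-encoded
+# ndarray of decoded video frames) and "frames_shape" (that ndarray's shape as a
+# string), sent to http://127.0.0.1:<port_number>/video/prediction.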
+ +import argparse +import json + +import requests + +from utils import numpy_to_base64, parse_file_paths, video_to_numpy + + +def parse_args(): + # general params + parser = argparse.ArgumentParser("PaddleVideo Web Serving model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/PP-TSM.yaml', + help='serving config file path') + parser.add_argument('-ptn', + '--port_number', + type=int, + default=18080, + help='http port number') + parser.add_argument('-i', + '--input_file', + type=str, + help='input file path or directory path') + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + url = f"http://127.0.0.1:{args.port_number}/video/prediction" + + files_list = parse_file_paths(args.input_file) + + for file_path in files_list: + # decoding video and get stacked frames as ndarray + decoded_frames = video_to_numpy(file_path=file_path) + + # encode ndarray to base64 string for transportation. + decoded_frames_base64 = numpy_to_base64(decoded_frames) + + # generate dict & convert to json. + data = { + "key": ["frames", "frames_shape"], + "value": [decoded_frames_base64, + str(decoded_frames.shape)] + } + data = json.dumps(data) + + # transport to server & get get results. + r = requests.post(url=url, data=data, timeout=100) + + # print result + print(r.json()) diff --git a/docs/src/deploy/python_serving/pipeline_rpc_client.py b/docs/src/deploy/python_serving/pipeline_rpc_client.py new file mode 100644 index 000000000..809c45d0f --- /dev/null +++ b/docs/src/deploy/python_serving/pipeline_rpc_client.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from paddle_serving_server_gpu.pipeline import PipelineClient +except ImportError: + from paddle_serving_server.pipeline import PipelineClient + +import argparse + +from utils import numpy_to_base64, parse_file_paths, video_to_numpy + + +def parse_args(): + # general params + parser = argparse.ArgumentParser("PaddleVideo Web Serving model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/PP-TSM.yaml', + help='serving config file path') + parser.add_argument('-ptn', + '--port_number', + type=int, + default=9993, + help='rpc port number') + parser.add_argument('-i', + '--input_file', + type=str, + help='input file path or directory path') + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + client = PipelineClient() + client.connect([f'127.0.0.1:{args.port_number}']) + + files_list = parse_file_paths(args.input_file) + + for file_path in files_list: + # decoding video and get stacked frames as ndarray + decoded_frames = video_to_numpy(file_path=file_path) + + # encode ndarray to base64 string for transportation. + decoded_frames_base64 = numpy_to_base64(decoded_frames) + + # transport to server & get get results. 
+ ret = client.predict(feed_dict={ + "frames": decoded_frames_base64, + "frames_shape": str(decoded_frames.shape) + }, + fetch=["label", "prob"]) + + # print result + print(ret) diff --git a/docs/src/deploy/python_serving/readme.md b/docs/src/deploy/python_serving/readme.md new file mode 100644 index 000000000..6d00b48c5 --- /dev/null +++ b/docs/src/deploy/python_serving/readme.md @@ -0,0 +1,185 @@ +简体中文 | [English](./readme_en.md) +# 模型服务化部署 + +## 简介 + +[Paddle Serving](https://github.com/PaddlePaddle/Serving) 旨在帮助深度学习开发者轻松部署在线预测服务,支持一键部署工业级的服务能力、客户端和服务端之间高并发和高效通信、并支持多种编程语言开发客户端。 + +该部分以 HTTP 预测服务部署为例,介绍怎样在 PaddleVideo 中使用 PaddleServing 部署模型服务。目前只支持 Linux 平台部署,暂不支持 Windows 平台。 + +## Serving 安装 +Serving 官网推荐使用 docker 安装并部署 Serving 环境。首先需要拉取 docker 环境并创建基于 Serving 的 docker。 + +```bash +# 启动GPU docker +docker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel +nvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash +nvidia-docker exec -it test bash + +# 启动CPU docker +docker pull paddlepaddle/serving:0.7.0-devel +docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash +docker exec -it test bash +``` + +进入 docker 后,需要安装 Serving 相关的 python 包。 +```bash +python3.7 -m pip install paddle-serving-client==0.7.0 +python3.7 -m pip install paddle-serving-app==0.7.0 +python3.7 -m pip install faiss-cpu==1.7.1post2 + +#若为CPU部署环境: +python3.7 -m pip install paddle-serving-server==0.7.0 # CPU +python3.7 -m pip install paddlepaddle==2.2.0 # CPU + +#若为GPU部署环境 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6 +python3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2 + +#其他GPU环境需要确认环境再选择执行哪一条 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8 +``` + +* 如果安装速度太慢,可以通过 `-i https://pypi.tuna.tsinghua.edu.cn/simple` 更换源,加速安装过程 +* 更多环境和对应的安装包详见:https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md + +## 行为识别服务部署 +### 模型转换 +使用 PaddleServing 做服务化部署时,需要将保存的 inference 模型转换为 Serving 模型。下面以 PP-TSM 模型为例,介绍如何部署行为识别服务。 +- 下载训练好的 PP-TSM 的模型,并转化为推理模型: + ```bash + # 进入PaddleVideo目录 + cd PaddleVideo + + wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams + + python3.7 tools/export_model.py \ + -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \ + -p data/ppTSM_k400_uniform.pdparams \ + -o inference/ppTSM + ``` + +- 我们也提供了转换好的推理模型,按以下命令下载并解压 + ```bash + mkdir ./inference + wget -nc -P ./inference https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate + pushd ./inference + unzip ppTSM.zip + popd + ``` +- 用 paddle_serving_client 把转换好的推理模型再转换成易于 Server 部署的模型格式: + ```bash + python3.7 -m paddle_serving_client.convert \ + --dirname inference/ppTSM \ + --model_filename ppTSM.pdmodel \ + --params_filename ppTSM.pdiparams \ + --serving_server ./deploy/python_serving/ppTSM_serving_server/ \ + --serving_client ./deploy/python_serving/ppTSM_serving_client/ + ``` + | 参数 | 类型 | 默认值 | 描述 | + | ----------------- | ---- | ------------------ | ------------------------------------------------------------ | + | `dirname` | str | - | 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。 | + | `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 | + | `params_filename` | str | None | 
存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保>存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None | + | `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server | + | `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client | + +PP-TSM 推理模型转换完成后,会在当前文件夹多出 `ppTSM_serving_server` 和 `ppTSM_serving_client` 的文件夹,具备如下格式: + ```bash + PaddleVideo/deploy/python_serving + ├── ppTSM_serving_server + ├── ppTSM.pdiparams + ├── ppTSM.pdmodel + ├── serving_server_conf.prototxt + └── serving_server_conf.stream.prototxt + ├── ppTSM_serving_client + ├── serving_client_conf.prototxt + └── serving_client_conf.stream.prototxt + ``` +得到模型文件之后,需要分别修改 `ppTSM_serving_server` 和 `ppTSM_serving_client` 下的文件 `serving_server_conf.prototxt`,将 两份文件中`fetch_var` 下的 `alias_name` 均改为 `outputs` + +**备注**: Serving 为了兼容不同模型的部署,提供了输入输出重命名的功能。这样,不同的模型在推理部署时,只需要修改配置文件的`alias_name`即可,无需修改代码即可完成推理部署。 +修改后的`serving_server_conf.prototxt`如下所示: + +```yaml +feed_var { + name: "data_batch_0" + alias_name: "data_batch_0" + is_lod_tensor: false + feed_type: 1 + shape: 8 + shape: 3 + shape: 224 + shape: 224 +} +fetch_var { + name: "linear_2.tmp_1" + alias_name: "outputs" + is_lod_tensor: false + fetch_type: 1 + shape: 400 +} + +``` +### 服务部署和请求 +`python_serving` 目录包含了启动 pipeline 服务、C++ serving服务(TODO)和发送预测请求的代码,具体包括: +```bash +__init__.py +configs/xxx.yaml # 启动pipeline服务的配置文件 +pipeline_http_client.py # http方式发送pipeline预测请求的python脚本 +pipeline_rpc_client.py # rpc方式发送pipeline预测请求的python脚本 +recognition_web_service.py # 启动pipeline服务端的python脚本 +utils.py # 储存预测过程中常用的函数,如parse_file_paths, numpy_to_base64, video_to_numpy +``` +#### Python Serving +- 进入工作目录: +```bash +cd deploy/python_serving +``` + +- 启动服务: +```bash +# 在当前命令行窗口启动并保持在前端 +python3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml +# 在后台启动,过程中打印输出的日志会重定向保存到log.txt中 +python3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml &>log.txt & +``` + +- 发送请求: +```bash +# 以http方式的发送预测请求并接受结果 +python3.7 pipeline_http_client.py -i ../../data/example.avi + +# 以rpc方式的发送预测请求并接受结果 +python3.7 pipeline_rpc_client.py -i ../../data/example.avi +``` +成功运行后,模型预测的结果会打印在 cmd 窗口中,结果如下: + +```bash +# http方式打印的结果 +{'err_no': 0, 'err_msg': '', 'key': ['label', 'prob'], 'value': ["['archery']", '[0.9907388687133789]'], 'tensors': []} + +# rpc方式打印的结果 +PipelineClient::predict pack_data time:1645631086.764019 +PipelineClient::predict before time:1645631086.8485317 +key: "label" +key: "prob" +value: "[\'archery\']" +value: "[0.9907388687133789]" +``` + +## FAQ +**Q1**: 发送请求后没有结果返回或者提示输出解码报错 + +**A1**: 启动服务和发送请求时不要设置代理,可以在启动服务前和发送请求前关闭代理,关闭代理的命令是: +``` +unset https_proxy +unset http_proxy +``` + +**Q2**: 服务端启动后没有反应,一直停在`start proxy service`不动 + +**A2**: 很可能是启动过程中遇到了问题,可以在`./deploy/python_serving/PipelineServingLogs/pipeline.log`日志文件中查看详细报错信息 + +更多的服务部署类型,如 `RPC 预测服务` 等,可以参考 Serving 的[github 官网](https://github.com/PaddlePaddle/Serving/tree/v0.7.0/examples) diff --git a/docs/src/deploy/python_serving/readme_en.md b/docs/src/deploy/python_serving/readme_en.md new file mode 100644 index 000000000..2f7e2443f --- /dev/null +++ b/docs/src/deploy/python_serving/readme_en.md @@ -0,0 +1,185 @@ +English | [简体中文](./readme.md) +# Model service deployment + +## Introduction + +[Paddle Serving](https://github.com/PaddlePaddle/Serving) aims to help deep learning developers easily deploy online prediction services, support one-click deployment of industrial-grade service capabilities, high concurrency between client and server Efficient communication and support for developing 
clients in multiple programming languages. + +This section takes the HTTP prediction service deployment as an example to introduce how to use PaddleServing to deploy the model service in PaddleVideo. Currently, only Linux platform deployment is supported, and Windows platform is not currently supported. + +## Serving installation +The Serving official website recommends using docker to install and deploy the Serving environment. First, you need to pull the docker environment and create a Serving-based docker. + +```bash +# start GPU docker +docker pull paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel +nvidia-docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-cuda10.2-cudnn7-devel bash +nvidia-docker exec -it test bash + +# start CPU docker +docker pull paddlepaddle/serving:0.7.0-devel +docker run -p 9292:9292 --name test -dit paddlepaddle/serving:0.7.0-devel bash +docker exec -it test bash +``` + +After entering docker, you need to install Serving-related python packages. +```bash +python3.7 -m pip install paddle-serving-client==0.7.0 +python3.7 -m pip install paddle-serving-app==0.7.0 +python3.7 -m pip install faiss-cpu==1.7.1post2 + +#If it is a CPU deployment environment: +python3.7 -m pip install paddle-serving-server==0.7.0 #CPU +python3.7 -m pip install paddlepaddle==2.2.0 # CPU + +#If it is a GPU deployment environment +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post102 # GPU with CUDA10.2 + TensorRT6 +python3.7 -m pip install paddlepaddle-gpu==2.2.0 # GPU with CUDA10.2 + +#Other GPU environments need to confirm the environment and then choose which one to execute +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post101 # GPU with CUDA10.1 + TensorRT6 +python3.7 -m pip install paddle-serving-server-gpu==0.7.0.post112 # GPU with CUDA11.2 + TensorRT8 +``` + +* If the installation speed is too slow, you can change the source through `-i https://pypi.tuna.tsinghua.edu.cn/simple` to speed up the installation process +* For more environment and corresponding installation packages, see: https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md + +## Behavior recognition service deployment +### Model conversion +When using PaddleServing for service deployment, you need to convert the saved inference model into a Serving model. The following uses the PP-TSM model as an example to introduce how to deploy the behavior recognition service. 
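+Besides the command-line converter used in the steps below, the conversion can also be scripted. The sketch below is illustrative only: it assumes the `paddle_serving_client.io.inference_model_to_serving` helper is available in your installed paddle-serving-client version and that the inference model already sits in `inference/ppTSM`; verify the helper's name and signature against your version before relying on it.
+
+```python
+# Hypothetical scripted equivalent of `python3.7 -m paddle_serving_client.convert`;
+# paths mirror the CLI example in the steps below.
+import paddle_serving_client.io as serving_io
+
+feed_names, fetch_names = serving_io.inference_model_to_serving(
+    dirname="inference/ppTSM",  # exported inference model directory
+    model_filename="ppTSM.pdmodel",
+    params_filename="ppTSM.pdiparams",
+    serving_server="deploy/python_serving/ppTSM_serving_server",
+    serving_client="deploy/python_serving/ppTSM_serving_client")
+print(feed_names, fetch_names)  # feed/fetch variable names of the converted model
+```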
+- Download the trained PP-TSM model and convert it into an inference model: + ```bash + # Enter PaddleVideo directory + cd PaddleVideo + + wget -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams + + python3.7 tools/export_model.py \ + -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \ + -p data/ppTSM_k400_uniform.pdparams \ + -o inference/ppTSM + ``` + +- We also provide the converted inference model, download and unzip by the following command + ```bash + mkdir ./inference + wget -nc -P ./inference https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate + pushd ./inference + unzip ppTSM.zip + popd + ``` +- Use paddle_serving_client to convert the converted inference model into a model format that is easy for server deployment: + ```bash + python3.7 -m paddle_serving_client.convert \ + --dirname inference/ppTSM \ + --model_filename ppTSM.pdmodel \ + --params_filename ppTSM.pdiparams \ + --serving_server ./deploy/python_serving/ppTSM_serving_server/ \ + --serving_client ./deploy/python_serving/ppTSM_serving_client/ + ``` + | parameter | type | default value | description | + | ----------------- | ---- | ------------------ | ------- -------------------------------------------------- --- | + | `dirname` | str | - | The storage path of the model file to be converted. The program structure file and parameter file are saved in this directory. | + | `model_filename` | str | None | The name of the file storing the model Inference Program structure that needs to be converted. If set to None, use `__model__` as the default filename | + | `params_filename` | str | None | File name where all parameters of the model to be converted are stored. It needs to be specified if and only if all model parameters are stored in a single binary file. If the model parameters are stored in separate files, set it to None | + | `serving_server` | str | `"serving_server"` | The storage path of the converted model files and configuration files. Default is serving_server | + | `serving_client` | str | `"serving_client"` | The converted client configuration file storage path. Default is serving_client | + +After the PP-TSM inference model is converted, there will be additional folders of `ppTSM_serving_server` and `ppTSM_serving_client` in the current folder, with the following formats: + ```bash + PaddleVideo/deploy/python_serving + ├── ppTSM_serving_server + ├── ppTSM.pdiparams + ├── ppTSM.pdmodel + ├── serving_server_conf.prototxt + └── serving_server_conf.stream.prototxt + ├── ppTSM_serving_client + ├── serving_client_conf.prototxt + └── serving_client_conf.stream.prototxt + ``` +After getting the model files, you need to modify the files `serving_server_conf.prototxt` under `ppTSM_serving_server` and `ppTSM_serving_client` respectively, and change `alias_name` under `fetch_var` in both files to `outputs` + +**Remarks**: In order to be compatible with the deployment of different models, Serving provides the function of input and output renaming. In this way, when different models are inferred and deployed, they only need to modify the `alias_name` of the configuration file, and the inference deployment can be completed without modifying the code. 
+The modified `serving_server_conf.prototxt` looks like this:
+
+```yaml
+feed_var {
+  name: "data_batch_0"
+  alias_name: "data_batch_0"
+  is_lod_tensor: false
+  feed_type: 1
+  shape: 8
+  shape: 3
+  shape: 224
+  shape: 224
+}
+fetch_var {
+  name: "linear_2.tmp_1"
+  alias_name: "outputs"
+  is_lod_tensor: false
+  fetch_type: 1
+  shape: 400
+}
+
+```
+### Service deployment and requests
+The `python_serving` directory contains the code for starting the pipeline service, the C++ Serving service (TODO), and sending prediction requests, including:
+```bash
+__init__.py
+configs/xxx.yaml # configuration file used to start the pipeline service
+pipeline_http_client.py # python script for sending pipeline prediction requests via http
+pipeline_rpc_client.py # python script for sending pipeline prediction requests via rpc
+recognition_web_service.py # python script that starts the pipeline server
+utils.py # common functions used in inference, such as parse_file_paths, numpy_to_base64, video_to_numpy
+```
+#### Python Serving
+- Go to the working directory:
+```bash
+cd deploy/python_serving
+```
+
+- Start the service:
+```bash
+# Run in the foreground of the current terminal
+python3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml
+# Run in the background; the logs printed during execution are redirected and saved to log.txt
+python3.7 recognition_web_service.py -n PPTSM -c configs/PP-TSM.yaml &>log.txt &
+```
+
+- Send a request:
+```bash
+# Send a prediction request over http and receive the result
+python3.7 pipeline_http_client.py -i ../../data/example.avi
+
+# Send a prediction request over rpc and receive the result
+python3.7 pipeline_rpc_client.py -i ../../data/example.avi
+```
+After a successful run, the model prediction results are printed in the terminal, as follows:
+
+```bash
+# result printed in http mode
+{'err_no': 0, 'err_msg': '', 'key': ['label', 'prob'], 'value': ["['archery']", '[0.9907388687133789]'], 'tensors': []}
+
+# result printed in rpc mode
+PipelineClient::predict pack_data time:1645631086.764019
+PipelineClient::predict before time:1645631086.8485317
+key: "label"
+key: "prob"
+value: "[\'archery\']"
+value: "[0.9907388687133789]"
+```
+
+## FAQ
+**Q1**: No result is returned after the request is sent, or an output decoding error is reported.
+
+**A1**: Do not set a proxy when starting the service or sending requests. You can unset the proxy beforehand with:
+```
+unset https_proxy
+unset http_proxy
+```
+
+**Q2**: There is no response after the server is started, and it stays stuck at `start proxy service`.
+
+**A2**: A problem was most likely encountered during startup. You can check the detailed error message in the `./deploy/python_serving/PipelineServingLogs/pipeline.log` log file.
+
+For more service deployment types, such as the `RPC prediction service`, refer to Serving's [examples on GitHub](https://github.com/PaddlePaddle/Serving/tree/v0.7.0/examples)
diff --git a/docs/src/deploy/python_serving/recognition_web_service.py b/docs/src/deploy/python_serving/recognition_web_service.py
new file mode 100644
index 000000000..3e4143916
--- /dev/null
+++ b/docs/src/deploy/python_serving/recognition_web_service.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import base64 +import os +import sys +from typing import Callable, Dict, List + +import numpy as np + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) + +from paddle_serving_app.reader import Sequential +from paddlevideo.loader.pipelines import (CenterCrop, Image2Array, + Normalization, Sampler, Scale, + TenCrop) + +try: + from paddle_serving_server_gpu.web_service import Op, WebService +except ImportError: + from paddle_serving_server.web_service import Op, WebService + +VALID_MODELS = ["PPTSM", "PPTSN"] + + +def get_preprocess_seq(model_name: str) -> List[Callable]: + """get preprocess sequence by model name + + Args: + model_name (str): model name for web serving, such as 'PPTSM', 'PPTSN' + + Returns: + List[Callable]: preprocess operators in list. + """ + if model_name == 'PPTSM': + preprocess_seq = [ + Sampler(8, 1, valid_mode=True), + Scale(256), + CenterCrop(224), + Image2Array(), + Normalization([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ] + elif model_name == 'PPTSN': + preprocess_seq = [ + Sampler(25, 1, valid_mode=True, select_left=True), + Scale(256, fixed_ratio=True, do_round=True, backend='cv2'), + TenCrop(224), + Image2Array(), + Normalization([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ] + else: + raise ValueError( + f"model_name must in {VALID_MODELS}, but got {model_name}") + return preprocess_seq + + +def np_softmax(x: np.ndarray, axis=0) -> np.ndarray: + """softmax function + + Args: + x (np.ndarray): logits. + + Returns: + np.ndarray: probs. + """ + x -= np.max(x, axis=axis, keepdims=True) + x = np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True) + return x + + +class VideoOp(Op): + def init_op(self): + """init_op + """ + self.seq = Sequential(get_preprocess_seq(args.name)) + + self.label_dict = {} + with open("../../data/k400/Kinetics-400_label_list.txt", "r") as fin: + for line in fin: + label_ind, label_name = line.strip().split(' ') + label_ind = int(label_ind) + self.label_dict[label_ind] = label_name.strip() + + def preprocess(self, input_dicts: Dict, data_id: int, log_id: int): + """preprocess + + Args: + input_dicts (Dict): input_dicts. + data_id (int): data_id. + log_id (int): log_id. + + Returns: + output_data: data for process stage. + is_skip_process: skip process stage or not, False default + prod_errcode: None default, otherwise, product errores occured. + It is handled in the same way as exception. + prod_errinfo: "" default. 
+ """ + (_, input_dict), = input_dicts.items() + for key in input_dict.keys(): + if key == "frames": + frame_data = base64.b64decode(input_dict[key].encode('utf8')) + frame_data = np.fromstring(frame_data, np.uint8) + elif key == 'frames_shape': + shape_data = eval(input_dict[key]) + else: + raise ValueError(f"unexpected key received: {key}") + frame_data = frame_data.reshape(shape_data) + frame_len = frame_data.shape[0] + frame_data = np.split(frame_data, frame_len, axis=0) + frame_data = [frame.squeeze(0) for frame in frame_data] + results = { + 'frames': frame_data, + 'frames_len': frame_len, + 'format': 'video', + 'backend': 'cv2' + } + results = self.seq(results) + tmp_inp = np.expand_dims(results['imgs'], axis=0) # [b,t,c,h,w] + + # The input for the network is input_data[0], so need to add 1 dimension at the beginning + tmp_inp = np.expand_dims(tmp_inp, axis=0).copy() # [1,b,t,c,h,w] + return {"data_batch_0": tmp_inp}, False, None, "" + + def postprocess(self, input_dicts: Dict, fetch_dict: Dict, data_id: int, + log_id: int): + """postprocess + + Args: + input_dicts (Dict): data returned in preprocess stage, dict(for single predict) or list(for batch predict). + fetch_dict (Dict): data returned in process stage, dict(for single predict) or list(for batch predict). + data_id (int): inner unique id, increase auto. + log_id (int): logid, 0 default. + + Returns: + fetch_dict: fetch result must be dict type. + prod_errcode: None default, otherwise, product errores occured. + It is handled in the same way as exception. + prod_errinfo: "" default. + """ + score_list = fetch_dict["outputs"] + result = {"label": [], "prob": []} + for score in score_list: + score = np_softmax(score) + score = score.tolist() + max_score = max(score) + max_index = score.index(max_score) + result["label"].append(self.label_dict[max_index]) + result["prob"].append(max_score) + result["label"] = str(result["label"]) + result["prob"] = str(result["prob"]) + return result, None, "" + + +class VideoService(WebService): + def get_pipeline_response(self, read_op): + """get_pipeline_response + + Args: + read_op ([type]): [description] + + Returns: + [type]: [description] + """ + video_op = VideoOp(name="video", input_ops=[read_op]) + return video_op + + +def parse_args(): + # general params + parser = argparse.ArgumentParser("PaddleVideo Web Serving model script") + parser.add_argument( + '-n', + '--name', + type=str, + default='PPTSM', + help='model name used in web serving, such as PPTSM, PPTSN...') + + parser.add_argument('-c', + '--config', + type=str, + default='configs/PP-TSM.yaml', + help='serving config file path') + + return parser.parse_args() + + +if __name__ == '__main__': + # get args such as serving config yaml path. + args = parse_args() + + # start serving + uci_service = VideoService(name="video") + uci_service.prepare_pipeline_config(yaml_file=args.config) + uci_service.run_service() diff --git a/docs/src/deploy/python_serving/utils.py b/docs/src/deploy/python_serving/utils.py new file mode 100644 index 000000000..242b4244d --- /dev/null +++ b/docs/src/deploy/python_serving/utils.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import os +import os.path as osp + +import cv2 +import numpy as np + + +def numpy_to_base64(array: np.ndarray) -> str: + """numpy_to_base64 + + Args: + array (np.ndarray): input ndarray. + + Returns: + bytes object: encoded str. + """ + return base64.b64encode(array).decode('utf8') + + +def video_to_numpy(file_path: str) -> np.ndarray: + """decode video with cv2 and return stacked frames + as numpy. + + Args: + file_path (str): video file path. + + Returns: + np.ndarray: [T,H,W,C] in uint8. + """ + cap = cv2.VideoCapture(file_path) + videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + decoded_frames = [] + for i in range(videolen): + ret, frame = cap.read() + # maybe first frame is empty + if ret is False: + continue + img = frame[:, :, ::-1] + decoded_frames.append(img) + decoded_frames = np.stack(decoded_frames, axis=0) + return decoded_frames + + +def parse_file_paths(input_path: str) -> list: + """get data pathes from input_path + + Args: + input_path (str): input file path or directory which contains input file(s). + + Returns: + list: path(es) of input file(s) + """ + assert osp.exists(input_path), \ + f"{input_path} did not exists!" + if osp.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [osp.join(input_path, file) for file in files] + return files diff --git a/docs/src/deploy/slim/quant_post_static.py b/docs/src/deploy/slim/quant_post_static.py new file mode 100644 index 000000000..f517c0e40 --- /dev/null +++ b/docs/src/deploy/slim/quant_post_static.py @@ -0,0 +1,120 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import os.path as osp +import sys + +import numpy as np +import paddle +from paddleslim.quant import quant_post_static + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) + +from paddlevideo.loader.builder import build_dataloader, build_dataset +from paddlevideo.utils import get_config, get_logger + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument( + '-c', + '--config', + type=str, + default= + '../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml', + help='quantization config file path') + parser.add_argument('-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument("--use_gpu", + type=str2bool, + default=True, + help="whether use gpui during quantization") + + return parser.parse_args() + + +def post_training_quantization(cfg, use_gpu: bool = True): + """Quantization entry + + Args: + cfg (dict): quntization configuration. + use_gpu (bool, optional): whether to use gpu during quantization. Defaults to True. + """ + logger = get_logger("paddlevideo") + + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + + # get defined params + batch_nums = cfg.DATASET.pop('batch_nums') + batch_size = cfg.DATASET.get('batch_size', 1) + num_workers = cfg.DATASET.get('num_workers', 0) + inference_file_name = cfg.get('model_name', 'inference') + inference_model_dir = cfg.get('inference_model_dir', + f'./inference/{inference_file_name}') + quant_output_dir = cfg.get('quant_output_dir', + osp.join(inference_model_dir, 'quant_model')) + + # build dataloader for quantization, lite data is enough + slim_dataset = build_dataset((cfg.DATASET.quant, cfg.PIPELINE.quant)) + slim_dataloader_setting = dict(batch_size=batch_size, + num_workers=num_workers, + places=place, + drop_last=False, + shuffle=False) + slim_loader = build_dataloader(slim_dataset, **slim_dataloader_setting) + + logger.info("Build slim_loader finished") + + def sample_generator(loader): + def __reader__(): + for indx, data in enumerate(loader): + # must return np.ndarray, not paddle.Tensor + videos = np.array(data[0]) + yield videos + + return __reader__ + + # execute quantization in static graph mode + paddle.enable_static() + + exe = paddle.static.Executor(place) + + logger.info("Staring Post-Training Quantization...") + + quant_post_static(executor=exe, + model_dir=inference_model_dir, + quantize_model_path=quant_output_dir, + sample_generator=sample_generator(slim_loader), + model_filename=f'{inference_file_name}.pdmodel', + params_filename=f'{inference_file_name}.pdiparams', + batch_size=batch_size, + batch_nums=batch_nums, + algo='KL') + + logger.info("Post-Training Quantization finished...") + + +if __name__ == '__main__': + args = parse_args() + cfg = get_config(args.config, overrides=args.override) + post_training_quantization(cfg, args.use_gpu) diff --git a/docs/src/deploy/slim/readme.md b/docs/src/deploy/slim/readme.md new file mode 100644 index 000000000..cc9f764f8 --- /dev/null +++ b/docs/src/deploy/slim/readme.md @@ -0,0 +1,133 @@ + +## Slim功能介绍 +复杂的模型有利于提高模型的性能,但也导致模型中存在一定冗余。此部分提供精简模型的功能,包括两部分:模型量化(量化训练、离线量化)、模型剪枝。 + +其中模型量化将全精度缩减到定点数减少这种冗余,达到减少模型计算复杂度,提高模型推理性能的目的。 +模型量化可以在基本不损失模型的精度的情况下,将FP32精度的模型参数转换为Int8精度,减小模型参数大小并加速计算,使用量化后的模型在移动端等部署时更具备速度优势。 + +模型剪枝将CNN中不重要的卷积核裁剪掉,减少模型参数量,从而降低模型计算复杂度。 + 
+本教程将介绍如何使用飞桨模型压缩库PaddleSlim做PaddleVideo模型的压缩。 +[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) 集成了模型剪枝、量化(包括量化训练和离线量化)、蒸馏和神经网络搜索等多种业界常用且领先的模型压缩功能,如果您感兴趣,可以关注并了解。 + +在开始本教程之前,建议先了解[PaddleVideo模型的训练方法](../../docs/zh-CN/usage.md)以及[PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/latest/index.html) + + +## 快速开始 +当训练出一个模型后,如果希望进一步的压缩模型大小并加速预测,可使用量化或者剪枝的方法压缩模型。 + +模型压缩主要包括五个步骤: +1. 安装 PaddleSlim +2. 准备训练好的模型 +3. 模型压缩 +4. 导出量化推理模型 +5. 量化模型预测部署 + +### 1. 安装PaddleSlim + +* 可以通过pip install的方式进行安装。 + +```bash +python3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple +``` + +* 如果获取PaddleSlim的最新特性,可以从源码安装。 + +```bash +git clone https://github.com/PaddlePaddle/PaddleSlim.git +cd Paddleslim +python3.7 setup.py install +``` + +### 2. 准备训练好的模型 + +PaddleVideo提供了一系列训练好的[模型](../../docs/zh-CN/model_zoo/README.md),如果待量化的模型不在列表中,需要按照[常规训练](../../docs/zh-CN/usage.md)方法得到训练好的模型。 + +### 3. 模型压缩 + +进入PaddleVideo根目录 + +```bash +cd PaddleVideo +``` + +离线量化代码位于`deploy/slim/quant_post_static.py`。 + +#### 3.1 模型量化 + +量化训练包括离线量化训练和在线量化训练(TODO),在线量化训练效果更好,需加载预训练模型,在定义好量化策略后即可对模型进行量化。 + +##### 3.1.1 在线量化训练 +TODO + +##### 3.1.2 离线量化 + +**注意**:目前离线量化,必须使用已经训练好的模型导出的`inference model`进行量化。一般模型导出`inference model`可参考[教程](../../docs/zh-CN/usage.md#5-模型推理). + +一般来说,离线量化损失模型精度较多。 + +以PP-TSM模型为例,生成`inference model`后,离线量化运行方式如下 + +```bash +# 下载并解压出少量数据用于离线量化的校准 +pushd ./data/k400 +wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar +tar -xf k400_rawframes_small.tar +popd + +# 然后进入deploy/slim目录下 +cd deploy/slim + +# 执行离线量化命令 +python3.7 quant_post_static.py \ +-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml \ +--use_gpu=True +``` + +除`use_gpu`外,所有的量化环境参数都在`pptsm_k400_frames_uniform_quantization.yaml`文件中进行配置 +其中`inference_model_dir`表示上一步导出的`inference model`目录路径,`quant_output_dir`表示量化模型的输出目录路径 + +执行成功后,在`quant_output_dir`的目录下生成了`__model__`文件和`__params__`文件,这二者用于存储生成的离线量化模型 +类似`inference model`的使用方法,接下来可以直接用这两个文件进行预测部署,无需再重新导出模型。 + +```bash +# 使用PP-TSM离线量化模型进行预测 +# 回到PaddleVideo目录下 +cd ../../ + +# 使用量化模型进行预测 +python3.7 tools/predict.py \ +--input_file data/example.avi \ +--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \ +--model_file ./inference/ppTSM/quant_model/__model__ \ +--params_file ./inference/ppTSM/quant_model/__params__ \ +--use_gpu=True \ +--use_tensorrt=False +``` + +输出如下: +```bash +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.9997928738594055 +``` +#### 3.2 模型剪枝 +TODO + + +### 4. 导出模型 +TODO + + +### 5. 模型部署 + +上述步骤导出的模型可以通过PaddleLite的opt模型转换工具完成模型转换。 +模型部署的可参考 +[Serving Python部署](../python_serving/readme.md) +[Serving C++部署](../cpp_serving/readme.md) + + +## 训练超参数建议 + +* 量化训练时,建议加载常规训练得到的预训练模型,加速量化训练收敛。 +* 量化训练时,建议初始学习率修改为常规训练的`1/20~1/10`,同时将训练epoch数修改为常规训练的`1/5~1/2`,学习率策略方面,加上Warmup,其他配置信息不建议修改。 diff --git a/docs/src/deploy/slim/readme_en.md b/docs/src/deploy/slim/readme_en.md new file mode 100644 index 000000000..c7f6ba20a --- /dev/null +++ b/docs/src/deploy/slim/readme_en.md @@ -0,0 +1,132 @@ +## Slim function introduction +A complex model is beneficial to improve the performance of the model, but it also leads to some redundancy in the model. This part provides the function of reducing the model, including two parts: model quantization (quantization training, offline quantization), model pruning. 
+
+Among them, model quantization reduces full-precision values to fixed-point numbers to remove this redundancy, lowering the computational complexity of the model and improving its inference performance.
+Model quantization can convert FP32 model parameters to Int8 precision with essentially no loss of accuracy, reducing the parameter size and speeding up computation; the quantized model therefore has a clear speed advantage when deployed on mobile and other edge devices.
+
+Model pruning cuts out the unimportant convolution kernels in the CNN, reducing the number of model parameters and thus the computational complexity of the model.
+
+This tutorial introduces how to use PaddleSlim, the PaddlePaddle model compression library, to compress PaddleVideo models.
+[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) integrates model pruning, quantization (both quantization-aware training and offline quantization), distillation, neural architecture search and other widely used, industry-leading model compression techniques. If you are interested, you are welcome to follow the project.
+
+Before starting this tutorial, it is recommended to read the [PaddleVideo training documentation](../../docs/zh-CN/usage.md) and the [PaddleSlim documentation](https://paddleslim.readthedocs.io/zh_CN/latest/index.html).
+
+
+## Quick start
+After training a model, if you want to further compress the model size and speed up prediction, you can use quantization or pruning to compress the model.
+
+Model compression mainly includes five steps:
+1. Install PaddleSlim
+2. Prepare the trained model
+3. Compress the model
+4. Export the quantized inference model
+5. Deploy the quantized model for prediction
+
+### 1. Install PaddleSlim
+
+* PaddleSlim can be installed with pip.
+
+```bash
+python3.7 -m pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+* If you want the latest features of PaddleSlim, you can install it from source.
+
+```bash
+git clone https://github.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python3.7 setup.py install
+```
+
+### 2. Prepare the trained model
+
+PaddleVideo provides a series of trained [models](../../docs/zh-CN/model_zoo/README.md). If the model to be quantized is not in the list, train one following the [regular training](../../docs/zh-CN/usage.md) documentation.
+
+### 3. Model compression
+
+Go to the PaddleVideo root directory:
+
+```bash
+cd PaddleVideo
+```
+
+The offline quantization code is located in `deploy/slim/quant_post_static.py`.
+
+#### 3.1 Model quantization
+
+Quantization includes offline (post-training) quantization and online quantization-aware training (TODO). Online quantization-aware training generally gives better accuracy; it requires loading a pre-trained model, and the model can be quantized once the quantization strategy is defined.
+
+##### 3.1.1 Online quantization-aware training
+TODO
+
+##### 3.1.2 Offline quantization
+
+**Note**: Offline quantization must use the `inference model` exported from a trained model. For how to export an `inference model`, please refer to the [tutorial](../../docs/zh-CN/usage.md#5-模型推理).
+
+Generally speaking, offline quantization loses more model accuracy than quantization-aware training.
+
+Taking the PP-TSM model as an example, after generating the `inference model`, run offline quantization as follows:
+
+```bash
+# download and extract a small amount of data for calibration
+pushd ./data/k400
+wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar
+tar -xf k400_rawframes_small.tar
+popd
+
+# then switch to deploy/slim
+cd deploy/slim
+
+# execute the offline quantization command
+python3.7 quant_post_static.py \
+-c ../../configs/recognition/pptsm/pptsm_k400_frames_uniform_quantization.yaml \
+--use_gpu=True
+```
+
+All quantization parameters except `use_gpu` are configured in the `pptsm_k400_frames_uniform_quantization.yaml` file, where `inference_model_dir` is the directory of the `inference model` exported in the previous step and `quant_output_dir` is the output directory of the quantized model.
+
+After successful execution, a `__model__` file and a `__params__` file are generated in the `quant_output_dir` directory; together they store the offline-quantized model.
+Similar to an `inference model`, these two files can be used directly for prediction deployment, without re-exporting the model.
+
+```bash
+# Use the PP-TSM offline-quantized model for prediction
+# Go back to the PaddleVideo directory
+cd ../../
+
+# Use the quantized model to make predictions
+python3.7 tools/predict.py \
+--input_file data/example.avi \
+--config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \
+--model_file ./inference/ppTSM/quant_model/__model__ \
+--params_file ./inference/ppTSM/quant_model/__params__ \
+--use_gpu=True \
+--use_tensorrt=False
+```
+
+The output is as follows:
+```bash
+Current video file: data/example.avi
+        top-1 class: 5
+        top-1 score: 0.9997928738594055
+```
+#### 3.2 Model pruning
+TODO
+
+
+### 4. Export the model
+TODO
+
+
+### 5. Model deployment
+
+The model exported in the above steps can be converted with the opt model conversion tool of PaddleLite.
+For model deployment, refer to
+[Serving Python deployment](../python_serving/readme.md)
+[Serving C++ deployment](../cpp_serving/readme.md)
+
+
+## Training hyperparameter suggestions
+
+* For quantization-aware training, it is recommended to load the pre-trained model obtained from regular training to speed up the convergence of quantization training.
+* For quantization-aware training, it is recommended to set the initial learning rate to `1/20~1/10` of that used in regular training and the number of training epochs to `1/5~1/2` of regular training. As for the learning rate schedule, adding warmup is recommended; modifying other configuration options is not recommended.
diff --git a/docs/en/benchmark.md b/docs/src/english_documents/benchmark.md similarity index 100% rename from docs/en/benchmark.md rename to docs/src/english_documents/benchmark.md diff --git a/docs/en/dataset/AVA.md b/docs/src/english_documents/dataset/AVA.md similarity index 100% rename from docs/en/dataset/AVA.md rename to docs/src/english_documents/dataset/AVA.md diff --git a/docs/en/dataset/ActivityNet.md b/docs/src/english_documents/dataset/ActivityNet.md similarity index 100% rename from docs/en/dataset/ActivityNet.md rename to docs/src/english_documents/dataset/ActivityNet.md diff --git a/docs/en/dataset/Oxford_RobotCar.md b/docs/src/english_documents/dataset/Oxford_RobotCar.md similarity index 100% rename from docs/en/dataset/Oxford_RobotCar.md rename to docs/src/english_documents/dataset/Oxford_RobotCar.md diff --git a/docs/en/dataset/README.md b/docs/src/english_documents/dataset/README.md similarity index 100% rename from docs/en/dataset/README.md rename to docs/src/english_documents/dataset/README.md diff --git a/docs/en/dataset/SegmentationDataset.md b/docs/src/english_documents/dataset/SegmentationDataset.md similarity index 100% rename from docs/en/dataset/SegmentationDataset.md rename to docs/src/english_documents/dataset/SegmentationDataset.md diff --git a/docs/en/dataset/fsd.md b/docs/src/english_documents/dataset/fsd.md similarity index 100% rename from docs/en/dataset/fsd.md rename to docs/src/english_documents/dataset/fsd.md diff --git a/docs/src/english_documents/dataset/k400.md b/docs/src/english_documents/dataset/k400.md new file mode 100644 index 000000000..f76de02c8 --- /dev/null +++ b/docs/src/english_documents/dataset/k400.md @@ -0,0 +1,78 @@ +[简体中文](../../zh-CN/dataset/k400.md) | English + +# Kinetics-400 Preparation + +- [Introduction](#Introduction) +- [Download](#Download) +- [Frames](#Frames) + +--- + + +## Introduction + +Kinetics-400 is a commonly used benchmark dataset in the video field. Please refer to its official website [Kinetics](https://deepmind.com/research/open-source/kinetics) for details. You can refer to the official address [ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics), and use the download script provided to download the dataset. + +## Download + +Considering the difficulty of downloading the K400 data set, we provide two download methods: (1) Baidu network disk download (2) Script download + +### Baidu SkyDrive Download + +Netdisk link: https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg +Extraction code: `ppvi` + +### Script download + +- Download the training set link list file [train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list) and the validation set link list file [val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list). + +Write the download script `download.sh` as follows: + +```bash +file=$1 + +while read line +do + wget "$line" +done <$file +``` + +Download training set command: +```bash +bash download.sh train_link.list +``` + +Download verification set command: +```bash +bash download.sh val_link.list +``` + +--- + +|category | Number of data | list file | +| :------: | :----------: | :----: | +|Training set | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)| +|Validation set | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)| + +- After downloading, unzip and add the data path to list file. + +- Due to the failure of some video link, part of original data is missing. 
This copy needs about 135G of storage space.
+
+> This copy is only for academic research. If it is helpful to you, welcome to star [our project](https://github.com/PaddlePaddle/PaddleVideo)
+
+
+## Frames
+To speed up network training, we first extract frames from the video files (K400 video files are in mp4 format). Compared with training the network directly on video files, training on extracted frames is significantly faster.
+
+Run the following command to extract frames from the K400 video files:
+
+```bash
+python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4
+```
+
+After the frames are extracted, they are stored under the specified `./rawframes` path and take up about 2T of space.
+
+|category | Number of data | list file |
+| :------: | :----------: | :----: |
+|Training set | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)|
+|Validation set | 19761 | [val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)|
diff --git a/docs/en/dataset/msrvtt.md b/docs/src/english_documents/dataset/msrvtt.md
similarity index 100%
rename from docs/en/dataset/msrvtt.md
rename to docs/src/english_documents/dataset/msrvtt.md
diff --git a/docs/en/dataset/ntu-rgbd.md b/docs/src/english_documents/dataset/ntu-rgbd.md
similarity index 100%
rename from docs/en/dataset/ntu-rgbd.md
rename to docs/src/english_documents/dataset/ntu-rgbd.md
diff --git a/docs/en/dataset/ucf101.md b/docs/src/english_documents/dataset/ucf101.md
similarity index 100%
rename from docs/en/dataset/ucf101.md
rename to docs/src/english_documents/dataset/ucf101.md
diff --git a/docs/en/dataset/ucf24.md b/docs/src/english_documents/dataset/ucf24.md
similarity index 100%
rename from docs/en/dataset/ucf24.md
rename to docs/src/english_documents/dataset/ucf24.md
diff --git a/docs/src/english_documents/dataset/youtube8m.md b/docs/src/english_documents/dataset/youtube8m.md
new file mode 100644
index 000000000..e1c09a11d
--- /dev/null
+++ b/docs/src/english_documents/dataset/youtube8m.md
@@ -0,0 +1,56 @@
+English | [简体中文](../../zh-CN/dataset/youtube8m.md)
+
+# YouTube-8M Data Preparation
+
+- [Introduction](#Introduction)
+- [Download](#Download)
+- [Conversion](#Conversion)
+
+
+## Introduction
+
+YouTube-8M is a large-scale video classification dataset containing more than 8 million video URLs. Its label system covers more than 3800 knowledge graph entities. Each video corresponds to multiple labels (3-4 on average), annotated by machine.
+
+**Each video is between 120s and 500s long.
+Because the raw video data is very large, frame-level features were extracted in advance with an image classification model and reduced with PCA to multi-frame 1024-dimensional features; similarly, multi-frame 128-dimensional audio features were extracted with an audio model.**
+> The dataset used here is the 2018 update of YouTube-8M (May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features).
+
+
+## Download
+1. Create a new directory for storing features (taking the PaddleVideo directory as an example)
+    ```bash
+    cd data/yt8m
+    mkdir frame
+    cd frame
+    ```
+2.
Download the training and validation set to the frame folder + ```bash + curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python + curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python + ``` + The download process is shown in the figure + ![image](https://user-images.githubusercontent.com/23737287/140709613-1e2d6ec0-a82e-474d-b220-7803065b0153.png) + + After the data download is complete, you will get 3844 training data files and 3844 verification data files (TFRecord format) + +## Conversion +1. Install tensorflow to read tfrecord data + ```bash + python3.7 -m pip install tensorflow-gpu==1.14.0 + ``` +2. Convert the downloaded TFRecord file into a pickle file for PaddlePaddle to use + ```bash + cd .. # From the frame directory back to the yt8m directory + python3.7 tf2pkl.py ./frame ./pkl_frame/ # Convert train*.tfrecord and validate*.tfrecord in the frame folder to pkl format + ``` +3. Generate a single pkl file path set, and split pkl into multiple small pkl files based on this file, and generate the final split pkl file path required + ```bash + ls pkl_frame/train*.pkl> train.list # Write the path of train*.pkl to train.list + ls pkl_frame/validate*.pkl> val.list # Write the path of validate*.pkl into val.list + + python3.7 split_yt8m.py train.list # Split each train*.pkl into multiple train*_split*.pkl + python3.7 split_yt8m.py val.list # Split each validate*.pkl into multiple validate*_split*.pkl + + ls pkl_frame/train*_split*.pkl> train.list # Rewrite the path of train*_split*.pkl into train.list + ls pkl_frame/validate*_split*.pkl> val.list # Rewrite the path of validate*_split*.pkl into val.list + ``` diff --git a/docs/en/install.md b/docs/src/english_documents/install.md similarity index 100% rename from docs/en/install.md rename to docs/src/english_documents/install.md diff --git a/docs/en/model_zoo/README.md b/docs/src/english_documents/model_zoo/README.md similarity index 100% rename from docs/en/model_zoo/README.md rename to docs/src/english_documents/model_zoo/README.md diff --git a/docs/en/model_zoo/detection/SlowFast_FasterRCNN_en.md b/docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md similarity index 100% rename from docs/en/model_zoo/detection/SlowFast_FasterRCNN_en.md rename to docs/src/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md diff --git a/docs/en/model_zoo/estimation/adds.md b/docs/src/english_documents/model_zoo/estimation/adds.md similarity index 100% rename from docs/en/model_zoo/estimation/adds.md rename to docs/src/english_documents/model_zoo/estimation/adds.md diff --git a/docs/en/model_zoo/localization/bmn.md b/docs/src/english_documents/model_zoo/localization/bmn.md similarity index 100% rename from docs/en/model_zoo/localization/bmn.md rename to docs/src/english_documents/model_zoo/localization/bmn.md diff --git a/docs/en/model_zoo/localization/yowo.md b/docs/src/english_documents/model_zoo/localization/yowo.md similarity index 100% rename from docs/en/model_zoo/localization/yowo.md rename to docs/src/english_documents/model_zoo/localization/yowo.md diff --git a/docs/en/model_zoo/multimodal/actbert.md b/docs/src/english_documents/model_zoo/multimodal/actbert.md similarity index 100% rename from docs/en/model_zoo/multimodal/actbert.md rename to docs/src/english_documents/model_zoo/multimodal/actbert.md diff --git a/docs/en/model_zoo/partition/transnetv2.md b/docs/src/english_documents/model_zoo/partition/transnetv2.md similarity index 100% rename from 
docs/en/model_zoo/partition/transnetv2.md rename to docs/src/english_documents/model_zoo/partition/transnetv2.md diff --git a/docs/en/model_zoo/recognition/agcn.md b/docs/src/english_documents/model_zoo/recognition/agcn.md similarity index 100% rename from docs/en/model_zoo/recognition/agcn.md rename to docs/src/english_documents/model_zoo/recognition/agcn.md diff --git a/docs/en/model_zoo/recognition/agcn2s.md b/docs/src/english_documents/model_zoo/recognition/agcn2s.md similarity index 100% rename from docs/en/model_zoo/recognition/agcn2s.md rename to docs/src/english_documents/model_zoo/recognition/agcn2s.md diff --git a/docs/en/model_zoo/recognition/attention_lstm.md b/docs/src/english_documents/model_zoo/recognition/attention_lstm.md similarity index 100% rename from docs/en/model_zoo/recognition/attention_lstm.md rename to docs/src/english_documents/model_zoo/recognition/attention_lstm.md diff --git a/docs/en/model_zoo/recognition/ctrgcn.md b/docs/src/english_documents/model_zoo/recognition/ctrgcn.md similarity index 100% rename from docs/en/model_zoo/recognition/ctrgcn.md rename to docs/src/english_documents/model_zoo/recognition/ctrgcn.md diff --git a/docs/en/model_zoo/recognition/movinet.md b/docs/src/english_documents/model_zoo/recognition/movinet.md similarity index 100% rename from docs/en/model_zoo/recognition/movinet.md rename to docs/src/english_documents/model_zoo/recognition/movinet.md diff --git a/docs/en/model_zoo/recognition/posec3d.md b/docs/src/english_documents/model_zoo/recognition/posec3d.md similarity index 100% rename from docs/en/model_zoo/recognition/posec3d.md rename to docs/src/english_documents/model_zoo/recognition/posec3d.md diff --git a/docs/en/model_zoo/recognition/pp-timesformer.md b/docs/src/english_documents/model_zoo/recognition/pp-timesformer.md similarity index 100% rename from docs/en/model_zoo/recognition/pp-timesformer.md rename to docs/src/english_documents/model_zoo/recognition/pp-timesformer.md diff --git a/docs/en/model_zoo/recognition/pp-tsm.md b/docs/src/english_documents/model_zoo/recognition/pp-tsm.md similarity index 100% rename from docs/en/model_zoo/recognition/pp-tsm.md rename to docs/src/english_documents/model_zoo/recognition/pp-tsm.md diff --git a/docs/en/model_zoo/recognition/pp-tsn.md b/docs/src/english_documents/model_zoo/recognition/pp-tsn.md similarity index 100% rename from docs/en/model_zoo/recognition/pp-tsn.md rename to docs/src/english_documents/model_zoo/recognition/pp-tsn.md diff --git a/docs/en/model_zoo/recognition/slowfast.md b/docs/src/english_documents/model_zoo/recognition/slowfast.md similarity index 100% rename from docs/en/model_zoo/recognition/slowfast.md rename to docs/src/english_documents/model_zoo/recognition/slowfast.md diff --git a/docs/en/model_zoo/recognition/stgcn.md b/docs/src/english_documents/model_zoo/recognition/stgcn.md similarity index 100% rename from docs/en/model_zoo/recognition/stgcn.md rename to docs/src/english_documents/model_zoo/recognition/stgcn.md diff --git a/docs/en/model_zoo/recognition/timesformer.md b/docs/src/english_documents/model_zoo/recognition/timesformer.md similarity index 100% rename from docs/en/model_zoo/recognition/timesformer.md rename to docs/src/english_documents/model_zoo/recognition/timesformer.md diff --git a/docs/en/model_zoo/recognition/tokenshift_transformer.md b/docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md similarity index 100% rename from docs/en/model_zoo/recognition/tokenshift_transformer.md rename to 
docs/src/english_documents/model_zoo/recognition/tokenshift_transformer.md diff --git a/docs/en/model_zoo/recognition/tsm.md b/docs/src/english_documents/model_zoo/recognition/tsm.md similarity index 100% rename from docs/en/model_zoo/recognition/tsm.md rename to docs/src/english_documents/model_zoo/recognition/tsm.md diff --git a/docs/en/model_zoo/recognition/tsn.md b/docs/src/english_documents/model_zoo/recognition/tsn.md similarity index 100% rename from docs/en/model_zoo/recognition/tsn.md rename to docs/src/english_documents/model_zoo/recognition/tsn.md diff --git a/docs/en/model_zoo/recognition/tsn_dali.md b/docs/src/english_documents/model_zoo/recognition/tsn_dali.md similarity index 100% rename from docs/en/model_zoo/recognition/tsn_dali.md rename to docs/src/english_documents/model_zoo/recognition/tsn_dali.md diff --git a/docs/en/model_zoo/recognition/videoswin.md b/docs/src/english_documents/model_zoo/recognition/videoswin.md similarity index 100% rename from docs/en/model_zoo/recognition/videoswin.md rename to docs/src/english_documents/model_zoo/recognition/videoswin.md diff --git a/docs/en/model_zoo/segmentation/asrf.md b/docs/src/english_documents/model_zoo/segmentation/asrf.md similarity index 100% rename from docs/en/model_zoo/segmentation/asrf.md rename to docs/src/english_documents/model_zoo/segmentation/asrf.md diff --git a/docs/en/model_zoo/segmentation/cfbi.md b/docs/src/english_documents/model_zoo/segmentation/cfbi.md similarity index 100% rename from docs/en/model_zoo/segmentation/cfbi.md rename to docs/src/english_documents/model_zoo/segmentation/cfbi.md diff --git a/docs/en/model_zoo/segmentation/mstcn.md b/docs/src/english_documents/model_zoo/segmentation/mstcn.md similarity index 100% rename from docs/en/model_zoo/segmentation/mstcn.md rename to docs/src/english_documents/model_zoo/segmentation/mstcn.md diff --git a/docs/en/quick_start.md b/docs/src/english_documents/quick_start.md similarity index 100% rename from docs/en/quick_start.md rename to docs/src/english_documents/quick_start.md diff --git a/docs/en/tools.md b/docs/src/english_documents/tools.md similarity index 100% rename from docs/en/tools.md rename to docs/src/english_documents/tools.md diff --git a/docs/en/tutorials/Action Recognition Datasets b/docs/src/english_documents/tutorials/Action Recognition Datasets similarity index 100% rename from docs/en/tutorials/Action Recognition Datasets rename to docs/src/english_documents/tutorials/Action Recognition Datasets diff --git a/docs/en/tutorials/Action Recognition Papers b/docs/src/english_documents/tutorials/Action Recognition Papers similarity index 100% rename from docs/en/tutorials/Action Recognition Papers rename to docs/src/english_documents/tutorials/Action Recognition Papers diff --git a/docs/en/tutorials/Spatio-Temporal Action Detection Papers b/docs/src/english_documents/tutorials/Spatio-Temporal Action Detection Papers similarity index 100% rename from docs/en/tutorials/Spatio-Temporal Action Detection Papers rename to docs/src/english_documents/tutorials/Spatio-Temporal Action Detection Papers diff --git a/docs/en/tutorials/TSM.md b/docs/src/english_documents/tutorials/TSM.md similarity index 100% rename from docs/en/tutorials/TSM.md rename to docs/src/english_documents/tutorials/TSM.md diff --git a/docs/en/tutorials/Temporal Action Detection Papers b/docs/src/english_documents/tutorials/Temporal Action Detection Papers similarity index 100% rename from docs/en/tutorials/Temporal Action Detection Papers rename to 
docs/src/english_documents/tutorials/Temporal Action Detection Papers diff --git a/docs/en/tutorials/accelerate.md b/docs/src/english_documents/tutorials/accelerate.md similarity index 100% rename from docs/en/tutorials/accelerate.md rename to docs/src/english_documents/tutorials/accelerate.md diff --git a/docs/en/tutorials/config.md b/docs/src/english_documents/tutorials/config.md similarity index 100% rename from docs/en/tutorials/config.md rename to docs/src/english_documents/tutorials/config.md diff --git a/docs/en/tutorials/customized_usage.md b/docs/src/english_documents/tutorials/customized_usage.md similarity index 100% rename from docs/en/tutorials/customized_usage.md rename to docs/src/english_documents/tutorials/customized_usage.md diff --git a/docs/en/tutorials/demos b/docs/src/english_documents/tutorials/demos similarity index 100% rename from docs/en/tutorials/demos rename to docs/src/english_documents/tutorials/demos diff --git a/docs/en/tutorials/deployment.md b/docs/src/english_documents/tutorials/deployment.md similarity index 100% rename from docs/en/tutorials/deployment.md rename to docs/src/english_documents/tutorials/deployment.md diff --git a/docs/en/tutorials/modular_design.md b/docs/src/english_documents/tutorials/modular_design.md similarity index 100% rename from docs/en/tutorials/modular_design.md rename to docs/src/english_documents/tutorials/modular_design.md diff --git a/docs/en/tutorials/pp-tsm.md b/docs/src/english_documents/tutorials/pp-tsm.md similarity index 100% rename from docs/en/tutorials/pp-tsm.md rename to docs/src/english_documents/tutorials/pp-tsm.md diff --git a/docs/en/tutorials/summarize.md b/docs/src/english_documents/tutorials/summarize.md similarity index 100% rename from docs/en/tutorials/summarize.md rename to docs/src/english_documents/tutorials/summarize.md diff --git a/docs/en/usage.md b/docs/src/english_documents/usage.md similarity index 100% rename from docs/en/usage.md rename to docs/src/english_documents/usage.md diff --git a/docs/src/main.py b/docs/src/main.py new file mode 100644 index 000000000..2d18e75df --- /dev/null +++ b/docs/src/main.py @@ -0,0 +1,141 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import random + +import numpy as np +import paddle + +from paddlevideo.tasks import (test_model, train_dali, train_model, + train_model_multigrid) +from paddlevideo.utils import get_config, get_dist_info + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleVideo train script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument('-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument('--test', + action='store_true', + help='whether to test a model') + parser.add_argument('--train_dali', + action='store_true', + help='whether to use dali to speed up training') + parser.add_argument('--multigrid', + action='store_true', + help='whether to use multigrid training') + parser.add_argument('-w', + '--weights', + type=str, + help='weights for finetuning or testing') + parser.add_argument('--fleet', + action='store_true', + help='whether to use fleet run distributed training') + parser.add_argument('--amp', + action='store_true', + help='whether to open amp training.') + parser.add_argument( + '--amp_level', + type=str, + default=None, + help="optimize level when open amp training, can only be 'O1' or 'O2'.") + parser.add_argument( + '--validate', + action='store_true', + help='whether to evaluate the checkpoint during training') + parser.add_argument( + '--seed', + type=int, + default=1234, + help='fixed all random seeds when the program is running') + parser.add_argument( + '--max_iters', + type=int, + default=None, + help='max iterations when training(this arg only used in test_tipc)') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format ' + '\"key1=value1;key2=value2;key3=value3\".') + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cfg = get_config(args.config, overrides=args.override) + + # enable to use npu if paddle is built with npu + if paddle.is_compiled_with_custom_device('npu') : + cfg.__setattr__("use_npu", True) + elif paddle.device.is_compiled_with_xpu(): + cfg.__setattr__("use_xpu", True) + + # set seed if specified + seed = args.seed + if seed is not None: + assert isinstance( + seed, int), f"seed must be a integer when specified, but got {seed}" + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + # set amp_level if amp is enabled + if args.amp: + if args.amp_level is None: + args.amp_level = 'O1' # set defaualt amp_level to 'O1' + else: + assert args.amp_level in [ + 'O1', 'O2' + ], f"amp_level must be 'O1' or 'O2' when amp enabled, but got {args.amp_level}." 
+ + _, world_size = get_dist_info() + parallel = world_size != 1 + if parallel: + paddle.distributed.init_parallel_env() + + if args.test: + test_model(cfg, weights=args.weights, parallel=parallel) + elif args.train_dali: + train_dali(cfg, weights=args.weights, parallel=parallel) + elif args.multigrid: + train_model_multigrid(cfg, + world_size=world_size, + validate=args.validate) + else: + train_model(cfg, + weights=args.weights, + parallel=parallel, + validate=args.validate, + use_fleet=args.fleet, + use_amp=args.amp, + amp_level=args.amp_level, + max_iters=args.max_iters, + profiler_options=args.profiler_options) + + +if __name__ == '__main__': + main() diff --git a/docs/src/paddlevideo/__init__.py b/docs/src/paddlevideo/__init__.py new file mode 100644 index 000000000..8b03acf29 --- /dev/null +++ b/docs/src/paddlevideo/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .version import paddlevideo_version diff --git a/docs/src/paddlevideo/loader/__init__.py b/docs/src/paddlevideo/loader/__init__.py new file mode 100644 index 000000000..4ed9b11a7 --- /dev/null +++ b/docs/src/paddlevideo/loader/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .builder import build_dataset, build_dataloader, build_batch_pipeline +from .dataset import VideoDataset +from .dali_loader import TSN_Dali_loader, get_input_data + +__all__ = [ + 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset', + 'TSN_Dali_loader', 'get_input_data' +] diff --git a/docs/src/paddlevideo/loader/builder.py b/docs/src/paddlevideo/loader/builder.py new file mode 100644 index 000000000..23a65c3bf --- /dev/null +++ b/docs/src/paddlevideo/loader/builder.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import signal +import os +import paddle +from paddle.io import DataLoader, DistributedBatchSampler +from .registry import DATASETS, PIPELINES +from ..utils.build_utils import build +from .pipelines.compose import Compose +from paddlevideo.utils import get_logger +from paddlevideo.utils.multigrid import DistributedShortSampler +import numpy as np + +logger = get_logger("paddlevideo") + + +def build_pipeline(cfg): + """Build pipeline. + Args: + cfg (dict): root config dict. + """ + if cfg == None: + return + return Compose(cfg) + + +def build_dataset(cfg): + """Build dataset. + Args: + cfg (dict): root config dict. + + Returns: + dataset: dataset. + """ + #XXX: ugly code here! + cfg_dataset, cfg_pipeline = cfg + cfg_dataset.pipeline = build_pipeline(cfg_pipeline) + dataset = build(cfg_dataset, DATASETS, key="format") + return dataset + + +def build_batch_pipeline(cfg): + + batch_pipeline = build(cfg, PIPELINES) + return batch_pipeline + + +def build_dataloader(dataset, + batch_size, + num_workers, + places, + shuffle=True, + drop_last=True, + multigrid=False, + collate_fn_cfg=None, + **kwargs): + """Build Paddle Dataloader. + + XXX explain how the dataloader work! + + Args: + dataset (paddle.dataset): A PaddlePaddle dataset object. + batch_size (int): batch size on single card. + num_worker (int): num_worker + shuffle(bool): whether to shuffle the data at every epoch. + """ + if multigrid: + sampler = DistributedShortSampler(dataset, + batch_sizes=batch_size, + shuffle=True, + drop_last=True) + else: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + + #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix. + # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to: + # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose. + + def mix_collate_fn(batch): + pipeline = build_batch_pipeline(collate_fn_cfg) + batch = pipeline(batch) + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + #if collate_fn_cfg is not None: + #ugly code here. collate_fn is mix op config + # collate_fn = mix_collate_fn(collate_fn_cfg) + + data_loader = DataLoader( + dataset, + batch_sampler=sampler, + places=places, + num_workers=num_workers, + collate_fn=mix_collate_fn if collate_fn_cfg is not None else None, + return_list=True, + **kwargs) + + return data_loader + + +def term_mp(sig_num, frame): + """ kill all child processes + """ + pid = os.getpid() + pgid = os.getpgid(os.getpid()) + logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid)) + os.killpg(pgid, signal.SIGKILL) + return + + +signal.signal(signal.SIGINT, term_mp) +signal.signal(signal.SIGTERM, term_mp) diff --git a/docs/src/paddlevideo/loader/dali_loader.py b/docs/src/paddlevideo/loader/dali_loader.py new file mode 100644 index 000000000..4fb0e2843 --- /dev/null +++ b/docs/src/paddlevideo/loader/dali_loader.py @@ -0,0 +1,206 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import math + +import paddle +from paddle.distributed import ParallelEnv +import paddle.distributed as dist +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + +try: + from nvidia.dali.pipeline import Pipeline + import nvidia.dali.ops as ops + import nvidia.dali.types as types + import tempfile + from nvidia.dali.plugin.paddle import DALIGenericIterator +except: + Pipeline = object + + +def get_input_data(data): + return paddle.to_tensor(data[0]['image']), paddle.to_tensor( + data[0]['label']) + + +class TSN_Dali_loader(object): + def __init__(self, cfg): + self.batch_size = cfg.batch_size + self.file_path = cfg.file_path + + self.num_seg = cfg.num_seg + self.seglen = cfg.seglen + self.short_size = cfg.short_size + self.target_size = cfg.target_size + + # set num_shards and shard_id when distributed training is implemented + self.num_shards = dist.get_world_size() + self.shard_id = ParallelEnv().local_rank + self.dali_mean = cfg.mean * (self.num_seg * self.seglen) + self.dali_std = cfg.std * (self.num_seg * self.seglen) + + def build_dali_reader(self): + """ + build dali training reader + """ + def reader_(): + with open(self.file_path) as flist: + full_lines = [line for line in flist] + if (not hasattr(reader_, 'seed')): + reader_.seed = 0 + random.Random(reader_.seed).shuffle(full_lines) + logger.info(f"reader shuffle seed: {reader_.seed}.") + if reader_.seed is not None: + reader_.seed += 1 + + per_node_lines = int( + math.ceil(len(full_lines) * 1.0 / self.num_shards)) + total_lines = per_node_lines * self.num_shards + + # aligned full_lines so that it can evenly divisible + full_lines += full_lines[:(total_lines - len(full_lines))] + assert len(full_lines) == total_lines + + # trainer get own sample + lines = full_lines[self.shard_id:total_lines:self.num_shards] + assert len(lines) == per_node_lines + + logger.info( + f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}" + ) + logger.info( + f"read videos from {self.shard_id * per_node_lines}, " + f"length: {per_node_lines}, " + f"lines length: {len(lines)}, " + f"total: {len(full_lines)}") + + video_files = ''.join([item for item in lines]) + tf = tempfile.NamedTemporaryFile() + tf.write(str.encode(video_files)) + tf.flush() + video_files = tf.name + + device_id = ParallelEnv().local_rank + logger.info(f'---------- device_id: {device_id} -----------') + + pipe = VideoPipe(batch_size=self.batch_size, + num_threads=1, + device_id=device_id, + file_list=video_files, + sequence_length=self.num_seg * self.seglen, + num_seg=self.num_seg, + seg_length=self.seglen, + resize_shorter_scale=self.short_size, + crop_target_size=self.target_size, + is_training=True, + num_shards=self.num_shards, + shard_id=self.shard_id, + dali_mean=self.dali_mean, + dali_std=self.dali_std) + + logger.info( + 'initializing dataset, it will take several minutes if it is too large .... 
' + ) + video_loader = DALIGenericIterator([pipe], ['image', 'label'], + len(lines), + dynamic_shape=True, + auto_reset=True) + + return video_loader + + dali_reader = reader_() + return dali_reader + + +class VideoPipe(Pipeline): + def __init__(self, + batch_size, + num_threads, + device_id, + file_list, + sequence_length, + num_seg, + seg_length, + resize_shorter_scale, + crop_target_size, + is_training=False, + initial_prefetch_size=20, + num_shards=1, + shard_id=0, + dali_mean=0., + dali_std=1.0): + super(VideoPipe, self).__init__(batch_size, num_threads, device_id) + self.input = ops.VideoReader(device="gpu", + file_list=file_list, + sequence_length=sequence_length, + num_seg=num_seg, + seg_length=seg_length, + is_training=is_training, + num_shards=num_shards, + shard_id=shard_id, + random_shuffle=is_training, + initial_fill=initial_prefetch_size) + # the sequece data read by ops.VideoReader is of shape [F, H, W, C] + # Because the ops.Resize does not support sequence data, + # it will be transposed into [H, W, F, C], + # then reshaped to [H, W, FC], and then resized like a 2-D image. + self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3]) + self.reshape = ops.Reshape(device="gpu", + rel_shape=[1.0, 1.0, -1], + layout='HWC') + self.resize = ops.Resize(device="gpu", + resize_shorter=resize_shorter_scale) + # crops and mirror are applied by ops.CropMirrorNormalize. + # Normalization will be implemented in paddle due to the difficulty of dimension broadcast, + # It is not sure whether dimension broadcast can be implemented correctly by dali, just take the Paddle Op instead. + self.pos_rng_x = ops.Uniform(range=(0.0, 1.0)) + self.pos_rng_y = ops.Uniform(range=(0.0, 1.0)) + self.mirror_generator = ops.Uniform(range=(0.0, 1.0)) + self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32) + self.crop_mirror_norm = ops.CropMirrorNormalize( + device="gpu", + crop=[crop_target_size, crop_target_size], + mean=dali_mean, + std=dali_std) + self.reshape_back = ops.Reshape( + device="gpu", + shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size], + layout='FCHW') + self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64) + + def define_graph(self): + output, label = self.input(name="Reader") + output = self.transpose(output) + output = self.reshape(output) + + output = self.resize(output) + output = output / 255. + pos_x = self.pos_rng_x() + pos_y = self.pos_rng_y() + mirror_flag = self.mirror_generator() + mirror_flag = (mirror_flag > 0.5) + mirror_flag = self.cast_mirror(mirror_flag) + output = self.crop_mirror_norm(output, + crop_pos_x=pos_x, + crop_pos_y=pos_y, + mirror=mirror_flag) + output = self.reshape_back(output) + label = self.cast_label(label) + return output, label + + def __len__(self): + return self.epoch_size() diff --git a/docs/src/paddlevideo/loader/dataset/MRI.py b/docs/src/paddlevideo/loader/dataset/MRI.py new file mode 100644 index 000000000..990cb87bd --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/MRI.py @@ -0,0 +1,109 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MRIDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. + + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict( + frame_dir=frame_dir, + #suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid gisven index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs']), np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs']), np.array([results['labels']]) diff --git a/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py b/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py new file mode 100644 index 000000000..db905e4e4 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/MRI_SlowFast.py @@ -0,0 +1,111 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
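
`MRIDataset.load_file` above parses one `<frame_dir> <frames_len> <label>` record per line of the index file. Here is a tiny replay of that parse with invented records and an assumed `data_prefix`; note that `frames_len` is kept as a string, exactly as in the loader.

```python
import os.path as osp

index_lines = [              # invented records in the documented format
    "file_path-1 150 1",
    "file_path-2 160 1",
]
data_prefix = "/data/mri_frames"   # assumed value of data_prefix

info = []
for line in index_lines:
    frame_dir, frames_len, labels = line.strip().split()
    if data_prefix is not None:
        frame_dir = osp.join(data_prefix, frame_dir)
    info.append(dict(frame_dir=frame_dir, frames_len=frames_len, labels=int(labels)))

print(info[0])
# {'frame_dir': '/data/mri_frames/file_path-1', 'frames_len': '150', 'labels': 1}
```
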
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class SFMRIDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. + + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict( + frame_dir=frame_dir, + #suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid gisven index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs'][0]), np.array( + results['imgs'][1]), np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". 
+ format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs'][0]), np.array( + results['imgs'][1]), np.array([results['labels']]) diff --git a/docs/src/paddlevideo/loader/dataset/__init__.py b/docs/src/paddlevideo/loader/dataset/__init__.py new file mode 100644 index 000000000..e97419105 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .actbert_dataset import ActBertDataset +from .ava_dataset import AVADataset +from .bmn_dataset import BMNDataset +from .davis_dataset import DavisDataset +from .feature import FeatureDataset +from .frame import FrameDataset, FrameDataset_Sport +from .MRI import MRIDataset +from .MRI_SlowFast import SFMRIDataset +from .msrvtt import MSRVTTDataset +from .actbert_dataset import ActBertDataset +from .asrf_dataset import ASRFDataset +from .ms_tcn_dataset import MSTCNDataset +from .oxford import MonoDataset +from .skeleton import SkeletonDataset +from .slowfast_video import SFVideoDataset +from .video import VideoDataset +from .ucf101_skeleton import UCF101SkeletonDataset +from .ucf24_dataset import UCF24Dataset + + +__all__ = [ + 'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset', + 'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset', + 'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset', + 'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset', + 'UCF101SkeletonDataset', 'UCF24Dataset' +] diff --git a/docs/src/paddlevideo/loader/dataset/actbert_dataset.py b/docs/src/paddlevideo/loader/dataset/actbert_dataset.py new file mode 100644 index 000000000..8cccf5cc5 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/actbert_dataset.py @@ -0,0 +1,74 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +try: + import lmdb +except ImportError as e: + print( + f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT." + ) +import pickle +import json +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." 
+ ) +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class ActBertDataset(BaseDataset): + """ActBert dataset. + """ + def __init__( + self, + file_path, + pipeline, + bert_model="bert-base-uncased", + data_prefix=None, + test_mode=False, + ): + self.bert_model = bert_model + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + feature_data = np.load(self.file_path, allow_pickle=True) + self.tokenizer = BertTokenizer.from_pretrained(self.bert_model, + do_lower_case=True) + self.info = [] + for item in feature_data: + self.info.append(dict(feature=item, tokenizer=self.tokenizer)) + return self.info + + def prepare_train(self, idx): + """Prepare the frames for training/valid given index. """ + results = copy.deepcopy(self.info[idx]) + #print('==results==', results) + results = self.pipeline(results) + return results['features'] + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + pass diff --git a/docs/src/paddlevideo/loader/dataset/asrf_dataset.py b/docs/src/paddlevideo/loader/dataset/asrf_dataset.py new file mode 100644 index 000000000..15bd35a3c --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/asrf_dataset.py @@ -0,0 +1,104 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class ASRFDataset(BaseDataset): + """Video dataset for action segmentation. 
+ """ + + def __init__( + self, + file_path, + pipeline, + feature_path, + label_path, + boundary_path, + **kwargs, + ): + super().__init__(file_path, pipeline, **kwargs) + self.label_path = label_path + self.boundary_path = boundary_path + self.feature_path = feature_path + + def load_file(self): + """Load index file to get video information.""" + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + + # load boundary + file_name = video_name.split('.')[0] + ".npy" + boundary_file_path = os.path.join(self.boundary_path, file_name) + boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_label'] = copy.deepcopy(label) + results['video_boundary'] = copy.deepcopy(boundary) + + results = self.pipeline(results) + return results['video_feat'], results['video_label'], results['video_boundary'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + + # load boundary + file_name = video_name.split('.')[0] + ".npy" + boundary_file_path = os.path.join(self.boundary_path, file_name) + boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_label'] = copy.deepcopy(label) + results['video_boundary'] = copy.deepcopy(boundary) + + results = self.pipeline(results) + return results['video_feat'], results['video_label'], results['video_boundary'] diff --git a/docs/src/paddlevideo/loader/dataset/ava_dataset.py b/docs/src/paddlevideo/loader/dataset/ava_dataset.py new file mode 100644 index 000000000..744e15bb6 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/ava_dataset.py @@ -0,0 +1,249 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path as osp +import copy +import random +import numpy as np +import sys +import os +import pickle +from datetime import datetime +from ...metrics.ava_utils import ava_evaluate_results +from ..registry import DATASETS +from .base import BaseDataset +from collections import defaultdict + + +@DATASETS.register() +class AVADataset(BaseDataset): + """AVA dataset for spatial temporal detection. + the dataset loads raw frames, bounding boxes, proposals and applies + transformations to return the frame tensors and other information. + """ + + _FPS = 30 + + def __init__(self, + pipeline, + file_path=None, + exclude_file=None, + label_file=None, + suffix='{:05}.jpg', + proposal_file=None, + person_det_score_thr=0.9, + num_classes=81, + data_prefix=None, + test_mode=False, + num_max_proposals=1000, + timestamp_start=900, + timestamp_end=1800): + self.custom_classes = None + self.exclude_file = exclude_file + self.label_file = label_file + self.proposal_file = proposal_file + assert 0 <= person_det_score_thr <= 1, ( + 'The value of ' + 'person_det_score_thr should in [0, 1]. ') + self.person_det_score_thr = person_det_score_thr + self.num_classes = num_classes + self.suffix = suffix + self.num_max_proposals = num_max_proposals + self.timestamp_start = timestamp_start + self.timestamp_end = timestamp_end + super().__init__( + file_path, + pipeline, + data_prefix, + test_mode, + ) + if self.proposal_file is not None: + self.proposals = self._load(self.proposal_file) + else: + self.proposals = None + if not test_mode: + valid_indexes = self.filter_exclude_file() + self.info = self.info = [self.info[i] for i in valid_indexes] + + def _load(self, path): + f = open(path, 'rb') + res = pickle.load(f) + f.close() + return res + + def parse_img_record(self, img_records): + bboxes, labels, entity_ids = [], [], [] + while len(img_records) > 0: + img_record = img_records[0] + num_img_records = len(img_records) + selected_records = list( + filter( + lambda x: np.array_equal(x['entity_box'], img_record[ + 'entity_box']), img_records)) + num_selected_records = len(selected_records) + img_records = list( + filter( + lambda x: not np.array_equal(x['entity_box'], img_record[ + 'entity_box']), img_records)) + assert len(img_records) + num_selected_records == num_img_records + + bboxes.append(img_record['entity_box']) + valid_labels = np.array([ + selected_record['label'] for selected_record in selected_records + ]) + + label = np.zeros(self.num_classes, dtype=np.float32) + label[valid_labels] = 1. 
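+            # `label` is a multi-hot vector over num_classes: every AVA action id
+            # annotated for this entity_box is switched on, so a single person box
+            # can carry several simultaneous action labels.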
+ + labels.append(label) + entity_ids.append(img_record['entity_id']) + + bboxes = np.stack(bboxes) + labels = np.stack(labels) + entity_ids = np.stack(entity_ids) + return bboxes, labels, entity_ids + + def filter_exclude_file(self): + valid_indexes = [] + if self.exclude_file is None: + valid_indexes = list(range(len(self.info))) + else: + exclude_video_infos = [ + x.strip().split(',') for x in open(self.exclude_file) + ] + for i, video_info in enumerate(self.info): + valid_indexes.append(i) + for video_id, timestamp in exclude_video_infos: + if (video_info['video_id'] == video_id + and video_info['timestamp'] == int(timestamp)): + valid_indexes.pop() + break + return valid_indexes + + def load_file(self): + """Load index file to get video information.""" + info = [] + records_dict_by_img = defaultdict(list) + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split(',') + + video_id = line_split[0] + timestamp = int(line_split[1]) + img_key = f'{video_id},{timestamp:04d}' + + entity_box = np.array(list(map(float, line_split[2:6]))) + label = int(line_split[6]) + entity_id = int(line_split[7]) + shot_info = (0, (self.timestamp_end - self.timestamp_start) * + self._FPS) + + video_info = dict(video_id=video_id, + timestamp=timestamp, + entity_box=entity_box, + label=label, + entity_id=entity_id, + shot_info=shot_info) + records_dict_by_img[img_key].append(video_info) + + for img_key in records_dict_by_img: + video_id, timestamp = img_key.split(',') + bboxes, labels, entity_ids = self.parse_img_record( + records_dict_by_img[img_key]) + ann = dict(gt_bboxes=bboxes, + gt_labels=labels, + entity_ids=entity_ids) + frame_dir = video_id + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + video_info = dict(frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + info.append(video_info) + + return info + + def prepare_train(self, idx): + results = copy.deepcopy(self.info[idx]) + img_key = results['img_key'] + + results['suffix'] = self.suffix + results['timestamp_start'] = self.timestamp_start + results['timestamp_end'] = self.timestamp_end + + if self.proposals is not None: + if img_key not in self.proposals: + results['proposals'] = np.array([[0, 0, 1, 1]]) + results['scores'] = np.array([1]) + else: + proposals = self.proposals[img_key] + assert proposals.shape[-1] in [4, 5] + if proposals.shape[-1] == 5: + thr = min(self.person_det_score_thr, max(proposals[:, 4])) + positive_inds = (proposals[:, 4] >= thr) + proposals = proposals[positive_inds] + proposals = proposals[:self.num_max_proposals] + results['proposals'] = proposals[:, :4] + results['scores'] = proposals[:, 4] + else: + proposals = proposals[:self.num_max_proposals] + results['proposals'] = proposals + + ann = results.pop('ann') + results['gt_bboxes'] = ann['gt_bboxes'] + results['gt_labels'] = ann['gt_labels'] + results['entity_ids'] = ann['entity_ids'] + + #ret = self.pipeline(results, "") + ret = self.pipeline(results) + #padding for dataloader + len_proposals = ret['proposals'].shape[0] + len_gt_bboxes = ret['gt_bboxes'].shape[0] + len_gt_labels = ret['gt_labels'].shape[0] + len_scores = ret['scores'].shape[0] + len_entity_ids = ret['entity_ids'].shape[0] + padding_len = 128 + ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len) + ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len) + ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], 
padding_len) + ret['scores'] = self.my_padding_1d(ret['scores'], padding_len) + ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len) + return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[ + 'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[ + 'entity_ids'], np.array( + ret['img_shape'], dtype=int + ), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids + + def my_padding_2d(self, feat, max_len): + feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + return feat_pad + + def my_padding_1d(self, feat, max_len): + feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + return feat_pad + + def prepare_test(self, idx): + return self.prepare_train(idx) + + def evaluate(self, results): + return ava_evaluate_results(self.info, len(self), results, + self.custom_classes, self.label_file, + self.file_path, self.exclude_file) diff --git a/docs/src/paddlevideo/loader/dataset/base.py b/docs/src/paddlevideo/loader/dataset/base.py new file mode 100644 index 000000000..2549dc411 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/base.py @@ -0,0 +1,80 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import numpy as np +from abc import ABC, abstractmethod + +import paddle +from paddle.io import Dataset + + +class BaseDataset(Dataset, ABC): + """Base class for datasets + + All datasets should subclass it. + All subclass should overwrite: + + - Method: `load_file`, load info from index file. + - Method: `prepare_train`, providing train data. + - Method: `prepare_test`, providing test data. + + Args: + file_path (str): index file path. + pipeline (Sequence XXX) + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): whether to build test dataset. Default: False. + + """ + def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False): + super().__init__() + self.file_path = file_path + self.data_prefix = osp.realpath(data_prefix) if \ + data_prefix is not None and osp.isdir(data_prefix) else data_prefix + self.test_mode = test_mode + self.pipeline = pipeline + self.info = self.load_file() + + @abstractmethod + def load_file(self): + """load the video information from the index file path.""" + pass + + def prepare_train(self, idx): + """TRAIN & VALID. 
Prepare the data for training/valid given the index.""" + #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + #unsqueeze label to list + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + #unsqueeze label to list + return results['imgs'], np.array([results['labels']]) + + def __len__(self): + """get the size of the dataset.""" + return len(self.info) + + def __getitem__(self, idx): + """ Get the sample for either training or testing given index""" + if self.test_mode: + return self.prepare_test(idx) + else: + return self.prepare_train(idx) diff --git a/docs/src/paddlevideo/loader/dataset/bmn_dataset.py b/docs/src/paddlevideo/loader/dataset/bmn_dataset.py new file mode 100644 index 000000000..44c765191 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/bmn_dataset.py @@ -0,0 +1,72 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class BMNDataset(BaseDataset): + """Video dataset for action localization. 
+ """ + def __init__( + self, + file_path, + pipeline, + subset, + **kwargs, + ): + self.subset = subset + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + annos = json.load(open(self.file_path)) + for video_name in annos.keys(): + video_subset = annos[video_name]["subset"] + if self.subset in video_subset: + info.append( + dict( + video_name=video_name, + video_info=annos[video_name], + )) + #sort by video_name + sort_f = lambda elem: elem['video_name'] + info.sort(key=sort_f) + #add video_idx to info + for idx, elem in enumerate(info): + info[idx]['video_idx'] = idx + logger.info("{} subset video numbers: {}".format( + self.subset, len(info))) + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['video_feat'], results['gt_iou_map'], results['gt_start'],\ + results['gt_end'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['video_feat'], results['gt_iou_map'], results['gt_start'], \ + results['gt_end'], results['video_idx'] diff --git a/docs/src/paddlevideo/loader/dataset/davis_dataset.py b/docs/src/paddlevideo/loader/dataset/davis_dataset.py new file mode 100644 index 000000000..20a275971 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/davis_dataset.py @@ -0,0 +1,189 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import os.path as osp +import copy +import random +import numpy as np +import shutil +from PIL import Image +import cv2 +from paddle.io import Dataset + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +class VOS_Test(Dataset): + """process frames in each video + """ + def __init__(self, + image_root, + label_root, + seq_name, + images, + labels, + pipeline=None, + rgb=False, + resolution=None): + self.image_root = image_root + self.label_root = label_root + self.seq_name = seq_name + self.images = images # image file list + self.labels = labels + self.obj_num = 1 + self.num_frame = len(self.images) + self.pipeline = pipeline + self.rgb = rgb + self.resolution = resolution + + self.obj_nums = [] + temp_obj_num = 0 + for img_name in self.images: + self.obj_nums.append(temp_obj_num) + current_label_name = img_name.split('.')[0] + '.png' + if current_label_name in self.labels: + current_label = self.read_label(current_label_name) + if temp_obj_num < np.unique( + current_label)[-1]: #get object number from label_id + temp_obj_num = np.unique(current_label)[-1] + + def __len__(self): + return len(self.images) + + def read_image(self, idx): + img_name = self.images[idx] + img_path = os.path.join(self.image_root, self.seq_name, img_name) + img = cv2.imread(img_path) + img = np.array(img, dtype=np.float32) + if self.rgb: + img = img[:, :, [2, 1, 0]] + return img + + def read_label(self, label_name): + label_path = os.path.join(self.label_root, self.seq_name, label_name) + label = Image.open(label_path) + label = np.array(label, dtype=np.uint8) + return label + + def __getitem__(self, idx): + img_name = self.images[idx] + current_img = self.read_image(idx) + current_img = np.array(current_img) + height, width, channels = current_img.shape + if self.resolution is not None: + width = int(np.ceil(float(width) * self.resolution / float(height))) + height = int(self.resolution) + + current_label_name = img_name.split('.')[0] + '.png' + obj_num = self.obj_nums[idx] + + if current_label_name in self.labels: + current_label = self.read_label(current_label_name) + current_label = np.array(current_label) + sample = { + 'current_img': current_img, + 'current_label': current_label + } + else: + sample = { + 'current_img': current_img + } #only the first frame contains label + + sample['meta'] = { + 'seq_name': self.seq_name, + 'frame_num': self.num_frame, + 'obj_num': obj_num, + 'current_name': img_name, + 'height': height, + 'width': width, + 'flip': False + } + if self.pipeline is not None: + sample = self.pipeline(sample) + for s in sample: + s['current_img'] = np.array(s['current_img']) + if 'current_label' in s.keys(): + s['current_label'] = s['current_label'] + return sample + + +@DATASETS.register() +class DavisDataset(BaseDataset): + """Davis 2017 dataset. 
+ """ + def __init__( + self, + file_path, + result_root, + pipeline, + data_prefix=None, + test_mode=False, + year=2017, + rgb=False, + resolution='480p', + ): + self.rgb = rgb + self.result_root = result_root + self.resolution = resolution + self.year = year + self.spt = 'val' if test_mode else 'train' + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + self.image_root = os.path.join(self.file_path, 'JPEGImages', + self.resolution) + self.label_root = os.path.join(self.file_path, 'Annotations', + self.resolution) + seq_names = [] + with open( + os.path.join(self.file_path, 'ImageSets', str(self.year), + self.spt + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + seq_names.extend(seqs_tmp) + self.info = list(np.unique(seq_names)) + return self.info + + def prepare_test(self, idx): + seq_name = self.info[idx] #video name + images = list( + np.sort(os.listdir(os.path.join(self.image_root, seq_name)))) + labels = [images[0].replace('jpg', 'png')] #we have first frame target + + # copy first frame target + if not os.path.isfile( + os.path.join(self.result_root, seq_name, labels[0])): + if not os.path.exists(os.path.join(self.result_root, seq_name)): + os.makedirs(os.path.join(self.result_root, seq_name)) + source_label_path = os.path.join(self.label_root, seq_name, + labels[0]) + result_label_path = os.path.join(self.result_root, seq_name, + labels[0]) + + shutil.copy(source_label_path, result_label_path) + + seq_dataset = VOS_Test(self.image_root, + self.label_root, + seq_name, + images, + labels, + self.pipeline, + rgb=self.rgb, + resolution=480) + return seq_dataset diff --git a/docs/src/paddlevideo/loader/dataset/feature.py b/docs/src/paddlevideo/loader/dataset/feature.py new file mode 100644 index 000000000..df5e33ee1 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/feature.py @@ -0,0 +1,80 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import os.path as osp + +from ..registry import DATASETS +from .base import BaseDataset + + +@DATASETS.register() +class FeatureDataset(BaseDataset): + """Feature dataset for action recognition + Example:(TODO) + Args:(TODO) + """ + def __init__( + self, + file_path, + pipeline, + data_prefix=None, + test_mode=False, + suffix=None, + ): + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + filename = line.strip().split()[0] + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + if self.suffix is not None: + filename = filename + self.suffix + + info.append(dict(filename=filename)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. 
Prepare the data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + + if 'iou_norm' in results: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results[ + 'labels'], results['iou_norm'] + else: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results['labels'] + + def prepare_test(self, idx): + """TEST. Prepare the data for testing given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + + if 'iou_norm' in results: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results[ + 'labels'], results['iou_norm'] + else: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results['labels'] diff --git a/docs/src/paddlevideo/loader/dataset/frame.py b/docs/src/paddlevideo/loader/dataset/frame.py new file mode 100644 index 000000000..b02f52659 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/frame.py @@ -0,0 +1,177 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class FrameDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. 
+ + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict(frame_dir=frame_dir, + suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + +@DATASETS.register() +class FrameDataset_Sport(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir = line_split[0] + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append(dict(frame_dir=frame_dir, suffix=self.suffix)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". 
+ format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) diff --git a/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py b/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py new file mode 100644 index 000000000..56e3b7bbb --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/ms_tcn_dataset.py @@ -0,0 +1,110 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MSTCNDataset(BaseDataset): + """Video dataset for action segmentation. 
+ """ + + def __init__( + self, + file_path, + pipeline, + feature_path, + gt_path, + actions_map_file_path, + **kwargs, + ): + super().__init__(file_path, pipeline, **kwargs) + self.gt_path = gt_path + self.actions_map_file_path = actions_map_file_path + self.feature_path = feature_path + + # actions dict generate + file_ptr = open(self.actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.num_classes = len(self.actions_dict.keys()) + + def load_file(self): + """Load index file to get video information.""" + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + target_file_path = os.path.join(self.gt_path, video_name) + file_ptr = open(target_file_path, 'r') + content = file_ptr.read().split('\n')[:-1] + classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64') + for i in range(len(classes)): + classes[i] = self.actions_dict[content[i]] + # classes = classes * (-100) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_gt'] = copy.deepcopy(classes) + + results = self.pipeline(results) + return results['video_feat'], results['video_gt'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + target_file_path = os.path.join(self.gt_path, video_name) + file_ptr = open(target_file_path, 'r') + content = file_ptr.read().split('\n')[:-1] + classes = np.zeros(min(np.shape(video_feat)[1], len(content))) + for i in range(len(classes)): + classes[i] = self.actions_dict[content[i]] + # classes = classes * (-100) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_gt'] = copy.deepcopy(classes) + + results = self.pipeline(results) + return results['video_feat'], results['video_gt'] diff --git a/docs/src/paddlevideo/loader/dataset/msrvtt.py b/docs/src/paddlevideo/loader/dataset/msrvtt.py new file mode 100644 index 000000000..0e5294fff --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/msrvtt.py @@ -0,0 +1,220 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +try: + import lmdb +except ImportError as e: + print( + f"Warning! 
{e}, [lmdb] package and it's dependencies is required for ActBERT." + ) +import pickle +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MSRVTTDataset(BaseDataset): + """MSR-VTT dataset for text-video clip retrieval. + """ + def __init__( + self, + file_path, + pipeline, + features_path, + bert_model="bert-base-uncased", + padding_index=0, + max_seq_length=36, + max_region_num=36, + max_action_num=5, + vision_feature_dim=2048, + action_feature_dim=2048, + spatials_dim=5, + data_prefix=None, + test_mode=False, + ): + self.features_path = features_path + self.bert_model = bert_model + self.padding_index = padding_index + self.max_seq_length = max_seq_length + self.max_region_num = max_region_num + self._max_action_num = max_action_num + self.vision_feature_dim = vision_feature_dim + self.action_feature_dim = action_feature_dim + self.spatials_dim = spatials_dim + self._tokenizer = BertTokenizer.from_pretrained(bert_model, + do_lower_case=True) + super().__init__(file_path, pipeline, data_prefix, test_mode) + self.tokenize() + self.gen_feature() + + def load_file(self): + """Load index file to get video information.""" + with open(self.file_path) as fin: + self.image_entries = [] + self.caption_entries = [] + for line in fin.readlines(): + line = line.strip() + vid_id = line.split(',')[0] + self.image_entries.append(vid_id) + self.caption_entries.append({ + "caption": line.split(',')[1], + "vid_id": vid_id + }) + self.env = lmdb.open(self.features_path) + + def tokenize(self): + for entry in self.caption_entries: + tokens = [] + tokens.append("[CLS]") + for token in self._tokenizer.tokenize(entry["caption"]): + tokens.append(token) + tokens.append("[SEP]") + tokens = self._tokenizer.convert_tokens_to_ids(tokens) + + segment_ids = [0] * len(tokens) + input_mask = [1] * len(tokens) + + if len(tokens) < self.max_seq_length: + padding = [self.padding_index + ] * (self.max_seq_length - len(tokens)) + tokens = tokens + padding + input_mask += padding + segment_ids += padding + + entry["token"] = np.array(tokens).astype('int64') + entry["input_mask"] = np.array(input_mask) + entry["segment_ids"] = np.array(segment_ids).astype('int64') + + def get_image_feature(self, video_id): + video_id = str(video_id).encode() + with self.env.begin(write=False) as txn: + item = pickle.loads(txn.get(video_id)) + video_id = item["video_id"] + image_h = int(item["image_h"]) + image_w = int(item["image_w"]) + + features = item["features"].reshape(-1, self.vision_feature_dim) + boxes = item["boxes"].reshape(-1, 4) + + num_boxes = features.shape[0] + g_feat = np.sum(features, axis=0) / num_boxes + num_boxes = num_boxes + 1 + features = np.concatenate( + [np.expand_dims(g_feat, axis=0), features], axis=0) + + action_features = item["action_features"].reshape( + -1, self.action_feature_dim) + + image_location = np.zeros((boxes.shape[0], self.spatials_dim), + dtype=np.float32) + image_location[:, :4] = boxes + image_location[:, + 4] = ((image_location[:, 3] - image_location[:, 1]) * + (image_location[:, 2] - image_location[:, 0]) / + (float(image_w) * float(image_h))) + + image_location[:, 0] = image_location[:, 0] / float(image_w) + image_location[:, 1] = image_location[:, 1] / float(image_h) + image_location[:, 2] = 
image_location[:, 2] / float(image_w) + image_location[:, 3] = image_location[:, 3] / float(image_h) + + g_location = np.array([0, 0, 1, 1, 1]) + image_location = np.concatenate( + [np.expand_dims(g_location, axis=0), image_location], axis=0) + return features, num_boxes, image_location, action_features + + def gen_feature(self): + num_inst = len(self.image_entries) #1000 + self.features_all = np.zeros( + (num_inst, self.max_region_num, self.vision_feature_dim)) + self.action_features_all = np.zeros( + (num_inst, self._max_action_num, self.action_feature_dim)) + self.spatials_all = np.zeros( + (num_inst, self.max_region_num, self.spatials_dim)) + self.image_mask_all = np.zeros((num_inst, self.max_region_num)) + self.action_mask_all = np.zeros((num_inst, self._max_action_num)) + + for i, image_id in enumerate(self.image_entries): + features, num_boxes, boxes, action_features = self.get_image_feature( + image_id) + + mix_num_boxes = min(int(num_boxes), self.max_region_num) + mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim)) + mix_features_pad = np.zeros( + (self.max_region_num, self.vision_feature_dim)) + + image_mask = [1] * (int(mix_num_boxes)) + while len(image_mask) < self.max_region_num: + image_mask.append(0) + action_mask = [1] * (self._max_action_num) + while len(action_mask) < self._max_action_num: + action_mask.append(0) + + mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes] + mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes] + + self.features_all[i] = mix_features_pad + x = action_features.shape[0] + self.action_features_all[i][:x] = action_features[:] + self.image_mask_all[i] = np.array(image_mask) + self.action_mask_all[i] = np.array(action_mask) + self.spatials_all[i] = mix_boxes_pad + + self.features_all = self.features_all.astype("float32") + self.action_features_all = self.action_features_all.astype("float32") + self.image_mask_all = self.image_mask_all.astype("int64") + self.action_mask_all = self.action_mask_all.astype("int64") + self.spatials_all = self.spatials_all.astype("float32") + + def prepare_train(self, idx): + pass + + def prepare_test(self, idx): + entry = self.caption_entries[idx] + caption = entry["token"] + input_mask = entry["input_mask"] + segment_ids = entry["segment_ids"] + + target_all = np.zeros(1000) + for i, image_id in enumerate(self.image_entries): + if image_id == entry["vid_id"]: + target_all[i] = 1 + + return ( + caption, + self.action_features_all, + self.features_all, + self.spatials_all, + segment_ids, + input_mask, + self.image_mask_all, + self.action_mask_all, + target_all, + ) + + def __len__(self): + return len(self.caption_entries) diff --git a/docs/src/paddlevideo/loader/dataset/oxford.py b/docs/src/paddlevideo/loader/dataset/oxford.py new file mode 100644 index 000000000..a9e65c698 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/oxford.py @@ -0,0 +1,62 @@ +# Copyright Niantic 2019. Patent Pending. All rights reserved. +# +# This software is licensed under the terms of the Monodepth2 licence +# which allows for non-commercial use only, the full terms of which are made +# available in the LICENSE file. 
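
`MSRVTTDataset.tokenize` above right-pads the token ids, `input_mask` and `segment_ids` up to `max_seq_length` with `padding_index`. A toy replay of that padding step follows; the token ids are fabricated, whereas the real class obtains them from paddlenlp's `BertTokenizer`.

```python
import numpy as np

max_seq_length, padding_index = 12, 0
tokens = [101, 2023, 2003, 1037, 7281, 102]   # fake ids for "[CLS] this is a clip [SEP]"
segment_ids = [0] * len(tokens)
input_mask = [1] * len(tokens)

if len(tokens) < max_seq_length:
    padding = [padding_index] * (max_seq_length - len(tokens))
    tokens = tokens + padding
    input_mask += padding
    segment_ids += padding

print(np.array(tokens).astype("int64"))
print(np.array(input_mask))   # 1s over real tokens, 0s over the padding tail
```
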
+ +from __future__ import absolute_import, division, print_function + +import copy +from os import path as osp + +from PIL import Image + +from ..registry import DATASETS +from .base import BaseDataset + + +def pil_loader(path): + # open path as file to avoid ResourceWarning + # (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + with Image.open(f) as img: + return img.convert('RGB') + + +@DATASETS.register() +class MonoDataset(BaseDataset): + def __init__(self, + file_path, + data_prefix, + pipeline, + num_retries=0, + suffix='.png', + **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, **kwargs) + + def load_file(self): + info = [] + with open(self.file_path, 'r') as f: + for line in f: + filename = line.strip() + self.suffix + folder = osp.dirname(filename) + frame_index = line.strip().split('/')[1] + info.append( + dict(data_path=self.data_prefix, + filename=filename, + folder=folder, + frame_index=int(frame_index))) + return info + + def prepare_train(self, idx): + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + results['imgs']['idx'] = idx + return results['imgs'], results['day_or_night'] + + def prepare_test(self, idx): + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['imgs'], results['day_or_night'] diff --git a/docs/src/paddlevideo/loader/dataset/skeleton.py b/docs/src/paddlevideo/loader/dataset/skeleton.py new file mode 100644 index 000000000..30a3f3e70 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/skeleton.py @@ -0,0 +1,78 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +import pickle + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class SkeletonDataset(BaseDataset): + """ + Skeleton dataset for action recognition. + The dataset loads skeleton features and applies normalization operations. + Args: + file_path (str): Path to the index file. + pipeline(obj): Define the pipeline of data preprocessing. + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to build the test dataset. Default: False.
+ """ + def __init__(self, file_path, pipeline, label_path=None, test_mode=False): + self.label_path = label_path + super().__init__(file_path, pipeline, test_mode=test_mode) + + def load_file(self): + """Load feature file to get skeleton information.""" + logger.info("Loading data, it will take some moment...") + self.data = np.load(self.file_path) + if self.label_path: + if self.label_path.endswith('npy'): + self.label = np.load(self.label_path) + elif self.label_path.endswith('pkl'): + with open(self.label_path, 'rb') as f: + sample_name, self.label = pickle.load(f) + else: + logger.info( + "Label path not provided when test_mode={}, here just output predictions." + .format(self.test_mode)) + logger.info("Data Loaded!") + return self.data # used for __len__ + + def prepare_train(self, idx): + """Prepare the feature for training/valid given index. """ + results = dict() + results['data'] = copy.deepcopy(self.data[idx]) + results['label'] = copy.deepcopy(self.label[idx]) + results = self.pipeline(results) + return results['data'], results['label'] + + def prepare_test(self, idx): + """Prepare the feature for test given index. """ + results = dict() + results['data'] = copy.deepcopy(self.data[idx]) + if self.label_path: + results['label'] = copy.deepcopy(self.label[idx]) + results = self.pipeline(results) + return results['data'], results['label'] + else: + results = self.pipeline(results) + return [results['data']] diff --git a/docs/src/paddlevideo/loader/dataset/slowfast_video.py b/docs/src/paddlevideo/loader/dataset/slowfast_video.py new file mode 100644 index 000000000..1adf89c54 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/slowfast_video.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + +@DATASETS.register() +class SFVideoDataset(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. + + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + + .. code-block:: txt + + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + num_ensemble_views(int): temporal segment when multi-crop test + num_spatial_crops(int): spatial crop number when multi-crop test + **kwargs: Keyword arguments for ```BaseDataset```. 
+ + """ + def __init__( + self, + file_path, + pipeline, + num_ensemble_views=1, + num_spatial_crops=1, + num_retries=5, + num_samples_precise_bn=None, + **kwargs, + ): + self.num_ensemble_views = num_ensemble_views + self.num_spatial_crops = num_spatial_crops + self.num_retries = num_retries + self.num_samples_precise_bn = num_samples_precise_bn + super().__init__(file_path, pipeline, **kwargs) + #set random seed + random.seed(0) + np.random.seed(0) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + filename, labels = line_split + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + for tidx in range(self.num_ensemble_views): + for sidx in range(self.num_spatial_crops): + info.append( + dict( + filename=filename, + labels=int(labels), + temporal_sample_index=tidx, + spatial_sample_index=sidx, + temporal_num_clips=self.num_ensemble_views, + spatial_num_clips=self.num_spatial_crops, + )) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training given the index.""" + #Try to catch Exception caused by reading corrupted video file + short_cycle = False + if isinstance(idx, tuple): + idx, short_cycle_idx = idx + short_cycle = True + for ir in range(self.num_retries): + try: + #Multi-grid short cycle + if short_cycle: + results = copy.deepcopy(self.info[idx]) + results['short_cycle_idx'] = short_cycle_idx + else: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + + return results['imgs'][0], results['imgs'][1], np.array( + [results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'][0], results['imgs'][1], np.array( + [results['labels']]), np.array([idx]) + + def __len__(self): + """get the size of the dataset.""" + if self.num_samples_precise_bn is None: + return len(self.info) + else: + random.shuffle(self.info) + return min(self.num_samples_precise_bn, len(self.info)) diff --git a/docs/src/paddlevideo/loader/dataset/ucf101_skeleton.py b/docs/src/paddlevideo/loader/dataset/ucf101_skeleton.py new file mode 100644 index 000000000..8177933f2 --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/ucf101_skeleton.py @@ -0,0 +1,89 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +import pickle + +import paddle +from paddle.io import Dataset + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class UCF101SkeletonDataset(BaseDataset): + """ + Skeleton dataset for action recognition. + The dataset loads skeleton feature, and apply norm operatations. + Args: + file_path (str): Path to the index file. + pipeline(obj): Define the pipeline of data preprocessing. + test_mode (bool): Whether to bulid the test dataset. Default: False. + """ + + def __init__(self, + file_path, + pipeline, + split, + repeat_times, + test_mode=False): + self.split = split + self.repeat_times = repeat_times + super().__init__(file_path, pipeline, test_mode=test_mode) + self._ori_len = len(self.info) + self.start_index = 0 + self.modality = "Pose" + + def load_file(self): + """Load annotation file to get video information.""" + assert self.file_path.endswith('.pkl') + return self.load_pkl_annotations() + + def load_pkl_annotations(self): + with open(self.file_path, "rb") as f: + data = pickle.load(f) + + if self.split: + split, data = data['split'], data['annotations'] + identifier = 'filename' if 'filename' in data[0] else 'frame_dir' + data = [x for x in data if x[identifier] in split[self.split]] + + return data + + def prepare_train(self, idx): + """Prepare the frames for training given the index.""" + results = copy.deepcopy(self.info[idx % self._ori_len]) + results['modality'] = self.modality + results['start_index'] = self.start_index + + return self.pipeline(results) + + def prepare_test(self, idx): + """Prepare the frames for testing given the index.""" + results = copy.deepcopy(self.info[idx % self._ori_len]) + results['modality'] = self.modality + results['start_index'] = self.start_index + + return self.pipeline(results) + + def __len__(self): + """get the size of the dataset.""" + return len(self.info) * self.repeat_times diff --git a/docs/src/paddlevideo/loader/dataset/ucf24_dataset.py b/docs/src/paddlevideo/loader/dataset/ucf24_dataset.py new file mode 100644 index 000000000..ad2e84e2a --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/ucf24_dataset.py @@ -0,0 +1,76 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class UCF24Dataset(BaseDataset): + """Dataset for YOWO + The dataset loads raw videos and apply specified transforms on them. + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. 
code-block:: txt + + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + + def __init__(self, file_path, pipeline, num_retries=5, **kwargs): + self.num_retries = num_retries + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + lines = fin.readlines() + for line in lines: + line = line.strip() # 'data/ucf24/labels/class_name/video_name/key_frame.txt' + filename = line.replace('txt', 'jpg').replace( + 'labels', 'rgb-images') # key frame path + + info.append(dict(filename=filename)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + im_path = results['filename'] + im_path = im_path.replace('jpg', 'txt') + im_split = im_path.split('/') + frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5] + return results['imgs'], np.array([results['labels']]), frame_index + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + # Try to catch Exception caused by reading corrupted video file + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + im_path = results['filename'] + im_path = im_path.replace('jpg', 'txt') + im_split = im_path.split('/') + frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5] + return results['imgs'], np.array([results['labels']]), frame_index diff --git a/docs/src/paddlevideo/loader/dataset/video.py b/docs/src/paddlevideo/loader/dataset/video.py new file mode 100644 index 000000000..f2d8f897a --- /dev/null +++ b/docs/src/paddlevideo/loader/dataset/video.py @@ -0,0 +1,95 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class VideoDataset(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. 
+ """ + def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + filename, labels = line_split + #TODO(hj): Required suffix format: may mp4/avi/wmv + filename = filename + self.suffix + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + info.append(dict(filename=filename, labels=int(labels))) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) diff --git a/docs/src/paddlevideo/loader/pipelines/__init__.py b/docs/src/paddlevideo/loader/pipelines/__init__.py new file mode 100644 index 000000000..6e6afdc53 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat +from .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip, + GroupResize, Image2Array, JitterScale, MultiCrop, + Normalization, PackOutput, RandomCrop, RandomFlip, + RandomResizedCrop, Scale, TenCrop, ToArray, + UniformCrop, RandomGamma, MultiCenterCrop, + RandomBrightness, RandomHue, RandomSaturation, YowoAug) +from .augmentations_ava import * +from .compose import Compose +from .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder +from .decode_image import ImageDecoder +from .decode_sampler import DecodeSampler +from .mix import Cutmix, Mixup, VideoMix +from .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize +from .sample import Sampler, SamplerPkl +from .sample_ava import * +from .segmentation import MultiNorm, MultiRestrictSize +from .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm +from .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation +from .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact, + RandomResizedCrop_V2, Flip_V2, CenterCrop_V2, + GeneratePoseTarget, FormatShape, Collect) +from .decode_sampler_MRI import SFMRI_DecodeSampler +from .segmentation_pipline import SegmentationSampler +from .sample_ucf24 import SamplerUCF24 + +__all__ = [ + 'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize', + 'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose', + 'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale', + 'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput', + 'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop', + 'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix', + 'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap', + 'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize', + 'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler', + 'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation', + 'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue', + 'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact', + 'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'GeneratePoseTarget', + 'FormatShape', 'Collect', 'RandomSaturation', 'SamplerUCF24', 'YowoAug' +] diff --git a/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py b/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py new file mode 100644 index 000000000..210d733b7 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/anet_pipeline.py @@ -0,0 +1,150 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +from ..registry import PIPELINES +"""pipeline ops for Activity Net. 
+""" + + +@PIPELINES.register() +class LoadFeat(object): + def __init__(self, feat_path): + self.feat_path = feat_path + + def __call__(self, results): + video_name = results['video_name'] + file_name = video_name + ".npy" + file_path = os.path.join(self.feat_path, file_name) + #TODO: check path + video_feat = np.load(file_path) + video_feat = video_feat.T + video_feat = video_feat.astype("float32") + results['video_feat'] = video_feat + return results + + +@PIPELINES.register() +class GetMatchMap(object): + def __init__(self, tscale): + self.tscale = tscale + self.tgap = 1. / self.tscale + + def __call__(self, results): + match_map = [] + for idx in range(self.tscale): + tmp_match_window = [] + xmin = self.tgap * idx + for jdx in range(1, self.tscale + 1): + xmax = xmin + self.tgap * jdx + tmp_match_window.append([xmin, xmax]) + match_map.append(tmp_match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + + anchor_xmin = [self.tgap * i for i in range(self.tscale)] + anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)] + + results['match_map'] = match_map + results['anchor_xmin'] = anchor_xmin + results['anchor_xmax'] = anchor_xmax + return results + + +@PIPELINES.register() +class GetVideoLabel(object): + def __init__(self, tscale, dscale, datatype="float32"): + self.tscale = tscale + self.dscale = dscale + self.tgap = 1. / self.tscale + self.datatype = datatype + + def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max): + """Compute jaccard score between a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) + union_len = len_anchors - inter_len + box_max - box_min + jaccard = np.divide(inter_len, union_len) + return jaccard + + def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max): + """Compute intersection between score a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) 
+ scores = np.divide(inter_len, len_anchors) + return scores + + def __call__(self, results): + video_info = results['video_info'] + match_map = results['match_map'] + anchor_xmin = results['anchor_xmin'] + anchor_xmax = results['anchor_xmax'] + + video_second = video_info['duration_second'] + video_labels = video_info['annotations'] + + gt_bbox = [] + gt_iou_map = [] + for gt in video_labels: + tmp_start = max(min(1, gt["segment"][0] / video_second), 0) + tmp_end = max(min(1, gt["segment"][1] / video_second), 0) + gt_bbox.append([tmp_start, tmp_end]) + tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0], + match_map[:, 1], tmp_start, + tmp_end) + tmp_gt_iou_map = np.reshape(tmp_gt_iou_map, + [self.dscale, self.tscale]) + gt_iou_map.append(tmp_gt_iou_map) + gt_iou_map = np.array(gt_iou_map) + gt_iou_map = np.max(gt_iou_map, axis=0) + + gt_bbox = np.array(gt_bbox) + gt_xmins = gt_bbox[:, 0] + gt_xmaxs = gt_bbox[:, 1] + gt_len_small = 3 * self.tgap + gt_start_bboxs = np.stack( + (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1) + gt_end_bboxs = np.stack( + (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1) + + match_score_start = [] + for jdx in range(len(anchor_xmin)): + match_score_start.append( + np.max( + self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], + gt_start_bboxs[:, 0], + gt_start_bboxs[:, 1]))) + match_score_end = [] + for jdx in range(len(anchor_xmin)): + match_score_end.append( + np.max( + self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], + gt_end_bboxs[:, 0], gt_end_bboxs[:, + 1]))) + + gt_start = np.array(match_score_start) + gt_end = np.array(match_score_end) + + results['gt_iou_map'] = gt_iou_map.astype(self.datatype) + results['gt_start'] = gt_start.astype(self.datatype) + results['gt_end'] = gt_end.astype(self.datatype) + return results diff --git a/docs/src/paddlevideo/loader/pipelines/augmentations.py b/docs/src/paddlevideo/loader/pipelines/augmentations.py new file mode 100644 index 000000000..24f3c716d --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/augmentations.py @@ -0,0 +1,1427 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +from collections.abc import Sequence + +import cv2 +import numpy as np +import paddle +import paddle.nn.functional as F +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class Scale(object): + """ + Scale images. + Args: + short_size(float | int): Short size of an image will be scaled to the short_size. + fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True + do_round(bool): Whether to round up when calculating the zoom ratio. default: False + backend(str): Choose pillow or cv2 as the graphics processing backend. 
default: 'pillow' + """ + def __init__(self, + short_size, + fixed_ratio=True, + keep_ratio=None, + do_round=False, + backend='pillow'): + self.short_size = short_size + assert (fixed_ratio and not keep_ratio) or (not fixed_ratio), \ + f"fixed_ratio and keep_ratio cannot be true at the same time" + self.fixed_ratio = fixed_ratio + self.keep_ratio = keep_ratio + self.do_round = do_round + + assert backend in [ + 'pillow', 'cv2' + ], f"Scale's backend must be pillow or cv2, but get {backend}" + self.backend = backend + + def __call__(self, results): + """ + Performs resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + imgs = results['imgs'] + resized_imgs = [] + for i in range(len(imgs)): + img = imgs[i] + if isinstance(img, np.ndarray): + h, w, _ = img.shape + elif isinstance(img, Image.Image): + w, h = img.size + else: + raise NotImplementedError + if (w <= h and w == self.short_size) or (h <= w + and h == self.short_size): + if self.backend == 'pillow' and not isinstance( + img, Image.Image): + img = Image.fromarray(img) + resized_imgs.append(img) + continue + + if w <= h: + ow = self.short_size + if self.fixed_ratio: + oh = int(self.short_size * 4.0 / 3.0) + elif self.keep_ratio is False: + oh = self.short_size + else: + scale_factor = self.short_size / w + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else int(h * + self.short_size / w) + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else self.short_size + else: + oh = self.short_size + if self.fixed_ratio: + ow = int(self.short_size * 4.0 / 3.0) + elif self.keep_ratio is False: + ow = self.short_size + else: + scale_factor = self.short_size / h + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else self.short_size + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else int(w * + self.short_size / h) + if self.backend == 'pillow': + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + elif self.backend == 'cv2' and (self.keep_ratio is not None): + resized_imgs.append( + cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR)) + else: + resized_imgs.append( + Image.fromarray( + cv2.resize(np.asarray(img), (ow, oh), + interpolation=cv2.INTER_LINEAR))) + results['imgs'] = resized_imgs + return results + + +@PIPELINES.register() +class RandomCrop(object): + """ + Random crop images. + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + h, w = imgs.shape[2:] + else: + w, h = imgs[0].size + th, tw = self.target_size, self.target_size + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + + crop_images = [] + if 'backend' in results and results['backend'] == 'pyav': + x1 = np.random.randint(0, w - tw) + y1 = np.random.randint(0, h - th) + crop_images = imgs[:, :, y1:y1 + th, x1:x1 + tw] # [C, T, th, tw] + else: + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + for img in imgs: + if w == tw and h == th: + crop_images.append(img) + else: + crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = crop_images + return results + + +@PIPELINES.register() +class RandomResizedCrop(RandomCrop): + def __init__(self, + area_range=(0.08, 1.0), + aspect_ratio_range=(3 / 4, 4 / 3), + target_size=224, + backend='cv2'): + + self.area_range = area_range + self.aspect_ratio_range = aspect_ratio_range + self.target_size = target_size + self.backend = backend + + @staticmethod + def get_crop_bbox(img_shape, + area_range, + aspect_ratio_range, + max_attempts=10): + + assert 0 < area_range[0] <= area_range[1] <= 1 + assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1] + + img_h, img_w = img_shape + area = img_h * img_w + + min_ar, max_ar = aspect_ratio_range + aspect_ratios = np.exp( + np.random.uniform(np.log(min_ar), np.log(max_ar), + size=max_attempts)) + target_areas = np.random.uniform(*area_range, size=max_attempts) * area + candidate_crop_w = np.round(np.sqrt(target_areas * + aspect_ratios)).astype(np.int32) + candidate_crop_h = np.round(np.sqrt(target_areas / + aspect_ratios)).astype(np.int32) + + for i in range(max_attempts): + crop_w = candidate_crop_w[i] + crop_h = candidate_crop_h[i] + if crop_h <= img_h and crop_w <= img_w: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h + + # Fallback + crop_size = min(img_h, img_w) + x_offset = (img_w - crop_size) // 2 + y_offset = (img_h - crop_size) // 2 + return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size + + def __call__(self, results): + imgs = results['imgs'] + if self.backend == 'pillow': + img_w, img_h = imgs[0].size + elif self.backend == 'cv2': + img_h, img_w, _ = imgs[0].shape + elif self.backend == 'pyav': + img_h, img_w = imgs.shape[2:] # [cthw] + else: + raise NotImplementedError + + left, top, right, bottom = self.get_crop_bbox( + (img_h, img_w), self.area_range, self.aspect_ratio_range) + + if self.backend == 'pillow': + img_w, img_h = imgs[0].size + imgs = [img.crop(left, top, right, bottom) for img in imgs] + elif self.backend == 'cv2': + img_h, img_w, _ = imgs[0].shape + imgs = [img[top:bottom, left:right] for img in imgs] + elif self.backend == 'pyav': + img_h, img_w = imgs.shape[2:] # [cthw] + imgs = imgs[:, :, top:bottom, left:right] + else: + raise NotImplementedError + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class CenterCrop(object): + """ + Center crop images. + Args: + target_size(int): Center crop a square with the target_size from an image. + do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area. 
default: True + """ + def __init__(self, target_size, do_round=True, backend='pillow'): + self.target_size = target_size + self.do_round = do_round + self.backend = backend + + def __call__(self, results): + """ + Performs Center crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + ccrop_imgs: List where each item is a PIL.Image after Center crop. + """ + imgs = results['imgs'] + ccrop_imgs = [] + th, tw = self.target_size, self.target_size + if isinstance(imgs, paddle.Tensor): + h, w = imgs.shape[-2:] + x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2 + y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2 + ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw] + else: + for img in imgs: + if self.backend == 'pillow': + w, h = img.size + elif self.backend == 'cv2': + h, w, _ = img.shape + else: + raise NotImplementedError + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + x1 = int(round( + (w - tw) / 2.0)) if self.do_round else (w - tw) // 2 + y1 = int(round( + (h - th) / 2.0)) if self.do_round else (h - th) // 2 + if self.backend == 'cv2': + ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw]) + elif self.backend == 'pillow': + ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = ccrop_imgs + return results + + +@PIPELINES.register() +class MultiScaleCrop(object): + """ + Random crop images in with multiscale sizes + Args: + target_size(int): Random crop a square with the target_size from an image. + scales(int): List of candidate cropping scales. + max_distort(int): Maximum allowable deformation combination distance. + fix_crop(int): Whether to fix the cutting start point. + allow_duplication(int): Whether to allow duplicate candidate crop starting points. + more_fix_crop(int): Whether to allow more cutting starting points. + """ + def __init__( + self, + target_size, # NOTE: named target size now, but still pass short size in it! + scales=None, + max_distort=1, + fix_crop=True, + allow_duplication=False, + more_fix_crop=True, + backend='pillow'): + + self.target_size = target_size + self.scales = scales if scales else [1, .875, .75, .66] + self.max_distort = max_distort + self.fix_crop = fix_crop + self.allow_duplication = allow_duplication + self.more_fix_crop = more_fix_crop + assert backend in [ + 'pillow', 'cv2' + ], f"MultiScaleCrop's backend must be pillow or cv2, but get {backend}" + self.backend = backend + + def __call__(self, results): + """ + Performs MultiScaleCrop operations. + Args: + imgs: List where wach item is a PIL.Image. 
+ XXX: + results: + + """ + imgs = results['imgs'] + + input_size = [self.target_size, self.target_size] + + im_size = imgs[0].size + + # get random crop offset + def _sample_crop_size(im_size): + image_w, image_h = im_size[0], im_size[1] + + base_size = min(image_w, image_h) + crop_sizes = [int(base_size * x) for x in self.scales] + crop_h = [ + input_size[1] if abs(x - input_size[1]) < 3 else x + for x in crop_sizes + ] + crop_w = [ + input_size[0] if abs(x - input_size[0]) < 3 else x + for x in crop_sizes + ] + + pairs = [] + for i, h in enumerate(crop_h): + for j, w in enumerate(crop_w): + if abs(i - j) <= self.max_distort: + pairs.append((w, h)) + crop_pair = random.choice(pairs) + if not self.fix_crop: + w_offset = random.randint(0, image_w - crop_pair[0]) + h_offset = random.randint(0, image_h - crop_pair[1]) + else: + w_step = (image_w - crop_pair[0]) / 4 + h_step = (image_h - crop_pair[1]) / 4 + + ret = list() + ret.append((0, 0)) # upper left + if self.allow_duplication or w_step != 0: + ret.append((4 * w_step, 0)) # upper right + if self.allow_duplication or h_step != 0: + ret.append((0, 4 * h_step)) # lower left + if self.allow_duplication or (h_step != 0 and w_step != 0): + ret.append((4 * w_step, 4 * h_step)) # lower right + if self.allow_duplication or (h_step != 0 or w_step != 0): + ret.append((2 * w_step, 2 * h_step)) # center + + if self.more_fix_crop: + ret.append((0, 2 * h_step)) # center left + ret.append((4 * w_step, 2 * h_step)) # center right + ret.append((2 * w_step, 4 * h_step)) # lower center + ret.append((2 * w_step, 0 * h_step)) # upper center + + ret.append((1 * w_step, 1 * h_step)) # upper left quarter + ret.append((3 * w_step, 1 * h_step)) # upper right quarter + ret.append((1 * w_step, 3 * h_step)) # lower left quarter + ret.append((3 * w_step, 3 * h_step)) # lower righ quarter + + w_offset, h_offset = random.choice(ret) + + return crop_pair[0], crop_pair[1], w_offset, h_offset + + crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size) + crop_img_group = [ + img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) + for img in imgs + ] + if self.backend == 'pillow': + ret_img_group = [ + img.resize((input_size[0], input_size[1]), Image.BILINEAR) + for img in crop_img_group + ] + else: + ret_img_group = [ + Image.fromarray( + cv2.resize(np.asarray(img), + dsize=(input_size[0], input_size[1]), + interpolation=cv2.INTER_LINEAR)) + for img in crop_img_group + ] + results['imgs'] = ret_img_group + return results + + +@PIPELINES.register() +class RandomFlip(object): + """ + Random Flip images. + Args: + p(float): Random flip images with the probability p. + """ + def __init__(self, p=0.5): + self.p = p + + def __call__(self, results): + """ + Performs random flip operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + flip_imgs: List where each item is a PIL.Image after random flip. + """ + imgs = results['imgs'] + v = random.random() + if v < self.p: + if isinstance(imgs, paddle.Tensor): + results['imgs'] = paddle.flip(imgs, axis=[3]) + elif isinstance(imgs[0], np.ndarray): + results['imgs'] = [cv2.flip(img, 1, img) for img in imgs + ] # [[h,w,c], [h,w,c], ..., [h,w,c]] + else: + results['imgs'] = [ + img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs + ] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomBrightness(object): + """ + Random Brightness images. 
+ Args: + p(float): Random brightness images with the probability p. + """ + def __init__(self, p=0.1, brightness=1): + self.p = p + self.brightness = brightness + + def __call__(self, results): + """ + Performs random brightness operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + brightness_imgs: List where each item is a PIL.Image after random brightness. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(brightness=self.brightness) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomSaturation(object): + """ + Random Saturation images. + Args: + p(float): Random saturation images with the probability p. + """ + def __init__(self, p=0.1, saturation=2): + self.p = p + self.saturation = saturation + + def __call__(self, results): + """ + Performs random saturation operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + saturation_imgs: List where each item is a PIL.Image after random saturation. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(saturation=self.saturation) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomHue(object): + """ + Random Hue images. + Args: + p(float): Random hue images with the probability p. + """ + def __init__(self, p=0.1, hue=0.5): + self.p = p + self.hue = hue + + def __call__(self, results): + """ + Performs random hue operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + hue_imgs: List where each item is a PIL.Image after random hue. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(hue=self.hue) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomGamma(object): + """ + Random Gamma images. + Args: + p(float): Random gamma images with the probability p. + gamma (float): Non negative real number, same as `\\gamma` in the equation. + gamma larger than 1 make the shadows darker, + while gamma smaller than 1 make dark regions lighter. + """ + def __init__(self, p=0.1, gamma=0.2): + self.p = p + self.value = [1 - gamma, 1 + gamma] + self.value[0] = max(self.value[0], 0) + + def _adust_gamma(self, img, gamma, gain=1.0): + flag = False + if isinstance(img, np.ndarray): + flag = True + img = Image.fromarray(img) + input_mode = img.mode + img = img.convert("RGB") + gamma_map = [ + int((255 + 1 - 1e-3) * gain * pow(ele / 255.0, gamma)) + for ele in range(256) + ] * 3 + img = img.point( + gamma_map) # use PIL's point-function to accelerate this part + img = img.convert(input_mode) + if flag: + img = np.array(img) + return img + + def __call__(self, results): + """ + Performs random gamma operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + gamma_imgs: List where each item is a PIL.Image after random gamma. 
+ """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + gamma = random.uniform(self.value[0], self.value[1]) + results['imgs'] = [self._adust_gamma(img, gamma) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class Image2Array(object): + """ + transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'. + Args: + transpose: whether to transpose or not, default True, False for slowfast. + """ + def __init__(self, transpose=True, data_format='tchw'): + assert data_format in [ + 'tchw', 'cthw' + ], f"Target format must in ['tchw', 'cthw'], but got {data_format}" + self.transpose = transpose + self.data_format = data_format + + def __call__(self, results): + """ + Performs Image to NumpyArray operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + np_imgs: Numpy array. + """ + imgs = results['imgs'] + if 'backend' in results and results[ + 'backend'] == 'pyav': # [T,H,W,C] in [0, 1] + if self.transpose: + if self.data_format == 'tchw': + t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw + else: + t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw + results['imgs'] = t_imgs + else: + t_imgs = np.stack(imgs).astype('float32') + if self.transpose: + if self.data_format == 'tchw': + t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw + else: + t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw + results['imgs'] = t_imgs + return results + + +@PIPELINES.register() +class Normalization(object): + """ + Normalization. + Args: + mean(Sequence[float]): mean values of different channels. + std(Sequence[float]): std values of different channels. + tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3] + """ + def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False): + if not isinstance(mean, Sequence): + raise TypeError( + f'Mean must be list, tuple or np.ndarray, but got {type(mean)}') + if not isinstance(std, Sequence): + raise TypeError( + f'Std must be list, tuple or np.ndarray, but got {type(std)}') + + self.inplace = inplace + if not inplace: + self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32) + self.std = np.array(std).reshape(tensor_shape).astype(np.float32) + else: + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def __call__(self, results): + """ + Performs normalization operations. + Args: + imgs: Numpy array. + return: + np_imgs: Numpy array after normalization. + """ + if self.inplace: + n = len(results['imgs']) + h, w, c = results['imgs'][0].shape + norm_imgs = np.empty((n, h, w, c), dtype=np.float32) + for i, img in enumerate(results['imgs']): + norm_imgs[i] = img + + for img in norm_imgs: # [n,h,w,c] + mean = np.float64(self.mean.reshape(1, -1)) # [1, 3] + stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3] + cv2.subtract(img, mean, img) + cv2.multiply(img, stdinv, img) + else: + imgs = results['imgs'] + norm_imgs = imgs / 255.0 + norm_imgs -= self.mean + norm_imgs /= self.std + if 'backend' in results and results['backend'] == 'pyav': + norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32) + results['imgs'] = norm_imgs + return results + + +@PIPELINES.register() +class JitterScale(object): + """ + Scale image, while the target short size is randomly select between min_size and max_size. + Args: + min_size: Lower bound for random sampler. + max_size: Higher bound for random sampler. 
+ """ + def __init__(self, + min_size, + max_size, + short_cycle_factors=[0.5, 0.7071], + default_min_size=256): + self.default_min_size = default_min_size + self.orig_min_size = self.min_size = min_size + self.max_size = max_size + self.short_cycle_factors = short_cycle_factors + + def __call__(self, results): + """ + Performs jitter resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.min_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_min_size)) + else: + self.min_size = self.orig_min_size + + imgs = results['imgs'] + size = int(round(np.random.uniform(self.min_size, self.max_size))) + assert (len(imgs) >= 1), \ + "len(imgs):{} should be larger than 1".format(len(imgs)) + + if 'backend' in results and results['backend'] == 'pyav': + height, width = imgs.shape[2:] + else: + width, height = imgs[0].size + if (width <= height and width == size) or (height <= width + and height == size): + return results + + new_width = size + new_height = size + if width < height: + new_height = int(math.floor((float(height) / width) * size)) + else: + new_width = int(math.floor((float(width) / height) * size)) + + if 'backend' in results and results['backend'] == 'pyav': + frames_resize = F.interpolate(imgs, + size=(new_height, new_width), + mode="bilinear", + align_corners=False) # [c,t,h,w] + else: + frames_resize = [] + for j in range(len(imgs)): + img = imgs[j] + scale_img = img.resize((new_width, new_height), Image.BILINEAR) + frames_resize.append(scale_img) + + results['imgs'] = frames_resize + return results + + +@PIPELINES.register() +class MultiCenterCrop(object): + """ + center crop, left center crop right center crop + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + h, w = imgs.shape[2:] + else: + w, h = imgs[0].size + th, tw = self.target_size, self.target_size + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + + crop_images = [] + #just for tensor + crop_imgs_center = [] + crop_imgs_left = [] + crop_imgs_right = [] + if 'backend' in results and results['backend'] == 'pyav': + #center_corp + x1 = 0 + if w > self.target_size: + x1 = int((w - self.target_size) / 2.0) + y1 = 0 + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_center = imgs[:, :, y1:y1 + th, + x1:x1 + tw].numpy() # [C, T, th, tw] + #left_crop + x1 = 0 + y1 = 0 + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_left = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy() + #right_crop + x1 = 0 + y1 = 0 + if w > self.target_size: + x1 = w - self.target_size + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_right = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy() + crop_imgs = np.concatenate( + (crop_imgs_center, crop_imgs_left, crop_imgs_right), axis=1) + crop_images = paddle.to_tensor(crop_imgs) + + else: + x1 = 0 + if w > self.target_size: + x1 = random.randint(0, w - tw) + y1 = 0 + if h > self.target_size: + y1 = random.randint(0, h - th) + for img in imgs: + if w == tw and h == th: + crop_images.append(img) + else: + crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = crop_images + return results + + +@PIPELINES.register() +class MultiCrop(object): + """ + Random crop image. + This operation can perform multi-crop during multi-clip test, as in slowfast model. + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, + target_size, + default_crop_size=224, + short_cycle_factors=[0.5, 0.7071], + test_mode=False): + self.orig_target_size = self.target_size = target_size + self.short_cycle_factors = short_cycle_factors + self.default_crop_size = default_crop_size + self.test_mode = test_mode + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + spatial_sample_index = results['spatial_sample_index'] + spatial_num_clips = results['spatial_num_clips'] + + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.target_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_crop_size)) + else: + self.target_size = self.orig_target_size # use saved value before call + + w, h = imgs[0].size + if w == self.target_size and h == self.target_size: + return results + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size({},{})".format(w, h, self.target_size, self.target_size) + frames_crop = [] + if not self.test_mode: + x_offset = random.randint(0, w - self.target_size) + y_offset = random.randint(0, h - self.target_size) + else: # multi-crop + x_gap = int( + math.ceil((w - self.target_size) / (spatial_num_clips - 1))) + y_gap = int( + math.ceil((h - self.target_size) / (spatial_num_clips - 1))) + if h > w: + x_offset = int(math.ceil((w - self.target_size) / 2)) + if spatial_sample_index == 0: + y_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + y_offset = h - self.target_size + else: + y_offset = y_gap * spatial_sample_index + else: + y_offset = int(math.ceil((h - self.target_size) / 2)) + if spatial_sample_index == 0: + x_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + x_offset = w - self.target_size + else: + x_offset = x_gap * spatial_sample_index + + for img in imgs: + nimg = img.crop((x_offset, y_offset, x_offset + self.target_size, + y_offset + self.target_size)) + frames_crop.append(nimg) + results['imgs'] = frames_crop + return results + + +@PIPELINES.register() +class PackOutput(object): + """ + In slowfast model, we want to get slow pathway from fast pathway based on + alpha factor. + Args: + alpha(int): temporal length of fast/slow + """ + def __init__(self, alpha): + self.alpha = alpha + + def __call__(self, results): + fast_pathway = results['imgs'] + + # sample num points between start and end + slow_idx_start = 0 + slow_idx_end = fast_pathway.shape[0] - 1 + slow_idx_num = fast_pathway.shape[0] // self.alpha + slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end, + slow_idx_num).astype("int64") + slow_pathway = fast_pathway[slow_idxs_select] + + # T H W C -> C T H W. 
+ slow_pathway = slow_pathway.transpose(3, 0, 1, 2) + fast_pathway = fast_pathway.transpose(3, 0, 1, 2) + + # slow + fast + frames_list = [slow_pathway, fast_pathway] + results['imgs'] = frames_list + return results + + +@PIPELINES.register() +class GroupFullResSample(object): + def __init__(self, crop_size, flip=False): + self.crop_size = crop_size if not isinstance(crop_size, int) else ( + crop_size, crop_size) + self.flip = flip + + def __call__(self, results): + img_group = results['imgs'] + + image_w, image_h = img_group[0].size + crop_w, crop_h = self.crop_size + + w_step = (image_w - crop_w) // 4 + h_step = (image_h - crop_h) // 4 + + offsets = list() + offsets.append((0 * w_step, 2 * h_step)) # left + offsets.append((4 * w_step, 2 * h_step)) # right + offsets.append((2 * w_step, 2 * h_step)) # center + + oversample_group = list() + for o_w, o_h in offsets: + normal_group = list() + flip_group = list() + for i, img in enumerate(img_group): + crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) + normal_group.append(crop) + if self.flip: + flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) + flip_group.append(flip_crop) + + oversample_group.extend(normal_group) + if self.flip: + oversample_group.extend(flip_group) + + results['imgs'] = oversample_group + return results + + +@PIPELINES.register() +class TenCrop: + """ + Crop out 5 regions (4 corner points + 1 center point) from the picture, + and then flip the cropping result to get 10 cropped images, which can make the prediction result more robust. + Args: + target_size(int | tuple[int]): (w, h) of target size for crop. + """ + def __init__(self, target_size): + self.target_size = (target_size, target_size) + + def __call__(self, results): + imgs = results['imgs'] + img_w, img_h = imgs[0].size + crop_w, crop_h = self.target_size + w_step = (img_w - crop_w) // 4 + h_step = (img_h - crop_h) // 4 + offsets = [ + (0, 0), + (4 * w_step, 0), + (0, 4 * h_step), + (4 * w_step, 4 * h_step), + (2 * w_step, 2 * h_step), + ] + img_crops = list() + for x_offset, y_offset in offsets: + crop = [ + img.crop( + (x_offset, y_offset, x_offset + crop_w, y_offset + crop_h)) + for img in imgs + ] + crop_fliped = [ + timg.transpose(Image.FLIP_LEFT_RIGHT) for timg in crop + ] + img_crops.extend(crop) + img_crops.extend(crop_fliped) + + results['imgs'] = img_crops + return results + + +@PIPELINES.register() +class UniformCrop: + """ + Perform uniform spatial sampling on the images, + select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions. + Args: + target_size(int | tuple[int]): (w, h) of target size for crop. 
+ """ + def __init__(self, target_size, backend='cv2'): + if isinstance(target_size, tuple): + self.target_size = target_size + elif isinstance(target_size, int): + self.target_size = (target_size, target_size) + else: + raise TypeError( + f'target_size must be int or tuple[int], but got {type(target_size)}' + ) + self.backend = backend + + def __call__(self, results): + + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + img_h, img_w = imgs.shape[2:] + elif self.backend == 'pillow': + img_w, img_h = imgs[0].size + else: + img_h, img_w = imgs[0].shape[:2] + + crop_w, crop_h = self.target_size + if crop_h == img_h: + w_step = (img_w - crop_w) // 2 + offsets = [ + (0, 0), + (w_step * 2, 0), + (w_step, 0), + ] + elif crop_w == img_w: + h_step = (img_h - crop_h) // 2 + offsets = [ + (0, 0), + (0, h_step * 2), + (0, h_step), + ] + else: + raise ValueError( + f"img_w({img_w}) == crop_w({crop_w}) or img_h({img_h}) == crop_h({crop_h})" + ) + img_crops = [] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + for x_offset, y_offset in offsets: + crop = imgs[:, :, y_offset:y_offset + crop_h, + x_offset:x_offset + crop_w] + img_crops.append(crop) + img_crops = paddle.concat(img_crops, axis=1) + else: + if self.backend == 'pillow': + for x_offset, y_offset in offsets: + crop = [ + img.crop((x_offset, y_offset, x_offset + crop_w, + y_offset + crop_h)) for img in imgs + ] + img_crops.extend(crop) + else: + for x_offset, y_offset in offsets: + crop = [ + img[y_offset:y_offset + crop_h, + x_offset:x_offset + crop_w] for img in imgs + ] + img_crops.extend(crop) + results['imgs'] = img_crops + return results + + +@PIPELINES.register() +class GroupResize(object): + def __init__(self, height, width, scale, K, mode='train'): + self.height = height + self.width = width + self.scale = scale + self.resize = {} + self.K = np.array(K, dtype=np.float32) + self.mode = mode + for i in range(self.scale): + s = 2**i + self.resize[i] = paddle.vision.transforms.Resize( + (self.height // s, self.width // s), interpolation='lanczos') + + def __call__(self, results): + if self.mode == 'infer': + imgs = results['imgs'] + for k in list(imgs): # ("color", 0, -1) + if "color" in k or "color_n" in k: + n, im, _ = k + for i in range(self.scale): + imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)]) + else: + imgs = results['imgs'] + for scale in range(self.scale): + K = self.K.copy() + + K[0, :] *= self.width // (2**scale) + K[1, :] *= self.height // (2**scale) + + inv_K = np.linalg.pinv(K) + imgs[("K", scale)] = K + imgs[("inv_K", scale)] = inv_K + + for k in list(imgs): + if "color" in k or "color_n" in k: + n, im, i = k + for i in range(self.scale): + imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)]) + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class ColorJitter(object): + """Randomly change the brightness, contrast, saturation and hue of an image. + """ + def __init__(self, + brightness=0, + contrast=0, + saturation=0, + hue=0, + mode='train', + p=0.5, + keys=None): + self.mode = mode + self.colorjitter = paddle.vision.transforms.ColorJitter( + brightness, contrast, saturation, hue) + self.p = p + + def __call__(self, results): + """ + Args: + results (PIL Image): Input image. + + Returns: + PIL Image: Color jittered image. 
+ """ + + do_color_aug = random.random() > self.p + imgs = results['imgs'] + for k in list(imgs): + f = imgs[k] + if "color" in k or "color_n" in k: + n, im, i = k + imgs[(n, im, i)] = f + if do_color_aug: + imgs[(n + "_aug", im, i)] = self.colorjitter(f) + else: + imgs[(n + "_aug", im, i)] = f + if self.mode == "train": + for i in results['frame_idxs']: + del imgs[("color", i, -1)] + del imgs[("color_aug", i, -1)] + del imgs[("color_n", i, -1)] + del imgs[("color_n_aug", i, -1)] + else: + for i in results['frame_idxs']: + del imgs[("color", i, -1)] + del imgs[("color_aug", i, -1)] + + results['img'] = imgs + return results + + +@PIPELINES.register() +class GroupRandomFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, results): + + imgs = results['imgs'] + do_flip = random.random() > self.p + if do_flip: + for k in list(imgs): + if "color" in k or "color_n" in k: + n, im, i = k + imgs[(n, im, + i)] = imgs[(n, im, + i)].transpose(Image.FLIP_LEFT_RIGHT) + if "depth_gt" in imgs: + imgs['depth_gt'] = np.array(np.fliplr(imgs['depth_gt'])) + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class ToArray(object): + def __init__(self): + pass + + def __call__(self, results): + imgs = results['imgs'] + for k in list(imgs): + if "color" in k or "color_n" in k or "color_aug" in k or "color_n_aug" in k: + n, im, i = k + imgs[(n, im, + i)] = np.array(imgs[(n, im, i)]).astype('float32') / 255.0 + imgs[(n, im, i)] = imgs[(n, im, i)].transpose((2, 0, 1)) + if "depth_gt" in imgs: + imgs['depth_gt'] = np.array(imgs['depth_gt']).astype('float32') + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class YowoAug(object): + def __init__(self, target_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5, valid_mode=False): + self.shape = (target_size, target_size) + self.jitter = jitter + self.hue = hue + self.saturation = saturation + self.exposure = exposure + self.valid_mode = valid_mode + + def _rand_scale(self, s): + scale = random.uniform(1, s) + if (random.randint(1, 10000) % 2): + return scale + return 1. 
/ scale + + def _distort_image(self, im, hue, sat, val): + im = im.convert('HSV') + cs = list(im.split()) + cs[1] = cs[1].point(lambda i: i * sat) + cs[2] = cs[2].point(lambda i: i * val) + + def _change_hue(x): + x += hue * 255 + if x > 255: + x -= 255 + if x < 0: + x += 255 + return x + + cs[0] = cs[0].point(_change_hue) + im = Image.merge(im.mode, tuple(cs)) + + im = im.convert('RGB') + # constrain_image(im) + return im + + def _random_distort_image(self, im, dhue, dsat, dexp): + res = self._distort_image(im, dhue, dsat, dexp) + return res + + def _read_truths_args(self, lab_path, min_box_scale): + truths = np.loadtxt(lab_path) + truths = np.reshape(truths, (truths.size // 5, 5)) + new_truths = [] + for i in range(truths.shape[0]): + cx = (truths[i][1] + truths[i][3]) / (2 * 320) + cy = (truths[i][2] + truths[i][4]) / (2 * 240) + imgw = (truths[i][3] - truths[i][1]) / 320 + imgh = (truths[i][4] - truths[i][2]) / 240 + truths[i][0] = truths[i][0] - 1 + truths[i][1] = cx + truths[i][2] = cy + truths[i][3] = imgw + truths[i][4] = imgh + + if truths[i][3] < min_box_scale: + continue + new_truths.append([truths[i][0], truths[i][1], truths[i][2], truths[i][3], truths[i][4]]) + return np.array(new_truths) + + def _fill_truth_detection(self, labpath, flip, dx, dy, sx, sy): + max_boxes = 50 + label = np.zeros((max_boxes, 5)) + bs = np.loadtxt(labpath) + bs = np.reshape(bs, (-1, 5)) + + for i in range(bs.shape[0]): + cx = (bs[i][1] + bs[i][3]) / (2 * 320) + cy = (bs[i][2] + bs[i][4]) / (2 * 240) + imgw = (bs[i][3] - bs[i][1]) / 320 + imgh = (bs[i][4] - bs[i][2]) / 240 + bs[i][0] = bs[i][0] - 1 + bs[i][1] = cx + bs[i][2] = cy + bs[i][3] = imgw + bs[i][4] = imgh + + cc = 0 + for i in range(bs.shape[0]): + x1 = bs[i][1] - bs[i][3] / 2 + y1 = bs[i][2] - bs[i][4] / 2 + x2 = bs[i][1] + bs[i][3] / 2 + y2 = bs[i][2] + bs[i][4] / 2 + + x1 = min(0.999, max(0, x1 * sx - dx)) + y1 = min(0.999, max(0, y1 * sy - dy)) + x2 = min(0.999, max(0, x2 * sx - dx)) + y2 = min(0.999, max(0, y2 * sy - dy)) + + bs[i][1] = (x1 + x2) / 2 + bs[i][2] = (y1 + y2) / 2 + bs[i][3] = (x2 - x1) + bs[i][4] = (y2 - y1) + + if flip: + bs[i][1] = 0.999 - bs[i][1] + + if bs[i][3] < 0.001 or bs[i][4] < 0.001: + continue + label[cc] = bs[i] + cc += 1 + if cc >= 50: + break + + label = np.reshape(label, (-1)) + return label + + def __call__(self, results): + clip = results['imgs'] + frame_num = len(clip) + oh = clip[0].height + ow = clip[0].width + labpath = results['filename'].replace('jpg', 'txt').replace('rgb-images', 'labels') + if not self.valid_mode: + dw = int(ow * self.jitter) + dh = int(oh * self.jitter) + + pleft = random.randint(-dw, dw) + pright = random.randint(-dw, dw) + ptop = random.randint(-dh, dh) + pbot = random.randint(-dh, dh) + + swidth = ow - pleft - pright + sheight = oh - ptop - pbot + + sx = float(swidth) / ow + sy = float(sheight) / oh + + dx = (float(pleft) / ow) / sx + dy = (float(ptop) / oh) / sy + + flip = random.randint(1, 10000) % 2 + + dhue = random.uniform(-self.hue, self.hue) + dsat = self._rand_scale(self.saturation) + dexp = self._rand_scale(self.exposure) + + # Augment + cropped = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in clip] + + sized = [img.resize(self.shape) for img in cropped] + + if flip: + sized = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in sized] + + clip = [self._random_distort_image(img, dhue, dsat, dexp) for img in sized] + + label = self._fill_truth_detection(labpath, flip, dx, dy, 1. / sx, 1. 
/ sy) + + else: + label = np.zeros([50 * 5]) + tmp = self._read_truths_args(labpath, 8.0 / clip[0].width).astype('float32') + tmp = np.reshape(tmp, [-1]) + tsz = tmp.size + if tsz > 50 * 5: + label = tmp[0:50 * 5] + elif tsz > 0: + label[0:tsz] = tmp + clip = [img.resize(self.shape) for img in clip] + + clip = [np.asarray(img).astype('float32') / 255.0 for img in clip] + clip = np.concatenate(clip, 0).reshape([frame_num, 224, 224, 3]) + clip = np.transpose(clip, [3, 0, 1, 2]) + results['imgs'] = clip + results['labels'] = label + return results diff --git a/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py b/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py new file mode 100644 index 000000000..4f0c43d89 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/augmentations_ava.py @@ -0,0 +1,749 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +import math +from PIL import Image +from ..registry import PIPELINES +from collections.abc import Sequence +import cv2 + +pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING +} + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + + +def _init_lazy_if_proper(results, lazy): + """Initialize lazy operation properly. + + Make sure that a lazy operation is properly initialized, + and avoid a non-lazy operation accidentally getting mixed in. + + Required keys in results are "imgs" if "img_shape" not in results, + otherwise, Required keys in results are "img_shape", add or modified keys + are "img_shape", "lazy". + Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip", + "flip_direction", "interpolation". + + Args: + results (dict): A dict stores data pipeline result. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + if 'img_shape' not in results: + results['img_shape'] = results['imgs'][0].shape[:2] + if lazy: + if 'lazy' not in results: + img_h, img_w = results['img_shape'] + lazyop = dict() + lazyop['original_shape'] = results['img_shape'] + lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h], + dtype=np.float32) + lazyop['flip'] = False + lazyop['flip_direction'] = None + lazyop['interpolation'] = None + results['lazy'] = lazyop + else: + assert 'lazy' not in results, 'Use Fuse after lazy operations' + + +def _scale_size(size, scale): + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + w, h = size + return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5) + + +def rescale_size(old_size, scale, return_scale=False): + """Calculate the new size to be rescaled to. + + Args: + old_size (tuple[int]): The old size (w, h) of image. 
+ scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imresize(img, + size, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image to a given size. """ + h, w = img.shape[:2] + if backend is None: + backend = 'cv2' + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +@PIPELINES.register() +class EntityBoxRescale: + """Rescale the entity box and proposals according to the image shape. + + Required keys are "proposals", "gt_bboxes", added or modified keys are + "gt_bboxes". If original "proposals" is not None, "proposals" and + will be added or modified. + + Args: + scale_factor (np.ndarray): The scale factor used entity_box rescaling. + """ + + def __init__(self, scale_factor): + self.scale_factor = scale_factor + + def __call__(self, results): + scale_factor = np.concatenate([self.scale_factor, self.scale_factor]) + + if 'gt_bboxes' in results: + gt_bboxes = results['gt_bboxes'] + results['gt_bboxes'] = gt_bboxes * scale_factor + + if 'proposals' in results: + proposals = results['proposals'] + if proposals is not None: + assert proposals.shape[1] == 4, ( + 'proposals shape should be in ' + f'(n, 4), but got {proposals.shape}') + results['proposals'] = proposals * scale_factor + + return results + + def __repr__(self): + return f'{self.__class__.__name__}(scale_factor={self.scale_factor})' + + +@PIPELINES.register() +class EntityBoxCrop: + """Crop the entity boxes and proposals according to the cropped images. + + Required keys are "proposals", "gt_bboxes", added or modified keys are + "gt_bboxes". If original "proposals" is not None, "proposals" will be + modified. + + Args: + crop_bbox(np.ndarray | None): The bbox used to crop the original image. 
+ """ + + def __init__(self, crop_bbox): + self.crop_bbox = crop_bbox + + def __call__(self, results): + proposals = results['proposals'] + gt_bboxes = results['gt_bboxes'] + + if self.crop_bbox is None: + return results + + x1, y1, x2, y2 = self.crop_bbox + img_w, img_h = x2 - x1, y2 - y1 + + assert gt_bboxes.shape[-1] == 4 + gt_bboxes_ = gt_bboxes.copy() + gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1) + gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1) + results['gt_bboxes'] = gt_bboxes_ + + if proposals is not None: + assert proposals.shape[-1] == 4 + proposals_ = proposals.copy() + proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0, + img_w - 1) + proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0, + img_h - 1) + results['proposals'] = proposals_ + return results + + def __repr__(self): + return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})' + + +@PIPELINES.register() +class EntityBoxFlip: + """Flip the entity boxes and proposals with a probability. + + Reverse the order of elements in the given bounding boxes and proposals + with a specific direction. The shape of them are preserved, but the + elements are reordered. Only the horizontal flip is supported (seems + vertical flipping makes no sense). Required keys are "proposals", + "gt_bboxes", added or modified keys are "gt_bboxes". If "proposals" + is not None, it will also be modified. + + Args: + img_shape (tuple[int]): The img shape. + """ + + def __init__(self, img_shape): + self.img_shape = img_shape + + def __call__(self, results): + proposals = results['proposals'] + gt_bboxes = results['gt_bboxes'] + img_h, img_w = self.img_shape + + assert gt_bboxes.shape[-1] == 4 + gt_bboxes_ = gt_bboxes.copy() + gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1 + gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1 + if proposals is not None: + assert proposals.shape[-1] == 4 + proposals_ = proposals.copy() + proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1 + proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1 + else: + proposals_ = None + + results['proposals'] = proposals_ + results['gt_bboxes'] = gt_bboxes_ + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})' + return repr_str + + +@PIPELINES.register() +class Resize: + """Resize images to a specific size. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy", + "resize_size". Required keys in "lazy" is None, added or modified key is + "interpolation". + + Args: + scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling + factor or maximum size: + If it is a float number, the image will be rescaled by this + factor, else if it is a tuple of 2 integers, the image will + be rescaled as large as possible within the scale. + Otherwise, it serves as (w, h) of output size. + keep_ratio (bool): If set to True, Images will be resized without + changing the aspect ratio. Otherwise, it will resize images to a + given size. Default: True. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". + lazy (bool): Determine whether to apply lazy operation. Default: False. 
+ """ + + def __init__(self, + scale, + keep_ratio=True, + interpolation='bilinear', + lazy=False): + if isinstance(scale, str): + scale = eval(scale) + if isinstance(scale, float): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + if max_short_edge == -1: + # assign np.inf to long edge for rescaling short edge later. + scale = (np.inf, max_long_edge) + else: + raise TypeError( + f'Scale must be float or tuple of int, but got {type(scale)}') + self.scale = scale + self.keep_ratio = keep_ratio + self.interpolation = interpolation + self.lazy = lazy + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + _init_lazy_if_proper(results, self.lazy) + + if 'scale_factor' not in results: + results['scale_factor'] = np.array([1, 1], dtype=np.float32) + img_h, img_w = results['img_shape'] + + if self.keep_ratio: + new_w, new_h = rescale_size((img_w, img_h), self.scale) + else: + new_w, new_h = self.scale + + self.scale_factor = np.array([new_w / img_w, new_h / img_h], + dtype=np.float32) + results['img_shape'] = (new_h, new_w) + results['keep_ratio'] = self.keep_ratio + results['scale_factor'] = results['scale_factor'] * self.scale_factor + + if not self.lazy: + if 'imgs' in results: + results['imgs'] = [ + imresize( + img, (new_w, new_h), interpolation=self.interpolation) + for img in results['imgs'] + ] + if 'keypoint' in results: + results['keypoint'] = results['keypoint'] * self.scale_factor + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + lazyop['interpolation'] = self.interpolation + + #if 'gt_bboxes' in results: + assert not self.lazy + entity_box_rescale = EntityBoxRescale(self.scale_factor) + results = entity_box_rescale(results) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'scale={self.scale}, keep_ratio={self.keep_ratio}, ' + f'interpolation={self.interpolation}, ' + f'lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class RandomRescale: + """Randomly resize images so that the short_edge is resized to a specific + size in a given range. The scale ratio is unchanged after resizing. + """ + + def __init__(self, scale_range, interpolation='bilinear'): + scale_range = eval(scale_range) + self.scale_range = scale_range + + assert len(scale_range) == 2 + assert scale_range[0] < scale_range[1] + assert np.all([x > 0 for x in scale_range]) + + self.keep_ratio = True + self.interpolation = interpolation + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + short_edge = np.random.randint(self.scale_range[0], + self.scale_range[1] + 1) + resize = Resize((-1, short_edge), + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + + results['short_edge'] = short_edge + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@PIPELINES.register() +class Rescale: + """resize images so that the short_edge is resized to a specific + size in a given range. 
The scale ratio is unchanged after resizing. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size", + "short_edge". + + Args: + scale_range (tuple[int]): The range of short edge length. A closed + interval. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". + """ + + def __init__(self, scale_range, interpolation='bilinear'): + scale_range = eval(scale_range) + self.scale_range = scale_range + + self.keep_ratio = True + self.interpolation = interpolation + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + resize = Resize( + self.scale_range, + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@PIPELINES.register() +class RandomCrop_v2: + """Vanilla square random crop that specifics the output size. + + Required keys in results are "imgs" and "img_shape", added or + modified keys are "imgs", "lazy"; Required keys in "lazy" are "flip", + "crop_bbox", added or modified key is "crop_bbox". + + Args: + size (int): The output size of the images. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, size, lazy=False): + if not isinstance(size, int): + raise TypeError(f'Size must be an int, but got {type(size)}') + self.size = size + self.lazy = lazy + + def __call__(self, results): + """Performs the RandomCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, self.lazy) + + img_h, img_w = results['img_shape'] + assert self.size <= img_h and self.size <= img_w + + y_offset = 0 + x_offset = 0 + if img_h > self.size: + y_offset = int(np.random.randint(0, img_h - self.size)) + if img_w > self.size: + x_offset = int(np.random.randint(0, img_w - self.size)) + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = x_offset / img_w, y_offset / img_h + w_ratio, h_ratio = self.size / img_w, self.size / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_x_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + new_h, new_w = self.size, self.size + + results['crop_bbox'] = np.array( + [x_offset, y_offset, x_offset + new_w, y_offset + new_h]) + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + results['imgs'] = [ + img[y_offset:y_offset + new_h, x_offset:x_offset + new_w] + for img in results['imgs'] + ] + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = x_offset * (lazy_right - lazy_left) / img_w + right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w + top = y_offset * (lazy_bottom - lazy_top) / img_h + bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + # Process entity boxes + if 'gt_bboxes' in results: + assert not self.lazy + entity_box_crop = EntityBoxCrop(results['crop_bbox']) + results = entity_box_crop(results) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(size={self.size}, ' + f'lazy={self.lazy})') + return repr_str + + +def imflip_(img, direction='horizontal'): + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def iminvert(img): + """Invert (negate) an image. + + Args: + img (ndarray): Image to be inverted. + + Returns: + ndarray: The inverted image. + """ + return np.full_like(img, 255) - img + + +@PIPELINES.register() +class Flip: + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "lazy" and "flip_direction". Required keys in "lazy" is + None, added or modified key are "flip" and "flip_direction". 
The Flip + augmentation should be placed after any cropping / reshaping augmentations, + to make sure crop_quadruple is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. ' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.lazy = lazy + + def __call__(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + + if not self.lazy: + if flip: + for i, img in enumerate(results['imgs']): + imflip_(img, self.direction) + lt = len(results['imgs']) + else: + results['imgs'] = list(results['imgs']) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + entity_box_flip = EntityBoxFlip(results['img_shape']) + results = entity_box_flip(results) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'lazy={self.lazy})') + return repr_str + + +def imnormalize_(img, mean, std, to_rgb=True): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + + +@PIPELINES.register() +class Normalize: + """Normalize images with the given mean and std value. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs" and "img_norm_cfg". If modality is 'Flow', additional + keys "scale_factor" is required + + Args: + mean (Sequence[float]): Mean values of different channels. + std (Sequence[float]): Std values of different channels. + to_bgr (bool): Whether to convert channels from RGB to BGR. + Default: False. + adjust_magnitude (bool): Indicate whether to adjust the flow magnitude + on 'scale_factor' when modality is 'Flow'. Default: False. 
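+ Note (illustrative, not from the original docstring): with, e.g.,
+ mean=[123.675, 116.28, 103.53] and std=[58.395, 57.12, 57.375], every frame
+ is normalized in place as (img - mean) / std inside ``imnormalize_``, after
+ an optional channel-order conversion.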
+ """ + + def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False): + if not isinstance(mean, Sequence): + raise TypeError( + f'Mean must be list, tuple or np.ndarray, but got {type(mean)}') + + if not isinstance(std, Sequence): + raise TypeError( + f'Std must be list, tuple or np.ndarray, but got {type(std)}') + + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_bgr = to_bgr + self.adjust_magnitude = adjust_magnitude + + def __call__(self, results): + n = len(results['imgs']) + h, w, c = results['imgs'][0].shape + imgs = np.empty((n, h, w, c), dtype=np.float32) + for i, img in enumerate(results['imgs']): + imgs[i] = img + + for img in imgs: + imnormalize_(img, self.mean, self.std, self.to_bgr) + + results['imgs'] = imgs + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_bgr=self.to_bgr) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'mean={self.mean}, ' + f'std={self.std}, ' + f'to_bgr={self.to_bgr}, ' + f'adjust_magnitude={self.adjust_magnitude})') + return repr_str diff --git a/docs/src/paddlevideo/loader/pipelines/compose.py b/docs/src/paddlevideo/loader/pipelines/compose.py new file mode 100644 index 000000000..76eb4ed4d --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/compose.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Sequence +from ..registry import PIPELINES +import traceback +from ...utils import build +from ...utils import get_logger + + +@PIPELINES.register() +class Compose(object): + """ + Composes several pipelines(include decode func, sample func, and transforms) together. + + Note: To deal with ```list``` type cfg temporaray, like: + + transform: + - Crop: # A list + attribute: 10 + - Resize: # A list + attribute: 20 + + every key of list will pass as the key name to build a module. + XXX: will be improved in the future. + + Args: + pipelines (list): List of transforms to compose. + Returns: + A compose object which is callable, __call__ for this Compose + object will call each given :attr:`transforms` sequencely. + """ + def __init__(self, pipelines): + #assert isinstance(pipelines, Sequence) + self.pipelines = [] + for p in pipelines.values(): + if isinstance(p, dict): + p = build(p, PIPELINES) + self.pipelines.append(p) + elif isinstance(p, list): + for t in p: + #XXX: to deal with old format cfg, ugly code here! 
+ temp_dict = dict(name=list(t.keys())[0]) + for all_sub_t in t.values(): + if all_sub_t is not None: + temp_dict.update(all_sub_t) + + t = build(temp_dict, PIPELINES) + self.pipelines.append(t) + elif callable(p): + self.pipelines.append(p) + else: + raise TypeError(f'pipelines must be callable or a dict,' + f'but got {type(p)}') + def __call__(self, data): + for p in self.pipelines: + try: + data = p(data) + except Exception as e: + stack_info = traceback.format_exc() + logger = get_logger("paddlevideo") + logger.info("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(p, e, str(stack_info))) + raise e + return data diff --git a/docs/src/paddlevideo/loader/pipelines/decode.py b/docs/src/paddlevideo/loader/pipelines/decode.py new file mode 100644 index 000000000..5d138ff6e --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/decode.py @@ -0,0 +1,347 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +try: + import av +except ImportError as e: + print( + f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models." + ) +import cv2 +import pickle +import decord as de +import math +import random +from ..registry import PIPELINES + + +def get_start_end_idx(video_size, clip_size, clip_idx, num_clips): + delta = max(video_size - clip_size, 0) + if clip_idx == -1: # here + # Random temporal sampling. + start_idx = random.uniform(0, delta) + else: # ignore + # Uniformly sample the clip with the given index. + start_idx = delta * clip_idx / num_clips + end_idx = start_idx + clip_size - 1 + return start_idx, end_idx + + +@PIPELINES.register() +class VideoDecoder(object): + """ + Decode mp4 file to frames. + Args: + filepath: the file path of mp4 file + """ + def __init__(self, + backend='cv2', + mode='train', + sampling_rate=32, + num_seg=8, + num_clips=1, + target_fps=30): + + self.backend = backend + # params below only for TimeSformer + self.mode = mode + self.sampling_rate = sampling_rate + self.num_seg = num_seg + self.num_clips = num_clips + self.target_fps = target_fps + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. 
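+ Note (added description): the 'cv2' and 'pyav' backends store decoded RGB
+ arrays in results['frames'], while the 'decord' backend stores the
+ VideoReader container itself for lazy decoding by later samplers.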
+ """ + file_path = results['filename'] + results['format'] = 'video' + results['backend'] = self.backend + + if self.backend == 'cv2': + cap = cv2.VideoCapture(file_path) + videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + sampledFrames = [] + for i in range(videolen): + ret, frame = cap.read() + # maybe first frame is empty + if ret == False: + continue + img = frame[:, :, ::-1] + sampledFrames.append(img) + results['frames'] = sampledFrames + results['frames_len'] = len(sampledFrames) + + elif self.backend == 'decord': + container = de.VideoReader(file_path) + frames_len = len(container) + results['frames'] = container + results['frames_len'] = frames_len + + elif self.backend == 'pyav': # for TimeSformer + if self.mode in ["train", "valid"]: + clip_idx = -1 + elif self.mode in ["test"]: + clip_idx = 0 + else: + raise NotImplementedError + + container = av.open(file_path) + + num_clips = 1 # always be 1 + + # decode process + fps = float(container.streams.video[0].average_rate) + + frames_length = container.streams.video[0].frames + duration = container.streams.video[0].duration + + if duration is None: + # If failed to fetch the decoding information, decode the entire video. + decode_all_video = True + video_start_pts, video_end_pts = 0, math.inf + else: + decode_all_video = False + start_idx, end_idx = get_start_end_idx( + frames_length, + self.sampling_rate * self.num_seg / self.target_fps * fps, + clip_idx, num_clips) + timebase = duration / frames_length + video_start_pts = int(start_idx * timebase) + video_end_pts = int(end_idx * timebase) + + frames = None + # If video stream was found, fetch video frames from the video. + if container.streams.video: + margin = 1024 + seek_offset = max(video_start_pts - margin, 0) + + container.seek(seek_offset, + any_frame=False, + backward=True, + stream=container.streams.video[0]) + tmp_frames = {} + buffer_count = 0 + max_pts = 0 + for frame in container.decode(**{"video": 0}): + max_pts = max(max_pts, frame.pts) + if frame.pts < video_start_pts: + continue + if frame.pts <= video_end_pts: + tmp_frames[frame.pts] = frame + else: + buffer_count += 1 + tmp_frames[frame.pts] = frame + if buffer_count >= 0: + break + video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)] + + container.close() + + frames = [frame.to_rgb().to_ndarray() for frame in video_frames] + clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps + + start_idx, end_idx = get_start_end_idx( + len(frames), # frame_len + clip_sz, + clip_idx if decode_all_video else + 0, # If decode all video, -1 in train and valid, 0 in test; + # else, always 0 in train, valid and test, as we has selected clip size frames when decode. 
+ 1) + results['frames'] = frames + results['frames_len'] = len(frames) + results['start_idx'] = start_idx + results['end_idx'] = end_idx + else: + raise NotImplementedError + return results + + +@PIPELINES.register() +class FrameDecoder(object): + """just parse results + """ + def __init__(self): + pass + + def __call__(self, results): + results['format'] = 'frame' + return results + + +@PIPELINES.register() +class MRIDecoder(object): + """just parse results + """ + def __init__(self): + pass + + def __call__(self, results): + results['format'] = 'MRI' + return results + + +@PIPELINES.register() +class FeatureDecoder(object): + """ + Perform feature decode operations.e.g.youtube8m + """ + def __init__(self, num_classes, max_len=512, has_label=True): + self.max_len = max_len + self.num_classes = num_classes + self.has_label = has_label + + def __call__(self, results): + """ + Perform feature decode operations. + return: + List where each item is a numpy array after decoder. + """ + #1. load pkl + #2. parse to rgb/audio/ + #3. padding + + filepath = results['filename'] + data = pickle.load(open(filepath, 'rb'), encoding='bytes') + + record = data + nframes = record['nframes'] if 'nframes' in record else record[ + b'nframes'] + rgb = record['feature'].astype( + float) if 'feature' in record else record[b'feature'].astype(float) + audio = record['audio'].astype( + float) if 'audio' in record else record[b'audio'].astype(float) + if self.has_label: + label = record['label'] if 'label' in record else record[b'label'] + one_hot_label = self.make_one_hot(label, self.num_classes) + + rgb = rgb[0:nframes, :] + audio = audio[0:nframes, :] + + rgb = self.dequantize(rgb, + max_quantized_value=2., + min_quantized_value=-2.) + audio = self.dequantize(audio, + max_quantized_value=2, + min_quantized_value=-2) + + if self.has_label: + results['labels'] = one_hot_label.astype("float32") + + feat_pad_list = [] + feat_len_list = [] + mask_list = [] + vitem = [rgb, audio] + for vi in range(2): #rgb and audio + if vi == 0: + prefix = "rgb_" + else: + prefix = "audio_" + feat = vitem[vi] + results[prefix + 'len'] = feat.shape[0] + #feat pad step 1. padding + feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + results[prefix + 'data'] = feat_pad.astype("float32") + #feat pad step 2. 
mask + feat_mask_origin = np.ones(feat.shape, dtype=np.float32) + feat_mask_add = feat_add + feat_mask = np.concatenate((feat_mask_origin, feat_mask_add), + axis=0) + results[prefix + 'mask'] = feat_mask.astype("float32") + + return results + + def dequantize(self, + feat_vector, + max_quantized_value=2., + min_quantized_value=-2.): + """ + Dequantize the feature from the byte format to the float format + """ + + assert max_quantized_value > min_quantized_value + quantized_range = max_quantized_value - min_quantized_value + scalar = quantized_range / 255.0 + bias = (quantized_range / 512.0) + min_quantized_value + + return feat_vector * scalar + bias + + def make_one_hot(self, label, dim=3862): + one_hot_label = np.zeros(dim) + one_hot_label = one_hot_label.astype(float) + for ind in label: + one_hot_label[int(ind)] = 1 + return one_hot_label + + +@PIPELINES.register() +class ActionFeatureDecoder(object): + """ + Perform feature decode operations on footballaction + """ + def __init__(self, num_classes, max_len=512, has_label=True): + self.max_len = max_len + self.num_classes = num_classes + self.has_label = has_label + + def __call__(self, results): + """ + Perform feature decode operations. + return: + List where each item is a numpy array after decoder. + """ + #1. load pkl + #2. parse to rgb/audio/ + #3. padding + + filepath = results['filename'] + data = pickle.load(open(filepath, 'rb'), encoding='bytes') + + pkl_data = data + rgb = pkl_data['image_feature'].astype(float) + audio = pkl_data['audio_feature'].astype(float) + label_id_info = pkl_data['label_info'] + label_cls = [label_id_info['label']] + label_one = int(label_cls[0]) + if len(label_cls) > 1: + label_index = random.randint(0, 1) + label_one = int(label_cls[label_index]) + iou_norm = float(label_id_info['norm_iou']) + results['labels'] = np.array([label_one]) + results['iou_norm'] = float(iou_norm) + + vitem = [rgb, audio] + for vi in range(2): #rgb and audio + if vi == 0: + prefix = "rgb_" + else: + prefix = "audio_" + feat = vitem[vi] + results[prefix + 'len'] = feat.shape[0] + #feat pad step 1. padding + feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + results[prefix + 'data'] = feat_pad.astype("float32") + #feat pad step 2. mask + feat_mask_origin = np.ones(feat.shape, dtype=np.float32) + feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0) + results[prefix + 'mask'] = feat_mask.astype("float32") + + return results diff --git a/docs/src/paddlevideo/loader/pipelines/decode_image.py b/docs/src/paddlevideo/loader/pipelines/decode_image.py new file mode 100644 index 000000000..64a7e2fc1 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/decode_image.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import numpy as np +import PIL.Image as pil + +try: + import skimage.transform +except ImportError as e: + print( + f"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS." + ) +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class ImageDecoder(object): + """Decode Image + """ + def __init__(self, + dataset, + frame_idxs, + num_scales, + side_map, + full_res_shape, + img_ext, + backend='cv2'): + self.backend = backend + self.dataset = dataset + self.frame_idxs = frame_idxs + self.num_scales = num_scales + self.side_map = side_map + self.full_res_shape = full_res_shape + self.img_ext = img_ext + + def _pil_loader(self, path): + with open(path, 'rb') as f: + with Image.open(f) as img: + return img.convert('RGB') + + def get_color(self, folder, frame_index, side): + color = self._pil_loader( + self.get_image_path(self.dataset, folder, frame_index, side)) + return color + + def get_image_path(self, dataset, folder, frame_index, side): + if dataset == "kitti": + f_str = "{:010d}{}".format(frame_index, self.img_ext) + image_path = os.path.join(self.data_path, folder, f_str) + elif dataset == "kitti_odom": + f_str = "{:06d}{}".format(frame_index, self.img_ext) + image_path = os.path.join(self.data_path, + "sequences/{:02d}".format(int(folder)), + "image_{}".format(self.side_map[side]), + f_str) + elif dataset == "kitti_depth": + f_str = "{:010d}{}".format(frame_index, self.img_ext) + image_path = os.path.join( + self.data_path, folder, + "image_0{}/data".format(self.side_map[side]), f_str) + + return image_path + + def get_depth(self, dataset, folder, frame_index, side): + if dataset == "kitii_depth": + f_str = "{:010d}.png".format(frame_index) + depth_path = os.path.join( + self.data_path, folder, + "proj_depth/groundtruth/image_0{}".format(self.side_map[side]), + f_str) + + depth_gt = pil.open(depth_path) + depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST) + depth_gt = np.array(depth_gt).astype(np.float32) / 256 + + else: + f_str = "{:010d}{}".format(frame_index, self.img_ext) + depth_path = os.path.join(self.data_path, folder + '_gt', f_str) + + img_file = Image.open(depth_path) + depth_png = np.array(img_file, dtype=int) + img_file.close() + # make sure we have a proper 16bit depth map here.. not 8bit! + assert np.max(depth_png) > 255, \ + "np.max(depth_png)={}, path={}".format(np.max(depth_png), depth_path) + + depth_gt = depth_png.astype(np.float) / 256. + + depth_gt = depth_gt[160:960 - 160, :] + + depth_gt = skimage.transform.resize(depth_gt, + self.full_res_shape[::-1], + order=0, + preserve_range=True, + mode='constant') + + return depth_gt + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. 
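+ Note (added description): in 'infer' mode only the single RGB frame from
+ results['filename'] is loaded; otherwise the neighbouring frames listed in
+ frame_idxs (and, outside training, the ground-truth depth map) are gathered.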
+ """ + if results.get('mode', None) == 'infer': + imgs = {} + imgs[("color", 0, + -1)] = Image.open(results["filename"]).convert("RGB") + results['imgs'] = imgs + return results + + self.data_path = results['data_path'] + results['backend'] = self.backend + + imgs = {} + + results['frame_idxs'] = self.frame_idxs + results['num_scales'] = self.num_scales + + file_name = results['filename'] + folder = results['folder'] + frame_index = results['frame_index'] + line = file_name.split('/') + istrain = folder.split('_')[1] + if 'mode' not in results: + results['mode'] = istrain + results['day_or_night'] = folder.split('_')[0] + + if istrain == "train": + if folder[0] == 'd': + folder2 = folder + '_fake_night' + flag = 0 + else: + folder2 = folder + '_fake_day' + tmp = folder + folder = folder2 + folder2 = tmp + flag = 1 + + if len(line) == 3: + side = line[2] + else: + side = None + + results['side'] = side + + for i in self.frame_idxs: + + if i == "s": + other_side = {"r": "l", "l": "r"}[side] + imgs[("color", i, + -1)] = self.get_color(folder, frame_index, other_side) + imgs[("color_n", i, + -1)] = self.get_color(folder2, frame_index, + other_side) + else: + imgs[("color", i, + -1)] = self.get_color(folder, frame_index + i, side) + imgs[("color_n", i, + -1)] = self.get_color(folder2, frame_index + i, side) + + istrain = folder.split('_')[1] + if istrain != 'train': + if flag: + depth_gt = self.get_depth(folder2, frame_index, side) + else: + depth_gt = self.get_depth(folder, frame_index, side) + imgs["depth_gt"] = np.expand_dims(depth_gt, 0) + elif istrain == 'val': + if len(line) == 3: + side = line[2] + else: + side = None + + for i in self.frame_idxs: + if i == "s": + other_side = {"r": "l", "l": "r"}[side] + imgs[("color", i, + -1)] = self.get_color(folder, frame_index, other_side) + else: + + imgs[("color", i, + -1)] = self.get_color(folder, frame_index + i, side) + + # adjusting intrinsics to match each scale in the pyramid + + depth_gt = self.get_depth(self.dataset, folder, frame_index, side) + imgs["depth_gt"] = np.expand_dims(depth_gt, 0) + results['imgs'] = imgs + + return results diff --git a/docs/src/paddlevideo/loader/pipelines/decode_sampler.py b/docs/src/paddlevideo/loader/pipelines/decode_sampler.py new file mode 100644 index 000000000..2f8f8743d --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/decode_sampler.py @@ -0,0 +1,93 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +from PIL import Image +import decord as de +from ..registry import PIPELINES + + +@PIPELINES.register() +class DecodeSampler(object): + """ + We use 'decord' for decode and sampling, which is faster than opencv. + This is used in slowfast model. + Args: + num_frames(int): the number of frames we want to sample. + sampling_rate(int): sampling rate for video data. + target_fps(int): desired fps, default 30 + test_mode(bool): whether test or train/valid. In slowfast, we use multicrop when test. 
+ """ + def __init__(self, + num_frames, + sampling_rate, + default_sampling_rate=2, + target_fps=30, + test_mode=False): + self.num_frames = num_frames + self.orig_sampling_rate = self.sampling_rate = sampling_rate + self.default_sampling_rate = default_sampling_rate + self.target_fps = target_fps + self.test_mode = test_mode + + def get_start_end_idx(self, video_size, clip_size, clip_idx, + temporal_num_clips): + delta = max(video_size - clip_size, 0) + if not self.test_mode: + # Random temporal sampling. + start_idx = random.uniform(0, delta) + else: + # Uniformly sample the clip with the given index. + start_idx = delta * clip_idx / temporal_num_clips + end_idx = start_idx + clip_size - 1 + return start_idx, end_idx + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. + """ + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx: + self.sampling_rate = random.randint(self.default_sampling_rate, + self.orig_sampling_rate) + + filepath = results['filename'] + temporal_sample_index = results['temporal_sample_index'] + temporal_num_clips = results['temporal_num_clips'] + + vr = de.VideoReader(filepath) + videolen = len(vr) + + fps = vr.get_avg_fps() + clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps + + start_idx, end_idx = self.get_start_end_idx(videolen, clip_size, + temporal_sample_index, + temporal_num_clips) + index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64") + index = np.clip(index, 0, videolen) + + frames_select = vr.get_batch(index) #1 for buffer + + # dearray_to_img + np_frames = frames_select.asnumpy() + frames_select_list = [] + for i in range(np_frames.shape[0]): + imgbuf = np_frames[i] + frames_select_list.append(Image.fromarray(imgbuf, mode='RGB')) + results['imgs'] = frames_select_list + return results diff --git a/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py b/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py new file mode 100644 index 000000000..08d1dd061 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/decode_sampler_MRI.py @@ -0,0 +1,224 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random + +import numpy as np +from PIL import Image +try: + import SimpleITK as sitk +except ImportError as e: + print( + f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care." + ) +import cv2 + +from ..registry import PIPELINES + + +@PIPELINES.register() +class SFMRI_DecodeSampler(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + valid_mode(bool): True or False. + select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode. + Returns: + frames_idx: the index of sampled #frames. 
+ """ + def __init__(self, + num_seg, + seg_len, + valid_mode=False, + select_left=False, + dense_sample=False, + linspace_sample=False): + self.num_seg = num_seg + self.seg_len = seg_len + self.valid_mode = valid_mode + self.select_left = select_left + self.dense_sample = dense_sample + self.linspace_sample = linspace_sample + + def _get(self, frames_idx_s, frames_idx_f, results): + + frame_dir = results['frame_dir'] + imgs_s = [] + imgs_f = [] + MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir)) + for idx in frames_idx_s: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs_s.append(item) + + for idx in frames_idx_f: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs_f.append(item) + + results['imgs'] = [imgs_s, imgs_f] + return results + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. + """ + frames_len = int(results['frames_len']) + average_dur1 = int(frames_len / self.num_seg[0]) + average_dur2 = int(frames_len / self.num_seg[1]) + frames_idx_s = [] + frames_idx_f = [] + if self.linspace_sample: + if 'start_idx' in results and 'end_idx' in results: + offsets_s = np.linspace(results['start_idx'], + results['end_idx'], self.num_seg[0]) + offsets_f = np.linspace(results['start_idx'], + results['end_idx'], self.num_seg[1]) + else: + offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0]) + offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1]) + offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64) + offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64) + + frames_idx_s = list(offsets_s) + frames_idx_f = list(offsets_f) + + return self._get(frames_idx_s, frames_idx_f, results) + + if not self.select_left: + if self.dense_sample: # For ppTSM + if not self.valid_mode: # train + sample_pos = max(1, 1 + frames_len - 64) + t_stride1 = 64 // self.num_seg[0] + t_stride2 = 64 // self.num_seg[1] + start_idx = 0 if sample_pos == 1 else np.random.randint( + 0, sample_pos - 1) + offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[0])] + offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[1])] + frames_idx_s = offsets_s + frames_idx_f = offsets_f + else: + sample_pos = max(1, 1 + frames_len - 64) + t_stride1 = 64 // self.num_seg[0] + t_stride2 = 64 // self.num_seg[1] + start_list = np.linspace(0, + sample_pos - 1, + num=10, + dtype=int) + offsets_s = [] + offsets_f = [] + for start_idx in start_list.tolist(): + offsets_s += [ + (idx * t_stride1 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[0]) + ] + for start_idx in start_list.tolist(): + offsets_f += [ + (idx * t_stride2 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[1]) + ] + frames_idx_s = offsets_s + frames_idx_f = offsets_f + else: + for i in range(self.num_seg[0]): + idx = 0 + if not self.valid_mode: + if average_dur1 >= self.seg_len: + idx = random.randint(0, average_dur1 - self.seg_len) + idx += i * average_dur1 + elif average_dur1 >= 1: + idx += i * average_dur1 + else: + idx = i + else: + if average_dur1 >= self.seg_len: + idx = (average_dur1 - 1) // 2 + idx += i * average_dur1 + elif average_dur1 >= 1: + idx += i * average_dur1 + else: + idx = i + for jj in range(idx, idx + self.seg_len): + frames_idx_s.append(jj) + + for i in range(self.num_seg[1]): + idx = 0 + if not self.valid_mode: + if average_dur2 >= self.seg_len: + idx = random.randint(0, average_dur2 - self.seg_len) + idx += i * average_dur2 + elif 
average_dur2 >= 1: + idx += i * average_dur2 + else: + idx = i + else: + if average_dur2 >= self.seg_len: + idx = (average_dur2 - 1) // 2 + idx += i * average_dur2 + elif average_dur2 >= 1: + idx += i * average_dur2 + else: + idx = i + for jj in range(idx, idx + self.seg_len): + frames_idx_f.append(jj) + + return self._get(frames_idx_s, frames_idx_f, results) + + else: # for TSM + if not self.valid_mode: + if average_dur2 > 0: + offsets_s = np.multiply(list(range( + self.num_seg[0])), average_dur1) + np.random.randint( + average_dur1, size=self.num_seg[0]) + + offsets_f = np.multiply(list(range( + self.num_seg[1])), average_dur2) + np.random.randint( + average_dur2, size=self.num_seg[1]) + elif frames_len > self.num_seg[1]: + offsets_s = np.sort( + np.random.randint(frames_len, size=self.num_seg[0])) + offsets_f = np.sort( + np.random.randint(frames_len, size=self.num_seg[1])) + else: + offsets_s = np.zeros(shape=(self.num_seg[0], )) + offsets_f = np.zeros(shape=(self.num_seg[1], )) + else: + if frames_len > self.num_seg[1]: + average_dur_float_s = frames_len / self.num_seg[0] + offsets_s = np.array([ + int(average_dur_float_s / 2.0 + average_dur_float_s * x) + for x in range(self.num_seg[0]) + ]) + average_dur_float_f = frames_len / self.num_seg[1] + offsets_f = np.array([ + int(average_dur_float_f / 2.0 + average_dur_float_f * x) + for x in range(self.num_seg[1]) + ]) + else: + offsets_s = np.zeros(shape=(self.num_seg[0], )) + offsets_f = np.zeros(shape=(self.num_seg[1], )) + + frames_idx_s = list(offsets_s) + frames_idx_f = list(offsets_f) + + return self._get(frames_idx_s, frames_idx_f, results) diff --git a/docs/src/paddlevideo/loader/pipelines/mix.py b/docs/src/paddlevideo/loader/pipelines/mix.py new file mode 100644 index 000000000..ccc5f98cf --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/mix.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..registry import PIPELINES + + +@PIPELINES.register() +class Mixup(object): + """ + Mixup operator. + Args: + alpha(float): alpha value. + """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + lams = np.array([lam] * bs, dtype=np.float32) + imgs = lam * imgs + (1 - lam) * imgs[idx] + return list(zip(imgs, labels, labels[idx], lams)) + + +@PIPELINES.register() +class Cutmix(object): + """ Cutmix operator + Args: + alpha(float): alpha value. + """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def rand_bbox(self, size, lam): + """ rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam) + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + + bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam) + imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2] + lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) / + (imgs.shape[-2] * imgs.shape[-1])) + lams = np.array([lam] * bs, dtype=np.float32) + + return list(zip(imgs, labels, labels[idx], lams)) + + +@PIPELINES.register() +class VideoMix(object): + """ + VideoMix operator. + Args: + cutmix_prob(float): prob choose cutmix + mixup_alpha(float): alpha for mixup aug + cutmix_alpha(float): alpha for cutmix aug + """ + def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0): + assert cutmix_prob > 0., \ + 'parameter cutmix_prob[%f] should > 0.0' % (cutmix_prob) + assert mixup_alpha > 0., \ + 'parameter mixup_alpha[%f] should > 0.0' % (mixup_alpha) + assert cutmix_alpha > 0., \ + 'parameter cutmix_alpha[%f] should > 0.0' % (cutmix_alpha) + self.cutmix_prob = cutmix_prob + self.mixup = Mixup(mixup_alpha) + self.cutmix = Cutmix(cutmix_alpha) + + def __call__(self, batch): + if np.random.random() < self.cutmix_prob: + return self.cutmix(batch) + else: + return self.mixup(batch) diff --git a/docs/src/paddlevideo/loader/pipelines/multimodal.py b/docs/src/paddlevideo/loader/pipelines/multimodal.py new file mode 100644 index 000000000..d7c508c78 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/multimodal.py @@ -0,0 +1,380 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +from PIL import Image +import decord as de +import copy +import json +from ..registry import PIPELINES + +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) + + +@PIPELINES.register() +class FeaturePadding(object): + """ + Padding feature to target shape. + """ + def __init__(self, max_region_num=36, max_action_num=5): + self.max_region_num = max_region_num + self.max_action_num = max_action_num + + def __call__(self, results): + """ + Padding feature. 
+ """ + pack_feature = results['feature'] + tokenizer = results['tokenizer'] + image_feature_wp, image_target_wp, image_location_wp, \ + num_boxes, image_h, image_w, image_id, caption, \ + action_feature_wp, action_target_wp, num_actions = pack_feature + + image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32) + image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32) + image_location = np.zeros((self.max_region_num, 5), dtype=np.float32) + + action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32) + action_target = np.zeros((self.max_action_num, ), dtype=np.int64) + + num_boxes = int(num_boxes) + image_feature[:num_boxes] = image_feature_wp + image_target[:num_boxes] = image_target_wp + image_location[:num_boxes, :4] = image_location_wp + + image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * ( + image_location[:, 2] - image_location[:, 0]) / (float(image_w) * + float(image_h)) + + image_location[:, 0] = image_location[:, 0] / float(image_w) + image_location[:, 1] = image_location[:, 1] / float(image_h) + image_location[:, 2] = image_location[:, 2] / float(image_w) + image_location[:, 3] = image_location[:, 3] / float(image_h) + + image_feature = copy.deepcopy(image_feature) + image_target = copy.deepcopy(image_target) + + num_actions = int(num_actions) + action_feature[:num_actions] = action_feature_wp + action_target[:num_actions] = action_target_wp + action_feature = copy.deepcopy(action_feature) + action_target = copy.deepcopy(action_target) + + results = dict(image_feat=image_feature, + image_target=image_target, + caption=caption, + image_loc=image_location, + num_boxes=int(num_boxes), + action_feat=action_feature, + action_target=action_target, + num_actions=int(num_actions), + tokenizer=tokenizer) + return results + + +@PIPELINES.register() +class RandomCap(object): + def __init__(self, caption_path): + """ + Random Caption for NSP task + """ + self.caption_path = caption_path + + def select_caption(self, caption): + captions = caption.split('!') + rind = random.randint(0, len(captions) - 1) + caption = captions[rind] + return caption + + def get_random_caption(self, all_captions): + num_caps = len(all_captions) + rand_doc_idx = random.randint(0, num_caps - 1) + caption = all_captions[rand_doc_idx] + caption = self.select_caption(caption) + return caption + + def random_cap(self, caption, all_captions): + if random.random() > 0.5: + label = 0 + else: + caption = self.get_random_caption(all_captions) + label = 1 + return caption, label + + def __call__(self, results): + caption = results['caption'] + all_captions = list(json.load(open(self.caption_path, 'r'))) + caption = self.select_caption(caption) + caption, label = self.random_cap(caption, all_captions) + results['caption'] = caption + results['is_next'] = label + return results + + +@PIPELINES.register() +class Tokenize(object): + def __init__(self, ): + """ + Tokenize caption + """ + pass + + def __call__(self, results): + caption = results['caption'] + tokenizer = results['tokenizer'] + tokens_caption = tokenizer.tokenize(caption) + results['caption'] = tokens_caption + return results + + +@PIPELINES.register() +class RandomMask(object): + def __init__(self, + max_seq_length=36, + max_action_length=5, + max_region_length=36): + self.max_seq_length = max_seq_length + self.max_action_length = max_action_length + self.max_region_length = max_region_length + + def get_image_global_feature(self, image_feat, image_loc, image_mask): + g_image_feat = np.sum(image_feat, 
axis=0) / np.sum( + image_mask, axis=0, keepdims=True) + image_feat = np.concatenate( + [np.expand_dims(g_image_feat, axis=0), image_feat], + axis=0).astype("float32") + + g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32") + image_loc = np.concatenate( + [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0) + + g_image_mask = np.array([1]) + image_mask = np.concatenate([g_image_mask, image_mask], axis=0) + + return image_feat, image_loc, image_mask + + def _truncate_seq_pair(self, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + This is a simple heuristic which will always truncate the longer sequence + one token at a time. This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_b) + if total_length <= max_length: + break + tokens_b.pop() + + def random_word(self, tokens, tokenizer): + """ + Masking some random tokens for Language Model task with probabilities as in the original BERT paper. + Args: + tokens: list of str, tokenized sentence. + tokenizer: Tokenizer, object used for tokenization (we need it's vocab here) + Return: + (list of str, list of int), masked tokens and related labels for LM prediction + """ + output_label = [] + + for i, token in enumerate(tokens): + prob = random.random() + # mask token with 15% probability + + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.8: + tokens[i] = "[MASK]" + + # 10% randomly change token to random token + elif prob < 0.9: + #tok = random.choice(list(tokenizer.vocab.items()))[0] + tok = tokenizer.vocab.idx_to_token[random.randint( + 0, + tokenizer.vocab_size, + )] + tokens[i] = tok + + # rest 10% randomly keep current token + # append current token to output (we will predict these later) + try: + output_label.append(tokenizer.vocab[token]) + except KeyError: + # For unknown words (should not occur with BPE vocab) + output_label.append(tokenizer.vocab["[UNK]"]) + print( + "Cannot find token '{}' in vocab. Using [UNK] insetad". 
+ format(token)) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return tokens, output_label + + def random_region(self, image_feat, image_loc, num_boxes): + output_label = [] + + for i in range(num_boxes): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.9: + image_feat[i] = 0 + + # rest 20% randomly keep current token + # append current token to output (we will predict these later) + output_label.append(1) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return image_feat, image_loc, output_label + + def random_action(self, action_feat, action_target, num_actions): + output_label = [] + + for i in range(num_actions): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 90% randomly change token to mask token + if prob < 0.9: + action_feat[i] = 0 + + # rest 10% randomly keep current token + # append current token to output (we will predict these later) + output_label.append(action_target[i]) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return action_feat, output_label + + def __call__(self, results): + caption = results['caption'] + tokenizer = results['tokenizer'] + image_feat = results['image_feat'] + image_loc = results['image_loc'] + num_boxes = results['num_boxes'] + action_feat = results['action_feat'] + action_target = results['action_target'] + num_actions = results['num_actions'] + is_next = results['is_next'] + image_target = results['image_target'] + + self._truncate_seq_pair(caption, self.max_seq_length - 2) + caption, caption_label = self.random_word(caption, tokenizer) + + image_feat, image_loc, image_label = self.random_region( + image_feat, image_loc, num_boxes) + action_feat, action_label = self.random_action(action_feat, + action_target, + num_actions) + + # concatenate lm labels and account for CLS, SEP, SEP + lm_label_ids = [-1] + caption_label + [-1] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + + tokens = [] + segment_ids = [] + + tokens.append("[CLS]") + segment_ids.append(0) + + for token in caption: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to. + input_mask = [1] * (len(input_ids)) + image_mask = [1] * (num_boxes) + action_mask = [1] * (num_actions) + + # Zero-pad up to the visual sequence length. 
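# The zero-padding below mirrors the masking convention above: mask entries are
# padded with 0 (padded positions are never attended to), while the matching
# label lists are padded with -1 (the ignore index skipped by the loss).
# E.g. with num_boxes = 3 and max_region_length = 36, image_mask ends up as
# [1, 1, 1, 0, ..., 0] (length 36) and image_label gets 33 trailing -1 entries.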
+ while len(image_mask) < self.max_region_length: + image_mask.append(0) + image_label.append(-1) + while len(action_mask) < self.max_action_length: + action_mask.append(0) + action_label.append(-1) + + # Zero-pad up to the sequence length. + while len(input_ids) < self.max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + lm_label_ids.append(-1) + + assert len(input_ids) == self.max_seq_length + assert len(input_mask) == self.max_seq_length + assert len(segment_ids) == self.max_seq_length + assert len(lm_label_ids) == self.max_seq_length + assert len(image_mask) == self.max_region_length + assert len(image_label) == self.max_region_length + assert len(action_mask) == self.max_action_length + assert len(action_label) == self.max_action_length + + image_feat, image_loc, image_mask = self.get_image_global_feature( + image_feat, image_loc, np.array(image_mask)) + features = [ + np.array(input_ids), + action_feat, + image_feat, + image_loc, + np.array(segment_ids), + np.array(input_mask), + image_mask, + np.array(action_mask), + np.array(lm_label_ids), + np.array(action_label), + np.array(is_next), + np.array(image_label), + image_target, + ] + results['features'] = features + return results diff --git a/docs/src/paddlevideo/loader/pipelines/sample.py b/docs/src/paddlevideo/loader/pipelines/sample.py new file mode 100644 index 000000000..0a1d068a7 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/sample.py @@ -0,0 +1,382 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random + +import numpy as np +from PIL import Image +try: + import SimpleITK as sitk +except ImportError as e: + print( + f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care." + ) +import cv2 + +from ..registry import PIPELINES + +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO + + +@PIPELINES.register() +class Sampler(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + valid_mode(bool): True or False. + select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode. + Returns: + frames_idx: the index of sampled #frames. 
+ """ + def __init__(self, + num_seg, + seg_len, + frame_interval=None, + valid_mode=False, + select_left=False, + dense_sample=False, + linspace_sample=False, + use_pil=True): + self.num_seg = num_seg + self.seg_len = seg_len + self.frame_interval = frame_interval + self.valid_mode = valid_mode + self.select_left = select_left + self.dense_sample = dense_sample + self.linspace_sample = linspace_sample + self.use_pil = use_pil + + def _get(self, frames_idx, results): + data_format = results['format'] + + if data_format == "frame": + frame_dir = results['frame_dir'] + imgs = [] + for idx in frames_idx: + img = Image.open( + os.path.join(frame_dir, + results['suffix'].format(idx))).convert('RGB') + imgs.append(img) + + elif data_format == "MRI": + frame_dir = results['frame_dir'] + imgs = [] + MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir)) + for idx in frames_idx: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs.append(item) + + elif data_format == "video": + if results['backend'] == 'cv2': + frames = np.array(results['frames']) + imgs = [] + for idx in frames_idx: + imgbuf = frames[idx] + img = Image.fromarray(imgbuf, mode='RGB') + imgs.append(img) + elif results['backend'] == 'decord': + container = results['frames'] + if self.use_pil: + frames_select = container.get_batch(frames_idx) + # dearray_to_img + np_frames = frames_select.asnumpy() + imgs = [] + for i in range(np_frames.shape[0]): + imgbuf = np_frames[i] + imgs.append(Image.fromarray(imgbuf, mode='RGB')) + else: + if frames_idx.ndim != 1: + frames_idx = np.squeeze(frames_idx) + frame_dict = { + idx: container[idx].asnumpy() + for idx in np.unique(frames_idx) + } + imgs = [frame_dict[idx] for idx in frames_idx] + elif results['backend'] == 'pyav': + imgs = [] + frames = np.array(results['frames']) + for idx in frames_idx: + if self.dense_sample: + idx = idx - 1 + imgbuf = frames[idx] + imgs.append(imgbuf) + imgs = np.stack(imgs) # thwc + else: + raise NotImplementedError + else: + raise NotImplementedError + results['imgs'] = imgs + return results + + def _get_train_clips(self, num_frames): + ori_seg_len = self.seg_len * self.frame_interval + avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg + + if avg_interval > 0: + base_offsets = np.arange(self.num_seg) * avg_interval + clip_offsets = base_offsets + np.random.randint(avg_interval, + size=self.num_seg) + elif num_frames > max(self.num_seg, ori_seg_len): + clip_offsets = np.sort( + np.random.randint(num_frames - ori_seg_len + 1, + size=self.num_seg)) + elif avg_interval == 0: + ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg + clip_offsets = np.around(np.arange(self.num_seg) * ratio) + else: + clip_offsets = np.zeros((self.num_seg, ), dtype=np.int) + return clip_offsets + + def _get_test_clips(self, num_frames): + ori_seg_len = self.seg_len * self.frame_interval + avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg) + if num_frames > ori_seg_len - 1: + base_offsets = np.arange(self.num_seg) * avg_interval + clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + else: + clip_offsets = np.zeros((self.num_seg, ), dtype=np.int) + return clip_offsets + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. 
+ """ + frames_len = int(results['frames_len']) + frames_idx = [] + if self.frame_interval is not None: + assert isinstance(self.frame_interval, int) + if not self.valid_mode: + offsets = self._get_train_clips(frames_len) + else: + offsets = self._get_test_clips(frames_len) + + offsets = offsets[:, None] + np.arange( + self.seg_len)[None, :] * self.frame_interval + offsets = np.concatenate(offsets) + + offsets = offsets.reshape((-1, self.seg_len)) + offsets = np.mod(offsets, frames_len) + offsets = np.concatenate(offsets) + + if results['format'] == 'video': + frames_idx = offsets + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + else: + raise NotImplementedError + + return self._get(frames_idx, results) + + if self.linspace_sample: + if 'start_idx' in results and 'end_idx' in results: + offsets = np.linspace(results['start_idx'], results['end_idx'], + self.num_seg) + else: + offsets = np.linspace(0, frames_len - 1, self.num_seg) + offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64) + if results['format'] == 'video': + frames_idx = list(offsets) + frames_idx = [x % frames_len for x in frames_idx] + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + + elif results['format'] == 'MRI': + frames_idx = list(offsets) + + else: + raise NotImplementedError + return self._get(frames_idx, results) + + average_dur = int(frames_len / self.num_seg) + if not self.select_left: + if self.dense_sample: # For ppTSM + if not self.valid_mode: # train + sample_pos = max(1, 1 + frames_len - 64) + t_stride = 64 // self.num_seg + start_idx = 0 if sample_pos == 1 else np.random.randint( + 0, sample_pos - 1) + offsets = [(idx * t_stride + start_idx) % frames_len + 1 + for idx in range(self.num_seg)] + frames_idx = offsets + else: + sample_pos = max(1, 1 + frames_len - 64) + t_stride = 64 // self.num_seg + start_list = np.linspace(0, + sample_pos - 1, + num=10, + dtype=int) + offsets = [] + for start_idx in start_list.tolist(): + offsets += [ + (idx * t_stride + start_idx) % frames_len + 1 + for idx in range(self.num_seg) + ] + frames_idx = offsets + else: + for i in range(self.num_seg): + idx = 0 + if not self.valid_mode: + if average_dur >= self.seg_len: + idx = random.randint(0, average_dur - self.seg_len) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= self.seg_len: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + for jj in range(idx, idx + self.seg_len): + if results['format'] == 'video': + frames_idx.append(int(jj % frames_len)) + elif results['format'] == 'frame': + frames_idx.append(jj + 1) + + elif results['format'] == 'MRI': + frames_idx.append(jj) + else: + raise NotImplementedError + return self._get(frames_idx, results) + + else: # for TSM + if not self.valid_mode: + if average_dur > 0: + offsets = np.multiply(list(range(self.num_seg)), + average_dur) + np.random.randint( + average_dur, size=self.num_seg) + elif frames_len > self.num_seg: + offsets = np.sort( + np.random.randint(frames_len, size=self.num_seg)) + else: + offsets = np.zeros(shape=(self.num_seg, )) + else: + if frames_len > self.num_seg: + average_dur_float = frames_len / self.num_seg + offsets = np.array([ + int(average_dur_float / 2.0 + average_dur_float * x) + for x in range(self.num_seg) + ]) + else: + offsets = np.zeros(shape=(self.num_seg, )) + + if results['format'] == 'video': + frames_idx = list(offsets) + frames_idx = [x % 
frames_len for x in frames_idx] + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + + elif results['format'] == 'MRI': + frames_idx = list(offsets) + + else: + raise NotImplementedError + + return self._get(frames_idx, results) + + +@PIPELINES.register() +class SamplerPkl(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + mode(str): 'train', 'valid' + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False): + self.num_seg = num_seg + self.seg_len = seg_len + self.valid_mode = valid_mode + self.backend = backend + + def _get(self, buf): + if isinstance(buf, str): + img = Image.open(StringIO(buf)) + else: + img = Image.open(BytesIO(buf)) + img = img.convert('RGB') + if self.backend != 'pillow': + img = np.array(img) + return img + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. + """ + filename = results['frame_dir'] + data_loaded = pickle.load(open(filename, 'rb'), encoding='bytes') + video_name, label, frames = data_loaded + if isinstance(label, dict): + label = label['动作类型'] + results['labels'] = label + elif len(label) == 1: + results['labels'] = int(label[0]) + else: + results['labels'] = int(label[0]) if random.random() < 0.5 else int( + label[1]) + results['frames_len'] = len(frames) + frames_len = results['frames_len'] + average_dur = int(int(frames_len) / self.num_seg) + imgs = [] + for i in range(self.num_seg): + idx = 0 + if not self.valid_mode: + if average_dur >= self.seg_len: + idx = random.randint(0, average_dur - self.seg_len) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= self.seg_len: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + + for jj in range(idx, idx + self.seg_len): + imgbuf = frames[int(jj % results['frames_len'])] + img = self._get(imgbuf) + imgs.append(img) + results['backend'] = self.backend + results['imgs'] = imgs + + return results diff --git a/docs/src/paddlevideo/loader/pipelines/sample_ava.py b/docs/src/paddlevideo/loader/pipelines/sample_ava.py new file mode 100644 index 000000000..39e90a216 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/sample_ava.py @@ -0,0 +1,375 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
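The segment-based samplers above (Sampler and SamplerPkl) share one core computation: split the frame range into num_seg equal chunks and take seg_len consecutive frames from each chunk, at a random position per chunk during training and near the chunk centre in valid mode. A condensed, self-contained sketch of that index selection (illustrative only; the classes above add format handling, dense sampling and other branches on top of it):

    import random

    def segment_indices(frames_len, num_seg, seg_len, valid_mode=False):
        average_dur = frames_len // num_seg
        idxs = []
        for i in range(num_seg):
            if average_dur >= seg_len:
                # random start inside the segment (train) or its middle (valid)
                idx = (average_dur - 1) // 2 if valid_mode \
                    else random.randint(0, average_dur - seg_len)
                idx += i * average_dur
            elif average_dur >= 1:
                idx = i * average_dur
            else:
                idx = i
            idxs.extend(range(idx, idx + seg_len))
        return idxs

    # e.g. segment_indices(300, num_seg=8, seg_len=1, valid_mode=True)
    # -> [18, 55, 92, 129, 166, 203, 240, 277]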
+import random +from PIL import Image +from ..registry import PIPELINES +import os +import numpy as np +import io +import os.path as osp +from abc import ABCMeta, abstractmethod +import cv2 +from cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED +import inspect + +imread_backend = 'cv2' +imread_flags = { + 'color': IMREAD_COLOR, + 'grayscale': IMREAD_GRAYSCALE, + 'unchanged': IMREAD_UNCHANGED +} + + +@PIPELINES.register() +class SampleFrames: + """Sample frames from the video. """ + + def __init__(self, + clip_len, + frame_interval=1, + num_clips=1, + temporal_jitter=False, + twice_sample=False, + out_of_bound_opt='loop', + test_mode=False): + self.clip_len = clip_len + self.frame_interval = frame_interval + self.num_clips = num_clips + self.temporal_jitter = temporal_jitter + self.twice_sample = twice_sample + self.out_of_bound_opt = out_of_bound_opt + self.test_mode = test_mode + assert self.out_of_bound_opt in ['loop', 'repeat_last'] + + def _get_train_clips(self, num_frames): + """Get clip offsets in train mode. """ + ori_clip_len = self.clip_len * self.frame_interval + avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips + if avg_interval > 0: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = base_offsets + np.random.randint( + avg_interval, size=self.num_clips) + elif num_frames > max(self.num_clips, ori_clip_len): + clip_offsets = np.sort( + np.random.randint( + num_frames - ori_clip_len + 1, size=self.num_clips)) + elif avg_interval == 0: + ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips + clip_offsets = np.around(np.arange(self.num_clips) * ratio) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int) + return clip_offsets + + def _get_test_clips(self, num_frames): + """Get clip offsets in test mode. """ + ori_clip_len = self.clip_len * self.frame_interval + avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips) + if num_frames > ori_clip_len - 1: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + if self.twice_sample: + clip_offsets = np.concatenate([clip_offsets, base_offsets]) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int) + return clip_offsets + + def _sample_clips(self, num_frames): + """Choose clip offsets for the video in a given mode. """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames) + else: + clip_offsets = self._get_train_clips(num_frames) + return clip_offsets + + def __call__(self, results): + """Perform the SampleFrames loading. 
""" + total_frames = results['total_frames'] + clip_offsets = self._sample_clips(total_frames) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'twice_sample={self.twice_sample}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. """ + + @abstractmethod + def get(self, filepath): + pass + + @abstractmethod + def get_text(self, filepath): + pass + +class HardDiskBackend(BaseStorageBackend): + """Raw hard disks storage backend.""" + + def get(self, filepath): + filepath = str(filepath) + with open(filepath, 'rb') as f: + value_buf = f.read() + return value_buf + + def get_text(self, filepath): + filepath = str(filepath) + with open(filepath, 'r') as f: + value_buf = f.read() + return value_buf + +class FileClient: + """A general file client to access files in different backend. """ + + _backends = { + 'disk': HardDiskBackend, + } + + def __init__(self, backend='disk', **kwargs): + if backend not in self._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(self._backends.keys())}') + self.backend = backend + self.client = self._backends[backend](**kwargs) + + @classmethod + def _register_backend(cls, name, backend, force=False): + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + if not inspect.isclass(backend): + raise TypeError( + f'backend should be a class but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + if not force and name in cls._backends: + raise KeyError( + f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + + cls._backends[name] = backend + + @classmethod + def register_backend(cls, name, backend=None, force=False): + """Register a backend to FileClient. 
""" + + if backend is not None: + cls._register_backend(name, backend, force=force) + return + + def _register(backend_cls): + cls._register_backend(name, backend_cls, force=force) + return backend_cls + + return _register + + def get(self, filepath): + return self.client.get(filepath) + + def get_text(self, filepath): + return self.client.get_text(filepath) + +@PIPELINES.register() +class RawFrameDecode: + """Load and decode frames with given indices. """ + + def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs): + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def _pillow2array(self,img, flag='color', channel_order='bgr'): + """Convert a pillow image to numpy array. """ + + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'unchanged': + array = np.array(img) + if array.ndim >= 3 and array.shape[2] >= 3: # color image + array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR + else: + # If the image mode is not 'RGB', convert it to 'RGB' first. + if img.mode != 'RGB': + if img.mode != 'LA': + # Most formats except 'LA' can be directly converted to RGB + img = img.convert('RGB') + else: + # When the mode is 'LA', the default conversion will fill in + # the canvas with black, which sometimes shadows black objects + # in the foreground. + # + # Therefore, a random color (124, 117, 104) is used for canvas + img_rgba = img.convert('RGBA') + img = Image.new('RGB', img_rgba.size, (124, 117, 104)) + img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha + if flag == 'color': + array = np.array(img) + if channel_order != 'rgb': + array = array[:, :, ::-1] # RGB to BGR + elif flag == 'grayscale': + img = img.convert('L') + array = np.array(img) + else: + raise ValueError( + 'flag must be "color", "grayscale" or "unchanged", ' + f'but got {flag}') + return array + + def _imfrombytes(self,content, flag='color', channel_order='bgr'):#, backend=None): + """Read an image from bytes. """ + + img_np = np.frombuffer(content, np.uint8) + flag = imread_flags[flag] if isinstance(flag, str) else flag + img = cv2.imdecode(img_np, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + + def __call__(self, results): + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + # mmcv.use_backend(self.decoding_backend) + + directory = results['frame_dir'] + suffix = results['suffix'] + #modality = results['modality'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + for frame_idx in results['frame_inds']: + frame_idx += offset + filepath = osp.join(directory, suffix.format(frame_idx)) + img_bytes = self.file_client.get(filepath) #以二进制方式读取图片 + # Get frame with channel order RGB directly. 
+ + cur_frame = self._imfrombytes(img_bytes, channel_order='rgb') + imgs.append(cur_frame) + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + # we resize the gt_bboxes and proposals to their real scale + h, w = results['img_shape'] + scale_factor = np.array([w, h, w, h]) + if 'gt_bboxes' in results: + gt_bboxes = results['gt_bboxes'] + gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32) + results['gt_bboxes'] = gt_bboxes_new + if 'proposals' in results and results['proposals'] is not None: + proposals = results['proposals'] + proposals = (proposals * scale_factor).astype(np.float32) + results['proposals'] = proposals + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'decoding_backend={self.decoding_backend})') + return repr_str + +@PIPELINES.register() +class SampleAVAFrames(SampleFrames): + + def __init__(self, clip_len, frame_interval=2, test_mode=False): + + super().__init__(clip_len, frame_interval, test_mode=test_mode) + + def _get_clips(self, center_index, skip_offsets, shot_info): + start = center_index - (self.clip_len // 2) * self.frame_interval + end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval + frame_inds = list(range(start, end, self.frame_interval)) + frame_inds = frame_inds + skip_offsets + frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1) + + return frame_inds + + def __call__(self, results): + fps = results['fps'] + timestamp = results['timestamp'] + timestamp_start = results['timestamp_start'] + shot_info = results['shot_info'] + + #delta=(timestamp - timestamp_start) 为该帧距离15min视频开头有几秒 + #center_index=fps*delta为该帧距离15min视频开头有几帧 + #center_index+1是为了避免后续采样时出现负数? + #后续需要以center_index为中心前后采样视频帧片段 + center_index = fps * (timestamp - timestamp_start) + 1 + + skip_offsets = np.random.randint( + -self.frame_interval // 2, (self.frame_interval + 1) // 2, + size=self.clip_len) + frame_inds = self._get_clips(center_index, skip_offsets, shot_info) + + results['frame_inds'] = np.array(frame_inds, dtype=np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = 1 + results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'test_mode={self.test_mode})') + return repr_str + diff --git a/docs/src/paddlevideo/loader/pipelines/sample_ucf24.py b/docs/src/paddlevideo/loader/pipelines/sample_ucf24.py new file mode 100644 index 000000000..7d9e90433 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/sample_ucf24.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
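The AVA sampler above centres a fixed-length window on the annotated keyframe: it converts the timestamp into a frame index, walks clip_len steps of frame_interval frames around it, jitters every index by a small random offset, and clips the result to the shot boundaries. A condensed sketch of that computation (illustrative only; names follow SampleAVAFrames above):

    import numpy as np

    def ava_frame_inds(fps, timestamp, timestamp_start, shot_info,
                       clip_len=32, frame_interval=2):
        center = fps * (timestamp - timestamp_start) + 1
        start = center - (clip_len // 2) * frame_interval
        end = center + ((clip_len + 1) // 2) * frame_interval
        inds = np.arange(start, end, frame_interval)
        # per-frame jitter, then keep every index inside the shot
        inds = inds + np.random.randint(-frame_interval // 2,
                                        (frame_interval + 1) // 2,
                                        size=clip_len)
        return np.clip(inds, shot_info[0], shot_info[1] - 1)

With fps=30 and the defaults this covers roughly one second on either side of the keyframe (64 frames in total).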
+ +import os +import random + +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class SamplerUCF24(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_frames(int): The amount of frames used in a video + frame_interval(int): Sampling rate + valid_mode(bool): True or False. + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, + num_frames=16, + frame_interval=1, + valid_mode=False): + self.num_frames = num_frames + self.frame_interval = frame_interval if valid_mode else random.randint(1, 2) + self.valid_mode = valid_mode + + def _get(self, frames_idxs, img_folder, results): + imgs = [] + for idx in frames_idxs: + img = Image.open( + os.path.join(img_folder, '{:05d}.jpg'.format(idx))).convert('RGB') + imgs.append(img) + results['imgs'] = imgs + return results + + def _make_clip(self, im_ind, max_num): + frame_idxs = [] + for i in reversed(range(self.num_frames)): + # make it as a loop + i_temp = im_ind - i * self.frame_interval + if i_temp < 1: + i_temp = 1 + elif i_temp > max_num: + i_temp = max_num + frame_idxs.append(i_temp) + return frame_idxs + + def __call__(self, results): + img_folder, key_frame = os.path.split(results['filename']) + frame_len = len(os.listdir(img_folder)) + key_idx = int(key_frame[0:5]) + frame_idxs = self._make_clip(key_idx, frame_len) + return self._get(frame_idxs, img_folder, results) diff --git a/docs/src/paddlevideo/loader/pipelines/segmentation.py b/docs/src/paddlevideo/loader/pipelines/segmentation.py new file mode 100644 index 000000000..247144267 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/segmentation.py @@ -0,0 +1,130 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from PIL import Image +import copy +import cv2 +from ..registry import PIPELINES + + +@PIPELINES.register() +class MultiRestrictSize(object): + def __init__(self, + min_size=None, + max_size=800, + flip=False, + multi_scale=[1.3]): + self.min_size = min_size + self.max_size = max_size + self.multi_scale = multi_scale + self.flip = flip + assert ((min_size is None)) or ((max_size is None)) + + def __call__(self, sample): + samples = [] + image = sample['current_img'] + h, w = image.shape[:2] + for scale in self.multi_scale: + # Fixed range of scales + sc = None + # Align short edge + if not (self.min_size is None): + if h > w: + short_edge = w + else: + short_edge = h + if short_edge > self.min_size: + sc = float(self.min_size) / short_edge + else: + if h > w: + long_edge = h + else: + long_edge = w + if long_edge > self.max_size: + sc = float(self.max_size) / long_edge + + if sc is None: + new_h = h + new_w = w + else: + new_h = sc * h + new_w = sc * w + new_h = int(new_h * scale) + new_w = int(new_w * scale) + + if (new_h - 1) % 16 != 0: + new_h = int(np.around((new_h - 1) / 16.) * 16 + 1) + if (new_w - 1) % 16 != 0: + new_w = int(np.around((new_w - 1) / 16.) 
* 16 + 1) + + if new_h == h and new_w == w: + samples.append(sample) + else: + new_sample = {} + for elem in sample.keys(): + if 'meta' in elem: + new_sample[elem] = sample[elem] + continue + tmp = sample[elem] + if 'label' in elem: + new_sample[elem] = sample[elem] + continue + else: + flagval = cv2.INTER_CUBIC + tmp = cv2.resize(tmp, + dsize=(new_w, new_h), + interpolation=flagval) + new_sample[elem] = tmp + samples.append(new_sample) + + if self.flip: + now_sample = samples[-1] + new_sample = {} + for elem in now_sample.keys(): + if 'meta' in elem: + new_sample[elem] = now_sample[elem].copy() + new_sample[elem]['flip'] = True + continue + tmp = now_sample[elem] + tmp = tmp[:, ::-1].copy() + new_sample[elem] = tmp + samples.append(new_sample) + + return samples + + +@PIPELINES.register() +class MultiNorm(object): + def __call__(self, samples): + for idx in range(len(samples)): + sample = samples[idx] + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + if tmp is None: + continue + + if tmp.ndim == 2: + tmp = tmp[:, :, np.newaxis] + else: + tmp = tmp / 255. + tmp -= (0.485, 0.456, 0.406) + tmp /= (0.229, 0.224, 0.225) + + tmp = tmp.transpose((2, 0, 1)) + samples[idx][elem] = tmp + + return samples diff --git a/docs/src/paddlevideo/loader/pipelines/segmentation_pipline.py b/docs/src/paddlevideo/loader/pipelines/segmentation_pipline.py new file mode 100644 index 000000000..dda6deec4 --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/segmentation_pipline.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np +import random +import paddle +from ..registry import PIPELINES +""" +pipeline ops for Action Segmentation Dataset. +""" + + +@PIPELINES.register() +class SegmentationSampler(object): + + def __init__(self, sample_rate): + self.sample_rate = sample_rate + + def __call__(self, results): + for key, data in results.items(): + if len(data.shape) == 1: + data = data[::self.sample_rate] + results[key] = copy.deepcopy(data) + else: + data = data[:, ::self.sample_rate] + results[key] = copy.deepcopy(data) + return results diff --git a/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py b/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py new file mode 100644 index 000000000..d31c816bc --- /dev/null +++ b/docs/src/paddlevideo/loader/pipelines/skeleton_pipeline.py @@ -0,0 +1,1554 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import collections +from itertools import repeat +import copy as cp +from collections import abc +import numpy as np +import paddle.nn.functional as F +import random +import paddle +from ..registry import PIPELINES +from .augmentations_ava import iminvert, imflip_ +"""pipeline ops for Activity Net. +""" + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +_single = _ntuple(1) +_pair = _ntuple(2) +_triple = _ntuple(3) +_quadruple = _ntuple(4) + + +def _init_lazy_if_proper(results, lazy): + """Initialize lazy operation properly. + + Make sure that a lazy operation is properly initialized, + and avoid a non-lazy operation accidentally getting mixed in. + + Required keys in results are "imgs" if "img_shape" not in results, + otherwise, Required keys in results are "img_shape", add or modified keys + are "img_shape", "lazy". + Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip", + "flip_direction", "interpolation". + + Args: + results (dict): A dict stores data pipeline result. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + if 'img_shape' not in results: + results['img_shape'] = results['imgs'][0].shape[:2] + if lazy: + if 'lazy' not in results: + img_h, img_w = results['img_shape'] + lazyop = dict() + lazyop['original_shape'] = results['img_shape'] + lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h], + dtype=np.float32) + lazyop['flip'] = False + lazyop['flip_direction'] = None + lazyop['interpolation'] = None + results['lazy'] = lazyop + else: + assert 'lazy' not in results, 'Use Fuse after lazy operations' + + +@PIPELINES.register() +class AutoPadding(object): + """ + Sample or Padding frame skeleton feature. + Args: + window_size: int, temporal size of skeleton feature. + random_pad: bool, whether do random padding when frame length < window size. Default: False. + """ + + def __init__(self, window_size, random_pad=False): + self.window_size = window_size + self.random_pad = random_pad + + def get_frame_num(self, data): + C, T, V, M = data.shape + for i in range(T - 1, -1, -1): + tmp = np.sum(data[:, i, :, :]) + if tmp > 0: + T = i + 1 + break + return T + + def __call__(self, results): + data = results['data'] + + C, T, V, M = data.shape + T = self.get_frame_num(data) + if T == self.window_size: + data_pad = data[:, :self.window_size, :, :] + elif T < self.window_size: + begin = random.randint( + 0, self.window_size - T) if self.random_pad else 0 + data_pad = np.zeros((C, self.window_size, V, M)) + data_pad[:, begin:begin + T, :, :] = data[:, :T, :, :] + else: + if self.random_pad: + index = np.random.choice( + T, self.window_size, replace=False).astype('int64') + else: + index = np.linspace(0, T, self.window_size).astype("int64") + data_pad = data[:, index, :, :] + + results['data'] = data_pad + return results + + +@PIPELINES.register() +class SkeletonNorm(object): + """ + Normalize skeleton feature. + Args: + aixs: dimensions of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default: 2. 
+ """ + + def __init__(self, axis=2, squeeze=False): + self.axis = axis + self.squeeze = squeeze + + def __call__(self, results): + data = results['data'] + + # Centralization + data = data - data[:, :, 8:9, :] + data = data[:self.axis, :, :, :] # get (x,y) from (x,y, acc) + C, T, V, M = data.shape + if self.squeeze: + data = data.reshape((C, T, V)) # M = 1 + + results['data'] = data.astype('float32') + if 'label' in results: + label = results['label'] + results['label'] = np.expand_dims(label, 0).astype('int64') + return results + + +@PIPELINES.register() +class Iden(object): + """ + Wrapper Pipeline + """ + + def __init__(self, label_expand=True): + self.label_expand = label_expand + + def __call__(self, results): + data = results['data'] + results['data'] = data.astype('float32') + + if 'label' in results and self.label_expand: + label = results['label'] + results['label'] = np.expand_dims(label, 0).astype('int64') + return results + + +@PIPELINES.register() +class RandomRotation(object): + """ + Random rotation sketeton. + Args: + argument: bool, if rotation. + theta: float, rotation rate. + """ + + def __init__(self, argument, theta=0.3): + self.theta = theta + self.argument = argument + + def _rot(self, rot): + """ + rot: T,3 + """ + cos_r, sin_r = np.cos(rot), np.sin(rot) # T,3 + zeros = np.zeros((rot.shape[0], 1)) # T,1 + ones = np.ones((rot.shape[0], 1)) # T,1 + + r1 = np.stack((ones, zeros, zeros), axis=-1) # T,1,3 + rx2 = np.stack((zeros, cos_r[:, 0:1], sin_r[:, 0:1]), axis=-1) # T,1,3 + rx3 = np.stack((zeros, -sin_r[:, 0:1], cos_r[:, 0:1]), axis=-1) # T,1,3 + rx = np.concatenate((r1, rx2, rx3), axis=1) # T,3,3 + + ry1 = np.stack((cos_r[:, 1:2], zeros, -sin_r[:, 1:2]), axis=-1) + r2 = np.stack((zeros, ones, zeros), axis=-1) + ry3 = np.stack((sin_r[:, 1:2], zeros, cos_r[:, 1:2]), axis=-1) + ry = np.concatenate((ry1, r2, ry3), axis=1) + + rz1 = np.stack((cos_r[:, 2:3], sin_r[:, 2:3], zeros), axis=-1) + r3 = np.stack((zeros, zeros, ones), axis=-1) + rz2 = np.stack((-sin_r[:, 2:3], cos_r[:, 2:3], zeros), axis=-1) + rz = np.concatenate((rz1, rz2, r3), axis=1) + + rot = np.matmul(np.matmul(rz, ry), rx) + return rot + + def __call__(self, results): + # C,T,V,M + data = results['data'] + if self.argument: + C, T, V, M = data.shape + data_numpy = np.transpose(data, (1, 0, 2, 3)).conjugate().reshape( + T, C, V * M) # T,3,V*M + rot = np.random.uniform(-self.theta, self.theta, 3) + rot = np.stack( + [ + rot, + ] * T, axis=0) + rot = self._rot(rot) # T,3,3 + data_numpy = np.matmul(rot, data_numpy) + data_numpy = data_numpy.reshape(T, C, V, M) + data_numpy = np.transpose(data_numpy, (1, 0, 2, 3)) + data = data_numpy + results['data'] = data.astype(np.float32) + return results + + +@PIPELINES.register() +class SketeonCropSample(object): + """ + Sketeon Crop Sampler. + Args: + crop_model: str, crop model, support: ['center']. + p_interval: list, crop len + window_size: int, sample windows size. 
+ """ + + def __init__(self, window_size, crop_model='center', p_interval=1): + assert crop_model in ['center'], "Don't support :" + crop_model + + self.crop_model = crop_model + self.window_size = window_size + self.p_interval = p_interval + + def __call__(self, results): + if self.crop_model == 'center': + # input: C,T,V,M + data = results['data'] + valid_frame_num = np.sum(data.sum(0).sum(-1).sum(-1) != 0) + + C, T, V, M = data.shape + begin = 0 + end = valid_frame_num + valid_size = end - begin + + #crop + if len(self.p_interval) == 1: + p = self.p_interval[0] + bias = int((1 - p) * valid_size / 2) + data = data[:, begin + bias:end - bias, :, :] # center_crop + cropped_length = data.shape[1] + else: + p = np.random.rand(1) * (self.p_interval[1] - self.p_interval[0] + ) + self.p_interval[0] + # constraint cropped_length lower bound as 64 + cropped_length = np.minimum( + np.maximum(int(np.floor(valid_size * p)), 64), valid_size) + bias = np.random.randint(0, valid_size - cropped_length + 1) + data = data[:, begin + bias:begin + bias + cropped_length, :, :] + + # resize + data = np.transpose(data, (0, 2, 3, 1)).conjugate().reshape( + C * V * M, cropped_length) + data = data[None, None, :, :] + # could perform both up sample and down sample + data_tensor = paddle.to_tensor(data) + data_tensor = F.interpolate( + data_tensor, + size=(C * V * M, self.window_size), + mode='bilinear', + align_corners=False).squeeze() + data = paddle.transpose( + paddle.reshape(data_tensor, (C, V, M, self.window_size)), + (0, 3, 1, 2)).numpy() + else: + raise NotImplementedError + results['data'] = data + return results + + +@PIPELINES.register() +class SketeonModalityTransform(object): + """ + Sketeon Crop Sampler. + Args: + crop_model: str, crop model, support: ['center']. + p_interval: list, crop len + window_size: int, sample windows size. + """ + + def __init__(self, bone, motion, joint=True, graph='ntu_rgb_d'): + + self.joint = joint + self.bone = bone + self.motion = motion + self.graph = graph + if self.graph == "ntu_rgb_d": + self.bone_pairs = ((1, 2), (2, 21), (3, 21), (4, 3), (5, 21), + (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), + (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), + (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), + (22, 23), (21, 21), (23, 8), (24, 25), (25, 12)) + else: + raise NotImplementedError + + def __call__(self, results): + if self.joint: + return results + data_numpy = results['data'] + if self.bone: + bone_data_numpy = np.zeros_like(data_numpy) + for v1, v2 in self.bone_pairs: + bone_data_numpy[:, :, v1 - + 1] = data_numpy[:, :, v1 - + 1] - data_numpy[:, :, v2 - 1] + data_numpy = bone_data_numpy + if self.motion: + data_numpy[:, :-1] = data_numpy[:, 1:] - data_numpy[:, :-1] + data_numpy[:, -1] = 0 + results['data'] = data_numpy + return results + + +@PIPELINES.register() +class UniformSampleFrames: + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required keys are "total_frames", "start_index" , added or modified keys + are "frame_inds", "clip_len", "frame_interval" and "num_clips". + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Default: 1. 
+ test_mode (bool): Store True when building test or validation dataset. + Default: False. + seed (int): The random seed used during test time. Default: 255. + """ + + def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255): + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + + def _get_train_clips(self, num_frames, clip_len): + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + assert self.num_clips == 1 + if num_frames < clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + clip_len) + elif clip_len <= num_frames < 2 * clip_len: + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int64) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + return inds + + def _get_test_clips(self, num_frames, clip_len): + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + np.random.seed(self.seed) + if num_frames < clip_len: + # Then we use a simple strategy + if num_frames < self.num_clips: + start_inds = list(range(self.num_clips)) + else: + start_inds = [ + i * num_frames // self.num_clips + for i in range(self.num_clips) + ] + inds = np.concatenate( + [np.arange(i, i + clip_len) for i in start_inds]) + elif clip_len <= num_frames < clip_len * 2: + all_inds = [] + for i in range(self.num_clips): + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int64) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + all_inds.append(inds) + inds = np.concatenate(all_inds) + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + all_inds = [] + for i in range(self.num_clips): + offset = np.random.randint(bsize) + all_inds.append(bst + offset) + inds = np.concatenate(all_inds) + return inds + + def __call__(self, results): + num_frames = results['total_frames'] + + if self.test_mode: + inds = self._get_test_clips(num_frames, self.clip_len) + else: + inds = self._get_train_clips(num_frames, self.clip_len) + + inds = np.mod(inds, num_frames) + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + +@PIPELINES.register() +class PoseDecode: + """Load and decode pose with given indices. + + Required keys are "keypoint", "frame_inds" (optional), "keypoint_score" + (optional), added or modified keys are "keypoint", "keypoint_score" (if + applicable). + """ + + @staticmethod + def _load_kp(kp, frame_inds): + """Load keypoints given frame indices. + + Args: + kp (np.ndarray): The keypoint coordinates. 
+ frame_inds (np.ndarray): The frame indices. + """ + + return [x[frame_inds].astype(np.float32) for x in kp] + + @staticmethod + def _load_kpscore(kpscore, frame_inds): + """Load keypoint scores given frame indices. + + Args: + kpscore (np.ndarray): The confidence scores of keypoints. + frame_inds (np.ndarray): The frame indices. + """ + + return [x[frame_inds].astype(np.float32) for x in kpscore] + + def __call__(self, results): + + if 'frame_inds' not in results: + results['frame_inds'] = np.arange(results['total_frames']) + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + frame_inds = results['frame_inds'] + offset + + if 'keypoint_score' in results: + kpscore = results['keypoint_score'] + results['keypoint_score'] = kpscore[:, frame_inds].astype( + np.float32) + + if 'keypoint' in results: + results['keypoint'] = results['keypoint'][:, frame_inds].astype( + np.float32) + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}()' + return repr_str + + +@PIPELINES.register() +class PoseCompact: + """Convert the coordinates of keypoints to make it more compact. + Specifically, it first find a tight bounding box that surrounds all joints + in each frame, then we expand the tight box by a given padding ratio. For + example, if 'padding == 0.25', then the expanded box has unchanged center, + and 1.25x width and height. + + Required keys in results are "img_shape", "keypoint", add or modified keys + are "img_shape", "keypoint", "crop_quadruple". + + Args: + padding (float): The padding size. Default: 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Default: 10. + hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Default: None. + allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Default: True. + + Returns: + type: Description of returned object. + """ + + def __init__(self, + padding=0.25, + threshold=10, + hw_ratio=None, + allow_imgpad=True): + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + + self.hw_ratio = hw_ratio + + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def _combine_quadruple(self, a, b): + return (a[0] + a[2] * b[0], a[1] + a[3] * b[1], a[2] * b[2], + a[3] * b[3]) + + def __call__(self, results): + img_shape = results['img_shape'] + h, w = img_shape + kp = results['keypoint'] + + # Make NaN zero + kp[np.isnan(kp)] = 0. 
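        # Editor's note (descriptive comments, not part of the original patch):
        # the block below finds the tight box around all non-zero keypoints,
        # grows it by `padding` around the same center (with padding=0.25 a
        # 100x80 tight box becomes roughly 125x100), optionally enforces
        # hw_ratio, and finally shifts the keypoints into the new box while
        # updating 'img_shape' and 'crop_quadruple' for downstream transforms.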
+ kp_x = kp[..., 0] + kp_y = kp[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return results + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + + new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + + # the order is x, y, w, h (in [0, 1]), a tuple + crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.)) + new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w, + (max_y - min_y) / h) + crop_quadruple = self._combine_quadruple(crop_quadruple, + new_crop_quadruple) + results['crop_quadruple'] = crop_quadruple + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str + + +class CropBase: + @staticmethod + def _crop_kps(kps, crop_bbox): + return kps - crop_bbox[:2] + + @staticmethod + def _crop_imgs(imgs, crop_bbox): + x1, y1, x2, y2 = crop_bbox + return [img[y1:y2, x1:x2] for img in imgs] + + @staticmethod + def _box_crop(box, crop_bbox): + """Crop the bounding boxes according to the crop_bbox. + + Args: + box (np.ndarray): The bounding boxes. + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + + x1, y1, x2, y2 = crop_bbox + img_w, img_h = x2 - x1, y2 - y1 + + box_ = box.copy() + box_[..., 0::2] = np.clip(box[..., 0::2] - x1, 0, img_w - 1) + box_[..., 1::2] = np.clip(box[..., 1::2] - y1, 0, img_h - 1) + return box_ + + def _all_box_crop(self, results, crop_bbox): + """Crop the gt_bboxes and proposals in results according to crop_bbox. + + Args: + results (dict): All information about the sample, which contain + 'gt_bboxes' and 'proposals' (optional). + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + results['gt_bboxes'] = self._box_crop(results['gt_bboxes'], crop_bbox) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_crop(results['proposals'], + crop_bbox) + return results + + def __call__(self, results): + raise NotImplementedError + + +@PIPELINES.register() +class RandomResizedCrop_V2(CropBase): + """Random crop that specifics the area and height-weight ratio range. + + Required keys in results are "img_shape", "crop_bbox", "imgs" (optional), + "keypoint" (optional), added or modified keys are "imgs", "keypoint", + "crop_bbox" and "lazy"; Required keys in "lazy" are "flip", "crop_bbox", + added or modified key is "crop_bbox". 
+ + Args: + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect ratio range of + output cropped images. Default: (3 / 4, 4 / 3). + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, + area_range=(0.08, 1.0), + aspect_ratio_range=(3 / 4, 4 / 3), + lazy=False): + self.area_range = eval(area_range) + self.aspect_ratio_range = aspect_ratio_range + self.lazy = lazy + if not is_tuple_of(self.area_range, float): + raise TypeError(f'Area_range must be a tuple of float, ' + f'but got {type(area_range)}') + if not is_tuple_of(self.aspect_ratio_range, float): + raise TypeError(f'Aspect_ratio_range must be a tuple of float, ' + f'but got {type(aspect_ratio_range)}') + + @staticmethod + def get_crop_bbox(img_shape, + area_range, + aspect_ratio_range, + max_attempts=10): + """Get a crop bbox given the area range and aspect ratio range. + + Args: + img_shape (Tuple[int]): Image shape + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect + ratio range of output cropped images. Default: (3 / 4, 4 / 3). + max_attempts (int): The maximum of attempts. Default: 10. + max_attempts (int): Max attempts times to generate random candidate + bounding box. If it doesn't qualified one, the center bounding + box will be used. + Returns: + (list[int]) A random crop bbox within the area range and aspect + ratio range. + """ + assert 0 < area_range[0] <= area_range[1] <= 1 + assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1] + + img_h, img_w = img_shape + area = img_h * img_w + + min_ar, max_ar = aspect_ratio_range + aspect_ratios = np.exp( + np.random.uniform( + np.log(min_ar), np.log(max_ar), size=max_attempts)) + target_areas = np.random.uniform(*area_range, size=max_attempts) * area + candidate_crop_w = np.round(np.sqrt( + target_areas * aspect_ratios)).astype(np.int32) + candidate_crop_h = np.round(np.sqrt( + target_areas / aspect_ratios)).astype(np.int32) + + for i in range(max_attempts): + crop_w = candidate_crop_w[i] + crop_h = candidate_crop_h[i] + if crop_h <= img_h and crop_w <= img_w: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h + + # Fallback + crop_size = min(img_h, img_w) + x_offset = (img_w - crop_size) // 2 + y_offset = (img_h - crop_size) // 2 + return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size + + def __call__(self, results): + """Performs the RandomResizeCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
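        Example (editor's sketch, not part of the original patch; it exercises
        the static `get_crop_bbox` helper above directly, so no pipeline dict
        or instance is needed):

            >>> left, top, right, bottom = RandomResizedCrop_V2.get_crop_bbox(
            ...     (240, 320), (0.08, 1.0), (3 / 4, 4 / 3))
            >>> 0 <= left < right <= 320 and 0 <= top < bottom <= 240
            True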
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + + left, top, right, bottom = self.get_crop_bbox( + (img_h, img_w), self.area_range, self.aspect_ratio_range) + new_h, new_w = bottom - top, right - left + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'area_range={self.area_range}, ' + f'aspect_ratio_range={self.aspect_ratio_range}, ' + f'lazy={self.lazy})') + return repr_str + + +def is_seq_of(seq, expected_type, seq_type=None): + """Check whether it is a sequence of some type. + + Args: + seq (Sequence): The sequence to be checked. + expected_type (type): Expected type of sequence items. + seq_type (type, optional): Expected sequence type. + + Returns: + bool: Whether the sequence is valid. + """ + if seq_type is None: + exp_seq_type = abc.Sequence + else: + assert isinstance(seq_type, type) + exp_seq_type = seq_type + if not isinstance(seq, exp_seq_type): + return False + for item in seq: + if not isinstance(item, expected_type): + return False + return True + + +def is_tuple_of(seq, expected_type): + """Check whether it is a tuple of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=tuple) + + +@PIPELINES.register() +class CenterCrop_V2(CropBase): + """Crop the center area from images. + + Required keys are "img_shape", "imgs" (optional), "keypoint" (optional), + added or modified keys are "imgs", "keypoint", "crop_bbox", "lazy" and + "img_shape". Required keys in "lazy" is "crop_bbox", added or modified key + is "crop_bbox". + + Args: + crop_size (int | tuple[int]): (w, h) of crop size. + lazy (bool): Determine whether to apply lazy operation. 
Default: False. + """ + + def __init__(self, crop_size, lazy=False): + self.crop_size = _pair(crop_size) + self.lazy = lazy + if not is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def __call__(self, results): + """Performs the CenterCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + crop_w, crop_h = self.crop_size + + left = (img_w - crop_w) // 2 + top = (img_h - crop_h) // 2 + right = left + crop_w + bottom = top + crop_h + new_h, new_w = bottom - top, right - left + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(crop_size={self.crop_size}, ' + f'lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class Flip_V2: + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + + Required keys are "img_shape", "modality", "imgs" (optional), "keypoint" + (optional), added or modified keys are "imgs", "keypoint", "lazy" and + "flip_direction". Required keys in "lazy" is None, added or modified key + are "flip" and "flip_direction". The Flip augmentation should be placed + after any cropping / reshaping augmentations, to make sure crop_quadruple + is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". 
+ flip_label_map (Dict[int, int] | None): Transform the label of the + flipped image with the specific label. Default: None. + left_kp (list[int]): Indexes of left keypoints, used to flip keypoints. + Default: None. + right_kp (list[ind]): Indexes of right keypoints, used to flip + keypoints. Default: None. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, + flip_ratio=0.5, + direction='horizontal', + flip_label_map=None, + left_kp=None, + right_kp=None, + lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. ' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.flip_label_map = flip_label_map + self.left_kp = left_kp + self.right_kp = right_kp + self.lazy = lazy + + def _flip_imgs(self, imgs, modality): + _ = [imflip_(img, self.direction) for img in imgs] + lt = len(imgs) + if modality == 'Flow': + # The 1st frame of each 2 frames is flow-x + for i in range(0, lt, 2): + imgs[i] = iminvert(imgs[i]) + return imgs + + def _flip_kps(self, kps, kpscores, img_width): + kp_x = kps[..., 0] + kp_x[kp_x != 0] = img_width - kp_x[kp_x != 0] + new_order = list(range(kps.shape[2])) + if self.left_kp is not None and self.right_kp is not None: + for left, right in zip(self.left_kp, self.right_kp): + new_order[left] = right + new_order[right] = left + kps = kps[:, :, new_order] + if kpscores is not None: + kpscores = kpscores[:, :, new_order] + return kps, kpscores + + @staticmethod + def _box_flip(box, img_width): + """Flip the bounding boxes given the width of the image. + + Args: + box (np.ndarray): The bounding boxes. + img_width (int): The img width. + """ + box_ = box.copy() + box_[..., 0::4] = img_width - box[..., 2::4] + box_[..., 2::4] = img_width - box[..., 0::4] + return box_ + + def __call__(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
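        Example (editor's sketch, not part of the original patch; it shows the
        horizontal box flip applied below on a single made-up box in
        [x1, y1, x2, y2] order):

            >>> import numpy as np
            >>> box = np.array([[10., 20., 50., 60.]])
            >>> Flip_V2._box_flip(box, 100).tolist()
            [[50.0, 20.0, 90.0, 60.0]]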
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + assert self.direction == 'horizontal', ( + 'Only horizontal flips are' + 'supported for human keypoints') + + modality = results['modality'] + if modality == 'Flow': + assert self.direction == 'horizontal' + + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + img_width = results['img_shape'][1] + + if self.flip_label_map is not None and flip: + results['label'] = self.flip_label_map.get(results['label'], + results['label']) + + if not self.lazy: + if flip: + if 'imgs' in results: + results['imgs'] = self._flip_imgs(results['imgs'], modality) + if 'keypoint' in results: + kp = results['keypoint'] + kpscore = results.get('keypoint_score', None) + kp, kpscore = self._flip_kps(kp, kpscore, img_width) + results['keypoint'] = kp + if 'keypoint_score' in results: + results['keypoint_score'] = kpscore + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + width = results['img_shape'][1] + results['gt_bboxes'] = self._box_flip(results['gt_bboxes'], width) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_flip(results['proposals'], + width) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'flip_label_map={self.flip_label_map}, lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class FormatShape: + """Format final imgs shape to the given input_format. + + Required keys are "imgs", "num_clips" and "clip_len", added or modified + keys are "imgs" and "input_shape". + + Args: + input_format (str): Define the final imgs format. + collapse (bool): To collpase input_format N... to ... (NCTHW to CTHW, + etc.) if N is 1. Should be set as True when training and testing + detectors. Default: False. + """ + + def __init__(self, input_format, collapse=False): + self.input_format = input_format + self.collapse = collapse + if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']: + raise ValueError( + f'The input format {self.input_format} is invalid.') + + def __call__(self, results): + """Performs the FormatShape formating. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + if not isinstance(results['imgs'], np.ndarray): + results['imgs'] = np.array(results['imgs']) + imgs = results['imgs'] + # [M x H x W x C] + # M = 1 * N_crops * N_clips * L + if self.collapse: + assert results['num_clips'] == 1 + + if self.input_format == 'NCTHW': + num_clips = results['num_clips'] + clip_len = results['clip_len'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x L x H x W x C + imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) + # N_crops x N_clips x C x L x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x L x H x W + # M' = N_crops x N_clips + elif self.input_format == 'NCHW': + imgs = np.transpose(imgs, (0, 3, 1, 2)) + # M x C x H x W + elif self.input_format == 'NCHW_Flow': + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x L x H x W x C + imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4)) + # N_crops x N_clips x L x C x H x W + imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) + + imgs.shape[4:]) + # M' x C' x H x W + # M' = N_crops x N_clips + # C' = L x C + elif self.input_format == 'NPTCHW': + num_proposals = results['num_proposals'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = imgs.reshape((num_proposals, num_clips * clip_len) + + imgs.shape[1:]) + # P x M x H x W x C + # M = N_clips x L + imgs = np.transpose(imgs, (0, 1, 4, 2, 3)) + # P x M x C x H x W + + if self.collapse: + assert imgs.shape[0] == 1 + imgs = imgs.squeeze(0) + + results['imgs'] = imgs + results['input_shape'] = imgs.shape + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f"(input_format='{self.input_format}')" + return repr_str + + +@PIPELINES.register() +class Collect: + """Collect data from the loader relevant to the specific task. + + This keeps the items in ``keys`` as it is, and collect items in + ``meta_keys`` into a meta item called ``meta_name``.This is usually + the last stage of the data loader pipeline. + For example, when keys='imgs', meta_keys=('filename', 'label', + 'original_shape'), meta_name='img_metas', the results will be a dict with + keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of + another dict with keys 'filename', 'label', 'original_shape'. + + Args: + keys (Sequence[str]): Required keys to be collected. + meta_name (str): The name of the key that contains meta infomation. + This key is always populated. Default: "img_metas". + meta_keys (Sequence[str]): Keys that are collected under meta_name. + The contents of the ``meta_name`` dictionary depends on + ``meta_keys``. + By default this includes: + + - "filename": path to the image file + - "label": label of the image file + - "original_shape": original shape of the image as a tuple + (h, w, c) + - "img_shape": shape of the image input to the network as a tuple + (h, w, c). Note that images may be zero padded on the + bottom/right, if the batch tensor is larger than this shape. + - "pad_shape": image shape after padding + - "flip_direction": a str in ("horiziontal", "vertival") to + indicate if the image is fliped horizontally or vertically. + - "img_norm_cfg": a dict of normalization information: + - mean - per channel mean subtraction + - std - per channel std divisor + - to_rgb - bool indicating if bgr was converted to rgb + nested (bool): If set as True, will apply data[x] = [data[x]] to all + items in data. The arg is added for compatibility. 
Default: False. + """ + + def __init__(self, + keys, + meta_keys=('filename', 'label', 'original_shape', 'img_shape', + 'pad_shape', 'flip_direction', 'img_norm_cfg'), + meta_name='img_metas'): + self.keys = keys + self.meta_keys = meta_keys + self.meta_name = meta_name + + def __call__(self, results): + """Performs the Collect formating. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + data = [] + for key in self.keys: + data.append(results[key]) + + if len(self.meta_keys) != 0: + meta = {} + for key in self.meta_keys: + meta[key] = results[key] + data.append(meta) + + return data + + def __repr__(self): + return (f'{self.__class__.__name__}(' + f'keys={self.keys}, meta_keys={self.meta_keys}, ' + f'nested={self.nested})') + + +@PIPELINES.register() +class GeneratePoseTarget: + """Generate pseudo heatmaps based on joint coordinates and confidence. + + Required keys are "keypoint", "img_shape", "keypoint_score" (optional), + added or modified keys are "imgs". + + Args: + sigma (float): The sigma of the generated gaussian map. Default: 0.6. + use_score (bool): Use the confidence score of keypoints as the maximum + of the gaussian maps. Default: True. + with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True. + with_limb (bool): Generate pseudo heatmaps for limbs. At least one of + 'with_kp' and 'with_limb' should be True. Default: False. + skeletons (tuple[tuple]): The definition of human skeletons. + Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9), + (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15), + (6, 12), (12, 14), (14, 16), (11, 12)), + which is the definition of COCO-17p skeletons. + double (bool): Output both original heatmaps and flipped heatmaps. + Default: False. + left_kp (tuple[int]): Indexes of left keypoints, which is used when + flipping heatmaps. Default: (1, 3, 5, 7, 9, 11, 13, 15), + which is left keypoints in COCO-17p. + right_kp (tuple[int]): Indexes of right keypoints, which is used when + flipping heatmaps. Default: (2, 4, 6, 8, 10, 12, 14, 16), + which is right keypoints in COCO-17p. + """ + + def __init__(self, + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)), + double=False, + left_kp=(1, 3, 5, 7, 9, 11, 13, 15), + right_kp=(2, 4, 6, 8, 10, 12, 14, 16)): + + self.sigma = sigma + self.use_score = use_score + self.with_kp = with_kp + self.with_limb = with_limb + self.double = double + + # an auxiliary const + self.eps = 1e-4 + + assert self.with_kp or self.with_limb, ( + 'At least one of "with_limb" ' + 'and "with_kp" should be set as True.') + self.left_kp = left_kp + self.right_kp = right_kp + self.skeletons = skeletons + + def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values): + """Generate pseudo heatmap for one keypoint in one frame. + + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + centers (np.ndarray): The coordinates of corresponding keypoints + (of multiple persons). + sigma (float): The sigma of generated gaussian. + max_values (np.ndarray): The max values of each keypoint. + + Returns: + np.ndarray: The generated pseudo heatmap. 
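        Example (editor's sketch, not part of the original patch; one person
        with a single keypoint at the centre of an 8x8 map):

            >>> import numpy as np
            >>> gen = GeneratePoseTarget(sigma=1.0)
            >>> hm = gen.generate_a_heatmap(8, 8, np.array([[4., 4.]]), 1.0,
            ...                             np.array([1.]))
            >>> hm.shape, round(float(hm[4, 4]), 3)
            ((8, 8), 1.0)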
+ """ + + heatmap = np.zeros([img_h, img_w], dtype=np.float32) + + for center, max_value in zip(centers, max_values): + mu_x, mu_y = center[0], center[1] + if max_value < self.eps: + continue + + st_x = max(int(mu_x - 3 * sigma), 0) + ed_x = min(int(mu_x + 3 * sigma) + 1, img_w) + st_y = max(int(mu_y - 3 * sigma), 0) + ed_y = min(int(mu_y + 3 * sigma) + 1, img_h) + x = np.arange(st_x, ed_x, 1, np.float32) + y = np.arange(st_y, ed_y, 1, np.float32) + + # if the keypoint not in the heatmap coordinate system + if not (len(x) and len(y)): + continue + y = y[:, None] + + patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) + patch = patch * max_value + heatmap[st_y:ed_y, st_x:ed_x] = np.maximum( + heatmap[st_y:ed_y, st_x:ed_x], patch) + + return heatmap + + def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, + start_values, end_values): + """Generate pseudo heatmap for one limb in one frame. + + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + starts (np.ndarray): The coordinates of one keypoint in the + corresponding limbs (of multiple persons). + ends (np.ndarray): The coordinates of the other keypoint in the + corresponding limbs (of multiple persons). + sigma (float): The sigma of generated gaussian. + start_values (np.ndarray): The max values of one keypoint in the + corresponding limbs. + end_values (np.ndarray): The max values of the other keypoint in + the corresponding limbs. + + Returns: + np.ndarray: The generated pseudo heatmap. + """ + + heatmap = np.zeros([img_h, img_w], dtype=np.float32) + + for start, end, start_value, end_value in zip(starts, ends, + start_values, end_values): + value_coeff = min(start_value, end_value) + if value_coeff < self.eps: + continue + + min_x, max_x = min(start[0], end[0]), max(start[0], end[0]) + min_y, max_y = min(start[1], end[1]), max(start[1], end[1]) + + min_x = max(int(min_x - 3 * sigma), 0) + max_x = min(int(max_x + 3 * sigma) + 1, img_w) + min_y = max(int(min_y - 3 * sigma), 0) + max_y = min(int(max_y + 3 * sigma) + 1, img_h) + + x = np.arange(min_x, max_x, 1, np.float32) + y = np.arange(min_y, max_y, 1, np.float32) + + if not (len(x) and len(y)): + continue + + y = y[:, None] + x_0 = np.zeros_like(x) + y_0 = np.zeros_like(y) + + # distance to start keypoints + d2_start = ((x - start[0])**2 + (y - start[1])**2) + + # distance to end keypoints + d2_end = ((x - end[0])**2 + (y - end[1])**2) + + # the distance between start and end keypoints. + d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2) + + if d2_ab < 1: + full_map = self.generate_a_heatmap(img_h, img_w, [start], sigma, + [start_value]) + heatmap = np.maximum(heatmap, full_map) + continue + + coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab + + a_dominate = coeff <= 0 + b_dominate = coeff >= 1 + seg_dominate = 1 - a_dominate - b_dominate + + position = np.stack([x + y_0, y + x_0], axis=-1) + projection = start + np.stack([coeff, coeff], + axis=-1) * (end - start) + d2_line = position - projection + d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2 + d2_seg = (a_dominate * d2_start + b_dominate * d2_end + + seg_dominate * d2_line) + + patch = np.exp(-d2_seg / 2. / sigma**2) + patch = patch * value_coeff + + heatmap[min_y:max_y, min_x:max_x] = np.maximum( + heatmap[min_y:max_y, min_x:max_x], patch) + + return heatmap + + def generate_heatmap(self, img_h, img_w, kps, sigma, max_values): + """Generate pseudo heatmap for all keypoints and limbs in one frame (if + needed). 
+ + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + kps (np.ndarray): The coordinates of keypoints in this frame. + sigma (float): The sigma of generated gaussian. + max_values (np.ndarray): The confidence score of each keypoint. + + Returns: + np.ndarray: The generated pseudo heatmap. + """ + + heatmaps = [] + if self.with_kp: + num_kp = kps.shape[1] + for i in range(num_kp): + heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i], + sigma, max_values[:, i]) + heatmaps.append(heatmap) + + if self.with_limb: + for limb in self.skeletons: + start_idx, end_idx = limb + starts = kps[:, start_idx] + ends = kps[:, end_idx] + + start_values = max_values[:, start_idx] + end_values = max_values[:, end_idx] + heatmap = self.generate_a_limb_heatmap( + img_h, img_w, starts, ends, sigma, start_values, end_values) + heatmaps.append(heatmap) + + return np.stack(heatmaps, axis=-1) + + def gen_an_aug(self, results): + """Generate pseudo heatmaps for all frames. + + Args: + results (dict): The dictionary that contains all info of a sample. + + Returns: + list[np.ndarray]: The generated pseudo heatmaps. + """ + + all_kps = results['keypoint'] + kp_shape = all_kps.shape + + if 'keypoint_score' in results: + all_kpscores = results['keypoint_score'] + else: + all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32) + + img_h, img_w = results['img_shape'] + num_frame = kp_shape[1] + + imgs = [] + for i in range(num_frame): + sigma = self.sigma + kps = all_kps[:, i] + kpscores = all_kpscores[:, i] + + max_values = np.ones(kpscores.shape, dtype=np.float32) + if self.use_score: + max_values = kpscores + + hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values) + imgs.append(hmap) + + return imgs + + def __call__(self, results): + if not self.double: + results['imgs'] = np.stack(self.gen_an_aug(results)) + else: + results_ = cp.deepcopy(results) + flip = Flip_V2( + flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp) + results_ = flip(results_) + results['imgs'] = np.concatenate( + [self.gen_an_aug(results), + self.gen_an_aug(results_)]) + results['label'] = np.array([results['label']]) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'sigma={self.sigma}, ' + f'use_score={self.use_score}, ' + f'with_kp={self.with_kp}, ' + f'with_limb={self.with_limb}, ' + f'skeletons={self.skeletons}, ' + f'double={self.double}, ' + f'left_kp={self.left_kp}, ' + f'right_kp={self.right_kp})') + return repr_str diff --git a/docs/src/paddlevideo/loader/registry.py b/docs/src/paddlevideo/loader/registry.py new file mode 100644 index 000000000..add663104 --- /dev/null +++ b/docs/src/paddlevideo/loader/registry.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
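# Editor's note (not part of the original patch): these registries are the glue
# of the loader package. Pipeline classes above register themselves with
# `@PIPELINES.register()` and dataset classes with `@DATASETS.register()`, and
# they are then typically looked up by class name when datasets and pipelines
# are built from a config.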
+ +from ..utils import Registry + +PIPELINES = Registry("pipeline") +DATASETS = Registry("datasets") diff --git a/docs/src/paddlevideo/metrics/ActivityNet/__init__.py b/docs/src/paddlevideo/metrics/ActivityNet/__init__.py new file mode 100644 index 000000000..eefabbd72 --- /dev/null +++ b/docs/src/paddlevideo/metrics/ActivityNet/__init__.py @@ -0,0 +1,3 @@ +from .anet_prop import ANETproposal + +__all__ = ['ANETproposal'] diff --git a/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py b/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py new file mode 100644 index 000000000..411b164f9 --- /dev/null +++ b/docs/src/paddlevideo/metrics/ActivityNet/anet_prop.py @@ -0,0 +1,359 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import json +import numpy as np +import pandas as pd +import urllib.request as urllib2 +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +class ANETproposal(object): + """ + This class is used for calculating AR@N and AUC; + Code transfer from ActivityNet Gitub repository](https://github.com/activitynet/ActivityNet.git) + """ + GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version'] + PROPOSAL_FIELDS = ['results', 'version', 'external_data'] + API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py' + + def __init__(self, + ground_truth_filename=None, + proposal_filename=None, + ground_truth_fields=GROUND_TRUTH_FIELDS, + proposal_fields=PROPOSAL_FIELDS, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + max_avg_nr_proposals=None, + subset='validation', + verbose=False, + check_status=True): + if not ground_truth_filename: + raise IOError('Please input a valid ground truth file.') + if not proposal_filename: + raise IOError('Please input a valid proposal file.') + self.subset = subset + self.tiou_thresholds = tiou_thresholds + self.max_avg_nr_proposals = max_avg_nr_proposals + self.verbose = verbose + self.gt_fields = ground_truth_fields + self.pred_fields = proposal_fields + self.recall = None + self.avg_recall = None + self.proposals_per_video = None + self.check_status = check_status + # Retrieve blocked videos from server. + if self.check_status: + self.blocked_videos = self.get_blocked_videos() + else: + self.blocked_videos = list() + # Import ground truth and proposals. + self.ground_truth, self.activity_index = self._import_ground_truth( + ground_truth_filename) + self.proposal = self._import_proposal(proposal_filename) + + if self.verbose: + print('[INIT] Loaded annotations from {} subset.'.format(subset)) + nr_gt = len(self.ground_truth) + print('\tNumber of ground truth instances: {}'.format(nr_gt)) + nr_pred = len(self.proposal) + print('\tNumber of proposals: {}'.format(nr_pred)) + print('\tFixed threshold for tiou score: {}'.format( + self.tiou_thresholds)) + + def _import_ground_truth(self, ground_truth_filename): + """ + Reads ground truth file, checks if it is well formatted, and returns + the ground truth instances and the activity classes. + + Parameters: + ground_truth_filename (str): full path to the ground truth json file. 
+ Returns: + ground_truth (df): Data frame containing the ground truth instances. + activity_index (dict): Dictionary containing class index. + """ + with open(ground_truth_filename, 'r') as fobj: + data = json.load(fobj) + # Checking format + if not all([field in data.keys() for field in self.gt_fields]): + raise IOError('Please input a valid ground truth file.') + + # Read ground truth data. + activity_index, cidx = {}, 0 + video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], [] + for videoid, v in data['database'].items(): + if self.subset != v['subset']: + continue + if videoid in self.blocked_videos: + continue + for ann in v['annotations']: + if ann['label'] not in activity_index: + activity_index[ann['label']] = cidx + cidx += 1 + video_lst.append(videoid) + t_start_lst.append(float(ann['segment'][0])) + t_end_lst.append(float(ann['segment'][1])) + label_lst.append(activity_index[ann['label']]) + + ground_truth = pd.DataFrame({ + 'video-id': video_lst, + 't-start': t_start_lst, + 't-end': t_end_lst, + 'label': label_lst + }) + return ground_truth, activity_index + + def _import_proposal(self, proposal_filename): + """ + Reads proposal file, checks if it is well formatted, and returns + the proposal instances. + + Parameters: + proposal_filename (str): Full path to the proposal json file. + Returns: + proposal (df): Data frame containing the proposal instances. + """ + with open(proposal_filename, 'r') as fobj: + data = json.load(fobj) + # Checking format... + if not all([field in data.keys() for field in self.pred_fields]): + raise IOError('Please input a valid proposal file.') + + # Read predictions. + video_lst, t_start_lst, t_end_lst = [], [], [] + score_lst = [] + for videoid, v in data['results'].items(): + if videoid in self.blocked_videos: + continue + for result in v: + video_lst.append(videoid) + t_start_lst.append(float(result['segment'][0])) + t_end_lst.append(float(result['segment'][1])) + score_lst.append(result['score']) + proposal = pd.DataFrame({ + 'video-id': video_lst, + 't-start': t_start_lst, + 't-end': t_end_lst, + 'score': score_lst + }) + return proposal + + def evaluate(self): + """ + Evaluates a proposal file. To measure the performance of a + method for the proposal task, we computes the area under the + average recall vs average number of proposals per video curve. + """ + recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals( + self.ground_truth, + self.proposal, + max_avg_nr_proposals=self.max_avg_nr_proposals, + tiou_thresholds=self.tiou_thresholds) + + area_under_curve = np.trapz(avg_recall, proposals_per_video) + + if self.verbose: + print('[RESULTS] Performance on ActivityNet proposal task.') + with open("data/bmn/BMN_Test_results/auc_result.txt", + "a") as text_file: + text_file.write( + '\tArea Under the AR vs AN curve: {}% \n'.format( + 100. * float(area_under_curve) / + proposals_per_video[-1])) + print('\tArea Under the AR vs AN curve: {}%'.format( + 100. * float(area_under_curve) / proposals_per_video[-1])) + + self.recall = recall + self.avg_recall = avg_recall + self.proposals_per_video = proposals_per_video + + def average_recall_vs_avg_nr_proposals(self, + ground_truth, + proposals, + max_avg_nr_proposals=None, + tiou_thresholds=np.linspace( + 0.5, 0.95, 10)): + """ + Computes the average recall given an average number of + proposals per video. + + Parameters: + ground_truth(df): Data frame containing the ground truth instances. 
+ Required fields: ['video-id', 't-start', 't-end'] + proposal(df): Data frame containing the proposal instances. + Required fields: ['video-id, 't-start', 't-end', 'score'] + tiou_thresholds(1d-array | optional): array with tiou thresholds. + + Returns: + recall(2d-array): recall[i,j] is recall at ith tiou threshold at the jth + average number of average number of proposals per video. + average_recall(1d-array): recall averaged over a list of tiou threshold. + This is equivalent to recall.mean(axis=0). + proposals_per_video(1d-array): average number of proposals per video. + """ + + # Get list of videos. + video_lst = ground_truth['video-id'].unique() + + if not max_avg_nr_proposals: + max_avg_nr_proposals = float( + proposals.shape[0]) / video_lst.shape[0] + + ratio = max_avg_nr_proposals * float( + video_lst.shape[0]) / proposals.shape[0] + + # Adaptation to query faster + ground_truth_gbvn = ground_truth.groupby('video-id') + proposals_gbvn = proposals.groupby('video-id') + + # For each video, computes tiou scores among the retrieved proposals. + score_lst = [] + total_nr_proposals = 0 + for videoid in video_lst: + # Get ground-truth instances associated to this video. + ground_truth_videoid = ground_truth_gbvn.get_group(videoid) + this_video_ground_truth = ground_truth_videoid.loc[:, [ + 't-start', 't-end' + ]].values + + # Get proposals for this video. + try: + proposals_videoid = proposals_gbvn.get_group(videoid) + except: + n = this_video_ground_truth.shape[0] + score_lst.append(np.zeros((n, 1))) + continue + + this_video_proposals = proposals_videoid.loc[:, + ['t-start', 't-end' + ]].values + + if this_video_proposals.shape[0] == 0: + n = this_video_ground_truth.shape[0] + score_lst.append(np.zeros((n, 1))) + continue + + # Sort proposals by score. + sort_idx = proposals_videoid['score'].argsort()[::-1] + this_video_proposals = this_video_proposals[sort_idx, :] + + if this_video_proposals.ndim != 2: + this_video_proposals = np.expand_dims(this_video_proposals, + axis=0) + if this_video_ground_truth.ndim != 2: + this_video_ground_truth = np.expand_dims( + this_video_ground_truth, axis=0) + + nr_proposals = np.minimum( + int(this_video_proposals.shape[0] * ratio), + this_video_proposals.shape[0]) + total_nr_proposals += nr_proposals + this_video_proposals = this_video_proposals[:nr_proposals, :] + + # Compute tiou scores. + tiou = self.wrapper_segment_iou(this_video_proposals, + this_video_ground_truth) + score_lst.append(tiou) + + # Given that the length of the videos is really varied, we + # compute the number of proposals in terms of a ratio of the total + # proposals retrieved, i.e. average recall at a percentage of proposals + # retrieved per video. + + # Computes average recall. + pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float( + video_lst.shape[0]) / total_nr_proposals) + matches = np.empty((video_lst.shape[0], pcn_lst.shape[0])) + positives = np.empty(video_lst.shape[0]) + recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0])) + # Iterates over each tiou threshold. + for ridx, tiou in enumerate(tiou_thresholds): + + # Inspect positives retrieved per video at different + # number of proposals (percentage of the total retrieved). + for i, score in enumerate(score_lst): + # Total positives per video. + positives[i] = score.shape[0] + # Find proposals that satisfies minimum tiou threshold. + true_positives_tiou = score >= tiou + # Get number of proposals as a percentage of total retrieved. 
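            # Editor's note (descriptive comment, not in the original patch):
            # `score` has shape (num_gt, num_proposals) for this video, so
            # `pcn_proposals[j]` below is the number of top-scoring proposals
            # kept at the j-th percentage, and matches[i, j] counts how many
            # ground-truth segments are covered by at least one of them at the
            # current tIoU threshold.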
+ pcn_proposals = np.minimum( + (score.shape[1] * pcn_lst).astype(int), score.shape[1]) + + for j, nr_proposals in enumerate(pcn_proposals): + # Compute the number of matches for each percentage of the proposals + matches[i, j] = np.count_nonzero( + (true_positives_tiou[:, :nr_proposals]).sum(axis=1)) + + # Computes recall given the set of matches per video. + recall[ridx, :] = matches.sum(axis=0) / positives.sum() + + # Recall is averaged. + avg_recall = recall.mean(axis=0) + + # Get the average number of proposals per video. + proposals_per_video = pcn_lst * (float(total_nr_proposals) / + video_lst.shape[0]) + + return recall, avg_recall, proposals_per_video + + def get_blocked_videos(self, api=API): + api_url = '{}?action=get_blocked'.format(api) + req = urllib2.Request(api_url) + response = urllib2.urlopen(req) + return json.loads(response.read()) + + def wrapper_segment_iou(self, target_segments, candidate_segments): + """ + Compute intersection over union btw segments + Parameters: + target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]] + candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]] + Returns: + tiou(nd-array): 2-dim array [n x m] with IOU ratio. + Note: It assumes that candidate-segments are more scarce that target-segments + """ + if candidate_segments.ndim != 2 or target_segments.ndim != 2: + raise ValueError('Dimension of arguments is incorrect') + + n, m = candidate_segments.shape[0], target_segments.shape[0] + tiou = np.empty((n, m)) + for i in range(m): + tiou[:, i] = self.segment_iou(target_segments[i, :], + candidate_segments) + + return tiou + + def segment_iou(self, target_segment, candidate_segments): + """ + Compute the temporal intersection over union between a + target segment and all the test segments. + + Parameters: + target_segment(1d-array): Temporal target segment containing [starting, ending] times. + candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times. + + Returns: + tiou(1d-array): Temporal intersection over union score of the N's candidate segments. + """ + tt1 = np.maximum(target_segment[0], candidate_segments[:, 0]) + tt2 = np.minimum(target_segment[1], candidate_segments[:, 1]) + # Intersection including Non-negative overlap score. + segments_intersection = (tt2 - tt1).clip(0) + # Segment union. + segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \ + + (target_segment[1] - target_segment[0]) - segments_intersection + # Compute overlap as the ratio of the intersection + # over union of two segments. + tIoU = segments_intersection.astype(float) / segments_union + return tIoU diff --git a/docs/src/paddlevideo/metrics/__init__.py b/docs/src/paddlevideo/metrics/__init__.py new file mode 100644 index 000000000..b693b876c --- /dev/null +++ b/docs/src/paddlevideo/metrics/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
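# Editor's note (not part of the original patch): importing the concrete metric
# classes here is assumed to register them with the METRIC registry as a side
# effect, mirroring the PIPELINES pattern above; downstream code would then
# build a metric via `build_metric(...)` rather than importing a class directly.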
+ +from .bmn_metric import BMNMetric +from .build import build_metric +from .center_crop_metric import CenterCropMetric +from .depth_metric import DepthMetric +from .msrvtt_metric import MSRVTTMetric +from .multi_crop_metric import MultiCropMetric +from .registry import METRIC +from .skeleton_metric import SkeletonMetric +from .transnetv2_metric import TransNetV2Metric +from .youtube8m.eval_util import HitOneMetric +from .segmentation_metric import SegmentationMetric +from .ava_metric import AVAMetric +from .vos_metric import VOSMetric +from .center_crop_metric_MRI import CenterCropMetric_MRI +from .yowo_metric import YOWOMetric + +__all__ = [ + 'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric', + 'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric', + 'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI','AVAMetric', + 'SegmentationMetric', 'YOWOMetric' +] diff --git a/docs/src/paddlevideo/metrics/ava_evaluation/README.md b/docs/src/paddlevideo/metrics/ava_evaluation/README.md new file mode 100644 index 000000000..7414d0fbb --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_evaluation/README.md @@ -0,0 +1,2 @@ +The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). +Some unused codes are removed to minimize the length of codes added. diff --git a/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py b/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py new file mode 100644 index 000000000..13eb03469 --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_evaluation/metrics.py @@ -0,0 +1,143 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Functions for computing metrics like precision, recall, CorLoc and etc.""" + +import numpy as np + + +def compute_precision_recall(scores, labels, num_gt): + """Compute precision and recall. + + Args: + scores: A float numpy array representing detection score + labels: A boolean numpy array representing true/false positive labels + num_gt: Number of ground truth instances + + Raises: + ValueError: if the input is not of the correct format + + Returns: + precision: Fraction of positive instances over detected ones. This + value is None if no ground truth labels are present. + recall: Fraction of detected positive instance over all positive + instances. This value is None if no ground truth labels are + present. 
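    Example (editor's sketch, not part of the original patch; the type checks
    below use the `np.bool`/`np.float` aliases, so this assumes a NumPy
    version that still provides them):

        >>> import numpy as np
        >>> scores = np.array([0.9, 0.8, 0.7])
        >>> labels = np.array([True, False, True])
        >>> precision, recall = compute_precision_recall(scores, labels, 2)
        >>> recall.tolist()
        [0.5, 0.5, 1.0]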
+ """ + if (not isinstance(labels, np.ndarray) or labels.dtype != np.bool + or len(labels.shape) != 1): + raise ValueError('labels must be single dimension bool numpy array') + + if not isinstance(scores, np.ndarray) or len(scores.shape) != 1: + raise ValueError('scores must be single dimension numpy array') + + if num_gt < np.sum(labels): + raise ValueError( + 'Number of true positives must be smaller than num_gt.') + + if len(scores) != len(labels): + raise ValueError('scores and labels must be of the same size.') + + if num_gt == 0: + return None, None + + sorted_indices = np.argsort(scores) + sorted_indices = sorted_indices[::-1] + labels = labels.astype(int) + true_positive_labels = labels[sorted_indices] + false_positive_labels = 1 - true_positive_labels + cum_true_positives = np.cumsum(true_positive_labels) + cum_false_positives = np.cumsum(false_positive_labels) + precision = cum_true_positives.astype(float) / ( + cum_true_positives + cum_false_positives) + recall = cum_true_positives.astype(float) / num_gt + return precision, recall + + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + + Precision is modified to ensure that it does not decrease as recall + decrease. + + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + + Raises: + ValueError: if the input is not of the correct format + + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. + """ + if precision is None: + if recall is not None: + raise ValueError('If precision is None, recall must also be None') + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError('precision and recall must be numpy array') + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError('input must be float numpy array.') + if len(precision) != len(recall): + raise ValueError('precision and recall must be of the same size.') + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError('Precision must be in the range of [0, 1].') + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError('recall must be in the range of [0, 1].') + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError('recall must be a non-decreasing array') + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Preprocess precision to be a non-decreasing array + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def compute_cor_loc(num_gt_imgs_per_class, + num_images_correctly_detected_per_class): + """Compute CorLoc according to the definition in the following paper. + + https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf + + Returns nans if there are no ground truth images for a class. 
+ + Args: + num_gt_imgs_per_class: 1D array, representing number of images + containing at least one object instance of a particular class + num_images_correctly_detected_per_class: 1D array, representing number + of images that are correctly detected at least one object instance + of a particular class + + Returns: + corloc_per_class: A float numpy array represents the corloc score of + each class + """ + # Divide by zero expected for classes with no gt examples. + with np.errstate(divide='ignore', invalid='ignore'): + return np.where( + num_gt_imgs_per_class == 0, np.nan, + num_images_correctly_detected_per_class / num_gt_imgs_per_class) diff --git a/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py b/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py new file mode 100644 index 000000000..f9b101e6f --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_evaluation/np_box_list.py @@ -0,0 +1,138 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Numpy BoxList classes and functions.""" + +import numpy as np + + +class BoxList: + """Box collection. + + BoxList represents a list of bounding boxes as numpy array, where each + bounding box is represented as a row of 4 numbers, + [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within + a given list correspond to a single image. + + Optionally, users can add additional related fields (such as + objectness/classification scores). + """ + + def __init__(self, data): + """Constructs box collection. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Raises: + ValueError: if bbox data is not a numpy array + ValueError: if invalid dimensions for bbox data + """ + if not isinstance(data, np.ndarray): + raise ValueError('data must be a numpy array.') + if len(data.shape) != 2 or data.shape[1] != 4: + raise ValueError('Invalid dimensions for box data.') + if data.dtype != np.float32 and data.dtype != np.float64: + raise ValueError( + 'Invalid data type for box data: float is required.') + if not self._is_valid_boxes(data): + raise ValueError('Invalid box data. data must be a numpy array of ' + 'N*[y_min, x_min, y_max, x_max]') + self.data = {'boxes': data} + + def num_boxes(self): + """Return number of boxes held in collections.""" + return self.data['boxes'].shape[0] + + def get_extra_fields(self): + """Return all non-box fields.""" + return [k for k in self.data if k != 'boxes'] + + def has_field(self, field): + return field in self.data + + def add_field(self, field, field_data): + """Add data to a specified field. + + Args: + field: a string parameter used to speficy a related field to be + accessed. + field_data: a numpy array of [N, ...] representing the data + associated with the field. + Raises: + ValueError: if the field is already exist or the dimension of the + field data does not matches the number of boxes. 
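+
+        Example (illustrative sketch; the box and score values are made up):
+            >>> import numpy as np
+            >>> boxlist = BoxList(
+            ...     np.array([[0.25, 0.25, 0.75, 0.75]], dtype=np.float32))
+            >>> boxlist.add_field('scores', np.array([0.9], dtype=np.float32))
+            >>> # boxlist.get_field('scores') now returns the score array and
+            >>> # boxlist.get_extra_fields() == ['scores'].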
+ """ + if self.has_field(field): + raise ValueError('Field ' + field + 'already exists') + if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes( + ): + raise ValueError('Invalid dimensions for field data') + self.data[field] = field_data + + def get(self): + """Convenience function for accesssing box coordinates. + + Returns: + a numpy array of shape [N, 4] representing box corners + """ + return self.get_field('boxes') + + def get_field(self, field): + """Accesses data associated with the specified field in the box + collection. + + Args: + field: a string parameter used to speficy a related field to be + accessed. + + Returns: + a numpy 1-d array representing data of an associated field + + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError(f'field {field} does not exist') + return self.data[field] + + def get_coordinates(self): + """Get corner coordinates of boxes. + + Returns: + a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] + """ + box_coordinates = self.get() + y_min = box_coordinates[:, 0] + x_min = box_coordinates[:, 1] + y_max = box_coordinates[:, 2] + x_max = box_coordinates[:, 3] + return [y_min, x_min, y_max, x_max] + + def _is_valid_boxes(self, data): + """Check whether data fullfills the format of N*[ymin, xmin, ymax, + xmin]. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Returns: + a boolean indicating whether all ymax of boxes are equal or greater + than ymin, and all xmax of boxes are equal or greater than xmin. + """ + if len(data): + for v in data: + if v[0] > v[2] or v[1] > v[3]: + return False + return True diff --git a/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py b/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py new file mode 100644 index 000000000..94e7d300c --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_evaluation/np_box_ops.py @@ -0,0 +1,98 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for [N, 4] numpy arrays representing bounding boxes. + +Example box operations that are supported: + * Areas: compute bounding box areas + * IOU: pairwise intersection-over-union scores +""" + +import numpy as np + + +def area(boxes): + """Computes area of boxes. + + Args: + boxes: Numpy array with shape [N, 4] holding N boxes + + Returns: + a numpy array with shape [N*1] representing box areas + """ + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def intersection(boxes1, boxes2): + """Compute pairwise intersection areas between boxes. 
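+
+    The pairwise intersection is the product of the vertical and the
+    horizontal overlap, each clipped at zero; e.g. (illustrative) boxes
+    [0, 0, 2, 2] and [1, 1, 3, 3] intersect with area 1.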
+ + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes + boxes2: a numpy array with shape [M, 4] holding M boxes + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) + [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) + + all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) + all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) + intersect_heights = np.maximum( + np.zeros(all_pairs_max_ymin.shape), + all_pairs_min_ymax - all_pairs_max_ymin) + all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) + all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) + intersect_widths = np.maximum( + np.zeros(all_pairs_max_xmin.shape), + all_pairs_min_xmax - all_pairs_max_xmin) + return intersect_heights * intersect_widths + + +def iou(boxes1, boxes2): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + intersect = intersection(boxes1, boxes2) + area1 = area(boxes1) + area2 = area(boxes2) + union = ( + np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) - + intersect) + return intersect / union + + +def ioa(boxes1, boxes2): + """Computes pairwise intersection-over-area between box collections. + + Intersection-over-area (ioa) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, IOA(box1, box2) != IOA(box2, box1). + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + """ + intersect = intersection(boxes1, boxes2) + areas = np.expand_dims(area(boxes2), axis=0) + return intersect / areas diff --git a/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py b/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py new file mode 100644 index 000000000..c9f00540f --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py @@ -0,0 +1,658 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""object_detection_evaluation module. + +ObjectDetectionEvaluation is a class which manages ground truth information of +a object detection dataset, and computes frequently used detection metrics such +as Precision, Recall, CorLoc of the provided detection results. +It supports the following operations: +1) Add ground truth information of images sequentially. +2) Add detection result of images sequentially. +3) Evaluate detection metrics on already inserted detection results. 
+4) Write evaluation result into a pickle file for future processing or + visualization. + +Note: This module operates on numpy boxes and box lists. +""" +import collections +import logging +from abc import ABCMeta, abstractmethod + +import numpy as np + +from . import metrics, per_image_evaluation, standard_fields + + +class DetectionEvaluator: + """Interface for object detection evalution classes. + + Example usage of the Evaluator: + ------------------------------ + evaluator = DetectionEvaluator(categories) + + # Detections and groundtruth for image 1. + evaluator.add_single_groundtruth_image_info(...) + evaluator.add_single_detected_image_info(...) + + # Detections and groundtruth for image 2. + evaluator.add_single_groundtruth_image_info(...) + evaluator.add_single_detected_image_info(...) + + metrics_dict = evaluator.evaluate() + """ + + __metaclass__ = ABCMeta + + def __init__(self, categories): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name e.g., + 'cat', 'dog'. + """ + self._categories = categories + + @abstractmethod + def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary of groundtruth numpy arrays required + for evaluations. + """ + + @abstractmethod + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary of detection numpy arrays required + for evaluation. + """ + + @abstractmethod + def evaluate(self): + """Evaluates detections and returns a dictionary of metrics.""" + + @abstractmethod + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + + +class ObjectDetectionEvaluator(DetectionEvaluator): + """A class to evaluate detections.""" + + def __init__( + self, + categories, + matching_iou_threshold=0.5, + evaluate_corlocs=False, + metric_prefix=None, + use_weighted_mean_ap=False, + evaluate_masks=False, + ): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name e.g., + 'cat', 'dog'. + matching_iou_threshold: IOU threshold to use for matching + groundtruth boxes to detection boxes. + evaluate_corlocs: (optional) boolean which determines if corloc + scores are to be returned or not. + metric_prefix: (optional) string prefix for metric name; if None, + no prefix is used. + use_weighted_mean_ap: (optional) boolean which determines if the + mean average precision is computed directly from the scores and + tp_fp_labels of all classes. + evaluate_masks: If False, evaluation will be performed based on + boxes. If True, mask evaluation will be performed instead. + + Raises: + ValueError: If the category ids are not 1-indexed. 
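+
+        Example (illustrative sketch; the category names are made up):
+            >>> categories = [{'id': 1, 'name': 'stand'},
+            ...               {'id': 2, 'name': 'watch'}]
+            >>> evaluator = ObjectDetectionEvaluator(
+            ...     categories, matching_iou_threshold=0.5)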
+ """ + super(ObjectDetectionEvaluator, self).__init__(categories) + self._num_classes = max([cat['id'] for cat in categories]) + if min(cat['id'] for cat in categories) < 1: + raise ValueError('Classes should be 1-indexed.') + self._matching_iou_threshold = matching_iou_threshold + self._use_weighted_mean_ap = use_weighted_mean_ap + self._label_id_offset = 1 + self._evaluate_masks = evaluate_masks + self._evaluation = ObjectDetectionEvaluation( + num_groundtruth_classes=self._num_classes, + matching_iou_threshold=self._matching_iou_threshold, + use_weighted_mean_ap=self._use_weighted_mean_ap, + label_id_offset=self._label_id_offset, + ) + self._image_ids = set([]) + self._evaluate_corlocs = evaluate_corlocs + self._metric_prefix = (metric_prefix + '_') if metric_prefix else '' + + def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary containing - + standard_fields.InputDataFields.groundtruth_boxes: float32 + numpy array of shape [num_boxes, 4] containing `num_boxes` + groundtruth boxes of the format [ymin, xmin, ymax, xmax] in + absolute image coordinates. + standard_fields.InputDataFields.groundtruth_classes: integer + numpy array of shape [num_boxes] containing 1-indexed + groundtruth classes for the boxes. + standard_fields.InputDataFields.groundtruth_difficult: Optional + length M numpy boolean array denoting whether a ground + truth box is a difficult instance or not. This field is + optional to support the case that no boxes are difficult. + standard_fields.InputDataFields.groundtruth_instance_masks: + Optional numpy array of shape [num_boxes, height, width] + with values in {0, 1}. + + Raises: + ValueError: On adding groundtruth for an image more than once. Will + also raise error if instance masks are not in groundtruth + dictionary. + """ + if image_id in self._image_ids: + raise ValueError( + 'Image with id {} already added.'.format(image_id)) + + groundtruth_classes = ( + groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_classes] - + self._label_id_offset) + # If the key is not present in the groundtruth_dict or the array is + # empty (unless there are no annotations for the groundtruth on this + # image) use values from the dictionary or insert None otherwise. 
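+        # groundtruth_difficult is optional: when the field is missing (or
+        # empty while boxes exist), it is set to None below and the inner
+        # evaluation treats every groundtruth box as non-difficult.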
+ if (standard_fields.InputDataFields.groundtruth_difficult + in groundtruth_dict.keys()) and (groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_difficult].size + or + not groundtruth_classes.size): + groundtruth_difficult = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_difficult] + else: + groundtruth_difficult = None + if not len(self._image_ids) % 1000: + logging.warn(('image %s does not have groundtruth difficult ' + 'flag specified'), image_id) + groundtruth_masks = None + if self._evaluate_masks: + if (standard_fields.InputDataFields.groundtruth_instance_masks + not in groundtruth_dict): + raise ValueError( + 'Instance masks not in groundtruth dictionary.') + groundtruth_masks = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_instance_masks] + self._evaluation.add_single_ground_truth_image_info( + image_key=image_id, + groundtruth_boxes=groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_boxes], + groundtruth_class_labels=groundtruth_classes, + groundtruth_is_difficult_list=groundtruth_difficult, + groundtruth_masks=groundtruth_masks, + ) + self._image_ids.update([image_id]) + + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary containing - + standard_fields.DetectionResultFields.detection_boxes: float32 + numpy array of shape [num_boxes, 4] containing `num_boxes` + detection boxes of the format [ymin, xmin, ymax, xmax] in + absolute image coordinates. + standard_fields.DetectionResultFields.detection_scores: float32 + numpy array of shape [num_boxes] containing detection + scores for the boxes. + standard_fields.DetectionResultFields.detection_classes: + integer numpy array of shape [num_boxes] containing + 1-indexed detection classes for the boxes. + standard_fields.DetectionResultFields.detection_masks: uint8 + numpy array of shape [num_boxes, height, width] containing + `num_boxes` masks of values ranging between 0 and 1. + + Raises: + ValueError: If detection masks are not in detections dictionary. + """ + detection_classes = ( + detections_dict[ + standard_fields.DetectionResultFields.detection_classes] - + self._label_id_offset) + detection_masks = None + if self._evaluate_masks: + if (standard_fields.DetectionResultFields.detection_masks + not in detections_dict): + raise ValueError( + 'Detection masks not in detections dictionary.') + detection_masks = detections_dict[ + standard_fields.DetectionResultFields.detection_masks] + self._evaluation.add_single_detected_image_info( + image_key=image_id, + detected_boxes=detections_dict[ + standard_fields.DetectionResultFields.detection_boxes], + detected_scores=detections_dict[ + standard_fields.DetectionResultFields.detection_scores], + detected_class_labels=detection_classes, + detected_masks=detection_masks, + ) + + def create_category_index(self, categories): + """Creates dictionary of COCO compatible categories keyed by category + id. + + Args: + categories: a list of dicts, each of which has the following keys: + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name + e.g., 'cat', 'dog', 'pizza'. + + Returns: + category_index: a dict containing the same entries as categories, + but keyed by the 'id' field of each category. 
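+
+        Example (illustrative): [{'id': 1, 'name': 'cat'}] is mapped to
+        {1: {'id': 1, 'name': 'cat'}}.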
+ """ + category_index = {} + for cat in categories: + category_index[cat['id']] = cat + return category_index + + def evaluate(self): + """Compute evaluation result. + + Returns: + A dictionary of metrics with the following fields - + + 1. summary_metrics: + 'Precision/mAP@IOU': mean average + precision at the specified IOU threshold + + 2. per_category_ap: category specific results with keys of the form + 'PerformanceByCategory/mAP@IOU/category' + """ + ( + per_class_ap, + mean_ap, + _, + _, + per_class_corloc, + mean_corloc, + ) = self._evaluation.evaluate() + + metric = f'mAP@{self._matching_iou_threshold}IOU' + pascal_metrics = {self._metric_prefix + metric: mean_ap} + if self._evaluate_corlocs: + pascal_metrics[self._metric_prefix + + 'Precision/meanCorLoc@{}IOU'.format( + self._matching_iou_threshold)] = mean_corloc + category_index = self.create_category_index(self._categories) + for idx in range(per_class_ap.size): + if idx + self._label_id_offset in category_index: + display_name = ( + self._metric_prefix + + 'PerformanceByCategory/AP@{}IOU/{}'.format( + self._matching_iou_threshold, + category_index[idx + self._label_id_offset]['name'], + )) + pascal_metrics[display_name] = per_class_ap[idx] + + # Optionally add CorLoc metrics.classes + if self._evaluate_corlocs: #False + display_name = ( + self._metric_prefix + + 'PerformanceByCategory/CorLoc@{}IOU/{}'.format( + self._matching_iou_threshold, + category_index[idx + + self._label_id_offset]['name'], + )) + pascal_metrics[display_name] = per_class_corloc[idx] + + return pascal_metrics + + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + self._evaluation = ObjectDetectionEvaluation( + num_groundtruth_classes=self._num_classes, + matching_iou_threshold=self._matching_iou_threshold, + use_weighted_mean_ap=self._use_weighted_mean_ap, + label_id_offset=self._label_id_offset, + ) + self._image_ids.clear() + + +class PascalDetectionEvaluator(ObjectDetectionEvaluator): + """A class to evaluate detections using PASCAL metrics.""" + + def __init__(self, categories, matching_iou_threshold=0.5): + super(PascalDetectionEvaluator, self).__init__( + categories, + matching_iou_threshold=matching_iou_threshold, + evaluate_corlocs=False, + use_weighted_mean_ap=False, + ) + + +ObjectDetectionEvalMetrics = collections.namedtuple( + 'ObjectDetectionEvalMetrics', + [ + 'average_precisions', + 'mean_ap', + 'precisions', + 'recalls', + 'corlocs', + 'mean_corloc', + ], +) + + +class ObjectDetectionEvaluation: + """Internal implementation of Pascal object detection metrics.""" + + def __init__( + self, + num_groundtruth_classes, + matching_iou_threshold=0.5, + nms_iou_threshold=1.0, + nms_max_output_boxes=10000, + use_weighted_mean_ap=False, + label_id_offset=0, + ): + if num_groundtruth_classes < 1: + raise ValueError( + 'Need at least 1 groundtruth class for evaluation.') + + self.per_image_eval = per_image_evaluation.PerImageEvaluation( + num_groundtruth_classes=num_groundtruth_classes, + matching_iou_threshold=matching_iou_threshold, + ) + self.num_class = num_groundtruth_classes + self.use_weighted_mean_ap = use_weighted_mean_ap + self.label_id_offset = label_id_offset + + self.groundtruth_boxes = {} + self.groundtruth_class_labels = {} + self.groundtruth_masks = {} + self.groundtruth_is_difficult_list = {} + self.groundtruth_is_group_of_list = {} + self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int) + self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int) + + self._initialize_detections() + + 
def _initialize_detections(self): + self.detection_keys = set() + self.scores_per_class = [[] for _ in range(self.num_class)] + self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)] + self.num_images_correctly_detected_per_class = np.zeros(self.num_class) + self.average_precision_per_class = np.empty( + self.num_class, dtype=float) + self.average_precision_per_class.fill(np.nan) + self.precisions_per_class = [] + self.recalls_per_class = [] + self.corloc_per_class = np.ones(self.num_class, dtype=float) + + def clear_detections(self): + self._initialize_detections() + + def add_single_ground_truth_image_info( + self, + image_key, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list=None, + groundtruth_is_group_of_list=None, + groundtruth_masks=None, + ): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_key: A unique string/integer identifier for the image. + groundtruth_boxes: float32 numpy array of shape [num_boxes, 4] + containing `num_boxes` groundtruth boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + groundtruth_class_labels: integer numpy array of shape [num_boxes] + containing 0-indexed groundtruth classes for the boxes. + groundtruth_is_difficult_list: A length M numpy boolean array + denoting whether a ground truth box is a difficult instance or + not. To support the case that no boxes are difficult, it is by + default set as None. + groundtruth_is_group_of_list: A length M numpy boolean array + denoting whether a ground truth box is a group-of box or not. + To support the case that no boxes are groups-of, it is by + default set as None. + groundtruth_masks: uint8 numpy array of shape + [num_boxes, height, width] containing `num_boxes` groundtruth + masks. The mask values range from 0 to 1. + """ + if image_key in self.groundtruth_boxes: + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) + return + + self.groundtruth_boxes[image_key] = groundtruth_boxes + self.groundtruth_class_labels[image_key] = groundtruth_class_labels + self.groundtruth_masks[image_key] = groundtruth_masks + if groundtruth_is_difficult_list is None: + num_boxes = groundtruth_boxes.shape[0] + groundtruth_is_difficult_list = np.zeros(num_boxes, dtype=bool) + self.groundtruth_is_difficult_list[ + image_key] = groundtruth_is_difficult_list.astype(dtype=bool) + if groundtruth_is_group_of_list is None: + num_boxes = groundtruth_boxes.shape[0] + groundtruth_is_group_of_list = np.zeros(num_boxes, dtype=bool) + self.groundtruth_is_group_of_list[ + image_key] = groundtruth_is_group_of_list.astype(dtype=bool) + + self._update_ground_truth_statistics( + groundtruth_class_labels, + groundtruth_is_difficult_list.astype(dtype=bool), + groundtruth_is_group_of_list.astype(dtype=bool), + ) + + def add_single_detected_image_info( + self, + image_key, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): + """Adds detections for a single image to be used for evaluation. + + Args: + image_key: A unique string/integer identifier for the image. + detected_boxes: float32 numpy array of shape [num_boxes, 4] + containing `num_boxes` detection boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + detected_scores: float32 numpy array of shape [num_boxes] + containing detection scores for the boxes. + detected_class_labels: integer numpy array of shape [num_boxes] + containing 0-indexed detection classes for the boxes. 
+ detected_masks: np.uint8 numpy array of shape + [num_boxes, height, width] containing `num_boxes` detection + masks with values ranging between 0 and 1. + + Raises: + ValueError: if the number of boxes, scores and class labels differ + in length. + """ + if len(detected_boxes) != len(detected_scores) or len( + detected_boxes) != len(detected_class_labels): + raise ValueError( + 'detected_boxes, detected_scores and ' + 'detected_class_labels should all have same lengths. Got' + '[%d, %d, %d]' % len(detected_boxes), + len(detected_scores), + len(detected_class_labels), + ) + + if image_key in self.detection_keys: + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) + return + + self.detection_keys.add(image_key) + if image_key in self.groundtruth_boxes: + groundtruth_boxes = self.groundtruth_boxes[image_key] + groundtruth_class_labels = self.groundtruth_class_labels[image_key] + # Masks are popped instead of look up. The reason is that we do not + # want to keep all masks in memory which can cause memory overflow. + groundtruth_masks = self.groundtruth_masks.pop(image_key) + groundtruth_is_difficult_list = self.groundtruth_is_difficult_list[ + image_key] + groundtruth_is_group_of_list = self.groundtruth_is_group_of_list[ + image_key] + else: + groundtruth_boxes = np.empty(shape=[0, 4], dtype=float) + groundtruth_class_labels = np.array([], dtype=int) + if detected_masks is None: + groundtruth_masks = None + else: + groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float) + groundtruth_is_difficult_list = np.array([], dtype=bool) + groundtruth_is_group_of_list = np.array([], dtype=bool) + ( + scores, + tp_fp_labels, + ) = self.per_image_eval.compute_object_detection_metrics( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + groundtruth_boxes=groundtruth_boxes, + groundtruth_class_labels=groundtruth_class_labels, + groundtruth_is_difficult_list=groundtruth_is_difficult_list, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + detected_masks=detected_masks, + groundtruth_masks=groundtruth_masks, + ) + + for i in range(self.num_class): + if scores[i].shape[0] > 0: + self.scores_per_class[i].append(scores[i]) + self.tp_fp_labels_per_class[i].append(tp_fp_labels[i]) + + def _update_ground_truth_statistics( + self, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + ): + """Update grouth truth statitistics. + + 1. Difficult boxes are ignored when counting the number of ground truth + instances as done in Pascal VOC devkit. + 2. Difficult boxes are treated as normal boxes when computing CorLoc + related statitistics. + + Args: + groundtruth_class_labels: An integer numpy array of length M, + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box is a group-of box or not + """ + for class_index in range(self.num_class): + num_gt_instances = np.sum(groundtruth_class_labels[ + ~groundtruth_is_difficult_list + & ~groundtruth_is_group_of_list] == class_index) + self.num_gt_instances_per_class[class_index] += num_gt_instances + if np.any(groundtruth_class_labels == class_index): + self.num_gt_imgs_per_class[class_index] += 1 + + def evaluate(self): + """Compute evaluation result. 
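+
+        Per-class average precision is computed from the accumulated
+        detection scores and true/false positive labels; unless
+        use_weighted_mean_ap is set, mAP is the nan-aware mean of the
+        per-class APs.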
+ + Returns: + A named tuple with the following fields - + average_precision: float numpy array of average precision for + each class. + mean_ap: mean average precision of all classes, float scalar + precisions: List of precisions, each precision is a float numpy + array + recalls: List of recalls, each recall is a float numpy array + corloc: numpy float array + mean_corloc: Mean CorLoc score for each class, float scalar + """ + if (self.num_gt_instances_per_class == 0).any(): + print( + 'The following classes have no ground truth examples: %s', + np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + + self.label_id_offset, "self.detection_keys:",self.detection_keys + ) + + if self.use_weighted_mean_ap: + all_scores = np.array([], dtype=float) + all_tp_fp_labels = np.array([], dtype=bool) + + for class_index in range(self.num_class): + if self.num_gt_instances_per_class[class_index] == 0: + continue + + if not self.scores_per_class[class_index]: + scores = np.array([], dtype=float) + tp_fp_labels = np.array([], dtype=bool) + else: + scores = np.concatenate(self.scores_per_class[class_index]) + tp_fp_labels = np.concatenate( + self.tp_fp_labels_per_class[class_index]) + if self.use_weighted_mean_ap: + all_scores = np.append(all_scores, scores) + all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) + precision, recall = metrics.compute_precision_recall( + scores, + tp_fp_labels, + self.num_gt_instances_per_class[class_index], + ) + self.precisions_per_class.append(precision) + self.recalls_per_class.append(recall) + average_precision = metrics.compute_average_precision( + precision, recall) + self.average_precision_per_class[class_index] = average_precision + + self.corloc_per_class = metrics.compute_cor_loc( + self.num_gt_imgs_per_class, + self.num_images_correctly_detected_per_class, + ) + + if self.use_weighted_mean_ap: + num_gt_instances = np.sum(self.num_gt_instances_per_class) + precision, recall = metrics.compute_precision_recall( + all_scores, all_tp_fp_labels, num_gt_instances) + mean_ap = metrics.compute_average_precision(precision, recall) + else: + mean_ap = np.nanmean(self.average_precision_per_class) + mean_corloc = np.nanmean(self.corloc_per_class) + return ObjectDetectionEvalMetrics( + self.average_precision_per_class, + mean_ap, + self.precisions_per_class, + self.recalls_per_class, + self.corloc_per_class, + mean_corloc, + ) diff --git a/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py b/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py new file mode 100644 index 000000000..3013ae7ce --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py @@ -0,0 +1,452 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Evaluate Object Detection result on a single image. 
+ +Annotate each detected result as true positives or false positive according to +a predefined IOU ratio. Non Maximum Supression is used by default. Multi class +detection is supported by default. Based on the settings, per image evaluation +is either performed on boxes or on object masks. +""" + +import numpy as np + +from . import np_box_list, np_box_ops + + +class PerImageEvaluation: + """Evaluate detection result of a single image.""" + + def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5): + """Initialized PerImageEvaluation by evaluation parameters. + + Args: + num_groundtruth_classes: Number of ground truth object classes + matching_iou_threshold: A ratio of area intersection to union, + which is the threshold to consider whether a detection is true + positive or not + """ + self.matching_iou_threshold = matching_iou_threshold + self.num_groundtruth_classes = num_groundtruth_classes + + def compute_object_detection_metrics( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Evaluates detections as being tp, fp or ignored from a single image. + + The evaluation is done in two stages: + 1. All detections are matched to non group-of boxes; true positives + are determined and detections matched to difficult boxes are + ignored. + 2. Detections that are determined as false positives are matched + against group-of boxes and ignored if matched. + + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. + Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing + the confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], + repreneting the class labels of the detected N object + instances. + groundtruth_boxes: A float numpy array of shape [M, 4], + representing M regions of object instances in ground truth + groundtruth_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag + detected_masks: (optional) A uint8 numpy array of shape + [N, height, width]. If not None, the metrics will be computed + based on masks. + groundtruth_masks: (optional) A uint8 numpy array of shape + [M, height, width]. + + Returns: + scores: A list of C float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class + label c + tp_fp_labels: A list of C boolean numpy arrays. 
Each numpy array + is of shape [K, 1], representing K True/False positive label of + object instances detected with class label c + """ + ( + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ) = self._remove_invalid_boxes( + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ) + scores, tp_fp_labels = self._compute_tp_fp( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + groundtruth_boxes=groundtruth_boxes, + groundtruth_class_labels=groundtruth_class_labels, + groundtruth_is_difficult_list=groundtruth_is_difficult_list, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + detected_masks=detected_masks, + groundtruth_masks=groundtruth_masks, + ) + + return scores, tp_fp_labels + + def _compute_tp_fp( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Labels true/false positives of detections of an image across all + classes. + + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. + Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing + the confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], + repreneting the class labels of the detected N object + instances. + groundtruth_boxes: A float numpy array of shape [M, 4], + representing M regions of object instances in ground truth + groundtruth_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag + detected_masks: (optional) A np.uint8 numpy array of shape + [N, height, width]. If not None, the scores will be computed + based on masks. + groundtruth_masks: (optional) A np.uint8 numpy array of shape + [M, height, width]. + + Returns: + result_scores: A list of float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class + label c + result_tp_fp_labels: A list of boolean numpy array. Each numpy + array is of shape [K, 1], representing K True/False positive + label of object instances detected with class label c + + Raises: + ValueError: If detected masks is not None but groundtruth masks are + None, or the other way around. 
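+
+        Note: detections and groundtruth are first split per class (via
+        _get_ith_class_arrays); each class is then labeled independently by
+        _compute_tp_fp_for_single_class.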
+ """ + if detected_masks is not None and groundtruth_masks is None: + raise ValueError( + 'Detected masks is available but groundtruth masks is not.') + if detected_masks is None and groundtruth_masks is not None: + raise ValueError( + 'Groundtruth masks is available but detected masks is not.') + + result_scores = [] + result_tp_fp_labels = [] + for i in range(self.num_groundtruth_classes): + groundtruth_is_difficult_list_at_ith_class = ( + groundtruth_is_difficult_list[groundtruth_class_labels == i]) + groundtruth_is_group_of_list_at_ith_class = ( + groundtruth_is_group_of_list[groundtruth_class_labels == i]) + ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) = self._get_ith_class_arrays(detected_boxes, detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, i) + scores, tp_fp_labels = self._compute_tp_fp_for_single_class( + detected_boxes=detected_boxes_at_ith_class, + detected_scores=detected_scores_at_ith_class, + groundtruth_boxes=gt_boxes_at_ith_class, + groundtruth_is_difficult_list=( + groundtruth_is_difficult_list_at_ith_class), + groundtruth_is_group_of_list=( + groundtruth_is_group_of_list_at_ith_class), + detected_masks=detected_masks_at_ith_class, + groundtruth_masks=gt_masks_at_ith_class, + ) + result_scores.append(scores) + result_tp_fp_labels.append(tp_fp_labels) + return result_scores, result_tp_fp_labels + + def _get_overlaps_and_scores_box_mode( + self, + detected_boxes, + detected_scores, + groundtruth_boxes, + groundtruth_is_group_of_list, + ): + """Computes overlaps and scores between detected and groudntruth boxes. + + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected + box coordinates + detected_scores: A 1-d numpy array of length N representing + classification score + groundtruth_boxes: A numpy array of shape [M, 4] representing + ground truth box coordinates + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag. If a + groundtruth box is group-of box, every detection matching this + box is ignored. + + Returns: + iou: A float numpy array of size [num_detected_boxes, + num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it + will be None. + ioa: A float numpy array of size [num_detected_boxes, + num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will + be None. + scores: The score of the detected boxlist. + num_boxes: Number of non-maximum suppressed detected boxes. + """ + detected_boxlist = np_box_list.BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + gt_non_group_of_boxlist = np_box_list.BoxList( + groundtruth_boxes[~groundtruth_is_group_of_list]) + + iou = np_box_ops.iou(detected_boxlist.get(), + gt_non_group_of_boxlist.get()) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou, None, scores, num_boxes + + def _compute_tp_fp_for_single_class( + self, + detected_boxes, + detected_scores, + groundtruth_boxes, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Labels boxes detected with the same class from the same image as + tp/fp. 
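+
+        A detection is a true positive when its best-overlapping groundtruth
+        box reaches matching_iou_threshold and that box has not already been
+        claimed by a previous detection; a detection whose best match above
+        the threshold is a difficult box is ignored rather than counted as a
+        false positive.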
+ + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected + box coordinates + detected_scores: A 1-d numpy array of length N representing + classification score + groundtruth_boxes: A numpy array of shape [M, 4] representing + groundtruth box coordinates + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not. If a groundtruth box is difficult, every detection + matching this box is ignored. + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag. If a + groundtruth box is group-of box, every detection matching this + box is ignored. + detected_masks: (optional) A uint8 numpy array of shape + [N, height, width]. If not None, the scores will be computed + based on masks. + groundtruth_masks: (optional) A uint8 numpy array of shape + [M, height, width]. + + Returns: + Two arrays of the same size, containing all boxes that were + evaluated as being true positives or false positives; if a box + matched to a difficult box or to a group-of box, it is ignored. + + scores: A numpy array representing the detection scores. + tp_fp_labels: a boolean numpy array indicating whether a detection + is a true positive. + """ + if detected_boxes.size == 0: + return np.array([], dtype=float), np.array([], dtype=bool) + + ( + iou, + _, + scores, + num_detected_boxes, + ) = self._get_overlaps_and_scores_box_mode( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + groundtruth_boxes=groundtruth_boxes, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + ) + + if groundtruth_boxes.size == 0: + return scores, np.zeros(num_detected_boxes, dtype=bool) + + tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_difficult_box = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_group_of_box = np.zeros(num_detected_boxes, dtype=bool) + + # The evaluation is done in two stages: + # 1. All detections are matched to non group-of boxes; true positives + # are determined and detections matched to difficult boxes are + # ignored. + # 2. Detections that are determined as false positives are matched + # against group-of boxes and ignored if matched. + + # Tp-fp evaluation for non-group of boxes (if any). + if iou.shape[1] > 0: + groundtruth_nongroup_of_is_difficult_list = ( + groundtruth_is_difficult_list[~groundtruth_is_group_of_list]) + max_overlap_gt_ids = np.argmax(iou, axis=1) + is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) + for i in range(num_detected_boxes): + gt_id = max_overlap_gt_ids[i] + if iou[i, gt_id] >= self.matching_iou_threshold: + if not groundtruth_nongroup_of_is_difficult_list[gt_id]: + if not is_gt_box_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_box_detected[gt_id] = True + else: + is_matched_to_difficult_box[i] = True + + return ( + scores[~is_matched_to_difficult_box & ~is_matched_to_group_of_box], + tp_fp_labels[~is_matched_to_difficult_box + & ~is_matched_to_group_of_box], + ) + + def _get_ith_class_arrays( + self, + detected_boxes, + detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, + class_index, + ): + """Returns numpy arrays belonging to class with index `class_index`. + + Args: + detected_boxes: A numpy array containing detected boxes. + detected_scores: A numpy array containing detected scores. + detected_masks: A numpy array containing detected masks. 
+ detected_class_labels: A numpy array containing detected class + labels. + groundtruth_boxes: A numpy array containing groundtruth boxes. + groundtruth_masks: A numpy array containing groundtruth masks. + groundtruth_class_labels: A numpy array containing groundtruth + class labels. + class_index: An integer index. + + Returns: + gt_boxes_at_ith_class: A numpy array containing groundtruth boxes + labeled as ith class. + gt_masks_at_ith_class: A numpy array containing groundtruth masks + labeled as ith class. + detected_boxes_at_ith_class: A numpy array containing detected + boxes corresponding to the ith class. + detected_scores_at_ith_class: A numpy array containing detected + scores corresponding to the ith class. + detected_masks_at_ith_class: A numpy array containing detected + masks corresponding to the ith class. + """ + selected_groundtruth = groundtruth_class_labels == class_index + gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth] + if groundtruth_masks is not None: + gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth] + else: + gt_masks_at_ith_class = None + selected_detections = detected_class_labels == class_index + detected_boxes_at_ith_class = detected_boxes[selected_detections] + detected_scores_at_ith_class = detected_scores[selected_detections] + if detected_masks is not None: + detected_masks_at_ith_class = detected_masks[selected_detections] + else: + detected_masks_at_ith_class = None + return ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) + + def _remove_invalid_boxes( + self, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): + """Removes entries with invalid boxes. + + A box is invalid if either its xmax is smaller than its xmin, or its + ymax is smaller than its ymin. + + Args: + detected_boxes: A float numpy array of size [num_boxes, 4] + containing box coordinates in [ymin, xmin, ymax, xmax] format. + detected_scores: A float numpy array of size [num_boxes]. + detected_class_labels: A int32 numpy array of size [num_boxes]. + detected_masks: A uint8 numpy array of size + [num_boxes, height, width]. + + Returns: + valid_detected_boxes: A float numpy array of size + [num_valid_boxes, 4] containing box coordinates in + [ymin, xmin, ymax, xmax] format. + valid_detected_scores: A float numpy array of size + [num_valid_boxes]. + valid_detected_class_labels: A int32 numpy array of size + [num_valid_boxes]. + valid_detected_masks: A uint8 numpy array of size + [num_valid_boxes, height, width]. + """ + valid_indices = np.logical_and( + detected_boxes[:, 0] < detected_boxes[:, 2], + detected_boxes[:, 1] < detected_boxes[:, 3], + ) + detected_boxes = detected_boxes[valid_indices] + detected_scores = detected_scores[valid_indices] + detected_class_labels = detected_class_labels[valid_indices] + if detected_masks is not None: + detected_masks = detected_masks[valid_indices] + return [ + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ] diff --git a/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py b/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py new file mode 100644 index 000000000..8edf46d08 --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_evaluation/standard_fields.py @@ -0,0 +1,115 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Contains classes specifying naming conventions used for object detection. + +Specifies: + InputDataFields: standard fields used by reader/preprocessor/batcher. + DetectionResultFields: standard fields returned by object detector. +""" + + +class InputDataFields: + """Names for the input tensors. + + Holds the standard data field names to use for identifying input tensors. + This should be used by the decoder to identify keys for the returned + tensor_dict containing input tensors. And it should be used by the model to + identify the tensors it needs. + + Attributes: + image: image. + original_image: image in the original input size. + key: unique key corresponding to image. + source_id: source of the original image. + filename: original filename of the dataset (without common path). + groundtruth_image_classes: image-level class labels. + groundtruth_boxes: coordinates of the ground truth boxes in the image. + groundtruth_classes: box-level class labels. + groundtruth_label_types: box-level label types (e.g. explicit + negative). + groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] + is the groundtruth a single object or a crowd. + groundtruth_area: area of a groundtruth segment. + groundtruth_difficult: is a `difficult` object + groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of + the same class, forming a connected group, where instances are + heavily occluding each other. + proposal_boxes: coordinates of object proposal boxes. + proposal_objectness: objectness score of each proposal. + groundtruth_instance_masks: ground truth instance masks. + groundtruth_instance_boundaries: ground truth instance boundaries. + groundtruth_instance_classes: instance mask-level class labels. + groundtruth_keypoints: ground truth keypoints. + groundtruth_keypoint_visibilities: ground truth keypoint visibilities. + groundtruth_label_scores: groundtruth label scores. + groundtruth_weights: groundtruth weight factor for bounding boxes. + num_groundtruth_boxes: number of groundtruth boxes. + true_image_shapes: true shapes of images in the resized images, as + resized images can be padded with zeros. 
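+
+    These attributes are plain string constants used as dictionary keys,
+    e.g. groundtruth_dict[InputDataFields.groundtruth_boxes] holds the
+    [num_boxes, 4] groundtruth box array for one image.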
+ """ + + image = 'image' + original_image = 'original_image' + key = 'key' + source_id = 'source_id' + filename = 'filename' + groundtruth_image_classes = 'groundtruth_image_classes' + groundtruth_boxes = 'groundtruth_boxes' + groundtruth_classes = 'groundtruth_classes' + groundtruth_label_types = 'groundtruth_label_types' + groundtruth_is_crowd = 'groundtruth_is_crowd' + groundtruth_area = 'groundtruth_area' + groundtruth_difficult = 'groundtruth_difficult' + groundtruth_group_of = 'groundtruth_group_of' + proposal_boxes = 'proposal_boxes' + proposal_objectness = 'proposal_objectness' + groundtruth_instance_masks = 'groundtruth_instance_masks' + groundtruth_instance_boundaries = 'groundtruth_instance_boundaries' + groundtruth_instance_classes = 'groundtruth_instance_classes' + groundtruth_keypoints = 'groundtruth_keypoints' + groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities' + groundtruth_label_scores = 'groundtruth_label_scores' + groundtruth_weights = 'groundtruth_weights' + num_groundtruth_boxes = 'num_groundtruth_boxes' + true_image_shape = 'true_image_shape' + + +class DetectionResultFields: + """Naming conventions for storing the output of the detector. + + Attributes: + source_id: source of the original image. + key: unique key corresponding to image. + detection_boxes: coordinates of the detection boxes in the image. + detection_scores: detection scores for the detection boxes in the + image. + detection_classes: detection-level class labels. + detection_masks: contains a segmentation mask for each detection box. + detection_boundaries: contains an object boundary for each detection + box. + detection_keypoints: contains detection keypoints for each detection + box. + num_detections: number of detections in the batch. + """ + + source_id = 'source_id' + key = 'key' + detection_boxes = 'detection_boxes' + detection_scores = 'detection_scores' + detection_classes = 'detection_classes' + detection_masks = 'detection_masks' + detection_boundaries = 'detection_boundaries' + detection_keypoints = 'detection_keypoints' + num_detections = 'num_detections' diff --git a/docs/src/paddlevideo/metrics/ava_metric.py b/docs/src/paddlevideo/metrics/ava_metric.py new file mode 100644 index 000000000..b17c8c82d --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_metric.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +from collections import OrderedDict +from paddlevideo.utils import get_logger, load, log_batch, AverageMeter +from .registry import METRIC +from .base import BaseMetric +import time +from datetime import datetime +from .ava_utils import ava_evaluate_results + +logger = get_logger("paddlevideo") +""" An example for metrics class. + MultiCropMetric for slowfast. 
+""" + + +@METRIC.register +class AVAMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + file_path, + exclude_file, + label_file, + custom_classes, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + + self.file_path = file_path + self.exclude_file = exclude_file + self.label_file = label_file + self.custom_classes = custom_classes + + self.results = [] + + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f')), + ("prec@thr=0.5", AverageMeter("prec@thr=0.5", '.5f')), + ("recall@top3", AverageMeter("recall@top3", '.5f')), + ("prec@top3", AverageMeter("prec@top3", '.5f')), + ("recall@top5", AverageMeter("recall@top5", '.5f')), + ("prec@top5", AverageMeter("prec@top5", '.5f')), + ("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f')), + ("batch_time", AverageMeter('batch_cost', '.5f')), + ("reader_time", AverageMeter('reader_cost', '.5f')), + ] + + self.record_list = OrderedDict(record_list) + + self.tic = time.time() + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + + self.results.extend(outputs) + self.record_list['batch_time'].update(time.time() - self.tic) + tic = time.time() + ips = "ips: {:.5f} instance/sec.".format( + self.batch_size / self.record_list["batch_time"].val) + log_batch(self.record_list, batch_id, 0, 0, "test", ips) + + def set_dataset_info(self, info, dataset_len): + self.info = info + self.dataset_len = dataset_len + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + test_res = ava_evaluate_results(self.info, self.dataset_len, + self.results, None, self.label_file, + self.file_path, self.exclude_file) + + for name, value in test_res.items(): + self.record_list[name].update(value, self.batch_size) + + return self.record_list diff --git a/docs/src/paddlevideo/metrics/ava_utils.py b/docs/src/paddlevideo/metrics/ava_utils.py new file mode 100644 index 000000000..b127267ed --- /dev/null +++ b/docs/src/paddlevideo/metrics/ava_utils.py @@ -0,0 +1,394 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import heapq +import logging +import time +from collections import defaultdict +from .ava_evaluation import object_detection_evaluation as det_eval +from .ava_evaluation import standard_fields +from .recall import eval_recalls +import shutil +import pickle +import time +import os +import os.path as osp +from paddlevideo.utils import get_logger, get_dist_info +import paddle.distributed as dist +import sys +import numpy as np +from pathlib import Path +from datetime import datetime +import paddle + + +def det2csv(info, dataset_len, results, custom_classes): + csv_results = [] + for idx in range(dataset_len): + video_id = info[idx]['video_id'] + timestamp = info[idx]['timestamp'] + + result = results[idx] + for label, _ in enumerate(result): + for bbox in result[label]: + if type(bbox) == paddle.Tensor: + bbox = bbox.numpy() + + bbox_ = tuple(bbox.tolist()) + if custom_classes is not None: + actual_label = custom_classes[label + 1] + else: + actual_label = label + 1 + csv_results.append(( + video_id, + timestamp, + ) + bbox_[:4] + (actual_label, ) + bbox_[4:]) + return csv_results + + +# results is organized by class +def results2csv(info, dataset_len, results, out_file, custom_classes=None): + if isinstance(results[0], list): + csv_results = det2csv(info, dataset_len, results, custom_classes) + + # save space for float + def tostr(item): + if isinstance(item, float): + return f'{item:.3f}' + return str(item) + + with open(out_file, 'w') as f: + for csv_result in csv_results: + f.write(','.join(map(lambda x: tostr(x), csv_result))) + f.write('\n') + + +def print_time(message, start): + print('==> %g seconds to %s' % (time.time() - start, message)) + + +def make_image_key(video_id, timestamp): + """Returns a unique identifier for a video id & timestamp.""" + return f'{video_id},{int(timestamp):04d}' + + +def read_csv(csv_file, class_whitelist=None, capacity=0): + """Loads boxes and class labels from a CSV file in the AVA format. + + CSV file format described at https://research.google.com/ava/download.html. + + Args: + csv_file: A file object. + class_whitelist: If provided, boxes corresponding to (integer) class + labels not in this set are skipped. + capacity: Maximum number of labeled boxes allowed for each example. + Default is 0 where there is no limit. + + Returns: + boxes: A dictionary mapping each unique image key (string) to a list of + boxes, given as coordinates [y1, x1, y2, x2]. + labels: A dictionary mapping each unique image key (string) to a list + of integer class lables, matching the corresponding box in `boxes`. + scores: A dictionary mapping each unique image key (string) to a list + of score values lables, matching the corresponding label in `labels`. + If scores are not provided in the csv, then they will default to 1.0. 
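+
+    Example (illustrative row; columns are
+    video_id,timestamp,x1,y1,x2,y2,action_id[,score]):
+        vid_0001,0902,0.077,0.151,0.283,0.811,80,0.995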
+ """ + start = time.time() + entries = defaultdict(list) + boxes = defaultdict(list) + labels = defaultdict(list) + scores = defaultdict(list) + reader = csv.reader(csv_file) + for row in reader: + assert len(row) in [7, 8], 'Wrong number of columns: ' + row + image_key = make_image_key(row[0], row[1]) + x1, y1, x2, y2 = [float(n) for n in row[2:6]] + action_id = int(row[6]) + if class_whitelist and action_id not in class_whitelist: + continue + + score = 1.0 + if len(row) == 8: + score = float(row[7]) + if capacity < 1 or len(entries[image_key]) < capacity: + heapq.heappush(entries[image_key], + (score, action_id, y1, x1, y2, x2)) + elif score > entries[image_key][0][0]: + heapq.heapreplace(entries[image_key], + (score, action_id, y1, x1, y2, x2)) + for image_key in entries: + # Evaluation API assumes boxes with descending scores + entry = sorted(entries[image_key], key=lambda tup: -tup[0]) + for item in entry: + score, action_id, y1, x1, y2, x2 = item + boxes[image_key].append([y1, x1, y2, x2]) + labels[image_key].append(action_id) + scores[image_key].append(score) + print_time('read file ' + csv_file.name, start) + return boxes, labels, scores + + +def read_exclusions(exclusions_file): + """Reads a CSV file of excluded timestamps. + + Args: + exclusions_file: A file object containing a csv of video-id,timestamp. + + Returns: + A set of strings containing excluded image keys, e.g. + "aaaaaaaaaaa,0904", + or an empty set if exclusions file is None. + """ + excluded = set() + if exclusions_file: + reader = csv.reader(exclusions_file) + for row in reader: + assert len(row) == 2, 'Expected only 2 columns, got: ' + row + excluded.add(make_image_key(row[0], row[1])) + return excluded + + +def read_labelmap(labelmap_file): + """Reads a labelmap without the dependency on protocol buffers. + + Args: + labelmap_file: A file object containing a label map protocol buffer. + + Returns: + labelmap: The label map in the form used by the + object_detection_evaluation + module - a list of {"id": integer, "name": classname } dicts. + class_ids: A set containing all of the valid class id integers. 
+ """ + labelmap = [] + class_ids = set() + name = '' + class_id = '' + for line in labelmap_file: + if line.startswith(' name:'): + name = line.split('"')[1] + elif line.startswith(' id:') or line.startswith(' label_id:'): + class_id = int(line.strip().split(' ')[-1]) + labelmap.append({'id': class_id, 'name': name}) + class_ids.add(class_id) + return labelmap, class_ids + + +# Seems there is at most 100 detections for each image +def ava_eval(result_file, + result_type, + label_file, + ann_file, + exclude_file, + max_dets=(100, ), + verbose=True, + custom_classes=None): + + assert result_type in ['mAP'] + start = time.time() + categories, class_whitelist = read_labelmap(open(label_file)) + + if custom_classes is not None: + custom_classes = custom_classes[1:] + assert set(custom_classes).issubset(set(class_whitelist)) + class_whitelist = custom_classes + categories = [cat for cat in categories if cat['id'] in custom_classes] + + # loading gt, do not need gt score + gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist, 0) + if verbose: + print_time('Reading detection results', start) + + if exclude_file is not None: + excluded_keys = read_exclusions(open(exclude_file)) + else: + excluded_keys = list() + + start = time.time() + boxes, labels, scores = read_csv(open(result_file), class_whitelist, 0) + if verbose: + print_time('Reading detection results', start) + + if result_type == 'proposal': + gts = [ + np.array(gt_boxes[image_key], dtype=float) for image_key in gt_boxes + ] + proposals = [] + for image_key in gt_boxes: + if image_key in boxes: + proposals.append( + np.concatenate( + (np.array(boxes[image_key], dtype=float), + np.array(scores[image_key], dtype=float)[:, None]), + axis=1)) + else: + # if no corresponding proposal, add a fake one + proposals.append(np.array([0, 0, 1, 1, 1])) + + # Proposals used here are with scores + recalls = eval_recalls(gts, proposals, np.array(max_dets), + np.arange(0.5, 0.96, 0.05)) + ar = recalls.mean(axis=1) + ret = {} + for i, num in enumerate(max_dets): + print(f'Recall@0.5@{num}\t={recalls[i, 0]:.4f}') + print(f'AR@{num}\t={ar[i]:.4f}') + ret[f'Recall@0.5@{num}'] = recalls[i, 0] + ret[f'AR@{num}'] = ar[i] + return ret + + if result_type == 'mAP': + pascal_evaluator = det_eval.PascalDetectionEvaluator(categories) + + start = time.time() + for image_key in gt_boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' + 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_ground_truth_image_info( + image_key, { + standard_fields.InputDataFields.groundtruth_boxes: + np.array(gt_boxes[image_key], dtype=float), + standard_fields.InputDataFields.groundtruth_classes: + np.array(gt_labels[image_key], dtype=int), + standard_fields.InputDataFields.groundtruth_difficult: + np.zeros(len(gt_boxes[image_key]), dtype=bool) + }) + if verbose: + print_time('Convert groundtruth', start) + + start = time.time() + for image_key in boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' 
+ 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_detected_image_info( + image_key, { + standard_fields.DetectionResultFields.detection_boxes: + np.array(boxes[image_key], dtype=float), + standard_fields.DetectionResultFields.detection_classes: + np.array(labels[image_key], dtype=int), + standard_fields.DetectionResultFields.detection_scores: + np.array(scores[image_key], dtype=float) + }) + if verbose: + print_time('convert detections', start) + + start = time.time() + metrics = pascal_evaluator.evaluate() + if verbose: + print_time('run_evaluator', start) + for display_name in metrics: + print(f'{display_name}=\t{metrics[display_name]}') + ret = { + display_name: metrics[display_name] + for display_name in metrics if 'ByCategory' not in display_name + } + return ret + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == '': + return + dir_name = osp.expanduser(dir_name) + os.makedirs(dir_name, mode=mode, exist_ok=True) + + +def dump_to_fileobj(obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + +def dump_to_path(obj, filepath, mode='wb'): + with open(filepath, mode) as f: + dump_to_fileobj(obj, f) + + +def load_from_fileobj(file, **kwargs): + return pickle.load(file, **kwargs) + + +def load_from_path(filepath, mode='rb'): + with open(filepath, mode) as f: + return load_from_fileobj(f) + + +def collect_results_cpu(result_part, size): + """Collect results in cpu mode. + It saves the results on different gpus to 'tmpdir' and collects + them by the rank 0 worker. + """ + tmpdir = osp.join('./', 'collect_results_cpu') + #1. load results of all parts from tmp dir + mkdir_or_exist(tmpdir) + rank, world_size = get_dist_info() + dump_to_path(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + if rank != 0: + return None + #2. collect all parts + while 1: + all_exist = True + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + if not Path(part_file).exists(): + all_exist = False + if all_exist: + break + else: + time.sleep(60) + time.sleep(120) + #3. load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(load_from_path(part_file)) + #4. sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + ordered_results = ordered_results[: + size] #the dataloader may pad some samples + #5. remove results of all parts from tmp dir, avoid dump_file fail to tmp dir when dir not exists. + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + os.remove(part_file) + + return ordered_results + + +def ava_evaluate_results(info, dataset_len, results, custom_classes, label_file, + file_path, exclude_file): + # need to create a temp result file + time_now = datetime.now().strftime('%Y%m%d_%H%M%S') + temp_file = f'AVA_{time_now}_result.csv' + results2csv(info, dataset_len, results, temp_file) + ret = {} + eval_result = ava_eval( + temp_file, + 'mAP', + label_file, + file_path, #ann_file, + exclude_file, + custom_classes=custom_classes) + ret.update(eval_result) + + os.remove(temp_file) + + return ret diff --git a/docs/src/paddlevideo/metrics/base.py b/docs/src/paddlevideo/metrics/base.py new file mode 100644 index 000000000..98422322b --- /dev/null +++ b/docs/src/paddlevideo/metrics/base.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod + +import paddle +from paddlevideo.utils import get_dist_info + +from .registry import METRIC + + +class BaseMetric(object): + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + self.data_size = data_size + self.batch_size = batch_size + _, self.world_size = get_dist_info() + self.log_interval = log_interval + + def gather_from_gpu(self, + gather_object: paddle.Tensor, + concat_axis=0) -> paddle.Tensor: + """gather Tensor from all gpus into a list and concatenate them on `concat_axis`. + + Args: + gather_object (paddle.Tensor): gather object Tensor + concat_axis (int, optional): axis for concatenation. Defaults to 0. + + Returns: + paddle.Tensor: gatherd & concatenated Tensor + """ + gather_object_list = [] + paddle.distributed.all_gather(gather_object_list, gather_object.cuda()) + return paddle.concat(gather_object_list, axis=concat_axis) + + @abstractmethod + def update(self): + raise NotImplementedError( + "'update' method must be implemented in subclass") + + @abstractmethod + def accumulate(self): + raise NotImplementedError( + "'accumulate' method must be implemented in subclass") diff --git a/docs/src/paddlevideo/metrics/bmn_metric.py b/docs/src/paddlevideo/metrics/bmn_metric.py new file mode 100644 index 000000000..cc36283f9 --- /dev/null +++ b/docs/src/paddlevideo/metrics/bmn_metric.py @@ -0,0 +1,304 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import json +import numpy as np +import pandas as pd +import multiprocessing as mp + +from .registry import METRIC +from .base import BaseMetric +from .ActivityNet import ANETproposal +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def iou_with_anchors(anchors_min, anchors_max, box_min, box_max): + """Compute jaccard score between a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) + union_len = len_anchors - inter_len + box_max - box_min + jaccard = np.divide(inter_len, union_len) + return jaccard + + +def boundary_choose(score_list): + """Choose start and end boundary from score. 
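+
+    A position is kept if its score exceeds half of the maximum score or if it
+    is a local peak (strictly greater than both neighbours); the returned mask
+    is a float32 array of 0/1 flags with the same length as score_list.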
+ """ + max_score = max(score_list) + mask_high = (score_list > max_score * 0.5) + score_list = list(score_list) + score_middle = np.array([0.0] + score_list + [0.0]) + score_front = np.array([0.0, 0.0] + score_list) + score_back = np.array(score_list + [0.0, 0.0]) + mask_peak = ((score_middle > score_front) & (score_middle > score_back)) + mask_peak = mask_peak[1:-1] + mask = (mask_high | mask_peak).astype('float32') + return mask + + +def soft_nms(df, alpha, t1, t2): + ''' + df: proposals generated by network; + alpha: alpha value of Gaussian decaying function; + t1, t2: threshold for soft nms. + ''' + df = df.sort_values(by="score", ascending=False) + tstart = list(df.xmin.values[:]) + tend = list(df.xmax.values[:]) + tscore = list(df.score.values[:]) + + rstart = [] + rend = [] + rscore = [] + + while len(tscore) > 1 and len(rscore) < 101: + max_index = tscore.index(max(tscore)) + tmp_iou_list = iou_with_anchors(np.array(tstart), np.array(tend), + tstart[max_index], tend[max_index]) + for idx in range(0, len(tscore)): + if idx != max_index: + tmp_iou = tmp_iou_list[idx] + tmp_width = tend[max_index] - tstart[max_index] + if tmp_iou > t1 + (t2 - t1) * tmp_width: + tscore[idx] = tscore[idx] * np.exp( + -np.square(tmp_iou) / alpha) + + rstart.append(tstart[max_index]) + rend.append(tend[max_index]) + rscore.append(tscore[max_index]) + tstart.pop(max_index) + tend.pop(max_index) + tscore.pop(max_index) + + newDf = pd.DataFrame() + newDf['score'] = rscore + newDf['xmin'] = rstart + newDf['xmax'] = rend + return newDf + + +@METRIC.register +class BMNMetric(BaseMetric): + """ + Metrics for BMN. Two Stages in this metric: + (1) Get test results using trained model, results will be saved in BMNMetric.result_path; + (2) Calculate metrics using results file from stage (1). + """ + + def __init__(self, + data_size, + batch_size, + tscale, + dscale, + file_path, + ground_truth_filename, + subset, + output_path, + result_path, + get_metrics=True, + log_interval=1): + """ + Init for BMN metrics. + Params: + get_metrics: whether to calculate AR@N and AUC metrics or not, default True. 
+ """ + super().__init__(data_size, batch_size, log_interval) + assert self.batch_size == 1, " Now we just support batch_size==1 test" + assert self.world_size == 1, " Now we just support single-card test" + + self.tscale = tscale + self.dscale = dscale + self.file_path = file_path + self.ground_truth_filename = ground_truth_filename + self.subset = subset + self.output_path = output_path + self.result_path = result_path + self.get_metrics = get_metrics + + if not os.path.isdir(self.output_path): + os.makedirs(self.output_path) + if not os.path.isdir(self.result_path): + os.makedirs(self.result_path) + + self.video_dict, self.video_list = self.get_dataset_dict( + self.file_path, self.subset) + + def get_dataset_dict(self, file_path, subset): + annos = json.load(open(file_path)) + video_dict = {} + for video_name in annos.keys(): + video_subset = annos[video_name]["subset"] + if subset in video_subset: + video_dict[video_name] = annos[video_name] + video_list = list(video_dict.keys()) + video_list.sort() + return video_dict, video_list + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + fid = data[4].numpy() + pred_bm, pred_start, pred_end = outputs + pred_bm = pred_bm.numpy() + pred_start = pred_start[0].numpy() + pred_end = pred_end[0].numpy() + + snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)] + snippet_xmaxs = [ + 1.0 / self.tscale * i for i in range(1, self.tscale + 1) + ] + cols = ["xmin", "xmax", "score"] + + video_name = self.video_list[fid[0]] + pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :] + start_mask = boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = boundary_choose(pred_end) + end_mask[-1] = 1. + score_vector_list = [] + for idx in range(self.dscale): + for jdx in range(self.tscale): + start_index = jdx + end_index = start_index + idx + if end_index < self.tscale and start_mask[ + start_index] == 1 and end_mask[end_index] == 1: + xmin = snippet_xmins[start_index] + xmax = snippet_xmaxs[end_index] + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bm_score = pred_bm[idx, jdx] + conf_score = xmin_score * xmax_score * bm_score + score_vector_list.append([xmin, xmax, conf_score]) + + score_vector_list = np.stack(score_vector_list) + video_df = pd.DataFrame(score_vector_list, columns=cols) + video_df.to_csv(os.path.join(self.output_path, "%s.csv" % video_name), + index=False) + + if batch_id % self.log_interval == 0: + logger.info("Processing................ batch {}".format(batch_id)) + + def accumulate(self): + """accumulate metrics when finished all iters. 
+ """ + # check clip index of each video + #Stage1 + self.bmn_post_processing(self.video_dict, self.subset, self.output_path, + self.result_path) + if self.get_metrics: + logger.info("[TEST] calculate metrics...") + #Stage2 + uniform_average_nr_proposals_valid, uniform_average_recall_valid, uniform_recall_valid = self.cal_metrics( + self.ground_truth_filename, + os.path.join(self.result_path, "bmn_results_validation.json"), + max_avg_nr_proposals=100, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + subset='validation') + logger.info("AR@1; AR@5; AR@10; AR@100") + logger.info("%.02f %.02f %.02f %.02f" % + (100 * np.mean(uniform_recall_valid[:, 0]), + 100 * np.mean(uniform_recall_valid[:, 4]), + 100 * np.mean(uniform_recall_valid[:, 9]), + 100 * np.mean(uniform_recall_valid[:, -1]))) + + def bmn_post_processing(self, video_dict, subset, output_path, result_path): + video_list = list(video_dict.keys()) + global result_dict + result_dict = mp.Manager().dict() + pp_num = 12 + + num_videos = len(video_list) + num_videos_per_thread = int(num_videos / pp_num) + processes = [] + for tid in range(pp_num - 1): + tmp_video_list = video_list[tid * num_videos_per_thread:(tid + 1) * + num_videos_per_thread] + p = mp.Process(target=self.video_process, + args=(tmp_video_list, video_dict, output_path, + result_dict)) + p.start() + processes.append(p) + tmp_video_list = video_list[(pp_num - 1) * num_videos_per_thread:] + p = mp.Process(target=self.video_process, + args=(tmp_video_list, video_dict, output_path, + result_dict)) + p.start() + processes.append(p) + for p in processes: + p.join() + + result_dict = dict(result_dict) + output_dict = { + "version": "VERSION 1.3", + "results": result_dict, + "external_data": {} + } + outfile = open( + os.path.join(result_path, "bmn_results_%s.json" % subset), "w") + + # json.dump(output_dict, outfile) + # in case of file name in chinese + json.dump(output_dict, outfile, ensure_ascii=False) + outfile.close() + + def video_process(self, + video_list, + video_dict, + output_path, + result_dict, + snms_alpha=0.4, + snms_t1=0.55, + snms_t2=0.9): + + for video_name in video_list: + logger.info("Processing video........" 
+ video_name) + df = pd.read_csv(os.path.join(output_path, video_name + ".csv")) + if len(df) > 1: + df = soft_nms(df, snms_alpha, snms_t1, snms_t2) + + video_duration = video_dict[video_name]["duration_second"] + proposal_list = [] + for idx in range(min(100, len(df))): + tmp_prop={"score":df.score.values[idx], \ + "segment":[max(0,df.xmin.values[idx])*video_duration, \ + min(1,df.xmax.values[idx])*video_duration]} + proposal_list.append(tmp_prop) + + video_name = video_name[2:] if video_name[:2] == 'v_' else video_name + result_dict[video_name] = proposal_list + + def cal_metrics(self, + ground_truth_filename, + proposal_filename, + max_avg_nr_proposals=100, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + subset='validation'): + + anet_proposal = ANETproposal(ground_truth_filename, + proposal_filename, + tiou_thresholds=tiou_thresholds, + max_avg_nr_proposals=max_avg_nr_proposals, + subset=subset, + verbose=True, + check_status=False) + anet_proposal.evaluate() + recall = anet_proposal.recall + average_recall = anet_proposal.avg_recall + average_nr_proposals = anet_proposal.proposals_per_video + + return (average_nr_proposals, average_recall, recall) diff --git a/docs/src/paddlevideo/metrics/build.py b/docs/src/paddlevideo/metrics/build.py new file mode 100644 index 000000000..82e4b5026 --- /dev/null +++ b/docs/src/paddlevideo/metrics/build.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import METRIC +from ..utils import build + + +def build_metric(cfg): + return build(cfg, METRIC) diff --git a/docs/src/paddlevideo/metrics/center_crop_metric.py b/docs/src/paddlevideo/metrics/center_crop_metric.py new file mode 100644 index 000000000..0ca6112f0 --- /dev/null +++ b/docs/src/paddlevideo/metrics/center_crop_metric.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
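+
+# Top-k accuracy for center-crop testing: outputs and labels are gathered from
+# all GPUs, padded samples are trimmed away, and accumulate() reports avg_acc@k.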
+ +from typing import List + +import paddle +from paddlevideo.utils import get_logger + +from .base import BaseMetric +from .registry import METRIC + +logger = get_logger("paddlevideo") + + +@METRIC.register +class CenterCropMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval, **kwargs) + self.rest_data_size = data_size # Number of samples remaining to be tested + self.all_outputs = [] + self.all_labels = [] + self.topk = kwargs.get("topk", [1, 5]) + + def update(self, batch_id: int, data: List, outputs: paddle.Tensor) -> None: + """update metrics during each iter + + Args: + batch_id (int): iter id of current batch. + data (List): list of batched data, such as [inputs, labels] + outputs (paddle.Tensor): batched outputs from model + """ + labels = data[1] + if self.world_size > 1: + labels_gathered = self.gather_from_gpu(labels, concat_axis=0) + outpus_gathered = self.gather_from_gpu(outputs, concat_axis=0) + else: + labels_gathered = labels + outpus_gathered = outputs + + # Avoid resampling effects when testing with multiple cards + labels_gathered = labels_gathered[0:min(len(labels_gathered), self. + rest_data_size)] + outpus_gathered = outpus_gathered[0:min(len(outpus_gathered), self. + rest_data_size)] + self.all_labels.append(labels_gathered) + self.all_outputs.append(outpus_gathered) + self.rest_data_size -= outpus_gathered.shape[0] + + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate, compute, and show metrics when finished all iters. + """ + self.all_outputs = paddle.concat(self.all_outputs, axis=0) + self.all_labels = paddle.concat(self.all_labels, axis=0) + + result_str = [] + for _k in self.topk: + topk_val = paddle.metric.accuracy(input=self.all_outputs, + label=self.all_labels, + k=_k).item() + result_str.append(f"avg_acc{_k}={topk_val}") + result_str = ", ".join(result_str) + logger.info(f"[TEST] finished, {result_str}") diff --git a/docs/src/paddlevideo/metrics/center_crop_metric_MRI.py b/docs/src/paddlevideo/metrics/center_crop_metric_MRI.py new file mode 100644 index 000000000..b6d231ac3 --- /dev/null +++ b/docs/src/paddlevideo/metrics/center_crop_metric_MRI.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
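+
+# MRI variant of the center-crop metric: reports top-1 accuracy only; the
+# if_slowfast flag selects which element of the batched data holds the labels.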
+ +import numpy as np +import paddle + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class CenterCropMetric_MRI(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1, if_slowfast=0): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.top1 = [] + self.if_slowfast = if_slowfast + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + labels = data[1] + + if self.if_slowfast: + labels = data[2] + + top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1) + #top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5) + #NOTE(shipping): deal with multi cards validate + if self.world_size > 1: + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size + # top5 = paddle.distributed.all_reduce( + # top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.top1.append(top1.numpy()) + #self.top5.append(top5.numpy()) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info('[TEST] finished, avg_acc1= {}'.format( + np.mean(np.array(self.top1)))) diff --git a/docs/src/paddlevideo/metrics/depth_metric.py b/docs/src/paddlevideo/metrics/depth_metric.py new file mode 100644 index 000000000..c160e16ba --- /dev/null +++ b/docs/src/paddlevideo/metrics/depth_metric.py @@ -0,0 +1,77 @@ +import numpy as np +import paddle +from paddlevideo.utils import get_logger + +from .base import BaseMetric +from .registry import METRIC + +logger = get_logger("paddlevideo") + + +@METRIC.register +class DepthMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.abs_rel = [] + self.sq_rel = [] + self.rmse = [] + self.rmse_log = [] + self.a1 = [] + self.a2 = [] + self.a3 = [] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = outputs['abs_rel'], outputs['sq_rel'], outputs['rmse'], \ + outputs['rmse_log'], outputs['a1'], outputs['a2'],outputs['a3'] + # preds ensemble + if self.world_size > 1: + abs_rel = paddle.distributed.all_reduce( + outputs['abs_rel'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + sq_rel = paddle.distributed.all_reduce( + outputs['sq_rel'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + rmse = paddle.distributed.all_reduce( + outputs['rmse'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + rmse_log = paddle.distributed.all_reduce( + outputs['rmse_log'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a1 = paddle.distributed.all_reduce( + outputs['a1'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a2 = paddle.distributed.all_reduce( + outputs['a2'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a3 = paddle.distributed.all_reduce( + outputs['a3'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.abs_rel.append(abs_rel) + self.sq_rel.append(sq_rel) + self.rmse.append(rmse) + self.rmse_log.append(rmse_log) + self.a1.append(a1) + self.a2.append(a2) + self.a3.append(a3) + if batch_id % self.log_interval == 0: + logger.info("[TEST] 
Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info( + '[TEST] finished, abs_rel= {}, sq_rel= {} , rmse= {}, rmse_log= {},' + 'a1= {}, a2= {}, a3= {}'.format(np.mean(np.array(self.abs_rel)), + np.mean(np.array(self.sq_rel)), + np.mean(np.array(self.rmse)), + np.mean(np.array(self.rmse_log)), + np.mean(np.array(self.a1)), + np.mean(np.array(self.a2)), + np.mean(np.array(self.a3)))) diff --git a/docs/src/paddlevideo/metrics/msrvtt_metric.py b/docs/src/paddlevideo/metrics/msrvtt_metric.py new file mode 100644 index 000000000..99e73343d --- /dev/null +++ b/docs/src/paddlevideo/metrics/msrvtt_metric.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +import paddle.nn.functional as F + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class MSRVTTMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.score_matrix = np.zeros((data_size, data_size)) + self.target_matrix = np.zeros((data_size, data_size)) + self.rank_matrix = np.ones((data_size)) * data_size + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + target = data[-1] + cm_logit = outputs[-1] + + self.score_matrix[batch_id, :] = F.softmax( + cm_logit, axis=1)[:, 0].reshape([-1]).numpy() + self.target_matrix[batch_id, :] = target.reshape([-1]).numpy() + + rank = np.where((np.argsort(-self.score_matrix[batch_id]) == np.where( + self.target_matrix[batch_id] == 1)[0][0]) == 1)[0][0] + self.rank_matrix[batch_id] = rank + + rank_matrix_tmp = self.rank_matrix[:batch_id + 1] + r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp) + r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp) + r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp) + + medr = np.floor(np.median(rank_matrix_tmp) + 1) + meanr = np.mean(rank_matrix_tmp) + 1 + logger.info( + "[{}] Final r1:{:.3f}, r5:{:.3f}, r10:{:.3f}, mder:{:.3f}, meanr:{:.3f}" + .format(batch_id, r1, r5, r10, medr, meanr)) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info("Eval Finished!") diff --git a/docs/src/paddlevideo/metrics/multi_crop_metric.py b/docs/src/paddlevideo/metrics/multi_crop_metric.py new file mode 100644 index 000000000..5f20ced89 --- /dev/null +++ b/docs/src/paddlevideo/metrics/multi_crop_metric.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +from paddle.hapi.model import _all_gather + +from paddlevideo.utils import get_logger +from .registry import METRIC +from .base import BaseMetric + +logger = get_logger("paddlevideo") +""" An example for metrics class. + MultiCropMetric for slowfast. +""" + + +@METRIC.register +class MultiCropMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + num_ensemble_views, + num_spatial_crops, + num_classes, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.num_ensemble_views = num_ensemble_views + self.num_spatial_crops = num_spatial_crops + self.num_classes = num_classes + + self.num_clips = self.num_ensemble_views * self.num_spatial_crops + num_videos = self.data_size // self.num_clips + self.video_preds = np.zeros((num_videos, self.num_classes)) + self.video_labels = np.zeros((num_videos, 1), dtype="int64") + self.clip_count = {} + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + labels = data[2] + clip_ids = data[3] + + # gather mulit card, results of following process in each card is the same. + if self.world_size > 1: + outputs = _all_gather(outputs, self.world_size) + labels = _all_gather(labels.cuda(), self.world_size) + clip_ids = _all_gather(clip_ids.cuda(), self.world_size) + + # to numpy + preds = outputs.numpy() + labels = labels.numpy().astype("int64") + clip_ids = clip_ids.numpy() + + # preds ensemble + for ind in range(preds.shape[0]): + vid_id = int(clip_ids[ind]) // self.num_clips + ts_idx = int(clip_ids[ind]) % self.num_clips + if vid_id not in self.clip_count: + self.clip_count[vid_id] = [] + if ts_idx in self.clip_count[vid_id]: + logger.info( + "[TEST] Passed!! read video {} clip index {} / {} repeatedly." + .format(vid_id, ts_idx, clip_ids[ind])) + else: + self.clip_count[vid_id].append(ts_idx) + self.video_preds[vid_id] += preds[ind] # ensemble method: sum + if self.video_labels[vid_id].sum() > 0: + assert self.video_labels[vid_id] == labels[ind] + self.video_labels[vid_id] = labels[ind] + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + # check clip index of each video + for key in self.clip_count.keys(): + if len(self.clip_count[key]) != self.num_clips or sum( + self.clip_count[key]) != self.num_clips * (self.num_clips - + 1) / 2: + logger.info( + "[TEST] Count Error!! 
video [{}] clip count [{}] not match number clips {}" + .format(key, self.clip_count[key], self.num_clips)) + + video_preds = paddle.to_tensor(self.video_preds) + video_labels = paddle.to_tensor(self.video_labels) + acc_top1 = paddle.metric.accuracy(input=video_preds, + label=video_labels, + k=1) + acc_top5 = paddle.metric.accuracy(input=video_preds, + label=video_labels, + k=5) + logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {} '.format( + acc_top1.numpy(), acc_top5.numpy())) diff --git a/docs/src/paddlevideo/metrics/recall.py b/docs/src/paddlevideo/metrics/recall.py new file mode 100644 index 000000000..3612e2244 --- /dev/null +++ b/docs/src/paddlevideo/metrics/recall.py @@ -0,0 +1,84 @@ +import numpy as np +import paddle + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + ious_ = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros(ious.shape[0]) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + ious_[k, :] = tmp_ious + + ious_ = np.fliplr(np.sort(ious_, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (ious_ >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + if isinstance(proposal_nums, list): + proposal_nums_ = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + proposal_nums_ = np.array([proposal_nums]) + else: + proposal_nums_ = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, list): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return proposal_nums_, _iou_thrs + + +def eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None): + """Calculate recalls. """ + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps( + torch.tensor(gts[i]), + torch.tensor(img_proposal[:prop_num, :4])) + ious = ious.data.numpy() + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + return recalls diff --git a/docs/src/paddlevideo/metrics/registry.py b/docs/src/paddlevideo/metrics/registry.py new file mode 100644 index 000000000..221444023 --- /dev/null +++ b/docs/src/paddlevideo/metrics/registry.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +METRIC = Registry('metric') diff --git a/docs/src/paddlevideo/metrics/segmentation_metric.py b/docs/src/paddlevideo/metrics/segmentation_metric.py new file mode 100644 index 000000000..3719450e4 --- /dev/null +++ b/docs/src/paddlevideo/metrics/segmentation_metric.py @@ -0,0 +1,389 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import argparse +import pandas as pd + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def get_labels_scores_start_end_time(input_np, + frame_wise_labels, + actions_dict, + bg_class=["background", "None"]): + labels = [] + starts = [] + ends = [] + scores = [] + + boundary_score_ptr = 0 + + last_label = frame_wise_labels[0] + if frame_wise_labels[0] not in bg_class: + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + if frame_wise_labels[i] not in bg_class: + labels.append(frame_wise_labels[i]) + starts.append(i) + if last_label not in bg_class: + ends.append(i) + score = np.mean( + input_np[actions_dict[labels[boundary_score_ptr]], \ + starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)] + ) + scores.append(score) + boundary_score_ptr = boundary_score_ptr + 1 + last_label = frame_wise_labels[i] + if last_label not in bg_class: + ends.append(i + 1) + score = np.mean( + input_np[actions_dict[labels[boundary_score_ptr]], \ + starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)] + ) + scores.append(score) + boundary_score_ptr = boundary_score_ptr + 1 + + return labels, starts, ends, scores + + +def get_labels_start_end_time(frame_wise_labels, + bg_class=["background", "None"]): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + if frame_wise_labels[0] not in bg_class: + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + if frame_wise_labels[i] not in bg_class: + labels.append(frame_wise_labels[i]) + starts.append(i) + if last_label not in bg_class: + ends.append(i) + last_label = frame_wise_labels[i] + if last_label not in bg_class: + ends.append(i + 1) + return labels, starts, ends + + +def levenstein(p, y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], np.float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in 
range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + +def edit_score(recognized, + ground_truth, + norm=True, + bg_class=["background", "None"]): + P, _, _ = get_labels_start_end_time(recognized, bg_class) + Y, _, _ = get_labels_start_end_time(ground_truth, bg_class) + return levenstein(P, Y, norm) + + +def f_score(recognized, ground_truth, overlap, bg_class=["background", "None"]): + p_label, p_start, p_end = get_labels_start_end_time(recognized, bg_class) + y_label, y_start, y_end = get_labels_start_end_time(ground_truth, bg_class) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum(p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) + + +def boundary_AR(pred_boundary, gt_boundary, overlap_list, max_proposal): + + p_label, p_start, p_end, p_scores = pred_boundary + y_label, y_start, y_end, _ = gt_boundary + + # sort proposal + pred_dict = { + "label": p_label, + "start": p_start, + "end": p_end, + "scores": p_scores + } + pdf = pd.DataFrame(pred_dict) + pdf = pdf.sort_values(by="scores", ascending=False) + p_label = list(pdf["label"]) + p_start = list(pdf["start"]) + p_end = list(pdf["end"]) + p_scores = list(pdf["scores"]) + + # refine AN + if len(p_label) < max_proposal and len(p_label) > 0: + p_label = p_label + [p_label[-1]] * (max_proposal - len(p_label)) + p_start = p_start + [p_start[-1]] * (max_proposal - len(p_start)) + p_start = p_start + p_start[len(p_start) - + (max_proposal - len(p_start)):] + p_end = p_end + [p_end[-1]] * (max_proposal - len(p_end)) + p_scores = p_scores + [p_scores[-1]] * (max_proposal - len(p_scores)) + elif len(p_label) > max_proposal: + p_label[max_proposal:] = [] + p_start[max_proposal:] = [] + p_end[max_proposal:] = [] + p_scores[max_proposal:] = [] + + t_AR = np.zeros(len(overlap_list)) + + for i in range(len(overlap_list)): + overlap = overlap_list[i] + + tp = 0 + fp = 0 + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + + recall = float(tp) / (float(tp) + float(fn)) + t_AR[i] = recall + + AR = np.mean(t_AR) + return AR + + +@METRIC.register +class SegmentationMetric(BaseMetric): + """ + Test for Video Segmentation based model. 
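+    Reports frame-level accuracy, segmental edit score, segmental F1 at the
+    configured overlap thresholds, and boundary AR@AN / AUC.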
+ """ + + def __init__(self, + data_size, + batch_size, + overlap, + actions_map_file_path, + log_interval=1, + tolerance=5, + boundary_threshold=0.7, + max_proposal=100): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + # actions dict generate + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + # cls score + self.overlap = overlap + self.overlap_len = len(overlap) + + self.cls_tp = np.zeros(self.overlap_len) + self.cls_fp = np.zeros(self.overlap_len) + self.cls_fn = np.zeros(self.overlap_len) + self.total_correct = 0 + self.total_edit = 0 + self.total_frame = 0 + self.total_video = 0 + + # boundary score + self.max_proposal = max_proposal + self.AR_at_AN = [[] for _ in range(max_proposal)] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + groundTruth = data[1] + + predicted = outputs['predict'] + output_np = outputs['output_np'] + + outputs_np = predicted.numpy() + outputs_arr = output_np.numpy()[0, :] + gt_np = groundTruth.numpy()[0, :] + + recognition = [] + for i in range(outputs_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(outputs_np[i])] + ])) + recog_content = list(recognition) + + gt_content = [] + for i in range(gt_np.shape[0]): + gt_content = np.concatenate((gt_content, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(gt_np[i])] + ])) + gt_content = list(gt_content) + + pred_boundary = get_labels_scores_start_end_time( + outputs_arr, recog_content, self.actions_dict) + gt_boundary = get_labels_scores_start_end_time( + np.ones(outputs_arr.shape), gt_content, self.actions_dict) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + #accumulate + self.total_frame += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + #accumulate + self.total_correct += 1 + + edit_num = edit_score(recog_content, gt_content) + edit += edit_num + self.total_edit += edit_num + + for s in range(self.overlap_len): + tp1, fp1, fn1 = f_score(recog_content, gt_content, self.overlap[s]) + + # accumulate + self.cls_tp[s] += tp1 + self.cls_fp[s] += fp1 + self.cls_fn[s] += fn1 + + # accumulate + self.total_video += 1 + + # proposal score + for AN in range(self.max_proposal): + AR = boundary_AR(pred_boundary, + gt_boundary, + self.overlap, + max_proposal=(AN + 1)) + self.AR_at_AN[AN].append(AR) + + def accumulate(self): + """accumulate metrics when finished all iters. 
+ """ + # cls metric + Acc = 100 * float(self.total_correct) / self.total_frame + Edit = (1.0 * self.total_edit) / self.total_video + Fscore = dict() + for s in range(self.overlap_len): + precision = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fp[s]) + recall = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fn[s]) + + f1 = 2.0 * (precision * recall) / (precision + recall) + + f1 = np.nan_to_num(f1) * 100 + Fscore[self.overlap[s]] = f1 + + # proposal metric + proposal_AUC = np.array(self.AR_at_AN) * 100 + AUC = np.mean(proposal_AUC) + AR_at_AN1 = np.mean(proposal_AUC[0, :]) + AR_at_AN5 = np.mean(proposal_AUC[4, :]) + AR_at_AN15 = np.mean(proposal_AUC[14, :]) + + # log metric + log_mertic_info = "dataset model performence: " + # preds ensemble + log_mertic_info += "Acc: {:.4f}, ".format(Acc) + log_mertic_info += 'Edit: {:.4f}, '.format(Edit) + for s in range(len(self.overlap)): + log_mertic_info += 'F1@{:0.2f}: {:.4f}, '.format( + self.overlap[s], Fscore[self.overlap[s]]) + + # boundary metric + log_mertic_info += "Auc: {:.4f}, ".format(AUC) + log_mertic_info += "AR@AN1: {:.4f}, ".format(AR_at_AN1) + log_mertic_info += "AR@AN5: {:.4f}, ".format(AR_at_AN5) + log_mertic_info += "AR@AN15: {:.4f}, ".format(AR_at_AN15) + logger.info(log_mertic_info) + + # log metric + metric_dict = dict() + metric_dict['Acc'] = Acc + metric_dict['Edit'] = Edit + for s in range(len(self.overlap)): + metric_dict['F1@{:0.2f}'.format( + self.overlap[s])] = Fscore[self.overlap[s]] + metric_dict['Auc'] = AUC + metric_dict['AR@AN1'] = AR_at_AN1 + metric_dict['AR@AN5'] = AR_at_AN5 + metric_dict['AR@AN15'] = AR_at_AN15 + + # clear for next epoch + # cls + self.cls_tp = np.zeros(self.overlap_len) + self.cls_fp = np.zeros(self.overlap_len) + self.cls_fn = np.zeros(self.overlap_len) + self.total_correct = 0 + self.total_edit = 0 + self.total_frame = 0 + self.total_video = 0 + # proposal + self.AR_at_AN = [[] for _ in range(self.max_proposal)] + + return metric_dict diff --git a/docs/src/paddlevideo/metrics/skeleton_metric.py b/docs/src/paddlevideo/metrics/skeleton_metric.py new file mode 100644 index 000000000..797847806 --- /dev/null +++ b/docs/src/paddlevideo/metrics/skeleton_metric.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +import csv +import paddle.nn.functional as F + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class SkeletonMetric(BaseMetric): + """ + Test for Skeleton based model. + note: only support batch size = 1, single card test. + + Args: + out_file: str, file to save test results. 
+ """ + + def __init__(self, + data_size, + batch_size, + out_file='submission.csv', + log_interval=1, + top_k=5): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.top1 = [] + self.top5 = [] + self.values = [] + self.out_file = out_file + self.k = top_k + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + if data[0].shape[0] != outputs.shape[0]: + num_segs = data[0].shape[1] + batch_size = outputs.shape[0] + outputs = outputs.reshape( + [batch_size // num_segs, num_segs, outputs.shape[-1]]) + outputs = outputs.mean(axis=1) + if len(data) == 2: # data with label + labels = data[1] + top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1) + top5 = paddle.metric.accuracy(input=outputs, label=labels, k=self.k) + if self.world_size > 1: + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size + top5 = paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size + self.top1.append(top1.numpy()) + self.top5.append(top5.numpy()) + else: # data without label, only support batch_size=1. Used for fsd-10. + prob = F.softmax(outputs) + clas = paddle.argmax(prob, axis=1).numpy()[0] + self.values.append((batch_id, clas)) + + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + if self.top1: # data with label + logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {}'.format( + np.mean(np.array(self.top1)), np.mean(np.array(self.top5)))) + else: + headers = ['sample_index', 'predict_category'] + with open( + self.out_file, + 'w', + ) as fp: + writer = csv.writer(fp) + writer.writerow(headers) + writer.writerows(self.values) + logger.info("Results saved in {} !".format(self.out_file)) diff --git a/docs/src/paddlevideo/metrics/transnetv2_metric.py b/docs/src/paddlevideo/metrics/transnetv2_metric.py new file mode 100644 index 000000000..337088176 --- /dev/null +++ b/docs/src/paddlevideo/metrics/transnetv2_metric.py @@ -0,0 +1,174 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import numpy as np + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def predictions_to_scenes(predictions): + scenes = [] + t, t_prev, start = -1, 0, 0 + for i, t in enumerate(predictions): + if t_prev == 1 and t == 0: + start = i + if t_prev == 0 and t == 1 and i != 0: + scenes.append([start, i]) + t_prev = t + if t == 0: + scenes.append([start, i]) + + # just fix if all predictions are 1 + if len(scenes) == 0: + return np.array([[0, len(predictions) - 1]], dtype=np.int32) + + return np.array(scenes, dtype=np.int32) + + +def evaluate_scenes(gt_scenes, pred_scenes, n_frames_miss_tolerance=2): + """ + Adapted from: https://github.com/gyglim/shot-detection-evaluation + The original based on: http://imagelab.ing.unimore.it/imagelab/researchActivity.asp?idActivity=19 + + n_frames_miss_tolerance: + Number of frames it is possible to miss ground truth by, and still being counted as a correct detection. + + Examples of computation with different tolerance margin: + n_frames_miss_tolerance = 0 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.5, 5.5]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.5, 5.5]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.5, 4.5]] -> MISS + n_frames_miss_tolerance = 1 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.0, 6.0]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.0, 6.0]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.0, 5.0]] -> HIT + gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[3.0, 4.0]] -> MISS + n_frames_miss_tolerance = 2 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[4.5, 6.5]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[4.5, 6.5]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[3.5, 5.5]] -> HIT + gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[2.5, 4.5]] -> HIT + gt_scenes: [[0, 2], [3, 9]] -> gt_trans: [[1.5, 3.5]] -> MISS + + Users should be careful about adopting these functions in any commercial matters. 
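+
+    Returns:
+        precision, recall, f1 and the (tp, fp, fn) counts computed over the
+        predicted and ground-truth transitions.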
+ """ + + shift = n_frames_miss_tolerance / 2 + gt_scenes = gt_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]]) + pred_scenes = pred_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]]) + + gt_trans = np.stack([gt_scenes[:-1, 1], gt_scenes[1:, 0]], 1) + pred_trans = np.stack([pred_scenes[:-1, 1], pred_scenes[1:, 0]], 1) + + i, j = 0, 0 + tp, fp, fn = 0, 0, 0 + + while i < len(gt_trans) or j < len(pred_trans): + if j == len(pred_trans) or pred_trans[j, 0] > gt_trans[i, 1]: + fn += 1 + i += 1 + elif i == len(gt_trans) or pred_trans[j, 1] < gt_trans[i, 0]: + fp += 1 + j += 1 + else: + i += 1 + j += 1 + tp += 1 + + if tp + fp != 0: + p = tp / (tp + fp) + else: + p = 0 + + if tp + fn != 0: + r = tp / (tp + fn) + else: + r = 0 + + if p + r != 0: + f1 = (p * r * 2) / (p + r) + else: + f1 = 0 + + assert tp + fn == len(gt_trans) + assert tp + fp == len(pred_trans) + + return p, r, f1, (tp, fp, fn) + + +def create_scene_based_summaries(one_hot_pred, one_hot_gt): + thresholds = np.array([ + 0.02, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 + ]) + precision, recall, f1, tp, fp, fn = np.zeros_like(thresholds), np.zeros_like(thresholds),\ + np.zeros_like(thresholds), np.zeros_like(thresholds),\ + np.zeros_like(thresholds), np.zeros_like(thresholds) + + gt_scenes = predictions_to_scenes(one_hot_gt) + for i in range(len(thresholds)): + pred_scenes = predictions_to_scenes( + (one_hot_pred > thresholds[i]).astype(np.uint8) + ) + precision[i], recall[i], f1[i], (tp[i], fp[i], fn[i]) = evaluate_scenes(gt_scenes, pred_scenes) + + best_idx = np.argmax(f1) + + return f1[best_idx] + + +@METRIC.register +class TransNetV2Metric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.predictions = [] + self.total_stats = {"tp": 0, "fp": 0, "fn": 0} + + def update(self, batch_id, data, one_hot): + """update metrics during each iter + """ + if isinstance(one_hot, tuple): + one_hot = one_hot[0] + one_hot = paddle.nn.functional.sigmoid(one_hot)[0] + self.predictions.append(one_hot.numpy()[25:75]) + gt_scenes = data[1] + is_new_file = data[2] + if is_new_file: + self.compute(gt_scenes) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def compute(self, gt_scenes): + predictions = np.concatenate(self.predictions, 0)[:len(frames)] + _, _, _, (tp, fp, fn), fp_mistakes, fn_mistakes = evaluate_scenes( + gt_scenes, predictions_to_scenes((predictions >= args.thr).astype(np.uint8))) + + self.total_stats["tp"] += tp + self.total_stats["fp"] += fp + self.total_stats["fn"] += fn + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + p = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fp"]) + r = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fn"]) + f1 = (p * r * 2) / (p + r) + logger.info('[TEST] finished, Precision= {:5.2f}, Recall= {:5.2f} , F1 Score= {:5.2f} '.format( + p * 100, r * 100, f1 * 100)) \ No newline at end of file diff --git a/docs/src/paddlevideo/metrics/ucf24_utils.py b/docs/src/paddlevideo/metrics/ucf24_utils.py new file mode 100644 index 000000000..6552645b1 --- /dev/null +++ b/docs/src/paddlevideo/metrics/ucf24_utils.py @@ -0,0 +1,783 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Forked from: https://github.com/rafaelpadilla/Object-Detection-Metrics +# Developed by: Rafael Padilla (rafael.padilla@smt.ufrj.br) + +import glob +import os +import shutil +import sys +from collections import Counter +import numpy as np +from enum import Enum +import cv2 + + +class MethodAveragePrecision(Enum): + """ + Class representing if the coordinates are relative to the + image size or are absolute values. + + Developed by: Rafael Padilla + Last modification: Apr 28 2018 + """ + EveryPointInterpolation = 1 + ElevenPointInterpolation = 2 + + +class CoordinatesType(Enum): + """ + Class representing if the coordinates are relative to the + image size or are absolute values. + + Developed by: Rafael Padilla + Last modification: Apr 28 2018 + """ + Relative = 1 + Absolute = 2 + + +class BBType(Enum): + """ + Class representing if the bounding box is groundtruth or not. + + Developed by: Rafael Padilla + Last modification: May 24 2018 + """ + GroundTruth = 1 + Detected = 2 + + +class BBFormat(Enum): + """ + Class representing the format of a bounding box. + It can be (X,Y,width,height) => XYWH + or (X1,Y1,X2,Y2) => XYX2Y2 + Developed by: Rafael Padilla + Last modification: May 24 2018 + """ + XYWH = 1 + XYX2Y2 = 2 + + +def convertToRelativeValues(size, box): + dw = 1. / (size[0]) + dh = 1. 
/ (size[1]) + cx = (box[1] + box[0]) / 2.0 + cy = (box[3] + box[2]) / 2.0 + w = box[1] - box[0] + h = box[3] - box[2] + x = cx * dw + y = cy * dh + w = w * dw + h = h * dh + return x, y, w, h + + +def convertToAbsoluteValues(size, box): + xIn = round(((2 * float(box[0]) - float(box[2])) * size[0] / 2)) + yIn = round(((2 * float(box[1]) - float(box[3])) * size[1] / 2)) + xEnd = xIn + round(float(box[2]) * size[0]) + yEnd = yIn + round(float(box[3]) * size[1]) + if xIn < 0: + xIn = 0 + if yIn < 0: + yIn = 0 + if xEnd >= size[0]: + xEnd = size[0] - 1 + if yEnd >= size[1]: + yEnd = size[1] - 1 + return xIn, yIn, xEnd, yEnd + + +def add_bb_into_image(image, bb, color=(255, 0, 0), thickness=2, label=None): + r = int(color[0]) + g = int(color[1]) + b = int(color[2]) + + font = cv2.FONT_HERSHEY_SIMPLEX + fontScale = 0.5 + fontThickness = 1 + + x1, y1, x2, y2 = bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + x1 = int(x1) + y1 = int(y1) + x2 = int(x2) + y2 = int(y2) + cv2.rectangle(image, (x1, y1), (x2, y2), (b, g, r), thickness) + # Add label + if label is not None: + # Get size of the text box + (tw, th) = cv2.getTextSize(label, font, fontScale, fontThickness)[0] + # Top-left coord of the textbox + (xin_bb, yin_bb) = (x1 + thickness, y1 - th + int(12.5 * fontScale)) + # Checking position of the text top-left (outside or inside the bb) + if yin_bb - th <= 0: # if outside the image + yin_bb = y1 + th # put it inside the bb + r_Xin = x1 - int(thickness / 2) + r_Yin = y1 - th - int(thickness / 2) + # Draw filled rectangle to put the text in it + cv2.rectangle(image, (r_Xin, r_Yin - thickness), + (r_Xin + tw + thickness * 3, r_Yin + th + int(12.5 * fontScale)), (b, g, r), + -1) + cv2.putText(image, label, (xin_bb, yin_bb), font, fontScale, (0, 0, 0), fontThickness, + cv2.LINE_AA) + return image + + +class BoundingBox: + def __init__(self, + imageName, + classId, + x, + y, + w, + h, + typeCoordinates=None, + imgSize=None, + bbType=None, + classConfidence=None, + format=None): + """Constructor. + Args: + imageName: String representing the image name. + classId: String value representing class id. + x: Float value representing the X upper-left coordinate of the bounding box. + y: Float value representing the Y upper-left coordinate of the bounding box. + w: Float value representing the width bounding box. + h: Float value representing the height bounding box. + typeCoordinates: (optional) Enum (Relative or Absolute) represents if the bounding box + coordinates (x,y,w,h) are absolute or relative to size of the image. Default:'Absolute'. + imgSize: (optional) 2D vector (width, height)=>(int, int) represents the size of the + image of the bounding box. If typeCoordinates is 'Relative', imgSize is required. + bbType: (optional) Enum (Groundtruth or Detection) identifies if the bounding box + represents a ground truth or a detection. If it is a detection, the classConfidence has + to be informed. + classConfidence: (optional) Float value representing the confidence of the detected + class. If detectionType is Detection, classConfidence needs to be informed. + format: (optional) Enum (BBFormat.XYWH or BBFormat.XYX2Y2) indicating the format of the + coordinates of the bounding boxes. BBFormat.XYWH: + BBFormat.XYX2Y2: . + """ + self._imageName = imageName + self._typeCoordinates = typeCoordinates + if typeCoordinates == CoordinatesType.Relative and imgSize is None: + raise IOError( + 'Parameter \'imgSize\' is required. 
It is necessary to inform the image size.') + if bbType == BBType.Detected and classConfidence is None: + raise IOError( + 'For bbType=\'Detection\', it is necessary to inform the classConfidence value.') + + self._classConfidence = classConfidence + self._bbType = bbType + self._classId = classId + self._format = format + + # If relative coordinates, convert to absolute values + # For relative coords: (x,y,w,h)=(X_center/img_width , Y_center/img_height) + if typeCoordinates == CoordinatesType.Relative: + (self._x, self._y, self._w, self._h) = convertToAbsoluteValues(imgSize, (x, y, w, h)) + self._width_img = imgSize[0] + self._height_img = imgSize[1] + if format == BBFormat.XYWH: + self._x2 = self._w + self._y2 = self._h + self._w = self._x2 - self._x + self._h = self._y2 - self._y + else: + raise IOError( + 'For relative coordinates, the format must be XYWH (x,y,width,height)') + # For absolute coords: (x,y,w,h)=real bb coords + else: + self._x = x + self._y = y + if format == BBFormat.XYWH: + self._w = w + self._h = h + self._x2 = self._x + self._w + self._y2 = self._y + self._h + else: # format == BBFormat.XYX2Y2: . + self._x2 = w + self._y2 = h + self._w = self._x2 - self._x + self._h = self._y2 - self._y + if imgSize is None: + self._width_img = None + self._height_img = None + else: + self._width_img = imgSize[0] + self._height_img = imgSize[1] + + def getAbsoluteBoundingBox(self, format=None): + if format == BBFormat.XYWH: + return self._x, self._y, self._w, self._h + elif format == BBFormat.XYX2Y2: + return self._x, self._y, self._x2, self._y2 + + def getRelativeBoundingBox(self, imgSize=None): + if imgSize is None and self._width_img is None and self._height_img is None: + raise IOError( + 'Parameter \'imgSize\' is required. It is necessary to inform the image size.') + if imgSize is None: + return convertToRelativeValues((imgSize[0], imgSize[1]), + (self._x, self._y, self._w, self._h)) + else: + return convertToRelativeValues((self._width_img, self._height_img), + (self._x, self._y, self._w, self._h)) + + def getImageName(self): + return self._imageName + + def getConfidence(self): + return self._classConfidence + + def getFormat(self): + return self._format + + def getClassId(self): + return self._classId + + def getImageSize(self): + return self._width_img, self._height_img + + def getCoordinatesType(self): + return self._typeCoordinates + + def getBBType(self): + return self._bbType + + @staticmethod + def compare(det1, det2): + det1BB = det1.getAbsoluteBoundingBox(format=BBFormat.XYWH) + det1ImgSize = det1.getImageSize() + det2BB = det2.getAbsoluteBoundingBox(format=BBFormat.XYWH) + det2ImgSize = det2.getImageSize() + + if det1.getClassId() == det2.getClassId() and \ + det1.classConfidence == det2.classConfidenc() and \ + det1BB[0] == det2BB[0] and \ + det1BB[1] == det2BB[1] and \ + det1BB[2] == det2BB[2] and \ + det1BB[3] == det2BB[3] and \ + det1ImgSize[0] == det1ImgSize[0] and \ + det2ImgSize[1] == det2ImgSize[1]: + return True + return False + + @staticmethod + def clone(boundingBox): + absBB = boundingBox.getAbsoluteBoundingBox(format=BBFormat.XYWH) + newBoundingBox = BoundingBox( + boundingBox.getImageName(), + boundingBox.getClassId(), + absBB[0], + absBB[1], + absBB[2], + absBB[3], + typeCoordinates=boundingBox.getCoordinatesType(), + imgSize=boundingBox.getImageSize(), + bbType=boundingBox.getBBType(), + classConfidence=boundingBox.getConfidence(), + format=BBFormat.XYWH) + return newBoundingBox + + +class BoundingBoxes: + def __init__(self): + self._boundingBoxes = 
[] + + def addBoundingBox(self, bb): + self._boundingBoxes.append(bb) + + def removeBoundingBox(self, _boundingBox): + for d in self._boundingBoxes: + if BoundingBox.compare(d, _boundingBox): + del self._boundingBoxes[d] + return + + def removeAllBoundingBoxes(self): + self._boundingBoxes = [] + + def getBoundingBoxes(self): + return self._boundingBoxes + + def getBoundingBoxByClass(self, classId): + boundingBoxes = [] + for d in self._boundingBoxes: + if d.getClassId() == classId: # get only specified bounding box type + boundingBoxes.append(d) + return boundingBoxes + + def getClasses(self): + classes = [] + for d in self._boundingBoxes: + c = d.getClassId() + if c not in classes: + classes.append(c) + return classes + + def getBoundingBoxesByType(self, bbType): + # get only specified bb type + return [d for d in self._boundingBoxes if d.getBBType() == bbType] + + def getBoundingBoxesByImageName(self, imageName): + # get only specified bb type + return [d for d in self._boundingBoxes if d.getImageName() == imageName] + + def count(self, bbType=None): + if bbType is None: # Return all bounding boxes + return len(self._boundingBoxes) + count = 0 + for d in self._boundingBoxes: + if d.getBBType() == bbType: # get only specified bb type + count += 1 + return count + + def clone(self): + newBoundingBoxes = BoundingBoxes() + for d in self._boundingBoxes: + det = BoundingBox.clone(d) + newBoundingBoxes.addBoundingBox(det) + return newBoundingBoxes + + def drawAllBoundingBoxes(self, image, imageName): + bbxes = self.getBoundingBoxesByImageName(imageName) + for bb in bbxes: + if bb.getBBType() == BBType.GroundTruth: # if ground truth + image = add_bb_into_image(image, bb, color=(0, 255, 0)) # green + else: # if detection + image = add_bb_into_image(image, bb, color=(255, 0, 0)) # red + return image + + +class Evaluator: + def GetPascalVOCMetrics(self, + boundingboxes, + IOUThreshold=0.5, + method=None): + """Get the metrics used by the VOC Pascal 2012 challenge. + Get + Args: + boundingboxes: Object of the class BoundingBoxes representing ground truth and detected + bounding boxes; + IOUThreshold: IOU threshold indicating which detections will be considered TP or FP + (default value = 0.5); + method (default = EveryPointInterpolation): It can be calculated as the implementation + in the official PASCAL VOC toolkit (EveryPointInterpolation), or applying the 11-point + interpolatio as described in the paper "The PASCAL Visual Object Classes(VOC) Challenge" + or EveryPointInterpolation" (ElevenPointInterpolation); + Returns: + A list of dictionaries. Each dictionary contains information and metrics of each class. 
+ The keys of each dictionary are: + dict['class']: class representing the current dictionary; + dict['precision']: array with the precision values; + dict['recall']: array with the recall values; + dict['AP']: average precision; + dict['interpolated precision']: interpolated precision values; + dict['interpolated recall']: interpolated recall values; + dict['total positives']: total number of ground truth positives; + dict['total TP']: total number of True Positive detections; + dict['total FP']: total number of False Negative detections; + """ + ret = [] # list containing metrics (precision, recall, average precision) of each class + # List with all ground truths (Ex: [imageName,class,confidence=1, (bb coordinates XYX2Y2)]) + groundTruths = [] + # List with all detections (Ex: [imageName,class,confidence,(bb coordinates XYX2Y2)]) + detections = [] + # Get all classes + classes = [] + # Loop through all bounding boxes and separate them into GTs and detections + for bb in boundingboxes.getBoundingBoxes(): + # [imageName, class, confidence, (bb coordinates XYX2Y2)] + if bb.getBBType() == BBType.GroundTruth: + groundTruths.append([ + bb.getImageName(), + bb.getClassId(), 1, + bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + ]) + else: + detections.append([ + bb.getImageName(), + bb.getClassId(), + bb.getConfidence(), + bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + ]) + # get class + if bb.getClassId() not in classes: + classes.append(bb.getClassId()) + classes = sorted(classes) + # Precision x Recall is obtained individually by each class + # Loop through by classes + for c in classes: + # Get only detection of class c + dects = [] + [dects.append(d) for d in detections if d[1] == c] + # Get only ground truths of class c + gts = [] + [gts.append(g) for g in groundTruths if g[1] == c] + npos = len(gts) + # sort detections by decreasing confidence + dects = sorted(dects, key=lambda conf: conf[2], reverse=True) + TP = np.zeros(len(dects)) + FP = np.zeros(len(dects)) + # create dictionary with amount of gts for each image + det = Counter([cc[0] for cc in gts]) + for key, val in det.items(): + det[key] = np.zeros(val) + # Loop through detections + for d in range(len(dects)): + # Find ground truth image + gt = [gt for gt in gts if gt[0] == dects[d][0]] + iouMax = sys.float_info.min + for j in range(len(gt)): + iou = Evaluator.iou(dects[d][3], gt[j][3]) + if iou > iouMax: + iouMax = iou + jmax = j + # Assign detection as true positive/don't care/false positive + if iouMax >= IOUThreshold: + if det[dects[d][0]][jmax] == 0: + TP[d] = 1 # count as true positive + det[dects[d][0]][jmax] = 1 # flag as already 'seen' + else: + FP[d] = 1 # count as false positive + # - A detected "cat" is overlaped with a GT "cat" with IOU >= IOUThreshold. 
+ else: + FP[d] = 1 # count as false positive + # compute precision, recall and average precision + acc_FP = np.cumsum(FP) + acc_TP = np.cumsum(TP) + rec = acc_TP / npos + prec = np.divide(acc_TP, (acc_FP + acc_TP)) + # Depending on the method, call the right implementation + if method == MethodAveragePrecision.EveryPointInterpolation: + [ap, mpre, mrec, ii] = Evaluator.CalculateAveragePrecision(rec, prec) + else: + [ap, mpre, mrec, _] = Evaluator.ElevenPointInterpolatedAP(rec, prec) + # add class result in the dictionary to be returned + r = { + 'class': c, + 'precision': prec, + 'recall': rec, + 'AP': ap, + 'interpolated precision': mpre, + 'interpolated recall': mrec, + 'total positives': npos, + 'total TP': np.sum(TP), + 'total FP': np.sum(FP) + } + ret.append(r) + return ret + + @staticmethod + def CalculateAveragePrecision(rec, prec): + mrec = [0] + [mrec.append(e) for e in rec] + mrec.append(1) + mpre = [0] + [mpre.append(e) for e in prec] + mpre.append(0) + for i in range(len(mpre) - 1, 0, -1): + mpre[i - 1] = max(mpre[i - 1], mpre[i]) + ii = [] + for i in range(len(mrec) - 1): + if mrec[1:][i] != mrec[0:-1][i]: + ii.append(i + 1) + ap = 0 + for i in ii: + ap = ap + np.sum((mrec[i] - mrec[i - 1]) * mpre[i]) + return [ap, mpre[0:len(mpre) - 1], mrec[0:len(mpre) - 1], ii] + + @staticmethod + # 11-point interpolated average precision + def ElevenPointInterpolatedAP(rec, prec): + mrec = [] + [mrec.append(e) for e in rec] + mpre = [] + [mpre.append(e) for e in prec] + recallValues = np.linspace(0, 1, 11) + recallValues = list(recallValues[::-1]) + rhoInterp = [] + recallValid = [] + for r in recallValues: + # Obtain all recall values higher or equal than r + argGreaterRecalls = np.argwhere(mrec[:] >= r) + pmax = 0 + # If there are recalls above r + if argGreaterRecalls.size != 0: + pmax = max(mpre[argGreaterRecalls.min():]) + recallValid.append(r) + rhoInterp.append(pmax) + # By definition AP = sum(max(precision whose recall is above r))/11 + ap = sum(rhoInterp) / 11 + # Generating values for the plot + rvals = [recallValid[0]] + [rvals.append(e) for e in recallValid] + rvals.append(0) + pvals = [0] + [pvals.append(e) for e in rhoInterp] + pvals.append(0) + # rhoInterp = rhoInterp[::-1] + cc = [] + for i in range(len(rvals)): + p = (rvals[i], pvals[i - 1]) + if p not in cc: + cc.append(p) + p = (rvals[i], pvals[i]) + if p not in cc: + cc.append(p) + recallValues = [i[0] for i in cc] + rhoInterp = [i[1] for i in cc] + return [ap, rhoInterp, recallValues, None] + + # For each detections, calculate IOU with reference + @staticmethod + def _getAllIOUs(reference, detections): + ret = [] + bbReference = reference.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + # img = np.zeros((200,200,3), np.uint8) + for d in detections: + bb = d.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + iou = Evaluator.iou(bbReference, bb) + ret.append((iou, reference, d)) # iou, reference, detection + return sorted(ret, key=lambda i: i[0], reverse=True) # sort by iou (from highest to lowest) + + @staticmethod + def iou(boxA, boxB): + # if boxes dont intersect + if Evaluator._boxesIntersect(boxA, boxB) is False: + return 0 + interArea = Evaluator._getIntersectionArea(boxA, boxB) + union = Evaluator._getUnionAreas(boxA, boxB, interArea=interArea) + # intersection over union + iou = interArea / union + assert iou >= 0 + return iou + + @staticmethod + def _boxesIntersect(boxA, boxB): + if boxA[0] > boxB[2]: + return False # boxA is right of boxB + if boxB[0] > boxA[2]: + return False # boxA is left of boxB + if boxA[3] < boxB[1]: + 
return False # boxA is above boxB + if boxA[1] > boxB[3]: + return False # boxA is below boxB + return True + + @staticmethod + def _getIntersectionArea(boxA, boxB): + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + # intersection area + return (xB - xA + 1) * (yB - yA + 1) + + @staticmethod + def _getUnionAreas(boxA, boxB, interArea=None): + area_A = Evaluator._getArea(boxA) + area_B = Evaluator._getArea(boxB) + if interArea is None: + interArea = Evaluator._getIntersectionArea(boxA, boxB) + return float(area_A + area_B - interArea) + + @staticmethod + def _getArea(box): + return (box[2] - box[0] + 1) * (box[3] - box[1] + 1) + + +# Validate formats +def ValidateFormats(argFormat, argName, errors): + if argFormat == 'xywh': + return BBFormat.XYWH + elif argFormat == 'xyrb': + return BBFormat.XYX2Y2 + elif argFormat is None: + return BBFormat.XYWH # default when nothing is passed + else: + errors.append( + 'argument %s: invalid value. It must be either \'xywh\' or \'xyrb\'' % argName) + + +# Validate mandatory args +def ValidateMandatoryArgs(arg, argName, errors): + if arg is None: + errors.append('argument %s: required argument' % argName) + else: + return True + + +def ValidateImageSize(arg, argName, argInformed, errors): + errorMsg = 'argument %s: required argument if %s is relative' % (argName, argInformed) + ret = None + if arg is None: + errors.append(errorMsg) + else: + arg = arg.replace('(', '').replace(')', '') + args = arg.split(',') + if len(args) != 2: + errors.append( + '%s. It must be in the format \'width,height\' (e.g. \'600,400\')' % errorMsg) + else: + if not args[0].isdigit() or not args[1].isdigit(): + errors.append( + '%s. It must be in INdiaTEGER the format \'width,height\' (e.g. \'600,400\')' % + errorMsg) + else: + ret = (int(args[0]), int(args[1])) + return ret + + +# Validate coordinate types +def ValidateCoordinatesTypes(arg, argName, errors): + if arg == 'abs': + return CoordinatesType.Absolute + elif arg == 'rel': + return CoordinatesType.Relative + elif arg is None: + return CoordinatesType.Absolute # default when nothing is passed + errors.append('argument %s: invalid value. 
It must be either \'rel\' or \'abs\'' % argName) + + +def getBoundingBoxes(directory, + isGT, + bbFormat, + coordType, + allBoundingBoxes=None, + allClasses=None, + imgSize=(0, 0)): + """Read txt files containing bounding boxes (ground truth and detections).""" + print(directory) + if allBoundingBoxes is None: + allBoundingBoxes = BoundingBoxes() + if allClasses is None: + allClasses = [] + # Read ground truths + os.chdir(directory) + files = glob.glob("*.txt") + files.sort() + + for f in files: + nameOfImage = f.replace(".txt", "") + fh1 = open(f, "r") + for line in fh1: + line = line.replace("\n", "") + if line.replace(' ', '') == '': + continue + splitLine = line.split(" ") + if isGT: + idClass = (splitLine[0]) # class + x = float(splitLine[1]) + y = float(splitLine[2]) + w = float(splitLine[3]) + h = float(splitLine[4]) + bb = BoundingBox( + nameOfImage, + idClass, + x, + y, + w, + h, + coordType, + imgSize, + BBType.GroundTruth, + format=bbFormat) + else: + idClass = (splitLine[0]) # class + confidence = float(splitLine[1]) + x = float(splitLine[2]) + y = float(splitLine[3]) + w = float(splitLine[4]) + h = float(splitLine[5]) + bb = BoundingBox( + nameOfImage, + idClass, + x, + y, + w, + h, + coordType, + imgSize, + BBType.Detected, + confidence, + format=bbFormat) + allBoundingBoxes.addBoundingBox(bb) + if idClass not in allClasses: + allClasses.append(idClass) + fh1.close() + return allBoundingBoxes, allClasses + + +def get_mAP(gtFolder, detFolder, threshold=0.5, savePath=None): + gtFormat = 'xyrb' + detFormat = 'xyrb' + gtCoordinates = 'abs' + detCoordinates = 'abs' + gtFolder = os.path.join(os.path.abspath('.'), gtFolder) + detFolder = os.path.join(os.path.abspath('.'), detFolder) + + iouThreshold = threshold + + # Arguments validation + errors = [] + # Validate formats + gtFormat = ValidateFormats(gtFormat, 'gtFormat', errors) + detFormat = ValidateFormats(detFormat, '-detformat', errors) + + # Coordinates types + gtCoordType = ValidateCoordinatesTypes(gtCoordinates, '-gtCoordinates', errors) + detCoordType = ValidateCoordinatesTypes(detCoordinates, '-detCoordinates', errors) + imgSize = (0, 0) + + # Create directory to save results + shutil.rmtree(savePath, ignore_errors=True) # Clear folder + if savePath is not None: + os.makedirs(savePath) + + # Get groundtruth boxes + allBoundingBoxes, allClasses = getBoundingBoxes( + gtFolder, True, gtFormat, gtCoordType, imgSize=imgSize) + # Get detected boxes + allBoundingBoxes, allClasses = getBoundingBoxes( + detFolder, False, detFormat, detCoordType, allBoundingBoxes, allClasses, imgSize=imgSize) + allClasses.sort() + + evaluator = Evaluator() + acc_AP = 0 + validClasses = 0 + + # Plot Precision x Recall curve + detections = evaluator.GetPascalVOCMetrics(allBoundingBoxes, iouThreshold, + method=MethodAveragePrecision.EveryPointInterpolation) + + # each detection is a class and store AP and mAP results in AP_res list + AP_res = [] + for metricsPerClass in detections: + # Get metric values per each class + cl = metricsPerClass['class'] + ap = metricsPerClass['AP'] + totalPositives = metricsPerClass['total positives'] + + if totalPositives > 0: + validClasses = validClasses + 1 + acc_AP = acc_AP + ap + ap_str = "{0:.2f}%".format(ap * 100) + AP_res.append('AP: %s (%s)' % (ap_str, cl)) + mAP = acc_AP / validClasses + mAP_str = "{0:.2f}%".format(mAP * 100) + AP_res.append('mAP: %s' % mAP_str) + return AP_res \ No newline at end of file diff --git a/docs/src/paddlevideo/metrics/vos_metric.py b/docs/src/paddlevideo/metrics/vos_metric.py new 
file mode 100644 index 000000000..54eadb8d9 --- /dev/null +++ b/docs/src/paddlevideo/metrics/vos_metric.py @@ -0,0 +1,276 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import paddle +import zipfile +import time +from PIL import Image + +from paddle.io import DataLoader + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class VOSMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + result_root, + zip_dir, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.video_num = 0 + self.total_time = 0 + self.total_frame = 0 + self.total_sfps = 0 + self.total_video_num = data_size + self.count = 0 + self.result_root = result_root + self.zip_dir = zip_dir + + def update(self, batch_id, data, model): + """update metrics during each iter + """ + self.video_num += 1 + seq_dataset = data + seq_name = seq_dataset.seq_name + + logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name, self.video_num, + self.total_video_num)) + seq_dataloader = DataLoader(seq_dataset, + return_list=True, + batch_size=1, + shuffle=False, + num_workers=0) + seq_total_time = 0 + seq_total_frame = 0 + ref_embeddings = [] + ref_masks = [] + prev_embedding = [] + prev_mask = [] + with paddle.no_grad(): + for frame_idx, samples in enumerate(seq_dataloader): + time_start = time.time() + all_preds = [] + join_label = None + for aug_idx in range(len(samples)): + if len(ref_embeddings) <= aug_idx: + ref_embeddings.append([]) + ref_masks.append([]) + prev_embedding.append(None) + prev_mask.append(None) + + sample = samples[aug_idx] + ref_emb = ref_embeddings[aug_idx] + ref_m = ref_masks[aug_idx] + prev_emb = prev_embedding[aug_idx] + prev_m = prev_mask[aug_idx] + + current_img = sample['current_img'] + if 'current_label' in sample.keys(): + current_label = sample['current_label'] + current_label = paddle.to_tensor(current_label) + else: + current_label = None + + obj_num = sample['meta']['obj_num'] + imgname = sample['meta']['current_name'] + ori_height = sample['meta']['height'] + ori_width = sample['meta']['width'] + current_img = current_img + obj_num = obj_num + bs, _, h, w = current_img.shape + data_batch = [ + ref_emb, ref_m, prev_emb, prev_m, current_img, + [ori_height, ori_width], obj_num + ] + + all_pred, current_embedding = model(data_batch, mode='test') + + if frame_idx == 0: + if current_label is None: + logger.info( + "No first frame label in Seq {}.".format( + seq_name)) + ref_embeddings[aug_idx].append(current_embedding) + ref_masks[aug_idx].append(current_label) + + prev_embedding[aug_idx] = current_embedding + prev_mask[aug_idx] = current_label + else: + if sample['meta']['flip']: #False + all_pred = self.flip_tensor(all_pred, 3) + # In YouTube-VOS, not all the objects appear in the first frame for the first time. Thus, we + # have to introduce new labels for new objects, if necessary. 
+ if not sample['meta']['flip'] and not ( + current_label is None) and join_label is None: + join_label = paddle.cast(current_label, + dtype='int64') + all_preds.append(all_pred) + if current_label is not None: + ref_embeddings[aug_idx].append(current_embedding) + prev_embedding[aug_idx] = current_embedding + + if frame_idx > 0: + all_preds = paddle.concat(all_preds, axis=0) + all_preds = paddle.mean( + all_preds, axis=0) #average results if augmentation + pred_label = paddle.argmax(all_preds, axis=0) + if join_label is not None: + join_label = paddle.squeeze(paddle.squeeze(join_label, + axis=0), + axis=0) + keep = paddle.cast((join_label == 0), dtype="int64") + pred_label = pred_label * keep + join_label * (1 - keep) + pred_label = pred_label + current_label = paddle.reshape( + pred_label, shape=[1, 1, ori_height, ori_width]) + flip_pred_label = self.flip_tensor(pred_label, 1) + flip_current_label = paddle.reshape( + flip_pred_label, shape=[1, 1, ori_height, ori_width]) + + for aug_idx in range(len(samples)): + if join_label is not None: + if samples[aug_idx]['meta']['flip']: + ref_masks[aug_idx].append(flip_current_label) + else: + ref_masks[aug_idx].append(current_label) + if samples[aug_idx]['meta']['flip']: + prev_mask[aug_idx] = flip_current_label + else: + prev_mask[ + aug_idx] = current_label #update prev_mask + + one_frametime = time.time() - time_start + seq_total_time += one_frametime + seq_total_frame += 1 + obj_num = float(obj_num) + logger.info('Frame: {}, Obj Num: {}, Time: {}'.format( + imgname[0], obj_num, one_frametime)) + self.save_mask( + pred_label, + os.path.join(self.result_root, seq_name, + imgname[0].split('.')[0] + '.png')) + else: + one_frametime = time.time() - time_start + seq_total_time += one_frametime + logger.info('Ref Frame: {}, Time: {}'.format( + imgname[0], one_frametime)) + + del (ref_embeddings) + del (ref_masks) + del (prev_embedding) + del (prev_mask) + del (seq_dataset) + del (seq_dataloader) + + seq_avg_time_per_frame = seq_total_time / seq_total_frame + self.total_time += seq_total_time + self.total_frame += seq_total_frame + total_avg_time_per_frame = self.total_time / self.total_frame + self.total_sfps += seq_avg_time_per_frame + avg_sfps = self.total_sfps / (batch_id + 1) + logger.info("Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}".format( + seq_name, 1. / seq_avg_time_per_frame, + 1. / total_avg_time_per_frame, 1. 
/ avg_sfps)) + + def flip_tensor(self, tensor, dim=0): + inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1), + dtype="int64") + tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim) + return tensor + + def save_mask(self, mask_tensor, path): + _palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, + 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, + 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, + 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, + 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, + 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, + 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, + 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44, 44, + 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, + 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, + 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, + 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, + 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, + 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, + 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, 83, 84, 84, + 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 90, + 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94, 95, 95, 95, + 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, 100, 100, 101, + 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, 105, + 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, + 110, 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, + 114, 114, 115, 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, + 118, 119, 119, 119, 120, 120, 120, 121, 121, 121, 122, 122, 122, + 123, 123, 123, 124, 124, 124, 125, 125, 125, 126, 126, 126, 127, + 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, 130, 131, 131, + 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, 135, 135, + 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140, + 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, + 144, 145, 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, + 149, 149, 149, 150, 150, 150, 151, 151, 151, 152, 152, 152, 153, + 153, 153, 154, 154, 154, 155, 155, 155, 156, 156, 156, 157, 157, + 157, 158, 158, 158, 159, 159, 159, 160, 160, 160, 161, 161, 161, + 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, 165, 165, 166, + 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, 170, + 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, + 175, 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, + 179, 179, 180, 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, + 183, 184, 184, 184, 185, 185, 185, 186, 186, 186, 187, 187, 187, + 188, 188, 188, 189, 189, 189, 190, 190, 190, 191, 191, 191, 192, + 192, 192, 193, 193, 193, 194, 194, 194, 195, 195, 195, 196, 196, + 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, 200, 200, + 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205, + 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, + 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, + 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, 217, 217, 218, + 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, 221, 222, 222, + 222, 223, 223, 223, 224, 224, 224, 225, 225, 225, 226, 226, 226, + 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, 230, 230, 231, + 
231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, 235, + 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, + 240, 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, + 244, 244, 245, 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, + 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252, + 253, 253, 253, 254, 254, 254, 255, 255, 255 + ] + mask = mask_tensor.cpu().numpy().astype('uint8') + mask = Image.fromarray(mask).convert('P') + mask.putpalette(_palette) + mask.save(path) + + def zip_folder(self, source_folder, zip_dir): + f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED) + pre_len = len(os.path.dirname(source_folder)) + for dirpath, dirnames, filenames in os.walk(source_folder): + for filename in filenames: + pathfile = os.path.join(dirpath, filename) + arcname = pathfile[pre_len:].strip(os.path.sep) + f.write(pathfile, arcname) + f.close() + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + self.zip_folder(self.result_root, self.zip_dir) + logger.info('Save result to {}.'.format(self.zip_dir)) diff --git a/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py b/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py new file mode 100644 index 000000000..bdbd6e0d0 --- /dev/null +++ b/docs/src/paddlevideo/metrics/youtube8m/average_precision_calculator.py @@ -0,0 +1,274 @@ +# Copyright 2020 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate or keep track of the interpolated average precision. + +It provides an interface for calculating interpolated average precision for an +entire list or the top-n ranked items. For the definition of the +(non-)interpolated average precision: +http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf + +Example usages: +1) Use it as a static function call to directly calculate average precision for +a short ranked list in the memory. + +``` +import random + +p = np.array([random.random() for _ in xrange(10)]) +a = np.array([random.choice([0, 1]) for _ in xrange(10)]) + +ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a) +``` + +2) Use it as an object for long ranked list that cannot be stored in memory or +the case where partial predictions can be observed at a time (Tensorflow +predictions). In this case, we first call the function accumulate many times +to process parts of the ranked list. After processing all the parts, we call +peek_interpolated_ap_at_n. 
+``` +p1 = np.array([random.random() for _ in xrange(5)]) +a1 = np.array([random.choice([0, 1]) for _ in xrange(5)]) +p2 = np.array([random.random() for _ in xrange(5)]) +a2 = np.array([random.choice([0, 1]) for _ in xrange(5)]) + +# interpolated average precision at 10 using 1000 break points +calculator = average_precision_calculator.AveragePrecisionCalculator(10) +calculator.accumulate(p1, a1) +calculator.accumulate(p2, a2) +ap3 = calculator.peek_ap_at_n() +``` +""" + +import heapq +import random +import numbers + +import numpy + + +class AveragePrecisionCalculator(object): + """Calculate the average precision and average precision at n.""" + def __init__(self, top_n=None): + """Construct an AveragePrecisionCalculator to calculate average precision. + + This class is used to calculate the average precision for a single label. + + Args: + top_n: A positive Integer specifying the average precision at n, or + None to use all provided data points. + + Raises: + ValueError: An error occurred when the top_n is not a positive integer. + """ + if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None): + raise ValueError("top_n must be a positive integer or None.") + + self._top_n = top_n # average precision at n + self._total_positives = 0 # total number of positives have seen + self._heap = [] # max heap of (prediction, actual) + + @property + def heap_size(self): + """Gets the heap size maintained in the class.""" + return len(self._heap) + + @property + def num_accumulated_positives(self): + """Gets the number of positive samples that have been accumulated.""" + return self._total_positives + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + After the function call, we may call peek_ap_at_n to actually calculate + the average precision. + Note predictions and actuals must have the same shape. + + Args: + predictions: a list storing the prediction scores. + actuals: a list storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives = If the 'predictions' and 'actuals' inputs aren't complete, + then it's possible some true positives were missed in them. In that case, + you can provide 'num_positives' in order to accurately track recall. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. + """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if not num_positives is None: + if not isinstance(num_positives, + numbers.Number) or num_positives < 0: + raise ValueError( + "'num_positives' was provided but it wan't a nonzero number." + ) + + if not num_positives is None: + self._total_positives += num_positives + else: + self._total_positives += numpy.size(numpy.where(actuals > 0)) + topk = self._top_n + heap = self._heap + + for i in range(numpy.size(predictions)): + if topk is None or len(heap) < topk: + heapq.heappush(heap, (predictions[i], actuals[i])) + else: + if predictions[i] > heap[0][0]: # heap[0] is the smallest + heapq.heappop(heap) + heapq.heappush(heap, (predictions[i], actuals[i])) + + def clear(self): + """Clear the accumulated predictions.""" + self._heap = [] + self._total_positives = 0 + + def peek_ap_at_n(self): + """Peek the non-interpolated average precision at n. + + Returns: + The non-interpolated average precision at n (default 0). 
+ If n is larger than the length of the ranked list, + the average precision will be returned. + """ + if self.heap_size <= 0: + return 0 + predlists = numpy.array(list(zip(*self._heap))) + + ap = self.ap_at_n(predlists[0], + predlists[1], + n=self._top_n, + total_num_positives=self._total_positives) + return ap + + @staticmethod + def ap(predictions, actuals): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. + """ + return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None) + + @staticmethod + def ap_at_n(predictions, actuals, n=20, total_num_positives=None): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + n: the top n items to be considered in ap@n. + total_num_positives : (optionally) you can specify the number of total + positive + in the list. If specified, it will be used in calculation. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when + 1) the format of the input is not the numpy 1-D array; + 2) the shape of predictions and actuals does not match; + 3) the input n is not a positive integer. + """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if n is not None: + if not isinstance(n, int) or n <= 0: + raise ValueError("n must be 'None' or a positive integer." + " It was '%s'." % n) + + ap = 0.0 + + predictions = numpy.array(predictions) + actuals = numpy.array(actuals) + + # add a shuffler to avoid overestimating the ap + predictions, actuals = AveragePrecisionCalculator._shuffle( + predictions, actuals) + sortidx = sorted(range(len(predictions)), + key=lambda k: predictions[k], + reverse=True) + + if total_num_positives is None: + numpos = numpy.size(numpy.where(actuals > 0)) + else: + numpos = total_num_positives + + if numpos == 0: + return 0 + + if n is not None: + numpos = min(numpos, n) + delta_recall = 1.0 / numpos + poscount = 0.0 + + # calculate the ap + r = len(sortidx) + if n is not None: + r = min(r, n) + for i in range(r): + if actuals[sortidx[i]] > 0: + poscount += 1 + ap += poscount / (i + 1) * delta_recall + return ap + + @staticmethod + def _shuffle(predictions, actuals): + random.seed(0) + suffidx = random.sample(range(len(predictions)), len(predictions)) + predictions = predictions[suffidx] + actuals = actuals[suffidx] + return predictions, actuals + + @staticmethod + def _zero_one_normalize(predictions, epsilon=1e-7): + """Normalize the predictions to the range between 0.0 and 1.0. + + For some predictions like SVM predictions, we need to normalize them before + calculate the interpolated average precision. 
The normalization will not + change the rank in the original list and thus won't change the average + precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + epsilon: a small constant to avoid denominator being zero. + + Returns: + The normalized prediction. + """ + denominator = numpy.max(predictions) - numpy.min(predictions) + ret = (predictions - numpy.min(predictions)) / numpy.max( + denominator, epsilon) + return ret diff --git a/docs/src/paddlevideo/metrics/youtube8m/eval_util.py b/docs/src/paddlevideo/metrics/youtube8m/eval_util.py new file mode 100644 index 000000000..724c72f60 --- /dev/null +++ b/docs/src/paddlevideo/metrics/youtube8m/eval_util.py @@ -0,0 +1,205 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provides functions to help with evaluating models.""" +import numpy as np +import paddle +from paddlevideo.utils import get_logger + +from ..base import BaseMetric +from ..registry import METRIC +from . import average_precision_calculator as ap_calculator +from . import mean_average_precision_calculator as map_calculator + +logger = get_logger("paddlevideo") + + +def flatten(l): + """ Merges a list of lists into a single list. """ + return [item for sublist in l for item in sublist] + + +def calculate_hit_at_one(predictions, actuals): + """ + Hit@k: indicates the fraction of test samples that contain at least + one of the ground truth labels in the top k predictions, + i.e topk. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average hit at one across the entire batch. + """ + top_prediction = np.argmax(predictions, 1) + hits = actuals[np.arange(actuals.shape[0]), top_prediction] + return np.mean(hits) + + +def calculate_precision_at_equal_recall_rate(predictions, actuals): + """ + PERR: measures the video-level annotation precision when we retrieve the same number + of entities per video as there are in the ground-truth. + More details please refer to: https://arxiv.org/abs/1609.08675 + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average precision at equal recall rate across the entire batch. 
+ """ + aggregated_precision = 0.0 + num_videos = actuals.shape[0] + for row in np.arange(num_videos): + num_labels = int(np.sum(actuals[row])) + top_indices = np.argpartition(predictions[row], + -num_labels)[-num_labels:] + item_precision = 0.0 + for label_index in top_indices: + if predictions[row][label_index] > 0: + item_precision += actuals[row][label_index] + item_precision /= top_indices.size + aggregated_precision += item_precision + aggregated_precision /= num_videos + return aggregated_precision + + +def calculate_gap(predictions, actuals, top_k=20): + """ + GAP: the global average precision. + + Only the top_k predictions are taken for each of the videos. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + top_k: How many predictions to use per video. + + Returns: + float: The global average precision. + """ + gap_calculator = ap_calculator.AveragePrecisionCalculator() + sparse_predictions, sparse_labels, num_positives = top_k_by_class( + predictions, actuals, top_k) + gap_calculator.accumulate(flatten(sparse_predictions), + flatten(sparse_labels), sum(num_positives)) + return gap_calculator.peek_ap_at_n() + + +def top_k_by_class(predictions, labels, k=20): + """Extracts the top k predictions for each video, sorted by class. + + Args: + predictions: A numpy matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + k: the top k non-zero entries to preserve in each prediction. + + Returns: + A tuple (predictions,labels, true_positives). 'predictions' and 'labels' + are lists of lists of floats. 'true_positives' is a list of scalars. The + length of the lists are equal to the number of classes. The entries in the + predictions variable are probability predictions, and + the corresponding entries in the labels variable are the ground truth for + those predictions. The entries in 'true_positives' are the number of true + positives for each class in the ground truth. + + Raises: + ValueError: An error occurred when the k is not a positive integer. + """ + if k <= 0: + raise ValueError("k must be a positive integer.") + k = min(k, predictions.shape[1]) + num_classes = predictions.shape[1] + prediction_triplets = [] + for video_index in range(predictions.shape[0]): + prediction_triplets.extend( + top_k_triplets(predictions[video_index], labels[video_index], k)) + out_predictions = [[] for v in range(num_classes)] + out_labels = [[] for v in range(num_classes)] + for triplet in prediction_triplets: + out_predictions[triplet[0]].append(triplet[1]) + out_labels[triplet[0]].append(triplet[2]) + out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)] + + return out_predictions, out_labels, out_true_positives + + +def top_k_triplets(predictions, labels, k=20): + """Get the top_k for a 1-d numpy array. 
Returns a sparse list of tuples in + (prediction, class) format""" + m = len(predictions) + k = min(k, m) + indices = np.argpartition(predictions, -k)[-k:] + return [(index, predictions[index], labels[index]) for index in indices] + + +@METRIC.register +class HitOneMetric(BaseMetric): + """A class to store the evaluation metrics.""" + def __init__(self, + num_class, + top_k, + data_size, + batch_size, + log_interval=20): + """Construct an HitOneMetric object to store the evaluation metrics.""" + self.hit_at_one = [] + self.perr = [] + self.gap = [] + super().__init__(data_size, batch_size, log_interval) + + def accumulate(self): + logger.info( + '[TEST] finished, hit_at_one = {:.5f}, perr = {:.5f}, gap = {:.5f}'. + format(np.mean(np.array(self.hit_at_one)), + np.mean(np.array(self.perr)), np.mean(np.array(self.gap)))) + + def clear(self): + """Clear the evaluation metrics and reset the HitOneMetric object.""" + self.hit_at_one = [] + self.perr = [] + self.gap = [] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + hit_at_one = paddle.to_tensor(outputs['hit_at_one']) + perr = paddle.to_tensor(outputs['perr']) + gap = paddle.to_tensor(outputs['gap']) + # NOTE(shipping): deal with multi cards validate + if self.world_size > 1: + hit_at_one = paddle.distributed.all_reduce( + hit_at_one, + op=paddle.distributed.ReduceOp.SUM) / self.world_size + perr = paddle.distributed.all_reduce( + perr, op=paddle.distributed.ReduceOp.SUM) / self.world_size + gap = paddle.distributed.all_reduce( + gap, op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.hit_at_one.append(hit_at_one.numpy()) + self.perr.append(perr.numpy()) + self.gap.append(gap.numpy()) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{}...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size), + )) diff --git a/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py b/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py new file mode 100644 index 000000000..0ae8b0ed3 --- /dev/null +++ b/docs/src/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py @@ -0,0 +1,114 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate the mean average precision. + +It provides an interface for calculating mean average precision +for an entire list or the top-n ranked items. + +Example usages: +We first call the function accumulate many times to process parts of the ranked +list. After processing all the parts, we call peek_map_at_n +to calculate the mean average precision. + +``` +import random + +p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)]) +a = np.array([[random.choice([0, 1]) for _ in xrange(50)] + for _ in xrange(1000)]) + +# mean average precision for 50 classes. 
+calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator( + num_class=50) +calculator.accumulate(p, a) +aps = calculator.peek_map_at_n() +``` +""" + +import numpy +from . import average_precision_calculator + + +class MeanAveragePrecisionCalculator(object): + """This class is to calculate mean average precision. + """ + + def __init__(self, num_class): + """Construct a calculator to calculate the (macro) average precision. + + Args: + num_class: A positive Integer specifying the number of classes. + top_n_array: A list of positive integers specifying the top n for each + class. The top n in each class will be used to calculate its average + precision at n. + The size of the array must be num_class. + + Raises: + ValueError: An error occurred when num_class is not a positive integer; + or the top_n_array is not a list of positive integers. + """ + if not isinstance(num_class, int) or num_class <= 1: + raise ValueError("num_class must be a positive integer.") + + self._ap_calculators = [] # member of AveragePrecisionCalculator + self._num_class = num_class # total number of classes + for i in range(num_class): + self._ap_calculators.append( + average_precision_calculator.AveragePrecisionCalculator()) + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + Args: + predictions: A list of lists storing the prediction scores. The outer + dimension corresponds to classes. + actuals: A list of lists storing the ground truth labels. The dimensions + should correspond to the predictions input. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives: If provided, it is a list of numbers representing the + number of true positives for each class. If not provided, the number of + true positives will be inferred from the 'actuals' array. + + Raises: + ValueError: An error occurred when the shape of predictions and actuals + does not match. + """ + if not num_positives: + num_positives = [None for i in predictions.shape[1]] + + calculators = self._ap_calculators + for i in range(len(predictions)): + calculators[i].accumulate(predictions[i], actuals[i], + num_positives[i]) + + def clear(self): + for calculator in self._ap_calculators: + calculator.clear() + + def is_empty(self): + return ([calculator.heap_size for calculator in self._ap_calculators] == + [0 for _ in range(self._num_class)]) + + def peek_map_at_n(self): + """Peek the non-interpolated mean average precision at n. + + Returns: + An array of non-interpolated average precision at n (default 0) for each + class. + """ + aps = [ + self._ap_calculators[i].peek_ap_at_n() + for i in range(self._num_class) + ] + return aps diff --git a/docs/src/paddlevideo/metrics/yowo_metric.py b/docs/src/paddlevideo/metrics/yowo_metric.py new file mode 100644 index 000000000..032df0c73 --- /dev/null +++ b/docs/src/paddlevideo/metrics/yowo_metric.py @@ -0,0 +1,82 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import os +from paddlevideo.utils import get_logger +from .registry import METRIC +from .base import BaseMetric +from .ucf24_utils import get_mAP + +logger = get_logger("paddlevideo") + + +@METRIC.register +class YOWOMetric(BaseMetric): + """ + Metrics for YOWO. Two Stages in this metric: + (1) Get test results using trained model, results will be saved in YOWOMetric.result_path; + (2) Calculate metrics using results file from stage (1). + """ + + def __init__(self, + data_size, + batch_size, + gt_folder, + result_path, + threshold=0.5, + save_path=None, + log_interval=1): + """ + Init for BMN metrics. + Params: + gtfolder:groundtruth folder path for ucf24 + """ + super().__init__(data_size, batch_size, log_interval) + self.result_path = result_path + self.gt_folder = gt_folder + self.threshold = threshold + self.save_path = save_path + + if not osp.isdir(self.result_path): + os.makedirs(self.result_path) + + def update(self, batch_id, data, outputs): + frame_idx = outputs['frame_idx'] + boxes = outputs["boxes"] + for j in range(len(frame_idx)): + detection_path = osp.join(self.result_path, frame_idx[j]) + with open(detection_path, 'w+') as f_detect: + for box in boxes[j]: + x1 = round(float(box[0] - box[2] / 2.0) * 320.0) + y1 = round(float(box[1] - box[3] / 2.0) * 240.0) + x2 = round(float(box[0] + box[2] / 2.0) * 320.0) + y2 = round(float(box[1] + box[3] / 2.0) * 240.0) + + det_conf = float(box[4]) + for j in range((len(box) - 5) // 2): + cls_conf = float(box[5 + 2 * j].item()) + prob = det_conf * cls_conf + f_detect.write( + str(int(box[6]) + 1) + ' ' + str(prob) + ' ' + str(x1) + ' ' + str(y1) + ' ' + str( + x2) + ' ' + str(y2) + '\n') + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + metric_list = get_mAP(self.gt_folder, self.result_path, self.threshold, self.save_path) + for info in metric_list: + logger.info(info) diff --git a/docs/src/paddlevideo/modeling/__init__.py b/docs/src/paddlevideo/modeling/__init__.py new file mode 100644 index 000000000..639bd3403 --- /dev/null +++ b/docs/src/paddlevideo/modeling/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
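For orientation, YOWOMetric.update above writes one plain-text file per frame into result_path, one detection per row in "class_id prob x1 y1 x2 y2" order with the box corners scaled to a 320x240 frame, and accumulate then hands gt_folder and result_path to get_mAP. A minimal reader for that per-frame format (a hypothetical helper sketch, not part of PaddleVideo) could look like:

```
# Hypothetical parser for the per-frame files written by YOWOMetric.update:
# each row is "<class_id> <prob> <x1> <y1> <x2> <y2>" in 320x240 pixel coords.
def read_detections(path):
    dets = []
    with open(path) as f:
        for line in f:
            cls_id, prob, x1, y1, x2, y2 = line.split()
            dets.append((int(cls_id), float(prob),
                         float(x1), float(y1), float(x2), float(y2)))
    return dets
```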
+ +from .assigners import MaxIoUAssignerAVA +from .backbones import ResNet +from .builder import (build_backbone, build_head, build_localizer, build_loss, + build_recognizer) +from .framework.detectors import BaseDetector, FastRCNN, TwoStageDetector +from .framework.recognizers import BaseRecognizer, Recognizer2D +from .heads import (AVARoIHead, BaseHead, BBoxHeadAVA, SingleRoIExtractor3D, + TSNHead) +from .losses import CrossEntropyLoss +from .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES, + PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) +from .samplers import RandomSampler +from .weight_init import kaiming_normal_, trunc_normal_, weight_init_ + +__all__ = [ + 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS', 'LOSSES', + 'build_recognizer', 'build_localizer', 'build_head', 'build_backbone', + 'build_loss', 'ResNet', 'TSNHead', 'BaseHead', 'BaseRecognizer', + 'Recognizer2d', 'CrossEntropyLoss', 'ROI_EXTRACTORS', + 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'MaxIoUAssignerAVA', + 'RandomSampler', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_', + 'weight_init_' +] diff --git a/docs/src/paddlevideo/modeling/assigners/__init__.py b/docs/src/paddlevideo/modeling/assigners/__init__.py new file mode 100644 index 000000000..a4570db2f --- /dev/null +++ b/docs/src/paddlevideo/modeling/assigners/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .max_iou_assigner_ava import MaxIoUAssignerAVA + +__all__ = ['MaxIoUAssignerAVA'] diff --git a/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py b/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py new file mode 100644 index 000000000..5cc72bf53 --- /dev/null +++ b/docs/src/paddlevideo/modeling/assigners/max_iou_assigner_ava.py @@ -0,0 +1,148 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np +from ..registry import BBOX_ASSIGNERS +from ..bbox_utils import bbox_overlaps + +class AssignResult(): + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. 
""" + self_inds = paddle.arange(1, len(gt_labels) + 1, dtype="int32") + gt_inds_squeeze = paddle.squeeze(self.gt_inds, axis=0) + self.gt_inds = paddle.concat([self_inds, gt_inds_squeeze]) + gt_label_ones = paddle.full((len(gt_labels), ), 1, dtype='float32') + max_overlaps_squeeze = paddle.squeeze(self.max_overlaps, axis=0) + self.max_overlaps = paddle.concat([gt_label_ones, max_overlaps_squeeze]) + if self.labels is not None: + self.labels = paddle.concat([gt_labels, self.labels]) + +@BBOX_ASSIGNERS.register() +class MaxIoUAssignerAVA(): + """Assign a corresponding gt bbox or background to each bbox. """ + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + ignore_wrt_candidates=True, + match_low_quality=True, + gpu_assign_thr=-1, + iou_calculator=dict(type='BboxOverlaps2D')): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + + def assign(self, + bboxes, + gt_bboxes, + gt_labels=None): + """Assign gt to bboxes. """ + overlaps = bbox_overlaps(gt_bboxes, bboxes) + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + def assign_wrt_overlaps(self, overlaps, gt_labels=None): + """Assign w.r.t. the overlaps of bboxes with gts. """ + num_gts, num_bboxes = overlaps.shape[0], overlaps.shape[1] + # 1. assign -1 + assigned_gt_inds = paddle.full((num_bboxes, ), -1, dtype='int32') + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = paddle.topk(overlaps, k=1, axis=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = paddle.topk(overlaps, k=1, axis=1) + + # 2. assign negative: below the negative inds are set to be 0 + match_labels = paddle.full(argmax_overlaps.shape, -1, dtype='int32') + match_labels = paddle.where(max_overlaps < self.neg_iou_thr, + paddle.zeros_like(match_labels), match_labels) + + # 3. assign positive: above positive IoU threshold + argmax_overlaps_int32 = paddle.cast(argmax_overlaps, 'int32') + match_labels = paddle.where(max_overlaps >= self.pos_iou_thr, + argmax_overlaps_int32 + 1, match_labels) + assigned_gt_inds = match_labels + if self.match_low_quality: + # Low-quality matching will overwirte the assigned_gt_inds + # assigned in Step 3. Thus, the assigned gt might not be the + # best one for prediction. + # For example, if bbox A has 0.9 and 0.8 iou with GT bbox + # 1 & 2, bbox 1 will be assigned as the best target for bbox A + # in step 3. However, if GT bbox 2's gt_argmax_overlaps = A, + # bbox A's assigned_gt_inds will be overwritten to be bbox B. + # This might be the reason that it is not used in ROI Heads. 
+ for i in range(num_gts): + if gt_max_overlaps.numpy()[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + equal_x_np = overlaps[i, :].numpy() + equal_y_np = gt_max_overlaps[i].numpy() + max_iou_inds = np.equal(equal_x_np, equal_y_np) + max_iou_inds = paddle.to_tensor(max_iou_inds) + max_iou_inds = paddle.reshape( max_iou_inds, [1,max_iou_inds.shape[0]] ) + match_labels_gts = paddle.full(max_iou_inds.shape, i+1, dtype='int32') + match_labels = paddle.where(max_iou_inds, match_labels_gts, match_labels) + assigned_gt_inds = match_labels + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + if gt_labels is not None: + # consider multi-class case (AVA) + assert len(gt_labels[0]) > 1 + assigned_labels = paddle.full([num_bboxes, len(gt_labels[0])], 0, dtype='float32') + assigned_gt_inds_reshape = assigned_gt_inds.reshape([assigned_gt_inds.shape[1]]) + pos_inds = paddle.nonzero( assigned_gt_inds_reshape , as_tuple=False) + pos_inds_num = float(paddle.numel(pos_inds)) + if pos_inds_num > 0: + pos_inds = paddle.squeeze(pos_inds, axis = 1 ) + assigned_gt_inds_squeeze = paddle.squeeze(assigned_gt_inds, axis=0) + assigned_gt_inds_select = paddle.index_select(assigned_gt_inds_squeeze, pos_inds) - 1 + gt_labels_select = paddle.index_select(gt_labels, assigned_gt_inds_select) + A = assigned_gt_inds_squeeze + X = assigned_gt_inds_squeeze - 1 + Y = paddle.zeros_like(X) + if A.shape[0]==1: + if float(A) > 0: + T=X + else: + T=Y + else: + T = paddle.where(A>0, X, Y) + S = paddle.index_select(gt_labels, T) + AE = paddle.expand(A, [S.shape[1], A.shape[0]]) + AET = paddle.transpose(AE, perm=[1, 0]) + R = paddle.where(AET>0, S, assigned_labels) + assigned_labels = R + else: + assigned_labels = None + ret = AssignResult( + num_gts, + assigned_gt_inds, + max_overlaps, + labels=assigned_labels) + return ret diff --git a/docs/src/paddlevideo/modeling/backbones/__init__.py b/docs/src/paddlevideo/modeling/backbones/__init__.py new file mode 100644 index 000000000..a88cedca0 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/__init__.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
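The assigner above follows the usual max-IoU recipe: every proposal starts as ignored (-1), proposals whose best IoU falls below neg_iou_thr become background (0), and proposals at or above pos_iou_thr take the index of their best-matching gt plus one. As a rough illustration of those three steps only (leaving out low-quality matching and the AVA multi-label handling, and using made-up thresholds), a NumPy restatement:

```
# Toy restatement of steps 1-3 of assign_wrt_overlaps, for intuition only.
import numpy as np

def assign_by_iou(overlaps, pos_iou_thr=0.5, neg_iou_thr=0.4):
    """overlaps: [num_gts, num_bboxes] IoU matrix."""
    max_overlaps = overlaps.max(axis=0)        # best IoU for each proposal
    argmax_overlaps = overlaps.argmax(axis=0)  # which gt gives that IoU
    assigned = np.full(overlaps.shape[1], -1, dtype=np.int64)  # 1. ignore
    assigned[max_overlaps < neg_iou_thr] = 0                   # 2. negative
    pos = max_overlaps >= pos_iou_thr
    assigned[pos] = argmax_overlaps[pos] + 1                   # 3. positive
    return assigned

print(assign_by_iou(np.array([[0.9, 0.3, 0.1],
                              [0.2, 0.6, 0.05]])))  # -> [1 2 0]
```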
+ +from .actbert import BertForMultiModalPreTraining +from .adds import ADDS_DepthNet +from .agcn import AGCN +from .asrf import ASRF +from .bmn import BMN +from .cfbi import CFBI +from .movinet import MoViNet +from .ms_tcn import MSTCN +from .resnet import ResNet +from .resnet_slowfast import ResNetSlowFast +from .resnet_slowfast_MRI import ResNetSlowFast_MRI +from .resnet_tsm import ResNetTSM +from .resnet_tsm_MRI import ResNetTSM_MRI +from .resnet_tsn_MRI import ResNetTSN_MRI +from .resnet_tweaks_tsm import ResNetTweaksTSM +from .resnet_tweaks_tsn import ResNetTweaksTSN +from .stgcn import STGCN +from .swin_transformer import SwinTransformer3D +from .transnetv2 import TransNetV2 +from .vit import VisionTransformer +from .vit_tweaks import VisionTransformer_tweaks +from .ms_tcn import MSTCN +from .asrf import ASRF +from .resnet_tsn_MRI import ResNetTSN_MRI +from .resnet_tsm_MRI import ResNetTSM_MRI +from .resnet_slowfast_MRI import ResNetSlowFast_MRI +from .cfbi import CFBI +from .ctrgcn import CTRGCN +from .agcn2s import AGCN2s +from .movinet import MoViNet +from .resnet3d_slowonly import ResNet3dSlowOnly +from .toshift_vit import TokenShiftVisionTransformer +from .pptsm_mv2 import PPTSM_MobileNetV2 +from .pptsm_mv3 import PPTSM_MobileNetV3 +from .pptsm_v2 import PPTSM_v2 +from .yowo import YOWO + +__all__ = [ + 'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN', + 'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2', + 'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining', + 'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN', + 'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN', + 'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2', + 'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO' +] diff --git a/docs/src/paddlevideo/modeling/backbones/actbert.py b/docs/src/paddlevideo/modeling/backbones/actbert.py new file mode 100644 index 000000000..dbee1fd8c --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/actbert.py @@ -0,0 +1,1158 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import numpy as np +import math +import copy + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout) +from paddle.nn.initializer import Constant, Normal +from ...utils.save_load import load_ckpt +from ..registry import BACKBONES +from ..weight_init import weight_init_ + +ACT2FN = {"gelu": F.gelu, "relu": F.relu, "swish": F.swish} + + +class BertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + def __init__(self, vocab_size, max_position_embeddings, type_vocab_size, + hidden_size, hidden_dropout_prob): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, + hidden_size, + padding_idx=0) + self.position_embeddings = nn.Embedding(max_position_embeddings, + hidden_size) + self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) + + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.shape[1] + position_ids = paddle.arange(end=seq_length, dtype="int64") + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) #8,36 -> 8,36,768 + position_embeddings = self.position_embeddings( + position_ids) #8,36 -> 8,36,768 + token_type_embeddings = self.token_type_embeddings( + token_type_ids) #8,36 -> 8,36,768 + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertImageEmbeddings(nn.Layer): + def __init__(self, v_feature_size, v_hidden_size, v_hidden_dropout_prob): + super(BertImageEmbeddings, self).__init__() + self.image_embeddings = nn.Linear(v_feature_size, v_hidden_size) + self.image_location_embeddings = nn.Linear(5, v_hidden_size) + self.LayerNorm = nn.LayerNorm(v_hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(v_hidden_dropout_prob) + + def forward(self, input_ids, input_loc): + img_embeddings = self.image_embeddings( + input_ids) #8,37,2048 -> 8,37,1024 + loc_embeddings = self.image_location_embeddings( + input_loc) #8,37,5 -> 8,37,1024 + embeddings = self.LayerNorm(img_embeddings + loc_embeddings) + embeddings = self.dropout(embeddings) + return embeddings # shape: bs*seq_len*hs + + +class BertActionEmbeddings(nn.Layer): + def __init__(self, a_feature_size, a_hidden_size, a_hidden_dropout_prob): + super(BertActionEmbeddings, self).__init__() + self.action_embeddings = nn.Linear(a_feature_size, a_hidden_size) + self.LayerNorm = nn.LayerNorm(a_hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(a_hidden_dropout_prob) + + def forward(self, input_ids): + action_embeddings = self.action_embeddings( + input_ids) #8,5,2048 -> 8,5,768 + embeddings = self.LayerNorm(action_embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Layer): + def __init__(self, hidden_size, num_attention_heads, + attention_probs_dropout_prob): + super(BertSelfAttention, self).__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(hidden_size / num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(hidden_size, self.all_head_size) + self.key = nn.Linear(hidden_size, self.all_head_size) + self.value = nn.Linear(hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose((0, 2, 1, 3)) + + def 
forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, + key_layer.transpose((0, 1, 3, 2))) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +class BertSelfOutput(nn.Layer): + def __init__(self, hidden_size, hidden_dropout_prob): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Layer): + def __init__(self, hidden_size, hidden_dropout_prob, num_attention_heads, + attention_probs_dropout_prob): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(hidden_size, num_attention_heads, + attention_probs_dropout_prob) + self.output = BertSelfOutput(hidden_size, hidden_dropout_prob) + + def forward(self, input_tensor, attention_mask): + self_output, attention_probs = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output, attention_probs + + +class BertIntermediate(nn.Layer): + def __init__(self, hidden_size, intermediate_size, hidden_act): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(hidden_size, intermediate_size) + if isinstance(hidden_act, str) or (sys.version_info[0] == 2 + and isinstance(hidden_act, str)): + self.intermediate_act_fn = ACT2FN[hidden_act] + else: + self.intermediate_act_fn = hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Layer): + def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob): + super(BertOutput, self).__init__() + self.dense = nn.Linear(intermediate_size, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + 
input_tensor) + return hidden_states + + +class BertEntAttention(nn.Layer): + """Core mudule of tangled transformer. + """ + def __init__( + self, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + bi_num_attention_heads, + ): + super(BertEntAttention, self).__init__() + if bi_hidden_size % bi_num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (bi_hidden_size, bi_num_attention_heads)) + + self.num_attention_heads = bi_num_attention_heads + self.attention_head_size = int(bi_hidden_size / bi_num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + # self attention layers for vision input + self.query1 = nn.Linear(v_hidden_size, self.all_head_size) + self.key1 = nn.Linear(v_hidden_size, self.all_head_size) + self.value1 = nn.Linear(v_hidden_size, self.all_head_size) + self.dropout1 = nn.Dropout(v_attention_probs_dropout_prob) + + # self attention layers for text input + self.query2 = nn.Linear(hidden_size, self.all_head_size) + self.key2 = nn.Linear(hidden_size, self.all_head_size) + self.value2 = nn.Linear(hidden_size, self.all_head_size) + self.dropout2 = nn.Dropout(attention_probs_dropout_prob) + + # self attention layers for action input + self.query3 = nn.Linear(a_hidden_size, self.all_head_size) + self.key3 = nn.Linear(a_hidden_size, self.all_head_size) + self.value3 = nn.Linear(a_hidden_size, self.all_head_size) + self.dropout3 = nn.Dropout(a_attention_probs_dropout_prob) + + # self attention layers for action_text + self.key_at = nn.Linear(bi_hidden_size, self.all_head_size) + self.value_at = nn.Linear(bi_hidden_size, self.all_head_size) + self.dropout_at = nn.Dropout(av_attention_probs_dropout_prob) + + # self attention layers for action_vision + self.key_av = nn.Linear(bi_hidden_size, self.all_head_size) + self.value_av = nn.Linear(bi_hidden_size, self.all_head_size) + self.dropout_av = nn.Dropout(at_attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose((0, 2, 1, 3)) + + def forward( + self, + input_tensor1, + attention_mask1, + input_tensor2, + attention_mask2, + input_tensor3, + attention_mask3, + ): + + # for vision input. 
+ mixed_query_layer1 = self.query1(input_tensor1) + mixed_key_layer1 = self.key1(input_tensor1) + mixed_value_layer1 = self.value1(input_tensor1) + + query_layer1 = self.transpose_for_scores(mixed_query_layer1) + key_layer1 = self.transpose_for_scores(mixed_key_layer1) + value_layer1 = self.transpose_for_scores(mixed_value_layer1) + + # for text input: + mixed_query_layer2 = self.query2(input_tensor2) + mixed_key_layer2 = self.key2(input_tensor2) + mixed_value_layer2 = self.value2(input_tensor2) + + query_layer2 = self.transpose_for_scores(mixed_query_layer2) + key_layer2 = self.transpose_for_scores(mixed_key_layer2) + value_layer2 = self.transpose_for_scores(mixed_value_layer2) + + # for action input: + mixed_query_layer3 = self.query3(input_tensor3) + mixed_key_layer3 = self.key3(input_tensor3) + mixed_value_layer3 = self.value3(input_tensor3) + + query_layer3 = self.transpose_for_scores(mixed_query_layer3) + key_layer3 = self.transpose_for_scores(mixed_key_layer3) + value_layer3 = self.transpose_for_scores(mixed_value_layer3) + + def do_attention(query_layer, key_layer, value_layer, attention_mask, + dropout): + """ compute attention """ + attention_scores = paddle.matmul(query_layer, + key_layer.transpose((0, 1, 3, 2))) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size + ] + context_layer = context_layer.reshape(new_context_layer_shape) + return context_layer + + context_av = do_attention(query_layer3, key_layer1, value_layer1, + attention_mask1, self.dropout_av) + context_at = do_attention(query_layer3, key_layer2, value_layer2, + attention_mask2, self.dropout_at) + + context_key_av = self.key_av(context_av).transpose((0, 2, 1)) + # interpolate only support 4-D tensor now. 
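+        # The summaries computed above have the action sequence length, so they
+        # are unsqueezed to 4-D, resized along the sequence axis to the target
+        # modality's length, squeezed back to 3-D, and then added into that
+        # modality's keys/values below.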
+ context_key_av = F.interpolate(context_key_av.unsqueeze(-1), + size=(key_layer2.shape[2], + 1)).squeeze(-1) + context_key_av = self.transpose_for_scores( + context_key_av.transpose((0, 2, 1))) + key_layer2 = key_layer2 + context_key_av + + context_key_at = self.key_at(context_at).transpose((0, 2, 1)) + context_key_at = F.interpolate(context_key_at.unsqueeze(-1), + size=(key_layer1.shape[2], + 1)).squeeze(-1) + context_key_at = self.transpose_for_scores( + context_key_at.transpose((0, 2, 1))) + key_layer1 = key_layer1 + context_key_at + + context_val_av = self.value_at(context_av).transpose((0, 2, 1)) + context_val_av = F.interpolate(context_val_av.unsqueeze(-1), + size=(value_layer2.shape[2], + 1)).squeeze(-1) + context_val_av = self.transpose_for_scores( + context_val_av.transpose((0, 2, 1))) + value_layer2 = value_layer2 + context_val_av + + context_val_at = self.value_at(context_at).transpose((0, 2, 1)) + context_val_at = F.interpolate(context_val_at.unsqueeze(-1), + size=(value_layer1.shape[2], + 1)).squeeze(-1) + context_val_at = self.transpose_for_scores( + context_val_at.transpose((0, 2, 1))) + value_layer1 = value_layer1 + context_val_at + + context_layer1 = do_attention(query_layer1, key_layer1, value_layer1, + attention_mask1, self.dropout1) + context_layer2 = do_attention(query_layer2, key_layer2, value_layer2, + attention_mask2, self.dropout2) + context_layer3 = do_attention(query_layer3, key_layer3, value_layer3, + attention_mask3, self.dropout3) + + return context_layer1, context_layer2, context_layer3 # vision, text, action + + +class BertEntOutput(nn.Layer): + def __init__( + self, + bi_hidden_size, + hidden_size, + v_hidden_size, + v_hidden_dropout_prob, + hidden_dropout_prob, + ): + super(BertEntOutput, self).__init__() + + self.dense1 = nn.Linear(bi_hidden_size, v_hidden_size) + self.LayerNorm1 = nn.LayerNorm(v_hidden_size, epsilon=1e-12) + self.dropout1 = nn.Dropout(v_hidden_dropout_prob) + + self.dense2 = nn.Linear(bi_hidden_size, hidden_size) + self.LayerNorm2 = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout2 = nn.Dropout(hidden_dropout_prob) + + self.dense3 = nn.Linear(bi_hidden_size, hidden_size) + self.LayerNorm3 = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout3 = nn.Dropout(hidden_dropout_prob) + + def forward( + self, + hidden_states1, + input_tensor1, + hidden_states2, + input_tensor2, + hidden_states3, + input_tensor3, + ): + context_state1 = self.dense1(hidden_states1) + context_state1 = self.dropout1(context_state1) + + context_state2 = self.dense2(hidden_states2) + context_state2 = self.dropout2(context_state2) + + context_state3 = self.dense3(hidden_states3) + context_state3 = self.dropout3(context_state3) + + hidden_states1 = self.LayerNorm1(context_state1 + input_tensor1) + hidden_states2 = self.LayerNorm2(context_state2 + input_tensor2) + hidden_states3 = self.LayerNorm3(context_state3 + input_tensor3) + + return hidden_states1, hidden_states2, hidden_states3 + + +class BertLayer(nn.Layer): + def __init__(self, hidden_size, intermediate_size, hidden_act, + hidden_dropout_prob, num_attention_heads, + attention_probs_dropout_prob): + super(BertLayer, self).__init__() + self.attention = BertAttention(hidden_size, hidden_dropout_prob, + num_attention_heads, + attention_probs_dropout_prob) + self.intermediate = BertIntermediate(hidden_size, intermediate_size, + hidden_act) + self.output = BertOutput(intermediate_size, hidden_size, + hidden_dropout_prob) + + def forward(self, hidden_states, attention_mask): + attention_output, 
attention_probs = self.attention( + hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output, attention_probs + + +class BertConnectionLayer(nn.Layer): + def __init__(self, hidden_size, v_hidden_size, a_hidden_size, + bi_hidden_size, bi_num_attention_heads, + attention_probs_dropout_prob, v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, intermediate_size, + v_intermediate_size, a_intermediate_size, hidden_act, + v_hidden_act, a_hidden_act, hidden_dropout_prob, + v_hidden_dropout_prob, a_hidden_dropout_prob): + super(BertConnectionLayer, self).__init__() + self.ent_attention = BertEntAttention( + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + bi_num_attention_heads, + ) + + self.ent_output = BertEntOutput( + bi_hidden_size, + hidden_size, + v_hidden_size, + v_hidden_dropout_prob, + hidden_dropout_prob, + ) + + self.v_intermediate = BertIntermediate(v_hidden_size, + v_intermediate_size, + v_hidden_act) + self.v_output = BertOutput(v_intermediate_size, v_hidden_size, + v_hidden_dropout_prob) + + self.t_intermediate = BertIntermediate(hidden_size, intermediate_size, + hidden_act) + self.t_output = BertOutput(intermediate_size, hidden_size, + hidden_dropout_prob) + + self.a_intermediate = BertIntermediate(a_hidden_size, + a_intermediate_size, + a_hidden_act) + self.a_output = BertOutput(a_intermediate_size, a_hidden_size, + a_hidden_dropout_prob) + + def forward( + self, + input_tensor1, + attention_mask1, + input_tensor2, + attention_mask2, + input_tensor3, + attention_mask3, + ): + + ent_output1, ent_output2, ent_output3 = self.ent_attention( + input_tensor1, attention_mask1, input_tensor2, attention_mask2, + input_tensor3, attention_mask3) + + attention_output1, attention_output2, attention_output3 = self.ent_output( + ent_output1, input_tensor1, ent_output2, input_tensor2, ent_output3, + input_tensor3) + + intermediate_output1 = self.v_intermediate(attention_output1) + layer_output1 = self.v_output(intermediate_output1, attention_output1) + + intermediate_output2 = self.t_intermediate(attention_output2) + layer_output2 = self.t_output(intermediate_output2, attention_output2) + + intermediate_output3 = self.a_intermediate(attention_output3) + layer_output3 = self.a_output(intermediate_output3, attention_output3) + + return layer_output1, layer_output2, layer_output3 + + +class BertEncoder(nn.Layer): + """ + ActBert Encoder, consists 3 pathway of multi-BertLayers and BertConnectionLayer. 
+ """ + def __init__( + self, + v_ent_attention_id, + t_ent_attention_id, + a_ent_attention_id, + fixed_t_layer, + fixed_v_layer, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + intermediate_size, + v_intermediate_size, + a_intermediate_size, + hidden_act, + v_hidden_act, + a_hidden_act, + hidden_dropout_prob, + v_hidden_dropout_prob, + a_hidden_dropout_prob, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + num_attention_heads, + v_num_attention_heads, + a_num_attention_heads, + bi_num_attention_heads, + num_hidden_layers, + v_num_hidden_layers, + a_num_hidden_layers, + ): + super(BertEncoder, self).__init__() + self.v_ent_attention_id = v_ent_attention_id + self.t_ent_attention_id = t_ent_attention_id + self.a_ent_attention_id = a_ent_attention_id + self.fixed_t_layer = fixed_t_layer + self.fixed_v_layer = fixed_v_layer + + layer = BertLayer(hidden_size, intermediate_size, hidden_act, + hidden_dropout_prob, num_attention_heads, + attention_probs_dropout_prob) + v_layer = BertLayer(v_hidden_size, v_intermediate_size, v_hidden_act, + v_hidden_dropout_prob, v_num_attention_heads, + v_attention_probs_dropout_prob) + a_layer = BertLayer(a_hidden_size, a_intermediate_size, a_hidden_act, + a_hidden_dropout_prob, a_num_attention_heads, + a_attention_probs_dropout_prob) + connect_layer = BertConnectionLayer( + hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size, + bi_num_attention_heads, attention_probs_dropout_prob, + v_attention_probs_dropout_prob, a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, at_attention_probs_dropout_prob, + intermediate_size, v_intermediate_size, a_intermediate_size, + hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob, + v_hidden_dropout_prob, a_hidden_dropout_prob) + + self.layer = nn.LayerList( + [copy.deepcopy(layer) for _ in range(num_hidden_layers)]) #12 + self.v_layer = nn.LayerList( + [copy.deepcopy(v_layer) for _ in range(v_num_hidden_layers)]) #2 + self.a_layer = nn.LayerList( + [copy.deepcopy(a_layer) for _ in range(a_num_hidden_layers)]) #3 + self.c_layer = nn.LayerList([ + copy.deepcopy(connect_layer) for _ in range(len(v_ent_attention_id)) + ] #2 [0,1] + ) + + def forward( + self, + txt_embedding, + image_embedding, + action_embedding, + txt_attention_mask, + image_attention_mask, + action_attention_mask, + output_all_encoded_layers=True, + ): + v_start, a_start, t_start = 0, 0, 0 + count = 0 + all_encoder_layers_t = [] + all_encoder_layers_v = [] + all_encoder_layers_a = [] + + for v_layer_id, a_layer_id, t_layer_id in zip(self.v_ent_attention_id, + self.a_ent_attention_id, + self.t_ent_attention_id): + v_end = v_layer_id + a_end = a_layer_id + t_end = t_layer_id + + assert self.fixed_t_layer <= t_end + assert self.fixed_v_layer <= v_end + + ### region embedding + for idx in range(v_start, + self.fixed_v_layer): #两次训练,这个循环都没有进去 #前面的层固定住 + with paddle.no_grad(): + image_embedding, image_attention_probs = self.v_layer[idx]( + image_embedding, image_attention_mask) + v_start = self.fixed_v_layer + for idx in range(v_start, v_end): + image_embedding, image_attention_probs = self.v_layer[idx]( + image_embedding, image_attention_mask) + + ### action embedding + for idx in range(a_start, a_end): + action_embedding, action_attention_probs = self.a_layer[idx]( + action_embedding, action_attention_mask) + + ### text embedding + for idx in range(t_start, self.fixed_t_layer): + with 
paddle.no_grad(): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + t_start = self.fixed_t_layer + for idx in range(t_start, t_end): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + + image_embedding, txt_embedding, action_embedding = self.c_layer[ + count](image_embedding, image_attention_mask, txt_embedding, + txt_attention_mask, action_embedding, + action_attention_mask) + + v_start = v_end + t_start = t_end + a_start = a_end + count += 1 + + if output_all_encoded_layers: + all_encoder_layers_t.append(txt_embedding) + all_encoder_layers_v.append(image_embedding) + all_encoder_layers_a.append(action_embedding) + + for idx in range(v_start, len(self.v_layer)): # 1 + image_embedding, image_attention_probs = self.v_layer[idx]( + image_embedding, image_attention_mask) + + for idx in range(a_start, len(self.a_layer)): + action_embedding, action_attention_probs = self.a_layer[idx]( + action_embedding, action_attention_mask) + + for idx in range(t_start, len(self.layer)): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + + # add the end part to finish. + if not output_all_encoded_layers: + all_encoder_layers_t.append(txt_embedding) #8, 36, 768 + all_encoder_layers_v.append(image_embedding) #8, 37, 1024 + all_encoder_layers_a.append(action_embedding) #8, 5, 768 + + return all_encoder_layers_t, all_encoder_layers_v, all_encoder_layers_a + + +class BertPooler(nn.Layer): + """ "Pool" the model by simply taking the hidden state corresponding + to the first token. + """ + def __init__(self, hidden_size, bi_hidden_size): + super(BertPooler, self).__init__() + self.dense = nn.Linear(hidden_size, bi_hidden_size) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + first_token_tensor = hidden_states[:, 0] #8, 768 + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertModel(nn.Layer): + def __init__( + self, + vocab_size, + max_position_embeddings, + type_vocab_size, + v_feature_size, + a_feature_size, + num_hidden_layers, + v_num_hidden_layers, + a_num_hidden_layers, + v_ent_attention_id, + t_ent_attention_id, + a_ent_attention_id, + fixed_t_layer, + fixed_v_layer, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + intermediate_size, + v_intermediate_size, + a_intermediate_size, + hidden_act, + v_hidden_act, + a_hidden_act, + hidden_dropout_prob, + v_hidden_dropout_prob, + a_hidden_dropout_prob, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + num_attention_heads, + v_num_attention_heads, + a_num_attention_heads, + bi_num_attention_heads, + ): + super(BertModel, self).__init__() + # initilize word embedding + self.embeddings = BertEmbeddings(vocab_size, max_position_embeddings, + type_vocab_size, hidden_size, + hidden_dropout_prob) + # initlize the region embedding + self.v_embeddings = BertImageEmbeddings(v_feature_size, v_hidden_size, + v_hidden_dropout_prob) + # initlize the action embedding + self.a_embeddings = BertActionEmbeddings(a_feature_size, a_hidden_size, + a_hidden_dropout_prob) + + self.encoder = BertEncoder( + v_ent_attention_id, t_ent_attention_id, a_ent_attention_id, + fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size, + a_hidden_size, bi_hidden_size, intermediate_size, + v_intermediate_size, a_intermediate_size, 
hidden_act, v_hidden_act, + a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob, + a_hidden_dropout_prob, attention_probs_dropout_prob, + v_attention_probs_dropout_prob, a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, at_attention_probs_dropout_prob, + num_attention_heads, v_num_attention_heads, a_num_attention_heads, + bi_num_attention_heads, num_hidden_layers, v_num_hidden_layers, + a_num_hidden_layers) + + self.t_pooler = BertPooler(hidden_size, bi_hidden_size) + self.v_pooler = BertPooler(v_hidden_size, bi_hidden_size) + self.a_pooler = BertPooler(a_hidden_size, bi_hidden_size) + + def forward( + self, + text_ids, + action_feat, + image_feat, + image_loc, + token_type_ids=None, + text_mask=None, + image_mask=None, + action_mask=None, + output_all_encoded_layers=False, + ): + """ + text_ids: input text ids. Shape: [batch_size, seqence_length] + action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim] + image_feat: input image feature. Shape: [batch_size, region_length, image_feature_dim]] + image_loc: input region location. Shape: [batch_size, region_length, region_location_dim] + token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length] + text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length] + image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length] + action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length] + output_all_encoded_layers: is output encoded layers feature or not. Type: Bool. + """ + if text_mask is None: + text_mask = paddle.ones_like(text_ids) + if token_type_ids is None: + token_type_ids = paddle.zeros_like(text_ids) + if image_mask is None: + image_mask = paddle.ones(image_feat.shape[0], + image_feat.shape[1]).astype(text_ids.dtype) + if action_mask is None: + action_mask = paddle.ones(action_feat.shape[0], + action_feat.shape[1]).astype( + text_ids.dtype) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]. + extended_text_mask = text_mask.unsqueeze(1).unsqueeze(2) + extended_image_mask = image_mask.unsqueeze(1).unsqueeze(2) + extended_action_mask = action_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
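+        # e.g. a mask entry of 1 contributes 0.0 to the attention logits while
+        # an entry of 0 contributes -10000.0, so the softmax assigns padded
+        # positions a weight of effectively zero.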
+ def set_mask(extended_attention_mask): + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + extended_text_mask = set_mask(extended_text_mask) + extended_image_mask = set_mask(extended_image_mask) + extended_action_mask = set_mask(extended_action_mask) + + t_embedding_output = self.embeddings(text_ids, token_type_ids) + v_embedding_output = self.v_embeddings(image_feat, image_loc) + a_embedding_output = self.a_embeddings(action_feat) + + # var = [t_embedding_output, v_embedding_output, a_embedding_output] + # import numpy as np + # for i, item in enumerate(var): + # np.save('tmp/' + str(i)+'.npy', item.numpy()) + + encoded_layers_t, encoded_layers_v, encoded_layers_a = self.encoder( + t_embedding_output, + v_embedding_output, + a_embedding_output, + extended_text_mask, + extended_image_mask, + extended_action_mask, + output_all_encoded_layers=output_all_encoded_layers, + ) + + sequence_output_t = encoded_layers_t[-1] #get item from list + sequence_output_v = encoded_layers_v[-1] + sequence_output_a = encoded_layers_a[-1] + + pooled_output_t = self.t_pooler(sequence_output_t) + pooled_output_v = self.v_pooler(sequence_output_v) + pooled_output_a = self.a_pooler(sequence_output_a) + + if not output_all_encoded_layers: + encoded_layers_t = encoded_layers_t[-1] + encoded_layers_v = encoded_layers_v[-1] + encoded_layers_a = encoded_layers_a[-1] + + return encoded_layers_t, encoded_layers_v, encoded_layers_a, \ + pooled_output_t, pooled_output_v, pooled_output_a + + +# For Head +class BertPredictionHeadTransform(nn.Layer): + def __init__(self, hidden_size, hidden_act): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + if isinstance(hidden_act, str) or (sys.version_info[0] == 2 + and isinstance(hidden_act, str)): + self.transform_act_fn = ACT2FN[hidden_act] + else: + self.transform_act_fn = hidden_act + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Layer): + def __init__(self, hidden_size, hidden_act, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(hidden_size, hidden_act) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
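+        # That is, the decoder reuses (ties) the word-embedding matrix of shape
+        # [vocab_size, hidden_size] as the output projection and adds a separate
+        # learnable bias of shape [vocab_size]; forward() applies it via matmul
+        # with transpose_y=True.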
+ assert bert_model_embedding_weights.shape[1] == hidden_size + vocab_size = bert_model_embedding_weights.shape[0] + + # another implementation which would create another big params: + # self.decoder = nn.Linear(hidden_size, vocab_size) # NOTE bias default: constant 0.0 + # self.decoder.weight = self.create_parameter(shape=[hidden_size, vocab_size], + # default_initializer=nn.initializer.Assign( + # bert_model_embedding_weights.t())) # transpose + + self.decoder_weight = bert_model_embedding_weights + self.decoder_bias = self.create_parameter( + shape=[vocab_size], + dtype=bert_model_embedding_weights.dtype, + is_bias=True) # NOTE bias default: constant 0.0 + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = paddle.tensor.matmul( + hidden_states, self.decoder_weight, + transpose_y=True) + self.decoder_bias + return hidden_states + + +class BertImageActionPredictionHead(nn.Layer): + def __init__(self, hidden_size, hidden_act, target_size): + super(BertImageActionPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(hidden_size, hidden_act) + + self.decoder = nn.Linear(hidden_size, target_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertPreTrainingHeads(nn.Layer): + def __init__(self, hidden_size, v_hidden_size, a_hidden_size, + bi_hidden_size, hidden_act, v_hidden_act, a_hidden_act, + v_target_size, a_target_size, fusion_method, + bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(hidden_size, hidden_act, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(bi_hidden_size, 2) + self.imagePredictions = BertImageActionPredictionHead( + v_hidden_size, v_hidden_act, v_target_size) # visual class number + self.actionPredictions = BertImageActionPredictionHead( + a_hidden_size, a_hidden_act, a_target_size) # action class number + self.fusion_method = fusion_method + self.dropout = nn.Dropout(0.1) + + def forward(self, sequence_output_t, sequence_output_v, sequence_output_a, + pooled_output_t, pooled_output_v, pooled_output_a): + + if self.fusion_method == 'sum': + pooled_output = self.dropout(pooled_output_t + pooled_output_v + + pooled_output_a) + elif self.fusion_method == 'mul': + pooled_output = self.dropout(pooled_output_t * pooled_output_v + + pooled_output_a) + else: + assert False + + prediction_scores_t = self.predictions( + sequence_output_t) # 8, 36 ,30522 + seq_relationship_score = self.seq_relationship(pooled_output) # 8, 2 + prediction_scores_v = self.imagePredictions( + sequence_output_v) # 8, 37, 1601 + prediction_scores_a = self.actionPredictions( + sequence_output_a) # 8, 5, 401 + + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score + + +@BACKBONES.register() +class BertForMultiModalPreTraining(nn.Layer): + """BERT model with multi modal pre-training heads. 
+ """ + def __init__( + self, + vocab_size=30522, + max_position_embeddings=512, + type_vocab_size=2, + v_target_size=1601, + a_target_size=700, + v_feature_size=2048, + a_feature_size=2048, + num_hidden_layers=12, + v_num_hidden_layers=2, + a_num_hidden_layers=3, + t_ent_attention_id=[10, 11], + v_ent_attention_id=[0, 1], + a_ent_attention_id=[0, 1], + fixed_t_layer=0, + fixed_v_layer=0, + hidden_size=768, + v_hidden_size=1024, + a_hidden_size=768, + bi_hidden_size=1024, + intermediate_size=3072, + v_intermediate_size=1024, + a_intermediate_size=3072, + hidden_act="gelu", + v_hidden_act="gelu", + a_hidden_act="gelu", + hidden_dropout_prob=0.1, + v_hidden_dropout_prob=0.1, + a_hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + v_attention_probs_dropout_prob=0.1, + a_attention_probs_dropout_prob=0.1, + av_attention_probs_dropout_prob=0.1, + at_attention_probs_dropout_prob=0.1, + num_attention_heads=12, + v_num_attention_heads=8, + a_num_attention_heads=12, + bi_num_attention_heads=8, + fusion_method="mul", + pretrained=None, + ): + """ + vocab_size: vocabulary size. Default: 30522. + max_position_embeddings: max position id. Default: 512. + type_vocab_size: max segment id. Default: 2. + v_target_size: class number of visual word. Default: 1601. + a_target_size: class number of action word. Default: 700. + v_feature_size: input visual feature dimension. Default: 2048. + a_feature_size: input action feature dimension. Default: 2048. + num_hidden_layers: number of BertLayer in text transformer. Default: 12. + v_num_hidden_layers: number of BertLayer in visual transformer. Default: 2. + a_num_hidden_layers: number of BertLayer in action transformer. Default:3. + t_ent_attention_id: index id of BertConnectionLayer in text transformer. Default: [10, 11]. + v_ent_attention_id: index id of BertConnectionLayer in visual transformer. Default:[0, 1]. + a_ent_attention_id: index id of BertConnectionLayer in action transformer. Default:[0, 1]. + fixed_t_layer: index id of fixed BertLayer in text transformer. Default: 0. + fixed_v_layer: index id of fixed BertLayer in visual transformer. Default: 0. + hidden_size: hidden size in text BertLayer. Default: 768. + v_hidden_size: hidden size in visual BertLayer. Default: 1024. + a_hidden_size: hidden size in action BertLayer. Default: 768. + bi_hidden_size: hidden size in BertConnectionLayer. Default: 1024, + intermediate_size: intermediate size in text BertLayer. Default: 3072. + v_intermediate_size: intermediate size in visual BertLayer. Default: 1024. + a_intermediate_size: intermediate size in text BertLayer. Default: 3072. + hidden_act: hidden activation function in text BertLayer. Default: "gelu". + v_hidden_act: hidden activation function in visual BertLayer. Default: "gelu". + a_hidden_act: hidden activation function in action BertLayer. Default: "gelu". + hidden_dropout_prob: hidden dropout probability in text Embedding Layer. Default: 0.1 + v_hidden_dropout_prob: hidden dropout probability in visual Embedding Layer. Default: 0.1 + a_hidden_dropout_prob: hidden dropout probability in action Embedding Layer. Default: 0.1 + attention_probs_dropout_prob: attention dropout probability in text BertLayer. Default: 0.1 + v_attention_probs_dropout_prob: attention dropout probability in visual BertLayer. Default: 0.1 + a_attention_probs_dropout_prob: attention dropout probability in action BertLayer. Default: 0.1 + av_attention_probs_dropout_prob: attention dropout probability in action-visual BertConnectionLayer. 
Default: 0.1 + at_attention_probs_dropout_prob: attention dropout probability in action-text BertConnectionLayer. Default: 0.1 + num_attention_heads: number of heads in text BertLayer. Default: 12. + v_num_attention_heads: number of heads in visual BertLayer. Default: 8. + a_num_attention_heads: number of heads in action BertLayer. Default: 12. + bi_num_attention_heads: number of heads in BertConnectionLayer. Default: 8. + fusion_method: methods of fusing pooled output from 3 transformer. Default: "mul". + """ + super(BertForMultiModalPreTraining, self).__init__() + self.pretrained = pretrained + self.vocab_size = vocab_size + self.a_target_size = a_target_size + + self.bert = BertModel( + vocab_size, + max_position_embeddings, + type_vocab_size, + v_feature_size, + a_feature_size, + num_hidden_layers, + v_num_hidden_layers, + a_num_hidden_layers, + v_ent_attention_id, + t_ent_attention_id, + a_ent_attention_id, + fixed_t_layer, + fixed_v_layer, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + intermediate_size, + v_intermediate_size, + a_intermediate_size, + hidden_act, + v_hidden_act, + a_hidden_act, + hidden_dropout_prob, + v_hidden_dropout_prob, + a_hidden_dropout_prob, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + num_attention_heads, + v_num_attention_heads, + a_num_attention_heads, + bi_num_attention_heads, + ) + self.cls = BertPreTrainingHeads( + hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size, + hidden_act, v_hidden_act, a_hidden_act, v_target_size, + a_target_size, fusion_method, + self.bert.embeddings.word_embeddings.weight) + + def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, (nn.Linear, nn.Embedding)): + weight_init_(layer, 'Normal', std=0.02) + elif isinstance(layer, nn.LayerNorm): + weight_init_(layer, 'Constant', value=1) + + def forward( + self, + text_ids, #8,36 + action_feat, #8,5,2048 + image_feat, #8,37,2048 + image_loc, #8,37,5 + token_type_ids=None, #8,36 + text_mask=None, #8,36 + image_mask=None, #8,37 + action_mask=None, #8,5 + ): + """ + text_ids: input text ids. Shape: [batch_size, seqence_length] + action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim] + image_feat: input image feature. Shape: [batch_size, region_length+1, image_feature_dim]], add 1 for image global feature. + image_loc: input region location. Shape: [batch_size, region_length+1, region_location_dim], add 1 for image global feature location. + token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length] + text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length] + image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length] + action_mask: action mask, 1 for real tokens and 0 for padding tokens. 
Shape: [batch_size, action_length] + """ + sequence_output_t, sequence_output_v, sequence_output_a, \ + pooled_output_t, pooled_output_v, pooled_output_a = self.bert( + text_ids, + action_feat, + image_feat, + image_loc, + token_type_ids, + text_mask, + image_mask, + action_mask, + output_all_encoded_layers=False, + ) + + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.cls( + sequence_output_t, sequence_output_v, sequence_output_a, + pooled_output_t, pooled_output_v, pooled_output_a) + + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score diff --git a/docs/src/paddlevideo/modeling/backbones/adds.py b/docs/src/paddlevideo/modeling/backbones/adds.py new file mode 100644 index 000000000..21cd212cb --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/adds.py @@ -0,0 +1,1146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import BatchNorm2D, Conv2D +from paddle.nn.initializer import Constant, Normal +from paddle.vision.models import ResNet + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import kaiming_normal_, _calculate_fan_in_and_fan_out + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) +normal_ = Normal(mean=0, std=1e-3) + + +def disp_to_depth(disp, min_depth, max_depth): + """Convert network's sigmoid output into depth prediction + The formula for this conversion is given in the 'additional considerations' + section of the paper. 
+ """ + min_disp = 1 / max_depth + max_disp = 1 / min_depth + scaled_disp = min_disp + (max_disp - min_disp) * disp + depth = 1 / scaled_disp + return scaled_disp, depth + + +def gram_matrix(y): + (b, ch, h, w) = y.shape + features = y.reshape([b, ch, w * h]) + features_t = paddle.transpose(features, [0, 2, 1]) + gram = features.bmm(features_t) / (ch * h * w) + return gram + + +def convt_bn_relu(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + bn=True, + relu=True): + bias = not bn + layers = [] + layers.append( + nn.Conv2DTranspose(in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + bias_attr=bias)) + if bn: + layers.append(nn.BatchNorm2D(out_channels)) + + if relu: + layers.append(nn.LeakyReLU(0.2)) + layers = nn.Sequential(*layers) + + # initialize the weights + for m in layers.sublayers(include_self=True): + if isinstance(m, nn.Conv2DTranspose): + normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + return layers + + +def transformation_from_parameters(axisangle, translation, invert=False): + """Convert the network's (axisangle, translation) output into a 4x4 matrix + """ + R = rot_from_axisangle(axisangle) + t = translation.clone() + + if invert: + R = R.transpose([0, 2, 1]) + t *= -1 + + T = get_translation_matrix(t) + + if invert: + M = paddle.matmul(R, T) + else: + M = paddle.matmul(T, R) + + return M + + +def get_translation_matrix(translation_vector): + """Convert a translation vector into a 4x4 transformation matrix + """ + t = translation_vector.reshape([-1, 3, 1]) + gather_object = paddle.stack([ + paddle.zeros([ + translation_vector.shape[0], + ], paddle.float32), + paddle.ones([ + translation_vector.shape[0], + ], paddle.float32), + paddle.squeeze(t[:, 0], axis=-1), + paddle.squeeze(t[:, 1], axis=-1), + paddle.squeeze(t[:, 2], axis=-1), + ]) + gather_index = paddle.to_tensor([ + [1], + [0], + [0], + [2], + [0], + [1], + [0], + [3], + [0], + [0], + [1], + [4], + [0], + [0], + [0], + [1], + ]) + T = paddle.gather_nd(gather_object, gather_index) + T = T.reshape([4, 4, -1]).transpose((2, 0, 1)) + return T + + +def rot_from_axisangle(vec): + """Convert an axisangle rotation into a 4x4 transformation matrix + (adapted from https://github.com/Wallacoloo/printipi) + Input 'vec' has to be Bx1x3 + """ + angle = paddle.norm(vec, 2, 2, True) + axis = vec / (angle + 1e-7) + + ca = paddle.cos(angle) + sa = paddle.sin(angle) + C = 1 - ca + + x = axis[..., 0].unsqueeze(1) + y = axis[..., 1].unsqueeze(1) + z = axis[..., 2].unsqueeze(1) + + xs = x * sa + ys = y * sa + zs = z * sa + xC = x * C + yC = y * C + zC = z * C + xyC = x * yC + yzC = y * zC + zxC = z * xC + + gather_object = paddle.stack([ + paddle.squeeze(x * xC + ca, axis=(-1, -2)), + paddle.squeeze(xyC - zs, axis=(-1, -2)), + paddle.squeeze(zxC + ys, axis=(-1, -2)), + paddle.squeeze(xyC + zs, axis=(-1, -2)), + paddle.squeeze(y * yC + ca, axis=(-1, -2)), + paddle.squeeze(yzC - xs, axis=(-1, -2)), + paddle.squeeze(zxC - ys, axis=(-1, -2)), + paddle.squeeze(yzC + xs, axis=(-1, -2)), + paddle.squeeze(z * zC + ca, axis=(-1, -2)), + paddle.ones([ + vec.shape[0], + ], dtype=paddle.float32), + paddle.zeros([ + vec.shape[0], + ], dtype=paddle.float32) + ]) + gather_index = paddle.to_tensor([ + [0], + [1], + [2], + [10], + [3], + [4], + [5], + [10], + [6], + [7], + [8], + [10], + [10], + [10], + [10], + [9], + ]) + rot = paddle.gather_nd(gather_object, gather_index) + rot = 
rot.reshape([4, 4, -1]).transpose((2, 0, 1)) + return rot + + +def upsample(x): + """Upsample input tensor by a factor of 2 + """ + return F.interpolate(x, scale_factor=2, mode="nearest") + + +def get_smooth_loss(disp, img): + """Computes the smoothness loss for a disparity image + The color image is used for edge-aware smoothness + """ + grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) + grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) + + grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), + 1, + keepdim=True) + grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), + 1, + keepdim=True) + + grad_disp_x *= paddle.exp(-grad_img_x) + grad_disp_y *= paddle.exp(-grad_img_y) + + return grad_disp_x.mean() + grad_disp_y.mean() + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2D(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias_attr=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias_attr=False) + + +def resnet_multiimage_input(num_layers, num_input_images=1): + """Constructs a ResNet model. + Args: + num_layers (int): Number of resnet layers. Must be 18 or 50 + pretrained (bool): If True, returns a model pre-trained on ImageNet + num_input_images (int): Number of frames stacked as input + """ + assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" + blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] + + block_type = {18: BasicBlock, 50: Bottleneck}[num_layers] + + model = ResNetMultiImageInput(block_type, + num_layers, + blocks, + num_input_images=num_input_images) + model.init_weights() + return model + + +class ConvBlock(nn.Layer): + """Layer to perform a convolution followed by ELU + """ + def __init__(self, in_channels, out_channels): + super(ConvBlock, self).__init__() + + self.conv = Conv3x3(in_channels, out_channels) + self.nonlin = nn.ELU() + + def forward(self, x): + out = self.conv(x) + out = self.nonlin(out) + return out + + +class Conv3x3(nn.Layer): + """Layer to pad and convolve input + """ + def __init__(self, in_channels, out_channels, use_refl=True): + super(Conv3x3, self).__init__() + + if use_refl: + self.pad = nn.Pad2D(1, mode='reflect') + else: + self.pad = nn.Pad2D(1) + self.conv = nn.Conv2D(int(in_channels), int(out_channels), 3) + + def forward(self, x): + out = self.pad(x) + out = self.conv(out) + return out + + +class BackprojectDepth(nn.Layer): + """Layer to transform a depth image into a point cloud + """ + def __init__(self, batch_size, height, width): + super(BackprojectDepth, self).__init__() + + self.batch_size = batch_size + self.height = height + self.width = width + + meshgrid = np.meshgrid(range(self.width), + range(self.height), + indexing='xy') + id_coords = np.stack(meshgrid, axis=0).astype(np.float32) + self.id_coords = self.create_parameter(shape=list(id_coords.shape), + dtype=paddle.float32) + self.id_coords.set_value(id_coords) + self.add_parameter("id_coords", self.id_coords) + self.id_coords.stop_gradient = True + + self.ones = self.create_parameter( + shape=[self.batch_size, 1, self.height * self.width], + default_initializer=ones_) + self.add_parameter("ones", self.ones) + self.ones.stop_gradient = True + + pix_coords = paddle.unsqueeze( + paddle.stack([ + self.id_coords[0].reshape([ + -1, 
+ ]), self.id_coords[1].reshape([ + -1, + ]) + ], 0), 0) + pix_coords = pix_coords.tile([batch_size, 1, 1]) + pix_coords = paddle.concat([pix_coords, self.ones], 1) + self.pix_coords = self.create_parameter(shape=list(pix_coords.shape), ) + self.pix_coords.set_value(pix_coords) + self.add_parameter("pix_coords", self.pix_coords) + self.pix_coords.stop_gradient = True + + def forward(self, depth, inv_K): + cam_points = paddle.matmul(inv_K[:, :3, :3], self.pix_coords) + cam_points = depth.reshape([self.batch_size, 1, -1]) * cam_points + cam_points = paddle.concat([cam_points, self.ones], 1) + + return cam_points + + +class Project3D(nn.Layer): + """Layer which projects 3D points into a camera with intrinsics K and at position T + """ + def __init__(self, batch_size, height, width, eps=1e-7): + super(Project3D, self).__init__() + + self.batch_size = batch_size + self.height = height + self.width = width + self.eps = eps + + def forward(self, points, K, T): + P = paddle.matmul(K, T)[:, :3, :] + + cam_points = paddle.matmul(P, points) + + pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + + self.eps) + pix_coords = pix_coords.reshape( + [self.batch_size, 2, self.height, self.width]) + pix_coords = pix_coords.transpose([0, 2, 3, 1]) + pix_coords[..., 0] /= self.width - 1 + pix_coords[..., 1] /= self.height - 1 + pix_coords = (pix_coords - 0.5) * 2 + return pix_coords + + +class SSIM(nn.Layer): + """Layer to compute the SSIM loss between a pair of images + """ + def __init__(self): + super(SSIM, self).__init__() + self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False) + + self.refl = nn.Pad2D(1, mode='reflect') + + self.C1 = 0.01**2 + self.C2 = 0.03**2 + + def forward(self, x, y): + x = self.refl(x) + y = self.refl(y) + + mu_x = self.mu_x_pool(x) + mu_y = self.mu_y_pool(y) + + sigma_x = self.sig_x_pool(x**2) - mu_x**2 + sigma_y = self.sig_y_pool(y**2) - mu_y**2 + sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y + + SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) + SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) + + return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1) + + +class ResNetMultiImageInput(ResNet): + """Constructs a resnet model with varying number of input images. 
+ Adapted from https://github.com/pypaddle/vision/blob/master/paddlevision/models/resnet.py + """ + def __init__(self, block, depth, layers, num_input_images=1): + super(ResNetMultiImageInput, self).__init__(block, depth) + self.inplanes = 64 + self.conv1 = nn.Conv2D(num_input_images * 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + def init_weights(self): + for layer in self.sublayers(include_self=True): + if isinstance(layer, nn.Conv2D): + kaiming_normal_(layer.weight, + mode='fan_out', + nonlinearity='relu') + elif isinstance(layer, nn.BatchNorm2D): + ones_(layer.weight) + zeros_(layer.bias) + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values + and name the restored parameters, values initialization + are explicit declared in the ```init_weights``` method. + + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2D + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Layer): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual 
learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2D + width = int(planes * (base_width / 64.)) * groups + + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class DepthDecoder(nn.Layer): + def __init__(self, + num_ch_enc, + scales=range(4), + num_output_channels=1, + use_skips=True): + super(DepthDecoder, self).__init__() + + self.num_output_channels = num_output_channels + self.use_skips = use_skips + self.upsample_mode = 'nearest' + self.scales = scales + + self.num_ch_enc = num_ch_enc + self.num_ch_dec = np.array([16, 32, 64, 128, 256]) + + # decoder + self.convs = OrderedDict() + for i in range(4, -1, -1): + # upconv_0 + num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + + 1] + num_ch_out = self.num_ch_dec[i] + self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) + + # upconv_1 + num_ch_in = self.num_ch_dec[i] + if self.use_skips and i > 0: + num_ch_in += self.num_ch_enc[i - 1] + num_ch_out = self.num_ch_dec[i] + self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) + + for s in self.scales: + self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], + self.num_output_channels) + + self.decoder = nn.LayerList(list(self.convs.values())) + self.sigmoid = nn.Sigmoid() + + def forward(self, input_features): + outputs = {} + + # decoder + x = input_features[-1] + for i in range(4, -1, -1): + x = self.convs[("upconv", i, 0)](x) + x = [upsample(x)] + if self.use_skips and i > 0: + x += [input_features[i - 1]] + x = paddle.concat(x, 1) + x = self.convs[("upconv", i, 1)](x) + if i in self.scales: + outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", + i)](x)) + return outputs + + +class PoseDecoder(nn.Layer): + def __init__(self, + num_ch_enc, + num_input_features, + num_frames_to_predict_for=None, + stride=1): + super(PoseDecoder, self).__init__() + + self.num_ch_enc = num_ch_enc + self.num_input_features = num_input_features + + if num_frames_to_predict_for is None: + num_frames_to_predict_for = num_input_features - 1 + self.num_frames_to_predict_for = num_frames_to_predict_for + + self.convs = OrderedDict() + self.convs[("squeeze")] = nn.Conv2D(self.num_ch_enc[-1], 256, 1) + self.convs[("pose", 0)] = nn.Conv2D(num_input_features * 256, 256, 3, + stride, 1) + self.convs[("pose", 1)] = nn.Conv2D(256, 256, 3, stride, 1) + self.convs[("pose", 2)] = nn.Conv2D(256, 6 * num_frames_to_predict_for, + 1) + + self.relu = nn.ReLU() + + self.net = nn.LayerList(list(self.convs.values())) + + def 
forward(self, input_features): + last_features = [f[-1] for f in input_features] + + cat_features = [ + self.relu(self.convs["squeeze"](f)) for f in last_features + ] + cat_features = paddle.concat(cat_features, 1) + + out = cat_features + for i in range(3): + out = self.convs[("pose", i)](out) + if i != 2: + out = self.relu(out) + + out = out.mean(3).mean(2) + + out = 0.01 * out.reshape([-1, self.num_frames_to_predict_for, 1, 6]) + + axisangle = out[..., :3] + translation = out[..., 3:] + + return axisangle, translation + + +class ResnetEncoder(nn.Layer): + """Pypaddle module for a resnet encoder + """ + def __init__(self, num_layers, pretrained=False, num_input_images=1): + super(ResnetEncoder, self).__init__() + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + + resnets = { + 18: paddle.vision.models.resnet18, + 34: paddle.vision.models.resnet34, + 50: paddle.vision.models.resnet50, + 101: paddle.vision.models.resnet101, + 152: paddle.vision.models.resnet152 + } + + if num_layers not in resnets: + raise ValueError( + "{} is not a valid number of resnet layers".format(num_layers)) + + if num_input_images > 1: + self.encoder = resnet_multiimage_input(num_layers, pretrained, + num_input_images) + else: + self.encoder = resnets[num_layers](pretrained) + + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + ###################################### + # night public first conv + ###################################### + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(64) + self.relu = nn.ReLU() # NOTE + + self.conv_shared = nn.Conv2D(512, 64, kernel_size=1) + + ########################################## + # private source encoder, day + ########################################## + self.encoder_day = resnets[num_layers](pretrained) + self.conv_diff_day = nn.Conv2D( + 512, 64, kernel_size=1) # no bn after conv, so bias=true + + ########################################## + # private target encoder, night + ########################################## + self.encoder_night = resnets[num_layers](pretrained) + self.conv_diff_night = nn.Conv2D(512, 64, kernel_size=1) + + ###################################### + # shared decoder (small decoder), use a simple de-conv to upsample the features with no skip connection + ###################################### + self.convt5 = convt_bn_relu(in_channels=512, + out_channels=256, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt4 = convt_bn_relu(in_channels=256, + out_channels=128, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt3 = convt_bn_relu(in_channels=128, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt2 = convt_bn_relu(in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt1 = convt_bn_relu(in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convtf = nn.Conv2D(64, 3, kernel_size=1, stride=1, padding=0) + + def forward(self, input_image, is_night): + if self.training: + result = [] + input_data = (input_image - 0.45) / 0.225 + if is_night == 'day': + # source private encoder, day + private_feature = self.encoder_day.conv1(input_data) + private_feature = self.encoder_day.bn1(private_feature) + private_feature = self.encoder_day.relu(private_feature) + private_feature = self.encoder_day.maxpool(private_feature) + private_feature = self.encoder_day.layer1(private_feature) + 
private_feature = self.encoder_day.layer2(private_feature) + private_feature = self.encoder_day.layer3(private_feature) + private_feature = self.encoder_day.layer4(private_feature) + private_code = self.conv_diff_day(private_feature) + private_gram = gram_matrix(private_feature) + result.append(private_code) + result.append(private_gram) + + elif is_night == 'night': + # target private encoder, night + private_feature = self.encoder_night.conv1(input_data) + private_feature = self.encoder_night.bn1(private_feature) + private_feature = self.encoder_night.relu(private_feature) + private_feature = self.encoder_night.maxpool(private_feature) + private_feature = self.encoder_night.layer1(private_feature) + private_feature = self.encoder_night.layer2(private_feature) + private_feature = self.encoder_night.layer3(private_feature) + private_feature = self.encoder_night.layer4(private_feature) + private_code = self.conv_diff_night(private_feature) + + private_gram = gram_matrix(private_feature) + result.append(private_code) + result.append(private_gram) + + # shared encoder + self.features = [] + x = (input_image - 0.45) / 0.225 + if is_night == 'day': + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + self.features.append(self.encoder.relu(x)) + else: + x = self.conv1(x) + x = self.bn1(x) + self.features.append(self.relu(x)) + + self.features.append( + self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) + self.features.append(self.encoder.layer2(self.features[-1])) + self.features.append(self.encoder.layer3(self.features[-1])) + self.features.append(self.encoder.layer4(self.features[-1])) + + if self.training: + shared_code = self.conv_shared(self.features[-1]) + shared_gram = gram_matrix(self.features[-1]) + result.append(shared_code) # use this to calculate loss of diff + result.append(shared_gram) + result.append( + self.features[-1]) # use this to calculate loss of similarity + + union_code = private_feature + self.features[-1] + rec_code = self.convt5(union_code) + rec_code = self.convt4(rec_code) + rec_code = self.convt3(rec_code) + rec_code = self.convt2(rec_code) + rec_code = self.convt1(rec_code) + rec_code = self.convtf(rec_code) + result.append(rec_code) + + return self.features, result + else: + return self.features + + +class ResnetEncoder_pose(nn.Layer): + """Pypaddle module for a resnet encoder + """ + def __init__(self, num_layers, pretrained=False, num_input_images=1): + super(ResnetEncoder_pose, self).__init__() + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + resnets = { + 18: paddle.vision.models.resnet18, + 34: paddle.vision.models.resnet34, + 50: paddle.vision.models.resnet50, + 101: paddle.vision.models.resnet101, + 152: paddle.vision.models.resnet152 + } + + if num_layers not in resnets: + raise ValueError( + "{} is not a valid number of resnet layers".format(num_layers)) + + if num_input_images > 1: + self.encoder = resnet_multiimage_input(num_layers, num_input_images) + else: + self.encoder = resnets[num_layers](pretrained) + + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + def forward(self, input_image): + features = [] + x = (input_image - 0.45) / 0.225 + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + features.append(self.encoder.relu(x)) + features.append(self.encoder.layer1(self.encoder.maxpool(features[-1]))) + features.append(self.encoder.layer2(features[-1])) + features.append(self.encoder.layer3(features[-1])) + features.append(self.encoder.layer4(features[-1])) + + return features + + +@BACKBONES.register() +class 
ADDS_DepthNet(nn.Layer): + def __init__(self, + num_layers=18, + frame_ids=[0, -1, 1], + height=256, + width=512, + batch_size=6, + pose_model_input="pairs", + use_stereo=False, + only_depth_encoder=False, + pretrained=None, + scales=[0, 1, 2, 3], + min_depth=0.1, + max_depth=100.0, + pose_model_type='separate_resnet', + v1_multiscale=False, + predictive_mask=False, + disable_automasking=False): + super(ADDS_DepthNet, self).__init__() + self.num_layers = num_layers + self.height = height + self.width = width + self.batch_size = batch_size + self.frame_ids = frame_ids + self.pose_model_input = pose_model_input + self.use_stereo = use_stereo + self.only_depth_encoder = only_depth_encoder + self.pretrained = pretrained + self.scales = scales + self.pose_model_type = pose_model_type + self.predictive_mask = predictive_mask + self.disable_automasking = disable_automasking + self.v1_multiscale = v1_multiscale + self.min_depth = min_depth + self.max_depth = max_depth + + self.num_input_frames = len(self.frame_ids) + self.num_pose_frames = 2 if self.pose_model_input == "pairs" else self.num_input_frames + + assert self.frame_ids[0] == 0, "frame_ids must start with 0" + + self.use_pose_net = not (self.use_stereo and self.frame_ids == [0]) + + self.encoder = ResnetEncoder(self.num_layers) + if not self.only_depth_encoder: + self.depth = DepthDecoder(self.encoder.num_ch_enc, self.scales) + if self.use_pose_net and not self.only_depth_encoder: + if self.pose_model_type == "separate_resnet": + self.pose_encoder = ResnetEncoder_pose( + self.num_layers, num_input_images=self.num_pose_frames) + self.pose = PoseDecoder(self.pose_encoder.num_ch_enc, + num_input_features=1, + num_frames_to_predict_for=2) + + self.backproject_depth = {} + self.project_3d = {} + for scale in self.scales: + h = self.height // (2**scale) + w = self.width // (2**scale) + + self.backproject_depth[scale] = BackprojectDepth( + self.batch_size, h, w) + self.project_3d[scale] = Project3D(batch_size, h, w) + + def init_weights(self): + """First init model's weight""" + for m in self.sublayers(include_self=True): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight, a=math.sqrt(5)) + if m.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(m.weight) + bound = 1 / math.sqrt(fan_in) + uniform_ = paddle.nn.initializer.Uniform(-bound, bound) + uniform_(m.bias) + """Second, if provide pretrained ckpt, load it""" + if self.pretrained: # load pretrained weights + load_ckpt(self, self.pretrained) + + def forward(self, inputs, day_or_night='day'): + if self.training: + features, result = self.encoder(inputs["color_aug", 0, 0], 'day') + features_night, result_night = self.encoder( + inputs[("color_n_aug", 0, 0)], 'night') + + outputs = self.depth(features) + outputs_night = self.depth(features_night) + if self.use_pose_net and not self.only_depth_encoder: + outputs.update(self.predict_poses(inputs, 'day')) + outputs_night.update(self.predict_poses(inputs, 'night')) + + self.generate_images_pred(inputs, outputs, 'day') + self.generate_images_pred(inputs, outputs_night, 'night') + + outputs['frame_ids'] = self.frame_ids + outputs['scales'] = self.scales + outputs['result'] = result + outputs['result_night'] = result_night + outputs_night['frame_ids'] = self.frame_ids + outputs_night['scales'] = self.scales + outputs['outputs_night'] = outputs_night + else: + if isinstance(inputs, dict): + input_color = inputs[("color", 0, 0)] + features = self.encoder(input_color, day_or_night[0]) + outputs = self.depth(features) + + pred_disp, _ = 
disp_to_depth(outputs[("disp", 0)], + self.min_depth, self.max_depth) + + pred_disp = pred_disp[:, 0].numpy() + + outputs['pred_disp'] = np.squeeze(pred_disp) + + outputs['gt'] = np.squeeze(inputs['depth_gt'].numpy()) + else: + input_color = inputs + features = self.encoder(input_color, day_or_night) + outputs = self.depth(features) + + pred_disp, _ = disp_to_depth(outputs[("disp", 0)], + self.min_depth, self.max_depth) + + pred_disp = pred_disp[:, 0] + outputs = paddle.squeeze(pred_disp) + return outputs + + def predict_poses(self, inputs, is_night): + """Predict poses between input frames for monocular sequences. + """ + outputs = {} + if self.num_pose_frames == 2: + if is_night: + pose_feats = { + f_i: inputs["color_n_aug", f_i, 0] + for f_i in self.frame_ids + } + else: + pose_feats = { + f_i: inputs["color_aug", f_i, 0] + for f_i in self.frame_ids + } + + for f_i in self.frame_ids[1:]: + if f_i != "s": + if f_i < 0: + pose_inputs = [pose_feats[f_i], pose_feats[0]] + else: + pose_inputs = [pose_feats[0], pose_feats[f_i]] + + if self.pose_model_type == "separate_resnet": + pose_inputs = [ + self.pose_encoder(paddle.concat(pose_inputs, + axis=1)) + ] + + axisangle, translation = self.pose(pose_inputs) + outputs[("axisangle", 0, f_i)] = axisangle + outputs[("translation", 0, f_i)] = translation + + # Invert the matrix if the frame id is negative + outputs[("cam_T_cam", 0, + f_i)] = transformation_from_parameters( + axisangle[:, 0], + translation[:, 0], + invert=(f_i < 0)) + return outputs + + def generate_images_pred(self, inputs, outputs, is_night): + """Generate the warped (reprojected) color images for a minibatch. + Generated images are saved into the `outputs` dictionary. + """ + _, _, height, width = inputs['color', 0, 0].shape + for scale in self.scales: + disp = outputs[("disp", scale)] + if self.v1_multiscale: + source_scale = scale + else: + disp = F.interpolate(disp, [height, width], + mode="bilinear", + align_corners=False) + source_scale = 0 + + _, depth = disp_to_depth(disp, self.min_depth, self.max_depth) + + outputs[("depth", 0, scale)] = depth + for i, frame_id in enumerate(self.frame_ids[1:]): + + T = outputs[("cam_T_cam", 0, frame_id)] + + cam_points = self.backproject_depth[source_scale]( + depth, inputs[("inv_K", source_scale)]) + pix_coords = self.project_3d[source_scale]( + cam_points, inputs[("K", source_scale)], T) + + outputs[("sample", frame_id, scale)] = pix_coords + + if is_night: + inputs[("color_n", frame_id, + source_scale)].stop_gradient = False + outputs[("color", frame_id, + scale)] = paddle.nn.functional.grid_sample( + inputs[("color_n", frame_id, source_scale)], + outputs[("sample", frame_id, scale)], + padding_mode="border", + align_corners=False) + + else: + inputs[("color", frame_id, + source_scale)].stop_gradient = False + outputs[("color", frame_id, + scale)] = paddle.nn.functional.grid_sample( + inputs[("color", frame_id, source_scale)], + outputs[("sample", frame_id, scale)], + padding_mode="border", + align_corners=False) + + if not self.disable_automasking: + if is_night: + outputs[("color_identity", frame_id, scale)] = \ + inputs[("color_n", frame_id, source_scale)] + else: + outputs[("color_identity", frame_id, scale)] = \ + inputs[("color", frame_id, source_scale)] diff --git a/docs/src/paddlevideo/modeling/backbones/agcn.py b/docs/src/paddlevideo/modeling/backbones/agcn.py new file mode 100644 index 000000000..9f870c66b --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/agcn.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES + + +class GCN(nn.Layer): + def __init__(self, in_channels, out_channels, vertex_nums=25, stride=1): + super(GCN, self).__init__() + self.conv1 = nn.Conv2D(in_channels=in_channels, + out_channels=3 * out_channels, + kernel_size=1, + stride=1) + self.conv2 = nn.Conv2D(in_channels=vertex_nums * 3, + out_channels=vertex_nums, + kernel_size=1) + + def forward(self, x): + # x --- N,C,T,V + x = self.conv1(x) # N,3C,T,V + N, C, T, V = x.shape + x = paddle.reshape(x, [N, C // 3, 3, T, V]) # N,C,3,T,V + x = paddle.transpose(x, perm=[0, 1, 2, 4, 3]) # N,C,3,V,T + x = paddle.reshape(x, [N, C // 3, 3 * V, T]) # N,C,3V,T + x = paddle.transpose(x, perm=[0, 2, 1, 3]) # N,3V,C,T + x = self.conv2(x) # N,V,C,T + x = paddle.transpose(x, perm=[0, 2, 3, 1]) # N,C,T,V + return x + + +class Block(paddle.nn.Layer): + def __init__(self, + in_channels, + out_channels, + vertex_nums=25, + temporal_size=9, + stride=1, + residual=True): + super(Block, self).__init__() + self.residual = residual + self.out_channels = out_channels + + self.bn_res = nn.BatchNorm2D(out_channels) + self.conv_res = nn.Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=(stride, 1)) + self.gcn = GCN(in_channels=in_channels, + out_channels=out_channels, + vertex_nums=vertex_nums) + self.tcn = nn.Sequential( + nn.BatchNorm2D(out_channels), + nn.ReLU(), + nn.Conv2D(in_channels=out_channels, + out_channels=out_channels, + kernel_size=(temporal_size, 1), + padding=((temporal_size - 1) // 2, 0), + stride=(stride, 1)), + nn.BatchNorm2D(out_channels), + ) + + def forward(self, x): + if self.residual: + y = self.conv_res(x) + y = self.bn_res(y) + x = self.gcn(x) + x = self.tcn(x) + out = x + y if self.residual else x + out = F.relu(out) + return out + + +@BACKBONES.register() +class AGCN(nn.Layer): + """ + AGCN model improves the performance of ST-GCN using + Adaptive Graph Convolutional Networks. + Args: + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2. 
+ """ + def __init__(self, in_channels=2, **kwargs): + super(AGCN, self).__init__() + + self.data_bn = nn.BatchNorm1D(25 * 2) + self.agcn = nn.Sequential( + Block(in_channels=in_channels, + out_channels=64, + residual=False, + **kwargs), Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=128, stride=2, **kwargs), + Block(in_channels=128, out_channels=128, **kwargs), + Block(in_channels=128, out_channels=128, **kwargs), + Block(in_channels=128, out_channels=256, stride=2, **kwargs), + Block(in_channels=256, out_channels=256, **kwargs), + Block(in_channels=256, out_channels=256, **kwargs)) + + self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + def forward(self, x): + # data normalization + N, C, T, V, M = x.shape + + x = x.transpose((0, 4, 1, 2, 3)) # N, M, C, T, V + x = x.reshape((N * M, C, T, V)) + + x = self.agcn(x) + + x = self.pool(x) # NM,C,T,V --> NM,C,1,1 + C = x.shape[1] + x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1 + + return x diff --git a/docs/src/paddlevideo/modeling/backbones/agcn2s.py b/docs/src/paddlevideo/modeling/backbones/agcn2s.py new file mode 100644 index 000000000..a630c68a7 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/agcn2s.py @@ -0,0 +1,229 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import numpy as np +from ..registry import BACKBONES + + +def import_class(name): + components = name.split('.') + mod = __import__(components[0]) + for comp in components[1:]: + mod = getattr(mod, comp) + return mod + + +class UnitTCN(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): + super(UnitTCN, self).__init__() + pad = int((kernel_size - 1) / 2) + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + " input size : (N*M, C, T, V)" + x = self.bn(self.conv(x)) + return x + + +class UnitGCN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + A, + coff_embedding=4, + num_subset=3): + super(UnitGCN, self).__init__() + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + PA = self.create_parameter(shape=A.shape, dtype='float32') + self.PA = PA + self.A = paddle.to_tensor(A.astype(np.float32)) + self.num_subset = num_subset + + self.conv_a = nn.LayerList() + self.conv_b = nn.LayerList() + self.conv_d = nn.LayerList() + for i in range(self.num_subset): + self.conv_a.append(nn.Conv2D(in_channels, inter_channels, 1)) + self.conv_b.append(nn.Conv2D(in_channels, inter_channels, 1)) + self.conv_d.append(nn.Conv2D(in_channels, out_channels, 1)) + + if in_channels != out_channels: + self.down = nn.Sequential(nn.Conv2D(in_channels, out_channels, 1), + nn.BatchNorm2D(out_channels)) + else: + self.down = lambda x: x + + self.bn = nn.BatchNorm2D(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU() + + def forward(self, x): + N, C, T, V = x.shape + A = self.A + self.PA + + y = None + for i in range(self.num_subset): + A1 = paddle.transpose(self.conv_a[i](x), + perm=[0, 3, 1, + 2]).reshape([N, V, self.inter_c * T]) + A2 = self.conv_b[i](x).reshape([N, self.inter_c * T, V]) + A1 = self.soft(paddle.matmul(A1, A2) / A1.shape[-1]) + A1 = A1 + A[i] + A2 = x.reshape([N, C * T, V]) + z = self.conv_d[i](paddle.matmul(A2, A1).reshape([N, C, T, V])) + y = z + y if y is not None else z + + y = self.bn(y) + y += self.down(x) + return self.relu(y) + + +class Block(nn.Layer): + def __init__(self, in_channels, out_channels, A, stride=1, residual=True): + super(Block, self).__init__() + self.gcn1 = UnitGCN(in_channels, out_channels, A) + self.tcn1 = UnitTCN(out_channels, out_channels, stride=stride) + self.relu = nn.ReLU() + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = UnitTCN(in_channels, + out_channels, + kernel_size=1, + stride=stride) + + def forward(self, x): + x = self.tcn1(self.gcn1(x)) + self.residual(x) + return self.relu(x) + + +# This Graph structure is for the NTURGB+D dataset. If you use a custom dataset, modify num_node and the corresponding graph adjacency structure. 
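
Editor's note: to make the preceding comment concrete, here is a small standalone sketch of the spatial adjacency construction that the Graph class below performs, applied to a hypothetical 3-joint chain instead of the 25-joint NTU RGB+D skeleton. Only NumPy is needed; the edge list is made up for the demo.

```python
import numpy as np

# Hypothetical 3-joint chain 0-1-2 instead of the 25-joint NTU RGB+D skeleton.
num_node = 3
self_link = [(i, i) for i in range(num_node)]
inward = [(1, 0), (2, 1)]                  # child -> parent edges (demo only)
outward = [(j, i) for (i, j) in inward]

def edge2mat(link, num_node):
    A = np.zeros((num_node, num_node))
    for i, j in link:
        A[j, i] = 1
    return A

def normalize_digraph(A):
    # Column-normalize by in-degree, as in Graph.normalize_digraph below.
    Dl = A.sum(0)
    Dn = np.zeros_like(A)
    for i in range(A.shape[0]):
        if Dl[i] > 0:
            Dn[i, i] = Dl[i] ** -1
    return A @ Dn

# Stack (identity, normalized inward, normalized outward) -> shape (3, V, V),
# which is the A handed to AGCN2s for labeling_mode='spatial'.
A = np.stack((edge2mat(self_link, num_node),
              normalize_digraph(edge2mat(inward, num_node)),
              normalize_digraph(edge2mat(outward, num_node))))
print(A.shape)   # (3, 3, 3) for this toy skeleton
```
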
+class Graph: + def __init__(self, labeling_mode='spatial'): + num_node = 25 + self_link = [(i, i) for i in range(num_node)] + inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + inward = [(i - 1, j - 1) for (i, j) in inward_ori_index] + outward = [(j, i) for (i, j) in inward] + neighbor = inward + outward + + self.num_node = num_node + self.self_link = self_link + self.inward = inward + self.outward = outward + self.neighbor = neighbor + self.A = self.get_adjacency_matrix(labeling_mode) + + def edge2mat(self, link, num_node): + A = np.zeros((num_node, num_node)) + for i, j in link: + A[j, i] = 1 + return A + + def normalize_digraph(self, A): + Dl = np.sum(A, 0) + h, w = A.shape + Dn = np.zeros((w, w)) + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + def get_spatial_graph(self, num_node, self_link, inward, outward): + I = self.edge2mat(self_link, num_node) + In = self.normalize_digraph(self.edge2mat(inward, num_node)) + Out = self.normalize_digraph(self.edge2mat(outward, num_node)) + A = np.stack((I, In, Out)) + return A + + def get_adjacency_matrix(self, labeling_mode=None): + if labeling_mode is None: + return self.A + if labeling_mode == 'spatial': + A = self.get_spatial_graph(self.num_node, self.self_link, + self.inward, self.outward) + else: + raise ValueError() + return A + + +@BACKBONES.register() +class AGCN2s(nn.Layer): + def __init__(self, + num_point=25, + num_person=2, + graph='ntu_rgb_d', + graph_args=dict(), + in_channels=3): + super(AGCN2s, self).__init__() + + if graph == 'ntu_rgb_d': + self.graph = Graph(**graph_args) + else: + raise ValueError() + + A = self.graph.A + self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point) + + self.l1 = Block(in_channels, 64, A, residual=False) + self.l2 = Block(64, 64, A) + self.l3 = Block(64, 64, A) + self.l4 = Block(64, 64, A) + self.l5 = Block(64, 128, A, stride=2) + self.l6 = Block(128, 128, A) + self.l7 = Block(128, 128, A) + self.l8 = Block(128, 256, A, stride=2) + self.l9 = Block(256, 256, A) + self.l10 = Block(256, 256, A) + + def forward(self, x): + N, C, T, V, M = x.shape + + x = x.transpose([0, 4, 3, 1, 2]).reshape_([N, M * V * C, T]) + x = self.data_bn(x) + x = x.reshape_([N, M, V, C, + T]).transpose([0, 1, 3, 4, + 2]).reshape_([N * M, C, T, V]) + + x = self.l1(x) + x = self.l2(x) + x = self.l3(x) + x = self.l4(x) + x = self.l5(x) + x = self.l6(x) + x = self.l7(x) + x = self.l8(x) + x = self.l9(x) + x = self.l10(x) + + return x diff --git a/docs/src/paddlevideo/modeling/backbones/asrf.py b/docs/src/paddlevideo/modeling/backbones/asrf.py new file mode 100644 index 000000000..37437b3ed --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/asrf.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/yabufarha/ms-tcn/blob/master/model.py +# https://github.com/yiskw713/asrf/libs/models/tcn.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import copy +import random +import math + +from paddle import ParamAttr +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from .ms_tcn import DilatedResidualLayer +from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch + + +@BACKBONES.register() +class ASRF(nn.Layer): + + def __init__(self, in_channel, num_features, num_classes, num_stages, + num_layers): + super().__init__() + self.in_channel = in_channel + self.num_features = num_features + self.num_classes = num_classes + self.num_stages = num_stages + self.num_layers = num_layers + + # define layers + self.conv_in = nn.Conv1D(self.in_channel, self.num_features, 1) + + shared_layers = [ + DilatedResidualLayer(2**i, self.num_features, self.num_features) + for i in range(self.num_layers) + ] + self.shared_layers = nn.LayerList(shared_layers) + + self.init_weights() + + def init_weights(self): + """ + initialize model layers' weight + """ + # init weight + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) + + def forward(self, x): + """ ASRF forward + """ + out = self.conv_in(x) + for layer in self.shared_layers: + out = layer(out) + return out diff --git a/docs/src/paddlevideo/modeling/backbones/bmn.py b/docs/src/paddlevideo/modeling/backbones/bmn.py new file mode 100644 index 000000000..200d1920a --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/bmn.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
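
Editor's note: ASRF above builds its shared trunk from DilatedResidualLayer blocks imported from ms_tcn.py, which is not shown in this hunk. The sketch below is a minimal stand-in following the standard MS-TCN formulation (dilated 3x1 conv, ReLU, 1x1 conv, dropout, residual add); the class name is hypothetical and the real implementation may differ in detail.

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class DilatedResidualLayerSketch(nn.Layer):
    """Editor's sketch of the dilated residual layer ASRF stacks."""
    def __init__(self, dilation, in_channels, out_channels):
        super().__init__()
        self.conv_dilated = nn.Conv1D(in_channels, out_channels, 3,
                                      padding=dilation, dilation=dilation)
        self.conv_1x1 = nn.Conv1D(out_channels, out_channels, 1)
        self.dropout = nn.Dropout()

    def forward(self, x):
        out = F.relu(self.conv_dilated(x))
        out = self.dropout(self.conv_1x1(out))
        return x + out   # residual connection keeps the temporal length

# With num_layers=10, the shared trunk uses dilations 1, 2, 4, ..., 512,
# so its temporal receptive field grows exponentially with depth.
layer = DilatedResidualLayerSketch(dilation=4, in_channels=64, out_channels=64)
y = layer(paddle.randn([2, 64, 100]))   # (batch, features, frames)
print(y.shape)                          # [2, 64, 100]
```
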
+ +import math +import numpy as np +import paddle +from paddle import ParamAttr +from ..registry import BACKBONES + + +def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample, + num_sample_perbin): + """ generate sample mask for a boundary-matching pair """ + plen = float(seg_xmax - seg_xmin) + plen_sample = plen / (num_sample * num_sample_perbin - 1.0) + total_samples = [ + seg_xmin + plen_sample * ii + for ii in range(num_sample * num_sample_perbin) + ] + p_mask = [] + for idx in range(num_sample): + bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) * + num_sample_perbin] + bin_vector = np.zeros([tscale]) + for sample in bin_samples: + sample_upper = math.ceil(sample) + sample_decimal, sample_down = math.modf(sample) + if (tscale - 1) >= int(sample_down) >= 0: + bin_vector[int(sample_down)] += 1 - sample_decimal + if (tscale - 1) >= int(sample_upper) >= 0: + bin_vector[int(sample_upper)] += sample_decimal + bin_vector = 1.0 / num_sample_perbin * bin_vector + p_mask.append(bin_vector) + p_mask = np.stack(p_mask, axis=1) + return p_mask + + +def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample, + num_sample_perbin): + """ generate sample mask for each point in Boundary-Matching Map """ + mask_mat = [] + for start_index in range(tscale): + mask_mat_vector = [] + for duration_index in range(dscale): + if start_index + duration_index < tscale: + p_xmin = start_index + p_xmax = start_index + duration_index + center_len = float(p_xmax - p_xmin) + 1 + sample_xmin = p_xmin - center_len * prop_boundary_ratio + sample_xmax = p_xmax + center_len * prop_boundary_ratio + p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax, + tscale, num_sample, + num_sample_perbin) + else: + p_mask = np.zeros([tscale, num_sample]) + mask_mat_vector.append(p_mask) + mask_mat_vector = np.stack(mask_mat_vector, axis=2) + mask_mat.append(mask_mat_vector) + mask_mat = np.stack(mask_mat, axis=3) + mask_mat = mask_mat.astype(np.float32) + + sample_mask = np.reshape(mask_mat, [tscale, -1]) + return sample_mask + + +def init_params(name, in_channels, kernel_size): + fan_in = in_channels * kernel_size * 1 + k = 1. / math.sqrt(fan_in) + param_attr = ParamAttr(name=name, + initializer=paddle.nn.initializer.Uniform(low=-k, + high=k)) + return param_attr + + +@BACKBONES.register() +class BMN(paddle.nn.Layer): + """BMN model from + `"BMN: Boundary-Matching Network for Temporal Action Proposal Generation" <https://arxiv.org/abs/1907.09702>`_ + Args: + tscale (int): sequence length, default 100. + dscale (int): max duration length, default 100. + prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5. + num_sample (int): number of samples between the starting boundary and ending boundary of each proposal, default 32. + num_sample_perbin (int): number of selected points in each sample, default 3. 
+ """ + + def __init__( + self, + tscale, + dscale, + prop_boundary_ratio, + num_sample, + num_sample_perbin, + feat_dim=400, + ): + super(BMN, self).__init__() + + #init config + self.feat_dim = feat_dim + self.tscale = tscale + self.dscale = dscale + self.prop_boundary_ratio = prop_boundary_ratio + self.num_sample = num_sample + self.num_sample_perbin = num_sample_perbin + + self.hidden_dim_1d = 256 + self.hidden_dim_2d = 128 + self.hidden_dim_3d = 512 + + # Base Module + self.b_conv1 = paddle.nn.Conv1D( + in_channels=self.feat_dim, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('Base_1_w', self.feat_dim, 3), + bias_attr=init_params('Base_1_b', self.feat_dim, 3)) + self.b_conv1_act = paddle.nn.ReLU() + + self.b_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('Base_2_w', self.hidden_dim_1d, 3), + bias_attr=init_params('Base_2_b', self.hidden_dim_1d, 3)) + self.b_conv2_act = paddle.nn.ReLU() + + # Temporal Evaluation Module + self.ts_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('TEM_s1_w', self.hidden_dim_1d, 3), + bias_attr=init_params('TEM_s1_b', self.hidden_dim_1d, 3)) + self.ts_conv1_act = paddle.nn.ReLU() + + self.ts_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=1, + kernel_size=1, + padding=0, + groups=1, + weight_attr=init_params('TEM_s2_w', self.hidden_dim_1d, 1), + bias_attr=init_params('TEM_s2_b', self.hidden_dim_1d, 1)) + self.ts_conv2_act = paddle.nn.Sigmoid() + + self.te_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('TEM_e1_w', self.hidden_dim_1d, 3), + bias_attr=init_params('TEM_e1_b', self.hidden_dim_1d, 3)) + self.te_conv1_act = paddle.nn.ReLU() + self.te_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=1, + kernel_size=1, + padding=0, + groups=1, + weight_attr=init_params('TEM_e2_w', self.hidden_dim_1d, 1), + bias_attr=init_params('TEM_e2_b', self.hidden_dim_1d, 1)) + self.te_conv2_act = paddle.nn.Sigmoid() + + #Proposal Evaluation Module + self.p_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_2d, + kernel_size=3, + padding=1, + groups=1, + weight_attr=init_params('PEM_1d_w', self.hidden_dim_1d, 3), + bias_attr=init_params('PEM_1d_b', self.hidden_dim_1d, 3)) + self.p_conv1_act = paddle.nn.ReLU() + + # init to speed up + sample_mask = get_interp1d_mask(self.tscale, self.dscale, + self.prop_boundary_ratio, + self.num_sample, self.num_sample_perbin) + self.sample_mask = paddle.to_tensor(sample_mask) + self.sample_mask.stop_gradient = True + + self.p_conv3d1 = paddle.nn.Conv3D( + in_channels=128, + out_channels=self.hidden_dim_3d, + kernel_size=(self.num_sample, 1, 1), + stride=(self.num_sample, 1, 1), + padding=0, + weight_attr=ParamAttr(name="PEM_3d1_w"), + bias_attr=ParamAttr(name="PEM_3d1_b")) + self.p_conv3d1_act = paddle.nn.ReLU() + + self.p_conv2d1 = paddle.nn.Conv2D( + in_channels=512, + out_channels=self.hidden_dim_2d, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(name="PEM_2d1_w"), + bias_attr=ParamAttr(name="PEM_2d1_b")) + self.p_conv2d1_act = paddle.nn.ReLU() + + self.p_conv2d2 = paddle.nn.Conv2D( + in_channels=128, + out_channels=self.hidden_dim_2d, 
+ kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name="PEM_2d2_w"), + bias_attr=ParamAttr(name="PEM_2d2_b")) + self.p_conv2d2_act = paddle.nn.ReLU() + + self.p_conv2d3 = paddle.nn.Conv2D( + in_channels=128, + out_channels=self.hidden_dim_2d, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name="PEM_2d3_w"), + bias_attr=ParamAttr(name="PEM_2d3_b")) + self.p_conv2d3_act = paddle.nn.ReLU() + + self.p_conv2d4 = paddle.nn.Conv2D( + in_channels=128, + out_channels=2, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(name="PEM_2d4_w"), + bias_attr=ParamAttr(name="PEM_2d4_b")) + self.p_conv2d4_act = paddle.nn.Sigmoid() + + def init_weights(self): + pass + + def forward(self, x): + #Base Module + x = self.b_conv1(x) + x = self.b_conv1_act(x) + x = self.b_conv2(x) + x = self.b_conv2_act(x) + + #TEM + xs = self.ts_conv1(x) + xs = self.ts_conv1_act(xs) + xs = self.ts_conv2(xs) + xs = self.ts_conv2_act(xs) + xs = paddle.squeeze(xs, axis=[1]) + xe = self.te_conv1(x) + xe = self.te_conv1_act(xe) + xe = self.te_conv2(xe) + xe = self.te_conv2_act(xe) + xe = paddle.squeeze(xe, axis=[1]) + + #PEM + xp = self.p_conv1(x) + xp = self.p_conv1_act(xp) + #BM layer + xp = paddle.matmul(xp, self.sample_mask) + xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale]) + + xp = self.p_conv3d1(xp) + xp = self.p_conv3d1_act(xp) + xp = paddle.squeeze(xp, axis=[2]) + xp = self.p_conv2d1(xp) + xp = self.p_conv2d1_act(xp) + xp = self.p_conv2d2(xp) + xp = self.p_conv2d2_act(xp) + xp = self.p_conv2d3(xp) + xp = self.p_conv2d3_act(xp) + xp = self.p_conv2d4(xp) + xp = self.p_conv2d4_act(xp) + return xp, xs, xe diff --git a/docs/src/paddlevideo/modeling/backbones/cfbi.py b/docs/src/paddlevideo/modeling/backbones/cfbi.py new file mode 100644 index 000000000..5fbf044b7 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/cfbi.py @@ -0,0 +1,88 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
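
Editor's note: a minimal smoke test of the BMN backbone above, assuming the paddlevideo package added in this diff is importable. It mainly documents the expected input layout (batch, feat_dim, tscale) and the three outputs: the boundary-matching confidence map and the start/end boundary probability sequences. Batch size and the random input are made up for the demo.

```python
import paddle
from paddlevideo.modeling.backbones.bmn import BMN  # path as added in this diff

model = BMN(tscale=100, dscale=100, prop_boundary_ratio=0.5,
            num_sample=32, num_sample_perbin=3, feat_dim=400)

feats = paddle.randn([2, 400, 100])     # (batch, feature_dim, temporal_scale)
bm_map, start, end = model(feats)

print(bm_map.shape)   # [2, 2, 100, 100]  confidence map over (duration, start)
print(start.shape)    # [2, 100]          start-boundary probabilities
print(end.shape)      # [2, 100]          end-boundary probabilities
```
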
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES +from .deeplab import DeepLab + + +class FPN(nn.Layer): + """FPN Layer""" + def __init__(self, in_dim_4x, in_dim_8x, in_dim_16x, out_dim): + super(FPN, self).__init__() + self.toplayer = self._make_layer(in_dim_16x, out_dim) + self.latlayer1 = self._make_layer(in_dim_8x, out_dim) + self.latlayer2 = self._make_layer(in_dim_4x, out_dim) + + self.smooth1 = self._make_layer(out_dim, + out_dim, + kernel_size=3, + padding=1) + self.smooth2 = self._make_layer(out_dim, + out_dim, + kernel_size=3, + padding=1) + + def _make_layer(self, in_dim, out_dim, kernel_size=1, padding=0): + return nn.Sequential( + nn.Conv2D(in_dim, + out_dim, + kernel_size=kernel_size, + stride=1, + padding=padding, + bias_attr=False), + nn.GroupNorm(num_groups=32, num_channels=out_dim)) + + def forward(self, x_4x, x_8x, x_16x): + """ forward function""" + x_16x = self.toplayer(x_16x) + x_8x = self.latlayer1(x_8x) + x_4x = self.latlayer2(x_4x) + + x_8x = x_8x + F.interpolate( + x_16x, size=x_8x.shape[-2:], mode='bilinear', align_corners=True) + x_4x = x_4x + F.interpolate( + x_8x, size=x_4x.shape[-2:], mode='bilinear', align_corners=True) + + x_8x = self.smooth1(x_8x) + x_4x = self.smooth2(x_4x) + + return F.relu(x_4x), F.relu(x_8x), F.relu(x_16x) + + +@BACKBONES.register() +class CFBI(nn.Layer): + """CFBI plus backbone""" + def __init__(self, + backbone='resnet', + freeze_bn=True, + model_aspp_outdim=256, + in_dim_8x=512, + model_semantic_embedding_dim=256): #,epsilon=1e-05): + super(CFBI, self).__init__() + #self.epsilon = epsilon + self.feature_extracter = DeepLab(backbone=backbone, freeze_bn=freeze_bn) + self.fpn = FPN(in_dim_4x=model_aspp_outdim, + in_dim_8x=in_dim_8x, + in_dim_16x=model_aspp_outdim, + out_dim=model_semantic_embedding_dim) + + def forward(self, x): + """forward function""" + x, aspp_x, low_level, mid_level = self.feature_extracter(x, True) + x_4x, x_8x, x_16x = self.fpn(x, mid_level, aspp_x) + return x_4x, x_8x, x_16x, low_level diff --git a/docs/src/paddlevideo/modeling/backbones/ctrgcn.py b/docs/src/paddlevideo/modeling/backbones/ctrgcn.py new file mode 100644 index 000000000..9d645f4e9 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/ctrgcn.py @@ -0,0 +1,514 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def conv_init(conv): + if conv.weight is not None: + weight_init_(conv.weight, 'kaiming_normal_', mode='fan_in') + if conv.bias is not None: + nn.initializer.Constant(value=0.0)(conv.bias) + + +def bn_init(bn, scale): + nn.initializer.Constant(value=float(scale))(bn.weight) + nn.initializer.Constant(value=0.0)(bn.bias) + + +def einsum(x1, x3): + """paddle.einsum only support in dynamic graph mode. 
+ x1 : n c u v + x2 : n c t v + """ + n, c, u, v1 = x1.shape + n, c, t, v3 = x3.shape + assert (v1 == v3), "Args of einsum not match!" + x1 = paddle.transpose(x1, perm=[0, 1, 3, 2]) # n c v u + y = paddle.matmul(x3, x1) + # out: n c t u + return y + + +class CTRGC(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + rel_reduction=8, + mid_reduction=1): + super(CTRGC, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + if in_channels == 3 or in_channels == 9: + self.rel_channels = 8 + self.mid_channels = 16 + else: + self.rel_channels = in_channels // rel_reduction + self.mid_channels = in_channels // mid_reduction + self.conv1 = nn.Conv2D(self.in_channels, + self.rel_channels, + kernel_size=1) + self.conv2 = nn.Conv2D(self.in_channels, + self.rel_channels, + kernel_size=1) + self.conv3 = nn.Conv2D(self.in_channels, + self.out_channels, + kernel_size=1) + self.conv4 = nn.Conv2D(self.rel_channels, + self.out_channels, + kernel_size=1) + self.tanh = nn.Tanh() + + def init_weights(self): + """Initiate the parameters. + """ + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + bn_init(m, 1) + + def forward(self, x, A=None, alpha=1): + x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean(-2), self.conv3( + x) + x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2)) + x1 = self.conv4(x1) * alpha + ( + A.unsqueeze(0).unsqueeze(0) if A is not None else 0) # N,C,V,V + # We only support 'paddle.einsum()' in dynamic graph mode, if use in infer model please implement self. + # x1 = paddle.einsum('ncuv,nctv->nctu', x1, x3) + x1 = einsum(x1, x3) + return x1 + + +class TemporalConv(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1): + super(TemporalConv, self).__init__() + pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2 + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1), + dilation=(dilation, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class MultiScale_TemporalConv(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + residual_kernel_size=1): + + super(MultiScale_TemporalConv, self).__init__() + assert out_channels % ( + len(dilations) + + 2) == 0, '# out channels should be multiples of # branches' + + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + # Temporal Convolution branches + self.branches = nn.LayerList([ + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0), + nn.BatchNorm2D(branch_channels), + nn.ReLU(), + TemporalConv(branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0), nn.BatchNorm2D(branch_channels), + nn.ReLU(), + nn.MaxPool2D(kernel_size=(3, 1), + stride=(stride, 1), + padding=(1, 0)), nn.BatchNorm2D(branch_channels))) + + self.branches.append( + nn.Sequential( + 
nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2D(branch_channels))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = TemporalConv(in_channels, + out_channels, + kernel_size=residual_kernel_size, + stride=stride) + + def init_weights(self): + """Initiate the parameters. + """ + # initialize + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + weight_init_(m.weight, 'Normal', std=0.02, mean=1.0) + nn.initializer.Constant(value=0.0)(m.bias) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = paddle.concat(branch_outs, axis=1) + out += res + return out + + +class unit_tcn(nn.Layer): + + def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): + super(unit_tcn, self).__init__() + pad = int((kernel_size - 1) / 2) + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + conv_init(self.conv) + bn_init(self.bn, 1) + + def forward(self, x): + x = self.bn(self.conv(x)) + return x + + +class unit_gcn(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + A, + coff_embedding=4, + adaptive=True, + residual=True): + super(unit_gcn, self).__init__() + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + self.adaptive = adaptive + self.num_subset = A.shape[0] + self.convs = nn.LayerList() + + for i in range(self.num_subset): + self.convs.append(CTRGC(in_channels, out_channels)) + + if residual: + if in_channels != out_channels: + self.down = nn.Sequential( + nn.Conv2D(in_channels, out_channels, 1), + nn.BatchNorm2D(out_channels)) + else: + self.down = lambda x: x + else: + self.down = lambda x: 0 + if self.adaptive: + pa_param = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(A.astype(np.float32))) + self.PA = paddle.create_parameter(shape=A.shape, + dtype='float32', + attr=pa_param) + else: + A_tensor = paddle.to_tensor(A, dtype="float32") + self.A = paddle.create_parameter( + shape=A_tensor.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign(A_tensor)) + self.A.stop_gradient = True + alpha_tensor = paddle.to_tensor(np.zeros(1), dtype="float32") + self.alpha = paddle.create_parameter( + shape=alpha_tensor.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign(alpha_tensor)) + self.bn = nn.BatchNorm2D(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU() + + def init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + bn_init(m, 1) + bn_init(self.bn, 1e-6) + + def forward(self, x): + y = None + if self.adaptive: + A = self.PA + else: + A = self.A.cuda(x.get_device()) + for i in range(self.num_subset): + z = self.convs[i](x, A[i], self.alpha) + y = z + y if y is not None else z + y = self.bn(y) + y += self.down(x) + y = self.relu(y) + return y + + +class TCN_GCN_unit(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + A, + stride=1, + residual=True, + adaptive=True, + kernel_size=5, + dilations=[1, 2]): + 
super(TCN_GCN_unit, self).__init__() + self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive) + self.tcn1 = MultiScale_TemporalConv(out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + dilations=dilations, + residual=False) + self.relu = nn.ReLU() + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = unit_tcn(in_channels, + out_channels, + kernel_size=1, + stride=stride) + + def forward(self, x): + y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) + return y + + +class NTUDGraph: + + def __init__(self, labeling_mode='spatial'): + num_node = 25 + self_link = [(i, i) for i in range(num_node)] + inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + inward = [(i - 1, j - 1) for (i, j) in inward_ori_index] + outward = [(j, i) for (i, j) in inward] + neighbor = inward + outward + + self.num_node = num_node + self.self_link = self_link + self.inward = inward + self.outward = outward + self.neighbor = neighbor + self.A = self.get_adjacency_matrix(labeling_mode) + + def edge2mat(self, link, num_node): + A = np.zeros((num_node, num_node)) + for i, j in link: + A[j, i] = 1 + return A + + def normalize_digraph(self, A): + Dl = np.sum(A, 0) + h, w = A.shape + Dn = np.zeros((w, w)) + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + def get_spatial_graph(self, num_node, self_link, inward, outward): + I = self.edge2mat(self_link, num_node) + In = self.normalize_digraph(self.edge2mat(inward, num_node)) + Out = self.normalize_digraph(self.edge2mat(outward, num_node)) + A = np.stack((I, In, Out)) + return A + + def get_adjacency_matrix(self, labeling_mode=None): + if labeling_mode is None: + return self.A + if labeling_mode == 'spatial': + A = self.get_spatial_graph(self.num_node, self.self_link, + self.inward, self.outward) + else: + raise ValueError() + return A + + +@BACKBONES.register() +class CTRGCN(nn.Layer): + """ + CTR-GCN model from: + `"Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition" `_ + Args: + num_point: int, numbers of sketeton point. + num_person: int, numbers of person. + base_channel: int, model's hidden dim. + graph: str, sketeton adjacency matrix name. + graph_args: dict, sketeton adjacency graph class args. + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 3. + adaptive: bool, if adjacency matrix can adaptive. 
+ """ + + def __init__(self, + num_point=25, + num_person=2, + base_channel=64, + graph='ntu_rgb_d', + graph_args=dict(), + in_channels=3, + adaptive=True): + super(CTRGCN, self).__init__() + + if graph == 'ntu_rgb_d': + self.graph = NTUDGraph(**graph_args) + else: + raise ValueError() + + A = self.graph.A # 3,25,25 + + self.num_point = num_point + self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point) + self.base_channel = base_channel + + self.l1 = TCN_GCN_unit(in_channels, + self.base_channel, + A, + residual=False, + adaptive=adaptive) + self.l2 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l3 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l4 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l5 = TCN_GCN_unit(self.base_channel, + self.base_channel * 2, + A, + stride=2, + adaptive=adaptive) + self.l6 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 2, + A, + adaptive=adaptive) + self.l7 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 2, + A, + adaptive=adaptive) + self.l8 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 4, + A, + stride=2, + adaptive=adaptive) + self.l9 = TCN_GCN_unit(self.base_channel * 4, + self.base_channel * 4, + A, + adaptive=adaptive) + self.l10 = TCN_GCN_unit(self.base_channel * 4, + self.base_channel * 4, + A, + adaptive=adaptive) + + def init_weights(self): + bn_init(self.data_bn, 1) + + def forward(self, x): + N, C, T, V, M = x.shape + x = paddle.transpose(x, perm=[0, 4, 3, 1, 2]) + x = paddle.reshape(x, (N, M * V * C, T)) + + x = self.data_bn(x) + + x = paddle.reshape(x, (N, M, V, C, T)) + x = paddle.transpose(x, perm=(0, 1, 3, 4, 2)) + + x = paddle.reshape(x, (N * M, C, T, V)) + + x = self.l1(x) + x = self.l2(x) + x = self.l3(x) + x = self.l4(x) + x = self.l5(x) + x = self.l6(x) + x = self.l7(x) + x = self.l8(x) + x = self.l9(x) + x = self.l10(x) + + return x, N, M diff --git a/docs/src/paddlevideo/modeling/backbones/darknet.py b/docs/src/paddlevideo/modeling/backbones/darknet.py new file mode 100644 index 000000000..3f48bf619 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/darknet.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = nn.Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + ".conv.weights"), + bias_attr=False) + + bn_name = name + ".bn" + self._bn = nn.BatchNorm( + num_channels=output_channels, + act="leaky_relu", + param_attr=ParamAttr(name=bn_name + ".scale"), + bias_attr=ParamAttr(name=bn_name + ".offset"), + moving_mean_name=bn_name + ".mean", + moving_variance_name=bn_name + ".var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name=None): + super(BasicBlock, self).__init__() + + self._conv1 = ConvBNLayer(input_channels=input_channels, output_channels=output_channels, filter_size=[ + 3, 3], stride=1, padding=1, name=name+'.0') + self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._conv2 = ConvBNLayer(input_channels=output_channels, output_channels=output_channels * + 2, filter_size=[3, 3], stride=1, padding=1, name=name+'.1') + self._conv3 = ConvBNLayer(input_channels=output_channels*2, output_channels=output_channels, + filter_size=[1, 1], stride=1, padding=0, name=name+'.2') + + def forward(self, x): + x = self._conv1(x) + x = self._max_pool(x) + x = self._conv2(x) + x = self._conv3(x) + return x + + +class Reorg(nn.Layer): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert (x.dim() == 4) + B = x.shape[0] + C = x.shape[1] + H = x.shape[2] + W = x.shape[3] + assert (H % stride == 0) + assert (W % stride == 0) + ws = stride + hs = stride + x = x.reshape([B, C, H // hs, hs, W // ws, ws] + ).transpose([0, 1, 2, 4, 3, 5]) + x = x.reshape([B, C, H // hs * W // ws, hs * ws] + ).transpose([0, 1, 3, 2]) + x = x.reshape([B, C, hs * ws, H // hs, W // ws] + ).transpose([0, 2, 1, 3, 4]) + x = x.reshape([B, hs * ws * C, H // hs, W // ws]) + return x + + +class Darknet(nn.Layer): + def __init__(self, pretrained=None): + super(Darknet, self).__init__() + self.pretrained = pretrained + self._conv1 = ConvBNLayer( + input_channels=3, output_channels=32, filter_size=3, stride=1, padding=1, name='input') + self._max_pool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._basic_block_11 = BasicBlock( + input_channels=32, output_channels=64, name='1.1') + self._basic_block_12 = BasicBlock( + input_channels=64, output_channels=128, name='1.2') + self._basic_block_13 = BasicBlock( + input_channels=128, output_channels=256, name='1.3') + self._conv2 = ConvBNLayer( + input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='up1') + self._conv3 = ConvBNLayer( + input_channels=512, output_channels=256, filter_size=1, stride=1, padding=0, name='down1') + self._conv4 = ConvBNLayer( + input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='2.1') + self._max_pool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._conv5 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='2.2') + self._conv6 = ConvBNLayer(input_channels=1024, output_channels=512, + 
filter_size=1, stride=1, padding=0, name='2.3') # ori + self._conv7 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='up2') + self._conv8 = ConvBNLayer(input_channels=1024, output_channels=512, + filter_size=1, stride=1, padding=0, name='down2') + self._conv9 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.1') + self._conv10 = ConvBNLayer( + input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.2') + self._conv11 = ConvBNLayer( + input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.3') + self._conv12 = ConvBNLayer( + input_channels=512, output_channels=64, filter_size=1, stride=1, padding=0, name='4.1') + self._reorg = Reorg() + self._conv13 = ConvBNLayer( + input_channels=1280, output_channels=1024, filter_size=3, stride=1, padding=1, name='5.1') + self._conv14 = nn.Conv2D(1024, 425, kernel_size=1) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._max_pool1(x) + x = self._basic_block_11(x) + x = self._basic_block_12(x) + x = self._basic_block_13(x) + x = self._conv2(x) + x = self._conv3(x) + ori = self._conv4(x) + x = self._max_pool2(ori) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._conv8(x) + x = self._conv9(x) + x = self._conv10(x) + x1 = self._conv11(x) + x2 = self._conv12(ori) + x2 = self._reorg(x2) + x = paddle.concat([x2, x1], 1) + x = self._conv13(x) + x = self._conv14(x) + return x diff --git a/docs/src/paddlevideo/modeling/backbones/deeplab.py b/docs/src/paddlevideo/modeling/backbones/deeplab.py new file mode 100644 index 000000000..c566205ac --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/deeplab.py @@ -0,0 +1,454 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
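The Reorg layer above is a space-to-depth rearrangement: with stride 2 it turns the 64-channel branch into 256 channels at half resolution, which is why conv13 expects 1280 = 1024 + 256 input channels after the concat. A standalone sketch with a toy tensor (mirroring Reorg.forward) shows the shape change:

import paddle

x = paddle.arange(1 * 2 * 4 * 4, dtype='float32').reshape([1, 2, 4, 4])
stride = 2
B, C, H, W = x.shape
y = x.reshape([B, C, H // stride, stride, W // stride, stride]).transpose([0, 1, 2, 4, 3, 5])
y = y.reshape([B, C, H // stride * W // stride, stride * stride]).transpose([0, 1, 3, 2])
y = y.reshape([B, C, stride * stride, H // stride, W // stride]).transpose([0, 2, 1, 3, 4])
y = y.reshape([B, stride * stride * C, H // stride, W // stride])
print(y.shape)   # [1, 8, 2, 2]: every pixel is kept, just moved into the channel dim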
+ +import numpy as np +import copy + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES + + +class FrozenBatchNorm2D(nn.Layer): + """ + BatchNorm2D where the batch statistics and the affine parameters + are fixed + """ + def __init__(self, n, epsilon=1e-5): + super(FrozenBatchNorm2D, self).__init__() + x1 = paddle.ones([n]) + x2 = paddle.zeros([n]) + weight = self.create_parameter( + shape=x1.shape, default_initializer=nn.initializer.Assign(x1)) + bias = self.create_parameter( + shape=x2.shape, default_initializer=nn.initializer.Assign(x2)) + running_mean = self.create_parameter( + shape=x2.shape, default_initializer=nn.initializer.Assign(x2)) + running_var = self.create_parameter( + shape=x1.shape, default_initializer=nn.initializer.Assign(x1)) + self.add_parameter('weight', weight) + self.add_parameter('bias', bias) + self.add_parameter('running_mean', running_mean) + self.add_parameter('running_var', running_var) + self.epsilon = epsilon + + def forward(self, x): + scale = self.weight * paddle.rsqrt((self.running_var + self.epsilon)) + bias = self.bias - self.running_mean * scale + scale = paddle.reshape(scale, [1, -1, 1, 1]) + bias = paddle.reshape(bias, [1, -1, 1, 1]) + return x * scale + bias + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + BatchNorm=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = BatchNorm(planes) + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = BatchNorm(planes) + self.conv3 = nn.Conv2D(planes, + planes * 4, + kernel_size=1, + bias_attr=False) + self.bn3 = BatchNorm(planes * 4) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + self.dilation = dilation + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Layer): + def __init__(self, + block, + layers, + output_stride, + BatchNorm, + pretrained=False): + self.inplanes = 64 + super(ResNet, self).__init__() + blocks = [1, 2, 4] + if output_stride == 16: + strides = [1, 2, 2, 1] + dilations = [1, 1, 1, 2] + elif output_stride == 8: + strides = [1, 2, 1, 1] + dilations = [1, 1, 2, 4] + else: + raise NotImplementedError + + # Modules + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = BatchNorm(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, + 64, + layers[0], + stride=strides[0], + dilation=dilations[0], + BatchNorm=BatchNorm) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=strides[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=strides[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.layer4 = self._make_MG_unit(block, + 512, + blocks=blocks, + stride=strides[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + self._init_weight() + + def _make_layer(self, + block, + planes, + blocks, + stride=1, + 
dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, dilation, downsample, + BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + dilation=dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def _make_MG_unit(self, + block, + planes, + blocks, + stride=1, + dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, + planes, + stride, + dilation=blocks[0] * dilation, + downsample=downsample, + BatchNorm=BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, len(blocks)): + layers.append( + block(self.inplanes, + planes, + stride=1, + dilation=blocks[i] * dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def forward(self, input, return_mid_level=False): + x = self.conv1(input) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + low_level_feat = x + x = self.layer2(x) + mid_level_feat = x + x = self.layer3(x) + x = self.layer4(x) + if return_mid_level: + return x, low_level_feat, mid_level_feat + else: + return x, low_level_feat + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation, + BatchNorm): + super(_ASPPModule, self).__init__() + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = BatchNorm(planes) + self.relu = nn.ReLU() + + self._init_weight() + + def forward(self, x): + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + m.weight_attr = nn.initializer.KaimingNormal() + elif isinstance(m, nn.BatchNorm2D): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +class ASPP(nn.Layer): + def __init__(self, backbone, output_stride, BatchNorm): + super(ASPP, self).__init__() + if backbone == 'drn': + inplanes = 512 + elif backbone == 'mobilenet': + inplanes = 320 + else: + inplanes = 2048 + if output_stride == 16: + dilations = [1, 6, 12, 18] + elif output_stride == 8: + dilations = [1, 12, 24, 36] + else: + raise NotImplementedError + + self.aspp1 = _ASPPModule(inplanes, + 256, + 1, + padding=0, + dilation=dilations[0], + BatchNorm=BatchNorm) + self.aspp2 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.aspp3 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.aspp4 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + + self.global_avg_pool = 
nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False), + BatchNorm(256), nn.ReLU()) + self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False) + self.bn1 = BatchNorm(256) + self.relu = nn.ReLU() + self.dropout = nn.Dropout(0.1) + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat(x=[x1, x2, x3, x4, x5], axis=1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return self.dropout(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class Decoder(nn.Layer): + def __init__(self, backbone, BatchNorm): + super(Decoder, self).__init__() + if backbone == 'resnet': + low_level_inplanes = 256 + elif backbone == 'mobilenet': + raise NotImplementedError + else: + raise NotImplementedError + + self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False) + self.bn1 = BatchNorm(48) + self.relu = nn.ReLU() + + self.last_conv = nn.Sequential( + nn.Conv2D(304, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(), + nn.Sequential(), + nn.Conv2D(256, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(), + nn.Sequential()) + + self._init_weight() + + def forward(self, x, low_level_feat): + low_level_feat = self.conv1(low_level_feat) + low_level_feat = self.bn1(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat(x=[x, low_level_feat], axis=1) + x = self.last_conv(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class DeepLab(nn.Layer): + """DeepLab model for segmentation""" + def __init__(self, backbone='resnet', output_stride=16, freeze_bn=True): + super(DeepLab, self).__init__() + + if freeze_bn == True: + print("Use frozen BN in DeepLab!") + BatchNorm = FrozenBatchNorm2D + else: + BatchNorm = nn.BatchNorm2D + + self.backbone = ResNet(Bottleneck, [3, 4, 23, 3], + output_stride, + BatchNorm, + pretrained=True) + self.aspp = ASPP(backbone, output_stride, BatchNorm) + self.decoder = Decoder(backbone, BatchNorm) + + def forward(self, input, return_aspp=False): + """forward function""" + if return_aspp: + x, low_level_feat, mid_level_feat = self.backbone(input, True) + else: + x, low_level_feat = self.backbone(input) + aspp_x = self.aspp(x) + x = self.decoder(aspp_x, low_level_feat) + + if return_aspp: + return x, aspp_x, low_level_feat, mid_level_feat + else: + return x, low_level_feat diff --git a/docs/src/paddlevideo/modeling/backbones/movinet.py b/docs/src/paddlevideo/modeling/backbones/movinet.py new file mode 100644 index 000000000..cb6d4fddf --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/movinet.py @@ -0,0 +1,574 @@ +import collections.abc +from itertools import repeat +from typing import Any, Callable, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as 
F +from paddle.nn.layer import Identity + +from ..registry import BACKBONES +from collections import OrderedDict + +container_abcs = collections.abc +"""Model Config +""" + +A0 = {'block_num': [0, 1, 3, 3, 4, 4]} +A0['conv1'] = [3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1)] +A0['b2_l0'] = [8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1)] +A0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)] +A0['b3_l1'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b3_l2'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b4_l0'] = [32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0)] +A0['b4_l1'] = [56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b4_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l0'] = [56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1)] +A0['b5_l1'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l3'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b6_l0'] = [56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1)] +A0['b6_l1'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['b6_l2'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['b6_l3'] = [104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['conv7'] = [104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0)] + +MODEL_CONFIG = {'A0': A0} + + +def _ntuple(n): + def parse(x): + if isinstance(x, container_abcs.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +def _make_divisible(v: float, + divisor: int, + min_value: Optional[int] = None) -> int: + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8. + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
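    # Worked example with divisor=8 (toy values): v=19 rounds to int(19+4)//8*8 = 16,
    # but 16 < 0.9*19 = 17.1, so the branch below bumps it to 24; v=92 rounds to 96,
    # which is >= 0.9*92 = 82.8, so it is kept as-is.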
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +_single = _ntuple(1) +_pair = _ntuple(2) +_triple = _ntuple(3) +_quadruple = _ntuple(4) + + +class CausalModule(nn.Layer): + def __init__(self) -> None: + super().__init__() + self.activation = None + + def reset_activation(self) -> None: + self.activation = None + + +class Conv2dBNActivation(nn.Sequential): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int]], + padding: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + **kwargs: Any, + ) -> None: + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + if norm_layer is None: + norm_layer = Identity + if activation_layer is None: + activation_layer = Identity + self.kernel_size = kernel_size + self.stride = stride + dict_layers = (nn.Conv2D(in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + **kwargs), norm_layer(out_planes, + momentum=0.1), + activation_layer()) + + self.out_channels = out_planes + super(Conv2dBNActivation, self).__init__(dict_layers[0], dict_layers[1], + dict_layers[2]) + + +class Conv3DBNActivation(nn.Sequential): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int, int]], + padding: Union[int, Tuple[int, int, int]], + stride: Union[int, Tuple[int, int, int]] = 1, + groups: int = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + **kwargs: Any, + ) -> None: + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + if norm_layer is None: + norm_layer = Identity + if activation_layer is None: + activation_layer = Identity + self.kernel_size = kernel_size + self.stride = stride + + dict_layers = (nn.Conv3D(in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + **kwargs), norm_layer(out_planes, + momentum=0.1), + activation_layer()) + self.out_channels = out_planes + super(Conv3DBNActivation, self).__init__(dict_layers[0], dict_layers[1], + dict_layers[2]) + + +class ConvBlock3D(CausalModule): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int, int]], + causal: bool, + conv_type: str, + padding: Union[int, Tuple[int, int, int]] = 0, + stride: Union[int, Tuple[int, int, int]] = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + bias_attr: bool = False, + **kwargs: Any, + ) -> None: + super().__init__() + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + self.conv_2 = None + + if causal is True: + padding = (0, padding[1], padding[2]) + if conv_type != "2plus1d" and conv_type != "3d": + raise ValueError("only 2plus2d or 3d are " + + "allowed as 3d convolutions") + + if conv_type == "2plus1d": + self.conv_1 = Conv2dBNActivation(in_planes, + out_planes, + kernel_size=(kernel_size[1], + kernel_size[2]), + padding=(padding[1], padding[2]), + stride=(stride[1], stride[2]), + activation_layer=activation_layer, + norm_layer=norm_layer, + bias_attr=bias_attr, + **kwargs) + if kernel_size[0] > 1: + self.conv_2 = Conv2dBNActivation( + in_planes, + out_planes, + kernel_size=(kernel_size[0], 1), + 
padding=(padding[0], 0), + stride=(stride[0], 1), + activation_layer=activation_layer, + norm_layer=norm_layer, + bias_attr=bias_attr, + **kwargs) + elif conv_type == "3d": + self.conv_1 = Conv3DBNActivation(in_planes, + out_planes, + kernel_size=kernel_size, + padding=padding, + activation_layer=activation_layer, + norm_layer=norm_layer, + stride=stride, + bias_attr=bias_attr, + **kwargs) + self.padding = padding + self.kernel_size = kernel_size + self.dim_pad = self.kernel_size[0] - 1 + self.stride = stride + self.causal = causal + self.conv_type = conv_type + + def _forward(self, x: paddle.Tensor) -> paddle.Tensor: + if self.dim_pad > 0 and self.conv_2 is None and self.causal is True: + x = self._cat_stream_buffer(x) + b, c, t, h, w = x.shape + if self.conv_type == "2plus1d": + x = paddle.transpose(x, (0, 2, 1, 3, 4)) # bcthw --> btchw + x = paddle.reshape_(x, (-1, c, h, w)) # btchw --> bt,c,h,w + x = self.conv_1(x) + if self.conv_type == "2plus1d": + b, c, h, w = x.shape + x = paddle.reshape_(x, (-1, t, c, h, w)) # bt,c,h,w --> b,t,c,h,w + x = paddle.transpose(x, (0, 2, 1, 3, 4)) # b,t,c,h,w --> b,c,t,h,w + if self.conv_2 is not None: + if self.dim_pad > 0 and self.causal is True: + x = self._cat_stream_buffer(x) + b, c, t, h, w = x.shape + x = paddle.reshape_(x, (b, c, t, h * w)) + x = self.conv_2(x) + b, c, t, _ = x.shape + x = paddle.reshape_(x, (b, c, t, h, w)) + return x + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self._forward(x) + return x + + def _cat_stream_buffer(self, x: paddle.Tensor) -> paddle.Tensor: + if self.activation is None: + self._setup_activation(x.shape) + x = paddle.concat((self.activation, x), 2) + self._save_in_activation(x) + return x + + def _save_in_activation(self, x: paddle.Tensor) -> None: + assert self.dim_pad > 0 + self.activation = paddle.to_tensor(x.numpy()[:, :, -self.dim_pad:, + ...]).clone().detach() + + def _setup_activation(self, input_shape: Tuple[float, ...]) -> None: + assert self.dim_pad > 0 + self.activation = paddle.zeros(shape=[ + *input_shape[:2], # type: ignore + self.dim_pad, + *input_shape[3:] + ]) + + +class TemporalCGAvgPool3D(CausalModule): + def __init__(self, ) -> None: + super().__init__() + self.n_cumulated_values = 0 + self.register_forward_post_hook(self._detach_activation) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + input_shape = x.shape + cumulative_sum = paddle.cumsum(x, axis=2) + if self.activation is None: + self.activation = cumulative_sum[:, :, -1:].clone() + else: + cumulative_sum += self.activation + self.activation = cumulative_sum[:, :, -1:].clone() + + noe = paddle.arange(1, input_shape[2] + 1) + axis = paddle.to_tensor([0, 1, 3, 4]) + noe = paddle.unsqueeze(noe, axis=axis) + divisor = noe.expand(x.shape) + x = cumulative_sum / (self.n_cumulated_values + divisor) + self.n_cumulated_values += input_shape[2] + return x + + @staticmethod + def _detach_activation(module: CausalModule, inputs: paddle.Tensor, + output: paddle.Tensor) -> None: + module.activation.detach() + + def reset_activation(self) -> None: + super().reset_activation() + self.n_cumulated_values = 0 + + +class SqueezeExcitation(nn.Layer): + def __init__(self, + input_channels: int, + activation_2: nn.Layer, + activation_1: nn.Layer, + conv_type: str, + causal: bool, + squeeze_factor: int = 4, + bias_attr: bool = True) -> None: + super().__init__() + self.causal = causal + se_multiplier = 2 if causal else 1 + squeeze_channels = _make_divisible( + input_channels // squeeze_factor * se_multiplier, 8) + 
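        # With the A0 config above, e.g. expanded_channels=184 feeding this block in
        # causal mode: squeeze_channels = _make_divisible(184 // 4 * 2, 8) = 96.
        # se_multiplier is 2 only in causal mode because _scale() below concatenates
        # the cumulative temporal average with the per-frame spatial average along
        # the channel axis before fc1.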
self.temporal_cumualtive_GAvg3D = TemporalCGAvgPool3D() + self.fc1 = ConvBlock3D(input_channels * se_multiplier, + squeeze_channels, + kernel_size=(1, 1, 1), + padding=0, + causal=causal, + conv_type=conv_type, + bias_attr=bias_attr) + self.activation_1 = activation_1() + self.activation_2 = activation_2() + self.fc2 = ConvBlock3D(squeeze_channels, + input_channels, + kernel_size=(1, 1, 1), + padding=0, + causal=causal, + conv_type=conv_type, + bias_attr=bias_attr) + + def _scale(self, inputs: paddle.Tensor) -> paddle.Tensor: + if self.causal: + x_space = paddle.mean(inputs, axis=[3, 4], keepdim=True) + scale = self.temporal_cumualtive_GAvg3D(x_space) + scale = paddle.concat((scale, x_space), axis=1) + else: + scale = F.adaptive_avg_pool3d(inputs, 1) + scale = self.fc1(scale) + scale = self.activation_1(scale) + scale = self.fc2(scale) + return self.activation_2(scale) + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + scale = self._scale(inputs) + return scale * inputs + + +class BasicBneck(nn.Layer): + def __init__( + self, + input_channels, + out_channels, + expanded_channels, + kernel_size, + stride, + padding, + padding_avg, + causal: bool, + conv_type: str, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + ) -> None: + super().__init__() + + assert type(stride) is tuple + + if (not stride[0] == 1 or not (1 <= stride[1] <= 2) + or not (1 <= stride[2] <= 2)): + raise ValueError('illegal stride value') + + self.res = None + + layers = [] + if expanded_channels != out_channels: + # expand + self.expand = ConvBlock3D(in_planes=input_channels, + out_planes=expanded_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # deepwise + self.deep = ConvBlock3D(in_planes=expanded_channels, + out_planes=expanded_channels, + kernel_size=kernel_size, + padding=padding, + stride=stride, + groups=expanded_channels, + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + + # SE + self.se = SqueezeExcitation( + expanded_channels, + causal=causal, + activation_1=activation_layer, + activation_2=(nn.Sigmoid if conv_type == "3d" else nn.Hardsigmoid), + conv_type=conv_type) + # project + self.project = ConvBlock3D(expanded_channels, + out_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=Identity) + + if not (stride == (1, 1, 1) and input_channels == out_channels): + if stride != (1, 1, 1): + layers.append( + nn.AvgPool3D((1, 3, 3), stride=stride, padding=padding_avg)) + layers.append( + ConvBlock3D( + in_planes=input_channels, + out_planes=out_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + norm_layer=norm_layer, + activation_layer=Identity, + causal=causal, + conv_type=conv_type, + )) + self.res = nn.Sequential(*layers) + self.alpha = self.create_parameter(shape=[1], dtype="float32") + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + if self.res is not None: + residual = self.res(inputs) + else: + residual = inputs + if self.expand is not None: + x = self.expand(inputs) + else: + x = inputs + + x = self.deep(x) + x = self.se(x) + x = self.project(x) + result = residual + self.alpha * x + return result + + +@BACKBONES.register() +class MoViNet(nn.Layer): + def __init__( + self, + model_type: str = 'A0', + hidden_dim: int = 2048, + causal: bool = True, + 
num_classes: int = 400, + conv_type: str = "3d", + ) -> None: + super().__init__() + """ + causal: causal mode + num_classes: number of classes for classifcation + conv_type: type of convolution either 3d or 2plus1d + """ + blocks_dic = OrderedDict() + cfg = MODEL_CONFIG[model_type] + + norm_layer = nn.BatchNorm3D if conv_type == "3d" else nn.BatchNorm2D + activation_layer = nn.Swish if conv_type == "3d" else nn.Hardswish + + # conv1 + self.conv1 = ConvBlock3D(in_planes=cfg['conv1'][0], + out_planes=cfg['conv1'][1], + kernel_size=cfg['conv1'][2], + stride=cfg['conv1'][3], + padding=cfg['conv1'][4], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # blocks + for i in range(2, len(cfg['block_num']) + 1): + for j in range(cfg['block_num'][i - 1]): + blocks_dic[f'b{i}_l{j}'] = BasicBneck( + cfg[f'b{i}_l{j}'][0], + cfg[f'b{i}_l{j}'][1], + cfg[f'b{i}_l{j}'][2], + cfg[f'b{i}_l{j}'][3], + cfg[f'b{i}_l{j}'][4], + cfg[f'b{i}_l{j}'][5], + cfg[f'b{i}_l{j}'][6], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + self.blocks = nn.Sequential(*(blocks_dic.values())) + + # conv7 + self.conv7 = ConvBlock3D(in_planes=cfg['conv7'][0], + out_planes=cfg['conv7'][1], + kernel_size=cfg['conv7'][2], + stride=cfg['conv7'][3], + padding=cfg['conv7'][4], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # pool + self.classifier = nn.Sequential( + # dense9 + ConvBlock3D(in_planes=cfg['conv7'][1], + out_planes=hidden_dim, + kernel_size=(1, 1, 1), + causal=causal, + conv_type=conv_type, + bias_attr=True), + nn.Swish(), + nn.Dropout(p=0.2), + # dense10d + ConvBlock3D(in_planes=hidden_dim, + out_planes=num_classes, + kernel_size=(1, 1, 1), + causal=causal, + conv_type=conv_type, + bias_attr=True), + ) + if causal: + self.cgap = TemporalCGAvgPool3D() + self.apply(self._weight_init) + self.causal = causal + + def avg(self, x: paddle.Tensor) -> paddle.Tensor: + if self.causal: + avg = F.adaptive_avg_pool3d(x, (x.shape[2], 1, 1)) + avg = self.cgap(avg)[:, :, -1:] + else: + avg = F.adaptive_avg_pool3d(x, 1) + return avg + + @staticmethod + def _weight_init(m): + if isinstance(m, nn.Conv3D): + nn.initializer.KaimingNormal(m.weight) + if m.bias is not None: + nn.initializer.Constant(0.0)(m.bias) + elif isinstance(m, (nn.BatchNorm3D, nn.BatchNorm2D, nn.GroupNorm)): + nn.initializer.Constant(1.0)(m.weight) + nn.initializer.Constant(0.0)(m.bias) + elif isinstance(m, nn.Linear): + nn.initializer.Normal(m.weight, 0, 0.01) + nn.initializer.Constant(0.0)(m.bias) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.conv1(x) + x = self.blocks(x) + x = self.conv7(x) + x = self.avg(x) + x = self.classifier(x) + x = x.flatten(1) + return x + + @staticmethod + def _clean_activation_buffers(m): + if issubclass(type(m), CausalModule): + m.reset_activation() + + def clean_activation_buffers(self) -> None: + self.apply(self._clean_activation_buffers) + + +if __name__ == '__main__': + net = MoViNet(causal=False, conv_type='3d') + paddle.summary(net, input_size=(1, 3, 8, 224, 224)) diff --git a/docs/src/paddlevideo/modeling/backbones/ms_tcn.py b/docs/src/paddlevideo/modeling/backbones/ms_tcn.py new file mode 100644 index 000000000..fb49b9c80 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/ms_tcn.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import copy +import random +import math + +from paddle import ParamAttr +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = len(tensor.shape) + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed \ + for tensor with fewer than 2 dimensions") + + if dimensions == 2: # Linear + fan_in = tensor.shape[1] + fan_out = tensor.shape[0] + else: + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def calculate_gain(nonlinearity=None, a=None): + if nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if a != None: + return math.sqrt(2.0 / (1 + a**2)) + else: + return math.sqrt(2.0 / (1 + 0.01**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + return 1 + + +def KaimingUniform_like_torch(weight_npy, + mode='fan_in', + nonlinearity='leaky_relu'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + if mode == 'fan_in': + fan_mode = fan_in + else: + fan_mode = fan_out + a = math.sqrt(5.0) + gain = calculate_gain(nonlinearity=nonlinearity, a=a) + std = gain / math.sqrt(fan_mode) + bound = math.sqrt(3.0) * std + return np.random.uniform(-bound, bound, weight_npy.shape) + + +def init_bias(weight_npy, bias_npy): + # attention this weight is not bias + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + bound = 1.0 / math.sqrt(fan_in) + return np.random.uniform(-bound, bound, bias_npy.shape) + + +class SingleStageModel(nn.Layer): + + def __init__(self, num_layers, num_f_maps, dim, num_classes): + super(SingleStageModel, self).__init__() + self.conv_in = nn.Conv1D(dim, num_f_maps, 1) + self.layers = nn.LayerList([ + copy.deepcopy(DilatedResidualLayer(2**i, num_f_maps, num_f_maps)) + for i in range(num_layers) + ]) + self.conv_out = nn.Conv1D(num_f_maps, num_classes, 1) + + def forward(self, x): + out = self.conv_in(x) + for layer in self.layers: + out = layer(out) + out = self.conv_out(out) + return out + + +class DilatedResidualLayer(nn.Layer): + + def __init__(self, dilation, in_channels, out_channels): + super(DilatedResidualLayer, self).__init__() + self.conv_dilated = nn.Conv1D(in_channels, + out_channels, + 3, + padding=dilation, + dilation=dilation) + self.conv_in = nn.Conv1D(out_channels, out_channels, 1) + self.dropout = nn.Dropout() + + def forward(self, x): + out = F.relu(self.conv_dilated(x)) + out = self.conv_in(out) + out = self.dropout(out) + return (x + out) + + +@BACKBONES.register() +class MSTCN(nn.Layer): + + def __init__(self, num_stages, num_layers, num_f_maps, dim, num_classes): + 
super().__init__() + self.stage1 = SingleStageModel(num_layers, num_f_maps, dim, num_classes) + self.stages = nn.LayerList([ + copy.deepcopy( + SingleStageModel(num_layers, num_f_maps, num_classes, + num_classes)) for s in range(num_stages - 1) + ]) + + def forward(self, x): + """ MSTCN forward + """ + out = self.stage1(x) + outputs = out.unsqueeze(0) + for s in self.stages: + out = s(F.softmax(out, axis=1)) + outputs = paddle.concat((outputs, out.unsqueeze(0)), axis=0) + return outputs + + def init_weights(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) diff --git a/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py b/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py new file mode 100644 index 000000000..28d045d1b --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/pptsm_mv2.py @@ -0,0 +1,282 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# Download URL of pretrained model +# { +# "MobileNetV2": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_ssld_pretrained.pdparams", + +# "MobileNetV2_x0_25": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams", +# "MobileNetV2_x0_5": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams", +# "MobileNetV2_x0_75": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams", +# "MobileNetV2_x1_5": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams", +# "MobileNetV2_x2_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams" +# } + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D(in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def forward(self, inputs, if_act=True): + y = 
self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, num_channels, num_in_filter, num_filters, stride, + filter_size, padding, expansion_factor, name, num_seg): + super(InvertedResidualUnit, self).__init__() + self.num_seg = num_seg + num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer(num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand") + + self._bottleneck_conv = ConvBNLayer(num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise") + + self._linear_conv = ConvBNLayer(num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear") + + def forward(self, inputs, ifshortcut): + # add temporal shift module + y = inputs + if ifshortcut: + y = F.temporal_shift(y, self.num_seg, 1.0 / self.num_seg) + + y = self._expand_conv(y, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name, num_seg): + super(InvresiBlocks, self).__init__() + + self._first_block = InvertedResidualUnit(num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1", + num_seg=num_seg) + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer(name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1), + num_seg=num_seg)) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +class MobileNet(nn.Layer): + def __init__(self, + class_num=400, + scale=1.0, + pretrained=None, + prefix_name="", + num_seg=8): + super(MobileNet, self).__init__() + self.scale = scale + self.class_num = class_num + self.pretrained = pretrained + self.num_seg = num_seg + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1), + ] + + self.conv1 = ConvBNLayer(num_channels=3, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1") + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer(prefix_name + "conv" + str(i), + sublayer=InvresiBlocks(in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + + "conv" + str(i), + num_seg=num_seg)) + self.block_list.append(block) + in_c = int(c * scale) + + self.out_c = int(1280 * scale) if scale > 1.0 else 1280 + self.conv9 = ConvBNLayer(num_channels=in_c, + num_filters=self.out_c, + filter_size=1, + stride=1, + padding=0, + name=prefix_name + "conv9") + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.out = Linear(self.out_c, + class_num, + weight_attr=ParamAttr(name=prefix_name + + "fc10_weights"), + bias_attr=ParamAttr(name=prefix_name + "fc10_offset")) + + def 
init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + y = self.conv1(inputs, if_act=True) + for block in self.block_list: + y = block(y) + y = self.conv9(y, if_act=True) + y = self.pool2d_avg(y) + + y = paddle.reshape(y, [-1, self.num_seg, y.shape[1]]) + y = paddle.mean(y, axis=1) + y = paddle.reshape(y, shape=[-1, self.out_c]) + + y = self.out(y) + return y + + +@BACKBONES.register() +def PPTSM_MobileNetV2(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=1.0, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_25(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.25, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_5(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.5, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_75(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.75, **kwargs) + return model + + +def PPTSM_MobileNetV2_x1_5(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=1.5, **kwargs) + return model + + +def PPTSM_MobileNetV2_x2_0(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=2.0, **kwargs) + return model diff --git a/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py b/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py new file mode 100644 index 000000000..cd10baca4 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/pptsm_mv3.py @@ -0,0 +1,408 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
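The shift inserted before each residual unit above relies on paddle.nn.functional.temporal_shift with shift_ratio = 1/num_seg; the rough numpy rendering below (an approximation of that op's documented behaviour, with zero padding at clip boundaries) shows why the backbone expects its input flattened to (N*num_seg, C, H, W):

import numpy as np

def temporal_shift_np(x, num_seg, shift_ratio):
    """Shift a fraction of channels one step along time. x: (N*num_seg, C, H, W)."""
    nt, c, h, w = x.shape
    n = nt // num_seg
    fold = int(c * shift_ratio)
    x = x.reshape(n, num_seg, c, h, w)
    out = np.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                  # first fold: shifted backward in time
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # second fold: shifted forward in time
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # remaining channels untouched
    return out.reshape(nt, c, h, w)

frames = np.random.rand(2 * 8, 16, 7, 7).astype('float32')   # 2 clips, num_seg=8, toy sizes
shifted = temporal_shift_np(frames, num_seg=8, shift_ratio=1.0 / 8)
print(shifted.shape)                                          # (16, 16, 7, 7)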
+ +# reference: https://arxiv.org/abs/1905.02244 + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# Download URL of pretrained model +# MODEL_URLS = { +# "MobileNetV3_small_x1_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_ssld_pretrained.pdparams", +# "MobileNetV3_large_x1_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_ssld_pretrained.pdparams", +# } + +MODEL_STAGES_PATTERN = { + "MobileNetV3_small": ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"], + "MobileNetV3_large": + ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"] +} + +# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively. +# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s. +# k: kernel_size +# exp: middle channel number in depthwise block +# c: output channel number in depthwise block +# se: whether to use SE block +# act: which activation to use +# s: stride in depthwise block +NET_CONFIG = { + "large": [ + # k, exp, c, se, act, s + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], + ], + "small": [ + # k, exp, c, se, act, s + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ] +} +# first conv output channel number in MobileNetV3 +STEM_CONV_NUMBER = 16 +# last second conv output channel for "small" +LAST_SECOND_CONV_SMALL = 576 +# last second conv output channel for "large" +LAST_SECOND_CONV_LARGE = 960 +# last conv output channel number for "large" and "small" +LAST_CONV = 1280 + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class MobileNetV3(nn.Layer): + """ + MobileNetV3 + Args: + config: list. MobileNetV3 depthwise blocks config. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=16. 
The output channel number of first convolution layer. + class_squeeze: int=960. The output channel number of penultimate convolution layer. + class_expand: int=1280. The output channel number of last convolution layer. + dropout_prob: float=0.2. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV3 model depends on args. + """ + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=400, + inplanes=STEM_CONV_NUMBER, + class_squeeze=LAST_SECOND_CONV_LARGE, + class_expand=LAST_CONV, + dropout_prob=0.2, + num_seg=8, + pretrained=None, + return_patterns=None, + return_stages=None): + super().__init__() + + self.cfg = config + self.scale = scale + self.inplanes = inplanes + self.class_squeeze = class_squeeze + self.class_expand = class_expand + self.class_num = class_num + self.num_seg = num_seg + self.pretrained = pretrained + + self.conv = ConvBNLayer(in_c=3, + out_c=_make_divisible(self.inplanes * + self.scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hardswish") + + self.blocks = nn.Sequential(*[ + ResidualUnit(in_c=_make_divisible(self.inplanes * self.scale if i == + 0 else self.cfg[i - 1][2] * + self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + use_se=se, + num_seg=self.num_seg, + act=act) + for i, (k, exp, c, se, act, s) in enumerate(self.cfg) + ]) + + self.last_second_conv = ConvBNLayer( + in_c=_make_divisible(self.cfg[-1][2] * self.scale), + out_c=_make_divisible(self.scale * self.class_squeeze), + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act="hardswish") + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D(in_channels=_make_divisible(self.scale * + self.class_squeeze), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + + self.hardswish = nn.Hardswish() + if dropout_prob is not None: + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.dropout = None + + self.fc = Linear(self.class_expand, class_num) + + def init_weights(self): + """Initiate the parameters. 
+ """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.last_second_conv(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + if self.dropout is not None: + x = self.dropout(x) + + # feature aggregation for video + x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]]) + x = paddle.mean(x, axis=1) + x = paddle.reshape(x, shape=[-1, self.class_expand]) + + x = self.fc(x) + + return x + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + if_act=True, + act=None): + super().__init__() + + self.conv = Conv2D(in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm(num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + num_seg=8, + act=None): + super().__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + self.num_seg = num_seg + + self.expand_conv = ConvBNLayer(in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer(in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_c) + self.linear_conv = ConvBNLayer(in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, x): + identity = x + + if self.if_shortcut: + x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg) + + x = self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(identity, x) + return x + + +# nn.Hardsigmoid can't transfer "slope" and "offset" in nn.functional.hardsigmoid +class Hardsigmoid(nn.Layer): + def __init__(self, slope=0.2, offset=0.5): + super().__init__() + self.slope = slope + self.offset = offset + + def forward(self, x): + return nn.functional.hardsigmoid(x, + slope=self.slope, + offset=self.offset) + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D(in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D(in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + return paddle.multiply(x=identity, y=x) + + +def 
PPTSM_MobileNetV3_small_x1_0(pretrained=None, **kwargs): + """ + MobileNetV3_small_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + pretrained=pretrained, + **kwargs) + return model + + +@BACKBONES.register() +def PPTSM_MobileNetV3(pretrained=None, **kwargs): + """ + MobileNetV3_large_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + pretrained=pretrained, + **kwargs) + return model diff --git a/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py b/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py new file mode 100644 index 000000000..07dc5bf4e --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/pptsm_v2.py @@ -0,0 +1,405 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# MODEL_URLS = { +# "PPLCNetV2": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_ssld_pretrained.pdparams", +# } + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +NET_CONFIG = { + # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut + "stage1": [64, 3, False, False, False, False], + "stage2": [128, 3, False, False, False, False], + "stage3": [256, 5, True, True, True, False], + "stage4": [512, 5, False, True, False, True], +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class GlobalAttention(nn.Layer): + """ + Lightweight temporal attention module. 
+ """ + + def __init__(self, num_seg=8): + super().__init__() + self.fc = nn.Linear(in_features=num_seg, + out_features=num_seg, + weight_attr=ParamAttr(learning_rate=5.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0, + regularizer=L2Decay(0.0))) + self.num_seg = num_seg + + def forward(self, x): + _, C, H, W = x.shape + x0 = x + + x = x.reshape([-1, self.num_seg, C * H * W]) + x = paddle.mean(x, axis=2) # efficient way of avg_pool + x = x.squeeze(axis=-1) + x = self.fc(x) + attention = F.sigmoid(x) + attention = attention.reshape( + (-1, self.num_seg, 1, 1, 1)) #for broadcast + + x0 = x0.reshape([-1, self.num_seg, C, H, W]) + y = paddle.multiply(x0, attention) + y = y.reshape_([-1, C, H, W]) + return y + + +class ConvBNLayer(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm2D(out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.use_act: + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class SEModule(nn.Layer): + + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D(in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D(in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class RepDepthwiseSeparable(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + stride, + dw_size=3, + split_pw=False, + use_rep=False, + use_se=False, + use_shortcut=False): + super().__init__() + self.is_repped = False + + self.dw_size = dw_size + self.split_pw = split_pw + self.use_rep = use_rep + self.use_se = use_se + self.use_shortcut = True if use_shortcut and stride == 1 and in_channels == out_channels else False + + if self.use_rep: + self.dw_conv_list = nn.LayerList() + for kernel_size in range(self.dw_size, 0, -2): + if kernel_size == 1 and stride != 1: + continue + dw_conv = ConvBNLayer(in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + use_act=False) + self.dw_conv_list.append(dw_conv) + self.dw_conv = nn.Conv2D(in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + padding=(dw_size - 1) // 2, + groups=in_channels) + else: + self.dw_conv = ConvBNLayer(in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels) + + self.act = nn.ReLU() + + if use_se: + self.se = SEModule(in_channels) + + if self.split_pw: + pw_ratio = 0.5 + self.pw_conv_1 = ConvBNLayer(in_channels=in_channels, + kernel_size=1, + out_channels=int(out_channels * + pw_ratio), + stride=1) + self.pw_conv_2 = ConvBNLayer(in_channels=int(out_channels * + pw_ratio), + 
kernel_size=1, + out_channels=out_channels, + stride=1) + else: + self.pw_conv = ConvBNLayer(in_channels=in_channels, + kernel_size=1, + out_channels=out_channels, + stride=1) + + def forward(self, x): + if self.use_rep: + input_x = x + if self.is_repped: + x = self.act(self.dw_conv(x)) + else: + y = self.dw_conv_list[0](x) + for dw_conv in self.dw_conv_list[1:]: + y += dw_conv(x) + x = self.act(y) + else: + x = self.dw_conv(x) + + if self.use_se: + x = self.se(x) + if self.split_pw: + x = self.pw_conv_1(x) + x = self.pw_conv_2(x) + else: + x = self.pw_conv(x) + if self.use_shortcut: + x = x + input_x + return x + + def rep(self): + if self.use_rep: + self.is_repped = True + kernel, bias = self._get_equivalent_kernel_bias() + self.dw_conv.weight.set_value(kernel) + self.dw_conv.bias.set_value(bias) + + def _get_equivalent_kernel_bias(self): + kernel_sum = 0 + bias_sum = 0 + for dw_conv in self.dw_conv_list: + kernel, bias = self._fuse_bn_tensor(dw_conv) + kernel = self._pad_tensor(kernel, to_size=self.dw_size) + kernel_sum += kernel + bias_sum += bias + return kernel_sum, bias_sum + + def _fuse_bn_tensor(self, branch): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def _pad_tensor(self, tensor, to_size): + from_size = tensor.shape[-1] + if from_size == to_size: + return tensor + pad = (to_size - from_size) // 2 + return F.pad(tensor, [pad, pad, pad, pad]) + + +class PPTSM_v2_LCNet(nn.Layer): + + def __init__(self, + scale, + depths, + class_num=400, + dropout_prob=0, + num_seg=8, + use_temporal_att=False, + pretrained=None, + use_last_conv=True, + class_expand=1280): + super().__init__() + self.scale = scale + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.num_seg = num_seg + self.use_temporal_att = use_temporal_att + self.pretrained = pretrained + + self.stem = nn.Sequential(*[ + ConvBNLayer(in_channels=3, + kernel_size=3, + out_channels=make_divisible(32 * scale), + stride=2), + RepDepthwiseSeparable(in_channels=make_divisible(32 * scale), + out_channels=make_divisible(64 * scale), + stride=1, + dw_size=3) + ]) + + # stages + self.stages = nn.LayerList() + for depth_idx, k in enumerate(NET_CONFIG): + in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut = NET_CONFIG[ + k] + self.stages.append( + nn.Sequential(*[ + RepDepthwiseSeparable(in_channels=make_divisible( + (in_channels if i == 0 else in_channels * 2) * scale), + out_channels=make_divisible( + in_channels * 2 * scale), + stride=2 if i == 0 else 1, + dw_size=kernel_size, + split_pw=split_pw, + use_rep=use_rep, + use_se=use_se, + use_shortcut=use_shortcut) + for i in range(depths[depth_idx]) + ])) + + self.avg_pool = AdaptiveAvgPool2D(1) + + if self.use_last_conv: + self.last_conv = Conv2D(in_channels=make_divisible( + NET_CONFIG["stage4"][0] * 2 * scale), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = nn.ReLU() + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + in_features = self.class_expand if self.use_last_conv else NET_CONFIG[ + "stage4"][0] * 2 * scale + self.fc = Linear(in_features, class_num) + if self.use_temporal_att: + self.global_attention = GlobalAttention(num_seg=self.num_seg) + 
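The re-parameterization path in `RepDepthwiseSeparable` above sums several depthwise Conv + BatchNorm branches during training and folds them into the single `dw_conv` at inference time via `rep()`. The core step is fusing one Conv + BN branch into a plain convolution; a standalone sketch of that fusion (the helper name is hypothetical, not part of the patch):

import paddle

def fuse_conv_bn(conv_weight, bn_mean, bn_var, bn_gamma, bn_beta, eps=1e-5):
    # W' = W * gamma / sqrt(var + eps)
    # b' = beta - mean * gamma / sqrt(var + eps)
    # This mirrors what _fuse_bn_tensor computes per branch; _pad_tensor then
    # zero-pads smaller kernels (e.g. 1x1 -> 3x3) so the fused branches can be
    # summed into one equivalent depthwise kernel and bias.
    std = (bn_var + eps).sqrt()
    scale = (bn_gamma / std).reshape([-1, 1, 1, 1])
    return conv_weight * scale, bn_beta - bn_mean * bn_gamma / std

Once `rep()` has been called, `forward` takes the `is_repped` branch and evaluates only the plain `dw_conv`, removing the extra branches from the inference graph.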
+ def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, x): + x = self.stem(x) + count = 0 + for stage in self.stages: + # only add temporal attention and tsm in stage3 for efficiency + if count == 2: + # add temporal attention + if self.use_temporal_att: + x = self.global_attention(x) + x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg) + count += 1 + x = stage(x) + + x = self.avg_pool(x) + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + + # Feature aggregation + x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]]) + x = paddle.mean(x, axis=1) + x = paddle.reshape(x, shape=[-1, self.class_expand]) + + x = self.fc(x) + return x + + +@BACKBONES.register() +def PPTSM_v2(pretrained=None, use_ssld=False, **kwargs): + """ + PP-TSM_v2 model. + Args: + pretrained: str, means the path of the pretrained model. + Returns: + model: nn.Layer. + """ + model = PPTSM_v2_LCNet(pretrained=pretrained, + scale=1.0, + depths=[2, 2, 6, 2], + dropout_prob=0.2, + **kwargs) + return model diff --git a/docs/src/paddlevideo/modeling/backbones/resnet.py b/docs/src/paddlevideo/modeling/backbones/resnet.py new file mode 100644 index 000000000..2f07991a2 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet.py @@ -0,0 +1,283 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. 
+ + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels, + weight_attr=ParamAttr(name=bn_name + + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset")) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNet(nn.Layer): + """ResNet backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, depth, pretrained=None): + super(ResNet, self).__init__() + self.pretrained = pretrained + self.layers = depth + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = [64, 256, 512, 1024] + out_channels = [64, 128, 256, 512] + + self.conv = ConvBNLayer(in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + # NOTE: Be careful! Here is different from TSM model. + in_channels=in_channels[block] + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + y = self.conv(inputs) + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/paddlevideo/modeling/backbones/resnet3d.py b/docs/src/paddlevideo/modeling/backbones/resnet3d.py new file mode 100644 index 000000000..33edefe61 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet3d.py @@ -0,0 +1,641 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +import collections +from itertools import repeat + +import paddle +from paddle import nn + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +_triple = _ntuple(3) + + +class ConvBNLayer(nn.Layer): + """A conv block that bundles conv/norm/activation layers. + + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + padding=0, + stride=1, + dilation=1, + groups=1, + act=None, + bias=None, + ): + super(ConvBNLayer, self).__init__() + + self._conv = nn.Conv3D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias) + + self._batch_norm = nn.BatchNorm3D(out_channels, momentum=0.1) + self.act = act + if act is not None: + self._act_op = nn.ReLU() + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act_op(y) + + return y + + +class Bottleneck3d(nn.Layer): + """Bottleneck 3d block for ResNet3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + spatial_stride (int): Spatial stride in the conv3d layer. Default: 1. + temporal_stride (int): Temporal stride in the conv3d layer. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + downsample (nn.Module | None): Downsample layer. Default: None. + inflate (bool): Whether to inflate kernel. Default: True. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Default: '3x1x1'. + non_local (bool): Determine whether to apply non-local module in this + block. Default: False. + non_local_cfg (dict): Config for non-local module. Default: ``dict()``. + conv_cfg (dict): Config dict for convolution layer. + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type``, + Default: ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + spatial_stride=1, + temporal_stride=1, + dilation=1, + downsample=None, + inflate=True, + inflate_style='3x1x1', + non_local=False, + non_local_cfg=dict(), + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + assert inflate_style in ['3x1x1', '3x3x3'] + + self.inplanes = inplanes + self.planes = planes + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + self.inflate = inflate + self.inflate_style = inflate_style + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.act_cfg = act_cfg + self.with_cp = with_cp + self.non_local = non_local + self.non_local_cfg = non_local_cfg + + self.conv1_stride_s = 1 + self.conv2_stride_s = spatial_stride + self.conv1_stride_t = 1 + self.conv2_stride_t = temporal_stride + + if self.inflate: + if inflate_style == '3x1x1': + conv1_kernel_size = (3, 1, 1) + conv1_padding = (1, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (3, 3, 3) + conv2_padding = (1, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + self.conv1 = ConvBNLayer( + in_channels=inplanes, + out_channels=planes, + kernel_size=conv1_kernel_size, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=conv1_padding, + bias=False, + act='relu') + + self.conv2 = ConvBNLayer( + in_channels=planes, + out_channels=planes, + kernel_size=conv2_kernel_size, + stride=(self.conv2_stride_t, self.conv2_stride_s, + self.conv2_stride_s), + padding=conv2_padding, + dilation=(1, dilation, dilation), + bias=False, + act='relu') + + self.conv3 = ConvBNLayer( + in_channels=planes, + out_channels=planes * self.expansion, + kernel_size=1, + bias=False, + act=None, + ) + + self.downsample = downsample + self.relu = nn.ReLU() + + def forward(self, x): + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + out = _inner_forward(x) + out = self.relu(out) + + if self.non_local: + out = self.non_local_block(out) + + return out + + +class ResNet3d(nn.Layer): + """ResNet 3d backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + pretrained (str | None): Name of pretrained model. + stage_blocks (tuple | None): Set number of stages for each res layer. + Default: None. + pretrained2d (bool): Whether to load pretrained 2D model. + Default: True. + in_channels (int): Channel num of input features. Default: 3. + base_channels (int): Channel num of stem output features. Default: 64. + out_indices (Sequence[int]): Indices of output feature. Default: (3, ). + num_stages (int): Resnet stages. Default: 4. + spatial_strides (Sequence[int]): + Spatial strides of residual blocks of each stage. + Default: ``(1, 2, 2, 2)``. + temporal_strides (Sequence[int]): + Temporal strides of residual blocks of each stage. + Default: ``(1, 1, 1, 1)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. 
+ conv1_kernel (Sequence[int]): Kernel size of the first conv layer. + Default: ``(3, 7, 7)``. + conv1_stride_s (int): Spatial stride of the first conv layer. + Default: 2. + conv1_stride_t (int): Temporal stride of the first conv layer. + Default: 1. + pool1_stride_s (int): Spatial stride of the first pooling layer. + Default: 2. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Default: 1. + with_pool2 (bool): Whether to use pool2. Default: True. + inflate (Sequence[int]): Inflate Dims of each block. + Default: (1, 1, 1, 1). + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Default: '3x1x1'. + conv_cfg (dict): Config for conv layers. required keys are ``type`` + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type`` and + ``requires_grad``. + Default: ``dict(type='BN3d', requires_grad=True)``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + non_local (Sequence[int]): Determine whether to apply non-local module + in the corresponding block of each stages. Default: (0, 0, 0, 0). + non_local_cfg (dict): Config for non-local module. Default: ``dict()``. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Default: True. + kwargs (dict, optional): Key arguments for "make_res_layer". + """ + + arch_settings = { + 50: (Bottleneck3d, (3, 4, 6, 3)), + 101: (Bottleneck3d, (3, 4, 23, 3)), + 152: (Bottleneck3d, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + stage_blocks=None, + pretrained2d=True, + in_channels=3, + num_stages=4, + base_channels=64, + out_indices=(3, ), + spatial_strides=(1, 2, 2, 2), + temporal_strides=(1, 1, 1, 1), + dilations=(1, 1, 1, 1), + conv1_kernel=(3, 7, 7), + conv1_stride_s=2, + conv1_stride_t=1, + pool1_stride_s=2, + pool1_stride_t=1, + with_pool1=True, + with_pool2=True, + inflate=(1, 1, 1, 1), + inflate_style='3x1x1', + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + norm_eval=False, + with_cp=False, + non_local=(0, 0, 0, 0), + non_local_cfg=dict(), + zero_init_residual=True, + **kwargs): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.pretrained2d = pretrained2d + self.in_channels = in_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.stage_blocks = stage_blocks + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.spatial_strides = spatial_strides + self.temporal_strides = temporal_strides + self.dilations = dilations + assert len(spatial_strides) == len(temporal_strides) == len( + dilations) == num_stages + if self.stage_blocks is not None: + assert len(self.stage_blocks) == num_stages + + self.conv1_kernel = conv1_kernel + self.conv1_stride_s = conv1_stride_s + self.conv1_stride_t = conv1_stride_t + self.pool1_stride_s = pool1_stride_s + self.pool1_stride_t = pool1_stride_t + self.with_pool1 = with_pool1 + self.with_pool2 = with_pool2 + self.stage_inflations = _ntuple(num_stages)(inflate) + 
self.non_local_stages = _ntuple(num_stages)(non_local) + self.inflate_style = inflate_style + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + self.block, stage_blocks = self.arch_settings[depth] + + if self.stage_blocks is None: + self.stage_blocks = stage_blocks[:num_stages] + + self.inplanes = self.base_channels + + self.non_local_cfg = non_local_cfg + + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + spatial_stride = spatial_strides[i] + temporal_stride = temporal_strides[i] + dilation = dilations[i] + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + non_local=self.non_local_stages[i], + non_local_cfg=self.non_local_cfg, + inflate=self.stage_inflations[i], + inflate_style=self.inflate_style, + with_cp=with_cp, + **kwargs) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_sublayer(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * self.base_channels * 2**( + len(self.stage_blocks) - 1) + + @staticmethod + def make_res_layer(block, + inplanes, + planes, + blocks, + spatial_stride=1, + temporal_stride=1, + dilation=1, + inflate=1, + inflate_style='3x1x1', + non_local=0, + non_local_cfg=dict(), + norm_cfg=None, + act_cfg=None, + conv_cfg=None, + with_cp=False, + **kwargs): + """Build residual layer for ResNet3D. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature + in each block. + planes (int): Number of channels for the output feature + in each block. + blocks (int): Number of residual blocks. + spatial_stride (int | Sequence[int]): Spatial strides in + residual and conv layers. Default: 1. + temporal_stride (int | Sequence[int]): Temporal strides in + residual and conv layers. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + inflate (int | Sequence[int]): Determine whether to inflate + for each block. Default: 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + the kernel sizes and padding strides for conv1 and conv2 + in each block. Default: '3x1x1'. + non_local (int | Sequence[int]): Determine whether to apply + non-local module in the corresponding block of each stages. + Default: 0. + non_local_cfg (dict): Config for non-local module. + Default: ``dict()``. + conv_cfg (dict | None): Config for norm layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool | None): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. 
+ """ + inflate = inflate if not isinstance(inflate, + int) else (inflate, ) * blocks + non_local = non_local if not isinstance(non_local, + int) else (non_local, ) * blocks + assert len(inflate) == blocks and len(non_local) == blocks + downsample = None + if spatial_stride != 1 or inplanes != planes * block.expansion: + downsample = ConvBNLayer( + in_channels=inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=(temporal_stride, spatial_stride, spatial_stride), + bias=False, + act=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + downsample=downsample, + inflate=(inflate[0] == 1), + inflate_style=inflate_style, + non_local=(non_local[0] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + spatial_stride=1, + temporal_stride=1, + dilation=dilation, + inflate=(inflate[i] == 1), + inflate_style=inflate_style, + non_local=(non_local[i] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + return nn.Sequential(*layers) + + @staticmethod + def _inflate_conv_params(conv3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a conv module from 2d to 3d. + + Args: + conv3d (nn.Module): The destination conv3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding conv module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + weight_2d_name = module_name_2d + '.weight' + + conv2d_weight = state_dict_2d[weight_2d_name] + kernel_t = conv3d.weight.data.shape[2] + + new_weight = conv2d_weight.data.unsqueeze(2).expand_as( + conv3d.weight) / kernel_t + conv3d.weight.data.copy_(new_weight) + inflated_param_names.append(weight_2d_name) + + if getattr(conv3d, 'bias') is not None: + bias_2d_name = module_name_2d + '.bias' + conv3d.bias.data.copy_(state_dict_2d[bias_2d_name]) + inflated_param_names.append(bias_2d_name) + + @staticmethod + def _inflate_bn_params(bn3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a norm module from 2d to 3d. + + Args: + bn3d (nn.Module): The destination bn3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding bn module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + for param_name, param in bn3d.named_parameters(): + param_2d_name = f'{module_name_2d}.{param_name}' + param_2d = state_dict_2d[param_2d_name] + if param.data.shape != param_2d.shape: + warnings.warn(f'The parameter of {module_name_2d} is not' + 'loaded due to incompatible shapes. 
') + return + + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + for param_name, param in bn3d.named_buffers(): + param_2d_name = f'{module_name_2d}.{param_name}' + # some buffers like num_batches_tracked may not exist in old + # checkpoints + if param_2d_name in state_dict_2d: + param_2d = state_dict_2d[param_2d_name] + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + def _make_stem_layer(self): + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + + self.conv1 = ConvBNLayer( + in_channels=self.in_channels, + out_channels=self.base_channels, + kernel_size=self.conv1_kernel, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=tuple([(k - 1) // 2 for k in _triple(self.conv1_kernel)]), + bias=False, + act="relu") + + self.maxpool = nn.MaxPool3D( + kernel_size=(1, 3, 3), + stride=(self.pool1_stride_t, self.pool1_stride_s, + self.pool1_stride_s), + padding=(0, 1, 1)) + + self.pool2 = nn.MaxPool3D(kernel_size=(2, 1, 1), stride=(2, 1, 1)) + + @staticmethod + def _init_weights(self, pretrained=None): + pass + + def init_weights(self, pretrained=None): + self._init_weights(self, pretrained) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. + """ + x = self.conv1(x) + if self.with_pool1: + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i == 0 and self.with_pool2: + x = self.pool2(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def train(self, mode=True): + """Set the optimization status when training.""" + super().train() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, paddle.nn._BatchNormBase): + m.eval() diff --git a/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py b/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py new file mode 100644 index 000000000..eb5b0807a --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet3d_slowonly.py @@ -0,0 +1,214 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings + +import paddle +import paddle.nn as nn + +from .resnet3d import ResNet3d, ConvBNLayer +from ..registry import BACKBONES + + +@BACKBONES.register() +class ResNet3dSlowOnly(ResNet3d): + """A pathway of Slowfast based on ResNet3d. + + Args: + *args (arguments): Arguments same as :class:``ResNet3d``. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to ``beta`` in the paper. + Default: 8. + **kwargs (keyword arguments): Keywords arguments for ResNet3d. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.inplanes = self.base_channels + + self.lateral_connections = [] + for i in range(len(self.stage_blocks)): + planes = self.base_channels * 2**i + self.inplanes = planes * self.block.expansion + + def make_res_layer(self, + block, + inplanes, + planes, + blocks, + spatial_stride=1, + temporal_stride=1, + dilation=1, + inflate=1, + inflate_style='3x1x1', + non_local=0, + non_local_cfg=dict(), + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + with_cp=False): + """Build residual layer for Slowfast. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input + feature in each block. + planes (int): Number of channels for the output + feature in each block. + blocks (int): Number of residual blocks. + spatial_stride (int | Sequence[int]): Spatial strides + in residual and conv layers. Default: 1. + temporal_stride (int | Sequence[int]): Temporal strides in + residual and conv layers. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + inflate (int | Sequence[int]): Determine whether to inflate + for each block. Default: 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + the kernel sizes and padding strides for conv1 and + conv2 in each block. Default: ``3x1x1``. + non_local (int | Sequence[int]): Determine whether to apply + non-local module in the corresponding block of each stages. + Default: 0. + non_local_cfg (dict): Config for non-local module. + Default: ``dict()``. + conv_cfg (dict | None): Config for conv layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. + """ + inflate = inflate if not isinstance(inflate, + int) else (inflate, ) * blocks + non_local = non_local if not isinstance(non_local, + int) else (non_local, ) * blocks + assert len(inflate) == blocks and len(non_local) == blocks + + lateral_inplanes = 0 + if (spatial_stride != 1 + or (inplanes + lateral_inplanes) != planes * block.expansion): + downsample = ConvBNLayer( + in_channels=inplanes + lateral_inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=(temporal_stride, spatial_stride, spatial_stride), + bias=False, + act=None) + else: + downsample = None + + layers = [] + layers.append( + block( + inplanes + lateral_inplanes, + planes, + spatial_stride, + temporal_stride, + dilation, + downsample, + inflate=(inflate[0] == 1), + inflate_style=inflate_style, + non_local=(non_local[0] == 1), + non_local_cfg=non_local_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + inplanes = planes * block.expansion + + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + 1, + 1, + dilation, + inflate=(inflate[i] == 1), + inflate_style=inflate_style, + non_local=(non_local[i] == 1), + non_local_cfg=non_local_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + + return nn.Sequential(*layers) + + def _inflate_conv_params(self, conv3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a conv module from 2d to 3d. 
+ + The differences of conv modules betweene 2d and 3d in Pathway + mainly lie in the inplanes due to lateral connections. To fit the + shapes of the lateral connection counterpart, it will expand + parameters by concatting conv2d parameters and extra zero paddings. + + Args: + conv3d (nn.Module): The destination conv3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding conv module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + weight_2d_name = module_name_2d + '.weight' + conv2d_weight = state_dict_2d[weight_2d_name] + old_shape = conv2d_weight.shape + new_shape = conv3d.weight.data.shape + kernel_t = new_shape[2] + + if new_shape[1] != old_shape[1]: + if new_shape[1] < old_shape[1]: + warnings.warn(f'The parameter of {module_name_2d} is not' + 'loaded due to incompatible shapes. ') + return + # Inplanes may be different due to lateral connections + new_channels = new_shape[1] - old_shape[1] + pad_shape = old_shape + pad_shape = pad_shape[:1] + (new_channels, ) + pad_shape[2:] + # Expand parameters by concat extra channels + conv2d_weight = paddle.concat( + (conv2d_weight, paddle.zeros(pad_shape)), axis=1) + + new_weight = conv2d_weight.data.unsqueeze(2).expand_as( + conv3d.weight) / kernel_t + conv3d.weight.data.copy_(new_weight) + inflated_param_names.append(weight_2d_name) + + if getattr(conv3d, 'bias') is not None: + bias_2d_name = module_name_2d + '.bias' + conv3d.bias.data.copy_(state_dict_2d[bias_2d_name]) + inflated_param_names.append(bias_2d_name) + + +if __name__ == '__main__': + net = ResNet3dSlowOnly( + depth=50, + in_channels=17, + base_channels=32, + conv1_kernel=(1, 7, 7), + num_stages=3, + out_indices=[2], + stage_blocks=[3, 4, 6], + conv1_stride_s=1, + pool1_stride_s=1, + inflate=[0, 1, 1], + with_pool2=False, + spatial_strides=[2, 2, 2], + temporal_strides=[1, 1, 2], + dilations=[1, 1, 1]) + pass diff --git a/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py b/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py new file mode 100644 index 000000000..a67915946 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet_slowfast.py @@ -0,0 +1,795 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
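`_inflate_conv_params` above adapts 2D-pretrained weights for the 3D SlowOnly pathway: input channels added by lateral connections are zero-padded, and each 2D kernel is repeated along a new temporal axis and divided by the temporal kernel size, so a temporally constant clip initially produces the same response as the 2D model. A minimal standalone sketch of the inflation step (the function name is hypothetical):

import paddle

def inflate_2d_kernel(weight_2d, kernel_t):
    # [C_out, C_in, k, k] -> [C_out, C_in, kernel_t, k, k]
    weight_3d = weight_2d.unsqueeze(2).tile([1, 1, kernel_t, 1, 1])
    # Dividing by kernel_t keeps the response summed over the temporal
    # kernel equal to the original 2D activation.
    return weight_3d / kernel_t

kernel_2d = paddle.rand([64, 3, 7, 7])
kernel_3d = inflate_2d_kernel(kernel_2d, kernel_t=3)   # shape [64, 3, 3, 7, 7]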
+ +import paddle +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal +from ..registry import BACKBONES +from paddlevideo.utils.multigrid import get_norm +import sys +import numpy as np +import paddle.distributed as dist + +# seed random seed +paddle.framework.seed(0) + + +# get init parameters for conv layer +def get_conv_init(fan_out): + return KaimingNormal(fan_in=fan_out) + + +def get_bn_param_attr(bn_weight=1.0, coeff=0.0): + param_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(bn_weight), + regularizer=paddle.regularizer.L2Decay(coeff)) + return param_attr + + +"""Video models.""" + + +class BottleneckTransform(paddle.nn.Layer): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._stride_1x1 = stride_1x1 + self.norm_module = norm_module + self._construct(dim_in, dim_out, stride, dim_inner, num_groups, + dilation) + + def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups, + dilation): + str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride) + + fan = (dim_inner) * (self.temp_kernel_size * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.a = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.a_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x3x3, BN, ReLU. + fan = (dim_inner) * (1 * 3 * 3) + initializer_tmp = get_conv_init(fan) + + self.b = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_inner, + kernel_size=[1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + dilation=[1, dilation, dilation], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.b_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x1x1, BN. 
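+        # (note) This final 1x1x1 projection maps dim_inner back up to dim_out so
+        # the residual sum matches the shortcut branch. Its BN scale is constant-
+        # initialized to 0 (get_bn_param_attr(bn_weight=0.0)), so the residual
+        # branch initially contributes ~0 and each block starts out close to its
+        # shortcut.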
+ fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.c = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.c_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(bn_weight=0.0), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = F.relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = F.relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(paddle.nn.Layer): + """ + Residual block. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + self.branch1 = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False, + dilation=1) + self.branch1_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + self.branch2 = BottleneckTransform(dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=self.norm_module) + + def forward(self, x): + if hasattr(self, "branch1"): + x1 = self.branch1(x) + x1 = self.branch1_bn(x1) + x2 = self.branch2(x) + x = paddle.add(x=x1, y=x2) + else: + x2 = self.branch2(x) + x = paddle.add(x=x, y=x2) + + x = F.relu(x) + return x + + +class ResStage(paddle.nn.Layer): + """ + Stage of 3D ResNet. 
It expects to have one or more tensors as input for
+    multi-pathway (SlowFast) cases. More details can be found here:
+
+    Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
+    "Slowfast networks for video recognition."
+    https://arxiv.org/pdf/1812.03982.pdf
+    """
+    def __init__(self,
+                 dim_in,
+                 dim_out,
+                 stride,
+                 temp_kernel_sizes,
+                 num_blocks,
+                 dim_inner,
+                 num_groups,
+                 num_block_temp_kernel,
+                 dilation,
+                 stride_1x1=False,
+                 inplace_relu=True,
+                 norm_module=paddle.nn.BatchNorm3D):
+        """
+        The `__init__` method of any subclass should also contain these arguments.
+        ResStage builds p streams, where p can be greater or equal to one.
+        Args:
+            dim_in (list): list of the p channel dimensions of the input.
+                Different channel dimensions control the input dimension of
+                different pathways.
+            dim_out (list): list of the p channel dimensions of the output.
+                Different channel dimensions control the output dimension of
+                different pathways.
+            temp_kernel_sizes (list): list of the p temporal kernel sizes of the
+                convolution in the bottleneck. Different temp_kernel_sizes
+                control different pathways.
+            stride (list): list of the p strides of the bottleneck. Different
+                strides control different pathways.
+            num_blocks (list): list of the p numbers of blocks for each of the
+                pathways.
+            dim_inner (list): list of the p inner channel dimensions of the
+                input. Different channel dimensions control the input dimension
+                of different pathways.
+            num_groups (list): list of the p numbers of groups for the
+                convolution. num_groups=1 is for standard ResNet like networks,
+                and num_groups>1 is for ResNeXt like networks.
+            num_block_temp_kernel (list): extend the temp_kernel_sizes to
+                num_block_temp_kernel blocks, then fill a temporal kernel size
+                of 1 for the rest of the layers.
+            dilation (list): size of dilation for each pathway.
+ """ + super(ResStage, self).__init__() + assert all((num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)))) + self.num_blocks = num_blocks + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert (len({ + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + }) == 1) + self.num_pathways = len(self.num_blocks) + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=self.norm_module) + self.add_sublayer("pathway{}_res{}".format(pathway, i), + res_block) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + output.append(x) + + return output + + +class ResNetBasicStem(paddle.nn.Layer): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2]) + initializer_tmp = get_conv_init(fan) + + self._conv = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=self.kernel, + stride=self.stride, + padding=self.padding, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = self.norm_module(num_features=dim_out, + epsilon=self.eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x = self._conv(x) + x = self._bn(x) + x = F.relu(x) + + x = F.max_pool3d(x=x, + kernel_size=[1, 3, 3], + stride=[1, 2, 2], + padding=[0, 1, 1], + data_format="NCDHW") + return x + + +class VideoModelStem(paddle.nn.Layer): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for slow and fast pathways. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. 
+ stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + eps (float): epsilon for batch norm. + """ + super(VideoModelStem, self).__init__() + + assert (len({ + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + }) == 1), "Input pathway dimensions are not consistent." + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + for pathway in range(len(dim_in)): + stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway], + self.kernel[pathway], self.stride[pathway], + self.padding[pathway], self.eps, + self.norm_module) + self.add_sublayer("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert (len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + x[pathway] = m(x[pathway]) + + return x + + +class FuseFastToSlow(paddle.nn.Layer): + """ + Fuses the information from the Fast pathway to the Slow pathway. Given the + tensors from Slow pathway and Fast pathway, fuse information from Fast to + Slow, then return the fused tensors from Slow and Fast pathway in order. + """ + def __init__(self, + dim_in, + fusion_conv_channel_ratio, + fusion_kernel, + alpha, + fuse_bn_relu=1, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimension of the input. + fusion_conv_channel_ratio (int): channel ratio for the convolution + used to fuse from Fast pathway to Slow pathway. + fusion_kernel (int): kernel size of the convolution used to fuse + from Fast pathway to Slow pathway. + alpha (int): the frame rate ratio between the Fast and Slow pathway. + eps (float): epsilon for batch norm. + """ + super(FuseFastToSlow, self).__init__() + self.fuse_bn_relu = fuse_bn_relu + fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self._conv_f2s = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_in * fusion_conv_channel_ratio, + kernel_size=[fusion_kernel, 1, 1], + stride=[alpha, 1, 1], + padding=[fusion_kernel // 2, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio, + epsilon=eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x_s = x[0] + x_f = x[1] + fuse = self._conv_f2s(x_f) + # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve. + if self.fuse_bn_relu: + fuse = self._bn(fuse) + fuse = F.relu(fuse) + x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None) + + return [x_s_fuse, x_f] + + +@BACKBONES.register() +class ResNetSlowFast(paddle.nn.Layer): + """ + SlowFast model builder for SlowFast network. + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." 
+ https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__( + self, + alpha, + beta, + bn_norm_type="batchnorm", + bn_num_splits=1, + num_pathways=2, + depth=50, + num_groups=1, + input_channel_num=[3, 3], + width_per_group=64, + fusion_conv_channel_ratio=2, + fusion_kernel_sz=7, #5? + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + fuse_bn_relu = 1, + spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]], + use_pool_af_s2 = 1, + ): + """ + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + super(ResNetSlowFast, self).__init__() + + self.alpha = alpha #8 + self.beta = beta #8 + self.norm_module = get_norm(bn_norm_type, bn_num_splits) + self.num_pathways = num_pathways + self.depth = depth + self.num_groups = num_groups + self.input_channel_num = input_channel_num + self.width_per_group = width_per_group + self.fusion_conv_channel_ratio = fusion_conv_channel_ratio + self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement + self.pool_size_ratio = pool_size_ratio + self.fuse_bn_relu = fuse_bn_relu + self.spatial_strides = spatial_strides + self.use_pool_af_s2 = use_pool_af_s2 + self._construct_network() + + def _construct_network(self): + """ + Builds a SlowFast model. + The first pathway is the Slow pathway + and the second pathway is the Fast pathway. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + temp_kernel = [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], + ] # res5 temporal kernel for slow and fast pathway. 
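+        # With the typical SlowFast-8x8 setting (alpha=8, beta=8) and the defaults
+        # width_per_group=64, fusion_conv_channel_ratio=2, the stem below yields 64
+        # Slow channels and 64 // beta = 8 Fast channels; each FuseFastToSlow then
+        # concatenates fusion_conv_channel_ratio * fast_channels onto the Slow
+        # feature, which is where the extra `... // out_dim_ratio` input channels
+        # of s2-s5 come from.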
+ + self.s1 = VideoModelStem( + dim_in=self.input_channel_num, + dim_out=[self.width_per_group, self.width_per_group // self.beta], + kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]], + stride=[[1, 2, 2]] * 2, + padding=[ + [temp_kernel[0][0][0] // 2, 3, 3], + [temp_kernel[0][1][0] // 2, 3, 3], + ], + norm_module=self.norm_module) + self.s1_fuse = FuseFastToSlow( + dim_in=self.width_per_group // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu) + + # ResNet backbone + MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)} + (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth] + + num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]] + spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]] + spatial_strides = self.spatial_strides + #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]] + #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment + + out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4 + dim_inner = self.width_per_group * self.num_groups #64 + + self.s2 = ResStage(dim_in=[ + self.width_per_group + self.width_per_group // out_dim_ratio, + self.width_per_group // self.beta, + ], + dim_out=[ + self.width_per_group * 4, + self.width_per_group * 4 // self.beta, + ], + dim_inner=[dim_inner, dim_inner // self.beta], + temp_kernel_sizes=temp_kernel[1], + stride=spatial_strides[0], + num_blocks=[d2] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[0], + dilation=spatial_dilations[0], + norm_module=self.norm_module) + + self.s2_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 4 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s3 = ResStage( + dim_in=[ + self.width_per_group * 4 + + self.width_per_group * 4 // out_dim_ratio, + self.width_per_group * 4 // self.beta, + ], + dim_out=[ + self.width_per_group * 8, + self.width_per_group * 8 // self.beta, + ], + dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta], + temp_kernel_sizes=temp_kernel[2], + stride=spatial_strides[1], + num_blocks=[d3] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[1], + dilation=spatial_dilations[1], + norm_module=self.norm_module, + ) + + self.s3_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 8 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s4 = ResStage( + dim_in=[ + self.width_per_group * 8 + + self.width_per_group * 8 // out_dim_ratio, + self.width_per_group * 8 // self.beta, + ], + dim_out=[ + self.width_per_group * 16, + self.width_per_group * 16 // self.beta, + ], + dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta], + temp_kernel_sizes=temp_kernel[3], + stride=spatial_strides[2], + num_blocks=[d4] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[2], + dilation=spatial_dilations[2], + norm_module=self.norm_module, + ) + + self.s4_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 16 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + 
fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s5 = ResStage( + dim_in=[ + self.width_per_group * 16 + + self.width_per_group * 16 // out_dim_ratio, + self.width_per_group * 16 // self.beta, + ], + dim_out=[ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ], + dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta], + temp_kernel_sizes=temp_kernel[4], + stride=spatial_strides[3], + num_blocks=[d5] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[3], + dilation=spatial_dilations[3], + norm_module=self.norm_module, + ) + + def init_weights(self): + pass + + def forward(self, x): + x = self.s1(x) #VideoModelStem + x = self.s1_fuse(x) #FuseFastToSlow + x = self.s2(x) #ResStage + x = self.s2_fuse(x) + + # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve. + if self.use_pool_af_s2: + for pathway in range(self.num_pathways): + x[pathway] = F.max_pool3d(x=x[pathway], + kernel_size=self.pool_size_ratio[pathway], + stride=self.pool_size_ratio[pathway], + padding=[0, 0, 0], + data_format="NCDHW") + + x = self.s3(x) + x = self.s3_fuse(x) + x = self.s4(x) + x = self.s4_fuse(x) + x = self.s5(x) + return x diff --git a/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py b/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py new file mode 100644 index 000000000..d348d45cf --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py @@ -0,0 +1,796 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal +from ..registry import BACKBONES +from paddlevideo.utils.multigrid import get_norm +import sys +import numpy as np +import paddle.distributed as dist + +# seed random seed +paddle.framework.seed(0) + + +# get init parameters for conv layer +def get_conv_init(fan_out): + return KaimingNormal(fan_in=fan_out) + + +def get_bn_param_attr(bn_weight=1.0, coeff=0.0): + param_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(bn_weight), + regularizer=paddle.regularizer.L2Decay(coeff)) + return param_attr + + +"""Video models.""" + + +class BottleneckTransform(paddle.nn.Layer): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. 
num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._stride_1x1 = stride_1x1 + self.norm_module = norm_module + self._construct(dim_in, dim_out, stride, dim_inner, num_groups, + dilation) + + def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups, + dilation): + str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride) + + fan = (dim_inner) * (self.temp_kernel_size * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.a = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.a_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x3x3, BN, ReLU. + fan = (dim_inner) * (1 * 3 * 3) + initializer_tmp = get_conv_init(fan) + + self.b = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_inner, + kernel_size=[1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + dilation=[1, dilation, dilation], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.b_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x1x1, BN. + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.c = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.c_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(bn_weight=0.0), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = F.relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = F.relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(paddle.nn.Layer): + """ + Residual block. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. 
+ num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + self.branch1 = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False, + dilation=1) + self.branch1_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + self.branch2 = BottleneckTransform(dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=self.norm_module) + + def forward(self, x): + if hasattr(self, "branch1"): + x1 = self.branch1(x) + x1 = self.branch1_bn(x1) + x2 = self.branch2(x) + x = paddle.add(x=x1, y=x2) + else: + x2 = self.branch2(x) + x = paddle.add(x=x, y=x2) + + x = F.relu(x) + return x + + +class ResStage(paddle.nn.Layer): + """ + Stage of 3D ResNet. It expects to have one or more tensors as input for + multi-pathway (SlowFast) cases. More details can be found here: + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__(self, + dim_in, + dim_out, + stride, + temp_kernel_sizes, + num_blocks, + dim_inner, + num_groups, + num_block_temp_kernel, + dilation, + stride_1x1=False, + inplace_relu=True, + norm_module=paddle.nn.BatchNorm3D): + """ + The `__init__` method of any subclass should also contain these arguments. + ResStage builds p streams, where p can be greater or equal to one. + Args: + dim_in (list): list of p the channel dimensions of the input. + Different channel dimensions control the input dimension of + different pathways. + dim_out (list): list of p the channel dimensions of the output. + Different channel dimensions control the input dimension of + different pathways. + temp_kernel_sizes (list): list of the p temporal kernel sizes of the + convolution in the bottleneck. Different temp_kernel_sizes + control different pathway. + stride (list): list of the p strides of the bottleneck. Different + stride control different pathway. + num_blocks (list): list of p numbers of blocks for each of the + pathway. + dim_inner (list): list of the p inner channel dimensions of the + input. Different channel dimensions control the input dimension + of different pathways. + num_groups (list): list of number of p groups for the convolution. 
+ num_groups=1 is for standard ResNet like networks, and + num_groups>1 is for ResNeXt like networks. + num_block_temp_kernel (list): extent the temp_kernel_sizes to + num_block_temp_kernel blocks, then fill temporal kernel size + of 1 for the rest of the layers. + dilation (list): size of dilation for each pathway. + """ + super(ResStage, self).__init__() + assert all((num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)))) + self.num_blocks = num_blocks + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert (len({ + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + }) == 1) + self.num_pathways = len(self.num_blocks) + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=self.norm_module) + self.add_sublayer("pathway{}_res{}".format(pathway, i), + res_block) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + output.append(x) + + return output + + +class ResNetBasicStem(paddle.nn.Layer): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2]) + initializer_tmp = get_conv_init(fan) + + self._conv = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=self.kernel, + stride=self.stride, + padding=self.padding, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = self.norm_module(num_features=dim_out, + epsilon=self.eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x = self._conv(x) + x = self._bn(x) + x = F.relu(x) + + x = F.max_pool3d(x=x, + kernel_size=[1, 3, 3], + stride=[1, 2, 2], + padding=[0, 1, 1], + data_format="NCDHW") + return x + + +class VideoModelStem(paddle.nn.Layer): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for slow and fast pathways. 
+ """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. + stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + eps (float): epsilon for batch norm. + """ + super(VideoModelStem, self).__init__() + + assert (len({ + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + }) == 1), "Input pathway dimensions are not consistent." + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + for pathway in range(len(dim_in)): + stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway], + self.kernel[pathway], self.stride[pathway], + self.padding[pathway], self.eps, + self.norm_module) + self.add_sublayer("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert (len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + x[pathway] = m(x[pathway]) + + return x + + +class FuseFastToSlow(paddle.nn.Layer): + """ + Fuses the information from the Fast pathway to the Slow pathway. Given the + tensors from Slow pathway and Fast pathway, fuse information from Fast to + Slow, then return the fused tensors from Slow and Fast pathway in order. + """ + def __init__(self, + dim_in, + fusion_conv_channel_ratio, + fusion_kernel, + alpha, + fuse_bn_relu=1, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimension of the input. + fusion_conv_channel_ratio (int): channel ratio for the convolution + used to fuse from Fast pathway to Slow pathway. + fusion_kernel (int): kernel size of the convolution used to fuse + from Fast pathway to Slow pathway. + alpha (int): the frame rate ratio between the Fast and Slow pathway. + eps (float): epsilon for batch norm. + """ + super(FuseFastToSlow, self).__init__() + self.fuse_bn_relu = fuse_bn_relu + fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self._conv_f2s = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_in * fusion_conv_channel_ratio, + kernel_size=[fusion_kernel, 1, 1], + stride=[alpha, 1, 1], + padding=[fusion_kernel // 2, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio, + epsilon=eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x_s = x[0] + x_f = x[1] + fuse = self._conv_f2s(x_f) + # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve. 
+ if self.fuse_bn_relu: + fuse = self._bn(fuse) + fuse = F.relu(fuse) + x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None) + + return [x_s_fuse, x_f] + + +@BACKBONES.register() +class ResNetSlowFast_MRI(paddle.nn.Layer): + """ + SlowFast model builder for SlowFast network. + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__( + self, + alpha, + beta, + bn_norm_type="batchnorm", + bn_num_splits=1, + num_pathways=2, + depth=50, + num_groups=1, + input_channel_num=[1, 1], + width_per_group=64, + fusion_conv_channel_ratio=2, + fusion_kernel_sz=7, #5? + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + fuse_bn_relu=1, + spatial_strides=[[1, 1], [2, 2], [2, 2], [2, 2]], + use_pool_af_s2=1, + ): + """ + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + super(ResNetSlowFast_MRI, self).__init__() + + self.alpha = alpha #8 + self.beta = beta #8 + self.norm_module = get_norm(bn_norm_type, bn_num_splits) + self.num_pathways = num_pathways + self.depth = depth + self.num_groups = num_groups + self.input_channel_num = input_channel_num + self.width_per_group = width_per_group + self.fusion_conv_channel_ratio = fusion_conv_channel_ratio + self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement + self.pool_size_ratio = pool_size_ratio + self.fuse_bn_relu = fuse_bn_relu + self.spatial_strides = spatial_strides + self.use_pool_af_s2 = use_pool_af_s2 + self._construct_network() + + def _construct_network(self): + """ + Builds a SlowFast model. + The first pathway is the Slow pathway + and the second pathway is the Fast pathway. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + temp_kernel = [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], + ] # res5 temporal kernel for slow and fast pathway. 
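+        # Same construction as ResNetSlowFast above; the functional difference of
+        # this MRI variant is input_channel_num, which defaults to [1, 1] so both
+        # pathways consume single-channel (grayscale) volumes instead of RGB frames.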
+ + self.s1 = VideoModelStem( + dim_in=self.input_channel_num, + dim_out=[self.width_per_group, self.width_per_group // self.beta], + kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]], + stride=[[1, 2, 2]] * 2, + padding=[ + [temp_kernel[0][0][0] // 2, 3, 3], + [temp_kernel[0][1][0] // 2, 3, 3], + ], + norm_module=self.norm_module) + self.s1_fuse = FuseFastToSlow( + dim_in=self.width_per_group // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu) + + # ResNet backbone + MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)} + (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth] + + num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]] + spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]] + spatial_strides = self.spatial_strides + #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]] + #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment + + out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4 + dim_inner = self.width_per_group * self.num_groups #64 + + self.s2 = ResStage(dim_in=[ + self.width_per_group + self.width_per_group // out_dim_ratio, + self.width_per_group // self.beta, + ], + dim_out=[ + self.width_per_group * 4, + self.width_per_group * 4 // self.beta, + ], + dim_inner=[dim_inner, dim_inner // self.beta], + temp_kernel_sizes=temp_kernel[1], + stride=spatial_strides[0], + num_blocks=[d2] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[0], + dilation=spatial_dilations[0], + norm_module=self.norm_module) + + self.s2_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 4 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s3 = ResStage( + dim_in=[ + self.width_per_group * 4 + + self.width_per_group * 4 // out_dim_ratio, + self.width_per_group * 4 // self.beta, + ], + dim_out=[ + self.width_per_group * 8, + self.width_per_group * 8 // self.beta, + ], + dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta], + temp_kernel_sizes=temp_kernel[2], + stride=spatial_strides[1], + num_blocks=[d3] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[1], + dilation=spatial_dilations[1], + norm_module=self.norm_module, + ) + + self.s3_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 8 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s4 = ResStage( + dim_in=[ + self.width_per_group * 8 + + self.width_per_group * 8 // out_dim_ratio, + self.width_per_group * 8 // self.beta, + ], + dim_out=[ + self.width_per_group * 16, + self.width_per_group * 16 // self.beta, + ], + dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta], + temp_kernel_sizes=temp_kernel[3], + stride=spatial_strides[2], + num_blocks=[d4] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[2], + dilation=spatial_dilations[2], + norm_module=self.norm_module, + ) + + self.s4_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 16 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + 
fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s5 = ResStage( + dim_in=[ + self.width_per_group * 16 + + self.width_per_group * 16 // out_dim_ratio, + self.width_per_group * 16 // self.beta, + ], + dim_out=[ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ], + dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta], + temp_kernel_sizes=temp_kernel[4], + stride=spatial_strides[3], + num_blocks=[d5] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[3], + dilation=spatial_dilations[3], + norm_module=self.norm_module, + ) + + def init_weights(self): + pass + + def forward(self, x): + x = self.s1(x) #VideoModelStem + x = self.s1_fuse(x) #FuseFastToSlow + x = self.s2(x) #ResStage + x = self.s2_fuse(x) + + # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve. + if self.use_pool_af_s2: + for pathway in range(self.num_pathways): + x[pathway] = F.max_pool3d( + x=x[pathway], + kernel_size=self.pool_size_ratio[pathway], + stride=self.pool_size_ratio[pathway], + padding=[0, 0, 0], + data_format="NCDHW") + + x = self.s3(x) + x = self.s3_fuse(x) + x = self.s4(x) + x = self.s4_fuse(x) + x = self.s5(x) + return x diff --git a/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py b/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py new file mode 100644 index 000000000..70788ecf2 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet_tsm.py @@ -0,0 +1,353 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. 
+ + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(name=bn_name + "_offset", + regularizer=L2Decay(0.0)), + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + num_seg=8, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.data_format = data_format + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b", + data_format=data_format) + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c", + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format) + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + if paddle.is_compiled_with_custom_device('npu'): + x = inputs + seg_num = self.num_seg + shift_ratio = 1.0 / self.num_seg + + shape = x.shape #[N*T, C, H, W] + reshape_x = x.reshape( + (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W] + pad_x = F.pad(reshape_x, [ + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ]) #[N, T+2, C, H, W] + c1 = int(shape[1] * shift_ratio) + c2 = int(shape[1] * 2 * shift_ratio) + slice1 = pad_x[:, :seg_num, :c1, :, :] + slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :] + slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :] + concat_x = paddle.concat([slice1, slice2, slice3], + axis=2) #[N, T, C, H, W] + shifts = concat_x.reshape(shape) + else: + shifts = F.temporal_shift(inputs, + self.num_seg, + 1.0 / self.num_seg, + data_format=self.data_format) + + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None, + data_format="NCHW"): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="relu", + name=name + "_branch2a", + data_format=data_format, + ) + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b", + data_format=data_format, + ) 
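+        # NOTE: ConvBNLayer takes `kernel_size`, so the `filter_size` keyword used
+        # in this BasicBlock would raise a TypeError if the depth < 50 branch were
+        # instantiated (only ResNet-18/34 would hit this path).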
+ + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format, + ) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTSM(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, depth, num_seg=8, data_format="NCHW", pretrained=None): + super(ResNetTSM, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + self.data_format = data_format + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + self.conv = ConvBNLayer(in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1", + data_format=self.data_format) + self.pool2D_max = MaxPool2D( + kernel_size=3, + stride=2, + padding=1, + data_format=self.data_format, + ) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format, + )) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! 
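+        # In practice: a non-empty `pretrained` path is restored via load_ckpt();
+        # otherwise Conv2D sublayers get KaimingNormal init and BatchNorm2D
+        # sublayers get Constant(value=1), as implemented below.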
+ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. + + """ + #NOTE: (deprecated design) Already merge axis 0(batches) and axis 1(clips) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + #NOTE: As paddlepaddle to_static method need a "pure" model to trim. It means from + # 1. the phase of generating data[images, label] from dataloader + # to + # 2. last layer of a model, always is FC layer + + y = self.conv(inputs) + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py b/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py new file mode 100644 index 000000000..e814f0fda --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet_tsm_MRI.py @@ -0,0 +1,327 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import sys +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils.save_load import load_ckpt +from paddle.regularizer import L2Decay + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + is_tweaks_mode (bool): switch for tweaks. Default: False. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. + + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. 
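+        # (The ResNet-C/ResNet-D tweaks referenced in these comments come from
+        # "Bag of Tricks for Image Classification with Convolutional Neural
+        # Networks", He et al., 2019.)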
+ self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0))) + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + num_seg=8, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride= + 1, #ResNet-D 2/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. + is_tweaks_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.leaky_relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.leaky_relu(y) + return y + + +@BACKBONES.register() +class ResNetTSM_MRI(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, depth, num_seg=8, pretrained=None, in_channels=1): + super(ResNetTSM_MRI, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + self.in_channels = in_channels + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + #ResNet-C: use three 3x3 conv, replace, one 7x7 conv + self.conv1_1 = ConvBNLayer(in_channels=self.in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='leaky_relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_3") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % + (block, i), #same with PaddleClas, for loading pretrain + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py b/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py new file mode 100644 index 000000000..439a0eff8 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet_tsn_MRI.py @@ -0,0 +1,331 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from paddle.nn import Conv2D, BatchNorm +from paddle.nn import MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +__all__ = ["ResNetTSN_MRI"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + lr_mult=1.0, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights", + learning_rate=lr_mult), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + '_offset', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + 
act='relu', + lr_mult=lr_mult, + name=name + "_branch2b") + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + lr_mult=lr_mult, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + lr_mult=lr_mult, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTSN_MRI(nn.Layer): + """ResNetTweaksTSN backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, + layers=50, + pretrained=None, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + in_channels=1): + super(ResNetTSN_MRI, self).__init__() + + self.pretrained = pretrained + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + self.lr_mult_list = lr_mult_list + self.in_channels = in_channels + assert isinstance( + self.lr_mult_list, + (list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len( + self.lr_mult_list + ) == 5, "lr_mult_list length should should be 5 but got {}".format( + len(self.lr_mult_list)) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024 + ] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer(in_channels=self.in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + lr_mult=self.lr_mult_list[block + 1], + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock(in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name, + lr_mult=self.lr_mult_list[block + 1])) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be + initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + # XXX: check bias!!! check pretrained!!! 
+ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py b/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py new file mode 100644 index 000000000..089da4e65 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py @@ -0,0 +1,362 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import sys +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.regularizer import L2Decay + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils.save_load import load_ckpt + +# Download URL of pretrained model +# { +# "ResNet50_vd": +# "wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams", +# "ResNet101_vd": +# "https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams", +# "ResNet18_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", +# "ResNet34_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet34_vd_ssld_pretrained.pdparams", +# "ResNet152_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", +# "ResNet200_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +# } + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + is_tweaks_mode (bool): switch for tweaks. Default: False. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. 
+ self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0))) + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + num_seg=8, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + if paddle.is_compiled_with_custom_device('npu'): + x = inputs + seg_num = self.num_seg + shift_ratio = 1.0 / self.num_seg + + shape = x.shape #[N*T, C, H, W] + reshape_x = x.reshape( + (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W] + pad_x = F.pad(reshape_x, [ + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ]) #[N, T+2, C, H, W] + c1 = int(shape[1] * shift_ratio) + c2 = int(shape[1] * 2 * shift_ratio) + slice1 = pad_x[:, :seg_num, :c1, :, :] + slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :] + slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :] + concat_x = paddle.concat([slice1, slice2, slice3], + axis=2) #[N, T, C, H, W] + shifts = concat_x.reshape(shape) + else: + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.leaky_relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + num_seg=8, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.num_seg = num_seg + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + # add temporal shift module + shifts 
= F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + y = self.conv0(shifts) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.leaky_relu(y) + return y + + +@BACKBONES.register() +class ResNetTweaksTSM(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, depth, num_seg=8, pretrained=None): + super(ResNetTweaksTSM, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + #ResNet-C: use three 3x3 conv, replace, one 7x7 conv + self.conv1_1 = ConvBNLayer(in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='leaky_relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_3") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % + (block, i), #same with PaddleClas, for loading pretrain + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + in_channels = [64, 64, 128, 256] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + num_seg=self.num_seg, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. 
+ Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. + """ + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py b/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py new file mode 100644 index 000000000..36b33073f --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py @@ -0,0 +1,328 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from paddle.nn import Conv2D, BatchNorm +from paddle.nn import MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +__all__ = ["ResNetTweaksTSN"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + lr_mult=1.0, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights", + learning_rate=lr_mult), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + '_offset', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BottleneckBlock, self).__init__() + + 
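        # Standard bottleneck layout: 1x1 reduction, 3x3 conv carrying the stride, then a
        # 1x1 expansion to 4*out_channels; when a projection shortcut is needed it uses the
        # ResNet-D tweak (2x2 average pool + stride-1 1x1 conv) instead of a strided 1x1
        # conv, except for the first block of the first stage (if_first).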
self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2b") + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + lr_mult=lr_mult, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + lr_mult=lr_mult, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTweaksTSN(nn.Layer): + """ResNetTweaksTSN backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, + layers=50, + pretrained=None, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]): + super(ResNetTweaksTSN, self).__init__() + + self.pretrained = pretrained + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + self.lr_mult_list = lr_mult_list + assert isinstance( + self.lr_mult_list, + (list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len( + self.lr_mult_list + ) == 5, "lr_mult_list length should should be 5 but got {}".format( + len(self.lr_mult_list)) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024 + ] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer(in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + lr_mult=self.lr_mult_list[block + 1], + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock(in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name, + lr_mult=self.lr_mult_list[block + 1])) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be + initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + # XXX: check bias!!! check pretrained!!! 
+ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/docs/src/paddlevideo/modeling/backbones/resnext101.py b/docs/src/paddlevideo/modeling/backbones/resnext101.py new file mode 100644 index 000000000..4f1d553bd --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/resnext101.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +import paddle + + +class ConvBNLayer(paddle.nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + name=None, + data_format="NCDHW"): + super(ConvBNLayer, self).__init__() + self._conv = paddle.nn.Conv3D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.KaimingNormal( + fan_in=num_filters * filter_size * filter_size), name=name+'_weights'), + bias_attr=bias_attr, + data_format=data_format) + bn_name = "bn_" + name + self._batch_norm = paddle.nn.BatchNorm3D( + num_filters, + momentum=0.9, + epsilon=1e-05, + weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant( + 1.), name=bn_name + '_scale'), + bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant( + 0.), name=bn_name + '_offset'), + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +def _downsample_basic_block(self, x, planes, stride): + out = paddle.nn.functional.avg_pool3d(x, kernel_size=1, stride=stride) + shape = out.shape + zero_pads = paddle.zeros(shape=[shape[0], planes - shape[1], shape[2], shape[3], shape[4]], + dtype='float32') + out = paddle.concat(x=[out, zero_pads], axis=1) + + +class BottleneckBlock(paddle.nn.Layer): + expansion = 2 + + def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None, name=None): + super(BottleneckBlock, self).__init__() + + mid_planes = cardinality * int(planes / 32) + self.conv0 = ConvBNLayer( + inplanes, mid_planes, filter_size=1, bias_attr=False, name=name+'_branch2a') + self.conv1 = ConvBNLayer(mid_planes, mid_planes, filter_size=3, stride=stride, + padding=1, groups=cardinality, bias_attr=False, name=name+'_branch2b') + self.conv2 = ConvBNLayer(mid_planes, planes * self.expansion, + 
filter_size=1, bias_attr=False, name=name+'_branch2c') + self.downsample = downsample + self.stride = stride + self.relu = paddle.nn.ReLU() + + def forward(self, x): + residual = x + + out = self.conv0(x) + out = self.relu(out) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNeXt(paddle.nn.Layer): + def __init__(self, + block, + layers, + shortcut_type='B', + cardinality=32): + self.inplanes = 64 + super(ResNeXt, self).__init__() + self.conv = ConvBNLayer( + 3, + 64, + filter_size=7, + stride=(1, 2, 2), + padding=(3, 3, 3), + bias_attr=False, + name="res_conv1" + ) + self.relu = paddle.nn.ReLU() + self.maxpool = paddle.nn.MaxPool3D(kernel_size=(3, 3, 3), stride=2, padding=1) + self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, + cardinality, stride=1, name='layer1') + + self.layer2 = self._make_layer( + block, 256, layers[1], shortcut_type, cardinality, stride=2, name='layer2') + + self.layer3 = self._make_layer( + block, 512, layers[2], shortcut_type, cardinality, stride=2, name='layer3') + + self.layer4 = self._make_layer( + block, 1024, layers[3], shortcut_type, cardinality, stride=2, name='layer4') + self.avgpool = paddle.nn.AvgPool3D((2, 1, 1), stride=1, exclusive=False) + + def _make_layer(self, + block, + planes, + blocks, + shortcut_type, + cardinality, + stride=1, + name=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + if shortcut_type == 'A': + downsample = partial(self._downsample_basic_block, + planes=planes * block.expansion, + stride=stride) + else: + downsample = ConvBNLayer( + self.inplanes, + planes * block.expansion, + 1, + stride=stride, + bias_attr=False, + name=name+'downsample' + ) + layers = [] + layers.append( + block(self.inplanes, planes, cardinality, stride, downsample, name=name+'_downsample')) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, + cardinality, name=name+'_res_block'+str(i))) + + return paddle.nn.Sequential(*layers) + + def forward(self, x): + x = self.conv(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + + +def ResNext101(): + """Constructs a ResNext-101 model. + """ + model = ResNeXt(BottleneckBlock, [3, 4, 23, 3]) + return model diff --git a/docs/src/paddlevideo/modeling/backbones/stgcn.py b/docs/src/paddlevideo/modeling/backbones/stgcn.py new file mode 100644 index 000000000..40d9d0dda --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/stgcn.py @@ -0,0 +1,343 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
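# A minimal sketch of the graph aggregation used by the ST-GCN code below. It assumes
# Paddle >= 2.2, where paddle.einsum is available; the einsum() helper defined in this file
# computes the same result with transpose/reshape/matmul for older Paddle versions.
import paddle

def stgcn_aggregate_reference(x, A):
    # x: (N, K, C, T, V) partitioned features, A: (K, V, W) partitioned adjacency
    # returns (N, C, T, W): features aggregated over neighbours for each partition k
    return paddle.einsum('nkctv,kvw->nctw', x, A)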
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def zero(x): + return 0 + + +def iden(x): + return x + + +def einsum(x, A): + """paddle.einsum will be implemented in release/2.2. + """ + x = x.transpose((0, 2, 3, 1, 4)) + n, c, t, k, v = x.shape + k2, v2, w = A.shape + assert (k == k2 and v == v2), "Args of einsum not match!" + x = x.reshape((n, c, t, k * v)) + A = A.reshape((k * v, w)) + y = paddle.matmul(x, A) + return y + + +def get_hop_distance(num_node, edge, max_hop=1): + A = np.zeros((num_node, num_node)) + for i, j in edge: + A[j, i] = 1 + A[i, j] = 1 + + # compute hop steps + hop_dis = np.zeros((num_node, num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + +def normalize_digraph(A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + +class Graph(): + + def __init__(self, + layout='openpose', + strategy='uniform', + max_hop=1, + dilation=1): + self.max_hop = max_hop + self.dilation = dilation + + self.get_edge(layout) + self.hop_dis = get_hop_distance(self.num_node, + self.edge, + max_hop=max_hop) + self.get_adjacency(strategy) + + def __str__(self): + return self.A + + def get_edge(self, layout): + # edge is a list of [child, parent] paris + + if layout == 'fsd10': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_link = [(1, 8), (0, 1), (15, 0), (17, 15), (16, 0), + (18, 16), (5, 1), (6, 5), (7, 6), (2, 1), (3, 2), + (4, 3), (9, 8), (10, 9), (11, 10), (24, 11), + (22, 11), (23, 22), (12, 8), (13, 12), (14, 13), + (21, 14), (19, 14), (20, 19)] + self.edge = self_link + neighbor_link + self.center = 8 + elif layout == 'ntu-rgb+d': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + self.edge = self_link + neighbor_link + self.center = 21 - 1 + elif layout == 'coco_keypoint': + self.num_node = 17 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_1base = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), + (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12), + (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)] + neighbor_link = [(i, j) for (i, j) in neighbor_1base] + self.edge = self_link + neighbor_link + self.center = 11 + else: + raise ValueError("Do Not Exist This Layout.") + + def get_adjacency(self, strategy): + valid_hop = range(0, self.max_hop + 1, self.dilation) + adjacency = np.zeros((self.num_node, self.num_node)) + for hop in valid_hop: + adjacency[self.hop_dis == hop] = 1 + normalize_adjacency = normalize_digraph(adjacency) + + if strategy == 'spatial': + A = [] + for hop in valid_hop: + a_root = np.zeros((self.num_node, self.num_node)) + a_close = np.zeros((self.num_node, self.num_node)) + a_further = np.zeros((self.num_node, self.num_node)) + for i in range(self.num_node): + for j in range(self.num_node): + if self.hop_dis[j, i] == hop: + if self.hop_dis[j, 
self.center] == self.hop_dis[ + i, self.center]: + a_root[j, i] = normalize_adjacency[j, i] + elif self.hop_dis[j, self.center] > self.hop_dis[ + i, self.center]: + a_close[j, i] = normalize_adjacency[j, i] + else: + a_further[j, i] = normalize_adjacency[j, i] + if hop == 0: + A.append(a_root) + else: + A.append(a_root + a_close) + A.append(a_further) + A = np.stack(A) + self.A = A + else: + raise ValueError("Do Not Exist This Strategy") + + +class ConvTemporalGraphical(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + t_kernel_size=1, + t_stride=1, + t_padding=0, + t_dilation=1): + super().__init__() + + self.kernel_size = kernel_size + self.conv = nn.Conv2D(in_channels, + out_channels * kernel_size, + kernel_size=(t_kernel_size, 1), + padding=(t_padding, 0), + stride=(t_stride, 1), + dilation=(t_dilation, 1)) + + def forward(self, x, A): + assert A.shape[0] == self.kernel_size + + x = self.conv(x) + n, kc, t, v = x.shape + x = x.reshape((n, self.kernel_size, kc // self.kernel_size, t, v)) + x = einsum(x, A) + + return x, A + + +class st_gcn_block(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dropout=0, + residual=True): + super(st_gcn_block, self).__init__() + + assert len(kernel_size) == 2 + assert kernel_size[0] % 2 == 1 + padding = ((kernel_size[0] - 1) // 2, 0) + + self.gcn = ConvTemporalGraphical(in_channels, out_channels, + kernel_size[1]) + + self.tcn = nn.Sequential( + nn.BatchNorm2D(out_channels), + nn.ReLU(), + nn.Conv2D( + out_channels, + out_channels, + (kernel_size[0], 1), + (stride, 1), + padding, + ), + nn.BatchNorm2D(out_channels), + nn.Dropout(dropout), + ) + + if not residual: + self.residual = zero + + elif (in_channels == out_channels) and (stride == 1): + self.residual = iden + + else: + self.residual = nn.Sequential( + nn.Conv2D(in_channels, + out_channels, + kernel_size=1, + stride=(stride, 1)), + nn.BatchNorm2D(out_channels), + ) + + self.relu = nn.ReLU() + + def forward(self, x, A): + res = self.residual(x) + x, A = self.gcn(x, A) + x = self.tcn(x) + res + return self.relu(x), A + + +@BACKBONES.register() +class STGCN(nn.Layer): + """ + ST-GCN model from: + `"Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition" `_ + Args: + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2. + edge_importance_weighting: bool, whether to use edge attention. Default True. + data_bn: bool, whether to use data BatchNorm. Default True. 
+ """ + + def __init__(self, + in_channels=2, + edge_importance_weighting=True, + data_bn=True, + layout='fsd10', + strategy='spatial', + **kwargs): + super(STGCN, self).__init__() + self.data_bn = data_bn + # load graph + self.graph = Graph( + layout=layout, + strategy=strategy, + ) + A = paddle.to_tensor(self.graph.A, dtype='float32') + self.register_buffer('A', A) + + # build networks + spatial_kernel_size = A.shape[0] + temporal_kernel_size = 9 + kernel_size = (temporal_kernel_size, spatial_kernel_size) + self.data_bn = nn.BatchNorm1D(in_channels * + A.shape[1]) if self.data_bn else iden + kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'} + self.st_gcn_networks = nn.LayerList(( + st_gcn_block(in_channels, + 64, + kernel_size, + 1, + residual=False, + **kwargs0), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 128, kernel_size, 2, **kwargs), + st_gcn_block(128, 128, kernel_size, 1, **kwargs), + st_gcn_block(128, 128, kernel_size, 1, **kwargs), + st_gcn_block(128, 256, kernel_size, 2, **kwargs), + st_gcn_block(256, 256, kernel_size, 1, **kwargs), + st_gcn_block(256, 256, kernel_size, 1, **kwargs), + )) + + # initialize parameters for edge importance weighting + if edge_importance_weighting: + self.edge_importance = nn.ParameterList([ + self.create_parameter( + shape=self.A.shape, + default_initializer=nn.initializer.Constant(1)) + for i in self.st_gcn_networks + ]) + else: + self.edge_importance = [1] * len(self.st_gcn_networks) + + self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'Normal', mean=0.0, std=0.02) + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Normal', mean=1.0, std=0.02) + elif isinstance(layer, nn.BatchNorm1D): + weight_init_(layer, 'Normal', mean=1.0, std=0.02) + + def forward(self, x): + # data normalization + N, C, T, V, M = x.shape + x = x.transpose((0, 4, 3, 1, 2)) # N, M, V, C, T + x = x.reshape((N * M, V * C, T)) + if self.data_bn: + x.stop_gradient = False + x = self.data_bn(x) + x = x.reshape((N, M, V, C, T)) + x = x.transpose((0, 1, 3, 4, 2)) # N, M, C, T, V + x = x.reshape((N * M, C, T, V)) + + # forward + for gcn, importance in zip(self.st_gcn_networks, self.edge_importance): + x, _ = gcn(x, paddle.multiply(self.A, importance)) + + x = self.pool(x) # NM,C,T,V --> NM,C,1,1 + C = x.shape[1] + x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1 + return x diff --git a/docs/src/paddlevideo/modeling/backbones/swin_transformer.py b/docs/src/paddlevideo/modeling/backbones/swin_transformer.py new file mode 100644 index 000000000..2bbc6b364 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/swin_transformer.py @@ -0,0 +1,742 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import lru_cache, reduce +from operator import mul + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + """ Multilayer perceptron.""" + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """window_partition + Args: + x (Tensor): x.shape = [B, D, H, W, C] + window_size (tuple[int]): window_size + + Returns: + Tensor: (B*num_windows, window_size*window_size, C) + """ + B, D, H, W, C = x.shape + x = x.reshape([ + B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C + ]) + windows = x.transpose([0, 1, 3, 5, 2, 4, 6, + 7]).reshape([-1, reduce(mul, window_size), C]) + return windows + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +def window_reverse(windows, window_size, B, D, H, W): + """ + Args: + windows: (B*num_windows, window_size, window_size, C) + window_size (tuple[int]): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, D, H, W, C) + """ + x = windows.reshape([ + B, D // window_size[0], H // window_size[1], W // window_size[2], + window_size[0], window_size[1], window_size[2], -1 + ]) + x = x.transpose([0, 1, 4, 2, 5, 3, 6, 7]).reshape([B, D, H, W, -1]) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +class 
WindowAttention3D(nn.Layer): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The temporal length, height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1) * + (2 * window_size[2] - 1), num_heads), + default_initializer=zeros_, + ) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + # get pair-wise relative position index for each token inside the window + coords_d = paddle.arange(self.window_size[0]) + coords_h = paddle.arange(self.window_size[1]) + coords_w = paddle.arange(self.window_size[2]) + coords = paddle.stack(paddle.meshgrid(coords_d, coords_h, + coords_w)) # 3, Wd, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 3, Wd*Wh*Ww + + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze(axis=1) # 3, Wd*Wh*Ww, Wd*Wh*Ww + + # relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0 + ]) # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] - + 1) * (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum( + axis=-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ Forward function. 
+ Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, N, N) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose([0, 1, 3, 2]) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape([-1])].reshape( + [N, N, -1]) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0).astype(attn.dtype) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock3D(nn.Layer): + """ Swin Transformer Block. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + def __init__(self, + dim, + num_heads, + window_size=(2, 7, 7), + shift_size=(0, 0, 0), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_checkpoint=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + # self.use_checkpoint=use_checkpoint + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], "shift_size must in 0-window_size" + assert 0 <= self.shift_size[1] < self.window_size[ + 1], "shift_size must in 0-window_size" + assert 0 <= self.shift_size[2] < self.window_size[ + 2], "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention3D(dim, + window_size=self.window_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward_part1(self, x, mask_matrix): + B = paddle.shape(x)[0] + _, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1), + data_format='NDHWC') + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = paddle.roll(x, + shifts=(-shift_size[0], -shift_size[1], + -shift_size[2]), + axis=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.reshape([-1, *(window_size + (C, ))]) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = paddle.roll(shifted_x, + shifts=(shift_size[0], shift_size[1], + shift_size[2]), + axis=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :] + return x + + def forward_part2(self, x): + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). + mask_matrix: Attention mask for cyclic shift. + """ + + shortcut = x + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x).astype(shortcut.dtype) + x = x + self.forward_part2(x).astype(x.dtype) + + return x + + +class PatchMerging(nn.Layer): + """ Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). 
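+ Returns: merged feature of size (B, D, H/2, W/2, 2*C) (H and W are padded up to even before merging).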
+ """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, W % 2, 0, H % 2, 0, 0), data_format='NDHWC') + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +# cache each stage results +@lru_cache() +def compute_mask(D, H, W, window_size, shift_size): + img_mask = paddle.zeros((1, D, H, W, 1)) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + # attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + huns = -100.0 * paddle.ones_like(attn_mask) + attn_mask = huns * (attn_mask != 0).astype("float32") + return attn_mask + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + """ + def __init__(self, + dim, + depth, + num_heads, + window_size=(1, 7, 7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock3D( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_checkpoint=use_checkpoint, + ) for i in range(depth) + ]) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, C, D, H, W). 
+ """ + # calculate attention mask for SW-MSA + B = paddle.shape(x)[0] + _, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + # x = rearrange(x, 'b c d h w -> b d h w c') + x = x.transpose([0, 2, 3, 4, 1]) + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size) + for blk in self.blocks: + x = blk(x, attn_mask) + x = x.reshape([B, D, H, W, C]) + + if self.downsample is not None: + x = self.downsample(x) + x = x.transpose([0, 4, 1, 2, 3]) + return x + + +class PatchEmbed3D(nn.Layer): + """ Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + def __init__(self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3D(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + _, _, D, H, W = x.shape + if W % self.patch_size[2] != 0: + x = F.pad( + x, (0, self.patch_size[2] - W % self.patch_size[2], 0, 0, 0, 0), + data_format='NCDHW') + if H % self.patch_size[1] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1], 0, 0), + data_format='NCDHW') + if D % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]), + data_format='NCDHW') + + x = self.proj(x) # B C D Wh Ww + if self.norm is not None: + D, Wh, Ww = x.shape[2], x.shape[3], x.shape[4] + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, D, Wh, Ww]) + + return x + + +@BACKBONES.register() +class SwinTransformer3D(nn.Layer): + """ Swin Transformer backbone. + A Paddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + patch_size (int | tuple(int)): Patch size. Default: (4,4,4). + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer: Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. 
+ """ + def __init__(self, + pretrained=None, + patch_size=(4, 4, 4), + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=(2, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=False, + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrained = pretrained + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed3D( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if i_layer < self.num_layers - 1 else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + + # add a norm layer for each output + self.norm = norm_layer(self.num_features) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.stop_gradient = True + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.stop_gradient = True + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def init_weights(self): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + """First init model's weight""" + + self.apply(self._init_fn) + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + pass + else: + raise NotImplementedError + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = x.transpose([0, 2, 3, 4, 1]) + x = self.norm(x) + x = x.transpose([0, 4, 1, 2, 3]) + return x + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/docs/src/paddlevideo/modeling/backbones/toshift_vit.py b/docs/src/paddlevideo/modeling/backbones/toshift_vit.py new file mode 100644 index 000000000..a4819968e --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/toshift_vit.py @@ -0,0 +1,413 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_segments = 8, + fold_div = 4): + #attention_type='divided_space_time', + super().__init__() + self.n_seg = num_segments #ckk + self.foldP_div = fold_div #ckk + #self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + # Temporal Attention Parameters + ''' + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.temporal_fc = nn.Linear(dim, dim) + ''' + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + # token_shift + def shuift_tk(self, x): + t = self.n_seg + bt, n, c = x.shape + b = bt // t + x = x.reshape([b, t, n, c]) #B T N C + + fold = c // self.foldP_div + out = paddle.zeros_like(x) + out.stop_gradient = True + # print("#### fold ", fold) + # print(out.shape) + # print(x[:, 1:, 0, :fold].unsqueeze(2).shape) + # print(out[:, :-1, 0:1, :fold].shape) + # exit(0) + out[:, :-1, 0, :fold] = x[:, 1:, 0, :fold] # shift left + out[:, 1:, 0, fold:2*fold] = x[:,:-1:, 0, fold:2*fold] + + out[:, :, 1:, :2*fold] = x[:, :, 1:, :2*fold] + out[:, :, :, 2*fold:] = x[:, :, :, 2*fold:] + + return out.reshape([bt, n, c]) + + def forward(self, x): + x = self.shuift_tk(x) + x = x + self.drop_path(self.attn(self.norm1(x))) + x = self.shuift_tk(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
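+ # fold time into batch: [B, C, T, H, W] -> [B*T, C, H, W] before the Conv2D patch projection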
+ x = x.transpose((0, 2, 1, 3, 4)) + x = x.reshape([-1, C, H, W]) + x = self.proj(x) + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) + return x, T, W + + +@BACKBONES.register() +class TokenShiftVisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + **args): + + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), + default_initializer=zeros_) + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, + embed_dim), + default_initializer=zeros_) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter(shape=(1, num_seg, + embed_dim), + default_initializer=zeros_) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + num_segments= self.num_seg + ) for i in range(depth) + #attention_type=self.attention_type + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + + """Second, if provide pretrained ckpt, load it""" + + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + 
new_pos_embed = F.interpolate(other_pos_embed, + size=(H, W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = x + new_pos_embed + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + # Attention blocks + for blk in self.blocks: + x = blk(x) + + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] -> [B*T, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x \ No newline at end of file diff --git a/docs/src/paddlevideo/modeling/backbones/transnetv2.py b/docs/src/paddlevideo/modeling/backbones/transnetv2.py new file mode 100644 index 000000000..60603e2c9 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/transnetv2.py @@ -0,0 +1,582 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as functional +import random +from paddle import ParamAttr + +from ..registry import BACKBONES + + +class OctConv3D(nn.Layer): + def __init__(self, in_filters, filters, kernel_size=3, dilation_rate=(1, 1, 1), alpha=0.25, + use_bias=True, kernel_initializer=nn.initializer.KaimingNormal()): + super(OctConv3D, self).__init__() + + self.low_channels = int(filters * alpha) + self.high_channels = filters - self.low_channels + + self.high_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.high_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=False) + self.low_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=False) + self.low_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.upsampler = nn.Upsample(size=(1, 2, 2), data_format='NCDHW') + self.downsampler = nn.AvgPool3D(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0, 1, 1)) + + @staticmethod + def pad_to(tensor, target_shape): + shape = tensor.shape + padding = [[0, tar - curr] for curr, tar in zip(shape, target_shape)] + return functional.pad(tensor, padding, "CONSTANT", data_format='NCDHW') + + @staticmethod + def crop_to(tensor, target_width, target_height): + return tensor[:, :, :target_height, :target_width] + + def 
forward(self, inputs): + low_inputs, high_inputs = inputs + + high_to_high = self.high_to_high(high_inputs) + high_to_low = self.high_to_low(self.downsampler(high_inputs)) + + low_to_high = self.upsampler(self.low_to_high(low_inputs)) + low_to_low = self.low_to_low(low_inputs) + + high_output = high_to_high[:, :, :, :low_to_high.shape[3], :low_to_high.shape[4]] + low_to_high + low_output = low_to_low + high_to_low[:, :, :, :low_to_low.shape[3], :low_to_low.shape[4]] + + return low_output, high_output + + +class Conv3DConfigurable(nn.Layer): + def __init__(self, + in_filters, + filters, + dilation_rate, + separable=True, + octave=False, + use_bias=True): + super(Conv3DConfigurable, self).__init__() + assert not (separable and octave) + + if separable: + conv1 = nn.Conv3D(in_filters, 2 * filters, kernel_size=(1, 3, 3), + dilation=(1, 1, 1), padding=(0, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=False) + conv2 = nn.Conv3D(2 * filters, filters, kernel_size=(3, 1, 1), + dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 0, 0), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.layers = nn.LayerList([conv1, conv2]) + elif octave: + conv = OctConv3D(in_filters, filters, kernel_size=3, dilation_rate=(dilation_rate, 1, 1), + use_bias=use_bias, + kernel_initializer=nn.initializer.KaimingNormal()) + self.layers = [conv] + else: + conv = nn.Conv3D(in_filters, filters, kernel_size=3, + dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.layers = nn.LayerList([conv]) + + def forward(self, inputs): + x = inputs + for layer in self.layers: + x = layer(x) + return x + + +class DilatedDCNNV2(nn.Layer): + def __init__(self, + in_filters, + filters, + batch_norm=True, + activation=None, + octave_conv=False): + super(DilatedDCNNV2, self).__init__() + assert not (octave_conv and batch_norm) + + self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=not batch_norm, octave=octave_conv) + self.octave = octave_conv + + self.bn = nn.BatchNorm3D(filters * 4, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) if batch_norm else None + self.activation = activation + + def forward(self, inputs): + conv1 = self.Conv3D_1(inputs) + conv2 = self.Conv3D_2(inputs) + conv3 = self.Conv3D_4(inputs) + conv4 = self.Conv3D_8(inputs) + + # shape of convi[j]/convi is [B, 3, T, H, W], concat in channel dimension + if self.octave: + x = [paddle.concat([conv1[0], conv2[0], conv3[0], conv4[0]], axis=1), + paddle.concat([conv1[1], conv2[1], conv3[1], conv4[1]], axis=1)] + else: + x = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + + if self.bn is not None: + x = self.bn(x) + + if self.activation is not None: + if self.octave: + x = [self.activation(x[0]), self.activation(x[1])] + else: + x = self.activation(x) + return x + + +class 
StackedDDCNNV2(nn.Layer): + def __init__(self, + in_filters, + n_blocks, + filters, + shortcut=True, + use_octave_conv=False, + pool_type="avg", + stochastic_depth_drop_prob=0.0): + super(StackedDDCNNV2, self).__init__() + assert pool_type == "max" or pool_type == "avg" + if use_octave_conv and pool_type == "max": + print("WARN: Octave convolution was designed with average pooling, not max pooling.") + + self.shortcut = shortcut + self.DDCNN = nn.LayerList([ + DilatedDCNNV2(in_filters if i == 1 else filters * 4, filters, octave_conv=use_octave_conv, + activation=functional.relu if i != n_blocks else None) for i in range(1, n_blocks + 1) + ]) + self.pool = nn.MaxPool3D(kernel_size=(1, 2, 2)) if pool_type == "max" else nn.AvgPool3D(kernel_size=(1, 2, 2)) + self.octave = use_octave_conv + self.stochastic_depth_drop_prob = stochastic_depth_drop_prob + + def forward(self, inputs): + x = inputs + shortcut = None + + if self.octave: + x = [self.pool(x), x] + for block in self.DDCNN: + x = block(x) + if shortcut is None: + shortcut = x + # shape of x[i] is [B, 3, T, H, W], concat in channel dimension + if self.octave: + x = paddle.concat([x[0], self.pool(x[1])], axis=1) + + x = functional.relu(x) + + if self.shortcut is not None: + if self.stochastic_depth_drop_prob != 0.: + if self.training: + if random.random() < self.stochastic_depth_drop_prob: + x = shortcut + else: + x = x + shortcut + else: + x = (1 - self.stochastic_depth_drop_prob) * x + shortcut + else: + x += shortcut + + if not self.octave: + x = self.pool(x) + return x + + +class ResNetBlock(nn.Layer): + def __init__(self, in_filters, filters, strides=(1, 1)): + super(ResNetBlock, self).__init__() + + self.conv1 = nn.Conv2D(in_filters, filters, kernel_size=(3, 3), stride=strides, padding=(1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(filters, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + self.conv2 = nn.Conv2D(filters, filters, kernel_size=(3, 3), padding=(1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn2 = nn.BatchNorm2D(filters, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.bn1(x) + x = functional.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + + shortcut = inputs + x += shortcut + + return functional.relu(x) + + +class ResNetFeatures(nn.Layer): + def __init__(self, in_filters=3, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + super(ResNetFeatures, self).__init__() + self.conv1 = nn.Conv2D(in_channels=in_filters, out_channels=64, kernel_size=(7, 7), + stride=(2, 2), padding=(3, 3), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(num_features=64, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.max_pool = nn.MaxPool2D(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + + self.layer2a = ResNetBlock(64, 64) + self.layer2b = ResNetBlock(64, 64) + + self.mean = paddle.to_tensor(mean) + self.std = paddle.to_tensor(std) + + def forward(self, inputs): + shape = inputs.shape + x = paddle.reshape(inputs, [shape[0] * shape[2], shape[1], 
shape[3], shape[4]]) + x = (x - self.mean) / self.std + + x = self.conv1(x) + x = self.bn1(x) + x = functional.relu(x) + x = self.max_pool(x) + x = self.layer2a(x) + x = self.layer2b(x) + + new_shape = x.shape + x = paddle.reshape(x, [shape[0], new_shape[1], shape[2], new_shape[2], new_shape[3]]) + return x + + +class FrameSimilarity(nn.Layer): + def __init__(self, + in_filters, + similarity_dim=128, + lookup_window=101, + output_dim=128, + stop_gradient=False, + use_bias=False): + super(FrameSimilarity, self).__init__() + self.projection = nn.Linear(in_filters, similarity_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=use_bias) + self.fc = nn.Linear(lookup_window, output_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + self.lookup_window = lookup_window + self.stop_gradient = stop_gradient + assert lookup_window % 2 == 1, "`lookup_window` must be odd integer" + + def forward(self, inputs): + x = paddle.concat([paddle.mean(x, axis=[3, 4]) for x in inputs], axis=1) + x = paddle.transpose(x, (0, 2, 1)) + + if self.stop_gradient: + x = x.stop_gradient + + x = self.projection(x) + x = functional.normalize(x, p=2, axis=2) + batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0] + time_window = x.shape[1] + similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window] + + similarities_padded = functional.pad(similarities, + [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2], + data_format='NCL') + + batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1]) + batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window]) + time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1]) + time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window]) + lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window]) + lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices + indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1) + similarities = paddle.gather_nd(similarities_padded, indices) + return functional.relu(self.fc(similarities)) + + +class ConvexCombinationRegularization(nn.Layer): + def __init__(self, in_filters, filters=32, delta_scale=10., loss_weight=0.01): + super(ConvexCombinationRegularization, self).__init__() + + self.projection = nn.Conv3D(in_filters, filters, kernel_size=1, dilation=1, padding=(0, 0, 0), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.features = nn.Conv3D((filters * 3), filters * 2, + kernel_size=(3, 3, 3), dilation=1, padding=(1, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.dense = nn.Linear(64, 1, weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), bias_attr=True) + self.loss = nn.SmoothL1Loss(reduction='none') + self.delta_scale = delta_scale + self.loss_weight = loss_weight + + def forward(self, image_inputs, feature_inputs): + x = feature_inputs + x = self.projection(x) + x = functional.relu(x) + batch_size = x.shape[0] + window_size = x.shape[2] + first_frame = paddle.tile(x[:, :, :1], [1, 1, window_size, 1, 1]) + last_frame = paddle.tile(x[:, :, -1:], [1, 1, window_size, 1, 1]) 
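+ # concatenate current features with the tiled first- and last-frame features along the channel axis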
+ x = paddle.concat([x, first_frame, last_frame], 1) + x = self.features(x) + x = functional.relu(x) + x = paddle.mean(x, axis=[3, 4]) + x = paddle.transpose(x, (0, 2, 1)) + alpha = self.dense(x) + alpha = paddle.transpose(alpha, (0, 2, 1)) + + first_img = paddle.tile(image_inputs[:, :, :1], [1, 1, window_size, 1, 1]) + last_img = paddle.tile(image_inputs[:, :, -1:], [1, 1, window_size, 1, 1]) + + alpha_ = functional.sigmoid(alpha) + alpha_ = paddle.reshape(alpha_, [batch_size, 1, window_size, 1, 1]) + predictions_ = (alpha_ * first_img + (1 - alpha_) * last_img) + loss_ = self.loss(label=image_inputs / self.delta_scale, input=predictions_ / self.delta_scale) + loss_ = self.loss_weight * paddle.mean(loss_) + return alpha, loss_ + + +class ColorHistograms(nn.Layer): + def __init__(self, + lookup_window=101, + output_dim=None): + super(ColorHistograms, self).__init__() + + self.fc = nn.Linear(lookup_window, output_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.))) if output_dim is not None else None + self.lookup_window = lookup_window + assert lookup_window % 2 == 1, "`lookup_window` must be odd integer" + + def compute_color_histograms(self, frames): + frames = frames.astype('int32') + + def get_bin(frames): + # returns 0 .. 511 + R, G, B = frames[:, :, 0], frames[:, :, 1], frames[:, :, 2] + R, G, B = R // 32, G // 32, B // 32 + return (R * 64) + (G * 8) + B + + batch_size = paddle.slice(frames.shape, starts=[0], ends=[1], axes=[0]) if frames.shape[0] == -1 else frames.shape[0] + time_window, height, width, no_channels = frames.shape[1:] + + assert no_channels == 3 or no_channels == 6 + if no_channels == 3: + frames_flatten = frames.reshape([-1, height * width, 3]) + else: + frames_flatten = frames.reshape([-1, height * width * 2, 3]) + + binned_values = get_bin(frames_flatten) + + frame_bin_prefix = (paddle.arange(0, batch_size * time_window) * 512).reshape([-1, 1]) + binned_values = (binned_values + frame_bin_prefix).reshape([-1, 1]) + histograms = paddle.zeros_like(frame_bin_prefix, dtype='int32').tile([512]).reshape([-1]) + histograms = histograms.scatter_nd_add(binned_values, paddle.ones_like(binned_values, dtype='int32').reshape([-1])) + histograms = histograms.reshape([batch_size, time_window, 512]).astype('float32') + histograms_normalized = functional.normalize(histograms, p=2, axis=2) + return histograms_normalized + + def forward(self, inputs): + x = self.compute_color_histograms(inputs) + batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0] + time_window = x.shape[1] + similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window] + similarities_padded = functional.pad(similarities, + [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2], + data_format='NCL') + + batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1]) + batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window]) + time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1]) + time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window]) + lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window]) + lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices + + indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1) + similarities = paddle.gather_nd(similarities_padded, indices) + + if 
self.fc is not None: + return functional.relu(self.fc(similarities)) + return similarities + + +@BACKBONES.register() +class TransNetV2(nn.Layer): + """TransNetV2 model from + `"TransNet V2: An effective deep network architecture for fast shot transition detection" `_ + """ + def __init__(self, + F=16, L=3, S=2, D=1024, + use_many_hot_targets=True, + use_frame_similarity=True, + use_color_histograms=True, + use_mean_pooling=False, + dropout_rate=0.5, + use_convex_comb_reg=False, + use_resnet_features=False, + use_resnet_like_top=False, + frame_similarity_on_last_layer=False, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + super(TransNetV2, self).__init__() + + self.mean = np.array(mean, np.float32).reshape([1, 3, 1, 1]) * 255 + self.std = np.array(std, np.float32).reshape([1, 3, 1, 1]) * 255 + + self.use_resnet_features = use_resnet_features + self.resnet_layers = ResNetFeatures(in_filters=3, mean=self.mean, std=self.std) if self.use_resnet_features else None + self.resnet_like_top = use_resnet_like_top + if self.resnet_like_top: + self.resnet_like_top_conv = nn.Conv3D(64 if self.use_resnet_features else 3, 32, kernel_size=(3, 7, 7), + stride=(1, 2, 2), + padding=(1, 3, 3), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.resnet_like_top_bn = nn.BatchNorm3D(32, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr( + initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.resnet_like_top_max_pool = nn.MaxPool3D(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + if self.resnet_like_top: + in_filters = 32 + elif self.use_resnet_features: + in_filters = 64 + else: + in_filters = 3 + self.SDDCNN = nn.LayerList( + [StackedDDCNNV2(in_filters=in_filters, n_blocks=S, filters=F, + stochastic_depth_drop_prob=0.)] + + [StackedDDCNNV2(in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2 ** i) for i in range(1, L)] + ) + + self.frame_sim_layer = FrameSimilarity( + sum([(F * 2 ** i) * 4 for i in range(L)]), lookup_window=101, output_dim=128, similarity_dim=128, + use_bias=True + ) if use_frame_similarity else None + self.color_hist_layer = ColorHistograms( + lookup_window=101, output_dim=128 + ) if use_color_histograms else None + + self.dropout = nn.Dropout(dropout_rate) if dropout_rate is not None else None + + output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6 # 3x6 for spatial dimensions + if use_frame_similarity: output_dim += 128 + if use_color_histograms: output_dim += 128 + + self.use_mean_pooling = use_mean_pooling + + self.has_downsample = False + if self.use_resnet_features or self.resnet_like_top or self.use_mean_pooling: + self.has_downsample = True + self.fc1 = nn.Linear(512 if self.has_downsample else output_dim, D, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.frame_similarity_on_last_layer = frame_similarity_on_last_layer + self.cls_layer1 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.cls_layer2 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) if use_many_hot_targets else None + + self.convex_comb_reg = 
ConvexCombinationRegularization( + in_filters=(F * 2 ** (L - 1) * 4)) if use_convex_comb_reg else None + + def forward(self, inputs): + assert list(inputs.shape[2:]) == [27, 48, 3] and inputs.dtype == paddle.float32, \ + "incorrect input type and/or shape" + out_dict = {} + + # shape [B, T, H, W, 3] to shape [B, 3, T, H, W] + x = inputs.transpose([0, 4, 1, 2, 3]) + if self.use_resnet_features: + x = self.resnet_layers(x) + else: + x = x / 255. + inputs = inputs.clip(min=0).astype('uint8') + if self.resnet_like_top: + x = self.resnet_like_top_conv(x) + x = self.resnet_like_top_bn(x) + x = self.resnet_like_top_max_pool(x) + block_features = [] + for block in self.SDDCNN: + x = block(x) + block_features.append(x) + if self.convex_comb_reg is not None: + out_dict["alphas"], out_dict["comb_reg_loss"] = self.convex_comb_reg(inputs.transpose([0, 4, 1, 2, 3]), x) + if self.use_mean_pooling: + x = paddle.mean(x, axis=[3, 4]) + x = x.transpose([0, 2, 1]) + else: + x = x.transpose([0, 2, 3, 4, 1]) + x = x.reshape([x.shape[0], x.shape[1], x.shape[2]*x.shape[3]*x.shape[4]]) + if self.frame_sim_layer is not None: + x = paddle.concat([self.frame_sim_layer(block_features), x], 2) + if self.color_hist_layer is not None: + x = paddle.concat([self.color_hist_layer(inputs), x], 2) + x = self.fc1(x) + x = functional.relu(x) + if self.dropout is not None: + x = self.dropout(x) + if self.frame_sim_layer is not None and self.frame_similarity_on_last_layer: + x = paddle.concat([self.frame_sim_layer(block_features), x], 2) + one_hot = self.cls_layer1(x) + if self.cls_layer2 is not None: + out_dict["many_hot"] = self.cls_layer2(x) + + if len(out_dict) > 0: + return one_hot, out_dict + + return one_hot + diff --git a/docs/src/paddlevideo/modeling/backbones/vit.py b/docs/src/paddlevideo/modeling/backbones/vit.py new file mode 100644 index 000000000..84f434f93 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/vit.py @@ -0,0 +1,465 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + attention_type='divided_space_time'): + + super().__init__() + self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + # Temporal Attention Parameters + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.temporal_fc = nn.Linear(dim, dim) + + # NOTE: drop path for 
stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x, B, T, W): + num_spatial_tokens = (x.shape[1] - 1) // T + H = num_spatial_tokens // W + if self.attention_type in ['space_only', 'joint_space_time']: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + elif self.attention_type == 'divided_space_time': + ########## Temporal ########## + xt = x[:, 1:, :] + _, _, _, _t, _m = B, H, W, T, xt.shape[-1] + xt = xt.reshape([-1, _t, _m]) + + res_temporal = self.drop_path( + self.temporal_attn(self.temporal_norm1(xt))) + + _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1] + res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m]) + + res_temporal = self.temporal_fc(res_temporal) + xt = x[:, 1:, :] + res_temporal + + ########## Spatial ########## + init_cls_token = x[:, 0, :].unsqueeze(1) + cls_token = init_cls_token.tile((1, T, 1)) + _b, _t, _m = cls_token.shape + cls_token = cls_token.reshape([-1, _m]).unsqueeze(1) + + xs = xt + _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1] + xs = xs.reshape([-1, _h, _w, _t, _m]).transpose( + (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m]) + xs = paddle.concat((cls_token, xs), axis=1) + res_spatial = self.drop_path(self.attn(self.norm1(xs))) + + # Taking care of CLS token + cls_token = res_spatial[:, 0, :] + _, _t, _m = B, T, cls_token.shape[-1] + cls_token = cls_token.reshape([-1, _t, _m]) + # averaging for every frame + cls_token = paddle.mean(cls_token, axis=1, keepdim=True) + + res_spatial = res_spatial[:, 1:, :] + _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1] + res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose( + (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m]) + + res = res_spatial + x = xt + x = paddle.concat((init_cls_token, x), axis=1) + paddle.concat( + (cls_token, res), axis=1) + + # Mlp + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + else: + raise NotImplementedError + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
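+ # merge the temporal dimension into the batch: [B, C, T, H, W] -> [B*T, C, H, W] for per-frame patch embedding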
+ x = x.transpose((0, 2, 1, 3, 4)) + x = x.reshape([-1, C, H, W]) + x = self.proj(x) + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) + return x, T, W + + +@BACKBONES.register() +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + **args): + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), + default_initializer=zeros_) + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, + embed_dim), + default_initializer=zeros_) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter(shape=(1, num_seg, + embed_dim), + default_initializer=zeros_) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + attention_type=self.attention_type) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + new_pos_embed = F.interpolate(other_pos_embed, + size=(H, 
W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = x + new_pos_embed + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + # Time Embeddings + if self.attention_type != 'space_only': + cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split( + T)[0].index_select(paddle.to_tensor([0]), axis=1) + x = x[:, 1:] + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]).transpose( + (0, 2, 1, 3)).reshape([-1, _t, _m]) + # Resizing time embeddings in case they don't match + time_interp = (T != self.time_embed.shape[1]) + if time_interp: # T' != T + time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(T, x.shape[-1]), + mode='nearest').squeeze(0) + new_time_embed = new_time_embed.transpose((0, 2, 1)) + x = x + new_time_embed + else: + x = x + self.time_embed + + x = self.time_drop(x) + _, _t, _m = x.shape + x = x.reshape([-1, W * W * T, _m]) + x = paddle.concat((cls_tokens, x), axis=1) + + # Attention blocks + for blk in self.blocks: + x = blk(x, B, T, W) + + # Predictions for space-only baseline + if self.attention_type == 'space_only': + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]) + x = paddle.mean(x, 1) # averaging predictions for every frame + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py b/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py new file mode 100644 index 000000000..a20af30f1 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/vit_tweaks.py @@ -0,0 +1,515 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant +from paddle.regularizer import L2Decay + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer_tweaks'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def rand_bbox(size, lam): + """ rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. - lam) + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + wd_bias=True, + lr_mult=1.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + wd_bias=True, + lr_mult=1.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + attention_type='divided_space_time', + wd_bias=True, + lr_mult=1.0): + + super().__init__() + self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + + # Temporal Attention Parameters + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = 
norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + self.temporal_fc = nn.Linear(dim, dim) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + + def forward(self, x, B, T, W): + num_spatial_tokens = (x.shape[1] - 1) // T + H = num_spatial_tokens // W + if self.attention_type in ['space_only', 'joint_space_time']: + x = paddle.add(x, self.drop_path(self.attn(self.norm1(x)))) + x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x)))) + return x + elif self.attention_type == 'divided_space_time': + ########## Temporal ########## + xt = x[:, 1:, :] + _, _, _, _t, _m = B, H, W, T, xt.shape[-1] + xt = xt.reshape([-1, _t, _m]) + + res_temporal = self.drop_path( + self.temporal_attn(self.temporal_norm1(xt))) + + _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1] + res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m]) + + res_temporal = self.temporal_fc(res_temporal) + xt = paddle.add(x[:, 1:, :], res_temporal) + + ########## Spatial ########## + init_cls_token = x[:, 0, :].unsqueeze(1) + cls_token = init_cls_token.tile((1, T, 1)) + _b, _t, _m = cls_token.shape + cls_token = cls_token.reshape([-1, _m]).unsqueeze(1) + + xs = xt + _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1] + xs = xs.reshape([-1, _h, _w, _t, _m]).transpose( + (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m]) + xs = paddle.concat((cls_token, xs), axis=1) + res_spatial = self.drop_path(self.attn(self.norm1(xs))) + + # Taking care of CLS token + cls_token = res_spatial[:, 0, :] + _, _t, _m = B, T, cls_token.shape[-1] + cls_token = cls_token.reshape([-1, _t, _m]) + # averaging for every frame + cls_token = paddle.mean(cls_token, axis=1, keepdim=True) + + res_spatial = res_spatial[:, 1:, :] + _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1] + res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose( + (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m]) + + res = res_spatial + x = xt + x = paddle.add(paddle.concat((init_cls_token, x), axis=1), + paddle.concat((cls_token, res), axis=1)) + # Mlp + x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x)))) + return x + else: + raise NotImplementedError + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + wd_bias=True, + lr_mult=1.0): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == 
self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = x.transpose((0, 2, 1, 3, 4)) # [B,T,C,H,W] + x = x.reshape([-1, C, H, W]) # [BT,C,H,W] + x = self.proj(x) # [BT,F,nH,nW] + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) # [BT,F,nHnW] + return x, T, W + + +@BACKBONES.register() +class VisionTransformer_tweaks(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + wd_bias=True, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + **args): + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.lr_mult_list = lr_mult_list + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + wd_bias=wd_bias, + lr_mult=self.lr_mult_list[0]) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter( + shape=(1, num_seg, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + attention_type=self.attention_type, + wd_bias=wd_bias, + lr_mult=self.lr_mult_list[(i // 4) + 1]) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + elif self.pretrained is None or self.pretrained.strip() == "": + pass + else: + raise NotImplementedError + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def 
forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + new_pos_embed = F.interpolate(other_pos_embed, + size=(H, W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = paddle.add(x, new_pos_embed) + else: + x = paddle.add(x, self.pos_embed) + + x = self.pos_drop(x) + + # Time Embeddings + if self.attention_type != 'space_only': + cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split( + T)[0].index_select(paddle.to_tensor([0]), axis=1) + x = x[:, 1:] + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]).transpose( + (0, 2, 1, 3)).reshape([-1, _t, _m]) + # Resizing time embeddings in case they don't match + time_interp = (T != self.time_embed.shape[1]) + if time_interp: # T' != T + time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(T, x.shape[-1]), + mode='nearest').squeeze(0) + new_time_embed = new_time_embed.transpose((0, 2, 1)) + x = paddle.add(x, new_time_embed) + else: + x = paddle.add(x, self.time_embed) + + x = self.time_drop(x) + _, _t, _m = x.shape + x = x.reshape([-1, W * W * T, _m]) + x = paddle.concat((cls_tokens, x), axis=1) + + # Attention blocks + for blk in self.blocks: + x = blk(x, B, T, W) + + # Predictions for space-only baseline + if self.attention_type == 'space_only': + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]) + x = paddle.mean(x, 1) # averaging predictions for every frame + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/docs/src/paddlevideo/modeling/backbones/yowo.py b/docs/src/paddlevideo/modeling/backbones/yowo.py new file mode 100644 index 000000000..5e6b88d57 --- /dev/null +++ b/docs/src/paddlevideo/modeling/backbones/yowo.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
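+ # YOWO fuses the two backbones built below: a 2D Darknet over the last frame of the + # clip (425-channel output) and a 3D ResNext101 over the whole clip (2048 channels, + # whose temporal axis is expected to collapse to 1 before being squeezed). The + # concatenated 2473-channel map passes through CFAMBlock and a final 1x1 conv with + # 5 * (num_class + 4 + 1) output channels, which appears to follow the YOLOv2-style + # layout of 5 anchors x (class scores + 4 box offsets + 1 objectness). + # Minimal usage sketch (illustrative shapes only, assuming a 16-frame RGB clip): + #   model = YOWO(num_class=24)                 # e.g. UCF101-24 + #   model.init_weights() + #   clip = paddle.rand([2, 3, 16, 224, 224])   # [N, C, T, H, W] + #   out = model(clip)                          # [2, 5 * (24 + 4 + 1), H', W']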
+ +from ..registry import BACKBONES +from .darknet import Darknet +from .resnext101 import ResNext101 +import paddle.nn as nn +import paddle + + +class CAM_Module(nn.Layer): + def __init__(self, in_dim): + super(CAM_Module, self).__init__() + self.chanel_in = in_dim + temp = paddle.zeros([1], dtype='float32') + self.gamma = paddle.create_parameter(shape=temp.shape, dtype=str(temp.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(temp)) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x): + m_batchsize, C, height, width = x.shape + proj_query = paddle.reshape(x, [m_batchsize, C, -1]) + proj_key = paddle.transpose(paddle.reshape( + x, [m_batchsize, C, -1]), perm=[0, 2, 1]) + energy = paddle.bmm(proj_query, proj_key) + energy_new = paddle.expand_as(paddle.max( + energy, axis=-1, keepdim=True), energy) - energy + attention = self.softmax(energy_new) + proj_value = paddle.reshape(x, [m_batchsize, C, -1]) + + out = paddle.bmm(attention, proj_value) + out = out.reshape([m_batchsize, C, height, width]) + out = self.gamma * out + x + return out + + +class CFAMBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super(CFAMBlock, self).__init__() + inter_channels = 1024 + self.conv_bn_relu1 = nn.Sequential(nn.Conv2D(in_channels, inter_channels, kernel_size=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + self.conv_bn_relu2 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + + self.sc = CAM_Module(inter_channels) + + self.conv_bn_relu3 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + self.conv_out = nn.Sequential(nn.Dropout2D(0.1), nn.Conv2D( + inter_channels, out_channels, 1, bias_attr=True)) + + def forward(self, x): + x = self.conv_bn_relu1(x) + x = self.conv_bn_relu2(x) + x = self.sc(x) + x = self.conv_bn_relu3(x) + output = self.conv_out(x) + + return output + + +@BACKBONES.register() +class YOWO(nn.Layer): + def __init__(self, num_class, pretrained_2d=None, pretrained_3d=None): + super(YOWO, self).__init__() + + self.pretrained_2d = pretrained_2d + self.pretrained_3d = pretrained_3d + self.backbone_2d = Darknet() + self.backbone_3d = ResNext101() + self.num_ch_2d = 425 + self.num_ch_3d = 2048 + self.num_class = num_class + self.cfam = CFAMBlock(self.num_ch_2d + self.num_ch_3d, 1024) + self.conv_final = nn.Conv2D( + 1024, 5 * (self.num_class + 4 + 1), kernel_size=1, bias_attr=False) + self.seen = 0 + + def init_weights(self): + if self.pretrained_2d is not None: + self.backbone_2d = self.load_pretrain_weight( + self.backbone_2d, self.pretrained_2d) + if self.pretrained_3d is not None: + self.backbone_3d = self.load_pretrain_weight( + self.backbone_3d, self.pretrained_3d) + + def load_pretrain_weight(self, model, weights_path): + model_dict = model.state_dict() + + param_state_dict = paddle.load(weights_path) + ignore_weights = set() + + # hack: fit for faster rcnn. Pretrain weights contain prefix of 'backbone' + # while res5 module is located in bbox_head.head. Replace the prefix of + # res5 with 'bbox_head.head' to load pretrain weights correctly. 
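+ # After the optional res5 renaming below, any pretrained tensor whose name is missing + # from this model or whose shape does not match is collected in ignore_weights and + # dropped before set_dict() is applied.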
+ for k in list(param_state_dict.keys()): + if 'backbone.res5' in k: + new_k = k.replace('backbone', 'bbox_head.head') + if new_k in model_dict.keys(): + value = param_state_dict.pop(k) + param_state_dict[new_k] = value + + for name, weight in param_state_dict.items(): + if name in model_dict.keys(): + if list(weight.shape) != list(model_dict[name].shape): + print( + '{} not used, shape {} unmatched with {} in model.'.format( + name, weight.shape, list(model_dict[name].shape))) + ignore_weights.add(name) + else: + print('Redundant weight {} and ignore it.'.format(name)) + ignore_weights.add(name) + + for weight in ignore_weights: + param_state_dict.pop(weight, None) + + model.set_dict(param_state_dict) + print('Finish loading model weights: {}'.format(weights_path)) + return model + + def forward(self, input): + x_3d = input # Input clip + x_2d = input[:, :, -1, :, :] # Last frame of the clip that is read + + x_2d = self.backbone_2d(x_2d) + + x_3d = self.backbone_3d(x_3d) + + x_3d = paddle.squeeze(x_3d, axis=2) + + x = paddle.concat([x_3d, x_2d], axis=1) + x = self.cfam(x) + out = self.conv_final(x) + + return out diff --git a/docs/src/paddlevideo/modeling/bbox_utils.py b/docs/src/paddlevideo/modeling/bbox_utils.py new file mode 100644 index 000000000..23b4555b4 --- /dev/null +++ b/docs/src/paddlevideo/modeling/bbox_utils.py @@ -0,0 +1,528 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
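+ # The helpers below work on axis-aligned boxes in (x1, y1, x2, y2) form, plus a set of + # rotated-box (x_ctr, y_ctr, w, h, angle) utilities. bbox2delta/delta2bbox form a + # matching encode/decode pair; both take `weights` in (wx, wy, ww, wh) order. + # Minimal round-trip sketch (illustrative values only): + #   src = paddle.to_tensor([[0., 0., 10., 10.]]) + #   tgt = paddle.to_tensor([[1., 1., 11., 11.]]) + #   d = bbox2delta(src, tgt, weights=[1., 1., 1., 1.])    # [1, 4] deltas + #   back = delta2bbox(d, src, weights=[1., 1., 1., 1.])   # recovers tgt, shape [1, 1, 4]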
+ +import math +import paddle +import paddle.nn.functional as F +import math +import numpy as np + + +def bbox2delta(src_boxes, tgt_boxes, weights): + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + wx, wy, ww, wh = weights + dx = wx * (tgt_ctr_x - src_ctr_x) / src_w + dy = wy * (tgt_ctr_y - src_ctr_y) / src_h + dw = ww * paddle.log(tgt_w / src_w) + dh = wh * paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + return deltas + + +def delta2bbox(deltas, boxes, weights): + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + # Prevent sending too large values into paddle.exp() + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + + pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) + pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + return pred_boxes + + +def expand_bbox(bboxes, scale): + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) + bboxes_exp[:, 0] = x_c - w_half + bboxes_exp[:, 2] = x_c + w_half + bboxes_exp[:, 1] = y_c - h_half + bboxes_exp[:, 3] = y_c + h_half + + return bboxes_exp + + +def clip_bbox(boxes, im_shape): + h, w = im_shape[0], im_shape[1] + x1 = boxes[:, 0].clip(0, w) + y1 = boxes[:, 1].clip(0, h) + x2 = boxes[:, 2].clip(0, w) + y2 = boxes[:, 3].clip(0, h) + return paddle.stack([x1, y1, x2, y2], axis=1) + + +def nonempty_bbox(boxes, min_size=0, return_mask=False): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + mask = paddle.logical_and(w > min_size, w > min_size) + if return_mask: + return mask + keep = paddle.nonzero(mask).flatten() + return keep + + +def bbox_area(boxes): + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def bbox_overlaps(boxes1, boxes2): + """ + Calculate overlaps between boxes1 and boxes2 + + Args: + boxes1 (Tensor): boxes with shape [M, 4] + boxes2 (Tensor): boxes with shape [N, 4] + + Return: + overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] + """ + area1 = bbox_area(boxes1) + area2 = bbox_area(boxes2) + + xy_max = paddle.minimum( + paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) + xy_min = paddle.maximum( + paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) + width_height = xy_max - xy_min + width_height = width_height.clip(min=0) + inter = width_height.prod(axis=2) + + overlaps = paddle.where(inter > 0, inter / + (paddle.unsqueeze(area1, 1) + area2 - inter), 
+ paddle.zeros_like(inter)) + return overlaps + + +def xywh2xyxy(box): + x, y, w, h = box + x1 = x - w * 0.5 + y1 = y - h * 0.5 + x2 = x + w * 0.5 + y2 = y + h * 0.5 + return [x1, y1, x2, y2] + + +def make_grid(h, w, dtype): + yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) + return paddle.stack((xv, yv), 2).cast(dtype=dtype) + + +def decode_yolo(box, anchor, downsample_ratio): + """decode yolo box + + Args: + box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + anchor (list): anchor with the shape [na, 2] + downsample_ratio (int): downsample ratio, default 32 + scale (float): scale, default 1. + + Return: + box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] + """ + x, y, w, h = box + na, grid_h, grid_w = x.shape[1:4] + grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) + x1 = (x + grid[:, :, :, :, 0:1]) / grid_w + y1 = (y + grid[:, :, :, :, 1:2]) / grid_h + + anchor = paddle.to_tensor(anchor) + anchor = paddle.cast(anchor, x.dtype) + anchor = anchor.reshape((1, na, 1, 1, 2)) + w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) + h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) + + return [x1, y1, w1, h1] + + +def iou_similarity(box1, box2, eps=1e-9): + """Calculate iou of box1 and box2 + + Args: + box1 (Tensor): box with the shape [N, M1, 4] + box2 (Tensor): box with the shape [N, M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] + """ + box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] + gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union + + +def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): + """calculate the iou of box1 and box2 + + Args: + box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + giou (bool): whether use giou or not, default False + diou (bool): whether use diou or not, default False + ciou (bool): whether use ciou or not, default False + eps (float): epsilon to avoid divide by zero + + Return: + iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] + """ + px1, py1, px2, py2 = box1 + gx1, gy1, gx2, gy2 = box2 + x1 = paddle.maximum(px1, gx1) + y1 = paddle.maximum(py1, gy1) + x2 = paddle.minimum(px2, gx2) + y2 = paddle.minimum(py2, gy2) + + overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) + + area1 = (px2 - px1) * (py2 - py1) + area1 = area1.clip(0) + + area2 = (gx2 - gx1) * (gy2 - gy1) + area2 = area2.clip(0) + + union = area1 + area2 - overlap + eps + iou = overlap / union + + if giou or ciou or diou: + # convex w, h + cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) + ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) + if giou: + c_area = cw * ch + eps + return iou - (c_area - union) / c_area + else: + # convex diagonal squared + c2 = cw**2 + ch**2 + eps + # center distance + rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 + if diou: + return iou - rho2 / c2 + else: + w1, h1 = px2 - px1, py2 - py1 + eps + w2, h2 = gx2 - gx1, gy2 - gy1 + eps + delta = paddle.atan(w1 / h1) - 
paddle.atan(w2 / h2) + v = (4 / math.pi**2) * paddle.pow(delta, 2) + alpha = v / (1 + eps - iou + v) + alpha.stop_gradient = True + return iou - (rho2 / c2 + v * alpha) + else: + return iou + + +def rect2rbox(bboxes): + """ + :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax) + :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle) + """ + bboxes = bboxes.reshape(-1, 4) + num_boxes = bboxes.shape[0] + + x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0 + y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0 + edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0]) + edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1]) + angles = np.zeros([num_boxes], dtype=bboxes.dtype) + + inds = edges1 < edges2 + + rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1) + rboxes[inds, 2] = edges2[inds] + rboxes[inds, 3] = edges1[inds] + rboxes[inds, 4] = np.pi / 2.0 + return rboxes + + +def delta2rbox(Rrois, + deltas, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1], + wh_ratio_clip=1e-6): + """ + :param Rrois: (cx, cy, w, h, theta) + :param deltas: (dx, dy, dw, dh, dtheta) + :param means: + :param stds: + :param wh_ratio_clip: + :return: + """ + means = paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]]) + denorm_deltas = deltas * stds + means + + dx = denorm_deltas[:, 0] + dy = denorm_deltas[:, 1] + dw = denorm_deltas[:, 2] + dh = denorm_deltas[:, 3] + dangle = denorm_deltas[:, 4] + + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) + dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) + + Rroi_x = Rrois[:, 0] + Rroi_y = Rrois[:, 1] + Rroi_w = Rrois[:, 2] + Rroi_h = Rrois[:, 3] + Rroi_angle = Rrois[:, 4] + + gx = dx * Rroi_w * paddle.cos(Rroi_angle) - dy * Rroi_h * paddle.sin( + Rroi_angle) + Rroi_x + gy = dx * Rroi_w * paddle.sin(Rroi_angle) + dy * Rroi_h * paddle.cos( + Rroi_angle) + Rroi_y + gw = Rroi_w * dw.exp() + gh = Rroi_h * dh.exp() + ga = np.pi * dangle + Rroi_angle + ga = (ga + np.pi / 4) % np.pi - np.pi / 4 + ga = paddle.to_tensor(ga) + + gw = paddle.to_tensor(gw, dtype='float32') + gh = paddle.to_tensor(gh, dtype='float32') + bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1) + return bboxes + + +def rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]): + """ + + Args: + proposals: + gt: + means: 1x5 + stds: 1x5 + + Returns: + + """ + proposals = proposals.astype(np.float64) + + PI = np.pi + + gt_widths = gt[..., 2] + gt_heights = gt[..., 3] + gt_angle = gt[..., 4] + + proposals_widths = proposals[..., 2] + proposals_heights = proposals[..., 3] + proposals_angle = proposals[..., 4] + + coord = gt[..., 0:2] - proposals[..., 0:2] + dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4]) + * coord[..., 1]) / proposals_widths + dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4]) + * coord[..., 1]) / proposals_heights + dw = np.log(gt_widths / proposals_widths) + dh = np.log(gt_heights / proposals_heights) + da = (gt_angle - proposals_angle) + + da = (da + PI / 4) % PI - PI / 4 + da /= PI + + deltas = np.stack([dx, dy, dw, dh, da], axis=-1) + means = np.array(means, dtype=deltas.dtype) + stds = np.array(stds, dtype=deltas.dtype) + deltas = (deltas - means) / stds + deltas = deltas.astype(np.float32) + return deltas + + +def bbox_decode(bbox_preds, + anchors, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1]): + """decode bbox from deltas + Args: + bbox_preds: [N,H,W,5] + anchors: [H*W,5] + return: + bboxes: [N,H,W,5] + """ + means = 
paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + num_imgs, H, W, _ = bbox_preds.shape + bboxes_list = [] + for img_id in range(num_imgs): + bbox_pred = bbox_preds[img_id] + # bbox_pred.shape=[5,H,W] + bbox_delta = bbox_pred + anchors = paddle.to_tensor(anchors) + bboxes = delta2rbox( + anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6) + bboxes = paddle.reshape(bboxes, [H, W, 5]) + bboxes_list.append(bboxes) + return paddle.stack(bboxes_list, axis=0) + + +def poly_to_rbox(polys): + """ + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + to + rotated_boxes:[x_ctr,y_ctr,w,h,angle] + """ + rotated_boxes = [] + for poly in polys: + poly = np.array(poly[:8], dtype=np.float32) + + pt1 = (poly[0], poly[1]) + pt2 = (poly[2], poly[3]) + pt3 = (poly[4], poly[5]) + pt4 = (poly[6], poly[7]) + + edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[ + 1]) * (pt1[1] - pt2[1])) + edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[ + 1]) * (pt2[1] - pt3[1])) + + width = max(edge1, edge2) + height = min(edge1, edge2) + + rbox_angle = 0 + if edge1 > edge2: + rbox_angle = np.arctan2( + np.float(pt2[1] - pt1[1]), np.float(pt2[0] - pt1[0])) + elif edge2 >= edge1: + rbox_angle = np.arctan2( + np.float(pt4[1] - pt1[1]), np.float(pt4[0] - pt1[0])) + + def norm_angle(angle, range=[-np.pi / 4, np.pi]): + return (angle - range[0]) % range[1] + range[0] + + rbox_angle = norm_angle(rbox_angle) + + x_ctr = np.float(pt1[0] + pt3[0]) / 2 + y_ctr = np.float(pt1[1] + pt3[1]) / 2 + rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle]) + rotated_boxes.append(rotated_box) + ret_rotated_boxes = np.array(rotated_boxes) + assert ret_rotated_boxes.shape[1] == 5 + return ret_rotated_boxes + + +def cal_line_length(point1, point2): + import math + return math.sqrt( + math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) + + +def get_best_begin_point_single(coordinate): + x1, y1, x2, y2, x3, y3, x4, y4 = coordinate + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + xmax = max(x1, x2, x3, x4) + ymax = max(y1, y2, y3, y4) + combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], + [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], + [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], + [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] + dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] + force = 100000000.0 + force_flag = 0 + for i in range(4): + temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ + + cal_line_length(combinate[i][1], dst_coordinate[1]) \ + + cal_line_length(combinate[i][2], dst_coordinate[2]) \ + + cal_line_length(combinate[i][3], dst_coordinate[3]) + if temp_force < force: + force = temp_force + force_flag = i + if force_flag != 0: + pass + return np.array(combinate[force_flag]).reshape(8) + + +def rbox2poly_single(rrect): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + # rect 2x4 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + # poly + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + return poly + + +def rbox2poly(rrects): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + polys = 
[] + for rrect in rrects: + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + polys.append(poly) + polys = np.array(polys) + return polys diff --git a/docs/src/paddlevideo/modeling/builder.py b/docs/src/paddlevideo/modeling/builder.py new file mode 100644 index 000000000..71503eb4d --- /dev/null +++ b/docs/src/paddlevideo/modeling/builder.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT, SEGMENTERS +from ..utils import build +from .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS, + DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES, + MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) + + +def build_backbone(cfg): + """Build backbone.""" + return build(cfg, BACKBONES) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_head(cfg): + """Build head.""" + return build(cfg, HEADS) + + +def build_loss(cfg): + """Build loss.""" + return build(cfg, LOSSES) + + +def build_recognizer(cfg): + """Build recognizer.""" + return build(cfg, RECOGNIZERS, key='framework') + + +def build_segmenter(cfg): + """Build segmenter.""" + return build(cfg, SEGMENTERS, key='framework') + + +def build_localizer(cfg): + """Build localizer.""" + return build(cfg, LOCALIZERS, key='framework') + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + return build(cfg, DETECTORS, key='framework') + + +def build_partitioner(cfg): + """Build partitioner.""" + return build(cfg, PARTITIONERS, key='framework') + + +def build_estimator(cfg): + """Build estimator.""" + return build(cfg, ESTIMATORS, key='framework') + + +def build_multimodal(cfg): + """Build multimodal.""" + return build(cfg, MULTIMODAL, key='framework') + + +def build_segment(cfg): + """Build segment.""" + return 
build(cfg, SEGMENT, key='framework') + + +def build_model(cfg): + cfg_copy = cfg.copy() + framework_type = cfg_copy.get('framework') + if framework_type in RECOGNIZERS: + return build_recognizer(cfg) + elif framework_type in LOCALIZERS: + return build_localizer(cfg) + elif framework_type in PARTITIONERS: + return build_partitioner(cfg) + elif framework_type in DETECTORS: + return build_detector(cfg) + elif framework_type in ESTIMATORS: + return build_estimator(cfg) + elif framework_type in MULTIMODAL: + return build_multimodal(cfg) + elif framework_type in SEGMENTERS: + return build_segmenter(cfg) + elif framework_type in SEGMENT: + return build_segment(cfg) + else: + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/__init__.py b/docs/src/paddlevideo/modeling/framework/__init__.py new file mode 100644 index 000000000..d68fe09ac --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .estimators import BaseEstimator, DepthEstimator +from .localizers import BaseLocalizer, BMNLocalizer +from .partitioners import BasePartitioner, TransNetV2Partitioner +from .recognizers import BaseRecognizer, Recognizer2D +from .multimodal import ActBert, BaseMultimodal +from .segment import BaseSegment, CFBI +from .segmenters import MSTCN + +__all__ = [ + 'BaseRecognizer', 'Recognizer2D', 'BaseLocalizer', 'BMNLocalizer', + 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator', + 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI', + 'MSTCN' +] diff --git a/docs/src/paddlevideo/modeling/framework/detectors/__init__.py b/docs/src/paddlevideo/modeling/framework/detectors/__init__.py new file mode 100644 index 000000000..74dcac0a3 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/detectors/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .base import BaseDetector +from .fast_rcnn import FastRCNN +from .two_stage import TwoStageDetector + +__all__ = ['BaseDetector', 'TwoStageDetector', 'FastRCNN'] diff --git a/docs/src/paddlevideo/modeling/framework/detectors/base.py b/docs/src/paddlevideo/modeling/framework/detectors/base.py new file mode 100644 index 000000000..4d5ccb8fe --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/detectors/base.py @@ -0,0 +1,51 @@ +from abc import abstractmethod +from ... 
import builder +import paddle.nn as nn +from ...registry import DETECTORS + +@DETECTORS.register() +class BaseDetector(nn.Layer): + """Base class for detectors. """ + def __init__(self, backbone=None, head=None): + + super().__init__() + + def init_weights(self): + """Initialize the model network weights. """ + self.backbone.init_weights() + self.head.init_weights() + + def extract_feature(self, imgs, iter_num): + """Extract features through a backbone. """ + feature = self.backbone(imgs) + return feature + + def forward(self, data_batch, mode='infer'): + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/detectors/fast_rcnn.py b/docs/src/paddlevideo/modeling/framework/detectors/fast_rcnn.py new file mode 100644 index 000000000..e8f912dbe --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/detectors/fast_rcnn.py @@ -0,0 +1,34 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .two_stage import TwoStageDetector +from ...registry import DETECTORS + +@DETECTORS.register() +class FastRCNN(TwoStageDetector): + + def __init__(self, + backbone, + head=None, + train_cfg=None, + test_cfg=None, + neck=None, + pretrained=None): + super(FastRCNN, self).__init__( + backbone=backbone, + neck=neck, + roi_head=head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py b/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py new file mode 100644 index 000000000..f9deb1d0f --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/detectors/two_stage.py @@ -0,0 +1,186 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from ... 
import builder +import paddle.distributed as dist +from ...registry import DETECTORS +from .base import BaseDetector + + +@DETECTORS.register() +class TwoStageDetector(BaseDetector): + """Base class for two-stage detectors. """ + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(TwoStageDetector, self).__init__() + self.backbone = builder.build_backbone(backbone) + + if neck is not None: + self.neck = neck # useless + + if rpn_head is not None: + rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None + rpn_head_ = rpn_head.copy() + rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) + self.rpn_head = builder.build_head(rpn_head_) + + if roi_head is not None: + self.roi_head = builder.build_head(roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is not None: + self.init_weights(pretrained=pretrained) + + @property + def with_rpn(self): + """whether the detector has RPN""" + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + @property + def with_roi_head(self): + """whether the detector has a RoI head""" + return hasattr(self, 'roi_head') and self.roi_head is not None + + def init_weights(self, pretrained=None): + """Initialize the weights in detector. """ + super(TwoStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_rpn: + self.rpn_head.init_weights() + if self.with_roi_head: + self.roi_head.init_weights(pretrained) + + def extract_feat(self, img): + """Directly extract features from the backbone.""" + x = self.backbone(img) + return x + + def train_step(self, data, **kwargs): + img_slow = data[0] + img_fast = data[1] + proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas( + data) + img_shape = data[7] + img_idx = data[8] + img_metas = scores, entity_ids + x = self.extract_feat(img=[img_slow, img_fast]) + roi_losses = self.roi_head.train_step(x, img_metas, proposals, + gt_bboxes, gt_labels, **kwargs) + losses = dict() + losses.update(roi_losses) + + return losses + + def val_step(self, data, rescale=False): + img_slow = data[0] + img_fast = data[1] + proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas( + data) + img_shape = data[7] + img_metas = scores, entity_ids + x = self.extract_feat(img=[img_slow, img_fast]) + + return self.roi_head.simple_test(x, + proposals[0], + img_shape, + rescale=rescale) + + def test_step(self, data, rescale=False): + return self.val_step(data, rescale) + + def infer_step(self, data, rescale=False): + ''' model inference''' + + img_slow = data[0] + img_fast = data[1] + proposals = data[2] + img_shape = data[3] + + # using slowfast model to extract spatio-temporal features + x = self.extract_feat(img=[img_slow, img_fast]) + + ret = self.roi_head.simple_test(x, + proposals[0], + img_shape, + rescale=rescale) + return ret + + def get_unpad_datas(self, data): + ''' get original datas padded in dataset ''' + pad_proposals = data[2] + pad_gt_bboxes = data[3] + pad_gt_labels = data[4] + pad_scores, pad_entity_ids = data[5], data[6] + len_proposals = data[9] + len_gt_bboxes = data[10] + len_gt_labels = data[11] + len_scores = data[12] + len_entity_ids = data[13] + N = pad_proposals.shape[0] + proposals = [] + gt_bboxes = [] + gt_labels = [] + scores = [] + entity_ids = [] + for bi in range(N): + pad_proposal = pad_proposals[bi] + len_proposal = len_proposals[bi] + index_proposal = 
paddle.arange(len_proposal) + proposal = paddle.index_select(x=pad_proposal, + index=index_proposal, + axis=0) + proposals.append(proposal) + + pad_gt_bbox = pad_gt_bboxes[bi] + len_gt_bbox = len_gt_bboxes[bi] + index_gt_bbox = paddle.arange(len_gt_bbox) + gt_bbox = paddle.index_select(x=pad_gt_bbox, + index=index_gt_bbox, + axis=0) + gt_bboxes.append(gt_bbox) + + pad_gt_label = pad_gt_labels[bi] + len_gt_label = len_gt_labels[bi] + index_gt_label = paddle.arange(len_gt_label) + gt_label = paddle.index_select(x=pad_gt_label, + index=index_gt_label, + axis=0) + gt_labels.append(gt_label) + + pad_score = pad_scores[bi] + len_score = len_scores[bi] + index_score = paddle.arange(len_score) + score = paddle.index_select(x=pad_score, index=index_score, axis=0) + scores.append(score) + + pad_entity_id = pad_entity_ids[bi] + len_entity_id = len_entity_ids[bi] + index_entity_id = paddle.arange(len_entity_id) + entity_id = paddle.index_select(x=pad_entity_id, + index=index_entity_id, + axis=0) + entity_ids.append(entity_id) + + return proposals, gt_bboxes, gt_labels, scores, entity_ids diff --git a/docs/src/paddlevideo/modeling/framework/estimators/__init__.py b/docs/src/paddlevideo/modeling/framework/estimators/__init__.py new file mode 100644 index 000000000..e2bda935c --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/estimators/__init__.py @@ -0,0 +1,4 @@ +from .base import BaseEstimator +from .depth_estimator import DepthEstimator + +__all__ = ['DepthEstimator', 'BaseEstimator'] diff --git a/docs/src/paddlevideo/modeling/framework/estimators/base.py b/docs/src/paddlevideo/modeling/framework/estimators/base.py new file mode 100644 index 000000000..cdddd674f --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/estimators/base.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod + +import paddle +import paddle.nn as nn +from paddlevideo.modeling.registry import ESTIMATORS +from paddlevideo.utils import get_logger + +from ... import builder + +logger = get_logger("paddlevideo") + + +@ESTIMATORS.register() +class BaseEstimator(nn.Layer): + """BaseEstimator + + """ + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. 
Console of train, valid, test or infer step + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch): + """Define how the model is going to valid, from input to output.""" + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/estimators/depth_estimator.py b/docs/src/paddlevideo/modeling/framework/estimators/depth_estimator.py new file mode 100644 index 000000000..13ee87775 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/estimators/depth_estimator.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +from paddlevideo.modeling.framework.estimators.base import BaseEstimator +from paddlevideo.modeling.registry import ESTIMATORS +from paddlevideo.utils import get_logger + +from ... import builder + +logger = get_logger("paddlevideo") + + +@ESTIMATORS.register() +class DepthEstimator(BaseEstimator): + """DepthEstimator + """ + def forward_net(self, inputs, day_or_night='day_and_night'): + if self.backbone is not None: + outputs = self.backbone(inputs, day_or_night) + else: + outputs = inputs + return outputs + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + inputs, _ = data_batch + outputs = self.forward_net(inputs, day_or_night='day_and_night') + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def val_step(self, data_batch): + inputs, day_or_night = data_batch + outputs = self.forward_net(inputs, day_or_night=day_or_night) + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + inputs, day_or_night = data_batch + outputs = self.forward_net(inputs, day_or_night=day_or_night) + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + inputs = data_batch[0] + outputs = self.forward_net(inputs, day_or_night='day') + return outputs diff --git a/docs/src/paddlevideo/modeling/framework/localizers/__init__.py b/docs/src/paddlevideo/modeling/framework/localizers/__init__.py new file mode 100644 index 000000000..323a72c84 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/localizers/__init__.py @@ -0,0 +1,19 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from .base import BaseLocalizer +from .bmn_localizer import BMNLocalizer +from .yowo_localizer import YOWOLocalizer + +__all__ = ['BaseLocalizer', 'BMNLocalizer', 'YOWOLocalizer'] diff --git a/docs/src/paddlevideo/modeling/framework/localizers/base.py b/docs/src/paddlevideo/modeling/framework/localizers/base.py new file mode 100644 index 000000000..cfd2869f6 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/localizers/base.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle.nn as nn +from ... import builder + + +class BaseLocalizer(nn.Layer): + """Base class for Localization. + All localizer should subclass it. + All subclass should overwrite: + - Methods:``train_step``, define your train step. + - Methods:``valid_step``, define your valid step, always the same as train_step. + - Methods:``test_step``, define your test step. + """ + def __init__(self, backbone, loss): + super().__init__() + self.backbone = builder.build_backbone(backbone) + self.loss = builder.build_loss(loss) + self.init_weights() + + def init_weights(self): + """Initialize the model network weights. 
""" + if getattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + pass + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating setp. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Tets setp. to get acc in test data. input_data_batch -> output + """ + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/localizers/bmn_localizer.py b/docs/src/paddlevideo/modeling/framework/localizers/bmn_localizer.py new file mode 100644 index 000000000..5afbd3a0c --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/localizers/bmn_localizer.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import LOCALIZERS +from .base import BaseLocalizer + +import paddle + + +@LOCALIZERS.register() +class BMNLocalizer(BaseLocalizer): + """BMN Localization framework + """ + def forward_net(self, imgs): + """Call backbone forward. + """ + preds = self.backbone(imgs) + return preds + + def train_step(self, data_batch): + """Training step. + """ + x_data = data_batch[0] + gt_iou_map = data_batch[1] + gt_start = data_batch[2] + gt_end = data_batch[3] + gt_iou_map.stop_gradient = True + gt_start.stop_gradient = True + gt_end.stop_gradient = True + + # call Model forward + pred_bm, pred_start, pred_end = self.forward_net(x_data) + # call Loss forward + loss = self.loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end) + avg_loss = paddle.mean(loss) + loss_metrics = dict() + loss_metrics['loss'] = avg_loss + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Test step. 
+ """ + x_data = data_batch[0] + pred_bm, pred_start, pred_end = self.forward_net(x_data) + return pred_bm, pred_start, pred_end + + def infer_step(self, data_batch): + """Infer step + """ + x_data = data_batch[0] + + # call Model forward + pred_bm, pred_start, pred_end = self.forward_net(x_data) + return pred_bm, pred_start, pred_end diff --git a/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py b/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py new file mode 100644 index 000000000..c3613c615 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/localizers/yowo_localizer.py @@ -0,0 +1,161 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import LOCALIZERS +from .base import BaseLocalizer +from .yowo_utils import truths_length, nms, get_region_boxes, bbox_iou + + +@LOCALIZERS.register() +class YOWOLocalizer(BaseLocalizer): + """YOWO Localization framework + """ + + def forward_net(self, imgs): + """Call backbone forward. + """ + # imgs.shape=[N,C,T,H,W], for YOWO + preds = self.backbone(imgs) + return preds + + def train_step(self, data_batch): + """Training step. + """ + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + target.stop_gradient = True + + # call Model forward + out = self.forward_net(x_data) + # call Loss forward + loss, nCorrect = self.loss(out, target) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['nCorrect'] = nCorrect + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + total = 0.0 + proposals = 0.0 + correct = 0.0 + fscore = 0.0 + eps = 1e-5 + nms_thresh = 0.4 + iou_thresh = 0.5 + + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + frame_idx = data_batch[2] + target.stop_gradient = True + # call Model forward + out = self.forward_net(x_data) + all_boxes = get_region_boxes(out) + out_boxes = [] + + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + out_boxes.append(boxes) + truths = target[i].reshape([-1, 5]) + num_gts = truths_length(truths) + total = total + num_gts + pred_list = [] + for i in range(len(boxes)): + if boxes[i][4] > 0.25: + proposals = proposals + 1 + pred_list.append(i) + for i in range(num_gts): + box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]] + best_iou = 0 + best_j = -1 + for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES + iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) + if iou > best_iou: + best_j = j + best_iou = iou + if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]: + correct = correct + 1 + + precision = 1.0 * correct / (proposals + eps) + recall = 1.0 * correct / (total + eps) + fscore = 2.0 * precision * recall / (precision + recall + eps) + + outs = dict() + outs['precision'] = precision + outs['recall'] = recall + outs['fscore'] = fscore + outs['frame_idx'] = frame_idx + return outs + + def test_step(self, data_batch): + """Test step. 
+ """ + total = 0.0 + proposals = 0.0 + correct = 0.0 + fscore = 0.0 + eps = 1e-5 + nms_thresh = 0.4 + iou_thresh = 0.5 + + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + frame_idx = data_batch[2] + target.stop_gradient = True + # call Model forward + out = self.forward_net(x_data) + all_boxes = get_region_boxes(out) + out_boxes = [] + + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + out_boxes.append(boxes) + truths = target[i].reshape([-1, 5]) + num_gts = truths_length(truths) + total = total + num_gts + pred_list = [] + for i in range(len(boxes)): + if boxes[i][4] > 0.25: + proposals = proposals + 1 + pred_list.append(i) + for i in range(num_gts): + box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]] + best_iou = 0 + best_j = -1 + for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES + iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) + if iou > best_iou: + best_j = j + best_iou = iou + if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]: + correct = correct + 1 + + precision = 1.0 * correct / (proposals + eps) + recall = 1.0 * correct / (total + eps) + fscore = 2.0 * precision * recall / (precision + recall + eps) + + outs = dict() + outs['boxes'] = out_boxes + outs['precision'] = precision + outs['recall'] = recall + outs['fscore'] = fscore + outs['frame_idx'] = frame_idx + return outs + + def infer_step(self, data_batch): + """Infer step. + """ + out = self.forward_net(data_batch[0]) + return out \ No newline at end of file diff --git a/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py b/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py new file mode 100644 index 000000000..9f0e01685 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/localizers/yowo_utils.py @@ -0,0 +1,359 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import paddle +import paddle.nn as nn +import numpy as np +from builtins import range as xrange + + +def truths_length(truths): + for i in range(50): + if truths[i][1] == 0: + return i + + +def nms(boxes, nms_thresh): + if len(boxes) == 0: + return boxes + + det_confs = paddle.zeros([len(boxes)]) + for i in range(len(boxes)): + det_confs[i] = 1 - boxes[i][4] + + sortIds = paddle.argsort(det_confs) + out_boxes = [] + for i in range(len(boxes)): + box_i = boxes[sortIds[i]] + if box_i[4] > 0: + out_boxes.append(box_i) + for j in range(i + 1, len(boxes)): + box_j = boxes[sortIds[j]] + if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh: + box_j[4] = 0 + return out_boxes + + +def convert2cpu(gpu_matrix): + float_32_g = gpu_matrix.astype('float32') + return float_32_g.cpu() + + +def convert2cpu_long(gpu_matrix): + int_64_g = gpu_matrix.astype('int64') + return int_64_g.cpu() + + +def get_region_boxes(output, conf_thresh=0.005, num_classes=24, + anchors=[0.70458, 1.18803, 1.26654, 2.55121, 1.59382, + 4.08321, 2.30548, 4.94180, 3.52332, 5.91979], + num_anchors=5, only_objectness=1, validation=False): + anchor_step = len(anchors) // num_anchors + if output.dim() == 3: + output = output.unsqueeze(0) + batch = output.shape[0] + assert (output.shape[1] == (5 + num_classes) * num_anchors) + h = output.shape[2] + w = output.shape[3] + all_boxes = [] + output = paddle.reshape( + output, [batch * num_anchors, 5 + num_classes, h * w]) + output = paddle.transpose(output, (1, 0, 2)) + output = paddle.reshape( + output, [5 + num_classes, batch * num_anchors * h * w]) + + grid_x = paddle.linspace(0, w - 1, w) + grid_x = paddle.tile(grid_x, [h, 1]) + grid_x = paddle.tile(grid_x, [batch * num_anchors, 1, 1]) + grid_x = paddle.reshape(grid_x, [batch * num_anchors * h * w]).cuda() + + grid_y = paddle.linspace(0, h - 1, h) + grid_y = paddle.tile(grid_y, [w, 1]).t() + grid_y = paddle.tile(grid_y, [batch * num_anchors, 1, 1]) + grid_y = paddle.reshape(grid_y, [batch * num_anchors * h * w]).cuda() + + sigmoid = nn.Sigmoid() + xs = sigmoid(output[0]) + grid_x + ys = sigmoid(output[1]) + grid_y + + anchor_w = paddle.to_tensor(anchors) + anchor_w = paddle.reshape(anchor_w, [num_anchors, anchor_step]) + anchor_w = paddle.index_select(anchor_w, index=paddle.to_tensor( + np.array([0]).astype('int32')), axis=1) + + anchor_h = paddle.to_tensor(anchors) + anchor_h = paddle.reshape(anchor_h, [num_anchors, anchor_step]) + anchor_h = paddle.index_select(anchor_h, index=paddle.to_tensor( + np.array([1]).astype('int32')), axis=1) + + anchor_w = paddle.tile(anchor_w, [batch, 1]) + anchor_w = paddle.tile(anchor_w, [1, 1, h * w]) + anchor_w = paddle.reshape(anchor_w, [batch * num_anchors * h * w]).cuda() + + anchor_h = paddle.tile(anchor_h, [batch, 1]) + anchor_h = paddle.tile(anchor_h, [1, 1, h * w]) + anchor_h = paddle.reshape(anchor_h, [batch * num_anchors * h * w]).cuda() + + ws = paddle.exp(output[2]) * anchor_w + hs = paddle.exp(output[3]) * anchor_h + + det_confs = sigmoid(output[4]) + + cls_confs = paddle.to_tensor(output[5:5 + num_classes], stop_gradient=True) + cls_confs = paddle.transpose(cls_confs, [1, 0]) + s = nn.Softmax() + cls_confs = paddle.to_tensor(s(cls_confs)) + + cls_max_confs = paddle.max(cls_confs, axis=1) + cls_max_ids = paddle.argmax(cls_confs, axis=1) + + cls_max_confs = paddle.reshape(cls_max_confs, [-1]) + cls_max_ids = paddle.reshape(cls_max_ids, [-1]) + + sz_hw = h * w + sz_hwa = sz_hw * num_anchors + + det_confs = convert2cpu(det_confs) + cls_max_confs = convert2cpu(cls_max_confs) + 
cls_max_ids = convert2cpu_long(cls_max_ids) + xs = convert2cpu(xs) + ys = convert2cpu(ys) + ws = convert2cpu(ws) + hs = convert2cpu(hs) + if validation: + cls_confs = convert2cpu(cls_confs.reshape([-1, num_classes])) + for b in range(batch): + boxes = [] + for cy in range(h): + for cx in range(w): + for i in range(num_anchors): + ind = b * sz_hwa + i * sz_hw + cy * w + cx + det_conf = det_confs[ind] + if only_objectness: + conf = det_confs[ind] + else: + conf = det_confs[ind] * cls_max_confs[ind] + + if conf > conf_thresh: + bcx = xs[ind] + bcy = ys[ind] + bw = ws[ind] + bh = hs[ind] + cls_max_conf = cls_max_confs[ind] + cls_max_id = cls_max_ids[ind] + box = [bcx / w, bcy / h, bw / w, bh / h, + det_conf, cls_max_conf, cls_max_id] + if (not only_objectness) and validation: + for c in range(num_classes): + tmp_conf = cls_confs[ind][c] + if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh: + box.append(tmp_conf) + box.append(c) + boxes.append(box) + all_boxes.append(boxes) + return all_boxes + + +def bbox_iou(box1, box2, x1y1x2y2=True): + if x1y1x2y2: + mx = min(box1[0], box2[0]) + Mx = max(box1[2], box2[2]) + my = min(box1[1], box2[1]) + My = max(box1[3], box2[3]) + w1 = box1[2] - box1[0] + h1 = box1[3] - box1[1] + w2 = box2[2] - box2[0] + h2 = box2[3] - box2[1] + else: + mx = min(float(box1[0] - box1[2] / 2.0), + float(box2[0] - box2[2] / 2.0)) + Mx = max(float(box1[0] + box1[2] / 2.0), + float(box2[0] + box2[2] / 2.0)) + my = min(float(box1[1] - box1[3] / 2.0), + float(box2[1] - box2[3] / 2.0)) + My = max(float(box1[1] + box1[3] / 2.0), + float(box2[1] + box2[3] / 2.0)) + w1 = box1[2] + h1 = box1[3] + w2 = box2[2] + h2 = box2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + carea = 0 + if cw <= 0 or ch <= 0: + return paddle.to_tensor(0.0) + + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + uarea = area1 + area2 - carea + return carea / uarea + + +def bbox_ious(boxes1, boxes2, x1y1x2y2=True): + if x1y1x2y2: + mx = paddle.min(boxes1[0], boxes2[0]) + Mx = paddle.max(boxes1[2], boxes2[2]) + my = paddle.min(boxes1[1], boxes2[1]) + My = paddle.max(boxes1[3], boxes2[3]) + w1 = boxes1[2] - boxes1[0] + h1 = boxes1[3] - boxes1[1] + w2 = boxes2[2] - boxes2[0] + h2 = boxes2[3] - boxes2[1] + else: + mx = paddle.min(paddle.stack( + [boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0], axis=0), axis=0) + Mx = paddle.max(paddle.stack( + [boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0], axis=0), axis=0) + my = paddle.min(paddle.stack( + [boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0], axis=0), axis=0) + My = paddle.max(paddle.stack( + [boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0], axis=0), axis=0) + w1 = boxes1[2] + h1 = boxes1[3] + w2 = boxes2[2] + h2 = boxes2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + mask = paddle.cast(cw <= 0, dtype="int32") + \ + paddle.cast(ch <= 0, dtype="int32") > 0 + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + carea[mask] = 0 + uarea = area1 + area2 - carea + return carea / uarea + + +# this function works for building the groud truth +def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, + sil_thresh): + # nH, nW here are number of grids in y and x directions (7, 7 here) + nB = target.shape[0] # batch size + nA = num_anchors # 5 for our case + nC = num_classes + anchor_step = len(anchors) // num_anchors + conf_mask = paddle.ones([nB, nA, nH, nW]) * noobject_scale + coord_mask = paddle.zeros([nB, nA, 
nH, nW]) + cls_mask = paddle.zeros([nB, nA, nH, nW]) + tx = paddle.zeros([nB, nA, nH, nW]) + ty = paddle.zeros([nB, nA, nH, nW]) + tw = paddle.zeros([nB, nA, nH, nW]) + th = paddle.zeros([nB, nA, nH, nW]) + tconf = paddle.zeros([nB, nA, nH, nW]) + tcls = paddle.zeros([nB, nA, nH, nW]) + + # for each grid there are nA anchors + # nAnchors is the number of anchor for one image + nAnchors = nA * nH * nW + nPixels = nH * nW + # for each image + for b in xrange(nB): + # get all anchor boxes in one image + # (4 * nAnchors) + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + # initialize iou score for each anchor + cur_ious = paddle.zeros([nAnchors]) + for t in xrange(50): + # for each anchor 4 coordinate parameters, already in the coordinate system for the whole image + # this loop is for anchors in each image + # for each anchor 5 parameters are available (class, x, y, w, h) + if target[b][t * 5 + 1] == 0: + break + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + # groud truth boxes + cur_gt_boxes = paddle.tile(paddle.to_tensor( + [gx, gy, gw, gh], dtype='float32').t(), [nAnchors, 1]).t() + # bbox_ious is the iou value between orediction and groud truth + cur_ious = paddle.max( + paddle.stack([cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)], axis=0), axis=0) + # if iou > a given threshold, it is seen as it includes an object + # conf_mask[b][cur_ious>sil_thresh] = 0 + conf_mask_t = paddle.reshape(conf_mask, [nB, -1]) + conf_mask_t[b, cur_ious > sil_thresh] = 0 + conf_mask_tt = paddle.reshape(conf_mask_t[b], [nA, nH, nW]) + conf_mask[b] = conf_mask_tt + + # number of ground truth + nGT = 0 + nCorrect = 0 + for b in xrange(nB): + # anchors for one batch (at least batch size, and for some specific classes, there might exist more than one anchor) + for t in xrange(50): + if target[b][t * 5 + 1] == 0: + break + nGT = nGT + 1 + best_iou = 0.0 + best_n = -1 + min_dist = 10000 + # the values saved in target is ratios + # times by the width and height of the output feature maps nW and nH + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gi = int(gx) + gj = int(gy) + + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + gt_box = [0, 0, gw, gh] + for n in xrange(nA): + # get anchor parameters (2 values) + aw = anchors[anchor_step * n] + ah = anchors[anchor_step * n + 1] + anchor_box = [0, 0, aw, ah] + # only consider the size (width and height) of the anchor box + iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) + # get the best anchor form with the highest iou + if iou > best_iou: + best_iou = iou + best_n = n + + # then we determine the parameters for an anchor (4 values together) + gt_box = [gx, gy, gw, gh] + # find corresponding prediction box + pred_box = pred_boxes[b * nAnchors + + best_n * nPixels + gj * nW + gi] + + # only consider the best anchor box, for each image + coord_mask[b, best_n, gj, gi] = 1 + cls_mask[b, best_n, gj, gi] = 1 + + # in this cell of the output feature map, there exists an object + conf_mask[b, best_n, gj, gi] = object_scale + tx[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5 + 1] * nW - gi, dtype='float32') + ty[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5 + 2] * nH - gj, dtype='float32') + tw[b, best_n, gj, gi] = math.log( + gw / anchors[anchor_step * best_n]) + th[b, best_n, gj, gi] = math.log( + gh / anchors[anchor_step * best_n + 1]) + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou + # 
confidence equals to iou of the corresponding anchor + tconf[b, best_n, gj, gi] = paddle.cast(iou, dtype='float32') + tcls[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5], dtype='float32') + # if ious larger than 0.5, we justify it as a correct prediction + if iou > 0.5: + nCorrect = nCorrect + 1 + # true values are returned + return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls diff --git a/docs/src/paddlevideo/modeling/framework/multimodal/__init__.py b/docs/src/paddlevideo/modeling/framework/multimodal/__init__.py new file mode 100644 index 000000000..e1efec3d7 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/multimodal/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .base import BaseMultimodal +from .actbert import ActBert + +__all__ = ['BaseMultimodal', 'ActBert'] diff --git a/docs/src/paddlevideo/modeling/framework/multimodal/actbert.py b/docs/src/paddlevideo/modeling/framework/multimodal/actbert.py new file mode 100644 index 000000000..4f2c074ff --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/multimodal/actbert.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import MULTIMODAL +from .base import BaseMultimodal +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@MULTIMODAL.register() +class ActBert(BaseMultimodal): + """ActBert model framework.""" + def forward_net(self, text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask): + pred = self.backbone(text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask) + return pred + + def train_step(self, data_batch): + """For ActBert Dataset. Define how the model is going to train, from input to output. + """ + text_ids, action_feat, image_feat, image_loc, \ + token_type_ids, text_mask, image_mask, action_mask, \ + text_labels, action_label, next_sentence_label, image_label, image_target = data_batch + loss_metrics = dict() + pred = self.backbone(text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask) + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = pred + total_loss = self.loss(prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \ + text_labels, image_label, image_target, action_label, next_sentence_label) + loss_metrics['loss'] = paddle.mean(total_loss) + return loss_metrics + + def val_step(self, data_batch): + """For ActBert Dataset. 
Define how the model is going to val, from input to output. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """For MSR-VTT Dataset. Define how the model is going to test, from input to output.""" + text_ids, action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask = data_batch[: + -1] + action_feat = action_feat.squeeze(0) + image_feat = image_feat.squeeze(0) + image_loc = image_loc.squeeze(0) + image_mask = image_mask.squeeze(0) + action_mask = action_mask.squeeze(0) + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.forward_net(text_ids, \ + action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask) + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score + + def infer_step(self, data_batch): + pass diff --git a/docs/src/paddlevideo/modeling/framework/multimodal/base.py b/docs/src/paddlevideo/modeling/framework/multimodal/base.py new file mode 100644 index 000000000..bc57f9765 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/multimodal/base.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseMultimodal(nn.Layer): + """Base class for Multimodal. + + All Multimodal model should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head to process feature. + loss(dict): Loss function. + + """ + def __init__(self, backbone=None, head=None, loss=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + if loss is not None: + self.loss = builder.build_loss(loss) + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. 
+ """ + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/partitioners/__init__.py b/docs/src/paddlevideo/modeling/framework/partitioners/__init__.py new file mode 100644 index 000000000..0c6de50a3 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/partitioners/__init__.py @@ -0,0 +1,18 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from .base import BasePartitioner +from .transnetv2_partitioner import TransNetV2Partitioner + +__all__ = ['BasePartitioner', 'TransNetV2Partitioner'] diff --git a/docs/src/paddlevideo/modeling/framework/partitioners/base.py b/docs/src/paddlevideo/modeling/framework/partitioners/base.py new file mode 100644 index 000000000..a7c925975 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/partitioners/base.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle.nn as nn +from ... import builder + + +class BasePartitioner(nn.Layer): + """Base class for Partition. + All partitioner should subclass it. + All subclass should overwrite: + - Methods:``train_step``, define your train step. + - Methods:``valid_step``, define your valid step, always the same as train_step. + - Methods:``test_step``, define your test step. + """ + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + def init_weights(self): + """Initialize the model network weights. """ + if getattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + pass + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating setp. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Tets setp. to get acc in test data. input_data_batch -> output + """ + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py b/docs/src/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py new file mode 100644 index 000000000..c3295068c --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import PARTITIONERS +from .base import BasePartitioner + +import paddle + + +@PARTITIONERS.register() +class TransNetV2Partitioner(BasePartitioner): + """TransNetV2 Partitioner framework + """ + def forward_net(self, imgs): + one_hot_pred = self.backbone(imgs) + return one_hot_pred + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + frame_sequence = data_batch[0] + one_hot_gt, many_hot_gt = data_batch[1:] + one_hot_pred = self.forward_net(frame_sequence) + dict_ = {} + if isinstance(one_hot_pred, tuple): + one_hot_pred, dict_ = one_hot_pred + many_hot_pred = dict_.get("many_hot", None) + comb_reg_loss = dict_.get("comb_reg_loss", None) + loss_metrics = self.head.loss(one_hot_pred, one_hot_gt, + many_hot_pred, many_hot_gt, + reg_losses={"comb_reg": comb_reg_loss}) + return loss_metrics + + def val_step(self, data_batch): + frame_sequence = data_batch[0] + one_hot_gt, many_hot_gt = data_batch[1:] + one_hot_pred = self.forward_net(frame_sequence) + dict_ = {} + if isinstance(one_hot_pred, tuple): + one_hot_pred, dict_ = one_hot_pred + many_hot_pred = dict_.get("many_hot", None) + comb_reg_loss = dict_.get("comb_reg_loss", None) + loss_metrics = self.head.loss(one_hot_pred, one_hot_gt, + many_hot_pred, many_hot_gt, + reg_losses={"comb_reg": comb_reg_loss}) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + frame_sequence = data_batch[0] + one_hot_pred = self.forward_net(frame_sequence) + return one_hot_pred + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + frame_sequence = data_batch[0] + one_hot_pred = self.forward_net(frame_sequence) + return one_hot_pred diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/__init__.py b/docs/src/paddlevideo/modeling/framework/recognizers/__init__.py new file mode 100644 index 000000000..764b37f94 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .base import BaseRecognizer +from .recognizer1d import Recognizer1D, RecognizerAction +from .recognizer2d import Recognizer2D +from .recognizer3d import Recognizer3D +from .recognizer_transformer import RecognizerTransformer +from .recognizer_gcn import RecognizerGCN +from .recognizerMRI import RecognizerMRI +from .recognizer3dMRI import Recognizer3DMRI +from .recognizer_transformer_MRI import RecognizerTransformer_MRI +from .recognizer_movinet_frame import MoViNetRecognizerFrame +from .recognizerDistillation import RecognizerDistillation + +__all__ = [ + 'BaseRecognizer', 'Recognizer1D', 'Recognizer2D', 'Recognizer3D', + 'RecognizerTransformer', 'RecognizerGCN', 'RecognizerMRI', + 'Recognizer3DMRI', 'RecognizerTransformer_MRI', 'MoViNetRecognizerFrame', + 'RecognizerAction', 'RecognizerDistillation' +] diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/base.py b/docs/src/paddlevideo/modeling/framework/recognizers/base.py new file mode 100644 index 000000000..bf31caf04 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/base.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from ... 
import builder +import paddle.nn as nn + + +class BaseRecognizer(nn.Layer): + """Base class for recognizers. + + All recognizers should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + + """ + def __init__(self, backbone=None, head=None, runtime_cfg=None): + + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + # Settings when the model is running, + # such as 'avg_type' + self.runtime_cfg = runtime_cfg + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py new file mode 100644 index 000000000..2c7fa94e3 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer1d.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + + +@RECOGNIZERS.register() +class Recognizer1D(BaseRecognizer): + """1D recognizer model framework.""" + def forward_net(self, imgs): + """Define how the model is going to train, from input to output. + """ + lstm_logit, lstm_output = self.head(imgs) + return lstm_logit, lstm_output + + def train_step(self, data_batch): + """Training step. 
+ """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + + # call forward + lstm_logit, lstm_output = self.forward_net(imgs) + loss = self.head.loss(lstm_logit, labels) + hit_at_one, perr, gap = self.head.metric(lstm_output, labels) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['hit_at_one'] = hit_at_one + loss_metrics['perr'] = perr + loss_metrics['gap'] = gap + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Testing setp. + """ + return self.train_step(data_batch) + + def infer_step(self, data_batch): + """Infering setp. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + # call forward + lstm_logit, _ = self.forward_net(imgs) + return lstm_logit + + +@RECOGNIZERS.register() +class RecognizerAction(BaseRecognizer): + """1D recognizer model framework.""" + def forward_net(self, imgs): + """Define how the model is going to train, from input to output. + """ + lstm_logit, lstm_output = self.head(imgs) + return lstm_logit, lstm_output + + def train_step(self, data_batch): + """Training step. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels, labels_iou = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + + # call forward + output_logit, output_iou = self.forward_net(imgs) + loss = self.head.loss(output_logit, output_iou, labels, labels_iou) + top1, top5 = self.head.metric(output_logit, labels) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Testing setp. + """ + return self.train_step(data_batch) + + def infer_step(self, data_batch): + """Infering setp. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + # call forward + output_logit, output_iou = self.forward_net(imgs) + return output_logit, output_iou diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer2d.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer2d.py new file mode 100644 index 000000000..d8aa6619f --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer2d.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer2D(BaseRecognizer): + """2D recognizer model framework.""" + def forward_net(self, imgs): + # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method. + num_segs = imgs.shape[ + 1] # imgs.shape=[N,T,C,H,W], for most commonly case + imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:])) + + if self.backbone is not None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head is not None: + cls_score = self.head(feature, num_segs) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3d.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3d.py new file mode 100644 index 000000000..f0ecff1f6 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3d.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer3D(BaseRecognizer): + """3D Recognizer model framework. + """ + + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + feature = self.backbone(imgs) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + labels = data_batch[1:] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. 
+ """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + labels = data_batch[1:] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + # call forward + imgs = imgs.reshape([-1] + imgs.shape[2:]) + cls_score = self.forward_net(imgs) + else: + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py new file mode 100644 index 000000000..9298491c0 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger +import paddle + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer3DMRI(BaseRecognizer): + """3D Recognizer model framework. + """ + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + + imgs[0] = paddle.cast(imgs[0], "float32") + imgs[1] = paddle.cast(imgs[1], "float32") + imgs[0] = imgs[0].unsqueeze(1) + imgs[1] = imgs[1].unsqueeze(1) + + feature = self.backbone(imgs) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score + + def infer_step(self, data_batch): + """Infer step. 
+ """ + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py new file mode 100644 index 000000000..6f48a08b8 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py @@ -0,0 +1,231 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +import paddle +import paddle.nn as nn + +from ...registry import RECOGNIZERS +from ... import builder +from paddlevideo.utils import get_logger, get_dist_info + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerDistillation(nn.Layer): + """recognizer Distillation framework.""" + def __init__(self, + freeze_params_list=None, + models=None, + loss=None, + **kargs): + """ + Args: + freeze_params_list: list, set each model is trainable or not + models: config of distillaciton model. + loss: config of loss list + """ + super().__init__() + self.model_list = [] + self.model_name_list = [] + self.loss_cfgs = loss + + if freeze_params_list is None: + freeze_params_list = [False] * len(models) + assert len(freeze_params_list) == len(models) + + # build Teacher and Student model + for idx, model_config in enumerate(models): + assert len(model_config) == 1 + key = list(model_config.keys())[0] #Teacher or Student + model_config = model_config[key] + model_name = model_config['backbone']['name'] + + backbone, head = None, None + if model_config.get('backbone'): + backbone = builder.build_backbone(model_config['backbone']) + if hasattr(backbone, 'init_weights'): + backbone.init_weights() + if model_config.get('head'): + head = builder.build_head(model_config['head']) + if hasattr(head, 'init_weights'): + head.init_weights() + + model = nn.Sequential(backbone, head) + logger.info('build distillation {} model done'.format(key)) + # for add all parameters in nn.Layer class + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append({model_name: key}) + + # set model trainable or not + if freeze_params_list[idx]: + for param in model.parameters(): + param.trainable = False + + # build loss: support for loss list + self.loss_func_list = [] + mode_keys = list(loss.keys()) + for mode in mode_keys: + loss_cfgs = loss[mode] + for loss_cfg in loss_cfgs: + loss_func_dict = {} + model_name_pairs = loss_cfg.pop('model_name_pairs') + loss_func = builder.build_loss(loss_cfg) + loss_func_dict['mode'] = mode + loss_func_dict['loss_func'] = loss_func + loss_func_dict['model_name_pairs'] = model_name_pairs + self.loss_func_list.append(loss_func_dict) + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + def get_loss(self, output, labels, mode): + """ + Args: + output: dict, output name and its value + labels: label of data + mode: str, 'Train' or 'Val' + """ + output['GroundTruth'] = labels + loss_list = [] + + for loss_func_dict in self.loss_func_list: + if mode == loss_func_dict['mode']: + model_name_pairs = loss_func_dict['model_name_pairs'] + loss_func = loss_func_dict['loss_func'] + loss_val = loss_func(output[model_name_pairs[0]], + output[model_name_pairs[1]]) + loss_list.append(loss_val) + + total_loss = paddle.add_n(loss_list) + return total_loss + + def get_acc(self, scores, labels, mode='Train'): + def _get_acc(score, label, mode='Train'): + top1 = paddle.metric.accuracy(input=score, label=label, k=1) + top5 = paddle.metric.accuracy(input=score, label=label, k=5) + _, world_size = get_dist_info() + # Deal with multi cards validate + if world_size > 1 and mode == 'Val': #reduce sum when valid + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / world_size + top5 = paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) / world_size + return top1, top5 + + if len(labels) == 1: + label = labels[0] + return _get_acc(scores, label) + # Deal with VideoMix + elif len(labels) == 3: + label_a, label_b, lam = labels + top1a, top5a = _get_acc(scores, label_a, mode) + top1b, top5b = _get_acc(scores, label_b, mode) + top1 = lam * top1a + (1 - lam) * top1b + top5 = lam * top5a + (1 - lam) * top5b + return top1, top5 + + def forward_model(self, imgs, model_name, model): + if model_name in ['PPTSM_v2', 'ResNetTweaksTSM']: + # [N,T,C,H,W] -> [N*T,C,H,W] + imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:])) + + return model(imgs) + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
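+        Both Teacher and Student run a forward pass on the same batch; the
+        losses registered under the 'Train' mode (each defined between a
+        model_name_pair such as ['Student', 'Teacher'] or
+        ['Student', 'GroundTruth']) are summed into a single loss, while
+        top-1/top-5 accuracy is reported from the Student output only.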
+ """ + out = {} + loss_metrics = {} + imgs = data_batch[0] + labels = data_batch[1:] + + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + out[model_type] = self.forward_model(imgs, model_name, model) + + # out_student, out_teacher + loss = self.get_loss(out, labels, 'Train') + loss_metrics['loss'] = loss + # calculate acc with student output + top1, top5 = self.get_acc(out['Student'], labels) + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def val_step(self, data_batch): + out = {} + loss_metrics = {} + imgs = data_batch[0] + labels = data_batch[1:] + + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + out[model_type] = self.forward_model(imgs, model_name, model) + + # Loss of student with gt: out_student, label + loss = self.get_loss(out, labels, 'Val') + loss_metrics['loss'] = loss + + top1, top5 = self.get_acc(out['Student'], labels, 'Val') + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + + # Use Student to test + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + if model_type == "Student": + out = self.forward_model(imgs, model_name, model) + + return out + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + + # Use Student to infer + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + if model_type == "Student": + out = self.forward_model(imgs, model_name, model) + + return out diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizerMRI.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizerMRI.py new file mode 100644 index 000000000..4b1713e61 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizerMRI.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerMRI(BaseRecognizer): + """2D recognizer model framework.""" + def forward_net(self, imgs): + # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method. 
+ num_segs = imgs.shape[ + 1] # imgs.shape=[N,T,C,H,W], for most commonly case + imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:])) + imgs = paddle.cast(imgs, "float32") ############# + imgs = imgs.unsqueeze(1) + + if self.backbone != None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head != None: + cls_score = self.head(feature, num_segs) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py new file mode 100644 index 000000000..281c5ac9e --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerGCN(BaseRecognizer): + """GCN Recognizer model framework. + """ + + def __init__(self, + backbone=None, + head=None, + runtime_cfg=None, + if_top5=True): + """ + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + is_top5 (bool): Whether to display top-5 accuracy during training/validation steps. + """ + super(RecognizerGCN, self).__init__(backbone, head, runtime_cfg) + self.if_top5 = if_top5 + + def forward_net(self, data): + """Define how the model is going to run, from input to output. + """ + feature = self.backbone(data) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + data = data_batch[0] + label = data_batch[1:] + + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss(cls_score, label, if_top5=self.if_top5) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. 
+ """ + data = data_batch[0] + label = data_batch[1:] + + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss(cls_score, + label, + valid_mode=True, + if_top5=self.if_top5) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + data = data_batch[0] + + # call forward + cls_score = self.forward_net(data) + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + data = data_batch[0] + + # call forward + cls_score = self.forward_net(data) + return cls_score diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py new file mode 100644 index 000000000..1ad2e149a --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle + +from paddlevideo.utils import get_logger +from .base import BaseRecognizer +from ...registry import RECOGNIZERS + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class MoViNetRecognizerFrame(BaseRecognizer): + + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + self.backbone.clean_activation_buffers() + outputs = self.backbone(imgs) + cls_score = self.head(outputs) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + imgs = data_batch[0] + labels = data_batch[1] #.astype("int64") + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss_func(cls_score, labels) + top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1) + top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5) + output = {'loss': loss_metrics, 'top1': top1, 'top5': top5} + return output + + def val_step(self, data_batch): + """Validating setp. + """ + imgs = data_batch[0] + labels = data_batch[1] #.astype("int64") + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss_func(cls_score, labels) + top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1) + top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5) + output = {'loss': loss_metrics, 'top1': top1, 'top5': top5} + return output + + def test_step(self, data_batch): + """Test step. + """ + imgs = data_batch[0] + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + return cls_score + + def infer_step(self, data_batch): + """Infer step. 
+ """ + imgs = data_batch[0] + # call forward + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + cls_score = self.forward_net(data) + + return cls_score diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py new file mode 100644 index 000000000..4144edacf --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py @@ -0,0 +1,98 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerTransformer(BaseRecognizer): + """Transformer's recognizer model framework.""" + def forward_net(self, imgs): + # imgs.shape=[N,C,T,H,W], for transformer case + if self.backbone is not None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head is not None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) * + self.runtime_cfg.test.num_seg] + cls_score.append(self.forward_net(view)) + cls_score = self._average_view(cls_score, + self.runtime_cfg.test.avg_type) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) * + self.runtime_cfg.test.num_seg] + cls_score.append(self.forward_net(view)) + cls_score = self._average_view(cls_score, + self.runtime_cfg.test.avg_type) + return cls_score + + def _average_view(self, cls_score, avg_type='score'): + """Combine the predicted results of different views + + Args: + cls_score (list): results of multiple views + avg_type (str, optional): Average calculation method. Defaults to 'score'. 
+ """ + assert avg_type in ['score', 'prob'], \ + f"Currently only the average of 'score' or 'prob' is supported, but got {avg_type}" + if avg_type == 'score': + return paddle.add_n(cls_score) / len(cls_score) + elif avg_type == 'prob': + return paddle.add_n( + [F.softmax(score, axis=-1) + for score in cls_score]) / len(cls_score) + else: + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py new file mode 100644 index 000000000..e8696b4da --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py @@ -0,0 +1,104 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerTransformer_MRI(BaseRecognizer): + """Transformer's recognizer model framework.""" + def forward_net(self, imgs): + # imgs.shape=[N,C,T,H,W], for transformer case + + imgs = paddle.cast(imgs, "float32") ############# + imgs = imgs.unsqueeze(1) + + if self.backbone != None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head != None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.backbone.seg_num + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.backbone.seg_num:(i + 1) * + self.backbone.seg_num] + cls_score.append(self.forward_net(view)) + cls_score = self.average_view(cls_score) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.backbone.seg_num + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.backbone.seg_num:(i + 1) * + self.backbone.seg_num] + cls_score.append(self.forward_net(view)) + cls_score = self.average_view(cls_score) + return cls_score + + def average_view(self, cls_score, average_type='score'): + """Combine the scores of different views + + Args: + cls_score (list): Scores of multiple views + average_type (str, optional): Average calculation method. Defaults to 'score'. + """ + assert average_type in ['score', 'prob'], \ + f"Currently only the average of 'score' or 'prob' is supported, but got {average_type}" + if average_type == 'score': + return paddle.add_n(cls_score) / len(cls_score) + elif average_type == 'avg': + return paddle.add_n([F.softmax(score) + for score in cls_score]) / len(cls_score) + else: + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/segment/__init__.py b/docs/src/paddlevideo/modeling/framework/segment/__init__.py new file mode 100644 index 000000000..28a1d2e15 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segment/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .base import BaseSegment +from .cfbi import CFBI + +__all__ = ['BaseSegment', 'CFBI'] diff --git a/docs/src/paddlevideo/modeling/framework/segment/base.py b/docs/src/paddlevideo/modeling/framework/segment/base.py new file mode 100644 index 000000000..0c5cb07f7 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segment/base.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseSegment(nn.Layer): + """Base class for semi-Video Object Segmentation. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head to process feature. + loss(dict): Loss function. + """ + def __init__(self, backbone=None, head=None, loss=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + if loss is not None: + self.loss = builder.build_loss(loss) + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/segment/cfbi.py b/docs/src/paddlevideo/modeling/framework/segment/cfbi.py new file mode 100644 index 000000000..dcdc512f0 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segment/cfbi.py @@ -0,0 +1,286 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
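+# CFBI (Collaborative video object segmentation by Foreground-Background
+# Integration) evaluation-time framework: each query frame's pixel embeddings
+# are matched globally against the reference frame(s) and locally against the
+# previous frame, for both the foreground and the background of every object,
+# and the resulting matching maps plus an instance-level attention head are
+# fused by the collaborative ensembler head. Only the evaluation path is
+# implemented below; the training branches raise NotImplementedError.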
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from .utils import foreground2background, global_matching_for_eval, local_matching, calculate_attention_head_for_eval +from ...registry import SEGMENT +from .base import BaseSegment +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@SEGMENT.register() +class CFBI(BaseSegment): + """CFBI model framework.""" + def __init__(self, backbone=None, head=None, loss=None): + super().__init__(backbone, head, loss) + x1 = paddle.zeros([3, 1, 1, 1]) + self.bg_bias = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.fg_bias = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.epsilon = 1e-05 + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output. + """ + self.test_mode = True + ref_embeddings, ref_masks, prev_embedding, prev_mask, current_frame, pred_size, gt_ids = data_batch + current_frame_embedding_4x, current_frame_embedding_8x, current_frame_embedding_16x, \ + current_low_level = self.backbone(current_frame) + + current_frame_embedding = [ + current_frame_embedding_4x, current_frame_embedding_8x, + current_frame_embedding_16x + ] + + if prev_embedding is None: + return None, current_frame_embedding + else: + bs, c, h, w = current_frame_embedding_4x.shape + + tmp_dic, _ = self.before_seghead_process( + ref_embeddings, + prev_embedding, + current_frame_embedding, + ref_masks, + prev_mask, + gt_ids, + current_low_level=current_low_level, + ) + all_pred = [] + for i in range(bs): + pred = tmp_dic[i] + + pred = F.interpolate(pred, + size=[pred_size[0], pred_size[1]], + mode='bilinear', + align_corners=True) + all_pred.append(pred) + all_pred = paddle.concat(all_pred, axis=0) + all_pred = F.softmax(all_pred, axis=1) + return all_pred, current_frame_embedding + + def before_seghead_process(self, + ref_frame_embeddings=None, + previous_frame_embeddings=None, + current_frame_embeddings=None, + ref_frame_labels=None, + previous_frame_mask=None, + gt_ids=None, + current_low_level=None): + """ process befor segmentation head""" + TEST_GLOBAL_MATCHING_CHUNK = [4, 1, 1] + TEST_GLOBAL_ATROUS_RATE = [2, 1, 1] + TRAIN_LOCAL_ATROUS_RATE = [2, 1, 1] + TEST_LOCAL_ATROUS_RATE = [2, 1, 1] + MODEL_FLOAT16_MATCHING = False + TEST_GLOBAL_MATCHING_MIN_PIXEL = 100 + MODEL_MULTI_LOCAL_DISTANCE = [[4, 8, 12, 16, 20, 24], + [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]] + TRAIN_LOCAL_PARALLEL = True + TEST_LOCAL_PARALLEL = True + MODEL_MATCHING_BACKGROUND = True + MODEL_SEMANTIC_MATCHING_DIM = [32, 64, 128] + + dic_tmp = [] + boards = {} + scale_ref_frame_labels = [] + scale_previous_frame_labels = [] + for current_frame_embedding in current_frame_embeddings: + bs, c, h, w = current_frame_embedding.shape + if not self.test_mode: + raise NotImplementedError + else: + ref_frame_embeddings = list(zip(*ref_frame_embeddings)) + all_scale_ref_frame_label = [] + for ref_frame_label in ref_frame_labels: + scale_ref_frame_label = paddle.cast(F.interpolate( + paddle.cast(ref_frame_label, dtype="float32"), + size=(h, w), + mode='nearest'), + dtype="int32") + all_scale_ref_frame_label.append(scale_ref_frame_label) + scale_ref_frame_labels.append(all_scale_ref_frame_label) + scale_previous_frame_label = paddle.cast(F.interpolate( + paddle.cast(previous_frame_mask, dtype="float32"), + size=(h, w), + mode='nearest'), + dtype="int32") + 
scale_previous_frame_labels.append(scale_previous_frame_label) + for n in range(bs): + ref_obj_ids = paddle.reshape( + paddle.cast(paddle.arange(0, + np.array(gt_ids)[n] + 1), + dtype="int32"), [-1, 1, 1, 1]) + obj_num = ref_obj_ids.shape[0] + low_level_feat = paddle.unsqueeze(current_low_level[n], axis=0) + all_CE_input = [] + all_attention_head = [] + for scale_idx, current_frame_embedding, ref_frame_embedding, previous_frame_embedding, \ + scale_ref_frame_label, scale_previous_frame_label in zip(range(3), \ + current_frame_embeddings, ref_frame_embeddings, previous_frame_embeddings, \ + scale_ref_frame_labels, scale_previous_frame_labels): + #Prepare + seq_current_frame_embedding = current_frame_embedding[n] + seq_prev_frame_embedding = previous_frame_embedding[n] + seq_previous_frame_label = paddle.cast( + (paddle.cast(scale_previous_frame_label[n], dtype="int32") + == ref_obj_ids), + dtype="float32") + if np.array(gt_ids)[n] > 0: + dis_bias = paddle.concat([ + paddle.unsqueeze(self.bg_bias[scale_idx], axis=0), + paddle.expand( + paddle.unsqueeze(self.fg_bias[scale_idx], axis=0), + [np.array(gt_ids)[n], -1, -1, -1]) + ], + axis=0) + else: + dis_bias = paddle.unsqueeze(self.bg_bias[scale_idx], axis=0) + #Global FG map + matching_dim = MODEL_SEMANTIC_MATCHING_DIM[scale_idx] + seq_current_frame_embedding_for_matching = paddle.transpose( + seq_current_frame_embedding[:matching_dim], [1, 2, 0]) + + if not self.test_mode: + raise NotImplementedError + else: + all_scale_ref_frame_label = scale_ref_frame_label + all_ref_frame_embedding = ref_frame_embedding + all_reference_embeddings = [] + all_reference_labels = [] + seq_ref_frame_labels = [] + count = 0 + for idx in range(len(all_scale_ref_frame_label)): + + ref_frame_embedding = all_ref_frame_embedding[idx] + scale_ref_frame_label = all_scale_ref_frame_label[idx] + + seq_ref_frame_embedding = ref_frame_embedding[n] + seq_ref_frame_embedding = paddle.transpose( + seq_ref_frame_embedding, [1, 2, 0]) + seq_ref_frame_label = paddle.cast( + (paddle.cast(scale_ref_frame_label[n], + dtype="int32") == ref_obj_ids), + dtype="float32") + seq_ref_frame_labels.append(seq_ref_frame_label) + seq_ref_frame_label = paddle.transpose( + paddle.squeeze(seq_ref_frame_label, axis=1), + [1, 2, 0]) + all_reference_embeddings.append( + seq_ref_frame_embedding[:, :, :matching_dim]) + all_reference_labels.append(seq_ref_frame_label) + global_matching_fg = global_matching_for_eval( + all_reference_embeddings=all_reference_embeddings, + query_embeddings= + seq_current_frame_embedding_for_matching, + all_reference_labels=all_reference_labels, + n_chunks=TEST_GLOBAL_MATCHING_CHUNK[scale_idx], + dis_bias=dis_bias, + atrous_rate=TEST_GLOBAL_ATROUS_RATE[scale_idx], + use_float16=MODEL_FLOAT16_MATCHING, + atrous_obj_pixel_num=TEST_GLOBAL_MATCHING_MIN_PIXEL) + + # Local FG map + seq_prev_frame_embedding_for_matching = paddle.transpose( + seq_prev_frame_embedding[:matching_dim], [1, 2, 0]) + seq_previous_frame_label_for_matching = paddle.transpose( + paddle.squeeze(seq_previous_frame_label, axis=1), [1, 2, 0]) + local_matching_fg = local_matching( + prev_frame_embedding=seq_prev_frame_embedding_for_matching, + query_embedding=seq_current_frame_embedding_for_matching, + prev_frame_labels=seq_previous_frame_label_for_matching, + multi_local_distance=MODEL_MULTI_LOCAL_DISTANCE[scale_idx], + dis_bias=dis_bias, + atrous_rate=TRAIN_LOCAL_ATROUS_RATE[scale_idx] if + not self.test_mode else TEST_LOCAL_ATROUS_RATE[scale_idx], + use_float16=MODEL_FLOAT16_MATCHING, + 
allow_downsample=False, + allow_parallel=TRAIN_LOCAL_PARALLEL + if not self.test_mode else TEST_LOCAL_PARALLEL) + + #Aggregate Pixel-level Matching + to_cat_global_matching_fg = paddle.transpose( + paddle.squeeze(global_matching_fg, axis=0), [2, 3, 0, 1]) + to_cat_local_matching_fg = paddle.transpose( + paddle.squeeze(local_matching_fg, axis=0), [2, 3, 0, 1]) + all_to_cat = [ + to_cat_global_matching_fg, to_cat_local_matching_fg, + seq_previous_frame_label + ] + + #Global and Local BG map + if MODEL_MATCHING_BACKGROUND: + to_cat_global_matching_bg = foreground2background( + to_cat_global_matching_fg, + np.array(gt_ids)[n] + 1) + reshaped_prev_nn_feature_n = paddle.unsqueeze( + paddle.transpose(to_cat_local_matching_fg, + [0, 2, 3, 1]), + axis=1) + to_cat_local_matching_bg = foreground2background( + reshaped_prev_nn_feature_n, + np.array(gt_ids)[n] + 1) + to_cat_local_matching_bg = paddle.squeeze(paddle.transpose( + to_cat_local_matching_bg, [0, 4, 2, 3, 1]), + axis=-1) + all_to_cat += [ + to_cat_local_matching_bg, to_cat_global_matching_bg + ] + + to_cat_current_frame_embedding = paddle.expand( + paddle.unsqueeze(current_frame_embedding[n], axis=0), + [obj_num, -1, -1, -1]) + to_cat_prev_frame_embedding = paddle.expand( + paddle.unsqueeze(previous_frame_embedding[n], axis=0), + [obj_num, -1, -1, -1]) + to_cat_prev_frame_embedding_fg = to_cat_prev_frame_embedding * seq_previous_frame_label + to_cat_prev_frame_embedding_bg = to_cat_prev_frame_embedding * ( + 1 - seq_previous_frame_label) + all_to_cat += [ + to_cat_current_frame_embedding, + to_cat_prev_frame_embedding_fg, + to_cat_prev_frame_embedding_bg + ] + + CE_input = paddle.concat(all_to_cat, axis=1) + #Instance-level Attention + if not self.test_mode: + raise NotImplementedError + else: + attention_head = calculate_attention_head_for_eval( + all_ref_frame_embedding, + seq_ref_frame_labels, + paddle.expand( + paddle.unsqueeze(previous_frame_embedding[n], + axis=0), [obj_num, -1, -1, -1]), + seq_previous_frame_label, + epsilon=self.epsilon) + + all_CE_input.append(CE_input) + all_attention_head.append(attention_head) + + #Collaborative Ensembler + pred = self.head(all_CE_input, all_attention_head, low_level_feat) + dic_tmp.append(pred) + + return dic_tmp, boards diff --git a/docs/src/paddlevideo/modeling/framework/segment/utils.py b/docs/src/paddlevideo/modeling/framework/segment/utils.py new file mode 100644 index 000000000..1ec3be4d2 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segment/utils.py @@ -0,0 +1,754 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
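+"""Pixel-embedding matching utilities for the CFBI segment framework.
+
+Distances here are squared L2 distances between embedding vectors, built from
+the identity ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * x.y so that the whole
+pairwise matrix comes from one matmul. An illustrative sketch (not part of
+the module API):
+
+    import paddle
+    x, y = paddle.rand([5, 32]), paddle.rand([7, 32])  # 5 query / 7 ref pixels
+    x2 = paddle.sum(x * x, axis=1)
+    y2 = paddle.sum(y * y, axis=1)
+    d = (paddle.unsqueeze(x2, axis=1) + paddle.unsqueeze(y2, axis=0)
+         - 2. * paddle.matmul(x, y, transpose_y=True))  # shape [5, 7]
+
+Reference pixels whose label does not match an object are pushed away by
+adding WRONG_LABEL_PADDING_DISTANCE, and per-object nearest-neighbour
+distances are squashed to (-1, 1) via 2 * (sigmoid(dist + bias) - 0.5).
+"""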
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def foreground2background(dis, obj_num): + if obj_num == 1: + return dis + bg_dis = [] + for i in range(obj_num): + obj_back = [] + for j in range(obj_num): + if i == j: + continue + obj_back.append(paddle.unsqueeze(dis[j], axis=0)) + obj_back = paddle.concat(x=obj_back, axis=1) + obj_back = paddle.min(x=obj_back, axis=1, keepdim=True) + bg_dis.append(obj_back) + bg_dis = paddle.concat(x=bg_dis, axis=0) + return bg_dis + + +WRONG_LABEL_PADDING_DISTANCE = 5e4 + + +#GLOBAL_DIST_MAP +def _pairwise_distances(x, x2, y, y2): + """ + Computes pairwise squared l2 distances between tensors x and y. + Args: + x: [n, feature_dim]. + y: [m, feature_dim]. + Returns: + d: [n, m]. + """ + xs = x2 + ys = y2 + + xs = paddle.unsqueeze(xs, axis=1) + ys = paddle.unsqueeze(ys, axis=0) + d = xs + ys - 2. * paddle.matmul(x, y, transpose_y=True) + return d + + +def _flattened_pairwise_distances(reference_embeddings, ref_square, + query_embeddings, query_square): + """ + Calculates flattened tensor of pairwise distances between ref and query. + Args: + reference_embeddings: [..., embedding_dim], + the embedding vectors for the reference frame + query_embeddings: [..., embedding_dim], + the embedding vectors for the query frames. + Returns: + dists: [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim] + """ + dists = _pairwise_distances(query_embeddings, query_square, + reference_embeddings, ref_square) + return dists + + +def _nn_features_per_object_for_chunk(reference_embeddings, ref_square, + query_embeddings, query_square, + wrong_label_mask): + """Extracts features for each object using nearest neighbor attention. + Args: + reference_embeddings: [n_chunk, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [m_chunk, embedding_dim], + the embedding vectors for the query frames. + wrong_label_mask: [n_objects, n_chunk], + the mask for pixels not used for matching. + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [m_chunk, n_objects, n_chunk]. + """ + if reference_embeddings.dtype == "float16": + wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float16") + else: + wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float32") + + reference_embeddings_key = reference_embeddings + query_embeddings_key = query_embeddings + dists = _flattened_pairwise_distances(reference_embeddings_key, ref_square, + query_embeddings_key, query_square) + dists = (paddle.unsqueeze(dists, axis=1) + + paddle.unsqueeze(wrong_label_mask, axis=0) * + WRONG_LABEL_PADDING_DISTANCE) + features = paddle.min(dists, axis=2, keepdim=True) + return features + + +def _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat, + query_embeddings_flat, + reference_labels_flat, + n_chunks): + """Calculates the nearest neighbor features per object in chunks to save mem. + Uses chunking to bound the memory use. + Args: + reference_embeddings_flat: [n, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings_flat: [m, embedding_dim], + the embedding vectors for the query frames. + reference_labels_flat: [n, n_objects], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + Returns: + nn_features: [m, n_objects, n]. 
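+        Note: the query pixels are split into n_chunks groups, so the
+        pairwise distance matrix against the reference pixels is only ever
+        materialised one chunk at a time.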
+ """ + + feature_dim, embedding_dim = query_embeddings_flat.shape + chunk_size = int(np.ceil(float(feature_dim) / n_chunks)) + wrong_label_mask = reference_labels_flat < 0.1 + + wrong_label_mask = paddle.transpose(x=wrong_label_mask, perm=[1, 0]) + ref_square = paddle.sum(paddle.pow(reference_embeddings_flat, 2), axis=1) + query_square = paddle.sum(paddle.pow(query_embeddings_flat, 2), axis=1) + + all_features = [] + for n in range(n_chunks): + if n_chunks == 1: + query_embeddings_flat_chunk = query_embeddings_flat + query_square_chunk = query_square + chunk_start = 0 + else: + chunk_start = n * chunk_size + chunk_end = (n + 1) * chunk_size + query_square_chunk = query_square[chunk_start:chunk_end] + if query_square_chunk.shape[0] == 0: + continue + query_embeddings_flat_chunk = query_embeddings_flat[ + chunk_start:chunk_end] + features = _nn_features_per_object_for_chunk( + reference_embeddings_flat, ref_square, query_embeddings_flat_chunk, + query_square_chunk, wrong_label_mask) + all_features.append(features) + if n_chunks == 1: + nn_features = all_features[0] + else: + nn_features = paddle.concat(all_features, axis=0) + + return nn_features + + +def global_matching(reference_embeddings, + query_embeddings, + reference_labels, + n_chunks=100, + dis_bias=0., + ori_size=None, + atrous_rate=1, + use_float16=True, + atrous_obj_pixel_num=0): + """ + Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + reference_embeddings: [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [height, width, + embedding_dim], the embedding vectors for the query frames. + reference_labels: [height, width, obj_nums], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + dis_bias: [n_objects], foreground and background bias + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of reference_embeddings. + use_float16: Bool, if "True", use float16 type for matching. + Returns: + nn_features: [1, ori_height, ori_width, n_objects, feature_dim]. + """ + + assert (reference_embeddings.shape[:2] == reference_labels.shape[:2]) + if use_float16: + query_embeddings = paddle.cast(query_embeddings, dtype="float16") + reference_embeddings = paddle.cast(reference_embeddings, + dtype="float16") + h, w, embedding_dim = query_embeddings.shape + obj_nums = reference_labels.shape[2] + + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + selected_points = paddle.zeros([h + h_pad, w + w_pad]) + selected_points = selected_points.view( + (h + h_pad) // atrous_rate, atrous_rate, (w + w_pad) // atrous_rate, + atrous_rate) + selected_points[:, 0, :, 0] = 1. 
+ selected_points = paddle.reshape(selected_points, + [h + h_pad, w + w_pad, 1])[:h, :w] + is_big_obj = (paddle.sum( + reference_labels, + axis=(0, 1))) > (atrous_obj_pixel_num * atrous_rate**2) + reference_labels[:, :, + is_big_obj] = reference_labels[:, :, + is_big_obj] * selected_points + + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, [-1, obj_nums]) + query_embeddings_flat = paddle.reshape(query_embeddings, + [-1, embedding_dim]) + + all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9 + reference_labels_flat = paddle.reshape( + paddle.masked_select(reference_labels_flat, + paddle.expand(all_ref_fg, [-1, obj_nums])), + [-1, obj_nums]) + if reference_labels_flat.shape[0] == 0: + return paddle.ones([1, h, w, obj_nums, 1]) + reference_embeddings_flat = paddle.reshape( + paddle.masked_select(reference_embeddings_flat, + paddle.expand(all_ref_fg, [-1, embedding_dim])), + [-1, embedding_dim]) + + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + n_chunks) + + nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1]) + nn_features_reshape = ( + F.sigmoid(nn_features_reshape + + paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2 + + #TODO: ori_size is not None + + if use_float16: + nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32") + return nn_features_reshape + + +def global_matching_for_eval(all_reference_embeddings, + query_embeddings, + all_reference_labels, + n_chunks=20, + dis_bias=0., + ori_size=None, + atrous_rate=1, + use_float16=True, + atrous_obj_pixel_num=0): + """ + Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + all_reference_embeddings: A list of reference_embeddings, + each with size [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [n_query_images, height, width, + embedding_dim], the embedding vectors for the query frames. + all_reference_labels: A list of reference_labels, + each with size [height, width, obj_nums], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + dis_bias: [n_objects], foreground and background bias + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of reference_embeddings. + use_float16: Bool, if "True", use float16 type for matching. + Returns: + nn_features: [n_query_images, ori_height, ori_width, n_objects, feature_dim]. + """ + + h, w, embedding_dim = query_embeddings.shape + obj_nums = all_reference_labels[0].shape[2] + all_reference_embeddings_flat = [] + all_reference_labels_flat = [] + ref_num = len(all_reference_labels) + n_chunks *= ref_num + if atrous_obj_pixel_num > 0: + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + selected_points = paddle.zeros([h + h_pad, w + w_pad]) + selected_points = paddle.reshape( + selected_points, [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate]) + selected_points[:, 0, :, 0] = 1. 
+ selected_points = paddle.reshape(selected_points, + [h + h_pad, w + w_pad, 1])[:h, :w] + + for reference_embeddings, reference_labels, idx in zip( + all_reference_embeddings, all_reference_labels, range(ref_num)): + if atrous_rate > 1: + is_big_obj = paddle.sum( + reference_labels, + axis=(0, 1)) > (atrous_obj_pixel_num * atrous_rate**2) + is_big_obj = list(np.array(is_big_obj)) + for j in range(len(is_big_obj)): + if is_big_obj[j] == True: + reference_labels[:, :, j:j + + 1] = reference_labels[:, :, j:j + + 1] * selected_points + + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + + all_reference_embeddings_flat.append(reference_embeddings_flat) + all_reference_labels_flat.append(reference_labels_flat) + + reference_embeddings_flat = paddle.concat( + x=all_reference_embeddings_flat, axis=0) + reference_labels_flat = paddle.concat(x=all_reference_labels_flat, + axis=0) + else: + if ref_num == 1: + reference_embeddings, reference_labels = all_reference_embeddings[ + 0], all_reference_labels[0] + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + if h_pad > 0 or w_pad > 0: + reference_embeddings = F.pad(reference_embeddings, + [0, h_pad, 0, w_pad, 0, 0]) + reference_labels = F.pad(reference_labels, + [0, h_pad, 0, w_pad, 0, 0]) + reference_embeddings = paddle.reshape( + reference_embeddings, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, 32]) + reference_labels = paddle.reshape( + reference_labels, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_embeddings = paddle.reshape( + reference_embeddings[:, 0, :, 0, :], + reference_embeddings[:, 0, :, 0, :].shape) + reference_labels = paddle.reshape( + reference_labels[:, 0, :, 0, :], + reference_labels[:, 0, :, 0, :].shape) + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + else: + for reference_embeddings, reference_labels, idx in zip( + all_reference_embeddings, all_reference_labels, + range(ref_num)): + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + if h_pad > 0 or w_pad > 0: + reference_embeddings = F.pad(reference_embeddings, + [0, h_pad, 0, w_pad, 0, 0]) + reference_labels = F.pad(reference_labels, + [0, h_pad, 0, w_pad, 0, 0]) + + reference_embeddings = paddle.reshape( + reference_embeddings, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_labels = paddle.reshape( + reference_labels, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_embeddings = paddle.reshape( + reference_embeddings[:, 0, :, 0, :], + reference_embeddings[:, 0, :, 0, :].shape) + reference_labels = paddle.reshape( + reference_labels[:, 0, :, 0, :], + reference_labels[:, 0, :, 0, :].shape) + + reference_embeddings_flat = paddle.reshape( + reference_embeddings, [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + + all_reference_embeddings_flat.append(reference_embeddings_flat) + all_reference_labels_flat.append(reference_labels_flat) + + reference_embeddings_flat = paddle.concat( + all_reference_embeddings_flat, axis=0) + 
reference_labels_flat = paddle.concat(all_reference_labels_flat, + axis=0) + + query_embeddings_flat = paddle.reshape(query_embeddings, + [-1, embedding_dim]) + + all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9 + reference_labels_flat = paddle.reshape( + paddle.masked_select(reference_labels_flat, + paddle.expand(all_ref_fg, [-1, obj_nums])), + [-1, obj_nums]) + if reference_labels_flat.shape[0] == 0: + return paddle.ones([1, h, w, obj_nums, 1]) + reference_embeddings_flat = paddle.reshape( + paddle.masked_select(reference_embeddings_flat, + paddle.expand(all_ref_fg, [-1, embedding_dim])), + [-1, embedding_dim]) + if use_float16: + query_embeddings_flat = paddle.cast(query_embeddings_flat, + dtype="float16") + reference_embeddings_flat = paddle.cast(reference_embeddings_flat, + dtype="float16") + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + n_chunks) + + nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1]) + nn_features_reshape = ( + F.sigmoid(nn_features_reshape + + paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2 + + # TODO: ori_size is not None + + if use_float16: + nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32") + return nn_features_reshape + + +#LOCAL_DIST_MAP +def local_pairwise_distances(x, + y, + max_distance=9, + atrous_rate=1, + allow_downsample=False): + """Computes pairwise squared l2 distances using a local search window. + Use for-loop for saving memory. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. + y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + atrous_rate: Integer, the atrous rate of local matching. + allow_downsample: Bool, if "True", downsample x and y + with a stride of 2. + Returns: + Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. + """ + if allow_downsample: + ori_height = x.shape[0] + ori_width = x.shape[1] + x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0) + down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1) + x = F.interpolate(x, + size=down_size, + mode='bilinear', + align_corners=True) + y = F.interpolate(y, + size=down_size, + mode='bilinear', + align_corners=True) + x = paddle.unsqueeze(paddle.transpose(x, [1, 2, 0]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [1, 2, 0]), axis=0) + + pad_max_distance = max_distance - max_distance % atrous_rate + # no change pad + padded_y = F.pad(y, (0, 0, pad_max_distance, pad_max_distance, + pad_max_distance, pad_max_distance), + value=WRONG_LABEL_PADDING_DISTANCE) + + height, width, _ = x.shape + dists = [] + for y in range(2 * pad_max_distance // atrous_rate + 1): + y_start = y * atrous_rate + y_end = y_start + height + y_slice = padded_y[y_start:y_end] + for x in range(2 * max_distance + 1): + x_start = x * atrous_rate + x_end = x_start + width + offset_y = y_slice[:, x_start:x_end] + dist = paddle.sum(paddle.pow((x - offset_y), 2), axis=2) + dists.append(dist) + dists = paddle.stack(dists, axis=2) + + return dists + + +def local_pairwise_distances_parallel(x, + y, + max_distance=9, + atrous_rate=1, + allow_downsample=True): + """Computes pairwise squared l2 distances using a local search window. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. 
+ y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + atrous_rate: Integer, the atrous rate of local matching. + allow_downsample: Bool, if "True", downsample x and y + with a stride of 2. + Returns: + Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. + """ + + ori_height, ori_width, _ = x.shape + x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0) + if allow_downsample: + down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1) + x = F.interpolate(x, + size=down_size, + mode='bilinear', + align_corners=True) + y = F.interpolate(y, + size=down_size, + mode='bilinear', + align_corners=True) + + _, channels, height, width = x.shape + + x2 = paddle.reshape(paddle.sum(paddle.pow(x, 2), axis=1), + [height, width, 1]) + y2 = paddle.reshape(paddle.sum(paddle.pow(y, 2), axis=1), + [1, 1, height, width]) + + pad_max_distance = max_distance - max_distance % atrous_rate + # no change pad + padded_y = F.pad(y, (pad_max_distance, pad_max_distance, pad_max_distance, + pad_max_distance)) + padded_y2 = F.pad(y2, (pad_max_distance, pad_max_distance, pad_max_distance, + pad_max_distance), + value=WRONG_LABEL_PADDING_DISTANCE) + + offset_y = paddle.transpose( + paddle.reshape( + F.unfold(x=padded_y, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), + [channels, height * width, -1]), [1, 0, 2]) + offset_y2 = paddle.reshape( + F.unfold(padded_y2, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), [height, width, -1]) + x = paddle.transpose(paddle.reshape(x, [channels, height * width, -1]), + [1, 2, 0]) + + dists = x2 + offset_y2 - 2. * paddle.reshape(paddle.matmul(x, offset_y), + [height, width, -1]) + + return dists + + +def local_matching(prev_frame_embedding, + query_embedding, + prev_frame_labels, + dis_bias=0., + multi_local_distance=[15], + ori_size=None, + atrous_rate=1, + use_float16=True, + allow_downsample=True, + allow_parallel=True): + """Computes nearest neighbor features while only allowing local matches. + Args: + prev_frame_embedding: [height, width, embedding_dim], + the embedding vectors for the last frame. + query_embedding: [height, width, embedding_dim], + the embedding vectors for the query frames. + prev_frame_labels: [height, width, n_objects], + the class labels of the previous frame. + multi_local_distance: A list of Integer, + a list of maximum distance allowed for local matching. + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of local matching. + use_float16: Bool, if "True", use float16 type for matching. + allow_downsample: Bool, if "True", downsample prev_frame_embedding and query_embedding + with a stride of 2. + allow_parallel: Bool, if "True", do matching in a parallel way. If "False", do matching in + a for-loop way, which will save GPU memory. + Returns: + nn_features: A float32 np.array of nearest neighbor features of shape + [1, height, width, n_objects, 1]. 
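+        Note: multi_local_distance is treated as a set of nested search
+        windows; matching is done once with the largest window and the
+        smaller windows are cropped out of that result, giving one matching
+        map per window size in a single pass.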
+ """ + max_distance = multi_local_distance[-1] + + if ori_size is None: + height, width = prev_frame_embedding.shape[:2] + ori_size = (height, width) + + obj_num = prev_frame_labels.shape[2] + pad = paddle.ones([1]) * WRONG_LABEL_PADDING_DISTANCE + if use_float16: + query_embedding = paddle.cast(query_embedding, dtype="float16") + prev_frame_embedding = paddle.cast(prev_frame_embedding, + dtype="float16") + pad = paddle.cast(pad, dtype="float16") + + if allow_parallel: + d = local_pairwise_distances_parallel(query_embedding, + prev_frame_embedding, + max_distance=max_distance, + atrous_rate=atrous_rate, + allow_downsample=allow_downsample) + else: + d = local_pairwise_distances(query_embedding, + prev_frame_embedding, + max_distance=max_distance, + atrous_rate=atrous_rate, + allow_downsample=allow_downsample) + + height, width = d.shape[:2] + + labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), 1) + labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), + axis=1) + if (height, width) != ori_size: + labels = F.interpolate(labels, size=(height, width), mode='nearest') + + pad_max_distance = max_distance - max_distance % atrous_rate + atrous_max_distance = pad_max_distance // atrous_rate + #no change pad + padded_labels = F.pad(labels, ( + pad_max_distance, + pad_max_distance, + pad_max_distance, + pad_max_distance, + ), + mode='constant', + value=0) + + offset_masks = paddle.transpose( + paddle.reshape( + F.unfold(padded_labels, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), + [obj_num, height, width, -1]), [1, 2, 3, 0]) > 0.9 + + d_tiled = paddle.expand(paddle.unsqueeze( + d, axis=-1), [-1, -1, -1, obj_num]) # h, w, num_local_pos, obj_num + + d_masked = paddle.where(offset_masks, d_tiled, pad) + dists = paddle.min(d_masked, axis=2) + multi_dists = [ + paddle.unsqueeze(paddle.transpose(dists, [2, 0, 1]), axis=1) + ] # n_objects, num_multi_local, h, w + + reshaped_d_masked = paddle.reshape(d_masked, [ + height, width, 2 * atrous_max_distance + 1, 2 * atrous_max_distance + 1, + obj_num + ]) + for local_dis in multi_local_distance[:-1]: + local_dis = local_dis // atrous_rate + start_idx = atrous_max_distance - local_dis + end_idx = atrous_max_distance + local_dis + 1 + new_d_masked = paddle.reshape( + reshaped_d_masked[:, :, start_idx:end_idx, start_idx:end_idx, :], + reshaped_d_masked[:, :, start_idx:end_idx, + start_idx:end_idx, :].shape) + new_d_masked = paddle.reshape(new_d_masked, + [height, width, -1, obj_num]) + new_dists = paddle.min(new_d_masked, axis=2) + new_dists = paddle.unsqueeze(paddle.transpose(new_dists, [2, 0, 1]), + axis=1) + multi_dists.append(new_dists) + + multi_dists = paddle.concat(multi_dists, axis=1) + multi_dists = (F.sigmoid(multi_dists + + paddle.reshape(dis_bias, [-1, 1, 1, 1])) - 0.5) * 2 + + if use_float16: + multi_dists = paddle.cast(multi_dists, dtype="float32") + + if (height, width) != ori_size: + multi_dists = F.interpolate(multi_dists, + size=ori_size, + mode='bilinear', + align_corners=True) + multi_dists = paddle.transpose(multi_dists, perm=[2, 3, 0, 1]) + multi_dists = paddle.reshape(multi_dists, + [1, ori_size[0], ori_size[1], obj_num, -1]) + + return multi_dists + + +def calculate_attention_head(ref_embedding, + ref_label, + prev_embedding, + prev_label, + epsilon=1e-5): + + ref_head = ref_embedding * ref_label + ref_head_pos = paddle.sum(ref_head, axis=(2, 3)) + ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos + ref_pos_num = paddle.sum(ref_label, axis=(2, 3)) + 
ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3)) + ref_head_pos = ref_head_pos / (ref_pos_num + epsilon) + ref_head_neg = ref_head_neg / (ref_neg_num + epsilon) + + prev_head = prev_embedding * prev_label + prev_head_pos = paddle.sum(prev_head, axis=(2, 3)) + prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos + prev_pos_num = paddle.sum(prev_label, axis=(2, 3)) + prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3)) + prev_head_pos = prev_head_pos / (prev_pos_num + epsilon) + prev_head_neg = prev_head_neg / (prev_neg_num + epsilon) + + total_head = paddle.concat( + x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1) + + return total_head + + +def calculate_attention_head_for_eval(ref_embeddings, + ref_labels, + prev_embedding, + prev_label, + epsilon=1e-5): + total_ref_head_pos = 0. + total_ref_head_neg = 0. + total_ref_pos_num = 0. + total_ref_neg_num = 0. + + for idx in range(len(ref_embeddings)): + ref_embedding = ref_embeddings[idx] + ref_label = ref_labels[idx] + ref_head = ref_embedding * ref_label + ref_head_pos = paddle.sum(ref_head, axis=(2, 3)) + ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos + ref_pos_num = paddle.sum(ref_label, axis=(2, 3)) + ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3)) + total_ref_head_pos = total_ref_head_pos + ref_head_pos + total_ref_head_neg = total_ref_head_neg + ref_head_neg + total_ref_pos_num = total_ref_pos_num + ref_pos_num + total_ref_neg_num = total_ref_neg_num + ref_neg_num + ref_head_pos = total_ref_head_pos / (total_ref_pos_num + epsilon) + ref_head_neg = total_ref_head_neg / (total_ref_neg_num + epsilon) + + prev_head = prev_embedding * prev_label + prev_head_pos = paddle.sum(prev_head, axis=(2, 3)) + prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos + prev_pos_num = paddle.sum(prev_label, axis=(2, 3)) + prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3)) + prev_head_pos = prev_head_pos / (prev_pos_num + epsilon) + prev_head_neg = prev_head_neg / (prev_neg_num + epsilon) + + total_head = paddle.concat( + x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1) + return total_head diff --git a/docs/src/paddlevideo/modeling/framework/segmenters/__init__.py b/docs/src/paddlevideo/modeling/framework/segmenters/__init__.py new file mode 100644 index 000000000..de4bf5734 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segmenters/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .base import BaseSegmenter +from .ms_tcn import MSTCN +from .asrf import ASRF + +__all__ = ['BaseSegmenter', 'MSTCN', 'ASRF'] diff --git a/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py b/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py new file mode 100644 index 000000000..3d962c714 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segmenters/asrf.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import SEGMENTERS +from .base import BaseSegmenter + +import paddle +import paddle.nn.functional as F +from .utils import ASRFPostProcessing + + +@SEGMENTERS.register() +class ASRF(BaseSegmenter): + """ASRF model framework.""" + + def __init__(self, + postprocessing_method, + boundary_threshold, + backbone=None, + head=None, + loss=None): + + super().__init__(backbone=backbone, head=head, loss=loss) + self.postprocessing_method = postprocessing_method + self.boundary_threshold = boundary_threshold + + def forward_net(self, video_feature): + """Define how the model is going to train, from input to output. + """ + if self.backbone is not None: + feature = self.backbone(video_feature) + else: + feature = video_feature + + if self.head is not None: + network_outputs = self.head(feature) + else: + network_outputs = None + + return network_outputs + + def train_step(self, data_batch): + """Training step. + """ + feature, label, boundary = data_batch + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + # caculate loss + if self.loss is not None: + output_loss = self.loss(feature, outputs_cls, label, + outputs_boundary, boundary) + else: + output_loss = None + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + predicted = paddle.squeeze(predicted) + + loss_metrics = dict() + loss_metrics['loss'] = output_loss + loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, label) + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + feature, label, boundary = data_batch + + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + ## caculate loss + if self.loss is not None: + output_loss = self.loss(feature, outputs_cls, label, + outputs_boundary, boundary) + else: + output_loss = None + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + predicted = paddle.squeeze(predicted) + + outputs_dict = dict() + outputs_dict['loss'] = output_loss + outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, label) + return outputs_dict + + def test_step(self, data_batch): + """Testing setp. + """ + feature, _, _ = data_batch + + outputs_dict = dict() + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + outputs_dict['predict'] = paddle.to_tensor(predicted[0, :]) + outputs_dict['output_np'] = F.sigmoid(outputs_cls[-1]) + return outputs_dict + + def infer_step(self, data_batch): + """Infering setp. 
+ """ + feature = data_batch[0] + + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + # transfer data + outputs_cls_np = outputs_cls[-1] + outputs_boundary_np = outputs_boundary[-1] + + outputs = [ + outputs_cls_np, outputs_boundary_np, + F.sigmoid(outputs_cls[-1]) + ] + return outputs diff --git a/docs/src/paddlevideo/modeling/framework/segmenters/base.py b/docs/src/paddlevideo/modeling/framework/segmenters/base.py new file mode 100644 index 000000000..e0856d9ad --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segmenters/base.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseSegmenter(nn.Layer): + """Base class for segementers. + + All segementers should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + + """ + + def __init__(self, backbone=None, head=None, loss=None): + + super().__init__() + # build backbone + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + # build head + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + # build loss + if loss is not None: + self.loss_name = loss.name + self.loss = builder.build_loss(loss) + if hasattr(self.loss, 'init_weights'): + self.loss.init_weights() + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. 
+ """ + raise NotImplementedError diff --git a/docs/src/paddlevideo/modeling/framework/segmenters/ms_tcn.py b/docs/src/paddlevideo/modeling/framework/segmenters/ms_tcn.py new file mode 100644 index 000000000..a5982a7c9 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segmenters/ms_tcn.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import SEGMENTERS +from .base import BaseSegmenter + +import paddle +import paddle.nn.functional as F + + +@SEGMENTERS.register() +class MSTCN(BaseSegmenter): + """MS-TCN model framework.""" + + def forward_net(self, video_feature): + """Define how the model is going to train, from input to output. + """ + if self.backbone is not None: + feature = self.backbone(video_feature) + else: + feature = video_feature + + if self.head is not None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + video_feat, video_gt = data_batch + + # call forward + output = self.forward_net(video_feat) + loss = 0. + for i in range(len(output)): + loss += self.head.loss(output[i], video_gt) + + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, video_gt) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + video_feat, video_gt = data_batch + + # call forward + output = self.forward_net(video_feat) + loss = 0. + for i in range(len(output)): + loss += self.head.loss(output[i], video_gt) + + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + + outputs_dict = dict() + outputs_dict['loss'] = loss + outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, video_gt) + return outputs_dict + + def test_step(self, data_batch): + """Testing setp. + """ + video_feat, _ = data_batch + + outputs_dict = dict() + # call forward + output = self.forward_net(video_feat) + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + outputs_dict['predict'] = predicted + outputs_dict['output_np'] = F.sigmoid(output[-1]) + return outputs_dict + + def infer_step(self, data_batch): + """Infering setp. + """ + video_feat = data_batch[0] + + # call forward + output = self.forward_net(video_feat) + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + output_np = F.sigmoid(output[-1]) + return predicted, output_np diff --git a/docs/src/paddlevideo/modeling/framework/segmenters/utils.py b/docs/src/paddlevideo/modeling/framework/segmenters/utils.py new file mode 100644 index 000000000..9c21cbb86 --- /dev/null +++ b/docs/src/paddlevideo/modeling/framework/segmenters/utils.py @@ -0,0 +1,343 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# https://github.com/yiskw713/asrf/libs/postprocess.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import math + + +class GaussianSmoothing(nn.Layer): + """ + Apply gaussian smoothing on a 1d tensor. + Filtering is performed seperately for each channel + in the input using a depthwise convolution. + Arguments: + channels (int, sequence): Number of channels of the input tensors. Output will + have this number of channels as well. + kernel_size (int, sequence): Size of the gaussian kernel. + sigma (float, sequence): Standard deviation of the gaussian kernel. + """ + + def __init__(self, kernel_size=15, sigma=1.0): + super().__init__() + self.kernel_size = kernel_size + + # The gaussian kernel is the product of the + # gaussian function of each dimension. + kernel = 1 + meshgrid = paddle.arange(kernel_size) + + meshgrid = paddle.cast(meshgrid, dtype='float32') + + mean = (kernel_size - 1) / 2 + kernel = kernel / (sigma * math.sqrt(2 * math.pi)) + kernel = kernel * paddle.exp(-(((meshgrid - mean) / sigma)**2) / 2) + + # Make sure sum of values in gaussian kernel equals 1. + # kernel = kernel / paddle.max(kernel) + + self.kernel = paddle.reshape(kernel, [1, 1, -1]) + + def forward(self, inputs): + """ + Apply gaussian filter to input. + Arguments: + input (paddle.Tensor): Input to apply gaussian filter on. + Returns: + filtered (paddle.Tensor): Filtered output. + """ + _, c, _ = inputs.shape + inputs = F.pad(inputs, + pad=((self.kernel_size - 1) // 2, + (self.kernel_size - 1) // 2), + mode="reflect", + data_format='NCL') + + kernel = paddle.expand(self.kernel, shape=[c, 1, self.kernel_size]) + return F.conv1d(inputs, weight=kernel, groups=c) + + +def argrelmax(prob, threshold=0.7): + """ + Calculate arguments of relative maxima. + prob: np.array. 
boundary probability maps distributed in [0, 1] + prob shape is (T) + ignore the peak whose value is under threshold + + Return: + Index of peaks for each batch + """ + # ignore the values under threshold + prob[prob < threshold] = 0.0 + + # calculate the relative maxima of boundary maps + # treat the first frame as boundary + peak = np.concatenate( + [ + np.ones((1), dtype=bool), + (prob[:-2] < prob[1:-1]) & (prob[2:] < prob[1:-1]), + np.zeros((1), dtype=bool), + ], + axis=0, + ) + + peak_idx = np.where(peak)[0].tolist() + + return peak_idx + + +def is_probability(x): + assert x.ndim == 3 + + if x.shape[1] == 1: + # sigmoid + if x.min() >= 0 and x.max() <= 1: + return True + else: + return False + else: + # softmax + _sum = np.sum(x, axis=1).astype(np.float32) + _ones = np.ones_like(_sum, dtype=np.float32) + return np.allclose(_sum, _ones) + + +def convert2probability(x): + """ + Args: x (N, C, T) + """ + assert x.ndim == 3 + + if is_probability(x): + return x + else: + if x.shape[1] == 1: + # sigmoid + prob = 1 / (1 + np.exp(-x)) + else: + # softmax + prob = np.exp(x) / np.sum(np.exp(x), axis=1) + return prob.astype(np.float32) + + +def convert2label(x): + assert x.ndim == 2 or x.ndim == 3 + + if x.ndim == 2: + return x.astype(np.int64) + else: + if not is_probability(x): + x = convert2probability(x) + + label = np.argmax(x, axis=1) + return label.astype(np.int64) + + +def refinement_with_boundary(outputs, boundaries, boundary_threshold): + """ + Get segments which are defined as the span b/w two boundaries, + and decide their classes by majority vote. + Args: + outputs: numpy array. shape (N, C, T) + the model output for frame-level class prediction. + boundaries: numpy array. shape (N, 1, T) + boundary prediction. + boundary_threshold: the probability threshold for detecting boundary peaks. float(default=0.7) + Return: + preds: np.array. shape (N, T) + final class prediction considering boundaries. + """ + + preds = convert2label(outputs) + boundaries = convert2probability(boundaries) + + for i, (output, pred, boundary) in enumerate(zip(outputs, preds, + boundaries)): + idx = argrelmax(boundary[0, :], threshold=boundary_threshold) + + # add the index of the last action ending + T = pred.shape[0] + idx.append(T) + + # majority vote + for j in range(len(idx) - 1): + count = np.bincount(pred[idx[j]:idx[j + 1]]) + modes = np.where(count == count.max())[0] + if len(modes) == 1: + mode = modes + else: + if outputs.ndim == 3: + # if more than one majority class exist + prob_sum_max = 0 + for m in modes: + prob_sum = output[m, idx[j]:idx[j + 1]].sum() + if prob_sum_max < prob_sum: + mode = m + prob_sum_max = prob_sum + else: + # decide first mode when more than one majority class + # have the same number during oracle experiment + mode = modes[0] + + preds[i, idx[j]:idx[j + 1]] = mode + return preds + + +def relabeling(outputs, theta_t): + """ + Relabeling small action segments with their previous action segment + Args: + output: the results of action segmentation. (N, T) or (N, C, T) + theta_t: the threshold of the size of action segments. + Return: + relabeled output.
(N, T) + """ + + preds = convert2label(outputs) + + for i in range(preds.shape[0]): + # shape (T,) + last = preds[i][0] + cnt = 1 + for j in range(1, preds.shape[1]): + if last == preds[i][j]: + cnt += 1 + else: + if cnt > theta_t: + cnt = 1 + last = preds[i][j] + else: + preds[i][j - cnt:j] = preds[i][j - cnt - 1] + cnt = 1 + last = preds[i][j] + + if cnt <= theta_t: + preds[i][j - cnt:j] = preds[i][j - cnt - 1] + + return preds + + +def smoothing(outputs, filter_func): + """ + Smoothing action probabilities with gaussian filter. + Args: + outputs: frame-wise action probabilities. (N, C, T) + Return: + predictions: final prediction. (N, T) + """ + + outputs = convert2probability(outputs) + outputs = filter_func(paddle.to_tensor(outputs)).numpy() + + preds = convert2label(outputs) + return preds + + +def ASRFPostProcessing(outputs_cls, + outputs_boundary, + refinement_method, + boundary_threshold=0.7, + theta_t=15, + kernel_size=15): + """ + ASRF post processing is to refine action boundary + Args: + outputs_cls: the results of action segmentation. (N, T) or (N, C, T) + outputs_boundary: action boundary probability. (N, 1, T) + refinement_method: the way of refine predict boundary and classification. str + boundary_threshold: the threshold of the size of action segments. float(default=0.7) + theta_t: the threshold of the size of action segments. int(default=15) + kernel_size: Size of the gaussian kernel. int(default=15) + Return: + preds output. (N, T) + """ + func = [ + "refinement_with_boundary", + "relabeling", + "smoothing", + ] + + if refinement_method == "smoothing": + filter_func = GaussianSmoothing(kernel_size) + preds = smoothing(outputs_cls, filter_func) + elif refinement_method == "relabeling": + preds = relabeling(outputs_cls, theta_t) + elif refinement_method == "refinement_with_boundary": + preds = refinement_with_boundary(outputs_cls, outputs_boundary, + boundary_threshold) + else: + preds = np.zeros((1, 1)) + assert refinement_method in func + + return paddle.to_tensor(preds) + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = len(tensor.shape) + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed \ + for tensor with fewer than 2 dimensions") + + if dimensions == 2: # Linear + fan_in = tensor.shape[1] + fan_out = tensor.shape[0] + else: + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def calculate_gain(nonlinearity=None, a=None): + if nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if a is not None: + return math.sqrt(2.0 / (1 + a**2)) + else: + return math.sqrt(2.0 / (1 + 0.01**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + return 1 + + +def KaimingUniform_like_torch(weight_npy, + mode='fan_in', + nonlinearity='leaky_relu'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + if mode == 'fan_in': + fan_mode = fan_in + else: + fan_mode = fan_out + a = math.sqrt(5.0) + gain = calculate_gain(nonlinearity=nonlinearity, a=a) + std = gain / math.sqrt(fan_mode) + bound = math.sqrt(3.0) * std + return np.random.uniform(-bound, bound, weight_npy.shape) + + +def init_bias(weight_npy, bias_npy): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + bound = 1.0 / 
math.sqrt(fan_in) + return np.random.uniform(-bound, bound, bias_npy.shape) diff --git a/docs/src/paddlevideo/modeling/heads/__init__.py b/docs/src/paddlevideo/modeling/heads/__init__.py new file mode 100644 index 000000000..49f71cce9 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/__init__.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .adds_head import AddsHead +from .asrf_head import ASRFHead +from .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead +from .base import BaseHead +from .bbox_head import BBoxHeadAVA +from .cfbi_head import CollaborativeEnsemblerMS +from .i3d_head import I3DHead +from .movinet_head import MoViNetHead +from .ms_tcn_head import MSTCNHead +from .pptimesformer_head import ppTimeSformerHead +from .pptsm_head import ppTSMHead +from .pptsn_head import ppTSNHead +from .roi_head import AVARoIHead +from .single_straight3d import SingleRoIExtractor3D +from .slowfast_head import SlowFastHead +from .stgcn_head import STGCNHead +from .timesformer_head import TimeSformerHead +from .transnetv2_head import TransNetV2Head +from .tsm_head import TSMHead +from .tsn_head import TSNHead +from .ms_tcn_head import MSTCNHead +from .asrf_head import ASRFHead +from .ctrgcn_head import CTRGCNHead +from .movinet_head import MoViNetHead +from .agcn2s_head import AGCN2sHead +from .token_shift_head import TokenShiftHead + +__all__ = [ + 'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead', + 'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head', + 'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead', + 'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead', + 'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead', + 'AGCN2sHead' +] diff --git a/docs/src/paddlevideo/modeling/heads/adds_head.py b/docs/src/paddlevideo/modeling/heads/adds_head.py new file mode 100644 index 000000000..3b1cd2462 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/adds_head.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import paddle.nn as nn +from paddlevideo.utils import get_dist_info +import paddle +from ..builder import build_loss +from ..registry import HEADS + +MIN_DEPTH = 1e-3 +MAX_DEPTH = 80 + + +@HEADS.register() +class AddsHead(nn.Layer): + """TimeSformerHead Head. 
+ + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + avg_reprojection, + disparity_smoothness, + no_ssim, + loss_cfg=dict(name='ADDSLoss'), + max_gt_depth=60, + pred_depth_scale_factor=1): + + super(AddsHead, self).__init__() + loss_cfg['avg_reprojection'] = avg_reprojection + loss_cfg['disparity_smoothness'] = disparity_smoothness + loss_cfg['no_ssim'] = no_ssim + self.max_gt_depth = max_gt_depth + self.pred_depth_scale_factor = pred_depth_scale_factor + self.loss_func = build_loss(loss_cfg) + + def forward(self): + raise NotImplemented + + def loss(self, inputs, outputs): + if self.training: + return self.loss_func(inputs, outputs) + else: + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.get_metrics( + outputs['pred_disp'], outputs['gt']) + outputs['abs_rel'] = abs_rel + outputs['sq_rel'] = sq_rel + outputs['rmse'] = rmse + outputs['rmse_log'] = rmse_log + outputs['a1'] = a1 + outputs['a2'] = a2 + outputs['a3'] = a3 + return outputs + + def get_metrics(self, pred_disp, gt_depth): + gt_height, gt_width = gt_depth.shape[:2] + + pred_disp = cv2.resize(pred_disp, (gt_width, gt_height)) + pred_depth = 1 / pred_disp + + mask = gt_depth > 0 + + pred_depth = pred_depth[mask] + gt_depth = gt_depth[mask] + + pred_depth *= self.pred_depth_scale_factor + ratio = np.median(gt_depth) / np.median(pred_depth) + pred_depth *= ratio + + pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH + pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH + + mask2 = gt_depth <= self.max_gt_depth + pred_depth = pred_depth[mask2] + gt_depth = gt_depth[mask2] + + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.compute_errors( + gt_depth, pred_depth) + + _, world_size = get_dist_info() + if world_size > 1: + # educe sum when valid + # TODO: there are some problems with multi gpu gather code. 
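+ # Wrap the Python scalars as tensors so they can take part in all_reduce; the + # summed values are divided by world_size below to average the metrics across GPUs.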
+ abs_rel = paddle.to_tensor(abs_rel) + sq_rel = paddle.to_tensor(sq_rel) + rmse = paddle.to_tensor(rmse) + rmse_log = paddle.to_tensor(rmse_log) + a1 = paddle.to_tensor(a1) + a2 = paddle.to_tensor(a2) + a3 = paddle.to_tensor(a3) + abs_rel = paddle.distributed.all_reduce( + abs_rel, op=paddle.distributed.ReduceOp.SUM) / world_size + sq_rel = paddle.distributed.all_reduce( + sq_rel, op=paddle.distributed.ReduceOp.SUM) / world_size + rmse = paddle.distributed.all_reduce( + rmse, op=paddle.distributed.ReduceOp.SUM) / world_size + rmse_log = paddle.distributed.all_reduce( + rmse_log, op=paddle.distributed.ReduceOp.SUM) / world_size + a1 = paddle.distributed.all_reduce( + a1, op=paddle.distributed.ReduceOp.SUM) / world_size + a2 = paddle.distributed.all_reduce( + a2, op=paddle.distributed.ReduceOp.SUM) / world_size + a3 = paddle.distributed.all_reduce( + a3, op=paddle.distributed.ReduceOp.SUM) / world_size + return abs_rel.item(), sq_rel.item(), rmse.item(), rmse_log.item( + ), a1.item(), a2.item(), a3.item() + + return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 + + def compute_errors(self, gt, pred): + """Computation of error metrics between predicted and ground truth depths + """ + thresh = np.maximum((gt / pred), (pred / gt)) + a1 = (thresh < 1.25).mean() + a2 = (thresh < 1.25**2).mean() + a3 = (thresh < 1.25**3).mean() + + rmse = (gt - pred)**2 + rmse = np.sqrt(rmse.mean()) + + rmse_log = (np.log(gt) - np.log(pred))**2 + rmse_log = np.sqrt(rmse_log.mean()) + + abs_rel = np.mean(np.abs(gt - pred) / gt) + + sq_rel = np.mean(((gt - pred)**2) / gt) + + return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 diff --git a/docs/src/paddlevideo/modeling/heads/agcn2s_head.py b/docs/src/paddlevideo/modeling/heads/agcn2s_head.py new file mode 100644 index 000000000..92cb5e4b1 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/agcn2s_head.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class AGCN2sHead(BaseHead): + """ + Head for AGCN2s model. + Args: + in_channels: int, input feature channels. Default: 64. + num_classes: int, output the number of classes. + M: int, number of people. + drop_out: float, dropout ratio of layer. Default: 0. + """ + def __init__(self, in_channels=64, num_classes=10, M=2, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.in_channels = in_channels + self.M = M + weight_attr = paddle.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.Normal(mean=0.0, + std=math.sqrt( + 2. / num_classes))) + + self.fc = nn.Linear(self.in_channels * 4, + self.num_classes, + weight_attr=weight_attr) + + def forward(self, x): + """Define how the head is going to run. 
+ """ + assert x.shape[ + 0] % self.M == 0, f'The first dimension of the output must be an integer multiple of the number of people M, but recieved shape[0]={x.shape[0]}, M={self.M}' + # N*M,C,T,V + N = x.shape[0] // self.M + c_new = x.shape[1] + x = x.reshape([N, self.M, c_new, -1]) + x = x.mean(3).mean(1) + + return self.fc(x) diff --git a/docs/src/paddlevideo/modeling/heads/asrf_head.py b/docs/src/paddlevideo/modeling/heads/asrf_head.py new file mode 100644 index 000000000..c3aab77ad --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/asrf_head.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/yiskw713/asrf/libs/models/tcn.py +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddle import ParamAttr + +from ..backbones.ms_tcn import SingleStageModel + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ +from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch + + +@HEADS.register() +class ASRFHead(BaseHead): + + def __init__(self, + num_classes, + num_features, + num_stages, + num_layers, + num_stages_asb=None, + num_stages_brb=None): + super().__init__(num_classes=num_classes, in_channels=num_features) + if not isinstance(num_stages_asb, int): + num_stages_asb = num_stages + + if not isinstance(num_stages_brb, int): + num_stages_brb = num_stages + + self.num_layers = num_layers + self.num_stages_asb = num_stages_asb + self.num_stages_brb = num_stages_brb + self.num_features = num_features + + # cls score + self.overlap = 0.5 + + self.conv_cls = nn.Conv1D(self.num_features, self.num_classes, 1) + self.conv_boundary = nn.Conv1D(self.num_features, 1, 1) + + # action segmentation branch + asb = [ + SingleStageModel(self.num_layers, self.num_features, + self.num_classes, self.num_classes) + for _ in range(self.num_stages_asb - 1) + ] + + # boundary regression branch + brb = [ + SingleStageModel(self.num_layers, self.num_features, 1, 1) + for _ in range(self.num_stages_brb - 1) + ] + self.brb = nn.LayerList(brb) + self.asb = nn.LayerList(asb) + + self.activation_asb = nn.Softmax(axis=1) + self.activation_brb = nn.Sigmoid() + + def init_weights(self): + """ + initialize model layers' weight + """ + # init weight + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) + + def forward(self, x): + """ + ASRF head + """ + out_cls = self.conv_cls(x) + out_boundary = self.conv_boundary(x) + + outputs_cls = [out_cls] + outputs_boundary = [out_boundary] + + for as_stage in self.asb: + out_cls = as_stage(self.activation_asb(out_cls)) + outputs_cls.append(out_cls) + + for br_stage in self.brb: + out_boundary = br_stage(self.activation_brb(out_boundary)) + 
outputs_boundary.append(out_boundary) + + return outputs_cls, outputs_boundary + + def get_F1_score(self, predicted, groundTruth): + recog_content = list(predicted.numpy()) + gt_content = list(groundTruth[0].numpy()) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + + edit_num = self.edit_score(recog_content, gt_content) + edit += edit_num + + tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap) + + # cls metric + + precision = tp / float(tp + fp) + recall = tp / float(tp + fn) + + if precision + recall > 0.0: + f1 = 2.0 * (precision * recall) / (precision + recall) + else: + f1 = 0.0 + f1 = np.nan_to_num(f1) + return f1 + + def get_labels_start_end_time(self, frame_wise_labels): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + labels.append(frame_wise_labels[i]) + starts.append(i) + ends.append(i) + last_label = frame_wise_labels[i] + ends.append(i + 1) + return labels, starts, ends + + def levenstein(self, p, y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + def edit_score(self, recognized, ground_truth, norm=True): + P, _, _ = self.get_labels_start_end_time(recognized) + Y, _, _ = self.get_labels_start_end_time(ground_truth) + return self.levenstein(P, Y, norm) + + def f_score(self, recognized, ground_truth, overlap): + p_label, p_start, p_end = self.get_labels_start_end_time(recognized) + y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) diff --git a/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py b/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py new file mode 100644 index 000000000..24c31adb8 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/attention_lstm_head.py @@ -0,0 +1,288 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +from paddle.nn.initializer import Normal +from paddle.regularizer import L2Decay +import paddle.nn.functional as F + +from ...metrics.youtube8m import eval_util as youtube8m_metrics +from ..registry import HEADS +from ..weight_init import weight_init_ +from .base import BaseHead + + +@HEADS.register() +class AttentionLstmHead(BaseHead): + """AttentionLstmHead. + Args: TODO + """ + def __init__(self, + num_classes=3862, + feature_num=2, + feature_dims=[1024, 128], + embedding_size=512, + lstm_size=1024, + in_channels=2048, + loss_cfg=dict(name='CrossEntropyLoss')): + super(AttentionLstmHead, self).__init__(num_classes, in_channels, + loss_cfg) + self.num_classes = num_classes + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.feature_num = len(self.feature_dims) + for i in range(self.feature_num): # 0:rgb, 1:audio + fc_feature = paddle.nn.Linear(in_features=self.feature_dims[i], + out_features=self.embedding_size) + self.add_sublayer("fc_feature{}".format(i), fc_feature) + + bi_lstm = paddle.nn.LSTM(input_size=self.embedding_size, + hidden_size=self.lstm_size, + direction="bidirectional") + self.add_sublayer("bi_lstm{}".format(i), bi_lstm) + + drop_rate = 0.5 + self.dropout = paddle.nn.Dropout(drop_rate) + + att_fc = paddle.nn.Linear(in_features=self.lstm_size * 2, + out_features=1) + self.add_sublayer("att_fc{}".format(i), att_fc) + self.softmax = paddle.nn.Softmax() + + self.fc_out1 = paddle.nn.Linear(in_features=self.lstm_size * 4, + out_features=8192, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.relu = paddle.nn.ReLU() + self.fc_out2 = paddle.nn.Linear(in_features=8192, + out_features=4096, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.fc_logit = paddle.nn.Linear(in_features=4096, + out_features=self.num_classes, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.sigmoid = paddle.nn.Sigmoid() + + def init_weights(self): + pass + + def forward(self, inputs): + # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)] + # deal with features with different length + # 1. padding to same lenght, make a tensor + # 2. make a mask tensor with the same shpae with 1 + # 3. compute output using mask tensor, s.t. output is nothing todo with padding + assert (len(inputs) == self.feature_num + ), "Input tensor does not contain {} features".format( + self.feature_num) + att_outs = [] + for i in range(len(inputs)): + # 1. fc + m = getattr(self, "fc_feature{}".format(i)) + output_fc = m(inputs[i][0]) + output_fc = paddle.tanh(output_fc) + + # 2. bi_lstm + m = getattr(self, "bi_lstm{}".format(i)) + lstm_out, _ = m(inputs=output_fc, sequence_length=inputs[i][1]) + + lstm_dropout = self.dropout(lstm_out) + + # 3. att_fc + m = getattr(self, "att_fc{}".format(i)) + lstm_weight = m(lstm_dropout) + + # 4. 
softmax replace start, for it's relevant to sum in time step + lstm_exp = paddle.exp(lstm_weight) + lstm_mask = paddle.mean(inputs[i][2], axis=2) + lstm_mask = paddle.unsqueeze(lstm_mask, axis=2) + lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask) + lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1) + exponent = -1 + lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent) + lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2) + lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator) + lstm_weight = lstm_softmax + # softmax replace end + + lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight) + + # 5. sequence_pool's replace start, for it's relevant to sum in time step + lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask) + fea_lens = inputs[i][1] + fea_len = int(fea_lens[0]) + lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1) + # sequence_pool's replace end + att_outs.append(lstm_pool) + att_out = paddle.concat(att_outs, axis=1) + fc_out1 = self.fc_out1(att_out) + fc_out1_act = self.relu(fc_out1) + fc_out2 = self.fc_out2(fc_out1_act) + fc_out2_act = paddle.tanh(fc_out2) + fc_logit = self.fc_logit(fc_out2_act) + output = self.sigmoid(fc_logit) + return fc_logit, output + + def loss(self, lstm_logit, labels, **kwargs): + labels.stop_gradient = True + losses = dict() + bce_logit_loss = paddle.nn.BCEWithLogitsLoss(reduction='sum') + sum_cost = bce_logit_loss(lstm_logit, labels) + return sum_cost + + def metric(self, lstm_output, labels): + pred = lstm_output.numpy() + label = labels.numpy() + hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label) + perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate( + pred, label) + gap = youtube8m_metrics.calculate_gap(pred, label) + return hit_at_one, perr, gap + + +@HEADS.register() +class ActionAttentionLstmHead(BaseHead): + """AttentionLstmHead for FootballAction + Args: TODO + """ + def __init__(self, + num_classes=8, + feature_num=2, + feature_dims=[2048, 1024], + embedding_size=512, + lstm_size=1024, + in_channels=2048, + loss_cfg=dict(name='CrossEntropyLoss')): + super(ActionAttentionLstmHead, self).__init__(num_classes, in_channels, + loss_cfg) + self.num_classes = num_classes + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.feature_num = len(self.feature_dims) + for i in range(self.feature_num): # 0:rgb, 1:audio + bi_lstm = paddle.nn.LSTM(input_size=self.feature_dims[i], + hidden_size=self.feature_dims[i], + direction="bidirectional") + self.add_sublayer("bi_lstm{}".format(i), bi_lstm) + + drop_rate = 0.5 + self.dropout = paddle.nn.Dropout(drop_rate) + + att_fc = paddle.nn.Linear(in_features=self.feature_dims[i] * 2, + out_features=1) + self.add_sublayer("att_fc{}".format(i), att_fc) + self.softmax = paddle.nn.Softmax() + + self.fc1 = paddle.nn.Linear(in_features=2 * sum(self.feature_dims), + out_features=8192, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.bn1 = paddle.nn.BatchNorm(num_channels=8192) + self.dropout1 = paddle.nn.Dropout(0.5) + self.fc2 = paddle.nn.Linear(in_features=8192, + out_features=4096, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.bn2 = paddle.nn.BatchNorm(num_channels=4096) + self.dropout2 = paddle.nn.Dropout(0.5) + self.fc3 = paddle.nn.Linear( + in_features=4096, + out_features=self.num_classes, + ) + self.fc4 = paddle.nn.Linear( + in_features=4096, + out_features=1, + ) + + def init_weights(self): + pass 
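For reference, a minimal usage sketch of the AttentionLstmHead defined above; the batch size, segment length and feature packing are hypothetical, and it assumes the paddlevideo package (with its CrossEntropyLoss registry entry) is importable:

import paddle
from paddlevideo.modeling.heads import AttentionLstmHead

head = AttentionLstmHead(num_classes=3862, feature_dims=[1024, 128])
batch, steps = 4, 32
inputs = []
for dim in (1024, 128):  # 0: rgb features, 1: audio features
    data = paddle.randn([batch, steps, dim])              # padded feature sequence
    seq_len = paddle.full([batch], steps, dtype='int64')  # true sequence lengths
    mask = paddle.ones([batch, steps, 1])                 # 1 for valid steps, 0 for padding
    inputs.append((data, seq_len, mask))
logits, probs = head(inputs)  # both of shape [batch, 3862]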
+ + def forward(self, inputs): + # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)] + # deal with features with different length + # 1. padding to same lenght, make a tensor + # 2. make a mask tensor with the same shpae with 1 + # 3. compute output using mask tensor, s.t. output is nothing todo with padding + assert (len(inputs) == self.feature_num + ), "Input tensor does not contain {} features".format( + self.feature_num) + att_outs = [] + for i in range(len(inputs)): + m = getattr(self, "bi_lstm{}".format(i)) + lstm_out, _ = m(inputs=inputs[i][0], sequence_length=inputs[i][1]) + + lstm_dropout = self.dropout(lstm_out) + + # 3. att_fc + m = getattr(self, "att_fc{}".format(i)) + lstm_weight = m(lstm_dropout) + + # 4. softmax replace start, for it's relevant to sum in time step + lstm_exp = paddle.exp(lstm_weight) + lstm_mask = paddle.mean(inputs[i][2], axis=2) + lstm_mask = paddle.unsqueeze(lstm_mask, axis=2) + lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask) + lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1) + exponent = -1 + lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent) + lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2) + lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator) + lstm_weight = lstm_softmax + # softmax replace end + + lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight) + + # 5. sequence_pool's replace start, for it's relevant to sum in time step + lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask) + # fea_lens = inputs[i][1] + # fea_len = int(fea_lens[0]) + lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1) + # sequence_pool's replace end + att_outs.append(lstm_pool) + att_out = paddle.concat(att_outs, axis=1) + y = self.fc1(att_out) + y = self.bn1(y) + y = F.relu(y) + y = self.dropout1(y) + y = self.fc2(y) + y = self.bn2(y) + y = F.relu(y) + y = self.dropout2(y) + out1 = self.fc3(y) + out1 = F.softmax(out1) + out2 = self.fc4(y) + out2 = F.sigmoid(out2) + return out1, out2 + + def loss(self, logits, iou, labels, labels_iou, **kwargs): + alpha = 10 + softmax_loss = F.cross_entropy(logits, labels) + labels_iou = labels_iou.astype('float32') + mse_loss = paddle.sum(F.square_error_cost(iou, labels_iou), axis=-1) + sum_loss = softmax_loss + alpha * mse_loss + return sum_loss + + def metric(self, scores, labels): + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + top5 = paddle.metric.accuracy(input=scores, label=labels, k=5) + return top1, top5 diff --git a/docs/src/paddlevideo/modeling/heads/base.py b/docs/src/paddlevideo/modeling/heads/base.py new file mode 100644 index 000000000..99a1408db --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/base.py @@ -0,0 +1,178 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
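+# BaseHead (below) centralizes what every head shares: it builds the loss from its +# config, applies optional label smoothing, handles mix-up style (label_a, label_b, lam) +# targets, and reduces top-1/top-5 accuracy across GPUs during distributed validation.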
+ +import numpy as np +from abc import abstractmethod + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..builder import build_loss +from paddlevideo.utils import get_logger, get_dist_info + +logger = get_logger("paddlevideo") + + +class BaseHead(nn.Layer): + """Base class for head part. + + All head should subclass it. + All subclass should overwrite: + + - Methods: ```init_weights```, initializing weights. + - Methods: ```forward```, forward function. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channels in input feature. + loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss'). + ls_eps (float): label smoothing epsilon. Default: 0. . + + """ + def __init__( + self, + num_classes=None, + in_channels=None, + loss_cfg=dict( + name="CrossEntropyLoss" + ), #TODO(shipping): only pass a name or standard build cfg format. + #multi_class=False, NOTE(shipping): not supported now. + ls_eps=0.): + + super().__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.loss_func = build_loss(loss_cfg) + #self.multi_class = multi_class NOTE(shipping): not supported now + self.ls_eps = ls_eps + + @abstractmethod + def forward(self, x): + """Define how the head is going to run. + """ + raise NotImplemented + + def loss(self, scores, labels, valid_mode=False, if_top5=True, **kwargs): + """Calculate the loss accroding to the model output ```scores```, + and the target ```labels```. + + Args: + scores (paddle.Tensor): The output of the model. + labels (paddle.Tensor): The target output of the model. + + Returns: + losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional). + + """ + if len(labels) == 1: #commonly case + labels = labels[0] + losses = dict() + if self.ls_eps != 0. 
and not valid_mode: # label_smooth + loss = self.label_smooth_loss(scores, labels, **kwargs) + else: + loss = self.loss_func(scores, labels, **kwargs) + if if_top5: + top1, top5 = self.get_acc(scores, labels, valid_mode) + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + else: + top1 = self.get_acc(scores, labels, valid_mode, if_top5) + losses['top1'] = top1 + losses['loss'] = loss + return losses + # MRI目前二分类无top5 + elif len(labels) == 3: # mix_up + labels_a, labels_b, lam = labels + lam = lam[0] # get lam value + losses = dict() + if self.ls_eps != 0: + loss_a = self.label_smooth_loss(scores, labels_a, **kwargs) + loss_b = self.label_smooth_loss(scores, labels_b, **kwargs) + else: + loss_a = self.loss_func(scores, labels_a, **kwargs) + loss_b = self.loss_func(scores, labels_b, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + + if if_top5: + top1a, top5a = self.get_acc(scores, labels_a, valid_mode) + top1b, top5b = self.get_acc(scores, labels_b, valid_mode) + top1 = lam * top1a + (1 - lam) * top1b + top5 = lam * top5a + (1 - lam) * top5b + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + + else: + top1a = self.get_acc(scores, labels_a, valid_mode, if_top5) + top1b = self.get_acc(scores, labels_b, valid_mode, if_top5) + top1 = lam * top1a + (1 - lam) * top1b + losses['top1'] = top1 + losses['loss'] = loss + + return losses + else: + raise NotImplemented + + def label_smooth_loss(self, scores, labels, **kwargs): + """ + Args: + scores (paddle.Tensor): [N, num_classes] + labels (paddle.Tensor): [N, ] + Returns: + paddle.Tensor: [1,] + """ + if paddle.is_compiled_with_custom_device('npu'): + """ + Designed for the lack of temporary operators of NPU, + main idea is to split smooth loss into uniform distribution loss + and hard label calculation + """ + hard_loss = (1.0 - self.ls_eps) * F.cross_entropy(scores, labels) + uniform_loss = (self.ls_eps / self.num_classes) * ( + -F.log_softmax(scores, -1).sum(-1).mean(0)) + loss = hard_loss + uniform_loss + else: + labels = F.one_hot(labels, self.num_classes) + labels = F.label_smooth(labels, epsilon=self.ls_eps) + labels = paddle.squeeze(labels, axis=1) + loss = self.loss_func(scores, labels, soft_label=True, **kwargs) + return loss + + def get_acc(self, scores, labels, valid_mode, if_top5=True): + if if_top5: + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + top5 = paddle.metric.accuracy(input=scores, label=labels, k=5) + _, world_size = get_dist_info() + #NOTE(shipping): deal with multi cards validate + if world_size > 1 and valid_mode: #reduce sum when valid + paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) + top1 = top1 / world_size + paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) + top5 = top5 / world_size + + return top1, top5 + else: + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + _, world_size = get_dist_info() + #NOTE(shipping): deal with multi cards validate + if world_size > 1 and valid_mode: #reduce sum when valid + paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) + top1 = top1 / world_size + + return top1 diff --git a/docs/src/paddlevideo/modeling/heads/bbox_head.py b/docs/src/paddlevideo/modeling/heads/bbox_head.py new file mode 100644 index 000000000..688251ebb --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/bbox_head.py @@ -0,0 +1,225 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from .. import builder + +from ..registry import HEADS + +@HEADS.register() +class BBoxHeadAVA(nn.Layer): + """Simplest RoI head, with only two fc layers for classification and + regression respectively. """ + + def __init__( + self, + temporal_pool_type='avg', + spatial_pool_type='max', + in_channels=2048, + num_classes=81,# The first class is reserved, to classify bbox as pos / neg + dropout_ratio=0, + dropout_before_pool=True, + topk=(3, 5), + multilabel=True): + + super(BBoxHeadAVA, self).__init__() + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + self.temporal_pool_type = temporal_pool_type + self.spatial_pool_type = spatial_pool_type + + self.in_channels = in_channels + self.num_classes = num_classes + + self.dropout_ratio = dropout_ratio + self.dropout_before_pool = dropout_before_pool + + self.multilabel = multilabel + if topk is None: + self.topk = () + elif isinstance(topk, int): + self.topk = (topk, ) + elif isinstance(topk, tuple): + assert all([isinstance(k, int) for k in topk]) + self.topk = topk + else: + raise TypeError('topk should be int or tuple[int], ' + f'but get {type(topk)}') + # Class 0 is ignored when calculaing multilabel accuracy, + # so topk cannot be equal to num_classes + assert all([k < num_classes for k in self.topk]) + assert self.multilabel + + in_channels = self.in_channels + if self.temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3D((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3D((1, None, None)) + if self.spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3D((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3D((None, 1, 1)) + + if dropout_ratio > 0: + self.dropout = nn.Dropout(dropout_ratio) + + weight_attr = paddle.framework.ParamAttr(name="weight", + initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.01)) + bias_attr = paddle.ParamAttr(name="bias", + initializer=paddle.nn.initializer.Constant(value=0.0)) + + self.fc_cls = nn.Linear(in_channels, num_classes, weight_attr=weight_attr, bias_attr=bias_attr) + + self.debug_imgs = None + + def forward(self, x,rois, rois_num): + roi = paddle.concat(rois) + roi_x1 = paddle.index_select(roi, index=paddle.to_tensor(0), axis=1) + roi_x2 = paddle.index_select(roi, index=paddle.to_tensor(2), axis=1) + roi_w = roi_x2 - roi_x1 + roi_y1 = paddle.index_select(roi, index=paddle.to_tensor(1), axis=1) + roi_y2 = paddle.index_select(roi, index=paddle.to_tensor(3), axis=1) + roi_h = roi_y2 - roi_y1 + roi_area = paddle.multiply(roi_w, roi_h) + A = roi_area + A1 = paddle.full(A.shape, 1, dtype='int32') + A2 = paddle.where(A == 0, paddle.zeros_like(A1), A1) + AE = paddle.expand(A2, [A.shape[0], x.shape[1]]) + rois_num = paddle.to_tensor(rois_num, dtype='int32') + if self.dropout_before_pool and self.dropout_ratio > 0 : + x = self.dropout(x) + x = self.temporal_pool(x) + 
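+ # temporal_pool collapsed the time axis above; spatial_pool below collapses H and W, + # leaving one feature vector per RoI before the zero-area mask and fc_cls are applied.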
x = self.spatial_pool(x) + if not self.dropout_before_pool and self.dropout_ratio > 0 : + x = self.dropout(x) + x = paddle.reshape(x, [x.shape[0], -1]) + x = paddle.multiply(x, paddle.cast(AE,"float32")) + cls_score = self.fc_cls(x) + # We do not predict bbox, so return None + return cls_score, None + + def get_targets(self, sampling_results, gt_bboxes, gt_labels, pos_weight): + pos_proposals = [res.pos_bboxes for res in sampling_results] + neg_proposals = [res.neg_bboxes for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_reg_targets = self.bbox_target(pos_proposals, neg_proposals, + pos_gt_labels, pos_weight) + return cls_reg_targets + + def bbox_target(self, pos_bboxes_list, neg_bboxes_list, gt_labels, pos_weight): + """Generate classification targets for bboxes. """ + labels, label_weights = [], [] + pos_weight = 1.0 if pos_weight <= 0 else pos_weight + + assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels) + length = len(pos_bboxes_list) + + for i in range(length): + pos_bboxes = pos_bboxes_list[i] + neg_bboxes = neg_bboxes_list[i] + gt_label = gt_labels[i] + num_pos = pos_bboxes.shape[0] + if neg_bboxes is not None: + num_neg = neg_bboxes.shape[0] + else: + num_neg = 0 + num_samples = num_pos + num_neg + neg_label = paddle.zeros([num_neg, gt_label.shape[1]]) + label = paddle.concat([gt_label,neg_label]) + labels.append(label) + + labels = paddle.concat(labels, 0) + return labels + + def recall_prec(self, pred_vec, target_vec): + correct = paddle.to_tensor(np.logical_and(pred_vec.numpy(), target_vec.numpy())) + correct = paddle.where(correct, + paddle.full(correct.shape,1,dtype='int32'), + paddle.full(correct.shape,0,dtype='int32')) + recall_correct = paddle.cast(paddle.sum(correct, axis=1), 'float32') + target_vec = paddle.where(target_vec, + paddle.full(target_vec.shape,1,dtype='int32'), + paddle.full(target_vec.shape,0,dtype='int32')) + recall_target = paddle.cast(paddle.sum(target_vec, axis=1),'float32') + recall = recall_correct / recall_target + pred_vec = paddle.where(pred_vec, + paddle.full(pred_vec.shape,1,dtype='int32'), + paddle.full(pred_vec.shape,0,dtype='int32')) + prec_target = paddle.cast(paddle.sum(pred_vec, axis=1) + 1e-6, 'float32') + prec = recall_correct / prec_target + recall_mean = paddle.mean(recall) + prec_mean = paddle.mean(prec) + return recall_mean, prec_mean + + def multilabel_accuracy(self, pred, target, thr=0.5): + pred = paddle.nn.functional.sigmoid(pred) + pred_vec = pred > thr + target_vec = target > 0.5 + recall_thr, prec_thr = self.recall_prec(pred_vec, target_vec) + recalls, precs = [], [] + for k in self.topk: + _, pred_label = paddle.topk(pred, k, 1, True, True) + pred_vec = paddle.full(pred.shape,0,dtype='bool') + num_sample = pred.shape[0] + for i in range(num_sample): + pred_vec[i, pred_label[i].numpy()] = 1 + recall_k, prec_k = self.recall_prec(pred_vec, target_vec) + recalls.append(recall_k) + precs.append(prec_k) + return recall_thr, prec_thr, recalls, precs + + def loss(self, + cls_score, + labels): + losses = dict() + if cls_score is not None: + # Only use the cls_score + labels = labels[:, 1:] + pos_inds_bool = paddle.sum(labels, axis=-1) > 0 + pos_inds = paddle.where(paddle.sum(labels, axis=-1) > 0, + paddle.full([labels.shape[0]],1,dtype='int32'), + paddle.full([labels.shape[0]],0,dtype='int32')) + pos_inds = paddle.nonzero(pos_inds, as_tuple=False) + cls_score = paddle.index_select(cls_score, pos_inds, axis=0) + cls_score = cls_score[:, 1:] + labels = 
paddle.index_select(labels, pos_inds, axis=0) + bce_loss = F.binary_cross_entropy_with_logits + loss = bce_loss(cls_score, labels, reduction='none') + losses['loss'] = paddle.mean(loss) + recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy( + cls_score, labels, thr=0.5) + losses['recall@thr=0.5'] = recall_thr + losses['prec@thr=0.5'] = prec_thr + for i, k in enumerate(self.topk): + losses[f'recall@top{k}'] = recall_k[i] + losses[f'prec@top{k}'] = prec_k[i] + return losses + + def get_det_bboxes(self, + rois, + cls_score, + img_shape, + flip=False, + crop_quadruple=None, + cfg=None): + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + assert self.multilabel + m = paddle.nn.Sigmoid() + scores = m(cls_score) + bboxes = rois + return bboxes, scores diff --git a/docs/src/paddlevideo/modeling/heads/cfbi_head.py b/docs/src/paddlevideo/modeling/heads/cfbi_head.py new file mode 100644 index 000000000..f7cbd910e --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/cfbi_head.py @@ -0,0 +1,448 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +class IA_gate(nn.Layer): + def __init__(self, in_dim, out_dim): + super(IA_gate, self).__init__() + self.IA = nn.Linear(in_dim, out_dim) + + def forward(self, x, IA_head): + a = self.IA(IA_head) + a = 1. + paddle.tanh(a) + a = paddle.unsqueeze(paddle.unsqueeze(a, axis=-1), axis=-1) + x = a * x + return x + + +class GCT(nn.Layer): + def __init__(self, num_channels, epsilon=1e-5, mode='l2', after_relu=False): + super(GCT, self).__init__() + x1 = paddle.zeros([1, num_channels, 1, 1]) + x2 = paddle.ones([1, num_channels, 1, 1]) + self.alpha = paddle.create_parameter( + shape=x2.shape, + dtype=x2.dtype, + default_initializer=nn.initializer.Assign(x2)) + self.alpha.stop_gradient = False + self.gamma = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.gamma.stop_gradient = False + self.beta = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.beta.stop_gradient = False + + self.epsilon = epsilon + self.mode = mode + self.after_relu = after_relu + + def forward(self, x): + + if self.mode == 'l2': + embedding = paddle.pow( + paddle.sum(paddle.pow(x, 2), axis=[2, 3], keepdim=True) + + self.epsilon, 0.5) * self.alpha + norm = self.gamma / paddle.pow( + (paddle.mean(paddle.pow(embedding, 2), axis=1, keepdim=True) + + self.epsilon), 0.5) + elif self.mode == 'l1': + if not self.after_relu: + _x = paddle.abs(x) + else: + _x = x + embedding = paddle.sum(_x, axis=(2, 3), keepdim=True) * self.alpha + norm = self.gamma / (paddle.mean( + paddle.abs(embedding), axis=1, keepdim=True) + self.epsilon) + else: + print('Unknown mode!') + exit() + + gate = 1. 
+ paddle.tanh(embedding * norm + self.beta) + + return x * gate + + +class Bottleneck(nn.Layer): + def __init__(self, inplanes, outplanes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = 4 + planes = int(outplanes / expansion) + + self.GCT1 = GCT(inplanes) + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=planes) + + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = nn.GroupNorm(num_groups=32, num_channels=planes) + + self.conv3 = nn.Conv2D(planes, + planes * expansion, + kernel_size=1, + bias_attr=False) + self.bn3 = nn.GroupNorm(num_groups=32, num_channels=planes * expansion) + self.relu = nn.ReLU() + if stride != 1 or inplanes != planes * expansion: + downsample = nn.Sequential( + nn.Conv2D(inplanes, + planes * expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.GroupNorm(num_groups=32, num_channels=planes * expansion), + ) + else: + downsample = None + self.downsample = downsample + + self.stride = stride + self.dilation = dilation + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + + def forward(self, x): + residual = x + + out = self.GCT1(x) + out = self.conv1(out) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation): + super(_ASPPModule, self).__init__() + self.GCT = GCT(inplanes) + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = nn.GroupNorm(num_groups=int(planes / 4), num_channels=planes) + self.relu = nn.ReLU() + + self._init_weight() + + def forward(self, x): + x = self.GCT(x) + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class ASPP(nn.Layer): + def __init__(self): + super(ASPP, self).__init__() + + inplanes = 512 + dilations = [1, 6, 12, 18] + + self.aspp1 = _ASPPModule(inplanes, + 128, + 1, + padding=0, + dilation=dilations[0]) + self.aspp2 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[1], + dilation=dilations[1]) + self.aspp3 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[2], + dilation=dilations[2]) + self.aspp4 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[3], + dilation=dilations[3]) + + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 128, 1, stride=1, bias_attr=False), nn.ReLU()) + + self.GCT = GCT(640) + self.conv1 = nn.Conv2D(640, 256, 1, bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=256) + self.relu = nn.ReLU() + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat([x1, x2, x3, x4, x5], 
axis=1) + + x = self.GCT(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +@HEADS.register() +class CollaborativeEnsemblerMS(nn.Layer): + def __init__( + self, + model_semantic_embedding_dim=256, + model_multi_local_distance=[[4, 8, 12, 16, 20, 24], + [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]], + model_head_embedding_dim=256, + model_refine_channels=64, + model_low_level_inplanes=256, + ): + super(CollaborativeEnsemblerMS, self).__init__() + in_dim_4x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[0]) + in_dim_8x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[1]) + in_dim_16x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[2]) + attention_dim = model_semantic_embedding_dim * 4 + embed_dim = model_head_embedding_dim + refine_dim = model_refine_channels + low_level_dim = model_low_level_inplanes + + IA_in_dim = attention_dim + + self.relu = nn.ReLU() + + # stage 1 + + self.S1_IA1 = IA_gate(IA_in_dim, in_dim_4x) + self.S1_layer1 = Bottleneck(in_dim_4x, embed_dim) + + self.S1_IA2 = IA_gate(IA_in_dim, embed_dim) + self.S1_layer2 = Bottleneck(embed_dim, embed_dim, 1, 2) + + # stage2 + self.S2_IA1 = IA_gate(IA_in_dim, embed_dim) + self.S2_layer1 = Bottleneck(embed_dim, embed_dim * 2, 2) + + self.S2_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_8x) + self.S2_layer2 = Bottleneck(embed_dim * 2 + in_dim_8x, embed_dim * 2, 1, + 2) + + self.S2_IA3 = IA_gate(IA_in_dim, embed_dim * 2) + self.S2_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4) + + # stage3 + self.S3_IA1 = IA_gate(IA_in_dim, embed_dim * 2) + self.S3_layer1 = Bottleneck(embed_dim * 2, embed_dim * 2, 2) + + self.S3_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_16x) + self.S3_layer2 = Bottleneck(embed_dim * 2 + in_dim_16x, embed_dim * 2, + 1, 2) + + self.S3_IA3 = IA_gate(IA_in_dim, embed_dim * 2) + self.S3_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4) + + self.ASPP_IA = IA_gate(IA_in_dim, embed_dim * 2) + self.ASPP = ASPP() + + # Decoder + self.GCT_sc = GCT(low_level_dim + embed_dim) + self.conv_sc = nn.Conv2D(low_level_dim + embed_dim, + refine_dim, + 1, + bias_attr=False) + self.bn_sc = nn.GroupNorm(num_groups=int(refine_dim / 4), + num_channels=refine_dim) + self.relu = nn.ReLU() + + self.IA10 = IA_gate(IA_in_dim, embed_dim + refine_dim) + self.conv1 = nn.Conv2D(embed_dim + refine_dim, + int(embed_dim / 2), + kernel_size=3, + padding=1, + bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2)) + + self.IA11 = IA_gate(IA_in_dim, int(embed_dim / 2)) + self.conv2 = nn.Conv2D(int(embed_dim / 2), + int(embed_dim / 2), + kernel_size=3, + padding=1, + bias_attr=False) + self.bn2 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2)) + + # Output + self.IA_final_fg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1) + self.IA_final_bg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1) + + self.conv_sc.weight.data = nn.initializer.KaimingNormal() + self.conv1.weight.data = nn.initializer.KaimingNormal() + self.conv2.weight.data = nn.initializer.KaimingNormal() + + def forward(self, all_x, all_IA_head=None, low_level_feat=None): + x_4x, x_8x, x_16x = all_x + IA_head = all_IA_head[0] + + # stage 1 + x = self.S1_IA1(x_4x, IA_head) + x = 
self.S1_layer1(x) + + x = self.S1_IA2(x, IA_head) + x = self.S1_layer2(x) + + low_level_feat = paddle.concat( + [paddle.expand(low_level_feat, [x.shape[0], -1, -1, -1]), x], + axis=1) + + # stage 2 + x = self.S2_IA1(x, IA_head) + x = self.S2_layer1(x) + + x = paddle.concat([x, x_8x], axis=1) + x = self.S2_IA2(x, IA_head) + x = self.S2_layer2(x) + + x = self.S2_IA3(x, IA_head) + x = self.S2_layer3(x) + + # stage 3 + x = self.S3_IA1(x, IA_head) + x = self.S3_layer1(x) + + x = paddle.concat([x, x_16x], axis=1) + x = self.S3_IA2(x, IA_head) + x = self.S3_layer2(x) + + x = self.S3_IA3(x, IA_head) + x = self.S3_layer3(x) + + # ASPP + Decoder + x = self.ASPP_IA(x, IA_head) + x = self.ASPP(x) + + x = self.decoder(x, low_level_feat, IA_head) + + fg_logit = self.IA_logit(x, IA_head, self.IA_final_fg) + bg_logit = self.IA_logit(x, IA_head, self.IA_final_bg) + + pred = self.augment_background_logit(fg_logit, bg_logit) + + return pred + + def IA_logit(self, x, IA_head, IA_final): + n, c, h, w = x.shape + x = paddle.reshape(x, [1, n * c, h, w]) + IA_output = IA_final(IA_head) + IA_weight = IA_output[:, :c] + IA_bias = IA_output[:, -1] + IA_weight = paddle.reshape(IA_weight, [n, c, 1, 1]) + + IA_bias = paddle.reshape(IA_bias, [-1]) + logit = paddle.reshape( + F.conv2d(x, weight=IA_weight, bias=IA_bias, groups=n), [n, 1, h, w]) + return logit + + def decoder(self, x, low_level_feat, IA_head): + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bicubic', + align_corners=True) + + low_level_feat = self.GCT_sc(low_level_feat) + low_level_feat = self.conv_sc(low_level_feat) + low_level_feat = self.bn_sc(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = paddle.concat([x, low_level_feat], axis=1) + x = self.IA10(x, IA_head) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.IA11(x, IA_head) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + return x + + def augment_background_logit(self, fg_logit, bg_logit): + # We augment the logit of absolute background by using the relative background logit of all the + # foreground objects. + obj_num = fg_logit.shape[0] + pred = fg_logit + if obj_num > 1: + bg_logit = bg_logit[1:obj_num, :, :, :] + aug_bg_logit = paddle.min(bg_logit, axis=0, keepdim=True) + pad = paddle.expand(paddle.zeros(aug_bg_logit.shape), + [obj_num - 1, -1, -1, -1]) + aug_bg_logit = paddle.concat([aug_bg_logit, pad], axis=0) + pred = pred + aug_bg_logit + pred = paddle.transpose(pred, [1, 0, 2, 3]) + return pred diff --git a/docs/src/paddlevideo/modeling/heads/ctrgcn_head.py b/docs/src/paddlevideo/modeling/heads/ctrgcn_head.py new file mode 100644 index 000000000..c551d0d3e --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/ctrgcn_head.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
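The CollaborativeEnsemblerMS head above conditions every stage on an instance-level vector (IA_head) through the small IA_gate modules: a linear projection is turned into per-channel gates a = 1 + tanh(Wh) that rescale the feature map. A minimal stand-alone sketch of that gating step, with illustrative tensor sizes only (three objects, 1024-dim instance vectors, 256 channels), not values from a CFBI config:

import paddle
import paddle.nn as nn

# Toy reproduction of the IA_gate pattern used throughout the CFBI head above.
gate = nn.Linear(1024, 256)            # IA_in_dim -> number of channels being gated
ia_head = paddle.randn([3, 1024])      # one instance vector per foreground object
feat = paddle.randn([3, 256, 30, 30])  # per-object feature maps

a = 1.0 + paddle.tanh(gate(ia_head))   # per-channel gates in (0, 2)
a = a.unsqueeze(-1).unsqueeze(-1)      # [3, 256, 1, 1], broadcastable over H and W
gated = a * feat                       # channels amplified or suppressed per object
print(gated.shape)                     # [3, 256, 30, 30]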
+ +import math +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class CTRGCNHead(BaseHead): + """ + Head for CTR-GCN model. + Args: + in_channels: int, input feature channels. Default: 64. + num_classes: int, output the number of classes. + drop_out: float, dropout ratio of layer. Default: 0. + """ + + def __init__(self, in_channels=64, num_classes=10, drop_out=0, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.in_channels = in_channels + self.drop_out = drop_out + + self.fc = nn.Linear(self.in_channels * 4, self.num_classes) + if drop_out: + self.drop_out = nn.Dropout(self.drop_out) + else: + self.drop_out = lambda x: x + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer.weight, + 'Normal', + mean=0.0, + std=math.sqrt(2. / self.num_classes)) + + def forward(self, output_patch): + """Define how the head is going to run. + """ + x, N, M = output_patch + # N*M,C,T,V + _, c_new, T, V = x.shape + x = paddle.reshape(x, shape=[N, M, c_new, T * V]) + x = x.mean(3).mean(1) + x = self.drop_out(x) + + return self.fc(x) diff --git a/docs/src/paddlevideo/modeling/heads/i3d_head.py b/docs/src/paddlevideo/modeling/heads/i3d_head.py new file mode 100644 index 000000000..269c8184e --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/i3d_head.py @@ -0,0 +1,95 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle import ParamAttr + +from ..registry import HEADS +from ..weight_init import weight_init_ +from .base import BaseHead + + +@HEADS.register() +class I3DHead(BaseHead): + """Classification head for I3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Default: dict(name='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + drop_ratio (float): Probability of dropout layer. Default: 0.5. + std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + spatial_type='avg', + drop_ratio=0.5, + std=0.01, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + + self.spatial_type = spatial_type + self.drop_ratio = drop_ratio + self.stdv = std + if self.drop_ratio != 0: + self.dropout = nn.Dropout(p=self.drop_ratio) + else: + self.dropout = None + self.fc = nn.Linear( + self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=10.0), + bias_attr=ParamAttr(learning_rate=10.0), + ) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels. 
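+ # The (1, 1, 1) output size makes the head independent of the input clip length and spatial size.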
+ self.avg_pool = nn.AdaptiveAvgPool3D((1, 1, 1)) + else: + self.avg_pool = None + + def init_weights(self): + """Initiate the parameters from scratch.""" + weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The classification scores for input samples. + """ + # [N, in_channels, 4, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N, in_channels, 1, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1, 1] + N = paddle.shape(x)[0] + x = x.reshape([N, -1]) + # [N, in_channels] + cls_score = self.fc(x) + # [N, num_classes] + return cls_score diff --git a/docs/src/paddlevideo/modeling/heads/movinet_head.py b/docs/src/paddlevideo/modeling/heads/movinet_head.py new file mode 100644 index 000000000..924b01489 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/movinet_head.py @@ -0,0 +1,15 @@ +import collections.abc + +container_abcs = collections.abc +from ..registry import HEADS +from .base import BaseHead +from ..builder import build_loss + + +@HEADS.register() +class MoViNetHead(BaseHead): + def __init__(self): + super().__init__() + + def forward(self, x, *args): + return x diff --git a/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py b/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py new file mode 100644 index 000000000..e0f435f2a --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/ms_tcn_head.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
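The I3DHead defined above reduces a 5-D backbone feature to class scores with a global average pool followed by a single FC layer. A rough stand-in for that forward pass, using illustrative sizes (2048 channels, 400 classes) rather than values from any particular config:

import paddle
import paddle.nn as nn

feat = paddle.randn([2, 2048, 4, 7, 7])          # [N, in_channels, T, H, W] from a 3D backbone
pooled = nn.AdaptiveAvgPool3D((1, 1, 1))(feat)   # [2, 2048, 1, 1, 1]
fc = nn.Linear(2048, 400)                        # in_channels -> num_classes
scores = fc(paddle.reshape(pooled, [2, -1]))     # [2, 400]; dropout omitted for brevity
print(scores.shape)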
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddle import ParamAttr + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class MSTCNHead(BaseHead): + + def __init__(self, num_classes, in_channels): + super().__init__(num_classes, in_channels) + self.ce = nn.CrossEntropyLoss(ignore_index=-100) + self.mse = nn.MSELoss(reduction='none') + self.num_classes = num_classes + + # cls score + self.overlap = 0.5 + + def forward(self, x): + """MS-TCN no head + """ + return x + + def loss(self, output, video_gt): + """calculate loss + """ + output_transpose = paddle.transpose(output, [2, 0, 1]) + ce_x = paddle.reshape(output_transpose, + (output_transpose.shape[0] * + output_transpose.shape[1], self.num_classes)) + ce_y = video_gt[0, :] + ce_loss = self.ce(ce_x, ce_y) + loss = ce_loss + + mse = self.mse(F.log_softmax(output[:, :, 1:], axis=1), + F.log_softmax(output.detach()[:, :, :-1], axis=1)) + mse = paddle.clip(mse, min=0, max=16) + mse_loss = 0.15 * paddle.mean(mse) + loss += mse_loss + + return loss + + def get_F1_score(self, predicted, groundTruth): + recog_content = list(predicted.numpy()) + gt_content = list(groundTruth[0].numpy()) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + + edit_num = self.edit_score(recog_content, gt_content) + edit += edit_num + + tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap) + + # cls metric + + precision = tp / float(tp + fp) + recall = tp / float(fp + fn) + + if precision + recall > 0.0: + f1 = 2.0 * (precision * recall) / (precision + recall) + else: + f1 = 0.0 + f1 = np.nan_to_num(f1) + return f1 + + def get_labels_start_end_time(self, frame_wise_labels): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + labels.append(frame_wise_labels[i]) + starts.append(i) + ends.append(i) + last_label = frame_wise_labels[i] + ends.append(i + 1) + return labels, starts, ends + + def levenstein(self, p, y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], np.float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + def edit_score(self, recognized, ground_truth, norm=True): + P, _, _ = self.get_labels_start_end_time(recognized) + Y, _, _ = self.get_labels_start_end_time(ground_truth) + return self.levenstein(P, Y, norm) + + def f_score(self, recognized, ground_truth, overlap): + p_label, p_start, p_end = self.get_labels_start_end_time(recognized) + y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best 
scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) diff --git a/docs/src/paddlevideo/modeling/heads/pptimesformer_head.py b/docs/src/paddlevideo/modeling/heads/pptimesformer_head.py new file mode 100644 index 000000000..113bde8b5 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/pptimesformer_head.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead +from paddle import ParamAttr +from paddle.regularizer import L2Decay + + +@HEADS.register() +class ppTimeSformerHead(BaseHead): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.std = std + self.fc = Linear(self.in_channels, + self.num_classes, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/docs/src/paddlevideo/modeling/heads/pptsm_head.py b/docs/src/paddlevideo/modeling/heads/pptsm_head.py new file mode 100644 index 000000000..45f50fd13 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/pptsm_head.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
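The segmentation metrics in MSTCNHead above (edit score and segmental F1) operate on frame-wise label sequences that are first collapsed into segments. A self-contained illustration of the F1@0.5 counting logic on toy sequences, mirroring f_score and get_labels_start_end_time without needing to build the head:

import numpy as np

def segments(frames):
    # Collapse frame-wise labels into (label, start, end) runs, like get_labels_start_end_time.
    labels, starts, ends = [frames[0]], [0], []
    for i in range(1, len(frames)):
        if frames[i] != frames[i - 1]:
            labels.append(frames[i])
            starts.append(i)
            ends.append(i)
    ends.append(len(frames))
    return labels, starts, ends

pred = [0, 0, 1, 1, 1, 2, 2, 2]   # toy predicted frame labels
gt   = [0, 0, 0, 1, 1, 2, 2, 2]   # toy ground-truth frame labels

p_lab, p_s, p_e = segments(pred)
y_lab, y_s, y_e = segments(gt)
tp, fp = 0, 0
hits = np.zeros(len(y_lab))
for j in range(len(p_lab)):
    inter = np.minimum(p_e[j], y_e) - np.maximum(p_s[j], y_s)
    union = np.maximum(p_e[j], y_e) - np.minimum(p_s[j], y_s)
    iou = (inter / union) * np.array([p_lab[j] == lab for lab in y_lab])
    idx = iou.argmax()
    if iou[idx] >= 0.5 and not hits[idx]:
        tp += 1
        hits[idx] = 1
    else:
        fp += 1
fn = len(y_lab) - hits.sum()
print(tp, fp, fn)   # all three segments match with IoU >= 0.5, so 3, 0, 0.0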
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +from paddle.nn import Linear +from paddle.regularizer import L2Decay +from .tsn_head import TSNHead +from ..registry import HEADS + +from ..weight_init import weight_init_ + + +@HEADS.register() +class ppTSMHead(TSNHead): + """ ppTSM Head + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.8. + std(float): Std(Scale) value in normal initilizar. Default: 0.001. + kwargs (dict, optional): Any keyword argument to initialize. + """ + def __init__( + self, + num_classes, + in_channels, # NOTE: 2048 for >= R50, 512 for <= R34 + drop_ratio=0.8, + std=0.01, + data_format="NCHW", + num_seg=8, + **kwargs): + + super().__init__(num_classes, + in_channels, + drop_ratio=drop_ratio, + std=std, + data_format=data_format, + **kwargs) + + self.fc = Linear(self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=5.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0, + regularizer=L2Decay(0.0))) + self.stdv = std + self.num_seg = num_seg + + def init_weights(self): + """Initiate the FC layer parameters""" + weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv) + + def forward(self, x, num_seg=None): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + #XXX: check dropout location! + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_seg, in_channels, 1, 1] + num_seg = num_seg if num_seg is not None else self.num_seg + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #x = F.softmax(x) #NOTE remove + return score diff --git a/docs/src/paddlevideo/modeling/heads/pptsn_head.py b/docs/src/paddlevideo/modeling/heads/pptsn_head.py new file mode 100644 index 000000000..2655c903c --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/pptsn_head.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout +from paddle.regularizer import L2Decay +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class ppTSNHead(BaseHead): + """ppTSN Head. 
+ + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.4. + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + data_format(str): data format of input tensor in ['NCHW', 'NHWC']. Default: 'NCHW'. + fclr5(bool): Whether to increase the learning rate of the fully connected layer. Default: True + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + drop_ratio=0.4, + std=0.01, + data_format="NCHW", + fclr5=True, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.drop_ratio = drop_ratio + self.std = std + + # NOTE: global pool performance + self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format) + + if self.drop_ratio != 0: + self.dropout = Dropout(p=self.drop_ratio) + else: + self.dropout = None + self.fc = Linear( + self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=5.0 if fclr5 else 1.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0 if fclr5 else 1.0, + regularizer=L2Decay(0.0))) + + def init_weights(self): + """Initiate the FC layer parameters""" + weight_init_(self.fc, + 'Normal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0., + std=self.std) + + def forward(self, x, num_seg=8): + """Define how the head is going to run. + + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + # XXX: check dropout location! + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/docs/src/paddlevideo/modeling/heads/roi_extractor.py b/docs/src/paddlevideo/modeling/heads/roi_extractor.py new file mode 100644 index 000000000..3aaef23c1 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/roi_extractor.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
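ppTSMHead and ppTSNHead above share the same segment-consensus step: per-frame pooled features are reshaped to [N, num_seg, C] and averaged over the segment axis before the FC classifier. A minimal sketch of that consensus alone, with illustrative sizes (batch of 4 videos, 8 segments, 2048 channels):

import paddle

num_seg, channels = 8, 2048
frame_feat = paddle.randn([4 * num_seg, channels, 1, 1])   # per-frame output of avgpool2d

x = paddle.reshape(frame_feat, [-1, num_seg, channels])    # [N, num_seg, C]
video_feat = paddle.mean(x, axis=1)                        # [N, C], average consensus over segments
print(video_feat.shape)                                    # [4, 2048]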
+ +import paddle + +#@register +class RoIAlign(object): + + def __init__(self, + resolution=14, + spatial_scale=0.0625, + sampling_ratio=0, + aligned=False): + super(RoIAlign, self).__init__() + self.resolution = resolution + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + self.aligned = aligned + + def __call__(self, feats, roi, rois_num): + roi = paddle.concat(roi) if len(roi) > 1 else roi[0] + rois_num = paddle.to_tensor(rois_num, dtype='int32') + rois_num = paddle.cast(rois_num, dtype='int32') + if len(feats) == 1: + roi_feat = paddle.vision.ops.roi_align(feats, + roi, + rois_num, + self.resolution, + self.spatial_scale, + self.sampling_ratio, + self.aligned) + else: + rois_feat_list = [] + roi_feat = paddle.vision.ops.roi_align(feats, + roi, + rois_num, + self.resolution, + self.spatial_scale, + self.sampling_ratio, + self.aligned) + + return roi_feat diff --git a/docs/src/paddlevideo/modeling/heads/roi_head.py b/docs/src/paddlevideo/modeling/heads/roi_head.py new file mode 100644 index 000000000..be34a33ef --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/roi_head.py @@ -0,0 +1,177 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +from .. import builder +from ..registry import HEADS + + +def bbox2result(bboxes, labels, num_classes, img_shape, thr=0.01): + """Convert detection results to a list of numpy arrays. """ + if len(bboxes) == 0: + return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32)) + else: + bboxes = bboxes[0] + labels = labels + img_shape_np = img_shape + img_h, img_w = img_shape_np[0][0], img_shape_np[0][1] + + img_w = paddle.cast(img_w, dtype='int32') + img_h = paddle.cast(img_h, dtype='int32') + + bboxes[:, 0::2] /= img_w + bboxes[:, 1::2] /= img_h + + # We only handle multilabel now + assert labels.shape[-1] > 1 + + scores = labels # rename + thr = (thr, ) * num_classes if isinstance(thr, float) else thr + assert scores.shape[1] == num_classes + assert len(thr) == num_classes + + result = [] + for i in range(num_classes - 1): + #step1. 对该类, 每个bbox的得分是否大于阈值 + where = scores[:, i + 1] > thr[i + 1] + + where = paddle.nonzero(where) # index + bboxes_select = paddle.index_select(x=bboxes, index=where) + bboxes_select = bboxes_select[:, :4] + + scores_select = paddle.index_select(x=scores, index=where) + scores_select = scores_select[:, i + 1:i + 2] + + result.append( + #对于step1中得分大于阈值的bbox(可能为空), 将bbox及在该类的score放入result列表. 
+ paddle.concat((bboxes_select, scores_select), axis=1).numpy()) + + return result + + +@HEADS.register() +class AVARoIHead(nn.Layer): + + def __init__(self, + assigner, + sampler, + pos_weight=1.0, + action_thr=0.0, + bbox_roi_extractor=None, + bbox_head=None, + train_cfg=None, + test_cfg=None): + super().__init__() + self.assigner = assigner + self.sampler = sampler + self.pos_weight = pos_weight + self.action_thr = action_thr + self.init_assigner_sampler() + if bbox_head is not None: + self.init_bbox_head(bbox_roi_extractor, bbox_head) + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + self.bbox_assigner = builder.build_assigner(self.assigner) + self.bbox_sampler = builder.build_sampler(self.sampler, context=self) + + def init_bbox_head(self, bbox_roi_extractor, bbox_head): + """Initialize ``bbox_head``""" + self.bbox_roi_extractor = builder.build_roi_extractor( + bbox_roi_extractor) + self.bbox_head = builder.build_head(bbox_head) + + def _bbox_forward(self, x, rois, rois_num): + bbox_feat = self.bbox_roi_extractor(x, rois, rois_num) + cls_score, bbox_pred = self.bbox_head( + bbox_feat, rois, rois_num + ) #deal with: when roi's width or height = 0 , roi_align is wrong + bbox_results = dict(cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_feat) + return bbox_results + + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels): + """Run forward function and calculate loss for box head in training.""" + rois = [res.bboxes for res in sampling_results] + rois_num = [res.bboxes.shape[0] for res in sampling_results] + bbox_results = self._bbox_forward(x, rois, rois_num) + bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes, + gt_labels, self.pos_weight) + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_targets) + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def train_step(self, x, img_metas, proposal_list, gt_bboxes, gt_labels): + #1. assign gts and sample proposals + num_imgs = len(img_metas[0]) + sampling_results = [] + for i in range(num_imgs): + assign_result = self.bbox_assigner.assign(proposal_list[i], + gt_bboxes[i], + gt_labels[i]) + sampling_result = self.bbox_sampler.sample(assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i]) + sampling_results.append(sampling_result) + + #2. 
forward and loss + bbox_results = self._bbox_forward_train(x, sampling_results, gt_bboxes, + gt_labels) + losses = dict() + losses.update(bbox_results['loss_bbox']) + + return losses + + def simple_test(self, x, proposal_list, img_shape, rescale=False): + x_shape = x[0].shape + #assert x_shape[0] == 1, 'only accept 1 sample at test mode' + + det_bboxes, det_labels = self.simple_test_bboxes(x, + img_shape, + proposal_list, + self.action_thr, + rescale=rescale) + + bbox_results = bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes, img_shape, + self.action_thr) + return [bbox_results] + + def simple_test_bboxes(self, + x, + img_shape, + proposals, + action_thr, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = [proposals] + rois_num = [rois[0].shape[0]] + bbox_results = self._bbox_forward(x, rois, rois_num) + cls_score = bbox_results['cls_score'] + crop_quadruple = np.array([0, 0, 1, 1]) + flip = False + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + img_shape, + flip=flip, + crop_quadruple=crop_quadruple) + + return det_bboxes, det_labels diff --git a/docs/src/paddlevideo/modeling/heads/single_straight3d.py b/docs/src/paddlevideo/modeling/heads/single_straight3d.py new file mode 100644 index 000000000..805d93ebf --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/single_straight3d.py @@ -0,0 +1,79 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +import numpy as np +from ..registry import ROI_EXTRACTORS +from .roi_extractor import RoIAlign + + +@ROI_EXTRACTORS.register() +class SingleRoIExtractor3D(nn.Layer): + """Extract RoI features from a single level feature map. """ + def __init__(self, + roi_layer_type='RoIAlign', + featmap_stride=16, + output_size=16, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + with_temporal_pool=True, + with_global=False): + super().__init__() + self.roi_layer_type = roi_layer_type + assert self.roi_layer_type in ['RoIPool', 'RoIAlign'] + self.featmap_stride = featmap_stride + self.spatial_scale = 1. 
/ self.featmap_stride + self.output_size = output_size + self.sampling_ratio = sampling_ratio + self.pool_mode = pool_mode + self.aligned = aligned + self.with_temporal_pool = with_temporal_pool + self.with_global = with_global + + self.roi_layer = RoIAlign(resolution=self.output_size, + spatial_scale=self.spatial_scale, + sampling_ratio=self.sampling_ratio, + aligned=self.aligned) + + def init_weights(self): + pass + + # The shape of feat is N, C, T, H, W + def forward(self, feat, rois, rois_num): + if len(feat) >= 2: + assert self.with_temporal_pool + if self.with_temporal_pool: + xi = 0 + for x in feat: + xi = xi + 1 + y = paddle.mean(x, 2, keepdim=True) + feat = [paddle.mean(x, 2, keepdim=True) for x in feat] + feat = paddle.concat(feat, axis=1) # merge slow and fast + roi_feats = [] + for t in range(feat.shape[2]): + if type(t) == paddle.static.Variable: + index = paddle.to_tensor(t) + else: + data_index = np.array([t]).astype('int32') + index = paddle.to_tensor(data_index) + + frame_feat = paddle.index_select(feat, index, axis=2) + frame_feat = paddle.squeeze(frame_feat, + axis=2) #axis=2,避免N=1时, 第一维度被删除. + roi_feat = self.roi_layer(frame_feat, rois, rois_num) + roi_feats.append(roi_feat) + + ret = paddle.stack(roi_feats, axis=2) + return ret diff --git a/docs/src/paddlevideo/modeling/heads/slowfast_head.py b/docs/src/paddlevideo/modeling/heads/slowfast_head.py new file mode 100644 index 000000000..bd18bafda --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/slowfast_head.py @@ -0,0 +1,137 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..registry import HEADS +from .base import BaseHead + +import paddle +import paddle.nn.functional as F + +from ..weight_init import weight_init_ + + +@HEADS.register() +class SlowFastHead(BaseHead): + """ + ResNe(X)t 3D head. + This layer performs a fully-connected projection during training, when the + input size is 1x1x1. It performs a convolutional projection during testing + when the input size is larger than 1x1x1. If the inputs are from multiple + different pathways, the inputs will be concatenated after pooling. + """ + def __init__(self, + width_per_group, + alpha, + beta, + num_classes, + num_frames, + crop_size, + dropout_rate, + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + loss_cfg=dict(name='CrossEntropyLoss'), + multigrid_short=False, + **kwargs): + """ + ResNetBasicHead takes p pathways as input where p in [1, infty]. + + Args: + dim_in (list): the list of channel dimensions of the p inputs to the + ResNetHead. + num_classes (int): the channel dimensions of the p outputs to the + ResNetHead. + pool_size (list): the list of kernel sizes of p spatial temporal + poolings, temporal pool kernel size, spatial pool kernel size, + spatial pool kernel size in order. + dropout_rate (float): dropout rate. If equal to 0.0, perform no + dropout. 
+ """ + super().__init__(num_classes, loss_cfg, **kwargs) + self.multigrid_short = multigrid_short + self.width_per_group = width_per_group + self.alpha = alpha + self.beta = beta + self.num_classes = num_classes + self.num_frames = num_frames + self.crop_size = crop_size + self.dropout_rate = dropout_rate + self.pool_size_ratio = pool_size_ratio + + self.dim_in = [ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ] + self.pool_size = [None, None] if self.multigrid_short else [ + [ + self.num_frames // self.alpha // self.pool_size_ratio[0][0], + self.crop_size // 32 // self.pool_size_ratio[0][1], + self.crop_size // 32 // self.pool_size_ratio[0][2], + ], + [ + self.num_frames // self.pool_size_ratio[1][0], + self.crop_size // 32 // self.pool_size_ratio[1][1], + self.crop_size // 32 // self.pool_size_ratio[1][2], + ], + ] + + assert (len({len(self.pool_size), len(self.dim_in) + }) == 1), "pathway dimensions are not consistent." + self.num_pathways = len(self.pool_size) + + self.dropout = paddle.nn.Dropout(p=self.dropout_rate) + + self.projection = paddle.nn.Linear( + in_features=sum(self.dim_in), + out_features=self.num_classes, + ) + + def init_weights(self): + weight_init_(self.projection, + "Normal", + bias_value=0.0, + mean=0.0, + std=0.01) + + def forward(self, inputs): + assert (len(inputs) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + pool_out = [] + for pathway in range(self.num_pathways): + if self.pool_size[pathway] is None: + tmp_out = F.adaptive_avg_pool3d(x=inputs[pathway], + output_size=(1, 1, 1), + data_format="NCDHW") + else: + tmp_out = F.avg_pool3d(x=inputs[pathway], + kernel_size=self.pool_size[pathway], + stride=1, + data_format="NCDHW") + pool_out.append(tmp_out) + + x = paddle.concat(x=pool_out, axis=1) + x = paddle.transpose(x=x, perm=(0, 2, 3, 4, 1)) + + # Perform dropout. + if self.dropout_rate > 0.0: + x = self.dropout(x) + + x = self.projection(x) + + # Performs fully convlutional inference. + if not self.training: # attr of base class + x = F.softmax(x, axis=4) + x = paddle.mean(x, axis=[1, 2, 3]) + + x = paddle.reshape(x, shape=(x.shape[0], -1)) + return x diff --git a/docs/src/paddlevideo/modeling/heads/stgcn_head.py b/docs/src/paddlevideo/modeling/heads/stgcn_head.py new file mode 100644 index 000000000..fc80d6633 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/stgcn_head.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class STGCNHead(BaseHead): + """ + Head for ST-GCN model. + Args: + in_channels: int, input feature channels. Default: 256. + num_classes: int, number classes. Default: 10. 
+ """ + def __init__(self, in_channels=256, num_classes=10, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.fcn = nn.Conv2D(in_channels=in_channels, + out_channels=num_classes, + kernel_size=1) + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'Normal', std=0.02) + + def forward(self, x): + """Define how the head is going to run. + """ + x = self.fcn(x) + x = paddle.reshape_(x, (x.shape[0], -1)) # N,C,1,1 --> N,C + + return x diff --git a/docs/src/paddlevideo/modeling/heads/timesformer_head.py b/docs/src/paddlevideo/modeling/heads/timesformer_head.py new file mode 100644 index 000000000..d02a3cca8 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/timesformer_head.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead + + +@HEADS.register() +class TimeSformerHead(BaseHead): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.std = std + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/docs/src/paddlevideo/modeling/heads/token_shift_head.py b/docs/src/paddlevideo/modeling/heads/token_shift_head.py new file mode 100644 index 000000000..52e9309da --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/token_shift_head.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear +import paddle + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead + + +@HEADS.register() +class TokenShiftHead(BaseHead): + """TokenShift Transformer Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + num_seg(int): The number of segments. Default: 8. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + ls_eps (float): Label smoothing epsilon. Default: 0.01. + std (float): Std(Scale) Value in normal initilizar. Default: 0.02. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + num_seg=8, + loss_cfg=dict(name='CrossEntropyLoss'), + ls_eps=0.01, + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, ls_eps) + self.num_seg = num_seg + self.std = std + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + score = self.fc(x) + # [N*T, num_class] + _, _m = score.shape + _t = self.num_seg + score = score.reshape([-1, _t, _m]) + score = paddle.mean(score, 1) # averaging predictions for every frame + score = paddle.squeeze(score, axis=1) + return score diff --git a/docs/src/paddlevideo/modeling/heads/transnetv2_head.py b/docs/src/paddlevideo/modeling/heads/transnetv2_head.py new file mode 100644 index 000000000..2ea67d4d3 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/transnetv2_head.py @@ -0,0 +1,45 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import BaseHead +from ..registry import HEADS +from ..losses import TransNetV2Loss +from ...metrics.transnetv2_metric import create_scene_based_summaries + +@HEADS.register() +class TransNetV2Head(BaseHead): + """TransNetV2 Head. 
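+
+    Args:
+        num_classes (int): The number of classes to be classified.
+        in_channels (int): The number of channels in input feature.
+        loss_cfg (dict): Config for building loss. Default: dict(name="TransNetV2Loss").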
+ """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name="TransNetV2Loss") + ): + super().__init__(num_classes, + in_channels, + loss_cfg) + + def loss(self, one_hot_pred, one_hot_gt, + many_hot_pred=None, many_hot_gt=None, reg_losses=None): + losses = dict() + loss = self.loss_func(scores, labels, **kwargs) + + f1 = self.get_score(one_hot_pred, one_hot_gt) + losses['f1'] = f1 + losses['loss'] = loss + return losses + + def get_score(self, one_hot_pred, one_hot_gt): + f1 = create_scene_based_summaries(one_hot_pred, one_hot_gt) + return f1 diff --git a/docs/src/paddlevideo/modeling/heads/tsm_head.py b/docs/src/paddlevideo/modeling/heads/tsm_head.py new file mode 100644 index 000000000..955930168 --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/tsm_head.py @@ -0,0 +1,99 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +from paddle import ParamAttr +from paddle.nn import Linear +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from .tsn_head import TSNHead +from ..registry import HEADS + +from ..weight_init import weight_init_ + + +@HEADS.register() +class TSMHead(TSNHead): + """ TSM Head + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.5. + std(float): Std(Scale) value in normal initilizar. Default: 0.001. + kwargs (dict, optional): Any keyword argument to initialize. + """ + def __init__(self, + num_classes, + in_channels, + drop_ratio=0.5, + std=0.001, + data_format="NCHW", + **kwargs): + super().__init__(num_classes, + in_channels, + drop_ratio=drop_ratio, + std=std, + data_format=data_format, + **kwargs) + + self.fc = Linear(self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=5.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0, + regularizer=L2Decay(0.0))) + + assert (data_format in [ + 'NCHW', 'NHWC' + ]), f"data_format must be 'NCHW' or 'NHWC', but got {data_format}" + + self.data_format = data_format + + self.stdv = std + + def init_weights(self): + """Initiate the FC layer parameters""" + weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv) + + def forward(self, x, num_seg): + """Define how the tsm-head is going to run. + + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. 
+ """ + # x.shape = [N * num_segs, in_channels, 7, 7] + + x = self.avgpool2d(x) # [N * num_segs, in_channels, 1, 1] + + if self.dropout is not None: + x = self.dropout(x) # [N * num_seg, in_channels, 1, 1] + + if self.data_format == 'NCHW': + x = paddle.reshape(x, x.shape[:2]) + else: + x = paddle.reshape(x, x.shape[::3]) + score = self.fc(x) # [N * num_seg, num_class] + score = paddle.reshape( + score, [-1, num_seg, score.shape[1]]) # [N, num_seg, num_class] + score = paddle.mean(score, axis=1) # [N, num_class] + score = paddle.reshape(score, + shape=[-1, self.num_classes]) # [N, num_class] + # score = F.softmax(score) #NOTE remove + return score diff --git a/docs/src/paddlevideo/modeling/heads/tsn_head.py b/docs/src/paddlevideo/modeling/heads/tsn_head.py new file mode 100644 index 000000000..f2f906bce --- /dev/null +++ b/docs/src/paddlevideo/modeling/heads/tsn_head.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class TSNHead(BaseHead): + """TSN Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.4. + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + drop_ratio=0.4, + std=0.01, + data_format="NCHW", + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.drop_ratio = drop_ratio + self.std = std + + #NOTE: global pool performance + self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format) + + if self.drop_ratio != 0: + self.dropout = Dropout(p=self.drop_ratio) + else: + self.dropout = None + + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'Normal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0., + std=self.std) + + def forward(self, x, num_seg): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + #XXX: check dropout location! 
+ # [N * num_segs, in_channels, 7, 7] + + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #x = F.softmax(x) #NOTE remove + return score diff --git a/docs/src/paddlevideo/modeling/losses/__init__.py b/docs/src/paddlevideo/modeling/losses/__init__.py new file mode 100644 index 000000000..d784c4c11 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import BaseWeightedLoss +from .bmn_loss import BMNLoss +from .cross_entropy_loss import CrossEntropyLoss +from .depth_loss import ADDSLoss +from .transnetv2_loss import TransNetV2Loss +from .actbert_loss import ActBertLoss +from .asrf_loss import ASRFLoss +from .distillation_loss import DistillationCELoss, DistillationDMLLoss +from .yowo_loss import RegionLoss + +__all__ = [ + 'CrossEntropyLoss', 'BMNLoss', 'TransNetV2Loss', 'ActBertLoss', 'ADDSLoss', + 'BaseWeightedLoss', 'ASRFLoss', 'DistillationCELoss', 'DistillationDMLLoss', + 'RegionLoss' +] diff --git a/docs/src/paddlevideo/modeling/losses/actbert_loss.py b/docs/src/paddlevideo/modeling/losses/actbert_loss.py new file mode 100644 index 000000000..10ffea6e6 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/actbert_loss.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class ActBertLoss(BaseWeightedLoss): + """Loss for ActBert model + """ + def __init__(self, vocab_size=30522, a_target_size=700): + super().__init__() + self.vocab_size = vocab_size + self.a_target_size = a_target_size + self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + self.vis_criterion = nn.KLDivLoss(reduction="none") + + def forward(self, prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \ + text_labels, image_label, image_target, action_label, next_sentence_label): + """ + Args: + text_label: text label(with mask). Shape: [batch_size, seqence_length] + image_label: image label(with mask). 
Shape: [batch_size, region_length] + image_target: label of image feature distribution, + Shape: [batch_size, region_length-1, num_image_class](minus 1 for xxx). + action label: action label(with mask), Shape: [batch_size, action_length] + next_sentence_label: is next sentence or not. Shape: [batch_size] + """ + prediction_scores_v = prediction_scores_v[:, + 1:] #8,37,1601 --> 8,36,1601 + + img_loss = self.vis_criterion( + F.log_softmax(prediction_scores_v, axis=2), + image_target #8,36,1601 + ) + masked_img_loss = paddle.sum( + img_loss * (image_label == 1).unsqueeze(2).astype('float32')) / max( + paddle.sum((image_label == 1).astype('float32')), 1e-6) + + masked_text_loss = self.loss_fct( + prediction_scores_t.reshape([-1, self.vocab_size]), #8,36,30522 + text_labels.reshape([-1]), #8,36 # label -1 will be ignored + ) + + masked_action_loss = self.loss_fct( + prediction_scores_a.reshape([-1, self.a_target_size]), #8,5,700 + action_label.reshape([-1]), #8,5 + ) + + next_sentence_loss = self.loss_fct( + seq_relationship_score.reshape([-1, 2]), + next_sentence_label.reshape([-1]) #8,2 + ) + + total_loss = masked_text_loss.unsqueeze(0) + masked_img_loss.unsqueeze( + 0) + masked_action_loss.unsqueeze(0) + next_sentence_loss.unsqueeze( + 0) + return total_loss diff --git a/docs/src/paddlevideo/modeling/losses/asrf_loss.py b/docs/src/paddlevideo/modeling/losses/asrf_loss.py new file mode 100644 index 000000000..ce5d6b1ad --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/asrf_loss.py @@ -0,0 +1,401 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/yiskw713/asrf/libs/loss_fn/__init__.py + +import numpy as np +import pandas as pd +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import sys +import os + +from ..registry import LOSSES + + +class TMSE(nn.Layer): + """ + Temporal MSE Loss Function + Proposed in Y. A. Farha et al. 
MS-TCN: Multi-Stage Temporal Convolutional Network for ActionSegmentation in CVPR2019 + arXiv: https://arxiv.org/pdf/1903.01945.pdf + """ + + def __init__(self, threshold=4, ignore_index=255): + super().__init__() + self.threshold = threshold + self.ignore_index = ignore_index + self.mse = nn.MSELoss(reduction="none") + + def forward(self, preds, gts): + + total_loss = 0.0 + batch_size = preds.shape[0] + for pred, gt in zip(preds, gts): + pred = paddle.gather(pred, + paddle.nonzero(gt != self.ignore_index)[:, 0]) + + loss = self.mse(F.log_softmax(pred[:, 1:], axis=1), + F.log_softmax(pred[:, :-1], axis=1)) + + loss = paddle.clip(loss, min=0, max=self.threshold**2) + total_loss += paddle.mean(loss) + + return total_loss / batch_size + + +class GaussianSimilarityTMSE(nn.Layer): + """ + Temporal MSE Loss Function with Gaussian Similarity Weighting + """ + + def __init__(self, threshold=4, sigma=1.0, ignore_index=255): + super().__init__() + self.threshold = threshold + self.ignore_index = ignore_index + self.mse = nn.MSELoss(reduction="none") + self.sigma = sigma + + def forward(self, preds, gts, sim_index): + """ + Args: + preds: the output of model before softmax. (N, C, T) + gts: Ground Truth. (N, T) + sim_index: similarity index. (N, C, T) + Return: + the value of Temporal MSE weighted by Gaussian Similarity. + """ + total_loss = 0.0 + batch_size = preds.shape[0] + for pred, gt, sim in zip(preds, gts, sim_index): + pred = paddle.gather(pred, + paddle.nonzero(gt != self.ignore_index)[:, 0], + axis=1) + sim = paddle.gather(sim, + paddle.nonzero(gt != self.ignore_index)[:, 0], + axis=1) + + # calculate gaussian similarity + diff = sim[:, 1:] - sim[:, :-1] + similarity = paddle.exp( + (-1 * paddle.norm(diff, axis=0)) / (2 * self.sigma**2)) + + # calculate temporal mse + loss = self.mse(F.log_softmax(pred[:, 1:], axis=1), + F.log_softmax(pred[:, :-1], axis=1)) + loss = paddle.clip(loss, min=0, max=self.threshold**2) + + # gaussian similarity weighting + loss = similarity * loss + + total_loss += paddle.mean(loss) + + return total_loss / batch_size + + +class FocalLoss(nn.Layer): + + def __init__(self, + weight=None, + size_average=True, + batch_average=True, + ignore_index=255, + gamma=2.0, + alpha=0.25): + super().__init__() + + self.gamma = gamma + self.alpha = alpha + self.batch_average = batch_average + self.criterion = nn.CrossEntropyLoss(weight=weight, + ignore_index=ignore_index, + size_average=size_average) + + def forward(self, logit, target): + n, _, _ = logit.size() + + logpt = -self.criterion(logit, target.long()) + pt = paddle.exp(logpt) + + if self.alpha is not None: + logpt *= self.alpha + + loss = -((1 - pt)**self.gamma) * logpt + + if self.batch_average: + loss /= n + + return loss + + +class ActionSegmentationLoss(nn.Layer): + """ + Loss Function for Action Segmentation + You can choose the below loss functions and combine them. 
+ - Cross Entropy Loss (CE) + - Focal Loss + - Temporal MSE (TMSE) + - Gaussian Similarity TMSE (GSTMSE) + """ + + def __init__(self, + num_classes, + file_path, + label_path, + ce=True, + focal=True, + tmse=False, + gstmse=False, + weight=None, + threshold=4., + ignore_index=255, + ce_weight=1.0, + focal_weight=1.0, + tmse_weight=0.15, + gstmse_weight=0.15): + super().__init__() + self.criterions = [] + self.weights = [] + + self.num_classes = num_classes + self.file_path = file_path + self.label_path = label_path + if weight: + class_weight = self.get_class_weight() + else: + class_weight = None + + if ce: + self.criterions.append( + nn.CrossEntropyLoss(weight=class_weight, + ignore_index=ignore_index)) + self.weights.append(ce_weight) + + if focal: + self.criterions.append(FocalLoss(ignore_index=ignore_index)) + self.weights.append(focal_weight) + + if tmse: + self.criterions.append( + TMSE(threshold=threshold, ignore_index=ignore_index)) + self.weights.append(tmse_weight) + + if gstmse: + self.criterions.append( + GaussianSimilarityTMSE(threshold=threshold, + ignore_index=ignore_index)) + self.weights.append(gstmse_weight) + + if len(self.criterions) == 0: + print("You have to choose at least one loss function.") + sys.exit(1) + + def get_class_weight(self): + """ + Class weight for CrossEntropy + Class weight is calculated in the way described in: + D. Eigen and R. Fergus, “Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture,” in ICCV, + openaccess: https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Eigen_Predicting_Depth_Surface_ICCV_2015_paper.pdf + """ + # load file list + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + + nums = [0 for i in range(self.num_classes)] + for i in range(len(info)): + video_name = info[i] + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + num, cnt = np.unique(label, return_counts=True) + for n, c in zip(num, cnt): + nums[n] += c + + class_num = paddle.to_tensor(nums, dtype="float32") + total = class_num.sum().item() + frequency = class_num / total + median = paddle.median(frequency) + class_weight = median / frequency + return class_weight + + def forward(self, preds, gts, sim_index): + """ + Args: + preds: paddle.float (N, C, T). + gts: paddle.int64 (N, T). + sim_index: paddle.float (N, C', T). 
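+        Returns:
+            paddle.Tensor: weighted sum of the selected criteria (CE / Focal / TMSE / GSTMSE).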
+ """ + loss = 0.0 + for criterion, weight in zip(self.criterions, self.weights): + if isinstance(criterion, GaussianSimilarityTMSE): + loss += weight * criterion(preds, gts, sim_index) + elif isinstance(criterion, nn.CrossEntropyLoss): + preds_t = paddle.transpose(preds, perm=[0, 2, 1]) + loss += weight * criterion(preds_t, gts) + else: + loss += weight * criterion(preds, gts) + + return loss + + +class BoundaryRegressionLoss(nn.Layer): + """ + Boundary Regression Loss + bce: Binary Cross Entropy Loss for Boundary Prediction + mse: Mean Squared Error + """ + + def __init__(self, + file_path, + label_path, + bce=True, + focal=False, + mse=False, + weight=None, + pos_weight=None): + super().__init__() + + self.criterions = [] + self.file_path = file_path + self.label_path = label_path + + pos_weight = self.get_pos_weight() + + if bce: + self.criterions.append( + nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight)) + + if focal: + self.criterions.append(FocalLoss()) + + if mse: + self.criterions.append(nn.MSELoss()) + + if len(self.criterions) == 0: + print("You have to choose at least one loss function.") + sys.exit(1) + + def get_pos_weight(self, norm=None): + """ + pos_weight for binary cross entropy with logits loss + pos_weight is defined as reciprocal of ratio of positive samples in the dataset + """ + # load file list + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + + n_classes = 2 # boundary or not + nums = [0 for i in range(n_classes)] + for i in range(len(info)): + video_name = info[i] + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + num, cnt = np.unique(label, return_counts=True) + for n, c in zip(num, cnt): + nums[n] += c + + pos_ratio = nums[1] / sum(nums) + pos_weight = 1 / pos_ratio + + if norm is not None: + pos_weight /= norm + + return paddle.to_tensor(pos_weight, dtype="float32") + + def forward(self, preds, gts): + """ + Args: + preds: paddle.float (N, 1, T). + gts: paddle.float (N, 1, T). 
+ """ + loss = 0.0 + batch_size = float(preds.shape[0]) + + for criterion in self.criterions: + for pred, gt in zip(preds, gts): + loss += criterion(pred, gt) + + return loss / batch_size + + +@LOSSES.register() +class ASRFLoss(nn.Layer): + + def __init__(self, + lambda_bound_loss, + num_classes, + file_path, + label_path, + boundary_path, + ce=True, + asl_focal=True, + tmse=False, + gstmse=False, + asl_weight=None, + threshold=4., + ignore_index=255, + ce_weight=1.0, + focal_weight=1.0, + tmse_weight=0.15, + gstmse_weight=0.15, + bce=True, + brl_focal=False, + mse=False, + brl_weight=None): + super().__init__() + self.criterion_cls = ActionSegmentationLoss(ce=ce, + focal=asl_focal, + tmse=tmse, + gstmse=gstmse, + weight=asl_weight, + threshold=threshold, + ignore_index=ignore_index, + ce_weight=ce_weight, + focal_weight=focal_weight, + tmse_weight=tmse_weight, + gstmse_weight=gstmse_weight, + file_path=file_path, + label_path=label_path, + num_classes=num_classes) + self.criterion_boundary = BoundaryRegressionLoss( + bce=bce, + focal=brl_focal, + mse=mse, + weight=brl_weight, + file_path=file_path, + label_path=boundary_path) + self.lambda_bound_loss = lambda_bound_loss + + def forward(self, x, output_cls, label, outputs_boundary, boundary): + loss = 0.0 + if isinstance(output_cls, list): + n = len(output_cls) + for out in output_cls: + loss += self.criterion_cls(out, label, x) / n + else: + loss += self.criterion_cls(output_cls, label, x) + + if isinstance(outputs_boundary, list): + n = len(outputs_boundary) + for out in outputs_boundary: + loss += self.lambda_bound_loss * self.criterion_boundary( + out, boundary) / n + else: + loss += self.lambda_bound_loss * self.criterion_boundary( + outputs_boundary, boundary) + + return loss diff --git a/docs/src/paddlevideo/modeling/losses/base.py b/docs/src/paddlevideo/modeling/losses/base.py new file mode 100644 index 000000000..7284252e6 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/base.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle +import paddle.nn as nn + +#XXX use _forward?? or forward?? +class BaseWeightedLoss(nn.Layer): + """Base class for loss. + + All subclass should overwrite the ``_forward()`` method which returns the + normal loss without loss weights. + + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Default: 1.0. + """ + + def __init__(self, loss_weight=1.0): + super().__init__() + self.loss_weight = loss_weight + + @abstractmethod + def _forward(self, *args, **kwargs): + pass + + def forward(self, *args, **kwargs): + """Defines the computation performed at every call. + Args: + *args: The positional arguments for the corresponding + loss. + **kwargs: The keyword arguments for the corresponding + loss. + Returns: + paddle.Tensor: The calculated loss. 
+ """ + return self._forward(*args, **kwargs) * self.loss_weight diff --git a/docs/src/paddlevideo/modeling/losses/bmn_loss.py b/docs/src/paddlevideo/modeling/losses/bmn_loss.py new file mode 100644 index 000000000..e43485013 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/bmn_loss.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class BMNLoss(BaseWeightedLoss): + """Loss for BMN model + Args: + tscale (int): sequence length, default 100. + dscale (int): max duration length, default 100. + """ + def __init__(self, dscale, tscale): + super().__init__() + self.dscale = dscale + self.tscale = tscale + + def _get_mask(self, dscale, tscale): + bm_mask = [] + for idx in range(dscale): + mask_vector = [1 for i in range(tscale - idx) + ] + [0 for i in range(idx)] + bm_mask.append(mask_vector) + bm_mask = np.array(bm_mask, dtype='float32') + bm_mask = paddle.to_tensor(bm_mask) + bm_mask.stop_gradient = True + return bm_mask + + def tem_loss_func(self, pred_start, pred_end, gt_start, gt_end): + def bi_loss(pred_score, gt_label, datatype): + pred_score = paddle.reshape(x=pred_score, shape=[-1]) + gt_label = paddle.reshape(x=gt_label, shape=[-1]) + gt_label.stop_gradient = True + pmask = paddle.cast(x=(gt_label > 0.5), dtype=datatype) + num_entries = paddle.cast(paddle.shape(pmask), dtype=datatype) + num_positive = paddle.cast(paddle.sum(pmask), dtype=datatype) + ratio = num_entries / num_positive + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + epsilon = 0.000001 + loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask) + loss_pos = coef_1 * paddle.mean(loss_pos) + loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon), + (1.0 - pmask)) + loss_neg = coef_0 * paddle.mean(loss_neg) + loss = -1 * (loss_pos + loss_neg) + return loss + + loss_start = bi_loss(pred_start, gt_start, pred_start.dtype) + loss_end = bi_loss(pred_end, gt_end, pred_start.dtype) + loss = loss_start + loss_end + return loss + + def pem_reg_loss_func(self, pred_score, gt_iou_map, mask): + gt_iou_map = paddle.multiply(gt_iou_map, mask) + + u_hmask = paddle.cast(x=gt_iou_map > 0.7, dtype=pred_score.dtype) + u_mmask = paddle.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3) + u_mmask = paddle.cast(x=u_mmask, dtype=pred_score.dtype) + u_lmask = paddle.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.) 
+ u_lmask = paddle.cast(x=u_lmask, dtype=pred_score.dtype) + u_lmask = paddle.multiply(u_lmask, mask) + + num_h = paddle.cast(paddle.sum(u_hmask), dtype=pred_score.dtype) + num_m = paddle.cast(paddle.sum(u_mmask), dtype=pred_score.dtype) + num_l = paddle.cast(paddle.sum(u_lmask), dtype=pred_score.dtype) + + r_m = num_h / num_m + u_smmask = paddle.uniform(shape=[ + gt_iou_map.shape[1], gt_iou_map.shape[2] + ], + min=0.0, + max=1.0).astype(pred_score.dtype) + u_smmask = paddle.multiply(u_mmask, u_smmask) + u_smmask = paddle.cast(x=(u_smmask > (1. - r_m)), + dtype=pred_score.dtype) + + r_l = num_h / num_l + u_slmask = paddle.uniform(shape=[ + gt_iou_map.shape[1], gt_iou_map.shape[2] + ], + min=0.0, + max=1.0).astype(pred_score.dtype) + u_slmask = paddle.multiply(u_lmask, u_slmask) + u_slmask = paddle.cast(x=(u_slmask > (1. - r_l)), + dtype=pred_score.dtype) + + weights = u_hmask + u_smmask + u_slmask + weights.stop_gradient = True + loss = F.square_error_cost(pred_score, gt_iou_map) + loss = paddle.multiply(loss, weights) + loss = 0.5 * paddle.sum(loss) / paddle.sum(weights) + + return loss + + def pem_cls_loss_func(self, pred_score, gt_iou_map, mask): + gt_iou_map = paddle.multiply(gt_iou_map, mask) + gt_iou_map.stop_gradient = True + pmask = paddle.cast(x=(gt_iou_map > 0.9), dtype=pred_score.dtype) + nmask = paddle.cast(x=(gt_iou_map <= 0.9), dtype=pred_score.dtype) + nmask = paddle.multiply(nmask, mask) + + num_positive = paddle.sum(pmask) + num_entries = num_positive + paddle.sum(nmask) + ratio = num_entries / num_positive + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + epsilon = 0.000001 + loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask) + loss_pos = coef_1 * paddle.sum(loss_pos) + loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon), + nmask) + loss_neg = coef_0 * paddle.sum(loss_neg) + loss = -1 * (loss_pos + loss_neg) / num_entries + return loss + + def forward(self, pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end): + pred_bm_reg = paddle.squeeze(paddle.slice(pred_bm, + axes=[1], + starts=[0], + ends=[1]), + axis=[1]) + pred_bm_cls = paddle.squeeze(paddle.slice(pred_bm, + axes=[1], + starts=[1], + ends=[2]), + axis=[1]) + + bm_mask = self._get_mask(self.dscale, self.tscale) + + pem_reg_loss = self.pem_reg_loss_func(pred_bm_reg, gt_iou_map, bm_mask) + pem_cls_loss = self.pem_cls_loss_func(pred_bm_cls, gt_iou_map, bm_mask) + + tem_loss = self.tem_loss_func(pred_start, pred_end, gt_start, gt_end) + + loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss + return loss diff --git a/docs/src/paddlevideo/modeling/losses/cross_entropy_loss.py b/docs/src/paddlevideo/modeling/losses/cross_entropy_loss.py new file mode 100644 index 000000000..953f77c07 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/cross_entropy_loss.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
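+
+# Usage sketch (illustrative; in practice the loss is normally built from a
+# loss_cfg such as dict(name='CrossEntropyLoss') via the LOSSES registry):
+#   loss_fn = CrossEntropyLoss()
+#   loss = loss_fn(score, labels)   # forward() scales _forward() by loss_weight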
+ +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class CrossEntropyLoss(BaseWeightedLoss): + """Cross Entropy Loss.""" + def _forward(self, score, labels, **kwargs): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. + kwargs: Any keyword argument to be used to calculate + CrossEntropy loss. + Returns: + loss (paddle.Tensor): The returned CrossEntropy loss. + """ + loss = F.cross_entropy(score, labels, **kwargs) + return loss diff --git a/docs/src/paddlevideo/modeling/losses/depth_loss.py b/docs/src/paddlevideo/modeling/losses/depth_loss.py new file mode 100644 index 000000000..ba9a2cb04 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/depth_loss.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +def get_smooth_loss(disp, img): + """Computes the smoothness loss for a disparity image + The color image is used for edge-aware smoothness + """ + grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) + grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) + + grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), + 1, + keepdim=True) + grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), + 1, + keepdim=True) + + grad_disp_x *= paddle.exp(-grad_img_x) + grad_disp_y *= paddle.exp(-grad_img_y) + + return grad_disp_x.mean() + grad_disp_y.mean() + + +class DiffLoss(nn.Layer): + def __init__(self): + super(DiffLoss, self).__init__() + + def forward(self, input1, input2): + batch_size = input1.shape[0] + input1 = input1.reshape([batch_size, -1]) + input2 = input2.reshape([batch_size, -1]) + + input1_l2 = input1 + input2_l2 = input2 + + diff_loss = 0 + dim = input1.shape[1] + for i in range(input1.shape[0]): + diff_loss = diff_loss + paddle.mean( + ((input1_l2[i:i + 1, :].mm(input2_l2[i:i + 1, :].T)).pow(2)) / + dim) + + diff_loss = diff_loss / input1.shape[0] + + return diff_loss + + +class MSE(nn.Layer): + def __init__(self): + super(MSE, self).__init__() + + def forward(self, pred, real): + diffs = paddle.add(real, -pred) + n = paddle.numel(diffs) + mse = paddle.sum(diffs.pow(2)) / n + + return mse + + +class SIMSE(nn.Layer): + def __init__(self): + super(SIMSE, self).__init__() + + def forward(self, pred, real): + diffs = paddle.add(real, -pred) + n = paddle.numel(diffs) + simse = paddle.sum(diffs).pow(2) / (n**2) + + return simse + + +class SSIM(nn.Layer): + """Layer to compute the SSIM loss between a pair of images + """ + def __init__(self): + super(SSIM, self).__init__() + self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_y_pool = nn.AvgPool2D(3, 
1, exclusive=False) + self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False) + + self.refl = nn.Pad2D(1, mode='reflect') + + self.C1 = 0.01**2 + self.C2 = 0.03**2 + + def forward(self, x, y): + x = self.refl(x) + y = self.refl(y) + + mu_x = self.mu_x_pool(x) + mu_y = self.mu_y_pool(y) + + sigma_x = self.sig_x_pool(x**2) - mu_x**2 + sigma_y = self.sig_y_pool(y**2) - mu_y**2 + sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y + + SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) + SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) + + return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1) + + +@LOSSES.register() +class ADDSLoss(BaseWeightedLoss): + def __init__(self, avg_reprojection, disparity_smoothness, no_ssim): + super(ADDSLoss, self).__init__() + self.avg_reprojection = avg_reprojection + self.disparity_smoothness = disparity_smoothness + self.no_ssim = no_ssim + + self.loss_diff = DiffLoss() + self.loss_recon1 = MSE() + self.loss_recon2 = SIMSE() + self.loss_similarity = MSE() + + def compute_reprojection_loss(self, pred, target): + """Computes reprojection loss between a batch of predicted and target images + """ + abs_diff = paddle.abs(target - pred) + l1_loss = abs_diff.mean(1, True) + + if not self.no_ssim: + self.ssim = SSIM() + + if self.no_ssim: + reprojection_loss = l1_loss + else: + ssim_loss = self.ssim(pred, target).mean(1, True) + reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss + + return reprojection_loss + + def compute_losses(self, inputs, outputs, is_night): + """Compute the reprojection and smoothness losses for a minibatch + """ + losses = {} + total_loss = 0 + + for scale in outputs['scales']: + loss = 0 + reprojection_losses = [] + + source_scale = 0 + + disp = outputs[("disp", scale)] + if is_night: + color = inputs[("color_n", 0, scale)] + target = inputs[("color_n", 0, source_scale)] + else: + color = inputs[("color", 0, scale)] + target = inputs[("color", 0, source_scale)] + + for frame_id in outputs['frame_ids'][1:]: + pred = outputs[("color", frame_id, scale)] + reprojection_losses.append( + self.compute_reprojection_loss(pred, target)) + + reprojection_losses = paddle.concat(reprojection_losses, 1) + + identity_reprojection_losses = [] + for frame_id in outputs['frame_ids'][1:]: + if is_night: + pred = inputs[("color_n", frame_id, source_scale)] + else: + pred = inputs[("color", frame_id, source_scale)] + identity_reprojection_losses.append( + self.compute_reprojection_loss(pred, target)) + + identity_reprojection_losses = paddle.concat( + identity_reprojection_losses, 1) + + if self.avg_reprojection: + identity_reprojection_loss = identity_reprojection_losses.mean( + 1, keepdim=True) + else: + # save both images, and do min all at once below + identity_reprojection_loss = identity_reprojection_losses + + if self.avg_reprojection: + reprojection_loss = reprojection_losses.mean(1, keepdim=True) + else: + reprojection_loss = reprojection_losses + + # add random numbers to break ties + identity_reprojection_loss = identity_reprojection_loss + paddle.randn( + identity_reprojection_loss.shape) * 0.00001 + + combined = paddle.concat( + (identity_reprojection_loss, reprojection_loss), axis=1) + if combined.shape[1] == 1: + to_optimise = combined + else: + to_optimise = paddle.min(combined, axis=1) + + loss = loss + to_optimise.mean() + + mean_disp = disp.mean(2, True).mean(3, True) + norm_disp = disp / (mean_disp + 1e-7) + smooth_loss = get_smooth_loss(norm_disp, color) + + loss = loss + self.disparity_smoothness * smooth_loss / 
(2**scale) + total_loss = total_loss + loss + losses["loss/{}".format(scale)] = loss + + total_loss /= len(outputs['scales']) + losses["loss"] = total_loss + return losses + + def forward(self, inputs, outputs): + + losses_day = self.compute_losses(inputs, outputs, 'day') + losses_night = self.compute_losses(inputs, outputs['outputs_night'], + 'night') + + loss = 0 + losses = [] + # diff + target_diff1 = 0.5 * self.loss_diff( + outputs['result'][0], outputs['result'][2]) # 10 when batchsize=1 + target_diff2 = 0.5 * self.loss_diff(outputs['result_night'][0], + outputs['result_night'][2]) + losses.append(target_diff1) + losses.append(target_diff2) + loss = loss + target_diff1 + loss = loss + target_diff2 + + target_diff3 = 1 * self.loss_diff( + outputs['result'][1], outputs['result'][3]) # 10 when batchsize=1 + target_diff4 = 1 * self.loss_diff(outputs['result_night'][1], + outputs['result_night'][3]) + losses.append(target_diff3) + losses.append(target_diff4) + loss = loss + target_diff3 + loss = loss + target_diff4 + + # recon + target_mse = 1 * self.loss_recon1(outputs['result'][5], + inputs["color_aug", 0, 0]) + loss = loss + target_mse + + target_simse = 1 * self.loss_recon2(outputs['result'][5], + inputs["color_aug", 0, 0]) + loss = loss + target_simse + + losses.append(target_mse) + losses.append(target_simse) + target_mse_night = 1 * self.loss_recon1(outputs['result_night'][5], + inputs["color_n_aug", 0, 0]) + loss = loss + target_mse_night + + target_simse_night = 1 * self.loss_recon2(outputs['result_night'][5], + inputs["color_n_aug", 0, 0]) + loss = loss + target_simse_night + + losses.append(target_mse_night) + losses.append(target_simse_night) + + # depth loss + pseudo_label = outputs[("disp", 0)].detach() + depth_loss = 1 * self.loss_similarity( + outputs['outputs_night'][("disp", 0)], pseudo_label) + loss = loss + depth_loss + + losses.append(depth_loss) + + outputs['loss'] = loss + losses_day['loss'] + losses_night['loss'] + outputs['losses_day'] = losses_day['loss'] + outputs['losses_night'] = losses_night['loss'] + + return outputs diff --git a/docs/src/paddlevideo/modeling/losses/distillation_loss.py b/docs/src/paddlevideo/modeling/losses/distillation_loss.py new file mode 100644 index 000000000..d27f9418a --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/distillation_loss.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class DistillationCELoss(BaseWeightedLoss): + """Distillation Entropy Loss.""" + def _forward(self, score, labels, **kwargs): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. + kwargs: Any keyword argument to be used to calculate + CrossEntropy loss. + Returns: + loss (paddle.Tensor): The returned CrossEntropy loss. 
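+
+        Note:
+            `labels` is expected to be a list: `[label]` for plain cross-entropy,
+            or `[label_a, label_b, lam]` when VideoMix-style mixing is used.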
+ """ + if len(labels) == 1: + label = labels[0] + loss = F.cross_entropy(score, label, **kwargs) + # Deal with VideoMix + elif len(labels) == 3: + label_a, label_b, lam = labels + loss_a = F.cross_entropy(score, label_a, **kwargs) + loss_b = F.cross_entropy(score, label_b, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + loss = paddle.mean(loss) #lam shape is bs + return loss + + +@LOSSES.register() +class DistillationDMLLoss(BaseWeightedLoss): + """ + DistillationDMLLoss + """ + def __init__(self, act="softmax", eps=1e-12, **kargs): + super().__init__(**kargs) + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + self.eps = eps + + def _kldiv(self, x, target): + class_num = x.shape[-1] + cost = target * paddle.log( + (target + self.eps) / (x + self.eps)) * class_num + return cost + + def _forward(self, x, target): + if self.act is not None: + x = self.act(x) + target = self.act(target) + loss = self._kldiv(x, target) + self._kldiv(target, x) + loss = loss / 2 + loss = paddle.mean(loss) + return loss diff --git a/docs/src/paddlevideo/modeling/losses/transnetv2_loss.py b/docs/src/paddlevideo/modeling/losses/transnetv2_loss.py new file mode 100644 index 000000000..624c46852 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/transnetv2_loss.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class TransNetV2Loss(BaseWeightedLoss): + """Loss for TransNetV2 model + """ + def __init__(self, transition_weight=5.0, many_hot_loss_weight=0.1): + self.transition_weight = transition_weight + self.many_hot_loss_weight = many_hot_loss_weight + super().__init__() + + def _forward(self, one_hot_pred, one_hot_gt, + many_hot_pred=None, many_hot_gt=None, reg_losses=None): + assert transition_weight != 1 + + one_hot_pred = one_hot_pred[:, :, 0] + + one_hot_gt = one_hot_gt.astype('float32') + one_hot_loss = F.binary_cross_entropy_with_logits(logit=one_hot_pred, label=one_hot_gt, reduction='none') + + one_hot_loss *= 1 + one_hot_gt * (transition_weight - 1) + + one_hot_loss = paddle.mean(one_hot_loss) + + many_hot_loss = 0. + if many_hot_loss_weight != 0. 
and many_hot_pred is not None: + many_hot_loss = many_hot_loss_weight * paddle.mean( + F.binary_cross_entropy_with_logits(logit=many_hot_pred[:, :, 0], + label=many_hot_gt.astype('float32'), reduction='none')) + + total_loss = one_hot_loss + many_hot_loss + + if reg_losses is not None: + for name, value in reg_losses.items(): + if value is not None: + total_loss += value + + return total_loss \ No newline at end of file diff --git a/docs/src/paddlevideo/modeling/losses/yowo_loss.py b/docs/src/paddlevideo/modeling/losses/yowo_loss.py new file mode 100644 index 000000000..5ca3290e2 --- /dev/null +++ b/docs/src/paddlevideo/modeling/losses/yowo_loss.py @@ -0,0 +1,251 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from paddle.static import Variable + +from ..registry import LOSSES +from .base import BaseWeightedLoss +from ..framework.localizers.yowo_utils import build_targets + + +class FocalLoss(nn.Layer): + """ + This criterion is a implemenation of Focal Loss, which is proposed in + Focal Loss for Dense Object Detection. + + Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) + + The losses are averaged across observations for each minibatch. + + Args: + alpha(1D Tensor, Variable) : the scalar factor for this criterion + gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), + putting more focus on hard, misclassified examples + size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch. + However, if the field size_average is set to False, the losses are + instead summed for each minibatch. 
+ + """ + + def __init__(self, class_num, alpha=None, gamma=2, size_average=True): + super(FocalLoss, self).__init__() + + if alpha is None: + self.alpha = paddle.ones( + [class_num, 1]) + self.alpha.stop_gradient = False + else: + if isinstance(alpha, Variable): + self.alpha = alpha + else: + self.alpha = (alpha) + self.alpha.stop_gradient = False + self.gamma = gamma + self.class_num = class_num + self.size_average = size_average + + def forward(self, inputs, targets): + N = inputs.shape[0] + C = inputs.shape[1] + P = F.softmax(inputs, axis=1) + + tmp = numpy.zeros((N, C)) + class_mask = paddle.to_tensor(tmp, place=inputs.place) + class_mask.stop_gradient = False + ids = paddle.reshape(targets, [-1, 1]) + class_mask = F.one_hot(ids.squeeze(-1), class_mask.shape[1]) + + if "Place" not in str(inputs.place) and "Place" not in str(self.alpha.place): + self.alpha = self.alpha.cuda() + + alpha = self.alpha[paddle.reshape(ids.detach(), [-1])] + + probs = paddle.reshape((P * class_mask).sum(1), [-1, 1]) + + log_p = probs.log() + + batch_loss = -alpha * (paddle.pow((1 - probs), self.gamma)) * log_p + + if self.size_average: + loss = batch_loss.mean() + else: + loss = batch_loss.sum() + return loss + + +@LOSSES.register() +class RegionLoss(BaseWeightedLoss): + # for our model anchors has 10 values and number of anchors is 5 + # parameters: 24, 10 float values, 24, 5 + def __init__(self, num_classes, anchors, num_anchors, object_scale, noobject_scale, class_scale, coord_scale): + super().__init__() + self.num_classes = num_classes + self.anchors = [float(x) for x in anchors] + self.num_anchors = num_anchors + self.anchor_step = len(self.anchors) // self.num_anchors # each anchor has 2 parameters + self.object_scale = object_scale + self.noobject_scale = noobject_scale + self.class_scale = class_scale + self.coord_scale = coord_scale + self.focalloss = FocalLoss(class_num=self.num_classes, gamma=2, size_average=False) + self.thresh = 0.6 + + def convert2cpu(self, gpu_matrix): + # return paddle.to_tensor((gpu_matrix.shape), dtype="float32").copy_(gpu_matrix) + return gpu_matrix.cpu() + + def forward(self, output, target): + # output : B*A*(4+1+num_classes)*H*W 8*5*29*24*24 + # B: number of batches + # A: number of anchors + # 4: 4 parameters for each bounding box + # 1: confidence score + # num_classes + # H: height of the image (in grids) + # W: width of the image (in grids) + # for each grid cell, there are A*(4+1+num_classes) parameters + nB = output.detach().shape[0] # batch + nA = self.num_anchors # anchor_num + nC = self.num_classes + nH = output.detach().shape[2] + nW = output.detach().shape[3] + + # resize the output (all parameters for each anchor can be reached) + output = paddle.reshape(output, [nB, nA, (5 + nC), nH, nW]) + # anchor's parameter tx + + x = F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([0], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + x.stop_gradient = False + # anchor's parameter ty + y = F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([1], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + y.stop_gradient = False + # anchor's parameter tw + w = paddle.reshape(paddle.index_select(output, paddle.to_tensor([2], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW]) + w.stop_gradient = False + # anchor's parameter th + h = paddle.reshape(paddle.index_select(output, paddle.to_tensor([3], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW]) + h.stop_gradient = False + # confidence score for each anchor + conf = 
F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([4], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + conf.stop_gradient = False + # anchor's parameter class label + cls = paddle.index_select(output, paddle.linspace(5, 5 + nC - 1, nC, 'int64').cuda(), axis=2) + cls.stop_gradient = False + # resize the data structure so that for every anchor there is a class label in the last dimension + cls = paddle.reshape(paddle.transpose(paddle.reshape(cls, [nB * nA, nC, nH * nW]), [0, 2, 1]), + [nB * nA * nH * nW, nC]) + + # for the prediction of localization of each bounding box, there exist 4 parameters (tx, ty, tw, th) + # pred_boxes = torch.cuda.FloatTensor(4, nB*nA*nH*nW) + pred_boxes = paddle.zeros([4, nB * nA * nH * nW], dtype='float32').cuda() + # tx and ty + grid_x = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nW - 1, nW), [nH, 1]), [nB * nA, 1, 1]), + [nB * nA * nH * nW]).cuda() + grid_y = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nH - 1, nH), [nW, 1]).t(), [nB * nA, 1, 1]), + [nB * nA * nH * nW]).cuda() + # for each anchor there are anchor_step variables (with the structure num_anchor*anchor_step) + # for each row(anchor), the first variable is anchor's width, second is anchor's height + # pw and ph + anchor_w = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]), + paddle.to_tensor([0], dtype='int64'), axis=1).cuda() + anchor_h = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]), + paddle.to_tensor([1], dtype='int64'), axis=1).cuda() + # for each pixel (grid) repeat the above process (obtain width and height of each grid) + anchor_w = paddle.reshape(paddle.tile(paddle.tile(anchor_w, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW]) + anchor_h = paddle.reshape(paddle.tile(paddle.tile(anchor_h, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW]) + # prediction of bounding box localization + # x.data and y.data: top left corner of the anchor + # grid_x, grid_y: tx and ty predictions made by yowo + + x_data = paddle.reshape(x.detach(), [-1]) + y_data = paddle.reshape(y.detach(), [-1]) + w_data = paddle.reshape(w.detach(), [-1]) + h_data = paddle.reshape(h.detach(), [-1]) + + pred_boxes[0] = paddle.cast(x_data, dtype='float32') + paddle.cast(grid_x, dtype='float32') # bx + pred_boxes[1] = paddle.cast(y_data, dtype='float32') + paddle.cast(grid_y, dtype='float32') # by + pred_boxes[2] = paddle.exp(paddle.cast(w_data, dtype='float32')) * paddle.cast(anchor_w, dtype='float32') # bw + pred_boxes[3] = paddle.exp(paddle.cast(h_data, dtype='float32')) * paddle.cast(anchor_h, dtype='float32') # bh + # the size -1 is inferred from other dimensions + # pred_boxes (nB*nA*nH*nW, 4) + + pred_boxes = self.convert2cpu( + paddle.cast(paddle.reshape(paddle.transpose(pred_boxes, (1, 0)), [-1, 4]), dtype='float32')) + + nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, + target.detach(), + self.anchors, nA, + nC, \ + nH, nW, + self.noobject_scale, + self.object_scale, + self.thresh) + cls_mask = (cls_mask == 1) + # keep those with high box confidence scores (greater than 0.25) as our final predictions + nProposals = int((conf > 0.25).sum().detach().item()) + + tx = (tx).cuda() + tx.stop_gradient = False + ty = ty.cuda() + ty.stop_gradient = False + tw = tw.cuda() + tw.stop_gradient = False + th = th.cuda() + th.stop_gradient = False + tconf = tconf.cuda() + tconf.stop_gradient = False + + tcls = paddle.reshape(tcls, 
[-1]).astype('int64')[paddle.reshape(cls_mask, [-1])].cuda() + tcls.stop_gradient = False + + coord_mask = coord_mask.cuda() + coord_mask.stop_gradient = False + conf_mask = conf_mask.cuda().sqrt() + coord_mask.stop_gradient = False + cls_mask = paddle.tile(paddle.reshape(cls_mask, [-1, 1]), [1, nC]).cuda() + cls_mask.stop_gradient = False + + cls = paddle.reshape(cls[cls_mask], [-1, nC]) + + # losses between predictions and targets (ground truth) + # In total 6 aspects are considered as losses: + # 4 for bounding box location, 2 for prediction confidence and classification seperately + L1_loss = nn.SmoothL1Loss(reduction='sum') + loss_x = self.coord_scale * L1_loss(paddle.cast(x, dtype="float32") * coord_mask, tx * coord_mask) / 2.0 + loss_y = self.coord_scale * L1_loss(paddle.cast(y, dtype="float32") * coord_mask, ty * coord_mask) / 2.0 + loss_w = self.coord_scale * L1_loss(paddle.cast(w * coord_mask, dtype="float32"), tw * coord_mask) / 2.0 + loss_h = self.coord_scale * L1_loss(paddle.cast(h * coord_mask, dtype="float32"), th * coord_mask) / 2.0 + loss_conf = nn.MSELoss(reduction='sum')(paddle.cast(conf, dtype="float32") * conf_mask, tconf * conf_mask) / 2.0 + + # try focal loss with gamma = 2 + loss_cls = self.class_scale * self.focalloss(cls, tcls) + + # sum of loss + loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + + return loss, nCorrect + + diff --git a/docs/src/paddlevideo/modeling/registry.py b/docs/src/paddlevideo/modeling/registry.py new file mode 100644 index 000000000..b8140e1c2 --- /dev/null +++ b/docs/src/paddlevideo/modeling/registry.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +BACKBONES = Registry('backbone') +HEADS = Registry('head') +RECOGNIZERS = Registry('recognizer') +SEGMENTERS = Registry('Segmenters') +LOCALIZERS = Registry('localizer') +PARTITIONERS = Registry('partitioner') +LOSSES = Registry('loss') +ROI_EXTRACTORS = Registry('roi_extractor') +DETECTORS = Registry('detectors') +BBOX_ASSIGNERS = Registry('bbox_assigner') +BBOX_SAMPLERS = Registry('bbox_sampler') +BBOX_CODERS = Registry('bbox_coder') +ESTIMATORS = Registry('estimator') +MULTIMODAL = Registry('multimodal') +SEGMENT = Registry('segment') diff --git a/docs/src/paddlevideo/modeling/samplers/__init__.py b/docs/src/paddlevideo/modeling/samplers/__init__.py new file mode 100644 index 000000000..0cf7f15e5 --- /dev/null +++ b/docs/src/paddlevideo/modeling/samplers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .random_sampler import RandomSampler + +__all__ = ['RandomSampler'] diff --git a/docs/src/paddlevideo/modeling/samplers/random_sampler.py b/docs/src/paddlevideo/modeling/samplers/random_sampler.py new file mode 100644 index 000000000..480845474 --- /dev/null +++ b/docs/src/paddlevideo/modeling/samplers/random_sampler.py @@ -0,0 +1,146 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np +from ..registry import BBOX_SAMPLERS + +class SamplingResult(): + """Bbox sampling result. """ + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = paddle.index_select(bboxes,pos_inds) + + # neg_inds may be empty + if neg_inds.shape[0]!=0: + self.neg_bboxes = paddle.index_select(bboxes,neg_inds) + else: + self.neg_bboxes=None + + self.pos_is_gt = paddle.index_select(gt_flags,pos_inds) + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = paddle.index_select(assign_result.gt_inds,pos_inds) - 1 + + if float(gt_bboxes.numel()) == 0: + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = paddle.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + + self.pos_gt_bboxes = paddle.index_select(gt_bboxes, self.pos_assigned_gt_inds) + + if assign_result.labels is not None: + self.pos_gt_labels = paddle.index_select(assign_result.labels, pos_inds) + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + if self.neg_bboxes is not None: + ret = paddle.concat([self.pos_bboxes, self.neg_bboxes]) + else: + # neg bbox may be empty + ret = self.pos_bboxes + return ret + + + +@BBOX_SAMPLERS.register() +class RandomSampler(): + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + **kwargs): + """Sample positive and negative bboxes. 
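+
+        Args:
+            assign_result: assignment result providing `gt_inds` (and optionally `labels`).
+            bboxes (paddle.Tensor): candidate boxes, at least [n, 4].
+            gt_bboxes (paddle.Tensor): ground-truth boxes, [k, 4].
+            gt_labels (paddle.Tensor, optional): labels of the ground-truth boxes;
+                required when `add_gt_as_proposals` is True and `gt_bboxes` is non-empty.
+        Returns:
+            SamplingResult: sampled positive/negative indices and the corresponding boxes.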
""" + + if len(bboxes.shape) < 2: + bboxes = bboxes[None, :] + + bboxes = bboxes[:, :4] + + gt_flags = paddle.full([bboxes.shape[0], ], 0, dtype='int32') + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + if gt_labels is None: + raise ValueError( + 'gt_labels must be given when add_gt_as_proposals is True') + bboxes = paddle.concat([gt_bboxes, bboxes]) + assign_result.add_gt_(gt_labels) + gt_ones = paddle.full([gt_bboxes.shape[0], ], 1, dtype='int32') + gt_flags = paddle.concat([gt_ones, gt_flags]) + + #1. 得到正样本的数量, inds + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + pos_inds = paddle.to_tensor(np.unique(pos_inds.numpy())) + + #2. 得到负样本的数量, inds + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + neg_inds = self._sample_neg( + assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + neg_inds = paddle.to_tensor(np.unique(neg_inds.numpy())) + + #3. 得到sampling result + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result + def random_choice(self, gallery, num): + """Random select some elements from the gallery. """ + assert len(gallery) >= num + + perm = paddle.arange(gallery.numel())[:num] + perm = paddle.randperm(gallery.numel())[:num] + rand_inds = paddle.index_select(gallery, perm) + return rand_inds + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + #1.首先看一下给的bboxes里面有哪些label是大于0的 得到了他们的index + pos_inds = paddle.nonzero(assign_result.gt_inds, as_tuple=False) + + #2. 只要这个pos_inds的数目不是0个 这些就都可以是positive sample + # 当pos_inds的数目小于num_expected(想要的sample的最大数目), 就直接用这个pos_inds + # 反之就从这么多index里随机采样num_expected个出来 + if float(pos_inds.numel()) != 0: + pos_inds = pos_inds.squeeze() + if float(pos_inds.numel()) <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Randomly sample some negative samples.""" + neg_inds = paddle.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if float(neg_inds.numel()) != 0: + neg_inds = neg_inds.squeeze() + if (float(neg_inds.numel())) <= float(num_expected): + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) diff --git a/docs/src/paddlevideo/modeling/weight_init.py b/docs/src/paddlevideo/modeling/weight_init.py new file mode 100644 index 000000000..472289526 --- /dev/null +++ b/docs/src/paddlevideo/modeling/weight_init.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn.initializer as init +import numpy as np +from scipy import special + + +def weight_init_(layer, + func, + weight_name=None, + bias_name=None, + bias_value=0.0, + **kwargs): + """ + In-place params init function. + Usage: + .. 
code-block:: python + + import paddle + import numpy as np + + data = np.ones([3, 4], dtype='float32') + linear = paddle.nn.Linear(4, 4) + input = paddle.to_tensor(data) + print(linear.weight) + linear(input) + + weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1) + print(linear.weight) + """ + + if hasattr(layer, 'weight') and layer.weight is not None: + getattr(init, func)(**kwargs)(layer.weight) + if weight_name is not None: + # override weight name + layer.weight.name = weight_name + + if hasattr(layer, 'bias') and layer.bias is not None: + init.Constant(bias_value)(layer.bias) + if bias_name is not None: + # override bias name + layer.bias.name = bias_name + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.") + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1]. + tmp = np.random.uniform(2 * l - 1, 2 * u - 1, + size=list(tensor.shape)).astype(np.float32) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tmp = special.erfinv(tmp) + + # Transform to proper mean, std + tmp *= (std * math.sqrt(2.0)) + tmp += mean + + # Clamp to ensure it's in the proper range + tmp = np.clip(tmp, a, b) + tensor.set_value(paddle.to_tensor(tmp)) + + return tensor + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = tensor.dim() + if dimensions < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'): + def _calculate_correct_fan(tensor, mode): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError( + "Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + return fan_in if mode == 'fan_in' else fan_out + + def calculate_gain(nonlinearity, param=None): + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + 
negative_slope**2)) + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + with paddle.no_grad(): + paddle.nn.initializer.Normal(0, std)(tensor) + return tensor diff --git a/docs/src/paddlevideo/solver/__init__.py b/docs/src/paddlevideo/solver/__init__.py new file mode 100644 index 000000000..01cf9cdd7 --- /dev/null +++ b/docs/src/paddlevideo/solver/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import build_optimizer +from .lr import build_lr diff --git a/docs/src/paddlevideo/solver/custom_lr.py b/docs/src/paddlevideo/solver/custom_lr.py new file mode 100644 index 000000000..bbf8d742a --- /dev/null +++ b/docs/src/paddlevideo/solver/custom_lr.py @@ -0,0 +1,338 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from paddle.optimizer.lr import * +import numpy as np +""" +PaddleVideo Learning Rate Schedule: +You can use paddle.optimizer.lr +or define your custom_lr in this file. +""" + + +class CustomWarmupCosineDecay(LRScheduler): + r""" + We combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + cosine_base_lr (float|int, optional): base learning rate in cosine schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CosineAnnealingDecay`` instance to schedule learning rate. 
+ """ + + def __init__(self, + warmup_start_lr, + warmup_epochs, + cosine_base_lr, + max_epoch, + num_iters, + last_epoch=-1, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.cosine_base_lr = cosine_base_lr + self.max_epoch = max_epoch + self.num_iters = num_iters + #call step() in base class, last_lr/last_epoch/base_lr will be update + super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch, + verbose=verbose) + + def step(self, epoch=None): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if self.last_epoch == -1: + self.last_epoch += 1 + else: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch): + return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) + + 1.0) * 0.5 + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr, + self.max_epoch) + lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr, + self.max_epoch) + + # Perform warm up. + if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + return lr + + +class CustomWarmupPiecewiseDecay(LRScheduler): + r""" + This op combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + step_base_lr (float|int, optional): base learning rate in step schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate. + """ + + def __init__(self, + warmup_start_lr, + warmup_epochs, + step_base_lr, + lrs, + gamma, + steps, + max_epoch, + num_iters, + last_epoch=0, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.step_base_lr = step_base_lr + self.lrs = lrs + self.gamma = gamma + self.steps = steps + self.max_epoch = max_epoch + self.num_iters = num_iters + self.last_epoch = last_epoch + self.last_lr = self.warmup_start_lr # used in first iter + self.verbose = verbose + self._var_name = None + + def step(self, epoch=None, rebuild=False): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. 
+ Returns: + None + """ + if epoch is None: + if not rebuild: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print( + 'step Epoch {}: {} set learning rate to {}.self.num_iters={}, 1/self.num_iters={}' + .format(self.last_epoch, self.__class__.__name__, self.last_lr, + self.num_iters, 1 / self.num_iters)) + + def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps, + max_epoch): + # get step index + steps = steps + [max_epoch] + for ind, step in enumerate(steps): + if cur_epoch < step: + break + if self.verbose: + print( + '_lr_func_steps_with_relative_lrs, cur_epoch {}: {}, steps {}, ind {}, step{}, max_epoch{}' + .format(cur_epoch, self.__class__.__name__, steps, ind, step, + max_epoch)) + + return lrs[ind - 1] * base_lr + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_steps_with_relative_lrs( + self.last_epoch, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + lr_end = self._lr_func_steps_with_relative_lrs( + self.warmup_epochs, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + + # Perform warm up. + if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + if self.verbose: + print( + 'get_lr, Epoch {}: {}, lr {}, lr_end {}, self.lrs{}, self.step_base_lr{}, self.steps{}, self.max_epoch{}' + .format(self.last_epoch, self.__class__.__name__, lr, lr_end, + self.lrs, self.step_base_lr, self.steps, + self.max_epoch)) + + return lr + + +class CustomPiecewiseDecay(PiecewiseDecay): + + def __init__(self, **kargs): + kargs.pop('num_iters') + super().__init__(**kargs) + + +class CustomWarmupCosineStepDecay(LRScheduler): + + def __init__(self, + warmup_iters, + warmup_ratio=0.1, + min_lr=0, + base_lr=3e-5, + max_epoch=30, + last_epoch=-1, + num_iters=None, + verbose=False): + + self.warmup_ratio = warmup_ratio + self.min_lr = min_lr + self.warmup_epochs = warmup_iters + self.warmup_iters = warmup_iters * num_iters + self.cnt_iters = 0 + self.cnt_epoch = 0 + self.num_iters = num_iters + self.tot_iters = max_epoch * num_iters + self.max_epoch = max_epoch + self.cosine_base_lr = base_lr # initial lr for all param groups + self.regular_lr = self.get_regular_lr() + super().__init__(last_epoch=last_epoch, verbose=verbose) + + def annealing_cos(self, start, end, factor, weight=1): + cos_out = math.cos(math.pi * factor) + 1 + return end + 0.5 * weight * (start - end) * cos_out + + def get_regular_lr(self): + progress = self.cnt_epoch + max_progress = self.max_epoch + target_lr = self.min_lr + return self.annealing_cos(self.cosine_base_lr, target_lr, progress / + max_progress) # self.cosine_base_lr + + def get_warmup_lr(self, cur_iters): + k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio) + warmup_lr = self.regular_lr * (1 - k) # 3e-5 * (1-k) + return warmup_lr + + def step(self, epoch=None): + self.regular_lr = self.get_regular_lr() + self.last_lr = self.get_lr() + self.cnt_epoch = (self.cnt_iters + + 1) // self.num_iters # update step with iters + self.cnt_iters += 1 + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def get_lr(self): + """Define lr policy""" + cur_iter = self.cnt_iters + if cur_iter >= self.warmup_iters: + return self.regular_lr + else: + warmup_lr = self.get_warmup_lr(cur_iter) + return 
warmup_lr + + +class CustomWarmupAdjustDecay(LRScheduler): + r""" + We combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + step_base_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + lr_decay_rate (float|int, optional): base learning rate decay rate. + step (int): step in change learning rate. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CosineAnnealingDecay`` instance to schedule learning rate. + """ + + def __init__(self, + step_base_lr, + warmup_epochs, + lr_decay_rate, + boundaries, + num_iters=None, + last_epoch=-1, + verbose=False): + self.step_base_lr = step_base_lr + self.warmup_epochs = warmup_epochs + self.lr_decay_rate = lr_decay_rate + self.boundaries = boundaries + self.num_iters = num_iters + #call step() in base class, last_lr/last_epoch/base_lr will be update + super(CustomWarmupAdjustDecay, self).__init__(last_epoch=last_epoch, + verbose=verbose) + + def step(self, epoch=None): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if self.last_epoch == -1: + self.last_epoch += 1 + else: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def get_lr(self): + if self.last_epoch < self.warmup_epochs: + lr = self.step_base_lr * (self.last_epoch + 1) / self.warmup_epochs + else: + lr = self.step_base_lr * (self.lr_decay_rate**np.sum( + self.last_epoch >= np.array(self.boundaries))) + return lr diff --git a/docs/src/paddlevideo/solver/lr.py b/docs/src/paddlevideo/solver/lr.py new file mode 100644 index 000000000..3a56fad16 --- /dev/null +++ b/docs/src/paddlevideo/solver/lr.py @@ -0,0 +1,52 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from paddle.optimizer.lr import LRScheduler + +from . import custom_lr + + +def build_lr(cfg: Dict, num_iters: int) -> LRScheduler: + """Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer. + In configuration: + learning_rate: + name: 'PiecewiseDecay' + boundaries: [20, 60] + values: [0.00025, 0.000025, 0.0000025] + + Args: + cfg (Dict): learning rate configuration. 
+ num_iters (int): The number of iterations that may be used when calculating the learning rate + + Returns: + LRScheduler: learning rate scheduler. + """ + + cfg_copy = cfg.copy() + + #when learning_rate is LRScheduler + if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'], + dict): + cfg_copy['learning_rate'] = build_lr( + cfg_copy['learning_rate'], + num_iters) #not support only inner iter_step + + lr_name = cfg_copy.pop('name') + if cfg_copy.get('iter_step'): + cfg_copy['num_iters'] = num_iters + cfg_copy.pop('iter_step') + + return getattr(custom_lr, lr_name)(**cfg_copy) diff --git a/docs/src/paddlevideo/solver/optimizer.py b/docs/src/paddlevideo/solver/optimizer.py new file mode 100644 index 000000000..8924e2135 --- /dev/null +++ b/docs/src/paddlevideo/solver/optimizer.py @@ -0,0 +1,136 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +# for python3.11 +if not hasattr(inspect, 'getargspec'): + inspect.getargspec = inspect.getfullargspec + +from typing import Dict + +import paddle +from paddle.optimizer.lr import LRScheduler +from paddle.regularizer import L1Decay, L2Decay +from paddlevideo.utils import get_logger + + +def build_optimizer(cfg: Dict, + lr_scheduler: LRScheduler, + model: paddle.nn.Layer, + use_amp: bool = False, + amp_level: str = None) -> paddle.optimizer.Optimizer: + """Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration. + + In configuration: + OPTIMIZER: + name: Momentum + momentum: 0.9 + weight_decay: 0.001 + or + + OPTIMIZER: + name: Momentum + momentum: 0.9 + weight_decay: + name: "L1" + value: 0.001 + + Momentum optimizer will be applied to optimize network and L1Decay regularizer will be applied to avoid overfit. + + OPTIMIZER: + name: Adam + weight_decay: + name: "L2" + value: 0.001 + + Adam optimizer will be applied to optimize network and L2Decay regularizer will applied to avoid overfit. + + Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details. + + Args: + cfg (Dict): optimizer configuration. + lr_scheduler (LRScheduler): learning rate scheduler. + model (paddle.nn.Layer, optional): model which contains parameters to be optimized. Defaults to None. + use_amp (bool, optional): Whether use amp. Defaults to False. + amp_level (str, optional): amp level when amp is enabled. Defaults to None. + + + Returns: + paddle.optimizer.Optimizer: an optimizer for the input model. + """ + logger = get_logger("paddlevideo") + cfg_copy = cfg.copy() + # NOTE: check none and illegal cfg!!! 
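    # Walk-through with hypothetical values (illustration only, not part of the original file):
    # after the copy above, cfg_copy might look like
    #   {'name': 'Momentum', 'momentum': 0.9, 'learning_rate': {...},
    #    'weight_decay': {'name': 'L2', 'value': 1e-4}}
    # The steps below pop 'name', turn the 'weight_decay'/'grad_clip' sub-dicts into paddle
    # regularizer/clip objects, drop 'learning_rate' (the scheduler object is passed in
    # separately), and finally instantiate getattr(paddle.optimizer, opt_name)(...).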
+ opt_name = cfg_copy.pop('name') + # deal with weight decay + if cfg_copy.get('weight_decay'): + if isinstance(cfg_copy.get('weight_decay'), + float): # just a float factor + cfg_copy['weight_decay'] = cfg_copy.get('weight_decay') + elif 'L1' in cfg_copy.get('weight_decay').get( + 'name').upper(): # specify L1 wd and its float factor + cfg_copy['weight_decay'] = L1Decay( + cfg_copy.get('weight_decay').get('value')) + elif 'L2' in cfg_copy.get('weight_decay').get( + 'name').upper(): # specify L2 wd and its float factor + cfg_copy['weight_decay'] = L2Decay( + cfg_copy.get('weight_decay').get('value')) + else: + raise ValueError + + # deal with grad clip + if cfg_copy.get('grad_clip'): + if isinstance(cfg_copy.get('grad_clip'), float): + cfg_copy['grad_clip'] = cfg_copy.get('grad_clip').get('value') + elif 'global' in cfg_copy.get('grad_clip').get('name').lower(): + cfg_copy['grad_clip'] = paddle.nn.ClipGradByGlobalNorm( + cfg_copy.get('grad_clip').get('value')) + else: + raise ValueError + + # For optimizers whose weight decay cannot be applied via the L2Decay regularizer, e.g. AdamW, exclude the named parameters from decay + if cfg_copy.get('no_weight_decay_name'): + no_weight_decay_name = cfg_copy.pop('no_weight_decay_name') + no_weight_decay_name_list = no_weight_decay_name.split(' ') + + # NOTE: use param.name not name + no_weight_decay_param_list = [ + param.name for name, param in model.named_parameters() + if any(key_word in name for key_word in no_weight_decay_name_list) + ] # get the full param name of no weight decay + + _apply_decay_param_fun = lambda name: name not in no_weight_decay_param_list + cfg_copy['apply_decay_param_fun'] = _apply_decay_param_fun + logger.info( + f"No weight Decay list :({len(no_weight_decay_param_list)})", + no_weight_decay_param_list) + + cfg_copy.pop('learning_rate') + + # set multi_precision + optimizer_setting = { + 'learning_rate': lr_scheduler, + 'parameters': model.parameters(), + **cfg_copy + } + optimizer_init_args = inspect.getargspec( + getattr(paddle.optimizer, opt_name).__init__).args + if use_amp and amp_level == "O2" and "multi_precision" in optimizer_init_args: + # support "multi_precision" arg in optimizer's __init__ function. + optimizer_setting.update({"multi_precision": True}) + logger.info( + "Set multi_precision=True for optimizer when use_amp=True and amp_level='O2'" + ) + + return getattr(paddle.optimizer, opt_name)(**optimizer_setting) diff --git a/docs/src/paddlevideo/tasks/__init__.py b/docs/src/paddlevideo/tasks/__init__.py new file mode 100644 index 000000000..4d43f0955 --- /dev/null +++ b/docs/src/paddlevideo/tasks/__init__.py @@ -0,0 +1,20 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
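A minimal usage sketch for the two solver builders above (illustrative only; it assumes a parsed config `cfg` whose OPTIMIZER section resembles the docstring examples, plus a `model` and `train_loader` built elsewhere, as in the training entry points that follow):

from paddlevideo.solver import build_lr, build_optimizer

# Hypothetical OPTIMIZER section of the YAML config:
#   OPTIMIZER:
#     name: 'Momentum'
#     momentum: 0.9
#     learning_rate:
#       name: 'PiecewiseDecay'
#       boundaries: [20, 60]
#       values: [0.00025, 0.000025, 0.0000025]
#     weight_decay:
#       name: 'L2'
#       value: 0.0001

lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))  # scheduler built from config
optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model)    # Momentum with L2Decay regularizer

# Typical training step (sketch): backward, update, then advance the schedule:
# loss.backward(); optimizer.step(); optimizer.clear_grad(); lr.step()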
+ +from .train import train_model +from .test import test_model +from .train_dali import train_dali +from .train_multigrid import train_model_multigrid + +__all__ = ['train_model', 'test_model', 'train_dali', 'train_model_multigrid'] diff --git a/docs/src/paddlevideo/tasks/test.py b/docs/src/paddlevideo/tasks/test.py new file mode 100644 index 000000000..31c8653df --- /dev/null +++ b/docs/src/paddlevideo/tasks/test.py @@ -0,0 +1,90 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddlevideo.utils import get_logger, load + +from ..loader.builder import build_dataloader, build_dataset +from ..metrics import build_metric +from ..modeling.builder import build_model + +logger = get_logger("paddlevideo") + + +@paddle.no_grad() +def test_model(cfg, weights, parallel=True): + """Test model entry + + Args: + cfg (dict): configuration. + weights (str): weights path to load. + parallel (bool): Whether to do multi-cards testing. Default: True. + + """ + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + # 1. Construct model. + if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'): + cfg.MODEL.backbone.pretrained = '' # disable pretrain model init + model = build_model(cfg.MODEL) + + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dataset and dataloader. + cfg.DATASET.test.test_mode = True + dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test)) + batch_size = cfg.DATASET.get("test_batch_size", 8) + + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + num_workers = cfg.DATASET.get('test_num_workers', num_workers) + dataloader_setting = dict(batch_size=batch_size, + num_workers=num_workers, + places=places, + drop_last=False, + shuffle=False) + + data_loader = build_dataloader( + dataset, **dataloader_setting) if cfg.model_name not in ['CFBI' + ] else dataset + + model.eval() + + state_dicts = load(weights) + model.set_state_dict(state_dicts) + + # add params to metrics + cfg.METRIC.data_size = len(dataset) + cfg.METRIC.batch_size = batch_size + Metric = build_metric(cfg.METRIC) + + if cfg.MODEL.framework == "FastRCNN": + Metric.set_dataset_info(dataset.info, len(dataset)) + + for batch_id, data in enumerate(data_loader): + if cfg.model_name in [ + 'CFBI' + ]: # for VOS task, dataset for video and dataloader for frames in each video + Metric.update(batch_id, data, model) + else: + outputs = model(data, mode='test') + Metric.update(batch_id, data, outputs) + Metric.accumulate() diff --git a/docs/src/paddlevideo/tasks/train.py b/docs/src/paddlevideo/tasks/train.py new file mode 100644 index 000000000..451ec5db8 --- /dev/null +++ b/docs/src/paddlevideo/tasks/train.py @@ -0,0 +1,426 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import time + +import paddle +import paddle.amp as amp +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from paddlevideo.utils import (add_profiler_step, build_record, get_logger, + load, log_batch, log_epoch, mkdir, save) + +from ..loader.builder import build_dataloader, build_dataset +from ..metrics.ava_utils import collect_results_cpu +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN + + +def train_model(cfg, + weights=None, + parallel=True, + validate=True, + use_amp=False, + amp_level=None, + max_iters=None, + use_fleet=False, + profiler_options=None): + """Train model entry + + Args: + cfg (dict): configuration. + weights (str, optional): weights path for finetuning. Defaults to None. + parallel (bool, optional): whether multi-cards training. Defaults to True. + validate (bool, optional): whether to do evaluation. Defaults to True. + use_amp (bool, optional): whether to use automatic mixed precision during training. Defaults to False. + amp_level (str, optional): amp optmization level, must be 'O1' or 'O2' when use_amp is True. Defaults to None. + max_iters (int, optional): max running iters in an epoch. Defaults to None. + use_fleet (bool, optional): whether to use fleet. Defaults to False. + profiler_options (str, optional): configuration for the profiler function. Defaults to None. 
+ + """ + if use_fleet: + fleet.init(is_collective=True) + + logger = get_logger("paddlevideo") + batch_size = cfg.DATASET.get('batch_size', 8) + valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size) + + # gradient accumulation settings + use_gradient_accumulation = cfg.get('GRADIENT_ACCUMULATION', None) + if use_gradient_accumulation and dist.get_world_size() >= 1: + global_batch_size = cfg.GRADIENT_ACCUMULATION.get( + 'global_batch_size', None) + num_gpus = dist.get_world_size() + + assert isinstance( + global_batch_size, int + ), f"global_batch_size must be int, but got {type(global_batch_size)}" + assert batch_size <= global_batch_size, \ + f"global_batch_size({global_batch_size}) must not be less than batch_size({batch_size})" + + cur_global_batch_size = batch_size * num_gpus # The number of batches calculated by all GPUs at one time + assert global_batch_size % cur_global_batch_size == 0, \ + f"The global batchsize({global_batch_size}) must be divisible by cur_global_batch_size({cur_global_batch_size})" + cfg.GRADIENT_ACCUMULATION[ + "num_iters"] = global_batch_size // cur_global_batch_size + # The number of iterations required to reach the global batchsize + logger.info( + f"Using gradient accumulation training strategy, " + f"global_batch_size={global_batch_size}, " + f"num_gpus={num_gpus}, " + f"num_accumulative_iters={cfg.GRADIENT_ACCUMULATION.num_iters}") + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + valid_num_workers = cfg.DATASET.get('valid_num_workers', num_workers) + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + + # 1. Construct model + model = build_model(cfg.MODEL) + + if cfg.get('to_static', False): + specs = None + model = paddle.jit.to_static(model, input_spec=specs) + logger.info( + "Successfully to apply @to_static with specs: {}".format(specs)) + + # 2. Construct dataset and dataloader for training and evaluation + train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train)) + train_dataloader_setting = dict( + batch_size=batch_size, + num_workers=num_workers, + collate_fn_cfg=cfg.get('MIX', None), + places=places) + train_loader = build_dataloader(train_dataset, **train_dataloader_setting) + + if validate: + valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid)) + validate_dataloader_setting = dict( + batch_size=valid_batch_size, + num_workers=valid_num_workers, + places=places, + drop_last=False, + shuffle=cfg.DATASET.get( + 'shuffle_valid', + False) # NOTE: attention_LSTM needs to shuffle valid data. + ) + valid_loader = build_dataloader(valid_dataset, + **validate_dataloader_setting) + + # 3. Construct learning rate scheduler(lr) and optimizer + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer( + cfg.OPTIMIZER, lr, model=model, use_amp=use_amp, amp_level=amp_level) + + # 4. 
Construct scalar and convert parameters for amp(optional) + if use_amp: + scaler = amp.GradScaler( + init_loss_scaling=2.0**16, + incr_every_n_steps=2000, + decr_every_n_nan_or_inf=1) + # convert model parameters to fp16 when amp_level is O2(pure fp16) + model, optimizer = amp.decorate( + models=model, + optimizers=optimizer, + level=amp_level, + master_weight=True, + save_dtype=None) + # NOTE: save_dtype is set to float32 now. + logger.info(f"Training in amp mode, amp_level={amp_level}.") + else: + assert amp_level is None, f"amp_level must be None when training in fp32 mode, but got {amp_level}." + logger.info("Training in fp32 mode.") + + # 5. Resume(optional) + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join(output_dir, + model_name + f"_epoch_{resume_epoch:05d}") + resume_model_dict = load(filename + '.pdparams') + resume_opt_dict = load(filename + '.pdopt') + model.set_state_dict(resume_model_dict) + optimizer.set_state_dict(resume_opt_dict) + logger.info("Resume from checkpoint: {}".format(filename)) + + # 6. Finetune(optional) + if weights: + assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it." + model_dict = load(weights) + model.set_state_dict(model_dict) + logger.info("Finetune from checkpoint: {}".format(weights)) + + # 7. Parallelize(optional) + if parallel: + model = paddle.DataParallel(model) + + if use_fleet: + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + # 8. Train Model + best = 0.0 + for epoch in range(0, cfg.epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch + 1}] <= resume_epoch: [{resume_epoch}], continue..." + ) + continue + model.train() + + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + """Next two line of code only used in test_tipc, + ignore it most of the time""" + if max_iters is not None and i >= max_iters: + break + + record_list['reader_time'].update(time.time() - tic) + + # Collect performance information when profiler_options is activate + add_profiler_step(profiler_options) + + # 8.1 forward + # AMP # + if use_amp: + with amp.auto_cast( + custom_black_list={"reduce_mean", "conv3d"}, + level=amp_level): + outputs = model(data, mode='train') + avg_loss = outputs['loss'] + if use_gradient_accumulation: + # clear grad at when epoch begins + if i == 0: + optimizer.clear_grad() + # Loss normalization + avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters + # Loss scaling + scaled = scaler.scale(avg_loss) + # 8.2 backward + scaled.backward() + # 8.3 minimize + if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0: + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: # general case + # Loss scaling + scaled = scaler.scale(avg_loss) + # 8.2 backward + scaled.backward() + # 8.3 minimize + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + outputs = model(data, mode='train') + avg_loss = outputs['loss'] + if use_gradient_accumulation: + # clear grad at when epoch begins + if i == 0: + optimizer.clear_grad() + # Loss normalization + avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters + # 8.2 backward + avg_loss.backward() + # 8.3 minimize + if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0: + optimizer.step() + optimizer.clear_grad() + else: # general case + # 8.2 backward + avg_loss.backward() + # 8.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + 
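            # record_list maps names to AverageMeter objects (see build_record); the current
            # learning rate and each loss/metric returned by the model are accumulated per
            # batch below and reported every `log_interval` iterations.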
record_list['lr'].update(optimizer.get_lr(), batch_size) + for name, value in outputs.items(): + if name in record_list: + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec,".format( + batch_size / record_list["batch_time"].val) + cur_progress = ((i + 1) + epoch * len(train_loader)) / ( + len(train_loader) * cfg.epochs) + eta = int(record_list["batch_time"].sum * + (1 - cur_progress) / cur_progress + 0.5) + log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips, + eta) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "avg_ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + def evaluate(best): + model.eval() + results = [] + record_list = build_record(cfg.MODEL) + record_list.pop('lr') + tic = time.time() + if parallel: + rank = dist.get_rank() + # single_gpu_test and multi_gpu_test + for i, data in enumerate(valid_loader): + """Next two line of code only used in test_tipc, + ignore it most of the time""" + if max_iters is not None and i >= max_iters: + break + + if use_amp: + with amp.auto_cast( + custom_black_list={"reduce_mean", "conv3d"}, + level=amp_level): + outputs = model(data, mode='valid') + else: + outputs = model(data, mode='valid') + + if cfg.MODEL.framework == "FastRCNN": + results.extend(outputs) + + # log_record + if cfg.MODEL.framework != "FastRCNN": + for name, value in outputs.items(): + if name in record_list: + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + valid_batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "val", ips) + + if cfg.MODEL.framework == "FastRCNN": + if parallel: + results = collect_results_cpu(results, len(valid_dataset)) + if not parallel or (parallel and rank == 0): + eval_res = valid_dataset.evaluate(results) + for name, value in eval_res.items(): + record_list[name].update(value, valid_batch_size) + + ips = "avg_ips: {:.5f} instance/sec.".format( + valid_batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "val", ips) + + best_flag = False + if cfg.MODEL.framework == "FastRCNN" and (not parallel or + (parallel and rank == 0)): + if record_list["mAP@0.5IOU"].val > best: + best = record_list["mAP@0.5IOU"].val + best_flag = True + return best, best_flag + + if cfg.MODEL.framework == "YOWOLocalizer" and (not parallel or + (parallel and rank == 0)): + if record_list["fscore"].avg > best: + best = record_list["fscore"].avg + best_flag = True + return best, best_flag + + # forbest2, cfg.MODEL.framework != "FastRCNN": + for top_flag in ['hit_at_one', 'top1', 'rmse', "F1@0.50"]: + if record_list.get(top_flag): + if top_flag != 'rmse' and record_list[top_flag].avg > best: + best = record_list[top_flag].avg + best_flag = True + elif top_flag == 'rmse' and ( + best == 0.0 or record_list[top_flag].avg < best): + best = record_list[top_flag].avg + best_flag = True + + return best, best_flag + + # use precise bn to improve acc + if cfg.get("PRECISEBN") and ( + epoch % 
cfg.PRECISEBN.preciseBN_interval == 0 + or epoch == cfg.epochs - 1): + do_preciseBN(model, train_loader, parallel, + min(cfg.PRECISEBN.num_iters_preciseBN, + len(train_loader)), use_amp, amp_level) + + # 9. Validation + if validate and (epoch % cfg.get("val_interval", 1) == 0 + or epoch == cfg.epochs - 1): + with paddle.no_grad(): + best, save_best_flag = evaluate(best) + # save best + if save_best_flag: + save(optimizer.state_dict(), + osp.join(output_dir, model_name + "_best.pdopt")) + save_student_model_flag = True if "Distillation" in cfg.MODEL.framework else False + save( + model.state_dict(), + osp.join(output_dir, model_name + "_best.pdparams"), + save_student_model=save_student_model_flag) + if model_name == "AttentionLstm": + logger.info( + f"Already save the best model (hit_at_one){best}") + elif cfg.MODEL.framework == "FastRCNN": + logger.info( + f"Already save the best model (mAP@0.5IOU){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework == "DepthEstimator": + logger.info( + f"Already save the best model (rmse){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework in ['MSTCN', 'ASRF']: + logger.info( + f"Already save the best model (F1@0.50){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework in ['YOWOLocalizer']: + logger.info( + f"Already save the best model (fsocre){int(best * 10000) / 10000}" + ) + else: + logger.info( + f"Already save the best model (top1 acc){int(best * 10000) / 10000}" + ) + + # 10. Save model and optimizer + if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1: + save(optimizer.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch + 1:05d}.pdopt")) + save(model.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch + 1:05d}.pdparams")) + + logger.info(f'training {model_name} finished') diff --git a/docs/src/paddlevideo/tasks/train_dali.py b/docs/src/paddlevideo/tasks/train_dali.py new file mode 100644 index 000000000..8dd0a20f5 --- /dev/null +++ b/docs/src/paddlevideo/tasks/train_dali.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os.path as osp + +import paddle +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN +from paddlevideo.utils import get_logger, coloring +from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch, + save, load, mkdir) +from paddlevideo.loader import TSN_Dali_loader, get_input_data +""" +We only supported DALI training for TSN model now. +""" + + +def train_dali(cfg, weights=None, parallel=True): + """Train model entry + + Args: + cfg (dict): configuration. + weights (str): weights path for finetuning. + parallel (bool): Whether multi-cards training. Default: True. 
+ + """ + + logger = get_logger("paddlevideo") + batch_size = cfg.DALI_LOADER.get('batch_size', 8) + places = paddle.set_device('gpu') + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + + # 1. Construct model + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dali dataloader + train_loader = TSN_Dali_loader(cfg.DALI_LOADER).build_dali_reader() + + # 3. Construct solver. + lr = build_lr(cfg.OPTIMIZER.learning_rate, None) + optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model) + + # Resume + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join(output_dir, + model_name + f"_epoch_{resume_epoch:05d}") + resume_model_dict = load(filename + '.pdparams') + resume_opt_dict = load(filename + '.pdopt') + model.set_state_dict(resume_model_dict) + optimizer.set_state_dict(resume_opt_dict) + + # Finetune: + if weights: + assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it." + model_dict = load(weights) + model.set_state_dict(model_dict) + + # 4. Train Model + for epoch in range(0, cfg.epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... " + ) + continue + model.train() + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + data = get_input_data(data) + record_list['reader_time'].update(time.time() - tic) + # 4.1 forward + outputs = model(data, mode='train') + # 4.2 backward + avg_loss = outputs['loss'] + avg_loss.backward() + # 4.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + record_list['lr'].update(optimizer._global_learning_rate(), + batch_size) + for name, value in outputs.items(): + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + # use precise bn to improve acc + if cfg.get("PRECISEBN") and (epoch % cfg.PRECISEBN.preciseBN_interval + == 0 or epoch == cfg.epochs - 1): + do_preciseBN( + model, train_loader, parallel, + min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader))) + + # 5. Save model and optimizer + if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1: + save( + optimizer.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch+1:05d}.pdopt")) + save( + model.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch+1:05d}.pdparams")) + + logger.info(f'training {model_name} finished') diff --git a/docs/src/paddlevideo/tasks/train_multigrid.py b/docs/src/paddlevideo/tasks/train_multigrid.py new file mode 100644 index 000000000..19e756fe9 --- /dev/null +++ b/docs/src/paddlevideo/tasks/train_multigrid.py @@ -0,0 +1,335 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os.path as osp + +import paddle +import paddle.distributed as dist + +from ..loader.builder import build_dataloader, build_dataset +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN +from paddlevideo.utils import get_logger, coloring +from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch, + save, load, mkdir) +from paddlevideo.utils.multigrid import MultigridSchedule, aggregate_sub_bn_stats, subn_load, subn_save, is_eval_epoch + + +def construct_loader(cfg, places, validate, precise_bn, num_iters_precise_bn, + world_size): + batch_size = cfg.DATASET.get('batch_size', 2) + train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train)) + precise_bn_dataloader_setting = dict( + batch_size=batch_size, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + ) + if precise_bn: + cfg.DATASET.train.num_samples_precise_bn = num_iters_precise_bn * batch_size * world_size + precise_bn_dataset = build_dataset((cfg.DATASET.train, + cfg.PIPELINE.train)) + precise_bn_loader = build_dataloader(precise_bn_dataset, + **precise_bn_dataloader_setting) + cfg.DATASET.train.num_samples_precise_bn = None + else: + precise_bn_loader = None + + if cfg.MULTIGRID.SHORT_CYCLE: + # get batch size list in short cycle schedule + bs_factor = [ + int( + round((float(cfg.PIPELINE.train.transform[1]['MultiCrop'][ + 'target_size']) / (s * cfg.MULTIGRID.default_crop_size)) + **2)) for s in cfg.MULTIGRID.short_cycle_factors + ] + batch_sizes = [ + batch_size * bs_factor[0], + batch_size * bs_factor[1], + batch_size, + ] + train_dataloader_setting = dict( + batch_size=batch_sizes, + multigrid=True, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + ) + else: + train_dataloader_setting = precise_bn_dataloader_setting + + train_loader = build_dataloader(train_dataset, **train_dataloader_setting) + if validate: + valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid)) + validate_dataloader_setting = dict( + batch_size=batch_size, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + drop_last=False, + shuffle=False) + valid_loader = build_dataloader(valid_dataset, + **validate_dataloader_setting) + else: + valid_loader = None + + return train_loader, valid_loader, precise_bn_loader + + +def build_trainer(cfg, places, parallel, validate, precise_bn, + num_iters_precise_bn, world_size): + """ + Build training model and its associated tools, including optimizer, + dataloaders and meters. + Args: + cfg (CfgNode): configs. + Returns: + model: training model. + optimizer: optimizer. + train_loader: training data loader. + val_loader: validatoin data loader. + precise_bn_loader: training data loader for computing + precise BN. 
+ """ + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + train_loader, valid_loader, precise_bn_loader = \ + construct_loader(cfg, + places, + validate, + precise_bn, + num_iters_precise_bn, + world_size, + ) + + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model) + + return ( + model, + lr, + optimizer, + train_loader, + valid_loader, + precise_bn_loader, + ) + + +def train_model_multigrid(cfg, world_size=1, validate=True): + """Train model entry + + Args: + cfg (dict): configuration. + parallel (bool): Whether multi-card training. Default: True + validate (bool): Whether to do evaluation. Default: False. + + """ + # Init multigrid. + multigrid = None + if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE: + multigrid = MultigridSchedule() + cfg = multigrid.init_multigrid(cfg) + if cfg.MULTIGRID.LONG_CYCLE: + cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0) + multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule] + + parallel = world_size != 1 + logger = get_logger("paddlevideo") + batch_size = cfg.DATASET.get('batch_size', 2) + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + local_rank = dist.ParallelEnv().local_rank + precise_bn = cfg.get("PRECISEBN") + num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN + + # 1. Construct model + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dataloader + train_loader, valid_loader, precise_bn_loader = \ + construct_loader(cfg, + places, + validate, + precise_bn, + num_iters_precise_bn, + world_size, + ) + + # 3. Construct optimizer + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer( + cfg.OPTIMIZER, lr, parameter_list=model.parameters()) + + # Resume + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join( + output_dir, + model_name + str(local_rank) + '_' + f"{resume_epoch:05d}") + subn_load(model, filename, optimizer) + + # 4. Train Model + best = 0. + total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor) + for epoch in range(total_epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... 
" + ) + continue + + if cfg.MULTIGRID.LONG_CYCLE: + cfg, changed = multigrid.update_long_cycle(cfg, epoch) + if changed: + logger.info("====== Rebuild model/optimizer/loader =====") + ( + model, + lr, + optimizer, + train_loader, + valid_loader, + precise_bn_loader, + ) = build_trainer(cfg, places, parallel, validate, precise_bn, + num_iters_precise_bn, world_size) + + #load checkpoint after re-build model + if epoch != 0: + #epoch no need to -1, haved add 1 when save + filename = osp.join( + output_dir, + model_name + str(local_rank) + '_' + f"{(epoch):05d}") + subn_load(model, filename, optimizer) + #update lr last epoch, not to use saved params + lr.last_epoch = epoch + lr.step(rebuild=True) + + model.train() + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + record_list['reader_time'].update(time.time() - tic) + # 4.1 forward + outputs = model(data, mode='train') + # 4.2 backward + avg_loss = outputs['loss'] + avg_loss.backward() + # 4.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + record_list['lr'].update( + float(optimizer._global_learning_rate()), batch_size) + for name, value in outputs.items(): + record_list[name].update(float(value), batch_size) + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, total_epochs, "train", ips) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + def evaluate(best): + model.eval() + record_list = build_record(cfg.MODEL) + record_list.pop('lr') + tic = time.time() + for i, data in enumerate(valid_loader): + outputs = model(data, mode='valid') + + # log_record + for name, value in outputs.items(): + record_list[name].update(float(value), batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, total_epochs, "val", + ips) + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "val", ips) + + best_flag = False + if record_list.get('top1') and record_list['top1'].avg > best: + best = record_list['top1'].avg + best_flag = True + return best, best_flag + + # use precise bn to improve acc + if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule): + logger.info(f"do precise BN in {epoch+1} ...") + do_preciseBN(model, precise_bn_loader, parallel, + min(num_iters_precise_bn, len(precise_bn_loader))) + + # aggregate sub_BN stats + logger.info("Aggregate sub_BatchNorm stats...") + aggregate_sub_bn_stats(model) + + # 5. 
Validation + if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule): + logger.info(f"eval in {epoch+1} ...") + with paddle.no_grad(): + best, save_best_flag = evaluate(best) + # save best + if save_best_flag: + save(optimizer.state_dict(), + osp.join(output_dir, model_name + "_best.pdopt")) + save(model.state_dict(), + osp.join(output_dir, model_name + "_best.pdparams")) + logger.info( + f"Already save the best model (top1 acc){int(best * 10000) / 10000}" + ) + + # 6. Save model and optimizer + if is_eval_epoch( + cfg, epoch, + total_epochs, multigrid.schedule) or epoch % cfg.get( + "save_interval", 10) == 0 or epoch in multi_save_epoch: + logger.info("[Save parameters] ======") + subn_save(output_dir, model_name + str(local_rank) + '_', epoch + 1, + model, optimizer) + + logger.info(f'training {model_name} finished') diff --git a/docs/src/paddlevideo/utils/__init__.py b/docs/src/paddlevideo/utils/__init__.py new file mode 100644 index 000000000..d18561d76 --- /dev/null +++ b/docs/src/paddlevideo/utils/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import Registry +from .build_utils import build +from .config import * +from .logger import setup_logger, coloring, get_logger +from .record import AverageMeter, build_record, log_batch, log_epoch +from .dist_utils import get_dist_info, main_only +from .save_load import save, load, load_ckpt, mkdir +from .precise_bn import do_preciseBN +from .profiler import add_profiler_step +__all__ = ['Registry', 'build'] diff --git a/docs/src/paddlevideo/utils/build_utils.py b/docs/src/paddlevideo/utils/build_utils.py new file mode 100644 index 000000000..73c0ca46b --- /dev/null +++ b/docs/src/paddlevideo/utils/build_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def build(cfg, registry, key='name'): + """Build a module from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key. + registry (XXX): The registry to search the type from. + key (str): the key. + Returns: + obj: The constructed object. 
+ """ + + assert isinstance(cfg, dict) and key in cfg + + cfg_copy = cfg.copy() + obj_type = cfg_copy.pop(key) + + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError('{} is not in the {} registry'.format( + obj_type, registry.name)) + return obj_cls(**cfg_copy) diff --git a/docs/src/paddlevideo/utils/config.py b/docs/src/paddlevideo/utils/config.py new file mode 100644 index 000000000..f4d794116 --- /dev/null +++ b/docs/src/paddlevideo/utils/config.py @@ -0,0 +1,174 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import yaml +from paddlevideo.utils.logger import coloring, get_logger, setup_logger + +__all__ = ['get_config'] + +logger = setup_logger("./", name="paddlevideo", level="INFO") + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + placeholder = "-" * 60 + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", coloring(k, + "HEADER"))) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", + coloring(str(k), "HEADER"))) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", + coloring(k, "HEADER"), + coloring(v, "OKGREEN"))) + + if k.isupper(): + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + print_dict(config) + + +def check_config(config): + """ + Check config + """ + pass + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + logger.warning('A new filed ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + assert ks[0] in dl, ( + '({}) doesn\'t exist in {}, a new dict field is invalid'.format( + ks[0], dl)) + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + epochs=20', + 'PIPELINE.train.transform.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert isinstance(opt, + str), ("option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + + return config + + +def get_config(fname, overrides=None, show=True): + """ + Read config from file + """ + assert os.path.exists(fname), ('config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + check_config(config) + return config diff --git a/docs/src/paddlevideo/utils/dist_utils.py b/docs/src/paddlevideo/utils/dist_utils.py new file mode 100644 index 000000000..7659e88c1 --- /dev/null +++ b/docs/src/paddlevideo/utils/dist_utils.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import functools + +import paddle +import paddle.distributed as dist + +def get_dist_info(): + world_size = dist.get_world_size() + rank = dist.get_rank() + return rank, world_size + +def main_only(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + return wrapper diff --git a/docs/src/paddlevideo/utils/logger.py b/docs/src/paddlevideo/utils/logger.py new file mode 100644 index 000000000..e9791b89b --- /dev/null +++ b/docs/src/paddlevideo/utils/logger.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import datetime + +from paddle.distributed import ParallelEnv + + + +Color = { + 'RED': '\033[31m', + 'HEADER': '\033[35m', # deep purple + 'PURPLE': '\033[95m', # purple + 'OKBLUE': '\033[94m', + 'OKGREEN': '\033[92m', + 'WARNING': '\033[93m', + 'FAIL': '\033[91m', + 'ENDC': '\033[0m' +} + + +def coloring(message, color="OKGREEN"): + assert color in Color.keys() + if os.environ.get('COLORING', True): + return Color[color] + str(message) + Color["ENDC"] + else: + return message + + +logger_initialized = [] + + +def setup_logger(output=None, name="paddlevideo", level="INFO"): + """ + Initialize the paddlevideo logger and set its verbosity level to "INFO". + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. 
+ name (str): the root module name of this logger + Returns: + logging.Logger: a logger + """ + def time_zone(sec, fmt): + real_time = datetime.datetime.now() + return real_time.timetuple() + logging.Formatter.converter = time_zone + + logger = logging.getLogger(name) + if level == "INFO": + logger.setLevel(logging.INFO) + elif level=="DEBUG": + logger.setLevel(logging.DEBUG) + logger.propagate = False + + if level == "DEBUG": + plain_formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + else: + plain_formatter = logging.Formatter( + "[%(asctime)s] %(message)s", + datefmt="%m/%d %H:%M:%S") + # stdout logging: master only + local_rank = ParallelEnv().local_rank + if local_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = plain_formatter + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, ".log.txt") + if local_rank > 0: + filename = filename + ".rank{}".format(local_rank) + + # PathManager.mkdirs(os.path.dirname(filename)) + os.makedirs(os.path.dirname(filename), exist_ok=True) + + # fh = logging.StreamHandler(_cached_log_stream(filename) + fh = logging.FileHandler(filename, mode='a') + fh.setLevel(logging.DEBUG) + fh.setFormatter(plain_formatter) + logger.addHandler(fh) + logger_initialized.append(name) + return logger + + +def get_logger(name, output=None): + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + return setup_logger(name=name, output=name) diff --git a/docs/src/paddlevideo/utils/multigrid/__init__.py b/docs/src/paddlevideo/utils/multigrid/__init__.py new file mode 100644 index 000000000..10295b59b --- /dev/null +++ b/docs/src/paddlevideo/utils/multigrid/__init__.py @@ -0,0 +1,10 @@ +from .multigrid import MultigridSchedule +from .batchnorm_helper import get_norm, aggregate_sub_bn_stats +from .short_sampler import DistributedShortSampler +from .save_load_helper import subn_save, subn_load +from .interval_helper import is_eval_epoch + +__all__ = [ + 'MultigridSchedule', 'get_norm', 'aggregate_sub_bn_stats', + 'DistributedShortSampler', 'subn_save', 'subn_load', 'is_eval_epoch' +] diff --git a/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py b/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py new file mode 100644 index 000000000..e39b067d8 --- /dev/null +++ b/docs/src/paddlevideo/utils/multigrid/batchnorm_helper.py @@ -0,0 +1,142 @@ +from functools import partial +import paddle + + +def get_norm(bn_norm_type, bn_num_splits): + """ + Args: + cfg (CfgNode): model building configs, details are in the comments of + the config file. + Returns: + nn.Layer: the normalization layer. + """ + if bn_norm_type == "batchnorm": + return paddle.nn.BatchNorm3D + elif bn_norm_type == "sub_batchnorm": + return partial(SubBatchNorm3D, num_splits=bn_num_splits) + else: + raise NotImplementedError( + "Norm type {} is not supported".format(bn_norm_type)) + + +def aggregate_sub_bn_stats(model): + """ + Recursively find all SubBN modules and aggregate sub-BN stats. + Args: + model (nn.Layer): model to be aggregate sub-BN stats + Returns: + count (int): number of SubBN module found. 
+ """ + count = 0 + for child in model.children(): + if isinstance(child, SubBatchNorm3D): + child.aggregate_stats() + count += 1 + else: + count += aggregate_sub_bn_stats(child) + return count + + +class SubBatchNorm3D(paddle.nn.Layer): + """ + Implement based on paddle2.0. + The standard BN layer computes stats across all examples in a GPU. In some + cases it is desirable to compute stats across only a subset of examples + SubBatchNorm3D splits the batch dimension into N splits, and run BN on + each of them separately (so that the stats are computed on each subset of + examples (1/N of batch) independently. During evaluation, it aggregates + the stats from all splits into one BN. + """ + def __init__(self, num_splits, **args): + """ + Args: + num_splits (int): number of splits. + args (list): list of args + """ + super(SubBatchNorm3D, self).__init__() + self.num_splits = num_splits + self.num_features = args["num_features"] + self.weight_attr = args["weight_attr"] + self.bias_attr = args["bias_attr"] + + # Keep only one set of weight and bias (outside). + if self.weight_attr == False: + self.weight = self.create_parameter( + attr=None, + shape=[self.num_features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self.weight_attr, + shape=[self.num_features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.weight.stop_gradient = self.weight_attr is not None \ + and self.weight_attr.learning_rate == 0. + + if self.bias_attr == False: + self.bias = self.create_parameter(attr=None, + shape=[self.num_features], + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter(attr=self.bias_attr, + shape=[self.num_features], + is_bias=True) + self.bias.stop_gradient = self.bias_attr is not None \ + and self.bias_attr.learning_rate == 0. + + # set weights and bias fixed (inner). + args["weight_attr"] = False + args["bias_attr"] = False + self.bn = paddle.nn.BatchNorm3D(**args) + # update number of features used in split_bn + args["num_features"] = self.num_features * self.num_splits + self.split_bn = paddle.nn.BatchNorm3D(**args) + + def _get_aggregated_mean_std(self, means, stds, n): + """ + Calculate the aggregated mean and stds. + Use the method of update mean and std when merge multi-part data. + Args: + means (tensor): mean values. + stds (tensor): standard deviations. + n (int): number of sets of means and stds. + """ + mean = paddle.sum(paddle.reshape(means, (n, -1)), axis=0) / n + std = (paddle.sum(paddle.reshape(stds, (n, -1)), axis=0) / n + + paddle.sum(paddle.reshape( + paddle.pow((paddle.reshape(means, (n, -1)) - mean), 2), + (n, -1)), + axis=0) / n) + return mean, std + + def aggregate_stats(self): + """ + Synchronize running_mean, and running_var to self.bn. + Call this before eval, then call model.eval(); + When eval, forward function will call self.bn instead of self.split_bn, + During this time the running_mean, and running_var of self.bn has been obtained from + self.split_bn. 
+ """ + if self.split_bn.training: + bn_mean_tensor, bn_variance_tensor = self._get_aggregated_mean_std( + self.split_bn._mean, + self.split_bn._variance, + self.num_splits, + ) + self.bn._mean.set_value(bn_mean_tensor) + self.bn._variance.set_value(bn_variance_tensor) + + def forward(self, x): + if self.training: + n, c, t, h, w = x.shape + x = paddle.reshape( + x, (n // self.num_splits, c * self.num_splits, t, h, w)) + x = self.split_bn(x) + x = paddle.reshape(x, (n, c, t, h, w)) + else: + x = self.bn(x) + x = paddle.multiply(x, paddle.reshape(self.weight, (-1, 1, 1, 1))) + x = paddle.add(x, paddle.reshape(self.bias, (-1, 1, 1, 1))) + return x diff --git a/docs/src/paddlevideo/utils/multigrid/interval_helper.py b/docs/src/paddlevideo/utils/multigrid/interval_helper.py new file mode 100644 index 000000000..2df4bc702 --- /dev/null +++ b/docs/src/paddlevideo/utils/multigrid/interval_helper.py @@ -0,0 +1,19 @@ +def is_eval_epoch(cfg, cur_epoch, total_epochs, multigrid_schedule): + """ + Determine if the model should be evaluated at the current epoch. + Args: + cfg (CfgNode): configs. Details can be found in + slowfast/config/defaults.py + cur_epoch (int): current epoch. + multigrid_schedule (List): schedule for multigrid training. + """ + if cur_epoch + 1 == total_epochs: + return True + if multigrid_schedule is not None: + prev_epoch = 0 + for s in multigrid_schedule: + if cur_epoch < s[-1]: + period = max( + (s[-1] - prev_epoch) // cfg.MULTIGRID.EVAL_FREQ + 1, 1) + return (s[-1] - 1 - cur_epoch) % period == 0 + prev_epoch = s[-1] diff --git a/docs/src/paddlevideo/utils/multigrid/multigrid.py b/docs/src/paddlevideo/utils/multigrid/multigrid.py new file mode 100644 index 000000000..a296a0608 --- /dev/null +++ b/docs/src/paddlevideo/utils/multigrid/multigrid.py @@ -0,0 +1,233 @@ +"""Functions for multigrid training.""" + +import numpy as np + + +class MultigridSchedule(object): + """ + This class defines multigrid training schedule and update cfg accordingly. + """ + def init_multigrid(self, cfg): + """ + Update cfg based on multigrid settings. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + Returns: + cfg (configs): the updated cfg. + """ + self.schedule = None + # We may modify cfg.DATASET.batch_size, cfg.PIPELINE.train.decode_sampler.num_frames, and + # cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] during training, so we store their original + # value in cfg and use them as global variables. + cfg.MULTIGRID.default_batch_size = cfg.DATASET.batch_size # total bs,64 + cfg.MULTIGRID.default_temporal_size = cfg.PIPELINE.train.decode_sampler.num_frames # 32 + cfg.MULTIGRID.default_crop_size = cfg.PIPELINE.train.transform[1][ + 'MultiCrop']['target_size'] # 224 + + if cfg.MULTIGRID.LONG_CYCLE: + self.schedule = self.get_long_cycle_schedule(cfg) + cfg.OPTIMIZER.learning_rate.steps = [0] + [ + s[-1] for s in self.schedule + ] + # Fine-tuning phase. + cfg.OPTIMIZER.learning_rate.steps[-1] = ( + cfg.OPTIMIZER.learning_rate.steps[-2] + + cfg.OPTIMIZER.learning_rate.steps[-1]) // 2 + cfg.OPTIMIZER.learning_rate.lrs = [ + cfg.OPTIMIZER.learning_rate.gamma**s[0] * s[1][0] + for s in self.schedule + ] + # Fine-tuning phase. 
+ cfg.OPTIMIZER.learning_rate.lrs = cfg.OPTIMIZER.learning_rate.lrs[:-1] + [ + cfg.OPTIMIZER.learning_rate.lrs[-2], + cfg.OPTIMIZER.learning_rate.lrs[-1], + ] + + cfg.OPTIMIZER.learning_rate.max_epoch = self.schedule[-1][-1] + + elif cfg.MULTIGRID.SHORT_CYCLE: + cfg.OPTIMIZER.learning_rate.steps = [ + int(s * cfg.MULTIGRID.epoch_factor) + for s in cfg.OPTIMIZER.learning_rate.steps + ] + cfg.OPTIMIZER.learning_rate.max_epoch = int( + cfg.OPTIMIZER.learning_rate.max_epoch * + cfg.OPTIMIZER.learning_rate.max_epoch) + return cfg + + def update_long_cycle(self, cfg, cur_epoch): + """ + Before every epoch, check if long cycle shape should change. If it + should, update cfg accordingly. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + cur_epoch (int): current epoch index. + Returns: + cfg (configs): the updated cfg. + changed (bool): whether to change long cycle shape at this epoch + """ + base_b, base_t, base_s = get_current_long_cycle_shape( + self.schedule, cur_epoch) + if base_s != cfg.PIPELINE.train.transform[1]['MultiCrop'][ + 'target_size'] or base_t != cfg.PIPELINE.train.decode_sampler.num_frames: + #NOTE Modify + # no need to modify, used by pool_size in head, None when multigrid + # cfg.MODEL.head.num_frames = base_t + # cfg.MODEL.head.crop_size = base_s + cfg.PIPELINE.train.decode_sampler.num_frames = base_t + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] = base_s + cfg.DATASET.batch_size = base_b * cfg.MULTIGRID.default_batch_size #change bs + + bs_factor = (float(cfg.DATASET.batch_size) / + cfg.MULTIGRID.bn_base_size) + + if bs_factor == 1: #single bs == bn_base_size (== 8) + cfg.MODEL.backbone.bn_norm_type = "batchnorm" + else: + cfg.MODEL.backbone.bn_norm_type = "sub_batchnorm" + cfg.MODEL.backbone.bn_num_splits = int(bs_factor) + + cfg.MULTIGRID.long_cycle_sampling_rate = cfg.PIPELINE.train.decode_sampler.sampling_rate * ( + cfg.MULTIGRID.default_temporal_size // base_t) + print("Long cycle updates:") + print("\tbn_norm_type: {}".format(cfg.MODEL.backbone.bn_norm_type)) + if cfg.MODEL.backbone.bn_norm_type == "sub_batchnorm": + print("\tbn_num_splits: {}".format( + cfg.MODEL.backbone.bn_num_splits)) + print("\tTRAIN.batch_size[single card]: {}".format( + cfg.DATASET.batch_size)) + print("\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( + cfg.PIPELINE.train.decode_sampler.num_frames, + cfg.MULTIGRID.long_cycle_sampling_rate)) + print("\tDATA.train_crop_size: {}".format( + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'])) + return cfg, True + else: + return cfg, False + + def get_long_cycle_schedule(self, cfg): + """ + Based on multigrid hyperparameters, define the schedule of a long cycle. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + Returns: + schedule (list): Specifies a list long cycle base shapes and their + corresponding training epochs. + """ + + steps = cfg.OPTIMIZER.learning_rate.steps + + default_size = float( + cfg.PIPELINE.train.decode_sampler.num_frames * + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']** + 2) # 32 * 224 * 224 C*H*W + default_iters = steps[-1] # 196 + + # Get shapes and average batch size for each long cycle shape. 
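+        # Each long-cycle factor pair (t_factor, s_factor) below is turned into a
+        # base shape (B, T, S): T and S are the scaled number of frames and crop
+        # size, and B is a batch-size multiplier computed as
+        # default_size / (T * S * S), so that B * T * S^2 stays close to the
+        # default per-iteration cost. When SHORT_CYCLE is on, two extra spatial
+        # shapes (scaled by short_cycle_factors) are appended before the base
+        # long-cycle shape, and avg_bs records the mean batch multiplier per group.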
+ avg_bs = [] + all_shapes = [] + # for t_factor, s_factor in cfg.MULTIGRID.long_cycle_factors: + for item in cfg.MULTIGRID.long_cycle_factors: + t_factor, s_factor = item["value"] + base_t = int( + round(cfg.PIPELINE.train.decode_sampler.num_frames * t_factor)) + base_s = int( + round( + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] + * s_factor)) + if cfg.MULTIGRID.SHORT_CYCLE: + shapes = [ + [ + base_t, + cfg.MULTIGRID.default_crop_size * + cfg.MULTIGRID.short_cycle_factors[0], + ], + [ + base_t, + cfg.MULTIGRID.default_crop_size * + cfg.MULTIGRID.short_cycle_factors[1], + ], + [base_t, base_s], + ] #first two is short_cycle, last is the base long_cycle + else: + shapes = [[base_t, base_s]] + + # (T, S) -> (B, T, S) + shapes = [[ + int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1] + ] for s in shapes] + avg_bs.append(np.mean([s[0] for s in shapes])) + all_shapes.append(shapes) + + # Get schedule regardless of cfg.MULTIGRID.epoch_factor. + total_iters = 0 + schedule = [] + for step_index in range(len(steps) - 1): + step_epochs = steps[step_index + 1] - steps[step_index] + + for long_cycle_index, shapes in enumerate(all_shapes): + #ensure each of 4 sequences run the same num of iters + cur_epochs = (step_epochs * avg_bs[long_cycle_index] / + sum(avg_bs)) + + # get cur_iters from cur_epochs + cur_iters = cur_epochs / avg_bs[long_cycle_index] + total_iters += cur_iters + schedule.append((step_index, shapes[-1], cur_epochs)) + + iter_saving = default_iters / total_iters # ratio between default iters and real iters + + final_step_epochs = cfg.OPTIMIZER.learning_rate.max_epoch - steps[-1] + + # We define the fine-tuning phase to have the same amount of iteration + # saving as the rest of the training. + #final_step_epochs / iter_saving make fine-tune having the same iters as training + ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] + + # schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) + schedule.append((step_index + 1, all_shapes[-1][-1], ft_epochs)) + + # Obtrain final schedule given desired cfg.MULTIGRID.epoch_factor. + x = (cfg.OPTIMIZER.learning_rate.max_epoch * + cfg.MULTIGRID.epoch_factor / sum(s[-1] for s in schedule)) + + final_schedule = [] + total_epochs = 0 + for s in schedule: + epochs = s[2] * x + total_epochs += epochs + final_schedule.append((s[0], s[1], int(round(total_epochs)))) + print_schedule(final_schedule) + return final_schedule + + +def print_schedule(schedule): + """ + Log schedule. + """ + print( + "Long_cycle_index\tBase_shape(bs_factor,temporal_size,crop_size)\tEpochs" + ) + for s in schedule: + print("{}\t\t\t{}\t\t\t\t\t{}".format(s[0], s[1], s[2])) + + +def get_current_long_cycle_shape(schedule, epoch): + """ + Given a schedule and epoch index, return the long cycle base shape. + Args: + schedule (configs): configs that contains training and multigrid specific + hyperparameters. + cur_epoch (int): current epoch index. + Returns: + shapes (list): A list describing the base shape in a long cycle: + [batch size relative to default, + number of frames, spatial dimension]. 
+ """ + for s in schedule: + if epoch < s[-1]: + return s[1] + return schedule[-1][1] diff --git a/docs/src/paddlevideo/utils/multigrid/save_load_helper.py b/docs/src/paddlevideo/utils/multigrid/save_load_helper.py new file mode 100644 index 000000000..94a52d58b --- /dev/null +++ b/docs/src/paddlevideo/utils/multigrid/save_load_helper.py @@ -0,0 +1,237 @@ +import os +import numpy as np +import paddle +import copy + + +def sub_to_normal_bn(sd): + """ + When save, Convert the Sub-BN paprameters to normal BN parameters in a state dict. + There are two copies of BN layers in a Sub-BN implementation: `bn.bn` and + `bn.split_bn`. `bn.split_bn` is used during training and + "compute_precise_bn". Before saving or evaluation, its stats are copied to + `bn.bn`. We rename `bn.bn` to `bn` and store it to be consistent with normal + BN layers. + Args: + sd (OrderedDict): a dict of parameters which might contain Sub-BN + parameters. + Returns: + new_sd (OrderedDict): a dict with Sub-BN parameters reshaped to + normal parameters. + """ + modifications = [ + ("bn.bn._mean", "bn._mean"), + ("bn.bn._variance", "bn._variance"), + ] + to_remove = ["bn.bn.", ".split_bn."] + key_list = list(sd.keys()) #odict_keys to list + for key in key_list: + for before, after in modifications: + if key.endswith(before): + new_key = key.split(before)[0] + after + sd[new_key] = sd.pop(key) + + for rm in to_remove: + if rm in key and key in sd: + del sd[key] + + +def normal_to_sub_bn(checkpoint_sd, model_sd): + """ + When load, Convert BN parameters to Sub-BN parameters if model contains Sub-BNs. + Args: + checkpoint_sd (OrderedDict): source dict of parameters. + model_sd (OrderedDict): target dict of parameters. + Returns: + new_sd (OrderedDict): converted dict of parameters. + """ + for key in model_sd: + if key not in checkpoint_sd: + # not to replace bn.weight and bn.bias + if "bn.split_bn." in key and "bn.weight" not in key and "bn.bias" not in key: + load_key = key.replace("bn.split_bn.", "bn.") + bn_key = key.replace("bn.split_bn.", "bn.bn.") + checkpoint_sd[key] = checkpoint_sd.pop(load_key) + checkpoint_sd[bn_key] = checkpoint_sd[key] + + # match the shape of bn.split_bn._xx + # model_sd: split_bn.rm.shape = num_feature*num_split + # checkpoint_sd: split_bn.rm.shape = bn.rm.shape = num_feature + for key in model_sd: + if key in checkpoint_sd: + model_blob_shape = model_sd[key].shape #bn.split_bn + c2_blob_shape = checkpoint_sd[key].shape #bn.bn + + if (len(model_blob_shape) == 1 and len(c2_blob_shape) == 1 + and model_blob_shape[0] > c2_blob_shape[0] + and model_blob_shape[0] % c2_blob_shape[0] == 0): + before_shape = checkpoint_sd[key].shape + checkpoint_sd[key] = np.concatenate( + [checkpoint_sd[key]] * + (model_blob_shape[0] // c2_blob_shape[0])) + if 'split_bn' not in key: #split_bn is excepted + print("{} {} -> {}".format(key, before_shape, + checkpoint_sd[key].shape)) + return checkpoint_sd + + +def mapping_opt_dict(opt_dict, model_key_list): + """ + Paddle Name schedule: conv_1.w -> conv_2.w + Sometimes: sub_bn -> bn + when re-build model, we desire the parameter name to be coincident, + but the parameters name index will be added, as conv_1 to conv_2, not conv_1. + It will raise error if we set old saved parameters to new created optimizer. + as conv_2 cannot find in state_dict(only conv_1). + Args: + opt_dict: optimizer state dict, including the name and value of parameters gradient. + model_key_list: the parameters name list of re-build model. 
+ Return: optimizer state dict with modified keys + """ + def get_name_info(PNAME, PN_key_list, key_list): + min_index = float('inf') + max_index = 0 + for name in PN_key_list[1:]: + for key in key_list: + if name in key: + index = int(key.split('.')[0].split(name)[-1]) + if index < min_index: + min_index = index + if index > max_index: + max_index = index + num_name = max_index - min_index + 1 + PNAME[name].append((min_index, max_index, num_name)) + min_index = float('inf') + max_index = 0 + + PNAME = { + "LR_Scheduler": [], + "conv3d_": [], + "linear_": [], + "sub_batch_norm3d_": [], + "batch_norm3d_": [], + } + + pd_key_list = list(opt_dict.keys()) + print("The number of parameters in saved optimizer state dict = {}".format( + len(pd_key_list))) + print("The number of parameters in re-build model list = {}".format( + len(model_key_list))) + # 1 may be LR_Scheduler + PN_key_list = list(PNAME.keys()) + + # get the number of each PNAME + get_name_info(PNAME, PN_key_list, pd_key_list) + get_name_info(PNAME, PN_key_list, model_key_list) + print("[Parameters info] prefix: min_index, max_index, number_params: \n", + PNAME) + + # whether to change name of bn layer + change_name = False + if PNAME["sub_batch_norm3d_"][0][-1] == -float('inf'): + PN_key_list.remove("sub_batch_norm3d_") + if PNAME["sub_batch_norm3d_"][1][-1] != -float('inf'): + print( + "Optimizer state dict saved bn, but Re-build model use sub_bn, changed name!" + ) + change_name = True + else: + print("Optimizer state dict saved bn, and Re-build model use bn") + else: + PN_key_list.remove("batch_norm3d_") + if PNAME["sub_batch_norm3d_"][1][-1] == -float('inf'): + print( + "Optimizer state dict saved sub_bn, but Re-build model use bn, changed name!" + ) + change_name = True + else: + print( + "Optimizer state dict saved sub_bn, Re-build model use sub_bn") + + #update key name + # sub_bn -> bn name mapping, pre-define dict + change_dict = { + "sub_batch_norm3d_": "batch_norm3d_", + "batch_norm3d_": "sub_batch_norm3d_" + } + for key in pd_key_list: + for name in PN_key_list[1:]: + if key.startswith(name): + start = change_dict[name] if ( + change_name and "batch_norm" in name) else name + str_index = key.split('.')[0].split(name)[-1] + index = int(str_index) + new_index = str(index + + (PNAME[start][1][0] - PNAME[name][0][0])) + end = key.split('.')[-1] + update_key = start + new_index + '.' + end + opt_dict[update_key] = opt_dict.pop(key) + + return opt_dict + + +def subn_save(save_dir, name_prefix, epoch, video_model, optimizer): + if not os.path.isdir(save_dir): + os.makedirs(save_dir) + model_path = os.path.join(save_dir, name_prefix + "{:05d}".format(epoch)) + model_dict = video_model.state_dict() + sub_to_normal_bn(model_dict) + opti_dict = optimizer.state_dict() + paddle.save(model_dict, model_path + '.pdparams') + paddle.save(opti_dict, model_path + '.pdopt') + print('[Saved Epoch {} parameters and optimizer state ]'.format(epoch)) + + +def subn_load(model, ck_path, optimizer=None): + """ + Load the checkpoint from the given file. + Args: + model (model): model to load the weights from the checkpoint. + optimizer (optim, optional): optimizer to load the historical state. + ck_path (str): checkpoint path + Returns: + (int): the number of training epoch of the checkpoint. 
+ """ + + assert os.path.exists(ck_path + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(ck_path) + print("load checkpint from {}.pdparams".format(ck_path)) + + model_dict = model.state_dict() + checkpoint_dict = paddle.load(ck_path + ".pdparams") + # checkpoint_dict = copy.deepcopy(checkpoint_dict_orig) #not modify when multi card + pre_train_dict = normal_to_sub_bn(checkpoint_dict, model_dict) + + # Match pre-trained weights that have same shape as current model. + pre_train_dict_match = { + k: v + for k, v in pre_train_dict.items() + if k in model_dict and tuple(v.shape) == tuple(model_dict[k].shape) + } + + # Weights that do not have match from the pre-trained model. + not_load_layers = [ + k for k in model_dict.keys() if k not in pre_train_dict_match.keys() + ] + # Log weights that are not loaded with the pre-trained weights. + if not_load_layers: + for k in not_load_layers: + if 'bn.weight' not in k and 'bn.bias' not in k: + print("Network weights {} not loaded.".format(k)) + + # Load pre-trained weights. + model.set_state_dict(pre_train_dict_match) + + if optimizer: + assert os.path.exists(ck_path + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(ck_path) + print("load checkpint from {}.pdopt".format(ck_path)) + opt_dict = paddle.load(ck_path + ".pdopt") + # get parameters that required gradient from re-build model + model_key_list = [] + for param in model.parameters(): + if param.stop_gradient == False: + model_key_list.append(param.name) + + new_opt_dict = mapping_opt_dict(opt_dict, model_key_list) + optimizer.set_state_dict(new_opt_dict) diff --git a/docs/src/paddlevideo/utils/multigrid/short_sampler.py b/docs/src/paddlevideo/utils/multigrid/short_sampler.py new file mode 100644 index 000000000..c4dd1bbed --- /dev/null +++ b/docs/src/paddlevideo/utils/multigrid/short_sampler.py @@ -0,0 +1,146 @@ +from __future__ import print_function +from __future__ import division + +import numpy as np +import math +import paddle + + +__all__ = ["DistributedShortSampler"] + + +class DistributedShortSampler(paddle.io.BatchSampler): + """Sampler that restricts data loading to a subset of the dataset. + In such case, each process can pass a DistributedBatchSampler instance + as a DataLoader sampler, and load a subset of the original dataset that + is exclusive to it. + .. note:: + Batch size is dynamic changed following short cycle schedule. + + Args: + dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement + or other python object which implemented + `__len__` for BatchSampler to get sample + number of data source. + batch_sizes(list): batch size list of one cycle. + num_replicas(int, optional): porcess number in distributed training. + If :attr:`num_replicas` is None, :attr:`num_replicas` will be + retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. + Default None. + rank(int, optional): the rank of the current process among :attr:`num_replicas` + processes. If :attr:`rank` is None, :attr:`rank` is retrieved from + :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None. + shuffle(bool): whther to shuffle indices order before genrating + batch indices. Default False. + drop_last(bool): whether drop the last incomplete batch dataset size + is not divisible by the batch size. 
Default False + """ + def __init__(self, + dataset, + batch_sizes, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False): + self.dataset = dataset + + assert any(isinstance(batch_size, int) and batch_size > 0 for batch_size in batch_sizes), \ + "batch_size should be a positive integer" + self.batch_sizes = batch_sizes + self.len_batch_sizes = len(self.batch_sizes) + assert isinstance(shuffle, bool), \ + "shuffle should be a boolean value" + self.shuffle = shuffle + assert isinstance(drop_last, bool), \ + "drop_last should be a boolean number" + + + if num_replicas is not None: + assert isinstance(num_replicas, int) and num_replicas > 0, \ + "num_replicas should be a positive integer" + self.nranks = num_replicas + else: + self.nranks = paddle.distributed.ParallelEnv().nranks + + if rank is not None: + assert isinstance(rank, int) and rank >= 0, \ + "rank should be a non-negative integer" + self.local_rank = rank + else: + self.local_rank = paddle.distributed.ParallelEnv().local_rank + + self.drop_last = drop_last + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks)) + self.total_size = self.num_samples * self.nranks + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - + len(indices))] #completion last iter + assert len(indices) == self.total_size + if self.shuffle: + np.random.RandomState(self.epoch).shuffle(indices) + self.epoch += 1 + + # subsample + def _get_indices_by_batch_size(indices): + total_batch_size = sum(self.batch_sizes) + subsampled_indices = [] + last_batch_size = self.total_size % ( + total_batch_size * self.nranks) #number samples of last batch + assert last_batch_size % self.nranks == 0 + last_local_batch_size = last_batch_size // self.nranks + + for i in range(self.local_rank * total_batch_size, + len(indices) - last_batch_size, + total_batch_size * self.nranks): + subsampled_indices.extend(indices[i:i + total_batch_size]) + + indices = indices[len(indices) - last_batch_size:] + subsampled_indices.extend( + indices[self.local_rank * + last_local_batch_size:(self.local_rank + 1) * + last_local_batch_size]) + return subsampled_indices + + if self.nranks > 1: + indices = _get_indices_by_batch_size(indices) + + assert len(indices) == self.num_samples #index length in each card + _sample_iter = iter(indices) + + batch_indices = [] + counter = 0 + batch_size = self.batch_sizes[0] + for idx in _sample_iter: + batch_indices.append( + (idx, counter % + self.len_batch_sizes)) #to be used in dataloader get_item + if len(batch_indices) == batch_size: + yield batch_indices + counter += 1 + batch_size = self.batch_sizes[counter % self.len_batch_sizes] + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + avg_batch_size = sum(self.batch_sizes) / float(self.len_batch_sizes) + if self.drop_last: + return int(np.floor(self.num_samples / avg_batch_size)) + else: + return int(np.ceil(self.num_samples / avg_batch_size)) + + def set_epoch(self, epoch): + """ + Sets the epoch number. When :attr:`shuffle=True`, this number is used + as seeds of random numbers. By default, users may not set this, all + replicas (workers) use a different random ordering for each epoch. + If set same number at each epoch, this sampler will yield the same + ordering at all epoches. + Arguments: + epoch (int): Epoch number. 
+ """ + self.epoch = epoch diff --git a/docs/src/paddlevideo/utils/precise_bn.py b/docs/src/paddlevideo/utils/precise_bn.py new file mode 100644 index 000000000..c9fdd4047 --- /dev/null +++ b/docs/src/paddlevideo/utils/precise_bn.py @@ -0,0 +1,94 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import itertools + +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") +""" +Implement precise bn, which is useful for improving accuracy. +""" + + +@paddle.no_grad() # speed up and save CUDA memory +def do_preciseBN(model, + data_loader, + parallel, + num_iters=200, + use_amp=False, + amp_level=None): + """ + Recompute and update the batch norm stats to make them more precise. During + training both BN stats and the weight are changing after every iteration, so + the running average can not precisely reflect the actual stats of the + current model. + In this function, the BN stats are recomputed with fixed weights, to make + the running average more precise. Specifically, it computes the true average + of per-batch mean/variance instead of the running average. + This is useful to improve validation accuracy. + Args: + model: the model whose bn stats will be recomputed + data_loader: an iterator. Produce data as input to the model + num_iters: number of iterations to compute the stats. + Return: + the model with precise mean and variance in bn layers. + """ + bn_layers_list = [ + m for m in model.sublayers() + if any((isinstance(m, bn_type) + for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D, + paddle.nn.BatchNorm3D))) and m.training + ] + if len(bn_layers_list) == 0: + return + + # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum) + # we set momentum=0. to get the true mean and variance during forward + momentum_actual = [bn._momentum for bn in bn_layers_list] + for bn in bn_layers_list: + bn._momentum = 0. + + running_mean = [paddle.zeros_like(bn._mean) + for bn in bn_layers_list] # pre-ignore + running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list] + + ind = -1 + for ind, data in enumerate(itertools.islice(data_loader, num_iters)): + logger.info("Computing precise BN {} / {}...".format( + ind + 1, num_iters)) + + if use_amp: + with paddle.amp.auto_cast( + custom_black_list={"reduce_mean", + "conv3d"}, level=amp_level): + model(data, mode='train') + else: + model(data, mode='train') + + for i, bn in enumerate(bn_layers_list): + # Accumulates the bn stats. + running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1) + running_var[i] += (bn._variance - running_var[i]) / (ind + 1) + + assert ind == num_iters - 1, ( + "update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations." + .format(num_iters, ind)) + + # Sets the precise bn stats. 
+ for i, bn in enumerate(bn_layers_list): + bn._mean.set_value(running_mean[i]) + bn._variance.set_value(running_var[i]) + bn._momentum = momentum_actual[i] diff --git a/docs/src/paddlevideo/utils/profiler.py b/docs/src/paddlevideo/utils/profiler.py new file mode 100644 index 000000000..629ef4ef0 --- /dev/null +++ b/docs/src/paddlevideo/utils/profiler.py @@ -0,0 +1,128 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle.profiler as profiler + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None +_prof = None + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True, + 'timer_only': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + elif key == 'timer_only': + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. 
+ One call of this function is treated as a profiler step. + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _prof + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan + # timer_only = True only the model's throughput and time overhead are displayed + # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. + # timer_only = False the output Timeline information can be found in the profiler_log directory + if _prof is None: + _timer_only = str(_profiler_options['timer_only']) == str(True) + _prof = profiler.Profiler( + scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), + on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), + timer_only = _timer_only) + _prof.start() + else: + _prof.step() + + if _profiler_step_id == _profiler_options['batch_range'][1]: + _prof.stop() + _prof.summary( + op_detail=True, + thread_sep=False, + time_unit='ms') + _prof = None + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/docs/src/paddlevideo/utils/record.py b/docs/src/paddlevideo/utils/record.py new file mode 100644 index 000000000..db8717c45 --- /dev/null +++ b/docs/src/paddlevideo/utils/record.py @@ -0,0 +1,168 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
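+# Training metrics helpers used by the trainers above: build_record() assembles an
+# OrderedDict of AverageMeter objects for the metrics of each framework, and
+# log_batch() / log_epoch() format and colorize them for the paddlevideo logger.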
+ +import datetime +from collections import OrderedDict + +import paddle + +from .logger import coloring, get_logger + +logger = get_logger("paddlevideo") + +__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch'] + + +def build_record(cfg): + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("lr", AverageMeter('lr', 'f', need_avg=False)), + ] + if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework + record_list.append(("hit_at_one", AverageMeter("hit_at_one", '.5f'))) + record_list.append(("perr", AverageMeter("perr", '.5f'))) + record_list.append(("gap", AverageMeter("gap", '.5f'))) + elif 'Recognizer' in cfg.framework: + record_list.append(("top1", AverageMeter("top1", '.5f'))) + record_list.append(("top5", AverageMeter("top5", '.5f'))) + elif 'FastRCNN' in cfg.framework: + record_list.append( + ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f'))) + record_list.append(("prec@thr=0.5", AverageMeter("prec@thr=0.5", + '.5f'))) + record_list.append(("recall@top3", AverageMeter("recall@top3", '.5f'))) + record_list.append(("prec@top3", AverageMeter("prec@top3", '.5f'))) + record_list.append(("recall@top5", AverageMeter("recall@top5", '.5f'))) + record_list.append(("prec@top5", AverageMeter("prec@top5", '.5f'))) + record_list.append(("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f'))) + elif 'DepthEstimator' in cfg.framework: + record_list.append(("abs_rel", AverageMeter("abs_rel", '.5f'))) + record_list.append(("sq_rel", AverageMeter("sq_rel", '.5f'))) + record_list.append(("rmse", AverageMeter("rmse", '.5f'))) + record_list.append(("rmse_log", AverageMeter("rmse_log", '.5f'))) + record_list.append(("a1", AverageMeter("a1", '.5f'))) + record_list.append(("a2", AverageMeter("a2", '.5f'))) + record_list.append(("a3", AverageMeter("a3", '.5f'))) + record_list.append(("losses_day", AverageMeter("losses_day", '.5f'))) + record_list.append(("losses_night", AverageMeter("losses_night", + '.5f'))) + elif 'MSTCN' in cfg.framework or 'ASRF' in cfg.framework: + record_list.append(("F1@0.50", AverageMeter("F1@0.50", '.5f'))) + + elif 'YOWOLocalizer' in cfg.framework: + record_list.append(("nCorrect", AverageMeter('nCorrect', '.1f'))) + record_list.append(("fscore", AverageMeter("fscore", '.5f'))) + + record_list.append(("batch_time", AverageMeter('batch_cost', '.5f'))) + record_list.append(("reader_time", AverageMeter('reader_cost', '.5f'))) + record_list = OrderedDict(record_list) + return record_list + + +class AverageMeter(object): + """ + Computes and stores the average and current value + """ + def __init__(self, name='', fmt='f', need_avg=True): + self.name = name + self.fmt = fmt + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + if isinstance(val, paddle.Tensor): + val = float(val) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self) + + @property + def total_minute(self): + return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60, + self=self) + + @property + def mean(self): + return '{self.name}_avg: {self.avg:{self.fmt}}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}'.format(self=self) + + +def log_batch(metric_list, + batch_id, + epoch_id, + total_epoch, + mode, + ips, + 
eta_sec: int = None): + batch_cost = str(metric_list['batch_time'].value) + ' sec,' + reader_cost = str(metric_list['reader_time'].value) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + metric_values.append(metric_list[m].value) + metric_str = ' '.join([str(v) for v in metric_values]) + epoch_str = "epoch:[{:>3d}/{:<3d}]".format(epoch_id, total_epoch) + step_str = "{:s} step:{:<4d}".format(mode, batch_id) + if eta_sec is not None: + eta_str = "eta: {:s}".format( + str(datetime.timedelta(seconds=int(eta_sec)))) + else: + eta_str = '' + max_mem_reserved_str = "" + max_mem_allocated_str = "" + if paddle.device.is_compiled_with_cuda(): + max_mem_reserved_str = f"max_mem_reserved: {format(paddle.device.cuda.max_memory_reserved() / (1024 ** 2), '.2f')} MB" + max_mem_allocated_str = f"max_mem_allocated: {format(paddle.device.cuda.max_memory_allocated() / (1024 ** 2), '.2f')} MB" + logger.info("{:s} {:s} {:s} {:s} {:s} {} {:s}, {} {}".format( + coloring(epoch_str, "HEADER") if batch_id == 0 else epoch_str, + coloring(step_str, "PURPLE"), coloring(metric_str, 'OKGREEN'), + coloring(batch_cost, "OKGREEN"), coloring(reader_cost, 'OKGREEN'), ips, + eta_str, max_mem_reserved_str, max_mem_allocated_str)) + + +def log_epoch(metric_list, epoch, mode, ips): + batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,' + reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,' + batch_sum = str(metric_list['batch_time'].total) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + metric_values.append(metric_list[m].mean) + metric_str = ' '.join([str(v) for v in metric_values]) + + end_epoch_str = "END epoch:{:<3d}".format(epoch) + + logger.info("{:s} {:s} {:s} {:s} {:s} {:s} {}".format( + coloring(end_epoch_str, "RED"), coloring(mode, "PURPLE"), + coloring(metric_str, "OKGREEN"), coloring(batch_cost, "OKGREEN"), + coloring(reader_cost, "OKGREEN"), coloring(batch_sum, "OKGREEN"), ips)) diff --git a/docs/src/paddlevideo/utils/registry.py b/docs/src/paddlevideo/utils/registry.py new file mode 100644 index 000000000..81b76bd51 --- /dev/null +++ b/docs/src/paddlevideo/utils/registry.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Registry(object): + """ + The registry that provides name -> object mapping, to support third-party users' custom modules. + + To register an object: + + .. code-block:: python + + BACKBONES = Registry('backbone') + @BACKBONES.register() + class ResNet: + pass + Or: + .. code-block:: python + + BACKBONES = Registry('backbone') + class ResNet: + pass + BACKBONES.register(ResNet) + + Usage: To build a module. + + .. 
code-block:: python + backbone_name = "ResNet" + b = BACKBONES.get(backbone_name)() + + """ + def __init__(self, name): + """ + Args: + name (str): the name of this registry + """ + self._name = name + self._obj_map = {} + + def __contains__(self, key): + return self._obj_map.get(key) is not None + + def _do_register(self, name, obj): + assert ( + name not in self._obj_map + ), "An object named '{}' was already registered in '{}' registry!".format( + name, self._name) + self._obj_map[name] = obj + + def register(self, obj=None, name=None): + """ + Register the given object under the the name `obj.__name__`. + Can be used as either a decorator or not. See docstring of this class for usage. + """ + if obj is None: + # used as a decorator + def deco(func_or_class, name=name): + if name is None: + name = func_or_class.__name__ + self._do_register(name, func_or_class) + return func_or_class + + return deco + + # used as a function call + if name is None: + name = obj.__name__ + self._do_register(name, obj) + + def get(self, name): + """Get the registry record. + + Args: + name (str): The class name. + + Returns: + ret: The class. + """ + ret = self._obj_map.get(name) + if ret is None: + raise KeyError( + "No object named '{}' found in '{}' registry!".format( + name, self._name)) + + return ret diff --git a/docs/src/paddlevideo/utils/save_load.py b/docs/src/paddlevideo/utils/save_load.py new file mode 100644 index 000000000..10bb5f0d1 --- /dev/null +++ b/docs/src/paddlevideo/utils/save_load.py @@ -0,0 +1,289 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
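+# Checkpoint save/load helpers: the pretrain_*_param_trans functions below translate
+# 2D pre-trained weights (Swin, ViT, ResNet18) into the parameter shapes expected by
+# the corresponding video models, and load_ckpt uses them when loading pre-trained
+# weights into a model.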
+import os +import os.path as osp +import time + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger, main_only +from tqdm import tqdm +import numpy as np +from scipy import ndimage + + +def pretrain_swin_param_trans(model, state_dicts): + # delete classifier's params + if 'head.fc' + '.weight' in state_dicts: + del state_dicts['head.fc' + '.weight'] + if 'head.fc' + '.bias' in state_dicts: + del state_dicts['head.fc' + '.bias'] + + state_dicts = { + k.replace('backbone.', ''): v + for k, v in state_dicts.items() + } + + if len(state_dicts) == len(model.state_dict()): + print("Load 3D weights") + return state_dicts + + print("Load 2D weights") + relative_position_index_keys = [ + k for k in state_dicts.keys() if "relative_position_index" in k + ] + for k in relative_position_index_keys: + del state_dicts[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dicts.keys() if "attn_mask" in k] + for k in attn_mask_keys: + del state_dicts[k] + + state_dicts['patch_embed.proj.weight'] = state_dicts[ + 'patch_embed.proj.weight'].unsqueeze(2).tile( + [1, 1, model.patch_size[0], 1, 1]) / model.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dicts.keys() if "relative_position_bias_table" in k + ] + total_len = len(relative_position_bias_table_keys) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for key in tqdm(relative_position_bias_table_keys, + total=total_len, + position=0): + relative_position_bias_table_pretrained = state_dicts[key] + relative_position_bias_table_current = model.state_dict()[key] + L1, nH1 = relative_position_bias_table_pretrained.shape + L2, nH2 = relative_position_bias_table_current.shape + L2 = (2 * model.window_size[1] - 1) * (2 * model.window_size[2] - 1) + wd = model.window_size[0] + if nH1 != nH2: + desc.set_description(f"Error in loading {key}, skip") + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = paddle.nn.functional.interpolate( + relative_position_bias_table_pretrained.transpose( + [1, 0]).reshape([1, nH1, S1, S1]), + size=(2 * model.window_size[1] - 1, + 2 * model.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.reshape( + [nH2, L2]).transpose([1, 0]) + desc.set_description(f"Loading {key}") + state_dicts[key] = relative_position_bias_table_pretrained.tile( + [2 * wd - 1, 1]) + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return state_dicts + + +def pretrain_vit_param_trans(model, state_dicts, num_patches, num_seg, + attention_type): + """ + Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model + """ + if 'head' + '.weight' in state_dicts: + del state_dicts['head' + '.weight'] + if 'head' + '.bias' in state_dicts: + del state_dicts['head' + '.bias'] + + total_len = len(model.state_dict()) + if num_patches + 1 != state_dicts['pos_embed'].shape[1]: # when + pos_embed = state_dicts['pos_embed'] + cls_pos_embed = paddle.to_tensor( + pos_embed[0, 0, :]).unsqueeze(0).unsqueeze(1) + other_pos_embed = paddle.to_tensor(pos_embed[0, 1:, :]) + gs_new = int(np.sqrt(num_patches)) + gs_old = int(np.sqrt(other_pos_embed.shape[0])) + zoom = (gs_new / gs_old, gs_new / gs_old, 1) + other_pos_embed = 
paddle.reshape(other_pos_embed, [gs_old, gs_old, -1]) + other_pos_embed = ndimage.zoom(other_pos_embed, zoom, order=1) + other_pos_embed = paddle.to_tensor(other_pos_embed) + new_pos_embed = paddle.reshape(other_pos_embed, [1, num_patches, -1]) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1) + state_dicts['pos_embed'] = new_pos_embed + time.sleep(0.01) + + if 'time_embed' in state_dicts and num_seg != state_dicts[ + 'time_embed'].shape[1]: + time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(time_embed.shape[-2], num_seg), + mode='nearest') + state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose( + (0, 2, 1)) + time.sleep(0.01) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + if attention_type == 'divided_space_time': + new_state_dicts = state_dicts.copy() + for key in tqdm(state_dicts): + if 'blocks' in key and 'attn' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('attn', 'temporal_attn') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + new_state_dicts[new_key] = state_dicts[new_key] + if 'blocks' in key and 'norm1' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('norm1', 'temporal_norm1') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + new_state_dicts[new_key] = state_dicts[new_key] + time.sleep(0.01) + elif attention_type == 'space_only': # tokenshift raw vit + new_state_dicts = state_dicts.copy() + + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return new_state_dicts + + +def pretrain_resnet18_param_trans(model, loaded_dict): + encoder_dict = model.encoder.state_dict() + pose_encoder_dict = model.pose_encoder.state_dict() + + names = ['encoder.', 'encoder_day.', 'encoder_night.'] + for name in names: + total_len = len(loaded_dict.items()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for key, value in tqdm(loaded_dict.items(), + total=total_len, + position=0): + key = str(name + key) + if key in encoder_dict: + encoder_dict[key] = value + desc.set_description('Loading %s' % key) + time.sleep(0.01) + + num_input_images = 2 + loaded_dict['conv1.weight'] = paddle.concat( + [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images + total_len = len(loaded_dict.items()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for name, value in tqdm(loaded_dict.items(), + total=total_len, + position=0): + name = str('encoder.' + name) + if name in pose_encoder_dict: + pose_encoder_dict[name] = value + desc.set_description('Loading %s' % key) + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return encoder_dict, pose_encoder_dict + + +#XXX(shipping): maybe need load N times because of different cards have different params. +@main_only +def load_ckpt(model, weight_path, **kargs): + """ + 1. Load pre-trained model parameters + 2. Extract and convert from the pre-trained model to the parameters + required by the existing model + 3. 
Load the converted parameters of the existing model + """ + #model.set_state_dict(state_dict) + + if not osp.isfile(weight_path): + raise IOError(f'{weight_path} is not a checkpoint file') + #state_dicts = load(weight_path) + + logger = get_logger("paddlevideo") + state_dicts = paddle.load(weight_path) + if 'ResnetEncoder' in str(model): + encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans( + model, state_dicts) + model.encoder.load_dict(encoder_dict) + model.pose_encoder.load_dict(pose_encoder_dict) + tmp = model.state_dict() + elif "VisionTransformer" in str(model): # For TimeSformer case + tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'], + kargs['num_seg'], + kargs['attention_type']) + elif 'SwinTransformer3D' in str(model): + tmp = pretrain_swin_param_trans(model, state_dicts) + else: + tmp = {} + total_len = len(model.state_dict()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for item in tqdm(model.state_dict(), total=total_len, position=0): + name = item + desc.set_description('Loading %s' % name) + if name not in state_dicts: # Convert from non-parallel model + if str('backbone.' + name) in state_dicts: + tmp[name] = state_dicts['backbone.' + name] + else: # Convert from parallel model + tmp[name] = state_dicts[name] + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + model.set_state_dict(tmp) + + +def mkdir(dir): + if not os.path.exists(dir): + # avoid error when train with multiple gpus + try: + os.makedirs(dir) + except: + pass + + +def _extract_student_weights(all_params, student_prefix="Student."): + s_params = { + key[len(student_prefix):]: all_params[key] + for key in all_params if student_prefix in key + } + return s_params + + +@main_only +def save(obj, path, save_student_model=False): + if save_student_model: + s_params = _extract_student_weights(obj) + student_path = path.replace(".pdparams", "_student.pdparams") + if len(s_params) > 0: + paddle.save(s_params, student_path) + paddle.save(obj, path) + + +def load(file_name): + if not osp.isfile(file_name): + raise IOError(f'{file_name} not exist') + return paddle.load(file_name) diff --git a/docs/src/paddlevideo/version.py b/docs/src/paddlevideo/version.py new file mode 100644 index 000000000..b5b7f481f --- /dev/null +++ b/docs/src/paddlevideo/version.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
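As a worked illustration of what `pretrain_vit_param_trans` in `save_load.py` above does when the pretrained patch grid does not match the target model, here is a minimal NumPy/SciPy sketch of the positional-embedding resize. All shapes and values are illustrative assumptions; the real function operates on Paddle tensors inside the state dict:

```python
# Sketch (not part of the diff): resize a pretrained ViT positional embedding
# from a 7x7 patch grid to a 14x14 grid, keeping the class token unchanged.
import numpy as np
from scipy import ndimage

num_patches_new = 196                       # target: 14 x 14 patch grid
pos_embed = np.random.rand(1, 50, 768)      # pretrained: 1 cls token + 7 x 7 grid

cls_tok = pos_embed[:, :1, :]               # class-token embedding is kept as-is
grid = pos_embed[0, 1:, :]                  # (49, 768) patch-position embeddings
gs_old = int(np.sqrt(grid.shape[0]))        # 7
gs_new = int(np.sqrt(num_patches_new))      # 14

grid = grid.reshape(gs_old, gs_old, -1)
# interpolate over the spatial grid only; the feature dim keeps zoom factor 1
grid = ndimage.zoom(grid, (gs_new / gs_old, gs_new / gs_old, 1), order=1)
grid = grid.reshape(1, gs_new * gs_new, -1)

new_pos_embed = np.concatenate([cls_tok, grid], axis=1)
assert new_pos_embed.shape == (1, num_patches_new + 1, 768)
```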
+ +__all__ = ["paddlevideo_version"] +paddlevideo_version = "0.0.1" diff --git a/docs/src/run.sh b/docs/src/run.sh new file mode 100644 index 000000000..e26ff4d93 --- /dev/null +++ b/docs/src/run.sh @@ -0,0 +1,89 @@ +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +#export FLAGS_conv_workspace_size_limit=800 #MB +#export FLAGS_cudnn_exhaustive_search=1 +#export FLAGS_cudnn_batchnorm_spatial_persistent=1 + + +start_time=$(date +%s) + +# run pp-tsm training +#python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml + +# run pp-tsm_v2 distillation training +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm_v2 main.py --validate -c configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml + +# run ava training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=logdir.ava_part main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_part.yaml +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=logdir.ava_all.1203 main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava_all.yaml + +# run adds training +# python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20 + +# run tsm training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml + +# run tsm amp training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml + +# run tsm amp training, nhwc +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml + +# run tsn training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml + +# run video-swin-transformer training +# python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin main.py --amp --validate -c configs/recognition/videoswin/videoswin_k400_videos.yaml + +# run slowfast training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml + +# run slowfast multi-grid training +# python3.7 -B -m paddle.distributed.launch --selected_gpus="0,1,2,3,4,5,6,7" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml + +# run bmn training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml + +# run attention_lstm training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube-8m.yaml + +# run pp-tsn training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml + +# run timesformer training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml + +# run pp-timesformer training +# python3.7 -B -m 
paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml + +# run st-gcn training +# python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml + +# run agcn training +# python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml + +# run actbert training +# python3.7 main.py --validate -c configs/multimodal/actbert/actbert.yaml + +# run tsn dali training +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml + + +# test.sh +# just use `example` as example, please replace to real name. +# python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_test main.py --test -c configs/example.yaml -w "output/example/example_best.pdparams" + +# NOTE: run bmn test, only support single card, bs=1 +# python3.7 main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00010.pdparams -o DATASET.batch_size=1 + +# export_models script +# just use `example` as example, please replace to real name. +# python3.7 tools/export_model.py -c configs/example.yaml -p output/example/example_best.pdparams -o ./inference + +# predict script +# just use `example` as example, please replace to real name. +# python3.7 tools/predict.py -v example.avi --model_file "./inference/example.pdmodel" --params_file "./inference/example.pdiparams" --enable_benchmark=False --model="example" --num_seg=8 + +end_time=$(date +%s) +cost_time=$[ $end_time-$start_time ] +echo "Time to train is $(($cost_time/60))min $(($cost_time%60))s" diff --git a/docs/src/setup.py b/docs/src/setup.py new file mode 100644 index 000000000..8ad7069e7 --- /dev/null +++ b/docs/src/setup.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from setuptools import setup +from io import open + +with open('requirements.txt', encoding="utf-8-sig") as f: + requirements = f.readlines() + + +def readme(): + with open('docs/en/quick_start.md', encoding="utf-8-sig") as f: + README = f.read() + return README + + +setup( + name='ppvideo', #name of .whl file + packages=['ppvideo'], #install package name + package_dir={'ppvideo': ''}, + include_package_data= + True, #Accept all data files and directories matched by MANIFEST.in + install_requires=requirements, + entry_points={"console_scripts": ["ppvideo= ppvideo.tools.wheel:main"]}, + version='2.3.0', + license='Apache License 2.0', + description='Awesome Video toolkits based on PaddlePaddle ', + long_description=readme(), + long_description_content_type='text/markdown', + url='https://github.com/PaddlePaddle/PaddleVideo', + download_url='https://github.com/PaddlePaddle/PaddleVideo.git', + keywords=[ + 'A treasure chest for video understanding powered by PaddlePaddle.' 
+ ], + classifiers=[ + 'Intended Audience :: Developers', 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Topic :: Utilities' + ], +) diff --git a/docs/src/test_tipc/README.md b/docs/src/test_tipc/README.md new file mode 100644 index 000000000..9ff322755 --- /dev/null +++ b/docs/src/test_tipc/README.md @@ -0,0 +1,133 @@ + +# 飞桨训推一体认证(TIPC) + +## 1. 简介 + +飞桨除了基本的模型训练和预测,还提供了支持多端多平台的高性能推理部署工具。本文档提供了PaddleVideo中所有模型的飞桨训推一体认证 (Training and Inference Pipeline Certification(TIPC)) 信息和测试工具,方便用户查阅每种模型的训练推理部署打通情况,并可以进行一键测试。 + +
    + +## 2. 汇总信息 + +打通情况汇总如下,已填写的部分表示可以使用本工具进行一键测试,未填写的表示正在支持中。 + +**字段说明:** +- 基础训练预测:包括模型训练、Paddle Inference Python预测。 +- 更多训练方式:包括多机多卡(TODO)、混合精度。 +- 模型压缩:包括裁剪、离线/在线量化(TODO)、蒸馏(TODO)。 +- 其他预测部署:包括Paddle Inference C++预测、Paddle Serving部署、Paddle-Lite部署(TODO)等。 + +更详细的mkldnn、Tensorrt等预测加速相关功能的支持情况可以查看各测试工具的[更多教程](#more)。 + +| 算法名称 | 模型名称 | 模型类型 | 基础
    训练预测 | 更多
    训练方式 | 模型压缩 | 其他预测部署 | +| :--- | :--- | :----: | :--------: | :---- | :---- | :---- | +| PP-TSM |pptsm_k400_frames_uniform | 动作识别 | 支持 | 混合精度 | 离线量化 | Paddle Inference: C++ | +| PP-TSN |pptsn_k400_videos | 动作识别 | 支持 | 混合精度 | - | Paddle Inference: C++ | +| AGCN |agcn_fsd | 动作识别 | 支持 | 混合精度 | - | - | +| STGCN |stgcn_fsd | 动作识别 | 支持 | 混合精度 | - | - | +| TimeSformer |timesformer_k400_videos | 动作识别 | 支持 | 混合精度 | - | - | +| SlowFast |slowfast | 动作识别 | 支持 | 混合精度 | - | - | +| TSM |tsm_k400_frames | 动作识别 | 支持 | 混合精度 | - | - | +| TSN |tsn_k400_frames | 动作识别 |支持|混合精度|-|-| +| AttentionLSTM |attention_lstm_youtube8m | 动作识别 | 支持 | 混合精度 | - | - | +| BMN |bmn | 动作时间定位 | 支持 | 混合精度 | - | - | + + + +## 3. 测试工具简介 +### 目录介绍 + +```shell +test_tipc/ +├── configs/ # 配置文件目录 +│ ├── PP-TSM/ +│ │ ├── train_infer_python.txt # PP-TSM在Linux上进行python训练预测(基础训练预测)的配置文件 +│ │ ├── serving_infer_cpp.txt # PP-TSM在Linux上进行cpp serving测试的配置文件 +│ │ ├── train_amp_infer_python.txt # PP-TSM在Linux上进行python训练预测(混合精度训练预测)的配置文件 +│ │ ├── serving_infer_python.txt # PP-TSM在Linux上进行python serving预测的配置文件 +│ │ └── train_ptq_infer_python.txt # PP-TSM在Linux上进行离线量化推理测试的配置文件 +│ ├── PP-TSN/ +│ │ ├── train_infer_python.txt # PP-TSN在Linux上进行python训练预测(基础训练预测)的配置文件 +│ │ ├── paddle2onnx_infer_python.txt # PP-TSN在Linux上进行Paddle2ONNX预测(基础训练预测)的配置文件 +│ │ ├── serving_infer_cpp.txt # PP-TSN在Linux上进行cpp serving测试的配置文件 +│ │ └── train_amp_infer_python.txt # PP-TSN在Linux上进行python训练预测(混合精度训练预测)的配置文件 +│ ├── ... +│ └── ... +├── results/ # 预先保存的预测结果,用于和实际预测结果进行精度比对 +│ ├── PP-TSM/ +│ │ ├── python_ppvideo_PP-TSM_results_fp16.txt # 预存的PP-TSM识别识别模型python预测fp16精度的结果 +│ │ └── python_ppvideo_PP-TSM_results_fp32.txt # 预存的PP-TSM识别识别模型python预测fp32精度的结果 +│ ├── PP-TSN/ +│ │ ├── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型python预测fp16精度的结果 +│ │ └── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型python预测fp32精度的结果 +│ ├── PP-TSN_CPP/ +│ │ ├── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型C++预测fp16精度的结果 +│ │ └── python_ppvideo_PP-TSN_results_fp32.txt # 预存的PP-TSN识别识别模型C++预测fp32精度的结果 +│ ├── ... +│ └── ... +├── prepare.sh # 完成test_*.sh运行所需要的数据和模型下载 +├── docs/ # 详细的TIPC各种功能文档 +├── test_train_inference_python.sh # 测试python训练预测的主程序 +├── test_inference_cpp.sh # 测试C++预测的主程序 +├── test_paddle2onnx.sh # 测试paddle2onnx转换与推理的主程序 +├── compare_results.py # 用于对比log中的预测结果与results中的预存结果精度误差是否在限定范围内 +└── README.md # 介绍文档 +``` + +### 测试流程概述 + +使用本工具,可以测试不同功能的支持情况,以及预测结果是否对齐,测试流程概括如下: +
    + + +1. 运行prepare.sh准备测试所需数据和模型; +2. 运行要测试的功能对应的测试脚本`test_*.sh`,产出log,由log可以看到不同配置是否运行成功; +3. 用`compare_results.py`对比log中的预测结果和预存在results目录下的结果,判断预测精度是否符合预期(在误差范围内)。 + +测试单项功能仅需两行命令,**如需测试不同模型/功能,替换配置文件即可**,命令格式如下: +```shell +# 功能:准备数据 +# 格式:bash + 运行脚本 + 参数1: 配置文件选择 + 参数2: 模式选择 +bash test_tipc/prepare.sh configs/[model_name]/[params_file_name] [Mode] + +# 功能:运行测试 +# 格式:bash + 运行脚本 + 参数1: 配置文件选择 + 参数2: 模式选择 +bash test_tipc/test_train_inference_python.sh configs/[model_name]/[params_file_name] [Mode] +``` + +例如,测试基本训练预测功能的`lite_train_lite_infer`模式,运行: +```shell +# 准备数据 +bash test_tipc/prepare.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer' +# 运行测试 +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/PP-TSM/train_infer_python.txt 'lite_train_lite_infer' +``` +关于本示例命令的更多信息可查看[基础训练预测使用文档](./docs/test_train_inference_python.md)。 + +### 配置文件命名规范 +在`configs`目录下存放所有模型测试需要用到的配置文件,配置文件的命名遵循如下规范: + +1. 基础训练预测配置简单命名为:`train_infer_python.txt`,表示**Linux环境下单机、不使用混合精度训练+python预测**,其完整命名对应`train_linux_gpu_normal_normal_infer_python_linux_gpu_cpu.txt`,由于本配置文件使用频率较高,这里进行了名称简化。 + +2. 其他带训练配置命名格式为:`train_训练硬件环境(linux_gpu/linux_dcu/…)_是否多机(fleet/normal)_是否混合精度(amp/normal)_预测模式(infer/lite/serving/js)_语言(cpp/python/java)_预测硬件环境(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`。如,linux gpu下多机多卡+混合精度链条测试对应配置 `train_linux_gpu_fleet_amp_infer_python_linux_gpu_cpu.txt`,linux dcu下基础训练预测对应配置 `train_linux_dcu_normal_normal_infer_python_linux_dcu.txt`。 + +3. 仅预测的配置(如serving、lite等)命名格式:`model_训练硬件环境(linux_gpu/linux_dcu/…)_是否多机(fleet/normal)_是否混合精度(amp/normal)_(infer/lite/serving/js)_语言(cpp/python/java)_预测硬件环境(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`,即,与2相比,仅第一个字段从train换为model,测试时模型直接下载获取,这里的“训练硬件环境”表示所测试的模型是在哪种环境下训练得到的。 + +**根据上述命名规范,可以直接从子目录名称和配置文件名找到需要测试的场景和功能对应的配置文件。** + + + +## 4. 开始测试 +各功能测试中涉及混合精度、裁剪、量化等训练相关,及mkldnn、Tensorrt等多种预测相关参数配置,请点击下方相应链接了解更多细节和使用教程: +- [test_train_inference_python 使用](docs/test_train_inference_python.md) :测试基于Python的模型训练、评估、推理等基本功能。 +- [test_amp_train_inference_python 使用](docs/test_train_amp_inference_python.md) :测试基于Python的**混合精度**模型训练、评估、推理等基本功能。 +- [test_inference_cpp 使用](docs/test_inference_cpp.md) :测试基于C++的模型推理功能。 +- [test_paddle2onnx 使用](docs/test_paddle2onnx.md) :测试基于python2onnx模型的推理功能。 +- [test_serving_infer_python 使用](docs/test_serving_infer_python.md) :测试基于Paddle Serving的服务化部署功能。 +- [test_serving_infer_cpp 使用](docs/test_serving_infer_cpp.md) :测试基于C++的模型推理功能。 +- [test_ptq_inference_python 使用](docs/test_train_ptq_inference_python.md) :测试离线量化训练推理功能。 +- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md):测试基于Python的多机多卡训练与推理等基本功能 diff --git a/docs/src/test_tipc/benchmark_train.sh b/docs/src/test_tipc/benchmark_train.sh new file mode 100644 index 000000000..02ade8140 --- /dev/null +++ b/docs/src/test_tipc/benchmark_train.sh @@ -0,0 +1,318 @@ +#!/bin/bash +source test_tipc/common_func.sh + +# set env +python=python +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) + +# BENCHMARK_ROOT='.' 
# only for self-test + +# run benchmark sh +# Usage: +# bash run_benchmark_train.sh config.txt params +# or +# bash run_benchmark_train.sh config.txt + +function func_parser_params(){ + strs=$1 + IFS="=" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_sed_params(){ + filename=$1 + line=$2 + param_value=$3 + params=`sed -n "${line}p" $filename` + IFS=":" + array=(${params}) + key=${array[0]} + value=${array[1]} + if [[ $value =~ 'benchmark_train' ]];then + IFS='=' + _val=(${value}) + param_value="${param_value}" + fi + new_params="${key}:${param_value}" + IFS=";" + cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'" + eval $cmd +} + +function set_gpu_id(){ + string=$1 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + gn=`expr $P - 1` + gpu_num=`expr $gn / $M` + seq=`seq -s "," 0 $gpu_num` + echo $seq +} + +function get_repo_name(){ + IFS=";" + cur_dir=$(pwd) + IFS="/" + arr=(${cur_dir}) + echo ${arr[-1]} +} + +FILENAME=$1 +# copy FILENAME as new +new_filename="./test_tipc/benchmark_train.txt" +cmd=`yes|cp $FILENAME $new_filename` +FILENAME=$new_filename +# MODE must be one of ['benchmark_train'] +MODE=$2 +PARAMS=$3 +REST_ARGS=$4 +# bash test_tipc/benchmark_train.sh /workspace/PaddleVideo/test_tipc/configs/BMN/train_infer_python.txt benchmark_train dynamicTostatic_bs8_fp32_DP_N1C8 + +to_static="" +# parse "to_static" options and modify trainer into "to_static_trainer" +if [[ $PARAMS =~ "dynamicTostatic" ]] ;then + to_static="d2sT_" + sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME + # clear PARAM contents + if [ $PARAMS = "to_static" ] ;then + PARAMS="" + fi +fi + +IFS=$'\n' +# parser params from train_benchmark.txt +dataline=`cat $FILENAME` +# parser params +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") + +# 获取'train_benchmark_params'所在的行数 +line_num=`grep -n -w "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +# for train log parser +batch_size=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +fp_items=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +epoch=$(func_parser_value "${lines[line_num]}") + +line_num=`expr $line_num + 1` +profile_option_key=$(func_parser_key "${lines[line_num]}") +profile_option_params=$(func_parser_value "${lines[line_num]}") +profile_option="${profile_option_key}:${profile_option_params}" + +line_num=`expr $line_num + 1` +flags_value=$(func_parser_value "${lines[line_num]}") + +# 设置每个模型max-iters,以获取稳定的ips +line_num=`expr $line_num + 1` +max_iters_value=$(func_parser_value "${lines[line_num]}") + +# set flags +IFS=";" +flags_list=(${flags_value}) +for _flag in ${flags_list[*]}; do + cmd="export ${_flag}" + eval $cmd +done + +# set log_name +repo_name=$(get_repo_name ) +SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log +mkdir -p "${SAVE_LOG}/benchmark_log/" +status_log="${SAVE_LOG}/benchmark_log/results.log" +# get benchmark profiling params : PROFILING_TIMER_ONLY=no|True|False +PROFILING_TIMER_ONLY=${PROFILING_TIMER_ONLY:-"True"} +# The number of lines in which train params can be replaced. 
+line_python=3 +line_gpuid=4 +line_precision=6 +line_epoch=7 +line_batchsize=9 +line_profile=12 +line_eval_py=24 +line_eval_py_2=25 +line_export_py=38 +line_export_py_2=28 +line_export_py_3=30 +line_norm_train=16 + +func_sed_params "$FILENAME" "${line_eval_py}" "null" +func_sed_params "$FILENAME" "${line_eval_py_2}" "null" +func_sed_params "$FILENAME" "${line_export_py}" "null" +func_sed_params "$FILENAME" "${line_export_py_2}" "null" +func_sed_params "$FILENAME" "${line_export_py_3}" "null" +func_sed_params "$FILENAME" "${line_python}" "$python" + + +# 末尾加上--max_iters=30和--log_interval=1,以便运行并输出足量数据 +set_log_interval_cmd="sed -i '${line_norm_train}s/.*/& --max_iters=${max_iters_value} -o log_interval=1/' '${filename}'" +eval $set_log_interval_cmd + +# 去掉--validate,benchmark不需要validate +remove_validate_cmd="sed -i '${line_norm_train}s/--validate//' '${filename}'" +eval $remove_validate_cmd + +# if params +if [ ! -n "$PARAMS" ] ;then + # PARAMS input is not a word. + IFS="|" + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +elif [[ ${PARAMS} = "dynamicTostatic" ]] ;then + IFS="|" + model_type=$PARAMS + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +else + # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} + IFS="_" + params_list=(${PARAMS}) + model_type=${params_list[0]} + batch_size=${params_list[1]} + batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` + precision=${params_list[2]} + run_mode=${params_list[3]} + device_num=${params_list[4]} + IFS=";" + + if [ ${precision} = "null" ];then + precision="fp32" + fi + + fp_items_list=($precision) + batch_size_list=($batch_size) + device_num_list=($device_num) +fi + +log_interval='--log_interval 1' +IFS="|" +for batch_size in ${batch_size_list[*]}; do + for precision in ${fp_items_list[*]}; do + for device_num in ${device_num_list[*]}; do + # sed batchsize and precision + func_sed_params "$FILENAME" "${line_precision}" "$precision" + func_sed_params "$FILENAME" "${line_batchsize}" "$batch_size" + func_sed_params "$FILENAME" "${line_epoch}" "$epoch" + gpu_id=$(set_gpu_id $device_num) + + if [ ${#gpu_id} -le 1 ];then + func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id + if [[ ${PROFILING_TIMER_ONLY} != "no" ]];then + echo "run profile" + # The default value of profile_option's timer_only parameter is True + if [[ ${PROFILING_TIMER_ONLY} = "False" ]];then + profile_option="${profile_option};timer_only=False" + fi + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling" + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/\"${profile_option}\"/" "${FILENAME}"` + # for models which need to accumulate gradient. 
+ if [[ ${model_name} =~ "TimeSformer" ]]; then + global_bs=`expr ${batch_size} \* ${device_num:3:4} \* 8` + modify_global_bs_cmd="sed -i '${line_norm_train}s/.*/& -o GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${filename}'" + eval $modify_global_bs_cmd + fi + # run test_train_inference_python.sh + cmd="timeout 5m bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval ${cmd} + eval "cat ${log_path}/${log_name}" + fi + echo "run without profile" + # without profile + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" + func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null + + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 5 \ + --device_num ${device_num} \ + --speed_unit instance/sec \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" "${model_name}" + else + IFS=";" + unset_env=`unset CUDA_VISIBLE_DEVICES` + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" + func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id + func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null + + # for models which need to accumulate gradient. 
+ if [[ ${model_name} =~ "TimeSformer" ]]; then + global_bs=`expr ${batch_size} \* ${device_num:3:4} \* 8` + modify_global_bs_cmd="sed -i '${line_norm_train}s/.*/& -o GRADIENT_ACCUMULATION.global_batch_size=${global_bs}/' '${filename}'" + eval $modify_global_bs_cmd + fi + + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 5 \ + --device_num ${device_num} \ + --speed_unit instance/sec \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" "${model_name}" + fi + done + done +done diff --git a/docs/src/test_tipc/common_func.sh b/docs/src/test_tipc/common_func.sh new file mode 100644 index 000000000..b12d3dd9c --- /dev/null +++ b/docs/src/test_tipc/common_func.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +function func_parser_key(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[0]} + echo ${tmp} +} + +function func_parser_value(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_set_params(){ + key=$1 + value=$2 + if [ ${key}x = "null"x ];then + echo " " + elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then + echo " " + else + echo "${key}=${value}" + fi +} + +function func_parser_params(){ + strs=$1 + IFS=":" + array=(${strs}) + key=${array[0]} + tmp=${array[1]} + IFS="|" + res="" + for _params in ${tmp[*]}; do + IFS="=" + array=(${_params}) + mode=${array[0]} + value=${array[1]} + if [[ ${mode} = ${MODE} ]]; then + IFS="|" + #echo $(func_set_params "${mode}" "${value}") + echo $value + break + fi + IFS="|" + done + echo ${res} +} + +function status_check(){ + last_status=$1 # the exit code + run_command=$2 + run_log=$3 + model_name=$4 + log_path=$5 + if [ $last_status -eq 0 ]; then + echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} + else + echo -e "\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} + fi +} diff --git a/docs/src/test_tipc/compare_results.py b/docs/src/test_tipc/compare_results.py new file mode 100644 index 000000000..dd8308dc9 --- /dev/null +++ b/docs/src/test_tipc/compare_results.py @@ -0,0 +1,171 @@ +import numpy as np +import os +import subprocess +import json +import argparse +import glob + + +def init_args(): + parser = argparse.ArgumentParser() + # params for testing assert allclose + parser.add_argument("--atol", type=float, default=1e-3) + parser.add_argument("--rtol", type=float, default=1e-3) + parser.add_argument("--gt_file", type=str, default="") + parser.add_argument("--log_file", type=str, default="") + parser.add_argument("--precision", type=str, default="fp32") + return parser + + +def parse_args(): + parser = init_args() + return parser.parse_args() + + +def run_shell_command(cmd): + p = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + 
shell=True) + out, err = p.communicate() + + if p.returncode == 0: + return out.decode('utf-8') + else: + return None + + +def parser_results_from_log_by_name(log_path, names_list): + if not os.path.exists(log_path): + raise ValueError("The log file {} does not exists!".format(log_path)) + + if names_list is None or len(names_list) < 1: + return [] + + parser_results = {} + lines = open(log_path, 'r').read().splitlines() + if 'python_infer' in log_path: # parse python inference + for line in lines: + split_items = line.replace('\t', ' ') + split_items = split_items.split(' ') + split_items = [item for item in split_items if len(item) > 0] + for name in names_list: + if name in line: + if '.' in split_items[-1]: + parser_results[name] = float(split_items[-1]) + else: + parser_results[name] = int(split_items[-1]) + else: # parse cpp inference + for line in lines: + split_items = line.replace('\t', ' ') + split_items = split_items.split(' ') + split_items = [item for item in split_items if len(item) > 0] + if all([(name + ':') in split_items for name in names_list]): + # print(split_items) + parser_results['class'] = int(split_items[2]) + parser_results['score'] = float(split_items[-1]) + return parser_results + + +def load_gt_from_file(gt_file): + if not os.path.exists(gt_file): + raise ValueError("The log file {} does not exists!".format(gt_file)) + with open(gt_file, 'r') as f: + data = f.readlines() + f.close() + parser_gt = {} + for line in data: + if 'top-1 class' in line: + split_items = line.replace('\t', ' ') + split_items = split_items.split(' ') + split_items = [item for item in split_items if len(item) > 0] + parser_gt['top-1 class'] = int(split_items[-1]) + elif 'top-1 score' in line: + split_items = line.replace('\t', ' ') + split_items = split_items.split(' ') + split_items = [item for item in split_items if len(item) > 0] + parser_gt['top-1 score'] = float(split_items[-1]) + elif "score" in line and 'segment' in line: + location_dict = eval(line) + parser_gt[f"score_{len(parser_gt)}"] = location_dict['score'] + parser_gt[f"segment_{len(parser_gt)}"] = location_dict['segment'] + elif "class:" in line and "score:" in line: + split_items = line.replace('\t', ' ') + split_items = split_items.split(' ') + split_items = [item for item in split_items if len(item) > 0] + parser_gt['class'] = int(split_items[2]) + parser_gt['score'] = float(split_items[-1]) + return parser_gt + + +def load_gt_from_txts(gt_file): + gt_list = glob.glob(gt_file) + gt_collection = {} + for gt_f in gt_list: + gt_dict = load_gt_from_file(gt_f) + basename = os.path.basename(gt_f) + if "fp32" in basename: + gt_collection["fp32"] = [gt_dict, gt_f] + elif "fp16" in basename: + gt_collection["fp16"] = [gt_dict, gt_f] + elif "int8" in basename: + gt_collection["int8"] = [gt_dict, gt_f] + else: + continue + return gt_collection + + +def collect_predict_from_logs(log_path, key_list): + log_list = glob.glob(log_path) + pred_collection = {} + for log_f in log_list: + pred_dict = parser_results_from_log_by_name(log_f, key_list) + key = os.path.basename(log_f) + pred_collection[key] = pred_dict + + return pred_collection + + +def testing_assert_allclose(dict_x, dict_y, atol=1e-7, rtol=1e-7): + for k in dict_x: + np.testing.assert_allclose(np.array(dict_x[k]), + np.array(dict_y[k]), + atol=atol, + rtol=rtol) + + +if __name__ == "__main__": + # Usage example: + # test python infer: + ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM/*.txt --log_file=./test_tipc/output/PP-TSM/python_infer_*.log 
+ # test cpp infer: + ## python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/PP-TSM_CPP/*.txt --log_file=./test_tipc/output/PP-TSM_CPP/cpp_infer_*.log + + args = parse_args() + + gt_collection = load_gt_from_txts(args.gt_file) + key_list = gt_collection["fp32"][0].keys() + pred_collection = collect_predict_from_logs(args.log_file, key_list) + for filename in pred_collection.keys(): + if "fp32" in filename: + gt_dict, gt_filename = gt_collection["fp32"] + elif "fp16" in filename: + gt_dict, gt_filename = gt_collection["fp16"] + elif "int8" in filename: + gt_dict, gt_filename = gt_collection["int8"] + else: + continue + pred_dict = pred_collection[filename] + try: + testing_assert_allclose(gt_dict, + pred_dict, + atol=args.atol, + rtol=args.rtol) + print( + "Assert allclose passed! The results of {} and {} are consistent!" + .format(filename, gt_filename)) + except Exception as E: + print(E) + raise ValueError( + "The results of {} and the results of {} are inconsistent!". + format(filename, gt_filename)) diff --git a/docs/src/test_tipc/extract_loss.py b/docs/src/test_tipc/extract_loss.py new file mode 100644 index 000000000..f6bcb9777 --- /dev/null +++ b/docs/src/test_tipc/extract_loss.py @@ -0,0 +1,102 @@ +import sys +import argparse +import re + + +def parameter_parser(): + parser = argparse.ArgumentParser(description="Support Args:") + parser.add_argument("-v", + "--valid-expr", + type=str, + default="*", + help="when not match, the line will discard.") + parser.add_argument("-e", + "--extract-expr", + type=str, + default="^{%s}$,", + help="the extract expr for the loss: loss {%f}") + parser.add_argument("-r", + "--reduction-expr", + type=str, + default="print", + help="print | sum | mean") + parser.add_argument("-n", + "--discard", + type=int, + default=0, + help="while reduction, discard [0:n] and [-n:]") + parser.add_argument("-d", "--debug", type=bool, default=False, help="debug") + return parser.parse_args() + + +args = parameter_parser() + + +def log(*inp, **kargs): + if args.debug: + print(*inp, **kargs) + + +def is_valid(line, valid_expr): + if valid_expr == "*": return True + if valid_expr in line: return True + return False + + +def extract(line, extract_expr): + """ + return tuple, the output will be + """ + log("Extract_expression is : ", extract_expr) + x = re.findall("\{%(.)\}", extract_expr) + assert len(x) == 1, "Must exist a {%d} | {%f} | {%s} " + t = x[0] + type_converter = { + 'f': float, + 'i': int, + 's': str, + } + type_extracter = { + "f": r'(-?\\d+\\.\\d+)', + "i": r'(-?\\d+)', + "s": r'(.*?)', + } + log(type_extracter[t]) + pattern = re.sub("\{%(.)\}", type_extracter[t], extract_expr, 1) + log("Created Pattern is: ", pattern) + x = re.findall(pattern, line) + if len(x) == 0: return None + assert len(x) == 1, f"Multi Match for `{extract_expr}` in line: \n{line}" + log("Find in line: ", x[0].strip()) + return type_converter[t](x[0].strip()) + + +def action(tuple_list, action): + # discard the warm up + if args.discard > 0: + tuple_list = tuple_list[args.discard:] + tuple_list = tuple_list[:-args.discard] + # do action for each item + if action == "sum": + print(sum(tuple_list)) + if action == "mean": + if len(tuple_list) == 0: print("null") + else: print(sum(tuple_list) / len(tuple_list)) + if action == "print": + for item in tuple_list: + print(item) + + +def main(): + current_step = 0 + tuple_list = [] + for line in sys.stdin: + line = line.strip() + if is_valid(line, args.valid_expr): + ret = extract(line, args.extract_expr) + if ret: 
tuple_list.append(ret) + action(tuple_list, args.reduction_expr) + + +if __name__ == "__main__": + main() diff --git a/docs/src/test_tipc/prepare.sh b/docs/src/test_tipc/prepare.sh new file mode 100644 index 000000000..96c4f5380 --- /dev/null +++ b/docs/src/test_tipc/prepare.sh @@ -0,0 +1,577 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 + +# set -xe + +:< train_small.list # 将train*.pkl的路径写入train_small.list + ls pkl_frame/validate*.pkl > val_small.list # 将validate*.pkl的路径写入val_small.list + + ${python} split_yt8m.py train_small.list # 拆分每个train*.pkl变成多个train*_split*.pkl + ${python} split_yt8m.py val_small.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl + + ls pkl_frame/train*_split*.pkl > train_small.list # 将train*_split*.pkl的路径重新写入train_small.list + ls pkl_frame/validate*_split*.pkl > val_small.list # 将validate*_split*.pkl的路径重新写入val_small.list + popd + elif [ ${model_name} == "SlowFast" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + elif [ ${model_name} == "BMN" ]; then + # pretrain lite train data + pushd ./data + mkdir bmn_data + cd bmn_data + wget -nc https://videotag.bj.bcebos.com/Data/BMN_lite/bmn_feat.tar.gz + tar -xf bmn_feat.tar.gz + wget -nc https://videotag.bj.bcebos.com/Data/BMN_lite/activitynet_1.3_annotations.json + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json + popd + elif [ ${model_name} == "TokenShiftVisionTransformer" ]; then + # download pretrained weights + wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "PoseC3D" ]; then + # pretrain lite train data + pushd ./data + mkdir posec3d_data + cd posec3d_data + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PoseC3D_data_small.tar + tar -xf PoseC3D_data_small.tar + popd + elif [ ${model_name} == "YOWO" ]; then + # pretrain lite train data + pushd ./data + wget -nc https://videotag.bj.bcebos.com/Data/ucf-24-lite.zip + unzip -qo ucf-24-lite.zip + pushd ./ucf24 + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams + popd + else + echo "Not added into TIPC yet." 
+ fi + +elif [ ${MODE} = "whole_train_whole_infer" ];then + if [ ${model_name} == "PP-TSM" ]; then + # pretrain whole train data + pushd ./data/k400 + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list + bash download_k400_data.sh train_link.list + bash download_k400_data.sh val_link.list + ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file + # download annotations + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "PP-TSN" ]; then + # pretrain whole train data + pushd ./data/k400 + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list + bash download_k400_data.sh train_link.list + bash download_k400_data.sh val_link.list + # download annotations + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "AGCN" ]; then + # pretrain whole train data + pushd data/fsd10 + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy + popd + elif [ ${model_name} == "STGCN" ]; then + # pretrain whole train data + pushd data/fsd10 + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy + popd + elif [ ${model_name} == "TSM" ]; then + # pretrain whole train data + pushd ./data/k400 + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list + bash download_k400_data.sh train_link.list + bash download_k400_data.sh val_link.list + ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file + # download annotations + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate + elif [ ${model_name} == "TSN" ]; then + # pretrain whole train data + pushd ./data/k400 + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list + bash download_k400_data.sh train_link.list + bash download_k400_data.sh val_link.list + ${python} extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 # extract frames from video file + # download annotations + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list + popd + # download pretrained weights + wget -nc -P ./data 
https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate + elif [ ${model_name} == "TimeSformer" ]; then + # pretrain whole train data + pushd ./data/k400 + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list + bash download_k400_data.sh train_link.list + bash download_k400_data.sh val_link.list + # download annotations + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list + popd + # download pretrained weights + wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "AttentionLSTM" ]; then + # pretrain whole train data + pushd data/yt8m + mkdir frame + cd frame + ## download & decompression training data + curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python + curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python + ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple + cd .. + ${python} tf2pkl.py ./frame ./pkl_frame/ + ls pkl_frame/train*.pkl > train.list # 将train*.pkl的路径写入train.list + ls pkl_frame/validate*.pkl > val.list # 将validate*.pkl的路径写入val.list + + ${python} split_yt8m.py train.list # 拆分每个train*.pkl变成多个train*_split*.pkl + ${python} split_yt8m.py val.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl + + ls pkl_frame/train*_split*.pkl > train.list # 将train*_split*.pkl的路径重新写入train.list + ls pkl_frame/validate*_split*.pkl > val.list # 将validate*_split*.pkl的路径重新写入val.list + popd + elif [ ${model_name} == "SlowFast" ]; then + # pretrain whole train data + pushd ./data/k400 + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list + wget -nc https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list + bash download_k400_data.sh train_link.list + bash download_k400_data.sh val_link.list + # download annotations + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list + wget -nc https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list + popd + elif [ ${model_name} == "BMN" ]; then + # pretrain whole train data + pushd ./data + mkdir bmn_data + cd bmn_data + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz + tar -xf bmn_feat.tar.gz + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json + popd + else + echo "Not added into TIPC yet." 
+ fi +elif [ ${MODE} = "lite_train_whole_infer" ];then + if [ ${model_name} == "PP-TSM" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar + tar -xf k400_rawframes_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "PP-TSN" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "AGCN" ]; then + # pretrain lite train data + pushd data/fsd10 + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy + popd + elif [ ${model_name} == "STGCN" ]; then + # pretrain lite train data + pushd data/fsd10 + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_data.npy + wget -nc https://videotag.bj.bcebos.com/Data/FSD_train_label.npy + popd + elif [ ${model_name} == "TSM" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar + tar -xf k400_rawframes_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate + elif [ ${model_name} == "TSN" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar + tar -xf k400_rawframes_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate + elif [ ${model_name} == "TimeSformer" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "AttentionLSTM" ]; then + # pretrain lite train data + pushd data/yt8m + ## download & decompression training data + wget -nc https://videotag.bj.bcebos.com/Data/yt8m_rawframe_small.tar + tar -xf yt8m_rawframe_small.tar + ${python} -m pip install tensorflow-gpu==1.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple + ${python} tf2pkl.py ./frame ./pkl_frame/ + ls pkl_frame/train*.pkl > train_small.list # 将train*.pkl的路径写入train_small.list + ls pkl_frame/validate*.pkl > val_small.list # 将validate*.pkl的路径写入val_small.list + + ${python} split_yt8m.py train_small.list # 拆分每个train*.pkl变成多个train*_split*.pkl + ${python} split_yt8m.py val_small.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl + + ls pkl_frame/train*_split*.pkl > train_small.list # 将train*_split*.pkl的路径重新写入train_small.list + ls pkl_frame/validate*_split*.pkl > val_small.list # 将validate*_split*.pkl的路径重新写入val_small.list + popd + elif [ ${model_name} == "SlowFast" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + elif [ ${model_name} == "BMN" ]; then + # pretrain lite train 
data + pushd ./data + mkdir bmn_data + cd bmn_data + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz + tar -xf bmn_feat.tar.gz + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json + popd + else + echo "Not added into TIPC yet." + fi +elif [ ${MODE} = "whole_infer" ];then + if [ ${model_name} = "PP-TSM" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate + elif [ ${model_name} = "PP-TSN" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate + elif [ ${model_name} == "AGCN" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams --no-check-certificate + elif [ ${model_name} == "STGCN" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams --no-check-certificate + elif [ ${model_name} == "TSM" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams --no-check-certificate + elif [ ${model_name} == "TSN" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams --no-check-certificate + elif [ ${model_name} == "TimeSformer" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams --no-check-certificate + elif [ ${model_name} == "AttentionLSTM" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams --no-check-certificate + elif [ ${model_name} == "SlowFast" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams --no-check-certificate + elif [ ${model_name} == "BMN" ]; then + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams --no-check-certificate + else + echo "Not added into TIPC yet." + fi +fi + +if [ ${MODE} = "benchmark_train" ];then + ${python} -m pip install -r requirements.txt + if [ ${model_name} == "PP-TSM" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar + tar -xf k400_rawframes_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "PP-TSN" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "AGCN" ]; then + echo "Not added into TIPC yet." + elif [ ${model_name} == "STGCN" ]; then + echo "Not added into TIPC yet." 
+ elif [ ${model_name} == "TSM" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar + tar -xf k400_rawframes_small.tar + # download datalist for fleet benchmark + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/train_fleet_frames.list + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/val_fleet_frames.list + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate + elif [ ${model_name} == "TSN" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar + tar -xf k400_rawframes_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams --no-check-certificate + elif [ ${model_name} == "TimeSformer" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams --no-check-certificate + elif [ ${model_name} == "AttentionLSTM" ]; then + echo "Not added into TIPC yet." + elif [ ${model_name} == "SlowFast" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + elif [ ${model_name} == "BMN" ]; then + # pretrain lite train data + pushd ./data + mkdir bmn_data + cd bmn_data + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz + tar -xf bmn_feat.tar.gz + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json + wget -nc https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json + popd + elif [ ${model_name} == "VideoSwin" ]; then + # pretrain lite train data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar + tar -xf k400_videos_small.tar + popd + # download pretrained weights + wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams --no-check-certificate + else + echo "Not added into TIPC yet." + fi +fi + +if [ ${MODE} = "klquant_whole_infer" ]; then + if [ ${model_name} = "PP-TSM" ]; then + # download lite data + pushd ./data/k400 + wget -nc https://videotag.bj.bcebos.com/Data/k400_rawframes_small.tar + tar -xf k400_rawframes_small.tar + popd + # download inference model + mkdir ./inference + pushd ./inference + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate + unzip ppTSM.zip + popd + else + echo "Not added into TIPC yet." 
+ fi +fi + +if [ ${MODE} = "cpp_infer" ];then + # install required packages + apt-get update + apt install libavformat-dev + apt install libavcodec-dev + apt install libswresample-dev + apt install libswscale-dev + apt install libavutil-dev + apt install libsdl1.2-dev + apt-get install ffmpeg + + if [ ${model_name} = "PP-TSM" ]; then + # download pretrained weights + wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams --no-check-certificate + # export inference model + ${python} tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml -p data/ppTSM_k400_uniform.pdparams -o ./inference/ppTSM + elif [ ${model_name} = "PP-TSN" ]; then + # download pretrained weights + wget -nc -P data/ https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams --no-check-certificate + # export inference model + ${python} tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_videos.yaml -p data/ppTSN_k400.pdparams -o ./inference/ppTSN + else + echo "Not added into TIPC now." + fi +fi + +if [ ${MODE} = "serving_infer_python" ];then + if [[ ${model_name} == "PP-TSM" ]];then + # prepare lite infer data for serving + pushd ./data + mkdir python_serving_infer_video_dir + cp ./example.avi python_serving_infer_video_dir/ + popd + # prepare inference model + mkdir ./inference + pushd ./inference + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM.zip --no-check-certificate + unzip ppTSM.zip + popd + elif [[ ${model_name} == "PP-TSN" ]];then + # prepare lite infer data for serving + pushd ./data + mkdir python_serving_infer_video_dir + cp ./example.avi python_serving_infer_video_dir/ + popd + # prepare inference model + mkdir ./inference + pushd ./inference + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip --no-check-certificate + unzip ppTSN.zip + popd + else + echo "Not added into TIPC now." + fi +fi + +if [ ${MODE} = "paddle2onnx_infer" ];then + # install paddle2onnx + python_name_list=$(func_parser_value "${lines[2]}") + IFS='|' + array=(${python_name_list}) + python_name=${array[0]} + ${python_name} -m pip install paddle2onnx + ${python_name} -m pip install onnxruntime==1.9.0 + + if [ ${model_name} = "PP-TSM" ]; then + echo "Not added into TIPC now." + elif [ ${model_name} = "PP-TSN" ]; then + mkdir -p ./inference + wget -P ./inference/ https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSN.zip + # unzip inference model + pushd ./inference + unzip ppTSN.zip + popd + else + echo "Not added into TIPC now." 
+ fi +fi diff --git a/docs/src/test_tipc/test_inference_cpp.sh b/docs/src/test_tipc/test_inference_cpp.sh new file mode 100644 index 000000000..855d4ebb1 --- /dev/null +++ b/docs/src/test_tipc/test_inference_cpp.sh @@ -0,0 +1,228 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +MODE=$2 +dataline=$(awk 'NR==1, NR==18{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# parser cpp inference model +model_name=$(func_parser_value "${lines[1]}") +use_opencv=$(func_parser_value "${lines[2]}") +cpp_infer_model_dir_list=$(func_parser_value "${lines[3]}") +cpp_infer_is_quant=$(func_parser_value "${lines[4]}") +# parser cpp inference +inference_cmd=$(func_parser_value "${lines[5]}") +cpp_use_gpu_key=$(func_parser_key "${lines[6]}") +cpp_use_gpu_list=$(func_parser_value "${lines[6]}") +cpp_use_mkldnn_key=$(func_parser_key "${lines[7]}") +cpp_use_mkldnn_list=$(func_parser_value "${lines[7]}") +cpp_cpu_threads_key=$(func_parser_key "${lines[8]}") +cpp_cpu_threads_list=$(func_parser_value "${lines[8]}") +cpp_batch_size_key=$(func_parser_key "${lines[9]}") +cpp_batch_size_list=$(func_parser_value "${lines[9]}") +cpp_use_trt_key=$(func_parser_key "${lines[10]}") +cpp_use_trt_list=$(func_parser_value "${lines[10]}") +cpp_precision_key=$(func_parser_key "${lines[11]}") +cpp_precision_list=$(func_parser_value "${lines[11]}") +cpp_infer_model_key=$(func_parser_key "${lines[12]}") +cpp_image_dir_key=$(func_parser_key "${lines[13]}") +cpp_infer_img_dir=$(func_parser_value "${lines[13]}") +cpp_infer_key1=$(func_parser_key "${lines[14]}") +cpp_infer_value1=$(func_parser_value "${lines[14]}") +cpp_benchmark_key=$(func_parser_key "${lines[15]}") +cpp_benchmark_value=$(func_parser_value "${lines[15]}") +cpp_infer_key2=$(func_parser_key "${lines[16]}") +cpp_infer_value2=$(func_parser_value "${lines[16]}") +cpp_infer_key3=$(func_parser_key "${lines[17]}") +cpp_infer_value3=$(func_parser_value "${lines[17]}") + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_cpp.log" + + +function func_cpp_inference(){ + IFS='|' + _script=$1 + _model_dir=$2 + _log_path=$3 + _img_dir=$4 + _flag_quant=$5 + # inference + for use_gpu in ${cpp_use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${cpp_use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpp_cpu_threads_list[*]}; do + for batch_size in ${cpp_batch_size_list[*]}; do + precision="fp32" + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + precison="int8" + fi + _save_log_path="${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}") + set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpp_cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}") + set_infer_params2=$(func_set_params "${cpp_infer_key2}" "${cpp_infer_value2}") + set_infer_params3=$(func_set_params "${cpp_infer_key3}" "${cpp_infer_value3}") + command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${cpp_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} 
${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${cpp_use_trt_list[*]}; do + for precision in ${cpp_precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then + continue + fi + for batch_size in ${cpp_batch_size_list[*]}; do + _save_log_path="${_log_path}/cpp_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}") + set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${cpp_use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${cpp_precision_key}" "${precision}") + set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}") + set_infer_params2=$(func_set_params "${cpp_infer_key2}" "${cpp_infer_value2}") + set_infer_params3=$(func_set_params "${cpp_infer_key3}" "${cpp_infer_value3}") + command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params2} ${set_infer_params3} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" + + done + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + + +cd deploy/cpp_infer +if [ ${use_opencv} = "True" ]; then + if [ -d "opencv-3.4.7/opencv3/" ] && [ $(md5sum opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ];then + echo "################### build opencv skipped ###################" + else + echo "################### building opencv ###################" + rm -rf opencv-3.4.7.tar.gz opencv-3.4.7/ + wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/opencv-3.4.7.tar.gz + tar -xf opencv-3.4.7.tar.gz + + cd opencv-3.4.7/ + install_path=$(pwd)/opencv3 + + rm -rf build + mkdir build + cd build + + cmake .. 
\ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON \ + -DWITH_FFMPEG=ON + + make -j + make install + cd ../ + echo "################### building opencv finished ###################" + fi +fi + + +if [ -d "paddle_inference" ]; then + echo "################### download inference lib skipped ###################" +else + echo "################### downloading inference lib ###################" + wget -nc https://paddle-inference-lib.bj.bcebos.com/2.1.1-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddle_inference.tgz + tar -xf paddle_inference.tgz + echo "################### downloading inference lib finished ###################" +fi + +echo "################### building PaddleVideo demo ####################" +if [ ${use_opencv} = "True" ]; then + OPENCV_DIR=$(pwd)/opencv-3.4.7/opencv3 +else + OPENCV_DIR='' +fi + +LIB_DIR=$(pwd)/paddle_inference +CUDA_LIB_DIR=$(dirname `find /usr -name libcudart.so`) +CUDNN_LIB_DIR=$(dirname `find /usr -name libcudnn.so`) + +BUILD_DIR=build +rm -rf ${BUILD_DIR} +mkdir ${BUILD_DIR} +cd ${BUILD_DIR} +cmake .. \ + -DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=ON \ + -DWITH_GPU=OFF \ + -DWITH_STATIC_LIB=OFF \ + -DWITH_TENSORRT=OFF \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DCUDNN_LIB=${CUDNN_LIB_DIR} \ + -DCUDA_LIB=${CUDA_LIB_DIR} \ + -DTENSORRT_DIR=${TENSORRT_DIR} \ + +make -j +cd ../../../ +echo "################### building PaddleVideo demo finished ###################" + + +# set cuda device +GPUID=$2 +if [ ${#GPUID} -le 0 ];then + env=" " +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +set CUDA_VISIBLE_DEVICES +eval $env + + +echo "################### running test ###################" +export Count=0 +IFS="|" +infer_quant_flag=(${cpp_infer_is_quant}) +for infer_model in ${cpp_infer_model_dir_list[*]}; do + #run inference + is_quant=${infer_quant_flag[Count]} + func_cpp_inference "${inference_cmd}" "${infer_model}" "${LOG_PATH}" "${cpp_infer_img_dir}" ${is_quant} + Count=$(($Count + 1)) +done diff --git a/docs/src/test_tipc/test_paddle2onnx.sh b/docs/src/test_tipc/test_paddle2onnx.sh new file mode 100644 index 000000000..add31dd9b --- /dev/null +++ b/docs/src/test_tipc/test_paddle2onnx.sh @@ -0,0 +1,81 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +MODE=$2 +dataline=$(cat ${FILENAME}) +lines=(${dataline}) +# common params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") + + +# parser params +dataline=$(awk 'NR==1, NR==14{print}' $FILENAME) +IFS=$'\n' +lines=(${dataline}) + +# parser paddle2onnx +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +padlle2onnx_cmd=$(func_parser_value "${lines[3]}") +infer_model_dir_key=$(func_parser_key "${lines[4]}") +infer_model_dir_value=$(func_parser_value "${lines[4]}") +model_filename_key=$(func_parser_key "${lines[5]}") +model_filename_value=$(func_parser_value "${lines[5]}") +params_filename_key=$(func_parser_key "${lines[6]}") +params_filename_value=$(func_parser_value "${lines[6]}") +save_file_key=$(func_parser_key "${lines[7]}") +save_file_value=$(func_parser_value "${lines[7]}") +opset_version_key=$(func_parser_key "${lines[8]}") +opset_version_value=$(func_parser_value "${lines[8]}") 
+enable_onnx_checker_key=$(func_parser_key "${lines[9]}") +enable_onnx_checker_value=$(func_parser_value "${lines[9]}") +# parser onnx inference +inference_py=$(func_parser_value "${lines[10]}") +config_key=$(func_parser_key "${lines[11]}") +config_value=$(func_parser_value "${lines[11]}") +model_key=$(func_parser_key "${lines[12]}") +input_file_key=$(func_parser_key "${lines[13]}") +input_file_value=$(func_parser_value "${lines[13]}") + + +LOG_PATH="./log/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_paddle2onnx.log" + + +function func_paddle2onnx(){ + IFS='|' + _script=$1 + + # paddle2onnx + _save_log_path="${LOG_PATH}/paddle2onnx_infer_cpu.log" + set_dirname=$(func_set_params "${infer_model_dir_key}" "${infer_model_dir_value}") + set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}") + set_params_filename=$(func_set_params "${params_filename_key}" "${params_filename_value}") + set_save_model=$(func_set_params "${save_file_key}" "${save_file_value}") + set_opset_version=$(func_set_params "${opset_version_key}" "${opset_version_value}") + set_enable_onnx_checker=$(func_set_params "${enable_onnx_checker_key}" "${enable_onnx_checker_value}") + trans_log="${LOG_PATH}/trans_model.log" + trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} > ${trans_log} 2>&1 " + eval $trans_model_cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" + # python inference + set_gpu=$(func_set_params "${use_gpu_key}" "${use_gpu_value}") + set_model_dir=$(func_set_params "${model_key}" "${save_file_value}") + set_input_file=$(func_set_params "${input_file_key}" "${input_file_value}") + set_config=$(func_set_params "${config_key}" "${config_value}") + infer_model_cmd="${python} ${inference_py} ${set_config} ${set_input_file} ${set_model_dir} > ${_save_log_path} 2>&1 " + eval $infer_model_cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${infer_model_cmd}" "${status_log}" "${model_name}" +} + + +echo "################### run test ###################" + +export Count=0 +IFS="|" +func_paddle2onnx diff --git a/docs/src/test_tipc/test_ptq_inference_python.sh b/docs/src/test_tipc/test_ptq_inference_python.sh new file mode 100644 index 000000000..369772769 --- /dev/null +++ b/docs/src/test_tipc/test_ptq_inference_python.sh @@ -0,0 +1,132 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer'] +MODE=$2 + +dataline=$(awk 'NR==1, NR==32{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +use_gpu_key=$(func_parser_key "${lines[3]}") +use_gpu_value=$(func_parser_value "${lines[3]}") +quant_config_file_key=$(func_parser_key "${lines[4]}") +quant_config_file_value=$(func_parser_value "${lines[4]}") +model_path_key=$(func_parser_key "${lines[5]}") +model_path_value=$(func_parser_value "${lines[5]}") +output_dir_key=$(func_parser_key "${lines[6]}") +output_dir_value=$(func_parser_value "${lines[6]}") +data_dir_key=$(func_parser_key "${lines[7]}") +data_dir_value=$(func_parser_value "${lines[7]}") +data_anno_key=$(func_parser_key "${lines[8]}") +data_anno_value=$(func_parser_value "${lines[8]}") +batch_num_key=$(func_parser_key 
"${lines[9]}") +batch_num_value=$(func_parser_value "${lines[9]}") +quant_batch_size_key=$(func_parser_key "${lines[10]}") +quant_batch_size_value=$(func_parser_value "${lines[10]}") + +# parser trainer +train_py=$(func_parser_value "${lines[13]}") + +# parser inference +inference_py=$(func_parser_value "${lines[16]}") +use_gpu_key=$(func_parser_key "${lines[17]}") +use_gpu_list=$(func_parser_value "${lines[17]}") +infer_config_file_key=$(func_parser_key "${lines[18]}") +infer_config_file_value=$(func_parser_value "${lines[18]}") +infer_batch_size_key=$(func_parser_key "${lines[19]}") +infer_batch_size_list=$(func_parser_value "${lines[19]}") +infer_model_key=$(func_parser_key "${lines[20]}") +infer_model_value=$(func_parser_value "${lines[20]}") +infer_params_key=$(func_parser_key "${lines[21]}") +infer_params_value=$(func_parser_value "${lines[21]}") +infer_video_key=$(func_parser_key "${lines[22]}") +infer_video_dir=$(func_parser_value "${lines[22]}") +benchmark_key=$(func_parser_key "${lines[23]}") +benchmark_value=$(func_parser_value "${lines[23]}") + + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _img_dir=$5 + # inference + for use_gpu in ${use_gpu_list[*]}; do + # cpu + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for batch_size in ${infer_batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_cpu_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${infer_video_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${infer_batch_size_key}" "${batch_size}") + set_model_file_path=$(func_set_params "${infer_model_key}" "${infer_model_value}") + set_params_file_path=$(func_set_params "${infer_params_key}" "${infer_params_value}") + set_config_file_path=$(func_set_params "${infer_config_file_key}" "${infer_config_file_value}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_config_file_path} ${set_model_file_path} ${set_params_file_path} ${set_batchsize} ${set_infer_data} ${set_benchmark} > ${_save_log_path} 2>&1 " + # echo $command + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" + done + # gpu + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for batch_size in ${infer_batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${infer_video_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${infer_batch_size_key}" "${batch_size}") + set_model_file_path=$(func_set_params "${infer_model_key}" "${infer_model_value}") + set_params_file_path=$(func_set_params "${infer_params_key}" "${infer_params_value}") + set_config_file_path=$(func_set_params "${infer_config_file_key}" "${infer_config_file_value}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_config_file_path} ${set_model_file_path} ${set_params_file_path} ${set_batchsize} ${set_infer_data} ${set_benchmark} > ${_save_log_path} 2>&1 " + echo $command + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" + done + else + echo "Does not support hardware other than CPU and GPU Currently!" 
+ fi + done +} + +# log +LOG_PATH="./log/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + +if [ ${MODE} = "whole_infer" ]; then + IFS="|" + # run export + set_output_dir=$(func_set_params "${output_dir_key}" "${output_dir_value}") + set_data_dir=$(func_set_params "${data_dir_key}" "${data_dir_value}") + set_data_anno=$(func_set_params "${data_anno_key}" "${data_anno_value}") + set_batch_size=$(func_set_params "${quant_batch_size_key}" "${quant_batch_size_value}") + set_batch_num=$(func_set_params "${batch_num_key}" "${batch_num_value}") + set_model_path=$(func_set_params "${model_path_key}" "${model_path_value}") + set_config_file=$(func_set_params "${quant_config_file_key}" "${quant_config_file_value}") + set_use_gpu=$(func_set_params "${use_gpu_key}" "${use_gpu_value}") + + export_log_path="${LOG_PATH}/${MODE}_export_${Count}.log" + export_cmd="${python} ${train_py} ${set_use_gpu} ${set_config_file} ${set_model_path} ${set_batch_num} ${set_batch_size} ${set_data_dir} ${set_data_anno} ${set_output_dir} > ${export_log_path} 2>&1 " + echo $export_cmd + eval $export_cmd + status_export=$? + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" + + save_infer_dir=${output_dir_value} + #run inference + func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_video_dir}" + +fi diff --git a/docs/src/test_tipc/test_serving_infer_cpp.sh b/docs/src/test_tipc/test_serving_infer_cpp.sh new file mode 100644 index 000000000..7b7480cc7 --- /dev/null +++ b/docs/src/test_tipc/test_serving_infer_cpp.sh @@ -0,0 +1,107 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +MODE=$2 +dataline=$(awk 'NR==1, NR==18{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# parser serving +model_name=$(func_parser_value "${lines[1]}") +python_list=$(func_parser_value "${lines[2]}") +trans_model_py=$(func_parser_value "${lines[3]}") +infer_model_dir_key=$(func_parser_key "${lines[4]}") +infer_model_dir_value=$(func_parser_value "${lines[4]}") +model_filename_key=$(func_parser_key "${lines[5]}") +model_filename_value=$(func_parser_value "${lines[5]}") +params_filename_key=$(func_parser_key "${lines[6]}") +params_filename_value=$(func_parser_value "${lines[6]}") +serving_server_key=$(func_parser_key "${lines[7]}") +serving_server_value=$(func_parser_value "${lines[7]}") +serving_client_key=$(func_parser_key "${lines[8]}") +serving_client_value=$(func_parser_value "${lines[8]}") +serving_dir_value=$(func_parser_value "${lines[9]}") +run_model_path_key=$(func_parser_key "${lines[10]}") +run_model_path_value=$(func_parser_value "${lines[10]}") +port_key=$(func_parser_key "${lines[11]}") +port_value=$(func_parser_value "${lines[11]}") +cpp_client_value=$(func_parser_value "${lines[12]}") +input_video_key=$(func_parser_key "${lines[13]}") +input_video_value=$(func_parser_value "${lines[13]}") + + +LOG_PATH="./test_tipc/output/log/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_serving.log" + +function func_serving(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + + # phase 1: save model + set_dirname=$(func_set_params "${infer_model_dir_key}" "${infer_model_dir_value}") + set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}") + set_params_filename=$(func_set_params "${params_filename_key}" "${params_filename_value}") + set_serving_server=$(func_set_params "${serving_server_key}" "${serving_server_value}") + 
set_serving_client=$(func_set_params "${serving_client_key}" "${serving_client_value}") + python_list=(${python_list}) + python=${python_list[0]} + trans_log="${LOG_PATH}/cpp_trans_model.log" + trans_model_cmd="${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client} > ${trans_log} 2>&1 " + eval ${trans_model_cmd} + last_status=${PIPESTATUS[0]} + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" + + # modify the alias name of fetch_var to "outputs" + server_fetch_var_line_cmd="sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \"outputs\"/' $serving_server_value/serving_server_conf.prototxt" + eval ${server_fetch_var_line_cmd} + client_fetch_var_line_cmd="sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \"outputs\"/' $serving_client_value/serving_client_conf.prototxt" + eval ${client_fetch_var_line_cmd} + cd ${serving_dir_value} + echo $PWD + unset https_proxy + unset http_proxy + + _save_log_path="${LOG_PATH}/cpp_client_infer_gpu_batchsize_1.log" + # phase 2: run server + server_log_path="${LOG_PATH}/cpp_server_gpu.log" + cpp_server_cmd="${python} -m paddle_serving_server.serve ${run_model_path_key} ${run_model_path_value} ${port_key} ${port_value} > ${server_log_path} 2>&1 &" + eval ${cpp_server_cmd} + sleep 20s + + # phase 3: run client + real_model_name=${model_name/PP-/PP} + serving_client_conf_path="${serving_client_value/deploy\/cpp_serving\/}" + serving_client_conf_path="${serving_client_conf_path/\/\//}serving_client_conf.prototxt" + cpp_client_cmd="${python} ${cpp_client_value} -n ${real_model_name} -c ${serving_client_conf_path} ${input_video_key} ${input_video_value} > ${_save_log_path} 2>&1 " + eval ${cpp_client_cmd} + last_status=${PIPESTATUS[0]} + + eval "cat ${_save_log_path}" + cd ../../ + status_check $last_status "${cpp_server_cmd}" "${status_log}" "${model_name}" + ps ux | grep -i 'paddle_serving_server' | awk '{print $2}' | xargs kill -s 9 +} + + +# set cuda device +GPUID=$3 +if [ ${#GPUID} -le 0 ];then + env=" " +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +set CUDA_VISIBLE_DEVICES +eval $env + + +echo "################### run test ###################" + +export Count=0 +IFS="|" +func_serving "${web_service_cmd}" diff --git a/docs/src/test_tipc/test_serving_infer_python.sh b/docs/src/test_tipc/test_serving_infer_python.sh new file mode 100644 index 000000000..a9a072b9e --- /dev/null +++ b/docs/src/test_tipc/test_serving_infer_python.sh @@ -0,0 +1,105 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +dataline=$(awk 'NR==1, NR==18{print}' $FILENAME) +MODE=$2 + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# parser serving +model_name=$(func_parser_value "${lines[1]}") +python_list=$(func_parser_value "${lines[2]}") +trans_model_py=$(func_parser_value "${lines[3]}") +infer_model_dir_key=$(func_parser_key "${lines[4]}") +infer_model_dir_value=$(func_parser_value "${lines[4]}") +model_filename_key=$(func_parser_key "${lines[5]}") +model_filename_value=$(func_parser_value "${lines[5]}") +params_filename_key=$(func_parser_key "${lines[6]}") +params_filename_value=$(func_parser_value "${lines[6]}") +serving_server_key=$(func_parser_key "${lines[7]}") +serving_server_value=$(func_parser_value "${lines[7]}") +serving_client_key=$(func_parser_key "${lines[8]}") +serving_client_value=$(func_parser_value "${lines[8]}") +serving_dir_value=$(func_parser_value "${lines[9]}") +web_service_py=$(func_parser_value 
"${lines[10]}") +pipeline_py=$(func_parser_value "${lines[11]}") +video_dir_key=$(func_parser_key "${lines[12]}") +video_dir_value=$(func_parser_value "${lines[12]}") + + +LOG_PATH="./test_tipc/output/log/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_serving.log" + + +function func_serving(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + + # python serving code + set_dirname=$(func_set_params "${infer_model_dir_key}" "${infer_model_dir_value}") + set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}") + set_params_filename=$(func_set_params "${params_filename_key}" "${params_filename_value}") + set_serving_server=$(func_set_params "${serving_server_key}" "${serving_server_value}") + + set_serving_client=$(func_set_params "${serving_client_key}" "${serving_client_value}") + python_list=(${python_list}) + python=${python_list[0]} + trans_log="${LOG_PATH}/python_trans_model.log" + trans_model_cmd="${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client} > ${trans_log} 2>&1 " + + eval ${trans_model_cmd} + + # modify the alias name of fetch_var to "outputs" + server_fetch_var_line_cmd="sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \"outputs\"/' $serving_server_value/serving_server_conf.prototxt" + eval ${server_fetch_var_line_cmd} + client_fetch_var_line_cmd="sed -i '/fetch_var/,/is_lod_tensor/s/alias_name: .*/alias_name: \"outputs\"/' $serving_client_value/serving_client_conf.prototxt" + eval ${client_fetch_var_line_cmd} + + cd ${serving_dir_value} + echo 'PWD= '$PWD + unset https_proxy + unset http_proxy + + server_log_path="${LOG_PATH}/python_server_gpu.log" + web_service_cmd="${python} ${web_service_py} > ${server_log_path} 2>&1 &" + eval $web_service_cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" + sleep 30s # not too short is ok + + _save_log_path="../../${LOG_PATH}/python_server_infer_gpu_batchsize_1.log" + set_video_dir=$(func_set_params "${video_dir_key}" "${video_dir_value}") + pipeline_cmd="${python} ${pipeline_py} ${set_video_dir} > ${_save_log_path} 2>&1 " + + eval $pipeline_cmd + last_status=${PIPESTATUS[0]} + + eval "cat ${_save_log_path}" + cd ../../ + status_check $last_status "${pipeline_cmd}" "${status_log}" "${model_name}" + ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9 +} + + +# set cuda device +GPUID=$3 +if [ ${#GPUID} -le 0 ];then + env=" " +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +set CUDA_VISIBLE_DEVICES +eval $env + + +echo "################### run test ###################" + +export Count=0 +IFS="|" +func_serving "${web_service_cmd}" diff --git a/docs/src/test_tipc/test_train_dy2static_python.sh b/docs/src/test_tipc/test_train_dy2static_python.sh new file mode 100644 index 000000000..b82f3eb04 --- /dev/null +++ b/docs/src/test_tipc/test_train_dy2static_python.sh @@ -0,0 +1,73 @@ +source test_tipc/common_func.sh + +IFS=$'\n' +BASE_CONFIG_FILE=$1 +# always use the lite_train_lite_infer mode to speed. Modify the config file. +MODE=lite_train_lite_infer +BASEDIR=$(dirname "$0") + +# get the log path. +dataline=$(cat ${BASE_CONFIG_FILE}) +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +rm -rf $LOG_PATH +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + +# make cudnn algorithm deterministic, such as conv. 
+export FLAGS_cudnn_deterministic=True + +# read the base config and parse and run the sub commands +config_line_numbers=`cat ${BASE_CONFIG_FILE} | grep -n "============" | cut -d':' -f1` +for cln in $config_line_numbers +do + # change IFS to prevent \n is parsed as delimiter. + IFS="" + config_lines=$(cat ${BASE_CONFIG_FILE} | sed -n "${cln},\$p" | head -n 22) + config_name=`echo ${config_lines} | grep '=====' | cut -d' ' -f2` + FILENAME=$LOG_PATH/dy2static_$config_name.txt + echo "[Start dy2static]" "${config_name} : ${FILENAME}" + echo ${config_lines} > $FILENAME + sed -i 's/gpu_list.*$/gpu_list:0/g' $FILENAME + + # execute the last line command + custom_cmd=$(echo $config_lines | tail -n 1) + echo "CustomCmd is: " $custom_cmd + eval $custom_cmd + + IFS=$'\n' + + # start dygraph train + dygraph_output=$LOG_PATH/${config_name}_python_train_infer_dygraph_output.txt + dygraph_loss=$LOG_PATH/${config_name}_dygraph_loss.txt + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $MODE >$dygraph_output 2>&1" + echo $cmd + eval $cmd + + # start dy2static train + dy2static_output=$LOG_PATH/${config_name}_python_train_infer_dy2static_output.txt + dy2static_loss=$LOG_PATH/${config_name}_dy2static_loss.txt + sed -i '16s/$/ -o to_static=True/' ${FILENAME} + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $MODE >$dy2static_output 2>&1" + echo $cmd + eval $cmd + + # analysis and compare the losses. + dyout=`cat $dy2static_output | python test_tipc/extract_loss.py -v 'train step' -e 'loss: {%f} ' | head -n 3` + stout=`cat $dygraph_output | python test_tipc/extract_loss.py -v 'train step' -e 'loss: {%f} ' | head -n 3` + echo $dyout > $dygraph_loss + echo $stout > $dy2static_loss + diff_log=$LOG_PATH/${config_name}_diff_log.txt + diff_cmd="diff -w $dygraph_loss $dy2static_loss > $diff_log" + eval $diff_cmd + last_status=$? 
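+ # A non-zero diff exit status, or empty extracted-loss output from either run, is reported as a failure via status_check below.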
+ cat $diff_log + if [ "$dyout" = "" ]; then + status_check 1 $diff_cmd $status_log $model_name $diff_log + elif [ "$stout" = "" ]; then + status_check 2 $diff_cmd $status_log $model_name $diff_log + else + status_check $last_status $diff_cmd $status_log $model_name $diff_log + fi +done diff --git a/docs/src/test_tipc/test_train_inference_python.sh b/docs/src/test_tipc/test_train_inference_python.sh new file mode 100644 index 000000000..2b92332ee --- /dev/null +++ b/docs/src/test_tipc/test_train_inference_python.sh @@ -0,0 +1,433 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer'] +MODE=$2 + +dataline=$(cat ${FILENAME}) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +gpu_list=$(func_parser_value "${lines[3]}") +train_use_gpu_key=$(func_parser_key "${lines[4]}") +train_use_gpu_value=$(func_parser_value "${lines[4]}") +autocast_list=$(func_parser_value "${lines[5]}") +autocast_key=$(func_parser_key "${lines[5]}") +epoch_key=$(func_parser_key "${lines[6]}") +epoch_num=$(func_parser_value "${lines[6]}") +save_model_key=$(func_parser_key "${lines[7]}") +train_batch_key=$(func_parser_key "${lines[8]}") +train_batch_value=$(func_parser_value "${lines[8]}") +pretrain_model_key=$(func_parser_key "${lines[9]}") +pretrain_model_value=$(func_parser_value "${lines[9]}") +train_model_name=$(func_parser_value "${lines[10]}") +train_param_key1=$(func_parser_key "${lines[12]}") +train_param_value1=$(func_parser_value "${lines[12]}") +train_param_key2=$(func_parser_key "${lines[11]}") +train_param_value2=$(func_parser_value "${lines[11]}") + +trainer_list=$(func_parser_value "${lines[14]}") +trainer_norm=$(func_parser_key "${lines[15]}") +norm_trainer=$(func_parser_value "${lines[15]}") +pact_key=$(func_parser_key "${lines[16]}") +pact_trainer=$(func_parser_value "${lines[16]}") +fpgm_key=$(func_parser_key "${lines[17]}") +fpgm_trainer=$(func_parser_value "${lines[17]}") +distill_key=$(func_parser_key "${lines[18]}") +distill_trainer=$(func_parser_value "${lines[18]}") +amp_key=$(func_parser_key "${lines[19]}") +amp_trainer=$(func_parser_value "${lines[19]}") +trainer_key2=$(func_parser_key "${lines[20]}") +trainer_value2=$(func_parser_value "${lines[20]}") + +eval_py=$(func_parser_value "${lines[23]}") +eval_key1=$(func_parser_key "${lines[24]}") +eval_value1=$(func_parser_value "${lines[24]}") + +save_infer_key=$(func_parser_key "${lines[27]}") +save_infer_value=$(func_parser_value "${lines[27]}") + +export_weight=$(func_parser_key "${lines[28]}") +norm_export=$(func_parser_value "${lines[29]}") +pact_export=$(func_parser_value "${lines[30]}") +fpgm_export=$(func_parser_value "${lines[31]}") +distill_export=$(func_parser_value "${lines[32]}") +export_key1=$(func_parser_key "${lines[33]}") +export_value1=$(func_parser_value "${lines[33]}") +export_key2=$(func_parser_key "${lines[34]}") +export_value2=$(func_parser_value "${lines[34]}") +inference_dir=$(func_parser_value "${lines[35]}") + +# parser inference model +infer_model_dir_list=$(func_parser_value "${lines[36]}") +infer_export_list=$(func_parser_value "${lines[37]}") +infer_is_quant=$(func_parser_value "${lines[38]}") +# parser inference +inference_py=$(func_parser_value "${lines[39]}") +use_gpu_key=$(func_parser_key "${lines[40]}") +use_gpu_list=$(func_parser_value "${lines[40]}") 
+use_mkldnn_key=$(func_parser_key "${lines[41]}") +use_mkldnn_list=$(func_parser_value "${lines[41]}") +cpu_threads_key=$(func_parser_key "${lines[42]}") +cpu_threads_list=$(func_parser_value "${lines[42]}") +batch_size_key=$(func_parser_key "${lines[43]}") +batch_size_list=$(func_parser_value "${lines[43]}") +use_trt_key=$(func_parser_key "${lines[44]}") +use_trt_list=$(func_parser_value "${lines[44]}") +precision_key=$(func_parser_key "${lines[45]}") +precision_list=$(func_parser_value "${lines[45]}") +infer_model_key=$(func_parser_key "${lines[46]}") +infer_model_value=$(func_parser_value "${lines[46]}") + +video_dir_key=$(func_parser_key "${lines[47]}") +infer_video_dir=$(func_parser_value "${lines[47]}") +save_log_key=$(func_parser_key "${lines[48]}") +benchmark_key=$(func_parser_key "${lines[49]}") +benchmark_value=$(func_parser_value "${lines[49]}") + +infer_key1=$(func_parser_key "${lines[50]}") +infer_value1=$(func_parser_value "${lines[50]}") + +line_num=`grep -n -w "to_static_train_benchmark_params" $FILENAME | cut -d ":" -f 1` +to_static_key=$(func_parser_key "${lines[line_num]}") +to_static_trainer=$(func_parser_value "${lines[line_num]}") + +# parser klquant_infer +if [ ${MODE} = "klquant_whole_infer" ]; then + dataline=$(awk 'NR==1 NR==17{print}' $FILENAME) + lines=(${dataline}) + model_name=$(func_parser_value "${lines[1]}") + python=$(func_parser_value "${lines[2]}") + # parser inference model + infer_model_dir_list=$(func_parser_value "${lines[3]}") + infer_export_list=$(func_parser_value "${lines[4]}") + infer_is_quant=$(func_parser_value "${lines[5]}") + # parser inference + inference_py=$(func_parser_value "${lines[6]}") + use_gpu_key=$(func_parser_key "${lines[7]}") + use_gpu_list=$(func_parser_value "${lines[7]}") + use_mkldnn_key=$(func_parser_key "${lines[8]}") + use_mkldnn_list=$(func_parser_value "${lines[8]}") + cpu_threads_key=$(func_parser_key "${lines[9]}") + cpu_threads_list=$(func_parser_value "${lines[9]}") + batch_size_key=$(func_parser_key "${lines[10]}") + batch_size_list=$(func_parser_value "${lines[10]}") + use_trt_key=$(func_parser_key "${lines[11]}") + use_trt_list=$(func_parser_value "${lines[11]}") + precision_key=$(func_parser_key "${lines[12]}") + precision_list=$(func_parser_value "${lines[12]}") + infer_model_key=$(func_parser_key "${lines[13]}") + video_dir_key=$(func_parser_key "${lines[14]}") + infer_video_dir=$(func_parser_value "${lines[14]}") + save_log_key=$(func_parser_key "${lines[15]}") + benchmark_key=$(func_parser_key "${lines[16]}") + benchmark_value=$(func_parser_value "${lines[16]}") + infer_key1=$(func_parser_key "${lines[17]}") + infer_value1=$(func_parser_value "${lines[17]}") +fi + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _video_dir=$5 + _flag_quant=$6 + _gpu=$7 + # inference + for use_gpu in ${use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + if [[ ${use_mkldnn} = "False" ]] && [[ ${_flag_quant} = "True" ]]; then + continue + fi + for threads in ${cpu_threads_list[*]}; do + for batch_size in ${batch_size_list[*]}; do + for precision in ${precision_list[*]}; do + if [[ ${use_mkldnn} = "False" ]] && [[ ${precision} = "fp16" ]]; then + continue + fi # skip when enable fp16 but disable mkldnn + if [[ ${_flag_quant} = "True" ]] && [[ ${precision} != "int8" ]]; then + 
continue + fi # skip when quant model inference but precision is not int8 + set_precision=$(func_set_params "${precision_key}" "${precision}") + + _save_log_path="${_log_path}/python_infer_cpu_gpus_${_gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + mkdir -p ${_log_path} + set_infer_data=$(func_set_params "${video_dir_key}" "${infer_video_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}/${infer_model_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${_model_dir}/${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${use_trt_list[*]}; do + for precision in ${precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [[ ${use_trt} = "False" ]]; then + continue + fi + if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [[ ${_flag_quant} = "True" ]]; then + continue + fi + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_gpus_${_gpu}_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${video_dir_key}" "${infer_video_dir}") + + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${precision_key}" "${precision}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}/${infer_model_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${_model_dir}/${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " + + eval $command + + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + + done + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" 
+ fi + done +} + +if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + set CUDA_VISIBLE_DEVICES + eval $env + export Count=0 + IFS="|" + infer_run_exports=(${infer_export_list}) + infer_quant_flag=(${infer_is_quant}) + for infer_model in ${infer_model_dir_list[*]}; do + # run export + if [ ${infer_run_exports[Count]} != "null" ];then + save_infer_dir=$(dirname $infer_model) + set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") + export_log_path="${LOG_PATH}_export_${Count}.log" + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 " + echo ${infer_run_exports[Count]} + eval $export_cmd + echo $export_cmd + status_export=$? + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + + else + save_infer_dir=${infer_model} + fi + #run inference + is_quant=${infer_quant_flag[Count]} + if [ ${MODE} = "klquant_infer" ]; then + is_quant="True" + fi + func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_video_dir}" ${is_quant} "${gpu}" + Count=$(($Count + 1)) + done +else + IFS="|" + export Count=0 + USE_GPU_KEY=(${train_use_gpu_value}) + for gpu in ${gpu_list[*]}; do + train_use_gpu=${USE_GPU_KEY[Count]} + Count=$(($Count + 1)) + ips="" + if [ ${gpu} = "-1" ];then + env="" + elif [ ${#gpu} -le 1 ];then + env="export CUDA_VISIBLE_DEVICES=${gpu}" + eval ${env} + elif [ ${#gpu} -le 15 ];then + IFS="," + array=(${gpu}) + env="export CUDA_VISIBLE_DEVICES=${array[0]}" + IFS="|" + else + IFS=";" + array=(${gpu}) + ips=${array[0]} + gpu=${array[1]} + IFS="|" + env=" " + fi + for autocast in ${autocast_list[*]}; do + if [ ${autocast} = "fp16" ]; then + set_amp_config="--amp --amp_level 'O2'" + else + set_amp_config=" " + fi + for trainer in ${trainer_list[*]}; do + flag_quant=False + if [ ${trainer} = ${pact_key} ]; then + run_train=${pact_trainer} + run_export=${pact_export} + flag_quant=True + elif [ ${trainer} = "${fpgm_key}" ]; then + run_train=${fpgm_trainer} + run_export=${fpgm_export} + elif [ ${trainer} = "${distill_key}" ]; then + run_train=${distill_trainer} + run_export=${distill_export} + elif [ ${trainer} = ${amp_key} ]; then + run_train=${amp_trainer} + run_export=${norm_export} + elif [[ ${trainer} = ${trainer_key2} ]]; then + run_train=${trainer_value2} + run_export=${export_value2} + # In case of @to_static, we re-used norm_traier, + # but append "-o to_static=True" for config + # to trigger "to_static" logic in 'train.py' + elif [ ${trainer} = "${to_static_key}" ]; then + run_train="${norm_trainer} ${to_static_trainer}" + run_export=${norm_export} + else + run_train=${norm_trainer} + run_export=${norm_export} + fi + + if [ ${run_train} = "null" ]; then + continue + fi + if [[ ${MODE} != "benchmark_train" ]] && [[ ! 
${MODE} =~ "whole_train" ]]; then + # 训练参数末尾加上--max_iters=30和--log_interval=1,以便运行并输出足量数据 + run_train=${run_train}" --max_iters=30" + fi + set_autocast=$(func_set_params "${autocast_key}" "${autocast}") + set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") + + if [[ $MODE =~ "whole_train" ]]; then + set_epoch="" + fi + + set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") + if [[ $MODE =~ "whole_train" ]]; then + train_batch_key="" + train_batch_value="" + fi + set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") + if [[ $MODE =~ "whole_train" ]]; then + train_param_key1="" + train_param_value1="" + fi + set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") + if [[ $MODE =~ "whole_train" ]]; then + train_param_key2="" + train_param_value2="" + fi + set_train_params2=$(func_set_params "${train_param_key2}" "${train_param_value2}") + set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}") + if [ ${#ips} -le 15 ];then + # len(ips)<=15, single machine + nodes=1 + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + else + # if length of ips > 15, then it is seen as multi-machine + # 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0 + IFS="," + ips_array=(${ips}) + IFS="|" + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + fi + + # load pretrain from norm training if current trainer is pact or fpgm trainer + if ([ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]) && [ ${nodes} -le 1 ]; then + set_pretrain="${load_norm_train_model}" + fi + + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1" + elif [ ${#ips} -le 15 ];then # train with multi-gpu + cmd="${python} -B -m paddle.distributed.launch --devices=\"${gpu}\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1" + else # train with multi-machine + cmd="${python} -B -m paddle.distributed.launch --ips=${ips} --devices=\"${gpu}\" ${run_train} ${set_amp_config} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_train_params1} ${set_train_params2} > ${LOG_PATH}/train.log 2>&1" + fi + + # run train + eval $cmd + # display log for benchmark train + eval "cat ${LOG_PATH}/train.log" + eval "cat ${LOG_PATH}/train.log >> ${save_log}.log" + status_check $? 
"${cmd}" "${status_log}" "${model_name}" "${save_log}.log" + + # set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") + # save norm trained models to set pretrain for pact training and fpgm training + if [ [${trainer} = ${trainer_norm}] ] && [ [${nodes} -le 1] ]; then + load_norm_train_model=${set_eval_pretrain} + fi + # run eval + if [ ${eval_py} != "null" ]; then + real_model_name=${model_name/PP-/pp} + set_eval_params1=$(func_set_params "${eval_key1}" "${save_log}/${real_model_name}_epoch_00001.pdparams") + eval_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log" + if [[ $MODE =~ "lite_infer" ]] && [[ ${train_param_key1} != "null" ]]; then + eval_cmd="${python} ${eval_py} ${set_use_gpu} ${set_eval_params1} ${train_param_key1}=${train_param_value1} > ${eval_log_path} 2>&1 " + else + eval_cmd="${python} ${eval_py} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1 " + fi + eval $eval_cmd + status_check $? "${eval_cmd}" "${status_log}" "${model_name}" "${eval_log_path}" + fi + # run export model + if [ ${run_export} != "null" ]; then + save_infer_path="${save_log}" + real_model_name=${model_name/PP-/pp} + set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${real_model_name}_epoch_00001.pdparams") + + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_log}") + export_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log" + export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 " + eval $export_cmd + status_check $? "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + + #run inference + eval $env + save_infer_path="${save_log}" + if [ ${inference_dir} != "null" ] && [ ${inference_dir} != '##' ]; then + infer_model_dir=${save_infer_path} + else + infer_model_dir=${save_infer_path} + fi + func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${infer_video_dir}" "${flag_quant}" "${gpu}" + + eval "unset CUDA_VISIBLE_DEVICES" + fi + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for gpu in ${gpu_list[*]}; do +fi # end if [ ${MODE} = "infer" ]; then diff --git a/docs/src/test_tipc/test_train_inference_python_npu.sh b/docs/src/test_tipc/test_train_inference_python_npu.sh new file mode 100644 index 000000000..ed627ab09 --- /dev/null +++ b/docs/src/test_tipc/test_train_inference_python_npu.sh @@ -0,0 +1,42 @@ +#!/bin/bash +source test_tipc/common_func.sh + +function readlinkf() { + perl -MCwd -e "print Cwd::abs_path shift" "$1"; +} + +function func_parser_config() { + strs=$1 + IFS=" " + array=(${strs}) + tmp=${array[2]} + echo ${tmp} +} + +BASEDIR=$(dirname "$0") +REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../) + +FILENAME=$1 + +# disable mkldnn on non x86_64 env +arch=$(uname -i) +if [ $arch != "x86_64" ]; then + sed -i "s/--enable_mkldnn:True|False/--enable_mkldnn:False/g" $FILENAME + sed -i "s/--enable_mkldnn:True/--enable_mkldnn:False/g" $FILENAME +fi + +# change gpu to npu in tipc txt configs +sed -i "s/use_gpu/use_npu/g" $FILENAME +# disable benchmark as AutoLog required nvidia-smi command +sed -i "s/--enable_benchmark:True/--enable_benchmark:False/g" $FILENAME +# python has been updated to version 3.9 for npu backend +sed -i "s/python3.7/python3.9/g" $FILENAME +dataline=`cat $FILENAME` + +# change gpu to npu in execution script +sed -i "s/\"gpu\"/\"npu\"/g" 
test_tipc/test_train_inference_python.sh + +# pass parameters to test_train_inference_python.sh +cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $2" +echo -e "\033[1;32m Started to run command: ${cmd}! \033[0m" +eval $cmd diff --git a/docs/src/test_tipc/test_train_inference_python_xpu.sh b/docs/src/test_tipc/test_train_inference_python_xpu.sh new file mode 100644 index 000000000..c1069a1e5 --- /dev/null +++ b/docs/src/test_tipc/test_train_inference_python_xpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash +source test_tipc/common_func.sh + +function readlinkf() { + perl -MCwd -e "print Cwd::abs_path shift" "$1"; +} + +function func_parser_config() { + strs=$1 + IFS=" " + array=(${strs}) + tmp=${array[2]} + echo ${tmp} +} + +BASEDIR=$(dirname "$0") +REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../) + +FILENAME=$1 + +# disable mkldnn on non x86_64 env +arch=$(uname -i) +if [ $arch != "x86_64" ]; then + sed -i "s/--enable_mkldnn:True|False/--enable_mkldnn:False/g" $FILENAME + sed -i "s/--enable_mkldnn:True/--enable_mkldnn:False/g" $FILENAME +fi + +# change gpu to xpu in tipc txt configs +sed -i "s/use_gpu/use_xpu/g" $FILENAME +# disable benchmark as AutoLog required nvidia-smi command +sed -i "s/--enable_benchmark:True/--enable_benchmark:False/g" $FILENAME +# python has been updated to version 3.9 for npu backend +sed -i "s/python3.7/python3.9/g" $FILENAME +dataline=`cat $FILENAME` + +# change gpu to xpu in execution script +sed -i "s/\"gpu\"/\"xpu\"/g" test_tipc/test_train_inference_python.sh + +# pass parameters to test_train_inference_python.sh +cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $2" +echo -e "\033[1;32m Started to run command: ${cmd}! \033[0m" +eval $cmd diff --git a/docs/src/tools/__init__.py b/docs/src/tools/__init__.py new file mode 100644 index 000000000..e8d173d8f --- /dev/null +++ b/docs/src/tools/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['utils', 'PaddleVideo', 'ava_predict'] + +from . import utils +from .wheel import PaddleVideo +from . import ava_predict diff --git a/docs/src/tools/ava_predict.py b/docs/src/tools/ava_predict.py new file mode 100644 index 000000000..5d333a24a --- /dev/null +++ b/docs/src/tools/ava_predict.py @@ -0,0 +1,509 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
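+# ava_predict.py runs spatio-temporal action detection on a single video: frames are
+# extracted with OpenCV, human boxes are predicted by a PaddleDetection model, action
+# labels for each proposal come from the AVA-style recognizer described by the -c config,
+# and the annotated frames are written out as a demo video (--out-filename).
+# Example invocation (a sketch; the config and weight paths below are illustrative placeholders,
+# not taken from this repo):
+#   python tools/ava_predict.py -c <ava_config>.yaml -w <recognizer_weights>.pdparams \
+#       --video_path <input>.mp4 \
+#       --detection_model_name <ppdet_model_name> \
+#       --detection_model_weights <detector_weights>.pdparams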
+ +import argparse +import paddle +import os, sys +import copy as cp +import cv2 +import math +try: + import ppdet +except ImportError as e: + print( + f"Warning! {e}, [paddledet] package and it's dependencies is required for AVA." + ) + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config +from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline +from paddlevideo.metrics.ava_utils import read_labelmap + +import time +from os import path as osp +import numpy as np +from paddlevideo.utils import get_config +import pickle + +from paddlevideo.utils import (get_logger, load, mkdir, save) +import shutil + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def abbrev(name): + """Get the abbreviation of label name: + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +# annotations is pred results +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5,目前不能大于5. + Returns: + list[np.ndarray]: Visualized frames. + """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_ = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + # proposals被归一化需要还原真实坐标值 + scale_ratio = np.array([w, h, w, h]) + + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, str(score[k])]) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_ + + +def frame_extraction(video_path, target_dir): + """Extract frames given video_path. 
+ Args: + video_path (str): The video_path. + """ + + if not os.path.exists(target_dir): + os.makedirs(target_dir, exist_ok=True) + + # Should be able to handle videos up to several hours + frame_tmpl = osp.join(target_dir, '{:05d}.jpg') + vid = cv2.VideoCapture(video_path) + + FPS = int(vid.get(5)) + + frames = [] + frame_paths = [] + + flag, frame = vid.read() + index = 1 + while flag: + frames.append(frame) + frame_path = frame_tmpl.format(index) + frame_paths.append(frame_path) + cv2.imwrite(frame_path, frame) + index += 1 + flag, frame = vid.read() + return frame_paths, frames, FPS + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + + parser.add_argument('--video_path', help='video file/url') + + parser.add_argument('-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument('-w', + '--weights', + type=str, + help='weights for finetuning or testing') + + #detection_model_name + parser.add_argument('--detection_model_name', + help='the name of detection model ') + # detection_model_weights + parser.add_argument('--detection_model_weights', + help='the weights path of detection model ') + + # params for predict + parser.add_argument('--out-filename', + default='ava_det_demo.mp4', + help='output filename') + parser.add_argument('--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument('--output-fps', + default=6, + type=int, + help='the fps of demo video output') + + return parser.parse_args() + + +# 一帧的结果。根据概率大小进行排序 +def pack_result(human_detection, result): + """Short summary. + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + Returns: + tuple: Tuple of human proposal, label name and label score. + """ + results = [] + if result is None: + return None + + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + + results.append((prop, [x[0] for x in res], [x[1] for x in res])) + + return results + + +# 构造数据处理需要的results +def get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS): + result = {} + + result["frame_dir"] = frame_dir + + frame_num = len(os.listdir(frame_dir)) + + dir_name = frame_dir.split("/")[-1] + result["video_id"] = dir_name + + result['timestamp'] = timestamp + + timestamp_str = '{:04d}'.format(timestamp) + img_key = dir_name + "," + timestamp_str + result['img_key'] = img_key + + result['shot_info'] = (1, frame_num) + result['fps'] = FPS + + result['suffix'] = '{:05}.jpg' + + result['timestamp_start'] = 1 + result['timestamp_end'] = int(frame_num / result['fps']) + + return result + + +def detection_inference(frame_paths, output_dir, model_name, weights_path): + """Detect human boxes given frame paths. + Args: + frame_paths (list[str]): The paths of frames to do detection inference. + Returns: + list[np.ndarray]: The human detection results. 
+ """ + + detection_cfg = ppdet.model_zoo.get_config_file(model_name) + detection_cfg = ppdet.core.workspace.load_config(detection_cfg) + detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test') + detection_trainer.load_weights(weights_path) + + print('Performing Human Detection for each frame') + + detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True) + + print("finish object detection") + + results = [] + + for frame_path in frame_paths: + (file_dir, file_name) = os.path.split(frame_path) + (file_path, ext) = os.path.splitext(frame_path) + + txt_file_name = file_name.replace(ext, ".txt") + txt_path = os.path.join(output_dir, txt_file_name) + results.append(txt_path) + + return results + + +def get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr): + """ + 根据检测结果文件得到图像中人的检测框(proposals)和置信度(scores) + txt_file_path:检测结果存放路径 + img_h:图像高度 + img_w:图像宽度 + """ + + proposals = [] + scores = [] + + with open(txt_file_path, 'r') as detection_file: + lines = detection_file.readlines() + for line in lines: # person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375 + items = line.split(" ") + if items[0] != 'person': #只要人 + continue + + score = items[1] + + if (float)(score) < person_det_score_thr: + continue + + x1 = (float(items[2])) / img_w + y1 = ((float)(items[3])) / img_h + box_w = ((float)(items[4])) + box_h = ((float)(items[5])) + + x2 = (float(items[2]) + box_w) / img_w + y2 = (float(items[3]) + box_h) / img_h + + scores.append(score) + + proposals.append([x1, y1, x2, y2]) + + return np.array(proposals), np.array(scores) + + +@paddle.no_grad() +def main(args): + config = get_config(args.config, show=False) #parse config file + + # extract frames from video + video_path = args.video_path + frame_dir = 'tmp_frames' + frame_paths, frames, FPS = frame_extraction(video_path, frame_dir) + + num_frame = len(frame_paths) #视频秒数*FPS + assert num_frame != 0 + print("Frame Number:", num_frame) + + # 帧图像高度和宽度 + h, w, _ = frames[0].shape + + # Get clip_len, frame_interval and calculate center index of each clip + data_process_pipeline = build_pipeline(config.PIPELINE.test) #测试时输出处理流水配置 + + clip_len = config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = config.PIPELINE.test.sample['frame_interval'] + + # 此处关键帧每秒取一个 + clip_len = config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = config.PIPELINE.test.sample['frame_interval'] + window_size = clip_len * frame_interval + timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2), + args.predict_stepsize) + print("timetamps number:", len(timestamps)) + + # get selected frame list according to timestamps + selected_frame_list = [] + for timestamp in timestamps: + selected_frame_list.append(frame_paths[timestamp - 1]) + + # Load label_map + label_map_path = config.DATASET.test['label_file'] + categories, class_whitelist = read_labelmap(open(label_map_path)) + label_map = {} + for item in categories: + id = item['id'] + name = item['name'] + label_map[id] = name + + # Construct model. 
+ if config.MODEL.backbone.get('pretrained'): + config.MODEL.backbone.pretrained = '' # disable pretrain model init + model = build_model(config.MODEL) + + model.eval() + state_dicts = load(args.weights) + model.set_state_dict(state_dicts) + + detection_result_dir = 'tmp_detection' + detection_model_name = args.detection_model_name + detection_model_weights = args.detection_model_weights + detection_txt_list = detection_inference(selected_frame_list, + detection_result_dir, + detection_model_name, + detection_model_weights) + assert len(detection_txt_list) == len(timestamps) + + print('Performing SpatioTemporal Action Detection for each clip') + human_detections = [] + predictions = [] + + index = 0 + for timestamp, detection_txt_path in zip(timestamps, detection_txt_list): + proposals, scores = get_detection_result( + detection_txt_path, h, w, + (float)(config.DATASET.test['person_det_score_thr'])) + if proposals.shape[0] == 0: + predictions.append(None) + human_detections.append(None) + continue + + human_detections.append(proposals) + + result = get_timestep_result(frame_dir, + timestamp, + clip_len, + frame_interval, + FPS=FPS) + result["proposals"] = proposals + result["scores"] = scores + + new_result = data_process_pipeline(result) + proposals = new_result['proposals'] + + img_slow = new_result['imgs'][0] + img_slow = img_slow[np.newaxis, :] + img_fast = new_result['imgs'][1] + img_fast = img_fast[np.newaxis, :] + + proposals = proposals[np.newaxis, :] + + scores = scores[np.newaxis, :] + + img_shape = np.asarray(new_result['img_shape']) + img_shape = img_shape[np.newaxis, :] + + data = [ + paddle.to_tensor(img_slow, dtype='float32'), + paddle.to_tensor(img_fast, dtype='float32'), + paddle.to_tensor(proposals, dtype='float32'), scores, + paddle.to_tensor(img_shape, dtype='int32') + ] + + with paddle.no_grad(): + result = model(data, mode='infer') + + result = result[0] + prediction = [] + + person_num = proposals.shape[1] + # N proposals + for i in range(person_num): + prediction.append([]) + + # Perform action score thr + for i in range(len(result)): + if i + 1 not in class_whitelist: + continue + for j in range(person_num): + if result[i][j, 4] > config.MODEL.head['action_thr']: + prediction[j].append((label_map[i + 1], result[i][j, + 4])) + predictions.append(prediction) + + index = index + 1 + if index % 10 == 0: + print(index, "/", len(timestamps)) + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + results.append(pack_result(human_detection, prediction)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(args.predict_stepsize / args.output_stepsize) #30 + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + + vis_frames = visualize(frames, results) + + try: + import moviepy.editor as mpy + except ImportError: + raise ImportError('Please install moviepy to enable output file') + + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + print("finish write !") + + # delete tmp files and dirs + shutil.rmtree(frame_dir) + shutil.rmtree(detection_result_dir) + + +if __name__ == '__main__': + args = parse_args() #解析参数 + main(args) diff --git 
a/docs/src/tools/export_model.py b/docs/src/tools/export_model.py new file mode 100644 index 000000000..401091aa1 --- /dev/null +++ b/docs/src/tools/export_model.py @@ -0,0 +1,267 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import os.path as osp +import sys + +import paddle +from paddle.jit import to_static +from paddle.static import InputSpec + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleVideo export model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument('--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument("-p", + "--pretrained_params", + default='./best.pdparams', + type=str, + help='params path') + parser.add_argument("-o", + "--output_path", + type=str, + default="./inference", + help='output path') + + parser.add_argument('--save_name', + type=str, + default=None, + help='specify the exported inference \ + files(pdiparams and pdmodel) name,\ + only used in TIPC') + + return parser.parse_args() + + +def trim_config(cfg): + """ + Reuse the trainging config will bring useless attributes, such as: backbone.pretrained model. + and some build phase attributes should be overrided, such as: backbone.num_seg. + Trim it here. 
+ """ + model_name = cfg.model_name + if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'): + cfg.MODEL.backbone.pretrained = "" # not ued when inference + + # for distillation + if cfg.MODEL.get('models'): + if cfg.MODEL.models[0]['Teacher']['backbone'].get('pretrained'): + cfg.MODEL.models[0]['Teacher']['backbone']['pretrained'] = "" + if cfg.MODEL.models[1]['Student']['backbone'].get('pretrained'): + cfg.MODEL.models[1]['Student']['backbone']['pretrained'] = "" + + return cfg, model_name + + +def get_input_spec(cfg, model_name): + if model_name in ['ppTSM', 'TSM', 'MoViNet', 'ppTSMv2']: + input_spec = [[ + InputSpec( + shape=[None, cfg.num_seg, 3, cfg.target_size, cfg.target_size], + dtype='float32'), + ]] + elif model_name in ['TokenShiftVisionTransformer']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['TSN', 'ppTSN']: + input_spec = [[ + InputSpec(shape=[ + None, cfg.num_seg * 10, 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['BMN']: + input_spec = [[ + InputSpec(shape=[None, cfg.feat_dim, cfg.tscale], + dtype='float32', + name='feat_input'), + ]] + elif model_name in ['TimeSformer', 'ppTimeSformer']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['VideoSwin']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * cfg.seg_len * 1, cfg.target_size, + cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['VideoSwin_TableTennis']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * cfg.seg_len * 3, cfg.target_size, + cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['AttentionLSTM']: + input_spec = [[ + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]], + dtype='float32'), # for rgb_data + InputSpec(shape=[ + None, + ], dtype='int64'), # for rgb_len + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]], + dtype='float32'), # for rgb_mask + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]], + dtype='float32'), # for audio_data + InputSpec(shape=[ + None, + ], dtype='int64'), # for audio_len + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]], + dtype='float32'), # for audio_mask + ]] + elif model_name in ['SlowFast']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_frames // cfg.alpha, cfg.target_size, + cfg.target_size + ], + dtype='float32', + name='slow_input'), + InputSpec(shape=[ + None, 3, cfg.num_frames, cfg.target_size, cfg.target_size + ], + dtype='float32', + name='fast_input'), + ]] + elif model_name in ['STGCN', 'AGCN', 'CTRGCN']: + input_spec = [[ + InputSpec(shape=[ + None, cfg.num_channels, cfg.window_size, cfg.vertex_nums, + cfg.person_nums + ], + dtype='float32'), + ]] + # 由于在模型运行过程中涉及到第一维乘human个数(N*M), 所以这里用1作为shape + elif model_name in ['AGCN2s']: + input_spec = [[ + InputSpec(shape=[ + 1, cfg.num_channels, cfg.window_size, cfg.vertex_nums, + cfg.person_nums + ], + dtype='float32'), + ]] + elif model_name in ['TransNetV2']: + input_spec = [[ + InputSpec(shape=[ + None, + cfg.num_frames, + cfg.height, + cfg.width, + cfg.num_channels, + ], + dtype='float32'), + ]] + elif model_name in ['MSTCN', 'ASRF']: + input_spec = [[ + InputSpec(shape=[None, cfg.num_channels, None], dtype='float32'), + ]] + elif model_name in ['ADDS']: + input_spec = [[ + InputSpec(shape=[None, cfg.num_channels, cfg.height, 
cfg.width], + dtype='float32'), + ]] + elif model_name in ['AVA_SlowFast_FastRcnn']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_frames // cfg.alpha, cfg.target_size, + cfg.target_size + ], + dtype='float32', + name='slow_input'), + InputSpec(shape=[ + None, 3, cfg.num_frames, cfg.target_size, cfg.target_size + ], + dtype='float32', + name='fast_input'), + InputSpec(shape=[None, None, 4], dtype='float32', name='proposals'), + InputSpec(shape=[None, 2], dtype='float32', name='img_shape') + ]] + elif model_name in ['PoseC3D']: + input_spec = [[ + InputSpec(shape=[None, 1, 17, 48, 56, 56], dtype='float32'), + ]] + elif model_name in ['YOWO']: + input_spec = [[ + InputSpec(shape=[ + 1, 3, cfg.num_seg, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + return input_spec + + +def main(): + args = parse_args() + cfg, model_name = trim_config( + get_config(args.config, overrides=args.override, show=False)) + + print(f"Building model({model_name})...") + model = build_model(cfg.MODEL) + assert osp.isfile( + args.pretrained_params + ), f"pretrained params ({args.pretrained_params} is not a file path.)" + + if not os.path.isdir(args.output_path): + os.makedirs(args.output_path) + + print(f"Loading params from ({args.pretrained_params})...") + params = paddle.load(args.pretrained_params) + model.set_dict(params) + + model.eval() + + # for rep nets + for layer in model.sublayers(): + if hasattr(layer, "rep") and not getattr(layer, "is_repped"): + layer.rep() + + input_spec = get_input_spec(cfg.INFERENCE, model_name) + model = to_static(model, input_spec=input_spec) + paddle.jit.save( + model, + osp.join(args.output_path, + model_name if args.save_name is None else args.save_name)) + print( + f"model ({model_name}) has been already saved in ({args.output_path}).") + + +if __name__ == "__main__": + main() diff --git a/docs/src/tools/predict.py b/docs/src/tools/predict.py new file mode 100644 index 000000000..bc9bd8ce7 --- /dev/null +++ b/docs/src/tools/predict.py @@ -0,0 +1,327 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
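The export flow that export_model.py above implements is the standard Paddle dygraph-to-static pattern: build the model, load the trained .pdparams, wrap the model with to_static under an InputSpec, then paddle.jit.save. Below is a condensed sketch of that pattern; the function name, shape, and paths are placeholders for illustration, not values from the diff (the concrete shape per architecture is what get_input_spec() above selects).

import paddle
from paddle.jit import to_static
from paddle.static import InputSpec

def export_to_static(model: paddle.nn.Layer, params_path: str, output_prefix: str):
    """Minimal sketch of the dygraph-to-static export used by export_model.py."""
    model.set_dict(paddle.load(params_path))   # load trained weights (.pdparams)
    model.eval()
    # `None` marks the batch dimension as dynamic; the remaining dims depend on the model.
    spec = [InputSpec(shape=[None, 8, 3, 224, 224], dtype='float32')]
    static_model = to_static(model, input_spec=spec)
    paddle.jit.save(static_model, output_prefix)  # writes <prefix>.pdmodel / <prefix>.pdiparams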
+ +import argparse +import os +from os import path as osp +import paddle +from paddle import inference +from paddle.inference import Config, create_predictor + +from utils import build_inference_helper +from paddlevideo.utils import get_config + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument("-i", "--input_file", type=str, help="input file path") + parser.add_argument( + "--time_test_file", + type=str2bool, + default=False, + help="whether input time test file") + parser.add_argument("--model_file", type=str) + parser.add_argument("--params_file", type=str) + + # params for paddle predict + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--use_xpu", type=str2bool, default=False) + parser.add_argument("--use_npu", type=str2bool, default=False) + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=8000) + parser.add_argument("--enable_benchmark", type=str2bool, default=False) + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--cpu_threads", type=int, default=None) + parser.add_argument("--disable_glog", type=str2bool, default=False) + # parser.add_argument("--hubserving", type=str2bool, default=False) #TODO + + return parser.parse_args() + + +def create_paddle_predictor(args, cfg): + config = Config(args.model_file, args.params_file) + if args.use_gpu: + config.enable_use_gpu(args.gpu_mem, 0) + elif args.use_npu: + config.enable_npu() + elif args.use_xpu: + config.enable_xpu() + else: + config.disable_gpu() + if args.cpu_threads: + config.set_cpu_math_library_num_threads(args.cpu_threads) + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + if args.precision == "fp16": + config.enable_mkldnn_bfloat16() + + # config.disable_glog_info() + config.switch_ir_optim(args.ir_optim) # default true + if args.use_tensorrt: + # choose precision + if args.precision == "fp16": + precision = inference.PrecisionType.Half + elif args.precision == "int8": + precision = inference.PrecisionType.Int8 + else: + precision = inference.PrecisionType.Float32 + + # calculate real max batch size during inference when tenrotRT enabled + max_batch_size = args.batch_size + if 'num_seg' in cfg.INFERENCE: + # num_seg: number of segments when extracting frames. + # seg_len: number of frames extracted within a segment, default to 1. + # num_views: the number of video frame groups obtained by cropping and flipping, + # uniformcrop=3, tencrop=10, centercrop=1. 
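# Worked example (annotation, not part of the diff): for ppTSN with batch_size=1,
# num_seg=25, seg_len=1 and TenCrop views (num_views=10), the TensorRT engine below is
# built with max_batch_size = 1 * 10 * 25 * 1 = 250.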
+ num_seg = cfg.INFERENCE.num_seg + seg_len = cfg.INFERENCE.get('seg_len', 1) + num_views = 1 + if 'tsm' in cfg.model_name.lower(): + num_views = 1 # CenterCrop + elif 'tsn' in cfg.model_name.lower(): + num_views = 10 # TenCrop + elif 'timesformer' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + elif 'videoswin' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + elif 'tokenshift' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + max_batch_size = args.batch_size * num_views * num_seg * seg_len + config.enable_tensorrt_engine( + precision_mode=precision, max_batch_size=max_batch_size) + + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + + # disable glog + if args.disable_glog: + config.disable_glog_info() + + # for ST-GCN tensorRT case usage + # config.delete_pass("shuffle_channel_detect_pass") + + predictor = create_predictor(config) + + return config, predictor + + +def parse_file_paths(input_path: str) -> list: + if osp.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [osp.join(input_path, file) for file in files] + return files + + +def main(): + """predict using paddle inference model + """ + args = parse_args() + cfg = get_config(args.config, overrides=args.override, show=False) + + model_name = cfg.model_name + print(f"Inference model({model_name})...") + InferenceHelper = build_inference_helper(cfg.INFERENCE) + + inference_config, predictor = create_paddle_predictor(args, cfg) + + # get input_tensor and output_tensor + input_names = predictor.get_input_names() + output_names = predictor.get_output_names() + input_tensor_list = [] + output_tensor_list = [] + for item in input_names: + input_tensor_list.append(predictor.get_input_handle(item)) + for item in output_names: + output_tensor_list.append(predictor.get_output_handle(item)) + + # get the absolute file path(s) to be processed + if model_name in ["MSTCN", "ASRF"]: + files = InferenceHelper.get_process_file(args.input_file) + else: + files = parse_file_paths(args.input_file) + + if model_name == 'TransNetV2': + for file in files: + inputs = InferenceHelper.preprocess(file) + outputs = [] + for input in inputs: + # Run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(input) + predictor.run() + output = [] + for j in range(len(output_tensor_list)): + output.append(output_tensor_list[j].copy_to_cpu()) + outputs.append(output) + + # Post process output + InferenceHelper.postprocess(outputs) + + elif model_name == 'AVA_SlowFast_FastRcnn': + for file in files: # for videos + inputs = InferenceHelper.preprocess(file) + outputs = [] + for input in inputs: + # Run inference + input_len = len(input_tensor_list) + + for i in range(input_len): + if type(input[i]) == paddle.Tensor: + input_tmp = input[i].numpy() + else: + input_tmp = input[i] + input_tensor_list[i].copy_from_cpu(input_tmp) + predictor.run() + output = [] + for j in range(len(output_tensor_list)): + output.append(output_tensor_list[j].copy_to_cpu()) + outputs.append(output) + + # Post process output + InferenceHelper.postprocess(outputs) + elif model_name == 'YOWO': + for file in files: # for videos + (_, filename) = os.path.split(file) + (filename, _) = os.path.splitext(filename) + save_dir = osp.join('inference', 'YOWO_infer') + if not osp.exists('inference'): + os.mkdir('inference') + if not osp.exists(save_dir): + 
os.mkdir(save_dir) + save_path = osp.join(save_dir, filename) + if not osp.exists(save_path): + os.mkdir(save_path) + inputs, frames = InferenceHelper.preprocess(file) + for idx, input in enumerate(inputs): + # Run inference + outputs = [] + input_len = len(input_tensor_list) + for i in range(input_len): + input_tensor_list[i].copy_from_cpu(input[i]) + predictor.run() + for j in range(len(output_tensor_list)): + outputs.append(output_tensor_list[j].copy_to_cpu()) + # Post process output + InferenceHelper.postprocess(outputs, frames[idx], osp.join(save_path, str(idx).zfill(3))) + else: + if args.enable_benchmark: + num_warmup = 3 + + # instantiate auto log + try: + import auto_log + except ImportError as e: + print(f"{e}, [git+https://github.com/LDOUBLEV/AutoLog] " + f"package and it's dependencies is required for " + f"python-inference when enable_benchmark=True.") + pid = os.getpid() + autolog = auto_log.AutoLogger( + model_name=cfg.model_name, + model_precision=args.precision, + batch_size=args.batch_size, + data_shape="dynamic", + save_path="./output/auto_log.lpg", + inference_config=inference_config, + pids=pid, + process_name=None, + gpu_ids=0 if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=num_warmup) + if not args.time_test_file: + test_video_num = 15 + files = [args.input_file for _ in range(test_video_num)] + else: + f_input = open(args.input_file, 'r') + files = [i.strip() for i in f_input.readlines()] + test_video_num = len(files) + f_input.close() + + # Inferencing process + batch_num = args.batch_size + for st_idx in range(0, len(files), batch_num): + ed_idx = min(st_idx + batch_num, len(files)) + + # auto log start + if args.enable_benchmark: + autolog.times.start() + + # Pre process batched input + batched_inputs = InferenceHelper.preprocess_batch( + files[st_idx:ed_idx]) + + # get pre process time cost + if args.enable_benchmark: + autolog.times.stamp() + + # run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(batched_inputs[i]) + predictor.run() + + batched_outputs = [] + for j in range(len(output_tensor_list)): + batched_outputs.append(output_tensor_list[j].copy_to_cpu()) + + # get inference process time cost + if args.enable_benchmark: + autolog.times.stamp() + + InferenceHelper.postprocess(batched_outputs, + not args.enable_benchmark) + + # get post process time cost + if args.enable_benchmark: + autolog.times.end(stamp=True) + + # time.sleep(0.01) # sleep for T4 GPU + + # report benchmark log if enabled + if args.enable_benchmark: + autolog.report() + + +if __name__ == "__main__": + main() diff --git a/docs/src/tools/summary.py b/docs/src/tools/summary.py new file mode 100644 index 000000000..28bd6f7eb --- /dev/null +++ b/docs/src/tools/summary.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
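predict.py above drives Paddle Inference through the usual handle-based loop: build a Config, create the predictor, copy NumPy inputs into the input handles, call run(), and copy outputs back. A stripped-down sketch of that loop follows; the paths and the input array are placeholders, and the GPU/NPU/XPU and TensorRT options from create_paddle_predictor() are omitted.

import numpy as np
from paddle.inference import Config, create_predictor

def run_once(model_file: str, params_file: str, batch: np.ndarray) -> np.ndarray:
    """Minimal single-input/single-output version of the loop in predict.py."""
    config = Config(model_file, params_file)
    config.disable_gpu()                     # CPU for the sketch; predict.py picks the device from flags
    config.switch_use_feed_fetch_ops(False)  # zero-copy handles, as in create_paddle_predictor()
    predictor = create_predictor(config)

    input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    input_handle.copy_from_cpu(batch)        # float32 array shaped like the exported InputSpec
    predictor.run()

    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    return output_handle.copy_to_cpu()       # class scores as a NumPy array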
+ +import argparse +import os +import sys +import os.path as osp + +import paddle +import paddle.nn.functional as F +from paddle.jit import to_static +import paddleslim + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config + + +def parse_args(): + + parser = argparse.ArgumentParser("PaddleVideo Summary") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + + parser.add_argument("--img_size", type=int, default=224) + parser.add_argument("--num_seg", type=int, default=8) + parser.add_argument("--FLOPs", + action="store_true", + help="whether to print FLOPs") + + return parser.parse_args() + + +def _trim(cfg, args): + """ + Reuse the trainging config will bring useless attribute, such as: backbone.pretrained model. Trim it here. + """ + model_name = cfg.model_name + cfg = cfg.MODEL + cfg.backbone.pretrained = "" + + if 'num_seg' in cfg.backbone: + cfg.backbone.num_seg = args.num_seg + return cfg, model_name + + +def main(): + args = parse_args() + cfg, model_name = _trim(get_config(args.config, show=False), args) + print(f"Building model({model_name})...") + model = build_model(cfg) + + img_size = args.img_size + num_seg = args.num_seg + #NOTE: only support tsm now, will refine soon + params_info = paddle.summary(model, (1, 1, num_seg, 3, img_size, img_size)) + print(params_info) + + if args.FLOPs: + flops_info = paddleslim.analysis.flops( + model, [1, 1, num_seg, 3, img_size, img_size]) + print(flops_info) + + +if __name__ == "__main__": + main() diff --git a/docs/src/tools/utils.py b/docs/src/tools/utils.py new file mode 100644 index 000000000..bbdd2d192 --- /dev/null +++ b/docs/src/tools/utils.py @@ -0,0 +1,1670 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import sys +from typing import List +import pickle + +import cv2 +try: + import imageio +except ImportError as e: + print( + f"Warning! {e}, [imageio] package and it's dependencies is required for VideoSwin." + ) +try: + import matplotlib as mpl + import matplotlib.cm as cm +except ImportError as e: + print( + f"Warning! {e}, [matplotlib] package and it's dependencies is required for ADDS." 
+ ) +import numpy as np +import paddle +import paddle.nn.functional as F +import pandas +from PIL import Image + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) +from abc import abstractmethod + +from paddlevideo.loader.builder import build_pipeline +from paddlevideo.loader.pipelines import ( + AutoPadding, CenterCrop, DecodeSampler, FeatureDecoder, FrameDecoder, + GroupResize, Image2Array, ImageDecoder, JitterScale, MultiCrop, + Normalization, PackOutput, Sampler, SamplerPkl, Scale, SkeletonNorm, + TenCrop, ToArray, UniformCrop, VideoDecoder, SegmentationSampler, + SketeonCropSample, MultiCenterCrop, SketeonCropSample, UniformSampleFrames, + PoseDecode, PoseCompact, Resize, CenterCrop_V2, GeneratePoseTarget, + FormatShape, Collect) +from paddlevideo.metrics.ava_utils import read_labelmap +from paddlevideo.metrics.bmn_metric import boundary_choose, soft_nms +from paddlevideo.utils import Registry, build, get_config +from paddlevideo.modeling.framework.segmenters.utils import ASRFPostProcessing + +from tools.ava_predict import (detection_inference, frame_extraction, + get_detection_result, get_timestep_result, + pack_result, visualize) +from paddlevideo.modeling.framework.localizers.yowo_utils import nms, get_region_boxes + +INFERENCE = Registry('inference') + + +def build_inference_helper(cfg): + return build(cfg, INFERENCE) + + +class Base_Inference_helper(): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + """Base_Inference_helper + + Args: + num_seg (int, optional): number of segmentations of an sliced input video. Defaults to 8. + seg_len (int, optional): length of each segmentation. Defaults to 1. + short_size (int, optional): short size of input video. Defaults to 256. + target_size (int, optional): size of cropped video. Defaults to 224. + top_k (int, optional): select topk result in outputs. Defaults to 1. + """ + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + @abstractmethod + def preprocess(self, input_file: str): + """preprocess abstractmethod + + Args: + input_file (str): input file path. + """ + pass + + def preprocess_batch(self, file_list: List[str]) -> List[np.ndarray]: + """preprocess for file list + + Args: + file_list (List[str]): file pathes in an list, [path1, path2, ...]. + + Returns: + List[np.ndarray]: batched inputs data, [data_batch[0], data_batch[1], ...]. + """ + batched_inputs = [] + for file in file_list: + inputs = self.preprocess(file) + batched_inputs.append(inputs) + batched_inputs = [ + np.concatenate([item[i] for item in batched_inputs]) + for i in range(len(batched_inputs[0])) + ] + self.input_file = file_list + return batched_inputs + + def postprocess(self, + output: np.ndarray, + print_output: bool = True, + return_result: bool = False): + """postprocess + + Args: + output (np.ndarray): batched output scores, shape of (batch_size, class_num). + print_output (bool, optional): whether to print result. Defaults to True. 
+ """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() + results_list = [] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + topk_class = classes[:self.top_k] + topk_scores = scores[:self.top_k] + result = { + "video_id": self.input_file[i], + "topk_class": topk_class, + "topk_scores": topk_scores + } + results_list.append(result) + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + print("\ttop-{0} class: {1}".format(self.top_k, topk_class)) + print("\ttop-{0} score: {1}".format(self.top_k, topk_scores)) + if return_result: + return results_list + + +@INFERENCE.register() +class ppTSM_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + ops = [ + VideoDecoder(backend="decord"), + Sampler(self.num_seg, self.seg_len, valid_mode=True), + Scale(self.short_size), + CenterCrop(self.target_size), + Image2Array(), + Normalization(img_mean, img_std) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class ppTSN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=25, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + ops = [ + VideoDecoder(backend="decord"), + Sampler(self.num_seg, + self.seg_len, + valid_mode=True, + select_left=True), + Scale(self.short_size, + fixed_ratio=True, + do_round=True, + backend='cv2'), + TenCrop(self.target_size), + Image2Array(), + Normalization(img_mean, img_std) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class BMN_Inference_helper(Base_Inference_helper): + def __init__(self, feat_dim, dscale, tscale, result_path): + self.feat_dim = feat_dim + self.dscale = dscale + self.tscale = tscale + self.result_path = result_path + if not os.path.isdir(self.result_path): + os.makedirs(self.result_path) + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + file_info = json.load(open(input_file)) + self.feat_path = file_info['feat_path'] + 
self.video_duration = file_info['duration_second'] + feat = np.load(self.feat_path).astype('float32').T + res = np.expand_dims(feat, axis=0).copy() + + return [res] + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + pred_bm, pred_start, pred_end = outputs + self._gen_props(pred_bm, pred_start[0], pred_end[0], print_output) + + def _gen_props(self, pred_bm, pred_start, pred_end, print_output): + snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)] + snippet_xmaxs = [ + 1.0 / self.tscale * i for i in range(1, self.tscale + 1) + ] + + pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :] + start_mask = boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = boundary_choose(pred_end) + end_mask[-1] = 1. + score_vector_list = [] + for idx in range(self.dscale): + for jdx in range(self.tscale): + start_index = jdx + end_index = start_index + idx + if end_index < self.tscale and start_mask[ + start_index] == 1 and end_mask[end_index] == 1: + xmin = snippet_xmins[start_index] + xmax = snippet_xmaxs[end_index] + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bm_score = pred_bm[idx, jdx] + conf_score = xmin_score * xmax_score * bm_score + score_vector_list.append([xmin, xmax, conf_score]) + + cols = ["xmin", "xmax", "score"] + score_vector_list = np.stack(score_vector_list) + df = pandas.DataFrame(score_vector_list, columns=cols) + + result_dict = {} + proposal_list = [] + df = soft_nms(df, alpha=0.4, t1=0.55, t2=0.9) + for idx in range(min(100, len(df))): + tmp_prop={"score":df.score.values[idx], \ + "segment":[max(0,df.xmin.values[idx])*self.video_duration, \ + min(1,df.xmax.values[idx])*self.video_duration]} + proposal_list.append(tmp_prop) + + result_dict[self.feat_path] = proposal_list + + # print top-5 predictions + if print_output: + print("Current video file: {0} :".format(self.feat_path)) + for pred in proposal_list[:5]: + print(pred) + + # save result + outfile = open( + os.path.join(self.result_path, "bmn_results_inference.json"), "w") + + json.dump(result_dict, outfile) + + +@INFERENCE.register() +class TokenShift_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=256, + top_k=1, + mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg), + Sampler(self.num_seg, self.seg_len, valid_mode=True), + Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]), + Image2Array(data_format='cthw'), + JitterScale(self.short_size, self.short_size), + MultiCenterCrop(self.target_size) + ] + for op in ops: + results = op(results) + + # [N,C,Tx3,H,W] + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class TimeSformer_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=224, + target_size=224, + top_k=1, + mean=[0.45, 0.45, 0.45], + std=[0.225, 0.225, 0.225]): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + 
self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg), + Sampler(self.num_seg, + self.seg_len, + valid_mode=True, + linspace_sample=True), + Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]), + Image2Array(data_format='cthw'), + JitterScale(self.short_size, self.short_size), + UniformCrop(self.target_size) + ] + for op in ops: + results = op(results) + + # [N,C,Tx3,H,W] + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class VideoSwin_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=4, + seg_len=32, + frame_interval=2, + short_size=224, + target_size=224, + top_k=1, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]): + + self.num_seg = num_seg + self.seg_len = seg_len + self.frame_interval = frame_interval + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + self.input_file = input_file + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='decord', mode='valid'), + Sampler(num_seg=self.num_seg, + frame_interval=self.frame_interval, + seg_len=self.seg_len, + valid_mode=True, + use_pil=False), + Scale(short_size=self.short_size, + fixed_ratio=False, + keep_ratio=True, + backend='cv2', + do_round=True), + CenterCrop(target_size=224, backend='cv2'), + Normalization(mean=self.mean, + std=self.std, + tensor_shape=[3, 1, 1, 1], + inplace=True), + Image2Array(data_format='cthw') + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + def postprocess(self, output, print_output=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class VideoSwin_TableTennis_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=1, + seg_len=32, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'frame_dir': input_file, 'suffix': 'img_{:05}.jpg'} + img_mean = [123.675, 116.28, 103.53] + img_std = [58.395, 57.12, 57.375] + ops = [ + FrameDecoder(), + SamplerPkl(num_seg=self.num_seg, + 
seg_len=self.seg_len, + backend='cv2', + valid_mode=True), + Scale(short_size=self.short_size, + fixed_ratio=False, + keep_ratio=True, + backend='cv2', + do_round=True), + UniformCrop(target_size=self.target_size, backend='cv2'), + Normalization(mean=img_mean, + std=img_std, + tensor_shape=[3, 1, 1, 1], + inplace=True), + Image2Array(data_format='cthw') + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + def add_text_to_video( + self, + video_path, + output_dir="applications/TableTennis/ActionRecognition/results", + text=None): + os.makedirs(output_dir, exist_ok=True) + if video_path.endswith('.pkl'): + try: + import cPickle as pickle + from cStringIO import StringIO + except ImportError: + import pickle + from io import BytesIO + from PIL import Image + data_loaded = pickle.load(open(video_path, 'rb'), encoding='bytes') + _, _, frames = data_loaded + frames_len = len(frames) + + else: + videoCapture = cv2.VideoCapture() + videoCapture.open(video_path) + + fps = videoCapture.get(cv2.CAP_PROP_FPS) + frame_width = int(videoCapture.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(videoCapture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + frames_len = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT) + print("fps=", int(fps), "frames=", int(frames_len), "scale=", + f"{frame_height}x{frame_width}") + + frames_rgb_list = [] + for i in range(int(frames_len)): + if video_path.endswith('.pkl'): + frame = np.array( + Image.open(BytesIO(frames[i])).convert("RGB").resize( + (240, 135)))[:, :, ::-1].astype('uint8') + else: + _, frame = videoCapture.read() + frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX, + 1.0, (0, 0, 255), 2) + frames_rgb_list.append(frame[:, :, ::-1]) # bgr to rgb + if not video_path.endswith('.pkl'): + videoCapture.release() + cv2.destroyAllWindows() + output_filename = os.path.basename(video_path) + output_filename = output_filename.split('.')[0] + '.gif' + imageio.mimsave(f'{output_dir}/{output_filename}', + frames_rgb_list, + 'GIF', + duration=0.00085) + + def postprocess(self, output, print_output=True, save_gif=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + if save_gif: + self.add_text_to_video( + self.input_file[0], + text=f"{str(classes[0])} {float(scores[0]):.5f}") + + +@INFERENCE.register() +class SlowFast_Inference_helper(Base_Inference_helper): + def __init__(self, + num_frames=32, + sampling_rate=2, + target_size=256, + alpha=8, + top_k=1): + self.num_frames = num_frames + self.sampling_rate = sampling_rate + self.target_size = target_size + self.alpha = alpha + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = { + 'filename': input_file, + 
'temporal_sample_index': 0, + 'spatial_sample_index': 0, + 'temporal_num_clips': 1, + 'spatial_num_clips': 1 + } + img_mean = [0.45, 0.45, 0.45] + img_std = [0.225, 0.225, 0.225] + ops = [ + DecodeSampler(self.num_frames, self.sampling_rate, test_mode=True), + JitterScale(self.target_size, self.target_size), + MultiCrop(self.target_size), + Image2Array(transpose=False), + Normalization(img_mean, img_std, tensor_shape=[1, 1, 1, 3]), + PackOutput(self.alpha), + ] + for op in ops: + results = op(results) + + res = [] + for item in results['imgs']: + res.append(np.expand_dims(item, axis=0).copy()) + return res + + def postprocess(self, output, print_output=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + # output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() # done in it's head + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class STGCN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels, + window_size, + vertex_nums, + person_nums, + top_k=1): + self.num_channels = num_channels + self.window_size = window_size + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + ops = [AutoPadding(window_size=self.window_size), SkeletonNorm()] + for op in ops: + results = op(results) + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class CTRGCN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels=3, + vertex_nums=25, + person_nums=2, + window_size=64, + p_interval=[0.95], + top_k=1): + self.window_size = window_size + self.p_interval = p_interval + self.num_channels = num_channels + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + ops = [ + SketeonCropSample(window_size=self.window_size, + p_interval=self.p_interval) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class AGCN2s_Inference_helper(Base_Inference_helper): + def __init__(self, + window_size=300, + num_channels=3, + vertex_nums=25, + person_nums=2, + top_k=1): + self.window_size = window_size + self.num_channels = num_channels + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is 
not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class MSTCN_Inference_helper(Base_Inference_helper): + def __init__(self, num_channels, actions_map_file_path, feature_path=None): + self.num_channels = num_channels + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.feature_path = feature_path + self.file_name_list = [] + + def get_process_file(self, input_file_txt): + with open(input_file_txt, 'r') as file_ptr: + info = file_ptr.read().split('\n')[:-1] + + files = [] + for video_name in info: + if self.feature_path is not None: + file_name = video_name.split('.')[0] + ".npy" + input_file = os.path.join(self.feature_path, file_name) + else: + input_file = video_name + + assert os.path.isfile( + input_file) is not None, "{0} not exists".format(input_file) + files.append(input_file) + + self.file_name_list.append(input_file.split('/')[-1].split('.')[0]) + return files + + def preprocess(self, input_file): + """ + input_file: str, feature file list txt path + return: list + """ + output_list = [] + + data = np.load(input_file) + results = {'video_feat': data, 'video_gt': None} + ops = [] + for op in ops: + results = op(results) + + res = np.expand_dims(results['video_feat'], axis=0).copy() + output_list.append(res) + return output_list + + def postprocess(self, output, print_output=True): + reslut_path = os.path.join("./inference/infer_results/") + if not os.path.isdir(reslut_path): + os.makedirs(reslut_path) + output = [output] + for outputs in output: + output_np = outputs[0] + recognition = [] + for i in range(output_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(output_np[i])] + ])) + recog_content = list(recognition) + recog_content = [line + "\n" for line in recog_content] + + filename = self.file_name_list.pop(0) + + write_path = os.path.join(reslut_path, filename + ".txt") + f = open(write_path, "w") + f.writelines(recog_content) + f.close() + print("result write in : " + write_path) + + +@INFERENCE.register() +class ASRF_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels, + actions_map_file_path, + postprocessing_method, + boundary_threshold, + feature_path=None): + self.num_channels = num_channels + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.postprocessing_method = postprocessing_method + self.boundary_threshold = boundary_threshold + self.feature_path = feature_path + self.file_name_list = [] + + def get_process_file(self, input_file_txt): + with open(input_file_txt, 'r') as file_ptr: + info = file_ptr.read().split('\n')[:-1] + + files = [] + for video_name in info: + if self.feature_path is not None: + file_name = video_name.split('.')[0] + ".npy" + input_file = os.path.join(self.feature_path, file_name) + else: + input_file = video_name + + assert os.path.isfile( + input_file) is not None, "{0} not exists".format(input_file) + files.append(input_file) + + self.file_name_list.append(input_file.split('/')[-1].split('.')[0]) + return files + + def preprocess(self, input_file): 
+ """ + input_file: str, feature file list txt path + return: list + """ + + output_list = [] + + data = np.load(input_file) + results = {'video_feat': data, 'video_gt': None} + ops = [] + for op in ops: + results = op(results) + + res = np.expand_dims(results['video_feat'], axis=0).copy() + output_list.append(res) + return output_list + + def postprocess(self, output, print_output=True): + reslut_path = os.path.join("./inference/infer_results/") + if not os.path.isdir(reslut_path): + os.makedirs(reslut_path) + output = [output] + for outputs in output: + outputs_cls_np = outputs[0] + outputs_boundary_np = outputs[1] + + output_np = ASRFPostProcessing( + outputs_cls_np, + outputs_boundary_np, + self.postprocessing_method, + boundary_threshold=self.boundary_threshold).numpy()[0, :] + + recognition = [] + for i in range(output_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(output_np[i])] + ])) + recog_content = list(recognition) + recog_content = [line + "\n" for line in recog_content] + + filename = self.file_name_list.pop(0) + + write_path = os.path.join(reslut_path, filename + ".txt") + f = open(write_path, "w") + f.writelines(recog_content) + f.close() + print("result write in : " + write_path) + + +@INFERENCE.register() +class AttentionLSTM_Inference_helper(Base_Inference_helper): + def __init__( + self, + num_classes, #Optional, the number of classes to be classified. + feature_num, + feature_dims, + embedding_size, + lstm_size, + top_k=1): + self.num_classes = num_classes + self.feature_num = feature_num + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [FeatureDecoder(num_classes=self.num_classes, has_label=False)] + for op in ops: + results = op(results) + + res = [] + for modality in ['rgb', 'audio']: + res.append( + np.expand_dims(results[f'{modality}_data'], axis=0).copy()) + res.append( + np.expand_dims(results[f'{modality}_len'], axis=0).copy()) + res.append( + np.expand_dims(results[f'{modality}_mask'], axis=0).copy()) + return res + + +@INFERENCE.register() +class TransNetV2_Inference_helper(): + def __init__(self, + num_frames, + height, + width, + num_channels, + threshold=0.5, + output_path=None, + visualize=True): + self._input_size = (height, width, num_channels) + self.output_path = output_path + self.len_frames = 0 + self.threshold = threshold + self.visualize = visualize + + def input_iterator(self, frames): + # return windows of size 100 where the first/last 25 frames are from the previous/next batch + # the first and last window must be padded by copies of the first and last frame of the video + no_padded_frames_start = 25 + no_padded_frames_end = 25 + 50 - ( + len(frames) % 50 if len(frames) % 50 != 0 else 50) # 25 - 74 + + start_frame = np.expand_dims(frames[0], 0) + end_frame = np.expand_dims(frames[-1], 0) + padded_inputs = np.concatenate([start_frame] * no_padded_frames_start + + [frames] + + [end_frame] * no_padded_frames_end, 0) + + ptr = 0 + while ptr + 100 <= len(padded_inputs): + out = padded_inputs[ptr:ptr + 100] + out = out.astype(np.float32) + ptr += 50 + yield out[np.newaxis] + + def preprocess(self, input_file): + """ + input_file: str, file path + return: iterator + 
""" + try: + import ffmpeg + except ImportError as e: + print( + f"Warning! {e}, [ffmpeg-python] package and it's dependencies is required for TransNetV2." + ) + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + self.input_file = input_file + self.filename = os.path.splitext(os.path.split(self.input_file)[1])[0] + video_stream, err = ffmpeg.input( + self.input_file).output("pipe:", + format="rawvideo", + pix_fmt="rgb24", + s="48x27").run(capture_stdout=True, + capture_stderr=True) + self.frames = np.frombuffer(video_stream, + np.uint8).reshape([-1, 27, 48, 3]) + self.len_frames = len(self.frames) + + return self.input_iterator(self.frames) + + def predictions_to_scenes(self, predictions): + predictions = (predictions > self.threshold).astype(np.uint8) + scenes = [] + t, t_prev, start = -1, 0, 0 + for i, t in enumerate(predictions): + if t_prev == 1 and t == 0: + start = i + if t_prev == 0 and t == 1 and i != 0: + scenes.append([start, i]) + t_prev = t + if t == 0: + scenes.append([start, i]) + + # just fix if all predictions are 1 + if len(scenes) == 0: + return np.array([[0, len(predictions) - 1]], dtype=np.int32) + + return np.array(scenes, dtype=np.int32) + + def visualize_predictions(self, frames, predictions): + from PIL import Image, ImageDraw + + if isinstance(predictions, np.ndarray): + predictions = [predictions] + + ih, iw, ic = frames.shape[1:] + width = 25 + + # pad frames so that length of the video is divisible by width + # pad frames also by len(predictions) pixels in width in order to show predictions + pad_with = width - len(frames) % width if len( + frames) % width != 0 else 0 + frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)), + (0, 0)]) + + predictions = [np.pad(x, (0, pad_with)) for x in predictions] + height = len(frames) // width + + img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic]) + img = np.concatenate(np.split( + np.concatenate(np.split(img, height), axis=2)[0], width), + axis=2)[0, :-1] + + img = Image.fromarray(img) + draw = ImageDraw.Draw(img) + + # iterate over all frames + for i, pred in enumerate(zip(*predictions)): + x, y = i % width, i // width + x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1 + + # we can visualize multiple predictions per single frame + for j, p in enumerate(pred): + color = [0, 0, 0] + color[(j + 1) % 3] = 255 + + value = round(p * (ih - 1)) + if value != 0: + draw.line((x + j, y, x + j, y - value), + fill=tuple(color), + width=1) + return img + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + predictions = [] + for output in outputs: + single_frame_logits, all_frames_logits = output + single_frame_pred = F.sigmoid(paddle.to_tensor(single_frame_logits)) + all_frames_pred = F.sigmoid(paddle.to_tensor(all_frames_logits)) + predictions.append((single_frame_pred.numpy()[0, 25:75, 0], + all_frames_pred.numpy()[0, 25:75, 0])) + single_frame_pred = np.concatenate( + [single_ for single_, all_ in predictions]) + all_frames_pred = np.concatenate( + [all_ for single_, all_ in predictions]) + single_frame_predictions, all_frame_predictions = single_frame_pred[: + self + . + len_frames], all_frames_pred[: + self + . 
+ len_frames] + + scenes = self.predictions_to_scenes(single_frame_predictions) + + if print_output: + print("Current video file: {0}".format(self.input_file)) + print("\tShot Boundarys: {0}".format(scenes)) + + if self.output_path: + if not os.path.exists(self.output_path): + os.makedirs(self.output_path) + predictions = np.stack( + [single_frame_predictions, all_frame_predictions], 1) + predictions_file = os.path.join(self.output_path, + self.filename + "_predictions.txt") + np.savetxt(predictions_file, predictions, fmt="%.6f") + scenes_file = os.path.join(self.output_path, + self.filename + "_scenes.txt") + np.savetxt(scenes_file, scenes, fmt="%d") + + if self.visualize: + pil_image = self.visualize_predictions( + self.frames, + predictions=(single_frame_predictions, + all_frame_predictions)) + image_file = os.path.join(self.output_path, + self.filename + "_vis.png") + pil_image.save(image_file) + + +@INFERENCE.register() +class ADDS_Inference_helper(Base_Inference_helper): + def __init__(self, + frame_idxs=[0], + num_scales=4, + side_map={ + "2": 2, + "3": 3, + "l": 2, + "r": 3 + }, + height=256, + width=512, + full_res_shape=None, + num_channels=None, + img_ext=".png", + K=None): + + self.frame_idxs = frame_idxs + self.num_scales = num_scales + self.side_map = side_map + self.full_res_shape = full_res_shape + self.img_ext = img_ext + self.height = height + self.width = width + self.K = K + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = { + 'filename': input_file, + 'mode': 'infer', + 'day_or_night': 'day', + } + ops = [ + ImageDecoder( + backend='pil', + dataset='kitti', + frame_idxs=self.frame_idxs, + num_scales=self.num_scales, + side_map=self.side_map, + full_res_shape=self.full_res_shape, + img_ext=self.img_ext, + ), + GroupResize( + height=self.height, + width=self.width, + K=self.K, + scale=1, + mode='infer', + ), + ToArray(), + ] + for op in ops: + results = op(results) + res = results['imgs'][('color', 0, 0)] + res = np.expand_dims(res, axis=0).copy() + return [res] + + def postprocess(self, output, print_output, save_dir='data/'): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + print(len(output)) + N = len(self.input_file) + for i in range(N): + pred_depth = output[i] # [H, W] + if print_output: + print("Current input image: {0}".format(self.input_file[i])) + file_name = os.path.basename(self.input_file[i]).split('.')[0] + save_path = os.path.join(save_dir, + file_name + "_depth" + ".png") + pred_depth_color = self._convertPNG(pred_depth) + pred_depth_color.save(save_path) + print(f"pred depth image saved to: {save_path}") + + def _convertPNG(self, image_numpy): + disp_resized = cv2.resize(image_numpy, (1280, 640)) + disp_resized_np = disp_resized + vmax = np.percentile(disp_resized_np, 95) + normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) + mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') + colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * + 255).astype(np.uint8) + im = Image.fromarray(colormapped_im) + return im + + +@INFERENCE.register() +class AVA_SlowFast_FastRCNN_Inference_helper(Base_Inference_helper): + def __init__(self, + detection_model_name, + detection_model_weights, + config_file_path, + predict_stepsize=8, + output_stepsize=4, + output_fps=6, + out_filename='ava_det_demo.mp4', + num_frames=32, + alpha=4, 
+ target_size=256): + self.detection_model_name = detection_model_name + self.detection_model_weights = detection_model_weights + + self.config = get_config(config_file_path, + show=False) #parse config file + self.predict_stepsize = predict_stepsize + self.output_stepsize = output_stepsize + self.output_fps = output_fps + self.out_filename = out_filename + self.num_frames = num_frames + self.alpha = alpha + self.target_size = target_size + + def preprocess(self, input_file): + """ + input_file: str, file path + """ + + frame_dir = 'tmp_frames' + self.frame_paths, frames, FPS = frame_extraction(input_file, frame_dir) + num_frame = len(self.frame_paths) #视频秒数*FPS + assert num_frame != 0 + + # 帧图像高度和宽度 + h, w, _ = frames[0].shape + + # Get clip_len, frame_interval and calculate center index of each clip + data_process_pipeline = build_pipeline( + self.config.PIPELINE.test) #测试时输出处理流水配置 + + clip_len = self.config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = self.config.PIPELINE.test.sample['frame_interval'] + + # 此处关键帧每秒取一个 + clip_len = self.config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = self.config.PIPELINE.test.sample['frame_interval'] + window_size = clip_len * frame_interval + timestamps = np.arange(window_size // 2, + (num_frame + 1 - window_size // 2), + self.predict_stepsize) + + selected_frame_list = [] + for timestamp in timestamps: + selected_frame_list.append(self.frame_paths[timestamp - 1]) + + # Load label_map + label_map_path = self.config.DATASET.test['label_file'] + self.categories, self.class_whitelist = read_labelmap( + open(label_map_path)) + label_map = {} + for item in self.categories: + id = item['id'] + name = item['name'] + label_map[id] = name + + self.label_map = label_map + + detection_result_dir = 'tmp_detection' + detection_model_name = self.detection_model_name + detection_model_weights = self.detection_model_weights + detection_txt_list = detection_inference(selected_frame_list, + detection_result_dir, + detection_model_name, + detection_model_weights) + assert len(detection_txt_list) == len(timestamps) + + human_detections = [] + data_list = [] + person_num_list = [] + + for timestamp, detection_txt_path in zip(timestamps, + detection_txt_list): + proposals, scores = get_detection_result( + detection_txt_path, h, w, + (float)(self.config.DATASET.test['person_det_score_thr'])) + + if proposals.shape[0] == 0: + #person_num_list.append(0) + human_detections.append(None) + continue + + human_detections.append(proposals) + + result = get_timestep_result(frame_dir, + timestamp, + clip_len, + frame_interval, + FPS=FPS) + result["proposals"] = proposals + result["scores"] = scores + + new_result = data_process_pipeline(result) + proposals = new_result['proposals'] + + img_slow = new_result['imgs'][0] + img_slow = img_slow[np.newaxis, :] + img_fast = new_result['imgs'][1] + img_fast = img_fast[np.newaxis, :] + + proposals = proposals[np.newaxis, :] + + scores = scores[np.newaxis, :] + + img_shape = np.asarray(new_result['img_shape']) + img_shape = img_shape[np.newaxis, :] + + data = [ + paddle.to_tensor(img_slow, dtype='float32'), + paddle.to_tensor(img_fast, dtype='float32'), + paddle.to_tensor(proposals, dtype='float32'), + paddle.to_tensor(img_shape, dtype='int32') + ] + + person_num = proposals.shape[1] + person_num_list.append(person_num) + + data_list.append(data) + + self.human_detections = human_detections + 
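# Cache per-timestamp detection state for postprocess(): human_detections keeps one
# entry per timestamp (None where the detector returned no person), while
# person_num_list and the returned data_list only hold entries for timestamps that
# actually had proposals.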
self.person_num_list = person_num_list + self.timestamps = timestamps + self.frame_dir = frame_dir + self.detection_result_dir = detection_result_dir + + return data_list + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + predictions = [] + + assert len(self.person_num_list) == len(outputs) + + #print("*** self.human_detections",len( self.human_detections)) + #print("*** outputs",len( outputs)) + + index = 0 + for t_index in range(len(self.timestamps)): + if self.human_detections[t_index] is None: + predictions.append(None) + continue + + human_detection = self.human_detections[t_index] + + output = outputs[index] + result = output #长度为类别个数,不包含背景 + + person_num = self.person_num_list[index] + + index = index + 1 + + prediction = [] + + if human_detection is None: + predictions.append(None) + continue + + # N proposals + for i in range(person_num): + prediction.append([]) + + # Perform action score thr + for i in range(len(result)): # for class + if i + 1 not in self.class_whitelist: + continue + for j in range(person_num): + if result[i][j, 4] > self.config.MODEL.head['action_thr']: + prediction[j].append( + (self.label_map[i + 1], result[i][j, 4] + )) # label_map is a dict, label index start from 1 + predictions.append(prediction) + + results = [] + for human_detection, prediction in zip(self.human_detections, + predictions): + results.append(pack_result(human_detection, prediction)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(self.predict_stepsize / self.output_stepsize) #30 + frames = [ + cv2.imread(self.frame_paths[i - 1]) + for i in dense_timestamps(self.timestamps, dense_n) + ] + + vis_frames = visualize(frames, results) + + try: + import moviepy.editor as mpy + except ImportError: + raise ImportError('Please install moviepy to enable output file') + + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=self.output_fps) + vid.write_videofile(self.out_filename) + print("finish write !") + + # delete tmp files and dirs + shutil.rmtree(self.frame_dir) + shutil.rmtree(self.detection_result_dir) + + +@INFERENCE.register() +class PoseC3D_Inference_helper(Base_Inference_helper): + def __init__(self, top_k=1): + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + with open(input_file, 'rb') as f: + data = pickle.load(f) + self.input_file = input_file + + left_kp = [1, 3, 5, 7, 9, 11, 13, 15] + right_kp = [2, 4, 6, 8, 10, 12, 14, 16] + ops = [ + UniformSampleFrames(clip_len=48, num_clips=10, test_mode=True), + PoseDecode(), + PoseCompact(hw_ratio=1., allow_imgpad=True), + Resize(scale=(-1, 56)), + CenterCrop_V2(crop_size=56), + GeneratePoseTarget(sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + FormatShape(input_format='NCTHW'), + Collect(keys=['imgs', 'label'], meta_keys=[]) + ] + + for op in ops: + results = op(data) + results = [results[0][np.newaxis, :, :, :, :, :]] + self.num_segs = results[0].shape[1] + return results + + def postprocess(self, outputs, print_output=True): + batch_size = outputs[0].shape[0] + cls_score = 
outputs[0].reshape( + [batch_size // self.num_segs, self.num_segs, outputs[0].shape[-1]]) + output = F.softmax(paddle.to_tensor(cls_score), + axis=2).mean(axis=1).numpy() + N = len(self.input_file) + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class YOWO_Inference_helper(Base_Inference_helper): + + def __init__(self, + num_seg=16, + target_size=224, + nms_thresh=0.5, + conf_thresh_valid=0.5, + mean=[0.4345, 0.4051, 0.3775], + std=[0.2768, 0.2713, 0.2737]): + self.num_seg = num_seg + self.target_size = target_size + self.nms_thresh = nms_thresh + self.conf_thresh_valid = conf_thresh_valid + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + cap = cv2.VideoCapture(input_file) + queue = [] + inputs = [] + frames = [] + while (cap.isOpened()): + ret, frame = cap.read() + if ret == False: + break + if len(queue) <= 0: # At initialization, populate queue with initial frame + for i in range(self.num_seg): + queue.append(frame) + + # Add the read frame to last and pop out the oldest one + queue.append(frame) + queue.pop(0) + + # Resize images + imgs = [cv2.resize(img, (self.target_size, self.target_size), interpolation=cv2.INTER_LINEAR) for img in + queue] + + # Convert image to CHW keeping BGR order. + imgs = [img.transpose([2, 0, 1]) for img in imgs] + + # Image [0, 255] -> [0, 1]. + imgs = [img / 255.0 for img in imgs] + + imgs = [ + np.ascontiguousarray( + img.reshape((3, imgs[0].shape[1], imgs[0].shape[2])) + ).astype(np.float32) + for img in imgs + ] + + # Concat list of images to single ndarray. 
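# Illustrative shape bookkeeping for the concatenation below, assuming the defaults
# above (num_seg=16, target_size=224): each resized frame is CHW = (3, 224, 224),
# np.expand_dims(img, axis=1) gives (3, 1, 224, 224), the axis=1 concatenation over
# the 16-frame queue yields (3, 16, 224, 224), and the two expand_dims(axis=0) calls
# that follow produce a (1, 1, 3, 16, 224, 224) window per processed frame.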
+ imgs = np.concatenate( + [np.expand_dims(img, axis=1) for img in imgs], axis=1 + ) + + imgs = np.ascontiguousarray(imgs) + imgs = np.expand_dims(imgs, axis=0) + imgs = np.expand_dims(imgs, axis=0) + inputs.append(imgs) + frames.append(queue[-1]) + + return inputs, frames + + def postprocess(self, outputs, frame, filename, save_img=True): + """ + outputs: list + frames: list + """ + labels = [ + "Basketball", "BasketballDunk", "Biking", "CliffDiving", "CricketBowling", + "Diving", "Fencing", "FloorGymnastics", "GolfSwing", "HorseRiding", + "IceDancing", "LongJump", "PoleVault", "RopeClimbing", "SalsaSpin", + "SkateBoarding", "Skiing", "Skijet", "SoccerJuggling", "Surfing", + "TennisSwing", "TrampolineJumping", "VolleyballSpiking", "WalkingWithDog"] + nms_thresh = 0.5 + font = cv2.FONT_HERSHEY_SIMPLEX + for out in outputs: + out = paddle.to_tensor(out) + preds = [] + all_boxes = get_region_boxes(out) + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + + for box in boxes: + x1 = round(float(box[0] - box[2] / 2.0) * 320.0) + y1 = round(float(box[1] - box[3] / 2.0) * 240.0) + x2 = round(float(box[0] + box[2] / 2.0) * 320.0) + y2 = round(float(box[1] + box[3] / 2.0) * 240.0) + + det_conf = float(box[4]) + for j in range((len(box) - 5) // 2): + cls_conf = float(box[5 + 2 * j].item()) + prob = det_conf * cls_conf + preds.append([[x1, y1, x2, y2], prob, labels[int(box[6])]]) + + for _, dets in enumerate(preds): + if dets[1] < 0.4: + break + text = dets[2] + ' ' + '{:.2f}'.format(dets[1]) + cv2.rectangle(frame, (dets[0][0], dets[0][1]), (dets[0][2], dets[0][3]), (0, 255, 0), 2) + cv2.putText(frame, text, (dets[0][0] + 3, dets[0][1] - 5 - 10 * _), font, 0.5, (0, 255, 0), 2) + cv2.imwrite('{}.jpg'.format(filename), frame) \ No newline at end of file diff --git a/docs/src/tools/wheel.py b/docs/src/tools/wheel.py new file mode 100644 index 000000000..77281bee8 --- /dev/null +++ b/docs/src/tools/wheel.py @@ -0,0 +1,354 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(os.path.join(__dir__, '')) + +import numpy as np +import tarfile +import requests +from tqdm import tqdm +import shutil + +from paddle import inference +from paddle.inference import Config, create_predictor + +from tools.utils import ppTSM_Inference_helper + +__all__ = ['PaddleVideo'] + +# path of download model and data +BASE_DIR = os.path.expanduser("~/.paddlevideo_inference/") +BASE_INFERENCE_MODEL_DIR = os.path.join(BASE_DIR, 'inference_model') +BASE_VIDEOS_DIR = os.path.join(BASE_DIR, 'videos') + +# support Models +MODELS = { + 'ppTSM': + 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_infer.tar', + 'ppTSM_v2': + 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_v2_infer.tar' +} + +MODEL_NAMES = list(MODELS.keys()) + + +def parse_args(mMain=True, add_help=True): + """ + Args: + mMain: bool. True for command args, False for python interface + """ + import argparse + + def str2bool(v): + return v.lower() in ("true", "t", "1") + + if mMain == True: + + # general params + parser = argparse.ArgumentParser(add_help=add_help) + parser.add_argument("--model_name", type=str, default='') + parser.add_argument("-v", "--video_file", type=str, default='') + parser.add_argument("--use_gpu", type=str2bool, default=True) + + # params for decode and sample + parser.add_argument("--num_seg", type=int, default=16) + + # params for preprocess + parser.add_argument("--short_size", type=int, default=256) + parser.add_argument("--target_size", type=int, default=224) + + # params for predict + parser.add_argument("--model_file", type=str, default='') + parser.add_argument("--params_file", type=str) + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--use_fp16", type=str2bool, default=False) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=8000) + parser.add_argument("--top_k", type=int, default=1) + parser.add_argument("--enable_mkldnn", type=bool, default=False) + parser.add_argument("--label_name_path", type=str, default='') + + return parser.parse_args() + + else: + return argparse.Namespace(model_name='', + video_file='', + use_gpu=True, + num_seg=16, + short_size=256, + target_size=224, + model_file='', + params_file='', + batch_size=1, + use_fp16=False, + ir_optim=True, + use_tensorrt=False, + gpu_mem=8000, + top_k=1, + enable_mkldnn=False, + label_name_path='') + + +def parse_file_paths(input_path: str) -> list: + if os.path.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [os.path.join(input_path, file) for file in files] + return files + + +def download_with_progressbar(url, save_path): + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(save_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes: + raise Exception("Something went wrong while downloading models") + + +def download_inference_model(model_storage_directory, url): + # 
using custom model + tar_file_name_list = [ + 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel' + ] + if not os.path.exists( + os.path.join(model_storage_directory, + 'inference.pdiparams')) or not os.path.exists( + os.path.join(model_storage_directory, + 'inference.pdmodel')): + tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(model_storage_directory, exist_ok=True) + download_with_progressbar(url, tmp_path) #download + + #save to directory + with tarfile.open(tmp_path, 'r') as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open(os.path.join(model_storage_directory, filename), + 'wb') as f: + f.write(file.read()) + os.remove(tmp_path) + + +def create_paddle_predictor(args): + config = Config(args.model_file, args.params_file) + + if args.use_gpu: + config.enable_use_gpu(args.gpu_mem, 0) + else: + config.disable_gpu() + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + + config.disable_glog_info() + config.switch_ir_optim(args.ir_optim) # default true + if args.use_tensorrt: + config.enable_tensorrt_engine( + precision_mode=Config.Precision.Half + if args.use_fp16 else Config.Precision.Float32, + max_batch_size=args.batch_size) + + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + predictor = create_predictor(config) + + return predictor + + +def load_label_name_dict(path): + result = {} + if not os.path.exists(path): + print( + 'Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!' + ) + else: + for line in open(path, 'r'): + partition = line.split('\n')[0].partition(' ') + try: + result[int(partition[0])] = str(partition[-1]) + except: + result = {} + break + return result + + +class PaddleVideo(object): + def __init__(self, **kwargs): + print( + '\nInference models that Paddle provides are listed as follows:\n{}' + .format(MODEL_NAMES), '\n') + process_params = parse_args(mMain=False, add_help=False) + process_params.__dict__.update(**kwargs) + + if not os.path.exists(process_params.model_file): + if process_params.model_name is None: + raise Exception('Please input model name that you want to use!') + if process_params.model_name in MODEL_NAMES: + url = MODELS[process_params.model_name] + download_path = os.path.join(BASE_INFERENCE_MODEL_DIR, + process_params.model_name) + if not os.path.exists(download_path): + os.makedirs(download_path) + + #create pretrained model download_path + download_inference_model(model_storage_directory=download_path, + url=url) + + process_params.model_file = os.path.join( + download_path, 'inference.pdmodel') + process_params.params_file = os.path.join( + download_path, 'inference.pdiparams') + process_params.label_name_path = os.path.join( + __dir__, '../data/k400/Kinetics-400_label_list.txt') + else: + raise Exception( + 'If you want to use your own model, Please input model_file as model path!' 
+ ) + else: + print('Using user-specified model and params!') + print("process params are as follows: \n{}".format(process_params)) + self.label_name_dict = load_label_name_dict( + process_params.label_name_path) + + self.args = process_params + self.predictor = create_paddle_predictor(process_params) + + def predict(self, video): + """ + predict label of video with paddlevideo + Args: + video:input video for clas, support single video , internet url, folder path containing series of videos + Returns: + list[dict:{videoname: "",class_ids: [], scores: [], label_names: []}],if label name path is None,label names will be empty + """ + video_list = [] + assert isinstance(video, (str)) + + # get input_tensor and output_tensor + input_names = self.predictor.get_input_names() + output_names = self.predictor.get_output_names() + input_tensor_list = [] + output_tensor_list = [] + for item in input_names: + input_tensor_list.append(self.predictor.get_input_handle(item)) + for item in output_names: + output_tensor_list.append(self.predictor.get_output_handle(item)) + + if isinstance(video, str): + # download internet video + if video.startswith('http'): + if not os.path.exists(BASE_VIDEOS_DIR): + os.makedirs(BASE_VIDEOS_DIR) + video_path = os.path.join(BASE_VIDEOS_DIR, 'tmp.mp4') + download_with_progressbar(video, video_path) + print("Current using video from Internet:{}, renamed as: {}". + format(video, video_path)) + video = video_path + files = parse_file_paths(video) + else: + print('Please input legal video!') + + # Inferencing process + InferenceHelper = ppTSM_Inference_helper( + num_seg=self.args.num_seg, + short_size=self.args.short_size, + target_size=self.args.target_size, + top_k=self.args.top_k) + batch_num = self.args.batch_size + for st_idx in range(0, len(files), batch_num): + ed_idx = min(st_idx + batch_num, len(files)) + + # Pre process batched input + batched_inputs = InferenceHelper.preprocess_batch( + files[st_idx:ed_idx]) + + # run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(batched_inputs[i]) + self.predictor.run() + + batched_outputs = [] + for j in range(len(output_tensor_list)): + batched_outputs.append(output_tensor_list[j].copy_to_cpu()) + + results_list = InferenceHelper.postprocess(batched_outputs, + print_output=False, + return_result=True) + + for res in results_list: + classes = res["topk_class"] + label_names = [] + if len(self.label_name_dict) != 0: + label_names = [self.label_name_dict[c] for c in classes] + res["label_names"] = label_names + + print("Current video file: {0}".format(res["video_id"])) + print("\ttop-{0} classes: {1}".format(len(res["topk_class"]), + res["topk_class"])) + print("\ttop-{0} scores: {1}".format(len(res["topk_scores"]), + res["topk_scores"])) + print("\ttop-{0} label names: {1}".format( + len(res["label_names"]), res["label_names"])) + + +def main(): + # for cmd + args = parse_args(mMain=True) + clas_engine = PaddleVideo(**(args.__dict__)) + clas_engine.predict(args.video_file) + + +if __name__ == '__main__': + main() diff --git a/docs/tree.html b/docs/tree.html new file mode 100644 index 000000000..090ba7c40 --- /dev/null +++ b/docs/tree.html @@ -0,0 +1,1237 @@ + + + + + + + + + Project structure of: PaddlePaddle/PaddleVideo + + + + + + +
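The PaddleVideo class in docs/src/tools/wheel.py above wraps model download, predictor
creation and batched inference behind a single predict() call. A minimal usage sketch,
assuming docs/src is on the import path and the bundled ppTSM inference model can be
downloaded (the video path is only a placeholder):

    from tools.wheel import PaddleVideo

    # First use downloads the ppTSM inference model under ~/.paddlevideo_inference/,
    # then predict() prints the top-k classes, scores and label names per input video.
    clas = PaddleVideo(model_name='ppTSM', use_gpu=False, top_k=5)
    clas.predict('data/example.avi')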
    +

    Project structure of: PaddlePaddle/PaddleVideo

    +
      +
    • PaddleVideo Multimedia AI Tools & Libraries
        +
      • __init__.py Python module, licenses, imports PaddleVideo class.
      • +
      • applications Applications: Tools and Solutions Directory
          +
        • AbnormalActionDetection
            +
          • README.md Video action detection for abnormal behavior. SlowFast+FasterRCNN.
          • +
          +
        • +
        • Anti-UAV Detect and prevent UAVs in restricted zones with object detection models.
            +
          • get_image_label.py Trains and validates object detection models using video frames.
          • +
          • README.md Detect UAVs in restricted zones using PaddleDetection.
          • +
          +
        • +
        • BasketballAction Basketball action prediction and analysis toolkit, PaddlePaddle 2.0
            +
          • predict Basketball action prediction and analysis toolkit
              +
            • action_detect Basketball action detection and analysis tool for ML and datasets
                +
              • action.py Basketball action detection Python script
              • +
              • logger.py Custom logger class for news stripper app
              • +
              • mfcc Basketball audio analysis and VGGish model.
                  +
                • feature_extractor.py Audio feature extraction for basketball actions.
                • +
                • model_config.py ModelAudio: Extracts audio features, slices data, appends to list.
                • +
                • vgg_params.py Global VGGish model parameters. Audio feature extraction, PCA, embedding. Adjustable settings.
                • +
                +
              • +
              • models Basketball action detection models
                  +
                • audio_infer.py Audio inference model for predicting basketball actions.
                • +
                • bmn_infer.py Basketball action detection via BMN inferencing
                • +
                • lstm_infer.py Basketball LSTM action detection model with GPU optim.
                • +
                • pptsm_infer.py PaddleVideo-based action detection with PPTSM inference
                • +
                +
              • +
              • reader Manages audio readers for YouTube dataset ML action detection
              • +
              • utils Basketball action detection and prediction tools
              • +
              +
            • +
            • eval.py Best IOU and score threshold for evaluating basketball actions found.
            • +
            • predict.py Basketball Action Prediction, JSON Output.
            • +
            +
          • +
          • README.md Basketball action detection app, F1-score 80.14%, PaddlePaddle 2.0
          • +
          +
        • +
        • EIVideo EIVideo: Image/video tools for Windows
            +
          • EIVideo EIVideo: Image/video retrieval and conversion tool.
              +
            • __init__.py EIVideo init.py: Sets root path, defines constants, constructs full paths.
            • +
            • api.py EIVideo API: Retrieves, converts, and annotates images/videos.
            • +
            • main.py Trains PaddleVideo model with distributed training and mixed precision.
            • +
            • paddlevideo EIVideo PaddlePaddle video processing tools
                +
              • __init__.py EIVideo/paddlevideo initialization file
              • +
              • loader EIVideo PaddleVideo loader, functionalities organized.
                  +
                • __init__.py Imports, defines, exports, licenses
                • +
                • builder.py PaddleVideo dataset loader, signal handlers for SIGINT/TERM.
                • +
                • pipelines EIVideo pipelines: composable components for image preprocessing
                    +
                  • __init__.py Image preprocessing pipelines for PaddleVideo's EIVideo application.
                  • +
                  • compose.py Flexible composition of pipeline components.
                  • +
                  • custom_transforms_f.py Paddle Video's image preprocessing classes for resizing, aspect ratio adjustment, and custom cropping transforms.
                  • +
                  +
                • +
                • registry.py Organizes PaddleVideo functionalities in 4 registries.
                • +
                +
              • +
              • metrics Video metrics for object segmentation applications.
                  +
                • __init__.py Apache licensed PaddleVideo library init file for VOSMetric and build_metric functions.
                • +
                • base.py Abstract base class for PaddleVideo metrics.
                • +
                • build.py Apache License v2.0, Python build metric tool
                • +
                • registry.py Registry initializer for metric management.
                • +
                • vos_metric.py VOS Metric: Video Object Segmentation.
                • +
                +
              • +
              • modeling EIVideo model components library
                  +
                • __init__.py Imports and registers PaddleVideo modules, includes popular models.
                • +
                • backbones Video backbones repository for Paddle, various models.
                    +
                  • __init__.py Imports DeepLab from the deeplab_manet module and adds it to the __all__ list.
                  • +
                  • aspp_manet.py ASPP-MANET backbone model initialization in Paddle
                  • +
                  • decoder_manet.py Paddle Decoder class for Manet architecture
                  • +
                  • deeplab_manet.py Introduces FrozenBatchNorm2d and a DeepLab network backbone with frozen BatchNorm layers.
                  • +
                  • resnet_manet.py ResNet-MANET model with BatchNorm, ReLU, residual blocks.
                  • +
                  +
                • +
                • builder.py Builds computer vision models with configuration.
                • +
                • framework PaddleVideo framework: BaseSegment, Manet classes, video segmentation
                    +
                  • __init__.py PaddleVideo framework: BaseSegment, Manet classes.
                  • +
                  • segment Video segmentation framework in PaddlePaddle
                      +
                    • __init__.py PaddlePaddle components for video segmentation
                    • +
                    • base.py Base class for semi-Video Object Segmentation in PaddlePaddle
                    • +
                    • manet_stage1.py Manet Stage 1 Video Segmentation
                    • +
                    +
                  • +
                  +
                • +
                • heads Attention-based video heads for PaddlePaddle
                    +
                  • __init__.py Module init, copyright, license, imports IntVOS.
                  • +
                  • IntVOS.py Compute L2 distances, apply attention, feature extraction, and pooling.
                  • +
                  +
                • +
                • registry.py Registry classes for video pipeline components in PaddleVideo's EIVideo.
                • +
                • weight_init.py Initialize weights for PaddlePaddle layer with custom options
                • +
                +
              • +
              • tasks PaddleVideo tasks: testing, model imports.
                  +
                • __init__.py Imports "test_model" from "test.py" and adds it to __all__.
                • +
                • test.py Test model function for gradient-free testing with multi-card support.
                • +
                +
              • +
              • utils Video processing utilities for PaddlePaddle
                  +
                • __init__.py PaddleVideo utility functions and classes import, logger/profiler setup.
                • +
                • build_utils.py Builds object from config and registry.
                • +
                • config.py EIVideo: Config file parsing, printing, visualizing, and checking.
                • +
                • dist_utils.py Distributed computing utilities for PaddleVideo's EIVideo module.
                • +
                • logger.py Colorful, configurable, non-propagating logger for PaddleVideo.
                • +
                • manet_utils.py OpenCV, PaddleVideo utilities with PyTorch init.
                • +
                • precise_bn.py PreciseBN: recompute BN stats for improved accuracy
                • +
                • profiler.py Profiler initialization and stop for PaddlePaddle's operator-level timing.
                • +
                • record.py Logs video processing metrics for clarity.
                • +
                • registry.py Registry class for mapping names to objects
                • +
                • save_load.py Adapts ViT model, loads/saves PaddlePaddle models.
                • +
                +
              • +
              • version.py PaddleVideo version: 0.0.1, Apache License 2.0
              • +
              +
            • +
            • README.MD Chinese CLI video annotation tool guide
            • +
            • setup.py Packaging script; requests that the source be cited when the code is reused.
            • +
            • version.py EIVideo version 0.1a by Acer Zhang (credit requested)
            • +
            +
          • +
          • QEIVideo Interactive PyQt5 tabs GUI video editor.
              +
            • __init__.py QEIVideo Root & Version
            • +
            • build_gui.py Video processing GUI builder with PyQt5.
            • +
            • gui EIVideo: Video/painting app with GUI for path drawing and UI-driven functionality.
                +
              • __init__.py Author, date, copyright - EIVideo's QEIVideo gui init.
              • +
              • demo.py DrawFrame class QWidget for path drawing and mouse events.
              • +
              • ui_main_window.py EIVideo: UI-driven video/painting app with "Hi" message.
              • +
              +
            • +
            • start.py Initialize QApplication, create GUI, run event loop.
            • +
            • tools
                +
              • __init__.py EIVideo QEIVideo gui module init comment
              • +
              +
            • +
            • ui QEIVideo GUI demo with PyQt5 and interactive tabs
                +
              • __init__.py QEIVideo gui init, author, date, copyright.
              • +
              • demo.py PyQt5 video player UI with interactive buttons and tabs.
              • +
              +
            • +
            • version.py EIVideo app version info by Acer Zhang 01/11/2022.
            • +
            • widget
                +
              • PaintBoard.py PaintBoard: Widget for drawing, clearing, changing pen attributes, and retrieving content.
              • +
              +
            • +
            +
          • +
          • README.md Windows video annotation tool with Baidu AI, maintained by QPT-Family on GitHub.
          • +
          • resources EIVideo GitHub branch and code management
              +
            • cmd Updating EIVideo on GitHub, managing branches and code.
            • +
            • QT
                +
              • demo.ui Qt application UI video demo with interactive elements.
              • +
              +
            • +
            +
          • +
          +
        • +
        • FightRecognition
            +
          • README.md Fight Recognition model guide for PaddleVideo.
          • +
          +
        • +
        • FigureSkating
            +
          • README.md OpenPose for figure skating action data processing
          • +
          +
        • +
        • FootballAction FootballAction: Audio-based sports prediction.
            +
          • checkpoints
              +
            • download.sh Download, extract, and delete 4 tar files related to FootballAction.
            • +
            +
          • +
          • datasets Football action datasets processing
          • +
          • extractor Videos features extraction using pre-trained models.
              +
            • extract_bmn.py Classify and extract features from videos using pre-trained model.
            • +
            • extract_feat.py Baidu models extract audio, classify videos.
            • +
            +
          • +
          • predict Football action prediction using audio features
              +
            • action_detect Football action prediction with MFCC, VGGish, and AudioReader.
                +
              • action.py Action detection system with ML/DL for Baidu Cloud
              • +
              • logger.py Custom logger class for news stripper app
              • +
              • mfcc Football action detection using MFCC and VGGish
                  +
                • feature_extractor.py Extracts MFCC and STFT audio features for football action detection.
                • +
                • model_config.py Extracts audio features, slices data, returns feature list.
                • +
                • vgg_params.py Global VGGish model parameters for audio feature extraction
                • +
                +
              • +
              • models Football action detection and prediction models
                  +
                • audio_infer.py Audio inference model for predicting football actions.
                • +
                • bmn_infer.py Paddle BMN Infer action detection model, averaging predictions.
                • +
                • lstm_infer.py LSTM model for football action prediction
                • +
                • pptsm_infer.py PPTSM model inference for football actions.
                • +
                +
              • +
              • reader Football action detection using AudioReader and YouTube-8M
              • +
              • utils Video processing and config management utilities for FootballAction.
              • +
              +
            • +
            • eval.py Evaluates precision, recall, and F1 scores for football action prediction models.
            • +
            • predict.py Loads model, reads videos, predicts actions, stores results.
            • +
            +
          • +
          • README.md Enhanced FootballAction model in PaddleVideo.
          • +
          +
        • +
        • Ma-Net Trains & evaluates Ma-Net models with Tensor utilities.
            +
          • config.py Ma-Net config file: Imports, parses, trains, tests.
          • +
          • dataloaders Applications for data loaders and helpers
          • +
          • networks Customized Ma-Net Networks for Image Classification
              +
            • aspp.py ASPP module layer for Ma-Net CNN
            • +
            • backbone Various MA-Net backbones for diverse tasks
                +
              • __init__.py Backbone network builder for specified model and stride.
              • +
              • drn.py Dilated Residual Network (DRN) backbone for the MA-Net architecture in PaddlePaddle
              • +
              • mobilenet.py MobileNetV2 model for Ma-Net application.
              • +
              • resnet.py ResNet architecture with batch normalization, ReLU, output strides, residual connections.
              • +
              • xception.py AlignedXception network for image classification
              • +
              +
            • +
            • decoder.py Decoder neural network layer for class prediction
            • +
            • deeplab.py Deeplab: BatchNorm, ASPP, decoder, freeze.
            • +
            • IntVOS.py Video object segmentation with PaddlePaddle and neural networks.
            • +
            • loss.py Custom loss function for image classification tasks with optional hard example mining.
            • +
            +
          • +
          • README.md MA-Net PaddleVideo model testing and training script
          • +
          • README_cn.md Chinese Ma-Net video segmentation model README.
          • +
          • run.sh Train, test DeepLabV3_coco on DAVIS dataset.
          • +
          • test.py DAVIS2017 image processing with PaddlePaddle, 8-turn interactive classification.
          • +
          • train_stage1.py Ma-Net video object detection, training and visualization.
          • +
          • train_stage2.py Trains stage 2 Ma-Net models, applies loss, evaluates performance.
          • +
          • utils Tensor processing and label conversion utilities
              +
            • api.py Tensor handling utilities for PyTorch and PaddlePaddle
            • +
            • mask_damaging.py Damages and scales mask with rotations and translations.
            • +
            • meters.py Computes and stores average values.
            • +
            • utils.py Converts labels to RGB color map.
            • +
            +
          • +
          +
        • +
        • MultimodalVideoTag Multimodal video tagging scenarios and tools.
            +
          • download.sh Downloads ernie model, checkpoints, and test dataset.
          • +
          • eval_and_save_model.sh Evaluates and saves AttentionLstmErnie model in specified directories.
          • +
          • inference.sh Inference script for AttentionLstmErnie model.
          • +
          • README.md Multimodal video classification with PaddlePaddle 2.0
          • +
          • scenario_lib Multimodal video tagging scenarios library.
              +
            • accuracy_metrics.py Accuracy metrics calculator for multimodal video tagging models.
            • +
            • config.py Merges and prints config sections.
            • +
            • datareader Multimodal video tag scenario library reader management +
            • +
            • eval_and_save_model.py Multimodal video tagging with PaddlePaddle and AttentionLstmErnie.
            • +
            • inference.py Paddle Video Inference: Multimodal Tagging, GPU Support
            • +
            • models Multimodal video tagging models directory +
            • +
            • train.py Video training: PaddlePaddle, feeds, outputs, loss, optimizer, build, train, save.
            • +
            • utils.py Multimodal Video Tagging Utilities
            • +
            +
          • +
          • train.sh Train Attention LSTM Ernie model with GPU optimization.
          • +
          +
        • +
        • PP-Care
            +
          • Readme.md PP-Care video understanding app with TSM and ResNet50
          • +
          +
        • +
        • PPHuman PaddleVideo-PPHuman dataset and model
            +
          • datasets
          • +
          • README.md Trains PaddleVideo JSON for PP-Human inference
          • +
          +
        • +
        • README.md PaddleVideo applications: action detection, recognition, classification, and analysis.
        • +
        • T2VLAD Video analysis and retrieval models training.
            +
          • base T2VLAD base: models, trainer.
          • +
          • data
          • +
          • data_loader Efficient data loader for MSRVTT with LRU cache and PP library.
          • +
          • logger T2VLAD logger functions and performance metrics directory.
              +
            • __init__.py Imports all logger functions and classes in PaddleVideo's T2VLAD app.
            • +
            • log_parser.py Computes performance metrics for epochs.
            • +
            • logger.py Configures logger based on JSON file or uses basic setup.
            • +
            +
          • +
          • model Video analysis with T2VLAD and text embedding models.
              +
            • loss.py Contrastive loss, max margin ranking, T2VLAD models
            • +
            • metric.py Calculates retrieval metrics and offers visualization options.
            • +
            • model.py Video analysis CENet model with T2VLAD and BERT.
            • +
            • net_vlad.py T2VLAD: VLAD representation network.
            • +
            • text.py TextEmbedding: Word2Vec for video descriptions and queries. CPU-only, GPT or Word2Vec.
            • +
            +
          • +
          • parse_config.py ConfigParser: argument parsers, slave mode, directories, config, experiment settings.
          • +
          • README.md Trains T2VLAD text video retrieval model using PaddleVideo on MSR-VTT dataset.
          • +
          • README_en.md T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval.
          • +
          • test.py Compress predictions using PaddleVideo library.
          • +
          • train.py Trains video analysis model, handles args, saves checkpoints
          • +
          • trainer Video retrieval model training application.
              +
            • __init__.py Imports all from "trainer" module.
            • +
            • trainer.py Trains video retrieval model, efficient sample copies, logs progress, computes Mean Average Precision.
            • +
            +
          • +
          • utils Tensor format utilities for data processing and categorizing.
              +
            • __init__.py Imports all util module functions.
            • +
            • util.py Data processing, categorizing, feature adjustment, Tensor format utilities.
            • +
            +
          • +
          +
        • +
        • TableTennis Table tennis apps for prediction and analysis.
            +
          • ActionRecognition
              +
            • README.md Table Tennis Action Recognition with VideoSwinTransformer
            • +
            +
          • +
          • datasets
          • +
          • extractor
          • +
          • fix_bad_label.py Fixes bad labels in Table Tennis application
          • +
          • get_instance_for_bmn.py Generates BMN model ground truth data for table tennis.
          • +
          • gts_format_transfer.py JSON file format transfer tool
          • +
          • predict Table Tennis Prediction Utilities
              +
            • action_detect Table Tennis Action Prediction & Detection Utilities
                +
              • action.py Baidu Cloud action detection Python script
              • +
              • logger.py Custom logger for action detection in Table Tennis app
              • +
              • mfcc Table Tennis Audio Analysis: VGGish Model & MFCC
                  +
                • feature_extractor.py Extracts audio features for Table Tennis prediction using VGG-16.
                • +
                • model_config.py Audio feature extraction for Table Tennis predictions
                • +
                • vgg_params.py Global VGGish model parameters defined, with customizable audio features extraction.
                • +
                +
              • +
              • models Table Tennis Action Detection Models Directory
              • +
              • reader TableTennis Action Detection using LSTM, Attention Cluster, and NextVlad Models.
                  +
                • __init__.py Imports reader classes, registers alphabetically.
                • +
                • bmninf_reader.py BMNINFReader: TableTennis Action Detection Dataset.
                • +
                • feature_reader.py Table tennis action detection from YouTube-8M dataset using LSTM, attention cluster, and nextVlad models.
                • +
                • reader_utils.py ReaderZoo class for PaddleVideo TableTennis error handling and reader management.
                • +
                +
              • +
              • utils TableTennis action detection utilities for video processing
                  +
                • config_utils.py PaddleVideo TableTennis config parsing utils
                • +
                • preprocess.py Video, audio extraction and sorting utility.
                • +
                • process_result.py Calculates video results with suppression, removes overlaps, and processes video properties.
                • +
                +
              • +
              +
            • +
            • eval.py Optimizes the F1 score for table tennis action prediction (see the threshold-search sketch after this list).
            • +
            • predict.py Video prediction setup for TableTennis using PaddleVideo
            • +
            +
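The eval.py entry above tunes predictions for the best F1 score. A minimal illustration of that idea — scanning candidate score thresholds with plain NumPy; the function name and threshold grid are assumptions, not the script's actual interface:

```python
import numpy as np

def best_f1_threshold(scores, labels, thresholds=np.linspace(0.05, 0.95, 19)):
    """Scan candidate score thresholds and return the one with the best F1."""
    best_t, best_f1 = None, -1.0
    for t in thresholds:
        pred = scores >= t
        tp = np.sum(pred & (labels == 1))
        fp = np.sum(pred & (labels == 0))
        fn = np.sum(~pred & (labels == 1))
        precision = tp / (tp + fp + 1e-8)
        recall = tp / (tp + fn + 1e-8)
        f1 = 2 * precision * recall / (precision + recall + 1e-8)
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t, best_f1

# Example: classifier scores vs. binary ground-truth labels
scores = np.array([0.9, 0.2, 0.7, 0.4])
labels = np.array([1, 0, 1, 0])
print(best_f1_threshold(scores, labels))
```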
          • +
          • val_split.py Splits the table tennis JSON ground-truth annotations into training and validation sets
          • +
          +
        • +
        • VideoQualityAssessment Video quality assessment tools and apps using PaddlePaddle
            +
          • main.py Trains PaddleVideo models; supports distributed testing.
          • +
          • paddlevideo Video Quality Assessment and Optimization Library
              +
            • __init__.py PaddleVideo library license and imports.
            • +
            • loader Video dataset loader for PaddleVideo pipelines optimization.
                +
              • __init__.py Video dataset loading and processing module for PaddleVideo.
              • +
              • builder.py Builds video data pipelines and loaders with PaddleVideo and PaddlePaddle.
              • +
              • dataset Efficient video dataset loader for PaddleVideo.
                  +
                • __init__.py Video and frame datasets for PaddleVideo.
                • +
                • base.py Video dataset class for loading and preparing data.
                • +
                • frame_rec.py PaddleVideo's FrameRecDataset for raw frame loading and transformation.
                • +
                • video.py PaddleVideo: Efficient Video Dataset Loading and Processing
                • +
                +
              • +
              • pipelines Video quality assessment pipelines in PaddleVideo.
                  +
                • __init__.py PaddleVideo library functions for video analysis tasks.
                • +
                • augmentations.py Scaling and multi-cropping for PaddleVideo
                • +
                • compose.py Composes video pipeline steps, handles list inputs, and includes a workaround for older config formats.
                • +
                • decode.py PaddleVideo's VideoDecoder: Decodes MP4, RGB frames, audio, and masks.
                • +
                • mix.py Mixup operator for video quality assessment (see the sketch after this list).
                • +
                • sample.py Sampler class that samples frames from video files as PIL images.
                • +
                +
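As noted at the mix.py entry, Mixup blends each sample with a randomly chosen partner using a Beta-distributed weight. A small NumPy sketch of the operator (the batch layout and one-hot labels are assumptions, not the pipeline's exact signature):

```python
import numpy as np

def mixup(clips, labels, alpha=0.2):
    """Mix each clip/label with a randomly permuted partner.

    clips:  float array of shape (N, ...) holding a batch of videos
    labels: one-hot float array of shape (N, num_classes)
    """
    lam = np.random.beta(alpha, alpha)        # mixing coefficient
    perm = np.random.permutation(len(clips))  # partner indices
    mixed_clips = lam * clips + (1.0 - lam) * clips[perm]
    mixed_labels = lam * labels + (1.0 - lam) * labels[perm]
    return mixed_clips, mixed_labels
```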
              • +
              • registry.py Manages registries for pipelines and datasets in PaddleVideo.
              • +
              +
            • +
            • metrics Video quality metrics registry for PaddleVideo library.
                +
              • __init__.py Video Quality Assessment app initialization
              • +
              • base.py BaseMetric: Foundation for video quality metrics, subclasses required.
              • +
              • build.py Build and import metrics for PaddleVideo library.
              • +
              • quality_metric.py Calculates PLCC and SROCC using NumPy and scipy.stats (see the sketch after this list).
              • +
              • registry.py Video quality assessment metrics registry in PaddleVideo library.
              • +
              +
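As referenced at the quality_metric.py entry, PLCC and SROCC reduce to the Pearson and Spearman correlations between predicted and subjective (MOS) scores. A scipy-based sketch, not necessarily the module's exact code:

```python
import numpy as np
from scipy import stats

def plcc_srocc(pred_scores, mos_scores):
    """Pearson (PLCC) and Spearman rank (SROCC) correlation of predictions vs. MOS."""
    pred = np.asarray(pred_scores, dtype=np.float64)
    mos = np.asarray(mos_scores, dtype=np.float64)
    plcc, _ = stats.pearsonr(pred, mos)
    srocc, _ = stats.spearmanr(pred, mos)
    return plcc, srocc

print(plcc_srocc([3.1, 4.2, 2.0, 4.8], [3.0, 4.5, 2.2, 4.6]))
```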
            • +
            • modeling Video quality assessment models and registration.
                +
              • __init__.py Video quality assessment models registration and exportation.
              • +
              • backbones Video backbone models: ResNet, ResNetTweaksTSM. +
              • +
              • builder.py Model builder for computer vision components
              • +
              • framework PaddleVideo: BaseRecognizer, Recognizer2D, trains 2D models.
                  +
                • __init__.py PaddleVideo: BaseRecognizer & Recognizer2D classes defined.
                • +
                • recognizers Trains 2D models, PaddleVideo's Recognizer2D class.
                    +
                  • __init__.py Imports recognizer classes, adds them to __all__, and carries the license notice.
                  • +
                  • base.py Base class for model recognizers in PaddleVideo
                  • +
                  • recognizer2d.py Trains 2D models using PaddleVideo's Recognizer2D class
                  • +
                  +
                • +
                +
              • +
              • heads TSN-based video quality assessment modeling
                  +
                • __init__.py Head models for Video Quality Assessment in PaddleVideo.
                • +
                • base.py PaddleVideo BaseHead model for Video Quality Assessment
                • +
                • tsm_rec_head.py TSMRecHead: TSN-based head for video quality assessment
                • +
                • tsn_head.py TSN head for video quality assessment
                • +
                +
              • +
              • losses Loss functions for PaddleVideo VQA
                  +
                • __init__.py Implements loss functions for PaddleVideo.
                • +
                • base.py Base loss function for PaddleVideo, requires subclass for _forward method.
                • +
                • l1_loss.py L1 Loss: Image/Video Quality Assessment
                • +
                • smooth_l1_loss.py Smooth L1 loss function (see the sketch after this list)
                • +
                +
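For reference, the Smooth L1 loss listed above follows the standard piecewise definition — quadratic for small errors, linear for large ones. A plain NumPy illustration (not the module's actual implementation):

```python
import numpy as np

def smooth_l1(pred, target, beta=1.0):
    """Smooth L1: quadratic for |error| < beta, linear otherwise."""
    diff = np.abs(np.asarray(pred) - np.asarray(target))
    loss = np.where(diff < beta,
                    0.5 * diff ** 2 / beta,
                    diff - 0.5 * beta)
    return loss.mean()
```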
              • +
              • registry.py Registers various models in PaddleVideo's VQA app.
              • +
              • weight_init.py Custom weight initialization in PaddlePaddle
              • +
              +
            • +
            • solver Optimizer and learning-rate scheduler builders for Video Quality Assessment.
                +
              • __init__.py PaddleVideo: Building Video Quality Optimizer and LR.
              • +
              • custom_lr.py Learning rate schedulers for PaddleVideo optimization.
              • +
              • lr.py Creates learning rate scheduler for PaddleVideo
              • +
              • optimizer.py Constructs optimizer and learning rate scheduler for parameter optimization.
              • +
              +
            • +
            • tasks Train and assess video quality with PaddleVideo.
                +
              • __init__.py Import, define, train, test functions for PaddleVideo's Video Quality Assessment module.
              • +
              • test.py Tests Paddle model with parallel processing
              • +
              • train.py Train video quality model with PaddleVideo.
              • +
              +
            • +
            • utils Video utility tools for Paddle.
                +
              • __init__.py PaddleVideo utils module with Registry, build, and more.
              • +
              • build_utils.py Module Builder from Config
              • +
              • config.py Config and AttrDict functions for PaddleVideo.
              • +
              • dist_utils.py Dist utils for PaddleVideo's distributed video quality assessment.
              • +
              • logger.py Logger class for PaddleVideo's VQA app, enabling distributed logging.
              • +
              • precise_bn.py Precise batch normalization updates for PaddleVideo.
              • +
              • record.py Records metrics for Video Quality Assessment in PaddleVideo.
              • +
              • registry.py Registry: object registration and retrieval by name.
              • +
              • save_load.py Save, load functions for model weights using Paddle.
              • +
              +
            • +
            • version.py PaddleVideo library version "0.0.1" under Apache License 2.0.
            • +
            +
          • +
          • README.md Video quality assessment model using ppTSM network on KonVid-150k dataset.
          • +
          • run.sh TSM model training and testing script
          • +
          • save_model.sh Saves best model from TSM_pptsm.yaml with 32 segments.
          • +
          • setup.py Video understanding with PaddlePaddle toolkits setup.
          • +
          +
        • +
        • VideoTag Deep learning video tagging and customization app.
            +
          • eval.py Evaluation script: sets up the environment and runs model tests.
          • +
          • FineTune.md Fine-tune VideoTag model with custom data, AttentionLSTM, TSN
          • +
          • metrics Video and YouTube-8M metrics evaluation. +
          • +
          • models Various deep learning models for video tagging
              +
            • __init__.py Registered models AttentionLSTM and TSN for easy retrieval.
            • +
            • attention_lstm LSTM Attention Video Tagging Models +
            • +
            • model.py VideoTag app's Python model module for Paddle.
            • +
          • tsn TSN ResNet model subdirectory for PaddlePaddle
                +
              • __init__.py Imports all "tsn" subdirectory components.
              • +
              • tsn.py TSN model class with segmentation and training parameters.
              • +
              • tsn_res_model.py TSN ResNet model for PaddlePaddle
              • +
              +
            • +
            • utils.py Decompress, download functions and AttrDict class.
            • +
            +
          • +
          • predict.py Predicts video tags using PaddleVideo's models.
          • +
          • reader VideoTag LSTM DataReader: Efficient Video Dataset Readers +
          • +
          • README.md VideoTag: Large-scale video classification via image modeling and sequence learning.
          • +
          • Run.md VideoTag app installation guide and usage.
          • +
          • Test.md Test VideoTag model on custom data using Python script.
          • +
          • train.py Video tagging model training in Python
          • +
          • tsn_extractor.py Inference script that runs the TSN model to extract features from videos.
          • +
          • utils VideoTag utility directory, PaddlePaddle 1.6.0, GPU config and training support.
              +
            • config_utils.py Config utils for PaddleVideo's VideoTag app
            • +
            • train_utils.py PaddlePaddle training utility with dataloader.
            • +
            • utility.py Ensures PaddlePaddle version 1.6.0 installed, handles GPU usage.
            • +
            +
          • +
          • videotag_test.py Video tagging model performance testing with PaddlePaddle and PaddleVideo.
          • +
          +
        • +
        +
      • +
      • benchmark
          +
        • TimeSformer TimeSformer video classification benchmark directory
            +
          • README.md Run benchmark script for TimeSformer model in PaddleVideo.
          • +
          • run_all.sh Benchmark TimeSformer on PaddleVideo with UCF101 dataset.
          • +
          • run_benchmark.sh TimeSformer video classification benchmark tests
          • +
          +
        • +
        +
      • +
      • data Dataset storage and preparation directory. +
      • +
      • deploy Deployment tools: C++ inference, serving, ONNX conversion, and model compression.
          +
        • cpp_infer Dockerized C++ PaddleVideo AI inference tools
            +
          • external-cmake
              +
            • auto-log.cmake Autolog external project finder and inclusion with CMake
            • +
            +
          • +
          • include PaddleVideo C++ pre/post processing, OpenCV integration. +
          • +
          • readme.md C++ deployment guide for PaddleVideo models; notes a required libcudnn.so fix.
          • +
          • readme_en.md Deploy PaddleVideo models on Linux with Docker support.
          • +
          • src C++ inference sources: OpenCV frame processing, Softmax postprocessing, GPU optimization.
              +
            • main.cpp Process video frames using OpenCV and PaddleVideo.
            • +
            • postprocess_op.cpp Softmax postprocessing for PaddleVideo.
            • +
            • preprocess_op.cpp Normalizes and scales images for inference in PaddleVideo library.
            • +
            • utility.cpp Utility functions: ReadDict, frame capture, and conversion.
            • +
            • video_rec.cpp Video AI inference, processing time measurement, GPU optimized.
            • +
            +
          • +
          • tools
              +
            • build.sh Setup and compile script for OpenCV, PaddlePaddle, CUDA, cuDNN, TensorRT.
            • +
            +
          • +
          +
        • +
        • cpp_serving C++ serving deployment for PaddleVideo models.
            +
          • paddle_env_install.sh Setup PaddleVideo C++ serving environment with TensorRT.
          • +
          • preprocess_ops.py Composes image processing steps, preprocesses video frames.
          • +
          • readme.md Deploy Paddle Serving in Docker with GPU/CPU options.
          • +
          • readme_en.md Paddle Serving installation guide with Docker, Linux, and GPU support.
          • +
          • run_cpp_serving.sh Runs PaddleVideo server with PP-TSM/TSN models on different ports.
          • +
          • serving_client.py Video client for Paddle Serving using PaddleVideo
          • +
          +
        • +
        • paddle2onnx Convert PaddlePaddle models to ONNX for efficient inference
            +
          • predict_onnx.py Performs video prediction with an ONNX predictor.
          • +
          • readme.md Converts PaddlePaddle to ONNX for inference
          • +
          • readme_en.md Convert Paddle2ONNX PP-TSN model for video prediction.
          • +
          +
        • +
        • python_serving Deploy Python models using PaddlePaddle. +
        • +
        • slim Compress PaddleVideo with quantization, pruning, distillation.
            +
          • quant_post_static.py Post-training static quantization for PaddleVideo models.
          • +
          • readme.md PaddleVideo model compression with PaddleSlim.
          • +
          • readme_en.md Model compression library for PaddleVideo with quantization, pruning, and distillation.
          • +
          +
        • +
        +
      • +
      • english_documents English-language documentation for PaddleVideo.
          +
        • benchmark.md PaddleVideo: Speed benchmark, Slowfast 2x faster, Action segmentation on Breakfast dataset.
        • +
        • dataset Dataset preparation docs for video, skeleton, and RGB-D datasets
            +
          • ActivityNet.md ActivityNet dataset prep for PaddleVideo
          • +
          • AVA.md AVA dataset: download, cut, frame extraction, organization.
          • +
          • fsd.md Figure Skating Dataset: 30 fps, OpenPose key points, train/test data available.
          • +
          • k400.md Download and extract Kinetics-400 dataset frames.
          • +
          • msrvtt.md MSR-VTT: 10K videos, ActBERT model, multi-modal transformers.
          • +
          • ntu-rgbd.md Prepares NTU RGB+D dataset for CTR-GCN
          • +
          • Oxford_RobotCar.md Oxford-RobotCar: Day-Night Depth Estimation Dataset
          • +
          • README.md Comprehensive action recognition datasets table.
          • +
          • SegmentationDataset.md Video action segmentation datasets (Breakfast, 50Salads, GTEA) with features from pre-trained models.
          • +
          • ucf101.md UCF101 dataset organization tool
          • +
          • ucf24.md UCF24 dataset preparation with PaddleVideo
          • +
          • youtube8m.md Massive YouTube video classification dataset.
          • +
          +
        • +
        • install.md Install PaddlePaddle and PaddleVideo, requirements, setup, usage.
        • +
        • model_zoo Model zoo docs: recognition, localization, segmentation, estimation, and multimodal models.
            +
          • detection +
          • +
          • estimation
              +
            • adds.md Estimates depth using day/night images with ADDS-DepthNet code.
            • +
            +
          • +
          • localization Temporal action localization models: BMN and YOWO.
              +
            • bmn.md BMN model: temporal action proposal generation with commands.
            • +
            • yowo.md YOWO: Single-stage, channel fusion, attention, UCF101-24 pre-trained.
            • +
            +
          • +
          • multimodal
              +
            • actbert.md ActBERT: Multimodal, global action, TNT block, SOTA video-language.
            • +
            +
          • +
          • partition
              +
            • transnetv2.md TransNetV2: Deep Learning Video Segmentation Model
            • +
            +
          • +
          • README.md Model Zoo: Comprehensive Action Recognition & Segmentation Models.
          • +
          • recognition Advanced video/motion recognition models.
              +
            • agcn.md AGCN: Improved ST-GCN for Video Recognition
            • +
            • agcn2s.md AGCN2s: Enhanced ST-GCN for motion recognition.
            • +
            • attention_lstm.md AttentionLSTM: LSTMs, Attention layer, YouTube-8M, PaddleVideo
            • +
            • ctrgcn.md PaddlePaddle, CTR-GCN, NTU-RGB+D, bone-based behavior recognition, 99.9988% accuracy
            • +
            • movinet.md MoViNet: Lightweight Video Recognition Model
            • +
            • posec3d.md Trains PoseC3D model, tests without GPU.
            • +
            • pp-timesformer.md PP-TimeSformer: Enhanced TimeSformer for video recognition, multi-GPU support.
            • +
            • pp-tsm.md PP-TSM: Action Recognition, UCF101/Kinetics, PaddlePaddle, ResNet101
            • +
            • pp-tsn.md Enhanced TSN model for video recognition
            • +
            • slowfast.md SlowFast model: Video Recognition with Multigrid Training.
            • +
            • stgcn.md ST-GCN model training, testing, and inference.
            • +
            • timesformer.md TimeSformer: Efficient Video Classifier
            • +
            • tokenshift_transformer.md Token Shift Transformer: High Accuracy Video Classification
            • +
            • tsm.md Trains TSM model on UCF-101, Kinetics-400 datasets. ResNet-50, PaddlePaddle, AMP, Momentum optimization, L2_Decay, three sampling methods.
            • +
            • tsn.md TSN: 2D-CNN video classification with ResNet-50 and Kinetics-400.
            • +
            • tsn_dali.md Improve TSN training speed with DALI for action recognition.
            • +
            • videoswin.md SOTA Video-Swin-Transformer model with multi-scale, efficient attention.
            • +
            +
          • +
          • segmentation Video segmentation model zoo with PaddlePaddle and CFBI improvements.
              +
            • asrf.md Improved video action segmentation model using PaddlePaddle.
            • +
            • cfbi.md CFBI video segmentation model, ECCV 2020.
            • +
            • mstcn.md Trains, evaluates, and compares MS-TCN video action segmentation models.
            • +
            +
          • +
          +
        • +
        • quick_start.md PaddleVideo: Install, Use, Action Recognition
        • +
        • tools.md Tools for PaddleVideo: model parameters, FLOPs calculation, and exported model testing. Python 3.7 required.
        • +
        • tutorials Tutorials on PaddleVideo usage and video recognition. +
        • +
        • usage.md PaddleVideo Linux setup, multi-card training, log format, fine-tuning, benchmarking
        • +
        +
      • +
      • main.py Parallelized PaddleVideo model training with distributed environments.
      • +
      • MANIFEST.in Includes essential files for PaddleVideo package distribution.
      • +
      • paddlevideo Core library: loaders, modeling, solvers, tasks, metrics, and utilities.
          +
        • __init__.py PaddleVideo library initialization.
        • +
        • loader Data loading for PaddleVideo: datasets, pipelines, and a DALI loader.
            +
          • __init__.py PaddleVideo loader/init functions for datasets, dataloaders, pipelines.
          • +
          • builder.py Constructs PaddleVideo pipeline, builds loader for distributed training.
          • +
          • dali_loader.py Dali-powered Paddle video loader
          • +
          • dataset Raw video dataset loader for PaddlePaddle.
              +
            • __init__.py Imports PaddleVideo dataset classes for video understanding.
            • +
            • actbert_dataset.py ActBERT dataset setup for PaddlePaddle video processing
            • +
            • asrf_dataset.py Action segmentation dataset loader for PaddleVideo
            • +
            • ava_dataset.py AVA Dataset Class for PaddleVideo
            • +
            • base.py BaseDataset class for PaddlePaddle with loading, preparing, retrieving. Supports list results.
            • +
            • bmn_dataset.py BMNDataset: Action localization video datasets loader.
            • +
            • davis_dataset.py VOS test class for the DAVIS 2017 dataset in PaddleVideo.
            • +
            • feature.py PaddleVideo library's FeatureDataset initialization and methods.
            • +
            • frame.py PaddleVideo library: load, transform, process video data.
            • +
            • MRI.py MRI dataset loader for action recognition
            • +
            • MRI_SlowFast.py MRI_SlowFast: Imports, classes, processes video data for training/validation.
            • +
            • ms_tcn_dataset.py MS-TCN dataset class and loader
            • +
            • msrvtt.py Prepares MSRVTT dataset for training/testing.
            • +
            • oxford.py Oxford dataset class for PaddleVideo.
            • +
            • skeleton.py Skeleton Dataset Loader for Action Recognition
            • +
            • slowfast_video.py Video dataset for action recognition using PaddleVideo's SFVideoDataset.
            • +
            • ucf101_skeleton.py UCF101 Skeleton Dataset PaddleVideo Class
            • +
            • ucf24_dataset.py Ucf24Dataset: PaddleVideo's UCF24 dataset loader.
            • +
            • video.py VideoDataset: loads raw videos from an index file and handles corrupted samples.
            • +
            +
          • +
          • pipelines Multimodal video preprocessing pipelines built on PaddlePaddle and OpenCV. +
          • +
          • registry.py Custom pipeline and dataset registries for PaddleVideo.
          • +
          +
        • +
        • metrics Evaluation metrics for PaddleVideo models, e.g. on the UCF101 dataset. +
        • +
        • modeling Model components: backbones, heads, losses, and frameworks.
            +
          • __init__.py PaddleVideo model registry and functions initialization
          • +
          • assigners Box assigners for PaddleVideo models, e.g. the max-IoU assigner used for AVA. +
          • +
          • backbones Video backbone models for PaddlePaddle
              +
            • __init__.py Backbone models for video analysis in PaddleVideo.
            • +
            • actbert.py BERT embeddings for multimodal video action recognition with ACTBERT backbone
            • +
            • adds.py ADDS backbone with ResNet V1.5, DepthDecoder, and PoseDecoder for day/night depth estimation.
            • +
            • agcn.py AGCN: Adaptive GCN Backbone for Graph Convolution Tasks
            • +
            • agcn2s.py Temporal Convolutional Networks with GCN Units for NTURGB+D
            • +
            • asrf.py ASRF backbone for PaddleVideo tasks
            • +
            • bmn.py BMN backbone: mask generation and conv layers in PaddlePaddle
            • +
            • cfbi.py FPN-based CFBI model for feature extraction
            • +
            • ctrgcn.py CTRGCN backbone for video models with batch norm and NTUGraph
            • +
            • darknet.py Darknet backbone with ConvBNLayer, concatenation, convolutions.
            • +
            • deeplab.py Deeplab: PaddlePaddle Network for Semantic Segmentation
            • +
            • movinet.py MoViNet model with MobileNetV2 layers for video analysis
            • +
            • ms_tcn.py MSTCN backbone, SingleStageModel class, Kaiming initialization
            • +
            • pptsm_mv2.py MobileNetV2 backbone for PaddlePaddle
            • +
            • pptsm_mv3.py PPTSM-Mv3 Backbone for PaddleVideo
            • +
            • pptsm_v2.py Python module: PPTSM_v2 video backbone.
            • +
            • resnet.py ResNet backbone model with dynamic blocks
            • +
            • resnet3d.py 3D ResNet model for PaddleVideo
            • +
            • resnet3d_slowonly.py SlowOnly 3D ResNet backbone with Slowfast pathway
            • +
            • resnet_slowfast.py ResNetSlowFast for video recognition with slow-fast pathways.
            • +
            • resnet_slowfast_MRI.py Initialize ResNet SlowFast MRI model for video analysis.
            • +
            • resnet_tsm.py ResNet-TSM Backbone: Deprecated, Needs Updates
            • +
            • resnet_tsm_MRI.py ResNet-TSM MRI model in PaddleVideo.
            • +
            • resnet_tsn_MRI.py ResNet-TSN model, PaddlePaddle, weights init, results output
            • +
            • resnet_tweaks_tsm.py ResNet backbone with TSM temporal shift and training tweaks (see the temporal-shift sketch after this list)
            • +
            • resnet_tweaks_tsn.py ResNet TSN backbones in PaddleVideo library.
            • +
            • resnext101.py ResNeXt-101 PaddlePaddle model defined.
            • +
            • stgcn.py STGCN model for spatio-temporal data processing in Python
            • +
            • swin_transformer.py Swin Transformer backbone for image processing in PaddleVideo.
            • +
            • toshift_vit.py Vision Transformer Model with Token Shift
            • +
            • transnetv2.py OctConv3D 3D convolutional layer for shot transition detection in TransNetV2
            • +
            • vit.py VisionTransformer-based video processing in PaddleVideo.
            • +
            • vit_tweaks.py Vit_tweaks: VisionTransformer enhancements.
            • +
            • yowo.py YOWO: YOLO-style backbone for video classification
            • +
            +
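Several backbones above (resnet_tsm.py, resnet_tweaks_tsm.py, the pptsm_* variants) rely on TSM's temporal shift, which moves a small fraction of channels one step forward or backward along the time axis at no extra FLOPs. A NumPy sketch of the core operation; the tensor layout and shift fraction are assumptions, not the repository's exact code:

```python
import numpy as np

def temporal_shift(x, shift_div=8):
    """Shift 1/shift_div of channels to the previous frame and 1/shift_div to the next.

    x: array of shape (N, T, C, H, W) — batch, time, channels, height, width.
    """
    n, t, c, h, w = x.shape
    fold = c // shift_div
    out = np.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                   # shift towards earlier frames
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]   # shift towards later frames
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # remaining channels unchanged
    return out
```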
          • +
          • bbox_utils.py Bounding box utilities for YOLO (see the IoU sketch at the end of this modeling listing).
          • +
          • builder.py Builds model components (backbones, heads, losses, frameworks) from config.
          • +
          • framework Model frameworks: detectors, estimators, localizers, multimodal models, recognizers, and segmenters.
              +
            • __init__.py PaddleVideo framework: base classes for model modeling.
            • +
            • detectors Two-stage detectors in PaddleVideo Python implementation
                +
              • __init__.py PaddleVideo detectors: Base, FastRCNN, TwoStage.
              • +
              • base.py BaseDetector: Parent for detectors, common features and abstract train_step.
              • +
              • fast_rcnn.py FastRCNN: Two-stage object detection.
              • +
              • two_stage.py Two-stage detector SlowFast model Python implementation.
              • +
              +
            • +
            • estimators PaddleVideo: Estimators for base, depth, imports, and classes.
                +
              • __init__.py Imports, classes, DepthEstimator, BaseEstimator
              • +
              • base.py BaseEstimator class for PaddleVideo modeling
              • +
              • depth_estimator.py DepthEstimator: Depth estimation model with forward_net, loss.
              • +
              +
            • +
            • localizers Localizers for PaddleVideo post-processing
                +
              • __init__.py PaddleVideo localizers for video tasks in various techniques.
              • +
              • base.py Base class for PaddlePaddle localization models
              • +
              • bmn_localizer.py BMN Localizer Model for PaddleVideo
              • +
              • yowo_localizer.py YOWOLocalizer: NMS, matching, precision, recall, F-score.
              • +
              • yowo_utils.py YOLOv2 post-processing with PaddlePaddle
              • +
              +
            • +
            • multimodal Multimodal PaddleVideo framework directory
                +
              • __init__.py Multimodal Model Framework for PaddleVideo
              • +
              • actbert.py Multimodal ActBert model for text, video, action prediction.
              • +
              • base.py Multimodal model base class for PaddleVideo.
              • +
              +
            • +
            • partitioners PaddleVideo partitioners for modeling framework +
            • +
            • recognizers PaddleVideo recognizers: GCN, MoViNet, Transformers +
            • +
            • segment Video object segmentation framework with instance-level attention.
                +
              • __init__.py Segment models in PaddleVideo, BaseSegment and CFBI.
              • +
              • base.py Semi-Video Object Segmentation abstract base class
              • +
              • cfbi.py CFBI model for image segmentation and video processing with instance-level attention.
              • +
              • utils.py PaddleVideo segment matching, ASPP models, padding, distances, data prep.
              • +
              +
            • +
            • segmenters PaddleVideo segmenters framework and models with Gaussian smoothing, Kaiming Uniform.
                +
              • __init__.py PaddleVideo segmenter module with 3 classes for feature extraction, segmentation, and separation.
              • +
              • asrf.py ASRF segmenter: ASRF backbone, head network, validation.
              • +
              • base.py BaseSegmenter: Foundation for PaddleVideo segmenters with train, valid, test, and inference.
              • +
              • ms_tcn.py MS-TCN video segmenter model
              • +
              • utils.py Gaussian smoothing, Kaiming Uniform for layer init.
              • +
              +
            • +
            +
          • +
          • heads Video Modeling Heads: Efficient ROI Alignment +
          • +
          • losses PaddleVideo Losses: Various Tasks, Optimized Functions +
          • +
          • registry.py Model registration for efficient organization.
          • +
          • samplers RandomSampler for balanced bbox sampling. +
          • +
          • weight_init.py Initialize layer weights in PaddlePaddle.
          • +
          +
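As referenced at the bbox_utils.py entry in this modeling listing, a typical YOLO-style box utility is pairwise IoU. A small NumPy sketch assuming [x1, y1, x2, y2] boxes (the box format and function name are assumptions):

```python
import numpy as np

def pairwise_iou(boxes_a, boxes_b):
    """Pairwise IoU between two sets of [x1, y1, x2, y2] boxes; returns an (A, B) matrix."""
    a = np.asarray(boxes_a, dtype=np.float64)[:, None, :]   # (A, 1, 4)
    b = np.asarray(boxes_b, dtype=np.float64)[None, :, :]   # (1, B, 4)
    inter_wh = np.clip(np.minimum(a[..., 2:], b[..., 2:]) -
                       np.maximum(a[..., :2], b[..., :2]), 0, None)
    inter = inter_wh[..., 0] * inter_wh[..., 1]
    area_a = (a[..., 2] - a[..., 0]) * (a[..., 3] - a[..., 1])
    area_b = (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])
    return inter / (area_a + area_b - inter + 1e-9)
```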
        • +
        • solver Optimizer and learning rate schedulers for PaddleVideo.
            +
          • __init__.py Imports optimizer and learning rate functions under Apache 2.0 license.
          • +
          • custom_lr.py Custom warmup and decay learning-rate schedulers for PaddleVideo (see the sketch after this list).
          • +
          • lr.py Learning rate scheduler construction.
          • +
          • optimizer.py Initializes optimizer configurations, handles decay and creates optimizer.
          • +
          +
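The custom_lr.py entry above covers warmup and decay schedules. The widely used linear-warmup + cosine-decay shape can be written in a few lines; this is a pure-Python sketch of the formula only, not the scheduler classes' actual API:

```python
import math

def warmup_cosine_lr(step, base_lr=0.01, warmup_steps=500, total_steps=10000):
    """Linear warmup to base_lr, then cosine decay towards zero."""
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * progress))

# Example: inspect a few points of the schedule
for s in (0, 250, 500, 5000, 9999):
    print(s, round(warmup_cosine_lr(s), 5))
```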
        • +
        • tasks Trains PaddleVideo models with various tasks and configurations.
            +
          • __init__.py PaddleVideo tasks module initialization file
          • +
          • test.py Test PaddlePaddle models in parallel
          • +
          • train.py Train PaddlePaddle's Distributed Model with AMP, Log, Evaluate, and Save.
          • +
          • train_dali.py Trains DALI TSN model with optimization steps, logs metrics.
          • +
          • train_multigrid.py Trains PaddleVideo models with multigrid configuration.
          • +
          +
        • +
        • utils PaddleVideo ML & video utilities.
            +
          • __init__.py PaddleVideo utils initialization.
          • +
          • build_utils.py Build utility function for object creation from config and registry.
          • +
          • config.py Configure and load YAML files with AttrDict, logger, visualization support.
          • +
          • dist_utils.py Distributed computing utilities for PaddleVideo.
          • +
          • logger.py PaddleVideo's colorful logger setup with non-propagation.
          • +
          • multigrid Multigrid utility functions for Paddle Video. +
          • +
          • precise_bn.py Precise Batch Normalization for faster training
          • +
          • profiler.py PaddleVideo Profiler: Performance Analysis and Optimization Tool
          • +
          • record.py Records metrics, calculates means, logs progress in training processes.
          • +
          • registry.py Registry mapping names to objects, with registration and retrieval methods (see the sketch after this list).
          • +
          • save_load.py Save/load functions for PaddlePaddle models.
          • +
          +
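registry.py and build_utils.py above implement the name-to-class registry and the build helper used throughout PaddleVideo. A condensed sketch of that pattern, simplified from the registry excerpt that appears later in this diff (the build function shown here is illustrative, not the repository's exact signature):

```python
class Registry:
    """Maps a string name to a registered class so modules can be built from config."""

    def __init__(self, name):
        self._name = name
        self._obj_map = {}

    def register(self, obj=None, name=None):
        # usable as a plain call or as a class decorator: @BACKBONES.register()
        if obj is None:
            return lambda cls: self.register(cls, name)
        self._obj_map[name or obj.__name__] = obj
        return obj

    def get(self, name):
        return self._obj_map[name]


def build(cfg, registry):
    """Instantiate a registered class from a config dict like {'name': 'ResNet', 'depth': 50}."""
    cfg = dict(cfg)
    cls = registry.get(cfg.pop('name'))
    return cls(**cfg)


# Example usage
BACKBONES = Registry('backbone')

@BACKBONES.register()
class ResNet:
    def __init__(self, depth=50):
        self.depth = depth

model = build({'name': 'ResNet', 'depth': 50}, BACKBONES)
```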
        • +
        • version.py PaddleVideo version 0.0.1, Apache License 2.0
        • +
        +
      • +
      • README.md Advanced video processing library with deployment support
      • +
      • README_en.md Deep learning library for video processing
      • +
      • run.sh Trains deep learning models for computer vision tasks using PaddlePaddle.
      • +
      • setup.py PaddleVideo setup: PaddlePaddle-powered video tool, Python 3.7.
      • +
      • test_tipc TIPC test suite for PaddleVideo with GPU/CPU benchmarking. +
      • +
      • tools Video Modeling Tools
          +
        • __init__.py Imports and defines package contents.
        • +
        • ava_predict.py AVA model prediction with PaddleVideo and OpenCV.
        • +
        • export_model.py Export PaddleVideo model functions.
        • +
        • predict.py PaddleVideo prediction tool with TensorRT and YOWO support
        • +
        • summary.py PaddleVideo model construction and evaluation.
        • +
        • utils.py PaddleVideo tools for inference, detection, and pose estimation
        • +
        • wheel.py Downloads and initializes models, then iterates over results for video label prediction.
        • +
        +
      • +
      +
    • +
    +
    + + + + \ No newline at end of file diff --git a/docs/zh-CN/annotation_tools.md b/docs/zh-CN/annotation_tools.md deleted file mode 100644 index 00374bdce..000000000 --- a/docs/zh-CN/annotation_tools.md +++ /dev/null @@ -1,23 +0,0 @@ -# 视频标注工具-BILS - -BILS (Baidu Intelligent Labeling System) 是一款支持时间轴打标签的视频标注软件,可被用于视频事件定位 、短视频分类等任务的标注工作。用户界面简约,操作简单、易上手。 - -## 下载链接 - -mac端: [dmg包下载](https://videotag.bj.bcebos.com/Annotation-tools/4.11-EIVideo-0.0.0.dmg) - -windows端: [exe文件下载](https://videotag.bj.bcebos.com/Annotation-tools/EIVideo-Setup-0.0.0.exe) - -使用教学视频: [视频下载](https://videotag.bj.bcebos.com/Annotation-tools/4.11-%E4%BA%A7%E5%93%81%E8%AF%B4%E6%98%8E.mp4) - -## 百度网盘下载 - -链接: https://pan.baidu.com/s/1EmDaLwZstczICF8oeNJ0pg - -提取码: ea9v - -## 源码链接 - -软件源码参考[BILS](applications/BILS),欢迎共建~ - - diff --git a/docs/zh-CN/benchmark.md b/docs/zh-CN/benchmark.md deleted file mode 100644 index 7f7bd56c6..000000000 --- a/docs/zh-CN/benchmark.md +++ /dev/null @@ -1,281 +0,0 @@ -简体中文 | [English](../en/benchmark.md) - -# Benchmark - - -本文档给出了PaddleVideo系列模型在各平台预测耗时benchmark。 - ---- - -## 目录 - -- [1. 视频分类模型](#1) - - [1.1 测试数据](#11) - - [1.2 测试环境](#12) - - [1.3 测试结果](#13) - - [1.3.1 GPU推理速度一览](#131) - - [1.3.2 CPU推理速度一览](#132) - - [1.4 测试方法](#14) - - [1.4.1 单个模型测试](#141) - - [1.4.2 批量测试](#141) -- [2. 时序分割模型](#2) - - [2.1 测试环境](#21) - - [2.2 测试结果](#2) - - -## 1. 视频分类模型 - -### 1.1 测试数据 - -我们从Kinetics-400数据集中,随机选择提供100条用于benchmark时间测试,测试数据可以[点击](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/time-test.tar)下载。 - -解压后文件目录: -```txt -time-test -├── data # 测试视频文件 -└── file.list # 文件列表 -``` - -视频属性如下: - -```txt -mean video time: 9.67s -mean video width: 373 -mean video height: 256 -mean fps: 25 -``` - -### 1.2 测试环境 - -硬件环境: - -- CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz -- GPU: Tesla V100 16G - -软件环境: -- Python 3.7 -- PaddlePaddle 2.3.1 -- CUDA 10.2 -- CUDNN 8.1.1 -- 各python库版本参考[requirement.txt](../../requirements.txt) - -### 1.3 测试结果 - -#### 1.3.1 GPU推理速度一览 - -各模型性能数据按预测总时间排序,结果如下: - -|模型名称 | 骨干网络 | 配置文件 | 精度% | 预处理时间ms | 模型推理时间ms | 预测总时间ms | -| :---- | :---- | :----: |:----: |:----: |:----: |:----: | -| PP-TSM | MobileNetV2 | [pptsm_mv2_k400_videos_uniform.yaml](../../configs/recognition/pptsm/pptsm_mv2_k400_videos_uniform.yaml) | 68.09 | 51.5 | 3.31 | 54.81 | -| PP-TSM | MobileNetV3 | [pptsm_mv3_k400_frames_uniform.yaml](../../configs/recognition/pptsm/pptsm_mv3_k400_frames_uniform.yaml) | 69.84 | 51 | 4.34 | 55.34 | -| **PP-TSMv2** | PP-LCNet_v2.8f | [pptsm_lcnet_k400_8frames_uniform.yaml](../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_8frames_uniform.yaml) | **72.45**| 55.31 | 4.37 | **59.68** | -| TSM | R50 | [tsm_k400_frames.yaml](../../configs/recognition/tsm/tsm_k400_frames.yaml) | 71.06 | 52.02 | 9.87 | 61.89 | -|**PP-TSM** | R50 | [pptsm_k400_frames_uniform.yaml](../../configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml) | **75.11** | 51.84 | 11.26 | **63.1** | -|PP-TSM | R101 | [pptsm_k400_frames_dense_r101.yaml](../../configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml) | 76.35| 52.1 | 17.91 | 70.01 | -| PP-TSMv2 | PP-LCNet_v2.16f | [pptsm_lcnet_k400_16frames_uniform.yaml](../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform.yaml) | 74.38 | 69.4 | 7.55 | 76.95 | -| SlowFast | 4*16 | [slowfast.yaml](../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 99.27 | 27.4 | 126.67 | -| *VideoSwin | B | [videoswin_k400_videos.yaml](../../configs/recognition/videoswin/videoswin_k400_videos.yaml) | 82.4 | 95.65 | 117.22 | 212.88 | -| MoViNet | A0 | 
[movinet_k400_frame.yaml](../../configs/recognition/movinet/movinet_k400_frame.yaml) | 66.62 | 150.36 | 47.24 | 197.60 | -| *PP-TimeSformer | base | [pptimesformer_k400_videos.yaml](../../configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml) | 78.87 | 299.48 | 133.41 | 432.90 | -| *TimeSformer | base | [timesformer_k400_videos.yaml](../../configs/recognition/timesformer/timesformer_k400_videos.yaml) | 77.29 | 301.54 | 136.12 | 437.67 | -| TSN | R50 | [tsn_k400_frames.yaml](../../configs/recognition/tsn/tsn_k400_frames.yaml) | 69.81 | 794.30 | 168.70 | 963.00 | -| PP-TSN | R50 | [pptsn_k400_frames.yaml](../../configs/recognition/pptsn/pptsn_k400_frames.yaml) | 75.06 | 837.75 | 175.12 | 1012.87 | - -* 注:带`*`表示该模型未使用tensorRT进行预测加速。 - -- TSN预测时采用TenCrop,比TSM采用的CenterCrop更加耗时。TSN如果使用CenterCrop,则速度稍优于TSM,但精度会低3.5个点。 - -#### 1.3.2 CPU推理速度一览 - -各模型性能数据按预测总时间排序,结果如下: - -|模型名称 | 骨干网络 | 配置文件 | 精度% | 预处理时间ms | 模型推理时间ms | 预测总时间ms | -| :---- | :---- | :----: |:----: |:----: |:----: |:----: | -| PP-TSM | MobileNetV2 | [pptsm_mv2_k400_videos_uniform.yaml](../../configs/recognition/pptsm/pptsm_mv2_k400_videos_uniform.yaml) | 68.09 | 52.62 | 137.03 | 189.65 | -| PP-TSM | MobileNetV3 | [pptsm_mv3_k400_frames_uniform.yaml](../../configs/recognition/pptsm/pptsm_mv3_k400_frames_uniform.yaml) | 69.84| 53.44 | 139.13 | 192.58 | -| **PP-TSMv2** | PP-LCNet_v2.8f | [pptsm_lcnet_k400_8frames_uniform.yaml](../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_8frames_uniform.yaml) | **72.45**| 53.37 | 189.62 | **242.99** | -| **PP-TSMv2** | PP-LCNet_v2.16f | [pptsm_lcnet_k400_16frames_uniform.yaml](../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform.yaml) | **74.38**| 68.07 | 388.64 | **456.71** | -| SlowFast | 4*16 | [slowfast.yaml](../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 110.04 | 1201.36 | 1311.41 | -| TSM | R50 | [tsm_k400_frames.yaml](../../configs/recognition/tsm/tsm_k400_frames.yaml) | 71.06 | 52.47 | 1302.49 | 1354.96 | -|PP-TSM | R50 | [pptsm_k400_frames_uniform.yaml](../../configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml) | 75.11 | 52.26 | 1354.21 | 1406.48 | -|*MoViNet | A0 | [movinet_k400_frame.yaml](../../configs/recognition/movinet/movinet_k400_frame.yaml) | 66.62 | 148.30 | 1290.46 | 1438.76 | -|PP-TSM | R101 | [pptsm_k400_frames_dense_r101.yaml](../../configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml) | 76.35| 52.50 | 2236.94 | 2289.45 | -| PP-TimeSformer | base | [pptimesformer_k400_videos.yaml](../../configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml) | 78.87 | 294.89 | 13426.53 | 13721.43 | -| TimeSformer | base | [timesformer_k400_videos.yaml](../../configs/recognition/timesformer/timesformer_k400_videos.yaml) | 77.29 | 297.33 | 14034.77 | 14332.11 | -| TSN | R50 | [tsn_k400_frames.yaml](../../configs/recognition/tsn/tsn_k400_frames.yaml) | 69.81 | 860.41 | 18359.26 | 19219.68 | -| PP-TSN | R50 | [pptsn_k400_frames.yaml](../../configs/recognition/pptsn/pptsn_k400_frames.yaml) | 75.06 | 835.86 | 19778.60 | 20614.46 | -| *VideoSwin | B | [videoswin_k400_videos.yaml](../../configs/recognition/videoswin/videoswin_k400_videos.yaml) | 82.4 | 76.21 | 32983.49 | 33059.70 | - - -* 注: 带`*`表示该模型未使用mkldnn进行预测加速。 - - -### 1.4 测试方法 - -在进行测试之前,需要安装[requirements.txt](../../requirements.txt)相关依赖,并且还需安装`AutoLog`用于记录计算时间,使用如下命令安装: -```bash -python3.7 -m pip install --upgrade pip -pip3.7 install --upgrade -r requirements.txt -python3.7 -m pip install git+https://github.com/LDOUBLEV/AutoLog -``` - -#### 1.4.1 单个模型测试 - 
-以PP-TSM模型为例,请先参考[PP-TSM文档](./model_zoo/recognition/pp-tsm.md)导出推理模型,之后使用如下命令进行速度测试: - -```python -python3.7 tools/predict.py --input_file time-test/file.list \ - --time_test_file=True \ - --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \ - --model_file inference/ppTSM/ppTSM.pdmodel \ - --params_file inference/ppTSM/ppTSM.pdiparams \ - --use_gpu=False \ - --use_tensorrt=False \ - --enable_mkldnn=True \ - --enable_benchmark=True \ - --disable_glog True -``` - -- 各参数含义如下: - -```txt -input_file: 指定测试文件/文件列表, 示例使用1.1小节提供的测试数据 -time_test_file: 是否进行时间测试,请设为True -config: 指定模型配置文件 -model_file: 指定推理文件pdmodel路径 -params_file: 指定推理文件pdiparams路径 -use_gpu: 是否使用GPU预测, False则使用CPU预测 -use_tensorrt: 是否开启TensorRT预测 -enable_mkldnn: 开启benchmark时间测试,默认设为True -disable_glog: 是否关闭推理时的日志,请设为True -``` - -- 测试时,GPU推理使用FP32+TensorRT配置下,CPU使用mkldnn加速。运行100次,去除前3次的warmup时间,得到推理平均时间。 - -#### 1.4.2 批量测试 - -使用以下批量测试脚本,可以方便的将性能结果进行复现: - -- 1. 下载预训练模型: - -```bash -mkdir ckpt -cd ckpt -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_uniform_distill_r101.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_mv2_k400.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_mv3_k400.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PPTSMv2_k400_16f_dml.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f_distill.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.3/MoViNetA0_k400.pdparams -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_k400.pdparams -``` - -- 2. 
准备各模型配置参数列表文件`model.list` - -```txt -PP-TSM_R50 configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml ckpt/ppTSM_k400_uniform_distill.pdparams ppTSM -PP-TSM_R101 configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml ckpt/ppTSM_k400_uniform_distill_r101.pdparams ppTSM -PP-TSM_MobileNetV2 configs/recognition/pptsm/pptsm_mv2_k400_videos_uniform.yaml ckpt/ppTSM_mv2_k400.pdparams ppTSM -PP-TSM_MobileNetV3 configs/recognition/pptsm/pptsm_mv3_k400_frames_uniform.yaml ckpt/ppTSM_mv3_k400.pdparams ppTSM -PP-TSMv2_PP-LCNet_v2 configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml ckpt/PPTSMv2_k400_16f_dml.pdparams ppTSMv2 -PP-TSN_R50 configs/recognition/pptsn/pptsn_k400_frames.yaml ckpt/ppTSN_k400_8.pdparams ppTSN -PP-TimeSformer_base configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml ckpt/ppTimeSformer_k400_8f_distill.pdparams ppTimeSformer -TSM_R50 configs/recognition/tsm/tsm_k400_frames.yaml ckpt/TSM_k400.pdparams TSM -TSN_R50 configs/recognition/tsn/tsn_k400_frames.yaml ckpt/TSN_k400.pdparams TSN -TimeSformer_base configs/recognition/timesformer/timesformer_k400_videos.yaml ckpt/TimeSformer_k400.pdparams TimeSformer -SlowFast_416 configs/recognition/slowfast/slowfast.yaml ckpt/SlowFast.pdparams SlowFast -MoViNet_A0 configs/recognition/movinet/movinet_k400_frame.yaml ckpt/MoViNetA0_k400.pdparams MoViNet -VideoSwin_B configs/recognition/videoswin/videoswin_k400_videos.yaml ckpt/VideoSwin_k400.pdparams VideoSwin -``` - -- 3. 批量导出模型,执行时传入model.list文件 - -```bash -file=$1 - -while read line -do - arr=($line) - ModelName=${arr[0]} - ConfigFile=${arr[1]} - ParamsPath=${arr[2]} - echo $ModelName - - python3.7 tools/export_model.py -c $ConfigFile \ - -p $ParamsPath \ - -o inference/$ModelName -done <$file -``` - -- 4. 测试时间,执行时传入model.list文件 - -```bash -file=$1 - -while read line -do - arr=($line) - ModelName=${arr[0]} - ConfigFile=${arr[1]} - ParamsPath=${arr[2]} - Model=${arr[3]} - - python3.7 tools/predict.py --input_file ../../time-test/file.list \ - --time_test_file=True \ - --config $ConfigFile \ - --model_file inference/$ModelName/$Model.pdmodel \ - --params_file inference/$ModelName/$Model.pdiparams \ - --use_gpu=False \ - --use_tensorrt=False \ - --enable_mkldnn=False \ - --enable_benchmark=True \ - --disable_glog True - echo =====$ModelName END==== -done <$file -``` - ---- - -## 2. 
时序分割模型 - -### 2.1 测试环境 - -硬件环境: - -- 8 NVIDIA Tesla V100 (16G) GPUs -- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz - -软件环境: - -- Python 3.7 -- PaddlePaddle2.0 -- CUDA 10.1 -- CUDNN 7.6.3 -- NCCL 2.1.15 -- GCC 8.2.0 - -### 2.2 测试结果 - -本仓库提供经典和热门时序动作分割模型的性能和精度对比 - -| Model | Metrics | Value | Flops(M) |Params(M) | test time(ms) bs=1 | test time(ms) bs=2 | inference time(ms) bs=1 | inference time(ms) bs=2 | -| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | -| MS-TCN | F1@0.5 | 38.8% | 791.360 | 0.8 | 170 | - | 10.68 | - | -| ASRF | F1@0.5 | 55.7% | 1,283.328 | 1.3 | 190 | - | 16.34 | - | - -* 模型名称:填写模型的具体名字,比如PP-TSM -* Metrics:填写模型测试时所用的指标,使用的数据集为**breakfast** -* Value:填写Metrics指标对应的数值,一般保留小数点后两位 -* Flops(M):模型一次前向运算所需的浮点运算量,可以调用PaddleVideo/tools/summary.py脚本计算(不同模型可能需要稍作修改),保留小数点后一位,使用数据**输入形状为(1, 2048, 1000)的张量**测得 -* Params(M):模型参数量,和Flops一起会被脚本计算出来,保留小数点后一位 -* test time(ms) bs=1:python脚本开batchsize=1测试时,一个样本所需的耗时,保留小数点后两位。测试使用的数据集为**breakfast**。 -* test time(ms) bs=2:python脚本开batchsize=2测试时,一个样本所需的耗时,保留小数点后两位。时序动作分割模型一般是全卷积网络,所以训练、测试和推理的batch_size都是1。测试使用的数据集为**breakfast**。 -* inference time(ms) bs=1:推理模型用GPU(默认V100)开batchsize=1测试时,一个样本所需的耗时,保留小数点后两位。推理使用的数据集为**breakfast**。 -* inference time(ms) bs=2:推理模型用GPU(默认V100)开batchsize=1测试时,一个样本所需的耗时,保留小数点后两位。时序动作分割模型一般是全卷积网络,所以训练、测试和推理的batch_size都是1。推理使用的数据集为**breakfast**。 diff --git a/docs/zh-CN/competition.md b/docs/zh-CN/competition.md deleted file mode 100644 index 12bc99924..000000000 --- a/docs/zh-CN/competition.md +++ /dev/null @@ -1,23 +0,0 @@ -# 赛事支持 - -## 1. 基于飞桨实现花样滑冰选手骨骼点动作识别大赛 - -- [比赛链接](https://aistudio.baidu.com/aistudio/competition/detail/115/0/introduction) -- [AI Studio基线项目](https://aistudio.baidu.com/aistudio/projectdetail/2417717) -- [视频教程](https://www.bilibili.com/video/BV1w3411172G) -- **[冠军方案](https://aistudio.baidu.com/aistudio/projectdetail/2925777)** -- [第2名方案](https://aistudio.baidu.com/aistudio/projectdetail/3036788) -- [第3名方案](https://aistudio.baidu.com/aistudio/projectdetail/2929633) - - -## 2. 基于飞桨实现乒乓球时序动作定位大赛 - -- [比赛链接](https://aistudio.baidu.com/aistudio/competition/detail/127/0/introduction) -- **[冠军方案](https://aistudio.baidu.com/aistudio/projectdetail/3545680)** -- [冠军模型TCANet++](https://github.com/PaddlePaddle/PaddleVideo/pull/437) -- [第2名方案](https://aistudio.baidu.com/aistudio/projectdetail/3548768) -- [第3名方案](https://aistudio.baidu.com/aistudio/projectdetail/3435449) - -## 3. CCKS 2021:知识增强的视频语义理解 - -- [比赛链接](https://www.biendata.xyz/competition/ccks_2021_videounderstanding/) diff --git a/docs/zh-CN/contribute/README.md b/docs/zh-CN/contribute/README.md deleted file mode 100644 index c22ca32e4..000000000 --- a/docs/zh-CN/contribute/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# 代码贡献指南 - -- [1. 如何添加新算法](./add_new_algorithm.md) -- [2. 配置系统设计解析](./config.md) -- [3. 如何提pr](./how_to_contribute.md) diff --git a/docs/zh-CN/contribute/add_new_algorithm.md b/docs/zh-CN/contribute/add_new_algorithm.md deleted file mode 100644 index 25218b342..000000000 --- a/docs/zh-CN/contribute/add_new_algorithm.md +++ /dev/null @@ -1,414 +0,0 @@ -# 添加新算法 - -PaddleVideo将一个算法分解为以下几个部分,并对各部分进行模块化处理,方便快速组合出新的算法。 - -* [1. 数据加载和处理](#1) -* [2. 网络](#2) -* [3. 优化器](#3) -* [4. 训练策略](#4) -* [5. 指标评估](#5) - -示例代码如下: -```python -import numpy as np -import paddle -from paddle.io import Dataset, DataLoader -import paddle.nn as nn - -# 1. 
数据加载和处理 -## 1.2 数据预处理Pipeline -class ExamplePipeline(object): - """ Example Pipeline""" - def __init__(self, mean=0, std=1.0): - self.mean = mean - self.std = std - - def __call__(self, results): - data = results['data'] - norm_data = (data - self.mean) / self.std - results['data'] = norm_data - return results - -## 1.1 数据集类 -class ExampleDataset(Dataset): - """ExampleDataset""" - def __init__(self): - super(ExampleDataset, self).__init__() - self.x = np.random.rand(100, 20, 20) - self.y = np.random.randint(10, size = (100, 1)) - - def __getitem__(self, idx): - x_item = self.x[idx] - results = {} - results['data'] = x_item - pipeline = ExamplePipeline() - results = pipeline(results) - x_item = results['data'].astype('float32') - y_item = self.y[idx].astype('int64') - return x_item, y_item - - def __len__(self): - return self.x.shape[0] - -train_dataset = ExampleDataset() -## 1.3 封装为Dataloader对象 -train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True) - -# 2. 网络 -class ExampleModel(nn.Layer): - """Example Model""" - def __init__(self): - super(ExampleModel, self).__init__() - ## 2.1 网络Backbobe - self.layer1 = paddle.nn.Flatten(1, -1) - self.layer2 = paddle.nn.Linear(400, 512) - self.layer3 = paddle.nn.ReLU() - self.layer4 = paddle.nn.Dropout(0.2) - ## 2.2 网络Head - self.layer5 = paddle.nn.Linear(512, 10) - - def forward(self, x): - """ model forward""" - y = self.layer1(x) - y = self.layer2(y) - y = self.layer3(y) - y = self.layer4(y) - y = self.layer5(y) - return y - -model = ExampleModel() -model.train() - -# 3. 优化器 -optim = paddle.optimizer.Adam(parameters=model.parameters()) - -epochs = 5 -for epoch in range(epochs): - for batch_id, data in enumerate(train_loader()): - x_data = data[0] - y_data = data[1] - predicts = model(x_data) - - ## 2.3 网络Loss - loss = paddle.nn.functional.cross_entropy(predicts, y_data) - - acc = paddle.metric.accuracy(predicts, y_data) - - loss.backward() - print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy())) - - optim.step() - optim.clear_grad() -``` -上述代码的运行输出日志如下: -```txt -epoch: 0, batch_id: 0, loss is: [2.5613842], acc is: [0.] -epoch: 0, batch_id: 1, loss is: [2.5776138], acc is: [0.1] -epoch: 0, batch_id: 2, loss is: [2.551022], acc is: [0.1] -epoch: 0, batch_id: 3, loss is: [2.782001], acc is: [0.] -epoch: 0, batch_id: 4, loss is: [2.787499], acc is: [0.1] -``` -将以上代码集成进PaddleVideo的示例pr参考 [#257](https://github.com/PaddlePaddle/PaddleVideo/pull/257) - -下面将分别对每个部分进行介绍,并介绍如何在该部分里添加新算法所需模块。 - - - -## 1. 数据加载和处理 - -数据加载和处理部分由`Dataset类`、`预处理Pipeline`和`Dataloader对象`组成。`Dataset类`是数据集类,其中的`__getitem__`方法定义了每一个视频样本数据的处理方式。`预处理Pipeline`定义了数据预处理步骤,包括视频的读取,解码以及数据增强等操作。`预处理定义的Pipeline`通常在`Dataset类`的`__getitem__`方法中被调用,以完成对视频预处理操作。这一部分在[paddlevideo/loader](../../../paddlevideo/loader)下。 各个文件及文件夹作用说明如下: - -```txt -paddlevideo/loader/ -├── dataset -│ ├── base.py # Dataset基类 -│ ├── frame.py # 处理Frame格式输入的Dataset类 -│ └── video.py # 处理Video格式输入的Dataset类 -├── pipelines -│ ├── decode.py # 解码Pipeline,对视频进行解码 -│ ├── sample.py # 抽帧Pipeline,对视频抽帧的方式 -│ ├── augmentations.py # 数据增强Pipeline,包括缩放、裁剪、反转、正则化等 -... -``` - -PaddleVideo内置了针对不同数据集的Dataset相关模块,对于没有内置的模块可通过如下步骤添加: - -1. 在 [paddlevideo/loader/dataset](../../../paddlevideo/loader/dataset) 文件夹下新建文件,如my_dataset.py。 -2. 
在 my_dataset.py 文件内添加相关代码,示例代码如下: - -```python -@DATASETS.register() # 通过装饰器,自动进行注册 -class MyDataset: - def __init__(self, *args, **kwargs): - # your init code - pass - - def load_file(self): - info = [] - # load file list - return info - - def prepare_train(self, idx): - results = copy.deepcopy(self.info[idx]) - results = self.pipeline(results) # train pipeline - return results['image'], results['labels'] #return your data item - - def prepare_test(self, idx): - results = copy.deepcopy(self.info[idx]) - results = self.pipeline(results) # test pipeline - return results['image'], results['labels'] #return your data item -``` - -3. 在 [paddlevideo/loader/dataset/\_\_init\_\_.py](../../../paddlevideo/loader/dataset/__init__.py) 文件内导入添加的模块。 - -最后在config文件中指定Dataset类名即可使用。如: - -```yaml -# Define your Dataset name and args -DATASET: - batch_size: 16 # single-card bacth size - num_workers: 4 # the number of subprocess on each GPU. - train: - format: "FrameDataset" # Dataset class - data_prefix: "data/k400/rawframes" # train data root path - file_path: "data/k400/train_frames.list" # train data list file path - suffix: 'img_{:05}.jpg' - valid: - format: "FrameDataset" # Dataset class - data_prefix: "data/k400/rawframes" # valid data root path - file_path: "data/k400/train_frames.list" # valid data list file path - suffix: 'img_{:05}.jpg' - test: - format: "FrameDataset" # Dataset class - data_prefix: "data/k400/rawframes" # test data root path - file_path: "data/k400/train_frames.list" # test data list file path - suffix: 'img_{:05}.jpg' -``` - -- 关于模块注册机制的详细说明,可以参考[配置系统设计](./config.md) - -PaddleVideo内置了大量视频编解码及图像变换相关模块,对于没有内置的模块可通过如下步骤添加: - -1. 在 [paddlevideo/loader/pipelines](../../../paddlevideo/loader/pipelines) 文件夹下新建文件,如my_pipeline.py。 -2. 在 my_pipeline.py 文件内添加相关代码,示例代码如下: - -```python -@PIPELINES.register() # 通过装饰器,自动进行注册 -class MyPipeline: - def __init__(self, *args, **kwargs): - # your init code - pass - - def __call__(self, results): - img = results['image'] - label = results['label'] - # your process code - - results['image'] = img - results['label'] = label - return results -``` - -3. 在 [paddlevideo/loader/pipelines/\_\_init\_\_.py](../../../paddlevideo/loader/pipelines/__init__.py) 文件内导入添加的模块。 - -数据处理的所有处理步骤由不同的模块顺序执行而成,在config文件中按照列表的形式组合并执行。如: - -```yaml -# Define your pipeline name and args -PIPELINE: - train: - decode: - name: "FrameDecoder" # Pipeline Class name - sample: - name: "Sampler" # Pipeline Class name - num_seg: 8 # init args - seg_len: 1 # init args - valid_mode: False # init args - transform: - - Scale: # Pipeline Class name - short_size: 256 # init args -``` - - - -## 2. 
网络 - -网络部分完成了网络的组网操作,PaddleVideo将网络划分为四三部分,这一部分在[paddlevideo/modeling](../../../paddlevideo/modeling)下。 进入网络的数据将按照顺序(backbones->heads->loss)依次通过这三个部分。backbone用于特征提取,loss通过heads的[loss方法](https://github.com/PaddlePaddle/PaddleVideo/blob/5f7e22f406d11912eef511bafae28c594ccaa07e/paddlevideo/modeling/heads/base.py#L67)被调用。除了损失值,训练过程中如果想观察其它的精度指标(如top1, top5),也可以在head中定义相应的计算方法,参考[get_acc方法](https://github.com/PaddlePaddle/PaddleVideo/blob/5f7e22f406d11912eef511bafae28c594ccaa07e/paddlevideo/modeling/heads/base.py#L122),loss模块最终返回一个[loss字典](https://github.com/PaddlePaddle/PaddleVideo/blob/5f7e22f406d11912eef511bafae28c594ccaa07e/paddlevideo/modeling/heads/base.py#L81),存储loss值以及其它需要的精度指标。 - -```bash -├── framework # 组合backbones->heads->loss,定义从输入数据到输出loss的过程 -├── backbones # 网络的特征提取模块 -├── heads # 网络的输出模块 -└── losses # 网络的损失函数模块 -``` - -PaddleVideo内置了TSN、TSM、SlowFast、ST-GCN、BMN等算法相关的常用模块,对于没有内置的模块可通过如下步骤添加,四个部分添加步骤一致,以backbones为例: - -1. 在 [paddlevideo/modeling/backbones](../../../paddlevideo/modeling/backbones) 文件夹下新建文件,如my_backbone.py。 -2. 在 my_backbone.py 文件内添加相关代码,示例代码如下: - -```python -@BACKBONES.register() # 通过装饰器,自动进行注册 -class MyBackbone(nn.Layer): - def __init__(self, *args, **kwargs): - super(MyBackbone, self).__init__() - # your init code - self.conv = nn.xxxx - - def forward(self, inputs): - # your network forward - y = self.conv(inputs) - return y -``` - -3. 在 [paddlevideo/modeling/backbones/\_\_init\_\_.py](../../../paddlevideo/modeling/backbones/__init__.py)文件内导入添加的模块。 - -在完成网络的四部分模块添加之后,只需要配置文件中进行配置即可使用,如: - -```yaml -MODEL: - framework: "Recognizer2D" # Framework class name - backbone: - name: "ResNetTweaksTSM" # Backbone class name - depth: 50 # init args - head: - name: "ppTSMHead" # Heads class name - num_classes: 400 # init args - loss: - name: "MyLoss" # Losses class name - scale: 0.1 # init args -``` - - - -## 3. 优化器 - -优化器用于训练网络。优化器内部还包含了网络正则化和学习率衰减模块。 这一部分在[paddlevideo/solver/](../../../paddlevideo/solver/)下。 PaddleVideo内置了飞桨框架所有的[优化器模块](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/api/paddle/optimizer/Overview_cn.html#api)和[学习率衰减模块](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/api/paddle/optimizer/Overview_cn.html#about-lr)。只需要在配置文件中指定相应模块名称及参数即可方便的调用,示例: - -```yaml -OPTIMIZER: - name: 'Momentum' # Optimizer class name - momentum: 0.9 # init args - learning_rate: - name: 'PiecewiseDecay' # Learning rate scheduler class name - boundaries: [10, 20] # init args - values: [0.001, 0.0001, 0.00001] # init args -``` - -对于没有内置的模块可通过如下步骤添加,以`learning rate`为例: - -1. 在 [paddlevideo/solver/custom_lr.py](../../../paddlevideo/solver/custom_lr.py) 文件内创建自己的学习率调整策略,示例代码如下: - -```python -class MyLR(LRScheduler): - def __init__(self, *args, **kwargs): - self.learning_rate = learning_rate - - def step(self, epoch): - # learning rate step scheduler - self.last_lr = xxx - -``` - -在学习率模块添加之后,只需要配置文件中进行配置即可使用,如: - -```yaml -OPTIMIZER: - name: 'Momentum' - momentum: 0.9 - learning_rate: - iter_step: True - name: 'CustomWarmupCosineDecay' # LR class name - max_epoch: 80 # init args - warmup_epochs: 10 # init args -``` - - - -## 4. 
训练策略 - -PaddleVideo内置了很多模型训练相关trick,包括标签平滑、数据增强Mix-up、PreciseBN等,只需要在配置文件中指定相应模块名称及参数即可方便的调用,示例: - -```yaml - -MODEL: - framework: "Recognizer2D" - backbone: - name: "ResNetTweaksTSM" - head: - name: "ppTSMHead" - ls_eps: 0.1 # ls_eps字段添加label smooth,并指定平滑系数 - -MIX: - name: "Mixup" # 添加数据增强 Mix-up策略 - alpha: 0.2 # 指定mix系数 - -PRECISEBN: # 添加preciseBN策略 - preciseBN_interval: 5 # 指定prciseBN间隔 - num_iters_preciseBN: 200 # 指定preciseBN运行的batchs数量 - -``` - -训练相关的代码通过[paddlevideo/tasks/train.py](../../../paddlevideo/tasks/train.py)被组织起来,最终被[PaddleVideo/main.py](../../../../PaddleVideo/main.py)调用启动训练,单卡训练和多卡训练的启动方式略有不同。单卡训练启动方式如下: - -```bash -export CUDA_VISIBLE_DEVICES=0 #指定使用的GPU显卡id -python3.7 main.py --validate -c configs_path/your_config.yaml -``` -- `--validate` 参数指定训练时运行validation -- `-c` 参数指定配置文件路径 -- `-o`: 指定重写参数,例如: `-o DATASET.batch_size=16` 用于重写train时batch size大小 - -多卡训练通过paddle.distributed.launch启动,方式如下: -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml -``` -- `--gpus`参数指定使用的GPU显卡id -- `--log_dir`参数指定日志保存目录 -多卡训练详细说明可以参考[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/guides/02_paddle2.0_develop/06_device_cn.html#danjiduokaxunlian) - - - - -## 5. 指标评估 - -训练完成后,需要进行指标评估,paddlevideo将指标评估模块与训练模块解耦,通过在[PaddleVideo/main.py](../../../../PaddleVideo/main.py)运行时指定`--test`参数调用test模块进行指标评估,评估方法的实现主体在[paddlevideo/metrics/](../../../paddlevideo/metrics)下。 PaddleVideo内置了Uniform、Dense等相关的指标评估模块,对于没有内置的模块可通过如下步骤添加: - -1. 在 [paddlevideo/metrics/](../../../paddlevideo/metrics/) 文件夹下新建文件,如my_metric.py。 -2. 在 my_metric.py 文件内添加相关代码,示例代码如下: - -```python -@METRIC.register # 通过装饰器,自动进行注册 -class MyMetric(BaseMetric): - def __init__(self, *args, **kwargs): - self.top1 = [] - - def update(self, batch_id, data, outputs): - # update metrics during each iter - self.top1.append(xx) - - def accumulate(self): - # accumulate metrics when finished all iters. - xxx - print(np.mean(np.array(self.top1))) - -``` - -3. 在 [paddlevideo/metrics/\_\_init\_\_.py](../../../paddlevideo/metrics/__init__.py)文件内导入添加的模块。 - -在指标评估模块添加之后,只需要配置文件中进行配置即可使用,如: - -```yaml -METRIC: - name: 'CenterCropMetric' # Metric class name -``` - -模型测试运行方法如下: -```bash -python3.7 main.py --test -c config_path/your_config.yaml -w weight_path/your_weight.pdparams -``` -- `--test`参数指定运行测试模式 -- `-c`参数指定配置文件 -- `-w`参数指定训练好的权重保存路径 - diff --git a/docs/zh-CN/contribute/config.md b/docs/zh-CN/contribute/config.md deleted file mode 100644 index a38964c45..000000000 --- a/docs/zh-CN/contribute/config.md +++ /dev/null @@ -1,242 +0,0 @@ -简体中文 | [English](../../en/tutorials/config.md) - -# 配置系统设计 - ---- - -本文档将介绍PaddleVideo利用依赖注入技术实现控制反转,来对整个系统进行解耦,通过可自定义调整的配置文件来控制整个系统从而实现模块化。最后,介绍了配置文件和PaddleVideo运行时参数的含义。 - - -## 设计原则 - -首先,模型库中会有很多对一个类实例化的操作,例如: - -```python -class TSM(): - pass - -model = TSM(init_attributes) -``` -当越来越多的实例被创建,这种调用方法和被调用方法间的联系陡然上升,增加了整个系统的耦合性,对启用新功能建设,或是对已用功能扩展产生不便。 -当然我们可以建立一个工厂模式来解决这个问题,根据配置文件的指定输入,来统一的做条件判断: - -```python -if model_name == "TSM": - model = TSM() -elif model_name == "TSN": - model = TSN() -elif ... -``` -或是像如下代码片段 - -```python -optimizer_cfg = dict(name:"MOMENTUM", params: XXX) -if optimizer_cfg.name = "MOMENTUM": - optimizer = MOMENTUM(optimizer_cfg.pop(name)) -elif: - ... 
-``` - -可是,越来越多的条件判断被创建出来,还是没有统一彻底的解决这个问题。 -而在其他系统中被广泛利用的 控制反转/依赖注入 技术,PaddleVideo将其利用起来进行系统解耦,并应用到诸如 LOSS METRICS BACKBONE HEAD等场景中。 -PaddleVideo实现了两个组件用于完成控制反转/依赖注入: - -- Register, 注册器,用于注册一个模块组件 -- Builder, 用于建立(实例化)一个已注册的组件 - -1. Register 注册器 - -PaddleVideo实现了类似setter和getter方法 - -[source code](../../paddlevideo/utils/registry.py) - -```python -#excerpt from source code. -class Registry(): - def __init__(self, name): - self._name = name - self._obj_map = {} - - #mapping name -> object - def register(self, obj, name): - self._obj_map[name] = obj - - #get object - def get(self, name): - ret = self._obj_map.get(name) - return ret -``` - -用于建立字符串和对象的map,如下的代码将ResNet类注册到BACKBONE map中 - -```python - - BACKBONES = Registry('backbone') - class ResNet: - pass - BACKBONES.register(ResNet) -``` - -或是通过python3语法糖来装饰一个类 - -```python - BACKBONES = Registry('backbone') #new a Register - @BACKBONES.register() #regist resnet as a backbone. - class ResNet: - pass -``` - -2. Builder - -应用python的反射机制,调用get方法 得到一个已经注册的模块: -```python - # Usage: To build a module. - - backbone_name = "ResNet" - b = BACKBONES.get(backbone_name)() -``` - -至此,PaddleVideo注册了一个实例,不是在他的调用地方,而是在他的声明处,一个简单的IoC系统建立起来了。 -最后,PaddleVideo 通过这种方式建立了所有组件,并和配置文件参数一一对应。这里,一一对应的含义是:配置文件中的字段,`name` 代表着类的名字,其余字段对应着这个类的初始化参数。当然,除了`name` 我们也应用了别的名字来标记类名,例如:`framework` - -```yaml -head: - name: "TSMHead" # class name - num_classes: 400 # TSMHead class init attributes - ... -``` - ---- - -## 配置参数 - -配置文件中,有多组字段,如下 - -- **MODEL:** 代笔模型结构 -- **DATASET:** 数据集和dataloader配置 -- **PIPELINE:** 数据处理流程配置字段 -- **OPTIMIZER:** 优化器字段 - -和一些共有的参数, 如: - -- model_name -- log_interval -- epochs -- resume_epoch -- log_level -... - -## 模块概览 - - - - - - - - - - - - - - - - - - - - -
-- **Architectures**
-  - Recognition: TSN、TSM、SlowFast、PP-TSM、VideoTag、AttentionLSTM
-  - Localization: BMN
-- **Frameworks**
-  - Recognizer1D、Recognizer2D、Recognizer3D、Localizer
-- **Components**
-  - Backbone: resnet、resnet_tsm、resnet_tweaks_tsm、bmn
-  - Head: pptsm_head、tsm_head、tsn_head、bmn_head
-  - Solver: Optimizer(Momentum、RMSProp)、LearningRate(PiecewiseDecay)
-  - Loss: CrossEntropy、BMNLoss
-  - Metrics: CenterCrop、MultiCrop
-- **Data Augmentation**
-  - Video: Mixup、Cutmix
-  - Image: Scale、Random Flip、Jitter Scale、Crop、MultiCrop、Center Crop、MultiScaleCrop、Random Crop、PackOutput
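结合上文的 Registry/Builder 机制,以及“`name` 字段对应类名、其余字段对应初始化参数”的约定,下面给出一段极简的示意代码(并非 PaddleVideo 的实际实现,`build` 函数名及其细节均为示例假设),用于说明一个配置字典如何被转换成已注册模块的实例:

```python
# 示意代码:假设 BACKBONES 是按上文方式创建的 Registry,且 ResNet 已注册到其中
def build(cfg, registry):
    """根据配置字典实例化一个已注册的模块(示例实现)。

    cfg 形如 {"name": "ResNet", "depth": 50}:
    name 字段对应类名,其余字段作为该类的初始化参数。
    """
    cfg = dict(cfg)               # 拷贝一份,避免修改原始配置
    name = cfg.pop("name")        # 取出类名
    obj_cls = registry.get(name)  # 从注册器中查找对应的类
    if obj_cls is None:
        raise KeyError(f"{name} is not registered in registry '{registry._name}'")
    return obj_cls(**cfg)         # 剩余字段作为初始化参数传入

# 用法示意(ResNet 的 depth 参数仅为假设):
# backbone = build({"name": "ResNet", "depth": 50}, BACKBONES)
```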
    - ---- diff --git a/docs/zh-CN/contribute/how_to_contribute.md b/docs/zh-CN/contribute/how_to_contribute.md deleted file mode 100644 index 752d38a66..000000000 --- a/docs/zh-CN/contribute/how_to_contribute.md +++ /dev/null @@ -1,262 +0,0 @@ -# PaddleVideo 社区贡献指南 ---- - -## 目录 - -- [如何贡献代码](#1) - - [1.1 PaddleVideo 分支说明](#1.1) - - [1.2 PaddleVideo 代码提交流程与规范](#1.2) - - [1.2.1 fork 和 clone 代码](#1.2.1) - - [1.2.2 和远程仓库建立连接](#1.2.2) - - [1.2.3 创建本地分支](#1.2.3) - - [1.2.4 使用 pre-commit 勾子](#1.2.4) - - [1.2.5 修改与提交代码](#1.2.5) - - [1.2.6 保持本地仓库最新](#1.2.6) - - [1.2.7 push到远程仓库](#1.2.7) - - [1.2.8 提交Pull Request](#1.2.8) - - [1.2.9 签署 CLA 协议和通过单元测试](#1.2.9) - - [1.2.10 删除分支](#1.2.10) - - [1.2.11 提交代码的一些约定](#1.2.11) -- [总结](#2) -- [参考文献](#3) - - -## 一、如何贡献代码 - - -### 1.1 PaddleVideo 分支说明 - -PaddleVideo 未来将维护 2 种分支,分别为: - -* release/x.x.x 系列分支:为稳定的发行版本分支,会适时打 tag 发布版本,适配 Paddle 的 release 版本。当前最新的分支为 release/2.2.0 分支。随着版本迭代, release/x.x.x 系列分支会越来越多,默认维护最新版本的 release 分支,其他的分支不再维护。 -* develop 分支:为开发分支,也是默认分支,适配 Paddle 的 develop 版本,主要用于开发新功能。如果有同学需要进行二次开发,请选择 develop 分支。为了保证 develop 分支能在需要的时候拉出 release/x.x.x 分支, develop 分支的代码只能使用 Paddle 最新 release 分支中有效的 api 。也就是说,如果 Paddle develop 分支中开发了新的 api,但尚未出现在 release 分支代码中,那么请不要在 PaddleVideo 中使用。除此之外,对于不涉及 api 的性能优化、参数调整、策略更新等,都可以正常进行开发。 - -PaddleVideo 的历史分支,未来将不再维护。考虑到一些同学可能仍在使用,这些分支还会继续保留: - -* application 分支:这个分支主要存放应用案例相关代码,目前包括VideoTag和FootballAction,后续会将此分支代码迁移至develop分支,并随 release/x.x.x 发版。 - - -PaddleVideo 欢迎大家向 repo 中积极贡献代码,下面给出一些贡献代码的基本流程。 - - -### 1.2 PaddleVideo 代码提交流程与规范 - - -#### 1.2.1 fork 和 clone 代码 - -* 跳转到 [PaddleVideo GitHub首页](https://github.com/PaddlePaddle/PaddleVideo) ,然后单击 Fork 按钮,生成自己目录下的仓库,比如 `https://github.com/USERNAME/PaddleVideo` 。 - - -
    - - -* 将远程仓库 clone 到本地 - -```shell -# 拉取develop分支的代码 -git clone https://github.com/USERNAME/PaddleVideo.git -cd PaddleVideo -``` - -clone 的地址可以从下面获取 - -
    - - -#### 1.2.2 和远程仓库建立连接 - -首先通过 `git remote -v` 查看当前远程仓库的信息。 - -``` -origin https://github.com/USERNAME/PaddleVideo.git (fetch) -origin https://github.com/USERNAME/PaddleVideo.git (push) -``` - -上面的信息只包含了 clone 的远程仓库的信息,也就是自己用户名下的 PaddleVideo ,接下来我们创建一个原始 PaddleVideo 仓库的远程主机,命名为 upstream 。 - -```shell -git remote add upstream https://github.com/PaddlePaddle/PaddleVideo.git -``` - -使用 `git remote -v` 查看当前远程仓库的信息,输出如下,发现包括了 origin 和 upstream 2 个远程仓库。 - -``` -origin https://github.com/USERNAME/PaddleVideo.git (fetch) -origin https://github.com/USERNAME/PaddleVideo.git (push) -upstream https://github.com/PaddlePaddle/PaddleVideo.git (fetch) -upstream https://github.com/PaddlePaddle/PaddleVideo.git (push) -``` - -这主要是为了后续在提交 pull request (PR) 时,始终保持本地仓库最新。 - - -#### 1.2.3 创建本地分支 - -可以基于当前分支创建新的本地分支,命令如下。 - -```shell -git checkout -b new_branch -``` - -也可以基于远程或者上游的分支创建新的分支,命令如下。 - -```shell -# 基于用户远程仓库(origin)的develop创建new_branch分支 -git checkout -b new_branch origin/develop -# 基于上游远程仓库(upstream)的develop创建new_branch分支 -# 如果需要从upstream创建新的分支,需要首先使用git fetch upstream获取上游代码 -git checkout -b new_branch upstream/develop -``` - -最终会显示切换到新的分支,输出信息如下 - -``` -Branch new_branch set up to track remote branch develop from upstream. -Switched to a new branch 'new_branch' -``` - - -#### 1.2.4 使用 pre-commit 勾子 - -Paddle 开发人员使用 pre-commit 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。 - -pre-commit 测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 PaddleVideo ,首先安装并在当前目录运行它: - -```shell -pip install pre-commit -pre-commit install -``` - -* **注意** - -1. Paddle 使用 clang-format 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。 -2. 通过 `pip install pre-commit` 和 `conda install -c conda-forge pre-commit` 安装的 `yapf` 稍有不同的,PaddleVideo 开发人员使用的是 `pip install pre-commit` 。 - - -#### 1.2.5 修改与提交代码 - -可以通过 `git status` 查看改动的文件。 -对 PaddleVideo 的 `README.md` 做了一些修改,希望提交上去。则可以通过以下步骤 - -```shell -git add README.md -pre-commit -``` - -重复上述步骤,直到 pre-comit 格式检查不报错。如下所示。 - -
    - - -使用下面的命令完成提交。 - -```shell -git commit -m "your commit info" -``` - - -#### 1.2.6 保持本地仓库最新 - -获取 upstream 的最新代码并更新当前分支。这里的 upstream 来自于 1.2 节的`和远程仓库建立连接`部分。 - -```shell -git fetch upstream -# 如果是希望提交到其他分支,则需要从upstream的其他分支pull代码,这里是develop -git pull upstream develop -``` - - -#### 1.2.7 push到远程仓库 - -```shell -git push origin new_branch -``` - - -#### 1.2.8 提交Pull Request - -点击 new pull request,选择本地分支和目标分支,如下图所示。在 PR 的描述说明中,填写该 PR 所完成的功能。接下来等待 review ,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。 - -
    - - -#### 1.2.9 签署 CLA 协议和通过单元测试 - -* 签署 CLA -在首次向 PaddlePaddle 提交 Pull Request 时,您需要您签署一次 CLA (Contributor License Agreement) 协议,以保证您的代码可以被合入,具体签署方式如下: - -1. 请您查看 PR 中的 Check 部分,找到 license/cla ,并点击右侧 detail ,进入 CLA 网站 -2. 点击 CLA 网站中的 `Sign in with GitHub to agree` , 点击完成后将会跳转回您的 Pull Request 页面 - - -#### 1.2.10 删除分支 - -* 删除远程分支 - -在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。 - -也可以使用 `git push origin :分支名` 删除远程分支,如: - - -```shell -git push origin :new_branch -``` - -* 删除本地分支 - -```shell -# 切换到develop分支,否则无法删除当前分支 -git checkout develop - -# 删除new_branch分支 -git branch -D new_branch -``` - - -#### 1.2.11 提交代码的一些约定 - -为了使官方维护人员在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: - -1)请保证 Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,官方维护人员一般不做评审。 - -2)提交 Pull Request前: - -请注意 commit 的数量。 - -原因:如果仅仅修改一个文件但提交了十几个 commit ,每个 commit 只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个 commit 才能知道做了哪些修改,且不排除 commit 之间的修改存在相互覆盖的情况。 - -建议:每次提交时,保持尽量少的 commit ,可以通过 `git commit --amend` 补充上次的 commit 。对已经 Push 到远程仓库的多个 commit ,可以参考 [squash commits after push](https://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed) 。 - -请注意每个 commit 的名称:应能反映当前 commit 的内容,不能太随意。 - -3)如果解决了某个 Issue 的问题,请在该 Pull Request 的第一个评论框中加上: `fix #issue_number` ,这样当该 Pull Request 被合并后,会自动关闭对应的 Issue 。关键词包括: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved ,请选择合适的词汇。详细可参考 [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages) 。 - -此外,在回复评审人意见时,请您遵守以下约定: - -1)官方维护人员的每一个 review 意见都希望得到回复,这样会更好地提升开源社区的贡献。 - -- 对评审意见同意且按其修改完的,给个简单的 Done 即可; -- 对评审意见不同意的,请给出您自己的反驳理由。 - -2)如果评审意见比较多, - -- 请给出总体的修改情况。 -- 请采用 `start a review` 进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 - - -## 二、总结 - -* 开源社区依赖于众多开发者与用户的贡献和反馈,在这里感谢与期待大家向 PaddleVideo 提出宝贵的意见与 Pull Request ,希望我们可以一起打造一个领先实用全面的视频理解代码仓库! - - -## 三、参考文献 -1. [PaddlePaddle本地开发指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/10_contribution/local_dev_guide_cn.html) -2. [向开源框架提交pr的过程](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh_CN/advanced_tutorials/how_to_contribute.md) diff --git a/docs/zh-CN/dataset/AVA.md b/docs/zh-CN/dataset/AVA.md deleted file mode 100644 index a30f5d8c3..000000000 --- a/docs/zh-CN/dataset/AVA.md +++ /dev/null @@ -1,105 +0,0 @@ -[English](../../en/dataset/AVA.md) | 简体中文 -# AVA数据准备 -此文档主要介绍AVA数据集的相关准备流程。主要介绍 AVA数据集的视频文件下载,标注文件准备,视频文件切分 -视频文件提取帧数据,以及拉取提名文件等。在开始之前,请把当前工作目录设定在 `$PaddleVideo/data/ava/shell` - ---- - -## 1. 视频数据下载 -想要获取更多有关AVA数据集的信息,您可以访问其官方网站[AVA](https://research.google.com/ava/index.html). -至于数据集下载,您可以参看考[AVA Download](https://github.com/cvdfoundation/ava-dataset) ,该Repo详细介绍了AVA视频数据的下载方法. -我们也提供了视频文件的下载脚本: - -```shell -bash download_videos.sh -``` - -为了方便用户,我们将视频文件以zip包的形式上传到百度网盘,您可以直接进行下载 [Link]() coming soon. - - -**注意: 您自己下载的视频文件应当被放置在`data/ava/videos`文件夹下** - ---- -## 2.准备标注文件 - -接下来,您可以使用下面的脚本来准备标注文件 - -```shell -bash download_annotations.sh -``` - -该脚本会默认下载`ava_v2.1.zip`,如果您想下载`v2.2`,您可以使用: - -```shell -VERSION=2.2 bash download_annotations.sh -``` - -**注意:事实上,我们也同样在百度网盘中提供了该标注文件,所以您无需自己下载** - ---- -## 3. 切分视频文件 - -以帧率30fps,切分视频文件从第15分钟到第30分钟 - -```shell -bash cut_videos.sh -``` ---- - -## 4. 提取RGB帧 - -您可以通过以下的脚本使用`ffmpeg`来提取RGB帧. - -```shell -bash extract_rgb_frames.sh -``` - ---- - -## 5.拉取提名文件 - -这个脚本来自于Facbook研究院[Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks). 
-您可以使用如下的脚本来获取预计算的提名文件列表。 - -```shell -bash fetch_ava_proposals.sh -``` - ---- -## 6.目录结构 - -经过整个AVA数据处理流程后,您可以获得AVA的帧文件,视频文件和标注文件 - -整个项目(AVA)的目录结构如下所示: - -``` -PaddleVideo -├── configs -├── paddlevideo -├── docs -├── tools -├── data -│ ├── ava -│ │ ├── annotations -│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl -│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl -│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl -│ │ | ├── ava_train_v2.1.csv -│ │ | ├── ava_val_v2.1.csv -│ │ | ├── ava_train_excluded_timestamps_v2.1.csv -│ │ | ├── ava_val_excluded_timestamps_v2.1.csv -│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt -│ │ ├── videos -│ │ │ ├── 053oq2xB3oU.mkv -│ │ │ ├── 0f39OWEqJ24.mp4 -│ │ │ ├── ... -│ │ ├── videos_15min -│ │ │ ├── 053oq2xB3oU.mkv -│ │ │ ├── 0f39OWEqJ24.mp4 -│ │ │ ├── ... -│ │ ├── rawframes -│ │ │ ├── 053oq2xB3oU -| │ │ │ ├── img_00001.jpg -| │ │ │ ├── img_00002.jpg -| │ │ │ ├── ... -``` \ No newline at end of file diff --git a/docs/zh-CN/dataset/ActivityNet.md b/docs/zh-CN/dataset/ActivityNet.md deleted file mode 100644 index 68a7fd90b..000000000 --- a/docs/zh-CN/dataset/ActivityNet.md +++ /dev/null @@ -1,80 +0,0 @@ -[English](../../en/dataset/ActivityNet.md) | 简体中文 - -# ActivityNet数据准备 - -- [数据集介绍](#数据集介绍) -- [数据下载与处理](#数据下载与处理) - -## 数据集介绍 - -ActivityNet是一个用于大规模视频理解任务的数据集,可用于动作定位、动作识别等任务。 - - -## 数据下载与处理 -1. BMN模型使用的是处理过后的ActivityNet 1.3数据集,有如下两种使用方法: - - 使用我们处理好的ActivityNet 1.3数据集(压缩包约5.5G),每一个视频有对应的动作标签、持续区间、持续帧数、持续秒数等信息 - 使用以下命令下载: - ```bash - wget https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz # 下载处理好的视频特征数据 - wget https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json # 下载处理好的标签数据 - ``` - 或者点击以下超链接下载: - - [视频特征数据](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz) - [视频特征数据](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json) - - 然后解压下下好的视频特征压缩包 - ```bash - tar -xf bmn_feat.tar.gz - ``` - - - 自行提取特征 - - 首先参考[下载说明](https://github.com/activitynet/ActivityNet/tree/master/Crawler)下载原始数据集。在训练此模型时,需要先使用TSN对源文件抽取特征。可以[自行抽取](https://github.com/yjxiong/temporal-segment-networks)视频帧及光流信息,预训练好的TSN模型可从[此处](https://github.com/yjxiong/anet2016-cuhk)下载。 - - - `activitynet_1.3_annotations.json`标签文件内的信息如下所示: - ```json - { - "v_QOlSCBRmfWY": { - "duration_second": 82.73, - "subset": "training", - "duration_frame": 2067, - "annotations": [{ - "segment": [6.195294851794072, 77.73085420904837], - "label": "Ballet" - }], - "feature_frame": 2064 - }, - "v_ehGHCYKzyZ8": { - "duration_second": 61.718999999999994, - "subset": "training", - "duration_frame": 1822, - "annotations": [{ - "segment": [43.95990729267573, 45.401932082395355], - "label": "Doing crunches" - }], - "feature_frame": 1808 - }, - ..., - ... - } - ``` - 最终应该能得到`19228`个视频特征npy文件,对应`activitynet_1.3_annotations.json`文件中的`19228`个标签信息。 - -2. 新建`data/bmn_data`文件夹,再将下载完毕后将视频特征数据解压出来放入该文件夹下,最终应该组织成以下形式: - ``` - PaddleVideo - ├── data - │ ├── bmn_data - │ │ ├── fix_feat_100 - │ │ │ ├── v___c8enCfzqw.npy - │ │ │ ├── v___dXUJsj3yo.npy - │ │ │ ├── ... - │ │ │ - │ │ └── activitynet_1.3_annotations.json - ``` - -3. 
最后修改配置文件configs/localization/bmn.yaml中的`feat_path`字段指定特征文件夹路径,通过`file_path`字段指定标签文件路径。 - - diff --git a/docs/zh-CN/dataset/Oxford_RobotCar.md b/docs/zh-CN/dataset/Oxford_RobotCar.md deleted file mode 100644 index ee12ff885..000000000 --- a/docs/zh-CN/dataset/Oxford_RobotCar.md +++ /dev/null @@ -1,152 +0,0 @@ -[English](../../en/dataset/Oxford_RobotCar.md) | 简体中文 - -# Oxford-RobotCar-for-ADDS数据准备 - -- [数据集简介](#数据集简介) -- [数据集下载](#数据集下载) -- [数据预处理](#数据预处理) -- [1. 图像去畸变](#1-图像去畸变) -- [2. 动态帧筛选](#2-动态帧筛选) -- [3. 图像重命名](#3-图像重命名) -- [4. 白天-伪夜晚图像对准备](#4-白天-伪夜晚图像对准备) - - -## 数据集简介 - -[Oxford RobotCar Dataset](https://robotcar-dataset.robots.ox.ac.uk/) 是一个大规模自动驾驶数据集, 包含了大量不同自动驾驶场景下的数据. - -这里用到的是从原始的Oxford RobotCar数据集中筛选出一部分用于白天-夜晚深度估计的数据, 即Oxford-RobotCar-for-ADDS. - -如果您要使用Oxford-RobotCar-for-ADDS, 请引用以下论文: -```latex -@article{maddern20171, - title={1 year, 1000 km: The oxford robotcar dataset}, - author={Maddern, Will and Pascoe, Geoffrey and Linegar, Chris and Newman, Paul}, - journal={The International Journal of Robotics Research}, - volume={36}, - number={1}, - pages={3--15}, - year={2017}, - publisher={SAGE Publications Sage UK: London, England} -} -``` -```latex -@inproceedings{liu2021self, - title={Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation}, - author={Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun}, - booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, - pages={12737--12746}, - year={2021} -} -``` - -## 数据集下载 - -1. 下载序列[2014-12-09](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-09-13-21-02/) 中Bumblebee XB3的左目图像作为白天场景的训练集, 下载好的图像解压在同一文件夹下. -2. 下载序列[2014-12-16](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-16-18-44-24/) 中Bumblebee XB3的左目图像作为夜晚场景的训练集, 下载好的图像解压在同一文件夹下. -3. 
验证集的图像和深度真值从原始数据集中筛选, 下载地址如下: - ```shell - https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt - https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt - https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.001 - https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.002 - https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.001 - https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.002 - https://videotag.bj.bcebos.com/Data/ADDS/day_val_451.7z - https://videotag.bj.bcebos.com/Data/ADDS/day_val_451_gt.7z - https://videotag.bj.bcebos.com/Data/ADDS/night_val_411.7z - https://videotag.bj.bcebos.com/Data/ADDS/night_val_411_gt.7z - ``` - 附原始未处理数据下载地址: - ```shell - # 白天数据 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.001 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.002 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.003 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.004 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.005 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.006 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.007 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.008 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.009 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.010 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.011 - https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.012 - - # 夜晚数据 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.001 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.002 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.003 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.004 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.005 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.006 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.007 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.008 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.009 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.010 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.011 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.012 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.013 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.014 - https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.015 - ``` -## 数据预处理 - -#### 1. 图像去畸变 - -使用官方提供的工具箱[robotcar-dataset-sdk](https://github.com/ori-mrg/robotcar-dataset-sdk/tree/master/python) 对序列2014-12-09和2014-12-16的图像完成去畸变. - - -#### 2. 动态帧筛选 - -由于我们使用自监督的方法, 需要筛选出动态帧用于训练. 筛选原则为帧间位姿变化大于0.1m则认为是动态帧. 经过筛选后获得训练集的序列. - - -#### 3. 图像重命名 - -将原始图像时间戳重命名为连续数字序列. 白天场景对应关系见[1209_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt), 夜晚场景对应关系见[1216_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt). 
重命名后的数据格式如下: -``` -├── oxford_processing - ├── day_train_all #白天训练图像文件夹 (day_train_all.7z.001 ~ day_train_all.7z.012) - ├── night_train_all #夜晚训练图像文件夹 (night_train_all.7z.001 ~ day_train_all.7z.015) - ├── day_val_451 #白天验证图像文件夹 (day_val_451.7z) - ├── day_val_451_gt #白天验证深度真值文件夹 (day_val_451_gt.7z) - ├── night_val_411 #夜晚验证图像文件夹 (night_val_411.7z) - └── night_val_411_gt #夜晚验证深度真值文件夹 (night_val_411_gt.7z) -``` - -其中用于训练和验证的序列如下: - -``` -splits/oxford_day/train_files.txt # 白天训练序列 -splits/oxford_night/train_files.txt # 夜晚训练序列 -splits/oxford_day_451/val_files.txt # 白天验证序列 -splits/oxford_night_411/val_files.txt # 夜晚验证序列 -``` -训练所用路径文本的下载地址: -```shell -https://videotag.bj.bcebos.com/Data/ADDS/train_files.txt -https://videotag.bj.bcebos.com/Data/ADDS/val_day_files.txt -https://videotag.bj.bcebos.com/Data/ADDS/val_night_files.txt -``` - -#### 4. 白天-伪夜晚图像对准备 - -为了用我们的框架提取出白天和夜晚图像的共有信息,我们用[CycleGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix)生成白天-伪夜晚图像对,其中伪夜晚为CycleGAN生成的与白天对应的夜晚图像, 所有图像都缩放为192x640, 夜晚图像用直方图均衡化增强, 训练75个epoch, 最终得到Oxford-RobotCar-for-ADDS. 生成的白天-伪夜晚图像对数据格式如下,可直接用于ADDS-DepthNet的训练和验证: -``` -data -└── oxford - ├── splits - ├── train_files.txt - ├── val_day_files.txt - └── val_night_files.txt - └── oxford_processing_forADDS - ├── day_train_all/ #白天训练图像文件夹 (解压自day_train_all.7z.001 ~ day_train_all.7z.002) - ├── night_train_all/ #夜晚训练图像文件夹 (解压自night_train_all.7z.001 ~ day_train_all.7z.002) - ├── day_val_451/ #白天验证图像文件夹 (解压自day_val_451.7z) - ├── day_val_451_gt/ #白天验证深度真值文件夹 (解压自day_val_451_gt.7z) - ├── night_val_411/ #夜晚验证图像文件夹 (解压自night_val_411.7z) - └── night_val_411_gt/ #夜晚验证深度真值文件夹 (解压自night_val_411_gt.7z) -``` - -其中用于训练和验证的序列与前述保持一致. diff --git a/docs/zh-CN/dataset/README.md b/docs/zh-CN/dataset/README.md deleted file mode 100644 index 31aabee07..000000000 --- a/docs/zh-CN/dataset/README.md +++ /dev/null @@ -1,73 +0,0 @@ -简体中文 | [English](../../en/dataset/README.md) - -# 数据集 - -## 1. 概览 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-- 动作识别: Kinetics-400 (Homepage) (CVPR'2017)、UCF101 (Homepage) (CRCV-IR-12-01)、ActivityNet (Homepage) (CVPR'2015)、YouTube-8M (Homepage) (CVPR'2017)
-- 动作定位: ActivityNet (Homepage) (CVPR'2015)
-- 时空动作检测: AVA (Homepage) (CVPR'2018)
-- 基于骨架的动作识别: NTURGB+D (Homepage) (IEEE CS'2016)、FSD (Homepage)
-- 单目深度估计: Oxford-RobotCar (Homepage) (IJRR'2017)
-- 文本视频检索: MSR-VTT (Homepage) (CVPR'2016)
-- 文本视频预训练: HowTo100M (Homepage) (ICCV'2019)
    diff --git a/docs/zh-CN/dataset/SegmentationDataset.md b/docs/zh-CN/dataset/SegmentationDataset.md deleted file mode 100644 index c43b5910f..000000000 --- a/docs/zh-CN/dataset/SegmentationDataset.md +++ /dev/null @@ -1,35 +0,0 @@ -简体中文 | [English](../../en/dataset/SegmentationDataset.md) - -# 视频动作分割模型数据使用说明 - -视频动作分割模型使用breakfast、50salads和gtea数据集,使用方法为使用预训练模型提取的特征,可以从MS-TCN官方代码库中获取。[feat](https://zenodo.org/record/3625992#.Xiv9jGhKhPY) - -- 数据集文件树形式 -```txt -─── gtea - ├── features - │ ├── S1_Cheese_C1.npy - │ ├── S1_Coffee_C1.npy - │ ├── S1_CofHoney_C1.npy - │ └── ... - ├── groundTruth - │ ├── S1_Cheese_C1.txt - │ ├── S1_Coffee_C1.txt - │ ├── S1_CofHoney_C1.txt - │ └── ... - ├── splits - │ ├── test.split1.bundle - │ ├── test.split2.bundle - │ ├── test.split3.bundle - │ └── ... - └── mapping.txt -``` - -- 数据集存放文件树形式 -```txt -─── data - ├── 50salads - ├── breakfast - ├── gtea - └── ... -``` diff --git a/docs/zh-CN/dataset/fsd.md b/docs/zh-CN/dataset/fsd.md deleted file mode 100644 index 670cb694f..000000000 --- a/docs/zh-CN/dataset/fsd.md +++ /dev/null @@ -1,56 +0,0 @@ -[English](../../en/dataset/fsd.md) | 简体中文 - -# 基于飞桨实现花样滑冰选手骨骼点动作识别大赛数据准备 - -- [数据集介绍](#数据集介绍) -- [数据下载](#数据下载) - ---- - - -## 数据集介绍 - -基于飞桨实现花样滑冰选手骨骼点动作识别大赛数据集旨在通过花样滑冰研究人体的运动。在花样滑冰运动中,人体姿态和运动轨迹相较于其他运动呈现复杂性强、类别多的特点,有助于细粒度图深度学习新模型、新任务的研究。 - - -在FSD-10 中,所有的视频素材从2017 到2018 年的花样滑冰锦标赛中采集。源视频素材中视频的帧率被统一标准化至每秒30 帧,并且图像大小是1080 * 720 来保证数据集的相对一致性。之后我们通过2D姿态估计算法Open Pose对视频进行逐帧骨骼点提取,最后以.npy格式保存数据集。 - -训练数据集与测试数据集的目录结构如下所示: - -```txt -train_data.npy # 2922 -train_label.npy # 2922 -test_A_data.npy # 628 -test_B_data.npy # 634 -``` - -其中train_label.npy通过np.load()读取后会得到一个一维张量,每一个元素为一个值在0-29之间的整形变量代表动作的标签;data.npy文件通过np.load()读取后,会得到一个形状为N×C×T×V×M的五维张量,每个维度的具体含义如下: - -| 维度符号 | 维度值大小 | 维度含义 | 补充说明 | -| :---- | :----: | :----: | :---- | -| N | 样本数 | 代表N个样本 | 无 | -| C | 3 | 分别代表每个关节点的x, y坐标和置信度 | 每个x,y均被放缩至-1到1之间 | -| T | 1500 | 代表动作的持续时间长度,共有1500帧 | 有的动作的实际长度可能不足1500,例如可能只有500的有效帧数,我们在其后重复补充0直到1500帧,来保证T维度的统一性 | -| V | 25 | 代表25个关节点 | 具体关节点的含义可看下方的骨架示例图 | -| M | 1 | 代表1个运动员个数 | 无 | - -骨架示例图: - - -
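针对上文描述的 N×C×T×V×M 数据格式,下面给出一段读取并检查数据形状的示意代码(假设 `train_data.npy` 与 `train_label.npy` 已下载至当前目录,文件路径仅为示例):

```python
import numpy as np

# 读取训练数据与标签(路径为示例假设)
data = np.load("train_data.npy")    # 形状: (N, C, T, V, M)
label = np.load("train_label.npy")  # 形状: (N,),每个元素为 0-29 的整数标签

N, C, T, V, M = data.shape
print(f"样本数 N={N}, 通道数 C={C}, 帧数 T={T}, 关节点数 V={V}, 人数 M={M}")
print("第 0 个样本的动作标签:", label[0])

# 取出第 0 个样本第 0 帧的全部 25 个关节点的 (x, y, score)
first_frame = data[0, :, 0, :, 0]   # 形状: (3, 25)
print(first_frame.shape)
```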
    - - - -## 数据下载 - -在[2021 CCF BDCI 基于飞桨实现花样滑冰选手骨骼点动作识别比赛](https://aistudio.baidu.com/aistudio/competition/detail/115/0/introduction)主页报名后即可获取下载链接 - -| 数据集 | Data | Label | -| :---- | :----: | :----: | -| 训练集 | [train_data.npy](https://videotag.bj.bcebos.com/Data/FSD_train_data.npy) | [train_label.npy](https://videotag.bj.bcebos.com/Data/FSD_train_label.npy) | -| 测试集A | comming soon | comming soon | - - -> 由于版权原因,RGB数据暂不开放。 diff --git a/docs/zh-CN/dataset/howto100m.md b/docs/zh-CN/dataset/howto100m.md deleted file mode 100644 index 63711a4c8..000000000 --- a/docs/zh-CN/dataset/howto100m.md +++ /dev/null @@ -1,31 +0,0 @@ -# HowTo100M 数据准备 - -HowTo100M 数据相关准备,包括HowTo100M数据下载和数据下载后文件组织结构。 - -## 数据下载 - -HowTo100M 从1.2M Youtube 教学视频中切分出136M包含字幕的视频片段,涵盖23k活动类型,包括做饭、手工制作、日常护理、园艺、健身等等,数据集约10T大小。 - -因为完整数据集体积过大,这里我们只提供少量数据,供大家跑通训练前向。如需下载全量数据,请参考:[HowTo100M](https://www.di.ens.fr/willow/research/howto100m/) - -为了方便使用,我们提供的数据版本已对HowTo100M数据集中的物体特征和动作特征进行了特征提取。 - -首先,请确保在 `data/howto100m` 目录下,输入如下命令,下载数据集。 - -```bash -bash download_features.sh -``` - -下载完成后,data目录下文件组织形式如下: - -``` -├── data -| ├── howto100m -| │ ├── actbert_train_data.npy -| │ ├── caption_train.json -| | ├── caption_val.json - -``` - -## 参考论文 -- Antoine Miech, Dimitri Zhukov, Jean-Baptiste Alayrac, Makarand Tapaswi, Ivan Laptev, and Josef Sivic. Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In ICCV, 2019. diff --git a/docs/zh-CN/dataset/k400.md b/docs/zh-CN/dataset/k400.md deleted file mode 100644 index 7eeceacb2..000000000 --- a/docs/zh-CN/dataset/k400.md +++ /dev/null @@ -1,77 +0,0 @@ -[English](../../en/dataset/k400.md) | 简体中文 - -# Kinetics-400 数据准备 - -- [数据集介绍](#数据集介绍) -- [下载video数据](#下载video数据) -- [提取frames数据](#提取frames数据) - ---- - - -## 数据集介绍 - -Kinetics-400是视频领域benchmark常用数据集,详细介绍可以参考其官方网站[Kinetics](https://deepmind.com/research/open-source/kinetics)。下载方式可参考官方地址[ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics),使用其提供的下载脚本下载数据集。 - -## 下载video数据 - -考虑到K400数据集下载困难的问题,我们提供了两种下载方式: (1) 百度网盘下载 (2) 脚本下载 - -### 百度网盘下载 - -网盘链接:https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg -提取码:ppvi - -### 脚本下载 - -- 下载训练集链接列表文件[train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list)和验证集链接列表文件[val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list)。 - -编写下载脚本`download.sh`如下: -```bash -file=$1 - -while read line -do - wget "$line" -done <$file -``` - -下载训练集命令: -```bash -bash download.sh train_link.list -``` - -下载验证集命令: -```bash -bash download.sh val_link.list -``` - ---- - -|类别 | 数据条数 | list文件 | -| :------: | :----------: | :----: | -|训练集 | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)| -|验证集 | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)| - -- 下载后自行解压,并将数据路径添加到相应的list文件中。 - -- 由于部分视频原始链接失效,数据有部分缺失,全部文件大概需要135G左右的存储空间,PaddleVideo使用的也是这份数据。 - -> 此份数据仅限于学术研究,若对您有帮助,欢迎给[项目](https://github.com/PaddlePaddle/PaddleVideo)star~ - - -## 提取frames数据 -为了加速网络的训练过程,我们首先对视频文件(K400视频文件为mp4格式)提取帧 (frames)。相对于直接通过视频文件进行网络训练的方式,frames的方式能够极大加快网络训练的速度。 - -输入如下命令,即可提取K400视频文件的frames - -```python -python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 -``` - -视频文件frames提取完成后,会存储在指定的`./rawframes`路径下,大小约为2T左右。 - -|类别 | 数据条数 | list文件 | -| :------: | :----------: | :----: | -|训练集 | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)| -|验证集 | 19761 | 
[val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)| diff --git a/docs/zh-CN/dataset/msrvtt.md b/docs/zh-CN/dataset/msrvtt.md deleted file mode 100644 index b2cfdedc3..000000000 --- a/docs/zh-CN/dataset/msrvtt.md +++ /dev/null @@ -1,72 +0,0 @@ -[English](../../en/dataset/msrvtt.md) | 简体中文 - -# MSR-VTT 数据准备 - -- [数据集介绍](#数据集介绍) -- [T2VLAD模型数据准备](#T2VLAD模型数据准备) -- [ActBERT模型数据准备](#T2VLAD模型数据准备) -- [参考文献](#参考文献) - -## 数据集介绍 - -MSR-VTT(Microsoft Research Video to Text) 是一个包含视频及字幕的大规模数据集,由来自20个类别的10,000个视频片段组成,每个视频片段由20个英文句子注释。我们使用9000个视频片段用于训练,1000个用于测试。更多详细信息可以参考网站:[MSRVTT](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/) - -## T2VLAD模型数据准备 -[T2VLAD模型文档](../../../applications/T2VLAD/README.md) - -为了方便使用,我们提供的数据版本已对MSR-VTT数据集中对视频进行了特征提取。 - -首先,请确保在 `applications/T2VLAD/data` 目录下,输入如下命令,下载数据集。 - -```bash -bash download_features.sh -``` - -下载完成后,data目录下文件组织形式如下: - -``` -├── data -| ├── MSRVTT -| │ ├── raw-captions.pkl -| │ ├── train_list_jsfusion.txt -| │ ├── val_list_jsfusion.txt -| │ ├── aggregated_text_feats -| | | ├── w2v_MSRVTT_openAIGPT.pickle -| | ├── mmt_feats -| │ │ ├── features.audio.pkl -| │ │ ├── features.face_agg.pkl -| │ │ ├── features.flos_agg.pkl -| │ │ ├── features.ocr.pkl -| │ │ ├── features.rgb_agg.pkl -| │ │ ├── features.s3d.pkl -| │ │ ├── features.scene.pkl -| │ │ ├── features.speech.pkl - -``` - -## ActBERT模型数据准备 -[ActBERT模型文档](../model_zoo/multimodal/actbert.md) - -下载数据特征: -``` -wget https://videotag.bj.bcebos.com/Data/ActBERT/msrvtt_test.lmdb.tar -wget https://videotag.bj.bcebos.com/Data/ActBERT/MSRVTT_JSFUSION_test.csv -``` - -将下载得到的`msrvtt_test.lmdb.tar`解压: -``` -tar -zxvf msrvtt_test.lmdb.tar -``` - -最终得到的文件组织形式如下: -``` -├── data -| ├── MSR-VTT -| │ ├── MSRVTT_JSFUSION_test.csv -| │ ├── msrvtt_test.lmdb -| │ ├── data.mdb -| │ ├── lock.mdb -``` - -## 参考论文 -- Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. Multi-modal transformer for video retrieval. In ECCV, 2020. 
diff --git a/docs/zh-CN/dataset/ntu-rgbd.md b/docs/zh-CN/dataset/ntu-rgbd.md deleted file mode 100644 index c0910fa48..000000000 --- a/docs/zh-CN/dataset/ntu-rgbd.md +++ /dev/null @@ -1,158 +0,0 @@ -[English](../../en/dataset/ntu-rgbd.md) | 简体中文 - -# NTU-RGB+D 数据准备 - -- [数据集介绍](#数据集介绍) -- [ST-GCN数据集准备](#ST-GCN数据集准备) -- [CTR-GCN数据集准备](#CTR-GCN数据集准备) - ---- - - -## 数据集介绍 - -NTU-RGB+D是基于骨骼的行为识别数据集,包含60个种类的动作,56880个样本,详细介绍可以参考其官方网站[NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/)。该数据集在划分训练集和测试集时采用了两种不同的划分标准。Cross-Subject按照人物ID划分,训练集40320个样本,测试集16560个样本。Cross-View安装相机划分,相机2和3采集的样本为训练集,包含37930个样本,相机1采集的样本为测试集,包含18960个样本。 - - -## ST-GCN数据集准备 - -以下是ST-GCN模型的数据集准备流程介绍。 - -### 数据集下载 - -我们提供处理好的数据集下载地址[NTU-RGB-D.tar](https://videotag.bj.bcebos.com/Data/NTU-RGB-D.tar)(~3.1G),下载后通过命令```tar -zxvf NTU-RGB-D.tar ```进行解压,得到的数据目录如下: - -```txt -─── NTU-RGB-D - ├── xsub - │ ├── train_data.npy - │ ├── train_label.pkl - │ ├── val_data.npy - │ └── val_label.pkl - └── xview - ├── train_data.npy - ├── train_label.pkl - ├── val_data.npy - └── val_label.pkl -``` - -> 数据来源于[st-gcn](https://github.com/open-mmlab/mmskeleton/blob/master/doc/SKELETON_DATA.md)。 - -## CTR-GCN数据集准备 - -以下是CTR-GCN模型的数据集准备流程介绍。 - -### 数据集下载 - -在`data\ntu-rgb-d`目录有下载其官方网站[NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/)提供的数据集的脚本`download_dataset.sh` - -```bash -sh data/ntu-rgb-d/download_dataset.sh -``` - -运行脚本后会得到如下的数据目录: -```txt -─── ntu-rgb-d - ├── download_dataset.sh - ├── nturgb+d_skeletons - │   ├── S001C001P001R001A001.skeleton - │   ├── S001C001P001R001A002.skeleton - │   ├── S001C001P001R001A003.skeleton - │   ├── S001C001P001R001A004.skeleton - │   ├── S001C001P001R001A005.skeleton - │   ├── S001C001P001R001A006.skeleton - │   ├── S001C001P001R001A007.skeleton - │ ├── .... - │   └── S017C003P020R002A060.skeleton - ├── get_raw_denoised_data.py - ├── get_raw_skes_data.py - ├── seq_transformation.py - └── statistics -    ├── camera.txt -    ├── label.txt -    ├── performer.txt -    ├── replication.txt -    ├── setup.txt -    └── skes_available_name.txt - -``` - -### 数据集处理 - -运行如下脚本,将数据处理成CTR-GCN所需的格式。 - -> 注:若自定义数据集,提前准备好`data/ntu-rgb-d/statistics/skes_available_name.txt`文件,该文件是待处理的骨骼点数据文件名清单。 - -```bash -cd ./data/ntu-rgb-d -# Get skeleton of each performer -python get_raw_skes_data.py -# Remove the bad skeleton -python get_raw_denoised_data.py -# Transform the skeleton to the center of the first frame -python seq_transformation.py -``` - -最终数据集处理后得到如下文件树形式 - -```txt -─── ntu-rgb-d - ├── download_dataset.sh - ├── nturgb+d_skeletons - │   ├── S001C001P001R001A001.skeleton - │   ├── S001C001P001R001A002.skeleton - │   ├── S001C001P001R001A003.skeleton - │   ├── S001C001P001R001A004.skeleton - │   ├── S001C001P001R001A005.skeleton - │   ├── S001C001P001R001A006.skeleton - │   ├── S001C001P001R001A007.skeleton - │ ├── .... - │   └── S017C003P020R002A060.skeleton - ├── denoised_data - │   ├── actors_info - │   │   ├── S001C001P001R001A024.txt - │   │   ├── S001C001P001R001A025.txt - │   │   ├── S001C001P001R001A026.txt - │   │   ├── .... 
- │   │   ├── S017C003P020R002A059.txt - │   │   └── S017C003P020R002A060.txt - │   ├── denoised_failed_1.log - │   ├── denoised_failed_2.log - │   ├── frames_cnt.txt - │   ├── missing_skes_1.log - │   ├── missing_skes_2.log - │   ├── missing_skes.log - │   ├── noise_length.log - │   ├── noise_motion.log - │   ├── noise_spread.log - │   ├── raw_denoised_colors.pkl - │   ├── raw_denoised_joints.pkl - │   └── rgb+ske - ├── raw_data - │   ├── frames_cnt.txt - │   ├── frames_drop.log - │   ├── frames_drop_skes.pkl - │   └── raw_skes_data.pkl - ├── get_raw_denoised_data.py - ├── get_raw_skes_data.py - ├── seq_transformation.py - ├── statistics - │   ├── camera.txt - │   ├── label.txt - │   ├── performer.txt - │   ├── replication.txt - │   ├── setup.txt - │   └── skes_available_name.txt - ├── xview - │ ├── train_data.npy - │ ├── train_label.pkl - │ ├── val_data.npy - │ └── val_label.pkl - └── xsub - ├── train_data.npy - ├── train_label.pkl - ├── val_data.npy - └── val_label.pkl -``` - -> 注:文件夹`denoised_data`、`raw_data`和`nturgb+d_skeletons`都为处理处理的临时文件,可在提取出`xview`和`xsub`后删除。 diff --git a/docs/zh-CN/dataset/ucf24.md b/docs/zh-CN/dataset/ucf24.md deleted file mode 100644 index 5f5600a5a..000000000 --- a/docs/zh-CN/dataset/ucf24.md +++ /dev/null @@ -1,72 +0,0 @@ -简体中文 | [English](../../en/dataset/ucf24.md) - -# UCF24数据准备 -UCF24数据的相关准备。主要包括UCF24的RGB帧文件、标注文件的下载和生成文件的路径list。 - ---- -## 1. 数据下载 -UCF24数据的详细信息可以参考网站[UCF24](http://www.thumos.info/download.html)。 为了方便使用,PaddleVideo提供了UCF24数据的RGB帧、标注文件的下载脚本。 - -首先,请确保在[data/ucf24/ 目录](../../../data/ucf24)下,输入如下UCF24数据集的RGB帧、标注文件的命令。 - -```shell -bash download_frames_annotations.sh -``` - -- 运行该命令需要安装unrar解压工具,可使用pip方式安装。 - -- RGB帧文件会存储在[data/ucf24/rgb-images/ 文件夹](../../../data/ucf24/rgb-images)下 - -- 标注文件会存储在[data/ucf24/lables/ 文件夹](../../../data/ucf24/labels)下 - ---- -## 2. 生成文件的路径list -指定格式划分文件,输入如下命令 - -```python -python build_split.py --raw_path ./splitfiles -``` - -**参数说明** - -`--raw_path`: 表示原始划分文件的存储路径 - - -# 以上步骤完成后,文件组织形式如下所示 - -``` -├── data -│ ├── ucf24 -│ | ├── groundtruths_ucf -│ | ├── labels -│ | | ├── Basketball -│ | | | ├── v_Basketball_g01_c01 -│ | | | | ├── 00009.txt -│ | | | | ├── 00010.txt -│ | | | | ├── ... -│ | | | | ├── 00050.txt -│ | | | | ├── 00051.txt -│ | | ├── ... -│ | | ├── WalkingWithDog -│ | | | ├── v_WalkingWithDog_g01_c01 -│ | | | ├── ... -│ | | | ├── v_WalkingWithDog_g25_c04 -│ | ├── rgb-images -│ | | ├── Basketball -│ | | | ├── v_Basketball_g01_c01 -│ | | | | ├── 00001.jpg -│ | | | | ├── 00002.jpg -│ | | | | ├── ... -│ | | | | ├── 00140.jpg -│ | | | | ├── 00141.jpg -│ | | ├── ... -│ | | ├── WalkingWithDog -│ | | | ├── v_WalkingWithDog_g01_c01 -│ | | | ├── ... -│ | | | ├── v_WalkingWithDog_g25_c04 -│ | ├── splitfiles -│ | | ├── trainlist01.txt -│ | | |── testlist01.txt -│ | ├── trainlist.txt -│ | |── testlist.txt -``` diff --git a/docs/zh-CN/dataset/youtube8m.md b/docs/zh-CN/dataset/youtube8m.md deleted file mode 100644 index e0a62680a..000000000 --- a/docs/zh-CN/dataset/youtube8m.md +++ /dev/null @@ -1,59 +0,0 @@ -[English](../../en/dataset/youtube8m.md) | 简体中文 - -# YouTube-8M数据准备 - -- [数据集简介](#数据集简介) -- [数据集下载](#数据集下载) -- [数据格式转化](#数据格式转化) - - -## 数据集简介 - -YouTube-8M 是一个大规模视频分类数据集,包含800多万个视频url,标签体系涵盖3800多种知识图谱实体,1个视频对应多个标签(平均3-4个),使用机器进行标注。 - -**每个视频的长度在120s到500s之间 -由于视频数据量太大,因此预先使用图像分类模型提取了frame-level的特征,并使用PCA对特征进行了降维处理得到多帧1024维的特征,类似地用音频模型处理得到多帧128维的音频特征。** -> 这里用到的是YouTube-8M 2018年更新之后的数据集(May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features)。 - - -## 数据集下载 - -1. 
新建存放特征的目录(以PaddleVideo目录下为例) - ```bash - cd data/yt8m - mkdir frame - cd frame - ``` -2. 下载训练、验证集到frame文件夹中 - ```bash - curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python - curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python - ``` - 下载过程如图所示 - ![image](https://user-images.githubusercontent.com/23737287/140709613-1e2d6ec0-a82e-474d-b220-7803065b0153.png) - - 数据下载完成后,将会得到3844个训练数据文件和3844个验证数据文件(TFRecord格式) - - -## 数据格式转化 -1. 安装tensorflow-gpu用于读入tfrecord数据 - ```bash - python3.7 -m pip install tensorflow-gpu==1.14.0 - ``` -3. 将下载的TFRecord文件转化为pickle文件以便PaddlePaddle使用 - ```bash - cd .. # 从frame目录回到yt8m目录 - python3.7 tf2pkl.py ./frame ./pkl_frame/ # 将frame文件夹下的train*.tfrecord和validate*.tfrecord转化为pkl格式 - ``` -2. 生成单个pkl文件路径集合,并根据此文件将pkl拆分为多个小pkl文件,并生成最终需要的拆分pkl文件路径 - ```bash - ls pkl_frame/train*.pkl > train.list # 将train*.pkl的路径写入train.list - ls pkl_frame/validate*.pkl > val.list # 将validate*.pkl的路径写入val.list - - python3.7 split_yt8m.py train.list # 拆分每个train*.pkl变成多个train*_split*.pkl - python3.7 split_yt8m.py val.list # 拆分每个validate*.pkl变成多个validate*_split*.pkl - - ls pkl_frame/train*_split*.pkl > train.list # 将train*_split*.pkl的路径重新写入train.list - ls pkl_frame/validate*_split*.pkl > val.list # 将validate*_split*.pkl的路径重新写入val.list - ``` - diff --git a/docs/zh-CN/distillation.md b/docs/zh-CN/distillation.md deleted file mode 100644 index 76fa5a444..000000000 --- a/docs/zh-CN/distillation.md +++ /dev/null @@ -1,218 +0,0 @@ -# 知识蒸馏 - ---- -## 目录 - -- [1. 知识蒸馏简介](#1) - - [1.1 Response based distillation](#1.1) - - [1.2 Feature based distillation](#1.2) - - [1.3 Relation based distillation](#1.3) -- [2. PaddleVideo支持的知识蒸馏算法](#2) - - [2.1 DML](#2.1) -- [3. 参考文献](#3) - - - - -### 1. 知识蒸馏简介 - -近年来,深度神经网络在计算机视觉、自然语言处理等领域被验证是一种极其有效的解决问题的方法。通过构建合适的神经网络,加以训练,最终网络模型的性能指标基本上都会超过传统算法。 - -在数据量足够大的情况下,通过合理构建网络模型的方式增加其参数量,可以显著改善模型性能,但是这又带来了模型复杂度急剧提升的问题。大模型在实际场景中使用的成本较高。 - -深度神经网络一般有较多的参数冗余,目前有几种主要的方法对模型进行压缩,减小其参数量。如裁剪、量化、知识蒸馏等,其中知识蒸馏是指使用教师模型(teacher model)去指导学生模型(student model)学习特定任务,保证小模型在参数量不变的情况下,得到比较大的性能提升,甚至获得与大模型相似的精度指标 [1]。 - -根据蒸馏方式的不同,可以将知识蒸馏方法分为3个不同的类别:Response based distillation、Feature based distillation、Relation based distillation。下面进行详细介绍。 - - - -#### 1.1 Response based distillation - -最早的知识蒸馏算法 KD,由 Hinton 提出,训练的损失函数中除了 gt loss 之外,还引入了学生模型与教师模型输出的 KL 散度,最终精度超过单纯使用 gt loss 训练的精度。这里需要注意的是,在训练的时候,需要首先训练得到一个更大的教师模型,来指导学生模型的训练过程。 - -上述标准的蒸馏方法是通过一个大模型作为教师模型来指导学生模型提升效果,而后来又发展出 DML(Deep Mutual Learning)互学习蒸馏方法 [7],即通过两个结构相同的模型互相学习。具体的。相比于 KD 等依赖于大的教师模型的知识蒸馏算法,DML 脱离了对大的教师模型的依赖,蒸馏训练的流程更加简单,模型产出效率也要更高一些。 - - - -#### 1.2 Feature based distillation - -Heo 等人提出了 OverHaul [8], 计算学生模型与教师模型的 feature map distance,作为蒸馏的 loss,在这里使用了学生模型、教师模型的转移,来保证二者的 feature map 可以正常地进行 distance 的计算。 - -基于 feature map distance 的知识蒸馏方法也能够和 `1.1 章节` 中的基于 response 的知识蒸馏算法融合在一起,同时对学生模型的输出结果和中间层 feature map 进行监督。而对于 DML 方法来说,这种融合过程更为简单,因为不需要对学生和教师模型的 feature map 进行转换,便可以完成对齐(alignment)过程。P - - - -#### 1.3 Relation based distillation - -[1.1](#1.1) 和 [1.2](#1.2) 章节中的论文中主要是考虑到学生模型与教师模型的输出或者中间层 feature map,这些知识蒸馏算法只关注个体的输出结果,没有考虑到个体之间的输出关系。 - -Park 等人提出了 RKD [10],基于关系的知识蒸馏算法,RKD 中进一步考虑个体输出之间的关系,使用 2 种损失函数,二阶的距离损失(distance-wise)和三阶的角度损失(angle-wise) - -本论文提出的算法关系知识蒸馏(RKD)迁移教师模型得到的输出结果间的结构化关系给学生模型,不同于之前的只关注个体输出结果,RKD 算法使用两种损失函数:二阶的距离损失(distance-wise)和三阶的角度损失(angle-wise)。在最终计算蒸馏损失函数的时候,同时考虑 KD loss 和 RKD loss。最终精度优于单独使用 KD loss 蒸馏得到的模型精度。 - - - -### 2. 
PaddleVideo支持的知识蒸馏算法 - -#### 2.1 DML - -##### 2.1.1 DML 算法介绍 - -论文信息: - -> [Deep Mutual Learning](https://openaccess.thecvf.com/content_cvpr_2018/html/Zhang_Deep_Mutual_Learning_CVPR_2018_paper.html) -> -> Ying Zhang, Tao Xiang, Timothy M. Hospedales, Huchuan Lu -> -> CVPR, 2018 - -DML论文中,在蒸馏的过程中,不依赖于教师模型,两个结构相同的模型互相学习,计算彼此输出(logits)的KL散度,最终完成训练过程。 - - -在Kinetics-400公开数据集上,效果如下所示。 - -| 策略 | 骨干网络 | 配置文件 | Top-1 acc | -| --- | --- | --- | --- | -| baseline | PP-TSMv2 | [pptsm_lcnet_k400_frames_uniform.yaml](../configs/recognition/pptsm/v2/pptsm_lcnet_k400_frames_uniform.yaml) | 73.1% | -| DML | PP-TSMv2 | [pptsm_lcnet_k400_frames_uniform_dml_distillation.yaml](../configs/recognition/pptsm/v2/pptsm_lcnet_k400_frames_uniform_dml_distillation.yaml) | 74.38%(**+1.28%**) | - - -##### 2.1.2 DML 配置 - -DML配置如下所示。在模型构建Arch字段中,需要同时定义学生模型与教师模型,教师模型与学生模型均保持梯度更新状态。在损失函数Loss字段中,需要定义`DistillationDMLLoss`(学生与教师之间的JS-Div loss)以及`DistillationCELoss`(学生与教师关于真值标签的CE loss),作为训练的损失函数。 - - -使用蒸馏训练,配置文件需要做一定的修改: -原始Student模型训练配置文件: -```yaml -MODEL: - framework: "Recognizer2D" - backbone: - name: "PPTSM_v2" - pretrained: "data/PPLCNetV2_base_ssld_pretrained.pdparams" - num_seg: 16 - head: - name: "MoViNetHead" -``` - -DML配置如下所示。在模型构建MODEL字段中,需要指定framework为`RecognizerDistillation`,同时定义学生模型与教师模型,教师模型与学生模型均保持梯度更新状态。在损失函数Loss字段中,需要定义`DistillationDMLLoss`(学生与教师之间的JS-Div loss)以及`DistillationCELoss`(学生与教师关于真值标签的CE loss),作为训练的损失函数。 - -```yaml -MODEL: - framework: "RecognizerDistillation" - freeze_params_list: - - False # Teacher是否可学习 - - False # Student是否可学习 - models: - - Teacher: # 指定Teacher模型 - backbone: - name: "ResNetTweaksTSM" #Teacher模型名称 - pretrained: "data/ResNet50_vd_ssld_v2_pretrained.pdparams" - depth: 50 - num_seg: 16 - head: - name: "ppTSMHead" # Teacher模型head - num_classes: 400 - in_channels: 2048 - drop_ratio: 0.5 - std: 0.01 - num_seg: 16 - - Student: - backbone: # 指定Student模型 - name: "PPTSM_v2" #Student模型名称 - pretrained: "data/PPLCNetV2_base_ssld_pretrained.pdparams" - num_seg: 16 - head: - name: "MoViNetHead" # Student模型head - loss: # 指定蒸馏loss - Train: # 训练时loss计算 - - name: "DistillationCELoss" # 蒸馏损失1 - model_name_pairs: ["Student", "GroundTruth"] # 计算loss的对象 - - name: "DistillationCELoss" # 蒸馏损失2 - model_name_pairs: ["Teacher", "GroundTruth"] - - name: "DistillationDMLLoss" # 蒸馏损失3 - model_name_pairs: ["Student", "Teacher"] - Val: # 评估时loss计算 - - name: "DistillationCELoss" - model_name_pairs: ["Student", "GroundTruth"] -``` - -若将教师模型设置为Student自身,便是一种简单的自蒸馏方式,示例配置文件如下: -```yaml -MODEL: - framework: "RecognizerDistillation" - freeze_params_list: - - False # Teacher是否可学习 - - False # Student是否可学习 - models: - - Teacher: # 指定Teacher模型 - backbone: - name: "PPTSM_v2" - pretrained: "data/PPLCNetV2_base_ssld_pretrained.pdparams" - num_seg: 16 - head: - name: "MoViNetHead" - - Student: - backbone: # 指定Student模型 - name: "PPTSM_v2" - pretrained: "data/PPLCNetV2_base_ssld_pretrained.pdparams" - num_seg: 16 - head: - name: "MoViNetHead" - loss: # 指定蒸馏loss - Train: # 训练时loss计算 - - name: "DistillationCELoss" # 蒸馏损失1 - model_name_pairs: ["Student", "GroundTruth"] # 计算loss的对象 - - name: "DistillationCELoss" # 蒸馏损失2 - model_name_pairs: ["Teacher", "GroundTruth"] - - name: "DistillationDMLLoss" # 蒸馏损失3 - model_name_pairs: ["Student", "Teacher"] - Val: # 评估时loss计算 - - name: "DistillationCELoss" - model_name_pairs: ["Student", "GroundTruth"] -``` - -实验发现,在Kinetics-400公开数据集上,使用自蒸馏方法,PP-TSMv2的精度也能获得1个点左右的提升: - -| 策略 | 教师网络 | Top-1 acc | -| --- | --- | --- | -| baseline | - | 69.06% | -| DML | PP-TSMv2 | 70.34%(**+1.28%**) | -| 
DML | PP-TSM_ResNet50 | 71.27%(**+2.20%**) | - -* 注:完整的PP-TSMv2加了其它trick训练,这里为了方便对比,baseline未加其它tricks,因此指标比官网最终开源出来的模型精度低一些。 - -完成配置文件的修改后,参考[使用说明](./usage.md)即可开启模型训练、测试与推理。 - - - - -## 3. 参考文献 - -[1] Hinton G, Vinyals O, Dean J. Distilling the knowledge in a neural network[J]. arXiv preprint arXiv:1503.02531, 2015. - -[2] Bagherinezhad H, Horton M, Rastegari M, et al. Label refinery: Improving imagenet classification through label progression[J]. arXiv preprint arXiv:1805.02641, 2018. - -[3] Yalniz I Z, Jégou H, Chen K, et al. Billion-scale semi-supervised learning for image classification[J]. arXiv preprint arXiv:1905.00546, 2019. - -[4] Cubuk E D, Zoph B, Mane D, et al. Autoaugment: Learning augmentation strategies from data[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2019: 113-123. - -[5] Touvron H, Vedaldi A, Douze M, et al. Fixing the train-test resolution discrepancy[C]//Advances in Neural Information Processing Systems. 2019: 8250-8260. - -[6] Cui C, Guo R, Du Y, et al. Beyond Self-Supervision: A Simple Yet Effective Network Distillation Alternative to Improve Backbones[J]. arXiv preprint arXiv:2103.05959, 2021. - -[7] Zhang Y, Xiang T, Hospedales T M, et al. Deep mutual learning[C]//Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2018: 4320-4328. - -[8] Heo B, Kim J, Yun S, et al. A comprehensive overhaul of feature distillation[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision. 2019: 1921-1930. - -[9] Du Y, Li C, Guo R, et al. PP-OCRv2: Bag of Tricks for Ultra Lightweight OCR System[J]. arXiv preprint arXiv:2109.03144, 2021. - -[10] Park W, Kim D, Lu Y, et al. Relational knowledge distillation[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019: 3967-3976. - -[11] Zhao B, Cui Q, Song R, et al. Decoupled Knowledge Distillation[J]. arXiv preprint arXiv:2203.08679, 2022. - -[12] Ji M, Heo B, Park S. Show, attend and distill: Knowledge distillation via attention-based feature matching[C]//Proceedings of the AAAI Conference on Artificial Intelligence. 2021, 35(9): 7945-7952. - -[13] Huang T, You S, Wang F, et al. Knowledge Distillation from A Stronger Teacher[J]. arXiv preprint arXiv:2205.10536, 2022. 
- -[14] https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/zh_CN/advanced_tutorials/knowledge_distillation.md#1.1.2 diff --git a/docs/zh-CN/install.md b/docs/zh-CN/install.md deleted file mode 100644 index b792912f8..000000000 --- a/docs/zh-CN/install.md +++ /dev/null @@ -1,89 +0,0 @@ -简体中文 | [English](../en/install.md) - -# 安装说明 - ---- - -- [简介](#简介) -- [安装PaddlePaddle](#安装PaddlePaddle) -- [安装PaddleVideo](#安装PaddleVideo) - -## 简介 - -使用PaddleVideo之前,请先安装PaddlePaddle及相关依赖项。 - - -## 安装PaddlePaddle - -运行PaddleVideo需要`PaddlePaddle 2.0`或更高版本。请参照[安装文档](http://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 -PaddleVideo只支持python3.7及以上的运行环境,依赖项请安装python3.7及以上的安装包 - -如果已经安装好了cuda、cudnn、nccl或者安装好了nvidia-docker运行环境,可以pip3安装最新GPU版本PaddlePaddle - -```bash -pip3 install paddlepaddle-gpu --upgrade -``` - -也可以从源码编译安装PaddlePaddle,请参照[安装文档](http://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 - -使用以下命令可以验证PaddlePaddle是否安装成功。 - -```python3 -import paddle -paddle.utils.run_check() -``` - -查看PaddlePaddle版本的命令如下: - -```bash -python3 -c "import paddle; print(paddle.__version__)" -``` - -注意: -- 从源码编译的PaddlePaddle版本号为0.0.0,请确保使用了PaddlePaddle 2.0及之后的源码编译。 -- PaddleVideo基于PaddlePaddle高性能的分布式训练能力,若您从源码编译,请确保打开编译选项,**WITH_DISTRIBUTE=ON**。具体编译选项参考[编译选项表](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3)。 -- 在docker中运行时,为保证docker容器有足够的共享内存用于Paddle的数据读取加速,在创建docker容器时,请设置参数`--shm_size=32g`,条件允许的话可以设置为更大的值。 - -**运行环境需求:** - -- Python3.7 or later version (当前只支持Linux系统) -- CUDA >= 10.1 -- cuDNN >= 7.6.4 -- nccl >= 2.1.2 - - -## 安装PaddleVideo - -**克隆PaddleVideo模型库:** - -``` -cd path_to_clone_PaddleVideo -git clone https://github.com/PaddlePaddle/PaddleVideo.git -cd PaddleVideo -``` - -**安装Python依赖库:** - -Python依赖库在[requirements.txt](https://github.com/PaddlePaddle/PaddleVideo/blob/master/requirements.txt)中给出,可通过如下命令安装: - -``` -python3.7 -m pip install --upgrade pip -pip3.7 install --upgrade -r requirements.txt -``` - -**从python安装包安装PaddleVideo:** - -使用pypi安装 - -```bash -pip3.7 install ppvideo==2.3.0 -``` - -安装完成后,可以使用命令行方式启动程序 -```bash -ppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi' -``` - -wheel包更多的使用教程可以参考[快速开始](./quick_start.md) - ---- diff --git a/docs/zh-CN/model_zoo/README.md b/docs/zh-CN/model_zoo/README.md deleted file mode 100644 index 76d1610ff..000000000 --- a/docs/zh-CN/model_zoo/README.md +++ /dev/null @@ -1,124 +0,0 @@ -简体中文 | [English](../../en/model_zoo/README.md) - -# 前沿算法与模型 - -## 1. 概要 - -PaddleVideo包含视频理解方向众多模型,包括基于RGB的行为识别模型,基于骨骼点的行为识别模型、时序动作检测模型、时序分割模型、时空动作检测模型、视频目标分割模型、多模态模型。其中基于RGB的行为识别方向是PaddleVideo核心建设的方向,因其训练得到的好的特征提取器提取的特征,是众多下游任务的基础输入。 - -与图像识别不同的是,行为识别任务的核心是提取时序信息。按模型结构的不同,基于RGB的行为识别方法大体上可以分为基于2D网络、基于3D网络、基于RNN以及基于Transformer结构的模型。2D网络一般会使用图像预训练模型配合时序模块提取时序信息,比如TSN、TSM等,简单高效。由于视频多一个时序维度,因此很自然的会使用3D卷积提取时序信息,比如I3D、SlowFast。3D模型的计算量一般比较大,训练迭代次数也更多一些。基于RNN的网络以视频特征作为输入,利用RNN提取时序信息,如AttentionLSTM。近期学界涌现了众多基于Transformer结构的行为识别网络,如TimeSformer、VideoSwin。相较于卷积网络,transformer结构的网络精度更高,计算量也会大些。 - -PaddleVideo自研并开源了PP-TSM,该模型基于TSM进行优化,在保持模型参数量和计算量不增加的前提下,精度得到大幅提升,欢迎使用。更多前沿模型复现与基础模型优化工作,敬请期待~ - -## 2. 模型概览 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-- 行为识别方法: PP-TSM (PP series)、PP-TSN (PP series)、PP-TimeSformer (PP series)、TSN (2D)、TSM (2D)、SlowFast (3D)、TimeSformer (Transformer)、VideoSwin (Transformer)、TokenShift (3D)、AttentionLSTM (RNN)、MoViNet (Lite)
-- 基于骨骼点的行为识别方法: ST-GCN (GCN)、AGCN (GCN)、2s-AGCN (GCN)、CTR-GCN (GCN)
-- 时序动作检测方法: BMN (One-stage)
-- 视频时序分割: MS-TCN、ASRF
-- 时空动作检测方法: SlowFast+Fast R-CNN
-- 多模态: ActBERT (Learning)、T2VLAD (Retrieval)
-- 视频目标分割: CFBI (Semi)、MA-Net (Supervised)
-- 单目深度估计: ADDS (Unsupervised)
    - - -## 3. AI-Studio模型教程 - -- [【官方】Paddle 2.1实现视频理解优化模型 -- PP-TSM](https://aistudio.baidu.com/aistudio/projectdetail/3399656?contributionType=1) -- [【官方】Paddle 2.1实现视频理解优化模型 -- PP-TSN](https://aistudio.baidu.com/aistudio/projectdetail/2879980?contributionType=1) -- [【官方】Paddle 2.1实现视频理解经典模型 -- TSN](https://aistudio.baidu.com/aistudio/projectdetail/2250682) -- [【官方】Paddle 2.1实现视频理解经典模型 -- TSM](https://aistudio.baidu.com/aistudio/projectdetail/2310889) -- [BMN视频动作定位](https://aistudio.baidu.com/aistudio/projectdetail/2250674) -- [花样滑冰选手骨骼点动作识别ST-GCN教程](https://aistudio.baidu.com/aistudio/projectdetail/2417717) -- [【实践】CV领域的Transformer模型TimeSformer实现视频理解](https://aistudio.baidu.com/aistudio/projectdetail/3413254?contributionType=1) - -## 4. Benchmark - -各模型训练推理速度参考 [Benchmark](../benchmark.md). diff --git a/docs/zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md b/docs/zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md deleted file mode 100644 index d0a64e948..000000000 --- a/docs/zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md +++ /dev/null @@ -1,140 +0,0 @@ -简体中文 | [English](../../../en/model_zoo/detection/SlowFast_FasterRCNN_en.md) - -# SlowFast_FasterRCNN - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) - -在开始使用之前,您需要按照以下命令安装额外的依赖包: -```bash -python -m pip install moviepy -python -m pip install et_xmlfile -python -m pip install paddledet -``` - -## 模型简介 - -[SlowFast](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md)模型是视频领域的高精度模型之一,对于动作识别任务,还需要检测出当前画面人物,因此SlowFast_FasterRCNN模型以人的检测结果和视频数据为输入,通过SlowFast模型提取时空特征,然后利用FasterRCNN的head得到画面中每个人的动作和位置。 - -我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目,使用链接:[基于SlowFast+FasterRCNN的动作识别](https://aistudio.baidu.com/aistudio/projectdetail/3267637?contributionType=1) - -详细内容请参考论文[SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf)中AVA Action Detection相关内容。 - -## 数据准备 - -本项目利用[AVA数据集](https://research.google.com/ava/download.html)进行动作检测。AVA v2.2数据集包括430个视频,其中235个用于训练,64个用于验证,131个用于测试。对每个视频中15分钟的帧进行了标注,每秒标注一帧。标注文件格式为CSV。 - -相关处理脚本在`data/ava/script`目录下。 - -### 1 下载视频 -``` -bash download_videos.sh -``` - -### 2 下载标注 -``` -bash download_annotations.sh -``` - -### 3 下载检测结果 - -``` -bash fetch_ava_proposals.sh -``` - -### 4 视频切割 -把下载的视频中第15分钟起后面的15分钟的片段切割出来: - -``` -bash cut_videos.sh -``` - -### 5 提取视频帧 -``` -bash extract_rgb_frames.sh -``` - -此处以AVA v2.1版本为例,进行关键文件介绍: -* ava_videos_15min_frames文件夹中存放以FPS为帧率抽取的视频帧; -* ava_train_v2.1.csv文件存放训练数据标注; -* ava_train_excluded_timestamps_v2.1.csv文件中存放废弃的时间戳数据; -* ava_dense_proposals_train.FAIR.recall_93.9.pkl文件中为每个关键帧中人的位置和置信度数据; -* ava_action_list_v2.1_for_activitynet_2018.pbtxt为动作类别数据。 - -## 模型训练 - -下载预训练模型: -``` -wget https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams -``` - - -* `-c`后面的参数是配置文件的路径。 -* `-w`后面的参数是finetuning或者测试时的权重,本案例将在Kinetics 400上训练的SlowFast R50模型作为预训练权重,通过下面的表格可获取。 -* `--validate`参数表示在训练过程中进行模型评估。 - -``` -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=logdir.ava main.py --validate -w SlowFast_8*8.pdparams -c configs/detection/ava/ava.yaml -``` - -## 模型评估 - -基于训练好的模型进行评估: -``` -python main.py --test \ - -w output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams \ - -c configs/detection/ava/ava.yaml -``` - -| architecture | depth | Pretrain Model | frame length x sample rate | MAP | AVA version | model | -| ------------- | ------------- | ------------- | 
------------- | ------------- | ------------- |------------- | -| SlowFast | R50 | [Kinetics 400](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | 8 x 8 | 23.2 | 2.1 | [`link`](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SlowFastRCNN_AVA.pdparams) | - - -## 模型推理 - -本项目动作识别分成两个阶段,第一个阶段得到人的proposals,然后再输入到SlowFast+FasterRCNN模型中进行动作识别。 - -对于画面中人的检测,可利用[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)中的模型。 - -PaddleDetection安装: -``` -# 安装其他依赖 -cd PaddleDetection/ -pip install -r requirements.txt - -# 编译安装paddledet -python setup.py install -``` - -下载训练好的检测模型参数: -``` -wget https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams -``` - -导出模型: - -``` -!python tools/export_model.py \ - -c configs/detection/ava/ava.yaml \ - -o inference_output \ - -p output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams -``` - -基于导出的模型做推理: - -``` -python tools/predict.py \ - -c configs/detection/ava/ava.yaml \ - --input_file "data/-IELREHXDEMO.mp4" \ - --model_file "inference_output/AVA_SlowFast_FastRcnn.pdmodel" \ - --params_file "inference_output/AVA_SlowFast_FastRcnn.pdiparams" \ - --use_gpu=True \ - --use_tensorrt=False -``` diff --git a/docs/zh-CN/model_zoo/estimation/adds.md b/docs/zh-CN/model_zoo/estimation/adds.md deleted file mode 100644 index 339507687..000000000 --- a/docs/zh-CN/model_zoo/estimation/adds.md +++ /dev/null @@ -1,133 +0,0 @@ -[English](../../../en/model_zoo/estimation/adds.md) | 简体中文 - -# ADDS-DepthNet模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - -在开始使用之前,您需要按照以下命令安装额外的依赖包: -```bash -python -m pip install scikit-image -python -m pip install matplotlib -``` - -## 模型简介 - -本模型以百度机器人与自动驾驶实验室的**ICCV 2021论文 [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628)** 为参考, -复现了基于白天和夜晚图像的自监督单目深度估计模型,其利用了白天和夜晚的图像数据互补性质,减缓了昼夜图像较大的域偏移以及照明变化对深度估计的精度带来的影响,在具有挑战性的牛津RobotCar数据集上实现了全天图像的最先进的深度估计结果。 - - -## 数据准备 - -Oxford RobotCar dataset数据下载及准备请参考[Oxford RobotCar dataset数据准备](../../dataset/Oxford_RobotCar.md) - - -## 模型训练 - -### Oxford RobotCar dataset数据集训练 - -#### 下载并添加预训练模型 - -1. 下载图像预训练模型[resnet18.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams)作为Backbone初始化参数,或通过wget命令下载 - - ```bash - wget -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams - ``` - -2. 打开`PaddleVideo/configs/estimation/adds/adds.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: #MODEL field - framework: "DepthEstimator" #Mandatory, indicate the type of network, associate to the 'paddlevideo/modeling/framework/' . - backbone: #Mandatory, indicate the type of backbone, associate to the 'paddlevideo/modeling/backbones/' . 
- name: 'ADDS_DepthNet' - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- Oxford RobotCar dataset数据集使用单卡训练,训练方式的启动命令如下: - - ```bash - python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20 - ``` - - -## 模型测试 - -- ADDS-DepthNet模型在训练时同步进行验证(只对白天或者夜晚的数据进行验证),您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - - ```bash - Already save the best model (rmse)8.5531 - ``` - -- 由于模型暂时一次只能测试yaml文件中给定路径的一个白天或者夜晚的数据集,因此若要得到本文档开头处的完整测试分数,需要运行4次测试命令并分别记录下它们的指标(白天40m、白天60m、夜晚40m、夜晚60m) - -- 训练好的模型下载地址:[ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams) - -- 测试命令如下: - - ```bash - # 夜晚40m - python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=40 - - # 夜晚60m - python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=60 - - # 白天40m - python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=40 - - # 白天60m - python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=60 - ``` - - 在Oxford RobotCar dataset的validation数据集上的测试指标如下: - - | version | Max Depth | Abs Rel | Sq Rel | RMSE | RMSE log | | | | - | ----------- | --------- | ------- | ------ | ----- | -------- | ----------------- | ------------------- | ------------------- | - | ours(night) | 40 | 0.209 | 1.741 | 6.031 | 0.243 | 0.708 | 0.923 | 0.975 | - | ours(night) | 60 | 0.207 | 2.052 | 7.888 | 0.258 | 0.686 | 0.909 | 0.970 | - | ours(day) | 40 | 0.114 | 0.574 | 3.411 | 0.157 | 0.860 | 0.977 | 0.993 | - | ours(day) | 60 | 0.119 | 0.793 | 4.842 | 0.173 | 0.838 | 0.967 | 0.991 | - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/estimation/adds/adds.yaml -p data/ADDS_car.pdparams -o inference/ADDS -``` - -上述命令将生成预测所需的模型结构文件`ADDS.pdmodel`和模型权重文件`ADDS.pdiparams`以及`ADDS.pdiparams.info`文件,均存放在`inference/ADDS/`目录下 - -上述bash命令中各个参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.png \ - --config configs/estimation/adds/adds.yaml \ - --model_file inference/ADDS/ADDS.pdmodel \ - --params_file inference/ADDS/ADDS.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -推理结束会默认以伪彩的方式保存下模型估计出的深度图。 - -以下是样例图片和对应的预测深度图: - -image - -depth - - -## 参考论文 - -- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun diff --git a/docs/zh-CN/model_zoo/localization/bmn.md b/docs/zh-CN/model_zoo/localization/bmn.md deleted file mode 100644 index f923e86a6..000000000 --- a/docs/zh-CN/model_zoo/localization/bmn.md +++ /dev/null @@ -1,128 +0,0 @@ -[English](../../../en/model_zoo/localization/bmn.md) | 简体中文 - -# BMN 视频动作定位模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - 
-BMN模型是百度自研,2019年ActivityNet夺冠方案,为视频动作定位问题中proposal的生成提供高效的解决方案,在PaddlePaddle上首次开源。此模型引入边界匹配(Boundary-Matching, BM)机制来评估proposal的置信度,按照proposal开始边界的位置及其长度将所有可能存在的proposal组合成一个二维的BM置信度图,图中每个点的数值代表其所对应的proposal的置信度分数。网络由三个模块组成,基础模块作为主干网络处理输入的特征序列,TEM模块预测每一个时序位置属于动作开始、动作结束的概率,PEM模块生成BM置信度图。 - -AI Studio项目使用链接:[ActivityNet Challenge 2019 冠军模型:BMN](https://aistudio.baidu.com/aistudio/projectdetail/2250674?contributionType=1) - -
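To make the Boundary-Matching idea above concrete, the minimal NumPy sketch below decodes a BM confidence map into scored `(start, end)` proposals. The `[D, T]` layout (proposal duration × start position), the score threshold and the helper name `bm_map_to_proposals` are illustrative assumptions, not PaddleVideo's actual post-processing code.

```python
# Minimal sketch (not PaddleVideo's post-processing): decode a BM confidence map
# into candidate proposals. Assumes the map has shape [D, T], where column t is a
# proposal's start index and row d its duration in temporal-feature units.
import numpy as np

def bm_map_to_proposals(bm_map, duration_second, score_thresh=0.5):
    """Convert a [D, T] BM confidence map into (start_s, end_s, score) tuples."""
    D, T = bm_map.shape
    t_unit = duration_second / T                 # seconds covered by one temporal position
    proposals = []
    for d in range(D):                           # proposal length, in temporal units
        for t in range(T):                       # proposal start index
            end = t + d + 1
            if end > T:                          # proposal would run past the video
                continue
            score = float(bm_map[d, t])          # confidence of this (start, length) pair
            if score >= score_thresh:
                proposals.append((t * t_unit, end * t_unit, score))
    return sorted(proposals, key=lambda p: p[2], reverse=True)

# toy usage: a random 32x100 map for a 200-second video
print(bm_map_to_proposals(np.random.rand(32, 100), 200.0)[:5])
```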

-[Figure: BMN Overview]

    - -## 数据准备 - -BMN的训练数据采用ActivityNet1.3提供的数据集,数据下载及准备请参考[ActivityNet数据说明](../../dataset/ActivityNet.md) - -## 模型训练 - -数据准备完毕后,可以通过如下方式启动训练: - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -python -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml -``` - -- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型 - -### 单卡训练 - -单卡训练请将配置文件中的`DATASET.batch_size`字段修改为16,如下: - -```yaml -DATASET: #DATASET field - batch_size: 16 #single card bacth size -``` - -单卡训练启动方式如下: - -```bash -python -B main.py --validate -c configs/localization/bmn.yaml -``` - - -## 模型测试 - -可通过如下方式进行模型测试: - -```bash -python main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00009.pdparams -o DATASET.test_batch_size=1 -``` - -- 目前仅支持**单卡**, `batch_size`为**1**进行模型测试, - -- 请下载[activity\_net\_1\_3\_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json)文件,并通过`METRIC.ground_truth_filename`字段指定该ground_truth文件,相较于原始的activity\_net.v1-3.min.json文件,我们过滤了其中一些失效的视频条目。 - -- 通过 `-w`参数指定待测试模型文件的路径,您可以下载我们训练好的模型进行测试[BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams) - -- 上述程序会将运行结果保存在配置文件`METRIC.output_path`字段指定的路径,默认为`data/bmn/BMN_Test_output`文件夹下,测试结果保存在配置文件`METRIC.result_path`字段指定的文件,默认为`data/bmn/BMN_Test_results/bmn_results_validation.json`文件。 - -- 我们基于ActivityNet官方提供的测试脚本,计算AR@AN和AUC。具体计算过程请参考[anet_prop.py](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/metrics/ActivityNet/anet_prop.py)文件。 - -- 注:评估时可能会出现loss为nan的情况。这是由于评估时用的是单个样本,可能存在没有iou>0.6的样本,所以为nan,对最终的评估结果没有影响。 - -在ActivityNet1.3数据集下评估精度如下: - -| AR@1 | AR@5 | AR@10 | AR@100 | AUC | -| :---: | :---: | :---: | :---: | :---: | -| 33.26 | 49.48 | 56.86 | 75.19 | 67.23% | - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/localization/bmn.yaml \ - -p data/BMN.pdparams \ - -o inference/BMN -``` - -上述命令将生成预测所需的模型结构文件`BMN.pdmodel`和模型权重文件`BMN.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example_feat.list \ - --config configs/localization/bmn.yaml \ - --model_file inference/BMN/BMN.pdmodel \ - --params_file inference/BMN/BMN.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -- `input_file`为文本文件,指定待推断的文件信息,包括特征文件路径`feat_path`和视频时长(单位:s)`duration_second`。 - -输出示例如下: - -``` -BMN Inference results of data/example_feat.npy : -{'score': 0.7968077063560486, 'segment': [0.0, 122.9877]} -{'score': 0.49097609519958496, 'segment': [12.423000000000002, 124.23]} -{'score': 0.21395835280418396, 'segment': [39.7536, 122.9877]} -{'score': 0.2106524258852005, 'segment': [0.0, 109.3224]} -{'score': 0.06876271963119507, 'segment': [23.6037, 114.2916]} -``` - -- 默认只打印前5个得分最高的proposal,所有的预测结果可在输出文件中查看,默认输出文件路径为`data/bmn/BMN_INFERENCE_results`。输出路径可在配置文件中的`INFERENCE.result_path`自行修改。 - -## 参考论文 - -- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen. 
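As a supplement to the inference output shown above, where each proposal is printed as a `{'score': ..., 'segment': [start, end]}` dict, the sketch below applies a simple greedy temporal NMS to keep only high-confidence, non-overlapping segments. It is a generic post-filtering example under that output format, not the post-processing built into PaddleVideo.

```python
# Illustrative only: greedy temporal NMS over proposals in the printed format
# {'score': float, 'segment': [start_s, end_s]}; not PaddleVideo's built-in logic.
def temporal_iou(seg_a, seg_b):
    inter = max(0.0, min(seg_a[1], seg_b[1]) - max(seg_a[0], seg_b[0]))
    union = (seg_a[1] - seg_a[0]) + (seg_b[1] - seg_b[0]) - inter
    return inter / union if union > 0 else 0.0

def nms_proposals(proposals, iou_thresh=0.5, top_k=100):
    kept = []
    for prop in sorted(proposals, key=lambda p: p['score'], reverse=True):
        if all(temporal_iou(prop['segment'], k['segment']) < iou_thresh for k in kept):
            kept.append(prop)
        if len(kept) >= top_k:
            break
    return kept

proposals = [
    {'score': 0.7968, 'segment': [0.0, 122.9877]},
    {'score': 0.4910, 'segment': [12.423, 124.23]},
    {'score': 0.2140, 'segment': [39.7536, 122.9877]},
]
print(nms_proposals(proposals, iou_thresh=0.9))
```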
diff --git a/docs/zh-CN/model_zoo/localization/yowo.md b/docs/zh-CN/model_zoo/localization/yowo.md deleted file mode 100644 index 7aac7a937..000000000 --- a/docs/zh-CN/model_zoo/localization/yowo.md +++ /dev/null @@ -1,135 +0,0 @@ -[English](../../../en/model_zoo/localization/yowo.md) | 简体中文 - -# YOWO 视频动作检测模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -YOWO是具有两个分支的单阶段网络。一个分支通过2D-CNN提取关键帧(即当前帧)的空间特征,而另一个分支则通过3D-CNN获取由先前帧组成的剪辑的时空特征。为准确汇总这些特征,YOWO使用了一种通道融合和关注机制,最大程度地利用了通道间的依赖性。最后将融合后的特征进行帧级检测。 - -
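The sketch below walks through the two-branch flow just described: a 2D-CNN on the key frame, a 3D-CNN on the clip, channel fusion with attention, and the fused feature handed to a frame-level detection head. Both backbones are replaced by random stand-ins, and the channel counts and Gram-style attention are illustrative assumptions, not the PaddleVideo YOWO implementation.

```python
# Conceptual sketch of the YOWO two-branch flow described above; shapes and the
# fusion step are illustrative stand-ins, not the actual implementation.
import numpy as np

def backbone_2d(key_frame):                      # stand-in for the 2D-CNN (e.g. Darknet-19)
    return np.random.rand(1, 1024, 7, 7)         # [N, C2d, H, W] spatial features

def backbone_3d(clip):                           # stand-in for the 3D-CNN (e.g. 3D-ResNext-101)
    return np.random.rand(1, 2048, 7, 7)         # [N, C3d, H, W], time already pooled away

def channel_fusion_attention(feat_2d, feat_3d):
    fused = np.concatenate([feat_2d, feat_3d], axis=1)       # concatenate along channels
    n, c, h, w = fused.shape
    flat = fused.reshape(n, c, h * w)
    attn = flat @ flat.transpose(0, 2, 1)                    # [N, C, C] channel affinities
    attn = attn / attn.sum(axis=-1, keepdims=True)           # normalize each channel's weights
    return (attn @ flat).reshape(n, c, h, w)                 # re-mix channels by affinity

clip = np.random.rand(1, 3, 16, 224, 224)        # 16-frame input clip
key_frame = clip[:, :, -1]                       # the current (last) frame
feat = channel_fusion_attention(backbone_2d(key_frame), backbone_3d(clip))
print(feat.shape)                                # fused per-cell feature map for the detection head
```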
    - - -## 数据准备 - -UCF101-24数据下载及准备请参考[UCF101-24数据准备](../../dataset/ucf24.md) - - -## 模型训练 - -### UCF101-24数据集训练 - -#### 下载并添加预训练模型 - -1. 下载预训练模型 [resnext101_kinetics](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams) 和 [darknet](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam) 作为Backbone初始化参数,或通过wget命令下载 - - ```bash - wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam - wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams - ``` - -2. 打开`PaddleVideo/configs/localization/yowo.yaml`,将下载好的权重存放路径分别填写到下方`pretrained_2d:`和`pretrained_3d:`之后 - - ```yaml - MODEL: - framework: "YOWOLocalizer" - backbone: - name: "YOWO" - num_class: 24 - pretrained_2d: 将2D预训练模型路径填写到此处 - pretrained_3d: 将3D预训练模型路径填写到此处 - ``` - -#### 开始训练 - -- UCF101-24数据集使用单卡训练,训练方式的启动命令如下: - - ```bash - python3 main.py -c configs/localization/yowo.yaml --validate --seed=1 - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - python3 main.py --amp -c configs/localization/yowo.yaml --validate --seed=1 - ``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../contribute/config.md)。 - - -## 模型测试 - -- YOWO 模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - - ``` - Already save the best model (fsocre)0.8779 - ``` - -- 由于 YOWO 模型测试模式的评价指标为的**Frame-mAP (@ IoU 0.5)**,与训练过程中验证模式采用的**fscore**不同,所以训练日志中记录的验证指标`fscore`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下: - - ```bash - python3 main.py -c configs/localization/yowo.yaml --test --seed=1 -w 'output/YOWO/YOWO_epoch_00005.pdparams' - ``` - - - 当测试配置采用如下参数时,在UCF101-24的test数据集上的测试指标如下: - - | Model | 3D-CNN backbone | 2D-CNN backbone | Dataset |Input | Frame-mAP
    (@ IoU 0.5) | checkpoints | - | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | - | YOWO | 3D-ResNext-101 | Darknet-19 | UCF101-24 | 16-frames, d=1 | 80.94 | [YOWO.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams) | - - -## 模型推理 - -### 导出inference模型 - -```bash -python3 tools/export_model.py -c configs/localization/yowo.yaml -p 'output/YOWO/YOWO_epoch_00005.pdparams' -``` - -上述命令将生成预测所需的模型结构文件`YOWO.pdmodel`和模型权重文件`YOWO.pdiparams`。 - -- 各参数含义可参考[模型推理方法](../../usage.md#5-模型推理) - -### 使用预测引擎推理 - -- 下载测试视频[HorseRiding.avi](https://videotag.bj.bcebos.com/Data/HorseRiding.avi)以快速体验,或通过wget命令下载,下载的视频放到`data/ucf24`目录下: - -```bash -wget -nc https://videotag.bj.bcebos.com/Data/HorseRiding.avi -``` - -- 运行以下命令进行推理: - -```bash -python3 tools/predict.py -c configs/localization/yowo.yaml -i 'data/ucf24/HorseRiding.avi' --model_file ./inference/YOWO.pdmodel --params_file ./inference/YOWO.pdiparams -``` - -- 推理结束后,将在`inference/YOWO_infer`目录下保存图片形式的预测结果。可通过运行以下命令,将图片序列转换为gif动图,以完成最终可视化: - -``` -python3 data/ucf24/visualization.py --frames_dir ./inference/YOWO_infer/HorseRiding --duration 0.04 -``` - -最终生成的可视化结果如下: - -
-[Figure: Horse Riding prediction visualization (GIF)]
    - -可以看到,使用在UCF101-24上训练好的YOWO模型对```data/ucf24/HorseRiding.avi```进行预测,每张帧输出的类别均为HorseRiding,置信度为0.80左右。 - -## 参考论文 - -- [You Only Watch Once: A Unified CNN Architecture for Real-Time Spatiotemporal Action Localization](https://arxiv.org/pdf/1911.06644.pdf), Köpüklü O, Wei X, Rigoll G. \ No newline at end of file diff --git a/docs/zh-CN/model_zoo/multimodal/actbert.md b/docs/zh-CN/model_zoo/multimodal/actbert.md deleted file mode 100644 index 3853968c2..000000000 --- a/docs/zh-CN/model_zoo/multimodal/actbert.md +++ /dev/null @@ -1,103 +0,0 @@ -[English](../../../en/model_zoo/multimodal/actbert.md) | 简体中文 - -# ActBERT多模态预训练模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [参考论文](#参考论文) - -在开始使用之前,您需要按照以下命令安装额外的依赖包: -```bash -python -m pip install paddlenlp -python -m pip install lmdb -``` - -## 模型简介 - -ActBERT是百度在CVPR2020提出的多模态预训练模型,它结合输入文本、图像和视频动作三种模态,使用一种全新的纠缠编码模块从三个来源进行多模态特征学习,以增强两个视觉输入和语言之间的互动功能。模型采用RandomMask和NSP的方式进行训练,在文本视频搜索、视频描述生成等5个下游任务中表现优异。 - -
    -
    -
    - - -## 数据准备 - -HowTo100M数据下载及准备请参考[HowTo100M数据准备](../../dataset/howto100m.md) - -MSR-VTT数据下载及准备请参考[MSR-VTT数据准备](../../dataset/msrvtt.md) - - -## 模型训练 - -### HowTo100M数据集训练 - -#### 下载并添加预训练模型 - -下载BERT预训练模型[bert-base-uncased](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams)作为Backbone初始化参数,或是通过命令行下载 - -```bash -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams -``` - -并将文件路径添加到配置文件中的`MODEL.framework.backbone.pretrained`字段,如下: - -```yaml -MODEL: - framework: "ActBert" - backbone: - name: "BertForMultiModalPreTraining" - pretrained: 将路径填写到此处 -``` - -- 由于训练数据集过大,本代码提供小数据训练功能,训练配置仅供参考~ - -#### 开始训练 - -- 训练启动命令如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --validate -c configs/multimodal/actbert/actbert.yaml -``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - -```bash -export FLAGS_conv_workspace_size_limit=800 #MB -export FLAGS_cudnn_exhaustive_search=1 -export FLAGS_cudnn_batchnorm_spatial_persistent=1 - -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --amp --validate -c configs/multimodal/actbert/actbert.yaml -``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的。 - - -## 模型测试 - -- 对下游任务:文本-视频检索,在MSR-VTT数据集上评估性能,评估脚本启动方式如下: - - -```bash -python3.7 main.py --test -c configs/multimodal/actbert/actbert_msrvtt.yaml -w Actbert.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - - -MSR-VTT数据集测试精度: - -| R@1 | R@5 | R@10 | Median R | Mean R | checkpoints | -| :------: | :----------: | :----: | :----: | :----: | :----: | -| 8.6 | 31.2 | 45.5 | 13.0 | 28.5 | [ActBERT.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ActBERT.pdparams) | - - -## 参考论文 - -- [ActBERT: Learning Global-Local Video-Text Representations -](https://arxiv.org/abs/2011.07231), Linchao Zhu, Yi Yang diff --git a/docs/zh-CN/model_zoo/partition/transnetv2.md b/docs/zh-CN/model_zoo/partition/transnetv2.md deleted file mode 100644 index 51c251094..000000000 --- a/docs/zh-CN/model_zoo/partition/transnetv2.md +++ /dev/null @@ -1,85 +0,0 @@ -[English](../../../en/model_zoo/partition/transnetv2.md) | 简体中文 - -# TransNetV2视频切分模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - -在开始使用之前,您需要按照以下命令安装额外的依赖包: -```bash -python -m pip install ffmpeg-python==0.2.0 -``` - -## 模型简介 - -TransNetV2是一种基于深度学习的视频切分模型,通过DDCNN V2结构进行特征学习,并加入RGB颜色直方图、视频帧相似度进行更有效的特征提取,最终获取每一帧是否是镜头边界帧的概率,从而完成视频切分。该算法效果较好,且计算高效,十分适合工业落地。 - -![](../../../images/transnetv2.png) - -本代码当前仅支持模型推理,模型的训练和测试将在后续提供。 - - -## 数据准备 - -coming soon - - -## 模型训练 - -coming soon - - -## 模型测试 - -coming soon - - -## 模型推理 - -下载在ClipShots和TRECVID IACC.3上训练好的TransNetV2模型参数 [TransNetV2_shots.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams),也可以通过命令行下载 - -```bash -wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams -``` - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/partitioners/transnetv2/transnetv2.yaml -p data/TransNetV2_shots.pdparams -o inference/TransNetV2 -``` - -上述命令将生成预测所需的模型结构文件`TransNetV2.pdmodel`和模型权重文件`TransNetV2.pdiparams`以及`TransNetV2.pdiparams.info`文件,均存放在`inference/TransNetV2/`目录下 - -上述bash命令中各个参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config 
configs/partitioners/transnetv2/transnetv2.yaml \ - --model_file inference/TransNetV2/TransNetV2.pdmodel \ - --params_file inference/TransNetV2/TransNetV2.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -通过定义配置文件`transnetv2.yaml`中`output_path`参数,可以将每帧的预测概率输出到`{output_path}/example_predictions.txt`中,预测得到的镜头边界输出到`{output_path}/example_scenes.txt`中。 -通过定义配置文件`transnetv2.yaml`中`visualize`参数为True,可以将预测结果可视化,可视化结果保存至`{output_path}/example_vis.png`。 - -输出示例如下: - -```bash -Current video file: data/example.avi - Shot Boundarys: [[ 0 130]] -``` - -可以看到,使用TransNetV2模型对`data/example.avi`进行预测,输出的视频镜头边界帧为[0,130]。 -## 参考论文 - -- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tomáš Souček, Jakub Lokoč diff --git a/docs/zh-CN/model_zoo/recognition/agcn.md b/docs/zh-CN/model_zoo/recognition/agcn.md deleted file mode 100644 index a7a1dce56..000000000 --- a/docs/zh-CN/model_zoo/recognition/agcn.md +++ /dev/null @@ -1,134 +0,0 @@ -[English](../../../en/model_zoo/recognition/agcn.md) | 简体中文 - -# AGCN基于骨骼的行为识别模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - - -我们对[ST-GCN模型](./stgcn.md)进行了优化,实现了精度更高的AGCN模型,模型优化细节参考[AGCN模型解析](https://www.bilibili.com/video/BV1w3411172G). - - -## 数据准备 - -花样滑冰比赛数据下载及准备请参考[花样滑冰数据准备](../../dataset/fsd.md) - -NTU-RGBD数据下载及准备请参考[NTU-RGBD数据准备](../../dataset/ntu-rgbd.md) - -## 模型训练 - -### 花样滑冰比赛数据集训练 - -- 花样滑冰比赛数据集使用单卡训练,启动命令如下: - -```bash -python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml -``` - -- 由于赛事未提供验证集数据,因此训练时不做valid。 - -- 您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,参数用法请参考[config](../../contribute/config.md)。 - -### NTU-RGBD数据集训练 - -- NTU-RGBD数据集使用4卡训练,启动命令如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_agcn main.py --validate -c configs/recognition/agcn/agcn_ntucs.yaml -``` - -- `agcn_ntucs.yaml`配置文件为NTU-RGB+D数据集按cross-subject划分方式对应的训练配置。 - - -## 模型测试 - -### 花样滑冰比赛数据集模型测试 - -- 模型测试的启动命令如下: - -```bash -python3.7 main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -w output/AGCN/AGCN_epoch_00100.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -- 评估结果保存在submission.csv文件中,可在[评测官网](https://aistudio.baidu.com/aistudio/competition/detail/115)提交查看得分。 - -模型在花样滑冰比赛数据集上baseline实验精度如下: - -| Test_Data | Top-1 | checkpoints | -| :----: | :----: | :---- | -| Test_A | 62.29 | [AGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams) | - - -### NTU-RGB+D数据集模型测试 - -- 模型测试的启动命令如下: - -```bash -python3.7 main.py --test -c configs/recognition/agcn/agcn_ntucs.yaml -w output/AGCN/AGCN_best.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -模型在NTU-RGB+D数据集上实验精度如下: - -| split | Top-1 | checkpoints | -| :----: | :----: | :---- | -| cross-subject | 83.27 | [AGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_ntucs.pdparams) | - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml \ - -p data/AGCN_fsd.pdparams \ - -o inference/AGCN -``` - -上述命令将生成预测所需的模型结构文件`AGCN.pdmodel`和模型权重文件`AGCN.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \ - --config configs/recognition/agcn/agcn_fsd.yaml \ - --model_file inference/AGCN/AGCN.pdmodel \ - 
--params_file inference/AGCN/AGCN.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -``` -Current video file: data/fsd10/example_skeleton.npy - top-1 class: 27 - top-1 score: 0.8965644240379333 -``` - -可以看到,使用在FSD上训练好的AGCN模型对`data/example_skeleton.npy`进行预测,输出的top1类别id为`27`,置信度为0.89。 - -## 参考论文 - -- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin - -- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu - -- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu - -- Many thanks to [li7819559](https://github.com/li7819559) and [ZhaoJingjing713](https://github.com/ZhaoJingjing713) for contributing the code. diff --git a/docs/zh-CN/model_zoo/recognition/agcn2s.md b/docs/zh-CN/model_zoo/recognition/agcn2s.md deleted file mode 100644 index c2a59c3f3..000000000 --- a/docs/zh-CN/model_zoo/recognition/agcn2s.md +++ /dev/null @@ -1,107 +0,0 @@ -[English](../../../en/model_zoo/recognition/agcn2s.md) | 简体中文 - -# 2s-AGCN基于骨骼的行为识别模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - -## 模型简介 - -![模型结构图](../../../images/agcn2s.png) - -[2s-AGCN](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf)是发表在CVPR2019上的一篇针对ST-GCN进行改进的文章,文章提出双流自适应卷积网络,针对原始ST-GCN的缺点进行了改进。在现有的基于GCN的方法中,图的拓扑是手动设置的,并且固定在所有图层和输入样本上。另外,骨骼数据的二阶信息(骨骼的长度和方向)对于动作识别自然是更有益和更具区分性的,在当时方法中很少进行研究。因此,文章主要提出一个基于骨架节点和骨骼两种信息融合的双流网络,并在图卷积中的邻接矩阵加入自适应矩阵,大幅提升骨骼动作识别的准确率,也为后续的工作奠定了基础(后续的骨骼动作识别基本都是基于多流的网络框架)。 - -## 数据准备 - -数据下载及处理与CTR-GCN一致,详情请参考[NTU-RGBD数据准备](../../dataset/ntu-rgbd.md) - -## 模型训练 - -### NTU-RGBD数据集训练 - -模型训练参数的配置文件均在`configs/recognition/agcn2s/`文件夹中,启动命令如下: - -```bash -# train cross subject with bone data -python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_bone.yaml --seed 1 -# train cross subject with joint data -python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_joint.yaml --seed 1 -# train cross view with bone data -python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_bone.yaml --seed 1 -# train cross view with joint data -python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_joint.yaml --seed 1 -``` - -## 模型测试 - -### NTU-RGBD数据集模型测试 - -模型测试参数的配置文件均在`configs/recognition/agcn2s/`文件夹中,启动命令如下: - -```bash -# test cross subject with bone data -python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_bone.yaml -w data/2SAGCN_ntucs_bone.pdparams -# test cross subject with joint data -python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_joint.yaml -w data/2SAGCN_ntucs_joint.pdparams -# test cross view with bone data -python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_bone.yaml -w data/2SAGCN_ntucv_bone.pdparams -# test cross view with joint data -python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_joint.yaml -w data/2SAGCN_ntucv_joint.pdparams -``` - -* 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -模型在NTU-RGBD数据集上的测试效果如下 - -| | CS | CV | -| :------------: | :---: | :----: | -| Js-AGCN(joint) | 85.8% | 94.13% | -| Bs-AGCN(bone) | 86.7% | 93.9% | - 
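To illustrate how the joint and bone streams in the table above are combined into the final "2s" prediction, the sketch below derives bone data as the second-order difference between a child joint and its parent, and then adds the per-class scores of the two trained models. The `(child, parent)` pairs are truncated and the fusion weight is an assumption, not the exact NTU-RGBD configuration used in this repo.

```python
# Hedged sketch: bone-modality construction and two-stream score fusion.
# The joint-pair list below is truncated/illustrative, not the full NTU skeleton.
import numpy as np

NTU_PAIRS = [(2, 1), (3, 21), (4, 3), (21, 2)]   # (child, parent), 1-based, truncated

def joints_to_bones(joints, pairs=NTU_PAIRS):
    """joints: [C=3, T, V=25, M]; bones[child] = joint[child] - joint[parent]."""
    bones = np.zeros_like(joints)
    for child, parent in pairs:
        bones[:, :, child - 1] = joints[:, :, child - 1] - joints[:, :, parent - 1]
    return bones

def fuse_two_streams(joint_scores, bone_scores, alpha=1.0):
    """2s fusion: add the per-class scores produced by the two trained models."""
    return joint_scores + alpha * bone_scores

joints = np.random.rand(3, 300, 25, 2)           # toy skeleton sequence
bones = joints_to_bones(joints)
joint_scores, bone_scores = np.random.rand(60), np.random.rand(60)   # 60 NTU classes
print(int(np.argmax(fuse_two_streams(joint_scores, bone_scores))))
```

The same additive fusion extends naturally if extra streams (e.g. motion) are trained later.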
-训练日志:[日志](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/work_dir/ntu) - -VisualDL可视化日志:[VDL](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/runs) - -模型权重如下: - -| | CS-Js | CS-Bs | CV-JS | CV-Bs | -| ---- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| 地址 | [ntu_cs_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_joint-48-30674.pdparams) | [ntu_cs_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_bone-44-28170.pdparams) | [ntu_cv_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_joint-38-22932.pdparams) | [ntu_cv_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_bone-49-29400.pdparams) | - -## 模型推理 - -### 导出inference模型(以cs_joint为例) - -```bash -python3.7 tools/export_model.py -c configs/recognition/agcn2s/agcn2s_ntucs_joint.yaml \ - -p data/AGCN2s_ntucs_joint.pdparams \ - -o inference/AGCN2s_ntucs_joint -``` - -上述命令将生成预测所需的模型结构文件`AGCN2s_ntucs_joint.pdmodel`和模型权重文件`AGCN2s_ntucs_joint.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \ - --config configs/recognition/agcn2s/2sagcn_ntucs_joint.yaml \ - --model_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdmodel \ - --params_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -### 预测引擎推理结果 -![预测引擎推理结果图](../../../images/agcn2s_result.png) - -## 参考论文 - -- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf), Lei Shi and Yifan Zhang and Jian Cheng and Hanqing Lu diff --git a/docs/zh-CN/model_zoo/recognition/attention_lstm.md b/docs/zh-CN/model_zoo/recognition/attention_lstm.md deleted file mode 100644 index df04f07f2..000000000 --- a/docs/zh-CN/model_zoo/recognition/attention_lstm.md +++ /dev/null @@ -1,86 +0,0 @@ -简体中文 | [English](../../../en/model_zoo/recognition/attention_lstm.md) - -# AttentionLSTM - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - -## 模型简介 - -循环神经网络(RNN)常用于序列数据的处理,可建模视频连续多帧的时序信息,在视频分类领域为基础常用方法。 -该模型采用了双向长短时记忆网络(LSTM),将视频的所有帧特征依次编码。与传统方法直接采用LSTM最后一个时刻的输出不同,该模型增加了一个Attention层,每个时刻的隐状态输出都有一个自适应权重,然后线性加权得到最终特征向量。参考论文中实现的是两层LSTM结构,而**本模型实现的是带Attention的双向LSTM**。 - -Attention层可参考论文[AttentionCluster](https://arxiv.org/abs/1711.09550) - -## 数据准备 - -PaddleVide提供了在Youtube-8M数据集上训练和测试脚本。Youtube-8M数据下载及准备请参考[YouTube-8M数据准备](../../dataset/youtube8m.md) - -## 模型训练 - -### Youtube-8M数据集训练 - -#### 开始训练 - -- Youtube-8M数据集使用8卡训练,feature格式下会使用视频和音频特征作为输入,数据的训练启动命令如下 - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml - ``` - -## 模型测试 - -命令如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --test -c 
configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml -w "output/AttentionLSTM/AttentionLSTM_best.pdparams" -``` - -当测试配置采用如下参数时,在Youtube-8M的validation数据集上的测试指标如下: - -| Hit@1 | PERR | GAP | checkpoints | -| :-----: | :---------: | :---: | ----- | -| 89.05 | 80.49 | 86.30 | [AttentionLSTM_yt8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams) | - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \ - -p data/AttentionLSTM_yt8.pdparams \ - -o inference/AttentionLSTM -``` - -上述命令将生成预测所需的模型结构文件`AttentionLSTM.pdmodel`和模型权重文件`AttentionLSTM.pdiparams`。 - -各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-模型推理) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.pkl \ - --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \ - --model_file inference/AttentionLSTM/AttentionLSTM.pdmodel \ - --params_file inference/AttentionLSTM/AttentionLSTM.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` -输出示例如下: -```bash -Current video file: data/example.pkl - top-1 class: 11 - top-1 score: 0.9841002225875854 -``` -可以看到,使用在Youtube-8M上训练好的AttentionLSTM模型对data/example.pkl进行预测,输出的top1类别id为11,置信度为0.98。 -## 参考论文 - -- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen -- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan - diff --git a/docs/zh-CN/model_zoo/recognition/ctrgcn.md b/docs/zh-CN/model_zoo/recognition/ctrgcn.md deleted file mode 100644 index 8c39f2686..000000000 --- a/docs/zh-CN/model_zoo/recognition/ctrgcn.md +++ /dev/null @@ -1,132 +0,0 @@ -[English](../../../en/model_zoo/recognition/ctrgcn.md) | 简体中文 - -# CTR-GCN基于骨骼的行为识别模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -[CTR-GCN](https://github.com/Uason-Chen/CTR-GCN.git)是ICCV 2021提出的基于骨骼的行为识别模型,通过将改动应用在具有拓扑结构的人体骨骼数据上的图卷积,使用时空图卷积提取时空特征进行行为识别,提升了基于骨骼的行为识别任务精度。 - -
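As a rough intuition for the "channel-wise topology refinement" mentioned above, the sketch below refines a shared skeleton adjacency with a feature-dependent correction computed per channel group before the graph convolution. The group count, the `tanh` refinement and the `0.1` scale are purely illustrative assumptions and do not mirror the CTR-GCN code.

```python
# Conceptual sketch only: per-channel-group refinement of the skeleton topology
# before graph convolution; sizes and the refinement function are illustrative.
import numpy as np

def channel_wise_gcn(x, A_shared, num_groups=8):
    """x: [C, T, V] skeleton features, A_shared: [V, V] physical adjacency."""
    C, T, V = x.shape
    group_size = C // num_groups
    out = np.zeros_like(x)
    for g in range(num_groups):
        xg = x[g * group_size:(g + 1) * group_size]           # [Cg, T, V]
        joint_feat = xg.mean(axis=1)                          # [Cg, V], pool over time
        refine = np.tanh(joint_feat.T @ joint_feat)           # [V, V] data-dependent correction
        A_g = A_shared + 0.1 * refine                         # refined topology for this group
        out[g * group_size:(g + 1) * group_size] = xg @ A_g   # aggregate features over joints
    return out

x = np.random.rand(64, 32, 25)                   # 64 channels, 32 frames, 25 joints
A = (np.random.rand(25, 25) > 0.8).astype(float)
print(channel_wise_gcn(x, A).shape)
```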
    -
    -
    - - -## 数据准备 - -NTU-RGBD数据下载及准备请参考[NTU-RGBD数据准备](../../dataset/ntu-rgbd.md) - - -## 模型训练 - -### NTU-RGBD数据集训练 - -- NTU-RGBD数据集,默认使用单卡训练,启动命令如下: - -```bash -# joint modality -python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml --seed 1 - -# bone modality -python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml --seed 1 - -# motion modality -python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml --seed 1 - -# bone motion modality -python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml --seed 1 -``` - -- 若使用4卡训练,请线性调整学习率,训练启动命令如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_ctrgcn main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml -``` - -- 配置文件`ctrgcn_ntucs_joint.yaml`为NTU-RGB+D数据集按cross-subject划分方式对应的训练配置。 - - -## 模型测试 - -### NTU-RGB+D数据集模型测试 - -- 模型测试的启动命令如下: - -```bash -# joint modality -python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml -w data/CTRGCN_ntucs_joint.pdparams - -# bone modality -python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml -w data/CTRGCN_ntucs_bone.pdparams - -# motion modality -python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml -w data/CTRGCN_ntucs_motion.pdparams - -# bone motion modality -python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml -w data/CTRGCN_ntucs_bone_motion.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -模型在NTU-RGB+D数据集上实验精度如下: - -| split | modality | Top-1 | checkpoints | -| :----: | :----: | :----: | :----: | -| cross-subject | joint | 89.93 | [CTRGCN_ntucs_joint.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_joint.pdparams) | -| cross-subject | bone | 85.24 | [CTRGCN_ntucs_bone.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone.pdparams) | -| cross-subject | motion | 85.33 | [CTRGCN_ntucs_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_motion.pdparams) | -| cross-subject | bone motion | 84.53 | [CTRGCN_ntucs_bone_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone_motion.pdparams) | - - - - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \ - -p data/CTRGCN_ntucs_joint.pdparams \ - -o inference/CTRGCN -``` - -上述命令将生成预测所需的模型结构文件`CTRGCN_joint.pdmodel`和模型权重文件`CTRGCN_joint.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \ - --config configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \ - --model_file inference/CTRGCN_joint/CTRGCN_joint.pdmodel \ - --params_file inference/CTRGCN_joint/CTRGCN_joint.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -``` -Current video file: data/example_NTU-RGB-D_sketeton.npy - top-1 class: 4 - top-1 score: 0.999988317489624 -``` - -可以看到,使用在NTU-RGBD数据集上训练好的ST-GCN模型对`data/example_NTU-RGB-D_sketeton.npy`进行预测,输出的top1类别id为`4`,置信度为0.999988317489624。 - - -## 参考论文 - -- [Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213), Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming diff --git 
a/docs/zh-CN/model_zoo/recognition/movinet.md b/docs/zh-CN/model_zoo/recognition/movinet.md deleted file mode 100644 index 6f5ba7dd4..000000000 --- a/docs/zh-CN/model_zoo/recognition/movinet.md +++ /dev/null @@ -1,90 +0,0 @@ -[English](../../../en/model_zoo/recognition/movinet.md) | 简体中文 - -# MoViNet视频分类模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -MoViNet是Google Research研发的移动视频网络。它使用神经结构搜索的方法来搜索MoViNet空间结构,使用因果卷积算子和流缓冲区来弥补准确率的损失,Temporal Ensembles提升准确率,是一个可以用于在线推理视频流的,轻量高效视频模型。 - -## 数据准备 - -Kinetics-400数据下载及准备请参考[kinetics-400数据准备](../../dataset/k400.md) - -## 模型训练 - -数据准备完成后,可通过如下方式启动训练: - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_movinet main.py --validate -c configs/recognition/movinet/movinet_k400_frame.yaml -``` - -## 模型测试 - -- MoViNet模型在训练时同步进行测试,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - -```txt -Already save the best model (top1 acc)0.6489 -``` - -- 若需单独运行测试代码,其启动命令如下: - -```bash -python3.7 main.py --test -c configs/recognition/movinet/movinet_k400_frame.yaml -w output/MoViNet/MoViNet_best.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -当测试配置采用如下参数时,在Kinetics-400的validation数据集上的评估精度如下: - -| Config | Sampling method | num_seg | target_size | Top-1 | checkpoints | -| :------: | :--------: | :-------: | :-------: | :-----: | :-----: | -| A0 | Uniform | 50 | 172 | 66.62 | [MoViNetA0_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/MoViNetA0_k400.pdparams) | - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/movinet/movinet_k400_frame.yaml \ - -p data/MoViNetA0_k400.pdparams \ - -o inference/MoViNetA0 -``` - -上述命令将生成预测所需的模型结构文件`MoViNetA0.pdmodel`和模型权重文件`MoViNetA0.pdiparams`。 - -各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/movinet/movinet_k400_frame.yaml \ - --model_file inference/MoViNetA0/MoViNet.pdmodel \ - --params_file inference/MoViNetA0/MoViNet.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: -```txt -Current video file: data/example.avi - top-1 class: 5 - top-1 score: 0.7667049765586853 -``` - -## 参考论文 - -- [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511) diff --git a/docs/zh-CN/model_zoo/recognition/posec3d.md b/docs/zh-CN/model_zoo/recognition/posec3d.md deleted file mode 100644 index 42eadc830..000000000 --- a/docs/zh-CN/model_zoo/recognition/posec3d.md +++ /dev/null @@ -1,110 +0,0 @@ -[English](../../../en/model_zoo/recognition/posec3d.md) | 简体中文 - -# PoseC3D基于骨骼的行为识别模型 - ---- -## 内容 - -- [PoseC3D基于骨骼的行为识别模型](#posec3d基于骨骼的行为识别模型) - - [内容](#内容) - - [模型简介](#模型简介) - - [数据准备](#数据准备) - - [模型训练](#模型训练) - - [UCF101数据集训练](#ucf101数据集训练) - - [模型测试](#模型测试) - - [UCF101数据集模型测试](#ucf101数据集模型测试) - - [模型推理](#模型推理) - - [导出inference模型](#导出inference模型) - - [使用预测引擎推理](#使用预测引擎推理) - - [参考论文](#参考论文) - - -## 模型简介 - - 
-人体骨架作为人类行为的一种简洁的表现形式,近年来受到越来越多的关注。许多基于骨架的动作识别方法都采用了图卷积网络(GCN)来提取人体骨架上的特征。尽管在以前的工作中取得了积极的成果,但基于GCN的方法在健壮性、互操作性和可扩展性方面受到限制。在本文中,作者提出了一种新的基于骨架的动作识别方法PoseC3D,它依赖于3D热图堆栈而不是图形序列作为人体骨架的基本表示。与基于GCN的方法相比,PoseC3D在学习时空特征方面更有效,对姿态估计噪声更具鲁棒性,并且在跨数据集环境下具有更好的通用性。此外,PoseC3D可以在不增加计算成本的情况下处理多人场景,其功能可以在早期融合阶段轻松与其他模式集成,这为进一步提升性能提供了巨大的设计空间。在四个具有挑战性的数据集上,PoseC3D在单独用于Keletons和与RGB模式结合使用时,持续获得优异的性能。 - -## 数据准备 - -UCF-101-Skeleton数据集来自mmaction2项目,是由ResNet50作为主干网的Faster-RCNN识别人类,然后使用HRNet-w32实现动作估计。地址如下: - -[https://github.com/open-mmlab/mmaction2/tree/master/tools/data/skeleton](https://github.com/open-mmlab/mmaction2/tree/master/tools/data/skeleton) - -以及预训练模型下载地址: - -[https://aistudio.baidu.com/aistudio/datasetdetail/140593](https://aistudio.baidu.com/aistudio/datasetdetail/140593) - -## 模型训练 - -### UCF101数据集训练 - -- UCF101数据集使用单卡训练,启动命令如下: - -```bash -python3.7 main.py --validate -c configs/recognition/posec3d/posec3d.yaml --weights res3d_k400.pdparams -``` - - - -- 您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,参数用法请参考[config](../../tutorials/config.md)。 - - -## 模型测试 - -### UCF101数据集模型测试 - -- 模型测试的启动命令如下: - -```bash -python3.7 main.py --test -c configs/recognition/posec3d/posec3d.yaml -w output/PoseC3D/PoseC3D_epoch_0012.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - - -模型在UCF101数据集上baseline实验精度如下: - -| Test_Data | Top-1 | checkpoints | -| :----: | :----: | :---- | -| UCF101 test1 | 87.05 | [PoseC3D_ucf101.pdparams]() | - - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/posec3d/posec3d.yaml \ - -p data/PoseC3D_ucf101.pdparams \ - -o inference/PoseC3D -``` - -上述命令将生成预测所需的模型结构文件`PoseC3D.pdmodel`和模型权重文件`PoseC3D.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example_UCF101_skeleton.pkl\ - --config configs/recognition/posec3d/posec3d.yaml \ - --model_file inference/PoseC3D/PoseC3D.pdmodel \ - --params_file inference/PoseC3D/PoseC3D.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -``` -Current video file: data/example_UCF101_skeleton.pkl - top-1 class: 0 - top-1 score: 0.6731489896774292 -``` - -可以看到,使用在UCF101上训练好的PoseC3D模型对`data/example_UCF101_skeleton.pkl`进行预测,输出的top1类别id为`0`,置信度为0.67。 - -## 参考论文 - -- [Revisiting Skeleton-based Action Recognition](https://arxiv.org/pdf/2104.13586v1.pdf), Haodong Duan, Yue Zhao, Kai Chen, Dian Shao, Dahua Lin, Bo Dai diff --git a/docs/zh-CN/model_zoo/recognition/pp-timesformer.md b/docs/zh-CN/model_zoo/recognition/pp-timesformer.md deleted file mode 100644 index 0cb3cf25c..000000000 --- a/docs/zh-CN/model_zoo/recognition/pp-timesformer.md +++ /dev/null @@ -1,157 +0,0 @@ -[English](../../../en/model_zoo/recognition/pp-timesformer.md) | 简体中文 - -# PP-TimeSformer视频分类模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -我们对[TimeSformer模型](./timesformer.md)进行了改进和优化,得到了更高精度的2D实用视频分类模型**PP-TimeSformer**。在不增加参数量和计算量的情况下,在UCF-101、Kinetics-400等数据集上精度显著超过原版,在Kinetics-400数据集上的精度如下表所示。 - -| Version | Top1 | -| :------ | :----: | -| Ours ([swa](#refer-anchor-1)+distill+16frame) | 79.44 | -| Ours ([swa](#refer-anchor-1)+distill) | 78.87 | -| Ours ([swa](#refer-anchor-1)) | **78.61** | -| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/timesformer#kinetics-400) | 77.92 | - - -## 数据准备 - 
-K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md) - -UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md) - - -## 模型训练 - -### Kinetics-400数据集训练 - -#### 下载并添加预训练模型 - -1. 下载图像预训练模型[ViT_base_patch16_224_miil_21k.pdparams](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams)作为Backbone初始化参数,或通过wget命令下载 - - ```bash - wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams - ``` - -2. 打开`PaddleVideo/configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "RecognizerTransformer" - backbone: - name: "VisionTransformer" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- Kinetics400数据集使用8卡训练,训练方式的启动命令如下: - - ```bash - # videos数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 # MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - # videos数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --amp --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml - ``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。 - - -## 模型测试 - -- PP-TimeSformer模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - - ``` - Already save the best model (top1 acc)0.7258 - ``` - -- 由于PP-TimeSformer模型测试模式的采样方式是速度稍慢但精度高一些的**UniformCrop**,与训练过程中验证模式采用的**RandomCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下: - - ```bash - # 8-frames 模型测试命令 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml -w "output/ppTimeSformer/ppTimeSformer_best.pdparams" - - # 16-frames模型测试命令 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test \ - -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \ - -o MODEL.backbone.num_seg=16 \ - -o MODEL.runtime_cfg.test.num_seg=16 \ - -o MODEL.runtime_cfg.test.avg_type='prob' \ - -o PIPELINE.test.decode.num_seg=16 \ - -o PIPELINE.test.sample.num_seg=16 \ - -w "data/ppTimeSformer_k400_16f_distill.pdparams" - ``` - - - 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下: - - | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints | - | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: | - | Vision Transformer | UniformCrop | 8 | 224 | 78.61 | [ppTimeSformer_k400_8f.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f.pdparams) | - | Vision Transformer | UniformCrop | 8 | 224 | 78.87 | [ppTimeSformer_k400_8f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f_distill.pdparams) | - | Vision Transformer | UniformCrop | 16 | 224 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) | - - -- 测试时,PP-TimeSformer视频采样策略为使用linspace采样:时序上,从待采样视频序列的第一帧到最后一帧区间内,均匀生成`num_seg`个稀疏采样点(包括端点);空间上,选择长边两端及中间位置(左中右 或 上中下)3个区域采样。1个视频共采样1个clip。 - 
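The test-time sampling rule above reduces to a few lines of index math, sketched below: `num_seg` frame indices spaced evenly from the first to the last frame (endpoints included), plus three crop offsets along the longer spatial side. `linspace_frame_indices` and `three_crop_offsets` are illustrative helpers, not functions from the PaddleVideo pipeline.

```python
# Sketch of the linspace temporal sampling and the three-crop spatial sampling
# described above; index math only, not the pipeline implementation.
import numpy as np

def linspace_frame_indices(num_frames, num_seg=8):
    return np.linspace(0, num_frames - 1, num_seg).astype(int).tolist()

def three_crop_offsets(short_side, long_side, crop_size):
    """Top-left offsets of the three crops along the longer side (two ends + middle)."""
    assert crop_size <= short_side
    middle = (long_side - crop_size) // 2
    return [0, middle, long_side - crop_size]

print(linspace_frame_indices(num_frames=250, num_seg=8))   # [0, 35, 71, 106, 142, 177, 213, 249]
print(three_crop_offsets(short_side=224, long_side=300, crop_size=224))   # [0, 38, 76]
```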
-## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \ - -p data/ppTimeSformer_k400_8f.pdparams \ - -o inference/ppTimeSformer -``` - -上述命令将生成预测所需的模型结构文件`ppTimeSformer.pdmodel`和模型权重文件`ppTimeSformer.pdiparams`。 - -- 各参数含义可参考[模型推理方法](../../start.md#2-模型推理) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \ - --model_file inference/ppTimeSformer/ppTimeSformer.pdmodel \ - --params_file inference/ppTimeSformer/ppTimeSformer.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -``` -Current video file: data/example.avi - top-1 class: 5 - top-1 score: 0.9997474551200867 -``` - -可以看到,使用在Kinetics-400上训练好的ppTimeSformer模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。 - -## 参考论文 - -- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani -- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean -
    - -- [Averaging Weights Leads to Wider Optima and Better Generalization](https://arxiv.org/abs/1803.05407v3), Pavel Izmailov, Dmitrii Podoprikhin, Timur Garipov -- [ImageNet-21K Pretraining for the Masses](https://arxiv.org/pdf/2104.10972v4.pdf), Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy diff --git a/docs/zh-CN/model_zoo/recognition/pp-tsm.md b/docs/zh-CN/model_zoo/recognition/pp-tsm.md deleted file mode 100644 index a8f113dbc..000000000 --- a/docs/zh-CN/model_zoo/recognition/pp-tsm.md +++ /dev/null @@ -1,283 +0,0 @@ -[English](../../../en/model_zoo/recognition/pp-tsm.md) | 简体中文 - -# PP-TSM视频分类模型 - ---- -## 目录 - -- [1. 简介](#1) -- [2. 性能benchmark](#2) -- [3. 数据准备](#3) -- [4. 模型训练](#4) - - [4.1 预训练模型下载](#41) - - [4.2 多卡训练](#42) - - [4.3 蒸馏训练](#43) - - [4.4 配置文件说明](#44) - - [4.5 配置文件推荐使用](#45) -- [5. 模型测试](#5) - - [5.1 中心采样测试](#51) - - [5.2 密集采样测试](#52) -- [6. 模型推理部署](#6) - - [6.1 导出推理模型](#61) - - [6.2 基于python预测引擎推理](#62) - - [6.3 基于c++预测引擎推理](#63) - - [6.4 服务化部署](#64) - - [6.5 Paddle2ONNX 模型预测与转换](#65) -- [7. 模型库下载](#7) -- [8. 参考论文](#8) - - -## 1. 简介 - -视频分类与图像分类相似,均属于识别任务,对于给定的输入视频,视频分类模型需要输出其预测的标签类别。如果标签都是行为类别,则该任务也常被称为**行为识别**。与图像分类不同的是,视频分类往往需要利用多帧图像之间的时序信息。PP-TSM是PaddleVideo自研的实用产业级视频分类模型,在实现前沿算法的基础上,考虑精度和速度的平衡,进行模型瘦身和精度优化,使其可能满足产业落地需求。 - -### PP-TSM - -PP-TSM基于ResNet-50骨干网络进行优化,从数据增强、网络结构微调、训练策略、BN层优化、预训练模型选择、模型蒸馏等6个方面进行模型调优,在中心采样评估方式下,Kinetics-400上精度较原论文实现提升3.95个点。更多细节请参考[PP-TSM模型解析](https://zhuanlan.zhihu.com/p/382134297)。 - -### PP-TSMv2 - -PP-TSMv2是轻量化的视频分类模型,基于CPU端模型[PP-LCNetV2](https://github.com/PaddlePaddle/PaddleClas/blob/release/2.4/docs/zh_CN/models/PP-LCNetV2.md)进行优化,从骨干网络与预训练模型选择、数据增强、tsm模块调优、输入帧数优化、解码速度优化、DML蒸馏、LTA模块等7个方面进行模型调优,在中心采样评估方式下,精度达到75.16%,输入10s视频在CPU端的推理速度仅需456ms。更多细节参考[PP-TSMv2技术报告](./pp-tsm_v2.md)。 - - - -## 2. 性能benchmark - -PP-TSMv2模型与主流模型之间CPU推理速度对比(按预测总时间排序): - -|模型名称 | 骨干网络 | 精度% | 预处理时间ms | 模型推理时间ms | 预测总时间ms | -| :---- | :---- | :----: |:----: |:----: |:----: | -| PP-TSM | MobileNetV2 | 68.09 | 52.62 | 137.03 | 189.65 | -| PP-TSM | MobileNetV3 | 69.84| 53.44 | 139.13 | 192.58 | -| **PP-TSMv2** | PP-LCNet_v2.8f | **72.45**| 53.37 | 189.62 | **242.99** | -| **PP-TSMv2** | PP-LCNet_v2.16f | **75.16**| 68.07 | 388.64 | **456.71** | -| SlowFast | 4*16 |74.35 | 110.04 | 1201.36 | 1311.41 | -| TSM | R50 | 71.06 | 52.47 | 1302.49 | 1354.96 | -|PP-TSM | R50 | 75.11 | 52.26 | 1354.21 | 1406.48 | -|*MoViNet | A0 | 66.62 | 148.30 | 1290.46 | 1438.76 | -|PP-TSM | R101 | 76.35| 52.50 | 2236.94 | 2289.45 | -| TimeSformer | base | 77.29 | 297.33 | 14034.77 | 14332.11 | -| TSN | R50 | 69.81 | 860.41 | 18359.26 | 19219.68 | -| *VideoSwin | B | 82.4 | 76.21 | 32983.49 | 33059.70 | - - -* 注: 带`*`表示该模型未使用mkldnn进行预测加速。 - -更多细节请查看[benchmark](../../benchmark.md)文档。 - - -## 3. 数据准备 - -Kinetics-400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md) - -UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md) - - -## 4. 
模型训练 - -下面以Kinetics-400数据集为例,说明模型训练、测试、推理、压缩方法。 - - -### 4.1 预训练模型下载 - -PP-TSM模型使用[PaddleClas ssld](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/zh_CN/advanced_tutorials/ssld.md)图像预训练模型作为Backbone初始化参数,各预训练模型下载链接如下: - -|模型名称 | 骨干网络 | 预训练模型 | -| :---- | :---- | :----: | -| PP-TSMv2 | **LCNet_v2** |[下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_ssld_pretrained.pdparams) | -| PP-TSM | **ResNet50** | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) | -| PP-TSM | MobileNetV2 | [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_ssld_pretrained.pdparams) | -| PP-TSM | MobileNetV3 | [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_ssld_pretrained.pdparams) | -| PP-TSM | ResNet101 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams) | - -更多预训练模型下载链接可参考`paddlevideo/modeling/backbones/pptsm_xx.py`中各文件头部注释。 - -下载完成后,将文件路径添加到配置文件中的`MODEL.framework.backbone.pretrained`字段,如下: - -```yaml -MODEL: - framework: "Recognizer2D" - backbone: - name: "ResNetTweaksTSM" - pretrained: 将路径填写到此处 -``` - - -### 4.2 多卡训练 - -PP-TSMv2在Kinetics400数据集使用8卡训练,多卡训练启动命令如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform.yaml -``` - -- 训练各参数含义参考[使用说明](../../usage.md),若希望加速训练过程,可以按照使用说明第6章节开启混合精度训练。 - -- `batch_size`可以根据机器显存大小进行调整,请注意`batch_size`调整后学习率大小`learning rate`也需要按比例调整。 - - - -### 4.3 蒸馏训练 - -通过模型蒸馏将大模型的知识迁移到小模型中,可以进一步提升模型精度。PP-TSMv2基于DML蒸馏,teacher模型使用PP-TSM ResNet-50 backbone。蒸馏训练启动方式如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml -``` - -知识蒸馏更多细节参考[知识蒸馏](../../distillation.md)。 - - - -### 4.4 配置文件说明 - -PP-TSM模型提供的各配置文件均放置在[configs/recognition/pptsm](../../../../configs/recognition/pptsm)目录下,配置文件名按如下格式组织: - -`模型名称_骨干网络名称_数据集名称_数据格式_测试方式_其它.yaml`。 - -- 数据格式包括`frame`和`video`,`video`表示使用在线解码的方式进行训练,`frame`表示先将视频解码成图像帧存储起来,训练时直接读取图片进行训练。使用不同数据格式,仅需修改配置文件中的`DATASET`和`PIPELINE`字段,参考[pptsm_k400_frames_uniform.yaml](../../../../configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml)和[pptsm_k400_videos_uniform.yaml](../../../../configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml)。注意,由于编解码的细微差异,两种格式训练得到的模型在精度上可能会有些许差异。 - -- 测试方式包括`uniform`和`dense`,uniform表示中心采样,dense表示密集采样,更多细节参考第5章节模型测试部分。 - -- 您也可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的。 - - -### 4.5 配置文件推荐使用 - -- 1. 数据格式:如硬盘存储空间足够,推荐使用`frame`格式,解码一次后,后续可以获得更快的训练速度。相较于使用视频格式训练,frame格式输入可以加快训练速度,加速比约4-5倍,但会占用更大的存储空间,如Kinetics-400数据集video格式135G,解码成图像后需要2T。 - -- 2. 测试方式:对于产业落地场景,推荐使用`uniform`方式,简洁高效,可以获得较好的精度与速度平衡。 - -- 3. 
对于CPU或端侧需求,推荐使用`PP-TSMv2`,精度较高,速度快,具体性能和速度对比请查看[benchmark](../../benchmark.md)文档。PP-TSMv2提供8帧输入和16帧输入两套配置,8帧速度更快,精度稍低。16帧精度更高,速度稍慢。如果追求高精度,推荐使用16帧,配置文件为无蒸馏-[pptsm_lcnet_k400_16frames_uniform.yaml](../../../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform.yaml),加蒸馏-[pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml](../../../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml)。相对于无蒸馏,蒸馏后能获得更高的精度,但训练时需要更大的显存,以运行教师模型。如果对速度要求极高,推荐使用8帧,配置文件为无蒸馏-[pptsm_lcnet_k400_8frames_uniform.yaml](../../../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_8frames_uniform.yaml),加蒸馏-[pptsm_lcnet_k400_8frames_uniform_dml_distillation.yaml](../../../../configs/recognition/pptsm/v2/pptsm_lcnet_k400_8frames_uniform_dml_distillation.yaml)。 - -- 4. 对于GPU服务器端需求,推荐使用`PP-TSM`,对应配置文件为[pptsm_k400_frames_uniform.yaml](../../../../configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml)。GPU端推理,速度瓶颈更多在于数据预处理(视频编解码)部分,更优的解码器和更高的精度,会是侧重考虑的部分。 - - -## 5. 模型测试 - -对于视频分类任务,模型测试时有两种不同的方式,`中心采样`(Uniform)和`密集采样`(Dense)。中心采样速度快,适合产业应用,但精度稍低。密集采样能进一步提升精度,但由于测试要对多个clip进行预测,比较耗时。轻量化模型PP-TSMv2统一使用中心采样方式进行评估。PP-TSM则提供两种不同的评估方式。 - - -### 5.1 中心采样测试 - -中心采样测试,1个视频共采样1个clips。对输入视频,时序上,等分成`num_seg`段,每段中间位置采样1帧;空间上,中心位置采样。对Uniform采样方式,PP-TSM模型在训练时同步进行测试,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - -```txt -Already save the best model (top1 acc)0.7467 -``` - -也可以使用如下命令对训练好的模型进行测试: -```bash -python3 main.py --test -c configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml -w output/PPTSMv2/PPTSMv2_best.pdparams -``` - - -### 5.2 密集采样测试 - -密集采样测试,1个视频共采样`10*3=30`个clips。时序上,先等分成10个片段,每段从起始位置开始,以`64//num_seg`为间隔连续采样`num_seg`帧;空间上,左中,中心,右中3个位置采样。对Dense采样方式,需要在训练完成后单独运行测试代码,其启动命令如下: - -```bash -python3 main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml -w output/ppTSM/ppTSM_best.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - - - -## 6. 
模型推理 - - -### 导出推理模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml \ - -p output/PPTSMv2/PPTSMv2_best.pdparams \ - -o inference/PPTSMv2 -``` - -上述命令会在`inference/PPTSMv2`下生成预测所需的文件,结构如下: -``` -├── inference/PPTSMv2 -│ ├── PPTSMv2.pdiparams # 模型权重文件 -│ ├── PPTSMv2.pdiparams.info # 模型信息文件 -│ └── PPTSMv2.pdmodel # 模型结构文件 -``` - - -### 基于python预测引擎推理 - -运行下面命令,对示例视频文件`data/example.avi`进行分类: -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/pptsm/v2/pptsm_lcnet_k400_16frames_uniform_dml_distillation.yaml \ - --model_file inference/PPTSMv2/PPTSMv2.pdmodel \ - --params_file inference/PPTSMv2/PPTSMv2.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - - -输出示例如下: - -``` -Current video file: data/example.avi - top-1 class: 5 - top-1 score: 1.0 -``` - - -可以看到,使用在Kinetics-400上训练好的PP-TSMv2模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为1.0。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。 - - -### 基于c++预测引擎推理 - -PaddleVideo 提供了基于 C++ 预测引擎推理的示例,您可以参考[服务器端C++预测](../../../../deploy/cpp_infer/)来完成相应的推理部署。 - - - -### 服务化部署 - -Paddle Serving 提供高性能、灵活易用的工业级在线推理服务。Paddle Serving 支持 RESTful、gRPC、bRPC 等多种协议,提供多种异构硬件和多种操作系统环境下推理解决方案。更多关于Paddle Serving 的介绍,可以参考[Paddle Serving](https://github.com/PaddlePaddle/Serving) 代码仓库。 - -PaddleVideo 提供了基于 Paddle Serving 来完成模型服务化部署的示例,您可以参考[基于python的模型服务化部署](../../../../deploy/python_serving/)或[基于c++的模型服务化部署](../../../../deploy/cpp_serving/)来完成相应的部署工作。 - - - -### Paddle2ONNX 模型预测与转换 - -Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式。通过 ONNX 可以完成将 Paddle 模型到多种推理引擎的部署,包括TensorRT/OpenVINO/MNN/TNN/NCNN,以及其它对 ONNX 开源格式进行支持的推理引擎或硬件。更多关于 Paddle2ONNX 的介绍,可以参考[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) 代码仓库。 - -PaddleVideo 提供了基于 Paddle2ONNX 来完成 inference 模型转换 ONNX 模型并作推理预测的示例,您可以参考[Paddle2ONNX 模型转换与预测](../../../../deploy/paddle2onnx/)来完成相应的部署工作。 - - - -## 7. 
模型库下载 - -在Kinetics-400数据集上模型效果: - -| 模型名称 | 骨干网络 | 测试方式 | 采样帧数 | Top-1% | 训练模型 | -| :------: | :----------: | :----: | :----: | :----: | :---- | -| PP-TSMv2 | LCNet_v2 | Uniform | 8 | 71.81 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PPTSMv2_k400_8f.pdparams) | -| PP-TSMv2 | LCNet_v2 | Uniform | 16 | 73.1 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PPTSMv2_k400_16f.pdparams) | -| PP-TSM | MobileNetV2 | Uniform | 8 | 68.09 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_mv2_k400.pdparams) | -| PP-TSM | MobileNetV3 | Uniform | 8 | 69.84 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/ppTSM_mv3_k400.pdparams) | -| PP-TSM | ResNet50 | Uniform | 8 | 74.54 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams) | -| PP-TSM | ResNet50 | Dense | 8 | 75.69 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense.pdparams) | -| PP-TSM | ResNet101 | Dense | 8 | 77.15 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_dense_r101.pdparams) | - - -蒸馏模型: - -| 模型名称 | 骨干网络 | 蒸馏方式 | 测试方式 | 采样帧数 | Top-1% | 训练模型 | -| :------: | :----------: | :----: | :----: | :----: | :---- | :---- | -| PP-TSMv2 | LCNet_v2 | DML | Uniform | 8 | 72.45 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PPTSMv2_k400_8f_dml.pdparams) \| [Student模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PPTSMv2_k400_8f_dml_student.pdparams) | -| PP-TSMv2 | LCNet_v2 | DML | Uniform | 16 | 75.16 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PPTSMv2_k400_16f_dml.pdparams) \| [Student模型](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/PPTSMv2_k400_16f_dml_student.pdparams) | -| PP-TSM | ResNet50 | KD | Uniform | 8 | 75.11 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) | -| PP-TSM | ResNet50 | KD | Dense | 8 | 76.16 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) | -| PP-TSM | ResNet101 | KD | Uniform | 8 | 76.35 | [下载链接](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_uniform_distill_r101.pdparams) | - - -## 参考论文 - -- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han -- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean diff --git a/docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md b/docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md deleted file mode 100644 index 90f70c53b..000000000 --- a/docs/zh-CN/model_zoo/recognition/pp-tsm_v2.md +++ /dev/null @@ -1,191 +0,0 @@ -# PP-TSMv2 ---- -## 目录 - -- [1. 模型简介](#1) -- [2. 模型细节](#2) - - [2.1 骨干网络与预训练模型选择](#21) - - [2.2 数据增强](#22) - - [2.3 tsm模块调优](#23) - - [2.4 输入帧数优化](#24) - - [2.5 解码速度优化](#25) - - [2.6 DML蒸馏](#26) - - [2.7 LTA模块](#27) -- [3. 快速体验](#3) -- [4. 模型训练、压缩、推理部署](#4) - - - -## 1. 模型简介 - -视频分类任务是指输入视频,输出标签类别。如果标签都是行为类别,则该任务也称为行为识别。随着AI在各个行业的应用普及,工业及体育场景下对轻量化行为识别模型的需求日益增多,为此我们提出了高效的轻量化行为识别模型PP-TSMv2。 - -PP-TSMv2沿用了部分PP-TSM的优化策略,从骨干网络与预训练模型选择、数据增强、tsm模块调优、输入帧数优化、解码速度优化、DML蒸馏、LTA模块等7个方面进行模型调优,在中心采样评估方式下,精度达到75.16%,输入10s视频在CPU端的推理速度仅需456ms。 - - - -## 2. 
模型细节 - - -### 2.1 骨干网络与预训练模型选择 - -在骨干网络的选择上,PP-TSMv2选用了针对基于CPU端设计的轻量化骨干网络[PP-LCNetV2](https://github.com/PaddlePaddle/PaddleClas/blob/release/2.4/docs/zh_CN/models/PP-LCNetV2.md)。PP-LCNetV2通过组合PW卷积、SE模块、Shortcut和Reparameterizatio等策略,在不使用额外数据的前提下,在图像分类 ImageNet 数据集上的性能如下表所示。 - -| Model | Top-1 Acc(\%) | Latency(ms) | -|:--:|:--:|:--:| -| MobileNetV3_Large_x1_25 | 76.4 | 5.19 | -| PPLCNetV2_base | 77.04 | 4.32 | -| PPLCNetV2_base_ssld | 80.07| 4.32 | - -在预训练模型选择上,我们以使用[SSLD](https://github.com/PaddlePaddle/PaddleClas/blob/release/2.4/docs/zh_CN/advanced_tutorials/knowledge_distillation.md)在 ImageNet上蒸馏得到的模型作为预训练模型,相较于未使用ssld的预训练模型,其提升效果如下表所示。 - -| 策略 | Top-1 Acc(\%) | -|:--:|:--:| -| baseline | 69.06 | -| baseline + SSLD Backbone | 69.95(+**0.89**) | - - -### 2.2 数据增强 - -我们沿用了PP-TSM使用数据增强策略VideoMix,将两个视频以一定的权值叠加构成新的输入样本。具体的,对每个视频,首先抽取固定数量的帧,并给每一帧赋予相同的权重,然后与另一个视频叠加作为新的输入视频。这种方式对精度的提升效果如下表所示。 - -| 策略 | Top-1 Acc(\%) | -|:--:|:--:| -| baseline | 69.06 | -| baseline + VideoMix | 69.36(+**0.3**) | - - -### 2.3 tsm模块调优 - -在骨干网络的基础上,我们添加了时序位移模块提取时序信息。对于插入位置,TSM原论文中将temporal_shift模块插入残差结构之中,但PP-LCNetV2为了加快模型速度,去除了部分残差连接。PP-LCNetV2整体结构分为4个stage,我们实验探索了时序位移模块最佳插入位置。对于插入数量,temporal_shift模块会加大模型的运行时间,我们探索了其最优插入数量,实验结果如下表所示。 - -| 策略 | Top-1 Acc(\%) | -|:--:|:--:| -| baseline | 69.06 | -| baseline + tsm in stage1 | 69.84 | -| baseline + tsm in stage2 | 69.84 | -| **baseline + tsm in stage3** | **70.02(+0.96)** | -| baseline + tsm in stage4 | 69.98 | -| baseline + tsm in stage1,2 | 69.77 | -| baseline + tsm in stage3,4 | 70.05 | -| baseline + tsm in stage1,2,3,4 | 70.06 | - -可以看到,在高层插入时序位移模块的效果优于低层。在Stage3中插入1个temporal_shift模块,能达到精度和速度上的最优。 - - -### 2.4 输入帧数优化 - -对于10s的视频,我们会抽取一定数量的帧输入到网络中。PP-TSMv2采样分段采样策略,即先将视频按时间长度等分成N段,然后在每段中随机选取一帧,组合得到N帧作为模型输入。 - -输入帧数的增加一定程度上能提升模型精度,但同时会带来数据预处理及模型推理时间的显著增加。综合考虑性能和速度,我们采用16帧作为输入,相较于8帧输入,精度提升效果如下表所示。 - -| 策略 | Top-1 Acc(\%) | -|:--:|:--:| -| baseline | 69.06 | -| baseline + 16f | 69.78(+**0.72**) | - - -### 2.5 解码速度优化 - -在解码速度上,我们对比了常见的视频解码库在视频分段采样策略中的速度,[测试数据](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/time-test.tar),不同解码库速度对比如下表所示。PP-TSMv2最终选用[decord](https://github.com/dmlc/decord)作为解码器。 - -| lib | Time/s | -|:--:|:--:| -| opencv | 0.056 | -| **decord** | **0.043** | -| PyAV| 0.045 | - -- 实用tips,若使用opencv进行解码,代码作如下优化能极大提升解码速度: - -```python - cap = cv2.VideoCapture(file_path) - videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - - # 1. decode all frames, time cost! - sampledFrames = [] - for i in range(videolen): - ret, frame = cap.read() - # maybe first frame is empty - if ret == False: - continue - img = frame[:, :, ::-1] - sampledFrames.append(img) - - cap.release() - - # 2. get frame index - frames_idx = [xxx] - - # 3. sample - frames = np.array(sampledFrames) - imgs = [] - for idx in frames_idx: - imgbuf = frames[idx] - img = Image.fromarray(imgbuf, mode='RGB') - imgs.append(img) -``` - -优化后: -```python - cap = cv2.VideoCapture(file_path) - videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - - # 1. get frame index - frames_idx = [xxx] - - # 2. 
decode target frame - imgs = [] - for i in range(videolen): - ret = cap.grab() - # maybe first frame is empty - if ret == False: - continue - if frames_idx and i == frames_idx[0]: - frames_idx.pop(0) - ret, frame = cap.retrieve() - if frame is None: - break - imgbuf = frame[:, :, ::-1] - img = Image.fromarray(imgbuf, mode='RGB') - imgs.append(img) - if frames_idx == None: - break - cap.release() -``` - - -### 2.6 DML蒸馏 - -通过[模型蒸馏](../../distillation.md)将大模型的知识迁移到小模型中,可以进一步提升模型精度。PP-TSMv2使用[DML蒸馏](https://arxiv.org/pdf/1706.00384.pdf)方法,在蒸馏的过程中,不依赖于教师模型,两个结构相同的模型互相学习,计算彼此输出(logits)的KL散度,最终完成训练过程。PP-TSMv2优化过程中,分别尝试了以自身或者以PP-TSM ResNet-50 backbone作为教师模型进行蒸馏,性能提升效果如下表。 - -| 策略 | 教师模型 | Top-1 acc | -| --- | --- | --- | -| baseline | - | 69.06% | -| DML | PP-TSMv2 | 70.34%(**+1.28%**) | -| DML | PP-TSM_ResNet50 | 71.27%(**+2.20%**) | - - - -### 2.7 LTA模块 - -temporal shift模块通过把特征在时间通道上位移,获取时序信息。但这种位移方式仅让局部的特征进行交互,缺少对全局时序信息的建模能力。为此我们提出了轻量化的时序attention模块(Lightweight Temporal Attention, LTA),如图所示,通过全局池化组合可学习的fc层,得到全局尺度上的时序attention。在tsm模块之前,添加时序attention模块,使得网络在全局信息的指导下进行时序位移。LTA模块能够在基本不增加推理时间的前提下,进一步提升模型精度。 - -
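在实现层面,LTA 的计算流程可以用如下最小示意代码说明(非 PaddleVideo 官方实现,假设输入布局为 `[N*T, C, H, W]`、`T=num_seg`,类名与参数均为示例):

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class LightweightTemporalAttention(nn.Layer):
    """LTA 思路示意:全局池化 + 可学习 fc,得到全局尺度上的时序 attention。"""

    def __init__(self, num_seg):
        super().__init__()
        self.num_seg = num_seg
        self.fc = nn.Linear(num_seg, num_seg)

    def forward(self, x):
        # x: [N*T, C, H, W],其中 T=num_seg
        nt, c, h, w = x.shape
        n = nt // self.num_seg
        # 全局平均池化,得到每帧的全局描述,形状为 [N, T]
        desc = x.mean(axis=[1, 2, 3]).reshape([n, self.num_seg])
        # 可学习 fc + sigmoid,得到时序 attention 权重 [N, T]
        att = F.sigmoid(self.fc(desc))
        # 按帧加权特征,随后再送入 temporal_shift 模块做时序位移
        return x * att.reshape([nt, 1, 1, 1])
```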
    - -| 策略 | Top-1 Acc(\%) | -|:--:|:--:| -| pptsmv2 w/o temporal_attention | 74.38 | -| pptsmv2 w/ temporal_attention | 75.16(+**0.78**) | - - -## 3. 快速体验 - -参考[快速开始文档](../../quick_start.md),安装`ppvideo` 2.3.0版本,即可快速体验使用PP-TSMv2模型进行预测。 - - -## 4. 模型训练、压缩、推理部署 - - -更多教程,包括模型训练、模型压缩、推理部署等,请参考[使用文档](./pp-tsm.md)。 diff --git a/docs/zh-CN/model_zoo/recognition/pp-tsn.md b/docs/zh-CN/model_zoo/recognition/pp-tsn.md deleted file mode 100644 index 3229fdb0d..000000000 --- a/docs/zh-CN/model_zoo/recognition/pp-tsn.md +++ /dev/null @@ -1,148 +0,0 @@ -[English](../../../en/model_zoo/recognition/pp-tsn.md) | 简体中文 - -# PP-TSN视频分类模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -我们对[TSN模型](./tsn.md)进行了改进,得到了更高精度的2D实用视频分类模型**PP-TSN**。在不增加参数量和计算量的情况下,在UCF-101、Kinetics-400等数据集上精度显著超过原版,在Kinetics-400数据集上的精度如下表所示。 - -| Version | Top1 | -| :------ | :----: | -| Ours (distill) | 75.06 | -| Ours | **73.68** | -| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn#kinetics-400) | 71.80 | - - -## 数据准备 - -K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md) - -UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md) - - -## 模型训练 - -### Kinetics-400数据集训练 - -#### 下载并添加预训练模型 - -1. 下载图像蒸馏预训练模型[ResNet50_vd_ssld_v2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams)作为Backbone初始化参数,或通过wget命令下载 - - ```bash - wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams - ``` - -2. 打开`PaddleVideo/configs/recognition/pptsn/pptsn_k400_frames.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "Recognizer2D" - backbone: - name: "ResNetTweaksTSN" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- Kinetics400数据集使用8卡训练,训练方式的启动命令如下: - - ```bash - # frames数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml - - # videos数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 # MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - - # frames数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml - - # videos数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml - ``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。 - - -## 模型测试 - -- PP-TSN模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - - ``` - Already save the best model (top1 acc)0.7004 - ``` - -- 由于PP-TSN模型测试模式的采样方式是速度稍慢但精度高一些的**TenCrop**,与训练过程中验证模式采用的**CenterCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下: - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --test -c configs/recognition/pptsn/pptsn_k400_frames.yaml -w "output/ppTSN/ppTSN_best.pdparams" - ``` - - - 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下: - - - | backbone | Sampling method | distill | num_seg | target_size | Top-1 | 
checkpoints | - | :------: | :----------: | :----: | :----: | :----: | :---- | :---: | - | ResNet50 | TenCrop | False | 3 | 224 | 73.68 | [ppTSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams) | - | ResNet50 | TenCrop | True | 8 | 224 | 75.06 | [ppTSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) | - -- PP-TSN视频采样策略为TenCrop采样:时序上,将待输入视频均匀分成`num_seg`段区间,每段的中间位置采样1帧;空间上,从左上角、右上角、中心点、左下角、右下角5个子区域各采样224x224的区域,并加上水平翻转,一共得到10个采样结果。1个视频共采样1个clip。 - -- distill为`True`表示使用了蒸馏所得的预训练模型,具体蒸馏方案参考[PP-TSM蒸馏方案](https://zhuanlan.zhihu.com/p/382134297)。 - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_frames.yaml -p data/ppTSN_k400.pdparams -o inference/ppTSN -``` - -上述命令将生成预测所需的模型结构文件`ppTSN.pdmodel`和模型权重文件`ppTSN.pdiparams`以及`ppTSN.pdiparams.info`文件,均存放在`inference/ppTSN/`目录下 - -上述bash命令中各个参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/pptsn/pptsn_k400_frames.yaml \ - --model_file inference/ppTSN/ppTSN.pdmodel \ - --params_file inference/ppTSN/ppTSN.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -```bash -Current video file: data/example.avi - top-1 class: 5 - top-1 score: 0.998979389667511 -``` - -可以看到,使用在Kinetics-400上训练好的PP-TSN模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。 - -## 参考论文 - -- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang -- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean diff --git a/docs/zh-CN/model_zoo/recognition/slowfast.md b/docs/zh-CN/model_zoo/recognition/slowfast.md deleted file mode 100644 index 030aaab4b..000000000 --- a/docs/zh-CN/model_zoo/recognition/slowfast.md +++ /dev/null @@ -1,140 +0,0 @@ -简体中文 | [English](../../../en/model_zoo/recognition/slowfast.md) - -# SlowFast视频分类模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -SlowFast是视频分类领域的高精度模型,使用slow和fast两个分支。slow分支以稀疏采样得到的帧作为输入,捕捉视频中的表观信息。fast分支以高频采样得到的帧作为输入,捕获视频中的运动信息,最终将两个分支的特征拼接得到预测结果。 - -

-（图：SlowFast Overview）

    - -详细内容请参考ICCV 2019论文[SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982) - - -## 数据准备 - -SlowFast模型的训练数据采用Kinetics400数据集,数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md) - - -## 模型训练 - -数据准备完成后,可通过如下方式启动训练: - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - -python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml -``` - -- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型。 - -- 建议使用多卡训练方式,单卡由于batch\_size减小,精度可能会有损失。 - - -### 训练资源要求 - -* 8卡V100,总batch\_size=64,单卡batch\_size=8,单卡显存占用约9G。 -* 训练速度相较原始实现提速100%,详细参考[benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/benchmark.md#实验结果) - -### 训练加速 - -SlowFast为3D模型,训练异常耗时,为进一步加速模型的训练,我们实现了[Multigrid加速策略算法](https://arxiv.org/abs/1912.00998),其训练启动方式如下: - -```bash -python -B -m paddle.distributed.launch --selected_gpus="0,1,2,3,4,5,6,7" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml -``` - -性能数据如下: - -| 训练策略 | 单个epoch平均耗时/min | 训练总时间/min | 加速比 | -| :------ | :-----: | :------: |:------: | -| Multigrid | 27.25 | 9758(6.7天) | 2.89x | -| Normal | 78.76 | 15438(10.7天) | base | - -速度详细数据说明可参考[加速文档](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/tutorials/accelerate.md#%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5%E5%8A%A0%E9%80%9F)。 - -## 模型测试 - -可通过如下命令进行模型测试: - -```bash -python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast_test main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams -``` - -- 通过 `-w`参数指定待测试模型文件的路径,您可以下载我们训练好的模型进行测试[SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams) - -- 使用```multi_crop```的方式进行评估,因此评估有一定耗时,建议使用多卡评估,加快评估速度。若使用默认方式进行多卡评估,耗时约4小时。 - -- 模型最终的评估精度会打印在日志文件中。 - -若使用单卡评估,启动方式如下: - -```bash -python -B main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams -``` - - -在Kinetics400数据集下评估精度及权重文件如下: - -| Configs | Acc1 | Acc5 | Weights | -| :---: | :---: | :---: | :---: | -| [slowfast.yaml](../../../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 91.33 | [slowfast_4x16.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams) | -| [slowfast_multigrid.yaml](../../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | 75.84 | 92.33 | [slowfast_8x8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | - -- 由于Kinetics400数据集部分源文件已缺失,无法下载,我们使用的数据集比官方数据少~5%,因此精度相比于论文公布的结果有一定损失。相同数据下,精度已与原实现对齐。 - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml \ - -p data/SlowFast.pdparams \ - -o inference/SlowFast -``` - -上述命令将生成预测所需的模型结构文件`SlowFast.pdmodel`和模型权重文件`SlowFast.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/slowfast/slowfast.yaml \ - --model_file inference/SlowFast/SlowFast.pdmodel \ - --params_file inference/SlowFast/SlowFast.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -``` -Current video file: data/example.avi - top-1 class: 5 - top-1 score: 1.0 -``` - 
-可以看到,使用在Kinetics-400上训练好的SlowFast模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为1.0。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。 - - -## 参考论文 - -- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al. -- [A Multigrid Method for Efficiently Training Video Models](https://arxiv.org/abs/1912.00998), Chao-Yuan Wu, Ross Girshick, et al. diff --git a/docs/zh-CN/model_zoo/recognition/stgcn.md b/docs/zh-CN/model_zoo/recognition/stgcn.md deleted file mode 100644 index bd8fd884f..000000000 --- a/docs/zh-CN/model_zoo/recognition/stgcn.md +++ /dev/null @@ -1,136 +0,0 @@ -[English](../../../en/model_zoo/recognition/stgcn.md) | 简体中文 - -# ST-GCN基于骨骼的行为识别模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -ST-GCN是AAAI 2018提出的经典的基于骨骼的行为识别模型,通过将图卷积应用在具有拓扑结构的人体骨骼数据上,使用时空图卷积提取时空特征进行行为识别,极大地提升了基于骨骼的行为识别任务精度。 - -我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目, 使用链接:[基于飞桨实现花样滑冰选手骨骼点动作识别大赛baseline](https://aistudio.baidu.com/aistudio/projectdetail/2417717?contributionType=1) - -
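其中"在人体骨骼拓扑上做图卷积"的基本计算可以用下面的简化示意说明(numpy 实现,省略了 ST-GCN 的邻域分区策略与时间维卷积,关节数量与连接关系均为示例):

```python
import numpy as np

def spatial_graph_conv(x, edges, num_joints, weight):
    """简化的空间图卷积:按归一化邻接矩阵聚合邻居特征,再做线性变换。"""
    adj = np.eye(num_joints)                         # 加入自连接
    for i, j in edges:
        adj[i, j] = adj[j, i] = 1                    # 按骨架连接关系填充邻接矩阵
    adj_norm = adj / adj.sum(axis=1, keepdims=True)  # 按节点度归一化
    return adj_norm @ x @ weight                     # 邻域聚合 + 线性变换

x = np.random.randn(25, 3)                           # 假设 25 个关节、每个关节 3 维坐标
weight = np.random.randn(3, 64)                      # 输入 3 维 -> 输出 64 维
edges = [(0, 1), (1, 2), (2, 3)]                     # 示例连接,并非完整人体拓扑
print(spatial_graph_conv(x, edges, 25, weight).shape)  # (25, 64)
```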
    - - -## 数据准备 - -花样滑冰比赛数据下载及准备请参考[花样滑冰数据准备](../../dataset/fsd.md) - -NTU-RGBD数据下载及准备请参考[NTU-RGBD数据准备](../../dataset/ntu-rgbd.md) - - -## 模型训练 - -### 花样滑冰数据集训练 - -- 花样滑冰数据集使用单卡训练,启动命令如下: - -```bash -python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml -``` - -- 由于赛事未提供验证集数据,因此训练时不做valid。 - -- 您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,参数用法请参考[config](../../tutorials/config.md)。 - - -### NTU-RGBD数据集训练 - -- NTU-RGBD数据集使用4卡训练,启动命令如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_stgcn main.py --validate -c configs/recognition/stgcn/stgcn_ntucs.yaml -``` - -- 配置文件`stgcn_ntucs.yaml`为NTU-RGB+D数据集按cross-subject划分方式对应的训练配置。 - - -## 模型测试 - -### 花样滑冰数据集模型测试 - -- 模型测试的启动命令如下: - -```bash -python3.7 main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -w output/STGCN/STGCN_epoch_00090.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -- 评估结果保存在submission.csv文件中,可在[评测官网](https://aistudio.baidu.com/aistudio/competition/detail/115)提交查看得分。 - -模型在花样滑冰数据集上baseline实验精度如下: - -Test_Data| Top-1 | checkpoints | -| :----: | :----: | :---- | -| Test_A | 59.07 | [STGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) | - - -### NTU-RGB+D数据集模型测试 - -- 模型测试的启动命令如下: - -```bash -python3.7 main.py --test -c configs/recognition/stgcn/stgcn_ntucs.yaml -w output/STGCN/STGCN_best.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -模型在NTU-RGB+D数据集上实验精度如下: - -| split | Top-1 | checkpoints | -| :----: | :----: | :---- | -| cross-subject | 82.28 | [STGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_ntucs.pdparams) | - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml \ - -p data/STGCN_fsd.pdparams \ - -o inference/STGCN -``` - -上述命令将生成预测所需的模型结构文件`STGCN.pdmodel`和模型权重文件`STGCN.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \ - --config configs/recognition/stgcn/stgcn_fsd.yaml \ - --model_file inference/STGCN/STGCN.pdmodel \ - --params_file inference/STGCN/STGCN.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -``` -Current video file: data/fsd10/example_skeleton.npy - top-1 class: 27 - top-1 score: 0.9912770986557007 -``` - -可以看到,使用在花样滑冰数据集上训练好的ST-GCN模型对`data/example_skeleton.npy`进行预测,输出的top1类别id为`27`,置信度为0.9912770986557007。 - - -## 参考论文 - -- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin diff --git a/docs/zh-CN/model_zoo/recognition/timesformer.md b/docs/zh-CN/model_zoo/recognition/timesformer.md deleted file mode 100644 index ae30f9574..000000000 --- a/docs/zh-CN/model_zoo/recognition/timesformer.md +++ /dev/null @@ -1,136 +0,0 @@ -[English](../../../en/model_zoo/recognition/timesformer.md) | 简体中文 - -# TimeSformer视频分类模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -TimeSformer是基于vision transformer的视频分类模型,具有无卷积、全局感受野、时间序列建模能力强的特点。目前在Kinetics-400数据集上达到了SOTA精度,超过了经典的基于CNN的视频分类模型TSN和TSM以及Slowfast,而且具有更短的训练用时(Kinetics-400数据集训练用时39小时)。**本代码实现的是论文中的时间-空间分离的注意力级联网络**。 - -
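时间-空间分离注意力的计算顺序(先时间、后空间)可以用下面的简化示意说明(非官方实现,省略了 class token、LayerNorm 与残差连接等细节,并假设 token 按"帧优先"的顺序排列):

```python
import paddle
import paddle.nn as nn

class DividedSpaceTimeAttention(nn.Layer):
    """时间-空间分离注意力示意:先做时间注意力,再做空间注意力。"""

    def __init__(self, dim, num_heads, num_frames, num_patches):
        super().__init__()
        self.t, self.p = num_frames, num_patches
        self.temporal_attn = nn.MultiHeadAttention(dim, num_heads)
        self.spatial_attn = nn.MultiHeadAttention(dim, num_heads)

    def forward(self, x):
        # x: [B, T*P, D],T 为帧数,P 为每帧 patch 数
        b, _, d = x.shape
        # 1) 时间注意力:同一空间位置、不同帧的 token 之间交互
        xt = x.reshape([b, self.t, self.p, d]).transpose([0, 2, 1, 3]).reshape([b * self.p, self.t, d])
        xt = self.temporal_attn(xt, xt, xt)
        # 2) 空间注意力:同一帧内的 token 之间交互
        xs = xt.reshape([b, self.p, self.t, d]).transpose([0, 2, 1, 3]).reshape([b * self.t, self.p, d])
        xs = self.spatial_attn(xs, xs, xs)
        return xs.reshape([b, self.t * self.p, d])
```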
    - - -## 数据准备 - -K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md) - -UCF101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md) - - -## 模型训练 - -### Kinetics-400数据集训练 - -#### 下载并添加预训练模型 - -1. 下载图像预训练模型[ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams)作为Backbone初始化参数,或通过wget命令下载 - - ```bash - wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams - ``` - -2. 打开`PaddleVideo/configs/recognition/timesformer/timesformer_k400_videos.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "RecognizerTransformer" - backbone: - name: "VisionTransformer" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- Kinetics400数据集使用8卡训练,训练方式的启动命令如下: - - ```bash - # videos数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 # MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - # videos数据格式 - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --amp --validate -c configs/recognition/timesformer/timesformer_k400_videos.yaml - ``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../tutorials/config.md)。 - - -## 模型测试 - -- TimeSformer模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - - ``` - Already save the best model (top1 acc)0.7258 - ``` - -- 由于TimeSformer模型测试模式的采样方式是速度稍慢但精度高一些的**UniformCrop**,与训练过程中验证模式采用的**RandomCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下: - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --test -c configs/recognition/timesformer/timesformer_k400_videos.yaml -w "output/TimeSformer/TimeSformer_best.pdparams" - ``` - - - 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下: - - | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints | - | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: | - | Vision Transformer | UniformCrop | 8 | 224 | 77.29 | [TimeSformer_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) | - - -- 测试时,TimeSformer视频采样策略为使用Linspace采样:时序上,从待采样视频序列的第一帧到最后一帧区间内,均匀生成`num_seg`个稀疏采样点(包括端点);空间上,选择长边两端及中间位置(左中右 或 上中下)3个区域采样。1个视频共采样1个clip。 - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml \ - -p data/TimeSformer_k400.pdparams \ - -o inference/TimeSformer -``` - -上述命令将生成预测所需的模型结构文件`TimeSformer.pdmodel`和模型权重文件`TimeSformer.pdiparams`。 - -- 各参数含义可参考[模型推理方法](../../start.md#2-模型推理) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/timesformer/timesformer_k400_videos.yaml \ - --model_file inference/TimeSformer/TimeSformer.pdmodel \ - --params_file inference/TimeSformer/TimeSformer.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -``` -Current video file: data/example.avi - top-1 class: 5 - top-1 score: 0.9997474551200867 -``` - 
-可以看到,使用在Kinetics-400上训练好的TimeSformer模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。 - -## 参考论文 - -- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani diff --git a/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md b/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md deleted file mode 100644 index 155c4bb04..000000000 --- a/docs/zh-CN/model_zoo/recognition/tokenshift_transformer.md +++ /dev/null @@ -1,124 +0,0 @@ -[English](../../../en/model_zoo/recognition/tokenshift_transformer.md) | 简体中文 - -# Token Shift Transformer视频分类模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -Token Shift Transformer 是基于vision transformer的视频分类模型,具有可解释性强、对超大规模数据具有高判别能力以及处理不同长度输入的灵活性等优点。Token Shift Module 是一种新颖的零参数、零 FLOPs 模块,用于对每个 Transformer编码器内的时间关系进行建模。 - -
    - - -## 数据准备 - -UCF-101数据下载及准备请参考[UCF-101数据准备](../../dataset/ucf101.md) - - -## 模型训练 - -### UCF-101数据集训练 - -#### 下载并添加预训练模型 - -1. 下载图像预训练模型[ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams)作为Backbone初始化参数,或通过wget命令下载 - - ```bash - wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams - ``` - -2. 打开`PaddleVideo/configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "RecognizerTransformer" - backbone: - name: "TokenShiftVisionTransformer" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- UCF-101数据集使用单卡训练,训练方式的启动命令如下: - - ```bash - # videos数据格式 - python3 main.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234 - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - # videos数据格式 - python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234 - ``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../contribute/config.md)。 - - -## 模型测试 - -- Token Shift Transformer模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - - ``` - Already save the best model (top1 acc)0.9201 - ``` - -- 由于Token Shift Transformer模型测试模式的采样方为的**uniform**采样,与训练过程中验证模式采用的**dense**采样不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下: - - ```bash - python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --test --seed=1234 -w 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams' - ``` - - - 当测试配置采用如下参数时,在UCF-101的validation数据集上的测试指标如下: - - | backbone | sampling method | num_seg | target_size | Top-1 | checkpoints | - | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: | - | Vision Transformer | Uniform | 8 | 256 | 92.81 | [TokenShiftTransformer.pdparams](https://drive.google.com/drive/folders/1k_TpAqaJZYJE8C5g5pT9phdyk9DrY_XL?usp=sharing) | - - -- Uniform采样: 时序上,等分成`num_seg`段,每段中间位置采样1帧;空间上,中心位置采样。1个视频共采样1个clip。 - -## 模型推理 - -### 导出inference模型 - -```bash -python3 tools/export_model.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -p 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams' -``` - -上述命令将生成预测所需的模型结构文件`TokenShiftVisionTransformer.pdmodel`和模型权重文件`TokenShiftVisionTransformer.pdiparams`。 - -- 各参数含义可参考[模型推理方法](../../usage.md#5-模型推理) - -### 使用预测引擎推理 - -```bash -python3 tools/predict.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -i 'data/BrushingTeeth.avi' --model_file ./inference/TokenShiftVisionTransformer.pdmodel --params_file ./inference/TokenShiftVisionTransformer.pdiparams -``` - -输出示例如下: - -``` -Current video file: data/BrushingTeeth.avi - top-1 class: 19 - top-1 score: 0.9959074258804321 -``` - -可以看到,使用在UCF-101上训练好的Token Shift Transformer模型对`data/BrushingTeeth.avi`进行预测,输出的top1类别id为`19`,置信度为0.99。通过查阅类别id与名称对应表,可知预测类别名称为`brushing_teeth`。 - -## 参考论文 - -- [Token Shift Transformer for Video Classification](https://arxiv.org/pdf/2108.02432v1.pdf), Zhang H, Hao Y, Ngo C W. 
\ No newline at end of file diff --git a/docs/zh-CN/model_zoo/recognition/tsm.md b/docs/zh-CN/model_zoo/recognition/tsm.md deleted file mode 100644 index b3591040a..000000000 --- a/docs/zh-CN/model_zoo/recognition/tsm.md +++ /dev/null @@ -1,231 +0,0 @@ -[English](../../../en/model_zoo/recognition/tsm.md) | 简体中文 - -# TSM视频分类模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [实现细节](#实现细节) -- [参考论文](#参考论文) - -## 模型简介 - -Temporal Shift Module (TSM) 是当前比较受关注的视频分类模型,通过通道移动的方法在不增加任何额外参数量和计算量的情况下,极大地提升了模型对于视频时间信息的利用能力,并且由于其具有轻量高效的特点,十分适合工业落地。 - -我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目, -使用链接:[Paddle2.1实现视频理解经典模型-TSM](https://aistudio.baidu.com/aistudio/projectdetail/2310889?contributionType=1) - -
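其中"通道移动"(temporal shift)操作本身非常简单,可以用如下示意代码说明(非 PaddleVideo 官方实现,假设输入布局为 `[N*T, C, H, W]`,移动比例 1/8 仅为常见设置):

```python
import paddle

def temporal_shift(x, num_seg, shift_ratio=1.0 / 8):
    """通道位移示意:部分通道沿时间维前移/后移一帧,其余通道保持不变。"""
    nt, c, h, w = x.shape
    x = x.reshape([-1, num_seg, c, h, w])                           # [N, T, C, H, W]
    fold = int(c * shift_ratio)
    pad = paddle.zeros([x.shape[0], 1, fold, h, w], dtype=x.dtype)
    left = paddle.concat([x[:, 1:, :fold], pad], axis=1)            # 前 fold 个通道向前移一帧
    right = paddle.concat([pad, x[:, :-1, fold:2 * fold]], axis=1)  # 其后 fold 个通道向后移一帧
    keep = x[:, :, 2 * fold:]                                       # 其余通道不移动
    out = paddle.concat([left, right, keep], axis=2)
    return out.reshape([nt, c, h, w])

x = paddle.randn([2 * 8, 64, 56, 56])                               # N=2, T=8
print(temporal_shift(x, num_seg=8).shape)                           # [16, 64, 56, 56]
```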
    - - - -本代码实现的模型为**基于单路RGB图像**的TSM网络,Backbone采用ResNet-50结构。 - -详细内容请参考ICCV 2019年论文 [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf) - -## 数据准备 - -Kinetics400数据下载及准备请参考[k400数据准备](../../dataset/k400.md) - -UCF101数据下载及准备请参考[ucf101数据准备](../../dataset/ucf101.md) - -## 模型训练 - -### Kinetics-400数据集训练 - -#### 下载并添加预训练模型 - -1. 加载在ImageNet1000上训练好的ResNet50权重作为Backbone初始化参数[ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams),也可以通过命令行下载 - - ```bash - wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams - ``` - -2. 打开`PaddleVideo/configs/recognition/tsm/tsm_k400_frames.yaml`,将下载好的权重路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "Recognizer2D" - backbone: - name: "ResNetTSM" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- Kinetics400数据集使用8卡训练,frames格式数据的训练启动命令如下: - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml - ``` - -- Kinetics400数据集使用8卡训练,videos格式数据的训练启动命令如下: - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_videos.yaml - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 #MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml - ``` - -- 使用amp混合精度训练时,配合`nhwc`的数据格式有更好的加速效果,其训练启动方式如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 #MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml - ``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,配置文件命名方式为`模型_数据集_文件格式_数据格式.yaml`,具体参数用法请参考[config](../../tutorials/config.md)。 - - - -### UCF-101数据集训练 - -#### 下载并添加预训练模型 - -1. 加载在Kinetics-400上训练好的权重作为Backbone初始化参数[TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams),也可以通过命令行下载 - - ```bash - wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams - ``` - -2. 
打开`PaddleVideo/configs/recognition/tsm/tsm_ucf101_frames.yaml`,将下载好的权重路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "Recognizer2D" - backbone: - name: "ResNetTSM" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- UCF-101数据集使用4卡训练,frames格式数据的训练启动命令如下: - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml - ``` - -- UCF-101数据集使用4卡训练,videos格式数据的训练启动命令如下: - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_videos.yaml - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 #MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml - ``` - -- 使用amp混合精度训练时,配合`nhwc`的数据格式有更好的加速效果,其训练启动方式如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 #MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames_nhwc.yaml - ``` - - -## 模型测试 - -- TSM模型在训练时同步进行测试,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - -```txt -Already save the best model (top1 acc)0.7106 -``` - -- 若需单独运行测试代码,其启动命令如下: - -```bash -python3.7 main.py --test -c configs/recognition/tsm/tsm_k400_frames.yaml -w output/TSM/TSM_best.pdparams -``` -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - ---- - -当测试配置采用如下参数时,在Kinetics-400的validation数据集上的评估精度如下: - -| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints | -| :--------: | :---------------: | :-------: | :-----------: | :-----: | :-----------: | :-----------: | -| ResNet50 | Uniform | NCHW | 8 | 224 | 71.06 | [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) | - -当测试配置采用如下参数时,在UCF-101的validation数据集上的评估精度如下: - -| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints | -| :------: | :-------------: | :-----------------: | :-----: | :---------: | :---: | :---------: | -| ResNet50 | Uniform | NCHW | 8 | 224 | 94.42 | [TSM_ucf101_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_nchw.pdparams) | -| ResNet50 | Uniform | NCHW+AMP | 8 | 224 | 94.40 | [TSM_ucf101_amp_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nchw.pdparams) | -| ResNet50 | Uniform | NHWC+AMP | 8 | 224 | 94.55 | [TSM_ucf101_amp_nhwc.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nhwc.pdparams) | - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml \ - -p data/TSM_k400.pdparams \ - -o inference/TSM -``` - -上述命令将生成预测所需的模型结构文件`TSM.pdmodel`和模型权重文件`TSM.pdiparams`。 - -各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/tsm/tsm_k400_frames.yaml \ - --model_file inference/TSM/TSM.pdmodel \ - --params_file inference/TSM/TSM.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -## 实现细节 - -**数据处理** - -- 
模型读取Kinetics-400数据集中的`mp4`数据,首先将每条视频数据划分成`num_seg`段,然后均匀地从每段中抽取1帧图像,得到稀疏采样的`num_seg`张视频帧,再对这`num_seg`帧图像做同样的随机数据增强,包括多尺度的随机裁剪、随机左右翻转、数据归一化等,最后缩放至`target_size`。 - -**训练策略** - -- 采用Momentum优化算法训练,momentum=0.9 -- 采用L2_Decay,权重衰减系数为1e-4 -- 采用全局梯度裁剪,裁剪系数为20.0 -- 总epoch数为50,学习率在epoch达到20、40进行0.1倍的衰减 -- FC层的权重与偏置的学习率分别为为整体学习率的5倍、10倍,且偏置不设置L2_Decay -- Dropout_ratio=0.5 - -**参数初始化** - -- 以Normal(mean=0, std=0.001)的正态分布来初始化FC层的权重,以常数0来初始化FC层的偏置 - -## 参考论文 - -- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han - diff --git a/docs/zh-CN/model_zoo/recognition/tsn.md b/docs/zh-CN/model_zoo/recognition/tsn.md deleted file mode 100644 index 6564e26af..000000000 --- a/docs/zh-CN/model_zoo/recognition/tsn.md +++ /dev/null @@ -1,120 +0,0 @@ -简体中文 | [English](../../../en/model_zoo/recognition/tsn.md) - -# TSN - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [实现细节](#实现细节) -- [参考论文](#参考论文) - -## 模型简介 - -Temporal Segment Network (TSN) 是视频分类领域经典的基于2D-CNN的解决方案。该方法主要解决视频的长时间行为识别问题,通过稀疏采样视频帧的方式代替稠密采样,既能捕获视频的全局信息,也能去除冗余,降低计算量。核心思想是将每帧的特征做平均融合作为视频的整体特征,再输入分类器进行分类。本代码实现的模型为**基于单路RGB图像**的TSN网络,Backbone采用ResNet-50结构。 - -我们提供了详尽理论及代码讲解,并可使用免费在线GPU算力资源,一键运行的AI Studio Notebook项目,使用链接:[Paddle 2.1实现视频理解经典模型-TSN](https://aistudio.baidu.com/aistudio/projectdetail/2250682?contributionType=1) - -
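上面提到的分段稀疏采样可以用几行代码说明(仅为思路示意,并非 PaddleVideo pipeline 的实际实现,帧数与段数为示例):

```python
import random

def segment_sample(num_frames, num_seg, mode="train"):
    """把视频等分为 num_seg 段,每段取 1 帧:训练时段内随机取,测试时取中间帧。"""
    seg_len = num_frames / num_seg
    idx = []
    for i in range(num_seg):
        start, end = int(seg_len * i), int(seg_len * (i + 1))
        if mode == "train":
            idx.append(random.randint(start, max(start, end - 1)))  # 段内随机采 1 帧
        else:
            idx.append((start + end) // 2)                          # 段内取中间帧
    return idx

print(segment_sample(250, 8, mode="test"))  # 例如 250 帧视频、num_seg=8
```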
    - -详细内容请参考ECCV 2016年的论文[Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859) - -## 数据准备 - -PaddleVide提供了在Kinetics-400数据集上训练和测试练脚本。Kinetics-400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md) - -## 模型训练 - -### Kinetics-400数据集训练 - -#### 下载并添加预训练模型 - -1. 加载在ImageNet1000上训练好的ResNet50权重作为Backbone初始化参数[ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams),也可以通过命令行下载 - - ```bash - wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams - ``` - -2. 打开`PaddleVideo/configs/recognition/tsn/tsn_k400_frames.yaml`,将下载好的权重路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "Recognizer2D" - backbone: - name: "ResNet" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- Kinetics-400数据集使用8卡训练,frames格式数据的训练启动命令如下 - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --validate -c configs/recognition/tsn/tsn_k400_frames.yaml - ``` - -## 模型测试 - -由于TSN模型测试模式的采样方式是速度稍慢但精度高一些的**TenCrop**,与训练过程中验证模式采用的**CenterCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对最好的模型进行测试获取最终的指标,命令如下: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --test -c configs/recognition/tsn/tsn_k400_frames.yaml -w "output/TSN/TSN_best.pdparams" -``` - -当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下: - -| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints | -| :------: | :-------------: | :---------------: | :-----: | :---------: | :---: | ------------------------------------------------------------ | -| ResNet50 | TenCrop | NCHW | 3 | 224 | 69.81 | [TSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) | -| ResNet50 | TenCrop | NCHW | 8 | 224 | 71.70 | [TSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400_8.pdparams) | -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml \ - -p data/TSN_k400.pdparams \ - -o inference/TSN -``` - -上述命令将生成预测所需的模型结构文件`TSN.pdmodel`和模型权重文件`TSN.pdiparams`。 - -各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-模型推理) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/tsn/tsn_k400_frames.yaml \ - --model_file inference/TSN/TSN.pdmodel \ - --params_file inference/TSN/TSN.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -## 实现细节 - -**数据处理:** - -- 模型读取Kinetics-400数据集中的`mp4`数据,首先将每条视频数据划分成`num_seg`段,然后均匀地从每段中抽取1帧图像,得到稀疏采样的`num_seg`张视频帧,再对这`num_seg`帧图像做同样的随机数据增强,包括多尺度的随机裁剪、随机左右翻转、数据归一化等,最后缩放至`target_size` - -**训练策略:** - -- 采用Momentum优化算法训练,momentum=0.9 -- 采用L2_Decay,权重衰减系数为1e-4 -- 采用全局梯度裁剪,裁剪系数为40.0 -- 总epoch数为100,学习率在epoch达到40、80进行0.1倍的衰减 -- Dropout_ratio=0.4 - -**参数初始化** - -- TSN模型的卷积层采用Paddle默认的[KaimingNormal](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/KaimingNormal_cn.html#kaimingnormal)和[Constant](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/Constant_cn.html#constant)初始化方法,以Normal(mean=0, std=0.01)的正态分布来初始化FC层的权重,以常数0来初始化FC层的偏置 - -## 参考论文 - -- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool diff --git 
a/docs/zh-CN/model_zoo/recognition/tsn_dali.md b/docs/zh-CN/model_zoo/recognition/tsn_dali.md deleted file mode 100644 index b9b2d1ed6..000000000 --- a/docs/zh-CN/model_zoo/recognition/tsn_dali.md +++ /dev/null @@ -1,111 +0,0 @@ -[English](../../../en/model_zoo/recognition/tsn_dali.md) | 简体中文 - -# TSN模型-DALI训练加速 - -- [方案简介](#方案简介) -- [环境配置](#环境配置) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考文献](#参考文献) - -## 方案简介 -训练速度慢是视频模型训练常见的问题,PaddleVideo使用飞桨2.0的dataloader接口进行数据读取,凭借其优异的多进程加速能力,模型的训练速度可以显著增加。TSN是视频领域常用的2D模型,我们对其训练速度进行了进一步优化。基于[nvidia DALI](https://github.com/NVIDIA/DALI)的GPU解码能力,我们对nvidia DALI进行了二次开发,实现了其均匀分段的帧采样方式,进一步提升了TSN模型的训练速度。 - -### 性能 - -测试环境: -``` -机器: Tesla v100 -显存: 4卡16G -Cuda: 9.0 -单卡batch_size: 32 -``` - -训练速度对比如下: - -| 加速方式 | batch耗时/s | reader耗时/s | ips:instance/sec | 加速比 | -| :--------------- | :--------: | :------------: | :------------: | :------------: | -| DALI | 2.083 | 1.804 | 15.36597 | 1.41x | -| Dataloader: 单卡num_workers=4 | 2.943 | 2.649 | 10.87460| base | -| pytorch实现 | TODO | TODO | TODO | TODO | - - -## 环境配置 - -我们提供docker运行环境方便您使用,基础镜像为: - -``` - huangjun12/paddlevideo:tsn_dali_cuda9_0 -``` - -基于以上docker镜像创建docker容器,运行命令为: - -```bash -nvidia-docker run --name tsn-DALI -v /home:/workspace --network=host -it --shm-size 64g -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video huangjun12/paddlevideo:tsn_dali_cuda9_0 /bin/bash -``` -- docker中安装好了飞桨2.0.0-rc1版本和我们二次开发后的DALI,创建容器后您可以在docker环境中直接开始tsn模型训练,无需额外配置环境。 - -## 数据准备 - -PaddleVide提供了在K400和UCF101两种数据集上训练TSN的训练脚本。 - -- K400数据下载及准备请参考[K400数据准备](../../dataset/k400.md) - -- UCF101数据下载及准备请参考[UCF101数据准备](../../dataset/ucf101.md) - -## 模型训练 - -### 预训练模型下载 - -加载在ImageNet1000上训练好的ResNet50权重作为Backbone初始化参数,请下载此[模型参数](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams), -或是通过命令行下载 - -```bash -wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams -``` - -并将路径添加到configs中backbone字段下 - -```yaml -MODEL: - framework: "Recognizer2D" - backbone: - name: "ResNet" - pretrained: 将路径填写到此处 -``` - -### 开始训练 - -模型训练的启动命令为: - -```bash -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml -o log_level="INFO" -``` - -- 通过`-c`指定模型训练参数配置文件,模型及训练参数配置请参考配置文件```configs/recognition/tsn/tsn_dali.yaml```。 - -- 如若进行finetune,请下载PaddleVideo的已发布模型[comming soon](), 通过`--weights`指定权重存放路径可进行模型finetune。 - -- 您可以自定义修改参数配置,参数用法请参考[config](../../tutorials/config.md)。 - -## 模型测试 - -模型测试方法请参考TSN模型使用文档[模型测试部分](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn.md#模型测试) - -## 模型推理 - -模型推理方法请参考TSN模型使用文档[模型推理部分](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn.md#模型推理) - -## 参考论文 - -- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool - - - - - - - - diff --git a/docs/zh-CN/model_zoo/recognition/videoswin.md b/docs/zh-CN/model_zoo/recognition/videoswin.md deleted file mode 100644 index f9d5a747d..000000000 --- a/docs/zh-CN/model_zoo/recognition/videoswin.md +++ /dev/null @@ -1,139 +0,0 @@ -[English](../../../en/model_zoo/recognition/videoswin.md) | 简体中文 - -# Video-Swin-Transformer视频分类模型 - -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - - -## 模型简介 - -Video-Swin-Transformer是基于Swin 
Transformer的视频分类模型,其利用了Swin Transformer的多尺度建模和高效局部注意力特性,目前在Kinetics-400数据集上达到了SOTA精度,超过了同为transformer结构的TimeSformer模型。 - - -![VideoSwin](../../../images/videoswin.jpg) - -- **注:运行VideoSwin模型,请安装PaddlePaddle==2.3.1及以上版本** - -```bash -python3.7 -m pip install paddlepaddle-gpu==2.3.1 -i https://pypi.tuna.tsinghua.edu.cn/simple -``` - -## 数据准备 - -K400数据下载及准备请参考[Kinetics-400数据准备](../../dataset/k400.md) - - -## 模型训练 - -### Kinetics-400数据集训练 - -下面以VideoSwin_base模型在Kinetics-400数据集进行训练为例 - -#### 下载并添加预训练模型 - -1. 下载图像预训练模型[swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams)作为Backbone初始化参数,或通过wget命令下载 - - ```bash - wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams # ImageNet pretrained model for VideoSwin_base - - # wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams # Imagenet pretrained model for VideoSwin_small - ``` - -2. 打开`configs/recognition/videoswin/videoswin_base_k400_videos.yaml`,将下载好的权重存放路径填写到下方`pretrained:`之后 - - ```yaml - MODEL: - framework: "RecognizerTransformer" - backbone: - name: "SwinTransformer3D" - pretrained: 将路径填写到此处 - ``` - -#### 开始训练 - -- Kinetics400数据集使用8卡训练,训练方式的启动命令如下: - - ```bash - # videos数据格式 - python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_base main.py --validate -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml - ``` - -- 开启amp混合精度训练,可加速训练过程,其训练启动命令如下: - - ```bash - export FLAGS_conv_workspace_size_limit=800 # MB - export FLAGS_cudnn_exhaustive_search=1 - export FLAGS_cudnn_batchnorm_spatial_persistent=1 - # videos数据格式 - python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_base main.py --amp --validate -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml - ``` - -- 另外您可以自定义修改参数配置,以达到在不同的数据集上进行训练/测试的目的,建议配置文件的命名方式为`模型_数据集名称_文件格式_数据格式_采样方式.yaml`,参数用法请参考[config](../../contribute/config.md)。 - - -## 模型测试 - -- Video-Swin-Transformer模型在训练时同步进行验证,您可以通过在训练日志中查找关键字`best`获取模型测试精度,日志示例如下: - - ```log - Already save the best model (top1 acc)0.7258 - ``` - -- 由于Video-Swin-Transformer模型测试模式的采样方式是速度稍慢但精度高一些的**UniformCrop**,与训练过程中验证模式采用的**CenterCrop**不同,所以训练日志中记录的验证指标`topk Acc`不代表最终的测试分数,因此在训练完成之后可以用测试模式对指定的模型进行测试获取最终的指标,命令如下: - - ```bash - python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_base main.py --test -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml -w "output/VideoSwin_base/VideoSwin_base_best.pdparams" - ``` - - 当测试配置采用如下参数时,在Kinetics-400的validation数据集上的测试指标如下: - - | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints | pretrain model | - | :--------------------: | :-------------: | :-----: | :---------: | :---- | :------------------------------------------------------------------------------------------------------------------------: | :----: | - | Swin-Transformer_base | UniformCrop | 32 | 224 | 82.40 | [SwinTransformer_k400_base.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_base_k400.pdparams) | [swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams) | - | Swin-Transformer_small | UniformCrop | 32 | 224 | 80.18 | [SwinTransformer_k400_small.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_small_k400.pdparams) | 
[swin_small_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams) | - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml \ - -p data/VideoSwin_base_k400.pdparams \ - -o inference/VideoSwin_base -``` - -上述命令将生成预测所需的模型结构文件`VideoSwin_base.pdmodel`和模型权重文件`VideoSwin_base.pdiparams`。 - -- 各参数含义可参考[模型推理方法](../../usage.md#5-模型推理) - -### 使用预测引擎推理 - -```bash -python3.7 tools/predict.py --input_file data/example.avi \ - --config configs/recognition/videoswin/videoswin_base_k400_videos.yaml \ - --model_file inference/VideoSwin_base/VideoSwin_base.pdmodel \ - --params_file inference/VideoSwin_base/VideoSwin_base.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -```log -Current video file: data/example.avi - top-1 class: 5 - top-1 score: 0.9999829530715942 -``` - -可以看到,使用在Kinetics-400上训练好的Video-Swin-Transformer模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为0.99。通过查阅类别id与名称对应表`data/k400/Kinetics-400_label_list.txt`,可知预测类别名称为`archery`。 - -## 参考论文 - -- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei diff --git a/docs/zh-CN/model_zoo/segmentation/asrf.md b/docs/zh-CN/model_zoo/segmentation/asrf.md deleted file mode 100644 index 8394916d8..000000000 --- a/docs/zh-CN/model_zoo/segmentation/asrf.md +++ /dev/null @@ -1,142 +0,0 @@ -[English](../../../en/model_zoo/segmentation/asrf.md) | 简体中文 - -# ASRF 视频动作分割模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - -## 模型简介 - -ASRF模型是在视频动作分割模型MS-TCN上的改进,发表在2021年的WACV上。我们对官方实现的pytorch代码进行复现,在PaddleVideo获得了近似的结果。 - -
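ASRF 在帧级分类之外增加了动作边界预测分支,并用预测出的边界对帧级结果做平滑,从而缓解过分割问题。其"按边界切段、段内重新决策"的思路可以用如下简化示意说明(numpy 实现,阈值与细节与论文实现并不完全一致,仅供理解):

```python
import numpy as np

def refine_with_boundaries(frame_probs, boundary_probs, thresh=0.5):
    """按预测边界把序列切段,段内用平均类别概率重新取 argmax(简化示意)。"""
    boundaries = np.where(boundary_probs > thresh)[0]        # 边界帧下标
    starts = np.concatenate([[0], boundaries])
    ends = np.concatenate([boundaries, [len(frame_probs)]])
    refined = frame_probs.argmax(axis=1).copy()              # 帧级初始预测
    for s, e in zip(starts, ends):
        if e > s:
            refined[s:e] = frame_probs[s:e].mean(axis=0).argmax()  # 段内重新决策
    return refined

T, num_classes = 100, 11
frame_probs = np.random.rand(T, num_classes)
boundary_probs = np.random.rand(T)
print(refine_with_boundaries(frame_probs, boundary_probs).shape)  # (100,)
```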

-（图：ASRF Overview）

    - -## 数据准备 - -ASRF的训练数据可以选择50salads、breakfast、gtea三个数据集,数据下载及准备请参考[视频动作分割数据集](../../dataset/SegmentationDataset.md) - -不同于MS-TCN,ASRF模型需要额外的数据构建,脚本流程如下 -```bash -python data/50salads/prepare_asrf_data.py --dataset_dir data/ -``` - -## 模型训练 - -数据准备完毕后,可以通过如下方式启动训练: - -```bash -# gtea数据集 -export CUDA_VISIBLE_DEVICES=3 -python3.7 main.py --validate -c configs/segmentation/asrf/asrf_gtea.yaml --seed 1538574472 -``` - -- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型,视频动作分割模型通常为全卷积网络,由于视频的长度不一,故视频动作分割模型的batch_size字段通常设为1,即不需要批量训练,目前也仅支持**单样本**训练 - -## 模型测试 - -可通过如下方式进行模型测试: - -```bash -python main.py --test -c configs/segmentation/asrf/asrf_gtea.yaml --weights=./output/ASRF/ASRF_split_1.pdparams -``` - -- 指标的具体实现是参考MS-TCN作者[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py)提供的测试脚本,计算Acc、Edit和F1分数。 - -- pytorch的复现来源于官方提供的[代码库](https://github.com/yiskw713/asrf) - -- 数据集的评估方法采用MS-TCN论文中的折交验证方法,而折交的划分方式与MS-TCN论文中相同。 - -在Breakfast数据集下评估精度如下(采用4折交验证): - -| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | -| :---: | :---: | :---: | :---: | :---: | :---: | -| paper | 67.6% | 72.4% | 74.3% | 68.9% | 56.1% | -| pytorch | 65.8% | 71.0% | 72.3% | 66.5% | 54.9% | -| paddle | 66.1% | 71.9% | 73.3% | 67.9% | 55.7% | - -在50salads数据集下评估精度如下(采用5折交验证): - -| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | -| :---: | :---: | :---: | :---: | :---: | :---: | -| paper | 84.5% | 79.3% | 82.9% | 83.5% | 77.3% | -| pytorch | 81.4% | 75.6% | 82.7% | 81.2% | 77.2% | -| paddle | 81.6% | 75.8% | 83.0% | 81.5% | 74.8% | - -在gtea数据集下评估精度如下(采用4折交验证): - -| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | -| :---: | :---: | :---: | :---: | :---: | :---: | -| paper | 77.3% | 83.7% | 89.4% | 87.8% | 79.8% | -| pytorch | 76.3% | 79.6% | 87.3% | 85.8% | 74.9% | -| paddle | 77.1% | 83.3% | 88.9% | 87.5% | 79.1% | - -给出在gtea数据集下的折交的模型权重 - -Test_Data| F1@0.5 | checkpoints | -| :----: | :----: | :---- | -| gtea_split1 | 72.4409 | [ASRF_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_1.pdparams) | -| gtea_split2 | 76.6666 | [ASRF_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_2.pdparams) | -| gtea_split3 | 84.5528 | [ASRF_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_3.pdparams) | -| gtea_split4 | 82.6771 | [ASRF_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_4.pdparams) | - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/segmentation/asrf/asrf_gtea.yaml \ - -p data/ASRF_gtea_split_1.pdparams \ - -o inference/ASRF -``` - -上述命令将生成预测所需的模型结构文件`ASRF.pdmodel`和模型权重文件`ASRF.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -输入预测模型的txt文件为需要预测的文件列表,如: -``` -S1_Cheese_C1.npy -S1_CofHoney_C1.npy -S1_Coffee_C1.npy -S1_Hotdog_C1.npy -... 
-``` - -```bash -python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \ - --config configs/segmentation/asrf/asrf_gtea.yaml \ - --model_file inference/ASRF/ASRF.pdmodel \ - --params_file inference/ASRF/ASRF.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -```bash -result write in : ./inference/infer_results/S1_Cheese_C1.txt -result write in : ./inference/infer_results/S1_CofHoney_C1.txt -result write in : ./inference/infer_results/S1_Coffee_C1.txt -result write in : ./inference/infer_results/S1_Hotdog_C1.txt -result write in : ./inference/infer_results/S1_Pealate_C1.txt -result write in : ./inference/infer_results/S1_Peanut_C1.txt -result write in : ./inference/infer_results/S1_Tea_C1.txt -``` - - -## 参考论文 - -- [Alleviating Over-segmentation Errors by Detecting Action Boundaries](https://arxiv.org/pdf/2007.06866v1.pdf), Yuchi Ishikawa, Seito Kasai, Yoshimitsu Aoki, Hirokatsu Kataoka diff --git a/docs/zh-CN/model_zoo/segmentation/cfbi.md b/docs/zh-CN/model_zoo/segmentation/cfbi.md deleted file mode 100644 index fac110bce..000000000 --- a/docs/zh-CN/model_zoo/segmentation/cfbi.md +++ /dev/null @@ -1,49 +0,0 @@ -[English](../../../en/model_zoo/segmentation/cfbi.md) | 简体中文 - -# CFBI视频分割模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型测试](#模型测试) -- [参考论文](#参考论文) - - -## 模型简介 - -CFBI是百度在ECCV 2020提出的视频目标分割模型,该模型基于前背景整合的协作式方法,将前景目标对象与背景对象的嵌入特征进行对比,从而提升视频分割的效果。给定参考帧(第一帧)和前一帧的图像和目标分割,模型会预测出当前帧的分割。 - -
    - - -## 数据准备 - -DAVIS数据下载及准备请参考[DAVIS数据准备](../../../../applications/Ma-Net/dataloaders/DAVIS2017_cn.md) - - -## 模型测试 - -- 测试启动脚本如下: - -```bash -python3.7 main.py --test -c configs/segmentation/cfbip_davis.yaml -w CFBIp_davis.pdparams -``` - -- 通过`-c`参数指定配置文件,通过`-w`指定权重存放路径进行模型测试。 - -- 运行上述命令,会将结果保存在配置文件中指定的`result_root`下,获取数值评估指标,请使用[davis2017-evaluation工具](https://github.com/davisvideochallenge/davis2017-evaluation)。 - -DAVIS数据集测试精度: - -| J&F-Mean | J-Mean | J-Recall | J-Decay | F-Mean | F-Recall | F-Decay | checkpoints | -| :------: | :-----: | :----: | :----: | :----: | :----: | :----: | :----: | -| 0.823 | 0.793 | 0.885 | 0.083 | 0.852 | 0.932 | 0.100 | [CFBIp_r101_davis.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/CFBIp_r101_davis.pdparams) | - - -## 参考论文 - -- [Collaborative Video Object Segmentation by Foreground-Background Integration](https://arxiv.org/abs/2003.08333), Zongxin Yang, Yunchao Wei, Yi Yang diff --git a/docs/zh-CN/model_zoo/segmentation/mstcn.md b/docs/zh-CN/model_zoo/segmentation/mstcn.md deleted file mode 100644 index fac5b7b87..000000000 --- a/docs/zh-CN/model_zoo/segmentation/mstcn.md +++ /dev/null @@ -1,131 +0,0 @@ -[English](../../../en/model_zoo/segmentation/mstcn.md) | 简体中文 - -# MS-TCN 视频动作分割模型 - ---- -## 内容 - -- [模型简介](#模型简介) -- [数据准备](#数据准备) -- [模型训练](#模型训练) -- [模型测试](#模型测试) -- [模型推理](#模型推理) -- [参考论文](#参考论文) - -## 模型简介 - -MS-TCN模型是视频动作分割模型的经典的模型,发表在2019年的CVPR上。我们对官方实现的pytorch代码进行一些优化,在PaddleVideo获得了更高精度的结果。 - -
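MS-TCN 由多个 stage 串联组成,每个 stage 内堆叠若干膨胀率逐层增大的膨胀残差卷积层。单个膨胀残差层可以用如下示意代码说明(paddle 实现,通道数与膨胀率为示例,非官方代码):

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class DilatedResidualLayer(nn.Layer):
    """MS-TCN 单层膨胀残差卷积的简化示意。"""

    def __init__(self, dilation, in_c, out_c):
        super().__init__()
        self.conv_dilated = nn.Conv1D(in_c, out_c, 3, padding=dilation, dilation=dilation)
        self.conv_1x1 = nn.Conv1D(out_c, out_c, 1)

    def forward(self, x):
        out = F.relu(self.conv_dilated(x))   # 膨胀卷积扩大时间感受野
        out = self.conv_1x1(out)
        return x + out                       # 残差连接

x = paddle.randn([1, 64, 1000])              # [N, C, T]:1000 帧的特征序列
layer = DilatedResidualLayer(dilation=4, in_c=64, out_c=64)
print(layer(x).shape)                        # [1, 64, 1000]
```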

-（图：MS-TCN Overview）

    - -## 数据准备 - -MS-TCN的训练数据可以选择50salads、breakfast、gtea三个数据集,数据下载及准备请参考[视频动作分割数据集](../../dataset/SegmentationDataset.md) - -## 模型训练 - -数据准备完毕后,可以通过如下方式启动训练: - -```bash -# gtea数据集 -export CUDA_VISIBLE_DEVICES=3 -python3.7 main.py --validate -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --seed 1538574472 -``` - -- 从头开始训练,使用上述启动命令行或者脚本程序即可启动训练,不需要用到预训练模型,视频动作分割模型通常为全卷积网络,由于视频的长度不一,故视频动作分割模型的batch_size字段通常设为1,即不需要批量训练,目前也仅支持**单样本**训练 - -## 模型测试 - -可通过如下方式进行模型测试: - -```bash -python main.py --test -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --weights=./output/MSTCN/MSTCN_split_1.pdparams -``` - -- 指标的具体实现是参考MS-TCN作者[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py)提供的测试脚本,计算Acc、Edit和F1分数。 - -- 数据集的评估方法采用MS-TCN论文中的折交验证方法,而折交的划分方式与MS-TCN论文中相同。 - -在Breakfast数据集下评估精度如下(采用4折交验证): - -| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | -| :---: | :---: | :---: | :---: | :---: | :---: | -| paper | 66.3% | 61.7% | 48.1% | 48.1% | 37.9% | -| paddle | 65.2% | 61.5% | 53.7% | 49.2% | 38.8% | - -在50salads数据集下评估精度如下(采用5折交验证): - -| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | -| :---: | :---: | :---: | :---: | :---: | :---: | -| paper | 80.7% | 67.9% | 76.3% | 74.0% | 64.5% | -| paddle | 81.1% | 71.5% | 77.9% | 75.5% | 66.5% | - -在gtea数据集下评估精度如下(采用4折交验证): - -| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | -| :---: | :---: | :---: | :---: | :---: | :---: | -| paper | 79.2% | 81.4% | 87.5% | 85.4% | 74.6% | -| paddle | 76.9% | 81.8% | 86.4% | 84.7% | 74.8% | - -给出在gtea数据集下的折交的模型权重 - -Test_Data| F1@0.5 | checkpoints | -| :----: | :----: | :---- | -| gtea_split1 | 70.2509 | [MSTCN_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_1.pdparams) | -| gtea_split2 | 70.7224 | [MSTCN_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_2.pdparams) | -| gtea_split3 | 80.0 | [MSTCN_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_3.pdparams) | -| gtea_split4 | 78.1609 | [MSTCN_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_4.pdparams) | - - -## 模型推理 - -### 导出inference模型 - -```bash -python3.7 tools/export_model.py -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \ - -p data/MSTCN_gtea_split_1.pdparams \ - -o inference/MSTCN -``` - -上述命令将生成预测所需的模型结构文件`MSTCN.pdmodel`和模型权重文件`MSTCN.pdiparams`。 - -- 各参数含义可参考[模型推理方法](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) - -### 使用预测引擎推理 - -输入预测模型的txt文件为需要预测的文件列表,如: -``` -S1_Cheese_C1.npy -S1_CofHoney_C1.npy -S1_Coffee_C1.npy -S1_Hotdog_C1.npy -... 
-``` - -```bash -python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \ - --config configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \ - --model_file inference/MSTCN/MSTCN.pdmodel \ - --params_file inference/MSTCN/MSTCN.pdiparams \ - --use_gpu=True \ - --use_tensorrt=False -``` - -输出示例如下: - -```bash -result write in : ./inference/infer_results/S1_Cheese_C1.txt -result write in : ./inference/infer_results/S1_CofHoney_C1.txt -result write in : ./inference/infer_results/S1_Coffee_C1.txt -result write in : ./inference/infer_results/S1_Hotdog_C1.txt -result write in : ./inference/infer_results/S1_Pealate_C1.txt -result write in : ./inference/infer_results/S1_Peanut_C1.txt -result write in : ./inference/infer_results/S1_Tea_C1.txt -``` - -## 参考论文 - -- [MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation](https://arxiv.org/pdf/1903.01945.pdf), Y. Abu Farha and J. Gall. diff --git a/docs/zh-CN/quick_start.md b/docs/zh-CN/quick_start.md deleted file mode 100644 index 614486228..000000000 --- a/docs/zh-CN/quick_start.md +++ /dev/null @@ -1,158 +0,0 @@ -简体中文 | [English](../en/quick_start.md) - -# PaddleVideo快速开始 - -- [1. 安装](#1) - - [1.1 安装PaddlePaddle](#11) - - [1.2 安装PaddleVideo Whl包](#12) -- [2. 便捷使用](#2) - - [2.1 命令行使用](#21) - - [2.2 Python脚本使用](#22) -- [3.参数介绍](#3) -- [4.常见问题](#4) - -## 1. 安装 - - -### 1.1 安装PaddlePaddle - -- 您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 - - ```bash - python3.7 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple - ``` - -- 您的机器是CPU,请运行以下命令安装 - - ```bash - python3.7 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple - ``` - -更多的版本需求,请参照[飞桨官网安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 - - -### 1.2 安装PaddleVideo whl包 -- 方式1: 使用pypi安装(建议使用) - -```bash -pip3.7 install ppvideo==2.3.0 -``` - - -- 方式2: 本地打包whl文件并安装 -```bash -python3.7 setup.py bdist_wheel -python3.7 -m pip install dist/ppvideo-2.3.0-py3-none-any.whl -``` - - -## 2. 便捷使用 - - -### 2.1 命令行使用 - -安装完成后,运行如下脚本命令: -```bash -ppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi' -``` - -- 上述代码使用`PP-TSM_v2`模型,基于`CPU`,对`data/example.avi`示例文件进行预测。 - -- 示例视频长度约10s,抽帧策略采用分段抽帧,即先将视频按时间轴等分成16段,每段抽取一帧,所有帧组合之后,输入网络进行预测。 - -运行结果如下: - -``` -Current video file: data/example.avi - top-1 classes: [5] - top-1 scores: [1.] - top-1 label names: ['archery'] -``` - -可以看到,使用在Kinetics-400上训练好的`PP-TSM_v2`模型对`data/example.avi`进行行为识别,输出的top1类别id为`5`,置信度为`1.0`,预测类别名称为`archery`。 - - -### 2.2 python脚本使用 - -安装完成后,运行如下示例代码: - -```python -from ppvideo import PaddleVideo -clas = PaddleVideo(model_name='ppTSM_v2', use_gpu=False) -video_file='data/example.avi' -clas.predict(video_file) -``` - -上述代码使用`PP-TSM_v2`模型,基于`CPU`,对`data/example.avi`示例文件进行预测,运行结果如下: - -``` -Current video file: data/example.avi - top-1 classes: [5] - top-1 scores: [1.] - top-1 label names: ['archery'] -``` - -可以看到,使用在Kinetics-400上训练好的`PP-TSM_v2`模型对`data/example.avi`进行预测,输出的top1类别id为`5`,置信度为`1.0`,预测类别名称为`archery`。 - - -## 3. 
参数介绍 - -| 参数名称 | 参数类型 | 参数含义 | -| :---: | :---: | :--- | -| model_name | str | 可选,模型名称,`'ppTSM`'或`'ppTSM_v2'`。 如果不指定,需要通过`model_file`和`params_file`,提供自己的推理模型文件路径进行推理。 | -| video_file | str | 必选,视频文件路径,支持格式:单个视频文件路径,包含多个视频的文件夹。 | -| use_gpu | bool | 是否使用GPU,默认为True。 | -| num_seg | int | TSM分段采样策略中segment的数量,同时也是视频中抽帧的数量,8对应`ppTSM`模型,16对应`ppTSM_v2`模型,默认为16。 | -| short_size | int | 帧的短边尺寸大小,默认为256。| -| target_size | int | 帧的目标尺寸大小,默认为224。| -| model_file | str | 可选,推理模型的模型文件(`.pdmodel`)的路径。| -| params_file | str | 可选,推理模型的参数文件(`.pdiparams`)的路径。| -| batch_size | int | Batch size, 默认为1。| -| use_fp16 | bool | 是否使用float16,默认为False。| -| use_tensorrt | bool| 是否使用Tensorrt,默认为False。| -| gpu_mem | int | GPU使用显存大小,默认为8000。| -| enable_mkldnn | bool | 是否使用MKLDNN,默认为False。| -| top_k | int | 指定返回的top_k,默认为1。| -| label_name_path | str | 类别id和类别名称对应关系文件。默认使用Kinetics-400数据集使用的标签文件`data/k400/Kinetics-400_label_list.txt`,可参考以上格式替换成自己的标签文件。| - -示例命令1: -```bash -ppvideo --model_name='ppTSM_v2' --num_seg=16 --video_file="data/mp4" --batch_size=2 --top_k=5 -``` -- 命令表示使用`PP-TSM_v2`模型,对`data/mp4`文件夹下的所有视频文件进行推理,`batch_size`为2,输出`top5`结果。 -- `ppTSM`对应的`num_seg`为8,`ppTSM_v2`对应的`num_seg`为16。 -- 使用GPU预测,占用显存约为`1400MB`。 - -输出示例: -```txt -Current video file: data/mp4/example3.avi - top-5 classes: [ 5 345 311 159 327] - top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] - top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] -Current video file: data/mp4/example2.avi - top-5 classes: [ 5 345 311 159 327] - top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] - top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] -Current video file: data/mp4/example.avi - top-5 classes: [ 5 345 311 159 327] - top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] - top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] -Current video file: data/mp4/example1.avi - top-5 classes: [ 5 345 311 159 327] - top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] - top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] -``` - -示例命令2: -```bash -ppvideo --model_name='ppTSM' --num_seg=8 --video_file="data/mp4" --batch_size=2 --top_k=5 -``` -- 命令表示使用`ppTSM`模型进行推理。 - - -## 4. 常见问题 - -1. 在下载opecv-python的过程中你可能遇到困难,可以尝试使用其他源进行安装: -``` -python3.7 -m pip install opencv-python==4.2.0.32 -i https://pypi.doubanio.com/simple -``` diff --git a/docs/zh-CN/tools.md b/docs/zh-CN/tools.md deleted file mode 100644 index 50d5c7e47..000000000 --- a/docs/zh-CN/tools.md +++ /dev/null @@ -1,19 +0,0 @@ -简体中文 | [English](../en/tools.md) - -# 小工具 - -这篇文档主要介绍PaddleVideo的一些小工具 - -## 统计 Params - -```shell -python3.7 tools/summary.py -c configs/recognition/tsm/tsm.yaml -``` - -## 统计FLOPS - -```shell -python3.7 tools/summary.py -c configs/recognition/tsm/tsm.yaml --FLOPs -``` - -## 测试导出模型 coming soon diff --git a/docs/zh-CN/tutorials/I3D.md b/docs/zh-CN/tutorials/I3D.md deleted file mode 100644 index bd2c32a54..000000000 --- a/docs/zh-CN/tutorials/I3D.md +++ /dev/null @@ -1,90 +0,0 @@ -# I3D - -## 简介 -本文提出了一种基于 2D-ConvNet 扩展的双流膨胀模型 I3D,作者将图像分类的 2D 卷积网络的滤波器和池化核扩展到 3D 中,使得从视频中学习无缝时空特征提取器成为可能。 - -## 重点贡献 -* 提出了 Kinetics 数据集 -* 提出了双流 3D 卷积模型 - -## kinetics数据集 -Kinetics 数据集有 400 个人体动作类别,每个类别有 400 多个视频片段,这些数据来自真实有挑战的 YouTube 视频。数据集包括的动作大类别有: -1. 单人动作:绘画、饮酒、笑 -2. 
人与人之间的动作:拥抱、亲吻、握手 -3. 人与物之间的动作:打开礼物、洗碗、除草 -4. 需要细分的动作,比如不同类型的游泳 -5. 侧重于物体的信息,比如不同类型的乐器 - -## 动机 -图像领域有一个超大规模的 ImageNet 数据集,很多图像任务采用的都是 ImageNet 预训练模型,并且取得了不错了效果。在视频领域中,如果我们有一个超大规模的数据集,将在该数据集上预训练好的的动作分类模型应用到其他时序任务或不同的数据集上是否会有类似性能的提升。为了验证这个猜想,作者将在 Kinetics 上的预训练模型应用到 HMDB-51 和 UCF-101 这种小的数据集上。实验结果表明,性能总是会得到提升,提升的程度与模型的结构有关。 - -基于此发现,作者提出了 I3D,基于 InceptionV1 的 I3D 模型在经过 Kinetics 预训练后,其性能远远超过了当时最先进的水平。 - -## 主要工作 -1. 在 Kinetics 数据集上做模型预训练,将预训练模型应用到 HMDB-51 和 UCF101 数据集上,验证大规模视频数据的有效性; -2. 基于 2D-ConvNet,提出了新的行为识别模型 I3D; - -## 行为识别方法分析 -当前,行为识别模型主要的不同点: -1. 卷积和层运算使用的是 2D 核还是 3D 核; -2. 网络的输入仅仅包含的是 RGB 视频还是也包括预计算的光流; -3. 在 2D-ConvNet 情况下,帧之间的信息如何传播; - -## 模型分析 -作者比较和研究了一些模型,这些模型有的基于 2D-ConvNet,有的基于 3D-ConvNet。之前基于 3D-ConvNet 的模型由于可用的训练数据少,网络结构相对较浅。于是本文将非常深的 2D-ConvNet 图像分类网络膨胀为 3D-ConvNet 的时空特征提取网络,同时将其作为 two-stream 框架的主干网络。由于之前的 2D-ConvNet 网络本身比较深,又可以使用 2D-ConvNet 的参数初始化相应 3D-ConvNet 的网络,因此可以解决之前训练数据不足的问题。 - -这里作者分析五种网络结构,如下图所示。 - -
-（图：网络结构）

    - -### The Old I: ConvNet+LSTM -将图像分类模型应用到视频分析上的一个直接想法是,把视频中的每帧看作一张独立的图像,提取每张图像后,对整个视频求均值。但这样做完全忽略了视频中的时序信息,一个比较合理的方法是在网络的末端添加一个 LSTM 层,用于学习视频帧之间的时序关系。因此 ConvNet+LSTM 的文章在 InceptionV1 的最后一个 average-pooling 后面加了一个包含 512 个隐含节点的 LSTM,最后接了一个用于分类的全连接层。 - -### The Old 2 :3D ConvNets -3D-ConvNets 是建模视频任务一个很自然的想法,与标准的卷积网络相比,增加了一个时空维度。由于时空维度的增加,使得 3D-ConvNets 比 2D-ConvNets 有更多的参数,增加了网络训练的困难。此外,网络结构是 3D 的,无法直接复用 2D 模型的参数。 - -### The Old III: Two-Stream Networks -ConvNet+LSTM 的模型结构仅仅捕获高层信息的变化,对于帧和帧之间在底层动作信息的捕获是不够的,并且底层动作信息在行为识别中是非常重要的。于是一些研究人员提出了 Two-Stream 网络,Two-Stream 分为两路,一路用于提取 RGB 信息,一路用于提取光流信息;这样的网络设计对空间维度和时间维度的提取都比较好。这种方法比较容器训练和测试,并且在公开数据集上取得了比较不错的效果。 - -> Two-Stream 中的两个模型是分开训练的。 - -### The New: Two-Stream Inflated 3D ConvNets -#### 1 inflating 2D ConvNets into 3D -把在 ImageNet 上表现好的 2D 模型直接扩展为 3D 模型,具体做法是将 2D 结构中的二维卷积核与池化核扩展一维,由之前的 变成 。 - -#### Bootstrapping 3D filters from 2D Filters -作者将一帧图像沿着时间轴复制 N 次,将其变为一个 boring video。为了保证在这个 boring video 上做卷积操作后池化激活与原始图像经过卷积操作后的池化激活相同,这里用到的方法是将 2D 卷积核 在时间维度上重复 N 次,得到 ,之后再除以 N 的方式,确保滤波器的响应是相同的。 - -#### Pacing receptive field growth in space,time and network depth -将 2D-ConvNet 扩展到 3D-ConvNet 后,如何设置时间维度上的 kernel。目前几乎所有的图像相关的模型都平等的看待空间维度中的水平和垂直两个方向,两个方向上的 kernel 是相等的。当加入时间维度后,再使用完全对称的感受野并不是最好的选择,应该考虑帧速率和图像尺寸。 -* 如果时间维度比空间维度增长过快,可能会影响物体边缘信息,从而破坏物体的特征检测; -* 如果时间维度比空间维度增长过慢,可能无法很好的捕捉场景的动态信息,从而影响对动作的检测; - -因此,作者对 InceptinV1 进行扩展时,大多数保持了对称特征,如第一个卷积核由 变成了 ,stride 也从原来的 (2,2) 变成了 (2,2,2);只对少数做了改变,如前面两个 max-pool,并不是 ,而是 ,这样能够比较好的保留时间维度的信息,以及最后的 avg-pool 不是 而是 。 - -
-（图：网络扩展）

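上文 "Bootstrapping 3D filters from 2D Filters" 中"沿时间维复制 N 次再除以 N"的做法，可以用下面这段基于 numpy 的最小示意代码表示（仅为示意性草稿，卷积核形状、函数名均为假设，并非原论文或 PaddleVideo 的实现）：

```python
import numpy as np

def inflate_2d_kernel(w2d, time_dim):
    """将 2D 卷积核膨胀为 3D：沿时间维复制 time_dim 次并除以 time_dim，
    使其在 boring video 上的响应与原 2D 卷积保持一致（示意实现）。"""
    # w2d: [C_out, C_in, k, k] -> w3d: [C_out, C_in, time_dim, k, k]
    w3d = np.repeat(w2d[:, :, np.newaxis, :, :], time_dim, axis=2)
    return w3d / time_dim

w2d = np.random.randn(64, 3, 7, 7).astype("float32")  # 假设的 7x7 2D 卷积核
w3d = inflate_2d_kernel(w2d, time_dim=7)
print(w3d.shape)  # (64, 3, 7, 7, 7)
```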
    - - -#### Two 3D Streams -虽然,3D-ConvNet 已经能够比较好的提取视频中的动作特征,但带有光流的 two-stream 结构对动作识别依然有巨大的帮助。因此作者将 3D-ConvNet 设计成 two-stream 形式,训练时左右两个网络分开训练,预测时对两个网络的预测结果做均值。 - -## 实验结果 -在 UCF-101,HMDB-51 或 Kinetics 上进行训练和测试时的分类准确度。 - -
-（图：实验结果1）

    - -从 ImageNet 预训练或没有进行预训练模型在 Kinetics 上的表现。 - -
-（图：实验结果2）

    - - -## 参考 -[Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset](https://arxiv.org/abs/1705.07750) diff --git a/docs/zh-CN/tutorials/SlowFast.md b/docs/zh-CN/tutorials/SlowFast.md deleted file mode 100644 index b80e8d12d..000000000 --- a/docs/zh-CN/tutorials/SlowFast.md +++ /dev/null @@ -1,172 +0,0 @@ -# SlowFast - -## 背景 -SlowFast 由 Facebook FAIR 的何恺明团队提出,用于视频识别。SlowFast 包含两条路径: -* Slow pathway -* Fast pathway - -Slow pathway 运行低帧率,用于捕捉空间语义信息;Fast pathway 运行高帧率,获取精确的时间运动信息。通过降低通道数量,Fast pathway 分支可以变成轻量的网络,同时也能够学到视频中有用的时域信息。SlowFast 在没有任何预训练的情况下,在 Kinetics 数据集上的准确率达到了 79.0%。 - -## 动机 -SlowFast 受到灵长类视觉系统中视网膜神经节细胞的生物学研究的启发。研究发现,这些细胞中约80%的都是P-cell,约15~20% 是 M-cell。M-cell 以较高的时间频率工作,能够对快速的时间变化作出响应,但是对空间细节和颜色不敏感。P-cell 则提供良好的空间细节和颜色信息,但时间分辨率较低,对刺激反应比较慢。 - -SlowFast 与此相似: -* SlowFast 有两条路径,分别处理低帧率和高帧率; -* Fast pathway 用于捕捉快速变化的动作,单涉及到的细节信息较少,与M-cell类似; -* Fast pathway 是轻量的,与M-cell的占比类似。 - -## 简介 -在图像识别领域,对称处理图像 I(x,y) 中两个空间维度 x 和 y 是常见的做法,自然图像的统计也证明了其合理性。这是由于自然图像具有第一近似各向同性(所有方向具有相同的可能性)和平移不变性。但对于视频信号 I(x,y,t)来说,并不是所有的时空方向都有相同的可能性。因此不能像时空卷积那样对称地处理空间和时间。此时需要将网络结构分开,分别处理空间结构和时间事件。 - -视觉内容的类别空间语义变化通常比较缓慢。比如,挥手不会在这个动作进行期间改变“手”的类别;一个人从走路变为跑步,识别结果也一直是“人”。因此类别语义的识别(以及颜色、纹理、光照等)可以以较慢的速度刷新。另一方面,正在执行的动作比其主体识别变化的速度要快得多,如拍手、挥手、摇摆、走路或跳跃。因此需要用较快的帧率刷新(高时间分辨率),来对快速变化的动作进行建模。 - -## 思路 -基于上述想法作者提出了一种用于视频识别的双路径模型 SlowFast 。 - -

-（图：网络结构）

    - - -如上图所示,一条路径用于捕获图像或稀疏帧提供的语义信息,以低帧率运行,刷新速度慢。另一条路径用于捕获快速变化的动作,刷新速度快、时间分辨率高,该路径是轻量级的,仅占整体计算量的20%。这是由于这条路径通道较少,处理空间信息的能力较差,但空间信息可以由第一个路径以简洁的方式来处理。 - -依据两条路径运行的帧率高低不同,作者将第一条路径称为“Slow pathway”;第二条路径称为“Fast pathway”;两条路径通过横向连接进行融合。 - -## SlowFast -SlowFast 的两个分支以不同的速率运行,作者通过使用两个分支模拟生物学上的大小细胞。 - -### Slow Pathway -Slow pathway 可以是任意在视频片段上做时空卷积的模型,如时空残差网络,C3D,I3D,Non-local网络等。Slow pathway 的关键之处在于对视频帧进行采样时,时间步长 较大,也就是说,只处理 帧中的一帧。这里,作者建议 的取值为 16,对于 30fps 的视频,差不多每秒采样 2 帧。如果 Slow pathway 采样的帧数是 T,那么原始视频片段的长度为 。 - -### Fast Pathway -#### 高帧率 -Fast pathway 的目的为了在时间维度上有良好的特征表示,Fast pathway 的时间步长 较小,其中 是 Slow pathway 和 Fast pathway 之间帧率比,作者建议 的取值为 8。由于两条路径在同一个视频上进行操作,因此 Fast pathway 采样到的帧数量为 ,比 Slow pathway 密集 倍。 - -#### 高时间分辨率特征 -Fast pathway 具有高输入分辨率,同时整个网络结构会运行高分辨率特征。在最后的分类全局池化层之前作者没有采用时间下采样层,因此在特征张量在时间维度上一直保持在 帧。 - -#### 低通道容量 -Fast pathway 是一个与 Slow pathway 相似的卷积网络,但通道数只有 Slow pathway 的 倍,其中 ,作者建议 的取值为 。这是的 Fast pathway 比 Slow pathway 的计算更高效。 - -低通道容量可以理解为表示空间语义信息的能力较弱。由于 Fast pathway 的通道数更少,因此 Fast pathway 的空间建模能力应该弱于 Slow pathway。但 SlowFast 的实验结果表明这反而是有利的,它弱化了空间建模能力,却增强了时间建模能力。 - -### 横向连接 -作者通过横向连接对两条路径的信息进行融合,使得 Slow pathway 知道 Fast pathway 在学习什么。作者在两条路径中的每个“阶段”上使用一个横向连接,由于两条路径的时间维度不同,因此在进行横向连接时需要通过变换对两条路径的维度进行匹配。最后,将两条路径的输出进行全局平均池化,并将池化后的特征拼接在一起作为全连接分类器层的输入。 - -### 实例化 -SlowFast 模型的思想是通用的,可以用不同的主干网络来实现。如下图所示是一个 SlowFast 实例化的例子,其中黄色是通道数量,绿色是时序帧分辨率。 - -

-（图：实例化）

    - -作者用表示时空尺度,其中 T 是时间长度,S 是正方形裁剪区域的宽和高。 - -#### Slow Pathway -Slow pathway 是一个具有时间步长的 3D ResNet,网络时间维度的输入帧数 T = 4,从 64 帧视频片段中稀疏采样得到,时间步长 。作者没有采用时间下采样在实例化中,由于当输入步长较大时,这样做是有害的。 - -Slow pathway 与 C3D/I3D 模型不同,从 conv_1 到 res_3 的滤波器本质上都是2D卷积核,只有 res_4 和 res_5 使用的是非退化时间卷积。之所以采用这种设计是由于作者通过实验发现,在早期层使用时间卷积会降低准确率。作者认为是由于当物体快速移动且时间步长较大时,在一个时间感受野内的相关性就很小,除非空间感受野也足够地大。 - -#### Fast Pathway -Fast pathway 的时间分辨率较高,通道容量较低。Fast pathway 的每个模块中都使用了非退化时间的卷积,并且没有使用时间下采样层。之所以这样设计是因为作者发现 Fast pathway 的时间卷积有很好的时间分辨率,可以捕捉细节动作。 - -#### 横向连接 -横向连接是从 Fast pathway 到 Slow pathway,在融合之前需要保证两个维度是匹配的,Slow pathway 的特征维度是 ,Fast pathway 的特征维度是 ,在连接方案上作者进行了如下实验: -* Time-to-channel:对进行变形和转置,得到 ,也就是说将所有的 帧放入一帧的多个通道内。 -* Time-strided sampling:每 帧,采样一帧,所以 就变成了 。 -* Time-strided convolution:使用 3D 卷积,卷积核大小是 ,输出通道数为 ,步长为。 - -## PaddleVideo -PaddleVideo 关于采样的核心代码 -```python -class PackOutput(object): - """ - In slowfast model, we want to get slow pathway from fast pathway based on - alpha factor. - Args: - alpha(int): temporal length of fast/slow - """ - def __init__(self, alpha): - self.alpha = alpha - - def __call__(self, results): - fast_pathway = results['imgs'] - - # sample num points between start and end - slow_idx_start = 0 - slow_idx_end = fast_pathway.shape[0] - 1 - slow_idx_num = fast_pathway.shape[0] // self.alpha # slow 的采样数量 - # 在区间[slow_idx_start, slow_idx_end] 内均匀采样 - slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end, - slow_idx_num).astype("int64") - slow_pathway = fast_pathway[slow_idxs_select] # 取出采样到的图片 - - # T H W C -> C T H W. - slow_pathway = slow_pathway.transpose(3, 0, 1, 2) # 对维度做转换 - fast_pathway = fast_pathway.transpose(3, 0, 1, 2) - - # slow + fast - frames_list = [slow_pathway, fast_pathway] - results['imgs'] = frames_list - return results -``` - -PaddleVideo 中关于特征融合的核心代码 -```python -class FuseFastToSlow(paddle.nn.Layer): - """ - Fuses the information from the Fast pathway to the Slow pathway. Given the - tensors from Slow pathway and Fast pathway, fuse information from Fast to - Slow, then return the fused tensors from Slow and Fast pathway in order. - """ - def __init__(self, - dim_in, - fusion_conv_channel_ratio, - fusion_kernel, - alpha, - eps=1e-5, - norm_module=paddle.nn.BatchNorm3D): - """ - Args: - dim_in (int): the channel dimension of the input. - fusion_conv_channel_ratio (int): channel ratio for the convolution - used to fuse from Fast pathway to Slow pathway. - fusion_kernel (int): kernel size of the convolution used to fuse - from Fast pathway to Slow pathway. - alpha (int): the frame rate ratio between the Fast and Slow pathway. - eps (float): epsilon for batch norm. 
- """ - super(FuseFastToSlow, self).__init__() - fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1) - initializer_tmp = get_conv_init(fan) - - self._conv_f2s = paddle.nn.Conv3D( - in_channels=dim_in, - out_channels=dim_in * fusion_conv_channel_ratio, - kernel_size=[fusion_kernel, 1, 1], - stride=[alpha, 1, 1], - padding=[fusion_kernel // 2, 0, 0], - weight_attr=paddle.ParamAttr(initializer=initializer_tmp), - bias_attr=False) - self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio, - epsilon=eps, - weight_attr=get_bn_param_attr(), - bias_attr=get_bn_param_attr(bn_weight=0.0)) - - def forward(self, x): - x_s = x[0] - x_f = x[1] - fuse = self._conv_f2s(x_f) - fuse = self._bn(fuse) - fuse = F.relu(fuse) - x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None) - - return [x_s_fuse, x_f] -``` - - - -## 参考 -[SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982) diff --git a/docs/zh-CN/tutorials/TSM.md b/docs/zh-CN/tutorials/TSM.md deleted file mode 100644 index cae140867..000000000 --- a/docs/zh-CN/tutorials/TSM.md +++ /dev/null @@ -1,75 +0,0 @@ -# TSM模型原理及PaddleVideo实践 -# 1. 背景与动机 -目前互联网视频数据日益增多,用户观看短视频、小视频的时长也迅速增长,如何对海量的视频资源快速准确地分析、处理、归类是一个亟待解决的问题。视频理解技术可以多维度解析视频内容,理解视频语义,自动分类打标签,极大节省人工审核效率,节约成本;同时实现精准用户推荐,提升体验效果。 -本文将给大家介绍视频理解领域的经典模型**TSM (Temporal Shift Module)**, TSM是由**MIT**和**IBM Watson AI Lab**的`Ji Lin,Chuang Gan和SongHan`等人提出的通过时间位移模拟3D建模达到效果和性能的平衡,提高视频理解能力的模块。 - - - -跟TSM最相关的视频理解模型当属Limin Wang等人在ECCV2016上发表的Temporal Segment Network (TSN)了。TSN模型从视频中采样N帧图像并通过最简单直接地对N帧图像分类结果进行平均的方式进行时序信息融合,取得了当时State-of-the-art的性能,并得到大规模的应用。考虑到TSN模型对时序信息的建模不够充分,以I3D,S3D, P3D等为代表的一系列工作通过3D卷积进行端到端联合时空建模,这一系列工作尽管能捕获时空特征,但是相比TSN,由2D卷积到3D卷积不可避免地引入了额外计算量。TSM巧妙的通过时间维度特征map移位的想法,理论上用零额外计算开销达到了不同帧之间特征融合联合建模的目的。 - -论文传送门: [Temporal Shift Module for Efficient VideoUnderstanding](https://arxiv.org/pdf/1811.08383v2.pdf) - -先看一下下图的例子:如果图片分别从左往右播放和从右往左播放,测试者会给出不同但是正确的理解结果,说明对视频的理解强依赖于视频的时序关系,你猜对了!这就是TSM提出的动机,即捕捉视频的时间信息。 -


    -看起来好像很有意思,我们下面继续深入解析一下TSM的核心模块。 - -# 2. TSM关键技术介绍 - -在传统的图片分析的基础上,视频分析需要研究者补充关于时间信息(temporal information)的建模结构。目前,2D CNN和3D CNN是视频理解中最常用的两个方法:使用2D CNN 模型运算量少但会丧失部分时间信息;而使用3D CNN虽然效果好但运算量极大。面对这样的情况,MIT和IBM Watson AI Lab的Ji Lin,Chuang Gan和Song Han等人提出了Temporal Shift Module (TSM)模块。他们将时间位移模块嵌入2D CNN,从而可以在不添加任何额外的计算量和参数的情况下,轻松地达到与3D CNN效果相当的视频理解能力。 -


    -上图中矩阵的行和列分别表示特征图中的temporal和channel维度。在TSM模块中,将一部分的channel在temporal维度上向前位移一步,一部分的channel在temporal维度上向后位移一步,位移后的空缺补零。通过这种方式在特征图中引入temporal维度上的上下文交互,通过通道移动操作可以使得在当前帧中包含了前后两帧的通道信息,这样再进行2D卷积操作就能像3D卷积一样直接提取视频的时空信息, -提高了模型在时间维度上的建模能力。在此基础上,研发人员将模块进一步细分为适合在线视频使用的TSM模块和适合离线视频使用的TSM模块。 -


    - -双向(bi-direction)的TSM模块可获取过去和未来的时空信息,适合高吞吐量的离线视频使用;而单向(uni-direction)的TSM模块仅可比对现在和过去的时空信息,适用于低延迟在线视频的识别。 -此外,论文中作者还考虑了TSM模块插入的位置,对比了两种TSM插入方式:**Residual tsm** 和 **In-place tsm**,作者发现使用**Residual tsm**方式会比 **In-place tsm** 的方式效果更好,文中作者解释为**In-place tsm** 会影响模型对空间信息的提取。 -


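为直观区分这两种插入方式，这里给出一段示意性的 Python 伪代码（shift、conv_branch 均为假设的可调用对象，仅表达结构差异，并非论文或 PaddleVideo 的实现）：

```python
def inplace_tsm_block(x, shift, conv_branch):
    # In-place TSM：直接在主干特征上做时间位移，会损失一部分空间信息
    x = shift(x)
    return x + conv_branch(x)


def residual_tsm_block(x, shift, conv_branch):
    # Residual TSM：只在残差分支内做位移，主干特征 x 保持原样
    return x + conv_branch(shift(x))
```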
    - - -好了,TSM模块基本原理搞清楚了是不是**So Easy !!!**,接下来问题来了,代码该如何实现呢? - -# 3. 关键代码解析 - -原理搞清楚了,下面来看看代码如何实现,首先我们来看看torch版本如何实现的,呃呃呃...,不好意思torch框架并未提供TSM的API,我们只能自己动手啦,具体实现代码如下图所示: -


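上文提到的实现代码在原文中以截图形式给出，图片内容无法恢复。这里补充一段示意性的 numpy 实现，展示"部分通道前移一帧、部分通道后移一帧、空缺补零"的核心思路（仅作示意，fold_div 等参数为假设值，并非原论文代码）：

```python
import numpy as np

def temporal_shift(x, num_seg, fold_div=8):
    """x: [N * num_seg, C, H, W]。前 1/fold_div 的通道沿时间维前移一帧，
    随后 1/fold_div 的通道后移一帧，其余通道保持不变，空出的位置补零。"""
    nt, c, h, w = x.shape
    x = x.reshape(nt // num_seg, num_seg, c, h, w)         # [N, T, C, H, W]
    fold = c // fold_div
    out = np.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                   # 前移一帧：当前帧获得下一帧的通道信息
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]   # 后移一帧：当前帧获得上一帧的通道信息
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # 其余通道不动
    return out.reshape(nt, c, h, w)

x = np.random.randn(16, 64, 56, 56).astype("float32")     # 2 段视频、每段 8 帧的示例输入
print(temporal_shift(x, num_seg=8).shape)                  # (16, 64, 56, 56)
```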
    - -这意味着你只需要在TSN的代码基础上添加4行代码就能将准确率在Something-Something这样的数据集上**翻上一倍!!!** 是不是简单高效的模型 ?不得不向大佬低头! - -But..., - - -飞桨框架充分考虑到广大用户的需求已经为各位童鞋实现了TSM的OP -


    - -所以各位童鞋再也不用自己实现了,**直接调用就可以啦!!!,直接调用就可以啦!!!,直接调用就可以啦!!!**,重要的事情讲三遍, - -是不是以为事情到这里就结束啦 ? 唉! **Too young Too simple !!!** - -我们在此基础上还对其进行了性能优化,在降低显存消耗的同时,可以提速5倍以上,详细信息可以参考[加速文档](./accelerate.md) - -下面我们来看看使用飞桨如何实现TSM: - -`import paddle.nn.functional as F` - - -`shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)` - -两行代码就可以实现TSM了,是不是很简单? - -# Reference -[1] [Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018](https://arxiv.org/pdf/1811.08383v2.pdf). - - -[2] [Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoo Tang,and Luc Van Gool. Temporal segment networks for action recognition in videos? In Proceedings of the European Conference on Computer Vision,pages 20–36. Springer, 2016](https://arxiv.org/abs/1608.00859). diff --git a/docs/zh-CN/tutorials/TSN.md b/docs/zh-CN/tutorials/TSN.md deleted file mode 100644 index 7e6804821..000000000 --- a/docs/zh-CN/tutorials/TSN.md +++ /dev/null @@ -1,140 +0,0 @@ -# TSN - -## 背景 -TSN 可以看作是对 two-stream 的改进,通过设计有效的卷积网络体系结构 TSN 解决视频动作分类中的两个主要问题: -* 长距离时序依赖问题(有些动作在视频中持续的时间较长); -* 解决数据量较少的问题; - -## 贡献 -TSN 的贡献可概括为以下两点: -* TSN 模型基于 long-range temporal structure 建模,结合了 sparse temporal sampling strategy 和 video-level supervision 从而保证对整段视频学习的有效性和高效性; -* 提出了一系列最佳实践方案; - -## 原理 -由于 two-stream 网络处理的是单帧图像(空间网络)或者短片段中的一堆帧图像(时序网络),因此 two-stream 网络无法满足时间跨度较长的视频动作。为了能够处理长范围时序结构的情况,可以使用密集帧采样方式从视频中获取长时间信息,但这样会增加时间成本同时采样到的连续帧之间存在冗余。于是在 TSN 模型中作者使用稀疏采用的方式来替代密集采样,降低计算量的同时一定程度上也去除了冗余信息。 - -TSN 采用和 two-stream 相似的结构,网络由空间流卷积网络和时间流卷积组成。TSN 使用稀疏采样的方式从整段视频采出一系列的短片段,其中每个片段都会有一个对自身动作类别的初步预测,之后通过对这些片段的预测结果进行“融合”得出对整个视频的预测结果。 - -## 网络结构 -如下图所示,一个视频被分为 ![formula](https://render.githubusercontent.com/render/math?math=K) 段( segment );之后对每个段使用稀疏采样的方式采出一个片段( snippet );然后使用“段共识函数”对不同片段的预测结果进行融合生成“段共识”,此时完成了一个视频级的预测;最后对所有模式的预测结果进行融合生成最终的预测结果。 - - -


    - -> 这里注意 segment 和 snippet 的区别 - -TSN 采用与 two-stream 类似的结构,使用空间网络操作一帧 RGB 图像,时序卷积网络操作连续的光流图像。但由于更深的网络结构能够提升对物体的识别能力,因此 TSN 中作者采用 BN-Inception 构建网络。 - -## 损失函数 - -给定一段视频 ![formula](https://render.githubusercontent.com/render/math?math=V),按相等间隔分为 ![formula](https://render.githubusercontent.com/render/math?math=K) 段 ![formula](https://render.githubusercontent.com/render/math?math={S_1,S_2,...,S_K})。 TSN 对一系列片段的建模如下: - - - -其中, 表示片段序列,从每个段 ![formula](https://render.githubusercontent.com/render/math?math=S_k) 中随机采样获取对应的片段 ![formula](https://render.githubusercontent.com/render/math?math=T_k); 表示作用于短片段 ![formula](https://render.githubusercontent.com/render/math?math=T_k) 的卷积网络,![formula](https://render.githubusercontent.com/render/math?math=W) 为网络的参数,返回值为 ![formula](https://render.githubusercontent.com/render/math?math=T_k) 相对于所有类别的得分;段共识函数 ![formula](https://render.githubusercontent.com/render/math?math=G) 用于融合所有片段的预测结果。预测函数 ![formula](https://render.githubusercontent.com/render/math?math=H)用于预测整段视频属于每个动作类别的概率,它的输入为段共识函数 ![formula](https://render.githubusercontent.com/render/math?math=G) 的结果。 - -最后,采用标准分类交叉熵计算部分共识的损失: - - - - -其中,![formula](https://render.githubusercontent.com/render/math?math=C) 是类别总数;![formula](https://render.githubusercontent.com/render/math?math=y_i) 是类别 ![formula](https://render.githubusercontent.com/render/math?math=i) 的 ![formula](https://render.githubusercontent.com/render/math?math=groundtruth);论文中段的数量 ![formula](https://render.githubusercontent.com/render/math?math=K) 设置为 ![formula](https://render.githubusercontent.com/render/math?math=3);共识函数 ![formula](https://render.githubusercontent.com/render/math?math=G) 采用取均值的方式,从所有片段的相同类别中推断出某个类别得分 ![formula](https://render.githubusercontent.com/render/math?math=G_i)。 - -## 模型输入 -对于图像任务而言,只能够使用图像本身提取特征。但对视频来说,除了每一帧图像外,还有视频中的光流信息。为了探索更多输入形式对模型效果影响,TSN 模型在空间卷积网络中除了使用单一 RGB 图像外,还使用了 RGB difference;在时序卷积网络中除了将连续的光流场作为输入外还采用了扭曲的光流场。 - -


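上文"损失函数"一节中的公式在原文里是图片，提取后已丢失。结合上下文描述与 TSN 原论文，段共识建模与交叉熵损失大致可重写为如下形式（人工重构，仅供参考，并非原文逐字内容）：

```latex
\mathrm{TSN}(T_1, T_2, \ldots, T_K)
    = \mathcal{H}\big(\mathcal{G}(\mathcal{F}(T_1; \mathbf{W}),
      \mathcal{F}(T_2; \mathbf{W}), \ldots, \mathcal{F}(T_K; \mathbf{W}))\big)

\mathcal{L}(y, \mathbf{G})
    = -\sum_{i=1}^{C} y_i \Big( G_i - \log \sum_{j=1}^{C} \exp G_j \Big)
```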
    - -单一 RGB 图像只能表示静态信息,缺少上下文信息。但连续两帧之间的差异能够表示动作的改变,因此作者尝试将 RGB difference 作为模型的一种输入。 - -TSN 将光流场作为输入捕获运动信息;将扭曲光流场作为输入抑制背景运动,使得专注于视频中的人物运动。 - -## 训练 -由于数据集较小,为了避免过拟合,作者提出了一系列的训练策略。 - -### 数据增强 -通过数据增强可生成额外的训练样本,一定程度上能够避免模型的过拟合。two-stream 中采用的数据增强方式有随机裁剪和水平翻转,在 TSN 中作者新增了两种数据增强方法: -* 角裁剪:仅从图片的边角或中心提取区域,避免默认关注图片的中心; -* 尺度抖动:将输入图像或者光流场的大小固定为 ,裁剪区域的宽和高随机从 ![formula](https://render.githubusercontent.com/render/math?math={256,224,192,168}) 中选择。最终,裁剪区域将被 用于网络训练。 - -### 交叉预训练 -由于空间网络以 RGB 图片作为输入,因此作者在空间网络上直接使用 ImageNet 预训练模型初始化网络的参数。对于以 RGB difference 和光流作为输入的模型,作者提出了交叉预训练技术,使用 RGB 预训练模型初始化时序网络。首先,通过线性变换将光流场离散到从 0 到 255 的区间,使得光流场和 RGB 的取值范围相同;之后修改 RGB 模型的第一个卷积层,对 RGB 通道上的权重进行取均值操作;然后依据时序网络的输入通道数复制 RGB 均值。该策略能够有效的避免时序网络出现过拟合现象。 - -### 正则化技术 -由于光流分布和 RGB 分布不同,因此除了第一个 BN 层,其余 BN 层的参数都被固定。此外,为了进一步降低过拟合产生的影响,作者在 BN-Inception 的全局 pooling 层后添加一个额外的 dropout 层,其中空间卷积网络的 dropout 比例设置为 0.8;时序卷积网络的 dropout 比例设置为 0.7。 - -## 数据集 -模型在 HMDB51 和 UCF101 两个主流的动作识别数据集上进行。其中,HMDB51 数据集包含 51 个动作分类的 6766 个视频剪辑;UCF101 数据集包含 13320 个视频剪辑,共 101 类动作。 - -## 实现细节 -* 基于动量的小批量随机梯度下降算法,momentum 设置为 0.9; -* batch size 为 256; -* 使用 ImageNet 预训练模型对网络权重进行初始化; -* learning rate 调整,对于空间网络,初始化为 0.01,并且每 2000 次迭代后降变为原来的 0.1 倍,训练过程共迭代 4500 次;对于时序网络,初始化为 0.005,并且在第 12000 和 18000 次迭代之后降为原来的 0.1 倍,训练过程共迭代 20000 次; -* 使用 TVL1 光流算法来提取正常光流场和扭曲光流场。 -* 8 块 TITANX GPUs - -## PaddleVideo -为了加快 TSN 模型的推理速度,PaddleVideo 去掉了与 RGB difference、光流以及扭曲光流相关的部分。 - -PaddleVideo 中实现稀疏采样的关键代码: -```python -frames_len = results['frames_len'] # 视频中总的帧数 -average_dur = int(int(frames_len) / self.num_seg) # 每段中视频的数量 -frames_idx = [] # 存放采样到的索引 -for i in range(self.num_seg): - idx = 0 # 采样的起始位置 - if not self.valid_mode: - # 如果训练 - if average_dur >= self.seg_len: - idx = random.randint(0, average_dur - self.seg_len) - idx += i * average_dur - elif average_dur >= 1: - idx += i * average_dur - else: - idx = i - else: - # 如果测试 - if average_dur >= self.seg_len: - idx = (average_dur - 1) // 2 - idx += i * average_dur - elif average_dur >= 1: - idx += i * average_dur - else: - idx = i - # 从采样位置采连续的帧 - for jj in range(idx, idx + self.seg_len): - if results['format'] == 'video': - frames_idx.append(int(jj % frames_len)) - elif results['format'] == 'frame': - frames_idx.append(jj + 1) - else: - raise NotImplementedError -``` - -PaddleVideo 中实现“段共识”的核心代码: -``` -# [N * num_segs, in_channels, 7, 7] -x = self.avgpool2d(x) -# [N * num_segs, in_channels, 1, 1] -if self.dropout is not None: - x = self.dropout(x) -# [N * num_seg, in_channels, 1, 1] -x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) -# [N, num_seg, in_channels] -x = paddle.mean(x, axis=1) -# [N, 1, in_channels] -x = paddle.reshape(x, shape=[-1, self.in_channels]) -# [N, in_channels] -score = self.fc(x) -``` - -## 广告时间 -如果文档对您理解 TSN 模型有帮助,欢迎👍star🌟,👏fork,您的支持是我们前进的动力⛽️。 - -## 参考 -[Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859) diff --git a/docs/zh-CN/tutorials/accelerate.md b/docs/zh-CN/tutorials/accelerate.md deleted file mode 100644 index fcf7a655e..000000000 --- a/docs/zh-CN/tutorials/accelerate.md +++ /dev/null @@ -1,242 +0,0 @@ -简体中文 | [English](../../en/tutorials/accelerate.md) - -- [简介](#简介) -- [模型运算加速](#模型运算加速) -- [数据读取加速](#数据读取加速) -- [训练策略加速](#训练策略加速) -- [分布式训练](#分布式训练) - - -# 简介 - -视频任务相比于图像任务的训练往往更加耗时,其原因主要有两点: -- 数据:视频解码耗时。mp4/mkv等视频文件都是经过encode后的压缩文件,通过需要经过解码和抽帧步骤才能得到原始的图像数据流,之后经过图像变换/增强操作才能将其喂入网络进行训练。如果视频帧数多,解码过程极其耗时。 -- 模型:视频任务使用的模型通常有更大的参数量与计算量。为学习时序特征,视频模型一般会使用3D卷积核/(2+1)D/双流网络,这都会使得模型的参数量与计算量大大增加。 - -本教程介绍如下视频模型训练加速方法: - -- 
模型上,通过op融合或混合精度训练的方式提升op运算效率 -- 数据上,通过多进程或者并行计算的方式加速数据读取速度 -- 训练策略上,通过multigrid策略减少训练耗时 -- 多机分布式减少训练耗时 - -以上训练加速方法都已经集成进PaddleVideo中,欢迎试用~ - -如非特别说明,本教程所有实验的测试环境如下: -``` -GPU: v100,4卡*16G -CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz -PaddlePaddle: 2.0.0-rc1 -Cuda: 10.2 -``` - - -# 模型运算加速 - -- [OP融合](##OP融合) -- [混合精度训练](##混合精度训练) - -## OP融合 - -针对[TSM模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsm.md),我们实现了[temporal shift op](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/fluid/layers/temporal_shift_cn.html#temporal-shift),在节省显存的同时加速训练过程。 - -测试方法: -使用不同形状的Tensor,以不同的方式实现temporal shift,记录显存占用和运行时间。 - -测试代码: - -- temporal shift op实现方式 -```python -import time -import numpy as np -import paddle -import paddle.nn.functional as F - -SHAPE = [32, 16, 32, 32] -#SHAPE = [128, 64, 128, 128] - -otl = [] -input = paddle.randn(SHAPE) -for i in range(10000): - t1 = time.time() - out1 = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) - t2 = time.time() - ot = t2 - t1 - if i > 1000: - otl.append(ot) -print("op time: ", sum(otl)/len(otl)) -``` - -- 组合op实现方式 -```python -import time -import numpy as np -import paddle -import paddle.nn.functional as F - -SHAPE = [32, 16, 32, 32] -#SHAPE = [128, 64, 128, 128] - -def temporal_shift(x, seg_num, shift_ratio): - shape = x.shape #[N*T, C, H, W] - reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W] - pad_x = paddle.fluid.layers.pad(reshape_x, [0,0,1,1,0,0,0,0,0,0,]) #[N, T+2, C, H, W] - c1 = int(shape[1] * shift_ratio) - c2 = int(shape[1] * 2 * shift_ratio) - slice1 = pad_x[:, :seg_num, :c1, :, :] - slice2 = pad_x[:, 2:seg_num+2, c1:c2, :, :] - slice3 = pad_x[:, 1:seg_num+1, c2:, :, :] - concat_x = paddle.concat([slice1, slice2, slice3], axis=2) #[N, T, C, H, W] - return concat_x.reshape(shape) - -ctl = [] -input = paddle.randn(SHAPE) -for i in range(10000): - t2 = time.time() - out2 = temporal_shift(x=input, seg_num=2, shift_ratio=0.2) - t3 = time.time() - ct = t3 - t2 - if i > 1000: - ctl.append(ct) -print("combine time: ", sum(ctl)/len(ctl)) -``` - -性能数据如下: - -| 输入tensor形状 | 实现方式 | 显存占用/M| 计算时间/s | 加速比 | -| :------ | :-----: | :------: | :------: | :------: | -| 32\*16\*32\*32 |op组合方式 | 1074 | 0.00029325 | baseline | -| 32\*16\*32\*32 | temporal shift op | 1058 | 0.000045770 | **6.4x** | -| 128\*64\*128\*128 |op组合方式 | 5160 | 0.0099088 | baseline | -| 128\*64\*128\*128 | temporal shift op | 2588 | 0.0018617 | **5.3x** | - - - -## 混合精度训练 - -Comming soon~ - -# 数据读取加速 - -- [更优的解码库Decord](##更优的解码库Decord) -- [多进程加速Dataloader](##多进程加速Dataloader) -- [数据预处理DALI](##数据预处理DALI) -- [预先解码存成图像](##预先解码存成图像) - -对于单机训练,视频模型的训练瓶颈大多是在数据预处理上,因此本节主要介绍在数据处理上的一些加速经验。 - -## 更优的解码库Decord - -视频在喂入网络之前,需要经过一系列的数据预处理操作得到数据流,这些操作通常包括: - -- 解码: 将视频文件解码成数据流 -- 抽帧: 从视频中抽取部分帧用于网络训练 -- 数据增强:缩放、裁剪、随机翻转、正则化 - -其中解码是最为耗时的。相较于传统的opencv或pyAV解码库,这里推荐使用性能更优的解码库[decord](https://github.com/dmlc/decord)。目前[SlowFast模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/slowfast.md)使用decord进行视频解码([源码](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/loader/pipelines/decode_sampler.py)),对单进程的速度提升有较大作用。 - -我们分别以opencv/decord为解码器,实现SlowFast模型数据预处理pipeline,然后随机从kinetics-400数据集中选取200条视频,计算各pipeline处理每条视频的平均时间。 - -性能测试数据如下: - -| 解码库 | 版本 | pipeline处理每条视频的平均时间/s | 加速比 | -| :------ | :-----: | :------: | :------: | -| opencv | 4.2.0 | 0.20965035 | baseline | -| decord | 0.4.2 | 0.13788146 | **1.52x** | - - -## 多进程加速Dataloader - 
-数据准备好后喂入网络进行训练,网络运算使用GPU并行加速相对较快。对于单个进程来说,速度瓶颈大多在数据处理部分,GPU大部分时间是在等待CPU完成数据预处理。 -飞桨2.0使用[Dataloader](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/io/DataLoader_cn.html#dataloader)进行数据加载,DataLoader支持单进程和多进程的数据加载方式,当 num_workers 大于0时,将使用多进程方式异步加载数据。多进程加速协作,可以overlap掉GPU大部分等待的时间,提升GPU利用率,显著加速训练过程。 - -我们分别设置num_workers为0或4,单卡batch_size统一设置为8,统计训练一个batch的平均耗时。 - -性能测试数据对比如下: -| 卡数 | 单卡num_workers | batch_cost/s | ips | 加速比 | -| :------ | :-----: | :------: |:------: |:------: | -| 单卡 | 0 | 1.763 | 4.53887 | 单卡baseline | -| 单卡 | 4 | 0.578 | 13.83729 | **3.04x** | -| 4卡 | 0 | 1.866 | 4.28733 | 多卡baseline | -| 4卡 | 4 | 0.615 | 13.00625 | **3.03x** | - -其中ips = batch_size/batch_cost,即为训练一个instance(一个video)的平均耗时。 - -**结合使用decord和飞桨dataloader,加上在数据增强部分做一些细节优化,SlowFast模型训练速度增益为100%,详细数据可以参考[benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/benchmark.md)**。 - -## 数据预处理DALI - -既然GPU等待CPU进行数据处理耗时,能否把数据处理放到GPU上呢?[NVIDIA DALI](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/)将数据预处理pipeline转移到GPU上执行,可以显著提升训练速度。针对视频文件,DALI提供`VideoReader`op进行解码抽帧操作,但目前其仅支持连续采样的方式进行抽帧。而视频领域常用的2D模型TSN或TSM,它们均采用分段采样方式,即把视频均匀分成N段segument,然后在每个segument内随机选取一帧,最后把选取的帧组合作为输入张量。为此,我们基于DALI进行了二次开发,实现了支持分段采样方式的`VideoReader`op。为方便用户使用,我们提供了配置好的docker运行环境,具体使用方法参考[TSN-DALI使用教程](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn_dali.md)。 - -测试环境: -``` -机器: Tesla v100 -显存: 4卡16G -Cuda: 9.0 -单卡batch_size: 32 -``` - -性能测试数据如下: - -| 加速方式 | batch耗时/s | reader耗时/s | ips:instance/sec | 加速比 | -| :--------------- | :--------: | :------------: | :------------: | :------------: | -| DALI | 2.083 | 1.804 | 15.36597 | **1.41x** | -| Dataloader: 单卡num_workers=4 | 2.943 | 2.649 | 10.87460| baseline | -| pytorch实现 | TODO | TODO | TODO | TODO | - - -## 预先解码存成图像 - -这是一种简单直接的方法,既然视频解码耗时,那可以事先将视频解码好,存成图片,模型训练时直接读取图像即可。这种方法可以显著提升视频模型训练速度,但它也有一个很明显的缺点,就是需要耗费大量的内存空间。以kinetics-400数据集为例,共包含24万个训练样本,mp4文件约130G,解码存成图像后,占用的内存空间约为2T,所以这种方法比较适用于较小规模的数据集,如ucf-101。PaddleVideo提供了[预先解码](https://github.com/PaddlePaddle/PaddleVideo/blob/main/data/ucf101/extract_rawframes.py)的脚本,并且[TSN模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsn.md)和[TSM模型](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/zh-CN/model_zoo/recognition/tsm.md)均支持直接使用frame格式的数据进行训练,详细实现参考[源码](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/loader/dataset/frame.py)。 - - -测试方法: 数据集选用UCF-101,模型为ppTSM,模型参数参考默认配置[pptsm.yaml](https://github.com/PaddlePaddle/PaddleVideo/blob/main/configs/recognition/tsm/pptsm.yaml),Dataloader的num_workers参数设为0,分别以video和frame格式作为输入,单卡训练,性能数据如下: - -| 数据格式 | batch耗时/s | reader耗时/s | ips:instance/sec | reader加速比 | 加速比 | -| :--------------- | :--------: | :------------: | :------------: | :------------: | :------------: | -| frame | 1.008 | 0.591 | 15.87405 | 4.79x | **3.22x** | -| video | 3.249 | 2.832 | 4.92392| baseline | baseline | - - -# 训练策略加速 - -前述方法大多从工程的角度思考训练速度的提升,在算法策略上,FAIR在CVPR 2020中提出了[Multigrid加速策略算法](https://arxiv.org/abs/1912.00998),它的基本思想如下: - -在图像分类任务中,若经过预处理后图像的高度和宽度分别为H和W,batch_size为N,则网络输入batch的Tensor形状为`[N, C, H, W]`,其中C等于3,指RGB三个通道。 -对应到视频任务,由于增加了时序通道,输入batch的Tensor形状为`[N, C, T, H, W]`。 -传统的训练策略中,每个batch的输入Tensor形状都是固定的,即都是`[N, C, T, H, W]`。若以高分辨的图像作为输入,即设置较大的`[T, H, W]`,则模型精度会高一些,但训练会更慢;若以低分辨的图像作为输入,即设置较小的`[T, H, W]`,则可以使用更大的batch size,训练更快,但模型精度会降低。在一个epoch中,能否让不同batch的输入Tensor的形状动态变化,既能提升训练速度,又能保证模型精度? 
- -基于以上思想,FAIR在实验的基础上提出了Multigrid训练策略: 固定`N*C*T*H*W`的值,降低`T*H*W`时增大`N`的值,增大`T*H*W`时减小`N`的值。具体包含两种策略: - -- Long cycle: 设完整训练需要N个epoch,将整个训练过程分4个阶段,每个阶段对应的输入tensor形状为: -``` -[8N, T/4, H/sqrt(2), W/sqrt(2)], [4N, T/2, H/sqrt(2), W/sqrt(2)], [2N, T/2, H, W], [N, T, H, W] -``` - -- Short cycle: 在Long cycle的基础上,Short-cycle让每个iter的输入Tensor形状都会发生变化,变化策略为: -``` -[H/2, W/2], [H/sqrt(2), W/sqrt(2)], [H, W] -``` - -我们基于飞桨实现了Multigrid训练加速策略,对SlowFast模型训练进行加速,使用方法请参考文档[SlowFast训练加速](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md#%E8%AE%AD%E7%BB%83%E5%8A%A0%E9%80%9F)。 - -测试环境: -``` -机器: Tesla v100 -显存: 8卡32G -Cuda: 9.0 -单卡batch_size: 8 -数据集: Kinetics-400 -Paddle版本: 2.0-rc0 -``` - -性能数据如下: - -| 训练策略 | 单个epoch平均耗时/min | 训练总时间/min | 加速比 | -| :------ | :-----: | :------: |:------: | -| Multigrid | 27.25 | 9758(6.7天) | 2.89x | -| Normal | 78.76 | 15438(10.7天) | base | - -# 分布式训练 - -Comming soon~ diff --git a/docs/zh-CN/tutorials/deployment.md b/docs/zh-CN/tutorials/deployment.md deleted file mode 100644 index 28084055a..000000000 --- a/docs/zh-CN/tutorials/deployment.md +++ /dev/null @@ -1,58 +0,0 @@ -简体中文 | [English](../../en/tutorials/deployment.md) - -# 推理 - -## 如何导出一个用于预测的模型? - -为了之后的模型预测和部署,我们需要导出模型结构,模型参数,这里应用了PaddlePaddle最新的动转静能力 -执行脚本 ```tools.export_model.py``` -```python -python3.7 tools/export_model.py -c 配置文件 -o 输出地址 -p 权重文件 -``` - -`export_model.py` 中,首先会重新build一个网络,这里注意,有些用于预测的模型初始化参数可能和训练时不一致,请注意更改。 -`export_model.py` 添加了针对TSM的`num_seg`等参数,会用to_static动转静,并调用jit.save来保存预测模型,注意:这里的inputspec需要指定一个`假` 输入来运行网路。 - -具体原理请参考 [动转静](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/04_dygraph_to_static/index_cn.html) 官方文档。 - -## 如何检查保存的预测模型正确性? - -这里我们提供了```tools/test_export_model.py```脚本用于检查预测模型的正确性。 - -```python -python3 tools/test_export_model.py -p 权重文件 -i 导出的模型文件夹地址 -c 配置文件 -``` - -`test_export_model.py`只是打印了输出的shape信息,可根据实际需求进行更改,完整的测试流程应该包含下一步:使用预测引擎进行推理 - -## 如何使用预测引擎进行推理? - -这里我们提供了```tools/predict.py``` 进行模型推理。 - -```python - python3.7 tools/predict.py -v example.avi --model_file "./inference/example.pdmodel" --params_file "./inference/example.pdiparams" --enable_benchmark=False --model="example" --num_seg=8 - ``` - - 对example.avi进行预测并返回预测结果 - - ## 如何测试推理速度 - 我们提供了统一的测试脚本 - - ```python - python3.7 tools/predict.py --enable_benchmark=True --model_file=模型文件 --params_file=参数文件 - ``` - - ## 如何使用服务器端C++推理? - - coming soon - - # 部署 - - ## 如何使用PaddleHub Serving进行部署? - - coming soon - - ## 如何使用PaddleLite进行端上部署? - - coming soon - diff --git a/docs/zh-CN/tutorials/pp-tsm.md b/docs/zh-CN/tutorials/pp-tsm.md deleted file mode 100644 index 119db69e3..000000000 --- a/docs/zh-CN/tutorials/pp-tsm.md +++ /dev/null @@ -1,45 +0,0 @@ -# PP-TSM高效实用视频识别模型 - -PP-TSM是PaddleVideo基于TSM优化和改进的视频模型, -其精度(UCF101和Kinetics400数据集top1)和推理速度均优于TSM论文及其他开源的TSM模型5%,3%以上, -要求使用PaddlePaddle2.0(可使用pip安装) 或适当的develop版本。 - -在仅用ImageNet pretrain情况下,PP-TSM在UCF101和Kinetics400数据集top1分别达到89.5%和73.5%, -在单卡V100上FP32推理速度为147 VPS (基于Kinectics400数据集). 
-在单卡V100上开启TensorRT下FP16推理速度为TODO。 - -pp-TSM在Kinetics400上top1精度为73.5%,是至今为止开源的2D视频模型中在相同条件下的最高性能。 - -PP-TSM从如下方面优化和提升TSM模型的精度和速度: -1、基于知识蒸馏的预训练模型 , +1.3% -2、网络结构微调 ,+2.5% -3、更优的batch size ,+0.2% -4、更优的L2正则化 ,+0.3% -5、label_smoothing ,+0.2% -6、更优的lr decay ,+0.15% -7、数据增广 ,+0.3% -8、更优的epoch num ,+0.15% -9、bn策略 ,+0.4% -10、集成PaddleInference进行预测推理 -11、知识蒸馏、优化器等更多TODO策略 -其中,每项策略的精度提升指标参考上述数据(基于ucf101及k400上进行实验)。 - -## preciseBN - -在介绍preciseBN之前,我们先回顾一下BN(Batch Norm)。BN层是一种正则化层,在训练时,它根据当前batch的数据按通道计算的均值和方差,然后进行归一化运算,公式如图: - -详细介绍可参考[BatchNorm文档](https://paddlepaddle.org.cn/documentation/docs/zh/2.0-rc1/api/paddle/fluid/dygraph/BatchNorm_cn.html#batchnorm)。 - -假设训练数据的分布和测试数据的分布是一致的,在训练时我们会计算并保存滑动均值和滑动方差,供测试时使用。滑动均值和滑动方差的计算方式如下: - -简单的说,moving_mean等于当前batch计算的均值与历史保存的moving_mean的加权和,即为滑动均值。**但滑动均值并不等于真实的均值**,因此测试时的精度仍会受到一定影响。 -为了提升测试精度,我们需要重新计算一个更加精确的均值,这就是preciseBN的目的。 - -真实的均值如何计算?最直观的想法是,把所有训练数据组成一个batch,输入网络进行前向传播,每经过一个BN层,计算一下当前特征的均值和方差。 -由于训练样本过多,实际操作中不可能这么做。 -所以近似做法是,网络训练完成后,固定住网络中的参数不动,将所有训练数据分成N个batch,依次输入网络进行前向计算,在这个过程中保存下来每个iter的均值和方差,最终得到所有训练样本精确的均值和方差。 -这就是preciseBN的计算方法。具体实现参考[preciseBN](https://github.com/PaddlePaddle/PaddleVideo/blob/main/paddlevideo/utils/precise_bn.py)。 - -实际使用时,由于迭代所有训练样本比较耗费时间,一般只会跑200个iter左右。 - - diff --git a/docs/zh-CN/tutorials/ppagcn.md b/docs/zh-CN/tutorials/ppagcn.md deleted file mode 100644 index f2807847a..000000000 --- a/docs/zh-CN/tutorials/ppagcn.md +++ /dev/null @@ -1,23 +0,0 @@ -# PP-AGCN模型详解 - ---- - -## 内容 - -- [ST-GCN模型简介](#ST-GCN模型简介) -- [PP-AGCN模型改进](#PP-AGCN模型改进) - -## ST-GCN模型简介 - -ST-GCN模型由香港中文大学-商汤科技联合实验室在AAAI 2018中提出,不仅为解决基于人体骨架关键点的人类动作识别问题提供了新颖的思路,在标准的动作识别数据集上也取得了较大的性能提升。 -时空图卷积网络模型ST-GCN通过将图卷积网络(GCN)和时间卷积网络(TCN)结合起来,扩展到时空图模型,设计出了用于行为识别的骨骼点序列通用表示, -该模型将人体骨骼表示为图,如图2所示,其中图的每个节点对应于人体的一个关节点。图中存在两种类型的边,即符合关节的自然连接的空间边(spatial edge)和在连续的时间步骤中连接相同关节的 -时间边(temporal edge)。在此基础上构建多层的时空图卷积,它允许信息沿着空间和时间两个维度进行整合。 - -ST-GCN的网络结构大致可以分为三个部分,首先,对网络输入一个五维矩阵(N, C, T, V, M),其中N为视频数据量;C为关节特征向量,包括(x,y,acc);T为视频中抽取的关键帧的数量; -V表示关节的数量,在本项目中采用25个关节数量;M则是一个视频中的人数,然后再对输入数据进行Batch Normalization批量归一化,接着,通过设计ST-GCN单元, -引入ATT注意力模型并交替使用GCN图卷积网络和TCN时间卷积网络,对时间和空间维度进行变换,在这一过程中对关节的特征维度进行升维,对关键帧维度进行降维, -最后,通过调用平均池化层、全连接层,并后接SoftMax层输出,对特征进行分类。 - - -## PP-AGCN模型详解 diff --git a/docs/zh-CN/tutorials/reletive_issues b/docs/zh-CN/tutorials/reletive_issues deleted file mode 100644 index 6db03e074..000000000 --- a/docs/zh-CN/tutorials/reletive_issues +++ /dev/null @@ -1,77 +0,0 @@ -video_path is what ? #4510 -https://github.com/PaddlePaddle/models/issues/4510 - -关于BSN/BMN模型 #4411 -https://github.com/PaddlePaddle/models/issues/4411 - -微调nextvald的参数,如何加载部分参数呢 #4367 -https://github.com/PaddlePaddle/models/issues/4367 - -用TSN视频分类模型进行finetune时的问题 #4358 -https://github.com/PaddlePaddle/models/issues/4358 - -用paddle视频分类模型进行finetune开发报错。 #4353 -https://github.com/PaddlePaddle/models/issues/4353 - -BMN/BSN模型评估时报错 #4110 -https://github.com/PaddlePaddle/models/issues/4110 - -The avg losses are not same for training and validation when the same data are used. #4973 -https://github.com/PaddlePaddle/models/issues/4973 - -How can I load a pretrained model in AttentionCluster to train my own data? 
#4972 -https://github.com/PaddlePaddle/models/issues/4972 - -ETS模型的数据处理 #4957 -https://github.com/PaddlePaddle/models/issues/4957 - -BMN模型推理出现错误 #4881 -https://github.com/PaddlePaddle/models/issues/4881 - -C-TCN模型数据集不支持MP4格式,MP4转为pickle文件格式需要提供相应处理工具脚本 #4782 -https://github.com/PaddlePaddle/models/issues/4782 - -CTCN 有没有使用I3D特征的demo #4756 -https://github.com/PaddlePaddle/models/issues/4756 - -ctcn的数据集.pkl文件的b'feats'和b'scores'是什么?我注意到ctcn_reader.py只用到了b'scores',是否b'scores'才是需要的特征?还有对应的txt文件是什么?假设我需要把BMN的数据集转化为ctcn的数据集,该怎么做? #4750 -https://github.com/PaddlePaddle/models/issues/4750 - -使用BMN预训练模型训练的时候报错 #4749 -https://github.com/PaddlePaddle/models/issues/4749 - -使用BMN进行预测时,输出的json文件 视频ID少了两个字符,所有的文件都是这样 #4745 -https://github.com/PaddlePaddle/models/issues/4745 - -BMN模型batch_size调小之后loss为nan #4738 -https://github.com/PaddlePaddle/models/issues/4738 - -BMN的输入问题 #4724 -https://github.com/PaddlePaddle/models/issues/4724 - -报一个video_tag的BUG #4698 -https://github.com/PaddlePaddle/models/issues/4698 - -PaddleCV-video-ctcn 训练到Epoch21,iter1365停止不动 #4719 -https://github.com/PaddlePaddle/models/issues/4719 - -STNET跑模型推断,显卡显存充足,提示显存不足 #4608 -https://github.com/PaddlePaddle/models/issues/4608 - -训练stnet读取kinetics数据集时出线错误 求解决 #4529 -https://github.com/PaddlePaddle/models/issues/4529 - -有关CTCN视频动作定位的问题 #4508 -https://github.com/PaddlePaddle/models/issues/4508 - -谁有这个yt8m 的tfrecord? #4506 -https://github.com/PaddlePaddle/models/issues/4506 - -The NeXtVLAD final model couldn't be used ??? #4502 -https://github.com/PaddlePaddle/models/issues/4502 - -Hi, I'm wondering if there is an end-to-end solution for the youtube8M attention_lstm model? #4201 -https://github.com/PaddlePaddle/models/issues/4201 - -CTCN模型训练一段时间后NAN #4123 -https://github.com/PaddlePaddle/models/issues/4123 diff --git a/docs/zh-CN/tutorials/summarize.md b/docs/zh-CN/tutorials/summarize.md deleted file mode 100644 index c37dc2403..000000000 --- a/docs/zh-CN/tutorials/summarize.md +++ /dev/null @@ -1,138 +0,0 @@ -# 视频分类和动作识别介绍 - -## 广泛的应用场景 -视频分类在多个领域上都有广泛的应用,如短视频、推荐、搜索、电视台、广告,安防,监控等领域。 - -## 多种细分任务 -与图像任务相似,视频任务也可以分为分类(识别)和检测任务两大类,结合不同的场景还可以对这两类任务具体进行细分: - -+ Task1:修剪视频识别(Trimmed Action Recognition)。输入一段只包含一个动作的修剪视频,输出视频分类,如下图所示: -

-（图：行为分类）

    - - 从使用的数据模态上区分,分类任务还可以继续细分为基于单模态数据的分类和基于多模态数据的分类,基于RGB图像的分类和基于人体骨架的分类等等,如下图所示: - -

-（图：多种模态）

    -从视频的视角上分还可以分为第一人称视角的行为识别和第三人称视角的行为识别,单一视角的识别和多视角融合的识别,有兴趣的用户可自行查阅相关文献。 - -+ Task2:未修剪视频分类(Untrimmed Video Classification)。与修剪视频识别不同的是,未修剪的视频中通常含有多个动作,而且视频很长。有许多动作或许都不是我们所关注的。通过对输入的长视频进行全局分析,然后软分类到多个类别。 - -+ Task3:时序行为提名(Temporal Action Proposal)。类似于图像目标检测任务中的候选框提取。在一段长视频中通常含有很多动作,任务是从视频中找出可能含有动作的视频段。 - -+ Task4:时序行为定位(Temporal Action Localization)。相比于上面的时序行为提名而言,时序行为定位和我们常说的目标检测一致,要求从视频中找到可能存在行为的视频段,并且给视频段分类,如下图所示: -

-（图：行为检测）

    - -+ Task5:密集行为描述(Dense-Captioning Events)。之所以称为密集行为描述,主要是因为该任务要求在时序行为定位(检测)的基础上进行视频行为描述。也就是说,该任务需要将一段**未修剪的视频**进行**时序行为定位**得到许多包含行为的视频段后,并对该视频段进行**行为描述**。 - - -## 数据集简介 - -### 视频分类数据集 - -模型的训练和验证离不开全面、大量以及具有较好标注的数据集。随着视频行为识别研究的不断深入,越来越多的数据集应用于这一领域的研究。典型的数据集如下: -+ KTH数据集[1](#1) - -KTH数据集是一个早期的小型行为识别数据集,包括599段视频6类动作(走、跳、跑、击拳、挥手、拍手)背景相对静止,除了镜头的拉近拉远,摄像机的运动比较轻微。由于该数据集比较小,当训练较大型的3D网络时很容易过拟合,因此当前的大部分研究训练过程多数不基于此数据集。 -+ UCF10数据集[2](#2) - -UCF101是一个中型数据集视频主要来自于YouTube,包含13320段视频,共101类动作,每类动作由25个人完成,每个人做4-7组动作。在Kinetics数据集发布之前UCF101和HMDB51数据集在很长的一段时间里被作为benchmark用于评估行为识别方法的效果。 -+ HMDB51数据集[3](#3) - -Brown university大学提出的HMDB51数据集于2011年发布,视频多数来源于电影,还有一部分来自公共数据库以及YouTube等网络视频库。数据库包含有6849段样本,分为51类,每类至少包含有101段样本。 -+ Kinetics数据集[4](#4) - -Kinetics是当前最为重要的一个大型行为识别数据集,该数据集在2017年由Google的Deepmind团队提出,视频数据同样来自于YouTube,总共400个类别(现已经扩充到700类),30多万段视频数据(Kinetics-700已经扩充到了60多万段视频),每段视频持续10秒左右。动作类别主要分为三大类:“人”,“人与动物”,“人与人互动”。Kinetics数据集可以训练3D-Resnet达到152层而不发生过拟合,解决了之前训练数据集过小难以训练深层3D网络的困境。当前Kinetics已经取代了UCF101和HMDB51成为了行为识别领域的benchmark。当前,大多数研究都采用此数据集进行效果评估和预训练。 -+ Something-Something数据集[5](#5) - -SomethingV1包含108499段标注视频(V2已经扩展到了220847),每一个时长都在2到6秒之间。这些视频包含了174种类别的动作,与前面的数据集不同此数据集的识别需要更强的时间信息,因此在检验模型时域建模能力方面此数据集具有很重要的参考价值。 -除了以上的主流数据集外目前还有复杂动作识别的Charades[6](#6)数据集、Breakfast Action[7](#7)数据集、以及百万级别的体育视频数据集Sports 1M[8](#8)。 - -### 检测任务数据集 - -+ THUMOS 2014 - -来自于THUMOS Challenge 2014,。它的训练集为UCF101数据集,验证集和测试集分别包括1010和1574个未分割的视频片段。在行为检测任务中只有20类动作的未分割视频是有时序行为片段标注的,包括200个验证集(3007个行为片段)和213个测试集视频(包含3358个行为片段)。 - -+ MEXaction2 - -MEXaction2数据集中包含两类动作:骑马和斗牛。该数据集由三个部分组成:YouTube视频,UCF101中的骑马视频以及INA视频。其中YouTube视频片段和UCF101中的骑马视频是分割好的短视频片段,被用于训练集。而INA视频为多段长的未分割的视频,时长共计77小时,且被分为训练,验证和测试集三部分。训练集中共有1336个行为片段,验证集中有310个行为片段,测试集中有329个行为片断。且MEXaction2数据集的特点是其中的未分割视频长度都非常长,被标注的行为片段仅占视频总长的很低比例。 - -+ ActivityNet - -目前最大的数据库,同样包含分类和检测两个任务。这个数据集仅提供视频的youtube链接,而不能直接下载视频,所以还需要用python中的youtube下载工具来自动下载。该数据集包含200个动作类别,20000(训练+验证+测试集)左右的视频,视频时长共计约700小时. - - -## 经典模型简介 -如图所示,动作识别框架主要包括三个步骤:特征提取、运动表示和分类。其中,如何提取视频的时空特征是行为识别和视频分类的核心问题。 -

-（图：行为识别框架）

    -依据使用方法的不同可以总体上将行为识别(视频分类)方法概括为基于手工特征方法阶段和基于深度学习方法阶段。基于手工特征的方法阶段比较典型的运动描述子有DTP和IDT,这也是深度学习应用于这一领域之前为大家所公认的最为优秀的运动描述子,感兴趣的读者可以自行查阅文末的相关参考文献。从2014年起,深度学习的方法逐渐开始应用于视频分类领域,目前基于深度学习的方法已经成为了学术界的研究热点,并且在实际的应用效果上看也远远超越了手工设计的运动特征。从2014年至今围绕着如何表征运动特征这一问题,学术界提出了许多经典的网络结构,如下图所示: -

-（图：典型的方法）

    - -目前Paddlevideo模型库中已经囊括了TSN[9](#9) ,TSM[10](#10),slowfast[11](#11)等经典的行为识别网络,我们后续会陆续对视频领域的经典模型和论文进行详细解析,敬请期待! - - -## 相关比赛介绍 -+ [ActivityNet](http://activity-net.org/challenges/2020/challenge.html) - -ActivityNet是一个大规模行为识别竞赛,自2016年开始,每年与CVPR同时进行,到今年为止已经连续举办了4届。它侧重于从用户产生的视频中识别出日常生活,高层次,面向目标的活动,视频取自互联网视频门户Youtube。目前,ActivityNet比赛已经成为了行为识别领域影响力最大的比赛。 - - -## Reference - -
-[1] Schuldt C, Laptev I, Caputo B. Recognizing Human Actions: A Local SVM Approach. Proceedings of International Conference on Pattern Recognition. Piscataway, NJ: IEEE, 2004: 23-26.
-[2] Soomro K, Zamir A R, Shah M. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. arXiv:1212.0402, 2012.
-[3] Kuehne H, Jhuang H, Garrote E, et al. HMDB: A Large Video Database for Human Motion Recognition. Proceedings of IEEE International Conference on Computer Vision. Piscataway, NJ: IEEE, 2011: 2556-2563.
-[4] Carreira J, Zisserman A. Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2017: 6299-6308.
-[5] Goyal R, Kahou S E, Michalski V. The "something something" video database for learning and evaluating visual common sense. arXiv:1706.04261, 2017.
-[6] Sigurdsson G A, Varol G, Wang X, et al. Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. arXiv:1604.01753, 2016.
-[7] Kuehne H, Arslan A, Serre T. The Language of Actions: Recovering the Syntax and Semantics of Goal-Directed Human Activities. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014.
-[8] Karpathy A, Toderici G, Shetty S, et al. Large-Scale Video Classification with Convolutional Neural Networks. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014: 1725-1732.
-[9] Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L. Temporal Segment Networks for Action Recognition in Videos. Proceedings of the European Conference on Computer Vision, pages 20–36. Springer, 2016.
-[10] Lin J, Gan C, Han S. TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383, 2018.
-[11] Feichtenhofer C, Fan H, Malik J, et al. SlowFast Networks for Video Recognition. arXiv:1812.03982, 2018.
    diff --git a/docs/zh-CN/usage.md b/docs/zh-CN/usage.md deleted file mode 100644 index cfee4807b..000000000 --- a/docs/zh-CN/usage.md +++ /dev/null @@ -1,207 +0,0 @@ -简体中文 | [English](../en/start.md) - -# 使用指南 ---- - -* [1. 模型训练](#1) -* [2. 模型恢复训练](#2) -* [3. 模型微调](#3) -* [4. 模型测试](#4) -* [5. 模型推理](#5) -* [6. 混合精度训练](#6) - - -请参考[安装指南](./install.md)配置运行环境,PaddleVideo目前支持Linux下的GPU单卡和多卡运行环境。 - - - - -## 1. 模型训练 - -PaddleVideo支持单机单卡和单机多卡训练,单卡训练和多卡训练的启动方式略有不同。 - -### 1.1 单卡训练 - -启动脚本示例: - -```bash -export CUDA_VISIBLE_DEVICES=0 #指定使用的GPU显卡id -python3.7 main.py --validate -c configs_path/your_config.yaml -``` -- `-c` 必选参数,指定运行的配置文件路径,具体配置参数含义参考[配置文档](./contribute/config.md#config-yaml-details) -- `--validate` 可选参数,指定训练时是否评估 -- `-o`: 可选参数,指定重写参数,例如: `-o DATASET.batch_size=16` 用于重写train时batch size大小 - -### 1.2 多卡训练 - -通过`paddle.distributed.launch`启动,启动脚本示例: -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=your_log_dir main.py --validate -c configs_path/your_config.yaml -``` -- `--gpus`参数指定使用的GPU显卡id -- `--log_dir`参数指定日志保存目录 -多卡训练详细说明可以参考[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.1/guides/02_paddle2.0_develop/06_device_cn.html#danjiduokaxunlian) - - -我们将所有标准的启动命令都放在了```run.sh```中,直接运行(./run.sh)可以方便地启动多卡训练与测试,注意选择想要运行的脚本 -```shell -sh run.sh -``` - -### 1.3 输出日志 - -运行训练命令,将会输出运行日志,并默认保存在./log目录下,如:`worker.0` , `worker.1` ... , worker日志文件对应每张卡上的输出 - -【train阶段】打印当前时间,当前epoch/epoch总数,当前batch id,评估指标,耗时,ips等信息: -```txt -[09/24 14:13:00] epoch:[ 1/1 ] train step:100 loss: 5.31382 lr: 0.000250 top1: 0.00000 top5: 0.00000 batch_cost: 0.73082 sec, reader_cost: 0.38075 sec, ips: 5.47330 instance/sec. -``` - -【eval阶段】打印当前时间,当前epoch/epoch总数,当前batch id,评估指标,耗时,ips等信息: -```txt -[09/24 14:16:55] epoch:[ 1/1 ] val step:0 loss: 4.42741 top1: 0.00000 top5: 0.00000 batch_cost: 1.37882 sec, reader_cost: 0.00000 sec, ips: 2.90104 instance/sec. -``` - -【epoch结束】打印当前时间,评估指标,耗时,ips等信息: -```txt -[09/24 14:18:46] END epoch:1 val loss_avg: 5.21620 top1_avg: 0.02215 top5_avg: 0.08808 avg_batch_cost: 0.04321 sec, avg_reader_cost: 0.00000 sec, batch_cost_sum: 112.69575 sec, avg_ips: 8.41203 instance/sec. -``` - -当前为评估结果最好的epoch时,打印最优精度: -```txt -[09/24 14:18:47] Already save the best model (top1 acc)0.7467 -``` - -### 1.4 输出存储路径 - -- PaddleVideo各文件夹的默认存储路径如下: - -``` -PaddleVideo - ├── paddlevideo - ├── ... #other source codes - ├── output #ouput 权重,优化器参数等存储路径 - | ├── example - | | ├── example_best.pdparams #path_to_weights - | | └── ... - | └── ... - ├── log #log存储路径 - | ├── worker.0 - | ├── worker.1 - | └── ... - └── inference #预测文件存储路径 - ├── example.pdiparams file - ├── example.pdimodel file - └── example.pdiparmas.info file -``` - -- 训练Epoch默认从1开始计数,参数文件的保存格式为`ModelName_epoch_00001.pdparams`,命名中的数字对应Epoch编号。 - - - - -## 2. 模型恢复训练 - -如果训练任务终止,可以加载断点权重文件(优化器-学习率参数,断点文件)继续训练。 -需要指定`-o resume_epoch`参数,该参数表示从```resume_epoch```轮开始继续训练. - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -python3 -m paddle.distributed.launch \ - --gpus="0,1,2,3" \ - main.py \ - -c ./configs/example.yaml \ - --validate \ - -o resume_epoch=5 -``` - - - -## 3. 模型微调 - -进行模型微调(Finetune),对自定义数据集进行模型微调,需要指定 `--weights` 参数来加载预训练模型。 - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -python3 -m paddle.distributed.launch \ - --gpus="0,1,2,3" \ - main.py \ - -c ./configs/example.yaml \ - --validate \ - --weights=./output/example/path_to_weights -``` - -PaddleVideo会自动**不加载**shape不匹配的参数 - - - - -## 4. 
模型测试 - -需要指定 `--test`来启动测试模式,并指定`--weights`来加载预训练模型。 - -```bash -python3 -m paddle.distributed.launch \ - --gpus="0,1,2,3" \ - main.py \ - -c ./configs/example.yaml \ - --test \ - --weights=./output/example/path_to_weights -``` - - - -## 5. 模型推理 - -通过导出inference模型,PaddlePaddle支持使用预测引擎进行预测推理。接下来介绍如何用预测引擎进行推理: -首先,对训练好的模型进行转换 -指定`-c`参数加载配置文件,指定`-p`参数加载模型权重,指定`-o`用于指定转换后模型的存储路径。 - -```bash -python tools/export_model.py \ - -c ./configs/example.yaml \ - -p ./output/example/path_to_weights \ - -o ./inference -``` - - -上述命令将生成模型结构文件(`model_name.pdmodel`)和模型权重文件(`model_name.pdiparams`),然后可以使用预测引擎进行推理: - -```bash -python tools/predict.py \ - --input_file "data/example.avi" \ - --model_file "./inference/TSN.pdmodel" \ - --params_file "./inference/TSN.pdiparams" \ - --use_gpu=True \ - --use_tensorrt=False -``` - -其中: - -+ `input_file`:待预测的文件路径或文件夹路径,如 `./test.avi` -+ `model_file`:模型结构文件路径,如 `./inference/TSN.pdmodel` -+ `params_file`:模型权重文件路径,如 `./inference/TSN.pdiparams` -+ `use_tensorrt`:是否使用 TesorRT 预测引擎,默认值:`False` -+ `use_gpu`:是否使用 GPU 预测,默认值:`True` - - - - -## 6. 混合精度训练 - -[混合精度训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#amp)使用fp16数据类型进行训练,可以加速训练过程,减少显存占用,其训练启动命令如下: - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - -export FLAGS_conv_workspace_size_limit=800 #MB -export FLAGS_cudnn_exhaustive_search=1 -export FLAGS_cudnn_batchnorm_spatial_persistent=1 - -python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=your_log_dir main.py --amp --validate -c configs_path/your_config.yaml -``` - -各模型详细的使用文档,可以参考[Models](./model_zoo/README.md) diff --git a/english_documents/benchmark.md b/english_documents/benchmark.md new file mode 100644 index 000000000..bc41dfe12 --- /dev/null +++ b/english_documents/benchmark.md @@ -0,0 +1,69 @@ +[简体中文](../zh-CN/benchmark.md) | English +# Benchmark + +We compare our results with some popular frameworks and official releases in terms of speed. + +## Environment + +### Hardware + +- 8 NVIDIA Tesla V100 (16G) GPUs +- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz + +### Software + +- Python 3.7 +- PaddlePaddle2.0 +- CUDA 10.1 +- CUDNN 7.6.3 +- NCCL 2.1.15 +- GCC 8.2.0 + +## Experiments and Statistics +The statistic is the average training time, including data processing and model training time, and the training speed is measured with ips(instance per second). Note that we skip the first 50 iters as they may contain the device warmup time. + +Here we compare PaddleVideo with the other video understanding toolkits in the same data and model settings. + +To ensure the fairness of the comparison, the comparison experiments were conducted under the same hardware environment and using the same dataset. The dataset we used is generated by the [data preparation](dataset/k400.md), and in each model setting, the same data preprocessing methods are applied to make sure the same feature input. + +Significant improvement can be observed when comparing with other video understanding framework as shown in the table below, Especially the [Slowfast](../../configs/recognition/slowfast/slowfast.yaml) model is nearly 2x faster than the counterparts. 
+ + + +## Results +### Recognizers + +| Model | batch size x gpus | PaddleVideo(ips) | Reference(ips) | MMAction2 (ips) | PySlowFast (ips)| +| :------: | :-------------------:|:---------------:|:---------------: | :---------------: |:---------------: | +| [TSM](../../configs/recognition/tsm/tsm.yaml) | 16x8 | 58.1 | 46.04(temporal-shift-module) | To do | X | +| [PPTSM](../../configs/recognition/tsm/pptsm.yaml) | 16x8 | 57.6 | X | X | X | +| [TSN](../../configs/recognition/tsn/tsn.yaml) | 16x8 | 841.1 | To do (tsn-pytorch) | To do | X | +| [Slowfast](../../configs/recognition/slowfast/slowfast.yaml)| 16x8 | 99.5 | X | To do | 43.2 | +| [Attention_LSTM](../../configs/recognition/attention_lstm/attention_lstm.yaml) | 128x8 | 112.6 | X | X | X | + + +### Localizers + +| Model | PaddleVideo(ips) |MMAction2 (ips) |BMN(boundary matching network) (ips)| +| :--- | :---------------: | :-------------------------------------: | :-------------------------------------: | +| [BMN](../../configs/localization/bmn.yaml) | 43.84 | x | x | + + +### Segmenters + +This repo provides performance and accuracy comparison between classical and popular sequential action segmentation models + +| Model | Metrics | Value | Flops(M) |Params(M) | test time(ms) bs=1 | test time(ms) bs=2 | inference time(ms) bs=1 | inference time(ms) bs=2 | +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | +| MS-TCN | F1@0.5 | 38.8% | 791.360 | 0.8 | 170 | - | 10.68 | - | +| ASRF | F1@0.5 | 55.7% | 1,283.328 | 1.3 | 190 | - | 16.34 | - | + +* Model: model name, for example: PP-TSM +* Metrics: Fill in the indicators used in the model test, and the data set used is **breakfast** +* Value: Fill in the value corresponding to the metrics index, and generally keep two decimal places +* Flops(M): The floating-point computation required for one forward operation of the model can be called `paddlevideo/tools/summary.py`script calculation (different models may need to be modified slightly), keep one decimal place, and measure it with data **input tensor with shape of (1, 2048, 1000)** +* Params(M): The model parameter quantity, together with flops, will be calculated by the script, and one decimal place will be reserved +* test time(ms) bs=1: When the python script starts the batchsize = 1 test, the time required for a sample is kept to two decimal places. The data set used in the test is **breakfast**. +* test time(ms) bs=2: When the python script starts the batchsize = 2 test, the time required for a sample is kept to two decimal places. The sequential action segmentation model is generally a full convolution network, so the batch of training, testing and reasoning_ Size is 1. The data set used in the test is **breakfast**. +* inference time(ms) bs=1: When the reasoning model is tested with GPU (default V100) with batchsize = 1, the time required for a sample is reserved to two decimal places. The dataset used for reasoning is **breakfast**. +* inference time(ms) bs=2: When the reasoning model is tested with GPU (default V100) with batchsize = 1, the time required for a sample is reserved to two decimal places. The sequential action segmentation model is generally a full convolution network, so the batch of training, testing and reasoning_ Size is 1. The dataset used for reasoning is **breakfast**. 
diff --git a/english_documents/dataset/AVA.md b/english_documents/dataset/AVA.md new file mode 100644 index 000000000..ee95a5775 --- /dev/null +++ b/english_documents/dataset/AVA.md @@ -0,0 +1,113 @@ +[简体中文](../../zh-CN/dataset/k400.md) | English +# AVA Data Preparation +This document mainly introduces the preparation process of AVA dataset. +It mainly includes five parts: Video Data Download, Prepare Annotations, Cut video files, +Extract the RGB frames, Pulling Proposal Files,et al. +Before we start, please make sure that the directory is located at `$PaddleVideo/data/ava/script`. + + +--- + + +## 1. Video data Download +For basic dataset information, you can refer to the official website [AVA](https://research.google.com/ava/index.html). +For the dataset download, you can refer to the [AVA Download](https://github.com/cvdfoundation/ava-dataset) , +which introduce the way to download the dataset. We also provide the shell script for downloading the video files + +```shell +bash download_videos.sh +``` + +Furthermore,considering the difficulty in downloading, +we upload the video files to Baidu cloud disk in the form of zip packages, and users can download it by themselves according to their needs. +[Link]() coming soon. + + +**Note: the video files should be placed in `data/ava/videos`** + +--- +## 2.Prepare Annotations + +Next, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +This command will download `ava_v2.1.zip` for AVA `v2.1` annotation. If you need the AVA `v2.2` annotation, you can try the following script. + +```shell +VERSION=2.2 bash download_annotations.sh +``` + +**Note: In fact,we will also provide the annotation zip files in Baidu cloud disk** + +--- +## 3. cut video files + +Cut each video from its 15th to 30th minute and make them at 30 fps. + +```shell +bash cut_videos.sh +``` +--- + +## 4. Extract RGB Frames + +you can use the ffmpeg to extract RGB frames by the following script. + +```shell +bash extract_rgb_frames.sh +``` + +--- + +## 5.Pulling Proposal Files + +The scripts are adapted from FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks). + +Run the follow scripts to fetch pre-computed proposal list. + +```shell +bash fetch_ava_proposals.sh +``` + +--- +## 6.Folder Structure + +After the whole data pipeline for AVA preparation. +you can get the rawframes (RGB), videos and annotation files for AVA. + +In the context of the whole project (for AVA only), the folder structure will look like: + +``` +PaddleVideo +├── configs +├── paddlevideo +├── docs +├── tools +├── data +│ ├── ava +│ │ ├── annotations +│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl +│ │ | ├── ava_train_v2.1.csv +│ │ | ├── ava_val_v2.1.csv +│ │ | ├── ava_train_excluded_timestamps_v2.1.csv +│ │ | ├── ava_val_excluded_timestamps_v2.1.csv +│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt +│ │ ├── videos +│ │ │ ├── 053oq2xB3oU.mkv +│ │ │ ├── 0f39OWEqJ24.mp4 +│ │ │ ├── ... +│ │ ├── videos_15min +│ │ │ ├── 053oq2xB3oU.mkv +│ │ │ ├── 0f39OWEqJ24.mp4 +│ │ │ ├── ... +│ │ ├── rawframes +│ │ │ ├── 053oq2xB3oU +| │ │ │ ├── img_00001.jpg +| │ │ │ ├── img_00002.jpg +| │ │ │ ├── ... 
+``` diff --git a/english_documents/dataset/ActivityNet.md b/english_documents/dataset/ActivityNet.md new file mode 100644 index 000000000..006a93670 --- /dev/null +++ b/english_documents/dataset/ActivityNet.md @@ -0,0 +1,80 @@ +[简体中文](../../zh-CN/dataset/ActivityNet.md) | English + +# ActivityNet data preparation + +- [Introduction](#Introduction) +- [Download](#Download) + +## Introduction + +ActivityNet is a dataset for large-scale video understanding tasks, which can be used for tasks such as action localization, action recognition, etc. + + +## Download +1. The BMN model uses the processed ActivityNet 1.3 dataset. There are two ways to use it: + - Using our processed ActivityNet 1.3 dataset (compressed package is about 5.5G), each video has corresponding action labels, duration intervals, duration frames, duration seconds and other information + Download with the following command: + ```bash + wget https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz # Download the processed video feature data + wget https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json # Download the processed label data + ``` + + Or click the following hyperlinks to download: + + [Video feature data](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz) + [Video feature data](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json) + + then decompression `bmn_feat.tar.gz` + ```bash + tar -xf bmn_feat.tar.gz + ``` + + - Extract features by yourself + + First refer to [Download Instructions](https://github.com/activitynet/ActivityNet/tree/master/Crawler) to download the original dataset. When training this model, you need to use TSN to extract features from the source files first. You can [self-extract](https://github.com/yjxiong/temporal-segment-networks) video frame and optical flow information, and the pre-trained TSN model can be downloaded from [here](https://github.com/ yjxiong/anet2016-cuhk) download. + + + The information in the `activitynet_1.3_annotations.json` tag file is as follows: + ```json + { + "v_QOlSCBRmfWY": { + "duration_second": 82.73, + "subset": "training", + "duration_frame": 2067, + "annotations": [{ + "segment": [6.195294851794072, 77.73085420904837], + "label": "Ballet" + }], + "feature_frame": 2064 + }, + "v_ehGHCYKzyZ8": { + "duration_second": 61.7189999999999994, + "subset": "training", + "duration_frame": 1822, + "annotations": [{ + "segment": [43.95990729267573, 45.401932082395355], + "label": "Doing crunches" + }], + "feature_frame": 1808 + }, + ..., + ... + } + ``` + + In the end, `19228` video feature npy files are obtained, corresponding to the `19228` label information in the `activitynet_1.3_annotations.json` file. + +2. Create a new `data/bmn_data` folder, and then unzip the video feature data after downloading and put it in this folder, and finally it should be organized into the following form: + ``` + PaddleVideo + ├── data + │ ├── bmn_data + │ │ ├── fix_feat_100 + │ │ │ ├── v___c8enCfzqw.npy + │ │ │ ├── v___dXUJsj3yo.npy + │ │ │ ├── ... + │ │ │ + │ │ └── activitynet_1.3_annotations.json + ``` + +3. Finally, modify the `feat_path` field in the configuration file configs/localization/bmn.yaml to specify the feature directory path, and the `file_path` field to specify the label file path. 
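As a quick sanity check of the annotation format shown above, the label file can be loaded and iterated like this (a minimal sketch based on the directory layout described in this document; it is not part of the repository scripts):

```python
import json

# path follows the folder layout described above
with open("data/bmn_data/activitynet_1.3_annotations.json") as f:
    annos = json.load(f)

train_videos = [name for name, info in annos.items() if info["subset"] == "training"]
print(f"{len(annos)} videos in total, {len(train_videos)} in the training subset")

# inspect one entry: duration, feature frames and its labelled action segments
name = train_videos[0]
info = annos[name]
print(name, info["duration_second"], info["feature_frame"])
for ann in info["annotations"]:
    print("  label:", ann["label"], "segment:", ann["segment"])
```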
diff --git a/english_documents/dataset/Oxford_RobotCar.md b/english_documents/dataset/Oxford_RobotCar.md new file mode 100644 index 000000000..c02b54a01 --- /dev/null +++ b/english_documents/dataset/Oxford_RobotCar.md @@ -0,0 +1,162 @@ +[简体中文](../../zh-CN/dataset/Oxford_RobotCar.md) | English + +# Oxford-RobotCar-for-ADDS data preparation + +- [Introduction](#Introduction) +- [Data Set Download](#Download) +- [Preprocessing](#Preprocessing) +- [1. Image De-distortion](#1-Image-de-distortion) +- [2. Dynamic frame filter](#2-Dynamic-frame-filter) +- [3. Image Rename](#3-Image-Rename) +- [4. Preparation for Day-Pseudo Night Image Pair](#4-Day-Pseudo-Night-Image-Pair-Preparation) + + +## Introduction + +[Oxford RobotCar Dataset](https://robotcar-dataset.robots.ox.ac.uk/) is a large-scale autonomous driving data set that contains a large amount of data in different autonomous driving scenarios. + +What is used here is to filter a part of the data used for day-night depth estimation from the original Oxford RobotCar data set, namely Oxford-RobotCar-for-ADDS. + +If you want to use Oxford-RobotCar-for-ADDS, please cite the following papers: +```latex +@article{maddern20171, + title={1 year, 1000 km: The oxford robotcar dataset}, + author={Maddern, Will and Pascoe, Geoffrey and Linegar, Chris and Newman, Paul}, + journal={The International Journal of Robotics Research}, + volume={36}, + number={1}, + pages={3--15}, + year={2017}, + publisher={SAGE Publications Sage UK: London, England} +} +``` +```latex +@inproceedings{liu2021self, + title={Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation}, + author={Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={12737--12746}, + year={2021} +} +``` + +## Download + +1. Download the left eye image of Bumblebee XB3 in the sequence [2014-12-09](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-09-13-21-02/) as For the training set of the daytime scene, the downloaded images are decompressed in the same folder. +2. Download the left eye image of Bumblebee XB3 in the sequence [2014-12-16](https://robotcar-dataset.robots.ox.ac.uk/datasets/2014-12-16-18-44-24/) as The training set of the night scene, the downloaded images are unzipped in the same folder. +3. The images and depth truth values ​​of the validation set are filtered from the original data set and downloaded from the link we gave. 
(The data download links are listed below.)
    ```shell
    https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt
    https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt
    https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.001
    https://videotag.bj.bcebos.com/Data/ADDS/day_train_all.7z.002
    https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.001
    https://videotag.bj.bcebos.com/Data/ADDS/day_train_all_fake_night.7z.002
    https://videotag.bj.bcebos.com/Data/ADDS/day_val_451.7z
    https://videotag.bj.bcebos.com/Data/ADDS/day_val_451_gt.7z
    https://videotag.bj.bcebos.com/Data/ADDS/night_val_411.7z
    https://videotag.bj.bcebos.com/Data/ADDS/night_val_411_gt.7z
    ```
    The original raw data download links are:
    ```shell
    # data in day
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.001
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.002
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.003
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.004
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.005
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.006
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.007
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.008
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.009
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.010
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.011
    https://videotag.bj.bcebos.com/Data/original-ADDS/day_train_all.7z.012

    # data in night
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.001
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.002
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.003
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.004
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.005
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.006
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.007
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.008
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.009
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.010
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.011
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.012
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.013
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.014
    https://videotag.bj.bcebos.com/Data/original-ADDS/night_train_all.7z.015
    ```

## Preprocessing

### 1-Image-de-distortion

Use the official toolbox [robotcar-dataset-sdk](https://github.com/ori-mrg/robotcar-dataset-sdk/tree/master/python) to undistort the images of the 2014-12-09 and 2014-12-16 sequences.

### 2-Dynamic-frame-filter

Since the method is self-supervised, frames with sufficient camera motion (dynamic frames) have to be selected for training. A frame is kept as a dynamic frame when the inter-frame pose change is greater than 0.1 m; the filtered frames form the training sequences.

### 3-Image-Rename

Rename the original image timestamps to a continuous number sequence.
For the daytime scene correspondence, see [1209_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1209_all_files.txt); for the night scene correspondence, see [1216_all_files.txt](https://videotag.bj.bcebos.com/Data/ADDS/1216_all_files.txt). The renamed data is organized as follows:
```
├── oxford_processing
    ├── day_train_all      # daytime training images (day_train_all.7z.001 ~ day_train_all.7z.012)
    ├── night_train_all    # night training images (night_train_all.7z.001 ~ night_train_all.7z.015)
    ├── day_val_451        # daytime validation images (day_val_451.7z)
    ├── day_val_451_gt     # daytime validation depth ground truth (day_val_451_gt.7z)
    ├── night_val_411      # night validation images (night_val_411.7z)
    └── night_val_411_gt   # night validation depth ground truth (night_val_411_gt.7z)
```

The annotation file download links are below:
```shell
https://videotag.bj.bcebos.com/Data/ADDS/train_files.txt
https://videotag.bj.bcebos.com/Data/ADDS/val_day_files.txt
https://videotag.bj.bcebos.com/Data/ADDS/val_night_files.txt
```

The split files used for training and validation are as follows:

```
splits/oxford_day/train_files.txt       # daytime training sequence
splits/oxford_night/train_files.txt     # night training sequence
splits/oxford_day_451/val_files.txt     # daytime validation sequence
splits/oxford_night_411/val_files.txt   # night validation sequence
```

### 4-Day-Pseudo-Night-Image-Pair-Preparation

To let the framework extract the information shared by day and night images, we use [CycleGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix) to generate day/pseudo-night image pairs, where the pseudo-night images are the night-style counterparts of the daytime images produced by CycleGAN. All images are scaled to 192x640, the night images are enhanced with histogram equalization, and CycleGAN is trained for 75 epochs, which finally yields Oxford-RobotCar-for-ADDS. The generated day/pseudo-night image pairs are organized as follows and can be used directly for training and validating ADDS-DepthNet:
```
├── oxford_processing_forADDS
    ├── day_train_all      # daytime training images (day_train_all.7z.001 ~ day_train_all.7z.002)
    ├── night_train_all    # night training images (night_train_all.7z.001 ~ day_train_all.7z.002)
    ├── day_val_451        # daytime validation images (day_val_451.7z)
    ├── day_val_451_gt     # daytime validation depth ground truth (day_val_451_gt.7z)
    ├── night_val_411      # night validation images (night_val_411.7z)
    └── night_val_411_gt   # night validation depth ground truth (night_val_411_gt.7z)
data
└── oxford
    ├── splits
    │   ├── train_files.txt
    │   ├── val_day_files.txt
    │   └── val_night_files.txt
    └── oxford_processing_forADDS
        ├── day_train_all/      # daytime training images (from day_train_all.7z.001 ~ day_train_all.7z.002)
        ├── night_train_all/    # night training images (from night_train_all.7z.001 ~ day_train_all.7z.002)
        ├── day_val_451/        # daytime validation images (from day_val_451.7z)
        ├── day_val_451_gt/     # daytime validation depth ground truth (from day_val_451_gt.7z)
        ├── night_val_411/      # night validation images (from night_val_411.7z)
        └── night_val_411_gt/   # night validation depth ground truth (from night_val_411_gt.7z)
```

The sequences used for training and validation are the same as those listed above.
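As a concrete illustration of the 0.1 m rule in the dynamic-frame filtering step above, the sketch below keeps a frame only when its position has moved more than 0.1 m since the previously kept frame. The pose-loading part is an assumption for illustration (a hypothetical `poses.npy` holding per-frame XYZ translations derived with the RobotCar SDK); only the thresholding mirrors the rule described above.

```python
import numpy as np

def select_dynamic_frames(translations, min_motion=0.1):
    """Return indices of frames whose translation differs from the previously
    kept frame by more than `min_motion` metres (0.1 m, as described above)."""
    kept = [0]
    for i in range(1, len(translations)):
        if np.linalg.norm(translations[i] - translations[kept[-1]]) > min_motion:
            kept.append(i)
    return kept

# Hypothetical usage: poses.npy is assumed to hold an (N, 3) array of per-frame
# XYZ positions interpolated from the RobotCar INS/VO data (not part of this repo).
translations = np.load("poses.npy")
kept = select_dynamic_frames(translations)
print(f"kept {len(kept)} of {len(translations)} frames")
```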
diff --git a/english_documents/dataset/README.md b/english_documents/dataset/README.md new file mode 100644 index 000000000..03fe7e35f --- /dev/null +++ b/english_documents/dataset/README.md @@ -0,0 +1,73 @@ +English | [简体中文](../../zh_CN/dataset/README.md) + +# Dataset + +## 1. Dataset List + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Task | Datasets |
| :---- | :---- |
| Action Recognition | Kinetics-400 (Homepage) (CVPR'2017), UCF101 (Homepage) (CRCV-IR-12-01), ActivityNet (Homepage) (CVPR'2015), YouTube-8M (Homepage) (CVPR'2017) |
| Action Localization | ActivityNet (Homepage) (CVPR'2015) |
| Spatio-Temporal Action Detection | AVA (Homepage) (CVPR'2018) |
| Skeleton-based Action Recognition | NTU-RGB+D (Homepage) (IEEE CS'2016), FSD (Homepage) |
| Depth Estimation | Oxford-RobotCar (Homepage) (IJRR'2017) |
| Text-Video Retrieval | MSR-VTT (Homepage) (CVPR'2016) |
| Text-Video Pretrained Model | HowTo100M (Homepage) (ICCV'2019) |
    diff --git a/english_documents/dataset/SegmentationDataset.md b/english_documents/dataset/SegmentationDataset.md new file mode 100644 index 000000000..3f67fb272 --- /dev/null +++ b/english_documents/dataset/SegmentationDataset.md @@ -0,0 +1,35 @@ +English | [简体中文](../../zh-CN/dataset/SegmentationDataset.md) + +# Video Action Segmentation Dataset + +The video motion segmentation model uses breakfast, 50salads and gtea data sets. The use method is to use the features extracted by the pre training model, which can be obtained from the ms-tcn official code base.[feat](https://zenodo.org/record/3625992#.Xiv9jGhKhPY) + +- Dataset tree +```txt +─── gtea + ├── features + │ ├── S1_Cheese_C1.npy + │ ├── S1_Coffee_C1.npy + │ ├── S1_CofHoney_C1.npy + │ └── ... + ├── groundTruth + │ ├── S1_Cheese_C1.txt + │ ├── S1_Coffee_C1.txt + │ ├── S1_CofHoney_C1.txt + │ └── ... + ├── splits + │ ├── test.split1.bundle + │ ├── test.split2.bundle + │ ├── test.split3.bundle + │ └── ... + └── mapping.txt +``` + +- data tree +```txt +─── data + ├── 50salads + ├── breakfast + ├── gtea + └── ... +``` diff --git a/english_documents/dataset/fsd.md b/english_documents/dataset/fsd.md new file mode 100644 index 000000000..2f16a485a --- /dev/null +++ b/english_documents/dataset/fsd.md @@ -0,0 +1,55 @@ +[简体中文](../../zh-CN/dataset/fsd.md) | English + +# Figure Skating Dataset + +- [Introduction](#Introduction) +- [Download](#Download) + +--- + + +## Introduction + +In figure skating, compared with other sports, human posture and trajectory show the characteristics of strong complexity, which is helpful to the research of fine-grained action recognition tasks. + +For FSD Dataset, all video materials are collected from the Figure Skating Championships from 2017 to 2018. The frame rate of the video is uniformly standardized to 30 frames per second, and the image size is 1080 * 720 to ensure the relative consistency of the dataset. After that, we use the 2D pose estimation algorithm Open Pose to extract frame by frame key points from the video, and finally save the data in `.npy` format. + +The directory structure of training dataset and test dataset is as follows: + +```txt +train_data.npy # 2922 +train_label.npy # 2922 +test_A_data.npy # 628 +test_B_data.npy # 634 +``` + +`train_label.npy` can be read using `np.load()`, each element is an integer variable with a value between 0-29, representing the label of the action. `data.npy` can be read using `np.load()`, return a tensor with the shape of `N×C×T×V×M`, the specific meaning of each dimension is as follows: + +| Dimension | Size | Meaning | Notes | +| :---- | :----: | :----: | :---- | +| N | N | Number of samples | - | +| C | 3 | The coordinates and confidence of each joint point respectively | rescale to -1~1 | +| T | 1500 | The duration of the action | The actual length of some actions may be less than 1500, in such case we will pad 0 to ensure the unity of T dimension. | +| V | 25 | Number of joint points | See the skeleton example below for the meaning of specific joint points. | +| M | 1 | Number of athletes | - | + + +skeleton example: + +
    +
    +
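The shapes and value ranges described above can be checked directly with NumPy. The following is a small sketch, assuming `train_data.npy` and `train_label.npy` have been downloaded into the current directory (it is not part of the official tooling):

```python
import numpy as np

data = np.load("train_data.npy")     # expected shape: (N, C, T, V, M) = (2922, 3, 1500, 25, 1)
labels = np.load("train_label.npy")  # expected shape: (2922,), integer labels in 0..29

print("data shape:", data.shape)
print("label range:", labels.min(), "-", labels.max())

# Coordinates and confidence of joint 0 over the first 5 frames of the first sample
sample = data[0]                     # (C, T, V, M)
print(sample[:, :5, 0, 0])
```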
    + + + +## Download + +You can get the download link after registering on the [competition homepage](https://www.datafountain.cn/competitions/519). + +| Set | Data | Label | +| :---- | :----: | :----: | +| Train | [train_data.npy](https://videotag.bj.bcebos.com/Data/FSD_train_data.npy) | [train_label.npy](https://videotag.bj.bcebos.com/Data/FSD_train_label.npy) | +| TestA | comming soon | comming soon | + + +> RGB datasets would not be provided for copyright reasons. diff --git a/english_documents/dataset/k400.md b/english_documents/dataset/k400.md new file mode 100644 index 000000000..f76de02c8 --- /dev/null +++ b/english_documents/dataset/k400.md @@ -0,0 +1,78 @@ +[简体中文](../../zh-CN/dataset/k400.md) | English + +# Kinetics-400 Preparation + +- [Introduction](#Introduction) +- [Download](#Download) +- [Frames](#Frames) + +--- + + +## Introduction + +Kinetics-400 is a commonly used benchmark dataset in the video field. Please refer to its official website [Kinetics](https://deepmind.com/research/open-source/kinetics) for details. You can refer to the official address [ActivityNet](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics), and use the download script provided to download the dataset. + +## Download + +Considering the difficulty of downloading the K400 data set, we provide two download methods: (1) Baidu network disk download (2) Script download + +### Baidu SkyDrive Download + +Netdisk link: https://pan.baidu.com/s/1S_CGBjWOUAuxL_cCX5kMPg +Extraction code: `ppvi` + +### Script download + +- Download the training set link list file [train_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/train_link.list) and the validation set link list file [val_link.list](https://ai-rank.bj.bcebos.com/Kinetics400/val_link.list). + +Write the download script `download.sh` as follows: + +```bash +file=$1 + +while read line +do + wget "$line" +done <$file +``` + +Download training set command: +```bash +bash download.sh train_link.list +``` + +Download verification set command: +```bash +bash download.sh val_link.list +``` + +--- + +|category | Number of data | list file | +| :------: | :----------: | :----: | +|Training set | 234619 | [train.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train.list)| +|Validation set | 19761 | [val.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val.list)| + +- After downloading, unzip and add the data path to list file. + +- Due to the failure of some video link, part of original data is missing. This copies need about 135G of storage space. + +> This copies is only used for academic research. If it is helpful to you, welcome to star [our project](https://github.com/PaddlePaddle/PaddleVideo) + + +## Frames +In order to speed up the training process of the network, we first extract frames from the video file (K400 video file is in mp4 format). Compared with the method of network training directly through video files, the method of frames can greatly accelerate the speed of network training。 + +Enter the following command to extract the frames of the K400 video file + +```python +python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext mp4 +``` + +After the video file frames are extracted, they will be stored in the specified `./rawframes` path, and the size is about 2T. 
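As a quick sanity check after extraction, the snippet below walks the output directory and counts the extracted videos and frames. It is only a sketch and assumes the two-level `rawframes/<class>/<video_id>/` layout that `--level 2` produces; it is not a script shipped with the repo.

```python
import os

raw_root = "./rawframes"  # output directory used in the command above

# With --level 2, each video gets its own folder: rawframes/<class>/<video_id>/*.jpg
num_videos, num_frames = 0, 0
for cls in sorted(os.listdir(raw_root)):
    cls_dir = os.path.join(raw_root, cls)
    if not os.path.isdir(cls_dir):
        continue
    for vid in os.listdir(cls_dir):
        frames = os.listdir(os.path.join(cls_dir, vid))
        num_videos += 1
        num_frames += len(frames)

print(f"{num_videos} extracted videos, {num_frames} frames in total")
```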
+ +|category | Number of data | list file | +| :------: | :----------: | :----: | +|Training set | 234619 | [train_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/train_frames.list)| +|Validation set | 19761 | [val_frames.list](https://videotag.bj.bcebos.com/PaddleVideo/Data/Kinetic400/val_frames.list)| diff --git a/english_documents/dataset/msrvtt.md b/english_documents/dataset/msrvtt.md new file mode 100644 index 000000000..390ba9d00 --- /dev/null +++ b/english_documents/dataset/msrvtt.md @@ -0,0 +1,79 @@ +[简体中文](../../zh-CN/dataset/msrvtt.md) | English + +# MSR-VTT Preparation + +- [Introduction](#1.1) +- [Download for T2VLAD](#1.2) +- [Download for ActBERT](#1.3) +- [Reference](#1.4) + + + +## Introduction + +MSR-VTT(Microsoft Research Video to Text) is a large-scale dataset containing videos and subtitles, which is composed of 10000 video clips from 20 categories, and each video clip is annotated with 20 English sentences. We used 9000 video clips for training and 1000 for testing. For more details, please refer to the website: [MSRVTT](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/) + + +## Download for T2VLAD + +[T2VLAD doc](../../../applications/T2VLAD/README_en.md) + +For ease of use, we provided extracted features of video. + +First, make sure to enter the following command in the `applications/T2VLAD/data` directory to download the dataset. + +```bash +bash download_features.sh +``` + +After downloading, the files in the data directory are organized as follows: + +``` +├── data +| ├── MSR-VTT +| │ ├── raw-captions.pkl +| │ ├── train_list_jsfusion.txt +| │ ├── val_list_jsfusion.txt +| │ ├── aggregated_text_feats +| | | ├── w2v_MSRVTT_openAIGPT.pickle +| | ├── mmt_feats +| │ │ ├── features.audio.pkl +| │ │ ├── features.face_agg.pkl +| │ │ ├── features.flos_agg.pkl +| │ │ ├── features.ocr.pkl +| │ │ ├── features.rgb_agg.pkl +| │ │ ├── features.s3d.pkl +| │ │ ├── features.scene.pkl +| │ │ ├── features.speech.pkl + +``` + +## Download for ActBERT + +[ActBERT doc](../model_zoo/multimodal/actbert.md) + +Download data features: +``` +wget https://videotag.bj.bcebos.com/Data/ActBERT/msrvtt_test.lmdb.tar +wget https://videotag.bj.bcebos.com/Data/ActBERT/MSRVTT_JSFUSION_test.csv +``` + +Decompress the `msrvtt_test.lmdb.tar`: +``` +tar -zxvf msrvtt_test.lmdb.tar +``` + +The files in the data directory are organized as follows: + +``` +├── data +| ├── MSR-VTT +| │ ├── MSRVTT_JSFUSION_test.csv +| │ ├── msrvtt_test.lmdb +| │ ├── data.mdb +| │ ├── lock.mdb +``` + + +## Reference +- Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. Multi-modal transformer for video retrieval. In ECCV, 2020. diff --git a/english_documents/dataset/ntu-rgbd.md b/english_documents/dataset/ntu-rgbd.md new file mode 100644 index 000000000..0b0056b7d --- /dev/null +++ b/english_documents/dataset/ntu-rgbd.md @@ -0,0 +1,158 @@ +[简体中文](../../zh-CN/dataset/ntu-rgbd.md) | English + +# NTU-RGB+D Preparation + +- [Introduction](#Introduction) +- [ST-GCN Data Prepare](#ST-GCN_Data_Prepare) +- [CTR-GTCN Data Prepare](#CTR-GCN_Data_Prepare) + +--- + + +## Introduction + +NTU-RGB+D contains 60 action classes and 56,880 video samples for skeleton-based action recognition. Please refer to its official website[NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) for more details. + +The dataset contains two splits when dividing the training set and test set. 
For Cross-subject, the dataset is divided according to character id, with 40320 samples in training set and 16560 samples in test set. For Cross-view, the dataset is divided according to camera division. The samples collected by cameras 2 and 3 are training sets, including 37930 samples, and the samples collected by camera 1 are test sets, including 18960 samples. + +## ST-GCN_Data_Prepare + +ST-GCN data prepare preceduce are introducted follow. + +### Download +We provide the download link of the processed dataset [NTU-RGB-D.tar](https://videotag.bj.bcebos.com/Data/NTU-RGB-D.tar)(~3.1G). Please download and unzip with ```tar -zxvf NTU-RGB-D.tar ``` , the directory structure is as follows: + +```txt +─── NTU-RGB-D + ├── xsub + │ ├── train_data.npy + │ ├── train_label.pkl + │ ├── val_data.npy + │ └── val_label.pkl + └── xview + ├── train_data.npy + ├── train_label.pkl + ├── val_data.npy + └── val_label.pkl +``` + +> This is a copies from [st-gcn](https://github.com/open-mmlab/mmskeleton/blob/master/doc/SKELETON_DATA.md). + +## CTR-GCN_Data_Prepare + +CTR-GCN data prepare preceduce are introducted follow. + +### Download + +There is script `download_dataset.sh` to download the dataset from official website [NTU-RGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) in dictory `data\ntu-rgb-d`. + +```bash +sh data/ntu-rgb-d/download_dataset.sh +``` + +File tree: +```txt +─── ntu-rgb-d + ├── download_dataset.sh + ├── nturgb+d_skeletons + │   ├── S001C001P001R001A001.skeleton + │   ├── S001C001P001R001A002.skeleton + │   ├── S001C001P001R001A003.skeleton + │   ├── S001C001P001R001A004.skeleton + │   ├── S001C001P001R001A005.skeleton + │   ├── S001C001P001R001A006.skeleton + │   ├── S001C001P001R001A007.skeleton + │ ├── .... + │   └── S017C003P020R002A060.skeleton + ├── get_raw_denoised_data.py + ├── get_raw_skes_data.py + ├── seq_transformation.py + └── statistics +    ├── camera.txt +    ├── label.txt +    ├── performer.txt +    ├── replication.txt +    ├── setup.txt +    └── skes_available_name.txt + +``` + +### Prepare + +run follow script, then data will be precessed to the data format need by CTR-GCN. + +> Note:if make dataset by yourself, please prepare `data/ntu-rgb-d/statistics/skes_available_name.txt`, which is the list of skeletons files that will be precessed. + +```bash +cd ./data/ntu-rgb-d +# Get skeleton of each performer +python get_raw_skes_data.py +# Remove the bad skeleton +python get_raw_denoised_data.py +# Transform the skeleton to the center of the first frame +python seq_transformation.py +``` + +File tree: + +```txt +─── ntu-rgb-d + ├── download_dataset.sh + ├── nturgb+d_skeletons + │   ├── S001C001P001R001A001.skeleton + │   ├── S001C001P001R001A002.skeleton + │   ├── S001C001P001R001A003.skeleton + │   ├── S001C001P001R001A004.skeleton + │   ├── S001C001P001R001A005.skeleton + │   ├── S001C001P001R001A006.skeleton + │   ├── S001C001P001R001A007.skeleton + │ ├── .... + │   └── S017C003P020R002A060.skeleton + ├── denoised_data + │   ├── actors_info + │   │   ├── S001C001P001R001A024.txt + │   │   ├── S001C001P001R001A025.txt + │   │   ├── S001C001P001R001A026.txt + │   │   ├── .... 
+ │   │   ├── S017C003P020R002A059.txt + │   │   └── S017C003P020R002A060.txt + │   ├── denoised_failed_1.log + │   ├── denoised_failed_2.log + │   ├── frames_cnt.txt + │   ├── missing_skes_1.log + │   ├── missing_skes_2.log + │   ├── missing_skes.log + │   ├── noise_length.log + │   ├── noise_motion.log + │   ├── noise_spread.log + │   ├── raw_denoised_colors.pkl + │   ├── raw_denoised_joints.pkl + │   └── rgb+ske + ├── raw_data + │   ├── frames_cnt.txt + │   ├── frames_drop.log + │   ├── frames_drop_skes.pkl + │   └── raw_skes_data.pkl + ├── get_raw_denoised_data.py + ├── get_raw_skes_data.py + ├── seq_transformation.py + ├── statistics + │   ├── camera.txt + │   ├── label.txt + │   ├── performer.txt + │   ├── replication.txt + │   ├── setup.txt + │   └── skes_available_name.txt + ├── xview + │ ├── train_data.npy + │ ├── train_label.pkl + │ ├── val_data.npy + │ └── val_label.pkl + └── xsub + ├── train_data.npy + ├── train_label.pkl + ├── val_data.npy + └── val_label.pkl +``` + +> Note:dictory `denoised_data`、`raw_data`and`nturgb+d_skeletons`, that are temporal files, can be deleted, if extracted `xview` and `xsub`. diff --git a/docs/zh-CN/dataset/ucf101.md b/english_documents/dataset/ucf101.md similarity index 78% rename from docs/zh-CN/dataset/ucf101.md rename to english_documents/dataset/ucf101.md index 83b422d80..478a306c4 100644 --- a/docs/zh-CN/dataset/ucf101.md +++ b/english_documents/dataset/ucf101.md @@ -3,34 +3,29 @@ UCF101数据的相关准备。主要包括UCF101的video文件下载,video文 --- ## 1. 数据下载 -UCF101数据的详细信息可以参考网站[UCF101](https://www.crcv.ucf.edu/data/UCF101.php)。 为了方便使用,PaddleVideo提供了UCF101数据的annotations文件和videos文件的下载脚本。 +UCF101数据的详细信息可以参考网站[UCF101](https://www.crcv.ucf.edu/data/UCF101.php)。 为了方便用户使用,我们提供了UCF101数据的annotations文件和videos文件的下载脚本。 ### 下载annotations文件 -首先,请确保在[data/ucf101/ 目录](../../../data/ucf101)下,输入如下UCF101数据集的标注文件的命令。 +首先,请确保在`./data/dataset/ucf101/`目录下,输入如下UCF101数据集的标注文件的命令。 ```shell bash download_annotations.sh ``` ### 下载UCF101的视频文件 -同样需要确保在[data/ucf101/ 目录](../../../data/ucf101)下,输入下述命令下载视频文件 - +同样需要确保在`./data/dataset/ucf101/`目录下,输入下述命令下载视频文件 ```shell bash download_videos.sh ``` -- 运行该命令需要安装unrar解压工具,可使用pip方式安装。 - -- 下载完成后视频文件会存储在[data/ucf101/videos/ 文件夹](../../../data/ucf101/videos)下,视频文件大小为6.8G。 +下载完成后视频文件会存储在`./data/dataset/ucf101/videos/`文件夹下,视频文件大小为6.8G。 --- ## 2. 提取视频文件的frames 为了加速网络的训练过程,我们首先对视频文件(ucf101视频文件为avi格式)提取帧 (frames)。相对于直接通过视频文件进行网络训练的方式,frames的方式能够加快网络训练的速度。 直接输入如下命令,即可提取ucf101视频文件的frames - ``` python python extract_rawframes.py ./videos/ ./rawframes/ --level 2 --ext avi ``` - 视频文件frames提取完成后,会存储在`./rawframes`文件夹下,大小为56G。 --- @@ -53,7 +48,7 @@ python build_ucf101_file_list.py rawframes/ --level 2 --format rawframes --out_l `--format`: 表示是针对视频还是frames生成路径list -`--out_list_path `: 表示生成的路径list文件存储位置 +`--out_list_path `: 表示生的路径list文件存储位置 # 以上步骤完成后,文件组织形式如下所示 @@ -68,11 +63,9 @@ python build_ucf101_file_list.py rawframes/ --level 2 --format rawframes --out_l | │ │ ├── videos | │ │ │ ├── ApplyEyeMakeup | │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01.avi -| │ │ │ │ └── ... +| | │ │ │ ├── YoYo | │ │ │ │ ├── v_YoYo_g25_c05.avi -| │ │ │ │ └── ... -| │ │ │ └── ... 
| │ │ ├── rawframes | │ │ │ ├── ApplyEyeMakeup | │ │ │ │ ├── v_ApplyEyeMakeup_g01_c01 diff --git a/english_documents/dataset/ucf24.md b/english_documents/dataset/ucf24.md new file mode 100644 index 000000000..c7ecc67e0 --- /dev/null +++ b/english_documents/dataset/ucf24.md @@ -0,0 +1,73 @@ +English | [简体中文](../../zh-CN/dataset/ucf24.md) + +# UCF24 Data Preparation +This document mainly introduces the preparation process of UCF24 dataset. It mainly includes the download of the RGB frame files, the annotation files and the pathlist of the generated file. + +--- +## 1. Data Download +Detailed information on UCF24 data can be found on the website [UCF24](http://www.thumos.info/download.html). For ease of use, PaddleVideo provides a download script for the RGB frame, annotation file of the UCF24 data. + +First, please ensure access to the [data/ucf24/ directory](../../../data/ucf24) and enter the following command for downloading the RGB frame, annotation file of the UCF24 dataset. + +```shell +bash download_frames_annotations.sh +``` + +- To run this command you need to install the unrar decompression tool, which can be installed using the pip method. + +- The RGB frame files will be stored in the [data/ucf24/rgb-images/ directory](../../../data/ucf24/rgb-images) + +- The annotation files will be stored in the [data/ucf24/lables/ directory](../../../data/ucf24/labels) + +--- +## 2. File Pathlist Generation +To specify the format for dividing the file, enter the following command + +```python +python build_split.py --raw_path ./splitfiles +``` + +**Description of parameters** + +`--raw_path`: indicates the storage path of the original division file + + +# Folder Structure +After the whole data pipeline for UCF24 preparation, the folder structure will look like: + +``` +├── data +│ ├── ucf24 +│ | ├── groundtruths_ucf +│ | ├── labels +│ | | ├── Basketball +│ | | | ├── v_Basketball_g01_c01 +│ | | | | ├── 00009.txt +│ | | | | ├── 00010.txt +│ | | | | ├── ... +│ | | | | ├── 00050.txt +│ | | | | ├── 00051.txt +│ | | ├── ... +│ | | ├── WalkingWithDog +│ | | | ├── v_WalkingWithDog_g01_c01 +│ | | | ├── ... +│ | | | ├── v_WalkingWithDog_g25_c04 +│ | ├── rgb-images +│ | | ├── Basketball +│ | | | ├── v_Basketball_g01_c01 +│ | | | | ├── 00001.jpg +│ | | | | ├── 00002.jpg +│ | | | | ├── ... +│ | | | | ├── 00140.jpg +│ | | | | ├── 00141.jpg +│ | | ├── ... +│ | | ├── WalkingWithDog +│ | | | ├── v_WalkingWithDog_g01_c01 +│ | | | ├── ... +│ | | | ├── v_WalkingWithDog_g25_c04 +│ | ├── splitfiles +│ | | ├── trainlist01.txt +│ | | |── testlist01.txt +│ | ├── trainlist.txt +│ | |── testlist.txt +``` diff --git a/english_documents/dataset/youtube8m.md b/english_documents/dataset/youtube8m.md new file mode 100644 index 000000000..e1c09a11d --- /dev/null +++ b/english_documents/dataset/youtube8m.md @@ -0,0 +1,56 @@ +English | [简体中文](../../zh-CN/dataset/youtube8m.md) + +# YouTube-8M Data Preparation + +- [Introduction](#Introduction) +- [Download](#Download) +- [Conversion](#Conversion) + + +## Introduction + +YouTube-8M is a large-scale video classification data set, containing more than 8 million video URLs. The tag system covers more than 3800 knowledge graph entities. One video corresponds to multiple tags (3-4 on average) and is labeled by machine. 
+ +**The length of each video is between 120s and 500s +Due to the large amount of video data, the image classification model was used to extract frame-level features in advance, and PCA was used to reduce the dimensionality of the features to obtain multi-frame 1024-dimensional features. Similarly, the audio model was used to obtain multi-frame 128-dimensional features. Audio characteristics. ** +> The dataset used here is the updated YouTube-8M data set in 2018 (May 2018 version (current): 6.1M videos, 3862 classes, 3.0 labels/video, 2.6B audio-visual features). + + +## Download +1. Create a new directory for storing features (take the PaddleVideo directory as an example) + ```bash + cd data/yt8m + mkdir frame + cd frame + ``` +2. Download the training and validation set to the frame folder + ```bash + curl data.yt8m.org/download.py | partition=2/frame/train mirror=asia python + curl data.yt8m.org/download.py | partition=2/frame/validate mirror=asia python + ``` + The download process is shown in the figure + ![image](https://user-images.githubusercontent.com/23737287/140709613-1e2d6ec0-a82e-474d-b220-7803065b0153.png) + + After the data download is complete, you will get 3844 training data files and 3844 verification data files (TFRecord format) + +## Conversion +1. Install tensorflow to read tfrecord data + ```bash + python3.7 -m pip install tensorflow-gpu==1.14.0 + ``` +2. Convert the downloaded TFRecord file into a pickle file for PaddlePaddle to use + ```bash + cd .. # From the frame directory back to the yt8m directory + python3.7 tf2pkl.py ./frame ./pkl_frame/ # Convert train*.tfrecord and validate*.tfrecord in the frame folder to pkl format + ``` +3. Generate a single pkl file path set, and split pkl into multiple small pkl files based on this file, and generate the final split pkl file path required + ```bash + ls pkl_frame/train*.pkl> train.list # Write the path of train*.pkl to train.list + ls pkl_frame/validate*.pkl> val.list # Write the path of validate*.pkl into val.list + + python3.7 split_yt8m.py train.list # Split each train*.pkl into multiple train*_split*.pkl + python3.7 split_yt8m.py val.list # Split each validate*.pkl into multiple validate*_split*.pkl + + ls pkl_frame/train*_split*.pkl> train.list # Rewrite the path of train*_split*.pkl into train.list + ls pkl_frame/validate*_split*.pkl> val.list # Rewrite the path of validate*_split*.pkl into val.list + ``` diff --git a/english_documents/install.md b/english_documents/install.md new file mode 100644 index 000000000..4dc50b792 --- /dev/null +++ b/english_documents/install.md @@ -0,0 +1,74 @@ +[简体中文](../zh-CN/install.md) | English + +# Installation + +--- + +- [Introduction](#Introduction) +- [Install PaddlePaddle](#Install-PaddlePaddle) +- [Install PaddleVideo](#Install-PaddleVideo) + +## Introduction + +This document introduces how to install PaddlePaddle、PaddleVideo and its requirements. + +## Install PaddlePaddle + +Python 3.7, CUDA 10.1, CUDNN7.6.4 nccl2.1.2 and later version are required at first, For now, PaddleVideo only support training on the GPU device. Please follow the instructions in the [Installation](http://www.paddlepaddle.org.cn/install/quick) if the PaddlePaddle on the device is lower than v2.0 + +**Install PaddlePaddle** + +```bash +pip3 install paddlepaddle-gpu --upgrade +``` + +or compile from source code, please refer to [Installation](http://www.paddlepaddle.org.cn/install/quick). 
+ +Verify Installation + +```python +import paddle +paddle.utils.run_check() +``` + +Check PaddlePaddle version: + +```bash +python3 -c "import paddle; print(paddle.__version__)" +``` + +Note: +- Make sure the compiled version is later than PaddlePaddle2.0. +- Indicate **WITH_DISTRIBUTE=ON** when compiling, Please refer to [Instruction](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3) for more details. +- When running in the docker, in order to ensure that the container has enough shared memory for data read acceleration of Paddle, please set the parameter `--shm_size=32g` at creating a docker container, if conditions permit, you can set it to a larger value. + +--- + +## Install PaddleVideo + +**Clone PaddleVideo:** + +```bash +cd path_to_clone_PaddleVideo +git clone https://github.com/PaddlePaddle/PaddleVideo.git +``` + +**Install requirements** + +```bash +python3.7 -m pip install --upgrade pip +pip3.7 install --upgrade -r requirements.txt +``` + +**Install python package** + +```bash +pip3.7 install ppvideo==2.3.0 +``` + +use scripts: +```bash +ppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi' +``` + + diff --git a/english_documents/model_zoo/README.md b/english_documents/model_zoo/README.md new file mode 100644 index 000000000..1c3e4454b --- /dev/null +++ b/english_documents/model_zoo/README.md @@ -0,0 +1,106 @@ +[简体中文](../../zh-CN/model_zoo/README.md) | English + +# Academic algorithms + +## 1. Introduction + +We implemented action recgonition model and action localization model in this repo. + + +## 2. Model list + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Category | Models |
| :---- | :---- |
| Action recognition | PP-TSM (PP series), PP-TSN (PP series), PP-TimeSformer (PP series), TSN (2D'), TSM (2D'), SlowFast (3D'), TimeSformer (Transformer'), VideoSwin (Transformer'), TokenShift (3D'), AttentionLSTM (RNN'), MoViNet (Lite') |
| Skeleton-based action recognition | ST-GCN (Custom'), AGCN (Adaptive'), 2s-AGCN (Adaptive'), CTR-GCN (GCN') |
| Temporal action detection | BMN (One-stage') |
| Temporal action segmentation | MS-TCN, ASRF |
| Spatio-temporal action detection | SlowFast + Fast R-CNN |
| Multimodal | ActBERT (Learning'), T2VLAD (Retrieval') |
| Video object segmentation | CFBI (Semi'), MA-Net (Supervised') |
| Monocular depth estimation | ADDS (Unsupervised') |
    diff --git a/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md b/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md new file mode 100644 index 000000000..b100f4428 --- /dev/null +++ b/english_documents/model_zoo/detection/SlowFast_FasterRCNN_en.md @@ -0,0 +1,129 @@ +[简体中文](../../../zh-CN/model_zoo/detection/SlowFast_FasterRCNN.md) | English + +# SlowFast_FasterRCNN + +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) + +Before getting started, you need to install additional dependencies as follows: +```bash +python -m pip install moviepy +python -m pip install et_xmlfile +python -m pip install paddledet +``` + +## Introduction + +The [SlowFast](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/model_zoo/recognition/slowfast.md) model is one of the high-precision models in the video field. For action detection task, it is also neccessary to detect the person in current frame. Therefore, the SlowFast_FasterRCNN model takes human detection results and video frames as input, extracts spatiotemporal features through the SlowFast model, and then uses FasterRCNN's head gets the actions and positions of humans in the frame. + +The corresponding AI Studio Notebook Link:[基于SlowFast+FasterRCNN的动作识别](https://aistudio.baidu.com/aistudio/projectdetail/3267637?contributionType=1) + +For details, please refer to the paper [SlowFast Networks for Video Recognition](https://arxiv.org/pdf/1812.03982.pdf). + +## Data + +We use [AVA dataset](https://research.google.com/ava/download.html) for action detection. The AVA v2.2 dataset contains 430 videos split into 235 for training, 64 for validation, and 131 for test. Each video has 15 minutes annotated in 1 second intervals. + +### 1 Dowload Videos +``` +bash download_videos.sh +``` + +### 2 Download Annotations +``` +bash download_annotations.sh +``` + +### 3 Download Proposals + +``` +bash fetch_ava_proposals.sh +``` + +### 4 Cut Videos + +``` +bash cut_videos.sh +``` + +### 5 Extract Frames + +``` +bash extract_rgb_frames.sh +``` + +For AVA v2.1, there is a simple introduction to some key files: +* 'ava_videos_15min_frames' dir stores video frames extracted with FPS as the frame rate; +* 'ava_train_v2.1.csv' file stores the trainning annotations; +* 'ava_train_excluded_timestamps_v2.1.csv' file stores excluded timestamps; +* 'ava_dense_proposals_train.FAIR.recall_93.9.pkl' file stores humans' bboxes and scores of key frames; +* 'ava_action_list_v2.1_for_activitynet_2018.pbtxt' file stores为 action list. + +## Train + +* `-c`: config file path; +* `-w`: weights of model. The pretrained model can be downloaded from the table below; +* `--validate`: evaluate model during training. 
+ +``` +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=logdir.ava main.py --validate -w paddle.init_param.pdparams -c configs/detection/ava/ava.yaml +``` + +## Test + +Test model based on the best model: +``` +python main.py --test \ + -w output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams \ + -c configs/detection/ava/ava.yaml +``` + + +| architecture | depth | Pretrain Model | frame length x sample rate | MAP | AVA version | model | +| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |------------- | +| SlowFast | R50 | [Kinetics 400](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | 8 x 8 | 23.2 | 2.1 | [`link`](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/SlowFastRCNN_AVA.pdparams) | + + +## Inference + +The action detection of this project is divided into two stages. In the first stage, humans' proposals are obtained, and then input into the SlowFast+FasterRCNN model for action recognition. + +For human detection,you can use the trained model in [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection). + +Install PaddleDetection: +``` +cd PaddleDetection/ +pip install -r requirements.txt +!python setup.py install +``` + +Download detection model: +``` +# faster_rcnn_r50_fpn_1x_coco as an example +wget https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams +``` + +export model: +``` +python tools/export_model.py \ + -c configs/detection/ava/ava.yaml \ + -o inference_output \ + -p output/AVA_SlowFast_FastRcnn/AVA_SlowFast_FastRcnn_best.pdparams +``` + +inference based on the exported model: +``` +python tools/predict.py \ + -c configs/detection/ava/ava.yaml \ + --input_file "data/-IELREHXDEMO.mp4" \ + --model_file "inference_output/AVA_SlowFast_FastRcnn.pdmodel" \ + --params_file "inference_output/AVA_SlowFast_FastRcnn.pdiparams" \ + --use_gpu=True \ + --use_tensorrt=False +``` diff --git a/english_documents/model_zoo/estimation/adds.md b/english_documents/model_zoo/estimation/adds.md new file mode 100644 index 000000000..c055db59d --- /dev/null +++ b/english_documents/model_zoo/estimation/adds.md @@ -0,0 +1,133 @@ +[Simplified Chinese](../../../zh-CN/model_zoo/estimation/adds.md) | English + +# ADDS-DepthNet model + +## content + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +Before getting started, you need to install additional dependencies as follows: +```bash +python -m pip install scikit-image +python -m pip install matplotlib +``` + +## Introduction + +This model is based on the ICCV 2021 paper **[Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628)** of Baidu Robotics and Autonomous Driving Laboratory, +The self-supervised monocular depth estimation model based on day and night images is reproduced, which utilizes the complementary nature of day and night image data, and slows down the large domain shift of day and night images and the accuracy of depth estimation caused by lighting changes. Impact, the most advanced depth estimation results of all-sky images have been achieved on the challenging Oxford RobotCar data set. 
+ + +## Data + +For data download and preparation of Oxford RobotCar dataset, please refer to [Oxford RobotCar dataset data preparation](../../dataset/Oxford_RobotCar.md) + + +## Train + +### Oxford RobotCar dataset training + +#### Download and add pre-trained models + +1. Download the image pre-training model [resnet18.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams) as Backbone initialization parameters, or download through the wget command + + ```bash + wget -P ./data https://videotag.bj.bcebos.com/PaddleVideo-release2.2/Resnet18_Imagenet.pdparams + ``` + +2. Open `PaddleVideo/configs/estimation/adds/adds.yaml`, and fill in the downloaded weight storage path below `pretrained:` + + ```yaml + MODEL: #MODEL field + framework: "DepthEstimator" #Mandatory, indicate the type of network, associate to the'paddlevideo/modeling/framework/'. + backbone: #Mandatory, indicate the type of backbone, associate to the'paddlevideo/modeling/backbones/'. + name: 'ADDS_DepthNet' + pretrained: fill in the path here + ``` + +#### Start training + +- The Oxford RobotCar dataset uses a single card for training, and the starting command for the training method is as follows: + + ```bash + python3.7 main.py --validate -c configs/estimation/adds/adds.yaml --seed 20 + ``` + + +## Test + +- The ADDS-DepthNet model is verified synchronously during training (only the day or night data is verified). You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows: + + ```bash + Already save the best model (rmse)8.5531 + ``` + +- Because the model can only test one day or night data set at a given path in the yaml file at a time, to get the complete test score at the beginning of this document, you need to run 4 test commands and record their indicators ( 40m during the day, 60m during the day, 40m at night, 60m at night) + +- Download URL of the trained model: [ADDS_car.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ADDS_car.pdparams) + +- The test commands are as follows: + + ```bash + # Night 40m + python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=40 + + # Night 60m + python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_night_files.txt" -o MODEL.head.max_gt_depth=60 + + # Daytime 40m + python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=40 + + # Daytime 60m + python3.7 main.py --test -c configs/estimation/adds/adds.yaml -w "output/ADDS/ADDS_best.pdparams" -o DATASET.test.file_path="data/oxford/splits/oxford_day/val_day_files.txt" -o MODEL.head.max_gt_depth=60 + ``` + + The test indicators on the validation dataset of Oxford RobotCar dataset are as follows: + + | version | Max Depth | Abs Rel | Sq Rel | RMSE | RMSE log | | | | + | ----------- | --------- | ------- | ------ | ----- | ------- | ----------------- |------------------- | ------------------- | + | ours(night) | 40 | 0.209 | 1.741 | 6.031 | 0.243 | 0.708 | 0.923 | 0.975 | + | ours(night) | 60 | 0.207 | 2.052 | 7.888 | 0.258 | 0.686 | 0.909 | 0.970 | + | ours(day) | 40 | 0.114 | 0.574 | 3.411 | 0.157 | 0.860 | 0.977 | 0.993 | + | ours(day) | 60 | 
0.119 | 0.793 | 4.842 | 0.173 | 0.838 | 0.967 | 0.991 | + +## Inference + +### Export inference model + +```bash +python3.7 tools/export_model.py -c configs/estimation/adds/adds.yaml -p data/ADDS_car.pdparams -o inference/ADDS +``` + +The above command will generate the model structure file `ADDS.pdmodel` and model weight files `ADDS.pdiparams` and `ADDS.pdiparams.info` files needed for prediction, all of which are stored in the `inference/ADDS/` directory + +For the meaning of each parameter in the above bash command, please refer to [Model Inference Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/en/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) + +### Use predictive engine inference + +```bash +python3.7 tools/predict.py --input_file data/example.png \ + --config configs/estimation/adds/adds.yaml \ + --model_file inference/ADDS/ADDS.pdmodel \ + --params_file inference/ADDS/ADDS.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +At the end of the inference, the depth map estimated by the model will be saved in pseudo-color by default. + +The following is a sample picture and the corresponding predicted depth map: + +image + +depth + + +## Reference + +- [Self-supervised Monocular Depth Estimation for All Day Images using Domain Separation](https://arxiv.org/abs/2108.07628), Liu, Lina and Song, Xibin and Wang, Mengmeng and Liu, Yong and Zhang, Liangjun diff --git a/english_documents/model_zoo/localization/bmn.md b/english_documents/model_zoo/localization/bmn.md new file mode 100644 index 000000000..eb64c6706 --- /dev/null +++ b/english_documents/model_zoo/localization/bmn.md @@ -0,0 +1,104 @@ +[简体中文 ](../../../zh-CN/model_zoo/localization/bmn.md) | English + +# BMN + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +BMN model contains three modules: Base Module handles the input feature sequence, and out- puts feature sequence shared by the following two modules; Temporal Evaluation Module evaluates starting and ending probabilities of each location in video to generate boundary probability sequences; Proposal Evaluation Module con- tains the BM layer to transfer feature sequence to BM fea- ture map, and contains a series of 3D and 2D convolutional layers to generate BM confidence map. + +

    +
    +BMN Overview +

    + + +## Data + +We use ActivityNet dataset to train this model,data preparation please refer to [ActivityNet dataset](../../dataset/ActivityNet.md). + + +## Train + +You can start training by such command: + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +python -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_bmn main.py --validate -c configs/localization/bmn.yaml +``` + + +## Test + +You can start testing by such command: + +```bash +python main.py --test -c configs/localization/bmn.yaml -w output/BMN/BMN_epoch_00009.pdparams -o DATASET.test_batch_size=1 +``` + +- For now, we only support testing with **single card** and `batch_size=1`. + +- Please download [activity\_net\_1\_3\_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json) label file and specify the path to `METRIC.ground_truth_filename` in config file. + +- Args `-w` is used to specifiy the model path,you can download our model in [BMN.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/BMN/BMN.pdparams) + + +Test accuracy in ActivityNet1.3: + +| AR@1 | AR@5 | AR@10 | AR@100 | AUC | +| :---: | :---: | :---: | :---: | :---: | +| 33.26 | 49.48 | 56.86 | 75.19 | 67.23% | + + +## Inference + +### export inference model + + To get model architecture file `BMN.pdmodel` and parameters file `BMN.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/localization/bmn.yaml \ + -p data/BMN.pdparams \ + -o inference/BMN +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). + +### infer + +```bash +python3.7 tools/predict.py --input_file data/example_feat.list \ + --config configs/localization/bmn.yaml \ + --model_file inference/BMN/BMN.pdmodel \ + --params_file inference/BMN/BMN.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +BMN Inference results of data/example_feat.npy : +{'score': 0.7968077063560486, 'segment': [0.0, 122.9877]} +{'score': 0.49097609519958496, 'segment': [12.423000000000002, 124.23]} +{'score': 0.21395835280418396, 'segment': [39.7536, 122.9877]} +{'score': 0.2106524258852005, 'segment': [0.0, 109.3224]} +{'score': 0.06876271963119507, 'segment': [23.6037, 114.2916]} +``` + +Inference results are saved in `data/bmn/BMN_INFERENCE_results`. + +## Reference + +- [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen. diff --git a/english_documents/model_zoo/localization/yowo.md b/english_documents/model_zoo/localization/yowo.md new file mode 100644 index 000000000..ab3be429f --- /dev/null +++ b/english_documents/model_zoo/localization/yowo.md @@ -0,0 +1,138 @@ +[简体中文](../../../zh-CN/model_zoo/localization/yowo.md) | English + +# YOWO + +## Content + +- [Introduction](#Introduction) +- [Data](#DATA) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +YOWO is a single-stage network with two branches. One branch extracts spatial features of key frames (i.e., the current frame) via 2D-CNN, while the other branch acquires spatio-temporal features of clips consisting of previous frames via 3D-CNN. To accurately aggregate these features, YOWO uses a channel fusion and attention mechanism that maximizes the inter-channel dependencies. Finally, the fused features are subjected to frame-level detection. + + +
    + +
    + + +## Data + +UCF101-24 data download and preparation please refer to [UCF101-24 data preparation](../../dataset/ucf24.md) + + +## Train + +### UCF101-24 data set training + +#### Download and add pre-trained models + +1. Download the pre-training model [resnext-101-kinetics](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams) 和 [darknet](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam) as Backbone initialization parameters, or download through the wget command + + ```bash + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam + wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams + ``` + +2. Open `PaddleVideo/configs/localization/yowo.yaml`, and fill in the downloaded weight storage path below `pretrained_2d:` and `pretrained_3d:` respectively + + ```yaml + MODEL: + framework: "YOWOLocalizer" + backbone: + name: "YOWO" + num_class: 24 + pretrained_2d: fill in the path of 2D pre-training model here + pretrained_3d: fill in the path of 3D pre-training model here + ``` + +#### Start training + +- The UCF101-24 data set uses 1 card for training, and the start command of the training method is as follows: + + ```bash + python3 main.py -c configs/localization/yowo.yaml --validate --seed=1 + ``` + +- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows: + + ```bash + python3 main.py --amp -c configs/localization/yowo.yaml --validate --seed=1 + ``` + +- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage. + + +## Test + +- The YOWO model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows: + + ``` + Already save the best model (fsocre)0.8779 + ``` + +- Since the verification index of the YOWO model test mode is **Frame-mAP (@ IoU 0.5)**, which is different from the **fscore** used in the verification mode during the training process, so the verification index recorded in the training log, called `fscore `, does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows: + + ```bash + python3 main.py -c configs/localization/yowo.yaml --test --seed=1 -w 'output/YOWO/YOWO_epoch_00005.pdparams' + ``` + + + When the test configuration uses the following parameters, the test indicators on the validation data set of UCF101-24 are as follows: + + + | Model | 3D-CNN backbone | 2D-CNN backbone | Dataset |Input | Frame-mAP
    (@ IoU 0.5) | checkpoints | + | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | + | YOWO | 3D-ResNext-101 | Darknet-19 | UCF101-24 | 16-frames, d=1 | 80.94 | [YOWO.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams) | + + + +## Inference + +### Export inference model + +```bash +python3 tools/export_model.py -c configs/localization/yowo.yaml -p 'output/YOWO/YOWO_epoch_00005.pdparams' +``` + +The above command will generate the model structure file `YOWO.pdmodel` and the model weight file `YOWO.pdiparams` required for prediction. + +- For the meaning of each parameter, please refer to [Model Reasoning Method](../../usage.md#2-infer) + +### Use prediction engine inference + +- Download the test video [HorseRiding.avi](https://videotag.bj.bcebos.com/Data/HorseRiding.avi) for a quick experience, or via the wget command. The downloaded video should be placed in the `data/ucf24` directory: + +```bash +wget -nc https://videotag.bj.bcebos.com/Data/HorseRiding.avi +``` + +- Run the following command for inference: + +```bash +python3 tools/predict.py -c configs/localization/yowo.yaml -i 'data/ucf24/HorseRiding.avi' --model_file ./inference/YOWO.pdmodel --params_file ./inference/YOWO.pdiparams +``` + +- When inference is over, the prediction results in image form will be saved in the `inference/YOWO_infer` directory. The image sequence can be converted to a gif by running the following command to complete the final visualisation. + +``` +python3 data/ucf24/visualization.py --frames_dir ./inference/YOWO_infer/HorseRiding --duration 0.04 +``` + +The resulting visualization is as follows: + +
    + Horse Riding +
    + +It can be seen that using the YOWO model trained on UCF101-24 to predict `data/ucf24/HorseRiding.avi`, the category of each frame output is HorseRiding with a confidence level of about 0.80. + +## Reference + +- [You Only Watch Once: A Unified CNN Architecture for Real-Time Spatiotemporal Action Localization](https://arxiv.org/pdf/1911.06644.pdf), Köpüklü O, Wei X, Rigoll G. \ No newline at end of file diff --git a/english_documents/model_zoo/multimodal/actbert.md b/english_documents/model_zoo/multimodal/actbert.md new file mode 100644 index 000000000..f884a5e8f --- /dev/null +++ b/english_documents/model_zoo/multimodal/actbert.md @@ -0,0 +1,98 @@ +[简体中文](../../../zh-CN/model_zoo/multimodal/actbert.md) | English + +# ActBERT + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Reference](#Reference) + +Before getting started, you need to install additional dependencies as follows: +```bash +python -m pip install paddlenlp +python -m pip install lmdb +``` + +## Introduction + +Actbert is proposed by Baidu in CVPR2020 for multimodal pretrain task. It leverage global action information to cat- alyze mutual interactions between linguistic texts and local regional objects. This method introduce a TaNgled Transformer block (TNT) to encode three sources of information, i.e., global actions, local regional objects, and linguistic descriptions. ActBERT significantly outperforms the state- of-the-art in five downstream video-and-language tasks, i.e., text-video clip retrieval, video captioning, video question answering, action segmentation, and action step localization. + +
    +
    +
    + + +## Data + +Please refer to Kinetics400 data download and preparation doc [HowTo100M-data](../../dataset/howto100m.md) + +Please refer to MSR-VTT data download and preparation doc [MSR-VTT-data](../../dataset/umsrvtt.md) + + +## Train + +### Train on HowTo100M + +#### download pretrain-model + +Please download [bert-base-uncased](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams) as pretraind model: + +```bash +wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/bert-base-uncased.pdparams +``` + +and add path to `MODEL.framework.backbone.pretrained` in config file as: + +```yaml +MODEL: + framework: "ActBert" + backbone: + name: "BertForMultiModalPreTraining" + pretrained: your weight path +``` + +- We provide training option on small data, config file is for reference only. + +#### Start training + +- Train ActBERT on HowTo100M scripts: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --validate -c configs/multimodal/actbert/actbert.yaml +``` + +- AMP is useful for speeding up training: + +```bash +export FLAGS_conv_workspace_size_limit=800 #MB +export FLAGS_cudnn_exhaustive_search=1 +export FLAGS_cudnn_batchnorm_spatial_persistent=1 + +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_actbert main.py --amp --validate -c configs/multimodal/actbert/actbert.yaml +``` + + +## Test + +- Evaluation performs on downstream task, i.e. text-video clip retrieval on MSR-VTT dataset, test accuracy can be obtained using scripts: + +```bash +python3.7 main.py --test -c configs/multimodal/actbert/actbert_msrvtt.yaml -w Actbert.pdparams +``` + + +Metrics on MSR-VTT: + +| R@1 | R@5 | R@10 | Median R | Mean R | checkpoints | +| :------: | :----------: | :----: | :----: | :----: | :----: | +| 8.6 | 31.2 | 45.5 | 13.0 | 28.5 | [ActBERT.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ActBERT.pdparams) | + + +## Reference + +- [ActBERT: Learning Global-Local Video-Text Representations +](https://arxiv.org/abs/2011.07231), Linchao Zhu, Yi Yang diff --git a/english_documents/model_zoo/partition/transnetv2.md b/english_documents/model_zoo/partition/transnetv2.md new file mode 100644 index 000000000..e98e2c570 --- /dev/null +++ b/english_documents/model_zoo/partition/transnetv2.md @@ -0,0 +1,80 @@ +[简体中文](../../../zh-CN/model_zoo/partition/transnetv2.md) | English + +# TransNetV2 + +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Details](#Details) +- [Reference](#Reference) + +Before getting started, you need to install additional dependencies as follows: +```bash +python -m pip install ffmpeg-python==0.2.0 +``` + +## Introduction + +TransNetV2 is a video segmentation model based on deep learning. It performs feature learning through the DDCNN V2 structure, and adds RGB color histograms and video frame similarity for more effective feature extraction, and finally obtains whether each frame is a shot boundary frame Probability, thereby completing the video segmentation. The algorithm has good effect and efficient calculation, which is very suitable for industrial landing. + +![](../../../images/transnetv2.png) + +This code currently only supports model inference, and model training and testing will be provided in the future. + +Please refer to the paper for details. 
[TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838) + +## Data + +coming soon + + +## Train + +coming soon + + +## Test + +coming soon + + +## Inference + + +Load the TransNetV2 weights trained on ClipShots and TRECVID IACC.3 dataset [TransNetV2_shots.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams), or download through the command line + +```bash +wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TransNetV2_shots.pdparams +``` + +### export inference model + +```bash +python3.7 tools/export_model.py -c configs/partitioners/transnetv2/transnetv2.yaml -p data/TransNetV2_shots.pdparams -o inference/TransNetV2 +``` + +The above command will generate the model structure file`TransNetV2.pdmodel`and the model weight file`TransNetV2.pdiparams`required for prediction. + +For the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-Model Reasoning) + +### infer + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/partitioners/transnetv2/transnetv2.yaml \ + --model_file inference/TransNetV2/TransNetV2.pdmodel \ + --params_file inference/TransNetV2/TransNetV2.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +By defining the `output_path` parameters in `transnetv2.yaml`, the prediction probability of each frame can be output to `{output_path}/example_predictions.txt`, and the predicted lens boundary is output to `{output_path}/example_scenes.txt`. +By defining the `visualize` parameter in `transnetv2.yaml`, the predicted results can be visualized, and the visual results are saved to `{output_path}/example_vis.png`. + +## Reference + +- [TransNet V2: An effective deep network architecture for fast shot transition detection](https://arxiv.org/abs/2008.04838), Tomáš Souček, Jakub Lokoč diff --git a/english_documents/model_zoo/recognition/agcn.md b/english_documents/model_zoo/recognition/agcn.md new file mode 100644 index 000000000..85677c729 --- /dev/null +++ b/english_documents/model_zoo/recognition/agcn.md @@ -0,0 +1,129 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/agcn.md) | English + +# AGCN + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +We implemented Adaptive Graph Convolution Network to improve the accuracy of [ST-GCN](./stgcn.md). + +## Data + +Please refer to FSD-10 data download and preparation doc [FSD](../../dataset/fsd.md) + +Please refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md) + +## Train + +### Train on FSD + +- Train AGCN on FSD scripts: + +```bash +python3.7 main.py -c configs/recognition/agcn/agcn_fsd.yaml +``` + +- Turn off `valid` when training, as validation dataset is not available for the competition. + +### Train on NTU-RGBD + +- Train AGCN on NTU-RGBD scripts: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_agcn main.py --validate -c configs/recognition/agcn/agcn_ntucs.yaml +``` + +- config file `agcn_ntucs.yaml` corresponding to the config of AGCN on NTU-RGB+D dataset with cross-subject splits. 
+ + +## Test + +### Test on FSD + +- Test scripts: + +```bash +python3.7 main.py --test -c configs/recognition/agcn/agcn_fsd.yaml -w output/AGCN/AGCN_epoch_00100.pdparams +``` + +- Specify the config file with `-c`, specify the weight path with `-w`. + +- Evaluation results will be saved in the `submission.csv` file; the final score can be obtained on the [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115). + +Accuracy on FSD dataset: + +| Test_Data | Top-1 | checkpoints | +| :----: | :----: | :---- | +| Test_A | 62.29 | [AGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_fsd.pdparams)| + + +### Test on NTU-RGB+D + +- Test scripts: + +```bash +python3.7 main.py --test -c configs/recognition/agcn/agcn_ntucs.yaml -w output/AGCN/AGCN_best.pdparams +``` + +- Specify the config file with `-c`, specify the weight path with `-w`. + +Accuracy on NTU-RGB+D dataset: + +| split | Top-1 | checkpoints | +| :----: | :----: | :---- | +| cross-subject | 83.27 | [AGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AGCN_ntucs.pdparams)| + + +## Inference + +### export inference model + + To get model architecture file `AGCN.pdmodel` and parameters file `AGCN.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/recognition/agcn/agcn_fsd.yaml \ + -p data/AGCN_fsd.pdparams \ + -o inference/AGCN +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). + +### infer + +```bash +python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \ + --config configs/recognition/agcn/agcn_fsd.yaml \ + --model_file inference/AGCN/AGCN.pdmodel \ + --params_file inference/AGCN/AGCN.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +Current video file: data/fsd10/example_skeleton.npy + top-1 class: 27 + top-1 score: 0.8965644240379333 +``` + + +## Reference + +- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin + +- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1805.07694), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu + +- [Skeleton-Based Action Recognition with Multi-Stream Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1912.06971), Lei Shi, Yifan Zhang, Jian Cheng, Hanqing Lu + +- Many thanks to [li7819559](https://github.com/li7819559) and [ZhaoJingjing713](https://github.com/ZhaoJingjing713) for contributing the code. diff --git a/english_documents/model_zoo/recognition/agcn2s.md b/english_documents/model_zoo/recognition/agcn2s.md new file mode 100644 index 000000000..d289507f7 --- /dev/null +++ b/english_documents/model_zoo/recognition/agcn2s.md @@ -0,0 +1,113 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/2sAGCN.md) | English + +# 2s-AGCN + +--- + +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +## Introduction + +![Model architecture](../../../images/agcn2s.png) + +[2s-AGCN](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf) is an improvement over ST-GCN published in CVPR 2019. 
It proposes a two-stream adaptive graph convolutional network that addresses the shortcomings of the original ST-GCN. In existing GCN-based approaches, the topology of the graph is set manually and is fixed across all layers and input samples. In addition, the second-order information of the skeleton data (bone length and orientation) is naturally more informative and discriminative for action recognition, but it was rarely studied in the methods of that time. The paper therefore proposes a two-stream skeleton network that fuses joint and bone information and adds an adaptive matrix to the adjacency matrix of the graph convolution, which sharply improves skeleton-based action recognition accuracy and laid the foundation for subsequent work (later skeleton-based action recognition methods largely build on this two-stream framework). + +## Data + +Data download and processing are consistent with CTR-GCN. For details, please refer to [NTU-RGBD Data Preparation](../../dataset/ntu-rgbd.md) + +## Train + +### Train on NTU-RGBD + +Train 2s-AGCN on NTU-RGBD using a single GPU: + +```bash +# train cross subject with bone data +python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_bone.yaml --seed 1 +# train cross subject with joint data +python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucs_joint.yaml --seed 1 +# train cross view with bone data +python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_bone.yaml --seed 1 +# train cross view with joint data +python main.py --validate -c configs/recognition/agcn2s/agcn2s_ntucv_joint.yaml --seed 1 +``` + +The config file `agcn2s_ntucs_joint.yaml` corresponds to 2s-AGCN on the NTU-RGB+D dataset with the cross-subject split. + +## Test + +### Test on NTU-RGB+D + +Test scripts: + +```bash +# test cross subject with bone data +python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_bone.yaml -w data/2SAGCN_ntucs_bone.pdparams +# test cross subject with joint data +python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucs_joint.yaml -w data/2SAGCN_ntucs_joint.pdparams +# test cross view with bone data +python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_bone.yaml -w data/2SAGCN_ntucv_bone.pdparams +# test cross view with joint data +python main.py --test -c configs/recognition/2sagcn/2sagcn_ntucv_joint.yaml -w data/2SAGCN_ntucv_joint.pdparams +``` + +* Specify the config file with `-c`, specify the weight path with `-w`. 
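+
+The two-stream design described in the Introduction is typically evaluated by fusing the per-class scores of the joint stream and the bone stream. The snippet below is only an illustrative sketch of that late fusion; the file names (`joint_scores.npy`, `bone_scores.npy`, `labels.npy`) are hypothetical placeholders for scores you have saved yourself, not files produced by this repository.
+
+```python
+# Illustrative late fusion of the two streams: average the per-class scores
+# produced by the joint model and the bone model, then take the arg-max.
+import numpy as np
+
+joint_scores = np.load("joint_scores.npy")  # hypothetical file, shape (num_samples, num_classes)
+bone_scores = np.load("bone_scores.npy")    # hypothetical file, same shape
+labels = np.load("labels.npy")              # hypothetical file, shape (num_samples,)
+
+fused = joint_scores + bone_scores          # equal-weight two-stream fusion
+top1 = fused.argmax(axis=1)
+print("fused top-1 accuracy:", (top1 == labels).mean())
+```
+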
+ +Accuracy on NTU-RGB+D dataset: + +| | CS | CV | +| :------------: | :---: | :----: | +| Js-AGCN(joint) | 85.8% | 94.13% | +| Bs-AGCN(bone) | 86.7% | 93.9% | + +Train log:[download](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/work_dir/ntu) + +VisualDL log:[download](https://github.com/ELKYang/2s-AGCN-paddle/tree/main/runs) + +checkpoints: + +| CS-Js | CS-Bs | CV-Js | CV-Bs | +| :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | +| [ntu_cs_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_joint-48-30674.pdparams) | [ntu_cs_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cs_agcn_bone-44-28170.pdparams) | [ntu_cv_agcn_joint](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_joint-38-22932.pdparams) | [ntu_cv_agcn_bone](https://github.com/ELKYang/2s-AGCN-paddle/blob/main/weights/ntu_cv_agcn_bone-49-29400.pdparams) | + +## Inference + +### export inference model + +```bash +python3.7 tools/export_model.py -c configs/recognition/agcn2s/2sagcn_ntucs_joint.yaml \ + -p data/AGCN2s_ntucs_joint.pdparams \ + -o inference/AGCN2s_ntucs_joint +``` + +To get model architecture file `AGCN2s_ntucs_joint.pdmodel` and parameters file `AGCN2s_ntucs_joint.pdiparams`. + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). + +### infer + +```bash +python3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \ + --config configs/recognition/agcn2s/2sagcn_ntucs_joint.yaml \ + --model_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdmodel \ + --params_file inference/AGCN2s_ntucs_joint/AGCN2s_ntucs_joint.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +### infer result +![预测引擎推理结果图](../../../images/agcn2s_result.png) + + +## Reference + +- [Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://openaccess.thecvf.com/content_CVPR_2019/papers/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.pdf), Lei Shi and Yifan Zhang and Jian Cheng and Hanqing Lu + diff --git a/english_documents/model_zoo/recognition/attention_lstm.md b/english_documents/model_zoo/recognition/attention_lstm.md new file mode 100644 index 000000000..42fb5d90e --- /dev/null +++ b/english_documents/model_zoo/recognition/attention_lstm.md @@ -0,0 +1,84 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/attention_lstm.md) | English + +# AttentionLSTM + +## content + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +## Introduction + +Recurrent Neural Networks (RNN) are often used in the processing of sequence data, which can model the sequence information of multiple consecutive frames of video, and are commonly used methods in the field of video classification. +This model uses a two-way long and short-term memory network (LSTM) to encode all the frame features of the video in sequence. Unlike the traditional method that directly uses the output of the last moment of LSTM, this model adds an Attention layer, and the hidden state output at each moment has an adaptive weight, and then linearly weights the final feature vector. 
The reference paper implements a two-layer LSTM structure, while **this model implements a two-way LSTM with Attention**. + +The Attention layer can refer to the paper [AttentionCluster](https://arxiv.org/abs/1711.09550) + +## Data + +PaddleVide provides training and testing scripts on the Youtube-8M dataset. Youtube-8M data download and preparation please refer to [YouTube-8M data preparation](../../dataset/youtube8m.md) + +## Train + +### Youtube-8M data set training + +#### Start training + +- The Youtube-8M data set uses 8 cards for training. In the feature format, video and audio features will be used as input. The training start command of the data is as follows + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --validate -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml + ``` + +## Test + +The command is as follows: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_attetion_lstm main.py --test -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml -w "output/AttentionLSTM/AttentionLSTM_best.pdparams" +``` + +When the test configuration uses the following parameters, the test indicators on the validation data set of Youtube-8M are as follows: + +| Hit@1 | PERR | GAP | checkpoints | +| :-----: | :---------: | :---: | ----- | +| 89.05 | 80.49 | 86.30 | [AttentionLSTM_yt8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/AttentionLSTM_yt8.pdparams) | + +## Inference + +### Export inference model +```bash +python3.7 tools/export_model.py -c configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \ + -p data/AttentionLSTM_yt8.pdparams \ + -o inference/AttentionLSTM +``` + +The above command will generate the model structure file `AttentionLSTM.pdmodel` and the model weight file `AttentionLSTM.pdiparams` required for prediction. + +For the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0.0/docs/en/start.md#2-infer) + +### Use prediction engine inference + +```bash +python3.7 tools/predict.py --input_file data/example.pkl \ + --config configs/recognition/attention_lstm/attention_lstm_youtube8m.yaml \ + --model_file inference/AttentionLSTM/AttentionLSTM.pdmodel \ + --params_file inference/AttentionLSTM/AttentionLSTM.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` +An example of the output is as follows: +```bash +Current video file: data/example.pkl + top-1 class: 11 + top-1 score: 0.9841002225875854 +``` +It can be seen that using the AttentionLSTM model trained on Youtube-8M to predict data/example.pkl, the output top1 category id is 11, and the confidence is 0.98. 
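+
+To make the attention pooling described in the Introduction more concrete, here is a minimal NumPy sketch: every time step's hidden state receives an adaptive weight, and the weighted sum becomes the clip-level feature. The shapes and random weights are purely illustrative and are not taken from this repository.
+
+```python
+# Attention pooling over per-frame hidden states of a (bidirectional) LSTM.
+import numpy as np
+
+def softmax(x):
+    e = np.exp(x - x.max())
+    return e / e.sum()
+
+T, D = 300, 1024                      # number of frames, hidden size (illustrative)
+hidden = np.random.randn(T, D)        # per-frame outputs of the LSTM
+w = np.random.randn(D)                # learnable scoring vector (random here)
+alpha = softmax(hidden @ w)           # one adaptive weight per time step
+clip_feature = (alpha[:, None] * hidden).sum(axis=0)  # weighted sum of hidden states
+print(clip_feature.shape)             # (1024,)
+```
+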
+## Reference + +- [Attention Clusters: Purely Attention Based Local Feature Integration for Video Classification](https://arxiv.org/abs/1711.09550), Xiang Long, Chuang Gan, Gerard de Melo, Jiajun Wu, Xiao Liu, Shilei Wen +- [YouTube-8M: A Large-Scale Video Classification Benchmark](https://arxiv.org/abs/1609.08675), Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, Sudheendra Vijayanarasimhan diff --git a/english_documents/model_zoo/recognition/ctrgcn.md b/english_documents/model_zoo/recognition/ctrgcn.md new file mode 100644 index 000000000..bdec0aacc --- /dev/null +++ b/english_documents/model_zoo/recognition/ctrgcn.md @@ -0,0 +1,128 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/ctrgcn.md) | English + +# CTR-GCN + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +[CTRGCN](https://github.com/Uason-Chen/CTR-GCN.git) is a skeleton-based action recognition model proposed at ICCV 2021. It refines the graph topology used when applying graph convolution to human skeleton data and extracts spatio-temporal features with spatio-temporal graph convolution, which greatly improves the accuracy of skeleton-based action recognition. + +
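+
+As a rough intuition for the channel-wise topology refinement named in the Reference below: each channel (or channel group) convolves over its own refined adjacency matrix rather than one shared graph. The NumPy sketch below is only a simplified illustration under that assumption, not the exact computation used in this repository.
+
+```python
+# Simplified channel-wise topology refinement: a shared skeleton adjacency plus
+# a small, channel-specific correction, applied to each feature channel.
+import numpy as np
+
+V, C = 25, 8                                   # joints, channels (toy sizes)
+x = np.random.randn(V, C)                      # per-joint features
+shared_A = np.random.rand(V, V)                # shared skeleton adjacency
+refine = np.random.randn(C, V, V) * 0.1        # channel-specific refinements
+
+out = np.stack([(shared_A + refine[c]) @ x[:, c] for c in range(C)], axis=1)
+print(out.shape)                               # (25, 8)
+```
+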
    +
    +
    + + +## Data + +Please refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md) + + +## Train + + +### Train on NTU-RGBD + +- Train CTR-GCN on NTU-RGBD scripts using single gpu: + +```bash +# joint modality +python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml --seed 1 + +# bone modality +python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml --seed 1 + +# motion modality +python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml --seed 1 + +# bone motion modality +python main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml --seed 1 +``` + +- Train CTR-GCN on NTU-RGBD scriptsusing multi gpus: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_ctrgcn main.py --validate -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml +``` + +- config file `ctrgcn_ntucs_joint.yaml` corresponding to the config of CTR-GCN on NTU-RGB+D dataset with cross-subject splits. + + +## Test + +### Test on NTU-RGB+D + +- Test scripts: + +```bash +# joint modality +python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml -w data/CTRGCN_ntucs_joint.pdparams + +# bone modality +python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone.yaml -w data/CTRGCN_ntucs_bone.pdparams + +# motion modality +python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_motion.yaml -w data/CTRGCN_ntucs_motion.pdparams + +# bone motion modality +python3.7 main.py --test -c configs/recognition/ctrgcn/ctrgcn_ntucs_bone_motion.yaml -w data/CTRGCN_ntucs_bone_motion.pdparams +``` + +- Specify the config file with `-c`, specify the weight path with `-w`. + + +Accuracy on NTU-RGB+D dataset: + +| split | modality | Top-1 | checkpoints | +| :----: | :----: | :----: | :----: | +| cross-subject | joint | 89.93 | [CTRGCN_ntucs_joint.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_joint.pdparams) | +| cross-subject | bone | 85.24 | [CTRGCN_ntucs_bone.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone.pdparams) | +| cross-subject | motion | 85.33 | [CTRGCN_ntucs_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_motion.pdparams) | +| cross-subject | bone motion | 84.53 | [CTRGCN_ntucs_bone_motion.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/CTRGCN_ntucs_bone_motion.pdparams) | + + +## Inference + +### export inference model + +```bash +python3.7 tools/export_model.py -c configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \ + -p data/CTRGCN_ntucs_joint.pdparams \ + -o inference/CTRGCN +``` + + To get model architecture file `CTRGCN.pdmodel` and parameters file `CTRGCN.pdiparams`, use: + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). 
+ +### infer + +```bash +python3.7 tools/predict.py --input_file data/example_NTU-RGB-D_sketeton.npy \ + --config configs/recognition/ctrgcn/ctrgcn_ntucs_joint.yaml \ + --model_file inference/CTRGCN_joint/CTRGCN_joint.pdmodel \ + --params_file inference/CTRGCN_joint/CTRGCN_joint.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +Current video file: data/example_NTU-RGB-D_sketeton.npy + top-1 class: 4 + top-1 score: 0.999988317489624 +``` + +## Reference + +- [Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213), Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming diff --git a/english_documents/model_zoo/recognition/movinet.md b/english_documents/model_zoo/recognition/movinet.md new file mode 100644 index 000000000..317501938 --- /dev/null +++ b/english_documents/model_zoo/recognition/movinet.md @@ -0,0 +1,91 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/movinet.md) | English + +# MoViNet + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +## Introduction + +Movinet is a mobile video network developed by Google research. It uses causal convolution operator with stream buffer and temporal ensembles to improve accuracy. It is a lightweight and efficient video model that can be used for online reasoning video stream. + + +## Data + +Please refer to Kinetics400 data download and preparation doc [k400-data](../../dataset/K400.md) + + +## Train + +- Train MoViNet on kinetics-400 scripts: + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_movinet main.py --validate -c configs/recognition/movinet/movinet_k400_frame.yaml +``` + +## Test + +- For uniform sampling, test accuracy can be found in training-logs by search key word `best`, such as: + +```txt +Already save the best model (top1 acc)0.6489 +``` + +- Test scripts: + +```bash +python3.7 main.py --test -c configs/recognition/movinet/movinet_k400_frame.yaml -w output/MoViNet/MoViNet_best.pdparams +``` + + +Accuracy on Kinetics400: + +| Config | Sampling method | num_seg | target_size | Top-1 | checkpoints | +| :------: | :--------: | :-------: | :-------: | :-----: | :-----: | +| A0 | Uniform | 50 | 172 | 66.62 | [MoViNetA0_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/MoViNetA0_k400.pdparams) | + +## Inference + +### export inference model + + To get model architecture file `MoViNetA0.pdmodel` and parameters file `MoViNetA0.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/recognition/movinet/movinet_k400_frame.yaml \ + -p data/MoViNetA0_k400.pdparams \ + -o inference/MoViNetA0 +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). 
+ +### infer + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/movinet/movinet_k400_frame.yaml \ + --model_file inference/MoViNetA0/MoViNet.pdmodel \ + --params_file inference/MoViNetA0/MoViNet.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.7667049765586853 +``` + +## Reference + +- [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511) diff --git a/english_documents/model_zoo/recognition/posec3d.md b/english_documents/model_zoo/recognition/posec3d.md new file mode 100644 index 000000000..97334ff94 --- /dev/null +++ b/english_documents/model_zoo/recognition/posec3d.md @@ -0,0 +1,100 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/posec3d.md) | English + +# PoseC3D + +--- +## Contents + +- [PoseC3D](#PoseC3D) + - [Contents](#contents) + - [Introduction](#introduction) + - [Data](#data) + - [Train](#train) + - [Train on UCF101](#train-on-ucf101) + - [Test](#test) + - [Test on UCF101](#test-on-ucf101) + - [Inference](#inference) + - [export inference model](#export-inference-model) + - [infer](#infer) + - [Reference](#reference) + + +## Introduction + +Human skeleton, as a compact representation of human action, has received increasing attention in recent years. Many skeleton-based action recognition methods adopt graph convolutional networks (GCN) to extract features on top of human skeletons. Despite the positive results shown in previous works, GCN-based methods are subject to limitations in robustness, interoperability, and scalability. In this work, we propose PoseC3D, a new approach to skeleton-based action recognition, which relies on a 3D heatmap stack instead of a graph sequence as the base representation of human skeletons. Compared to GCN-based methods, PoseC3D is more effective in learning spatiotemporal features, more robust against pose estimation noises, and generalizes better in cross-dataset settings. Also, PoseC3D can handle multiple-person scenarios without additional computation cost, and its features can be easily integrated with other modalities at early fusion stages, which provides a great design space to further boost the performance. On four challenging datasets, PoseC3D consistently obtains superior performance, when used alone on skeletons and in combination with the RGB modality. + +## Data + +Please download the UCF101 skeleton dataset and pretrained model weights. + +[https://aistudio.baidu.com/aistudio/datasetdetail/140593](https://aistudio.baidu.com/aistudio/datasetdetail/140593) + +## Train + +### Train on UCF101 + +- Train PoseC3D model: + +```bash +python3.7 main.py --validate -c configs/recognition/posec3d/posec3d.yaml --weights res3d_k400.pdparams +``` + + +## Test + +### Test on UCF101 + +- Test scripts: + +```bash +python3.7 main.py --test -c configs/recognition/posec3d/posec3d.yaml -w output/PoseC3D/PoseC3D_epoch_0012.pdparams +``` + +- Specify the config file with `-c`, specify the weight path with `-w`. 
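+
+The Introduction describes PoseC3D's input as a stack of 3D heatmaps rather than a joint-coordinate graph. The sketch below shows the basic idea of turning 2D keypoints into per-joint Gaussian heatmaps; the sizes and sigma are illustrative and do not come from this repository's pipeline.
+
+```python
+# Turn (T, V, 2) keypoint coordinates into (T, V, H, W) pseudo-heatmaps by
+# placing a Gaussian at every joint location in every frame.
+import numpy as np
+
+def keypoints_to_heatmaps(keypoints, h=64, w=64, sigma=0.6):
+    ys, xs = np.mgrid[0:h, 0:w]
+    t_len, v_len = keypoints.shape[0], keypoints.shape[1]
+    heatmaps = np.zeros((t_len, v_len, h, w), dtype=np.float32)
+    for t in range(t_len):
+        for v in range(v_len):
+            x, y = keypoints[t, v]
+            heatmaps[t, v] = np.exp(-((xs - x) ** 2 + (ys - y) ** 2) / (2 * sigma ** 2))
+    return heatmaps
+
+demo = keypoints_to_heatmaps(np.random.rand(48, 17, 2) * 64)  # 48 frames, 17 joints
+print(demo.shape)  # (48, 17, 64, 64)
+```
+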
+ + +Accuracy on UCF101 dataset: + +| Test_Data | Top-1 | checkpoints | +| :----: | :----: | :---- | +| UCF101 test1 | 87.05 | [PoseC3D_ucf101.pdparams]() | + + + +## Inference + +### export inference model + + To get model architecture file `PoseC3D.pdmodel` and parameters file `PoseC3D.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/recognition/posec3d/posec3d.yaml \ + -p data/PoseC3D_ucf101.pdparams \ + -o inference/PoseC3D +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). + +### infer + +```bash +python3.7 tools/predict.py --input_file data/example_UCF101_skeleton.pkl\ + --config configs/recognition/posec3d/posec3d.yaml \ + --model_file inference/PoseC3D/PoseC3D.pdmodel \ + --params_file inference/PoseC3D/PoseC3D.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +Current video file: data/example_UCF101_skeleton.pkl + top-1 class: 0 + top-1 score: 0.6731489896774292 +``` + + +## Reference + +- [Revisiting Skeleton-based Action Recognition](https://arxiv.org/pdf/2104.13586v1.pdf), Haodong Duan, Yue Zhao, Kai Chen, Dian Shao, Dahua Lin, Bo Dai diff --git a/english_documents/model_zoo/recognition/pp-timesformer.md b/english_documents/model_zoo/recognition/pp-timesformer.md new file mode 100644 index 000000000..9acbc6487 --- /dev/null +++ b/english_documents/model_zoo/recognition/pp-timesformer.md @@ -0,0 +1,156 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/pp-timesformer.md) | English + +# TimeSformer Video Classification Model + +## Content + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +We have improved the [TimeSformer model](./timesformer.md) and obtained a more accurate 2D practical video classification model **PP-TimeSformer**. Without increasing the amount of parameters and calculations, the accuracy on the UCF-101, Kinetics-400 and other data sets significantly exceeds the original version. The accuracy on the Kinetics-400 data set is shown in the table below. + +| Version | Top1 | +| :------ | :----: | +| Ours ([swa](#refer-anchor-1)+distill+16frame) | 79.44 | +| Ours ([swa](#refer-anchor-1)+distill) | 78.87 | +| Ours ([swa](#refer-anchor-1)) | **78.61** | +| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/timesformer#kinetics-400) | 77.92 | + + +## Data + +K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md) + +UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md) + + +## Train + +### Kinetics-400 data set training + +#### Download and add pre-trained models + +1. Download the image pre-training model [ViT_base_patch16_224_miil_21k.pdparams](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through wget command + + ```bash + wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams + ``` + +2. 
Open `PaddleVideo/configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml`, and fill in the path of the downloaded weights below `pretrained:` + + ```yaml + MODEL: + framework: "RecognizerTransformer" + backbone: + name: "VisionTransformer_tweaks" + pretrained: fill in the path here + ``` + +#### Start training + +- Training on the Kinetics-400 dataset uses 8 GPUs. The start command is as follows: + + ```bash + # videos data format + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml + ``` + +- Turn on AMP mixed-precision training to speed up the training process. The start command is as follows: + + ```bash + export FLAGS_conv_workspace_size_limit=800 # MB + export FLAGS_cudnn_exhaustive_search=1 + export FLAGS_cudnn_batchnorm_spatial_persistent=1 + # videos data format + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --amp --validate -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml + ``` + +- In addition, you can customize the parameter configuration to train/test on different datasets. It is recommended to name configuration files following the pattern `model_dataset name_file format_data format_sampling method.yaml`. Please refer to [config](../../tutorials/config.md) for parameter usage. + + +## Test + +- The PP-TimeSformer model is validated synchronously during training. You can search for the keyword `best` in the training log to obtain the model accuracy. An example log entry is as follows: + + ``` + Already save the best model (top1 acc)0.7258 + ``` + +- Because the test mode of the PP-TimeSformer model uses **UniformCrop** sampling, which is slightly slower but more accurate than the **RandomCrop** used for validation during training, the `topk Acc` recorded in the training log does not represent the final test score. After training is completed, you can use the test mode to evaluate the best model and obtain the final metrics.
The command is as follows: + + ```bash + # 8-frames testing script + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml -w "output/ppTimeSformer/ppTimeSformer_best.pdparams" + + # 16-frames testing script + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptimesformer main.py --test \ + -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \ + -o MODEL.backbone.num_seg=16 \ + -o MODEL.runtime_cfg.test.num_seg=16 \ + -o PIPELINE.test.decode.num_seg=16 \ + -o PIPELINE.test.sample.num_seg=16 \ + -w "data/ppTimeSformer_k400_16f_distill.pdparams" + ``` + + + When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows: + + | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints | + | :----------------: | :-------------: | :-----: | :---------: | :---- | :----------------------------------------------------------: | + | Vision Transformer | UniformCrop | 8 | 224 | 78.61 | [ppTimeSformer_k400_8f.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f.pdparams) | + | Vision Transformer | UniformCrop | 8 | 224 | 78.87 | [ppTimeSformer_k400_8f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_8f_distill.pdparams) | + | Vision Transformer | UniformCrop | 16 | 224 | 79.44 | [ppTimeSformer_k400_16f_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTimeSformer_k400_16f_distill.pdparams) | + + +- During the test, the PP-TimeSformer video sampling strategy is to use linspace sampling: in time sequence, from the first frame to the last frame of the video sequence to be sampled, `num_seg` sparse sampling points (including endpoints) are uniformly generated; spatially , Select 3 areas to sample at both ends of the long side and the middle position (left, middle, right or top, middle, and bottom). A total of 1 clip is sampled for 1 video. + +## Inference + +### Export inference model + +```bash +python3.7 tools/export_model.py -c configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \ + -p data/ppTimeSformer_k400_8f.pdparams \ + -o inference/ppTimeSformer +``` + +The above command will generate the model structure file `ppTimeSformer.pdmodel` and the model weight file `ppTimeSformer.pdiparams` required for prediction. + +- For the meaning of each parameter, please refer to [Model Reasoning Method](../../start.md#2-Model Reasoning) + +### Use predictive engine inference + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/pptimesformer/pptimesformer_k400_videos.yaml \ + --model_file inference/ppTimeSformer/ppTimeSformer.pdmodel \ + --params_file inference/ppTimeSformer/ppTimeSformer.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +The output example is as follows: + +``` +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.9997474551200867 +``` + +It can be seen that using the ppTimeSformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By referring to the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`. 
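+
+The linspace sampling strategy described above can be summarized in a few lines. This is only an illustrative sketch of the index computation, not the repository's actual pipeline code.
+
+```python
+# num_seg temporal sampling points spread uniformly from the first frame to the
+# last frame, endpoints included.
+import numpy as np
+
+def linspace_sample(num_frames, num_seg=8):
+    return np.linspace(0, num_frames - 1, num_seg).round().astype(int)
+
+print(linspace_sample(250, num_seg=8))  # -> [  0  36  71 107 142 178 213 249]
+```
+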
+ +## Reference + +- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani +- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean +
    + +- [Averaging Weights Leads to Wider Optima and Better Generalization](https://arxiv.org/abs/1803.05407v3), Pavel Izmailov, Dmitrii Podoprikhin, Timur Garipov +- [ImageNet-21K Pretraining for the Masses](https://arxiv.org/pdf/2104.10972v4.pdf), Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy diff --git a/english_documents/model_zoo/recognition/pp-tsm.md b/english_documents/model_zoo/recognition/pp-tsm.md new file mode 100644 index 000000000..b1ae1aa59 --- /dev/null +++ b/english_documents/model_zoo/recognition/pp-tsm.md @@ -0,0 +1,167 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/pp-tsm.md) | English + +# PP-TSM + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +## Introduction + +We optimized TSM model and proposed **PP-TSM** in this repo. Without increasing the number of parameters, the accuracy of TSM was significantly improved in UCF101 and Kinetics-400 datasets. Please refer to [**Tricks on PP-TSM**](https://zhuanlan.zhihu.com/p/382134297) for more details. + +| Version | Sampling method | Top1 | +| :------ | :----------: | :----: | +| Ours (distill) | Dense | **76.16** | +| Ours | Dense | 75.69 | +| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Dense | 74.55 | +| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Dense | 74.1 | + + +| Version | Sampling method | Top1 | +| :------ | :----------: | :----: | +| Ours (distill) | Uniform | **75.11** | +| Ours | Uniform | 74.54 | +| [mmaction2](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/README.md) | Uniform | 71.90 | +| [mit-han-lab](https://github.com/mit-han-lab/temporal-shift-module) | Uniform | 71.16 | + + +## Data + +Please refer to Kinetics400 data download and preparation doc [k400-data](../../dataset/K400.md) + +Please refer to UCF101 data download and preparation doc [ucf101-data](../../dataset/ucf101.md) + + +## Train + +### Train on kinetics-400 + +#### download pretrain-model + +Please download [ResNet50_vd_ssld_v2](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as pretraind model: + +```bash +wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams +``` + +and add path to `MODEL.framework.backbone.pretrained` in config file as: + +```yaml +MODEL: + framework: "Recognizer2D" + backbone: + name: "ResNetTweaksTSM" + pretrained: your weight path +``` + +- If use ResNet101 as backbone, please download [ResNet101_vd_ssld_pretrained.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams) as pretraind model. 
+ +#### Start training + +- Train PP-TSM on kinetics-400 scripts: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml +``` + +- Train PP-TSM on kinetics-400 video data using scripts: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml +``` + +- AMP is useful for speeding up training: + +```bash +export FLAGS_conv_workspace_size_limit=800 #MB +export FLAGS_cudnn_exhaustive_search=1 +export FLAGS_cudnn_batchnorm_spatial_persistent=1 + +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --amp --validate -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml +``` + +- Train PP-TSM on kinetics-400 with dense sampling: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml +``` + +- Train PP-TSM on kinetics-400 with ResNet101 as backbone using dense sampling: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsm main.py --validate -c configs/recognition/pptsm/pptsm_k400_frames_dense_r101.yaml +``` + + +## Test + +- For uniform sampling, test accuracy can be found in training-logs by search key word `best`, such as: + +```txt +Already save the best model (top1 acc)0.7454 +``` + +- For dense sampling, test accuracy can be obtained using scripts: + +```bash +python3 main.py --test -c configs/recognition/pptsm/pptsm_k400_frames_dense.yaml -w output/ppTSM/ppTSM_best.pdparams +``` + + +Accuracy on Kinetics400: + +| backbone | distill | Sampling method | num_seg | target_size | Top-1 | checkpoints | +| :------: | :----------: | :----: | :----: | :----: | :----: | :---- | +| ResNet50 | False | Uniform | 8 | 224 | 74.54 | [ppTSM_k400_uniform.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform.pdparams) | +| ResNet50 | False | Dense | 8 | 224 | 75.69 | [ppTSM_k400_dense.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense.pdparams) | +| ResNet50 | True | Uniform | 8 | 224 | 75.11 | [ppTSM_k400_uniform_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) | +| ResNet50 | True | Dense | 8 | 224 | 76.16 | [ppTSM_k400_dense_distill.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_dense_distill.pdparams) | +| ResNet101 | True | Uniform | 8 | 224 | 76.35 | [ppTSM_k400_uniform_distill_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_uniform_distill_r101.pdparams) | +| ResNet101 | False | Dense | 8 | 224 | 77.15 | [ppTSM_k400_dense_r101.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSM_k400_dense_r101.pdparams) | + +## Inference + +### export inference model + + To get model architecture file `ppTSM.pdmodel` and parameters file `ppTSM.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \ + -p data/ppTSM_k400_uniform.pdparams \ + -o inference/ppTSM +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). 
+ +### infer + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/pptsm/pptsm_k400_frames_uniform.yaml \ + --model_file inference/ppTSM/ppTSM.pdmodel \ + --params_file inference/ppTSM/ppTSM.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.9907386302947998 +``` + +we can get the class name using class id and map file `data/k400/Kinetics-400_label_list.txt`. The top1 prediction of `data/example.avi` is `archery`. + +## Reference + +- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han +- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean diff --git a/english_documents/model_zoo/recognition/pp-tsn.md b/english_documents/model_zoo/recognition/pp-tsn.md new file mode 100644 index 000000000..68d9215fd --- /dev/null +++ b/english_documents/model_zoo/recognition/pp-tsn.md @@ -0,0 +1,146 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/pp-tsn.md) | English + +# PP-TSN + +## Content + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +We have improved the [TSN model](./tsn.md) and obtained a more accurate 2D practical video classification model **PP-TSN**. Without increasing the amount of parameters and calculations, the accuracy on the UCF-101, Kinetics-400 and other data sets significantly exceeds the original version. The accuracy on the Kinetics-400 data set is shown in the following table. + +| Version | Top1 | +| :------ | :----: | +| Ours (distill) | 75.06 | +| Ours | **73.68** | +| [mmaction2](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn#kinetics-400) | 71.80 | + + +## Data + +K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md) + +UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md) + + +## Train + +### Kinetics-400 data set training + +#### Download and add pre-trained models + +1. Download the image distillation pre-training model [ResNet50_vd_ssld_v2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams) as the Backbone initialization parameter, or download it through wget + + ```bash + wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams + ``` + +2. Open `PaddleVideo/configs/recognition/pptsn/pptsn_k400_frames.yaml`, and fill in the downloaded weight storage path below `pretrained:` + + ```yaml + MODEL: + framework: "Recognizer2D" + backbone: + name: "ResNetTweaksTSN" + pretrained: fill in the path here + ``` + +#### Start training + +- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows: + + ```bash + # frames data format + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/ pptsn/pptsn_k400_frames.yaml + + # videos data format + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --validate -c configs/recognition/ pptsn/pptsn_k400_videos.yaml + ``` + +- Turn on amp mixed-precision training to speed up the training process. 
The training start command is as follows: + + ```bash + export FLAGS_conv_workspace_size_limit=800 # MB + export FLAGS_cudnn_exhaustive_search=1 + export FLAGS_cudnn_batchnorm_spatial_persistent=1 + + # frames data format + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_frames.yaml + + # videos data format + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --amp --validate -c configs/recognition/pptsn/pptsn_k400_videos.yaml + ``` + +- In addition, you can customize the parameter configuration to train/test on different datasets. It is recommended to name configuration files following the pattern `model_dataset name_file format_data format_sampling method.yaml`. Please refer to [config](../../tutorials/config.md) for parameter usage. + + +## Test + +- The PP-TSN model is validated during training. You can search for the keyword `best` in the training log to obtain the model accuracy. An example log entry is as follows: + + ``` + Already save the best model (top1 acc)0.7004 + ``` + +- Since the test mode of the PP-TSN model uses **TenCrop** sampling, which is slightly slower but more accurate than the **CenterCrop** used for validation during training, the `topk Acc` recorded in the training log does not represent the final test score. After training is completed, you can use the test mode to evaluate the best model and obtain the final metrics. The command is as follows: + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_pptsn main.py --test -c configs/recognition/pptsn/pptsn_k400_frames.yaml -w "output/ppTSN/ppTSN_best.pdparams" + ``` + + When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows: + + | backbone | Sampling method | distill | num_seg | target_size | Top-1 | checkpoints | + | :------: | :-------------: | :-----: | :-----: | :---------: | :---- | :---------------------: | + | ResNet50 | TenCrop | False | 3 | 224 | 73.68 | [ppTSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400.pdparams) | + | ResNet50 | TenCrop | True | 8 | 224 | 75.06 | [ppTSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ppTSN_k400_8.pdparams) | + +- The PP-TSN video sampling strategy is TenCrop: temporally, the input video is evenly divided into `num_seg` segments and one frame is sampled from the middle of each segment; spatially, a 224x224 area is cropped from each of 5 sub-regions (upper-left, upper-right, center, lower-left, lower-right), and horizontal flips are added to obtain 10 views in total. One clip is sampled per video. A sketch of the TenCrop crop layout is given after this list. + +- Distill being `True` means that the pre-trained model obtained by distillation is used. For the specific distillation scheme, please refer to [ppTSM Distillation Scheme](). 
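+
+As a companion to the TenCrop description in the list above, the sketch below computes the five 224x224 crop boxes (four corners plus center); flipping each crop horizontally yields the remaining five of the ten views. It is an illustrative helper, not code from this repository.
+
+```python
+# Five corner/center crop boxes for a frame of size (img_h, img_w); the other
+# five TenCrop views are the horizontal flips of these crops.
+def ten_crop_boxes(img_h, img_w, size=224):
+    xs = [0, img_w - size, (img_w - size) // 2, 0, img_w - size]
+    ys = [0, 0, (img_h - size) // 2, img_h - size, img_h - size]
+    return [(x, y, x + size, y + size) for x, y in zip(xs, ys)]
+
+print(ten_crop_boxes(256, 340))
+```
+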
+ + +## Inference + +### Export inference model + +```bash +python3.7 tools/export_model.py -c configs/recognition/pptsn/pptsn_k400_frames.yaml -p data/ppTSN_k400.pdparams -o inference/ppTSN +``` + +The above command will generate the model structure file `ppTSN.pdmodel` and model weight files `ppTSN.pdiparams` and `ppTSN.pdiparams.info` files required for prediction, all of which are stored in the `inference/ppTSN/` directory + +For the meaning of each parameter in the above bash command, please refer to [Model Reasoning Method](https://github.com/HydrogenSulfate/PaddleVideo/blob/PPTSN-v1/docs/en/start.md#2-infer) + +### Use prediction engine inference + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/pptsn/pptsn_k400_frames.yaml \ + --model_file inference/ppTSN/ppTSN.pdmodel \ + --params_file inference/ppTSN/ppTSN.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +The output example is as follows: + +```bash +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.998979389667511 +``` + +It can be seen that using the PP-TSN model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`. + +## Reference + +- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/pdf/1608.00859.pdf), Limin Wang, Yuanjun Xiong, Zhe Wang +- [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531), Geoffrey Hinton, Oriol Vinyals, Jeff Dean diff --git a/english_documents/model_zoo/recognition/slowfast.md b/english_documents/model_zoo/recognition/slowfast.md new file mode 100644 index 000000000..45259f0f7 --- /dev/null +++ b/english_documents/model_zoo/recognition/slowfast.md @@ -0,0 +1,120 @@ +[简体中文 ](../../../zh-CN/model_zoo/recognition/slowfast.md) | English + +# SlowFast + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +SlowFast involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast path-way, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. + +
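+
+To illustrate the two frame rates mentioned in the Introduction, here is a tiny sketch of how one clip feeds both pathways. The frame counts and the speed ratio `alpha` are illustrative values, not the exact settings of the configs in this repository.
+
+```python
+# The Fast pathway keeps a dense set of frames; the Slow pathway keeps every
+# alpha-th of them, trading temporal resolution for channel capacity.
+import numpy as np
+
+clip = np.arange(64)                # pretend these are 64 decoded frame indices
+alpha = 8                           # speed ratio between the two pathways
+fast_frames = clip[::2]             # 32 frames for the Fast pathway
+slow_frames = fast_frames[::alpha]  # 4 frames for the Slow pathway
+print(len(fast_frames), len(slow_frames))  # 32 4
+```
+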

    +
    +SlowFast Overview +

+ + +## Data + +We use Kinetics-400 to train this model. For data preparation, please refer to [Kinetics-400 dataset](../../dataset/k400.md). + + +## Train + +You can start training by: + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast main.py --validate -c configs/recognition/slowfast/slowfast.yaml +``` + +- Training is efficient with our code: the training speed is 2x faster than the original implementation. For details, please refer to [benchmark](https://github.com/PaddlePaddle/PaddleVideo/blob/main/docs/en/benchmark.md). + +### Speed up training + +Training the SlowFast model is time-consuming, so we implement the [Multigrid training strategy](https://arxiv.org/abs/1912.00998) to speed up training. Training script: + +```bash +python -B -m paddle.distributed.launch --selected_gpus="0,1,2,3,4,5,6,7" --log_dir=log-slowfast main.py --validate --multigrid -c configs/recognition/slowfast/slowfast_multigrid.yaml +``` + +Performance evaluation: + +| training strategy | time cost of one epoch/min | total training time/min | speed-up | +| :------ | :-----: | :------: |:------: | +| Multigrid | 27.25 | 9758 (6.7 days) | 2.89x | +| Normal | 78.76 | 15438 (10.7 days) | base | + +For more details, please refer to [accelerate doc](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/tutorials/accelerate.md#%E8%AE%AD%E7%BB%83%E7%AD%96%E7%95%A5%E5%8A%A0%E9%80%9F). + + +## Test + +You can start testing by: + +```bash +python -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_slowfast_test main.py --test -c configs/recognition/slowfast/slowfast.yaml -w output/SlowFast/SlowFast_epoch_000196.pdparams +``` + +- The `-w` argument is used to specify the model path; you can download our model from [SlowFast.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams). + + +Test accuracy in Kinetics-400: + +| Configs | Acc1 | Acc5 | Weights | +| :---: | :---: | :---: | :---: | +| [slowfast.yaml](../../../../configs/recognition/slowfast/slowfast.yaml) | 74.35 | 91.33 | [slowfast_4x16.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast.pdparams) | +| [slowfast_multigrid.yaml](../../../../configs/recognition/slowfast/slowfast_multigrid.yaml) | 75.84 | 92.33 | [slowfast_8x8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/SlowFast/SlowFast_8*8.pdparams) | + +- Acc1 may be lower than that reported in the paper, as ~5% of the Kinetics-400 data is missing. Experiments have verified that when training with the same data, we obtain the same accuracy. + + +## Inference + +### export inference model + + To get model architecture file `SlowFast.pdmodel` and parameters file `SlowFast.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/recognition/slowfast/slowfast.yaml \ + -p data/SlowFast.pdparams \ + -o inference/SlowFast +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). 
+ +### infer + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/slowfast/slowfast.yaml \ + --model_file inference/SlowFast/SlowFast.pdmodel \ + --params_file inference/SlowFast/SlowFast.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 1.0 +``` + +we can get the class name using class id and map file `data/k400/Kinetics-400_label_list.txt`. The top1 prediction of `data/example.avi` is `archery`. + + +## Reference + +- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), Feichtenhofer C, Fan H, Malik J, et al. diff --git a/english_documents/model_zoo/recognition/stgcn.md b/english_documents/model_zoo/recognition/stgcn.md new file mode 100644 index 000000000..14585e5e4 --- /dev/null +++ b/english_documents/model_zoo/recognition/stgcn.md @@ -0,0 +1,129 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/stgcn.md) | English + +# ST-GCN + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +ST-GCN is skeleton-based action recognition model proposed in AAAI 2018. + +
    +
    +
    + + +## Data + +Please refer to FSD data download and preparation doc [FSD](../../dataset/fsd.md) + +Please refer to NTU-RGBD data download and preparation doc [NTU-RGBD](../../dataset/ntu-rgbd.md) + + +## Train + +### Train on FSD + +- Train ST-GCN on FSD scripts: + +```bash +python3.7 main.py -c configs/recognition/stgcn/stgcn_fsd.yaml +``` + +- Turn off `valid` when training, as validation dataset is not available for the competition. + +### Train on NTU-RGBD + +- Train ST-GCN on NTU-RGBD scripts: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_stgcn main.py --validate -c configs/recognition/stgcn/stgcn_ntucs.yaml +``` + +- config file `stgcn_ntucs.yaml` corresponding to the config of ST-GCN on NTU-RGB+D dataset with cross-subject splits. + + +## Test + +### Test on FSD + +- Test scripts: + +```bash +python3.7 main.py --test -c configs/recognition/stgcn/stgcn_fsd.yaml -w output/STGCN/STGCN_epoch_00090.pdparams +``` + +- Specify the config file with `-c`, specify the weight path with `-w`. + +- Evaluation results will be saved in `submission.csv` file, final score can be obtained in [competition website](https://aistudio.baidu.com/aistudio/competition/detail/115). + +Accuracy on FSD-10 dataset: + +Test_Data| Top-1 | checkpoints | +| :----: | :----: | :---- | +| Test_A | 59.07 | [STGCN_fsd.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_fsd.pdparams) | + + +### Test on NTU-RGB+D + +- Test scripts: + +```bash +python3.7 main.py --test -c configs/recognition/stgcn/stgcn_ntucs.yaml -w output/STGCN/STGCN_best.pdparams +``` + +- Specify the config file with `-c`, specify the weight path with `-w`. + + +Accuracy on NTU-RGB+D dataset: + +| split | Top-1 | checkpoints | +| :----: | :----: | :---- | +| cross-subject | 82.28 | [STGCN_ntucs.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/STGCN_ntucs.pdparams) | + + +## Inference + +### export inference model + + To get model architecture file `STGCN.pdmodel` and parameters file `STGCN.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/recognition/stgcn/stgcn_fsd.yaml \ + -p data/STGCN_fsd.pdparams \ + -o inference/STGCN +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). 
+ +### infer + +```bash +python3.7 tools/predict.py --input_file data/fsd10/example_skeleton.npy \ + --config configs/recognition/stgcn/stgcn_fsd.yaml \ + --model_file inference/STGCN/STGCN.pdmodel \ + --params_file inference/STGCN/STGCN.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +``` +Current video file: data/fsd10/example_skeleton.npy + top-1 class: 27 + top-1 score: 0.9912770986557007 +``` + +## Reference + +- [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455), Sijie Yan, Yuanjun Xiong, Dahua Lin diff --git a/english_documents/model_zoo/recognition/timesformer.md b/english_documents/model_zoo/recognition/timesformer.md new file mode 100644 index 000000000..c004b9b04 --- /dev/null +++ b/english_documents/model_zoo/recognition/timesformer.md @@ -0,0 +1,137 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/timesformer.md) | English + +# TimeSformer + +## Content + +- [Introduction](#Introduction) +- [Data](#DATA) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +TimeSformer is a video classification model based on the vision transformer. It is convolution-free, has a global receptive field, and provides strong temporal modeling ability. It currently achieves SOTA accuracy on the Kinetics-400 dataset, surpassing the classic CNN-based video classification models TSN, TSM and SlowFast, while requiring a shorter training time (about 39 hours on Kinetics-400). **This code implements the time-space separated (divided) attention cascade network in the paper**. + +
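+To make the time-space separated (divided) attention described above more concrete, the following is a minimal NumPy sketch of one divided attention step: temporal attention is applied first across frames at the same patch position, then spatial attention is applied among the patches of each frame. It is an illustration only (single head, identity projections, no residuals or normalization), not the implementation used in this repository.
+
+```python
+import numpy as np
+
+def _softmax(x, axis=-1):
+    x = x - x.max(axis=axis, keepdims=True)
+    e = np.exp(x)
+    return e / e.sum(axis=axis, keepdims=True)
+
+def _attend(tokens):
+    # Plain scaled dot-product self-attention over a (num_tokens, dim) array.
+    scores = tokens @ tokens.T / np.sqrt(tokens.shape[-1])
+    return _softmax(scores) @ tokens
+
+def divided_space_time_attention(x):
+    # x: (T, N, D) = (frames, patches per frame, embedding dim)
+    T, N, D = x.shape
+    # 1) temporal attention: each patch position attends across the T frames
+    t_out = np.stack([_attend(x[:, n]) for n in range(N)], axis=1)
+    # 2) spatial attention: patches within each frame attend to each other
+    s_out = np.stack([_attend(t_out[t]) for t in range(T)], axis=0)
+    return s_out
+
+x = np.random.rand(8, 196, 64).astype("float32")  # 8 frames, 14x14 patches, dim 64
+print(divided_space_time_attention(x).shape)      # (8, 196, 64)
+```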
    + + +## Data + +K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md) + +UCF101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md) + + +## Train + +### Kinetics-400 data set training + +#### Download and add pre-trained models + +1. Download the image pre-training model [ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through the wget command + + ```bash + wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams + ``` + +2. Open `PaddleVideo/configs/recognition/timesformer/timesformer_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:` + + ```yaml + MODEL: + framework: "RecognizerTransformer" + backbone: + name: "VisionTransformer" + pretrained: fill in the path here + ``` + +#### Start training + +- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows: + +```bash +# videos data format +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --validate -c configs/recognition/ timesformer/timesformer_k400_videos.yaml +``` + +- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows: + +```bash +export FLAGS_conv_workspace_size_limit=800 # MB +export FLAGS_cudnn_exhaustive_search=1 +export FLAGS_cudnn_batchnorm_spatial_persistent=1 +# videos data format +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --amp --validate -c configs/recognition/ timesformer/timesformer_k400_videos.yaml +``` + +- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage. + + +## Test + +- The TimeSformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. 
The log example is as follows: + + ``` + Already save the best model (top1 acc)0.7258 + ``` + +- Since the sampling method of the TimeSformer model test mode is **UniformCrop** with a slower speed but higher accuracy, which is different from the **RandomCrop** used in the verification mode during the training process, so the verification index recorded in the training log is `topk Acc `Does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows: + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_timesformer main.py --test -c configs/recognition/ timesformer/timesformer_k400_videos.yaml -w "output/TimeSformer/TimeSformer_best.pdparams" + ``` + + + When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows: + + + | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints | + | :----------------: | :-----: | :-----: | :---------: | :----: | :----------------------------------------------------------: | + | Vision Transformer | UniformCrop | 8 | 224 | 77.29 | [TimeSformer_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TimeSformer_k400.pdparams) | + + +- During the test, the TimeSformer video sampling strategy is to use Linspace sampling: in time sequence, num_seg sparse sampling points are uniformly generated from the video sequence to be sampled; in space, select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions are sampled. A total of 1 clip is sampled for 1 video. + +## Inference + +### Export inference model + +```bash +python3.7 tools/export_model.py -c configs/recognition/timesformer/timesformer_k400_videos.yaml \ + -p data/TimeSformer_k400.pdparams \ + -o inference/TimeSformer +``` + +The above command will generate the model structure file `TimeSformer.pdmodel` and the model weight file `TimeSformer.pdiparams` required for prediction. + +- For the meaning of each parameter, please refer to [Model Reasoning Method](../../start.md#2-infer) + +### Use prediction engine inference + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/timesformer/timesformer_k400_videos.yaml \ + --model_file inference/TimeSformer/TimeSformer.pdmodel \ + --params_file inference/TimeSformer/TimeSformer.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +The output example is as follows: + +``` +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.9999722242355347 +``` + +It can be seen that using the TimeSformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By consulting the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be seen that the predicted category name is `archery`. 
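+If you want to map the predicted class id to a readable name in your own script, a small helper such as the one below can be used. This is only a sketch: it assumes each line of `data/k400/Kinetics-400_label_list.txt` holds a class id and a class name separated by whitespace, so adjust the parsing if your label file uses a different layout.
+
+```python
+def load_label_map(label_file="data/k400/Kinetics-400_label_list.txt"):
+    # Assumed line format: "<class_id> <class_name>"
+    label_map = {}
+    with open(label_file, "r", encoding="utf-8") as f:
+        for line in f:
+            parts = line.strip().split(maxsplit=1)
+            if len(parts) == 2:
+                label_map[int(parts[0])] = parts[1]
+    return label_map
+
+labels = load_label_map()
+print(labels.get(5))  # expected to print "archery" for the example above
+```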
+ +## Reference + +- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani diff --git a/english_documents/model_zoo/recognition/tokenshift_transformer.md b/english_documents/model_zoo/recognition/tokenshift_transformer.md new file mode 100644 index 000000000..93e41395a --- /dev/null +++ b/english_documents/model_zoo/recognition/tokenshift_transformer.md @@ -0,0 +1,125 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/tokenshift_transformer.md) | English + +# Token Shift Transformer + +## Content + +- [Introduction](#Introduction) +- [Data](#DATA) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +Token Shift Transformer is a video classification model based on vision transformer, which shares merits of strong interpretability, high discriminative power on hyper-scale data, and flexibility in processing varying length inputs. Token Shift Module is a novel, zero-parameter, zero-FLOPs operator, for modeling temporal relations within each transformer encoder. + +
    + + + +## Data + +UCF-101 data download and preparation please refer to [UCF-101 data preparation](../../dataset/ucf101.md) + + +## Train + +### UCF-101 data set training + +#### Download and add pre-trained models + +1. Download the image pre-training model [ViT_base_patch16_224](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams) as Backbone initialization parameters, or download through the wget command + + ```bash + wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams + ``` + +2. Open `PaddleVideo/configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:` + + ```yaml + MODEL: + framework: "RecognizerTransformer" + backbone: + name: "TokenShiftVisionTransformer" + pretrained: fill in the path here + ``` + +#### Start training + +- The UCF-101 data set uses 1 card for training, and the start command of the training method is as follows: + +```bash +# videos data format +python3 main.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234 +``` + +- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows: + +```bash +python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --validate --seed=1234 +``` + +- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../tutorials/config.md) for parameter usage. + + +## Test + +- The Token Shift Transformer model is verified synchronously during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows: + + ``` + Already save the best model (top1 acc)0.9201 + ``` + +- Since the sampling method of the Token Shift Transformer model test mode is **uniform** sampling, which is different from the **dense** sampling used in the verification mode during the training process, so the verification index recorded in the training log, called `topk Acc `, does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index, the command is as follows: + + ```bash + python3 main.py --amp -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml --test --seed=1234 -w 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams' + ``` + + + When the test configuration uses the following parameters, the test indicators on the validation data set of UCF-101 are as follows: + + + | backbone | sampling method | num_seg | target_size | Top-1 | checkpoints | + | :----------------: | :-----: | :-----: | :---------: | :----: | :----------------------------------------------------------: | + | Vision Transformer | Uniform | 8 | 256 | 92.81 | [TokenShiftTransformer.pdparams](https://drive.google.com/drive/folders/1k_TpAqaJZYJE8C5g5pT9phdyk9DrY_XL?usp=sharing) | + + +- Uniform sampling: Timing-wise, equal division into `num_seg` segments, 1 frame sampled at the middle of each segment; spatially, sampling at the center. 1 video sampled 1 clip in total. 
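+As a reference for the uniform sampling rule described in the last bullet above, the snippet below shows one way the sampled frame indices could be computed (split the video into `num_seg` equal segments and take the middle frame of each). It is illustrative only and may differ in rounding details from the actual data pipeline.
+
+```python
+def uniform_sample_indices(num_frames, num_seg=8):
+    # One index per segment, taken at the middle of that segment.
+    seg_len = num_frames / num_seg
+    return [int(seg_len * i + seg_len / 2) for i in range(num_seg)]
+
+print(uniform_sample_indices(num_frames=240, num_seg=8))
+# [15, 45, 75, 105, 135, 165, 195, 225]
+```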
+ +## Inference + +### Export inference model + +```bash +python3 tools/export_model.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -p 'output/TokenShiftVisionTransformer/TokenShiftVisionTransformer_best.pdparams' +``` + +The above command will generate the model structure file `TokenShiftVisionTransformer.pdmodel` and the model weight file `TokenShiftVisionTransformer.pdiparams` required for prediction. + +- For the meaning of each parameter, please refer to [Model Reasoning Method](../../usage.md#2-infer) + +### Use prediction engine inference + +```bash +python3 tools/predict.py -c configs/recognition/token_transformer/tokShift_transformer_ucf101_256_videos.yaml -i 'data/BrushingTeeth.avi' --model_file ./inference/TokenShiftVisionTransformer.pdmodel --params_file ./inference/TokenShiftVisionTransformer.pdiparams +``` + +The output example is as follows: + +``` +Current video file: data/BrushingTeeth.avi + top-1 class: 19 + top-1 score: 0.9959074258804321 +``` + +It can be seen that using the Token Shift Transformer model trained on UCF-101 to predict `data/BrushingTeeth.avi`, the output top1 category id is `19`, and the confidence is 0.99. By consulting the category id and name correspondence table, it can be seen that the predicted category name is `brushing_teeth`. + +## Reference + +- [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf), Gedas Bertasius, Heng Wang, Lorenzo Torresani diff --git a/english_documents/model_zoo/recognition/tsm.md b/english_documents/model_zoo/recognition/tsm.md new file mode 100644 index 000000000..e44ea6be5 --- /dev/null +++ b/english_documents/model_zoo/recognition/tsm.md @@ -0,0 +1,221 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/tsm.md) | English + +# TSM + +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Details](#Details) +- [Reference](#Reference) + +## Introduction + +Temporal Shift Module (TSM) is a popular model that attracts more attention at present. +The method of moving through channels greatly improves the utilization ability of temporal information without increasing any +additional number of parameters and calculation amount. +Moreover, due to its lightweight and efficient characteristics, it is very suitable for industrial landing. + +
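+The channel-shifting idea mentioned above can be illustrated with a short NumPy sketch: a fraction of the channels is shifted one step along the temporal axis in each direction, while the remaining channels stay in place. This is only a conceptual illustration (in this repository the shift is performed inside the ResNet backbone), and the 1/8 ratio below follows the common TSM setting rather than any specific config file here.
+
+```python
+import numpy as np
+
+def temporal_shift(x, fold_div=8):
+    # x: (N, T, C, H, W) features; shift C // fold_div channels along the T axis.
+    n, t, c, h, w = x.shape
+    fold = c // fold_div
+    out = np.zeros_like(x)
+    out[:, :-1, :fold] = x[:, 1:, :fold]                  # these channels see the next frame
+    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # these channels see the previous frame
+    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # the rest are left unchanged
+    return out
+
+x = np.random.rand(1, 8, 64, 7, 7).astype("float32")
+print(temporal_shift(x).shape)  # (1, 8, 64, 7, 7)
+```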
    + + +This code implemented **single RGB stream** of TSM networks. Backbone is ResNet-50. + +Please refer to the ICCV 2019 paper for details [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf) + +## Data + +Please refer to Kinetics-400 data download and preparation [k400 data preparation](../../dataset/k400.md) + +Please refer to UCF101 data download and preparation [ucf101 data preparation](../../dataset/ucf101.md) + + +## Train + +### Train on the Kinetics-400 dataset + +#### download pretrain-model + +1. Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as pretraind model: + + ```bash + wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams + ``` + +2. Open `PaddleVideo/configs/recognition/tsm/tsm_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:` + + ```bash + MODEL: + framework: "Recognizer2D" + backbone: + name: "ResNetTSM" + pretrained: your weight path + ``` + +#### Start training + +- By specifying different configuration files, different data formats/data sets can be used for training. Taking the training configuration of Kinetics-400 data set + 8 cards + frames format as an example, the startup command is as follows (more training commands can be viewed in `PaddleVideo/run.sh`). + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_frames.yaml + ``` + +- Training Kinetics-400 dataset of videos format using scripts. + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_k400_videos.yaml + ``` + +- AMP is useful for speeding up training, scripts as follows: + +```bash +export FLAGS_conv_workspace_size_limit=800 #MB +export FLAGS_cudnn_exhaustive_search=1 +export FLAGS_cudnn_batchnorm_spatial_persistent=1 + +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames.yaml +``` + +- AMP works better with `NHWC` data format, scripts as follows: + +```bash +export FLAGS_conv_workspace_size_limit=800 #MB +export FLAGS_cudnn_exhaustive_search=1 +export FLAGS_cudnn_batchnorm_spatial_persistent=1 + +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_k400_frames_nhwc.yaml +``` + +- For the config file usage,please refer to [config](../../tutorials/config.md). + +### Train on UCF-101 dataset + +#### download pretrain-model + +- Load the TSM model we trained on Kinetics-400 [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams), or download it through the command line + + ```bash + wget https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams + ``` + +- Open `PaddleVideo/configs/recognition/tsm/tsm_ucf101_frames.yaml`, and fill in the downloaded weight path below `pretrained:` + + ```bash + MODEL: + framework: "Recognizer2D" + backbone: + name: "ResNetTSM" + pretrained: your weight path + ``` + +#### Start training + +- By specifying different configuration files, different data formats/data sets can be used for training. 
Taking the training configuration of Kinetics-400 data set + 8 cards + frames format as an example, the startup command is as follows (more training commands can be viewed in `PaddleVideo/run.sh`). + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml + ``` + +- Training UCF-101 dataset of videos format using scripts. + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --validate -c configs/recognition/tsm/tsm_ucf101_videos.yaml + ``` + +- AMP is useful for speeding up training, scripts as follows: + + ```bash + export FLAGS_conv_workspace_size_limit=800 #MB + export FLAGS_cudnn_exhaustive_search=1 + export FLAGS_cudnn_batchnorm_spatial_persistent=1 + + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames.yaml + ``` + +- AMP works better with `NHWC` data format, scripts as follows: + + ```bash + export FLAGS_conv_workspace_size_limit=800 #MB + export FLAGS_cudnn_exhaustive_search=1 + export FLAGS_cudnn_batchnorm_spatial_persistent=1 + + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsm main.py --amp --validate -c configs/recognition/tsm/tsm_ucf101_frames_nhwc.yaml + ``` + +## Test + +Put the weight of the model to be tested into the `output/TSM/` directory, the test command is as follows + +```bash +python3 main.py --test -c configs/recognition/tsm/tsm.yaml -w output/TSM/TSM_best.pdparams +``` + +--- + +When the test configuration uses the following parameters, the evaluation accuracy on the validation data set of Kinetics-400 is as follows: + +| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints | +| :--------: | :---------------: | :-------: | :-----------: | :-----: | :-----------: | :-----------: | +| ResNet50 | Uniform | NCHW | 8 | 224 | 71.06 | [TSM_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_k400.pdparams) | + +When the test configuration uses the following parameters, the evaluation accuracy on the validation data set of UCF-101 is as follows: + +| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints | +| :------: | :-------------: | :-----------------: | :-----: | :---------: | :---: | :---------: | +| ResNet50 | Uniform | NCHW | 8 | 224 | 94.42 | [TSM_ucf101_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_nchw.pdparams) | +| ResNet50 | Uniform | NCHW+AMP | 8 | 224 | 94.40 | [TSM_ucf101_amp_nchw.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nchw.pdparams) | +| ResNet50 | Uniform | NHWC+AMP | 8 | 224 | 94.55 | [TSM_ucf101_amp_nhwc.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/TSM/TSM_ucf101_amp_nhwc.pdparams) | + +## Inference + +### export inference model + +To get model architecture file `TSM.pdmodel` and parameters file `TSM.pdiparams`, use: + +```bash +python3.7 tools/export_model.py -c configs/recognition/tsm/tsm_k400_frames.yaml \ + -p data/TSM_k400.pdparams \ + -o inference/TSM +``` + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). 
+ +### infer + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/tsm/tsm_k400_frames.yaml \ + --model_file inference/TSM/TSM.pdmodel \ + --params_file inference/TSM/TSM.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +## Implementation details + +### data processing + +- The model reads the `mp4` data in the Kinetics-400 data set, first divides each piece of video data into `num_seg` segments, and then uniformly extracts 1 frame of image from each segment to obtain sparsely sampled `num_seg` video frames. Then do the same random data enhancement to this `num_seg` frame image, including multi-scale random cropping, random left and right flips, data normalization, etc., and finally zoom to `target_size`. + +### Training strategy + +* Use Momentum optimization algorithm training, momentum=0.9 +* Using L2_Decay, the weight attenuation coefficient is 1e-4 +* Using global gradient clipping, the clipping factor is 20.0 +* The total number of epochs is 50, and the learning rate will be attenuated by 0.1 times when the epoch reaches 20 and 40 +* The learning rate of the weight and bias of the FC layer are respectively 5 times and 10 times the overall learning rate, and the bias does not set L2_Decay +* Dropout_ratio=0.5 + +### Parameter initialization + +- Initialize the weight of the FC layer with the normal distribution of Normal(mean=0, std=0.001), and initialize the bias of the FC layer with a constant of 0 + + +## Reference + +- [TSM: Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383.pdf), Ji Lin, Chuang Gan, Song Han diff --git a/english_documents/model_zoo/recognition/tsn.md b/english_documents/model_zoo/recognition/tsn.md new file mode 100644 index 000000000..cc1574459 --- /dev/null +++ b/english_documents/model_zoo/recognition/tsn.md @@ -0,0 +1,123 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/tsn.md) | English + +# TSN + +## Content + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Details](#Details) +- [Reference](#Reference) + +## Introduction + +Temporal Segment Network (TSN) is a classic 2D-CNN-based solution in the field of video classification. This method mainly solves the problem of long-term behavior recognition of video, and replaces dense sampling by sparsely sampling video frames, which can not only capture the global information of the video, but also remove redundancy and reduce the amount of calculation. The core idea is to average the features of each frame as the overall feature of the video, and then enter the classifier for classification. The model implemented by this code is a TSN network based on a single-channel RGB image, and Backbone uses the ResNet-50 structure. + +
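+The "average the features of each frame" idea from the introduction amounts to a simple segment consensus: per-segment class scores are averaged into one video-level prediction. The snippet below is a toy sketch of that consensus step, with random numbers standing in for the per-segment network outputs.
+
+```python
+import numpy as np
+
+num_seg, num_classes = 3, 400
+segment_scores = np.random.rand(num_seg, num_classes)  # stand-in for per-segment scores
+
+video_score = segment_scores.mean(axis=0)              # segment consensus: simple average
+print("predicted class id:", int(video_score.argmax()))
+```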
    + + +For details, please refer to the ECCV 2016 paper [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859) + +## Data + +PaddleVide provides training and testing scripts on the Kinetics-400 dataset. Kinetics-400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md) + +## Train + +### Kinetics-400 data set training + +#### Download and add pre-trained models + +1. Load the ResNet50 weights trained on ImageNet1000 as Backbone initialization parameters [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams), or download through the command line + + ```bash + wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams + ``` + +2. Open `PaddleVideo/configs/recognition/tsn/tsn_k400_frames.yaml`, and fill in the downloaded weight path below `pretrained:` + + ```yaml + MODEL: + framework: "Recognizer2D" + backbone: + name: "ResNet" + pretrained: fill in the path here + ``` + +#### Start training + +- Kinetics-400 data set uses 8 cards for training, the training start command for frames format data is as follows + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --validate -c configs/recognition/ tsn/tsn_k400_frames.yaml + ``` + +## Test + +Since the sampling method of the TSN model test mode is **TenCrop** with a slower speed but higher accuracy, which is different from the **CenterCrop** used in the verification mode during the training process, the verification index `topk Acc` recorded in the training log It does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. The command is as follows: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_tsn main.py --test -c configs/recognition/ tsn/tsn_k400_frames.yaml -w "output/TSN/TSN_best.pdparams" +``` + +When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows: + +| backbone | Sampling method | Training Strategy | num_seg | target_size | Top-1 | checkpoints | +| :------: | :-------------: | :---------------: | :-----: | :---------: | :---: | :----------------------------------------------------------: | +| ResNet50 | TenCrop | NCHW | 3 | 224 | 69.81 | [TSN_k400.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400.pdparams) | +| ResNet50 | TenCrop | NCHW | 8 | 224 | 71.70 | [TSN_k400_8.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/TSN_k400_8.pdparams) | +## Inference + +### export inference model + +```bash +python3.7 tools/export_model.py -c configs/recognition/tsn/tsn_k400_frames.yaml \ + -p data/TSN_k400.pdparams \ + -o inference/TSN +``` + +The above command will generate the model structure file `TSN.pdmodel` and the model weight file `TSN.pdiparams` required for prediction. 
+ +For the meaning of each parameter, please refer to [Model Reasoning Method](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-Model Reasoning) + +### infer + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/tsn/tsn_k400_frames.yaml \ + --model_file inference/TSN/TSN.pdmodel \ + --params_file inference/TSN/TSN.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +## Details + +**data processing:** + +- The model reads the `mp4` data in the Kinetics-400 data set, first divides each piece of video data into `num_seg` segments, and then evenly extracts 1 frame of image from each segment to obtain sparsely sampled `num_seg` video frames , And then do the same random data enhancement to this `num_seg` frame image, including multi-scale random cropping, random left and right flips, data normalization, etc., and finally zoom to `target_size` + +**training strategy:** + +- Use Momentum optimization algorithm for training, momentum=0.9 + +- Using L2_Decay, the weight attenuation coefficient is 1e-4 + +- Use global gradient clipping, with a clipping factor of 40.0 + +- The total number of epochs is 100, and the learning rate will be attenuated by 0.1 times when the epoch reaches 40 and 80 + +- Dropout_ratio=0.4 + +**parameter initialization** + +- The convolutional layer of the TSN model uses Paddle's default [KaimingNormal](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/initializer/KaimingNormal_cn.html#kaimingnormal) and [Constant](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/Constant_cn.html#constant) initialization method, with Normal(mean=0, std= 0.01) normal distribution to initialize the weight of the FC layer, and a constant 0 to initialize the bias of the FC layer + +## Reference + +- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool diff --git a/english_documents/model_zoo/recognition/tsn_dali.md b/english_documents/model_zoo/recognition/tsn_dali.md new file mode 100644 index 000000000..affaf0ad5 --- /dev/null +++ b/english_documents/model_zoo/recognition/tsn_dali.md @@ -0,0 +1,98 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/tsn_dali.md) | English + +# TSN DALI + +- [Introduction](#Introduction) +- [Requirement](#Requirement) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +## Introduction + +We aims to speed up TSN model training using DALI in this code. As [nvidia DALI](https://github.com/NVIDIA/DALI) not support TSN sampling way, we reimplemented segment sampling in VideoReader. 
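+The segment sampling added to the VideoReader can be summarized as: split each video into `num_seg` equal segments and draw one random frame from every segment during training. The snippet below is a plain-Python illustration of that rule under the assumption that `num_frames >= num_seg`; it is not the actual DALI operator implemented in this repository.
+
+```python
+import random
+
+def train_segment_indices(num_frames, num_seg=8, seed=None):
+    # One random frame index per segment (TSN-style sparse sampling for training).
+    rng = random.Random(seed)
+    seg_len = num_frames // num_seg  # assumes num_frames >= num_seg
+    return [i * seg_len + rng.randrange(seg_len) for i in range(num_seg)]
+
+print(train_segment_indices(num_frames=240, num_seg=8, seed=0))
+```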
+ +### Performance + +Test Environment: +``` +Card: Tesla v100 +Memory: 4 * 16G +Cuda: 9.0 +batch_size of single card: 32 +``` + +| Training way | batch cost/s | reader cost/s | ips:instance/sec | Speed up | +| :--------------- | :--------: | :------------: | :------------: | :------------: | +| DALI | 2.083 | 1.804 | 15.36597 | 1.41x | +| Dataloader: num_workers=4 | 2.943 | 2.649 | 10.87460| base | +| pytorch实现 | TODO | TODO | TODO | TODO | + + +## Requirement + +docker image: + +``` + huangjun12/paddlevideo:tsn_dali_cuda9_0 +``` + +To build container, you can use: + +```bash +nvidia-docker run --name tsn-DALI -v /home:/workspace --network=host -it --shm-size 64g -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video huangjun12/paddlevideo:tsn_dali_cuda9_0 /bin/bash +``` + +## Data + +- Kinetics400 dataset please refer to [K400 data](../../dataset/k400.md) + +- UCF101 dataset please refer to [UCF101 data](../../dataset/ucf101.md) + +## Train + +### download pretrain-model + +- Please download [ResNet50_pretrain.pdparams](https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams) as pretraind model: + +```bash +wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_pretrain.pdparams +``` + +and add path to MODEL.framework.backbone.pretrained in config file as: + +```yaml +MODEL: + framework: "Recognizer2D" + backbone: + name: "ResNet" + pretrained: your weight path +``` + +### Start training + +You can start training by: + +```bash +python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3" --log_dir=log_tsn main.py --train_dali -c configs/recognition/tsn/tsn_dali.yaml -o log_level="INFO" +``` + +- Args -c is used to specify config file,default is ```configs/recognition/tsn/tsn_dali.yaml```。 + +- For finetune please download our trained model [TSN.pdparams]()coming soon,and specify file path with --weights. + +- For the config file usage,please refer to [config](../../tutorials/config.md). + +## Test + +Please refer to [TSN Test](./tsn.md) + +## Inference + +Please refer to [TSN Inference](./tsn.md) + +## Reference + +- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool diff --git a/english_documents/model_zoo/recognition/videoswin.md b/english_documents/model_zoo/recognition/videoswin.md new file mode 100644 index 000000000..4a1415000 --- /dev/null +++ b/english_documents/model_zoo/recognition/videoswin.md @@ -0,0 +1,131 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/videoswin.md) | English + +# Video-Swin-Transformer Video Classification Model + +## content + +- [Introduction](#Introduction) +- [Data](#DATA) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + + +## Introduction + +Video-Swin-Transformer is a video classification model based on Swin Transformer. It utilizes Swin Transformer's multi-scale modeling and efficient local attention characteristics. It currently achieves SOTA accuracy on the Kinetics-400 data set, surpassing the same transformer structure. The TimeSformer model. + + +![VideoSwin](../../../images/videoswin.jpg) + +## DATA + +K400 data download and preparation please refer to [Kinetics-400 data preparation](../../dataset/k400.md) + + +## Train + +### Kinetics-400 data set training + +#### Download and add pre-trained models + +1. 
Download the image pre-training model [swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams) as the Backbone initialization parameter, or download it through the wget command + + ```bash + wget https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams # ImageNet pretrained model for VideoSwin_base + + # wget https://videotag.bj.bcebos.com/PaddleVideorelease2.2/swin_small_patch4_window7_224.pdparams # Imagenet pretrained model for VideoSwin_small + ``` + +2. Open `configs/recognition/videoswin/videoswin_base_k400_videos.yaml`, and fill in the downloaded weight storage path below `pretrained:` + + ```yaml + MODEL: + framework: "RecognizerTransformer" + backbone: + name: "SwinTransformer3D" + pretrained: fill in the path here + ``` + +#### Start training + +- The Kinetics400 data set uses 8 cards for training, and the start command of the training method is as follows: + + ```bash + # videos data format + python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_base main.py --validate -c configs/recognition/video_swin_transformer/videoswin_base_k400_videos.yaml + ``` + +- Turn on amp mixed-precision training to speed up the training process. The training start command is as follows: + + ```bash + export FLAGS_conv_workspace_size_limit=800 # MB + export FLAGS_cudnn_exhaustive_search=1 + export FLAGS_cudnn_batchnorm_spatial_persistent=1 + # videos data format + python3.7 -u -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_base main.py --amp --validate -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml + ``` + +- In addition, you can customize and modify the parameter configuration to achieve the purpose of training/testing on different data sets. It is recommended that the naming method of the configuration file is `model_dataset name_file format_data format_sampling method.yaml` , Please refer to [config](../../../zh-CN/contribute/config.md) for parameter usage. + + +## Test + +- The Video-Swin-Transformer model is verified during training. You can find the keyword `best` in the training log to obtain the model test accuracy. The log example is as follows: + + ```log + Already save the best model (top1 acc)0.7258 + ``` + +- Since the sampling method of the Video-Swin-Transformer model test mode is a bit slower but more accurate **UniformCrop**, which is different from the **CenterCrop** used in the verification mode during the training process, so the verification recorded in the training log The index `topk Acc` does not represent the final test score, so after the training is completed, you can use the test mode to test the best model to obtain the final index. 
The command is as follows: + + ```bash + python3.7 -B -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir=log_videoswin_base main.py --test -c configs/recognition/video_swin_transformer/videoswin_base_k400_videos.yaml -w "output/VideoSwin_base/VideoSwin_base_best.pdparams" + ``` + + When the test configuration uses the following parameters, the test indicators on the validation data set of Kinetics-400 are as follows: + + | backbone | Sampling method | num_seg | target_size | Top-1 | checkpoints | pretrain model | + | :--------------------: | :-------------: | :-----: | :---------: | :---- | :------------------------------------------------------------------------------------------------------------------------: | :----: | + | Swin-Transformer_base | UniformCrop | 32 | 224 | 82.40 | [SwinTransformer_k400_base.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_base_k400.pdparams) | [swin_base_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_base_patch4_window7_224.pdparams) | + | Swin-Transformer_small | UniformCrop | 32 | 224 | 80.18 | [SwinTransformer_k400_small.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/VideoSwin_small_k400.pdparams) | [swin_small_patch4_window7_224.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/swin_small_patch4_window7_224.pdparams) | + +## Inference + +### Export inference model + +```bash +python3.7 tools/export_model.py -c configs/recognition/videoswin/videoswin_base_k400_videos.yaml \ + -p data/VideoSwin_base_k400.pdparams \ + -o inference/VideoSwin_base +``` + +The above command will generate the model structure file `VideoSwin_base.pdmodel` and the model weight file `VideoSwin_base.pdiparams` required for prediction. + +- For the meaning of each parameter, please refer to [Model Inference](../../usage.md#2-infer) + +### Use predictive engine inference + +```bash +python3.7 tools/predict.py --input_file data/example.avi \ + --config configs/recognition/videoswin/videoswin_base_k400_videos.yaml \ + --model_file inference/VideoSwin_base/VideoSwin_base.pdmodel \ + --params_file inference/VideoSwin_base/VideoSwin_base.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +The output example is as follows: + +```log +Current video file: data/example.avi + top-1 class: 5 + top-1 score: 0.9999829530715942 +``` + +It can be seen that using the Video-Swin-Transformer model trained on Kinetics-400 to predict `data/example.avi`, the output top1 category id is `5`, and the confidence is 0.99. By referring to the category id and name correspondence table `data/k400/Kinetics-400_label_list.txt`, it can be known that the predicted category name is `archery`. + +## Reference + +- [Video Swin Transformer](https://arxiv.org/pdf/2106.13230.pdf), Ze Liu, Jia Ning, Yue Cao, Yixuan Wei diff --git a/english_documents/model_zoo/segmentation/asrf.md b/english_documents/model_zoo/segmentation/asrf.md new file mode 100644 index 000000000..18f7d016a --- /dev/null +++ b/english_documents/model_zoo/segmentation/asrf.md @@ -0,0 +1,139 @@ +[简体中文](../../../zh-CN/model_zoo/segmentation/asrf.md) | English + +# ASRF : Video Action Segmentation Model + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +## Introduction + +ASRF model is an improvement on the video motion segmentation model ms-tcn, which was published on WACV in 2021. 
We reproduce the authors' official PyTorch implementation and obtain comparable results in PaddleVideo. + +

    +MS-TCN Overview +

    + +## Data + +ASRF can choose 50salads, breakfast, gtea as trianing set. Please refer to Video Action Segmentation dataset download and preparation doc [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md) + +Unlike MS-TCN, ASRF model requires additional data construction. The script process is as follows +```bash +python data/50salads/prepare_asrf_data.py --dataset_dir data/ +``` + +## Train + +After prepare dataset, we can run sprits. + +```bash +# gtea dataset +export CUDA_VISIBLE_DEVICES=3 +python3.7 main.py --validate -c configs/segmentation/asrf/asrf_gtea.yaml +``` + +- Start the training by using the above command line or script program. There is no need to use the pre training model. The video action segmentation model is usually a full convolution network. Due to the different lengths of videos, the `DATASET.batch_size` of the video action segmentation model is usually set to `1`, that is, batch training is not required. At present, only **single sample** training is supported. + +## Test + +Test MS-TCN on dataset scripts: + +```bash +python main.py --test -c configs/segmentation/asrf/asrf_gtea.yaml --weights=./output/ASRF/ASRF_split_1.pdparams +``` + +- The specific implementation of the index is to calculate ACC, edit and F1 scores by referring to the test script[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the author of ms-tcn. + +The reproduction of pytorch comes from the official [code base](https://github.com/yiskw713/asrf) + +- The evaluation method of data set adopts the folding verification method in ms-tcn paper, and the division method of folding is the same as that in ms-tcn paper. + +Accuracy on Breakfast dataset(4 folding verification): + +| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | +| :---: | :---: | :---: | :---: | :---: | :---: | +| paper | 67.6% | 72.4% | 74.3% | 68.9% | 56.1% | +| pytorch | 65.8% | 71.0% | 72.3% | 66.5% | 54.9% | +| paddle | 66.1% | 71.9% | 73.3% | 67.9% | 55.7% | + +Accuracy on 50salads dataset(5 folding verification): + +| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | +| :---: | :---: | :---: | :---: | :---: | :---: | +| paper | 84.5% | 79.3% | 82.9% | 83.5% | 77.3% | +| pytorch | 81.4% | 75.6% | 82.7% | 81.2% | 77.2% | +| paddle | 81.6% | 75.8% | 83.0% | 81.5% | 74.8% | + +Accuracy on gtea dataset(4 folding verification): + +| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | +| :---: | :---: | :---: | :---: | :---: | :---: | +| paper | 77.3% | 83.7% | 89.4% | 87.8% | 79.8% | +| pytorch | 76.3% | 79.6% | 87.3% | 85.8% | 74.9% | +| paddle | 77.1% | 83.3% | 88.9% | 87.5% | 79.1% | + +Model weight for gtea +Test_Data| F1@0.5 | checkpoints | +| :----: | :----: | :---- | +| gtea_split1 | 72.4409 | [ASRF_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_1.pdparams) | +| gtea_split2 | 76.6666 | [ASRF_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_2.pdparams) | +| gtea_split3 | 84.5528 | [ASRF_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_3.pdparams) | +| gtea_split4 | 82.6771 | [ASRF_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ASRF_gtea_split_4.pdparams) | +## Infer + +### export inference model + +```bash +python3.7 tools/export_model.py -c configs/segmentation/asrf/asrf_gtea.yaml \ + -p data/ASRF_gtea_split_1.pdparams \ + -o inference/ASRF +``` + +To get model architecture file `ASRF.pdmodel` and parameters file 
`ASRF.pdiparams`, use: + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). + +### infer + +Input file are the file list for infering, for example: +``` +S1_Cheese_C1.npy +S1_CofHoney_C1.npy +S1_Coffee_C1.npy +S1_Hotdog_C1.npy +... +``` + +```bash +python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \ + --config configs/segmentation/asrf/asrf_gtea.yaml \ + --model_file inference/ASRF/ASRF.pdmodel \ + --params_file inference/ASRF/ASRF.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +```bash +result write in : ./inference/infer_results/S1_Cheese_C1.txt +result write in : ./inference/infer_results/S1_CofHoney_C1.txt +result write in : ./inference/infer_results/S1_Coffee_C1.txt +result write in : ./inference/infer_results/S1_Hotdog_C1.txt +result write in : ./inference/infer_results/S1_Pealate_C1.txt +result write in : ./inference/infer_results/S1_Peanut_C1.txt +result write in : ./inference/infer_results/S1_Tea_C1.txt +``` + + +## Reference + +- [Alleviating Over-segmentation Errors by Detecting Action Boundaries](https://arxiv.org/pdf/2007.06866v1.pdf), Yuchi Ishikawa, Seito Kasai, Yoshimitsu Aoki, Hirokatsu Kataoka diff --git a/english_documents/model_zoo/segmentation/cfbi.md b/english_documents/model_zoo/segmentation/cfbi.md new file mode 100644 index 000000000..1b5eefe22 --- /dev/null +++ b/english_documents/model_zoo/segmentation/cfbi.md @@ -0,0 +1,46 @@ +[简体中文](../../../zh-CN/model_zoo/recognition/cfbi.md) | English + +# CFBI + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Test](#Test) +- [Reference](#Reference) + +## Introduction + +CFBI is a Video Object Segmentation model proposed by Baidu in ECCV 2020. This method consider background should be equally treated and thus propose Collaborative video object segmentation by Foreground-Background Integration (CFBI) approach. Our CFBI implicitly imposes the feature embedding from the target foreground object and its corresponding background to be contrastive, promoting the segmentation results accordingly. Given the image and target segmentation of the reference frame (the first frame) and the previous frame, the model will predict the segmentation of the current frame. + +
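+The inference protocol described above (the reference frame and the previous frame drive the prediction for the current frame) can be written as a simple propagation loop. The sketch below is schematic only: `segment_fn` is a hypothetical callable standing in for the CFBI network, and the frame/mask arrays are placeholders.
+
+```python
+import numpy as np
+
+def propagate_masks(frames, first_mask, segment_fn):
+    # Frame-by-frame propagation: every prediction conditions on the reference
+    # (first) frame/mask and on the previous frame/mask.
+    masks = [first_mask]
+    for t in range(1, len(frames)):
+        cur_mask = segment_fn(
+            ref_frame=frames[0], ref_mask=masks[0],
+            prev_frame=frames[t - 1], prev_mask=masks[t - 1],
+            cur_frame=frames[t],
+        )
+        masks.append(cur_mask)
+    return masks
+
+# Toy stand-in that simply carries the previous mask forward.
+dummy_segment = lambda ref_frame, ref_mask, prev_frame, prev_mask, cur_frame: prev_mask
+frames = [np.zeros((480, 854, 3), dtype=np.uint8) for _ in range(5)]
+first_mask = np.zeros((480, 854), dtype=np.uint8)
+print(len(propagate_masks(frames, first_mask, dummy_segment)))  # 5
+```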
    + + +## Data + +Please refer to DAVIS data download and preparation doc [DAVIS-data](../../dataset/davis.md) + + +## Test + +- Test scripts: + +```bash +python3.7 main.py --test -c configs/segmentation/cfbip_davis.yaml -w CFBIp_davis.pdparams +``` + +- Predicted results will be saved in `result_root`. To get evaluation metrics, please use [davis2017-evaluation tools](https://github.com/davisvideochallenge/davis2017-evaluation). + +Metrics on DAVIS: + +| J&F-Mean | J-Mean | J-Recall | J-Decay | F-Mean | F-Recall | F-Decay | checkpoints | +| :------: | :-----: | :----: | :----: | :----: | :----: | :----: | :----: | +| 0.823 | 0.793 | 0.885 | 0.083 | 0.852 | 0.932 | 0.100 | [CFBIp_r101_davis.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/CFBIp_r101_davis.pdparams) | + + +## Reference + +- [Collaborative Video Object Segmentation by Foreground-Background Integration](https://arxiv.org/abs/2003.08333), Zongxin Yang, Yunchao Wei, Yi Yang diff --git a/english_documents/model_zoo/segmentation/mstcn.md b/english_documents/model_zoo/segmentation/mstcn.md new file mode 100644 index 000000000..7c619f2b7 --- /dev/null +++ b/english_documents/model_zoo/segmentation/mstcn.md @@ -0,0 +1,130 @@ +[简体中文](../../../zh-CN/model_zoo/segmentation/mstcn.md) | English + +# MS-TCN : Video Action Segmentation Model + +--- +## Contents + +- [Introduction](#Introduction) +- [Data](#Data) +- [Train](#Train) +- [Test](#Test) +- [Inference](#Inference) +- [Reference](#Reference) + +## Introduction + +Ms-tcn model is a classic model of video motion segmentation model, which was published on CVPR in 2019. We optimized the officially implemented pytorch code and obtained higher precision results in paddlevideo. + +

    +MS-TCN Overview +

    + +## Data + +MS-TCN can choose 50salads, breakfast, gtea as trianing set. Please refer to Video Action Segmentation dataset download and preparation doc [Video Action Segmentation dataset](../../dataset/SegmentationDataset.md) + +## Train + +After prepare dataset, we can run sprits. + +```bash +# gtea dataset +export CUDA_VISIBLE_DEVICES=3 +python3.7 main.py --validate -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --seed 1538574472 +``` + +- Start the training by using the above command line or script program. There is no need to use the pre training model. The video action segmentation model is usually a full convolution network. Due to the different lengths of videos, the `DATASET.batch_size` of the video action segmentation model is usually set to `1`, that is, batch training is not required. At present, only **single sample** training is supported. + +## Test + +Test MS-TCN on dataset scripts: + +```bash +python main.py --test -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml --weights=./output/MSTCN/MSTCN_split_1.pdparams +``` + +- The specific implementation of the index is to calculate ACC, edit and F1 scores by referring to the test script[evel.py](https://github.com/yabufarha/ms-tcn/blob/master/eval.py) provided by the author of ms-tcn. + +- The evaluation method of data set adopts the folding verification method in ms-tcn paper, and the division method of folding is the same as that in ms-tcn paper. + +Accuracy on Breakfast dataset(4 folding verification): + +| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | +| :---: | :---: | :---: | :---: | :---: | :---: | +| paper | 66.3% | 61.7% | 48.1% | 48.1% | 37.9% | +| paddle | 65.2% | 61.5% | 53.7% | 49.2% | 38.8% | + +Accuracy on 50salads dataset(5 folding verification): + +| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | +| :---: | :---: | :---: | :---: | :---: | :---: | +| paper | 80.7% | 67.9% | 76.3% | 74.0% | 64.5% | +| paddle | 81.1% | 71.5% | 77.9% | 75.5% | 66.5% | + +Accuracy on gtea dataset(4 folding verification): + +| Model | Acc | Edit | F1@0.1 | F1@0.25 | F1@0.5 | +| :---: | :---: | :---: | :---: | :---: | :---: | +| paper | 79.2% | 81.4% | 87.5% | 85.4% | 74.6% | +| paddle | 76.9% | 81.8% | 86.4% | 84.7% | 74.8% | + +Model weight for gtea + +Test_Data| F1@0.5 | checkpoints | +| :----: | :----: | :---- | +| gtea_split1 | 70.2509 | [MSTCN_gtea_split_1.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_1.pdparams) | +| gtea_split2 | 70.7224 | [MSTCN_gtea_split_2.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_2.pdparams) | +| gtea_split3 | 80.0 | [MSTCN_gtea_split_3.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_3.pdparams) | +| gtea_split4 | 78.1609 | [MSTCN_gtea_split_4.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.2/MSTCN_gtea_split_4.pdparams) | + +## Infer + +### export inference model + +```bash +python3.7 tools/export_model.py -c configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \ + -p data/MSTCN_gtea_split_1.pdparams \ + -o inference/MSTCN +``` + +To get model architecture file `MSTCN.pdmodel` and parameters file `MSTCN.pdiparams`, use: + +- Args usage please refer to [Model Inference](https://github.com/PaddlePaddle/PaddleVideo/blob/release/2.0/docs/zh-CN/start.md#2-%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86). + +### infer + +Input file are the file list for infering, for example: +``` +S1_Cheese_C1.npy +S1_CofHoney_C1.npy +S1_Coffee_C1.npy +S1_Hotdog_C1.npy +... 
+``` + +```bash +python3.7 tools/predict.py --input_file data/gtea/splits/test.split1.bundle \ + --config configs/segmentation/ms_tcn/ms_tcn_gtea.yaml \ + --model_file inference/MSTCN/MSTCN.pdmodel \ + --params_file inference/MSTCN/MSTCN.pdiparams \ + --use_gpu=True \ + --use_tensorrt=False +``` + +example of logs: + +```bash +result write in : ./inference/infer_results/S1_Cheese_C1.txt +result write in : ./inference/infer_results/S1_CofHoney_C1.txt +result write in : ./inference/infer_results/S1_Coffee_C1.txt +result write in : ./inference/infer_results/S1_Hotdog_C1.txt +result write in : ./inference/infer_results/S1_Pealate_C1.txt +result write in : ./inference/infer_results/S1_Peanut_C1.txt +result write in : ./inference/infer_results/S1_Tea_C1.txt +``` + +## Reference + +- [MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation](https://arxiv.org/pdf/1903.01945.pdf), Y. Abu Farha and J. Gall. diff --git a/english_documents/quick_start.md b/english_documents/quick_start.md new file mode 100644 index 000000000..59d95d8db --- /dev/null +++ b/english_documents/quick_start.md @@ -0,0 +1,157 @@ +English | [简体中文](../zh-CN/quick_start.md) + +# PaddleVide Quick Start + +- [1. Installation](#1) + - [1.1 Install PaddlePaddle](#11) + - [1.2 Install PaddleVideo Whl Package](#12) +- [2. Easy-to-Use](#2) + - [2.1 Use by Command Line](#21) + - [2.2 Use by Python Code](#22) +- [3. Arguments description](#3) +- [4.QA](#4) + +## 1. Installation + + +### 1.1 Install PaddlePaddle + +- If you have CUDA 9 or CUDA 10 installed on your machine, please run the following command to install + + ```bash + python3.7 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple + ``` + +- If you have no available GPU on your machine, please run the following command to install the CPU version + + ```bash + python3.7 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple + ``` + +For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. + + + +### 1.2 Install PaddleVideo Whl Package +- option1: use pypi(recommand) + +```bash +pip3.7 install ppvideo==2.3.0 +``` + + +- option2: build and install locally +```bash +python3.7 setup.py bdist_wheel +python3.7 -m pip install dist/ppvideo-2.3.0-py3-none-any.whl +``` + + +## 2. Easy-to-Use + + +### 2.1 Use by Command Line + +Run shell command: +```bash +ppvideo --model_name='ppTSM_v2' --use_gpu=False --video_file='data/example.avi' +``` + +- This command use `PP-TSM_v2` model to infer `data/example.avi` file in `CPU`. + +- The length of the example video is about 10s. When inference, the video is first divided into 16 segments according to the time axis, then extract one frame from each segment. Finally all frames are combined and feeded into the network. + +Results: + +``` +Current video file: data/example.avi + top-1 classes: [5] + top-1 scores: [1.] + top-1 label names: ['archery'] +``` + +As you can see, use `PP-TSM_v2` trained on Kinetics-400 to predict `data/example.avi` video,top1 prediction class_id is `5`, scores is `1.0`, class name is `archery`. + + +### 2.2 Use by Python Code + +Run python code: + +```python +from ppvideo import PaddleVideo +clas = PaddleVideo(model_name='ppTSM_v2', use_gpu=False) +video_file='data/example.avi' +clas.predict(video_file) +``` + +- This code use `PP-TSM_v2` model to infer `data/example.avi` file in `CPU`. 
+ +Results: +``` +Current video file: data/example.avi + top-1 classes: [5] + top-1 scores: [1.] + top-1 label names: ['archery'] +``` + +As you can see, use `PP-TSM_v2` trained on Kinetics-400 to predict `data/example.avi` video,top1 prediction class_id is `5`, scores is `1.0`, class name is `archery`. + + +## 3. Arguments description + +| name | type | description | +| :---: | :---: | :--- | +| model_name | str | optional, model name, `'ppTSM'` or `'ppTSM_v2'`. If None, please specify the path of your inference model by args `model_file` and `params_file`. | +| video_file | str | required, Video file path, supported format: single video file path, or folder containing multiple videos. | +| use_gpu | bool | whether to use GPU,default True。 | +| num_seg | int | The number of segments used in the TSM model, which is also the number of frames extracted from the video. 8 for `ppTSM`, 16 for `ppTSM_v2`, default 16. | +| short_size | int | short size of frame, default 256.| +| target_size | int | target size of frame, default 224.| +| model_file | str | optional,inference model(`.pdmodel`)path. | +| params_file | str | optional, inference modle(`.pdiparams`) path. | +| batch_size | int | Batch size, default 1.| +| use_fp16 | bool | whether to use float16,default False.| +| use_tensorrt | bool| whether to use Tensorrt, default False.| +| gpu_mem | int | use GPU memory, default 8000.| +| enable_mkldnn | bool | whether to use MKLDNN, default False.| +| top_k | int | top_k, default 1. | +| label_name_path | str | This file consists the relation of class_id and class_name. Default use `data/k400/Kinetics-400_label_list.txt` of Kinetics-400. You can replace it with your own label file. | + +command example1: +```bash +ppvideo --model_name='ppTSM_v2' --num_seg=16 --video_file="data/mp4" --batch_size=2 --top_k=5 +``` + + +Results: +```txt +Current video file: data/mp4/example3.avi + top-5 classes: [ 5 345 311 159 327] + top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] + top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] +Current video file: data/mp4/example2.avi + top-5 classes: [ 5 345 311 159 327] + top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] + top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] +Current video file: data/mp4/example.avi + top-5 classes: [ 5 345 311 159 327] + top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] + top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] +Current video file: data/mp4/example1.avi + top-5 classes: [ 5 345 311 159 327] + top-5 scores: [1.0000000e+00 1.0152016e-11 8.2871061e-14 6.7713670e-14 5.0752070e-14] + top-5 label names: ['archery', 'sword_fighting', 'skipping_rope', 'hula_hooping', 'spray_painting'] +``` + +command example1: +```bash +ppvideo --model_name='ppTSM' --num_seg=8 --video_file="data/mp4" --batch_size=2 --top_k=5 +``` + + +## 4. QA + +1. opecv-python Installation maybe slow, you can try: +``` +python3.7 -m pip install opencv-python==4.2.0.32 -i https://pypi.doubanio.com/simple +``` diff --git a/english_documents/tools.md b/english_documents/tools.md new file mode 100644 index 000000000..56138e5f2 --- /dev/null +++ b/english_documents/tools.md @@ -0,0 +1,22 @@ +[简体中文](../zh-CN/tools.md) | English + +# Tools + +This page includes the usage of some useful tools in PaddleVideo. 
+ +## Params + +To get the params of a model. + +```shell +python3.7 tools/summary.py -c configs/recognization/tsm/tsm.yaml +``` + +## FLOPS +to print FLOPs. + +```shell +python3.7 tools/summary.py -c configs/recognization/tsm/tsm.yaml --FLOPs +``` + +## Test the export model coming soon diff --git a/english_documents/tutorials/Action Recognition Datasets b/english_documents/tutorials/Action Recognition Datasets new file mode 100644 index 000000000..9bd259157 --- /dev/null +++ b/english_documents/tutorials/Action Recognition Datasets @@ -0,0 +1,12 @@ +Usefull Action Recognition Datasets. + + AVA, https://arxiv.org/abs/1705.08421 + Kinetics, https://arxiv.org/abs/1705.06950 + YouTube-8M, https://arxiv.org/abs/1609.08675 + ActivityNet, http://www.cv-foundation.org/openaccess/content_cvpr_2015/html/Heilbron_ActivityNet_A_Large-Scale_2015_CVPR_paper.html + Moments in Time, https://arxiv.org/pdf/1801.03150.pdf + Charades, https://arxiv.org/abs/1604.01753 + EPIC-Kitchens, https://arxiv.org/abs/1804.02748 + THUMOS, https://arxiv.org/abs/1604.06182 + UCF-101, http://crcv.ucf.edu/papers/UCF101_CRCV-TR-12-01.pdf + HMDB51, http://serre-lab.clps.brown.edu/wp-content/uploads/2012/08/Kuehne_etal_iccv11.pdf diff --git a/english_documents/tutorials/Action Recognition Papers b/english_documents/tutorials/Action Recognition Papers new file mode 100644 index 000000000..7282bef96 --- /dev/null +++ b/english_documents/tutorials/Action Recognition Papers @@ -0,0 +1,31 @@ +Useful Papers on Action Recognition and Video Classification. + +TSN: Temporal Segment Networks: Towards Good Practices for Deep Action Recognition, ECCV 2016 +TSM: Temporal Shift Module for Efficient Video Understanding, ICCV 2019 +SlowFast Networks for Video Recognition, ICCV 2019 +Non-local Neural Networks, CVPR 2018 +A Multigrid Method for Efficiently Training Video Models, CVPR2020 +X3D: Progressive Network Expansion for Efficient Video Recognition, CVPR2020 +ECO: Efficient Convolutional Network for Online Video Understanding, ECCV 2018 +3D Resnet: Would Mega-scale Datasets Further Enhance Spatiotemporal 3D CNNs, CVPR 2018 +TPN: Temporal Pyramid Network for Action Recognition, CVPR 2020 +EvaNet: Evolving Space-Time Neural Architectures for Videos, ICCV 2019 +RepFlow: Representation Flow for Action Recognition, CVPR 2019 +MARS: Motion-Augmented RGB Stream for Action Recognition, CVPR 2019 +StNet: Local and Global Spatial-Temporal Modeling for Human Action Recognition, AAAI 2019 +Attention Cluster: Purely Attention Based Local Feature Integration for Video Classification +NeXtVLAD: An Efficient Neural Network to Aggregate Frame-level Features for Large-scale Video Classification +C-TCN: Action localization Model by Baidu, the Champion model of ActivityNet 2018 +Neural Graph Matching Networks for Fewshot 3D Action Recognition - M. Guo et al., ECCV2018. +Temporal 3D ConvNets using Temporal Transition Layer - A. Diba et al., CVPRW2018. +Temporal 3D ConvNets: New Architecture and Transfer Learning for Video Classification - A. Diba et al., arXiv2017. +Attentional Pooling for Action Recognition - R. Girdhar and D. Ramanan, NIPS2017. +Fully Context-Aware Video Prediction - Byeon et al, arXiv2017. +Hidden Two-Stream Convolutional Networks for Action Recognition - Y. Zhu et al, arXiv2017. +Dynamic Image Networks for Action Recognition - H. Bilen et al, CVPR2016. +Long-term Recurrent Convolutional Networks for Visual Recognition and Description - J. Donahue et al, CVPR2015. +Describing Videos by Exploiting Temporal Structure - L. 
Yao et al, ICCV2015. +Real-time Action Recognition with Enhanced Motion Vector CNNs - B. Zhang et al, CVPR2016. +Action Recognition with Trajectory-Pooled Deep-Convolutional Descriptors - L. Wang et al, CVPR2015. + + diff --git a/english_documents/tutorials/Spatio-Temporal Action Detection Papers b/english_documents/tutorials/Spatio-Temporal Action Detection Papers new file mode 100644 index 000000000..f466849f6 --- /dev/null +++ b/english_documents/tutorials/Spatio-Temporal Action Detection Papers @@ -0,0 +1,30 @@ +Usefull Spatio-Temporal Action Detection Papers. + + + + A Better Baseline for AVA - R. Girdhar et al., ActivityNet Workshop, CVPR2018. + Real-Time End-to-End Action Detection with Two-Stream Networks - A. El-Nouby and G. Taylor, arXiv2018. + Human Action Localization with Sparse Spatial Supervision - P. Weinzaepfel et al., arXiv2017. + Unsupervised Action Discovery and Localization in Videos - K. Soomro and M. Shah, ICCV2017. + Spatial-Aware Object Embeddings for Zero-Shot Localization and Classification of Actions - P. Mettes and C. G. M. Snoek, ICCV2017. + Action Tubelet Detector for Spatio-Temporal Action Localization - V. Kalogeiton et al, ICCV2017. + Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos - R. Hou et al, ICCV2017. + Chained Multi-stream Networks Exploiting Pose, Motion, and Appearance for Action Classification and Detection - M. Zolfaghari et al, ICCV2017. + TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal - H. Zhu et al., ICCV2017. + Online Real time Multiple Spatiotemporal Action Localisation and Prediction - G. Singh et al, ICCV2017. + AMTnet: Action-Micro-Tube regression by end-to-end trainable deep architecture - S. Saha et al, ICCV2017. + Am I Done? Predicting Action Progress in Videos - F. Becattini et al, BMVC2017. + Generic Tubelet Proposals for Action Localization - J. He et al, arXiv2017. + Incremental Tube Construction for Human Action Detection - H. S. Behl et al, arXiv2017. + Multi-region two-stream R-CNN for action detection - X. Peng and C. Schmid. ECCV2016. + Spot On: Action Localization from Pointly-Supervised Proposals - P. Mettes et al, ECCV2016. + Deep Learning for Detecting Multiple Space-Time Action Tubes in Videos - S. Saha et al, BMVC2016. + Learning to track for spatio-temporal action localization - P. Weinzaepfel et al. ICCV2015. + Action detection by implicit intentional motion clustering - W. Chen and J. Corso, ICCV2015. + Finding Action Tubes - G. Gkioxari and J. Malik CVPR2015. + APT: Action localization proposals from dense trajectories - J. Gemert et al, BMVC2015. + Spatio-Temporal Object Detection Proposals - D. Oneata et al, ECCV2014. + Action localization with tubelets from motion - M. Jain et al, CVPR2014. + Spatiotemporal deformable part models for action detection - Y. Tian et al, CVPR2013. + Action localization in videos through context walk - K. Soomro et al, ICCV2015. + Fast Action Proposals for Human Action Detection and Search - G. Yu and J. Yuan, CVPR2015. diff --git a/english_documents/tutorials/TSM.md b/english_documents/tutorials/TSM.md new file mode 100644 index 000000000..a0db9aa9b --- /dev/null +++ b/english_documents/tutorials/TSM.md @@ -0,0 +1,73 @@ +# 1. Background&Motivation +At present, the video data on the Internet is increasing rapidly, and the time users spend watching short videos and small videos is also increasing rapidly. 
How to analyze, process and classify these massive video resources quickly and accurately is an urgent problem. Video understanding technology can analyze video content along multiple dimensions, understand its semantics, and automatically classify and label videos, which greatly reduces the cost of manual review while enabling accurate recommendation and a better user experience.
+In this article we introduce **TSM (Temporal Shift Module)**, a classic model in the field of video understanding proposed by `Ji Lin, Chuang Gan and Song Han` of **MIT** and the **IBM Watson AI Lab**, which strikes a balance between efficiency and performance and improves video understanding ability.
+
+The works most closely related to TSM are the **Temporal Segment Network (TSN)** published by Limin Wang et al., and a series of works represented by I3D, S3D and P3D, which perform end-to-end joint spatial-temporal modeling through 3D convolution. Although the latter can capture spatial-temporal features, the transition from 2D to 3D convolution inevitably introduces extra computation compared with TSN. TSM cleverly uses the idea of shifting feature maps along the temporal dimension, achieving feature fusion and joint modeling across frames with, in theory, zero extra computation compared with TSN.
+
+**Paper Address:** [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/pdf/1811.08383v2.pdf)
+
+Consider the following example: if the same video is played from left to right and then from right to left, subjects will give different, yet correct, interpretations of it, which shows that understanding a video depends strongly on its temporal information. This is exactly the motivation behind TSM.
+


+
+It looks interesting. Next, let's dive into the core modules of TSM.
+
+# 2. Dark technologies used in TSM
+
+On top of traditional image analysis, video analysis requires researchers to add structures that model temporal information. At present, 2D CNNs and 3D CNNs are the two most commonly used approaches in video understanding: 2D CNN models require less computation but lose part of the temporal information, while 3D CNNs work well but are computationally expensive. Faced with this trade-off, Ji Lin, Chuang Gan and Song Han et al. from MIT and the IBM Watson AI Lab proposed the Temporal Shift Module (TSM). By embedding this temporal shift module into a 2D CNN, one can easily reach the video understanding ability of a 3D CNN without adding any extra computation or parameters.
+


+
+The rows and columns of the matrix in the figure above represent the temporal and channel dimensions of the feature map, respectively. In the TSM module, some channels are shifted forward by one step along the temporal dimension, some channels are shifted backward by one step, and the gaps left by the shift are filled with zeros. In this way, temporal context interaction is introduced into the feature map: the shift lets the current frame carry channel information from its two neighboring frames, so a 2D convolution can directly extract spatial-temporal information from the video, much like a 3D convolution, which improves the model's temporal modeling ability. On this basis, the researchers further divided the module into a variant suitable for online video and a variant suitable for offline video.
+

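+
+To make the shift described above concrete, here is a tiny, self-contained NumPy sketch (illustrative only, not code from TSM or PaddleVideo) that shifts one channel backward and one channel forward along the temporal dimension of a toy `[T, C]` feature matrix, zero-filling the gaps:
+
+```python
+import numpy as np
+
+T, C = 4, 4                          # 4 frames, 4 channels per frame
+x = np.arange(T * C).reshape(T, C)   # toy feature matrix, rows = time, cols = channels
+
+out = np.zeros_like(x)
+out[:-1, 0] = x[1:, 0]               # channel 0: frame t receives channel values from frame t+1
+out[1:, 1] = x[:-1, 1]               # channel 1: frame t receives channel values from frame t-1
+out[:, 2:] = x[:, 2:]                # remaining channels: no shift
+
+print(x)
+print(out)                           # each row (frame) now mixes information from its neighbours
+```
+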

+
+The bi-directional TSM module mixes past and future spatial-temporal information and is therefore suitable for high-throughput offline video. The uni-directional TSM module, which only mixes the current frame with past information, is suitable for low-latency online video recognition.
+In addition, the authors also studied where to insert the TSM module and compared two insertion methods: **residual TSM** and **in-place TSM**. They found that **residual TSM** achieves better performance than **in-place TSM**, and they explain that **in-place TSM** may harm the extraction of spatial information.
+

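+
+The difference between the two insertion positions can be summarized in a few lines of pseudo-code (a framework-agnostic sketch that assumes `shift` and `conv` stand for the temporal-shift and convolution operations; this is not code from the paper or from PaddleVideo):
+
+```python
+def in_place_tsm(x, shift, conv):
+    # The shifted tensor replaces the original activation, so part of the
+    # spatial information of the current frame is lost before the convolution.
+    return conv(shift(x))
+
+
+def residual_tsm(x, shift, conv):
+    # The shift is applied on the residual branch only; the identity path
+    # keeps the original spatial information of the current frame intact.
+    return x + conv(shift(x))
+```
+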

+
+The TSM module looks **so easy!** The next question is how to implement it.
+
+# 3. The core codes of TSM
+
+Now that the principle is clear, let's look at how the code works. First, let's look at the PyTorch version of TSM. Unfortunately, PyTorch does not provide a built-in operator for TSM, so we have to implement it ourselves. The code is shown below:
+

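+
+The original figure with the PyTorch snippet is not reproduced here, so below is a minimal sketch in the spirit of the official PyTorch implementation (the function name and the `fold_div` ratio are illustrative; see the paper's repository for the exact code):
+
+```python
+import torch
+
+def temporal_shift(x, n_segment, fold_div=8):
+    # x: [N*T, C, H, W] -- the frames of each video are stacked along the batch axis
+    nt, c, h, w = x.size()
+    n_batch = nt // n_segment
+    x = x.view(n_batch, n_segment, c, h, w)
+
+    fold = c // fold_div
+    out = torch.zeros_like(x)
+    out[:, :-1, :fold] = x[:, 1:, :fold]                  # first fold of channels: frame t sees frame t+1
+    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # second fold: frame t sees frame t-1
+    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # remaining channels: untouched
+    return out.view(nt, c, h, w)
+```
+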

+
+This means that you only need to add a few lines of code to the TSN codebase and you can **double the accuracy on the Something-Something dataset!** What a simple and efficient model!
+
+But...,
+
+The **PaddlePaddle** framework takes the needs of its users into account and ships a ready-made TSM operator, so users can call it directly.
+

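+
+The screenshot of the operator is not reproduced here; the following is a minimal, runnable sketch of calling the built-in operator (the tensor shape, `num_seg` and the shift ratio below are illustrative values, not settings required by PaddleVideo):
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+num_seg = 8                                 # frames sampled per video
+x = paddle.randn([2 * num_seg, 64, 7, 7])   # [N*T, C, H, W]: 2 videos, 8 frames each
+
+# Shift 1/num_seg of the channels forward and 1/num_seg backward along time.
+y = F.temporal_shift(x, seg_num=num_seg, shift_ratio=1.0 / num_seg)
+print(y.shape)                              # [16, 64, 7, 7] -- same shape as the input
+```
+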

    + +So you no longer have to achieve it by yourself, **it cab be called directly!!! , it can be called directly!!! , it can be called directly!!!** The important thing must say three times. + +Do you think that it is the end of the this topic? **Too young Too simple !!!** + +We have also optimized it to increase speed by 5 times while reducing memory consumption. See the acceleration documentation [accelerate.md](./accelerate.md) for more information. + +Let's have a look at how TSM is implemented using **paddlepaddle**: + +`import paddle.nn.functional as F` + + +`shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg)` + +**Only two lines codes !!!**, isn't it easy ? + +# Reference +[1] [Lin Ji , Gan Chuang , Han Song . TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383,2018](https://arxiv.org/pdf/1811.08383v2.pdf). + + +[2] [Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoo Tang,and Luc Van Gool. Temporal segment networks for action recognition in videos? In Proceedings of the European Conference on Computer Vision,pages 20–36. Springer, 2016](https://arxiv.org/abs/1608.00859). diff --git a/english_documents/tutorials/Temporal Action Detection Papers b/english_documents/tutorials/Temporal Action Detection Papers new file mode 100644 index 000000000..dc475d3de --- /dev/null +++ b/english_documents/tutorials/Temporal Action Detection Papers @@ -0,0 +1,24 @@ +Usefull Temporal Action Detection Papers. + + Rethinking the Faster R-CNN Architecture for Temporal Action Localization - Yu-Wei Chao et al., CVPR2018 + Weakly Supervised Action Localization by Sparse Temporal Pooling Network - Phuc Nguyen et al., CVPR 2018 + Temporal Deformable Residual Networks for Action Segmentation in Videos - P. Lei and S. Todrovic., CVPR2018. + End-to-End, Single-Stream Temporal Action Detection in Untrimmed Videos - Shayamal Buch et al., BMVC 2017 + Cascaded Boundary Regression for Temporal Action Detection - Jiyang Gao et al., BMVC 2017 + Temporal Tessellation: A Unified Approach for Video Analysis - Kaufman et al., ICCV2017. + Temporal Action Detection with Structured Segment Networks - Y. Zhao et al., ICCV2017. + Temporal Context Network for Activity Localization in Videos - X. Dai et al., ICCV2017. + Detecting the Moment of Completion: Temporal Models for Localising Action Completion - F. Heidarivincheh et al., arXiv2017. + CDC: Convolutional-De-Convolutional Networks for Precise Temporal Action Localization in Untrimmed Videos - Z. Shou et al, CVPR2017. + SST: Single-Stream Temporal Action Proposals - S. Buch et al, CVPR2017. + R-C3D: Region Convolutional 3D Network for Temporal Activity Detection - H. Xu et al, arXiv2017. [code] [project web] [PyTorch] + DAPs: Deep Action Proposals for Action Understanding - V. Escorcia et al, ECCV2016. + Online Action Detection using Joint Classification-Regression Recurrent Neural Networks - Y. Li et al, ECCV2016. + Temporal Action Localization in Untrimmed Videos via Multi-stage CNNs - Z. Shou et al, CVPR2016. + Fast Temporal Activity Proposals for Efficient Detection of Human Actions in Untrimmed Videos - F. Heilbron et al, CVPR2016. + Actionness Estimation Using Hybrid Fully Convolutional Networks - L. Wang et al, CVPR2016. + Learning Activity Progression in LSTMs for Activity Detection and Early Detection - S. Ma et al, CVPR2016. + End-to-end Learning of Action Detection from Frame Glimpses in Videos - S. Yeung et al, CVPR2016. + Fast Action Proposals for Human Action Detection and Search - G. Yu and J. 
Yuan, CVPR2015. + Bag-of-fragments: Selecting and encoding video fragments for event detection and recounting - P. Mettes et al, ICMR2015. + Action localization in videos through context walk - K. Soomro et al, ICCV2015. diff --git a/english_documents/tutorials/accelerate.md b/english_documents/tutorials/accelerate.md new file mode 100644 index 000000000..da2032d19 --- /dev/null +++ b/english_documents/tutorials/accelerate.md @@ -0,0 +1 @@ +[简体中文](../../zh-CN/tutorials/accelerate.md) | English diff --git a/english_documents/tutorials/config.md b/english_documents/tutorials/config.md new file mode 100644 index 000000000..20b3e4880 --- /dev/null +++ b/english_documents/tutorials/config.md @@ -0,0 +1,131 @@ +# Configs design + +--- +This page shows how PaddleVideo use the basic IOC/DI technology to decouple and control the whole framework. It is flexible to increase modularity of this system and make it extensible. At last, we will explain the details of config yaml and script args. + + +## Design + +First, when we create a new class, it is common to new a instance like: + +```python +class TSM(): + pass + +model = TSM(init_attributes) +``` + +when more classes are created, the coupling relationship between the calling and called method will increase sharply, obviously, we can create a factory class to solve it, like that: + +```python +if model_name == "TSM": + model = TSM() +elif model_name == "TSN": + model = TSN() +elif ... +``` +and + +```python +optimizer_cfg = dict(name:"MOMENTUM", params: XXX) +if optimizer_cfg.name = "MOMENTUM": + optimizer = MOMENTUM(optimizer_cfg.pop(name)) +elif: + ... +``` + +more and more conditions have to be created though. like widly used in the Java or other platforms, we apply ```inversion of control``` and ```Dependency Inversion``` to decuople. + +Second, to implenment DI, we build two components: + +- Register, to regist a class +- Builder, to new an instance + +1. Register + +We implenment a getter and a setter function to map string to an instance. +[source code](../../paddlevideo/utils/registry.py) + +```python +#excerpt from source code. +class Registry(): + def __init__(self, name): + self._name = name + self._obj_map = {} + + #mapping name -> object + def register(self, obj, name): + self._obj_map[name] = obj + + #get object + def get(self, name): + ret = self._obj_map.get(name) + return ret +``` + +It provides name -> object mapping. For example, To register an object: +```python + + BACKBONES = Registry('backbone') + class ResNet: + pass + BACKBONES.register(ResNet) +``` + +Or, use a decorator +```python + BACKBONES = Registry('backbone') #new a Register + @BACKBONES.register() #regist resnet as a backbone. + class ResNet: + pass +``` + +2. Builder + +To obtain a registed module. +```python + # Usage: To build a module. + + backbone_name = "ResNet" + b = BACKBONES.get(backbone_name)() +``` + +so that we can new(register) an instance in **where it declared**, not **where it called**, a basic DI sub-system has been created now. + +We apply this design on many places, such as: PIPELINE, BACKBONE, HEAD, LOSS, METRIC and so on. + +Finally, We build all of the framework components from config yaml which matches the source code one by one, **It means the attributes in a configuration field is same as the init atrributes of the mathced class**, and to indicate a specified class, we always use ```name``` to mark it. like: + +```yaml +head: + name: "TSMHead" # class name + num_classes: 400 # TSMHead class init attributes + ... 
+``` + +--- + +## config yaml details + +We separate the config to several parts, in high level: + +- **MODEL:** Architecture configuration, such as HEAD module, BACKBONE module. +- **DATASET:** DATASET and dataloader configuration. +- **PIPELINE:** pipeline of processing configuration. +- **OPTIMIZER:** Optimizer configuration. + +and some unique global configurations, like +- model_name +- log_interval +- epochs +- resume_epoch +- log_level +... + +Training script args + +- **--validate**: switch validate mode on or not +- **--test**: switch test mode on or not +- **--weights**: weights path +- **-c**: config yaml path +- **-o**: override args, one can use it like: -o DATASET.batch_size=16 diff --git a/english_documents/tutorials/customized_usage.md b/english_documents/tutorials/customized_usage.md new file mode 100644 index 000000000..ca348179c --- /dev/null +++ b/english_documents/tutorials/customized_usage.md @@ -0,0 +1,44 @@ +[简体中文](../../zh-CN/tutorials/customized_usage.md) | English + +# Customized Usage + +## Customized Dataset + +1. finetune + +Please refer to [finetune](../start.md#model_finetune) if only change a "regular" dataset. + +2. customized pipeline + + - add new augments + - add new batch augments + **Note**: Be care of checking the difference of different modes. + +## Customized Network + +1. module function + +Please refer to [modular desigh](modular_design.md) for more information. + +2. customized framework + + - change framework + - change initialized function + - customized loss + +## Customized Solvers + +1. step decay and epoch decay + +2. customized solvers + +## Customized metrics + + - add new data processing + - add new record + - add new metrics + +## Debug tools + +1. Debug level +2. FAQ diff --git a/english_documents/tutorials/demos b/english_documents/tutorials/demos new file mode 100644 index 000000000..2228d3048 --- /dev/null +++ b/english_documents/tutorials/demos @@ -0,0 +1,8 @@ +some useful demo todo. + +1、single-class action recognition, tsn/tsm/slowfast +2、multi-class action recognition,lstm +3、action localization,bmn +4、spatio temporal action detection,todo +5、3000-class tagging application(videotag):tsn+lstm +6、Highlights detection application:bmn+tsn+lstm diff --git a/english_documents/tutorials/deployment.md b/english_documents/tutorials/deployment.md new file mode 100644 index 000000000..c88329f45 --- /dev/null +++ b/english_documents/tutorials/deployment.md @@ -0,0 +1,48 @@ +[简体中文](../../zh-CN/tutorials/deployment.md) | English + +# Inference + +## How to convert dygraph model to static model? +To infer and deploy a model, we need export an inference model, or called to_static: `convert dygraph model to static model`, at first. + +```python +python3.7 tools/export_model.py -c config_file -o output_path -p params_file +``` + +Note: In `export_model.py`, It will build a model again, and then loading the prarams. But some init params in the infer phase is different from the train phase. +we add `num_seg` for TSM in advanced, please add more params or modify them if it is necessary. +please refer to [official documents](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/04_dygraph_to_static/index_cn.html) for more information. + +## How to test the export model? + +PaddleVideo supports a test script to test the exported model. + +```python +python3.7 tools/test_export_model.py -p params_file -i inference_folder -c config_file +``` + +We just print the output shape, please feel free to extend it. 
Avtually, only test a video file by PaddleInference can make sure the exported model is right. + +## How to use PaddleInference? +PaddleVideo supports ```tools/predict.py``` to infer + +```python +python3.7 tools/predict.py -v example.avi --model_file "./inference/example.pdmodel" --params_file "./inference/example.pdiparams" --enable_benchmark=False --model="example" --num_seg=8 + ``` + +## How to test inference speed? +PaddleVideo support a script to test inference speed + +```python +python3.7 tools/predict.py --enable_benchmark=True --model_file=模型文件 --params_file=参数文件 +``` +## How to use C++ infer? + coming soon + +# Deployment + +## How to use PaddleHub Serving deploy? + coming soon + +## How to use PaddleLite deploy? + coming soon diff --git a/english_documents/tutorials/modular_design.md b/english_documents/tutorials/modular_design.md new file mode 100644 index 000000000..a426ef5c7 --- /dev/null +++ b/english_documents/tutorials/modular_design.md @@ -0,0 +1 @@ +[简体中文](../../zh-CN/tutorials/modular_design.md) | English diff --git a/english_documents/tutorials/pp-tsm.md b/english_documents/tutorials/pp-tsm.md new file mode 100644 index 000000000..d3ed2dbb7 --- /dev/null +++ b/english_documents/tutorials/pp-tsm.md @@ -0,0 +1,32 @@ +# High performance recognition 2D architecture PP-TSM + +PP-TSM:An Effective and Efficient video-recognition model + +PP-TSM is an optimized model based on TSM in PaddleVideo, +whose performance (top-1 on UCF101 and Kinetics400) and inference spped +are better than TSM paper(https://arxiv.org/abs/1811.08383 ) and +other open source TSM,PaddlePaddle2.0(available on pip now) or +Daily Version( https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev ) +is required to run PP-TSM. + +When only use ImageNet for pretrain and only use 8X1 sample, +PP-TSM’s top1 reached to 89.5% and 73.5% on UCF101 and Kinetics400, +and inference speed of FP32 on single V100 is 147 VPS on Kinectics400 dataset. +inference speed of FP16 with TensorRT on single V100 isTODO. + +As far as we know, under the same conditions, +top1=73.5% on Kinetics400 is the best performance for 2D video model until now. + + +PP-TSM improved performance and speed of TSM with following methods: +1、Model Tweaks: ResNet50vd ,+2.5% +2、ImageNet pretrain weights based on Knowledge Distillation , +1.3% +3、beter batch size ,+0.2% +4、beter L2 ,+0.3% +5、label_smoothing ,+0.2% +6、beter lr decay ,+0.15% +7、Data augmentation ,+0.3% +8、beter epoch num ,+0.15% +9、bn strategy ,+0.4% +10、integrated PaddleInference +11、more strategies todo: Knowledge Distillation、optimizer and so on. diff --git a/english_documents/tutorials/summarize.md b/english_documents/tutorials/summarize.md new file mode 100644 index 000000000..6bd49bf7d --- /dev/null +++ b/english_documents/tutorials/summarize.md @@ -0,0 +1,208 @@ +[简体中文](../../zh-CN/tutorials/summarize.md) | English + +# Introduction for video classification(action recognition) + +## Wide range of application scenarios +Video classification has a wide range of applications in many fields, such as online video platforms such as short videos, offline such as security, transportation, quality inspection and other fields。 + + +## Multiple subtasks +Similar to image tasks, video tasks can also be divided into two categories: **classification (recognition) and detection**, and these two types of tasks can be specifically subdivided by combining different scenes: + ++ Task1:Trimmed Action Recognition. 
The user inputs a trimmed video that contains only a single action, and the model outputs a video tag, as depicted in the figure below:
+

+*Figure: Action Classification*

+
+ In terms of the data modality used, classification tasks can be further subdivided into classification based on single-modality data, classification based on multi-modality data, classification based on RGB images, classification based on human skeletons, etc., as shown in the figure below:
+

+*Figure: Multi-modality*

+In terms of viewpoint, the task can also be divided into first-person action recognition,
+third-person action recognition, single-view action recognition and multi-view fusion action recognition.
+Readers who are interested in these areas can refer to the relevant literature.
+
++ Task2: Untrimmed Video Classification.
+Unlike trimmed videos, untrimmed videos often contain multiple actions and span a long time,
+including many movements that we may not care about. The task is to analyze the whole long input video globally and assign it soft labels over multiple categories.
+
++ Task3: Temporal Action Proposal. This is similar to ROI extraction in image detection:
+the task is to find the video clips that may contain actions in a long video containing many actions.
+
++ Task4: Temporal Action Localization. Compared with the temporal action proposal task above,
+temporal action localization is closer to a detection task in the image domain:
+it requires not only finding the video segments that may contain actions but also classifying them,
+as shown in the figure below:
+

+*Figure: Action Detection*

    + ++ Task5:Dense-Captioning Events. The reason why it is called dense captioning events is mainly +because that this task requires video action description on the basis of temporal action localization +(detection). That is to say, the task needs to locate the actions in a **untrimmed** video,in **temporal +dimension** and describe the behavior of the **whole video** after obtaining many video segments which contain actions. + +## Introduction of datasets + +### Classification datasets + +The training and validation of the model cannot be done without comprehensive, +large and well annotated datasets. With the deepening of research on video action recognition, +more and more datasets are applied to the research in this field. +Typical datasets are as follows: + ++ KTH[1](#1) + +KTH dataset is an early small action recognition dataset, +including 599 videos of 6 types of actions (walking, jumping, running, punching, waving and clapping). +The background is relatively still, except for the zoom in and out of the camera, +the camera movement is relatively slight. Since this data set is relatively small, +it is easy to overfit when training heavy 3D networks, +so most current researches are not based on this it. + ++ UCF10[2](#2) + +UCF101 is a medium-size dataset in which most videos are from YouTube. +It contains 13,320 videos with 101 types of actions. +Each type of action is performed by 25 people, each of whom performs 4-7 sets of actions. +The UCF101 and HMDB51 datasets used to be the benchmarks to evaluate the effectiveness of action +recognition model for a long time before the Kinetics dataset was released. + ++ HMDB51[3](#3) + +Brown University's proposed dataset named HMDB51 was released in 2011. +Most of the videos come from movies, +but some come from public databases and online video libraries such as YouTube. +The datasets contains 6849 samples divided into 51 classes, +each of which contains at least 101 samples. + ++ Kinetics[4](#4) + +Kinetics is the most important large-scale action recognition dataset, which was proposed by Google's DeepMind team in 2017. The video data also comes from YouTube, with 400 categories (now expanded to 700 categories) and more than 300,000 videos (now expanded to 600,000 videos), each lasting about 10 seconds. +The action categories are mainly divided into three categories: "human", "human and animal", "human and human interaction". Kinetics can train 3D-RESNET up to 152 layers without over-fitting, +which solves the problem that the previous training dataset is too small to train deep 3D network. +Kinetics has replaced UCF101 and HMDB51 as the benchmark in the field of action recognition. +At present, most studies use this dataset for evaluation and pre-training. + ++ Something-Something[5](#5) + +SomethingV1 contains 108,499 annotated videos (V2 has expanded to 220,847), each of which last two to six seconds. These videos contain 174 kinds of actions. Different from the previous dataset, +the identification of this data set requires stronger time information, +so this dataset has a very important reference value in testing the temporal modeling ability of the model. + +In addition to the above datasets, there are Charades[6](#6) dataset for complex Action recognition, Breakfast Action[7](#7), and Sports 1M[8](#8). + + +### Detection datasets + ++ THUMOS 2014 + +This dataset is from THUMOS Challenge 2014, Its training set is UCF101, validation set and test set include 1010 and 1574 undivided video clips respectively. 
In the action detection task, only 20 kinds of unsegmented videos of actions were labeled with sequential action fragments, +including 200 validation sets (3007 action fragments) and 213 test sets (3358 action fragments). + ++ MEXaction2 + +The Mexaction2 dataset contains two types of action: horse riding and bullfighting. +The dataset consists of three parts: YouTube videos, horseback riding videos in UCF101, and INA videos. +YouTube clips and horseback riding videos in UCF101 are short segmented video clips that are used as training sets. +The INA video is a long unsegmented video with a total length of 77 hours, +and it is divided into three parts: training, validation and test. +There are 1336 action segments in the training set, 310 in the validation set and 329 in the test set. +Moreover, the Mexaction2 dataset is characterized by very long unsegmented video lengths, +and marked action segments only account for a very low proportion of the total video length. + ++ ActivityNet + +At present the largest database, also contains two tasks of classification and detection. +This dataset only provides a YouTube link to the video, not a direct download of the video, +so you also need to use the YouTube download tool in Python to automatically download the videos. +The dataset contains 200 action categories, 20,000 (training + verification + test set) videos, +and a total of about 700 hours of video. + +## Introduction of classic models +As shown in the figure, +the action recognition framework mainly includes three steps: +feature extraction, motion representation and classification. +How to extract spatiotemporal features of video is the core problem of action recognition and video classification. +

+*Figure: Framework of action recognition*

+Action recognition (video classification) methods can be broadly grouped into two stages:
+hand-crafted feature-based methods and deep learning-based methods.
+Typical motion descriptors of the hand-crafted stage include DT and IDT (dense trajectories and improved dense trajectories),
+which were the motion descriptors most widely accepted by researchers before deep learning was applied to this field;
+interested readers may refer to the relevant references at the end of this article.
+Since 2014, deep learning methods have gradually been applied to video classification.
+Today, deep learning-based methods are a research hotspot in both academia and industry, and their performance goes far beyond hand-designed motion features.
+Since 2014, researchers have proposed many classic network architectures for representing motion,
+as shown in the figure below:
+

+*Figure: Classic Models*

+
+At present, PaddleVideo already contains several classic models such as TSN[9](#9), TSM[10](#10) and SlowFast[11](#11). In the future
+we will analyze the classic models and papers in these fields, so please stay tuned.
+
+
+## Introduction of competitions
++ [ActivityNet](http://activity-net.org/challenges/2020/challenge.html)
+
+ActivityNet is a large-scale action recognition competition. Since 2016
+it has been held together with CVPR every year, for four consecutive editions so far.
+It focuses on identifying everyday, high-level, goal-oriented activities from
+user-generated videos taken from the Internet video portal YouTube.
+At present, the ActivityNet challenge is the most influential competition in the field of action recognition.
+
+## Reference
+
+[1] Schuldt C, Laptev I, Caputo B. Recognizing Human Actions: A Local SVM Approach. In Proceedings of the International Conference on Pattern Recognition. Piscataway, NJ: IEEE, 2004: 23-26.
+
+[2] Soomro K, Zamir A R, Shah M. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. arXiv:1212.0402, 2012.
+
+[3] Kuehne H, Jhuang H, Garrote E, et al. HMDB: A Large Video Database for Human Motion Recognition. In Proceedings of the IEEE International Conference on Computer Vision. Piscataway, NJ: IEEE, 2011: 2556-2563.
+
+[4] Carreira J, Zisserman A. Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2017: 6299-6308.
+
+[5] Goyal R, Kahou S E, Michalski V, et al. The "Something Something" Video Database for Learning and Evaluating Visual Common Sense. arXiv:1706.04261, 2017.
+
+[6] Sigurdsson G A, Varol G, Wang X, et al. Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. arXiv:1604.01753, 2016.
+
+[7] Kuehne H, Arslan A, Serre T. The Language of Actions: Recovering the Syntax and Semantics of Goal-Directed Human Activities. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014.
+
+[8] Karpathy A, Toderici G, Shetty S, et al. Large-Scale Video Classification with Convolutional Neural Networks. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Piscataway, NJ: IEEE, 2014: 1725-1732.
+
+[9] Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L. Temporal Segment Networks for Action Recognition in Videos. In Proceedings of the European Conference on Computer Vision, pages 20-36. Springer, 2016.
+
+[10] Lin J, Gan C, Han S. TSM: Temporal Shift Module for Efficient Video Understanding. arXiv:1811.08383, 2018.
+
+[11] Feichtenhofer C, Fan H, Malik J, et al. SlowFast Networks for Video Recognition. arXiv:1812.03982, 2018.
    + + diff --git a/english_documents/usage.md b/english_documents/usage.md new file mode 100644 index 000000000..612fd5767 --- /dev/null +++ b/english_documents/usage.md @@ -0,0 +1,177 @@ +[简体中文](../zh-CN/usage.md) | English + +# Usage +--- + +Please refer to [installation documents](./install.md) to prepare the enviroment, and follow the steps mentioned in the [data preparation documents](./dataset/) to construct dataset, we will take you through the basic functions supported by PaddleVideo, all of it takes the ucf101 dataset with frame format as example. + +PaddleVideo only support linux operation system and GPU running time environment now. + +Default detination folder of PaddleVideo files. running the [example config](../../configs/example.yaml) as example. + +``` +PaddleVideo + ├── paddlevideo + ├── ... #other source codes + ├── output #ouput destination + | ├── example + | | ├── example_best.pdparams #path_to_weights + | | └── ... + | └── ... + ├── log #log file destination. + | ├── worker.0 + | ├── worker.1 + | └── ... + └── inference #inference files destination. + ├── .pdiparams file + ├── .pdimodel file + └── .pdiparmas.info file +``` + + +## 1. Train and Test + +Start running multi-cards training scripts or test scripts by `paddle.distributed.launch`, or run the `run.sh` directly. + +```bash +sh run.sh +``` + +We put all the start commands in advanced in the ```run.sh```, please uncomment the selected one to run. + + + +### 1.1 Train + +Switch `--validate` on to validating while training. + +```bash + +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +python3 -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + main.py \ + --validate \ + -c ./configs/example.yaml +``` + +Indicating `-c` to set configuration, and one can flexible add `-o` in the script to update it. + +```bash +python -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + main.py \ + -c ./configs/example.yaml \ + --validate \ + -o DATASET.batch_size=16 +``` +Indicating `-o DATASET.batch_size=16` can update batch size to 16, please refer to [configuration](tutorials/config.md#config-yaml-details) for more information. + +After starting training, log files will generated, and its format is shown as below, it will output to both the screen and files. Default destination of log is under the `.log/` folder, and stored in the files named like `worker.0`, `worker.1` ... + +[train phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.: + + [12/28 17:31:26] epoch:[ 1/80 ] train step:0 loss: 0.04656 lr: 0.000100 top1: 1.00000 top5: 1.00000 elapse: 0.326 reader: 0.001s ips: 98.22489 instance/sec. + +[eval phase] current time, current epoch/ total epoch, batch id, metrics, elapse time, ips, etc.: + + + [12/28 17:31:32] epoch:[ 80/80 ] val step:0 loss: 0.20538 top1: 0.88281 top5: 0.99219 elapse: 1.589 reader: 0.000s ips: 20.14003 instance/sec. + + +[epoch end] current time, metrics, elapse time, ips, etc. + + [12/28 17:31:38] END epoch:80 val loss_avg: 0.52208 top1_avg: 0.84398 top5_avg: 0.97393 elapse_avg: 0.234 reader_avg: 0.000 elapse_sum: 7.021s ips: 136.73686 instance/sec. + +[the best Acc] + + [12/28 17:28:42] Already save the best model (top1 acc)0.8494 + + +### 1.2 Resume + +Indicate `-o resume_epoch` to resume, It will training from ```resume_epoch``` epoch, PaddleVideo will auto load optimizers parameters and checkpoints from `./output` folder, as it is the default output destination. 
+ +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +python3 -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + main.py \ + -c ./configs/example.yaml \ + --validate \ + -o resume_epoch=5 + +``` + + +### 1.3 Finetune + +Indicate `--weights` to load pretrained parameters, PaddleVideo will auto treat it as a finetune mission. + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +python3 -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + main.py \ + -c ./configs/example.yaml \ + --validate \ + --weights=./outputs/example/path_to_weights +``` + +Note: PaddleVideo will NOT load shape unmatched parameters. + + +### 1.4 Test + +Switch `--test` on to start test mode, and indicate `--weights` to load pretrained model. + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +python3 -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + main.py \ + -c ./configs/example.yaml \ + --test \ + --weights=./output/example/path_to_weights +``` + + + + +## 2. Infer + +First, export model. +Indicate `-c` to set configuration, `-p` to load pretrained model, `-o` to set inference files destination. + +```bash +python tools/export_model.py \ + -c ./configs/example.yaml \ + -p ./output/example/path_to_weights \ + -o ./inference +``` + + +It will generate `model_name.pdmodel` , `model_name.pdiparams` and `model_name.pdiparames.info`. +Second, start PaddleInference engine to infer a video. + +```bash +python tools/predict.py \ + --input_file "data/example.avi" \ + --model_file "./inference/example.pdmodel" \ + --params_file "./inference/example.pdiparams" \ + --use_gpu=True \ + --use_tensorrt=False +``` + +Attributes: ++ `input_file`: input file path or input directory, which contains input files(s). ++ `model_file`: pdmodel file path. ++ `params_file`: pdiparams file path. ++ `use_tensorrt`: use tensorrt to acclerate or not, default: False. ++ `use_gpu`: use gpu to infer or not, default: True. + +benchmark results are shown in th [benchmark](./benchmark.md).
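+
+If you prefer to drive the exported model from your own Python code instead of `tools/predict.py`, the sketch below shows the bare Paddle Inference workflow. The file names follow the export step above, and the random input only stands in for a properly preprocessed clip (its shape and the number of segments are illustrative and depend on the exported model):
+
+```python
+import numpy as np
+from paddle.inference import Config, create_predictor
+
+# Files produced by tools/export_model.py in the previous step.
+config = Config("./inference/example.pdmodel", "./inference/example.pdiparams")
+config.disable_gpu()                      # or config.enable_use_gpu(8000, 0) for GPU
+predictor = create_predictor(config)
+
+# A stand-in for a preprocessed clip: [batch, segments, channels, height, width].
+clip = np.random.rand(1, 8, 3, 224, 224).astype("float32")
+
+input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
+input_handle.copy_from_cpu(clip)
+predictor.run()
+
+output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
+scores = output_handle.copy_to_cpu()      # class scores, e.g. shape [1, num_classes]
+print(scores.argmax(axis=1))
+```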